1 






























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
// SPDX-License-Identifier: GPL-2.0
#include <net/ip.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <asm/checksum.h>

#ifndef _HAVE_ARCH_IPV6_CSUM
/*
 * Generic (non arch-optimised) csum_ipv6_magic().
 *
 * Starting from the caller supplied partial sum @csum, add the four
 * 32-bit words of @saddr, the four 32-bit words of @daddr, htonl(@len)
 * and htonl(@proto).  Every addition that wraps around 32 bits has its
 * carry added back into the accumulator.  The 32-bit result is handed
 * to csum_fold() and that value is returned.
 */
__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
                        const struct in6_addr *daddr,
                        __u32 len, __u8 proto, __wsum csum)
{
        __u32 acc = (__force u32)csum;
        __u32 term;
        int i;

        /* Source address, one 32-bit word at a time. */
        for (i = 0; i < 4; i++) {
                term = (__force u32)saddr->s6_addr32[i];
                acc += term;
                /* acc < term means the addition wrapped: add the carry. */
                acc += (acc < term);
        }

        /* Destination address, same end-around carry handling. */
        for (i = 0; i < 4; i++) {
                term = (__force u32)daddr->s6_addr32[i];
                acc += term;
                acc += (acc < term);
        }

        /* Length, converted with htonl() before it is summed. */
        term = (__force u32)htonl(len);
        acc += term;
        acc += (acc < term);

        /* Protocol number, widened and converted with htonl(). */
        term = (__force u32)htonl(proto);
        acc += term;
        acc += (acc < term);

        return csum_fold((__force __wsum)acc);
}
EXPORT_SYMBOL(csum_ipv6_magic);
#endif

/*
 * Prepare checksum state of a received UDP or UDP-Lite over IPv6 skb.
 *
 * Returns 0 when processing may continue, a non-zero value otherwise
 * (the error from udplite_checksum_init(), the non-zero result of
 * skb_checksum_init_zero_check(), or 1 when a software computed full
 * checksum is already known to be invalid).
 */
int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto)
{
        int ret;

        /* Default: no partial coverage, the whole skb is covered. */
        UDP_SKB_CB(skb)->partial_cov = 0;
        UDP_SKB_CB(skb)->cscov = skb->len;

        if (proto == IPPROTO_UDPLITE) {
                ret = udplite_checksum_init(skb, uh);
                if (ret)
                        return ret;

                /* Partial coverage: only seed skb->csum and stop here. */
                if (UDP_SKB_CB(skb)->partial_cov) {
                        skb->csum = ip6_compute_pseudo(skb, proto);
                        return 0;
                }
        }

        /*
         * RFC 6936 allows a zero UDP checksum over IPv6 for tunnels, so
         * a zero checksum is accepted at this point.  Whether the
         * receiving socket permits it (a socket option) is checked later,
         * once the socket has been looked up.
         *
         * Only "zero" versus "non-zero" matters for the result, hence
         * the forced conversion to int.
         */
        ret = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
                                                        ip6_compute_pseudo);
        if (ret)
                return ret;

        /* Nothing more to do unless an unvalidated full checksum is set. */
        if (skb->ip_summed != CHECKSUM_COMPLETE || skb->csum_valid)
                return 0;

        /* A value computed in software is known to be bad. */
        if (skb->csum_complete_sw)
                return 1;

        /*
         * Hardware reported the value as bad; it will be validated
         * later.  skb->csum is no longer the full packet checksum, so
         * don't treat it as such.
         */
        skb_checksum_complete_unset(skb);

        return 0;
}
EXPORT_SYMBOL(udp6_csum_init);

/* Fill in the checksum field of the UDP header of an IPv6 UDP packet.
 * Meant for the simple case, e.g. setting the checksum of a UDP tunnel.
 *
 * @nocheck: when true the checksum field is simply set to zero.
 * @skb:     packet whose transport header is the UDP header.
 * @saddr/@daddr/@len: values passed on to udp_v6_check().
 */
void udp6_set_csum(bool nocheck, struct sk_buff *skb,
                   const struct in6_addr *saddr,
                   const struct in6_addr *daddr, int len)
{
        struct udphdr *uh = udp_hdr(skb);

        if (nocheck) {
                uh->check = 0;
                return;
        }

        if (skb_is_gso(skb)) {
                /* GSO: store the complemented udp_v6_check() of a zero sum. */
                uh->check = ~udp_v6_check(len, saddr, daddr, 0);
                return;
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                /* Zero the field before lco_csum() sums over the packet. */
                uh->check = 0;
                uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb));
                /* A computed zero is replaced by CSUM_MANGLED_0. */
                if (uh->check == 0)
                        uh->check = CSUM_MANGLED_0;
                return;
        }

        /* Otherwise request checksum completion starting at the UDP header. */
        skb->ip_summed = CHECKSUM_PARTIAL;
        skb->csum_start = skb_transport_header(skb) - skb->head;
        skb->csum_offset = offsetof(struct udphdr, check);
        uh->check = ~udp_v6_check(len, saddr, daddr, 0);
}
EXPORT_SYMBOL(udp6_set_csum);



















































































































































































































































































































































































































    2 








































    3 






















































































    3 








    3 










    3 





    3 




















    3 




















    3 


























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_SIGNAL_H
#define _LINUX_SCHED_SIGNAL_H

#include <linux/rculist.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/jobctl.h>
#include <linux/sched/task.h>
#include <linux/cred.h>
#include <linux/refcount.h>
#include <linux/pid.h>
#include <linux/posix-timers.h>
#include <linux/mm_types.h>
#include <asm/ptrace.h>

/*
 * Types defining task->signal and task->sighand and APIs using them:
 */

/*
 * Signal handler state: a spinlock, a reference count, a wait queue head
 * and one k_sigaction slot per signal (_NSIG entries).
 */
struct sighand_struct {
        /* taken by the helpers in this header as ->sighand->siglock */
        spinlock_t                siglock;
        /* reference count (users are outside this header) */
        refcount_t                count;
        /* wait queue; presumably for signalfd waiters - confirm */
        wait_queue_head_t        signalfd_wqh;
        /* per-signal action table */
        struct k_sigaction        action[_NSIG];
};

/*
 * Per-process accounting stats.
 *
 * Embedded in signal_struct as ->pacct when CONFIG_BSD_PROCESS_ACCT is
 * enabled.  The field meanings below are inferred from their names and
 * should be confirmed against the accounting code.
 */
struct pacct_struct {
        int                        ac_flag;        /* accounting flags */
        long                        ac_exitcode;        /* exit code */
        unsigned long                ac_mem;                /* memory usage figure */
        u64                        ac_utime, ac_stime;        /* user / system time */
        unsigned long                ac_minflt, ac_majflt;        /* minor / major faults */
};

/*
 * One CPU interval timer.  signal_struct keeps two of these (->it[2])
 * for the ITIMER_PROF and ITIMER_VIRTUAL timers of the process.
 */
struct cpu_itimer {
        u64 expires;        /* expiry value; units not visible here */
        u64 incr;        /* increment; presumably the reload interval - confirm */
};

/*
 * This is the atomic variant of task_cputime, which can be used for
 * storing and updating task_cputime statistics without locking.
 *
 * All three members are 64-bit atomics; INIT_CPUTIME_ATOMIC initialises
 * each of them to zero.
 */
struct task_cputime_atomic {
        atomic64_t utime;
        atomic64_t stime;
        atomic64_t sum_exec_runtime;
};

/*
 * Compound literal of type struct task_cputime_atomic with utime, stime
 * and sum_exec_runtime all initialised to zero.
 */
#define INIT_CPUTIME_ATOMIC \
        (struct task_cputime_atomic) {                                \
                .utime = ATOMIC64_INIT(0),                        \
                .stime = ATOMIC64_INIT(0),                        \
                .sum_exec_runtime = ATOMIC64_INIT(0),                \
        }
/**
 * struct thread_group_cputimer - thread group interval timer counts
 * @cputime_atomic:        atomic thread group interval timers.
 *
 * This structure contains the version of task_cputime, above, that is
 * used for thread group CPU timer calculations.  It is embedded in
 * signal_struct as ->cputimer when CONFIG_POSIX_TIMERS is enabled.
 */
struct thread_group_cputimer {
        struct task_cputime_atomic cputime_atomic;
};

/*
 * A signal set together with an hlist linkage.  signal_struct has an
 * hlist_head named ->multiprocess "for collecting multiprocess signals
 * during fork"; presumably ->node is linked onto that list - confirm.
 */
struct multiprocess_signals {
        sigset_t signal;
        struct hlist_node node;
};

/*
 * Element of a singly linked chain of tasks; core_state embeds one as
 * its ->dumper member.
 */
struct core_thread {
        struct task_struct *task;
        struct core_thread *next;        /* next element of the chain */
};

/*
 * State pointed to by signal_struct::core_state ("coredumping support").
 */
struct core_state {
        atomic_t nr_threads;                /* thread counter; exact meaning not visible here */
        struct core_thread dumper;        /* embedded chain element */
        struct completion startup;
};

/*
 * NOTE! "signal_struct" does not have its own
 * locking, because a shared signal_struct always
 * implies a shared sighand_struct, so locking
 * sighand_struct is always a proper superset of
 * the locking of signal_struct.
 */
struct signal_struct {
        refcount_t                sigcnt;
        atomic_t                live;
        /* value returned by get_nr_threads() */
        int                        nr_threads;
        int                        quick_threads;
        /* list walked by __for_each_thread() via task_struct::thread_node */
        struct list_head        thread_head;

        wait_queue_head_t        wait_chldexit;        /* for wait4() */

        /* current thread group signal load-balancing target: */
        struct task_struct        *curr_target;

        /* shared signal handling: */
        struct sigpending        shared_pending;

        /* For collecting multiprocess signals during fork */
        struct hlist_head        multiprocess;

        /* thread group exit support */
        int                        group_exit_code;
        /* notify group_exec_task when notify_count is less or equal to 0 */
        int                        notify_count;
        struct task_struct        *group_exec_task;

        /* thread group stop support, overloads group_exit_code too */
        int                        group_stop_count;
        unsigned int                flags; /* see SIGNAL_* flags below */

        struct core_state *core_state; /* coredumping support */

        /*
         * PR_SET_CHILD_SUBREAPER marks a process, like a service
         * manager, to re-parent orphan (double-forking) child processes
         * to this process instead of 'init'. The service manager is
         * able to receive SIGCHLD signals and is able to investigate
         * the process until it calls wait(). All children of this
         * process will inherit a flag if they should look for a
         * child_subreaper process at exit.
         */
        unsigned int                is_child_subreaper:1;
        unsigned int                has_child_subreaper:1;

#ifdef CONFIG_POSIX_TIMERS

        /* POSIX.1b Interval Timers */
        unsigned int                next_posix_timer_id;
        struct list_head        posix_timers;

        /* ITIMER_REAL timer for the process */
        struct hrtimer real_timer;
        ktime_t it_real_incr;

        /*
         * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
         * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
         * values are defined to 0 and 1 respectively
         */
        struct cpu_itimer it[2];

        /*
         * Thread group totals for process CPU timers.
         * See thread_group_cputimer(), et al, for details.
         */
        struct thread_group_cputimer cputimer;

#endif
        /* Empty if CONFIG_POSIX_TIMERS=n */
        struct posix_cputimers posix_cputimers;

        /* PID/PID hash table linkage. */
        struct pid *pids[PIDTYPE_MAX];

#ifdef CONFIG_NO_HZ_FULL
        atomic_t tick_dep_mask;
#endif

        struct pid *tty_old_pgrp;

        /* boolean value for session group leader */
        int leader;

        struct tty_struct *tty; /* NULL if no tty */

#ifdef CONFIG_SCHED_AUTOGROUP
        struct autogroup *autogroup;
#endif
        /*
         * Cumulative resource counters for dead threads in the group,
         * and for reaped dead child processes forked by this group.
         * Live threads maintain their own counters and add to these
         * in __exit_signal, except for the group leader.
         */
        seqlock_t stats_lock;
        u64 utime, stime, cutime, cstime;
        u64 gtime;
        u64 cgtime;
        struct prev_cputime prev_cputime;
        unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
        unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
        unsigned long inblock, oublock, cinblock, coublock;
        unsigned long maxrss, cmaxrss;
        struct task_io_accounting ioac;

        /*
         * Cumulative ns of scheduled CPU time for dead threads in the
         * group, not including a zombie group leader.  (This only differs
         * from jiffies_to_ns(utime + stime) if sched_clock uses something
         * other than jiffies.)
         */
        unsigned long long sum_sched_runtime;

        /*
         * We don't bother to synchronize most readers of this at all,
         * because there is no reader checking a limit that actually needs
         * to get both rlim_cur and rlim_max atomically, and either one
         * alone is a single word that can safely be read normally.
         * getrlimit/setrlimit use task_lock(current->group_leader) to
         * protect this instead of the siglock, because they really
         * have no need to disable irqs.
         */
        struct rlimit rlim[RLIM_NLIMITS];

#ifdef CONFIG_BSD_PROCESS_ACCT
        struct pacct_struct pacct;        /* per-process accounting information */
#endif
#ifdef CONFIG_TASKSTATS
        struct taskstats *stats;
#endif
#ifdef CONFIG_AUDIT
        unsigned audit_tty;
        struct tty_audit_buf *tty_audit_buf;
#endif

        /*
         * Thread is the potential origin of an oom condition; kill first on
         * oom
         */
        bool oom_flag_origin;
        short oom_score_adj;                /* OOM kill score adjustment */
        short oom_score_adj_min;        /* OOM kill score adjustment min value.
                                         * Only settable by CAP_SYS_RESOURCE. */
        struct mm_struct *oom_mm;        /* recorded mm when the thread group got
                                         * killed by the oom killer */

        struct mutex cred_guard_mutex;        /* guard against foreign influences on
                                         * credential calculations
                                         * (notably ptrace).
                                         * Deprecated, do not use in new code.
                                         * Use exec_update_lock instead.
                                         */
        struct rw_semaphore exec_update_lock;        /* Held while task_struct is
                                                 * being updated during exec,
                                                 * and may have inconsistent
                                                 * permissions.
                                                 */
} __randomize_layout;

/*
 * Bits in flags field of signal_struct.
 */
#define SIGNAL_STOP_STOPPED        0x00000001 /* job control stop in effect */
#define SIGNAL_STOP_CONTINUED        0x00000002 /* SIGCONT since WCONTINUED reap */
#define SIGNAL_GROUP_EXIT        0x00000004 /* group exit in progress */
/*
 * Pending notifications to parent.
 */
#define SIGNAL_CLD_STOPPED        0x00000010
#define SIGNAL_CLD_CONTINUED        0x00000020
#define SIGNAL_CLD_MASK                (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)

#define SIGNAL_UNKILLABLE        0x00000040 /* for init: ignore fatal signals */

/*
 * All stop related bits: the two SIGNAL_CLD_* notification bits plus
 * SIGNAL_STOP_STOPPED and SIGNAL_STOP_CONTINUED.  These are the bits
 * that signal_set_stop_flags() clears before or-ing in the new flags.
 */
#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \
                          SIGNAL_STOP_CONTINUED)

/*
 * Replace the SIGNAL_STOP_MASK bits of @sig->flags with @flags, leaving
 * all other bits as they are.  Warns if SIGNAL_GROUP_EXIT is set.
 */
static inline void signal_set_stop_flags(struct signal_struct *sig,
                                         unsigned int flags)
{
        unsigned int kept;

        WARN_ON(sig->flags & SIGNAL_GROUP_EXIT);

        kept = sig->flags & ~SIGNAL_STOP_MASK;
        sig->flags = kept | flags;
}

/*
 * Prototypes only; the definitions live outside this header.
 * dequeue_signal() is called by kernel_dequeue_signal() below with
 * ->sighand->siglock held.
 */
extern void flush_signals(struct task_struct *);
extern void ignore_signals(struct task_struct *);
extern void flush_signal_handlers(struct task_struct *, int force_default);
extern int dequeue_signal(struct task_struct *task, sigset_t *mask,
                          kernel_siginfo_t *info, enum pid_type *type);

static inline int kernel_dequeue_signal(void)
{
        struct task_struct *task = current;
        kernel_siginfo_t __info;
        enum pid_type __type;
        int ret;

        spin_lock_irq(&task->sighand->siglock);
        ret = dequeue_signal(task, &task->blocked, &__info, &__type);
        spin_unlock_irq(&task->sighand->siglock);

        return ret;
}

/*
 * If JOBCTL_STOP_DEQUEUED is set for the current task, mark it
 * JOBCTL_STOPPED and switch it to TASK_STOPPED, all under
 * ->sighand->siglock.  schedule() is then called unconditionally.
 */
static inline void kernel_signal_stop(void)
{
        spin_lock_irq(&current->sighand->siglock);
        if (current->jobctl & JOBCTL_STOP_DEQUEUED) {
                current->jobctl |= JOBCTL_STOPPED;
                set_special_state(TASK_STOPPED);
        }
        spin_unlock_irq(&current->sighand->siglock);

        /* called whether or not the state was changed above */
        schedule();
}

/*
 * Prototypes of helpers that raise signals carrying additional fault
 * information (user address, code, trap number, ...).  Several exist
 * both as a force_*() form and as a send_*() form that takes an
 * explicit target task.  Definitions are outside this header.
 */
int force_sig_fault_to_task(int sig, int code, void __user *addr,
                            struct task_struct *t);
int force_sig_fault(int sig, int code, void __user *addr);
int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t);

int force_sig_mceerr(int code, void __user *, short);
int send_sig_mceerr(int code, void __user *, short, struct task_struct *);

int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper);
int force_sig_pkuerr(void __user *addr, u32 pkey);
int send_sig_perf(void __user *addr, u32 type, u64 sig_data);

int force_sig_ptrace_errno_trap(int errno, void __user *addr);
int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno);
int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
                        struct task_struct *t);
int force_sig_seccomp(int syscall, int reason, bool force_coredump);

/*
 * Prototypes of the general signal sending, process killing and
 * sigqueue helpers.  Definitions are outside this header.  kill_pid()
 * is used by kill_cad_pid() further down.
 */
extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
extern void force_sigsegv(int sig);
extern int force_sig_info(struct kernel_siginfo *);
extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *,
                                const struct cred *);
extern int kill_pgrp(struct pid *pid, int sig, int priv);
extern int kill_pid(struct pid *pid, int sig, int priv);
/* return value must be checked by the caller (__must_check) */
extern __must_check bool do_notify_parent(struct task_struct *, int);
extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
extern void force_sig(int);
extern void force_fatal_sig(int);
extern void force_exit_sig(int);
extern int send_sig(int, struct task_struct *, int);
extern int zap_other_threads(struct task_struct *p);
extern struct sigqueue *sigqueue_alloc(void);
extern void sigqueue_free(struct sigqueue *);
extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type);
extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);

/*
 * Clear TIF_NOTIFY_SIGNAL for the current thread, followed by
 * smp_mb__after_atomic().
 */
static inline void clear_notify_signal(void)
{
        clear_thread_flag(TIF_NOTIFY_SIGNAL);
        smp_mb__after_atomic();
}

/*
 * Returns 'true' if kick_process() is needed to force a transition from
 * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work.
 */
static inline bool __set_notify_signal(struct task_struct *task)
{
        return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
               !wake_up_state(task, TASK_INTERRUPTIBLE);
}

/*
 * Called to break out of interruptible wait loops, and enter the
 * exit_to_user_mode_loop().  kick_process() is issued only when
 * __set_notify_signal() says it is required.
 */
static inline void set_notify_signal(struct task_struct *task)
{
        if (!__set_notify_signal(task))
                return;

        kick_process(task);
}

/*
 * Set TIF_SIGPENDING on the current task and return -ERESTARTNOINTR.
 */
static inline int restart_syscall(void)
{
        set_tsk_thread_flag(current, TIF_SIGPENDING);
        return -ERESTARTNOINTR;
}

/*
 * Test TIF_SIGPENDING on @p; the result is hinted as unlikely.
 * Unlike signal_pending(), TIF_NOTIFY_SIGNAL is not considered.
 */
static inline int task_sigpending(struct task_struct *p)
{
        return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
}

/*
 * Non-zero when @p has either TIF_NOTIFY_SIGNAL or TIF_SIGPENDING set.
 */
static inline int signal_pending(struct task_struct *p)
{
        /*
         * TIF_NOTIFY_SIGNAL is not a real signal, but wait loops have to
         * be broken out of in the same way so that the notify signal
         * callbacks get a chance to run.
         */
        if (likely(!test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL)))
                return task_sigpending(p);

        return 1;
}

/*
 * Test whether SIGKILL is a member of @p->pending.signal; the result is
 * hinted as unlikely.  No thread flag is checked here.
 */
static inline int __fatal_signal_pending(struct task_struct *p)
{
        return unlikely(sigismember(&p->pending.signal, SIGKILL));
}

/*
 * 1 when @p has TIF_SIGPENDING set and SIGKILL is in its pending set,
 * 0 otherwise.  The pending set is only inspected when the flag is set.
 */
static inline int fatal_signal_pending(struct task_struct *p)
{
        if (!task_sigpending(p))
                return 0;

        return !!__fatal_signal_pending(p);
}

/*
 * Decide whether a pending signal matters for a task sleeping in @state.
 *
 * Returns 0 when @state has neither TASK_INTERRUPTIBLE nor TASK_WAKEKILL,
 * or when signal_pending(@p) is false.  Otherwise returns 1 if @state
 * includes TASK_INTERRUPTIBLE or SIGKILL is pending for @p, else 0.
 */
static inline int signal_pending_state(unsigned int state, struct task_struct *p)
{
        const unsigned int wakeable = TASK_INTERRUPTIBLE | TASK_WAKEKILL;

        if ((state & wakeable) && signal_pending(p))
                return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);

        return 0;
}

/*
 * Only meant for fault handlers: tells whether the current fault
 * routine should be abandoned so that signals can be handled instead,
 * in particular after having been interrupted with VM_FAULT_RETRY.
 *
 * True only when @fault_flags has VM_FAULT_RETRY and either a fatal
 * signal is pending for current, or @regs is user mode and any signal
 * is pending for current.
 */
static inline bool fault_signal_pending(vm_fault_t fault_flags,
                                        struct pt_regs *regs)
{
        bool bail_out = false;

        if (fault_flags & VM_FAULT_RETRY)
                bail_out = fatal_signal_pending(current) ||
                           (user_mode(regs) && signal_pending(current));

        return unlikely(bail_out);
}

/*
 * Reevaluate whether the task has signals pending delivery.
 * Wake the task if so.
 * This is required every time the blocked sigset_t changes.
 * callers must hold sighand->siglock.
 */
extern void recalc_sigpending(void);
extern void calculate_sigpending(void);

extern void signal_wake_up_state(struct task_struct *t, unsigned int state);

/*
 * Wake @t through signal_wake_up_state().
 *
 * When @fatal is set and @t is not JOBCTL_PTRACE_FROZEN, the
 * JOBCTL_STOPPED and JOBCTL_TRACED bits are cleared first and the
 * wake-up additionally covers TASK_WAKEKILL | __TASK_TRACED; otherwise
 * a state of 0 is passed.
 */
static inline void signal_wake_up(struct task_struct *t, bool fatal)
{
        unsigned int wake_mask = 0;
        bool break_stop = fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN);

        if (break_stop) {
                t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED);
                wake_mask = TASK_WAKEKILL | __TASK_TRACED;
        }

        signal_wake_up_state(t, wake_mask);
}
/*
 * Wake @t through signal_wake_up_state().  With @resume set,
 * JOBCTL_TRACED is cleared and __TASK_TRACED is passed as the state;
 * otherwise the state is 0.
 */
static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
{
        unsigned int wake_mask;

        if (resume) {
                t->jobctl &= ~JOBCTL_TRACED;
                wake_mask = __TASK_TRACED;
        } else {
                wake_mask = 0;
        }

        signal_wake_up_state(t, wake_mask);
}

void task_join_group_stop(struct task_struct *task);

#ifdef TIF_RESTORE_SIGMASK
/*
 * Legacy restore_sigmask accessors.  These are inefficient on
 * SMP architectures because they require atomic operations.
 */

/**
 * set_restore_sigmask() - make sure saved_sigmask processing gets done
 *
 * This sets TIF_RESTORE_SIGMASK for the current thread so that the
 * saved signal mask gets processed later (see restore_saved_sigmask()
 * and sigmask_to_save()).
 *
 * This function sets only TIF_RESTORE_SIGMASK; it does not touch
 * TIF_SIGPENDING.  TIF_RESTORE_SIGMASK need not be in the set of bits
 * that the arch code will notice on return to user mode, in case those
 * bits are scarce, so callers that rely on the arch signal code running
 * must have TIF_SIGPENDING set by other means.
 * NOTE(review): confirm that all callers guarantee this.
 */
static inline void set_restore_sigmask(void)
{
        set_thread_flag(TIF_RESTORE_SIGMASK);
}

/* Clear TIF_RESTORE_SIGMASK on @task. */
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
        clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
}

/* Clear TIF_RESTORE_SIGMASK for the current thread. */
static inline void clear_restore_sigmask(void)
{
        clear_thread_flag(TIF_RESTORE_SIGMASK);
}
/* Report whether TIF_RESTORE_SIGMASK is set on @task. */
static inline bool test_tsk_restore_sigmask(struct task_struct *task)
{
        return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
}
/* Report whether TIF_RESTORE_SIGMASK is set for the current thread. */
static inline bool test_restore_sigmask(void)
{
        return test_thread_flag(TIF_RESTORE_SIGMASK);
}
/*
 * Clear TIF_RESTORE_SIGMASK for the current thread and report whether
 * it was set beforehand.
 */
static inline bool test_and_clear_restore_sigmask(void)
{
        return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK);
}

#else        /* TIF_RESTORE_SIGMASK */

/*
 * Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist.
 * The state is kept in the restore_sigmask member of task_struct instead
 * of a thread flag; this one sets it for the current task.
 */
static inline void set_restore_sigmask(void)
{
        current->restore_sigmask = true;
}
/* Reset @task->restore_sigmask to false. */
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
        task->restore_sigmask = false;
}
/* Reset current->restore_sigmask to false. */
static inline void clear_restore_sigmask(void)
{
        current->restore_sigmask = false;
}
/* Return the value of current->restore_sigmask. */
static inline bool test_restore_sigmask(void)
{
        return current->restore_sigmask;
}
/* Return the value of @task->restore_sigmask. */
static inline bool test_tsk_restore_sigmask(struct task_struct *task)
{
        return task->restore_sigmask;
}
/*
 * Return whether current->restore_sigmask was set and leave it cleared.
 * The member is only written when it was set.
 */
static inline bool test_and_clear_restore_sigmask(void)
{
        bool was_set = current->restore_sigmask;

        if (was_set)
                current->restore_sigmask = false;

        return was_set;
}
#endif

/*
 * If the restore-sigmask indication is set, clear it and install
 * current->saved_sigmask as the blocked mask via __set_current_blocked().
 * Does nothing when the indication is not set.
 */
static inline void restore_saved_sigmask(void)
{
        if (!test_and_clear_restore_sigmask())
                return;

        __set_current_blocked(&current->saved_sigmask);
}

extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);

/*
 * Restore the saved signal mask unless @interrupted is true.  In the
 * interrupted case nothing is restored; a warning is emitted if no
 * signal is pending for current.
 */
static inline void restore_saved_sigmask_unless(bool interrupted)
{
        if (!interrupted) {
                restore_saved_sigmask();
                return;
        }

        WARN_ON(!signal_pending(current));
}

/*
 * Pick the signal mask to save: &current->saved_sigmask when the
 * restore-sigmask indication is set (hinted as unlikely), otherwise
 * &current->blocked.
 */
static inline sigset_t *sigmask_to_save(void)
{
        if (unlikely(test_restore_sigmask()))
                return &current->saved_sigmask;

        return &current->blocked;
}

/*
 * Forward @sig and @priv to kill_pid() for cad_pid and return its result.
 */
static inline int kill_cad_pid(int sig, int priv)
{
        return kill_pid(cad_pid, sig, priv);
}

/*
 * These can be the second arg to send_sig_info/send_group_sig_info.
 * They are the integer constants 0 and 1 cast to a kernel_siginfo
 * pointer, i.e. sentinel values rather than pointers to real objects.
 */
#define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0)
#define SEND_SIG_PRIV        ((struct kernel_siginfo *) 1)

/*
 * Range check of @sp against the alternate signal stack of current,
 * described by sas_ss_sp (base) and sas_ss_size.
 *
 * CONFIG_STACK_GROWSUP: base <= sp and sp - base <  size.
 * otherwise:            base <  sp and sp - base <= size.
 *
 * Returns 1 when @sp is inside that range, 0 otherwise.
 */
static inline int __on_sig_stack(unsigned long sp)
{
#ifdef CONFIG_STACK_GROWSUP
        if (sp < current->sas_ss_sp)
                return 0;

        return sp - current->sas_ss_sp < current->sas_ss_size;
#else
        if (sp <= current->sas_ss_sp)
                return 0;

        return sp - current->sas_ss_sp <= current->sas_ss_size;
#endif
}

/*
 * True if we are on the alternate signal stack.
 *
 * With SS_AUTODISARM set in current->sas_ss_flags the answer is always 0:
 * by construction we cannot be on the signal stack in that mode unless
 * user code deliberately set SS_AUTODISARM while already running on it.
 *
 * Answering 0 here also improves reliability: should user state get
 * corrupted so that the stack pointer ends up very close to the end of
 * the signal stack, the signal can still be handled.
 */
static inline int on_sig_stack(unsigned long sp)
{
        return (current->sas_ss_flags & SS_AUTODISARM) ? 0 : __on_sig_stack(sp);
}

/*
 * Alternate signal stack status for stack pointer @sp:
 * SS_DISABLE when current->sas_ss_size is zero, SS_ONSTACK when
 * on_sig_stack(@sp) is true, 0 otherwise.
 */
static inline int sas_ss_flags(unsigned long sp)
{
        if (current->sas_ss_size == 0)
                return SS_DISABLE;

        if (on_sig_stack(sp))
                return SS_ONSTACK;

        return 0;
}

/*
 * Reset the alternate signal stack settings of @p: base and size are
 * zeroed and the flags are set to SS_DISABLE.
 */
static inline void sas_ss_reset(struct task_struct *p)
{
        p->sas_ss_sp = 0;
        p->sas_ss_size = 0;
        p->sas_ss_flags = SS_DISABLE;
}

/*
 * Choose the stack pointer to use for @ksig.
 *
 * @sp is returned unchanged unless the handler has SA_ONSTACK set and
 * sas_ss_flags(@sp) is zero.  In that case the alternate stack is used:
 * its base (sas_ss_sp) with CONFIG_STACK_GROWSUP, otherwise its end
 * (sas_ss_sp + sas_ss_size).
 */
static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
{
        /* Handler did not ask for the alternate stack. */
        if (likely(!(ksig->ka.sa.sa_flags & SA_ONSTACK)))
                return sp;

        /* Non-zero status (SS_DISABLE or SS_ONSTACK): keep @sp. */
        if (sas_ss_flags(sp))
                return sp;

#ifdef CONFIG_STACK_GROWSUP
        return current->sas_ss_sp;
#else
        return current->sas_ss_sp + current->sas_ss_size;
#endif
}

extern void __cleanup_sighand(struct sighand_struct *);
extern void flush_itimer_signals(void);

/* True when the ->tasks list anchored at init_task is empty. */
#define tasklist_empty() \
        list_empty(&init_task.tasks)

/* The task_struct that follows @p on the ->tasks list (RCU list entry). */
#define next_task(p) \
        list_entry_rcu((p)->tasks.next, struct task_struct, tasks)

/*
 * Iterate @p over every task on the ->tasks list.  The walk starts at
 * init_task, advances before each test and stops when it gets back to
 * init_task, so init_task itself is never visited by the loop body.
 */
#define for_each_process(p) \
        for (p = &init_task ; (p = next_task(p)) != &init_task ; )

extern bool current_is_single_threaded(void);

/*
 * Without tasklist/siglock it is only rcu-safe if g can't exit/exec,
 * otherwise next_thread(t) will never reach g after list_del_rcu(g).
 *
 * Loop tail: advances @t with next_thread() until it equals @g again.
 */
#define while_each_thread(g, t) \
        while ((t = next_thread(t)) != g)

/*
 * Iterate @t over the threads reached from @p via next_thread(),
 * stopping when @p comes around again; @p itself is not visited.
 */
#define for_other_threads(p, t)        \
        for (t = p; (t = next_thread(t)) != p; )

/*
 * Iterate @t over the ->thread_head list of @signal, linked through
 * task_struct::thread_node.  The RCU list walk is told that holding
 * tasklist_lock is an accepted alternative for lockdep.
 */
#define __for_each_thread(signal, t)        \
        list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
                lockdep_is_held(&tasklist_lock))

/* Iterate @t over the threads on @p->signal's thread list. */
#define for_each_thread(p, t)                \
        __for_each_thread((p)->signal, t)

/* Careful: this is a double loop, 'break' won't work as expected. */
#define for_each_process_thread(p, t)        \
        for_each_process(p) for_each_thread(p, t)

typedef int (*proc_visitor)(struct task_struct *p, void *data);
void walk_process_tree(struct task_struct *top, proc_visitor, void *);

/*
 * Look up the struct pid of @task for the given @type: task_pid() for
 * PIDTYPE_PID, the matching slot of task->signal->pids[] for every
 * other type.
 */
static inline
struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
{
        if (type == PIDTYPE_PID)
                return task_pid(task);

        return task->signal->pids[type];
}

/* Return the PIDTYPE_TGID entry of @task->signal->pids[]. */
static inline struct pid *task_tgid(struct task_struct *task)
{
        return task->signal->pids[PIDTYPE_TGID];
}

/*
 * Without tasklist or RCU lock it is not safe to dereference
 * the result of task_pgrp/task_session even if task == current,
 * we can race with another thread doing sys_setsid/sys_setpgid.
 *
 * Returns the PIDTYPE_PGID entry of @task->signal->pids[].
 */
static inline struct pid *task_pgrp(struct task_struct *task)
{
        return task->signal->pids[PIDTYPE_PGID];
}

/* Return the PIDTYPE_SID entry of @task's signal->pids[] array. */
static inline struct pid *task_session(struct task_struct *task)
{
        struct pid *sid = task->signal->pids[PIDTYPE_SID];

        return sid;
}

/* Return the nr_threads counter stored in @task's signal struct. */
static inline int get_nr_threads(struct task_struct *task)
{
        int nr = task->signal->nr_threads;

        return nr;
}

static inline bool thread_group_leader(struct task_struct *p)
{
        return p->exit_signal >= 0;
}

/* Two tasks are in the same thread group when they share one signal struct. */
static inline
bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
{
        bool shared = (p1->signal == p2->signal);

        return shared;
}

/*
 * Return the task following @p on its signal->thread_head list, or NULL
 * if p is the last thread in the thread group.
 */
static inline struct task_struct *__next_thread(struct task_struct *p)
{
        struct task_struct *next;

        next = list_next_or_null_rcu(&p->signal->thread_head,
                                     &p->thread_node,
                                     struct task_struct,
                                     thread_node);
        return next;
}

/*
 * Return the thread after @p; when __next_thread() reports the end of the
 * list (NULL), wrap around to @p's group_leader.
 */
static inline struct task_struct *next_thread(struct task_struct *p)
{
        struct task_struct *next = __next_thread(p);

        if (!next)
                next = p->group_leader;

        return next;
}

/*
 * Non-zero when @p is a thread group leader and its thread_node is the
 * last entry on signal->thread_head; zero otherwise.
 */
static inline int thread_group_empty(struct task_struct *p)
{
        if (!thread_group_leader(p))
                return 0;

        return list_is_last(&p->thread_node, &p->signal->thread_head) != 0;
}

/* True when @p is a thread group leader whose group is not empty. */
#define delay_group_leader(p) \
                (thread_group_leader(p) && !thread_group_empty(p))

extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
                                                        unsigned long *flags);

/*
 * Wrapper around __lock_task_sighand() that additionally feeds the result
 * to __cond_lock() for task->sighand->siglock.  Returns whatever
 * __lock_task_sighand() returned.
 */
static inline struct sighand_struct *lock_task_sighand(struct task_struct *task,
                                                       unsigned long *flags)
{
        struct sighand_struct *sighand = __lock_task_sighand(task, flags);

        (void)__cond_lock(&task->sighand->siglock, sighand);

        return sighand;
}

/*
 * Release task->sighand->siglock with spin_unlock_irqrestore(), restoring
 * the interrupt state saved in *@flags.
 */
static inline void unlock_task_sighand(struct task_struct *task,
                                                unsigned long *flags)
{
        unsigned long saved = *flags;

        spin_unlock_irqrestore(&task->sighand->siglock, saved);
}

/*
 * With CONFIG_LOCKDEP the assertion is implemented out of line; without it
 * the helper is an empty inline so callers need no #ifdef.
 */
#ifdef CONFIG_LOCKDEP
extern void lockdep_assert_task_sighand_held(struct task_struct *task);
#else
static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { }
#endif

/*
 * Read rlim_cur of resource @limit from @task's signal->rlim[] array with
 * READ_ONCE().  @limit is used as the array index without range checking.
 */
static inline unsigned long task_rlimit(const struct task_struct *task,
                unsigned int limit)
{
        unsigned long cur = READ_ONCE(task->signal->rlim[limit].rlim_cur);

        return cur;
}

/*
 * Read rlim_max of resource @limit from @task's signal->rlim[] array with
 * READ_ONCE().  @limit is used as the array index without range checking.
 */
static inline unsigned long task_rlimit_max(const struct task_struct *task,
                unsigned int limit)
{
        unsigned long max = READ_ONCE(task->signal->rlim[limit].rlim_max);

        return max;
}

/* task_rlimit() applied to the current task. */
static inline unsigned long rlimit(unsigned int limit)
{
        unsigned long cur = task_rlimit(current, limit);

        return cur;
}

/* task_rlimit_max() applied to the current task. */
static inline unsigned long rlimit_max(unsigned int limit)
{
        unsigned long max = task_rlimit_max(current, limit);

        return max;
}

#endif /* _LINUX_SCHED_SIGNAL_H */





















































    1 






    1 






















































    1 






































    1 



















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_VIRTIO_NET_H
#define _LINUX_VIRTIO_NET_H

#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/udp.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/virtio_net.h>

/*
 * Check that the L3 @protocol is compatible with the virtio @gso_type
 * (the ECN bit is ignored): TCPv4 requires ETH_P_IP, TCPv6 requires
 * ETH_P_IPV6, both UDP variants accept either.  Any other type is
 * reported as a mismatch.
 */
static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type)
{
        const unsigned int type = gso_type & ~VIRTIO_NET_HDR_GSO_ECN;
        const __be16 ipv4 = cpu_to_be16(ETH_P_IP);
        const __be16 ipv6 = cpu_to_be16(ETH_P_IPV6);

        if (type == VIRTIO_NET_HDR_GSO_TCPV4)
                return protocol == ipv4;

        if (type == VIRTIO_NET_HDR_GSO_TCPV6)
                return protocol == ipv6;

        if (type == VIRTIO_NET_HDR_GSO_UDP ||
            type == VIRTIO_NET_HDR_GSO_UDP_L4)
                return protocol == ipv4 || protocol == ipv6;

        return false;
}

static inline int virtio_net_hdr_set_proto(struct sk_buff *skb,
                                           const struct virtio_net_hdr *hdr)
{
        if (skb->protocol)
                return 0;

        switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
        case VIRTIO_NET_HDR_GSO_TCPV4:
        case VIRTIO_NET_HDR_GSO_UDP:
        case VIRTIO_NET_HDR_GSO_UDP_L4:
                skb->protocol = cpu_to_be16(ETH_P_IP);
                break;
        case VIRTIO_NET_HDR_GSO_TCPV6:
                skb->protocol = cpu_to_be16(ETH_P_IPV6);
                break;
        default:
                return -EINVAL;
        }

        return 0;
}

/*
 * Validate a virtio_net_hdr and apply its checksum and GSO metadata to @skb.
 *
 * @skb:           packet the header describes
 * @hdr:           virtio-net header to interpret
 * @little_endian: passed to __virtio16_to_cpu() when decoding the 16-bit
 *                 header fields
 *
 * Returns 0 on success and -EINVAL when the header is rejected: unknown
 * GSO type, zero gso_size, headers that cannot be pulled, a failed
 * partial-checksum setup, a failed or mismatching flow dissection, or a
 * UDP_L4/GSO_BY_FRAGS constraint violation.
 */
static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
                                        const struct virtio_net_hdr *hdr,
                                        bool little_endian)
{
        unsigned int nh_min_len = sizeof(struct iphdr);
        unsigned int gso_type = 0;
        unsigned int thlen = 0;
        unsigned int p_off = 0;
        unsigned int ip_proto;

        /*
         * Translate the virtio GSO type into the SKB_GSO_* flag, the IP
         * protocol the packet must carry and the minimum transport header
         * length; TCPv6 also raises the minimum network header length.
         */
        if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
                case VIRTIO_NET_HDR_GSO_TCPV4:
                        gso_type = SKB_GSO_TCPV4;
                        ip_proto = IPPROTO_TCP;
                        thlen = sizeof(struct tcphdr);
                        break;
                case VIRTIO_NET_HDR_GSO_TCPV6:
                        gso_type = SKB_GSO_TCPV6;
                        ip_proto = IPPROTO_TCP;
                        thlen = sizeof(struct tcphdr);
                        nh_min_len = sizeof(struct ipv6hdr);
                        break;
                case VIRTIO_NET_HDR_GSO_UDP:
                        gso_type = SKB_GSO_UDP;
                        ip_proto = IPPROTO_UDP;
                        thlen = sizeof(struct udphdr);
                        break;
                case VIRTIO_NET_HDR_GSO_UDP_L4:
                        gso_type = SKB_GSO_UDP_L4;
                        ip_proto = IPPROTO_UDP;
                        thlen = sizeof(struct udphdr);
                        break;
                default:
                        return -EINVAL;
                }

                if (hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
                        gso_type |= SKB_GSO_TCP_ECN;

                /* Zero is zero in either byte order, so no conversion is needed. */
                if (hdr->gso_size == 0)
                        return -EINVAL;
        }

        skb_reset_mac_header(skb);

        if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
                u32 start = __virtio16_to_cpu(little_endian, hdr->csum_start);
                u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset);
                /*
                 * Bytes that must be pullable: up to the end of the checksum
                 * field or of the minimal transport header, whichever is
                 * further from csum_start.
                 */
                u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16));

                if (!pskb_may_pull(skb, needed))
                        return -EINVAL;

                if (!skb_partial_csum_set(skb, start, off))
                        return -EINVAL;

                nh_min_len = max_t(u32, nh_min_len, skb_transport_offset(skb));
                p_off = nh_min_len + thlen;
                if (!pskb_may_pull(skb, p_off))
                        return -EINVAL;
        } else {
                /* gso packets without NEEDS_CSUM do not set transport_offset.
                 * probe and drop if does not match one of the above types.
                 */
                if (gso_type && skb->network_header) {
                        struct flow_keys_basic keys;

                        if (!skb->protocol) {
                                __be16 protocol = dev_parse_header_protocol(skb);

                                if (!protocol)
                                        virtio_net_hdr_set_proto(skb, hdr);
                                else if (!virtio_net_hdr_match_proto(protocol, hdr->gso_type))
                                        return -EINVAL;
                                else
                                        skb->protocol = protocol;
                        }
retry:
                        if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
                                                              NULL, 0, 0, 0,
                                                              0)) {
                                /* UFO does not specify ipv4 or 6: try both */
                                if (gso_type & SKB_GSO_UDP &&
                                    skb->protocol == htons(ETH_P_IP)) {
                                        skb->protocol = htons(ETH_P_IPV6);
                                        goto retry;
                                }
                                return -EINVAL;
                        }

                        p_off = keys.control.thoff + thlen;
                        if (!pskb_may_pull(skb, p_off) ||
                            keys.basic.ip_proto != ip_proto)
                                return -EINVAL;

                        skb_set_transport_header(skb, keys.control.thoff);
                } else if (gso_type) {
                        p_off = nh_min_len + thlen;
                        if (!pskb_may_pull(skb, p_off))
                                return -EINVAL;
                }
        }

        if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size);
                unsigned int nh_off = p_off;
                struct skb_shared_info *shinfo = skb_shinfo(skb);

                switch (gso_type & ~SKB_GSO_TCP_ECN) {
                case SKB_GSO_UDP:
                        /* UFO may not include transport header in gso_size. */
                        nh_off -= thlen;
                        break;
                case SKB_GSO_UDP_L4:
                        if (!(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM))
                                return -EINVAL;
                        if (skb->csum_offset != offsetof(struct udphdr, check))
                                return -EINVAL;
                        if (skb->len - p_off > gso_size * UDP_MAX_SEGMENTS)
                                return -EINVAL;
                        /* The ECN flag is not accepted together with UDP_L4. */
                        if (gso_type != SKB_GSO_UDP_L4)
                                return -EINVAL;
                        break;
                }

                /* Kernel has a special handling for GSO_BY_FRAGS. */
                if (gso_size == GSO_BY_FRAGS)
                        return -EINVAL;

                /* Too small packets are not really GSO ones. */
                if (skb->len - nh_off > gso_size) {
                        shinfo->gso_size = gso_size;
                        shinfo->gso_type = gso_type;

                        /* Header must be checked, and gso_segs computed. */
                        shinfo->gso_type |= SKB_GSO_DODGY;
                        shinfo->gso_segs = 0;
                }
        }

        return 0;
}

/*
 * Build a virtio_net_hdr describing @skb.
 *
 * @skb:            packet to describe
 * @hdr:            header to fill; always zeroed first
 * @little_endian:  passed to __cpu_to_virtio16() when encoding 16-bit fields
 * @has_data_valid: when true, CHECKSUM_UNNECESSARY is reported through
 *                  VIRTIO_NET_HDR_F_DATA_VALID
 * @vlan_hlen:      added to the checksum start offset
 *
 * Returns 0 on success, or -EINVAL when the skb is GSO but its gso_type
 * has none of the TCPV4/TCPV6/UDP_L4 bits set (hdr_len and gso_size have
 * already been written at that point).
 */
static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
                                          struct virtio_net_hdr *hdr,
                                          bool little_endian,
                                          bool has_data_valid,
                                          int vlan_hlen)
{
        /* Start from all-zero so no stale bytes are exposed. */
        memset(hdr, 0, sizeof(*hdr));

        if (!skb_is_gso(skb)) {
                hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
        } else {
                struct skb_shared_info *shinfo = skb_shinfo(skb);
                __u8 vtype;

                /* hdr_len is only a hint as to how much should be linear. */
                hdr->hdr_len = __cpu_to_virtio16(little_endian,
                                                 skb_headlen(skb));
                hdr->gso_size = __cpu_to_virtio16(little_endian,
                                                  shinfo->gso_size);

                if (shinfo->gso_type & SKB_GSO_TCPV4)
                        vtype = VIRTIO_NET_HDR_GSO_TCPV4;
                else if (shinfo->gso_type & SKB_GSO_TCPV6)
                        vtype = VIRTIO_NET_HDR_GSO_TCPV6;
                else if (shinfo->gso_type & SKB_GSO_UDP_L4)
                        vtype = VIRTIO_NET_HDR_GSO_UDP_L4;
                else
                        return -EINVAL;

                if (shinfo->gso_type & SKB_GSO_TCP_ECN)
                        vtype |= VIRTIO_NET_HDR_GSO_ECN;

                hdr->gso_type = vtype;
        }

        switch (skb->ip_summed) {
        case CHECKSUM_PARTIAL:
                hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
                hdr->csum_start = __cpu_to_virtio16(little_endian,
                        skb_checksum_start_offset(skb) + vlan_hlen);
                hdr->csum_offset = __cpu_to_virtio16(little_endian,
                                skb->csum_offset);
                break;
        case CHECKSUM_UNNECESSARY:
                if (has_data_valid)
                        hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
                break;
        default:
                /* everything stays zero */
                break;
        }

        return 0;
}

#endif /* _LINUX_VIRTIO_NET_H */






























    1 













    1 



    1 



















    1 

    1 


    1 


    1 
    1 













    1 

    1 


    1 
    1 







































































    1 







    1 
    1 

    1 



    1 



    1 
    1 

    1 
    1 




    1 

    1 





    1 

    1 














    1 



    1 



    1 

    1 













    1 
    1 
    1 


















    1 









    1 


    1 
    1 


    1 




    1 































    1 

    1 

























    1 

    1 







































    1 















    1 



















































































































































































    3 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_MM_INLINE_H
#define LINUX_MM_INLINE_H

#include <linux/atomic.h>
#include <linux/huge_mm.h>
#include <linux/mm_types.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/userfaultfd_k.h>
#include <linux/swapops.h>

/**
 * folio_is_file_lru - Should the folio be on a file LRU or anon LRU?
 * @folio: The folio to test.
 *
 * The answer is derived from the swapbacked state rather than from a
 * dedicated page flag, because it has to stay valid until the folio is
 * deleted from the LRU for the last time, which could be as far down as
 * __page_cache_release.
 *
 * Return: An integer (not a boolean!) used to sort a folio onto the
 * right LRU list and to account folios correctly.
 * 1 if @folio is a regular filesystem backed page cache folio
 * or a lazily freed anonymous folio (e.g. via MADV_FREE).
 * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise
 * ram or swap backed folio.
 */
static inline int folio_is_file_lru(struct folio *folio)
{
        if (folio_test_swapbacked(folio))
                return 0;

        return 1;
}

/* Page flavour of folio_is_file_lru(): answers for the folio containing @page. */
static inline int page_is_file_lru(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_is_file_lru(folio);
}

/*
 * Adjust the size counters of LRU list @lru by @nr_pages, both in the
 * lruvec state (NR_LRU_BASE + lru) and in the state of zone @zid of the
 * lruvec's node (NR_ZONE_LRU_BASE + lru).
 *
 * lruvec->lru_lock must be held (asserted via lockdep).  A one-time
 * warning is emitted if @nr_pages does not survive a round trip through int.
 */
static __always_inline void __update_lru_size(struct lruvec *lruvec,
                                enum lru_list lru, enum zone_type zid,
                                long nr_pages)
{
        struct pglist_data *node = lruvec_pgdat(lruvec);

        lockdep_assert_held(&lruvec->lru_lock);
        WARN_ON_ONCE(nr_pages != (int)nr_pages);

        /* per-lruvec counter first, then the per-zone one */
        __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
        __mod_zone_page_state(&node->node_zones[zid], NR_ZONE_LRU_BASE + lru,
                              nr_pages);
}

/*
 * Adjust the size of LRU list @lru in zone @zid by @nr_pages through
 * __update_lru_size(); with CONFIG_MEMCG the same change is also passed to
 * mem_cgroup_update_lru_size().
 */
static __always_inline void update_lru_size(struct lruvec *lruvec,
                                enum lru_list lru, enum zone_type zid,
                                long nr_pages)
{
        __update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
        mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif
}

/**
 * __folio_clear_lru_flags - Clear page lru flags before releasing a page.
 * @folio: The folio that was on lru and now has a zero reference.
 *
 * The lru flag is always cleared.  The active and unevictable flags are
 * cleared as well, unless both are set at once: that combination
 * shouldn't happen, so the flags are left in place for bad_page().
 */
static __always_inline void __folio_clear_lru_flags(struct folio *folio)
{
        VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio);

        __folio_clear_lru(folio);

        if (!folio_test_active(folio) || !folio_test_unevictable(folio)) {
                __folio_clear_active(folio);
                __folio_clear_unevictable(folio);
        }
}

/**
 * folio_lru_list - Which LRU list should a folio be on?
 * @folio: The folio to test.
 *
 * Unevictable folios map to LRU_UNEVICTABLE.  Otherwise the inactive file
 * or anon list is chosen via folio_is_file_lru() and shifted by
 * LRU_ACTIVE when the folio is active.  A folio that is both active and
 * unevictable triggers VM_BUG_ON_FOLIO().
 *
 * Return: The LRU list a folio should be on, as an index
 * into the array of LRU lists.
 */
static __always_inline enum lru_list folio_lru_list(struct folio *folio)
{
        enum lru_list lru;

        VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio);

        if (folio_test_unevictable(folio))
                return LRU_UNEVICTABLE;

        if (folio_is_file_lru(folio))
                lru = LRU_INACTIVE_FILE;
        else
                lru = LRU_INACTIVE_ANON;

        if (folio_test_active(folio))
                lru += LRU_ACTIVE;

        return lru;
}

#ifdef CONFIG_LRU_GEN

/*
 * lru_gen_enabled() tests the LRU_GEN_CORE entry of the lru_gen_caps[]
 * static keys.  The two variants differ only in the declared default of
 * the key (true with CONFIG_LRU_GEN_ENABLED, false otherwise) and the
 * matching likely/unlikely branch hint.
 */
#ifdef CONFIG_LRU_GEN_ENABLED
static inline bool lru_gen_enabled(void)
{
        DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);

        return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
}
#else
static inline bool lru_gen_enabled(void)
{
        DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);

        return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
}
#endif

/* Report the in_lru_fault flag of the current task. */
static inline bool lru_gen_in_fault(void)
{
        bool in_fault = current->in_lru_fault;

        return in_fault;
}

/* Map a sequence number to a generation index: @seq modulo MAX_NR_GENS. */
static inline int lru_gen_from_seq(unsigned long seq)
{
        unsigned long gen = seq % MAX_NR_GENS;

        return gen;
}

/* Map a sequence number to a history index: @seq modulo NR_HIST_GENS. */
static inline int lru_hist_from_seq(unsigned long seq)
{
        unsigned long hist = seq % NR_HIST_GENS;

        return hist;
}

/*
 * Convert a reference count as returned by folio_lru_refs() into a tier:
 * order_base_2(@refs + 1).  Warns once if @refs exceeds
 * BIT(LRU_REFS_WIDTH).
 */
static inline int lru_tier_from_refs(int refs)
{
        int tier;

        VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));

        /* the +1 compensates for the off-by-one described in folio_lru_refs() */
        tier = order_base_2(refs + 1);

        return tier;
}

static inline int folio_lru_refs(struct folio *folio)
{
        unsigned long flags = READ_ONCE(folio->flags);
        bool workingset = flags & BIT(PG_workingset);

        /*
         * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
         * total number of accesses is N>1, since N=0,1 both map to the first
         * tier. lru_tier_from_refs() will account for this off-by-one. Also see
         * the comment on MAX_NR_TIERS.
         */
        return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
}

/*
 * Return the generation stored in folio->flags.  The LRU_GEN_MASK field
 * holds the generation plus one, so an all-zero field yields -1.
 */
static inline int folio_lru_gen(struct folio *folio)
{
        unsigned long flags = READ_ONCE(folio->flags);
        unsigned long field = (flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF;

        return field - 1;
}

static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
{
        unsigned long max_seq = lruvec->lrugen.max_seq;

        VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);

        /* see the comment on MIN_NR_GENS */
        return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
}

/*
 * Account for @folio moving from generation @old_gen to @new_gen.
 *
 * A generation of -1 means "not on any generation": old_gen == -1 is an
 * addition, new_gen == -1 a deletion; both being -1 triggers a warning.
 * The per-generation nr_pages[][][] counters are updated with WRITE_ONCE(),
 * and the classic inactive/active LRU size counters are kept in step via
 * __update_lru_size(), using lru_gen_is_active() to decide which of the
 * two a generation belongs to.
 */
static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio,
                                       int old_gen, int new_gen)
{
        int type = folio_is_file_lru(folio);
        int zone = folio_zonenum(folio);
        int delta = folio_nr_pages(folio);
        /* type is 0 or 1, so this is 0 for anon and LRU_INACTIVE_FILE for file */
        enum lru_list lru = type * LRU_INACTIVE_FILE;
        struct lru_gen_folio *lrugen = &lruvec->lrugen;

        VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
        VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
        VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);

        if (old_gen >= 0)
                WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
                           lrugen->nr_pages[old_gen][type][zone] - delta);
        if (new_gen >= 0)
                WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
                           lrugen->nr_pages[new_gen][type][zone] + delta);

        /* addition */
        if (old_gen < 0) {
                if (lru_gen_is_active(lruvec, new_gen))
                        lru += LRU_ACTIVE;
                __update_lru_size(lruvec, lru, zone, delta);
                return;
        }

        /* deletion */
        if (new_gen < 0) {
                if (lru_gen_is_active(lruvec, old_gen))
                        lru += LRU_ACTIVE;
                __update_lru_size(lruvec, lru, zone, -delta);
                return;
        }

        /* promotion */
        if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
                __update_lru_size(lruvec, lru, zone, -delta);
                __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
        }

        /* demotion requires isolation, e.g., lru_deactivate_fn() */
        VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}

/*
 * Try to place @folio on one of @lruvec's generation lists.
 *
 * Returns false, without touching the folio, when the folio is
 * unevictable or lrugen is not enabled.  Otherwise a generation is chosen
 * (see the cases below), stored in folio->flags while PG_active is
 * cleared, the size counters are updated, and the folio is linked at the
 * tail of the list when @reclaiming is set or at the head otherwise; true
 * is then returned.
 */
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
        unsigned long seq;
        unsigned long flags;
        int gen = folio_lru_gen(folio);
        int type = folio_is_file_lru(folio);
        int zone = folio_zonenum(folio);
        struct lru_gen_folio *lrugen = &lruvec->lrugen;

        /* the folio must not already carry a generation */
        VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);

        if (folio_test_unevictable(folio) || !lrugen->enabled)
                return false;
        /*
         * There are four common cases for this page:
         * 1. If it's hot, i.e., freshly faulted in, add it to the youngest
         *    generation, and it's protected over the rest below.
         * 2. If it can't be evicted immediately, i.e., a dirty page pending
         *    writeback, add it to the second youngest generation.
         * 3. If it should be evicted first, e.g., cold and clean from
         *    folio_rotate_reclaimable(), add it to the oldest generation.
         * 4. Everything else falls between 2 & 3 above and is added to the
         *    second oldest generation if it's considered inactive, or the
         *    oldest generation otherwise. See lru_gen_is_active().
         */
        if (folio_test_active(folio))
                seq = lrugen->max_seq;
        else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
                 (folio_test_reclaim(folio) &&
                  (folio_test_dirty(folio) || folio_test_writeback(folio))))
                seq = lrugen->max_seq - 1;
        else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq)
                seq = lrugen->min_seq[type];
        else
                seq = lrugen->min_seq[type] + 1;

        gen = lru_gen_from_seq(seq);
        /* the flags field stores gen + 1 so that zero can mean "no generation" */
        flags = (gen + 1UL) << LRU_GEN_PGOFF;
        /* see the comment on MIN_NR_GENS about PG_active */
        set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);

        lru_gen_update_size(lruvec, folio, -1, gen);
        /* for folio_rotate_reclaimable() */
        if (reclaiming)
                list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
        else
                list_add(&folio->lru, &lrugen->folios[gen][type][zone]);

        return true;
}

/*
 * Remove @folio from its generation list, if it is on one.
 *
 * Returns false when the folio carries no generation (folio_lru_gen() < 0).
 * Otherwise the generation field in folio->flags is cleared, PG_active is
 * set when the folio was in an active generation and @reclaiming is false,
 * the size counters are updated, the folio is unlinked, and true is
 * returned.
 */
static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
        unsigned long flags;
        int gen = folio_lru_gen(folio);

        if (gen < 0)
                return false;

        VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
        VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);

        /* for folio_migrate_flags() */
        flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
        flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags);
        /* recompute gen from the value set_mask_bits() handed back */
        gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;

        lru_gen_update_size(lruvec, folio, gen, -1);
        list_del(&folio->lru);

        return true;
}

#else /* !CONFIG_LRU_GEN */

/*
 * Stubs for kernels built without CONFIG_LRU_GEN: every helper reports
 * false, so the callers in this header fall through to the classic LRU
 * list handling.
 */
static inline bool lru_gen_enabled(void)
{
        return false;
}

static inline bool lru_gen_in_fault(void)
{
        return false;
}

static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
        return false;
}

static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
        return false;
}

#endif /* CONFIG_LRU_GEN */

/*
 * Add @folio to @lruvec.  If lru_gen_add_folio() takes the folio nothing
 * else is done; otherwise the size of the list chosen by folio_lru_list()
 * is increased and, unless that list is LRU_UNEVICTABLE, the folio is
 * linked at its head.
 */
static __always_inline
void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
{
        enum lru_list lru = folio_lru_list(folio);

        if (lru_gen_add_folio(lruvec, folio, false))
                return;

        update_lru_size(lruvec, lru, folio_zonenum(folio),
                        folio_nr_pages(folio));

        /* unevictable folios are counted but not linked */
        if (lru == LRU_UNEVICTABLE)
                return;

        list_add(&folio->lru, &lruvec->lists[lru]);
}

static __always_inline
void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
{
        enum lru_list lru = folio_lru_list(folio);

        if (lru_gen_add_folio(lruvec, folio, true))
                return;

        update_lru_size(lruvec, lru, folio_zonenum(folio),
                        folio_nr_pages(folio));
        /* This is not expected to be used on LRU_UNEVICTABLE */
        list_add_tail(&folio->lru, &lruvec->lists[lru]);
}

static __always_inline
void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
{
        enum lru_list lru = folio_lru_list(folio);

        if (lru_gen_del_folio(lruvec, folio, false))
                return;

        if (lru != LRU_UNEVICTABLE)
                list_del(&folio->lru);
        update_lru_size(lruvec, lru, folio_zonenum(folio),
                        -folio_nr_pages(folio));
}

#ifdef CONFIG_ANON_VMA_NAME
/*
 * Take a reference on @anon_name; a NULL pointer is ignored.
 * mmap_lock should be read-locked.
 */
static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
{
        if (!anon_name)
                return;

        kref_get(&anon_name->kref);
}

/*
 * Drop a reference on @anon_name, with anon_vma_name_free() as the kref
 * release callback; a NULL pointer is ignored.
 */
static inline void anon_vma_name_put(struct anon_vma_name *anon_name)
{
        if (!anon_name)
                return;

        kref_put(&anon_name->kref, anon_vma_name_free);
}

/*
 * Return a name object usable by another VMA: normally @anon_name itself
 * with an extra reference.  Once its refcount has reached REFCOUNT_MAX a
 * fresh object is allocated from the same string instead, so that
 * saturation is prevented early on.  @anon_name must not be NULL.
 */
static inline
struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name)
{
        if (kref_read(&anon_name->kref) >= REFCOUNT_MAX)
                return anon_vma_name_alloc(anon_name->name);

        anon_vma_name_get(anon_name);
        return anon_name;
}

/*
 * Give @new_vma the anonymous name of @orig_vma, obtained through
 * anon_vma_name_reuse().  When @orig_vma has no name, @new_vma->anon_name
 * is left as it is.
 */
static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
                                     struct vm_area_struct *new_vma)
{
        struct anon_vma_name *src = anon_vma_name(orig_vma);

        if (!src)
                return;

        new_vma->anon_name = anon_vma_name_reuse(src);
}

/* Drop @vma's reference on its anonymous name, if it has one. */
static inline void free_anon_vma_name(struct vm_area_struct *vma)
{
        /*
         * The field is read directly instead of through anon_vma_name():
         * that helper generates a warning if mmap_lock is not held, which
         * might be the case here.
         */
        struct anon_vma_name *name = vma->anon_name;

        anon_vma_name_put(name);
}

static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
                                    struct anon_vma_name *anon_name2)
{
        if (anon_name1 == anon_name2)
                return true;

        return anon_name1 && anon_name2 &&
                !strcmp(anon_name1->name, anon_name2->name);
}

#else /* CONFIG_ANON_VMA_NAME */
/*
 * Stubs for kernels built without CONFIG_ANON_VMA_NAME: the reference
 * helpers do nothing and any two names compare equal.
 */
static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
                                     struct vm_area_struct *new_vma) {}
static inline void free_anon_vma_name(struct vm_area_struct *vma) {}

static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
                                    struct anon_vma_name *anon_name2)
{
        return true;
}

#endif  /* CONFIG_ANON_VMA_NAME */

/* Reset @mm's tlb_flush_pending counter to zero. */
static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
        atomic_set(&mm->tlb_flush_pending, 0);
}

/*
 * Increment @mm's tlb_flush_pending counter.  No explicit barrier is
 * issued here; the comment in the body explains the ordering relied upon.
 */
static inline void inc_tlb_flush_pending(struct mm_struct *mm)
{
        atomic_inc(&mm->tlb_flush_pending);
        /*
         * The only time this value is relevant is when there are indeed pages
         * to flush. And we'll only flush pages after changing them, which
         * requires the PTL.
         *
         * So the ordering here is:
         *
         *        atomic_inc(&mm->tlb_flush_pending);
         *        spin_lock(&ptl);
         *        ...
         *        set_pte_at();
         *        spin_unlock(&ptl);
         *
         *                                spin_lock(&ptl)
         *                                mm_tlb_flush_pending();
         *                                ....
         *                                spin_unlock(&ptl);
         *
         *        flush_tlb_range();
         *        atomic_dec(&mm->tlb_flush_pending);
         *
         * Where the increment is constrained by the PTL unlock, it thus
         * ensures that the increment is visible if the PTE modification is
         * visible. After all, if there is no PTE modification, nobody cares
         * about TLB flushes either.
         *
         * This very much relies on users (mm_tlb_flush_pending() and
         * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and
         * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc
         * locks (PPC) the unlock of one doesn't order against the lock of
         * another PTL.
         *
         * The decrement is ordered by the flush_tlb_range(), such that
         * mm_tlb_flush_pending() will not return false unless all flushes have
         * completed.
         */
}

/*
 * Drop one pending-flush count from mm->tlb_flush_pending; the counterpart
 * of inc_tlb_flush_pending(). No explicit barrier is issued here - see the
 * comment in the body for why.
 */
static inline void dec_tlb_flush_pending(struct mm_struct *mm)
{
        /*
         * See inc_tlb_flush_pending().
         *
         * This cannot be smp_mb__before_atomic() because smp_mb() simply does
         * not order against TLB invalidate completion, which is what we need.
         *
         * Therefore we must rely on tlb_flush_*() to guarantee order.
         */
        atomic_dec(&mm->tlb_flush_pending);
}

/*
 * Report whether mm->tlb_flush_pending is currently non-zero.
 *
 * The caller must already have taken the PTL.  The ordering against that
 * PTL's release is what guarantees that anyone who sees a modified PTE also
 * sees the increment done by inc_tlb_flush_pending().  Consequently a true
 * result is only guaranteed when a flush is pending for _this_ PTL.
 */
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
{
        int pending = atomic_read(&mm->tlb_flush_pending);

        return pending != 0;
}

/*
 * Report whether more than one TLB flush is pending on @mm, i.e. whether
 * mm->tlb_flush_pending is greater than one.
 *
 * As with mm_tlb_flush_pending(), the PTL for which a flush is pending must
 * have been acquired beforehand so that both the PTE modification and the
 * counter increment have been observed.  The PTL does not need to still be
 * held when this is called.
 */
static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
{
        int pending = atomic_read(&mm->tlb_flush_pending);

        return pending > 1;
}

#ifdef CONFIG_MMU
/*
 * Work out which pte marker bits of the source swap entry @entry should be
 * carried over into @dst_vma.
 *
 * Returns the marker to install, or 0 when nothing needs copying.  The caller
 * is expected to build the new pte with make_pte_marker().
 */
static inline pte_marker copy_pte_marker(
                swp_entry_t entry, struct vm_area_struct *dst_vma)
{
        pte_marker src = pte_marker_get(entry);
        pte_marker result;

        /* The poisoned (error) bit is propagated unconditionally. */
        result = src & PTE_MARKER_POISONED;

        /* Nothing more to do unless the source carries the uffd-wp bit. */
        if (!(src & PTE_MARKER_UFFD_WP))
                return result;

        /* The uffd-wp bit only follows if the destination is registered. */
        if (userfaultfd_wp(dst_vma))
                result |= PTE_MARKER_UFFD_WP;

        return result;
}
#endif

/*
 * If the old pte value @pteval was write-protected by uffd-wp in any form,
 * install the special uffd-wp pte marker in place of the (now none) pte.
 *
 * NOTE: only call this once *pte has already been cleared, so that nothing
 * valuable can be overwritten by accident.  Replacing a none pte is not a
 * demotion, so no TLB flush is needed here; the caller that cleared the pte
 * is responsible for whatever flush that required.
 *
 * The pgtable lock must be held, so that no other thread acts on the none
 * pte: anyone who does see it will fault and serialize on the pgtable lock.
 *
 * Compiles to a no-op when CONFIG_PTE_MARKER_UFFD_WP is not enabled.
 */
static inline void
pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
                              pte_t *pte, pte_t pteval)
{
#ifdef CONFIG_PTE_MARKER_UFFD_WP
        bool wp_present, wp_swap;

        /* Callers are required to have cleared the pte beforehand. */
        WARN_ON_ONCE(!pte_none(ptep_get(pte)));

        /*
         * NOTE: userfaultfd_wp_unpopulated() has no need for any of this:
         * a zap there either drops the page, or (in TTU) the present pte is
         * quickly replaced by a swap pte.  The bit cannot leak either way.
         */
        if (vma_is_anonymous(vma) || !userfaultfd_wp(vma))
                return;

        /* Case 1: a present pte that was uffd-wp write-protected. */
        wp_present = pte_present(pteval) && pte_uffd_wp(pteval);

        /*
         * Case 2: a swap pte that was uffd-wp write-protected.  This also
         * covers an already existing pte marker with the uffd-wp bit set.
         */
        wp_swap = pte_swp_uffd_wp_any(pteval);

        if (unlikely(wp_present || wp_swap))
                set_pte_at(vma->vm_mm, addr, pte,
                           make_pte_marker(PTE_MARKER_UFFD_WP));
#endif
}

/*
 * Returns false when @vma is flagged VM_SEQ_READ or VM_RAND_READ, or when it
 * maps a file whose f_mode has FMODE_NOREUSE set; returns true otherwise.
 */
static inline bool vma_has_recency(struct vm_area_struct *vma)
{
        /* Explicit access-pattern hints on the mapping rule it out first. */
        if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))
                return false;

        /* Otherwise only a NOREUSE backing file says "no". */
        return !(vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE));
}

#endif



































































































    1 




























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM irq_vectors

#if !defined(_TRACE_IRQ_VECTORS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_IRQ_VECTORS_H

#include <linux/tracepoint.h>
#include <asm/trace/common.h>

#ifdef CONFIG_X86_LOCAL_APIC

/*
 * Event class shared by the per-vector entry/exit events defined through
 * DEFINE_IRQ_VECTOR_EVENT(): it records the single int argument @vector and
 * prints it as "vector=%d".
 */
DECLARE_EVENT_CLASS(x86_irq_vector,

        TP_PROTO(int vector),

        TP_ARGS(vector),

        TP_STRUCT__entry(
                __field(                int,        vector        )
        ),

        TP_fast_assign(
                __entry->vector = vector;
        ),

        TP_printk("vector=%d", __entry->vector) );

/*
 * Define a pair of x86_irq_vector-class events, <name>_entry and
 * <name>_exit, each taking the int vector number.  Both trailing
 * DEFINE_EVENT_FN() arguments are passed as NULL.
 */
#define DEFINE_IRQ_VECTOR_EVENT(name)                \
DEFINE_EVENT_FN(x86_irq_vector, name##_entry,        \
        TP_PROTO(int vector),                        \
        TP_ARGS(vector), NULL, NULL);                \
DEFINE_EVENT_FN(x86_irq_vector, name##_exit,        \
        TP_PROTO(int vector),                        \
        TP_ARGS(vector), NULL, NULL);

/*
 * local_timer - called when entering/exiting a local timer interrupt
 * vector handler
 */
DEFINE_IRQ_VECTOR_EVENT(local_timer);

/*
 * spurious_apic - called when entering/exiting a spurious apic vector handler
 */
DEFINE_IRQ_VECTOR_EVENT(spurious_apic);

/*
 * error_apic - called when entering/exiting an error apic vector handler
 */
DEFINE_IRQ_VECTOR_EVENT(error_apic);

/*
 * x86_platform_ipi - called when entering/exiting an x86 platform ipi
 * interrupt vector handler
 */
DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi);

#ifdef CONFIG_IRQ_WORK
/*
 * irq_work - called when entering/exiting an irq work interrupt
 * vector handler
 */
DEFINE_IRQ_VECTOR_EVENT(irq_work);

/*
 * We must disallow sampling irq_work_exit() because perf event sampling
 * itself can cause irq_work, which would lead to an infinite loop:
 *
 *  1) irq_work_exit happens
 *  2) generates perf sample
 *  3) generates irq_work
 *  4) goto 1
 *
 * Hence sampling events are refused with -EPERM below; all other perf
 * events are allowed (0).
 */
TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? -EPERM : 0);
#endif

/*
 * The ifdef is required because that tracepoint macro hell emits tracepoint
 * code in files which include this header even if the tracepoint is not
 * enabled. Brilliant stuff that.
 */
#ifdef CONFIG_SMP
/*
 * reschedule - called when entering/exiting a reschedule vector handler
 */
DEFINE_IRQ_VECTOR_EVENT(reschedule);

/*
 * call_function - called when entering/exiting a call function interrupt
 * vector handler
 */
DEFINE_IRQ_VECTOR_EVENT(call_function);

/*
 * call_function_single - called when entering/exiting a call function
 * single interrupt vector handler
 */
DEFINE_IRQ_VECTOR_EVENT(call_function_single);
#endif

#ifdef CONFIG_X86_MCE_THRESHOLD
/*
 * threshold_apic - called when entering/exiting a threshold apic interrupt
 * vector handler
 */
DEFINE_IRQ_VECTOR_EVENT(threshold_apic);
#endif

#ifdef CONFIG_X86_MCE_AMD
/*
 * deferred_error_apic - called when entering/exiting a deferred error apic
 * interrupt vector handler
 */
DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic);
#endif

#ifdef CONFIG_X86_THERMAL_VECTOR
/*
 * thermal_apic - called when entering/exiting a thermal apic interrupt
 * vector handler
 */
DEFINE_IRQ_VECTOR_EVENT(thermal_apic);
#endif

/*
 * vector_config - records the four unsigned values @irq, @vector, @cpu and
 * @apicdest as passed in; @apicdest is printed in hex (0x%08x), the rest in
 * decimal.
 */
TRACE_EVENT(vector_config,

        TP_PROTO(unsigned int irq, unsigned int vector,
                 unsigned int cpu, unsigned int apicdest),

        TP_ARGS(irq, vector, cpu, apicdest),

        TP_STRUCT__entry(
                __field(        unsigned int,        irq                )
                __field(        unsigned int,        vector                )
                __field(        unsigned int,        cpu                )
                __field(        unsigned int,        apicdest        )
        ),

        TP_fast_assign(
                __entry->irq                = irq;
                __entry->vector                = vector;
                __entry->cpu                = cpu;
                __entry->apicdest        = apicdest;
        ),

        TP_printk("irq=%u vector=%u cpu=%u apicdest=0x%08x",
                  __entry->irq, __entry->vector, __entry->cpu,
                  __entry->apicdest)
);

/*
 * Event class for events instantiated via DEFINE_IRQ_VECTOR_MOD_EVENT():
 * records @irq together with the current (@vector, @cpu) pair and the
 * previous (@prev_vector, @prev_cpu) pair, all printed as unsigned decimals.
 */
DECLARE_EVENT_CLASS(vector_mod,

        TP_PROTO(unsigned int irq, unsigned int vector,
                 unsigned int cpu, unsigned int prev_vector,
                 unsigned int prev_cpu),

        TP_ARGS(irq, vector, cpu, prev_vector, prev_cpu),

        TP_STRUCT__entry(
                __field(        unsigned int,        irq                )
                __field(        unsigned int,        vector                )
                __field(        unsigned int,        cpu                )
                __field(        unsigned int,        prev_vector        )
                __field(        unsigned int,        prev_cpu        )
        ),

        TP_fast_assign(
                __entry->irq                = irq;
                __entry->vector                = vector;
                __entry->cpu                = cpu;
                __entry->prev_vector        = prev_vector;
                __entry->prev_cpu        = prev_cpu;

        ),

        TP_printk("irq=%u vector=%u cpu=%u prev_vector=%u prev_cpu=%u",
                  __entry->irq, __entry->vector, __entry->cpu,
                  __entry->prev_vector, __entry->prev_cpu)
);

/*
 * Define one vector_mod-class event called <name>, passing NULL for both
 * trailing DEFINE_EVENT_FN() arguments.
 *
 * Note that the last line of the macro ends in a backslash, so the macro
 * body continues onto the following (empty) line; that empty line must stay.
 */
#define DEFINE_IRQ_VECTOR_MOD_EVENT(name)                                \
DEFINE_EVENT_FN(vector_mod, name,                                        \
        TP_PROTO(unsigned int irq, unsigned int vector,                        \
                 unsigned int cpu, unsigned int prev_vector,                \
                 unsigned int prev_cpu),                                \
        TP_ARGS(irq, vector, cpu, prev_vector, prev_cpu), NULL, NULL);        \

DEFINE_IRQ_VECTOR_MOD_EVENT(vector_update);
DEFINE_IRQ_VECTOR_MOD_EVENT(vector_clear);

/*
 * Event class for events instantiated via DEFINE_IRQ_VECTOR_RESERVE_EVENT():
 * records @irq (unsigned) and the int result code @ret unchanged.
 */
DECLARE_EVENT_CLASS(vector_reserve,

        TP_PROTO(unsigned int irq, int ret),

        TP_ARGS(irq, ret),

        TP_STRUCT__entry(
                __field(        unsigned int,        irq        )
                __field(        int,                ret        )
        ),

        TP_fast_assign(
                __entry->irq = irq;
                __entry->ret = ret;
        ),

        TP_printk("irq=%u ret=%d", __entry->irq, __entry->ret)
);

/*
 * Define one vector_reserve-class event called <name>, passing NULL for both
 * trailing DEFINE_EVENT_FN() arguments.
 *
 * The final backslash continues the macro body onto the following (empty)
 * line, so that empty line must stay.
 */
#define DEFINE_IRQ_VECTOR_RESERVE_EVENT(name)        \
DEFINE_EVENT_FN(vector_reserve, name,        \
        TP_PROTO(unsigned int irq, int ret),        \
        TP_ARGS(irq, ret), NULL, NULL);                \

DEFINE_IRQ_VECTOR_RESERVE_EVENT(vector_reserve_managed);
DEFINE_IRQ_VECTOR_RESERVE_EVENT(vector_reserve);

/*
 * vector_alloc - records @irq, @vector, @reserved and @ret, with two
 * normalisations applied at record time:
 *  - the stored vector is forced to 0 when @ret is negative;
 *  - the stored ret is forced to 0 when @ret is positive, so only zero or
 *    negative values are ever logged.
 */
TRACE_EVENT(vector_alloc,

        TP_PROTO(unsigned int irq, unsigned int vector, bool reserved,
                 int ret),

        TP_ARGS(irq, vector, reserved, ret),

        TP_STRUCT__entry(
                __field(        unsigned int,        irq                )
                __field(        unsigned int,        vector                )
                __field(        bool,                reserved        )
                __field(        int,                ret                )
        ),

        TP_fast_assign(
                __entry->irq                = irq;
                __entry->vector                = ret < 0 ? 0 : vector;
                __entry->reserved        = reserved;
                __entry->ret                = ret > 0 ? 0 : ret;
        ),

        TP_printk("irq=%u vector=%u reserved=%d ret=%d",
                  __entry->irq, __entry->vector,
                  __entry->reserved, __entry->ret)
);

/*
 * vector_alloc_managed - records @irq, @vector and @ret, with the same
 * normalisation as vector_alloc: the stored vector is 0 when @ret is
 * negative, and the stored ret is 0 when @ret is positive.
 */
TRACE_EVENT(vector_alloc_managed,

        TP_PROTO(unsigned int irq, unsigned int vector,
                 int ret),

        TP_ARGS(irq, vector, ret),

        TP_STRUCT__entry(
                __field(        unsigned int,        irq                )
                __field(        unsigned int,        vector                )
                __field(        int,                ret                )
        ),

        TP_fast_assign(
                __entry->irq                = irq;
                __entry->vector                = ret < 0 ? 0 : vector;
                __entry->ret                = ret > 0 ? 0 : ret;
        ),

        TP_printk("irq=%u vector=%u ret=%d",
                  __entry->irq, __entry->vector, __entry->ret)
);

/*
 * Event class for events instantiated via DEFINE_IRQ_VECTOR_ACTIVATE_EVENT():
 * records @irq plus the three boolean flags @is_managed, @can_reserve and
 * @reserve, each printed with %d.
 */
DECLARE_EVENT_CLASS(vector_activate,

        TP_PROTO(unsigned int irq, bool is_managed, bool can_reserve,
                 bool reserve),

        TP_ARGS(irq, is_managed, can_reserve, reserve),

        TP_STRUCT__entry(
                __field(        unsigned int,        irq                )
                __field(        bool,                is_managed        )
                __field(        bool,                can_reserve        )
                __field(        bool,                reserve                )
        ),

        TP_fast_assign(
                __entry->irq                = irq;
                __entry->is_managed        = is_managed;
                __entry->can_reserve        = can_reserve;
                __entry->reserve        = reserve;
        ),

        TP_printk("irq=%u is_managed=%d can_reserve=%d reserve=%d",
                  __entry->irq, __entry->is_managed, __entry->can_reserve,
                  __entry->reserve)
);

/*
 * Define one vector_activate-class event called <name>, passing NULL for
 * both trailing DEFINE_EVENT_FN() arguments.
 *
 * The final backslash continues the macro body onto the following (empty)
 * line, so that empty line must stay.
 */
#define DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(name)                                \
DEFINE_EVENT_FN(vector_activate, name,                                        \
        TP_PROTO(unsigned int irq, bool is_managed,                        \
                 bool can_reserve, bool reserve),                        \
        TP_ARGS(irq, is_managed, can_reserve, reserve), NULL, NULL);        \

DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_activate);
DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_deactivate);

/*
 * vector_teardown - records @irq and the boolean flags @is_managed and
 * @has_reserved unchanged.
 */
TRACE_EVENT(vector_teardown,

        TP_PROTO(unsigned int irq, bool is_managed, bool has_reserved),

        TP_ARGS(irq, is_managed, has_reserved),

        TP_STRUCT__entry(
                __field(        unsigned int,        irq                )
                __field(        bool,                is_managed        )
                __field(        bool,                has_reserved        )
        ),

        TP_fast_assign(
                __entry->irq                = irq;
                __entry->is_managed        = is_managed;
                __entry->has_reserved        = has_reserved;
        ),

        TP_printk("irq=%u is_managed=%d has_reserved=%d",
                  __entry->irq, __entry->is_managed, __entry->has_reserved)
);

/*
 * vector_setup - records @irq, the boolean @is_legacy and the int result
 * code @ret unchanged.
 */
TRACE_EVENT(vector_setup,

        TP_PROTO(unsigned int irq, bool is_legacy, int ret),

        TP_ARGS(irq, is_legacy, ret),

        TP_STRUCT__entry(
                __field(        unsigned int,        irq                )
                __field(        bool,                is_legacy        )
                __field(        int,                ret                )
        ),

        TP_fast_assign(
                __entry->irq                = irq;
                __entry->is_legacy        = is_legacy;
                __entry->ret                = ret;
        ),

        TP_printk("irq=%u is_legacy=%d ret=%d",
                  __entry->irq, __entry->is_legacy, __entry->ret)
);

/*
 * vector_free_moved - records @irq, @cpu, @vector and the boolean
 * @is_managed unchanged.
 */
TRACE_EVENT(vector_free_moved,

        TP_PROTO(unsigned int irq, unsigned int cpu, unsigned int vector,
                 bool is_managed),

        TP_ARGS(irq, cpu, vector, is_managed),

        TP_STRUCT__entry(
                __field(        unsigned int,        irq                )
                __field(        unsigned int,        cpu                )
                __field(        unsigned int,        vector                )
                __field(        bool,                is_managed        )
        ),

        TP_fast_assign(
                __entry->irq                = irq;
                __entry->cpu                = cpu;
                __entry->vector                = vector;
                __entry->is_managed        = is_managed;
        ),

        TP_printk("irq=%u cpu=%u vector=%u is_managed=%d",
                  __entry->irq, __entry->cpu, __entry->vector,
                  __entry->is_managed)
);


#endif /* CONFIG_X86_LOCAL_APIC */

#undef TRACE_INCLUDE_PATH
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE irq_vectors
#endif /*  _TRACE_IRQ_VECTORS_H */

/* This part must be outside protection */
#include <trace/define_trace.h>





























    5 





















    1 




    1 















































































    1 






































































































    7 

    2 



    5 

    1 

    1 





















    8 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* I/O iterator iteration building functions.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_IOV_ITER_H
#define _LINUX_IOV_ITER_H

#include <linux/uio.h>
#include <linux/bvec.h>

/*
 * Step callback types used by the iterate_*() helpers in this header.
 *
 * A step function is handed the address of the current segment
 * (@iter_base), the amount already processed before this segment
 * (@progress), the segment length (@len) and the two opaque caller cookies
 * (@priv, @priv2).  It returns the number of bytes of the segment it did NOT
 * process, so 0 means the whole segment was handled.
 *
 * iov_step_f receives a kernel pointer; iov_ustep_f receives a __user
 * pointer.
 */
typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len,
                             void *priv, void *priv2);
typedef size_t (*iov_ustep_f)(void __user *iter_base, size_t progress, size_t len,
                              void *priv, void *priv2);

/*
 * Handle ITER_UBUF.
 *
 * The whole request is presented to @step in one call, starting at the
 * iterator's current offset into the user buffer.  The iterator is then
 * advanced by however much @step actually processed, and that amount is
 * returned.
 */
static __always_inline
size_t iterate_ubuf(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                    iov_ustep_f step)
{
        void __user *addr = iter->ubuf + iter->iov_offset;
        size_t done;

        /* @step reports what it left over; turn that into bytes handled. */
        done = len - step(addr, 0, len, priv, priv2);

        iter->iov_offset += done;
        iter->count -= done;
        return done;
}

/*
 * Handle ITER_IOVEC.
 *
 * Walks the iovec array from the iterator's current position, feeding each
 * non-empty piece to @step until @len is used up or @step leaves part of a
 * segment unprocessed.  The iterator (segment pointer, segment count, offset
 * and remaining count) is updated to reflect what was processed, and that
 * amount is returned.
 */
static __always_inline
size_t iterate_iovec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                     iov_ustep_f step)
{
        const struct iovec *seg = iter->__iov;
        size_t off = iter->iov_offset;
        size_t progress = 0;

        for (;;) {
                size_t chunk = min(len, seg->iov_len - off);

                if (likely(chunk)) {
                        size_t done = chunk - step(seg->iov_base + off,
                                                   progress, chunk,
                                                   priv, priv2);

                        progress += done;
                        off += done;
                        len -= done;
                        /*
                         * Segment not fully consumed (short step, or the
                         * request ended inside it): stop right here.
                         */
                        if (off < seg->iov_len)
                                break;
                }

                /* Segment exhausted or empty: move on to the next one. */
                seg++;
                off = 0;
                if (!len)
                        break;
        }

        iter->nr_segs -= seg - iter->__iov;
        iter->__iov = seg;
        iter->iov_offset = off;
        iter->count -= progress;
        return progress;
}

/*
 * Handle ITER_KVEC.
 *
 * Walks the kvec array from the iterator's current position, feeding each
 * non-empty piece to @step until @len is used up or @step leaves part of a
 * segment unprocessed.  The iterator (segment pointer, segment count, offset
 * and remaining count) is updated to reflect what was processed, and that
 * amount is returned.
 */
static __always_inline
size_t iterate_kvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                    iov_step_f step)
{
        const struct kvec *seg = iter->kvec;
        size_t off = iter->iov_offset;
        size_t progress = 0;

        for (;;) {
                size_t chunk = min(len, seg->iov_len - off);

                if (likely(chunk)) {
                        size_t done = chunk - step(seg->iov_base + off,
                                                   progress, chunk,
                                                   priv, priv2);

                        progress += done;
                        off += done;
                        len -= done;
                        /*
                         * Segment not fully consumed (short step, or the
                         * request ended inside it): stop right here.
                         */
                        if (off < seg->iov_len)
                                break;
                }

                /* Segment exhausted or empty: move on to the next one. */
                seg++;
                off = 0;
                if (!len)
                        break;
        }

        iter->nr_segs -= seg - iter->kvec;
        iter->kvec = seg;
        iter->iov_offset = off;
        iter->count -= progress;
        return progress;
}

/*
 * Handle ITER_BVEC.
 *
 * Walks the bio_vec array from the iterator's current position.  Each piece
 * handed to @step lies within a single page, which is mapped with
 * kmap_local_page() for the duration of the call.  Iteration stops when @len
 * is used up or @step leaves part of a piece unprocessed.  The iterator is
 * updated to reflect what was processed, and that amount is returned.
 */
static __always_inline
size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                    iov_step_f step)
{
        const struct bio_vec *p = iter->bvec;
        size_t progress = 0, skip = iter->iov_offset;

        do {
                size_t remain, consumed;
                /* Byte position of the current point relative to bv_page. */
                size_t offset = p->bv_offset + skip, part;
                /* Map the page that contains that position. */
                void *kaddr = kmap_local_page(p->bv_page + offset / PAGE_SIZE);

                /*
                 * Limit the piece to the request, to what is left of this
                 * bio_vec, and to what is left of the mapped page.
                 */
                part = min3(len,
                           (size_t)(p->bv_len - skip),
                           (size_t)(PAGE_SIZE - offset % PAGE_SIZE));
                remain = step(kaddr + offset % PAGE_SIZE, progress, part, priv, priv2);
                kunmap_local(kaddr);
                consumed = part - remain;
                len -= consumed;
                progress += consumed;
                skip += consumed;
                /* Finished this bio_vec: step to the start of the next. */
                if (skip >= p->bv_len) {
                        skip = 0;
                        p++;
                }
                /* A short step ends the walk (after the advance above). */
                if (remain)
                        break;
        } while (len);

        iter->nr_segs -= p - iter->bvec;
        iter->bvec = p;
        iter->iov_offset = skip;
        iter->count -= progress;
        return progress;
}

/*
 * Handle ITER_XARRAY.
 *
 * Walks the folios stored in iter->xarray, starting at the index that
 * corresponds to xarray_start + iov_offset, under rcu_read_lock().  Each
 * folio is presented to @step in pieces that do not cross a page boundary,
 * mapped with kmap_local_folio().  Iteration stops when @len is used up,
 * when @step leaves part of a piece unprocessed, when the walk runs out of
 * entries, or (with a warning) on a value entry or a hugetlb folio.  The
 * iterator's offset and count are advanced by the amount processed, which is
 * returned.
 */
static __always_inline
size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                      iov_step_f step)
{
        struct folio *folio;
        size_t progress = 0;
        /* Absolute starting position and the page index it falls in. */
        loff_t start = iter->xarray_start + iter->iov_offset;
        pgoff_t index = start / PAGE_SIZE;
        XA_STATE(xas, iter->xarray, index);

        rcu_read_lock();
        xas_for_each(&xas, folio, ULONG_MAX) {
                size_t remain, consumed, offset, part, flen;

                /* Skip entries for which xas_retry() asks us to retry. */
                if (xas_retry(&xas, folio))
                        continue;
                /* Value entries and hugetlb folios are not expected here. */
                if (WARN_ON(xa_is_value(folio)))
                        break;
                if (WARN_ON(folio_test_hugetlb(folio)))
                        break;

                /* Where in this folio we are, and how much of it to use. */
                offset = offset_in_folio(folio, start + progress);
                flen = min(folio_size(folio) - offset, len);

                while (flen) {
                        void *base = kmap_local_folio(folio, offset);

                        /* Never let a piece run past the end of its page. */
                        part = min_t(size_t, flen,
                                     PAGE_SIZE - offset_in_page(offset));
                        remain = step(base, progress, part, priv, priv2);
                        kunmap_local(base);

                        consumed = part - remain;
                        progress += consumed;
                        len -= consumed;

                        /* Short step or request satisfied: all done. */
                        if (remain || len == 0)
                                goto out;
                        flen -= consumed;
                        offset += consumed;
                }
        }

out:
        rcu_read_unlock();
        iter->iov_offset += progress;
        iter->count -= progress;
        return progress;
}

/*
 * Handle ITER_DISCARD.
 *
 * Nothing is presented to @step (it, @priv and @priv2 are unused): the whole
 * of @len is simply deducted from the iterator's count and reported as
 * processed.
 */
static __always_inline
size_t iterate_discard(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                      iov_step_f step)
{
        iter->count -= len;
        return len;
}

/**
 * iterate_and_advance2 - Iterate over an iterator
 * @iter: The iterator to iterate over.
 * @len: The amount to iterate over.
 * @priv: Data for the step functions.
 * @priv2: More data for the step functions.
 * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
 * @step: Function for other iterators; given kernel addresses.
 *
 * Iterate over the next part of an iterator, up to the specified length
 * (clamped to the amount of data the iterator still holds).  The buffer is
 * presented in segments; for kernel iteration these are broken up by physical
 * pages and mapped, and the mapped address is what gets presented.
 *
 * Both step functions must be supplied: @step handles mapped kernel
 * addresses, whereas @ustep is given user addresses, which may fault because
 * no pinning is performed.
 *
 * Each step function is passed the address and length of the segment, @priv,
 * @priv2 and the amount of data iterated over so far (which can, for example,
 * be added to @priv to point to the right part of a second buffer).  A step
 * function returns the amount of the segment it did not process (ie. 0
 * indicates complete processing).
 *
 * Return: the amount of data processed (ie. 0 means nothing was processed and
 * the value of @len means processed to completion).
 */
static __always_inline
size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv,
                            void *priv2, iov_ustep_f ustep, iov_step_f step)
{
        /* Never walk past the data the iterator still holds. */
        if (unlikely(len > iter->count))
                len = iter->count;
        if (unlikely(len == 0))
                return 0;

        /* User-memory iterators are the expected case and take @ustep. */
        if (likely(iter_is_ubuf(iter)))
                return iterate_ubuf(iter, len, priv, priv2, ustep);
        if (likely(iter_is_iovec(iter)))
                return iterate_iovec(iter, len, priv, priv2, ustep);

        /* Everything else works on kernel addresses and takes @step. */
        if (iov_iter_is_bvec(iter))
                return iterate_bvec(iter, len, priv, priv2, step);
        if (iov_iter_is_kvec(iter))
                return iterate_kvec(iter, len, priv, priv2, step);
        if (iov_iter_is_xarray(iter))
                return iterate_xarray(iter, len, priv, priv2, step);

        /* No other type matched: treat the iterator as a discard iterator. */
        return iterate_discard(iter, len, priv, priv2, step);
}

/**
 * iterate_and_advance - Iterate over an iterator
 * @iter: The iterator to iterate over.
 * @len: The amount to iterate over.
 * @priv: Data for the step functions.
 * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
 * @step: Function for other iterators; given kernel addresses.
 *
 * As iterate_and_advance2(), but priv2 is always NULL.
 *
 * Return: whatever iterate_and_advance2() returns, i.e. the amount of data
 * processed.
 */
static __always_inline
size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv,
                           iov_ustep_f ustep, iov_step_f step)
{
        return iterate_and_advance2(iter, len, priv, NULL, ustep, step);
}

#endif /* _LINUX_IOV_ITER_H */








































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
/* SPDX-License-Identifier: GPL-2.0 */
/*
  File: linux/posix_acl.h

  (C) 2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/


#ifndef __LINUX_POSIX_ACL_H
#define __LINUX_POSIX_ACL_H

#include <linux/bug.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <uapi/linux/posix_acl.h>

struct user_namespace;

/*
 * One ACL entry: a tag, a permission word, and an anonymous union holding
 * either a kuid_t (e_uid) or a kgid_t (e_gid).
 * NOTE(review): which union member is meaningful presumably depends on
 * e_tag - confirm against the tag definitions in uapi/linux/posix_acl.h.
 */
struct posix_acl_entry {
        short                        e_tag;
        unsigned short                e_perm;
        union {
                kuid_t                e_uid;
                kgid_t                e_gid;
        };
};

/*
 * A reference-counted ACL: a small header followed by a_count entries in the
 * flexible array a_entries[] (FOREACH_ACL_ENTRY() walks exactly a_count of
 * them).  a_refcount is taken by posix_acl_dup() and dropped by
 * posix_acl_release(); a_rcu is the rcu_head that posix_acl_release() hands
 * to kfree_rcu() when the last reference goes away.
 */
struct posix_acl {
        refcount_t                a_refcount;
        struct rcu_head                a_rcu;
        unsigned int                a_count;
        struct posix_acl_entry        a_entries[];
};

/*
 * Loop header that visits every entry of @acl in order.  @pa and @pe must be
 * assignable entry pointers supplied by the caller: @pe is set to one past
 * the last entry (a_entries + a_count) and @pa runs from a_entries up to,
 * but not including, @pe.  @pa and @pe are evaluated more than once.
 */
#define FOREACH_ACL_ENTRY(pa, acl, pe) \
        for(pa=(acl)->a_entries, pe=pa+(acl)->a_count; pa<pe; pa++)


/*
 * Duplicate an ACL handle.
 */
static inline struct posix_acl *
posix_acl_dup(struct posix_acl *acl)
{
        if (acl)
                refcount_inc(&acl->a_refcount);
        return acl;
}

/*
 * Free an ACL handle.
 *
 * Drops one reference on @acl; when that was the last one the ACL is freed
 * via kfree_rcu().  A NULL @acl is ignored.
 */
static inline void
posix_acl_release(struct posix_acl *acl)
{
        if (!acl)
                return;

        if (refcount_dec_and_test(&acl->a_refcount))
                kfree_rcu(acl, a_rcu);
}


/* posix_acl.c */

extern void posix_acl_init(struct posix_acl *, int);
extern struct posix_acl *posix_acl_alloc(int, gfp_t);
extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t);
extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *);
extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *);
extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t);

extern struct posix_acl *get_posix_acl(struct inode *, int);
int set_posix_acl(struct mnt_idmap *, struct dentry *, int,
                  struct posix_acl *);

struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags);

#ifdef CONFIG_FS_POSIX_ACL
int posix_acl_chmod(struct mnt_idmap *, struct dentry *, umode_t);
extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **,
                struct posix_acl **);
int posix_acl_update_mode(struct mnt_idmap *, struct inode *, umode_t *,
                          struct posix_acl **);

int simple_set_acl(struct mnt_idmap *, struct dentry *,
                   struct posix_acl *, int);
extern int simple_acl_create(struct inode *, struct inode *);

struct posix_acl *get_cached_acl(struct inode *inode, int type);
void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl);
void forget_cached_acl(struct inode *inode, int type);
void forget_all_cached_acls(struct inode *inode);
int posix_acl_valid(struct user_namespace *, const struct posix_acl *);
int posix_acl_permission(struct mnt_idmap *, struct inode *,
                         const struct posix_acl *, int);

/*
 * Record in the inode that it has neither an access ACL nor a default ACL
 * by setting both cached ACL pointers to NULL.
 */
static inline void cache_no_acl(struct inode *inode)
{
        inode->i_default_acl = inode->i_acl = NULL;
}

int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                const char *acl_name, struct posix_acl *kacl);
struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap,
                              struct dentry *dentry, const char *acl_name);
int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                   const char *acl_name);
int posix_acl_listxattr(struct inode *inode, char **buffer,
                        ssize_t *remaining_size);
#else
/*
 * Stub for kernels built without CONFIG_FS_POSIX_ACL: there is no ACL to
 * update on chmod, so do nothing and report success.
 */
static inline int posix_acl_chmod(struct mnt_idmap *idmap,
                                  struct dentry *dentry, umode_t mode)
{
        return 0;
}

#define simple_set_acl                NULL

/*
 * Stub for kernels built without CONFIG_FS_POSIX_ACL: nothing is created
 * and success (0) is returned.
 */
static inline int simple_acl_create(struct inode *dir, struct inode *inode)
{
        return 0;
}
/* Stub for kernels built without CONFIG_FS_POSIX_ACL: intentionally empty. */
static inline void cache_no_acl(struct inode *inode)
{
}

/*
 * Stub for kernels built without CONFIG_FS_POSIX_ACL: report that there is
 * neither a default nor an access ACL by storing NULL through both output
 * pointers.  @inode and @mode are not touched.  Always returns 0.
 */
static inline int posix_acl_create(struct inode *inode, umode_t *mode,
                struct posix_acl **default_acl, struct posix_acl **acl)
{
        *acl = NULL;
        *default_acl = NULL;

        return 0;
}

/* Stub for kernels built without CONFIG_FS_POSIX_ACL: intentionally empty. */
static inline void forget_all_cached_acls(struct inode *inode)
{
}

/*
 * Stub for kernels built without CONFIG_FS_POSIX_ACL: setting an ACL is not
 * supported, so always fail with -EOPNOTSUPP.
 */
static inline int vfs_set_acl(struct mnt_idmap *idmap,
                              struct dentry *dentry, const char *name,
                              struct posix_acl *acl)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for kernels built without CONFIG_FS_POSIX_ACL: reading an ACL is not
 * supported.  Returns ERR_PTR(-EOPNOTSUPP), never NULL, so callers must test
 * the result with IS_ERR().
 */
static inline struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap,
                                            struct dentry *dentry,
                                            const char *acl_name)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/*
 * Stub for kernels built without CONFIG_FS_POSIX_ACL: removing an ACL is not
 * supported, so always fail with -EOPNOTSUPP.
 */
static inline int vfs_remove_acl(struct mnt_idmap *idmap,
                                 struct dentry *dentry, const char *acl_name)
{
        return -EOPNOTSUPP;
}
/*
 * Stub for kernels built without CONFIG_FS_POSIX_ACL: nothing is listed;
 * @buffer and @remaining_size are left unmodified and 0 is returned.
 */
static inline int posix_acl_listxattr(struct inode *inode, char **buffer,
                                      ssize_t *remaining_size)
{
        return 0;
}
#endif /* CONFIG_FS_POSIX_ACL */

struct posix_acl *get_inode_acl(struct inode *inode, int type);

#endif  /* __LINUX_POSIX_ACL_H */


























































































































































































































































































































































































    1 

















    1 

















    1 



    1 





































    1 





















    1 













































    1 

















    1 













    1 


    1 


































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * algif_aead: User-space interface for AEAD algorithms
 *
 * Copyright (C) 2014, Stephan Mueller <smueller@chronox.de>
 *
 * This file provides the user-space API for AEAD ciphers.
 *
 * The following concept of the memory management is used:
 *
 * The kernel maintains two SGLs, the TX SGL and the RX SGL. The TX SGL is
 * filled by user space with the data submitted via sendmsg (maybe with
 * MSG_SPLICE_PAGES).  Filling up the TX SGL does not cause a crypto operation
 * -- the data will only be tracked by the kernel. Upon receipt of one recvmsg
 * call, the caller must provide a buffer which is tracked with the RX SGL.
 *
 * During the processing of the recvmsg operation, the cipher request is
 * allocated and prepared. As part of the recvmsg operation, the processed
 * TX buffers are extracted from the TX SGL into a separate SGL.
 *
 * After the completion of the crypto operation, the RX SGL and the cipher
 * request is released. The extracted TX SGL parts are released together with
 * the RX SGL release.
 */

#include <crypto/internal/aead.h>
#include <crypto/scatterwalk.h>
#include <crypto/if_alg.h>
#include <crypto/skcipher.h>
#include <crypto/null.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/net.h>
#include <net/sock.h>

/*
 * Per-bind state of an "aead" AF_ALG socket, created by aead_bind() and
 * stored as the parent socket's ->private.
 *
 * @aead:     the AEAD transform operated on by this socket family.
 * @null_tfm: default null skcipher, used by crypto_aead_copy_sgl() to copy
 *            data from one scatterlist to another.
 */
struct aead_tfm {
        struct crypto_aead *aead;
        struct crypto_sync_skcipher *null_tfm;
};

/*
 * Check whether enough input has been queued on @sk to run the AEAD
 * operation at all.
 *
 * The lower bound is the associated data length; for decryption the
 * authentication tag must be present in the input as well.
 */
static inline bool aead_sufficient_data(struct sock *sk)
{
        struct alg_sock *ask = alg_sk(sk);
        struct alg_sock *pask = alg_sk(ask->parent);
        struct aead_tfm *aeadc = pask->private;
        struct af_alg_ctx *ctx = ask->private;
        unsigned int as = crypto_aead_authsize(aeadc->aead);

        /* Encryption only needs the AAD; the tag is produced, not consumed. */
        if (ctx->enc)
                return ctx->used >= ctx->aead_assoclen;

        /* Decryption needs the AAD plus the tag. */
        return ctx->used >= ctx->aead_assoclen + as;
}

/*
 * sendmsg() handler: look up the AEAD transform bound to the parent socket
 * and forward the call to af_alg_sendmsg() together with the transform's
 * IV size.
 */
static int aead_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
        struct alg_sock *child = alg_sk(sock->sk);
        struct alg_sock *parent = alg_sk(child->parent);
        struct aead_tfm *aeadc = parent->private;

        return af_alg_sendmsg(sock, msg, size,
                              crypto_aead_ivsize(aeadc->aead));
}

/*
 * Copy @len bytes from scatterlist @src to scatterlist @dst by running an
 * "encryption" with the null skcipher @null_tfm.
 *
 * The request lives on the stack and is issued with no completion callback
 * and no IV.  Returns the result of crypto_skcipher_encrypt().
 */
static int crypto_aead_copy_sgl(struct crypto_sync_skcipher *null_tfm,
                                struct scatterlist *src,
                                struct scatterlist *dst, unsigned int len)
{
        SYNC_SKCIPHER_REQUEST_ON_STACK(skreq, null_tfm);

        skcipher_request_set_sync_tfm(skreq, null_tfm);
        skcipher_request_set_callback(skreq, CRYPTO_TFM_REQ_MAY_SLEEP,
                                      NULL, NULL);
        skcipher_request_set_crypt(skreq, src, dst, len, NULL);

        return crypto_skcipher_encrypt(skreq);
}

/*
 * Perform one AEAD operation for a recvmsg() call.
 *
 * Called by aead_recvmsg() with the socket lock held.  The steps are:
 *   1. wait for input if the context is not initialised or more data is
 *      announced (!ctx->init || ctx->more);
 *   2. verify that enough input is queued (aead_sufficient_data());
 *   3. allocate the request and map the caller's buffers into the RX SGL;
 *   4. if the RX buffer is smaller than the full result, shrink the amount
 *      of input processed accordingly (or fail with -EINVAL);
 *   5. copy AAD (and PT/CT) from the TX SGL into the RX SGL so the cipher
 *      runs in place; for decryption the tag stays in the TX pages and is
 *      chained behind the RX SGL;
 *   6. run the cipher either asynchronously (AIO) or synchronously.
 *
 * @ignored is not used.
 *
 * Returns the number of output bytes on success, -EIOCBQUEUED when an AIO
 * request is still in flight (the request is then not freed here), or a
 * negative error code.
 */
static int _aead_recvmsg(struct socket *sock, struct msghdr *msg,
                         size_t ignored, int flags)
{
        struct sock *sk = sock->sk;
        struct alg_sock *ask = alg_sk(sk);
        struct sock *psk = ask->parent;
        struct alg_sock *pask = alg_sk(psk);
        struct af_alg_ctx *ctx = ask->private;
        struct aead_tfm *aeadc = pask->private;
        struct crypto_aead *tfm = aeadc->aead;
        struct crypto_sync_skcipher *null_tfm = aeadc->null_tfm;
        unsigned int i, as = crypto_aead_authsize(tfm);
        struct af_alg_async_req *areq;
        struct af_alg_tsgl *tsgl, *tmp;
        struct scatterlist *rsgl_src, *tsgl_src = NULL;
        int err = 0;
        size_t used = 0;                /* [in]  TX bufs to be en/decrypted */
        size_t outlen = 0;                /* [out] RX bufs produced by kernel */
        size_t usedpages = 0;                /* [in]  RX bufs to be used from user */
        size_t processed = 0;                /* [in]  TX bufs to be consumed */

        if (!ctx->init || ctx->more) {
                err = af_alg_wait_for_data(sk, flags, 0);
                if (err)
                        return err;
        }

        /*
         * Data length provided by caller via sendmsg that has not yet been
         * processed.
         */
        used = ctx->used;

        /*
         * Make sure sufficient data is present -- note, the same check is also
         * present in sendmsg. The checks in sendmsg shall provide an
         * information to the data sender that something is wrong, but they are
         * irrelevant to maintain the kernel integrity.  We need this check
         * here too in case user space decides to not honor the error message
         * in sendmsg and still call recvmsg. This check here protects the
         * kernel integrity.
         */
        if (!aead_sufficient_data(sk))
                return -EINVAL;

        /*
         * Calculate the minimum output buffer size holding the result of the
         * cipher operation. When encrypting data, the receiving buffer is
         * larger by the tag length compared to the input buffer as the
         * encryption operation generates the tag. For decryption, the input
         * buffer provides the tag which is consumed resulting in only the
         * plaintext without a buffer for the tag returned to the caller.
         */
        if (ctx->enc)
                outlen = used + as;
        else
                outlen = used - as;

        /*
         * The cipher operation input data is reduced by the associated data
         * length as this data is processed separately later on.
         */
        used -= ctx->aead_assoclen;

        /* Allocate cipher request for current operation. */
        areq = af_alg_alloc_areq(sk, sizeof(struct af_alg_async_req) +
                                     crypto_aead_reqsize(tfm));
        if (IS_ERR(areq))
                return PTR_ERR(areq);

        /* convert iovecs of output buffers into RX SGL */
        err = af_alg_get_rsgl(sk, msg, flags, areq, outlen, &usedpages);
        if (err)
                goto free;

        /*
         * Ensure output buffer is sufficiently large. If the caller provides
         * less buffer space, only use the relative required input size. This
         * allows AIO operation where the caller sent all data to be processed
         * and the AIO operation performs the operation on the different chunks
         * of the input data.
         */
        if (usedpages < outlen) {
                size_t less = outlen - usedpages;

                if (used < less) {
                        err = -EINVAL;
                        goto free;
                }
                used -= less;
                outlen -= less;
        }

        processed = used + ctx->aead_assoclen;
        /* Locate the first TX SGL entry that actually carries data. */
        list_for_each_entry_safe(tsgl, tmp, &ctx->tsgl_list, list) {
                for (i = 0; i < tsgl->cur; i++) {
                        struct scatterlist *process_sg = tsgl->sg + i;

                        if (!(process_sg->length) || !sg_page(process_sg))
                                continue;
                        tsgl_src = process_sg;
                        break;
                }
                if (tsgl_src)
                        break;
        }
        /* Input is expected but the TX SGL holds no data entry. */
        if (processed && !tsgl_src) {
                err = -EFAULT;
                goto free;
        }

        /*
         * Copy of AAD from source to destination
         *
         * The AAD is copied to the destination buffer without change. Even
         * when user space uses an in-place cipher operation, the kernel
         * will copy the data as it does not see whether such in-place operation
         * is initiated.
         *
         * To ensure efficiency, the following implementation ensure that the
         * ciphers are invoked to perform a crypto operation in-place. This
         * is achieved by memory management specified as follows.
         */

        /* Use the RX SGL as source (and destination) for crypto op. */
        rsgl_src = areq->first_rsgl.sgl.sgt.sgl;

        if (ctx->enc) {
                /*
                 * Encryption operation - The in-place cipher operation is
                 * achieved by the following operation:
                 *
                 * TX SGL: AAD || PT
                 *          |      |
                 *          | copy |
                 *          v      v
                 * RX SGL: AAD || PT || Tag
                 */
                err = crypto_aead_copy_sgl(null_tfm, tsgl_src,
                                           areq->first_rsgl.sgl.sgt.sgl,
                                           processed);
                if (err)
                        goto free;
                af_alg_pull_tsgl(sk, processed, NULL, 0);
        } else {
                /*
                 * Decryption operation - To achieve an in-place cipher
                 * operation, the following  SGL structure is used:
                 *
                 * TX SGL: AAD || CT || Tag
                 *          |      |     ^
                 *          | copy |     | Create SGL link.
                 *          v      v     |
                 * RX SGL: AAD || CT ----+
                 */

                 /* Copy AAD || CT to RX SGL buffer for in-place operation. */
                err = crypto_aead_copy_sgl(null_tfm, tsgl_src,
                                           areq->first_rsgl.sgl.sgt.sgl,
                                           outlen);
                if (err)
                        goto free;

                /* Create TX SGL for tag and chain it to RX SGL. */
                areq->tsgl_entries = af_alg_count_tsgl(sk, processed,
                                                       processed - as);
                if (!areq->tsgl_entries)
                        areq->tsgl_entries = 1;
                areq->tsgl = sock_kmalloc(sk, array_size(sizeof(*areq->tsgl),
                                                         areq->tsgl_entries),
                                          GFP_KERNEL);
                if (!areq->tsgl) {
                        err = -ENOMEM;
                        goto free;
                }
                sg_init_table(areq->tsgl, areq->tsgl_entries);

                /* Release TX SGL, except for tag data and reassign tag data. */
                af_alg_pull_tsgl(sk, processed, areq->tsgl, processed - as);

                /* chain the areq TX SGL holding the tag with RX SGL */
                if (usedpages) {
                        /* RX SGL present */
                        struct af_alg_sgl *sgl_prev = &areq->last_rsgl->sgl;
                        struct scatterlist *sg = sgl_prev->sgt.sgl;

                        sg_unmark_end(sg + sgl_prev->sgt.nents - 1);
                        sg_chain(sg, sgl_prev->sgt.nents + 1, areq->tsgl);
                } else
                        /* no RX SGL present (e.g. authentication only) */
                        rsgl_src = areq->tsgl;
        }

        /* Initialize the crypto operation */
        aead_request_set_crypt(&areq->cra_u.aead_req, rsgl_src,
                               areq->first_rsgl.sgl.sgt.sgl, used, ctx->iv);
        aead_request_set_ad(&areq->cra_u.aead_req, ctx->aead_assoclen);
        aead_request_set_tfm(&areq->cra_u.aead_req, tfm);

        if (msg->msg_iocb && !is_sync_kiocb(msg->msg_iocb)) {
                /* AIO operation */
                sock_hold(sk);
                areq->iocb = msg->msg_iocb;

                /* Remember output size that will be generated. */
                areq->outlen = outlen;

                aead_request_set_callback(&areq->cra_u.aead_req,
                                          CRYPTO_TFM_REQ_MAY_SLEEP,
                                          af_alg_async_cb, areq);
                err = ctx->enc ? crypto_aead_encrypt(&areq->cra_u.aead_req) :
                                 crypto_aead_decrypt(&areq->cra_u.aead_req);

                /* AIO operation in progress */
                if (err == -EINPROGRESS)
                        return -EIOCBQUEUED;

                /* Completed (or failed) synchronously: drop the extra ref. */
                sock_put(sk);
        } else {
                /* Synchronous operation */
                aead_request_set_callback(&areq->cra_u.aead_req,
                                          CRYPTO_TFM_REQ_MAY_SLEEP |
                                          CRYPTO_TFM_REQ_MAY_BACKLOG,
                                          crypto_req_done, &ctx->wait);
                err = crypto_wait_req(ctx->enc ?
                                crypto_aead_encrypt(&areq->cra_u.aead_req) :
                                crypto_aead_decrypt(&areq->cra_u.aead_req),
                                &ctx->wait);
        }


free:
        af_alg_free_resources(areq);

        return err ? err : outlen;
}

/*
 * recvmsg() handler: run _aead_recvmsg() repeatedly under the socket lock
 * for as long as the caller's buffer has room left.
 *
 * Returns the accumulated number of bytes produced.  If an iteration fails,
 * the error is returned instead when nothing has been produced so far, or
 * when the error is -EIOCBQUEUED or -EBADMSG; otherwise the bytes produced
 * up to that point are returned.
 */
static int aead_recvmsg(struct socket *sock, struct msghdr *msg,
                        size_t ignored, int flags)
{
        struct sock *sk = sock->sk;
        int total = 0;

        lock_sock(sk);

        while (msg_data_left(msg)) {
                int rc = _aead_recvmsg(sock, msg, ignored, flags);

                if (rc > 0) {
                        total += rc;
                        continue;
                }

                /*
                 * -EIOCBQUEUED means that only a single AIO request can be
                 * handled per call; callers wanting several in parallel
                 * must issue separate AIO calls.  -EBADMSG is always
                 * reported, any other error only if no data was processed.
                 */
                if (rc == -EIOCBQUEUED || rc == -EBADMSG || !total)
                        total = rc;
                break;
        }

        af_alg_wmem_wakeup(sk);
        release_sock(sk);

        return total;
}

/*
 * Socket operations for an "aead" operation socket.  Only release, sendmsg,
 * recvmsg and poll are implemented; every other operation is wired to the
 * corresponding sock_no_*() helper.
 */
static struct proto_ops algif_aead_ops = {
        .family                =        PF_ALG,

        .connect        =        sock_no_connect,
        .socketpair        =        sock_no_socketpair,
        .getname        =        sock_no_getname,
        .ioctl                =        sock_no_ioctl,
        .listen                =        sock_no_listen,
        .shutdown        =        sock_no_shutdown,
        .mmap                =        sock_no_mmap,
        .bind                =        sock_no_bind,
        .accept                =        sock_no_accept,

        .release        =        af_alg_release,
        .sendmsg        =        aead_sendmsg,
        .recvmsg        =        aead_recvmsg,
        .poll                =        af_alg_poll,
};

/*
 * Verify that a key has been set before a "nokey" socket is used for I/O.
 *
 * If the child socket's nokey_refcnt is already zero there is nothing to
 * check and 0 is returned.  Otherwise the parent socket is locked (nested
 * inside the child's lock) and the transform is checked for
 * CRYPTO_TFM_NEED_KEY: if the flag is still set -ENOKEY is returned,
 * otherwise the parent's nokey_refcnt is decremented, the child's is reset
 * to zero and 0 is returned.
 */
static int aead_check_key(struct socket *sock)
{
        int err = 0;
        struct sock *psk;
        struct alg_sock *pask;
        struct aead_tfm *tfm;
        struct sock *sk = sock->sk;
        struct alg_sock *ask = alg_sk(sk);

        lock_sock(sk);
        if (!atomic_read(&ask->nokey_refcnt))
                goto unlock_child;

        psk = ask->parent;
        pask = alg_sk(ask->parent);
        tfm = pask->private;

        err = -ENOKEY;
        /* Child lock is already held, so take the parent lock as nested. */
        lock_sock_nested(psk, SINGLE_DEPTH_NESTING);
        if (crypto_aead_get_flags(tfm->aead) & CRYPTO_TFM_NEED_KEY)
                goto unlock;

        atomic_dec(&pask->nokey_refcnt);
        atomic_set(&ask->nokey_refcnt, 0);

        err = 0;

unlock:
        release_sock(psk);
unlock_child:
        release_sock(sk);

        return err;
}

/*
 * sendmsg() handler for sockets accepted before a key was set: refuse the
 * call with the error from aead_check_key() unless a key is now present,
 * otherwise behave exactly like aead_sendmsg().
 */
static int aead_sendmsg_nokey(struct socket *sock, struct msghdr *msg,
                                  size_t size)
{
        int err = aead_check_key(sock);

        return err ? err : aead_sendmsg(sock, msg, size);
}

/*
 * recvmsg() handler for sockets accepted before a key was set: refuse the
 * call with the error from aead_check_key() unless a key is now present,
 * otherwise behave exactly like aead_recvmsg().
 */
static int aead_recvmsg_nokey(struct socket *sock, struct msghdr *msg,
                                  size_t ignored, int flags)
{
        int err = aead_check_key(sock);

        return err ? err : aead_recvmsg(sock, msg, ignored, flags);
}

/*
 * Socket operations for an "aead" operation socket that was accepted while
 * no key was set.  Identical to algif_aead_ops except that sendmsg and
 * recvmsg go through the *_nokey wrappers, which call aead_check_key()
 * first.
 */
static struct proto_ops algif_aead_ops_nokey = {
        .family                =        PF_ALG,

        .connect        =        sock_no_connect,
        .socketpair        =        sock_no_socketpair,
        .getname        =        sock_no_getname,
        .ioctl                =        sock_no_ioctl,
        .listen                =        sock_no_listen,
        .shutdown        =        sock_no_shutdown,
        .mmap                =        sock_no_mmap,
        .bind                =        sock_no_bind,
        .accept                =        sock_no_accept,

        .release        =        af_alg_release,
        .sendmsg        =        aead_sendmsg_nokey,
        .recvmsg        =        aead_recvmsg_nokey,
        .poll                =        af_alg_poll,
};

/*
 * bind() callback: allocate the per-bind state for algorithm @name.
 *
 * Allocates the struct aead_tfm container, the AEAD transform itself and a
 * reference to the default null skcipher.  Returns the container on
 * success or an ERR_PTR() on failure, in which case everything acquired so
 * far has been released again.
 */
static void *aead_bind(const char *name, u32 type, u32 mask)
{
        struct crypto_sync_skcipher *null_tfm;
        struct crypto_aead *aead;
        struct aead_tfm *tfm;
        void *ret;

        tfm = kzalloc(sizeof(*tfm), GFP_KERNEL);
        if (!tfm)
                return ERR_PTR(-ENOMEM);

        aead = crypto_alloc_aead(name, type, mask);
        if (IS_ERR(aead)) {
                ret = ERR_CAST(aead);
                goto err_free_tfm;
        }

        null_tfm = crypto_get_default_null_skcipher();
        if (IS_ERR(null_tfm)) {
                ret = ERR_CAST(null_tfm);
                goto err_free_aead;
        }

        tfm->null_tfm = null_tfm;
        tfm->aead = aead;

        return tfm;

err_free_aead:
        crypto_free_aead(aead);
err_free_tfm:
        kfree(tfm);
        return ret;
}

/*
 * release() callback: undo aead_bind().  Frees the AEAD transform, drops
 * the reference on the default null skcipher and frees the container that
 * @private points to.
 */
static void aead_release(void *private)
{
        struct aead_tfm *tfm = private;

        crypto_free_aead(tfm->aead);
        crypto_put_default_null_skcipher();
        kfree(tfm);
}

/*
 * setauthsize() callback: forward the requested authentication tag size to
 * the AEAD transform held in @private and return its result.
 */
static int aead_setauthsize(void *private, unsigned int authsize)
{
        struct aead_tfm *aeadc = private;
        struct crypto_aead *aead = aeadc->aead;

        return crypto_aead_setauthsize(aead, authsize);
}

/*
 * setkey() callback: forward the key of @keylen bytes to the AEAD transform
 * held in @private and return its result.
 */
static int aead_setkey(void *private, const u8 *key, unsigned int keylen)
{
        struct aead_tfm *aeadc = private;
        struct crypto_aead *aead = aeadc->aead;

        return crypto_aead_setkey(aead, key, keylen);
}

/*
 * sk_destruct callback installed by aead_accept_parent_nokey().
 *
 * Releases whatever input is still queued in the TX SGL (ctx->used bytes),
 * frees the IV buffer and the context that were allocated against the
 * socket, and finally releases the parent socket.  The IV is freed with
 * sock_kzfree_s() (presumably so that it is zeroed before being released --
 * confirm against net/core/sock.c).
 */
static void aead_sock_destruct(struct sock *sk)
{
        struct alg_sock *ask = alg_sk(sk);
        struct af_alg_ctx *ctx = ask->private;
        struct sock *psk = ask->parent;
        struct alg_sock *pask = alg_sk(psk);
        struct aead_tfm *aeadc = pask->private;
        struct crypto_aead *tfm = aeadc->aead;
        unsigned int ivlen = crypto_aead_ivsize(tfm);

        af_alg_pull_tsgl(sk, ctx->used, NULL, 0);
        sock_kzfree_s(sk, ctx->iv, ivlen);
        /* ctx->len was recorded at allocation time for exactly this free. */
        sock_kfree_s(sk, ctx, ctx->len);
        af_alg_release_parent(sk);
}

/*
 * accept() callback that does not require a key to be set.
 *
 * Allocates and zeroes the per-socket context and its IV buffer (sized for
 * the transform's IV), initialises the TX SGL list and the completion wait
 * object, attaches the context to the socket and installs
 * aead_sock_destruct() as the socket destructor.
 *
 * Returns 0 on success or -ENOMEM if either allocation fails.
 */
static int aead_accept_parent_nokey(void *private, struct sock *sk)
{
        struct aead_tfm *aeadc = private;
        struct alg_sock *ask = alg_sk(sk);
        unsigned int ctxlen = sizeof(struct af_alg_ctx);
        unsigned int ivlen = crypto_aead_ivsize(aeadc->aead);
        struct af_alg_ctx *ctx;

        ctx = sock_kmalloc(sk, ctxlen, GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;
        memset(ctx, 0, ctxlen);

        ctx->iv = sock_kmalloc(sk, ivlen, GFP_KERNEL);
        if (!ctx->iv)
                goto err_free_ctx;
        memset(ctx->iv, 0, ivlen);

        /* Remember the allocation size; the destructor frees with it. */
        ctx->len = ctxlen;
        INIT_LIST_HEAD(&ctx->tsgl_list);
        crypto_init_wait(&ctx->wait);

        ask->private = ctx;
        sk->sk_destruct = aead_sock_destruct;

        return 0;

err_free_ctx:
        sock_kfree_s(sk, ctx, ctxlen);
        return -ENOMEM;
}

static int aead_accept_parent(void *private, struct sock *sk)
{
        struct aead_tfm *tfm = private;

        if (crypto_aead_get_flags(tfm->aead) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        return aead_accept_parent_nokey(private, sk);
}

/*
 * Description of the "aead" AF_ALG type: the callbacks and the two
 * proto_ops tables (with and without key check) defined in this file.
 * Registered in algif_aead_init() and unregistered in algif_aead_exit().
 */
static const struct af_alg_type algif_type_aead = {
        .bind                =        aead_bind,
        .release        =        aead_release,
        .setkey                =        aead_setkey,
        .setauthsize        =        aead_setauthsize,
        .accept                =        aead_accept_parent,
        .accept_nokey        =        aead_accept_parent_nokey,
        .ops                =        &algif_aead_ops,
        .ops_nokey        =        &algif_aead_ops_nokey,
        .name                =        "aead",
        .owner                =        THIS_MODULE
};

/* Module init: register the "aead" type with AF_ALG and return the result. */
static int __init algif_aead_init(void)
{
        return af_alg_register_type(&algif_type_aead);
}

/*
 * Module exit: unregister the "aead" type.  A failure to unregister is
 * treated as a fatal inconsistency (BUG_ON).
 */
static void __exit algif_aead_exit(void)
{
        int err = af_alg_unregister_type(&algif_type_aead);
        BUG_ON(err);
}

module_init(algif_aead_init);
module_exit(algif_aead_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Stephan Mueller <smueller@chronox.de>");
MODULE_DESCRIPTION("AEAD kernel crypto API user space interface");






































































    2 



















































    2 




































































    1 

    1 

    1 

    1 

    1 

    1 

    1 
    1 
    1 









    1 








    1 




    1 
    1 









    2 










    2 



































































    1 





    1 

    1 














    1 






    1 








































































































































































    1 







    1 





    1 




    1 








    1 











    1 










    1 




    1 



    1 


    1 

    1 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2006 IBM Corporation
 *
 *  Author: Serge Hallyn <serue@us.ibm.com>
 *
 *  Jun 2006 - namespaces support
 *             OpenVZ, SWsoft Inc.
 *             Pavel Emelianov <xemul@openvz.org>
 */

#include <linux/slab.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/init_task.h>
#include <linux/mnt_namespace.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
#include <linux/time_namespace.h>
#include <linux/fs_struct.h>
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>

/* Slab cache that create_nsproxy() allocates struct nsproxy objects from. */
static struct kmem_cache *nsproxy_cachep;

/*
 * Statically initialised nsproxy that points at the initial (init_*)
 * namespaces and starts with a reference count of one.  Members that belong
 * to optional subsystems are only initialised when the matching config
 * option is enabled.  The mount namespace is left NULL in this initialiser.
 */
struct nsproxy init_nsproxy = {
        .count                        = REFCOUNT_INIT(1),
        .uts_ns                        = &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
        .ipc_ns                        = &init_ipc_ns,
#endif
        .mnt_ns                        = NULL,
        .pid_ns_for_children        = &init_pid_ns,
#ifdef CONFIG_NET
        .net_ns                        = &init_net,
#endif
#ifdef CONFIG_CGROUPS
        .cgroup_ns                = &init_cgroup_ns,
#endif
#ifdef CONFIG_TIME_NS
        .time_ns                = &init_time_ns,
        .time_ns_for_children        = &init_time_ns,
#endif
};

static inline struct nsproxy *create_nsproxy(void)
{
        struct nsproxy *nsproxy;

        nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
        if (nsproxy)
                refcount_set(&nsproxy->count, 1);
        return nsproxy;
}

/*
 * Create new nsproxy and all of its associated namespaces.
 * Return the newly created nsproxy.  Do not attach this to the task,
 * leave it to the caller to do proper locking and attach it to task.
 */
static struct nsproxy *create_new_namespaces(unsigned long flags,
        struct task_struct *tsk, struct user_namespace *user_ns,
        struct fs_struct *new_fs)
{
        struct nsproxy *new_nsp;
        int err;

        new_nsp = create_nsproxy();
        if (!new_nsp)
                return ERR_PTR(-ENOMEM);

        new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
        if (IS_ERR(new_nsp->mnt_ns)) {
                err = PTR_ERR(new_nsp->mnt_ns);
                goto out_ns;
        }

        new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
        if (IS_ERR(new_nsp->uts_ns)) {
                err = PTR_ERR(new_nsp->uts_ns);
                goto out_uts;
        }

        new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
        if (IS_ERR(new_nsp->ipc_ns)) {
                err = PTR_ERR(new_nsp->ipc_ns);
                goto out_ipc;
        }

        new_nsp->pid_ns_for_children =
                copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
        if (IS_ERR(new_nsp->pid_ns_for_children)) {
                err = PTR_ERR(new_nsp->pid_ns_for_children);
                goto out_pid;
        }

        new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
                                            tsk->nsproxy->cgroup_ns);
        if (IS_ERR(new_nsp->cgroup_ns)) {
                err = PTR_ERR(new_nsp->cgroup_ns);
                goto out_cgroup;
        }

        new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
        if (IS_ERR(new_nsp->net_ns)) {
                err = PTR_ERR(new_nsp->net_ns);
                goto out_net;
        }

        new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns,
                                        tsk->nsproxy->time_ns_for_children);
        if (IS_ERR(new_nsp->time_ns_for_children)) {
                err = PTR_ERR(new_nsp->time_ns_for_children);
                goto out_time;
        }
        new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns);

        return new_nsp;

out_time:
        put_net(new_nsp->net_ns);
out_net:
        put_cgroup_ns(new_nsp->cgroup_ns);
out_cgroup:
        if (new_nsp->pid_ns_for_children)
                put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
        if (new_nsp->ipc_ns)
                put_ipc_ns(new_nsp->ipc_ns);
out_ipc:
        if (new_nsp->uts_ns)
                put_uts_ns(new_nsp->uts_ns);
out_uts:
        if (new_nsp->mnt_ns)
                put_mnt_ns(new_nsp->mnt_ns);
out_ns:
        kmem_cache_free(nsproxy_cachep, new_nsp);
        return ERR_PTR(err);
}

/*
 * Called from clone.  Either takes another reference on the nsproxy
 * @tsk already points at, or builds a fresh nsproxy via
 * create_new_namespaces() and installs it in @tsk.
 *
 * Returns 0 on success, -EPERM when a new namespace is requested
 * without CAP_SYS_ADMIN in the task's user namespace, -EINVAL for
 * CLONE_NEWIPC combined with CLONE_SYSVSEM, or the error from
 * create_new_namespaces().
 */
int copy_namespaces(unsigned long flags, struct task_struct *tsk)
{
        const unsigned long new_ns_flags = CLONE_NEWNS | CLONE_NEWUTS |
                                           CLONE_NEWIPC | CLONE_NEWPID |
                                           CLONE_NEWNET | CLONE_NEWCGROUP |
                                           CLONE_NEWTIME;
        struct nsproxy *old_ns = tsk->nsproxy;
        struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
        struct nsproxy *new_ns;

        if (likely(!(flags & new_ns_flags))) {
                /*
                 * No new namespace requested: keep sharing the existing
                 * nsproxy unless this is a non-CLONE_VM clone whose
                 * time_ns_for_children differs from time_ns.
                 */
                if ((flags & CLONE_VM) ||
                    likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
                        get_nsproxy(old_ns);
                        return 0;
                }
        } else if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
                return -EPERM;
        }

        /*
         * CLONE_NEWIPC must detach from the undolist: after switching
         * to a new ipc namespace, the semaphore arrays from the old
         * namespace are unreachable.  In clone parlance, CLONE_SYSVSEM
         * means share undolist with parent, so we must forbid using
         * it along with CLONE_NEWIPC.
         */
        if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM))
                return -EINVAL;

        new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
        if (IS_ERR(new_ns))
                return PTR_ERR(new_ns);

        if (!(flags & CLONE_VM))
                timens_on_fork(new_ns, tsk);

        tsk->nsproxy = new_ns;
        return 0;
}

/*
 * Drop the references @ns holds on its namespaces and return the
 * nsproxy itself to nsproxy_cachep.  The mnt, uts, ipc, pid and time
 * members are released only when non-NULL; the cgroup and net members
 * are passed to their put helpers unconditionally.
 */
void free_nsproxy(struct nsproxy *ns)
{
        if (ns->mnt_ns)
                put_mnt_ns(ns->mnt_ns);
        if (ns->uts_ns)
                put_uts_ns(ns->uts_ns);
        if (ns->ipc_ns)
                put_ipc_ns(ns->ipc_ns);
        if (ns->pid_ns_for_children)
                put_pid_ns(ns->pid_ns_for_children);
        if (ns->time_ns)
                put_time_ns(ns->time_ns);
        if (ns->time_ns_for_children)
                put_time_ns(ns->time_ns_for_children);
        put_cgroup_ns(ns->cgroup_ns);
        put_net(ns->net_ns);
        kmem_cache_free(nsproxy_cachep, ns);
}

/*
 * Called from unshare.  Unshare all the namespaces selected by
 * @unshare_flags that are part of nsproxy.
 *
 * When no namespace flag is set, returns 0 and leaves *@new_nsp
 * untouched.  Otherwise the capability check is done against
 * @new_cred's user namespace if @new_cred is given, else against the
 * current user namespace, and the result of create_new_namespaces() is
 * stored in *@new_nsp (an ERR_PTR() on failure).
 *
 * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, or the error
 * from create_new_namespaces().
 */
int unshare_nsproxy_namespaces(unsigned long unshare_flags,
        struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
{
        struct user_namespace *user_ns;
        struct nsproxy *created;

        if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
                               CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
                               CLONE_NEWTIME)))
                return 0;

        if (new_cred)
                user_ns = new_cred->user_ns;
        else
                user_ns = current_user_ns();

        if (!ns_capable(user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        created = create_new_namespaces(unshare_flags, current, user_ns,
                                        new_fs ? new_fs : current->fs);
        *new_nsp = created;
        if (IS_ERR(created))
                return PTR_ERR(created);

        return 0;
}

/*
 * Install @new as @p's nsproxy and drop the reference on the nsproxy it
 * replaces, if there was one.  The pointer swap is done under
 * task_lock(@p); the put happens after the lock is released.
 * @new may be NULL.
 */
void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
{
        struct nsproxy *old;

        might_sleep();

        task_lock(p);
        old = p->nsproxy;
        p->nsproxy = new;
        task_unlock(p);

        if (!old)
                return;

        put_nsproxy(old);
}

/*
 * Detach @p from its nsproxy: the task's nsproxy pointer is set to NULL
 * and the reference on the previous nsproxy is dropped by
 * switch_task_namespaces().
 */
void exit_task_namespaces(struct task_struct *p)
{
        switch_task_namespaces(p, NULL);
}

int exec_task_namespaces(void)
{
        struct task_struct *tsk = current;
        struct nsproxy *new;

        if (tsk->nsproxy->time_ns_for_children == tsk->nsproxy->time_ns)
                return 0;

        new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
        if (IS_ERR(new))
                return PTR_ERR(new);

        timens_on_fork(new, tsk);
        switch_task_namespaces(tsk, new);
        return 0;
}

/*
 * Validate the namespace flags handed to setns().
 *
 * Returns -EINVAL when @flags is zero, contains a bit other than the
 * CLONE_NEW* flags listed below, or requests a namespace type whose
 * config option is not enabled in this build.  Returns 0 otherwise.
 */
static int check_setns_flags(unsigned long flags)
{
        const unsigned long known = CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
                                    CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER |
                                    CLONE_NEWPID | CLONE_NEWCGROUP;
        /* Namespace types compiled out of this kernel. */
        unsigned long unsupported = 0;

        if (!flags)
                return -EINVAL;
        if (flags & ~known)
                return -EINVAL;

#ifndef CONFIG_USER_NS
        unsupported |= CLONE_NEWUSER;
#endif
#ifndef CONFIG_PID_NS
        unsupported |= CLONE_NEWPID;
#endif
#ifndef CONFIG_UTS_NS
        unsupported |= CLONE_NEWUTS;
#endif
#ifndef CONFIG_IPC_NS
        unsupported |= CLONE_NEWIPC;
#endif
#ifndef CONFIG_CGROUPS
        unsupported |= CLONE_NEWCGROUP;
#endif
#ifndef CONFIG_NET_NS
        unsupported |= CLONE_NEWNET;
#endif
#ifndef CONFIG_TIME_NS
        unsupported |= CLONE_NEWTIME;
#endif

        if (flags & unsupported)
                return -EINVAL;

        return 0;
}

/*
 * Release whatever @nsset still owns, as described by nsset->flags:
 * the credentials when CLONE_NEWUSER is set, the fs_struct when it is a
 * temporary copy, and the nsproxy when one is still attached.
 */
static void put_nsset(struct nsset *nsset)
{
        const unsigned flags = nsset->flags;
        /*
         * We only created a temporary copy if we attached to more than just
         * the mount namespace.
         */
        const int fs_is_copy = (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS);

        if (flags & CLONE_NEWUSER)
                put_cred(nsset_cred(nsset));

        if (fs_is_copy && nsset->fs)
                free_fs_struct(nsset->fs);

        if (nsset->nsproxy)
                free_nsproxy(nsset->nsproxy);
}

/*
 * Set up @nsset for a setns() operation covering @flags.
 *
 * A fresh nsproxy is created for the current task, credentials are
 * prepared when CLONE_NEWUSER is requested (otherwise the current
 * credentials are referenced), and a temporary fs_struct copy is made
 * when CLONE_NEWNS is combined with any other flag.
 *
 * Returns 0 on success, the error from create_new_namespaces(), or
 * -ENOMEM when the credentials or the fs_struct copy cannot be
 * allocated.  On failure everything acquired here has been released
 * and @nsset must not be handed to put_nsset() again.
 */
static int prepare_nsset(unsigned flags, struct nsset *nsset)
{
        struct task_struct *me = current;

        nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs);
        if (IS_ERR(nsset->nsproxy))
                return PTR_ERR(nsset->nsproxy);

        if (flags & CLONE_NEWUSER)
                nsset->cred = prepare_creds();
        else
                nsset->cred = current_cred();
        if (!nsset->cred)
                goto out;

        /*
         * Record the flags as soon as the credentials are in place.
         * put_nsset() consults nsset->flags to decide what to release;
         * if the flags were still zero when copy_fs_struct() fails
         * below, the credentials obtained from prepare_creds() would
         * never be dropped.
         */
        nsset->flags = flags;

        /* Only create a temporary copy of fs_struct if we really need to. */
        if (flags == CLONE_NEWNS) {
                nsset->fs = me->fs;
        } else if (flags & CLONE_NEWNS) {
                nsset->fs = copy_fs_struct(me->fs);
                if (!nsset->fs)
                        goto out;
        }

        return 0;

out:
        put_nsset(nsset);
        return -ENOMEM;
}

/*
 * Invoke the ->install() operation of @ns with @nsset and return its
 * result.
 */
static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
{
        return ns->ops->install(nsset, ns);
}

/*
 * Install into @nsset the namespaces of the task identified by @pid,
 * one per flag set in nsset->flags.
 *
 * This is the inverse operation to unshare().
 * Ordering is equivalent to the standard ordering used everywhere else
 * during unshare and process creation. The switch to the new set of
 * namespaces occurs at the point of no return after installation of
 * all requested namespaces was successful in commit_nsset().
 *
 * Returns 0 on success, -ESRCH if the task cannot be found or no longer
 * has an nsproxy (or, with CLONE_NEWPID, an active pid namespace),
 * -EPERM if ptrace_may_access() denies read access, or the first error
 * returned by a namespace's ->install() operation.
 */
static int validate_nsset(struct nsset *nsset, struct pid *pid)
{
        int ret = 0;
        unsigned flags = nsset->flags;
        struct user_namespace *user_ns = NULL;
        struct pid_namespace *pid_ns = NULL;
        struct nsproxy *nsp;
        struct task_struct *tsk;

        /* Take a "snapshot" of the target task's namespaces. */
        rcu_read_lock();
        tsk = pid_task(pid, PIDTYPE_PID);
        if (!tsk) {
                rcu_read_unlock();
                return -ESRCH;
        }

        if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) {
                rcu_read_unlock();
                return -EPERM;
        }

        /* Pin the target's nsproxy while holding its task lock. */
        task_lock(tsk);
        nsp = tsk->nsproxy;
        if (nsp)
                get_nsproxy(nsp);
        task_unlock(tsk);
        if (!nsp) {
                rcu_read_unlock();
                return -ESRCH;
        }

#ifdef CONFIG_PID_NS
        if (flags & CLONE_NEWPID) {
                pid_ns = task_active_pid_ns(tsk);
                if (unlikely(!pid_ns)) {
                        rcu_read_unlock();
                        ret = -ESRCH;
                        goto out;
                }
                get_pid_ns(pid_ns);
        }
#endif

#ifdef CONFIG_USER_NS
        if (flags & CLONE_NEWUSER)
                user_ns = get_user_ns(__task_cred(tsk)->user_ns);
#endif
        /* Everything needed from @tsk is now pinned by its own reference. */
        rcu_read_unlock();

        /*
         * Install requested namespaces. The caller will have
         * verified earlier that the requested namespaces are
         * supported on this kernel. We don't report errors here
         * if a namespace is requested that isn't supported.
         */
#ifdef CONFIG_USER_NS
        if (flags & CLONE_NEWUSER) {
                ret = validate_ns(nsset, &user_ns->ns);
                if (ret)
                        goto out;
        }
#endif

        if (flags & CLONE_NEWNS) {
                ret = validate_ns(nsset, from_mnt_ns(nsp->mnt_ns));
                if (ret)
                        goto out;
        }

#ifdef CONFIG_UTS_NS
        if (flags & CLONE_NEWUTS) {
                ret = validate_ns(nsset, &nsp->uts_ns->ns);
                if (ret)
                        goto out;
        }
#endif

#ifdef CONFIG_IPC_NS
        if (flags & CLONE_NEWIPC) {
                ret = validate_ns(nsset, &nsp->ipc_ns->ns);
                if (ret)
                        goto out;
        }
#endif

#ifdef CONFIG_PID_NS
        if (flags & CLONE_NEWPID) {
                ret = validate_ns(nsset, &pid_ns->ns);
                if (ret)
                        goto out;
        }
#endif

#ifdef CONFIG_CGROUPS
        if (flags & CLONE_NEWCGROUP) {
                ret = validate_ns(nsset, &nsp->cgroup_ns->ns);
                if (ret)
                        goto out;
        }
#endif

#ifdef CONFIG_NET_NS
        if (flags & CLONE_NEWNET) {
                ret = validate_ns(nsset, &nsp->net_ns->ns);
                if (ret)
                        goto out;
        }
#endif

#ifdef CONFIG_TIME_NS
        if (flags & CLONE_NEWTIME) {
                ret = validate_ns(nsset, &nsp->time_ns->ns);
                if (ret)
                        goto out;
        }
#endif

        /* Drop the snapshot references on both the success and error paths. */
out:
        if (pid_ns)
                put_pid_ns(pid_ns);
        if (nsp)
                put_nsproxy(nsp);
        put_user_ns(user_ns);

        return ret;
}

/*
 * Make the namespaces collected in @nsset the current task's.
 *
 * This is the point of no return. There are just a few namespaces
 * that do some actual work here and it's sufficiently minimal that
 * a separate ns_common operation seems unnecessary for now.
 * Unshare is doing the same thing. If we'll end up needing to do
 * more in a given namespace or a helper here is ultimately not
 * exported anymore a simple commit handler for each namespace
 * should be added to ns_common.
 *
 * Ownership of the credentials (with CLONE_NEWUSER) and of the nsproxy
 * is handed over here; the corresponding @nsset members are set to
 * NULL afterwards.
 */
static void commit_nsset(struct nsset *nsset)
{
        unsigned flags = nsset->flags;
        struct task_struct *me = current;

#ifdef CONFIG_USER_NS
        if (flags & CLONE_NEWUSER) {
                /* transfer ownership */
                commit_creds(nsset_cred(nsset));
                nsset->cred = NULL;
        }
#endif

        /* We only need to commit if we have used a temporary fs_struct. */
        if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) {
                set_fs_root(me->fs, &nsset->fs->root);
                set_fs_pwd(me->fs, &nsset->fs->pwd);
        }

#ifdef CONFIG_IPC_NS
        if (flags & CLONE_NEWIPC)
                exit_sem(me);
#endif

#ifdef CONFIG_TIME_NS
        if (flags & CLONE_NEWTIME)
                timens_commit(me, nsset->nsproxy->time_ns);
#endif

        /* transfer ownership */
        switch_task_namespaces(me, nsset->nsproxy);
        nsset->nsproxy = NULL;
}

/*
 * setns() system call: move the calling task into the namespace(s)
 * referred to by @fd.
 *
 * @fd is either a namespace file (proc_ns_file()) or a pidfd.  For a
 * namespace file, a non-zero @flags must equal the namespace's type,
 * and that type is then used as the effective flag set.  For a pidfd,
 * @flags is validated by check_setns_flags().  Any other kind of file
 * yields -EINVAL.
 *
 * Returns 0 on success, -EBADF if @fd is not an open file, or the
 * error from flag checking, prepare_nsset() or the validation step.
 */
SYSCALL_DEFINE2(setns, int, fd, int, flags)
{
        struct fd f = fdget(fd);
        struct ns_common *ns = NULL;
        struct nsset nsset = {};
        int err = 0;

        if (!f.file)
                return -EBADF;

        if (proc_ns_file(f.file)) {
                ns = get_proc_ns(file_inode(f.file));
                if (flags && (ns->ops->type != flags))
                        err = -EINVAL;
                flags = ns->ops->type;
        } else if (!IS_ERR(pidfd_pid(f.file))) {
                err = check_setns_flags(flags);
        } else {
                err = -EINVAL;
        }
        if (err)
                goto out;

        err = prepare_nsset(flags, &nsset);
        if (err)
                goto out;

        /* Single namespace file: install just it; pidfd: install every requested type. */
        if (proc_ns_file(f.file))
                err = validate_ns(&nsset, ns);
        else
                err = validate_nsset(&nsset, pidfd_pid(f.file));
        if (!err) {
                commit_nsset(&nsset);
                perf_event_namespaces(current);
        }
        /* Releases whatever commit_nsset() did not take ownership of. */
        put_nsset(&nsset);
out:
        fdput(f);
        return err;
}

/*
 * Create the slab cache used for nsproxy allocations, with the
 * SLAB_PANIC and SLAB_ACCOUNT flags.  Always returns 0.
 */
int __init nsproxy_cache_init(void)
{
        nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT);
        return 0;
}

































































































































































































































































































































































































































    1 










    1 

    1 





    1 











    1 




    1 






























































    1 






    1 









































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/tcp.h>
#include <linux/rcupdate.h>
#include <net/tcp.h>

/*
 * Install a randomly generated primary Fast Open key for @net unless a
 * key context is already present.
 */
void tcp_fastopen_init_key_once(struct net *net)
{
        struct tcp_fastopen_context *existing;
        u8 random_key[TCP_FASTOPEN_KEY_LENGTH];

        /* The pointer is only tested for NULL, never dereferenced. */
        rcu_read_lock();
        existing = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
        rcu_read_unlock();
        if (existing)
                return;

        /* tcp_fastopen_reset_cipher publishes the new context
         * atomically, so we allow this race happening here.
         *
         * All call sites of tcp_fastopen_cookie_gen also check
         * for a valid cookie, so this is an acceptable risk.
         */
        get_random_bytes(random_key, sizeof(random_key));
        tcp_fastopen_reset_cipher(net, NULL, random_key, NULL);
}

/*
 * RCU callback: free the tcp_fastopen_context that embeds @head, using
 * kfree_sensitive().
 */
static void tcp_fastopen_ctx_free(struct rcu_head *head)
{
        kfree_sensitive(container_of(head, struct tcp_fastopen_context, rcu));
}

void tcp_fastopen_destroy_cipher(struct sock *sk)
{
        struct tcp_fastopen_context *ctx;

        ctx = rcu_dereference_protected(
                        inet_csk(sk)->icsk_accept_queue.fastopenq.ctx, 1);
        if (ctx)
                call_rcu(&ctx->rcu, tcp_fastopen_ctx_free);
}

/*
 * Atomically detach @net's Fast Open key context and, if there was
 * one, schedule it for freeing through call_rcu().
 */
void tcp_fastopen_ctx_destroy(struct net *net)
{
        struct tcp_fastopen_context *old;

        old = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx, NULL));
        if (!old)
                return;

        call_rcu(&old->rcu, tcp_fastopen_ctx_free);
}

/*
 * Build a new Fast Open key context from @primary_key and the optional
 * @backup_key, then atomically publish it on @sk's accept queue when
 * @sk is given, or on @net otherwise.  Each key is read as two
 * unaligned little-endian 64-bit words.  The context that was replaced,
 * if any, is freed through call_rcu().
 *
 * Returns 0 on success or -ENOMEM if the context cannot be allocated.
 */
int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
                              void *primary_key, void *backup_key)
{
        struct tcp_fastopen_context *fresh, *stale;

        fresh = kmalloc(sizeof(*fresh), GFP_KERNEL);
        if (!fresh)
                return -ENOMEM;

        fresh->num = backup_key ? 2 : 1;
        fresh->key[0].key[0] = get_unaligned_le64(primary_key);
        fresh->key[0].key[1] = get_unaligned_le64(primary_key + 8);
        if (backup_key) {
                fresh->key[1].key[0] = get_unaligned_le64(backup_key);
                fresh->key[1].key[1] = get_unaligned_le64(backup_key + 8);
        }

        if (sk) {
                struct fastopen_queue *q;

                q = &inet_csk(sk)->icsk_accept_queue.fastopenq;
                stale = unrcu_pointer(xchg(&q->ctx, RCU_INITIALIZER(fresh)));
        } else {
                stale = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx,
                                           RCU_INITIALIZER(fresh)));
        }

        if (stale)
                call_rcu(&stale->rcu, tcp_fastopen_ctx_free);

        return 0;
}

/*
 * Copy the Fast Open keys of @icsk's accept queue (when @icsk is
 * given) or of @net into @key.  Every key is written as two unaligned
 * little-endian 64-bit words, so key i occupies key[2 * i] and
 * key[2 * i + 1].
 *
 * Returns the number of keys written, 0 when no context is installed.
 */
int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk,
                            u64 *key)
{
        struct tcp_fastopen_context *ctx;
        int n_keys = 0, i;

        rcu_read_lock();
        if (icsk)
                ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
        else
                ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx);

        if (ctx) {
                n_keys = tcp_fastopen_context_len(ctx);
                for (i = 0; i < n_keys; i++) {
                        u64 *slot = key + (i * 2);

                        put_unaligned_le64(ctx->key[i].key[0], slot);
                        put_unaligned_le64(ctx->key[i].key[1], slot + 1);
                }
        }
        rcu_read_unlock();

        return n_keys;
}

static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
                                             struct sk_buff *syn,
                                             const siphash_key_t *key,
                                             struct tcp_fastopen_cookie *foc)
{
        BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64));

        if (req->rsk_ops->family == AF_INET) {
                const struct iphdr *iph = ip_hdr(syn);

                foc->val[0] = cpu_to_le64(siphash(&iph->saddr,
                                          sizeof(iph->saddr) +
                                          sizeof(iph->daddr),
                                          key));
                foc->len = TCP_FASTOPEN_COOKIE_SIZE;
                return true;
        }
#if IS_ENABLED(CONFIG_IPV6)
        if (req->rsk_ops->family == AF_INET6) {
                const struct ipv6hdr *ip6h = ipv6_hdr(syn);

                foc->val[0] = cpu_to_le64(siphash(&ip6h->saddr,
                                          sizeof(ip6h->saddr) +
                                          sizeof(ip6h->daddr),
                                          key));
                foc->len = TCP_FASTOPEN_COOKIE_SIZE;
                return true;
        }
#endif
        return false;
}

/* Generate the fastopen cookie by applying SipHash to both the source and
 * destination addresses.
 */
static void tcp_fastopen_cookie_gen(struct sock *sk,
                                    struct request_sock *req,
                                    struct sk_buff *syn,
                                    struct tcp_fastopen_cookie *foc)
{
        struct tcp_fastopen_context *ctx;

        rcu_read_lock();
        ctx = tcp_fastopen_get_ctx(sk);
        if (ctx)
                __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[0], foc);
        rcu_read_unlock();
}

/* If an incoming SYN or SYNACK frame contains a payload and/or FIN,
 * queue this additional data / FIN.
 *
 * Nothing is queued when the frame's end_seq already equals
 * tp->rcv_nxt, or when the skb cannot be cloned.  Otherwise a clone of
 * @skb, with its TCP header pulled and the SYN flag cleared, is
 * appended to @sk's receive queue and rcv_nxt, syn_data_acked and
 * bytes_received are updated accordingly.
 */
void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
                return;

        /* From here on, work on a private clone, not the caller's skb. */
        skb = skb_clone(skb, GFP_ATOMIC);
        if (!skb)
                return;

        skb_dst_drop(skb);
        /* segs_in has been initialized to 1 in tcp_create_openreq_child().
         * Hence, reset segs_in to 0 before calling tcp_segs_in()
         * to avoid double counting.  Also, tcp_segs_in() expects
         * skb->len to include the tcp_hdrlen.  Hence, it should
         * be called before __skb_pull().
         */
        tp->segs_in = 0;
        tcp_segs_in(tp, skb);
        __skb_pull(skb, tcp_hdrlen(skb));
        sk_forced_mem_schedule(sk, skb->truesize);
        skb_set_owner_r(skb, sk);

        /* Advance seq by one and clear the SYN flag on the queued clone. */
        TCP_SKB_CB(skb)->seq++;
        TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;

        tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        tp->syn_data_acked = 1;

        /* u64_stats_update_begin(&tp->syncp) not needed here,
         * as we certainly are not changing upper 32bit value (0)
         */
        tp->bytes_received = skb->len;

        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                tcp_fin(sk);
}

/* returns 0 - no key match, 1 for primary, 2 for backup */
static int tcp_fastopen_cookie_gen_check(struct sock *sk,
                                         struct request_sock *req,
                                         struct sk_buff *syn,
                                         struct tcp_fastopen_cookie *orig,
                                         struct tcp_fastopen_cookie *valid_foc)
{
        struct tcp_fastopen_cookie search_foc = { .len = -1 };
        struct tcp_fastopen_cookie *foc = valid_foc;
        struct tcp_fastopen_context *ctx;
        int i, ret = 0;

        rcu_read_lock();
        ctx = tcp_fastopen_get_ctx(sk);
        if (!ctx)
                goto out;
        for (i = 0; i < tcp_fastopen_context_len(ctx); i++) {
                __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[i], foc);
                if (tcp_fastopen_cookie_match(foc, orig)) {
                        ret = i + 1;
                        goto out;
                }
                foc = &search_foc;
        }
out:
        rcu_read_unlock();
        return ret;
}

/* Create the child socket for a Fast Open request @req that arrived on
 * listener @sk in SYN @skb.
 *
 * The child is obtained from the listener's ->syn_recv_sock() handler.
 * On success the listener's fastopenq.qlen is incremented, the child
 * is linked to @req, its retransmit timer is armed and any data or FIN
 * carried by the SYN is queued on it.
 *
 * Returns the child socket, or NULL if ->syn_recv_sock() fails.
 */
static struct sock *tcp_fastopen_create_child(struct sock *sk,
                                              struct sk_buff *skb,
                                              struct request_sock *req)
{
        struct tcp_sock *tp;
        struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
        struct sock *child;
        bool own_req;

        child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
                                                         NULL, &own_req);
        if (!child)
                return NULL;

        spin_lock(&queue->fastopenq.lock);
        queue->fastopenq.qlen++;
        spin_unlock(&queue->fastopenq.lock);

        /* Initialize the child socket. Have to fix some values to take
         * into account the child is a Fast Open socket and is created
         * only out of the bits carried in the SYN packet.
         */
        tp = tcp_sk(child);

        rcu_assign_pointer(tp->fastopen_rsk, req);
        tcp_rsk(req)->tfo_listener = true;

        /* RFC1323: The window in SYN & SYN/ACK segments is never
         * scaled. So correct it appropriately.
         */
        tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
        tp->max_window = tp->snd_wnd;

        /* Activate the retrans timer so that SYNACK can be retransmitted.
         * The request socket is not added to the ehash
         * because it's been added to the accept queue directly.
         */
        req->timeout = tcp_timeout_init(child);
        inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
                                  req->timeout, TCP_RTO_MAX);

        refcount_set(&req->rsk_refcnt, 2);

        /* Now finish processing the fastopen child socket. */
        tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb);

        tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;

        tcp_fastopen_add_skb(child, skb);

        /* tcp_fastopen_add_skb() may have advanced rcv_nxt; mirror it. */
        tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
        tp->rcv_wup = tp->rcv_nxt;
        /* tcp_conn_request() is sending the SYNACK,
         * and queues the child into listener accept queue.
         */
        return child;
}

/* Decide whether listener @sk may take another Fast Open request.
 *
 * Returns false when the listener's max_qlen is zero, or when the
 * queue is full and the head of the rskq_rst_head list is missing or
 * has a timer that has not expired yet.  When the queue is full but
 * that head entry has expired, the entry is removed to make room and
 * true is returned.
 */
static bool tcp_fastopen_queue_check(struct sock *sk)
{
        struct fastopen_queue *fastopenq;
        int max_qlen;

        /* Make sure the listener has enabled fastopen, and we don't
         * exceed the max # of pending TFO requests allowed before trying
         * to validate the cookie in order to avoid burning CPU cycles
         * unnecessarily.
         *
         * XXX (TFO) - The implication of checking the max_qlen before
         * processing a cookie request is that clients can't differentiate
         * between qlen overflow causing Fast Open to be disabled
         * temporarily vs a server not supporting Fast Open at all.
         */
        fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq;
        max_qlen = READ_ONCE(fastopenq->max_qlen);
        if (max_qlen == 0)
                return false;

        if (fastopenq->qlen >= max_qlen) {
                struct request_sock *req1;
                spin_lock(&fastopenq->lock);
                req1 = fastopenq->rskq_rst_head;
                if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) {
                        __NET_INC_STATS(sock_net(sk),
                                        LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
                        spin_unlock(&fastopenq->lock);
                        return false;
                }
                /* Unlink the expired head entry and drop its reference. */
                fastopenq->rskq_rst_head = req1->dl_next;
                fastopenq->qlen--;
                spin_unlock(&fastopenq->lock);
                reqsk_put(req1);
        }
        return true;
}

static bool tcp_fastopen_no_cookie(const struct sock *sk,
                                   const struct dst_entry *dst,
                                   int flag)
{
        return (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & flag) ||
               tcp_sk(sk)->fastopen_no_cookie ||
               (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE));
}

/* Try to perform Fast Open on the SYN @skb received by listener @sk.
 *
 * Returns the Fast Open child socket when one was created, NULL
 * otherwise.  The cookie (@foc) may be updated so that it can be
 * returned to the client in the SYN-ACK later, e.g. for a Fast Open
 * cookie request (foc->len == 0).  foc->len is set to -1 when server
 * side Fast Open is not applicable, and also on success unless the
 * cookie matched the backup key.
 */
struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
                              const struct dst_entry *dst)
{
        bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
        int tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen);
        struct tcp_fastopen_cookie valid_foc = { .len = -1 };
        struct sock *child;
        int ret = 0;

        if (foc->len == 0) /* Client requests a cookie */
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);

        /* Bail out unless the server side is enabled, the SYN carries
         * data or a Fast Open option, and the listener queue has room.
         */
        if (!((tcp_fastopen & TFO_SERVER_ENABLE) &&
              (syn_data || foc->len >= 0) &&
              tcp_fastopen_queue_check(sk))) {
                foc->len = -1;
                return NULL;
        }

        if (tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD))
                goto fastopen;

        if (foc->len == 0) {
                /* Client requests a cookie. */
                tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc);
        } else if (foc->len > 0) {
                ret = tcp_fastopen_cookie_gen_check(sk, req, skb, foc,
                                                    &valid_foc);
                if (!ret) {
                        NET_INC_STATS(sock_net(sk),
                                      LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
                } else {
                        /* Cookie is valid. Create a (full) child socket to
                         * accept the data in SYN before returning a SYN-ACK to
                         * ack the data. If we fail to create the socket, fall
                         * back and ack the ISN only but includes the same
                         * cookie.
                         *
                         * Note: Data-less SYN with valid cookie is allowed to
                         * send data in SYN_RECV state.
                         */
fastopen:
                        child = tcp_fastopen_create_child(sk, skb, req);
                        if (child) {
                                /* ret == 2: the cookie matched the backup
                                 * key, so hand the cookie generated with the
                                 * primary key back through @foc.
                                 */
                                if (ret == 2) {
                                        valid_foc.exp = foc->exp;
                                        *foc = valid_foc;
                                        NET_INC_STATS(sock_net(sk),
                                                      LINUX_MIB_TCPFASTOPENPASSIVEALTKEY);
                                } else {
                                        foc->len = -1;
                                }
                                NET_INC_STATS(sock_net(sk),
                                              LINUX_MIB_TCPFASTOPENPASSIVE);
                                return child;
                        }
                        NET_INC_STATS(sock_net(sk),
                                      LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
                }
        }
        /* No child created: report valid_foc (len -1 unless generated). */
        valid_foc.exp = foc->exp;
        *foc = valid_foc;
        return NULL;
}

/* Decide whether an active (client side) Fast Open attempt may proceed.
 *
 * tcp_fastopen_cache_get() is called first with @mss and @cookie.
 *
 * Returns false with cookie->len set to -1 when
 * tcp_fastopen_active_should_disable() reports that active TFO is paused.
 * Returns true with cookie->len set to -1 when tcp_fastopen_no_cookie()
 * accepts TFO_CLIENT_NO_COOKIE for this socket/route.
 * Returns true when cookie->len is positive.
 * Otherwise records TFO_COOKIE_UNAVAILABLE in fastopen_client_fail and
 * returns false.
 */
bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
                               struct tcp_fastopen_cookie *cookie)
{
        bool have_cookie;

        tcp_fastopen_cache_get(sk, mss, cookie);

        /* Firewall blackhole issue check */
        if (tcp_fastopen_active_should_disable(sk)) {
                cookie->len = -1;
                return false;
        }

        if (tcp_fastopen_no_cookie(sk, __sk_dst_get(sk),
                                   TFO_CLIENT_NO_COOKIE)) {
                cookie->len = -1;
                return true;
        }

        have_cookie = cookie->len > 0;
        if (!have_cookie)
                tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE;

        return have_cookie;
}

/* Decide whether sending the SYN should be deferred until the first
 * write().  Deferral happens only when both hold:
 * 1. the fastopen_connect flag is set and no fastopen_req exists yet
 * 2. tcp_fastopen_cookie_check() succeeds
 *
 * When the cookie check fails, a zeroed fastopen_req carrying the looked-up
 * cookie is allocated so that the FO option is included in the SYN; if that
 * allocation fails, *err is set to -ENOBUFS.
 *
 * Return value: true if the SYN is deferred until the application writes
 *               data (DEFER_CONNECT is set on the socket),
 *               false if the SYN should go out immediately.
 */
bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
{
        struct tcp_fastopen_cookie cookie = { .len = 0 };
        struct tcp_sock *tp = tcp_sk(sk);
        u16 mss;

        /* Nothing to do unless deferral was requested and is still pending. */
        if (!tp->fastopen_connect || tp->fastopen_req)
                return false;

        if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
                inet_set_bit(DEFER_CONNECT, sk);
                return true;
        }

        /* Alloc fastopen_req in order for FO option to be included
         * in SYN
         */
        tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
                                   sk->sk_allocation);
        if (!tp->fastopen_req) {
                *err = -ENOBUFS;
                return false;
        }

        tp->fastopen_req->cookie = cookie;
        return false;
}
EXPORT_SYMBOL(tcp_fastopen_defer_connect);

/*
 * The following code block is to deal with middle box issues with TFO:
 * Middlebox firewall issues can potentially cause server's data being
 * blackholed after a successful 3WHS using TFO.
 * The proposed solution is to disable active TFO globally under the
 * following circumstances:
 *   1. client side TFO socket receives out of order FIN
 *   2. client side TFO socket receives out of order RST
 *   3. client side TFO socket has timed out three times consecutively during
 *      or after handshake
 * We disable active-side TFO globally for 1 hour at first. If it
 * happens again, we disable it for 2 hours, then 4 hours, 8 hours, ...
 * The timeout is reset back to 1 hour once we see a successful active
 * TFO connection with data exchanged.
 */

/* Disable active TFO: record the current jiffies in
 * tfo_active_disable_stamp and bump tfo_active_disable_times for the
 * socket's network namespace.
 *
 * Does nothing when sysctl_tcp_fastopen_blackhole_timeout is zero.
 * Also increments the LINUX_MIB_TCPFASTOPENBLACKHOLE counter.
 */
void tcp_fastopen_active_disable(struct sock *sk)
{
        struct net *net = sock_net(sk);

        /* A zero blackhole timeout means the mechanism is switched off. */
        if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout))
                return;

        /* Paired with READ_ONCE() in tcp_fastopen_active_should_disable() */
        WRITE_ONCE(net->ipv4.tfo_active_disable_stamp, jiffies);

        /* Paired with smp_rmb() in tcp_fastopen_active_should_disable().
         * We want net->ipv4.tfo_active_disable_stamp to be updated first,
         * so the stamp store must stay ahead of the counter increment.
         */
        smp_mb__before_atomic();
        atomic_inc(&net->ipv4.tfo_active_disable_times);

        NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE);
}

/* Calculate timeout for tfo active disable
 * Return true if we are still in the active TFO disable period
 * Return false if timeout already expired and we should use active TFO
 */
bool tcp_fastopen_active_should_disable(struct sock *sk)
{
        unsigned int tfo_bh_timeout =
                READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout);
        unsigned long timeout;
        int tfo_da_times;
        int multiplier;

        if (!tfo_bh_timeout)
                return false;

        tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times);
        if (!tfo_da_times)
                return false;

        /* Paired with smp_mb__before_atomic() in tcp_fastopen_active_disable() */
        smp_rmb();

        /* Limit timeout to max: 2^6 * initial timeout */
        multiplier = 1 << min(tfo_da_times - 1, 6);

        /* Paired with the WRITE_ONCE() in tcp_fastopen_active_disable(). */
        timeout = READ_ONCE(sock_net(sk)->ipv4.tfo_active_disable_stamp) +
                  multiplier * tfo_bh_timeout * HZ;
        if (time_before(jiffies, timeout))
                return true;

        /* Mark check bit so we can check for successful active TFO
         * condition and reset tfo_active_disable_times
         */
        tcp_sk(sk)->syn_fastopen_ch = 1;
        return false;
}

/* Disable active TFO if a FIN is the only packet in the out-of-order queue
 * and no data segment has been received.
 *
 * Otherwise, on a socket marked with syn_fastopen_ch that did receive data,
 * reset a non-zero tfo_active_disable_times to 0 unless the socket's route
 * goes through a loopback device.
 *
 * Sockets that did not use Fast Open in their SYN (syn_fastopen clear) are
 * ignored.
 */
void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *first;
        struct dst_entry *dst;

        if (!tp->syn_fastopen)
                return;

        if (!tp->data_segs_in) {
                /* Only a lone FIN in the ofo queue counts as a blackhole. */
                first = skb_rb_first(&tp->out_of_order_queue);
                if (first && !skb_rb_next(first) &&
                    (TCP_SKB_CB(first)->tcp_flags & TCPHDR_FIN))
                        tcp_fastopen_active_disable(sk);
                return;
        }

        if (!tp->syn_fastopen_ch ||
            !atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times))
                return;

        dst = sk_dst_get(sk);
        if (!dst || !dst->dev || !(dst->dev->flags & IFF_LOOPBACK))
                atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0);
        dst_release(dst);
}

/* Pause active Fast Open when a Fast Open connection keeps timing out.
 *
 * @expired: supplied by the caller; combined with a retransmit count below
 *           two it also triggers the pause.
 *
 * Applies only to sockets with syn_fastopen, syn_data or syn_data_acked set.
 * On a hit, calls tcp_fastopen_active_disable() and increments
 * LINUX_MIB_TCPFASTOPENACTIVEFAIL.
 */
void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired)
{
        u32 timeouts = inet_csk(sk)->icsk_retransmits;
        struct tcp_sock *tp = tcp_sk(sk);

        if (!tp->syn_fastopen && !tp->syn_data && !tp->syn_data_acked)
                return;

        /* Broken middle-boxes may black-hole Fast Open connection during or
         * even after the handshake. Be extremely conservative and pause
         * Fast Open globally after hitting the third consecutive timeout or
         * exceeding the configured timeout limit.
         */
        if (timeouts != 2 && !(timeouts < 2 && expired))
                return;

        tcp_fastopen_active_disable(sk);
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}













































































































































































































































































































































































































    1 










    1 


























    1 
















































































































    1 





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *
 * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
 * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
 * Copyright (C) Terry Dawson VK2KTJ (terry@animats.net)
 * Copyright (C) Tomi Manninen OH2BNS (oh2bns@sral.fi)
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/stat.h>
#include <net/net_namespace.h>
#include <net/ax25.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/uaccess.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <net/rose.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/tcp_states.h>
#include <net/ip.h>
#include <net/arp.h>

static int rose_ndevs = 10;

/*
 * Global ROSE tunables, each initialised to its ROSE_DEFAULT_* constant.
 * rose_create() passes the call/reset/clear request, ack hold-back and
 * no-activity values through msecs_to_jiffies() to seed a new socket's
 * t1/t2/t3/hb/idle timers; rose_new_lci() uses sysctl_rose_maximum_vcs as
 * the upper bound of the LCI search.
 * NOTE(review): the sysctl_ prefix suggests these are exported through a
 * sysctl table defined elsewhere - confirm.
 */
int sysctl_rose_restart_request_timeout = ROSE_DEFAULT_T0;
int sysctl_rose_call_request_timeout    = ROSE_DEFAULT_T1;
int sysctl_rose_reset_request_timeout   = ROSE_DEFAULT_T2;
int sysctl_rose_clear_request_timeout   = ROSE_DEFAULT_T3;
int sysctl_rose_no_activity_timeout     = ROSE_DEFAULT_IDLE;
int sysctl_rose_ack_hold_back_timeout   = ROSE_DEFAULT_HB;
int sysctl_rose_routing_control         = ROSE_DEFAULT_ROUTING;
int sysctl_rose_link_fail_timeout       = ROSE_DEFAULT_FAIL_TIMEOUT;
int sysctl_rose_maximum_vcs             = ROSE_DEFAULT_MAXVC;
int sysctl_rose_window_size             = ROSE_DEFAULT_WINDOW_SIZE;

static HLIST_HEAD(rose_list);
static DEFINE_SPINLOCK(rose_list_lock);

static const struct proto_ops rose_proto_ops;

ax25_address rose_callsign;

/*
 * ROSE network devices are virtual network devices encapsulating ROSE
 * frames into AX.25 which will be sent through an AX.25 device, so form a
 * special "super class" of normal net devices; split their locks off into a
 * separate class since they always nest.
 */
static struct lock_class_key rose_netdev_xmit_lock_key;
static struct lock_class_key rose_netdev_addr_lock_key;

/*
 * Per-queue callback for netdev_for_each_tx_queue(): put the queue's
 * _xmit_lock into the ROSE-specific lockdep class.  @dev and @_unused are
 * not used.
 */
static void rose_set_lockdep_one(struct net_device *dev,
                                 struct netdev_queue *txq,
                                 void *_unused)
{
        lockdep_set_class(&txq->_xmit_lock, &rose_netdev_xmit_lock_key);
}

/*
 * Assign the ROSE lockdep classes to @dev: one for its addr_list_lock and,
 * through rose_set_lockdep_one(), one for the _xmit_lock of every TX queue.
 */
static void rose_set_lockdep_key(struct net_device *dev)
{
        lockdep_set_class(&dev->addr_list_lock, &rose_netdev_addr_lock_key);
        netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL);
}

/*
 *        Render a ROSE address as text in @buf and return @buf.
 *
 *        An address whose five bytes are all zero is written as "*";
 *        anything else is written as ten upper-case hex digits.  @buf must
 *        be large enough for ten characters plus the terminating NUL; no
 *        bound is checked here.
 */
char *rose2asc(char *buf, const rose_address *addr)
{
        bool all_zero = true;
        int i;

        for (i = 0; i < 5; i++) {
                if (addr->rose_addr[i] != 0x00) {
                        all_zero = false;
                        break;
                }
        }

        if (all_zero) {
                strcpy(buf, "*");
                return buf;
        }

        sprintf(buf, "%02X%02X%02X%02X%02X",
                addr->rose_addr[0] & 0xFF,
                addr->rose_addr[1] & 0xFF,
                addr->rose_addr[2] & 0xFF,
                addr->rose_addr[3] & 0xFF,
                addr->rose_addr[4] & 0xFF);

        return buf;
}

/*
 *        Compare the five bytes of two ROSE addresses.
 *        Returns 0 when they are identical, 1 at the first difference.
 */
int rosecmp(const rose_address *addr1, const rose_address *addr2)
{
        int idx = 0;

        while (idx < 5) {
                if (addr1->rose_addr[idx] != addr2->rose_addr[idx])
                        return 1;
                idx++;
        }

        return 0;
}

/*
 *        Compare only the first @mask digits (4-bit nibbles, high nibble
 *        first) of two ROSE addresses.  Returns 0 when they all match,
 *        1 on the first mismatch or when @mask exceeds 10.
 */
int rosecmpm(const rose_address *addr1, const rose_address *addr2,
             unsigned short mask)
{
        unsigned int digit;

        if (mask > 10)
                return 1;

        for (digit = 0; digit < mask; digit++) {
                unsigned int byte = digit / 2;
                /* Even digits live in the high nibble, odd ones in the low. */
                int nibble = (digit % 2) != 0 ? 0x0F : 0xF0;

                if ((addr1->rose_addr[byte] & nibble) !=
                    (addr2->rose_addr[byte] & nibble))
                        return 1;
        }

        return 0;
}

/*
 *        Unlink @sk from the global rose_list via sk_del_node_init().
 *        The list is modified under rose_list_lock taken with the _bh
 *        variant, so socket removal during an interrupt is safe.
 */
static void rose_remove_socket(struct sock *sk)
{
        spin_lock_bh(&rose_list_lock);
        sk_del_node_init(sk);
        spin_unlock_bh(&rose_list_lock);
}

/*
 *        Kill all bound sockets on a broken link layer connection to a
 *        particular neighbour.
 */
void rose_kill_by_neigh(struct rose_neigh *neigh)
{
        struct sock *s;

        spin_lock_bh(&rose_list_lock);
        sk_for_each(s, &rose_list) {
                struct rose_sock *rose = rose_sk(s);

                if (rose->neighbour == neigh) {
                        rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0);
                        rose->neighbour->use--;
                        rose->neighbour = NULL;
                }
        }
        spin_unlock_bh(&rose_list_lock);
}

/*
 *        Kill all bound sockets on a dropped device.
 *
 *        Works in batches of at most ARRAY_SIZE(array) sockets:
 *        1. under rose_list_lock, collect sockets whose device is @dev and
 *           take a reference on each;
 *        2. with the list lock dropped, lock each collected socket, re-check
 *           that it is still bound to @dev, disconnect it and release its
 *           device reference;
 *        3. drop the reference taken in step 1.
 *        If a batch filled up, the list is scanned again from the start.
 */
static void rose_kill_by_device(struct net_device *dev)
{
        struct sock *sk, *array[16];
        struct rose_sock *rose;
        bool rescan;
        int i, cnt;

start:
        rescan = false;
        cnt = 0;
        spin_lock_bh(&rose_list_lock);
        sk_for_each(sk, &rose_list) {
                rose = rose_sk(sk);
                if (rose->device == dev) {
                        if (cnt == ARRAY_SIZE(array)) {
                                /* Batch full: handle it, then scan again. */
                                rescan = true;
                                break;
                        }
                        sock_hold(sk);
                        array[cnt++] = sk;
                }
        }
        spin_unlock_bh(&rose_list_lock);

        for (i = 0; i < cnt; i++) {
                /* Index with the loop counter: array[cnt] is one past the
                 * last collected entry and must never be read.
                 */
                sk = array[i];
                rose = rose_sk(sk);
                lock_sock(sk);
                spin_lock_bh(&rose_list_lock);
                /* Re-check: the binding may have changed while unlocked. */
                if (rose->device == dev) {
                        rose_disconnect(sk, ENETUNREACH, ROSE_OUT_OF_ORDER, 0);
                        if (rose->neighbour)
                                rose->neighbour->use--;
                        netdev_put(rose->device, &rose->dev_tracker);
                        rose->device = NULL;
                }
                spin_unlock_bh(&rose_list_lock);
                release_sock(sk);
                sock_put(sk);
                cond_resched();
        }
        if (rescan)
                goto start;
}

/*
 *        Netdevice notifier callback.  Only NETDEV_DOWN events for devices
 *        in init_net are acted upon: a ROSE device has its bound sockets
 *        killed, an AX.25 device has its ROSE links and routes taken down.
 *        Always returns NOTIFY_DONE.
 */
static int rose_device_event(struct notifier_block *this,
                             unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        if (!net_eq(dev_net(dev), &init_net) || event != NETDEV_DOWN)
                return NOTIFY_DONE;

        if (dev->type == ARPHRD_ROSE) {
                rose_kill_by_device(dev);
        } else if (dev->type == ARPHRD_AX25) {
                rose_link_device_down(dev);
                rose_rt_device_down(dev);
        }

        return NOTIFY_DONE;
}

/*
 *        Add a socket to the bound sockets list (rose_list), holding
 *        rose_list_lock with bottom halves disabled while linking it in.
 */
static void rose_insert_socket(struct sock *sk)
{

        spin_lock_bh(&rose_list_lock);
        sk_add_node(sk, &rose_list);
        spin_unlock_bh(&rose_list_lock);
}

/*
 *        Find a listening socket that wants to accept the Call Request we
 *        just received.
 *
 *        Two passes over rose_list are made under rose_list_lock.  The
 *        first looks for a listener whose source address and callsign both
 *        match and which has no source digipeaters; the second accepts a
 *        listener with a matching address bound to the null callsign.
 *        Returns the socket found, or NULL.  No reference is taken here.
 */
static struct sock *rose_find_listener(rose_address *addr, ax25_address *call)
{
        struct sock *cur, *match = NULL;
        struct rose_sock *rose;

        spin_lock_bh(&rose_list_lock);

        /* Pass 1: exact address and callsign. */
        sk_for_each(cur, &rose_list) {
                rose = rose_sk(cur);

                if (rosecmp(&rose->source_addr, addr))
                        continue;
                if (ax25cmp(&rose->source_call, call))
                        continue;
                if (rose->source_ndigis || cur->sk_state != TCP_LISTEN)
                        continue;

                match = cur;
                break;
        }

        /* Pass 2: same address, wildcard (null) callsign. */
        if (!match) {
                sk_for_each(cur, &rose_list) {
                        rose = rose_sk(cur);

                        if (rosecmp(&rose->source_addr, addr))
                                continue;
                        if (ax25cmp(&rose->source_call, &null_ax25_address))
                                continue;
                        if (cur->sk_state != TCP_LISTEN)
                                continue;

                        match = cur;
                        break;
                }
        }

        spin_unlock_bh(&rose_list_lock);
        return match;
}

/*
 *        Find a connected ROSE socket given my LCI and device.
 */
struct sock *rose_find_socket(unsigned int lci, struct rose_neigh *neigh)
{
        struct sock *s;

        spin_lock_bh(&rose_list_lock);
        sk_for_each(s, &rose_list) {
                struct rose_sock *rose = rose_sk(s);

                if (rose->lci == lci && rose->neighbour == neigh)
                        goto found;
        }
        s = NULL;
found:
        spin_unlock_bh(&rose_list_lock);
        return s;
}

/*
 *        True when @lci is used neither by a socket on @neigh nor by a
 *        route through @neigh.
 */
static bool rose_lci_unused(int lci, struct rose_neigh *neigh)
{
        return rose_find_socket(lci, neigh) == NULL &&
               rose_route_free_lci(lci, neigh) == NULL;
}

/*
 *        Find a unique LCI for a given neighbour.
 *
 *        The range 1..sysctl_rose_maximum_vcs is searched upwards when the
 *        neighbour is in DCE mode and downwards otherwise.  Returns the
 *        first unused LCI, or 0 when every LCI is taken.
 */
unsigned int rose_new_lci(struct rose_neigh *neigh)
{
        int lci;

        if (neigh->dce_mode) {
                lci = 1;
                while (lci <= sysctl_rose_maximum_vcs) {
                        if (rose_lci_unused(lci, neigh))
                                return lci;
                        lci++;
                }
        } else {
                lci = sysctl_rose_maximum_vcs;
                while (lci > 0) {
                        if (rose_lci_unused(lci, neigh))
                                return lci;
                        lci--;
                }
        }

        return 0;
}

/*
 *        Deferred destroy.
 */
void rose_destroy_socket(struct sock *);

/*
 *        Handler for deferred kills: timer callback armed by
 *        rose_destroy_socket() on sk->sk_timer.  Recovers the owning
 *        socket from the timer and retries the destroy.
 */
static void rose_destroy_timer(struct timer_list *t)
{
        struct sock *sk = from_timer(sk, t, sk_timer);

        rose_destroy_socket(sk);
}

/*
 *        Tear down a ROSE socket.
 *
 *        This is called from user mode and the timers. Thus it protects
 *        itself against interrupt users but doesn't worry about being
 *        called during work.  Once it is removed from the queue no
 *        interrupt or bottom half will touch it and we are (fairly 8-) )
 *        safe.
 *
 *        The socket is unlinked from rose_list, its timers are stopped and
 *        its queues flushed.  Pending (unaccepted) connections found on the
 *        receive queue are marked dead and handed to their heartbeat timer.
 *        If the socket still has outstanding allocations, destruction is
 *        retried from sk_timer after 10 seconds; otherwise the socket
 *        reference is dropped.
 */
void rose_destroy_socket(struct sock *sk)
{
        struct sk_buff *skb;

        rose_remove_socket(sk);
        rose_stop_heartbeat(sk);
        rose_stop_idletimer(sk);
        rose_stop_timer(sk);

        /* Flush the queues */
        rose_clear_queues(sk);

        for (;;) {
                skb = skb_dequeue(&sk->sk_receive_queue);
                if (skb == NULL)
                        break;

                if (skb->sk != sk) {
                        /* A pending connection: queue the unaccepted
                         * socket for death.
                         */
                        sock_set_flag(skb->sk, SOCK_DEAD);
                        rose_start_heartbeat(skb->sk);
                        rose_sk(skb->sk)->state = ROSE_STATE_0;
                }

                kfree_skb(skb);
        }

        if (!sk_has_allocations(sk)) {
                sock_put(sk);
                return;
        }

        /* Defer: outstanding buffers */
        timer_setup(&sk->sk_timer, rose_destroy_timer, 0);
        sk->sk_timer.expires = jiffies + 10 * HZ;
        add_timer(&sk->sk_timer);
}

/*
 *        Handling for system calls applied via the various interfaces to a
 *        ROSE socket object.
 */

/*
 * Set a SOL_ROSE socket option from a user-supplied integer.
 *
 * ROSE_DEFER and ROSE_QBITINCL store a 0/1 flag.  ROSE_T1, ROSE_T2, ROSE_T3
 * and ROSE_HOLDBACK take a value in seconds (at least 1) and store it in
 * jiffies; ROSE_IDLE takes minutes (0 allowed).
 *
 * The value is read as unsigned and range-checked before being scaled, so
 * the multiplication by HZ (or 60 * HZ) cannot overflow.  Values that would
 * have been negative as a signed int land above the upper bound and are
 * still rejected with -EINVAL.
 *
 * Returns 0 on success, -ENOPROTOOPT for a wrong level or unknown option,
 * -EINVAL for a short optlen or out-of-range value, -EFAULT if the value
 * cannot be copied in.
 */
static int rose_setsockopt(struct socket *sock, int level, int optname,
                sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct rose_sock *rose = rose_sk(sk);
        unsigned int opt;

        if (level != SOL_ROSE)
                return -ENOPROTOOPT;

        if (optlen < sizeof(unsigned int))
                return -EINVAL;

        if (copy_from_sockptr(&opt, optval, sizeof(unsigned int)))
                return -EFAULT;

        switch (optname) {
        case ROSE_DEFER:
                rose->defer = opt ? 1 : 0;
                return 0;

        case ROSE_T1:
                if (opt < 1 || opt > UINT_MAX / HZ)
                        return -EINVAL;
                rose->t1 = opt * HZ;
                return 0;

        case ROSE_T2:
                if (opt < 1 || opt > UINT_MAX / HZ)
                        return -EINVAL;
                rose->t2 = opt * HZ;
                return 0;

        case ROSE_T3:
                if (opt < 1 || opt > UINT_MAX / HZ)
                        return -EINVAL;
                rose->t3 = opt * HZ;
                return 0;

        case ROSE_HOLDBACK:
                if (opt < 1 || opt > UINT_MAX / HZ)
                        return -EINVAL;
                rose->hb = opt * HZ;
                return 0;

        case ROSE_IDLE:
                if (opt > UINT_MAX / (60 * HZ))
                        return -EINVAL;
                rose->idle = opt * 60 * HZ;
                return 0;

        case ROSE_QBITINCL:
                rose->qbitincl = opt ? 1 : 0;
                return 0;

        default:
                return -ENOPROTOOPT;
        }
}

/*
 * Read a SOL_ROSE socket option back to user space.
 *
 * Timer options are reported in the units rose_setsockopt() accepts:
 * seconds for ROSE_T1/T2/T3/HOLDBACK, minutes for ROSE_IDLE.  At most
 * sizeof(int) bytes are copied out and the length actually used is written
 * back through @optlen.
 *
 * Returns 0 on success, -ENOPROTOOPT for a wrong level or unknown option,
 * -EINVAL for a negative length, -EFAULT on a user-memory access failure.
 */
static int rose_getsockopt(struct socket *sock, int level, int optname,
        char __user *optval, int __user *optlen)
{
        struct rose_sock *rose = rose_sk(sock->sk);
        int value = 0;
        int len;

        if (level != SOL_ROSE)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;

        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case ROSE_DEFER:
                value = rose->defer;
                break;
        case ROSE_T1:
                value = rose->t1 / HZ;
                break;
        case ROSE_T2:
                value = rose->t2 / HZ;
                break;
        case ROSE_T3:
                value = rose->t3 / HZ;
                break;
        case ROSE_HOLDBACK:
                value = rose->hb / HZ;
                break;
        case ROSE_IDLE:
                value = rose->idle / (60 * HZ);
                break;
        case ROSE_QBITINCL:
                value = rose->qbitincl;
                break;
        default:
                return -ENOPROTOOPT;
        }

        /* Never copy out more than the int we hold. */
        len = min_t(unsigned int, len, sizeof(int));

        if (put_user(len, optlen))
                return -EFAULT;

        if (copy_to_user(optval, &value, len))
                return -EFAULT;

        return 0;
}

/*
 * Put a ROSE socket into the listening state.
 *
 * Runs under the socket lock.  Fails with -EINVAL unless the socket is
 * SS_UNCONNECTED, and with -EOPNOTSUPP if it is already in TCP_LISTEN.
 * Otherwise the destination address, callsign and digipeater list are
 * cleared, @backlog is stored and the state becomes TCP_LISTEN.
 */
static int rose_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;
        struct rose_sock *rose;
        int err;

        lock_sock(sk);

        if (sock->state != SS_UNCONNECTED) {
                err = -EINVAL;
                goto out;
        }

        if (sk->sk_state == TCP_LISTEN) {
                err = -EOPNOTSUPP;
                goto out;
        }

        rose = rose_sk(sk);
        rose->dest_ndigis = 0;
        memset(&rose->dest_addr, 0, ROSE_ADDR_LEN);
        memset(&rose->dest_call, 0, AX25_ADDR_LEN);
        memset(rose->dest_digis, 0, AX25_ADDR_LEN * ROSE_MAX_DIGIS);
        sk->sk_max_ack_backlog = backlog;
        sk->sk_state           = TCP_LISTEN;
        err = 0;

out:
        release_sock(sk);
        return err;
}

/*
 * Protocol descriptor handed to sk_alloc() by rose_create() and
 * rose_make_new(); obj_size makes each allocated socket a struct rose_sock.
 */
static struct proto rose_proto = {
        .name          = "ROSE",
        .owner          = THIS_MODULE,
        .obj_size = sizeof(struct rose_sock),
};

/*
 * Create a new ROSE socket for @sock.
 *
 * Only init_net, SOCK_SEQPACKET and protocol 0 are accepted.  The new
 * socket gets empty queues, unarmed timers, timer values taken from the
 * sysctl defaults (converted with msecs_to_jiffies()) and starts in
 * ROSE_STATE_0.
 *
 * Returns 0 on success, -EAFNOSUPPORT for a foreign namespace,
 * -ESOCKTNOSUPPORT for an unsupported type/protocol, -ENOMEM if the socket
 * cannot be allocated.
 */
static int rose_create(struct net *net, struct socket *sock, int protocol,
                       int kern)
{
        struct rose_sock *rose;
        struct sock *sk;

        if (!net_eq(net, &init_net))
                return -EAFNOSUPPORT;

        if (protocol != 0 || sock->type != SOCK_SEQPACKET)
                return -ESOCKTNOSUPPORT;

        sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto, kern);
        if (!sk)
                return -ENOMEM;

        sock_init_data(sock, sk);
        sock->ops       = &rose_proto_ops;
        sk->sk_protocol = protocol;

        rose = rose_sk(sk);

        skb_queue_head_init(&rose->ack_queue);
#ifdef M_BIT
        skb_queue_head_init(&rose->frag_queue);
        rose->fraglen    = 0;
#endif

        /* Timers start without a handler; none is armed here. */
        timer_setup(&rose->timer, NULL, 0);
        timer_setup(&rose->idletimer, NULL, 0);

        rose->state = ROSE_STATE_0;
        rose->t1    = msecs_to_jiffies(sysctl_rose_call_request_timeout);
        rose->t2    = msecs_to_jiffies(sysctl_rose_reset_request_timeout);
        rose->t3    = msecs_to_jiffies(sysctl_rose_clear_request_timeout);
        rose->hb    = msecs_to_jiffies(sysctl_rose_ack_hold_back_timeout);
        rose->idle  = msecs_to_jiffies(sysctl_rose_no_activity_timeout);

        return 0;
}

static struct sock *rose_make_new(struct sock *osk)
{
        struct sock *sk;
        struct rose_sock *rose, *orose;

        if (osk->sk_type != SOCK_SEQPACKET)
                return NULL;

        sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto, 0);
        if (sk == NULL)
                return NULL;

        rose = rose_sk(sk);

        sock_init_data(NULL, sk);

        skb_queue_head_init(&rose->ack_queue);
#ifdef M_BIT
        skb_queue_head_init(&rose->frag_queue);
        rose->fraglen  = 0;
#endif

        sk->sk_type     = osk->sk_type;
        sk->sk_priority = READ_ONCE(osk->sk_priority);
        sk->sk_protocol = osk->sk_protocol;
        sk->sk_rcvbuf   = osk->sk_rcvbuf;
        sk->sk_sndbuf   = osk->sk_sndbuf;
        sk->sk_state    = TCP_ESTABLISHED;
        sock_copy_flags(sk, osk);

        timer_setup(&rose->timer, NULL, 0);
        timer_setup(&rose->idletimer, NULL, 0);

        orose                = rose_sk(osk);
        rose->t1        = orose->t1;
        rose->t2        = orose->t2;
        rose->t3        = orose->t3;
        rose->hb        = orose->hb;
        rose->idle        = orose->idle;
        rose->defer        = orose->defer;
        rose->device        = orose->device;
        if (rose->device)
                netdev_hold(rose->device, &rose->dev_tracker, GFP_ATOMIC);
        rose->qbitincl        = orose->qbitincl;

        return sk;
}

/*
 * Release a ROSE socket (proto_ops .release).
 *
 * What happens depends on the ROSE state of the socket:
 *  - state 0: disconnect and destroy the socket straight away;
 *  - state 2: as state 0, after decrementing the neighbour's use count;
 *  - states 1, 3, 4, 5: flush the queues, send a Clear Request, start the
 *    t3 timer, move to state 2 and flag the socket SOCK_DEAD/SOCK_DESTROY.
 *
 * In every case the device reference held by the socket is dropped and
 * the socket is detached from @sock.  Always returns 0.
 */
static int rose_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct rose_sock *rose;

        if (sk == NULL) return 0;

        /* Extra reference for the duration of the teardown; see sock_put() below. */
        sock_hold(sk);
        sock_orphan(sk);
        lock_sock(sk);
        rose = rose_sk(sk);

        switch (rose->state) {
        case ROSE_STATE_0:
                /* rose_disconnect() is called with the socket lock dropped. */
                release_sock(sk);
                rose_disconnect(sk, 0, -1, -1);
                lock_sock(sk);
                rose_destroy_socket(sk);
                break;

        case ROSE_STATE_2:
                rose->neighbour->use--;
                /* rose_disconnect() is called with the socket lock dropped. */
                release_sock(sk);
                rose_disconnect(sk, 0, -1, -1);
                lock_sock(sk);
                rose_destroy_socket(sk);
                break;

        case ROSE_STATE_1:
        case ROSE_STATE_3:
        case ROSE_STATE_4:
        case ROSE_STATE_5:
                /* Send a Clear Request and leave the socket in state 2. */
                rose_clear_queues(sk);
                rose_stop_idletimer(sk);
                rose_write_internal(sk, ROSE_CLEAR_REQUEST);
                rose_start_t3timer(sk);
                rose->state  = ROSE_STATE_2;
                sk->sk_state    = TCP_CLOSE;
                sk->sk_shutdown |= SEND_SHUTDOWN;
                sk->sk_state_change(sk);
                sock_set_flag(sk, SOCK_DEAD);
                sock_set_flag(sk, SOCK_DESTROY);
                break;

        default:
                break;
        }

        /* Drop the device reference; rose->device is cleared under rose_list_lock. */
        spin_lock_bh(&rose_list_lock);
        netdev_put(rose->device, &rose->dev_tracker);
        rose->device = NULL;
        spin_unlock_bh(&rose_list_lock);
        sock->sk = NULL;
        release_sock(sk);
        sock_put(sk);

        return 0;
}

static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sock *sk = sock->sk;
        struct rose_sock *rose = rose_sk(sk);
        struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr;
        struct net_device *dev;
        ax25_address *source;
        ax25_uid_assoc *user;
        int n;

        if (!sock_flag(sk, SOCK_ZAPPED))
                return -EINVAL;

        if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose))
                return -EINVAL;

        if (addr->srose_family != AF_ROSE)
                return -EINVAL;

        if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1)
                return -EINVAL;

        if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS)
                return -EINVAL;

        if ((dev = rose_dev_get(&addr->srose_addr)) == NULL)
                return -EADDRNOTAVAIL;

        source = &addr->srose_call;

        user = ax25_findbyuid(current_euid());
        if (user) {
                rose->source_call = user->call;
                ax25_uid_put(user);
        } else {
                if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
                        dev_put(dev);
                        return -EACCES;
                }
                rose->source_call   = *source;
        }

        rose->source_addr   = addr->srose_addr;
        rose->device        = dev;
        netdev_tracker_alloc(rose->device, &rose->dev_tracker, GFP_KERNEL);
        rose->source_ndigis = addr->srose_ndigis;

        if (addr_len == sizeof(struct full_sockaddr_rose)) {
                struct full_sockaddr_rose *full_addr = (struct full_sockaddr_rose *)uaddr;
                for (n = 0 ; n < addr->srose_ndigis ; n++)
                        rose->source_digis[n] = full_addr->srose_digis[n];
        } else {
                if (rose->source_ndigis == 1) {
                        rose->source_digis[0] = addr->srose_digi;
                }
        }

        rose_insert_socket(sk);

        sock_reset_flag(sk, SOCK_ZAPPED);

        return 0;
}

/*
 * Connect a ROSE socket to a remote address (proto_ops .connect).
 *
 * @uaddr is a struct sockaddr_rose or struct full_sockaddr_rose, selected
 * by @addr_len.  After validating the address the function, under the
 * socket lock:
 *  - finishes or fails a connect that was interrupted earlier
 *    (sock->state == SS_CONNECTING);
 *  - looks up a neighbour for the destination and allocates an lci on it;
 *  - autobinds an unbound socket to the first ROSE device, using the
 *    callsign of the caller's AX.25 uid association;
 *  - records the destination, sends a Call Request, starts the heartbeat
 *    and t1 timers, then either returns -EINPROGRESS (O_NONBLOCK) or
 *    sleeps until the socket leaves TCP_SYN_SENT.
 *
 * Returns 0 once connected, -EINVAL for a bad address (or when autobind
 * finds no uid association), -ECONNREFUSED, -EISCONN, -ENETUNREACH,
 * -EINPROGRESS, -ERESTARTSYS, or the pending socket error.
 */
static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags)
{
        struct sock *sk = sock->sk;
        struct rose_sock *rose = rose_sk(sk);
        struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr;
        unsigned char cause, diagnostic;
        ax25_uid_assoc *user;
        int n, err = 0;

        if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose))
                return -EINVAL;

        if (addr->srose_family != AF_ROSE)
                return -EINVAL;

        if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1)
                return -EINVAL;

        if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS)
                return -EINVAL;

        /*
         * Source + Destination digis should not exceed ROSE_MAX_DIGIS.
         * NOTE(review): rose->source_ndigis is read here before lock_sock();
         * confirm that a concurrent bind cannot change it.
         */
        if ((rose->source_ndigis + addr->srose_ndigis) > ROSE_MAX_DIGIS)
                return -EINVAL;

        lock_sock(sk);

        if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
                /* Connect completed during an ERESTARTSYS event */
                sock->state = SS_CONNECTED;
                goto out_release;
        }

        if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) {
                /* The earlier, interrupted connect attempt failed. */
                sock->state = SS_UNCONNECTED;
                err = -ECONNREFUSED;
                goto out_release;
        }

        if (sk->sk_state == TCP_ESTABLISHED) {
                /* No reconnect on a seqpacket socket */
                err = -EISCONN;
                goto out_release;
        }

        sk->sk_state   = TCP_CLOSE;
        sock->state = SS_UNCONNECTED;

        rose->neighbour = rose_get_neigh(&addr->srose_addr, &cause,
                                         &diagnostic, 0);
        if (!rose->neighbour) {
                err = -ENETUNREACH;
                goto out_release;
        }

        /*
         * NOTE(review): if no lci is available, rose->neighbour stays set
         * although the connect failed - confirm this is intended.
         */
        rose->lci = rose_new_lci(rose->neighbour);
        if (!rose->lci) {
                err = -ENETUNREACH;
                goto out_release;
        }

        if (sock_flag(sk, SOCK_ZAPPED)) {        /* Must bind first - autobinding in this may or may not work */
                struct net_device *dev;

                /*
                 * NOTE(review): SOCK_ZAPPED is cleared before the autobind
                 * can fail, so a failed autobind leaves the socket flagged
                 * as bound without a device - confirm this is intended.
                 */
                sock_reset_flag(sk, SOCK_ZAPPED);

                dev = rose_dev_first();
                if (!dev) {
                        err = -ENETUNREACH;
                        goto out_release;
                }

                user = ax25_findbyuid(current_euid());
                if (!user) {
                        err = -EINVAL;
                        dev_put(dev);
                        goto out_release;
                }

                /* Source address is the device address, callsign comes from the uid map. */
                memcpy(&rose->source_addr, dev->dev_addr, ROSE_ADDR_LEN);
                rose->source_call = user->call;
                rose->device      = dev;
                netdev_tracker_alloc(rose->device, &rose->dev_tracker,
                                     GFP_KERNEL);
                ax25_uid_put(user);

                rose_insert_socket(sk);                /* Finish the bind */
        }
        rose->dest_addr   = addr->srose_addr;
        rose->dest_call   = addr->srose_call;
        /* Low 16 bits of the rose_sock pointer plus the lci. */
        rose->rand        = ((long)rose & 0xFFFF) + rose->lci;
        rose->dest_ndigis = addr->srose_ndigis;

        if (addr_len == sizeof(struct full_sockaddr_rose)) {
                struct full_sockaddr_rose *full_addr = (struct full_sockaddr_rose *)uaddr;
                for (n = 0 ; n < addr->srose_ndigis ; n++)
                        rose->dest_digis[n] = full_addr->srose_digis[n];
        } else {
                if (rose->dest_ndigis == 1) {
                        rose->dest_digis[0] = addr->srose_digi;
                }
        }

        /* Move to connecting socket, start sending Connect Requests */
        sock->state   = SS_CONNECTING;
        sk->sk_state     = TCP_SYN_SENT;

        rose->state = ROSE_STATE_1;

        rose->neighbour->use++;

        rose_write_internal(sk, ROSE_CALL_REQUEST);
        rose_start_heartbeat(sk);
        rose_start_t1timer(sk);

        /* Now the loop */
        if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) {
                err = -EINPROGRESS;
                goto out_release;
        }

        /*
         * A Connect Ack with Choke or timeout or failed routing will go to
         * closed.
         */
        if (sk->sk_state == TCP_SYN_SENT) {
                DEFINE_WAIT(wait);

                /*
                 * Sleep, with the socket lock released, until the state
                 * leaves TCP_SYN_SENT or a signal is pending.
                 */
                for (;;) {
                        prepare_to_wait(sk_sleep(sk), &wait,
                                        TASK_INTERRUPTIBLE);
                        if (sk->sk_state != TCP_SYN_SENT)
                                break;
                        if (!signal_pending(current)) {
                                release_sock(sk);
                                schedule();
                                lock_sock(sk);
                                continue;
                        }
                        err = -ERESTARTSYS;
                        break;
                }
                finish_wait(sk_sleep(sk), &wait);

                if (err)
                        goto out_release;
        }

        if (sk->sk_state != TCP_ESTABLISHED) {
                sock->state = SS_UNCONNECTED;
                err = sock_error(sk);        /* Always set at this point */
                goto out_release;
        }

        sock->state = SS_CONNECTED;

out_release:
        release_sock(sk);

        return err;
}

/*
 * Accept a pending connection on a listening ROSE socket
 * (proto_ops .accept).
 *
 * Takes the next skb from the listener's receive queue, grafts the sock
 * attached to that skb onto @newsock and frees the skb.  Blocks until an
 * skb is available unless O_NONBLOCK is set in @arg->flags.
 *
 * Returns 0 on success, -EINVAL if the socket has no sock or is not
 * listening, -EOPNOTSUPP if it is not SOCK_SEQPACKET, -EWOULDBLOCK when
 * non-blocking and nothing is queued, or -ERESTARTSYS on a signal.
 */
static int rose_accept(struct socket *sock, struct socket *newsock,
                       struct proto_accept_arg *arg)
{
        struct sk_buff *skb;
        struct sock *newsk;
        DEFINE_WAIT(wait);
        struct sock *sk;
        int err = 0;

        if ((sk = sock->sk) == NULL)
                return -EINVAL;

        lock_sock(sk);
        if (sk->sk_type != SOCK_SEQPACKET) {
                err = -EOPNOTSUPP;
                goto out_release;
        }

        if (sk->sk_state != TCP_LISTEN) {
                err = -EINVAL;
                goto out_release;
        }

        /*
         * The receive queue of a listener holds the skbs queued by
         * rose_rx_call_request(); each one carries, in skb->sk, a socket
         * that is ready to use.
         */
        for (;;) {
                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

                skb = skb_dequeue(&sk->sk_receive_queue);
                if (skb)
                        break;

                if (arg->flags & O_NONBLOCK) {
                        err = -EWOULDBLOCK;
                        break;
                }
                if (!signal_pending(current)) {
                        /* Sleep with the socket lock released. */
                        release_sock(sk);
                        schedule();
                        lock_sock(sk);
                        continue;
                }
                err = -ERESTARTSYS;
                break;
        }
        finish_wait(sk_sleep(sk), &wait);
        if (err)
                goto out_release;

        newsk = skb->sk;
        sock_graft(newsk, newsock);

        /* Now attach up the new socket */
        skb->sk = NULL;
        kfree_skb(skb);
        sk_acceptq_removed(sk);

out_release:
        release_sock(sk);

        return err;
}

/*
 * Report the local or remote address of a ROSE socket
 * (proto_ops .getname).
 *
 * @uaddr is zeroed and then filled in as a struct full_sockaddr_rose:
 * with the destination when @peer is non-zero, otherwise with the source.
 *
 * Returns sizeof(struct full_sockaddr_rose), or -ENOTCONN when the peer
 * address is requested on a socket that is not TCP_ESTABLISHED (@uaddr
 * has already been zeroed in that case).
 */
static int rose_getname(struct socket *sock, struct sockaddr *uaddr,
        int peer)
{
        struct full_sockaddr_rose *srose = (struct full_sockaddr_rose *)uaddr;
        struct sock *sk = sock->sk;
        struct rose_sock *rose = rose_sk(sk);
        int i;

        memset(srose, 0, sizeof(*srose));

        if (peer && sk->sk_state != TCP_ESTABLISHED)
                return -ENOTCONN;

        srose->srose_family = AF_ROSE;

        if (peer) {
                srose->srose_addr = rose->dest_addr;
                srose->srose_call = rose->dest_call;
                srose->srose_ndigis = rose->dest_ndigis;
                for (i = 0; i < rose->dest_ndigis; i++)
                        srose->srose_digis[i] = rose->dest_digis[i];
        } else {
                srose->srose_addr = rose->source_addr;
                srose->srose_call = rose->source_call;
                srose->srose_ndigis = rose->source_ndigis;
                for (i = 0; i < rose->source_ndigis; i++)
                        srose->srose_digis[i] = rose->source_digis[i];
        }

        return sizeof(*srose);
}

/*
 * Handle an incoming ROSE Call Request frame.
 *
 * Parses the facilities in @skb, looks for a listening socket matching the
 * facilities' source address/callsign and, if one exists with room in its
 * accept queue, creates a new socket for the call.  The new socket is
 * attached to @skb, which is queued on the listener's receive queue for
 * rose_accept() to pick up.
 *
 * Returns 1 when the call was queued on a listener, 0 when a Clear Request
 * was sent instead (invalid facilities, no listener, full accept queue or
 * allocation failure).
 */
int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct rose_neigh *neigh, unsigned int lci)
{
        struct sock *sk;
        struct sock *make;
        struct rose_sock *make_rose;
        struct rose_facilities_struct facilities;
        int n;

        skb->sk = NULL;                /* Initially we don't know who it's for */

        /*
         *        skb->data points to the rose frame start
         */
        memset(&facilities, 0x00, sizeof(struct rose_facilities_struct));

        if (!rose_parse_facilities(skb->data + ROSE_CALL_REQ_FACILITIES_OFF,
                                   skb->len - ROSE_CALL_REQ_FACILITIES_OFF,
                                   &facilities)) {
                rose_transmit_clear_request(neigh, lci, ROSE_INVALID_FACILITY, 76);
                return 0;
        }

        sk = rose_find_listener(&facilities.source_addr, &facilities.source_call);

        /*
         * We can't accept the Call Request.
         */
        if (sk == NULL || sk_acceptq_is_full(sk) ||
            (make = rose_make_new(sk)) == NULL) {
                rose_transmit_clear_request(neigh, lci, ROSE_NETWORK_CONGESTION, 120);
                return 0;
        }

        skb->sk     = make;
        make->sk_state = TCP_ESTABLISHED;
        make_rose = rose_sk(make);

        /* Copy the addressing from the parsed facilities into the new socket. */
        make_rose->lci           = lci;
        make_rose->dest_addr     = facilities.dest_addr;
        make_rose->dest_call     = facilities.dest_call;
        make_rose->dest_ndigis   = facilities.dest_ndigis;
        for (n = 0 ; n < facilities.dest_ndigis ; n++)
                make_rose->dest_digis[n] = facilities.dest_digis[n];
        make_rose->source_addr   = facilities.source_addr;
        make_rose->source_call   = facilities.source_call;
        make_rose->source_ndigis = facilities.source_ndigis;
        for (n = 0 ; n < facilities.source_ndigis ; n++)
                make_rose->source_digis[n] = facilities.source_digis[n];
        make_rose->neighbour     = neigh;
        /*
         * NOTE(review): rose_make_new() already copies the listener's device
         * and takes a hold on it when non-NULL; device and dev_tracker are
         * overwritten here without releasing that hold - confirm this does
         * not leak a device reference.
         */
        make_rose->device        = dev;
        /* Caller got a reference for us. */
        netdev_tracker_alloc(make_rose->device, &make_rose->dev_tracker,
                             GFP_ATOMIC);
        make_rose->facilities    = facilities;

        make_rose->neighbour->use++;

        /*
         * If the listener has defer set, leave the call in state 5 without
         * answering it; otherwise send Call Accepted right away.
         */
        if (rose_sk(sk)->defer) {
                make_rose->state = ROSE_STATE_5;
        } else {
                rose_write_internal(make, ROSE_CALL_ACCEPTED);
                make_rose->state = ROSE_STATE_3;
                rose_start_idletimer(make);
        }

        make_rose->condition = 0x00;
        make_rose->vs        = 0;
        make_rose->va        = 0;
        make_rose->vr        = 0;
        make_rose->vl        = 0;
        sk_acceptq_added(sk);

        rose_insert_socket(make);

        /* Queue the skb (carrying the new socket) on the listener. */
        skb_queue_head(&sk->sk_receive_queue, skb);

        rose_start_heartbeat(make);

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk);

        return 1;
}

/*
 * Queue one message for transmission on a ROSE socket
 * (proto_ops .sendmsg).
 *
 * If @msg carries a destination address it must match the destination the
 * socket is connected to (address, callsign and digipeaters), otherwise
 * -EISCONN is returned.  The payload is copied into a new skb, prefixed
 * with a ROSE_MIN_LEN byte data header built from the socket's lci, and
 * appended to the socket's write queue; rose_kick() is then called.
 *
 * When the socket's qbitincl option is set, the first payload byte is not
 * sent as data: it is stripped and its value decides whether ROSE_Q_BIT is
 * set in the header.  A zero-length message has no such byte, so it is
 * sent with the Q bit clear rather than deriving the bit from an
 * uninitialised byte of the skb.
 *
 * Returns @len on success, or -EINVAL, -EADDRNOTAVAIL, -EPIPE (after
 * raising SIGPIPE), -ENETUNREACH, -EISCONN, -ENOTCONN, -EMSGSIZE, or the
 * error from skb allocation or from copying the user data.
 */
static int rose_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct rose_sock *rose = rose_sk(sk);
        DECLARE_SOCKADDR(struct sockaddr_rose *, usrose, msg->msg_name);
        int err;
        struct full_sockaddr_rose srose;
        struct sk_buff *skb;
        unsigned char *asmptr;
        int n, size, qbit = 0;

        if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT))
                return -EINVAL;

        if (sock_flag(sk, SOCK_ZAPPED))
                return -EADDRNOTAVAIL;

        if (sk->sk_shutdown & SEND_SHUTDOWN) {
                send_sig(SIGPIPE, current, 0);
                return -EPIPE;
        }

        if (rose->neighbour == NULL || rose->device == NULL)
                return -ENETUNREACH;

        if (usrose != NULL) {
                if (msg->msg_namelen != sizeof(struct sockaddr_rose) && msg->msg_namelen != sizeof(struct full_sockaddr_rose))
                        return -EINVAL;
                /*
                 * Work on a zero-padded full-size copy so the comparison
                 * below is the same for both address sizes.
                 */
                memset(&srose, 0, sizeof(struct full_sockaddr_rose));
                memcpy(&srose, usrose, msg->msg_namelen);
                if (rosecmp(&rose->dest_addr, &srose.srose_addr) != 0 ||
                    ax25cmp(&rose->dest_call, &srose.srose_call) != 0)
                        return -EISCONN;
                if (srose.srose_ndigis != rose->dest_ndigis)
                        return -EISCONN;
                for (n = 0 ; n < srose.srose_ndigis ; n++)
                        if (ax25cmp(&rose->dest_digis[n],
                                    &srose.srose_digis[n]))
                                return -EISCONN;
                if (srose.srose_family != AF_ROSE)
                        return -EINVAL;
        } else {
                if (sk->sk_state != TCP_ESTABLISHED)
                        return -ENOTCONN;
        }

        /* Build a packet */
        /* Sanity check the packet size */
        if (len > 65535)
                return -EMSGSIZE;

        size = len + AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN;

        if ((skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, &err)) == NULL)
                return err;

        skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN);

        /*
         *        Put the data on the end
         */

        skb_reset_transport_header(skb);
        skb_put(skb, len);

        err = memcpy_from_msg(skb_transport_header(skb), msg, len);
        if (err) {
                kfree_skb(skb);
                return err;
        }

        /*
         *        If the Q BIT Include socket option is in force, the first
         *        byte of the user data is the logical value of the Q Bit.
         *        An empty message carries no such byte: leave qbit at 0
         *        instead of reading past the copied data.
         */
        if (rose->qbitincl && len > 0) {
                qbit = skb->data[0];
                skb_pull(skb, 1);
        }

        /*
         *        Push down the ROSE header
         */
        asmptr = skb_push(skb, ROSE_MIN_LEN);

        /* Build a ROSE Network header */
        asmptr[0] = ((rose->lci >> 8) & 0x0F) | ROSE_GFI;
        asmptr[1] = (rose->lci >> 0) & 0xFF;
        asmptr[2] = ROSE_DATA;

        if (qbit)
                asmptr[0] |= ROSE_Q_BIT;

        /* Re-checked here: also covers the path where msg_name was supplied. */
        if (sk->sk_state != TCP_ESTABLISHED) {
                kfree_skb(skb);
                return -ENOTCONN;
        }

#ifdef M_BIT
#define ROSE_PACLEN (256-ROSE_MIN_LEN)
        if (skb->len - ROSE_MIN_LEN > ROSE_PACLEN) {
                unsigned char header[ROSE_MIN_LEN];
                struct sk_buff *skbn;
                int frontlen;
                int lg;

                /* Save a copy of the Header */
                skb_copy_from_linear_data(skb, header, ROSE_MIN_LEN);
                skb_pull(skb, ROSE_MIN_LEN);

                frontlen = skb_headroom(skb);

                while (skb->len > 0) {
                        if ((skbn = sock_alloc_send_skb(sk, frontlen + ROSE_PACLEN, 0, &err)) == NULL) {
                                kfree_skb(skb);
                                return err;
                        }

                        skbn->sk   = sk;
                        skbn->free = 1;
                        skbn->arp  = 1;

                        skb_reserve(skbn, frontlen);

                        lg = (ROSE_PACLEN > skb->len) ? skb->len : ROSE_PACLEN;

                        /* Copy the user data */
                        skb_copy_from_linear_data(skb, skb_put(skbn, lg), lg);
                        skb_pull(skb, lg);

                        /* Duplicate the Header */
                        skb_push(skbn, ROSE_MIN_LEN);
                        skb_copy_to_linear_data(skbn, header, ROSE_MIN_LEN);

                        if (skb->len > 0)
                                skbn->data[2] |= M_BIT;

                        skb_queue_tail(&sk->sk_write_queue, skbn); /* Throw it on the queue */
                }

                skb->free = 1;
                kfree_skb(skb);
        } else {
                skb_queue_tail(&sk->sk_write_queue, skb);                /* Throw it on the queue */
        }
#else
        skb_queue_tail(&sk->sk_write_queue, skb);        /* Shove it onto the queue */
#endif

        rose_kick(sk);

        return len;
}


/*
 * Receive one message from a ROSE socket (proto_ops .recvmsg).
 *
 * Takes the next skb with skb_recv_datagram(), strips the ROSE_MIN_LEN
 * byte header and copies up to @size bytes to @msg, setting MSG_TRUNC if
 * the message was longer.  When the socket's qbitincl option is set, one
 * byte holding the Q bit of the frame (0 or 1) is prepended to the data.
 * If the caller asked for the sender address, it is returned as a struct
 * full_sockaddr_rose describing the socket's destination.
 *
 * Returns the number of bytes copied, -ENOTCONN if the socket is not
 * TCP_ESTABLISHED, the error from skb_recv_datagram(), or the error from
 * copying to the user buffer.  The last case used to be ignored, so a
 * faulting buffer was reported as a successful read.
 */
static int rose_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                        int flags)
{
        struct sock *sk = sock->sk;
        struct rose_sock *rose = rose_sk(sk);
        size_t copied;
        unsigned char *asmptr;
        struct sk_buff *skb;
        int n, er, qbit;

        /*
         * This works for seqpacket too. The receiver has ordered the queue for
         * us! We do one quick check first though
         */
        if (sk->sk_state != TCP_ESTABLISHED)
                return -ENOTCONN;

        /* Now we can treat all alike */
        skb = skb_recv_datagram(sk, flags, &er);
        if (!skb)
                return er;

        /* Remember the Q bit before the header is removed. */
        qbit = (skb->data[0] & ROSE_Q_BIT) == ROSE_Q_BIT;

        skb_pull(skb, ROSE_MIN_LEN);

        if (rose->qbitincl) {
                asmptr  = skb_push(skb, 1);
                *asmptr = qbit;
        }

        skb_reset_transport_header(skb);
        copied     = skb->len;

        if (copied > size) {
                copied = size;
                msg->msg_flags |= MSG_TRUNC;
        }

        /* Do not report success if the data never reached the caller. */
        er = skb_copy_datagram_msg(skb, 0, msg, copied);
        if (er < 0) {
                skb_free_datagram(sk, skb);
                return er;
        }

        if (msg->msg_name) {
                struct sockaddr_rose *srose;
                DECLARE_SOCKADDR(struct full_sockaddr_rose *, full_srose,
                                 msg->msg_name);

                memset(msg->msg_name, 0, sizeof(struct full_sockaddr_rose));
                srose = msg->msg_name;
                srose->srose_family = AF_ROSE;
                srose->srose_addr   = rose->dest_addr;
                srose->srose_call   = rose->dest_call;
                srose->srose_ndigis = rose->dest_ndigis;
                for (n = 0 ; n < rose->dest_ndigis ; n++)
                        full_srose->srose_digis[n] = rose->dest_digis[n];
                msg->msg_namelen = sizeof(struct full_sockaddr_rose);
        }

        skb_free_datagram(sk, skb);

        return copied;
}


static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        struct sock *sk = sock->sk;
        struct rose_sock *rose = rose_sk(sk);
        void __user *argp = (void __user *)arg;

        switch (cmd) {
        case TIOCOUTQ: {
                long amount;

                amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
                if (amount < 0)
                        amount = 0;
                return put_user(amount, (unsigned int __user *) argp);
        }

        case TIOCINQ: {
                struct sk_buff *skb;
                long amount = 0L;

                spin_lock_irq(&sk->sk_receive_queue.lock);
                if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL)
                        amount = skb->len;
                spin_unlock_irq(&sk->sk_receive_queue.lock);
                return put_user(amount, (unsigned int __user *) argp);
        }

        case SIOCGIFADDR:
        case SIOCSIFADDR:
        case SIOCGIFDSTADDR:
        case SIOCSIFDSTADDR:
        case SIOCGIFBRDADDR:
        case SIOCSIFBRDADDR:
        case SIOCGIFNETMASK:
        case SIOCSIFNETMASK:
        case SIOCGIFMETRIC:
        case SIOCSIFMETRIC:
                return -EINVAL;

        case SIOCADDRT:
        case SIOCDELRT:
        case SIOCRSCLRRT:
                if (!capable(CAP_NET_ADMIN))
                        return -EPERM;
                return rose_rt_ioctl(cmd, argp);

        case SIOCRSGCAUSE: {
                struct rose_cause_struct rose_cause;
                rose_cause.cause      = rose->cause;
                rose_cause.diagnostic = rose->diagnostic;
                return copy_to_user(argp, &rose_cause, sizeof(struct rose_cause_struct)) ? -EFAULT : 0;
        }

        case SIOCRSSCAUSE: {
                struct rose_cause_struct rose_cause;
                if (copy_from_user(&rose_cause, argp, sizeof(struct rose_cause_struct)))
                        return -EFAULT;
                rose->cause      = rose_cause.cause;
                rose->diagnostic = rose_cause.diagnostic;
                return 0;
        }

        case SIOCRSSL2CALL:
                if (!capable(CAP_NET_ADMIN)) return -EPERM;
                if (ax25cmp(&rose_callsign, &null_ax25_address) != 0)
                        ax25_listen_release(&rose_callsign, NULL);
                if (copy_from_user(&rose_callsign, argp, sizeof(ax25_address)))
                        return -EFAULT;
                if (ax25cmp(&rose_callsign, &null_ax25_address) != 0)
                        return ax25_listen_register(&rose_callsign, NULL);

                return 0;

        case SIOCRSGL2CALL:
                return copy_to_user(argp, &rose_callsign, sizeof(ax25_address)) ? -EFAULT : 0;

        case SIOCRSACCEPT:
                if (rose->state == ROSE_STATE_5) {
                        rose_write_internal(sk, ROSE_CALL_ACCEPTED);
                        rose_start_idletimer(sk);
                        rose->condition = 0x00;
                        rose->vs        = 0;
                        rose->va        = 0;
                        rose->vr        = 0;
                        rose->vl        = 0;
                        rose->state     = ROSE_STATE_3;
                }
                return 0;

        default:
                return -ENOIOCTLCMD;
        }

        return 0;
}

#ifdef CONFIG_PROC_FS
/*
 * seq_file .start: take rose_list_lock and position the iterator at
 * offset *pos in rose_list.  The lock is released in rose_info_stop().
 */
static void *rose_info_start(struct seq_file *seq, loff_t *pos)
        __acquires(rose_list_lock)
{
        spin_lock_bh(&rose_list_lock);
        return seq_hlist_start_head(&rose_list, *pos);
}

/* seq_file .next: advance from entry @v to the next entry of rose_list. */
static void *rose_info_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_hlist_next(v, &rose_list, pos);
}

/* seq_file .stop: release the rose_list_lock taken in rose_info_start(). */
static void rose_info_stop(struct seq_file *seq, void *v)
        __releases(rose_list_lock)
{
        spin_unlock_bh(&rose_list_lock);
}

/*
 * seq_file .show: print the column header for SEQ_START_TOKEN, otherwise
 * one line describing the ROSE socket behind list entry @v (addresses,
 * callsigns, device, lci, neighbour, state, sequence numbers, timers,
 * queue memory and inode).  A missing device is shown as "???" and a null
 * source callsign as "??????-?".  Always returns 0.
 */
static int rose_info_show(struct seq_file *seq, void *v)
{
        char buf[11], rsbuf[11];
        const char *devname, *callsign;
        const struct net_device *dev;
        struct rose_sock *rose;
        struct sock *s;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "dest_addr  dest_call src_addr   src_call  dev   lci neigh st vs vr va   t  t1  t2  t3  hb    idle Snd-Q Rcv-Q inode\n");
                return 0;
        }

        s = sk_entry(v);
        rose = rose_sk(s);
        dev = rose->device;
        devname = dev ? dev->name : "???";

        /* Destination columns first; buf is reused for the source call below. */
        seq_printf(seq, "%-10s %-9s ",
                   rose2asc(rsbuf, &rose->dest_addr),
                   ax2asc(buf, &rose->dest_call));

        callsign = ax25cmp(&rose->source_call, &null_ax25_address) == 0 ?
                        "??????-?" : ax2asc(buf, &rose->source_call);

        seq_printf(seq,
                   "%-10s %-9s %-5s %3.3X %05d  %d  %d  %d  %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %ld\n",
                   rose2asc(rsbuf, &rose->source_addr),
                   callsign,
                   devname,
                   rose->lci & 0x0FFF,
                   (rose->neighbour) ? rose->neighbour->number : 0,
                   rose->state,
                   rose->vs,
                   rose->vr,
                   rose->va,
                   ax25_display_timer(&rose->timer) / HZ,
                   rose->t1 / HZ,
                   rose->t2 / HZ,
                   rose->t3 / HZ,
                   rose->hb / HZ,
                   ax25_display_timer(&rose->idletimer) / (60 * HZ),
                   rose->idle / (60 * HZ),
                   sk_wmem_alloc_get(s),
                   sk_rmem_alloc_get(s),
                   s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L);

        return 0;
}

/* seq_file iterator that lists the sockets on rose_list, one per line. */
static const struct seq_operations rose_info_seqops = {
        .start = rose_info_start,
        .next = rose_info_next,
        .stop = rose_info_stop,
        .show = rose_info_show,
};
#endif        /* CONFIG_PROC_FS */

/*
 * Address family descriptor for PF_ROSE; rose_create() is its socket
 * creation hook.  Passed to sock_register() in rose_proto_init().
 */
static const struct net_proto_family rose_family_ops = {
        .family                =        PF_ROSE,
        .create                =        rose_create,
        .owner                =        THIS_MODULE,
};

/*
 * Socket operations for PF_ROSE sockets, installed on each new socket by
 * rose_create().  socketpair, shutdown and mmap use the generic
 * sock_no_*() handlers; poll and gettstamp use the generic datagram/sock
 * helpers.
 */
static const struct proto_ops rose_proto_ops = {
        .family                =        PF_ROSE,
        .owner                =        THIS_MODULE,
        .release        =        rose_release,
        .bind                =        rose_bind,
        .connect        =        rose_connect,
        .socketpair        =        sock_no_socketpair,
        .accept                =        rose_accept,
        .getname        =        rose_getname,
        .poll                =        datagram_poll,
        .ioctl                =        rose_ioctl,
        .gettstamp        =        sock_gettstamp,
        .listen                =        rose_listen,
        .shutdown        =        sock_no_shutdown,
        .setsockopt        =        rose_setsockopt,
        .getsockopt        =        rose_getsockopt,
        .sendmsg        =        rose_sendmsg,
        .recvmsg        =        rose_recvmsg,
        .mmap                =        sock_no_mmap,
};

/* Netdevice notifier block; events are delivered to rose_device_event(). */
static struct notifier_block rose_dev_notifier = {
        .notifier_call = rose_device_event,
};

/*
 * Table of rose_ndevs device pointers: allocated and filled in by
 * rose_proto_init(), walked and freed by rose_exit().
 */
static struct net_device **dev_rose;

/* AX.25 protocol hook: frames with PID AX25_P_ROSE go to rose_route_frame(). */
static struct ax25_protocol rose_pid = {
        .pid  = AX25_P_ROSE,
        .func = rose_route_frame,
};

/* AX.25 link-failure hook; failures are reported to rose_link_failed(). */
static struct ax25_linkfail rose_linkfail_notifier = {
        .func = rose_link_failed,
};

/*
 * Module entry point: bring up the ROSE protocol.
 *
 * Registers the protocol, allocates and registers rose_ndevs network
 * devices named "rose0", "rose1", ..., then registers the PF_ROSE socket
 * family, the netdevice notifier, the AX.25 protocol and link-failure
 * hooks, the sysctls (when CONFIG_SYSCTL is set), the loopback support and
 * the proc entries.
 *
 * Returns 0 on success or a negative errno.  On failure the devices, the
 * device table and the protocol registration made so far are undone.
 */
static int __init rose_proto_init(void)
{
        int i;
        int rc;

        /* Reject counts whose pointer table size would not fit in 31 bits
         * (a negative count converts to a huge unsigned value and is
         * rejected here as well).
         */
        if (rose_ndevs > 0x7FFFFFFF/sizeof(struct net_device *)) {
                printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter too large\n");
                rc = -EINVAL;
                goto out;
        }

        rc = proto_register(&rose_proto, 0);
        if (rc != 0)
                goto out;

        rose_callsign = null_ax25_address;

        dev_rose = kcalloc(rose_ndevs, sizeof(struct net_device *),
                           GFP_KERNEL);
        if (dev_rose == NULL) {
                printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate device structure\n");
                rc = -ENOMEM;
                goto out_proto_unregister;
        }

        for (i = 0; i < rose_ndevs; i++) {
                struct net_device *dev;
                char name[IFNAMSIZ];

                snprintf(name, sizeof(name), "rose%d", i);
                dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, rose_setup);
                if (!dev) {
                        printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate memory\n");
                        rc = -ENOMEM;
                        goto fail;
                }
                rc = register_netdev(dev);
                if (rc) {
                        printk(KERN_ERR "ROSE: netdevice registration failed\n");
                        free_netdev(dev);
                        goto fail;
                }
                rose_set_lockdep_key(dev);
                dev_rose[i] = dev;
        }

        /*
         * Do not carry on without the socket family: rose_exit() calls
         * sock_unregister(PF_ROSE) unconditionally, which must only happen
         * if this registration succeeded.  Here i == rose_ndevs, so the
         * fail path unwinds every device.
         */
        rc = sock_register(&rose_family_ops);
        if (rc) {
                printk(KERN_ERR "ROSE: rose_proto_init - unable to register socket family\n");
                goto fail;
        }
        register_netdevice_notifier(&rose_dev_notifier);

        ax25_register_pid(&rose_pid);
        ax25_linkfail_register(&rose_linkfail_notifier);

#ifdef CONFIG_SYSCTL
        rose_register_sysctl();
#endif
        rose_loopback_init();

        rose_add_loopback_neigh();

        proc_create_seq("rose", 0444, init_net.proc_net, &rose_info_seqops);
        proc_create_seq("rose_neigh", 0444, init_net.proc_net,
                    &rose_neigh_seqops);
        proc_create_seq("rose_nodes", 0444, init_net.proc_net,
                    &rose_node_seqops);
        proc_create_seq("rose_routes", 0444, init_net.proc_net,
                    &rose_route_seqops);
out:
        return rc;
fail:
        /* i counts the devices already stored in dev_rose[]; undo them. */
        while (--i >= 0) {
                unregister_netdev(dev_rose[i]);
                free_netdev(dev_rose[i]);
        }
        kfree(dev_rose);
out_proto_unregister:
        proto_unregister(&rose_proto);
        goto out;
}
module_init(rose_proto_init);

module_param(rose_ndevs, int, 0);
MODULE_PARM_DESC(rose_ndevs, "number of ROSE devices");

MODULE_AUTHOR("Jonathan Naylor G4KLX <g4klx@g4klx.demon.co.uk>");
MODULE_DESCRIPTION("The amateur radio ROSE network layer protocol");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_ROSE);

/*
 * Module exit: remove the proc entries, clear loopback and routing state,
 * drop the AX.25 hooks (and the listener on rose_callsign if one was set),
 * unregister sysctls, the netdevice notifier and the socket family, then
 * unregister and free every device in dev_rose[] before releasing the
 * table and the protocol.
 */
static void __exit rose_exit(void)
{
        int idx;

        remove_proc_entry("rose", init_net.proc_net);
        remove_proc_entry("rose_neigh", init_net.proc_net);
        remove_proc_entry("rose_nodes", init_net.proc_net);
        remove_proc_entry("rose_routes", init_net.proc_net);

        rose_loopback_clear();
        rose_rt_free();

        ax25_protocol_release(AX25_P_ROSE);
        ax25_linkfail_release(&rose_linkfail_notifier);

        /* Only release the listener if a callsign other than the null one is set. */
        if (ax25cmp(&rose_callsign, &null_ax25_address) != 0)
                ax25_listen_release(&rose_callsign, NULL);

#ifdef CONFIG_SYSCTL
        rose_unregister_sysctl();
#endif
        unregister_netdevice_notifier(&rose_dev_notifier);
        sock_unregister(PF_ROSE);

        for (idx = 0; idx < rose_ndevs; idx++) {
                struct net_device *dev = dev_rose[idx];

                if (!dev)
                        continue;
                unregister_netdev(dev);
                free_netdev(dev);
        }

        kfree(dev_rose);
        proto_unregister(&rose_proto);
}

module_exit(rose_exit);


































































































































    1 








    1 




































































































































































































































































































































































































































































































































































































































































































































































































    1 







































    1 























    1 

































































































































































    1 






    1 



















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
// SPDX-License-Identifier: GPL-2.0-only
/*
 * File: pep.c
 *
 * Phonet pipe protocol end point socket
 *
 * Copyright (C) 2008 Nokia Corporation.
 *
 * Author: Rémi Denis-Courmont
 */

#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <asm/ioctls.h>

#include <linux/phonet.h>
#include <linux/module.h>
#include <net/phonet/phonet.h>
#include <net/phonet/pep.h>
#include <net/phonet/gprs.h>

/* sk_state values:
 * TCP_CLOSE                sock not in use yet
 * TCP_CLOSE_WAIT        disconnected pipe
 * TCP_LISTEN                listening pipe endpoint
 * TCP_SYN_RECV                connected pipe in disabled state
 * TCP_ESTABLISHED        connected pipe in enabled state
 *
 * pep_sock locking:
 *  - sk_state, hlist: sock lock needed
 *  - listener: read only
 *  - pipe_handle: read only
 */

/* RX credit level restored by a multi-credit grant (see pipe_grant_credits()). */
#define CREDITS_MAX        10
/* A multi-credit grant is skipped while rx_credits + CREDITS_THR > CREDITS_MAX. */
#define CREDITS_THR        7

/* Sub-block size for an s-byte payload: 2-bytes head, 32-bits aligned */
#define pep_sb_size(s) (((s) + 2 + 3) & ~3)

/*
 * Get the next TLV sub-block.
 *
 * @skb:   buffer whose data starts at a sub-block (type byte, length byte,
 *         payload); the length byte counts the 2-byte head as well
 * @ptype: receives the sub-block type
 * @plen:  on entry the capacity of @buf, on return the payload length
 *         (sub-block length minus the 2-byte head)
 * @buf:   scratch space skb_header_pointer() may copy the payload into
 *
 * Returns the pointer produced by skb_header_pointer() for the first
 * min(capacity, payload length) payload bytes, or NULL when the head cannot
 * be read, the length byte is below 2, or the skb does not hold the whole
 * sub-block.  On success the skb is advanced past the sub-block.
 */
static unsigned char *pep_get_sb(struct sk_buff *skb, u8 *ptype, u8 *plen,
                                        void *buf)
{
        void *data = NULL;
        struct {
                u8 sb_type;
                u8 sb_len;
        } *ph, h;
        int buflen = *plen;
        u8 sb_type, sb_len;

        ph = skb_header_pointer(skb, 0, 2, &h);
        if (ph == NULL)
                return NULL;

        /*
         * Copy the head out before pskb_may_pull(): ph may point into the
         * skb head, which pskb_may_pull() is allowed to reallocate.  Working
         * on the copies also avoids rewriting the packet in place.
         */
        sb_type = ph->sb_type;
        sb_len = ph->sb_len;
        if (sb_len < 2 || !pskb_may_pull(skb, sb_len))
                return NULL;

        sb_len -= 2;
        *ptype = sb_type;
        *plen = sb_len;

        if (buflen > sb_len)
                buflen = sb_len;
        data = skb_header_pointer(skb, 2, buflen, buf);
        __skb_pull(skb, 2 + sb_len);
        return data;
}

/*
 * Allocate an skb owned by @sk, with MAX_PNPIPE_HEADER bytes reserved in
 * front of a copy of the @len-byte @payload.  On return the data pointer
 * and transport header both sit on a pnpipehdr pushed in front of the
 * payload; the caller fills that header in.
 *
 * Returns the new skb, or NULL if the allocation fails.
 */
static struct sk_buff *pep_alloc_skb(struct sock *sk, const void *payload,
                                        int len, gfp_t priority)
{
        struct sk_buff *nskb;

        nskb = alloc_skb(MAX_PNPIPE_HEADER + len, priority);
        if (nskb == NULL)
                return NULL;

        skb_set_owner_w(nskb, sk);
        skb_reserve(nskb, MAX_PNPIPE_HEADER);

        /* Payload first, then the pipe header in front of it. */
        __skb_put(nskb, len);
        skb_copy_to_linear_data(nskb, payload, len);
        __skb_push(nskb, sizeof(struct pnpipehdr));
        skb_reset_transport_header(nskb);

        return nskb;
}

/*
 * Answer the request in @oskb: the response reuses the request's utid and
 * pipe handle, uses message id + 1, carries @code as its error code and
 * @len bytes of @data as payload, and is sent to the request's source
 * address.
 *
 * Returns -ENOMEM if no skb could be allocated, else pn_skb_send()'s result.
 */
static int pep_reply(struct sock *sk, struct sk_buff *oskb, u8 code,
                        const void *data, int len, gfp_t priority)
{
        const struct pnpipehdr *req = pnp_hdr(oskb);
        struct sockaddr_pn peer;
        struct pnpipehdr *resp;
        struct sk_buff *skb = pep_alloc_skb(sk, data, len, priority);

        if (skb == NULL)
                return -ENOMEM;

        resp = pnp_hdr(skb);
        resp->utid = req->utid;
        resp->message_id = req->message_id + 1; /* REQ -> RESP */
        resp->pipe_handle = req->pipe_handle;
        resp->error_code = code;

        pn_skb_get_src_sockaddr(oskb, &peer);
        return pn_skb_send(sk, skb, &peer);
}

/*
 * Send an indication with message id @id on this socket's pipe handle.
 * The utid is 0, @code goes into the error_code slot of the header and
 * @data/@len form the payload.  No explicit destination is passed to
 * pn_skb_send().
 *
 * Returns -ENOMEM if no skb could be allocated, else pn_skb_send()'s result.
 */
static int pep_indicate(struct sock *sk, u8 id, u8 code,
                        const void *data, int len, gfp_t priority)
{
        struct pnpipehdr *ind;
        struct sk_buff *skb = pep_alloc_skb(sk, data, len, priority);

        if (skb == NULL)
                return -ENOMEM;

        ind = pnp_hdr(skb);
        ind->utid = 0;
        ind->message_id = id;
        ind->pipe_handle = pep_sk(sk)->pipe_handle;
        ind->error_code = code;

        return pn_skb_send(sk, skb, NULL);
}

/* Filler byte used for the unused positions of message payloads in this file. */
#define PAD 0x00

/*
 * Send a request with message id @id on this socket's pipe handle, with
 * @code in the error_code slot and @data/@len as payload.  The skb is
 * allocated with GFP_KERNEL, and @id doubles as the utid.
 *
 * Returns -ENOMEM if no skb could be allocated, else pn_skb_send()'s result.
 */
static int pipe_handler_request(struct sock *sk, u8 id, u8 code,
                                const void *data, int len)
{
        struct pnpipehdr *req;
        struct sk_buff *skb = pep_alloc_skb(sk, data, len, GFP_KERNEL);

        if (skb == NULL)
                return -ENOMEM;

        req = pnp_hdr(skb);
        req->utid = id; /* whatever */
        req->message_id = id;
        req->pipe_handle = pep_sk(sk)->pipe_handle;
        req->error_code = code;

        return pn_skb_send(sk, skb, NULL);
}

/*
 * Send PNS_PIPE_CREATED_IND carrying one PN_PIPE_SB_NEGOTIATED_FC sub-block
 * that holds this socket's TX and RX flow-control models.
 */
static int pipe_handler_send_created_ind(struct sock *sk)
{
        struct pep_sock *pn = pep_sk(sk);
        u8 sb[4];

        sb[0] = PN_PIPE_SB_NEGOTIATED_FC;
        sb[1] = pep_sb_size(2);
        sb[2] = pn->tx_fc;
        sb[3] = pn->rx_fc;

        return pep_indicate(sk, PNS_PIPE_CREATED_IND, 1 /* sub-blocks */,
                                sb, sizeof(sb), GFP_ATOMIC);
}

/*
 * Accept the connection request in @skb: reply with PN_PIPE_NO_ERROR and
 * two sub-blocks (required TX and preferred RX flow control), each listing
 * the multi-credit, one-credit and legacy models in that order.
 * May sleep: the reply is allocated with GFP_KERNEL.
 */
static int pep_accept_conn(struct sock *sk, struct sk_buff *skb)
{
        static const u8 data[20] = {
                /* three filler bytes, then the sub-block count */
                PAD, PAD, PAD, 2,

                PN_PIPE_SB_REQUIRED_FC_TX, pep_sb_size(5), 3, PAD,
                PN_MULTI_CREDIT_FLOW_CONTROL,
                PN_ONE_CREDIT_FLOW_CONTROL,
                PN_LEGACY_FLOW_CONTROL,
                PAD,

                PN_PIPE_SB_PREFERRED_FC_RX, pep_sb_size(5), 3, PAD,
                PN_MULTI_CREDIT_FLOW_CONTROL,
                PN_ONE_CREDIT_FLOW_CONTROL,
                PN_LEGACY_FLOW_CONTROL,
                PAD,
        };

        might_sleep();

        return pep_reply(sk, skb, PN_PIPE_NO_ERROR, data, sizeof(data),
                                GFP_KERNEL);
}

/*
 * Reject the connection request in @skb with error @code and an empty
 * sub-block list.  Warns if @code is PN_PIPE_NO_ERROR.
 */
static int pep_reject_conn(struct sock *sk, struct sk_buff *skb, u8 code,
                                gfp_t priority)
{
        /* three filler bytes, then a sub-block count of zero */
        static const u8 data[4] = { PAD, PAD, PAD, 0 };

        WARN_ON(code == PN_PIPE_NO_ERROR);

        return pep_reply(sk, skb, code, data, sizeof(data), priority);
}

/*
 * Control requests are not sent by the pipe service and have a specific
 * message format.
 *
 * Sends a PNS_PEP_CTRL_RESP for the control request in @oskb back to its
 * source address.  The response reuses the request's utid and pipe handle,
 * repeats the request's first data byte in data0, and carries the PEP type
 * and @code as the first two payload bytes.
 *
 * Returns -ENOMEM if no skb could be allocated, else pn_skb_send()'s result.
 */
static int pep_ctrlreq_error(struct sock *sk, struct sk_buff *oskb, u8 code,
                                gfp_t priority)
{
        const struct pnpipehdr *req = pnp_hdr(oskb);
        struct sockaddr_pn dst;
        struct pnpipehdr *resp;
        struct sk_buff *skb;
        u8 payload[4];

        payload[0] = req->pep_type;     /* PEP type */
        payload[1] = code;              /* error code, at an unusual offset */
        payload[2] = PAD;
        payload[3] = PAD;

        skb = pep_alloc_skb(sk, payload, 4, priority);
        if (skb == NULL)
                return -ENOMEM;

        resp = pnp_hdr(skb);
        resp->utid = req->utid;
        resp->message_id = PNS_PEP_CTRL_RESP;
        resp->pipe_handle = req->pipe_handle;
        resp->data0 = req->data[0]; /* CTRL id */

        pn_skb_get_src_sockaddr(oskb, &dst);
        return pn_skb_send(sk, skb, &dst);
}

/*
 * Send a PNS_PEP_STATUS_IND of PEP type PN_PEP_TYPE_COMMON whose payload
 * is the indication @type, two filler bytes and the @status value.
 */
static int pipe_snd_status(struct sock *sk, u8 type, u8 status, gfp_t priority)
{
        u8 payload[4];

        payload[0] = type;
        payload[1] = PAD;
        payload[2] = PAD;
        payload[3] = status;

        return pep_indicate(sk, PNS_PEP_STATUS_IND, PN_PEP_TYPE_COMMON,
                                payload, sizeof(payload), priority);
}

/*
 * Send our RX flow control information to the sender.
 * Socket must be locked, and must be in TCP_ESTABLISHED (BUG otherwise).
 *
 * One-credit model: send a "ready" status and, if that succeeds, set
 * rx_credits to 1.  Multi-credit model: unless rx_credits + CREDITS_THR
 * exceeds CREDITS_MAX, grant the difference up to CREDITS_MAX and, if the
 * send succeeds, set rx_credits to CREDITS_MAX.  Nothing is sent for the
 * legacy model or any other value.
 */
static void pipe_grant_credits(struct sock *sk, gfp_t priority)
{
        struct pep_sock *pn = pep_sk(sk);

        BUG_ON(sk->sk_state != TCP_ESTABLISHED);

        switch (pn->rx_fc) {
        case PN_ONE_CREDIT_FLOW_CONTROL:
                if (!pipe_snd_status(sk, PN_PEP_IND_FLOW_CONTROL,
                                        PEP_IND_READY, priority))
                        pn->rx_credits = 1;
                return;

        case PN_MULTI_CREDIT_FLOW_CONTROL:
                if ((pn->rx_credits + CREDITS_THR) > CREDITS_MAX)
                        return;
                if (!pipe_snd_status(sk, PN_PEP_IND_ID_MCFC_GRANT_CREDITS,
                                        CREDITS_MAX - pn->rx_credits,
                                        priority))
                        pn->rx_credits = CREDITS_MAX;
                return;

        case PN_LEGACY_FLOW_CONTROL: /* TODO */
        default:
                return;
        }
}

static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb)
{
        struct pep_sock *pn = pep_sk(sk);
        struct pnpipehdr *hdr;
        int wake = 0;

        if (!pskb_may_pull(skb, sizeof(*hdr) + 4))
                return -EINVAL;

        hdr = pnp_hdr(skb);
        if (hdr->pep_type != PN_PEP_TYPE_COMMON) {
                net_dbg_ratelimited("Phonet unknown PEP type: %u\n",
                                    (unsigned int)hdr->pep_type);
                return -EOPNOTSUPP;
        }

        switch (hdr->data[0]) {
        case PN_PEP_IND_FLOW_CONTROL:
                switch (pn->tx_fc) {
                case PN_LEGACY_FLOW_CONTROL:
                        switch (hdr->data[3]) {
                        case PEP_IND_BUSY:
                                atomic_set(&pn->tx_credits, 0);
                                break;
                        case PEP_IND_READY:
                                atomic_set(&pn->tx_credits, wake = 1);
                                break;
                        }
                        break;
                case PN_ONE_CREDIT_FLOW_CONTROL:
                        if (hdr->data[3] == PEP_IND_READY)
                                atomic_set(&pn->tx_credits, wake = 1);
                        break;
                }
                break;

        case PN_PEP_IND_ID_MCFC_GRANT_CREDITS:
                if (pn->tx_fc != PN_MULTI_CREDIT_FLOW_CONTROL)
                        break;
                atomic_add(wake = hdr->data[3], &pn->tx_credits);
                break;

        default:
                net_dbg_ratelimited("Phonet unknown PEP indication: %u\n",
                                    (unsigned int)hdr->data[0]);
                return -EOPNOTSUPP;
        }
        if (wake)
                sk->sk_write_space(sk);
        return 0;
}

/*
 * Parse the sub-blocks of a pipe created/redirected indication.
 *
 * Both flow-control models start out as legacy.  Each
 * PN_PIPE_SB_NEGOTIATED_FC sub-block with at least two payload bytes, both
 * in the range 0..3, replaces the TX and RX models.  The skb is consumed
 * past the header and every parsed sub-block.
 *
 * Returns 0, or -EINVAL if a sub-block cannot be read.
 */
static int pipe_rcv_created(struct sock *sk, struct sk_buff *skb)
{
        struct pep_sock *pn = pep_sk(sk);
        struct pnpipehdr *hdr = pnp_hdr(skb);
        u8 remaining = hdr->data0;      /* number of sub-blocks announced */

        pn->tx_fc = PN_LEGACY_FLOW_CONTROL;
        pn->rx_fc = PN_LEGACY_FLOW_CONTROL;
        __skb_pull(skb, sizeof(*hdr));

        for (; remaining > 0; remaining--) {
                u8 type, buf[2], len = sizeof(buf);
                u8 *data = pep_get_sb(skb, &type, &len, buf);

                if (!data)
                        return -EINVAL;
                if (type != PN_PIPE_SB_NEGOTIATED_FC)
                        continue;
                if (len < 2 || (data[0] | data[1]) > 3)
                        continue;

                pn->tx_fc = data[0] & 3;
                pn->rx_fc = data[1] & 3;
        }
        return 0;
}

/*
 * Queue an skb to a connected sock.
 * Socket lock must be held.
 *
 * Dispatches on the pipe header's message id.  Data and control-request
 * messages are appended to a queue and kept (the "queue" exit); every other
 * message is handled in place and the skb is freed (the "out" exit).
 * Must not be called on a socket in TCP_CLOSE_WAIT (BUG otherwise).
 *
 * Returns NET_RX_DROP only when the message was dropped with -ENOBUFS,
 * NET_RX_SUCCESS in every other case, including handled errors.
 */
static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
{
        struct pep_sock *pn = pep_sk(sk);
        struct pnpipehdr *hdr = pnp_hdr(skb);
        struct sk_buff_head *queue;
        int err = 0;

        BUG_ON(sk->sk_state == TCP_CLOSE_WAIT);

        switch (hdr->message_id) {
        case PNS_PEP_CONNECT_REQ:
                /* This sock is already connected: refuse the new request. */
                pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, GFP_ATOMIC);
                break;

        case PNS_PEP_DISCONNECT_REQ:
                /* Acknowledge, mark the pipe disconnected, notify if alive. */
                pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
                sk->sk_state = TCP_CLOSE_WAIT;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk->sk_state_change(sk);
                break;

        case PNS_PEP_ENABLE_REQ:
                /* Wait for PNS_PIPE_(ENABLED|REDIRECTED)_IND */
                pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
                break;

        case PNS_PEP_RESET_REQ:
                /* Record the requested post-reset state in init_enable. */
                switch (hdr->state_after_reset) {
                case PN_PIPE_DISABLE:
                        pn->init_enable = 0;
                        break;
                case PN_PIPE_ENABLE:
                        pn->init_enable = 1;
                        break;
                default: /* not allowed to send an error here!? */
                        err = -EINVAL;
                        goto out;
                }
                fallthrough;
        case PNS_PEP_DISABLE_REQ:
                /* Stop transmitting, then acknowledge. */
                atomic_set(&pn->tx_credits, 0);
                pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
                break;

        case PNS_PEP_CTRL_REQ:
                /* Bounded control-request queue: count a drop when full. */
                if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) {
                        atomic_inc(&sk->sk_drops);
                        break;
                }
                __skb_pull(skb, 4);
                queue = &pn->ctrlreq_queue;
                goto queue;

        case PNS_PIPE_ALIGNED_DATA:
                /* One extra byte to skip compared with PNS_PIPE_DATA. */
                __skb_pull(skb, 1);
                fallthrough;
        case PNS_PIPE_DATA:
                __skb_pull(skb, 3); /* Pipe data header */
                if (!pn_flow_safe(pn->rx_fc)) {
                        /* No credit accounting: use the generic queueing
                         * helper; any failure is reported as -ENOBUFS.
                         */
                        err = sock_queue_rcv_skb(sk, skb);
                        if (!err)
                                return NET_RX_SUCCESS;
                        err = -ENOBUFS;
                        break;
                }

                /* Credit-based RX: drop data received without a credit. */
                if (pn->rx_credits == 0) {
                        atomic_inc(&sk->sk_drops);
                        err = -ENOBUFS;
                        break;
                }
                pn->rx_credits--;
                queue = &sk->sk_receive_queue;
                goto queue;

        case PNS_PEP_STATUS_IND:
                /* Return value deliberately not propagated. */
                pipe_rcv_status(sk, skb);
                break;

        case PNS_PIPE_REDIRECTED_IND:
                err = pipe_rcv_created(sk, skb);
                break;

        case PNS_PIPE_CREATED_IND:
                err = pipe_rcv_created(sk, skb);
                if (err)
                        break;
                fallthrough;
        case PNS_PIPE_RESET_IND:
                /* Only continue into the enable path if auto-enable is set. */
                if (!pn->init_enable)
                        break;
                fallthrough;
        case PNS_PIPE_ENABLED_IND:
                if (!pn_flow_safe(pn->tx_fc)) {
                        atomic_set(&pn->tx_credits, 1);
                        sk->sk_write_space(sk);
                }
                if (sk->sk_state == TCP_ESTABLISHED)
                        break; /* Nothing to do */
                sk->sk_state = TCP_ESTABLISHED;
                pipe_grant_credits(sk, GFP_ATOMIC);
                break;

        case PNS_PIPE_DISABLED_IND:
                /* Back to "connected but disabled"; forget granted credits. */
                sk->sk_state = TCP_SYN_RECV;
                pn->rx_credits = 0;
                break;

        default:
                net_dbg_ratelimited("Phonet unknown PEP message: %u\n",
                                    hdr->message_id);
                err = -EINVAL;
        }
out:
        /* Message handled in place (or rejected): the skb is released here. */
        kfree_skb(skb);
        return (err == -ENOBUFS) ? NET_RX_DROP : NET_RX_SUCCESS;

queue:
        /* Hand the skb over to the chosen queue and wake any reader. */
        skb->dev = NULL;
        skb_set_owner_r(skb, sk);
        skb_queue_tail(queue, skb);
        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk);
        return NET_RX_SUCCESS;
}

/* Destroy connected sock. */
static void pipe_destruct(struct sock *sk)
{
        struct pep_sock *pn = pep_sk(sk);

        skb_queue_purge(&sk->sk_receive_queue);
        skb_queue_purge(&pn->ctrlreq_queue);
}

/*
 * Pick a flow-control model from the @n candidates in @fcs: the largest
 * value that is below PN_MAX_FLOW_CONTROL.  Returns PN_NO_FLOW_CONTROL if
 * no candidate is greater than it and within range.
 */
static u8 pipe_negotiate_fc(const u8 *fcs, unsigned int n)
{
        const u8 *end = fcs + n;
        u8 best = PN_NO_FLOW_CONTROL;

        while (fcs != end) {
                u8 candidate = *fcs++;

                if (candidate >= PN_MAX_FLOW_CONTROL)
                        continue;
                if (candidate > best)
                        best = candidate;
        }
        return best;
}

/*
 * Process a connect response: check its error code, negotiate the TX and
 * RX flow-control models from the REQUIRED_FC_TX / PREFERRED_FC_RX
 * sub-blocks, then send the pipe-created indication.
 *
 * Returns -EINVAL if the message or a sub-block is truncated,
 * -ECONNREFUSED if the peer reported an error, otherwise the result of
 * pipe_handler_send_created_ind().
 */
static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb)
{
        struct pep_sock *pn = pep_sk(sk);
        struct pnpipehdr *hdr;
        u8 remaining;

        if (!pskb_pull(skb, sizeof(*hdr) + 4))
                return -EINVAL;

        hdr = pnp_hdr(skb);
        if (hdr->error_code != PN_PIPE_NO_ERROR)
                return -ECONNREFUSED;

        /* Parse sub-blocks; their count is the fourth data byte. */
        for (remaining = hdr->data[3]; remaining > 0; remaining--) {
                u8 type, buf[6], len = sizeof(buf);
                const u8 *data = pep_get_sb(skb, &type, &len, buf);

                if (!data)
                        return -EINVAL;

                /* Too short for its own head, or for the count it declares. */
                if (len < 2 || len < data[0])
                        continue;

                if (type == PN_PIPE_SB_REQUIRED_FC_TX)
                        pn->tx_fc = pipe_negotiate_fc(data + 2, len - 2);
                else if (type == PN_PIPE_SB_PREFERRED_FC_RX)
                        pn->rx_fc = pipe_negotiate_fc(data + 2, len - 2);
        }

        return pipe_handler_send_created_ind(sk);
}

/*
 * Process an enable response: on success send PNS_PIPE_ENABLED_IND with no
 * sub-blocks.  Returns -ECONNREFUSED if the peer reported an error,
 * otherwise the result of pep_indicate().
 */
static int pep_enableresp_rcv(struct sock *sk, struct sk_buff *skb)
{
        const struct pnpipehdr *resp = pnp_hdr(skb);

        if (resp->error_code == PN_PIPE_NO_ERROR)
                return pep_indicate(sk, PNS_PIPE_ENABLED_IND,
                                    0 /* sub-blocks */, NULL, 0, GFP_ATOMIC);

        return -ECONNREFUSED;
}

/*
 * Start flow control on a pipe that just became enabled: when the TX model
 * is not flagged by pn_flow_safe(), set one TX credit and invoke the
 * write-space callback; then grant RX credits to the peer.
 */
static void pipe_start_flow_control(struct sock *sk)
{
        struct pep_sock *pn = pep_sk(sk);
        int tx_safe = pn_flow_safe(pn->tx_fc);

        if (tx_safe == 0) {
                atomic_set(&pn->tx_credits, 1);
                sk->sk_write_space(sk);
        }

        pipe_grant_credits(sk, GFP_ATOMIC);
}

/*
 * Queue an skb to an actively connected sock.
 * Socket lock must be held.
 *
 * Data messages are queued to the receive queue and kept.  Connect and
 * enable responses drive the socket state out of TCP_SYN_SENT; status
 * indications update TX credits.  Every message that is not queued,
 * including ids not listed in the switch, is freed before returning.
 *
 * Returns NET_RX_DROP when data had to be dropped, else NET_RX_SUCCESS.
 */
static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
{
        struct pep_sock *pn = pep_sk(sk);
        struct pnpipehdr *hdr = pnp_hdr(skb);
        int err = NET_RX_SUCCESS;

        switch (hdr->message_id) {
        case PNS_PIPE_ALIGNED_DATA:
                /* One extra byte to skip compared with PNS_PIPE_DATA. */
                __skb_pull(skb, 1);
                fallthrough;
        case PNS_PIPE_DATA:
                __skb_pull(skb, 3); /* Pipe data header */
                if (!pn_flow_safe(pn->rx_fc)) {
                        /* No credit accounting: generic queueing helper. */
                        err = sock_queue_rcv_skb(sk, skb);
                        if (!err)
                                return NET_RX_SUCCESS;
                        err = NET_RX_DROP;
                        break;
                }

                /* Credit-based RX: drop data received without a credit. */
                if (pn->rx_credits == 0) {
                        atomic_inc(&sk->sk_drops);
                        err = NET_RX_DROP;
                        break;
                }
                pn->rx_credits--;
                skb->dev = NULL;
                skb_set_owner_r(skb, sk);
                skb_queue_tail(&sk->sk_receive_queue, skb);
                if (!sock_flag(sk, SOCK_DEAD))
                        sk->sk_data_ready(sk);
                return NET_RX_SUCCESS;

        case PNS_PEP_CONNECT_RESP:
                /* Only meaningful while a connect is outstanding. */
                if (sk->sk_state != TCP_SYN_SENT)
                        break;
                /* Note: the state-change callback runs before sk_state is
                 * updated below.
                 */
                if (!sock_flag(sk, SOCK_DEAD))
                        sk->sk_state_change(sk);
                if (pep_connresp_rcv(sk, skb)) {
                        sk->sk_state = TCP_CLOSE_WAIT;
                        break;
                }
                /* Connected: disabled or enabled depending on init_enable. */
                if (pn->init_enable == PN_PIPE_DISABLE)
                        sk->sk_state = TCP_SYN_RECV;
                else {
                        sk->sk_state = TCP_ESTABLISHED;
                        pipe_start_flow_control(sk);
                }
                break;

        case PNS_PEP_ENABLE_RESP:
                /* Only meaningful while in TCP_SYN_SENT. */
                if (sk->sk_state != TCP_SYN_SENT)
                        break;

                if (pep_enableresp_rcv(sk, skb)) {
                        sk->sk_state = TCP_CLOSE_WAIT;
                        break;
                }

                sk->sk_state = TCP_ESTABLISHED;
                pipe_start_flow_control(sk);
                break;

        case PNS_PEP_DISCONNECT_RESP:
                /* sock should already be dead, nothing to do */
                break;

        case PNS_PEP_STATUS_IND:
                /* Return value deliberately not propagated. */
                pipe_rcv_status(sk, skb);
                break;
        }
        kfree_skb(skb);
        return err;
}

/*
 * Look up a connected sock on @hlist by destination object and pipe
 * handle, skipping socks in TCP_CLOSE_WAIT.
 * Listening sock must be locked.
 *
 * Returns the matching sock with a reference taken (sock_hold), or NULL.
 */
static struct sock *pep_find_pipe(const struct hlist_head *hlist,
                                        const struct sockaddr_pn *dst,
                                        u8 pipe_handle)
{
        u16 wanted_obj = pn_sockaddr_get_object(dst);
        struct sock *cand;

        sk_for_each(cand, hlist) {
                struct pep_sock *pep = pep_sk(cand);

                /* Ports match, but addresses might not: */
                if (pep->pn_sk.sobject == wanted_obj &&
                    pep->pipe_handle == pipe_handle &&
                    cand->sk_state != TCP_CLOSE_WAIT) {
                        sock_hold(cand);
                        return cand;
                }
        }
        return NULL;
}

/*
 * Deliver an skb to a listening sock.
 * Socket lock must be held.
 * We then queue the skb to the right connected sock (if any).
 */
static int pep_do_rcv(struct sock *sk, struct sk_buff *skb)
{
        struct pep_sock *pn = pep_sk(sk);
        struct sock *sknode;
        struct pnpipehdr *hdr;
        struct sockaddr_pn dst;
        u8 pipe_handle;

        if (!pskb_may_pull(skb, sizeof(*hdr)))
                goto drop;

        hdr = pnp_hdr(skb);
        pipe_handle = hdr->pipe_handle;
        if (pipe_handle == PN_PIPE_INVALID_HANDLE)
                goto drop;

        pn_skb_get_dst_sockaddr(skb, &dst);

        /* Look for an existing pipe handle */
        sknode = pep_find_pipe(&pn->hlist, &dst, pipe_handle);
        if (sknode)
                return sk_receive_skb(sknode, skb, 1);

        switch (hdr->message_id) {
        case PNS_PEP_CONNECT_REQ:
                if (sk->sk_state != TCP_LISTEN || sk_acceptq_is_full(sk)) {
                        pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE,
                                        GFP_ATOMIC);
                        break;
                }
                skb_queue_head(&sk->sk_receive_queue, skb);
                sk_acceptq_added(sk);
                if (!sock_flag(sk, SOCK_DEAD))
                        sk->sk_data_ready(sk);
                return NET_RX_SUCCESS;

        case PNS_PEP_DISCONNECT_REQ:
                pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
                break;

        case PNS_PEP_CTRL_REQ:
                pep_ctrlreq_error(sk, skb, PN_PIPE_INVALID_HANDLE, GFP_ATOMIC);
                break;

        case PNS_PEP_RESET_REQ:
        case PNS_PEP_ENABLE_REQ:
        case PNS_PEP_DISABLE_REQ:
                /* invalid handle is not even allowed here! */
                break;

        default:
                if ((1 << sk->sk_state)
                                & ~(TCPF_CLOSE|TCPF_LISTEN|TCPF_CLOSE_WAIT))
                        /* actively connected socket */
                        return pipe_handler_do_rcv(sk, skb);
        }
drop:
        kfree_skb(skb);
        return NET_RX_SUCCESS;
}

static int pipe_do_remove(struct sock *sk)
{
        struct pep_sock *pn = pep_sk(sk);
        struct pnpipehdr *ph;
        struct sk_buff *skb;

        skb = pep_alloc_skb(sk, NULL, 0, GFP_KERNEL);
        if (!skb)
                return -ENOMEM;

        ph = pnp_hdr(skb);
        ph->utid = 0;
        ph->message_id = PNS_PIPE_REMOVE_REQ;
        ph->pipe_handle = pn->pipe_handle;
        ph->data0 = PAD;
        return pn_skb_send(sk, skb, NULL);
}

/*
 * Close handler: the associated socket ceases to exist.
 *
 * A pipe still in SYN_RECV or ESTABLISHED state is torn down first:
 * socks whose backlog handler is pipe_do_rcv get a pipe removal request,
 * all others send PNS_PEP_DISCONNECT_REQ. If an interface index was
 * recorded on the sock, gprs_detach() is called after the lock is dropped.
 */
static void pep_sock_close(struct sock *sk, long timeout)
{
        struct pep_sock *pipe = pep_sk(sk);
        int bound_if;

        /* Extra reference so that sk outlives sk_common_release() */
        sock_hold(sk);
        sk_common_release(sk);

        lock_sock(sk);
        if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED)) {
                if (sk->sk_backlog_rcv != pipe_do_rcv)
                        pipe_handler_request(sk, PNS_PEP_DISCONNECT_REQ, PAD,
                                                NULL, 0);
                else
                        /* Forcefully remove dangling Phonet pipe */
                        pipe_do_remove(sk);
        }
        sk->sk_state = TCP_CLOSE;

        /* Take over the interface index while still under the lock */
        bound_if = pipe->ifindex;
        pipe->ifindex = 0;
        release_sock(sk);

        if (bound_if != 0)
                gprs_detach(sk);
        sock_put(sk);
}

/*
 * Accept handler: turn a queued PNS_PEP_CONNECT_REQ into a new pipe sock.
 *
 * Dequeues one pending connection request from the listening sock @sk,
 * validates it, parses its sub-blocks, rejects it if a pipe with the same
 * handle already exists, and otherwise allocates and initialises a new
 * sock which is added to the listener's pipe list.
 *
 * Returns the new sock, or NULL on failure. Whenever a request was
 * dequeued, arg->err is set to the resulting error code (0 on success);
 * if no request could be dequeued, arg->err is whatever
 * skb_recv_datagram() stored there.
 */
static struct sock *pep_sock_accept(struct sock *sk,
                                    struct proto_accept_arg *arg)
{
        struct pep_sock *pn = pep_sk(sk), *newpn;
        struct sock *newsk = NULL;
        struct sk_buff *skb;
        struct pnpipehdr *hdr;
        struct sockaddr_pn dst, src;
        int err;
        u16 peer_type;
        u8 pipe_handle, enabled, n_sb;
        u8 aligned = 0;

        skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
                                &arg->err);
        if (!skb)
                return NULL;

        lock_sock(sk);
        if (sk->sk_state != TCP_LISTEN) {
                err = -EINVAL;
                goto drop;
        }
        sk_acceptq_removed(sk);

        /* Default error for every malformed-request exit below */
        err = -EPROTO;
        if (!pskb_may_pull(skb, sizeof(*hdr) + 4))
                goto drop;

        hdr = pnp_hdr(skb);
        pipe_handle = hdr->pipe_handle;
        switch (hdr->state_after_connect) {
        case PN_PIPE_DISABLE:
                enabled = 0;
                break;
        case PN_PIPE_ENABLE:
                enabled = 1;
                break;
        default:
                pep_reject_conn(sk, skb, PN_PIPE_ERR_INVALID_PARAM,
                                GFP_KERNEL);
                goto drop;
        }
        /* High byte: PEP type; low byte may be filled in by a sub-block */
        peer_type = hdr->other_pep_type << 8;

        /* Parse sub-blocks (options) */
        n_sb = hdr->data[3];
        while (n_sb > 0) {
                u8 type, buf[1], len = sizeof(buf);
                const u8 *data = pep_get_sb(skb, &type, &len, buf);

                if (data == NULL)
                        goto drop;
                switch (type) {
                case PN_PIPE_SB_CONNECT_REQ_PEP_SUB_TYPE:
                        if (len < 1)
                                goto drop;
                        peer_type = (peer_type & 0xff00) | data[0];
                        break;
                case PN_PIPE_SB_ALIGNED_DATA:
                        /* Same guard as above: never read data[0] from a
                         * sub-block that carries no payload byte. */
                        if (len < 1)
                                goto drop;
                        aligned = data[0] != 0;
                        break;
                }
                n_sb--;
        }

        /* The destination address must be known before the lookup below;
         * pep_find_pipe() compares against its object field. */
        pn_skb_get_dst_sockaddr(skb, &dst);

        /* Check for duplicate pipe handle */
        newsk = pep_find_pipe(&pn->hlist, &dst, pipe_handle);
        if (unlikely(newsk)) {
                /* Drop the reference taken by pep_find_pipe() */
                __sock_put(newsk);
                newsk = NULL;
                pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, GFP_KERNEL);
                goto drop;
        }

        /* Create a new to-be-accepted sock */
        newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot,
                         arg->kern);
        if (!newsk) {
                pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL);
                err = -ENOBUFS;
                goto drop;
        }

        sock_init_data(NULL, newsk);
        newsk->sk_state = TCP_SYN_RECV;
        newsk->sk_backlog_rcv = pipe_do_rcv;
        newsk->sk_protocol = sk->sk_protocol;
        newsk->sk_destruct = pipe_destruct;

        newpn = pep_sk(newsk);
        pn_skb_get_src_sockaddr(skb, &src);
        /* The new sock's source is the request's destination and vice versa */
        newpn->pn_sk.sobject = pn_sockaddr_get_object(&dst);
        newpn->pn_sk.dobject = pn_sockaddr_get_object(&src);
        newpn->pn_sk.resource = pn_sockaddr_get_resource(&dst);
        /* The new sock keeps a reference on its listener */
        sock_hold(sk);
        newpn->listener = sk;
        skb_queue_head_init(&newpn->ctrlreq_queue);
        newpn->pipe_handle = pipe_handle;
        atomic_set(&newpn->tx_credits, 0);
        newpn->ifindex = 0;
        newpn->peer_type = peer_type;
        newpn->rx_credits = 0;
        newpn->rx_fc = newpn->tx_fc = PN_LEGACY_FLOW_CONTROL;
        newpn->init_enable = enabled;
        newpn->aligned = aligned;

        err = pep_accept_conn(newsk, skb);
        if (err) {
                /* Undo the listener reference, then release the new sock */
                __sock_put(sk);
                sock_put(newsk);
                newsk = NULL;
                goto drop;
        }
        sk_add_node(newsk, &pn->hlist);
drop:
        release_sock(sk);
        kfree_skb(skb);
        arg->err = err;
        return newsk;
}

/*
 * Connect handler: send a PNS_PEP_CONNECT_REQ for this sock's pipe.
 *
 * A sock without a pipe handle gets handle 1 first. On success the sock
 * moves to TCP_SYN_SENT and 0 is returned; on failure the pipe handle is
 * reset to PN_PIPE_INVALID_HANDLE and the error is returned.
 */
static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len)
{
        u8 payload[4] = { 0 /* sub-blocks */, PAD, PAD, PAD };
        struct pep_sock *pipe = pep_sk(sk);
        int rc;

        if (pipe->pipe_handle == PN_PIPE_INVALID_HANDLE)
                pipe->pipe_handle = 1; /* anything but INVALID_HANDLE */

        rc = pipe_handler_request(sk, PNS_PEP_CONNECT_REQ,
                                pipe->init_enable, payload, sizeof(payload));
        if (rc == 0) {
                sk->sk_state = TCP_SYN_SENT;
                return 0;
        }

        pipe->pipe_handle = PN_PIPE_INVALID_HANDLE;
        return rc;
}

/*
 * Send a PNS_PEP_ENABLE_REQ for the pipe of @sk. On success the sock
 * moves to TCP_SYN_SENT and 0 is returned; otherwise the error from
 * pipe_handler_request() is returned and the state is left alone.
 */
static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len)
{
        int rc = pipe_handler_request(sk, PNS_PEP_ENABLE_REQ, PAD, NULL, 0);

        if (rc == 0)
                sk->sk_state = TCP_SYN_SENT;
        return rc;
}

static unsigned int pep_first_packet_length(struct sock *sk)
{
        struct pep_sock *pn = pep_sk(sk);
        struct sk_buff_head *q;
        struct sk_buff *skb;
        unsigned int len = 0;
        bool found = false;

        if (sock_flag(sk, SOCK_URGINLINE)) {
                q = &pn->ctrlreq_queue;
                spin_lock_bh(&q->lock);
                skb = skb_peek(q);
                if (skb) {
                        len = skb->len;
                        found = true;
                }
                spin_unlock_bh(&q->lock);
        }

        if (likely(!found)) {
                q = &sk->sk_receive_queue;
                spin_lock_bh(&q->lock);
                skb = skb_peek(q);
                if (skb)
                        len = skb->len;
                spin_unlock_bh(&q->lock);
        }

        return len;
}

/*
 * ioctl handler.
 *
 * SIOCINQ:          store the length of the first queued packet in *@karg;
 *                   -EINVAL on a listening sock.
 * SIOCPNENABLEPIPE: request pipe enabling under the socket lock; -EBUSY
 *                   while in SYN_SENT, -EISCONN when already established,
 *                   -EADDRNOTAVAIL when the sock has no source object.
 * Anything else:    -ENOIOCTLCMD.
 */
static int pep_ioctl(struct sock *sk, int cmd, int *karg)
{
        struct pep_sock *pipe = pep_sk(sk);
        int rc;

        switch (cmd) {
        case SIOCINQ:
                if (sk->sk_state == TCP_LISTEN)
                        return -EINVAL;

                *karg = pep_first_packet_length(sk);
                return 0;

        case SIOCPNENABLEPIPE:
                lock_sock(sk);
                switch (sk->sk_state) {
                case TCP_SYN_SENT:
                        rc = -EBUSY;
                        break;
                case TCP_ESTABLISHED:
                        rc = -EISCONN;
                        break;
                default:
                        if (pipe->pn_sk.sobject)
                                rc = pep_sock_enable(sk, NULL, 0);
                        else
                                rc = -EADDRNOTAVAIL;
                        break;
                }
                release_sock(sk);
                return rc;

        default:
                return -ENOIOCTLCMD;
        }
}

/*
 * Per-sock initialisation: install the destructor and put every pipe
 * field into its starting state (no listener, no pipe handle, no
 * credits, legacy flow control both ways, initially enabled, unaligned).
 * Always returns 0.
 */
static int pep_init(struct sock *sk)
{
        struct pep_sock *pipe = pep_sk(sk);

        sk->sk_destruct = pipe_destruct;

        /* Lists and queues */
        INIT_HLIST_HEAD(&pipe->hlist);
        skb_queue_head_init(&pipe->ctrlreq_queue);

        /* Pipe identity */
        pipe->listener = NULL;
        pipe->pipe_handle = PN_PIPE_INVALID_HANDLE;
        pipe->peer_type = 0;
        pipe->ifindex = 0;

        /* Flow control */
        pipe->tx_fc = PN_LEGACY_FLOW_CONTROL;
        pipe->rx_fc = PN_LEGACY_FLOW_CONTROL;
        atomic_set(&pipe->tx_credits, 0);
        pipe->rx_credits = 0;

        /* Options */
        pipe->init_enable = 1;
        pipe->aligned = 0;

        return 0;
}

/*
 * setsockopt handler for level SOL_PNPIPE.
 *
 * PNPIPE_ENCAP:     attach (PNPIPE_ENCAP_IP) or detach (0) GPRS
 *                   encapsulation; requires CAP_NET_ADMIN when the
 *                   setting actually changes.
 * PNPIPE_HANDLE:    set the pipe handle; only allowed in TCP_CLOSE state
 *                   and for values in [0, PN_PIPE_INVALID_HANDLE).
 * PNPIPE_INITSTATE: set init_enable to the truth value of the argument.
 *
 * Returns 0 on success or a negative errno. Any other level or option
 * name yields -ENOPROTOOPT.
 */
static int pep_setsockopt(struct sock *sk, int level, int optname,
                          sockptr_t optval, unsigned int optlen)
{
        struct pep_sock *pn = pep_sk(sk);
        int val = 0, err = 0;

        if (level != SOL_PNPIPE)
                return -ENOPROTOOPT;
        /* A buffer shorter than an int is not read; val then stays 0 */
        if (optlen >= sizeof(int)) {
                if (copy_from_sockptr(&val, optval, sizeof(int)))
                        return -EFAULT;
        }

        lock_sock(sk);
        switch (optname) {
        case PNPIPE_ENCAP:
                if (val && val != PNPIPE_ENCAP_IP) {
                        err = -EINVAL;
                        break;
                }
                /* Requested on/off state equals the current one */
                if (!pn->ifindex == !val)
                        break; /* Nothing to do! */
                if (!capable(CAP_NET_ADMIN)) {
                        err = -EPERM;
                        break;
                }
                /* Both branches drop the socket lock themselves before
                 * calling into GPRS, hence the jump past release_sock() */
                if (val) {
                        release_sock(sk);
                        err = gprs_attach(sk);
                        /* A positive return is stored as the ifindex */
                        if (err > 0) {
                                pn->ifindex = err;
                                err = 0;
                        }
                } else {
                        pn->ifindex = 0;
                        release_sock(sk);
                        gprs_detach(sk);
                        err = 0;
                }
                goto out_norel;

        case PNPIPE_HANDLE:
                if ((sk->sk_state == TCP_CLOSE) &&
                        (val >= 0) && (val < PN_PIPE_INVALID_HANDLE))
                        pn->pipe_handle = val;
                else
                        err = -EINVAL;
                break;

        case PNPIPE_INITSTATE:
                pn->init_enable = !!val;
                break;

        default:
                err = -ENOPROTOOPT;
        }
        release_sock(sk);

out_norel:
        return err;
}

/*
 * getsockopt handler for level SOL_PNPIPE.
 *
 * PNPIPE_ENCAP:     PNPIPE_ENCAP_IP if an interface index is recorded,
 *                   PNPIPE_ENCAP_NONE otherwise.
 * PNPIPE_IFINDEX:   the recorded interface index.
 * PNPIPE_HANDLE:    the pipe handle; -EINVAL if none is assigned.
 * PNPIPE_INITSTATE: the init_enable setting.
 *
 * The option value is an int. The length written back through @optlen is
 * the caller's length clamped to sizeof(int), and no more than that many
 * bytes are stored to @optval, so a caller that announces a short buffer
 * does not get memory beyond it overwritten.
 *
 * Returns 0 on success, -ENOPROTOOPT for an unknown level or option,
 * -EFAULT if user memory cannot be accessed.
 */
static int pep_getsockopt(struct sock *sk, int level, int optname,
                                char __user *optval, int __user *optlen)
{
        struct pep_sock *pn = pep_sk(sk);
        int len, val;

        if (level != SOL_PNPIPE)
                return -ENOPROTOOPT;
        if (get_user(len, optlen))
                return -EFAULT;

        switch (optname) {
        case PNPIPE_ENCAP:
                val = pn->ifindex ? PNPIPE_ENCAP_IP : PNPIPE_ENCAP_NONE;
                break;

        case PNPIPE_IFINDEX:
                val = pn->ifindex;
                break;

        case PNPIPE_HANDLE:
                val = pn->pipe_handle;
                if (val == PN_PIPE_INVALID_HANDLE)
                        return -EINVAL;
                break;

        case PNPIPE_INITSTATE:
                val = pn->init_enable;
                break;

        default:
                return -ENOPROTOOPT;
        }

        /* Unsigned comparison: a negative length clamps to sizeof(int) */
        len = min_t(unsigned int, sizeof(int), len);
        if (put_user(len, optlen))
                return -EFAULT;
        /* Store only as many bytes as were just reported in *optlen */
        if (copy_to_user(optval, &val, len))
                return -EFAULT;
        return 0;
}

/*
 * Prepend the pipe data header to @skb and transmit it.
 *
 * When pn_flow_safe() holds for the TX flow-control mode, one TX credit
 * is consumed first; with no credit left the skb is freed and -ENOBUFS
 * is returned. The credit is given back if the transmission fails.
 * Returns the result of pn_skb_send() otherwise.
 */
static int pipe_skb_send(struct sock *sk, struct sk_buff *skb)
{
        struct pep_sock *pipe = pep_sk(sk);
        struct pnpipehdr *hdr;
        int rc;

        if (pn_flow_safe(pipe->tx_fc)) {
                if (!atomic_add_unless(&pipe->tx_credits, -1, 0)) {
                        kfree_skb(skb);
                        return -ENOBUFS;
                }
        }

        /* Three header bytes, plus one more in aligned mode */
        skb_push(skb, 3 + pipe->aligned);
        skb_reset_transport_header(skb);

        hdr = pnp_hdr(skb);
        hdr->utid = 0;
        hdr->pipe_handle = pipe->pipe_handle;
        if (!pipe->aligned) {
                hdr->message_id = PNS_PIPE_DATA;
        } else {
                hdr->message_id = PNS_PIPE_ALIGNED_DATA;
                hdr->data0 = 0; /* padding */
        }

        rc = pn_skb_send(sk, skb, NULL);
        if (rc != 0 && pn_flow_safe(pipe->tx_fc))
                atomic_inc(&pipe->tx_credits);

        return rc;
}

/*
 * sendmsg handler: send one message of @len bytes over the pipe.
 *
 * MSG_EOR is mandatory; only MSG_DONTWAIT, MSG_EOR, MSG_NOSIGNAL and
 * MSG_CMSG_COMPAT are accepted (-EOPNOTSUPP otherwise). Messages longer
 * than USHRT_MAX are refused with -EMSGSIZE. The call waits, subject to
 * the send timeout, until the sock is established and a TX credit is
 * available.
 *
 * Returns @len on success or a negative errno.
 */
static int pep_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
        struct pep_sock *pn = pep_sk(sk);
        struct sk_buff *skb;
        long timeo;
        int flags = msg->msg_flags;
        int err, done;

        if (len > USHRT_MAX)
                return -EMSGSIZE;

        if ((msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|
                                MSG_CMSG_COMPAT)) ||
                        !(msg->msg_flags & MSG_EOR))
                return -EOPNOTSUPP;

        /* The payload is copied in before the socket lock is taken */
        skb = sock_alloc_send_skb(sk, MAX_PNPIPE_HEADER + len,
                                        flags & MSG_DONTWAIT, &err);
        if (!skb)
                return err;

        /* Headroom for the headers that pipe_skb_send() pushes later */
        skb_reserve(skb, MAX_PHONET_HEADER + 3 + pn->aligned);
        err = memcpy_from_msg(skb_put(skb, len), msg, len);
        if (err < 0)
                goto outfree;

        lock_sock(sk);
        timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
        if ((1 << sk->sk_state) & (TCPF_LISTEN|TCPF_CLOSE)) {
                err = -ENOTCONN;
                goto out;
        }
        if (sk->sk_state != TCP_ESTABLISHED) {
                /* Wait until the pipe gets to enabled state */
disabled:
                err = sk_stream_wait_connect(sk, &timeo);
                if (err)
                        goto out;

                if (sk->sk_state == TCP_CLOSE_WAIT) {
                        err = -ECONNRESET;
                        goto out;
                }
        }
        BUG_ON(sk->sk_state != TCP_ESTABLISHED);

        /* Wait until flow control allows TX */
        done = atomic_read(&pn->tx_credits);
        while (!done) {
                DEFINE_WAIT_FUNC(wait, woken_wake_function);

                if (!timeo) {
                        err = -EAGAIN;
                        goto out;
                }
                if (signal_pending(current)) {
                        err = sock_intr_errno(timeo);
                        goto out;
                }

                add_wait_queue(sk_sleep(sk), &wait);
                done = sk_wait_event(sk, &timeo, atomic_read(&pn->tx_credits), &wait);
                remove_wait_queue(sk_sleep(sk), &wait);

                /* State changed while waiting: go back and wait for the
                 * connection again */
                if (sk->sk_state != TCP_ESTABLISHED)
                        goto disabled;
        }

        err = pipe_skb_send(sk, skb);
        if (err >= 0)
                err = len; /* success! */
        /* Whatever the outcome, the skb must not be freed again below */
        skb = NULL;
out:
        release_sock(sk);
outfree:
        kfree_skb(skb);
        return err;
}

int pep_writeable(struct sock *sk)
{
        struct pep_sock *pn = pep_sk(sk);

        return atomic_read(&pn->tx_credits);
}

/*
 * Send @skb over the pipe of @sk.
 *
 * In aligned mode the skb is sent as it is. Otherwise a new skb is
 * allocated for the headers and the original skb is attached to it as
 * its fragment list. Returns -ENOMEM (after freeing @skb) if that
 * allocation fails, else the result of pipe_skb_send().
 */
int pep_write(struct sock *sk, struct sk_buff *skb)
{
        struct sk_buff *rskb, *fs;
        int flen = 0;

        if (pep_sk(sk)->aligned)
                return pipe_skb_send(sk, skb);

        /* GFP_ATOMIC: presumably callable from atomic context -
         * TODO confirm against callers */
        rskb = alloc_skb(MAX_PNPIPE_HEADER, GFP_ATOMIC);
        if (!rskb) {
                kfree_skb(skb);
                return -ENOMEM;
        }
        /* Chain the payload behind the header skb and account for it.
         * Note that data_len and truesize grow by the updated rskb->len,
         * which equals skb->len provided rskb->len was 0 before. */
        skb_shinfo(rskb)->frag_list = skb;
        rskb->len += skb->len;
        rskb->data_len += rskb->len;
        rskb->truesize += rskb->len;

        /* Avoid nested fragments */
        /* Sum up skb's own fragment list, link those fragments in as
         * siblings via skb->next, and take their bytes off skb's totals */
        skb_walk_frags(skb, fs)
                flen += fs->len;
        skb->next = skb_shinfo(skb)->frag_list;
        skb_frag_list_init(skb);
        skb->len -= flen;
        skb->data_len -= flen;
        skb->truesize -= flen;

        /* Headroom for the headers pushed on the way down */
        skb_reserve(rskb, MAX_PHONET_HEADER + 3);
        return pipe_skb_send(sk, rskb);
}

/*
 * Take the first skb off the receive queue of @sk (NULL if empty).
 * On an established sock, credits are granted to the peer afterwards,
 * whether or not an skb was dequeued.
 */
struct sk_buff *pep_read(struct sock *sk)
{
        struct sk_buff *head = skb_dequeue(&sk->sk_receive_queue);

        if (sk->sk_state != TCP_ESTABLISHED)
                return head;

        pipe_grant_credits(sk, GFP_ATOMIC);
        return head;
}

/*
 * recvmsg handler: receive one message from the pipe.
 *
 * With MSG_OOB, or when SOCK_URGINLINE is set, a queued control request
 * is delivered first (and acknowledged to the peer); MSG_PEEK is not
 * supported on that path. Otherwise one datagram is taken from the
 * receive queue and, on an established sock, credits are granted.
 *
 * Returns the number of bytes copied (or the full message length if
 * MSG_TRUNC was passed in @flags), or a negative errno. msg->msg_flags
 * gets MSG_EOR, plus MSG_OOB for control requests and MSG_TRUNC when
 * the message did not fit into @len bytes.
 */
static int pep_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                       int flags, int *addr_len)
{
        struct sk_buff *skb;
        int err;

        if (flags & ~(MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_DONTWAIT|MSG_WAITALL|
                        MSG_NOSIGNAL|MSG_CMSG_COMPAT))
                return -EOPNOTSUPP;

        if (unlikely(1 << sk->sk_state & (TCPF_LISTEN | TCPF_CLOSE)))
                return -ENOTCONN;

        if ((flags & MSG_OOB) || sock_flag(sk, SOCK_URGINLINE)) {
                /* Dequeue and acknowledge control request */
                struct pep_sock *pn = pep_sk(sk);

                if (flags & MSG_PEEK)
                        return -EOPNOTSUPP;
                skb = skb_dequeue(&pn->ctrlreq_queue);
                if (skb) {
                        pep_ctrlreq_error(sk, skb, PN_PIPE_NO_ERROR,
                                                GFP_KERNEL);
                        msg->msg_flags |= MSG_OOB;
                        goto copy;
                }
                /* Explicit MSG_OOB with nothing queued is an error; with
                 * SOCK_URGINLINE alone, fall through to regular data */
                if (flags & MSG_OOB)
                        return -EINVAL;
        }

        skb = skb_recv_datagram(sk, flags, &err);
        lock_sock(sk);
        if (skb == NULL) {
                /* Report a pipe in CLOSE_WAIT as a reset connection */
                if (err == -ENOTCONN && sk->sk_state == TCP_CLOSE_WAIT)
                        err = -ECONNRESET;
                release_sock(sk);
                return err;
        }

        if (sk->sk_state == TCP_ESTABLISHED)
                pipe_grant_credits(sk, GFP_KERNEL);
        release_sock(sk);
copy:
        msg->msg_flags |= MSG_EOR;
        /* Copy at most len bytes; flag truncation if the skb is longer */
        if (skb->len > len)
                msg->msg_flags |= MSG_TRUNC;
        else
                len = skb->len;

        err = skb_copy_datagram_msg(skb, 0, msg, len);
        if (!err)
                err = (flags & MSG_TRUNC) ? skb->len : len;

        skb_free_datagram(sk, skb);
        return err;
}

/*
 * Unhash handler.
 *
 * A sock that has a listener is detached from it and removed from the
 * listener's pipe list; the remaining steps then operate on the
 * listener. The sock in question (either @sk itself or its former
 * listener) is unhashed from the Phonet table only if its pipe list is
 * empty. The listener reference held through pn->listener is dropped at
 * the end.
 */
static void pep_sock_unhash(struct sock *sk)
{
        struct pep_sock *pn = pep_sk(sk);
        struct sock *skparent = NULL;

        lock_sock(sk);

        if (pn->listener != NULL) {
                skparent = pn->listener;
                pn->listener = NULL;
                /* Swap locks: release the child before locking the
                 * listener, so the two are never held together */
                release_sock(sk);

                pn = pep_sk(skparent);
                lock_sock(skparent);
                sk_del_node_init(sk);
                /* From here on sk and pn both refer to the listener */
                sk = skparent;
        }

        /* Unhash a listening sock only when it is closed
         * and all of its active connected pipes are closed. */
        if (hlist_empty(&pn->hlist))
                pn_sock_unhash(&pn->pn_sk.sk);
        release_sock(sk);

        if (skparent)
                sock_put(skparent);
}

static struct proto pep_proto = {
        .close                = pep_sock_close,
        .accept                = pep_sock_accept,
        .connect        = pep_sock_connect,
        .ioctl                = pep_ioctl,
        .init                = pep_init,
        .setsockopt        = pep_setsockopt,
        .getsockopt        = pep_getsockopt,
        .sendmsg        = pep_sendmsg,
        .recvmsg        = pep_recvmsg,
        .backlog_rcv        = pep_do_rcv,
        .hash                = pn_sock_hash,
        .unhash                = pep_sock_unhash,
        .get_port        = pn_sock_get_port,
        .obj_size        = sizeof(struct pep_sock),
        .owner                = THIS_MODULE,
        .name                = "PNPIPE",
};

/* Phonet registration record: SOCK_SEQPACKET sockets backed by pep_proto */
static const struct phonet_protocol pep_pn_proto = {
        .sock_type = SOCK_SEQPACKET,
        .prot = &pep_proto,
        .ops = &phonet_stream_ops,
};

/*
 * Module init: register the pipe protocol under PN_PROTO_PIPE.
 * Returns the result of phonet_proto_register().
 */
static int __init pep_register(void)
{
        int err = phonet_proto_register(PN_PROTO_PIPE, &pep_pn_proto);

        return err;
}

/* Module exit: undo the PN_PROTO_PIPE registration made by pep_register(). */
static void __exit pep_unregister(void)
{
        phonet_proto_unregister(PN_PROTO_PIPE, &pep_pn_proto);
}

/* Module entry/exit hooks, metadata, and the PF_PHONET/PN_PROTO_PIPE alias */
module_init(pep_register);
module_exit(pep_unregister);
MODULE_AUTHOR("Remi Denis-Courmont, Nokia");
MODULE_DESCRIPTION("Phonet pipe protocol");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NET_PF_PROTO(PF_PHONET, PN_PROTO_PIPE);


















































































































































































































































































































































































































































































































































































































    3 
    3 

    3 
    3 














































































































































































































































































































































































































































































    1 








    1 

































    1 





















    1 

























































































    1 




































































































































































































































    1 
















































































































    1 





    1 



















































































































































































































































































































































































































































































    6 








    6 










































    6 








    1 




    5 



    5 











































    4 














    5 




























































































    3 





    2 





    4 





    4 


    5 


    6 







    6 























































































































































    2 

    2 






    1 



    1 
    1 

    1 
    1 


    2 



    2 


















    2 




    1 
    2 














    1 







    1 








    1 


    2 




    1 
    2 


    1 






    2 





    1 





































    5 




    5 



















































    1 






    1 



    1 























































    2 










    2 

























































































































































































    1 






















    2 
    1 




    1 

    1 











    2 













    1 
    1 











    1 








    1 


    1 





    4 


    4 







    4 

    4 




    1 












    1 




    1 















































    1 
    1 




    1 


    1 







    1 
































































































    1 




    1 













    2 
    2 

    2 


    2 










    2 





















































































    1 































    1 


    2 
    2 
    1 
    1 
    2 















    2 


    3 

    2 
    2 
    1 
    2 

    3 
    2 



    1 


    1 





    1 
    1 




    1 


    1 






























    1 

    1 






    3 
    2 
    3 




    1 
    1 
    1 











    6 















    6 






















































    5 

    5 







   11 





   11 
    1 






   12 

   10 
    3 

    9 
    3 


   11 
   12 










































































































































    1 


































































































































































































































































































































































































































































































































    1 











    1 


    1 




    1 


    1 



    1 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Generic socket support routines. Memory allocators, socket lock/release
 *                handler for protocols to use and generic option handler.
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Florian La Roche, <flla@stud.uni-sb.de>
 *                Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *                Alan Cox        :         Numerous verify_area() problems
 *                Alan Cox        :        Connecting on a connecting socket
 *                                        now returns an error for tcp.
 *                Alan Cox        :        sock->protocol is set correctly.
 *                                        and is not sometimes left as 0.
 *                Alan Cox        :        connect handles icmp errors on a
 *                                        connect properly. Unfortunately there
 *                                        is a restart syscall nasty there. I
 *                                        can't match BSD without hacking the C
 *                                        library. Ideas urgently sought!
 *                Alan Cox        :        Disallow bind() to addresses that are
 *                                        not ours - especially broadcast ones!!
 *                Alan Cox        :        Socket 1024 _IS_ ok for users. (fencepost)
 *                Alan Cox        :        sock_wfree/sock_rfree don't destroy sockets,
 *                                        instead they leave that for the DESTROY timer.
 *                Alan Cox        :        Clean up error flag in accept
 *                Alan Cox        :        TCP ack handling is buggy, the DESTROY timer
 *                                        was buggy. Put a remove_sock() in the handler
 *                                        for memory when we hit 0. Also altered the timer
 *                                        code. The ACK stuff can wait and needs major
 *                                        TCP layer surgery.
 *                Alan Cox        :        Fixed TCP ack bug, removed remove sock
 *                                        and fixed timer/inet_bh race.
 *                Alan Cox        :        Added zapped flag for TCP
 *                Alan Cox        :        Move kfree_skb into skbuff.c and tidied up surplus code
 *                Alan Cox        :        for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *                Alan Cox        :        kfree_s calls now are kfree_skbmem so we can track skb resources
 *                Alan Cox        :        Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *                Alan Cox        :        Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *                Rick Sladkey        :        Relaxed UDP rules for matching packets.
 *                C.E.Hawkins        :        IFF_PROMISC/SIOCGHWADDR support
 *        Pauline Middelink        :        identd support
 *                Alan Cox        :        Fixed connect() taking signals I think.
 *                Alan Cox        :        SO_LINGER supported
 *                Alan Cox        :        Error reporting fixes
 *                Anonymous        :        inet_create tidied up (sk->reuse setting)
 *                Alan Cox        :        inet sockets don't set sk->type!
 *                Alan Cox        :        Split socket option code
 *                Alan Cox        :        Callbacks
 *                Alan Cox        :        Nagle flag for Charles & Johannes stuff
 *                Alex                :        Removed restriction on inet fioctl
 *                Alan Cox        :        Splitting INET from NET core
 *                Alan Cox        :        Fixed bogus SO_TYPE handling in getsockopt()
 *                Adam Caldwell        :        Missing return in SO_DONTROUTE/SO_DEBUG code
 *                Alan Cox        :        Split IP from generic code
 *                Alan Cox        :        New kfree_skbmem()
 *                Alan Cox        :        Make SO_DEBUG superuser only.
 *                Alan Cox        :        Allow anyone to clear SO_DEBUG
 *                                        (compatibility fix)
 *                Alan Cox        :        Added optimistic memory grabbing for AF_UNIX throughput.
 *                Alan Cox        :        Allocator for a socket is settable.
 *                Alan Cox        :        SO_ERROR includes soft errors.
 *                Alan Cox        :        Allow NULL arguments on some SO_ opts
 *                Alan Cox        :         Generic socket allocation to make hooks
 *                                        easier (suggested by Craig Metz).
 *                Michael Pall        :        SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *                Jay Schulist        :        Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *                Andi Kleen        :        Add sock_kmalloc()/sock_kfree_s()
 *                Andi Kleen        :        Fix write_space callback
 *                Chris Evans        :        Security fixes - signedness again
 *                Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>
#include <linux/mroute.h>
#include <linux/mroute6.h>
#include <linux/icmpv6.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/proto_memory.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>
#include <net/phonet/phonet.h>

#include <linux/ethtool.h>

#include "dev.h"

/*
 * List head for protocol bookkeeping and the mutex declared alongside it.
 * NOTE(review): presumably proto_list_mutex serialises access to proto_list;
 * neither is used in this part of the file - confirm at the use sites.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/* Forward declarations of file-local (static) write-space helpers. */
static void sock_def_write_space_wfree(struct sock *sk);
static void sock_def_write_space(struct sock *sk);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had when the socket was
 * created and the current process has the capability @cap in the user
 * namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
                   struct user_namespace *user_ns, int cap)
{
        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
                ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Same test as sk_ns_capable(), evaluated against the initial user
 * namespace (&init_user_ns): both the opener of the socket and the current
 * process must have the capability @cap there.
 *
 * Return: the result of sk_ns_capable() for &init_user_ns.
 */
bool sk_capable(const struct sock *sk, int cap)
{
        struct user_namespace *global_ns = &init_user_ns;

        return sk_ns_capable(sk, global_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Same test as sk_ns_capable(), evaluated against the user namespace
 * recorded in the network namespace the socket is a member of
 * (sock_net(sk)->user_ns).
 *
 * Return: the result of sk_ns_capable() for that user namespace.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
        struct user_namespace *owner_ns = sock_net(sk)->user_ns;

        return sk_ns_capable(sk, owner_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 *
 * Every array below holds AF_MAX lock class keys.
 * NOTE(review): the "_kern_" arrays are presumably the ones used for
 * internal (kernel) sockets and the arrays are presumably indexed by the
 * socket's family - confirm where the lock classes are assigned.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. The name strings are
 * pre-constructed at build time, so that runtime initialization of socket
 * locks is fast.
 *
 * _sock_locks(x) expands to a comma-separated list of string literals, each
 * formed by placing the prefix literal @x directly in front of a family
 * name so the compiler concatenates the two. The entries "27" and "28"
 * carry no symbolic name, and the last entry is "AF_MAX".
 * NOTE(review): the order presumably follows the numeric AF_* values so
 * that the tables built from this macro can be indexed by address family -
 * confirm when adding a new family.
 */

#define _sock_locks(x)                                                  \
  x "AF_UNSPEC",        x "AF_UNIX"     ,        x "AF_INET"     , \
  x "AF_AX25"  ,        x "AF_IPX"      ,        x "AF_APPLETALK", \
  x "AF_NETROM",        x "AF_BRIDGE"   ,        x "AF_ATMPVC"   , \
  x "AF_X25"   ,        x "AF_INET6"    ,        x "AF_ROSE"     , \
  x "AF_DECnet",        x "AF_NETBEUI"  ,        x "AF_SECURITY" , \
  x "AF_KEY"   ,        x "AF_NETLINK"  ,        x "AF_PACKET"   , \
  x "AF_ASH"   ,        x "AF_ECONET"   ,        x "AF_ATMSVC"   , \
  x "AF_RDS"   ,        x "AF_SNA"      ,        x "AF_IRDA"     , \
  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,        x "AF_LLC"      , \
  x "27"       ,        x "28"          ,        x "AF_CAN"      , \
  x "AF_TIPC"  ,        x "AF_BLUETOOTH",        x "IUCV"        , \
  x "AF_RXRPC" ,        x "AF_ISDN"     ,        x "AF_PHONET"   , \
  x "AF_IEEE802154",        x "AF_CAIF"        ,        x "AF_ALG"      , \
  x "AF_NFC"   ,        x "AF_VSOCK"    ,        x "AF_KCM"      , \
  x "AF_QIPCRTR",        x "AF_SMC"        ,        x "AF_XDP"        , \
  x "AF_MCTP"  , \
  x "AF_MAX"

/*
 * Lock-name tables, one per name prefix. Each table has AF_MAX+1 entries
 * and is filled by expanding _sock_locks() with that prefix.
 * NOTE(review): presumably indexed by the socket's address family when the
 * lock classes are set up; the users are outside this part of the file.
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
        _sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
        _sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
        _sock_locks("clock-")
};

/*
 * The same three tables with a "k-" prefix.
 * NOTE(review): presumably the names used for kernel-internal sockets.
 */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
        _sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
        _sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
        _sock_locks("k-clock-")
};
/*
 * "rlock-", "wlock-" and "elock-" name tables.
 * NOTE(review): presumably these pair with af_rlock_keys, af_wlock_keys
 * and af_elock_keys - confirm at the use sites.
 */
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
        _sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
        _sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
        _sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 *
 * Each array holds AF_MAX lock class keys.
 * NOTE(review): af_kern_callback_keys is presumably the callback-lock
 * counterpart for kernel-internal sockets - confirm at the use sites.
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/*
 * Run time adjustable parameters.
 *
 * The *_max and *_default variables all start out at SK_WMEM_MAX (wmem)
 * or SK_RMEM_MAX (rmem); only the two *_max variables are exported here.
 * NOTE(review): wmem/rmem presumably refer to the socket send and receive
 * buffer sizes - confirm against the sysctl table.
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Initialised to 1. NOTE(review): meaning not visible here - see its users. */
int sysctl_tstamp_allow_data __read_mostly = 1;

/*
 * Static key, initially false. sk_set_memalloc() increments it and
 * sk_clear_memalloc() decrements it.
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Mark @sk with %SOCK_MEMALLOC so it may use the emergency reserves:
 * the socket flag is set, %__GFP_MEMALLOC is added to the socket's
 * allocation mask and memalloc_socks_key is incremented.
 *
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
        sock_set_flag(sk, SOCK_MEMALLOC);
        sk->sk_allocation = sk->sk_allocation | __GFP_MEMALLOC;
        static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

/**
 * sk_clear_memalloc - clears %SOCK_MEMALLOC
 * @sk: socket to clear it on
 *
 * Undo sk_set_memalloc(): the socket flag is reset, %__GFP_MEMALLOC is
 * removed from the socket's allocation mask, memalloc_socks_key is
 * decremented and the socket's memory is reclaimed.
 */
void sk_clear_memalloc(struct sock *sk)
{
        sock_reset_flag(sk, SOCK_MEMALLOC);
        sk->sk_allocation = sk->sk_allocation & ~__GFP_MEMALLOC;
        static_branch_dec(&memalloc_socks_key);

        /*
         * A SOCK_MEMALLOC socket may ignore the rmem limits so that
         * swapping keeps making forward progress. The flag can be cleared
         * (last swapfile deactivated) while such rmem allocations are
         * still held, which risks leaving the socket unusable because it
         * exceeds the limits. Reclaim the reserves and obey the rmem
         * limits again.
         */
        sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

/*
 * Run the socket's backlog receive handler on @skb for a SOCK_MEMALLOC
 * socket. The handler call is bracketed by memalloc_noreclaim_save() and
 * memalloc_noreclaim_restore(), and the handler's return value is passed
 * back to the caller. Calling this on a socket without SOCK_MEMALLOC
 * triggers BUG_ON().
 */
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
        unsigned int saved_flags;
        int rc;

        /* these should have been dropped before queueing */
        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

        saved_flags = memalloc_noreclaim_save();
        rc = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
                                tcp_v6_do_rcv,
                                tcp_v4_do_rcv,
                                sk, skb);
        memalloc_noreclaim_restore(saved_flags);

        return rc;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

/*
 * Invoke the socket's sk_error_report callback, then emit the
 * inet_sk_error_report trace event when the socket family is AF_INET or
 * AF_INET6. Other families get the callback only.
 */
void sk_error_report(struct sock *sk)
{
        int family;

        sk->sk_error_report(sk);

        /* Read the family once, after the callback has run. */
        family = sk->sk_family;
        if (family == AF_INET || family == AF_INET6)
                trace_inet_sk_error_report(sk);
}
EXPORT_SYMBOL(sk_error_report);

/*
 * Convert a timeout expressed in jiffies into a timeval and store it in
 * the buffer @optval points to.
 *
 * @timeo:       timeout in jiffies; MAX_SCHEDULE_TIMEOUT is reported as 0/0
 * @optval:      destination buffer; must be large enough for the layout
 *               selected below
 * @old_timeval: false selects struct __kernel_sock_timeval; true selects
 *               struct __kernel_old_timeval, or struct old_timeval32 when
 *               running a compat syscall without 64-bit compat time
 *
 * Returns the size in bytes of the structure that was written.
 */
int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
        struct __kernel_sock_timeval tv = { .tv_sec = 0, .tv_usec = 0 };
        struct __kernel_old_timeval old_tv;

        if (timeo != MAX_SCHEDULE_TIMEOUT) {
                tv.tv_sec = timeo / HZ;
                tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
        }

        /* New-style layout: hand back the 64-bit structure as is. */
        if (!old_timeval) {
                *(struct __kernel_sock_timeval *)optval = tv;
                return sizeof(tv);
        }

        /* Old-style layout requested from a 32-bit compat caller. */
        if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
                struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };

                *(struct old_timeval32 *)optval = tv32;
                return sizeof(tv32);
        }

        /* Old-style layout, native caller. */
        old_tv.tv_sec = tv.tv_sec;
        old_tv.tv_usec = tv.tv_usec;
        *(struct __kernel_old_timeval *)optval = old_tv;
        return sizeof(old_tv);
}
EXPORT_SYMBOL(sock_get_timeout);

/*
 * Fetch a timeval from the option buffer @optval into @tv, widening from a
 * legacy layout when @old_timeval is set.
 *
 * The layout is chosen exactly as in the matching getter: old_timeval32 for
 * compat syscalls without 64-bit time, __kernel_old_timeval for other legacy
 * callers, __kernel_sock_timeval otherwise.
 *
 * Returns 0 on success, -EINVAL when @optlen is smaller than the selected
 * structure, -EFAULT when copy_from_sockptr() fails.
 */
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
                           sockptr_t optval, int optlen, bool old_timeval)
{
        struct __kernel_old_timeval old_tv;

        if (!old_timeval) {
                if (optlen < sizeof(*tv))
                        return -EINVAL;
                if (copy_from_sockptr(tv, optval, sizeof(*tv)))
                        return -EFAULT;
                return 0;
        }

        if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
                struct old_timeval32 tv32;

                if (optlen < sizeof(tv32))
                        return -EINVAL;
                if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
                        return -EFAULT;

                tv->tv_sec = tv32.tv_sec;
                tv->tv_usec = tv32.tv_usec;
                return 0;
        }

        if (optlen < sizeof(old_tv))
                return -EINVAL;
        if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
                return -EFAULT;

        tv->tv_sec = old_tv.tv_sec;
        tv->tv_usec = old_tv.tv_usec;
        return 0;
}
EXPORT_SYMBOL(sock_copy_user_timeval);

/*
 * Read a timeval from @optval and store the matching timeout, in jiffies,
 * through @timeo_p.
 *
 * Returns 0 on success, -EDOM when tv_usec lies outside [0, USEC_PER_SEC),
 * or the error reported by sock_copy_user_timeval().
 */
static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
                            bool old_timeval)
{
        struct __kernel_sock_timeval tv;
        long timeo;
        int err;

        err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
        if (err)
                return err;

        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
                return -EDOM;

        if (tv.tv_sec < 0) {
                /* logging stops once this counter reaches ten */
                static int warned __read_mostly;

                /* negative seconds are not rejected: a zero timeout is stored */
                WRITE_ONCE(*timeo_p, 0);
                if (warned < 10 && net_ratelimit()) {
                        warned++;
                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
                                __func__, current->comm, task_pid_nr(current));
                }
                return 0;
        }

        /* An all-zero timeval, or a seconds value at or above the limit
         * below, is stored as MAX_SCHEDULE_TIMEOUT.  Anything else becomes
         * jiffies, with the microseconds part rounded up.
         */
        if ((!tv.tv_sec && !tv.tv_usec) ||
            tv.tv_sec >= (MAX_SCHEDULE_TIMEOUT / HZ - 1))
                timeo = MAX_SCHEDULE_TIMEOUT;
        else
                timeo = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
                                                      USEC_PER_SEC / HZ);

        WRITE_ONCE(*timeo_p, timeo);
        return 0;
}

/*
 * Returns false for AF_UNSPEC and AF_UNIX sockets, true for sockets of every
 * other family.
 */
static bool sock_needs_netstamp(const struct sock *sk)
{
        const int family = sk->sk_family;

        return !(family == AF_UNSPEC || family == AF_UNIX);
}

/*
 * Clear the bits in @flags from sk->sk_flags.  net_disable_timestamp() is
 * called only when at least one of those bits was set, the socket family
 * needs net timestamps, and no SK_FLAGS_TIMESTAMP bit remains afterwards.
 */
static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
        /* none of the requested bits is set: nothing to undo */
        if (!(sk->sk_flags & flags))
                return;

        sk->sk_flags &= ~flags;

        if (!sock_needs_netstamp(sk))
                return;
        if (sk->sk_flags & SK_FLAGS_TIMESTAMP)
                return;

        net_disable_timestamp();
}


/*
 * Append @skb to the socket's receive queue and call sk_data_ready.
 *
 * Returns 0 on success.  Returns -ENOMEM when sk_rmem_alloc has already
 * reached sk_rcvbuf, and -ENOBUFS when sk_rmem_schedule() refuses the skb's
 * truesize.  Both failure paths bump sk_drops and leave @skb with the caller
 * (it is not freed here).
 */
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        struct sk_buff_head *rcvq = &sk->sk_receive_queue;
        unsigned long irqflags;

        if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
                atomic_inc(&sk->sk_drops);
                trace_sock_rcvqueue_full(sk, skb);
                return -ENOMEM;
        }

        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
                atomic_inc(&sk->sk_drops);
                return -ENOBUFS;
        }

        skb->dev = NULL;
        skb_set_owner_r(skb, sk);

        /* The skb is about to leave the RCU-protected region, so a
         * non-refcounted dst must not travel with it.
         */
        skb_dst_force(skb);

        spin_lock_irqsave(&rcvq->lock, irqflags);
        sock_skb_set_dropcount(sk, skb);
        __skb_queue_tail(rcvq, skb);
        spin_unlock_irqrestore(&rcvq->lock, irqflags);

        /* a dead socket is not notified */
        if (sock_flag(sk, SOCK_DEAD))
                return 0;

        sk->sk_data_ready(sk);
        return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
                              enum skb_drop_reason *reason)
{
        enum skb_drop_reason drop_reason;
        int err;

        err = sk_filter(sk, skb);
        if (err) {
                drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
                goto out;
        }
        err = __sock_queue_rcv_skb(sk, skb);
        switch (err) {
        case -ENOMEM:
                drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
                break;
        case -ENOBUFS:
                drop_reason = SKB_DROP_REASON_PROTO_MEM;
                break;
        default:
                drop_reason = SKB_NOT_DROPPED_YET;
                break;
        }
out:
        if (reason)
                *reason = drop_reason;
        return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb_reason);

/*
 * Deliver @skb to @sk: filter it, then either process it right away through
 * sk_backlog_rcv() or, when the socket is owned by a user context, park it
 * on the backlog.
 *
 * @nested:     selects bh_lock_sock_nested() instead of bh_lock_sock()
 * @trim_cap:   passed through to sk_filter_trim_cap()
 * @refcounted: when true, sock_put() is called on @sk before returning
 *
 * The skb is freed here when the filter rejects it, when the receive queues
 * are full, or when it cannot be added to the backlog.  Returns the value of
 * sk_backlog_rcv() when that ran, NET_RX_SUCCESS in every other case
 * (including the drop paths).
 */
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
                     const int nested, unsigned int trim_cap, bool refcounted)
{
        int rc = NET_RX_SUCCESS;

        if (sk_filter_trim_cap(sk, skb, trim_cap))
                goto drop;

        skb->dev = NULL;

        if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
                atomic_inc(&sk->sk_drops);
                goto drop;
        }

        if (nested)
                bh_lock_sock_nested(sk);
        else
                bh_lock_sock(sk);

        if (!sock_owned_by_user(sk)) {
                /* lockdep annotation with trylock + unlock semantics */
                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

                rc = sk_backlog_rcv(sk, skb);

                mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
        } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
                /* backlog refused the skb: unlock before dropping it */
                bh_unlock_sock(sk);
                atomic_inc(&sk->sk_drops);
                goto drop;
        }

        bh_unlock_sock(sk);
        goto put;

drop:
        kfree_skb(skb);
put:
        if (refcounted)
                sock_put(sk);
        return rc;
}
EXPORT_SYMBOL(__sk_receive_skb);

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
                                                          u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
                                                           u32));
/*
 * Return the dst obtained from __sk_dst_get(), unless it is marked obsolete
 * and its ops->check() handler returns NULL for @cookie.  In that case the
 * tx queue mapping and sk_dst_pending_confirm are cleared, sk_dst_cache is
 * set to NULL, the dst is released and NULL is returned.
 */
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = __sk_dst_get(sk);

        /* nothing cached, or cached entry not flagged obsolete */
        if (!dst || !dst->obsolete)
                return dst;

        /* obsolete, but the check handler still accepts it */
        if (INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
                               dst, cookie))
                return dst;

        sk_tx_queue_clear(sk);
        WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
        RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
        dst_release(dst);
        return NULL;
}
EXPORT_SYMBOL(__sk_dst_check);

/*
 * Return the dst obtained from sk_dst_get(), unless it is marked obsolete
 * and its ops->check() handler returns NULL for @cookie.  In that case the
 * socket's dst is reset with sk_dst_reset(), the dst is released and NULL is
 * returned.
 */
struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk_dst_get(sk);

        /* nothing cached, or cached entry not flagged obsolete */
        if (!dst || !dst->obsolete)
                return dst;

        /* obsolete, but the check handler still accepts it */
        if (INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
                               dst, cookie))
                return dst;

        sk_dst_reset(sk);
        dst_release(dst);
        return NULL;
}
EXPORT_SYMBOL(sk_dst_check);

/*
 * Bind @sk to the interface index @ifindex.
 *
 * Returns 0 on success, -EPERM when the socket is already bound and the
 * caller lacks CAP_NET_RAW in the socket's network namespace, -EINVAL for a
 * negative @ifindex, and -ENOPROTOOPT when CONFIG_NETDEVICES is not set.
 */
static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
#ifdef CONFIG_NETDEVICES
        struct net *net = sock_net(sk);

        /* Sorry... changing an existing binding needs CAP_NET_RAW */
        if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
                return -EPERM;

        if (ifindex < 0)
                return -EINVAL;

        /* Paired with all READ_ONCE() done locklessly. */
        WRITE_ONCE(sk->sk_bound_dev_if, ifindex);

        /* rehash is an optional protocol hook */
        if (sk->sk_prot->rehash)
                sk->sk_prot->rehash(sk);
        sk_dst_reset(sk);

        return 0;
#else
        return -ENOPROTOOPT;
#endif
}

/*
 * Bind @sk to interface index @ifindex via sock_bindtoindex_locked().
 * When @lock_sk is true the socket lock is taken around the call; when it
 * is false the call is made without touching the lock.
 *
 * Returns the result of sock_bindtoindex_locked().
 */
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
        int ret;

        if (!lock_sk)
                return sock_bindtoindex_locked(sk, ifindex);

        lock_sock(sk);
        ret = sock_bindtoindex_locked(sk, ifindex);
        release_sock(sk);

        return ret;
}
EXPORT_SYMBOL(sock_bindtoindex);

/*
 * Bind @sk to the device whose name is passed in @optval.
 *
 * At most IFNAMSIZ - 1 bytes of the name are read.  An empty name (or a zero
 * @optlen) results in index 0 being passed on, i.e. the socket is not bound.
 *
 * Returns 0 on success, -EINVAL for a negative @optlen, -EFAULT when the
 * name cannot be copied, -ENODEV when no device has that name, otherwise the
 * result of sock_bindtoindex_locked().  Without CONFIG_NETDEVICES the result
 * is always -ENOPROTOOPT.
 */
static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
#ifdef CONFIG_NETDEVICES
        struct net *net = sock_net(sk);
        char devname[IFNAMSIZ];
        int index = 0;
        int ret;

        if (optlen < 0)
                return -EINVAL;

        /* Clamp the length so that the zero-filled buffer always keeps a
         * terminating NUL after the copied name.
         */
        if (optlen > IFNAMSIZ - 1)
                optlen = IFNAMSIZ - 1;
        memset(devname, 0, sizeof(devname));

        if (copy_from_sockptr(devname, optval, optlen))
                return -EFAULT;

        if (devname[0] != '\0') {
                struct net_device *dev;

                /* only the ifindex is taken out of the RCU section */
                rcu_read_lock();
                dev = dev_get_by_name_rcu(net, devname);
                if (dev)
                        index = dev->ifindex;
                rcu_read_unlock();

                if (!dev)
                        return -ENODEV;
        }

        sockopt_lock_sock(sk);
        ret = sock_bindtoindex_locked(sk, index);
        sockopt_release_sock(sk);

        return ret;
#else
        return -ENOPROTOOPT;
#endif
}

/*
 * Report the name of the device @sk is bound to.
 *
 * For an unbound socket nothing is written to @optval and a length of 0 is
 * stored through @optlen.  Otherwise the NUL-terminated name is copied to
 * @optval and its length, including the NUL, is stored through @optlen.
 *
 * Returns 0 on success, -EINVAL when the socket is bound and @len is below
 * IFNAMSIZ, the error from netdev_get_name(), or -EFAULT when a copy-out
 * fails.  Without CONFIG_NETDEVICES the result is always -ENOPROTOOPT.
 */
static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
                                sockptr_t optlen, int len)
{
#ifdef CONFIG_NETDEVICES
        int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
        struct net *net = sock_net(sk);
        char devname[IFNAMSIZ];
        int ret;

        if (bound_dev_if == 0) {
                /* unbound: only the zero length is reported */
                len = 0;
        } else {
                if (len < IFNAMSIZ)
                        return -EINVAL;

                ret = netdev_get_name(net, devname, bound_dev_if);
                if (ret)
                        return ret;

                len = strlen(devname) + 1;

                if (copy_to_sockptr(optval, devname, len))
                        return -EFAULT;
        }

        if (copy_to_sockptr(optlen, &len, sizeof(int)))
                return -EFAULT;

        return 0;
#else
        return -ENOPROTOOPT;
#endif
}

/*
 * Returns false whenever dev_recursion_level() is non-zero.  Otherwise
 * returns true for a NULL @sk, the MC_LOOP bit for AF_INET sockets and
 * (with CONFIG_IPV6) the MC6_LOOP bit for AF_INET6 sockets.  Any other
 * family triggers a one-time warning and yields true.
 */
bool sk_mc_loop(const struct sock *sk)
{
        int family;

        if (dev_recursion_level())
                return false;
        if (!sk)
                return true;

        /* IPV6_ADDRFORM can change sk->sk_family under us. */
        family = READ_ONCE(sk->sk_family);

        if (family == AF_INET)
                return inet_test_bit(MC_LOOP, sk);
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                return inet6_test_bit(MC6_LOOP, sk);
#endif

        WARN_ON_ONCE(1);
        return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/* Set sk->sk_reuse to SK_CAN_REUSE while holding the socket lock. */
void sock_set_reuseaddr(struct sock *sk)
{
        lock_sock(sk);
        sk->sk_reuse = SK_CAN_REUSE;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);

/* Set sk->sk_reuseport to true while holding the socket lock. */
void sock_set_reuseport(struct sock *sk)
{
        lock_sock(sk);
        sk->sk_reuseport = true;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);

/*
 * Under the socket lock, set sk_lingertime to 0 and raise SOCK_LINGER,
 * i.e. lingering is enabled with a zero linger time.
 */
void sock_no_linger(struct sock *sk)
{
        lock_sock(sk);
        WRITE_ONCE(sk->sk_lingertime, 0);
        sock_set_flag(sk, SOCK_LINGER);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);

/*
 * Store @priority in sk->sk_priority with WRITE_ONCE().  The socket lock is
 * not taken here, and no range or capability check is performed.
 */
void sock_set_priority(struct sock *sk, u32 priority)
{
        WRITE_ONCE(sk->sk_priority, priority);
}
EXPORT_SYMBOL(sock_set_priority);

/*
 * Set the socket's send timeout from a value in seconds, under the socket
 * lock.  Zero seconds, or a value at or above MAX_SCHEDULE_TIMEOUT / HZ - 1,
 * is stored as MAX_SCHEDULE_TIMEOUT; anything else is stored as @secs * HZ.
 */
void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
        long timeo = MAX_SCHEDULE_TIMEOUT;

        lock_sock(sk);
        if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
                timeo = secs * HZ;
        WRITE_ONCE(sk->sk_sndtimeo, timeo);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_sndtimeo);

/*
 * Switch receive timestamping on or off for @sk.
 *
 * @val: false clears SOCK_RCVTSTAMP and SOCK_RCVTSTAMPNS and does nothing
 *       else; true applies @new and @ns and enables SOCK_TIMESTAMP
 * @new: value for the SOCK_TSTAMP_NEW flag (only used when @val is true)
 * @ns:  value for the SOCK_RCVTSTAMPNS flag (only used when @val is true)
 */
static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
        if (!val) {
                sock_reset_flag(sk, SOCK_RCVTSTAMP);
                sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
                return;
        }

        sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
        sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
        sock_set_flag(sk, SOCK_RCVTSTAMP);
        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
}

/*
 * Enable receive timestamps on @sk under the socket lock, requesting
 * nanosecond resolution (ns = true) in the old format (new = false).
 */
void sock_enable_timestamps(struct sock *sk)
{
        lock_sock(sk);
        __sock_set_timestamps(sk, true, false, true);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_enable_timestamps);

/*
 * Apply one of the SO_TIMESTAMP{,NS}_{OLD,NEW} options to @sk.  @optname
 * selects the "new format" and "nanosecond" flags handed to
 * __sock_set_timestamps(); @valbool switches timestamping on or off.
 * Any other @optname is ignored.
 */
void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
        bool new_fmt;
        bool nsec;

        switch (optname) {
        case SO_TIMESTAMP_OLD:
                new_fmt = false;
                nsec = false;
                break;
        case SO_TIMESTAMP_NEW:
                new_fmt = true;
                nsec = false;
                break;
        case SO_TIMESTAMPNS_OLD:
                new_fmt = false;
                nsec = true;
                break;
        case SO_TIMESTAMPNS_NEW:
                new_fmt = true;
                nsec = true;
                break;
        default:
                return;
        }

        __sock_set_timestamps(sk, valbool, new_fmt, nsec);
}

/*
 * Record @phc_index in sk->sk_bind_phc, provided it is one of the vclock
 * indexes that ethtool_get_phc_vclocks() reports for the device the socket
 * is bound to.
 *
 * Returns 0 on success, -EOPNOTSUPP when the socket is not bound to a device
 * (or the device cannot be looked up), -EINVAL when @phc_index is not in the
 * device's list.
 */
static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
{
        struct net *net = sock_net(sk);
        struct net_device *dev = NULL;
        bool found = false;
        int *vclock_index;
        int count;
        int idx;

        if (sk->sk_bound_dev_if)
                dev = dev_get_by_index(net, sk->sk_bound_dev_if);

        if (!dev) {
                pr_err("%s: sock not bind to device\n", __func__);
                return -EOPNOTSUPP;
        }

        /* the device reference is only needed for the query itself */
        count = ethtool_get_phc_vclocks(dev, &vclock_index);
        dev_put(dev);

        for (idx = 0; idx < count && !found; idx++)
                found = (vclock_index[idx] == phc_index);

        /* the array is only freed when at least one entry was returned */
        if (count > 0)
                kfree(vclock_index);

        if (!found)
                return -EINVAL;

        WRITE_ONCE(sk->sk_bind_phc, phc_index);

        return 0;
}

/*
 * Validate and install the SO_TIMESTAMPING flags in @timestamping on @sk.
 *
 * @optname: SO_TIMESTAMPING_NEW sets SOCK_TSTAMP_NEW, any other value
 *           clears it
 *
 * Returns 0 on success.  Returns -EINVAL for flags outside
 * SOF_TIMESTAMPING_MASK, for OPT_ID_TCP without OPT_ID, for OPT_STATS
 * without OPT_TSONLY, and when OPT_ID is newly requested on a TCP socket in
 * the CLOSE or LISTEN state.  With BIND_PHC set, an error from
 * sock_timestamping_bind_phc() is returned as is.
 */
int sock_set_timestamping(struct sock *sk, int optname,
                          struct so_timestamping timestamping)
{
        const int tsflags = timestamping.flags;
        int err;

        if (tsflags & ~SOF_TIMESTAMPING_MASK)
                return -EINVAL;

        if ((tsflags & SOF_TIMESTAMPING_OPT_ID_TCP) &&
            !(tsflags & SOF_TIMESTAMPING_OPT_ID))
                return -EINVAL;

        /* OPT_ID is being turned on: (re)initialise the key counter */
        if ((tsflags & SOF_TIMESTAMPING_OPT_ID) &&
            !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
                if (!sk_is_tcp(sk)) {
                        atomic_set(&sk->sk_tskey, 0);
                } else {
                        if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
                                return -EINVAL;

                        /* TCP starts the key from a sequence number */
                        if (tsflags & SOF_TIMESTAMPING_OPT_ID_TCP)
                                atomic_set(&sk->sk_tskey,
                                           tcp_sk(sk)->write_seq);
                        else
                                atomic_set(&sk->sk_tskey,
                                           tcp_sk(sk)->snd_una);
                }
        }

        if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
            !(tsflags & SOF_TIMESTAMPING_OPT_TSONLY))
                return -EINVAL;

        if (tsflags & SOF_TIMESTAMPING_BIND_PHC) {
                err = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
                if (err)
                        return err;
        }

        WRITE_ONCE(sk->sk_tsflags, tsflags);
        sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);

        if (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)
                sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
        else
                sock_disable_timestamp(sk,
                                       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));

        return 0;
}

/*
 * Turn keepalive on for @sk under the socket lock: call the protocol's
 * keepalive hook with true when it has one, then set SOCK_KEEPOPEN.
 */
void sock_set_keepalive(struct sock *sk)
{
        lock_sock(sk);
        /* keepalive is an optional protocol hook */
        if (sk->sk_prot->keepalive)
                sk->sk_prot->keepalive(sk, true);
        sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);

/*
 * Set the receive buffer size of @sk from the requested @val and mark it as
 * user-locked (SOCK_RCVBUF_LOCK).  The stored value is twice the request,
 * but never below SOCK_MIN_RCVBUF.
 */
static void __sock_set_rcvbuf(struct sock *sk, int val)
{
        /* Cap the request so that doubling it still fits into an int;
         * otherwise max_t() would see the doubled value as negative.
         */
        const int capped = min_t(int, val, INT_MAX / 2);

        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

        /* The request is doubled on the way in to account for
         * "struct sk_buff" and similar overhead, which is allocated from
         * the receive buffer as well.  Applications are unaware of that
         * overhead and assume their SO_RCVBUF setting allows that much
         * actual data to be received on the socket.
         *
         * After considering the possible alternatives, returning the value
         * actually used from getsockopt is the most desirable behavior.
         */
        WRITE_ONCE(sk->sk_rcvbuf, max_t(int, capped * 2, SOCK_MIN_RCVBUF));
}

/* Run __sock_set_rcvbuf() on @sk with @val while holding the socket lock. */
void sock_set_rcvbuf(struct sock *sk, int val)
{
        lock_sock(sk);
        __sock_set_rcvbuf(sk, val);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);

/*
 * Store @val in sk->sk_mark and reset the socket's dst.  Nothing happens
 * when the mark already has that value.
 */
static void __sock_set_mark(struct sock *sk, u32 val)
{
        if (val == sk->sk_mark)
                return;

        WRITE_ONCE(sk->sk_mark, val);
        sk_dst_reset(sk);
}

/* Run __sock_set_mark() on @sk with @val while holding the socket lock. */
void sock_set_mark(struct sock *sk, u32 val)
{
        lock_sock(sk);
        __sock_set_mark(sk, val);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);

/*
 * Lower sk->sk_reserved_mem by @bytes rounded down to a multiple of
 * PAGE_SIZE, then call sk_mem_reclaim().  A rounded amount larger than the
 * current reservation triggers WARN_ON() but is subtracted all the same.
 */
static void sock_release_reserved_memory(struct sock *sk, int bytes)
{
        /* only whole pages are released */
        const int released = round_down(bytes, PAGE_SIZE);

        WARN_ON(released > sk->sk_reserved_mem);
        WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - released);
        sk_mem_reclaim(sk);
}

/*
 * Reserve memory for @sk: charge sk_mem_pages(@bytes) pages to the socket's
 * memcg and to the protocol's allocated-memory counter, then credit the same
 * amount to forward_alloc and sk_reserved_mem.
 *
 * Returns 0 on success (including @bytes == 0), -EOPNOTSUPP when memcg
 * socket accounting is disabled, the socket has no memcg, or the socket has
 * no memory accounting, and -ENOMEM when the memcg charge fails or the
 * charge would push the allocation above sk_prot_mem_limits(sk, 1).
 */
static int sock_reserve_memory(struct sock *sk, int bytes)
{
        int reserved;
        int pages;

        if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
                return -EOPNOTSUPP;

        if (!bytes)
                return 0;

        pages = sk_mem_pages(bytes);

        /* pre-charge to memcg */
        if (!mem_cgroup_charge_skmem(sk->sk_memcg, pages,
                                     GFP_KERNEL | __GFP_RETRY_MAYFAIL))
                return -ENOMEM;

        /* pre-charge to forward_alloc */
        sk_memory_allocated_add(sk, pages);

        /* If this pre-charge takes the system into memory pressure,
         * undo both charges and report the failure.
         */
        if (sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 1)) {
                sk_memory_allocated_sub(sk, pages);
                mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
                return -ENOMEM;
        }

        reserved = pages << PAGE_SHIFT;
        sk_forward_alloc_add(sk, reserved);
        WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem + reserved);

        return 0;
}

/*
 * Take the socket lock for a sockopt operation, unless the call comes from
 * a bpf program.
 */
void sockopt_lock_sock(struct sock *sk)
{
        /* A set current->bpf_ctx means setsockopt() was called from a bpf
         * prog, and bpf has already made sure the sk lock is held.
         */
        if (!has_current_bpf_ctx())
                lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);

/*
 * Release the socket lock after a sockopt operation.  Nothing is released
 * when has_current_bpf_ctx() is true.
 */
void sockopt_release_sock(struct sock *sk)
{
        if (!has_current_bpf_ctx())
                release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);

bool sockopt_ns_capable(struct user_namespace *ns, int cap)
{
        return has_current_bpf_ctx() || ns_capable(ns, cap);
}
EXPORT_SYMBOL(sockopt_ns_capable);

bool sockopt_capable(int cap)
{
        return has_current_bpf_ctx() || capable(cap);
}
EXPORT_SYMBOL(sockopt_capable);

/*
 * Accept only CLOCK_REALTIME, CLOCK_MONOTONIC and CLOCK_TAI.
 *
 * Returns 0 for an accepted clock id, -EINVAL for anything else.
 */
static int sockopt_validate_clockid(__kernel_clockid_t value)
{
        if (value == CLOCK_REALTIME ||
            value == CLOCK_MONOTONIC ||
            value == CLOCK_TAI)
                return 0;

        return -EINVAL;
}

/*
 *        This is meant for all protocols to use and covers goings on
 *        at the socket level. Everything here is generic.
 */

int sk_setsockopt(struct sock *sk, int level, int optname,
                  sockptr_t optval, unsigned int optlen)
{
        struct so_timestamping timestamping;
        struct socket *sock = sk->sk_socket;
        struct sock_txtime sk_txtime;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *        Options without arguments
         */

        if (optname == SO_BINDTODEVICE)
                return sock_setbindtodevice(sk, optval, optlen);

        if (optlen < sizeof(int))
                return -EINVAL;

        if (copy_from_sockptr(&val, optval, sizeof(val)))
                return -EFAULT;

        valbool = val ? 1 : 0;

        /* handle options which do not require locking the socket. */
        switch (optname) {
        case SO_PRIORITY:
                if ((val >= 0 && val <= 6) ||
                    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
                    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
                        sock_set_priority(sk, val);
                        return 0;
                }
                return -EPERM;
        case SO_PASSSEC:
                assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
                return 0;
        case SO_PASSCRED:
                assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
                return 0;
        case SO_PASSPIDFD:
                assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
                return 0;
        case SO_TYPE:
        case SO_PROTOCOL:
        case SO_DOMAIN:
        case SO_ERROR:
                return -ENOPROTOOPT;
#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_BUSY_POLL:
                if (val < 0)
                        return -EINVAL;
                WRITE_ONCE(sk->sk_ll_usec, val);
                return 0;
        case SO_PREFER_BUSY_POLL:
                if (valbool && !sockopt_capable(CAP_NET_ADMIN))
                        return -EPERM;
                WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
                return 0;
        case SO_BUSY_POLL_BUDGET:
                if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
                    !sockopt_capable(CAP_NET_ADMIN))
                        return -EPERM;
                if (val < 0 || val > U16_MAX)
                        return -EINVAL;
                WRITE_ONCE(sk->sk_busy_poll_budget, val);
                return 0;
#endif
        case SO_MAX_PACING_RATE:
                {
                unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
                unsigned long pacing_rate;

                if (sizeof(ulval) != sizeof(val) &&
                    optlen >= sizeof(ulval) &&
                    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
                        return -EFAULT;
                }
                if (ulval != ~0UL)
                        cmpxchg(&sk->sk_pacing_status,
                                SK_PACING_NONE,
                                SK_PACING_NEEDED);
                /* Pairs with READ_ONCE() from sk_getsockopt() */
                WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
                pacing_rate = READ_ONCE(sk->sk_pacing_rate);
                if (ulval < pacing_rate)
                        WRITE_ONCE(sk->sk_pacing_rate, ulval);
                return 0;
                }
        case SO_TXREHASH:
                if (val < -1 || val > 1)
                        return -EINVAL;
                if ((u8)val == SOCK_TXREHASH_DEFAULT)
                        val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
                /* Paired with READ_ONCE() in tcp_rtx_synack()
                 * and sk_getsockopt().
                 */
                WRITE_ONCE(sk->sk_txrehash, (u8)val);
                return 0;
        case SO_PEEK_OFF:
                {
                int (*set_peek_off)(struct sock *sk, int val);

                set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
                if (set_peek_off)
                        ret = set_peek_off(sk, val);
                else
                        ret = -EOPNOTSUPP;
                return ret;
                }
        }

        sockopt_lock_sock(sk);

        switch (optname) {
        case SO_DEBUG:
                if (val && !sockopt_capable(CAP_NET_ADMIN))
                        ret = -EACCES;
                else
                        sock_valbool_flag(sk, SOCK_DBG, valbool);
                break;
        case SO_REUSEADDR:
                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
                break;
        case SO_REUSEPORT:
                sk->sk_reuseport = valbool;
                break;
        case SO_DONTROUTE:
                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
                sk_dst_reset(sk);
                break;
        case SO_BROADCAST:
                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                break;
        case SO_SNDBUF:
                /* Don't error on this BSD doesn't and if you think
                 * about it this is right. Otherwise apps have to
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints
                 */
                val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf:
                /* Ensure val * 2 fits into an int, to prevent max_t()
                 * from treating it as a negative value.
                 */
                val = min_t(int, val, INT_MAX / 2);
                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                WRITE_ONCE(sk->sk_sndbuf,
                           max_t(int, val * 2, SOCK_MIN_SNDBUF));
                /* Wake up sending tasks if we upped the value. */
                sk->sk_write_space(sk);
                break;

        case SO_SNDBUFFORCE:
                if (!sockopt_capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                /* No negative values (to prevent underflow, as val will be
                 * multiplied by 2).
                 */
                if (val < 0)
                        val = 0;
                goto set_sndbuf;

        case SO_RCVBUF:
                /* Don't error on this BSD doesn't and if you think
                 * about it this is right. Otherwise apps have to
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints
                 */
                __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
                break;

        case SO_RCVBUFFORCE:
                if (!sockopt_capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                /* No negative values (to prevent underflow, as val will be
                 * multiplied by 2).
                 */
                __sock_set_rcvbuf(sk, max(val, 0));
                break;

        case SO_KEEPALIVE:
                if (sk->sk_prot->keepalive)
                        sk->sk_prot->keepalive(sk, valbool);
                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                break;

        case SO_OOBINLINE:
                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                break;

        case SO_NO_CHECK:
                sk->sk_no_check_tx = valbool;
                break;

        case SO_LINGER:
                if (optlen < sizeof(ling)) {
                        ret = -EINVAL;        /* 1003.1g */
                        break;
                }
                if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
                        ret = -EFAULT;
                        break;
                }
                if (!ling.l_onoff) {
                        sock_reset_flag(sk, SOCK_LINGER);
                } else {
                        unsigned long t_sec = ling.l_linger;

                        if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
                                WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
                        else
                                WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
                        sock_set_flag(sk, SOCK_LINGER);
                }
                break;

        case SO_BSDCOMPAT:
                break;

        case SO_TIMESTAMP_OLD:
        case SO_TIMESTAMP_NEW:
        case SO_TIMESTAMPNS_OLD:
        case SO_TIMESTAMPNS_NEW:
                sock_set_timestamp(sk, optname, valbool);
                break;

        case SO_TIMESTAMPING_NEW:
        case SO_TIMESTAMPING_OLD:
                if (optlen == sizeof(timestamping)) {
                        if (copy_from_sockptr(&timestamping, optval,
                                              sizeof(timestamping))) {
                                ret = -EFAULT;
                                break;
                        }
                } else {
                        memset(&timestamping, 0, sizeof(timestamping));
                        timestamping.flags = val;
                }
                ret = sock_set_timestamping(sk, optname, timestamping);
                break;

        case SO_RCVLOWAT:
                {
                int (*set_rcvlowat)(struct sock *sk, int val) = NULL;

                if (val < 0)
                        val = INT_MAX;
                if (sock)
                        set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
                if (set_rcvlowat)
                        ret = set_rcvlowat(sk, val);
                else
                        WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
                break;
                }
        case SO_RCVTIMEO_OLD:
        case SO_RCVTIMEO_NEW:
                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
                                       optlen, optname == SO_RCVTIMEO_OLD);
                break;

        case SO_SNDTIMEO_OLD:
        case SO_SNDTIMEO_NEW:
                ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
                                       optlen, optname == SO_SNDTIMEO_OLD);
                break;

        case SO_ATTACH_FILTER: {
                struct sock_fprog fprog;

                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
                if (!ret)
                        ret = sk_attach_filter(&fprog, sk);
                break;
        }
        case SO_ATTACH_BPF:
                ret = -EINVAL;
                if (optlen == sizeof(u32)) {
                        u32 ufd;

                        ret = -EFAULT;
                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
                                break;

                        ret = sk_attach_bpf(ufd, sk);
                }
                break;

        case SO_ATTACH_REUSEPORT_CBPF: {
                struct sock_fprog fprog;

                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
                if (!ret)
                        ret = sk_reuseport_attach_filter(&fprog, sk);
                break;
        }
        case SO_ATTACH_REUSEPORT_EBPF:
                ret = -EINVAL;
                if (optlen == sizeof(u32)) {
                        u32 ufd;

                        ret = -EFAULT;
                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
                                break;

                        ret = sk_reuseport_attach_bpf(ufd, sk);
                }
                break;

        case SO_DETACH_REUSEPORT_BPF:
                ret = reuseport_detach_prog(sk);
                break;

        case SO_DETACH_FILTER:
                ret = sk_detach_filter(sk);
                break;

        case SO_LOCK_FILTER:
                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
                        ret = -EPERM;
                else
                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
                break;

        case SO_MARK:
                if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
                    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                __sock_set_mark(sk, val);
                break;
        case SO_RCVMARK:
                sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
                break;

        case SO_RXQ_OVFL:
                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
                break;

        case SO_WIFI_STATUS:
                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
                break;

        case SO_NOFCS:
                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
                break;

        case SO_SELECT_ERR_QUEUE:
                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
                break;


        case SO_INCOMING_CPU:
                reuseport_update_incoming_cpu(sk, val);
                break;

        case SO_CNX_ADVICE:
                if (val == 1)
                        dst_negative_advice(sk);
                break;

        case SO_ZEROCOPY:
                if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
                        if (!(sk_is_tcp(sk) ||
                              (sk->sk_type == SOCK_DGRAM &&
                               sk->sk_protocol == IPPROTO_UDP)))
                                ret = -EOPNOTSUPP;
                } else if (sk->sk_family != PF_RDS) {
                        ret = -EOPNOTSUPP;
                }
                if (!ret) {
                        if (val < 0 || val > 1)
                                ret = -EINVAL;
                        else
                                sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
                }
                break;

        case SO_TXTIME:
                if (optlen != sizeof(struct sock_txtime)) {
                        ret = -EINVAL;
                        break;
                } else if (copy_from_sockptr(&sk_txtime, optval,
                           sizeof(struct sock_txtime))) {
                        ret = -EFAULT;
                        break;
                } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
                        ret = -EINVAL;
                        break;
                }
                /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
                 * scheduler has enough safe guards.
                 */
                if (sk_txtime.clockid != CLOCK_MONOTONIC &&
                    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                ret = sockopt_validate_clockid(sk_txtime.clockid);
                if (ret)
                        break;

                sock_valbool_flag(sk, SOCK_TXTIME, true);
                sk->sk_clockid = sk_txtime.clockid;
                sk->sk_txtime_deadline_mode =
                        !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
                sk->sk_txtime_report_errors =
                        !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
                break;

        case SO_BINDTOIFINDEX:
                ret = sock_bindtoindex_locked(sk, val);
                break;

        case SO_BUF_LOCK:
                if (val & ~SOCK_BUF_LOCK_MASK) {
                        ret = -EINVAL;
                        break;
                }
                sk->sk_userlocks = val | (sk->sk_userlocks &
                                          ~SOCK_BUF_LOCK_MASK);
                break;

        case SO_RESERVE_MEM:
        {
                int delta;

                if (val < 0) {
                        ret = -EINVAL;
                        break;
                }

                delta = val - sk->sk_reserved_mem;
                if (delta < 0)
                        sock_release_reserved_memory(sk, -delta);
                else
                        ret = sock_reserve_memory(sk, delta);
                break;
        }

        default:
                ret = -ENOPROTOOPT;
                break;
        }
        sockopt_release_sock(sk);
        return ret;
}

/*
 * struct socket flavoured entry point for setting a socket option: resolves
 * sock->sk and hands every other argument, unchanged, to sk_setsockopt().
 * Returns whatever sk_setsockopt() returns.
 */
int sock_setsockopt(struct socket *sock, int level, int optname,
                    sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;

        return sk_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_setsockopt);

/*
 * Read sk->sk_peer_cred through get_cred() while holding sk->sk_peer_lock
 * and return the result.
 *
 * The SO_PEERGROUPS handler in sk_getsockopt() treats a NULL result as
 * "no peer credentials" and releases a non-NULL one with put_cred().
 */
static const struct cred *sk_get_peer_cred(struct sock *sk)
{
        const struct cred *peer;

        spin_lock(&sk->sk_peer_lock);
        peer = get_cred(sk->sk_peer_cred);
        spin_unlock(&sk->sk_peer_lock);
        return peer;
}

/*
 * Fill @ucred from @pid and @cred.
 *
 * The pid is translated with pid_vnr().  uid and gid default to -1 and are
 * only replaced when @cred is non-NULL, in which case the effective ids are
 * mapped into the current user namespace with the *_munged helpers.
 */
static void cred_to_ucred(struct pid *pid, const struct cred *cred,
                          struct ucred *ucred)
{
        struct user_namespace *ns;

        ucred->pid = pid_vnr(pid);
        ucred->uid = ucred->gid = -1;

        /* No credentials: leave the -1 placeholders in place. */
        if (!cred)
                return;

        ns = current_user_ns();
        ucred->uid = from_kuid_munged(ns, cred->euid);
        ucred->gid = from_kgid_munged(ns, cred->egid);
}

/*
 * Copy every group id in @src to @dst as a packed array of gid_t, each one
 * mapped into the current user namespace first.
 *
 * Returns 0 on success, or -EFAULT as soon as one element cannot be copied.
 */
static int groups_to_user(sockptr_t dst, const struct group_info *src)
{
        struct user_namespace *ns = current_user_ns();
        int idx = 0;

        while (idx < src->ngroups) {
                gid_t mapped = from_kgid_munged(ns, src->gid[idx]);
                size_t off = idx * sizeof(mapped);

                if (copy_to_sockptr_offset(dst, off, &mapped, sizeof(mapped)))
                        return -EFAULT;
                idx++;
        }

        return 0;
}

/*
 * Read the socket option @optname from @sk.
 *
 * The caller's buffer length is fetched from @optlen; a negative length is
 * rejected with -EINVAL.  Most options store their value in the scratch
 * union 'v' and set 'lv' to the option's size; the common tail then copies
 * min(len, lv) bytes to @optval and writes the resulting length back through
 * @optlen.  Options whose payload does not live in 'v' copy it themselves and
 * either jump to lenout (length write-back only) or return directly.
 *
 * @level is not examined in this function.
 *
 * Returns 0 on success, -ENOPROTOOPT for an @optname not handled here, or
 * another negative errno from the individual option handlers.
 */
int sk_getsockopt(struct sock *sk, int level, int optname,
                  sockptr_t optval, sockptr_t optlen)
{
        struct socket *sock = sk->sk_socket;

        union {
                int val;
                u64 val64;
                unsigned long ulval;
                struct linger ling;
                struct old_timeval32 tm32;
                struct __kernel_old_timeval tm;
                struct  __kernel_sock_timeval stm;
                struct sock_txtime txtime;
                struct so_timestamping timestamping;
        } v;

        /* Size of the value held in 'v'; defaults to a plain int. */
        int lv = sizeof(int);
        int len;

        if (copy_from_sockptr(&len, optlen, sizeof(int)))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        /* Zero the whole union so bytes a case does not set are copied out as 0. */
        memset(&v, 0, sizeof(v));

        switch (optname) {
        case SO_DEBUG:
                v.val = sock_flag(sk, SOCK_DBG);
                break;

        case SO_DONTROUTE:
                v.val = sock_flag(sk, SOCK_LOCALROUTE);
                break;

        case SO_BROADCAST:
                v.val = sock_flag(sk, SOCK_BROADCAST);
                break;

        case SO_SNDBUF:
                v.val = READ_ONCE(sk->sk_sndbuf);
                break;

        case SO_RCVBUF:
                v.val = READ_ONCE(sk->sk_rcvbuf);
                break;

        case SO_REUSEADDR:
                v.val = sk->sk_reuse;
                break;

        case SO_REUSEPORT:
                v.val = sk->sk_reuseport;
                break;

        case SO_KEEPALIVE:
                v.val = sock_flag(sk, SOCK_KEEPOPEN);
                break;

        case SO_TYPE:
                v.val = sk->sk_type;
                break;

        case SO_PROTOCOL:
                v.val = sk->sk_protocol;
                break;

        case SO_DOMAIN:
                v.val = sk->sk_family;
                break;

        case SO_ERROR:
                /* Fall back to the soft error, clearing it as it is read. */
                v.val = -sock_error(sk);
                if (v.val == 0)
                        v.val = xchg(&sk->sk_err_soft, 0);
                break;

        case SO_OOBINLINE:
                v.val = sock_flag(sk, SOCK_URGINLINE);
                break;

        case SO_NO_CHECK:
                v.val = sk->sk_no_check_tx;
                break;

        case SO_PRIORITY:
                v.val = READ_ONCE(sk->sk_priority);
                break;

        case SO_LINGER:
                lv                = sizeof(v.ling);
                v.ling.l_onoff        = sock_flag(sk, SOCK_LINGER);
                v.ling.l_linger        = READ_ONCE(sk->sk_lingertime) / HZ;
                break;

        case SO_BSDCOMPAT:
                /* Nothing is read here; v.val keeps its zeroed value. */
                break;

        case SO_TIMESTAMP_OLD:
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
                                !sock_flag(sk, SOCK_TSTAMP_NEW) &&
                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPNS_OLD:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
                break;

        case SO_TIMESTAMP_NEW:
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
                break;

        case SO_TIMESTAMPNS_NEW:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
                break;

        case SO_TIMESTAMPING_OLD:
        case SO_TIMESTAMPING_NEW:
                lv = sizeof(v.timestamping);
                /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
                 * returning the flags when they were set through the same option.
                 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
                 */
                if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
                        v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
                        v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
                }
                break;

        case SO_RCVTIMEO_OLD:
        case SO_RCVTIMEO_NEW:
                /* sock_get_timeout() fills 'v' and returns the size to report. */
                lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
                                      SO_RCVTIMEO_OLD == optname);
                break;

        case SO_SNDTIMEO_OLD:
        case SO_SNDTIMEO_NEW:
                lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
                                      SO_SNDTIMEO_OLD == optname);
                break;

        case SO_RCVLOWAT:
                v.val = READ_ONCE(sk->sk_rcvlowat);
                break;

        case SO_SNDLOWAT:
                /* Always reported as 1. */
                v.val = 1;
                break;

        case SO_PASSCRED:
                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
                break;

        case SO_PASSPIDFD:
                v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
                break;

        case SO_PEERCRED:
        {
                struct ucred peercred;
                if (len > sizeof(peercred))
                        len = sizeof(peercred);

                /* Snapshot the peer pid/cred under the peer lock. */
                spin_lock(&sk->sk_peer_lock);
                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
                spin_unlock(&sk->sk_peer_lock);

                if (copy_to_sockptr(optval, &peercred, len))
                        return -EFAULT;
                goto lenout;
        }

        case SO_PEERPIDFD:
        {
                struct pid *peer_pid;
                struct file *pidfd_file = NULL;
                int pidfd;

                if (len > sizeof(pidfd))
                        len = sizeof(pidfd);

                spin_lock(&sk->sk_peer_lock);
                peer_pid = get_pid(sk->sk_peer_pid);
                spin_unlock(&sk->sk_peer_lock);

                if (!peer_pid)
                        return -ENODATA;

                pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
                put_pid(peer_pid);
                if (pidfd < 0)
                        return pidfd;

                /* Install the fd only once both copies to the caller succeeded;
                 * otherwise give back the reserved fd number and the file.
                 */
                if (copy_to_sockptr(optval, &pidfd, len) ||
                    copy_to_sockptr(optlen, &len, sizeof(int))) {
                        put_unused_fd(pidfd);
                        fput(pidfd_file);

                        return -EFAULT;
                }

                fd_install(pidfd, pidfd_file);
                return 0;
        }

        case SO_PEERGROUPS:
        {
                const struct cred *cred;
                int ret, n;

                cred = sk_get_peer_cred(sk);
                if (!cred)
                        return -ENODATA;

                n = cred->group_info->ngroups;
                /* Buffer too small: report the required size via optlen and
                 * fail with -ERANGE.
                 */
                if (len < n * sizeof(gid_t)) {
                        len = n * sizeof(gid_t);
                        put_cred(cred);
                        return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
                }
                len = n * sizeof(gid_t);

                ret = groups_to_user(optval, cred->group_info);
                put_cred(cred);
                if (ret)
                        return ret;
                goto lenout;
        }

        case SO_PEERNAME:
        {
                struct sockaddr_storage address;

                lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
                if (lv < 0)
                        return -ENOTCONN;
                /* A buffer larger than the returned address is rejected. */
                if (lv < len)
                        return -EINVAL;
                if (copy_to_sockptr(optval, &address, len))
                        return -EFAULT;
                goto lenout;
        }

        /* Dubious BSD thing... Probably nobody even uses it, but
         * the UNIX standard wants it for whatever reason... -DaveM
         */
        case SO_ACCEPTCONN:
                v.val = sk->sk_state == TCP_LISTEN;
                break;

        case SO_PASSSEC:
                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
                break;

        case SO_PEERSEC:
                /* Handled entirely by the security hook, including optlen. */
                return security_socket_getpeersec_stream(sock,
                                                         optval, optlen, len);

        case SO_MARK:
                v.val = READ_ONCE(sk->sk_mark);
                break;

        case SO_RCVMARK:
                v.val = sock_flag(sk, SOCK_RCVMARK);
                break;

        case SO_RXQ_OVFL:
                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
                break;

        case SO_WIFI_STATUS:
                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
                break;

        case SO_PEEK_OFF:
                /* Only available when the socket ops provide set_peek_off. */
                if (!READ_ONCE(sock->ops)->set_peek_off)
                        return -EOPNOTSUPP;

                v.val = READ_ONCE(sk->sk_peek_off);
                break;
        case SO_NOFCS:
                v.val = sock_flag(sk, SOCK_NOFCS);
                break;

        case SO_BINDTODEVICE:
                return sock_getbindtodevice(sk, optval, optlen, len);

        case SO_GET_FILTER:
                /* sk_get_filter() copies to optval and returns the length to
                 * report, or a negative errno.
                 */
                len = sk_get_filter(sk, optval, len);
                if (len < 0)
                        return len;

                goto lenout;

        case SO_LOCK_FILTER:
                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
                break;

        case SO_BPF_EXTENSIONS:
                v.val = bpf_tell_extensions();
                break;

        case SO_SELECT_ERR_QUEUE:
                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
                break;

#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_BUSY_POLL:
                v.val = READ_ONCE(sk->sk_ll_usec);
                break;
        case SO_PREFER_BUSY_POLL:
                v.val = READ_ONCE(sk->sk_prefer_busy_poll);
                break;
#endif

        case SO_MAX_PACING_RATE:
                /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
                if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
                        lv = sizeof(v.ulval);
                        v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
                } else {
                        /* 32bit version */
                        v.val = min_t(unsigned long, ~0U,
                                      READ_ONCE(sk->sk_max_pacing_rate));
                }
                break;

        case SO_INCOMING_CPU:
                v.val = READ_ONCE(sk->sk_incoming_cpu);
                break;

        case SO_MEMINFO:
        {
                u32 meminfo[SK_MEMINFO_VARS];

                sk_get_meminfo(sk, meminfo);

                /* Truncate to the caller's buffer if it is smaller. */
                len = min_t(unsigned int, len, sizeof(meminfo));
                if (copy_to_sockptr(optval, &meminfo, len))
                        return -EFAULT;

                goto lenout;
        }

#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_INCOMING_NAPI_ID:
                v.val = READ_ONCE(sk->sk_napi_id);

                /* aggregate non-NAPI IDs down to 0 */
                if (v.val < MIN_NAPI_ID)
                        v.val = 0;

                break;
#endif

        case SO_COOKIE:
                /* 64-bit value: the buffer must hold at least a u64. */
                lv = sizeof(u64);
                if (len < lv)
                        return -EINVAL;
                v.val64 = sock_gen_cookie(sk);
                break;

        case SO_ZEROCOPY:
                v.val = sock_flag(sk, SOCK_ZEROCOPY);
                break;

        case SO_TXTIME:
                lv = sizeof(v.txtime);
                v.txtime.clockid = sk->sk_clockid;
                v.txtime.flags |= sk->sk_txtime_deadline_mode ?
                                  SOF_TXTIME_DEADLINE_MODE : 0;
                v.txtime.flags |= sk->sk_txtime_report_errors ?
                                  SOF_TXTIME_REPORT_ERRORS : 0;
                break;

        case SO_BINDTOIFINDEX:
                v.val = READ_ONCE(sk->sk_bound_dev_if);
                break;

        case SO_NETNS_COOKIE:
                /* Unlike SO_COOKIE, the buffer must be exactly a u64. */
                lv = sizeof(u64);
                if (len != lv)
                        return -EINVAL;
                v.val64 = sock_net(sk)->net_cookie;
                break;

        case SO_BUF_LOCK:
                v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
                break;

        case SO_RESERVE_MEM:
                v.val = READ_ONCE(sk->sk_reserved_mem);
                break;

        case SO_TXREHASH:
                /* Paired with WRITE_ONCE() in sk_setsockopt() */
                v.val = READ_ONCE(sk->sk_txrehash);
                break;

        default:
                /* We implement the SO_SNDLOWAT etc to not be settable
                 * (1003.1g 7).
                 */
                return -ENOPROTOOPT;
        }

        /* Common exit for options held in 'v': never copy more than lv bytes. */
        if (len > lv)
                len = lv;
        if (copy_to_sockptr(optval, &v, len))
                return -EFAULT;
lenout:
        /* Report the number of bytes actually provided. */
        if (copy_to_sockptr(optlen, &len, sizeof(int)))
                return -EFAULT;
        return 0;
}

/*
 * Initialize sk->sk_lock and register it with the lock validator.
 *
 * The lock class keys and names are selected by address family
 * (sk->sk_family).  Sockets flagged sk_kern_sock take theirs from the
 * separate af_family_kern_* tables.
 */
static inline void sock_lock_init(struct sock *sk)
{
        const int fam = sk->sk_family;

        if (!sk->sk_kern_sock) {
                sock_lock_init_class_and_name(
                        sk,
                        af_family_slock_key_strings[fam],
                        af_family_slock_keys + fam,
                        af_family_key_strings[fam],
                        af_family_keys + fam);
                return;
        }

        sock_lock_init_class_and_name(
                sk,
                af_family_kern_slock_key_strings[fam],
                af_family_kern_slock_keys + fam,
                af_family_kern_key_strings[fam],
                af_family_kern_keys + fam);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
 *
 * The copy is done in two pieces: everything before sk_dontcopy_begin, then
 * everything from sk_dontcopy_end up to osk's prot->obj_size.  With
 * CONFIG_SECURITY_NETWORK, nsk's own sk_security pointer is preserved across
 * the copy and security_sk_clone() is called afterwards.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
        const struct proto *prot = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
        /* Remember nsk's security pointer; the memcpy below overwrites it. */
        void *sptr = nsk->sk_security;
#endif

        /* If we move sk_tx_queue_mapping out of the private section,
         * we must check if sk_tx_queue_clear() is called after
         * sock_copy() in sk_clone_lock().
         */
        BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
                     offsetof(struct sock, sk_dontcopy_begin) ||
                     offsetof(struct sock, sk_tx_queue_mapping) >=
                     offsetof(struct sock, sk_dontcopy_end));

        /* First piece: start of the struct up to the don't-copy window. */
        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

        /* Second piece: end of the don't-copy window up to obj_size. */
        unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
                      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
                      /* alloc is larger than struct, see sk_prot_alloc() */);

#ifdef CONFIG_SECURITY_NETWORK
        nsk->sk_security = sptr;
        security_sk_clone(osk, nsk);
#endif
}

/*
 * Allocate the raw storage for a sock of protocol @prot.
 *
 * Memory comes from prot->slab when the protocol has one, otherwise from
 * kmalloc(prot->obj_size).  On the slab path __GFP_ZERO is masked out of
 * @priority and, when want_init_on_alloc() asks for it, the object is
 * cleared through sk_prot_clear_nulls() instead.
 *
 * After the allocation, security_sk_alloc() is called and a reference on
 * prot->owner is taken.  If either step fails, what was set up so far is
 * undone and NULL is returned.  Returns the new sock on success.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
                int family)
{
        struct kmem_cache *slab = prot->slab;
        struct sock *sk;

        if (slab) {
                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
                if (!sk)
                        return NULL;
                if (want_init_on_alloc(priority))
                        sk_prot_clear_nulls(sk, prot->obj_size);
        } else {
                sk = kmalloc(prot->obj_size, priority);
                if (!sk)
                        return NULL;
        }

        if (security_sk_alloc(sk, family, priority))
                goto out_free;

        if (!try_module_get(prot->owner))
                goto out_free_sec;

        return sk;

out_free_sec:
        security_sk_free(sk);
out_free:
        if (slab)
                kmem_cache_free(slab, sk);
        else
                kfree(sk);
        return NULL;
}

/*
 * Counterpart of sk_prot_alloc(): release the cgroup, memcg and security
 * state attached to @sk, return its memory to prot->slab (or kfree() it when
 * the protocol has no slab), and finally drop the reference on prot->owner.
 *
 * prot->owner and prot->slab are cached in locals before any teardown runs.
 */
static void sk_prot_free(struct proto *prot, struct sock *sk)
{
        struct module *owner = prot->owner;
        struct kmem_cache *slab = prot->slab;

        cgroup_sk_free(&sk->sk_cgrp_data);
        mem_cgroup_sk_free(sk);
        security_sk_free(sk);

        if (!slab)
                kfree(sk);
        else
                kmem_cache_free(slab, sk);

        module_put(owner);
}

/**
 *        sk_alloc - All socket objects are allocated here
 *        @net: the applicable net namespace
 *        @family: protocol family
 *        @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *        @prot: struct proto associated with this new sock instance
 *        @kern: is this to be a kernel socket?
 *
 *        Requests zeroed storage from sk_prot_alloc() and fills in the basic
 *        identity of the socket: family, protocol, lock, owning namespace,
 *        cgroup data and an initial sk_wmem_alloc of one.  Only non-kernel
 *        sockets take a reference on @net and count as in use; kernel
 *        sockets just get a namespace tracker entry.
 *
 *        Returns the new socket, or NULL if the allocation failed.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot, int kern)
{
        struct sock *sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);

        if (!sk)
                return NULL;

        sk->sk_family = family;
        /*
         * sk_prot_creator records the allocating proto separately from
         * sk_prot; the comment in the struct sock definition explains
         * why both are needed.
         */
        sk->sk_prot_creator = prot;
        sk->sk_prot = prot;
        sk->sk_kern_sock = kern;
        sock_lock_init(sk);

        sk->sk_net_refcnt = !kern;
        if (likely(sk->sk_net_refcnt)) {
                get_net_track(net, &sk->ns_tracker, priority);
                sock_inuse_add(net, 1);
        } else {
                __netns_tracker_alloc(net, &sk->ns_tracker,
                                      false, priority);
        }

        sock_net_set(sk, net);
        refcount_set(&sk->sk_wmem_alloc, 1);

        mem_cgroup_sk_alloc(sk);
        cgroup_sk_alloc(&sk->sk_cgrp_data);
        sock_update_classid(&sk->sk_cgrp_data);
        sock_update_netprioidx(&sk->sk_cgrp_data);
        sk_tx_queue_clear(sk);

        return sk;
}
EXPORT_SYMBOL(sk_alloc);

/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 *
 * Final teardown of a socket: @head is the sk_rcu member embedded in the
 * struct sock being destroyed.  sk_destruct() either calls this directly or
 * schedules it through call_rcu().
 */
static void __sk_destruct(struct rcu_head *head)
{
        struct sock *sk = container_of(head, struct sock, sk_rcu);
        struct sk_filter *filter;

        /* Optional per-socket destructor callback. */
        if (sk->sk_destruct)
                sk->sk_destruct(sk);

        /* Uncharge and detach any filter still attached. */
        filter = rcu_dereference_check(sk->sk_filter,
                                       refcount_read(&sk->sk_wmem_alloc) == 0);
        if (filter) {
                sk_filter_uncharge(sk, filter);
                RCU_INIT_POINTER(sk->sk_filter, NULL);
        }

        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

#ifdef CONFIG_BPF_SYSCALL
        bpf_sk_storage_free(sk);
#endif

        /* A non-zero sk_omem_alloc at this point is only reported, not fixed. */
        if (atomic_read(&sk->sk_omem_alloc))
                pr_debug("%s: optmem leakage (%d bytes) detected\n",
                         __func__, atomic_read(&sk->sk_omem_alloc));

        /* Drop the page reference held by sk_frag, if any. */
        if (sk->sk_frag.page) {
                put_page(sk->sk_frag.page);
                sk->sk_frag.page = NULL;
        }

        /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
        put_cred(sk->sk_peer_cred);
        put_pid(sk->sk_peer_pid);

        /* Mirror of sk_alloc(): sockets with sk_net_refcnt hold a netns
         * reference, the others only a tracker entry.
         */
        if (likely(sk->sk_net_refcnt))
                put_net_track(sock_net(sk), &sk->ns_tracker);
        else
                __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);

        /* Free through sk_prot_creator, not sk_prot. */
        sk_prot_free(sk->sk_prot_creator, sk);
}

/*
 * Destroy @sk, either immediately or after an RCU grace period.
 *
 * Destruction is deferred through call_rcu() when the socket carries
 * SOCK_RCU_FREE or when it was attached to a reuseport group (in which case
 * it is detached from the group first).  Otherwise __sk_destruct() runs
 * synchronously.
 */
void sk_destruct(struct sock *sk)
{
        bool defer = sock_flag(sk, SOCK_RCU_FREE);

        if (rcu_access_pointer(sk->sk_reuseport_cb)) {
                reuseport_detach_sock(sk);
                defer = true;
        }

        if (!defer) {
                __sk_destruct(&sk->sk_rcu);
                return;
        }

        call_rcu(&sk->sk_rcu, __sk_destruct);
}

/*
 * Second stage of sk_free(), reached once sk_wmem_alloc has dropped to zero.
 *
 * Sockets with sk_net_refcnt set are removed from the in-use count.  If such
 * a socket also has sock_diag destroy listeners, sock_diag_broadcast_destroy()
 * is called instead of sk_destruct(); in every other case sk_destruct() is
 * called directly.
 */
static void __sk_free(struct sock *sk)
{
        if (likely(sk->sk_net_refcnt))
                sock_inuse_add(sock_net(sk), -1);

        if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) {
                sock_diag_broadcast_destroy(sk);
                return;
        }

        sk_destruct(sk);
}

/*
 * Drop the base reference that socket creation put on sk_wmem_alloc.
 *
 * If the counter does not reach zero, some packets are still sitting in a
 * tx queue and sock_wfree() will call __sk_free(sk) later.  Otherwise the
 * socket is freed right here.
 */
void sk_free(struct sock *sk)
{
        if (!refcount_dec_and_test(&sk->sk_wmem_alloc))
                return;

        __sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/*
 * Initialise the receive, write and error skb queues and sk_callback_lock,
 * then give each of their locks a per-address-family lockdep class.
 *
 * The callback lock uses the af_kern_callback_keys table for sockets flagged
 * sk_kern_sock and af_callback_keys for all others.
 */
static void sk_init_common(struct sock *sk)
{
        const int fam = sk->sk_family;

        skb_queue_head_init(&sk->sk_receive_queue);
        skb_queue_head_init(&sk->sk_write_queue);
        skb_queue_head_init(&sk->sk_error_queue);
        rwlock_init(&sk->sk_callback_lock);

        lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
                                   af_rlock_keys + fam,
                                   af_family_rlock_key_strings[fam]);
        lockdep_set_class_and_name(&sk->sk_write_queue.lock,
                                   af_wlock_keys + fam,
                                   af_family_wlock_key_strings[fam]);
        lockdep_set_class_and_name(&sk->sk_error_queue.lock,
                                   af_elock_keys + fam,
                                   af_family_elock_key_strings[fam]);

        if (!sk->sk_kern_sock) {
                lockdep_set_class_and_name(&sk->sk_callback_lock,
                                           af_callback_keys + fam,
                                           af_family_clock_key_strings[fam]);
                return;
        }

        lockdep_set_class_and_name(&sk->sk_callback_lock,
                                   af_kern_callback_keys + fam,
                                   af_family_kern_clock_key_strings[fam]);
}

/**
 *        sk_clone_lock - clone a socket, and lock its clone
 *        @sk: the socket to clone
 *        @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *        Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 *
 *        Allocates a new sock of the same protocol, copies @sk into it with
 *        sock_copy() and then resets the per-socket state that must not be
 *        shared with the parent (queues, memory accounting, errors, filter
 *        charge, reuseport group, socket/wq back pointers).
 *
 *        Returns the clone with bh_lock_sock() held and sk_refcnt set to 2,
 *        or NULL if the allocation, the filter charge, the xfrm policy clone
 *        or the bpf storage clone failed.
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
        struct proto *prot = READ_ONCE(sk->sk_prot);
        struct sk_filter *filter;
        bool is_charged = true;
        struct sock *newsk;

        newsk = sk_prot_alloc(prot, priority, sk->sk_family);
        if (!newsk)
                goto out;

        sock_copy(newsk, sk);

        newsk->sk_prot_creator = prot;

        /* SANITY */
        if (likely(newsk->sk_net_refcnt)) {
                get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
                sock_inuse_add(sock_net(newsk), 1);
        } else {
                /* Kernel sockets are not elevating the struct net refcount.
                 * Instead, use a tracker to more easily detect if a layer
                 * is not properly dismantling its kernel sockets at netns
                 * destroy time.
                 */
                __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
                                      false, priority);
        }
        sk_node_init(&newsk->sk_node);
        sock_lock_init(newsk);
        /* The clone stays bh-locked from here until the caller unlocks it. */
        bh_lock_sock(newsk);
        newsk->sk_backlog.head        = newsk->sk_backlog.tail = NULL;
        newsk->sk_backlog.len = 0;

        atomic_set(&newsk->sk_rmem_alloc, 0);

        /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
        refcount_set(&newsk->sk_wmem_alloc, 1);

        atomic_set(&newsk->sk_omem_alloc, 0);
        sk_init_common(newsk);

        /* Reset state copied from the parent that the clone must not inherit. */
        newsk->sk_dst_cache        = NULL;
        newsk->sk_dst_pending_confirm = 0;
        newsk->sk_wmem_queued        = 0;
        newsk->sk_forward_alloc = 0;
        newsk->sk_reserved_mem  = 0;
        atomic_set(&newsk->sk_drops, 0);
        newsk->sk_send_head        = NULL;
        newsk->sk_userlocks        = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
        atomic_set(&newsk->sk_zckey, 0);

        sock_reset_flag(newsk, SOCK_DONE);

        /* sk->sk_memcg will be populated at accept() time */
        newsk->sk_memcg = NULL;

        cgroup_sk_clone(&newsk->sk_cgrp_data);

        rcu_read_lock();
        filter = rcu_dereference(sk->sk_filter);
        if (filter != NULL)
                /* though it's an empty new sock, the charging may fail
                 * if sysctl_optmem_max was changed between creation of
                 * original socket and cloning
                 */
                is_charged = sk_filter_charge(newsk, filter);
        RCU_INIT_POINTER(newsk->sk_filter, filter);
        rcu_read_unlock();

        if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
                /* We need to make sure that we don't uncharge the new
                 * socket if we couldn't charge it in the first place
                 * as otherwise we uncharge the parent's filter.
                 */
                if (!is_charged)
                        RCU_INIT_POINTER(newsk->sk_filter, NULL);
                sk_free_unlock_clone(newsk);
                newsk = NULL;
                goto out;
        }
        /* The clone does not join the parent's reuseport group. */
        RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

        if (bpf_sk_storage_clone(sk, newsk)) {
                sk_free_unlock_clone(newsk);
                newsk = NULL;
                goto out;
        }

        /* Clear sk_user_data if parent had the pointer tagged
         * as not suitable for copying when cloning.
         */
        if (sk_user_data_is_nocopy(newsk))
                newsk->sk_user_data = NULL;

        newsk->sk_err           = 0;
        newsk->sk_err_soft = 0;
        newsk->sk_priority = 0;
        newsk->sk_incoming_cpu = raw_smp_processor_id();

        /* Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.rst for details)
         */
        smp_wmb();
        refcount_set(&newsk->sk_refcnt, 2);

        sk_set_socket(newsk, NULL);
        sk_tx_queue_clear(newsk);
        RCU_INIT_POINTER(newsk->sk_wq, NULL);

        if (newsk->sk_prot->sockets_allocated)
                sk_sockets_allocated_inc(newsk);

        if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
                net_enable_timestamp();
out:
        return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

/* Dispose of a cloned socket whose setup failed.
 *
 * The clone is still a raw copy of its parent, so the inherited destructor
 * is invalidated before the socket is unlocked and released with a plain
 * sk_free().
 */
void sk_free_unlock_clone(struct sock *sk)
{
        sk->sk_destruct = NULL;

        bh_unlock_sock(sk);
        sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);

/* Compute the GSO size limit for @sk on the device behind @dst.
 *
 * The IPv6 limit is used for AF_INET6 sockets whose receive address is not
 * v4-mapped; every other socket uses the IPv4 limit.  Non-TCP sockets are
 * capped at GSO_LEGACY_MAX_SIZE.  The result leaves room for
 * MAX_TCP_HEADER + 1 bytes.
 */
static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
{
        bool use_v6_limit = false;
        u32 limit;

#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == AF_INET6 &&
            !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
                use_v6_limit = true;
#endif
        /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
        if (use_v6_limit)
                limit = READ_ONCE(dst->dev->gso_max_size);
        else
                limit = READ_ONCE(dst->dev->gso_ipv4_max_size);

        if (limit > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
                limit = GSO_LEGACY_MAX_SIZE;

        return limit - (MAX_TCP_HEADER + 1);
}

/* Derive the socket's route capabilities and GSO limits from the device
 * behind @dst, then install @dst as the socket's destination entry.
 */
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
        u32 max_segs = 1;

        /* Start from the device features and adjust for this socket. */
        sk->sk_route_caps = dst->dev->features;
        if (sk_is_tcp(sk))
                sk->sk_route_caps |= NETIF_F_GSO;
        if (sk->sk_route_caps & NETIF_F_GSO)
                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
        if (unlikely(sk->sk_gso_disabled))
                sk->sk_route_caps &= ~NETIF_F_GSO_MASK;

        if (sk_can_gso(sk)) {
                if (!dst->header_len || xfrm_dst_offload_ok(dst)) {
                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
                        sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
                        /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
                        max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
                } else {
                        /* Extra header space without offload support:
                         * GSO is turned off for this socket.
                         */
                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
                }
        }

        sk->sk_gso_max_segs = max_segs;
        sk_dst_set(sk, dst);
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

/*
 *        Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        unsigned int len = skb->truesize;
        bool free;

        /* Sockets with SOCK_USE_WRITE_QUEUE set skip the write-space
         * notification and only drop the skb's charge below.
         */
        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
                /* Fast path: SOCK_RCU_FREE socket still using the default
                 * write-space callback.  The whole charge is dropped and
                 * the wakeup is issued inside one RCU read-side section.
                 */
                if (sock_flag(sk, SOCK_RCU_FREE) &&
                    sk->sk_write_space == sock_def_write_space) {
                        rcu_read_lock();
                        free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
                        sock_def_write_space_wfree(sk);
                        rcu_read_unlock();
                        if (unlikely(free))
                                __sk_free(sk);
                        return;
                }

                /*
                 * Keep a reference on sk_wmem_alloc, this will be released
                 * after sk_write_space() call
                 */
                WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
                sk->sk_write_space(sk);
                /* Only the single reference kept above remains to be dropped. */
                len = 1;
        }
        /*
         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
         * could not do because of in-flight packets
         */
        if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
                __sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

/* This variant of sock_wfree() is used by TCP,
 * since it sets SOCK_USE_WRITE_QUEUE.
 */
void __sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        /* Drop this skb's share of sk_wmem_alloc; nothing more to do
         * unless that was the last outstanding charge.
         */
        if (!refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
                return;

        __sk_free(sk);
}

/* Make @sk the owner of @skb on the write path.
 *
 * Any previous owner is dropped first via skb_orphan().  For a full socket
 * the skb's truesize is added to sk->sk_wmem_alloc and sock_wfree() is
 * installed as destructor.  With CONFIG_INET, a non-full socket instead
 * gets a held reference and the sock_edemux() destructor.
 */
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
        skb_orphan(skb);
        skb->sk = sk;
#ifdef CONFIG_INET
        if (unlikely(!sk_fullsock(sk))) {
                /* No sk_wmem_alloc charging here: a plain reference is taken. */
                skb->destructor = sock_edemux;
                sock_hold(sk);
                return;
        }
#endif
        skb->destructor = sock_wfree;
        skb_set_hash_from_sk(skb, sk);
        /*
         * We used to take a refcount on sk, but following operation
         * is enough to guarantee sk_free() wont free this sock until
         * all in-flight packets are completed
         */
        refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);

static bool can_skb_orphan_partial(const struct sk_buff *skb)
{
        /* Drivers depend on in-order delivery for crypto offload,
         * partial orphan breaks out-of-order-OK logic.
         */
        if (skb_is_decrypted(skb))
                return false;

        return (skb->destructor == sock_wfree ||
                (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
}

/* This helper is used by netem, as it can hold packets in its
 * delay queue. We want to allow the owner socket to send more
 * packets, as if they were already TX completed by a typical driver.
 * But we also want to keep skb->sk set because some packet schedulers
 * rely on it (sch_fq for example).
 */
void skb_orphan_partial(struct sk_buff *skb)
{
        /* Pure TCP ACKs are left untouched. */
        if (skb_is_tcp_pure_ack(skb))
                return;

        /* Fall back to a full orphan when the skb is not eligible for a
         * partial one, or when re-owning it to its socket fails.
         */
        if (!can_skb_orphan_partial(skb) ||
            !skb_set_owner_sk_safe(skb, skb->sk))
                skb_orphan(skb);
}
EXPORT_SYMBOL(skb_orphan_partial);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
        unsigned int truesize = skb->truesize;
        struct sock *owner = skb->sk;

        /* Give the skb's truesize back to the receive accounting. */
        atomic_sub(truesize, &owner->sk_rmem_alloc);
        sk_mem_uncharge(owner, truesize);
}
EXPORT_SYMBOL(sock_rfree);

/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 */
void sock_efree(struct sk_buff *skb)
{
        struct sock *owner = skb->sk;

        /* Release the socket reference associated with this skb. */
        sock_put(owner);
}
EXPORT_SYMBOL(sock_efree);

/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        /* Nothing to release when no reference is held on the socket. */
        if (!sk_is_refcounted(sk))
                return;

        /* Common case: anything that is not a syncookie request socket
         * just drops its reference.
         */
        if (sk->sk_state != TCP_NEW_SYN_RECV || !inet_reqsk(sk)->syncookie) {
                sock_gen_put(sk);
                return;
        }

        /* Syncookie request: clear the listener pointer, then free it. */
        inet_reqsk(sk)->rsk_listener = NULL;
        reqsk_free(inet_reqsk(sk));
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */

/* Return the uid of the inode backing @sk's struct socket, or
 * GLOBAL_ROOT_UID when the sock has no struct socket attached.
 * sk_callback_lock is read-held (BH-disabling variant) around the lookup.
 */
kuid_t sock_i_uid(struct sock *sk)
{
        kuid_t uid = GLOBAL_ROOT_UID;

        read_lock_bh(&sk->sk_callback_lock);
        if (sk->sk_socket)
                uid = SOCK_INODE(sk->sk_socket)->i_uid;
        read_unlock_bh(&sk->sk_callback_lock);

        return uid;
}
EXPORT_SYMBOL(sock_i_uid);

/* Return the inode number backing @sk's struct socket, or 0 when the sock
 * has no struct socket attached.  Takes sk_callback_lock for reading
 * without touching the BH state itself.
 */
unsigned long __sock_i_ino(struct sock *sk)
{
        unsigned long ino = 0;

        read_lock(&sk->sk_callback_lock);
        if (sk->sk_socket)
                ino = SOCK_INODE(sk->sk_socket)->i_ino;
        read_unlock(&sk->sk_callback_lock);

        return ino;
}
EXPORT_SYMBOL(__sock_i_ino);

/* Wrapper around __sock_i_ino() that runs it with bottom halves disabled. */
unsigned long sock_i_ino(struct sock *sk)
{
        unsigned long inode_nr;

        local_bh_disable();
        inode_nr = __sock_i_ino(sk);
        local_bh_enable();

        return inode_nr;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        struct sk_buff *skb;

        /* Unless forced, refuse once the send buffer is fully charged. */
        if (!force &&
            refcount_read(&sk->sk_wmem_alloc) >= READ_ONCE(sk->sk_sndbuf))
                return NULL;

        skb = alloc_skb(size, priority);
        if (skb)
                skb_set_owner_w(skb, sk);

        /* NULL when the allocation itself failed. */
        return skb;
}
EXPORT_SYMBOL(sock_wmalloc);

/* skb destructor installed by sock_omalloc(): returns the skb's truesize
 * to the owning socket's sk_omem_alloc counter.
 */
static void sock_ofree(struct sk_buff *skb)
{
        atomic_sub(skb->truesize, &skb->sk->sk_omem_alloc);
}

/* Allocate an skb charged against the socket's option memory.
 *
 * Returns NULL when the charge would exceed sysctl_optmem_max or when the
 * allocation fails.  On success the skb's truesize is added to
 * sk_omem_alloc and sock_ofree() is installed to undo it on free.
 */
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
                             gfp_t priority)
{
        struct sk_buff *skb;

        /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
        if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
            READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
                return NULL;

        skb = alloc_skb(size, priority);
        if (skb) {
                atomic_add(skb->truesize, &sk->sk_omem_alloc);
                skb->sk = sk;
                skb->destructor = sock_ofree;
        }

        return skb;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
        int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
        void *mem;

        /* Reject requests that are too large on their own or that would
         * bring the socket's option memory up to the limit.
         */
        if ((unsigned int)size > optmem_max ||
            atomic_read(&sk->sk_omem_alloc) + size >= optmem_max)
                return NULL;

        /* First do the add, to avoid the race if kmalloc
         * might sleep.
         */
        atomic_add(size, &sk->sk_omem_alloc);
        mem = kmalloc(size, priority);
        if (!mem)
                atomic_sub(size, &sk->sk_omem_alloc);

        return mem;
}
EXPORT_SYMBOL(sock_kmalloc);

/* Free an option memory block. Note, we actually want the inline
 * here as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
                                  const bool nullify)
{
        /* A NULL block is a caller bug: warn once and leave the accounting
         * untouched.
         */
        if (WARN_ON_ONCE(!mem))
                return;

        if (!nullify)
                kfree(mem);
        else
                kfree_sensitive(mem);

        /* Return the bytes to the socket's option memory accounting. */
        atomic_sub(size, &sk->sk_omem_alloc);
}

/* Free an option memory block of @size bytes with plain kfree() and
 * uncharge it from @sk.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);

/* Like sock_kfree_s(), but the block is released with kfree_sensitive(). */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);

/* This is almost wait_for_tcp_memory() minus release_sock()/lock_sock().
 * I think these locks should be removed for datagram sockets.
 */
/* Sleep interruptibly until write memory may be available.
 *
 * Returns the remaining timeout.  The loop ends when the timeout runs out,
 * a signal is pending, sk_wmem_alloc drops below sk_sndbuf, the send side
 * is shut down, or sk_err is set.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
        DEFINE_WAIT(wait);

        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
        for (;;) {
                if (!timeo)
                        break;
                if (signal_pending(current))
                        break;
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                /* Join the wait queue before re-testing the conditions
                 * below.
                 */
                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
                if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
                        break;
                if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
                        break;
                if (READ_ONCE(sk->sk_err))
                        break;
                timeo = schedule_timeout(timeo);
        }
        finish_wait(sk_sleep(sk), &wait);
        return timeo;
}


/*
 *        Generic send/receive buffer handlers
 */

/* Allocate an skb for sending, waiting for send-buffer space if needed.
 *
 * @sk:             socket the skb will be charged to
 * @header_len:     passed to alloc_skb_with_frags() as the header length
 * @data_len:       passed to alloc_skb_with_frags() as the data length
 * @noblock:        passed to sock_sndtimeo() to select the send timeout
 * @errcode:        receives the error when NULL is returned from the wait
 *                  loop; it is also handed to alloc_skb_with_frags()
 * @max_page_order: passed through to alloc_skb_with_frags()
 *
 * Returns the new skb, owned by @sk via skb_set_owner_w(), or NULL.
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                                     unsigned long data_len, int noblock,
                                     int *errcode, int max_page_order)
{
        struct sk_buff *skb;
        long timeo;
        int err;

        timeo = sock_sndtimeo(sk, noblock);
        for (;;) {
                /* A pending socket error takes priority. */
                err = sock_error(sk);
                if (err != 0)
                        goto failure;

                err = -EPIPE;
                if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
                        goto failure;

                /* Room in the send buffer: go allocate. */
                if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
                        break;

                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
                        goto failure;
                if (signal_pending(current))
                        goto interrupted;
                timeo = sock_wait_for_wmem(sk, timeo);
        }
        /* NOTE(review): on allocation failure *errcode is presumably set by
         * alloc_skb_with_frags() itself; it is not written here.
         */
        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
                                   errcode, sk->sk_allocation);
        if (skb)
                skb_set_owner_w(skb, sk);
        return skb;

interrupted:
        err = sock_intr_errno(timeo);
failure:
        *errcode = err;
        return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

/* Apply one socket-level control message to the send cookie @sockc.
 *
 * Returns 0 on success, -EPERM when SO_MARK is requested without
 * CAP_NET_RAW or CAP_NET_ADMIN, and -EINVAL for a malformed or unknown
 * message.
 */
int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
                     struct sockcm_cookie *sockc)
{
        u32 tsflags;

        switch (cmsg->cmsg_type) {
        case SO_MARK:
                if (!(ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
                      ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
                        return -EPERM;
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
                        return -EINVAL;
                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
                return 0;

        case SO_TIMESTAMPING_OLD:
        case SO_TIMESTAMPING_NEW:
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
                        return -EINVAL;

                /* Only TX record flags may be supplied per message. */
                tsflags = *(u32 *)CMSG_DATA(cmsg);
                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
                        return -EINVAL;

                /* Replace the TX record bits, keep every other flag. */
                sockc->tsflags = (sockc->tsflags &
                                  ~SOF_TIMESTAMPING_TX_RECORD_MASK) | tsflags;
                return 0;

        case SCM_TXTIME:
                if (!sock_flag(sk, SOCK_TXTIME))
                        return -EINVAL;
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
                        return -EINVAL;
                sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
                return 0;

        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
        case SCM_RIGHTS:
        case SCM_CREDENTIALS:
                return 0;

        default:
                return -EINVAL;
        }
}
EXPORT_SYMBOL(__sock_cmsg_send);

/* Walk the control messages of @msg and apply every SOL_SOCKET one to
 * @sockc.  Messages of other levels are skipped.  Returns 0, -EINVAL for a
 * malformed header, or the first error from __sock_cmsg_send().
 */
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
                   struct sockcm_cookie *sockc)
{
        struct cmsghdr *cmsg;
        int ret;

        for_each_cmsghdr(cmsg, msg) {
                if (!CMSG_OK(msg, cmsg))
                        return -EINVAL;

                if (cmsg->cmsg_level == SOL_SOCKET) {
                        ret = __sock_cmsg_send(sk, cmsg, sockc);
                        if (ret)
                                return ret;
                }
        }

        return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);

/* Invoke the protocol's enter_memory_pressure() hook, if it has one. */
static void sk_enter_memory_pressure(struct sock *sk)
{
        if (sk->sk_prot->enter_memory_pressure)
                sk->sk_prot->enter_memory_pressure(sk);
}

/* Leave memory pressure: use the protocol's leave_memory_pressure() hook
 * when present, otherwise clear the protocol's memory_pressure word if it
 * exists and is currently set.
 */
static void sk_leave_memory_pressure(struct sock *sk)
{
        unsigned long *memory_pressure;

        if (sk->sk_prot->leave_memory_pressure) {
                INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
                                     tcp_leave_memory_pressure, sk);
                return;
        }

        memory_pressure = sk->sk_prot->memory_pressure;
        if (!memory_pressure)
                return;

        /* Write only when the flag is actually set. */
        if (READ_ONCE(*memory_pressure))
                WRITE_ONCE(*memory_pressure, 0);
}

DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less than or equal to PAGE_SIZE.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
        if (pfrag->page) {
                /* Page reference count is 1: restart from offset 0 and
                 * reuse the page.
                 */
                if (page_ref_count(pfrag->page) == 1) {
                        pfrag->offset = 0;
                        return true;
                }
                /* Still enough room behind the current offset. */
                if (pfrag->offset + sz <= pfrag->size)
                        return true;
                /* Drop our reference to the current page; a new one is
                 * allocated below.
                 */
                put_page(pfrag->page);
        }

        pfrag->offset = 0;
        /* Try a high-order allocation first, unless it is disabled by
         * SKB_FRAG_PAGE_ORDER being 0 or by the static key.
         */
        if (SKB_FRAG_PAGE_ORDER &&
            !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
                /* Avoid direct reclaim but allow kswapd to wake */
                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
                                          __GFP_COMP | __GFP_NOWARN |
                                          __GFP_NORETRY,
                                          SKB_FRAG_PAGE_ORDER);
                if (likely(pfrag->page)) {
                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
                        return true;
                }
        }
        /* Fall back to a single page with the caller's original gfp flags. */
        pfrag->page = alloc_page(gfp);
        if (likely(pfrag->page)) {
                pfrag->size = PAGE_SIZE;
                return true;
        }
        /* Both allocations failed; pfrag->page is NULL at this point. */
        return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);

/* Socket flavour of skb_page_frag_refill(): asks for at least 32 bytes
 * using the socket's allocation flags.  On failure the socket enters
 * memory pressure and its send buffer is moderated before false is
 * returned.
 */
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
        if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
                return true;

        /* Refill failed: react to the memory shortage. */
        sk_enter_memory_pressure(sk);
        sk_stream_moderate_sndbuf(sk);

        return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);

/* Sleep, uninterruptibly and as an exclusive waiter on sk->sk_lock.wq,
 * until sock_owned_by_user() is false.
 *
 * sk->sk_lock.slock is released around each schedule() and re-taken
 * afterwards, matching the __releases/__acquires annotations.
 */
void __lock_sock(struct sock *sk)
        __releases(&sk->sk_lock.slock)
        __acquires(&sk->sk_lock.slock)
{
        DEFINE_WAIT(wait);

        for (;;) {
                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
                                        TASK_UNINTERRUPTIBLE);
                spin_unlock_bh(&sk->sk_lock.slock);
                schedule();
                spin_lock_bh(&sk->sk_lock.slock);
                /* Ownership is re-checked with the spinlock held again. */
                if (!sock_owned_by_user(sk))
                        break;
        }
        finish_wait(&sk->sk_lock.wq, &wait);
}

/* Drain the socket backlog by feeding every queued skb to sk_backlog_rcv().
 *
 * sk->sk_lock.slock is dropped while each detached batch is processed and
 * re-taken before the backlog head is looked at again.
 */
void __release_sock(struct sock *sk)
        __releases(&sk->sk_lock.slock)
        __acquires(&sk->sk_lock.slock)
{
        struct sk_buff *skb, *next;

        while ((skb = sk->sk_backlog.head) != NULL) {
                /* Detach the whole list so it can be walked unlocked. */
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

                spin_unlock_bh(&sk->sk_lock.slock);

                do {
                        /* Fetch the successor before the skb is unlinked
                         * and handed over.
                         */
                        next = skb->next;
                        prefetch(next);
                        DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
                        skb_mark_not_on_list(skb);
                        sk_backlog_rcv(sk, skb);

                        cond_resched();

                        skb = next;
                } while (skb != NULL);

                spin_lock_bh(&sk->sk_lock.slock);
        }

        /*
         * Doing the zeroing here guarantees we cannot loop forever
         * while a wild producer attempts to flood us.
         */
        sk->sk_backlog.len = 0;
}

/* Process the socket backlog and then run the protocol's release_cb()
 * hook, if any, all under sk->sk_lock.slock (which __release_sock()
 * temporarily drops while it processes skbs).
 */
void __sk_flush_backlog(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        __release_sock(sk);

        if (sk->sk_prot->release_cb)
                INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
                                     tcp_release_cb, sk);

        spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        int rc;

        add_wait_queue(sk_sleep(sk), &wait);
        /* SOCKWQ_ASYNC_WAITDATA stays set only for the duration of the wait. */
        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        /* Wait condition: the tail of sk_receive_queue is no longer @skb. */
        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        remove_wait_queue(sk_sleep(sk), &wait);
        /* rc is whatever sk_wait_event() produced. */
        return rc;
}
EXPORT_SYMBOL(sk_wait_data);

/**
 *        __sk_mem_raise_allocated - increase memory_allocated
 *        @sk: socket
 *        @size: memory size to allocate
 *        @amt: pages to allocate
 *        @kind: allocation type
 *
 *        Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
 *
 *        Unlike the globally shared limits among the sockets under same protocol,
 *        consuming the budget of a memcg won't have direct effect on other ones.
 *        So be optimistic about memcg's tolerance, and leave the callers to decide
 *        whether or not to raise allocated through sk_under_memory_pressure() or
 *        its variants.
 */
/* Returns 1 when the allocation is accepted and 0 when it is suppressed.
 * On a 0 return the @amt pages added to the protocol's memory_allocated
 * and any memcg charge taken here have been undone.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
        struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
        struct proto *prot = sk->sk_prot;
        bool charged = false;
        long allocated;

        /* Account optimistically; rolled back at the end on failure. */
        sk_memory_allocated_add(sk, amt);
        allocated = sk_memory_allocated(sk);

        if (memcg) {
                if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
                        goto suppress_allocation;
                charged = true;
        }

        /* Under limit. */
        if (allocated <= sk_prot_mem_limits(sk, 0)) {
                sk_leave_memory_pressure(sk);
                return 1;
        }

        /* Under pressure. */
        if (allocated > sk_prot_mem_limits(sk, 1))
                sk_enter_memory_pressure(sk);

        /* Over hard limit. */
        if (allocated > sk_prot_mem_limits(sk, 2))
                goto suppress_allocation;

        /* Guarantee minimum buffer size under pressure (either global
         * or memcg) to make sure features described in RFC 7323 (TCP
         * Extensions for High Performance) work properly.
         *
         * This rule does NOT stand when exceeds global or memcg's hard
         * limit, or else a DoS attack can be taken place by spawning
         * lots of sockets whose usage are under minimum buffer size.
         */
        if (kind == SK_MEM_RECV) {
                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
                        return 1;

        } else { /* SK_MEM_SEND */
                int wmem0 = sk_get_wmem0(sk, prot);

                /* Stream sockets are measured by sk_wmem_queued, all
                 * others by sk_wmem_alloc.
                 */
                if (sk->sk_type == SOCK_STREAM) {
                        if (sk->sk_wmem_queued < wmem0)
                                return 1;
                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
                                return 1;
                }
        }

        if (sk_has_memory_pressure(sk)) {
                u64 alloc;

                /* The following 'average' heuristic is within the
                 * scope of global accounting, so it only makes
                 * sense for global memory pressure.
                 */
                if (!sk_under_global_memory_pressure(sk))
                        return 1;

                /* Try to be fair among all the sockets under global
                 * pressure by allowing the ones that below average
                 * usage to raise.
                 */
                alloc = sk_sockets_allocated_read_positive(sk);
                if (sk_prot_mem_limits(sk, 2) > alloc *
                    sk_mem_pages(sk->sk_wmem_queued +
                                 atomic_read(&sk->sk_rmem_alloc) +
                                 sk->sk_forward_alloc))
                        return 1;
        }

suppress_allocation:

        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
                sk_stream_moderate_sndbuf(sk);

                /* Fail only if socket is _under_ its sndbuf.
                 * In this case we cannot block, so that we have to fail.
                 */
                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
                        /* Force charge with __GFP_NOFAIL */
                        if (memcg && !charged) {
                                mem_cgroup_charge_skmem(memcg, amt,
                                        gfp_memcg_charge() | __GFP_NOFAIL);
                        }
                        return 1;
                }
        }

        if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
                trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

        /* Roll back the optimistic accounting done at the top. */
        sk_memory_allocated_sub(sk, amt);

        if (charged)
                mem_cgroup_uncharge_skmem(memcg, amt);

        return 0;
}

/**
 *        __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *        @sk: socket
 *        @size: memory size to allocate
 *        @kind: allocation type
 *
 *        If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *        rmem allocation. This function assumes that protocols which have
 *        memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
        int pages = sk_mem_pages(size);
        int bytes = pages << PAGE_SHIFT;
        int ret;

        /* Credit the forward allocation first; undone below on refusal. */
        sk_forward_alloc_add(sk, bytes);

        ret = __sk_mem_raise_allocated(sk, size, pages, kind);
        if (ret)
                return ret;

        sk_forward_alloc_add(sk, -bytes);
        return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);

/**
 *        __sk_mem_reduce_allocated - reclaim memory_allocated
 *        @sk: socket
 *        @amount: number of quanta
 *
 *        Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
 */
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
        sk_memory_allocated_sub(sk, amount);

        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

        /* Only a socket under global pressure may end the pressure state. */
        if (!sk_under_global_memory_pressure(sk))
                return;

        if (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))
                sk_leave_memory_pressure(sk);
}

/**
 *        __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
 *        @sk: socket
 *        @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
        /* Work in whole pages: the sub-page remainder is not reclaimed. */
        int pages = amount >> PAGE_SHIFT;

        sk_forward_alloc_add(sk, -(pages << PAGE_SHIFT));
        __sk_mem_reduce_allocated(sk, pages);
}
EXPORT_SYMBOL(__sk_mem_reclaim);

/* Store @val as the socket's peek offset.  Always returns 0. */
int sk_set_peek_off(struct sock *sk, int val)
{
        WRITE_ONCE(sk->sk_peek_off, val);

        return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

/* bind() stub for protocols that do not support it: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

/* connect() stub for protocols that do not support it: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr, int len,
                    int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

/* socketpair() stub for protocols that do not support it: always
 * -EOPNOTSUPP.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

/* accept() stub for protocols that do not support it: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock,
                   struct socket *newsock,
                   struct proto_accept_arg *arg)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

/* getname() stub for protocols that do not support it: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr, int peer)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

/* ioctl() stub for protocols that do not support it: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

/* listen() stub for protocols that do not support it: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

/* shutdown() stub for protocols that do not support it: always
 * -EOPNOTSUPP.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

/* sendmsg() stub for protocols that do not support it: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

/* Variant of sock_no_sendmsg() taking a struct sock: always -EOPNOTSUPP. */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);

/* recvmsg() stub for protocols that do not support it: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m,
                    size_t len, int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

/* mmap() stub for protocols that do not support it.  Unlike the other
 * sock_no_*() helpers this returns -ENODEV.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
                 struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

/*
 * When a file is received (via SCM_RIGHTS, etc), we must bump the
 * various sock-based usage counts.
 */
void __receive_sock(struct file *file)
{
        struct socket *sock = sock_from_file(file);

        /* Nothing to do when no socket is found for this file. */
        if (!sock)
                return;

        sock_update_netprioidx(&sock->sk->sk_cgrp_data);
        sock_update_classid(&sock->sk->sk_cgrp_data);
}

/*
 *        Default Socket Callbacks
 */

/* Default sk_state_change callback: wake every interruptible sleeper on
 * the socket's wait queue, if there is one.
 */
static void sock_def_wakeup(struct sock *sk)
{
        struct socket_wq *waitq;

        rcu_read_lock();

        waitq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(waitq))
                wake_up_interruptible_all(&waitq->wait);

        rcu_read_unlock();
}

/* Default sk_error_report callback: wake pollers with EPOLLERR and send
 * the async POLL_ERR notification.
 */
static void sock_def_error_report(struct sock *sk)
{
        struct socket_wq *waitq;

        rcu_read_lock();

        waitq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(waitq))
                wake_up_interruptible_poll(&waitq->wait, EPOLLERR);

        /* The async notification is sent whether or not anyone sleeps. */
        sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);

        rcu_read_unlock();
}

/* Default sk_data_ready callback: wake readers polling for input and send
 * the async POLL_IN notification.
 */
void sock_def_readable(struct sock *sk)
{
        struct socket_wq *waitq;

        trace_sk_data_ready(sk);

        rcu_read_lock();

        waitq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(waitq))
                wake_up_interruptible_sync_poll(&waitq->wait,
                                                EPOLLIN | EPOLLPRI |
                                                EPOLLRDNORM | EPOLLRDBAND);

        sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);

        rcu_read_unlock();
}

/* Default sk_write_space callback: once the socket is writeable, wake
 * writers polling for output and send the async POLL_OUT notification.
 */
static void sock_def_write_space(struct sock *sk)
{
        struct socket_wq *waitq;

        rcu_read_lock();

        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if (!sock_writeable(sk))
                goto unlock;

        waitq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(waitq))
                wake_up_interruptible_sync_poll(&waitq->wait,
                                                EPOLLOUT | EPOLLWRNORM |
                                                EPOLLWRBAND);

        /* Should agree with poll, otherwise some programs break */
        sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);

unlock:
        rcu_read_unlock();
}

/* An optimised version of sock_def_write_space(), should only be called
 * for SOCK_RCU_FREE sockets under RCU read section and after putting
 * ->sk_wmem_alloc.
 */
static void sock_def_write_space_wfree(struct sock *sk)
{
        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if (sock_writeable(sk)) {
                /* No rcu_read_lock() here: the caller provides the RCU
                 * read-side section.
                 */
                struct socket_wq *wq = rcu_dereference(sk->sk_wq);

                /* rely on refcount_sub from sock_wfree() */
                smp_mb__after_atomic();
                /* The barrier above precedes this open-coded check of
                 * waitqueue_active().
                 */
                if (wq && waitqueue_active(&wq->wait))
                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
                                                EPOLLWRNORM | EPOLLWRBAND);

                /* Should agree with poll, otherwise some programs break */
                sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
        }
}

/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
        /* No teardown is needed by default. */
}

/* Deliver SIGURG to the owner of the file backing @sk, if the sock has a
 * socket with a file, and follow up with the async POLL_PRI notification
 * when send_sigurg() returns non-zero.
 */
void sk_send_sigurg(struct sock *sk)
{
        if (!sk->sk_socket || !sk->sk_socket->file)
                return;

        if (send_sigurg(&sk->sk_socket->file->f_owner))
                sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

/* (Re)arm @timer to fire at @expires.  A reference on @sk is taken only
 * when mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

/* Cancel @timer with del_timer(). If it returns non-zero, drop a reference
 * on @sk with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (!del_timer(timer))
                return;

        __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);

/* Same as sk_stop_timer() but cancels with del_timer_sync(). If it returns
 * non-zero, drop a reference on @sk with __sock_put().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
        if (!del_timer_sync(timer))
                return;

        __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);

/*
 * Initialise the generic part of @sk and (optionally) bind it to @sock.
 *
 * @sock: owning socket, or NULL for a sock with no struct socket attached
 * @sk:   sock to initialise
 * @uid:  value stored in sk->sk_uid
 *
 * Installs the default callbacks (sock_def_*), default buffer sizes and
 * timeouts, marks the sock SOCK_ZAPPED and in TCP_CLOSE state, and finally
 * sets sk_refcnt to 1 after a write barrier.
 */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
        sk_init_common(sk);
        sk->sk_send_head        =        NULL;

        /* Timer is set up with no callback; NOTE(review): presumably the
         * protocol installs its own handler later - confirm against users.
         */
        timer_setup(&sk->sk_timer, NULL, 0);

        sk->sk_allocation        =        GFP_KERNEL;
        sk->sk_rcvbuf                =        READ_ONCE(sysctl_rmem_default);
        sk->sk_sndbuf                =        READ_ONCE(sysctl_wmem_default);
        sk->sk_state                =        TCP_CLOSE;
        sk->sk_use_task_frag        =        true;
        sk_set_socket(sk, sock);

        sock_set_flag(sk, SOCK_ZAPPED);

        /* With a socket: inherit its type, share its wait queue and link
         * socket -> sock. Without one there is no wait queue at all.
         */
        if (sock) {
                sk->sk_type        =        sock->type;
                RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
                sock->sk        =        sk;
        } else {
                RCU_INIT_POINTER(sk->sk_wq, NULL);
        }
        sk->sk_uid        =        uid;

        /* Default event callbacks. */
        sk->sk_state_change        =        sock_def_wakeup;
        sk->sk_data_ready        =        sock_def_readable;
        sk->sk_write_space        =        sock_def_write_space;
        sk->sk_error_report        =        sock_def_error_report;
        sk->sk_destruct                =        sock_def_destruct;

        sk->sk_frag.page        =        NULL;
        sk->sk_frag.offset        =        0;
        sk->sk_peek_off                =        -1;

        sk->sk_peer_pid         =        NULL;
        sk->sk_peer_cred        =        NULL;
        spin_lock_init(&sk->sk_peer_lock);

        sk->sk_write_pending        =        0;
        sk->sk_rcvlowat                =        1;
        sk->sk_rcvtimeo                =        MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo                =        MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp = SK_DEFAULT_STAMP;
        /* sk_stamp_seq is only initialised on 32-bit builds. */
#if BITS_PER_LONG==32
        seqlock_init(&sk->sk_stamp_seq);
#endif
        atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
        sk->sk_napi_id                =        0;
        sk->sk_ll_usec                =        READ_ONCE(sysctl_net_busy_read);
#endif

        /* Pacing rates start at the maximum unsigned long value. */
        sk->sk_max_pacing_rate = ~0UL;
        sk->sk_pacing_rate = ~0UL;
        WRITE_ONCE(sk->sk_pacing_shift, 10);
        sk->sk_incoming_cpu = -1;

        sk_rx_queue_clear(sk);
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.rst for details)
         */
        smp_wmb();
        refcount_set(&sk->sk_refcnt, 1);
        atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data_uid);

/*
 * Convenience wrapper around sock_init_data_uid() that picks the uid itself:
 * the i_uid of @sock's inode when a socket is given, otherwise kuid 0 mapped
 * through the user namespace of @sk's network namespace.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
        kuid_t uid;

        if (sock)
                uid = SOCK_INODE(sock)->i_uid;
        else
                uid = make_kuid(sock_net(sk)->user_ns, 0);

        sock_init_data_uid(sock, sk, uid);
}
EXPORT_SYMBOL(sock_init_data);

/*
 * Take ownership of the socket lock, with @subclass passed to lockdep.
 *
 * May sleep (see might_sleep()). Under sk_lock.slock, if the socket is
 * already owned, __lock_sock() is called first; then sk_lock.owned is set
 * and the spinlock is dropped again, so the caller returns as the owner
 * without holding slock.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
        /* The sk_lock has mutex_lock() semantics here. */
        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);

        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);
        if (sock_owned_by_user_nocheck(sk))
                __lock_sock(sk);
        sk->sk_lock.owned = 1;
        spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(lock_sock_nested);

/*
 * Give up ownership of the socket lock.
 *
 * All steps run under sk_lock.slock with bottom halves disabled: process the
 * backlog if one is queued, run the protocol's optional ->release_cb, release
 * ownership, and wake anyone waiting on sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        /* A non-NULL tail means packets were queued to the backlog. */
        if (sk->sk_backlog.tail)
                __release_sock(sk);

        if (sk->sk_prot->release_cb)
                INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
                                     tcp_release_cb, sk);

        sock_release_ownership(sk);
        if (waitqueue_active(&sk->sk_lock.wq))
                wake_up(&sk->sk_lock.wq);
        spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

/*
 * Lock the socket, preferring a cheap spinlock-only path.
 *
 * Returns false on the fast path: the socket was not owned, and the function
 * returns with sk_lock.slock held and bottom halves disabled.
 * Returns true on the slow path: the socket was owned, so __lock_sock() was
 * called, sk_lock.owned was set, and slock was released before returning.
 *
 * May sleep (see might_sleep()).
 */
bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);

        if (!sock_owned_by_user_nocheck(sk)) {
                /*
                 * Fast path return with bottom halves disabled and
                 * sock::sk_lock.slock held.
                 *
                 * The 'mutex' is not contended and holding
                 * sock::sk_lock.slock prevents all other lockers to
                 * proceed so the corresponding unlock_sock_fast() can
                 * avoid the slow path of release_sock() completely and
                 * just release slock.
                 *
                 * From a semantical POV this is equivalent to 'acquiring'
                 * the 'mutex', hence the corresponding lockdep
                 * mutex_release() has to happen in the fast path of
                 * unlock_sock_fast().
                 */
                return false;
        }

        __lock_sock(sk);
        sk->sk_lock.owned = 1;
        /* Balances the __acquires() annotation; slock itself is dropped
         * on the next line.
         */
        __acquire(&sk->sk_lock.slock);
        spin_unlock_bh(&sk->sk_lock.slock);
        return true;
}
EXPORT_SYMBOL(__lock_sock_fast);

/*
 * Copy the socket's stored timestamp to userspace.
 *
 * @sock:      socket to query
 * @userstamp: user buffer receiving the timestamp
 * @timeval:   report microseconds instead of nanoseconds in the sub-second
 *             field
 * @time32:    use the old 32-bit layout (only honoured when
 *             CONFIG_COMPAT_32BIT_TIME is set)
 *
 * Enables SOCK_TIMESTAMP on the socket as a side effect. Returns -ENOENT
 * when the stored timestamp has tv_sec == -1. When tv_sec is 0 the current
 * real time is stored on the socket and reported instead. Otherwise returns
 * the result of the copy helper (-EFAULT on the sparc64 path if
 * copy_to_user() fails).
 */
int sock_gettstamp(struct socket *sock, void __user *userstamp,
                   bool timeval, bool time32)
{
        struct sock *sk = sock->sk;
        struct timespec64 ts;

        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
        ts = ktime_to_timespec64(sock_read_timestamp(sk));
        if (ts.tv_sec == -1)
                return -ENOENT;
        if (ts.tv_sec == 0) {
                ktime_t kt = ktime_get_real();
                sock_write_timestamp(sk, kt);
                ts = ktime_to_timespec64(kt);
        }

        /* From here on tv_nsec carries microseconds for timeval callers. */
        if (timeval)
                ts.tv_nsec /= 1000;

#ifdef CONFIG_COMPAT_32BIT_TIME
        if (time32)
                return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
        /* beware of padding in sparc64 timeval */
        if (timeval && !in_compat_syscall()) {
                /* Staging copy on the kernel stack: it is the source of
                 * copy_to_user(), so it must not be annotated __user.
                 */
                struct __kernel_old_timeval tv = {
                        .tv_sec = ts.tv_sec,
                        .tv_usec = ts.tv_nsec,
                };
                if (copy_to_user(userstamp, &tv, sizeof(tv)))
                        return -EFAULT;
                return 0;
        }
#endif
        return put_timespec64(&ts, userstamp);
}
EXPORT_SYMBOL(sock_gettstamp);

/*
 * Set timestamping flag @flag on @sk if it is not already set, and call
 * net_enable_timestamp() when this is the transition that first makes the
 * socket need net timestamps.
 */
void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
        unsigned long flags_before;

        if (sock_flag(sk, flag))
                return;

        flags_before = sk->sk_flags;
        sock_set_flag(sk, flag);

        /*
         * One of the two flags that require net time stamping was just
         * set, but stamping may already have been switched on because of
         * the other one; only enable it on the first transition.
         */
        if (sock_needs_netstamp(sk) &&
            !(flags_before & SK_FLAGS_TIMESTAMP))
                net_enable_timestamp();
}

/*
 * Receive one message from the socket's error queue.
 *
 * Dequeues a single error skb, copies at most @len bytes of it into @msg
 * (setting MSG_TRUNC when the skb is longer), attaches the extended error as
 * a control message of (@level, @type) and sets MSG_ERRQUEUE.
 *
 * Returns the number of bytes copied, -EAGAIN when the error queue is empty,
 * or the error from skb_copy_datagram_msg(). The skb is always freed.
 */
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
                       int level, int type)
{
        struct sk_buff *skb = sock_dequeue_err_skb(sk);
        struct sock_exterr_skb *serr;
        int nbytes, ret;

        if (!skb)
                return -EAGAIN;

        nbytes = skb->len;
        if (nbytes > len) {
                msg->msg_flags |= MSG_TRUNC;
                nbytes = len;
        }

        ret = skb_copy_datagram_msg(skb, 0, msg, nbytes);
        if (!ret) {
                sock_recv_timestamp(msg, sk, skb);

                serr = SKB_EXT_ERR(skb);
                put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

                msg->msg_flags |= MSG_ERRQUEUE;
                ret = nbytes;
        }

        kfree_skb(skb);
        return ret;
}
EXPORT_SYMBOL(sock_recv_errqueue);

/*
 *        Get a socket option by delegating to the protocol's ->getsockopt().
 *
 *        FIX: POSIX 1003.1g is very ambiguous here. It states that
 *        asynchronous errors should be reported by getsockopt. We assume
 *        this means if you specify SO_ERROR (otherwise what's the point
 *        of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;
        struct proto *prot;

        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
        prot = READ_ONCE(sk->sk_prot);

        return prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

/*
 *        Receive a message through the protocol's ->recvmsg(). On success
 *        (non-negative return) the address length reported by the protocol
 *        is stored in msg->msg_namelen; on error @msg is left untouched here.
 */
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                        int flags)
{
        struct sock *sk = sock->sk;
        int addr_len = 0;
        int ret = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);

        if (ret < 0)
                return ret;

        msg->msg_namelen = addr_len;
        return ret;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *        Set a socket option by delegating to the protocol's ->setsockopt().
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
                           sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct proto *prot;

        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
        prot = READ_ONCE(sk->sk_prot);

        return prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

/*
 * Common release path for a sock: run the protocol's optional ->destroy(),
 * unhash the sock, detach it from its struct socket, orphan it, free its
 * xfrm policy and drop one reference.
 */
void sk_common_release(struct sock *sk)
{
        if (sk->sk_prot->destroy)
                sk->sk_prot->destroy(sk);

        /*
         * Observation: when sk_common_release() is called, processes no
         * longer have access to the socket, but the network stack still
         * does.
         * Step one, detach it from networking:
         *
         * A. Remove it from the hash tables.
         */

        sk->sk_prot->unhash(sk);

        /* Break the socket -> sock link if a socket is still attached. */
        if (sk->sk_socket)
                sk->sk_socket->sk = NULL;

        /*
         * At this point the socket cannot receive new packets, but some may
         * still be in flight because another CPU ran the receive path and
         * did its hash table lookup before we unhashed the socket. Those
         * packets will reach the receive queue and be purged by the socket
         * destructor.
         *
         * There may also be packets pending on the receive queue and,
         * probably, our own packets waiting in device queues. sock_destroy
         * will drain the receive queue, but transmitted packets will delay
         * socket destruction until the last reference is released.
         */

        sock_orphan(sk);

        xfrm_sk_free_policy(sk);

        sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

/*
 * Fill @mem, an array of SK_MEMINFO_VARS u32 slots, with a snapshot of the
 * socket's memory accounting counters. Slots not written below stay zero.
 */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
        memset(mem, 0, SK_MEMINFO_VARS * sizeof(mem[0]));

        /* Receive side */
        mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
        mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);

        /* Send side */
        mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
        mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);

        /* Remaining counters */
        mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
        mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
        mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
        mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
        mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}

#ifdef CONFIG_PROC_FS
/* Bitmap of per-protocol inuse_idx slots currently handed out
 * (see assign_proto_idx() / release_proto_idx()).
 */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
        int cpu, idx = prot->inuse_idx;
        int res = 0;

        for_each_possible_cpu(cpu)
                res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

        return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

int sock_inuse_get(struct net *net)
{
        int cpu, res = 0;

        for_each_possible_cpu(cpu)
                res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;

        return res;
}

EXPORT_SYMBOL_GPL(sock_inuse_get);

/* Per-namespace init: allocate the per-CPU prot_inuse counters.
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int __net_init sock_inuse_init_net(struct net *net)
{
        net->core.prot_inuse = alloc_percpu(struct prot_inuse);

        return net->core.prot_inuse ? 0 : -ENOMEM;
}

/* Per-namespace exit: free the per-CPU counters allocated by
 * sock_inuse_init_net().
 */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
        free_percpu(net->core.prot_inuse);
}

/* Per-namespace hooks that allocate and free the prot_inuse counters. */
static struct pernet_operations net_inuse_ops = {
        .init = sock_inuse_init_net,
        .exit = sock_inuse_exit_net,
};

/* Boot-time registration of net_inuse_ops; failure to register is fatal
 * (panic). Always returns 0 otherwise.
 */
static __init int net_inuse_init(void)
{
        int err = register_pernet_subsys(&net_inuse_ops);

        if (err)
                panic("Cannot initialize net inuse counters");

        return 0;
}

core_initcall(net_inuse_init);

/*
 * Pick the first free slot in proto_inuse_idx for @prot and mark it used.
 *
 * The candidate index is stored in prot->inuse_idx even on failure. Index
 * PROTO_INUSE_NR - 1 is treated as "table exhausted": it is never marked
 * used and -ENOSPC is returned. Returns 0 on success.
 */
static int assign_proto_idx(struct proto *prot)
{
        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

        if (likely(prot->inuse_idx != PROTO_INUSE_NR - 1)) {
                set_bit(prot->inuse_idx, proto_inuse_idx);
                return 0;
        }

        pr_err("PROTO_INUSE_NR exhausted\n");
        return -ENOSPC;
}

/* Return @prot's slot to proto_inuse_idx. Index PROTO_INUSE_NR - 1 is the
 * "exhausted" marker from assign_proto_idx() and was never set, so it is
 * skipped.
 */
static void release_proto_idx(struct proto *prot)
{
        if (prot->inuse_idx == PROTO_INUSE_NR - 1)
                return;

        clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
/* !CONFIG_PROC_FS stub: no index bookkeeping, always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
        return 0;
}

/* !CONFIG_PROC_FS stub: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}

#endif

/* Free the timewait slab name and cache of @twsk_prot (if any) and reset
 * both pointers to NULL. A NULL @twsk_prot is a no-op.
 */
static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
        if (twsk_prot) {
                kfree(twsk_prot->twsk_slab_name);
                twsk_prot->twsk_slab_name = NULL;

                kmem_cache_destroy(twsk_prot->twsk_slab);
                twsk_prot->twsk_slab = NULL;
        }
}

/*
 * Create the timewait sock slab cache for @prot, named "tw_sock_<name>".
 *
 * Returns 0 on success or when @prot has no twsk_prot, -ENOMEM if either the
 * name or the cache cannot be allocated. Nothing is released here on
 * failure; cleanup is left to the caller (see tw_prot_cleanup()).
 */
static int tw_prot_init(const struct proto *prot)
{
        struct timewait_sock_ops *ops = prot->twsk_prot;

        if (!ops)
                return 0;

        ops->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
                                        prot->name);
        if (!ops->twsk_slab_name)
                return -ENOMEM;

        ops->twsk_slab = kmem_cache_create(ops->twsk_slab_name,
                                           ops->twsk_obj_size, 0,
                                           SLAB_ACCOUNT | prot->slab_flags,
                                           NULL);
        if (ops->twsk_slab)
                return 0;

        pr_crit("%s: Can't create timewait sock SLAB cache!\n",
                prot->name);
        return -ENOMEM;
}

/* Free the request sock slab name and cache of @rsk_prot (if any) and reset
 * both pointers to NULL. A NULL @rsk_prot is a no-op.
 */
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
        if (rsk_prot) {
                kfree(rsk_prot->slab_name);
                rsk_prot->slab_name = NULL;

                kmem_cache_destroy(rsk_prot->slab);
                rsk_prot->slab = NULL;
        }
}

/*
 * Create the request sock slab cache for @prot, named "request_sock_<name>".
 *
 * Returns 0 on success or when @prot has no rsk_prot, -ENOMEM if either the
 * name or the cache cannot be allocated. Nothing is released here on
 * failure; cleanup is left to the caller (see req_prot_cleanup()).
 */
static int req_prot_init(const struct proto *prot)
{
        struct request_sock_ops *ops = prot->rsk_prot;

        if (!ops)
                return 0;

        ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
                                   prot->name);
        if (!ops->slab_name)
                return -ENOMEM;

        ops->slab = kmem_cache_create(ops->slab_name,
                                      ops->obj_size, 0,
                                      SLAB_ACCOUNT | prot->slab_flags,
                                      NULL);
        if (ops->slab)
                return 0;

        pr_crit("%s: Can't create request sock SLAB cache!\n",
                prot->name);
        return -ENOMEM;
}

/*
 * Register protocol @prot on proto_list.
 *
 * @prot:       protocol descriptor to register
 * @alloc_slab: non-zero to also create the sock slab cache plus the request
 *              sock and timewait sock caches
 *
 * Returns 0 on success; -EINVAL if memory accounting is configured without
 * sysctl_mem or per_cpu_fw_alloc; -ENOBUFS if a slab cache cannot be
 * created; or the error from assign_proto_idx(). On failure every cache
 * created here is destroyed again.
 */
int proto_register(struct proto *prot, int alloc_slab)
{
        /* Default error for the slab-creation failures below. */
        int ret = -ENOBUFS;

        if (prot->memory_allocated && !prot->sysctl_mem) {
                pr_err("%s: missing sysctl_mem\n", prot->name);
                return -EINVAL;
        }
        if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
                pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
                return -EINVAL;
        }
        if (alloc_slab) {
                prot->slab = kmem_cache_create_usercopy(prot->name,
                                        prot->obj_size, 0,
                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
                                        prot->slab_flags,
                                        prot->useroffset, prot->usersize,
                                        NULL);

                if (prot->slab == NULL) {
                        pr_crit("%s: Can't create sock SLAB cache!\n",
                                prot->name);
                        goto out;
                }

                if (req_prot_init(prot))
                        goto out_free_request_sock_slab;

                if (tw_prot_init(prot))
                        goto out_free_timewait_sock_slab;
        }

        /* Index assignment and list insertion are serialised by
         * proto_list_mutex.
         */
        mutex_lock(&proto_list_mutex);
        ret = assign_proto_idx(prot);
        if (ret) {
                mutex_unlock(&proto_list_mutex);
                goto out_free_timewait_sock_slab;
        }
        list_add(&prot->node, &proto_list);
        mutex_unlock(&proto_list_mutex);
        return ret;

        /* Unwind in reverse order of creation; the labels fall through. */
out_free_timewait_sock_slab:
        if (alloc_slab)
                tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
        if (alloc_slab) {
                req_prot_cleanup(prot->rsk_prot);

                kmem_cache_destroy(prot->slab);
                prot->slab = NULL;
        }
out:
        return ret;
}
EXPORT_SYMBOL(proto_register);

/*
 * Undo proto_register(): release the inuse index and unlink @prot from
 * proto_list under proto_list_mutex, then destroy the sock, request sock and
 * timewait sock slab caches.
 */
void proto_unregister(struct proto *prot)
{
        mutex_lock(&proto_list_mutex);
        release_proto_idx(prot);
        list_del(&prot->node);
        mutex_unlock(&proto_list_mutex);

        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;

        req_prot_cleanup(prot->rsk_prot);
        tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);

/*
 * Request the sock_diag module for @family (when @protocol is 0) or for the
 * (@family, @protocol) pair.
 *
 * Returns -ENOENT without requesting anything when the family is not
 * registered (protocol == 0 case) or, with CONFIG_INET, when an AF_INET
 * protocol other than IPPROTO_RAW and below MAX_INET_PROTOS has no
 * inet_protos[] entry. Otherwise returns the result of request_module().
 */
int sock_load_diag_module(int family, int protocol)
{
        if (protocol) {
#ifdef CONFIG_INET
                if (family == AF_INET &&
                    protocol != IPPROTO_RAW &&
                    protocol < MAX_INET_PROTOS &&
                    !rcu_access_pointer(inet_protos[protocol]))
                        return -ENOENT;
#endif

                return request_module("net-pf-%d-proto-%d-type-%d-%d",
                                      PF_NETLINK, NETLINK_SOCK_DIAG,
                                      family, protocol);
        }

        if (!sock_is_registered(family))
                return -ENOENT;

        return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
                              NETLINK_SOCK_DIAG, family);
}
EXPORT_SYMBOL(sock_load_diag_module);

#ifdef CONFIG_PROC_FS
/* seq_file ->start: take proto_list_mutex (released in proto_seq_stop())
 * and position the iterator at *pos, counting the list head itself as an
 * entry.
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(proto_list_mutex)
{
        mutex_lock(&proto_list_mutex);
        return seq_list_start_head(&proto_list, *pos);
}

/* seq_file ->next: advance to the entry after @v on proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_list_next(v, &proto_list, pos);
}

/* seq_file ->stop: drop proto_list_mutex taken in proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
        __releases(proto_list_mutex)
{
        mutex_unlock(&proto_list_mutex);
}

/* Map a method pointer to the flag character shown in the protocols table:
 * 'y' when the pointer is set, 'n' when it is NULL.
 */
static char proto_method_implemented(const void *method)
{
        if (method)
                return 'y';

        return 'n';
}
/* Memory currently allocated by @proto, or -1 when the protocol does not
 * track memory_allocated.
 */
static long sock_prot_memory_allocated(struct proto *proto)
{
        if (proto->memory_allocated == NULL)
                return -1L;

        return proto_memory_allocated(proto);
}

/* Memory pressure state of @proto as text: "NI" when the protocol has no
 * memory_pressure pointer, otherwise "yes" or "no".
 */
static const char *sock_prot_memory_pressure(struct proto *proto)
{
        if (proto->memory_pressure == NULL)
                return "NI";

        if (proto_memory_pressure(proto))
                return "yes";

        return "no";
}

/*
 * Emit one row of the protocols table for @proto: name, object size, sockets
 * in use in this file's namespace, memory allocated, memory pressure, max
 * header, whether a slab exists, owning module, then one y/n column per
 * method in the same order as the header printed by proto_seq_show().
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
                   sock_prot_inuse_get(seq_file_net(seq), proto),
                   sock_prot_memory_allocated(proto),
                   sock_prot_memory_pressure(proto),
                   proto->max_header,
                   proto->slab == NULL ? "no" : "yes",
                   module_name(proto->owner),
                   /* The 18 method columns: cl co di ac io in de sh ss gs
                    * se re bi br ha uh gp em.
                    */
                   proto_method_implemented(proto->close),
                   proto_method_implemented(proto->connect),
                   proto_method_implemented(proto->disconnect),
                   proto_method_implemented(proto->accept),
                   proto_method_implemented(proto->ioctl),
                   proto_method_implemented(proto->init),
                   proto_method_implemented(proto->destroy),
                   proto_method_implemented(proto->shutdown),
                   proto_method_implemented(proto->setsockopt),
                   proto_method_implemented(proto->getsockopt),
                   proto_method_implemented(proto->sendmsg),
                   proto_method_implemented(proto->recvmsg),
                   proto_method_implemented(proto->bind),
                   proto_method_implemented(proto->backlog_rcv),
                   proto_method_implemented(proto->hash),
                   proto_method_implemented(proto->unhash),
                   proto_method_implemented(proto->get_port),
                   proto_method_implemented(proto->enter_memory_pressure));
}

/* seq_file ->show: print the column header when @v is the list head itself,
 * otherwise print the row for the struct proto that @v is linked into.
 * Always returns 0.
 */
static int proto_seq_show(struct seq_file *seq, void *v)
{
        if (v != &proto_list) {
                proto_seq_printf(seq, list_entry(v, struct proto, node));
                return 0;
        }

        seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
                   "protocol",
                   "size",
                   "sockets",
                   "memory",
                   "press",
                   "maxhdr",
                   "slab",
                   "module",
                   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
        return 0;
}

/* seq_file iterator over proto_list, used for the "protocols" proc entry. */
static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};

/* Per-namespace init: create the read-only "protocols" entry under
 * net->proc_net. Returns 0 on success, -ENOMEM if creation fails.
 */
static __net_init int proto_init_net(struct net *net)
{
        if (proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
                            sizeof(struct seq_net_private)))
                return 0;

        return -ENOMEM;
}

/* Per-namespace exit: remove the "protocols" entry from net->proc_net. */
static __net_exit void proto_exit_net(struct net *net)
{
        remove_proc_entry("protocols", net->proc_net);
}


/* Per-namespace hooks that create and remove the "protocols" proc entry. */
static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};

/* Boot-time registration of proto_net_ops; returns the result of
 * register_pernet_subsys().
 */
static int __init proto_init(void)
{
        return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */

#ifdef CONFIG_NET_RX_BUSY_POLL
/*
 * Busy-poll termination test; @p is the struct sock being polled.
 *
 * Returns true as soon as the socket's receive queue (or, for UDP sockets,
 * the UDP reader_queue) is non-empty; otherwise returns whether the busy-loop
 * timeout measured from @start_time has been reached.
 */
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
        struct sock *sk = p;
        bool data_ready;

        data_ready = !skb_queue_empty_lockless(&sk->sk_receive_queue);
        if (!data_ready && sk_is_udp(sk))
                data_ready = !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue);

        if (data_ready)
                return true;

        return sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */

/* Forward to the protocol's ->bind_add() handler, or return -EOPNOTSUPP when
 * the protocol does not provide one.
 */
int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
{
        if (sk->sk_prot->bind_add)
                return sk->sk_prot->bind_add(sk, addr, addr_len);

        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_bind_add);

/* Copy @size bytes from userspace @arg into kernel buffer @karg, run the
 * protocol ioctl on @karg and, if it returned 0, copy @size bytes back to
 * @arg. Returns -EFAULT if either copy fails, otherwise the ioctl's result.
 */
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
                     void __user *arg, void *karg, size_t size)
{
        int ret;

        if (copy_from_user(karg, arg, size))
                return -EFAULT;

        ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
        if (!ret && copy_to_user(arg, karg, size))
                ret = -EFAULT;

        return ret;
}
EXPORT_SYMBOL(sock_ioctl_inout);

/* The most common ioctl shape: nothing is read from userspace, and the int
 * result produced by the protocol ioctl is written back to @arg only when the
 * ioctl returned 0. Returns the ioctl's non-zero result, or the result of
 * put_user().
 */
static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
{
        int karg = 0;
        int ret;

        ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
        if (!ret)
                ret = put_user(karg, (int __user *)arg);

        return ret;
}

/* Wrapper around sock ioctls that copies data from userspace (depending on
 * the protocol/ioctl) and copies the result back, so that protocol ioctl
 * callbacks are handed kernel memory rather than userspace memory.
 *
 * Raw IPv4, raw IPv6 and phonet sockets get first refusal through their own
 * helpers; a positive return from those means "not handled" and falls
 * through to sock_ioctl_out().
 */
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
        bool is_raw = sk->sk_type == SOCK_RAW;
        int rc = 1;

        if (is_raw && sk->sk_family == AF_INET)
                rc = ipmr_sk_ioctl(sk, cmd, arg);
        else if (is_raw && sk->sk_family == AF_INET6)
                rc = ip6mr_sk_ioctl(sk, cmd, arg);
        else if (sk_is_phonet(sk))
                rc = phonet_sk_ioctl(sk, cmd, arg);

        /* Not claimed by a helper above: use the default handler. */
        if (rc > 0)
                return sock_ioctl_out(sk, cmd, arg);

        /* The ioctl was processed; hand back its value. */
        return rc;
}
EXPORT_SYMBOL(sk_ioctl);

/*
 * Layout check for struct sock: asserts that each listed member belongs to
 * the named cacheline group (sock_write_rx, sock_read_rx, sock_read_rxtx,
 * sock_write_rxtx, sock_write_tx, sock_read_tx). Each member is asserted
 * once. Always returns 0.
 */
static int __init sock_struct_check(void)
{
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
        return 0;
}

core_initcall(sock_struct_check);





































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _LINUX_RANDOM_H
#define _LINUX_RANDOM_H

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/list.h>

#include <uapi/linux/random.h>

struct notifier_block;

void add_device_randomness(const void *buf, size_t len);
void __init add_bootloader_randomness(const void *buf, size_t len);
void add_input_randomness(unsigned int type, unsigned int code,
                          unsigned int value) __latent_entropy;
void add_interrupt_randomness(int irq) __latent_entropy;
void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after);

/*
 * Feed the latent_entropy value to add_device_randomness() when built with
 * the latent entropy plugin (and not under the sparse checker); otherwise
 * call add_device_randomness() with an empty buffer.
 */
static inline void add_latent_entropy(void)
{
#if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__)
        add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy));
#else
        add_device_randomness(NULL, 0);
#endif
}

/*
 * VM fork notification API. Without CONFIG_VMGENID the notifier
 * (un)registration helpers are stubs that report success (0), and
 * add_vmfork_randomness() is not declared at all.
 */
#if IS_ENABLED(CONFIG_VMGENID)
void add_vmfork_randomness(const void *unique_vm_id, size_t len);
int register_random_vmfork_notifier(struct notifier_block *nb);
int unregister_random_vmfork_notifier(struct notifier_block *nb);
#else
static inline int register_random_vmfork_notifier(struct notifier_block *nb) { return 0; }
static inline int unregister_random_vmfork_notifier(struct notifier_block *nb) { return 0; }
#endif

void get_random_bytes(void *buf, size_t len);
u8 get_random_u8(void);
u16 get_random_u16(void);
u32 get_random_u32(void);
u64 get_random_u64(void);
/*
 * Returns a random unsigned long, drawn from get_random_u64() when
 * BITS_PER_LONG is 64 and from get_random_u32() otherwise.
 */
static inline unsigned long get_random_long(void)
{
#if BITS_PER_LONG == 64
        return get_random_u64();
#else
        return get_random_u32();
#endif
}

u32 __get_random_u32_below(u32 ceil);

/*
 * Returns a random integer in the interval [0, ceil), with uniform
 * distribution, suitable for all uses. Fastest when ceil is a constant, but
 * still fast for variable ceil as well.
 *
 * A compile-time constant ceil of 0 is rejected at build time; a constant
 * ceil of 1 always yields 0.
 */
static inline u32 get_random_u32_below(u32 ceil)
{
        /* Non-constant ceil: defer to the out-of-line implementation. */
        if (!__builtin_constant_p(ceil))
                return __get_random_u32_below(ceil);

        /*
         * For the fast path, below, all operations on ceil are precomputed by
         * the compiler, so this incurs no overhead for checking pow2, doing
         * divisions, or branching based on integer size. The resultant
         * algorithm does traditional reciprocal multiplication (typically
         * optimized by the compiler into shifts and adds), rejecting samples
         * whose lower half would indicate a range indivisible by ceil.
         */
        BUILD_BUG_ON_MSG(!ceil, "get_random_u32_below() must take ceil > 0");
        if (ceil <= 1)
                return 0;
        /* Loop until a sample passes the rejection test. */
        for (;;) {
                if (ceil <= 1U << 8) {
                        /* ceil fits in 8 bits: draw a single random byte. */
                        u32 mult = ceil * get_random_u8();
                        if (likely(is_power_of_2(ceil) || (u8)mult >= (1U << 8) % ceil))
                                return mult >> 8;
                } else if (ceil <= 1U << 16) {
                        /* ceil fits in 16 bits: draw a random u16. */
                        u32 mult = ceil * get_random_u16();
                        if (likely(is_power_of_2(ceil) || (u16)mult >= (1U << 16) % ceil))
                                return mult >> 16;
                } else {
                        /* Larger ceil: draw a random u32, multiply in 64 bits. */
                        u64 mult = (u64)ceil * get_random_u32();
                        if (likely(is_power_of_2(ceil) || (u32)mult >= -ceil % ceil))
                                return mult >> 32;
                }
        }
}

/*
 * Uniformly distributed random u32 strictly greater than @floor, i.e. in
 * (floor, U32_MAX]. Works for constant and variable floor alike; a constant
 * floor equal to U32_MAX (which would leave an empty interval) is refused at
 * build time.
 */
static inline u32 get_random_u32_above(u32 floor)
{
        BUILD_BUG_ON_MSG(__builtin_constant_p(floor) && floor == U32_MAX,
                         "get_random_u32_above() must take floor < U32_MAX");

        /* Draw an offset in [0, U32_MAX - floor) and shift it past floor. */
        return get_random_u32_below(U32_MAX - floor) + floor + 1;
}

/*
 * Uniformly distributed random u32 in the closed interval [floor, ceil].
 * Works for constant and variable bounds alike. When both bounds are
 * compile-time constants the build fails if floor > ceil, or if the interval
 * spans the whole u32 range (ceil - floor + 1 would wrap to 0).
 */
static inline u32 get_random_u32_inclusive(u32 floor, u32 ceil)
{
        BUILD_BUG_ON_MSG(__builtin_constant_p(floor) && __builtin_constant_p(ceil) &&
                         (floor > ceil || ceil - floor == U32_MAX),
                         "get_random_u32_inclusive() must take floor <= ceil");

        /* Draw an offset in [0, ceil - floor] and rebase it onto floor. */
        return get_random_u32_below(ceil - floor + 1) + floor;
}

void __init random_init_early(const char *command_line);
void __init random_init(void);
bool rng_is_initialized(void);
int wait_for_random_bytes(void);
int execute_with_initialized_rng(struct notifier_block *nb);

/*
 * Call wait_for_random_bytes(), then fill @buf with @nbytes bytes via
 * get_random_bytes(). The buffer is filled regardless of the wait result;
 * the return value is whatever wait_for_random_bytes() returned.
 */
static inline int get_random_bytes_wait(void *buf, size_t nbytes)
{
        const int wait_status = wait_for_random_bytes();

        get_random_bytes(buf, nbytes);

        return wait_status;
}

#define declare_get_random_var_wait(name, ret_type) \
        static inline int get_random_ ## name ## _wait(ret_type *out) { \
                int ret = wait_for_random_bytes(); \
                if (unlikely(ret)) \
                        return ret; \
                *out = get_random_ ## name(); \
                return 0; \
        }
declare_get_random_var_wait(u8, u8)
declare_get_random_var_wait(u16, u16)
declare_get_random_var_wait(u32, u32)
declare_get_random_var_wait(u64, u32)
declare_get_random_var_wait(long, unsigned long)
#undef declare_get_random_var

/*
 * This is designed to be standalone for just prandom
 * users, but for now we include it from <linux/random.h>
 * for legacy reasons.
 */
#include <linux/prandom.h>

#ifdef CONFIG_SMP
int random_prepare_cpu(unsigned int cpu);
int random_online_cpu(unsigned int cpu);
#endif

#ifndef MODULE
extern const struct file_operations random_fops, urandom_fops;
#endif

#endif /* _LINUX_RANDOM_H */

































































































































































































































































































































    1 









    2 














































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for inet_sock
 *
 * Authors:        Many, reorganised here by
 *                 Arnaldo Carvalho de Melo <acme@mandriva.com>
 */
#ifndef _INET_SOCK_H
#define _INET_SOCK_H

#include <linux/bitops.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/jhash.h>
#include <linux/netdevice.h>

#include <net/flow.h>
#include <net/sock.h>
#include <net/request_sock.h>
#include <net/netns/hash.h>
#include <net/tcp_states.h>
#include <net/l3mdev.h>

/** struct ip_options - IP Options
 *
 * @faddr - Saved first hop address
 * @nexthop - Saved nexthop address in LSRR and SSRR
 * @is_strictroute - Strict source route
 * @srr_is_hit - Packet destination addr was our own
 * @is_changed - IP checksum is no longer valid
 * @rr_needaddr - Need to record addr of outgoing dev
 * @ts_needtime - Need to record timestamp
 * @ts_needaddr - Need to record addr of outgoing dev
 * @__data - Flexible array member; trailing bytes stored after the header
 *
 * NOTE(review): @optlen, @srr, @rr, @ts, @router_alert and @cipso are not
 * described here; presumably a length and per-option offsets into @__data -
 * confirm against the option parsing code.
 */
struct ip_options {
        __be32                faddr;
        __be32                nexthop;
        unsigned char        optlen;
        unsigned char        srr;
        unsigned char        rr;
        unsigned char        ts;
        unsigned char        is_strictroute:1,
                        srr_is_hit:1,
                        is_changed:1,
                        rr_needaddr:1,
                        ts_needtime:1,
                        ts_needaddr:1;
        unsigned char        router_alert;
        unsigned char        cipso;
        unsigned char        __pad2;
        unsigned char        __data[];
};

/*
 * struct ip_options bundled with an rcu_head. Pointers to this type are
 * annotated __rcu where they are stored (inet_request_sock::ireq_opt,
 * inet_sock::inet_opt).
 */
struct ip_options_rcu {
        struct rcu_head rcu;
        struct ip_options opt;
};

/*
 * An ip_options_rcu immediately followed by a 40-byte buffer.
 * NOTE(review): since struct ip_options ends in the flexible array __data[],
 * @data presumably provides its backing storage - confirm with users.
 */
struct ip_options_data {
        struct ip_options_rcu        opt;
        char                        data[40];
};

/*
 * INET flavour of a request sock. struct request_sock is the first member,
 * which is what allows inet_rsk() to cast a request_sock pointer to this
 * type.
 */
struct inet_request_sock {
        struct request_sock        req;
/* Shorthand accessors for fields that live in req.__req_common. */
#define ir_loc_addr                req.__req_common.skc_rcv_saddr
#define ir_rmt_addr                req.__req_common.skc_daddr
#define ir_num                        req.__req_common.skc_num
#define ir_rmt_port                req.__req_common.skc_dport
#define ir_v6_rmt_addr                req.__req_common.skc_v6_daddr
#define ir_v6_loc_addr                req.__req_common.skc_v6_rcv_saddr
#define ir_iif                        req.__req_common.skc_bound_dev_if
#define ir_cookie                req.__req_common.skc_cookie
#define ireq_net                req.__req_common.skc_net
#define ireq_state                req.__req_common.skc_state
#define ireq_family                req.__req_common.skc_family

        /* 15 of the 16 bits are used: two 4-bit scales plus seven flags. */
        u16                        snd_wscale : 4,
                                rcv_wscale : 4,
                                tstamp_ok  : 1,
                                sack_ok           : 1,
                                wscale_ok  : 1,
                                ecn_ok           : 1,
                                acked           : 1,
                                no_srccheck: 1,
                                smc_ok           : 1;
        u32                     ir_mark;
        /*
         * The IPv4 option pointer shares storage with the IPv6 members
         * (the latter only exist when CONFIG_IPV6 is enabled).
         */
        union {
                struct ip_options_rcu __rcu        *ireq_opt;
#if IS_ENABLED(CONFIG_IPV6)
                struct {
                        struct ipv6_txoptions        *ipv6_opt;
                        struct sk_buff                *pktopts;
                };
#endif
        };
};

/*
 * View a request_sock as the inet_request_sock it is embedded in. This is a
 * plain pointer cast (which also drops the const qualifier); no checking is
 * done.
 */
static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
{
        struct inet_request_sock *ireq = (struct inet_request_sock *)sk;

        return ireq;
}

/*
 * Pick the mark for a request: the socket's own sk_mark when it is non-zero;
 * otherwise the skb's mark if the per-netns sysctl_tcp_fwmark_accept is set;
 * otherwise 0.
 */
static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb)
{
        const u32 sk_mark = READ_ONCE(sk->sk_mark);

        /* A socket mark always wins; the sysctl is not even consulted. */
        if (sk_mark)
                return sk_mark;

        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept))
                return skb->mark;

        return sk_mark;
}

/*
 * Device index a request should be bound to: the socket's sk_bound_dev_if
 * when set. With CONFIG_NET_L3_MASTER_DEV, an unbound socket in a netns that
 * has sysctl_tcp_l3mdev_accept enabled instead gets the result of
 * l3mdev_master_ifindex_by_index() for the skb's incoming interface.
 */
static inline int inet_request_bound_dev_if(const struct sock *sk,
                                            struct sk_buff *skb)
{
        int ifindex = READ_ONCE(sk->sk_bound_dev_if);
#ifdef CONFIG_NET_L3_MASTER_DEV
        struct net *net = sock_net(sk);

        if (ifindex == 0 &&
            READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept))
                ifindex = l3mdev_master_ifindex_by_index(net, skb->skb_iif);
#endif

        return ifindex;
}

/*
 * With CONFIG_NET_L3_MASTER_DEV and sysctl_tcp_l3mdev_accept *disabled* in
 * the socket's netns, return l3mdev_master_ifindex_by_index() for the
 * socket's bound device. In every other case return 0.
 */
static inline int inet_sk_bound_l3mdev(const struct sock *sk)
{
        int master_ifindex = 0;
#ifdef CONFIG_NET_L3_MASTER_DEV
        struct net *net = sock_net(sk);

        if (READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept) == 0)
                master_ifindex = l3mdev_master_ifindex_by_index(net,
                                                        sk->sk_bound_dev_if);
#endif

        return master_ifindex;
}

/*
 * Decide whether a socket's device binding matches a packet's interfaces.
 *
 * A bound socket (@bound_dev_if != 0) matches when its device equals either
 * @dif or @sdif. An unbound socket matches when @sdif is 0, or
 * unconditionally when @l3mdev_accept is true.
 */
static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if,
                                     int dif, int sdif)
{
        if (bound_dev_if != 0)
                return bound_dev_if == dif || bound_dev_if == sdif;

        return l3mdev_accept ? true : sdif == 0;
}

/*
 * inet_bound_dev_eq() with the l3mdev_accept argument taken from @net's
 * sysctl_tcp_l3mdev_accept when CONFIG_NET_L3_MASTER_DEV is enabled, and
 * fixed to true otherwise.
 */
static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if,
                                        int dif, int sdif)
{
        bool l3mdev_accept = true;

#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        l3mdev_accept = !!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept);
#endif
        return inet_bound_dev_eq(l3mdev_accept, bound_dev_if, dif, sdif);
}

/*
 * Corking state. Embedded as inet_cork_full::base, which in turn is the
 * inet_sock::cork member ("info to build ip hdr on each ip frag while socket
 * is corked").
 * NOTE(review): per-field meanings beyond the inline comment are not visible
 * from this header - confirm against the IP output path.
 */
struct inet_cork {
        unsigned int                flags;
        __be32                        addr;
        struct ip_options        *opt;
        unsigned int                fragsize;
        int                        length; /* Total length of all frames */
        struct dst_entry        *dst;
        u8                        tx_flags;
        __u8                        ttl;
        __s16                        tos;
        char                        priority;
        __u16                        gso_size;
        u64                        transmit_time;
        u32                        mark;
};

/* An inet_cork together with a struct flowi; this is the type of inet_sock::cork. */
struct inet_cork_full {
        struct inet_cork        base;
        struct flowi                fl;
};

struct ip_mc_socklist;
struct ipv6_pinfo;
struct rtable;

/** struct inet_sock - representation of INET sockets
 *
 * @sk - ancestor class
 * @pinet6 - pointer to IPv6 control block
 * @inet_daddr - Foreign IPv4 addr
 * @inet_rcv_saddr - Bound local IPv4 addr
 * @inet_dport - Destination port
 * @inet_num - Local port
 * @inet_flags - various atomic flags (bit numbers are the INET_FLAGS_* enum)
 * @inet_saddr - Sending source
 * @uc_ttl - Unicast TTL
 * @inet_sport - Source port
 * @inet_id - ID counter for DF pkts
 * @tos - TOS
 * @mc_ttl - Multicasting TTL
 * @uc_index - Unicast outgoing device index
 * @mc_index - Multicast device index
 * @mc_list - Group array
 * @cork - info to build ip hdr on each ip frag while socket is corked
 * @convert_csum - counter adjusted by inet_inc_convert_csum() and
 *                 inet_dec_convert_csum(); non-zero means "enabled" to
 *                 inet_get_convert_csum()
 * @local_port_range - packed as high << 16 | low
 */
struct inet_sock {
        /* sk and pinet6 have to be the first two members of inet_sock */
        struct sock                sk;
#if IS_ENABLED(CONFIG_IPV6)
        struct ipv6_pinfo        *pinet6;
#endif
        /* Socket demultiplex comparisons on incoming packets. */
        /* These four names alias fields inside sk.__sk_common. */
#define inet_daddr                sk.__sk_common.skc_daddr
#define inet_rcv_saddr                sk.__sk_common.skc_rcv_saddr
#define inet_dport                sk.__sk_common.skc_dport
#define inet_num                sk.__sk_common.skc_num

        unsigned long                inet_flags;
        __be32                        inet_saddr;
        __s16                        uc_ttl;
        __be16                        inet_sport;
        struct ip_options_rcu __rcu        *inet_opt;
        atomic_t                inet_id;

        __u8                        tos;
        __u8                        min_ttl;
        __u8                        mc_ttl;
        __u8                        pmtudisc;
        __u8                        rcv_tos;
        __u8                        convert_csum;
        int                        uc_index;
        int                        mc_index;
        __be32                        mc_addr;
        u32                        local_port_range;        /* high << 16 | low */

        struct ip_mc_socklist __rcu        *mc_list;
        struct inet_cork_full        cork;
};

#define IPCORK_OPT        1        /* ip-options has been held in ipcork.opt */

/*
 * Bit numbers within inet_sock::inet_flags. They are consumed by the
 * inet_test_bit()/inet_set_bit()/inet_clear_bit()/inet_assign_bit() macros,
 * which paste the suffix onto INET_FLAGS_.
 *
 * The first group (PKTINFO .. RECVFRAGSIZE) is also turned into the
 * IP_CMSG_* masks via BIT(), so those values must stay in sync with them.
 */
enum {
        INET_FLAGS_PKTINFO        = 0,
        INET_FLAGS_TTL                = 1,
        INET_FLAGS_TOS                = 2,
        INET_FLAGS_RECVOPTS        = 3,
        INET_FLAGS_RETOPTS        = 4,
        INET_FLAGS_PASSSEC        = 5,
        INET_FLAGS_ORIGDSTADDR        = 6,
        INET_FLAGS_CHECKSUM        = 7,
        INET_FLAGS_RECVFRAGSIZE        = 8,

        /* Flags below have no IP_CMSG_* counterpart. */
        INET_FLAGS_RECVERR        = 9,
        INET_FLAGS_RECVERR_RFC4884 = 10,
        INET_FLAGS_FREEBIND        = 11,
        INET_FLAGS_HDRINCL        = 12,
        INET_FLAGS_MC_LOOP        = 13,
        INET_FLAGS_MC_ALL        = 14,
        INET_FLAGS_TRANSPARENT        = 15,
        INET_FLAGS_IS_ICSK        = 16,
        INET_FLAGS_NODEFRAG        = 17,
        INET_FLAGS_BIND_ADDRESS_NO_PORT = 18,
        INET_FLAGS_DEFER_CONNECT = 19,
        INET_FLAGS_MC6_LOOP        = 20,
        INET_FLAGS_RECVERR6_RFC4884 = 21,
        INET_FLAGS_MC6_ALL        = 22,
        INET_FLAGS_AUTOFLOWLABEL_SET = 23,
        INET_FLAGS_AUTOFLOWLABEL = 24,
        INET_FLAGS_DONTFRAG        = 25,
        INET_FLAGS_RECVERR6        = 26,
        INET_FLAGS_REPFLOW        = 27,
        INET_FLAGS_RTALERT_ISOLATE = 28,
        INET_FLAGS_SNDFLOW        = 29,
        INET_FLAGS_RTALERT        = 30,
};

/*
 * cmsg flags for inet
 *
 * Each IP_CMSG_* value is the single-bit mask of the INET_FLAGS_* bit with
 * the same suffix; IP_CMSG_ALL is the OR of all nine and is the mask applied
 * by inet_cmsg_flags().
 */
#define IP_CMSG_PKTINFO                BIT(INET_FLAGS_PKTINFO)
#define IP_CMSG_TTL                BIT(INET_FLAGS_TTL)
#define IP_CMSG_TOS                BIT(INET_FLAGS_TOS)
#define IP_CMSG_RECVOPTS        BIT(INET_FLAGS_RECVOPTS)
#define IP_CMSG_RETOPTS                BIT(INET_FLAGS_RETOPTS)
#define IP_CMSG_PASSSEC                BIT(INET_FLAGS_PASSSEC)
#define IP_CMSG_ORIGDSTADDR        BIT(INET_FLAGS_ORIGDSTADDR)
#define IP_CMSG_CHECKSUM        BIT(INET_FLAGS_CHECKSUM)
#define IP_CMSG_RECVFRAGSIZE        BIT(INET_FLAGS_RECVFRAGSIZE)

#define IP_CMSG_ALL        (IP_CMSG_PKTINFO | IP_CMSG_TTL |                \
                         IP_CMSG_TOS | IP_CMSG_RECVOPTS |                \
                         IP_CMSG_RETOPTS | IP_CMSG_PASSSEC |                \
                         IP_CMSG_ORIGDSTADDR | IP_CMSG_CHECKSUM |        \
                         IP_CMSG_RECVFRAGSIZE)

/*
 * Snapshot inet->inet_flags with READ_ONCE() and keep only the bits covered
 * by IP_CMSG_ALL.
 */
static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
{
        const unsigned long flags = READ_ONCE(inet->inet_flags);

        return flags & IP_CMSG_ALL;
}

/*
 * Bit helpers for inet_sock::inet_flags.
 *
 * @nr is the INET_FLAGS_* suffix (e.g. inet_test_bit(TRANSPARENT, sk)), not
 * a number: the macros paste it onto INET_FLAGS_. @sk is a struct sock
 * pointer; it is converted with inet_sk(). The work is delegated to
 * test_bit()/set_bit()/clear_bit()/assign_bit() respectively.
 */
#define inet_test_bit(nr, sk)                        \
        test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet_set_bit(nr, sk)                        \
        set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet_clear_bit(nr, sk)                        \
        clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet_assign_bit(nr, sk, val)                \
        assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val)

/**
 * sk_to_full_sk - Access to a full socket
 * @sk: pointer to a socket (may be NULL)
 *
 * SYNACK messages might be attached to request sockets, and some places want
 * to reach the listener in that case. With CONFIG_INET, a socket in state
 * TCP_NEW_SYN_RECV is therefore replaced by its rsk_listener; any other
 * socket (or NULL) is returned unchanged.
 */
static inline struct sock *sk_to_full_sk(struct sock *sk)
{
#ifdef CONFIG_INET
        if (!sk || sk->sk_state != TCP_NEW_SYN_RECV)
                return sk;

        return inet_reqsk(sk)->rsk_listener;
#else
        return sk;
#endif
}

/*
 * Const-preserving flavour of sk_to_full_sk(): with CONFIG_INET, a socket in
 * state TCP_NEW_SYN_RECV is viewed as a request_sock and its rsk_listener is
 * returned; anything else (including NULL) is passed through.
 */
static inline const struct sock *sk_const_to_full_sk(const struct sock *sk)
{
#ifdef CONFIG_INET
        if (!sk || sk->sk_state != TCP_NEW_SYN_RECV)
                return sk;

        return ((const struct request_sock *)sk)->rsk_listener;
#else
        return sk;
#endif
}

/* Apply sk_to_full_sk() to the socket attached to @skb (skb->sk). */
static inline struct sock *skb_to_full_sk(const struct sk_buff *skb)
{
        struct sock *owner = skb->sk;

        return sk_to_full_sk(owner);
}

/*
 * Map a pointer to the embedded 'sk' member back to its enclosing
 * struct inet_sock, using container_of_const().
 * NOTE(review): presumably the result keeps the const-ness of @ptr - confirm
 * against the container_of_const() definition.
 */
#define inet_sk(ptr) container_of_const(ptr, struct inet_sock, sk)

static inline void __inet_sk_copy_descendant(struct sock *sk_to,
                                             const struct sock *sk_from,
                                             const int ancestor_size)
{
        memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
               sk_from->sk_prot->obj_size - ancestor_size);
}

int inet_sk_rebuild_header(struct sock *sk);

/**
 * inet_sk_state_load - read sk->sk_state for lockless contexts
 * @sk: socket pointer
 *
 * Counterpart of inet_sk_state_store(). Meant for callers that do not hold
 * the socket lock: tcp_diag_get_info(), tcp_get_info(), tcp_poll(),
 * get_tcp4_sock() ...
 *
 * Return: the socket state, read with smp_load_acquire().
 */
static inline int inet_sk_state_load(const struct sock *sk)
{
        /* Acquire load: a state change might impact lockless readers. */
        const int state = smp_load_acquire(&sk->sk_state);

        return state;
}

/**
 * inet_sk_state_store - update sk->sk_state
 * @sk: socket pointer
 * @newstate: new state
 *
 * Paired with inet_sk_state_load(). Should be used in contexts where
 * state change might impact lockless readers.
 */
void inet_sk_state_store(struct sock *sk, int newstate);

void inet_sk_set_state(struct sock *sk, int state);

/*
 * Hash an IPv4 4-tuple with jhash_3words(): the local address, the foreign
 * address, and one word holding the local port in the upper 16 bits OR-ed
 * with the foreign port, seeded with @initval.
 */
static inline unsigned int __inet_ehashfn(const __be32 laddr,
                                          const __u16 lport,
                                          const __be32 faddr,
                                          const __be16 fport,
                                          u32 initval)
{
        const __u32 local = (__force __u32) laddr;
        const __u32 foreign = (__force __u32) faddr;
        const __u32 ports = ((__u32) lport) << 16 | (__force __u32)fport;

        return jhash_3words(local, foreign, ports, initval);
}

struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
                                      struct sock *sk_listener,
                                      bool attach_listener);

/*
 * Flow flags derived from the socket: FLOWI_FLAG_ANYSRC when either the
 * TRANSPARENT or the HDRINCL inet flag is set, 0 otherwise.
 */
static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
{
        if (inet_test_bit(TRANSPARENT, sk))
                return FLOWI_FLAG_ANYSRC;

        if (inet_test_bit(HDRINCL, sk))
                return FLOWI_FLAG_ANYSRC;

        return 0;
}

static inline void inet_inc_convert_csum(struct sock *sk)
{
        inet_sk(sk)->convert_csum++;
}

static inline void inet_dec_convert_csum(struct sock *sk)
{
        if (inet_sk(sk)->convert_csum > 0)
                inet_sk(sk)->convert_csum--;
}

/* True when the socket's convert_csum counter is non-zero. */
static inline bool inet_get_convert_csum(struct sock *sk)
{
        return inet_sk(sk)->convert_csum != 0;
}


/*
 * True when any one of these holds: the netns sysctl_ip_nonlocal_bind is
 * set, or the socket has the FREEBIND flag, or it has the TRANSPARENT flag.
 * The checks are made in that order and stop at the first hit.
 */
static inline bool inet_can_nonlocal_bind(struct net *net,
                                          struct inet_sock *inet)
{
        if (READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind))
                return true;

        if (test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags))
                return true;

        return test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags);
}

/*
 * True when non-local binding is permitted (inet_can_nonlocal_bind()), when
 * @addr is INADDR_ANY, or when @addr_type is one of RTN_LOCAL,
 * RTN_MULTICAST or RTN_BROADCAST.
 */
static inline bool inet_addr_valid_or_nonlocal(struct net *net,
                                               struct inet_sock *inet,
                                               __be32 addr,
                                               int addr_type)
{
        if (inet_can_nonlocal_bind(net, inet))
                return true;

        if (addr == htonl(INADDR_ANY))
                return true;

        switch (addr_type) {
        case RTN_LOCAL:
        case RTN_MULTICAST:
        case RTN_BROADCAST:
                return true;
        default:
                return false;
        }
}

#endif        /* _INET_SOCK_H */












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    7 
    7 









    2 

    2 


    2 



































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2005-2010 IBM Corporation
 *
 * Author:
 * Mimi Zohar <zohar@us.ibm.com>
 * Kylene Hall <kjhall@us.ibm.com>
 *
 * File: evm_main.c
 *        implements evm_inode_setxattr, evm_inode_post_setxattr,
 *        evm_inode_removexattr, evm_verifyxattr, and evm_inode_set_acl.
 */

#define pr_fmt(fmt) "EVM: "fmt

#include <linux/init.h>
#include <linux/audit.h>
#include <linux/xattr.h>
#include <linux/integrity.h>
#include <linux/evm.h>
#include <linux/magic.h>
#include <linux/posix_acl_xattr.h>
#include <linux/lsm_hooks.h>

#include <crypto/hash.h>
#include <crypto/hash_info.h>
#include <crypto/utils.h>
#include "evm.h"

int evm_initialized;

/*
 * Human-readable names for integrity statuses.
 * NOTE(review): presumably indexed by enum integrity_status, in which case
 * the order here must match the enum - confirm against its definition.
 */
static const char * const integrity_status_msg[] = {
        "pass", "pass_immutable", "fail", "fail_immutable", "no_label",
        "no_xattrs", "unknown"
};
int evm_hmac_attrs;

/*
 * Built-in table of xattr names known to EVM. evm_init_config() appends
 * every entry - enabled or not - to the evm_config_xattrnames list and logs
 * disabled ones with a " (disabled)" suffix.
 *
 * .enabled is fixed at build time from the Kconfig option shown next to each
 * name; XATTR_NAME_CAPS is always enabled.
 */
static struct xattr_list evm_config_default_xattrnames[] = {
        {
         .name = XATTR_NAME_SELINUX,
         .enabled = IS_ENABLED(CONFIG_SECURITY_SELINUX)
        },
        {
         .name = XATTR_NAME_SMACK,
         .enabled = IS_ENABLED(CONFIG_SECURITY_SMACK)
        },
        /* The three extra Smack xattrs share one Kconfig switch. */
        {
         .name = XATTR_NAME_SMACKEXEC,
         .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS)
        },
        {
         .name = XATTR_NAME_SMACKTRANSMUTE,
         .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS)
        },
        {
         .name = XATTR_NAME_SMACKMMAP,
         .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS)
        },
        {
         .name = XATTR_NAME_APPARMOR,
         .enabled = IS_ENABLED(CONFIG_SECURITY_APPARMOR)
        },
        {
         .name = XATTR_NAME_IMA,
         .enabled = IS_ENABLED(CONFIG_IMA_APPRAISE)
        },
        {
         .name = XATTR_NAME_CAPS,
         .enabled = true
        },
};

LIST_HEAD(evm_config_xattrnames);

static int evm_fixmode __ro_after_init;
/*
 * Handler for the "evm=" boot parameter.
 *
 * A value starting with "fix" (only the first three characters are compared)
 * sets evm_fixmode; anything else is reported as invalid and leaves the mode
 * untouched. Always returns 1.
 */
static int __init evm_set_fixmode(char *str)
{
        if (strncmp(str, "fix", 3) == 0)
                evm_fixmode = 1;
        else
                /* Newline-terminated like every other message in this file. */
                pr_err("invalid \"%s\" mode\n", str);

        return 1;
}
__setup("evm=", evm_set_fixmode);

/*
 * Populate evm_config_xattrnames from the built-in default table and set up
 * evm_hmac_attrs.
 *
 * Every default entry is logged and appended to the list, whether or not it
 * is enabled; disabled ones are only marked " (disabled)" in the log. When
 * CONFIG_EVM_ATTR_FSUUID is set, EVM_ATTR_FSUUID is OR-ed into
 * evm_hmac_attrs before the final value is logged.
 */
static void __init evm_init_config(void)
{
        struct xattr_list *entry = evm_config_default_xattrnames;
        struct xattr_list *const end =
                entry + ARRAY_SIZE(evm_config_default_xattrnames);

        pr_info("Initialising EVM extended attributes:\n");
        for (; entry < end; entry++) {
                const char *suffix = entry->enabled ? "" : " (disabled)";

                pr_info("%s%s\n", entry->name, suffix);
                list_add_tail(&entry->list, &evm_config_xattrnames);
        }

#ifdef CONFIG_EVM_ATTR_FSUUID
        evm_hmac_attrs |= EVM_ATTR_FSUUID;
#endif
        pr_info("HMAC attrs: 0x%x\n", evm_hmac_attrs);
}

/* True when any bit of EVM_KEY_MASK is set in evm_initialized. */
static bool evm_key_loaded(void)
{
        return (evm_initialized & EVM_KEY_MASK) != 0;
}

/*
 * Tell whether verification errors may safely be ignored because EVM will
 * never be able to calculate HMACs: the HMAC key is not loaded
 * (EVM_INIT_HMAC clear) and, because EVM_SETUP_COMPLETE is set, it cannot be
 * loaded later either. In that situation, allowing an operation despite
 * invalid attrs/xattrs would not make them valid.
 *
 * Returns true only when EVM_INIT_HMAC is clear and EVM_SETUP_COMPLETE is
 * set.
 */
static bool evm_hmac_disabled(void)
{
        return !(evm_initialized & EVM_INIT_HMAC) &&
               (evm_initialized & EVM_SETUP_COMPLETE);
}

/*
 * Count how many xattrs from evm_config_xattrnames exist on @dentry.
 *
 * Each name is probed with a zero-length __vfs_getxattr() call. -ENODATA
 * means "not present" and is skipped; any other negative result aborts the
 * walk and is returned as-is.
 *
 * Returns the number of xattrs found, -EOPNOTSUPP when the inode lacks
 * IOP_XATTR, or the first unexpected error from __vfs_getxattr().
 */
static int evm_find_protected_xattrs(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);
        struct xattr_list *entry;
        int found = 0;

        if (!(inode->i_opflags & IOP_XATTR))
                return -EOPNOTSUPP;

        list_for_each_entry_lockless(entry, &evm_config_xattrnames, list) {
                int rc = __vfs_getxattr(dentry, inode, entry->name, NULL, 0);

                if (rc >= 0)
                        found++;
                else if (rc != -ENODATA)
                        return rc;
        }

        return found;
}

/*
 * Return 1 if the superblock of the dentry's backing inode carries
 * SB_I_EVM_HMAC_UNSUPPORTED (logging the filesystem type name once),
 * 0 otherwise.
 */
static int is_unsupported_hmac_fs(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        if (!(inode->i_sb->s_iflags & SB_I_EVM_HMAC_UNSUPPORTED))
                return 0;

        pr_info_once("%s not supported\n", inode->i_sb->s_type->name);
        return 1;
}

/*
 * evm_verify_hmac - calculate and compare the HMAC with the EVM xattr
 * @dentry: dentry whose metadata is verified
 * @xattr_name: name of an xattr whose value the caller already holds (may be
 *              NULL); forwarded to the HMAC/hash calculation
 * @xattr_value: value belonging to @xattr_name (may be NULL)
 * @xattr_value_len: length of @xattr_value
 *
 * Compute the HMAC (or hash, for signatures) on the dentry's protected set
 * of extended attributes and compare it against the stored security.evm
 * xattr.
 *
 * For performance:
 * - use the previously retrieved xattr value and length to calculate the
 *   HMAC.
 * - cache the verification result in the iint, when available.
 *
 * Returns integrity status
 */
static enum integrity_status evm_verify_hmac(struct dentry *dentry,
                                             const char *xattr_name,
                                             char *xattr_value,
                                             size_t xattr_value_len)
{
        struct evm_ima_xattr_data *xattr_data = NULL;
        struct signature_v2_hdr *hdr;
        enum integrity_status evm_status = INTEGRITY_PASS;
        struct evm_digest digest;
        struct inode *inode = d_backing_inode(dentry);
        struct evm_iint_cache *iint = evm_iint_inode(inode);
        int rc, xattr_len, evm_immutable = 0;

        /* A cached passing result is returned without re-verification. */
        if (iint && (iint->evm_status == INTEGRITY_PASS ||
                     iint->evm_status == INTEGRITY_PASS_IMMUTABLE))
                return iint->evm_status;

        /*
         * On unsupported filesystems without EVM_INIT_X509 enabled, skip
         * signature verification.
         */
        if (!(evm_initialized & EVM_INIT_X509) &&
            is_unsupported_hmac_fs(dentry))
                return INTEGRITY_UNKNOWN;

        /* if status is not PASS, try to check again - against -ENOMEM */

        /* first need to know the sig type */
        rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_EVM,
                                (char **)&xattr_data, 0, GFP_NOFS);
        if (rc <= 0) {
                /*
                 * No usable security.evm: default to FAIL, refined below.
                 * A negative result from evm_find_protected_xattrs() also
                 * leaves the status at FAIL.
                 */
                evm_status = INTEGRITY_FAIL;
                if (rc == -ENODATA) {
                        rc = evm_find_protected_xattrs(dentry);
                        if (rc > 0)
                                evm_status = INTEGRITY_NOLABEL;
                        else if (rc == 0)
                                evm_status = INTEGRITY_NOXATTRS; /* new file */
                } else if (rc == -EOPNOTSUPP) {
                        evm_status = INTEGRITY_UNKNOWN;
                }
                goto out;
        }

        xattr_len = rc;

        /* check value type */
        switch (xattr_data->type) {
        case EVM_XATTR_HMAC:
                /* An HMAC xattr must be exactly one struct evm_xattr. */
                if (xattr_len != sizeof(struct evm_xattr)) {
                        evm_status = INTEGRITY_FAIL;
                        goto out;
                }

                digest.hdr.algo = HASH_ALGO_SHA1;
                rc = evm_calc_hmac(dentry, xattr_name, xattr_value,
                                   xattr_value_len, &digest, iint);
                if (rc)
                        break;
                /* Any mismatch is mapped to -EINVAL for the status logic below. */
                rc = crypto_memneq(xattr_data->data, digest.digest,
                                   SHA1_DIGEST_SIZE);
                if (rc)
                        rc = -EINVAL;
                break;
        case EVM_XATTR_PORTABLE_DIGSIG:
                /* Remember the type so a failure reports FAIL_IMMUTABLE. */
                evm_immutable = 1;
                fallthrough;
        case EVM_IMA_XATTR_DIGSIG:
                /* accept xattr with non-empty signature field */
                if (xattr_len <= sizeof(struct signature_v2_hdr)) {
                        evm_status = INTEGRITY_FAIL;
                        goto out;
                }

                /* The hash algorithm is taken from the signature header. */
                hdr = (struct signature_v2_hdr *)xattr_data;
                digest.hdr.algo = hdr->hash_algo;
                rc = evm_calc_hash(dentry, xattr_name, xattr_value,
                                   xattr_value_len, xattr_data->type, &digest,
                                   iint);
                if (rc)
                        break;
                rc = integrity_digsig_verify(INTEGRITY_KEYRING_EVM,
                                        (const char *)xattr_data, xattr_len,
                                        digest.digest, digest.hdr.length);
                if (!rc) {
                        if (xattr_data->type == EVM_XATTR_PORTABLE_DIGSIG) {
                                if (iint)
                                        iint->flags |= EVM_IMMUTABLE_DIGSIG;
                                evm_status = INTEGRITY_PASS_IMMUTABLE;
                        } else if (!IS_RDONLY(inode) &&
                                   !(inode->i_sb->s_readonly_remount) &&
                                   !IS_IMMUTABLE(inode) &&
                                   !is_unsupported_hmac_fs(dentry)) {
                                /*
                                 * Valid non-portable signature on a writable
                                 * inode: refresh security.evm.
                                 * NOTE(review): presumably this replaces the
                                 * signature with an HMAC - confirm against
                                 * evm_update_evmxattr().
                                 */
                                evm_update_evmxattr(dentry, xattr_name,
                                                    xattr_value,
                                                    xattr_value_len);
                        }
                }
                break;
        default:
                /* Unknown xattr type. */
                rc = -EINVAL;
                break;
        }

        /* Translate a calculation/verification error into a status. */
        if (rc) {
                if (rc == -ENODATA)
                        evm_status = INTEGRITY_NOXATTRS;
                else if (evm_immutable)
                        evm_status = INTEGRITY_FAIL_IMMUTABLE;
                else
                        evm_status = INTEGRITY_FAIL;
        }
        pr_debug("digest: (%d) [%*phN]\n", digest.hdr.length, digest.hdr.length,
                  digest.digest);
out:
        /* Cache the outcome (when an iint exists) and release the xattr copy. */
        if (iint)
                iint->evm_status = evm_status;
        kfree(xattr_data);
        return evm_status;
}

/*
 * evm_protected_xattr_common - look up a name on the EVM xattr list
 * @req_xattr_name: xattr name to look for
 * @all_xattrs: when false, list entries whose 'enabled' flag is clear are
 *              skipped; when true, every entry is considered
 *
 * An entry matches when the requested name equals the entry's full name, or
 * when the first strlen(@req_xattr_name) bytes of the entry's name past
 * XATTR_SECURITY_PREFIX_LEN equal the requested name.
 *
 * Returns 1 on a match, 0 otherwise.
 */
static int evm_protected_xattr_common(const char *req_xattr_name,
                                      bool all_xattrs)
{
        const int req_len = strlen(req_xattr_name);
        struct xattr_list *entry;

        list_for_each_entry_lockless(entry, &evm_config_xattrnames, list) {
                if (!(all_xattrs || entry->enabled))
                        continue;

                /* Match against the full name. */
                if (strlen(entry->name) == req_len &&
                    strncmp(req_xattr_name, entry->name, req_len) == 0)
                        return 1;

                /* Match against the name with the security prefix skipped. */
                if (strncmp(req_xattr_name,
                            entry->name + XATTR_SECURITY_PREFIX_LEN,
                            req_len) == 0)
                        return 1;
        }

        return 0;
}

/*
 * Return 1 if @req_xattr_name matches an entry of the EVM xattr list whose
 * 'enabled' flag is set, 0 otherwise.
 */
int evm_protected_xattr(const char *req_xattr_name)
{
        const bool all_xattrs = false;

        return evm_protected_xattr_common(req_xattr_name, all_xattrs);
}

/*
 * Return 1 if @req_xattr_name matches any entry of the EVM xattr list,
 * without filtering on the entry's 'enabled' flag, 0 otherwise.
 */
int evm_protected_xattr_if_enabled(const char *req_xattr_name)
{
        const bool all_xattrs = true;

        return evm_protected_xattr_common(req_xattr_name, all_xattrs);
}

/**
 * evm_read_protected_xattrs - read EVM protected xattr names, lengths, values
 * @dentry: dentry of the read xattrs
 * @buffer: destination for names, lengths or values; NULL to only size
 * @buffer_size: size of buffer
 * @type: n: names, l: lengths, v: values
 * @canonical_fmt: data format (true: little endian, false: native format)
 *
 * Walk the EVM xattr list and, for every xattr present on @dentry, emit its
 * name (names are NUL-terminated; the terminator of the previous name is
 * turned into '|'), its length as a u32, or its value. If @buffer is NULL
 * nothing is copied and only the total size is computed.
 *
 * NOTE(review): @buffer_size is only consulted for type 'v'; for 'n' and 'l'
 * the caller presumably sizes @buffer from a prior NULL-buffer call - confirm.
 *
 * Returns the total size on success, a negative value on error (-EINVAL for
 * an unknown @type once a present xattr is found).
 */
int evm_read_protected_xattrs(struct dentry *dentry, u8 *buffer,
                              int buffer_size, char type, bool canonical_fmt)
{
        struct xattr_list *entry;
        int total = 0;

        list_for_each_entry_lockless(entry, &evm_config_xattrnames, list) {
                int chunk;
                int len = __vfs_getxattr(dentry, d_backing_inode(dentry),
                                         entry->name, NULL, 0);

                /* Absent xattrs contribute nothing. */
                if (len == -ENODATA)
                        continue;
                if (len < 0)
                        return len;

                if (type == 'n') {
                        chunk = strlen(entry->name) + 1;
                        if (buffer) {
                                /* Join with the previous name. */
                                if (total)
                                        buffer[total - 1] = '|';

                                memcpy(buffer + total, entry->name, chunk);
                        }
                } else if (type == 'l') {
                        chunk = sizeof(u32);
                        if (buffer) {
                                if (canonical_fmt)
                                        len = (__force int)cpu_to_le32(len);

                                *(u32 *)(buffer + total) = len;
                        }
                } else if (type == 'v') {
                        chunk = len;
                        if (buffer) {
                                len = __vfs_getxattr(dentry,
                                                d_backing_inode(dentry),
                                                entry->name, buffer + total,
                                                buffer_size - total);
                                if (len < 0)
                                        return len;
                        }
                } else {
                        return -EINVAL;
                }

                total += chunk;
        }

        return total;
}

/**
 * evm_verifyxattr - verify the integrity of the requested xattr
 * @dentry: object of the verify xattr
 * @xattr_name: requested xattr
 * @xattr_value: requested xattr value
 * @xattr_value_len: requested xattr value length
 *
 * Verify the dentry's protected metadata against the stored security.evm
 * xattr. The already-retrieved value and length of @xattr_name are handed
 * to the calculation so it need not be read again.
 *
 * Returns INTEGRITY_UNKNOWN when no EVM key is loaded or @xattr_name is not
 * an enabled protected xattr, otherwise the xattr integrity status.
 *
 * This function requires the caller to lock the inode's i_mutex before it
 * is executed.
 */
enum integrity_status evm_verifyxattr(struct dentry *dentry,
                                      const char *xattr_name,
                                      void *xattr_value, size_t xattr_value_len)
{
        if (!evm_key_loaded())
                return INTEGRITY_UNKNOWN;

        if (!evm_protected_xattr(xattr_name))
                return INTEGRITY_UNKNOWN;

        return evm_verify_hmac(dentry, xattr_name, xattr_value,
                               xattr_value_len);
}
EXPORT_SYMBOL_GPL(evm_verifyxattr);

/*
 * evm_verify_current_integrity - verify the dentry's metadata integrity
 * @dentry: pointer to the affected dentry
 *
 * Verify and return the dentry's metadata integrity. INTEGRITY_PASS is
 * reported without verification when no EVM key is loaded, when the backing
 * inode is not a regular file, or in 'fix' mode.
 */
static enum integrity_status evm_verify_current_integrity(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        if (!evm_key_loaded())
                return INTEGRITY_PASS;

        if (!S_ISREG(inode->i_mode))
                return INTEGRITY_PASS;

        if (evm_fixmode)
                return INTEGRITY_PASS;

        return evm_verify_hmac(dentry, NULL, NULL, 0);
}

/*
 * evm_xattr_change - check if passed xattr value differs from current value
 * @idmap: idmap of the mount (not used by the lookup, which is done with
 *         nop_mnt_idmap)
 * @dentry: pointer to the affected dentry
 * @xattr_name: requested xattr
 * @xattr_value: requested xattr value
 * @xattr_value_len: requested xattr value length
 *
 * Read the current value of @xattr_name and compare it, by length and
 * content, with the passed one. A read failure counts as a difference.
 *
 * Returns 1 if passed xattr value differs from current value, 0 otherwise.
 */
static int evm_xattr_change(struct mnt_idmap *idmap,
                            struct dentry *dentry, const char *xattr_name,
                            const void *xattr_value, size_t xattr_value_len)
{
        char *cur_value = NULL;
        int differs = 1;
        int cur_len;

        cur_len = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, xattr_name,
                                     &cur_value, 0, GFP_NOFS);
        if (cur_len >= 0 && cur_len == xattr_value_len)
                differs = memcmp(xattr_value, cur_value, cur_len) != 0;

        kfree(cur_value);
        return differs;
}

/*
 * evm_protect_xattr - protect the EVM extended attribute
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @xattr_name: name of the xattr being set or removed
 * @xattr_value: new value (NULL when called for a removal)
 * @xattr_value_len: length of @xattr_value
 *
 * Prevent security.evm from being modified or removed without the
 * necessary permissions or when the existing value is invalid.
 *
 * The posix xattr acls are 'system' prefixed, which normally would not
 * affect security.evm.  An interesting side effect of writing posix xattr
 * acls is their modifying of the i_mode, which is included in security.evm.
 * For posix xattr acls only, permit security.evm, even if it currently
 * doesn't exist, to be updated unless the EVM signature is immutable.
 *
 * Returns 0 if the operation is permitted, -EPERM otherwise.
 */
static int evm_protect_xattr(struct mnt_idmap *idmap,
                             struct dentry *dentry, const char *xattr_name,
                             const void *xattr_value, size_t xattr_value_len)
{
        enum integrity_status evm_status;

        if (strcmp(xattr_name, XATTR_NAME_EVM) == 0) {
                /* security.evm itself: privileged, and never on unsupported fs. */
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (is_unsupported_hmac_fs(dentry))
                        return -EPERM;
        } else if (!evm_protected_xattr(xattr_name)) {
                /* Unprotected xattr: only posix acls are of interest. */
                if (!posix_xattr_acl(xattr_name))
                        return 0;
                if (is_unsupported_hmac_fs(dentry))
                        return 0;

                /* For acls a missing security.evm (NOXATTRS) is acceptable. */
                evm_status = evm_verify_current_integrity(dentry);
                if ((evm_status == INTEGRITY_PASS) ||
                    (evm_status == INTEGRITY_NOXATTRS))
                        return 0;
                goto out;
        } else if (is_unsupported_hmac_fs(dentry))
                return 0;

        /* security.evm or a protected xattr on a supported filesystem. */
        evm_status = evm_verify_current_integrity(dentry);
        if (evm_status == INTEGRITY_NOXATTRS) {
                struct evm_iint_cache *iint;

                /* Exception if the HMAC is not going to be calculated. */
                if (evm_hmac_disabled())
                        return 0;

                /* Exception for files still flagged as newly created. */
                iint = evm_iint_inode(d_backing_inode(dentry));
                if (iint && (iint->flags & EVM_NEW_FILE))
                        return 0;

                /* exception for pseudo filesystems */
                if (dentry->d_sb->s_magic == TMPFS_MAGIC
                    || dentry->d_sb->s_magic == SYSFS_MAGIC)
                        return 0;

                /*
                 * No exception applies: audit here, then fall through to the
                 * common tail, which audits again and returns -EPERM.
                 */
                integrity_audit_msg(AUDIT_INTEGRITY_METADATA,
                                    dentry->d_inode, dentry->d_name.name,
                                    "update_metadata",
                                    integrity_status_msg[evm_status],
                                    -EPERM, 0);
        }
out:
        /* Exception if the HMAC is not going to be calculated. */
        if (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL ||
            evm_status == INTEGRITY_UNKNOWN))
                return 0;

        /*
         * Writing other xattrs is safe for portable signatures, as portable
         * signatures are immutable and can never be updated.
         */
        if (evm_status == INTEGRITY_FAIL_IMMUTABLE)
                return 0;

        /* A valid portable signature tolerates writes that change nothing. */
        if (evm_status == INTEGRITY_PASS_IMMUTABLE &&
            !evm_xattr_change(idmap, dentry, xattr_name, xattr_value,
                              xattr_value_len))
                return 0;

        /* Audit every denial except the one for a valid portable signature. */
        if (evm_status != INTEGRITY_PASS &&
            evm_status != INTEGRITY_PASS_IMMUTABLE)
                integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry),
                                    dentry->d_name.name, "appraise_metadata",
                                    integrity_status_msg[evm_status],
                                    -EPERM, 0);
        return evm_status == INTEGRITY_PASS ? 0 : -EPERM;
}

/**
 * evm_inode_setxattr - protect the EVM extended attribute
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @xattr_name: pointer to the affected extended attribute name
 * @xattr_value: pointer to the new extended attribute value
 * @xattr_value_len: the new extended attribute value length
 * @flags: flags to pass into filesystem operations
 *
 * Before allowing the 'security.evm' protected xattr to be updated,
 * verify the existing value is valid.  As only the kernel should have
 * access to the EVM encrypted key needed to calculate the HMAC, prevent
 * userspace from writing HMAC value: an empty value is rejected with
 * -EINVAL and any type other than a (portable) signature with -EPERM.
 * Writing 'security.evm' requires CAP_SYS_ADMIN privileges.
 */
static int evm_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
                              const char *xattr_name, const void *xattr_value,
                              size_t xattr_value_len, int flags)
{
        /* Policy permits modification of the protected xattrs even though
         * there's no HMAC key loaded
         */
        if (evm_initialized & EVM_ALLOW_METADATA_WRITES)
                return 0;

        if (!strcmp(xattr_name, XATTR_NAME_EVM)) {
                const struct evm_ima_xattr_data *new_evm = xattr_value;

                if (xattr_value_len == 0)
                        return -EINVAL;

                switch (new_evm->type) {
                case EVM_IMA_XATTR_DIGSIG:
                case EVM_XATTR_PORTABLE_DIGSIG:
                        break;
                default:
                        return -EPERM;
                }
        }

        return evm_protect_xattr(idmap, dentry, xattr_name, xattr_value,
                                 xattr_value_len);
}

/**
 * evm_inode_removexattr - protect the EVM extended attribute
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @xattr_name: pointer to the affected extended attribute name
 *
 * Removing 'security.evm' requires CAP_SYS_ADMIN privileges and that
 * the current value is valid. The removal is checked like a write of an
 * empty value.
 */
static int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry,
                                 const char *xattr_name)
{
        /* Unless policy permits modification of the protected xattrs even
         * though there's no HMAC key loaded, apply the usual protection.
         */
        if (!(evm_initialized & EVM_ALLOW_METADATA_WRITES))
                return evm_protect_xattr(idmap, dentry, xattr_name, NULL, 0);

        return 0;
}

#ifdef CONFIG_FS_POSIX_ACL
/*
 * Return 1 if applying @kacl would change the inode: when @kacl is NULL,
 * when posix_acl_update_mode() fails, or when the mode it computes differs
 * from the current i_mode. Return 0 otherwise.
 */
static int evm_inode_set_acl_change(struct mnt_idmap *idmap,
                                    struct dentry *dentry, const char *name,
                                    struct posix_acl *kacl)
{
        struct inode *inode = d_backing_inode(dentry);
        umode_t new_mode;

        if (!kacl)
                return 1;

        if (posix_acl_update_mode(idmap, inode, &new_mode, &kacl))
                return 1;

        return inode->i_mode != new_mode;
}
#else
/* Without posix acl support no acl can change the inode. */
static inline int evm_inode_set_acl_change(struct mnt_idmap *idmap,
                                           struct dentry *dentry,
                                           const char *name,
                                           struct posix_acl *kacl)
{
        return 0;
}
#endif

/**
 * evm_inode_set_acl - protect the EVM extended attribute from posix acls
 * @idmap: idmap of the idmapped mount
 * @dentry: pointer to the affected dentry
 * @acl_name: name of the posix acl
 * @kacl: pointer to the posix acls
 *
 * Prevent modifying posix acls causing the EVM HMAC to be re-calculated
 * and 'security.evm' xattr updated, unless the existing 'security.evm' is
 * valid.
 *
 * Return: zero on success, -EPERM on failure.
 */
static int evm_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                             const char *acl_name, struct posix_acl *kacl)
{
        enum integrity_status evm_status;

        /* Policy permits modification of the protected xattrs even though
         * there's no HMAC key loaded
         */
        if (evm_initialized & EVM_ALLOW_METADATA_WRITES)
                return 0;

        evm_status = evm_verify_current_integrity(dentry);
        switch (evm_status) {
        case INTEGRITY_PASS:
        case INTEGRITY_NOXATTRS:
                return 0;
        case INTEGRITY_FAIL_IMMUTABLE:
                /*
                 * Writing other xattrs is safe for portable signatures, as
                 * portable signatures are immutable and can never be updated.
                 */
                return 0;
        case INTEGRITY_NOLABEL:
        case INTEGRITY_UNKNOWN:
                /* Exception if the HMAC is not going to be calculated. */
                if (evm_hmac_disabled())
                        return 0;
                break;
        case INTEGRITY_PASS_IMMUTABLE:
                if (!evm_inode_set_acl_change(idmap, dentry, acl_name, kacl))
                        return 0;
                /* Denied without an audit record. */
                return -EPERM;
        default:
                break;
        }

        integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry),
                            dentry->d_name.name, "appraise_metadata",
                            integrity_status_msg[evm_status],
                            -EPERM, 0);
        return -EPERM;
}

/**
 * evm_inode_remove_acl - Protect the EVM extended attribute from posix acls
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @acl_name: name of the posix acl
 *
 * Prevent removing posix acls causing the EVM HMAC to be re-calculated
 * and 'security.evm' xattr updated, unless the existing 'security.evm' is
 * valid. A removal is checked as setting a NULL acl.
 *
 * Return: zero on success, -EPERM on failure.
 */
static int evm_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                                const char *acl_name)
{
        int rc = evm_inode_set_acl(idmap, dentry, acl_name, NULL);

        return rc;
}

/* Drop the cached EVM status of @inode, if it has an EVM cache entry. */
static void evm_reset_status(struct inode *inode)
{
        struct evm_iint_cache *iint = evm_iint_inode(inode);

        if (!iint)
                return;

        iint->evm_status = INTEGRITY_UNKNOWN;
}

/**
 * evm_metadata_changed: Detect changes to the metadata
 * @inode: a file's inode
 * @metadata_inode: metadata inode
 *
 * On a stacked filesystem detect whether the metadata has changed. If this is
 * the case reset the evm_status associated with the inode that represents the
 * file. A metadata inode without i_version support is always treated as
 * changed.
 *
 * Return: true if a change was detected, false otherwise (including when
 * @inode has no EVM cache entry).
 */
bool evm_metadata_changed(struct inode *inode, struct inode *metadata_inode)
{
        struct evm_iint_cache *iint = evm_iint_inode(inode);

        if (!iint)
                return false;

        if (IS_I_VERSION(metadata_inode) &&
            !integrity_inode_attrs_changed(&iint->metadata_inode,
                                           metadata_inode))
                return false;

        iint->evm_status = INTEGRITY_UNKNOWN;
        return true;
}

/**
 * evm_revalidate_status - report whether EVM status re-validation is necessary
 * @xattr_name: pointer to the affected extended attribute name
 *
 * Report whether callers of evm_verifyxattr() should re-validate the
 * EVM status. With a key loaded, that is the case for a NULL name, for
 * enabled protected xattrs, for posix acls and for security.evm itself.
 *
 * Return true if re-validation is necessary, false otherwise.
 */
bool evm_revalidate_status(const char *xattr_name)
{
        if (!evm_key_loaded())
                return false;

        /* evm_inode_post_setattr() passes NULL */
        if (!xattr_name)
                return true;

        return evm_protected_xattr(xattr_name) ||
               posix_xattr_acl(xattr_name) ||
               !strcmp(xattr_name, XATTR_NAME_EVM);
}

/**
 * evm_inode_post_setxattr - update 'security.evm' to reflect the changes
 * @dentry: pointer to the affected dentry
 * @xattr_name: pointer to the affected extended attribute name
 * @xattr_value: pointer to the new extended attribute value
 * @xattr_value_len: the new extended attribute value length
 * @flags: flags to pass into filesystem operations
 *
 * Reset the cached EVM status and, unless security.evm itself was written,
 * the HMAC key is not initialized or the filesystem is unsupported, update
 * the HMAC stored in 'security.evm' to reflect the change.
 *
 * No need to take the i_mutex lock here, as this function is called from
 * __vfs_setxattr_noperm().  The caller of which has taken the inode's
 * i_mutex lock.
 */
static void evm_inode_post_setxattr(struct dentry *dentry,
                                    const char *xattr_name,
                                    const void *xattr_value,
                                    size_t xattr_value_len,
                                    int flags)
{
        if (!evm_revalidate_status(xattr_name))
                return;

        evm_reset_status(dentry->d_inode);

        /* Evaluated left to right; the first true condition ends the hook. */
        if (strcmp(xattr_name, XATTR_NAME_EVM) == 0 ||
            !(evm_initialized & EVM_INIT_HMAC) ||
            is_unsupported_hmac_fs(dentry))
                return;

        evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len);
}

/**
 * evm_inode_post_set_acl - Update the EVM extended attribute from posix acls
 * @dentry: pointer to the affected dentry
 * @acl_name: name of the posix acl
 * @kacl: pointer to the posix acls
 *
 * Update the 'security.evm' xattr with the EVM HMAC re-calculated after setting
 * posix acls. Handled as a post-setxattr of @acl_name with no value.
 */
static void evm_inode_post_set_acl(struct dentry *dentry, const char *acl_name,
                                   struct posix_acl *kacl)
{
        evm_inode_post_setxattr(dentry, acl_name, NULL, 0, 0);
}

/**
 * evm_inode_post_removexattr - update 'security.evm' after removing the xattr
 * @dentry: pointer to the affected dentry
 * @xattr_name: pointer to the affected extended attribute name
 *
 * Reset the cached EVM status and, unless security.evm itself was removed or
 * the HMAC key is not initialized, update the HMAC stored in 'security.evm'
 * to reflect removal of the xattr.
 *
 * No need to take the i_mutex lock here, as this function is called from
 * vfs_removexattr() which takes the i_mutex.
 */
static void evm_inode_post_removexattr(struct dentry *dentry,
                                       const char *xattr_name)
{
        if (!evm_revalidate_status(xattr_name))
                return;

        evm_reset_status(dentry->d_inode);

        if (strcmp(xattr_name, XATTR_NAME_EVM) == 0 ||
            !(evm_initialized & EVM_INIT_HMAC))
                return;

        evm_update_evmxattr(dentry, xattr_name, NULL, 0);
}

/**
 * evm_inode_post_remove_acl - Update the EVM extended attribute from posix acls
 * @idmap: idmap of the mount (not used here)
 * @dentry: pointer to the affected dentry
 * @acl_name: name of the posix acl
 *
 * Update the 'security.evm' xattr with the EVM HMAC re-calculated after
 * removing posix acls. Handled as a post-removexattr of @acl_name.
 */
static inline void evm_inode_post_remove_acl(struct mnt_idmap *idmap,
                                             struct dentry *dentry,
                                             const char *acl_name)
{
        /* Same post-processing as for any removed xattr. */
        evm_inode_post_removexattr(dentry, acl_name);
}

/*
 * Return 1 if @attr would change the inode's uid, gid or (when ATTR_MODE is
 * requested) its mode, 0 if none of them would change.
 */
static int evm_attr_change(struct mnt_idmap *idmap,
                           struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = d_backing_inode(dentry);

        if (i_uid_needs_update(idmap, attr, inode))
                return 1;

        if (i_gid_needs_update(idmap, attr, inode))
                return 1;

        if ((attr->ia_valid & ATTR_MODE) && attr->ia_mode != inode->i_mode)
                return 1;

        return 0;
}

/**
 * evm_inode_setattr - prevent updating an invalid EVM extended attribute
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @attr: iattr structure containing the new file attributes
 *
 * Permit update of file attributes when files have a valid EVM signature,
 * except in the case of them having an immutable portable signature.
 * Only mode, uid and gid changes are checked.
 *
 * Return: zero if the update is permitted, -EPERM otherwise.
 */
static int evm_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                             struct iattr *attr)
{
        enum integrity_status evm_status;

        /* Policy permits modification of the protected attrs even though
         * there's no HMAC key loaded
         */
        if (evm_initialized & EVM_ALLOW_METADATA_WRITES)
                return 0;

        if (is_unsupported_hmac_fs(dentry))
                return 0;

        if (!(attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
                return 0;

        evm_status = evm_verify_current_integrity(dentry);
        switch (evm_status) {
        case INTEGRITY_PASS:
        case INTEGRITY_NOXATTRS:
                return 0;
        case INTEGRITY_FAIL_IMMUTABLE:
                /*
                 * Writing attrs is safe for portable signatures, as portable
                 * signatures are immutable and can never be updated.
                 */
                return 0;
        case INTEGRITY_NOLABEL:
        case INTEGRITY_UNKNOWN:
                /* Exception if the HMAC is not going to be calculated. */
                if (evm_hmac_disabled())
                        return 0;
                break;
        case INTEGRITY_PASS_IMMUTABLE:
                /* A valid portable signature tolerates no-op updates. */
                if (!evm_attr_change(idmap, dentry, attr))
                        return 0;
                break;
        default:
                break;
        }

        integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry),
                            dentry->d_name.name, "appraise_metadata",
                            integrity_status_msg[evm_status], -EPERM, 0);
        return -EPERM;
}

/**
 * evm_inode_post_setattr - update 'security.evm' after modifying metadata
 * @idmap: idmap of the idmapped mount
 * @dentry: pointer to the affected dentry
 * @ia_valid: for the UID and GID status
 *
 * Reset the cached EVM status and, when the HMAC key is initialized and the
 * filesystem is supported, update the HMAC stored in 'security.evm' to
 * reflect mode/UID/GID changes.
 *
 * This function is called from notify_change(), which expects the caller
 * to lock the inode's i_mutex.
 */
static void evm_inode_post_setattr(struct mnt_idmap *idmap,
                                   struct dentry *dentry, int ia_valid)
{
        if (!evm_revalidate_status(NULL))
                return;

        evm_reset_status(dentry->d_inode);

        if (!(evm_initialized & EVM_INIT_HMAC) ||
            is_unsupported_hmac_fs(dentry))
                return;

        if (!(ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
                return;

        evm_update_evmxattr(dentry, NULL, NULL, 0);
}

/*
 * evm_inode_copy_up_xattr - decide whether security.evm may be copied up
 * @src: dentry the xattr would be copied from
 * @name: name of the xattr under consideration
 *
 * Only security.evm is handled. Its type field is read to decide: portable
 * signatures may be copied up, every other type is discarded.
 *
 * Returns -EOPNOTSUPP if @name is not security.evm, -EPERM if the xattr
 * cannot be read or is too short to contain the type field, 0 to allow the
 * copy-up, 1 to discard the xattr.
 */
static int evm_inode_copy_up_xattr(struct dentry *src, const char *name)
{
        struct evm_ima_xattr_data *xattr_data = NULL;
        int rc;

        if (strcmp(name, XATTR_NAME_EVM) != 0)
                return -EOPNOTSUPP;

        /* first need to know the sig type */
        rc = vfs_getxattr_alloc(&nop_mnt_idmap, src, XATTR_NAME_EVM,
                                (char **)&xattr_data, 0, GFP_NOFS);
        if (rc <= 0) {
                /*
                 * Go through the common exit so a buffer that may have been
                 * handed back despite the failure is released; kfree(NULL)
                 * is a no-op otherwise.
                 */
                rc = -EPERM;
                goto out;
        }

        /* The xattr must be long enough to contain the type field. */
        if (rc < offsetof(struct evm_ima_xattr_data, type) +
                 sizeof(xattr_data->type)) {
                rc = -EPERM;
                goto out;
        }

        switch (xattr_data->type) {
        case EVM_XATTR_PORTABLE_DIGSIG:
                rc = 0; /* allow copy-up */
                break;
        case EVM_XATTR_HMAC:
        case EVM_IMA_XATTR_DIGSIG:
        default:
                rc = 1; /* discard */
        }

out:
        kfree(xattr_data);
        return rc;
}

/*
 * evm_inode_init_security - initializes security.evm HMAC value
 * @inode: inode being initialized
 * @dir: parent directory inode (not used here)
 * @qstr: name of the new entry (not used here)
 * @xattrs: NULL-name-terminated array of xattrs set by other LSMs
 * @xattr_count: number of filled slots in @xattrs
 *
 * If the HMAC key is initialized and at least one of @xattrs is an enabled
 * protected xattr, reserve a slot in @xattrs and fill it with a freshly
 * calculated EVM HMAC.
 *
 * Returns 0 on success or when nothing needs to be done, -ENOMEM if the
 * xattr value cannot be allocated, or the negative error from
 * evm_init_hmac().
 */
int evm_inode_init_security(struct inode *inode, struct inode *dir,
                            const struct qstr *qstr, struct xattr *xattrs,
                            int *xattr_count)
{
        struct evm_xattr *xattr_data;
        struct xattr *xattr, *evm_xattr;
        bool need_evm_xattr = false;
        int rc;

        if (!(evm_initialized & EVM_INIT_HMAC) || !xattrs)
                return 0;

        /*
         * security_inode_init_security() makes sure that the xattrs array is
         * contiguous, there is enough space for security.evm, and that there is
         * a terminator at the end of the array.
         *
         * Walk to the terminator: its position is checked below.
         */
        xattr = xattrs;
        while (xattr->name) {
                if (evm_protected_xattr(xattr->name))
                        need_evm_xattr = true;
                xattr++;
        }

        /* EVM xattr not needed. */
        if (!need_evm_xattr)
                return 0;

        evm_xattr = lsm_get_xattr_slot(xattrs, xattr_count);
        /*
         * Array terminator (xattr name = NULL) must be the first non-filled
         * xattr slot.
         */
        WARN_ONCE(evm_xattr != xattr,
                  "%s: xattrs terminator is not the first non-filled slot\n",
                  __func__);

        xattr_data = kzalloc(sizeof(*xattr_data), GFP_NOFS);
        if (!xattr_data)
                return -ENOMEM;

        xattr_data->data.type = EVM_XATTR_HMAC;
        rc = evm_init_hmac(inode, xattrs, xattr_data->digest);
        if (rc < 0) {
                kfree(xattr_data);
                return rc;
        }

        /* The slot now holds the allocated value. */
        evm_xattr->name = XATTR_EVM_SUFFIX;
        evm_xattr->value = xattr_data;
        evm_xattr->value_len = sizeof(*xattr_data);
        return 0;
}
EXPORT_SYMBOL_GPL(evm_inode_init_security);

/*
 * Reset the EVM per-inode state of @inode: no flags set and an unknown
 * integrity status.  Always returns 0.
 */
static int evm_inode_alloc_security(struct inode *inode)
{
        struct evm_iint_cache *cache;

        /* Called by security_inode_alloc(), it cannot be NULL. */
        cache = evm_iint_inode(inode);
        cache->evm_status = INTEGRITY_UNKNOWN;
        cache->flags = 0UL;

        return 0;
}

static void evm_file_release(struct file *file)
{
        struct inode *inode = file_inode(file);
        struct evm_iint_cache *iint = evm_iint_inode(inode);
        fmode_t mode = file->f_mode;

        if (!S_ISREG(inode->i_mode) || !(mode & FMODE_WRITE))
                return;

        if (iint && atomic_read(&inode->i_writecount) == 1)
                iint->flags &= ~EVM_NEW_FILE;
}

/*
 * Mark the inode behind @dentry with EVM_NEW_FILE.  Only regular files that
 * have EVM per-inode state are marked; @idmap is not used.
 */
static void evm_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry)
{
        struct inode *ino = d_backing_inode(dentry);
        struct evm_iint_cache *cache = evm_iint_inode(ino);

        if (S_ISREG(ino->i_mode) && cache)
                cache->flags |= EVM_NEW_FILE;
}

#ifdef CONFIG_EVM_LOAD_X509
/*
 * Load the certificate at CONFIG_EVM_X509_PATH onto the EVM keyring and, if
 * integrity_load_x509() reports success (0), record EVM_INIT_X509 in
 * evm_initialized.  A failure is silently ignored.
 */
void __init evm_load_x509(void)
{
        if (integrity_load_x509(INTEGRITY_KEYRING_EVM, CONFIG_EVM_X509_PATH))
                return;

        evm_initialized |= EVM_INIT_X509;
}
#endif

/*
 * Late initialisation of EVM: set up the xattr configuration, the EVM
 * keyring and the securityfs interface.
 *
 * Return: 0 on success; otherwise the non-zero value returned by
 * integrity_init_keyring() or evm_init_secfs().  On any non-zero result the
 * entries of evm_config_xattrnames are unlinked again.
 */
static int __init init_evm(void)
{
        struct list_head *entry, *next;
        int ret;

        evm_init_config();

        ret = integrity_init_keyring(INTEGRITY_KEYRING_EVM);
        if (!ret) {
                ret = evm_init_secfs();
                if (ret < 0)
                        pr_info("Error registering secfs\n");
        }

        if (ret == 0)
                return 0;

        /* Undo evm_init_config(); walking an empty list is a no-op. */
        list_for_each_safe(entry, next, &evm_config_xattrnames)
                list_del(entry);

        return ret;
}

/*
 * LSM hook table for EVM: pairs each LSM hook name with the EVM function
 * that implements it.  Registered by security_add_hooks() in init_evm_lsm().
 */
static struct security_hook_list evm_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(inode_setattr, evm_inode_setattr),
        LSM_HOOK_INIT(inode_post_setattr, evm_inode_post_setattr),
        LSM_HOOK_INIT(inode_copy_up_xattr, evm_inode_copy_up_xattr),
        LSM_HOOK_INIT(inode_setxattr, evm_inode_setxattr),
        LSM_HOOK_INIT(inode_post_setxattr, evm_inode_post_setxattr),
        LSM_HOOK_INIT(inode_set_acl, evm_inode_set_acl),
        LSM_HOOK_INIT(inode_post_set_acl, evm_inode_post_set_acl),
        LSM_HOOK_INIT(inode_remove_acl, evm_inode_remove_acl),
        LSM_HOOK_INIT(inode_post_remove_acl, evm_inode_post_remove_acl),
        LSM_HOOK_INIT(inode_removexattr, evm_inode_removexattr),
        LSM_HOOK_INIT(inode_post_removexattr, evm_inode_post_removexattr),
        LSM_HOOK_INIT(inode_init_security, evm_inode_init_security),
        LSM_HOOK_INIT(inode_alloc_security, evm_inode_alloc_security),
        LSM_HOOK_INIT(file_release, evm_file_release),
        LSM_HOOK_INIT(path_post_mknod, evm_post_path_mknod),
};

/* LSM identity (name and id) passed to security_add_hooks() for EVM. */
static const struct lsm_id evm_lsmid = {
        .name = "evm",
        .id = LSM_ID_EVM,
};

/*
 * LSM init callback (see the DEFINE_LSM(evm) entry): registers every entry
 * of evm_hooks under the evm_lsmid identity.  Always returns 0.
 */
static int __init init_evm_lsm(void)
{
        security_add_hooks(evm_hooks, ARRAY_SIZE(evm_hooks), &evm_lsmid);
        return 0;
}

/*
 * Space EVM requests from the LSM core: one struct evm_iint_cache per inode
 * and one xattr slot.
 * NOTE(review): the xattr slot is presumably the one that
 * evm_inode_init_security() obtains via lsm_get_xattr_slot() - confirm.
 */
struct lsm_blob_sizes evm_blob_sizes __ro_after_init = {
        .lbs_inode = sizeof(struct evm_iint_cache),
        .lbs_xattr_count = 1,
};

/*
 * LSM descriptor for EVM: init_evm_lsm() is the init callback,
 * evm_blob_sizes describes the requested blobs, and the ordering is
 * LSM_ORDER_LAST.
 */
DEFINE_LSM(evm) = {
        .name = "evm",
        .init = init_evm_lsm,
        .order = LSM_ORDER_LAST,
        .blobs = &evm_blob_sizes,
};

/* init_evm() is run through the late_initcall mechanism. */
late_initcall(init_evm);


























































    2 







    3 

    2 

    2 







































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2019 Facebook
 * Copyright 2020 Google LLC.
 */

#include <linux/rculist.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/bpf.h>
#include <linux/bpf_local_storage.h>
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>

/*
 * Storage cache object for inode storage maps; it is handed to
 * bpf_local_storage_map_alloc() and bpf_local_storage_map_free() below.
 */
DEFINE_BPF_STORAGE_CACHE(inode_cache);

/*
 * Return the address of the local-storage pointer kept in the BPF blob of
 * the inode @owner, or NULL when bpf_inode() yields no blob for it.
 */
static struct bpf_local_storage __rcu **
inode_storage_ptr(void *owner)
{
        struct inode *ino = owner;
        struct bpf_storage_blob *blob = bpf_inode(ino);

        return blob ? &blob->storage : NULL;
}

/*
 * Find the storage element that @map has attached to @inode.
 *
 * Returns NULL when the inode has no BPF blob or no local storage yet;
 * otherwise returns whatever bpf_local_storage_lookup() reports.
 * @cacheit_lockit is forwarded to that function unchanged.
 */
static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
                                                           struct bpf_map *map,
                                                           bool cacheit_lockit)
{
        struct bpf_storage_blob *blob = bpf_inode(inode);
        struct bpf_local_storage *storage;

        if (!blob)
                return NULL;

        storage = rcu_dereference_check(blob->storage, bpf_rcu_lock_held());
        if (!storage)
                return NULL;

        return bpf_local_storage_lookup(storage,
                                        (struct bpf_local_storage_map *)map,
                                        cacheit_lockit);
}

/*
 * Destroy the BPF local storage attached to @inode, if there is any.  The
 * storage pointer is read and destroyed inside an RCU read-side section.
 */
void bpf_inode_storage_free(struct inode *inode)
{
        struct bpf_storage_blob *blob = bpf_inode(inode);
        struct bpf_local_storage *storage;

        if (!blob)
                return;

        rcu_read_lock();

        storage = rcu_dereference(blob->storage);
        if (storage)
                bpf_local_storage_destroy(storage);

        rcu_read_unlock();
}

/*
 * map_lookup_elem callback: @key points to an int file descriptor.
 *
 * Return: ERR_PTR(-EBADF) if the descriptor cannot be resolved, NULL if the
 * file's inode has no element in @map, otherwise the element's data.
 */
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
{
        struct fd f = fdget_raw(*(int *)key);
        struct bpf_local_storage_data *sdata;
        void *val = NULL;

        if (!f.file)
                return ERR_PTR(-EBADF);

        sdata = inode_storage_lookup(file_inode(f.file), map, true);
        fdput(f);

        if (sdata)
                val = sdata->data;
        return val;
}

/*
 * map_update_elem callback: @key points to an int file descriptor, @value
 * and @map_flags are forwarded to bpf_local_storage_update().
 *
 * Return: -EBADF if the descriptor cannot be resolved or its inode has no
 * storage pointer, otherwise 0 or the error encoded in the pointer returned
 * by bpf_local_storage_update().
 */
static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
                                             void *value, u64 map_flags)
{
        struct fd f = fdget_raw(*(int *)key);
        struct bpf_local_storage_data *sdata;
        struct inode *ino;
        long ret;

        if (!f.file)
                return -EBADF;

        ino = file_inode(f.file);
        if (!inode_storage_ptr(ino)) {
                ret = -EBADF;
                goto put;
        }

        sdata = bpf_local_storage_update(ino,
                                         (struct bpf_local_storage_map *)map,
                                         value, map_flags, GFP_ATOMIC);
        ret = PTR_ERR_OR_ZERO(sdata);
put:
        fdput(f);
        return ret;
}

/*
 * Unlink the element that @map has attached to @inode.
 *
 * Return: 0 when an element was found and unlinked, -ENOENT when there is
 * none.
 */
static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
{
        struct bpf_local_storage_data *found;

        found = inode_storage_lookup(inode, map, false);
        if (found) {
                bpf_selem_unlink(SELEM(found), false);
                return 0;
        }

        return -ENOENT;
}

/*
 * map_delete_elem callback: @key points to an int file descriptor.
 *
 * Return: -EBADF if the descriptor cannot be resolved, otherwise the result
 * of inode_storage_delete() on the file's inode.
 */
static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
{
        struct fd f = fdget_raw(*(int *)key);
        int ret;

        if (!f.file)
                return -EBADF;

        ret = inode_storage_delete(file_inode(f.file), map);
        fdput(f);

        return ret;
}

/*
 * BPF helper: return the data of the element @map has attached to @inode,
 * creating it from @value when BPF_LOCAL_STORAGE_GET_F_CREATE is set and no
 * element exists yet.  The result is 0 (NULL) on unknown flags, a NULL
 * inode, an inode without a storage pointer, a missing element without the
 * create flag, or a failed creation.
 *
 * *gfp_flags* is a hidden argument provided by the verifier
 */
BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
           void *, value, u64, flags, gfp_t, gfp_flags)
{
        struct bpf_local_storage_data *sdata;

        WARN_ON_ONCE(!bpf_rcu_lock_held());

        /* Only the create flag is accepted. */
        if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
                return (unsigned long)NULL;

        /*
         * inode_storage_lookup() returns NULL when the storage pointer is
         * missing, and bpf_local_storage_update() expects the owner to have
         * a valid one, so check for it explicitly up front.
         */
        if (!inode || !inode_storage_ptr(inode))
                return (unsigned long)NULL;

        sdata = inode_storage_lookup(inode, map, true);
        if (sdata)
                return (unsigned long)sdata->data;

        if (!(flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
                return (unsigned long)NULL;

        /*
         * This helper must only be called from where the inode is guaranteed
         * to have a refcount and cannot be freed.
         */
        sdata = bpf_local_storage_update(inode,
                                         (struct bpf_local_storage_map *)map,
                                         value, BPF_NOEXIST, gfp_flags);
        if (IS_ERR(sdata))
                return (unsigned long)NULL;

        return (unsigned long)sdata->data;
}

/*
 * BPF helper: remove the element @map has attached to @inode.
 *
 * Returns -EINVAL for a NULL inode, otherwise the result of
 * inode_storage_delete().
 */
BPF_CALL_2(bpf_inode_storage_delete,
           struct bpf_map *, map, struct inode *, inode)
{
        WARN_ON_ONCE(!bpf_rcu_lock_held());

        /*
         * This helper must only be called from where the inode is guaranteed
         * to have a refcount and cannot be freed.
         */
        if (inode)
                return inode_storage_delete(inode, map);

        return -EINVAL;
}

/*
 * map_get_next_key callback: key iteration is not offered for this map
 * type, so every call fails with -ENOTSUPP and the arguments are ignored.
 */
static int notsupp_get_next_key(struct bpf_map *map, void *key,
                                void *next_key)
{
        return -ENOTSUPP;
}

/*
 * map_alloc callback: delegates to bpf_local_storage_map_alloc() using the
 * inode storage cache.
 * NOTE(review): the meaning of the trailing 'false' argument is defined by
 * bpf_local_storage_map_alloc() and is not visible here - confirm.
 */
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{
        return bpf_local_storage_map_alloc(attr, &inode_cache, false);
}

/*
 * map_free callback: delegates to bpf_local_storage_map_free() using the
 * inode storage cache; NULL is passed as the third argument.
 */
static void inode_storage_map_free(struct bpf_map *map)
{
        bpf_local_storage_map_free(map, &inode_cache, NULL);
}

/*
 * Map operations for the inode storage map type.  The lookup/update/delete
 * element callbacks interpret the key as an int file descriptor and act on
 * the inode behind that file; get_next_key always fails with -ENOTSUPP.
 * The remaining callbacks are the generic bpf_local_storage ones.
 */
const struct bpf_map_ops inode_storage_map_ops = {
        .map_meta_equal = bpf_map_meta_equal,
        .map_alloc_check = bpf_local_storage_map_alloc_check,
        .map_alloc = inode_storage_map_alloc,
        .map_free = inode_storage_map_free,
        .map_get_next_key = notsupp_get_next_key,
        .map_lookup_elem = bpf_fd_inode_storage_lookup_elem,
        .map_update_elem = bpf_fd_inode_storage_update_elem,
        .map_delete_elem = bpf_fd_inode_storage_delete_elem,
        .map_check_btf = bpf_local_storage_map_check_btf,
        .map_mem_usage = bpf_local_storage_map_mem_usage,
        .map_btf_id = &bpf_local_storage_map_btf_id[0],
        .map_owner_storage_ptr = inode_storage_ptr,
};

/*
 * Single-entry BTF id list for 'struct inode'; used as arg2_btf_id by the
 * helper prototypes below.
 */
BTF_ID_LIST_SINGLE(bpf_inode_storage_btf_ids, struct, inode)

/*
 * Prototype of the bpf_inode_storage_get() helper: a map pointer, a
 * possibly-NULL 'struct inode' BTF pointer, a possibly-NULL map value and a
 * flags word; the result is a map value pointer or NULL.  The helper's fifth
 * (gfp_flags) argument is not listed here; per the comment on the helper it
 * is supplied by the verifier.
 */
const struct bpf_func_proto bpf_inode_storage_get_proto = {
        .func                = bpf_inode_storage_get,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_MAP_VALUE_OR_NULL,
        .arg1_type        = ARG_CONST_MAP_PTR,
        .arg2_type        = ARG_PTR_TO_BTF_ID_OR_NULL,
        .arg2_btf_id        = &bpf_inode_storage_btf_ids[0],
        .arg3_type        = ARG_PTR_TO_MAP_VALUE_OR_NULL,
        .arg4_type        = ARG_ANYTHING,
};

/*
 * Prototype of the bpf_inode_storage_delete() helper: a map pointer and a
 * possibly-NULL 'struct inode' BTF pointer; the result is an integer.
 */
const struct bpf_func_proto bpf_inode_storage_delete_proto = {
        .func                = bpf_inode_storage_delete,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_CONST_MAP_PTR,
        .arg2_type        = ARG_PTR_TO_BTF_ID_OR_NULL,
        .arg2_btf_id        = &bpf_inode_storage_btf_ids[0],
};































































































    1 




















    3 

    1 






























































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/ipv6/fib6_rules.c        IPv6 Routing Policy Rules
 *
 * Copyright (C)2003-2006 Helsinki University of Technology
 * Copyright (C)2003-2006 USAGI/WIDE Project
 *
 * Authors
 *        Thomas Graf                <tgraf@suug.ch>
 *        Ville Nuorvala                <vnuorval@tcs.hut.fi>
 */

#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/export.h>
#include <linux/indirect_call_wrapper.h>

#include <net/fib_rules.h>
#include <net/inet_dscp.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>
#include <net/netlink.h>

/*
 * IPv6 policy routing rule.  Code in this file converts between
 * 'struct fib_rule *' and 'struct fib6_rule *' both with container_of() and
 * with plain casts, so @common has to stay the first member.
 */
struct fib6_rule {
        /* generic rule state shared with the fib_rules core */
        struct fib_rule                common;
        /* source selector: address and prefix length (plen 0 = no match on source) */
        struct rt6key                src;
        /* destination selector: address and prefix length (plen 0 = no match on destination) */
        struct rt6key                dst;
        /* DSCP selector; 0 means the rule does not match on DSCP */
        dscp_t                        dscp;
};

static bool fib6_rule_matchall(const struct fib_rule *rule)
{
        struct fib6_rule *r = container_of(rule, struct fib6_rule, common);

        if (r->dst.plen || r->src.plen || r->dscp)
                return false;
        return fib_rule_matchall(rule);
}

/*
 * Report whether @rule is a default rule: a match-all, non-l3mdev
 * FR_ACT_TO_TBL rule that points at the local or the main table.
 */
bool fib6_rule_default(const struct fib_rule *rule)
{
        if (!fib6_rule_matchall(rule))
                return false;

        if (rule->action != FR_ACT_TO_TBL || rule->l3mdev)
                return false;

        return rule->table == RT6_TABLE_LOCAL ||
               rule->table == RT6_TABLE_MAIN;
}
EXPORT_SYMBOL_GPL(fib6_rule_default);

/*
 * Thin wrapper: calls fib_rules_dump() for the AF_INET6 family with @net,
 * @nb and @extack and returns its result.
 */
int fib6_rules_dump(struct net *net, struct notifier_block *nb,
                    struct netlink_ext_ack *extack)
{
        return fib_rules_dump(net, nb, AF_INET6, extack);
}

/*
 * Thin wrapper: returns the value of fib_rules_seq_read() for the AF_INET6
 * family in @net.
 */
unsigned int fib6_rules_seq_read(struct net *net)
{
        return fib_rules_seq_read(net, AF_INET6);
}

/*
 * fib6_lookup - FIB lookup for @fl6, honouring policy rules when present
 *
 * Without custom rules the local table is consulted first and the main
 * table is tried when that lookup fails or yields the null entry.  With
 * custom rules the rule list is walked with fib6_table_lookup() as the
 * per-table lookup and a pointer to @oif as its lookup data.
 *
 * Called with rcu lock held; no reference taken on fib6_info.
 *
 * Return: the result of the last table or rule lookup performed.
 */
int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
                struct fib6_result *res, int flags)
{
        int ret;

        if (!net->ipv6.fib6_has_custom_rules) {
                ret = fib6_table_lookup(net, net->ipv6.fib6_local_tbl, oif,
                                        fl6, res, flags);
                if (ret || res->f6i == net->ipv6.fib6_null_entry)
                        ret = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
                                                oif, fl6, res, flags);
        } else {
                struct fib_lookup_arg arg = {
                        .lookup_ptr = fib6_table_lookup,
                        .lookup_data = &oif,
                        .result = res,
                        .flags = FIB_LOOKUP_NOREF,
                };

                l3mdev_update_flow(net, flowi6_to_flowi(fl6));

                ret = fib_rules_lookup(net->ipv6.fib6_rules_ops,
                                       flowi6_to_flowi(fl6), flags, &arg);
        }

        return ret;
}

/*
 * fib6_rule_lookup - route lookup for @fl6 through the policy rules
 *
 * Without custom rules @lookup is run on the local table and then on the
 * main table; the first usable route is returned.  With custom rules the
 * rule list is walked and the route left in the fib6_result, if any, is
 * returned.  When nothing usable is found the namespace's null entry is
 * returned, with a reference taken unless RT6_LOOKUP_F_DST_NOREF is set in
 * @flags.
 */
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
                                   const struct sk_buff *skb,
                                   int flags, pol_lookup_t lookup)
{
        if (!net->ipv6.fib6_has_custom_rules) {
                struct rt6_info *route;

                route = pol_lookup_func(lookup,
                             net, net->ipv6.fib6_local_tbl, fl6, skb, flags);
                if (route != net->ipv6.ip6_null_entry &&
                    route->dst.error != -EAGAIN)
                        return &route->dst;
                ip6_rt_put_flags(route, flags);

                route = pol_lookup_func(lookup,
                             net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
                if (route->dst.error != -EAGAIN)
                        return &route->dst;
                ip6_rt_put_flags(route, flags);
        } else {
                struct fib6_result res = {};
                struct fib_lookup_arg arg = {
                        .lookup_ptr = lookup,
                        .lookup_data = skb,
                        .result = &res,
                        .flags = FIB_LOOKUP_NOREF,
                };

                /* update flow if oif or iif point to device enslaved to l3mdev */
                l3mdev_update_flow(net, flowi6_to_flowi(fl6));

                fib_rules_lookup(net->ipv6.fib6_rules_ops,
                                 flowi6_to_flowi(fl6), flags, &arg);

                if (res.rt6)
                        return &res.rt6->dst;
        }

        /* Fall back to the null entry, holding it unless the caller opted out. */
        if (!(flags & RT6_LOOKUP_F_DST_NOREF))
                dst_hold(&net->ipv6.ip6_null_entry->dst);
        return &net->ipv6.ip6_null_entry->dst;
}

static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
                           struct flowi6 *flp6, const struct net_device *dev)
{
        struct fib6_rule *r = (struct fib6_rule *)rule;

        /* If we need to find a source address for this traffic,
         * we check the result if it meets requirement of the rule.
         */
        if ((rule->flags & FIB_RULE_FIND_SADDR) &&
            r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
                struct in6_addr saddr;

                if (ipv6_dev_get_saddr(net, dev, &flp6->daddr,
                                       rt6_flags2srcprefs(flags), &saddr))
                        return -EAGAIN;

                if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen))
                        return -EAGAIN;

                flp6->saddr = saddr;
        }

        return 0;
}

/*
 * Rule action used when the lookup function is fib6_table_lookup() (see
 * fib6_rule_action()).  arg->lookup_data is read as a pointer to the output
 * interface index and arg->result as the fib6_result to fill.
 *
 * Return: -ENETUNREACH / -EACCES / -EINVAL for the unreachable, prohibit and
 * all remaining non-table actions; -EAGAIN when the table does not exist,
 * the table lookup fails or yields the null entry; otherwise the result of
 * fib6_rule_saddr().
 */
static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
                                int flags, struct fib_lookup_arg *arg)
{
        struct fib6_result *res = arg->result;
        struct flowi6 *flp6 = &flp->u.ip6;
        struct net *net = rule->fr_net;
        struct fib6_table *table;
        int *oif;
        int err;

        if (rule->action == FR_ACT_UNREACHABLE)
                return -ENETUNREACH;
        if (rule->action == FR_ACT_PROHIBIT)
                return -EACCES;
        /* blackhole and anything unknown */
        if (rule->action != FR_ACT_TO_TBL)
                return -EINVAL;

        table = fib6_get_table(net, fib_rule_get_table(rule, arg));
        if (!table)
                return -EAGAIN;

        oif = (int *)arg->lookup_data;
        err = fib6_table_lookup(net, table, *oif, flp6, res, flags);
        if (err || res->f6i == net->ipv6.fib6_null_entry)
                return -EAGAIN;

        return fib6_rule_saddr(net, rule, flags, flp6, res->nh->fib_nh_dev);
}

/*
 * Rule action used for route (rt6_info) lookups; the lookup function is
 * taken from arg->lookup_ptr and the outcome is stored in the fib6_result
 * at arg->result (res->rt6 is always written before returning).
 *
 * For the unreachable, blackhole/unknown and prohibit actions the matching
 * special route entry of the namespace is returned together with
 * -ENETUNREACH, -EINVAL or -EACCES.  For FR_ACT_TO_TBL the table is looked
 * up: a usable route is returned with its dst.error as return value; in all
 * other cases res->rt6 is NULL and -EAGAIN is returned.
 */
static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
                              int flags, struct fib_lookup_arg *arg)
{
        struct fib6_result *res = arg->result;
        struct flowi6 *flp6 = &flp->u.ip6;
        struct rt6_info *rt = NULL;
        struct fib6_table *table;
        struct net *net = rule->fr_net;
        pol_lookup_t lookup = arg->lookup_ptr;
        int err = 0;
        u32 tb_id;

        switch (rule->action) {
        case FR_ACT_TO_TBL:
                break;
        case FR_ACT_UNREACHABLE:
                err = -ENETUNREACH;
                rt = net->ipv6.ip6_null_entry;
                goto discard_pkt;
        default:
        case FR_ACT_BLACKHOLE:
                err = -EINVAL;
                rt = net->ipv6.ip6_blk_hole_entry;
                goto discard_pkt;
        case FR_ACT_PROHIBIT:
                err = -EACCES;
                rt = net->ipv6.ip6_prohibit_entry;
                goto discard_pkt;
        }

        tb_id = fib_rule_get_table(rule, arg);
        table = fib6_get_table(net, tb_id);
        if (!table) {
                /* rt is still NULL here, so res->rt6 ends up NULL */
                err = -EAGAIN;
                goto out;
        }

        rt = pol_lookup_func(lookup,
                             net, table, flp6, arg->lookup_data, flags);
        if (rt != net->ipv6.ip6_null_entry) {
                struct inet6_dev *idev = ip6_dst_idev(&rt->dst);

                if (!idev)
                        goto again;
                /* deferred source address check; yields 0 or -EAGAIN */
                err = fib6_rule_saddr(net, rule, flags, flp6,
                                      idev->dev);

                if (err == -EAGAIN)
                        goto again;

                /* the route's own error decides from here on */
                err = rt->dst.error;
                if (err != -EAGAIN)
                        goto out;
        }
again:
        /* route not usable for this rule: drop it and report -EAGAIN */
        ip6_rt_put_flags(rt, flags);
        err = -EAGAIN;
        rt = NULL;
        goto out;

discard_pkt:
        /* take a reference on the special entry unless the caller opted out */
        if (!(flags & RT6_LOOKUP_F_DST_NOREF))
                dst_hold(&rt->dst);
out:
        res->rt6 = rt;
        return err;
}

/*
 * 'action' callback of the IPv6 rule ops: picks the implementation based on
 * the lookup function stored in @arg.  fib6_table_lookup() selects
 * fib6_rule_action_alt(); every other lookup function goes through
 * __fib6_rule_action().
 */
INDIRECT_CALLABLE_SCOPE int fib6_rule_action(struct fib_rule *rule,
                                             struct flowi *flp, int flags,
                                             struct fib_lookup_arg *arg)
{
        if (arg->lookup_ptr != fib6_table_lookup)
                return __fib6_rule_action(rule, flp, flags, arg);

        return fib6_rule_action_alt(rule, flp, flags, arg);
}

/*
 * 'suppress' callback of the IPv6 rule ops.
 *
 * The route found for the rule is rejected when its destination prefix
 * length does not exceed rule->suppress_prefixlen, or when
 * rule->suppress_ifgroup is set (not -1) and the route's device belongs to
 * that group.  A rejected route is released with ip6_rt_put_flags().
 *
 * Return: true when the route was suppressed, false otherwise (including
 * when there is no route in the result).
 */
INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule,
                                                int flags,
                                                struct fib_lookup_arg *arg)
{
        struct fib6_result *res = arg->result;
        struct rt6_info *route = res->rt6;
        struct net_device *dev = NULL;
        bool reject;

        if (!route)
                return false;

        if (route->rt6i_idev)
                dev = route->rt6i_idev->dev;

        reject = route->rt6i_dst.plen <= rule->suppress_prefixlen ||
                 (rule->suppress_ifgroup != -1 && dev &&
                  dev->group == rule->suppress_ifgroup);
        if (!reject)
                return false;

        ip6_rt_put_flags(route, flags);
        return true;
}

INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule,
                                            struct flowi *fl, int flags)
{
        struct fib6_rule *r = (struct fib6_rule *) rule;
        struct flowi6 *fl6 = &fl->u.ip6;

        if (r->dst.plen &&
            !ipv6_prefix_equal(&fl6->daddr, &r->dst.addr, r->dst.plen))
                return 0;

        /*
         * If FIB_RULE_FIND_SADDR is set and we do not have a
         * source address for the traffic, we defer check for
         * source address.
         */
        if (r->src.plen) {
                if (flags & RT6_LOOKUP_F_HAS_SADDR) {
                        if (!ipv6_prefix_equal(&fl6->saddr, &r->src.addr,
                                               r->src.plen))
                                return 0;
                } else if (!(r->common.flags & FIB_RULE_FIND_SADDR))
                        return 0;
        }

        if (r->dscp && r->dscp != ip6_dscp(fl6->flowlabel))
                return 0;

        if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto))
                return 0;

        if (fib_rule_port_range_set(&rule->sport_range) &&
            !fib_rule_port_inrange(&rule->sport_range, fl6->fl6_sport))
                return 0;

        if (fib_rule_port_range_set(&rule->dport_range) &&
            !fib_rule_port_inrange(&rule->dport_range, fl6->fl6_dport))
                return 0;

        return 1;
}

/*
 * 'configure' callback of the IPv6 rule ops: fill the IPv6-specific part of
 * @rule from the netlink header @frh and attributes @tb.
 *
 * Return: 0 on success; -EINVAL (with an extack message) for a tos value
 * rejected by inet_validate_dscp() or an unspecified table; -ENOBUFS when
 * fib6_new_table() returns NULL.
 */
static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
                               struct fib_rule_hdr *frh,
                               struct nlattr **tb,
                               struct netlink_ext_ack *extack)
{
        struct fib6_rule *r6 = (struct fib6_rule *)rule;
        struct net *net = sock_net(skb->sk);

        if (!inet_validate_dscp(frh->tos)) {
                NL_SET_ERR_MSG(extack,
                               "Invalid dsfield (tos): ECN bits must be 0");
                return -EINVAL;
        }
        r6->dscp = inet_dsfield_to_dscp(frh->tos);

        /* A table rule needs a real table, created on demand. */
        if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
                if (rule->table == RT6_TABLE_UNSPEC) {
                        NL_SET_ERR_MSG(extack, "Invalid table");
                        return -EINVAL;
                }

                if (!fib6_new_table(net, rule->table))
                        return -ENOBUFS;
        }

        r6->src.plen = frh->src_len;
        if (frh->src_len)
                r6->src.addr = nla_get_in6_addr(tb[FRA_SRC]);

        r6->dst.plen = frh->dst_len;
        if (frh->dst_len)
                r6->dst.addr = nla_get_in6_addr(tb[FRA_DST]);

        if (fib_rule_requires_fldissect(rule))
                net->ipv6.fib6_rules_require_fldissect++;

        net->ipv6.fib6_has_custom_rules = true;
        return 0;
}

/*
 * 'delete' callback of the IPv6 rule ops: drop the namespace's count of
 * rules needing flow dissection if @rule is one of them and the count is
 * not already zero.  Always returns 0.
 */
static int fib6_rule_delete(struct fib_rule *rule)
{
        struct net *net = rule->fr_net;

        if (net->ipv6.fib6_rules_require_fldissect) {
                if (fib_rule_requires_fldissect(rule))
                        net->ipv6.fib6_rules_require_fldissect--;
        }

        return 0;
}

/*
 * 'compare' callback of the IPv6 rule ops: check whether @rule agrees with
 * the selectors given in @frh / @tb.  A selector that is zero in @frh is
 * not compared.
 *
 * Return: 1 when all given selectors agree, 0 on the first mismatch.
 */
static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
                             struct nlattr **tb)
{
        struct fib6_rule *r6 = (struct fib6_rule *)rule;

        /* Prefix lengths and tos first, then the addresses themselves. */
        if ((frh->src_len && r6->src.plen != frh->src_len) ||
            (frh->dst_len && r6->dst.plen != frh->dst_len) ||
            (frh->tos && inet_dscp_to_dsfield(r6->dscp) != frh->tos) ||
            (frh->src_len &&
             nla_memcmp(tb[FRA_SRC], &r6->src.addr, sizeof(struct in6_addr))) ||
            (frh->dst_len &&
             nla_memcmp(tb[FRA_DST], &r6->dst.addr, sizeof(struct in6_addr))))
                return 0;

        return 1;
}

/*
 * 'fill' callback of the IPv6 rule ops: copy the IPv6-specific selectors of
 * @rule into the netlink header @frh and, for non-zero prefix lengths, add
 * the FRA_DST / FRA_SRC address attributes to @skb.
 *
 * Return: 0 on success, -ENOBUFS when an attribute could not be added.
 */
static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
                          struct fib_rule_hdr *frh)
{
        struct fib6_rule *r6 = (struct fib6_rule *)rule;

        frh->tos = inet_dscp_to_dsfield(r6->dscp);
        frh->src_len = r6->src.plen;
        frh->dst_len = r6->dst.plen;

        if (r6->dst.plen && nla_put_in6_addr(skb, FRA_DST, &r6->dst.addr))
                return -ENOBUFS;

        if (r6->src.plen && nla_put_in6_addr(skb, FRA_SRC, &r6->src.addr))
                return -ENOBUFS;

        return 0;
}

/*
 * 'nlmsg_payload' callback of the IPv6 rule ops: space needed for the two
 * 16-byte address attributes (destination and source).  The size does not
 * depend on @rule.
 */
static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule)
{
        return nla_total_size(16) /* dst */
               + nla_total_size(16); /* src */
}

/*
 * 'flush_cache' callback of the IPv6 rule ops: calls rt_genid_bump_ipv6()
 * for the namespace that owns @ops.
 */
static void fib6_rule_flush_cache(struct fib_rules_ops *ops)
{
        rt_genid_bump_ipv6(ops->fro_net);
}

/*
 * Template of the IPv6 rule operations; fib6_rules_net_init() passes it to
 * fib_rules_register() together with the namespace being set up.
 */
static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
        .family                        = AF_INET6,
        .rule_size                = sizeof(struct fib6_rule),
        .addr_size                = sizeof(struct in6_addr),
        .action                        = fib6_rule_action,
        .match                        = fib6_rule_match,
        .suppress                = fib6_rule_suppress,
        .configure                = fib6_rule_configure,
        .delete                        = fib6_rule_delete,
        .compare                = fib6_rule_compare,
        .fill                        = fib6_rule_fill,
        .nlmsg_payload                = fib6_rule_nlmsg_payload,
        .flush_cache                = fib6_rule_flush_cache,
        .nlgroup                = RTNLGRP_IPV6_RULE,
        .owner                        = THIS_MODULE,
        .fro_net                = &init_net,
};

/*
 * Per-namespace setup: register the IPv6 rule ops for @net and install the
 * two default rules (preference 0 for the local table, 0x7FFE for the main
 * table).
 *
 * Return: 0 on success; the error from fib_rules_register(), or the error
 * from fib_default_rule_add() after the ops have been unregistered again.
 */
static int __net_init fib6_rules_net_init(struct net *net)
{
        struct fib_rules_ops *ops;
        int err;

        ops = fib_rules_register(&fib6_rules_ops_template, net);
        if (IS_ERR(ops))
                return PTR_ERR(ops);

        err = fib_default_rule_add(ops, 0, RT6_TABLE_LOCAL);
        if (!err)
                err = fib_default_rule_add(ops, 0x7FFE, RT6_TABLE_MAIN);

        if (err) {
                fib_rules_unregister(ops);
                return err;
        }

        net->ipv6.fib6_rules_ops = ops;
        net->ipv6.fib6_rules_require_fldissect = 0;
        return 0;
}

/*
 * Batched per-namespace teardown: unregister the IPv6 rule ops of every
 * namespace on @net_list.  The whole walk runs under the RTNL lock, with a
 * cond_resched() after each namespace.
 */
static void __net_exit fib6_rules_net_exit_batch(struct list_head *net_list)
{
        struct net *net;

        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list) {
                fib_rules_unregister(net->ipv6.fib6_rules_ops);
                cond_resched();
        }
        rtnl_unlock();
}

/*
 * Per-namespace operations for the IPv6 rules: init and batched exit
 * handlers, registered by fib6_rules_init().
 */
static struct pernet_operations fib6_rules_net_ops = {
        .init = fib6_rules_net_init,
        .exit_batch = fib6_rules_net_exit_batch,
};

/*
 * Register the IPv6 rules per-namespace operations; returns the result of
 * register_pernet_subsys().
 */
int __init fib6_rules_init(void)
{
        return register_pernet_subsys(&fib6_rules_net_ops);
}


/* Unregister the IPv6 rules per-namespace operations. */
void fib6_rules_cleanup(void)
{
        unregister_pernet_subsys(&fib6_rules_net_ops);
}







   21 




1
2
3
4
5
6
7
8
9
10
11
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/processor.h>

/*
 * phys_addr_valid - check a physical address against the CPU's address width
 *
 * With CONFIG_PHYS_ADDR_T_64BIT, returns 1 only when @addr has no bit set
 * at or above boot_cpu_data.x86_phys_bits, and 0 otherwise.  Without that
 * option every address is reported as valid (1).
 */
static inline int phys_addr_valid(resource_size_t addr)
{
#ifdef CONFIG_PHYS_ADDR_T_64BIT
        return !(addr >> boot_cpu_data.x86_phys_bits);
#else
        return 1;
#endif
}

























































































































































































































































































































































































































































































































































































































































    8 

















    1 

    1 


































    1 

    1 


































    1 

    1 
































    7 

    7 


    7 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/network.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"
#include <linux/slab.h>

/*
 * Structure for holding inet domain socket's address.
 *
 * Filled in by tomoyo_check_inet_address() from a sockaddr_in or
 * sockaddr_in6; @address points into that sockaddr rather than owning a copy.
 */
struct tomoyo_inet_addr_info {
        __be16 port;           /* In network byte order. */
        const __be32 *address; /* In network byte order. */
        bool is_ipv6;          /* True for AF_INET6, false for AF_INET. */
};

/*
 * Structure for holding unix domain socket's address.
 *
 * Filled in by tomoyo_check_unix_address(): @addr points at sun_path of the
 * caller's sockaddr_un and @addr_len is the length of the whole sockaddr
 * (tomoyo_unix_entry() subtracts sizeof(sa_family_t) to get the path length).
 */
struct tomoyo_unix_addr_info {
        u8 *addr; /* This may not be '\0' terminated string. */
        unsigned int addr_len;
};

/*
 * Structure for holding socket address.
 *
 * @protocol is the socket type (SOCK_STREAM, SOCK_DGRAM, ...) and @operation
 * one of the TOMOYO_NETWORK_* operations; together they index the
 * tomoyo_inet2mac / tomoyo_unix2mac tables. Only one of @inet and @unix0 is
 * filled in for a given request, depending on the socket family.
 */
struct tomoyo_addr_info {
        u8 protocol;
        u8 operation;
        struct tomoyo_inet_addr_info inet;
        struct tomoyo_unix_addr_info unix0;
};

/*
 * String table for socket's protocols, indexed by socket type.
 *
 * Indices 0 and 4 are not named here, so they hold a " " placeholder:
 * tomoyo_write_inet_network() and tomoyo_write_unix_network() pass every
 * entry to strcmp(), which must never see a NULL pointer.
 */
const char * const tomoyo_proto_keyword[TOMOYO_SOCK_MAX] = {
        [SOCK_STREAM]    = "stream",
        [SOCK_DGRAM]     = "dgram",
        [SOCK_RAW]       = "raw",
        [SOCK_SEQPACKET] = "seqpacket",
        [0] = " ", /* Dummy for avoiding NULL pointer dereference. */
        [4] = " ", /* Dummy for avoiding NULL pointer dereference. */
};

/**
 * tomoyo_parse_ipaddr_union - Parse an IP address or an address range.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 * @ptr:   Pointer to "struct tomoyo_ipaddr_union".
 *
 * Reads one token from @param. The token is either a single address or a
 * "min-max" range. A token without ':' is tried as IPv4 first; otherwise
 * (or if the IPv4 parse of the first address fails) it is tried as IPv6.
 * For a single address the upper bound is set equal to the lower bound.
 *
 * Returns true on success, false otherwise.
 */
bool tomoyo_parse_ipaddr_union(struct tomoyo_acl_param *param,
                               struct tomoyo_ipaddr_union *ptr)
{
        u8 * const lo = ptr->ip[0].in6_u.u6_addr8;
        u8 * const hi = ptr->ip[1].in6_u.u6_addr8;
        char *token = tomoyo_read_token(param);
        const char *rest;

        if (!strchr(token, ':') &&
            in4_pton(token, -1, lo, '-', &rest) > 0) {
                ptr->is_ipv6 = false;
                if (*rest == '\0') {
                        /* Single address: max is the same as min. */
                        ptr->ip[1].s6_addr32[0] = ptr->ip[0].s6_addr32[0];
                        return true;
                }
                if (*rest != '-')
                        return false;
                rest++;
                if (in4_pton(rest, -1, hi, '\0', &rest) <= 0)
                        return false;
                /* Trailing garbage after the upper bound is an error. */
                return *rest == '\0';
        }
        if (in6_pton(token, -1, lo, '-', &rest) <= 0)
                return false;
        ptr->is_ipv6 = true;
        if (*rest == '\0') {
                /* Single address: max is the same as min. */
                memmove(hi, lo, sizeof(u16) * 8);
                return true;
        }
        if (*rest != '-')
                return false;
        rest++;
        if (in6_pton(rest, -1, hi, '\0', &rest) <= 0)
                return false;
        /* Trailing garbage after the upper bound is an error. */
        return *rest == '\0';
}

/**
 * tomoyo_print_ipv4 - Print an IPv4 address or address range.
 *
 * @buffer:     Buffer to write to.
 * @buffer_len: Size of @buffer.
 * @min_ip:     Pointer to __be32 (lower bound).
 * @max_ip:     Pointer to __be32 (upper bound).
 *
 * Formats "min-max". When both bounds are equal the separator is a NUL
 * byte, so the resulting string ends right after @min_ip.
 *
 * Returns nothing.
 */
static void tomoyo_print_ipv4(char *buffer, const unsigned int buffer_len,
                              const __be32 *min_ip, const __be32 *max_ip)
{
        const char separator = (*min_ip != *max_ip) ? '-' : '\0';

        snprintf(buffer, buffer_len, "%pI4%c%pI4", min_ip, separator, max_ip);
}

/**
 * tomoyo_print_ipv6 - Print an IPv6 address or address range.
 *
 * @buffer:     Buffer to write to.
 * @buffer_len: Size of @buffer.
 * @min_ip:     Pointer to "struct in6_addr" (lower bound).
 * @max_ip:     Pointer to "struct in6_addr" (upper bound).
 *
 * Formats "min-max". When the 16 bytes of both bounds are equal the
 * separator is a NUL byte, so the resulting string ends right after @min_ip.
 *
 * Returns nothing.
 */
static void tomoyo_print_ipv6(char *buffer, const unsigned int buffer_len,
                              const struct in6_addr *min_ip,
                              const struct in6_addr *max_ip)
{
        const char separator = memcmp(min_ip, max_ip, 16) ? '-' : '\0';

        snprintf(buffer, buffer_len, "%pI6c%c%pI6c", min_ip, separator,
                 max_ip);
}

/**
 * tomoyo_print_ip - Print an IP address.
 *
 * @buf:  Buffer to write to.
 * @size: Size of @buf.
 * @ptr:  Pointer to "struct ipaddr_union".
 *
 * Returns nothing.
 */
void tomoyo_print_ip(char *buf, const unsigned int size,
                     const struct tomoyo_ipaddr_union *ptr)
{
        if (ptr->is_ipv6)
                tomoyo_print_ipv6(buf, size, &ptr->ip[0], &ptr->ip[1]);
        else
                tomoyo_print_ipv4(buf, size, &ptr->ip[0].s6_addr32[0],
                                  &ptr->ip[1].s6_addr32[0]);
}

/*
 * Mapping table from "enum tomoyo_network_acl_index" to
 * "enum tomoyo_mac_index" for inet domain socket.
 *
 * Indexed as [socket type][network operation]. Combinations not listed
 * below are left zero; tomoyo_inet_entry() skips the permission check when
 * the looked-up value is zero.
 */
static const u8 tomoyo_inet2mac
[TOMOYO_SOCK_MAX][TOMOYO_MAX_NETWORK_OPERATION] = {
        [SOCK_STREAM] = {
                [TOMOYO_NETWORK_BIND]    = TOMOYO_MAC_NETWORK_INET_STREAM_BIND,
                [TOMOYO_NETWORK_LISTEN]  =
                TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN,
                [TOMOYO_NETWORK_CONNECT] =
                TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT,
        },
        [SOCK_DGRAM] = {
                [TOMOYO_NETWORK_BIND]    = TOMOYO_MAC_NETWORK_INET_DGRAM_BIND,
                [TOMOYO_NETWORK_SEND]    = TOMOYO_MAC_NETWORK_INET_DGRAM_SEND,
        },
        [SOCK_RAW]    = {
                [TOMOYO_NETWORK_BIND]    = TOMOYO_MAC_NETWORK_INET_RAW_BIND,
                [TOMOYO_NETWORK_SEND]    = TOMOYO_MAC_NETWORK_INET_RAW_SEND,
        },
};

/*
 * Mapping table from "enum tomoyo_network_acl_index" to
 * "enum tomoyo_mac_index" for unix domain socket.
 *
 * Indexed as [socket type][network operation]. Combinations not listed
 * below are left zero; tomoyo_unix_entry() skips the permission check when
 * the looked-up value is zero.
 */
static const u8 tomoyo_unix2mac
[TOMOYO_SOCK_MAX][TOMOYO_MAX_NETWORK_OPERATION] = {
        [SOCK_STREAM] = {
                [TOMOYO_NETWORK_BIND]    = TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND,
                [TOMOYO_NETWORK_LISTEN]  =
                TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN,
                [TOMOYO_NETWORK_CONNECT] =
                TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT,
        },
        [SOCK_DGRAM] = {
                [TOMOYO_NETWORK_BIND]    = TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND,
                [TOMOYO_NETWORK_SEND]    = TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND,
        },
        [SOCK_SEQPACKET] = {
                [TOMOYO_NETWORK_BIND]    =
                TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND,
                [TOMOYO_NETWORK_LISTEN]  =
                TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN,
                [TOMOYO_NETWORK_CONNECT] =
                TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT,
        },
};

/**
 * tomoyo_same_inet_acl - Check for duplicated "struct tomoyo_inet_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b except permission bits, false otherwise.
 */
static bool tomoyo_same_inet_acl(const struct tomoyo_acl_info *a,
                                 const struct tomoyo_acl_info *b)
{
        const struct tomoyo_inet_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_inet_acl *p2 = container_of(b, typeof(*p2), head);

        return p1->protocol == p2->protocol &&
                tomoyo_same_ipaddr_union(&p1->address, &p2->address) &&
                tomoyo_same_number_union(&p1->port, &p2->port);
}

/**
 * tomoyo_same_unix_acl - Check for duplicated "struct tomoyo_unix_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b except permission bits, false otherwise.
 */
static bool tomoyo_same_unix_acl(const struct tomoyo_acl_info *a,
                                 const struct tomoyo_acl_info *b)
{
        const struct tomoyo_unix_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_unix_acl *p2 = container_of(b, typeof(*p2), head);

        return p1->protocol == p2->protocol &&
                tomoyo_same_name_union(&p1->name, &p2->name);
}

/**
 * tomoyo_merge_inet_acl - Merge duplicated "struct tomoyo_inet_acl" entry.
 *
 * @a:         Pointer to "struct tomoyo_acl_info".
 * @b:         Pointer to "struct tomoyo_acl_info".
 * @is_delete: True for @a &= ~@b, false for @a |= @b.
 *
 * Returns true if @a is empty, false otherwise.
 */
static bool tomoyo_merge_inet_acl(struct tomoyo_acl_info *a,
                                  struct tomoyo_acl_info *b,
                                  const bool is_delete)
{
        u8 * const a_perm =
                &container_of(a, struct tomoyo_inet_acl, head)->perm;
        u8 perm = READ_ONCE(*a_perm);
        const u8 b_perm = container_of(b, struct tomoyo_inet_acl, head)->perm;

        if (is_delete)
                perm &= ~b_perm;
        else
                perm |= b_perm;
        WRITE_ONCE(*a_perm, perm);
        return !perm;
}

/**
 * tomoyo_merge_unix_acl - Merge duplicated "struct tomoyo_unix_acl" entry.
 *
 * @a:         Pointer to "struct tomoyo_acl_info".
 * @b:         Pointer to "struct tomoyo_acl_info".
 * @is_delete: True for @a &= ~@b, false for @a |= @b.
 *
 * Returns true if @a is empty, false otherwise.
 */
static bool tomoyo_merge_unix_acl(struct tomoyo_acl_info *a,
                                  struct tomoyo_acl_info *b,
                                  const bool is_delete)
{
        u8 * const a_perm =
                &container_of(a, struct tomoyo_unix_acl, head)->perm;
        u8 perm = READ_ONCE(*a_perm);
        const u8 b_perm = container_of(b, struct tomoyo_unix_acl, head)->perm;

        if (is_delete)
                perm &= ~b_perm;
        else
                perm |= b_perm;
        WRITE_ONCE(*a_perm, perm);
        return !perm;
}

/**
 * tomoyo_write_inet_network - Write "struct tomoyo_inet_acl" list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Reads, in order, a protocol keyword, an operation list, an address
 * (either "@group" or an address/range) and a port number/range from
 * @param, then hands the resulting entry to tomoyo_update_domain().
 * Port ranges whose upper bound exceeds 65535 are rejected.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_write_inet_network(struct tomoyo_acl_param *param)
{
        struct tomoyo_inet_acl e = { .head.type = TOMOYO_TYPE_INET_ACL };
        int error = -EINVAL;
        u8 bit;
        const char *proto_name = tomoyo_read_token(param);
        const char *op_list = tomoyo_read_token(param);

        /* Look up the protocol keyword; TOMOYO_SOCK_MAX means "not found". */
        e.protocol = 0;
        while (e.protocol < TOMOYO_SOCK_MAX &&
               strcmp(proto_name, tomoyo_proto_keyword[e.protocol]) != 0)
                e.protocol++;
        /* Collect one permission bit per operation keyword present. */
        for (bit = 0; bit < TOMOYO_MAX_NETWORK_OPERATION; bit++) {
                if (!tomoyo_permstr(op_list, tomoyo_socket_keyword[bit]))
                        continue;
                e.perm |= 1 << bit;
        }
        if (!e.perm || e.protocol == TOMOYO_SOCK_MAX)
                return -EINVAL;
        if (param->data[0] != '@') {
                if (!tomoyo_parse_ipaddr_union(param, &e.address))
                        goto out;
        } else {
                /* Skip the '@' prefix and resolve the address group. */
                param->data++;
                e.address.group =
                        tomoyo_get_group(param, TOMOYO_ADDRESS_GROUP);
                if (!e.address.group)
                        return -ENOMEM;
        }
        if (tomoyo_parse_number_union(param, &e.port) &&
            e.port.values[1] <= 65535)
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_inet_acl,
                                             tomoyo_merge_inet_acl);
out:
        tomoyo_put_group(e.address.group);
        tomoyo_put_number_union(&e.port);
        return error;
}

/**
 * tomoyo_write_unix_network - Write "struct tomoyo_unix_acl" list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Reads, in order, a protocol keyword, an operation list and a name from
 * @param, then hands the resulting entry to tomoyo_update_domain().
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_write_unix_network(struct tomoyo_acl_param *param)
{
        struct tomoyo_unix_acl e = { .head.type = TOMOYO_TYPE_UNIX_ACL };
        int error;
        u8 bit;
        const char *proto_name = tomoyo_read_token(param);
        const char *op_list = tomoyo_read_token(param);

        /* Look up the protocol keyword; TOMOYO_SOCK_MAX means "not found". */
        e.protocol = 0;
        while (e.protocol < TOMOYO_SOCK_MAX &&
               strcmp(proto_name, tomoyo_proto_keyword[e.protocol]) != 0)
                e.protocol++;
        /* Collect one permission bit per operation keyword present. */
        for (bit = 0; bit < TOMOYO_MAX_NETWORK_OPERATION; bit++) {
                if (!tomoyo_permstr(op_list, tomoyo_socket_keyword[bit]))
                        continue;
                e.perm |= 1 << bit;
        }
        if (!e.perm || e.protocol == TOMOYO_SOCK_MAX)
                return -EINVAL;
        if (!tomoyo_parse_name_union(param, &e.name))
                return -EINVAL;
        error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                     tomoyo_same_unix_acl,
                                     tomoyo_merge_unix_acl);
        tomoyo_put_name_union(&e.name);
        return error;
}

/**
 * tomoyo_audit_net_log - Audit network log.
 *
 * @r:         Pointer to "struct tomoyo_request_info".
 * @family:    Name of socket family ("inet" or "unix").
 * @protocol:  Index into tomoyo_proto_keyword[] naming the protocol.
 * @operation: Index into tomoyo_socket_keyword[] naming the operation.
 * @address:   Name of address.
 *
 * Returns the value of tomoyo_supervisor(): 0 on success, negative value
 * otherwise.
 */
static int tomoyo_audit_net_log(struct tomoyo_request_info *r,
                                const char *family, const u8 protocol,
                                const u8 operation, const char *address)
{
        const char *proto_name = tomoyo_proto_keyword[protocol];
        const char *op_name = tomoyo_socket_keyword[operation];

        return tomoyo_supervisor(r, "network %s %s %s %s\n", family,
                                 proto_name, op_name, address);
}

/**
 * tomoyo_audit_inet_log - Audit INET network log.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_audit_inet_log(struct tomoyo_request_info *r)
{
        char buf[128];
        int len;
        const __be32 *address = r->param.inet_network.address;

        if (r->param.inet_network.is_ipv6)
                tomoyo_print_ipv6(buf, sizeof(buf), (const struct in6_addr *)
                                  address, (const struct in6_addr *) address);
        else
                tomoyo_print_ipv4(buf, sizeof(buf), address, address);
        len = strlen(buf);
        snprintf(buf + len, sizeof(buf) - len, " %u",
                 r->param.inet_network.port);
        return tomoyo_audit_net_log(r, "inet", r->param.inet_network.protocol,
                                    r->param.inet_network.operation, buf);
}

/**
 * tomoyo_audit_unix_log - Audit UNIX network log.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_audit_unix_log(struct tomoyo_request_info *r)
{
        return tomoyo_audit_net_log(r, "unix", r->param.unix_network.protocol,
                                    r->param.unix_network.operation,
                                    r->param.unix_network.address->name);
}

/**
 * tomoyo_check_inet_acl - Check permission for inet domain socket operation.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @ptr: Pointer to "struct tomoyo_acl_info".
 *
 * The entry matches when it permits the requested operation, its port
 * range contains the requested port and its address (a group, or a
 * min/max range of the same address family) contains the requested one.
 *
 * Returns true if granted, false otherwise.
 */
static bool tomoyo_check_inet_acl(struct tomoyo_request_info *r,
                                  const struct tomoyo_acl_info *ptr)
{
        const struct tomoyo_inet_acl *acl =
                container_of(ptr, typeof(*acl), head);
        /* Number of address bytes to compare: 16 for IPv6, 4 for IPv4. */
        const u8 nbytes = r->param.inet_network.is_ipv6 ? 16 : 4;

        if (!(acl->perm & (1 << r->param.inet_network.operation)))
                return false;
        if (!tomoyo_compare_number_union(r->param.inet_network.port,
                                         &acl->port))
                return false;
        if (acl->address.group)
                return tomoyo_address_matches_group
                        (r->param.inet_network.is_ipv6,
                         r->param.inet_network.address, acl->address.group);
        if (acl->address.is_ipv6 != r->param.inet_network.is_ipv6)
                return false;
        /* Requested address must not be below the lower bound ... */
        if (memcmp(&acl->address.ip[0],
                   r->param.inet_network.address, nbytes) > 0)
                return false;
        /* ... nor above the upper bound. */
        return memcmp(r->param.inet_network.address,
                      &acl->address.ip[1], nbytes) <= 0;
}

/**
 * tomoyo_check_unix_acl - Check permission for unix domain socket operation.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @ptr: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if granted, false otherwise.
 */
static bool tomoyo_check_unix_acl(struct tomoyo_request_info *r,
                                  const struct tomoyo_acl_info *ptr)
{
        const struct tomoyo_unix_acl *acl =
                container_of(ptr, typeof(*acl), head);

        return (acl->perm & (1 << r->param.unix_network.operation)) &&
                tomoyo_compare_name_union(r->param.unix_network.address,
                                          &acl->name);
}

/**
 * tomoyo_inet_entry - Check permission for INET network operation.
 *
 * @address: Pointer to "struct tomoyo_addr_info".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_inet_entry(const struct tomoyo_addr_info *address)
{
        const int idx = tomoyo_read_lock();
        struct tomoyo_request_info r;
        int error = 0;
        const u8 type = tomoyo_inet2mac[address->protocol][address->operation];

        if (type && tomoyo_init_request_info(&r, NULL, type)
            != TOMOYO_CONFIG_DISABLED) {
                r.param_type = TOMOYO_TYPE_INET_ACL;
                r.param.inet_network.protocol = address->protocol;
                r.param.inet_network.operation = address->operation;
                r.param.inet_network.is_ipv6 = address->inet.is_ipv6;
                r.param.inet_network.address = address->inet.address;
                r.param.inet_network.port = ntohs(address->inet.port);
                do {
                        tomoyo_check_acl(&r, tomoyo_check_inet_acl);
                        error = tomoyo_audit_inet_log(&r);
                } while (error == TOMOYO_RETRY_REQUEST);
        }
        tomoyo_read_unlock(idx);
        return error;
}

/**
 * tomoyo_check_inet_address - Check permission for inet domain socket's operation.
 *
 * @addr:     Pointer to "struct sockaddr".
 * @addr_len: Size of @addr.
 * @port:     Port number.
 * @address:  Pointer to "struct tomoyo_addr_info".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_check_inet_address(const struct sockaddr *addr,
                                     const unsigned int addr_len,
                                     const u16 port,
                                     struct tomoyo_addr_info *address)
{
        struct tomoyo_inet_addr_info *i = &address->inet;

        if (addr_len < offsetofend(struct sockaddr, sa_family))
                return 0;
        switch (addr->sa_family) {
        case AF_INET6:
                if (addr_len < SIN6_LEN_RFC2133)
                        goto skip;
                i->is_ipv6 = true;
                i->address = (__be32 *)
                        ((struct sockaddr_in6 *) addr)->sin6_addr.s6_addr;
                i->port = ((struct sockaddr_in6 *) addr)->sin6_port;
                break;
        case AF_INET:
                if (addr_len < sizeof(struct sockaddr_in))
                        goto skip;
                i->is_ipv6 = false;
                i->address = (__be32 *)
                        &((struct sockaddr_in *) addr)->sin_addr;
                i->port = ((struct sockaddr_in *) addr)->sin_port;
                break;
        default:
                goto skip;
        }
        if (address->protocol == SOCK_RAW)
                i->port = htons(port);
        return tomoyo_inet_entry(address);
skip:
        return 0;
}

/**
 * tomoyo_unix_entry - Check permission for UNIX network operation.
 *
 * @address: Pointer to "struct tomoyo_addr_info".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_unix_entry(const struct tomoyo_addr_info *address)
{
        const int idx = tomoyo_read_lock();
        struct tomoyo_request_info r;
        int error = 0;
        const u8 type = tomoyo_unix2mac[address->protocol][address->operation];

        if (type && tomoyo_init_request_info(&r, NULL, type)
            != TOMOYO_CONFIG_DISABLED) {
                char *buf = address->unix0.addr;
                int len = address->unix0.addr_len - sizeof(sa_family_t);

                if (len <= 0) {
                        buf = "anonymous";
                        len = 9;
                } else if (buf[0]) {
                        len = strnlen(buf, len);
                }
                buf = tomoyo_encode2(buf, len);
                if (buf) {
                        struct tomoyo_path_info addr;

                        addr.name = buf;
                        tomoyo_fill_path_info(&addr);
                        r.param_type = TOMOYO_TYPE_UNIX_ACL;
                        r.param.unix_network.protocol = address->protocol;
                        r.param.unix_network.operation = address->operation;
                        r.param.unix_network.address = &addr;
                        do {
                                tomoyo_check_acl(&r, tomoyo_check_unix_acl);
                                error = tomoyo_audit_unix_log(&r);
                        } while (error == TOMOYO_RETRY_REQUEST);
                        kfree(buf);
                } else
                        error = -ENOMEM;
        }
        tomoyo_read_unlock(idx);
        return error;
}

/**
 * tomoyo_check_unix_address - Check permission for unix domain socket's operation.
 *
 * @addr:     Pointer to "struct sockaddr".
 * @addr_len: Size of @addr.
 * @address:  Pointer to "struct tomoyo_addr_info".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_check_unix_address(struct sockaddr *addr,
                                     const unsigned int addr_len,
                                     struct tomoyo_addr_info *address)
{
        struct tomoyo_unix_addr_info *u = &address->unix0;

        if (addr_len < offsetofend(struct sockaddr, sa_family))
                return 0;
        if (addr->sa_family != AF_UNIX)
                return 0;
        u->addr = ((struct sockaddr_un *) addr)->sun_path;
        u->addr_len = addr_len;
        return tomoyo_unix_entry(address);
}

/**
 * tomoyo_kernel_service - Check whether I'm kernel service or not.
 *
 * Tests the PF_KTHREAD flag of the current task.
 *
 * Returns true if I'm kernel service, false otherwise.
 */
static bool tomoyo_kernel_service(void)
{
        /* Nothing to do if I am a kernel service. */
        const bool is_kthread = (current->flags & PF_KTHREAD) != 0;

        return is_kthread;
}

/**
 * tomoyo_sock_family - Get socket's family.
 *
 * @sk: Pointer to "struct sock".
 *
 * Kernel services are never checked, so 0 is returned for them before
 * @sk is looked at.
 *
 * Returns one of PF_INET, PF_INET6, PF_UNIX or 0.
 */
static u8 tomoyo_sock_family(struct sock *sk)
{
        u8 family;

        if (tomoyo_kernel_service())
                return 0;
        family = sk->sk_family;
        if (family == PF_INET || family == PF_INET6 || family == PF_UNIX)
                return family;
        return 0;
}

/**
 * tomoyo_socket_listen_permission - Check permission for listening a socket.
 *
 * @sock: Pointer to "struct socket".
 *
 * Only SOCK_STREAM and SOCK_SEQPACKET sockets of a family reported by
 * tomoyo_sock_family() are checked. The local address is obtained through
 * the socket's getname() operation; its error, if any, is returned as is.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_socket_listen_permission(struct socket *sock)
{
        struct tomoyo_addr_info address;
        const u8 family = tomoyo_sock_family(sock->sk);
        const unsigned int type = sock->type;
        struct sockaddr_storage addr;
        struct sockaddr *sa = (struct sockaddr *) &addr;
        int addr_len;

        if (!family)
                return 0;
        if (type != SOCK_STREAM && type != SOCK_SEQPACKET)
                return 0;
        /* getname() returns the address length, or a negative error. */
        addr_len = sock->ops->getname(sock, sa, 0);
        if (addr_len < 0)
                return addr_len;
        address.protocol = type;
        address.operation = TOMOYO_NETWORK_LISTEN;
        if (family != PF_UNIX)
                return tomoyo_check_inet_address(sa, addr_len, 0, &address);
        return tomoyo_check_unix_address(sa, addr_len, &address);
}

/**
 * tomoyo_socket_connect_permission - Check permission for setting the remote address of a socket.
 *
 * @sock:     Pointer to "struct socket".
 * @addr:     Pointer to "struct sockaddr".
 * @addr_len: Size of @addr.
 *
 * SOCK_DGRAM and SOCK_RAW sockets are checked as a "send" operation,
 * SOCK_STREAM and SOCK_SEQPACKET sockets as a "connect" operation; other
 * socket types are not checked.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_socket_connect_permission(struct socket *sock,
                                     struct sockaddr *addr, int addr_len)
{
        struct tomoyo_addr_info address;
        const u8 family = tomoyo_sock_family(sock->sk);
        const unsigned int type = sock->type;

        if (!family)
                return 0;
        address.protocol = type;
        if (type == SOCK_DGRAM || type == SOCK_RAW)
                address.operation = TOMOYO_NETWORK_SEND;
        else if (type == SOCK_STREAM || type == SOCK_SEQPACKET)
                address.operation = TOMOYO_NETWORK_CONNECT;
        else
                return 0;
        if (family != PF_UNIX)
                return tomoyo_check_inet_address(addr, addr_len,
                                                 sock->sk->sk_protocol,
                                                 &address);
        return tomoyo_check_unix_address(addr, addr_len, &address);
}

/**
 * tomoyo_socket_bind_permission - Check permission for setting the local address of a socket.
 *
 * @sock:     Pointer to "struct socket".
 * @addr:     Pointer to "struct sockaddr".
 * @addr_len: Size of @addr.
 *
 * Only SOCK_STREAM, SOCK_DGRAM, SOCK_RAW and SOCK_SEQPACKET sockets of a
 * family reported by tomoyo_sock_family() are checked.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_socket_bind_permission(struct socket *sock, struct sockaddr *addr,
                                  int addr_len)
{
        struct tomoyo_addr_info address;
        const u8 family = tomoyo_sock_family(sock->sk);
        const unsigned int type = sock->type;

        if (!family)
                return 0;
        if (type != SOCK_STREAM && type != SOCK_DGRAM &&
            type != SOCK_RAW && type != SOCK_SEQPACKET)
                return 0;
        address.protocol = type;
        address.operation = TOMOYO_NETWORK_BIND;
        if (family != PF_UNIX)
                return tomoyo_check_inet_address(addr, addr_len,
                                                 sock->sk->sk_protocol,
                                                 &address);
        return tomoyo_check_unix_address(addr, addr_len, &address);
}

/**
 * tomoyo_socket_sendmsg_permission - Check permission for sending a datagram.
 *
 * @sock: Pointer to "struct socket".
 * @msg:  Pointer to "struct msghdr".
 * @size: Unused.
 *
 * Only messages that carry a destination (msg_name) on SOCK_DGRAM or
 * SOCK_RAW sockets of a family reported by tomoyo_sock_family() are checked.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_socket_sendmsg_permission(struct socket *sock, struct msghdr *msg,
                                     int size)
{
        struct tomoyo_addr_info address;
        const u8 family = tomoyo_sock_family(sock->sk);
        const unsigned int type = sock->type;
        struct sockaddr *dest;

        if (!msg->msg_name || !family)
                return 0;
        if (type != SOCK_DGRAM && type != SOCK_RAW)
                return 0;
        dest = (struct sockaddr *) msg->msg_name;
        address.protocol = type;
        address.operation = TOMOYO_NETWORK_SEND;
        if (family != PF_UNIX)
                return tomoyo_check_inet_address(dest, msg->msg_namelen,
                                                 sock->sk->sk_protocol,
                                                 &address);
        return tomoyo_check_unix_address(dest, msg->msg_namelen, &address);
}




































































































































































































































    1 



































    1 


































    1 




















    1 






































    1 






    1 
























































    1 
















    1 





    1 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * algif_hash: User-space interface for hash algorithms
 *
 * This file provides the user-space API for hash algorithms.
 *
 * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/hash.h>
#include <crypto/if_alg.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/net.h>
#include <net/sock.h>

/*
 * Per-socket state of one hash operation socket.
 *
 * Allocated by hash_accept_parent_nokey() with extra room behind @req for
 * the transform's request context, and released by hash_sock_destruct().
 */
struct hash_ctx {
        /* Scatterlist describing the user data of the current sendmsg() chunk. */
        struct af_alg_sgl sgl;

        /* Digest output buffer; NULL until hash_alloc_result() allocates it. */
        u8 *result;

        /* Completion passed to crypto_wait_req() to wait for ahash operations. */
        struct crypto_wait wait;

        /* Size of the whole allocation, needed again when freeing the context. */
        unsigned int len;
        /* True while a hash fed with MSG_MORE has not been finalised yet. */
        bool more;

        /* Kept last: the allocation extends past it by crypto_ahash_reqsize(). */
        struct ahash_request req;
};

static int hash_alloc_result(struct sock *sk, struct hash_ctx *ctx)
{
        unsigned ds;

        if (ctx->result)
                return 0;

        ds = crypto_ahash_digestsize(crypto_ahash_reqtfm(&ctx->req));

        ctx->result = sock_kmalloc(sk, ds, GFP_KERNEL);
        if (!ctx->result)
                return -ENOMEM;

        memset(ctx->result, 0, ds);

        return 0;
}

/*
 * Release the digest buffer, if there is one, and clear the pointer.
 * The buffer is freed with sock_kzfree_s() using the digest size.
 */
static void hash_free_result(struct sock *sk, struct hash_ctx *ctx)
{
        if (ctx->result) {
                unsigned int digest_len =
                        crypto_ahash_digestsize(crypto_ahash_reqtfm(&ctx->req));

                sock_kzfree_s(sk, ctx->result, digest_len);
                ctx->result = NULL;
        }
}

/*
 * sendmsg() handler: feed the data carried by @msg into the socket's hash.
 *
 * The size argument is ignored; the amount of data is taken from the message
 * iterator. With MSG_MORE set in msg->msg_flags the hash stays open for
 * further data, otherwise the digest is computed into ctx->result.
 *
 * Returns the number of bytes hashed when that is non-zero; otherwise 0 on
 * success or a negative errno. On failure the result buffer is released and
 * ctx->more is cleared.
 */
static int hash_sendmsg(struct socket *sock, struct msghdr *msg,
                        size_t ignored)
{
        struct sock *sk = sock->sk;
        struct alg_sock *ask = alg_sk(sk);
        struct hash_ctx *ctx = ask->private;
        ssize_t copied = 0;
        size_t len, max_pages, npages;
        bool continuing, need_init = false;
        int err;

        /* Pages per chunk are capped by ALG_MAX_PAGES and the send buffer size. */
        max_pages = min_t(size_t, ALG_MAX_PAGES,
                          DIV_ROUND_UP(sk->sk_sndbuf, PAGE_SIZE));

        lock_sock(sk);
        /* An earlier sendmsg() with MSG_MORE left the hash open. */
        continuing = ctx->more;

        if (!continuing) {
                /* Discard a previous request that wasn't marked MSG_MORE. */
                hash_free_result(sk, ctx);
                if (!msg_data_left(msg))
                        goto done; /* Zero-length; don't start new req */
                need_init = true;
        } else if (!msg_data_left(msg)) {
                /*
                 * No data - finalise the prev req if MSG_MORE is not set so
                 * any error comes out here.
                 */
                if (!(msg->msg_flags & MSG_MORE)) {
                        err = hash_alloc_result(sk, ctx);
                        if (err)
                                goto unlock_free_result;
                        ahash_request_set_crypt(&ctx->req, NULL,
                                                ctx->result, 0);
                        err = crypto_wait_req(crypto_ahash_final(&ctx->req),
                                              &ctx->wait);
                        if (err)
                                goto unlock_free_result;
                }
                goto done_more;
        }

        /* Hash the message in chunks of at most max_pages pages each. */
        while (msg_data_left(msg)) {
                /* Start every chunk with an empty table over the embedded array. */
                ctx->sgl.sgt.sgl = ctx->sgl.sgl;
                ctx->sgl.sgt.nents = 0;
                ctx->sgl.sgt.orig_nents = 0;

                /* Data is left, so a page count of zero is reported as -EIO. */
                err = -EIO;
                npages = iov_iter_npages(&msg->msg_iter, max_pages);
                if (npages == 0)
                        goto unlock_free;

                sg_init_table(ctx->sgl.sgl, npages);

                ctx->sgl.need_unpin = iov_iter_extract_will_pin(&msg->msg_iter);

                err = extract_iter_to_sg(&msg->msg_iter, LONG_MAX,
                                         &ctx->sgl.sgt, npages, 0);
                if (err < 0)
                        goto unlock_free;
                /* A non-negative return is the number of bytes extracted. */
                len = err;
                sg_mark_end(ctx->sgl.sgt.sgl + ctx->sgl.sgt.nents - 1);

                /* The result buffer is only allocated for the last chunk. */
                if (!msg_data_left(msg)) {
                        err = hash_alloc_result(sk, ctx);
                        if (err)
                                goto unlock_free;
                }

                ahash_request_set_crypt(&ctx->req, ctx->sgl.sgt.sgl,
                                        ctx->result, len);

                if (!msg_data_left(msg) && !continuing &&
                    !(msg->msg_flags & MSG_MORE)) {
                        /* Whole message in one chunk, nothing pending: one-shot digest. */
                        err = crypto_ahash_digest(&ctx->req);
                } else {
                        if (need_init) {
                                err = crypto_wait_req(
                                        crypto_ahash_init(&ctx->req),
                                        &ctx->wait);
                                if (err)
                                        goto unlock_free;
                                need_init = false;
                        }

                        /* More data expected: update; final chunk without MSG_MORE: finup. */
                        if (msg_data_left(msg) || (msg->msg_flags & MSG_MORE))
                                err = crypto_ahash_update(&ctx->req);
                        else
                                err = crypto_ahash_finup(&ctx->req);
                        continuing = true;
                }

                err = crypto_wait_req(err, &ctx->wait);
                if (err)
                        goto unlock_free;

                copied += len;
                af_alg_free_sg(&ctx->sgl);
        }

done_more:
        ctx->more = msg->msg_flags & MSG_MORE;
done:
        err = 0;
unlock:
        release_sock(sk);
        /* Bytes already hashed take precedence over a later error code. */
        return copied ?: err;

unlock_free:
        af_alg_free_sg(&ctx->sgl);
unlock_free_result:
        hash_free_result(sk, ctx);
        ctx->more = false;
        goto unlock;
}

/*
 * recvmsg() handler: copy the digest to the caller.
 *
 * At most digestsize bytes are copied; a shorter request sets MSG_TRUNC in
 * msg->msg_flags. If no digest is buffered, or the hash is still open, it is
 * finalised here first. The result buffer is always released before
 * returning.
 *
 * Returns the number of bytes copied or a negative errno.
 */
static int hash_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                        int flags)
{
        struct sock *sk = sock->sk;
        struct alg_sock *ask = alg_sk(sk);
        struct hash_ctx *ctx = ask->private;
        unsigned ds = crypto_ahash_digestsize(crypto_ahash_reqtfm(&ctx->req));
        bool result;
        int err;

        if (len > ds)
                len = ds;
        else if (len < ds)
                msg->msg_flags |= MSG_TRUNC;

        lock_sock(sk);
        /* Remember whether a digest buffer already existed before this call. */
        result = ctx->result;
        err = hash_alloc_result(sk, ctx);
        if (err)
                goto unlock;

        ahash_request_set_crypt(&ctx->req, NULL, ctx->result, 0);

        /* Nothing buffered and no open hash: initialise so final has a state. */
        if (!result && !ctx->more) {
                err = crypto_wait_req(crypto_ahash_init(&ctx->req),
                                      &ctx->wait);
                if (err)
                        goto unlock;
        }

        /* No digest buffered yet, or the hash is still open: finalise now. */
        if (!result || ctx->more) {
                ctx->more = false;
                err = crypto_wait_req(crypto_ahash_final(&ctx->req),
                                      &ctx->wait);
                if (err)
                        goto unlock;
        }

        err = memcpy_to_msg(msg, ctx->result, len);

unlock:
        /* Reached on success and on failure alike. */
        hash_free_result(sk, ctx);
        release_sock(sk);

        return err ?: len;
}

static int hash_accept(struct socket *sock, struct socket *newsock,
                       struct proto_accept_arg *arg)
{
        struct sock *sk = sock->sk;
        struct alg_sock *ask = alg_sk(sk);
        struct hash_ctx *ctx = ask->private;
        struct ahash_request *req = &ctx->req;
        struct crypto_ahash *tfm;
        struct sock *sk2;
        struct alg_sock *ask2;
        struct hash_ctx *ctx2;
        char *state;
        bool more;
        int err;

        tfm = crypto_ahash_reqtfm(req);
        state = kmalloc(crypto_ahash_statesize(tfm), GFP_KERNEL);
        err = -ENOMEM;
        if (!state)
                goto out;

        lock_sock(sk);
        more = ctx->more;
        err = more ? crypto_ahash_export(req, state) : 0;
        release_sock(sk);

        if (err)
                goto out_free_state;

        err = af_alg_accept(ask->parent, newsock, arg);
        if (err)
                goto out_free_state;

        sk2 = newsock->sk;
        ask2 = alg_sk(sk2);
        ctx2 = ask2->private;
        ctx2->more = more;

        if (!more)
                goto out_free_state;

        err = crypto_ahash_import(&ctx2->req, state);
        if (err) {
                sock_orphan(sk2);
                sock_put(sk2);
        }

out_free_state:
        kfree_sensitive(state);

out:
        return err;
}

/*
 * Socket operations for hash sockets that need no key check: only release,
 * sendmsg, recvmsg and accept are implemented, every other listed operation
 * uses its sock_no_*() stub.
 */
static struct proto_ops algif_hash_ops = {
        .family                =        PF_ALG,

        .connect        =        sock_no_connect,
        .socketpair        =        sock_no_socketpair,
        .getname        =        sock_no_getname,
        .ioctl                =        sock_no_ioctl,
        .listen                =        sock_no_listen,
        .shutdown        =        sock_no_shutdown,
        .mmap                =        sock_no_mmap,
        .bind                =        sock_no_bind,

        .release        =        af_alg_release,
        .sendmsg        =        hash_sendmsg,
        .recvmsg        =        hash_recvmsg,
        .accept                =        hash_accept,
};

/*
 * Check whether a socket that was created without a key may be used now.
 *
 * If the socket's nokey_refcnt is zero there is nothing to check. Otherwise
 * the parent's transform is inspected under the parent's socket lock: while
 * it still has CRYPTO_TFM_NEED_KEY set the call fails with -ENOKEY; once
 * the flag is gone the parent's nokey_refcnt is decremented and this
 * socket's is reset to zero.
 *
 * Returns 0 when the socket may be used, -ENOKEY otherwise.
 */
static int hash_check_key(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct alg_sock *ask = alg_sk(sk);
        struct alg_sock *parent_ask;
        struct crypto_ahash *tfm;
        struct sock *parent_sk;
        int err = 0;

        lock_sock(sk);
        if (atomic_read(&ask->nokey_refcnt)) {
                parent_sk = ask->parent;
                parent_ask = alg_sk(ask->parent);
                tfm = parent_ask->private;

                /* Child lock is already held, so take the parent nested. */
                lock_sock_nested(parent_sk, SINGLE_DEPTH_NESTING);
                if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) {
                        err = -ENOKEY;
                } else {
                        atomic_dec(&parent_ask->nokey_refcnt);
                        atomic_set(&ask->nokey_refcnt, 0);
                }
                release_sock(parent_sk);
        }
        release_sock(sk);

        return err;
}

/* sendmsg() variant that runs hash_check_key() before hash_sendmsg(). */
static int hash_sendmsg_nokey(struct socket *sock, struct msghdr *msg,
                              size_t size)
{
        int err = hash_check_key(sock);

        return err ? err : hash_sendmsg(sock, msg, size);
}

/* recvmsg() variant that runs hash_check_key() before hash_recvmsg(). */
static int hash_recvmsg_nokey(struct socket *sock, struct msghdr *msg,
                              size_t ignored, int flags)
{
        int err = hash_check_key(sock);

        return err ? err : hash_recvmsg(sock, msg, ignored, flags);
}

/* accept() variant that runs hash_check_key() before hash_accept(). */
static int hash_accept_nokey(struct socket *sock, struct socket *newsock,
                             struct proto_accept_arg *arg)
{
        int err = hash_check_key(sock);

        return err ? err : hash_accept(sock, newsock, arg);
}

/*
 * Socket operations for hash sockets accepted before a key was set: the
 * same table as algif_hash_ops, except that sendmsg, recvmsg and accept go
 * through the *_nokey wrappers, which call hash_check_key() first.
 */
static struct proto_ops algif_hash_ops_nokey = {
        .family                =        PF_ALG,

        .connect        =        sock_no_connect,
        .socketpair        =        sock_no_socketpair,
        .getname        =        sock_no_getname,
        .ioctl                =        sock_no_ioctl,
        .listen                =        sock_no_listen,
        .shutdown        =        sock_no_shutdown,
        .mmap                =        sock_no_mmap,
        .bind                =        sock_no_bind,

        .release        =        af_alg_release,
        .sendmsg        =        hash_sendmsg_nokey,
        .recvmsg        =        hash_recvmsg_nokey,
        .accept                =        hash_accept_nokey,
};

/* af_alg bind hook: allocate the ahash transform named @name. */
static void *hash_bind(const char *name, u32 type, u32 mask)
{
        struct crypto_ahash *tfm = crypto_alloc_ahash(name, type, mask);

        return tfm;
}

/* af_alg release hook: free the ahash transform passed as @private. */
static void hash_release(void *private)
{
        struct crypto_ahash *tfm = private;

        crypto_free_ahash(tfm);
}

/* af_alg setkey hook: set @key on the ahash transform passed as @private. */
static int hash_setkey(void *private, const u8 *key, unsigned int keylen)
{
        struct crypto_ahash *tfm = private;

        return crypto_ahash_setkey(tfm, key, keylen);
}

static void hash_sock_destruct(struct sock *sk)
{
        struct alg_sock *ask = alg_sk(sk);
        struct hash_ctx *ctx = ask->private;

        hash_free_result(sk, ctx);
        sock_kfree_s(sk, ctx, ctx->len);
        af_alg_release_parent(sk);
}

static int hash_accept_parent_nokey(void *private, struct sock *sk)
{
        struct crypto_ahash *tfm = private;
        struct alg_sock *ask = alg_sk(sk);
        struct hash_ctx *ctx;
        unsigned int len = sizeof(*ctx) + crypto_ahash_reqsize(tfm);

        ctx = sock_kmalloc(sk, len, GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        ctx->result = NULL;
        ctx->len = len;
        ctx->more = false;
        crypto_init_wait(&ctx->wait);

        ask->private = ctx;

        ahash_request_set_tfm(&ctx->req, tfm);
        ahash_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
                                   crypto_req_done, &ctx->wait);

        sk->sk_destruct = hash_sock_destruct;

        return 0;
}

static int hash_accept_parent(void *private, struct sock *sk)
{
        struct crypto_ahash *tfm = private;

        if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        return hash_accept_parent_nokey(private, sk);
}

/*
 * Description of the "hash" algorithm type handed to
 * af_alg_register_type(): transform management hooks, the accept hooks
 * with and without key check, and the two socket operation tables.
 */
static const struct af_alg_type algif_type_hash = {
        .bind                =        hash_bind,
        .release        =        hash_release,
        .setkey                =        hash_setkey,
        .accept                =        hash_accept_parent,
        .accept_nokey        =        hash_accept_parent_nokey,
        .ops                =        &algif_hash_ops,
        .ops_nokey        =        &algif_hash_ops_nokey,
        .name                =        "hash",
        .owner                =        THIS_MODULE
};

/* Module init: register the "hash" type; returns the registration result. */
static int __init algif_hash_init(void)
{
        int err;

        err = af_alg_register_type(&algif_type_hash);
        return err;
}

/*
 * Module exit: unregister the "hash" type. A failure to unregister is
 * treated as fatal (BUG_ON).
 */
static void __exit algif_hash_exit(void)
{
        int err = af_alg_unregister_type(&algif_type_hash);
        BUG_ON(err);
}

/* Module entry and exit hooks, and the module licence. */
module_init(algif_hash_init);
module_exit(algif_hash_exit);
MODULE_LICENSE("GPL");
















































    4 
    4 




    2 




    2 








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resizable, Scalable, Concurrent Hash Table
 *
 * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
 * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
 *
 * Code partially derived from nft_hash
 * Rewritten with rehash code from br_multicast plus single list
 * pointer as suggested by Josh Triplett
 */

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/log2.h>
#include <linux/sched.h>
#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <linux/rhashtable.h>
#include <linux/err.h>
#include <linux/export.h>

/*
 * Table size constants. Their users are outside this part of the file;
 * presumably the default and the smallest permitted number of buckets
 * (TODO confirm against the table initialisation code).
 */
#define HASH_DEFAULT_SIZE        64UL
#define HASH_MIN_SIZE                4U

/*
 * One slot of a nested (page-based) bucket table. Interior pages use
 * @table to point at the next level; pages allocated as leaves have every
 * slot initialised as an empty bucket head (@bucket).
 */
union nested_table {
        union nested_table __rcu *table;
        struct rhash_lock_head __rcu *bucket;
};

static u32 head_hashfn(struct rhashtable *ht,
                       const struct bucket_table *tbl,
                       const struct rhash_head *he)
{
        return rht_head_hashfn(ht, tbl, he, ht->p);
}

#ifdef CONFIG_PROVE_LOCKING
#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))

/*
 * Report whether ht->mutex is held according to lockdep. Always reports 1
 * once lock debugging has been turned off (debug_locks == 0).
 */
int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
        if (!debug_locks)
                return 1;

        return lockdep_is_held(&ht->mutex);
}
EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);

/*
 * Report whether the bit lock of bucket @hash in @tbl is taken. Always
 * reports 1 when lock debugging is off or when the table is nested.
 */
int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
{
        if (!debug_locks || unlikely(tbl->nest))
                return 1;

        return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]);
}
EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
#else
#define ASSERT_RHT_MUTEX(HT)
#endif

/* Return the top-level page of a nested table, stored in tbl->buckets[0]. */
static inline union nested_table *nested_table_top(
        const struct bucket_table *tbl)
{
        struct rhash_lock_head *top;

        /* No RCU protection is needed for the top-level bucket entry:
         * it is set at the same time as tbl->nest.
         */
        top = rcu_dereference_protected(tbl->buckets[0], 1);

        return (void *)top;
}

/*
 * Free the page referenced by slot @ntbl and, recursively, every page
 * below it. @size is the number of buckets covered by this slot; while it
 * exceeds the number of slots in one page, the page holds further levels.
 */
static void nested_table_free(union nested_table *ntbl, unsigned int size)
{
        const unsigned int level_bits = PAGE_SHIFT - ilog2(sizeof(void *));
        const unsigned int slots = 1 << level_bits;
        union nested_table *page;
        unsigned int idx;

        page = rcu_dereference_protected(ntbl->table, 1);
        if (!page)
                return;

        if (size > slots) {
                unsigned int child_size = size >> level_bits;

                for (idx = 0; idx < slots; idx++)
                        nested_table_free(&page[idx], child_size);
        }

        kfree(page);
}

/*
 * Free all pages of a nested bucket table: the subtree under each of the
 * 1 << tbl->nest top-level slots, then the top-level page itself. The
 * bucket_table structure is not freed here.
 */
static void nested_bucket_table_free(const struct bucket_table *tbl)
{
        union nested_table *top = nested_table_top(tbl);
        const unsigned int slots = 1 << tbl->nest;
        const unsigned int per_slot = tbl->size >> tbl->nest;
        unsigned int idx;

        for (idx = 0; idx < slots; idx++)
                nested_table_free(&top[idx], per_slot);

        kfree(top);
}

static void bucket_table_free(const struct bucket_table *tbl)
{
        if (tbl->nest)
                nested_bucket_table_free(tbl);

        kvfree(tbl);
}

/* RCU callback: free the bucket table that embeds @head. */
static void bucket_table_free_rcu(struct rcu_head *head)
{
        struct bucket_table *tbl = container_of(head, struct bucket_table, rcu);

        bucket_table_free(tbl);
}

/*
 * Return the nested-table page that *@prev points to, allocating and
 * installing a new zeroed page if the slot is still empty.
 *
 * @leaf: when true, every slot of a newly allocated page is initialised as
 *        an empty bucket head.
 *
 * The page is installed with cmpxchg(), so concurrent callers end up with
 * the same page; the loser of the race frees its own allocation. Returns
 * NULL if the allocation failed and nobody else installed a page.
 */
static union nested_table *nested_table_alloc(struct rhashtable *ht,
                                              union nested_table __rcu **prev,
                                              bool leaf)
{
        union nested_table *ntbl;
        int i;

        /* Fast path: the slot is already populated. */
        ntbl = rcu_dereference(*prev);
        if (ntbl)
                return ntbl;

        /* One page, allocated atomically and pre-zeroed. */
        ntbl = alloc_hooks_tag(ht->alloc_tag,
                        kmalloc_noprof(PAGE_SIZE, GFP_ATOMIC|__GFP_ZERO));

        if (ntbl && leaf) {
                for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
                        INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
        }

        /* With ntbl == NULL this leaves an empty slot empty and returns NULL. */
        if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL)
                return ntbl;
        /* Raced with another thread. */
        kfree(ntbl);
        return rcu_dereference(*prev);
}

/*
 * Allocate a bucket table in nested form: the table header with a single
 * bucket slot that points at the top-level page.
 *
 * Returns NULL when @nbuckets is smaller than twice the number of slots in
 * one page, or when an allocation fails. tbl->size is left for the caller
 * to fill in.
 */
static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
                                                      size_t nbuckets,
                                                      gfp_t gfp)
{
        const unsigned int level_bits = PAGE_SHIFT - ilog2(sizeof(void *));
        union nested_table __rcu **top_slot;
        struct bucket_table *tbl;

        if (nbuckets < (1 << (level_bits + 1)))
                return NULL;

        tbl = alloc_hooks_tag(ht->alloc_tag,
                        kmalloc_noprof(sizeof(*tbl) + sizeof(tbl->buckets[0]),
                                       gfp|__GFP_ZERO));
        if (!tbl)
                return NULL;

        top_slot = (union nested_table __rcu **)tbl->buckets;
        if (!nested_table_alloc(ht, top_slot, false)) {
                kfree(tbl);
                return NULL;
        }

        /* Number of hash bits consumed by the top-level page. */
        tbl->nest = (ilog2(nbuckets) - 1) % level_bits + 1;

        return tbl;
}

/*
 * Allocate and initialise a bucket table with @nbuckets buckets.
 *
 * A flat, zeroed array is tried first. If that fails and @gfp (ignoring
 * __GFP_NOFAIL) is not GFP_KERNEL, a nested table is attempted instead.
 * Returns the new table, with a fresh random hash seed, or NULL.
 */
static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
                                               size_t nbuckets,
                                               gfp_t gfp)
{
        struct bucket_table *tbl = NULL;
        size_t size;
        int i;
        static struct lock_class_key __key;

        tbl = alloc_hooks_tag(ht->alloc_tag,
                        kvmalloc_node_noprof(struct_size(tbl, buckets, nbuckets),
                                             gfp|__GFP_ZERO, NUMA_NO_NODE));

        /* Keep the requested size: nbuckets is zeroed on the nested path. */
        size = nbuckets;

        if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) {
                tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
                /* A nested table has no flat buckets; skip the init loop below. */
                nbuckets = 0;
        }

        if (tbl == NULL)
                return NULL;

        lockdep_init_map(&tbl->dep_map, "rhashtable_bucket", &__key, 0);

        tbl->size = size;

        rcu_head_init(&tbl->rcu);
        INIT_LIST_HEAD(&tbl->walkers);

        tbl->hash_rnd = get_random_u32();

        for (i = 0; i < nbuckets; i++)
                INIT_RHT_NULLS_HEAD(tbl->buckets[i]);

        return tbl;
}

/*
 * Follow the future_tbl links starting at @tbl and return the last table
 * of the chain, i.e. the one without a future table.
 */
static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
                                                  struct bucket_table *tbl)
{
        struct bucket_table *last = tbl;
        struct bucket_table *next;

        for (;;) {
                next = rht_dereference_rcu(last->future_tbl, ht);
                if (!next)
                        return last;
                last = next;
        }
}

/*
 * Move the last entry of the chain in bucket @old_hash of the current table
 * to the head of its bucket in the newest table.
 *
 * Called by rhashtable_rehash_chain() with the old bucket's lock held.
 *
 * Returns 0 when an entry was moved, -ENOENT when the chain is empty and
 * -EAGAIN when the newest table is a nested table.
 */
static int rhashtable_rehash_one(struct rhashtable *ht,
                                 struct rhash_lock_head __rcu **bkt,
                                 unsigned int old_hash)
{
        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
        struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
        int err = -EAGAIN;
        struct rhash_head *head, *next, *entry;
        struct rhash_head __rcu **pprev = NULL;
        unsigned int new_hash;
        unsigned long flags;

        if (new_tbl->nest)
                goto out;

        err = -ENOENT;

        /*
         * Walk to the last entry of the chain; pprev ends up pointing at the
         * next field of its predecessor, or stays NULL if it is the first.
         */
        rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash),
                          old_tbl, old_hash) {
                err = 0;
                next = rht_dereference_bucket(entry->next, old_tbl, old_hash);

                if (rht_is_a_nulls(next))
                        break;

                pprev = &entry->next;
        }

        if (err)
                goto out;

        new_hash = head_hashfn(ht, new_tbl, entry);

        flags = rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash],
                                SINGLE_DEPTH_NESTING);

        head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash);

        /* Link the entry in front of the new bucket's current chain. */
        RCU_INIT_POINTER(entry->next, head);

        rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry, flags);

        /* Unlink the entry from the old chain. */
        if (pprev)
                rcu_assign_pointer(*pprev, next);
        else
                /* Need to preserve the bit lock. */
                rht_assign_locked(bkt, next);

out:
        return err;
}

static int rhashtable_rehash_chain(struct rhashtable *ht,
                                    unsigned int old_hash)
{
        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
        struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash);
        unsigned long flags;
        int err;

        if (!bkt)
                return 0;
        flags = rht_lock(old_tbl, bkt);

        while (!(err = rhashtable_rehash_one(ht, bkt, old_hash)))
                ;

        if (err == -ENOENT)
                err = 0;
        rht_unlock(old_tbl, bkt, flags);

        return err;
}

/*
 * Attach @new_tbl as the future table of @old_tbl.
 *
 * Returns 0 on success or -EEXIST if @old_tbl already has a future table.
 */
static int rhashtable_rehash_attach(struct rhashtable *ht,
                                    struct bucket_table *old_tbl,
                                    struct bucket_table *new_tbl)
{
        struct bucket_table *prev;

        /* From here on insertions go into the new, empty table. Deletions
         * and lookups are attempted in both tables until we synchronize.
         * cmpxchg() provides strong barriers, so rcu_assign_pointer() is
         * not needed.
         */
        prev = cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL,
                       new_tbl);

        return prev ? -EEXIST : 0;
}

/*
 * Move all entries of the current table into its future table, publish the
 * future table as ht->tbl and free the old table after an RCU grace period.
 *
 * Returns 0 when there is no future table or the rehash completed, the
 * error of rhashtable_rehash_chain() when a bucket could not be moved, and
 * -EAGAIN when the new table already has a future table of its own.
 */
static int rhashtable_rehash_table(struct rhashtable *ht)
{
        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
        struct bucket_table *new_tbl;
        struct rhashtable_walker *walker;
        unsigned int old_hash;
        int err;

        new_tbl = rht_dereference(old_tbl->future_tbl, ht);
        if (!new_tbl)
                return 0;

        for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
                err = rhashtable_rehash_chain(ht, old_hash);
                if (err)
                        return err;
                cond_resched();
        }

        /* Publish the new table pointer. */
        rcu_assign_pointer(ht->tbl, new_tbl);

        spin_lock(&ht->lock);
        /* Detach the walkers registered on the old table. */
        list_for_each_entry(walker, &old_tbl->walkers, list)
                walker->tbl = NULL;

        /* Wait for readers. All new readers will see the new
         * table, and thus no references to the old table will
         * remain.
         * We do this inside the locked region so that
         * rhashtable_walk_stop() can use rcu_head_after_call_rcu()
         * to check if it should not re-link the table.
         */
        call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
        spin_unlock(&ht->lock);

        return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
}

/*
 * Allocate a table with @size buckets and attach it to @old_tbl as its
 * future table. Asserts that ht->mutex is held.
 *
 * Returns 0 on success, -ENOMEM when the allocation fails, or the error
 * from rhashtable_rehash_attach(), in which case the new table is freed.
 */
static int rhashtable_rehash_alloc(struct rhashtable *ht,
                                   struct bucket_table *old_tbl,
                                   unsigned int size)
{
        struct bucket_table *fresh;
        int rc;

        ASSERT_RHT_MUTEX(ht);

        fresh = bucket_table_alloc(ht, size, GFP_KERNEL);
        if (!fresh)
                return -ENOMEM;

        rc = rhashtable_rehash_attach(ht, old_tbl, fresh);
        if (!rc)
                return 0;

        bucket_table_free(fresh);
        return rc;
}

/**
 * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
 * @ht:                the hash table to shrink
 *
 * Picks the smallest table size that would not trigger an immediate
 * automatic expansion (1.5 times the element count rounded up to a power
 * of two, but never below the configured minimum) and, if that is smaller
 * than the current size, attaches a table of that size as the future table.
 * Moving the entries is left to the subsequent rehash.
 *
 * The caller must hold ht->mutex so that no concurrent resizing occurs.
 *
 * Concurrent insertions and deletions protected by per bucket locks, as
 * well as concurrent RCU protected lookups and traversals, are valid.
 *
 * Returns 0 if no shrink is needed or the new table was attached, -EEXIST
 * if a resize is already pending, or the error from allocating the table.
 */
static int rhashtable_shrink(struct rhashtable *ht)
{
        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
        unsigned int nelems = atomic_read(&ht->nelems);
        unsigned int target;

        if (nelems)
                target = roundup_pow_of_two(nelems * 3 / 2);
        else
                target = 0;

        if (target < ht->p.min_size)
                target = ht->p.min_size;

        /* Already at or below the target size: nothing to do. */
        if (old_tbl->size <= target)
                return 0;

        if (rht_dereference(old_tbl->future_tbl, ht))
                return -EEXIST;

        return rhashtable_rehash_alloc(ht, old_tbl, target);
}

/*
 * Work item behind ht->run_work.
 *
 * Under ht->mutex, looks at the last table in the chain and, depending on
 * its state, attaches a table of double the size (above 75% load), shrinks
 * (automatic shrinking enabled and below 30% load) or attaches a same-sized
 * table (the last table is nested).  If that step succeeded, did nothing,
 * or reported -EEXIST, rhashtable_rehash_table() is run.  Any resulting
 * non-zero status re-queues the work item.
 */
static void rht_deferred_worker(struct work_struct *work)
{
        struct rhashtable *ht = container_of(work, struct rhashtable, run_work);
        struct bucket_table *last;
        int ret = 0;

        mutex_lock(&ht->mutex);

        last = rht_dereference(ht->tbl, ht);
        last = rhashtable_last_table(ht, last);

        if (rht_grow_above_75(ht, last)) {
                ret = rhashtable_rehash_alloc(ht, last, last->size * 2);
        } else if (ht->p.automatic_shrinking &&
                   rht_shrink_below_30(ht, last)) {
                ret = rhashtable_shrink(ht);
        } else if (last->nest) {
                ret = rhashtable_rehash_alloc(ht, last, last->size);
        }

        if (ret == 0 || ret == -EEXIST) {
                int rehash_ret = rhashtable_rehash_table(ht);

                /* An earlier -EEXIST takes precedence over the rehash result. */
                if (ret == 0)
                        ret = rehash_ret;
        }

        mutex_unlock(&ht->mutex);

        if (ret != 0)
                schedule_work(&ht->run_work);
}

/*
 * Try to attach a replacement bucket table to @tbl from the insert path.
 *
 * The new table is twice the size of @tbl when rht_grow_above_75() says
 * @tbl should grow.  Otherwise a same-sized table is allocated, but only
 * if @tbl is still the table published in ht->tbl.  The allocation uses
 * GFP_ATOMIC | __GFP_NOWARN.
 *
 * Returns 0 when a new table was attached (the deferred worker is then
 * scheduled), when rhashtable_rehash_attach() reported -EEXIST, or when
 * this function bailed out but @tbl already has a future table.  Otherwise
 * returns -EBUSY, -ENOMEM or the error from rhashtable_rehash_attach().
 */
static int rhashtable_insert_rehash(struct rhashtable *ht,
                                    struct bucket_table *tbl)
{
        struct bucket_table *old_tbl;
        struct bucket_table *new_tbl;
        unsigned int size;
        int err;

        old_tbl = rht_dereference_rcu(ht->tbl, ht);

        size = tbl->size;

        /* Status reported if we bail out before attempting an allocation. */
        err = -EBUSY;

        if (rht_grow_above_75(ht, tbl))
                size *= 2;
        /* Do not schedule more than one rehash */
        else if (old_tbl != tbl)
                goto fail;

        /* Status reported if the allocation below fails. */
        err = -ENOMEM;

        new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN);
        if (new_tbl == NULL)
                goto fail;

        err = rhashtable_rehash_attach(ht, tbl, new_tbl);
        if (err) {
                /* Attach failed: free our table; -EEXIST is not an error. */
                bucket_table_free(new_tbl);
                if (err == -EEXIST)
                        err = 0;
        } else
                schedule_work(&ht->run_work);

        return err;

fail:
        /* Do not fail the insert if someone else did a rehash. */
        if (likely(rcu_access_pointer(tbl->future_tbl)))
                return 0;

        /* Schedule async rehash to retry allocation in process context. */
        if (err == -ENOMEM)
                schedule_work(&ht->run_work);

        return err;
}

/*
 * Search the chain of bucket @bkt (index @hash in @tbl) for @key.
 *
 * A NULL @key matches nothing, so the chain is only walked and counted.
 * Entries are compared with ht->p.obj_cmpfn when set, otherwise with
 * rhashtable_compare(); a non-zero result means "no match".
 *
 * Returns:
 *  - the matching object, for a table that is not an rhlist;
 *  - NULL for an rhlist table after @obj has been linked in: @obj takes the
 *    matching entry's place in the bucket chain and the matching entry
 *    becomes the next element of @obj's duplicate list;
 *  - ERR_PTR(-EAGAIN) if nothing matched and RHT_ELASTICITY or more entries
 *    were walked;
 *  - ERR_PTR(-ENOENT) if nothing matched otherwise.
 *
 * The caller in view, rhashtable_try_insert(), calls this between
 * rht_lock() and rht_unlock() on @bkt.
 */
static void *rhashtable_lookup_one(struct rhashtable *ht,
                                   struct rhash_lock_head __rcu **bkt,
                                   struct bucket_table *tbl, unsigned int hash,
                                   const void *key, struct rhash_head *obj)
{
        struct rhashtable_compare_arg arg = {
                .ht = ht,
                .key = key,
        };
        /* Link that points at the current entry; NULL while at the bucket head. */
        struct rhash_head __rcu **pprev = NULL;
        struct rhash_head *head;
        int elasticity;

        elasticity = RHT_ELASTICITY;
        rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
                struct rhlist_head *list;
                struct rhlist_head *plist;

                elasticity--;
                /* Not a match: remember the link to this entry and move on. */
                if (!key ||
                    (ht->p.obj_cmpfn ?
                     ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
                     rhashtable_compare(&arg, rht_obj(ht, head)))) {
                        pprev = &head->next;
                        continue;
                }

                if (!ht->rhlist)
                        return rht_obj(ht, head);

                list = container_of(obj, struct rhlist_head, rhead);
                plist = container_of(head, struct rhlist_head, rhead);

                /*
                 * Fully initialise @obj (duplicate list and chain successor)
                 * before it is made reachable from the chain below.
                 */
                RCU_INIT_POINTER(list->next, plist);
                head = rht_dereference_bucket(head->next, tbl, hash);
                RCU_INIT_POINTER(list->rhead.next, head);
                if (pprev)
                        rcu_assign_pointer(*pprev, obj);
                else
                        /* Need to preserve the bit lock */
                        rht_assign_locked(bkt, obj);

                return NULL;
        }

        /* No match: report whether the chain has used up its elasticity. */
        if (elasticity <= 0)
                return ERR_PTR(-EAGAIN);

        return ERR_PTR(-ENOENT);
}

/*
 * Act on the result @data of rhashtable_lookup_one() for bucket @bkt.
 *
 * Returns:
 *  - ERR_PTR(-EEXIST) if @data is a real object pointer (key already present);
 *  - @data cast to the return type if it is NULL or an error other than
 *    -EAGAIN / -ENOENT (NULL therefore passes through as NULL);
 *  - the future table of @tbl if one is attached;
 *  - ERR_PTR(-EAGAIN) if @data is -EAGAIN and there is no future table;
 *  - ERR_PTR(-E2BIG) if rht_grow_above_max() is true;
 *  - ERR_PTR(-EAGAIN) if rht_grow_above_100() is true;
 *  - NULL after @obj has been inserted at the head of the bucket.
 */
static struct bucket_table *rhashtable_insert_one(
        struct rhashtable *ht, struct rhash_lock_head __rcu **bkt,
        struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj,
        void *data)
{
        struct bucket_table *new_tbl;
        struct rhash_head *head;

        /* The lookup returned an existing object. */
        if (!IS_ERR_OR_NULL(data))
                return ERR_PTR(-EEXIST);

        /* NULL or an unexpected error: hand it straight back. */
        if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
                return ERR_CAST(data);

        /* A newer table exists; return it instead of inserting here. */
        new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        if (new_tbl)
                return new_tbl;

        /* Only -EAGAIN can reach here with a value other than -ENOENT. */
        if (PTR_ERR(data) != -ENOENT)
                return ERR_CAST(data);

        if (unlikely(rht_grow_above_max(ht, tbl)))
                return ERR_PTR(-E2BIG);

        if (unlikely(rht_grow_above_100(ht, tbl)))
                return ERR_PTR(-EAGAIN);

        head = rht_ptr(bkt, tbl, hash);

        /* Initialise @obj's links before it becomes the bucket head. */
        RCU_INIT_POINTER(obj->next, head);
        if (ht->rhlist) {
                struct rhlist_head *list;

                list = container_of(obj, struct rhlist_head, rhead);
                RCU_INIT_POINTER(list->next, NULL);
        }

        /* bkt is always the head of the list, so it holds
         * the lock, which we need to preserve
         */
        rht_assign_locked(bkt, obj);

        atomic_inc(&ht->nelems);
        /* Ask the deferred worker to resize once the 75% check fires. */
        if (rht_grow_above_75(ht, tbl))
                schedule_work(&ht->run_work);

        return NULL;
}

/*
 * One insertion attempt of @obj, following the chain of tables that starts
 * at ht->tbl.
 *
 * For each table the bucket for @obj is located, locked, searched with
 * rhashtable_lookup_one() and then handed to rhashtable_insert_one().  The
 * loop continues for as long as rhashtable_insert_one() (or the missing
 * bucket path) yields another table to try.
 *
 * Returns NULL when @obj was inserted, the existing object when
 * rhashtable_insert_one() reported -EEXIST, or an ERR_PTR().  When the
 * outcome is -EAGAIN, rhashtable_insert_rehash() is run; its non-zero
 * result is returned, otherwise -EAGAIN is returned.
 *
 * Uses rcu_dereference(); the caller in view, rhashtable_insert_slow(),
 * wraps the call in rcu_read_lock()/rcu_read_unlock().
 */
static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
                                   struct rhash_head *obj)
{
        struct bucket_table *new_tbl;
        struct bucket_table *tbl;
        struct rhash_lock_head __rcu **bkt;
        unsigned long flags;
        unsigned int hash;
        void *data;

        new_tbl = rcu_dereference(ht->tbl);

        do {
                tbl = new_tbl;
                hash = rht_head_hashfn(ht, tbl, obj, ht->p);
                /*
                 * With a future table attached the bucket is only looked
                 * up; otherwise rht_bucket_insert() is used to obtain it.
                 */
                if (rcu_access_pointer(tbl->future_tbl))
                        /* Failure is OK */
                        bkt = rht_bucket_var(tbl, hash);
                else
                        bkt = rht_bucket_insert(ht, tbl, hash);
                if (bkt == NULL) {
                        /* No bucket here: move on to the future table, if any. */
                        new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
                        data = ERR_PTR(-EAGAIN);
                } else {
                        flags = rht_lock(tbl, bkt);
                        data = rhashtable_lookup_one(ht, bkt, tbl,
                                                     hash, key, obj);
                        new_tbl = rhashtable_insert_one(ht, bkt, tbl,
                                                        hash, obj, data);
                        /*
                         * Keep the object found by the lookup only for
                         * -EEXIST; in every other case the insert result
                         * becomes the outcome.
                         */
                        if (PTR_ERR(new_tbl) != -EEXIST)
                                data = ERR_CAST(new_tbl);

                        rht_unlock(tbl, bkt, flags);
                }
        } while (!IS_ERR_OR_NULL(new_tbl));

        /* Try to get a rehash going; report -EAGAIN unless that failed. */
        if (PTR_ERR(data) == -EAGAIN)
                data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
                               -EAGAIN);

        return data;
}

/*
 * Insert @obj, repeating rhashtable_try_insert() for as long as it reports
 * -EAGAIN.  Each attempt runs in its own RCU read-side critical section.
 * Returns whatever the final attempt returned.
 */
void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
                             struct rhash_head *obj)
{
        void *result;

        for (;;) {
                rcu_read_lock();
                result = rhashtable_try_insert(ht, key, obj);
                rcu_read_unlock();

                if (PTR_ERR(result) != -EAGAIN)
                        return result;
        }
}
EXPORT_SYMBOL_GPL(rhashtable_insert_slow);

/**
 * rhashtable_walk_enter - Initialise an iterator
 * @ht:                Table to walk over
 * @iter:        Hash table Iterator
 *
 * This function prepares a hash table walk.
 *
 * Note that if you restart a walk after rhashtable_walk_stop you
 * may see the same object twice.  Also, you may miss objects if
 * there are removals in between rhashtable_walk_stop and the next
 * call to rhashtable_walk_start.
 *
 * For a completely stable walk you should construct your own data
 * structure outside the hash table.
 *
 * This function may be called from any process context, including
 * non-preemptable context, but cannot be called from softirq or
 * hardirq context.
 *
 * You must call rhashtable_walk_exit after this function returns.
 */
void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
{
        iter->ht = ht;
        iter->p = NULL;
        iter->slot = 0;
        iter->skip = 0;
        iter->end_of_table = 0;

        spin_lock(&ht->lock);
        iter->walker.tbl =
                rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
        list_add(&iter->walker.list, &iter->walker.tbl->walkers);
        spin_unlock(&ht->lock);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_enter);

/**
 * rhashtable_walk_exit - Free an iterator
 * @iter:        Hash table Iterator
 *
 * This function frees resources allocated by rhashtable_walk_enter.
 */
void rhashtable_walk_exit(struct rhashtable_iter *iter)
{
        spin_lock(&iter->ht->lock);
        if (iter->walker.tbl)
                list_del(&iter->walker.list);
        spin_unlock(&iter->ht->lock);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_exit);

/**
 * rhashtable_walk_start_check - Start a hash table walk
 * @iter:        Hash table iterator
 *
 * Start a hash table walk at the current iterator position.  Note that we take
 * the RCU lock in all cases including when we return an error.  So you must
 * always call rhashtable_walk_stop to clean up.
 *
 * Returns zero if successful.
 *
 * Returns -EAGAIN if resize event occurred.  Note that the iterator
 * will rewind back to the beginning and you may use it immediately
 * by calling rhashtable_walk_next.
 *
 * rhashtable_walk_start is defined as an inline variant that returns
 * void. This is preferred in cases where the caller would ignore
 * resize events and always continue.
 */
int rhashtable_walk_start_check(struct rhashtable_iter *iter)
        __acquires(RCU)
{
        struct rhashtable *ht = iter->ht;
        bool rhlist = ht->rhlist;

        /* Taken on every path, including the early returns below. */
        rcu_read_lock();

        /*
         * Take the walker off its table's walker list for the duration of
         * the walk; rhashtable_walk_stop() adds it back.
         */
        spin_lock(&ht->lock);
        if (iter->walker.tbl)
                list_del(&iter->walker.list);
        spin_unlock(&ht->lock);

        /* The end of the table was already reached on an earlier call. */
        if (iter->end_of_table)
                return 0;
        /*
         * The walker has no table (walker.tbl was cleared): restart from
         * the first slot of the current table and report -EAGAIN.
         */
        if (!iter->walker.tbl) {
                iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
                iter->slot = 0;
                iter->skip = 0;
                return -EAGAIN;
        }

        if (iter->p && !rhlist) {
                /*
                 * We need to validate that 'p' is still in the table, and
                 * if so, update 'skip'
                 */
                struct rhash_head *p;
                int skip = 0;
                rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
                        skip++;
                        if (p == iter->p) {
                                iter->skip = skip;
                                goto found;
                        }
                }
                /* 'p' is no longer in this slot; forget it. */
                iter->p = NULL;
        } else if (iter->p && rhlist) {
                /* Need to validate that 'list' is still in the table, and
                 * if so, update 'skip' and 'p'.
                 */
                struct rhash_head *p;
                struct rhlist_head *list;
                int skip = 0;
                rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
                        /* Every element of a duplicate list counts for skip. */
                        for (list = container_of(p, struct rhlist_head, rhead);
                             list;
                             list = rcu_dereference(list->next)) {
                                skip++;
                                if (list == iter->list) {
                                        iter->p = p;
                                        iter->skip = skip;
                                        goto found;
                                }
                        }
                }
                /* 'list' is no longer in this slot; forget the position. */
                iter->p = NULL;
        }
found:
        return 0;
}
EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);

/**
 * __rhashtable_walk_find_next - Find the next element in a table (or the first
 * one in case of a new walk).
 *
 * @iter:        Hash table iterator
 *
 * Returns the found object or NULL when the end of the table is reached.
 *
 * Returns -EAGAIN if resize event occurred.
 */
static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter)
{
        struct bucket_table *tbl = iter->walker.tbl;
        struct rhlist_head *list = iter->list;
        struct rhashtable *ht = iter->ht;
        struct rhash_head *p = iter->p;
        bool rhlist = ht->rhlist;

        /* The walker has no table attached: nothing to return. */
        if (!tbl)
                return NULL;

        for (; iter->slot < tbl->size; iter->slot++) {
                /* Number of entries in this slot to pass over first. */
                int skip = iter->skip;

                rht_for_each_rcu(p, tbl, iter->slot) {
                        if (rhlist) {
                                /*
                                 * Each element of the duplicate list hanging
                                 * off 'p' counts as one entry for 'skip'.
                                 */
                                list = container_of(p, struct rhlist_head,
                                                    rhead);
                                do {
                                        if (!skip)
                                                goto next;
                                        skip--;
                                        list = rcu_dereference(list->next);
                                } while (list);

                                continue;
                        }
                        if (!skip)
                                break;
                        skip--;
                }

next:
                /*
                 * If the scan stopped on a real entry rather than the nulls
                 * end marker, record the position and return the object.
                 */
                if (!rht_is_a_nulls(p)) {
                        iter->skip++;
                        iter->p = p;
                        iter->list = list;
                        return rht_obj(ht, rhlist ? &list->rhead : p);
                }

                /* Slot exhausted: start the next slot from its first entry. */
                iter->skip = 0;
        }

        iter->p = NULL;

        /* Ensure we see any new tables. */
        smp_rmb();

        /*
         * Every slot of this table has been visited.  Continue with the
         * future table if there is one (reported as -EAGAIN), otherwise
         * mark the walk as finished.
         */
        iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        if (iter->walker.tbl) {
                iter->slot = 0;
                iter->skip = 0;
                return ERR_PTR(-EAGAIN);
        } else {
                iter->end_of_table = true;
        }

        return NULL;
}

/**
 * rhashtable_walk_next - Return the next object and advance the iterator
 * @iter:        Hash table iterator
 *
 * Note that you must call rhashtable_walk_stop when you are finished
 * with the walk.
 *
 * Returns the next object or NULL when the end of the table is reached.
 *
 * Returns -EAGAIN if resize event occurred.  Note that the iterator
 * will rewind back to the beginning and you may continue to use it.
 */
void *rhashtable_walk_next(struct rhashtable_iter *iter)
{
        struct rhlist_head *list = iter->list;
        struct rhashtable *ht = iter->ht;
        struct rhash_head *p = iter->p;
        bool rhlist = ht->rhlist;

        if (p) {
                /*
                 * Continue from the entry returned last time.  For an rhlist
                 * table first try the next element of the same duplicate
                 * list; otherwise, or when that list is exhausted, step to
                 * the next entry of the bucket chain.
                 */
                if (!rhlist || !(list = rcu_dereference(list->next))) {
                        p = rcu_dereference(p->next);
                        list = container_of(p, struct rhlist_head, rhead);
                }
                /* A real entry (not the nulls end marker): return it. */
                if (!rht_is_a_nulls(p)) {
                        iter->skip++;
                        iter->p = p;
                        iter->list = list;
                        return rht_obj(ht, rhlist ? &list->rhead : p);
                }

                /* At the end of this slot, switch to next one and then find
                 * next entry from that point.
                 */
                iter->skip = 0;
                iter->slot++;
        }

        /* No remembered position, or the slot ran out: scan from slot/skip. */
        return __rhashtable_walk_find_next(iter);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_next);

/**
 * rhashtable_walk_peek - Return the next object but don't advance the iterator
 * @iter:        Hash table iterator
 *
 * Returns the next object or NULL when the end of the table is reached.
 *
 * Returns -EAGAIN if resize event occurred.  Note that the iterator
 * will rewind back to the beginning and you may continue to use it.
 */
void *rhashtable_walk_peek(struct rhashtable_iter *iter)
{
        struct rhashtable *ht = iter->ht;
        struct rhash_head *cur = iter->p;

        /* A remembered entry is the answer; the iterator is left untouched. */
        if (cur) {
                if (ht->rhlist)
                        cur = &iter->list->rhead;

                return rht_obj(ht, cur);
        }

        /*
         * Nothing remembered, so search the table.  A non-zero skip refers
         * to the entry after the last one found; step it back by one so the
         * search lands on the current entry.  __rhashtable_walk_find_next()
         * increments skip again when it returns an entry.
         */
        if (iter->skip != 0)
                iter->skip--;

        return __rhashtable_walk_find_next(iter);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_peek);

/**
 * rhashtable_walk_stop - Finish a hash table walk
 * @iter:        Hash table iterator
 *
 * Finish a hash table walk.  Does not reset the iterator to the start of the
 * hash table.
 */
void rhashtable_walk_stop(struct rhashtable_iter *iter)
        __releases(RCU)
{
        struct bucket_table *tbl = iter->walker.tbl;

        if (tbl) {
                struct rhashtable *ht = iter->ht;

                spin_lock(&ht->lock);
                if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu)) {
                        /* This bucket table is being freed, don't re-link it. */
                        iter->walker.tbl = NULL;
                } else {
                        /* Put the walker back on its table's walker list. */
                        list_add(&iter->walker.list, &tbl->walkers);
                }
                spin_unlock(&ht->lock);
        }

        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rhashtable_walk_stop);

/*
 * Pick the initial number of buckets.  With a non-zero nelem_hint this is
 * 4/3 of the hint rounded up to a power of two; without one it is
 * HASH_DEFAULT_SIZE.  Either way the result is at least params->min_size.
 */
static size_t rounded_hashtable_size(const struct rhashtable_params *params)
{
        if (!params->nelem_hint)
                return max(HASH_DEFAULT_SIZE,
                           (unsigned long)params->min_size);

        return max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
                   (unsigned long)params->min_size);
}

/*
 * Adapter that exposes jhash2() with a (key, length, seed) signature taking
 * a const void * key.  @length and @seed are passed through unchanged.
 */
static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
{
        const u32 *words = key;

        return jhash2(words, length, seed);
}

/**
 * rhashtable_init - initialize a new hash table
 * @ht:                hash table to be initialized
 * @params:        configuration parameters
 *
 * Initializes a new hash table based on the provided configuration
 * parameters. A table can be configured either with a variable or
 * fixed length key:
 *
 * Configuration Example 1: Fixed length keys
 * struct test_obj {
 *        int                        key;
 *        void *                        my_member;
 *        struct rhash_head        node;
 * };
 *
 * struct rhashtable_params params = {
 *        .head_offset = offsetof(struct test_obj, node),
 *        .key_offset = offsetof(struct test_obj, key),
 *        .key_len = sizeof(int),
 *        .hashfn = jhash,
 * };
 *
 * Configuration Example 2: Variable length keys
 * struct test_obj {
 *        [...]
 *        struct rhash_head        node;
 * };
 *
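 * u32 my_hash_fn(const void *data, u32 len, u32 seed)
 * {
 *        const struct test_obj *obj = data;
 *
 *        return [... hash ...];
 * }
 *
 * int my_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj)
 * {
 *        return [... 0 if obj matches arg->key, non-zero otherwise ...];
 * }
 *
 * struct rhashtable_params params = {
 *        .head_offset = offsetof(struct test_obj, node),
 *        .hashfn = jhash,
 *        .obj_hashfn = my_hash_fn,
 *        .obj_cmpfn = my_cmp_fn,
 * };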
 */
int rhashtable_init_noprof(struct rhashtable *ht,
                    const struct rhashtable_params *params)
{
        struct bucket_table *tbl;
        size_t size;

        /*
         * A table needs either a fixed key length or an object hash
         * function, and an object hash function must come with an object
         * compare function.
         */
        if ((!params->key_len && !params->obj_hashfn) ||
            (params->obj_hashfn && !params->obj_cmpfn))
                return -EINVAL;

        /* Start from a zeroed table and take a private copy of @params. */
        memset(ht, 0, sizeof(*ht));
        mutex_init(&ht->mutex);
        spin_lock_init(&ht->lock);
        memcpy(&ht->p, params, sizeof(*params));

        alloc_tag_record(ht->alloc_tag);

        if (params->min_size)
                ht->p.min_size = roundup_pow_of_two(params->min_size);

        /* Cap total entries at 2^31 to avoid nelems overflow. */
        ht->max_elems = 1u << 31;

        /* With a max_size, allow at most twice that many elements. */
        if (params->max_size) {
                ht->p.max_size = rounddown_pow_of_two(params->max_size);
                if (ht->p.max_size < ht->max_elems / 2)
                        ht->max_elems = ht->p.max_size * 2;
        }

        ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);

        size = rounded_hashtable_size(&ht->p);

        /*
         * No hash function supplied: default to jhash.  When the key length
         * is a multiple of sizeof(u32), switch to the jhash2 wrapper and
         * store the key length as a count of u32 words instead of bytes.
         */
        ht->key_len = ht->p.key_len;
        if (!params->hashfn) {
                ht->p.hashfn = jhash;

                if (!(ht->key_len & (sizeof(u32) - 1))) {
                        ht->key_len /= sizeof(u32);
                        ht->p.hashfn = rhashtable_jhash2;
                }
        }

        /*
         * This is api initialization and thus we need to guarantee the
         * initial rhashtable allocation. Upon failure, retry with the
         * smallest possible size with __GFP_NOFAIL semantics.
         */
        tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
        if (unlikely(tbl == NULL)) {
                size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
                tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL);
        }

        atomic_set(&ht->nelems, 0);

        RCU_INIT_POINTER(ht->tbl, tbl);

        /* The deferred worker performs grow/shrink/rehash for this table. */
        INIT_WORK(&ht->run_work, rht_deferred_worker);

        return 0;
}
EXPORT_SYMBOL_GPL(rhashtable_init_noprof);

/**
 * rhltable_init - initialize a new hash list table
 * @hlt:        hash list table to be initialized
 * @params:        configuration parameters
 *
 * Initializes a new hash list table.
 *
 * See documentation for rhashtable_init.
 */
int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params)
{
        const int rc = rhashtable_init_noprof(&hlt->ht, params);

        /*
         * Set after the init call, which zeroes the whole structure; the
         * flag is set regardless of the init result.
         */
        hlt->ht.rhlist = true;

        return rc;
}
EXPORT_SYMBOL_GPL(rhltable_init_noprof);

/*
 * Invoke @free_fn (with @arg) for the object that embeds @obj.  For an
 * rhlist table the callback is invoked for every element of the duplicate
 * list that starts at @obj.
 */
static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
                                void (*free_fn)(void *ptr, void *arg),
                                void *arg)
{
        if (ht->rhlist) {
                struct rhlist_head *node;

                node = container_of(obj, struct rhlist_head, rhead);
                do {
                        struct rhlist_head *following;

                        /* Read the successor before the callback runs. */
                        following = rht_dereference(node->next, ht);
                        free_fn(rht_obj(ht, &node->rhead), arg);
                        node = following;
                } while (node);
        } else {
                free_fn(rht_obj(ht, obj), arg);
        }
}

/**
 * rhashtable_free_and_destroy - free elements and destroy hash table
 * @ht:                the hash table to destroy
 * @free_fn:        callback to release resources of element
 * @arg:        pointer passed to free_fn
 *
 * Stops any pending async resize. If defined, invokes free_fn for each
 * element to release resources. Please note that RCU protected
 * readers may still be accessing the elements. Releasing of resources
 * must occur in a compatible manner. Then frees the bucket array.
 *
 * This function may sleep while waiting for an async resize
 * to complete. The caller is responsible for ensuring that no further
 * write operations occur in parallel.
 */
void rhashtable_free_and_destroy(struct rhashtable *ht,
                                 void (*free_fn)(void *ptr, void *arg),
                                 void *arg)
{
        struct bucket_table *tbl, *next_tbl;
        unsigned int i;

        /* Make sure the deferred worker is not running before tearing down. */
        cancel_work_sync(&ht->run_work);

        mutex_lock(&ht->mutex);
        tbl = rht_dereference(ht->tbl, ht);
restart:
        if (free_fn) {
                for (i = 0; i < tbl->size; i++) {
                        struct rhash_head *pos, *next;

                        cond_resched();
                        /*
                         * Walk the bucket chain, reading each entry's
                         * successor before the entry is handed to
                         * rhashtable_free_one().
                         */
                        for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)),
                             next = !rht_is_a_nulls(pos) ?
                                        rht_dereference(pos->next, ht) : NULL;
                             !rht_is_a_nulls(pos);
                             pos = next,
                             next = !rht_is_a_nulls(pos) ?
                                        rht_dereference(pos->next, ht) : NULL)
                                rhashtable_free_one(ht, pos, free_fn, arg);
                }
        }

        /* Free this table, then repeat for any table chained behind it. */
        next_tbl = rht_dereference(tbl->future_tbl, ht);
        bucket_table_free(tbl);
        if (next_tbl) {
                tbl = next_tbl;
                goto restart;
        }
        mutex_unlock(&ht->mutex);
}
EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);

/**
 * rhashtable_destroy - destroy hash table without a per-element callback
 * @ht:                the hash table to destroy
 *
 * Calls rhashtable_free_and_destroy() with a NULL free_fn, so no callback
 * is invoked for the elements; only the bucket tables are freed.
 */
void rhashtable_destroy(struct rhashtable *ht)
{
        /*
         * Plain call: ISO C does not allow "return <expression>;" in a
         * function returning void, even when the expression is itself void.
         */
        rhashtable_free_and_destroy(ht, NULL, NULL);
}
EXPORT_SYMBOL_GPL(rhashtable_destroy);

/*
 * Look up the bucket for @hash in a nested bucket table.
 *
 * The low tbl->nest bits of @hash index the top-level table.  Each further
 * level consumes another 'shift' bits, where 'shift' is
 * PAGE_SHIFT - ilog2(sizeof(void *)).  Returns a pointer to the bucket, or
 * NULL when a table on the path is missing.
 */
struct rhash_lock_head __rcu **__rht_bucket_nested(
        const struct bucket_table *tbl, unsigned int hash)
{
        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
        unsigned int index = hash & ((1 << tbl->nest) - 1);
        unsigned int size = tbl->size >> tbl->nest;
        unsigned int subhash = hash;
        union nested_table *ntbl;

        /* First level: indexed by the low tbl->nest bits of the hash. */
        ntbl = nested_table_top(tbl);
        ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
        subhash >>= tbl->nest;

        /* Descend while the remaining size needs more than one level. */
        while (ntbl && size > (1 << shift)) {
                index = subhash & ((1 << shift) - 1);
                ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
                                                  tbl, hash);
                size >>= shift;
                subhash >>= shift;
        }

        if (!ntbl)
                return NULL;

        /* The remaining hash bits select the bucket in the last level. */
        return &ntbl[subhash].bucket;

}
EXPORT_SYMBOL_GPL(__rht_bucket_nested);

/*
 * Like __rht_bucket_nested(), but never returns NULL: when no bucket exists
 * for @hash, a pointer to a function-static placeholder head is returned
 * instead.  The placeholder is set up with INIT_RHT_NULLS_HEAD() the first
 * time it is found to be NULL.
 */
struct rhash_lock_head __rcu **rht_bucket_nested(
        const struct bucket_table *tbl, unsigned int hash)
{
        static struct rhash_lock_head __rcu *rhnull;

        if (!rhnull)
                INIT_RHT_NULLS_HEAD(rhnull);
        return __rht_bucket_nested(tbl, hash) ?: &rhnull;
}
EXPORT_SYMBOL_GPL(rht_bucket_nested);

/*
 * Insert-path variant of the nested bucket lookup: follows the same path as
 * __rht_bucket_nested(), but obtains every level through
 * nested_table_alloc().  Returns a pointer to the bucket, or NULL when
 * nested_table_alloc() returns NULL for a level.
 */
struct rhash_lock_head __rcu **rht_bucket_nested_insert(
        struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
{
        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
        unsigned int index = hash & ((1 << tbl->nest) - 1);
        unsigned int size = tbl->size >> tbl->nest;
        union nested_table *ntbl;

        ntbl = nested_table_top(tbl);
        hash >>= tbl->nest;
        /*
         * The last argument is true when the remaining size fits in a
         * single level - presumably marking the leaf level; confirm
         * against nested_table_alloc().
         */
        ntbl = nested_table_alloc(ht, &ntbl[index].table,
                                  size <= (1 << shift));

        /* Descend one level per 'shift' bits of the remaining hash. */
        while (ntbl && size > (1 << shift)) {
                index = hash & ((1 << shift) - 1);
                size >>= shift;
                hash >>= shift;
                ntbl = nested_table_alloc(ht, &ntbl[index].table,
                                          size <= (1 << shift));
        }

        if (!ntbl)
                return NULL;

        /* The remaining hash bits select the bucket in the last level. */
        return &ntbl[hash].bucket;

}
EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);

























































    1 


































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_UNALIGNED_H
#define __ASM_GENERIC_UNALIGNED_H

/*
 * This is the most generic implementation of unaligned accesses
 * and should work almost anywhere.
 */
#include <linux/unaligned/packed_struct.h>
#include <asm/byteorder.h>

/*
 * __get_unaligned_t - read a value of @type from @ptr.
 *
 * The access goes through a pointer to a single-member __packed struct.
 * The macro is a statement expression whose value is the member read.
 */
#define __get_unaligned_t(type, ptr) ({                                                \
        const struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr);        \
        __pptr->x;                                                                \
})

/*
 * __put_unaligned_t - store @val as a value of @type at @ptr, using the
 * same single-member __packed struct access as __get_unaligned_t().
 */
#define __put_unaligned_t(type, val, ptr) do {                                        \
        struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr);                \
        __pptr->x = (val);                                                        \
} while (0)

/* Typed front ends: the accessed type is taken from what @ptr points to. */
#define get_unaligned(ptr)        __get_unaligned_t(typeof(*(ptr)), (ptr))
#define put_unaligned(val, ptr) __put_unaligned_t(typeof(*(ptr)), (val), (ptr))

static inline u16 get_unaligned_le16(const void *p)
{
        return le16_to_cpu(__get_unaligned_t(__le16, p));
}

static inline u32 get_unaligned_le32(const void *p)
{
        return le32_to_cpu(__get_unaligned_t(__le32, p));
}

static inline u64 get_unaligned_le64(const void *p)
{
        return le64_to_cpu(__get_unaligned_t(__le64, p));
}

/* Store @val at @p as a little-endian 16-bit value. */
static inline void put_unaligned_le16(u16 val, void *p)
{
        const __le16 wire = cpu_to_le16(val);

        __put_unaligned_t(__le16, wire, p);
}

/* Store @val at @p as a little-endian 32-bit value. */
static inline void put_unaligned_le32(u32 val, void *p)
{
        const __le32 wire = cpu_to_le32(val);

        __put_unaligned_t(__le32, wire, p);
}

/* Store @val at @p as a little-endian 64-bit value. */
static inline void put_unaligned_le64(u64 val, void *p)
{
        const __le64 wire = cpu_to_le64(val);

        __put_unaligned_t(__le64, wire, p);
}

static inline u16 get_unaligned_be16(const void *p)
{
        return be16_to_cpu(__get_unaligned_t(__be16, p));
}

static inline u32 get_unaligned_be32(const void *p)
{
        return be32_to_cpu(__get_unaligned_t(__be32, p));
}

static inline u64 get_unaligned_be64(const void *p)
{
        return be64_to_cpu(__get_unaligned_t(__be64, p));
}

/* Store @val at @p as a big-endian 16-bit value. */
static inline void put_unaligned_be16(u16 val, void *p)
{
        const __be16 wire = cpu_to_be16(val);

        __put_unaligned_t(__be16, wire, p);
}

/* Store @val at @p as a big-endian 32-bit value. */
static inline void put_unaligned_be32(u32 val, void *p)
{
        const __be32 wire = cpu_to_be32(val);

        __put_unaligned_t(__be32, wire, p);
}

/* Store @val at @p as a big-endian 64-bit value. */
static inline void put_unaligned_be64(u64 val, void *p)
{
        const __be64 wire = cpu_to_be64(val);

        __put_unaligned_t(__be64, wire, p);
}

/* Assemble a 24-bit value from three bytes, most significant byte first. */
static inline u32 __get_unaligned_be24(const u8 *p)
{
        u32 value = p[0] << 16;

        value |= p[1] << 8;
        value |= p[2];

        return value;
}

/* Untyped-pointer front end for __get_unaligned_be24(). */
static inline u32 get_unaligned_be24(const void *p)
{
        const u8 *bytes = p;

        return __get_unaligned_be24(bytes);
}

/* Assemble a 24-bit value from three bytes, least significant byte first. */
static inline u32 __get_unaligned_le24(const u8 *p)
{
        u32 value = p[0];

        value |= p[1] << 8;
        value |= p[2] << 16;

        return value;
}

/* Untyped-pointer front end for __get_unaligned_le24(). */
static inline u32 get_unaligned_le24(const void *p)
{
        const u8 *bytes = p;

        return __get_unaligned_le24(bytes);
}

/* Write the low 24 bits of @val to p[0..2], most significant byte first. */
static inline void __put_unaligned_be24(const u32 val, u8 *p)
{
        p[0] = (val >> 16) & 0xff;
        p[1] = (val >> 8) & 0xff;
        p[2] = val & 0xff;
}

/* Untyped-pointer front end for __put_unaligned_be24(). */
static inline void put_unaligned_be24(const u32 val, void *p)
{
        u8 *bytes = p;

        __put_unaligned_be24(val, bytes);
}

/* Write the low 24 bits of @val to p[0..2], least significant byte first. */
static inline void __put_unaligned_le24(const u32 val, u8 *p)
{
        p[0] = val & 0xff;
        p[1] = (val >> 8) & 0xff;
        p[2] = (val >> 16) & 0xff;
}

/* Untyped-pointer front end for __put_unaligned_le24(). */
static inline void put_unaligned_le24(const u32 val, void *p)
{
        u8 *bytes = p;

        __put_unaligned_le24(val, bytes);
}

/* Write the low 48 bits of @val to p[0..5], most significant byte first. */
static inline void __put_unaligned_be48(const u64 val, u8 *p)
{
        p[0] = (val >> 40) & 0xff;
        p[1] = (val >> 32) & 0xff;
        p[2] = (val >> 24) & 0xff;
        p[3] = (val >> 16) & 0xff;
        p[4] = (val >> 8) & 0xff;
        p[5] = val & 0xff;
}

/* Untyped-pointer front end for __put_unaligned_be48(). */
static inline void put_unaligned_be48(const u64 val, void *p)
{
        u8 *bytes = p;

        __put_unaligned_be48(val, bytes);
}

static inline u64 __get_unaligned_be48(const u8 *p)
{
        return (u64)p[0] << 40 | (u64)p[1] << 32 | (u64)p[2] << 24 |
                p[3] << 16 | p[4] << 8 | p[5];
}

static inline u64 get_unaligned_be48(const void *p)
{
        return __get_unaligned_be48(p);
}

#endif /* __ASM_GENERIC_UNALIGNED_H */































































































    2 

    2 



































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
// SPDX-License-Identifier: GPL-2.0
/*
 * SafeSetID Linux Security Module
 *
 * Author: Micah Morton <mortonm@chromium.org>
 *
 * Copyright (C) 2018 The Chromium OS Authors.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2, as
 * published by the Free Software Foundation.
 *
 */

#define pr_fmt(fmt) "SafeSetID: " fmt

#include <linux/lsm_hooks.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <uapi/linux/lsm.h>
#include "lsm.h"

/*
 * Flag indicating whether initialization completed; set to 1 at the end of
 * safesetid_security_init().
 */
int safesetid_initialized __initdata;

/*
 * Active rulesets, one per ID type.  setid_policy_lookup() reads them with
 * rcu_dereference() under rcu_read_lock() and treats a NULL pointer as
 * "no policy".
 */
struct setid_ruleset __rcu *safesetid_setuid_rules;
struct setid_ruleset __rcu *safesetid_setgid_rules;


/*
 * Compute a decision for a transition from @src to @dst under @policy.
 *
 * Only rules in the hash bucket selected by @src whose source ID equals @src
 * are considered:
 *   SIDPOL_ALLOWED     - one of those rules also names @dst;
 *   SIDPOL_CONSTRAINED - @src has rules but none names @dst, or
 *                        @policy->type is neither UID nor GID;
 *   SIDPOL_DEFAULT     - no rule has @src as its source.
 */
enum sid_policy_type _setid_policy_lookup(struct setid_ruleset *policy,
                kid_t src, kid_t dst)
{
        struct setid_rule *entry;
        enum sid_policy_type verdict = SIDPOL_DEFAULT;

        switch (policy->type) {
        case UID:
                hash_for_each_possible(policy->rules, entry, next, __kuid_val(src.uid)) {
                        if (uid_eq(entry->src_id.uid, src.uid)) {
                                if (uid_eq(entry->dst_id.uid, dst.uid))
                                        return SIDPOL_ALLOWED;
                                /* @src is covered by the policy, but not for @dst (so far). */
                                verdict = SIDPOL_CONSTRAINED;
                        }
                }
                break;
        case GID:
                hash_for_each_possible(policy->rules, entry, next, __kgid_val(src.gid)) {
                        if (gid_eq(entry->src_id.gid, src.gid)) {
                                if (gid_eq(entry->dst_id.gid, dst.gid))
                                        return SIDPOL_ALLOWED;
                                /* @src is covered by the policy, but not for @dst (so far). */
                                verdict = SIDPOL_CONSTRAINED;
                        }
                }
                break;
        default:
                /* Should not reach here, report the ID as constrained */
                verdict = SIDPOL_CONSTRAINED;
                break;
        }

        return verdict;
}

/*
 * Compute a decision for a transition from @src to @dst under the active
 * policy for @new_type.
 *
 * Returns SIDPOL_DEFAULT when no ruleset of that type is installed,
 * SIDPOL_CONSTRAINED when @new_type is neither UID nor GID, and otherwise
 * whatever _setid_policy_lookup() decides.  The ruleset pointer is only used
 * inside the RCU read-side critical section.
 */
static enum sid_policy_type setid_policy_lookup(kid_t src, kid_t dst, enum setid_type new_type)
{
        enum sid_policy_type verdict = SIDPOL_DEFAULT;
        struct setid_ruleset *ruleset;

        rcu_read_lock();

        switch (new_type) {
        case UID:
                ruleset = rcu_dereference(safesetid_setuid_rules);
                break;
        case GID:
                ruleset = rcu_dereference(safesetid_setgid_rules);
                break;
        default:
                /* Should not reach here */
                rcu_read_unlock();
                return SIDPOL_CONSTRAINED;
        }

        if (ruleset) {
                /*
                 * NOTE(review): this stores to the shared ruleset while only
                 * the RCU read lock is held; confirm that concurrent lookups
                 * can only ever write the same value here.
                 */
                ruleset->type = new_type;
                verdict = _setid_policy_lookup(ruleset, src, dst);
        }

        rcu_read_unlock();
        return verdict;
}

/*
 * "capable" hook: restrict what CAP_SETUID/CAP_SETGID may be used for by
 * tasks whose real UID/GID is covered by a policy.
 *
 * Returns 0 to let the capability check proceed, or -EPERM (after logging)
 * when the capability is requested outside a set*id()/setgroups() call by a
 * task whose real ID has policy entries.  @ns is not consulted.
 */
static int safesetid_security_capable(const struct cred *cred,
                                      struct user_namespace *ns,
                                      int cap,
                                      unsigned int opts)
{
        /* We're only interested in CAP_SETUID and CAP_SETGID. */
        if (cap != CAP_SETUID && cap != CAP_SETGID)
                return 0;

        /*
         * If CAP_SET{U/G}ID is currently used for a setid or setgroups
         * syscall, let it go through here; the real security check happens
         * later, in the task_fix_set{u/g}id or task_fix_setgroups hooks.
         */
        if (opts & CAP_OPT_INSETID)
                return 0;

        if (cap == CAP_SETUID) {
                /*
                 * If no policy applies to this task, allow the use of
                 * CAP_SETUID for other purposes.
                 */
                if (setid_policy_lookup((kid_t){.uid = cred->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT)
                        return 0;

                /*
                 * Reject use of CAP_SETUID for functionality other than
                 * calling set*uid() (e.g. setting up userns uid mappings).
                 */
                pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions\n",
                        __kuid_val(cred->uid));
                return -EPERM;
        }

        /* Only CAP_SETGID can reach this point. */

        /*
         * If no policy applies to this task, allow the use of CAP_SETGID for
         * other purposes.
         */
        if (setid_policy_lookup((kid_t){.gid = cred->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT)
                return 0;

        /*
         * Reject use of CAP_SETGID for functionality other than calling
         * set*gid() (e.g. setting up userns gid mappings).
         */
        pr_warn("Operation requires CAP_SETGID, which is not available to GID %u for operations besides approved set*gid transitions\n",
                __kgid_val(cred->gid));
        return -EPERM;
}

/*
 * Check whether a caller with old credentials @old is allowed to switch to
 * credentials that contain @new_id.
 *
 * @old:      credentials held before the change
 * @new_id:   candidate ID; .uid is read when @new_type is UID, .gid when GID
 * @new_type: ID space that @new_id belongs to
 *
 * Returns true if @new_id already appears among the old real/effective/saved
 * IDs of that type, or if the active policy does not report the transition
 * from the old real ID to @new_id as SIDPOL_CONSTRAINED.  Returns false
 * (after logging) for a constrained transition, and false for an unknown
 * @new_type.
 */
static bool id_permitted_for_cred(const struct cred *old, kid_t new_id, enum setid_type new_type)
{
        kid_t old_id;
        bool permitted;

        /* If our old creds already had this ID in it, it's fine. */
        if (new_type == UID) {
                if (uid_eq(new_id.uid, old->uid) || uid_eq(new_id.uid, old->euid) ||
                    uid_eq(new_id.uid, old->suid))
                        return true;
                old_id = (kid_t){.uid = old->uid};
        } else if (new_type == GID) {
                if (gid_eq(new_id.gid, old->gid) || gid_eq(new_id.gid, old->egid) ||
                    gid_eq(new_id.gid, old->sgid))
                        return true;
                /*
                 * The GID ruleset is searched by source GID, so the lookup
                 * must start from the old real GID, not the old real UID.
                 */
                old_id = (kid_t){.gid = old->gid};
        } else {
                /* Error, new_type is an invalid type */
                return false;
        }

        /*
         * Transitions to new IDs require a check against the policy of the
         * old real ID of the same type.
         */
        permitted = setid_policy_lookup(old_id, new_id, new_type) != SIDPOL_CONSTRAINED;

        if (!permitted) {
                /* Invalid types returned above, so this is either UID or GID. */
                if (new_type == UID) {
                        pr_warn("UID transition ((%u,%u,%u) -> %u) blocked\n",
                                __kuid_val(old->uid), __kuid_val(old->euid),
                                __kuid_val(old->suid), __kuid_val(new_id.uid));
                } else {
                        pr_warn("GID transition ((%u,%u,%u) -> %u) blocked\n",
                                __kgid_val(old->gid), __kgid_val(old->egid),
                                __kgid_val(old->sgid), __kgid_val(new_id.gid));
                }
        }
        return permitted;
}

/*
 * "task_fix_setuid" hook.
 *
 * If the old real UID has no policy entries, nothing is enforced.  Otherwise
 * each of the new real, effective, saved and filesystem UIDs must pass
 * id_permitted_for_cred(); the checks run in that order and stop at the first
 * failure.  @flags is not consulted.
 *
 * Returns 0 when the change is acceptable, or -EACCES after sending SIGKILL
 * to the current task.
 */
static int safesetid_task_fix_setuid(struct cred *new,
                                     const struct cred *old,
                                     int flags)
{
        /* Do nothing if there are no setuid restrictions for our old RUID. */
        if (setid_policy_lookup((kid_t){.uid = old->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT)
                return 0;

        if (!id_permitted_for_cred(old, (kid_t){.uid = new->uid}, UID) ||
            !id_permitted_for_cred(old, (kid_t){.uid = new->euid}, UID) ||
            !id_permitted_for_cred(old, (kid_t){.uid = new->suid}, UID) ||
            !id_permitted_for_cred(old, (kid_t){.uid = new->fsuid}, UID)) {
                /*
                 * Kill this process to avoid potential security
                 * vulnerabilities that could arise from a missing allowlist
                 * entry preventing a privileged process from dropping to a
                 * lesser-privileged one.
                 */
                force_sig(SIGKILL);
                return -EACCES;
        }

        return 0;
}

/*
 * "task_fix_setgid" hook.
 *
 * If the old real GID has no policy entries, nothing is enforced.  Otherwise
 * each of the new real, effective, saved and filesystem GIDs must pass
 * id_permitted_for_cred(); the checks run in that order and stop at the first
 * failure.  @flags is not consulted.
 *
 * Returns 0 when the change is acceptable, or -EACCES after sending SIGKILL
 * to the current task.
 */
static int safesetid_task_fix_setgid(struct cred *new,
                                     const struct cred *old,
                                     int flags)
{
        /* Do nothing if there are no setgid restrictions for our old RGID. */
        if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT)
                return 0;

        if (!id_permitted_for_cred(old, (kid_t){.gid = new->gid}, GID) ||
            !id_permitted_for_cred(old, (kid_t){.gid = new->egid}, GID) ||
            !id_permitted_for_cred(old, (kid_t){.gid = new->sgid}, GID) ||
            !id_permitted_for_cred(old, (kid_t){.gid = new->fsgid}, GID)) {
                /*
                 * Kill this process to avoid potential security
                 * vulnerabilities that could arise from a missing allowlist
                 * entry preventing a privileged process from dropping to a
                 * lesser-privileged one.
                 */
                force_sig(SIGKILL);
                return -EACCES;
        }

        return 0;
}

static int safesetid_task_fix_setgroups(struct cred *new, const struct cred *old)
{
        int i;

        /* Do nothing if there are no setgid restrictions for our old RGID. */
        if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT)
                return 0;

        get_group_info(new->group_info);
        for (i = 0; i < new->group_info->ngroups; i++) {
                if (!id_permitted_for_cred(old, (kid_t){.gid = new->group_info->gid[i]}, GID)) {
                        put_group_info(new->group_info);
                        /*
                         * Kill this process to avoid potential security vulnerabilities
                         * that could arise from a missing allowlist entry preventing a
                         * privileged process from dropping to a lesser-privileged one.
                         */
                        force_sig(SIGKILL);
                        return -EACCES;
                }
        }

        put_group_info(new->group_info);
        return 0;
}

/* LSM identity handed to security_add_hooks() in safesetid_security_init(). */
static const struct lsm_id safesetid_lsmid = {
        .name = "safesetid",
        .id = LSM_ID_SAFESETID,
};

/*
 * Hook table registered by safesetid_security_init(): the three task_fix_*
 * hooks check ID changes against the policy, and the capable hook limits
 * other uses of CAP_SETUID/CAP_SETGID.
 */
static struct security_hook_list safesetid_security_hooks[] = {
        LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid),
        LSM_HOOK_INIT(task_fix_setgid, safesetid_task_fix_setgid),
        LSM_HOOK_INIT(task_fix_setgroups, safesetid_task_fix_setgroups),
        LSM_HOOK_INIT(capable, safesetid_security_capable)
};

/*
 * LSM init callback: register the SafeSetID hook table and record that
 * initialization ran.  Always returns 0.
 */
static int __init safesetid_security_init(void)
{
        const int nr_hooks = ARRAY_SIZE(safesetid_security_hooks);

        security_add_hooks(safesetid_security_hooks, nr_hooks, &safesetid_lsmid);

        /* Report that SafeSetID successfully initialized */
        safesetid_initialized = 1;
        return 0;
}

/* LSM descriptor: names this module "safesetid" and points at its init callback. */
DEFINE_LSM(safesetid_security_init) = {
        .init = safesetid_security_init,
        .name = "safesetid",
};




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2020 ARM Ltd.
 */
#ifndef __ASM_VDSO_PROCESSOR_H
#define __ASM_VDSO_PROCESSOR_H

#ifndef __ASSEMBLY__

/*
 * REP NOP (PAUSE) is a good thing to insert into busy-wait loops.
 *
 * The "memory" clobber tells the compiler that memory may change across the
 * instruction, so values are not kept cached in registers around it.
 */
static __always_inline void rep_nop(void)
{
        asm volatile("rep; nop" ::: "memory");
}

/* cpu_relax() is implemented here as a single rep_nop(). */
static __always_inline void cpu_relax(void)
{
        rep_nop();
}

/* Forward declaration: only a pointer to this struct is used below. */
struct getcpu_cache;

/* Prototype only; this header does not define __vdso_getcpu(). */
notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused);

#endif /* __ASSEMBLY__ */

#endif /* __ASM_VDSO_PROCESSOR_H */




















































































































































































































































































    2 























    2 


    3 

























































    2 




    1 



































































































































































































































































































































    1 


    1 


























    1 



































    1 









    2 






    1 



    2 






    2 
    3 


    3 





    2 
    3 


















































































































































































































































































    1 







    2 









































































































































































    1 
















    3 




















































    2 








































































    1 



























    2 







    2 



























    1 

























    3 



























    1 


    1 








































































































    2 






































































    3 











































































































































































































































































































































































    2 







    1 
    2 

























    3 
    3 

    3 












    2 












    3 




























    1 
    2 










































































    2 












































































































































    3 













    3 






















































































































































































































































    1 




    1 

    1 




    1 


    1 






































































    3 


















    2 









































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the TCP module.
 *
 * Version:        @(#)tcp.h        1.0.5        05/23/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _TCP_H
#define _TCP_H

#define FASTRETRANS_DEBUG 1

#include <linux/list.h>
#include <linux/tcp.h>
#include <linux/bug.h>
#include <linux/slab.h>
#include <linux/cache.h>
#include <linux/percpu.h>
#include <linux/skbuff.h>
#include <linux/kref.h>
#include <linux/ktime.h>
#include <linux/indirect_call_wrapper.h>

#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
#include <net/inet_hashtables.h>
#include <net/checksum.h>
#include <net/request_sock.h>
#include <net/sock_reuseport.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/tcp_states.h>
#include <net/tcp_ao.h>
#include <net/inet_ecn.h>
#include <net/dst.h>
#include <net/mptcp.h>

#include <linux/seq_file.h>
#include <linux/memcontrol.h>
#include <linux/bpf-cgroup.h>
#include <linux/siphash.h>

extern struct inet_hashinfo tcp_hashinfo;

DECLARE_PER_CPU(unsigned int, tcp_orphan_count);
int tcp_orphan_count_sum(void);

DECLARE_PER_CPU(u32, tcp_tw_isn);

void tcp_time_wait(struct sock *sk, int state, int timeo);

/*
 * Header and option sizing constants.  With the values below,
 * TCP_MIN_GSO_SIZE evaluates to 48 - 40 = 8.
 */
#define MAX_TCP_HEADER        L1_CACHE_ALIGN(128 + MAX_HEADER)
#define MAX_TCP_OPTION_SPACE 40
#define TCP_MIN_SND_MSS                48
#define TCP_MIN_GSO_SIZE        (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE)

/*
 * Never offer a window over 32767 without using window scaling. Some
 * poor stacks do signed 16bit maths!
 */
#define MAX_TCP_WINDOW                32767U

/* Minimal accepted MSS. It is (60+60+8) - (20+20) = 88. */
#define TCP_MIN_MSS                88U

/* The initial MTU to use for probing */
#define TCP_BASE_MSS                1024

/* Probing interval, defaults to 10 minutes (600) as per RFC4821 */
#define TCP_PROBE_INTERVAL        600

/* Specify interval when tcp mtu probing will stop */
#define TCP_PROBE_THRESHOLD        8

/* After receiving this amount of duplicate ACKs fast retransmit starts. */
#define TCP_FASTRETRANS_THRESH 3

/* Maximal number of ACKs sent quickly to accelerate slow-start. */
#define TCP_MAX_QUICKACKS        16U

/* Maximal window scale value according to RFC1323 */
#define TCP_MAX_WSCALE                14U

/* urg_data states: three distinct bits, all above the low byte */
#define TCP_URG_VALID        0x0100
#define TCP_URG_NOTYET        0x0200
#define TCP_URG_READ        0x0400

#define TCP_RETR1        3        /*
                                 * This is how many retries it does before it
                                 * tries to figure out if the gateway is
                                 * down. Minimal RFC value is 3; it corresponds
                                 * to ~3sec-8min depending on RTO.
                                 */

#define TCP_RETR2        15        /*
                                 * This should take at least
                                 * 90 minutes to time out.
                                 * RFC1122 says that the limit is 100 sec.
                                 * 15 is ~13-30min depending on RTO.
                                 */

#define TCP_SYN_RETRIES         6        /* This is how many retries are done
                                 * when active opening a connection.
                                 * RFC1122 says the minimum retry MUST
                                 * be at least 180secs.  Nevertheless
                                 * this value corresponds to
                                 * 63secs of retransmission with the
                                 * current initial RTO.
                                 */

#define TCP_SYNACK_RETRIES 5        /* This is how many retries are done
                                 * when passive opening a connection.
                                 * This corresponds to 31secs of
                                 * retransmission with the current
                                 * initial RTO.
                                 */

/* Timer constants. Values built from HZ are in jiffies (HZ jiffies = 1 second). */
#define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT
                                  * state, about 60 seconds        */
#define TCP_FIN_TIMEOUT        TCP_TIMEWAIT_LEN
                                 /* BSD style FIN_WAIT2 deadlock breaker.
                                  * It used to be 3min, new value is 60sec,
                                  * to combine FIN-WAIT-2 timeout with
                                  * TIME-WAIT timer.
                                  */
#define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */

#define TCP_DELACK_MAX        ((unsigned)(HZ/5))        /* maximal time to delay before sending an ACK */
/* TCP_DELACK_MAX must be representable in ATO_BITS bits. */
static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);

/* With HZ < 100, HZ/25 would be below 4 jiffies, so a fixed 4 is used instead. */
#if HZ >= 100
#define TCP_DELACK_MIN        ((unsigned)(HZ/25))        /* minimal time to delay before sending an ACK */
#define TCP_ATO_MIN        ((unsigned)(HZ/25))
#else
#define TCP_DELACK_MIN        4U
#define TCP_ATO_MIN        4U
#endif
#define TCP_RTO_MAX        ((unsigned)(120*HZ))        /* 120 seconds */
#define TCP_RTO_MIN        ((unsigned)(HZ/5))        /* 1/5 of a second */
#define TCP_TIMEOUT_MIN        (2U) /* Min timeout for TCP timers in jiffies */

#define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */

#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))        /* RFC6298 2.1 initial RTO value        */
#define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ))        /* RFC 1122 initial RTO value, now
                                                 * used as a fallback RTO for the
                                                 * initial data transmission if no
                                                 * valid RTT sample has been acquired,
                                                 * most likely due to retrans in 3WHS.
                                                 */

#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
                                                         * for local resources.
                                                         */
#define TCP_KEEPALIVE_TIME        (120*60*HZ)        /* two hours */
#define TCP_KEEPALIVE_PROBES        9                /* Max of 9 keepalive probes        */
#define TCP_KEEPALIVE_INTVL        (75*HZ)                /* 75 seconds */

/* Upper bounds; NOTE(review): presumably applied to the matching
 * keepalive/SYN-count socket options - the checks are not visible here.
 */
#define MAX_TCP_KEEPIDLE        32767
#define MAX_TCP_KEEPINTVL        32767
#define MAX_TCP_KEEPCNT                127
#define MAX_TCP_SYNCNT                127

/* Ensure that TCP PAWS checks are relaxed after ~2147 seconds
 * to avoid overflows. This assumes a clock no faster than 1 MHz.
 * Default clock is 1 kHz, tcp_usec_ts uses 1 MHz.
 * (INT_MAX / USEC_PER_SEC evaluates to 2147.)
 */
#define TCP_PAWS_WRAP (INT_MAX / USEC_PER_SEC)

#define TCP_PAWS_MSL        60                /* Per-host timestamps are invalidated
                                         * after this time. It should be equal to
                                         * (or greater than) TCP_TIMEWAIT_LEN
                                         * to provide reliability equal to the one
                                         * provided by timewait state.
                                         */
#define TCP_PAWS_WINDOW        1                /* Replay window for per-host
                                         * timestamps. It must be less than
                                         * minimal timewait lifetime.
                                         */
/*
 *        TCP option kinds
 */

#define TCPOPT_NOP                1        /* Padding */
#define TCPOPT_EOL                0        /* End of options */
#define TCPOPT_MSS                2        /* Segment size negotiating */
#define TCPOPT_WINDOW                3        /* Window scaling */
#define TCPOPT_SACK_PERM        4       /* SACK Permitted */
#define TCPOPT_SACK             5       /* SACK Block */
#define TCPOPT_TIMESTAMP        8        /* Better RTT estimations/PAWS */
#define TCPOPT_MD5SIG                19        /* MD5 Signature (RFC2385) */
#define TCPOPT_AO                29        /* Authentication Option (RFC5925) */
#define TCPOPT_MPTCP                30        /* Multipath TCP (RFC6824) */
#define TCPOPT_FASTOPEN                34        /* Fast open (RFC7413) */
#define TCPOPT_EXP                254        /* Experimental */
/* Magic numbers that follow the option value when sharing the TCP
 * experimental option. See draft-ietf-tcpm-experimental-options-00.txt
 */
#define TCPOPT_FASTOPEN_MAGIC        0xF989
#define TCPOPT_SMC_MAGIC        0xE2D4C3D9

/*
 *     TCP option lengths
 */

#define TCPOLEN_MSS            4
#define TCPOLEN_WINDOW         3
#define TCPOLEN_SACK_PERM      2
#define TCPOLEN_TIMESTAMP      10
#define TCPOLEN_MD5SIG         18
#define TCPOLEN_FASTOPEN_BASE  2
#define TCPOLEN_EXP_FASTOPEN_BASE  4
#define TCPOLEN_EXP_SMC_BASE   6

/* But this is what stacks really send out (each value below is a
 * multiple of 4, except the per-option base TCPOLEN_SACK_BASE).
 */
#define TCPOLEN_TSTAMP_ALIGNED                12
#define TCPOLEN_WSCALE_ALIGNED                4
#define TCPOLEN_SACKPERM_ALIGNED        4
#define TCPOLEN_SACK_BASE                2
#define TCPOLEN_SACK_BASE_ALIGNED        4
#define TCPOLEN_SACK_PERBLOCK                8
#define TCPOLEN_MD5SIG_ALIGNED                20
#define TCPOLEN_MSS_ALIGNED                4
#define TCPOLEN_EXP_SMC_BASE_ALIGNED        8

/* Flags in tp->nonagle (distinct bits, may be OR-ed together) */
#define TCP_NAGLE_OFF                1        /* Nagle's algo is disabled */
#define TCP_NAGLE_CORK                2        /* Socket is corked            */
#define TCP_NAGLE_PUSH                4        /* Cork is overridden for already queued data */

/* TCP thin-stream limits */
#define TCP_THIN_LINEAR_RETRIES 6       /* After 6 linear retries, do exp. backoff */

/* TCP initial congestion window as per rfc6928 */
#define TCP_INIT_CWND                10

/* Bit Flags for sysctl_tcp_fastopen */
#define        TFO_CLIENT_ENABLE        1
#define        TFO_SERVER_ENABLE        2
#define        TFO_CLIENT_NO_COOKIE        4        /* Data in SYN w/o cookie option */

/* Accept SYN data w/o any cookie option */
#define        TFO_SERVER_COOKIE_NOT_REQD        0x200

/* Force enable TFO on all listeners, i.e., not requiring the
 * TCP_FASTOPEN socket option.
 */
#define        TFO_SERVER_WO_SOCKOPT1        0x400


/* sysctl variables for tcp (declared here, defined elsewhere) */
extern int sysctl_tcp_max_orphans;
extern long sysctl_tcp_mem[3];

/* RACK option bits (distinct bits, may be OR-ed together) */
#define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
#define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */
#define TCP_RACK_NO_DUPTHRESH    0x4 /* Do not use DUPACK threshold in RACK */

/* Memory accounting state shared by all TCP sockets; declared here only. */
extern atomic_long_t tcp_memory_allocated;
DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);

extern struct percpu_counter tcp_sockets_allocated;
/* Read with READ_ONCE() by tcp_under_memory_pressure(); non-zero means pressure. */
extern unsigned long tcp_memory_pressure;

/* TCP-specific, optimized variant of sk_under_memory_pressure().
 *
 * Returns true when the socket's memory cgroup reports socket pressure
 * (only checked if memcg socket accounting is enabled and @sk has a memcg),
 * otherwise reports whether the global tcp_memory_pressure is non-zero.
 */
static inline bool tcp_under_memory_pressure(const struct sock *sk)
{
        const bool memcg_pressure = mem_cgroup_sockets_enabled &&
                                    sk->sk_memcg &&
                                    mem_cgroup_under_socket_pressure(sk->sk_memcg);

        /* The global flag is only consulted when the memcg is not under pressure. */
        return memcg_pressure || READ_ONCE(tcp_memory_pressure);
}
/*
 * Sequence-number comparison helpers for 32 bit unsigned values.
 * Wraparound is handled by the unsigned subtraction: the difference is
 * reinterpreted as signed, so "before" means "less than half the sequence
 * space behind".
 */

/* Returns true if seq1 precedes seq2 in wrapping 32 bit sequence space. */
static inline bool before(__u32 seq1, __u32 seq2)
{
        const __s32 distance = (__s32)(seq1 - seq2);

        return distance < 0;
}
/* after(a, b) is true exactly when before(b, a) is. */
#define after(seq2, seq1)         before(seq1, seq2)

/* Returns true if seq2 <= seq1 <= seq3 in wrapping 32 bit sequence space.
 * Both offsets are taken relative to seq2 so unsigned wraparound cancels out.
 */
static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3)
{
        const __u32 span = seq3 - seq2;
        const __u32 offset = seq1 - seq2;

        return span >= offset;
}

/* Release @skb and give back the write-memory accounting held for it on @sk.
 *
 * sk_wmem_queued is always reduced by skb->truesize. The amount uncharged
 * depends on skb_zcopy_pure(): for such skbs it is
 * SKB_TRUESIZE(skb_end_offset(skb)), otherwise it is the full skb->truesize.
 */
static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
{
        sk_wmem_queued_add(sk, -skb->truesize);

        if (skb_zcopy_pure(skb))
                sk_mem_uncharge(sk, SKB_TRUESIZE(skb_end_offset(skb)));
        else
                sk_mem_uncharge(sk, skb->truesize);

        __kfree_skb(skb);
}

/* Declared here, defined elsewhere. */
void sk_forced_mem_schedule(struct sock *sk, int size);

bool tcp_check_oom(const struct sock *sk, int shift);


/* The TCP struct proto instance; declared here, defined elsewhere. */
extern struct proto tcp_prot;

/* TCP MIB helpers: thin wrappers that apply the generic SNMP_* macros to
 * the per-netns (net)->mib.tcp_statistics counters.
 */
#define TCP_INC_STATS(net, field)        SNMP_INC_STATS((net)->mib.tcp_statistics, field)
#define __TCP_INC_STATS(net, field)        __SNMP_INC_STATS((net)->mib.tcp_statistics, field)
#define TCP_DEC_STATS(net, field)        SNMP_DEC_STATS((net)->mib.tcp_statistics, field)
#define TCP_ADD_STATS(net, field, val)        SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)

/* Prototypes only; every function below is defined outside this header. */
void tcp_tasklet_init(void);

int tcp_v4_err(struct sk_buff *skb, u32);

void tcp_shutdown(struct sock *sk, int how);

/* IPv4 receive-side entry points */
int tcp_v4_early_demux(struct sk_buff *skb);
int tcp_v4_rcv(struct sk_buff *skb);

/* Send path */
void tcp_remove_empty_skb(struct sock *sk);
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
                         size_t size, struct ubuf_info *uarg);
void tcp_splice_eof(struct socket *sock);
int tcp_send_mss(struct sock *sk, int *size_goal, int flags);
int tcp_wmem_schedule(struct sock *sk, int copy);
void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle,
              int size_goal);
void tcp_release_cb(struct sock *sk);
void tcp_wfree(struct sk_buff *skb);
/* Timer handlers */
void tcp_write_timer_handler(struct sock *sk);
void tcp_delack_timer_handler(struct sock *sk);
int tcp_ioctl(struct sock *sk, int cmd, int *karg);
/* Receive path */
enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_space_adjust(struct sock *sk);
/* Time-wait socket helpers */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
void tcp_twsk_destructor(struct sock *sk);
void tcp_twsk_purge(struct list_head *net_exit_list);
ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
                        struct pipe_inode_info *pipe, size_t len,
                        unsigned int flags);
struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
                                     bool force_schedule);

/* Consume quick-ACK credit on @sk.
 *
 * Does nothing when icsk_ack.quick is already zero. Otherwise the credit is
 * reduced by one if an ACK is scheduled (inet_csk_ack_scheduled()), by zero
 * if not. When that consumes all remaining credit, quick is cleared and the
 * ATO is reset to TCP_ATO_MIN.
 */
static inline void tcp_dec_quickack_mode(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        unsigned int acked;

        if (!icsk->icsk_ack.quick)
                return;

        /* How many ACKs S/ACKing new data have we sent? */
        acked = inet_csk_ack_scheduled(sk) ? 1 : 0;

        if (acked < icsk->icsk_ack.quick) {
                icsk->icsk_ack.quick -= acked;
                return;
        }

        icsk->icsk_ack.quick = 0;
        /* Leaving quickack mode we deflate ATO. */
        icsk->icsk_ack.ato = TCP_ATO_MIN;
}

/* TCP_ECN_* flag bits (distinct bits, may be OR-ed together).
 * NOTE(review): the field that stores them is not visible in this chunk.
 */
#define        TCP_ECN_OK                1
#define        TCP_ECN_QUEUE_CWR        2
#define        TCP_ECN_DEMAND_CWR        4
#define        TCP_ECN_SEEN                8

/* Result codes of tcp_timewait_state_process(); values run 0..3 in
 * declaration order.
 */
enum tcp_tw_status {
        TCP_TW_SUCCESS,         /* 0 */
        TCP_TW_RST,             /* 1 */
        TCP_TW_ACK,             /* 2 */
        TCP_TW_SYN,             /* 3 */
};


/* Prototypes only; every function below is defined outside this header. */
enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw,
                                              struct sk_buff *skb,
                                              const struct tcphdr *th,
                                              u32 *tw_isn);
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                           struct request_sock *req, bool fastopen,
                           bool *lost_race);
enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child,
                                       struct sk_buff *skb);
void tcp_enter_loss(struct sock *sk);
void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag);
void tcp_clear_retrans(struct tcp_sock *tp);
/* Metrics */
void tcp_update_metrics(struct sock *sk);
void tcp_init_metrics(struct sock *sk);
void tcp_metrics_init(void);
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
/* Socket lifetime */
void __tcp_close(struct sock *sk, long timeout);
void tcp_close(struct sock *sk, long timeout);
void tcp_init_sock(struct sock *sk);
void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb);
__poll_t tcp_poll(struct file *file, struct socket *sock,
                      struct poll_table_struct *wait);
/* Socket options: the do_* variants take sockptr_t arguments, while
 * tcp_getsockopt() takes __user pointers.
 */
int do_tcp_getsockopt(struct sock *sk, int level,
                      int optname, sockptr_t optval, sockptr_t optlen);
int tcp_getsockopt(struct sock *sk, int level, int optname,
                   char __user *optval, int __user *optlen);
bool tcp_bpf_bypass_getsockopt(int level, int optname);
int do_tcp_setsockopt(struct sock *sk, int level, int optname,
                      sockptr_t optval, unsigned int optlen);
int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                   unsigned int optlen);
void tcp_set_keepalive(struct sock *sk, int val);
void tcp_syn_ack_timeout(const struct request_sock *req);
/* Receive side */
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                int flags, int *addr_len);
int tcp_set_rcvlowat(struct sock *sk, int val);
int tcp_set_window_clamp(struct sock *sk, int val);
void tcp_update_recv_tstamps(struct sk_buff *skb,
                             struct scm_timestamping_internal *tss);
void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
                        struct scm_timestamping_internal *tss);
void tcp_data_ready(struct sock *sk);
/* tcp_mmap() is only declared when the kernel is built with CONFIG_MMU. */
#ifdef CONFIG_MMU
int tcp_mmap(struct file *file, struct socket *sock,
             struct vm_area_struct *vma);
#endif
void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
                       struct tcp_options_received *opt_rx,
                       int estab, struct tcp_fastopen_cookie *foc);

/*
 *        BPF SKB-less helpers (prototypes only; these take raw IP/TCP
 *        header pointers rather than an skb)
 */
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
                         struct tcphdr *th, u32 *cookie);
u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph,
                         struct tcphdr *th, u32 *cookie);
u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss);
u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
                          const struct tcp_request_sock_ops *af_ops,
                          struct sock *sk, struct tcphdr *th);
/*
 *        TCP v4 functions exported for the inet6 API (prototypes only)
 */

void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
void tcp_v4_mtu_reduced(struct sock *sk);
void tcp_req_err(struct sock *sk, u32 seq, bool abort);
void tcp_ld_RTO_revert(struct sock *sk, u32 seq);
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
struct sock *tcp_create_openreq_child(const struct sock *sk,
                                      struct request_sock *req,
                                      struct sk_buff *skb);
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst);
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst,
                                  struct request_sock *req_unhash,
                                  bool *own_req);
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int tcp_connect(struct sock *sk);
/* Kind of SYN-ACK requested from tcp_make_synack(); values run 0..2 in
 * declaration order.
 */
enum tcp_synack_type {
        TCP_SYNACK_NORMAL       = 0,
        TCP_SYNACK_FASTOPEN     = 1,
        TCP_SYNACK_COOKIE       = 2,
};
/* Prototypes only; every function below is defined outside this header. */
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
                                struct request_sock *req,
                                struct tcp_fastopen_cookie *foc,
                                enum tcp_synack_type synack_type,
                                struct sk_buff *syn_skb);
int tcp_disconnect(struct sock *sk, int flags);

void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size);
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);

/* From syncookies.c (declared unconditionally, i.e. outside the
 * CONFIG_SYN_COOKIES section that follows)
 */
struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
                                 struct request_sock *req,
                                 struct dst_entry *dst);
int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th);
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
                                            struct sock *sk, struct sk_buff *skb,
                                            struct tcp_options_received *tcp_opt,
                                            int mss, u32 tsoff);

/* Only defined when CONFIG_BPF is enabled.
 *
 * Plain-data description of a TCP request's negotiated attributes.
 * NOTE(review): field meanings below are read off the names (timestamp
 * value/echo, MSS, window-scale shifts, and u8 "*_ok" yes/no flags) -
 * confirm against the code that fills and consumes this struct.
 * With natural alignment the named fields take 17 bytes; reserved[3]
 * rounds the struct up to 20 bytes.
 */
#if IS_ENABLED(CONFIG_BPF)
struct bpf_tcp_req_attrs {
        u32 rcv_tsval;
        u32 rcv_tsecr;
        u16 mss;
        u8 rcv_wscale;
        u8 snd_wscale;
        u8 ecn_ok;
        u8 wscale_ok;
        u8 sack_ok;
        u8 tstamp_ok;
        u8 usec_ts_ok;
        u8 reserved[3];        /* explicit padding, see above */
};
#endif

#ifdef CONFIG_SYN_COOKIES

/* Syncookies use a monotonic timer which increments every 60 seconds.
 * This counter is used both as a hash input and partially encoded into
 * the cookie value.  A cookie is only validated further if the delta
 * between the current counter value and the encoded one is less than
 * MAX_SYNCOOKIE_AGE, i.e. a sent cookie is valid for at most 2*60 seconds
 * (or less if the counter advances immediately after a cookie is generated).
 *
 * TCP_SYNCOOKIE_PERIOD is one counter step in jiffies; TCP_SYNCOOKIE_VALID
 * is the resulting maximum lifetime in jiffies (2 * 60 * HZ).
 */
#define MAX_SYNCOOKIE_AGE        2
#define TCP_SYNCOOKIE_PERIOD        (60 * HZ)
#define TCP_SYNCOOKIE_VALID        (MAX_SYNCOOKIE_AGE * TCP_SYNCOOKIE_PERIOD)

/* syncookies: record when the SYN queue last overflowed.
 *
 * The stamp is only rewritten when "now" falls outside
 * [stamp, stamp + HZ], so the field is dirtied at most about once per
 * second. No lock is held; the resulting race is considered very minor.
 *
 * For a reuseport socket with a reuseport control block the stamp lives in
 * reuse->synq_overflow_ts, otherwise in rx_opt.ts_recent_stamp of the socket.
 */
static inline void tcp_synq_overflow(const struct sock *sk)
{
        unsigned int now = jiffies;
        unsigned int stamp;

        if (sk->sk_reuseport) {
                struct sock_reuseport *grp = rcu_dereference(sk->sk_reuseport_cb);

                if (likely(grp)) {
                        stamp = READ_ONCE(grp->synq_overflow_ts);
                        if (time_between32(now, stamp, stamp + HZ))
                                return;

                        WRITE_ONCE(grp->synq_overflow_ts, now);
                        return;
                }
        }

        stamp = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp);
        if (time_between32(now, stamp, stamp + HZ))
                return;

        WRITE_ONCE(tcp_sk_rw(sk)->rx_opt.ts_recent_stamp, now);
}

/* syncookies: no recent synqueue overflow on this listening socket?
 *
 * Returns true when "now" lies outside
 * [stamp - HZ, stamp + TCP_SYNCOOKIE_VALID], where stamp is the last
 * recorded overflow time (taken from the reuseport control block when the
 * socket has one, else from rx_opt.ts_recent_stamp).
 */
static inline bool tcp_synq_no_recent_overflow(const struct sock *sk)
{
        unsigned int now = jiffies;
        unsigned int stamp;

        if (sk->sk_reuseport) {
                struct sock_reuseport *grp = rcu_dereference(sk->sk_reuseport_cb);

                if (likely(grp)) {
                        stamp = READ_ONCE(grp->synq_overflow_ts);
                        goto check_window;
                }
        }

        stamp = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp);

check_window:
        /* If stamp <= jiffies <= stamp + TCP_SYNCOOKIE_VALID, then we're
         * under synflood. However, 'stamp - HZ' has to be the lower bound:
         * a concurrent tcp_synq_overflow() could update the stamp after we
         * read jiffies but before we load the stamp, which could lead to
         * rejecting a valid syncookie.
         */
        return !time_between32(now, stamp - HZ, stamp + TCP_SYNCOOKIE_VALID);
}

static inline u32 tcp_cookie_time(void)
{
        u64 val = get_jiffies_64();

        do_div(val, TCP_SYNCOOKIE_PERIOD);
        return val;
}

/* Convert a 64bit nanosecond timestamp to TS units: microseconds when
 * @usec_ts is set, milliseconds otherwise.
 */
static inline u64 tcp_ns_to_ts(bool usec_ts, u64 val)
{
        return div_u64(val, usec_ts ? NSEC_PER_USEC : NSEC_PER_MSEC);
}

/* IPv4 syncookie generation and timestamp helpers; prototypes only,
 * defined outside this header.
 */
u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
                              u16 *mssp);
__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
u64 cookie_init_timestamp(struct request_sock *req, u64 now);
bool cookie_timestamp_decode(const struct net *net,
                             struct tcp_options_received *opt);

static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *dst)
{
        return READ_ONCE(net->ipv4.sysctl_tcp_ecn) ||
                dst_feature(dst, RTAX_FEATURE_ECN);
}

#if IS_ENABLED(CONFIG_BPF)
static inline bool cookie_bpf_ok(struct sk_buff *skb)
{
        return skb->sk;
}

struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb);
#else
/* !CONFIG_BPF variant: always reports false. */
static inline bool cookie_bpf_ok(struct sk_buff *skb)
{
        return false;
}

/* !CONFIG_BPF stub: there is never a BPF-provided request socket, so this
 * always returns NULL.
 *
 * The parameter list matches the CONFIG_BPF prototype
 * cookie_bpf_check(struct sock *sk, struct sk_buff *skb), so callers
 * compile the same way under either configuration.
 */
static inline struct request_sock *cookie_bpf_check(struct sock *sk,
                                                    struct sk_buff *skb)
{
        return NULL;
}
#endif

/* From net/ipv6/syncookies.c: IPv6 counterparts of the v4 syncookie
 * helpers (prototypes only).
 */
int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th);
struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);

u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
                              const struct tcphdr *th, u16 *mssp);
__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
#endif
/* tcp_output.c (prototypes only) */

void tcp_skb_entail(struct sock *sk, struct sk_buff *skb);
void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb);
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
                               int nonagle);
/* Retransmission */
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
void tcp_retransmit_timer(struct sock *sk);
void tcp_xmit_retransmit_queue(struct sock *);
void tcp_simple_retransmit(struct sock *);
void tcp_enter_recovery(struct sock *sk, bool ece_ack);
int tcp_trim_head(struct sock *, struct sk_buff *, u32);
/* Names the queue an skb lives in; passed to tcp_fragment(). */
enum tcp_queue {
        TCP_FRAG_IN_WRITE_QUEUE = 0,
        TCP_FRAG_IN_RTX_QUEUE   = 1,
};
/* Prototypes only; every function below is defined outside this header. */
int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
                 struct sk_buff *skb, u32 len,
                 unsigned int mss_now, gfp_t gfp);

void tcp_send_probe0(struct sock *);
int tcp_write_wakeup(struct sock *, int mib);
void tcp_send_fin(struct sock *sk);
void tcp_send_active_reset(struct sock *sk, gfp_t priority,
                           enum sk_rst_reason reason);
int tcp_send_synack(struct sock *);
void tcp_push_one(struct sock *, unsigned int mss_now);
/* ACK generation */
void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
void tcp_send_ack(struct sock *sk);
void tcp_send_delayed_ack(struct sock *sk);
/* Loss probes */
void tcp_send_loss_probe(struct sock *sk);
bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto);
void tcp_skb_collapse_tstamp(struct sk_buff *skb,
                             const struct sk_buff *next_skb);

/* tcp_input.c (prototypes only) */
void tcp_rearm_rto(struct sock *sk);
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
void tcp_done_with_error(struct sock *sk, int err);
void tcp_reset(struct sock *sk, struct sk_buff *skb);
void tcp_fin(struct sock *sk);
void tcp_check_space(struct sock *sk);
void tcp_sack_compress_send_ack(struct sock *sk);

/* tcp_timer.c (prototype only) */
void tcp_init_xmit_timers(struct sock *);
static inline void tcp_clear_xmit_timers(struct sock *sk)
{
        if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1)
                __sock_put(sk);

        if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1)
                __sock_put(sk);

        inet_csk_clear_xmit_timers(sk);
}

/* MSS and probe-timeout helpers; prototypes only, defined elsewhere. */
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
unsigned int tcp_current_mss(struct sock *sk);
u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when);

/* Bound MSS / TSO packet size with half of the peer's largest window.
 *
 * Returns @pktsize unchanged when it does not exceed the cutoff (or when
 * the cutoff is zero); otherwise returns the larger of the cutoff and
 * 68 - tp->tcp_header_len.
 */
static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
{
        int cutoff = tp->max_window;

        /* When the peer uses tiny windows (at most TCP_MSS_DEFAULT) the
         * whole window is the cutoff: there is no use in packetizing to
         * sub-MSS pieces for the sake of SWS or to keep enough packets in
         * the pipe for fast recovery.
         *
         * For larger windows, and thus also for extremely large MSS
         * devices, bounding to half the window does make sense.
         */
        if (tp->max_window > TCP_MSS_DEFAULT)
                cutoff = tp->max_window >> 1;

        if (!cutoff || pktsize <= cutoff)
                return pktsize;

        return max_t(int, cutoff, 68U - tp->tcp_header_len);
}

/* tcp.c (prototypes only) */
void tcp_get_info(struct sock *, struct tcp_info *);

/* Read 'sendfile()'-style from a TCP socket */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
                  sk_read_actor_t recv_actor);
int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off);
void tcp_read_done(struct sock *sk, size_t len);

void tcp_initialize_rcv_mss(struct sock *sk);

/* MTU <-> MSS conversion and MTU-probing setup */
int tcp_mtu_to_mss(struct sock *sk, int pmtu);
int tcp_mss_to_mtu(struct sock *sk, int mss);
void tcp_mtup_init(struct sock *sk);

static inline void tcp_bound_rto(struct sock *sk)
{
        if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
                inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
}

/* Compute an RTO in jiffies from the RTT estimators:
 * (srtt_us >> 3) + rttvar_us microseconds, converted by usecs_to_jiffies().
 */
static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
{
        const unsigned int rto_us = (tp->srtt_us >> 3) + tp->rttvar_us;

        return usecs_to_jiffies(rto_us);
}

/* Arm header prediction by building tp->pred_flags from the header length
 * (shifted left by 26), the ACK flag and @snd_wnd, stored in network byte
 * order. MPTCP sockets are left untouched.
 */
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
        u32 predicted;

        /* mptcp hooks are only on the slow path */
        if (sk_is_mptcp((struct sock *)tp))
                return;

        predicted = (tp->tcp_header_len << 26) |
                    ntohl(TCP_FLAG_ACK) |
                    snd_wnd;
        tp->pred_flags = htonl(predicted);
}

static inline void tcp_fast_path_on(struct tcp_sock *tp)
{
        __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
}

/* Turn the fast path on only when all of these hold: the out-of-order
 * queue is empty, the receive window is non-zero, allocated receive memory
 * is below sk_rcvbuf, and no urgent data is pending.
 */
static inline void tcp_fast_path_check(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
                return;
        if (!tp->rcv_wnd)
                return;
        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
                return;
        if (tp->urg_data)
                return;

        tcp_fast_path_on(tp);
}

/* Prototype only, defined elsewhere. NOTE(review): by its name this yields
 * the per-socket upper bound for the delayed-ACK timer - confirm units
 * against the definition.
 */
u32 tcp_delack_max(const struct sock *sk);

/* Compute the actual rto_min value.
 *
 * When the socket has a cached route whose RTAX_RTO_MIN metric is locked,
 * that metric wins; otherwise the socket's own icsk_rto_min is used.
 */
static inline u32 tcp_rto_min(const struct sock *sk)
{
        const struct dst_entry *dst = __sk_dst_get(sk);

        if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
                return dst_metric_rtt(dst, RTAX_RTO_MIN);

        return inet_csk(sk)->icsk_rto_min;
}

/* tcp_rto_min() converted from jiffies to microseconds. */
static inline u32 tcp_rto_min_us(const struct sock *sk)
{
        const u32 rto_min_jiffies = tcp_rto_min(sk);

        return jiffies_to_usecs(rto_min_jiffies);
}

/* True when the RTAX_CC_ALGO metric of @dst is locked. */
static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
{
        return dst_metric_locked(dst, RTAX_CC_ALGO) != 0;
}

/* Minimum RTT in usec, as tracked by the tp->rtt_min filter.
 * ~0 means not available.
 */
static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
{
        const u32 min_rtt_us = minmax_get(&tp->rtt_min);

        return min_rtt_us;
}

/* Compute the actual receive window we are currently advertising.
 * Rcv_nxt can be after the window if our peer push more data
 * than the offered window.
 */
static inline u32 tcp_receive_window(const struct tcp_sock *tp)
{
        s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;

        if (win < 0)
                win = 0;
        return (u32) win;
}

/* Choose a new window, without checks for shrinking, and without
 * scaling applied to the result.  The caller does these things
 * if necessary.  This is a "raw" window selection.
 * (Prototype only; defined elsewhere.)
 */
u32 __tcp_select_window(struct sock *sk);

/* Prototype only; defined elsewhere. */
void tcp_send_window_probe(struct sock *sk);

/* TCP uses 32bit jiffies to save some space (jiffies truncated to u32).
 * Note that this is different from tcp_time_stamp, which
 * historically has been the same until linux-4.13.
 */
#define tcp_jiffies32 ((u32)jiffies)

/*
 * Deliver a 32bit value for TCP timestamp option (RFC 7323)
 * It is no longer tied to jiffies, but to a 1 ms clock
 * (TCP_TS_HZ = 1000 ticks per second).
 * Note: double check if you want to use tcp_jiffies32 instead of this.
 */
#define TCP_TS_HZ        1000

/* TCP clock in nanoseconds; thin wrapper around ktime_get_ns(). */
static inline u64 tcp_clock_ns(void)
{
        const u64 now_ns = ktime_get_ns();

        return now_ns;
}

/* TCP clock in microseconds: tcp_clock_ns() divided by NSEC_PER_USEC. */
static inline u64 tcp_clock_us(void)
{
        const u64 now_ns = tcp_clock_ns();

        return div_u64(now_ns, NSEC_PER_USEC);
}

/* TCP clock in milliseconds: tcp_clock_ns() divided by NSEC_PER_MSEC. */
static inline u64 tcp_clock_ms(void)
{
        const u64 now_ns = tcp_clock_ns();

        return div_u64(now_ns, NSEC_PER_MSEC);
}

/* The TCP timestamp carried in the TS option (RFC 1323) uses either ms or
 * usec resolution. Each socket carries a flag selecting the resolution,
 * because the route attribute could change at any time and a flow must
 * stick to its initial resolution.
 *
 * Returns the current clock in the requested unit, truncated to 32 bits.
 */
static inline u32 tcp_clock_ts(bool usec_ts)
{
        if (usec_ts)
                return tcp_clock_us();

        return tcp_clock_ms();
}

static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp)
{
        return div_u64(tp->tcp_mstamp, USEC_PER_MSEC);
}

/* Socket timestamp in the socket's TS resolution: tcp_mstamp itself when
 * tcp_usec_ts is set, otherwise its millisecond form. Truncated to 32 bits.
 */
static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp)
{
        return tp->tcp_usec_ts ? tp->tcp_mstamp : tcp_time_stamp_ms(tp);
}

/* Prototype only, defined elsewhere. NOTE(review): presumably updates
 * tp->tcp_mstamp from the TCP clock - confirm against the definition.
 */
void tcp_mstamp_refresh(struct tcp_sock *tp);

/* Difference t1 - t0 evaluated as a signed 64 bit value and clamped at
 * zero, i.e. 0 is returned when t0 is later than t1. The result is
 * truncated to 32 bits.
 */
static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
{
        const s64 delta = t1 - t0;

        return delta > 0 ? delta : 0;
}

/* Departure time of @skb in microseconds (skb_mstamp_ns / NSEC_PER_USEC). */
static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
{
        const u64 sent_ns = skb->skb_mstamp_ns;

        return div_u64(sent_ns, NSEC_PER_USEC);
}

/* Provide the skb TSval in usec (@usec_ts set) or ms unit, derived from
 * skb_mstamp_ns and truncated to 32 bits.
 */
static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb)
{
        const u64 sent_ns = skb->skb_mstamp_ns;

        return div_u64(sent_ns, usec_ts ? NSEC_PER_USEC : NSEC_PER_MSEC);
}

/* TSval for a time-wait socket: current clock in the socket's TS
 * resolution plus its tw_ts_offset.
 */
static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw)
{
        const u32 now_ts = tcp_clock_ts(tcptw->tw_sk.tw_usec_ts);

        return now_ts + tcptw->tw_ts_offset;
}

/* TSval for a request socket: current clock in the request's TS
 * resolution plus its ts_off.
 */
static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
{
        const u32 now_ts = tcp_clock_ts(treq->req_usec_ts);

        return now_ts + treq->ts_off;
}

/* Byte at offset 13 of the header pointed to by @th (an lvalue).
 * The argument is parenthesized so that an expression such as
 * tcp_flag_byte(p + 1) is cast as a whole instead of having the cast
 * bind to 'p' alone.
 */
#define tcp_flag_byte(th) (((u_int8_t *)(th))[13])

/* Single-bit masks, one per flag, covering bits 0..7 of a byte. */
#define TCPHDR_FIN 0x01
#define TCPHDR_SYN 0x02
#define TCPHDR_RST 0x04
#define TCPHDR_PSH 0x08
#define TCPHDR_ACK 0x10
#define TCPHDR_URG 0x20
#define TCPHDR_ECE 0x40
#define TCPHDR_CWR 0x80

/* SYN, ECE and CWR combined */
#define TCPHDR_SYN_ECN        (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)

/* State flags for sacked in struct tcp_skb_cb.
 * Each flag is a distinct bit; TCPCB_TAGBITS and TCPCB_RETRANS are
 * combinations of the individual flags.
 */
enum tcp_skb_cb_sacked_flags {
        TCPCB_SACKED_ACKED        = (1 << 0),        /* SKB ACK'd by a SACK block        */
        TCPCB_SACKED_RETRANS        = (1 << 1),        /* SKB retransmitted                */
        TCPCB_LOST                = (1 << 2),        /* SKB is lost                        */
        TCPCB_TAGBITS                = (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS |
                                   TCPCB_LOST),        /* All tag bits                        */
        TCPCB_REPAIRED                = (1 << 4),        /* SKB repaired (no skb_mstamp_ns)        */
        TCPCB_EVER_RETRANS        = (1 << 7),        /* Ever retransmitted frame        */
        /* Any of: currently retransmitted, ever retransmitted, repaired */
        TCPCB_RETRANS                = (TCPCB_SACKED_RETRANS | TCPCB_EVER_RETRANS |
                                   TCPCB_REPAIRED),
};

/* This is what the send packet queuing engine uses to pass
 * TCP per-packet control information to the transmission code.
 * We also store the host-order sequence numbers in here too.
 * This is 44 bytes if IPV6 is enabled.
 * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
 *
 * The structure is overlaid on skb->cb[] and reached through TCP_SKB_CB().
 */
struct tcp_skb_cb {
        __u32                seq;                /* Starting sequence number        */
        __u32                end_seq;        /* SEQ + FIN + SYN + datalen        */
        union {
                /* Note :
                 *           tcp_gso_segs/size are used in write queue only,
                 *          cf tcp_skb_pcount()/tcp_skb_mss()
                 */
                struct {
                        u16        tcp_gso_segs;
                        u16        tcp_gso_size;
                };
        };
        __u8                tcp_flags;        /* TCP header flags. (tcp[13])        */

        __u8                sacked;                /* State flags for SACK.        */
        __u8                ip_dsfield;        /* IPv4 tos or IPv6 dsfield        */
        __u8                txstamp_ack:1,        /* Record TX timestamp for ack? */
                        eor:1,                /* Is skb MSG_EOR marked? */
                        has_rxtstamp:1,        /* SKB has a RX timestamp        */
                        unused:5;
        __u32                ack_seq;        /* Sequence number ACK'd        */
        /* 'tx' and 'header' share the same storage: 'tx' is meaningful for
         * outgoing skbs, 'header' for incoming ones.
         */
        union {
                struct {
#define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1)
                        /* There is space for up to 24 bytes */
                        __u32 is_app_limited:1, /* cwnd not fully used? */
                              delivered_ce:20, /* 20 bits wide, cf TCPCB_DELIVERED_CE_MASK */
                              unused:11;
                        /* pkts S/ACKed so far upon tx of skb, incl retrans: */
                        __u32 delivered;
                        /* start of send pipeline phase */
                        u64 first_tx_mstamp;
                        /* when we reached the "delivered" count */
                        u64 delivered_mstamp;
                } tx;   /* only used for outgoing skbs */
                union {
                        struct inet_skb_parm        h4;
#if IS_ENABLED(CONFIG_IPV6)
                        struct inet6_skb_parm        h6;
#endif
                } header;        /* For incoming skbs */
        };
};

/* View the start of __skb's cb[] array as a struct tcp_skb_cb.
 * The cast yields a non-const pointer regardless of the constness of __skb.
 */
#define TCP_SKB_CB(__skb)        ((struct tcp_skb_cb *)&((__skb)->cb[0]))

/* Operations table defined elsewhere; only declared here. */
extern const struct inet_connection_sock_af_ops ipv4_specific;

#if IS_ENABLED(CONFIG_IPV6)
/* This is the variant of inet6_iif() that must be used by TCP,
 * as TCP moves IP6CB into a different location in skb->cb[]
 */
static inline int tcp_v6_iif(const struct sk_buff *skb)
{
        return TCP_SKB_CB(skb)->header.h6.iif;
}

static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb)
{
        bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);

        return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
}

/* Because it dereferences TCP_SKB_CB, this helper can not be used from
 * early demux.
 *
 * With CONFIG_NET_L3_MASTER_DEV enabled, returns the h6.iif stored in the
 * control block when @skb is non-NULL and ipv6_l3mdev_skb() accepts its
 * h6 flags.  Returns 0 in every other case.
 */
static inline int tcp_v6_sdif(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        if (!skb)
                return 0;

        if (ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags))
                return TCP_SKB_CB(skb)->header.h6.iif;
#endif
        return 0;
}

/* IPv6 counterparts, declared here and defined elsewhere. */
extern const struct inet_connection_sock_af_ops ipv6_specific;

/* Declared through INDIRECT_CALLABLE_DECLARE() rather than as plain prototypes. */
INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb));
INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb));
void tcp_v6_early_demux(struct sk_buff *skb);

#endif

/* Because it dereferences TCP_SKB_CB, this helper can not be used from
 * early demux.
 *
 * With CONFIG_NET_L3_MASTER_DEV enabled, returns the h4.iif stored in the
 * control block when @skb is non-NULL and ipv4_l3mdev_skb() accepts its
 * h4 flags.  Returns 0 in every other case.
 */
static inline int tcp_v4_sdif(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        if (!skb)
                return 0;

        if (ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags))
                return TCP_SKB_CB(skb)->header.h4.iif;
#endif
        return 0;
}

/* Due to TSO, an SKB can be composed of multiple actual
 * packets.  To keep these tracked properly, we use this.
 */
static inline int tcp_skb_pcount(const struct sk_buff *skb)
{
        return TCP_SKB_CB(skb)->tcp_gso_segs;
}

static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs)
{
        TCP_SKB_CB(skb)->tcp_gso_segs = segs;
}

static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs)
{
        TCP_SKB_CB(skb)->tcp_gso_segs += segs;
}

/* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. */
static inline int tcp_skb_mss(const struct sk_buff *skb)
{
        return TCP_SKB_CB(skb)->tcp_gso_size;
}

static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb)
{
        return likely(!TCP_SKB_CB(skb)->eor);
}

/* True when all three checks pass, evaluated in this order with
 * short-circuiting: tcp_skb_can_collapse_to(@to),
 * mptcp_skb_can_collapse(@to, @from), skb_pure_zcopy_same(@to, @from).
 */
static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
                                        const struct sk_buff *from)
{
        bool mergeable;

        /* skb_cmp_decrypted() not needed, use tcp_write_collapse_fence() */
        mergeable = tcp_skb_can_collapse_to(to) &&
                    mptcp_skb_can_collapse(to, from) &&
                    skb_pure_zcopy_same(to, from);

        return likely(mergeable);
}

/* Receive-side variant: true when mptcp_skb_can_collapse(@to, @from)
 * holds and skb_cmp_decrypted(@to, @from) reports zero.
 */
static inline bool tcp_skb_can_collapse_rx(const struct sk_buff *to,
                                           const struct sk_buff *from)
{
        bool mergeable;

        mergeable = mptcp_skb_can_collapse(to, from) &&
                    !skb_cmp_decrypted(to, from);

        return likely(mergeable);
}

/* Events passed to congestion control interface.
 * They are the 'ev' argument of the cwnd_event() hook in
 * struct tcp_congestion_ops.
 */
enum tcp_ca_event {
        CA_EVENT_TX_START,        /* first transmit when no packets in flight */
        CA_EVENT_CWND_RESTART,        /* congestion window restart */
        CA_EVENT_COMPLETE_CWR,        /* end of congestion recovery */
        CA_EVENT_LOSS,                /* loss timeout */
        CA_EVENT_ECN_NO_CE,        /* ECT set, but not CE marked */
        CA_EVENT_ECN_IS_CE,        /* received CE marked IP packet */
};

/* Information about inbound ACK, passed to cong_ops->in_ack_event().
 * Each value is a distinct bit, so several may be combined in 'flags'.
 */
enum tcp_ca_ack_event_flags {
        CA_ACK_SLOWPATH                = (1 << 0),        /* In slow path processing */
        CA_ACK_WIN_UPDATE        = (1 << 1),        /* ACK updated window */
        CA_ACK_ECE                = (1 << 2),        /* ECE bit is set on ack */
};

/*
 * Interface for adding new TCP congestion control handlers
 */
/* Size of the name[] array in struct tcp_congestion_ops */
#define TCP_CA_NAME_MAX        16
#define TCP_CA_MAX        128
/* TCP_CA_MAX names of TCP_CA_NAME_MAX bytes each */
#define TCP_CA_BUF_MAX        (TCP_CA_NAME_MAX*TCP_CA_MAX)

#define TCP_CA_UNSPEC        0

/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
#define TCP_CONG_NON_RESTRICTED 0x1
/* Requires ECN/ECT set on all packets */
#define TCP_CONG_NEEDS_ECN        0x2
/* All flag bits defined above */
#define TCP_CONG_MASK        (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)

/* Forward declaration: only pointers to it are used below. */
union tcp_cc_info;

/* Argument of the pkts_acked() hook in struct tcp_congestion_ops. */
struct ack_sample {
        u32 pkts_acked;
        s32 rtt_us;        /* signed, unlike the other two fields */
        u32 in_flight;
};

/* A rate sample measures the number of (original/retransmitted) data
 * packets delivered "delivered" over an interval of time "interval_us".
 * The tcp_rate.c code fills in the rate sample, and congestion
 * control modules that define a cong_control function to run at the end
 * of ACK processing can optionally choose to consult this sample when
 * setting cwnd and pacing rate.
 * A sample is invalid if "delivered" or "interval_us" is negative.
 */
struct rate_sample {
        u64  prior_mstamp; /* starting timestamp for interval */
        u32  prior_delivered;        /* tp->delivered at "prior_mstamp" */
        u32  prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */
        s32  delivered;                /* number of packets delivered over interval */
        s32  delivered_ce;        /* number of packets delivered w/ CE marks*/
        long interval_us;        /* time for tp->delivered to incr "delivered" */
        u32 snd_interval_us;        /* snd interval for delivered packets */
        u32 rcv_interval_us;        /* rcv interval for delivered packets */
        long rtt_us;                /* RTT of last (S)ACKed packet (or -1) */
        int  losses;                /* number of packets marked lost upon ACK */
        u32  acked_sacked;        /* number of packets newly (S)ACKed upon ACK */
        u32  prior_in_flight;        /* in flight before this ACK */
        u32  last_end_seq;        /* end_seq of most recently ACKed packet */
        bool is_app_limited;        /* is sample from packet with bubble in pipe? */
        bool is_retrans;        /* is sample from retransmission? */
        bool is_ack_delayed;        /* is this (likely) a delayed ACK? */
};

/* Operations table describing one congestion control algorithm.
 * Each callback is tagged (required) or (optional) below; the remaining
 * fields (name, owner, list, key, flags) are bookkeeping data.
 * The structure is aligned with ____cacheline_aligned_in_smp.
 */
struct tcp_congestion_ops {
/* fast path fields are put first to fill one cache line */

        /* return slow start threshold (required) */
        u32 (*ssthresh)(struct sock *sk);

        /* do new cwnd calculation (required) */
        void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked);

        /* call before changing ca_state (optional) */
        void (*set_state)(struct sock *sk, u8 new_state);

        /* call when cwnd event occurs (optional) */
        void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);

        /* call when ack arrives (optional) */
        void (*in_ack_event)(struct sock *sk, u32 flags);

        /* hook for packet ack accounting (optional) */
        void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);

        /* override sysctl_tcp_min_tso_segs */
        u32 (*min_tso_segs)(struct sock *sk);

        /* call when packets are delivered to update cwnd and pacing rate,
         * after all the ca_state processing. (optional)
         */
        void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs);


        /* new value of cwnd after loss (required) */
        u32  (*undo_cwnd)(struct sock *sk);
        /* returns the multiplier used in tcp_sndbuf_expand (optional) */
        u32 (*sndbuf_expand)(struct sock *sk);

/* control/slow paths put last */
        /* get info for inet_diag (optional) */
        size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
                           union tcp_cc_info *info);

        /* algorithm name, at most TCP_CA_NAME_MAX bytes */
        char                         name[TCP_CA_NAME_MAX];
        struct module                *owner;
        struct list_head        list;
        u32                        key;
        /* TCP_CONG_* bits; tcp_ca_needs_ecn() tests TCP_CONG_NEEDS_ECN here */
        u32                        flags;

        /* initialize private data (optional) */
        void (*init)(struct sock *sk);
        /* cleanup private data  (optional) */
        void (*release)(struct sock *sk);
} ____cacheline_aligned_in_smp;

/* Registration, update and validation of struct tcp_congestion_ops
 * instances.  Declared here, implemented out of line.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *type);
void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
int tcp_update_congestion_control(struct tcp_congestion_ops *type,
                                  struct tcp_congestion_ops *old_type);
int tcp_validate_congestion_control(struct tcp_congestion_ops *ca);

/* Per-socket and per-netns congestion control selection */
void tcp_assign_congestion_control(struct sock *sk);
void tcp_init_congestion_control(struct sock *sk);
void tcp_cleanup_congestion_control(struct sock *sk);
int tcp_set_default_congestion_control(struct net *net, const char *name);
void tcp_get_default_congestion_control(struct net *net, char *name);
void tcp_get_available_congestion_control(char *buf, size_t len);
void tcp_get_allowed_congestion_control(char *buf, size_t len);
int tcp_set_allowed_congestion_control(char *allowed);
int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
                               bool cap_net_admin);
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);

/* Reno callbacks and its operations table */
u32 tcp_reno_ssthresh(struct sock *sk);
u32 tcp_reno_undo_cwnd(struct sock *sk);
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
extern struct tcp_congestion_ops tcp_reno;

/* Lookups by name or by key */
struct tcp_congestion_ops *tcp_ca_find(const char *name);
struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca);
/* With CONFIG_INET the function is implemented out of line; without it,
 * an inline stub that ignores its arguments and returns NULL is provided.
 */
#ifdef CONFIG_INET
char *tcp_ca_get_name_by_key(u32 key, char *buffer);
#else
static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
{
        return NULL;
}
#endif

static inline bool tcp_ca_needs_ecn(const struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);

        return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
}

/* Forward @event to the cwnd_event() hook of the congestion control ops
 * attached to @sk.  Nothing happens when the hook is not provided.
 */
static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);

        if (!icsk->icsk_ca_ops->cwnd_event)
                return;

        icsk->icsk_ca_ops->cwnd_event(sk, event);
}

/* From tcp_cong.c */
void tcp_set_ca_state(struct sock *sk, const u8 ca_state);

/* From tcp_rate.c (the code that fills in struct rate_sample) */
void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
                            struct rate_sample *rs);
void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
                  bool is_sack_reneg, struct rate_sample *rs);
void tcp_rate_check_app_limited(struct sock *sk);

/* Orders two transmissions by timestamp first (@t1 against @t2); on equal
 * timestamps the tie is broken by after(@seq1, @seq2).
 */
static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
{
        if (t1 != t2)
                return t1 > t2;

        return after(seq1, seq2);
}

/* The two helpers below tell how the current flow behaves with respect
 * to SACK handling.  SACK is negotiated with the peer, so the answer can
 * differ from one flow to another.
 *
 * tcp_is_sack - SACK enabled
 * tcp_is_reno - No SACK
 *
 * tcp_is_sack() yields 1 when tp->rx_opt.sack_ok is non-zero and 0
 * otherwise; the non-zero case is hinted as the likely one.
 */
static inline int tcp_is_sack(const struct tcp_sock *tp)
{
        return likely(tp->rx_opt.sack_ok != 0);
}

/* Exact negation of tcp_is_sack(). */
static inline bool tcp_is_reno(const struct tcp_sock *tp)
{
        return tcp_is_sack(tp) == 0;
}

/* Sum of tp->sacked_out and tp->lost_out. */
static inline unsigned int tcp_left_out(const struct tcp_sock *tp)
{
        unsigned int left_out = tp->sacked_out;

        left_out += tp->lost_out;

        return left_out;
}

/* Best estimate of the number of packets currently "in the network".
 * The estimate is often conservative, but when the receiver supplies
 * detailed information (SACK blocks etc.) a more aggressive figure can
 * be computed.
 *
 * This is the value to use for congestion control decisions; to find out
 * whether the send queue is empty, look at tp->packets_out alone.
 *
 * The computation reads as:
 *
 *        "Packets sent once on transmission queue" MINUS
 *        "Packets left network, but not honestly ACKed yet" PLUS
 *        "Packets fast retransmitted"
 *
 * i.e. packets_out - tcp_left_out() + retrans_out, in unsigned arithmetic.
 */
static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
{
        unsigned int in_flight = tp->packets_out;

        in_flight -= tcp_left_out(tp);
        in_flight += tp->retrans_out;

        return in_flight;
}

/* Largest positive 32bit signed value; tcp_in_initial_slowstart() treats
 * any snd_ssthresh at or above it as "initial slow start".
 */
#define TCP_INFINITE_SSTHRESH        0x7fffffff

/* Read accessor for tp->snd_cwnd. */
static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp)
{
        const u32 cwnd = tp->snd_cwnd;

        return cwnd;
}

/* Write accessor for tp->snd_cwnd.
 * WARN_ON_ONCE() fires when @val, reinterpreted as a signed int, is zero
 * or negative; the value is stored in either case.
 */
static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val)
{
        WARN_ON_ONCE((int)val <= 0);
        tp->snd_cwnd = val;
}

static inline bool tcp_in_slow_start(const struct tcp_sock *tp)
{
        return tcp_snd_cwnd(tp) < tp->snd_ssthresh;
}

/* True when tp->snd_ssthresh is not below TCP_INFINITE_SSTHRESH. */
static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp)
{
        return !(tp->snd_ssthresh < TCP_INFINITE_SSTHRESH);
}

/* True when the bit corresponding to the socket's icsk_ca_state is one
 * of TCPF_CA_CWR or TCPF_CA_Recovery.
 */
static inline bool tcp_in_cwnd_reduction(const struct sock *sk)
{
        const int state_bit = 1 << inet_csk(sk)->icsk_ca_state;

        return (state_bit & (TCPF_CA_CWR | TCPF_CA_Recovery)) != 0;
}

/* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
 * The exception is cwnd reduction phase, when cwnd is decreasing towards
 * ssthresh.
 */
static inline __u32 tcp_current_ssthresh(const struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        if (tcp_in_cwnd_reduction(sk))
                return tp->snd_ssthresh;
        else
                return max(tp->snd_ssthresh,
                           ((tcp_snd_cwnd(tp) >> 1) +
                            (tcp_snd_cwnd(tp) >> 2)));
}

/* Use define here intentionally to get WARN_ON location shown at the caller.
 * Warns when tcp_left_out(tp) exceeds tp->packets_out.
 * The argument is parenthesized before '->' so that expressions such as
 * '&x' or 'p + 1' can be passed; it is evaluated twice.
 */
#define tcp_verify_left_out(tp)        WARN_ON(tcp_left_out(tp) > (tp)->packets_out)

/* Declared here, implemented out of line. */
void tcp_enter_cwr(struct sock *sk);
__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst);

/* Upper bound, in MSS units, on the available cwnd for which TSO defers
 * sending when sysctl_tcp_tso_win_divisor is not used.
 * The bound is a constant; @tp is not consulted.
 */
static inline __u32 tcp_max_tso_deferred_mss(const struct tcp_sock *tp)
{
        const __u32 max_deferred_mss = 3;

        return max_deferred_mss;
}

/* End sequence number of the receiver's advertised window:
 * snd_una + snd_wnd in 32bit arithmetic.
 */
static inline u32 tcp_wnd_end(const struct tcp_sock *tp)
{
        u32 wnd_end = tp->snd_una;

        wnd_end += tp->snd_wnd;

        return wnd_end;
}

/* cwnd validation in the spirit of RFC2861, with a more flexible rule.
 * The RFC suggests that cwnd should not be raised unless it was fully
 * used previously, and that is exactly what happens in congestion
 * avoidance mode.  In slow start, however, cwnd may keep growing as long
 * as the application has used half of it.
 * Example :
 *    cwnd is 10 (IW10), but application sends 9 frames.
 *    We allow cwnd to reach 18 when all frames are ACKed.
 * The check is safe because it is no more aggressive than slow start
 * itself, which already risks 100% overshoot.  The benefit is that
 * applications are not encouraged to send filler packets or extra data
 * just to inflate cwnd usage, and application-limited flows may probe
 * for bandwidth more aggressively.
 *
 * Returns true when tp->is_cwnd_limited is set, or when the flow is in
 * slow start and cwnd is below twice tp->max_packets_out.
 */
static inline bool tcp_is_cwnd_limited(const struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        /* If in slow start, ensure cwnd grows to twice what was ACKed. */
        return tp->is_cwnd_limited ||
               (tcp_in_slow_start(tp) &&
                tcp_snd_cwnd(tp) < 2 * tp->max_packets_out);
}

/* BBR congestion control needs pacing.
 * Same remark for SO_MAX_PACING_RATE.
 * sch_fq packet scheduler is efficiently handling pacing,
 * but is not always installed/used.
 * Return true if TCP stack should pace packets itself.
 */
static inline bool tcp_needs_internal_pacing(const struct sock *sk)
{
        return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
}

/* Estimates in how many jiffies next packet for this flow can be sent.
 * Scheduling a retransmit timer too early would be silly.
 */
static inline unsigned long tcp_pacing_delay(const struct sock *sk)
{
        s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache;

        return delay > 0 ? nsecs_to_jiffies(delay) : 0;
}

/* Wrapper around inet_csk_reset_xmit_timer() that postpones @when by the
 * current pacing delay of @sk.  @what and @max_when are passed through
 * unchanged.
 */
static inline void tcp_reset_xmit_timer(struct sock *sk,
                                        const int what,
                                        unsigned long when,
                                        const unsigned long max_when)
{
        const unsigned long paced_when = when + tcp_pacing_delay(sk);

        inet_csk_reset_xmit_timer(sk, what, paced_when, max_when);
}

/* Something is really bad, we could not queue an additional packet,
 * because qdisc is full or receiver sent a 0 window, or we are paced.
 * We do not want to add fuel to the fire, or abort too early,
 * so make sure the timer we arm now is at least 200ms in the future,
 * regardless of current icsk_rto value (as it could be ~2ms)
 */
static inline unsigned long tcp_probe0_base(const struct sock *sk)
{
        return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN);
}

/* Variant of inet_csk_rto_backoff() used for zero window probes.
 *
 * The base from tcp_probe0_base() is shifted left by icsk_backoff, with
 * the shift count capped at ilog2(TCP_RTO_MAX / TCP_RTO_MIN) + 1.  The
 * shift is done in 64 bits and the result is limited to @max_when.
 */
static inline unsigned long tcp_probe0_when(const struct sock *sk,
                                            unsigned long max_when)
{
        const u8 max_shift = ilog2(TCP_RTO_MAX / TCP_RTO_MIN) + 1;
        const u8 shift = min_t(u8, max_shift, inet_csk(sk)->icsk_backoff);
        u64 when = tcp_probe0_base(sk);

        when <<= shift;
        if (when > max_when)
                when = max_when;

        return (unsigned long)when;
}

/* Arm the ICSK_TIME_PROBE0 timer, with tcp_probe0_base() as delay and
 * TCP_RTO_MAX as upper bound, but only when packets_out is zero and no
 * timer is recorded in icsk_pending.
 */
static inline void tcp_check_probe_timer(struct sock *sk)
{
        if (tcp_sk(sk)->packets_out)
                return;
        if (inet_csk(sk)->icsk_pending)
                return;

        tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
                             tcp_probe0_base(sk), TCP_RTO_MAX);
}

/* Set tp->snd_wl1 to @seq.  Same body as tcp_update_wl(). */
static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq)
{
        tp->snd_wl1 = seq;
}

/* Set tp->snd_wl1 to @seq.  Same body as tcp_init_wl(). */
static inline void tcp_update_wl(struct tcp_sock *tp, u32 seq)
{
        tp->snd_wl1 = seq;
}

/*
 * Calculate(/check) TCP checksum: csum_tcpudp_magic() over @saddr,
 * @daddr, @len and @base, with IPPROTO_TCP as the protocol.
 */
static inline __sum16 tcp_v4_check(int len, __be32 saddr,
                                   __be32 daddr, __wsum base)
{
        const __sum16 csum = csum_tcpudp_magic(saddr, daddr, len,
                                               IPPROTO_TCP, base);

        return csum;
}

static inline bool tcp_checksum_complete(struct sk_buff *skb)
{
        return !skb_csum_unnecessary(skb) &&
                __skb_checksum_complete(skb);
}

/* Declared here, implemented out of line. */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
                     enum skb_drop_reason *reason);


int tcp_filter(struct sock *sk, struct sk_buff *skb);
void tcp_set_state(struct sock *sk, int state);
void tcp_done(struct sock *sk);
int tcp_abort(struct sock *sk, int err);

/* Clear the num_sacks and dsack fields of @rx_opt. */
static inline void tcp_sack_reset(struct tcp_options_received *rx_opt)
{
        rx_opt->num_sacks = 0;
        rx_opt->dsack = 0;
}

/* Implemented out of line; called by tcp_slow_start_after_idle_check(). */
void tcp_cwnd_restart(struct sock *sk, s32 delta);

/* Call tcp_cwnd_restart() when the time elapsed since tp->lsndtime
 * exceeds icsk_rto.
 *
 * Nothing is done when the sysctl_tcp_slow_start_after_idle setting is
 * off, when packets are still outstanding (packets_out != 0), or when the
 * congestion control ops provide a cong_control() hook.
 */
static inline void tcp_slow_start_after_idle_check(struct sock *sk)
{
        const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
        struct tcp_sock *tp = tcp_sk(sk);
        s32 idle;

        if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle))
                return;
        if (tp->packets_out)
                return;
        if (ca_ops->cong_control)
                return;

        idle = tcp_jiffies32 - tp->lsndtime;
        if (idle > inet_csk(sk)->icsk_rto)
                tcp_cwnd_restart(sk, idle);
}

/* Determine a window scaling and initial window to offer.
 * Results are written through the @rcv_wnd, @window_clamp and @rcv_wscale
 * pointers (NOTE(review): inferred from the non-const pointer parameters;
 * confirm against the implementation).
 */
void tcp_select_initial_window(const struct sock *sk, int __space,
                               __u32 mss, __u32 *rcv_wnd,
                               __u32 *window_clamp, int wscale_ok,
                               __u8 *rcv_wscale, __u32 init_rcv_wnd);

/* Scale @space by @scaling_ratio / 2^TCP_RMEM_TO_WIN_SCALE.
 * The product is computed in 64 bits before the right shift.
 */
static inline int __tcp_win_from_space(u8 scaling_ratio, int space)
{
        return ((s64)space * scaling_ratio) >> TCP_RMEM_TO_WIN_SCALE;
}

static inline int tcp_win_from_space(const struct sock *sk, int space)
{
        return __tcp_win_from_space(tcp_sk(sk)->scaling_ratio, space);
}

/* Inverse of __tcp_win_from_space(): @win is shifted left by
 * TCP_RMEM_TO_WIN_SCALE in 64 bits, then divided by @scaling_ratio.
 */
static inline int __tcp_space_from_win(u8 scaling_ratio, int win)
{
        u64 space = (u64)win;

        space <<= TCP_RMEM_TO_WIN_SCALE;
        do_div(space, scaling_ratio);

        return space;
}

static inline int tcp_space_from_win(const struct sock *sk, int win)
{
        return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win);
}

/* Assume a 50% default for skb->len/skb->truesize ratio.
 * This may be adjusted later in tcp_measure_rcv_mss().
 * The value is half of 2^TCP_RMEM_TO_WIN_SCALE, the unit used by
 * __tcp_win_from_space().
 */
#define TCP_DEFAULT_SCALING_RATIO (1 << (TCP_RMEM_TO_WIN_SCALE - 1))

static inline void tcp_scaling_ratio_init(struct sock *sk)
{
        tcp_sk(sk)->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
}

/* Window equivalent of sk_rcvbuf minus the backlog length minus
 * sk_rmem_alloc.
 * Note: caller must be prepared to deal with negative returns.
 */
static inline int tcp_space(const struct sock *sk)
{
        int free_space = READ_ONCE(sk->sk_rcvbuf);

        free_space -= READ_ONCE(sk->sk_backlog.len);
        free_space -= atomic_read(&sk->sk_rmem_alloc);

        return tcp_win_from_space(sk, free_space);
}

/* Window equivalent of the whole sk_rcvbuf. */
static inline int tcp_full_space(const struct sock *sk)
{
        const int rcvbuf = READ_ONCE(sk->sk_rcvbuf);

        return tcp_win_from_space(sk, rcvbuf);
}

/* Lower tp->rcv_ssthresh to @new_ssthresh if it is currently above it.
 * When sk_unused_reserved_mem() reports a non-zero amount, rcv_ssthresh
 * is then raised back to at least the window equivalent of that amount.
 */
static inline void __tcp_adjust_rcv_ssthresh(struct sock *sk, u32 new_ssthresh)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int reserved = sk_unused_reserved_mem(sk);

        tp->rcv_ssthresh = min(tp->rcv_ssthresh, new_ssthresh);
        if (!reserved)
                return;

        tp->rcv_ssthresh = max_t(u32, tp->rcv_ssthresh,
                                 tcp_win_from_space(sk, reserved));
}

static inline void tcp_adjust_rcv_ssthresh(struct sock *sk)
{
        __tcp_adjust_rcv_ssthresh(sk, 4U * tcp_sk(sk)->advmss);
}

/* Declared here, implemented out of line. */
void tcp_cleanup_rbuf(struct sock *sk, int copied);
void __tcp_cleanup_rbuf(struct sock *sk, int copied);


/* We provision sk_rcvbuf around 200% of sk_rcvlowat.
 * If 87.5 % (7/8) of the space has been consumed, we want to override
 * SO_RCVLOWAT constraint, since we are receiving skbs with too small
 * len/truesize ratio.
 */
static inline bool tcp_rmem_pressure(const struct sock *sk)
{
        int rcvbuf, threshold;

        if (tcp_under_memory_pressure(sk))
                return true;

        rcvbuf = READ_ONCE(sk->sk_rcvbuf);
        threshold = rcvbuf - (rcvbuf >> 3);

        return atomic_read(&sk->sk_rmem_alloc) > threshold;
}

static inline bool tcp_epollin_ready(const struct sock *sk, int target)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq);

        if (avail <= 0)
                return false;

        return (avail >= target) || tcp_rmem_pressure(sk) ||
               (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss);
}

/* Declared here, implemented out of line. */
extern void tcp_openreq_init_rwin(struct request_sock *req,
                                  const struct sock *sk_listener,
                                  const struct dst_entry *dst);

void tcp_enter_memory_pressure(struct sock *sk);
void tcp_leave_memory_pressure(struct sock *sk);

/* Per-socket keepalive_intvl when it is non-zero, otherwise the
 * sysctl_tcp_keepalive_intvl value of the socket's netns.
 */
static inline int keepalive_intvl_when(const struct tcp_sock *tp)
{
        struct net *net = sock_net((struct sock *)tp);
        int intvl;

        /* Paired with WRITE_ONCE() in tcp_sock_set_keepintvl()
         * and do_tcp_setsockopt().
         */
        intvl = READ_ONCE(tp->keepalive_intvl);
        if (intvl)
                return intvl;

        return READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl);
}

/* Per-socket keepalive_time when it is non-zero, otherwise the
 * sysctl_tcp_keepalive_time value of the socket's netns.
 */
static inline int keepalive_time_when(const struct tcp_sock *tp)
{
        struct net *net = sock_net((struct sock *)tp);
        int idle;

        /* Paired with WRITE_ONCE() in tcp_sock_set_keepidle_locked() */
        idle = READ_ONCE(tp->keepalive_time);
        if (idle)
                return idle;

        return READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time);
}

/* Per-socket keepalive_probes when it is non-zero, otherwise the
 * sysctl_tcp_keepalive_probes value of the socket's netns.
 */
static inline int keepalive_probes(const struct tcp_sock *tp)
{
        struct net *net = sock_net((struct sock *)tp);
        int probes;

        /* Paired with WRITE_ONCE() in tcp_sock_set_keepcnt()
         * and do_tcp_setsockopt().
         */
        probes = READ_ONCE(tp->keepalive_probes);
        if (probes)
                return probes;

        return READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes);
}

/* Smaller of the two 32bit jiffies distances from now: the one to
 * icsk_ack.lrcvtime and the one to tp->rcv_tstamp.
 */
static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp)
{
        const struct inet_connection_sock *icsk = &tp->inet_conn;
        const u32 since_lrcvtime = tcp_jiffies32 - icsk->icsk_ack.lrcvtime;
        const u32 since_rcv_tstamp = tcp_jiffies32 - tp->rcv_tstamp;

        return since_lrcvtime < since_rcv_tstamp ? since_lrcvtime
                                                 : since_rcv_tstamp;
}

/* FIN timeout for @sk: linger2 when it is non-zero, otherwise
 * sysctl_tcp_fin_timeout; in both cases never less than
 * (rto << 2) - (rto >> 1), i.e. roughly 3.5 times icsk_rto.
 */
static inline int tcp_fin_time(const struct sock *sk)
{
        const int rto = inet_csk(sk)->icsk_rto;
        const int min_timeout = (rto << 2) - (rto >> 1);
        int fin_timeout = tcp_sk(sk)->linger2;

        if (!fin_timeout)
                fin_timeout = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fin_timeout);

        return fin_timeout < min_timeout ? min_timeout : fin_timeout;
}

/* PAWS acceptance test on the received timestamp.
 *
 * Returns true when any of the following holds, tested in this order:
 *  - ts_recent is at most @paws_win ahead of rcv_tsval (signed 32bit
 *    difference);
 *  - the current time is not before ts_recent_stamp + TCP_PAWS_WRAP
 *    (hinted as unlikely);
 *  - ts_recent is 0.
 * Returns false otherwise.
 */
static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt,
                                  int paws_win)
{
        /*
         * About the last test: some OSes send SYN and SYNACK messages with
         * tsval=0 tsecr=0, then following tcp messages have valid values.
         * Ignore 0 value, or else 'negative' tsval might forbid us to
         * accept their packets.
         */
        return (s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win ||
               unlikely(!time_before32(ktime_get_seconds(),
                                       rx_opt->ts_recent_stamp + TCP_PAWS_WRAP)) ||
               !rx_opt->ts_recent;
}

/* Decide whether a segment must be rejected by PAWS.
 *
 * Returns false when tcp_paws_check() with a zero window accepts the
 * timestamp.  Otherwise a non-RST segment (@rst == 0) is rejected, and a
 * RST segment is rejected only while the current time is still before
 * ts_recent_stamp + TCP_PAWS_MSL.
 */
static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
                                   int rst)
{
        if (tcp_paws_check(rx_opt, 0))
                return false;

        /* RST segments are not recommended to carry a timestamp, and if
         * they do, the recommendation is to ignore PAWS because "their
         * cleanup function should take precedence over timestamps."
         * That recommendation is certainly a mistake.  To relax the
         * constraint one has to understand why it exists: if the peer
         * reboots, its clock may go out-of-sync and half-open connections
         * will not be reset.
         * The problem would not exist if all implementations followed the
         * draft about maintaining the clock across reboots.
         * Linux-2.2 DOES NOT!
         *
         * However, we can relax time bounds for RST segments to MSL.
         */
        if (!rst)
                return true;

        return time_before32(ktime_get_seconds(),
                             rx_opt->ts_recent_stamp + TCP_PAWS_MSL);
}

/* Declared here, implemented out of line. */
bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
                          int mib_idx, u32 *last_oow_ack_time);

/* Add the constant entries to the TCP MIB of @net through TCP_ADD_STATS():
 * RTOALGORITHM, RTOMIN, RTOMAX and MAXCONN.
 */
static inline void tcp_mib_init(struct net *net)
{
        /* See RFC 2012 */
        TCP_ADD_STATS(net, TCP_MIB_RTOALGORITHM, 1);
        /* RTO bounds are scaled by 1000/HZ (presumably jiffies -> ms; confirm) */
        TCP_ADD_STATS(net, TCP_MIB_RTOMIN, TCP_RTO_MIN*1000/HZ);
        TCP_ADD_STATS(net, TCP_MIB_RTOMAX, TCP_RTO_MAX*1000/HZ);
        TCP_ADD_STATS(net, TCP_MIB_MAXCONN, -1);
}

/* from STCP */
/* Reset tp->lost_skb_hint to NULL; retransmit_skb_hint is left untouched. */
static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp)
{
        tp->lost_skb_hint = NULL;
}

/* Reset tp->retransmit_skb_hint to NULL in addition to everything
 * tcp_clear_retrans_hints_partial() clears.
 */
static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp)
{
        tp->retransmit_skb_hint = NULL;
        tcp_clear_retrans_hints_partial(tp);
}

/* 'union tcp_md5_addr' is an alias for 'union tcp_ao_addr'. */
#define tcp_md5_addr tcp_ao_addr

/* - key database */
/* One key entry: hash-list linkage, the address/prefix/L3 scope the key
 * applies to, and the key bytes themselves.
 */
struct tcp_md5sig_key {
        struct hlist_node        node;
        u8                        keylen;
        u8                        family; /* AF_INET or AF_INET6 */
        u8                        prefixlen;
        u8                        flags;
        union tcp_md5_addr        addr;
        int                        l3index; /* set if key added with L3 scope */
        u8                        key[TCP_MD5SIG_MAXKEYLEN]; /* presumably keylen bytes are in use; confirm */
        struct rcu_head                rcu;
};

/* - sock block */
/* Head of a hash list (presumably of struct tcp_md5sig_key entries, which
 * embed a matching hlist_node; confirm) plus an rcu_head.
 */
struct tcp_md5sig_info {
        struct hlist_head        head;
        struct rcu_head                rcu;
};

/* - pseudo header */
/* IPv4 variant: two 32bit addresses, a pad byte, the protocol byte and a
 * 16bit big-endian length.
 */
struct tcp4_pseudohdr {
        __be32                saddr;
        __be32                daddr;
        __u8                pad;
        __u8                protocol;
        __be16                len;
};

/* IPv6 variant of the pseudo header: two in6_addr addresses followed by
 * 32bit big-endian length and protocol fields.
 */
struct tcp6_pseudohdr {
        struct in6_addr        saddr;
        struct in6_addr daddr;
        __be32                len;
        __be32                protocol;        /* including padding */
};

/* Storage large enough for either pseudo header; the IPv6 member only
 * exists when CONFIG_IPV6 is enabled.
 */
union tcp_md5sum_block {
        struct tcp4_pseudohdr ip4;
#if IS_ENABLED(CONFIG_IPV6)
        struct tcp6_pseudohdr ip6;
#endif
};

/*
 * struct tcp_sigpool - per-CPU pool of ahash_requests
 * @scratch: per-CPU temporary area that can be used between
 *             tcp_sigpool_start() and tcp_sigpool_end() to perform
 *             a crypto request
 * @req: pre-allocated ahash request
 */
struct tcp_sigpool {
        void *scratch;
        struct ahash_request *req;
};

/* tcp_sigpool management API; the implementations are not part of this header.
 * NOTE(review): tcp_sigpool_alloc_ahash() presumably returns the pool id
 * later passed as @id to the get/release/start helpers - confirm.
 */
int tcp_sigpool_alloc_ahash(const char *alg, size_t scratch_size);
void tcp_sigpool_get(unsigned int id);
void tcp_sigpool_release(unsigned int id);
int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp,
                              const struct sk_buff *skb,
                              unsigned int header_len);

/**
 * tcp_sigpool_start - disable bh and start using tcp_sigpool_ahash
 * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash()
 * @c: returned tcp_sigpool for usage (uninitialized on failure)
 *
 * Returns 0 on success, error otherwise.
 */
int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c);
/**
 * tcp_sigpool_end - enable bh and stop using tcp_sigpool
 * @c: tcp_sigpool context that was returned by tcp_sigpool_start()
 */
void tcp_sigpool_end(struct tcp_sigpool *c);
/* NOTE(review): presumably writes the algorithm name of pool @id into @buf
 * (at most @buf_len bytes) and returns a length - confirm in the definition.
 */
size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len);
/* - functions: MD5 key hashing, add/copy/delete and lookup entry points.
 * All of them are defined outside this header.
 */
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
                        const struct sock *sk, const struct sk_buff *skb);
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, u8 prefixlen, int l3index, u8 flags,
                   const u8 *newkey, u8 newkeylen);
int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
                     int family, u8 prefixlen, int l3index,
                     struct tcp_md5sig_key *key);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, u8 prefixlen, int l3index, u8 flags);
void tcp_clear_md5_list(struct sock *sk);
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
                                         const struct sock *addr_sk);

#ifdef CONFIG_TCP_MD5SIG
/* Out-of-line MD5 key lookup used by the inline wrappers below, which only
 * call it when the tcp_md5_needed static key is enabled.
 */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
                                           const union tcp_md5_addr *addr,
                                           int family, bool any_l3index);
/* Look up the MD5 key for @addr/@family on @sk, restricted to @l3index.
 * Returns NULL without searching while the tcp_md5_needed static key is
 * off; otherwise defers to __tcp_md5_do_lookup() with any_l3index == false.
 */
static inline struct tcp_md5sig_key *
tcp_md5_do_lookup(const struct sock *sk, int l3index,
                  const union tcp_md5_addr *addr, int family)
{
        if (!static_branch_unlikely(&tcp_md5_needed.key))
                return NULL;
        return __tcp_md5_do_lookup(sk, l3index, addr, family, false);
}

/* Same as tcp_md5_do_lookup() but asks __tcp_md5_do_lookup() to accept any
 * l3index (any_l3index == true, l3index passed as 0). Returns NULL while
 * the tcp_md5_needed static key is off.
 */
static inline struct tcp_md5sig_key *
tcp_md5_do_lookup_any_l3index(const struct sock *sk,
                              const union tcp_md5_addr *addr, int family)
{
        if (!static_branch_unlikely(&tcp_md5_needed.key))
                return NULL;
        return __tcp_md5_do_lookup(sk, 0, addr, family, true);
}

/* Accessor for the tw_md5_key member of @twsk (CONFIG_TCP_MD5SIG build). */
#define tcp_twsk_md5_key(twsk)        ((twsk)->tw_md5_key)
#else
/* !CONFIG_TCP_MD5SIG stubs: no key can exist, so every lookup yields NULL. */
static inline struct tcp_md5sig_key *
tcp_md5_do_lookup(const struct sock *sk, int l3index,
                  const union tcp_md5_addr *addr, int family)
{
        return NULL;
}

static inline struct tcp_md5sig_key *
tcp_md5_do_lookup_any_l3index(const struct sock *sk,
                              const union tcp_md5_addr *addr, int family)
{
        return NULL;
}

/* Without MD5 support a timewait socket never carries a key. */
#define tcp_twsk_md5_key(twsk)        NULL
#endif

/* MD5 sigpool helpers and the id of the MD5 sigpool; all defined outside
 * this header.
 */
int tcp_md5_alloc_sigpool(void);
void tcp_md5_release_sigpool(void);
void tcp_md5_add_sigpool(void);
extern int tcp_md5_sigpool_id;

/* NOTE(review): presumably feeds @key into the hash request held by @hp -
 * confirm in the definition.
 */
int tcp_md5_hash_key(struct tcp_sigpool *hp,
                     const struct tcp_md5sig_key *key);

/* From tcp_fastopen.c: getter/setter for cached Fast Open data (MSS and
 * cookie) associated with @sk.
 */
void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
                            struct tcp_fastopen_cookie *cookie);
void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
                            struct tcp_fastopen_cookie *cookie, bool syn_lost,
                            u16 try_exp);
/* State of one Fast Open request: the cookie, the MSG_FASTOPEN message and
 * its size, and how much of it was queued in tcp_connect().
 */
struct tcp_fastopen_request {
        /* Fast Open cookie. Size 0 means a cookie request */
        struct tcp_fastopen_cookie        cookie;
        struct msghdr                        *data;  /* data in MSG_FASTOPEN */
        size_t                                size;
        int                                copied;        /* queued in tcp_connect() */
        struct ubuf_info                *uarg;
};
/* Fast Open request, cipher/key and server-side entry points; all defined
 * outside this header.
 */
void tcp_free_fastopen_req(struct tcp_sock *tp);
void tcp_fastopen_destroy_cipher(struct sock *sk);
void tcp_fastopen_ctx_destroy(struct net *net);
int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
                              void *primary_key, void *backup_key);
int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk,
                            u64 *key);
void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
                              const struct dst_entry *dst);
void tcp_fastopen_init_key_once(struct net *net);
bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
                             struct tcp_fastopen_cookie *cookie);
bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
/* One Fast Open key is a siphash_key_t; up to TCP_FASTOPEN_KEY_MAX keys are
 * kept, so a buffer holding all of them needs KEY_LENGTH * KEY_MAX bytes.
 */
#define TCP_FASTOPEN_KEY_LENGTH sizeof(siphash_key_t)
#define TCP_FASTOPEN_KEY_MAX 2
#define TCP_FASTOPEN_KEY_BUF_LENGTH \
        (TCP_FASTOPEN_KEY_LENGTH * TCP_FASTOPEN_KEY_MAX)

/* Fastopen key context: up to TCP_FASTOPEN_KEY_MAX siphash keys.
 * @num is the value reported by tcp_fastopen_context_len(); presumably the
 * number of valid entries in @key - confirm where the context is filled.
 */
struct tcp_fastopen_context {
        siphash_key_t        key[TCP_FASTOPEN_KEY_MAX];
        int                num;
        struct rcu_head        rcu;
};

/* Active (client side) Fast Open disable/blackhole-detection helpers;
 * defined outside this header.
 */
void tcp_fastopen_active_disable(struct sock *sk);
bool tcp_fastopen_active_should_disable(struct sock *sk);
void tcp_fastopen_active_disable_ofo_check(struct sock *sk);
void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired);

/* Return the Fast Open key context to use for @sk: the one attached to the
 * socket's accept queue when set, otherwise the per-netns one.
 * Caller needs to wrap with rcu_read_(un)lock().
 */
static inline
struct tcp_fastopen_context *tcp_fastopen_get_ctx(const struct sock *sk)
{
        struct tcp_fastopen_context *sk_ctx;

        sk_ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx);
        if (sk_ctx)
                return sk_ctx;

        return rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx);
}

/* True when @orig has exactly TCP_FASTOPEN_COOKIE_SIZE bytes, @foc has the
 * same length, and both cookie values compare equal.
 */
static inline
bool tcp_fastopen_cookie_match(const struct tcp_fastopen_cookie *foc,
                               const struct tcp_fastopen_cookie *orig)
{
        return orig->len == TCP_FASTOPEN_COOKIE_SIZE &&
               orig->len == foc->len &&
               memcmp(orig->val, foc->val, foc->len) == 0;
}

/* Return the @num field of the Fast Open key context @ctx. */
static inline
int tcp_fastopen_context_len(const struct tcp_fastopen_context *ctx)
{
        return ctx->num;
}

/* Latencies incurred by various limits for a sender. They are
 * chronograph-like stats that are mutually exclusive.
 * __TCP_CHRONO_MAX is the terminating entry, not a chrono type itself.
 */
enum tcp_chrono {
        TCP_CHRONO_UNSPEC,
        TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */
        TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */
        TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */
        __TCP_CHRONO_MAX,
};

/* Start/stop the chronograph of the given @type on @sk (defined elsewhere). */
void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);

/* This helper is needed because skb->tcp_tsorted_anchor uses
 * the same memory storage as skb->destructor/_skb_refdst.
 * It resets both of those fields to their empty values.
 */
static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
{
        skb->destructor = NULL;
        skb->_skb_refdst = 0UL;
}

/*
 * tcp_skb_tsorted_save() / tcp_skb_tsorted_restore() must be used as a pair
 * within the same scope: save opens a block, stashes skb->_skb_refdst in a
 * block-local variable and zeroes the field; restore writes the stashed
 * value back and closes the block.
 *
 * The argument is parenthesized so that any pointer expression (not just a
 * plain identifier) expands correctly.
 */
#define tcp_skb_tsorted_save(skb) {                \
        unsigned long _save = (skb)->_skb_refdst;        \
        (skb)->_skb_refdst = 0UL;

#define tcp_skb_tsorted_restore(skb)                \
        (skb)->_skb_refdst = _save;                \
}

/* Defined elsewhere. NOTE(review): by its name, empties the write queue of
 * @sk - confirm.
 */
void tcp_write_queue_purge(struct sock *sk);

/* First skb of the retransmit rb-tree of @sk, as returned by skb_rb_first(). */
static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
{
        return skb_rb_first(&sk->tcp_rtx_queue);
}

/* Last skb of the retransmit rb-tree of @sk, as returned by skb_rb_last(). */
static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk)
{
        return skb_rb_last(&sk->tcp_rtx_queue);
}

/* Peek (without unlinking) at the last skb of the write queue of @sk. */
static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk)
{
        return skb_peek_tail(&sk->sk_write_queue);
}

/* Walk the write queue of @sk starting at @skb, using @tmp as the lookahead
 * cursor required by skb_queue_walk_from_safe().
 */
#define tcp_for_write_queue_from_safe(skb, tmp, sk)                        \
        skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)

/* Peek (without unlinking) at the first skb of the write queue of @sk. */
static inline struct sk_buff *tcp_send_head(const struct sock *sk)
{
        return skb_peek(&sk->sk_write_queue);
}

/* True when @skb is the last entry of the write queue of @sk. */
static inline bool tcp_skb_is_last(const struct sock *sk,
                                   const struct sk_buff *skb)
{
        return skb_queue_is_last(&sk->sk_write_queue, skb);
}

/**
 * tcp_write_queue_empty - test if any payload (or FIN) is available in write queue
 * @sk: socket
 *
 * Since the write queue can have a temporary empty skb in it,
 * we must not use "return skb_queue_empty(&sk->sk_write_queue)"
 */
static inline bool tcp_write_queue_empty(const struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        return tp->write_seq == tp->snd_nxt;
}

/* True when the retransmit rb-tree of @sk has no nodes. */
static inline bool tcp_rtx_queue_empty(const struct sock *sk)
{
        return RB_EMPTY_ROOT(&sk->tcp_rtx_queue);
}

/* True only when both the retransmit queue and the write queue of @sk are
 * empty; the write queue is not examined if the retransmit queue has data.
 */
static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk)
{
        if (!tcp_rtx_queue_empty(sk))
                return false;

        return tcp_write_queue_empty(sk);
}

/* Append @skb to the tail of the write queue of @sk. */
static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
{
        __skb_queue_tail(&sk->sk_write_queue, skb);

        /* If @skb is now also the first entry, the queue was empty before:
         * start the BUSY chronograph.
         */
        if (sk->sk_write_queue.next == skb)
                tcp_chrono_start(sk, TCP_CHRONO_BUSY);
}

/* Insert @new immediately before @skb on the write queue of @sk.
 * Note the argument order: __skb_queue_before() takes (list, next, newsk).
 */
static inline void tcp_insert_write_queue_before(struct sk_buff *new,
                                                  struct sk_buff *skb,
                                                  struct sock *sk)
{
        __skb_queue_before(&sk->sk_write_queue, skb, new);
}

/* Remove @skb from the write queue of @sk, first clearing the fields that
 * share storage with tcp_tsorted_anchor.
 */
static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
{
        tcp_skb_tsorted_anchor_cleanup(skb);
        __skb_unlink(skb, &sk->sk_write_queue);
}

/* Insert @skb into the rb-tree @root (defined elsewhere). */
void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb);

/* Erase @skb from the retransmit rb-tree of @sk, first clearing the fields
 * that share storage with tcp_tsorted_anchor.
 */
static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk)
{
        tcp_skb_tsorted_anchor_cleanup(skb);
        rb_erase(&skb->rbnode, &sk->tcp_rtx_queue);
}

/* Fully retire @skb from the retransmit queue of @sk and free it. */
static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk)
{
        /* Must precede the unlink: tcp_rtx_queue_unlink() clears the fields
         * that share storage with tcp_tsorted_anchor.
         */
        list_del(&skb->tcp_tsorted_anchor);
        tcp_rtx_queue_unlink(skb, sk);
        tcp_wmem_free_skb(sk, skb);
}

/* Mark the current tail of the write queue (if there is one) with the eor
 * flag in its control block. NOTE(review): per the function name this acts
 * as a fence against collapsing later data into that skb - confirm.
 */
static inline void tcp_write_collapse_fence(struct sock *sk)
{
        struct sk_buff *tail = tcp_write_queue_tail(sk);

        if (!tail)
                return;

        TCP_SKB_CB(tail)->eor = 1;
}

/* If the write queue of @sk is non-empty, push it out using the current MSS
 * and the socket's nonagle setting; otherwise do nothing.
 */
static inline void tcp_push_pending_frames(struct sock *sk)
{
        struct tcp_sock *tp;

        if (!tcp_send_head(sk))
                return;

        tp = tcp_sk(sk);
        __tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle);
}

/* Start sequence of the skb just after the highest skb with SACKed
 * bit, valid only if sacked_out > 0 or when the caller has ensured
 * validity by itself.
 */
static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp)
{
        if (!tp->sacked_out)
                return tp->snd_una;

        if (tp->highest_sack == NULL)
                return tp->snd_nxt;

        return TCP_SKB_CB(tp->highest_sack)->seq;
}

/* Move highest_sack of @sk to the skb that follows @skb in rb-tree order
 * (whatever skb_rb_next() returns for it).
 */
static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb)
{
        tcp_sk(sk)->highest_sack = skb_rb_next(skb);
}

/* Return the highest_sack skb pointer stored in the TCP socket @sk. */
static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
{
        return tcp_sk(sk)->highest_sack;
}

/* Reset highest_sack of @sk to the head of the retransmit queue. */
static inline void tcp_highest_sack_reset(struct sock *sk)
{
        tcp_sk(sk)->highest_sack = tcp_rtx_queue_head(sk);
}

/* Called when old skb is about to be deleted and replaced by new skb */
static inline void tcp_highest_sack_replace(struct sock *sk,
                                            struct sk_buff *old,
                                            struct sk_buff *new)
{
        if (old == tcp_highest_sack(sk))
                tcp_sk(sk)->highest_sack = new;
}

/* This helper checks if socket has IP_TRANSPARENT set.
 * Timewait and request sockets keep the information in their own
 * structures (tw_transparent / no_srccheck); every other state reads the
 * TRANSPARENT inet bit.
 */
static inline bool inet_sk_transparent(const struct sock *sk)
{
        const int state = sk->sk_state;

        if (state == TCP_TIME_WAIT)
                return inet_twsk(sk)->tw_transparent;
        if (state == TCP_NEW_SYN_RECV)
                return inet_rsk(inet_reqsk(sk))->no_srccheck;

        return inet_test_bit(TRANSPARENT, sk);
}

/* Determines whether this is a thin stream (which may suffer from
 * increased latency). Used to trigger latency-reducing mechanisms.
 * A stream is thin when fewer than 4 packets are in flight and the
 * connection is no longer in its initial slow start.
 */
static inline bool tcp_stream_is_thin(struct tcp_sock *tp)
{
        if (tp->packets_out >= 4)
                return false;

        return !tcp_in_initial_slowstart(tp);
}

/* /proc: which kind of socket the seq_file iterator is currently walking. */
enum tcp_seq_states {
        TCP_SEQ_STATE_LISTENING,
        TCP_SEQ_STATE_ESTABLISHED,
};

/* seq_file start/next/stop callbacks for the TCP /proc listing (defined
 * elsewhere).
 */
void *tcp_seq_start(struct seq_file *seq, loff_t *pos);
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos);
void tcp_seq_stop(struct seq_file *seq, void *v);

/* Address-family descriptor for the /proc iterator; holds only the family. */
struct tcp_seq_afinfo {
        sa_family_t                        family;
};

/* Private iterator state for the TCP /proc seq_file: the netns-aware
 * seq private header, the current tcp_seq_states value and position
 * bookkeeping. NOTE(review): exact meaning of bucket/offset/sbucket/num is
 * defined by the iterator implementation - confirm there.
 */
struct tcp_iter_state {
        struct seq_net_private        p;
        enum tcp_seq_states        state;
        struct sock                *syn_wait_sk;
        int                        bucket, offset, sbucket, num;
        loff_t                        last_pos;
};

/* request_sock operation tables for IPv4 and IPv6 (defined elsewhere). */
extern struct request_sock_ops tcp_request_sock_ops;
extern struct request_sock_ops tcp6_request_sock_ops;

void tcp_v4_destroy_sock(struct sock *sk);

/* TCP GSO/GRO entry points (defined elsewhere). The per-family GRO handlers
 * are declared through INDIRECT_CALLABLE_DECLARE().
 */
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
                                netdev_features_t features);
struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb);
struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th);
struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
                                struct tcphdr *th);
INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb));
#ifdef CONFIG_INET
void tcp_gro_complete(struct sk_buff *skb);
#else
/* Without CONFIG_INET GRO completion is a no-op. */
static inline void tcp_gro_complete(struct sk_buff *skb) { }
#endif

/* Defined elsewhere. NOTE(review): by its name, prepares the TCP checksum
 * of @skb for the given IPv4 address pair - confirm.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr);

/* Effective notsent_lowat for @tp: the per-socket value when it is non-zero,
 * otherwise the per-netns sysctl_tcp_notsent_lowat. Both are read with
 * READ_ONCE().
 */
static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
{
        struct net *net = sock_net((struct sock *)tp);
        u32 lowat = READ_ONCE(tp->notsent_lowat);

        if (lowat)
                return lowat;

        return READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat);
}

/* Defined elsewhere. */
bool tcp_stream_memory_free(const struct sock *sk, int wake);

#ifdef CONFIG_PROC_FS
/* Setup/teardown of the IPv4 TCP /proc entries; only with CONFIG_PROC_FS. */
int tcp4_proc_init(void);
void tcp4_proc_exit(void);
#endif

/* SYN-ACK retransmission and connection-request handling entry points
 * (defined elsewhere).
 */
int tcp_rtx_synack(const struct sock *sk, struct request_sock *req);
int tcp_conn_request(struct request_sock_ops *rsk_ops,
                     const struct tcp_request_sock_ops *af_ops,
                     struct sock *sk, struct sk_buff *skb);

/* TCP af-specific functions.
 * The table only has members when CONFIG_TCP_MD5SIG and/or CONFIG_TCP_AO
 * are enabled: MD5 lookup/hash/setsockopt parsing callbacks and their TCP-AO
 * counterparts.
 */
struct tcp_sock_af_ops {
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key        *(*md5_lookup) (const struct sock *sk,
                                                const struct sock *addr_sk);
        int                (*calc_md5_hash)(char *location,
                                         const struct tcp_md5sig_key *md5,
                                         const struct sock *sk,
                                         const struct sk_buff *skb);
        int                (*md5_parse)(struct sock *sk,
                                     int optname,
                                     sockptr_t optval,
                                     int optlen);
#endif
#ifdef CONFIG_TCP_AO
        int (*ao_parse)(struct sock *sk, int optname, sockptr_t optval, int optlen);
        struct tcp_ao_key *(*ao_lookup)(const struct sock *sk,
                                        struct sock *addr_sk,
                                        int sndid, int rcvid);
        int (*ao_calc_key_sk)(struct tcp_ao_key *mkt, u8 *key,
                              const struct sock *sk,
                              __be32 sisn, __be32 disn, bool send);
        int (*calc_ao_hash)(char *location, struct tcp_ao_key *ao,
                            const struct sock *sk, const struct sk_buff *skb,
                            const u8 *tkey, int hash_offset, u32 sne);
#endif
};

/* Per address family operations used while handling connection requests:
 * an MSS clamp, optional MD5 / TCP-AO / syncookie callbacks (each group only
 * present under its config option), and the always-present routing,
 * initial-sequence, timestamp-offset and SYN-ACK transmit callbacks.
 */
struct tcp_request_sock_ops {
        u16 mss_clamp;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk,
                                                 const struct sock *addr_sk);
        int                (*calc_md5_hash) (char *location,
                                          const struct tcp_md5sig_key *md5,
                                          const struct sock *sk,
                                          const struct sk_buff *skb);
#endif
#ifdef CONFIG_TCP_AO
        struct tcp_ao_key *(*ao_lookup)(const struct sock *sk,
                                        struct request_sock *req,
                                        int sndid, int rcvid);
        int (*ao_calc_key)(struct tcp_ao_key *mkt, u8 *key, struct request_sock *sk);
        int (*ao_synack_hash)(char *ao_hash, struct tcp_ao_key *mkt,
                              struct request_sock *req, const struct sk_buff *skb,
                              int hash_offset, u32 sne);
#endif
#ifdef CONFIG_SYN_COOKIES
        __u32 (*cookie_init_seq)(const struct sk_buff *skb,
                                 __u16 *mss);
#endif
        struct dst_entry *(*route_req)(const struct sock *sk,
                                       struct sk_buff *skb,
                                       struct flowi *fl,
                                       struct request_sock *req,
                                       u32 tw_isn);
        u32 (*init_seq)(const struct sk_buff *skb);
        u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb);
        int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
                           struct flowi *fl, struct request_sock *req,
                           struct tcp_fastopen_cookie *foc,
                           enum tcp_synack_type synack_type,
                           struct sk_buff *syn_skb);
};

/* Operation tables per address family; the IPv6 one only exists when
 * CONFIG_IPV6 is enabled.
 */
extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
#if IS_ENABLED(CONFIG_IPV6)
extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
#endif

#ifdef CONFIG_SYN_COOKIES
/* Generate a syncookie initial sequence number for @skb via
 * ops->cookie_init_seq(), after calling tcp_synq_overflow() on the listener
 * and bumping the SYNCOOKIESSENT counter.
 */
static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
                                         const struct sock *sk, struct sk_buff *skb,
                                         __u16 *mss)
{
        tcp_synq_overflow(sk);
        __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
        return ops->cookie_init_seq(skb, mss);
}
#else
/* !CONFIG_SYN_COOKIES stub: no cookie is generated, 0 is returned. */
static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
                                         const struct sock *sk, struct sk_buff *skb,
                                         __u16 *mss)
{
        return 0;
}
#endif

/* Signing key selected for a socket. @type tells which member of the
 * anonymous union is meaningful: the TCP-AO fields for TCP_KEY_AO, @md5_key
 * for TCP_KEY_MD5, none for TCP_KEY_NONE.
 */
struct tcp_key {
        union {
                struct {
                        struct tcp_ao_key *ao_key;
                        char *traffic_key;
                        u32 sne;
                        u8 rcv_next;
                };
                struct tcp_md5sig_key *md5_key;
        };
        enum {
                TCP_KEY_NONE = 0,
                TCP_KEY_MD5,
                TCP_KEY_AO,
        } type;
};

/* Fill @out with the signing key currently in effect for @sk.
 *
 * TCP-AO is tried first (when built in and its static key is on), then MD5;
 * if neither yields a key, @out->type is TCP_KEY_NONE. In the AO case only
 * @out->ao_key and @out->type are written here; the other AO fields are left
 * untouched.
 */
static inline void tcp_get_current_key(const struct sock *sk,
                                       struct tcp_key *out)
{
#if defined(CONFIG_TCP_AO) || defined(CONFIG_TCP_MD5SIG)
        const struct tcp_sock *tp = tcp_sk(sk);
#endif

#ifdef CONFIG_TCP_AO
        if (static_branch_unlikely(&tcp_ao_needed.key)) {
                struct tcp_ao_info *ao;

                /* The dereference is annotated as protected by the socket lock. */
                ao = rcu_dereference_protected(tp->ao_info,
                                               lockdep_sock_is_held(sk));
                if (ao) {
                        out->ao_key = READ_ONCE(ao->current_key);
                        out->type = TCP_KEY_AO;
                        return;
                }
        }
#endif
#ifdef CONFIG_TCP_MD5SIG
        if (static_branch_unlikely(&tcp_md5_needed.key) &&
            rcu_access_pointer(tp->md5sig_info)) {
                /* The socket itself is passed as the address socket. */
                out->md5_key = tp->af_specific->md5_lookup(sk, sk);
                if (out->md5_key) {
                        out->type = TCP_KEY_MD5;
                        return;
                }
        }
#endif
        out->type = TCP_KEY_NONE;
}

/* True when @key is an MD5 key; always false while static_branch_tcp_md5()
 * evaluates to false.
 */
static inline bool tcp_key_is_md5(const struct tcp_key *key)
{
        if (static_branch_tcp_md5())
                return key->type == TCP_KEY_MD5;
        return false;
}

/* True when @key is a TCP-AO key; always false while static_branch_tcp_ao()
 * evaluates to false.
 */
static inline bool tcp_key_is_ao(const struct tcp_key *key)
{
        if (static_branch_tcp_ao())
                return key->type == TCP_KEY_AO;
        return false;
}

/* Initialisation entry points (defined elsewhere). */
int tcpv4_offload_init(void);

void tcp_v4_init(void);
void tcp_init(void);

/* tcp_recovery.c: loss marking and RACK helpers. */
void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb);
void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced);
extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb,
                                u32 reo_wnd);
extern bool tcp_rack_mark_lost(struct sock *sk);
extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
                             u64 xmit_time);
extern void tcp_rack_reo_timeout(struct sock *sk);
extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs);

/* tcp_plb.c */

/*
 * Scaling factor for fractions in PLB. For example, tcp_plb_update_state
 * expects cong_ratio, which represents the fraction of traffic that
 * experienced congestion over a single RTT. In order to avoid floating point
 * operations, this fraction should be mapped to (1 << TCP_PLB_SCALE) and
 * passed in; with a scale of 8, a fraction of 1.0 corresponds to 256.
 */
#define TCP_PLB_SCALE 8

/* State for PLB (Protective Load Balancing) for a single TCP connection.
 * consec_cong_rounds is a 5-bit field; the remaining 3 bits of that byte
 * are unused.
 */
struct tcp_plb_state {
        u8        consec_cong_rounds:5, /* consecutive congested rounds */
                unused:3;
        u32        pause_until; /* jiffies32 when PLB can resume rerouting */
};

/* Reset the PLB state @plb: no congested rounds counted and no pause
 * deadline. @sk is not used by this helper.
 */
static inline void tcp_plb_init(const struct sock *sk,
                                struct tcp_plb_state *plb)
{
        plb->consec_cong_rounds = 0;
        plb->pause_until = 0;
}
/* PLB state machine entry points (tcp_plb.c). @cong_ratio is expected in
 * TCP_PLB_SCALE fixed point.
 */
void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb,
                          const int cong_ratio);
void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb);
void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb);

/* At how many usecs into the future should the RTO fire?
 *
 * The result is (timestamp of the head of the rtx queue + icsk_rto in
 * usecs) minus tcp_mstamp; it is negative when that instant has already
 * passed.
 *
 * If the rtx queue is empty there is no skb whose timestamp could be read,
 * so a full RTO is returned instead of dereferencing a NULL skb.
 */
static inline s64 tcp_rto_delta_us(const struct sock *sk)
{
        const struct sk_buff *skb = tcp_rtx_queue_head(sk);
        u32 rto = inet_csk(sk)->icsk_rto;
        u64 rto_time_stamp_us;

        if (!skb)
                return jiffies_to_usecs(rto);

        rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto);

        return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp;
}

/*
 * Save and compile IPv4 options, return a pointer to it.
 *
 * Returns NULL when @skb carries no options, when the GFP_ATOMIC allocation
 * fails, or when __ip_options_echo() reports an error (the buffer is freed
 * in that case). A non-NULL result is a kmalloc()ed buffer.
 */
static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net,
                                                         struct sk_buff *skb)
{
        const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
        struct ip_options_rcu *dopt;
        int opt_size;

        if (!opt->optlen)
                return NULL;

        opt_size = sizeof(*dopt) + opt->optlen;
        dopt = kmalloc(opt_size, GFP_ATOMIC);
        if (!dopt)
                return NULL;

        if (__ip_options_echo(net, &dopt->opt, skb, opt)) {
                kfree(dopt);
                return NULL;
        }

        return dopt;
}

/* Locally generated TCP pure ACKs have skb->truesize == 2
 * (check tcp_send_ack() in net/ipv4/tcp_output.c).
 * This is much faster than dissecting the packet to find out.
 * (Think of GRE encapsulations, IPv4, IPv6, ...)
 */
static inline bool skb_is_tcp_pure_ack(const struct sk_buff *skb)
{
        return skb->truesize == 2;
}

/* Tag @skb as a pure ACK by setting truesize to the marker value 2 that
 * skb_is_tcp_pure_ack() tests for.
 */
static inline void skb_set_tcp_pure_ack(struct sk_buff *skb)
{
        skb->truesize = 2;
}

/* Number of bytes queued for reading on @sk.
 *
 * - 0 while the socket is in SYN_SENT or SYN_RECV.
 * - Up to the urgent mark (urg_seq - copied_seq) when urgent data is pending
 *   out of line and the mark lies in [copied_seq, rcv_nxt).
 * - Otherwise rcv_nxt - copied_seq, minus one if a FIN was received.
 */
static inline int tcp_inq(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int answ;

        if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
                return 0;

        if (!sock_flag(sk, SOCK_URGINLINE) &&
            tp->urg_data &&
            !before(tp->urg_seq, tp->copied_seq) &&
            before(tp->urg_seq, tp->rcv_nxt)) {
                answ = tp->urg_seq - tp->copied_seq;
                return answ;
        }

        answ = tp->rcv_nxt - tp->copied_seq;

        /* Subtract 1, if FIN was received */
        if (answ && sock_flag(sk, SOCK_DONE))
                answ--;

        return answ;
}

/* Defined elsewhere. */
int tcp_peek_len(struct socket *sock);

/* Account the segments carried by the incoming @skb in @tp.
 * An skb counts for at least one segment (gso_segs may be 0). data_segs_in
 * is only bumped when the skb is longer than its TCP header.
 */
static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb)
{
        const u16 nsegs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);

        /* These counters are updated while other threads might be
         * reading them from tcp_get_info(), hence WRITE_ONCE().
         */
        WRITE_ONCE(tp->segs_in, tp->segs_in + nsegs);

        if (skb->len <= tcp_hdrlen(skb))
                return;

        WRITE_ONCE(tp->data_segs_in, tp->data_segs_in + nsegs);
}

/*
 * The TCP listen path runs lockless, and "struct sock" is const qualified
 * there so that no field gets modified by mistake.
 * sk_drops is an atomic_t, so it is safe to cast the const away for the
 * increment. The LISTENDROPS MIB counter is bumped as well.
 */
static inline void tcp_listendrop(const struct sock *sk)
{
        struct sock *writable_sk = (struct sock *)sk;

        atomic_inc(&writable_sk->sk_drops);
        __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
}

/* hrtimer callback (defined elsewhere). */
enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);

/*
 * Interface for adding Upper Level Protocols over TCP.
 *
 * A ULP name holds up to TCP_ULP_NAME_MAX bytes; TCP_ULP_BUF_MAX is the
 * space needed for TCP_ULP_MAX such names.
 */

#define TCP_ULP_NAME_MAX        16
#define TCP_ULP_MAX                128
#define TCP_ULP_BUF_MAX                (TCP_ULP_NAME_MAX*TCP_ULP_MAX)

/* Operations table describing one Upper Level Protocol. @list links the
 * table into a list (presumably the registry maintained by
 * tcp_register_ulp() - confirm); @name identifies the ULP and @owner is the
 * providing module.
 */
struct tcp_ulp_ops {
        struct list_head        list;

        /* initialize ulp */
        int (*init)(struct sock *sk);
        /* update ulp */
        void (*update)(struct sock *sk, struct proto *p,
                       void (*write_space)(struct sock *sk));
        /* cleanup ulp */
        void (*release)(struct sock *sk);
        /* diagnostic */
        int (*get_info)(struct sock *sk, struct sk_buff *skb);
        size_t (*get_info_size)(const struct sock *sk);
        /* clone ulp */
        void (*clone)(const struct request_sock *req, struct sock *newsk,
                      const gfp_t priority);

        char                name[TCP_ULP_NAME_MAX];
        struct module        *owner;
};
/* ULP registration, selection by name and lifecycle helpers (defined
 * elsewhere).
 */
int tcp_register_ulp(struct tcp_ulp_ops *type);
void tcp_unregister_ulp(struct tcp_ulp_ops *type);
int tcp_set_ulp(struct sock *sk, const char *name);
void tcp_get_available_ulp(char *buf, size_t len);
void tcp_cleanup_ulp(struct sock *sk);
void tcp_update_ulp(struct sock *sk, struct proto *p,
                    void (*write_space)(struct sock *sk));

/* Emit two module aliases for a ULP module: the bare @name and
 * "tcp-ulp-" followed by @name. @name must be a string literal because it
 * is concatenated at compile time.
 */
#define MODULE_ALIAS_TCP_ULP(name)                                \
        __MODULE_INFO(alias, alias_userspace, name);                \
        __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)

/* sk_msg / sockmap integration; only declared with CONFIG_NET_SOCK_MSG. */
#ifdef CONFIG_NET_SOCK_MSG
struct sk_msg;
struct sk_psock;

#ifdef CONFIG_BPF_SYSCALL
int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
#endif /* CONFIG_BPF_SYSCALL */

#ifdef CONFIG_INET
void tcp_eat_skb(struct sock *sk, struct sk_buff *skb);
#else
/* Without CONFIG_INET this is a no-op. */
static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
{
}
#endif

int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress,
                          struct sk_msg *msg, u32 bytes, int flags);
#endif /* CONFIG_NET_SOCK_MSG */

/* No-op stand-in for tcp_bpf_clone() when either CONFIG_BPF_SYSCALL or
 * CONFIG_NET_SOCK_MSG is disabled, i.e. whenever the real declaration above
 * is not visible.
 */
#if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG)
static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
{
}
#endif

#ifdef CONFIG_CGROUP_BPF
/* Attach @skb to the sock_ops context @skops and record the end of the
 * accessible data as skb->data + @end_offset.
 */
static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
                                      struct sk_buff *skb,
                                      unsigned int end_offset)
{
        skops->skb = skb;
        skops->skb_data_end = skb->data + end_offset;
}
#else
/* !CONFIG_CGROUP_BPF stub: nothing to record. */
static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
                                      struct sk_buff *skb,
                                      unsigned int end_offset)
{
}
#endif

/* Call BPF_SOCK_OPS program that returns an int. If the return value
 * is < 0, then the BPF op failed (for example if the loaded BPF
 * program does not support the chosen operation or there is no BPF
 * program loaded).
 */
#ifdef CONFIG_BPF
/* Run the BPF_SOCK_OPS program for @sk with operation @op.
 * @nargs/@args: number of u32 arguments and the arguments themselves; they
 * are copied into sock_ops.args when @nargs > 0.
 *
 * Returns the program's reply when the run reports 0, and -1 otherwise.
 */
static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
{
        struct bpf_sock_ops_kern sock_ops;

        /* Only the part of the context ahead of the 'temp' member is zeroed. */
        memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
        if (sk_fullsock(sk)) {
                sock_ops.is_fullsock = 1;
                sock_owned_by_me(sk);
        }

        sock_ops.sk = sk;
        sock_ops.op = op;
        if (nargs > 0)
                memcpy(sock_ops.args, args, nargs * sizeof(*args));

        if (BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops) != 0)
                return -1;

        return sock_ops.reply;
}

/* Convenience wrapper: tcp_call_bpf() with exactly two u32 arguments. */
static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
{
        u32 args[2] = {arg1, arg2};

        return tcp_call_bpf(sk, op, 2, args);
}

/* Convenience wrapper: tcp_call_bpf() with exactly three u32 arguments. */
static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
                                    u32 arg3)
{
        u32 args[3] = {arg1, arg2, arg3};

        return tcp_call_bpf(sk, op, 3, args);
}

#else
/* !CONFIG_BPF stubs: no program can run, so every call reports -EPERM
 * (a negative value, which callers treat as "BPF op failed").
 */
static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
{
        return -EPERM;
}

static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
{
        return -EPERM;
}

static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
                                    u32 arg3)
{
        return -EPERM;
}

#endif

/* Initial timeout for @sk: the value supplied by the
 * BPF_SOCK_OPS_TIMEOUT_INIT hook when it is positive, TCP_TIMEOUT_INIT
 * otherwise; in both cases capped at TCP_RTO_MAX.
 */
static inline u32 tcp_timeout_init(struct sock *sk)
{
        int timeout;

        timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL);

        /* A failed hook (negative) or a zero reply falls back to the default. */
        if (timeout <= 0)
                timeout = TCP_TIMEOUT_INIT;
        return min_t(int, timeout, TCP_RTO_MAX);
}

/* Initial receive window requested through the BPF_SOCK_OPS_RWND_INIT hook.
 * A negative hook result (failure) is reported as 0.
 */
static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
{
        int rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL);

        return rwnd < 0 ? 0 : rwnd;
}

/* True only when the BPF_SOCK_OPS_NEEDS_ECN hook replies exactly 1. */
static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
{
        return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
}

/* Report an RTT sample (@mrtt, @srtt) to the BPF_SOCK_OPS_RTT_CB hook, but
 * only for sockets that have BPF_SOCK_OPS_RTT_CB_FLAG set. The hook's return
 * value is ignored.
 */
static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt)
{
        if (!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG))
                return;

        tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RTT_CB, mrtt, srtt);
}

/* Static key declared only when SMC support is enabled. */
#if IS_ENABLED(CONFIG_SMC)
extern struct static_key_false tcp_have_smc;
#endif

/* Hooks letting a TLS device driver register a callback @cad that receives
 * the socket and an ack sequence number; only with CONFIG_TLS_DEVICE.
 */
#if IS_ENABLED(CONFIG_TLS_DEVICE)
void clean_acked_data_enable(struct inet_connection_sock *icsk,
                             void (*cad)(struct sock *sk, u32 ack_seq));
void clean_acked_data_disable(struct inet_connection_sock *icsk);
void clean_acked_data_flush(void);
#endif

DECLARE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
/* When the tcp_tx_delay_enabled static key is on, push the departure
 * timestamp of @skb into the future by tp->tcp_tx_delay, converted from
 * usecs to nsecs.
 */
static inline void tcp_add_tx_delay(struct sk_buff *skb,
                                    const struct tcp_sock *tp)
{
        if (static_branch_unlikely(&tcp_tx_delay_enabled))
                skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC;
}

/* Compute Earliest Departure Time for some control packets
 * like ACK or RST for TIME_WAIT or non ESTABLISHED sockets.
 *
 * Returns 0 while the tcp_tx_delay_enabled static key is off; otherwise the
 * current TCP clock (ns) plus the socket's tx delay. The delay is taken from
 * the timewait structure for TIME_WAIT sockets and from the tcp_sock
 * otherwise.
 */
static inline u64 tcp_transmit_time(const struct sock *sk)
{
        if (static_branch_unlikely(&tcp_tx_delay_enabled)) {
                u32 delay = (sk->sk_state == TCP_TIME_WAIT) ?
                        tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay;

                return tcp_clock_ns() + (u64)delay * NSEC_PER_USEC;
        }
        return 0;
}

/*
 * Locate the MD5 and TCP-AO options in @th.
 *
 * @md5_hash and @aoh are both optional; a NULL pointer means the caller is
 * not interested in that result.  On success 0 is returned and the
 * requested outputs are filled in (*@aoh is NULL when no AO option was
 * found).  A non-zero result from tcp_do_parse_auth_options() is passed
 * through and the outputs are left untouched.
 */
static inline int tcp_parse_auth_options(const struct tcphdr *th,
                const u8 **md5_hash, const struct tcp_ao_hdr **aoh)
{
        const u8 *md5_opt, *ao_opt;
        int err;

        err = tcp_do_parse_auth_options(th, &md5_opt, &ao_opt);
        if (err)
                return err;

        if (md5_hash)
                *md5_hash = md5_opt;

        /*
         * The AO header pointer is placed two bytes before the position
         * reported by the parser.
         * NOTE(review): presumably this steps back over the option
         * kind/length bytes -- confirm in tcp_do_parse_auth_options().
         */
        if (aoh)
                *aoh = ao_opt ? (const struct tcp_ao_hdr *)(ao_opt - 2) : NULL;

        return 0;
}

/*
 * Decide whether TCP-AO must be used for traffic from @saddr on @sk.
 *
 * Always false when CONFIG_TCP_AO is off, when the tcp_ao_needed static
 * key is off, or when the socket has no ao_info.  Otherwise true if the
 * socket is flagged ao_required or a key matching (@l3index, @saddr,
 * @family) exists.  When the answer is true and @stat_inc is set, both the
 * LINUX_MIB_TCPAOREQUIRED MIB counter and the per-socket ao_required
 * counter are incremented.
 */
static inline bool tcp_ao_required(struct sock *sk, const void *saddr,
                                   int family, int l3index, bool stat_inc)
{
#ifdef CONFIG_TCP_AO
        struct tcp_ao_info *ao_info;
        struct tcp_ao_key *ao_key;

        if (!static_branch_unlikely(&tcp_ao_needed.key))
                return false;

        ao_info = rcu_dereference_check(tcp_sk(sk)->ao_info,
                                        lockdep_sock_is_held(sk));
        if (!ao_info)
                return false;

        /* The key lookup runs regardless of the ao_required flag. */
        ao_key = tcp_ao_do_lookup(sk, l3index, saddr, family, -1, -1);
        if (!ao_info->ao_required && !ao_key)
                return false;

        if (stat_inc) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOREQUIRED);
                atomic64_inc(&ao_info->counters.ao_required);
        }
        return true;
#else
        return false;
#endif
}

/*
 * Out-of-line helper, defined outside this header; the result is reported
 * as an skb drop reason.
 * NOTE(review): presumably validates the MD5/AO signature of an inbound
 * segment -- confirm against the implementation.
 */
enum skb_drop_reason tcp_inbound_hash(struct sock *sk,
                const struct request_sock *req, const struct sk_buff *skb,
                const void *saddr, const void *daddr,
                int family, int dif, int sdif);

#endif        /* _TCP_H */







































































































































    1 





















    3 







    1 





    1 
























    1 







    3 


    3 























    1 



    1 


    1 

























































    3 
    3 



    3 




    3 
    3 



    3 





    3 
    3 




    3 






    3 














































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/stat.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/hash.h>
#include <linux/kmemleak.h>
#include <linux/user_namespace.h>

/*
 * Statically allocated ucounts for GLOBAL_ROOT_UID in init_user_ns.  It
 * starts with a reference count of 1 and is inserted into the hash table
 * by user_namespace_sysctl_init().
 */
struct ucounts init_ucounts = {
        .ns    = &init_user_ns,
        .uid   = GLOBAL_ROOT_UID,
        .count = ATOMIC_INIT(1),
};

/* The ucounts hash table has 2^UCOUNTS_HASHTABLE_BITS buckets. */
#define UCOUNTS_HASHTABLE_BITS 10
static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
/* Taken around every lookup, insertion and removal on ucounts_hashtable. */
static DEFINE_SPINLOCK(ucounts_lock);

/* Bucket index derived from the uid value plus the namespace address. */
#define ucounts_hashfn(ns, uid)                                                \
        hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \
                  UCOUNTS_HASHTABLE_BITS)
/* Pointer to the bucket head for an (ns, uid) pair. */
#define ucounts_hashentry(ns, uid)        \
        (ucounts_hashtable + ucounts_hashfn(ns, uid))


#ifdef CONFIG_SYSCTL
static struct ctl_table_set *
set_lookup(struct ctl_table_root *root)
{
        return &current_user_ns()->set;
}

/*
 * A table set is visible only to tasks whose current user namespace owns
 * it.  Returns non-zero when @set is the caller's own set.
 */
static int set_is_seen(struct ctl_table_set *set)
{
        struct ctl_table_set *own_set = &current_user_ns()->set;

        return own_set == set;
}

/*
 * ctl_table_root ->permissions callback.
 *
 * Tasks holding CAP_SYS_RESOURCE in the user namespace that owns the table
 * set get the table's owner permission bits; everyone else gets at most
 * the "other" read bit.  The resulting three-bit value is replicated into
 * the user, group and other positions of the returned mode.
 */
static int set_permissions(struct ctl_table_header *head,
                           const struct ctl_table *table)
{
        struct user_namespace *user_ns;
        int perm;

        user_ns = container_of(head->set, struct user_namespace, set);

        perm = ns_capable(user_ns, CAP_SYS_RESOURCE) ?
                (table->mode & S_IRWXU) >> 6 :
                table->mode & S_IROTH;

        return (perm << 6) | (perm << 3) | perm;
}

/*
 * sysctl root used for the per-user-namespace table sets; passed to
 * setup_sysctl_set() in setup_userns_sysctls().
 */
static struct ctl_table_root set_root = {
        .lookup = set_lookup,
        .permissions = set_permissions,
};

/* Lower and upper bounds shared by every UCOUNT_ENTRY() sysctl. */
static long ue_zero = 0;
static long ue_int_max = INT_MAX;

/*
 * Template for one sysctl entry: a long-sized value, mode 0644, handled by
 * proc_doulongvec_minmax and bounded to [ue_zero, ue_int_max].  The .data
 * pointer is not set here; setup_userns_sysctls() fills it in per
 * namespace.
 */
#define UCOUNT_ENTRY(name)                                        \
        {                                                        \
                .procname        = name,                                \
                .maxlen                = sizeof(long),                        \
                .mode                = 0644,                                \
                .proc_handler        = proc_doulongvec_minmax,        \
                .extra1                = &ue_zero,                        \
                .extra2                = &ue_int_max,                        \
        }
/*
 * Template table that setup_userns_sysctls() duplicates for every user
 * namespace.  Entry i is bound to ns->ucount_max[i], so the order here has
 * to follow the ucount index order, and a BUILD_BUG_ON() in
 * setup_userns_sysctls() checks that the entry count equals UCOUNT_COUNTS.
 */
static struct ctl_table user_table[] = {
        UCOUNT_ENTRY("max_user_namespaces"),
        UCOUNT_ENTRY("max_pid_namespaces"),
        UCOUNT_ENTRY("max_uts_namespaces"),
        UCOUNT_ENTRY("max_ipc_namespaces"),
        UCOUNT_ENTRY("max_net_namespaces"),
        UCOUNT_ENTRY("max_mnt_namespaces"),
        UCOUNT_ENTRY("max_cgroup_namespaces"),
        UCOUNT_ENTRY("max_time_namespaces"),
#ifdef CONFIG_INOTIFY_USER
        UCOUNT_ENTRY("max_inotify_instances"),
        UCOUNT_ENTRY("max_inotify_watches"),
#endif
#ifdef CONFIG_FANOTIFY
        UCOUNT_ENTRY("max_fanotify_groups"),
        UCOUNT_ENTRY("max_fanotify_marks"),
#endif
};
#endif /* CONFIG_SYSCTL */

/*
 * Register the "user" sysctl directory for @ns.
 *
 * A private copy of user_table is made, each entry is pointed at the
 * matching slot of ns->ucount_max[], and the copy is registered in the
 * namespace's own table set.  Returns true on success.  If ns->sysctls is
 * still unset afterwards the copy is freed, the set is retired and false
 * is returned.  Without CONFIG_SYSCTL this is a no-op returning true.
 */
bool setup_userns_sysctls(struct user_namespace *ns)
{
#ifdef CONFIG_SYSCTL
        struct ctl_table *copy;
        int idx;

        BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS);
        setup_sysctl_set(&ns->set, &set_root, set_is_seen);

        copy = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
        if (copy) {
                for (idx = 0; idx < UCOUNT_COUNTS; idx++)
                        copy[idx].data = &ns->ucount_max[idx];

                ns->sysctls = __register_sysctl_table(&ns->set, "user", copy,
                                                      ARRAY_SIZE(user_table));
        }

        if (ns->sysctls)
                return true;

        /* Allocation or registration failed: undo what was set up. */
        kfree(copy);
        retire_sysctl_set(&ns->set);
        return false;
#else
        return true;
#endif
}

/*
 * Undo setup_userns_sysctls(): unregister the namespace's sysctl table,
 * retire its table set and free the duplicated table.  No-op without
 * CONFIG_SYSCTL.
 */
void retire_userns_sysctls(struct user_namespace *ns)
{
#ifdef CONFIG_SYSCTL
        /* Fetch the table pointer before the header is unregistered. */
        const struct ctl_table *table = ns->sysctls->ctl_table_arg;

        unregister_sysctl_table(ns->sysctls);
        retire_sysctl_set(&ns->set);
        kfree(table);
#endif
}

/*
 * Walk the bucket @hashent looking for the entry that belongs to @uid in
 * @ns.  Returns the entry, or NULL when there is none.  No reference is
 * taken on the returned entry.
 */
static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent)
{
        struct ucounts *pos;

        hlist_for_each_entry(pos, hashent, node) {
                if (pos->ns != ns)
                        continue;
                if (!uid_eq(pos->uid, uid))
                        continue;
                return pos;
        }

        return NULL;
}

/*
 * Insert @ucounts at the head of the bucket selected by its namespace and
 * uid, holding ucounts_lock for the insertion.
 */
static void hlist_add_ucounts(struct ucounts *ucounts)
{
        struct hlist_head *bucket;

        bucket = ucounts_hashentry(ucounts->ns, ucounts->uid);

        spin_lock_irq(&ucounts_lock);
        hlist_add_head(&ucounts->node, bucket);
        spin_unlock_irq(&ucounts_lock);
}

static inline bool get_ucounts_or_wrap(struct ucounts *ucounts)
{
        /* Returns true on a successful get, false if the count wraps. */
        return !atomic_add_negative(1, &ucounts->count);
}

/*
 * Take a reference on @ucounts.
 *
 * Returns @ucounts on success.  If the reference count wrapped, the
 * reference just taken is dropped again and NULL is returned.
 */
struct ucounts *get_ucounts(struct ucounts *ucounts)
{
        if (get_ucounts_or_wrap(ucounts))
                return ucounts;

        put_ucounts(ucounts);
        return NULL;
}

/*
 * Find or create the ucounts entry for (@ns, @uid) and return it with a
 * reference held by the caller.
 *
 * Returns NULL if a new entry could not be allocated or if the reference
 * count of an existing entry wrapped.
 */
struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
{
        struct hlist_head *hashent = ucounts_hashentry(ns, uid);
        struct ucounts *ucounts, *new;
        bool wrapped;

        spin_lock_irq(&ucounts_lock);
        ucounts = find_ucounts(ns, uid, hashent);
        if (!ucounts) {
                /* The GFP_KERNEL allocation is done with the lock dropped. */
                spin_unlock_irq(&ucounts_lock);

                new = kzalloc(sizeof(*new), GFP_KERNEL);
                if (!new)
                        return NULL;

                new->ns = ns;
                new->uid = uid;
                atomic_set(&new->count, 1);

                /*
                 * Look again after retaking the lock: another entry for the
                 * same (ns, uid) may have been inserted while it was dropped.
                 */
                spin_lock_irq(&ucounts_lock);
                ucounts = find_ucounts(ns, uid, hashent);
                if (ucounts) {
                        kfree(new);
                } else {
                        hlist_add_head(&new->node, hashent);
                        /* The hashed entry holds a reference on its namespace. */
                        get_user_ns(new->ns);
                        spin_unlock_irq(&ucounts_lock);
                        return new;
                }
        }
        /* Existing entry: take the caller's reference while still locked. */
        wrapped = !get_ucounts_or_wrap(ucounts);
        spin_unlock_irq(&ucounts_lock);
        if (wrapped) {
                /* Undo the increment that wrapped the counter. */
                put_ucounts(ucounts);
                return NULL;
        }
        return ucounts;
}

/*
 * Drop one reference on @ucounts.
 *
 * When the last reference goes away the entry is unhashed under
 * ucounts_lock, then its namespace reference is released and the entry is
 * freed.
 */
void put_ucounts(struct ucounts *ucounts)
{
        unsigned long flags;

        /* Not the final reference: nothing else to do (lock not taken). */
        if (!atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags))
                return;

        hlist_del_init(&ucounts->node);
        spin_unlock_irqrestore(&ucounts_lock, flags);

        put_user_ns(ucounts->ns);
        kfree(ucounts);
}

/*
 * Atomically increment *@v, but only while its value is below @u.
 *
 * Returns true if the increment was applied and false if the current value
 * was already >= @u (in which case *@v is left unchanged).
 *
 * @u is a long: the counter itself is an atomic_long_t and the caller in
 * this file passes a long limit, so the comparison must be done at full
 * width rather than against a limit narrowed to int.
 */
static inline bool atomic_long_inc_below(atomic_long_t *v, long u)
{
        long c, old;

        c = atomic_long_read(v);
        for (;;) {
                if (unlikely(c >= u))
                        return false;
                /* Only succeeds if nobody changed *v since it was read. */
                old = atomic_long_cmpxchg(v, c, c+1);
                if (likely(old == c))
                        return true;
                /* Lost a race: retry against the value actually observed. */
                c = old;
        }
}

/*
 * Charge one object of @type to @uid in @ns and to the ucounts of every
 * ancestor level reachable through ns->ucounts.
 *
 * Each level is incremented only while it is below that level's
 * ucount_max[@type].  On success the referenced ucounts for (@ns, @uid) is
 * returned; the caller releases it with dec_ucount().  If any level is at
 * its limit, the increments already made are rolled back, the reference is
 * dropped and NULL is returned.  NULL is also returned when the ucounts
 * could not be obtained.
 */
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
                           enum ucount_type type)
{
        struct ucounts *head, *cur, *stop;

        head = alloc_ucounts(ns, uid);

        cur = head;
        while (cur) {
                struct user_namespace *owner = cur->ns;
                long limit = READ_ONCE(owner->ucount_max[type]);

                if (!atomic_long_inc_below(&cur->ucount[type], limit))
                        goto rollback;

                cur = owner->ucounts;
        }
        return head;

rollback:
        /* Undo every level charged before the one that refused. */
        stop = cur;
        for (cur = head; cur != stop; cur = cur->ns->ucounts)
                atomic_long_dec(&cur->ucount[type]);

        put_ucounts(head);
        return NULL;
}

/*
 * Reverse of inc_ucount(): uncharge one object of @type at every level of
 * the chain starting at @ucounts, then drop the reference on @ucounts.
 * A warning is raised (once) if a level's counter was not positive.
 */
void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
{
        struct ucounts *cur = ucounts;

        while (cur) {
                long left = atomic_long_dec_if_positive(&cur->ucount[type]);

                WARN_ON_ONCE(left < 0);
                cur = cur->ns->ucounts;
        }

        put_ucounts(ucounts);
}

/*
 * Add @v to the @type rlimit counter at every level of the chain starting
 * at @ucounts.
 *
 * The first level is checked against LONG_MAX; each later level is checked
 * against the rlimit maximum of the namespace of the level before it.
 * Returns LONG_MAX if any level overflowed or exceeded its bound,
 * otherwise the new value of the first level's counter.  Counters are
 * never rolled back here.
 */
long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
{
        struct ucounts *cur = ucounts;
        long limit = LONG_MAX;
        long result = 0;

        while (cur) {
                long sum = atomic_long_add_return(v, &cur->rlimit[type]);

                if (sum < 0 || sum > limit)
                        result = LONG_MAX;
                else if (cur == ucounts)
                        result = sum;

                /* The bound for the next level comes from this namespace. */
                limit = get_userns_rlimit_max(cur->ns, type);
                cur = cur->ns->ucounts;
        }

        return result;
}

/*
 * Subtract @v from the @type rlimit counter at every level of the chain
 * starting at @ucounts, warning (once) if a counter goes negative.
 *
 * Returns true when the first level's counter dropped to exactly zero.
 */
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
{
        struct ucounts *cur = ucounts;
        long first = -1; /* placeholder until the first level is processed */

        while (cur) {
                long left = atomic_long_sub_return(v, &cur->rlimit[type]);

                WARN_ON_ONCE(left < 0);
                if (cur == ucounts)
                        first = left;

                cur = cur->ns->ucounts;
        }

        return first == 0;
}

/*
 * Decrement the @type rlimit counter by one on every level from @ucounts
 * up to, but not including, @last.  A level whose counter reaches zero
 * also has a ucounts reference dropped.
 */
static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
                                struct ucounts *last, enum rlimit_type type)
{
        struct ucounts *cur = ucounts;

        while (cur != last) {
                struct ucounts *parent;
                long left = atomic_long_sub_return(1, &cur->rlimit[type]);

                WARN_ON_ONCE(left < 0);

                /* Read the next level first: put_ucounts() may free cur. */
                parent = cur->ns->ucounts;
                if (left == 0)
                        put_ucounts(cur);

                cur = parent;
        }
}

/*
 * Decrement the @type rlimit counter by one on every level of the chain
 * starting at @ucounts (walks until the chain ends), dropping a ucounts
 * reference on each level whose counter reaches zero.
 */
void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type)
{
        do_dec_rlimit_put_ucounts(ucounts, NULL, type);
}

/*
 * Charge one unit of @type on every level of the chain starting at
 * @ucounts.  A level whose counter goes from 0 to 1 additionally gets a
 * ucounts reference taken on it.
 *
 * The first level is checked against LONG_MAX; each later level is checked
 * against the rlimit maximum of the namespace of the level before it.
 *
 * Returns the new counter value of the first level on success.  Returns 0
 * if any level overflowed, exceeded its bound, or could not be referenced;
 * in that case every charge made by this call, including the one on the
 * failing level itself, has been undone.
 */
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
{
        /* Caller must hold a reference to ucounts */
        struct ucounts *iter;
        long max = LONG_MAX;
        long dec, ret = 0;

        for (iter = ucounts; iter; iter = iter->ns->ucounts) {
                long new = atomic_long_add_return(1, &iter->rlimit[type]);

                /*
                 * @iter has already been charged above, so the failure
                 * path has to uncharge it too.  The rollback helper stops
                 * *before* @iter, hence the jump to dec_unwind; skipping
                 * that step would leak one count on this level.
                 */
                if (new < 0 || new > max)
                        goto dec_unwind;
                if (iter == ucounts)
                        ret = new;
                max = get_userns_rlimit_max(iter->ns, type);
                /*
                 * Grab an extra ucount reference for the caller when
                 * the rlimit count was previously 0.
                 */
                if (new != 1)
                        continue;
                if (!get_ucounts(iter))
                        goto dec_unwind;
        }
        return ret;
dec_unwind:
        /* Undo the charge on the failing level (no reference was kept). */
        dec = atomic_long_sub_return(1, &iter->rlimit[type]);
        WARN_ON_ONCE(dec < 0);
        /* Then undo the charges and references on all earlier levels. */
        do_dec_rlimit_put_ucounts(ucounts, iter, type);
        return 0;
}

/*
 * Check whether the @type rlimit counter is over its bound on any level of
 * the chain starting at @ucounts.
 *
 * The first level is compared against @rlimit (clamped to LONG_MAX); each
 * later level against the rlimit maximum of the namespace of the level
 * before it.  A negative counter value also counts as over the limit.
 */
bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit)
{
        struct ucounts *cur = ucounts;
        long limit = rlimit > LONG_MAX ? LONG_MAX : (long)rlimit;

        while (cur) {
                long used = get_rlimit_value(cur, type);

                if (used < 0 || used > limit)
                        return true;

                limit = get_userns_rlimit_max(cur->ns, type);
                cur = cur->ns->ucounts;
        }

        return false;
}

/*
 * Boot-time setup, run as a subsys initcall: registers the "user" sysctl
 * directory and the init_user_ns tables (with CONFIG_SYSCTL), hashes
 * init_ucounts and charges one UCOUNT_RLIMIT_NPROC unit to it.
 * Registration failures are fatal (BUG_ON).
 */
static __init int user_namespace_sysctl_init(void)
{
#ifdef CONFIG_SYSCTL
        static struct ctl_table_header *user_header;
        static struct ctl_table empty[1];
        /*
         * It is necessary to register the user directory in the
         * default set so that registrations in the child sets work
         * properly.
         */
        user_header = register_sysctl_sz("user", empty, 0);
        /* The header is kept for good; tell kmemleak not to report it. */
        kmemleak_ignore(user_header);
        BUG_ON(!user_header);
        BUG_ON(!setup_userns_sysctls(&init_user_ns));
#endif
        hlist_add_ucounts(&init_ucounts);
        /* NOTE(review): presumably accounts for the boot task -- confirm. */
        inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1);
        return 0;
}
subsys_initcall(user_namespace_sysctl_init);




























































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * kernfs.h - pseudo filesystem decoupled from vfs locking
 */

#ifndef __LINUX_KERNFS_H
#define __LINUX_KERNFS_H

#include <linux/err.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/idr.h>
#include <linux/lockdep.h>
#include <linux/rbtree.h>
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/types.h>
#include <linux/uidgid.h>
#include <linux/wait.h>
#include <linux/rwsem.h>
#include <linux/cache.h>

struct file;
struct dentry;
struct iattr;
struct seq_file;
struct vm_area_struct;
struct vm_operations_struct;
struct super_block;
struct file_system_type;
struct poll_table_struct;
struct fs_context;

struct kernfs_fs_context;
struct kernfs_open_node;
struct kernfs_iattrs;

/*
 * NR_KERNFS_LOCK_BITS determines size (NR_KERNFS_LOCKS) of hash
 * table of locks.
 * Having a small hash table would impact scalability, since
 * more and more kernfs_node objects will end up using same lock
 * and having a very large hash table would waste memory.
 *
 * At the moment size of hash table of locks is being set based on
 * the number of CPUs as follows:
 *
 * NR_CPU      NR_KERNFS_LOCK_BITS      NR_KERNFS_LOCKS
 *   1                  1                       2
 *  2-3                 2                       4
 *  4-7                 4                       16
 *  8-15                6                       64
 *  16-31               8                       256
 *  32 and more         10                      1024
 *
 * The above relation between NR_CPU and number of locks is based
 * on some internal experimentation which involved booting qemu
 * with different values of smp, performing some sysfs operations
 * on all CPUs and observing how increase in number of locks impacts
 * completion time of these sysfs operations on each CPU.
 */
#ifdef CONFIG_SMP
/* Twice log2 of the CPU count, with the CPU count capped at 32. */
#define NR_KERNFS_LOCK_BITS (2 * (ilog2(NR_CPUS < 32 ? NR_CPUS : 32)))
#else
/* Non-SMP build: a single bit, i.e. two locks. */
#define NR_KERNFS_LOCK_BITS     1
#endif

/* Number of hashed locks: 2^NR_KERNFS_LOCK_BITS. */
#define NR_KERNFS_LOCKS     (1 << NR_KERNFS_LOCK_BITS)

/*
 * There's one kernfs_open_file for each open file and one kernfs_open_node
 * for each kernfs_node with one or more open files.
 *
 * filp->private_data points to seq_file whose ->private points to
 * kernfs_open_file.
 *
 * kernfs_open_files are chained at kernfs_open_node->files, which is
 * protected by kernfs_global_locks.open_file_mutex[i].
 *
 * To reduce possible contention in sysfs access, arising due to single
 * locks, use an array of locks (e.g. open_file_mutex) and use kernfs_node
 * object address as hash keys to get the index of these locks.
 *
 * Hashed mutexes are safe to use here because operations using these don't
 * rely on global exclusion.
 *
 * In future we intend to replace other global locks with hashed ones as well.
 * kernfs_global_locks acts as a holder for all such hash tables.
 */
struct kernfs_global_locks {
        /* Array of NR_KERNFS_LOCKS mutexes, one per hash slot. */
        struct mutex open_file_mutex[NR_KERNFS_LOCKS];
};

/*
 * Node type.  Kept in the bits of kernfs_node.flags selected by
 * KERNFS_TYPE_MASK and extracted by kernfs_type(); one bit per type.
 */
enum kernfs_node_type {
        KERNFS_DIR                = 1 << 0,
        KERNFS_FILE                = 1 << 1,
        KERNFS_LINK                = 1 << 2,
};

/* Bits of kernfs_node.flags that carry the node type. */
#define KERNFS_TYPE_MASK                0x000f
/* Complement of the type bits; parenthesised so it expands as one operand. */
#define KERNFS_FLAG_MASK                (~KERNFS_TYPE_MASK)
/*
 * Limits for user xattrs.
 * NOTE(review): presumably a per-node count and a total size in bytes
 * (128 KiB) -- the enforcement code is outside this header.
 */
#define KERNFS_MAX_USER_XATTRS                128
#define KERNFS_USER_XATTR_SIZE_LIMIT        (128 << 10)

/*
 * Per-node flag bits.  They share kernfs_node.flags with the node type and
 * therefore start at bit 4, just above the 0x000f type bits.
 */
enum kernfs_node_flag {
        KERNFS_ACTIVATED        = 1 << 4,
        KERNFS_NS                = 1 << 5,
        KERNFS_HAS_SEQ_SHOW        = 1 << 6,
        KERNFS_HAS_MMAP                = 1 << 7,
        KERNFS_LOCKDEP                = 1 << 8,
        KERNFS_HIDDEN                = 1 << 9,
        KERNFS_SUICIDAL                = 1 << 10,
        KERNFS_SUICIDED                = 1 << 11,
        KERNFS_EMPTY_DIR        = 1 << 12,
        KERNFS_HAS_RELEASE        = 1 << 13,
        KERNFS_REMOVING                = 1 << 14,
};

/* Bit flags accepted in the @flags argument of kernfs_create_root(). */
enum kernfs_root_flag {
        /*
         * Create kernfs_nodes deactivated and invisible; an explicit
         * kernfs_activate() is needed to make them visible.  Useful for
         * making a group of related nodes appear atomically once all of
         * them have been created successfully.
         */
        KERNFS_ROOT_CREATE_DEACTIVATED                = 1 << 0,

        /*
         * Normally open(2) on a regular file succeeds for an opener with
         * CAP_DAC_OVERRIDE regardless of the RW permissions.  sysfs added
         * an extra check: open(2) fails with -EACCES, even with
         * CAP_DAC_OVERRIDE, if the file has no read (S_IRUGO) or no write
         * (S_IWUGO) permission bit at all for the requested access, or if
         * the corresponding operation is not implemented.  This flag turns
         * that behavior on.
         */
        KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK        = 1 << 1,

        /*
         * The filesystem supports the exportfs operation, so userspace
         * can reach its nodes through file handles.
         */
        KERNFS_ROOT_SUPPORT_EXPORTOP                = 1 << 2,

        /*
         * Allow user xattrs to be written to nodes under this root.
         */
        KERNFS_ROOT_SUPPORT_USER_XATTR                = 1 << 3,
};

/* type-specific structures for kernfs_node union members */

/* Directory-specific part of a kernfs_node (the union's "dir" member). */
struct kernfs_elem_dir {
        /* NOTE(review): presumably the number of child directories -- confirm. */
        unsigned long                subdirs;
        /* children rbtree starts here and goes through kn->rb */
        struct rb_root                children;

        /*
         * The kernfs hierarchy this directory belongs to.  This fits
         * better directly in kernfs_node but is here to save space.
         */
        struct kernfs_root        *root;
        /*
         * Monotonic revision counter, used to identify if a directory
         * node has changed during negative dentry revalidation.
         */
        unsigned long                rev;
};

struct kernfs_elem_symlink {
        struct kernfs_node        *target_kn;
};

/* File-specific part of a kernfs_node (the union's "attr" member). */
struct kernfs_elem_attr {
        /* Callbacks implementing this file; see struct kernfs_ops. */
        const struct kernfs_ops        *ops;
        /* RCU-managed pointer to the per-node open state. */
        struct kernfs_open_node __rcu        *open;
        loff_t                        size;
        struct kernfs_node        *notify_next;        /* for kernfs_notify() */
};

/*
 * kernfs_node - the building block of kernfs hierarchy.  Each and every
 * kernfs node is represented by single kernfs_node.  Most fields are
 * private to kernfs and shouldn't be accessed directly by kernfs users.
 *
 * As long as count reference is held, the kernfs_node itself is
 * accessible.  Dereferencing elem or any other outer entity requires
 * active reference.
 */
struct kernfs_node {
        /* The "count" and "active" references described above. */
        atomic_t                count;
        atomic_t                active;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map        dep_map;
#endif
        /*
         * Use kernfs_get_parent() and kernfs_name/path() instead of
         * accessing the following two fields directly.  If the node is
         * never moved to a different parent, it is safe to access the
         * parent directly.
         */
        struct kernfs_node        *parent;
        const char                *name;

        /* Links this node into its parent's dir.children rbtree. */
        struct rb_node                rb;

        const void                *ns;        /* namespace tag */
        unsigned int                hash;        /* ns + name hash */
        /* Node type (KERNFS_TYPE_MASK bits) plus kernfs_node_flag bits. */
        unsigned short                flags;
        umode_t                        mode;

        /*
         * Type-specific data.
         * NOTE(review): presumably the member in use follows the node type
         * (dir / symlink / attr for KERNFS_DIR / KERNFS_LINK / KERNFS_FILE).
         */
        union {
                struct kernfs_elem_dir                dir;
                struct kernfs_elem_symlink        symlink;
                struct kernfs_elem_attr                attr;
        };

        /*
         * 64bit unique ID.  On 64bit ino setups, id is the ino.  On 32bit,
         * the low 32bits are ino and upper generation.
         */
        u64                        id;

        void                        *priv;
        struct kernfs_iattrs        *iattr;

        struct rcu_head                rcu;
};

/*
 * kernfs_syscall_ops may be specified on kernfs_create_root() to support
 * syscalls.  These optional callbacks are invoked on the matching syscalls
 * and can perform any kernfs operations which don't necessarily have to be
 * the exact operation requested.  An active reference is held for each
 * kernfs_node parameter.
 *
 * Every callback returns an int.
 * NOTE(review): presumably 0 on success and a negative errno on failure --
 * confirm against the callers.
 */
struct kernfs_syscall_ops {
        int (*show_options)(struct seq_file *sf, struct kernfs_root *root);

        int (*mkdir)(struct kernfs_node *parent, const char *name,
                     umode_t mode);
        int (*rmdir)(struct kernfs_node *kn);
        int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent,
                      const char *new_name);
        int (*show_path)(struct seq_file *sf, struct kernfs_node *kn,
                         struct kernfs_root *root);
};

/*
 * Maps a kernfs_root to a kernfs_node; defined outside this header.
 * NOTE(review): presumably returns the root's top-level directory node.
 */
struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root);

/*
 * Per-open-file state; this is what the kernfs_ops callbacks receive as
 * their @of argument.
 */
struct kernfs_open_file {
        /* published fields */
        struct kernfs_node        *kn;
        struct file                *file;
        struct seq_file                *seq_file;
        void                        *priv;

        /* private fields, do not use outside kernfs proper */
        struct mutex                mutex;
        struct mutex                prealloc_mutex;
        int                        event;
        /* Chains this open file on kernfs_open_node->files. */
        struct list_head        list;
        char                        *prealloc_buf;

        size_t                        atomic_write_len;
        bool                        mmapped:1;
        bool                        released:1;
        const struct vm_operations_struct *vm_ops;
};

/*
 * Callbacks and settings that implement a kernfs file; referenced from
 * kernfs_elem_attr.ops.
 */
struct kernfs_ops {
        /*
         * Optional open/release methods.  Both are called with
         * @of->seq_file populated.
         */
        int (*open)(struct kernfs_open_file *of);
        void (*release)(struct kernfs_open_file *of);

        /*
         * Read is handled by either seq_file or raw_read().
         *
         * If seq_show() is present, seq_file path is active.  Other seq
         * operations are optional and if not implemented, the behavior is
         * equivalent to single_open().  @sf->private points to the
         * associated kernfs_open_file.
         *
         * read() is bounced through kernel buffer and a read larger than
         * PAGE_SIZE results in partial operation of PAGE_SIZE.
         */
        int (*seq_show)(struct seq_file *sf, void *v);

        void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
        void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
        void (*seq_stop)(struct seq_file *sf, void *v);

        ssize_t (*read)(struct kernfs_open_file *of, char *buf, size_t bytes,
                        loff_t off);

        /*
         * write() is bounced through kernel buffer.  If atomic_write_len
         * is not set, a write larger than PAGE_SIZE results in partial
         * operations of PAGE_SIZE chunks.  If atomic_write_len is set,
         * writes up to the specified size are executed atomically but
         * larger ones are rejected with -E2BIG.
         */
        size_t atomic_write_len;
        /*
         * "prealloc" causes a buffer to be allocated at open for
         * all read/write requests.  As ->seq_show uses seq_read()
         * which does its own allocation, it is incompatible with
         * ->prealloc.  Provide ->read and ->write with ->prealloc.
         */
        bool prealloc;
        ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes,
                         loff_t off);

        __poll_t (*poll)(struct kernfs_open_file *of,
                         struct poll_table_struct *pt);

        int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma);
        loff_t (*llseek)(struct kernfs_open_file *of, loff_t offset, int whence);
};

/*
 * The kernfs superblock creation/mount parameter context.
 * The first three fields describe the mount; new_sb_created is set/used
 * by kernfs_mount().
 */
struct kernfs_fs_context {
        struct kernfs_root        *root;                /* Root of the hierarchy being mounted */
        void                        *ns_tag;        /* Namespace tag of the mount (or NULL) */
        unsigned long                magic;                /* File system specific magic number */

        /* The following are set/used by kernfs_mount() */
        bool                        new_sb_created;        /* Set to T if we allocated a new sb */
};

#ifdef CONFIG_KERNFS

/* Return the type of @kn: the KERNFS_TYPE_MASK bits of kn->flags. */
static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
{
        return kn->flags & KERNFS_TYPE_MASK;
}

/*
 * Inode-number part of a 64bit kernfs id: the whole id when ino_t can hold
 * 64 bits, otherwise only its low 32 bits.
 */
static inline ino_t kernfs_id_ino(u64 id)
{
        if (sizeof(ino_t) < sizeof(u64))
                return (u32)id;

        return id;
}

/* Derive the generation number from a 64-bit kernfs node @id. */
static inline u32 kernfs_id_gen(u64 id)
{
        const bool ino_holds_id = sizeof(ino_t) >= sizeof(u64);

        /*
         * When ino_t can hold the whole id the generation is fixed at 1;
         * otherwise it is the high 32 bits of the id.
         */
        return ino_holds_id ? 1 : id >> 32;
}

static inline ino_t kernfs_ino(struct kernfs_node *kn)
{
        return kernfs_id_ino(kn->id);
}

static inline ino_t kernfs_gen(struct kernfs_node *kn)
{
        return kernfs_id_gen(kn->id);
}

/**
 * kernfs_enable_ns - turn on namespace filtering below a directory
 * @kn: directory node, expected to have no children yet
 *
 * Meant to be called right after @kn is created.  Once enabled, all
 * children of @kn must have non-NULL namespace tags and only the ones
 * which match the super_block's tag will be visible.
 *
 * A one-time warning is emitted if @kn is not a directory or already has
 * children; the KERNFS_NS flag is set in either case.
 */
static inline void kernfs_enable_ns(struct kernfs_node *kn)
{
        WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
        WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children));

        kn->flags = kn->flags | KERNFS_NS;
}

/**
 * kernfs_ns_enabled - test whether namespace is enabled
 * @kn: the node to test
 *
 * Returns true when the KERNFS_NS flag is set on @kn, i.e. namespace
 * filtering is enabled for the children of @kn.
 */
static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
{
        return (kn->flags & KERNFS_NS) != 0;
}

int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn,
                          char *buf, size_t buflen);
void pr_cont_kernfs_name(struct kernfs_node *kn);
void pr_cont_kernfs_path(struct kernfs_node *kn);
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn);
struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
                                           const char *name, const void *ns);
struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
                                           const char *path, const void *ns);
void kernfs_get(struct kernfs_node *kn);
void kernfs_put(struct kernfs_node *kn);

struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry);
struct kernfs_root *kernfs_root_from_sb(struct super_block *sb);
struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);

struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
                                  struct super_block *sb);
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
                                       unsigned int flags, void *priv);
void kernfs_destroy_root(struct kernfs_root *root);

struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
                                         const char *name, umode_t mode,
                                         kuid_t uid, kgid_t gid,
                                         void *priv, const void *ns);
struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
                                            const char *name);
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
                                         const char *name, umode_t mode,
                                         kuid_t uid, kgid_t gid,
                                         loff_t size,
                                         const struct kernfs_ops *ops,
                                         void *priv, const void *ns,
                                         struct lock_class_key *key);
struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
                                       const char *name,
                                       struct kernfs_node *target);
void kernfs_activate(struct kernfs_node *kn);
void kernfs_show(struct kernfs_node *kn, bool show);
void kernfs_remove(struct kernfs_node *kn);
void kernfs_break_active_protection(struct kernfs_node *kn);
void kernfs_unbreak_active_protection(struct kernfs_node *kn);
bool kernfs_remove_self(struct kernfs_node *kn);
int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
                             const void *ns);
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
                     const char *new_name, const void *new_ns);
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
__poll_t kernfs_generic_poll(struct kernfs_open_file *of,
                             struct poll_table_struct *pt);
void kernfs_notify(struct kernfs_node *kn);

int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
                     void *value, size_t size);
int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
                     const void *value, size_t size, int flags);

const void *kernfs_super_ns(struct super_block *sb);
int kernfs_get_tree(struct fs_context *fc);
void kernfs_free_fs_context(struct fs_context *fc);
void kernfs_kill_sb(struct super_block *sb);

void kernfs_init(void);

struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
                                                   u64 id);
#else        /* CONFIG_KERNFS */

/*
 * Stubs used when CONFIG_KERNFS is disabled.  Lookups and getters return
 * NULL, creators return ERR_PTR(-ENOSYS), int-returning operations return
 * -ENOSYS and void operations do nothing.
 *
 * NOTE(review): kernfs_show(), kernfs_break_active_protection(),
 * kernfs_unbreak_active_protection() and kernfs_find_and_get_node_by_id()
 * are declared in the CONFIG_KERNFS branch but have no stub here -
 * presumably their callers all depend on CONFIG_KERNFS; confirm.
 */
static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
{ return 0; }        /* whatever */

static inline void kernfs_enable_ns(struct kernfs_node *kn) { }

static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
{ return false; }

static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
{ return -ENOSYS; }

static inline int kernfs_path_from_node(struct kernfs_node *root_kn,
                                        struct kernfs_node *kn,
                                        char *buf, size_t buflen)
{ return -ENOSYS; }

static inline void pr_cont_kernfs_name(struct kernfs_node *kn) { }
static inline void pr_cont_kernfs_path(struct kernfs_node *kn) { }

static inline struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
{ return NULL; }

static inline struct kernfs_node *
kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name,
                       const void *ns)
{ return NULL; }
static inline struct kernfs_node *
kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path,
                       const void *ns)
{ return NULL; }

/* Reference counting is a no-op without kernfs. */
static inline void kernfs_get(struct kernfs_node *kn) { }
static inline void kernfs_put(struct kernfs_node *kn) { }

static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
{ return NULL; }

static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
{ return NULL; }

static inline struct inode *
kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
{ return NULL; }

/* Creation helpers report failure with an ERR_PTR, never NULL. */
static inline struct kernfs_root *
kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags,
                   void *priv)
{ return ERR_PTR(-ENOSYS); }

static inline void kernfs_destroy_root(struct kernfs_root *root) { }

static inline struct kernfs_node *
kernfs_create_dir_ns(struct kernfs_node *parent, const char *name,
                     umode_t mode, kuid_t uid, kgid_t gid,
                     void *priv, const void *ns)
{ return ERR_PTR(-ENOSYS); }

static inline struct kernfs_node *
__kernfs_create_file(struct kernfs_node *parent, const char *name,
                     umode_t mode, kuid_t uid, kgid_t gid,
                     loff_t size, const struct kernfs_ops *ops,
                     void *priv, const void *ns, struct lock_class_key *key)
{ return ERR_PTR(-ENOSYS); }

static inline struct kernfs_node *
kernfs_create_link(struct kernfs_node *parent, const char *name,
                   struct kernfs_node *target)
{ return ERR_PTR(-ENOSYS); }

static inline void kernfs_activate(struct kernfs_node *kn) { }

static inline void kernfs_remove(struct kernfs_node *kn) { }

static inline bool kernfs_remove_self(struct kernfs_node *kn)
{ return false; }

static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn,
                                           const char *name, const void *ns)
{ return -ENOSYS; }

static inline int kernfs_rename_ns(struct kernfs_node *kn,
                                   struct kernfs_node *new_parent,
                                   const char *new_name, const void *new_ns)
{ return -ENOSYS; }

static inline int kernfs_setattr(struct kernfs_node *kn,
                                 const struct iattr *iattr)
{ return -ENOSYS; }

/*
 * NOTE(review): this returns -ENOSYS through a __poll_t return type;
 * __poll_t looks like a poll event mask rather than an errno carrier -
 * confirm that callers cope with the resulting value.
 */
static inline __poll_t kernfs_generic_poll(struct kernfs_open_file *of,
                                           struct poll_table_struct *pt)
{ return -ENOSYS; }

static inline void kernfs_notify(struct kernfs_node *kn) { }

static inline int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
                                   void *value, size_t size)
{ return -ENOSYS; }

static inline int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
                                   const void *value, size_t size, int flags)
{ return -ENOSYS; }

static inline const void *kernfs_super_ns(struct super_block *sb)
{ return NULL; }

static inline int kernfs_get_tree(struct fs_context *fc)
{ return -ENOSYS; }

static inline void kernfs_free_fs_context(struct fs_context *fc) { }

static inline void kernfs_kill_sb(struct super_block *sb) { }

static inline void kernfs_init(void) { }

#endif        /* CONFIG_KERNFS */

/**
 * kernfs_path - build full path of a given node
 * @kn: kernfs_node of interest
 * @buf: buffer the path is written into
 * @buflen: size of @buf
 *
 * Convenience wrapper that forwards to kernfs_path_from_node() with @kn as
 * the first node argument and NULL as the second.
 *
 * If @kn is NULL result will be "(null)".
 *
 * Returns the length of the full path.  If the full length is equal to or
 * greater than @buflen, @buf contains the truncated path with the trailing
 * '\0'.  On error, -errno is returned.
 */
static inline int kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
{
        int path_len = kernfs_path_from_node(kn, NULL, buf, buflen);

        return path_len;
}

/* Look up child @name under @kn using a NULL namespace tag. */
static inline struct kernfs_node *
kernfs_find_and_get(struct kernfs_node *kn, const char *name)
{
        struct kernfs_node *found = kernfs_find_and_get_ns(kn, name, NULL);

        return found;
}

/* Walk @path starting from @kn using a NULL namespace tag. */
static inline struct kernfs_node *
kernfs_walk_and_get(struct kernfs_node *kn, const char *path)
{
        struct kernfs_node *found = kernfs_walk_and_get_ns(kn, path, NULL);

        return found;
}

/*
 * Create directory @name under @parent with the given @mode and @priv,
 * owned by GLOBAL_ROOT_UID/GLOBAL_ROOT_GID and with a NULL namespace tag.
 */
static inline struct kernfs_node *
kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode,
                  void *priv)
{
        const kuid_t owner = GLOBAL_ROOT_UID;
        const kgid_t group = GLOBAL_ROOT_GID;

        return kernfs_create_dir_ns(parent, name, mode, owner, group,
                                    priv, NULL);
}

/* Remove child @name of @parent using a NULL namespace tag. */
static inline int kernfs_remove_by_name(struct kernfs_node *parent,
                                        const char *name)
{
        int ret = kernfs_remove_by_name_ns(parent, name, NULL);

        return ret;
}

/* Move/rename @kn to @new_name under @new_parent with a NULL namespace tag. */
static inline int kernfs_rename(struct kernfs_node *kn,
                                struct kernfs_node *new_parent,
                                const char *new_name)
{
        int ret = kernfs_rename_ns(kn, new_parent, new_name, NULL);

        return ret;
}

#endif        /* __LINUX_KERNFS_H */















































    1 
    1 



































    1 














    1 





    1 

    1 











    1 






    1 
    1 




































































































































































































































    1 





    1 
    1 




    1 
    1 














    1 







    1 


























































































































































































































































































    1 



    1 







    1 



    1 














































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright 2007 Hewlett-Packard Development Company, L.P.
 *
 * This file is part of the SCTP kernel implementation
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *   Vlad Yasevich     <vladislav.yasevich@hp.com>
 */

#include <crypto/hash.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/scatterlist.h>
#include <net/sctp/sctp.h>
#include <net/sctp/auth.h>

/*
 * Table of HMAC descriptors.  sctp_auth_get_hmac() indexes this array
 * directly with the HMAC id, and sctp_auth_init_hmacs() skips every entry
 * that has no .hmac_name, so reserved ids are listed without a name.
 * The SHA-256 entry is only present when CONFIG_CRYPTO_SHA256 is enabled.
 */
static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = {
        {
                /* id 0 is reserved; the entry is left all zero */
                .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_0,
        },
        {
                .hmac_id = SCTP_AUTH_HMAC_ID_SHA1,
                .hmac_name = "hmac(sha1)",
                .hmac_len = SCTP_SHA1_SIG_SIZE,
        },
        {
                /* id 2 is reserved as well */
                .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2,
        },
#if IS_ENABLED(CONFIG_CRYPTO_SHA256)
        {
                .hmac_id = SCTP_AUTH_HMAC_ID_SHA256,
                .hmac_name = "hmac(sha256)",
                .hmac_len = SCTP_SHA256_SIG_SIZE,
        }
#endif
};


/* Drop one reference on @key (NULL is ignored); free it on the last put. */
void sctp_auth_key_put(struct sctp_auth_bytes *key)
{
        /* Nothing to do for a NULL key or while other references remain. */
        if (!key || !refcount_dec_and_test(&key->refcnt))
                return;

        kfree_sensitive(key);
        SCTP_DBG_OBJCNT_DEC(keys);
}

/* Allocate a key structure with room for @key_len bytes of key data.
 *
 * The returned key has its length set and holds a single reference; the
 * data bytes themselves are not initialised here.  Returns NULL if the
 * total size would exceed INT_MAX or the allocation fails.
 */
static struct sctp_auth_bytes *sctp_auth_create_key(__u32 key_len, gfp_t gfp)
{
        const size_t hdr_len = sizeof(struct sctp_auth_bytes);
        struct sctp_auth_bytes *key;

        /* Header plus payload must not overflow INT_MAX */
        if (key_len > (INT_MAX - hdr_len))
                return NULL;

        key = kmalloc(hdr_len + key_len, gfp);
        if (!key)
                return NULL;

        refcount_set(&key->refcnt, 1);
        key->len = key_len;
        SCTP_DBG_OBJCNT_INC(keys);

        return key;
}

/* Create a new shared key container with a given key id.
 *
 * The container is zero-allocated, starts with one reference and an empty
 * list linkage; no key bytes are attached.  Returns NULL on allocation
 * failure.
 */
struct sctp_shared_key *sctp_auth_shkey_create(__u16 key_id, gfp_t gfp)
{
        struct sctp_shared_key *shkey = kzalloc(sizeof(*shkey), gfp);

        if (!shkey)
                return NULL;

        shkey->key_id = key_id;
        refcount_set(&shkey->refcnt, 1);
        INIT_LIST_HEAD(&shkey->key_list);

        return shkey;
}

/* Free a shared key container.
 *
 * The container must already be unlinked from any list (BUG otherwise).
 * The reference it holds on the key bytes is dropped before the container
 * itself is freed.
 */
static void sctp_auth_shkey_destroy(struct sctp_shared_key *sh_key)
{
        struct sctp_auth_bytes *bytes;

        BUG_ON(!list_empty(&sh_key->key_list));

        bytes = sh_key->key;
        sctp_auth_key_put(bytes);
        sh_key->key = NULL;

        kfree(sh_key);
}

/* Drop one reference on @sh_key and destroy it when that was the last one. */
void sctp_auth_shkey_release(struct sctp_shared_key *sh_key)
{
        if (!refcount_dec_and_test(&sh_key->refcnt))
                return;

        sctp_auth_shkey_destroy(sh_key);
}

/* Take an additional reference on @sh_key; paired with
 * sctp_auth_shkey_release().
 */
void sctp_auth_shkey_hold(struct sctp_shared_key *sh_key)
{
        refcount_inc(&sh_key->refcnt);
}

/* Destroy the entire key list.  This is done during the
 * association and endpoint free process.
 *
 * Every entry is unlinked from @keys and then released, so entries that
 * are still referenced elsewhere survive until their last reference goes.
 */
void sctp_auth_destroy_keys(struct list_head *keys)
{
        struct sctp_shared_key *cur;
        struct sctp_shared_key *next;

        if (!list_empty(keys)) {
                key_for_each_safe(cur, next, keys) {
                        list_del_init(&cur->key_list);
                        sctp_auth_shkey_release(cur);
                }
        }
}

/* Compare two byte vectors as numbers.  Return values
 * are:
 *           0 - vectors are equal
 *         < 0 - vector 1 is smaller than vector2
 *         > 0 - vector 1 is greater than vector2
 *
 * Algorithm is:
 *         This is performed by selecting the numerically smaller key vector...
 *        If the key vectors are equal as numbers but differ in length ...
 *        the shorter vector is considered smaller
 *
 * Examples (with small values):
 *         000123456789 > 123456789 (first number is longer)
 *         000123456789 < 234567891 (second number is larger numerically)
 *         123456789 > 2345678          (first number is both larger & longer)
 */
static int sctp_auth_compare_vectors(struct sctp_auth_bytes *vector1,
                              struct sctp_auth_bytes *vector2)
{
        int diff;
        int i;
        const __u8 *longer;

        diff = vector1->len - vector2->len;
        if (diff) {
                longer = (diff > 0) ? vector1->data : vector2->data;

                /* Check to see if the longer number is
                 * lead-zero padded.  If it is not, it
                 * is automatically larger numerically.
                 */
                for (i = 0; i < abs(diff); i++) {
                        if (longer[i] != 0)
                                return diff;
                }
        }

        /* lengths are the same, compare numbers */
        return memcmp(vector1->data, vector2->data, vector1->len);
}

/*
 * Create a key vector as described in SCTP-AUTH, Section 6.1
 *    The RANDOM parameter, the CHUNKS parameter and the HMAC-ALGO
 *    parameter sent by each endpoint are concatenated as byte vectors.
 *    These parameters include the parameter type, parameter length, and
 *    the parameter value, but padding is omitted; all padding MUST be
 *    removed from this concatenation before proceeding with further
 *    computation of keys.  Parameters which were not sent are simply
 *    omitted from the concatenation process.  The resulting two vectors
 *    are called the two key vectors.
 *
 * The bytes are laid out as RANDOM, then CHUNKS (only if @chunks is
 * non-NULL), then HMAC-ALGO, each copied for the length stated in its
 * parameter header.  Returns NULL if the vector cannot be allocated.
 */
static struct sctp_auth_bytes *sctp_auth_make_key_vector(
                        struct sctp_random_param *random,
                        struct sctp_chunks_param *chunks,
                        struct sctp_hmac_algo_param *hmacs,
                        gfp_t gfp)
{
        __u16 random_len = ntohs(random->param_hdr.length);
        __u16 hmacs_len = ntohs(hmacs->param_hdr.length);
        __u16 chunks_len = 0;
        struct sctp_auth_bytes *vec;
        __u32 total;
        __u8 *dst;

        if (chunks)
                chunks_len = ntohs(chunks->param_hdr.length);

        total = random_len + hmacs_len + chunks_len;

        vec = sctp_auth_create_key(total, gfp);
        if (!vec)
                return NULL;

        /* Append each parameter at the running write position. */
        dst = vec->data;

        memcpy(dst, random, random_len);
        dst += random_len;

        if (chunks) {
                memcpy(dst, chunks, chunks_len);
                dst += chunks_len;
        }

        memcpy(dst, hmacs, hmacs_len);

        return vec;
}


/* Build the key vector from the parameters stored for our side of @asoc
 * (asoc->c.auth_random, auth_chunks and auth_hmacs).
 */
static struct sctp_auth_bytes *sctp_auth_make_local_vector(
                                    const struct sctp_association *asoc,
                                    gfp_t gfp)
{
        struct sctp_random_param *random;
        struct sctp_chunks_param *chunks;
        struct sctp_hmac_algo_param *hmacs;

        random = (struct sctp_random_param *)asoc->c.auth_random;
        chunks = (struct sctp_chunks_param *)asoc->c.auth_chunks;
        hmacs = (struct sctp_hmac_algo_param *)asoc->c.auth_hmacs;

        return sctp_auth_make_key_vector(random, chunks, hmacs, gfp);
}

/* Build the key vector from the parameters recorded for the peer of @asoc
 * (asoc->peer.peer_random, peer_chunks and peer_hmacs).
 */
static struct sctp_auth_bytes *sctp_auth_make_peer_vector(
                                    const struct sctp_association *asoc,
                                    gfp_t gfp)
{
        struct sctp_random_param *random = asoc->peer.peer_random;
        struct sctp_chunks_param *chunks = asoc->peer.peer_chunks;
        struct sctp_hmac_algo_param *hmacs = asoc->peer.peer_hmacs;

        return sctp_auth_make_key_vector(random, chunks, hmacs, gfp);
}


/* Build the association shared key from the parameters given.
 * The algorithm is:
 *    From the endpoint pair shared keys and the key vectors the
 *    association shared keys are computed.  This is performed by selecting
 *    the numerically smaller key vector and concatenating it to the
 *    endpoint pair shared key, and then concatenating the numerically
 *    larger key vector to that.  The result of the concatenation is the
 *    association shared key.
 *
 * The caller decides the ordering: the result is the endpoint key bytes
 * (if @ep_key has any), followed by @first_vector, followed by
 * @last_vector.  Returns NULL if the secret cannot be allocated.
 */
static struct sctp_auth_bytes *sctp_auth_asoc_set_secret(
                        struct sctp_shared_key *ep_key,
                        struct sctp_auth_bytes *first_vector,
                        struct sctp_auth_bytes *last_vector,
                        gfp_t gfp)
{
        struct sctp_auth_bytes *ep_bytes = ep_key->key;
        struct sctp_auth_bytes *secret;
        __u32 total;
        __u8 *dst;

        total = first_vector->len + last_vector->len;
        if (ep_bytes)
                total += ep_bytes->len;

        secret = sctp_auth_create_key(total, gfp);
        if (!secret)
                return NULL;

        dst = secret->data;

        /* The endpoint pair key may be absent (NULL key). */
        if (ep_bytes) {
                memcpy(dst, ep_bytes->data, ep_bytes->len);
                dst += ep_bytes->len;
        }

        memcpy(dst, first_vector->data, first_vector->len);
        dst += first_vector->len;

        memcpy(dst, last_vector->data, last_vector->len);

        return secret;
}

/* Create an association shared key.  Follow the algorithm
 * described in SCTP-AUTH, Section 6.1
 *
 * Returns the new secret, or NULL if either key vector or the secret
 * itself could not be allocated.  The temporary key vectors are always
 * released before returning.
 */
static struct sctp_auth_bytes *sctp_auth_asoc_create_secret(
                                 const struct sctp_association *asoc,
                                 struct sctp_shared_key *ep_key,
                                 gfp_t gfp)
{
        struct sctp_auth_bytes *local_vec;
        struct sctp_auth_bytes *peer_vec;
        struct sctp_auth_bytes *secret = NULL;

        /* Build the two key vectors (SCTP-AUTH, Section 6.1):
         *    The RANDOM parameter, the CHUNKS parameter and the HMAC-ALGO
         *    parameter sent by each endpoint are concatenated as byte vectors.
         *    These parameters include the parameter type, parameter length, and
         *    the parameter value, but padding is omitted; all padding MUST be
         *    removed from this concatenation before proceeding with further
         *    computation of keys.  Parameters which were not sent are simply
         *    omitted from the concatenation process.  The resulting two vectors
         *    are called the two key vectors.
         */
        local_vec = sctp_auth_make_local_vector(asoc, gfp);
        peer_vec = sctp_auth_make_peer_vector(asoc, gfp);

        if (local_vec && peer_vec) {
                /* Decide the concatenation order (SCTP-AUTH, Section 6.1):
                 *   This is performed by selecting the numerically smaller key
                 *   vector and concatenating it to the endpoint pair shared
                 *   key, and then concatenating the numerically larger key
                 *   vector to that.  If the key vectors are equal as numbers
                 *   but differ in length, then the concatenation order is the
                 *   endpoint shared key, followed by the shorter key vector,
                 *   followed by the longer key vector.  Otherwise, the key
                 *   vectors are identical, and may be concatenated to the
                 *   endpoint pair key in any order.
                 */
                if (sctp_auth_compare_vectors(local_vec, peer_vec) < 0)
                        secret = sctp_auth_asoc_set_secret(ep_key, local_vec,
                                                           peer_vec, gfp);
                else
                        secret = sctp_auth_asoc_set_secret(ep_key, peer_vec,
                                                           local_vec, gfp);
        }

        /* Putting a NULL vector is harmless, so no separate error path. */
        sctp_auth_key_put(local_vec);
        sctp_auth_key_put(peer_vec);

        return secret;
}

/*
 * Populate the association overlay list with the list
 * from the endpoint.
 *
 * For each endpoint shared key a new container with the same key id is
 * created; it shares the endpoint's key bytes (taking a reference) rather
 * than copying them.  The association list must be empty on entry.
 * Returns 0 on success; on allocation failure everything added so far is
 * destroyed and -ENOMEM is returned.
 */
int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep,
                                struct sctp_association *asoc,
                                gfp_t gfp)
{
        struct sctp_shared_key *src;
        struct sctp_shared_key *copy;

        BUG_ON(!list_empty(&asoc->endpoint_shared_keys));

        key_for_each(src, &ep->endpoint_shared_keys) {
                copy = sctp_auth_shkey_create(src->key_id, gfp);
                if (!copy) {
                        sctp_auth_destroy_keys(&asoc->endpoint_shared_keys);
                        return -ENOMEM;
                }

                copy->key = src->key;
                sctp_auth_key_hold(copy->key);
                list_add(&copy->key_list, &asoc->endpoint_shared_keys);
        }

        return 0;
}


/* Public interface to create the association shared key.
 * See code above for the algorithm.
 *
 * Returns 0 on success (including when the peer is not auth capable and
 * nothing is done) or -ENOMEM if the secret cannot be created.  On
 * success the previous association shared key is released and replaced.
 */
int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp)
{
        struct sctp_shared_key *active;
        struct sctp_auth_bytes *new_secret;
        struct sctp_chunk *chunk;

        /* If we don't support AUTH, or peer is not capable
         * we don't need to do anything.
         */
        if (!asoc->peer.auth_capable)
                return 0;

        /* If the key_id is non-zero and we couldn't find an
         * endpoint pair shared key, we can't compute the
         * secret.
         * For key_id 0, endpoint pair shared key is a NULL key.
         */
        active = sctp_auth_get_shkey(asoc, asoc->active_key_id);
        BUG_ON(!active);

        new_secret = sctp_auth_asoc_create_secret(asoc, active, gfp);
        if (!new_secret)
                return -ENOMEM;

        /* Swap in the new secret and remember which key produced it. */
        sctp_auth_key_put(asoc->asoc_shared_key);
        asoc->asoc_shared_key = new_secret;
        asoc->shkey = active;

        /* Update send queue in case any chunk already in there now
         * needs authenticating
         */
        list_for_each_entry(chunk, &asoc->outqueue.out_chunk_list, list) {
                if (!sctp_auth_send_cid(chunk->chunk_hdr->type, asoc))
                        continue;

                chunk->auth = 1;

                /* Chunks that already carry a key keep it. */
                if (chunk->shkey)
                        continue;

                chunk->shkey = asoc->shkey;
                sctp_auth_shkey_hold(chunk->shkey);
        }

        return 0;
}


/* Find the endpoint pair shared key based on the key_id.
 *
 * Only the first list entry whose id matches is considered: it is
 * returned unless it is marked deactivated.  Returns NULL when there is
 * no match or the match is deactivated.
 */
struct sctp_shared_key *sctp_auth_get_shkey(
                                const struct sctp_association *asoc,
                                __u16 key_id)
{
        struct sctp_shared_key *cur;

        /* Search the association's set of endpoint pair shared keys */
        key_for_each(cur, &asoc->endpoint_shared_keys) {
                if (cur->key_id != key_id)
                        continue;

                return cur->deactivated ? NULL : cur;
        }

        return NULL;
}

/*
 * Initialize all the possible digest transforms that we can use.  Right
 * now, the supported digests are SHA1 and SHA256.  We do this here once
 * because of the restriction that transforms may only be allocated in
 * user context.  This forces us to pre-allocate all possible transforms
 * at the endpoint init time.
 *
 * Returns 0 if the transforms already exist or were all allocated.  Any
 * failure (array or transform allocation) is reported as -ENOMEM, and in
 * the transform case the partially filled array is torn down again.
 */
int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp)
{
        struct crypto_shash *tfm = NULL;
        __u16   id;

        /* If the transforms are already allocated, we are done */
        if (ep->auth_hmacs)
                return 0;

        /* Allocate the array of pointers to transforms */
        ep->auth_hmacs = kcalloc(SCTP_AUTH_NUM_HMACS,
                                 sizeof(struct crypto_shash *),
                                 gfp);
        if (!ep->auth_hmacs)
                return -ENOMEM;

        for (id = 0; id < SCTP_AUTH_NUM_HMACS; id++) {
                const char *name = sctp_hmac_list[id].hmac_name;

                /* Skip ids we do not support (they have no name, so no
                 * TFM can be allocated for them) and slots that already
                 * hold a TFM.
                 */
                if (!name || ep->auth_hmacs[id])
                        continue;

                tfm = crypto_alloc_shash(name, 0, 0);
                if (IS_ERR(tfm)) {
                        /* Clean up any successful allocations */
                        sctp_auth_destroy_hmacs(ep->auth_hmacs);
                        ep->auth_hmacs = NULL;
                        return -ENOMEM;
                }

                ep->auth_hmacs[id] = tfm;
        }

        return 0;
}

/* Tear down an HMAC transform table: free each of the SCTP_AUTH_NUM_HMACS
 * slots with crypto_free_shash() and then the array itself.  A NULL table
 * is accepted and ignored.
 */
void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[])
{
        int slot = 0;

        if (!auth_hmacs)
                return;

        while (slot < SCTP_AUTH_NUM_HMACS)
                crypto_free_shash(auth_hmacs[slot++]);

        kfree(auth_hmacs);
}


/* Return the sctp_hmac_list entry for @hmac_id.  The id is used directly
 * as an index; no range check is performed here.
 */
struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id)
{
        struct sctp_hmac *entry = sctp_hmac_list + hmac_id;

        return entry;
}

/* Pick the hmac description to use when building an AUTH chunk.
 *
 * A non-zero cached default_hmac_id wins.  Otherwise the peer's HMAC-ALGO
 * parameter is scanned in order and the first id that is both within the
 * supported range and backed by a named sctp_hmac_list entry is selected;
 * that choice is not cached.  Returns NULL when the peer supplied no
 * parameter, when nothing in it is usable, or when the selected id is 0.
 */
struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc)
{
        struct sctp_hmac_algo_param *peer_hmacs;
        __u16 count;
        __u16 hmac_id;
        int idx;

        if (asoc->default_hmac_id)
                return &sctp_hmac_list[asoc->default_hmac_id];

        peer_hmacs = asoc->peer.peer_hmacs;
        if (!peer_hmacs)
                return NULL;

        /* Payload length in bytes, halved: each id occupies two bytes. */
        count = (ntohs(peer_hmacs->param_hdr.length) -
                 sizeof(struct sctp_paramhdr)) >> 1;

        for (idx = 0; idx < count; idx++) {
                hmac_id = ntohs(peer_hmacs->hmac_ids[idx]);

                /* Supported ids have their name (and length) set; the
                 * name alone is enough to tell, since without it no
                 * transform could have been allocated.
                 */
                if (hmac_id <= SCTP_AUTH_HMAC_ID_MAX &&
                    sctp_hmac_list[hmac_id].hmac_name)
                        return hmac_id ? &sctp_hmac_list[hmac_id] : NULL;
        }

        return NULL;
}

/* Linear search of the first @n_elts entries of @hmacs for @hmac_id.
 * Values are compared as given, with no byte-order conversion.
 * Returns 1 on a match and 0 otherwise.
 */
static int __sctp_auth_find_hmacid(__be16 *hmacs, int n_elts, __be16 hmac_id)
{
        int pos;

        for (pos = 0; pos < n_elts; pos++) {
                if (hmacs[pos] == hmac_id)
                        return 1;
        }

        return 0;
}

/* Check whether @hmac_id appears in the HMAC-ALGO parameter stored in the
 * association (asoc->c.auth_hmacs), i.e. the list we claim as supported.
 * Returns 1 when it is listed, 0 when it is not or when @asoc is NULL.
 */
int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc,
                                    __be16 hmac_id)
{
        struct sctp_hmac_algo_param *local_hmacs;
        __u16 count;

        if (!asoc)
                return 0;

        local_hmacs = (struct sctp_hmac_algo_param *)asoc->c.auth_hmacs;

        /* Parameter length minus its header, halved: two bytes per id. */
        count = ntohs(local_hmacs->param_hdr.length);
        count = (count - sizeof(struct sctp_paramhdr)) >> 1;

        return __sctp_auth_find_hmacid(local_hmacs->hmac_ids, count, hmac_id);
}


/* Cache the default HMAC id.  This follows the text from SCTP-AUTH,
 * Section 6.1:
 *   The receiver of a HMAC-ALGO parameter SHOULD use the first listed
 *   algorithm it supports.
 *
 * The first id in @hmacs that is within range and has a transform in the
 * endpoint's auth_hmacs table is stored in asoc->default_hmac_id.  An id
 * that is already set (non-zero) is left alone.
 */
void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc,
                                     struct sctp_hmac_algo_param *hmacs)
{
        struct sctp_endpoint *ep = asoc->ep;
        int count;
        int idx;
        __u16 hmac_id;

        if (asoc->default_hmac_id)
                return;

        /* Two bytes per id after the parameter header. */
        count = (ntohs(hmacs->param_hdr.length) -
                 sizeof(struct sctp_paramhdr)) >> 1;

        for (idx = 0; idx < count; idx++) {
                hmac_id = ntohs(hmacs->hmac_ids[idx]);

                /* Usable means: in the supported range and with an
                 * allocated transform on the endpoint.
                 */
                if (hmac_id <= SCTP_AUTH_HMAC_ID_MAX &&
                    ep->auth_hmacs[hmac_id]) {
                        asoc->default_hmac_id = hmac_id;
                        return;
                }
        }
}


/* Tell whether chunk type @chunk is listed in the CHUNKS parameter @param.
 * Returns 1 when it is, 0 when it is not, when @param is NULL, or when the
 * parameter's length field is zero.
 */
static int __sctp_auth_cid(enum sctp_cid chunk, struct sctp_chunks_param *param)
{
        unsigned short n_chunks;
        int idx;

        if (!param || param->param_hdr.length == 0)
                return 0;

        /* One byte per listed chunk type after the parameter header. */
        n_chunks = ntohs(param->param_hdr.length) -
                   sizeof(struct sctp_paramhdr);

        for (idx = 0; idx < n_chunks; idx++) {
                /* SCTP-AUTH, Section 3.2
                 *    The chunk types for INIT, INIT-ACK, SHUTDOWN-COMPLETE
                 *    and AUTH chunks MUST NOT be listed in the CHUNKS
                 *    parameter.  However, if a CHUNKS parameter is received
                 *    then the types for INIT, INIT-ACK, SHUTDOWN-COMPLETE
                 *    and AUTH chunks MUST be ignored.
                 */
                switch (param->chunks[idx]) {
                case SCTP_CID_INIT:
                case SCTP_CID_INIT_ACK:
                case SCTP_CID_SHUTDOWN_COMPLETE:
                case SCTP_CID_AUTH:
                        continue;

                default:
                        break;
                }

                if (param->chunks[idx] == chunk)
                        return 1;
        }

        return 0;
}

/* Check whether the peer asked for @chunk to be authenticated, i.e. it is
 * listed in asoc->peer.peer_chunks.  Returns 0 without looking when @asoc
 * is NULL or the peer is not auth capable.
 */
int sctp_auth_send_cid(enum sctp_cid chunk, const struct sctp_association *asoc)
{
        if (!asoc || !asoc->peer.auth_capable)
                return 0;

        return __sctp_auth_cid(chunk, asoc->peer.peer_chunks);
}

/* Check whether we asked the peer to authenticate @chunk, i.e. it is
 * listed in the CHUNKS parameter kept in asoc->c.auth_chunks.  Returns 0
 * without looking when @asoc is NULL or the peer is not auth capable.
 */
int sctp_auth_recv_cid(enum sctp_cid chunk, const struct sctp_association *asoc)
{
        struct sctp_chunks_param *wanted;

        if (!asoc || !asoc->peer.auth_capable)
                return 0;

        wanted = (struct sctp_chunks_param *)asoc->c.auth_chunks;

        return __sctp_auth_cid(chunk, wanted);
}

/* SCTP-AUTH: Section 6.2:
 *    The sender MUST calculate the MAC as described in RFC2104 [2] using
 *    the hash function H as described by the MAC Identifier and the shared
 *    association key K based on the endpoint pair shared key described by
 *    the shared key identifier.  The 'data' used for the computation of
 *    the AUTH-chunk is given by the AUTH chunk with its HMAC field set to
 *    zero (as shown in Figure 6) followed by all chunks that are placed
 *    after the AUTH chunk in the SCTP packet.
 *
 * The digest covers the bytes from @auth up to the tail of @skb and is
 * written immediately after the AUTH header.  The function returns
 * silently, leaving the digest area untouched, if a temporary association
 * secret cannot be created or the transform rejects the key.
 * NOTE(review): zeroing of the HMAC field is not done here; presumably the
 * caller prepares it - confirm.
 */
void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
                              struct sk_buff *skb, struct sctp_auth_chunk *auth,
                              struct sctp_shared_key *ep_key, gfp_t gfp)
{
        struct sctp_auth_bytes *secret;
        struct crypto_shash *shash;
        __u16 shkey_id, hmac_id;
        unsigned char *tail;
        int tmp_secret;
        __u8 *mac;

        /* The AUTH header names both the shared key and the HMAC. */
        shkey_id = ntohs(auth->auth_hdr.shkey_id);
        hmac_id = ntohs(auth->auth_hdr.hmac_id);

        /* The cached association secret only matches the active key id;
         * for any other id a secret is derived from ep_key just for this
         * computation (ep_key can't be NULL in that case).
         */
        tmp_secret = (shkey_id != asoc->active_key_id);
        if (tmp_secret) {
                secret = sctp_auth_asoc_create_secret(asoc, ep_key, gfp);
                if (!secret)
                        return;
        } else {
                secret = asoc->asoc_shared_key;
        }

        tail = skb_tail_pointer(skb);
        shash = asoc->ep->auth_hmacs[hmac_id];
        mac = (u8 *)(&auth->auth_hdr + 1);

        if (!crypto_shash_setkey(shash, &secret->data[0], secret->len))
                crypto_shash_tfm_digest(shash, (u8 *)auth,
                                        tail - (unsigned char *)auth, mac);

        /* Drop the reference on a secret created above. */
        if (tmp_secret)
                sctp_auth_key_put(secret);
}

/* API Helpers */

/* Append @chunk_id to the endpoint's list of chunks to authenticate.
 *
 * Returns 0 when the id was appended or was already reported as listed by
 * __sctp_auth_cid(), and -EINVAL when the list already holds
 * SCTP_NUM_CHUNK_TYPES entries.
 */
int sctp_auth_ep_add_chunkid(struct sctp_endpoint *ep, __u8 chunk_id)
{
        struct sctp_chunks_param *list = ep->auth_chunk_list;
        __u16 cur_len;
        __u16 count;

        if (__sctp_auth_cid(chunk_id, list))
                return 0;

        /* Entries currently stored: one byte each after the header. */
        cur_len = ntohs(list->param_hdr.length);
        count = cur_len - sizeof(struct sctp_paramhdr);

        if (count != SCTP_NUM_CHUNK_TYPES) {
                list->chunks[count] = chunk_id;
                list->param_hdr.length = htons(cur_len + 1);
                return 0;
        }

        return -EINVAL;
}

/* Replace the endpoint's list of supported hmac identifiers with the ones
 * given in @hmacs.
 *
 * Returns -EOPNOTSUPP if any identifier is out of range or has no named
 * sctp_hmac_list entry, -EINVAL if SHA1 is not among them, and 0 after the
 * endpoint's HMAC-ALGO parameter has been rewritten.  Nothing is modified
 * on error.
 */
int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep,
                           struct sctp_hmacalgo *hmacs)
{
        int sha1_seen = 0;
        __u16 hmac_id;
        int idx;

        /* Validation pass: reject unsupported ids and note whether SHA1
         * was requested.
         */
        for (idx = 0; idx < hmacs->shmac_num_idents; idx++) {
                hmac_id = hmacs->shmac_idents[idx];

                if (hmac_id > SCTP_AUTH_HMAC_ID_MAX ||
                    !sctp_hmac_list[hmac_id].hmac_name)
                        return -EOPNOTSUPP;

                if (hmac_id == SCTP_AUTH_HMAC_ID_SHA1)
                        sha1_seen = 1;
        }

        if (!sha1_seen)
                return -EINVAL;

        /* Commit pass: store the ids via htons() and update the length. */
        for (idx = 0; idx < hmacs->shmac_num_idents; idx++) {
                ep->auth_hmacs_list->hmac_ids[idx] =
                                htons(hmacs->shmac_idents[idx]);
        }

        ep->auth_hmacs_list->param_hdr.length =
                        htons(sizeof(struct sctp_paramhdr) +
                        hmacs->shmac_num_idents * sizeof(__u16));

        return 0;
}

/* Set a new shared key on either the endpoint or the association.  If a
 * key with the same ID already exists, replace it (remove the old key and
 * add the new one).
 *
 * The association's key list is used when @asoc is non-NULL, otherwise the
 * endpoint's.
 *
 * Returns 0 on success, -EACCES when the selected target does not have
 * authentication available (peer not auth capable / endpoint auth not
 * enabled), and -ENOMEM when an allocation fails or the association's
 * active key cannot be re-initialised after a replace.  In the latter case
 * the old key is put back on the list.
 */
int sctp_auth_set_key(struct sctp_endpoint *ep,
                      struct sctp_association *asoc,
                      struct sctp_authkey *auth_key)
{
        struct sctp_shared_key *cur_key, *shkey;
        struct sctp_auth_bytes *key;
        struct list_head *sh_keys;
        int replace = 0;

        /* Select the key list to operate on, then try to find the given
         * key id to see if we are doing a replace or adding a new key.
         */
        if (asoc) {
                if (!asoc->peer.auth_capable)
                        return -EACCES;
                sh_keys = &asoc->endpoint_shared_keys;
        } else {
                if (!ep->auth_enable)
                        return -EACCES;
                sh_keys = &ep->endpoint_shared_keys;
        }

        /* On a match, shkey is left pointing at the existing entry. */
        key_for_each(shkey, sh_keys) {
                if (shkey->key_id == auth_key->sca_keynumber) {
                        replace = 1;
                        break;
                }
        }

        cur_key = sctp_auth_shkey_create(auth_key->sca_keynumber, GFP_KERNEL);
        if (!cur_key)
                return -ENOMEM;

        /* Create a new key data based on the info passed in */
        key = sctp_auth_create_key(auth_key->sca_keylength, GFP_KERNEL);
        if (!key) {
                /* cur_key has no key data attached yet, so it is freed
                 * directly.
                 */
                kfree(cur_key);
                return -ENOMEM;
        }

        memcpy(key->data, &auth_key->sca_key[0], auth_key->sca_keylength);
        cur_key->key = key;

        /* New key id: just link it in. */
        if (!replace) {
                list_add(&cur_key->key_list, sh_keys);
                return 0;
        }

        /* Replace: unlink the old entry and link the new one in. */
        list_del_init(&shkey->key_list);
        list_add(&cur_key->key_list, sh_keys);

        /* If the replaced key is the association's active key, the active
         * key must be re-initialised.  On failure, roll back: unlink and
         * release the new entry and relink the old one.
         */
        if (asoc && asoc->active_key_id == auth_key->sca_keynumber &&
            sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL)) {
                list_del_init(&cur_key->key_list);
                sctp_auth_shkey_release(cur_key);
                list_add(&shkey->key_list, sh_keys);
                return -ENOMEM;
        }

        /* The old entry is off the list now; release it.
         * NOTE(review): presumably this drops the list's reference rather
         * than freeing unconditionally - confirm against
         * sctp_auth_shkey_release().
         */
        sctp_auth_shkey_release(shkey);
        return 0;
}

/* Make @key_id the active key of the association (when @asoc is given) or
 * of the endpoint.
 *
 * Returns -EACCES when the selected target does not have authentication
 * available, -EINVAL when no key with that id is on the target's list or
 * the key is deactivated, -ENOMEM when the association's active key cannot
 * be initialised (the previous active id is then restored), and 0 on
 * success.
 */
int sctp_auth_set_active_key(struct sctp_endpoint *ep,
                             struct sctp_association *asoc,
                             __u16  key_id)
{
        struct sctp_shared_key *shkey, *match = NULL;
        struct list_head *keys;
        __u16 prev_id;

        if (asoc) {
                if (!asoc->peer.auth_capable)
                        return -EACCES;
                keys = &asoc->endpoint_shared_keys;
        } else {
                if (!ep->auth_enable)
                        return -EACCES;
                keys = &ep->endpoint_shared_keys;
        }

        /* The key identifier MUST correspond to an existing key */
        key_for_each(shkey, keys) {
                if (shkey->key_id == key_id) {
                        match = shkey;
                        break;
                }
        }

        if (!match || match->deactivated)
                return -EINVAL;

        /* On the endpoint only the id is recorded. */
        if (!asoc) {
                ep->active_key_id = key_id;
                return 0;
        }

        /* On an association the active key is rebuilt for the new id;
         * the old id is restored if that fails.
         */
        prev_id = asoc->active_key_id;
        asoc->active_key_id = key_id;
        if (sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL)) {
                asoc->active_key_id = prev_id;
                return -ENOMEM;
        }

        return 0;
}

/* Remove the shared key @key_id from the association's list (when @asoc is
 * given) or from the endpoint's list.
 *
 * Returns -EACCES when the selected target does not have authentication
 * available, -EINVAL when @key_id is the target's active key or is not on
 * its list, and 0 once the key has been unlinked and released.
 */
int sctp_auth_del_key_id(struct sctp_endpoint *ep,
                         struct sctp_association *asoc,
                         __u16  key_id)
{
        struct sctp_shared_key *shkey, *match = NULL;
        struct list_head *keys;

        /* The key identifier MUST NOT be the current active key */
        if (asoc) {
                if (!asoc->peer.auth_capable)
                        return -EACCES;
                if (asoc->active_key_id == key_id)
                        return -EINVAL;

                keys = &asoc->endpoint_shared_keys;
        } else {
                if (!ep->auth_enable)
                        return -EACCES;
                if (ep->active_key_id == key_id)
                        return -EINVAL;

                keys = &ep->endpoint_shared_keys;
        }

        /* The key identifier MUST correspond to an existing key */
        key_for_each(shkey, keys) {
                if (shkey->key_id == key_id) {
                        match = shkey;
                        break;
                }
        }

        if (!match)
                return -EINVAL;

        /* Delete the shared key */
        list_del_init(&match->key_list);
        sctp_auth_shkey_release(match);

        return 0;
}

/* Mark the shared key @key_id as deactivated on the association (when
 * @asoc is given) or on the endpoint.
 *
 * Returns -EACCES when the selected target does not have authentication
 * available, -EINVAL when @key_id is the target's active key or is not on
 * its list, and 0 once the key has been flagged.
 */
int sctp_auth_deact_key_id(struct sctp_endpoint *ep,
                           struct sctp_association *asoc, __u16  key_id)
{
        struct sctp_shared_key *shkey, *match = NULL;
        struct list_head *keys;

        /* The key identifier MUST NOT be the current active key */
        if (asoc) {
                if (!asoc->peer.auth_capable)
                        return -EACCES;
                if (asoc->active_key_id == key_id)
                        return -EINVAL;

                keys = &asoc->endpoint_shared_keys;
        } else {
                if (!ep->auth_enable)
                        return -EACCES;
                if (ep->active_key_id == key_id)
                        return -EINVAL;

                keys = &ep->endpoint_shared_keys;
        }

        /* The key identifier MUST correspond to an existing key */
        key_for_each(shkey, keys) {
                if (shkey->key_id == key_id) {
                        match = shkey;
                        break;
                }
        }

        if (!match)
                return -EINVAL;

        /* refcnt == 1 and !list_empty mean the key is not being used
         * anywhere, and deactivated is about to be set, so it's time to
         * notify userland that this shkey can be freed.  A failure to
         * build the event is ignored.
         */
        if (asoc && !list_empty(&match->key_list) &&
            refcount_read(&match->refcnt) == 1) {
                struct sctp_ulpevent *ev;

                ev = sctp_ulpevent_make_authkey(asoc, match->key_id,
                                                SCTP_AUTH_FREE_KEY, GFP_KERNEL);
                if (ev)
                        asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
        }

        match->deactivated = 1;

        return 0;
}

/* Set up the endpoint's authentication state: the HMAC-ALGO parameter
 * (ep->auth_hmacs_list), the CHUNKS parameter (ep->auth_chunk_list) and
 * the HMAC transform table.  Pieces that already exist are kept.
 *
 * Returns 0 on success.  On failure both parameter buffers are freed and
 * their pointers cleared, and the error is returned: -ENOMEM for a failed
 * buffer allocation, otherwise whatever sctp_auth_init_hmacs() reported.
 */
int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp)
{
        struct sctp_hmac_algo_param *auth_hmacs;
        struct sctp_chunks_param *auth_chunks;
        int err = -ENOMEM;

        /* These are arrays that we encode directly into parameters to
         * make the rest of the operations easier.
         */
        if (!ep->auth_hmacs_list) {
                auth_hmacs = kzalloc(struct_size(auth_hmacs, hmac_ids,
                                                 SCTP_AUTH_NUM_HMACS), gfp);
                if (!auth_hmacs)
                        goto err_free;

                /* SCTP-AUTH: Section 3.3
                 *    Every endpoint supporting SCTP chunk authentication MUST
                 *    support the HMAC based on the SHA-1 algorithm.
                 *
                 * So the parameter starts out listing exactly one id.
                 */
                auth_hmacs->param_hdr.type = SCTP_PARAM_HMAC_ALGO;
                auth_hmacs->param_hdr.length =
                                htons(sizeof(struct sctp_paramhdr) + 2);
                auth_hmacs->hmac_ids[0] = htons(SCTP_AUTH_HMAC_ID_SHA1);
                ep->auth_hmacs_list = auth_hmacs;
        }

        if (!ep->auth_chunk_list) {
                auth_chunks = kzalloc(sizeof(*auth_chunks) +
                                      SCTP_NUM_CHUNK_TYPES, gfp);
                if (!auth_chunks)
                        goto err_free;

                /* The CHUNKS parameter starts out empty: header only. */
                auth_chunks->param_hdr.type = SCTP_PARAM_CHUNKS;
                auth_chunks->param_hdr.length =
                                htons(sizeof(struct sctp_paramhdr));
                ep->auth_chunk_list = auth_chunks;
        }

        /* Allocate and initialize the transform array for the supported
         * HMACs.
         */
        err = sctp_auth_init_hmacs(ep, gfp);
        if (!err)
                return 0;

err_free:
        /* Free all allocations */
        kfree(ep->auth_hmacs_list);
        ep->auth_hmacs_list = NULL;
        kfree(ep->auth_chunk_list);
        ep->auth_chunk_list = NULL;
        return err;
}

/* Release the endpoint's authentication state: both parameter buffers and
 * the HMAC transform table.  Each pointer is cleared after its object has
 * been freed.
 */
void sctp_auth_free(struct sctp_endpoint *ep)
{
        kfree(ep->auth_hmacs_list);
        ep->auth_hmacs_list = NULL;

        kfree(ep->auth_chunk_list);
        ep->auth_chunk_list = NULL;

        sctp_auth_destroy_hmacs(ep->auth_hmacs);
        ep->auth_hmacs = NULL;
}





























































































































































































































































































































































































































































































































































































































































































































































































































    1 






    3 
    1 





































    1 



    1 






































    1 























    1 


























































































































































































































    4 

















    1 














    1 








    1 












    1 
    1 





    1 








    1 
    1 
































































































































    1 












































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_XFRM_H
#define _NET_XFRM_H

#include <linux/compiler.h>
#include <linux/xfrm.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/pfkeyv2.h>
#include <linux/ipsec.h>
#include <linux/in6.h>
#include <linux/mutex.h>
#include <linux/audit.h>
#include <linux/slab.h>
#include <linux/refcount.h>
#include <linux/sockptr.h>

#include <net/sock.h>
#include <net/dst.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/flow.h>
#include <net/gro_cells.h>

#include <linux/interrupt.h>

#ifdef CONFIG_XFRM_STATISTICS
#include <net/snmp.h>
#endif

/*
 * Protocol identifiers used by xfrm. ROUTING and DSTOPTS simply alias the
 * corresponding IPPROTO_* constants.
 */
#define XFRM_PROTO_ESP                50
#define XFRM_PROTO_AH                51
#define XFRM_PROTO_COMP                108
#define XFRM_PROTO_IPIP                4
#define XFRM_PROTO_IPV6                41
#define XFRM_PROTO_ROUTING        IPPROTO_ROUTING
#define XFRM_PROTO_DSTOPTS        IPPROTO_DSTOPTS

/* Round len up to the next multiple of 4 (XFRM_ALIGN4) or 8 (XFRM_ALIGN8). */
#define XFRM_ALIGN4(len)        (((len) + 3) & ~3)
#define XFRM_ALIGN8(len)        (((len) + 7) & ~7)
/*
 * Emit MODULE_ALIAS() strings of the form "xfrm-mode-<family>-<encap>",
 * "xfrm-type-<family>-<proto>" and "xfrm-offload-<family>-<proto>".
 * The arguments are stringified with __stringify().
 */
#define MODULE_ALIAS_XFRM_MODE(family, encap) \
        MODULE_ALIAS("xfrm-mode-" __stringify(family) "-" __stringify(encap))
#define MODULE_ALIAS_XFRM_TYPE(family, proto) \
        MODULE_ALIAS("xfrm-type-" __stringify(family) "-" __stringify(proto))
#define MODULE_ALIAS_XFRM_OFFLOAD_TYPE(family, proto) \
        MODULE_ALIAS("xfrm-offload-" __stringify(family) "-" __stringify(proto))

/*
 * Statistics helpers. With CONFIG_XFRM_STATISTICS they forward to the
 * SNMP_*_STATS macros on (net)->mib.xfrm_statistics. Without it they
 * expand to a void expression that only references net; "field" and
 * "val" are then not evaluated at all.
 */
#ifdef CONFIG_XFRM_STATISTICS
#define XFRM_INC_STATS(net, field)        SNMP_INC_STATS((net)->mib.xfrm_statistics, field)
#define XFRM_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.xfrm_statistics, field, val)
#else
#define XFRM_INC_STATS(net, field)        ((void)(net))
#define XFRM_ADD_STATS(net, field, val) ((void)(net))
#endif


/* Organization of SPD aka "XFRM rules"
   ------------------------------------

   Basic objects:
   - policy rule, struct xfrm_policy (=SPD entry)
   - bundle of transformations, struct dst_entry == struct xfrm_dst (=SA bundle)
   - instance of a transformer, struct xfrm_state (=SA)
   - template to clone xfrm_state, struct xfrm_tmpl

   SPD is plain linear list of xfrm_policy rules, ordered by priority.
   (To be compatible with existing pfkeyv2 implementations,
   many rules with priority of 0x7fffffff are allowed to exist and
   such rules are ordered in an unpredictable way, thanks to bsd folks.)

   Lookup is plain linear search until the first match with selector.

   If "action" is "block", then we prohibit the flow, otherwise:
   if "xfrms_nr" is zero, the flow passes untransformed. Otherwise,
   policy entry has list of up to XFRM_MAX_DEPTH transformations,
   described by templates xfrm_tmpl. Each template is resolved
   to a complete xfrm_state (see below) and we pack bundle of transformations
   to a dst_entry returned to requestor.

   dst -. xfrm  .-> xfrm_state #1
    |---. child .-> dst -. xfrm .-> xfrm_state #2
                     |---. child .-> dst -. xfrm .-> xfrm_state #3
                                      |---. child .-> NULL

   Bundles are cached at xfrm_policy struct (field ->bundles).


   Resolution of xfrm_tmpl
   -----------------------
   Template contains:
   1. ->mode                Mode: transport or tunnel
   2. ->id.proto        Protocol: AH/ESP/IPCOMP
   3. ->id.daddr        Remote tunnel endpoint, ignored for transport mode.
      Q: allow to resolve security gateway?
   4. ->id.spi          If not zero, static SPI.
   5. ->saddr                Local tunnel endpoint, ignored for transport mode.
   6. ->algos                List of allowed algos. Plain bitmask now.
      Q: ealgos, aalgos, calgos. What a mess...
   7. ->share                Sharing mode.
      Q: how to implement private sharing mode? To add struct sock* to
      flow id?

   Having this template we search through SAD searching for entries
   with appropriate mode/proto/algo, permitted by selector.
   If no appropriate entry found, it is requested from key manager.

   PROBLEMS:
   Q: How to find all the bundles referring to a physical path for
      PMTU discovery? Seems, dst should contain list of all parents...
      and enter to infinite locking hierarchy disaster.
      No! It is easier, we will not search for them, let them find us.
      We add genid to each dst plus pointer to genid of raw IP route,
      pmtu disc will update pmtu on raw IP route and increase its genid.
      dst_check() will see this for top level and trigger resyncing
      metrics. Plus, it will be made via sk->sk_dst_cache. Solved.
 */

/*
 * Embedded in struct xfrm_state as its "km" member ("key manager bits").
 * NOTE(review): "state" presumably holds one of the XFRM_STATE_* values
 * and "all" links the entry into a global list — confirm against users.
 */
struct xfrm_state_walk {
        struct list_head        all;
        u8                        state;
        u8                        dying;
        u8                        proto;
        u32                        seq;
        struct xfrm_address_filter *filter;
};

/*
 * Offload direction values, starting at 1.
 * NOTE(review): presumably stored in xfrm_dev_offload.dir — confirm.
 */
enum {
        XFRM_DEV_OFFLOAD_IN = 1,
        XFRM_DEV_OFFLOAD_OUT,
        XFRM_DEV_OFFLOAD_FWD,
};

/*
 * Offload kind; UNSPECIFIED is 0.
 * NOTE(review): presumably stored in xfrm_dev_offload.type — confirm.
 */
enum {
        XFRM_DEV_OFFLOAD_UNSPECIFIED,
        XFRM_DEV_OFFLOAD_CRYPTO,
        XFRM_DEV_OFFLOAD_PACKET,
};

/* NOTE(review): presumably a bit for xfrm_dev_offload.flags — confirm. */
enum {
        XFRM_DEV_OFFLOAD_FLAG_ACQ = 1,
};

/*
 * Device offload context. Embedded in struct xfrm_state (member "xso")
 * and in struct xfrm_policy (member "xdo").
 * dir, type and flags are 2-bit bitfields, so each can hold values 0..3.
 */
struct xfrm_dev_offload {
        struct net_device        *dev;
        netdevice_tracker        dev_tracker;
        struct net_device        *real_dev;
        unsigned long                offload_handle;
        u8                        dir : 2;
        u8                        type : 2;
        u8                        flags : 2;
};

/*
 * Mode descriptor. struct xfrm_state carries three of these:
 * inner_mode, inner_mode_iaf and outer_mode.
 */
struct xfrm_mode {
        u8 encap;
        u8 family;
        u8 flags;        /* XFRM_MODE_FLAG_* bits, see below */
};

/* Flags for xfrm_mode. */
enum {
        XFRM_MODE_FLAG_TUNNEL = 1,
};

/* Replay detection mode; stored in xfrm_state.repl_mode. */
enum xfrm_replay_mode {
        XFRM_REPLAY_MODE_LEGACY,
        XFRM_REPLAY_MODE_BMP,
        XFRM_REPLAY_MODE_ESN,
};

/*
 * Full description of state of transformer (an SA, per the overview
 * comment at the top of this file).
 *
 * Reference counted through "refcnt"; see xfrm_state_hold() and
 * xfrm_state_put() in this header.
 */
struct xfrm_state {
        /* Owning network namespace; read through xs_net(). */
        possible_net_t                xs_net;
        /* gclist and bydst share storage (anonymous union). */
        union {
                struct hlist_node        gclist;
                struct hlist_node        bydst;
        };
        struct hlist_node        bysrc;
        struct hlist_node        byspi;
        struct hlist_node        byseq;

        refcount_t                refcnt;
        spinlock_t                lock;

        struct xfrm_id                id;
        struct xfrm_selector        sel;
        struct xfrm_mark        mark;
        u32                        if_id;
        u32                        tfcpad;

        u32                        genid;

        /* Key manager bits */
        struct xfrm_state_walk        km;

        /* Parameters of this state. */
        struct {
                u32                reqid;
                u8                mode;
                u8                replay_window;
                u8                aalgo, ealgo, calgo;
                u8                flags;
                u16                family;
                xfrm_address_t        saddr;
                int                header_len;
                int                trailer_len;
                u32                extra_flags;
                struct xfrm_mark        smark;
        } props;

        struct xfrm_lifetime_cfg lft;

        /* Data for transformer */
        struct xfrm_algo_auth        *aalg;
        struct xfrm_algo        *ealg;
        struct xfrm_algo        *calg;
        struct xfrm_algo_aead        *aead;
        const char                *geniv;

        /* mapping change rate limiting */
        __be16 new_mapping_sport;
        u32 new_mapping;        /* seconds */
        u32 mapping_maxage;        /* seconds for input SA */

        /* Data for encapsulator */
        struct xfrm_encap_tmpl        *encap;
        struct sock __rcu        *encap_sk;

        /* Data for care-of address */
        xfrm_address_t        *coaddr;

        /* IPComp needs an IPIP tunnel for handling uncompressed packets */
        struct xfrm_state        *tunnel;

        /* If a tunnel, number of users + 1 */
        atomic_t                tunnel_users;

        /* State for replay detection */
        struct xfrm_replay_state replay;
        struct xfrm_replay_state_esn *replay_esn;

        /* Replay detection state at the time we sent the last notification */
        struct xfrm_replay_state preplay;
        struct xfrm_replay_state_esn *preplay_esn;

        /* replay detection mode */
        enum xfrm_replay_mode    repl_mode;
        /* Internal flags (XFRM_TIME_DEFER, XFRM_SOFT_EXPIRE); at the moment
         * this only holds state for the delayed aevent.
         */
        u32                        xflags;

        /* Replay detection notification settings */
        u32                        replay_maxage;
        u32                        replay_maxdiff;

        /* Replay detection notification timer */
        struct timer_list        rtimer;

        /* Statistics */
        struct xfrm_stats        stats;

        struct xfrm_lifetime_cur curlft;
        struct hrtimer                mtimer;

        /* Device offload context, see struct xfrm_dev_offload. */
        struct xfrm_dev_offload xso;

        /* used to fix curlft->add_time when changing date */
        long                saved_tmo;

        /* Last used time */
        time64_t                lastused;

        struct page_frag xfrag;

        /* Reference to data common to all the instances of this
         * transformer. */
        const struct xfrm_type        *type;
        /* inner_mode vs. inner_mode_iaf is chosen by xfrm_ip2inner_mode(). */
        struct xfrm_mode        inner_mode;
        struct xfrm_mode        inner_mode_iaf;
        struct xfrm_mode        outer_mode;

        const struct xfrm_type_offload        *type_offload;

        /* Security context */
        struct xfrm_sec_ctx        *security;

        /* Private data of this transformer, format is opaque,
         * interpreted by xfrm_type methods. */
        void                        *data;
        u8                        dir;
};

/* Return the network namespace recorded in x->xs_net (via read_pnet()). */
static inline struct net *xs_net(struct xfrm_state *x)
{
        return read_pnet(&x->xs_net);
}

/* xflags (bits for xfrm_state.xflags) - make enum if more show up */
#define XFRM_TIME_DEFER        1
#define XFRM_SOFT_EXPIRE 2

/*
 * State values, XFRM_STATE_VOID == 0 through XFRM_STATE_DEAD == 5.
 * NOTE(review): presumably stored in xfrm_state.km.state — confirm.
 */
enum {
        XFRM_STATE_VOID,
        XFRM_STATE_ACQ,
        XFRM_STATE_VALID,
        XFRM_STATE_ERROR,
        XFRM_STATE_EXPIRED,
        XFRM_STATE_DEAD
};

/*
 * callback structure passed from either netlink or pfkey
 *
 * All members of "data" alias the same u32.
 * NOTE(review): which member is meaningful presumably depends on "event"
 * — confirm against the notifier implementations.
 */
struct km_event {
        union {
                u32 hard;
                u32 proto;
                u32 byid;
                u32 aevent;
                u32 type;
        } data;

        u32        seq;
        u32        portid;
        u32        event;
        struct net *net;
};

/* Output of xfrm_if_cb.decode_session(): a namespace and an if_id. */
struct xfrm_if_decode_session_result {
        struct net *net;
        u32 if_id;
};

/*
 * Callback table installed with xfrm_if_register_cb().
 * decode_session() receives the skb and address family and fills *res.
 * NOTE(review): the meaning of the bool return (presumably "result is
 * valid") is not visible here — confirm against the implementation.
 */
struct xfrm_if_cb {
        bool (*decode_session)(struct sk_buff *skb,
                               unsigned short family,
                               struct xfrm_if_decode_session_result *res);
};

void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb);
void xfrm_if_unregister_cb(void);

struct net_device;
struct xfrm_type;
struct xfrm_dst;
/*
 * Per-address-family policy callbacks. Registered for a family with
 * xfrm_policy_register_afinfo() and removed with
 * xfrm_policy_unregister_afinfo().
 */
struct xfrm_policy_afinfo {
        struct dst_ops                *dst_ops;
        struct dst_entry        *(*dst_lookup)(struct net *net,
                                               int tos, int oif,
                                               const xfrm_address_t *saddr,
                                               const xfrm_address_t *daddr,
                                               u32 mark);
        int                        (*get_saddr)(struct net *net, int oif,
                                             xfrm_address_t *saddr,
                                             xfrm_address_t *daddr,
                                             u32 mark);
        int                        (*fill_dst)(struct xfrm_dst *xdst,
                                            struct net_device *dev,
                                            const struct flowi *fl);
        struct dst_entry        *(*blackhole_route)(struct net *net, struct dst_entry *orig);
};

int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family);
void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo);
void km_policy_notify(struct xfrm_policy *xp, int dir,
                      const struct km_event *c);
void km_state_notify(struct xfrm_state *x, const struct km_event *c);

struct xfrm_tmpl;
int km_query(struct xfrm_state *x, struct xfrm_tmpl *t,
             struct xfrm_policy *pol);
void km_state_expired(struct xfrm_state *x, int hard, u32 portid);
int __xfrm_state_delete(struct xfrm_state *x);

/*
 * Per-address-family state operations and the xfrm_type tables for that
 * family. Registered with xfrm_state_register_afinfo(); looked up with
 * xfrm_state_get_afinfo() / xfrm_state_afinfo_get_rcu().
 */
struct xfrm_state_afinfo {
        u8                                family;
        u8                                proto;

        const struct xfrm_type_offload *type_offload_esp;

        /* One slot per supported transform type. */
        const struct xfrm_type                *type_esp;
        const struct xfrm_type                *type_ipip;
        const struct xfrm_type                *type_ipip6;
        const struct xfrm_type                *type_comp;
        const struct xfrm_type                *type_ah;
        const struct xfrm_type                *type_routing;
        const struct xfrm_type                *type_dstopts;

        int                        (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
        int                        (*transport_finish)(struct sk_buff *skb,
                                                    int async);
        void                        (*local_error)(struct sk_buff *skb, u32 mtu);
};

int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo);
int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo);
struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family);
struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family);

/*
 * Per-family input hook. Registered with xfrm_input_register_afinfo()
 * and removed with xfrm_input_unregister_afinfo().
 */
struct xfrm_input_afinfo {
        u8                        family;
        bool                        is_ipip;
        int                        (*callback)(struct sk_buff *skb, u8 protocol,
                                            int err);
};

int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo);
int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo);

void xfrm_flush_gc(void);
void xfrm_state_delete_tunnel(struct xfrm_state *x);

/*
 * Operations of one transform type. Registered per family with
 * xfrm_register_type() and removed with xfrm_unregister_type().
 * An xfrm_state points at its type through xfrm_state.type.
 */
struct xfrm_type {
        struct module                *owner;
        u8                        proto;
        u8                        flags;
/* Distinct bit values (1, 2, 4, 8); presumably OR-ed into flags above. */
#define XFRM_TYPE_NON_FRAGMENT        1
#define XFRM_TYPE_REPLAY_PROT        2
#define XFRM_TYPE_LOCAL_COADDR        4
#define XFRM_TYPE_REMOTE_COADDR        8

        int                        (*init_state)(struct xfrm_state *x,
                                              struct netlink_ext_ack *extack);
        void                        (*destructor)(struct xfrm_state *);
        int                        (*input)(struct xfrm_state *, struct sk_buff *skb);
        int                        (*output)(struct xfrm_state *, struct sk_buff *pskb);
        int                        (*reject)(struct xfrm_state *, struct sk_buff *,
                                          const struct flowi *);
};

int xfrm_register_type(const struct xfrm_type *type, unsigned short family);
void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family);

/*
 * Offload counterpart of struct xfrm_type. Registered per family with
 * xfrm_register_type_offload() and removed with
 * xfrm_unregister_type_offload().
 */
struct xfrm_type_offload {
        struct module        *owner;
        u8                proto;
        void                (*encap)(struct xfrm_state *, struct sk_buff *pskb);
        int                (*input_tail)(struct xfrm_state *x, struct sk_buff *skb);
        int                (*xmit)(struct xfrm_state *, struct sk_buff *pskb, netdev_features_t features);
};

int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family);
void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);

/*
 * Map an address family to the IP protocol number used when that family
 * is carried as an inner packet: AF_INET -> IPPROTO_IPIP,
 * AF_INET6 -> IPPROTO_IPV6. Any other family yields 0.
 */
static inline int xfrm_af2proto(unsigned int family)
{
        if (family == AF_INET)
                return IPPROTO_IPIP;

        if (family == AF_INET6)
                return IPPROTO_IPV6;

        return 0;
}

/*
 * Pick the inner mode of @x for a packet whose protocol is @ipproto.
 *
 * Returns &x->inner_mode when @ipproto is the tunnel protocol that matches
 * the state's own family (IPPROTO_IPIP with AF_INET, IPPROTO_IPV6 with
 * AF_INET6); in every other case returns &x->inner_mode_iaf.
 */
static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto)
{
        switch (ipproto) {
        case IPPROTO_IPIP:
                if (x->props.family == AF_INET)
                        return &x->inner_mode;
                break;
        case IPPROTO_IPV6:
                if (x->props.family == AF_INET6)
                        return &x->inner_mode;
                break;
        default:
                break;
        }

        return &x->inner_mode_iaf;
}

/*
 * Template used to resolve or acquire an xfrm_state (see "Resolution of
 * xfrm_tmpl" in the overview comment near the top of this file).
 * A policy holds an array of these in xfrm_policy.xfrm_vec.
 */
struct xfrm_tmpl {
/* id in template is interpreted as:
 * daddr - destination of tunnel, may be zero for transport mode.
 * spi   - zero to acquire spi. Not zero if spi is static, then
 *           daddr must be fixed too.
 * proto - AH/ESP/IPCOMP
 */
        struct xfrm_id                id;

/* Source address of tunnel. Ignored, if it is not a tunnel. */
        xfrm_address_t                saddr;

        unsigned short                encap_family;

        u32                        reqid;

/* Mode: transport, tunnel etc. */
        u8                        mode;

/* Sharing mode: unique, this session only, this user only etc. */
        u8                        share;

/* May skip this transformation if no SA is found */
        u8                        optional;

/* Skip aalgos/ealgos/calgos checks. */
        u8                        allalgs;

/* Bit mask of algos allowed for acquisition */
        u32                        aalgos;
        u32                        ealgos;
        u32                        calgos;
};

/* Maximum transformations per policy; sizes xfrm_policy.xfrm_vec[]. */
#define XFRM_MAX_DEPTH                6
/* NOTE(review): presumably the bundle depth limit for offload — confirm. */
#define XFRM_MAX_OFFLOAD_DEPTH        1

/*
 * List linkage shared by policies and walkers: embedded in struct
 * xfrm_policy (member "walk") and in struct xfrm_policy_walk.
 */
struct xfrm_policy_walk_entry {
        struct list_head        all;
        u8                        dead;
};

/*
 * Walker state wrapping a walk entry.
 * NOTE(review): presumably used as an iteration cursor over all
 * policies — confirm against the walk implementation.
 */
struct xfrm_policy_walk {
        struct xfrm_policy_walk_entry walk;
        u8 type;
        u32 seq;
};

/*
 * Per-policy packet hold queue with its timer; embedded in struct
 * xfrm_policy as "polq".
 * NOTE(review): unit of "timeout" (presumably jiffies) — confirm.
 */
struct xfrm_policy_queue {
        struct sk_buff_head        hold_queue;
        struct timer_list        hold_timer;
        unsigned long                timeout;
};

/*
 * Policy rule (an SPD entry, per the overview comment at the top of this
 * file). Reference counted through "refcnt"; see xfrm_pol_hold() and
 * xfrm_pol_put() in this header.
 */
struct xfrm_policy {
        /* Owning network namespace; read through xp_net(). */
        possible_net_t                xp_net;
        struct hlist_node        bydst;
        struct hlist_node        byidx;

        /* This lock only affects elements except for entry. */
        rwlock_t                lock;
        refcount_t                refcnt;
        u32                        pos;
        struct timer_list        timer;

        atomic_t                genid;
        u32                        priority;
        u32                        index;
        u32                        if_id;
        struct xfrm_mark        mark;
        struct xfrm_selector        selector;
        struct xfrm_lifetime_cfg lft;
        struct xfrm_lifetime_cur curlft;
        struct xfrm_policy_walk_entry walk;
        struct xfrm_policy_queue polq;
        bool                    bydst_reinsert;
        u8                        type;
        u8                        action;
        u8                        flags;
        /* NOTE(review): presumably the number of xfrm_vec[] entries in use. */
        u8                        xfrm_nr;
        u16                        family;
        struct xfrm_sec_ctx        *security;
        /* Transformation templates, at most XFRM_MAX_DEPTH of them. */
        struct xfrm_tmpl               xfrm_vec[XFRM_MAX_DEPTH];
        struct hlist_node        bydst_inexact_list;
        struct rcu_head                rcu;

        /* Device offload context, see struct xfrm_dev_offload. */
        struct xfrm_dev_offload xdo;
};

/* Return the network namespace recorded in xp->xp_net (via read_pnet()). */
static inline struct net *xp_net(const struct xfrm_policy *xp)
{
        return read_pnet(&xp->xp_net);
}

/*
 * Local/remote address pair with its address family. Passed to
 * xfrm_mgr.migrate() as the "k" argument.
 */
struct xfrm_kmaddress {
        xfrm_address_t          local;
        xfrm_address_t          remote;
        u32                        reserved;
        u16                        family;
};

/*
 * One migration entry: old and new source/destination addresses, each set
 * with its own address family, plus proto/mode/reqid.
 * NOTE(review): proto/mode/reqid presumably select the state(s) to
 * migrate — confirm. Passed to xfrm_mgr.migrate() as the "m" array.
 */
struct xfrm_migrate {
        xfrm_address_t                old_daddr;
        xfrm_address_t                old_saddr;
        xfrm_address_t                new_daddr;
        xfrm_address_t                new_saddr;
        u8                        proto;
        u8                        mode;
        u16                        reserved;
        u32                        reqid;
        u16                        old_family;
        u16                        new_family;
};

/* NOTE(review): unit not stated here (presumably seconds) — confirm. */
#define XFRM_KM_TIMEOUT                30
/* what happened (aliases of the XFRM_AE_CR / XFRM_AE_CE constants) */
#define XFRM_REPLAY_UPDATE        XFRM_AE_CR
#define XFRM_REPLAY_TIMEOUT        XFRM_AE_CE

/* default aevent timeout in units of 100ms */
#define XFRM_AE_ETIME                        10
/* Async Event timer multiplier */
#define XFRM_AE_ETH_M                        10
/* default seq threshold size */
#define XFRM_AE_SEQT_SIZE                2

/*
 * Key manager operations table. Registered with xfrm_register_km() and
 * removed with xfrm_unregister_km(); "list" is the embedded list node.
 */
struct xfrm_mgr {
        struct list_head        list;
        int                        (*notify)(struct xfrm_state *x, const struct km_event *c);
        int                        (*acquire)(struct xfrm_state *x, struct xfrm_tmpl *, struct xfrm_policy *xp);
        struct xfrm_policy        *(*compile_policy)(struct sock *sk, int opt, u8 *data, int len, int *dir);
        int                        (*new_mapping)(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport);
        int                        (*notify_policy)(struct xfrm_policy *x, int dir, const struct km_event *c);
        int                        (*report)(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr);
        int                        (*migrate)(const struct xfrm_selector *sel,
                                           u8 dir, u8 type,
                                           const struct xfrm_migrate *m,
                                           int num_bundles,
                                           const struct xfrm_kmaddress *k,
                                           const struct xfrm_encap_tmpl *encap);
        bool                        (*is_alive)(const struct km_event *c);
};

void xfrm_register_km(struct xfrm_mgr *km);
void xfrm_unregister_km(struct xfrm_mgr *km);

/*
 * Common leading part of the xfrm views of skb->cb: xfrm_skb_cb,
 * xfrm_mode_skb_cb and xfrm_spi_skb_cb all start with this structure.
 * Both members are unions with one alternative for IPv4 and one for IPv6.
 */
struct xfrm_tunnel_skb_cb {
        union {
                struct inet_skb_parm h4;
                struct inet6_skb_parm h6;
        } header;

        union {
                struct ip_tunnel *ip4;
                struct ip6_tnl *ip6;
        } tunnel;
};

/* View the start of skb->cb as a struct xfrm_tunnel_skb_cb. */
#define XFRM_TUNNEL_SKB_CB(__skb) ((struct xfrm_tunnel_skb_cb *)&((__skb)->cb[0]))

/*
 * This structure is used for the duration where packets are being
 * transformed by IPsec.  As soon as the packet leaves IPsec the
 * area beyond the generic IP part may be overwritten.
 */
struct xfrm_skb_cb {
        struct xfrm_tunnel_skb_cb header;

        /* Sequence number for replay protection. */
        union {
                /* Output side: host-order halves (__u32). */
                struct {
                        __u32 low;
                        __u32 hi;
                } output;
                /* Input side: big-endian halves (__be32). */
                struct {
                        __be32 low;
                        __be32 hi;
                } input;
        } seq;
};

/* View the start of skb->cb as a struct xfrm_skb_cb. */
#define XFRM_SKB_CB(__skb) ((struct xfrm_skb_cb *)&((__skb)->cb[0]))

/*
 * This structure is used by the afinfo prepare_input/prepare_output functions
 * to transmit header information to the mode input/output functions.
 */
struct xfrm_mode_skb_cb {
        struct xfrm_tunnel_skb_cb header;

        /* Copied from header for IPv4, always set to zero and DF for IPv6. */
        __be16 id;
        __be16 frag_off;

        /* IP header length (excluding options or extension headers). */
        u8 ihl;

        /* TOS for IPv4, class for IPv6. */
        u8 tos;

        /* TTL for IPv4, hop limit for IPv6. */
        u8 ttl;

        /* Protocol for IPv4, NH for IPv6. */
        u8 protocol;

        /* Option length for IPv4, zero for IPv6. */
        u8 optlen;

        /* Used by IPv6 only, zero for IPv4. */
        u8 flow_lbl[3];
};

/* View the start of skb->cb as a struct xfrm_mode_skb_cb. */
#define XFRM_MODE_SKB_CB(__skb) ((struct xfrm_mode_skb_cb *)&((__skb)->cb[0]))

/*
 * This structure is used by the input processing to locate the SPI and
 * related information.
 */
struct xfrm_spi_skb_cb {
        struct xfrm_tunnel_skb_cb header;

        /* NOTE(review): presumably offset of the destination address in the
         * IP header — confirm against the input path. */
        unsigned int daddroff;
        unsigned int family;
        __be32 seq;
};

/* View the start of skb->cb as a struct xfrm_spi_skb_cb. */
#define XFRM_SPI_SKB_CB(__skb) ((struct xfrm_spi_skb_cb *)&((__skb)->cb[0]))

#ifdef CONFIG_AUDITSYSCALL
/*
 * Open an AUDIT_MAC_IPSEC_EVENT audit record and write "op=<op>" into it.
 *
 * Returns NULL when auditing is off (audit_enabled == AUDIT_OFF) or when
 * audit_log_start() yields no buffer; otherwise returns the buffer so the
 * caller can append further fields.
 */
static inline struct audit_buffer *xfrm_audit_start(const char *op)
{
        struct audit_buffer *ab;

        if (audit_enabled == AUDIT_OFF)
                return NULL;

        ab = audit_log_start(audit_context(), GFP_ATOMIC,
                             AUDIT_MAC_IPSEC_EVENT);
        if (ab != NULL)
                audit_log_format(ab, "op=%s", op);

        return ab;
}

/*
 * Append " auid=<n> ses=<n>" and the task context to @audit_buf.
 *
 * When @task_valid is true the login uid and session id are taken from
 * "current"; otherwise INVALID_UID and AUDIT_SID_UNSET are logged.
 * The uid is translated with from_kuid() relative to init_user_ns.
 */
static inline void xfrm_audit_helper_usrinfo(bool task_valid,
                                             struct audit_buffer *audit_buf)
{
        unsigned int auid;
        unsigned int ses;

        if (task_valid) {
                auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
                ses = audit_get_sessionid(current);
        } else {
                auid = from_kuid(&init_user_ns, INVALID_UID);
                ses = AUDIT_SID_UNSET;
        }

        audit_log_format(audit_buf, " auid=%u ses=%u", auid, ses);
        audit_log_task_context(audit_buf);
}

void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid);
void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
                              bool task_valid);
void xfrm_audit_state_add(struct xfrm_state *x, int result, bool task_valid);
void xfrm_audit_state_delete(struct xfrm_state *x, int result, bool task_valid);
void xfrm_audit_state_replay_overflow(struct xfrm_state *x,
                                      struct sk_buff *skb);
void xfrm_audit_state_replay(struct xfrm_state *x, struct sk_buff *skb,
                             __be32 net_seq);
void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family);
void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family, __be32 net_spi,
                               __be32 net_seq);
void xfrm_audit_state_icvfail(struct xfrm_state *x, struct sk_buff *skb,
                              u8 proto);
#else

/*
 * Empty inline stubs for the xfrm_audit_* hooks. They sit in the #else
 * branch of CONFIG_AUDITSYSCALL, so callers need no #ifdef of their own;
 * every stub ignores all of its arguments.
 */
static inline void xfrm_audit_policy_add(struct xfrm_policy *xp, int result,
                                         bool task_valid)
{
}

static inline void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
                                            bool task_valid)
{
}

static inline void xfrm_audit_state_add(struct xfrm_state *x, int result,
                                        bool task_valid)
{
}

static inline void xfrm_audit_state_delete(struct xfrm_state *x, int result,
                                           bool task_valid)
{
}

static inline void xfrm_audit_state_replay_overflow(struct xfrm_state *x,
                                             struct sk_buff *skb)
{
}

static inline void xfrm_audit_state_replay(struct xfrm_state *x,
                                           struct sk_buff *skb, __be32 net_seq)
{
}

static inline void xfrm_audit_state_notfound_simple(struct sk_buff *skb,
                                      u16 family)
{
}

static inline void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family,
                                      __be32 net_spi, __be32 net_seq)
{
}

static inline void xfrm_audit_state_icvfail(struct xfrm_state *x,
                                     struct sk_buff *skb, u8 proto)
{
}
#endif /* CONFIG_AUDITSYSCALL */

/*
 * Take a reference on @policy. A NULL @policy is tolerated and ignored
 * (the non-NULL case is annotated as the likely one).
 */
static inline void xfrm_pol_hold(struct xfrm_policy *policy)
{
        if (likely(policy != NULL))
                refcount_inc(&policy->refcnt);
}

void xfrm_policy_destroy(struct xfrm_policy *policy);

/*
 * Drop a reference on @policy; when the count reaches zero the policy is
 * handed to xfrm_policy_destroy(). @policy is dereferenced
 * unconditionally, so it must not be NULL.
 */
static inline void xfrm_pol_put(struct xfrm_policy *policy)
{
        if (!refcount_dec_and_test(&policy->refcnt))
                return;

        xfrm_policy_destroy(policy);
}

/*
 * Drop one reference on each of the first @npols entries of @pols,
 * walking from the last entry down to the first. Does nothing when
 * @npols is zero or negative.
 */
static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols)
{
        int remaining;

        for (remaining = npols; remaining > 0; remaining--)
                xfrm_pol_put(pols[remaining - 1]);
}

void __xfrm_state_destroy(struct xfrm_state *, bool);

/*
 * Drop a reference on @x without checking for zero: unlike
 * xfrm_state_put() this never calls __xfrm_state_destroy().
 * NOTE(review): presumably only valid while the caller knows another
 * reference is still held — confirm at call sites.
 */
static inline void __xfrm_state_put(struct xfrm_state *x)
{
        refcount_dec(&x->refcnt);
}

/*
 * Drop a reference on @x; when the count reaches zero, call
 * __xfrm_state_destroy(x, false).
 */
static inline void xfrm_state_put(struct xfrm_state *x)
{
        if (!refcount_dec_and_test(&x->refcnt))
                return;

        __xfrm_state_destroy(x, false);
}

/*
 * Same as xfrm_state_put(), except that the final put calls
 * __xfrm_state_destroy(x, true).
 * NOTE(review): the bool presumably requests synchronous destruction —
 * confirm against __xfrm_state_destroy().
 */
static inline void xfrm_state_put_sync(struct xfrm_state *x)
{
        if (!refcount_dec_and_test(&x->refcnt))
                return;

        __xfrm_state_destroy(x, true);
}

/* Take a reference on @x. @x is dereferenced, so it must not be NULL. */
static inline void xfrm_state_hold(struct xfrm_state *x)
{
        refcount_inc(&x->refcnt);
}

/*
 * Compare the first @prefixlen bits of two addresses stored as arrays of
 * big-endian 32-bit words.
 *
 * Whole 32-bit words of the prefix are compared with memcmp(); a trailing
 * partial word, if any, is compared under a network-order mask. Returns
 * true when the prefixes are equal, and always true for @prefixlen == 0.
 */
static inline bool addr_match(const void *token1, const void *token2,
                              unsigned int prefixlen)
{
        const __be32 *lhs = token1;
        const __be32 *rhs = token2;
        /* Number of complete 32-bit words covered by the prefix. */
        const unsigned int full_words = prefixlen / 32;
        /* Bits of the prefix that spill into the next, partial word. */
        const unsigned int tail_bits = prefixlen % 32;

        if (full_words && memcmp(lhs, rhs, full_words * 4) != 0)
                return false;

        if (tail_bits) {
                /* tail_bits is 1..31 here, so the shift count is 1..31. */
                const __be32 mask = htonl((0xffffffff) << (32 - tail_bits));

                if ((lhs[full_words] ^ rhs[full_words]) & mask)
                        return false;
        }

        return true;
}

/*
 * Compare the first @prefixlen bits of two IPv4 addresses given in
 * network byte order. Returns true when they agree.
 */
static inline bool addr4_match(__be32 a1, __be32 a2, u8 prefixlen)
{
        /*
         * C99 6.5.7 (3): u32 << 32 is undefined behaviour. With a 32-bit
         * long, a zero prefix would shift by the full width, so answer
         * that case directly: an empty prefix always matches.
         */
        if (sizeof(long) == 4 && prefixlen == 0)
                return true;

        return ((a1 ^ a2) & htonl(~0UL << (32 - prefixlen))) == 0;
}

/*
 * Derive the "source port" selector value of a flow, by protocol:
 *  - TCP/UDP/UDPLITE/SCTP: the real source port;
 *  - ICMP/ICMPv6:          the ICMP type;
 *  - MH:                   the mobility header type;
 *  - GRE:                  the upper 16 bits of the GRE key;
 *  - anything else:        0.
 * The result is in network byte order.
 */
static __inline__
__be16 xfrm_flowi_sport(const struct flowi *fl, const union flowi_uli *uli)
{
        switch (fl->flowi_proto) {
        case IPPROTO_TCP:
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
        case IPPROTO_SCTP:
                return uli->ports.sport;
        case IPPROTO_ICMP:
        case IPPROTO_ICMPV6:
                return htons(uli->icmpt.type);
        case IPPROTO_MH:
                return htons(uli->mht.type);
        case IPPROTO_GRE:
                return htons(ntohl(uli->gre_key) >> 16);
        default:
                return 0;        /*XXX*/
        }
}

/*
 * Derive the "destination port" selector value of a flow, by protocol:
 *  - TCP/UDP/UDPLITE/SCTP: the real destination port;
 *  - ICMP/ICMPv6:          the ICMP code;
 *  - GRE:                  the lower 16 bits of the GRE key;
 *  - anything else:        0 (IPPROTO_MH has no case here, unlike in
 *                          xfrm_flowi_sport()).
 * The result is in network byte order.
 */
static __inline__
__be16 xfrm_flowi_dport(const struct flowi *fl, const union flowi_uli *uli)
{
        switch (fl->flowi_proto) {
        case IPPROTO_TCP:
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
        case IPPROTO_SCTP:
                return uli->ports.dport;
        case IPPROTO_ICMP:
        case IPPROTO_ICMPV6:
                return htons(uli->icmpt.code);
        case IPPROTO_GRE:
                return htons(ntohl(uli->gre_key) & 0xffff);
        default:
                return 0;        /*XXX*/
        }
}

bool xfrm_selector_match(const struct xfrm_selector *sel,
                         const struct flowi *fl, unsigned short family);

#ifdef CONFIG_SECURITY_NETWORK_XFRM
/*
 * Compare two security contexts.
 *
 * Two absent contexts match. If exactly one is absent they do not match.
 * When both are present, ctx_sid, ctx_doi and ctx_alg must all be equal.
 */
static inline bool xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2)
{
        if (!s1 || !s2)
                return !s1 && !s2;

        if (s1->ctx_sid != s2->ctx_sid)
                return false;
        if (s1->ctx_doi != s2->ctx_doi)
                return false;

        return s1->ctx_alg == s2->ctx_alg;
}
#else
/* Without CONFIG_SECURITY_NETWORK_XFRM every pair of contexts matches. */
static inline bool xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2)
{
        return true;
}
#endif

/* A struct encoding bundle of transformations to apply to some set of flow.
 *
 * xdst->child points to the next element of bundle.
 * dst->xfrm  points to an instance of transformer.
 *
 * Due to unfortunate limitations of current routing cache, which we
 * have no time to fix, it mirrors struct rtable and bound to the same
 * routing key, including saddr,daddr. However, we can have many of
 * bundles differing by session id. All the bundles grow from a parent
 * policy rule.
 */
struct xfrm_dst {
        /* Overlays the generic dst_entry with the IPv4/IPv6 route types. */
        union {
                struct dst_entry        dst;
                struct rtable                rt;
                struct rt6_info                rt6;
        } u;
        struct dst_entry *route;
        struct dst_entry *child;        /* read via xfrm_dst_child() */
        struct dst_entry *path;                /* read via xfrm_dst_path() */
        /* Policies this bundle references; released in xfrm_dst_destroy(). */
        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
        int num_pols, num_xfrms;
        u32 xfrm_genid;
        u32 policy_genid;
        u32 route_mtu_cached;
        u32 child_mtu_cached;
        u32 route_cookie;
        u32 path_cookie;
};

/* Return the ->path of an xfrm dst, or @dst itself otherwise.
 *
 * With CONFIG_XFRM, @dst is treated as a struct xfrm_dst when it has
 * ->xfrm set or carries DST_XFRM_QUEUE.  In every other case (and always
 * without CONFIG_XFRM) the argument is handed back with const dropped.
 */
static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst)
{
#ifdef CONFIG_XFRM
        if (dst->xfrm || (dst->flags & DST_XFRM_QUEUE))
                return ((const struct xfrm_dst *)dst)->path;
#endif
        return (struct dst_entry *)dst;
}

/* Return the ->child of an xfrm dst, or NULL otherwise.
 *
 * With CONFIG_XFRM, @dst is treated as a struct xfrm_dst when it has
 * ->xfrm set or carries DST_XFRM_QUEUE.  Without CONFIG_XFRM the result
 * is always NULL.
 */
static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst)
{
#ifdef CONFIG_XFRM
        if (dst->xfrm || (dst->flags & DST_XFRM_QUEUE))
                return ((const struct xfrm_dst *)dst)->child;
#endif
        return NULL;
}

#ifdef CONFIG_XFRM
/* Store @child as the ->child pointer of @xdst.  No reference counting
 * is done here.
 */
static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *child)
{
        xdst->child = child;
}

/* Drop what @xdst holds: its policies (xfrm_pols_put), its ->route
 * (dst_release) and, when set, the state in u.dst.xfrm (xfrm_state_put).
 * @xdst itself is not freed here.
 */
static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
{
        struct xfrm_state *x;

        xfrm_pols_put(xdst->pols, xdst->num_pols);
        dst_release(xdst->route);

        /* Read only after the releases above, as the original did. */
        x = xdst->u.dst.xfrm;
        if (likely(x))
                xfrm_state_put(x);
}
#endif

/* Declaration only; the definition is not part of this chunk. */
void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);

/* Parameters of an xfrm interface; embedded in struct xfrm_if as ->p. */
struct xfrm_if_parms {
        int link;                /* ifindex of underlying L2 interface */
        u32 if_id;                /* interface identifier */
        bool collect_md;
};

/* One xfrm interface; instances are chained through the RCU-annotated
 * ->next pointer.
 */
struct xfrm_if {
        struct xfrm_if __rcu *next;        /* next interface in list */
        struct net_device *dev;                /* virtual device associated with interface */
        struct net *net;                /* netns for packet i/o */
        struct xfrm_if_parms p;                /* interface parms */

        struct gro_cells gro_cells;
};

/* Per-packet offload record; struct sec_path keeps an array of these in
 * ->ovec.  @flags and @status are bit masks built from the #defines that
 * follow each field (tested with '&' in __xfrm_policy_check2()).
 */
struct xfrm_offload {
        /* Output sequence number for replay protection on offloading. */
        struct {
                __u32 low;
                __u32 hi;
        } seq;

        /* Bit values for @flags. */
        __u32                        flags;
#define        SA_DELETE_REQ                1
#define        CRYPTO_DONE                2
#define        CRYPTO_NEXT_DONE        4
#define        CRYPTO_FALLBACK                8
#define        XFRM_GSO_SEGMENT        16
#define        XFRM_GRO                32
/* 64 is free */
#define        XFRM_DEV_RESUME                128
#define        XFRM_XMIT                256

        /* Bit values for @status. */
        __u32                        status;
#define CRYPTO_SUCCESS                                1
#define CRYPTO_GENERIC_ERROR                        2
#define CRYPTO_TRANSPORT_AH_AUTH_FAILED                4
#define CRYPTO_TRANSPORT_ESP_AUTH_FAILED        8
#define CRYPTO_TUNNEL_AH_AUTH_FAILED                16
#define CRYPTO_TUNNEL_ESP_AUTH_FAILED                32
#define CRYPTO_INVALID_PACKET_SYNTAX                64
#define CRYPTO_INVALID_PROTOCOL                        128

        /* Used to keep whole l2 header for transport mode GRO */
        __u32                        orig_mac_len;

        __u8                        proto;
        __u8                        inner_ipproto;
};

/* Security path attached to an skb (fetched with skb_sec_path()).
 *
 * xfrm_input_state() returns xvec[len - 1]; xfrm_offload() returns
 * &ovec[olen - 1], and only when olen is non-zero and equal to len.
 */
struct sec_path {
        int                        len;                /* index basis for xvec[] */
        int                        olen;                /* index basis for ovec[] */
        int                        verified_cnt;

        struct xfrm_state        *xvec[XFRM_MAX_DEPTH];
        struct xfrm_offload        ovec[XFRM_MAX_OFFLOAD_DEPTH];
};

/* Declaration only; the definition is not part of this chunk. */
struct sec_path *secpath_set(struct sk_buff *skb);

/* Remove the SKB_EXT_SEC_PATH extension from @skb.  Compiles to nothing
 * when CONFIG_XFRM is not set.
 */
static inline void
secpath_reset(struct sk_buff *skb)
{
#ifdef CONFIG_XFRM
        skb_ext_del(skb, SKB_EXT_SEC_PATH);
#endif
}

/* Non-zero when @addr is the all-zero address of @family.
 *
 * AF_INET compares the a4 word with 0, AF_INET6 defers to
 * ipv6_addr_any().  Any other family yields 0.
 */
static inline int
xfrm_addr_any(const xfrm_address_t *addr, unsigned short family)
{
        if (family == AF_INET)
                return addr->a4 == 0;
        if (family == AF_INET6)
                return ipv6_addr_any(&addr->in6);

        return 0;
}

/* IPv4 source-address comparison between a template and a state.
 *
 * Returns non-zero (mismatch) only when the template's saddr.a4 is
 * non-zero and differs from x->props.saddr.a4; a zero template address
 * never mismatches.
 */
static inline int
__xfrm4_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x)
{
        if (!tmpl->saddr.a4)
                return 0;

        return tmpl->saddr.a4 != x->props.saddr.a4;
}

/* IPv6 source-address comparison between a template and a state.
 *
 * Returns non-zero (mismatch) only when the template's saddr is not the
 * any-address and is not equal to x->props.saddr.
 */
static inline int
__xfrm6_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x)
{
        const struct in6_addr *tmpl_src = (const struct in6_addr *)&tmpl->saddr;
        const struct in6_addr *state_src = (const struct in6_addr *)&x->props.saddr;

        if (ipv6_addr_any(tmpl_src))
                return 0;

        return !ipv6_addr_equal(tmpl_src, state_src);
}

/* Family dispatcher for the template/state source-address comparison.
 *
 * Returns the result of the IPv4 or IPv6 helper; any other family is
 * reported as a mismatch (1).
 */
static inline int
xfrm_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, unsigned short family)
{
        if (family == AF_INET)
                return __xfrm4_state_addr_cmp(tmpl, x);
        if (family == AF_INET6)
                return __xfrm6_state_addr_cmp(tmpl, x);

        return 1;
}

#ifdef CONFIG_XFRM
static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb)
{
        struct sec_path *sp = skb_sec_path(skb);

        return sp->xvec[sp->len - 1];
}
#endif

/* Return the last offload record of @skb's security path, or NULL.
 *
 * NULL is returned when there is no security path, when olen is zero,
 * when len and olen differ, or when CONFIG_XFRM is not set.
 */
static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb)
{
#ifdef CONFIG_XFRM
        struct sec_path *sp = skb_sec_path(skb);

        if (sp && sp->olen && sp->len == sp->olen)
                return &sp->ovec[sp->olen - 1];
#endif
        return NULL;
}

#ifdef CONFIG_XFRM
/* Declaration only; the definition is not part of this chunk.  The
 * inline wrapper __xfrm_policy_check2() calls this as its last resort.
 */
int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb,
                        unsigned short family);

/* Shortcut used before a full policy check.
 *
 * Returns true only when @net has no policies counted for @dir, @skb
 * has no security path, and the default policy for @dir is
 * XFRM_USERPOLICY_ACCEPT.
 */
static inline bool __xfrm_check_nopolicy(struct net *net, struct sk_buff *skb,
                                         int dir)
{
        if (net->xfrm.policy_count[dir] || secpath_exists(skb))
                return false;

        return net->xfrm.policy_default[dir] == XFRM_USERPOLICY_ACCEPT;
}

static inline bool __xfrm_check_dev_nopolicy(struct sk_buff *skb,
                                             int dir, unsigned short family)
{
        if (dir != XFRM_POLICY_OUT && family == AF_INET) {
                /* same dst may be used for traffic originating from
                 * devices with different policy settings.
                 */
                return IPCB(skb)->flags & IPSKB_NOPOLICY;
        }
        return skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY);
}

static inline int __xfrm_policy_check2(struct sock *sk, int dir,
                                       struct sk_buff *skb,
                                       unsigned int family, int reverse)
{
        struct net *net = dev_net(skb->dev);
        int ndir = dir | (reverse ? XFRM_POLICY_MASK + 1 : 0);
        struct xfrm_offload *xo = xfrm_offload(skb);
        struct xfrm_state *x;

        if (sk && sk->sk_policy[XFRM_POLICY_IN])
                return __xfrm_policy_check(sk, ndir, skb, family);

        if (xo) {
                x = xfrm_input_state(skb);
                if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET)
                        return (xo->flags & CRYPTO_DONE) &&
                               (xo->status & CRYPTO_SUCCESS);
        }

        return __xfrm_check_nopolicy(net, skb, dir) ||
               __xfrm_check_dev_nopolicy(skb, dir, family) ||
               __xfrm_policy_check(sk, ndir, skb, family);
}

/* Policy check in the forward (non-reversed) sense for any family. */
static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family)
{
        const int reverse = 0;

        return __xfrm_policy_check2(sk, dir, skb, family, reverse);
}

/* xfrm_policy_check() with the family fixed to AF_INET. */
static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
{
        const unsigned short family = AF_INET;

        return xfrm_policy_check(sk, dir, skb, family);
}

/* xfrm_policy_check() with the family fixed to AF_INET6. */
static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
{
        const unsigned short family = AF_INET6;

        return xfrm_policy_check(sk, dir, skb, family);
}

/* IPv4 policy check with the "reverse" argument of
 * __xfrm_policy_check2() set to 1.
 */
static inline int xfrm4_policy_check_reverse(struct sock *sk, int dir,
                                             struct sk_buff *skb)
{
        const int reverse = 1;

        return __xfrm_policy_check2(sk, dir, skb, AF_INET, reverse);
}

/* IPv6 policy check with the "reverse" argument of
 * __xfrm_policy_check2() set to 1.
 */
static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir,
                                             struct sk_buff *skb)
{
        const int reverse = 1;

        return __xfrm_policy_check2(sk, dir, skb, AF_INET6, reverse);
}

/* Declaration only; the definition is not part of this chunk.  The
 * inline wrappers below call it with @reverse set to 0 or 1.
 */
int __xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl,
                          unsigned int family, int reverse);

/* __xfrm_decode_session() with @reverse == 0. */
static inline int xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl,
                                      unsigned int family)
{
        const int reverse = 0;

        return __xfrm_decode_session(net, skb, fl, family, reverse);
}

/* __xfrm_decode_session() with @reverse == 1. */
static inline int xfrm_decode_session_reverse(struct net *net, struct sk_buff *skb,
                                              struct flowi *fl,
                                              unsigned int family)
{
        const int reverse = 1;

        return __xfrm_decode_session(net, skb, fl, family, reverse);
}

/* Declaration only; the definition is not part of this chunk.  Reached
 * from xfrm_route_forward() when neither of its shortcuts applies.
 */
int __xfrm_route_forward(struct sk_buff *skb, unsigned short family);

static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
        struct net *net = dev_net(skb->dev);

        if (!net->xfrm.policy_count[XFRM_POLICY_OUT] &&
            net->xfrm.policy_default[XFRM_POLICY_OUT] == XFRM_USERPOLICY_ACCEPT)
                return true;

        return (skb_dst(skb)->flags & DST_NOXFRM) ||
               __xfrm_route_forward(skb, family);
}

/* xfrm_route_forward() with the family fixed to AF_INET. */
static inline int xfrm4_route_forward(struct sk_buff *skb)
{
        const unsigned short family = AF_INET;

        return xfrm_route_forward(skb, family);
}

/* xfrm_route_forward() with the family fixed to AF_INET6. */
static inline int xfrm6_route_forward(struct sk_buff *skb)
{
        const unsigned short family = AF_INET6;

        return xfrm_route_forward(skb, family);
}

/* Declaration only; the definition is not part of this chunk.  Called by
 * xfrm_sk_clone_policy() when the old socket has a policy in either slot.
 */
int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk);

/* Prepare @sk's socket policies when it is cloned from @osk.
 *
 * Nothing is touched and 0 is returned when @osk is not a full socket.
 * Otherwise both policy slots of @sk are cleared first; the out-of-line
 * __xfrm_sk_clone_policy() runs only if @osk has a policy in at least
 * one slot, and its result is returned.  Returns 0 in every other case.
 */
static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
{
        if (!sk_fullsock(osk))
                return 0;

        sk->sk_policy[0] = NULL;
        sk->sk_policy[1] = NULL;

        if (unlikely(osk->sk_policy[0] || osk->sk_policy[1]))
                return __xfrm_sk_clone_policy(sk, osk);

        return 0;
}

/* Declaration only; the definition is not part of this chunk. */
int xfrm_policy_delete(struct xfrm_policy *pol, int dir);

/* Delete and clear both socket policies of @sk.
 *
 * Slot i (0, then 1) is deleted with direction XFRM_POLICY_MAX + i and
 * then reset to NULL.  Empty slots are skipped.  The pointers are read
 * with rcu_dereference_protected(..., 1), i.e. with no lockdep condition.
 */
static inline void xfrm_sk_free_policy(struct sock *sk)
{
        int slot;

        for (slot = 0; slot < 2; slot++) {
                struct xfrm_policy *pol;

                pol = rcu_dereference_protected(sk->sk_policy[slot], 1);
                if (unlikely(pol != NULL)) {
                        xfrm_policy_delete(pol, XFRM_POLICY_MAX + slot);
                        sk->sk_policy[slot] = NULL;
                }
        }
}

#else

/* Stubs used when CONFIG_XFRM is not set. */

/* No socket policies exist, so there is nothing to free. */
static inline void xfrm_sk_free_policy(struct sock *sk)
{
}

/* Nothing to clone; always succeeds with 0. */
static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
{
        return 0;
}

/* Always returns 1. */
static inline int xfrm6_route_forward(struct sk_buff *skb)
{
        return 1;
}

/* Always returns 1. */
static inline int xfrm4_route_forward(struct sk_buff *skb)
{
        return 1;
}
/* !CONFIG_XFRM stub: always returns 1. */
static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
{
        const int accepted = 1;

        return accepted;
}
/* !CONFIG_XFRM stub: always returns 1. */
static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
{
        const int accepted = 1;

        return accepted;
}
/* !CONFIG_XFRM stub: always returns 1, whatever the family. */
static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family)
{
        const int accepted = 1;

        return accepted;
}
/* !CONFIG_XFRM stub: session decoding is unavailable, so this always
 * fails with -ENOSYS and leaves @fl untouched.
 */
static inline int xfrm_decode_session_reverse(struct net *net, struct sk_buff *skb,
                                              struct flowi *fl,
                                              unsigned int family)
{
        const int err = -ENOSYS;

        return err;
}
/* !CONFIG_XFRM stub: always returns 1. */
static inline int xfrm4_policy_check_reverse(struct sock *sk, int dir,
                                             struct sk_buff *skb)
{
        const int accepted = 1;

        return accepted;
}
/* !CONFIG_XFRM stub: always returns 1. */
static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir,
                                             struct sk_buff *skb)
{
        const int accepted = 1;

        return accepted;
}
#endif

/* Return a pointer to the destination address stored in @fl, viewed as
 * an xfrm_address_t, for AF_INET or AF_INET6.  Other families give NULL.
 * The const qualifier of @fl is dropped by the cast.
 */
static __inline__
xfrm_address_t *xfrm_flowi_daddr(const struct flowi *fl, unsigned short family)
{
        if (family == AF_INET)
                return (xfrm_address_t *)&fl->u.ip4.daddr;
        if (family == AF_INET6)
                return (xfrm_address_t *)&fl->u.ip6.daddr;

        return NULL;
}

/* Return a pointer to the source address stored in @fl, viewed as an
 * xfrm_address_t, for AF_INET or AF_INET6.  Other families give NULL.
 * The const qualifier of @fl is dropped by the cast.
 */
static __inline__
xfrm_address_t *xfrm_flowi_saddr(const struct flowi *fl, unsigned short family)
{
        if (family == AF_INET)
                return (xfrm_address_t *)&fl->u.ip4.saddr;
        if (family == AF_INET6)
                return (xfrm_address_t *)&fl->u.ip6.saddr;

        return NULL;
}

/* Copy the source and destination addresses of @fl into @saddr/@daddr.
 *
 * AF_INET copies only the a4 word of each output (via memcpy), AF_INET6
 * assigns the in6 member.  For any other family both outputs are left
 * unmodified.
 */
static __inline__
void xfrm_flowi_addr_get(const struct flowi *fl,
                         xfrm_address_t *saddr, xfrm_address_t *daddr,
                         unsigned short family)
{
        if (family == AF_INET) {
                memcpy(&saddr->a4, &fl->u.ip4.saddr, sizeof(saddr->a4));
                memcpy(&daddr->a4, &fl->u.ip4.daddr, sizeof(daddr->a4));
        } else if (family == AF_INET6) {
                saddr->in6 = fl->u.ip6.saddr;
                daddr->in6 = fl->u.ip6.daddr;
        }
}

/* IPv4 address check of a state against a daddr/saddr pair.
 *
 * Returns 1 when @daddr equals x->id.daddr and the source side is
 * acceptable: either the two source addresses are equal, or at least one
 * of them is zero.  Returns 0 otherwise.
 */
static __inline__ int
__xfrm4_state_addr_check(const struct xfrm_state *x,
                         const xfrm_address_t *daddr, const xfrm_address_t *saddr)
{
        if (daddr->a4 != x->id.daddr.a4)
                return 0;

        return saddr->a4 == x->props.saddr.a4 ||
               !saddr->a4 ||
               !x->props.saddr.a4;
}

/* IPv6 address check of a state against a daddr/saddr pair.
 *
 * Returns 1 when @daddr equals x->id.daddr and the source side is
 * acceptable: either the two source addresses are equal, or at least one
 * of them is the any-address.  Returns 0 otherwise.
 */
static __inline__ int
__xfrm6_state_addr_check(const struct xfrm_state *x,
                         const xfrm_address_t *daddr, const xfrm_address_t *saddr)
{
        const struct in6_addr *dst = (const struct in6_addr *)daddr;
        const struct in6_addr *src = (const struct in6_addr *)saddr;
        const struct in6_addr *state_dst = (const struct in6_addr *)&x->id.daddr;
        const struct in6_addr *state_src = (const struct in6_addr *)&x->props.saddr;

        if (!ipv6_addr_equal(dst, state_dst))
                return 0;

        return ipv6_addr_equal(src, state_src) ||
               ipv6_addr_any(src) ||
               ipv6_addr_any(state_src);
}

/* Family dispatcher for the state address check; families other than
 * AF_INET and AF_INET6 never match (0).
 */
static __inline__ int
xfrm_state_addr_check(const struct xfrm_state *x,
                      const xfrm_address_t *daddr, const xfrm_address_t *saddr,
                      unsigned short family)
{
        if (family == AF_INET)
                return __xfrm4_state_addr_check(x, daddr, saddr);
        if (family == AF_INET6)
                return __xfrm6_state_addr_check(x, daddr, saddr);

        return 0;
}

/* Same check as xfrm_state_addr_check(), but with the addresses taken
 * from the flow @fl (u.ip4 for AF_INET, u.ip6 for AF_INET6).  Families
 * other than those two never match (0).
 */
static __inline__ int
xfrm_state_addr_flow_check(const struct xfrm_state *x, const struct flowi *fl,
                           unsigned short family)
{
        const xfrm_address_t *daddr;
        const xfrm_address_t *saddr;

        if (family == AF_INET) {
                daddr = (const xfrm_address_t *)&fl->u.ip4.daddr;
                saddr = (const xfrm_address_t *)&fl->u.ip4.saddr;
                return __xfrm4_state_addr_check(x, daddr, saddr);
        }
        if (family == AF_INET6) {
                daddr = (const xfrm_address_t *)&fl->u.ip6.daddr;
                saddr = (const xfrm_address_t *)&fl->u.ip6.saddr;
                return __xfrm6_state_addr_check(x, daddr, saddr);
        }

        return 0;
}

/* Return the current value of x->tunnel_users (an atomic read). */
static inline int xfrm_state_kern(const struct xfrm_state *x)
{
        const int users = atomic_read(&x->tunnel_users);

        return users;
}

/* Report whether @proto is one of the accepted protocol numbers: AH,
 * ESP, COMP and, when IPv6 is enabled, ROUTING and DSTOPTS.
 */
static inline bool xfrm_id_proto_valid(u8 proto)
{
        bool valid = false;

        switch (proto) {
        case IPPROTO_AH:
        case IPPROTO_ESP:
        case IPPROTO_COMP:
#if IS_ENABLED(CONFIG_IPV6)
        case IPPROTO_ROUTING:
        case IPPROTO_DSTOPTS:
#endif
                valid = true;
                break;
        default:
                break;
        }

        return valid;
}

/* Match a state protocol against a user-supplied protocol.
 *
 * @userproto == 0 matches everything; an exact equal value matches;
 * IPSEC_PROTO_ANY matches only the three IPsec protocols AH, ESP and
 * COMP.
 */
static inline int xfrm_id_proto_match(u8 proto, u8 userproto)
{
        if (!userproto || proto == userproto)
                return 1;

        if (userproto != IPSEC_PROTO_ANY)
                return 0;

        return proto == IPPROTO_AH ||
               proto == IPPROTO_ESP ||
               proto == IPPROTO_COMP;
}

/*
 * xfrm algorithm information
 *
 * The four *_info structs below are the per-kind members of the
 * "uinfo" union in struct xfrm_algo_desc.
 */

/* AEAD-specific part of an algorithm description. */
struct xfrm_algo_aead_info {
        char *geniv;
        u16 icv_truncbits;
};

/* Authentication-specific part of an algorithm description
 * (member "auth" of the uinfo union in struct xfrm_algo_desc).
 */
struct xfrm_algo_auth_info {
        u16 icv_truncbits;
        u16 icv_fullbits;
};

/* Encryption-specific part of an algorithm description
 * (member "encr" of the uinfo union in struct xfrm_algo_desc).
 */
struct xfrm_algo_encr_info {
        char *geniv;
        u16 blockbits;
        u16 defkeybits;
};

/* Compression-specific part of an algorithm description
 * (member "comp" of the uinfo union in struct xfrm_algo_desc).
 */
struct xfrm_algo_comp_info {
        u16 threshold;
};

/* Description of one algorithm.  Pointers to these are what the
 * xfrm_{aalg,ealg,calg,aead}_get_by*() lookups declared in this file
 * return.
 */
struct xfrm_algo_desc {
        char *name;
        char *compat;
        u8 available:1;                /* one-bit flag */
        u8 pfkey_supported:1;        /* one-bit flag */
        /* Exactly one member applies, depending on the algorithm kind. */
        union {
                struct xfrm_algo_aead_info aead;
                struct xfrm_algo_auth_info auth;
                struct xfrm_algo_encr_info encr;
                struct xfrm_algo_comp_info comp;
        } uinfo;
        struct sadb_alg desc;
};

/* XFRM protocol handlers.  */

/* IPv4 handler set, registered with xfrm4_protocol_register().  Entries
 * are linked through the RCU-annotated ->next pointer and carry a
 * ->priority value.
 */
struct xfrm4_protocol {
        int (*handler)(struct sk_buff *skb);
        int (*input_handler)(struct sk_buff *skb, int nexthdr, __be32 spi,
                             int encap_type);
        int (*cb_handler)(struct sk_buff *skb, int err);
        int (*err_handler)(struct sk_buff *skb, u32 info);

        struct xfrm4_protocol __rcu *next;
        int priority;
};

/* IPv6 handler set, registered with xfrm6_protocol_register().  Same
 * shape as struct xfrm4_protocol except for the richer err_handler
 * signature.
 */
struct xfrm6_protocol {
        int (*handler)(struct sk_buff *skb);
        int (*input_handler)(struct sk_buff *skb, int nexthdr, __be32 spi,
                             int encap_type);
        int (*cb_handler)(struct sk_buff *skb, int err);
        int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
                           u8 type, u8 code, int offset, __be32 info);

        struct xfrm6_protocol __rcu *next;
        int priority;
};

/* XFRM tunnel handlers.  */

/* Handler set taken by xfrm4_tunnel_register()/xfrm4_tunnel_deregister();
 * entries are linked through the RCU-annotated ->next pointer.
 */
struct xfrm_tunnel {
        int (*handler)(struct sk_buff *skb);
        int (*cb_handler)(struct sk_buff *skb, int err);
        int (*err_handler)(struct sk_buff *skb, u32 info);

        struct xfrm_tunnel __rcu *next;
        int priority;
};

/* Handler set taken by xfrm6_tunnel_register()/xfrm6_tunnel_deregister();
 * entries are linked through the RCU-annotated ->next pointer.
 */
struct xfrm6_tunnel {
        int (*handler)(struct sk_buff *skb);
        int (*cb_handler)(struct sk_buff *skb, int err);
        int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
                           u8 type, u8 code, int offset, __be32 info);
        struct xfrm6_tunnel __rcu *next;
        int priority;
};

/* Initialisation/teardown entry points (declarations only; the
 * definitions are not part of this chunk).
 */
void xfrm_init(void);
void xfrm4_init(void);
int xfrm_state_init(struct net *net);
void xfrm_state_fini(struct net *net);
void xfrm4_state_init(void);
void xfrm4_protocol_init(void);
#ifdef CONFIG_XFRM
/* IPv6 init/fini entry points, declared only when CONFIG_XFRM is set.
 * The #else branch supplies inline stubs for xfrm6_init() and
 * xfrm6_fini() only.
 */
int xfrm6_init(void);
void xfrm6_fini(void);
int xfrm6_state_init(void);
void xfrm6_state_fini(void);
int xfrm6_protocol_init(void);
void xfrm6_protocol_fini(void);
#else
/* !CONFIG_XFRM stub: nothing to initialise, reports success (0). */
static inline int xfrm6_init(void)
{
        const int ok = 0;

        return ok;
}
/* !CONFIG_XFRM stub: nothing to tear down. */
static inline void xfrm6_fini(void)
{
}
#endif

#ifdef CONFIG_XFRM_STATISTICS
/* Per-netns proc setup/teardown; declared only with
 * CONFIG_XFRM_STATISTICS.  Definitions are not part of this chunk.
 */
int xfrm_proc_init(struct net *net);
void xfrm_proc_fini(struct net *net);
#endif

/* Per-netns sysctl setup; declared unconditionally (definition not in
 * this chunk).
 */
int xfrm_sysctl_init(struct net *net);
#ifdef CONFIG_SYSCTL
void xfrm_sysctl_fini(struct net *net);
#else
/* Without CONFIG_SYSCTL the teardown is an empty inline. */
static inline void xfrm_sysctl_fini(struct net *net)
{
}
#endif

/* xfrm_state walker, allocation and lookup entry points (declarations
 * only; the definitions are not part of this chunk).
 */
void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto,
                          struct xfrm_address_filter *filter);
int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk,
                    int (*func)(struct xfrm_state *, int, void*), void *);
void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net);
struct xfrm_state *xfrm_state_alloc(struct net *net);
void xfrm_state_free(struct xfrm_state *x);
struct xfrm_state *xfrm_state_find(const xfrm_address_t *daddr,
                                   const xfrm_address_t *saddr,
                                   const struct flowi *fl,
                                   struct xfrm_tmpl *tmpl,
                                   struct xfrm_policy *pol, int *err,
                                   unsigned short family, u32 if_id);
struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id,
                                       xfrm_address_t *daddr,
                                       xfrm_address_t *saddr,
                                       unsigned short family,
                                       u8 mode, u8 proto, u32 reqid);
struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi,
                                              unsigned short family);
int xfrm_state_check_expire(struct xfrm_state *x);
void xfrm_state_update_stats(struct net *net);
#ifdef CONFIG_XFRM_OFFLOAD
/* Invoke the device's xdo_dev_state_update_stats() callback for @x.
 *
 * Nothing happens unless x->xso.dev is set, the device has xfrmdev_ops,
 * and those ops provide the callback.
 */
static inline void xfrm_dev_state_update_stats(struct xfrm_state *x)
{
        struct net_device *dev = x->xso.dev;

        if (!dev || !dev->xfrmdev_ops)
                return;
        if (!dev->xfrmdev_ops->xdo_dev_state_update_stats)
                return;

        dev->xfrmdev_ops->xdo_dev_state_update_stats(x);
}
#else
/* !CONFIG_XFRM_OFFLOAD stub: no device to query, so do nothing. */
static inline void xfrm_dev_state_update_stats(struct xfrm_state *x)
{
}
#endif
/* xfrm_state insertion, update and lookup entry points (declarations
 * only; the definitions are not part of this chunk).
 */
void xfrm_state_insert(struct xfrm_state *x);
int xfrm_state_add(struct xfrm_state *x);
int xfrm_state_update(struct xfrm_state *x);
struct xfrm_state *xfrm_state_lookup(struct net *net, u32 mark,
                                     const xfrm_address_t *daddr, __be32 spi,
                                     u8 proto, unsigned short family);
struct xfrm_state *xfrm_state_lookup_byaddr(struct net *net, u32 mark,
                                            const xfrm_address_t *daddr,
                                            const xfrm_address_t *saddr,
                                            u8 proto,
                                            unsigned short family);
#ifdef CONFIG_XFRM_SUB_POLICY
/* Out-of-line sort helpers, declared only with CONFIG_XFRM_SUB_POLICY;
 * the #else branch provides empty inline versions.
 */
void xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
                    unsigned short family);
void xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
                     unsigned short family);
#else
/* !CONFIG_XFRM_SUB_POLICY stub: does nothing; in particular @d is not
 * written.
 */
static inline void xfrm_tmpl_sort(struct xfrm_tmpl **d, struct xfrm_tmpl **s,
                                  int n, unsigned short family)
{
        /* intentionally empty */
}

/* !CONFIG_XFRM_SUB_POLICY stub: does nothing; in particular @d is not
 * written.
 */
static inline void xfrm_state_sort(struct xfrm_state **d, struct xfrm_state **s,
                                   int n, unsigned short family)
{
        /* intentionally empty */
}
#endif

/* Counters filled in through xfrm_sad_getinfo(). */
struct xfrmk_sadinfo {
        u32 sadhcnt; /* current hash bkts */
        u32 sadhmcnt; /* max allowed hash bkts */
        u32 sadcnt; /* current running count */
};

/* Counters filled in through xfrm_spd_getinfo().
 * NOTE(review): the in/out/fwd prefixes presumably map to the policy
 * directions and spdh* to hash bucket counts -- confirm against the
 * definition of xfrm_spd_getinfo().
 */
struct xfrmk_spdinfo {
        u32 incnt;
        u32 outcnt;
        u32 fwdcnt;
        u32 inscnt;
        u32 outscnt;
        u32 fwdscnt;
        u32 spdhcnt;
        u32 spdhmcnt;
};

/* State management, replay, and input/output path entry points
 * (declarations only; the definitions are not part of this chunk).
 */
struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq);
int xfrm_state_delete(struct xfrm_state *x);
int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync);
int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid);
int xfrm_dev_policy_flush(struct net *net, struct net_device *dev,
                          bool task_valid);
void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si);
void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si);
u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq);
int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack);
u32 xfrm_state_mtu(struct xfrm_state *x, int mtu);
int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
                      struct netlink_ext_ack *extack);
int xfrm_init_state(struct xfrm_state *x);
int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type);
int xfrm_input_resume(struct sk_buff *skb, int nexthdr);
int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb,
                         int (*finish)(struct net *, struct sock *,
                                       struct sk_buff *));
int xfrm_trans_queue(struct sk_buff *skb,
                     int (*finish)(struct net *, struct sock *,
                                   struct sk_buff *));
int xfrm_output_resume(struct sock *sk, struct sk_buff *skb, int err);
int xfrm_output(struct sock *sk, struct sk_buff *skb);

#if IS_ENABLED(CONFIG_NET_PKTGEN)
/* Declared only when CONFIG_NET_PKTGEN is enabled; definition not in
 * this chunk.
 */
int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb);
#endif

/* Local-error reporting and IPv4 receive entry points (declarations
 * only; the definitions are not part of this chunk).
 */
void xfrm_local_error(struct sk_buff *skb, int mtu);
int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
                    int encap_type);
int xfrm4_transport_finish(struct sk_buff *skb, int async);
int xfrm4_rcv(struct sk_buff *skb);

/* Hand an IPv4 packet with a known SPI to xfrm_input().
 *
 * Before the call the skb control block is primed: the tunnel pointer
 * is cleared, the family is set to AF_INET and daddroff is set to the
 * offset of daddr inside struct iphdr.  xfrm_input() is invoked with
 * encap_type 0 and its result is returned.
 */
static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
{
        const int encap_type = 0;

        /* Keep this store order: the two CB views may share storage. */
        XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
        XFRM_SPI_SKB_CB(skb)->family = AF_INET;
        XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);

        return xfrm_input(skb, nexthdr, spi, encap_type);
}

/* IPv4/IPv6 output, receive and handler (de)registration entry points
 * (declarations only; the definitions are not part of this chunk).
 */
int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol);
int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char protocol);
int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family);
int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family);
void xfrm4_local_error(struct sk_buff *skb, u32 mtu);
int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi,
                  struct ip6_tnl *t);
int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
                    int encap_type);
int xfrm6_transport_finish(struct sk_buff *skb, int async);
int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t);
int xfrm6_rcv(struct sk_buff *skb);
int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
                     xfrm_address_t *saddr, u8 proto);
void xfrm6_local_error(struct sk_buff *skb, u32 mtu);
int xfrm6_protocol_register(struct xfrm6_protocol *handler, unsigned char protocol);
int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, unsigned char protocol);
int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family);
int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family);
__be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr);
__be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr);
int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb);

#ifdef CONFIG_XFRM
/* Declared only with CONFIG_XFRM (definitions not in this chunk).  Of
 * these, only xfrm_user_policy() has an inline fallback in the #else
 * branch.
 */
void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu);
int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb);
int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb);
struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head,
                                        struct sk_buff *skb);
struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head,
                                        struct sk_buff *skb);
int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval,
                     int optlen);
#else
/* !CONFIG_XFRM stub: the option is not supported, so always fail with
 * -ENOPROTOOPT without looking at the arguments.
 */
static inline int xfrm_user_policy(struct sock *sk, int optname,
                                   sockptr_t optval, int optlen)
{
        const int err = -ENOPROTOOPT;

        return err;
}
#endif

/* Declaration only; the definition is not part of this chunk. */
struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif,
                                    const xfrm_address_t *saddr,
                                    const xfrm_address_t *daddr,
                                    int family, u32 mark);

/* Policy allocation, walking, lookup and SPI/acquire entry points
 * (declarations only; the definitions are not part of this chunk).
 */
struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp);

void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type);
int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
                     int (*func)(struct xfrm_policy *, int, int, void*),
                     void *);
void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net);
int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);
struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net,
                                          const struct xfrm_mark *mark,
                                          u32 if_id, u8 type, int dir,
                                          struct xfrm_selector *sel,
                                          struct xfrm_sec_ctx *ctx, int delete,
                                          int *err);
struct xfrm_policy *xfrm_policy_byid(struct net *net,
                                     const struct xfrm_mark *mark, u32 if_id,
                                     u8 type, int dir, u32 id, int delete,
                                     int *err);
int xfrm_policy_flush(struct net *net, u8 type, bool task_valid);
void xfrm_policy_hash_rebuild(struct net *net);
u32 xfrm_get_acqseq(void);
int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack);
int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi,
                   struct netlink_ext_ack *extack);
struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark,
                                 u8 mode, u32 reqid, u32 if_id, u8 proto,
                                 const xfrm_address_t *daddr,
                                 const xfrm_address_t *saddr, int create,
                                 unsigned short family);
int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol);

#ifdef CONFIG_XFRM_MIGRATE
/* Migration entry points, declared only with CONFIG_XFRM_MIGRATE
 * (definitions not in this chunk).
 */
int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
               const struct xfrm_migrate *m, int num_bundles,
               const struct xfrm_kmaddress *k,
               const struct xfrm_encap_tmpl *encap);
struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net,
                                                u32 if_id);
struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x,
                                      struct xfrm_migrate *m,
                                      struct xfrm_encap_tmpl *encap);
int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
                 struct xfrm_migrate *m, int num_bundles,
                 struct xfrm_kmaddress *k, struct net *net,
                 struct xfrm_encap_tmpl *encap, u32 if_id,
                 struct netlink_ext_ack *extack);
#endif

/* km_* notification hooks and input-path helpers (declarations only; the
 * definitions are not part of this chunk).
 */
int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport);
void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 portid);
int km_report(struct net *net, u8 proto, struct xfrm_selector *sel,
              xfrm_address_t *addr);

void xfrm_input_init(void);
int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq);

/* Algorithm table probing and lookup entry points; the lookups return
 * struct xfrm_algo_desc pointers (declarations only; the definitions are
 * not part of this chunk).
 */
void xfrm_probe_algs(void);
int xfrm_count_pfkey_auth_supported(void);
int xfrm_count_pfkey_enc_supported(void);
struct xfrm_algo_desc *xfrm_aalg_get_byidx(unsigned int idx);
struct xfrm_algo_desc *xfrm_ealg_get_byidx(unsigned int idx);
struct xfrm_algo_desc *xfrm_aalg_get_byid(int alg_id);
struct xfrm_algo_desc *xfrm_ealg_get_byid(int alg_id);
struct xfrm_algo_desc *xfrm_calg_get_byid(int alg_id);
struct xfrm_algo_desc *xfrm_aalg_get_byname(const char *name, int probe);
struct xfrm_algo_desc *xfrm_ealg_get_byname(const char *name, int probe);
struct xfrm_algo_desc *xfrm_calg_get_byname(const char *name, int probe);
struct xfrm_algo_desc *xfrm_aead_get_byname(const char *name, int icv_len,
                                            int probe);

/* Compare two xfrm addresses as IPv6 addresses via ipv6_addr_equal(). */
static inline bool xfrm6_addr_equal(const xfrm_address_t *a,
                                    const xfrm_address_t *b)
{
        const struct in6_addr *lhs = (const struct in6_addr *)a;
        const struct in6_addr *rhs = (const struct in6_addr *)b;

        return ipv6_addr_equal(lhs, rhs);
}

/* Compare two xfrm addresses for @family.
 *
 * AF_INET6 compares the full IPv6 address.  AF_INET and every other
 * family value compare only the a4 word.
 */
static inline bool xfrm_addr_equal(const xfrm_address_t *a,
                                   const xfrm_address_t *b,
                                   sa_family_t family)
{
        if (family == AF_INET6)
                return xfrm6_addr_equal(a, b);

        /* AF_INET, and the fallback for unknown families. */
        return (__force u32)a->a4 == (__force u32)b->a4;
}

/* Extract the direction from a policy index: its three lowest bits. */
static inline int xfrm_policy_id2dir(u32 index)
{
        const u32 dir_mask = 7;

        return index & dir_mask;
}

#ifdef CONFIG_XFRM
/* Replay-protection entry points, declared only with CONFIG_XFRM
 * (definitions not in this chunk).
 */
void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq);
int xfrm_replay_check(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq);
void xfrm_replay_notify(struct xfrm_state *x, int event);
int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb);
int xfrm_replay_recheck(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq);

/*
 * Report whether the XFRMNLGRP_AEVENTS netlink group has listeners.
 *
 * Reads net->xfrm.nlsk under rcu_read_lock() and returns the result of
 * netlink_has_listeners() for it, or 0 when the socket pointer is NULL.
 */
static inline int xfrm_aevent_is_on(struct net *net)
{
        struct sock *nlsk;
        int listeners;

        rcu_read_lock();
        nlsk = rcu_dereference(net->xfrm.nlsk);
        listeners = nlsk ? netlink_has_listeners(nlsk, XFRMNLGRP_AEVENTS) : 0;
        rcu_read_unlock();

        return listeners;
}

/*
 * Report whether the XFRMNLGRP_ACQUIRE netlink group has listeners.
 *
 * Reads net->xfrm.nlsk under rcu_read_lock() and returns the result of
 * netlink_has_listeners() for it, or 0 when the socket pointer is NULL.
 */
static inline int xfrm_acquire_is_on(struct net *net)
{
        struct sock *nlsk;
        int listeners;

        rcu_read_lock();
        nlsk = rcu_dereference(net->xfrm.nlsk);
        listeners = nlsk ? netlink_has_listeners(nlsk, XFRMNLGRP_ACQUIRE) : 0;
        rcu_read_unlock();

        return listeners;
}
#endif

static inline unsigned int aead_len(struct xfrm_algo_aead *alg)
{
        return sizeof(*alg) + ((alg->alg_key_len + 7) / 8);
}

/*
 * Size in bytes of an algorithm descriptor including its key:
 * sizeof(*alg) plus alg_key_len divided by 8, rounded up.
 */
static inline unsigned int xfrm_alg_len(const struct xfrm_algo *alg)
{
        const size_t header = sizeof(*alg);

        return header + (alg->alg_key_len + 7) / 8;
}

/*
 * Size in bytes of an authentication algorithm descriptor including its
 * key: sizeof(*alg) plus alg_key_len divided by 8, rounded up.
 */
static inline unsigned int xfrm_alg_auth_len(const struct xfrm_algo_auth *alg)
{
        const size_t header = sizeof(*alg);

        return header + (alg->alg_key_len + 7) / 8;
}

static inline unsigned int xfrm_replay_state_esn_len(struct xfrm_replay_state_esn *replay_esn)
{
        return sizeof(*replay_esn) + replay_esn->bmp_len * sizeof(__u32);
}

#ifdef CONFIG_XFRM_MIGRATE
/*
 * Duplicate the ESN replay states of @orig into @x.
 *
 * Both orig->replay_esn and orig->preplay_esn are dereferenced to compute
 * their length, so they must be non-NULL.
 *
 * Returns 0 on success or -ENOMEM if either kmemdup() fails. When the
 * second allocation fails, x->replay_esn stays attached to @x.
 * NOTE(review): presumably the caller releases @x (and with it the first
 * copy) on error - confirm.
 */
static inline int xfrm_replay_clone(struct xfrm_state *x,
                                     struct xfrm_state *orig)
{
        x->replay_esn = kmemdup(orig->replay_esn,
                                xfrm_replay_state_esn_len(orig->replay_esn),
                                GFP_KERNEL);
        if (x->replay_esn) {
                x->preplay_esn = kmemdup(orig->preplay_esn,
                                         xfrm_replay_state_esn_len(orig->preplay_esn),
                                         GFP_KERNEL);
                if (x->preplay_esn)
                        return 0;
        }

        return -ENOMEM;
}

/*
 * Return a GFP_KERNEL kmemdup() copy of @orig sized by aead_len(), or
 * whatever kmemdup() returns on failure.
 */
static inline struct xfrm_algo_aead *xfrm_algo_aead_clone(struct xfrm_algo_aead *orig)
{
        const unsigned int len = aead_len(orig);

        return kmemdup(orig, len, GFP_KERNEL);
}


/*
 * Return a GFP_KERNEL kmemdup() copy of @orig sized by xfrm_alg_len(), or
 * whatever kmemdup() returns on failure.
 */
static inline struct xfrm_algo *xfrm_algo_clone(struct xfrm_algo *orig)
{
        const unsigned int len = xfrm_alg_len(orig);

        return kmemdup(orig, len, GFP_KERNEL);
}

/*
 * Return a GFP_KERNEL kmemdup() copy of @orig sized by
 * xfrm_alg_auth_len(), or whatever kmemdup() returns on failure.
 */
static inline struct xfrm_algo_auth *xfrm_algo_auth_clone(struct xfrm_algo_auth *orig)
{
        const unsigned int len = xfrm_alg_auth_len(orig);

        return kmemdup(orig, len, GFP_KERNEL);
}

/* Call xfrm_state_put() on the first @n entries of @states, in order. */
static inline void xfrm_states_put(struct xfrm_state **states, int n)
{
        int idx = 0;

        while (idx < n) {
                xfrm_state_put(states[idx]);
                idx++;
        }
}

/* Call xfrm_state_delete() on the first @n entries of @states, in order. */
static inline void xfrm_states_delete(struct xfrm_state **states, int n)
{
        int idx = 0;

        while (idx < n) {
                xfrm_state_delete(states[idx]);
                idx++;
        }
}
#endif

/* Init-time setup for the xfrm device layer; declared unconditionally. */
void __init xfrm_dev_init(void);

#ifdef CONFIG_XFRM_OFFLOAD
/*
 * Offload entry points, available only with CONFIG_XFRM_OFFLOAD.
 * The #else branch of this conditional provides no-op fallbacks with the
 * same names.
 */
void xfrm_dev_resume(struct sk_buff *skb);
void xfrm_dev_backlog(struct softnet_data *sd);
struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again);
int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
                       struct xfrm_user_offload *xuo,
                       struct netlink_ext_ack *extack);
int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
                        struct xfrm_user_offload *xuo, u8 dir,
                        struct netlink_ext_ack *extack);
bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x);

static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x)
{
        struct xfrm_dev_offload *xso = &x->xso;

        if (xso->dev && xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn)
                xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn(x);
}

/*
 * Decide whether @dst qualifies for offload.
 *
 * Returns false when dst->xfrm is NULL or has no type_offload. Otherwise
 * @dst is treated as a struct xfrm_dst and the result is true when its
 * child carries no xfrm state and either
 *   - the state has no offload handle, or
 *   - the state's offload device equals the device of xfrm_dst_path(dst).
 */
static inline bool xfrm_dst_offload_ok(struct dst_entry *dst)
{
        struct xfrm_state *x = dst->xfrm;
        struct xfrm_dst *xdst;

        if (!x || !x->type_offload)
                return false;

        xdst = (struct xfrm_dst *) dst;

        /* No hardware handle: only the child check matters. */
        if (!x->xso.offload_handle)
                return !xdst->child->xfrm;

        /* With a handle, the path must go through the offload device. */
        if (x->xso.dev != xfrm_dst_path(dst)->dev)
                return false;

        return !xdst->child->xfrm;
}

static inline void xfrm_dev_state_delete(struct xfrm_state *x)
{
        struct xfrm_dev_offload *xso = &x->xso;

        if (xso->dev)
                xso->dev->xfrmdev_ops->xdo_dev_state_delete(x);
}

/*
 * Detach @x from its offload device.
 *
 * When the device exists and has xfrmdev_ops: call the optional
 * xdo_dev_state_free() callback, clear xso->dev, reset xso->type to
 * XFRM_DEV_OFFLOAD_UNSPECIFIED and drop the tracked device reference with
 * netdev_put(). Otherwise nothing is touched.
 */
static inline void xfrm_dev_state_free(struct xfrm_state *x)
{
        struct xfrm_dev_offload *xso = &x->xso;
        struct net_device *dev = xso->dev;

        if (!dev || !dev->xfrmdev_ops)
                return;

        if (dev->xfrmdev_ops->xdo_dev_state_free)
                dev->xfrmdev_ops->xdo_dev_state_free(x);

        xso->dev = NULL;
        xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
        netdev_put(dev, &xso->dev_tracker);
}

/*
 * Invoke the offload device's xdo_dev_policy_delete() callback for policy
 * @x, if the device, its xfrmdev_ops and the callback are all present.
 */
static inline void xfrm_dev_policy_delete(struct xfrm_policy *x)
{
        struct net_device *dev = x->xdo.dev;

        if (!dev || !dev->xfrmdev_ops)
                return;

        if (dev->xfrmdev_ops->xdo_dev_policy_delete)
                dev->xfrmdev_ops->xdo_dev_policy_delete(x);
}

/*
 * Detach policy @x from its offload device.
 *
 * When the device exists and has xfrmdev_ops: call the optional
 * xdo_dev_policy_free() callback, clear xdo->dev and drop the tracked
 * device reference with netdev_put(). Otherwise nothing is touched.
 */
static inline void xfrm_dev_policy_free(struct xfrm_policy *x)
{
        struct xfrm_dev_offload *xdo = &x->xdo;
        struct net_device *dev = xdo->dev;

        if (!dev || !dev->xfrmdev_ops)
                return;

        if (dev->xfrmdev_ops->xdo_dev_policy_free)
                dev->xfrmdev_ops->xdo_dev_policy_free(x);

        xdo->dev = NULL;
        netdev_put(dev, &xdo->dev_tracker);
}
#else
/*
 * Fallbacks used when CONFIG_XFRM_OFFLOAD is not set: every helper is a
 * no-op, the add operations report success (0), the "offload ok"
 * predicates answer false, and validate_xmit_xfrm() hands the skb back
 * untouched.
 */
static inline void xfrm_dev_resume(struct sk_buff *skb)
{
}

static inline void xfrm_dev_backlog(struct softnet_data *sd)
{
}

/* Returns @skb unchanged; @features and @again are ignored. */
static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again)
{
        return skb;
}

/* Always succeeds without doing anything. */
static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack)
{
        return 0;
}

static inline void xfrm_dev_state_delete(struct xfrm_state *x)
{
}

static inline void xfrm_dev_state_free(struct xfrm_state *x)
{
}

/* Always succeeds without doing anything. */
static inline int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
                                      struct xfrm_user_offload *xuo, u8 dir,
                                      struct netlink_ext_ack *extack)
{
        return 0;
}

static inline void xfrm_dev_policy_delete(struct xfrm_policy *x)
{
}

static inline void xfrm_dev_policy_free(struct xfrm_policy *x)
{
}

/* Without offload support nothing can be offloaded. */
static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
{
        return false;
}

static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x)
{
}

/* Without offload support nothing can be offloaded. */
static inline bool xfrm_dst_offload_ok(struct dst_entry *dst)
{
        return false;
}
#endif

/*
 * Fill @m from the XFRMA_MARK netlink attribute.
 *
 * When attrs[XFRMA_MARK] is present its payload is copied into @m
 * (sizeof(struct xfrm_mark) bytes); otherwise both v and m are zeroed.
 * Returns m->v & m->m.
 */
static inline int xfrm_mark_get(struct nlattr **attrs, struct xfrm_mark *m)
{
        struct nlattr *attr = attrs[XFRMA_MARK];

        if (!attr) {
                m->m = 0;
                m->v = 0;
        } else {
                memcpy(m, nla_data(attr), sizeof(struct xfrm_mark));
        }

        return m->v & m->m;
}

/*
 * Emit @m as an XFRMA_MARK attribute on @skb.
 *
 * An all-zero mark (both v and m zero) is skipped and 0 is returned;
 * otherwise the return value is that of nla_put().
 */
static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m)
{
        if (!(m->m | m->v))
                return 0;

        return nla_put(skb, XFRMA_MARK, sizeof(struct xfrm_mark), m);
}

/*
 * Combine @mark with the state's props.smark: bits selected by the smark
 * mask come from the smark value, all other bits are kept from @mark.
 */
static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x)
{
        const struct xfrm_mark *smark = &x->props.smark;
        __u32 forced = smark->v & smark->m;
        __u32 kept = mark & ~smark->m;

        return forced | kept;
}

/*
 * Emit @if_id as an XFRMA_IF_ID u32 attribute on @skb.
 *
 * A zero @if_id is skipped and 0 is returned; otherwise the return value
 * is that of nla_put_u32().
 */
static inline int xfrm_if_id_put(struct sk_buff *skb, __u32 if_id)
{
        return if_id ? nla_put_u32(skb, XFRMA_IF_ID, if_id) : 0;
}

/*
 * Reject tunnel packets handled by a non-tunnel-mode state.
 *
 * The skb counts as a tunnel packet when the tunnel pointer for @family
 * (tunnel.ip4 for AF_INET, tunnel.ip6 for AF_INET6) in its control block
 * is set; other families never do. Returns -EINVAL for a tunnel packet
 * whose state lacks XFRM_MODE_FLAG_TUNNEL in outer_mode.flags, else 0.
 */
static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x,
                                    unsigned int family)
{
        bool tunnel = false;

        if (family == AF_INET) {
                if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4)
                        tunnel = true;
        } else if (family == AF_INET6) {
                if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6)
                        tunnel = true;
        }

        if (!tunnel)
                return 0;
        if (x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL)
                return 0;

        return -EINVAL;
}

/*
 * Tables defined elsewhere: one int per message type and one netlink
 * attribute policy per attribute id (0..XFRMA_MAX).
 */
extern const int xfrm_msg_min[XFRM_NR_MSGTYPES];
extern const struct nla_policy xfrma_policy[XFRMA_MAX+1];

/*
 * Callback table for a 32-bit/64-bit compat translator; see the
 * per-callback comments below. @owner is the module providing it.
 */
struct xfrm_translator {
        /* Allocate frag_list and put compat translation there */
        int (*alloc_compat)(struct sk_buff *skb, const struct nlmsghdr *src);

        /* Allocate nlmsg with 64-bit translation of received 32-bit message */
        struct nlmsghdr *(*rcv_msg_compat)(const struct nlmsghdr *nlh,
                        int maxtype, const struct nla_policy *policy,
                        struct netlink_ext_ack *extack);

        /* Translate 32-bit user_policy from sockptr */
        int (*xlate_user_policy_sockptr)(u8 **pdata32, int optlen);

        struct module *owner;
};

/*
 * Translator registration and lookup. Without CONFIG_XFRM_USER_COMPAT
 * only get/put exist, as stubs: get returns NULL and put does nothing.
 */
#if IS_ENABLED(CONFIG_XFRM_USER_COMPAT)
extern int xfrm_register_translator(struct xfrm_translator *xtr);
extern int xfrm_unregister_translator(struct xfrm_translator *xtr);
extern struct xfrm_translator *xfrm_get_translator(void);
extern void xfrm_put_translator(struct xfrm_translator *xtr);
#else
static inline struct xfrm_translator *xfrm_get_translator(void)
{
        return NULL;
}
static inline void xfrm_put_translator(struct xfrm_translator *xtr)
{
}
#endif

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Report the DONTFRAG bit of an IPv6 UDP or RAW socket.
 *
 * Returns false for a NULL socket, a socket whose family is not AF_INET6,
 * or any protocol other than IPPROTO_UDP / IPPROTO_RAW; otherwise returns
 * inet6_test_bit(DONTFRAG, sk).
 */
static inline bool xfrm6_local_dontfrag(const struct sock *sk)
{
        if (!sk)
                return false;
        if (sk->sk_family != AF_INET6)
                return false;

        switch (sk->sk_protocol) {
        case IPPROTO_UDP:
        case IPPROTO_RAW:
                return inet6_test_bit(DONTFRAG, sk);
        default:
                return false;
        }
}
#endif

/*
 * BPF hooks for the xfrm interface are declared only when BTF is
 * available for the way CONFIG_XFRM_INTERFACE is built (built-in needs
 * DEBUG_INFO_BTF, module needs DEBUG_INFO_BTF_MODULES). Otherwise
 * registration is a stub that reports success.
 */
#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
    (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))

extern struct metadata_dst __percpu *xfrm_bpf_md_dst;

int register_xfrm_interface_bpf(void);

#else

static inline int register_xfrm_interface_bpf(void)
{
        return 0;
}

#endif

/*
 * xfrm state BPF registration: declared with CONFIG_DEBUG_INFO_BTF,
 * otherwise a stub that reports success.
 */
#if IS_ENABLED(CONFIG_DEBUG_INFO_BTF)
int register_xfrm_state_bpf(void);
#else
static inline int register_xfrm_state_bpf(void)
{
        return 0;
}
#endif

#endif        /* _NET_XFRM_H */


















































































































































































    1 


























    2 
    2 




























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#ifndef __NET_MPTCP_H
#define __NET_MPTCP_H

#include <linux/skbuff.h>
#include <linux/tcp.h>
#include <linux/types.h>

/* Forward declarations: only pointers to these types are used below. */
struct mptcp_info;
struct mptcp_sock;
struct seq_file;

/*
 * MPTCP sk_buff extension data.
 *
 * mptcp_ext_matches() compares two of these with memcmp() over the whole
 * struct, so every byte (padding included) takes part in the comparison.
 */
struct mptcp_ext {
        /* 64-bit and 32-bit views of the data ack share storage. */
        union {
                u64        data_ack;
                u32        data_ack32;
        };
        u64                data_seq;
        u32                subflow_seq;
        u16                data_len;
        __sum16                csum;
        /* One-bit flags; frozen is set by mptcp_skb_ext_copy(). */
        u8                use_map:1,
                        dsn64:1,
                        data_fin:1,
                        use_ack:1,
                        ack64:1,
                        mpc_map:1,
                        frozen:1,
                        reset_transient:1;
        /* 4-bit reset reason followed by two more one-bit flags. */
        u8                reset_reason:4,
                        csum_reqd:1,
                        infinite_map:1;
};

/* Size of the hmac[] array in struct mptcp_out_options. */
#define MPTCPOPT_HMAC_LEN        20
/* Capacity of the ids[] array in struct mptcp_rm_list. */
#define MPTCP_RM_IDS_MAX        8

/*
 * Fixed-capacity list of u8 ids.
 * NOTE(review): nr is presumably the number of valid entries in ids[] -
 * confirm against the users.
 */
struct mptcp_rm_list {
        u8 ids[MPTCP_RM_IDS_MAX];
        u8 nr;
};

/*
 * An address with id, family and port. The IPv4 and IPv6 address share
 * storage; the IPv6 member exists only with CONFIG_MPTCP_IPV6.
 * NOTE(review): family presumably selects which union member is valid -
 * confirm.
 */
struct mptcp_addr_info {
        u8                        id;
        sa_family_t                family;
        __be16                        port;
        union {
                struct in_addr        addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
                struct in6_addr        addr6;
#endif
        };
};

/*
 * Outgoing MPTCP option state. The struct is empty unless CONFIG_MPTCP
 * is enabled. The four anonymous structs in the union overlap in memory.
 * NOTE(review): which of them is valid presumably depends on suboptions -
 * confirm against the option-writing code.
 */
struct mptcp_out_options {
#if IS_ENABLED(CONFIG_MPTCP)
        u16 suboptions;
        struct mptcp_rm_list rm_list;
        u8 join_id;
        u8 backup;
        u8 reset_reason:4,
           reset_transient:1,
           csum_reqd:1,
           allow_join_id0:1;
        union {
                /* keys plus a data mapping (sequence, length, checksum) */
                struct {
                        u64 sndr_key;
                        u64 rcvr_key;
                        u64 data_seq;
                        u32 subflow_seq;
                        u16 data_len;
                        __sum16 csum;
                };
                /* an address together with its ahmac value */
                struct {
                        struct mptcp_addr_info addr;
                        u64 ahmac;
                };
                /* a copy of an skb extension plus fail_seq */
                struct {
                        struct mptcp_ext ext_copy;
                        u64 fail_seq;
                };
                /* nonce/token and HMAC material */
                struct {
                        u32 nonce;
                        u32 token;
                        u64 thmac;
                        u8 hmac[MPTCPOPT_HMAC_LEN];
                };
        };
#endif
};

/* Size of the name[] buffer in struct mptcp_sched_ops. */
#define MPTCP_SCHED_NAME_MAX        16
#define MPTCP_SCHED_MAX                128
/* Buffer size: MPTCP_SCHED_MAX names of MPTCP_SCHED_NAME_MAX bytes each. */
#define MPTCP_SCHED_BUF_MAX        (MPTCP_SCHED_NAME_MAX * MPTCP_SCHED_MAX)

/* Capacity of contexts[] in struct mptcp_sched_data. */
#define MPTCP_SUBFLOWS_MAX        8

/*
 * Data handed to a scheduler's get_subflow() callback.
 * NOTE(review): subflows is presumably the number of valid entries in
 * contexts[] - confirm.
 */
struct mptcp_sched_data {
        bool        reinject;
        u8        subflows;
        struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX];
};

/*
 * Callback table describing one packet scheduler: a get_subflow()
 * callback, a name, the owning module, a list linkage and init/release
 * callbacks taking the MPTCP socket. Cache-line aligned on SMP builds.
 */
struct mptcp_sched_ops {
        int (*get_subflow)(struct mptcp_sock *msk,
                           struct mptcp_sched_data *data);

        char                        name[MPTCP_SCHED_NAME_MAX];
        struct module                *owner;
        struct list_head        list;

        void (*init)(struct mptcp_sock *msk);
        void (*release)(struct mptcp_sock *msk);
} ____cacheline_aligned_in_smp;

#ifdef CONFIG_MPTCP
/* Defined elsewhere; the !CONFIG_MPTCP branch provides an empty stub. */
void mptcp_init(void);

static inline bool sk_is_mptcp(const struct sock *sk)
{
        return tcp_sk(sk)->is_mptcp;
}

static inline bool rsk_is_mptcp(const struct request_sock *req)
{
        return tcp_rsk(req)->is_mptcp;
}

static inline bool rsk_drop_req(const struct request_sock *req)
{
        return tcp_rsk(req)->is_mptcp && tcp_rsk(req)->drop_req;
}

/*
 * MPTCP entry points declared here and defined elsewhere. The
 * !CONFIG_MPTCP branch of this header provides inline fallbacks for most
 * of them.
 */
void mptcp_space(const struct sock *ssk, int *space, int *full_space);

/*
 * Option builders for SYN, SYN-ACK and established-state segments; each
 * takes the option state in @opts and a @size pointer.
 * NOTE(review): the meaning of the bool result is not visible here -
 * confirm.
 */
bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
                       unsigned int *size, struct mptcp_out_options *opts);
bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
                          struct mptcp_out_options *opts);
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
                               unsigned int *size, unsigned int remaining,
                               struct mptcp_out_options *opts);
bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb);

void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp,
                         struct mptcp_out_options *opts);

void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info);

/* Move the skb extension ownership from @from to @to, with the
 * assumption that @to is newly allocated.
 *
 * Nothing happens unless @from carries an SKB_EXT_MPTCP extension. If
 * @to unexpectedly already has active extensions, a one-time warning is
 * raised and they are released before the transfer.
 */
static inline void mptcp_skb_ext_move(struct sk_buff *to,
                                      struct sk_buff *from)
{
        if (skb_ext_exist(from, SKB_EXT_MPTCP)) {
                if (WARN_ON_ONCE(to->active_extensions))
                        skb_ext_put(to);

                /* Hand over both the bitmap and the extension pointer. */
                to->active_extensions = from->active_extensions;
                to->extensions = from->extensions;
                from->active_extensions = 0;
        }
}

/*
 * Copy the skb extensions of @from to @to, but only when @from carries
 * an SKB_EXT_MPTCP extension; that extension is marked frozen first.
 */
static inline void mptcp_skb_ext_copy(struct sk_buff *to,
                                      struct sk_buff *from)
{
        struct mptcp_ext *ext = skb_ext_find(from, SKB_EXT_MPTCP);

        if (ext) {
                ext->frozen = 1;
                skb_ext_copy(to, from);
        }
}

static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext,
                                     const struct mptcp_ext *from_ext)
{
        /* MPTCP always clears the ext when adding it to the skb, so
         * holes do not bother us here
         */
        return !from_ext ||
               (to_ext && from_ext &&
                !memcmp(from_ext, to_ext, sizeof(struct mptcp_ext)));
}

/* Check if skbs can be collapsed.
 * MPTCP collapse is allowed if neither @to nor @from carries an mptcp data
 * mapping, or if the extension of @to is the same as @from.
 * Collapsing is not possible if @to lacks an extension, but @from carries one.
 */
static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
                                          const struct sk_buff *from)
{
        const struct mptcp_ext *to_ext = skb_ext_find(to, SKB_EXT_MPTCP);
        const struct mptcp_ext *from_ext = skb_ext_find(from, SKB_EXT_MPTCP);

        return mptcp_ext_matches(to_ext, from_ext);
}

/* Further MPTCP entry points declared here and defined elsewhere. */
void mptcp_seq_show(struct seq_file *seq);
int mptcp_subflow_init_cookie_req(struct request_sock *req,
                                  const struct sock *sk_listener,
                                  struct sk_buff *skb);
struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops,
                                               struct sock *sk_listener,
                                               bool attach_listener);

/* Called by mptcp_reset_option() when the skb has an MPTCP extension. */
__be32 mptcp_get_reset_option(const struct sk_buff *skb);

/*
 * Return the reset option for @skb: mptcp_get_reset_option() when the skb
 * carries an SKB_EXT_MPTCP extension, otherwise htonl(0u).
 */
static inline __be32 mptcp_reset_option(const struct sk_buff *skb)
{
        return skb_ext_exist(skb, SKB_EXT_MPTCP) ?
               mptcp_get_reset_option(skb) : htonl(0u);
}
#else

/*
 * Fallbacks for builds without CONFIG_MPTCP. The "is MPTCP" predicates
 * answer false, the option builders return false, and the remaining
 * helpers are no-ops or return fixed values as noted below.
 */
static inline void mptcp_init(void)
{
}

static inline bool sk_is_mptcp(const struct sock *sk)
{
        return false;
}

static inline bool rsk_is_mptcp(const struct request_sock *req)
{
        return false;
}

static inline bool rsk_drop_req(const struct request_sock *req)
{
        return false;
}

static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
                                     unsigned int *size,
                                     struct mptcp_out_options *opts)
{
        return false;
}

static inline bool mptcp_synack_options(const struct request_sock *req,
                                        unsigned int *size,
                                        struct mptcp_out_options *opts)
{
        return false;
}

static inline bool mptcp_established_options(struct sock *sk,
                                             struct sk_buff *skb,
                                             unsigned int *size,
                                             unsigned int remaining,
                                             struct mptcp_out_options *opts)
{
        return false;
}

/* Unlike the option builders above, this stub returns true. */
static inline bool mptcp_incoming_options(struct sock *sk,
                                          struct sk_buff *skb)
{
        return true;
}

static inline void mptcp_skb_ext_move(struct sk_buff *to,
                                      const struct sk_buff *from)
{
}

static inline void mptcp_skb_ext_copy(struct sk_buff *to,
                                      struct sk_buff *from)
{
}

/* Without MPTCP there is no extension to compare: always collapsible. */
static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
                                          const struct sk_buff *from)
{
        return true;
}

static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { }
static inline void mptcp_seq_show(struct seq_file *seq) { }

static inline int mptcp_subflow_init_cookie_req(struct request_sock *req,
                                                const struct sock *sk_listener,
                                                struct sk_buff *skb)
{
        return 0; /* TCP fallback */
}

static inline struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops,
                                                             struct sock *sk_listener,
                                                             bool attach_listener)
{
        return NULL;
}

/* Same value as the CONFIG_MPTCP version returns for an skb without extension. */
static inline __be32 mptcp_reset_option(const struct sk_buff *skb)  { return htonl(0u); }
#endif /* CONFIG_MPTCP */

/*
 * IPv6 hooks: real declarations with CONFIG_MPTCP_IPV6, inline no-op
 * stubs when only CONFIG_IPV6 is enabled, and nothing at all when
 * neither option is set.
 */
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
int mptcpv6_init(void);
void mptcpv6_handle_mapped(struct sock *sk, bool mapped);
#elif IS_ENABLED(CONFIG_IPV6)
static inline int mptcpv6_init(void) { return 0; }
static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { }
#endif

/*
 * Declared only when both CONFIG_MPTCP and CONFIG_BPF_SYSCALL are
 * defined; otherwise a stub that always returns NULL.
 */
#if defined(CONFIG_MPTCP) && defined(CONFIG_BPF_SYSCALL)
struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk);
#else
static inline struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { return NULL; }
#endif

/* Without CONFIG_MPTCP, struct mptcp_sock is defined as an empty struct. */
#if !IS_ENABLED(CONFIG_MPTCP)
struct mptcp_sock { };
#endif

#endif /* __NET_MPTCP_H */


































































































































    2 




    2 


























    2 





























    2 










    2 













    1 









































    2 













    2 































































































































    2 





    2 



















































    2 

    2 


    1 
















    2 



    1 



















    2 


    2 















    1 



    2 



    2 














    3 

    3 






















































    2 







    2 

    1 










    2 

    2 






































    2 







    2 


    2 



    2 
    2 



    2 


    2 


































    2 






    2 













    1 




























    2 








    2 


    2 









































































    2 
















    2 



    2 












    2 




















    1 


    1 













    2 














    1 

    1 
    1 

    1 






    2 


    2 




















    1 











    1 












    1 















    2 
    2 
    1 














































































    1 

















































































































































































































































































    1 




    1 


    1 










    1 


    1 




















    1 

    1 






























































    2 










    1 












    1 






















    2 

    2 













    1 

    1 
















    1 



    1 

















    1 
    1 















    1 








    2 











    1 











    2 
    1 








    1 

    1 

    1 












    1 
























    1 
    1 










    1 











    2 

    2 














    2 
























    2 


    2 











    2 
    1 









    1 

    1 


    1 








    1 



    1 

    1 




    1 




    1 



    1 


    2 


    1 













    2 


    2 




    2 
    1 





    1 



    2 
    1 








    2 



    1 













































































    1 
    1 




































    2 









    1 




    2 

    1 



    2 














































































































    3 



    3 








    1 

















































    2 


































































    2 








    1 













    2 
    2 


    2 


    2 












    2 














    2 












    2 





    2 



    2 





    2 



    2 











    2 










    2 






    2 
    2 























    1 











    1 


    1 
    1 







































    1 



























































































    1 












    1 





    1 
    1 







    1 

    1 
































































































































































































































































































































































































































































    1 
    1 







    1 


































































































































    1 















































































    1 




























    1 











    1 







    1 










































































































































    1 


    1 


























    1 

    1 





















    1 
    1 










    1 












    1 




























    1 








    1 













    1 










    1 
    1 
    1 




    1 

    1 





    1 






    1 


    1 


















    1 



    1 

    1 









    1 








    1 








    1 

    1 

    1 



    1 

































































































































    1 














    1 


    1 



    1 
















    1 




    1 

    1 

    1 

    1 































    1 



































    1 












    1 










    1 




















































































































































































    1 










    1 




    1 




    1 





    1 







    1 




    1 

    1 



    1 








    1 






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/wordpart.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/sched/mm.h>
#include <linux/fsnotify.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fcntl.h>
#include <linux/device_cgroup.h>
#include <linux/fs_struct.h>
#include <linux/posix_acl.h>
#include <linux/hash.h>
#include <linux/bitops.h>
#include <linux/init_task.h>
#include <linux/uaccess.h>

#include "internal.h"
#include "mount.h"

/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * or in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *        inside the path - always follow.
 *        in the last component in creation/removal/renaming - never follow.
 *        if LOOKUP_FOLLOW passed - follow.
 *        if the pathname has trailing slashes - follow.
 *        otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them.
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */

/*
 * Number of bytes left for the pathname when it is stored inline in
 * struct filename::iname: PATH_MAX minus the offset of iname within the
 * structure.  getname_flags() and getname_kernel() use it as the cut-off
 * between the embedded layout and the separately allocated header.
 */
#define EMBEDDED_NAME_MAX        (PATH_MAX - offsetof(struct filename, iname))

/*
 * getname_flags - copy a pathname from user space into a struct filename
 * @filename:        user-space pointer to the NUL-terminated pathname
 * @flags:        lookup flags; only LOOKUP_EMPTY is examined here
 * @empty:        optional; *@empty is set to 1 when the copied name is empty
 *                (it is left untouched otherwise)
 *
 * If audit_reusename() already has a struct filename for @filename, that one
 * is returned as is.  Otherwise a fresh one is built with its refcount set
 * to 1 and handed to audit_getname().
 *
 * Returns the struct filename, or an ERR_PTR():
 *   -ENOMEM          an allocation failed
 *   -ENAMETOOLONG    the name does not fit in PATH_MAX bytes
 *   -ENOENT          the name is empty and LOOKUP_EMPTY was not passed
 *   (other)          whatever negative value strncpy_from_user() returned
 */
struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
        struct filename *fname;
        char *buf;
        int len;

        fname = audit_reusename(filename);
        if (fname)
                return fname;

        fname = __getname();
        if (unlikely(!fname))
                return ERR_PTR(-ENOMEM);

        /*
         * Optimistic layout: header and string share the names_cache
         * allocation, with the string living in ->iname.
         */
        buf = (char *)fname->iname;
        fname->name = buf;

        len = strncpy_from_user(buf, filename, EMBEDDED_NAME_MAX);
        if (unlikely(len < 0)) {
                __putname(fname);
                return ERR_PTR(len);
        }

        if (unlikely(len == EMBEDDED_NAME_MAX)) {
                /*
                 * The embedded area filled up, so the name is approaching
                 * PATH_MAX.  Hand the whole names_cache allocation over to
                 * the string, allocate a separate header, and repeat the
                 * copy from userland with the full PATH_MAX budget.
                 *
                 * The header is sized to include iname[0], which guarantees
                 * that ->iname lies inside the new object and therefore can
                 * never compare equal to ->name (putname() depends on that
                 * comparison to tell the two layouts apart).
                 */
                buf = (char *)fname;

                fname = kzalloc(offsetof(struct filename, iname[1]),
                                GFP_KERNEL);
                if (unlikely(!fname)) {
                        __putname(buf);
                        return ERR_PTR(-ENOMEM);
                }
                fname->name = buf;

                len = strncpy_from_user(buf, filename, PATH_MAX);
                if (unlikely(len < 0 || len == PATH_MAX)) {
                        /* copy fault, or no room left for the terminator */
                        __putname(buf);
                        kfree(fname);
                        return ERR_PTR(len < 0 ? len : -ENAMETOOLONG);
                }
        }

        atomic_set(&fname->refcnt, 1);

        /* An empty path is only acceptable when the caller asked for it. */
        if (unlikely(len == 0)) {
                if (empty)
                        *empty = 1;
                if (!(flags & LOOKUP_EMPTY)) {
                        putname(fname);
                        return ERR_PTR(-ENOENT);
                }
        }

        fname->uptr = filename;
        fname->aname = NULL;
        audit_getname(fname);
        return fname;
}

/*
 * Variant of getname_flags() for callers holding user-supplied AT_* flags:
 * AT_EMPTY_PATH in @uflags is translated to LOOKUP_EMPTY, every other bit
 * is ignored.  The "was it empty" indication is not requested.
 */
struct filename *
getname_uflags(const char __user *filename, int uflags)
{
        if (uflags & AT_EMPTY_PATH)
                return getname_flags(filename, LOOKUP_EMPTY, NULL);

        return getname_flags(filename, 0, NULL);
}

/*
 * Copy a user-space pathname into the kernel with no lookup flags and
 * without asking whether the name was empty.  Because LOOKUP_EMPTY is not
 * passed, getname_flags() rejects an empty name with -ENOENT.
 */
struct filename *
getname(const char __user * filename)
{
        return getname_flags(filename, 0, NULL);
}

/*
 * getname_kernel - build a struct filename from a kernel-space string
 * @filename:        NUL-terminated pathname in kernel memory
 *
 * Names of up to EMBEDDED_NAME_MAX bytes (terminator included) are stored
 * inline in ->iname.  Longer names of up to PATH_MAX bytes get a separately
 * allocated header, and the names_cache buffer holds only the string.
 *
 * Returns the new struct filename with a refcount of 1, or
 * ERR_PTR(-ENOMEM) / ERR_PTR(-ENAMETOOLONG).
 */
struct filename *
getname_kernel(const char *filename)
{
        struct filename *fname;
        int len = strlen(filename) + 1;        /* include the terminator */

        fname = __getname();
        if (unlikely(!fname))
                return ERR_PTR(-ENOMEM);

        if (len <= EMBEDDED_NAME_MAX) {
                /* Short enough to sit right behind the header. */
                fname->name = (char *)fname->iname;
        } else {
                struct filename *hdr;

                if (len > PATH_MAX) {
                        __putname(fname);
                        return ERR_PTR(-ENAMETOOLONG);
                }

                /*
                 * Give the whole names_cache buffer to the string and keep
                 * the bookkeeping in a small header of its own.
                 */
                hdr = kmalloc(offsetof(struct filename, iname[1]),
                              GFP_KERNEL);
                if (unlikely(!hdr)) {
                        __putname(fname);
                        return ERR_PTR(-ENOMEM);
                }
                hdr->name = (char *)fname;
                fname = hdr;
        }

        memcpy((char *)fname->name, filename, len);
        atomic_set(&fname->refcnt, 1);
        fname->uptr = NULL;
        fname->aname = NULL;
        audit_getname(fname);

        return fname;
}
EXPORT_SYMBOL(getname_kernel);

/*
 * putname - drop a reference on a struct filename
 * @name:        filename to release; ERR_PTR() values are ignored
 *
 * Warns once and bails out if the refcount is already zero.  When the last
 * reference goes away the storage is released according to its layout:
 * with an embedded name (->name points at ->iname) the single names_cache
 * object is freed; otherwise both the names_cache string buffer and the
 * separately allocated header are freed.
 */
void putname(struct filename *name)
{
        if (IS_ERR(name))
                return;

        /* Dropping a reference nobody holds: complain and leave it alone. */
        if (WARN_ON_ONCE(atomic_read(&name->refcnt) == 0))
                return;

        /* Other holders remain. */
        if (!atomic_dec_and_test(&name->refcnt))
                return;

        if (name->name == name->iname) {
                __putname(name);
                return;
        }

        __putname(name->name);
        kfree(name);
}
EXPORT_SYMBOL(putname);

/**
 * check_acl - perform ACL permission checking
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        inode to check permissions on
 * @mask:        right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * This function performs the ACL permission checking. Since it retrieves
 * POSIX ACLs, it needs to know whether it is called from a blocking or a
 * non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Return: the result of posix_acl_permission() when an access ACL is
 * available; -EAGAIN when there is none (or CONFIG_FS_POSIX_ACL is off);
 * -ECHILD when MAY_NOT_BLOCK is set and the ACL is not cached; or the error
 * encoded in the pointer returned by get_inode_acl().
 */
static int check_acl(struct mnt_idmap *idmap,
                     struct inode *inode, int mask)
{
#ifdef CONFIG_FS_POSIX_ACL
        struct posix_acl *acl;
        int ret;

        if (!(mask & MAY_NOT_BLOCK)) {
                /* Blocking context: take a reference on the ACL. */
                acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
                if (IS_ERR(acl))
                        return PTR_ERR(acl);
                if (!acl)
                        return -EAGAIN;

                ret = posix_acl_permission(idmap, inode, acl, mask);
                posix_acl_release(acl);
                return ret;
        }

        /* Non-blocking context: only the cached ACL may be consulted. */
        acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
        if (!acl)
                return -EAGAIN;
        /* no ->get_inode_acl() calls in RCU mode... */
        if (is_uncached_acl(acl))
                return -ECHILD;

        return posix_acl_permission(idmap, inode, acl, mask);
#else
        return -EAGAIN;
#endif
}

/**
 * acl_permission_check - perform basic UNIX permission checking
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        inode to check permissions on
 * @mask:        right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * This function performs the basic UNIX permission checking. Since this
 * function may retrieve POSIX acls it needs to know whether it is called from a
 * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
static int acl_permission_check(struct mnt_idmap *idmap,
                                struct inode *inode, int mask)
{
        unsigned int mode = inode->i_mode;
        vfsuid_t vfsuid;

        /* Are we the owner? If so, ACL's don't matter */
        vfsuid = i_uid_into_vfsuid(idmap, inode);
        if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
                mask &= 7;
                mode >>= 6;
                return (mask & ~mode) ? -EACCES : 0;
        }

        /* Do we have ACL's? */
        if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
                int error = check_acl(idmap, inode, mask);
                if (error != -EAGAIN)
                        return error;
        }

        /* Only RWX matters for group/other mode bits */
        mask &= 7;

        /*
         * Are the group permissions different from
         * the other permissions in the bits we care
         * about? Need to check group ownership if so.
         */
        if (mask & (mode ^ (mode >> 3))) {
                vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
                if (vfsgid_in_group_p(vfsgid))
                        mode >>= 3;
        }

        /* Bits in 'mode' clear that we require? */
        return (mask & ~mode) ? -EACCES : 0;
}

/**
 * generic_permission -  check for access rights on a Posix-like filesystem
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        inode to check access rights for
 * @mask:        right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
 *                %MAY_NOT_BLOCK ...)
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int generic_permission(struct mnt_idmap *idmap, struct inode *inode,
                       int mask)
{
        int ret = acl_permission_check(idmap, inode, mask);

        /* Anything but a plain denial (success, -ECHILD, ...) is final. */
        if (ret != -EACCES)
                return ret;

        if (S_ISDIR(inode->i_mode)) {
                /*
                 * Directories: CAP_DAC_READ_SEARCH is enough as long as no
                 * write is requested, CAP_DAC_OVERRIDE covers everything.
                 */
                if (!(mask & MAY_WRITE) &&
                    capable_wrt_inode_uidgid(idmap, inode,
                                             CAP_DAC_READ_SEARCH))
                        return 0;
                if (capable_wrt_inode_uidgid(idmap, inode,
                                             CAP_DAC_OVERRIDE))
                        return 0;
                return -EACCES;
        }

        /* Non-directories: a pure read can be granted by READ_SEARCH. */
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
        if (mask == MAY_READ &&
            capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH))
                return 0;

        /*
         * CAP_DAC_OVERRIDE always covers read/write.  It covers execute
         * only if the file has at least one exec bit set.
         */
        if ((!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) &&
            capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE))
                return 0;

        return -EACCES;
}
EXPORT_SYMBOL(generic_permission);

/**
 * do_inode_permission - UNIX permission checking
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        inode to check permissions on
 * @mask:        right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * We _really_ want to just call generic_permission() without even
 * looking at inode->i_op.  To that end a flag is cached in
 * inode->i_opflags that says "this inode has no special permission
 * function, use the fast case".
 */
static inline int do_inode_permission(struct mnt_idmap *idmap,
                                      struct inode *inode, int mask)
{
        /* Fast case: already known to have no ->permission method. */
        if (likely(inode->i_opflags & IOP_FASTPERM))
                return generic_permission(idmap, inode, mask);

        if (likely(inode->i_op->permission))
                return inode->i_op->permission(idmap, inode, mask);

        /* No method: remember that.  This gets set once per inode lifetime. */
        spin_lock(&inode->i_lock);
        inode->i_opflags |= IOP_FASTPERM;
        spin_unlock(&inode->i_lock);

        return generic_permission(idmap, inode, mask);
}

/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
        umode_t mode;

        /* Only write requests can be refused at the superblock level. */
        if (unlikely(mask & MAY_WRITE)) {
                if (!sb_rdonly(sb))
                        return 0;

                /*
                 * Nobody gets write access to a read-only fs.  Only regular
                 * files, directories and symlinks are refused here.
                 */
                mode = inode->i_mode;
                if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
                        return -EROFS;
        }
        return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        Inode to check permission on
 * @mask:        Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct mnt_idmap *idmap,
                     struct inode *inode, int mask)
{
        int retval = sb_permission(inode->i_sb, inode, mask);

        if (retval)
                return retval;

        if (unlikely(mask & MAY_WRITE)) {
                /* Nobody gets write access to an immutable file. */
                if (IS_IMMUTABLE(inode))
                        return -EPERM;

                /*
                 * Updating mtime will likely cause i_uid and i_gid to be
                 * written back improperly if their true value is unknown
                 * to the vfs.
                 */
                if (HAS_UNMAPPED_ID(idmap, inode))
                        return -EACCES;
        }

        /*
         * Run the remaining checks in order: DAC/filesystem, device cgroup,
         * then the security module.  The first failure ends the chain.
         */
        retval = do_inode_permission(idmap, inode, mask);
        if (!retval)
                retval = devcgroup_inode_permission(inode, mask);
        if (!retval)
                retval = security_inode_permission(inode, mask);

        return retval;
}
EXPORT_SYMBOL(inode_permission);

/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
void path_get(const struct path *path)
{
        struct vfsmount *mnt = path->mnt;
        struct dentry *dentry = path->dentry;

        /* Take the mount reference first, then the dentry reference. */
        mntget(mnt);
        dget(dentry);
}
EXPORT_SYMBOL(path_get);

/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
void path_put(const struct path *path)
{
        struct dentry *dentry = path->dentry;
        struct vfsmount *mnt = path->mnt;

        /* Drop the dentry reference first, then the mount reference. */
        dput(dentry);
        mntput(mnt);
}
EXPORT_SYMBOL(path_put);

/* Number of symlink-stack slots embedded directly in struct nameidata. */
#define EMBEDDED_LEVELS 2
/*
 * Per-lookup state of a path walk.  The instance in use is published in
 * current->nameidata by __set_nameidata() and unpublished again by
 * restore_nameidata().
 */
struct nameidata {
        /* Location the walk currently stands at. */
        struct path        path;
        /* NOTE(review): presumably the most recently parsed component - confirm. */
        struct qstr        last;
        /* Root used for this lookup: preset by the caller or filled by set_root(). */
        struct path        root;
        struct inode        *inode; /* path.dentry.d_inode */
        /* flags: LOOKUP_* bits; state: ND_* bits defined below. */
        unsigned int        flags, state;
        /*
         * seq:      d_seq sample for path.dentry (rcu-walk)
         * next_seq: d_seq sample for the next dentry to step into
         * m_seq:    mount sequence handed to __legitimize_mnt()
         * r_seq:    NOTE(review): not used in the code visible here - confirm.
         */
        unsigned        seq, next_seq, m_seq, r_seq;
        /* NOTE(review): presumably the type of ->last - confirm. */
        int                last_type;
        /* Number of slots of ->stack currently in use. */
        unsigned        depth;
        /* Carried over from / back to the enclosing nameidata, if any. */
        int                total_link_count;
        /* One slot per symlink currently being traversed. */
        struct saved {
                /* The symlink itself; holds references in ref-walk mode. */
                struct path link;
                /* Delayed call run when the link is dropped. */
                struct delayed_call done;
                /* NOTE(review): presumably the rest of the name being walked - confirm. */
                const char *name;
                /* d_seq sample for link.dentry (rcu-walk). */
                unsigned seq;
        /*
         * ->stack points at ->internal until nd_alloc_stack() replaces it
         * with a kmalloc'ed array of MAXSYMLINKS slots.
         */
        } *stack, internal[EMBEDDED_LEVELS];
        /* Pathname being looked up. */
        struct filename        *name;
        /* Previous value of current->nameidata (nested lookups). */
        struct nameidata *saved;
        /* d_seq sample for root.dentry (rcu-walk). */
        unsigned        root_seq;
        /* dfd argument as passed to __set_nameidata(). */
        int                dfd;
        /*
         * Owner and mode consulted by may_follow_link() and
         * may_create_in_sticky() as those of the parent directory.
         */
        vfsuid_t        dir_vfsuid;
        umode_t                dir_mode;
} __randomize_layout;

/* Bits for nameidata->state. */
/* ->root was supplied by the caller of set_nameidata(); the walk never grabs or drops it. */
#define ND_ROOT_PRESET 1
/* The walk is responsible for ->root; terminate_walk() does path_put() on it. */
#define ND_ROOT_GRABBED 2
/* Set by nd_jump_root()/nd_jump_link(); makes complete_walk() consider weak revalidation. */
#define ND_JUMPED 4

/*
 * Initialise @p for a new lookup and publish it as current->nameidata,
 * remembering whatever nameidata was current before in p->saved.
 */
static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
{
        struct nameidata *outer = current->nameidata;

        /* Chain to the enclosing lookup and inherit its link count. */
        p->saved = outer;
        if (outer)
                p->total_link_count = outer->total_link_count;
        else
                p->total_link_count = 0;

        /* Start with the embedded link stack, empty. */
        p->stack = p->internal;
        p->depth = 0;

        p->name = name;
        p->dfd = dfd;

        /* No position yet. */
        p->path.dentry = NULL;
        p->path.mnt = NULL;

        current->nameidata = p;
}

/*
 * Like __set_nameidata(), but also resets p->state and, when @root is
 * given, installs it as a caller-managed root (ND_ROOT_PRESET).
 */
static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name,
                          const struct path *root)
{
        __set_nameidata(p, dfd, name);

        if (unlikely(root)) {
                p->root = *root;
                p->state = ND_ROOT_PRESET;
                return;
        }
        p->state = 0;
}

/*
 * Undo __set_nameidata(): make the enclosing nameidata current again,
 * hand the accumulated link count back to it and free a heap-allocated
 * link stack.
 */
static void restore_nameidata(void)
{
        struct nameidata *finished = current->nameidata;
        struct nameidata *outer = finished->saved;

        current->nameidata = outer;
        if (outer)
                outer->total_link_count = finished->total_link_count;

        /* ->internal is embedded in the struct; anything else was kmalloc'ed. */
        if (finished->stack != finished->internal)
                kfree(finished->stack);
}

/*
 * Replace the embedded link stack with a heap array of MAXSYMLINKS slots,
 * copying the embedded slots over.  Returns false if the allocation fails.
 */
static bool nd_alloc_stack(struct nameidata *nd)
{
        struct saved *grown;

        /* An rcu-walk uses GFP_ATOMIC, a ref-walk GFP_KERNEL. */
        grown = kmalloc_array(MAXSYMLINKS, sizeof(*grown),
                              (nd->flags & LOOKUP_RCU) ? GFP_ATOMIC : GFP_KERNEL);
        if (unlikely(!grown))
                return false;

        memcpy(grown, nd->internal, sizeof(nd->internal));
        nd->stack = grown;
        return true;
}

/**
 * path_connected - Verify that a dentry is below mnt.mnt_root
 * @mnt: The mountpoint to check.
 * @dentry: The dentry to check.
 *
 * Rename can sometimes move a file or directory outside of a bind
 * mount, path_connected allows those cases to be detected.
 */
static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
{
        /*
         * Only a mount whose root is not the superblock root (a bind mount
         * of a subtree) can have disconnected paths; check those for real.
         */
        if (mnt->mnt_root != mnt->mnt_sb->s_root)
                return is_subdir(dentry, mnt->mnt_root);

        return true;
}

/*
 * Run and then clear the delayed call of every link on the stack, from
 * the top of the stack downwards.  nd->depth itself is left untouched.
 */
static void drop_links(struct nameidata *nd)
{
        int remaining = nd->depth;

        while (remaining != 0) {
                struct saved *slot = &nd->stack[--remaining];

                do_delayed_call(&slot->done);
                clear_delayed_call(&slot->done);
        }
}

/*
 * Leave rcu-walk mode: clear LOOKUP_RCU, zero the dentry sequence samples
 * and drop the RCU read lock.
 */
static void leave_rcu(struct nameidata *nd)
{
        nd->next_seq = 0;
        nd->seq = 0;
        nd->flags &= ~LOOKUP_RCU;
        rcu_read_unlock();
}

/*
 * End a path walk, whether it succeeded or not: run pending link
 * callbacks, release what the walk holds (references in ref-walk mode,
 * the RCU read lock in rcu-walk mode) and reset the position.
 */
static void terminate_walk(struct nameidata *nd)
{
        drop_links(nd);

        if (nd->flags & LOOKUP_RCU) {
                /* rcu-walk holds no references; just leave RCU mode. */
                leave_rcu(nd);
        } else {
                int i;

                path_put(&nd->path);
                for (i = 0; i < nd->depth; i++)
                        path_put(&nd->stack[i].link);

                /* The root is dropped only if this walk grabbed it. */
                if (nd->state & ND_ROOT_GRABBED) {
                        path_put(&nd->root);
                        nd->state &= ~ND_ROOT_GRABBED;
                }
        }

        nd->path.dentry = NULL;
        nd->path.mnt = NULL;
        nd->depth = 0;
}

/*
 * Try to turn an rcu-walk @path into a properly referenced one.
 * @seq is the d_seq sample for path->dentry, @mseq the mount sequence
 * passed on to __legitimize_mnt().  Returns true on success.
 *
 * path_put is needed afterwards regardless of success or failure; to make
 * that safe, the members no reference was obtained for are set to NULL
 * on the failure paths below.
 */
static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
{
        int res = __legitimize_mnt(path->mnt, mseq);
        if (unlikely(res)) {
                /*
                 * NOTE(review): res > 0 presumably means no mount reference
                 * is held, hence ->mnt is cleared too - confirm against
                 * __legitimize_mnt().
                 */
                if (res > 0)
                        path->mnt = NULL;
                /* The dentry was never grabbed on this path. */
                path->dentry = NULL;
                return false;
        }
        /* Take a dentry reference unless the dentry is already dead. */
        if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
                path->dentry = NULL;
                return false;
        }
        /* Both members are set now; fail if d_seq moved since @seq was sampled. */
        return !read_seqcount_retry(&path->dentry->d_seq, seq);
}

/*
 * __legitimize_path() for a path belonging to walk @nd: the mount
 * sequence is taken from nd->m_seq.
 */
static inline bool legitimize_path(struct nameidata *nd,
                            struct path *path, unsigned seq)
{
        const unsigned mount_seq = nd->m_seq;

        return __legitimize_path(path, seq, mount_seq);
}

/*
 * Legitimize every symlink on the link stack, bottom to top.
 * Returns true if all of them could be legitimized.  On failure the
 * delayed calls of all links have been run and nd->depth is adjusted to
 * cover exactly the slots that a later terminate_walk() has to path_put().
 */
static bool legitimize_links(struct nameidata *nd)
{
        int i;
        /* LOOKUP_CACHED: give up immediately, no slot was touched. */
        if (unlikely(nd->flags & LOOKUP_CACHED)) {
                drop_links(nd);
                nd->depth = 0;
                return false;
        }
        for (i = 0; i < nd->depth; i++) {
                struct saved *last = nd->stack + i;
                if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
                        drop_links(nd);
                        /*
                         * Slots 0..i have been through __legitimize_path()
                         * (slot i failing), so they need path_put(); the
                         * slots above were never touched.
                         */
                        nd->depth = i + 1;
                        return false;
                }
        }
        return true;
}

/*
 * Legitimize nd->root when leaving rcu-walk mode.  Returns true when
 * there is nothing to do or the root could be grabbed.
 */
static bool legitimize_root(struct nameidata *nd)
{
        /* Only a root that is set and not managed by the VFS user needs work. */
        if (nd->root.mnt && !(nd->state & ND_ROOT_PRESET)) {
                /*
                 * Mark it grabbed before trying, so that terminate_walk()
                 * does path_put() on it whatever the outcome.
                 */
                nd->state |= ND_ROOT_GRABBED;
                return legitimize_path(nd, &nd->root, nd->root_seq);
        }
        return true;
}

/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */

/**
 * try_to_unlazy - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * Returns: true on success, false on failure
 *
 * try_to_unlazy attempts to legitimize the current nd->path and nd->root
 * for ref-walk mode.
 * Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy() failure and
 * terminate_walk().
 */
static bool try_to_unlazy(struct nameidata *nd)
{
        /* Remembered for the sanity check after leaving RCU mode. */
        struct dentry *parent = nd->path.dentry;

        BUG_ON(!(nd->flags & LOOKUP_RCU));

        /* Order matters: links first, then the current path, then the root. */
        if (unlikely(!legitimize_links(nd)))
                goto out1;
        if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
                goto out;
        if (unlikely(!legitimize_root(nd)))
                goto out;
        leave_rcu(nd);
        BUG_ON(nd->inode != parent->d_inode);
        return true;

out1:
        /* nd->path was not legitimized at all: clear it for terminate_walk(). */
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
out:
        /* Whatever is still set in nd is left for terminate_walk() to put. */
        leave_rcu(nd);
        return false;
}

/**
 * try_to_unlazy_next - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: next dentry to step into
 * Returns: true on success, false on failure
 *
 * Similar to try_to_unlazy(), but here we have the next dentry already
 * picked by rcu-walk and want to legitimize that in addition to the current
 * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy_next() failure and
 * terminate_walk().
 */
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry)
{
        int res;
        BUG_ON(!(nd->flags & LOOKUP_RCU));

        if (unlikely(!legitimize_links(nd)))
                goto out2;
        /*
         * Open-coded variant of __legitimize_path() for nd->path: the mount
         * and dentry references are taken here, but the sequence check is
         * done on the child @dentry below instead of on nd->path.dentry.
         */
        res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
        if (unlikely(res)) {
                /* res > 0: clear ->mnt as well; otherwise only ->dentry. */
                if (res > 0)
                        goto out2;
                goto out1;
        }
        if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
                goto out1;

        /*
         * We need to move both the parent and the dentry from the RCU domain
         * to be properly refcounted. And the sequence number in the dentry
         * validates *both* dentry counters, since we checked the sequence
         * number of the parent after we got the child sequence number. So we
         * know the parent must still be valid if the child sequence number is
         * still valid.
         */
        if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
                goto out;
        /* From here on a reference to @dentry is held: failures go to out_dput. */
        if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
                goto out_dput;
        /*
         * Sequence counts matched. Now make sure that the root is
         * still valid and get it if required.
         */
        if (unlikely(!legitimize_root(nd)))
                goto out_dput;
        leave_rcu(nd);
        return true;

out2:
        /* nd->path.mnt is cleared so that terminate_walk() does not put it. */
        nd->path.mnt = NULL;
out1:
        /* No dentry reference was taken on nd->path. */
        nd->path.dentry = NULL;
out:
        leave_rcu(nd);
        return false;
out_dput:
        /* Drop the child reference only after RCU mode has been left. */
        leave_rcu(nd);
        dput(dentry);
        return false;
}

/*
 * Call the dentry's ->d_revalidate() method if the dentry is flagged as
 * having one (DCACHE_OP_REVALIDATE); otherwise return 1.
 */
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
{
        if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
                return dentry->d_op->d_revalidate(dentry, flags);

        return 1;
}

/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
 *
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
 */
static int complete_walk(struct nameidata *nd)
{
        /* Sampled up front; used for the weak revalidation at the end. */
        struct dentry *dentry = nd->path.dentry;
        int status;

        if (nd->flags & LOOKUP_RCU) {
                /*
                 * We don't want to zero nd->root for scoped-lookups or
                 * externally-managed nd->root.
                 */
                if (!(nd->state & ND_ROOT_PRESET))
                        if (!(nd->flags & LOOKUP_IS_SCOPED))
                                nd->root.mnt = NULL;
                /*
                 * Clear LOOKUP_CACHED first: legitimize_links() refuses to
                 * do anything while it is set.
                 */
                nd->flags &= ~LOOKUP_CACHED;
                if (!try_to_unlazy(nd))
                        return -ECHILD;
        }

        if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
                /*
                 * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
                 * ever step outside the root during lookup" and should already
                 * be guaranteed by the rest of namei, we want to avoid a namei
                 * BUG resulting in userspace being given a path that was not
                 * scoped within the root at some point during the lookup.
                 *
                 * So, do a final sanity-check to make sure that in the
                 * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
                 * we won't silently return an fd completely outside of the
                 * requested root to userspace.
                 *
                 * Userspace could move the path outside the root after this
                 * check, but as discussed elsewhere this is not a concern (the
                 * resolved file was inside the root at some point).
                 */
                if (!path_is_under(&nd->path, &nd->root))
                        return -EXDEV;
        }

        /* Weak revalidation is only considered after a jump (ND_JUMPED). */
        if (likely(!(nd->state & ND_JUMPED)))
                return 0;

        /* ...and only if the dentry is flagged as having the method. */
        if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
                return 0;

        status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
        if (status > 0)
                return 0;

        /* A return of 0 from the method is reported as -ESTALE. */
        if (!status)
                status = -ESTALE;

        return status;
}

/*
 * Fill nd->root from the root of current->fs.  Returns 0 on success or
 * -ENOTRECOVERABLE (with a warning) if called for a scoped lookup.
 */
static int set_root(struct nameidata *nd)
{
        struct fs_struct *fs = current->fs;

        /*
         * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
         * still have to ensure it doesn't happen because it will cause a breakout
         * from the dirfd.
         */
        if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
                return -ENOTRECOVERABLE;

        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;

                /*
                 * rcu-walk: no references are taken.  Copy fs->root and sample
                 * the root dentry's d_seq, retrying until fs->seq shows that
                 * fs->root did not change in the meantime.
                 */
                do {
                        seq = read_seqcount_begin(&fs->seq);
                        nd->root = fs->root;
                        nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
                } while (read_seqcount_retry(&fs->seq, seq));
        } else {
                /*
                 * ref-walk: get the root and mark it grabbed so that
                 * terminate_walk() does path_put() on it.
                 */
                get_fs_root(fs, &nd->root);
                nd->state |= ND_ROOT_GRABBED;
        }
        return 0;
}

/*
 * Move the walk to nd->root, setting the root up first if needed.
 * Returns 0 on success, -EXDEV if the lookup flags forbid the jump,
 * -ECHILD if the rcu-walk has to be abandoned, or an error from set_root().
 */
static int nd_jump_root(struct nameidata *nd)
{
        /* LOOKUP_BENEATH lookups may never jump to the root. */
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return -EXDEV;
        if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
                /* Absolute path arguments to path_init() are allowed. */
                if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
                        return -EXDEV;
        }
        /* No root chosen yet: take it from current->fs. */
        if (!nd->root.mnt) {
                int error = set_root(nd);
                if (error)
                        return error;
        }
        if (nd->flags & LOOKUP_RCU) {
                struct dentry *d;
                /* rcu-walk: plain copy, then check the root's d_seq sample. */
                nd->path = nd->root;
                d = nd->path.dentry;
                nd->inode = d->d_inode;
                nd->seq = nd->root_seq;
                if (read_seqcount_retry(&d->d_seq, nd->seq))
                        return -ECHILD;
        } else {
                /* ref-walk: swap the reference on the old position for one on the root. */
                path_put(&nd->path);
                nd->path = nd->root;
                path_get(&nd->path);
                nd->inode = nd->path.dentry->d_inode;
        }
        /* Record the jump for complete_walk(). */
        nd->state |= ND_JUMPED;
        return 0;
}

/*
 * Helper to directly jump to a known parsed path from ->get_link,
 * caller must have taken a reference to path beforehand.
 */
int nd_jump_link(const struct path *path)
{
        int error = -ELOOP;
        struct nameidata *nd = current->nameidata;

        if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
                goto err;

        error = -EXDEV;
        if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
                if (nd->path.mnt != path->mnt)
                        goto err;
        }
        /* Not currently safe for scoped-lookups. */
        if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
                goto err;

        path_put(&nd->path);
        nd->path = *path;
        nd->inode = nd->path.dentry->d_inode;
        nd->state |= ND_JUMPED;
        return 0;

err:
        path_put(path);
        return error;
}

/*
 * Pop the topmost link off the stack: run its delayed call and, in
 * ref-walk mode, drop the references held on the link's path.
 */
static inline void put_link(struct nameidata *nd)
{
        struct saved *top;

        nd->depth--;
        top = &nd->stack[nd->depth];

        do_delayed_call(&top->done);
        if (!(nd->flags & LOOKUP_RCU))
                path_put(&top->link);
}

/*
 * Link/creation hardening knobs, zero-initialised here and exported
 * through namei_sysctls[] when CONFIG_SYSCTL is set.  A value of 0
 * disables the corresponding check (see may_follow_link(), may_linkat()
 * and may_create_in_sticky()).
 */
static int sysctl_protected_symlinks __read_mostly;
static int sysctl_protected_hardlinks __read_mostly;
static int sysctl_protected_fifos __read_mostly;
static int sysctl_protected_regular __read_mostly;

#ifdef CONFIG_SYSCTL
/*
 * Sysctl entries for the protected_* knobs.  All are integers handled by
 * proc_dointvec_minmax with mode 0644; extra1/extra2 give the accepted
 * range.
 */
static struct ctl_table namei_sysctls[] = {
        /* protected_symlinks: 0..1 */
        {
                .procname        = "protected_symlinks",
                .data                = &sysctl_protected_symlinks,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
        /* protected_hardlinks: 0..1 */
        {
                .procname        = "protected_hardlinks",
                .data                = &sysctl_protected_hardlinks,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
        /* protected_fifos: 0..2 (2 also covers group-writable sticky dirs) */
        {
                .procname        = "protected_fifos",
                .data                = &sysctl_protected_fifos,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_TWO,
        },
        /* protected_regular: 0..2 (2 also covers group-writable sticky dirs) */
        {
                .procname        = "protected_regular",
                .data                = &sysctl_protected_regular,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_TWO,
        },
};

/*
 * Register namei_sysctls[] under the "fs" sysctl directory.  Run as an
 * fs_initcall; always returns 0.
 */
static int __init init_fs_namei_sysctls(void)
{
        register_sysctl_init("fs", namei_sysctls);
        return 0;
}
fs_initcall(init_fs_namei_sysctls);

#endif /* CONFIG_SYSCTL */

/**
 * may_follow_link - Check symlink following for unsafe situations
 * @nd: nameidata pathwalk data
 * @inode: Used for idmapping.
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
{
        vfsuid_t link_owner;

        /* Protection switched off: every symlink may be followed. */
        if (!sysctl_protected_symlinks)
                return 0;

        /* Allowed if owner and follower match. */
        link_owner = i_uid_into_vfsuid(mnt_idmap(nd->path.mnt), inode);
        if (vfsuid_eq_kuid(link_owner, current_fsuid()))
                return 0;

        /* Allowed unless the parent directory is both sticky and world-writable. */
        if (!(nd->dir_mode & S_ISVTX) || !(nd->dir_mode & S_IWOTH))
                return 0;

        /* Allowed if parent directory and link owner match. */
        if (vfsuid_valid(nd->dir_vfsuid) &&
            vfsuid_eq(nd->dir_vfsuid, link_owner))
                return 0;

        /* Denial: in rcu-walk mode bail out with -ECHILD before auditing. */
        if (nd->flags & LOOKUP_RCU)
                return -ECHILD;

        audit_inode(nd->name, nd->stack[0].link.dentry, 0);
        audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
        return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
 * @idmap: idmap of the mount the inode was found from
 * @inode: the source inode to hardlink from
 *
 * Return false if at least one of the following conditions:
 *    - inode is not a regular file
 *    - inode is setuid
 *    - inode is setgid and group-exec
 *    - access failure for read and write
 *
 * Otherwise returns true.
 */
static bool safe_hardlink_source(struct mnt_idmap *idmap,
                                 struct inode *inode)
{
        umode_t mode = inode->i_mode;

        /* Special files should not get pinned to the filesystem. */
        if (!S_ISREG(mode))
                return false;

        /* Neither should setuid files... */
        if (mode & S_ISUID)
                return false;

        /* ...nor files that are both setgid and group-executable. */
        if ((mode & S_ISGID) && (mode & S_IXGRP))
                return false;

        /* Hardlinking to unreadable or unwritable sources is dangerous. */
        return !inode_permission(idmap, inode, MAY_READ | MAY_WRITE);
}

/**
 * may_linkat - Check permissions for creating a hardlink
 * @idmap: idmap of the mount the inode was found from
 * @link:  the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns 0 if successful, -ve on error.
 */
int may_linkat(struct mnt_idmap *idmap, const struct path *link)
{
        struct inode *inode = link->dentry->d_inode;

        /* Inode writeback is not safe when the uid or gid are invalid. */
        if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)))
                return -EOVERFLOW;
        if (!vfsgid_valid(i_gid_into_vfsgid(idmap, inode)))
                return -EOVERFLOW;

        /* Protection switched off: nothing more to check. */
        if (!sysctl_protected_hardlinks)
                return 0;

        /* A safe source may be linked by anybody... */
        if (safe_hardlink_source(idmap, inode))
                return 0;

        /* ...anything else only by its owner or with CAP_FOWNER. */
        if (inode_owner_or_capable(idmap, inode))
                return 0;

        audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
        return -EPERM;
}

/**
 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
 *                          should be allowed, or not, on files that already
 *                          exist.
 * @idmap: idmap of the mount the inode was found from
 * @nd: nameidata pathwalk data
 * @inode: the inode of the file to open
 *
 * Block an O_CREAT open of a FIFO (or a regular file) when:
 *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
 *   - the file already exists
 *   - we are in a sticky directory
 *   - we don't own the file
 *   - the owner of the directory doesn't own the file
 *   - the directory is world writable
 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
 * the directory doesn't have to be world writable: being group writable will
 * be enough.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns 0 if the open is allowed, -ve on error.
 */
static int may_create_in_sticky(struct mnt_idmap *idmap,
                                struct nameidata *nd, struct inode *const inode)
{
        umode_t dir_mode = nd->dir_mode;
        vfsuid_t dir_vfsuid = nd->dir_vfsuid;

        if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
            (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
            likely(!(dir_mode & S_ISVTX)) ||
            vfsuid_eq(i_uid_into_vfsuid(idmap, inode), dir_vfsuid) ||
            vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid()))
                return 0;

        if (likely(dir_mode & 0002) ||
            (dir_mode & 0020 &&
             ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
              (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
                const char *operation = S_ISFIFO(inode->i_mode) ?
                                        "sticky_create_fifo" :
                                        "sticky_create_regular";
                audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
                return -EACCES;
        }
        return 0;
}

/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
int follow_up(struct path *path)
{
        struct mount *mnt = real_mount(path->mnt);
        struct mount *parent;
        struct dentry *mountpoint;

        read_seqlock_excl(&mount_lock);
        parent = mnt->mnt_parent;
        if (parent == mnt) {
                read_sequnlock_excl(&mount_lock);
                return 0;
        }
        mntget(&parent->mnt);
        mountpoint = dget(mnt->mnt_mountpoint);
        read_sequnlock_excl(&mount_lock);
        dput(path->dentry);
        path->dentry = mountpoint;
        mntput(path->mnt);
        path->mnt = &parent->mnt;
        return 1;
}
EXPORT_SYMBOL(follow_up);

/*
 * Climb the chain of parent mounts starting at @m and pick the first
 * mountpoint that is not the root dentry of the mount it sits on.
 *
 * On success @path is set to that (parent mount, mountpoint) pair, *@seqp
 * receives a freshly sampled d_seq of the mountpoint and true is returned.
 * No references are taken here.  False is returned when @root is reached
 * or when the chain of parents runs out.
 */
static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
                                  struct path *path, unsigned *seqp)
{
        while (mnt_has_parent(m)) {
                struct dentry *mp = m->mnt_mountpoint;
                struct mount *up = m->mnt_parent;

                /* Never climb above the caller's root. */
                if (unlikely(root->dentry == mp && root->mnt == &up->mnt))
                        return false;

                if (mp != up->mnt.mnt_root) {
                        path->mnt = &up->mnt;
                        path->dentry = mp;
                        *seqp = read_seqcount_begin(&mp->d_seq);
                        return true;
                }

                /* Mountpoint is the parent mount's own root: keep going. */
                m = up;
        }
        return false;
}

/*
 * Variant of choose_mountpoint_rcu() for callers outside RCU pathwalk.
 *
 * Runs the RCU helper under rcu_read_lock() and repeats until the answer
 * is stable: a negative answer is accepted only if mount_lock did not
 * change meanwhile, a positive one only once __legitimize_path() succeeds
 * on @path (NOTE(review): presumably that is what pins @path - confirm
 * against __legitimize_path()).
 *
 * Returns true with @path filled in, or false if no mountpoint was found.
 */
static bool choose_mountpoint(struct mount *m, const struct path *root,
                              struct path *path)
{
        bool found;

        rcu_read_lock();
        for (;;) {
                unsigned int d_seq;
                unsigned int m_seq = read_seqbegin(&mount_lock);

                found = choose_mountpoint_rcu(m, root, path, &d_seq);
                if (likely(found)) {
                        if (likely(__legitimize_path(path, d_seq, m_seq)))
                                break;
                        /* Lost a race: release @path and start over. */
                        rcu_read_unlock();
                        path_put(path);
                        rcu_read_lock();
                        continue;
                }
                if (!read_seqretry(&mount_lock, m_seq))
                        break;
        }
        rcu_read_unlock();
        return found;
}

/*
 * Perform an automount
 * - returns -EISDIR to tell the caller to stop and use the path it passed
 *   in as is.
 * - returns -ELOOP once *@count (when @count is given) has reached
 *   MAXSYMLINKS; the counter is bumped on every attempt that gets that far.
 * - otherwise returns whatever finish_automount() makes of the result of
 *   the dentry's ->d_automount().
 */
static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
{
        /* Lookup intents that make us trigger the automount. */
        const unsigned mount_triggers = LOOKUP_PARENT | LOOKUP_DIRECTORY |
                                        LOOKUP_OPEN | LOOKUP_CREATE |
                                        LOOKUP_AUTOMOUNT;
        struct dentry *dentry = path->dentry;

        /* We don't want to mount if someone's just doing a stat -
         * unless they're stat'ing a directory and appended a '/' to
         * the name.
         *
         * We do, however, want to mount if someone wants to open or
         * create a file of any type under the mountpoint, wants to
         * traverse through the mountpoint or wants to open the
         * mounted directory.  Also, autofs may mark negative dentries
         * as being automount points.  These will need the attentions
         * of the daemon to instantiate them before they can be used.
         */
        if (!(lookup_flags & mount_triggers) && dentry->d_inode)
                return -EISDIR;

        if (count) {
                int seen = (*count)++;

                if (seen >= MAXSYMLINKS)
                        return -ELOOP;
        }

        return finish_automount(dentry->d_op->d_automount(path), path);
}

/*
 * mount traversal - out-of-line part.  One note on ->d_flags accesses -
 * dentries are pinned but not locked here, so negative dentry can go
 * positive right under us.  Use of smp_load_acquire() provides a barrier
 * sufficient for ->d_inode and ->d_flags consistency.
 *
 * @path:         in/out; advanced in place every time a mount is crossed
 * @flags:        ->d_flags of @path->dentry as already sampled by the caller
 * @jumped:       out; set to true iff at least one mount was crossed
 * @count:        optional automount counter handed on to follow_automount()
 * @lookup_flags: LOOKUP_* flags handed on to follow_automount()
 *
 * Returns 0 on success, -ENOENT if the flags of the final dentry say it is
 * negative, or the negative error produced by ->d_manage() or
 * follow_automount() (with -EISDIR turned into 0).
 */
static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
                             int *count, unsigned lookup_flags)
{
        struct vfsmount *mnt = path->mnt;
        bool need_mntput = false;
        int ret = 0;

        while (flags & DCACHE_MANAGED_DENTRY) {
                /* Allow the filesystem to manage the transit without i_mutex
                 * being held. */
                if (flags & DCACHE_MANAGE_TRANSIT) {
                        ret = path->dentry->d_op->d_manage(path, false);
                        // re-sample: the flags may have changed meanwhile
                        flags = smp_load_acquire(&path->dentry->d_flags);
                        if (ret < 0)
                                break;
                }

                if (flags & DCACHE_MOUNTED) {        // something's mounted on it..
                        struct vfsmount *mounted = lookup_mnt(path);
                        if (mounted) {                // ... in our namespace
                                dput(path->dentry);
                                // only drop mounts picked up by this loop;
                                // the mount we were called with is not ours
                                if (need_mntput)
                                        mntput(path->mnt);
                                path->mnt = mounted;
                                path->dentry = dget(mounted->mnt_root);
                                // here we know it's positive
                                flags = path->dentry->d_flags;
                                need_mntput = true;
                                continue;
                        }
                }

                if (!(flags & DCACHE_NEED_AUTOMOUNT))
                        break;

                // uncovered automount point
                ret = follow_automount(path, count, lookup_flags);
                flags = smp_load_acquire(&path->dentry->d_flags);
                if (ret < 0)
                        break;
        }

        // -EISDIR means "stop here and use this path", not a failure
        if (ret == -EISDIR)
                ret = 0;
        // possible if you race with several mount --move
        if (need_mntput && path->mnt == mnt)
                mntput(path->mnt);
        if (!ret && unlikely(d_flags_negative(flags)))
                ret = -ENOENT;
        *jumped = need_mntput;
        return ret;
}

/*
 * Cross any mounts stacked on @path, handling managed dentries.
 *
 * The common case - a dentry without any DCACHE_MANAGED_DENTRY bit - is
 * decided inline; everything else is left to __traverse_mounts().
 * *@jumped tells the caller whether a mount was crossed.
 *
 * Returns 0 on success, -ENOENT for a negative dentry, or the error from
 * __traverse_mounts().
 */
static inline int traverse_mounts(struct path *path, bool *jumped,
                                  int *count, unsigned lookup_flags)
{
        unsigned dflags = smp_load_acquire(&path->dentry->d_flags);

        if (unlikely(dflags & DCACHE_MANAGED_DENTRY))
                return __traverse_mounts(path, dflags, jumped, count,
                                         lookup_flags);

        /* fastpath: nothing managed here, so nothing was crossed */
        *jumped = false;
        return unlikely(d_flags_negative(dflags)) ? -ENOENT : 0;
}

/*
 * Step from @path onto the root of the mount that lookup_mnt() finds for
 * it, if there is one.  The references held by @path are released and
 * replaced by the mount returned from lookup_mnt() plus a new reference
 * on its root dentry.
 *
 * Returns 1 if @path was moved, 0 if nothing was found and @path is
 * unchanged.
 */
int follow_down_one(struct path *path)
{
        struct vfsmount *covering = lookup_mnt(path);

        if (!covering)
                return 0;

        dput(path->dentry);
        mntput(path->mnt);
        path->mnt = covering;
        path->dentry = dget(covering->mnt_root);
        return 1;
}
EXPORT_SYMBOL(follow_down_one);

/*
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
 *
 * @flags is passed to traverse_mounts() as its lookup flags.  If @path ends
 * up on a different mount, the reference on the mount it started on is
 * dropped here.  Returns the result of traverse_mounts().
 */
int follow_down(struct path *path, unsigned int flags)
{
        struct vfsmount *const entry_mnt = path->mnt;
        bool crossed;
        int err;

        err = traverse_mounts(path, &crossed, NULL, flags);
        if (entry_mnt != path->mnt)
                mntput(entry_mnt);
        return err;
}
EXPORT_SYMBOL(follow_down);

/*
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
 *
 * @path is advanced in place; whenever a mount is crossed ND_JUMPED is set
 * in nd->state and nd->next_seq is re-sampled from the new dentry.  No
 * dentry or mount references are taken here.
 *
 * Returns true if @path can be used as it stands, false if the caller has
 * to give up on doing this step in RCU mode (@path and nd->next_seq may
 * have been modified by then).
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path)
{
        struct dentry *dentry = path->dentry;
        unsigned int flags = dentry->d_flags;

        /* Common case: nothing mounted on, or managing, this dentry. */
        if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
                return true;

        /* Managed dentry under LOOKUP_NO_XDEV: let the non-RCU path decide. */
        if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                return false;

        for (;;) {
                /*
                 * Don't forget we might have a non-mountpoint managed dentry
                 * that wants to block transit.
                 */
                if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
                        int res = dentry->d_op->d_manage(path, true);
                        /*
                         * Any non-zero result ends the walk here; only
                         * -EISDIR counts as success.
                         */
                        if (res)
                                return res == -EISDIR;
                        flags = dentry->d_flags;
                }

                if (flags & DCACHE_MOUNTED) {
                        struct mount *mounted = __lookup_mnt(path->mnt, dentry);
                        if (mounted) {
                                path->mnt = &mounted->mnt;
                                dentry = path->dentry = mounted->mnt.mnt_root;
                                nd->state |= ND_JUMPED;
                                nd->next_seq = read_seqcount_begin(&dentry->d_seq);
                                flags = dentry->d_flags;
                                // makes sure that non-RCU pathwalk could reach
                                // this state.
                                if (read_seqretry(&mount_lock, nd->m_seq))
                                        return false;
                                continue;
                        }
                        /* Nothing found: only trust that if mounts are unchanged. */
                        if (read_seqretry(&mount_lock, nd->m_seq))
                                return false;
                }
                /* An automount point that still needs triggering fails here. */
                return !(flags & DCACHE_NEED_AUTOMOUNT);
        }
}

/*
 * Deal with mounts on (or management of) @dentry, reached from nd->path.
 *
 * @path is set to nd->path.mnt plus @dentry and then advanced across
 * mounts.  In RCU mode __follow_mount_rcu() is tried first; if it fails,
 * @path and nd->next_seq are restored, RCU mode is left through
 * try_to_unlazy_next() and the work is redone by traverse_mounts().
 *
 * Returns 0 on success, -ECHILD if RCU mode could not be left, -EXDEV if
 * a mount was crossed while LOOKUP_NO_XDEV is set, or the error from
 * traverse_mounts().  When the traverse_mounts() stage ends with a
 * non-zero result, the dentry reference in @path is dropped, and so is
 * the mount reference if @path->mnt differs from nd->path.mnt.
 */
static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
                          struct path *path)
{
        bool jumped;
        int ret;

        path->mnt = nd->path.mnt;
        path->dentry = dentry;
        if (nd->flags & LOOKUP_RCU) {
                /* Saved so it can be restored if the RCU attempt fails. */
                unsigned int seq = nd->next_seq;
                if (likely(__follow_mount_rcu(nd, path)))
                        return 0;
                // *path and nd->next_seq might've been clobbered
                path->mnt = nd->path.mnt;
                path->dentry = dentry;
                nd->next_seq = seq;
                if (!try_to_unlazy_next(nd, dentry))
                        return -ECHILD;
        }
        ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
        if (jumped) {
                /* Crossing a mount is an error under LOOKUP_NO_XDEV. */
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        ret = -EXDEV;
                else
                        nd->state |= ND_JUMPED;
        }
        if (unlikely(ret)) {
                dput(path->dentry);
                if (path->mnt != nd->path.mnt)
                        mntput(path->mnt);
        }
        return ret;
}

/*
 * This looks up the name in dcache and possibly revalidates the found dentry.
 * NULL is returned if the dentry does not exist in the cache.
 */
static struct dentry *lookup_dcache(const struct qstr *name,
                                    struct dentry *dir,
                                    unsigned int flags)
{
        struct dentry *dentry = d_lookup(dir, name);
        if (dentry) {
                int error = d_revalidate(dentry, flags);
                if (unlikely(error <= 0)) {
                        if (!error)
                                d_invalidate(dentry);
                        dput(dentry);
                        return ERR_PTR(error);
                }
        }
        return dentry;
}

/*
 * Look up @name under @base, first in the dcache and then through the
 * filesystem's ->lookup().
 *
 * The caller holds the inode of the parent directory locked exclusive.
 * This is the one and only case in which ->lookup() is called on a dentry
 * that is not in-lookup - as a matter of fact, it is only called when the
 * directory is guaranteed to have no in-lookup children at all.
 *
 * Returns whatever lookup_dcache() produced if that is non-NULL (a dentry
 * or an ERR_PTR), ERR_PTR(-ENOENT) for a dead directory,
 * ERR_PTR(-ENOMEM) if no dentry could be allocated, and otherwise the new
 * dentry or, if ->lookup() returned something, that instead.
 */
struct dentry *lookup_one_qstr_excl(const struct qstr *name,
                                    struct dentry *base,
                                    unsigned int flags)
{
        struct dentry *child, *replacement;
        struct inode *dir_inode;

        child = lookup_dcache(name, base, flags);
        dir_inode = base->d_inode;
        if (child)
                return child;

        /* Don't create child dentry for a dead directory. */
        if (unlikely(IS_DEADDIR(dir_inode)))
                return ERR_PTR(-ENOENT);

        child = d_alloc(base, name);
        if (unlikely(!child))
                return ERR_PTR(-ENOMEM);

        replacement = dir_inode->i_op->lookup(dir_inode, child, flags);
        if (likely(!replacement))
                return child;

        /* ->lookup() handed back something else: use that one. */
        dput(child);
        return replacement;
}
EXPORT_SYMBOL(lookup_one_qstr_excl);

/*
 * Look up nd->last under nd->path.dentry using the dcache only.
 *
 * Returns the dentry if it was found and d_revalidate() returned a
 * positive result, NULL if it is not in the dcache or d_revalidate()
 * returned 0, or an ERR_PTR(): -ECHILD when an RCU-mode lookup cannot be
 * trusted or RCU mode cannot be left, otherwise the error from
 * d_revalidate().
 */
static struct dentry *lookup_fast(struct nameidata *nd)
{
        struct dentry *dentry, *parent = nd->path.dentry;
        int status = 1;

        /*
         * Rename seqlock is not required here because in the off chance
         * of a false negative due to a concurrent rename, the caller is
         * going to fall back to non-racy lookup.
         */
        if (nd->flags & LOOKUP_RCU) {
                dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq);
                if (unlikely(!dentry)) {
                        /* Miss: leave RCU mode before reporting it. */
                        if (!try_to_unlazy(nd))
                                return ERR_PTR(-ECHILD);
                        return NULL;
                }

                /*
                 * This sequence count validates that the parent had no
                 * changes while we did the lookup of the dentry above.
                 */
                if (read_seqcount_retry(&parent->d_seq, nd->seq))
                        return ERR_PTR(-ECHILD);

                status = d_revalidate(dentry, nd->flags);
                if (likely(status > 0))
                        return dentry;
                /* Not valid (or undecided): continue outside RCU mode. */
                if (!try_to_unlazy_next(nd, dentry))
                        return ERR_PTR(-ECHILD);
                if (status == -ECHILD)
                        /* we'd been told to redo it in non-rcu mode */
                        status = d_revalidate(dentry, nd->flags);
        } else {
                dentry = __d_lookup(parent, &nd->last);
                if (unlikely(!dentry))
                        return NULL;
                status = d_revalidate(dentry, nd->flags);
        }
        if (unlikely(status <= 0)) {
                /* 0 means stale: invalidate.  ERR_PTR(0) is NULL. */
                if (!status)
                        d_invalidate(dentry);
                dput(dentry);
                return ERR_PTR(status);
        }
        return dentry;
}

/*
 * Fast lookup failed, do it the slow way.
 *
 * A dentry for @name under @dir is obtained from d_alloc_parallel().  If
 * it is still in-lookup, the directory's ->lookup() is called and the
 * lookup is then marked done.  Otherwise the dentry is revalidated, and
 * the whole procedure is repeated if it turns out to be stale.
 *
 * Returns the dentry, the one returned by ->lookup() if it returned one,
 * or an ERR_PTR(): -ENOENT for a dead directory, otherwise the error from
 * d_alloc_parallel() or d_revalidate().
 */
static struct dentry *__lookup_slow(const struct qstr *name,
                                    struct dentry *dir,
                                    unsigned int flags)
{
        struct inode *dir_inode = dir->d_inode;
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

        /* Don't go there if it's already dead */
        if (unlikely(IS_DEADDIR(dir_inode)))
                return ERR_PTR(-ENOENT);

        for (;;) {
                struct dentry *dentry, *replacement;
                int status;

                dentry = d_alloc_parallel(dir, name, &wq);
                if (IS_ERR(dentry))
                        return dentry;

                if (likely(d_in_lookup(dentry))) {
                        /* The lookup is ours to perform. */
                        replacement = dir_inode->i_op->lookup(dir_inode,
                                                              dentry, flags);
                        d_lookup_done(dentry);
                        if (likely(!replacement))
                                return dentry;
                        dput(dentry);
                        return replacement;
                }

                /* Already looked up: check it is still good. */
                status = d_revalidate(dentry, flags);
                if (likely(status > 0))
                        return dentry;
                if (status < 0) {
                        dput(dentry);
                        return ERR_PTR(status);
                }

                /* Stale: throw it away and try again. */
                d_invalidate(dentry);
                dput(dentry);
        }
}

/*
 * Run __lookup_slow() for @name under @dir with the directory inode held
 * locked shared for the duration of the call.
 */
static struct dentry *lookup_slow(const struct qstr *name,
                                  struct dentry *dir,
                                  unsigned int flags)
{
        struct inode *dir_inode = dir->d_inode;
        struct dentry *found;

        inode_lock_shared(dir_inode);
        found = __lookup_slow(name, dir, flags);
        inode_unlock_shared(dir_inode);

        return found;
}

/*
 * Check MAY_EXEC permission on nd->inode.
 *
 * In RCU mode the check is first made with MAY_NOT_BLOCK.  If that does
 * not succeed, RCU mode is left; the check is then repeated without
 * MAY_NOT_BLOCK only if the first attempt returned -ECHILD.
 *
 * Returns 0 if permitted, -ECHILD if RCU mode could not be left, or the
 * error from inode_permission().
 */
static inline int may_lookup(struct mnt_idmap *idmap,
                             struct nameidata *nd)
{
        int err;

        if (!(nd->flags & LOOKUP_RCU))
                return inode_permission(idmap, nd->inode, MAY_EXEC);

        err = inode_permission(idmap, nd->inode, MAY_EXEC | MAY_NOT_BLOCK);
        if (!err)
                return 0;                /* success, keep going */
        if (!try_to_unlazy(nd))
                return -ECHILD;                /* redo it all non-lazy */
        if (err != -ECHILD)
                return err;                /* hard error */

        return inode_permission(idmap, nd->inode, MAY_EXEC);
}

/*
 * Account for one more link and make sure nd->stack can take it.
 *
 * nd->total_link_count is bumped on every call; -ELOOP is returned once
 * it had already reached MAXSYMLINKS.  nd_alloc_stack() is only needed
 * when nd->depth equals EMBEDDED_LEVELS while nd->stack still points at
 * nd->internal.  If that allocation fails in RCU mode, it is retried
 * after leaving RCU mode.
 *
 * Returns 0 on success, -ELOOP, -ECHILD if RCU mode could not be left or
 * @link could not be grabbed, or -ENOMEM.
 */
static int reserve_stack(struct nameidata *nd, struct path *link)
{
        bool have_link;

        if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
                return -ELOOP;

        if (likely(nd->depth != EMBEDDED_LEVELS) ||
            likely(nd->stack != nd->internal))
                return 0;
        if (likely(nd_alloc_stack(nd)))
                return 0;

        if (!(nd->flags & LOOKUP_RCU))
                return -ENOMEM;

        /*
         * We need to grab link before we do unlazy.  And we can't skip
         * unlazy even if we fail to grab the link - cleanup needs it.
         */
        have_link = legitimize_path(nd, link, nd->next_seq);
        if (!try_to_unlazy(nd) || !have_link)
                return -ECHILD;

        return nd_alloc_stack(nd) ? 0 : -ENOMEM;
}

/*
 * Flag bits for walk_component(), step_into() and pick_link():
 *
 * WALK_TRAILING - step_into() follows a symlink only if LOOKUP_FOLLOW is
 *                 set in nd->flags, and pick_link() additionally calls
 *                 may_follow_link().
 * WALK_MORE     - walk_component() does not call put_link(); without this
 *                 flag it does so whenever nd->depth is non-zero.
 * WALK_NOFOLLOW - step_into() never follows a symlink.
 */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};

/*
 * Push the symlink @link (whose inode is @inode) onto nd's link stack and
 * fetch its body.
 *
 * Returns a pointer to the part of the link body that remains to be
 * walked (leading slashes of an absolute link are consumed by jumping to
 * the root), NULL if nothing is left to walk - in which case the link has
 * already been popped again with put_link() - or an ERR_PTR() on failure.
 */
static const char *pick_link(struct nameidata *nd, struct path *link,
                     struct inode *inode, int flags)
{
        struct saved *last;
        const char *res;
        int error = reserve_stack(nd, link);

        if (unlikely(error)) {
                /* Outside RCU mode, release @link before bailing out. */
                if (!(nd->flags & LOOKUP_RCU))
                        path_put(link);
                return ERR_PTR(error);
        }
        /* Claim the next stack slot and record the link in it. */
        last = nd->stack + nd->depth++;
        last->link = *link;
        clear_delayed_call(&last->done);
        last->seq = nd->next_seq;

        if (flags & WALK_TRAILING) {
                error = may_follow_link(nd, inode);
                if (unlikely(error))
                        return ERR_PTR(error);
        }

        /* Following symlinks is disabled for this lookup or this mount. */
        if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
                        unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
                return ERR_PTR(-ELOOP);

        if (!(nd->flags & LOOKUP_RCU)) {
                touch_atime(&last->link);
                cond_resched();
        } else if (atime_needs_update(&last->link, inode)) {
                /* Leave RCU mode before updating atime. */
                if (!try_to_unlazy(nd))
                        return ERR_PTR(-ECHILD);
                touch_atime(&last->link);
        }

        error = security_inode_follow_link(link->dentry, inode,
                                           nd->flags & LOOKUP_RCU);
        if (unlikely(error))
                return ERR_PTR(error);

        /* Use the body stored in the inode if there is one ... */
        res = READ_ONCE(inode->i_link);
        if (!res) {
                /* ... otherwise ask the filesystem for it. */
                const char * (*get)(struct dentry *, struct inode *,
                                struct delayed_call *);
                get = inode->i_op->get_link;
                if (nd->flags & LOOKUP_RCU) {
                        /*
                         * RCU mode: call with a NULL dentry first; on
                         * -ECHILD leave RCU mode and retry with the dentry.
                         */
                        res = get(NULL, inode, &last->done);
                        if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
                                res = get(link->dentry, inode, &last->done);
                } else {
                        res = get(link->dentry, inode, &last->done);
                }
                if (!res)
                        goto all_done;
                if (IS_ERR(res))
                        return res;
        }
        if (*res == '/') {
                /* Absolute link: restart from the root, skip the slashes. */
                error = nd_jump_root(nd);
                if (unlikely(error))
                        return ERR_PTR(error);
                while (unlikely(*++res == '/'))
                        ;
        }
        if (*res)
                return res;
all_done: // pure jump
        put_link(nd);
        return NULL;
}

/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 *
 * NOTE: dentry must be what nd->next_seq had been sampled from.
 *
 * Mounts on @dentry are handled first.  If the result is not a symlink,
 * or is one that must not be followed (WALK_NOFOLLOW, or WALK_TRAILING
 * without LOOKUP_FOLLOW), nd->path, nd->inode and nd->seq are moved to it
 * and NULL is returned.  Otherwise the result of pick_link() is returned.
 * Failures are reported as ERR_PTR().
 */
static const char *step_into(struct nameidata *nd, int flags,
                     struct dentry *dentry)
{
        struct path path;
        struct inode *inode;
        int err = handle_mounts(nd, dentry, &path);

        if (err < 0)
                return ERR_PTR(err);
        inode = path.dentry->d_inode;
        if (likely(!d_is_symlink(path.dentry)) ||
           ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
           (flags & WALK_NOFOLLOW)) {
                /* not a symlink or should not follow */
                if (nd->flags & LOOKUP_RCU) {
                        /* The inode read above is only good if d_seq held. */
                        if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
                                return ERR_PTR(-ECHILD);
                        if (unlikely(!inode))
                                return ERR_PTR(-ENOENT);
                } else {
                        /* Drop the old position; @path takes its place. */
                        dput(nd->path.dentry);
                        if (nd->path.mnt != path.mnt)
                                mntput(nd->path.mnt);
                }
                nd->path = path;
                nd->inode = inode;
                nd->seq = nd->next_seq;
                return NULL;
        }
        if (nd->flags & LOOKUP_RCU) {
                /* make sure that d_is_symlink above matches inode */
                if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
                        return ERR_PTR(-ECHILD);
        } else {
                /*
                 * Same mount as nd->path: take a mount reference of our
                 * own for the link handed to pick_link().
                 */
                if (path.mnt == nd->path.mnt)
                        mntget(path.mnt);
        }
        return pick_link(nd, &path, inode, flags);
}

/*
 * Handle ".." in RCU mode.
 *
 * Returns the dentry that the caller should step into, with nd->next_seq
 * set for it; no references are taken.  At nd->root (or when no
 * mountpoint can be chosen above the current mount) that is
 * nd->path.dentry itself.  Every failure is reported as ERR_PTR(-ECHILD).
 */
static struct dentry *follow_dotdot_rcu(struct nameidata *nd)
{
        struct dentry *parent, *old;

        if (path_equal(&nd->path, &nd->root))
                goto in_root;
        if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
                /* At the root of a mount: move to its mountpoint first. */
                struct path path;
                unsigned seq;
                if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
                                           &nd->root, &path, &seq))
                        goto in_root;
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        return ERR_PTR(-ECHILD);
                nd->path = path;
                nd->inode = path.dentry->d_inode;
                nd->seq = seq;
                // makes sure that non-RCU pathwalk could reach this state
                if (read_seqretry(&mount_lock, nd->m_seq))
                        return ERR_PTR(-ECHILD);
                /* we know that mountpoint was pinned */
        }
        old = nd->path.dentry;
        parent = old->d_parent;
        nd->next_seq = read_seqcount_begin(&parent->d_seq);
        // makes sure that non-RCU pathwalk could reach this state
        if (read_seqcount_retry(&old->d_seq, nd->seq))
                return ERR_PTR(-ECHILD);
        if (unlikely(!path_connected(nd->path.mnt, parent)))
                return ERR_PTR(-ECHILD);
        return parent;
in_root:
        if (read_seqretry(&mount_lock, nd->m_seq))
                return ERR_PTR(-ECHILD);
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return ERR_PTR(-ECHILD);
        /* ".." at the root stays put: reuse the current dentry and seq. */
        nd->next_seq = nd->seq;
        return nd->path.dentry;
}

static struct dentry *follow_dotdot(struct nameidata *nd)
{
        struct dentry *parent;

        if (path_equal(&nd->path, &nd->root))
                goto in_root;
        if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
                struct path path;

                if (!choose_mountpoint(real_mount(nd->path.mnt),
                                       &nd->root, &path))
                        goto in_root;
                path_put(&nd->path);
                nd->path = path;
                nd->inode = path.dentry->d_inode;
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        return ERR_PTR(-EXDEV);
        }
        /* rare case of legitimate dget_parent()... */
        parent = dget_parent(nd->path.dentry);
        if (unlikely(!path_connected(nd->path.mnt, parent))) {
                dput(parent);
                return ERR_PTR(-ENOENT);
        }
        return parent;

in_root:
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return ERR_PTR(-EXDEV);
        return dget(nd->path.dentry);
}

/*
 * Handle a component that is not a normal name.  Only LAST_DOTDOT
 * requires any work: the root is set up if it has not been yet, the
 * parent is found with follow_dotdot_rcu() or follow_dotdot() and then
 * stepped into without following symlinks.
 *
 * Returns NULL on success or an ERR_PTR(); under LOOKUP_IS_SCOPED that
 * includes -EAGAIN when a mount or rename raced with the walk.
 */
static const char *handle_dots(struct nameidata *nd, int type)
{
        struct dentry *parent;
        const char *res;

        /* Anything other than ".." needs no work here. */
        if (type != LAST_DOTDOT)
                return NULL;

        if (!nd->root.mnt) {
                res = ERR_PTR(set_root(nd));
                if (res)
                        return res;
        }

        parent = (nd->flags & LOOKUP_RCU) ? follow_dotdot_rcu(nd)
                                          : follow_dotdot(nd);
        if (IS_ERR(parent))
                return ERR_CAST(parent);

        res = step_into(nd, WALK_NOFOLLOW, parent);
        if (unlikely(res))
                return res;

        if (likely(!(nd->flags & LOOKUP_IS_SCOPED)))
                return NULL;

        /*
         * If there was a racing rename or mount along our path, then we
         * can't be sure that ".." hasn't jumped above nd->root (and so
         * userspace should retry or use some fallback).
         */
        smp_rmb();
        if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))
                return ERR_PTR(-EAGAIN);
        if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))
                return ERR_PTR(-EAGAIN);
        return NULL;
}

static const char *walk_component(struct nameidata *nd, int flags)
{
        struct dentry *dentry;
        /*
         * "." and ".." are special - ".." especially so because it has
         * to be able to know about the current root directory and
         * parent relationships.
         */
        if (unlikely(nd->last_type != LAST_NORM)) {
                if (!(flags & WALK_MORE) && nd->depth)
                        put_link(nd);
                return handle_dots(nd, nd->last_type);
        }
        dentry = lookup_fast(nd);
        if (IS_ERR(dentry))
                return ERR_CAST(dentry);
        if (unlikely(!dentry)) {
                dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
                if (IS_ERR(dentry))
                        return ERR_CAST(dentry);
        }
        if (!(flags & WALK_MORE) && nd->depth)
                put_link(nd);
        return step_into(nd, flags, dentry);
}

/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

#include <asm/word-at-a-time.h>

#ifdef HASH_MIX

/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */

#elif defined(CONFIG_64BIT)
/*
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
 *
 * HASH_MIX(x, y, a) folds the input word @a into the two state words @x
 * and @y, updating both in place.  @x and @y each appear several times
 * in the expansion, so they must be plain lvalues without side effects.
 */
#define HASH_MIX(x, y, a)        \
        (        x ^= (a),        \
        y ^= x,        x = rol64(x,12),\
        x += y,        y = rol64(y,45),\
        y *= 9                        )

/*
 * Collapse the two state words into a single 32-bit hash value: multiply
 * @x by GOLDEN_RATIO_64, xor that into @y, multiply by GOLDEN_RATIO_64
 * again and keep the upper 32 bits.  This must be fast, but latency isn't
 * quite as critical, as there is a fair bit of additional work done
 * before the hash value is used.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
        unsigned long mixed = y ^ (x * GOLDEN_RATIO_64);

        mixed *= GOLDEN_RATIO_64;
        return mixed >> 32;
}

#else        /* 32-bit case */

/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 *
 * HASH_MIX(x, y, a) folds the input word @a into the two state words @x
 * and @y, updating both in place.  @x and @y each appear several times
 * in the expansion, so they must be plain lvalues without side effects.
 */
#define HASH_MIX(x, y, a)        \
        (        x ^= (a),        \
        y ^= x,        x = rol32(x, 7),\
        x += y,        y = rol32(y,20),\
        y *= 9                        )

/*
 * Collapse the two state words into one hash value: hash @x, xor the
 * result into @y and hash that once more.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
        /* Use arch-optimized multiply if one exists */
        unsigned long folded = y ^ __hash_32(x);

        return __hash_32(folded);
}

#endif

/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, the last step hashes a final word holding the remaining
 * payload bytes (fewer than a full word), to match the way hash_name()
 * iterates until it finds the delimiter after the name.  When @len is an
 * exact multiple of the word size no partial word is mixed in.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
        unsigned long x = 0, y = (unsigned long)salt;

        while (len) {
                unsigned long word = load_unaligned_zeropad(name);

                if (len < sizeof(unsigned long)) {
                        /* Trailing partial word: keep only the payload. */
                        x ^= word & bytemask_from_count(len);
                        break;
                }
                HASH_MIX(x, y, word);
                name += sizeof(unsigned long);
                len -= sizeof(unsigned long);
        }
        return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);

/*
 * Return the "hash_len" (hash and length) of a null-terminated string.
 * The string is consumed one word at a time until a word containing a
 * zero byte is found; only the bytes before that zero are hashed.
 */
u64 hashlen_string(const void *salt, const char *name)
{
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
        unsigned long x = 0, y = (unsigned long)salt;
        unsigned long word, zdata, mask;
        unsigned long len = 0;

        for (;;) {
                word = load_unaligned_zeropad(name + len);
                if (has_zero(word, &zdata, &constants))
                        break;
                HASH_MIX(x, y, word);
                len += sizeof(unsigned long);
        }

        /* fold in the bytes of the last word that precede the terminator */
        zdata = prep_zero_mask(word, zdata, &constants);
        mask = create_zero_mask(zdata);
        x ^= word & zero_bytemask(mask);

        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
EXPORT_SYMBOL(hashlen_string);

/*
 * Calculate the length and hash of the path component, and
 * return the "hash_len" as the result.
 *
 * The component ends at the first NUL or '/' byte.  'salt' seeds the
 * second state word; the name is read a word at a time with
 * load_unaligned_zeropad().
 */
static inline u64 hash_name(const void *salt, const char *name)
{
        unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
        unsigned long adata, bdata, mask, len;
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

        len = 0;
        /* enter the loop at the load so the first word is tested before any mixing */
        goto inside;

        do {
                /* previous word contained no delimiter: mix all of it in */
                HASH_MIX(x, y, a);
                len += sizeof(unsigned long);
inside:
                a = load_unaligned_zeropad(name+len);
                /* bytes equal to '/' in a become zero bytes in b */
                b = a ^ REPEAT_BYTE('/');
                /*
                 * Bitwise '|' (not '||') on purpose: both has_zero() calls
                 * must run so that adata and bdata are both filled in for
                 * the prep_zero_mask() calls below.
                 */
        } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));

        adata = prep_zero_mask(a, adata, &constants);
        bdata = prep_zero_mask(b, bdata, &constants);
        /* combined mask: position of the first NUL or '/' in the last word */
        mask = create_zero_mask(adata | bdata);
        /* fold in only the bytes that precede that delimiter */
        x ^= a & zero_bytemask(mask);

        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}

#else        /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */

/*
 * Return the hash of a string of known length, feeding it to
 * partial_name_hash() one byte at a time.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
        unsigned long hash = init_name_hash(salt);
        unsigned int i;

        /* bytes are hashed as unsigned values */
        for (i = 0; i < len; i++)
                hash = partial_name_hash((unsigned char)name[i], hash);

        return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);

/*
 * Return the "hash_len" (hash and length) of a null-terminated string,
 * hashing one byte at a time up to (not including) the terminating NUL.
 */
u64 hashlen_string(const void *salt, const char *name)
{
        unsigned long hash = init_name_hash(salt);
        unsigned long len, c;

        for (len = 0; (c = (unsigned char)name[len]) != 0; len++)
                hash = partial_name_hash(c, hash);

        return hashlen_create(end_name_hash(hash), len);
}
EXPORT_SYMBOL(hashlen_string);

/*
 * Hash one path component and return its "hash_len".
 *
 * We know there's a real path component here of at least one character,
 * so the first byte is hashed unconditionally; hashing then continues
 * until a NUL or '/' is reached.
 */
static inline u64 hash_name(const void *salt, const char *name)
{
        unsigned long hash = init_name_hash(salt);
        unsigned long len = 0;
        unsigned long c = (unsigned char)name[0];

        for (;;) {
                hash = partial_name_hash(c, hash);
                c = (unsigned char)name[++len];
                if (!c || c == '/')
                        break;
        }
        return hashlen_create(end_name_hash(hash), len);
}

#endif

/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Every component except the last one of the outermost pathname is walked
 * with walk_component(); the last component is only recorded in nd->last
 * and nd->last_type for the caller.  When walk_component() hands back a
 * symlink body, the unwalked rest of the current name is saved in
 * nd->stack[] and the walk continues inside the link.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
        int depth = 0; // depth <= nd->depth
        int err;

        nd->last_type = LAST_ROOT;
        nd->flags |= LOOKUP_PARENT;
        /* callers may pass an ERR_PTR() instead of a name; hand the error back */
        if (IS_ERR(name))
                return PTR_ERR(name);
        /* skip leading slashes */
        while (*name=='/')
                name++;
        /* nothing left to walk: last_type stays LAST_ROOT */
        if (!*name) {
                nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
                return 0;
        }

        /* At this point we know we have a real path component. */
        for(;;) {
                struct mnt_idmap *idmap;
                const char *link;
                u64 hash_len;
                int type;

                /* permission to search the current directory */
                idmap = mnt_idmap(nd->path.mnt);
                err = may_lookup(idmap, nd);
                if (err)
                        return err;

                /* hash and measure this component, salted with the parent dentry */
                hash_len = hash_name(nd->path.dentry, name);

                /* only a name starting with '.' can be "." or ".."; decide by length */
                type = LAST_NORM;
                if (name[0] == '.') switch (hashlen_len(hash_len)) {
                        case 2:
                                if (name[1] == '.') {
                                        type = LAST_DOTDOT;
                                        nd->state |= ND_JUMPED;
                                }
                                break;
                        case 1:
                                type = LAST_DOT;
                }
                if (likely(type == LAST_NORM)) {
                        struct dentry *parent = nd->path.dentry;
                        nd->state &= ~ND_JUMPED;
                        /* let the filesystem's ->d_hash() replace the hash/length/name */
                        if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
                                struct qstr this = { { .hash_len = hash_len }, .name = name };
                                err = parent->d_op->d_hash(parent, &this);
                                if (err < 0)
                                        return err;
                                hash_len = this.hash_len;
                                name = this.name;
                        }
                }

                /* record the component for walk_component() and for our caller */
                nd->last.hash_len = hash_len;
                nd->last.name = name;
                nd->last_type = type;

                /* step past the component itself */
                name += hashlen_len(hash_len);
                if (!*name)
                        goto OK;
                /*
                 * If it wasn't NUL, we know it was '/'. Skip that
                 * slash, and continue until no more slashes.
                 */
                do {
                        name++;
                } while (unlikely(*name == '/'));
                if (unlikely(!*name)) {
OK:
                        /* pathname or trailing symlink, done */
                        if (!depth) {
                                /* no saved outer names: end of the original pathname */
                                nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode);
                                nd->dir_mode = nd->inode->i_mode;
                                nd->flags &= ~LOOKUP_PARENT;
                                return 0;
                        }
                        /* last component of nested symlink */
                        /* pop the name that was saved when this link was entered */
                        name = nd->stack[--depth].name;
                        link = walk_component(nd, 0);
                } else {
                        /* not the last component */
                        link = walk_component(nd, WALK_MORE);
                }
                if (unlikely(link)) {
                        if (IS_ERR(link))
                                return PTR_ERR(link);
                        /* a symlink to follow */
                        /* save where to resume, then walk the link body */
                        nd->stack[depth++].name = name;
                        name = link;
                        continue;
                }
                /* more components follow, so we must now be in something we can look up in */
                if (unlikely(!d_can_lookup(nd->path.dentry))) {
                        if (nd->flags & LOOKUP_RCU) {
                                /* -ECHILD when try_to_unlazy() reports failure */
                                if (!try_to_unlazy(nd))
                                        return -ECHILD;
                        }
                        return -ENOTDIR;
                }
        }
}

/*
 * Prepare nd for a walk: store the lookup flags, sample the mount and
 * rename sequence counts and choose the starting point (a preset root,
 * the root via nd_jump_root(), the current working directory, or the
 * file behind nd->dfd).
 *
 * Returns the pathname string to walk, or an ERR_PTR() on failure.
 *
 * must be paired with terminate_walk()
 */
static const char *path_init(struct nameidata *nd, unsigned flags)
{
        int error;
        const char *s = nd->name->name;

        /* LOOKUP_CACHED requires RCU, ask caller to retry */
        if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
                return ERR_PTR(-EAGAIN);

        /* an empty pathname is never walked in RCU mode */
        if (!*s)
                flags &= ~LOOKUP_RCU;
        if (flags & LOOKUP_RCU)
                rcu_read_lock();
        else
                nd->seq = nd->next_seq = 0;

        nd->flags = flags;
        nd->state |= ND_JUMPED;

        /* sample the mount and rename sequence counts before picking a start */
        nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
        nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
        smp_rmb();

        /* nd->root was supplied up front: start from it */
        if (nd->state & ND_ROOT_PRESET) {
                struct dentry *root = nd->root.dentry;
                struct inode *inode = root->d_inode;
                /* a non-empty path needs something we can look names up in */
                if (*s && unlikely(!d_can_lookup(root)))
                        return ERR_PTR(-ENOTDIR);
                nd->path = nd->root;
                nd->inode = inode;
                /* RCU mode only samples d_seq; otherwise take a reference */
                if (flags & LOOKUP_RCU) {
                        nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
                        nd->root_seq = nd->seq;
                } else {
                        path_get(&nd->path);
                }
                return s;
        }

        nd->root.mnt = NULL;

        /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
        if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
                error = nd_jump_root(nd);
                if (unlikely(error))
                        return ERR_PTR(error);
                return s;
        }

        /* Relative pathname -- get the starting-point it is relative to. */
        if (nd->dfd == AT_FDCWD) {
                if (flags & LOOKUP_RCU) {
                        struct fs_struct *fs = current->fs;
                        unsigned seq;

                        /* repeat until fs->seq shows no change while pwd was read */
                        do {
                                seq = read_seqcount_begin(&fs->seq);
                                nd->path = fs->pwd;
                                nd->inode = nd->path.dentry->d_inode;
                                nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
                        } while (read_seqcount_retry(&fs->seq, seq));
                } else {
                        get_fs_pwd(current->fs, &nd->path);
                        nd->inode = nd->path.dentry->d_inode;
                }
        } else {
                /* Caller must check execute permissions on the starting path component */
                struct fd f = fdget_raw(nd->dfd);
                struct dentry *dentry;

                if (!f.file)
                        return ERR_PTR(-EBADF);

                /*
                 * LOOKUP_LINKAT_EMPTY: the file must have been opened with the
                 * current credentials, or the caller needs CAP_DAC_READ_SEARCH
                 * in the user namespace of the opener's credentials.
                 */
                if (flags & LOOKUP_LINKAT_EMPTY) {
                        if (f.file->f_cred != current_cred() &&
                            !ns_capable(f.file->f_cred->user_ns, CAP_DAC_READ_SEARCH)) {
                                fdput(f);
                                return ERR_PTR(-ENOENT);
                        }
                }

                dentry = f.file->f_path.dentry;

                /* a non-empty relative path needs something we can look names up in */
                if (*s && unlikely(!d_can_lookup(dentry))) {
                        fdput(f);
                        return ERR_PTR(-ENOTDIR);
                }

                nd->path = f.file->f_path;
                /* RCU mode only samples d_seq; otherwise take a reference */
                if (flags & LOOKUP_RCU) {
                        nd->inode = nd->path.dentry->d_inode;
                        nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
                } else {
                        path_get(&nd->path);
                        nd->inode = nd->path.dentry->d_inode;
                }
                fdput(f);
        }

        /* For scoped-lookups we need to set the root to the dirfd as well. */
        if (flags & LOOKUP_IS_SCOPED) {
                nd->root = nd->path;
                if (flags & LOOKUP_RCU) {
                        nd->root_seq = nd->seq;
                } else {
                        /* remember that a reference on nd->root was taken here */
                        path_get(&nd->root);
                        nd->state |= ND_ROOT_GRABBED;
                }
        }
        return s;
}

/*
 * Walk the final component recorded in nd->last.  If a normal component
 * is followed by more characters in the name string (link_path_walk()
 * only stops on NUL or '/', so presumably trailing slashes), the lookup
 * is made to follow symlinks and to require a directory.
 */
static inline const char *lookup_last(struct nameidata *nd)
{
        if (nd->last_type == LAST_NORM) {
                if (nd->last.name[nd->last.len])
                        nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
        }

        return walk_component(nd, WALK_TRAILING);
}

/*
 * Step into nd->path.dentry itself via step_into() with WALK_NOFOLLOW.
 * Outside RCU mode an extra dentry reference is taken first
 * (NOTE(review): presumably consumed by step_into() - confirm).
 * Returns PTR_ERR() of whatever step_into() hands back.
 */
static int handle_lookup_down(struct nameidata *nd)
{
        const void *res;

        if (!(nd->flags & LOOKUP_RCU))
                dget(nd->path.dentry);
        nd->next_seq = nd->seq;

        res = step_into(nd, WALK_NOFOLLOW, nd->path.dentry);
        return PTR_ERR(res);
}

/*
 * Perform one complete lookup attempt with the given flags.
 *
 * Returns 0 on success, with the resulting path moved out of nd into
 * *path; returns a negative error otherwise.  terminate_walk() is called
 * on every exit.
 */
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
{
        const char *s = path_init(nd, flags);
        int err;

        if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
                err = handle_lookup_down(nd);
                if (unlikely(err < 0))
                        s = ERR_PTR(err);
        }

        /*
         * Walk everything but the last component, then the last one;
         * lookup_last() hands back a non-NULL string when there is more
         * to walk (an ERR_PTR() is caught by the next link_path_walk()).
         */
        for (;;) {
                err = link_path_walk(s, nd);
                if (err)
                        break;
                s = lookup_last(nd);
                if (!s)
                        break;
        }

        if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
                err = handle_lookup_down(nd);
                nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
        }
        if (!err)
                err = complete_walk(nd);

        if (!err && (nd->flags & LOOKUP_DIRECTORY) &&
            !d_can_lookup(nd->path.dentry))
                err = -ENOTDIR;

        if (!err) {
                /* hand the result over and clear nd->path before terminate_walk() */
                *path = nd->path;
                nd->path.mnt = NULL;
                nd->path.dentry = NULL;
        }
        terminate_walk(nd);
        return err;
}

/*
 * Look up @name relative to @dfd (or @root, when given) and store the
 * result in @path.  The walk is first tried with LOOKUP_RCU, repeated
 * without it on -ECHILD, and repeated with LOOKUP_REVAL on -ESTALE.
 * An ERR_PTR() @name is turned into its error code.  Returns 0 or a
 * negative error.
 */
int filename_lookup(int dfd, struct filename *name, unsigned flags,
                    struct path *path, struct path *root)
{
        struct nameidata nd;
        int err;

        if (IS_ERR(name))
                return PTR_ERR(name);

        set_nameidata(&nd, dfd, name, root);

        err = path_lookupat(&nd, flags | LOOKUP_RCU, path);
        if (unlikely(err == -ECHILD))
                err = path_lookupat(&nd, flags, path);
        if (unlikely(err == -ESTALE))
                err = path_lookupat(&nd, flags | LOOKUP_REVAL, path);

        if (likely(!err)) {
                if (flags & LOOKUP_MOUNTPOINT)
                        audit_inode(name, path->dentry, AUDIT_INODE_NOEVAL);
                else
                        audit_inode(name, path->dentry, 0);
        }

        restore_nameidata();
        return err;
}

/*
 * Walk to the parent of the last component only.
 *
 * Returns 0 on success, with the parent's path moved out of nd into
 * *parent; returns a negative error otherwise.  terminate_walk() is
 * called on every exit.
 */
static int path_parentat(struct nameidata *nd, unsigned flags,
                                struct path *parent)
{
        const char *s = path_init(nd, flags);
        int err;

        err = link_path_walk(s, nd);
        if (err)
                goto out;

        err = complete_walk(nd);
        if (err)
                goto out;

        /* hand the result over and clear nd->path before terminate_walk() */
        *parent = nd->path;
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
out:
        terminate_walk(nd);
        return err;
}

/*
 * Resolve the parent directory of @name.  On success *parent holds the
 * parent's path, *last the final component and *type its LAST_* type.
 * The walk is first tried with LOOKUP_RCU, repeated without it on
 * -ECHILD, and repeated with LOOKUP_REVAL on -ESTALE.
 *
 * Note: this does not consume "name"
 */
static int __filename_parentat(int dfd, struct filename *name,
                               unsigned int flags, struct path *parent,
                               struct qstr *last, int *type,
                               const struct path *root)
{
        struct nameidata nd;
        int err;

        if (IS_ERR(name))
                return PTR_ERR(name);

        set_nameidata(&nd, dfd, name, root);

        err = path_parentat(&nd, flags | LOOKUP_RCU, parent);
        if (unlikely(err == -ECHILD))
                err = path_parentat(&nd, flags, parent);
        if (unlikely(err == -ESTALE))
                err = path_parentat(&nd, flags | LOOKUP_REVAL, parent);

        if (likely(err == 0)) {
                *last = nd.last;
                *type = nd.last_type;
                audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
        }

        restore_nameidata();
        return err;
}

/*
 * Convenience wrapper: __filename_parentat() with no preset root
 * (NULL is passed as the root argument).
 */
static int filename_parentat(int dfd, struct filename *name,
                             unsigned int flags, struct path *parent,
                             struct qstr *last, int *type)
{
        return __filename_parentat(dfd, name, flags, parent, last, type, NULL);
}

/*
 * Resolve the parent of @name into *path, lock the parent's inode
 * (I_MUTEX_PARENT) and look the last component up under that lock.
 *
 * Returns the looked-up dentry with the parent still locked and *path
 * still held.  On any failure an ERR_PTR() is returned with the parent
 * unlocked and *path released.  A last component that is not LAST_NORM
 * yields -EINVAL.
 */
static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path)
{
        struct dentry *child;
        struct qstr last;
        int type;
        int error;

        error = filename_parentat(dfd, name, 0, path, &last, &type);
        if (error)
                return ERR_PTR(error);

        if (unlikely(type != LAST_NORM)) {
                path_put(path);
                return ERR_PTR(-EINVAL);
        }

        inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
        child = lookup_one_qstr_excl(&last, path->dentry, 0);
        if (!IS_ERR(child))
                return child;

        /* lookup failed: undo the lock and the path reference */
        inode_unlock(path->dentry->d_inode);
        path_put(path);
        return child;
}

/*
 * Kernel-string front end of __kern_path_locked(), relative to AT_FDCWD.
 * The struct filename obtained from getname_kernel() is released again
 * before returning.
 */
struct dentry *kern_path_locked(const char *name, struct path *path)
{
        struct filename *fname;
        struct dentry *dentry;

        fname = getname_kernel(name);
        dentry = __kern_path_locked(AT_FDCWD, fname, path);
        putname(fname);

        return dentry;
}

/*
 * User-string front end of __kern_path_locked(), relative to @dfd.
 * The struct filename obtained from getname() is released again before
 * returning.
 */
struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path)
{
        struct filename *fname;
        struct dentry *dentry;

        fname = getname(name);
        dentry = __kern_path_locked(dfd, fname, path);
        putname(fname);

        return dentry;
}
EXPORT_SYMBOL(user_path_locked_at);

/*
 * Look up the kernel string @name relative to AT_FDCWD and store the
 * result in @path.  Returns whatever filename_lookup() returns.
 */
int kern_path(const char *name, unsigned int flags, struct path *path)
{
        struct filename *fname;
        int err;

        fname = getname_kernel(name);
        err = filename_lookup(AT_FDCWD, fname, flags, path, NULL);
        putname(fname);

        return err;
}
EXPORT_SYMBOL(kern_path);

/**
 * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair
 * @filename: filename structure
 * @flags: lookup flags
 * @parent: pointer to struct path to fill
 * @last: last component
 * @type: type of the last component
 * @root: pointer to struct path of the base directory
 */
int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
                           struct path *parent, struct qstr *last, int *type,
                           const struct path *root)
{
        /*
         * Thin exported wrapper around __filename_parentat().  AT_FDCWD is
         * passed as the dfd; NOTE(review): presumably it is not consulted
         * when @root is supplied - confirm against set_nameidata().
         */
        return  __filename_parentat(AT_FDCWD, filename, flags, parent, last,
                                    type, root);
}
EXPORT_SYMBOL(vfs_path_parent_lookup);

/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
 * @path: pointer to struct path to fill
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
                    const char *name, unsigned int flags,
                    struct path *path)
{
        /* the dentry/vfsmount pair acts as the root of this lookup */
        struct path root = {.mnt = mnt, .dentry = dentry};
        struct filename *fname = getname_kernel(name);
        int err;

        /* the first argument of filename_lookup() is ignored with root */
        err = filename_lookup(AT_FDCWD, fname, flags, path, &root);
        putname(fname);

        return err;
}
EXPORT_SYMBOL(vfs_path_lookup);

/*
 * Shared front half of the lookup_one*() helpers: fill in @this from
 * @name/@len (hash salted with @base), validate the name and check
 * MAY_EXEC permission on @base.
 *
 * Returns -EACCES for an empty name, for "." or "..", and for a name
 * containing '/' or a NUL byte; a negative ->d_hash() result is passed
 * through; otherwise the result of inode_permission() is returned.
 */
static int lookup_one_common(struct mnt_idmap *idmap,
                             const char *name, struct dentry *base, int len,
                             struct qstr *this)
{
        const unsigned char *p = (const unsigned char *)name;

        this->name = name;
        this->len = len;
        this->hash = full_name_hash(base, name, len);

        if (!len || is_dot_dotdot(name, len))
                return -EACCES;

        /* a single component may hold neither a '/' nor an embedded NUL */
        for (; len; len--, p++) {
                if (*p == '/' || *p == '\0')
                        return -EACCES;
        }

        /*
         * See if the low-level filesystem might want
         * to use its own hash..
         */
        if (base->d_flags & DCACHE_OP_HASH) {
                int err = base->d_op->d_hash(base, this);

                if (err < 0)
                        return err;
        }

        return inode_permission(idmap, base->d_inode, MAY_EXEC);
}

/**
 * try_lookup_one_len - filesystem helper to lookup single pathname component
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        number of bytes of @name that make up the component
 *
 * Look up a dentry by name in the dcache, returning NULL if it does not
 * currently exist.  The function does not try to create a dentry.
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
{
        struct qstr this;
        int err;

        WARN_ON_ONCE(!inode_is_locked(base->d_inode));

        /* validate the name and check permission before touching the dcache */
        err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);
        if (!err)
                return lookup_dcache(&this, base, 0);

        return ERR_PTR(err);
}
EXPORT_SYMBOL(try_lookup_one_len);

/**
 * lookup_one_len - filesystem helper to lookup single pathname component
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        number of bytes of @name that make up the component
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
        struct dentry *found;
        struct qstr this;
        int err;

        WARN_ON_ONCE(!inode_is_locked(base->d_inode));

        err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);
        if (err)
                return ERR_PTR(err);

        /* dcache first; fall back to __lookup_slow() only when it returns NULL */
        found = lookup_dcache(&this, base, 0);
        if (found)
                return found;

        return __lookup_slow(&this, base, 0);
}
EXPORT_SYMBOL(lookup_one_len);

/**
 * lookup_one - filesystem helper to lookup single pathname component
 * @idmap:        idmap of the mount the lookup is performed from
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        number of bytes of @name that make up the component
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name,
                          struct dentry *base, int len)
{
        struct dentry *found;
        struct qstr this;
        int err;

        WARN_ON_ONCE(!inode_is_locked(base->d_inode));

        err = lookup_one_common(idmap, name, base, len, &this);
        if (err)
                return ERR_PTR(err);

        /* dcache first; fall back to __lookup_slow() only when it returns NULL */
        found = lookup_dcache(&this, base, 0);
        if (found)
                return found;

        return __lookup_slow(&this, base, 0);
}
EXPORT_SYMBOL(lookup_one);

/**
 * lookup_one_unlocked - filesystem helper to lookup single pathname component
 * @idmap:        idmap of the mount the lookup is performed from
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        number of bytes of @name that make up the component
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * Unlike lookup_one_len, it should be called without the parent
 * i_mutex held, and will take the i_mutex itself if necessary.
 */
struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap,
                                   const char *name, struct dentry *base,
                                   int len)
{
        struct dentry *found;
        struct qstr this;
        int err;

        err = lookup_one_common(idmap, name, base, len, &this);
        if (err)
                return ERR_PTR(err);

        /* dcache first; lookup_slow() is only used when that returns NULL */
        found = lookup_dcache(&this, base, 0);
        if (found)
                return found;

        return lookup_slow(&this, base, 0);
}
EXPORT_SYMBOL(lookup_one_unlocked);

/**
 * lookup_one_positive_unlocked - filesystem helper to lookup single
 *                                  pathname component
 * @idmap:        idmap of the mount the lookup is performed from
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        number of bytes of @name that make up the component
 *
 * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
 * known positive or ERR_PTR(). This is what most of the users want.
 *
 * Note that pinned negative with unlocked parent _can_ become positive at any
 * time, so callers of lookup_one_unlocked() need to be very careful; pinned
 * positives have ->d_inode stable, so this one avoids such problems.
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The helper should be called without i_mutex held.
 */
struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
                                            const char *name,
                                            struct dentry *base, int len)
{
        struct dentry *ret = lookup_one_unlocked(idmap, name, base, len);

        if (IS_ERR(ret))
                return ret;

        /* a negative dentry is dropped and reported as -ENOENT */
        if (d_flags_negative(smp_load_acquire(&ret->d_flags))) {
                dput(ret);
                return ERR_PTR(-ENOENT);
        }

        return ret;
}
EXPORT_SYMBOL(lookup_one_positive_unlocked);

/**
 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        number of bytes of @name that make up the component
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * Unlike lookup_one_len, it should be called without the parent
 * i_mutex held, and will take the i_mutex itself if necessary.
 */
struct dentry *lookup_one_len_unlocked(const char *name,
                                       struct dentry *base, int len)
{
        /* same as lookup_one_unlocked(), with &nop_mnt_idmap as the idmap */
        return lookup_one_unlocked(&nop_mnt_idmap, name, base, len);
}
EXPORT_SYMBOL(lookup_one_len_unlocked);

/*
 * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
 * on negatives.  Returns known positive or ERR_PTR(); that's what
 * most of the users want.  Note that pinned negative with unlocked parent
 * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
 * need to be very careful; pinned positives have ->d_inode stable, so
 * this one avoids such problems.
 */
struct dentry *lookup_positive_unlocked(const char *name,
                                       struct dentry *base, int len)
{
        /* same as lookup_one_positive_unlocked(), with &nop_mnt_idmap as the idmap */
        return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len);
}
EXPORT_SYMBOL(lookup_positive_unlocked);

#ifdef CONFIG_UNIX98_PTYS
/*
 * Redirect *path to the "pts" entry that sits next to path->dentry (i.e.
 * in its parent directory) and then call follow_down() on it.
 *
 * Returns 0 on success or -ENOENT.  Note that *path may already have been
 * changed to the parent when -ENOENT is returned from the lookup step.
 */
int path_pts(struct path *path)
{
        /* Find something mounted on "pts" in the same directory as
         * the input path.
         */
        struct dentry *parent = dget_parent(path->dentry);
        struct dentry *child;
        struct qstr this = QSTR_INIT("pts", 3);

        /* parent not connected to the mount: drop our reference, *path untouched */
        if (unlikely(!path_connected(path->mnt, parent))) {
                dput(parent);
                return -ENOENT;
        }
        /* swap path->dentry for its parent; the dget_parent() reference moves into *path */
        dput(path->dentry);
        path->dentry = parent;
        child = d_hash_and_lookup(parent, &this);
        /* on failure *path is left pointing at the parent */
        if (IS_ERR_OR_NULL(child))
                return -ENOENT;

        /*
         * NOTE(review): presumably d_hash_and_lookup() returned child with a
         * reference that *path now owns; the parent reference is dropped.
         */
        path->dentry = child;
        dput(parent);
        follow_down(path, 0);
        return 0;
}
#endif

/*
 * Look up the user-space string @name relative to @dfd and store the
 * result in @path.  @flags and @empty are handed to getname_flags();
 * the struct filename it returns is released again before returning.
 * Returns whatever filename_lookup() returns.
 */
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
                 struct path *path, int *empty)
{
        struct filename *fname;
        int err;

        fname = getname_flags(name, flags, empty);
        err = filename_lookup(dfd, fname, flags, path, NULL);
        putname(fname);

        return err;
}
EXPORT_SYMBOL(user_path_at_empty);

/*
 * Sticky-directory ownership test.  Returns 0 when the caller's fsuid
 * matches the (idmapped) owner of @inode or of @dir, or when
 * capable_wrt_inode_uidgid() grants CAP_FOWNER over @inode; returns
 * non-zero otherwise.
 */
int __check_sticky(struct mnt_idmap *idmap, struct inode *dir,
                   struct inode *inode)
{
        kuid_t fsuid = current_fsuid();

        if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid) ||
            vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid))
                return 0;

        return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER);
}
EXPORT_SYMBOL(__check_sticky);

/*
 *        Check whether we can remove a link victim from directory dir, check
 *  whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *        a. be owner of dir, or
 *        b. be owner of victim, or
 *        c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do anything with
 *     links pointing to it.
 *  7. If the victim has an unknown uid or gid we can't change the inode.
 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 * 10. We can't remove a root or mountpoint.
 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 *
 * Returns 0 if the removal may proceed, a negative error otherwise.
 */
static int may_delete(struct mnt_idmap *idmap, struct inode *dir,
                      struct dentry *victim, bool isdir)
{
        struct inode *inode = d_backing_inode(victim);
        int error;

        /* nothing to remove */
        if (d_is_negative(victim))
                return -ENOENT;
        BUG_ON(!inode);

        /* victim must be a child of dir */
        BUG_ON(victim->d_parent->d_inode != dir);

        /* Inode writeback is not safe when the uid or gid are invalid. */
        if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) ||
            !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)))
                return -EOVERFLOW;

        audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);

        /* rule 2: write and exec permission on dir */
        error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
        /* rule 3: append-only directory */
        if (IS_APPEND(dir))
                return -EPERM;

        /* rules 5 and 6, plus swapfiles and inodes with unmapped ids */
        if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) ||
            IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
            HAS_UNMAPPED_ID(idmap, inode))
                return -EPERM;
        /* rules 8-10: the victim's type must match what was asked for */
        if (isdir) {
                if (!d_is_dir(victim))
                        return -ENOTDIR;
                if (IS_ROOT(victim))
                        return -EBUSY;
        } else if (d_is_dir(victim))
                return -EISDIR;
        /* dir itself is already dead */
        if (IS_DEADDIR(dir))
                return -ENOENT;
        /* rule 11: NFS sillyrenamed file */
        if (victim->d_flags & DCACHE_NFSFS_RENAMED)
                return -EBUSY;
        return 0;
}

/*
 * Check whether an object may be created for dentry child in directory
 * dir.  The creation is audited first; then, in this order:
 *  1. -EEXIST if child already has an inode (open has special treatment
 *     for this case, but since we are inlined it's OK)
 *  2. -ENOENT if dir is a dead directory
 *  3. -EOVERFLOW if the fs can't represent the fsuid or fsgid
 *  4. otherwise the result of the write+exec permission check on dir
 *     (read-only and immutable directories are rejected in permission())
 */
static inline int may_create(struct mnt_idmap *idmap,
                             struct inode *dir, struct dentry *child)
{
        int err;

        audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);

        if (child->d_inode)
                err = -EEXIST;
        else if (IS_DEADDIR(dir))
                err = -ENOENT;
        else if (!fsuidgid_has_mapping(dir->i_sb, idmap))
                err = -EOVERFLOW;
        else
                err = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);

        return err;
}

// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held
//
// Locks both directories, ancestor first when one is an ancestor of the other.
// Returns:
//   - the child of p2 on the way down to p1, when p2 is an ancestor of p1
//   - the child of p1 on the way down to p2, when p1 is an ancestor of p2
//   - NULL when neither is an ancestor of the other (both locked)
//   - ERR_PTR(-EXDEV) when they share no ancestor; in that case nothing is
//     locked and ->s_vfs_rename_mutex has been dropped
static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2)
{
        struct dentry *p = p1, *q = p2, *r;

        // climb from p1 until we either hit p2 or reach a dentry that is its own parent
        while ((r = p->d_parent) != p2 && r != p)
                p = r;
        if (r == p2) {
                // p is a child of p2 and an ancestor of p1 or p1 itself
                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2);
                return p;
        }
        // p is the root of connected component that contains p1
        // p2 does not occur on the path from p to p1
        // now climb from p2 until we hit p1, hit p, or reach a dentry that is its own parent
        while ((r = q->d_parent) != p1 && r != p && r != q)
                q = r;
        if (r == p1) {
                // q is a child of p1 and an ancestor of p2 or p2 itself
                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
                return q;
        } else if (likely(r == p)) {
                // both p2 and p1 are descendents of p
                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
                return NULL;
        } else { // no common ancestor at the time we'd been called
                // give up: release the rename mutex that the caller took
                mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
                return ERR_PTR(-EXDEV);
        }
}

/*
 * lock_rename - lock two directories ahead of a rename.
 *
 * p1 and p2 should be directories on the same fs.
 *
 * When both arguments are the same dentry only its inode is locked and NULL
 * is returned.  Otherwise ->s_vfs_rename_mutex is taken and the result of
 * lock_two_directories() is passed back unchanged.
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
        if (p1 != p2) {
                mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
                return lock_two_directories(p1, p2);
        }

        /* single directory: no need for the filesystem-wide mutex */
        inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
        return NULL;
}
EXPORT_SYMBOL(lock_rename);

/*
 * lock_rename_child - lock the current parent of @c1 together with @p2.
 *
 * c1 and p2 should be on the same fs.
 *
 * Returns NULL with only p2's inode locked (and ->s_vfs_rename_mutex not
 * held) when c1's parent turns out to be p2; otherwise returns whatever
 * lock_two_directories() returns for (c1's parent, p2).
 */
struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2)
{
        /* unlocked peek at the parent; rechecked below once p2 is locked */
        if (READ_ONCE(c1->d_parent) == p2) {
                /*
                 * hopefully won't need to touch ->s_vfs_rename_mutex at all.
                 */
                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
                /*
                 * now that p2 is locked, nobody can move in or out of it,
                 * so the test below is safe.
                 */
                if (likely(c1->d_parent == p2))
                        return NULL;

                /*
                 * c1 got moved out of p2 while we'd been taking locks;
                 * unlock and fall back to slow case.
                 */
                inode_unlock(p2->d_inode);
        }

        mutex_lock(&c1->d_sb->s_vfs_rename_mutex);
        /*
         * nobody can move out of any directories on this fs.
         */
        if (likely(c1->d_parent != p2))
                return lock_two_directories(c1->d_parent, p2);

        /*
         * c1 got moved into p2 while we were taking locks;
         * we need p2 locked and ->s_vfs_rename_mutex unlocked,
         * for consistency with lock_rename().
         */
        inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
        mutex_unlock(&c1->d_sb->s_vfs_rename_mutex);
        return NULL;
}
EXPORT_SYMBOL(lock_rename_child);

/*
 * unlock_rename - release the locks on two directories after a rename.
 *
 * Unlocks p1's inode.  When the two dentries differ, p2's inode is unlocked
 * as well and ->s_vfs_rename_mutex is dropped.
 */
void unlock_rename(struct dentry *p1, struct dentry *p2)
{
        inode_unlock(p1->d_inode);
        if (p1 == p2)
                return;

        inode_unlock(p2->d_inode);
        mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
}
EXPORT_SYMBOL(unlock_rename);

/**
 * vfs_prepare_mode - prepare the mode to be used for a new inode
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        parent directory of the new inode
 * @mode:        mode of the new inode
 * @mask_perms:        allowed permission by the vfs
 * @type:        type of file to be created
 *
 * Single place where the vfs restrictions on the @mode of a new object are
 * applied: setgid stripping first, then umask stripping, then the permission
 * mask and file type.
 *
 * Umask stripping depends on whether the filesystem supports POSIX ACLs (see
 * the kernel documentation for mode_strip_umask()). Doing it after setgid
 * stripping gives the same ordering for both non-POSIX ACL and POSIX ACL
 * supporting filesystems.
 *
 * Note that it's currently valid for @type to be 0 if a directory is created.
 * Filesystems raise that flag individually and we need to check whether each
 * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
 * non-zero type.
 *
 * Returns: mode to be passed to the filesystem
 */
static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
                                       const struct inode *dir, umode_t mode,
                                       umode_t mask_perms, umode_t type)
{
        umode_t stripped;

        stripped = mode_strip_sgid(idmap, dir, mode);
        stripped = mode_strip_umask(dir, stripped);

        /*
         * Keep only the permission bits the vfs allows (never any type bits
         * from @mode or @mask_perms) and take the file type from @type alone.
         */
        return (stripped & mask_perms & ~S_IFMT) | (type & S_IFMT);
}

/**
 * vfs_create - create new file
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        inode of the parent directory
 * @dentry:        dentry of the file to be created
 * @mode:        mode of the new file
 * @want_excl:        whether the file must not yet exist
 *
 * Create a new regular file through the directory's ->create() method.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns: 0 on success, -EACCES if @dir has no ->create() method, or the
 * error reported by may_create(), the security hook or the filesystem.
 */
int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
               struct dentry *dentry, umode_t mode, bool want_excl)
{
        int error = may_create(idmap, dir, dentry);

        if (error)
                return error;
        if (!dir->i_op->create)
                return -EACCES;        /* shouldn't it be ENOSYS? */

        mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG);

        error = security_inode_create(dir, dentry, mode);
        if (!error)
                error = dir->i_op->create(idmap, dir, dentry, mode, want_excl);
        /* notify only once the filesystem has actually created the file */
        if (!error)
                fsnotify_create(dir, dentry);
        return error;
}
EXPORT_SYMBOL(vfs_create);

/*
 * vfs_mkobj - create a regular-file object using a caller-supplied callback.
 *
 * Runs may_create() against the parent of @dentry with nop_mnt_idmap, limits
 * @mode to the S_IALLUGO bits plus S_IFREG, consults the inode-create
 * security hook and then calls @f(@dentry, mode, @arg) to do the actual
 * work.  A create notification is sent when @f returns 0.
 *
 * Returns 0 on success or the first error encountered.
 */
int vfs_mkobj(struct dentry *dentry, umode_t mode,
                int (*f)(struct dentry *, umode_t, void *),
                void *arg)
{
        struct inode *parent = dentry->d_parent->d_inode;
        int error;

        error = may_create(&nop_mnt_idmap, parent, dentry);
        if (error)
                return error;

        mode = (mode & S_IALLUGO) | S_IFREG;

        error = security_inode_create(parent, dentry, mode);
        if (!error)
                error = f(dentry, mode, arg);
        if (!error)
                fsnotify_create(parent, dentry);
        return error;
}
EXPORT_SYMBOL(vfs_mkobj);

bool may_open_dev(const struct path *path)
{
        return !(path->mnt->mnt_flags & MNT_NODEV) &&
                !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}

/*
 * may_open - type-specific and permission checks before opening @path.
 *
 * @acc_mode is the MAY_* access being requested, @flag the open flags.
 *
 * Returns 0 if the open may proceed, otherwise:
 *  -ENOENT  the dentry has no inode;
 *  -ELOOP   the inode is a symlink;
 *  -EISDIR  write access to a directory was requested;
 *  -EACCES  exec of a directory, device node, FIFO or socket was requested,
 *           a device node sits where may_open_dev() forbids it, or a
 *           regular file is to be executed where path_noexec() says no;
 *  -EPERM   an append-only or O_NOATIME restriction is violated;
 *  or whatever inode_permission() returns.
 */
static int may_open(struct mnt_idmap *idmap, const struct path *path,
                    int acc_mode, int flag)
{
        struct inode *inode = path->dentry->d_inode;
        int type;
        int error;

        if (!inode)
                return -ENOENT;

        type = inode->i_mode & S_IFMT;
        if (type == S_IFLNK)
                return -ELOOP;

        if (type == S_IFDIR) {
                if (acc_mode & MAY_WRITE)
                        return -EISDIR;
                if (acc_mode & MAY_EXEC)
                        return -EACCES;
        } else if (type == S_IFREG) {
                if ((acc_mode & MAY_EXEC) && path_noexec(path))
                        return -EACCES;
        } else if (type == S_IFBLK || type == S_IFCHR ||
                   type == S_IFIFO || type == S_IFSOCK) {
                /* only block and character devices get the nodev check */
                if ((type == S_IFBLK || type == S_IFCHR) &&
                    !may_open_dev(path))
                        return -EACCES;
                if (acc_mode & MAY_EXEC)
                        return -EACCES;
                /* O_TRUNC is ignored for the later checks on these types */
                flag &= ~O_TRUNC;
        }

        error = inode_permission(idmap, inode, MAY_OPEN | acc_mode);
        if (error)
                return error;

        /*
         * An append-only file must be opened in append mode for writing,
         * and must never be truncated.
         */
        if (IS_APPEND(inode)) {
                if (((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) ||
                    (flag & O_TRUNC))
                        return -EPERM;
        }

        /* O_NOATIME can only be set by the owner or superuser */
        if ((flag & O_NOATIME) && !inode_owner_or_capable(idmap, inode))
                return -EPERM;

        return 0;
}

static int handle_truncate(struct mnt_idmap *idmap, struct file *filp)
{
        const struct path *path = &filp->f_path;
        struct inode *inode = path->dentry->d_inode;
        int error = get_write_access(inode);
        if (error)
                return error;

        error = security_file_truncate(filp);
        if (!error) {
                error = do_truncate(idmap, path->dentry, 0,
                                    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
                                    filp);
        }
        put_write_access(inode);
        return error;
}

/*
 * open_to_namei_flags - adjust open flags before they are handed on.
 *
 * When both access-mode bits are set (O_ACCMODE field equal to 3) the value
 * is decremented by one, which turns the access mode into 2 and leaves every
 * other bit alone.  All other values are returned unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
        return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
}

/*
 * may_o_create - may an O_CREAT open create @dentry in directory @dir?
 *
 * Checks, in order: the path-mknod security hook, that the fsuid/fsgid can
 * be represented on the filesystem under @idmap (-EOVERFLOW otherwise),
 * write and exec permission on the directory inode, and finally the
 * inode-create security hook.
 *
 * Returns 0 if creation is allowed, or the first error encountered.
 */
static int may_o_create(struct mnt_idmap *idmap,
                        const struct path *dir, struct dentry *dentry,
                        umode_t mode)
{
        int error = security_path_mknod(dir, dentry, mode, 0);

        if (!error && !fsuidgid_has_mapping(dir->dentry->d_sb, idmap))
                error = -EOVERFLOW;
        if (!error)
                error = inode_permission(idmap, dir->dentry->d_inode,
                                         MAY_WRITE | MAY_EXEC);
        if (!error)
                error = security_inode_create(dir->dentry->d_inode, dentry,
                                              mode);
        return error;
}

/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry.
 *
 * Hands the request to the directory's ->atomic_open() method and returns
 * the dentry the caller should continue with, or an ERR_PTR() on failure.
 *
 * If the filesystem opened the file (by calling finish_open()),
 * FMODE_OPENED is set in @file->f_mode and the returned dentry is the one
 * attached to @file.
 *
 * If the file was looked up only or didn't need creating, FMODE_OPENED won't
 * be set.  The caller will need to perform the open themselves.  A negative
 * dentry is turned into -ENOENT in that case.
 *
 * On every error path the reference to the dentry is dropped here.
 */
static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
                                  struct file *file,
                                  int open_flag, umode_t mode)
{
        /* sentinel: lets us detect that ->atomic_open() never set the dentry */
        struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
        struct inode *dir =  nd->path.dentry->d_inode;
        int error;

        if (nd->flags & LOOKUP_DIRECTORY)
                open_flag |= O_DIRECTORY;

        file->f_path.dentry = DENTRY_NOT_SET;
        file->f_path.mnt = nd->path.mnt;
        error = dir->i_op->atomic_open(dir, dentry, file,
                                       open_to_namei_flags(open_flag), mode);
        d_lookup_done(dentry);
        if (!error) {
                if (file->f_mode & FMODE_OPENED) {
                        /* opened through another dentry: switch our reference */
                        if (unlikely(dentry != file->f_path.dentry)) {
                                dput(dentry);
                                dentry = dget(file->f_path.dentry);
                        }
                } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
                        /* success reported, yet neither opened nor dentry set */
                        error = -EIO;
                } else {
                        /*
                         * Not opened.  Adopt the dentry left in @file, if any.
                         * NOTE(review): no dget() here, so presumably the
                         * filesystem hands over its reference - confirm.
                         */
                        if (file->f_path.dentry) {
                                dput(dentry);
                                dentry = file->f_path.dentry;
                        }
                        if (unlikely(d_is_negative(dentry)))
                                error = -ENOENT;
                }
        }
        if (error) {
                dput(dentry);
                dentry = ERR_PTR(error);
        }
        return dentry;
}

/*
 * Look up and maybe create and open the last component.
 *
 * Must be called with parent locked (exclusive in O_CREAT case).
 *
 * Returns the dentry of the last component on success, that is, if
 *  the file was successfully atomically created (if necessary) and opened, or
 *  the file was not completely opened at this time, though lookups and
 *  creations were performed.
 * These cases are distinguished by presence of FMODE_OPENED on file->f_mode.
 * In the latter case the returned dentry might be negative if O_CREAT
 * hadn't been specified.
 *
 * FMODE_CREATED is set on file->f_mode when the ->create() path is taken
 * here.
 *
 * An ERR_PTR()-encoded error code is returned on failure.
 */
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
                                  const struct open_flags *op,
                                  bool got_write)
{
        struct mnt_idmap *idmap;
        struct dentry *dir = nd->path.dentry;
        struct inode *dir_inode = dir->d_inode;
        int open_flag = op->open_flag;
        struct dentry *dentry;
        int error, create_error = 0;
        umode_t mode = op->mode;
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

        if (unlikely(IS_DEADDIR(dir_inode)))
                return ERR_PTR(-ENOENT);

        file->f_mode &= ~FMODE_CREATED;
        dentry = d_lookup(dir, &nd->last);
        /*
         * Loop until we hold either a dentry that is still in lookup or one
         * that passed revalidation; invalidated ones are dropped and retried.
         */
        for (;;) {
                if (!dentry) {
                        dentry = d_alloc_parallel(dir, &nd->last, &wq);
                        if (IS_ERR(dentry))
                                return dentry;
                }
                if (d_in_lookup(dentry))
                        break;

                error = d_revalidate(dentry, nd->flags);
                if (likely(error > 0))
                        break;
                if (error)
                        goto out_dput;
                d_invalidate(dentry);
                dput(dentry);
                dentry = NULL;
        }
        if (dentry->d_inode) {
                /* Cached positive dentry: will open in f_op->open */
                return dentry;
        }

        /*
         * Checking write permission is tricky, because we don't know if we are
         * going to actually need it: O_CREAT opens should work as long as the
         * file exists.  But checking existence breaks atomicity.  The trick is
         * to check access and if not granted clear O_CREAT from the flags.
         *
         * Another problem is returning the "right" error value (e.g. for an
         * O_EXCL open we want to return EEXIST not EROFS).
         */
        if (unlikely(!got_write))
                open_flag &= ~O_TRUNC;
        idmap = mnt_idmap(nd->path.mnt);
        if (open_flag & O_CREAT) {
                if (open_flag & O_EXCL)
                        open_flag &= ~O_TRUNC;
                mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode);
                if (likely(got_write))
                        create_error = may_o_create(idmap, &nd->path,
                                                    dentry, mode);
                else
                        create_error = -EROFS;
        }
        /* creation not allowed: carry on as a plain lookup, keep the error */
        if (create_error)
                open_flag &= ~O_CREAT;
        if (dir_inode->i_op->atomic_open) {
                dentry = atomic_open(nd, dentry, file, open_flag, mode);
                /* file is missing and we could not create it: say why */
                if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
                        dentry = ERR_PTR(create_error);
                return dentry;
        }

        if (d_in_lookup(dentry)) {
                struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
                                                             nd->flags);
                d_lookup_done(dentry);
                if (unlikely(res)) {
                        if (IS_ERR(res)) {
                                error = PTR_ERR(res);
                                goto out_dput;
                        }
                        /* ->lookup() handed back a different dentry: use it */
                        dput(dentry);
                        dentry = res;
                }
        }

        /* Negative dentry, just create the file */
        if (!dentry->d_inode && (open_flag & O_CREAT)) {
                file->f_mode |= FMODE_CREATED;
                audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
                if (!dir_inode->i_op->create) {
                        error = -EACCES;
                        goto out_dput;
                }

                error = dir_inode->i_op->create(idmap, dir_inode, dentry,
                                                mode, open_flag & O_EXCL);
                if (error)
                        goto out_dput;
        }
        /* still negative and creation had been refused earlier */
        if (unlikely(create_error) && !dentry->d_inode) {
                error = create_error;
                goto out_dput;
        }
        return dentry;

out_dput:
        dput(dentry);
        return ERR_PTR(error);
}

/*
 * Resolve the last pathname component for an open.
 *
 * Returns NULL when there is nothing left to walk, so that path_openat()
 * goes on to do_open().  Any other value (a string or an ERR_PTR()) is fed
 * back into link_path_walk() by path_openat().
 * NOTE(review): a non-NULL, non-error result comes from handle_dots() or
 * step_into(); presumably a symlink body still to be followed - confirm.
 *
 * When the file ends up opened or created here, nd->path.dentry is replaced
 * by the resulting dentry.
 */
static const char *open_last_lookups(struct nameidata *nd,
                   struct file *file, const struct open_flags *op)
{
        struct dentry *dir = nd->path.dentry;
        int open_flag = op->open_flag;
        bool got_write = false;
        struct dentry *dentry;
        const char *res;

        nd->flags |= op->intent;

        /* last component is not a normal name: leave it to handle_dots() */
        if (nd->last_type != LAST_NORM) {
                if (nd->depth)
                        put_link(nd);
                return handle_dots(nd, nd->last_type);
        }

        if (!(open_flag & O_CREAT)) {
                /* a character follows the name (trailing slashes, see below) */
                if (nd->last.name[nd->last.len])
                        nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
                /* we _can_ be in RCU mode here */
                dentry = lookup_fast(nd);
                if (IS_ERR(dentry))
                        return ERR_CAST(dentry);
                if (likely(dentry))
                        goto finish_lookup;

                /* the slow path below must not run in RCU mode */
                if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
                        return ERR_PTR(-ECHILD);
        } else {
                /* create side of things */
                if (nd->flags & LOOKUP_RCU) {
                        if (!try_to_unlazy(nd))
                                return ERR_PTR(-ECHILD);
                }
                audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
                /* trailing slashes? */
                if (unlikely(nd->last.name[nd->last.len]))
                        return ERR_PTR(-EISDIR);
        }

        if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
                got_write = !mnt_want_write(nd->path.mnt);
                /*
                 * do _not_ fail yet - we might not need that or fail with
                 * a different error; let lookup_open() decide; we'll be
                 * dropping this one anyway.
                 */
        }
        /* exclusive directory lock if we may create, shared otherwise */
        if (open_flag & O_CREAT)
                inode_lock(dir->d_inode);
        else
                inode_lock_shared(dir->d_inode);
        dentry = lookup_open(nd, file, op, got_write);
        if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
                fsnotify_create(dir->d_inode, dentry);
        if (open_flag & O_CREAT)
                inode_unlock(dir->d_inode);
        else
                inode_unlock_shared(dir->d_inode);

        if (got_write)
                mnt_drop_write(nd->path.mnt);

        if (IS_ERR(dentry))
                return ERR_CAST(dentry);

        /* opened or created: make the new dentry the current position */
        if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
                dput(nd->path.dentry);
                nd->path.dentry = dentry;
                return NULL;
        }

finish_lookup:
        if (nd->depth)
                put_link(nd);
        res = step_into(nd, WALK_TRAILING, dentry);
        /* the walk goes on: the open intent flags no longer apply */
        if (unlikely(res))
                nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
        return res;
}

/*
 * Handle the last step of open()
 *
 * Runs once the last component has been resolved into nd->path: finishes
 * the walk if needed, applies the O_CREAT/O_EXCL and directory checks, runs
 * may_open(), opens the file unless it is already opened, calls the
 * post-open security hook and truncates when O_TRUNC applies to a regular
 * file.
 *
 * Returns 0 on success or a negative error code.
 */
static int do_open(struct nameidata *nd,
                   struct file *file, const struct open_flags *op)
{
        struct mnt_idmap *idmap;
        int open_flag = op->open_flag;
        bool do_truncate;
        int acc_mode;
        int error;

        /* neither opened nor created yet: the walk still has to be completed */
        if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
                error = complete_walk(nd);
                if (error)
                        return error;
        }
        if (!(file->f_mode & FMODE_CREATED))
                audit_inode(nd->name, nd->path.dentry, 0);
        idmap = mnt_idmap(nd->path.mnt);
        if (open_flag & O_CREAT) {
                /* O_EXCL, but we did not create the file: it already existed */
                if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
                        return -EEXIST;
                if (d_is_dir(nd->path.dentry))
                        return -EISDIR;
                error = may_create_in_sticky(idmap, nd,
                                             d_backing_inode(nd->path.dentry));
                if (unlikely(error))
                        return error;
        }
        if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
                return -ENOTDIR;

        do_truncate = false;
        acc_mode = op->acc_mode;
        if (file->f_mode & FMODE_CREATED) {
                /* Don't check for write permission, don't truncate */
                open_flag &= ~O_TRUNC;
                acc_mode = 0;
        } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
                /* write access to the mount is held until the end of the call */
                error = mnt_want_write(nd->path.mnt);
                if (error)
                        return error;
                do_truncate = true;
        }
        error = may_open(idmap, &nd->path, acc_mode, open_flag);
        if (!error && !(file->f_mode & FMODE_OPENED))
                error = vfs_open(&nd->path, file);
        if (!error)
                error = security_file_post_open(file, op->acc_mode);
        if (!error && do_truncate)
                error = handle_truncate(idmap, file);
        /* a positive value is not a valid result: warn and report -EINVAL */
        if (unlikely(error > 0)) {
                WARN_ON(1);
                error = -EINVAL;
        }
        if (do_truncate)
                mnt_drop_write(nd->path.mnt);
        return error;
}

/**
 * vfs_tmpfile - create tmpfile
 * @idmap:        idmap of the mount the inode was found from
 * @parentpath:        pointer to the path of the base directory
 * @file:        file descriptor of the new tmpfile
 * @mode:        mode of the new tmpfile
 *
 * Create a temporary file in the directory @parentpath through its
 * ->tmpfile() method.  Unless @file was set up with O_EXCL the new inode is
 * marked I_LINKABLE.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns: 0 on success, -EOPNOTSUPP if the directory has no ->tmpfile()
 * method, -ENOMEM if no dentry could be allocated, or the error reported by
 * the permission check, the filesystem or may_open().
 */
int vfs_tmpfile(struct mnt_idmap *idmap,
                const struct path *parentpath,
                struct file *file, umode_t mode)
{
        struct inode *dir = d_inode(parentpath->dentry);
        int open_flag = file->f_flags;
        struct dentry *child;
        struct inode *inode;
        int error;

        /* we want directory to be writable */
        error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
        if (!dir->i_op->tmpfile)
                return -EOPNOTSUPP;

        child = d_alloc(parentpath->dentry, &slash_name);
        if (unlikely(!child))
                return -ENOMEM;

        file->f_path.mnt = parentpath->mnt;
        file->f_path.dentry = child;
        mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);
        error = dir->i_op->tmpfile(idmap, dir, file, mode);
        /* our own reference to the child is dropped whatever the outcome */
        dput(child);

        /* Don't check for other permissions, the inode was just created */
        if (!error)
                error = may_open(idmap, &file->f_path, 0, file->f_flags);
        if (error)
                return error;

        inode = file_inode(file);
        if (!(open_flag & O_EXCL)) {
                spin_lock(&inode->i_lock);
                inode->i_state |= I_LINKABLE;
                spin_unlock(&inode->i_lock);
        }
        security_inode_post_create_tmpfile(idmap, inode);
        return 0;
}

/**
 * kernel_tmpfile_open - open a tmpfile for kernel internal use
 * @idmap:        idmap of the mount the inode was found from
 * @parentpath:        path of the base directory
 * @mode:        mode of the new tmpfile
 * @open_flag:        flags
 * @cred:        credentials for open
 *
 * Create and open a temporary file.  The file is not accounted in nr_files,
 * hence this is only for kernel internal use, and must not be installed into
 * file tables or such.
 *
 * Returns: the new file, or an ERR_PTR() if allocating the file or creating
 * the tmpfile failed.
 */
struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
                                 const struct path *parentpath,
                                 umode_t mode, int open_flag,
                                 const struct cred *cred)
{
        struct file *file = alloc_empty_file_noaccount(open_flag, cred);
        int error;

        if (IS_ERR(file))
                return file;

        error = vfs_tmpfile(idmap, parentpath, file, mode);
        if (!error)
                return file;

        /* creation failed: release the file we allocated above */
        fput(file);
        return ERR_PTR(error);
}
EXPORT_SYMBOL(kernel_tmpfile_open);

static int do_tmpfile(struct nameidata *nd, unsigned flags,
                const struct open_flags *op,
                struct file *file)
{
        struct path path;
        int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);

        if (unlikely(error))
                return error;
        error = mnt_want_write(path.mnt);
        if (unlikely(error))
                goto out;
        error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode);
        if (error)
                goto out2;
        audit_inode(nd->name, file->f_path.dentry, 0);
out2:
        mnt_drop_write(path.mnt);
out:
        path_put(&path);
        return error;
}

static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
{
        struct path path;
        int error = path_lookupat(nd, flags, &path);
        if (!error) {
                audit_inode(nd->name, path.dentry, 0);
                error = vfs_open(&path, file);
                path_put(&path);
        }
        return error;
}

static struct file *path_openat(struct nameidata *nd,
                        const struct open_flags *op, unsigned flags)
{
        struct file *file;
        int error;

        file = alloc_empty_file(op->open_flag, current_cred());
        if (IS_ERR(file))
                return file;

        if (unlikely(file->f_flags & __O_TMPFILE)) {
                error = do_tmpfile(nd, flags, op, file);
        } else if (unlikely(file->f_flags & O_PATH)) {
                error = do_o_path(nd, flags, file);
        } else {
                const char *s = path_init(nd, flags);
                while (!(error = link_path_walk(s, nd)) &&
                       (s = open_last_lookups(nd, file, op)) != NULL)
                        ;
                if (!error)
                        error = do_open(nd, file, op);
                terminate_walk(nd);
        }
        if (likely(!error)) {
                if (likely(file->f_mode & FMODE_OPENED))
                        return file;
                WARN_ON(1);
                error = -EINVAL;
        }
        fput(file);
        if (error == -EOPENSTALE) {
                if (flags & LOOKUP_RCU)
                        error = -ECHILD;
                else
                        error = -ESTALE;
        }
        return ERR_PTR(error);
}

/*
 * do_filp_open - open @pathname relative to @dfd as described by @op.
 *
 * Makes up to three attempts with path_openat(): first with LOOKUP_RCU,
 * again without it if that attempt returned -ECHILD, and once more with
 * LOOKUP_REVAL if the latest attempt returned -ESTALE.
 *
 * Returns the opened file or an ERR_PTR().
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
                const struct open_flags *op)
{
        int lookup_flags = op->lookup_flags;
        struct nameidata nd;
        struct file *file;

        set_nameidata(&nd, dfd, pathname, NULL);

        file = path_openat(&nd, op, lookup_flags | LOOKUP_RCU);
        if (unlikely(file == ERR_PTR(-ECHILD)))
                file = path_openat(&nd, op, lookup_flags);
        if (unlikely(file == ERR_PTR(-ESTALE)))
                file = path_openat(&nd, op, lookup_flags | LOOKUP_REVAL);

        restore_nameidata();
        return file;
}

/*
 * do_file_open_root - open the kernel-supplied @name with @root as the root.
 *
 * Refuses with -ELOOP when @root is a symlink and the intent includes
 * LOOKUP_OPEN.  Otherwise follows the same retry sequence as do_filp_open():
 * LOOKUP_RCU first, then without it on -ECHILD, then with LOOKUP_REVAL on
 * -ESTALE.
 *
 * Returns the opened file or an ERR_PTR().
 */
struct file *do_file_open_root(const struct path *root,
                const char *name, const struct open_flags *op)
{
        int lookup_flags = op->lookup_flags;
        struct filename *filename;
        struct nameidata nd;
        struct file *file;

        if (d_is_symlink(root->dentry) && (op->intent & LOOKUP_OPEN))
                return ERR_PTR(-ELOOP);

        filename = getname_kernel(name);
        if (IS_ERR(filename))
                return ERR_CAST(filename);

        set_nameidata(&nd, -1, filename, root);

        file = path_openat(&nd, op, lookup_flags | LOOKUP_RCU);
        if (unlikely(file == ERR_PTR(-ECHILD)))
                file = path_openat(&nd, op, lookup_flags);
        if (unlikely(file == ERR_PTR(-ESTALE)))
                file = path_openat(&nd, op, lookup_flags | LOOKUP_REVAL);

        restore_nameidata();
        putname(filename);
        return file;
}

/*
 * filename_create - look up the parent of @name and prepare to create its
 * last component.
 *
 * On success returns the (negative) dentry of the last component.  At that
 * point the parent's inode is locked (I_MUTEX_PARENT), write access to the
 * mount is held and @path refers to the parent; done_path_create() releases
 * all of that.
 *
 * On failure returns an ERR_PTR() and everything taken here has already
 * been released, including @path:
 *  -EEXIST  the last component is not a normal name, or it already exists;
 *  -ENOENT  the name ends in a slash and LOOKUP_DIRECTORY was not passed;
 *  or the error from the parent lookup, the final lookup or
 *  mnt_want_write().
 */
static struct dentry *filename_create(int dfd, struct filename *name,
                                      struct path *path, unsigned int lookup_flags)
{
        struct dentry *dentry = ERR_PTR(-EEXIST);
        struct qstr last;
        bool want_dir = lookup_flags & LOOKUP_DIRECTORY;
        unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
        unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL;
        int type;
        int err2;
        int error;

        error = filename_parentat(dfd, name, reval_flag, path, &last, &type);
        if (error)
                return ERR_PTR(error);

        /*
         * Yucky last component or no last component at all?
         * (foo/., foo/.., /////)
         */
        /* err2 is not set yet; the "out" label does not look at it */
        if (unlikely(type != LAST_NORM))
                goto out;

        /* don't fail immediately if it's r/o, at least try to report other errors */
        err2 = mnt_want_write(path->mnt);
        /*
         * Do the final lookup.  Suppress 'create' if there is a trailing
         * '/', and a directory wasn't requested.
         */
        if (last.name[last.len] && !want_dir)
                create_flags = 0;
        inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
        dentry = lookup_one_qstr_excl(&last, path->dentry,
                                      reval_flag | create_flags);
        if (IS_ERR(dentry))
                goto unlock;

        error = -EEXIST;
        if (d_is_positive(dentry))
                goto fail;

        /*
         * Special case - lookup gave negative, but... we had foo/bar/
         * From the vfs_mknod() POV we just have a negative dentry -
         * all is fine. Let's be bastards - you had / on the end, you've
         * been asking for (non-existent) directory. -ENOENT for you.
         */
        if (unlikely(!create_flags)) {
                error = -ENOENT;
                goto fail;
        }
        /* only now report the mnt_want_write() failure saved earlier */
        if (unlikely(err2)) {
                error = err2;
                goto fail;
        }
        return dentry;
fail:
        dput(dentry);
        dentry = ERR_PTR(error);
unlock:
        inode_unlock(path->dentry->d_inode);
        /* drop mount write access only if we actually got it */
        if (!err2)
                mnt_drop_write(path->mnt);
out:
        path_put(path);
        return dentry;
}

/*
 * kern_path_create - filename_create() for a kernel-space pathname.
 *
 * Wraps @pathname with getname_kernel(), calls filename_create() and
 * releases the name again.  Returns whatever filename_create() returned.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
                                struct path *path, unsigned int lookup_flags)
{
        struct filename *filename;
        struct dentry *dentry;

        filename = getname_kernel(pathname);
        dentry = filename_create(dfd, filename, path, lookup_flags);
        putname(filename);

        return dentry;
}
EXPORT_SYMBOL(kern_path_create);

/*
 * Release what a successful filename_create() left held: drop @dentry,
 * unlock the parent directory inode, give back write access to the mount
 * and finally put @path -- in exactly that order.
 */
void done_path_create(struct path *path, struct dentry *dentry)
{
        struct inode *parent = path->dentry->d_inode;

        dput(dentry);
        inode_unlock(parent);
        mnt_drop_write(path->mnt);
        path_put(path);
}
EXPORT_SYMBOL(done_path_create);

/*
 * User-pathname front end for filename_create(): fetch @pathname with
 * getname(), pass it on together with @dfd, @path and @lookup_flags,
 * then drop the name again.  Whatever filename_create() produced is
 * returned unchanged.
 */
inline struct dentry *user_path_create(int dfd, const char __user *pathname,
                                struct path *path, unsigned int lookup_flags)
{
        struct filename *name;
        struct dentry *dentry;

        name = getname(pathname);
        dentry = filename_create(dfd, name, path, lookup_flags);
        putname(name);

        return dentry;
}
EXPORT_SYMBOL(user_path_create);

/**
 * vfs_mknod - create device node or file
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        inode of the parent directory
 * @dentry:        dentry of the device node or file to create
 * @mode:        mode of the new device node or file
 * @dev:        device number of device to create
 *
 * Create a device node or file.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
              struct dentry *dentry, umode_t mode, dev_t dev)
{
        /* A character device carrying WHITEOUT_DEV counts as a whiteout. */
        bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
        bool is_device = S_ISCHR(mode) || S_ISBLK(mode);
        int error;

        error = may_create(idmap, dir, dentry);
        if (error)
                return error;

        /*
         * Character and block devices require CAP_MKNOD; whiteouts are
         * exempt.  capable() is only consulted when the check applies.
         */
        if (is_device && !is_whiteout && !capable(CAP_MKNOD))
                return -EPERM;

        /* Bail out if the parent's inode operations lack ->mknod(). */
        if (!dir->i_op->mknod)
                return -EPERM;

        mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);

        error = devcgroup_inode_mknod(mode, dev);
        if (error)
                return error;

        error = security_inode_mknod(dir, dentry, mode, dev);
        if (error)
                return error;

        error = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
        if (error)
                return error;

        /* Only a successful creation is reported to fsnotify. */
        fsnotify_create(dir, dentry);
        return 0;
}
EXPORT_SYMBOL(vfs_mknod);

/*
 * Validate the file-type bits of @mode for mknod.
 *
 * Returns 0 for regular files, character/block devices, FIFOs, sockets
 * and for a zero type, -EPERM for directories and -EINVAL for any other
 * type.
 */
static int may_mknod(umode_t mode)
{
        const umode_t type = mode & S_IFMT;

        if (type == S_IFDIR)
                return -EPERM;

        /* zero mode translates to S_IFREG */
        if (type == 0 || type == S_IFREG)
                return 0;

        if (type == S_IFCHR || type == S_IFBLK ||
            type == S_IFIFO || type == S_IFSOCK)
                return 0;

        return -EINVAL;
}

/*
 * Common implementation behind the mknod and mknodat system calls.
 *
 * Validates the type bits of @mode, creates the target dentry relative
 * to @dfd and dispatches on the file type: regular files go through
 * vfs_create(), everything else through vfs_mknod().  The whole lookup
 * is redone with LOOKUP_REVAL when retry_estale() asks for it.  @name is
 * released with putname() on every path.  Returns 0 or a negative errno.
 */
static int do_mknodat(int dfd, struct filename *name, umode_t mode,
                unsigned int dev)
{
        unsigned int lookup_flags = 0;
        struct mnt_idmap *idmap;
        struct dentry *dentry;
        struct inode *parent;
        struct path path;
        int error;

        error = may_mknod(mode);
        if (error)
                goto out1;
retry:
        dentry = filename_create(dfd, name, &path, lookup_flags);
        if (IS_ERR(dentry)) {
                error = PTR_ERR(dentry);
                goto out1;
        }

        parent = path.dentry->d_inode;
        error = security_path_mknod(&path, dentry,
                                    mode_strip_umask(parent, mode), dev);
        if (error)
                goto out2;

        idmap = mnt_idmap(path.mnt);
        switch (mode & S_IFMT) {
        case 0:
        case S_IFREG:
                error = vfs_create(idmap, parent, dentry, mode, true);
                if (!error)
                        security_path_post_mknod(idmap, dentry);
                break;
        case S_IFCHR:
        case S_IFBLK:
                /* Only real device nodes carry a device number. */
                error = vfs_mknod(idmap, parent, dentry, mode,
                                  new_decode_dev(dev));
                break;
        case S_IFIFO:
        case S_IFSOCK:
                error = vfs_mknod(idmap, parent, dentry, mode, 0);
                break;
        }
out2:
        done_path_create(&path, dentry);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out1:
        putname(name);
        return error;
}

/*
 * mknodat system call: fetch the user pathname and hand it to
 * do_mknodat(), which also releases it.
 */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
                unsigned int, dev)
{
        struct filename *name = getname(filename);

        return do_mknodat(dfd, name, mode, dev);
}

/*
 * mknod system call: same as mknodat with the lookup anchored at
 * AT_FDCWD.
 */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
        struct filename *name = getname(filename);

        return do_mknodat(AT_FDCWD, name, mode, dev);
}

/**
 * vfs_mkdir - create directory
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        inode of the parent directory
 * @dentry:        dentry of the directory to create
 * @mode:        mode of the new directory
 *
 * Create a directory.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
              struct dentry *dentry, umode_t mode)
{
        unsigned max_links = dir->i_sb->s_max_links;
        int error = may_create(idmap, dir, dentry);

        if (error)
                return error;

        /* Bail out if the parent's inode operations lack ->mkdir(). */
        if (!dir->i_op->mkdir)
                return -EPERM;

        mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0);

        error = security_inode_mkdir(dir, dentry, mode);
        if (error)
                return error;

        /* A zero s_max_links means no link-count limit is enforced here. */
        if (max_links != 0 && dir->i_nlink >= max_links)
                return -EMLINK;

        error = dir->i_op->mkdir(idmap, dir, dentry, mode);
        if (error)
                return error;

        /* Only a successful creation is reported to fsnotify. */
        fsnotify_mkdir(dir, dentry);
        return 0;
}
EXPORT_SYMBOL(vfs_mkdir);

/*
 * Common implementation behind the mkdir and mkdirat system calls.
 *
 * Creates the target dentry relative to @dfd (looked up with
 * LOOKUP_DIRECTORY), runs the path-based security hook and then
 * vfs_mkdir().  The whole lookup is redone with LOOKUP_REVAL when
 * retry_estale() asks for it.  @name is released with putname() on every
 * path.  Returns 0 or a negative errno.
 */
int do_mkdirat(int dfd, struct filename *name, umode_t mode)
{
        unsigned int lookup_flags = LOOKUP_DIRECTORY;
        struct dentry *dentry;
        struct inode *parent;
        struct path path;
        int error;

retry:
        dentry = filename_create(dfd, name, &path, lookup_flags);
        if (IS_ERR(dentry)) {
                error = PTR_ERR(dentry);
                goto out_putname;
        }

        parent = path.dentry->d_inode;
        error = security_path_mkdir(&path, dentry,
                                    mode_strip_umask(parent, mode));
        if (error == 0)
                error = vfs_mkdir(mnt_idmap(path.mnt), parent, dentry, mode);

        done_path_create(&path, dentry);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out_putname:
        putname(name);
        return error;
}

/*
 * mkdirat system call: fetch the user pathname and hand it to
 * do_mkdirat(), which also releases it.
 */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
        struct filename *name = getname(pathname);

        return do_mkdirat(dfd, name, mode);
}

/*
 * mkdir system call: same as mkdirat with the lookup anchored at
 * AT_FDCWD.
 */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
        struct filename *name = getname(pathname);

        return do_mkdirat(AT_FDCWD, name, mode);
}

/**
 * vfs_rmdir - remove directory
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        inode of the parent directory
 * @dentry:        dentry of the directory to remove
 *
 * Remove a directory.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
                     struct dentry *dentry)
{
        /* Last argument 1: the victim is expected to be a directory. */
        int error = may_delete(idmap, dir, dentry, 1);

        if (error)
                return error;

        /* Bail out if the parent's inode operations lack ->rmdir(). */
        if (!dir->i_op->rmdir)
                return -EPERM;

        /*
         * Take a dentry reference and lock the victim's inode; both are
         * released again at the "out" label.
         */
        dget(dentry);
        inode_lock(dentry->d_inode);

        /* Local mountpoints and S_KERNEL_FILE inodes are reported as busy. */
        error = -EBUSY;
        if (is_local_mountpoint(dentry) ||
            (dentry->d_inode->i_flags & S_KERNEL_FILE))
                goto out;

        error = security_inode_rmdir(dir, dentry);
        if (error)
                goto out;

        error = dir->i_op->rmdir(dir, dentry);
        if (error)
                goto out;

        /*
         * Reached only when ->rmdir() succeeded: clean up the dcache
         * below the victim, flag its inode S_DEAD and get rid of mounts
         * on the dentry.
         */
        shrink_dcache_parent(dentry);
        dentry->d_inode->i_flags |= S_DEAD;
        dont_mount(dentry);
        detach_mounts(dentry);

out:
        inode_unlock(dentry->d_inode);
        dput(dentry);
        /* The delete notification is sent after the inode lock is dropped. */
        if (!error)
                d_delete_notify(dir, dentry);
        return error;
}
EXPORT_SYMBOL(vfs_rmdir);

int do_rmdir(int dfd, struct filename *name)
{
        int error;
        struct dentry *dentry;
        struct path path;
        struct qstr last;
        int type;
        unsigned int lookup_flags = 0;
retry:
        error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
        if (error)
                goto exit1;

        switch (type) {
        case LAST_DOTDOT:
                error = -ENOTEMPTY;
                goto exit2;
        case LAST_DOT:
                error = -EINVAL;
                goto exit2;
        case LAST_ROOT:
                error = -EBUSY;
                goto exit2;
        }

        error = mnt_want_write(path.mnt);
        if (error)
                goto exit2;

        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
        dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto exit3;
        if (!dentry->d_inode) {
                error = -ENOENT;
                goto exit4;
        }
        error = security_path_rmdir(&path, dentry);
        if (error)
                goto exit4;
        error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry);
exit4:
        dput(dentry);
exit3:
        inode_unlock(path.dentry->d_inode);
        mnt_drop_write(path.mnt);
exit2:
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
exit1:
        putname(name);
        return error;
}

/*
 * rmdir system call: fetch the user pathname and hand it to do_rmdir()
 * with the lookup anchored at AT_FDCWD; do_rmdir() releases the name.
 */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
        struct filename *name = getname(pathname);

        return do_rmdir(AT_FDCWD, name);
}

/**
 * vfs_unlink - unlink a filesystem object
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        parent directory
 * @dentry:        victim
 * @delegated_inode: returns victim inode, if the inode is delegated.
 *
 * The caller must hold dir->i_mutex.
 *
 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
 * return a reference to the inode in delegated_inode.  The caller
 * should then break the delegation on that inode and retry.  Because
 * breaking a delegation may take a long time, the caller should drop
 * dir->i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
               struct dentry *dentry, struct inode **delegated_inode)
{
        /* Victim inode, sampled once and used for locking and notification. */
        struct inode *target = dentry->d_inode;
        /* Last argument 0: the victim is not expected to be a directory. */
        int error = may_delete(idmap, dir, dentry, 0);

        if (error)
                return error;

        /* Bail out if the parent's inode operations lack ->unlink(). */
        if (!dir->i_op->unlink)
                return -EPERM;

        /* The victim's inode stays locked across the checks and ->unlink(). */
        inode_lock(target);
        if (IS_SWAPFILE(target))
                error = -EPERM;
        else if (is_local_mountpoint(dentry))
                error = -EBUSY;
        else {
                error = security_inode_unlink(dir, dentry);
                if (!error) {
                        /*
                         * A non-zero result skips ->unlink() entirely and
                         * is returned to the caller as is.
                         */
                        error = try_break_deleg(target, delegated_inode);
                        if (error)
                                goto out;
                        error = dir->i_op->unlink(dir, dentry);
                        if (!error) {
                                dont_mount(dentry);
                                detach_mounts(dentry);
                        }
                }
        }
out:
        inode_unlock(target);

        /* We don't d_delete() NFS sillyrenamed files--they still exist. */
        if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
                fsnotify_unlink(dir, dentry);
        } else if (!error) {
                fsnotify_link_count(target);
                d_delete_notify(dir, dentry);
        }

        return error;
}
EXPORT_SYMBOL(vfs_unlink);

/*
 * Make sure that the actual truncation of the file will occur outside its
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 */
int do_unlinkat(int dfd, struct filename *name)
{
        int error;
        struct dentry *dentry;
        struct path path;
        struct qstr last;
        int type;
        /* Extra reference on the victim inode, dropped after the parent unlock. */
        struct inode *inode = NULL;
        /* Filled in by vfs_unlink() when a delegation has to be broken first. */
        struct inode *delegated_inode = NULL;
        unsigned int lookup_flags = 0;
retry:
        /* Re-entered with LOOKUP_REVAL set when retry_estale() asks for it. */
        error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
        if (error)
                goto exit1;

        /* Anything but an ordinary last component is reported as -EISDIR. */
        error = -EISDIR;
        if (type != LAST_NORM)
                goto exit2;

        error = mnt_want_write(path.mnt);
        if (error)
                goto exit2;
retry_deleg:
        /*
         * Re-entered after break_deleg_wait() succeeded; write access to
         * the mount is still held at that point, only the parent lock
         * and the lookup are redone.
         */
        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
        dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
        error = PTR_ERR(dentry);
        if (!IS_ERR(dentry)) {

                /* Why not before? Because we want correct error value */
                /*
                 * A non-NUL byte right after the last component means
                 * the name has trailing characters; that case and a
                 * negative dentry are both sorted out under "slashes".
                 */
                if (last.name[last.len] || d_is_negative(dentry))
                        goto slashes;
                inode = dentry->d_inode;
                /* Hold the inode so the iput() below runs after the unlock. */
                ihold(inode);
                error = security_path_unlink(&path, dentry);
                if (error)
                        goto exit3;
                error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode,
                                   dentry, &delegated_inode);
exit3:
                /* Also the landing point for the "slashes" block below. */
                dput(dentry);
        }
        inode_unlock(path.dentry->d_inode);
        if (inode)
                iput(inode);        /* truncate the inode here */
        inode = NULL;
        if (delegated_inode) {
                /* On success redo the lookup and unlink under the parent lock. */
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }
        mnt_drop_write(path.mnt);
exit2:
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                inode = NULL;
                goto retry;
        }
exit1:
        putname(name);
        return error;

slashes:
        /*
         * Out-of-line error selection, reached only via the goto above:
         * missing entry, directory, or a non-directory named with
         * trailing characters.  Control then rejoins the main flow at
         * exit3.
         */
        if (d_is_negative(dentry))
                error = -ENOENT;
        else if (d_is_dir(dentry))
                error = -EISDIR;
        else
                error = -ENOTDIR;
        goto exit3;
}

/*
 * unlinkat system call.  AT_REMOVEDIR is the only flag accepted; any
 * other bit yields -EINVAL.  With AT_REMOVEDIR the request is routed to
 * do_rmdir(), otherwise to do_unlinkat().
 */
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
        if (flag & ~AT_REMOVEDIR)
                return -EINVAL;

        return (flag & AT_REMOVEDIR) ? do_rmdir(dfd, getname(pathname))
                                     : do_unlinkat(dfd, getname(pathname));
}

/*
 * unlink system call: fetch the user pathname and hand it to
 * do_unlinkat() with the lookup anchored at AT_FDCWD; do_unlinkat()
 * releases the name.
 */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
        struct filename *name = getname(pathname);

        return do_unlinkat(AT_FDCWD, name);
}

/**
 * vfs_symlink - create symlink
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        inode of the parent directory
 * @dentry:        dentry of the symlink to create
 * @oldname:        name of the file to link to
 *
 * Create a symlink.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
                struct dentry *dentry, const char *oldname)
{
        int error;

        error = may_create(idmap, dir, dentry);
        if (error)
                return error;

        if (!dir->i_op->symlink)
                return -EPERM;

        error = security_inode_symlink(dir, dentry, oldname);
        if (error)
                return error;

        error = dir->i_op->symlink(idmap, dir, dentry, oldname);
        if (!error)
                fsnotify_create(dir, dentry);
        return error;
}
EXPORT_SYMBOL(vfs_symlink);

/*
 * Common implementation behind the symlink and symlinkat system calls.
 *
 * @from holds the link contents and may be an ERR_PTR, in which case its
 * error is returned without any lookup.  @to names the link to create
 * relative to @newdfd.  The whole lookup is redone with LOOKUP_REVAL
 * when retry_estale() asks for it.  Both names are released with
 * putname() on every path.  Returns 0 or a negative errno.
 */
int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
{
        unsigned int lookup_flags = 0;
        struct dentry *dentry;
        struct path path;
        int error;

        if (IS_ERR(from)) {
                error = PTR_ERR(from);
                goto out_putnames;
        }
retry:
        dentry = filename_create(newdfd, to, &path, lookup_flags);
        if (IS_ERR(dentry)) {
                error = PTR_ERR(dentry);
                goto out_putnames;
        }

        error = security_path_symlink(&path, dentry, from->name);
        if (error == 0) {
                error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
                                    dentry, from->name);
        }

        done_path_create(&path, dentry);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out_putnames:
        putname(to);
        putname(from);
        return error;
}

/*
 * symlinkat system call.  Both user-space strings are fetched with
 * getname() and handed to do_symlinkat(), which checks @oldname's result
 * for an error pointer and releases both names with putname() on every
 * path.
 */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
                int, newdfd, const char __user *, newname)
{
        return do_symlinkat(getname(oldname), newdfd, getname(newname));
}

/*
 * symlink system call: same as symlinkat with the new name looked up
 * relative to AT_FDCWD.  do_symlinkat() releases both names.
 */
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
        return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname));
}

/**
 * vfs_link - create a new link
 * @old_dentry:        object to be linked
 * @idmap:        idmap of the mount
 * @dir:        new parent
 * @new_dentry:        where to create the new link
 * @delegated_inode: returns inode needing a delegation break
 *
 * The caller must hold dir->i_mutex
 *
 * If vfs_link discovers a delegation on the to-be-linked file in need
 * of breaking, it will return -EWOULDBLOCK and return a reference to the
 * inode in delegated_inode.  The caller should then break the delegation
 * and retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
             struct inode *dir, struct dentry *new_dentry,
             struct inode **delegated_inode)
{
        /* Inode that gains the additional link. */
        struct inode *inode = old_dentry->d_inode;
        unsigned max_links = dir->i_sb->s_max_links;
        int error;

        /* A negative source dentry has nothing to link to. */
        if (!inode)
                return -ENOENT;

        error = may_create(idmap, dir, new_dentry);
        if (error)
                return error;

        /* Source inode and new parent must live on the same superblock. */
        if (dir->i_sb != inode->i_sb)
                return -EXDEV;

        /*
         * A link to an append-only or immutable file cannot be created.
         */
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return -EPERM;
        /*
         * Updating the link count will likely cause i_uid and i_gid to
         * be written back improperly if their true value is unknown to
         * the vfs.
         */
        if (HAS_UNMAPPED_ID(idmap, inode))
                return -EPERM;
        /* Bail out if the parent's inode operations lack ->link(). */
        if (!dir->i_op->link)
                return -EPERM;
        /* Directories are never hard-linked. */
        if (S_ISDIR(inode->i_mode))
                return -EPERM;

        error = security_inode_link(old_dentry, dir, new_dentry);
        if (error)
                return error;

        /* The link-count checks and ->link() run under the source inode lock. */
        inode_lock(inode);
        /* Make sure we don't allow creating hardlink to an unlinked file */
        if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
                error =  -ENOENT;
        else if (max_links && inode->i_nlink >= max_links)
                error = -EMLINK;
        else {
                /* ->link() is only called once try_break_deleg() returns 0. */
                error = try_break_deleg(inode, delegated_inode);
                if (!error)
                        error = dir->i_op->link(old_dentry, dir, new_dentry);
        }

        /* After a successful link, clear I_LINKABLE under i_lock. */
        if (!error && (inode->i_state & I_LINKABLE)) {
                spin_lock(&inode->i_lock);
                inode->i_state &= ~I_LINKABLE;
                spin_unlock(&inode->i_lock);
        }
        inode_unlock(inode);
        /* The notification is sent after the inode lock is dropped. */
        if (!error)
                fsnotify_link(dir, inode, new_dentry);
        return error;
}
EXPORT_SYMBOL(vfs_link);

/*
 * Hardlinks are often used in delicate situations.  We avoid
 * security-related surprises by not following symlinks on the
 * newname.  --KAB
 *
 * We don't follow them on the oldname either to be compatible
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 */
int do_linkat(int olddfd, struct filename *old, int newdfd,
              struct filename *new, int flags)
{
        struct mnt_idmap *idmap;
        struct dentry *new_dentry;
        struct path old_path, new_path;
        /* Filled in by vfs_link() when a delegation has to be broken first. */
        struct inode *delegated_inode = NULL;
        /* Lookup flags for the old name; LOOKUP_REVAL is added on ESTALE retry. */
        int how = 0;
        int error;

        /* Only AT_SYMLINK_FOLLOW and AT_EMPTY_PATH are accepted. */
        if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) {
                error = -EINVAL;
                goto out_putnames;
        }
        /*
         * To use null names we require CAP_DAC_READ_SEARCH or
         * that the open-time creds of the dfd matches current.
         * This ensures that not everyone will be able to create
         * a hardlink using the passed file descriptor.
         */
        if (flags & AT_EMPTY_PATH)
                how |= LOOKUP_LINKAT_EMPTY;

        if (flags & AT_SYMLINK_FOLLOW)
                how |= LOOKUP_FOLLOW;
retry:
        /*
         * Re-entered both after a successful delegation break and for an
         * ESTALE retry; in either case old_path has been put beforehand.
         */
        error = filename_lookup(olddfd, old, how, &old_path, NULL);
        if (error)
                goto out_putnames;

        /* Of the old-name flags only LOOKUP_REVAL is passed on for the new name. */
        new_dentry = filename_create(newdfd, new, &new_path,
                                        (how & LOOKUP_REVAL));
        error = PTR_ERR(new_dentry);
        if (IS_ERR(new_dentry))
                goto out_putpath;

        /* Both names must resolve on the same mount. */
        error = -EXDEV;
        if (old_path.mnt != new_path.mnt)
                goto out_dput;
        idmap = mnt_idmap(new_path.mnt);
        error = may_linkat(idmap, &old_path);
        if (unlikely(error))
                goto out_dput;
        error = security_path_link(old_path.dentry, &new_path, new_dentry);
        if (error)
                goto out_dput;
        error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
                         new_dentry, &delegated_inode);
out_dput:
        /* Release everything filename_create() left held for the new name. */
        done_path_create(&new_path, new_dentry);
        if (delegated_inode) {
                /* Wait with the create state released; on success start over. */
                error = break_deleg_wait(&delegated_inode);
                if (!error) {
                        path_put(&old_path);
                        goto retry;
                }
        }
        if (retry_estale(error, how)) {
                path_put(&old_path);
                how |= LOOKUP_REVAL;
                goto retry;
        }
out_putpath:
        path_put(&old_path);
out_putnames:
        /* Both names are released here on every path. */
        putname(old);
        putname(new);

        return error;
}

/*
 * linkat system call.  The old name is fetched with getname_uflags() so
 * that @flags can influence how it is read (presumably to allow an empty
 * name with AT_EMPTY_PATH -- confirm against getname_uflags()); the new
 * name uses plain getname().  do_linkat() validates @flags and releases
 * both names.
 */
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname, int, flags)
{
        return do_linkat(olddfd, getname_uflags(oldname, flags),
                newdfd, getname(newname), flags);
}

/*
 * link system call: same as linkat with both names looked up relative
 * to AT_FDCWD and no flags.  do_linkat() releases both names.
 */
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
        return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0);
}

/**
 * vfs_rename - rename a filesystem object
 * @rd:                pointer to &struct renamedata info
 *
 * The caller must hold multiple mutexes--see lock_rename().
 *
 * If vfs_rename discovers a delegation in need of breaking at either
 * the source or destination, it will return -EWOULDBLOCK and return a
 * reference to the inode in delegated_inode.  The caller should then
 * break the delegation and retry.  Because breaking a delegation may
 * take a long time, the caller should drop all locks before doing
 * so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
 *
 *        a) we can get into loop creation.
 *        b) race potential - two innocent renames can create a loop together.
 *           That's where 4.4BSD screws up. Current fix: serialization on
 *           sb->s_vfs_rename_mutex. We might be more accurate, but that's another
 *           story.
 *        c) we may have to lock up to _four_ objects - parents and victim (if it exists),
 *           and source (if it's a non-directory or a subdirectory that moves to
 *           different parent).
 *           And that - after we got ->i_mutex on parents (until then we don't know
 *           whether the target exists).  Solution: try to be smart with locking
 *           order for inodes.  We rely on the fact that tree topology may change
 *           only under ->s_vfs_rename_mutex _and_ that parent of the object we
 *           move will be locked.  Thus we can rank directories by the tree
 *           (ancestors first) and rank all non-directories after them.
 *           That works since everybody except rename does "lock parent, lookup,
 *           lock child" and rename is under ->s_vfs_rename_mutex.
 *           HOWEVER, it relies on the assumption that any object with ->lookup()
 *           has no more than 1 dentry.  If "hybrid" objects will ever appear,
 *           we'd better make sure that there's no link(2) for them.
 *        d) conversion from fhandle to dentry may come in the wrong moment - when
 *           we are removing the target. Solution: we will have to grab ->i_mutex
 *           in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
 *           ->i_mutex on parents, which works but leads to some truly excessive
 *           locking].
 */
int vfs_rename(struct renamedata *rd)
{
        int error;
        struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
        struct dentry *old_dentry = rd->old_dentry;
        struct dentry *new_dentry = rd->new_dentry;
        struct inode **delegated_inode = rd->delegated_inode;
        unsigned int flags = rd->flags;
        bool is_dir = d_is_dir(old_dentry);
        /* Inode being moved, and the inode currently at the destination (may be NULL). */
        struct inode *source = old_dentry->d_inode;
        struct inode *target = new_dentry->d_inode;
        bool new_is_dir = false;
        unsigned max_links = new_dir->i_sb->s_max_links;
        /* Snapshot of the old name, used for the fsnotify event at the end. */
        struct name_snapshot old_name;
        bool lock_old_subdir, lock_new_subdir;

        /* Both dentries already refer to the same inode: nothing to do. */
        if (source == target)
                return 0;

        error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir);
        if (error)
                return error;

        /*
         * Destination check: creation if nothing is there yet, otherwise
         * deletion of the existing entry.  For a plain rename the victim
         * is checked against the source's type, for an exchange against
         * its own type.
         */
        if (!target) {
                error = may_create(rd->new_mnt_idmap, new_dir, new_dentry);
        } else {
                new_is_dir = d_is_dir(new_dentry);

                if (!(flags & RENAME_EXCHANGE))
                        error = may_delete(rd->new_mnt_idmap, new_dir,
                                           new_dentry, is_dir);
                else
                        error = may_delete(rd->new_mnt_idmap, new_dir,
                                           new_dentry, new_is_dir);
        }
        if (error)
                return error;

        /* Bail out if the old parent's inode operations lack ->rename(). */
        if (!old_dir->i_op->rename)
                return -EPERM;

        /*
         * If we are going to change the parent - check write permissions,
         * we'll need to flip '..'.
         */
        if (new_dir != old_dir) {
                if (is_dir) {
                        error = inode_permission(rd->old_mnt_idmap, source,
                                                 MAY_WRITE);
                        if (error)
                                return error;
                }
                if ((flags & RENAME_EXCHANGE) && new_is_dir) {
                        error = inode_permission(rd->new_mnt_idmap, target,
                                                 MAY_WRITE);
                        if (error)
                                return error;
                }
        }

        error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
                                      flags);
        if (error)
                return error;

        /*
         * From here on every exit goes through "out", which drops the
         * new_dentry reference taken below; the name snapshot is released
         * just before returning.
         */
        take_dentry_name_snapshot(&old_name, old_dentry);
        dget(new_dentry);
        /*
         * Lock children.
         * The source subdirectory needs to be locked on cross-directory
         * rename or cross-directory exchange since its parent changes.
         * The target subdirectory needs to be locked on cross-directory
         * exchange due to parent change and on any rename due to becoming
         * a victim.
         * Non-directories need locking in all cases (for NFS reasons);
         * they get locked after any subdirectories (in inode address order).
         *
         * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE.
         * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex.
         */
        lock_old_subdir = new_dir != old_dir;
        lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE);
        if (is_dir) {
                if (lock_old_subdir)
                        inode_lock_nested(source, I_MUTEX_CHILD);
                if (target && (!new_is_dir || lock_new_subdir))
                        inode_lock(target);
        } else if (new_is_dir) {
                if (lock_new_subdir)
                        inode_lock_nested(target, I_MUTEX_CHILD);
                inode_lock(source);
        } else {
                lock_two_nondirectories(source, target);
        }

        /* Swap files may be neither moved nor replaced. */
        error = -EPERM;
        if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
                goto out;

        /* Neither end of the rename may be a local mountpoint. */
        error = -EBUSY;
        if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
                goto out;

        /*
         * Link-count limit, checked only for cross-directory moves: a
         * directory arriving where no directory is replaced is checked
         * against new_dir, and on exchange a directory arriving in
         * old_dir in place of a non-directory is checked against old_dir.
         */
        if (max_links && new_dir != old_dir) {
                error = -EMLINK;
                if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
                        goto out;
                if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
                    old_dir->i_nlink >= max_links)
                        goto out;
        }
        /* Delegations are only broken on non-directories. */
        if (!is_dir) {
                error = try_break_deleg(source, delegated_inode);
                if (error)
                        goto out;
        }
        if (target && !new_is_dir) {
                error = try_break_deleg(target, delegated_inode);
                if (error)
                        goto out;
        }
        error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry,
                                      new_dir, new_dentry, flags);
        if (error)
                goto out;

        /* A plain rename over an existing target turns that target into a victim. */
        if (!(flags & RENAME_EXCHANGE) && target) {
                if (is_dir) {
                        shrink_dcache_parent(new_dentry);
                        target->i_flags |= S_DEAD;
                }
                dont_mount(new_dentry);
                detach_mounts(new_dentry);
        }
        /* Update the dcache here unless the filesystem declares it did so itself. */
        if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
                if (!(flags & RENAME_EXCHANGE))
                        d_move(old_dentry, new_dentry);
                else
                        d_exchange(old_dentry, new_dentry);
        }
out:
        /* These conditions mirror the ones used when the locks were taken. */
        if (!is_dir || lock_old_subdir)
                inode_unlock(source);
        if (target && (!new_is_dir || lock_new_subdir))
                inode_unlock(target);
        dput(new_dentry);
        if (!error) {
                /* The replaced inode is passed along only for a plain rename. */
                fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
                              !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
                /* An exchange is reported a second time, for the opposite direction. */
                if (flags & RENAME_EXCHANGE) {
                        fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
                                      new_is_dir, NULL, new_dentry);
                }
        }
        release_dentry_name_snapshot(&old_name);

        return error;
}
EXPORT_SYMBOL(vfs_rename);

/**
 * do_renameat2 - common implementation behind the rename family of syscalls
 * @olddfd: directory fd handed to filename_parentat() for resolving @from
 * @from: source path; always released with putname() before returning
 * @newdfd: directory fd handed to filename_parentat() for resolving @to
 * @to: destination path; always released with putname() before returning
 * @flags: any of RENAME_NOREPLACE, RENAME_EXCHANGE, RENAME_WHITEOUT
 *
 * Resolves both parent directories, takes write access on the mount, locks
 * the two parents with lock_rename(), looks up the source and target
 * dentries, runs the trailing-slash, ancestry and security checks and
 * finally calls vfs_rename().
 *
 * Two retry loops exist: when vfs_rename() leaves an inode in
 * @delegated_inode the locked section is re-run after break_deleg_wait()
 * succeeds, and when retry_estale() asks for it the whole path lookup is
 * redone with LOOKUP_REVAL set.
 *
 * Return: the result of vfs_rename(), or a negative errno produced by one of
 * the earlier checks or lookups.
 */
int do_renameat2(int olddfd, struct filename *from, int newdfd,
                 struct filename *to, unsigned int flags)
{
        struct renamedata rd;
        struct dentry *old_dentry, *new_dentry;
        struct dentry *trap;
        struct path old_path, new_path;
        struct qstr old_last, new_last;
        int old_type, new_type;
        struct inode *delegated_inode = NULL;
        unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
        bool should_retry = false;
        int error = -EINVAL;

        /* reject flag bits this function does not know about */
        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                goto put_names;

        /* RENAME_EXCHANGE cannot be combined with NOREPLACE or WHITEOUT */
        if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
            (flags & RENAME_EXCHANGE))
                goto put_names;

        /* for an exchange the target is not looked up as a rename target */
        if (flags & RENAME_EXCHANGE)
                target_flags = 0;

retry:
        /* resolve both parent directories and the final path components */
        error = filename_parentat(olddfd, from, lookup_flags, &old_path,
                                  &old_last, &old_type);
        if (error)
                goto put_names;

        error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
                                  &new_type);
        if (error)
                goto exit1;

        /* both parents must live on the same mount */
        error = -EXDEV;
        if (old_path.mnt != new_path.mnt)
                goto exit2;

        /* the last component of the source must be a normal name */
        error = -EBUSY;
        if (old_type != LAST_NORM)
                goto exit2;

        /*
         * Same requirement for the target; with RENAME_NOREPLACE the failure
         * is reported as -EEXIST instead of -EBUSY.
         */
        if (flags & RENAME_NOREPLACE)
                error = -EEXIST;
        if (new_type != LAST_NORM)
                goto exit2;

        error = mnt_want_write(old_path.mnt);
        if (error)
                goto exit2;

retry_deleg:
        /* lock both parents; the returned dentry is compared below */
        trap = lock_rename(new_path.dentry, old_path.dentry);
        if (IS_ERR(trap)) {
                error = PTR_ERR(trap);
                goto exit_lock_rename;
        }

        old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry,
                                          lookup_flags);
        error = PTR_ERR(old_dentry);
        if (IS_ERR(old_dentry))
                goto exit3;
        /* source must exist */
        error = -ENOENT;
        if (d_is_negative(old_dentry))
                goto exit4;
        new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
                                          lookup_flags | target_flags);
        error = PTR_ERR(new_dentry);
        if (IS_ERR(new_dentry))
                goto exit4;
        /* RENAME_NOREPLACE refuses to overwrite an existing target */
        error = -EEXIST;
        if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
                goto exit5;
        if (flags & RENAME_EXCHANGE) {
                /* an exchange needs an existing target as well */
                error = -ENOENT;
                if (d_is_negative(new_dentry))
                        goto exit5;

                /* non-directory target named with trailing characters */
                if (!d_is_dir(new_dentry)) {
                        error = -ENOTDIR;
                        if (new_last.name[new_last.len])
                                goto exit5;
                }
        }
        /* unless the source is a directory trailing slashes give -ENOTDIR */
        if (!d_is_dir(old_dentry)) {
                error = -ENOTDIR;
                if (old_last.name[old_last.len])
                        goto exit5;
                if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
                        goto exit5;
        }
        /* source should not be ancestor of target */
        error = -EINVAL;
        if (old_dentry == trap)
                goto exit5;
        /* target should not be an ancestor of source */
        if (!(flags & RENAME_EXCHANGE))
                error = -ENOTEMPTY;
        if (new_dentry == trap)
                goto exit5;

        error = security_path_rename(&old_path, old_dentry,
                                     &new_path, new_dentry, flags);
        if (error)
                goto exit5;

        /* package everything up for vfs_rename() */
        rd.old_dir           = old_path.dentry->d_inode;
        rd.old_dentry           = old_dentry;
        rd.old_mnt_idmap   = mnt_idmap(old_path.mnt);
        rd.new_dir           = new_path.dentry->d_inode;
        rd.new_dentry           = new_dentry;
        rd.new_mnt_idmap   = mnt_idmap(new_path.mnt);
        rd.delegated_inode = &delegated_inode;
        rd.flags           = flags;
        error = vfs_rename(&rd);
        /* unwind in reverse order of acquisition */
exit5:
        dput(new_dentry);
exit4:
        dput(old_dentry);
exit3:
        unlock_rename(new_path.dentry, old_path.dentry);
exit_lock_rename:
        /*
         * An inode was handed back through rd.delegated_inode: wait on it
         * with break_deleg_wait() and, if that succeeds, redo the locked
         * section. Otherwise the wait error becomes the return value.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }
        mnt_drop_write(old_path.mnt);
exit2:
        /* remember the retry decision; the paths must be dropped first */
        if (retry_estale(error, lookup_flags))
                should_retry = true;
        path_put(&new_path);
exit1:
        path_put(&old_path);
        if (should_retry) {
                should_retry = false;
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
put_names:
        /* both names are consumed on every path, success or failure */
        putname(from);
        putname(to);
        return error;
}

/*
 * renameat2() system call: turns both user-space path pointers into
 * struct filename objects with getname() and passes them, together with the
 * caller's directory fds and flags, to do_renameat2(), which releases the
 * names itself.
 */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname, unsigned int, flags)
{
        return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
                                flags);
}

/*
 * renameat() system call: same as renameat2() but with no flags, i.e. it
 * calls do_renameat2() with a flags value of 0.
 */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname)
{
        return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
                                0);
}

/*
 * rename() system call: calls do_renameat2() with AT_FDCWD for both
 * directory fds and a flags value of 0.
 */
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
        return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
                                getname(newname), 0);
}

/*
 * Copy a symlink body to user space.
 *
 * @buffer: destination in user memory
 * @buflen: capacity of @buffer; longer link bodies are truncated to it
 * @link:   NUL-terminated link body, or an ERR_PTR() value
 *
 * An ERR_PTR() @link is passed straight back as its errno. Otherwise at most
 * @buflen bytes are copied; no terminating NUL is written.
 *
 * Returns the number of bytes copied, -EFAULT if copy_to_user() reports a
 * failure, or the errno carried by @link.
 */
int readlink_copy(char __user *buffer, int buflen, const char *link)
{
        int copied;

        if (IS_ERR(link))
                return PTR_ERR(link);

        copied = strlen(link);
        /* compare as unsigned, exactly like the clamp always has */
        if (copied > (unsigned) buflen)
                copied = buflen;

        return copy_to_user(buffer, link, copied) ? -EFAULT : copied;
}

/**
 * vfs_readlink - copy symlink body into userspace buffer
 * @dentry: dentry on which to get symbolic link
 * @buffer: user memory pointer
 * @buflen: size of buffer
 *
 * Does not touch atime.  That's up to the caller if necessary
 *
 * Does not call security hook.
 *
 * If the inode provides its own ->readlink() that method is used directly.
 * Otherwise the body is taken from inode->i_link when set, or obtained via
 * ->get_link(), and copied out with readlink_copy().
 *
 * Return: whatever ->readlink() or readlink_copy() returns, -EINVAL if the
 * dentry is not a symlink, or the error encoded by ->get_link().
 */
int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
        struct inode *inode = d_inode(dentry);
        DEFINE_DELAYED_CALL(done);
        const char *link;
        int res;

        /* slow path: the default-readlink flag has not been set on this inode */
        if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
                /* filesystem supplies its own method: delegate entirely */
                if (unlikely(inode->i_op->readlink))
                        return inode->i_op->readlink(dentry, buffer, buflen);

                if (!d_is_symlink(dentry))
                        return -EINVAL;

                /* record the outcome so later calls skip the checks above */
                spin_lock(&inode->i_lock);
                inode->i_opflags |= IOP_DEFAULT_READLINK;
                spin_unlock(&inode->i_lock);
        }

        /* use the body cached in the inode if there is one */
        link = READ_ONCE(inode->i_link);
        if (!link) {
                /* ->get_link() may register cleanup work in 'done' */
                link = inode->i_op->get_link(dentry, inode, &done);
                if (IS_ERR(link))
                        return PTR_ERR(link);
        }
        res = readlink_copy(buffer, buflen, link);
        /* run the cleanup only after the body has been copied out */
        do_delayed_call(&done);
        return res;
}
EXPORT_SYMBOL(vfs_readlink);

/**
 * vfs_get_link - get symlink body
 * @dentry: dentry of the symbolic link
 * @done: delayed call the caller must run to release the returned data
 *
 * Checks that @dentry is a symlink, asks security_inode_readlink() for
 * permission and then calls ->get_link() of the dentry's inode.
 *
 * It does not touch atime.  That's up to the caller if necessary.
 *
 * Does not work on "special" symlinks like /proc/$$/fd/N
 *
 * Return: the value of ->get_link(), ERR_PTR(-EINVAL) if @dentry is not a
 * symlink, or an ERR_PTR() carrying the security hook's error.
 */
const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
{
        struct inode *inode = d_inode(dentry);
        int denied;

        if (!d_is_symlink(dentry))
                return ERR_PTR(-EINVAL);

        denied = security_inode_readlink(dentry);
        if (denied)
                return ERR_PTR(denied);

        return inode->i_op->get_link(dentry, inode, done);
}
EXPORT_SYMBOL(vfs_get_link);

/*
 * Fetch a symlink body that lives in page 0 of the inode's mapping.
 *
 * With a dentry the page is read via read_mapping_page(). With a NULL
 * dentry only an already present, up-to-date page is accepted and
 * ERR_PTR(-ECHILD) is returned otherwise (presumably the non-blocking lookup
 * case - confirm against the callers).
 *
 * On success the page reference is handed to @callback, which releases it
 * through page_put_link(), and the page's kernel address is returned after
 * nd_terminate_link() has been applied with inode->i_size.
 */
const char *page_get_link(struct dentry *dentry, struct inode *inode,
                          struct delayed_call *callback)
{
        struct address_space *mapping = inode->i_mapping;
        struct page *page;
        char *body;

        if (dentry) {
                page = read_mapping_page(mapping, 0, NULL);
                if (IS_ERR(page))
                        return (char*)page;
        } else {
                page = find_get_page(mapping, 0);
                if (!page)
                        return ERR_PTR(-ECHILD);
                if (!PageUptodate(page)) {
                        /* drop the reference find_get_page() gave us */
                        put_page(page);
                        return ERR_PTR(-ECHILD);
                }
        }

        set_delayed_call(callback, page_put_link, page);
        /* page_address() is used below, so highmem mappings are a bug */
        BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
        body = page_address(page);
        nd_terminate_link(body, inode->i_size, PAGE_SIZE - 1);
        return body;
}

EXPORT_SYMBOL(page_get_link);

/*
 * Delayed-call cleanup registered by page_get_link(): drops the page
 * reference passed in @arg.
 */
void page_put_link(void *arg)
{
        struct page *page = arg;

        put_page(page);
}
EXPORT_SYMBOL(page_put_link);

/*
 * readlink for symlinks served by page_get_link(): fetch the body, copy it
 * to @buffer with readlink_copy() (which also passes through an ERR_PTR()
 * body as its errno) and then run the delayed cleanup.
 *
 * Returns whatever readlink_copy() returns.
 */
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
        DEFINE_DELAYED_CALL(done);
        const char *body;
        int copied;

        body = page_get_link(dentry, d_inode(dentry), &done);
        copied = readlink_copy(buffer, buflen, body);
        do_delayed_call(&done);

        return copied;
}
EXPORT_SYMBOL(page_readlink);

/*
 * Store a symlink body at offset 0 of the inode's mapping using the
 * address space's ->write_begin()/->write_end() pair.
 *
 * @inode:   inode of the new symlink
 * @symname: link body
 * @len:     len - 1 bytes of @symname are written (presumably @len counts
 *           the terminating NUL - confirm against the callers)
 *
 * When the mapping's gfp mask does not allow __GFP_FS, ->write_begin() runs
 * inside a memalloc_nofs_save()/restore() section. A short ->write_end()
 * restarts the whole write.
 *
 * Returns 0 after marking the inode dirty, or the error from
 * ->write_begin() / ->write_end().
 */
int page_symlink(struct inode *inode, const char *symname, int len)
{
        struct address_space *mapping = inode->i_mapping;
        const struct address_space_operations *aops = mapping->a_ops;
        bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
        struct page *page;
        void *fsdata = NULL;
        unsigned int flags;
        int ret;

        do {
                if (nofs)
                        flags = memalloc_nofs_save();
                ret = aops->write_begin(NULL, mapping, 0, len-1, &page, &fsdata);
                if (nofs)
                        memalloc_nofs_restore(flags);
                if (ret)
                        return ret;

                memcpy(page_address(page), symname, len-1);

                ret = aops->write_end(NULL, mapping, 0, len-1, len-1,
                                      page, fsdata);
                if (ret < 0)
                        return ret;
                /* fewer bytes accepted than requested: start over */
        } while (ret < len-1);

        mark_inode_dirty(inode);
        return 0;
}
EXPORT_SYMBOL(page_symlink);

/*
 * Inode operations for symlinks whose body is read through page_get_link();
 * ->get_link is the only method set here.
 */
const struct inode_operations page_symlink_inode_operations = {
        .get_link        = page_get_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);
































    1 






    1 

    1 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP cryptographic functions
 * Copyright (c) 2017 - 2019, Intel Corporation.
 *
 * Note: This code is based on mptcp_ctrl.c, mptcp_ipv4.c, and
 *       mptcp_ipv6 from multipath-tcp.org, authored by:
 *
 *       Sébastien Barré <sebastien.barre@uclouvain.be>
 *       Christoph Paasch <christoph.paasch@uclouvain.be>
 *       Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
 *       Gregory Detal <gregory.detal@uclouvain.be>
 *       Fabien Duchêne <fabien.duchene@uclouvain.be>
 *       Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
 *       Lavkesh Lahngir <lavkesh51@gmail.com>
 *       Andreas Ripke <ripke@neclab.eu>
 *       Vlad Dogaru <vlad.dogaru@intel.com>
 *       Octavian Purdila <octavian.purdila@intel.com>
 *       John Ronan <jronan@tssg.org>
 *       Catalin Nicutar <catalin.nicutar@gmail.com>
 *       Brandon Heller <brandonh@stanford.edu>
 */

#include <linux/kernel.h>
#include <crypto/sha2.h>
#include <asm/unaligned.h>

#include "protocol.h"

#define SHA256_DIGEST_WORDS (SHA256_DIGEST_SIZE / 4)

/* Derive the token and/or the initial data sequence number from @key.
 *
 * @key is converted to big-endian and hashed with SHA-256. The token is the
 * first 32 bits of the digest and the IDSN is the last 64 bits, both read as
 * big-endian values. Either output pointer may be NULL to skip that value.
 */
void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn)
{
        __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS];
        __be64 input = cpu_to_be64(key);

        sha256((__force u8 *)&input, sizeof(input), (u8 *)mptcp_hashed_key);

        if (token)
                *token = be32_to_cpu(mptcp_hashed_key[0]);
        /* The digest buffer is an array of 32-bit words and is only
         * guaranteed 32-bit alignment, so fetch the trailing 64 bits with
         * the unaligned-safe accessor rather than through a __be64 cast.
         */
        if (idsn)
                *idsn = get_unaligned_be64(&mptcp_hashed_key[6]);
}

/* Compute HMAC-SHA256 over @msg with the 16-byte key formed by the
 * big-endian encodings of @key1 followed by @key2.
 *
 * @len is the message length; values above SHA256_DIGEST_SIZE trigger a
 * one-time warning and are clamped. The SHA256_DIGEST_SIZE byte result is
 * written to @hmac.
 */
void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)
{
        u8 buf[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE];
        u8 *inner = &buf[SHA256_BLOCK_SIZE];
        u8 key[2 * sizeof(u64)];
        unsigned int k;

        if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE))
                len = SHA256_DIGEST_SIZE;

        put_unaligned_be64(key1, key);
        put_unaligned_be64(key2, key + sizeof(u64));

        /* First block: key xored with ipad; bytes past the key stay 0x36 */
        memset(buf, 0x36, SHA256_BLOCK_SIZE);
        for (k = 0; k < sizeof(key); k++)
                buf[k] ^= key[k];

        memcpy(inner, msg, len);

        /* The inner digest lands right behind the first block, so the same
         * buffer can be fed to the outer hash once the pad is rewritten
         */
        sha256(buf, SHA256_BLOCK_SIZE + len, inner);

        /* First block again, this time key xored with opad */
        memset(buf, 0x5C, SHA256_BLOCK_SIZE);
        for (k = 0; k < sizeof(key); k++)
                buf[k] ^= key[k];

        sha256(buf, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE, hmac);
}

#if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST)
EXPORT_SYMBOL_GPL(mptcp_crypto_hmac_sha);
#endif

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 










    2 




















































































































    2 
    2 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                ROUTE - implementation of the IP router.
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *                Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *                Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *                Alan Cox        :        Verify area fixes.
 *                Alan Cox        :        cli() protects routing changes
 *                Rui Oliveira        :        ICMP routing table updates
 *                (rco@di.uminho.pt)        Routing table insertion and update
 *                Linus Torvalds        :        Rewrote bits to be sensible
 *                Alan Cox        :        Added BSD route gw semantics
 *                Alan Cox        :        Super /proc >4K
 *                Alan Cox        :        MTU in route table
 *                Alan Cox        :        MSS actually. Also added the window
 *                                        clamper.
 *                Sam Lantinga        :        Fixed route matching in rt_del()
 *                Alan Cox        :        Routing cache support.
 *                Alan Cox        :        Removed compatibility cruft.
 *                Alan Cox        :        RTF_REJECT support.
 *                Alan Cox        :        TCP irtt support.
 *                Jonathan Naylor        :        Added Metric support.
 *        Miquel van Smoorenburg        :        BSD API fixes.
 *        Miquel van Smoorenburg        :        Metrics.
 *                Alan Cox        :        Use __u32 properly
 *                Alan Cox        :        Aligned routing errors more closely with BSD
 *                                        our system is still very different.
 *                Alan Cox        :        Faster /proc handling
 *        Alexey Kuznetsov        :        Massive rework to support tree based routing,
 *                                        routing caches and better behaviour.
 *
 *                Olaf Erb        :        irtt wasn't being copied right.
 *                Bjorn Ekwall        :        Kerneld route support.
 *                Alan Cox        :        Multicast fixed (I hope)
 *                Pavel Krauz        :        Limited broadcast fixed
 *                Mike McLagan        :        Routing by source
 *        Alexey Kuznetsov        :        End of old history. Split to fib.c and
 *                                        route.c and rewritten from scratch.
 *                Andi Kleen        :        Load-limit warning messages.
 *        Vitaly E. Lavrov        :        Transparent proxy revived after year coma.
 *        Vitaly E. Lavrov        :        Race condition in ip_route_input_slow.
 *        Tobias Ringstrom        :        Uninitialized res.type in ip_route_output_slow.
 *        Vladimir V. Ivanov        :        IP rule info (flowid) is really useful.
 *                Marc Boucher        :        routing by fwmark
 *        Robert Olsson                :        Added rt_cache statistics
 *        Arnaldo C. Melo                :        Convert proc stuff to seq_file
 *        Eric Dumazet                :        hashed spinlocks and rt_check_expire() fixes.
 *        Ilia Sotnikov                :        Ignore TOS on PMTUD and Redirect
 *        Ilia Sotnikov                :        Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/memblock.h>
#include <linux/socket.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/inet_dscp.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>

#include "fib_lookup.h"

/* Initial value of ip_rt_gc_timeout below: 300 seconds expressed in jiffies. */
#define RT_GC_TIMEOUT (300*HZ)

/* NOTE(review): these three look like defaults for PMTU / advertised-MSS
 * settings; their users are outside the part of the file reviewed here, so
 * confirm before relying on this description.
 */
#define DEFAULT_MIN_PMTU (512 + 20 + 20)
#define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
#define DEFAULT_MIN_ADVMSS 256
/* NOTE(review): not referenced in the reviewed part of the file. */
static int ip_rt_max_size;
/* Redirect rate-limiting knobs, see the algorithm description above
 * ip_rt_send_redirect().  The initial silence period equals
 * ip_rt_redirect_load << (ip_rt_redirect_number + 1), spelled out with
 * literal values.
 */
static int ip_rt_redirect_number __read_mostly        = 9;
static int ip_rt_redirect_load __read_mostly        = HZ / 50;
static int ip_rt_redirect_silence __read_mostly        = ((HZ / 50) << (9 + 1));
/* NOTE(review): presumably token-bucket cost/burst (in jiffies) for ICMP
 * errors generated by routing; users are not visible here - confirm.
 */
static int ip_rt_error_cost __read_mostly        = HZ;
static int ip_rt_error_burst __read_mostly        = 5 * HZ;

/* Lifetime given to nexthop exceptions created from ICMP redirects
 * (see __ip_do_redirect()).
 */
static int ip_rt_gc_timeout __read_mostly        = RT_GC_TIMEOUT;

/*
 *        Interface to generic destination cache.
 *
 *        Forward declarations of the callbacks that are plugged into
 *        ipv4_dst_ops below; they are needed because the ops table is
 *        defined before the functions themselves.
 */

INDIRECT_CALLABLE_SCOPE
struct dst_entry        *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int         ipv4_default_advmss(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int                ipv4_mtu(const struct dst_entry *dst);
static void                ipv4_negative_advice(struct sock *sk,
                                             struct dst_entry *dst);
static void                 ipv4_link_failure(struct sk_buff *skb);
static void                 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu,
                                           bool confirm_neigh);
static void                 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void                ipv4_dst_destroy(struct dst_entry *dst);

/*
 * dst_ops->cow_metrics hook for IPv4.
 *
 * It unconditionally triggers a warning and hands back NULL, i.e. this
 * implementation never produces a writable copy of the metrics; both
 * arguments are ignored.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        u32 *no_copy = NULL;

        WARN_ON(1);
        return no_copy;
}

/* Neighbour-related dst_ops callbacks, defined further down in this file. */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

/*
 * Callback table that plugs IPv4 routes into the generic destination
 * cache.  Every member is set with a designated initializer, so the
 * grouping below is purely for readability.
 */
static struct dst_ops ipv4_dst_ops = {
        .family = AF_INET,

        /* validity and teardown */
        .check = ipv4_dst_check,
        .destroy = ipv4_dst_destroy,
        .negative_advice = ipv4_negative_advice,
        .link_failure = ipv4_link_failure,

        /* metrics */
        .default_advmss = ipv4_default_advmss,
        .mtu = ipv4_mtu,
        .cow_metrics = ipv4_cow_metrics,

        /* path updates triggered by ICMP */
        .update_pmtu = ip_rt_update_pmtu,
        .redirect = ip_do_redirect,

        /* transmit and neighbour handling */
        .local_out = __ip_local_out,
        .neigh_lookup = ipv4_neigh_lookup,
        .confirm_neigh = ipv4_confirm_neigh,
};

/* Expands to the TC_PRIO_* constant named by @class. */
#define ECN_OR_COST(class)        TC_PRIO_##class

/*
 * 16-entry table of TC_PRIO_* packet priorities.  Entries come in pairs:
 * the even slot names the class directly and the odd slot goes through
 * ECN_OR_COST(), which currently yields the very same class.
 *
 * NOTE(review): presumably indexed by bits of the IP TOS field; the lookup
 * helper is not in this file section - confirm against its user.
 */
const __u8 ip_tos2prio[16] = {
        [0]  = TC_PRIO_BESTEFFORT,
        [1]  = ECN_OR_COST(BESTEFFORT),
        [2]  = TC_PRIO_BESTEFFORT,
        [3]  = ECN_OR_COST(BESTEFFORT),
        [4]  = TC_PRIO_BULK,
        [5]  = ECN_OR_COST(BULK),
        [6]  = TC_PRIO_BULK,
        [7]  = ECN_OR_COST(BULK),
        [8]  = TC_PRIO_INTERACTIVE,
        [9]  = ECN_OR_COST(INTERACTIVE),
        [10] = TC_PRIO_INTERACTIVE,
        [11] = ECN_OR_COST(INTERACTIVE),
        [12] = TC_PRIO_INTERACTIVE_BULK,
        [13] = ECN_OR_COST(INTERACTIVE_BULK),
        [14] = TC_PRIO_INTERACTIVE_BULK,
        [15] = ECN_OR_COST(INTERACTIVE_BULK),
};
EXPORT_SYMBOL(ip_tos2prio);

/* Per-CPU routing statistics; when CONFIG_PROC_FS is set they are printed
 * one CPU per line by rt_cpu_seq_show().
 */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
/* Increment @field of the executing CPU's rt_cache_stat via raw_cpu_inc(). */
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
/*
 * seq_file ->start for the "rt_cache" entry under net->proc_net.
 *
 * Only position 0 yields an element (the header token); any other
 * position ends the sequence immediately.
 */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        return *pos ? NULL : SEQ_START_TOKEN;
}

/*
 * seq_file ->next for the "rt_cache" entry: advance the position and
 * report that there is nothing after the header.
 */
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        (*pos)++;
        return NULL;
}

/* seq_file ->stop for the "rt_cache" entry: ->start took nothing to undo. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        /* intentionally empty */
}

/*
 * seq_file ->show for the "rt_cache" entry.
 *
 * The only element ever produced by the iterator is the start token, for
 * which the column header line (left-justified in a 127-character field)
 * is printed.  Always returns 0.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v != SEQ_START_TOKEN)
                return 0;

        seq_printf(seq, "%-127s\n",
                   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                   "HHUptod\tSpecDst");
        return 0;
}

/* Iterator for the header-only "rt_cache" entry under net->proc_net. */
static const struct seq_operations rt_cache_seq_ops = {
        .start = rt_cache_seq_start,
        .next = rt_cache_seq_next,
        .stop = rt_cache_seq_stop,
        .show = rt_cache_seq_show,
};

/*
 * seq_file ->start for the per-CPU statistics entry.
 *
 * Position 0 is the header token.  A position N > 0 refers to CPU N - 1;
 * CPUs that are not possible are skipped and *pos is moved to one past
 * the CPU actually returned.  Returns NULL once no CPU is left.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (!*pos)
                return SEQ_START_TOKEN;

        cpu = *pos-1;
        while (cpu < nr_cpu_ids) {
                if (cpu_possible(cpu)) {
                        *pos = cpu+1;
                        return &per_cpu(rt_cache_stat, cpu);
                }
                ++cpu;
        }

        return NULL;
}

/*
 * seq_file ->next for the per-CPU statistics entry.
 *
 * Starting at CPU number *pos, return the statistics of the first
 * possible CPU and set *pos to one past it.  When the CPUs are exhausted
 * the position is still advanced by one and NULL is returned.
 */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu = *pos;

        while (cpu < nr_cpu_ids) {
                if (cpu_possible(cpu)) {
                        *pos = cpu+1;
                        return &per_cpu(rt_cache_stat, cpu);
                }
                ++cpu;
        }

        ++*pos;
        return NULL;
}

/* seq_file ->stop for the per-CPU statistics entry: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
        /* intentionally empty */
}

/*
 * seq_file ->show for the per-CPU statistics entry.
 *
 * For the start token the column header is printed; otherwise @v is one
 * CPU's struct rt_cache_stat and a line of 17 hexadecimal columns is
 * emitted.  The first column is the slow dst entry count for
 * ipv4_dst_ops, and columns whose counter is no longer read from @v are
 * printed as a constant 0 so that the layout stays the same.
 * Always returns 0.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq, "entries  in_hit   in_slow_tot in_slow_mc in_no_route in_brd   in_martian_dst in_martian_src out_hit  out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        st = v;
        seq_printf(seq, "%08x %08x %08x    %08x   %08x    %08x %08x       "
                        "%08x       %08x %08x     %08x    %08x %08x   "
                        "%08x     %08x        %08x        %08x\n",
                   /* entries */
                   dst_entries_get_slow(&ipv4_dst_ops),

                   /* input path */
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   /* output path */
                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   /* placeholders only */
                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

/* Iterator for the per-CPU "rt_cache" entry under net->proc_net_stat. */
static const struct seq_operations rt_cpu_seq_ops = {
        .start = rt_cpu_seq_start,
        .next = rt_cpu_seq_next,
        .stop = rt_cpu_seq_stop,
        .show = rt_cpu_seq_show,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Show routine for the "rt_acct" proc entry.
 *
 * Sums the 256 per-CPU struct ip_rt_acct slots over all possible CPUs
 * into a temporary zeroed array and writes that array to @m as raw
 * binary data.
 *
 * Returns 0 on success or -ENOMEM if the temporary array cannot be
 * allocated.
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *total;
        unsigned int cpu, idx;

        total = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!total)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                struct ip_rt_acct *percpu;

                percpu = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, cpu);
                for (idx = 0; idx < 256; idx++) {
                        total[idx].o_bytes   += percpu[idx].o_bytes;
                        total[idx].o_packets += percpu[idx].o_packets;
                        total[idx].i_bytes   += percpu[idx].i_bytes;
                        total[idx].i_packets += percpu[idx].i_packets;
                }
        }

        seq_write(m, total, 256 * sizeof(struct ip_rt_acct));
        kfree(total);

        return 0;
}
#endif

/*
 * Per-netns init: create the routing proc entries.
 *
 *   net->proc_net/rt_cache       header-only listing (rt_cache_seq_ops)
 *   net->proc_net_stat/rt_cache  per-CPU statistics  (rt_cpu_seq_ops)
 *   net->proc_net/rt_acct        only with CONFIG_IP_ROUTE_CLASSID
 *
 * Entries already created are removed again if a later one fails.
 * Returns 0 on success and -ENOMEM on any failure.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
        if (!proc_create_seq("rt_cache", 0444, net->proc_net,
                             &rt_cache_seq_ops))
                goto fail;

        if (!proc_create_seq("rt_cache", 0444, net->proc_net_stat,
                             &rt_cpu_seq_ops))
                goto undo_cache;

#ifdef CONFIG_IP_ROUTE_CLASSID
        if (!proc_create_single("rt_acct", 0, net->proc_net,
                                rt_acct_proc_show))
                goto undo_stat;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
undo_stat:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
undo_cache:
        remove_proc_entry("rt_cache", net->proc_net);
fail:
        return -ENOMEM;
}

/*
 * Per-netns exit: remove every proc entry that ip_rt_do_proc_init()
 * creates for @net.
 */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        /* per-CPU statistics */
        remove_proc_entry("rt_cache", net->proc_net_stat);

        /* header-only listing */
        remove_proc_entry("rt_cache", net->proc_net);

#ifdef CONFIG_IP_ROUTE_CLASSID
        /* class accounting dump */
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

/* Per-network-namespace hooks that create/remove the routing proc entries. */
static struct pernet_operations ip_rt_proc_ops __net_initdata = {
        .init        = ip_rt_do_proc_init,
        .exit        = ip_rt_do_proc_exit,
};

/*
 * Register the proc pernet operations.  Returns whatever
 * register_pernet_subsys() returns.
 */
static int __init ip_rt_proc_init(void)
{
        int err = register_pernet_subsys(&ip_rt_proc_ops);

        return err;
}

#else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
        const int ok = 0;

        return ok;
}
#endif /* CONFIG_PROC_FS */

/*
 * A route is expired when the generation id stored in it differs from
 * the current IPv4 route generation id of the namespace owning its
 * device.
 */
static inline bool rt_is_expired(const struct rtable *rth)
{
        struct net *net = dev_net(rth->dst.dev);

        return rt_genid_ipv4(net) != rth->rt_genid;
}

/*
 * Invalidate the cached IPv4 routes of @net by bumping its route
 * generation id; routes carrying an older id are then reported as
 * expired by rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

/*
 * dst_ops->neigh_lookup for IPv4 routes.
 *
 * The neighbour key is chosen as follows:
 *   - route has an IPv4 gateway: the gateway address
 *   - route has an IPv6 gateway: the IPv6 gateway address
 *   - no gateway: the destination from @skb's IP header if @skb is given,
 *     otherwise the __be32 that @daddr points to
 *
 * The lookup runs under rcu_read_lock().  A reference is taken on a
 * successfully found neighbour; if its refcount already dropped to zero
 * NULL is returned instead.  An error pointer from the lookup helpers is
 * passed through unchanged.
 */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;

        rcu_read_lock();

        if (likely(rt->rt_gw_family == AF_INET)) {
                neigh = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                neigh = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                const __be32 *key;

                key = skb ? &ip_hdr(skb)->daddr : (const __be32 *)daddr;
                neigh = ip_neigh_gw4(dev, *key);
        }

        /* only hand out neighbours that are still alive */
        if (!IS_ERR(neigh) && !refcount_inc_not_zero(&neigh->refcnt))
                neigh = NULL;

        rcu_read_unlock();

        return neigh;
}

/*
 * dst_ops->confirm_neigh for IPv4 routes.
 *
 * With an IPv4 gateway the gateway's neighbour entry is confirmed, with
 * an IPv6 gateway the confirmation is delegated to the IPv6 stub.
 * Without a gateway the address @daddr points to is used, unless @daddr
 * is NULL or the route is multicast, broadcast or local, in which case
 * nothing is done.
 */
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        switch (rt->rt_gw_family) {
        case AF_INET:
                pkey = (const __be32 *)&rt->rt_gw4;
                break;
        case AF_INET6:
                __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
                return;
        default:
                if (!daddr)
                        return;
                if (rt->rt_flags &
                    (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))
                        return;
                break;
        }

        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

/* Hash tables of size 2048..262144 depending on RAM size.
 * Each bucket uses 8 bytes.
 *
 * A bucket is the pair ip_idents[i] (identifier counter) and
 * ip_tstamps[i] (jiffies value of the last use); ip_idents_mask reduces
 * a hash to a bucket index.  NOTE(review): the tables are allocated
 * outside the part of the file reviewed here.
 */
static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 *
 * Reserves @segs consecutive identifiers from the bucket selected by
 * @hash and returns the first one.
 */
static u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 bucket, old, now = (u32)jiffies;
        atomic_t *p_id;
        u32 *p_tstamp;
        u32 delta = 0;

        bucket = hash & ip_idents_mask;
        p_tstamp = ip_tstamps + bucket;
        p_id = ip_idents + bucket;
        old = READ_ONCE(*p_tstamp);

        /* Only the caller that wins the race to update the timestamp adds
         * the random perturbation, bounded by the number of jiffies elapsed
         * since the bucket was last used.
         */
        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = get_random_u32_below(now - old);

        /* If UBSAN reports an error here, please make sure your compiler
         * supports -fno-strict-overflow before reporting it: that was a bug
         * in UBSAN, and it has been fixed in GCC-8.
         */
        return atomic_add_return(segs + delta, p_id) - segs;
}

/*
 * Choose the IP identification field for @iph.
 *
 * The generator bucket is picked by a keyed hash over destination
 * address, source address and protocol; @segs identifiers are reserved
 * from it and the first of them is stored in iph->id in network byte
 * order.  The per-namespace hash key is filled with random bytes the
 * first time it is found to be all zero.
 */
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 bucket_hash;
        u32 first_id;

        /* Note the following code is not safe, but this is okay. */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        bucket_hash = siphash_3u32((__force u32)iph->daddr,
                                   (__force u32)iph->saddr,
                                   iph->protocol,
                                   &net->ipv4.ip_id_key);

        first_id = ip_idents_reserve(bucket_hash, segs);
        iph->id = htons(first_id);
}
EXPORT_SYMBOL(__ip_select_ident);

/*
 * Initialise the output flow key @fl4 for the addresses found in @iph.
 *
 * When @sk is given, the socket overrides the caller supplied @oif,
 * @mark, @tos and @prot (IPPROTO_RAW for header-included sockets,
 * otherwise the socket protocol) and also provides the routing scope;
 * without a socket the scope is RT_SCOPE_UNIVERSE.  The TOS is masked
 * with IPTOS_RT_MASK before it goes into the key.
 */
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk, const struct iphdr *iph,
                             int oif, __u8 tos, u8 prot, u32 mark,
                             int flow_flags)
{
        __u8 scope = RT_SCOPE_UNIVERSE;

        if (sk) {
                if (inet_test_bit(HDRINCL, sk))
                        prot = IPPROTO_RAW;
                else
                        prot = sk->sk_protocol;

                scope = ip_sock_rt_scope(sk);
                tos = ip_sock_rt_tos(sk);
                mark = READ_ONCE(sk->sk_mark);
                oif = sk->sk_bound_dev_if;
        }

        tos &= IPTOS_RT_MASK;
        flowi4_init_output(fl4, oif, mark, tos, scope, prot, flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

/*
 * Build a flow key from a packet: namespace and interface index come
 * from skb->dev, protocol and TOS from the IP header, the mark from the
 * skb.  An optional @sk may override parts of it, see __build_flow_key().
 */
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);

        __build_flow_key(dev_net(skb->dev), fl4, sk, iph,
                         skb->dev->ifindex, iph->tos, iph->protocol,
                         skb->mark, 0);
}

/*
 * Build a flow key purely from socket state.
 *
 * The destination is the socket's peer address unless the socket's IP
 * options carry a source route, in which case the first-hop address of
 * the options is used.  The options are read under rcu_read_lock().
 */
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *opts;
        __be32 daddr = inet->inet_daddr;
        int proto;

        rcu_read_lock();

        opts = rcu_dereference(inet->inet_opt);
        if (opts && opts->opt.srr)
                daddr = opts->opt.faddr;

        if (inet_test_bit(HDRINCL, sk))
                proto = IPPROTO_RAW;
        else
                proto = sk->sk_protocol;

        flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark),
                           ip_sock_rt_tos(sk) & IPTOS_RT_MASK,
                           ip_sock_rt_scope(sk),
                           proto,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);

        rcu_read_unlock();
}

/*
 * Build a flow key from @skb when a packet is available, otherwise from
 * the socket alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (!skb) {
                build_sk_flow_key(fl4, sk);
                return;
        }

        build_skb_flow_key(fl4, skb, sk);
}

/* Serialises writers of the nexthop exception hash chains: taken (with
 * bottom halves disabled) by update_or_create_fnhe() and asserted via
 * lockdep in fnhe_remove_oldest().
 */
static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

/*
 * Remove the exception with the oldest fnhe_stamp from the chain of
 * bucket @hash.
 *
 * Must be called with fnhe_lock held (checked through lockdep).  The
 * chain must not be empty: 'oldest' and 'oldest_p' are only set while
 * walking entries.  The one caller in this file, update_or_create_fnhe(),
 * only calls it while the chain is longer than its reclaim depth.
 */
static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
        struct fib_nh_exception *fnhe, *oldest = NULL;

        /* Walk the chain keeping, next to the oldest entry, the address of
         * the link that points at it so it can be unlinked afterwards.
         */
        for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
                fnhe = rcu_dereference_protected(*fnhe_p,
                                                 lockdep_is_held(&fnhe_lock));
                if (!fnhe)
                        break;
                if (!oldest ||
                    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
                        oldest = fnhe;
                        oldest_p = fnhe_p;
                }
        }
        /* Drop the cached routes, unlink the entry and free it after an
         * RCU grace period.
         */
        fnhe_flush_routes(oldest);
        *oldest_p = oldest->fnhe_next;
        kfree_rcu(oldest, rcu);
}

/*
 * Map a destination address to a nexthop exception hash bucket index.
 *
 * The address is run through a keyed siphash (the key is filled with
 * random bytes once, on first use) and the 64-bit result is folded down
 * to FNHE_HASH_SHIFT bits.
 */
static u32 fnhe_hashfun(__be32 daddr)
{
        static siphash_aligned_key_t fnhe_hash_key;

        net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));

        return hash_64(siphash_1u32((__force u32)daddr, &fnhe_hash_key),
                       FNHE_HASH_SHIFT);
}

/*
 * Copy the state recorded in exception @fnhe into route @rt: PMTU, the
 * MTU-locked flag and the expiry always; and, only when the exception
 * carries a gateway, the IPv4 gateway together with the flags marking
 * the route as redirected and as using a gateway.
 */
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        /* no redirect gateway recorded: nothing more to copy */
        if (!fnhe->fnhe_gw)
                return;

        rt->rt_flags |= RTCF_REDIRECTED;
        rt->rt_uses_gateway = 1;
        rt->rt_gw_family = AF_INET;
        rt->rt_gw4 = fnhe->fnhe_gw;
}

/*
 * Record a nexthop exception for @daddr on @nhc: update the matching
 * entry if one exists, otherwise allocate and link a new one.
 *
 * @nhc:     nexthop whose exception table (nhc_exceptions) is used; the
 *           table itself is allocated here on first use
 * @daddr:   destination address the exception applies to
 * @gw:      gateway to record; 0 leaves an existing entry's gateway as is
 * @pmtu:    path MTU to record; 0 leaves an existing entry's PMTU and
 *           lock flag as they are
 * @lock:    recorded as fnhe_mtu_locked together with @pmtu
 * @expires: expiry to record, raised to 1 if it is 0
 *
 * Everything between the hash computation and the return runs under
 * fnhe_lock with bottom halves disabled.  Allocations use GFP_ATOMIC and
 * a failure simply makes the function return without recording anything.
 */
static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                                  __be32 gw, u32 pmtu, bool lock,
                                  unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nhc->nhc_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nhc->nhc_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nhc->nhc_exceptions, hash);
        }

        /* From here on 'hash' is the bucket for @daddr. */
        hash += hval;

        /* Look for an existing entry; if none matches, 'depth' ends up as
         * the chain length.
         */
        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                /* NOTE(review): presumably 0 means "no expiry", hence the
                 * clamp to 1 - confirm.
                 */
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                /* Randomize max depth to avoid some side channels attacks. */
                int max_depth = FNHE_RECLAIM_DEPTH +
                                get_random_u32_below(FNHE_RECLAIM_DEPTH);

                /* Evict oldest entries until the chain is short enough. */
                while (depth > max_depth) {
                        fnhe_remove_oldest(hash);
                        depth--;
                }

                fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                if (!fnhe)
                        goto out_unlock;

                /* Fully initialise the entry before it is published at the
                 * head of the chain with rcu_assign_pointer() below.
                 */
                fnhe->fnhe_next = hash->chain;

                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                rcu_assign_pointer(hash->chain, fnhe);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;

                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        /* Stamp used by fnhe_remove_oldest() to pick eviction victims. */
        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

/*
 * Handle an ICMP redirect.
 *
 * @rt:         route the redirect refers to
 * @skb:        the received ICMP redirect; the advised gateway is read
 *              from its ICMP header and the current gateway from the
 *              source address of its IP header
 * @fl4:        flow used for the FIB lookup and as the exception's
 *              destination (fl4->daddr)
 * @kill_route: when true, @rt is marked obsolete once the redirect has
 *              been accepted
 *
 * The redirect is silently ignored unless its code is one of the four
 * known redirect codes and it was sent by the IPv4 gateway @rt currently
 * uses.  After that the advised gateway is validated; a failed check ends
 * in the reject path, which only logs (CONFIG_IP_ROUTE_VERBOSE and martian
 * logging enabled on the device).
 *
 * NOTE(review): the RCU-flavoured helpers used here suggest callers hold
 * rcu_read_lock() - confirm against the call sites.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        /* Only the gateway currently in use may redirect us. */
        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        /* Basic sanity of the advised gateway and of the receive policy. */
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        /* Find, or create, the neighbour entry of the advised gateway. */
        n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
                        /* Not resolved yet: only kick off resolution, the
                         * redirect itself is not applied this time.
                         */
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc;

                                fib_select_path(net, &res, fl4, skb);
                                nhc = FIB_RES_NHC(res);
                                /* Record the new gateway as an exception that
                                 * expires after ip_rt_gc_timeout.
                                 */
                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                /* NOTE(review): skb->data is taken to be the IP header
                 * quoted inside the ICMP message - confirm.
                 */
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        /* Empty statement: a label must be followed by a statement even
         * when the block above is compiled out.
         */
        ;
}

/*
 * dst_ops->redirect for IPv4.
 *
 * Builds a flow key from the IP header found at skb->data (overridden by
 * @sk where one is given, see __build_flow_key()) and processes the
 * redirect for the route behind @dst, asking for that route to be marked
 * obsolete if the redirect is accepted.
 */
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct rtable *rt = dst_rtable(dst);
        struct flowi4 fl4;

        __build_flow_key(dev_net(skb->dev), &fl4, sk, iph,
                         skb->dev->ifindex, iph->tos, iph->protocol,
                         skb->mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static void ipv4_negative_advice(struct sock *sk,
                                 struct dst_entry *dst)
{
        struct rtable *rt = dst_rtable(dst);

        if ((dst->obsolete > 0) ||
            (rt->rt_flags & RTCF_REDIRECTED) ||
            rt->dst.expires)
                sk_dst_reset(sk);
}

/*
 * Algorithm:
 *        1. The first ip_rt_redirect_number redirects are sent
 *           with exponential backoff, then we stop sending them at all,
 *           assuming that the host ignores our redirects.
 *        2. If we did not see packets requiring redirects
 *           during ip_rt_redirect_silence, we assume that the host
 *           forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

/* Send an ICMP host redirect to the sender of @skb, rate limited per peer
 * (source address + L3 master ifindex) as described above.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        /* Snapshot the per-device settings under RCU; in_dev itself is not
         * used once the read-side section ends.
         */
        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                /* No peer entry to keep state in: send without rate limiting. */
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything, but set
         * peer->rate_last to the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect. The required gap doubles with every redirect already
         * sent (ip_rt_redirect_load << n_redirects).
         */
        if (peer->n_redirects == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
                /* Log once, when the redirect budget has just been used up. */
                if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
        }
out_put_peer:
        inet_putpeer(peer);
}

/* Handle an skb whose route carries an error (rt->dst.error): account the
 * drop, send a rate-limited ICMP destination-unreachable where applicable,
 * and free the skb. Always returns 0.
 */
static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        SKB_DR(reason);
        bool send;
        int code;

        /* skb->dev is an L3 master device: use the device recorded in
         * IPCB(skb)->iif instead.
         */
        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        /* Forwarding is off on the input device: only account the drop,
         * never answer with ICMP.
         */
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        SKB_DR_SET(reason, IP_INADDRERRORS);
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        SKB_DR_SET(reason, IP_INNOROUTES);
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        /* Map the route error to an ICMP unreachable code; any error not
         * listed here is dropped silently.
         */
        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                SKB_DR_SET(reason, IP_INNOROUTES);
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        /* Per-peer token bucket: tokens accrue one per elapsed jiffy up to
         * ip_rt_error_burst, and each ICMP costs ip_rt_error_cost. Without a
         * peer entry the ICMP is sent unconditionally.
         */
        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:        kfree_skb_reason(skb, reason);
        return 0;
}

/* Record @mtu as the path MTU towards fl4->daddr in a nexthop exception.
 *
 * Nothing is done when the route's MTU is locked, when @mtu would raise the
 * current MTU, or when the same value is already stored and the entry is
 * still in the first half of its lifetime. Values below ip_rt_min_pmtu are
 * clamped and passed on with the lock flag set.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct net *net = dev_net(dst->dev);
        struct fib_result fib_res;
        bool clamped = false;
        u32 cur_mtu;

        if (ip_mtu_locked(dst))
                return;

        /* This path only ever lowers the MTU. */
        cur_mtu = ipv4_mtu(dst);
        if (mtu > cur_mtu)
                return;

        if (mtu < net->ipv4.ip_rt_min_pmtu) {
                clamped = true;
                mtu = min(cur_mtu, net->ipv4.ip_rt_min_pmtu);
        }

        if (!clamped && rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(net, fl4, &fib_res, 0) != 0)
                goto unlock;

        fib_select_path(net, &fib_res, fl4, NULL);
        update_or_create_fnhe(FIB_RES_NHC(fib_res), fl4->daddr, 0, mtu, clamped,
                              jiffies + net->ipv4.ip_rt_mtu_expires);
unlock:
        rcu_read_unlock();
}

/* Build the flow key for @sk/@skb and record @mtu for it.
 * @confirm_neigh is not used by this implementation.
 */
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu,
                              bool confirm_neigh)
{
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);

        /* A bridge-port oif would make the lookup fail for bridged
         * encapsulations, so leave the output interface unspecified.
         */
        if (skb && netif_is_any_bridge_port(skb->dev))
                fl4.flowi4_oif = 0;

        __ip_rt_update_pmtu(dst_rtable(dst), &fl4, mtu);
}

/* Look up the output route for the IP header at skb->data (as seen through
 * @oif and @protocol) and record @mtu for it. Does nothing if the route
 * lookup fails.
 */
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol,
                         IP4_REPLY_MARK(net, skb->mark), 0);

        rt = __ip_route_output_key(net, &fl4);
        if (IS_ERR(rt))
                return;

        __ip_rt_update_pmtu(rt, &fl4, mtu);
        ip_rt_put(rt);
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

/* Record @mtu for the flow of @sk via a fresh output route lookup, without
 * touching the socket's cached dst.
 */
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct net *net = sock_net(sk);
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        /* Fall back to the reply mark when the flow key has no mark set. */
        if (fl4.flowi4_mark == 0)
                fl4.flowi4_mark = IP4_REPLY_MARK(net, skb->mark);

        rt = __ip_route_output_key(net, &fl4);
        if (IS_ERR(rt))
                return;

        __ip_rt_update_pmtu(rt, &fl4, mtu);
        ip_rt_put(rt);
}

/* Update the path MTU for the flow described by the IP header at skb->data
 * on behalf of socket @sk, and replace the socket's cached route when it is
 * no longer valid afterwards. Runs with the socket bh-locked.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        /* true once rt is a freshly looked-up route rather than odst */
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        /* Socket is owned by user context or has no cached dst: only record
         * the MTU through a separate lookup and leave sk's dst alone.
         */
        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = dst_rtable(odst);
        /* Cached dst failed its validity check: work on a fresh route. */
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu);

        /* rt no longer passes dst_check() after the update: look the route
         * up again, dropping the intermediate one if we own it.
         */
        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        /* Drops the reference taken by sk_dst_get(); odst is still NULL on
         * the earliest exit path.
         */
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

/* Process a redirect for the flow described by the IP header at skb->data:
 * resolve the output route for it and let __ip_do_redirect() update it
 * (without killing the route). Does nothing if the lookup fails.
 */
void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct rtable *route;
        struct flowi4 fl4;

        __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0);

        route = __ip_route_output_key(net, &fl4);
        if (IS_ERR(route))
                return;

        __ip_do_redirect(route, skb, &fl4, false);
        ip_rt_put(route);
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

/* Socket flavour of ipv4_redirect(): the flow key is taken from @sk and the
 * IP header at skb->data. Does nothing if the route lookup fails.
 */
void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct net *net = sock_net(sk);
        struct rtable *route;
        struct flowi4 fl4;

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        route = __ip_route_output_key(net, &fl4);
        if (IS_ERR(route))
                return;

        __ip_do_redirect(route, skb, &fl4, false);
        ip_rt_put(route);
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

/* Validate a cached IPv4 dst. Returns @dst while it is still usable and NULL
 * once it has been invalidated or its generation has expired. @cookie is not
 * used.
 */
INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
                                                         u32 cookie)
{
        struct rtable *rt = dst_rtable(dst);

        /* Every IPv4 dst starts out with ->obsolete equal to
         * DST_OBSOLETE_FORCE_CHK, so validation always lands here.
         *
         * A PMTU/redirect update that invalidates a route changes
         * ->obsolete to DST_OBSOLETE_KILL or DST_OBSOLETE_DEAD; any value
         * other than FORCE_CHK therefore means the dst must not be reused.
         */
        if (dst->obsolete == DST_OBSOLETE_FORCE_CHK && !rt_is_expired(rt))
                return dst;

        return NULL;
}
EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);

/* Send an ICMP host-unreachable for @skb after re-validating its IPv4 header
 * and recompiling its IP options into a local ip_options structure. Returns
 * silently, sending nothing, if the header is implausible or the options do
 * not compile.
 */
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct net_device *dev;
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        /* ihl > 5 means the header carries options. */
        if (ip_hdr(skb)->ihl > 5) {
                /* Make sure the whole header, options included, is pullable. */
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                /* Fall back to the route's device when skb->dev is unset. */
                dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
                res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

/* Link-failure handling: report host-unreachable for @skb and, if the skb
 * has a route attached, call dst_set_expires() on it with a timeout of 0.
 */
static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (!rt)
                return;

        dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is off the fast path.
 *
 * BTW remember: "addr" is allowed to be not aligned
 * in IP options!
 */

/* Write the 4-byte source address to use for @skb on route @rt to @addr.
 *
 * For output routes this is the packet's own source address. Otherwise a FIB
 * lookup on the packet's flow supplies the preferred source, and if that
 * lookup fails an address is selected on the route's device for the nexthop.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct iphdr *iph = ip_hdr(skb);
                struct flowi4 fl4 = {
                        .daddr = iph->daddr,
                        .saddr = iph->saddr,
                        .flowi4_tos = RT_TOS(iph->tos),
                        .flowi4_oif = rt->dst.dev->ifindex,
                        .flowi4_iif = skb->dev->ifindex,
                        .flowi4_mark = skb->mark,
                };

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        /* memcpy rather than a store: addr may be unaligned (see above). */
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
/* Fill in whichever 16-bit half of the route's tclassid is still zero from
 * the corresponding half of @tag; halves already set are left untouched.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->dst.tclassid & low_half) == 0)
                rt->dst.tclassid |= tag & low_half;
        if ((rt->dst.tclassid & high_half) == 0)
                rt->dst.tclassid |= tag & high_half;
}
#endif

/* Default advertised MSS for @dst: the route MTU minus the TCP and IP header
 * sizes, raised to at least ip_rt_min_advmss and capped at
 * IPV4_MAX_PMTU minus the same headers.
 */
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int min_advmss = dev_net(dst->dev)->ipv4.ip_rt_min_advmss;
        unsigned int advmss = ipv4_mtu(dst) - header_size;

        if (advmss < min_advmss)
                advmss = min_advmss;

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

/* MTU of @dst, obtained from ip_dst_mtu_maybe_forward() with its second
 * argument set to false.
 */
INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        return ip_dst_mtu_maybe_forward(dst, false);
}
EXPORT_INDIRECT_CALLABLE(ipv4_mtu);

/* Remove the nexthop exception for @daddr from @nhc's exception hash.
 *
 * Under fnhe_lock the entry is unlinked from its bucket chain, its cached
 * routes are flushed, and the entry itself is freed with kfree_rcu(). Does
 * nothing if no entry matches.
 *
 * NOTE(review): the exception table pointer is not NULL-checked here; the
 * only caller visible in this chunk (find_exception()) has already seen a
 * non-NULL table.
 */
static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nhc->nhc_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        /* Advance to the bucket for this destination. */
        hash += hval;

        /* fnhe_p always points at the link that references fnhe, so the
         * entry can be unlinked with a single pointer update.
         */
        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

/* Look up the nexthop exception for @daddr on @nhc.
 *
 * Returns the matching entry, or NULL when the nexthop has no exception
 * table, no entry matches, or the matching entry has expired (in which case
 * it is deleted on the spot).
 */
static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
                                               __be32 daddr)
{
        struct fnhe_hash_bucket *table = rcu_dereference(nhc->nhc_exceptions);
        struct fib_nh_exception *entry;

        if (!table)
                return NULL;

        entry = rcu_dereference(table[fnhe_hashfun(daddr)].chain);
        while (entry) {
                if (entry->fnhe_daddr != daddr) {
                        entry = rcu_dereference(entry->fnhe_next);
                        continue;
                }

                /* An expiry of 0 means the entry carries no deadline. */
                if (entry->fnhe_expires &&
                    time_after(jiffies, entry->fnhe_expires)) {
                        ip_del_fnhe(nhc, daddr);
                        return NULL;
                }

                return entry;
        }

        return NULL;
}

/* MTU selection:
 * 1. route MTU (fi->fib_mtu), when the MTU metric is locked on the route or
 *    the ip_fwd_use_pmtu sysctl is set
 * 2. mtu from nexthop exception, unless the exception has expired
 * 3. mtu from egress device, capped at IP_MAX_MTU
 *
 * A zero value at one step falls through to the next. The lwtunnel headroom
 * of the nexthop is subtracted from the selected MTU before returning.
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
        struct fib_nh_common *nhc = res->nhc;
        struct net_device *dev = nhc->nhc_dev;
        struct fib_info *fi = res->fi;
        u32 mtu = 0;

        if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
                mtu = fi->fib_mtu;

        if (likely(!mtu)) {
                struct fib_nh_exception *fnhe;

                fnhe = find_exception(nhc, daddr);
                /* Ignore an exception whose expiry time has been reached. */
                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
                        mtu = fnhe->fnhe_pmtu;
        }

        if (likely(!mtu))
                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

/* Apply nexthop exception @fnhe to route @rt and, when @do_cache is set,
 * install @rt as the exception's cached input or output route.
 *
 * Everything happens under fnhe_lock. Returns true only when @rt was stored
 * in the exception; false if it was not cached or if @fnhe no longer
 * describes @daddr.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        /* ip_del_fnhe() zeroes fnhe_daddr, so a mismatch here means the
         * exception must not be bound to this route.
         */
        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                /* Pick the cache slot matching the route's direction. */
                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                /* Generation changed: the exception's learned data is stale.
                 * Reset it and drop the routes cached in it.
                 */
                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                /* No gateway on the route: use the destination itself. */
                if (!rt->rt_gw4) {
                        rt->rt_gw4 = daddr;
                        rt->rt_gw_family = AF_INET;
                }

                if (do_cache) {
                        /* The slot holds its own reference on rt; the route
                         * it replaces loses its device and the slot's
                         * reference.
                         */
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

/* Try to install @rt as the cached route of nexthop @nhc: the single input
 * slot for input routes, or this CPU's output slot otherwise.
 *
 * Returns true when @rt was stored. Returns false when the slot changed
 * between reading it and the cmpxchg(); the reference taken for the slot is
 * dropped again in that case.
 */
static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nhc->nhc_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
        }
        orig = *p;

        /* hold dst before doing cmpxchg() to avoid race condition
         * on this dst
         */
        dst_hold(&rt->dst);
        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                /* Swap succeeded: the displaced route moves to the uncached
                 * list and loses the reference the slot held on it.
                 */
                if (orig) {
                        rt_add_uncached_list(orig);
                        dst_release(&orig->dst);
                }
        } else {
                dst_release(&rt->dst);
                ret = false;
        }

        return ret;
}

/* List of routes that are not stored in a nexthop or exception cache slot.
 * One instance exists per CPU (rt_uncached_list); rt_flush_dev() walks all
 * of them to find routes that reference a given device.
 */
struct uncached_list {
        spinlock_t                lock;        /* protects head */
        struct list_head        head;        /* linked through rtable->dst.rt_uncached */
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

/* Append @rt to the current CPU's uncached list and remember which list it
 * is on, so rt_del_uncached_list() can take the matching lock later.
 */
void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->dst.rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->dst.rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

/* Take @rt off the uncached list it was added to, if it is on one. */
void rt_del_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul;

        if (list_empty(&rt->dst.rt_uncached))
                return;

        ul = rt->dst.rt_uncached_list;

        spin_lock_bh(&ul->lock);
        list_del_init(&rt->dst.rt_uncached);
        spin_unlock_bh(&ul->lock);
}

/* Destroy hook for IPv4 dsts: release the metrics reference and make sure
 * the route is no longer on an uncached list.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
        ip_dst_metrics_put(dst);
        rt_del_uncached_list(dst_rtable(dst));
}

/* Detach every uncached route from @dev.
 *
 * Walks the uncached list of each possible CPU; every route that points at
 * @dev is retargeted to blackhole_netdev (moving the tracked device
 * reference along with it) and removed from the list.
 */
void rt_flush_dev(struct net_device *dev)
{
        struct rtable *rt, *safe;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                /* Unlocked peek to skip empty lists without taking the lock. */
                if (list_empty(&ul->head))
                        continue;

                spin_lock_bh(&ul->lock);
                /* _safe variant: entries are deleted while iterating. */
                list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = blackhole_netdev;
                        netdev_ref_replace(dev, blackhole_netdev,
                                           &rt->dst.dev_tracker, GFP_ATOMIC);
                        list_del_init(&rt->dst.rt_uncached);
                }
                spin_unlock_bh(&ul->lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return        rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

/* Fill in the nexthop-dependent parts of a freshly allocated route.
 *
 * With a fib_info (@fi) the gateway, metrics, class id and lwtunnel state
 * are taken from the selected nexthop of @res, and the route is then cached
 * either in the nexthop exception @fnhe or, when @do_cache is set, in the
 * nexthop itself. Routes that end up in neither cache, and routes without a
 * fib_info, are put on the uncached list.
 *
 * @type is not referenced in the body. @itag (and res->tclassid with
 * multiple tables) fill in class-id halves that are still unset.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag,
                           const bool do_cache)
{
        bool cached = false;

        if (fi) {
                struct fib_nh_common *nhc = FIB_RES_NHC(*res);

                /* Only a link-scope nexthop with a gateway family set makes
                 * this a gatewayed route.
                 */
                if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
                        rt->rt_uses_gateway = 1;
                        rt->rt_gw_family = nhc->nhc_gw_family;
                        /* only INET and INET6 are supported */
                        if (likely(nhc->nhc_gw_family == AF_INET))
                                rt->rt_gw4 = nhc->nhc_gw.ipv4;
                        else
                                rt->rt_gw6 = nhc->nhc_gw.ipv6;
                }

                ip_dst_init_metrics(&rt->dst, fi->fib_metrics);

#ifdef CONFIG_IP_ROUTE_CLASSID
                if (nhc->nhc_family == AF_INET) {
                        struct fib_nh *nh;

                        nh = container_of(nhc, struct fib_nh, nh_common);
                        rt->dst.tclassid = nh->nh_tclassid;
                }
#endif
                rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
                /* An exception, when present, takes precedence over the
                 * nexthop's own cache slot.
                 */
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
                else if (do_cache)
                        cached = rt_cache_route(nhc, rt);
                if (unlikely(!cached)) {
                        /* The route was not stored in a nexthop exception or
                         * in the FIB nexthop (caching was not requested or
                         * did not succeed). Make sure it has a gateway
                         * address and track it on the uncached list instead.
                         */
                        if (!rt->rt_gw4) {
                                rt->rt_gw_family = AF_INET;
                                rt->rt_gw4 = daddr;
                        }
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

/* Allocate an IPv4 route bound to @dev and initialise every rtable field.
 *
 * @flags becomes rt_flags and @type becomes rt_type; @noxfrm requests
 * DST_NOXFRM on the dst. The output handler is ip_output, and routes with
 * RTCF_LOCAL additionally get ip_local_deliver as input handler.
 *
 * Returns the new route, or NULL if the allocation failed.
 */
struct rtable *rt_dst_alloc(struct net_device *dev,
                            unsigned int flags, u16 type,
                            bool noxfrm)
{
        struct rtable *rt = dst_alloc(&ipv4_dst_ops, dev,
                                      DST_OBSOLETE_FORCE_CHK,
                                      (noxfrm ? DST_NOXFRM : 0));

        if (!rt)
                return NULL;

        /* Identity of the route. */
        rt->rt_genid = rt_genid_ipv4(dev_net(dev));
        rt->rt_flags = flags;
        rt->rt_type = type;

        /* Everything else starts out cleared. */
        rt->rt_is_input = 0;
        rt->rt_iif = 0;
        rt->rt_pmtu = 0;
        rt->rt_mtu_locked = 0;
        rt->rt_uses_gateway = 0;
        rt->rt_gw_family = 0;
        rt->rt_gw4 = 0;

        rt->dst.output = ip_output;
        if (flags & RTCF_LOCAL)
                rt->dst.input = ip_local_deliver;

        return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

/* Allocate a new route on @dev that duplicates @rt.
 *
 * The dst flags, the rtable fields, the input/output handlers, the error
 * code and a new reference on the lwtunnel state are copied from @rt;
 * rt_genid is taken from @dev's namespace and lastuse is set to now.
 *
 * Every rtable field that rt_dst_alloc() initialises is also set here, so
 * the clone never carries values left over from the allocator.
 *
 * Returns the new route, or NULL if the allocation failed.
 */
struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
{
        struct rtable *new_rt;

        new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
                           rt->dst.flags);

        if (new_rt) {
                new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
                new_rt->rt_flags = rt->rt_flags;
                new_rt->rt_type = rt->rt_type;
                new_rt->rt_is_input = rt->rt_is_input;
                new_rt->rt_iif = rt->rt_iif;
                new_rt->rt_pmtu = rt->rt_pmtu;
                new_rt->rt_mtu_locked = rt->rt_mtu_locked;
                /* rt_dst_alloc() sets this field explicitly; a clone has to
                 * carry it over as well instead of leaving it unset.
                 */
                new_rt->rt_uses_gateway = rt->rt_uses_gateway;
                new_rt->rt_gw_family = rt->rt_gw_family;
                if (rt->rt_gw_family == AF_INET)
                        new_rt->rt_gw4 = rt->rt_gw4;
                else if (rt->rt_gw_family == AF_INET6)
                        new_rt->rt_gw6 = rt->rt_gw6;
                else
                        /* No gateway: match rt_dst_alloc(), which clears
                         * rt_gw4; code in this file tests rt_gw4 directly.
                         */
                        new_rt->rt_gw4 = 0;

                new_rt->dst.input = rt->dst.input;
                new_rt->dst.output = rt->dst.output;
                new_rt->dst.error = rt->dst.error;
                new_rt->dst.lastuse = jiffies;
                new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
        }
        return new_rt;
}
EXPORT_SYMBOL(rt_dst_clone);

/* Sanity-check the source of a multicast packet.
 *
 * Returns 0 when the source is acceptable, -EINVAL for obviously bad
 * sources or non-IP skbs, or the negative error from fib_validate_source().
 * *itag is filled in by fib_validate_source() when that path is taken.
 *
 * called in rcu_read_lock() section
 */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                          u8 tos, struct net_device *dev,
                          struct in_device *in_dev, u32 *itag)
{
        int ret;

        /* Primary sanity checks. */
        if (!in_dev)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
                return -EINVAL;

        if (skb->protocol != htons(ETH_P_IP))
                return -EINVAL;

        if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
                return -EINVAL;

        /* A zeronet source is only tolerated for link-local multicast
         * destinations or IGMP.
         */
        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr) &&
                    ip_hdr(skb)->protocol != IPPROTO_IGMP)
                        return -EINVAL;
                return 0;
        }

        ret = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                  in_dev, itag);
        return ret < 0 ? ret : 0;
}

/* Build and attach an input route for a multicast packet.
 *
 * @our is non-zero when the packet should also be delivered locally.
 * Returns 0 on success, the error from ip_mc_validate_source(), or -ENOBUFS
 * when no route could be allocated.
 *
 * called in rcu_read_lock() section
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                             u8 tos, struct net_device *dev, int our)
{
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        unsigned int rt_flags;
        struct rtable *mc_rt;
        u32 itag = 0;
        int ret;

        ret = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
        if (ret)
                return ret;

        rt_flags = our ? (RTCF_MULTICAST | RTCF_LOCAL) : RTCF_MULTICAST;

        if (IN_DEV_ORCONF(in_dev, NOPOLICY))
                IPCB(skb)->flags |= IPSKB_NOPOLICY;

        mc_rt = rt_dst_alloc(dev_net(dev)->loopback_dev, rt_flags,
                             RTN_MULTICAST, false);
        if (!mc_rt)
                return -ENOBUFS;

        /* Input-only route: sending through it is a bug. */
        mc_rt->rt_is_input = 1;
        mc_rt->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
        mc_rt->dst.tclassid = itag;
#endif

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                mc_rt->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_drop(skb);
        skb_dst_set(skb, &mc_rt->dst);
        return 0;
}


/* Account a packet with a martian source address and, with verbose routing
 * enabled and martian logging on for the device, log it (rate limited)
 * together with a hex dump of its link-layer header when one is available.
 */
static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (!IN_DEV_LOG_MARTIANS(in_dev) || !net_ratelimit())
                return;

        /*
         *        RFC1812 recommendation, if source is martian,
         *        the only hint is MAC header.
         */
        pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                &daddr, &saddr, dev->name);

        if (!dev->hard_header_len || !skb_mac_header_was_set(skb))
                return;

        print_hex_dump(KERN_WARNING, "ll header: ",
                       DUMP_PREFIX_OFFSET, 16, 1,
                       skb_mac_header(skb),
                       dev->hard_header_len, false);
#endif
}

/*
 * Attach an input route for a packet that is to be forwarded through the
 * nexthop carried in @res: either a still-valid cached route, or a freshly
 * allocated one whose input handler is ip_forward().
 *
 * Returns 0 on success, the negative result of fib_validate_source() when
 * the source is rejected, -EINVAL when the output device has no in_device or
 * a non-IP frame would leave through its ingress device without proxy-ARP
 * PVLAN enabled, and -ENOBUFS when the route cannot be allocated.
 *
 * called in rcu_read_lock() section
 */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct fib_nh_common *nhc = FIB_RES_NHC(*res);
        struct fib_nh_exception *fnhe;
        struct in_device *out_dev;
        struct rtable *rth;
        bool do_cache;
        u32 itag = 0;
        int err;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(nhc->nhc_dev);
        if (!out_dev) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);
                return err;
        }

        /* Only routes backed by a fib_info and carrying no tag are cached. */
        do_cache = res->fi && !itag;

        if (skb->protocol == htons(ETH_P_IP)) {
                /* A non-zero validation result on a packet leaving through
                 * its ingress device makes it a redirect candidate.
                 */
                if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev)) {
                        __be32 gw = 0;

                        if (nhc->nhc_gw_family == AF_INET)
                                gw = nhc->nhc_gw.ipv4;

                        if (IN_DEV_SHARED_MEDIA(out_dev) ||
                            inet_addr_onlink(out_dev, saddr, gw))
                                IPCB(skb)->flags |= IPSKB_DOREDIRECT;
                }
        } else if (out_dev == in_dev &&
                   IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
                /* Not IP (i.e. ARP). Do not create route, if it is
                 * invalid for proxy arp. DNAT routes are always valid.
                 *
                 * The proxy arp feature has been extended to allow ARP
                 * replies back to the same interface, to support
                 * Private VLAN switch technologies. See arp.c.
                 */
                return -EINVAL;
        }

        if (IN_DEV_ORCONF(in_dev, NOPOLICY))
                IPCB(skb)->flags |= IPSKB_NOPOLICY;

        fnhe = find_exception(nhc, daddr);
        if (do_cache) {
                /* A nexthop exception's cached route wins over the nexthop's. */
                if (fnhe)
                        rth = rcu_dereference(fnhe->fnhe_rth_input);
                else
                        rth = rcu_dereference(nhc->nhc_rth_input);

                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        return 0;
                }
        }

        rth = rt_dst_alloc(out_dev->dev, 0, res->type,
                           IN_DEV_ORCONF(out_dev, NOXFRM));
        if (!rth)
                return -ENOBUFS;

        rth->rt_is_input = 1;
        rth->dst.input = ip_forward;
        RT_CACHE_STAT_INC(in_slow_tot);

        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
                       do_cache);
        lwtunnel_set_redirect(&rth->dst);
        skb_dst_set(skb, &rth->dst);

        return 0;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* Fill the IPv4 source/destination of @hash_keys from @skb.
 *
 * To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses: for an unfragmented-offset ICMP
 * error whose embedded IP header can be read, the addresses of that embedded
 * header are used. In every other case the outer header's addresses are used.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
                                 struct flow_keys *hash_keys)
{
        const struct iphdr *outer_iph = ip_hdr(skb);
        const struct iphdr *key_iph = outer_iph;
        struct iphdr _inner_iph;
        struct icmphdr _icmph;

        if (unlikely(outer_iph->protocol == IPPROTO_ICMP) &&
            likely((outer_iph->frag_off & htons(IP_OFFSET)) == 0)) {
                /* The ICMP header sits right behind the outer IP header. */
                const int icmp_off = outer_iph->ihl * 4;
                const struct icmphdr *icmph;

                icmph = skb_header_pointer(skb, icmp_off, sizeof(_icmph),
                                           &_icmph);
                if (icmph && icmp_is_err(icmph->type)) {
                        const struct iphdr *inner_iph;

                        inner_iph = skb_header_pointer(skb,
                                                       icmp_off + sizeof(_icmph),
                                                       sizeof(_inner_iph),
                                                       &_inner_iph);
                        if (inner_iph)
                                key_iph = inner_iph;
                }
        }

        hash_keys->addrs.v4addrs.src = key_iph->saddr;
        hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}

/* Hash the outer-header fields of @skb selected by the
 * fib_multipath_hash_fields sysctl.
 *
 * Returns 0 without touching *@p_has_inner when no outer field is selected.
 * Otherwise *@p_has_inner is set to whether the dissector reported an
 * encapsulation, and the hash of the selected fields is returned.
 */
static u32 fib_multipath_custom_hash_outer(const struct net *net,
                                           const struct sk_buff *skb,
                                           bool *p_has_inner)
{
        const u32 fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
        struct flow_keys keys, hash_keys;

        if ((fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK) == 0)
                return 0;

        /* Dissect the outer flow only; stop as soon as a tunnel is seen. */
        skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
        *p_has_inner = (keys.control.flags & FLOW_DIS_ENCAPSULATION) != 0;

        /* Start from all-zero keys and copy over just the selected fields. */
        memset(&hash_keys, 0, sizeof(hash_keys));
        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;

        if (fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
                hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
        if (fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
                hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
        if (fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
                hash_keys.basic.ip_proto = keys.basic.ip_proto;
        if (fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
                hash_keys.ports.src = keys.ports.src;
        if (fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
                hash_keys.ports.dst = keys.ports.dst;

        return fib_multipath_hash_from_keys(net, &hash_keys);
}

/* Hash the inner (encapsulated) fields of @skb selected by the
 * fib_multipath_hash_fields sysctl.
 *
 * Returns 0 when @has_inner is false, when no inner field is selected, or
 * when the full dissection finds no encapsulation; otherwise returns the hash
 * of the selected inner fields.
 */
static u32 fib_multipath_custom_hash_inner(const struct net *net,
                                           const struct sk_buff *skb,
                                           bool has_inner)
{
        const u32 fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
        struct flow_keys keys, hash_keys;

        /* We assume the packet carries an encapsulation, but if none was
         * encountered during dissection of the outer flow, then there is no
         * point in calling the flow dissector again. The same holds when no
         * inner field takes part in the hash.
         */
        if (!has_inner || !(fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
                return 0;

        memset(&hash_keys, 0, sizeof(hash_keys));
        skb_flow_dissect_flow_keys(skb, &keys, 0);

        if ((keys.control.flags & FLOW_DIS_ENCAPSULATION) == 0)
                return 0;

        /* The inner addresses can be IPv4 or IPv6; other types add none. */
        switch (keys.control.addr_type) {
        case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
                        hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
                        hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
                break;
        case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
                        hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
                        hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
                        hash_keys.tags.flow_label = keys.tags.flow_label;
                break;
        default:
                break;
        }

        if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
                hash_keys.basic.ip_proto = keys.basic.ip_proto;
        if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
                hash_keys.ports.src = keys.ports.src;
        if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
                hash_keys.ports.dst = keys.ports.dst;

        return fib_multipath_hash_from_keys(net, &hash_keys);
}

/* Custom-fields multipath hash of a packet: the outer-field hash and the
 * inner-field hash are computed separately and mixed with jhash_2words().
 */
static u32 fib_multipath_custom_hash_skb(const struct net *net,
                                         const struct sk_buff *skb)
{
        /* Stays true when the outer pass returns early without dissecting. */
        bool has_inner = true;
        u32 outer_hash;
        u32 inner_hash;

        outer_hash = fib_multipath_custom_hash_outer(net, skb, &has_inner);
        inner_hash = fib_multipath_custom_hash_inner(net, skb, has_inner);

        return jhash_2words(outer_hash, inner_hash, 0);
}

/* Custom-fields multipath hash computed from a flow description instead of
 * a packet. Only the outer fields selected by the fib_multipath_hash_fields
 * sysctl are taken from @fl4; returns 0 when no outer field is selected.
 */
static u32 fib_multipath_custom_hash_fl4(const struct net *net,
                                         const struct flowi4 *fl4)
{
        const u32 fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
        struct flow_keys hash_keys;

        if ((fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK) == 0)
                return 0;

        /* Start from all-zero keys and copy over just the selected fields. */
        memset(&hash_keys, 0, sizeof(hash_keys));
        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;

        if (fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
                hash_keys.addrs.v4addrs.src = fl4->saddr;
        if (fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
                hash_keys.addrs.v4addrs.dst = fl4->daddr;
        if (fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
                hash_keys.basic.ip_proto = fl4->flowi4_proto;
        if (fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
                hash_keys.ports.src = fl4->fl4_sport;
        if (fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
                hash_keys.ports.dst = fl4->fl4_dport;

        return fib_multipath_hash_from_keys(net, &hash_keys);
}

/* Compute the multipath hash according to the fib_multipath_hash_policy
 * sysctl:
 *   0 - IPv4 source/destination addresses (inner ones for ICMP errors);
 *   1 - addresses, ports and IP protocol;
 *   2 - addresses of the fully dissected (possibly inner) flow;
 *   3 - the custom field selection.
 * Any other policy value leaves the hash at 0.
 *
 * If skb is set it will be used and fl4 can be NULL. @flkeys, when non-NULL,
 * supplies already dissected keys for policy 1. A non-zero
 * fl4->flowi4_multipath_hash is mixed into the result. The returned value is
 * the 32-bit hash shifted right by one bit.
 */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
                       const struct sk_buff *skb, struct flow_keys *flkeys)
{
        u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
        struct flow_keys hash_keys;
        u32 mhash = 0;

        switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
        case 0:
                memset(&hash_keys, 0, sizeof(hash_keys));
                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                if (!skb) {
                        hash_keys.addrs.v4addrs.src = fl4->saddr;
                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
                } else {
                        ip_multipath_l3_keys(skb, &hash_keys);
                }
                mhash = fib_multipath_hash_from_keys(net, &hash_keys);
                break;
        case 1:
                /* skb is currently provided only when forwarding;
                 * short-circuit if we already have L4 hash present
                 */
                if (skb && skb->l4_hash)
                        return skb_get_hash_raw(skb) >> 1;

                memset(&hash_keys, 0, sizeof(hash_keys));
                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                if (!skb) {
                        hash_keys.addrs.v4addrs.src = fl4->saddr;
                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
                        hash_keys.ports.src = fl4->fl4_sport;
                        hash_keys.ports.dst = fl4->fl4_dport;
                        hash_keys.basic.ip_proto = fl4->flowi4_proto;
                } else {
                        struct flow_keys keys;

                        /* Dissect the outer flow unless the caller did. */
                        if (!flkeys) {
                                skb_flow_dissect_flow_keys(skb, &keys,
                                                           FLOW_DISSECTOR_F_STOP_AT_ENCAP);
                                flkeys = &keys;
                        }

                        hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
                        hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
                        hash_keys.ports.src = flkeys->ports.src;
                        hash_keys.ports.dst = flkeys->ports.dst;
                        hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
                }
                mhash = fib_multipath_hash_from_keys(net, &hash_keys);
                break;
        case 2:
                memset(&hash_keys, 0, sizeof(hash_keys));
                /* skb is currently provided only when forwarding */
                if (!skb) {
                        /* Same as case 0 */
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                        hash_keys.addrs.v4addrs.src = fl4->saddr;
                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
                } else {
                        struct flow_keys keys;

                        skb_flow_dissect_flow_keys(skb, &keys, 0);
                        /* Inner can be v4 or v6 */
                        switch (keys.control.addr_type) {
                        case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                                hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
                                hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
                                break;
                        case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                                hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
                                hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
                                hash_keys.tags.flow_label = keys.tags.flow_label;
                                hash_keys.basic.ip_proto = keys.basic.ip_proto;
                                break;
                        default:
                                /* Same as case 0 */
                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                                ip_multipath_l3_keys(skb, &hash_keys);
                                break;
                        }
                }
                mhash = fib_multipath_hash_from_keys(net, &hash_keys);
                break;
        case 3:
                mhash = skb ? fib_multipath_custom_hash_skb(net, skb) :
                              fib_multipath_custom_hash_fl4(net, fl4);
                break;
        }

        /* Fold in the hash supplied through the flow, if there is one. */
        if (multipath_hash)
                mhash = jhash_2words(mhash, multipath_hash, 0);

        return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Create the input routing cache entry for a packet.
 *
 * With CONFIG_IP_ROUTE_MULTIPATH, a result whose fib_info has more than one
 * path first has one path selected from the multipath hash of @skb (using
 * the pre-dissected @hkeys when provided), and the skb is flagged with
 * IPSKB_MULTIPATH. The route itself is built by __mkroute_input(), whose
 * return value is passed through.
 */
static int ip_mkroute_input(struct sk_buff *skb,
                            struct fib_result *res,
                            struct in_device *in_dev,
                            __be32 daddr, __be32 saddr, u32 tos,
                            struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res->fi && fib_info_num_path(res->fi) > 1) {
                int mp_hash;

                /* No flowi4 is passed: the hash comes from the packet. */
                mp_hash = fib_multipath_hash(res->fi->fib_net, NULL, skb,
                                             hkeys);
                fib_select_multipath(res, mp_hash);
                IPCB(skb)->flags |= IPSKB_MULTIPATH;
        }
#endif

        /* create a routing cache entry */
        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/* Implements all the saddr-related checks as ip_route_input_slow(),
 * assuming daddr is valid and the destination is not a local broadcast one.
 * Uses the provided hint instead of performing a route lookup.
 *
 * Returns 0 after copying the dst of @hint to @skb, -EINVAL when @dev has no
 * in_device or @saddr is a martian, or the negative result of
 * fib_validate_source() when the source fails validation for a local route.
 */
int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                      u8 tos, struct net_device *dev,
                      const struct sk_buff *hint)
{
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        struct rtable *rt = skb_rtable(hint);
        struct net *net = dev_net(dev);
        int err = -EINVAL;
        u32 tag = 0;

        if (!in_dev)
                return -EINVAL;

        /* Multicast, limited broadcast, zeronet and (unless route_localnet
         * allows it) loopback sources are martians.
         */
        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            ipv4_is_zeronet(saddr) ||
            (ipv4_is_loopback(saddr) &&
             !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)))
                goto martian_source;

        /* The source is validated only when the hint is a local route. */
        if (rt->rt_type == RTN_LOCAL) {
                tos &= IPTOS_RT_MASK;
                err = fib_validate_source(skb, saddr, daddr, tos, 0, dev,
                                          in_dev, &tag);
                if (err < 0)
                        goto martian_source;
        }

        skb_dst_copy(skb, hint);
        return 0;

martian_source:
        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
        return err;
}

/* get device for dst_alloc with local routes
 *
 * When @res carries a fib_info and a nexthop, the L3 master device of the
 * nexthop device is used if l3mdev_master_dev_rcu() returns one; in every
 * other case the namespace's loopback device is returned.
 */
static struct net_device *ip_rt_get_dev(struct net *net,
                                        const struct fib_result *res)
{
        struct net_device *dev = NULL;

        if (res->fi && res->nhc)
                dev = l3mdev_master_dev_rcu(res->nhc->nhc_dev);

        if (!dev)
                dev = net->loopback_dev;

        return dev;
}

/*
 *        Slow-path input route resolution for a received packet.
 *
 *        NOTE. We drop all packets that have local source
 *        addresses, because every properly looped back packet
 *        must already have the correct destination attached by the output
 *        routine. Changes in the enforced policies must also be applied to
 *        ip_route_use_hint().
 *
 *        This approach solves two big problems:
 *        1. Non-simplex devices are handled properly.
 *        2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *        Returns 0 once a dst has been attached to @skb (this includes the
 *        "no route" case, which attaches an RTN_UNREACHABLE route), the
 *        result of ip_mkroute_input() for forwarded packets, -EINVAL for a
 *        disabled device, martian addresses or non-IP broadcast frames, a
 *        negative fib_validate_source() result for a rejected source, and
 *        -ENOBUFS when the route cannot be allocated.
 *
 *        Called with rcu_read_lock().
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                               u8 tos, struct net_device *dev,
                               struct fib_result *res)
{
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        struct flow_keys *flkeys = NULL, _flkeys;
        struct net    *net = dev_net(dev);
        struct ip_tunnel_info *tun_info;
        int                err = -EINVAL;
        unsigned int        flags = 0;
        u32                itag = 0;
        struct rtable        *rth;
        struct flowi4        fl4;
        bool do_cache = true;

        /* IP on this device is disabled. */

        if (!in_dev)
                goto out;

        /* Check for the most weird martians, which can be not detected
         * by fib_lookup.
         */

        /* Record the tunnel id in the flow key when the skb carries tunnel
         * metadata that is not marked for transmit; otherwise use 0.
         */
        tun_info = skb_tunnel_info(skb);
        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
                fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
        else
                fl4.flowi4_tun_key.tun_id = 0;
        skb_dst_drop(skb);

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
                goto martian_source;

        /* No FIB result yet; the broadcast path below relies on this. */
        res->fi = NULL;
        res->table = NULL;
        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
                goto brd_input;

        /* Accept zero addresses only to limited broadcast;
         * I even do not know to fix it or not. Waiting for complains :-)
         */
        if (ipv4_is_zeronet(saddr))
                goto martian_source;

        if (ipv4_is_zeronet(daddr))
                goto martian_destination;

        /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
         * and call it once if daddr or/and saddr are loopback addresses
         */
        if (ipv4_is_loopback(daddr)) {
                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
                        goto martian_destination;
        } else if (ipv4_is_loopback(saddr)) {
                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
                        goto martian_source;
        }

        /*
         *        Now we are ready to route packet.
         */
        fl4.flowi4_l3mdev = 0;
        fl4.flowi4_oif = 0;
        fl4.flowi4_iif = dev->ifindex;
        fl4.flowi4_mark = skb->mark;
        fl4.flowi4_tos = tos;
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
        fl4.flowi4_flags = 0;
        fl4.daddr = daddr;
        fl4.saddr = saddr;
        fl4.flowi4_uid = sock_net_uid(net, NULL);
        fl4.flowi4_multipath_hash = 0;

        /* When the early dissection reports success, keep its keys for the
         * multipath hash; otherwise zero the L4 fields of the flow key,
         * which nothing above has initialized.
         */
        if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
                flkeys = &_flkeys;
        } else {
                fl4.flowi4_proto = 0;
                fl4.fl4_sport = 0;
                fl4.fl4_dport = 0;
        }

        err = fib_lookup(net, &fl4, res, 0);
        if (err != 0) {
                /* With forwarding disabled, report host unreachable instead
                 * of the lookup error.
                 */
                if (!IN_DEV_FORWARD(in_dev))
                        err = -EHOSTUNREACH;
                goto no_route;
        }

        if (res->type == RTN_BROADCAST) {
                if (IN_DEV_BFORWARD(in_dev))
                        goto make_route;
                /* not do cache if bc_forwarding is enabled */
                if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING))
                        do_cache = false;
                goto brd_input;
        }

        if (res->type == RTN_LOCAL) {
                err = fib_validate_source(skb, saddr, daddr, tos,
                                          0, dev, in_dev, &itag);
                if (err < 0)
                        goto martian_source;
                goto local_input;
        }

        /* Anything left would have to be forwarded. */
        if (!IN_DEV_FORWARD(in_dev)) {
                err = -EHOSTUNREACH;
                goto no_route;
        }
        if (res->type != RTN_UNICAST)
                goto martian_destination;

make_route:
        err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
out:        return err;

brd_input:
        /* Broadcast delivery is accepted for ETH_P_IP frames only. */
        if (skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        /* A zeronet source is accepted here without source validation. */
        if (!ipv4_is_zeronet(saddr)) {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto martian_source;
        }
        flags |= RTCF_BROADCAST;
        res->type = RTN_BROADCAST;
        RT_CACHE_STAT_INC(in_brd);

local_input:
        if (IN_DEV_ORCONF(in_dev, NOPOLICY))
                IPCB(skb)->flags |= IPSKB_NOPOLICY;

        /* Caching needs a fib_info to hang the route on and no tag. */
        do_cache &= res->fi && !itag;
        if (do_cache) {
                struct fib_nh_common *nhc = FIB_RES_NHC(*res);

                /* Reuse the nexthop's cached input route while it is valid. */
                rth = rcu_dereference(nhc->nhc_rth_input);
                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        err = 0;
                        goto out;
                }
        }

        rth = rt_dst_alloc(ip_rt_get_dev(net, res),
                           flags | RTCF_LOCAL, res->type, false);
        if (!rth)
                goto e_nobufs;

        rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->rt_is_input = 1;

        RT_CACHE_STAT_INC(in_slow_tot);
        /* Reached via no_route: the route's input handler becomes
         * ip_error() and it carries the positive form of the error that led
         * here.
         */
        if (res->type == RTN_UNREACHABLE) {
                rth->dst.input= ip_error;
                rth->dst.error= -err;
                rth->rt_flags        &= ~RTCF_LOCAL;
        }

        if (do_cache) {
                struct fib_nh_common *nhc = FIB_RES_NHC(*res);

                rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
                /* Chain lwtunnel_input in front of the current input
                 * handler, remembering the latter in orig_input.
                 */
                if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
                        WARN_ON(rth->dst.input == lwtunnel_input);
                        rth->dst.lwtstate->orig_input = rth->dst.input;
                        rth->dst.input = lwtunnel_input;
                }

                /* If the nexthop cache does not take the route, put it on
                 * the uncached list instead.
                 */
                if (unlikely(!rt_cache_route(nhc, rth)))
                        rt_add_uncached_list(rth);
        }
        skb_dst_set(skb, &rth->dst);
        err = 0;
        goto out;

no_route:
        /* Fall into local_input with an RTN_UNREACHABLE result; err still
         * holds the failure and is consumed there.
         */
        RT_CACHE_STAT_INC(in_no_route);
        res->type = RTN_UNREACHABLE;
        res->fi = NULL;
        res->table = NULL;
        goto local_input;

        /*
         *        Do not cache martian addresses: they should be logged (RFC1812)
         */
martian_destination:
        RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev))
                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
                                     &daddr, &saddr, dev->name);
#endif

e_inval:
        err = -EINVAL;
        goto out;

e_nobufs:
        err = -ENOBUFS;
        goto out;

martian_source:
        /* err is either the initial -EINVAL or the negative result of
         * fib_validate_source().
         */
        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
        goto out;
}

/* Route a received packet: multicast destinations are filtered and routed
 * here, everything else goes to ip_route_input_slow().
 *
 * For a multicast destination, returns -EINVAL when @dev has no in_device or
 * nobody wants the group, otherwise the result of ip_route_input_mc().
 *
 * called with rcu_read_lock held
 */
static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                              u8 tos, struct net_device *dev, struct fib_result *res)
{
        struct in_device *in_dev;
        int deliver;
        int our;

        if (!ipv4_is_multicast(daddr))
                return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);

        /* Multicast recognition logic is moved from route cache to here.
         * The problem was that too many Ethernet cards have broken/missing
         * hardware multicast filters :-( As a result, a host on a
         * multicasting network acquires a lot of useless route cache
         * entries, sort of SDR messages from all the world. Now we try to
         * get rid of them. Really, provided the software IP multicast filter
         * is organized reasonably (at least, hashed), it does not result in
         * a slowdown compared with route cache reject entries.
         * Note that multicast routers are not affected, because a
         * route cache entry is created eventually.
         */
        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return -EINVAL;

        our = ip_check_mc_rcu(in_dev, daddr, saddr, ip_hdr(skb)->protocol);

        /* check l3 master if no match yet */
        if (!our && netif_is_l3_slave(dev)) {
                struct in_device *l3_in_dev = __in_dev_get_rcu(skb->dev);

                if (l3_in_dev)
                        our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
                                              ip_hdr(skb)->protocol);
        }

        /* Route the packet when it is ours or, with multicast routing
         * built in, when it is non-link-local and the ingress device
         * forwards multicast.
         */
        deliver = our;
#ifdef CONFIG_IP_MROUTE
        if (!deliver)
                deliver = !ipv4_is_local_multicast(daddr) &&
                          IN_DEV_MFORWARD(in_dev);
#endif
        if (!deliver)
                return -EINVAL;

        return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
}

/* Resolve the input route for @skb under its own RCU read-side critical
 * section. @tos is masked with IPTOS_RT_MASK before the lookup. Returns the
 * result of ip_route_input_rcu().
 */
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                         u8 tos, struct net_device *dev)
{
        struct fib_result res;
        int ret;

        tos &= IPTOS_RT_MASK;

        rcu_read_lock();
        ret = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL(ip_route_input_noref);

/* Build, or fetch from the nexthop / exception cache, the output route for
 * an already resolved lookup.
 *
 * @res:      FIB lookup result (its fib_info may be NULL)
 * @fl4:      flow being routed
 * @orig_oif: output interface index requested by the caller (0 if none)
 * @dev_out:  device the packet will leave through
 * @flags:    initial RTCF_* flags, extended here according to route type
 *
 * Returns the route, ERR_PTR(-EINVAL) when @dev_out has no in_device or the
 * addresses are not acceptable, or ERR_PTR(-ENOBUFS) when allocation fails.
 *
 * called with rcu_read_lock()
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
                                       const struct flowi4 *fl4, int orig_oif,
                                       struct net_device *dev_out,
                                       unsigned int flags)
{
        struct fib_info *fi = res->fi;
        struct fib_nh_exception *fnhe;
        struct in_device *in_dev;
        u16 type = res->type;
        struct rtable *rth;
        bool do_cache;

        in_dev = __in_dev_get_rcu(dev_out);
        if (!in_dev)
                return ERR_PTR(-EINVAL);

        /* Unless route_localnet is enabled, a loopback source may only
         * leave through a loopback or L3 master device.
         */
        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(fl4->saddr) &&
                    !(dev_out->flags & IFF_LOOPBACK) &&
                    !netif_is_l3_master(dev_out))
                        return ERR_PTR(-EINVAL);

        /* The destination address class overrides the looked-up type. */
        if (ipv4_is_lbcast(fl4->daddr))
                type = RTN_BROADCAST;
        else if (ipv4_is_multicast(fl4->daddr))
                type = RTN_MULTICAST;
        else if (ipv4_is_zeronet(fl4->daddr))
                return ERR_PTR(-EINVAL);

        if (dev_out->flags & IFF_LOOPBACK)
                flags |= RTCF_LOCAL;

        do_cache = true;
        if (type == RTN_BROADCAST) {
                flags |= RTCF_BROADCAST | RTCF_LOCAL;
                fi = NULL;
        } else if (type == RTN_MULTICAST) {
                flags |= RTCF_MULTICAST | RTCF_LOCAL;
                /* RTCF_LOCAL is kept only when ip_check_mc_rcu() matches
                 * the group on this device; such routes are not cached.
                 */
                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
                                     fl4->flowi4_proto))
                        flags &= ~RTCF_LOCAL;
                else
                        do_cache = false;
                /* If multicast route do not exist use
                 * default one, but do not gateway in this case.
                 * Yes, it is hack.
                 */
                if (fi && res->prefixlen < 4)
                        fi = NULL;
        } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
                   (orig_oif != dev_out->ifindex)) {
                /* For local routes that require a particular output interface
                 * we do not want to cache the result.  Caching the result
                 * causes incorrect behaviour when there are multiple source
                 * addresses on the interface, the end result being that if the
                 * intended recipient is waiting on that interface for the
                 * packet he won't receive it because it will be delivered on
                 * the loopback interface and the IP_PKTINFO ipi_ifindex will
                 * be set to the loopback interface as well.
                 */
                do_cache = false;
        }

        fnhe = NULL;
        /* Without a fib_info there is nothing to cache the route on. */
        do_cache &= fi != NULL;
        if (fi) {
                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
                struct rtable __rcu **prth;

                /* The exception is looked up even when not caching: it is
                 * handed to rt_set_nexthop() below.
                 */
                fnhe = find_exception(nhc, fl4->daddr);
                if (!do_cache)
                        goto add;
                if (fnhe) {
                        prth = &fnhe->fnhe_rth_output;
                } else {
                        /* Do not use or populate the nexthop cache when
                         * FLOWI_FLAG_KNOWN_NH is set, unless the nexthop has
                         * a gateway with link scope.
                         */
                        if (unlikely(fl4->flowi4_flags &
                                     FLOWI_FLAG_KNOWN_NH &&
                                     !(nhc->nhc_gw_family &&
                                       nhc->nhc_scope == RT_SCOPE_LINK))) {
                                do_cache = false;
                                goto add;
                        }
                        prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
                }
                /* Return the cached route if it is still valid and a
                 * reference can be taken on it.
                 */
                rth = rcu_dereference(*prth);
                if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
                        return rth;
        }

add:
        rth = rt_dst_alloc(dev_out, flags, type,
                           IN_DEV_ORCONF(in_dev, NOXFRM));
        if (!rth)
                return ERR_PTR(-ENOBUFS);

        rth->rt_iif = orig_oif;

        RT_CACHE_STAT_INC(out_slow_tot);

        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                /* Local broadcast/multicast on a non-loopback device is
                 * sent through ip_mc_output().
                 */
                if (flags & RTCF_LOCAL &&
                    !(dev_out->flags & IFF_LOOPBACK)) {
                        rth->dst.output = ip_mc_output;
                        RT_CACHE_STAT_INC(out_slow_mc);
                }
#ifdef CONFIG_IP_MROUTE
                if (type == RTN_MULTICAST) {
                        /* Non-link-local groups on a multicast-forwarding
                         * device get ip_mr_input() as input handler and
                         * ip_mc_output() as output handler.
                         */
                        if (IN_DEV_MFORWARD(in_dev) &&
                            !ipv4_is_local_multicast(fl4->daddr)) {
                                rth->dst.input = ip_mr_input;
                                rth->dst.output = ip_mc_output;
                        }
                }
#endif
        }

        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
        lwtunnel_set_redirect(&rth->dst);

        return rth;
}

/*
 * Major route resolver routine.
 */

/*
 * Locked front end of the output route resolver.
 *
 * @net: network namespace to look the route up in
 * @fl4: flow key; its iif is forced to LOOPBACK_IFINDEX and its tos is
 *       masked with IPTOS_RT_MASK before the lookup
 * @skb: packet pointer handed through unchanged to the RCU resolver
 *
 * Wraps ip_route_output_key_hash_rcu() in rcu_read_lock() /
 * rcu_read_unlock() and returns its result unchanged.
 */
struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
                                        const struct sk_buff *skb)
{
        struct fib_result res = {};
        struct rtable *rt;

        /* Start from an explicit "nothing looked up yet" result. */
        res.type = RTN_UNSPEC;
        res.fi = NULL;
        res.table = NULL;
        res.tclassid = 0;

        /* Normalise the flow key for an output lookup. */
        fl4->flowi4_tos &= IPTOS_RT_MASK;
        fl4->flowi4_iif = LOOPBACK_IFINDEX;

        rcu_read_lock();
        rt = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
        rcu_read_unlock();

        return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_key_hash);

/*
 * RCU flavour of the output route resolver.
 *
 * Validates and completes the flow key in @fl4 (source address, output
 * interface, destination), performs the FIB lookup into @res when one is
 * needed and finally builds the route through __mkroute_output().
 *
 * Only RCU lookups are used (dev_get_by_index_rcu(), __in_dev_get_rcu()),
 * so the caller is expected to hold rcu_read_lock(), as
 * ip_route_output_key_hash() does.
 *
 * @fl4 is modified in place: saddr, daddr and flowi4_oif may be filled in
 * or overridden.  @res is updated as well (type, fi, table).
 *
 * Returns the result of __mkroute_output(), or an ERR_PTR() carrying
 * -EINVAL, -ENETUNREACH, -ENODEV or the error returned by fib_lookup().
 */
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
                                            struct fib_result *res,
                                            const struct sk_buff *skb)
{
        struct net_device *dev_out = NULL;
        int orig_oif = fl4->flowi4_oif;
        unsigned int flags = 0;
        struct rtable *rth;
        int err;

        /* A source address was supplied: sanity-check it first. */
        if (fl4->saddr) {
                if (ipv4_is_multicast(fl4->saddr) ||
                    ipv4_is_lbcast(fl4->saddr) ||
                    ipv4_is_zeronet(fl4->saddr)) {
                        rth = ERR_PTR(-EINVAL);
                        goto out;
                }

                /* Default error for the "goto out" paths inside this block. */
                rth = ERR_PTR(-ENETUNREACH);

                /* I removed check for oif == dev_out->oif here.
                 * It was wrong for two reasons:
                 * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
                 *    is assigned to multiple interfaces.
                 * 2. Moreover, we are allowed to send packets with saddr
                 *    of another iface. --ANK
                 */

                if (fl4->flowi4_oif == 0 &&
                    (ipv4_is_multicast(fl4->daddr) ||
                     ipv4_is_lbcast(fl4->daddr))) {
                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
                        dev_out = __ip_dev_find(net, fl4->saddr, false);
                        if (!dev_out)
                                goto out;

                        /* Special hack: user can direct multicasts
                         * and limited broadcast via necessary interface
                         * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
                         * This hack is not just for fun, it allows
                         * vic,vat and friends to work.
                         * They bind socket to loopback, set ttl to zero
                         * and expect that it will work.
                         * From the viewpoint of routing cache they are broken,
                         * because we are not allowed to build multicast path
                         * with loopback source addr (look, routing cache
                         * cannot know, that ttl is zero, so that packet
                         * will not leave this host and route is valid).
                         * Luckily, this hack is good workaround.
                         */

                        fl4->flowi4_oif = dev_out->ifindex;
                        goto make_route;
                }

                /* Unless FLOWI_FLAG_ANYSRC is set, saddr must be a local address. */
                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
                        if (!__ip_dev_find(net, fl4->saddr, false))
                                goto out;
                }
        }


        /* An output interface was requested: it must exist, be up and
         * have an in_device attached.
         */
        if (fl4->flowi4_oif) {
                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
                rth = ERR_PTR(-ENODEV);
                if (!dev_out)
                        goto out;

                /* RACE: Check return value of inet_select_addr instead. */
                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
                        rth = ERR_PTR(-ENETUNREACH);
                        goto out;
                }
                /* Link-local multicast, limited broadcast and IGMP skip the
                 * FIB lookup entirely and go out on the requested device.
                 */
                if (ipv4_is_local_multicast(fl4->daddr) ||
                    ipv4_is_lbcast(fl4->daddr) ||
                    fl4->flowi4_proto == IPPROTO_IGMP) {
                        if (!fl4->saddr)
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_LINK);
                        goto make_route;
                }
                if (!fl4->saddr) {
                        if (ipv4_is_multicast(fl4->daddr))
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              fl4->flowi4_scope);
                        else if (!fl4->daddr)
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_HOST);
                }
        }

        /* No destination given: route to ourselves over the loopback
         * device, using saddr (or 127.0.0.1 if that is unset too).
         */
        if (!fl4->daddr) {
                fl4->daddr = fl4->saddr;
                if (!fl4->daddr)
                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
                dev_out = net->loopback_dev;
                fl4->flowi4_oif = LOOPBACK_IFINDEX;
                res->type = RTN_LOCAL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

        err = fib_lookup(net, fl4, res, 0);
        if (err) {
                res->fi = NULL;
                res->table = NULL;
                if (fl4->flowi4_oif &&
                    (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
                        /* Apparently, routing tables are wrong. Assume,
                         * that the destination is on link.
                         *
                         * WHY? DW.
                         * Because we are allowed to send to iface
                         * even if it has NO routes and NO assigned
                         * addresses. When oif is specified, routing
                         * tables are looked up with only one purpose:
                         * to catch if destination is gatewayed, rather than
                         * direct. Moreover, if MSG_DONTROUTE is set,
                         * we send packet, ignoring both routing tables
                         * and ifaddr state. --ANK
                         *
                         *
                         * We could make it even if oif is unknown,
                         * likely IPv6, but we do not.
                         */

                        if (fl4->saddr == 0)
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_LINK);
                        res->type = RTN_UNICAST;
                        goto make_route;
                }
                rth = ERR_PTR(err);
                goto out;
        }

        /* Destination is one of our own addresses. */
        if (res->type == RTN_LOCAL) {
                if (!fl4->saddr) {
                        if (res->fi->fib_prefsrc)
                                fl4->saddr = res->fi->fib_prefsrc;
                        else
                                fl4->saddr = fl4->daddr;
                }

                /* L3 master device is the loopback for that domain */
                dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
                        net->loopback_dev;

                /* make sure orig_oif points to fib result device even
                 * though packet rx/tx happens over loopback or l3mdev
                 */
                orig_oif = FIB_RES_OIF(*res);

                fl4->flowi4_oif = dev_out->ifindex;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

        /* Ordinary forwarded destination: select a path in the FIB result
         * and use its device.
         */
        fib_select_path(net, res, fl4, skb);

        dev_out = FIB_RES_DEV(*res);

make_route:
        /* Every successful path ends here; errors skip straight to "out". */
        rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);

out:
        return rth;
}

/*
 * dst_ops attached to the entries allocated by ipv4_blackhole_route().
 * Apart from default_advmss and neigh_lookup, which use the IPv4
 * implementations, every callback is the generic dst_blackhole_* one.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
        .family                        = AF_INET,
        .default_advmss                = ipv4_default_advmss,
        .neigh_lookup                = ipv4_neigh_lookup,
        .check                        = dst_blackhole_check,
        .cow_metrics                = dst_blackhole_cow_metrics,
        .update_pmtu                = dst_blackhole_update_pmtu,
        .redirect                = dst_blackhole_redirect,
        .mtu                        = dst_blackhole_mtu,
};

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
        struct rtable *ort = dst_rtable(dst_orig);
        struct rtable *rt;

        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0);
        if (rt) {
                struct dst_entry *new = &rt->dst;

                new->__use = 1;
                new->input = dst_discard;
                new->output = dst_discard_out;

                new->dev = net->loopback_dev;
                netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);

                rt->rt_is_input = ort->rt_is_input;
                rt->rt_iif = ort->rt_iif;
                rt->rt_pmtu = ort->rt_pmtu;
                rt->rt_mtu_locked = ort->rt_mtu_locked;

                rt->rt_genid = rt_genid_ipv4(net);
                rt->rt_flags = ort->rt_flags;
                rt->rt_type = ort->rt_type;
                rt->rt_uses_gateway = ort->rt_uses_gateway;
                rt->rt_gw_family = ort->rt_gw_family;
                if (rt->rt_gw_family == AF_INET)
                        rt->rt_gw4 = ort->rt_gw4;
                else if (rt->rt_gw_family == AF_INET6)
                        rt->rt_gw6 = ort->rt_gw6;
        }

        dst_release(dst_orig);

        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}

/*
 * Output route lookup followed by an xfrm lookup.
 *
 * @net:  network namespace
 * @flp4: flow key; flowi4_oif is overwritten with the resolved device's
 *        ifindex when the xfrm lookup is performed
 * @sk:   socket passed on to xfrm_lookup_route()
 *
 * The plain route from __ip_route_output_key() is returned as is when the
 * lookup failed or when the flow carries no protocol.  Otherwise the
 * result of xfrm_lookup_route() is returned, converted back to an rtable
 * pointer.
 */
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
                                    const struct sock *sk)
{
        struct rtable *rt = __ip_route_output_key(net, flp4);
        struct dst_entry *xdst;

        if (IS_ERR(rt) || !flp4->flowi4_proto)
                return rt;

        flp4->flowi4_oif = rt->dst.dev->ifindex;
        xdst = xfrm_lookup_route(net, &rt->dst, flowi4_to_flowi(flp4), sk, 0);

        return dst_rtable(xdst);
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);

/*
 * Append one RTM_NEWROUTE message describing @rt to @skb.
 *
 * @dst and @src are reported as RTA_DST / RTA_SRC (RTA_SRC only when @src
 * is non-zero) and @table_id as RTA_TABLE.  @fl4 may be NULL
 * (fnhe_dump_bucket() passes NULL); when present it supplies tos,
 * preferred source, mark, uid and the input interface.
 * @portid, @seq and @flags go into the netlink header.
 *
 * Returns 0 on success.  On any attribute failure the partially built
 * message is cancelled and -EMSGSIZE is returned.
 *
 * called with rcu_read_lock held
 */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
                        struct rtable *rt, u32 table_id, struct flowi4 *fl4,
                        struct sk_buff *skb, u32 portid, u32 seq,
                        unsigned int flags)
{
        struct rtmsg *r;
        struct nlmsghdr *nlh;
        unsigned long expires = 0;
        u32 error;
        u32 metrics[RTAX_MAX];

        nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
        if (!nlh)
                return -EMSGSIZE;

        r = nlmsg_data(nlh);
        r->rtm_family         = AF_INET;
        r->rtm_dst_len        = 32;
        r->rtm_src_len        = 0;
        r->rtm_tos        = fl4 ? fl4->flowi4_tos : 0;
        /* Table ids of 256 and above show up as RT_TABLE_COMPAT in the
         * header; the full id always travels in RTA_TABLE.
         */
        r->rtm_table        = table_id < 256 ? table_id : RT_TABLE_COMPAT;
        if (nla_put_u32(skb, RTA_TABLE, table_id))
                goto nla_put_failure;
        r->rtm_type        = rt->rt_type;
        r->rtm_scope        = RT_SCOPE_UNIVERSE;
        r->rtm_protocol = RTPROT_UNSPEC;
        /* Only the bits above the low 16 of rt_flags are exported; the
         * entry is always marked as cloned.
         */
        r->rtm_flags        = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
        if (rt->rt_flags & RTCF_NOTIFY)
                r->rtm_flags |= RTM_F_NOTIFY;
        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
                r->rtm_flags |= RTCF_DOREDIRECT;

        if (nla_put_in_addr(skb, RTA_DST, dst))
                goto nla_put_failure;
        if (src) {
                r->rtm_src_len = 32;
                if (nla_put_in_addr(skb, RTA_SRC, src))
                        goto nla_put_failure;
        }
        if (rt->dst.dev &&
            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
                goto nla_put_failure;
        if (rt->dst.lwtstate &&
            lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
                goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
        if (rt->dst.tclassid &&
            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
                goto nla_put_failure;
#endif
        /* For output routes, report the flow's source as the preferred
         * source when it differs from the requested one.
         */
        if (fl4 && !rt_is_input_route(rt) &&
            fl4->saddr != src) {
                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
                        goto nla_put_failure;
        }
        /* IPv4 gateways go out as RTA_GATEWAY, IPv6 ones as RTA_VIA. */
        if (rt->rt_uses_gateway) {
                if (rt->rt_gw_family == AF_INET &&
                    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
                        goto nla_put_failure;
                } else if (rt->rt_gw_family == AF_INET6) {
                        int alen = sizeof(struct in6_addr);
                        struct nlattr *nla;
                        struct rtvia *via;

                        /* NOTE(review): the extra 2 bytes presumably cover
                         * via->rtvia_family ahead of the address - confirm
                         * against struct rtvia.
                         */
                        nla = nla_reserve(skb, RTA_VIA, alen + 2);
                        if (!nla)
                                goto nla_put_failure;

                        via = nla_data(nla);
                        via->rtvia_family = AF_INET6;
                        memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
                }
        }

        /* Turn the absolute expiry into remaining jiffies; 0 means unset
         * or already in the past.
         */
        expires = rt->dst.expires;
        if (expires) {
                unsigned long now = jiffies;

                if (time_before(now, expires))
                        expires -= now;
                else
                        expires = 0;
        }

        /* Work on a copy of the metrics so the per-route PMTU state can be
         * overlaid; it is only reported while it has not expired.
         */
        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
        if (rt->rt_pmtu && expires)
                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
        if (rt->rt_mtu_locked && expires)
                metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
        if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;

        if (fl4) {
                if (fl4->flowi4_mark &&
                    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
                        goto nla_put_failure;

                if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
                    nla_put_u32(skb, RTA_UID,
                                from_kuid_munged(current_user_ns(),
                                                 fl4->flowi4_uid)))
                        goto nla_put_failure;

                if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
                        if (ipv4_is_multicast(dst) &&
                            !ipv4_is_local_multicast(dst) &&
                            IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) {
                                int err = ipmr_get_route(net, skb,
                                                         fl4->saddr, fl4->daddr,
                                                         r, portid);

                                /* A 0 return ends this function right away,
                                 * without nlmsg_end() or the cache info.
                                 * NOTE(review): presumably ipmr_get_route()
                                 * has taken over the reply then - confirm.
                                 */
                                if (err <= 0) {
                                        if (err == 0)
                                                return 0;
                                        goto nla_put_failure;
                                }
                        } else
#endif
                                if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
                                        goto nla_put_failure;
                }
        }

        error = rt->dst.error;

        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Dump the next-hop exceptions of one exception hash table.
 *
 * Walks all FNHE_HASH_SIZE chains of @bucket.  *@fa_index is advanced for
 * every exception visited, whether it is dumped or not.  An exception is
 * dumped through rt_fill_info() only if its index has reached @fa_start,
 * its genid equals @genid, it has not expired and it has a cached input
 * or output route.
 *
 * Returns 0, or the first error from rt_fill_info(); in that case
 * *@fa_index is left pointing at the exception that failed.
 */
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
                            struct netlink_callback *cb, u32 table_id,
                            struct fnhe_hash_bucket *bucket, int genid,
                            int *fa_index, int fa_start, unsigned int flags)
{
        int slot;

        for (slot = 0; slot < FNHE_HASH_SIZE; slot++) {
                struct fib_nh_exception *fnhe;

                for (fnhe = rcu_dereference(bucket[slot].chain); fnhe;
                     fnhe = rcu_dereference(fnhe->fnhe_next)) {
                        struct rtable *rt = NULL;
                        bool wanted;

                        wanted = *fa_index >= fa_start &&
                                 fnhe->fnhe_genid == genid &&
                                 !(fnhe->fnhe_expires &&
                                   time_after(jiffies, fnhe->fnhe_expires));

                        if (wanted) {
                                /* Prefer the input route, fall back to output. */
                                rt = rcu_dereference(fnhe->fnhe_rth_input);
                                if (!rt)
                                        rt = rcu_dereference(fnhe->fnhe_rth_output);
                        }

                        if (rt) {
                                int err;

                                err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
                                                   table_id, NULL, skb,
                                                   NETLINK_CB(cb->skb).portid,
                                                   cb->nlh->nlmsg_seq, flags);
                                if (err)
                                        return err;
                        }

                        (*fa_index)++;
                }
        }

        return 0;
}

/*
 * Dump the next-hop exceptions of every live path of @fi.
 *
 * Paths flagged RTNH_F_DEAD are skipped.  For each remaining path the
 * exception table, if any, is handed to fnhe_dump_bucket() under
 * rcu_read_lock().  @fa_index, @fa_start and @flags are passed through.
 *
 * Returns 0, or the first error reported by fnhe_dump_bucket().
 */
int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
                       u32 table_id, struct fib_info *fi,
                       int *fa_index, int fa_start, unsigned int flags)
{
        struct net *net = sock_net(cb->skb->sk);
        int genid = fnhe_genid(net);
        int path;

        for (path = 0; path < fib_info_num_path(fi); path++) {
                struct fib_nh_common *nhc = fib_info_nhc(fi, path);
                struct fnhe_hash_bucket *table;
                int err = 0;

                if (nhc->nhc_flags & RTNH_F_DEAD)
                        continue;

                rcu_read_lock();
                table = rcu_dereference(nhc->nhc_exceptions);
                if (table)
                        err = fnhe_dump_bucket(net, skb, cb, table_id, table,
                                               genid, fa_index, fa_start,
                                               flags);
                rcu_read_unlock();

                if (err)
                        return err;
        }

        return 0;
}

/*
 * Build the dummy packet used by a route-get request.
 *
 * Allocates an NLMSG_GOODSIZE skb and writes a minimal IPv4 header
 * (version, ihl, frag_off, protocol, saddr, daddr) followed, for UDP, TCP
 * and ICMP, by a matching transport header.  Other protocols get the IP
 * header only.
 *
 * Returns the skb, or NULL if allocation failed.
 */
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
                                                   u8 ip_proto, __be16 sport,
                                                   __be16 dport)
{
        struct sk_buff *skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        struct iphdr *iph;

        if (!skb)
                return NULL;

        /* Lay out dummy headers: this skb may travel through a good part
         * of the routing engine.
         */
        skb_reset_mac_header(skb);
        skb_reset_network_header(skb);
        skb->protocol = htons(ETH_P_IP);

        iph = skb_put(skb, sizeof(struct iphdr));
        iph->version = 0x4;
        iph->ihl = 0x5;
        iph->frag_off = 0;
        iph->protocol = ip_proto;
        iph->saddr = src;
        iph->daddr = dst;
        skb_set_transport_header(skb, skb->len);

        if (ip_proto == IPPROTO_UDP) {
                struct udphdr *udph = skb_put_zero(skb, sizeof(struct udphdr));

                udph->source = sport;
                udph->dest = dport;
                udph->len = htons(sizeof(struct udphdr));
                udph->check = 0;
        } else if (ip_proto == IPPROTO_TCP) {
                struct tcphdr *tcph = skb_put_zero(skb, sizeof(struct tcphdr));

                tcph->source = sport;
                tcph->dest = dport;
                tcph->doff = sizeof(struct tcphdr) / 4;
                tcph->rst = 1;
                tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
                                            src, dst, 0);
        } else if (ip_proto == IPPROTO_ICMP) {
                struct icmphdr *icmph = skb_put_zero(skb, sizeof(struct icmphdr));

                icmph->type = ICMP_ECHO;
                icmph->code = 0;
        }

        return skb;
}

static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
                                       const struct nlmsghdr *nlh,
                                       struct nlattr **tb,
                                       struct netlink_ext_ack *extack)
{
        struct rtmsg *rtm;
        int i, err;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
                NL_SET_ERR_MSG(extack,
                               "ipv4: Invalid header for route get request");
                return -EINVAL;
        }

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
                                              rtm_ipv4_policy, extack);

        rtm = nlmsg_data(nlh);
        if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
            (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
            rtm->rtm_table || rtm->rtm_protocol ||
            rtm->rtm_scope || rtm->rtm_type) {
                NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
                return -EINVAL;
        }

        if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
                               RTM_F_LOOKUP_TABLE |
                               RTM_F_FIB_MATCH)) {
                NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
                return -EINVAL;
        }

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
                                            rtm_ipv4_policy, extack);
        if (err)
                return err;

        if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
            (tb[RTA_DST] && !rtm->rtm_dst_len)) {
                NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
                return -EINVAL;
        }

        for (i = 0; i <= RTA_MAX; i++) {
                if (!tb[i])
                        continue;

                switch (i) {
                case RTA_IIF:
                case RTA_OIF:
                case RTA_SRC:
                case RTA_DST:
                case RTA_IP_PROTO:
                case RTA_SPORT:
                case RTA_DPORT:
                case RTA_MARK:
                case RTA_UID:
                        break;
                default:
                        NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * RTM_GETROUTE handler: resolve the route for the flow described by the
 * request and unicast the answer back to the requester.
 *
 * With RTA_IIF the lookup is done as an input route on that device via
 * ip_route_input_rcu(); otherwise as an output route via
 * ip_route_output_key_hash_rcu().  The dummy packet built by
 * inet_rtm_getroute_build_skb() is used for the lookup; the same skb is
 * then trimmed to zero length and reused for the netlink reply.
 *
 * With RTM_F_FIB_MATCH the reply is built by fib_dump_info() from the
 * lookup result, otherwise by rt_fill_info() from the resolved route.
 *
 * Returns the result of rtnl_unicast(), or a negative error code if
 * validation, allocation, lookup or message construction failed.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[RTA_MAX+1];
        u32 table_id = RT_TABLE_MAIN;
        __be16 sport = 0, dport = 0;
        struct fib_result res = {};
        u8 ip_proto = IPPROTO_UDP;
        struct rtable *rt = NULL;
        struct sk_buff *skb;
        struct rtmsg *rtm;
        struct flowi4 fl4 = {};
        __be32 dst = 0;
        __be32 src = 0;
        kuid_t uid;
        u32 iif;
        int err;
        int mark;

        err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
        if (err < 0)
                return err;

        /* Pull the lookup parameters out of the attributes; absent ones
         * default to 0.
         */
        rtm = nlmsg_data(nlh);
        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
        /* Without RTA_UID: the caller's uid for output lookups, no uid
         * for input lookups.
         */
        if (tb[RTA_UID])
                uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
        else
                uid = (iif ? INVALID_UID : current_uid());

        if (tb[RTA_IP_PROTO]) {
                err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
                                                  &ip_proto, AF_INET, extack);
                if (err)
                        return err;
        }

        if (tb[RTA_SPORT])
                sport = nla_get_be16(tb[RTA_SPORT]);

        if (tb[RTA_DPORT])
                dport = nla_get_be16(tb[RTA_DPORT]);

        skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
        if (!skb)
                return -ENOBUFS;

        fl4.daddr = dst;
        fl4.saddr = src;
        fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
        fl4.flowi4_mark = mark;
        fl4.flowi4_uid = uid;
        if (sport)
                fl4.fl4_sport = sport;
        if (dport)
                fl4.fl4_dport = dport;
        fl4.flowi4_proto = ip_proto;

        /* From here on every exit must go through errout_rcu or the
         * explicit rcu_read_unlock() before rtnl_unicast().
         */
        rcu_read_lock();

        if (iif) {
                /* Input lookup: pretend the dummy packet arrived on iif. */
                struct net_device *dev;

                dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
                        err = -ENODEV;
                        goto errout_rcu;
                }

                fl4.flowi4_iif = iif; /* for rt_fill_info */
                skb->dev        = dev;
                skb->mark        = mark;
                err = ip_route_input_rcu(skb, dst, src,
                                         rtm->rtm_tos & IPTOS_RT_MASK, dev,
                                         &res);

                /* The lookup can succeed yet yield a route that carries an
                 * error; report that error.  rt is only dereferenced when
                 * err is 0.
                 */
                rt = skb_rtable(skb);
                if (err == 0 && rt->dst.error)
                        err = -rt->dst.error;
        } else {
                /* Output lookup: locally generated flow. */
                fl4.flowi4_iif = LOOPBACK_IFINDEX;
                skb->dev = net->loopback_dev;
                rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
                err = 0;
                if (IS_ERR(rt))
                        err = PTR_ERR(rt);
                else
                        skb_dst_set(skb, &rt->dst);
        }

        if (err)
                goto errout_rcu;

        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

        /* RTM_F_LOOKUP_TABLE: report the table recorded in the lookup
         * result (0 if none) instead of the RT_TABLE_MAIN default.
         */
        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
                table_id = res.table ? res.table->tb_id : 0;

        /* reset skb for netlink reply msg */
        skb_trim(skb, 0);
        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb_reset_mac_header(skb);

        if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
                struct fib_rt_info fri;

                /* No fib_info in the result: fail with the error associated
                 * with the result type, or -EHOSTUNREACH if that is 0.
                 */
                if (!res.fi) {
                        err = fib_props[res.type].error;
                        if (!err)
                                err = -EHOSTUNREACH;
                        goto errout_rcu;
                }
                fri.fi = res.fi;
                fri.tb_id = table_id;
                fri.dst = res.prefix;
                fri.dst_len = res.prefixlen;
                fri.dscp = inet_dsfield_to_dscp(fl4.flowi4_tos);
                fri.type = rt->rt_type;
                fri.offload = 0;
                fri.trap = 0;
                fri.offload_failed = 0;
                /* Copy the offload state from the first alias that matches
                 * the entry being reported.
                 */
                if (res.fa_head) {
                        struct fib_alias *fa;

                        hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
                                u8 slen = 32 - fri.dst_len;

                                if (fa->fa_slen == slen &&
                                    fa->tb_id == fri.tb_id &&
                                    fa->fa_dscp == fri.dscp &&
                                    fa->fa_info == res.fi &&
                                    fa->fa_type == fri.type) {
                                        fri.offload = READ_ONCE(fa->offload);
                                        fri.trap = READ_ONCE(fa->trap);
                                        fri.offload_failed =
                                                READ_ONCE(fa->offload_failed);
                                        break;
                                }
                        }
                }
                err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
        } else {
                err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
                                   NETLINK_CB(in_skb).portid,
                                   nlh->nlmsg_seq, 0);
        }
        if (err < 0)
                goto errout_rcu;

        rcu_read_unlock();

        /* skb is not freed here after this call.
         * NOTE(review): presumably rtnl_unicast() consumes it - confirm.
         */
        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
        return err;
errout_rcu:
        /* Error after rcu_read_lock(): drop the lock and the reply skb. */
        rcu_read_unlock();
        kfree_skb(skb);
        goto errout_free;
}

/*
 * Multicast event hook for @in_dev: flushes the route cache of the
 * network namespace its device belongs to.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
        struct net *net = dev_net(in_dev->dev);

        rt_cache_flush(net);
}

#ifdef CONFIG_SYSCTL
/*
 * Backing variables for sysctl entries defined further down in this file.
 */
/* "gc_interval", handled by proc_dointvec_jiffies */
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
/* shared by "gc_min_interval" and "gc_min_interval_ms" */
static int ip_rt_gc_min_interval __read_mostly        = HZ / 2;
/* "gc_elasticity", plain integer */
static int ip_rt_gc_elasticity __read_mostly        = 8;
/* passed as extra1 to proc_dointvec_minmax for the "min_pmtu" entry */
static int ip_min_valid_pmtu __read_mostly        = IPV4_MIN_MTU;

/*
 * Handler for the "flush" sysctl entry.
 *
 * Any write flushes the route cache and bumps the next-hop exception
 * generation of the namespace stored in @__ctl->extra1; the written value
 * itself is not looked at.  Reads are rejected with -EINVAL.
 */
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        struct net *net;

        if (!write)
                return -EINVAL;

        net = (struct net *)__ctl->extra1;
        rt_cache_flush(net);
        fnhe_genid_bump(net);

        return 0;
}

/*
 * Route sysctl entries whose .data points at file-scope or global
 * variables, in contrast to ipv4_route_netns_table whose .data pointers
 * reference fields of init_net.ipv4.  All entries are integers with
 * mode 0644.
 */
static struct ctl_table ipv4_route_table[] = {
        {
                .procname        = "gc_thresh",
                .data                = &ipv4_dst_ops.gc_thresh,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "max_size",
                .data                = &ip_rt_max_size,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                /*  Deprecated. Use gc_min_interval_ms */

                .procname        = "gc_min_interval",
                .data                = &ip_rt_gc_min_interval,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_jiffies,
        },
        {
                /* Same variable as gc_min_interval, via the ms handler. */
                .procname        = "gc_min_interval_ms",
                .data                = &ip_rt_gc_min_interval,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_ms_jiffies,
        },
        {
                .procname        = "gc_timeout",
                .data                = &ip_rt_gc_timeout,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_jiffies,
        },
        {
                .procname        = "gc_interval",
                .data                = &ip_rt_gc_interval,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_jiffies,
        },
        {
                .procname        = "redirect_load",
                .data                = &ip_rt_redirect_load,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "redirect_number",
                .data                = &ip_rt_redirect_number,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "redirect_silence",
                .data                = &ip_rt_redirect_silence,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "error_cost",
                .data                = &ip_rt_error_cost,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "error_burst",
                .data                = &ip_rt_error_burst,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "gc_elasticity",
                .data                = &ip_rt_gc_elasticity,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
};

/*
 * Name of the "flush" sysctl entry.  It is a named array rather than an
 * inline literal so that sysctl_route_net_init() can compare
 * tbl[0].procname against it by pointer identity.
 */
static const char ipv4_route_flush_procname[] = "flush";

/*
 * Per-network-namespace sysctls registered under "net/ipv4/route" by
 * sysctl_route_net_init().
 *
 * Entry order matters: entry 0 must remain the "flush" entry.  Its extra1
 * is set to the owning namespace at registration time, and it is the only
 * entry whose .data is not rebased.  Every other entry's .data points into
 * init_net here and is shifted onto the target namespace for copies of
 * this table.
 */
static struct ctl_table ipv4_route_netns_table[] = {
        {
                /* write-only (0200) trigger, no backing .data */
                .procname        = ipv4_route_flush_procname,
                .maxlen                = sizeof(int),
                .mode                = 0200,
                .proc_handler        = ipv4_sysctl_rtcache_flush,
        },
        {
                /* lower bound enforced through extra1 */
                .procname       = "min_pmtu",
                .data           = &init_net.ipv4.ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &ip_min_valid_pmtu,
        },
        {
                /* handled by the jiffies-converting int handler */
                .procname       = "mtu_expires",
                .data           = &init_net.ipv4.ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname   = "min_adv_mss",
                .data       = &init_net.ipv4.ip_rt_min_advmss,
                .maxlen     = sizeof(int),
                .mode       = 0644,
                .proc_handler   = proc_dointvec,
        },
};

/*
 * Register the per-namespace "net/ipv4/route" sysctls for @net.
 *
 * init_net uses ipv4_route_netns_table directly.  Any other namespace gets
 * a private copy of the table whose .data pointers are rebased from
 * init_net onto @net.
 *
 * Returns 0 on success, -ENOMEM if the copy or the registration fails.
 */
static __net_init int sysctl_route_net_init(struct net *net)
{
        size_t table_size = ARRAY_SIZE(ipv4_route_netns_table);
        struct ctl_table *tbl = ipv4_route_netns_table;

        if (!net_eq(net, &init_net)) {
                int idx;

                tbl = kmemdup(ipv4_route_netns_table,
                              sizeof(ipv4_route_netns_table), GFP_KERNEL);
                if (!tbl)
                        return -ENOMEM;

                /* Don't export non-whitelisted sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns &&
                    tbl[0].procname != ipv4_route_flush_procname)
                        table_size = 0;

                /*
                 * Rebase every entry except the first ("flush") so that it
                 * refers to the field inside this struct net instead of
                 * the one inside init_net.
                 */
                for (idx = 1; idx < table_size; idx++)
                        tbl[idx].data += (void *)net - (void *)&init_net;
        }

        /* Record the owning namespace in the flush entry. */
        tbl[0].extra1 = net;

        net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route",
                                                     tbl, table_size);
        if (net->ipv4.route_hdr)
                return 0;

        /* Registration failed: release the copy, never the static table. */
        if (tbl != ipv4_route_netns_table)
                kfree(tbl);
        return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
        const struct ctl_table *tbl;

        tbl = net->ipv4.route_hdr->ctl_table_arg;
        unregister_net_sysctl_table(net->ipv4.route_hdr);
        BUG_ON(tbl == ipv4_route_netns_table);
        kfree(tbl);
}

/* Per-namespace setup/teardown of the "net/ipv4/route" sysctl table. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
#endif

static __net_init int netns_ip_rt_init(struct net *net)
{
        /* Set default value for namespaceified sysctls */
        net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
        net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
        net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
        return 0;
}

/* Per-namespace defaults for the route sysctls; nothing to undo on exit. */
static struct pernet_operations __net_initdata ip_rt_ops = {
        .init = netns_ip_rt_init,
};

static __net_init int rt_genid_init(struct net *net)
{
        atomic_set(&net->ipv4.rt_genid, 0);
        atomic_set(&net->fnhe_genid, 0);
        atomic_set(&net->ipv4.dev_addr_genid, get_random_u32());
        return 0;
}

/* Per-namespace generation-counter setup; nothing to undo on exit. */
static __net_initdata struct pernet_operations rt_genid_ops = {
        .init = rt_genid_init,
};

/*
 * Allocate and initialise the inet peer base of @net.
 * Returns 0 on success, -ENOMEM if the allocation fails.
 */
static int __net_init ipv4_inetpeer_init(struct net *net)
{
        struct inet_peer_base *base;

        base = kmalloc(sizeof(*base), GFP_KERNEL);
        if (!base)
                return -ENOMEM;

        inet_peer_base_init(base);
        net->ipv4.peers = base;

        return 0;
}

/*
 * Tear down the inet peer base of @net: detach it from the namespace,
 * invalidate its tree and free it.
 */
static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
        struct inet_peer_base *base = net->ipv4.peers;

        /* Clear the namespace's pointer before the base goes away. */
        net->ipv4.peers = NULL;

        inetpeer_invalidate_tree(base);
        kfree(base);
}

/* Per-namespace lifetime of the inet peer base (net->ipv4.peers). */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
        .init        =        ipv4_inetpeer_init,
        .exit        =        ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU accounting area; ip_rt_init() allocates it as 256 entries of
 * struct ip_rt_acct per CPU and panics if the allocation fails.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

/*
 * Boot-time initialisation of the IPv4 routing layer.
 *
 * Sets up the IP ident/timestamp tables, the per-CPU uncached route
 * lists, the dst caches and counters, then brings up devinet, the FIB,
 * the proc files, (optionally) XFRM, the RTM_GETROUTE handler and the
 * per-namespace subsystems defined in this file.
 *
 * Allocation failures in the mandatory pieces panic; failure to create
 * the proc files is only logged.  Always returns 0.
 */
int __init ip_rt_init(void)
{
        void *idents_hash;
        int cpu;

        /*
         * One allocation backs both ip_idents and ip_tstamps: each "bucket"
         * is sized as one element of each array.
         */
        /* For modern hosts, this will use 2 MB of memory */
        idents_hash = alloc_large_system_hash("IP idents",
                                              sizeof(*ip_idents) + sizeof(*ip_tstamps),
                                              0,
                                              16, /* one bucket per 64 KB */
                                              HASH_ZERO,
                                              NULL,
                                              &ip_idents_mask,
                                              2048,
                                              256*1024);

        /* The idents array occupies the start of the region ... */
        ip_idents = idents_hash;

        /* ... and starts out with random contents. */
        get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));

        /* The timestamps array follows directly after the idents array. */
        ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);

        /* Each possible CPU gets an empty uncached-route list and its lock. */
        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }
#ifdef CONFIG_IP_ROUTE_CLASSID
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        /* SLAB_PANIC: the cache creation itself panics on failure. */
        ipv4_dst_ops.kmem_cachep = KMEM_CACHE(rtable,
                                              SLAB_HWCACHE_ALIGN | SLAB_PANIC);

        /* Blackhole dsts share the rtable cache. */
        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");

        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

        /* Both limits are set to their maximum values. */
        ipv4_dst_ops.gc_thresh = ~0;
        ip_rt_max_size = INT_MAX;

        devinet_init();
        ip_fib_init();

        /* Missing proc files are not fatal. */
        if (ip_rt_proc_init())
                pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
                      RTNL_FLAG_DOIT_UNLOCKED);

        /* NOTE(review): the register_pernet_subsys() results are not checked. */
#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&ip_rt_ops);
        register_pernet_subsys(&rt_genid_ops);
        register_pernet_subsys(&ipv4_inetpeer_ops);
        return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
        /*
         * Register ipv4_route_table under "net/ipv4/route" for init_net.
         * NOTE(review): the registration result is not checked.
         */
        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif

















   13 








   14 




   13 








    5 




    5 




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
// SPDX-License-Identifier: GPL-2.0
#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/instrumented.h>
#include <linux/kernel.h>
#include <linux/nospec.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/wordpart.h>

/* out-of-line parts */

#ifndef INLINE_COPY_FROM_USER
/*
 * Out-of-line copy from user space: copy @n bytes from the user pointer
 * @from into the kernel buffer @to.
 *
 * Returns the number of bytes that were NOT copied (0 on full success).
 * Whatever part of @to was not filled from user space is zeroed, including
 * the whole buffer when fault injection or access_ok() rejects the copy.
 */
unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n)
{
        unsigned long uncopied = n;

        might_fault();
        if (should_fail_usercopy())
                goto zero_tail;
        if (unlikely(!access_ok(from, n)))
                goto zero_tail;

        /*
         * Ensure that bad access_ok() speculation will not lead to nasty
         * side effects *after* the copy is finished.
         */
        barrier_nospec();
        instrument_copy_from_user_before(to, from, n);
        uncopied = raw_copy_from_user(to, from, n);
        instrument_copy_from_user_after(to, from, n, uncopied);

zero_tail:
        /* Zero the trailing bytes that were not copied. */
        if (unlikely(uncopied))
                memset(to + (n - uncopied), 0, uncopied);
        return uncopied;
}
EXPORT_SYMBOL(_copy_from_user);
#endif

#ifndef INLINE_COPY_TO_USER
/*
 * Out-of-line copy to user space: copy @n bytes from the kernel buffer
 * @from to the user pointer @to.
 *
 * Returns the number of bytes that were NOT copied (0 on full success);
 * @n is returned unchanged when fault injection or access_ok() rejects
 * the copy.
 */
unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n)
{
        might_fault();

        if (!should_fail_usercopy() && likely(access_ok(to, n))) {
                instrument_copy_to_user(to, from, n);
                return raw_copy_to_user(to, from, n);
        }

        return n;
}
EXPORT_SYMBOL(_copy_to_user);
#endif

/**
 * check_zeroed_user: check if a userspace buffer only contains zero bytes
 * @from: Source address, in userspace.
 * @size: Size of buffer.
 *
 * This is effectively shorthand for "memchr_inv(from, 0, size) == NULL" for
 * userspace addresses (and is more efficient because we don't care where the
 * first non-zero byte is).
 *
 * Returns:
 *  * 0: There were non-zero bytes present in the buffer.
 *  * 1: The buffer was full of zero bytes.
 *  * -EFAULT: access to userspace failed.
 */
int check_zeroed_user(const void __user *from, size_t size)
{
        unsigned long val;
        /* Byte offset of @from inside its containing unsigned long. */
        uintptr_t align = (uintptr_t) from % sizeof(unsigned long);

        /* An empty buffer is trivially all zeroes. */
        if (unlikely(size == 0))
                return 1;

        /*
         * Round @from down to a word boundary and grow @size by the same
         * amount, so that the buffer can be read one whole word at a time.
         */
        from -= align;
        size += align;

        if (!user_read_access_begin(from, size))
                return -EFAULT;

        unsafe_get_user(val, (unsigned long __user *) from, err_fault);
        /*
         * The first word may start before the caller's buffer; drop those
         * leading @align bytes (presumably aligned_byte_mask(n) selects the
         * first n bytes of a word in memory order -- confirm in wordpart.h).
         */
        if (align)
                val &= ~aligned_byte_mask(align);

        /* Walk whole words while more than one word's worth remains. */
        while (size > sizeof(unsigned long)) {
                /* A non-zero word settles the answer; stop reading. */
                if (unlikely(val))
                        goto done;

                from += sizeof(unsigned long);
                size -= sizeof(unsigned long);

                unsafe_get_user(val, (unsigned long __user *) from, err_fault);
        }

        /* The last word may extend past the buffer; keep only @size bytes. */
        if (size < sizeof(unsigned long))
                val &= aligned_byte_mask(size);

done:
        user_read_access_end();
        /* 1 if everything examined was zero, 0 otherwise. */
        return (val == 0);
err_fault:
        /* Reached from unsafe_get_user() when the user access faults. */
        user_read_access_end();
        return -EFAULT;
}
EXPORT_SYMBOL(check_zeroed_user);










































    7 































































































    1 


    1 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * net busy poll support
 * Copyright(c) 2013 Intel Corporation.
 *
 * Author: Eliezer Tamir
 *
 * Contact Information:
 * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
 */

#ifndef _LINUX_NET_BUSY_POLL_H
#define _LINUX_NET_BUSY_POLL_H

#include <linux/netdevice.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>
#include <net/ip.h>
#include <net/xdp.h>

/*                0 - Reserved to indicate value not set
 *     1..NR_CPUS - Reserved for sender_cpu
 *  NR_CPUS+1..~0 - Region available for NAPI IDs
 */
#define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1))

/* Budget sk_busy_loop() passes on when sk->sk_busy_poll_budget is zero. */
#define BUSY_POLL_BUDGET 8

#ifdef CONFIG_NET_RX_BUSY_POLL

struct napi_struct;
/*
 * Global busy-poll tunables.  sysctl_net_busy_poll is read (with
 * READ_ONCE()) by net_busy_loop_on() and busy_loop_timeout();
 * sysctl_net_busy_read is not used in this header.
 */
extern unsigned int sysctl_net_busy_read __read_mostly;
extern unsigned int sysctl_net_busy_poll __read_mostly;

/* True when the global sysctl_net_busy_poll value is non-zero. */
static inline bool net_busy_loop_on(void)
{
        return READ_ONCE(sysctl_net_busy_poll) != 0;
}

static inline bool sk_can_busy_loop(const struct sock *sk)
{
        return READ_ONCE(sk->sk_ll_usec) && !signal_pending(current);
}

/*
 * Out-of-line busy-poll entry points (defined outside this header).
 *
 * sk_busy_loop() below calls napi_busy_loop() with sk_busy_loop_end as
 * @loop_end and the socket as @loop_end_arg (or with a NULL @loop_end for
 * non-blocking callers).
 */
bool sk_busy_loop_end(void *p, unsigned long start_time);

void napi_busy_loop(unsigned int napi_id,
                    bool (*loop_end)(void *, unsigned long),
                    void *loop_end_arg, bool prefer_busy_poll, u16 budget);

void napi_busy_loop_rcu(unsigned int napi_id,
                        bool (*loop_end)(void *, unsigned long),
                        void *loop_end_arg, bool prefer_busy_poll, u16 budget);

#else /* CONFIG_NET_RX_BUSY_POLL */
/*
 * Stub for kernels built without CONFIG_NET_RX_BUSY_POLL: busy polling is
 * never enabled.  Returns bool, matching the CONFIG_NET_RX_BUSY_POLL
 * variant.
 */
static inline bool net_busy_loop_on(void)
{
        return false;
}

/*
 * Stub for kernels built without CONFIG_NET_RX_BUSY_POLL: no socket can
 * busy loop.  Takes a const pointer, matching the CONFIG_NET_RX_BUSY_POLL
 * variant, so callers holding a const socket build in both configurations.
 */
static inline bool sk_can_busy_loop(const struct sock *sk)
{
        return false;
}

#endif /* CONFIG_NET_RX_BUSY_POLL */

/*
 * Current time on the busy-poll time scale: local_clock() shifted right by
 * 10 bits (presumably nanoseconds scaled to roughly microseconds --
 * confirm).  Always 0 without CONFIG_NET_RX_BUSY_POLL.
 */
static inline unsigned long busy_loop_current_time(void)
{
        unsigned long now = 0;

#ifdef CONFIG_NET_RX_BUSY_POLL
        now = (unsigned long)(local_clock() >> 10);
#endif
        return now;
}

/*
 * In poll/select the global sysctl_net_busy_poll value is the busy-poll
 * budget.  Returns true once the current busy-poll time is after
 * @start_time plus that budget; also true when the budget is zero or
 * CONFIG_NET_RX_BUSY_POLL is off.
 */
static inline bool busy_loop_timeout(unsigned long start_time)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned long budget_usec = READ_ONCE(sysctl_net_busy_poll);

        if (budget_usec) {
                unsigned long deadline = start_time + budget_usec;

                return time_after(busy_loop_current_time(), deadline);
        }
#endif
        return true;
}

/*
 * Per-socket variant of busy_loop_timeout(): the budget is sk->sk_ll_usec.
 * Returns true once the current busy-poll time is after @start_time plus
 * that budget; also true when the budget is zero or
 * CONFIG_NET_RX_BUSY_POLL is off.
 */
static inline bool sk_busy_loop_timeout(struct sock *sk,
                                        unsigned long start_time)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned long budget_usec = READ_ONCE(sk->sk_ll_usec);

        if (budget_usec) {
                unsigned long deadline = start_time + budget_usec;

                return time_after(busy_loop_current_time(), deadline);
        }
#endif
        return true;
}

/*
 * Busy poll on the NAPI instance recorded in @sk, if there is one.
 *
 * With @nonblock set, napi_busy_loop() is called with a NULL loop_end
 * callback; otherwise sk_busy_loop_end is used.  The budget falls back to
 * BUSY_POLL_BUDGET when sk->sk_busy_poll_budget is zero.  No-op without
 * CONFIG_NET_RX_BUSY_POLL.
 */
static inline void sk_busy_loop(struct sock *sk, int nonblock)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned int napi_id = READ_ONCE(sk->sk_napi_id);

        /* Values below MIN_NAPI_ID are not NAPI IDs: nothing to poll. */
        if (napi_id < MIN_NAPI_ID)
                return;

        napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk,
                       READ_ONCE(sk->sk_prefer_busy_poll),
                       READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET);
#endif
}

/*
 * Used in the NIC receive handler to mark @skb with the ID of @napi.
 * No-op without CONFIG_NET_RX_BUSY_POLL.
 */
static inline void skb_mark_napi_id(struct sk_buff *skb,
                                    struct napi_struct *napi)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        /* An skb that already carries a valid NAPI ID keeps it. */
        if (skb->napi_id >= MIN_NAPI_ID)
                return;

        skb->napi_id = napi->napi_id;
#endif
}

/*
 * Used in the protocol handler to propagate the napi_id of @skb to @sk
 * (only with CONFIG_NET_RX_BUSY_POLL) and to update the socket's rx queue
 * information.
 */
static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        const unsigned int skb_napi_id = skb->napi_id;

        /* Skip the store when the socket already carries this ID. */
        if (unlikely(READ_ONCE(sk->sk_napi_id) != skb_napi_id))
                WRITE_ONCE(sk->sk_napi_id, skb_napi_id);
#endif
        sk_rx_queue_update(sk, skb);
}

/* Variant of sk_mark_napi_id() for passive flow setup,
 * as sk->sk_napi_id and sk->sk_rx_queue_mapping content
 * needs to be set.
 *
 * Unlike sk_mark_napi_id(), the napi_id is stored without first comparing
 * it against the socket's current value.
 */
static inline void sk_mark_napi_id_set(struct sock *sk,
                                       const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        /* Unconditional store of the skb's NAPI ID. */
        WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
#endif
        sk_rx_queue_set(sk, skb);
}

/*
 * Record @napi_id in @sk only if the socket has no NAPI ID yet (its
 * sk_napi_id reads as zero).  No-op without CONFIG_NET_RX_BUSY_POLL.
 */
static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        /* First writer wins: an already-set ID is left alone. */
        if (READ_ONCE(sk->sk_napi_id))
                return;

        WRITE_ONCE(sk->sk_napi_id, napi_id);
#endif
}

/*
 * Variant used for unconnected sockets: the socket takes the NAPI ID of
 * @skb only if it does not have one yet (see __sk_mark_napi_id_once()).
 * No-op without CONFIG_NET_RX_BUSY_POLL.
 */
static inline void sk_mark_napi_id_once(struct sock *sk,
                                        const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        __sk_mark_napi_id_once(sk, skb->napi_id);
#endif
}

/*
 * XDP counterpart of sk_mark_napi_id_once(): the NAPI ID comes from the
 * receive queue of @xdp (xdp->rxq->napi_id).  No-op without
 * CONFIG_NET_RX_BUSY_POLL.
 */
static inline void sk_mark_napi_id_once_xdp(struct sock *sk,
                                            const struct xdp_buff *xdp)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        __sk_mark_napi_id_once(sk, xdp->rxq->napi_id);
#endif
}

#endif /* _LINUX_NET_BUSY_POLL_H */









































































    2 


    2 

































    2 


    2 
























    1 


    1 























    1 


    1 






















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2022 Christian Brauner <brauner@kernel.org> */

#include <linux/cred.h>
#include <linux/fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>

#include "internal.h"

/*
 * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t,
 * never from raw values. These are just internal helpers: they wrap a raw
 * id value (here, the result of map_id_down()) directly in a vfs{g,u}id_t.
 */
#define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val }
#define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val }

/*
 * Idmapping attached to a mount.  Instances are created by
 * alloc_mnt_idmap() and released through mnt_idmap_put().
 */
struct mnt_idmap {
        /* uid mapping, copied from a user namespace by alloc_mnt_idmap() */
        struct uid_gid_map uid_map;
        /* gid mapping, copied from a user namespace by alloc_mnt_idmap() */
        struct uid_gid_map gid_map;
        /* reference count; the object is freed when it drops to zero */
        refcount_t count;
};

/*
 * Carries the initial idmapping of 0:0:4294967295 which is an identity
 * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is
 * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...].
 */
struct mnt_idmap nop_mnt_idmap = {
        /*
         * mnt_idmap_get() and mnt_idmap_put() skip this object, so its
         * count stays at the initial value.
         */
        .count        = REFCOUNT_INIT(1),
};
EXPORT_SYMBOL_GPL(nop_mnt_idmap);

/**
 * initial_idmapping - check whether this is the initial mapping
 * @ns: idmapping to check
 *
 * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1,
 * [...], 1000 to 1000 [...].
 *
 * Return: true if this is the initial mapping, false if not.
 */
static inline bool initial_idmapping(const struct user_namespace *ns)
{
        /* Pointer identity: only init_user_ns itself qualifies. */
        return ns == &init_user_ns;
}

/**
 * make_vfsuid - map a filesystem kuid according to an idmapping
 * @idmap: the mount's idmapping
 * @fs_userns: the filesystem's idmapping
 * @kuid : kuid to be mapped
 *
 * Take a @kuid and remap it from @fs_userns into @idmap. Use this
 * function when preparing a @kuid to be reported to userspace.
 *
 * If @idmap is &nop_mnt_idmap this is not an idmapped mount and @kuid is
 * returned unchanged.
 * If initial_idmapping() tells us that the filesystem is not mounted with an
 * idmapping we know the value of @kuid won't change when calling
 * from_kuid() so we can simply retrieve the value via __kuid_val()
 * directly.
 *
 * Return: @kuid mapped according to @idmap.
 * If @kuid has no mapping in either @idmap or @fs_userns an invalid vfsuid
 * (INVALID_VFSUID) is returned.
 */
vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
                     struct user_namespace *fs_userns,
                     kuid_t kuid)
{
        uid_t fs_uid;

        if (idmap == &nop_mnt_idmap)
                return VFSUIDT_INIT(kuid);

        fs_uid = initial_idmapping(fs_userns) ? __kuid_val(kuid) :
                                                from_kuid(fs_userns, kuid);
        if (fs_uid == (uid_t)-1)
                return INVALID_VFSUID;

        return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, fs_uid));
}
EXPORT_SYMBOL_GPL(make_vfsuid);

/**
 * make_vfsgid - map a filesystem kgid according to an idmapping
 * @idmap: the mount's idmapping
 * @fs_userns: the filesystem's idmapping
 * @kgid : kgid to be mapped
 *
 * Take a @kgid and remap it from @fs_userns into @idmap. Use this
 * function when preparing a @kgid to be reported to userspace.
 *
 * If @idmap is &nop_mnt_idmap this is not an idmapped mount and @kgid is
 * returned unchanged.
 * If initial_idmapping() tells us that the filesystem is not mounted with an
 * idmapping we know the value of @kgid won't change when calling
 * from_kgid() so we can simply retrieve the value via __kgid_val()
 * directly.
 *
 * Return: @kgid mapped according to @idmap.
 * If @kgid has no mapping in either @idmap or @fs_userns an invalid vfsgid
 * (INVALID_VFSGID) is returned.
 */
vfsgid_t make_vfsgid(struct mnt_idmap *idmap,
                     struct user_namespace *fs_userns, kgid_t kgid)
{
        gid_t fs_gid;

        if (idmap == &nop_mnt_idmap)
                return VFSGIDT_INIT(kgid);

        fs_gid = initial_idmapping(fs_userns) ? __kgid_val(kgid) :
                                                from_kgid(fs_userns, kgid);
        if (fs_gid == (gid_t)-1)
                return INVALID_VFSGID;

        return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, fs_gid));
}
EXPORT_SYMBOL_GPL(make_vfsgid);

/**
 * from_vfsuid - map a vfsuid into the filesystem idmapping
 * @idmap: the mount's idmapping
 * @fs_userns: the filesystem's idmapping
 * @vfsuid : vfsuid to be mapped
 *
 * Map @vfsuid into the filesystem idmapping. This function has to be used in
 * order to e.g. write @vfsuid to inode->i_uid.
 *
 * Return: @vfsuid mapped into the filesystem idmapping, or INVALID_UID if
 * @vfsuid has no mapping in @idmap.
 */
kuid_t from_vfsuid(struct mnt_idmap *idmap,
                   struct user_namespace *fs_userns, vfsuid_t vfsuid)
{
        uid_t mapped_uid;

        /* Not an idmapped mount: the value carries over unchanged. */
        if (idmap == &nop_mnt_idmap)
                return AS_KUIDT(vfsuid);

        mapped_uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid));
        if (mapped_uid == (uid_t)-1)
                return INVALID_UID;

        return initial_idmapping(fs_userns) ? KUIDT_INIT(mapped_uid) :
                                              make_kuid(fs_userns, mapped_uid);
}
EXPORT_SYMBOL_GPL(from_vfsuid);

/**
 * from_vfsgid - map a vfsgid into the filesystem idmapping
 * @idmap: the mount's idmapping
 * @fs_userns: the filesystem's idmapping
 * @vfsgid : vfsgid to be mapped
 *
 * Map @vfsgid into the filesystem idmapping. This function has to be used in
 * order to e.g. write @vfsgid to inode->i_gid.
 *
 * Return: @vfsgid mapped into the filesystem idmapping, or INVALID_GID if
 * @vfsgid has no mapping in @idmap.
 */
kgid_t from_vfsgid(struct mnt_idmap *idmap,
                   struct user_namespace *fs_userns, vfsgid_t vfsgid)
{
        gid_t mapped_gid;

        /* Not an idmapped mount: the value carries over unchanged. */
        if (idmap == &nop_mnt_idmap)
                return AS_KGIDT(vfsgid);

        mapped_gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid));
        if (mapped_gid == (gid_t)-1)
                return INVALID_GID;

        return initial_idmapping(fs_userns) ? KGIDT_INIT(mapped_gid) :
                                              make_kgid(fs_userns, mapped_gid);
}
EXPORT_SYMBOL_GPL(from_vfsgid);

/**
 * vfsgid_in_group_p() - check whether a vfsgid matches the caller's groups
 * @vfsgid: the mnt gid to match
 *
 * This function can be used to determine whether @vfsgid matches any of the
 * caller's groups. Without CONFIG_MULTIUSER every @vfsgid matches.
 *
 * Return: 1 if @vfsgid matches caller's groups, 0 if not.
 */
int vfsgid_in_group_p(vfsgid_t vfsgid)
{
#ifdef CONFIG_MULTIUSER
        return in_group_p(AS_KGIDT(vfsgid));
#else
        return 1;
#endif
}
EXPORT_SYMBOL_GPL(vfsgid_in_group_p);

static int copy_mnt_idmap(struct uid_gid_map *map_from,
                          struct uid_gid_map *map_to)
{
        struct uid_gid_extent *forward, *reverse;
        u32 nr_extents = READ_ONCE(map_from->nr_extents);
        /* Pairs with smp_wmb() when writing the idmapping. */
        smp_rmb();

        /*
         * Don't blindly copy @map_to into @map_from if nr_extents is
         * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we
         * read @nr_extents someone could have written an idmapping and
         * then we might end up with inconsistent data. So just don't do
         * anything at all.
         */
        if (nr_extents == 0)
                return -EINVAL;

        /*
         * Here we know that nr_extents is greater than zero which means
         * a map has been written. Since idmappings can't be changed
         * once they have been written we know that we can safely copy
         * from @map_to into @map_from.
         */

        if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
                *map_to = *map_from;
                return 0;
        }

        forward = kmemdup(map_from->forward,
                          nr_extents * sizeof(struct uid_gid_extent),
                          GFP_KERNEL_ACCOUNT);
        if (!forward)
                return -ENOMEM;

        reverse = kmemdup(map_from->reverse,
                          nr_extents * sizeof(struct uid_gid_extent),
                          GFP_KERNEL_ACCOUNT);
        if (!reverse) {
                kfree(forward);
                return -ENOMEM;
        }

        /*
         * The idmapping isn't exposed anywhere so we don't need to care
         * about ordering between extent pointers and @nr_extents
         * initialization.
         */
        map_to->forward = forward;
        map_to->reverse = reverse;
        map_to->nr_extents = nr_extents;
        return 0;
}

static void free_mnt_idmap(struct mnt_idmap *idmap)
{
        if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                kfree(idmap->uid_map.forward);
                kfree(idmap->uid_map.reverse);
        }
        if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                kfree(idmap->gid_map.forward);
                kfree(idmap->gid_map.reverse);
        }
        kfree(idmap);
}

/*
 * Allocate a new mnt_idmap holding copies of the uid and gid mappings of
 * @mnt_userns, with its reference count set to 1.
 *
 * Returns the new idmap, or an ERR_PTR(): -ENOMEM if an allocation fails,
 * or the error reported by copy_mnt_idmap().
 */
struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns)
{
        struct mnt_idmap *idmap;
        int err;

        idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT);
        if (!idmap)
                return ERR_PTR(-ENOMEM);

        refcount_set(&idmap->count, 1);

        err = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map);
        if (err)
                goto err_free;

        err = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map);
        if (err)
                goto err_free;

        return idmap;

err_free:
        /* Also releases whatever the first copy may have allocated. */
        free_mnt_idmap(idmap);
        return ERR_PTR(err);
}

/**
 * mnt_idmap_get - get a reference to an idmapping
 * @idmap: the idmap to bump the reference on
 *
 * If @idmap is not the @nop_mnt_idmap bump the reference count.
 *
 * Return: @idmap with reference count bumped if @nop_mnt_idmap isn't passed.
 */
struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap)
{
        /* The nop idmap is never reference counted. */
        if (idmap == &nop_mnt_idmap)
                return idmap;

        refcount_inc(&idmap->count);
        return idmap;
}
EXPORT_SYMBOL_GPL(mnt_idmap_get);

/**
 * mnt_idmap_put - put a reference to an idmapping
 * @idmap: the idmap to put the reference on
 *
 * If this is not the @nop_mnt_idmap, drop a reference (e.g. when a mount is
 * released) and free the idmapping if we're the last user.
 */
void mnt_idmap_put(struct mnt_idmap *idmap)
{
        /* The nop idmap is never reference counted. */
        if (idmap == &nop_mnt_idmap)
                return;

        if (refcount_dec_and_test(&idmap->count))
                free_mnt_idmap(idmap);
}
EXPORT_SYMBOL_GPL(mnt_idmap_put);


























































    2 









    2 






    2 





































































































    1 







    1 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock LSM - Network management and hooks
 *
 * Copyright © 2022-2023 Huawei Tech. Co., Ltd.
 * Copyright © 2022-2023 Microsoft Corporation
 */

#include <linux/in.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <net/ipv6.h>

#include "common.h"
#include "cred.h"
#include "limits.h"
#include "net.h"
#include "ruleset.h"

/*
 * Add a network-port rule for @port with @access_rights to @ruleset.
 *
 * The port is stored in network byte order as the rule key.  Access rights
 * that layer 0 of @ruleset does not handle are added to @access_rights
 * before insertion.  The insertion is done under @ruleset->lock.
 *
 * Returns the result of landlock_insert_rule().
 */
int landlock_append_net_rule(struct landlock_ruleset *const ruleset,
                             const u16 port, access_mask_t access_rights)
{
        const struct landlock_id id = {
                .type = LANDLOCK_KEY_NET_PORT,
                .key.data = (__force uintptr_t)htons(port),
        };
        access_mask_t unhandled;
        int ret;

        BUILD_BUG_ON(sizeof(port) > sizeof(id.key.data));

        /* Transforms relative access rights to absolute ones. */
        unhandled = LANDLOCK_MASK_ACCESS_NET &
                    ~landlock_get_net_access_mask(ruleset, 0);
        access_rights |= unhandled;

        mutex_lock(&ruleset->lock);
        ret = landlock_insert_rule(ruleset, id, access_rights);
        mutex_unlock(&ruleset->lock);

        return ret;
}

/*
 * Return the union of the network access masks of every layer of @domain
 * (0 when no layer handles any network access).
 */
static access_mask_t
get_raw_handled_net_accesses(const struct landlock_ruleset *const domain)
{
        access_mask_t handled = 0;
        size_t layer = 0;

        while (layer < domain->num_layers) {
                handled |= landlock_get_net_access_mask(domain, layer);
                layer++;
        }

        return handled;
}

/*
 * Return the current Landlock domain if it exists and at least one of its
 * layers handles a network access right; NULL otherwise.
 */
static const struct landlock_ruleset *get_current_net_domain(void)
{
        const struct landlock_ruleset *const domain =
                landlock_get_current_domain();

        if (domain && get_raw_handled_net_accesses(domain))
                return domain;

        return NULL;
}

/*
 * Checks whether the current task's Landlock domain allows @access_request on
 * the port carried by @address.
 *
 * Returns:
 * - 0 when the request is allowed, or when it is out of scope for this check:
 *   no domain handling network accesses, @sock is not SOCK_STREAM, the
 *   address family is none of AF_UNSPEC, AF_INET or (with CONFIG_IPV6)
 *   AF_INET6, or an AF_UNSPEC connect request;
 * - -EINVAL when @addrlen is too short for the address family, or when the
 *   address family is inconsistent with the socket family;
 * - -EAFNOSUPPORT for an AF_UNSPEC bind to an address other than INADDR_ANY;
 * - -EACCES when the rule lookup does not satisfy the request, or when the
 *   domain unexpectedly has no layer.
 */
static int current_check_access_socket(struct socket *const sock,
                                       struct sockaddr *const address,
                                       const int addrlen,
                                       access_mask_t access_request)
{
        __be16 port;
        layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_NET] = {};
        const struct landlock_rule *rule;
        struct landlock_id id = {
                .type = LANDLOCK_KEY_NET_PORT,
        };
        const struct landlock_ruleset *const dom = get_current_net_domain();

        if (!dom)
                return 0;
        if (WARN_ON_ONCE(dom->num_layers < 1))
                return -EACCES;

        /* Checks if it's a (potential) TCP socket. */
        if (sock->type != SOCK_STREAM)
                return 0;

        /* Checks for minimal header length to safely read sa_family. */
        if (addrlen < offsetofend(typeof(*address), sa_family))
                return -EINVAL;

        /* Extracts the port once the family-specific length is validated. */
        switch (address->sa_family) {
        case AF_UNSPEC:
        case AF_INET:
                if (addrlen < sizeof(struct sockaddr_in))
                        return -EINVAL;
                port = ((struct sockaddr_in *)address)->sin_port;
                break;

#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                if (addrlen < SIN6_LEN_RFC2133)
                        return -EINVAL;
                port = ((struct sockaddr_in6 *)address)->sin6_port;
                break;
#endif /* IS_ENABLED(CONFIG_IPV6) */

        default:
                /* Other address families are not checked here. */
                return 0;
        }

        /* Specific AF_UNSPEC handling. */
        if (address->sa_family == AF_UNSPEC) {
                /*
                 * Connecting to an address with AF_UNSPEC dissolves the TCP
                 * association, which has the same effect as closing the
                 * connection while retaining the socket object (i.e., the file
                 * descriptor).  As for dropping privileges, closing
                 * connections is always allowed.
                 *
                 * For a TCP access control system, this request is legitimate.
                 * Let the network stack handle potential inconsistencies and
                 * return -EINVAL if needed.
                 */
                if (access_request == LANDLOCK_ACCESS_NET_CONNECT_TCP)
                        return 0;

                /*
                 * For compatibility reasons, accept AF_UNSPEC for bind
                 * accesses (mapped to AF_INET) only if the address is
                 * INADDR_ANY (cf. __inet_bind).  Checking the address is
                 * required to not wrongfully return -EACCES instead of
                 * -EAFNOSUPPORT.
                 *
                 * We could return 0 and let the network stack handle these
                 * checks, but it is safer to return a proper error and test
                 * consistency thanks to kselftest.
                 */
                if (access_request == LANDLOCK_ACCESS_NET_BIND_TCP) {
                        /* addrlen has already been checked for AF_UNSPEC. */
                        const struct sockaddr_in *const sockaddr =
                                (struct sockaddr_in *)address;

                        if (sock->sk->__sk_common.skc_family != AF_INET)
                                return -EINVAL;

                        if (sockaddr->sin_addr.s_addr != htonl(INADDR_ANY))
                                return -EAFNOSUPPORT;
                }
        } else {
                /*
                 * Checks sa_family consistency to not wrongfully return
                 * -EACCES instead of -EINVAL.  Valid sa_family changes are
                 * only (from AF_INET or AF_INET6) to AF_UNSPEC.
                 *
                 * We could return 0 and let the network stack handle this
                 * check, but it is safer to return a proper error and test
                 * consistency thanks to kselftest.
                 */
                if (address->sa_family != sock->sk->__sk_common.skc_family)
                        return -EINVAL;
        }

        /* The __be16 port value is used as the rule key without conversion. */
        id.key.data = (__force uintptr_t)port;
        BUILD_BUG_ON(sizeof(port) > sizeof(id.key.data));

        rule = landlock_find_rule(dom, id);
        access_request = landlock_init_layer_masks(
                dom, access_request, &layer_masks, LANDLOCK_KEY_NET_PORT);
        if (landlock_unmask_layers(rule, access_request, &layer_masks,
                                   ARRAY_SIZE(layer_masks)))
                return 0;

        return -EACCES;
}

/* socket_bind hook: checks the request against LANDLOCK_ACCESS_NET_BIND_TCP. */
static int hook_socket_bind(struct socket *const sock,
                            struct sockaddr *const address, const int addrlen)
{
        int err;

        err = current_check_access_socket(sock, address, addrlen,
                                          LANDLOCK_ACCESS_NET_BIND_TCP);
        return err;
}

/*
 * socket_connect hook: checks the request against
 * LANDLOCK_ACCESS_NET_CONNECT_TCP.
 */
static int hook_socket_connect(struct socket *const sock,
                               struct sockaddr *const address,
                               const int addrlen)
{
        int err;

        err = current_check_access_socket(sock, address, addrlen,
                                          LANDLOCK_ACCESS_NET_CONNECT_TCP);
        return err;
}

/*
 * Hook table handed to security_add_hooks() by landlock_add_net_hooks():
 * one entry for socket_bind and one for socket_connect.
 */
static struct security_hook_list landlock_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(socket_bind, hook_socket_bind),
        LSM_HOOK_INIT(socket_connect, hook_socket_connect),
};

__init void landlock_add_net_hooks(void)
{
        security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
                           &landlock_lsmid);
}


























































































    2 





    2 
    2 

    1 
    1 










    2 


    2 







    1 





    1 
    1 











    1 


    1 
































































































































































































    5 

























































































































    5 




    9 








    9 












    5 
    5 



    5 









    5 









    5 




















    5 


































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
 * Authors: David Chinner and Glauber Costa
 *
 * Generic LRU infrastructure
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/list_lru.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/memcontrol.h>
#include "slab.h"
#include "internal.h"

#ifdef CONFIG_MEMCG_KMEM
static LIST_HEAD(memcg_list_lrus);
static DEFINE_MUTEX(list_lrus_mutex);

/* Returns the memcg_aware flag stored in @lru. */
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
        return lru->memcg_aware;
}

/*
 * Adds a memcg-aware @lru to the global memcg_list_lrus list, under
 * list_lrus_mutex.  Does nothing for an lru that is not memcg aware.
 */
static void list_lru_register(struct list_lru *lru)
{
        if (list_lru_memcg_aware(lru)) {
                mutex_lock(&list_lrus_mutex);
                list_add(&lru->list, &memcg_list_lrus);
                mutex_unlock(&list_lrus_mutex);
        }
}

/*
 * Removes a memcg-aware @lru from the list it was linked on through
 * lru->list, under list_lrus_mutex.  Does nothing for an lru that is not
 * memcg aware.
 */
static void list_lru_unregister(struct list_lru *lru)
{
        if (list_lru_memcg_aware(lru)) {
                mutex_lock(&list_lrus_mutex);
                list_del(&lru->list);
                mutex_unlock(&list_lrus_mutex);
        }
}

/* Returns the shrinker id recorded in @lru (set by __list_lru_init()). */
static int lru_shrinker_id(struct list_lru *lru)
{
        return lru->shrinker_id;
}

/*
 * Picks the list to use on node @nid for memcg index @idx.
 *
 * For a memcg-aware @lru and a non-negative @idx, this is the per-memcg list
 * stored in lru->xa at @idx, or NULL when no entry is stored there.  In every
 * other case it is the node's own list.
 */
static inline struct list_lru_one *
list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
{
        struct list_lru_memcg *per_memcg;

        if (!list_lru_memcg_aware(lru) || idx < 0)
                return &lru->node[nid].lru;

        per_memcg = xa_load(&lru->xa, idx);
        if (!per_memcg)
                return NULL;

        return &per_memcg->node[nid];
}
#else
/* Without CONFIG_MEMCG_KMEM, registration is a no-op. */
static void list_lru_register(struct list_lru *lru)
{
}

/* Without CONFIG_MEMCG_KMEM, unregistration is a no-op. */
static void list_lru_unregister(struct list_lru *lru)
{
}

/* Without CONFIG_MEMCG_KMEM, always reports -1 as the shrinker id. */
static int lru_shrinker_id(struct list_lru *lru)
{
        return -1;
}

/* Without CONFIG_MEMCG_KMEM, no lru is ever memcg aware. */
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
        return false;
}

/*
 * Without CONFIG_MEMCG_KMEM, @idx is ignored and the node's own list is
 * always returned.
 */
static inline struct list_lru_one *
list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
{
        return &lru->node[nid].lru;
}
#endif /* CONFIG_MEMCG_KMEM */

/*
 * Queues @item at the tail of the list selected by @nid and @memcg, provided
 * @item is not already linked anywhere (list_empty()).  Runs under the node's
 * spinlock.
 *
 * Returns true if @item was added, false if it was already on a list.
 */
bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
                    struct mem_cgroup *memcg)
{
        struct list_lru_node *node = &lru->node[nid];
        bool added = false;

        spin_lock(&node->lock);
        if (list_empty(item)) {
                struct list_lru_one *one =
                        list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));

                list_add_tail(item, &one->list);
                one->nr_items++;
                /* Set the shrinker bit when the list just became non-empty. */
                if (one->nr_items == 1)
                        set_shrinker_bit(memcg, nid, lru_shrinker_id(lru));
                node->nr_items++;
                added = true;
        }
        spin_unlock(&node->lock);

        return added;
}
EXPORT_SYMBOL_GPL(list_lru_add);

/*
 * Variant of list_lru_add() that derives the node id from the page backing
 * @item and, for a memcg-aware @lru, the memcg from
 * mem_cgroup_from_slab_obj(@item).
 */
bool list_lru_add_obj(struct list_lru *lru, struct list_head *item)
{
        int node_id = page_to_nid(virt_to_page(item));
        struct mem_cgroup *owner = NULL;

        if (list_lru_memcg_aware(lru))
                owner = mem_cgroup_from_slab_obj(item);

        return list_lru_add(lru, item, node_id, owner);
}
EXPORT_SYMBOL_GPL(list_lru_add_obj);

/*
 * Unlinks @item from the list selected by @nid and @memcg, provided @item is
 * currently linked (!list_empty()).  Runs under the node's spinlock.
 *
 * Returns true if @item was removed, false if it was not on a list.
 */
bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
                    struct mem_cgroup *memcg)
{
        struct list_lru_node *node = &lru->node[nid];
        bool removed = false;

        spin_lock(&node->lock);
        if (!list_empty(item)) {
                struct list_lru_one *one =
                        list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));

                list_del_init(item);
                one->nr_items--;
                node->nr_items--;
                removed = true;
        }
        spin_unlock(&node->lock);

        return removed;
}
EXPORT_SYMBOL_GPL(list_lru_del);

/*
 * Variant of list_lru_del() that derives the node id from the page backing
 * @item and, for a memcg-aware @lru, the memcg from
 * mem_cgroup_from_slab_obj(@item).
 */
bool list_lru_del_obj(struct list_lru *lru, struct list_head *item)
{
        int node_id = page_to_nid(virt_to_page(item));
        struct mem_cgroup *owner = NULL;

        if (list_lru_memcg_aware(lru))
                owner = mem_cgroup_from_slab_obj(item);

        return list_lru_del(lru, item, node_id, owner);
}
EXPORT_SYMBOL_GPL(list_lru_del_obj);

/*
 * Unlinks @item (leaving it re-initialized) and lowers @list's item count by
 * one.  Only the per-list counter is touched here.
 */
void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
{
        list_del_init(item);
        list->nr_items -= 1;
}
EXPORT_SYMBOL_GPL(list_lru_isolate);

/*
 * Moves @item onto @head and lowers @list's item count by one.  Only the
 * per-list counter is touched here.
 */
void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
                           struct list_head *head)
{
        list_move(item, head);
        list->nr_items -= 1;
}
EXPORT_SYMBOL_GPL(list_lru_isolate_move);

/*
 * Returns the item count of the list selected by @nid and @memcg, read
 * locklessly inside an RCU read-side section.  A missing list counts as 0 and
 * a negative snapshot is clamped to 0.
 */
unsigned long list_lru_count_one(struct list_lru *lru,
                                 int nid, struct mem_cgroup *memcg)
{
        struct list_lru_one *one;
        long nr = 0;

        rcu_read_lock();
        one = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
        if (one)
                nr = READ_ONCE(one->nr_items);
        rcu_read_unlock();

        return unlikely(nr < 0) ? 0 : nr;
}
EXPORT_SYMBOL_GPL(list_lru_count_one);

/* Returns the per-node item counter of @lru for node @nid. */
unsigned long list_lru_count_node(struct list_lru *lru, int nid)
{
        return lru->node[nid].nr_items;
}
EXPORT_SYMBOL_GPL(list_lru_count_node);

/*
 * Walks the list selected by @nid and @memcg_idx, calling @isolate on each
 * item until *@nr_to_walk reaches zero, the list is exhausted, or the callback
 * returns LRU_STOP.  The callers in this file take lru->node[nid].lock before
 * calling; a pointer to that lock is also handed to the callback.
 *
 * *@nr_to_walk is decremented once per callback invocation.
 *
 * Returns the number of items for which @isolate returned LRU_REMOVED or
 * LRU_REMOVED_RETRY; 0 when no list exists for @memcg_idx.
 */
static unsigned long
__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
                    list_lru_walk_cb isolate, void *cb_arg,
                    unsigned long *nr_to_walk)
{
        struct list_lru_node *nlru = &lru->node[nid];
        struct list_lru_one *l;
        struct list_head *item, *n;
        unsigned long isolated = 0;

restart:
        /* Looked up again on every restart. */
        l = list_lru_from_memcg_idx(lru, nid, memcg_idx);
        if (!l)
                goto out;

        list_for_each_safe(item, n, &l->list) {
                enum lru_status ret;

                /*
                 * decrement nr_to_walk first so that we don't livelock if we
                 * get stuck on large numbers of LRU_RETRY items
                 */
                if (!*nr_to_walk)
                        break;
                --*nr_to_walk;

                ret = isolate(item, l, &nlru->lock, cb_arg);
                switch (ret) {
                case LRU_REMOVED_RETRY:
                        assert_spin_locked(&nlru->lock);
                        fallthrough;
                case LRU_REMOVED:
                        isolated++;
                        /* Only the per-node counter is adjusted here. */
                        nlru->nr_items--;
                        /*
                         * If the lru lock has been dropped, our list
                         * traversal is now invalid and so we have to
                         * restart from scratch.
                         */
                        if (ret == LRU_REMOVED_RETRY)
                                goto restart;
                        break;
                case LRU_ROTATE:
                        list_move_tail(item, &l->list);
                        break;
                case LRU_SKIP:
                        break;
                case LRU_RETRY:
                        /*
                         * The lru lock has been dropped, our list traversal is
                         * now invalid and so we have to restart from scratch.
                         */
                        assert_spin_locked(&nlru->lock);
                        goto restart;
                case LRU_STOP:
                        assert_spin_locked(&nlru->lock);
                        goto out;
                default:
                        /* Any other status is treated as a fatal bug. */
                        BUG();
                }
        }
out:
        return isolated;
}

/*
 * Walks the list selected by @nid and @memcg with the node lock taken via
 * spin_lock().  See __list_lru_walk_one() for the walk itself.
 */
unsigned long
list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
                  list_lru_walk_cb isolate, void *cb_arg,
                  unsigned long *nr_to_walk)
{
        struct list_lru_node *node = &lru->node[nid];
        unsigned long isolated;

        spin_lock(&node->lock);
        isolated = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg),
                                       isolate, cb_arg, nr_to_walk);
        spin_unlock(&node->lock);

        return isolated;
}
EXPORT_SYMBOL_GPL(list_lru_walk_one);

/*
 * Same as list_lru_walk_one(), except that the node lock is taken with
 * spin_lock_irq() and released with spin_unlock_irq().
 */
unsigned long
list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
                      list_lru_walk_cb isolate, void *cb_arg,
                      unsigned long *nr_to_walk)
{
        struct list_lru_node *node = &lru->node[nid];
        unsigned long isolated;

        spin_lock_irq(&node->lock);
        isolated = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg),
                                       isolate, cb_arg, nr_to_walk);
        spin_unlock_irq(&node->lock);

        return isolated;
}

/*
 * Walks the lists of @lru on node @nid: first the list selected by a NULL
 * memcg, then, for a memcg-aware lru built with CONFIG_MEMCG_KMEM, each
 * per-memcg list present in lru->xa.  Stops as soon as *@nr_to_walk drops to
 * zero.
 *
 * Returns the total number of items counted as isolated by the individual
 * walks.
 */
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
                                 list_lru_walk_cb isolate, void *cb_arg,
                                 unsigned long *nr_to_walk)
{
        /* Unsigned, like the helpers' return values and our own. */
        unsigned long isolated;

        isolated = list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
                                     nr_to_walk);

#ifdef CONFIG_MEMCG_KMEM
        if (*nr_to_walk && list_lru_memcg_aware(lru)) {
                struct list_lru_node *nlru = &lru->node[nid];
                struct list_lru_memcg *mlru;
                unsigned long index;

                xa_for_each(&lru->xa, index, mlru) {
                        spin_lock(&nlru->lock);
                        isolated += __list_lru_walk_one(lru, nid, index,
                                                        isolate, cb_arg,
                                                        nr_to_walk);
                        spin_unlock(&nlru->lock);

                        /* The budget is unsigned: zero means exhausted. */
                        if (!*nr_to_walk)
                                break;
                }
        }
#endif

        return isolated;
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);

/* Resets @l to an empty list with a zero item count. */
static void init_one_lru(struct list_lru_one *l)
{
        l->nr_items = 0;
        INIT_LIST_HEAD(&l->list);
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * Allocates a list_lru_memcg sized for nr_node_ids nodes and initializes the
 * list of every node visited by for_each_node().
 *
 * Returns the new object, or NULL if the allocation failed.
 */
static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp)
{
        struct list_lru_memcg *mlru;
        int nid;

        mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp);
        if (mlru) {
                for_each_node(nid)
                        init_one_lru(&mlru->node[nid]);
        }

        return mlru;
}

/*
 * Removes the entry stored at @src_idx from lru->xa and frees it, if there
 * was one.
 */
static void memcg_list_lru_free(struct list_lru *lru, int src_idx)
{
        struct list_lru_memcg *mlru;

        mlru = xa_erase_irq(&lru->xa, src_idx);
        if (!mlru)
                return;

        /*
         * __list_lru_walk_one() may still be walking a list of this entry,
         * hence kvfree_rcu() rather than an immediate free.  That walk runs
         * under lru->node[nid]->lock, which can serve as an RCU read-side
         * critical section.
         */
        kvfree_rcu(mlru, rcu);
}

/*
 * Records @memcg_aware in @lru and, when it is set, initializes lru->xa with
 * XA_FLAGS_LOCK_IRQ.
 */
static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
        lru->memcg_aware = memcg_aware;
        if (!memcg_aware)
                return;

        xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ);
}

/*
 * Frees every entry stored in lru->xa and clears its slot, with the xarray
 * lock held and interrupts disabled.  Does nothing for an lru that is not
 * memcg aware.
 */
static void memcg_destroy_list_lru(struct list_lru *lru)
{
        XA_STATE(xas, &lru->xa, 0);
        struct list_lru_memcg *entry;

        if (list_lru_memcg_aware(lru)) {
                xas_lock_irq(&xas);
                xas_for_each(&xas, entry, ULONG_MAX) {
                        kfree(entry);
                        xas_store(&xas, NULL);
                }
                xas_unlock_irq(&xas);
        }
}

/*
 * On node @nid, splices the list at memcg index @src_idx onto the list of
 * @dst_memcg and transfers the item count.  Does nothing if no list exists
 * for @src_idx.
 */
static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
                                         int src_idx, struct mem_cgroup *dst_memcg)
{
        struct list_lru_node *node = &lru->node[nid];
        int dst_idx = dst_memcg->kmemcg_id;
        struct list_lru_one *from, *to;

        /*
         * list_lru_{add,del} may run under an IRQ-safe lock, so the
         * IRQ-safe lock primitives are needed here to avoid a deadlock.
         */
        spin_lock_irq(&node->lock);

        from = list_lru_from_memcg_idx(lru, nid, src_idx);
        if (from) {
                to = list_lru_from_memcg_idx(lru, nid, dst_idx);

                list_splice_init(&from->list, &to->list);

                if (from->nr_items != 0) {
                        to->nr_items += from->nr_items;
                        set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
                        from->nr_items = 0;
                }
        }

        spin_unlock_irq(&node->lock);
}

/*
 * Reparents the lists at @src_idx to @dst_memcg on every node, then frees the
 * now-drained entry at @src_idx.
 */
static void memcg_reparent_list_lru(struct list_lru *lru,
                                    int src_idx, struct mem_cgroup *dst_memcg)
{
        int nid;

        for_each_node(nid)
                memcg_reparent_list_lru_node(lru, nid, src_idx, dst_memcg);

        memcg_list_lru_free(lru, src_idx);
}

/*
 * Hands over @memcg's list_lru entries to @parent.
 *
 * First rewrites kmemcg_id of @memcg and all its descendants to @parent's id,
 * then, for every lru on memcg_list_lrus, moves the lists stored at @memcg's
 * original id to @parent and frees the old entry.
 */
void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
        struct cgroup_subsys_state *css;
        struct list_lru *lru;
        /* Snapshot the id before it is overwritten below. */
        int src_idx = memcg->kmemcg_id;

        /*
         * Change kmemcg_id of this cgroup and all its descendants to the
         * parent's id, and then move all entries from this cgroup's list_lrus
         * to ones of the parent.
         *
         * After we have finished, all list_lrus corresponding to this cgroup
         * are guaranteed to remain empty. So we can safely free this cgroup's
         * list lrus in memcg_list_lru_free().
         *
         * Changing ->kmemcg_id to the parent can prevent memcg_list_lru_alloc()
         * from allocating list lrus for this cgroup after memcg_list_lru_free()
         * call.
         */
        rcu_read_lock();
        css_for_each_descendant_pre(css, &memcg->css) {
                struct mem_cgroup *child;

                child = mem_cgroup_from_css(css);
                WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id);
        }
        rcu_read_unlock();

        /* The list of registered lrus is walked under list_lrus_mutex. */
        mutex_lock(&list_lrus_mutex);
        list_for_each_entry(lru, &memcg_list_lrus, list)
                memcg_reparent_list_lru(lru, src_idx, parent);
        mutex_unlock(&list_lrus_mutex);
}

/*
 * Tells whether @memcg needs no (further) allocation in @lru: true when its
 * kmemcg_id is negative or when lru->xa already holds an entry for that id.
 */
static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
                                            struct list_lru *lru)
{
        int idx = memcg->kmemcg_id;

        /* "Missing" means a valid id whose slot is still empty. */
        return !(idx >= 0 && !xa_load(&lru->xa, idx));
}

/*
 * Makes sure @lru has a list_lru_memcg entry for @memcg and for each of its
 * ancestors, walking up until one that is already covered.
 *
 * The entries are allocated first, without the xarray lock held, and then
 * stored into lru->xa under xas_lock_irqsave().  A preallocated entry is freed
 * rather than stored when its memcg's kmemcg_id is negative, when the xa_state
 * carries an error, or when the slot for that id is already populated.
 *
 * Returns 0 when @lru is not memcg aware or @memcg is already covered,
 * -ENOMEM when the table or an entry cannot be allocated, and otherwise the
 * final xas_error() value.
 */
int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
                         gfp_t gfp)
{
        int i;
        unsigned long flags;
        /* One slot per memcg (this one and its ancestors) needing an entry. */
        struct list_lru_memcg_table {
                struct list_lru_memcg *mlru;
                struct mem_cgroup *memcg;
        } *table;
        XA_STATE(xas, &lru->xa, 0);

        if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru))
                return 0;

        gfp &= GFP_RECLAIM_MASK;
        table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp);
        if (!table)
                return -ENOMEM;

        /*
         * Because the list_lru can be reparented to the parent cgroup's
         * list_lru, we should make sure that this cgroup and all its
         * ancestors have allocated list_lru_memcg.
         */
        for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) {
                if (memcg_list_lru_allocated(memcg, lru))
                        break;

                table[i].memcg = memcg;
                table[i].mlru = memcg_init_list_lru_one(gfp);
                if (!table[i].mlru) {
                        /* Undo the entries allocated so far. */
                        while (i--)
                                kfree(table[i].mlru);
                        kfree(table);
                        return -ENOMEM;
                }
        }

        /* Store the entries, from the topmost ancestor down to @memcg. */
        xas_lock_irqsave(&xas, flags);
        while (i--) {
                int index = READ_ONCE(table[i].memcg->kmemcg_id);
                struct list_lru_memcg *mlru = table[i].mlru;

                xas_set(&xas, index);
retry:
                if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) {
                        kfree(mlru);
                } else {
                        xas_store(&xas, mlru);
                        if (xas_error(&xas) == -ENOMEM) {
                                xas_unlock_irqrestore(&xas, flags);
                                if (xas_nomem(&xas, gfp))
                                        xas_set_err(&xas, 0);
                                xas_lock_irqsave(&xas, flags);
                                /*
                                 * The xas lock has been released, this memcg
                                 * can be reparented before us. So reload
                                 * memcg id. More details see the comments
                                 * in memcg_reparent_list_lrus().
                                 */
                                index = READ_ONCE(table[i].memcg->kmemcg_id);
                                if (index < 0)
                                        xas_set_err(&xas, 0);
                                else if (!xas_error(&xas) && index != xas.xa_index)
                                        xas_set(&xas, index);
                                goto retry;
                        }
                }
        }
        /* xas_nomem() is used to free memory instead of memory allocation. */
        if (xas.xa_alloc)
                xas_nomem(&xas, gfp);
        xas_unlock_irqrestore(&xas, flags);
        kfree(table);

        return xas_error(&xas);
}
#else
/* Without CONFIG_MEMCG_KMEM, there is no per-memcg state to set up. */
static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
}

/* Without CONFIG_MEMCG_KMEM, there is no per-memcg state to tear down. */
static void memcg_destroy_list_lru(struct list_lru *lru)
{
}
#endif /* CONFIG_MEMCG_KMEM */

/*
 * Initializes @lru: allocates one list_lru_node per node id, initializes each
 * node's lock and list, sets up the memcg state and registers the lru.
 *
 * With CONFIG_MEMCG_KMEM, the shrinker id is taken from @shrinker (-1 when
 * @shrinker is NULL) and @memcg_aware is forced off when
 * mem_cgroup_kmem_disabled() reports true.  When @key is non-NULL it is set
 * as the lockdep class of every node lock.
 *
 * Returns 0 on success, -ENOMEM if the node array cannot be allocated.
 */
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
                    struct lock_class_key *key, struct shrinker *shrinker)
{
        int i;

#ifdef CONFIG_MEMCG_KMEM
        lru->shrinker_id = shrinker ? shrinker->id : -1;

        if (mem_cgroup_kmem_disabled())
                memcg_aware = false;
#endif

        lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);
        if (lru->node == NULL)
                return -ENOMEM;

        for_each_node(i) {
                spin_lock_init(&lru->node[i].lock);
                if (key != NULL)
                        lockdep_set_class(&lru->node[i].lock, key);
                init_one_lru(&lru->node[i].lru);
        }

        memcg_init_list_lru(lru, memcg_aware);
        list_lru_register(lru);

        return 0;
}
EXPORT_SYMBOL_GPL(__list_lru_init);

/*
 * Tears down @lru: unregisters it, frees its per-memcg entries and its node
 * array, and resets the fields that mark it as initialized.  Safe to call on
 * an lru whose node array is NULL, in which case nothing is done.
 */
void list_lru_destroy(struct list_lru *lru)
{
        /* A NULL node array means already destroyed or never initialized. */
        if (lru->node) {
                list_lru_unregister(lru);

                memcg_destroy_list_lru(lru);
                kfree(lru->node);
                lru->node = NULL;

#ifdef CONFIG_MEMCG_KMEM
                lru->shrinker_id = -1;
#endif
        }
}
EXPORT_SYMBOL_GPL(list_lru_destroy);























































































    2 





























    2 



















    1 






























    1 

















































    1 



































    1 



    1 

    1 
















    2 



    2 
    2 
    2 
    1 


































































































































































































































































































    1 


    1 








    1 
    1 

























































































































    2 








    2 






    2 
    1 






    2 

    2 

    2 
















    1 























    1 







    1 


































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/file.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"
#include <linux/slab.h>

/*
 * Mapping table from "enum tomoyo_path_acl_index" to "enum tomoyo_mac_index".
 */
static const u8 tomoyo_p2mac[TOMOYO_MAX_PATH_OPERATION] = {
        [TOMOYO_TYPE_EXECUTE]    = TOMOYO_MAC_FILE_EXECUTE,
        /* Read, write and append all map to the single "open" MAC index. */
        [TOMOYO_TYPE_READ]       = TOMOYO_MAC_FILE_OPEN,
        [TOMOYO_TYPE_WRITE]      = TOMOYO_MAC_FILE_OPEN,
        [TOMOYO_TYPE_APPEND]     = TOMOYO_MAC_FILE_OPEN,
        [TOMOYO_TYPE_UNLINK]     = TOMOYO_MAC_FILE_UNLINK,
        [TOMOYO_TYPE_GETATTR]    = TOMOYO_MAC_FILE_GETATTR,
        [TOMOYO_TYPE_RMDIR]      = TOMOYO_MAC_FILE_RMDIR,
        [TOMOYO_TYPE_TRUNCATE]   = TOMOYO_MAC_FILE_TRUNCATE,
        [TOMOYO_TYPE_SYMLINK]    = TOMOYO_MAC_FILE_SYMLINK,
        [TOMOYO_TYPE_CHROOT]     = TOMOYO_MAC_FILE_CHROOT,
        [TOMOYO_TYPE_UMOUNT]     = TOMOYO_MAC_FILE_UMOUNT,
};

/*
 * Mapping table from "enum tomoyo_mkdev_acl_index" to "enum tomoyo_mac_index".
 *
 * Defined without "static", so the table has external linkage.
 */
const u8 tomoyo_pnnn2mac[TOMOYO_MAX_MKDEV_OPERATION] = {
        [TOMOYO_TYPE_MKBLOCK] = TOMOYO_MAC_FILE_MKBLOCK,
        [TOMOYO_TYPE_MKCHAR]  = TOMOYO_MAC_FILE_MKCHAR,
};

/*
 * Mapping table from "enum tomoyo_path2_acl_index" to "enum tomoyo_mac_index".
 *
 * Indexed by a two-path operation. tomoyo_path2_perm() uses the result to
 * initialise the request, and tomoyo_audit_path2_log() uses it to pick the
 * keyword from tomoyo_mac_keywords[].
 */
const u8 tomoyo_pp2mac[TOMOYO_MAX_PATH2_OPERATION] = {
        [TOMOYO_TYPE_LINK]       = TOMOYO_MAC_FILE_LINK,
        [TOMOYO_TYPE_RENAME]     = TOMOYO_MAC_FILE_RENAME,
        [TOMOYO_TYPE_PIVOT_ROOT] = TOMOYO_MAC_FILE_PIVOT_ROOT,
};

/*
 * Mapping table from "enum tomoyo_path_number_acl_index" to
 * "enum tomoyo_mac_index".
 *
 * Indexed by a path-plus-number operation. tomoyo_path_number_perm() uses the
 * result to initialise the request, and tomoyo_audit_path_number_log() uses
 * it to pick the keyword from tomoyo_mac_keywords[].
 */
const u8 tomoyo_pn2mac[TOMOYO_MAX_PATH_NUMBER_OPERATION] = {
        [TOMOYO_TYPE_CREATE] = TOMOYO_MAC_FILE_CREATE,
        [TOMOYO_TYPE_MKDIR]  = TOMOYO_MAC_FILE_MKDIR,
        [TOMOYO_TYPE_MKFIFO] = TOMOYO_MAC_FILE_MKFIFO,
        [TOMOYO_TYPE_MKSOCK] = TOMOYO_MAC_FILE_MKSOCK,
        [TOMOYO_TYPE_IOCTL]  = TOMOYO_MAC_FILE_IOCTL,
        [TOMOYO_TYPE_CHMOD]  = TOMOYO_MAC_FILE_CHMOD,
        [TOMOYO_TYPE_CHOWN]  = TOMOYO_MAC_FILE_CHOWN,
        [TOMOYO_TYPE_CHGRP]  = TOMOYO_MAC_FILE_CHGRP,
};

/**
 * tomoyo_put_name_union - Drop reference on "struct tomoyo_name_union".
 *
 * @ptr: Pointer to "struct tomoyo_name_union".
 *
 * Passes both members of @ptr (->group and ->filename) to their put helpers
 * without testing either one first. The update functions in this file call
 * this even after a failed parse.
 * NOTE(review): this presumably relies on tomoyo_put_group() and
 * tomoyo_put_name() tolerating NULL - confirm against their definitions.
 *
 * Returns nothing.
 */
void tomoyo_put_name_union(struct tomoyo_name_union *ptr)
{
        tomoyo_put_group(ptr->group);
        tomoyo_put_name(ptr->filename);
}

/**
 * tomoyo_compare_name_union - Match a pathname against a name union.
 *
 * @name: Pointer to "struct tomoyo_path_info" to test.
 * @ptr:  Pointer to "struct tomoyo_name_union" to test against.
 *
 * When @ptr carries a group, the decision is delegated to
 * tomoyo_path_matches_group(). Otherwise @name is matched against the single
 * pattern stored in @ptr->filename.
 *
 * Returns the matching "struct tomoyo_path_info" (for the non-group case this
 * is @ptr->filename itself) if @name matches @ptr, NULL otherwise.
 */
const struct tomoyo_path_info *
tomoyo_compare_name_union(const struct tomoyo_path_info *name,
                          const struct tomoyo_name_union *ptr)
{
        if (!ptr->group) {
                /* Single-pattern entry: hand back the pattern on a hit. */
                if (!tomoyo_path_matches_pattern(name, ptr->filename))
                        return NULL;
                return ptr->filename;
        }
        return tomoyo_path_matches_group(name, ptr->group);
}

/**
 * tomoyo_put_number_union - Drop reference on "struct tomoyo_number_union".
 *
 * @ptr: Pointer to "struct tomoyo_number_union".
 *
 * Only the ->group member is passed to a put helper; the numeric range stored
 * in @ptr is not touched here.
 *
 * Returns nothing.
 */
void tomoyo_put_number_union(struct tomoyo_number_union *ptr)
{
        tomoyo_put_group(ptr->group);
}

/**
 * tomoyo_compare_number_union - Match a number against a number union.
 *
 * @value: Number to check.
 * @ptr:   Pointer to "struct tomoyo_number_union".
 *
 * When @ptr carries a group, the decision is delegated to
 * tomoyo_number_matches_group() with @value as both ends of the range being
 * tested. Otherwise @value must lie in the inclusive range
 * [@ptr->values[0], @ptr->values[1]].
 *
 * Returns true if @value matches @ptr, false otherwise.
 */
bool tomoyo_compare_number_union(const unsigned long value,
                                 const struct tomoyo_number_union *ptr)
{
        if (!ptr->group)
                return ptr->values[0] <= value && value <= ptr->values[1];
        return tomoyo_number_matches_group(value, value, ptr->group);
}

/**
 * tomoyo_add_slash - Append a trailing '/' unless @buf is already a directory.
 *
 * @buf: Pointer to "struct tomoyo_path_info".
 *
 * Nothing happens when @buf->is_dir is already set. Otherwise "/" is appended
 * in place to @buf->name and tomoyo_fill_path_info() is run again on @buf.
 *
 * @buf must be generated by tomoyo_encode() because this function does not
 * allocate memory for adding '/'.
 *
 * Returns nothing.
 */
static void tomoyo_add_slash(struct tomoyo_path_info *buf)
{
        if (!buf->is_dir) {
                /*
                 * Appending in place is OK because tomoyo_encode() reserves
                 * space for appending "/".
                 */
                strcat((char *) buf->name, "/");
                tomoyo_fill_path_info(buf);
        }
}

/**
 * tomoyo_get_realpath - Resolve @path into @buf.
 *
 * @buf:  Pointer to "struct tomoyo_path_info" to fill.
 * @path: Pointer to "struct path" to resolve.
 *
 * Stores the result of tomoyo_realpath_from_path() in @buf->name, even when
 * that result is NULL. On success tomoyo_fill_path_info() is run on @buf.
 * The callers in this file release @buf->name with kfree().
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_get_realpath(struct tomoyo_path_info *buf, const struct path *path)
{
        buf->name = tomoyo_realpath_from_path(path);
        if (!buf->name)
                return false;
        tomoyo_fill_path_info(buf);
        return true;
}

/**
 * tomoyo_audit_path_log - Audit a single-path request.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Formats the request as "file <keyword> <pathname>" and hands it to
 * tomoyo_supervisor(). The keyword is taken from tomoyo_path_keyword[],
 * indexed directly by the path operation.
 *
 * Returns the value of tomoyo_supervisor(): 0 on success, negative value
 * otherwise.
 */
static int tomoyo_audit_path_log(struct tomoyo_request_info *r)
{
        const char *keyword = tomoyo_path_keyword[r->param.path.operation];
        const char *pathname = r->param.path.filename->name;

        return tomoyo_supervisor(r, "file %s %s\n", keyword, pathname);
}

/**
 * tomoyo_audit_path2_log - Audit a two-path request.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Formats the request as "file <keyword> <pathname1> <pathname2>" and hands
 * it to tomoyo_supervisor(). The keyword is looked up in
 * tomoyo_mac_keywords[] after translating the operation through
 * tomoyo_pp2mac[].
 *
 * Returns the value of tomoyo_supervisor(): 0 on success, negative value
 * otherwise.
 */
static int tomoyo_audit_path2_log(struct tomoyo_request_info *r)
{
        const char *keyword =
                tomoyo_mac_keywords[tomoyo_pp2mac[r->param.path2.operation]];
        const char *first = r->param.path2.filename1->name;
        const char *second = r->param.path2.filename2->name;

        return tomoyo_supervisor(r, "file %s %s %s\n", keyword, first, second);
}

/**
 * tomoyo_audit_mkdev_log - Audit a device-node creation request.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Formats the request as "file <keyword> <pathname> 0<mode> <major> <minor>"
 * (mode in octal, major/minor in decimal) and hands it to
 * tomoyo_supervisor(). The keyword is looked up in tomoyo_mac_keywords[]
 * after translating the operation through tomoyo_pnnn2mac[].
 *
 * Returns the value of tomoyo_supervisor(): 0 on success, negative value
 * otherwise.
 */
static int tomoyo_audit_mkdev_log(struct tomoyo_request_info *r)
{
        const char *keyword =
                tomoyo_mac_keywords[tomoyo_pnnn2mac[r->param.mkdev.operation]];

        return tomoyo_supervisor(r, "file %s %s 0%o %u %u\n", keyword,
                                 r->param.mkdev.filename->name,
                                 r->param.mkdev.mode,
                                 r->param.mkdev.major,
                                 r->param.mkdev.minor);
}

/**
 * tomoyo_audit_path_number_log - Audit a path-plus-number request.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Formats the request as "file <keyword> <pathname> <number>" and hands it to
 * tomoyo_supervisor(). The radix used to print the number depends on the
 * operation: octal for create/mkdir/mkfifo/mksock/chmod, hexadecimal for
 * ioctl, and decimal for everything else.
 *
 * Returns the value of tomoyo_supervisor(): 0 on success, negative value
 * otherwise.
 */
static int tomoyo_audit_path_number_log(struct tomoyo_request_info *r)
{
        const u8 op = r->param.path_number.operation;
        char text[64];
        u8 radix = TOMOYO_VALUE_TYPE_DECIMAL;

        if (op == TOMOYO_TYPE_IOCTL)
                radix = TOMOYO_VALUE_TYPE_HEXADECIMAL;
        else if (op == TOMOYO_TYPE_CREATE || op == TOMOYO_TYPE_MKDIR ||
                 op == TOMOYO_TYPE_MKFIFO || op == TOMOYO_TYPE_MKSOCK ||
                 op == TOMOYO_TYPE_CHMOD)
                radix = TOMOYO_VALUE_TYPE_OCTAL;

        tomoyo_print_ulong(text, sizeof(text), r->param.path_number.number,
                           radix);
        return tomoyo_supervisor(r, "file %s %s %s\n",
                                 tomoyo_mac_keywords[tomoyo_pn2mac[op]],
                                 r->param.path_number.filename->name, text);
}

/**
 * tomoyo_check_path_acl - Check one ACL entry against a single-path request.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @ptr: Pointer to "struct tomoyo_acl_info" embedded in a
 *       "struct tomoyo_path_acl".
 *
 * The entry grants the request when its permission mask has the bit for the
 * requested operation and its name union matches the requested pathname.
 *
 * To be able to use wildcard for domain transition, this function records the
 * matching entry in @r->param.path.matched_path whenever the permission bit
 * is set (NULL is recorded if the name does not match). Since the caller
 * holds tomoyo_read_lock(), it is safe to set matching entry.
 *
 * Returns true if granted, false otherwise.
 */
static bool tomoyo_check_path_acl(struct tomoyo_request_info *r,
                                  const struct tomoyo_acl_info *ptr)
{
        const struct tomoyo_path_acl *acl = container_of(ptr, typeof(*acl),
                                                         head);
        const struct tomoyo_path_info *hit;

        /* Leave matched_path untouched when the operation bit is absent. */
        if (!(acl->perm & (1 << r->param.path.operation)))
                return false;

        hit = tomoyo_compare_name_union(r->param.path.filename, &acl->name);
        r->param.path.matched_path = hit;
        return hit != NULL;
}

/**
 * tomoyo_check_path_number_acl - Check permission for path number operation.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @ptr: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if granted, false otherwise.
 */
static bool tomoyo_check_path_number_acl(struct tomoyo_request_info *r,
                                         const struct tomoyo_acl_info *ptr)
{
        const struct tomoyo_path_number_acl *acl =
                container_of(ptr, typeof(*acl), head);

        return (acl->perm & (1 << r->param.path_number.operation)) &&
                tomoyo_compare_number_union(r->param.path_number.number,
                                            &acl->number) &&
                tomoyo_compare_name_union(r->param.path_number.filename,
                                          &acl->name);
}

/**
 * tomoyo_check_path2_acl - Check permission for path path operation.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @ptr: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if granted, false otherwise.
 */
static bool tomoyo_check_path2_acl(struct tomoyo_request_info *r,
                                   const struct tomoyo_acl_info *ptr)
{
        const struct tomoyo_path2_acl *acl =
                container_of(ptr, typeof(*acl), head);

        return (acl->perm & (1 << r->param.path2.operation)) &&
                tomoyo_compare_name_union(r->param.path2.filename1, &acl->name1)
                && tomoyo_compare_name_union(r->param.path2.filename2,
                                             &acl->name2);
}

/**
 * tomoyo_check_mkdev_acl - Check permission for path number number number operation.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @ptr: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if granted, false otherwise.
 */
static bool tomoyo_check_mkdev_acl(struct tomoyo_request_info *r,
                                   const struct tomoyo_acl_info *ptr)
{
        const struct tomoyo_mkdev_acl *acl =
                container_of(ptr, typeof(*acl), head);

        return (acl->perm & (1 << r->param.mkdev.operation)) &&
                tomoyo_compare_number_union(r->param.mkdev.mode,
                                            &acl->mode) &&
                tomoyo_compare_number_union(r->param.mkdev.major,
                                            &acl->major) &&
                tomoyo_compare_number_union(r->param.mkdev.minor,
                                            &acl->minor) &&
                tomoyo_compare_name_union(r->param.mkdev.filename,
                                          &acl->name);
}

/**
 * tomoyo_same_path_acl - Check for duplicated "struct tomoyo_path_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Only the name unions are compared; the permission masks are ignored so
 * that entries for the same name can be merged.
 *
 * Returns true if @a == @b except permission bits, false otherwise.
 */
static bool tomoyo_same_path_acl(const struct tomoyo_acl_info *a,
                                 const struct tomoyo_acl_info *b)
{
        const struct tomoyo_path_acl *lhs = container_of(a, typeof(*lhs), head);
        const struct tomoyo_path_acl *rhs = container_of(b, typeof(*rhs), head);

        return tomoyo_same_name_union(&lhs->name, &rhs->name);
}

/**
 * tomoyo_merge_path_acl - Merge duplicated "struct tomoyo_path_acl" entry.
 *
 * @a:         Pointer to "struct tomoyo_acl_info".
 * @b:         Pointer to "struct tomoyo_acl_info".
 * @is_delete: True for @a &= ~@b, false for @a |= @b.
 *
 * Returns true if @a is empty, false otherwise.
 */
static bool tomoyo_merge_path_acl(struct tomoyo_acl_info *a,
                                  struct tomoyo_acl_info *b,
                                  const bool is_delete)
{
        u16 * const a_perm = &container_of(a, struct tomoyo_path_acl, head)
                ->perm;
        u16 perm = READ_ONCE(*a_perm);
        const u16 b_perm = container_of(b, struct tomoyo_path_acl, head)->perm;

        if (is_delete)
                perm &= ~b_perm;
        else
                perm |= b_perm;
        WRITE_ONCE(*a_perm, perm);
        return !perm;
}

/**
 * tomoyo_update_path_acl - Update "struct tomoyo_path_acl" list.
 *
 * @perm:  Permission.
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_update_path_acl(const u16 perm,
                                  struct tomoyo_acl_param *param)
{
        struct tomoyo_path_acl e = {
                .head.type = TOMOYO_TYPE_PATH_ACL,
                .perm = perm
        };
        int error;

        if (!tomoyo_parse_name_union(param, &e.name))
                error = -EINVAL;
        else
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_path_acl,
                                             tomoyo_merge_path_acl);
        tomoyo_put_name_union(&e.name);
        return error;
}

/**
 * tomoyo_same_mkdev_acl - Check for duplicated "struct tomoyo_mkdev_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b except permission bits, false otherwise.
 */
static bool tomoyo_same_mkdev_acl(const struct tomoyo_acl_info *a,
                                         const struct tomoyo_acl_info *b)
{
        const struct tomoyo_mkdev_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_mkdev_acl *p2 = container_of(b, typeof(*p2), head);

        return tomoyo_same_name_union(&p1->name, &p2->name) &&
                tomoyo_same_number_union(&p1->mode, &p2->mode) &&
                tomoyo_same_number_union(&p1->major, &p2->major) &&
                tomoyo_same_number_union(&p1->minor, &p2->minor);
}

/**
 * tomoyo_merge_mkdev_acl - Merge duplicated "struct tomoyo_mkdev_acl" entry.
 *
 * @a:         Pointer to "struct tomoyo_acl_info".
 * @b:         Pointer to "struct tomoyo_acl_info".
 * @is_delete: True for @a &= ~@b, false for @a |= @b.
 *
 * Returns true if @a is empty, false otherwise.
 */
static bool tomoyo_merge_mkdev_acl(struct tomoyo_acl_info *a,
                                   struct tomoyo_acl_info *b,
                                   const bool is_delete)
{
        u8 *const a_perm = &container_of(a, struct tomoyo_mkdev_acl,
                                         head)->perm;
        u8 perm = READ_ONCE(*a_perm);
        const u8 b_perm = container_of(b, struct tomoyo_mkdev_acl, head)
                ->perm;

        if (is_delete)
                perm &= ~b_perm;
        else
                perm |= b_perm;
        WRITE_ONCE(*a_perm, perm);
        return !perm;
}

/**
 * tomoyo_update_mkdev_acl - Update "struct tomoyo_mkdev_acl" list.
 *
 * @perm:  Permission.
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_update_mkdev_acl(const u8 perm,
                                   struct tomoyo_acl_param *param)
{
        struct tomoyo_mkdev_acl e = {
                .head.type = TOMOYO_TYPE_MKDEV_ACL,
                .perm = perm
        };
        int error;

        if (!tomoyo_parse_name_union(param, &e.name) ||
            !tomoyo_parse_number_union(param, &e.mode) ||
            !tomoyo_parse_number_union(param, &e.major) ||
            !tomoyo_parse_number_union(param, &e.minor))
                error = -EINVAL;
        else
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_mkdev_acl,
                                             tomoyo_merge_mkdev_acl);
        tomoyo_put_name_union(&e.name);
        tomoyo_put_number_union(&e.mode);
        tomoyo_put_number_union(&e.major);
        tomoyo_put_number_union(&e.minor);
        return error;
}

/**
 * tomoyo_same_path2_acl - Check for duplicated "struct tomoyo_path2_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b except permission bits, false otherwise.
 */
static bool tomoyo_same_path2_acl(const struct tomoyo_acl_info *a,
                                  const struct tomoyo_acl_info *b)
{
        const struct tomoyo_path2_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_path2_acl *p2 = container_of(b, typeof(*p2), head);

        return tomoyo_same_name_union(&p1->name1, &p2->name1) &&
                tomoyo_same_name_union(&p1->name2, &p2->name2);
}

/**
 * tomoyo_merge_path2_acl - Merge duplicated "struct tomoyo_path2_acl" entry.
 *
 * @a:         Pointer to "struct tomoyo_acl_info".
 * @b:         Pointer to "struct tomoyo_acl_info".
 * @is_delete: True for @a &= ~@b, false for @a |= @b.
 *
 * Returns true if @a is empty, false otherwise.
 */
static bool tomoyo_merge_path2_acl(struct tomoyo_acl_info *a,
                                   struct tomoyo_acl_info *b,
                                   const bool is_delete)
{
        u8 * const a_perm = &container_of(a, struct tomoyo_path2_acl, head)
                ->perm;
        u8 perm = READ_ONCE(*a_perm);
        const u8 b_perm = container_of(b, struct tomoyo_path2_acl, head)->perm;

        if (is_delete)
                perm &= ~b_perm;
        else
                perm |= b_perm;
        WRITE_ONCE(*a_perm, perm);
        return !perm;
}

/**
 * tomoyo_update_path2_acl - Update "struct tomoyo_path2_acl" list.
 *
 * @perm:  Permission.
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_update_path2_acl(const u8 perm,
                                   struct tomoyo_acl_param *param)
{
        struct tomoyo_path2_acl e = {
                .head.type = TOMOYO_TYPE_PATH2_ACL,
                .perm = perm
        };
        int error;

        if (!tomoyo_parse_name_union(param, &e.name1) ||
            !tomoyo_parse_name_union(param, &e.name2))
                error = -EINVAL;
        else
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_path2_acl,
                                             tomoyo_merge_path2_acl);
        tomoyo_put_name_union(&e.name1);
        tomoyo_put_name_union(&e.name2);
        return error;
}

/**
 * tomoyo_path_permission - Check permission for single path operation.
 *
 * @r:         Pointer to "struct tomoyo_request_info".
 * @operation: Type of operation.
 * @filename:  Filename to check.
 *
 * Overwrites @r->type and @r->mode with the values for @operation. If that
 * mode is TOMOYO_CONFIG_DISABLED, nothing else is done. Otherwise the ACL
 * check and audit step are repeated for as long as the audit step answers
 * TOMOYO_RETRY_REQUEST.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_path_permission(struct tomoyo_request_info *r, u8 operation,
                                  const struct tomoyo_path_info *filename)
{
        r->type = tomoyo_p2mac[operation];
        r->mode = tomoyo_get_mode(r->domain->ns, r->profile, r->type);
        if (r->mode == TOMOYO_CONFIG_DISABLED)
                return 0;

        r->param_type = TOMOYO_TYPE_PATH_ACL;
        r->param.path.filename = filename;
        r->param.path.operation = operation;
        for (;;) {
                int error;

                tomoyo_check_acl(r, tomoyo_check_path_acl);
                error = tomoyo_audit_path_log(r);
                if (error != TOMOYO_RETRY_REQUEST)
                        return error;
        }
}

/**
 * tomoyo_execute_permission - Check permission for execute operation.
 *
 * @r:         Pointer to "struct tomoyo_request_info".
 * @filename:  Filename to check.
 *
 * Besides the permission check, this records the domain transition preference
 * of the matched ACL entry (or NULL if there is none) in
 * @r->ee->transition. No retry loop is performed here; the result of the
 * audit step is returned as is.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_execute_permission(struct tomoyo_request_info *r,
                              const struct tomoyo_path_info *filename)
{
        r->type = TOMOYO_MAC_FILE_EXECUTE;
        r->mode = tomoyo_get_mode(r->domain->ns, r->profile, r->type);
        r->param_type = TOMOYO_TYPE_PATH_ACL;
        r->param.path.filename = filename;
        r->param.path.operation = TOMOYO_TYPE_EXECUTE;
        /*
         * Unlike other permission checks, the ACL lookup is done regardless
         * of profile mode settings in order to check for domain transition
         * preference.
         */
        tomoyo_check_acl(r, tomoyo_check_path_acl);
        if (r->matched_acl && r->matched_acl->cond)
                r->ee->transition = r->matched_acl->cond->transit;
        else
                r->ee->transition = NULL;
        /* Auditing, unlike the lookup above, honours the profile mode. */
        if (r->mode == TOMOYO_CONFIG_DISABLED)
                return 0;
        return tomoyo_audit_path_log(r);
}

/**
 * tomoyo_same_path_number_acl - Check for duplicated "struct tomoyo_path_number_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b except permission bits, false otherwise.
 */
static bool tomoyo_same_path_number_acl(const struct tomoyo_acl_info *a,
                                        const struct tomoyo_acl_info *b)
{
        const struct tomoyo_path_number_acl *p1 = container_of(a, typeof(*p1),
                                                               head);
        const struct tomoyo_path_number_acl *p2 = container_of(b, typeof(*p2),
                                                               head);

        return tomoyo_same_name_union(&p1->name, &p2->name) &&
                tomoyo_same_number_union(&p1->number, &p2->number);
}

/**
 * tomoyo_merge_path_number_acl - Merge duplicated "struct tomoyo_path_number_acl" entry.
 *
 * @a:         Pointer to "struct tomoyo_acl_info".
 * @b:         Pointer to "struct tomoyo_acl_info".
 * @is_delete: True for @a &= ~@b, false for @a |= @b.
 *
 * Returns true if @a is empty, false otherwise.
 */
static bool tomoyo_merge_path_number_acl(struct tomoyo_acl_info *a,
                                         struct tomoyo_acl_info *b,
                                         const bool is_delete)
{
        u8 * const a_perm = &container_of(a, struct tomoyo_path_number_acl,
                                          head)->perm;
        u8 perm = READ_ONCE(*a_perm);
        const u8 b_perm = container_of(b, struct tomoyo_path_number_acl, head)
                ->perm;

        if (is_delete)
                perm &= ~b_perm;
        else
                perm |= b_perm;
        WRITE_ONCE(*a_perm, perm);
        return !perm;
}

/**
 * tomoyo_update_path_number_acl - Update ioctl/chmod/chown/chgrp ACL.
 *
 * @perm:  Permission.
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_update_path_number_acl(const u8 perm,
                                         struct tomoyo_acl_param *param)
{
        struct tomoyo_path_number_acl e = {
                .head.type = TOMOYO_TYPE_PATH_NUMBER_ACL,
                .perm = perm
        };
        int error;

        if (!tomoyo_parse_name_union(param, &e.name) ||
            !tomoyo_parse_number_union(param, &e.number))
                error = -EINVAL;
        else
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_path_number_acl,
                                             tomoyo_merge_path_number_acl);
        tomoyo_put_name_union(&e.name);
        tomoyo_put_number_union(&e.number);
        return error;
}

/**
 * tomoyo_path_number_perm - Check permission for "create", "mkdir", "mkfifo", "mksock", "ioctl", "chmod", "chown", "chgrp".
 *
 * @type:   Type of operation.
 * @path:   Pointer to "struct path".
 * @number: Number.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_path_number_perm(const u8 type, const struct path *path,
                            unsigned long number)
{
        struct tomoyo_request_info r;
        struct tomoyo_obj_info obj = {
                .path1 = { .mnt = path->mnt, .dentry = path->dentry },
        };
        int error = -ENOMEM;
        struct tomoyo_path_info buf;
        int idx;

        if (tomoyo_init_request_info(&r, NULL, tomoyo_pn2mac[type])
            == TOMOYO_CONFIG_DISABLED)
                return 0;
        idx = tomoyo_read_lock();
        if (!tomoyo_get_realpath(&buf, path))
                goto out;
        r.obj = &obj;
        if (type == TOMOYO_TYPE_MKDIR)
                tomoyo_add_slash(&buf);
        r.param_type = TOMOYO_TYPE_PATH_NUMBER_ACL;
        r.param.path_number.operation = type;
        r.param.path_number.filename = &buf;
        r.param.path_number.number = number;
        do {
                tomoyo_check_acl(&r, tomoyo_check_path_number_acl);
                error = tomoyo_audit_path_number_log(&r);
        } while (error == TOMOYO_RETRY_REQUEST);
        kfree(buf.name);
 out:
        tomoyo_read_unlock(idx);
        if (r.mode != TOMOYO_CONFIG_ENFORCING)
                error = 0;
        return error;
}

/**
 * tomoyo_check_open_permission - Check permission for "read" and "write".
 *
 * @domain: Pointer to "struct tomoyo_domain_info".
 * @path:   Pointer to "struct path".
 * @flag:   Flags for open().
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_check_open_permission(struct tomoyo_domain_info *domain,
                                 const struct path *path, const int flag)
{
        const u8 acc_mode = ACC_MODE(flag);
        int error = 0;
        struct tomoyo_path_info buf;
        struct tomoyo_request_info r;
        struct tomoyo_obj_info obj = {
                .path1 = { .mnt = path->mnt, .dentry = path->dentry },
        };
        int idx;

        buf.name = NULL;
        r.mode = TOMOYO_CONFIG_DISABLED;
        idx = tomoyo_read_lock();
        if (acc_mode &&
            tomoyo_init_request_info(&r, domain, TOMOYO_MAC_FILE_OPEN)
            != TOMOYO_CONFIG_DISABLED) {
                if (!tomoyo_get_realpath(&buf, path)) {
                        error = -ENOMEM;
                        goto out;
                }
                r.obj = &obj;
                if (acc_mode & MAY_READ)
                        error = tomoyo_path_permission(&r, TOMOYO_TYPE_READ,
                                                       &buf);
                if (!error && (acc_mode & MAY_WRITE))
                        error = tomoyo_path_permission(&r, (flag & O_APPEND) ?
                                                       TOMOYO_TYPE_APPEND :
                                                       TOMOYO_TYPE_WRITE,
                                                       &buf);
        }
 out:
        kfree(buf.name);
        tomoyo_read_unlock(idx);
        if (r.mode != TOMOYO_CONFIG_ENFORCING)
                error = 0;
        return error;
}

/**
 * tomoyo_path_perm - Check permission for "unlink", "rmdir", "truncate", "symlink", "append", "chroot" and "unmount".
 *
 * @operation: Type of operation.
 * @path:      Pointer to "struct path".
 * @target:    Symlink's target if @operation is TOMOYO_TYPE_SYMLINK,
 *             NULL otherwise.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_path_perm(const u8 operation, const struct path *path, const char *target)
{
        struct tomoyo_request_info r;
        struct tomoyo_obj_info obj = {
                .path1 = { .mnt = path->mnt, .dentry = path->dentry },
        };
        int error;
        struct tomoyo_path_info buf;
        bool is_enforce;
        struct tomoyo_path_info symlink_target;
        int idx;

        if (tomoyo_init_request_info(&r, NULL, tomoyo_p2mac[operation])
            == TOMOYO_CONFIG_DISABLED)
                return 0;
        is_enforce = (r.mode == TOMOYO_CONFIG_ENFORCING);
        error = -ENOMEM;
        buf.name = NULL;
        idx = tomoyo_read_lock();
        if (!tomoyo_get_realpath(&buf, path))
                goto out;
        r.obj = &obj;
        switch (operation) {
        case TOMOYO_TYPE_RMDIR:
        case TOMOYO_TYPE_CHROOT:
                tomoyo_add_slash(&buf);
                break;
        case TOMOYO_TYPE_SYMLINK:
                symlink_target.name = tomoyo_encode(target);
                if (!symlink_target.name)
                        goto out;
                tomoyo_fill_path_info(&symlink_target);
                obj.symlink_target = &symlink_target;
                break;
        }
        error = tomoyo_path_permission(&r, operation, &buf);
        if (operation == TOMOYO_TYPE_SYMLINK)
                kfree(symlink_target.name);
 out:
        kfree(buf.name);
        tomoyo_read_unlock(idx);
        if (!is_enforce)
                error = 0;
        return error;
}

/**
 * tomoyo_mkdev_perm - Check permission for "mkblock" and "mkchar".
 *
 * @operation: Type of operation. (TOMOYO_TYPE_MKCHAR or TOMOYO_TYPE_MKBLOCK)
 * @path:      Pointer to "struct path".
 * @mode:      Create mode.
 * @dev:       Device number.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_mkdev_perm(const u8 operation, const struct path *path,
                      const unsigned int mode, unsigned int dev)
{
        struct tomoyo_request_info r;
        struct tomoyo_obj_info obj = {
                .path1 = { .mnt = path->mnt, .dentry = path->dentry },
        };
        int error = -ENOMEM;
        struct tomoyo_path_info buf;
        int idx;

        if (tomoyo_init_request_info(&r, NULL, tomoyo_pnnn2mac[operation])
            == TOMOYO_CONFIG_DISABLED)
                return 0;
        idx = tomoyo_read_lock();
        error = -ENOMEM;
        if (tomoyo_get_realpath(&buf, path)) {
                r.obj = &obj;
                dev = new_decode_dev(dev);
                r.param_type = TOMOYO_TYPE_MKDEV_ACL;
                r.param.mkdev.filename = &buf;
                r.param.mkdev.operation = operation;
                r.param.mkdev.mode = mode;
                r.param.mkdev.major = MAJOR(dev);
                r.param.mkdev.minor = MINOR(dev);
                tomoyo_check_acl(&r, tomoyo_check_mkdev_acl);
                error = tomoyo_audit_mkdev_log(&r);
                kfree(buf.name);
        }
        tomoyo_read_unlock(idx);
        if (r.mode != TOMOYO_CONFIG_ENFORCING)
                error = 0;
        return error;
}

/**
 * tomoyo_path2_perm - Check permission for "rename", "link" and "pivot_root".
 *
 * @operation: Type of operation.
 * @path1:      Pointer to "struct path".
 * @path2:      Pointer to "struct path".
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_path2_perm(const u8 operation, const struct path *path1,
                      const struct path *path2)
{
        int error = -ENOMEM;
        struct tomoyo_path_info buf1;
        struct tomoyo_path_info buf2;
        struct tomoyo_request_info r;
        struct tomoyo_obj_info obj = {
                .path1 = { .mnt = path1->mnt, .dentry = path1->dentry },
                .path2 = { .mnt = path2->mnt, .dentry = path2->dentry }
        };
        int idx;

        if (tomoyo_init_request_info(&r, NULL, tomoyo_pp2mac[operation])
            == TOMOYO_CONFIG_DISABLED)
                return 0;
        buf1.name = NULL;
        buf2.name = NULL;
        idx = tomoyo_read_lock();
        if (!tomoyo_get_realpath(&buf1, path1) ||
            !tomoyo_get_realpath(&buf2, path2))
                goto out;
        switch (operation) {
        case TOMOYO_TYPE_RENAME:
        case TOMOYO_TYPE_LINK:
                if (!d_is_dir(path1->dentry))
                        break;
                fallthrough;
        case TOMOYO_TYPE_PIVOT_ROOT:
                tomoyo_add_slash(&buf1);
                tomoyo_add_slash(&buf2);
                break;
        }
        r.obj = &obj;
        r.param_type = TOMOYO_TYPE_PATH2_ACL;
        r.param.path2.operation = operation;
        r.param.path2.filename1 = &buf1;
        r.param.path2.filename2 = &buf2;
        do {
                tomoyo_check_acl(&r, tomoyo_check_path2_acl);
                error = tomoyo_audit_path2_log(&r);
        } while (error == TOMOYO_RETRY_REQUEST);
 out:
        kfree(buf1.name);
        kfree(buf2.name);
        tomoyo_read_unlock(idx);
        if (r.mode != TOMOYO_CONFIG_ENFORCING)
                error = 0;
        return error;
}

/**
 * tomoyo_same_mount_acl - Compare two "struct tomoyo_mount_acl" entries.
 *
 * @a: Pointer to "struct tomoyo_acl_info" embedded in the first entry.
 * @b: Pointer to "struct tomoyo_acl_info" embedded in the second entry.
 *
 * Returns true when the device name, directory name, filesystem type and
 * flags of both entries all compare equal, false otherwise.
 */
static bool tomoyo_same_mount_acl(const struct tomoyo_acl_info *a,
                                  const struct tomoyo_acl_info *b)
{
        const struct tomoyo_mount_acl *lhs = container_of(a, typeof(*lhs), head);
        const struct tomoyo_mount_acl *rhs = container_of(b, typeof(*rhs), head);

        /* Bail out on the first field that differs. */
        if (!tomoyo_same_name_union(&lhs->dev_name, &rhs->dev_name))
                return false;
        if (!tomoyo_same_name_union(&lhs->dir_name, &rhs->dir_name))
                return false;
        if (!tomoyo_same_name_union(&lhs->fs_type, &rhs->fs_type))
                return false;

        return tomoyo_same_number_union(&lhs->flags, &rhs->flags);
}

/**
 * tomoyo_update_mount_acl - Write "struct tomoyo_mount_acl" list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_update_mount_acl(struct tomoyo_acl_param *param)
{
        struct tomoyo_mount_acl e = { .head.type = TOMOYO_TYPE_MOUNT_ACL };
        int error;

        if (!tomoyo_parse_name_union(param, &e.dev_name) ||
            !tomoyo_parse_name_union(param, &e.dir_name) ||
            !tomoyo_parse_name_union(param, &e.fs_type) ||
            !tomoyo_parse_number_union(param, &e.flags))
                error = -EINVAL;
        else
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_mount_acl, NULL);
        tomoyo_put_name_union(&e.dev_name);
        tomoyo_put_name_union(&e.dir_name);
        tomoyo_put_name_union(&e.fs_type);
        tomoyo_put_number_union(&e.flags);
        return error;
}

/**
 * tomoyo_write_file - Update file related list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Reads one token from @param and matches it against each keyword table
 * in turn.  The first table that yields at least one match decides which
 * ACL update routine receives the accumulated permission bits.
 *
 * Returns 0 on success, negative value otherwise (-EINVAL when the token
 * matches no known keyword).
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_write_file(struct tomoyo_acl_param *param)
{
        const char *keyword = tomoyo_read_token(param);
        u16 mask = 0;
        u8 i;

        /* Keywords listed in tomoyo_path_keyword[]. */
        for (i = 0; i < TOMOYO_MAX_PATH_OPERATION; i++) {
                if (tomoyo_permstr(keyword, tomoyo_path_keyword[i]))
                        mask |= 1 << i;
        }
        if (mask)
                return tomoyo_update_path_acl(mask, param);

        /* Keywords reached through tomoyo_pp2mac[]. */
        for (i = 0; i < TOMOYO_MAX_PATH2_OPERATION; i++) {
                if (tomoyo_permstr(keyword,
                                   tomoyo_mac_keywords[tomoyo_pp2mac[i]]))
                        mask |= 1 << i;
        }
        if (mask)
                return tomoyo_update_path2_acl(mask, param);

        /* Keywords reached through tomoyo_pn2mac[]. */
        for (i = 0; i < TOMOYO_MAX_PATH_NUMBER_OPERATION; i++) {
                if (tomoyo_permstr(keyword,
                                   tomoyo_mac_keywords[tomoyo_pn2mac[i]]))
                        mask |= 1 << i;
        }
        if (mask)
                return tomoyo_update_path_number_acl(mask, param);

        /* Keywords reached through tomoyo_pnnn2mac[]. */
        for (i = 0; i < TOMOYO_MAX_MKDEV_OPERATION; i++) {
                if (tomoyo_permstr(keyword,
                                   tomoyo_mac_keywords[tomoyo_pnnn2mac[i]]))
                        mask |= 1 << i;
        }
        if (mask)
                return tomoyo_update_mkdev_acl(mask, param);

        /* The mount keyword is the only remaining possibility. */
        if (!tomoyo_permstr(keyword,
                            tomoyo_mac_keywords[TOMOYO_MAC_FILE_MOUNT]))
                return -EINVAL;

        return tomoyo_update_mount_acl(param);
}





























































































































































































































































    1 




    4 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MMU_CONTEXT_H
#define _ASM_X86_MMU_CONTEXT_H

#include <asm/desc.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/pkeys.h>

#include <trace/events/tlb.h>

#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>
#include <asm/gsseg.h>

extern atomic64_t last_mm_ctx_id;

#ifdef CONFIG_PERF_EVENTS
DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
void cr4_update_pce(void *ignored);
#endif

#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
 * ldt_structs can be allocated, used, and freed, but they are never
 * modified while live.
 */
struct ldt_struct {
        /*
         * Xen requires page-aligned LDTs with special permissions.  This is
         * needed to prevent us from installing evil descriptors such as
         * call gates.  On native, we could merge the ldt_struct and LDT
         * allocations, but it's not worth trying to optimize.
         */
        struct desc_struct        *entries;
        /* Number of descriptors in the entries array. */
        unsigned int                nr_entries;

        /*
         * If PTI is in use, then the entries array is not mapped while we're
         * in user mode.  The whole array will be aliased at the address
         * given by ldt_slot_va(slot).  We use two slots so that we can allocate
         * and map, and enable a new LDT without invalidating the mapping
         * of an older, still-in-use LDT.
         *
         * slot will be -1 if this LDT doesn't have an alias mapping.
         */
        int                        slot;
};

/*
 * Prepare the LDT part of a new mm's context: initialise ldt_usr_sem and
 * start out with no LDT attached.
 */
static inline void init_new_context_ldt(struct mm_struct *mm)
{
        init_rwsem(&mm->context.ldt_usr_sem);
        mm->context.ldt = NULL;
}
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
void ldt_arch_exit_mmap(struct mm_struct *mm);
#else        /* CONFIG_MODIFY_LDT_SYSCALL */
/* No LDT state exists in this configuration, so there is nothing to set up. */
static inline void init_new_context_ldt(struct mm_struct *mm)
{
}

/* Nothing to duplicate; always reports success (0). */
static inline int ldt_dup_context(struct mm_struct *oldmm,
                                  struct mm_struct *mm)
{
        return 0;
}

/* Nothing to tear down. */
static inline void destroy_context_ldt(struct mm_struct *mm)
{
}

/* Nothing to do at exit_mmap time. */
static inline void ldt_arch_exit_mmap(struct mm_struct *mm)
{
}
#endif

#ifdef CONFIG_MODIFY_LDT_SYSCALL
extern void load_mm_ldt(struct mm_struct *mm);
extern void switch_ldt(struct mm_struct *prev, struct mm_struct *next);
#else
/*
 * Fallback used when CONFIG_MODIFY_LDT_SYSCALL is not set: @mm is ignored
 * and clear_LDT() is called instead.
 */
static inline void load_mm_ldt(struct mm_struct *mm)
{
        clear_LDT();
}
/*
 * Fallback used when CONFIG_MODIFY_LDT_SYSCALL is not set: nothing is
 * switched; the only action is a debug warning if the caller is preemptible.
 */
static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
{
        DEBUG_LOCKS_WARN_ON(preemptible());
}
#endif

#ifdef CONFIG_ADDRESS_MASKING
/* Return the lam_cr3_mask value stored in @mm's context. */
static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
{
        unsigned long cr3_mask = mm->context.lam_cr3_mask;

        return cr3_mask;
}

/* Copy the untag mask and the LAM CR3 mask from @oldmm's context to @mm's. */
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
{
        mm->context.untag_mask = oldmm->context.untag_mask;
        mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
}

/* Return the untag_mask value stored in @mm's context. */
#define mm_untag_mask mm_untag_mask
static inline unsigned long mm_untag_mask(struct mm_struct *mm)
{
        unsigned long untag = mm->context.untag_mask;

        return untag;
}

/* Reset @mm's untag mask to all ones. */
static inline void mm_reset_untag_mask(struct mm_struct *mm)
{
        mm->context.untag_mask = ~0UL;
}

/*
 * Returns true when @mm has no LAM CR3 mask set, or when
 * MM_CONTEXT_FORCE_TAGGED_SVA is set in its context flags.
 */
#define arch_pgtable_dma_compat arch_pgtable_dma_compat
static inline bool arch_pgtable_dma_compat(struct mm_struct *mm)
{
        if (!mm_lam_cr3_mask(mm))
                return true;

        return test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags);
}
#else

/* Without CONFIG_ADDRESS_MASKING the LAM CR3 mask is always 0. */
static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
{
        return 0;
}

/* Without CONFIG_ADDRESS_MASKING there are no masks to copy or reset. */
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) { }
static inline void mm_reset_untag_mask(struct mm_struct *mm) { }
#endif

#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);

/*
 * Init a new mm.  Used on mm copies, like at fork()
 * and on mm's that are brand-new, like at execve().
 *
 * @tsk is not used here.  Always returns 0.
 */
#define init_new_context init_new_context
static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
{
        mutex_init(&mm->context.lock);

        /* Take the next value of the global last_mm_ctx_id counter. */
        mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
        atomic64_set(&mm->context.tlb_gen, 0);

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                /* pkey 0 is the default and allocated implicitly */
                mm->context.pkey_allocation_map = 0x1;
                /* -1 means unallocated or invalid */
                mm->context.execute_only_pkey = -1;
        }
#endif
        /* Both helpers compile to no-ops when their config option is off. */
        mm_reset_untag_mask(mm);
        init_new_context_ldt(mm);
        return 0;
}

/* Tear down the arch context of @mm; only the LDT part needs work here. */
#define destroy_context destroy_context
static inline void destroy_context(struct mm_struct *mm)
{
        destroy_context_ldt(mm);
}

extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
                      struct task_struct *tsk);

extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                               struct task_struct *tsk);
#define switch_mm_irqs_off switch_mm_irqs_off

/*
 * activate_mm - notify paravirt about @next, then switch from @prev to @next
 * via switch_mm() with a NULL task.
 *
 * The expansion deliberately has no trailing semicolon: the caller supplies
 * it, so the macro behaves like a single statement and can be used as the
 * unbraced body of an if/else.
 */
#define activate_mm(prev, next)                        \
do {                                                \
        paravirt_enter_mmap(next);                \
        switch_mm((prev), (next), NULL);        \
} while (0)

/*
 * deactivate_mm - reset segment state when an mm is deactivated.
 *
 * 32-bit: loads 0 into the gs segment; neither @tsk nor @mm is used.
 * 64-bit: calls shstk_free(@tsk), then loads 0 into the gs index and the
 * fs segment; @mm is not used.
 */
#ifdef CONFIG_X86_32
#define deactivate_mm(tsk, mm)                        \
do {                                                \
        loadsegment(gs, 0);                        \
} while (0)
#else
#define deactivate_mm(tsk, mm)                        \
do {                                                \
        shstk_free(tsk);                        \
        load_gs_index(0);                        \
        loadsegment(fs, 0);                        \
} while (0)
#endif

/*
 * Copy the protection-key bookkeeping from @oldmm to @mm.  Does nothing
 * unless pkeys support is built in and X86_FEATURE_OSPKE is enabled.
 */
static inline void arch_dup_pkeys(struct mm_struct *oldmm,
                                  struct mm_struct *mm)
{
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                /* Duplicate the oldmm pkey state in mm: */
                mm->context.pkey_allocation_map =
                        oldmm->context.pkey_allocation_map;
                mm->context.execute_only_pkey =
                        oldmm->context.execute_only_pkey;
        }
#endif
}

/*
 * Duplicate the arch-specific mm state of @oldmm into @mm: pkeys, the
 * paravirt notification, LAM masks and finally the LDT.
 *
 * Returns whatever ldt_dup_context() returns.
 */
static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
        int ret;

        arch_dup_pkeys(oldmm, mm);
        paravirt_enter_mmap(mm);
        dup_lam(oldmm, mm);

        ret = ldt_dup_context(oldmm, mm);
        return ret;
}

/* Run the paravirt and then the LDT exit_mmap hooks for @mm. */
static inline void arch_exit_mmap(struct mm_struct *mm)
{
        paravirt_arch_exit_mmap(mm);
        ldt_arch_exit_mmap(mm);
}

#ifdef CONFIG_X86_64
/*
 * On 64-bit kernels an mm counts as 64-bit unless IA32 emulation is built
 * in and MM_CONTEXT_UPROBE_IA32 is set in its context flags.
 */
static inline bool is_64bit_mm(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_IA32_EMULATION))
                return true;

        return !test_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags);
}
#else
/* On 32-bit kernels no mm is 64-bit. */
static inline bool is_64bit_mm(struct mm_struct *mm)
{
        return false;
}
#endif

/* Arch hook for unmapping [@start, @end) in @mm; intentionally empty here. */
static inline void arch_unmap(struct mm_struct *mm, unsigned long start,
                              unsigned long end)
{
}

/*
 * We only want to enforce protection keys on the current process
 * because we effectively have no access to PKRU for other
 * processes or any way to tell *which* PKRU in a threaded
 * process we could use.
 *
 * So do not enforce things if the VMA is not from the current
 * mm, or if we are in a kernel thread.
 */
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
                bool write, bool execute, bool foreign)
{
        /*
         * pkeys never affect instruction fetches, and access is allowed
         * if the VMA is not one from this process.
         */
        if (execute || foreign || vma_is_foreign(vma))
                return true;

        return __pkru_allows_pkey(vma_pkey(vma), write);
}

unsigned long __get_current_cr3_fast(void);

#include <asm-generic/mmu_context.h>

#endif /* _ASM_X86_MMU_CONTEXT_H */




























































































































































































































    1 









    1 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* SCTP kernel Implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (C) 1999-2001 Cisco, Motorola
 *
 * This file is part of the SCTP kernel implementation
 *
 * These are the definitions needed for the command object.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *   La Monte H.P. Yarroll <piggy@acm.org>
 *   Karl Knutson <karl@athena.chicago.il.us>
 *   Ardelle Fan <ardelle.fan@intel.com>
 *   Sridhar Samudrala <sri@us.ibm.com>
 */

#ifndef __net_sctp_command_h__
#define __net_sctp_command_h__

#include <net/sctp/constants.h>
#include <net/sctp/structs.h>


/* Verbs identifying the commands that can be queued in a struct sctp_cmd. */
enum sctp_verb {
        SCTP_CMD_NOP = 0,        /* Do nothing. */
        SCTP_CMD_NEW_ASOC,        /* Register a new association.  */
        SCTP_CMD_DELETE_TCB,        /* Delete the current association. */
        SCTP_CMD_NEW_STATE,        /* Enter a new state.  */
        SCTP_CMD_REPORT_TSN,        /* Record the arrival of a TSN.  */
        SCTP_CMD_GEN_SACK,        /* Send a Selective ACK (maybe).  */
        SCTP_CMD_PROCESS_SACK,        /* Process an inbound SACK.  */
        SCTP_CMD_GEN_INIT_ACK,        /* Generate an INIT ACK chunk.  */
        SCTP_CMD_PEER_INIT,        /* Process an INIT from the peer.  */
        SCTP_CMD_GEN_COOKIE_ECHO, /* Generate a COOKIE ECHO chunk. */
        SCTP_CMD_CHUNK_ULP,        /* Send a chunk to the sockets layer.  */
        SCTP_CMD_EVENT_ULP,        /* Send a notification to the sockets layer. */
        SCTP_CMD_REPLY,                /* Send a chunk to our peer.  */
        SCTP_CMD_SEND_PKT,        /* Send a full packet to our peer.  */
        SCTP_CMD_RETRAN,        /* Mark a transport for retransmission.  */
        SCTP_CMD_ECN_CE,        /* Do delayed CE processing.   */
        SCTP_CMD_ECN_ECNE,        /* Do delayed ECNE processing. */
        SCTP_CMD_ECN_CWR,        /* Do delayed CWR processing.  */
        SCTP_CMD_TIMER_START,        /* Start a timer.  */
        SCTP_CMD_TIMER_START_ONCE, /* Start a timer once */
        SCTP_CMD_TIMER_RESTART,        /* Restart a timer. */
        SCTP_CMD_TIMER_STOP,        /* Stop a timer. */
        SCTP_CMD_INIT_CHOOSE_TRANSPORT, /* Choose transport for an INIT. */
        SCTP_CMD_INIT_COUNTER_RESET, /* Reset init counter. */
        SCTP_CMD_INIT_COUNTER_INC,   /* Increment init counter. */
        SCTP_CMD_INIT_RESTART,  /* High level, do init timer work. */
        SCTP_CMD_COOKIEECHO_RESTART,  /* High level, do cookie-echo timer work. */
        SCTP_CMD_INIT_FAILED,   /* High level, do init failure work. */
        SCTP_CMD_REPORT_DUP,        /* Report a duplicate TSN.  */
        SCTP_CMD_STRIKE,        /* Mark a strike against a transport.  */
        SCTP_CMD_HB_TIMERS_START,    /* Start the heartbeat timers. */
        SCTP_CMD_HB_TIMER_UPDATE,    /* Update a heartbeat timer.  */
        SCTP_CMD_HB_TIMERS_STOP,     /* Stop the heartbeat timers.  */
        SCTP_CMD_PROBE_TIMER_UPDATE, /* Update a probe timer.  */
        SCTP_CMD_TRANSPORT_HB_SENT,  /* Reset the status of a transport. */
        SCTP_CMD_TRANSPORT_IDLE,     /* Do manipulations on idle transport */
        SCTP_CMD_TRANSPORT_ON,       /* Mark the transport as active. */
        SCTP_CMD_REPORT_ERROR,   /* Pass this error back out of the sm. */
        SCTP_CMD_REPORT_BAD_TAG, /* Verification tags didn't match. */
        SCTP_CMD_PROCESS_CTSN,   /* Side effect from shutdown. */
        SCTP_CMD_ASSOC_FAILED,         /* Handle association failure. */
        SCTP_CMD_DISCARD_PACKET, /* Discard the whole packet. */
        SCTP_CMD_GEN_SHUTDOWN,   /* Generate a SHUTDOWN chunk. */
        SCTP_CMD_PURGE_OUTQUEUE, /* Purge all data waiting to be sent. */
        SCTP_CMD_SETUP_T2,       /* Hi-level, setup T2-shutdown parms.  */
        SCTP_CMD_RTO_PENDING,         /* Set transport's rto_pending. */
        SCTP_CMD_PART_DELIVER,         /* Partial data delivery considerations. */
        SCTP_CMD_RENEGE,         /* Renege data on an association. */
        SCTP_CMD_SETUP_T4,         /* ADDIP, setup T4 RTO timer parms. */
        SCTP_CMD_PROCESS_OPERR,  /* Process an ERROR chunk. */
        SCTP_CMD_REPORT_FWDTSN,         /* Report new cumulative TSN Ack. */
        SCTP_CMD_PROCESS_FWDTSN, /* Skips were reported, so process further. */
        SCTP_CMD_CLEAR_INIT_TAG, /* Clears association peer's inittag. */
        SCTP_CMD_DEL_NON_PRIMARY, /* Removes non-primary peer transports. */
        SCTP_CMD_T3_RTX_TIMERS_STOP, /* Stops T3-rtx pending timers */
        SCTP_CMD_FORCE_PRIM_RETRAN,  /* Forces retrans. over primary path. */
        SCTP_CMD_SET_SK_ERR,         /* Set sk_err */
        SCTP_CMD_ASSOC_CHANGE,         /* generate and send assoc_change event */
        SCTP_CMD_ADAPTATION_IND, /* generate and send adaptation event */
        SCTP_CMD_PEER_NO_AUTH,   /* generate and send authentication event */
        SCTP_CMD_ASSOC_SHKEY,    /* generate the association shared keys */
        SCTP_CMD_T1_RETRAN,         /* Mark for retransmission after T1 timeout  */
        SCTP_CMD_UPDATE_INITTAG, /* Update peer inittag */
        SCTP_CMD_SEND_MSG,         /* Send the whole user message */
        SCTP_CMD_PURGE_ASCONF_QUEUE, /* Purge all asconf queues.*/
        SCTP_CMD_SET_ASOC,         /* Restore association context */
        SCTP_CMD_LAST                /* Marks the end of the list; not a command. */
};

/* How many commands can you put in a struct sctp_cmd_seq?
 * This is a rather arbitrary number, ideally derived from a careful
 * analysis of the state functions, but in reality just taken from
 * thin air in the hopes that we don't trigger a kernel panic.
 */
#define SCTP_MAX_NUM_COMMANDS 20

/*
 * Argument attached to a command.  The union itself does not record which
 * member is valid; presumably the accompanying verb implies it (confirm
 * against the command interpreter).
 */
union sctp_arg {
        void *zero_all;        /* Set to NULL to clear the entire union */
        __s32 i32;
        __u32 u32;
        __be32 be32;
        __u16 u16;
        __u8 u8;
        int error;
        __be16 err;
        enum sctp_state state;
        enum sctp_event_timeout to;
        struct sctp_chunk *chunk;
        struct sctp_association *asoc;
        struct sctp_transport *transport;
        struct sctp_bind_addr *bp;
        struct sctp_init_chunk *init;
        struct sctp_ulpevent *ulpevent;
        struct sctp_packet *packet;
        struct sctp_sackhdr *sackh;
        struct sctp_datamsg *msg;
};

/* We are simulating ML type constructors here.
 *
 * SCTP_ARG_CONSTRUCTOR(NAME, TYPE, ELT) builds a function called
 * SCTP_NAME() which takes an argument of type TYPE and returns a
 * union sctp_arg.  It does this by inserting the sole argument into
 * the ELT union element of a local union sctp_arg.
 *
 * E.g., SCTP_ARG_CONSTRUCTOR(I32, __s32, i32) builds SCTP_I32(arg),
 * which takes an __s32 and returns a union sctp_arg containing the
 * __s32.  So, after foo = SCTP_I32(arg), foo.i32 == arg.
 *
 * The generated function stores NULL through zero_all before storing
 * ELT, so the union is cleared first and then the (possibly narrower)
 * ELT member is written on top.
 */

#define SCTP_ARG_CONSTRUCTOR(name, type, elt) \
static inline union sctp_arg        \
SCTP_## name (type arg)                \
{ union sctp_arg retval;\
  retval.zero_all = NULL;\
  retval.elt = arg;\
  return retval;\
}

/*
 * Instantiate one SCTP_<NAME>() constructor per union sctp_arg member
 * (except zero_all): the arguments are NAME, parameter type, union member.
 */
SCTP_ARG_CONSTRUCTOR(I32,        __s32, i32)
SCTP_ARG_CONSTRUCTOR(U32,        __u32, u32)
SCTP_ARG_CONSTRUCTOR(BE32,        __be32, be32)
SCTP_ARG_CONSTRUCTOR(U16,        __u16, u16)
SCTP_ARG_CONSTRUCTOR(U8,        __u8, u8)
SCTP_ARG_CONSTRUCTOR(ERROR,     int, error)
SCTP_ARG_CONSTRUCTOR(PERR,      __be16, err)        /* protocol error */
SCTP_ARG_CONSTRUCTOR(STATE,        enum sctp_state, state)
SCTP_ARG_CONSTRUCTOR(TO,        enum sctp_event_timeout, to)
SCTP_ARG_CONSTRUCTOR(CHUNK,        struct sctp_chunk *, chunk)
SCTP_ARG_CONSTRUCTOR(ASOC,        struct sctp_association *, asoc)
SCTP_ARG_CONSTRUCTOR(TRANSPORT,        struct sctp_transport *, transport)
SCTP_ARG_CONSTRUCTOR(BA,        struct sctp_bind_addr *, bp)
SCTP_ARG_CONSTRUCTOR(PEER_INIT,        struct sctp_init_chunk *, init)
SCTP_ARG_CONSTRUCTOR(ULPEVENT,  struct sctp_ulpevent *, ulpevent)
SCTP_ARG_CONSTRUCTOR(PACKET,        struct sctp_packet *, packet)
SCTP_ARG_CONSTRUCTOR(SACKH,        struct sctp_sackhdr *, sackh)
SCTP_ARG_CONSTRUCTOR(DATAMSG,        struct sctp_datamsg *, msg)

/* Build an argument whose i32 member is 1. */
static inline union sctp_arg SCTP_FORCE(void)
{
        return SCTP_I32(1);
}

/* Build an argument whose i32 member is 0. */
static inline union sctp_arg SCTP_NOFORCE(void)
{
        return SCTP_I32(0);
}

static inline union sctp_arg SCTP_NULL(void)
{
        union sctp_arg retval;
        retval.zero_all = NULL;
        return retval;
}

/* One queued command: a verb plus the argument that goes with it. */
struct sctp_cmd {
        union sctp_arg obj;
        enum sctp_verb verb;
};

/*
 * A fixed-capacity sequence of commands.  cmds[] is filled from the end
 * towards the start: last_used_slot points at the most recently added
 * command, and next_cmd is the read cursor that walks down towards it.
 */
struct sctp_cmd_seq {
        struct sctp_cmd cmds[SCTP_MAX_NUM_COMMANDS];
        struct sctp_cmd *last_used_slot;
        struct sctp_cmd *next_cmd;
};


/* Initialize a block of memory as an empty command sequence.
 * Both cursors start one past the end of cmds[].
 * Always returns 1; initialization cannot fail.
 */
static inline int sctp_init_cmd_seq(struct sctp_cmd_seq *seq)
{
        /* cmds[] is filled backwards to simplify the overflow BUG() check */
        struct sctp_cmd *end = seq->cmds + SCTP_MAX_NUM_COMMANDS;

        seq->last_used_slot = end;
        seq->next_cmd = end;

        return 1;
}


/* Add a command to an struct sctp_cmd_seq.
 *
 * Use the SCTP_* constructors defined by SCTP_ARG_CONSTRUCTOR() above
 * to wrap data which goes in the obj argument.
 */
static inline void sctp_add_cmd_sf(struct sctp_cmd_seq *seq,
                                   enum sctp_verb verb, union sctp_arg obj)
{
        struct sctp_cmd *cmd = seq->last_used_slot - 1;

        BUG_ON(cmd < seq->cmds);

        cmd->verb = verb;
        cmd->obj = obj;
        seq->last_used_slot = cmd;
}

/* Return the next command structure in an sctp_cmd_seq, moving the read
 * cursor down by one.
 * Return NULL once the cursor has reached last_used_slot.
 */
static inline struct sctp_cmd *sctp_next_cmd(struct sctp_cmd_seq *seq)
{
        if (seq->next_cmd > seq->last_used_slot) {
                seq->next_cmd--;
                return seq->next_cmd;
        }

        return NULL;
}

#endif /* __net_sctp_command_h__ */






































































































































    2 

























































































































    3 












    5 



    5 



    7 
















































































































    7 








    4 

    8 
































































































































































































































































































































































































































    1 




    1 


    1 







    1 




    3 





    5 


    4 





    5 



























    3 


    4 


    2 


    5 
    4 















































































































    3 


    2 















    3 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 




    3 

    2 

    3 














































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
// SPDX-License-Identifier: GPL-2.0
/*
 *  Kernel timekeeping code and accessor functions. Based on code from
 *  timer.c, moved in commit 8524070b7982.
 */
#include <linux/timekeeper_internal.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/nmi.h>
#include <linux/sched.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/clock.h>
#include <linux/syscore_ops.h>
#include <linux/clocksource.h>
#include <linux/jiffies.h>
#include <linux/time.h>
#include <linux/timex.h>
#include <linux/tick.h>
#include <linux/stop_machine.h>
#include <linux/pvclock_gtod.h>
#include <linux/compiler.h>
#include <linux/audit.h>
#include <linux/random.h>

#include "tick-internal.h"
#include "ntp_internal.h"
#include "timekeeping_internal.h"

/* Bit flags tested by timekeeping_update() in its @action argument. */
/* Reset tk->ntp_error and call ntp_clear() */
#define TK_CLEAR_NTP                (1 << 0)
/* Mirror the updated timekeeper into the shadow timekeeper */
#define TK_MIRROR                (1 << 1)
/* The clock was set: bump clock_was_set_seq and flag it to the pvclock notifiers */
#define TK_CLOCK_WAS_SET        (1 << 2)

/* Reason for which the timekeeper is being advanced. */
enum timekeeping_adv_mode {
        /* Update timekeeper when a tick has passed */
        TK_ADV_TICK,

        /* Update timekeeper on a direct frequency change */
        TK_ADV_FREQ
};

/*
 * Raw spinlock that timekeeping_update() requires to be held; tk_core.seq
 * is initialized against it and the pvclock notifier chain is modified
 * under it.
 */
DEFINE_RAW_SPINLOCK(timekeeper_lock);

/*
 * The most important data for readout fits into a single 64 byte
 * cache line.
 */
static struct {
        /* Sequence counter associated with timekeeper_lock (see initializer) */
        seqcount_raw_spinlock_t        seq;
        /* The live timekeeper state */
        struct timekeeper        timekeeper;
} tk_core ____cacheline_aligned = {
        .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock),
};

/*
 * Shadow copy of the timekeeper. Per the comment in timekeeping_update(),
 * the live data is mirrored into it when TK_MIRROR is requested.
 */
static struct timekeeper shadow_timekeeper;

/*
 * Flag for if timekeeping is suspended. While it is non-zero,
 * dummy_clock_read() returns the frozen cycles_at_suspend value.
 */
int __read_mostly timekeeping_suspended;

/**
 * struct tk_fast - NMI safe timekeeper
 * @seq:        Sequence counter for protecting updates. The lowest bit
 *                is the index for the tk_read_base array
 * @base:        tk_read_base array. Access is indexed by the lowest bit of
 *                @seq.
 *
 * Written by update_fast_timekeeper() and read by __ktime_get_fast_ns()
 * and __ktime_get_real_fast(). See @update_fast_timekeeper() below for
 * the update protocol.
 */
struct tk_fast {
        seqcount_latch_t        seq;
        struct tk_read_base        base[2];
};

/*
 * Suspend-time cycles value for halted fast timekeeper. Captured by
 * halt_fast_timekeeper() and returned by dummy_clock_read() while
 * timekeeping_suspended is set.
 */
static u64 cycles_at_suspend;

/*
 * Read callback of dummy_clock: the cycle count frozen at suspend while
 * timekeeping is suspended, local_clock() otherwise. @cs is not used.
 */
static u64 dummy_clock_read(struct clocksource *cs)
{
        return timekeeping_suspended ? cycles_at_suspend : local_clock();
}

/*
 * Placeholder clocksource referenced by FAST_TK_INIT and installed by
 * halt_fast_timekeeper(); only its ->read callback is set.
 */
static struct clocksource dummy_clock = {
        .read = dummy_clock_read,
};

/*
 * Boot time initialization which allows local_clock() to be utilized
 * during early boot when clocksources are not available. local_clock()
 * returns nanoseconds already so no conversion is required, hence mult=1
 * and shift=0. When the first proper clocksource is installed, the
 * fast timekeepers are updated with the correct values.
 *
 * Used as the initializer for both entries of a struct tk_fast base[] array.
 */
#define FAST_TK_INIT                                                \
        {                                                        \
                .clock                = &dummy_clock,                        \
                .mask                = CLOCKSOURCE_MASK(64),                \
                .mult                = 1,                                \
                .shift                = 0,                                \
        }

/*
 * Fast timekeeper behind ktime_get_mono_fast_ns(), ktime_get_real_fast_ns()
 * and ktime_get_fast_timestamps().
 */
static struct tk_fast tk_fast_mono ____cacheline_aligned = {
        .seq     = SEQCNT_LATCH_ZERO(tk_fast_mono.seq),
        .base[0] = FAST_TK_INIT,
        .base[1] = FAST_TK_INIT,
};

/* Fast timekeeper behind ktime_get_raw_fast_ns(). */
static struct tk_fast tk_fast_raw  ____cacheline_aligned = {
        .seq     = SEQCNT_LATCH_ZERO(tk_fast_raw.seq),
        .base[0] = FAST_TK_INIT,
        .base[1] = FAST_TK_INIT,
};

static inline void tk_normalize_xtime(struct timekeeper *tk)
{
        while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
                tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
                tk->xtime_sec++;
        }
        while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) {
                tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
                tk->raw_sec++;
        }
}

static inline struct timespec64 tk_xtime(const struct timekeeper *tk)
{
        struct timespec64 ts;

        ts.tv_sec = tk->xtime_sec;
        ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
        return ts;
}

static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
        tk->xtime_sec = ts->tv_sec;
        tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
}

static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
{
        tk->xtime_sec += ts->tv_sec;
        tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
        tk_normalize_xtime(tk);
}

static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
{
        struct timespec64 tmp;

        /*
         * Verify consistency of: offset_real = -wall_to_monotonic
         * before modifying anything
         */
        set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
                                        -tk->wall_to_monotonic.tv_nsec);
        WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp));
        tk->wall_to_monotonic = wtm;
        set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
        tk->offs_real = timespec64_to_ktime(tmp);
        tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
}

/*
 * Account @delta of sleep time in the boot offset and refresh its
 * timespec64 mirror.
 */
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
        const ktime_t boot = ktime_add(tk->offs_boot, delta);

        tk->offs_boot = boot;
        /*
         * Keep a timespec representation as well, so the VDSO update
         * does not need a 64bit division each time.
         */
        tk->monotonic_to_boot = ktime_to_timespec64(boot);
}

/*
 * tk_clock_read - atomic clocksource read() helper
 *
 * This helper is necessary to use in the read paths because, while the
 * seqcount ensures we don't return a bad value while structures are updated,
 * it doesn't protect from potential crashes. There is the possibility that
 * the tkr's clocksource may change between the read reference, and the
 * clock reference passed to the read function.  This can cause crashes if
 * the wrong clocksource is passed to the wrong read function.
 * This isn't necessary to use when holding the timekeeper_lock or doing
 * a read of the fast-timekeeper tkrs (which is protected by its own locking
 * and update logic).
 */
static inline u64 tk_clock_read(const struct tk_read_base *tkr)
{
        /* Load the pointer once so ->read() and its argument always match */
        struct clocksource *cs = READ_ONCE(tkr->clock);

        return cs->read(cs);
}

#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */

/*
 * timekeeping_check_update - report suspicious timekeeping updates
 * @tk:                Timekeeper whose monotonic clocksource limits are checked
 * @offset:        Cycle offset to validate against the clock's max_cycles
 *
 * Warns when @offset exceeds max_cycles, and informs when it exceeds half
 * of it. Also reports, rate-limited by WARNING_FREQ, the underflow and
 * overflow flags recorded in @tk and clears them.
 */
static void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
        u64 max_cycles = tk->tkr_mono.clock->max_cycles;
        const char *name = tk->tkr_mono.clock->name;

        /* offset and max_cycles are u64, so they are printed with %llu */
        if (offset > max_cycles) {
                printk_deferred("WARNING: timekeeping: Cycle offset (%llu) is larger than allowed by the '%s' clock's max_cycles value (%llu): time overflow danger\n",
                                offset, name, max_cycles);
                printk_deferred("         timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
        } else if (offset > (max_cycles >> 1)) {
                printk_deferred("INFO: timekeeping: Cycle offset (%llu) is larger than the '%s' clock's 50%% safety margin (%llu)\n",
                                offset, name, max_cycles >> 1);
                printk_deferred("      timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
        }

        if (tk->underflow_seen) {
                /* Rate-limit the report, but always consume the flag */
                if (jiffies - tk->last_warning > WARNING_FREQ) {
                        printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
                        printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
                        printk_deferred("         Your kernel is probably still fine.\n");
                        tk->last_warning = jiffies;
                }
                tk->underflow_seen = 0;
        }

        if (tk->overflow_seen) {
                /* Rate-limit the report, but always consume the flag */
                if (jiffies - tk->last_warning > WARNING_FREQ) {
                        printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
                        printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
                        printk_deferred("         Your kernel is probably still fine.\n");
                        tk->last_warning = jiffies;
                }
                tk->overflow_seen = 0;
        }
}

static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles);

/*
 * Debug variant of the nanosecond readout. Besides converting the current
 * clocksource reading of @tkr to nanoseconds, it sets underflow_seen and
 * overflow_seen in tk_core.timekeeper when the cycle delta looks
 * suspicious.
 */
static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 now, last, mask, max, delta;
        unsigned int seq;

        /*
         * Since we're called holding a seqcount, the data may shift
         * under us while we're doing the calculation. This can cause
         * false positives, since we'd note a problem but throw the
         * results away. So nest another seqcount here to atomically
         * grab the points we are checking with.
         */
        do {
                seq = read_seqcount_begin(&tk_core.seq);
                now = tk_clock_read(tkr);
                last = tkr->cycle_last;
                mask = tkr->mask;
                max = tkr->clock->max_cycles;
        } while (read_seqcount_retry(&tk_core.seq, seq));

        /* Cycles elapsed since cycle_last, computed from the snapshot above */
        delta = clocksource_delta(now, last, mask);

        /*
         * Try to catch underflows by checking if we are seeing small
         * mask-relative negative values.
         */
        if (unlikely((~delta & mask) < (mask >> 3)))
                tk->underflow_seen = 1;

        /* Check for multiplication overflows */
        if (unlikely(delta > max))
                tk->overflow_seen = 1;

        /* timekeeping_cycles_to_ns() handles both under and overflow */
        return timekeeping_cycles_to_ns(tkr, now);
}
#else
/* CONFIG_DEBUG_TIMEKEEPING is disabled: no update checks are performed. */
static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
}
/*
 * Stub for builds without CONFIG_DEBUG_TIMEKEEPING. timekeeping_get_ns()
 * only calls the debug readout when IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING),
 * so ending up here is a bug.
 */
static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
{
        BUG();
}
#endif

/**
 * tk_setup_internals - Set up internals to use clocksource clock.
 *
 * @tk:                The target timekeeper to setup.
 * @clock:                Pointer to clocksource.
 *
 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
 * pair and interval request.
 *
 * Unless you're the timekeeping code, you should not be using this!
 */
static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
{
        u64 interval;
        u64 tmp, ntpinterval;
        struct clocksource *old_clock;

        /* Bump the clocksource-change sequence */
        ++tk->cs_was_changed_seq;
        old_clock = tk->tkr_mono.clock;
        /*
         * Install the new clock before sampling it: tk_clock_read() reads
         * through tk->tkr_mono.clock.
         */
        tk->tkr_mono.clock = clock;
        tk->tkr_mono.mask = clock->mask;
        tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);

        /* The raw base starts from the very same cycle snapshot */
        tk->tkr_raw.clock = clock;
        tk->tkr_raw.mask = clock->mask;
        tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;

        /* Do the ns -> cycle conversion first, using original mult */
        tmp = NTP_INTERVAL_LENGTH;
        tmp <<= clock->shift;
        ntpinterval = tmp;
        /* Adding half the divisor makes the division round to nearest */
        tmp += clock->mult/2;
        do_div(tmp, clock->mult);
        /* Never end up with an interval of zero cycles */
        if (tmp == 0)
                tmp = 1;

        interval = (u64) tmp;
        tk->cycle_interval = interval;

        /* Go back from cycles -> shifted ns */
        tk->xtime_interval = interval * clock->mult;
        /* Difference between the NTP interval and the cycle-rounded one */
        tk->xtime_remainder = ntpinterval - tk->xtime_interval;
        tk->raw_interval = interval * clock->mult;

        /* if changing clocks, convert xtime_nsec shift units */
        if (old_clock) {
                int shift_change = clock->shift - old_clock->shift;
                if (shift_change < 0) {
                        tk->tkr_mono.xtime_nsec >>= -shift_change;
                        tk->tkr_raw.xtime_nsec >>= -shift_change;
                } else {
                        tk->tkr_mono.xtime_nsec <<= shift_change;
                        tk->tkr_raw.xtime_nsec <<= shift_change;
                }
        }

        tk->tkr_mono.shift = clock->shift;
        tk->tkr_raw.shift = clock->shift;

        /* Restart NTP error accounting in the new clock's shift units */
        tk->ntp_error = 0;
        tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
        tk->ntp_tick = ntpinterval << tk->ntp_error_shift;

        /*
         * The timekeeper keeps its own mult values for the currently
         * active clocksource. These values will be adjusted via NTP
         * to counteract clock drifting.
         */
        tk->tkr_mono.mult = clock->mult;
        tk->tkr_raw.mult = clock->mult;
        tk->ntp_err_mult = 0;
        tk->skip_second_overflow = 0;
}

/* Timekeeper helper functions. */
/*
 * Out-of-line slow path of timekeeping_cycles_to_ns(), taken when @delta
 * exceeds the clocksource's max_cycles. The conversion is delegated to
 * mul_u64_u32_add_u64_shr() with @tkr's mult, xtime_nsec and shift.
 * NOTE(review): presumably that helper evaluates
 * (delta * mult + xtime_nsec) >> shift without overflowing the 64-bit
 * intermediate - confirm against its definition.
 */
static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
{
        return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift);
}

/*
 * Convert the clocksource reading @cycles to nanoseconds relative to the
 * state stored in @tkr (cycle_last, xtime_nsec, mult, shift).
 */
static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
        /* Masked cycle delta since the last update_wall_time() */
        const u64 mask = tkr->mask;
        const u64 delta = (cycles - tkr->cycle_last) & mask;

        /* Common case: the delta is sane and delta * mult cannot overflow */
        if (likely(delta <= tkr->clock->max_cycles))
                return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;

        /*
         * Beyond max_cycles the delta either moved backwards or would
         * overflow the multiplication. A set MSB (relative to the mask)
         * indicates clocksource inconsistency between CPUs; ignore the
         * delta then so that time does not go backwards.
         */
        if (delta & ~(mask >> 1))
                return tkr->xtime_nsec >> tkr->shift;

        return delta_to_ns_safe(tkr, delta);
}

/*
 * Read @tkr's clocksource via tk_clock_read() and convert the reading to
 * nanoseconds with timekeeping_cycles_to_ns().
 */
static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr)
{
        return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr));
}

/*
 * Nanosecond readout for @tkr: the plain conversion normally, the checking
 * variant when CONFIG_DEBUG_TIMEKEEPING is enabled.
 */
static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
{
        if (!IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING))
                return __timekeeping_get_ns(tkr);

        return timekeeping_debug_get_ns(tkr);
}

/**
 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
 * @tkr: Timekeeping readout base from which we take the update
 * @tkf: Pointer to NMI safe timekeeper
 *
 * We want to use this from any context including NMI and tracing /
 * instrumenting the timekeeping code itself.
 *
 * Employ the latch technique; see @raw_write_seqcount_latch.
 *
 * So if a NMI hits the update of base[0] then it will use base[1]
 * which is still consistent. In the worst case this can result in a
 * slightly wrong timestamp (a few nanoseconds). See
 * @ktime_get_mono_fast_ns.
 */
static void update_fast_timekeeper(const struct tk_read_base *tkr,
                                   struct tk_fast *tkf)
{
        /* base points at base[0]; base + 1 is base[1] */
        struct tk_read_base *base = tkf->base;

        /* Force readers off to base[1] */
        raw_write_seqcount_latch(&tkf->seq);

        /* Update base[0] */
        memcpy(base, tkr, sizeof(*base));

        /* Force readers back to base[0] */
        raw_write_seqcount_latch(&tkf->seq);

        /* Update base[1], copying from the freshly written base[0] */
        memcpy(base + 1, base, sizeof(*base));
}

/*
 * Readout of the fast timekeeper @tkf: the base time of the currently
 * selected copy plus the nanoseconds from __timekeeping_get_ns(). The
 * read is repeated if the latch sequence changed in the meantime.
 */
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
        struct tk_read_base *tkr;
        unsigned int seq;
        u64 now;

        do {
                seq = raw_read_seqcount_latch(&tkf->seq);
                /* The lowest sequence bit selects base[0] or base[1] */
                tkr = tkf->base + (seq & 0x01);
                now = ktime_to_ns(tkr->base);
                now += __timekeeping_get_ns(tkr);
        } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));

        return now;
}

/**
 * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
 *
 * This timestamp is not guaranteed to be monotonic across an update.
 * The timestamp is calculated by:
 *
 *        now = base_mono + clock_delta * slope
 *
 * So if the update lowers the slope, readers who are forced to the
 * not yet updated second array are still using the old steeper slope.
 *
 * tmono
 * ^
 * |    o  n
 * |   o n
 * |  u
 * | o
 * |o
 * |12345678---> reader order
 *
 * o = old slope
 * u = update
 * n = new slope
 *
 * So reader 6 will observe time going backwards versus reader 5.
 *
 * While other CPUs are likely to be able to observe that, the only way
 * for a CPU local observation is when an NMI hits in the middle of
 * the update. Timestamps taken from that NMI context might be ahead
 * of the following timestamps. Callers need to be aware of that and
 * deal with it.
 */
u64 notrace ktime_get_mono_fast_ns(void)
{
        /* Latch-based readout of the monotonic fast timekeeper, in ns */
        return __ktime_get_fast_ns(&tk_fast_mono);
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);

/**
 * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw
 *
 * Contrary to ktime_get_mono_fast_ns() this is always correct because the
 * conversion factor is not affected by NTP/PTP correction.
 */
u64 notrace ktime_get_raw_fast_ns(void)
{
        /* Same readout as the monotonic variant, but from tk_fast_raw */
        return __ktime_get_fast_ns(&tk_fast_raw);
}
EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);

/**
 * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
 *
 * To keep it NMI safe since we're accessing from tracing, we're not using a
 * separate timekeeper with updates to monotonic clock and boot offset
 * protected with seqcounts. This has the following minor side effects:
 *
 * (1) It's possible that a timestamp is taken after the boot offset is updated
 * but before the timekeeper is updated. If this happens, the new boot offset
 * is added to the old timekeeping making the clock appear to update slightly
 * earlier:
 *    CPU 0                                        CPU 1
 *    timekeeping_inject_sleeptime64()
 *    __timekeeping_inject_sleeptime(tk, delta);
 *                                                 timestamp();
 *    timekeeping_update(tk, TK_CLEAR_NTP...);
 *
 * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
 * partially updated.  Since the tk->offs_boot update is a rare event, this
 * should be a rare occurrence which postprocessing should be able to handle.
 *
 * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns()
 * apply as well.
 */
u64 notrace ktime_get_boot_fast_ns(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        /*
         * Fast monotonic time plus the boot offset. offs_boot is read
         * without synchronization; data_race() marks that as intentional.
         */
        return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot)));
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);

/**
 * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock.
 *
 * The same limitations as described for ktime_get_boot_fast_ns() apply. The
 * mono time and the TAI offset are not read atomically which may yield wrong
 * readouts. However, an update of the TAI offset is a rare event, e.g. caused
 * by settime or adjtimex with an offset. The user of this function has to deal
 * with the possibility of wrong timestamps in post processing.
 */
u64 notrace ktime_get_tai_fast_ns(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        /*
         * Fast monotonic time plus the TAI offset. offs_tai is read
         * without synchronization; data_race() marks that as intentional.
         */
        return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai)));
}
EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns);

/*
 * Readout of realtime from the fast timekeeper @tkf. Returns base_real
 * plus the clocksource delta in nanoseconds. If @mono is not NULL, the
 * monotonic time computed from the same base copy and the same delta is
 * stored there.
 */
static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
{
        struct tk_read_base *tkr;
        u64 basem, baser, delta;
        unsigned int seq;

        do {
                seq = raw_read_seqcount_latch(&tkf->seq);
                /* The lowest sequence bit selects base[0] or base[1] */
                tkr = tkf->base + (seq & 0x01);
                basem = ktime_to_ns(tkr->base);
                baser = ktime_to_ns(tkr->base_real);
                delta = __timekeeping_get_ns(tkr);
        } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));

        if (mono)
                *mono = basem + delta;
        return baser + delta;
}

/**
 * ktime_get_real_fast_ns - NMI safe and fast access to clock realtime.
 *
 * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering.
 */
u64 ktime_get_real_fast_ns(void)
{
        /* Realtime only; the monotonic value is not requested (NULL) */
        return __ktime_get_real_fast(&tk_fast_mono, NULL);
}
EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);

/**
 * ktime_get_fast_timestamps - NMI safe timestamps
 * @snapshot:        Pointer to timestamp storage
 *
 * Stores clock monotonic, boottime and realtime timestamps.
 *
 * Boot time is a racy access on 32bit systems if the sleep time injection
 * happens late during resume and not in timekeeping_resume(). That could
 * be avoided by expanding struct tk_read_base with boot offset for 32bit
 * and adding more overhead to the update. As this is a hard to observe
 * once per resume event which can be filtered with reasonable effort using
 * the accurate mono/real timestamps, it's probably not worth the trouble.
 *
 * Aside of that it might be possible on 32 and 64 bit to observe the
 * following when the sleep time injection happens late:
 *
 * CPU 0                                CPU 1
 * timekeeping_resume()
 * ktime_get_fast_timestamps()
 *        mono, real = __ktime_get_real_fast()
 *                                        inject_sleep_time()
 *                                           update boot offset
 *        boot = mono + bootoffset;
 *
 * That means that boot time already has the sleep time adjustment, but
 * real time does not. On the next readout both are in sync again.
 *
 * Preventing this for 64bit is not really feasible without destroying the
 * careful cache layout of the timekeeper because the sequence count and
 * struct tk_read_base would then need two cache lines instead of one.
 *
 * Access to the time keeper clock source is disabled across the innermost
 * steps of suspend/resume. The accessors still work, but the timestamps
 * are frozen until time keeping is resumed which happens very early.
 *
 * For regular suspend/resume there is no observable difference vs. sched
 * clock, but it might affect some of the nasty low level debug printks.
 *
 * OTOH, access to sched clock is not guaranteed across suspend/resume on
 * all systems either so it depends on the hardware in use.
 *
 * If that turns out to be a real problem then this could be mitigated by
 * using sched clock in a similar way as during early boot. But it's not as
 * trivial as on early boot because it needs some careful protection
 * against the clock monotonic timestamp jumping backwards on resume.
 */
void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono);
        snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot));
}

/**
 * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
 * @tk: Timekeeper to snapshot.
 *
 * It generally is unsafe to access the clocksource after timekeeping has been
 * suspended, so take a snapshot of the readout base of @tk and use it as the
 * fast timekeeper's readout base while suspended.  It will return the same
 * number of cycles every time until timekeeping is resumed at which time the
 * proper readout base for the fast timekeeper will be restored automatically.
 */
static void halt_fast_timekeeper(const struct timekeeper *tk)
{
        /* Scratch read base, reused for the monotonic and then the raw copy */
        static struct tk_read_base tkr_dummy;
        const struct tk_read_base *tkr = &tk->tkr_mono;

        memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
        /* Sample the real clocksource one last time for dummy_clock_read() */
        cycles_at_suspend = tk_clock_read(tkr);
        tkr_dummy.clock = &dummy_clock;
        tkr_dummy.base_real = tkr->base + tk->offs_real;
        update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);

        /* Same for the raw base; base_real is not recomputed for it */
        tkr = &tk->tkr_raw;
        memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
        tkr_dummy.clock = &dummy_clock;
        update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}

/*
 * Notifier chain of pvclock timedata update listeners; registration and
 * unregistration happen under timekeeper_lock.
 */
static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);

/*
 * Call the pvclock_gtod_chain notifiers with @was_set as the notifier
 * value and @tk as the data pointer.
 */
static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
{
        raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
}

/**
 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
 * @nb: Pointer to the notifier block to register
 */
int pvclock_gtod_register_notifier(struct notifier_block *nb)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned long flags;
        int ret;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
        update_pvclock_gtod(tk, true);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        return ret;
}
EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);

/**
 * pvclock_gtod_unregister_notifier - unregister a pvclock
 * timedata update listener
 * @nb: Pointer to the notifier block to unregister
 */
int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
{
        unsigned long irqflags;
        int err;

        /* The chain is only modified under timekeeper_lock */
        raw_spin_lock_irqsave(&timekeeper_lock, irqflags);
        err = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
        raw_spin_unlock_irqrestore(&timekeeper_lock, irqflags);

        return err;
}
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);

/*
 * tk_update_leap_state - helper to update the next_leap_ktime
 */
static inline void tk_update_leap_state(struct timekeeper *tk)
{
        tk->next_leap_ktime = ntp_get_next_leap();
        if (tk->next_leap_ktime != KTIME_MAX)
                /* Convert to monotonic time */
                tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
}

/*
 * Update the ktime_t based scalar nsec members of the timekeeper:
 * tkr_mono.base, ktime_sec and tkr_raw.base.
 */
static inline void tk_update_ktime_data(struct timekeeper *tk)
{
        u64 secs;
        u32 ns;

        /*
         * The xtime based monotonic readout is:
         *        nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now();
         * The ktime based monotonic readout is:
         *        nsec = base_mono + now();
         * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
         */
        secs = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
        ns = (u32) tk->wall_to_monotonic.tv_nsec;
        tk->tkr_mono.base = ns_to_ktime(secs * NSEC_PER_SEC + ns);

        /*
         * Adding the nanoseconds part of xtime to the one of
         * wall_to_monotonic may reach or exceed a full second. Carry
         * that second over before tk->ktime_sec is stored.
         */
        ns += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
        if (ns >= NSEC_PER_SEC)
                secs += 1;
        tk->ktime_sec = secs;

        /* The monotonic raw base only carries the full raw seconds */
        tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}

/*
 * timekeeping_update - propagate a changed timekeeper to its derived state
 * @tk:         timekeeper to propagate
 * @action:     bitmask; the bits evaluated here are TK_CLEAR_NTP,
 *              TK_CLOCK_WAS_SET and TK_MIRROR
 *
 * must hold timekeeper_lock
 */
static void timekeeping_update(struct timekeeper *tk, unsigned int action)
{
        /* On request zero the accumulated ntp_error and call ntp_clear() */
        if (action & TK_CLEAR_NTP) {
                tk->ntp_error = 0;
                ntp_clear();
        }

        /* Recompute next_leap_ktime and the ktime_t based bases */
        tk_update_leap_state(tk);
        tk_update_ktime_data(tk);

        /* Hand the timekeeper to the vsyscall code and the pvclock listeners */
        update_vsyscall(tk);
        update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);

        /* base_real is refreshed before tkr_mono goes to the fast timekeeper */
        tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
        update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
        update_fast_timekeeper(&tk->tkr_raw,  &tk_fast_raw);

        /*
         * Counter copied by ktime_get_snapshot() and compared in
         * get_device_system_crosststamp() to detect that the clock was set.
         */
        if (action & TK_CLOCK_WAS_SET)
                tk->clock_was_set_seq++;
        /*
         * The mirroring of the data to the shadow-timekeeper needs
         * to happen last here to ensure we don't over-write the
         * timekeeper structure on the next update with stale data
         */
        if (action & TK_MIRROR)
                memcpy(&shadow_timekeeper, &tk_core.timekeeper,
                       sizeof(tk_core.timekeeper));
}

/**
 * timekeeping_forward_now - update clock to the current time
 * @tk:                Pointer to the timekeeper to update
 *
 * Reads the clocksource, moves cycle_last of both the monotonic and the raw
 * readout base to that value and accumulates the elapsed cycles into their
 * xtime_nsec members. Doing this before a significant clock change means the
 * time elapsed since the last update_wall_time() needs no separate handling.
 */
static void timekeeping_forward_now(struct timekeeper *tk)
{
        u64 now, remaining;

        now = tk_clock_read(&tk->tkr_mono);
        remaining = clocksource_delta(now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
        tk->tkr_mono.cycle_last = now;
        tk->tkr_raw.cycle_last  = now;

        /* Accumulate in chunks of at most the clocksource's max_cycles */
        while (remaining) {
                u64 chunk = tk->tkr_mono.clock->max_cycles;

                if (remaining < chunk)
                        chunk = remaining;

                tk->tkr_mono.xtime_nsec += chunk * tk->tkr_mono.mult;
                tk->tkr_raw.xtime_nsec += chunk * tk->tkr_raw.mult;
                tk_normalize_xtime(tk);
                remaining -= chunk;
        }
}

/**
 * ktime_get_real_ts64 - Returns the time of day in a timespec64.
 * @ts:                pointer to the timespec to be set
 *
 * Fills @ts with tk->xtime_sec plus the nanoseconds obtained from
 * timekeeping_get_ns() for the monotonic readout base (WARN if suspended).
 */
void ktime_get_real_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqnr;
        u64 delta_ns;

        WARN_ON(timekeeping_suspended);

        /* Repeat until both values were read without a concurrent update */
        do {
                seqnr = read_seqcount_begin(&tk_core.seq);

                ts->tv_sec = tk->xtime_sec;
                delta_ns = timekeeping_get_ns(&tk->tkr_mono);
        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        /* Start from zero nanoseconds and let the helper add delta_ns */
        ts->tv_nsec = 0;
        timespec64_add_ns(ts, delta_ns);
}
EXPORT_SYMBOL(ktime_get_real_ts64);

/**
 * ktime_get - read the monotonic readout base as ktime_t
 *
 * Return: tk->tkr_mono.base plus the nanoseconds obtained from
 * timekeeping_get_ns() for the monotonic readout base (WARN if suspended).
 */
ktime_t ktime_get(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqnr;
        ktime_t mono_base;
        u64 delta_ns;

        WARN_ON(timekeeping_suspended);

        /* Repeat until both values were read without a concurrent update */
        do {
                seqnr = read_seqcount_begin(&tk_core.seq);
                mono_base = tk->tkr_mono.base;
                delta_ns = timekeeping_get_ns(&tk->tkr_mono);
        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        return ktime_add_ns(mono_base, delta_ns);
}
EXPORT_SYMBOL_GPL(ktime_get);

/**
 * ktime_get_resolution_ns - mult >> shift of the monotonic readout base
 *
 * Return: tk->tkr_mono.mult shifted right by tk->tkr_mono.shift, read
 * under the timekeeper sequence counter (WARN if suspended).
 */
u32 ktime_get_resolution_ns(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqnr;
        u32 res_ns;

        WARN_ON(timekeeping_suspended);

        /* mult and shift must stem from the same update */
        do {
                seqnr = read_seqcount_begin(&tk_core.seq);
                res_ns = tk->tkr_mono.mult >> tk->tkr_mono.shift;
        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        return res_ns;
}
EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);

/*
 * Lookup table from an enum tk_offsets value to the matching offset member
 * of the core timekeeper. ktime_get_with_offset(),
 * ktime_get_coarse_with_offset() and ktime_mono_to_any() index it directly
 * with their @offs argument, without a range check.
 */
static ktime_t *offsets[TK_OFFS_MAX] = {
        [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real,
        [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot,
        [TK_OFFS_TAI]  = &tk_core.timekeeper.offs_tai,
};

/**
 * ktime_get_with_offset - monotonic readout with one of the tk offsets added
 * @offs:        index into the offsets[] table selecting the offset to add
 *
 * Return: tk->tkr_mono.base plus the selected offset plus the nanoseconds
 * obtained from timekeeping_get_ns() (WARN if suspended).
 */
ktime_t ktime_get_with_offset(enum tk_offsets offs)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        ktime_t *offset = offsets[offs];
        unsigned int seqnr;
        ktime_t base;
        u64 delta_ns;

        WARN_ON(timekeeping_suspended);

        /* Base, offset and nanoseconds must stem from the same update */
        do {
                seqnr = read_seqcount_begin(&tk_core.seq);
                base = ktime_add(tk->tkr_mono.base, *offset);
                delta_ns = timekeeping_get_ns(&tk->tkr_mono);
        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        return ktime_add_ns(base, delta_ns);
}
EXPORT_SYMBOL_GPL(ktime_get_with_offset);

/**
 * ktime_get_coarse_with_offset - coarse variant of ktime_get_with_offset()
 * @offs:        index into the offsets[] table selecting the offset to add
 *
 * Unlike ktime_get_with_offset() this does not call timekeeping_get_ns();
 * the nanoseconds part is tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift.
 *
 * Return: tk->tkr_mono.base plus the selected offset plus those
 * nanoseconds (WARN if suspended).
 */
ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        ktime_t *offset = offsets[offs];
        unsigned int seqnr;
        ktime_t base;
        u64 coarse_ns;

        WARN_ON(timekeeping_suspended);

        /* Base, offset and nanoseconds must stem from the same update */
        do {
                seqnr = read_seqcount_begin(&tk_core.seq);
                base = ktime_add(tk->tkr_mono.base, *offset);
                coarse_ns = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        return ktime_add_ns(base, coarse_ns);
}
EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);

/**
 * ktime_mono_to_any() - convert monotonic time to any other time
 * @tmono:        time to convert.
 * @offs:        which offset to use
 *
 * Return: @tmono plus the offset selected by @offs, with the offset read
 * under the timekeeper sequence counter.
 */
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
        ktime_t *offset = offsets[offs];
        unsigned int seqnr;
        ktime_t converted;

        do {
                seqnr = read_seqcount_begin(&tk_core.seq);
                converted = ktime_add(tmono, *offset);
        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        return converted;
}
EXPORT_SYMBOL_GPL(ktime_mono_to_any);

/**
 * ktime_get_raw - Returns the raw monotonic time in ktime_t format
 *
 * Return: tk->tkr_raw.base plus the nanoseconds obtained from
 * timekeeping_get_ns() for the raw readout base.
 */
ktime_t ktime_get_raw(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqnr;
        ktime_t raw_base;
        u64 delta_ns;

        /* Repeat until both values were read without a concurrent update */
        do {
                seqnr = read_seqcount_begin(&tk_core.seq);
                raw_base = tk->tkr_raw.base;
                delta_ns = timekeeping_get_ns(&tk->tkr_raw);
        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        return ktime_add_ns(raw_base, delta_ns);
}
EXPORT_SYMBOL_GPL(ktime_get_raw);

/**
 * ktime_get_ts64 - get the monotonic clock in timespec64 format
 * @ts:                pointer to timespec variable
 *
 * The function calculates the monotonic clock from the realtime
 * clock and the wall_to_monotonic offset and stores the result
 * in normalized timespec64 format in the variable pointed to by @ts.
 */
void ktime_get_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct timespec64 tomono;
        unsigned int seq;
        u64 nsec;

        WARN_ON(timekeeping_suspended);

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                ts->tv_sec = tk->xtime_sec;
                nsec = timekeeping_get_ns(&tk->tkr_mono);
                tomono = tk->wall_to_monotonic;

        } while (read_seqcount_retry(&tk_core.seq, seq));

        ts->tv_sec += tomono.tv_sec;
        ts->tv_nsec = 0;
        timespec64_add_ns(ts, nsec + tomono.tv_nsec);
}
EXPORT_SYMBOL_GPL(ktime_get_ts64);

/**
 * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
 *
 * Returns the seconds portion of CLOCK_MONOTONIC with a single non
 * serialized read. tk->ktime_sec is of type 'unsigned long' so this
 * works on both 32 and 64 bit systems. On 32 bit systems the readout
 * covers ~136 years of uptime which should be enough to prevent
 * premature wrap arounds.
 */
time64_t ktime_get_seconds(void)
{
        WARN_ON(timekeeping_suspended);

        /* Plain read, deliberately without the sequence counter */
        return tk_core.timekeeper.ktime_sec;
}
EXPORT_SYMBOL_GPL(ktime_get_seconds);

/**
 * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
 *
 * Returns the wall clock seconds since 1970.
 *
 * On 64bit systems tk->xtime_sec is read directly. On 32bit systems
 * the read is wrapped in the sequence counter loop to provide
 * "atomic" access to the 64bit tk->xtime_sec value.
 */
time64_t ktime_get_real_seconds(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqnr;
        time64_t secs;

        /* Fast path: no sequence counter on 64bit */
        if (IS_ENABLED(CONFIG_64BIT))
                return tk->xtime_sec;

        do {
                seqnr = read_seqcount_begin(&tk_core.seq);
                secs = tk->xtime_sec;
        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        return secs;
}
EXPORT_SYMBOL_GPL(ktime_get_real_seconds);

/**
 * __ktime_get_real_seconds - The same as ktime_get_real_seconds
 * but without the sequence counter protect. This internal function
 * is called just when timekeeping lock is already held.
 *
 * Return: the current value of tk_core.timekeeper.xtime_sec.
 */
noinstr time64_t __ktime_get_real_seconds(void)
{
        return tk_core.timekeeper.xtime_sec;
}

/**
 * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter
 * @systime_snapshot:        pointer to struct receiving the system time snapshot
 *
 * Besides the counter value and the two clock readings derived from it, the
 * snapshot records the clocksource id and the cs_was_changed_seq and
 * clock_was_set_seq counters of the timekeeper (WARN once if suspended).
 */
void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqnr;
        ktime_t real_base;
        ktime_t raw_base;
        u64 real_ns;
        u64 raw_ns;
        u64 cycles;

        WARN_ON_ONCE(timekeeping_suspended);

        /* Everything below must stem from one consistent timekeeper state */
        do {
                seqnr = read_seqcount_begin(&tk_core.seq);
                cycles = tk_clock_read(&tk->tkr_mono);
                systime_snapshot->cs_id = tk->tkr_mono.clock->id;
                systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
                systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
                real_base = ktime_add(tk->tkr_mono.base,
                                      tk_core.timekeeper.offs_real);
                raw_base = tk->tkr_raw.base;
                /* Both clocks are converted from the very same counter read */
                real_ns = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles);
                raw_ns  = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles);
        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        systime_snapshot->cycles = cycles;
        systime_snapshot->real = ktime_add_ns(real_base, real_ns);
        systime_snapshot->raw = ktime_add_ns(raw_base, raw_ns);
}
EXPORT_SYMBOL_GPL(ktime_get_snapshot);

/*
 * Scale base by mult/div checking for overflow
 *
 * *base is split into quotient and remainder of the division by @div. Both
 * parts are multiplied by @mult, the remainder part is divided by @div again
 * and the sum is written back to *base.
 *
 * Returns 0 on success, -EOVERFLOW (leaving *base untouched) when either
 * multiplication could exceed 64 bits. @div must not be zero.
 */
static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
{
        u64 quot, rem;
        int headroom;

        quot = div64_u64_rem(*base, div, &rem);

        /* Bits left in a u64 after accounting for the width of mult */
        headroom = (int)sizeof(u64)*8 - fls64(mult);
        if (headroom < fls64(quot) || headroom < fls64(rem))
                return -EOVERFLOW;

        quot *= mult;
        rem = div64_u64(rem * mult, div);

        *base = quot + rem;
        return 0;
}

/**
 * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval
 * @history:                        Snapshot representing start of history
 * @partial_history_cycles:        Cycle offset into history (fractional part)
 * @total_history_cycles:        Total history length in cycles
 * @discontinuity:                True indicates clock was set on history period
 * @ts:                                Cross timestamp that should be adjusted using
 *        partial/total ratio
 *
 * Helper for get_device_system_crosststamp(). On entry @ts holds the
 * crosstimestamp for the start of the current interval; it is corrected to
 * the system counter value (timestamp point) provided by the driver. The
 * total_history_* quantities cover the history from the provided reference
 * point up to the start of the current interval, and partial_history_cycles
 * is the cycle count between the driver timestamp point and the start of
 * the current interval.
 *
 * Return: 0 on success (also when either cycle count is zero and nothing
 * needs adjusting), otherwise the error from scale64_check_overflow().
 */
static int adjust_historical_crosststamp(struct system_time_snapshot *history,
                                         u64 partial_history_cycles,
                                         u64 total_history_cycles,
                                         bool discontinuity,
                                         struct system_device_crosststamp *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 raw_adj, real_adj;
        bool from_start;
        int err;

        if (total_history_cycles == 0 || partial_history_cycles == 0)
                return 0;

        /* Interpolate shortest distance from beginning or end of history */
        from_start = partial_history_cycles > total_history_cycles / 2;
        if (from_start)
                partial_history_cycles = total_history_cycles -
                                         partial_history_cycles;

        /*
         * Scale the monotonic raw time delta by:
         *        partial_history_cycles / total_history_cycles
         */
        raw_adj = (u64)ktime_to_ns(ktime_sub(ts->sys_monoraw, history->raw));
        err = scale64_check_overflow(partial_history_cycles,
                                     total_history_cycles, &raw_adj);
        if (err)
                return err;

        /*
         * Without a discontinuity the realtime correction is calculated
         * the same way as the monotonic raw one. With a discontinuity in
         * the history the monotonic raw correction is scaled by
         *        mult(real)/mult(raw)
         * to yield the realtime correction.
         */
        if (!discontinuity) {
                real_adj = (u64)ktime_to_ns(
                        ktime_sub(ts->sys_realtime, history->real));
                err = scale64_check_overflow(partial_history_cycles,
                                             total_history_cycles, &real_adj);
                if (err)
                        return err;
        } else {
                real_adj = mul_u64_u32_div
                        (raw_adj, tk->tkr_mono.mult, tk->tkr_raw.mult);
        }

        /* Fixup monotonic raw and real time time values */
        if (!from_start) {
                /* Step back from the start of the current interval */
                ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, raw_adj);
                ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, real_adj);
                return 0;
        }

        /* Step forward from the start of the history */
        ts->sys_monoraw = ktime_add_ns(history->raw, raw_adj);
        ts->sys_realtime = ktime_add_ns(history->real, real_adj);

        return 0;
}

/*
 * timestamp_in_interval - true if ts is chronologically in [start, end]
 *
 * True if ts occurs chronologically at or after start, and before or at end.
 * When start > end the interval is taken to wrap around the u64 range, so
 * ts is inside if it lies at or after start or at or before end.
 */
static bool timestamp_in_interval(u64 start, u64 end, u64 ts)
{
        bool at_or_after_start = ts >= start;
        bool at_or_before_end = ts <= end;

        /* Ordinary interval: both bounds have to hold */
        if (start <= end)
                return at_or_after_start && at_or_before_end;

        /* Wrapped interval: one bound is sufficient */
        return at_or_after_start || at_or_before_end;
}

/**
 * get_device_system_crosststamp - Synchronously capture system/device timestamp
 * @get_time_fn:        Callback to get simultaneous device time and
 *        system counter from the device driver
 * @ctx:                Context passed to get_time_fn()
 * @history_begin:        Historical reference point used to interpolate system
 *        time when counter provided by the driver is before the current interval
 * @xtstamp:                Receives simultaneously captured system and device time
 *
 * Reads a timestamp from a device and correlates it to system time
 *
 * Return: 0 on success. A non-zero value returned by @get_time_fn is passed
 * through. -ENODEV if the counter value's clocksource ID is CSID_GENERIC or
 * differs from the ID of the current timekeeper clocksource. -EINVAL if
 * interpolation is required but @history_begin is NULL, the counter value is
 * not within [history_begin->cycles, start of the current interval], or
 * cs_was_changed_seq differs from the one recorded in @history_begin. An
 * error from adjust_historical_crosststamp() is passed through as well.
 */
int get_device_system_crosststamp(int (*get_time_fn)
                                  (ktime_t *device_time,
                                   struct system_counterval_t *sys_counterval,
                                   void *ctx),
                                  void *ctx,
                                  struct system_time_snapshot *history_begin,
                                  struct system_device_crosststamp *xtstamp)
{
        struct system_counterval_t system_counterval;
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 cycles, now, interval_start;
        unsigned int clock_was_set_seq = 0;
        ktime_t base_real, base_raw;
        u64 nsec_real, nsec_raw;
        /* Only assigned, and only read, when do_interp is true */
        u8 cs_was_changed_seq;
        unsigned int seq;
        bool do_interp;
        int ret;

        /*
         * The driver callback runs inside the retry loop, so it is called
         * again whenever the timekeeper was updated concurrently.
         */
        do {
                seq = read_seqcount_begin(&tk_core.seq);
                /*
                 * Try to synchronously capture device time and a system
                 * counter value calling back into the device driver
                 */
                ret = get_time_fn(&xtstamp->device, &system_counterval, ctx);
                if (ret)
                        return ret;

                /*
                 * Verify that the clocksource ID associated with the captured
                 * system counter value is the same as for the currently
                 * installed timekeeper clocksource
                 */
                if (system_counterval.cs_id == CSID_GENERIC ||
                    tk->tkr_mono.clock->id != system_counterval.cs_id)
                        return -ENODEV;
                cycles = system_counterval.cycles;

                /*
                 * Check whether the system counter value provided by the
                 * device driver is on the current timekeeping interval.
                 */
                now = tk_clock_read(&tk->tkr_mono);
                interval_start = tk->tkr_mono.cycle_last;
                if (!timestamp_in_interval(interval_start, now, cycles)) {
                        /*
                         * Outside the current interval: convert the start of
                         * the interval instead and interpolate afterwards.
                         */
                        clock_was_set_seq = tk->clock_was_set_seq;
                        cs_was_changed_seq = tk->cs_was_changed_seq;
                        cycles = interval_start;
                        do_interp = true;
                } else {
                        do_interp = false;
                }

                base_real = ktime_add(tk->tkr_mono.base,
                                      tk_core.timekeeper.offs_real);
                base_raw = tk->tkr_raw.base;

                nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles);
                nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles);
        } while (read_seqcount_retry(&tk_core.seq, seq));

        xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
        xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);

        /*
         * Interpolate if necessary, adjusting back from the start of the
         * current interval
         */
        if (do_interp) {
                u64 partial_history_cycles, total_history_cycles;
                bool discontinuity;

                /*
                 * Check that the counter value is not before the provided
                 * history reference and that the history doesn't cross a
                 * clocksource change
                 */
                if (!history_begin ||
                    !timestamp_in_interval(history_begin->cycles,
                                           cycles, system_counterval.cycles) ||
                    history_begin->cs_was_changed_seq != cs_was_changed_seq)
                        return -EINVAL;
                /* cycles is the start of the current interval at this point */
                partial_history_cycles = cycles - system_counterval.cycles;
                total_history_cycles = cycles - history_begin->cycles;
                /* The clock was set since the history snapshot was taken */
                discontinuity =
                        history_begin->clock_was_set_seq != clock_was_set_seq;

                ret = adjust_historical_crosststamp(history_begin,
                                                    partial_history_cycles,
                                                    total_history_cycles,
                                                    discontinuity, xtstamp);
                if (ret)
                        return ret;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(get_device_system_crosststamp);

/**
 * do_settimeofday64 - Sets the time of day.
 * @ts:     pointer to the timespec64 variable containing the new time
 *
 * Sets the time of day to the new time and update NTP and notify hrtimers
 *
 * Return: 0 on success, -EINVAL if @ts fails timespec64_valid_settod() or
 * if wall_to_monotonic compares greater than the requested time change.
 */
int do_settimeofday64(const struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct timespec64 ts_delta, xt;
        unsigned long flags;
        int ret = 0;

        if (!timespec64_valid_settod(ts))
                return -EINVAL;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        /* Bring the timekeeper up to now before computing the delta */
        timekeeping_forward_now(tk);

        xt = tk_xtime(tk);
        ts_delta = timespec64_sub(*ts, xt);

        if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) {
                ret = -EINVAL;
                goto out;
        }

        /* Shift wall_to_monotonic by the same delta that xtime moves by */
        tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));

        tk_set_xtime(tk, ts);
out:
        /* Reached on the error path too: the time was forwarded above */
        timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        /* Signal hrtimers about time change */
        clock_was_set(CLOCK_SET_WALL);

        /* Audit and feed the entropy pool only when the time was really set */
        if (!ret) {
                audit_tk_injoffset(ts_delta);
                add_device_randomness(ts, sizeof(*ts));
        }

        return ret;
}
EXPORT_SYMBOL(do_settimeofday64);

/**
 * timekeeping_inject_offset - Adds or subtracts from the current time.
 * @ts:                Pointer to the timespec variable containing the offset
 *
 * Adds or subtracts an offset value from the current time.
 *
 * Return: 0 on success, -EINVAL if @ts->tv_nsec is outside
 * [0, NSEC_PER_SEC), if wall_to_monotonic compares greater than @ts, or if
 * the resulting time fails timespec64_valid_settod().
 */
static int timekeeping_inject_offset(const struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned long flags;
        struct timespec64 tmp;
        int ret = 0;

        /* Reject a non-normalized nanoseconds part before taking the lock */
        if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
                return -EINVAL;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        /* Bring the timekeeper up to now before applying the offset */
        timekeeping_forward_now(tk);

        /* Make sure the proposed value is valid */
        tmp = timespec64_add(tk_xtime(tk), *ts);
        if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 ||
            !timespec64_valid_settod(&tmp)) {
                ret = -EINVAL;
                goto error;
        }

        /* Move xtime forward and wall_to_monotonic back by the same offset */
        tk_xtime_add(tk, ts);
        tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts));

error: /* even if we error out, we forwarded the time, so call update */
        timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        /* Signal hrtimers about time change */
        clock_was_set(CLOCK_SET_WALL);

        return ret;
}

/*
 * Indicates if there is an offset between the system clock and the hardware
 * clock/persistent clock/rtc.
 *
 * Set to 1 by timekeeping_warp_clock() when sys_tz.tz_minuteswest is
 * non-zero.
 */
int persistent_clock_is_local;

/*
 * Adjust the time obtained from the CMOS to be UTC time instead of
 * local time.
 *
 * This is ugly, but preferable to the alternatives.  Otherwise we
 * would either need to write a program to do it in /etc/rc (and risk
 * confusion if the program gets run more than once; it would also be
 * hard to make the program warp the clock precisely n hours)  or
 * compile in the timezone information into the kernel.  Bad, bad....
 *
 *                                                - TYT, 1992-01-01
 *
 * The best thing to do is to keep the CMOS clock in universal time (UTC)
 * as real UNIX machines always do it. This avoids all headaches about
 * daylight saving times and warping kernel clocks.
 */
void timekeeping_warp_clock(void)
{
        struct timespec64 adjust;

        /* Nothing to warp when the timezone has no offset */
        if (sys_tz.tz_minuteswest == 0)
                return;

        persistent_clock_is_local = 1;

        /* Inject the timezone offset, converted from minutes to seconds */
        adjust.tv_nsec = 0;
        adjust.tv_sec = sys_tz.tz_minuteswest * 60;
        timekeeping_inject_offset(&adjust);
}

/*
 * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
 * @tk:                timekeeper to update
 * @tai_offset:        new TAI offset in seconds
 *
 * Stores @tai_offset and recomputes offs_tai as offs_real plus that many
 * seconds.
 */
static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
{
        tk->tai_offset = tai_offset;
        tk->offs_tai = ktime_add(tk->offs_real,
                                 ktime_set(tai_offset, 0));
}

/*
 * change_clocksource - Swaps clocksources if a new one is available
 * @data:        the new clocksource, passed as void pointer
 *
 * Accumulates current time interval and initializes new clocksource
 *
 * Invoked through stop_machine() by timekeeping_notify(). Always returns 0;
 * the caller checks tk->tkr_mono.clock to see whether the switch happened.
 */
static int change_clocksource(void *data)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct clocksource *new, *old = NULL;
        unsigned long flags;
        bool change = false;

        new = (struct clocksource *) data;

        /*
         * If the cs is in module, get a module reference. Succeeds
         * for built-in code (owner == NULL) as well.
         */
        if (try_module_get(new->owner)) {
                /* A missing enable callback counts as success */
                if (!new->enable || new->enable(new) == 0)
                        change = true;
                else
                        module_put(new->owner);
        }

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        /* Accumulate with the old clocksource before possibly replacing it */
        timekeeping_forward_now(tk);

        if (change) {
                old = tk->tkr_mono.clock;
                tk_setup_internals(tk, new);
        }

        /* Done whether or not the clocksource changed: time was forwarded */
        timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        /* Disable and release the replaced clocksource outside the lock */
        if (old) {
                if (old->disable)
                        old->disable(old);

                module_put(old->owner);
        }

        return 0;
}

/**
 * timekeeping_notify - Install a new clock source
 * @clock:                pointer to the clock source
 *
 * This function is called from clocksource.c after a new, better clock
 * source has been registered. The caller holds the clocksource_mutex.
 *
 * Return: 0 if @clock is the timekeeper clocksource on return (including
 * the case where it already was on entry), -1 otherwise.
 */
int timekeeping_notify(struct clocksource *clock)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        /* Already installed, nothing to do */
        if (tk->tkr_mono.clock == clock)
                return 0;

        stop_machine(change_clocksource, clock, NULL);
        tick_clock_notify();

        /* change_clocksource() leaves the old clock in place on failure */
        if (tk->tkr_mono.clock != clock)
                return -1;

        return 0;
}

/**
 * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec
 * @ts:                pointer to the timespec64 to be set
 *
 * Returns the raw monotonic time (completely un-modified by ntp): @ts is
 * filled with tk->raw_sec plus the nanoseconds obtained from
 * timekeeping_get_ns() for the raw readout base.
 */
void ktime_get_raw_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqnr;
        u64 delta_ns;

        /* Repeat until both values were read without a concurrent update */
        do {
                seqnr = read_seqcount_begin(&tk_core.seq);
                ts->tv_sec = tk->raw_sec;
                delta_ns = timekeeping_get_ns(&tk->tkr_raw);
        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        /* Start from zero nanoseconds and let the helper add delta_ns */
        ts->tv_nsec = 0;
        timespec64_add_ns(ts, delta_ns);
}
EXPORT_SYMBOL(ktime_get_raw_ts64);


/**
 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
 *
 * Return: the CLOCK_SOURCE_VALID_FOR_HRES bit of the current clocksource's
 * flags, i.e. non-zero if the flag is set and 0 otherwise.
 */
int timekeeping_valid_for_hres(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqnr;
        int hres_flag;

        /* The clocksource pointer can change, so read it under the seqcount */
        do {
                seqnr = read_seqcount_begin(&tk_core.seq);
                hres_flag = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        return hres_flag;
}

/**
 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
 *
 * Return: the max_idle_ns value of the current timekeeper clocksource.
 */
u64 timekeeping_max_deferment(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqnr;
        u64 max_idle;

        /* The clocksource pointer can change, so read it under the seqcount */
        do {
                seqnr = read_seqcount_begin(&tk_core.seq);
                max_idle = tk->tkr_mono.clock->max_idle_ns;
        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        return max_idle;
}

/**
 * read_persistent_clock64 -  Return time from the persistent clock.
 * @ts: Pointer to the storage for the readout value
 *
 * Weak dummy function for arches that do not yet support it.
 * An architecture override reads the time from the battery backed
 * persistent clock; this default reports tv_sec=0 and tv_nsec=0, which
 * stands for "unsupported".
 *
 *  XXX - Do be sure to remove it once all arches implement it.
 */
void __weak read_persistent_clock64(struct timespec64 *ts)
{
        /* All zero: no persistent clock value available */
        ts->tv_nsec = 0;
        ts->tv_sec = 0;
}

/**
 * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset
 *                                        from the boot.
 * @wall_time:          current time as returned by persistent clock
 * @boot_offset:  offset that is defined as wall_time - boot_time
 *
 * Weak dummy function for arches that do not yet support it.
 *
 * The default function calculates offset based on the current value of
 * local_clock(). This way architectures that support sched_clock() but don't
 * support dedicated boot time clock will provide the best estimate of the
 * boot time.
 */
void __weak __init
read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
                                     struct timespec64 *boot_offset)
{
        /* Wall time comes straight from the persistent clock */
        read_persistent_clock64(wall_time);
        /* local_clock() value in nanoseconds, converted to a timespec64 */
        *boot_offset = ns_to_timespec64(local_clock());
}

/*
 * Flag reflecting whether timekeeping_resume() has injected sleeptime.
 *
 * The flag starts off false and is only set when a suspend reaches
 * timekeeping_suspend(). timekeeping_resume() sets it to false when the
 * timekeeper clocksource is not stopping across suspend and has been
 * used to update sleep time. If the timekeeper clocksource has stopped
 * then the flag stays true and is used by the RTC resume code to decide
 * whether sleeptime must be injected; if so, the flag is set to false then.
 *
 * If a suspend fails before reaching timekeeping_resume() then the flag
 * stays false and prevents erroneous sleeptime injection.
 */
static bool suspend_timing_needed;

/*
 * Flag for if there is a persistent clock on this platform. Set by
 * timekeeping_init() when the persistent clock returned a valid, non-zero
 * wall time.
 */
static bool persistent_clock_exists;

/*
 * timekeeping_init - Initializes the clocksource and common timekeeping values
 *
 * Reads the persistent clock and the boot offset and derives the initial
 * wall_to_monotonic value from them. Then, with timekeeper_lock held and
 * inside a tk_core.seq write section, it initializes NTP, enables and
 * installs the default clocksource and sets xtime, raw_sec and
 * wall_to_monotonic before publishing everything via timekeeping_update().
 */
void __init timekeeping_init(void)
{
        struct timespec64 wall_time, boot_offset, wall_to_mono;
        struct timekeeper *tk = &tk_core.timekeeper;
        struct clocksource *clock;
        unsigned long flags;

        read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
        if (timespec64_valid_settod(&wall_time) &&
            timespec64_to_ns(&wall_time) > 0) {
                persistent_clock_exists = true;
        } else if (timespec64_to_ns(&wall_time) != 0) {
                /* Non-zero but unusable: warn and fall back to zero */
                pr_warn("Persistent clock returned invalid value\n");
                wall_time = (struct timespec64){0};
        }

        /* A boot offset beyond the wall time is discarded */
        if (timespec64_compare(&wall_time, &boot_offset) < 0)
                boot_offset = (struct timespec64){0};

        /*
         * We want set wall_to_mono, so the following is true:
         * wall time + wall_to_mono = boot time
         */
        wall_to_mono = timespec64_sub(boot_offset, wall_time);

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);
        ntp_init();

        /* Enable (if the clocksource has a callback) and install the default */
        clock = clocksource_default_clock();
        if (clock->enable)
                clock->enable(clock);
        tk_setup_internals(tk, clock);

        tk_set_xtime(tk, &wall_time);
        tk->raw_sec = 0;

        tk_set_wall_to_mono(tk, wall_to_mono);

        timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}

/*
 * time in seconds when suspend began for persistent clock
 * NOTE(review): stored as a full timespec64; its users are outside this
 * part of the file - confirm how the nanoseconds part is used.
 */
static struct timespec64 timekeeping_suspend_time;

/**
 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
 * @tk:                Pointer to the timekeeper to be updated
 * @delta:        Pointer to the delta value in timespec64 format
 *
 * Takes a timespec offset measuring a suspend interval and properly
 * adds the sleep offset to the timekeeping variables.
 */
static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
                                           const struct timespec64 *delta)
{
        if (!timespec64_valid_strict(delta)) {
                printk_deferred(KERN_WARNING
                                "__timekeeping_inject_sleeptime: Invalid "
                                "sleep delta value!\n");
                return;
        }
        tk_xtime_add(tk, delta);
        tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
        tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
        tk_debug_account_sleep_time(delta);
}

#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
/*
 * We have three kinds of time sources to use for sleep time
 * injection, the preference order is:
 * 1) non-stop clocksource
 * 2) persistent clock (ie: RTC accessible when irqs are off)
 * 3) RTC
 *
 * 1) and 2) are used by timekeeping, 3) by RTC subsystem.
 * If the system has neither 1) nor 2), 3) is used as the last resort.
 *
 * If timekeeping has injected sleeptime via either 1) or 2),
 * 3) becomes unnecessary, so in this case rtc_resume() does not
 * need to be called, and this is what timekeeping_rtc_skipresume()
 * reports.
 *
 * Returns true when suspend_timing_needed is clear, false otherwise.
 */
bool timekeeping_rtc_skipresume(void)
{
        return !suspend_timing_needed;
}

/*
 * Whether 1) can be used is only known in timekeeping_resume(),
 * which is invoked after rtc_suspend(), so rtc_suspend() cannot
 * be skipped just because the system has 1).
 *
 * But if the system has 2), 2) will definitely be used, so in this
 * case rtc_suspend() does not need to be called, and this is what
 * timekeeping_rtc_skipsuspend() reports.
 *
 * Returns the current value of persistent_clock_exists.
 */
bool timekeeping_rtc_skipsuspend(void)
{
        return persistent_clock_exists;
}

/**
 * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values
 * @delta: pointer to a timespec64 delta value
 *
 * This hook is for architectures that cannot support read_persistent_clock64
 * because their RTC/persistent clock is only accessible when irqs are enabled,
 * and that also don't have an effective nonstop clocksource.
 *
 * This function should only be called by rtc_resume(), and allows
 * a suspend offset to be injected into the timekeeping values.
 *
 * The update runs with timekeeper_lock held inside the tk_core.seq write
 * section; clock_was_set() is called after both have been released.
 */
void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned long flags;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        /* Sleep time is injected right here, so clear the pending request */
        suspend_timing_needed = false;

        timekeeping_forward_now(tk);

        __timekeeping_inject_sleeptime(tk, delta);

        timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        /* Signal hrtimers about time change */
        clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT);
}
#endif

/**
 * timekeeping_resume - Resumes the generic timekeeping subsystem.
 *
 * Reads the persistent clock, resumes clockevents and clocksources and then,
 * with timekeeper_lock held inside the tk_core.seq write section, injects the
 * measured sleep time (if any source provided one), re-bases cycle_last and
 * clears timekeeping_suspended. Afterwards the tick and timerfd are resumed.
 */
void timekeeping_resume(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct clocksource *clock = tk->tkr_mono.clock;
        unsigned long flags;
        struct timespec64 ts_new, ts_delta;
        u64 cycle_now, nsec;
        bool inject_sleeptime = false;

        read_persistent_clock64(&ts_new);

        clockevents_resume();
        clocksource_resume();

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        /*
         * After system resumes, we need to calculate the suspended time and
         * compensate it for the OS time. There are 3 sources that could be
         * used: Nonstop clocksource during suspend, persistent clock and rtc
         * device.
         *
         * One specific platform may have 1 or 2 or all of them, and the
         * preference will be:
         *        suspend-nonstop clocksource -> persistent clock -> rtc
         * The less preferred source will only be tried if there is no better
         * usable source. The rtc part is handled separately in rtc core code.
         */
        cycle_now = tk_clock_read(&tk->tkr_mono);
        nsec = clocksource_stop_suspend_timing(clock, cycle_now);
        if (nsec > 0) {
                /* The suspend timing clocksource delivered a sleep interval */
                ts_delta = ns_to_timespec64(nsec);
                inject_sleeptime = true;
        } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
                /* Fall back to the persistent clock delta across suspend */
                ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
                inject_sleeptime = true;
        }

        /* ts_delta is only valid (and only used) when inject_sleeptime is set */
        if (inject_sleeptime) {
                suspend_timing_needed = false;
                __timekeeping_inject_sleeptime(tk, &ts_delta);
        }

        /* Re-base the last cycle value */
        tk->tkr_mono.cycle_last = cycle_now;
        tk->tkr_raw.cycle_last  = cycle_now;

        tk->ntp_error = 0;
        timekeeping_suspended = 0;
        timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        touch_softlockup_watchdog();

        /* Resume the clockevent device(s) and hrtimers */
        tick_resume();
        /* Notify timerfd as resume is equivalent to clock_was_set() */
        timerfd_resume();
}

/**
 * timekeeping_suspend - Suspends the generic timekeeping subsystem.
 *
 * Records the persistent clock value in timekeeping_suspend_time, marks
 * sleep time injection as needed and then, with timekeeper_lock held inside
 * the tk_core.seq write section, forwards the timekeeper, sets
 * timekeeping_suspended, starts suspend timing and halts the fast timekeeper.
 * Afterwards the tick, clocksources and clockevents are suspended.
 *
 * Always returns 0.
 */
int timekeeping_suspend(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned long flags;
        struct timespec64                delta, delta_delta;
        /* xtime minus persistent clock as seen at an earlier suspend */
        static struct timespec64        old_delta;
        struct clocksource *curr_clock;
        u64 cycle_now;

        read_persistent_clock64(&timekeeping_suspend_time);

        /*
         * On some systems the persistent_clock can not be detected at
         * timekeeping_init by its return value, so if we see a valid
         * value returned, update the persistent_clock_exists flag.
         */
        if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
                persistent_clock_exists = true;

        suspend_timing_needed = true;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);
        timekeeping_forward_now(tk);
        timekeeping_suspended = 1;

        /*
         * Since we've called forward_now, cycle_last stores the value
         * just read from the current clocksource. Save this to potentially
         * use in suspend timing.
         */
        curr_clock = tk->tkr_mono.clock;
        cycle_now = tk->tkr_mono.cycle_last;
        clocksource_start_suspend_timing(curr_clock, cycle_now);

        if (persistent_clock_exists) {
                /*
                 * To avoid drift caused by repeated suspend/resumes,
                 * which each can add ~1 second drift error,
                 * try to compensate so the difference in system time
                 * and persistent_clock time stays close to constant.
                 */
                delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
                delta_delta = timespec64_sub(delta, old_delta);
                if (abs(delta_delta.tv_sec) >= 2) {
                        /*
                         * if delta_delta is too large, assume time correction
                         * has occurred and set old_delta to the current delta.
                         */
                        old_delta = delta;
                } else {
                        /*
                         * Otherwise shift the recorded suspend time by
                         * delta_delta to compensate.
                         */
                        timekeeping_suspend_time =
                                timespec64_add(timekeeping_suspend_time, delta_delta);
                }
        }

        timekeeping_update(tk, TK_MIRROR);
        halt_fast_timekeeper(tk);
        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        tick_suspend();
        clocksource_suspend();
        clockevents_suspend();

        return 0;
}

/* syscore suspend/resume callbacks for timekeeping */
static struct syscore_ops timekeeping_syscore_ops = {
        .suspend = timekeeping_suspend,
        .resume = timekeeping_resume,
};

/* Register timekeeping_syscore_ops with the syscore layer; always returns 0 */
static int __init timekeeping_init_ops(void)
{
        register_syscore_ops(&timekeeping_syscore_ops);
        return 0;
}
/* Run the registration as a device initcall */
device_initcall(timekeeping_init_ops);

/*
 * Apply a multiplier adjustment to the timekeeper
 *
 * @offset holds the not yet accumulated cycles and @mult_adj the signed
 * change applied to tkr_mono.mult. A @mult_adj of zero is a no-op.
 */
static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
                                                         s64 offset,
                                                         s32 mult_adj)
{
        s64 interval = tk->cycle_interval;

        /* Scale interval and offset by mult_adj; +1/-1 need no multiply */
        switch (mult_adj) {
        case 0:
                return;
        case 1:
                break;
        case -1:
                interval = -interval;
                offset = -offset;
                break;
        default:
                interval *= mult_adj;
                offset *= mult_adj;
                break;
        }

        /*
         * The reasoning behind the updates below, spelled out for
         * mult_adj == 1. For any other value, interval and offset were
         * scaled above, so the same math applies.
         *
         * Incrementing the multiplier by one increments xtime_interval
         * by one cycle_interval, because:
         *        xtime_interval = cycle_interval * mult
         * and with mult incremented by one:
         *        xtime_interval = cycle_interval * (mult + 1)
         *                       = (cycle_interval * mult) + cycle_interval
         * which is simply:
         *        xtime_interval += cycle_interval
         *
         * offset stores the non-accumulated cycles, so the current
         * time (in shifted nanoseconds) is:
         *        now = (offset * adj) + xtime_nsec
         * Even though the clock frequency is being adjusted, time has to
         * stay consistent: it must not jump back, and jumping forward
         * should be avoided as well.
         *
         * So for the same offset value, the time must be the same both
         * before and after the frequency adjustment:
         *        now = (offset * adj_1) + xtime_nsec_1
         *        now = (offset * adj_2) + xtime_nsec_2
         * Hence:
         *        (offset * adj_1) + xtime_nsec_1 =
         *                (offset * adj_2) + xtime_nsec_2
         * With:
         *        adj_2 = adj_1 + 1
         * this becomes:
         *        (offset * adj_1) + xtime_nsec_1 =
         *                (offset * (adj_1+1)) + xtime_nsec_2
         *        (offset * adj_1) + xtime_nsec_1 =
         *                (offset * adj_1) + offset + xtime_nsec_2
         * Canceling the common term:
         *        xtime_nsec_1 = offset + xtime_nsec_2
         * gives:
         *        xtime_nsec_2 = xtime_nsec_1 - offset
         * i.e.:
         *        xtime_nsec -= offset
         */
        if (mult_adj > 0 && tk->tkr_mono.mult + mult_adj < mult_adj) {
                /* NTP adjustment caused clocksource mult overflow */
                WARN_ON_ONCE(1);
                return;
        }

        tk->tkr_mono.mult += mult_adj;
        tk->xtime_interval += interval;
        tk->tkr_mono.xtime_nsec -= offset;
}

/*
 * Adjust the timekeeper's multiplier to the correct frequency
 * and also to reduce the accumulated error value.
 *
 * @offset is passed through to timekeeping_apply_adjustment().
 */
static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
        u32 target_mult;

        /*
         * Derive the multiplier from the current NTP tick length. The
         * division is only done when the tick length actually changed.
         */
        if (unlikely(tk->ntp_tick != ntp_tick_length())) {
                tk->ntp_tick = ntp_tick_length();
                target_mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) -
                                        tk->xtime_remainder, tk->cycle_interval);
        } else {
                target_mult = tk->tkr_mono.mult - tk->ntp_err_mult;
        }

        /*
         * A clock that is behind NTP time gets its multiplier bumped by 1
         * so it can catch up. If it is ahead and the tick division left a
         * remainder, the clock slows down. Otherwise it stays ahead until
         * the tick length changes to a non-divisible value.
         */
        if (tk->ntp_error > 0)
                tk->ntp_err_mult = 1;
        else
                tk->ntp_err_mult = 0;
        target_mult += tk->ntp_err_mult;

        timekeeping_apply_adjustment(tk, offset,
                                     target_mult - tk->tkr_mono.mult);

        if (unlikely(tk->tkr_mono.clock->maxadj &&
                     (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
                      > tk->tkr_mono.clock->maxadj))) {
                printk_once(KERN_WARNING
                            "Adjusting %s more than 11%% (%ld vs %ld)\n",
                            tk->tkr_mono.clock->name,
                            (long)tk->tkr_mono.mult,
                            (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
        }

        /*
         * xtime_nsec may have been very small on entry. When the clock is
         * sped up slightly above, the corrective factor subtracted from
         * xtime_nsec can make it underflow.
         *
         * The second was already accumulated and the NTP subsystem was
         * already notified via second_overflow(), so the next update has
         * to be skipped.
         */
        if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
                tk->tkr_mono.xtime_nsec +=
                        (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
                tk->xtime_sec--;
                tk->skip_second_overflow = 1;
        }
}

/*
 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
 *
 * Helper function that accumulates the nsecs greater than a second
 * from the xtime_nsec field to the xtime_secs field.
 * It also calls into the NTP code to handle leapsecond processing.
 */
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
        u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
        unsigned int clock_set = 0;

        while (tk->tkr_mono.xtime_nsec >= nsecps) {
                int leap;

                tk->tkr_mono.xtime_nsec -= nsecps;
                tk->xtime_sec++;

                /*
                 * Skip NTP update if this second was accumulated before,
                 * i.e. xtime_nsec underflowed in timekeeping_adjust()
                 */
                if (unlikely(tk->skip_second_overflow)) {
                        tk->skip_second_overflow = 0;
                        continue;
                }

                /* Figure out if its a leap sec and apply if needed */
                leap = second_overflow(tk->xtime_sec);
                if (unlikely(leap)) {
                        struct timespec64 ts;

                        tk->xtime_sec += leap;

                        ts.tv_sec = leap;
                        ts.tv_nsec = 0;
                        tk_set_wall_to_mono(tk,
                                timespec64_sub(tk->wall_to_monotonic, ts));

                        __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);

                        clock_set = TK_CLOCK_WAS_SET;
                }
        }
        return clock_set;
}

/*
 * logarithmic_accumulation - shifted accumulation of cycles
 *
 * Accumulates one chunk of (cycle_interval << @shift) cycles into the
 * shifted nanosecond fields of the timekeeper, provided @offset covers
 * at least one such chunk. This allows for an O(log) accumulation loop
 * in the caller.
 *
 * @clock_set is OR-ed with the result of accumulate_nsecs_to_secs().
 *
 * Returns the unconsumed cycles.
 */
static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
                                    u32 shift, unsigned int *clock_set)
{
        u64 chunk = tk->cycle_interval << shift;
        u64 raw_nsec_per_sec;

        /* Nothing to do if not even one shifted interval fits */
        if (offset < chunk)
                return offset;

        /* Consume one shifted interval worth of cycles */
        offset -= chunk;
        tk->tkr_mono.cycle_last += chunk;
        tk->tkr_raw.cycle_last += chunk;

        /* Monotonic time, including carry into seconds */
        tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
        *clock_set |= accumulate_nsecs_to_secs(tk);

        /* Raw time, including carry into raw seconds */
        tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
        raw_nsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
        while (tk->tkr_raw.xtime_nsec >= raw_nsec_per_sec) {
                tk->tkr_raw.xtime_nsec -= raw_nsec_per_sec;
                tk->raw_sec++;
        }

        /* Track the error between the NTP tick and the clock interval */
        tk->ntp_error += tk->ntp_tick << shift;
        tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
                         (tk->ntp_error_shift + shift);

        return offset;
}

/*
 * timekeeping_advance - Updates the timekeeper to the current time and
 * current NTP tick length
 *
 * All accumulation is done on shadow_timekeeper with timekeeper_lock held;
 * the result is copied into the real timekeeper inside the tk_core.seq write
 * section. Nothing is done while timekeeping is suspended, or in
 * TK_ADV_TICK mode when less than one cycle_interval has elapsed.
 *
 * Returns true if the accumulation set TK_CLOCK_WAS_SET, false otherwise.
 */
static bool timekeeping_advance(enum timekeeping_adv_mode mode)
{
        struct timekeeper *real_tk = &tk_core.timekeeper;
        struct timekeeper *tk = &shadow_timekeeper;
        u64 offset;
        int shift = 0, maxshift;
        unsigned int clock_set = 0;
        unsigned long flags;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);

        /* Make sure we're fully resumed: */
        if (unlikely(timekeeping_suspended))
                goto out;

        /* Cycles elapsed since the last accumulation */
        offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
                                   tk->tkr_mono.cycle_last, tk->tkr_mono.mask);

        /* Check if there's really nothing to do */
        if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
                goto out;

        /* Do some additional sanity checking */
        timekeeping_check_update(tk, offset);

        /*
         * With NO_HZ we may have to accumulate many cycle_intervals
         * (think "ticks") worth of time at once. To do this efficiently,
         * we calculate the largest doubling multiple of cycle_intervals
         * that is smaller than the offset.  We then accumulate that
         * chunk in one go, and then try to consume the next smaller
         * doubled multiple.
         */
        shift = ilog2(offset) - ilog2(tk->cycle_interval);
        shift = max(0, shift);
        /* Bound shift to one less than what overflows tick_length */
        maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
        shift = min(shift, maxshift);
        while (offset >= tk->cycle_interval) {
                offset = logarithmic_accumulation(tk, offset, shift,
                                                        &clock_set);
                /* Step down once the remainder no longer fills this chunk */
                if (offset < tk->cycle_interval<<shift)
                        shift--;
        }

        /* Adjust the multiplier to correct NTP error */
        timekeeping_adjust(tk, offset);

        /*
         * Finally, make sure that after the rounding
         * xtime_nsec isn't larger than NSEC_PER_SEC
         */
        clock_set |= accumulate_nsecs_to_secs(tk);

        write_seqcount_begin(&tk_core.seq);
        /*
         * Update the real timekeeper.
         *
         * We could avoid this memcpy by switching pointers, but that
         * requires changes to all other timekeeper usage sites as
         * well, i.e. move the timekeeper pointer getter into the
         * spinlocked/seqcount protected sections. And we trade this
         * memcpy under the tk_core.seq against one before we start
         * updating.
         */
        timekeeping_update(tk, clock_set);
        memcpy(real_tk, tk, sizeof(*tk));
        /* The memcpy must come last. Do not put anything here! */
        write_seqcount_end(&tk_core.seq);
out:
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        return !!clock_set;
}

/**
 * update_wall_time - Uses the current clocksource to increment the wall time
 *
 * Advances the timekeeper in TK_ADV_TICK mode and, if that reports that the
 * clock was set, triggers the delayed clock-was-set notification.
 */
void update_wall_time(void)
{
        if (!timekeeping_advance(TK_ADV_TICK))
                return;

        clock_was_set_delayed();
}

/**
 * getboottime64 - Return the real time of system boot.
 * @ts:                pointer to the timespec64 to be set
 *
 * Stores the wall-time of boot in @ts.
 *
 * The value is computed as offs_real minus offs_boot, i.e. it is based on
 * the wall_to_monotonic offset and the total suspend time. Calls to
 * settimeofday will affect the value returned (which basically means that
 * however wrong your real time clock is at boot time, you get the right
 * time here).
 */
void getboottime64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        *ts = ktime_to_timespec64(ktime_sub(tk->offs_real, tk->offs_boot));
}
EXPORT_SYMBOL_GPL(getboottime64);

void ktime_get_coarse_real_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;

        do {
                seq = read_seqcount_begin(&tk_core.seq);

                *ts = tk_xtime(tk);
        } while (read_seqcount_retry(&tk_core.seq, seq));
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);

void ktime_get_coarse_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct timespec64 now, mono;
        unsigned int seq;

        do {
                seq = read_seqcount_begin(&tk_core.seq);

                now = tk_xtime(tk);
                mono = tk->wall_to_monotonic;
        } while (read_seqcount_retry(&tk_core.seq, seq));

        set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
                                now.tv_nsec + mono.tv_nsec);
}
EXPORT_SYMBOL(ktime_get_coarse_ts64);

/*
 * do_timer - Advance jiffies_64 by @ticks and update the global load
 *
 * Must hold jiffies_lock
 */
void do_timer(unsigned long ticks)
{
        jiffies_64 += ticks;
        calc_global_load();
}

/**
 * ktime_get_update_offsets_now - hrtimer helper
 * @cwsseq:        pointer to check and store the clock was set sequence number
 * @offs_real:        pointer to storage for monotonic -> realtime offset
 * @offs_boot:        pointer to storage for monotonic -> boottime offset
 * @offs_tai:        pointer to storage for monotonic -> clock tai offset
 *
 * Returns current monotonic time. The offsets are only refreshed when the
 * sequence number in @cwsseq differs from timekeeper.clock_was_set_seq, in
 * which case @cwsseq is updated as well.
 *
 * Called from hrtimer_interrupt() or retrigger_next_event()
 */
ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
                                     ktime_t *offs_boot, ktime_t *offs_tai)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        ktime_t now;
        u64 nsecs;

        do {
                seq = read_seqcount_begin(&tk_core.seq);

                nsecs = timekeeping_get_ns(&tk->tkr_mono);
                now = ktime_add_ns(tk->tkr_mono.base, nsecs);

                /* Refresh the caller's offsets only if the clock was set */
                if (tk->clock_was_set_seq != *cwsseq) {
                        *cwsseq = tk->clock_was_set_seq;
                        *offs_real = tk->offs_real;
                        *offs_boot = tk->offs_boot;
                        *offs_tai = tk->offs_tai;
                }

                /* Handle leapsecond insertion adjustments */
                if (unlikely(now >= tk->next_leap_ktime))
                        *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));

        } while (read_seqcount_retry(&tk_core.seq, seq));

        return now;
}

/*
 * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
 */
static int timekeeping_validate_timex(const struct __kernel_timex *txc)
{
        if (txc->modes & ADJ_ADJTIME) {
                /* singleshot must not be used with any other mode bits */
                if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
                        return -EINVAL;
                if (!(txc->modes & ADJ_OFFSET_READONLY) &&
                    !capable(CAP_SYS_TIME))
                        return -EPERM;
        } else {
                /* In order to modify anything, you gotta be super-user! */
                if (txc->modes && !capable(CAP_SYS_TIME))
                        return -EPERM;
                /*
                 * if the quartz is off by more than 10% then
                 * something is VERY wrong!
                 */
                if (txc->modes & ADJ_TICK &&
                    (txc->tick <  900000/USER_HZ ||
                     txc->tick > 1100000/USER_HZ))
                        return -EINVAL;
        }

        if (txc->modes & ADJ_SETOFFSET) {
                /* In order to inject time, you gotta be super-user! */
                if (!capable(CAP_SYS_TIME))
                        return -EPERM;

                /*
                 * Validate if a timespec/timeval used to inject a time
                 * offset is valid.  Offsets can be positive or negative, so
                 * we don't check tv_sec. The value of the timeval/timespec
                 * is the sum of its fields,but *NOTE*:
                 * The field tv_usec/tv_nsec must always be non-negative and
                 * we can't have more nanoseconds/microseconds than a second.
                 */
                if (txc->time.tv_usec < 0)
                        return -EINVAL;

                if (txc->modes & ADJ_NANO) {
                        if (txc->time.tv_usec >= NSEC_PER_SEC)
                                return -EINVAL;
                } else {
                        if (txc->time.tv_usec >= USEC_PER_SEC)
                                return -EINVAL;
                }
        }

        /*
         * Check for potential multiplication overflows that can
         * only happen on 64-bit systems:
         */
        if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
                if (LLONG_MIN / PPM_SCALE > txc->freq)
                        return -EINVAL;
                if (LLONG_MAX / PPM_SCALE < txc->freq)
                        return -EINVAL;
        }

        return 0;
}

/**
 * random_get_entropy_fallback - Returns the raw clock source value,
 * used by random.c for platforms with no valid random_get_entropy().
 *
 * Returns 0 while timekeeping is suspended or when no clocksource is
 * installed yet.
 */
unsigned long random_get_entropy_fallback(void)
{
        struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono;
        struct clocksource *clock = READ_ONCE(tkr->clock);

        if (likely(!timekeeping_suspended && clock))
                return clock->read(clock);

        return 0;
}
EXPORT_SYMBOL_GPL(random_get_entropy_fallback);

/**
 * do_adjtimex() - Accessor function to NTP __do_adjtimex function
 */
int do_adjtimex(struct __kernel_timex *txc)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct audit_ntp_data ad;
        bool clock_set = false;
        struct timespec64 ts;
        unsigned long flags;
        s32 orig_tai, tai;
        int ret;

        /* Validate the data before disabling interrupts */
        ret = timekeeping_validate_timex(txc);
        if (ret)
                return ret;
        add_device_randomness(txc, sizeof(*txc));

        if (txc->modes & ADJ_SETOFFSET) {
                struct timespec64 delta;
                delta.tv_sec  = txc->time.tv_sec;
                delta.tv_nsec = txc->time.tv_usec;
                if (!(txc->modes & ADJ_NANO))
                        delta.tv_nsec *= 1000;
                ret = timekeeping_inject_offset(&delta);
                if (ret)
                        return ret;

                audit_tk_injoffset(delta);
        }

        audit_ntp_init(&ad);

        ktime_get_real_ts64(&ts);
        add_device_randomness(&ts, sizeof(ts));

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        orig_tai = tai = tk->tai_offset;
        ret = __do_adjtimex(txc, &ts, &tai, &ad);

        if (tai != orig_tai) {
                __timekeeping_set_tai_offset(tk, tai);
                timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
                clock_set = true;
        }
        tk_update_leap_state(tk);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        audit_ntp_log(&ad);

        /* Update the multiplier immediately if frequency was set directly */
        if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
                clock_set |= timekeeping_advance(TK_ADV_FREQ);

        if (clock_set)
                clock_was_set(CLOCK_REALTIME);

        ntp_notify_cmos_timer();

        return ret;
}

#ifdef CONFIG_NTP_PPS
/**
 * hardpps() - Accessor function to NTP __hardpps function
 * @phase_ts:        passed through unchanged to __hardpps()
 * @raw_ts:        passed through unchanged to __hardpps()
 *
 * Calls __hardpps() with timekeeper_lock held inside the tk_core.seq write
 * section.
 */
void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        __hardpps(phase_ts, raw_ts);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
EXPORT_SYMBOL(hardpps);
#endif /* CONFIG_NTP_PPS */

































    3 

































































    2 







    3 



    1 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * connection tracking event cache.
 */

#ifndef _NF_CONNTRACK_ECACHE_H
#define _NF_CONNTRACK_ECACHE_H
#include <net/netfilter/nf_conntrack.h>

#include <net/net_namespace.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_conntrack_tuple_common.h>
#include <net/netfilter/nf_conntrack_extend.h>

/* Delivery state of the destroy event */
enum nf_ct_ecache_state {
        NFCT_ECACHE_DESTROY_FAIL,        /* tried but failed to send destroy event */
        NFCT_ECACHE_DESTROY_SENT,        /* sent destroy event after failure */
};

/* Event cache data; obtained for a conntrack via nf_ct_ecache_find() */
struct nf_conntrack_ecache {
        unsigned long cache;                /* bitops want long */
        u16 ctmask;                        /* bitmask of ct events to be delivered */
        u16 expmask;                        /* bitmask of expect events to be delivered */
        u32 missed;                        /* missed events */
        u32 portid;                        /* netlink portid of destroyer */
};

/*
 * Look up the NF_CT_EXT_ECACHE extension of @ct.
 *
 * Returns the result of nf_ct_ext_find(), or always NULL when
 * CONFIG_NF_CONNTRACK_EVENTS is not set.
 */
static inline struct nf_conntrack_ecache *
nf_ct_ecache_find(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
        return nf_ct_ext_find(ct, NF_CT_EXT_ECACHE);
#else
        return NULL;
#endif
}

/*
 * Report whether @ct has the NF_CT_EXT_ECACHE extension.
 *
 * Returns the result of nf_ct_ext_exist(), or always false when
 * CONFIG_NF_CONNTRACK_EVENTS is not set.
 */
static inline bool nf_ct_ecache_exist(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
        return nf_ct_ext_exist(ct, NF_CT_EXT_ECACHE);
#else
        return false;
#endif
}

#ifdef CONFIG_NF_CONNTRACK_EVENTS

/*
 * This structure is passed to the conntrack event handler
 * (the ct_event callback of struct nf_ct_event_notifier).
 */
struct nf_ct_event {
        struct nf_conn *ct;
        u32 portid;
        int report;
};

/*
 * Expectation counterpart of struct nf_ct_event; passed to the exp_event
 * callback of struct nf_ct_event_notifier.
 */
struct nf_exp_event {
        struct nf_conntrack_expect *exp;
        u32 portid;
        int report;
};

/*
 * Pair of event callbacks handed to nf_conntrack_register_notifier(): one for
 * conntrack events, one for expectation events.  Each receives an @events
 * word (presumably a bitmask, cf. the "1 << event" callers in this header)
 * and the item the events refer to.
 */
struct nf_ct_event_notifier {
        int (*ct_event)(unsigned int events, const struct nf_ct_event *item);
        int (*exp_event)(unsigned int events, const struct nf_exp_event *item);
};

/*
 * Event API available when CONFIG_NF_CONNTRACK_EVENTS is set.  The functions
 * are implemented outside this header.
 * NOTE(review): the one-line summaries below are inferred from the names and
 * signatures only; confirm against the implementation.
 */

/* Notifier registration for a network namespace. */
void nf_conntrack_register_notifier(struct net *net,
                                   const struct nf_ct_event_notifier *nb);
void nf_conntrack_unregister_notifier(struct net *net);

/* Event delivery for a conntrack entry. */
void nf_ct_deliver_cached_events(struct nf_conn *ct);
int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
                                  u32 portid, int report);

/* Adds the event cache extension to @ct; the stub variant returns false. */
bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp);
#else

/* Stub used when CONFIG_NF_CONNTRACK_EVENTS is not set: does nothing. */
static inline void
nf_ct_deliver_cached_events(const struct nf_conn *ct)
{
}

/*
 * Stub used when CONFIG_NF_CONNTRACK_EVENTS is not set: nothing is reported
 * and the result is always 0.
 */
static inline int
nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
                              u32 portid, int report)
{
        return 0;
}

/*
 * Stub used when CONFIG_NF_CONNTRACK_EVENTS is not set: no extension is
 * added and the result is always false.
 */
static inline bool
nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
{
        return false;
}
#endif

/*
 * Record @event in the event cache of @ct by setting the matching bit.
 *
 * Nothing happens when events are compiled out, when the network namespace
 * of @ct has no event callback pointer set, or when @ct has no event cache
 * extension.
 */
static inline void
nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
        struct nf_conntrack_ecache *ecache;
        struct net *netns = nf_ct_net(ct);

        if (!rcu_access_pointer(netns->ct.nf_conntrack_event_cb))
                return;

        ecache = nf_ct_ecache_find(ct);
        if (ecache)
                set_bit(event, &ecache->cache);
#endif
}

/*
 * Report a single @event for @ct, passing @portid and @report through to
 * nf_conntrack_eventmask_report().
 *
 * Returns 0 when events are compiled out or @ct has no event cache
 * extension; otherwise returns whatever nf_conntrack_eventmask_report()
 * returns.
 */
static inline int
nf_conntrack_event_report(enum ip_conntrack_events event, struct nf_conn *ct,
                          u32 portid, int report)
{
        int ret = 0;

#ifdef CONFIG_NF_CONNTRACK_EVENTS
        if (nf_ct_ecache_exist(ct))
                ret = nf_conntrack_eventmask_report(1 << event, ct,
                                                    portid, report);
#endif
        return ret;
}

/*
 * Report a single @event for @ct with portid 0 and report 0.
 *
 * Returns 0 when events are compiled out or @ct has no event cache
 * extension; otherwise returns whatever nf_conntrack_eventmask_report()
 * returns.
 */
static inline int
nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct)
{
        int ret = 0;

#ifdef CONFIG_NF_CONNTRACK_EVENTS
        if (nf_ct_ecache_exist(ct))
                ret = nf_conntrack_eventmask_report(1 << event, ct, 0, 0);
#endif
        return ret;
}

#ifdef CONFIG_NF_CONNTRACK_EVENTS
/*
 * Remaining event API available when CONFIG_NF_CONNTRACK_EVENTS is set; the
 * functions are implemented outside this header.  The #else branch provides
 * do-nothing inline stubs for all of them except nf_conn_pernet_ecache().
 * NOTE(review): the one-line summaries below are inferred from the names and
 * signatures only; confirm against the implementation.
 */

/* Expectation counterpart of nf_conntrack_event_report(). */
void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
                               struct nf_conntrack_expect *exp,
                               u32 portid, int report);

/* Takes one of the enum nf_ct_ecache_state values for @net. */
void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state);

/* Per network namespace setup and teardown. */
void nf_conntrack_ecache_pernet_init(struct net *net);
void nf_conntrack_ecache_pernet_fini(struct net *net);

/* Accessor for the per network namespace ecache state. */
struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net);

/* Return the ecache_dwork_pending flag stored in @net's conntrack state. */
static inline bool
nf_conntrack_ecache_dwork_pending(const struct net *net)
{
        bool pending = net->ct.ecache_dwork_pending;

        return pending;
}
#else /* CONFIG_NF_CONNTRACK_EVENTS */

/* Stub used when CONFIG_NF_CONNTRACK_EVENTS is not set: does nothing. */
static inline void
nf_ct_expect_event_report(enum ip_conntrack_expect_events e,
                          struct nf_conntrack_expect *exp,
                          u32 portid, int report)
{
}

/* Stub used when CONFIG_NF_CONNTRACK_EVENTS is not set: does nothing. */
static inline void
nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state s)
{
}

/* Stub used when CONFIG_NF_CONNTRACK_EVENTS is not set: nothing to set up. */
static inline void
nf_conntrack_ecache_pernet_init(struct net *net)
{
}

/* Stub used when CONFIG_NF_CONNTRACK_EVENTS is not set: nothing to tear down. */
static inline void
nf_conntrack_ecache_pernet_fini(struct net *net)
{
}
/* Stub used when CONFIG_NF_CONNTRACK_EVENTS is not set: always false. */
static inline bool
nf_conntrack_ecache_dwork_pending(const struct net *net)
{
        return false;
}
#endif /* CONFIG_NF_CONNTRACK_EVENTS */
#endif /*_NF_CONNTRACK_ECACHE_H*/

































    9 


















    4 





















































































    2 






















































    1 




























    1 




































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM timer

#if !defined(_TRACE_TIMER_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TIMER_H

#include <linux/tracepoint.h>
#include <linux/hrtimer.h>
#include <linux/timer.h>

/*
 * timer_class - template for timer events that only identify the timer
 * @timer:        pointer to struct timer_list
 *
 * Only the pointer value is stored and printed ("timer=%p"); the timer is
 * not dereferenced.
 */
DECLARE_EVENT_CLASS(timer_class,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer),

        TP_STRUCT__entry(
                __field( void *,        timer        )
        ),

        TP_fast_assign(
                __entry->timer        = timer;
        ),

        TP_printk("timer=%p", __entry->timer)
);

/**
 * timer_init - called when the timer is initialized
 * @timer:        pointer to struct timer_list
 *
 * Instance of timer_class: only the timer pointer is recorded.
 */
DEFINE_EVENT(timer_class, timer_init,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/*
 * Print the timer flag bits set in @flags as single-letter names separated
 * by "|": M (TIMER_MIGRATING), D (TIMER_DEFERRABLE), P (TIMER_PINNED) and
 * I (TIMER_IRQSAFE).
 */
#define decode_timer_flags(flags)                        \
        __print_flags(flags, "|",                        \
                {  TIMER_MIGRATING,        "M" },                \
                {  TIMER_DEFERRABLE,        "D" },                \
                {  TIMER_PINNED,        "P" },                \
                {  TIMER_IRQSAFE,        "I" })

/**
 * timer_start - called when the timer is started
 * @timer:                pointer to struct timer_list
 * @bucket_expiry:        the bucket expiry time
 *
 * Records the timer's callback, expiry time and flags together with the
 * current jiffies value.  In the printed line "timeout" is expires - now,
 * while "cpu" and "idx" are taken from the flags word using TIMER_CPUMASK
 * and TIMER_ARRAYSHIFT.
 */
TRACE_EVENT(timer_start,

        TP_PROTO(struct timer_list *timer,
                unsigned long bucket_expiry),

        TP_ARGS(timer, bucket_expiry),

        TP_STRUCT__entry(
                __field( void *,        timer                )
                __field( void *,        function        )
                __field( unsigned long,        expires                )
                __field( unsigned long,        bucket_expiry        )
                __field( unsigned long,        now                )
                __field( unsigned int,        flags                )
        ),

        TP_fast_assign(
                __entry->timer                = timer;
                __entry->function        = timer->function;
                __entry->expires        = timer->expires;
                __entry->bucket_expiry        = bucket_expiry;
                __entry->now                = jiffies;
                __entry->flags                = timer->flags;
        ),

        TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] bucket_expiry=%lu cpu=%u idx=%u flags=%s",
                  __entry->timer, __entry->function, __entry->expires,
                  (long)__entry->expires - __entry->now,
                  __entry->bucket_expiry, __entry->flags & TIMER_CPUMASK,
                  __entry->flags >> TIMER_ARRAYSHIFT,
                  decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK))
);

/**
 * timer_expire_entry - called immediately before the timer callback
 * @timer:        pointer to struct timer_list
 * @baseclk:        value of timer_base::clk when timer expires
 *
 * Allows the timer latency to be determined.  The current jiffies value is
 * recorded as "now" next to @baseclk, and the callback is read from
 * timer->function.
 */
TRACE_EVENT(timer_expire_entry,

        TP_PROTO(struct timer_list *timer, unsigned long baseclk),

        TP_ARGS(timer, baseclk),

        TP_STRUCT__entry(
                __field( void *,        timer        )
                __field( unsigned long,        now        )
                __field( void *,        function)
                __field( unsigned long,        baseclk        )
        ),

        TP_fast_assign(
                __entry->timer                = timer;
                __entry->now                = jiffies;
                __entry->function        = timer->function;
                __entry->baseclk        = baseclk;
        ),

        TP_printk("timer=%p function=%ps now=%lu baseclk=%lu",
                  __entry->timer, __entry->function, __entry->now,
                  __entry->baseclk)
);

/**
 * timer_expire_exit - called immediately after the timer callback returns
 * @timer:        pointer to struct timer_list
 *
 * When used in combination with the timer_expire_entry tracepoint we can
 * determine the runtime of the timer callback function.
 *
 * NOTE: Do NOT dereference timer in TP_fast_assign. The pointer might
 * be invalid. We solely track the pointer.  Being an instance of timer_class,
 * this event stores nothing but the pointer value.
 */
DEFINE_EVENT(timer_class, timer_expire_exit,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/**
 * timer_cancel - called when the timer is canceled
 * @timer:        pointer to struct timer_list
 *
 * Instance of timer_class: only the timer pointer is recorded.
 */
DEFINE_EVENT(timer_class, timer_cancel,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/*
 * timer_base_idle - records an is_idle flag together with a CPU number
 * @is_idle:        flag value supplied by the caller
 * @cpu:        CPU number supplied by the caller
 *
 * NOTE(review): the call sites are not visible from this header; presumably
 * emitted when a timer base's idle state changes - confirm.
 */
TRACE_EVENT(timer_base_idle,

        TP_PROTO(bool is_idle, unsigned int cpu),

        TP_ARGS(is_idle, cpu),

        TP_STRUCT__entry(
                __field( bool,                is_idle        )
                __field( unsigned int,        cpu        )
        ),

        TP_fast_assign(
                __entry->is_idle        = is_idle;
                __entry->cpu                = cpu;
        ),

        TP_printk("is_idle=%d cpu=%d",
                  __entry->is_idle, __entry->cpu)
);

/* Print a clock id symbolically; the four clocks listed here are covered. */
#define decode_clockid(type)                                                \
        __print_symbolic(type,                                                \
                { CLOCK_REALTIME,        "CLOCK_REALTIME"        },        \
                { CLOCK_MONOTONIC,        "CLOCK_MONOTONIC"        },        \
                { CLOCK_BOOTTIME,        "CLOCK_BOOTTIME"        },        \
                { CLOCK_TAI,                "CLOCK_TAI"                })

/*
 * Print an hrtimer mode symbolically.  Each listed HRTIMER_MODE_* value maps
 * to one fixed string such as "ABS|PINNED|SOFT"; the value is matched as a
 * whole, not decoded bit by bit.
 */
#define decode_hrtimer_mode(mode)                                        \
        __print_symbolic(mode,                                                \
                { HRTIMER_MODE_ABS,                "ABS"                },        \
                { HRTIMER_MODE_REL,                "REL"                },        \
                { HRTIMER_MODE_ABS_PINNED,        "ABS|PINNED"        },        \
                { HRTIMER_MODE_REL_PINNED,        "REL|PINNED"        },        \
                { HRTIMER_MODE_ABS_SOFT,        "ABS|SOFT"        },        \
                { HRTIMER_MODE_REL_SOFT,        "REL|SOFT"        },        \
                { HRTIMER_MODE_ABS_PINNED_SOFT,        "ABS|PINNED|SOFT" },        \
                { HRTIMER_MODE_REL_PINNED_SOFT,        "REL|PINNED|SOFT" },        \
                { HRTIMER_MODE_ABS_HARD,        "ABS|HARD" },                \
                { HRTIMER_MODE_REL_HARD,        "REL|HARD" },                \
                { HRTIMER_MODE_ABS_PINNED_HARD, "ABS|PINNED|HARD" },        \
                { HRTIMER_MODE_REL_PINNED_HARD,        "REL|PINNED|HARD" })

/**
 * hrtimer_init - called when the hrtimer is initialized
 * @hrtimer:        pointer to struct hrtimer
 * @clockid:        the hrtimer's clock
 * @mode:        the hrtimer's mode
 *
 * The hrtimer itself is recorded by pointer only; @clockid and @mode are
 * printed symbolically via decode_clockid() and decode_hrtimer_mode().
 */
TRACE_EVENT(hrtimer_init,

        TP_PROTO(struct hrtimer *hrtimer, clockid_t clockid,
                 enum hrtimer_mode mode),

        TP_ARGS(hrtimer, clockid, mode),

        TP_STRUCT__entry(
                __field( void *,                hrtimer                )
                __field( clockid_t,                clockid                )
                __field( enum hrtimer_mode,        mode                )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->clockid        = clockid;
                __entry->mode                = mode;
        ),

        TP_printk("hrtimer=%p clockid=%s mode=%s", __entry->hrtimer,
                  decode_clockid(__entry->clockid),
                  decode_hrtimer_mode(__entry->mode))
);

/**
 * hrtimer_start - called when the hrtimer is started
 * @hrtimer:        pointer to struct hrtimer
 * @mode:        the hrtimer's mode
 *
 * Records the callback together with the values returned by
 * hrtimer_get_expires() and hrtimer_get_softexpires().  Both are stored as
 * s64 and printed as unsigned 64-bit numbers.
 */
TRACE_EVENT(hrtimer_start,

        TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode),

        TP_ARGS(hrtimer, mode),

        TP_STRUCT__entry(
                __field( void *,        hrtimer                )
                __field( void *,        function        )
                __field( s64,                expires                )
                __field( s64,                softexpires        )
                __field( enum hrtimer_mode,        mode        )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->function        = hrtimer->function;
                __entry->expires        = hrtimer_get_expires(hrtimer);
                __entry->softexpires        = hrtimer_get_softexpires(hrtimer);
                __entry->mode                = mode;
        ),

        TP_printk("hrtimer=%p function=%ps expires=%llu softexpires=%llu "
                  "mode=%s", __entry->hrtimer, __entry->function,
                  (unsigned long long) __entry->expires,
                  (unsigned long long) __entry->softexpires,
                  decode_hrtimer_mode(__entry->mode))
);

/**
 * hrtimer_expire_entry - called immediately before the hrtimer callback
 * @hrtimer:        pointer to struct hrtimer
 * @now:        pointer to variable which contains current time of the
 *                timers base.
 *
 * Allows the timer latency to be determined.  @now is dereferenced when the
 * event is recorded, and the callback is read from hrtimer->function.
 */
TRACE_EVENT(hrtimer_expire_entry,

        TP_PROTO(struct hrtimer *hrtimer, ktime_t *now),

        TP_ARGS(hrtimer, now),

        TP_STRUCT__entry(
                __field( void *,        hrtimer        )
                __field( s64,                now        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->now                = *now;
                __entry->function        = hrtimer->function;
        ),

        TP_printk("hrtimer=%p function=%ps now=%llu",
                  __entry->hrtimer, __entry->function,
                  (unsigned long long) __entry->now)
);

/*
 * hrtimer_class - template for hrtimer events that only identify the hrtimer
 * @hrtimer:        pointer to struct hrtimer
 *
 * Only the pointer value is stored and printed ("hrtimer=%p"); the hrtimer
 * is not dereferenced.
 */
DECLARE_EVENT_CLASS(hrtimer_class,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer),

        TP_STRUCT__entry(
                __field( void *,        hrtimer        )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
        ),

        TP_printk("hrtimer=%p", __entry->hrtimer)
);

/**
 * hrtimer_expire_exit - called immediately after the hrtimer callback returns
 * @hrtimer:        pointer to struct hrtimer
 *
 * When used in combination with the hrtimer_expire_entry tracepoint we can
 * determine the runtime of the callback function.
 *
 * Instance of hrtimer_class: only the hrtimer pointer is recorded.
 */
DEFINE_EVENT(hrtimer_class, hrtimer_expire_exit,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer)
);

/**
 * hrtimer_cancel - called when the hrtimer is canceled
 * @hrtimer:        pointer to struct hrtimer
 *
 * Instance of hrtimer_class: only the hrtimer pointer is recorded.
 */
DEFINE_EVENT(hrtimer_class, hrtimer_cancel,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer)
);

/**
 * itimer_state - called when itimer is started or canceled
 * @which:        name of the interval timer
 * @value:        the itimer's value, itimer is canceled if value->it_value is
 *                zero, otherwise it is started
 * @expires:        the itimer's expiry time
 *
 * The seconds and nanoseconds of it_value and it_interval are stored in
 * fields of type long.  They are printed as "seconds.microseconds": the
 * nanosecond part is divided by NSEC_PER_USEC for output.
 */
TRACE_EVENT(itimer_state,

        TP_PROTO(int which, const struct itimerspec64 *const value,
                 unsigned long long expires),

        TP_ARGS(which, value, expires),

        TP_STRUCT__entry(
                __field(        int,                        which                )
                __field(        unsigned long long,        expires                )
                __field(        long,                        value_sec        )
                __field(        long,                        value_nsec        )
                __field(        long,                        interval_sec        )
                __field(        long,                        interval_nsec        )
        ),

        TP_fast_assign(
                __entry->which                = which;
                __entry->expires        = expires;
                __entry->value_sec        = value->it_value.tv_sec;
                __entry->value_nsec        = value->it_value.tv_nsec;
                __entry->interval_sec        = value->it_interval.tv_sec;
                __entry->interval_nsec        = value->it_interval.tv_nsec;
        ),

        TP_printk("which=%d expires=%llu it_value=%ld.%06ld it_interval=%ld.%06ld",
                  __entry->which, __entry->expires,
                  __entry->value_sec, __entry->value_nsec / NSEC_PER_USEC,
                  __entry->interval_sec, __entry->interval_nsec / NSEC_PER_USEC)
);

/**
 * itimer_expire - called when itimer expires
 * @which:        type of the interval timer
 * @pid:        pid of the process which owns the timer
 * @now:        current time, used to calculate the latency of itimer
 *
 * The struct pid is converted to a number with pid_nr() when the event is
 * recorded, so only the numeric pid is stored.
 */
TRACE_EVENT(itimer_expire,

        TP_PROTO(int which, struct pid *pid, unsigned long long now),

        TP_ARGS(which, pid, now),

        TP_STRUCT__entry(
                __field( int ,                        which        )
                __field( pid_t,                        pid        )
                __field( unsigned long long,        now        )
        ),

        TP_fast_assign(
                __entry->which        = which;
                __entry->now        = now;
                __entry->pid        = pid_nr(pid);
        ),

        TP_printk("which=%d pid=%d now=%llu", __entry->which,
                  (int) __entry->pid, __entry->now)
);

#ifdef CONFIG_NO_HZ_COMMON

/*
 * X-macro list of tick dependency names.  It is expanded twice below with
 * different definitions of tick_dep_name(), tick_dep_mask_name() and
 * tick_dep_name_end(): first to export the enum values, then to build the
 * value/name table used by show_tick_dep_name().
 */
#define TICK_DEP_NAMES                                        \
                tick_dep_mask_name(NONE)                \
                tick_dep_name(POSIX_TIMER)                \
                tick_dep_name(PERF_EVENTS)                \
                tick_dep_name(SCHED)                        \
                tick_dep_name(CLOCK_UNSTABLE)                \
                tick_dep_name(RCU)                        \
                tick_dep_name_end(RCU_EXP)

#undef tick_dep_name
#undef tick_dep_mask_name
#undef tick_dep_name_end

/*
 * First expansion: emit TRACE_DEFINE_ENUM() for every entry.
 * The MASK will convert to their bits and they need to be processed too
 */
#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \
        TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
#define tick_dep_name_end(sdep)  TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \
        TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
/* NONE only has a mask defined for it */
#define tick_dep_mask_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);

TICK_DEP_NAMES

#undef tick_dep_name
#undef tick_dep_mask_name
#undef tick_dep_name_end

/*
 * Second expansion: each entry becomes a { TICK_DEP_MASK_<name>, "<name>" }
 * initializer; the _end variant omits the trailing comma so the list can
 * close the __print_symbolic() argument list.
 */
#define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
#define tick_dep_mask_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
#define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep }

/* Print a TICK_DEP_MASK_* value by name. */
#define show_tick_dep_name(val)                                \
        __print_symbolic(val, TICK_DEP_NAMES)

/*
 * tick_stop - records a success flag and a tick dependency
 * @success:        value supplied by the caller, printed as an integer
 * @dependency:        printed by name through show_tick_dep_name(), so it is
 *                matched against the TICK_DEP_MASK_* values
 *
 * NOTE(review): the call sites are not visible from this header; presumably
 * emitted when stopping the tick is attempted - confirm.
 *
 * The trailing backslash on the first TP_printk() line is only a line splice
 * outside of any #define; it joins that line with the next one.
 */
TRACE_EVENT(tick_stop,

        TP_PROTO(int success, int dependency),

        TP_ARGS(success, dependency),

        TP_STRUCT__entry(
                __field( int ,                success        )
                __field( int ,                dependency )
        ),

        TP_fast_assign(
                __entry->success        = success;
                __entry->dependency        = dependency;
        ),

        TP_printk("success=%d dependency=%s",  __entry->success, \
                        show_tick_dep_name(__entry->dependency))
);
#endif

#endif /*  _TRACE_TIMER_H */

/* This part must be outside protection */
#include <trace/define_trace.h>

























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *                INETPEER - A storage for permanent information about peers
 *
 *  Authors:        Andrey V. Savochkin <saw@msu.ru>
 */

#ifndef _NET_INETPEER_H
#define _NET_INETPEER_H

#include <linux/types.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/spinlock.h>
#include <linux/rtnetlink.h>
#include <net/ipv6.h>
#include <linux/atomic.h>

/*
 * IPv4 address key for cache lookups: the address plus a vif number.
 * inetpeer_set_addr_v4() stores vif 0; inet_getpeer_v4() takes the vif from
 * its caller.
 */
struct ipv4_addr_key {
        __be32        addr;
        int        vif;
};

/* Number of u32 words in a struct in6_addr; sizes inetpeer_addr::key[]. */
#define INETPEER_MAXKEYSZ   (sizeof(struct in6_addr) / sizeof(u32))

/*
 * Peer address for either address family.  The union overlays the IPv4 key
 * (@a4), the IPv6 address (@a6) and a u32 word view (@key) of the same
 * storage; inetpeer_addr_cmp() compares through @key.  @family is set to
 * AF_INET or AF_INET6 by the helpers in this header.
 */
struct inetpeer_addr {
        union {
                struct ipv4_addr_key        a4;
                struct in6_addr                a6;
                u32                        key[INETPEER_MAXKEYSZ];
        };
        __u16                                family;
};

/*
 * One peer entry, identified by @daddr.
 * NOTE(review): @rb_node presumably links the entry into
 * inet_peer_base::rb_root - confirm against the implementation.
 */
struct inet_peer {
        struct rb_node                rb_node;
        struct inetpeer_addr        daddr;

        u32                        metrics[RTAX_MAX];
        u32                        rate_tokens;        /* rate limiting for ICMP */
        u32                        n_redirects;
        unsigned long                rate_last;
        /*
         * Once inet_peer is queued for deletion (refcnt == 0), following field
         * is not available: rid
         * We can share memory with rcu_head to help keep inet_peer small.
         */
        union {
                struct {
                        atomic_t                        rid;                /* Frag reception counter */
                };
                struct rcu_head         rcu;
        };

        /* following fields might be frequently dirtied */
        __u32                        dtime;        /* the time of last use of not referenced entries */
        refcount_t                refcnt;
};

/*
 * Container handed to inet_getpeer() and inetpeer_invalidate_tree(): an
 * rb-tree root, a seqlock and a counter.
 * NOTE(review): @total presumably counts the entries in the tree and @lock
 * presumably protects it - confirm against the implementation.
 */
struct inet_peer_base {
        struct rb_root                rb_root;
        seqlock_t                lock;
        int                        total;
};

/* Initializer for a struct inet_peer_base; implemented outside this header. */
void inet_peer_base_init(struct inet_peer_base *);

/* Init-time (__init) setup hook; implemented outside this header. */
void inet_initpeers(void) __init;

/*
 * All-ones u32 value.
 * NOTE(review): presumably a marker for not-yet-populated metrics; no user
 * is visible in this header - confirm.
 */
#define INETPEER_METRICS_NEW        (~(u32) 0)

/* Turn @iaddr into an AF_INET key for address @ip with vif 0. */
static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip)
{
        struct ipv4_addr_key *v4 = &iaddr->a4;

        iaddr->family = AF_INET;
        v4->vif = 0;
        v4->addr = ip;
}

static inline __be32 inetpeer_get_addr_v4(struct inetpeer_addr *iaddr)
{
        return iaddr->a4.addr;
}

/* Turn @iaddr into an AF_INET6 key holding a copy of *@in6. */
static inline void inetpeer_set_addr_v6(struct inetpeer_addr *iaddr,
                                        struct in6_addr *in6)
{
        struct in6_addr *dst = &iaddr->a6;

        *dst = *in6;
        iaddr->family = AF_INET6;
}

/*
 * Return a pointer to the IPv6 address stored inside @iaddr (no copy is
 * made and the family is not checked).
 */
static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr)
{
        struct in6_addr *v6 = &iaddr->a6;

        return v6;
}

/*
 * Peer lookup keyed by @daddr; implemented outside this header and wrapped
 * by inet_getpeer_v4() and inet_getpeer_v6() below.
 * NOTE(review): @create presumably requests creation of a missing entry -
 * confirm against the implementation.
 *
 * can be called with or without local BH being disabled
 */
struct inet_peer *inet_getpeer(struct inet_peer_base *base,
                               const struct inetpeer_addr *daddr,
                               int create);

/*
 * IPv4 front end for inet_getpeer(): builds an AF_INET key from @v4daddr
 * and @vif on the stack and passes it on together with @base and @create.
 */
static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base,
                                                __be32 v4daddr,
                                                int vif, int create)
{
        struct inetpeer_addr key;

        key.family = AF_INET;
        key.a4.vif = vif;
        key.a4.addr = v4daddr;

        return inet_getpeer(base, &key, create);
}

static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base,
                                                const struct in6_addr *v6daddr,
                                                int create)
{
        struct inetpeer_addr daddr;

        daddr.a6 = *v6daddr;
        daddr.family = AF_INET6;
        return inet_getpeer(base, &daddr, create);
}

static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a,
                                    const struct inetpeer_addr *b)
{
        int i, n;

        if (a->family == AF_INET)
                n = sizeof(a->a4) / sizeof(u32);
        else
                n = sizeof(a->a6) / sizeof(u32);

        for (i = 0; i < n; i++) {
                if (a->key[i] == b->key[i])
                        continue;
                if (a->key[i] < b->key[i])
                        return -1;
                return 1;
        }

        return 0;
}

/*
 * Remaining peer API; all three functions are implemented outside this
 * header.
 * NOTE(review): inet_putpeer() presumably drops a reference obtained through
 * inet_getpeer() - confirm against the implementation.
 */
/* can be called from BH context or outside */
void inet_putpeer(struct inet_peer *p);
bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout);

void inetpeer_invalidate_tree(struct inet_peer_base *);

#endif /* _NET_INETPEER_H */













































































































































    2 



































































































    4 

























































    7 





    6 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM sock

#if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SOCK_H

#include <net/sock.h>
#include <net/ipv6.h>
#include <linux/tracepoint.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <trace/events/net_probe_common.h>

/*
 * Address families known to the tracepoints, as an EM()/EMe() list; EMe()
 * marks the last entry.
 */
#define family_names                        \
                EM(AF_INET)                                \
                EMe(AF_INET6)

/*
 * The protocol traced by inet_sock_set_state, as an EM()/EMe() list; EMe()
 * marks the last entry.
 */
#define inet_protocol_names                \
                EM(IPPROTO_TCP)                        \
                EM(IPPROTO_DCCP)                \
                EM(IPPROTO_SCTP)                \
                EMe(IPPROTO_MPTCP)

/* TCP state constants, as an EM()/EMe() list; EMe() marks the last entry. */
#define tcp_state_names                        \
                EM(TCP_ESTABLISHED)                \
                EM(TCP_SYN_SENT)                \
                EM(TCP_SYN_RECV)                \
                EM(TCP_FIN_WAIT1)                \
                EM(TCP_FIN_WAIT2)                \
                EM(TCP_TIME_WAIT)                \
                EM(TCP_CLOSE)                        \
                EM(TCP_CLOSE_WAIT)                \
                EM(TCP_LAST_ACK)                \
                EM(TCP_LISTEN)                        \
                EM(TCP_CLOSING)                        \
                EMe(TCP_NEW_SYN_RECV)

/*
 * Socket memory kinds (send or receive), as an EM()/EMe() list; EMe() marks
 * the last entry.
 */
#define skmem_kind_names                        \
                EM(SK_MEM_SEND)                        \
                EMe(SK_MEM_RECV)

/*
 * enums need to be exported to user space
 *
 * First pass: with EM()/EMe() defined as TRACE_DEFINE_ENUM(), expanding the
 * name lists emits one TRACE_DEFINE_ENUM() per constant.
 */
#undef EM
#undef EMe
#define EM(a)       TRACE_DEFINE_ENUM(a);
#define EMe(a)      TRACE_DEFINE_ENUM(a);

family_names
inet_protocol_names
tcp_state_names
skmem_kind_names

/*
 * Second pass: redefine EM()/EMe() so that each list expands to
 * { value, "name" } initializers for __print_symbolic().  EMe() omits the
 * trailing comma because it ends the list.
 */
#undef EM
#undef EMe
#define EM(a)       { a, #a },
#define EMe(a)      { a, #a }

/*
 * Helpers for TP_printk(): print @val by name using the matching EM()/EMe()
 * list, which at this point expands to { value, "name" } pairs.
 */
#define show_family_name(val)                        \
        __print_symbolic(val, family_names)

#define show_inet_protocol_name(val)    \
        __print_symbolic(val, inet_protocol_names)

#define show_tcp_state_name(val)        \
        __print_symbolic(val, tcp_state_names)

#define show_skmem_kind_names(val)        \
        __print_symbolic(val, skmem_kind_names)

/*
 * sock_rcvqueue_full - snapshot of receive-buffer accounting
 * @sk:  socket whose receive accounting is recorded
 * @skb: buffer whose truesize is recorded
 *
 * Records sk->sk_rmem_alloc, skb->truesize and sk->sk_rcvbuf.
 * NOTE(review): the name suggests this fires when an skb cannot be queued
 * because the receive queue is full; the call site is not visible here.
 */
TRACE_EVENT(sock_rcvqueue_full,

        TP_PROTO(struct sock *sk, struct sk_buff *skb),

        TP_ARGS(sk, skb),

        TP_STRUCT__entry(
                __field(int, rmem_alloc)
                __field(unsigned int, truesize)
                __field(int, sk_rcvbuf)
        ),

        TP_fast_assign(
                __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
                __entry->truesize   = skb->truesize;
                /* READ_ONCE: single, untorn read of the rcvbuf limit */
                __entry->sk_rcvbuf  = READ_ONCE(sk->sk_rcvbuf);
        ),

        TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d",
                __entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf)
);

/*
 * sock_exceed_buf_limit - snapshot of protocol and socket memory accounting
 * @sk:        socket being accounted
 * @prot:      protocol whose name and sysctl_mem[] values are recorded
 * @allocated: caller-supplied allocation count, recorded as is
 * @kind:      SK_MEM_SEND or SK_MEM_RECV (printed via skmem_kind_names)
 *
 * NOTE(review): the name suggests this fires when a memory limit is
 * exceeded; the call site is not visible here.
 */
TRACE_EVENT(sock_exceed_buf_limit,

        TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind),

        TP_ARGS(sk, prot, allocated, kind),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __array(long, sysctl_mem, 3)
                __field(long, allocated)
                __field(int, sysctl_rmem)
                __field(int, rmem_alloc)
                __field(int, sysctl_wmem)
                __field(int, wmem_alloc)
                __field(int, wmem_queued)
                __field(int, kind)
        ),

        TP_fast_assign(
                /* 32 matches the size of the name[] array declared above */
                strscpy(__entry->name, prot->name, 32);
                __entry->sysctl_mem[0] = READ_ONCE(prot->sysctl_mem[0]);
                __entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]);
                __entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]);
                __entry->allocated = allocated;
                __entry->sysctl_rmem = sk_get_rmem0(sk, prot);
                __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
                __entry->sysctl_wmem = sk_get_wmem0(sk, prot);
                __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc);
                __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued);
                __entry->kind = kind;
        ),

        TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s",
                __entry->name,
                __entry->sysctl_mem[0],
                __entry->sysctl_mem[1],
                __entry->sysctl_mem[2],
                __entry->allocated,
                __entry->sysctl_rmem,
                __entry->rmem_alloc,
                __entry->sysctl_wmem,
                __entry->wmem_alloc,
                __entry->wmem_queued,
                show_skmem_kind_names(__entry->kind)
        )
);

/*
 * inet_sock_set_state - record a state change of an inet socket
 * @sk:       socket changing state
 * @oldstate: state before the change
 * @newstate: state after the change
 *
 * Records the socket address, both states, the ports converted to host
 * byte order, family, protocol and the IPv4/IPv6 source and destination
 * addresses.  skaddr is stored in the entry but not printed by TP_printk().
 */
TRACE_EVENT(inet_sock_set_state,

        TP_PROTO(const struct sock *sk, const int oldstate, const int newstate),

        TP_ARGS(sk, oldstate, newstate),

        TP_STRUCT__entry(
                __field(const void *, skaddr)
                __field(int, oldstate)
                __field(int, newstate)
                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u16, family)
                __field(__u16, protocol)
                __array(__u8, saddr, 4)
                __array(__u8, daddr, 4)
                __array(__u8, saddr_v6, 16)
                __array(__u8, daddr_v6, 16)
        ),

        TP_fast_assign(
                const struct inet_sock *inet = inet_sk(sk);
                __be32 *p32;

                __entry->skaddr = sk;
                __entry->oldstate = oldstate;
                __entry->newstate = newstate;

                __entry->family = sk->sk_family;
                __entry->protocol = sk->sk_protocol;
                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);

                /* Store the IPv4 addresses unconverted; %pI4 prints the bytes */
                p32 = (__be32 *) __entry->saddr;
                *p32 = inet->inet_saddr;

                p32 = (__be32 *) __entry->daddr;
                *p32 =  inet->inet_daddr;

                /*
                 * Fills saddr_v6/daddr_v6; presumably provided by
                 * <trace/events/net_probe_common.h> - confirm.
                 */
                TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
                               sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
        ),

        TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s",
                        show_family_name(__entry->family),
                        show_inet_protocol_name(__entry->protocol),
                        __entry->sport, __entry->dport,
                        __entry->saddr, __entry->daddr,
                        __entry->saddr_v6, __entry->daddr_v6,
                        show_tcp_state_name(__entry->oldstate),
                        show_tcp_state_name(__entry->newstate))
);

/*
 * inet_sk_error_report - record the pending error of an inet socket
 * @sk: socket whose sk_err is recorded
 *
 * Records sk->sk_err together with the ports (host byte order), family,
 * protocol and the IPv4/IPv6 source and destination addresses.
 */
TRACE_EVENT(inet_sk_error_report,

        TP_PROTO(const struct sock *sk),

        TP_ARGS(sk),

        TP_STRUCT__entry(
                __field(int, error)
                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u16, family)
                __field(__u16, protocol)
                __array(__u8, saddr, 4)
                __array(__u8, daddr, 4)
                __array(__u8, saddr_v6, 16)
                __array(__u8, daddr_v6, 16)
        ),

        TP_fast_assign(
                const struct inet_sock *inet = inet_sk(sk);
                __be32 *p32;

                __entry->error = sk->sk_err;
                __entry->family = sk->sk_family;
                __entry->protocol = sk->sk_protocol;
                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);

                /* Store the IPv4 addresses unconverted; %pI4 prints the bytes */
                p32 = (__be32 *) __entry->saddr;
                *p32 = inet->inet_saddr;

                p32 = (__be32 *) __entry->daddr;
                *p32 =  inet->inet_daddr;

                /*
                 * Fills saddr_v6/daddr_v6; presumably provided by
                 * <trace/events/net_probe_common.h> - confirm.
                 */
                TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
                               sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
        ),

        TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d",
                  show_family_name(__entry->family),
                  show_inet_protocol_name(__entry->protocol),
                  __entry->sport, __entry->dport,
                  __entry->saddr, __entry->daddr,
                  __entry->saddr_v6, __entry->daddr_v6,
                  __entry->error)
);

/*
 * sk_data_ready - record who invoked a socket's data-ready path
 * @sk: socket being signalled
 *
 * Records the socket address, family, protocol and _RET_IP_ (the return
 * address of the function containing the tracepoint), which is printed as
 * a symbol via %ps.  Family and protocol are printed numerically here;
 * skaddr is stored but not printed.
 */
TRACE_EVENT(sk_data_ready,

        TP_PROTO(const struct sock *sk),

        TP_ARGS(sk),

        TP_STRUCT__entry(
                __field(const void *, skaddr)
                __field(__u16, family)
                __field(__u16, protocol)
                __field(unsigned long, ip)
        ),

        TP_fast_assign(
                __entry->skaddr = sk;
                __entry->family = sk->sk_family;
                __entry->protocol = sk->sk_protocol;
                __entry->ip = _RET_IP_;
        ),

        TP_printk("family=%u protocol=%u func=%ps",
                  __entry->family, __entry->protocol, (void *)__entry->ip)
);

/*
 * sock send/recv msg length
 *
 * Event class shared by sock_send_length and sock_recv_length.
 * @sk:    socket the message was sent on or received from
 * @ret:   result of the operation; a positive value is reported as the
 *         length, a negative value as the error
 * @flags: message flags; when MSG_PEEK is set the length is printed as 0
 */
DECLARE_EVENT_CLASS(sock_msg_length,

        TP_PROTO(struct sock *sk, int ret, int flags),

        TP_ARGS(sk, ret, flags),

        TP_STRUCT__entry(
                __field(void *, sk)
                __field(__u16, family)
                __field(__u16, protocol)
                __field(int, ret)
                __field(int, flags)
        ),

        TP_fast_assign(
                __entry->sk = sk;
                __entry->family = sk->sk_family;
                __entry->protocol = sk->sk_protocol;
                __entry->ret = ret;
                __entry->flags = flags;
        ),

        /*
         * length: ret if positive and MSG_PEEK is not set, otherwise 0.
         * error:  ret if negative, otherwise 0.
         */
        TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x",
                  __entry->sk, show_family_name(__entry->family),
                  show_inet_protocol_name(__entry->protocol),
                  !(__entry->flags & MSG_PEEK) ?
                  (__entry->ret > 0 ? __entry->ret : 0) : 0,
                  __entry->ret < 0 ? __entry->ret : 0,
                  __entry->flags)
);

/* sock_send_length: instance of the sock_msg_length class (same fields and format). */
DEFINE_EVENT(sock_msg_length, sock_send_length,
        TP_PROTO(struct sock *sk, int ret, int flags),

        TP_ARGS(sk, ret, flags)
);

/* sock_recv_length: instance of the sock_msg_length class (same fields and format). */
DEFINE_EVENT(sock_msg_length, sock_recv_length,
        TP_PROTO(struct sock *sk, int ret, int flags),

        TP_ARGS(sk, ret, flags)
);
#endif /* _TRACE_SOCK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>










































































































































    1 





    1 

















































    2 



    2 



    1 








    1 




























    1 











    1 






    1 




    1 


























































































































































    1 

    1 























    1 



    1 






















































































































    1 















































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Integrity Measurement Architecture
 *
 * Copyright (C) 2005,2006,2007,2008 IBM Corporation
 *
 * Authors:
 * Reiner Sailer <sailer@watson.ibm.com>
 * Serge Hallyn <serue@us.ibm.com>
 * Kylene Hall <kylene@us.ibm.com>
 * Mimi Zohar <zohar@us.ibm.com>
 *
 * File: ima_main.c
 *        implements the IMA hooks: ima_bprm_check, ima_file_mmap,
 *        and ima_file_check.
 */

#include <linux/module.h>
#include <linux/file.h>
#include <linux/binfmts.h>
#include <linux/kernel_read_file.h>
#include <linux/mount.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include <linux/ima.h>
#include <linux/fs.h>
#include <linux/iversion.h>
#include <linux/evm.h>

#include "ima.h"

/*
 * Appraisal mode flags.  Defaults to enforcing when IMA appraisal is built
 * in; otherwise zero-initialized.
 */
#ifdef CONFIG_IMA_APPRAISE
int ima_appraise = IMA_APPRAISE_ENFORCE;
#else
int ima_appraise;
#endif

/* Hash algorithm in use; SHA1 unless changed by the "ima_hash=" boot option. */
int __ro_after_init ima_hash_algo = HASH_ALGO_SHA1;
/* Set to 1 once hash_setup() has accepted an algorithm; later calls are no-ops. */
static int hash_setup_done;

/* Notifier block whose callback is ima_lsm_policy_change(). */
static struct notifier_block ima_lsm_policy_notifier = {
        .notifier_call = ima_lsm_policy_change,
};

/*
 * hash_setup - parse the "ima_hash=" boot parameter
 * @str: algorithm name given on the command line
 *
 * Only the first successfully parsed value takes effect.  With the "ima"
 * template only names starting with "sha1" or "md5" are accepted; with any
 * other template the name is looked up in hash_algo_name[].  An invalid
 * name is reported and leaves ima_hash_algo untouched.
 *
 * Always returns 1.
 */
static int __init hash_setup(char *str)
{
        struct ima_template_desc *desc = ima_template_desc_current();
        int algo;

        if (hash_setup_done)
                return 1;

        if (strcmp(desc->name, IMA_TEMPLATE_IMA_NAME) != 0) {
                /* Any template but "ima": accept every known algorithm. */
                algo = match_string(hash_algo_name, HASH_ALGO__LAST, str);
                if (algo < 0) {
                        pr_err("invalid hash algorithm \"%s\"", str);
                        return 1;
                }
        } else if (strncmp(str, "sha1", 4) == 0) {
                algo = HASH_ALGO_SHA1;
        } else if (strncmp(str, "md5", 3) == 0) {
                algo = HASH_ALGO_MD5;
        } else {
                pr_err("invalid hash algorithm \"%s\" for template \"%s\"",
                        str, IMA_TEMPLATE_IMA_NAME);
                return 1;
        }

        ima_hash_algo = algo;
        hash_setup_done = 1;
        return 1;
}
__setup("ima_hash=", hash_setup);

/* Return the hash algorithm currently stored in ima_hash_algo. */
enum hash_algo ima_get_current_hash_algo(void)
{
        return ima_hash_algo;
}

/*
 * Prevent mmap'ing a file execute that is already mmap'ed write.
 *
 * Applies only to the MMAP_CHECK and MMAP_CHECK_REQPROT hooks.  When the
 * file's mapping is writably mapped, an audit record is emitted and
 * -ETXTBSY is returned; otherwise 0.  The pathname is resolved here only
 * if the caller has not already done so (*pathbuf still NULL).
 */
static int mmap_violation_check(enum ima_hooks func, struct file *file,
                                char **pathbuf, const char **pathname,
                                char *filename)
{
        const int err = -ETXTBSY;

        if (func != MMAP_CHECK && func != MMAP_CHECK_REQPROT)
                return 0;
        if (!mapping_writably_mapped(file->f_mapping))
                return 0;

        /* ima_rdwr_violation possibly pre-fetched */
        if (!*pathbuf)
                *pathname = ima_d_path(&file->f_path, pathbuf, filename);

        integrity_audit_msg(AUDIT_INTEGRITY_DATA, file_inode(file), *pathname,
                            "mmap_file", "mmapped_writers", err, 0);
        return err;
}

/*
 * ima_rdwr_violation_check
 *
 * Only invalidate the PCR for measured files:
 *        - Opening a file for write when already open for read,
 *          results in a time of measure, time of use (ToMToU) error.
 *        - Opening a file for read when already open for write,
 *          could result in a file measurement error.
 *
 */
static void ima_rdwr_violation_check(struct file *file,
                                     struct ima_iint_cache *iint,
                                     int must_measure,
                                     char **pathbuf,
                                     const char **pathname,
                                     char *filename)
{
        struct inode *inode = file_inode(file);
        fmode_t mode = file->f_mode;
        bool send_tomtou = false, send_writers = false;

        if (mode & FMODE_WRITE) {
                if (atomic_read(&inode->i_readcount) && IS_IMA(inode)) {
                        if (!iint)
                                iint = ima_iint_find(inode);
                        /* IMA_MEASURE is set from reader side */
                        if (iint && test_bit(IMA_MUST_MEASURE,
                                                &iint->atomic_flags))
                                send_tomtou = true;
                }
        } else {
                if (must_measure)
                        set_bit(IMA_MUST_MEASURE, &iint->atomic_flags);
                if (inode_is_open_for_write(inode) && must_measure)
                        send_writers = true;
        }

        if (!send_tomtou && !send_writers)
                return;

        *pathname = ima_d_path(&file->f_path, pathbuf, filename);

        if (send_tomtou)
                ima_add_violation(file, *pathname, iint,
                                  "invalid_pcr", "ToMToU");
        if (send_writers)
                ima_add_violation(file, *pathname, iint,
                                  "invalid_pcr", "open_writers");
}

/*
 * ima_check_last_writer - reset cached integrity state on last writer close
 * @iint:  integrity cache entry of @inode
 * @inode: inode backing @file
 * @file:  file being released
 *
 * Does nothing unless @file was opened for write and it is the only
 * remaining writer (i_writecount == 1).  In that case the cached results
 * are dropped when the file is new, when the change cookie cannot be
 * obtained, or when it differs from the version stored in @iint; the xattr
 * is rewritten if an update was pending.
 */
static void ima_check_last_writer(struct ima_iint_cache *iint,
                                  struct inode *inode, struct file *file)
{
        struct kstat stat;
        bool update, stale;

        if (!(file->f_mode & FMODE_WRITE))
                return;

        mutex_lock(&iint->mutex);
        if (atomic_read(&inode->i_writecount) != 1)
                goto unlock;

        update = test_and_clear_bit(IMA_UPDATE_XATTR, &iint->atomic_flags);

        /* Short-circuit order matters: stat is valid only after getattr succeeds. */
        stale = (iint->flags & IMA_NEW_FILE) ||
                vfs_getattr_nosec(&file->f_path, &stat,
                                  STATX_CHANGE_COOKIE,
                                  AT_STATX_SYNC_AS_STAT) ||
                !(stat.result_mask & STATX_CHANGE_COOKIE) ||
                stat.change_cookie != iint->real_inode.version;
        if (stale) {
                iint->flags &= ~(IMA_DONE_MASK | IMA_NEW_FILE);
                iint->measured_pcrs = 0;
                if (update)
                        ima_update_xattr(iint, file);
        }

unlock:
        mutex_unlock(&iint->mutex);
}

/**
 * ima_file_free - called on __fput()
 * @file: pointer to file structure being freed
 *
 * Flag files that changed, based on i_version.  Only regular files that
 * already have an integrity cache entry are considered, and only while an
 * IMA policy is active.
 */
static void ima_file_free(struct file *file)
{
        struct inode *inode = file_inode(file);
        struct ima_iint_cache *iint;

        if (ima_policy_flag && S_ISREG(inode->i_mode)) {
                iint = ima_iint_find(inode);
                if (iint)
                        ima_check_last_writer(iint, inode, file);
        }
}

/*
 * process_measurement - common worker behind the IMA file hooks
 * @file: file being accessed
 * @cred: credentials used for the policy lookup
 * @secid: LSM secid used for the policy lookup
 * @buf:  optional in-memory copy of the file contents (may be NULL)
 * @size: size of @buf
 * @mask: MAY_* access mask
 * @func: hook on whose behalf the file is processed
 *
 * Looks up the policy action for @file, checks for read/write violations,
 * and then, under iint->mutex, collects the measurement and performs the
 * store/appraise/audit steps that are still outstanding for this inode.
 *
 * Return: 0, or -EACCES when an appraise rule applies, processing failed
 * and appraisal is in enforcing mode.  Other internal error codes are not
 * propagated to the caller.
 */
static int process_measurement(struct file *file, const struct cred *cred,
                               u32 secid, char *buf, loff_t size, int mask,
                               enum ima_hooks func)
{
        struct inode *real_inode, *inode = file_inode(file);
        struct ima_iint_cache *iint = NULL;
        struct ima_template_desc *template_desc = NULL;
        struct inode *metadata_inode;
        char *pathbuf = NULL;
        char filename[NAME_MAX];
        const char *pathname = NULL;
        int rc = 0, action, must_appraise = 0;
        int pcr = CONFIG_IMA_MEASURE_PCR_IDX;
        struct evm_ima_xattr_data *xattr_value = NULL;
        struct modsig *modsig = NULL;
        int xattr_len = 0;
        bool violation_check;
        enum hash_algo hash_algo;
        unsigned int allowed_algos = 0;

        if (!ima_policy_flag || !S_ISREG(inode->i_mode))
                return 0;

        /* Return an IMA_MEASURE, IMA_APPRAISE, IMA_AUDIT action
         * bitmask based on the appraise/audit/measurement policy.
         * Included is the appraise submask.
         */
        action = ima_get_action(file_mnt_idmap(file), inode, cred, secid,
                                mask, func, &pcr, &template_desc, NULL,
                                &allowed_algos);
        violation_check = ((func == FILE_CHECK || func == MMAP_CHECK ||
                            func == MMAP_CHECK_REQPROT) &&
                           (ima_policy_flag & IMA_MEASURE));
        if (!action && !violation_check)
                return 0;

        must_appraise = action & IMA_APPRAISE;

        /*  Is the appraise rule hook specific?  */
        if (action & IMA_FILE_APPRAISE)
                func = FILE_CHECK;

        inode_lock(inode);

        if (action) {
                iint = ima_inode_get(inode);
                if (!iint)
                        rc = -ENOMEM;
        }

        if (!rc && violation_check)
                ima_rdwr_violation_check(file, iint, action & IMA_MEASURE,
                                         &pathbuf, &pathname, filename);

        inode_unlock(inode);

        if (rc)
                goto out;
        if (!action)
                goto out;

        mutex_lock(&iint->mutex);

        if (test_and_clear_bit(IMA_CHANGE_ATTR, &iint->atomic_flags))
                /* reset appraisal flags if ima_inode_post_setattr was called */
                iint->flags &= ~(IMA_APPRAISE | IMA_APPRAISED |
                                 IMA_APPRAISE_SUBMASK | IMA_APPRAISED_SUBMASK |
                                 IMA_NONACTION_FLAGS);

        /*
         * Re-evaluate the file if either the xattr has changed or the
         * kernel has no way of detecting file change on the filesystem.
         * (Limited to privileged mounted filesystems.)
         */
        if (test_and_clear_bit(IMA_CHANGE_XATTR, &iint->atomic_flags) ||
            ((inode->i_sb->s_iflags & SB_I_IMA_UNVERIFIABLE_SIGNATURE) &&
             !(inode->i_sb->s_iflags & SB_I_UNTRUSTED_MOUNTER) &&
             !(action & IMA_FAIL_UNVERIFIABLE_SIGS))) {
                iint->flags &= ~IMA_DONE_MASK;
                iint->measured_pcrs = 0;
        }

        /*
         * On stacked filesystems, detect and re-evaluate file data and
         * metadata changes.
         */
        real_inode = d_real_inode(file_dentry(file));
        if (real_inode != inode &&
            (action & IMA_DO_MASK) && (iint->flags & IMA_DONE_MASK)) {
                if (!IS_I_VERSION(real_inode) ||
                    integrity_inode_attrs_changed(&iint->real_inode,
                                                  real_inode)) {
                        iint->flags &= ~IMA_DONE_MASK;
                        iint->measured_pcrs = 0;
                }

                /*
                 * Reset the EVM status when metadata changed.
                 */
                metadata_inode = d_inode(d_real(file_dentry(file),
                                         D_REAL_METADATA));
                if (evm_metadata_changed(inode, metadata_inode))
                        iint->flags &= ~(IMA_APPRAISED |
                                         IMA_APPRAISED_SUBMASK);
        }

        /* Determine if already appraised/measured based on bitmask
         * (IMA_MEASURE, IMA_MEASURED, IMA_XXXX_APPRAISE, IMA_XXXX_APPRAISED,
         *  IMA_AUDIT, IMA_AUDITED)
         */
        iint->flags |= action;
        action &= IMA_DO_MASK;
        action &= ~((iint->flags & (IMA_DONE_MASK ^ IMA_MEASURED)) >> 1);

        /* If target pcr is already measured, unset IMA_MEASURE action */
        if ((action & IMA_MEASURE) && (iint->measured_pcrs & (0x1 << pcr)))
                action ^= IMA_MEASURE;

        /* HASH sets the digital signature and update flags, nothing else */
        if ((action & IMA_HASH) &&
            !(test_bit(IMA_DIGSIG, &iint->atomic_flags))) {
                xattr_len = ima_read_xattr(file_dentry(file),
                                           &xattr_value, xattr_len);
                if ((xattr_value && xattr_len > 2) &&
                    (xattr_value->type == EVM_IMA_XATTR_DIGSIG))
                        set_bit(IMA_DIGSIG, &iint->atomic_flags);
                iint->flags |= IMA_HASHED;
                action ^= IMA_HASH;
                set_bit(IMA_UPDATE_XATTR, &iint->atomic_flags);
        }

        /* Nothing to do, just return existing appraised status */
        if (!action) {
                if (must_appraise) {
                        rc = mmap_violation_check(func, file, &pathbuf,
                                                  &pathname, filename);
                        if (!rc)
                                rc = ima_get_cache_status(iint, func);
                }
                goto out_locked;
        }

        if ((action & IMA_APPRAISE_SUBMASK) ||
            strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) != 0) {
                /* read 'security.ima' */
                xattr_len = ima_read_xattr(file_dentry(file),
                                           &xattr_value, xattr_len);

                /*
                 * Read the appended modsig if allowed by the policy, and allow
                 * an additional measurement list entry, if needed, based on the
                 * template format and whether the file was already measured.
                 */
                if (iint->flags & IMA_MODSIG_ALLOWED) {
                        rc = ima_read_modsig(func, buf, size, &modsig);

                        if (!rc && ima_template_has_modsig(template_desc) &&
                            iint->flags & IMA_MEASURED)
                                action |= IMA_MEASURE;
                }
        }

        hash_algo = ima_get_hash_algo(xattr_value, xattr_len);

        rc = ima_collect_measurement(iint, file, buf, size, hash_algo, modsig);
        if (rc != 0 && rc != -EBADF && rc != -EINVAL)
                goto out_locked;

        if (!pathbuf)        /* ima_rdwr_violation possibly pre-fetched */
                pathname = ima_d_path(&file->f_path, &pathbuf, filename);

        if (action & IMA_MEASURE)
                ima_store_measurement(iint, file, pathname,
                                      xattr_value, xattr_len, modsig, pcr,
                                      template_desc);
        if (rc == 0 && (action & IMA_APPRAISE_SUBMASK)) {
                rc = ima_check_blacklist(iint, modsig, pcr);
                if (rc != -EPERM) {
                        inode_lock(inode);
                        rc = ima_appraise_measurement(func, iint, file,
                                                      pathname, xattr_value,
                                                      xattr_len, modsig);
                        inode_unlock(inode);
                }
                if (!rc)
                        rc = mmap_violation_check(func, file, &pathbuf,
                                                  &pathname, filename);
        }
        if (action & IMA_AUDIT)
                ima_audit_measurement(iint, pathname);

        if ((file->f_flags & O_DIRECT) && (iint->flags & IMA_PERMIT_DIRECTIO))
                rc = 0;

        /* Ensure the digest was generated using an allowed algorithm */
        if (rc == 0 && must_appraise && allowed_algos != 0 &&
            (allowed_algos & (1U << hash_algo)) == 0) {
                rc = -EACCES;

                integrity_audit_msg(AUDIT_INTEGRITY_DATA, file_inode(file),
                                    pathname, "collect_data",
                                    "denied-hash-algorithm", rc, 0);
        }
out_locked:
        /* Files carrying a digital signature must not be opened for write. */
        if ((mask & MAY_WRITE) && test_bit(IMA_DIGSIG, &iint->atomic_flags) &&
             !(iint->flags & IMA_NEW_FILE))
                rc = -EACCES;
        mutex_unlock(&iint->mutex);
        kfree(xattr_value);
        ima_free_modsig(modsig);
out:
        if (pathbuf)
                __putname(pathbuf);
        if (must_appraise) {
                if (rc && (ima_appraise & IMA_APPRAISE_ENFORCE))
                        return -EACCES;
                /*
                 * iint is still NULL when ima_inode_get() failed above
                 * (rc == -ENOMEM).  Outside enforcing mode that failure is
                 * not returned, so guard the dereference.
                 */
                if (iint && (file->f_mode & FMODE_WRITE))
                        set_bit(IMA_UPDATE_XATTR, &iint->atomic_flags);
        }
        return 0;
}

/**
 * ima_file_mmap - based on policy, collect/store measurement.
 * @file: pointer to the file to be measured (May be NULL)
 * @reqprot: protection requested by the application
 * @prot: protection that will be applied by the kernel
 * @flags: operational flags
 *
 * Measure files being mmapped executable based on the ima_must_measure()
 * policy decision.  The requested protection is processed first under the
 * MMAP_CHECK_REQPROT hook; the effective protection is processed under
 * MMAP_CHECK only if that first step did not fail.
 *
 * On success return 0.  On integrity appraisal error, assuming the file
 * is in policy and IMA-appraisal is in enforcing mode, return -EACCES.
 */
static int ima_file_mmap(struct file *file, unsigned long reqprot,
                         unsigned long prot, unsigned long flags)
{
        int ret = 0;
        u32 secid;

        if (!file)
                return 0;

        security_current_getsecid_subj(&secid);

        if (reqprot & PROT_EXEC)
                ret = process_measurement(file, current_cred(), secid, NULL,
                                          0, MAY_EXEC, MMAP_CHECK_REQPROT);

        if (!ret && (prot & PROT_EXEC))
                ret = process_measurement(file, current_cred(), secid, NULL,
                                          0, MAY_EXEC, MMAP_CHECK);

        return ret;
}

/**
 * ima_file_mprotect - based on policy, limit mprotect change
 * @vma: vm_area_struct protection is set to
 * @reqprot: protection requested by the application
 * @prot: protection that will be applied by the kernel
 *
 * Files can be mmap'ed read/write and later changed to execute to circumvent
 * IMA's mmap appraisal policy rules.  Due to locking issues (mmap semaphore
 * would be taken before i_mutex), files can not be measured or appraised at
 * this point.  Eliminate this integrity gap by denying the mprotect
 * PROT_EXEC change, if an mmap appraise policy rule exists.
 *
 * An audit record is emitted whenever the file is in the measure or
 * appraise policy, even when the change is allowed.
 *
 * On mprotect change success, return 0.  On failure, return -EPERM.
 */
static int ima_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
                             unsigned long prot)
{
        struct ima_template_desc *template = NULL;
        const char *pathname = NULL;
        char filename[NAME_MAX];
        char *pathbuf = NULL;
        struct inode *inode;
        struct file *file;
        int action, result;
        u32 secid;
        int pcr;

        /* Is mprotect making an mmap'ed file executable? */
        if (!(ima_policy_flag & IMA_APPRAISE))
                return 0;
        if (!vma->vm_file)
                return 0;
        if (!(prot & PROT_EXEC))
                return 0;
        if (vma->vm_flags & VM_EXEC)
                return 0;

        file = vma->vm_file;
        security_current_getsecid_subj(&secid);
        inode = file_inode(file);

        /* Combine the policy decisions of both mmap hooks. */
        action = ima_get_action(file_mnt_idmap(file), inode,
                                current_cred(), secid, MAY_EXEC, MMAP_CHECK,
                                &pcr, &template, NULL, NULL);
        action |= ima_get_action(file_mnt_idmap(file), inode,
                                 current_cred(), secid, MAY_EXEC,
                                 MMAP_CHECK_REQPROT, &pcr, &template, NULL,
                                 NULL);

        /* Is the mmap'ed file in policy? */
        if (!(action & (IMA_MEASURE | IMA_APPRAISE_SUBMASK)))
                return 0;

        result = (action & IMA_APPRAISE_SUBMASK) ? -EPERM : 0;

        pathname = ima_d_path(&file->f_path, &pathbuf, filename);
        integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, pathname,
                            "collect_data", "failed-mprotect", result, 0);
        if (pathbuf)
                __putname(pathbuf);

        return result;
}

/**
 * ima_bprm_check - based on policy, collect/store measurement.
 * @bprm: contains the linux_binprm structure
 *
 * The OS refuses to execute a file that is open for write
 * (deny_write_access()) and refuses to open for write a file that is being
 * executed (get_write_access()), so what is verified and measured here is
 * what actually gets executed.
 *
 * The file is processed twice: first under BPRM_CHECK with the current
 * task's credentials and secid, then, only if that succeeded, under
 * CREDS_CHECK with the credentials and secid taken from @bprm->cred.
 *
 * Return: 0 on success, otherwise the first non-zero value returned by
 * process_measurement() (per the existing documentation, -EACCES on an
 * appraisal failure in enforcing mode).
 */
static int ima_bprm_check(struct linux_binprm *bprm)
{
        u32 secid;
        int rc;

        security_current_getsecid_subj(&secid);
        rc = process_measurement(bprm->file, current_cred(), secid, NULL, 0,
                                 MAY_EXEC, BPRM_CHECK);
        if (!rc) {
                security_cred_getsecid(bprm->cred, &secid);
                rc = process_measurement(bprm->file, bprm->cred, secid, NULL,
                                         0, MAY_EXEC, CREDS_CHECK);
        }

        return rc;
}

/**
 * ima_file_check - based on policy, collect/store measurement.
 * @file: pointer to the file to be measured
 * @mask: contains MAY_READ, MAY_WRITE, MAY_EXEC or MAY_APPEND
 *
 * Processes @file under the FILE_CHECK hook using the current task's
 * credentials.  Any bits in @mask other than the four access bits listed
 * above are stripped before the policy is consulted.
 *
 * Return: the value returned by process_measurement() (per the existing
 * documentation, 0 on success and -EACCES on an appraisal failure in
 * enforcing mode).
 */
static int ima_file_check(struct file *file, int mask)
{
        const int access = mask & (MAY_READ | MAY_WRITE | MAY_EXEC |
                                   MAY_APPEND);
        u32 secid;

        security_current_getsecid_subj(&secid);

        return process_measurement(file, current_cred(), secid, NULL, 0,
                                   access, FILE_CHECK);
}

/*
 * __ima_inode_hash - copy the IMA hash of an inode into a caller buffer
 * @inode: inode whose cached integrity data is looked up
 * @file: optional file; when non-NULL and no collected hash is cached, the
 *        hash is calculated on the fly into a temporary on-stack entry
 * @buf: destination for the digest, may be NULL to only query the algorithm
 * @buf_size: size of @buf; at most this many digest bytes are copied
 *
 * Return: the hash algorithm identifier stored with the hash on success,
 * -EOPNOTSUPP if no hash is available or it could not be calculated.
 */
static int __ima_inode_hash(struct inode *inode, struct file *file, char *buf,
                            size_t buf_size)
{
        struct ima_iint_cache *iint = NULL, tmp_iint;
        int rc, hash_algo;

        /* The cache is only consulted while a policy is active. */
        if (ima_policy_flag) {
                iint = ima_iint_find(inode);
                if (iint)
                        mutex_lock(&iint->mutex);
        }

        /* No usable cached hash: calculate one if we were given a file. */
        if ((!iint || !(iint->flags & IMA_COLLECTED)) && file) {
                /* Drop the cached entry's lock; tmp_iint is used from here. */
                if (iint)
                        mutex_unlock(&iint->mutex);

                memset(&tmp_iint, 0, sizeof(tmp_iint));
                mutex_init(&tmp_iint.mutex);

                rc = ima_collect_measurement(&tmp_iint, file, NULL, 0,
                                             ima_hash_algo, NULL);
                if (rc < 0) {
                        /* ima_hash could be allocated in case of failure. */
                        if (rc != -ENOMEM)
                                kfree(tmp_iint.ima_hash);

                        return -EOPNOTSUPP;
                }

                iint = &tmp_iint;
                mutex_lock(&iint->mutex);
        }

        /* Neither a cached entry nor a file to calculate from. */
        if (!iint)
                return -EOPNOTSUPP;

        /*
         * ima_file_hash can be called when ima_collect_measurement has still
         * not been called, we might not always have a hash.
         *
         * NOTE(review): this path does not free tmp_iint.ima_hash; that is
         * presumably fine because a successful ima_collect_measurement()
         * sets IMA_COLLECTED - confirm.
         */
        if (!iint->ima_hash || !(iint->flags & IMA_COLLECTED)) {
                mutex_unlock(&iint->mutex);
                return -EOPNOTSUPP;
        }

        if (buf) {
                size_t copied_size;

                /* Truncate silently when the caller's buffer is too small. */
                copied_size = min_t(size_t, iint->ima_hash->length, buf_size);
                memcpy(buf, iint->ima_hash->digest, copied_size);
        }
        hash_algo = iint->ima_hash->algo;
        mutex_unlock(&iint->mutex);

        /* The temporary entry owns its hash; a cached entry keeps its own. */
        if (iint == &tmp_iint)
                kfree(iint->ima_hash);

        return hash_algo;
}

/**
 * ima_file_hash - return a measurement of the file
 * @file: pointer to the file
 * @buf: buffer in which to store the hash
 * @buf_size: length of the buffer
 *
 * Thin wrapper around __ima_inode_hash() that passes both the file's inode
 * and the file itself, so a hash can be calculated if none is cached.
 *
 * If @buf is not NULL the hash is also written to it; when the hash is
 * larger than @buf_size only @buf_size bytes are copied.  Passing a buffer
 * able to hold the largest possible hash (IMA_MAX_DIGEST_SIZE) is generally
 * the sensible choice.  Per the existing documentation, the hash covers the
 * entire file, including an appended signature.
 *
 * Return: the hash algorithm (as defined in the enum hash_algo) on success,
 * -EOPNOTSUPP if the measurement cannot be performed, -EINVAL if @file is
 * NULL.
 */
int ima_file_hash(struct file *file, char *buf, size_t buf_size)
{
        return file ? __ima_inode_hash(file_inode(file), file, buf, buf_size)
                    : -EINVAL;
}
EXPORT_SYMBOL_GPL(ima_file_hash);

/**
 * ima_inode_hash - return the stored measurement if the inode has been hashed
 * and is in the iint cache.
 * @inode: pointer to the inode
 * @buf: buffer in which to store the hash
 * @buf_size: length of the buffer
 *
 * Thin wrapper around __ima_inode_hash() that passes no file, so only an
 * already cached hash can be returned; nothing is calculated here.
 *
 * If @buf is not NULL the hash is also written to it; when the hash is
 * larger than @buf_size only @buf_size bytes are copied.  Passing a buffer
 * able to hold the largest possible hash (IMA_MAX_DIGEST_SIZE) is generally
 * the sensible choice.  Per the existing documentation, the hash covers the
 * entire contents, including an appended signature.
 *
 * Return: the hash algorithm (as defined in the enum hash_algo) on success,
 * -EOPNOTSUPP if IMA is disabled or no measurement is available, -EINVAL if
 * @inode is NULL.
 */
int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size)
{
        return inode ? __ima_inode_hash(inode, NULL, buf, buf_size)
                     : -EINVAL;
}
EXPORT_SYMBOL_GPL(ima_inode_hash);

/**
 * ima_post_create_tmpfile - flag a newly created tmpfile that is in policy
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode of the newly created tmpfile
 *
 * Newly created tmpfiles need no measuring, appraising or auditing, so
 * process_measurement() is not called.  For regular files that the policy
 * says must be appraised, the integrity cache entry is flagged with
 * IMA_UPDATE_XATTR and its file status is set to INTEGRITY_PASS.
 *
 * Nothing is done when no policy is active, the inode is not a regular
 * file, the file is not in the appraise policy, or the cache entry cannot
 * be obtained.
 */
static void ima_post_create_tmpfile(struct mnt_idmap *idmap,
                                    struct inode *inode)

{
        struct ima_iint_cache *entry;

        if (!ima_policy_flag)
                return;

        if (!S_ISREG(inode->i_mode))
                return;

        if (!ima_must_appraise(idmap, inode, MAY_ACCESS, FILE_CHECK))
                return;

        /* Without memory for the cache entry there is nothing to flag. */
        entry = ima_inode_get(inode);
        if (!entry)
                return;

        /* needed for writing the security xattrs */
        set_bit(IMA_UPDATE_XATTR, &entry->atomic_flags);
        entry->ima_file_status = INTEGRITY_PASS;
}

/**
 * ima_post_path_mknod - mark as a new inode
 * @idmap: idmap of the mount the inode was found from
 * @dentry: newly created dentry
 *
 * Files created via the mknodat syscall are flagged with IMA_NEW_FILE so
 * that the file data can be written later.  Only regular files that the
 * policy says must be appraised are flagged, and only while a policy is
 * active and the integrity cache entry can be obtained.
 */
static void ima_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry)
{
        struct inode *inode = dentry->d_inode;
        struct ima_iint_cache *entry;

        if (!ima_policy_flag)
                return;

        if (!S_ISREG(inode->i_mode))
                return;

        if (!ima_must_appraise(idmap, inode, MAY_ACCESS, FILE_CHECK))
                return;

        /* Without memory for the cache entry there is nothing to flag. */
        entry = ima_inode_get(inode);
        if (!entry)
                return;

        /* needed for re-opening empty files */
        entry->flags |= IMA_NEW_FILE;
}

/**
 * ima_read_file - pre-measure/appraise hook decision based on policy
 * @file: pointer to the file to be measured/appraised/audit
 * @read_id: caller identifier
 * @contents: whether a subsequent call will be made to ima_post_read_file()
 *
 * Permit reading a file based on policy.  The policy rules are written in
 * terms of the policy identifier, which is looked up from @read_id in
 * read_idmap[] and falls back to FILE_CHECK when the table holds 0 for it.
 * Appraising the integrity of a file requires a file descriptor.
 *
 * Return: 0 when @contents is true (the post-read hook will do the work),
 * otherwise the value returned by process_measurement() (per the existing
 * documentation, 0 for permission and -EACCES otherwise).
 */
static int ima_read_file(struct file *file, enum kernel_read_file_id read_id,
                         bool contents)
{
        enum ima_hooks hook;
        u32 secid;

        /*
         * Open question kept from the original author: do devices using
         * pre-allocated memory run the risk of the firmware being accessible
         * to the device prior to the completion of IMA's signature
         * verification any more than when using two buffers?  It may be
         * desirable to include the buffer address in this API and walk all
         * the dma_map_single() mappings to check.
         */

        /*
         * ima_post_read_file() will be called with a filled buffer, so an
         * extra early read is unnecessary.
         */
        if (contents)
                return 0;

        /* Read entire file for all partial reads. */
        hook = read_idmap[read_id];
        if (!hook)
                hook = FILE_CHECK;

        security_current_getsecid_subj(&secid);

        return process_measurement(file, current_cred(), secid, NULL,
                                   0, MAY_READ, hook);
}

/*
 * Translation table from a kernel_read_file_id (used as the index) to the
 * IMA hook to evaluate.  Identifiers without an explicit entry are
 * zero-initialized; the lookups in ima_read_file() and ima_post_read_file()
 * replace a zero entry with FILE_CHECK.
 */
const int read_idmap[READING_MAX_ID] = {
        [READING_FIRMWARE] = FIRMWARE_CHECK,
        [READING_MODULE] = MODULE_CHECK,
        [READING_KEXEC_IMAGE] = KEXEC_KERNEL_CHECK,
        [READING_KEXEC_INITRAMFS] = KEXEC_INITRAMFS_CHECK,
        [READING_POLICY] = POLICY_CHECK
};

/**
 * ima_post_read_file - in memory collect/appraise/audit measurement
 * @file: pointer to the file to be measured/appraised/audit
 * @buf: pointer to in memory file contents
 * @size: size of in memory file contents
 * @read_id: caller identifier
 *
 * Measure/appraise/audit the in-memory file based on policy.  Policy rules
 * are written in terms of a policy identifier, which is looked up from
 * @read_id in read_idmap[] and falls back to FILE_CHECK when the table
 * holds 0 for it.
 *
 * Return: 0 for an X.509 certificate read without a file; for any other
 * call lacking a file, buffer or size, -EACCES when appraisal is enforced
 * and 0 otherwise; in all remaining cases the value returned by
 * process_measurement() (per the existing documentation, -EACCES on an
 * appraisal failure in enforcing mode).
 */
static int ima_post_read_file(struct file *file, char *buf, loff_t size,
                              enum kernel_read_file_id read_id)
{
        enum ima_hooks hook;
        u32 secid;

        /* permit signed certs */
        if (!file && read_id == READING_X509_CERTIFICATE)
                return 0;

        /* should never happen */
        if (!file || !buf || size == 0)
                return (ima_appraise & IMA_APPRAISE_ENFORCE) ? -EACCES : 0;

        hook = read_idmap[read_id];
        if (!hook)
                hook = FILE_CHECK;

        security_current_getsecid_subj(&secid);

        return process_measurement(file, current_cred(), secid, buf, size,
                                   MAY_READ, hook);
}

/**
 * ima_load_data - appraise decision based on policy
 * @id: kernel load data caller identifier
 * @contents: whether the full contents will be available in a later
 *              call to ima_post_load_data().
 *
 * Callers of this LSM hook can not measure, appraise, or audit the
 * data provided by userspace.  Enforce policy rules requiring a file
 * signature (eg. kexec'ed kernel image).
 *
 * Denied loads, each logged with pr_err():
 *  - LOADING_KEXEC_IMAGE when CONFIG_KEXEC_SIG is enabled and the arch
 *    reports secure boot, or when enforcing with IMA_APPRAISE_KEXEC set;
 *  - LOADING_FIRMWARE when enforcing with IMA_APPRAISE_FIRMWARE set and
 *    @contents is false;
 *  - LOADING_MODULE when enforcing with IMA_APPRAISE_MODULES set and module
 *    signature enforcement is off.
 *
 * Return: 0 for permission, otherwise -EACCES.
 */
static int ima_load_data(enum kernel_load_data_id id, bool contents)
{
        const bool enforcing =
                (ima_appraise & IMA_APPRAISE_ENFORCE) == IMA_APPRAISE_ENFORCE;

        if (id == LOADING_KEXEC_IMAGE) {
                if (IS_ENABLED(CONFIG_KEXEC_SIG) &&
                    arch_ima_get_secureboot()) {
                        pr_err("impossible to appraise a kernel image without a file descriptor; try using kexec_file_load syscall.\n");
                        return -EACCES;
                }

                if (enforcing && (ima_appraise & IMA_APPRAISE_KEXEC)) {
                        pr_err("impossible to appraise a kernel image without a file descriptor; try using kexec_file_load syscall.\n");
                        return -EACCES;        /* INTEGRITY_UNKNOWN */
                }
        } else if (id == LOADING_FIRMWARE) {
                if (enforcing && (ima_appraise & IMA_APPRAISE_FIRMWARE) &&
                    !contents) {
                        pr_err("Prevent firmware sysfs fallback loading.\n");
                        return -EACCES;        /* INTEGRITY_UNKNOWN */
                }
        } else if (id == LOADING_MODULE) {
                const bool sig_enforce = is_module_sig_enforced();

                if (enforcing && !sig_enforce &&
                    (ima_appraise & IMA_APPRAISE_MODULES)) {
                        pr_err("impossible to appraise a module without a file descriptor. sig_enforce kernel parameter might help\n");
                        return -EACCES;        /* INTEGRITY_UNKNOWN */
                }
        }

        /* Every other identifier, and every non-denied load, is permitted. */
        return 0;
}

/**
 * ima_post_load_data - appraise decision based on policy
 * @buf: pointer to in memory file contents
 * @size: size of in memory file contents
 * @load_id: kernel load data caller identifier
 * @description: @load_id-specific description of contents
 *
 * For LOADING_FIRMWARE the load is refused when both IMA_APPRAISE_FIRMWARE
 * and IMA_APPRAISE_ENFORCE are set.  For LOADING_MODULE the buffer is handed
 * to ima_measure_critical_data() and its result is ignored.  All other
 * identifiers are accepted without further action.
 *
 * Return: 0 on success, -EACCES when a firmware load is refused.
 */
static int ima_post_load_data(char *buf, loff_t size,
                              enum kernel_load_data_id load_id,
                              char *description)
{
        switch (load_id) {
        case LOADING_FIRMWARE:
                if (!(ima_appraise & IMA_APPRAISE_FIRMWARE) ||
                    !(ima_appraise & IMA_APPRAISE_ENFORCE))
                        return 0;

                pr_err("Prevent firmware loading_store.\n");
                return -EACCES; /* INTEGRITY_UNKNOWN */
        case LOADING_MODULE:
                /*
                 * Measure the init_module syscall buffer containing the ELF
                 * image.
                 */
                ima_measure_critical_data("modules", "init_module",
                                          buf, size, true, NULL, 0);
                break;
        default:
                break;
        }

        return 0;
}

/**
 * process_buffer_measurement - Measure the buffer or the buffer data hash
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode associated with the object being measured (NULL for KEY_CHECK)
 * @buf: pointer to the buffer that needs to be added to the log.
 * @size: size of buffer(in bytes).
 * @eventname: event name to be used for the buffer entry.
 * @func: IMA hook; when 0 the policy is not consulted
 * @pcr: pcr to extend the measurement; 0 selects CONFIG_IMA_MEASURE_PCR_IDX
 * @func_data: func specific data, may be NULL
 * @buf_hash: measure buffer data hash
 * @digest: buffer digest will be written to, may be NULL
 * @digest_len: length of the @digest buffer; when @digest is non-NULL it
 *              must be at least the digest size of ima_hash_algo
 *
 * Based on policy, either the buffer data or buffer data hash is measured
 *
 * Return: 0 if the buffer has been successfully measured, 1 if the digest
 * has been written to the passed location but not added to a measurement entry,
 * a negative value otherwise.
 */
int process_buffer_measurement(struct mnt_idmap *idmap,
                               struct inode *inode, const void *buf, int size,
                               const char *eventname, enum ima_hooks func,
                               int pcr, const char *func_data,
                               bool buf_hash, u8 *digest, size_t digest_len)
{
        int ret = 0;
        const char *audit_cause = "ENOMEM";
        struct ima_template_entry *entry = NULL;
        struct ima_iint_cache iint = {};
        struct ima_event_data event_data = {.iint = &iint,
                                            .filename = eventname,
                                            .buf = buf,
                                            .buf_len = size};
        struct ima_template_desc *template;
        struct ima_max_digest_data hash;
        struct ima_digest_data *hash_hdr = container_of(&hash.hdr,
                                                struct ima_digest_data, hdr);
        char digest_hash[IMA_MAX_DIGEST_SIZE];
        int digest_hash_len = hash_digest_size[ima_hash_algo];
        int violation = 0;
        int action = 0;
        u32 secid;

        /* A caller-supplied digest buffer must fit the full digest. */
        if (digest && digest_len < digest_hash_len)
                return -EINVAL;

        /* With no policy and no digest requested there is nothing to do. */
        if (!ima_policy_flag && !digest)
                return -ENOENT;

        template = ima_template_desc_buf();
        if (!template) {
                ret = -EINVAL;
                audit_cause = "ima_template_desc_buf";
                goto out;
        }

        /*
         * Both LSM hooks and auxiliary based buffer measurements are
         * based on policy.  To avoid code duplication, differentiate
         * between the LSM hooks and auxiliary buffer measurements,
         * retrieving the policy rule information only for the LSM hook
         * buffer measurements.
         */
        if (func) {
                security_current_getsecid_subj(&secid);
                action = ima_get_action(idmap, inode, current_cred(),
                                        secid, 0, func, &pcr, &template,
                                        func_data, NULL);
                /* Not in the measure policy and no digest wanted: done. */
                if (!(action & IMA_MEASURE) && !digest)
                        return -ENOENT;
        }

        if (!pcr)
                pcr = CONFIG_IMA_MEASURE_PCR_IDX;

        /* Hash into the on-stack digest storage using ima_hash_algo. */
        iint.ima_hash = hash_hdr;
        iint.ima_hash->algo = ima_hash_algo;
        iint.ima_hash->length = hash_digest_size[ima_hash_algo];

        ret = ima_calc_buffer_hash(buf, size, iint.ima_hash);
        if (ret < 0) {
                audit_cause = "hashing_error";
                goto out;
        }

        /*
         * When the buffer's hash is to be measured instead of the buffer,
         * the first digest becomes the event data and is hashed again.
         */
        if (buf_hash) {
                memcpy(digest_hash, hash_hdr->digest, digest_hash_len);

                ret = ima_calc_buffer_hash(digest_hash, digest_hash_len,
                                           iint.ima_hash);
                if (ret < 0) {
                        audit_cause = "hashing_error";
                        goto out;
                }

                event_data.buf = digest_hash;
                event_data.buf_len = digest_hash_len;
        }

        if (digest)
                memcpy(digest, iint.ima_hash->digest, digest_hash_len);

        /* Digest delivered, but no measurement entry is to be added. */
        if (!ima_policy_flag || (func && !(action & IMA_MEASURE)))
                return 1;

        ret = ima_alloc_init_template(&event_data, &entry, template);
        if (ret < 0) {
                audit_cause = "alloc_entry";
                goto out;
        }

        ret = ima_store_template(entry, violation, NULL, event_data.buf, pcr);
        if (ret < 0) {
                audit_cause = "store_entry";
                ima_free_template_entry(entry);
        }

out:
        /* Failures that reach this label are reported through audit. */
        if (ret < 0)
                integrity_audit_message(AUDIT_INTEGRITY_PCR, NULL, eventname,
                                        func_measure_str(func),
                                        audit_cause, ret, 0, ret);

        return ret;
}

/**
 * ima_kexec_cmdline - measure kexec cmdline boot args
 * @kernel_fd: file descriptor of the kexec kernel being loaded
 * @buf: pointer to buffer
 * @size: size of buffer
 *
 * Buffers can only be measured, not appraised.
 */
void ima_kexec_cmdline(int kernel_fd, const void *buf, int size)
{
        struct fd f;

        if (!buf || !size)
                return;

        f = fdget(kernel_fd);
        if (!f.file)
                return;

        process_buffer_measurement(file_mnt_idmap(f.file), file_inode(f.file),
                                   buf, size, "kexec-cmdline", KEXEC_CMDLINE, 0,
                                   NULL, false, NULL, 0);
        fdput(f);
}

/**
 * ima_measure_critical_data - measure kernel integrity critical data
 * @event_label: unique event label for grouping and limiting critical data
 * @event_name: event name for the record in the IMA measurement list
 * @buf: pointer to buffer data
 * @buf_len: length of buffer data (in bytes)
 * @hash: measure buffer data hash
 * @digest: buffer digest will be written to
 * @digest_len: buffer length
 *
 * Measure data critical to the integrity of the kernel into the IMA log
 * and extend the pcr.  Examples of critical data could be various data
 * structures, policies, and states stored in kernel memory that can
 * impact the integrity of the system.
 *
 * The work is delegated to process_buffer_measurement() under the
 * CRITICAL_DATA hook, with @event_label passed as the hook-specific data.
 *
 * Return: -ENOPARAM if @event_name, @event_label or @buf is NULL or
 * @buf_len is 0.  Otherwise 0 if the buffer has been successfully measured,
 * 1 if the digest has been written to the passed location but not added to
 * a measurement entry, a negative value otherwise.
 */
int ima_measure_critical_data(const char *event_label,
                              const char *event_name,
                              const void *buf, size_t buf_len,
                              bool hash, u8 *digest, size_t digest_len)
{
        const bool have_all_params = event_name && event_label && buf &&
                                     buf_len;

        if (!have_all_params)
                return -ENOPARAM;

        return process_buffer_measurement(&nop_mnt_idmap, NULL, buf, buf_len,
                                          event_name, CRITICAL_DATA, 0,
                                          event_label, hash, digest,
                                          digest_len);
}
EXPORT_SYMBOL_GPL(ima_measure_critical_data);

#ifdef CONFIG_INTEGRITY_ASYMMETRIC_KEYS

/**
 * ima_kernel_module_request - Prevent crypto-pkcs1pad(rsa,*) requests
 * @kmod_name: kernel module name
 *
 * Avoid a verification loop where verifying the signature of the modprobe
 * binary requires executing modprobe itself.  The modprobe iint->mutex is
 * already held while the signature is verified, so executing modprobe
 * inside that critical region would try to take the same lock again and
 * deadlock.
 *
 * This happens when public_key_verify_signature(), for the RSA algorithm,
 * uses alg_name to store internal information in order to construct an
 * algorithm on the fly, while crypto_larval_lookup() tries to use alg_name
 * to load a kernel module of the same name.
 *
 * No real "crypto-pkcs1pad(rsa,*)" kernel modules exist, so failing such a
 * request from crypto_larval_lookup() is safe and breaks the loop.
 *
 * Return: Zero if it is safe to load the kernel module, -EINVAL if
 * @kmod_name starts with "crypto-pkcs1pad(rsa,".
 */
static int ima_kernel_module_request(char *kmod_name)
{
        static const char blocked_prefix[] = "crypto-pkcs1pad(rsa,";

        /* sizeof() counts the terminating NUL, which is not compared. */
        return strncmp(kmod_name, blocked_prefix,
                       sizeof(blocked_prefix) - 1) ? 0 : -EINVAL;
}

#endif /* CONFIG_INTEGRITY_ASYMMETRIC_KEYS */

/*
 * init_ima - late initialization of IMA
 *
 * Parses the appraise command line, sets up the template list and the
 * default hash, then calls ima_init().  If that fails while a hash other
 * than CONFIG_IMA_DEFAULT_HASH is selected, the hash setup is reset to the
 * default and ima_init() is tried once more.  On success the LSM policy
 * notifier is registered and, if that worked, the policy flags are updated.
 *
 * Return: 0 on success, otherwise the error from ima_init() or from
 * register_blocking_lsm_notifier().
 */
static int __init init_ima(void)
{
        int rc;

        ima_appraise_parse_cmdline();
        ima_init_template_list();
        hash_setup(CONFIG_IMA_DEFAULT_HASH);

        rc = ima_init();
        if (rc && strcmp(hash_algo_name[ima_hash_algo],
                         CONFIG_IMA_DEFAULT_HASH) != 0) {
                /* Retry once with the compiled-in default algorithm. */
                pr_info("Allocating %s failed, going to use default hash algorithm %s\n",
                        hash_algo_name[ima_hash_algo], CONFIG_IMA_DEFAULT_HASH);
                hash_setup_done = 0;
                hash_setup(CONFIG_IMA_DEFAULT_HASH);
                rc = ima_init();
        }
        if (rc)
                return rc;

        rc = register_blocking_lsm_notifier(&ima_lsm_policy_notifier);
        if (rc) {
                pr_warn("Couldn't register LSM notifier, error %d\n", rc);
                return rc;
        }

        ima_update_policy_flags();

        return 0;
}

/*
 * LSM hooks implemented by IMA; registered in init_ima_lsm().  The key
 * measurement and module-request hooks are only present when the
 * corresponding config options are enabled.
 */
static struct security_hook_list ima_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(bprm_check_security, ima_bprm_check),
        LSM_HOOK_INIT(file_post_open, ima_file_check),
        LSM_HOOK_INIT(inode_post_create_tmpfile, ima_post_create_tmpfile),
        LSM_HOOK_INIT(file_release, ima_file_free),
        LSM_HOOK_INIT(mmap_file, ima_file_mmap),
        LSM_HOOK_INIT(file_mprotect, ima_file_mprotect),
        LSM_HOOK_INIT(kernel_load_data, ima_load_data),
        LSM_HOOK_INIT(kernel_post_load_data, ima_post_load_data),
        LSM_HOOK_INIT(kernel_read_file, ima_read_file),
        LSM_HOOK_INIT(kernel_post_read_file, ima_post_read_file),
        LSM_HOOK_INIT(path_post_mknod, ima_post_path_mknod),
#ifdef CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS
        LSM_HOOK_INIT(key_post_create_or_update, ima_post_key_create_or_update),
#endif
#ifdef CONFIG_INTEGRITY_ASYMMETRIC_KEYS
        LSM_HOOK_INIT(kernel_module_request, ima_kernel_module_request),
#endif
        LSM_HOOK_INIT(inode_free_security, ima_inode_free),
};

/* Identity under which IMA's hooks are registered with the LSM framework. */
static const struct lsm_id ima_lsmid = {
        .name = "ima",
        .id = LSM_ID_IMA,
};

/*
 * init_ima_lsm - LSM init callback for IMA (see DEFINE_LSM(ima))
 *
 * Initializes the integrity inode cache, registers the hooks in
 * ima_hooks[] under ima_lsmid, and lets the appraise code do its own LSM
 * setup with the same id.  Always returns 0.
 */
static int __init init_ima_lsm(void)
{
        ima_iintcache_init();
        security_add_hooks(ima_hooks, ARRAY_SIZE(ima_hooks), &ima_lsmid);
        init_ima_appraise_lsm(&ima_lsmid);
        return 0;
}

/*
 * Blob space requested from the LSM framework: room for one
 * struct ima_iint_cache pointer per inode.
 */
struct lsm_blob_sizes ima_blob_sizes __ro_after_init = {
        .lbs_inode = sizeof(struct ima_iint_cache *),
};

/*
 * LSM descriptor for IMA: init_ima_lsm() is the init callback,
 * LSM_ORDER_LAST is the requested ordering and ima_blob_sizes describes
 * the blob space IMA needs.
 */
DEFINE_LSM(ima) = {
        .name = "ima",
        .init = init_ima_lsm,
        .order = LSM_ORDER_LAST,
        .blobs = &ima_blob_sizes,
};

/* init_ima() itself runs later, as a late initcall. */
late_initcall(init_ima);        /* Start IMA after the TPM is available */










    2 
















































    1 




    1 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM csd

#if !defined(_TRACE_CSD_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_CSD_H

#include <linux/tracepoint.h>

/*
 * Tracepoint csd_queue_cpu: records the target cpu, the call site, the
 * function and the call_single_data pointer it is given.
 * NOTE(review): presumably emitted when a csd is queued for @cpu - confirm
 * at the call sites, which are not visible here.
 */
TRACE_EVENT(csd_queue_cpu,

        TP_PROTO(const unsigned int cpu,
                unsigned long callsite,
                smp_call_func_t func,
                call_single_data_t *csd),

        TP_ARGS(cpu, callsite, func, csd),

        TP_STRUCT__entry(
                __field(unsigned int, cpu)
                __field(void *, callsite)
                __field(void *, func)
                __field(void *, csd)
                ),

            TP_fast_assign(
                __entry->cpu = cpu;
                /* stored as a pointer so that %pS can symbolize it */
                __entry->callsite = (void *)callsite;
                __entry->func = func;
                __entry->csd  = csd;
                ),

        TP_printk("cpu=%u callsite=%pS func=%ps csd=%p",
                __entry->cpu, __entry->callsite, __entry->func, __entry->csd)
        );

/*
 * Tracepoints for a function which is called as an effect of
 * smp_call_function.*
 *
 * Event class shared by csd_function_entry and csd_function_exit: each
 * event records the function pointer and the call_single_data pointer.
 */
DECLARE_EVENT_CLASS(csd_function,

        TP_PROTO(smp_call_func_t func, call_single_data_t *csd),

        TP_ARGS(func, csd),

        TP_STRUCT__entry(
                __field(void *,        func)
                __field(void *,        csd)
        ),

        TP_fast_assign(
                __entry->func        = func;
                __entry->csd        = csd;
        ),

        TP_printk("func=%ps, csd=%p", __entry->func, __entry->csd)
);

/*
 * csd_function_entry: instance of the csd_function class.
 * NOTE(review): presumably fired just before @func is invoked - confirm.
 */
DEFINE_EVENT(csd_function, csd_function_entry,
        TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
        TP_ARGS(func, csd)
);

/*
 * csd_function_exit: instance of the csd_function class.
 * NOTE(review): presumably fired right after @func returns - confirm.
 */
DEFINE_EVENT(csd_function, csd_function_exit,
        TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
        TP_ARGS(func, csd)
);

#endif /* _TRACE_CSD_H */

/* This part must be outside protection */
#include <trace/define_trace.h>




















































































































































    3 













    1 










































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_USER_NAMESPACE_H
#define _LINUX_USER_NAMESPACE_H

#include <linux/kref.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/rwsem.h>
#include <linux/sysctl.h>
#include <linux/err.h>

/* Number of extents stored inline in struct uid_gid_map (sizes extent[]). */
#define UID_GID_MAP_MAX_BASE_EXTENTS 5
/*
 * NOTE(review): presumably the overall limit on extents per map, with
 * larger maps using the forward/reverse pointers - users are not visible
 * in this part of the file, confirm.
 */
#define UID_GID_MAP_MAX_EXTENTS 340

/*
 * One contiguous range of an ID mapping.
 * NOTE(review): field meanings below are inferred from the names - confirm
 * against the mapping code.
 */
struct uid_gid_extent {
        u32 first;              /* presumably first ID of the range in this namespace */
        u32 lower_first;        /* presumably first ID of the range in the lower (parent) namespace */
        u32 count;              /* presumably number of consecutive IDs in the range */
};

/*
 * An ID map: nr_extents entries, held either inline in extent[] or, via
 * the anonymous struct sharing the same storage, behind the forward and
 * reverse pointers.
 * NOTE(review): which representation is active is presumably decided by
 * comparing nr_extents with UID_GID_MAP_MAX_BASE_EXTENTS - confirm.
 * NOTE(review): the "64 bytes" remark below looks stale for 64-bit builds:
 * the pointer members align the union to 8 bytes, which would put it at
 * offset 8 and make the struct 72 bytes - confirm with pahole.
 */
struct uid_gid_map { /* 64 bytes -- 1 cache line */
        u32 nr_extents;
        union {
                struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
                struct {
                        struct uid_gid_extent *forward;
                        struct uid_gid_extent *reverse;
                };
        };
};

/*
 * Flag bit for struct user_namespace::flags.
 * NOTE(review): by its name, marks that setgroups() is permitted in the
 * namespace - confirm at the users.
 */
#define USERNS_SETGROUPS_ALLOWED 1UL

/* Initial flags value: only USERNS_SETGROUPS_ALLOWED is set. */
#define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED

struct ucounts;

/*
 * Kinds of per-user counters; used as the index into
 * user_namespace::ucount_max[] and ucounts::ucount[].  The inotify and
 * fanotify entries only exist when the matching config option is enabled,
 * so the numeric values after UCOUNT_TIME_NAMESPACES depend on the config.
 */
enum ucount_type {
        UCOUNT_USER_NAMESPACES,
        UCOUNT_PID_NAMESPACES,
        UCOUNT_UTS_NAMESPACES,
        UCOUNT_IPC_NAMESPACES,
        UCOUNT_NET_NAMESPACES,
        UCOUNT_MNT_NAMESPACES,
        UCOUNT_CGROUP_NAMESPACES,
        UCOUNT_TIME_NAMESPACES,
#ifdef CONFIG_INOTIFY_USER
        UCOUNT_INOTIFY_INSTANCES,
        UCOUNT_INOTIFY_WATCHES,
#endif
#ifdef CONFIG_FANOTIFY
        UCOUNT_FANOTIFY_GROUPS,
        UCOUNT_FANOTIFY_MARKS,
#endif
        UCOUNT_COUNTS,          /* number of counter kinds; keep last */
};

/*
 * Kinds of per-user rlimit counters; used as the index into
 * user_namespace::rlimit_max[] and ucounts::rlimit[].
 */
enum rlimit_type {
        UCOUNT_RLIMIT_NPROC,
        UCOUNT_RLIMIT_MSGQUEUE,
        UCOUNT_RLIMIT_SIGPENDING,
        UCOUNT_RLIMIT_MEMLOCK,
        UCOUNT_RLIMIT_COUNTS,   /* number of rlimit kinds; keep last */
};

#if IS_ENABLED(CONFIG_BINFMT_MISC)
struct binfmt_misc;
#endif

/*
 * State of one user namespace: the uid/gid/projid maps, a link to the parent
 * namespace, the owning uid/gid, the embedded ns_common (whose count field is
 * the reference count used by get_user_ns()/put_user_ns()), flags, optional
 * keyring and sysctl state, and the per-type ucount/rlimit ceilings.
 * The layout is randomized (__randomize_layout), so field order here is not
 * the in-memory order.
 */
struct user_namespace {
        struct uid_gid_map        uid_map;
        struct uid_gid_map        gid_map;
        struct uid_gid_map        projid_map;
        struct user_namespace        *parent;
        int                        level;
        kuid_t                        owner;
        kgid_t                        group;
        struct ns_common        ns;
        unsigned long                flags;
        /* parent_could_setfcap: true if the creator of this ns had CAP_SETFCAP
         * in its effective capability set at the child ns creation time. */
        bool                        parent_could_setfcap;

#ifdef CONFIG_KEYS
        /* List of joinable keyrings in this namespace.  Modification access of
         * these pointers is controlled by keyring_sem.  Once
         * user_keyring_register is set, it won't be changed, so it can be
         * accessed directly with READ_ONCE().
         */
        struct list_head        keyring_name_list;
        struct key                *user_keyring_register;
        struct rw_semaphore        keyring_sem;
#endif

        /* Register of per-UID persistent keyrings for this namespace */
#ifdef CONFIG_PERSISTENT_KEYRINGS
        struct key                *persistent_keyring_register;
#endif
        struct work_struct        work;
#ifdef CONFIG_SYSCTL
        struct ctl_table_set        set;
        struct ctl_table_header *sysctls;
#endif
        struct ucounts                *ucounts;
        /* Per-type ceilings, indexed by enum ucount_type / enum rlimit_type. */
        long ucount_max[UCOUNT_COUNTS];
        long rlimit_max[UCOUNT_RLIMIT_COUNTS];

#if IS_ENABLED(CONFIG_BINFMT_MISC)
        struct binfmt_misc *binfmt_misc;
#endif
} __randomize_layout;

/*
 * Counter set tied to one (user namespace, uid) pair: a hash-list node, the
 * namespace and uid it belongs to, and atomic counters indexed by
 * enum ucount_type (ucount[]) and enum rlimit_type (rlimit[]).
 * NOTE(review): count is presumably the reference count managed by
 * get_ucounts()/put_ucounts() -- confirm in the implementation.
 */
struct ucounts {
        struct hlist_node node;
        struct user_namespace *ns;
        kuid_t uid;
        atomic_t count;
        atomic_long_t ucount[UCOUNT_COUNTS];
        atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS];
};

extern struct user_namespace init_user_ns;
extern struct ucounts init_ucounts;

/*
 * Out-of-line helpers for the per-namespace sysctls and for struct ucounts
 * objects.  Their bodies are not part of this header.
 * get_ucounts() is __must_check: callers have to use its return value.
 */
bool setup_userns_sysctls(struct user_namespace *ns);
void retire_userns_sysctls(struct user_namespace *ns);
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
struct ucounts * __must_check get_ucounts(struct ucounts *ucounts);
void put_ucounts(struct ucounts *ucounts);

static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type)
{
        return atomic_long_read(&ucounts->rlimit[type]);
}

/*
 * Out-of-line operations on the rlimit counters of a struct ucounts, selected
 * by enum rlimit_type.
 * NOTE(review): what the long/bool return values encode is not visible from
 * this header -- check the implementation before relying on them.
 */
long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type);
void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type);
bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max);

/* Read the @type rlimit ceiling of @ns with a single READ_ONCE() load. */
static inline long get_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type)
{
        long ceiling = READ_ONCE(ns->rlimit_max[type]);

        return ceiling;
}

/*
 * Store @max as the @type rlimit ceiling of @ns.  The slot is a signed long,
 * so values above LONG_MAX are clamped to LONG_MAX.
 */
static inline void set_userns_rlimit_max(struct user_namespace *ns,
                enum rlimit_type type, unsigned long max)
{
        long clamped = LONG_MAX;

        if (max <= LONG_MAX)
                clamped = max;

        ns->rlimit_max[type] = clamped;
}

#ifdef CONFIG_USER_NS

static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
        if (ns)
                refcount_inc(&ns->ns.count);
        return ns;
}

/*
 * Defined out of line when CONFIG_USER_NS is enabled.  __put_user_ns() is
 * what put_user_ns() calls once the last reference has been dropped.
 */
extern int create_user_ns(struct cred *new);
extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred);
extern void __put_user_ns(struct user_namespace *ns);

/*
 * Drop one reference on @ns; when it was the last one, hand the namespace
 * to __put_user_ns().  A NULL @ns is ignored.
 */
static inline void put_user_ns(struct user_namespace *ns)
{
        if (!ns)
                return;

        if (refcount_dec_and_test(&ns->ns.count))
                __put_user_ns(ns);
}

/*
 * seq_file operation tables and write handlers for the uid, gid and projid
 * maps and for the setgroups setting, plus namespace relationship queries.
 * All are defined out of line.
 * NOTE(review): presumably these back the matching /proc entries -- the
 * wiring is not visible from this header.
 */
struct seq_operations;
extern const struct seq_operations proc_uid_seq_operations;
extern const struct seq_operations proc_gid_seq_operations;
extern const struct seq_operations proc_projid_seq_operations;
extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
extern int proc_setgroups_show(struct seq_file *m, void *v);
extern bool userns_may_setgroups(const struct user_namespace *ns);
extern bool in_userns(const struct user_namespace *ancestor,
                       const struct user_namespace *child);
extern bool current_in_userns(const struct user_namespace *target_ns);
struct ns_common *ns_get_owner(struct ns_common *ns);
#else

/*
 * !CONFIG_USER_NS stub: @ns is ignored, no reference is taken, and
 * &init_user_ns is always returned.
 */
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
        return &init_user_ns;
}

/* !CONFIG_USER_NS stub: always fails with -EINVAL. */
static inline int create_user_ns(struct cred *new)
{
        return -EINVAL;
}

/*
 * !CONFIG_USER_NS stub: a request containing CLONE_NEWUSER fails with
 * -EINVAL; any other flag set succeeds with 0.  @new_cred is not touched.
 */
static inline int unshare_userns(unsigned long unshare_flags,
                                 struct cred **new_cred)
{
        return (unshare_flags & CLONE_NEWUSER) ? -EINVAL : 0;
}

/* !CONFIG_USER_NS stub: does nothing. */
static inline void put_user_ns(struct user_namespace *ns)
{
}

/* !CONFIG_USER_NS stub: always returns true. */
static inline bool userns_may_setgroups(const struct user_namespace *ns)
{
        return true;
}

/* !CONFIG_USER_NS stub: always returns true, whatever the arguments. */
static inline bool in_userns(const struct user_namespace *ancestor,
                             const struct user_namespace *child)
{
        return true;
}

/* !CONFIG_USER_NS stub: always returns true. */
static inline bool current_in_userns(const struct user_namespace *target_ns)
{
        return true;
}

/*
 * !CONFIG_USER_NS stub: always returns ERR_PTR(-EPERM), so callers must
 * test the result with IS_ERR() rather than against NULL.
 */
static inline struct ns_common *ns_get_owner(struct ns_common *ns)
{
        return ERR_PTR(-EPERM);
}
#endif

#endif /* _LINUX_USER_NAMESPACE_H */









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 
    1 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        Extension Header handling for IPv6
 *        Linux INET6 implementation
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 *        Andi Kleen                <ak@muc.de>
 *        Alexey Kuznetsov        <kuznet@ms2.inr.ac.ru>
 */

/* Changes:
 *        yoshfuji                : ensure not to overrun while parsing
 *                                  tlv options.
 *        Mitsuru KANDA @USAGI and: Remove ipv6_parse_exthdrs().
 *        YOSHIFUJI Hideaki @USAGI  Register inbound extension header
 *                                  handlers as inet6_protocol{}.
 */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/icmpv6.h>
#include <linux/slab.h>
#include <linux/export.h>

#include <net/dst.h>
#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/transp_v6.h>
#include <net/rawv6.h>
#include <net/ndisc.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/calipso.h>
#if IS_ENABLED(CONFIG_IPV6_MIP6)
#include <net/xfrm.h>
#endif
#include <linux/seg6.h>
#include <net/seg6.h>
#ifdef CONFIG_IPV6_SEG6_HMAC
#include <net/seg6_hmac.h>
#endif
#include <net/rpl.h>
#include <linux/ioam6.h>
#include <linux/ioam6_genl.h>
#include <net/ioam6.h>
#include <net/dst_metadata.h>

#include <linux/uaccess.h>

/*********************
  Generic functions
 *********************/

/*
 * Decide what to do with an option type that has no handler.
 *
 * The two high-order bits of the option type byte at @optoff (offset from
 * the network header) select the action:
 *   0 - skip the option and keep parsing
 *   1 - drop the packet
 *   2 - send an ICMPv6 parameter problem and drop the packet
 *   3 - as 2, unless the destination address is multicast (then just drop)
 *
 * When @disallow_unknowns is set the packet is always dropped without any
 * ICMP message.
 *
 * Returns true only when the option may be skipped.  On false the skb has
 * been freed here or passed to icmpv6_param_prob_reason().
 */
static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff,
                               bool disallow_unknowns)
{
        int action;

        /* Configuration forbids unknown TLVs: drop silently.  Sending no
         * ICMP parameter problem here also helps against reflection DoS.
         */
        if (disallow_unknowns)
                goto drop;

        action = (skb_network_header(skb)[optoff] & 0xC0) >> 6;

        if (action == 0)
                return true;

        if (action == 1)
                goto drop;

        /* The multicast test is redundant (the ICMP code rechecks it), but
         * it saves the call.
         */
        if (action == 3 && ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr))
                goto drop;

        /* action 2, or action 3 with a non-multicast destination */
        icmpv6_param_prob_reason(skb, ICMPV6_UNK_OPTION, optoff,
                                 SKB_DROP_REASON_UNHANDLED_PROTO);
        return false;

drop:
        kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
        return false;
}

static bool ipv6_hop_ra(struct sk_buff *skb, int optoff);
static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff);
static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff);
static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff);
#if IS_ENABLED(CONFIG_IPV6_MIP6)
static bool ipv6_dest_hao(struct sk_buff *skb, int optoff);
#endif

/*
 * Parse a TLV-encoded options header (hop-by-hop or destination).
 *
 * @hopbyhop:  true to dispatch options to the hop-by-hop handlers, false for
 *             the destination-options handlers.
 * @skb:       packet whose transport header points at the options header.
 * @max_count: maximum number of non-padding options.  A negative value
 *             means the limit is -max_count and unknown options are dropped.
 *
 * The header length is taken from its second byte ((len + 1) * 8 bytes).
 * Padding is limited to 7 consecutive bytes, and PadN payload must be zero.
 *
 * Returns true when the options consumed the header exactly.  Returns false
 * when the packet was rejected: malformed headers are freed here with
 * SKB_DROP_REASON_IP_INHDR, while a failing option handler is expected to
 * have disposed of the skb itself (the handlers visible in this file do).
 */
static bool ip6_parse_tlv(bool hopbyhop,
                          struct sk_buff *skb,
                          int max_count)
{
        int len = (skb_transport_header(skb)[1] + 1) << 3;
        const unsigned char *nh = skb_network_header(skb);
        int off = skb_network_header_len(skb);
        bool disallow_unknowns = false;
        int tlv_count = 0;
        int padlen = 0;

        if (unlikely(max_count < 0)) {
                disallow_unknowns = true;
                max_count = -max_count;
        }

        /* skip the next-header and length bytes of the options header */
        off += 2;
        len -= 2;

        while (len > 0) {
                int optlen, i;

                /* Pad1 is a single byte with no length field */
                if (nh[off] == IPV6_TLV_PAD1) {
                        padlen++;
                        if (padlen > 7)
                                goto bad;
                        off++;
                        len--;
                        continue;
                }
                if (len < 2)
                        goto bad;
                optlen = nh[off + 1] + 2;
                if (optlen > len)
                        goto bad;

                if (nh[off] == IPV6_TLV_PADN) {
                        /* RFC 2460 states that the purpose of PadN is
                         * to align the containing header to multiples
                         * of 8. 7 is therefore the highest valid value.
                         * See also RFC 4942, Section 2.1.9.5.
                         */
                        padlen += optlen;
                        if (padlen > 7)
                                goto bad;
                        /* RFC 4942 recommends receiving hosts to
                         * actively check PadN payload to contain
                         * only zeroes.
                         */
                        for (i = 2; i < optlen; i++) {
                                if (nh[off + i] != 0)
                                        goto bad;
                        }
                } else {
                        tlv_count++;
                        if (tlv_count > max_count)
                                goto bad;

                        if (hopbyhop) {
                                switch (nh[off]) {
                                case IPV6_TLV_ROUTERALERT:
                                        if (!ipv6_hop_ra(skb, off))
                                                return false;
                                        break;
                                case IPV6_TLV_IOAM:
                                        if (!ipv6_hop_ioam(skb, off))
                                                return false;

                                        nh = skb_network_header(skb);
                                        break;
                                case IPV6_TLV_JUMBO:
                                        if (!ipv6_hop_jumbo(skb, off))
                                                return false;
                                        break;
                                case IPV6_TLV_CALIPSO:
                                        if (!ipv6_hop_calipso(skb, off))
                                                return false;
                                        break;
                                default:
                                        if (!ip6_tlvopt_unknown(skb, off,
                                                                disallow_unknowns))
                                                return false;
                                        break;
                                }
                        } else {
                                switch (nh[off]) {
#if IS_ENABLED(CONFIG_IPV6_MIP6)
                                case IPV6_TLV_HAO:
                                        if (!ipv6_dest_hao(skb, off))
                                                return false;

                                        /* ipv6_dest_hao() may reallocate the
                                         * head of a cloned skb, so refresh
                                         * the cached header pointer before
                                         * the next option is read.
                                         */
                                        nh = skb_network_header(skb);
                                        break;
#endif
                                default:
                                        if (!ip6_tlvopt_unknown(skb, off,
                                                                disallow_unknowns))
                                                return false;
                                        break;
                                }
                        }
                        /* a real option ends the current run of padding */
                        padlen = 0;
                }
                off += optlen;
                len -= optlen;
        }

        if (len == 0)
                return true;
bad:
        kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
        return false;
}

/*****************************
  Destination options header.
 *****************************/

#if IS_ENABLED(CONFIG_IPV6_MIP6)
/*
 * Handle a Home Address option found in a destination options header.
 *
 * @skb:    packet being parsed.
 * @optoff: offset of the option from the network header.
 *
 * Rejects a second HAO in the same packet, an option length other than 16
 * and a non-unicast home address, then passes the addresses to
 * xfrm6_input_addr().  On success the IPv6 source address and the address
 * carried in the option are swapped in place.
 *
 * Returns true on success.  On failure the skb is freed with the recorded
 * drop reason and false is returned.
 */
static bool ipv6_dest_hao(struct sk_buff *skb, int optoff)
{
        struct ipv6_destopt_hao *hao;
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct ipv6hdr *ipv6h = ipv6_hdr(skb);
        SKB_DR(reason);
        int ret;

        if (opt->dsthao) {
                net_dbg_ratelimited("hao duplicated\n");
                goto discard;
        }
        /* Move the recorded header offset from dst1 to dsthao; dst1 is
         * cleared, so callers needing the old value must have saved it.
         */
        opt->dsthao = opt->dst1;
        opt->dst1 = 0;

        hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff);

        if (hao->length != 16) {
                net_dbg_ratelimited("hao invalid option length = %d\n",
                                    hao->length);
                SKB_DR_SET(reason, IP_INHDR);
                goto discard;
        }

        if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) {
                net_dbg_ratelimited("hao is not an unicast addr: %pI6\n",
                                    &hao->addr);
                SKB_DR_SET(reason, INVALID_PROTO);
                goto discard;
        }

        ret = xfrm6_input_addr(skb, (xfrm_address_t *)&ipv6h->daddr,
                               (xfrm_address_t *)&hao->addr, IPPROTO_DSTOPTS);
        if (unlikely(ret < 0)) {
                SKB_DR_SET(reason, XFRM_POLICY);
                goto discard;
        }

        /* The header is about to be written, so a cloned skb needs its own
         * copy of the head first.
         */
        if (skb_cloned(skb)) {
                if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
                        goto discard;

                /* the head was reallocated: re-derive every pointer used
                 * below from the skb
                 */
                hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) +
                                                  optoff);
                ipv6h = ipv6_hdr(skb);
        }

        /* NOTE(review): presumably downgraded because header bytes are
         * rewritten just below -- confirm the intent.
         */
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->ip_summed = CHECKSUM_NONE;

        swap(ipv6h->saddr, hao->addr);

        if (skb->tstamp == 0)
                __net_timestamp(skb);

        return true;

 discard:
        kfree_skb_reason(skb, reason);
        return false;
}
#endif

/*
 * Receive handler for a destination options extension header.
 *
 * Makes sure the whole header is in the linear area, enforces the
 * max_dst_opts_len sysctl, then parses the options with ip6_parse_tlv().
 *
 * Returns 1 when the header was accepted; the transport header then points
 * past it and opt->nhoff holds the network header length recorded before
 * parsing.  Returns -1 when the packet was rejected and the skb is gone.
 */
static int ipv6_destopt_rcv(struct sk_buff *skb)
{
        struct inet6_dev *idev = __in6_dev_get(skb->dev);
        struct inet6_skb_parm *opt = IP6CB(skb);
#if IS_ENABLED(CONFIG_IPV6_MIP6)
        __u16 dstbuf;
#endif
        struct dst_entry *dst = skb_dst(skb);
        struct net *net = dev_net(skb->dev);
        int extlen;

        /* Pull the fixed 8 bytes first so the length byte can be read, then
         * pull the full header that byte describes.
         */
        if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
            !pskb_may_pull(skb, (skb_transport_offset(skb) +
                                 ((skb_transport_header(skb)[1] + 1) << 3)))) {
                __IP6_INC_STATS(dev_net(dst->dev), idev,
                                IPSTATS_MIB_INHDRERRORS);
                /* Also reached by goto from the length check below; that
                 * path does not bump INHDRERRORS.
                 */
fail_and_free:
                kfree_skb(skb);
                return -1;
        }

        extlen = (skb_transport_header(skb)[1] + 1) << 3;
        if (extlen > net->ipv6.sysctl.max_dst_opts_len)
                goto fail_and_free;

        opt->lastopt = opt->dst1 = skb_network_header_len(skb);
#if IS_ENABLED(CONFIG_IPV6_MIP6)
        /* ipv6_dest_hao() clears opt->dst1, so keep a copy for nhoff */
        dstbuf = opt->dst1;
#endif

        if (ip6_parse_tlv(false, skb, net->ipv6.sysctl.max_dst_opts_cnt)) {
                skb->transport_header += extlen;
                opt = IP6CB(skb);
#if IS_ENABLED(CONFIG_IPV6_MIP6)
                opt->nhoff = dstbuf;
#else
                opt->nhoff = opt->dst1;
#endif
                return 1;
        }

        /* ip6_parse_tlv() returned false: the skb must not be touched */
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
        return -1;
}

static void seg6_update_csum(struct sk_buff *skb)
{
        struct ipv6_sr_hdr *hdr;
        struct in6_addr *addr;
        __be32 from, to;

        /* srh is at transport offset and seg_left is already decremented
         * but daddr is not yet updated with next segment
         */

        hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
        addr = hdr->segments + hdr->segments_left;

        hdr->segments_left++;
        from = *(__be32 *)hdr;

        hdr->segments_left--;
        to = *(__be32 *)hdr;

        /* update skb csum with diff resulting from seg_left decrement */

        update_csum_diff4(skb, from, to);

        /* compute csum diff between current and next segment and update */

        update_csum_diff16(skb, (__be32 *)(&ipv6_hdr(skb)->daddr),
                           (__be32 *)addr);
}

/*
 * Process a segment routing header (routing type 4).
 *
 * The packet is dropped when the receiving device has no inet6_dev, when
 * seg6 is disabled (minimum of the "all" and per-device seg6_enabled
 * settings is 0) or, with CONFIG_IPV6_SEG6_HMAC, when HMAC validation fails.
 *
 * With segments_left == 0:
 *   - an inner IPv6/IPv4 packet is decapsulated and re-injected with
 *     netif_rx();
 *   - otherwise the header is skipped and 1 is returned so the next header
 *     gets processed.
 * With segments_left > 0 the next segment becomes the destination address
 * and the packet is routed again; if the route points at a loopback device
 * the header is processed again in place after decrementing hop_limit.
 *
 * Returns 1 when the header was skipped, -1 in every case where the skb has
 * been consumed (dropped, re-injected or passed to dst_input()).
 */
static int ipv6_srh_rcv(struct sk_buff *skb)
{
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(skb->dev);
        struct ipv6_sr_hdr *hdr;
        struct inet6_dev *idev;
        struct in6_addr *addr;
        int accept_seg6;

        hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);

        /* __in6_dev_get() can return NULL (ipv6_rthdr_rcv() checks for it
         * too); without per-device IPv6 state there is no seg6_enabled
         * setting to consult, so drop instead of dereferencing NULL.
         */
        idev = __in6_dev_get(skb->dev);
        if (unlikely(!idev)) {
                kfree_skb(skb);
                return -1;
        }

        accept_seg6 = min(READ_ONCE(net->ipv6.devconf_all->seg6_enabled),
                          READ_ONCE(idev->cnf.seg6_enabled));

        if (!accept_seg6) {
                kfree_skb(skb);
                return -1;
        }

#ifdef CONFIG_IPV6_SEG6_HMAC
        if (!seg6_hmac_validate_skb(skb)) {
                kfree_skb(skb);
                return -1;
        }
#endif

looped_back:
        if (hdr->segments_left == 0) {
                if (hdr->nexthdr == NEXTHDR_IPV6 || hdr->nexthdr == NEXTHDR_IPV4) {
                        int offset = (hdr->hdrlen + 1) << 3;

                        /* strip the outer IPv6 header and the SRH, keeping
                         * the receive checksum in sync
                         */
                        skb_postpull_rcsum(skb, skb_network_header(skb),
                                           skb_network_header_len(skb));
                        skb_pull(skb, offset);
                        skb_postpull_rcsum(skb, skb_transport_header(skb),
                                           offset);

                        skb_reset_network_header(skb);
                        skb_reset_transport_header(skb);
                        skb->encapsulation = 0;
                        if (hdr->nexthdr == NEXTHDR_IPV4)
                                skb->protocol = htons(ETH_P_IP);
                        __skb_tunnel_rx(skb, skb->dev, net);

                        netif_rx(skb);
                        return -1;
                }

                opt->srcrt = skb_network_header_len(skb);
                opt->lastopt = opt->srcrt;
                skb->transport_header += (hdr->hdrlen + 1) << 3;
                opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb);

                return 1;
        }

        /* each segment is 16 bytes, i.e. two 8-byte hdrlen units */
        if (hdr->segments_left >= (hdr->hdrlen >> 1)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
                icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
                                  ((&hdr->segments_left) -
                                   skb_network_header(skb)));
                return -1;
        }

        /* the header is modified below: get a private copy of the head */
        if (skb_cloned(skb)) {
                if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
                        __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                        IPSTATS_MIB_OUTDISCARDS);
                        kfree_skb(skb);
                        return -1;
                }

                /* head was reallocated, re-derive the header pointer */
                hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
        }

        hdr->segments_left--;
        addr = hdr->segments + hdr->segments_left;

        skb_push(skb, sizeof(struct ipv6hdr));

        /* must run before daddr is rewritten, see seg6_update_csum() */
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                seg6_update_csum(skb);

        ipv6_hdr(skb)->daddr = *addr;

        ip6_route_input(skb);

        if (skb_dst(skb)->error) {
                dst_input(skb);
                return -1;
        }

        if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
                if (ipv6_hdr(skb)->hop_limit <= 1) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
                        icmpv6_send(skb, ICMPV6_TIME_EXCEED,
                                    ICMPV6_EXC_HOPLIMIT, 0);
                        kfree_skb(skb);
                        return -1;
                }
                ipv6_hdr(skb)->hop_limit--;

                /* undo the skb_push() above before re-processing */
                skb_pull(skb, sizeof(struct ipv6hdr));
                goto looped_back;
        }

        dst_input(skb);

        return -1;
}

static int ipv6_rpl_srh_rcv(struct sk_buff *skb)
{
        struct ipv6_rpl_sr_hdr *hdr, *ohdr, *chdr;
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(skb->dev);
        struct inet6_dev *idev;
        struct ipv6hdr *oldhdr;
        unsigned char *buf;
        int accept_rpl_seg;
        int i, err;
        u64 n = 0;
        u32 r;

        idev = __in6_dev_get(skb->dev);

        accept_rpl_seg = net->ipv6.devconf_all->rpl_seg_enabled;
        if (accept_rpl_seg > idev->cnf.rpl_seg_enabled)
                accept_rpl_seg = idev->cnf.rpl_seg_enabled;

        if (!accept_rpl_seg) {
                kfree_skb(skb);
                return -1;
        }

looped_back:
        hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb);

        if (hdr->segments_left == 0) {
                if (hdr->nexthdr == NEXTHDR_IPV6) {
                        int offset = (hdr->hdrlen + 1) << 3;

                        skb_postpull_rcsum(skb, skb_network_header(skb),
                                           skb_network_header_len(skb));
                        skb_pull(skb, offset);
                        skb_postpull_rcsum(skb, skb_transport_header(skb),
                                           offset);

                        skb_reset_network_header(skb);
                        skb_reset_transport_header(skb);
                        skb->encapsulation = 0;

                        __skb_tunnel_rx(skb, skb->dev, net);

                        netif_rx(skb);
                        return -1;
                }

                opt->srcrt = skb_network_header_len(skb);
                opt->lastopt = opt->srcrt;
                skb->transport_header += (hdr->hdrlen + 1) << 3;
                opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb);

                return 1;
        }

        n = (hdr->hdrlen << 3) - hdr->pad - (16 - hdr->cmpre);
        r = do_div(n, (16 - hdr->cmpri));
        /* checks if calculation was without remainder and n fits into
         * unsigned char which is segments_left field. Should not be
         * higher than that.
         */
        if (r || (n + 1) > 255) {
                kfree_skb(skb);
                return -1;
        }

        if (hdr->segments_left > n + 1) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
                icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
                                  ((&hdr->segments_left) -
                                   skb_network_header(skb)));
                return -1;
        }

        hdr->segments_left--;
        i = n - hdr->segments_left;

        buf = kcalloc(struct_size(hdr, segments.addr, n + 2), 2, GFP_ATOMIC);
        if (unlikely(!buf)) {
                kfree_skb(skb);
                return -1;
        }

        ohdr = (struct ipv6_rpl_sr_hdr *)buf;
        ipv6_rpl_srh_decompress(ohdr, hdr, &ipv6_hdr(skb)->daddr, n);
        chdr = (struct ipv6_rpl_sr_hdr *)(buf + ((ohdr->hdrlen + 1) << 3));

        if (ipv6_addr_is_multicast(&ohdr->rpl_segaddr[i])) {
                kfree_skb(skb);
                kfree(buf);
                return -1;
        }

        err = ipv6_chk_rpl_srh_loop(net, ohdr->rpl_segaddr, n + 1);
        if (err) {
                icmpv6_send(skb, ICMPV6_PARAMPROB, 0, 0);
                kfree_skb(skb);
                kfree(buf);
                return -1;
        }

        swap(ipv6_hdr(skb)->daddr, ohdr->rpl_segaddr[i]);

        ipv6_rpl_srh_compress(chdr, ohdr, &ipv6_hdr(skb)->daddr, n);

        oldhdr = ipv6_hdr(skb);

        skb_pull(skb, ((hdr->hdrlen + 1) << 3));
        skb_postpull_rcsum(skb, oldhdr,
                           sizeof(struct ipv6hdr) + ((hdr->hdrlen + 1) << 3));
        if (unlikely(!hdr->segments_left)) {
                if (pskb_expand_head(skb, sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3), 0,
                                     GFP_ATOMIC)) {
                        __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS);
                        kfree_skb(skb);
                        kfree(buf);
                        return -1;
                }

                oldhdr = ipv6_hdr(skb);
        }
        skb_push(skb, ((chdr->hdrlen + 1) << 3) + sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        skb_mac_header_rebuild(skb);
        skb_set_transport_header(skb, sizeof(struct ipv6hdr));

        memmove(ipv6_hdr(skb), oldhdr, sizeof(struct ipv6hdr));
        memcpy(skb_transport_header(skb), chdr, (chdr->hdrlen + 1) << 3);

        ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
        skb_postpush_rcsum(skb, ipv6_hdr(skb),
                           sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3));

        kfree(buf);

        ip6_route_input(skb);

        if (skb_dst(skb)->error) {
                dst_input(skb);
                return -1;
        }

        if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
                if (ipv6_hdr(skb)->hop_limit <= 1) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
                        icmpv6_send(skb, ICMPV6_TIME_EXCEED,
                                    ICMPV6_EXC_HOPLIMIT, 0);
                        kfree_skb(skb);
                        return -1;
                }
                ipv6_hdr(skb)->hop_limit--;

                skb_pull(skb, sizeof(struct ipv6hdr));
                goto looped_back;
        }

        dst_input(skb);

        return -1;
}

/********************************
  Routing header.
 ********************************/

/*
 * ipv6_rthdr_rcv - receive handler for the IPv6 Routing extension header.
 *
 * Type 4 (segment routing) and type 3 (RPL) headers are handed to their own
 * receive functions. For everything else:
 *  - segments_left == 0: the header is skipped and 1 is returned so that the
 *    caller can continue with the next header;
 *  - segments_left != 0: only a type 2 header (CONFIG_IPV6_MIP6) is processed
 *    further; any other type ends at unknown_rh with a parameter problem.
 *
 * Returns 1 when the routing header was skipped, -1 in every other case
 * (by then the skb has been freed or passed to icmpv6_param_prob() /
 * dst_input()).
 *
 * called with rcu_read_lock()
 */
static int ipv6_rthdr_rcv(struct sk_buff *skb)
{
        struct inet6_dev *idev = __in6_dev_get(skb->dev);
        struct inet6_skb_parm *opt = IP6CB(skb);
        /* Stays NULL until a type 2 header has been processed below. */
        struct in6_addr *addr = NULL;
        int n, i;
        struct ipv6_rt_hdr *hdr;
        struct rt0_hdr *rthdr;
        struct net *net = dev_net(skb->dev);
        int accept_source_route;

        /* Effective setting: the smaller of the "all" and per-device values. */
        accept_source_route = READ_ONCE(net->ipv6.devconf_all->accept_source_route);

        if (idev)
                accept_source_route = min(accept_source_route,
                                          READ_ONCE(idev->cnf.accept_source_route));

        /*
         * Pull the fixed first 8 bytes, then the whole header, whose length
         * is (hdrlen + 1) * 8 bytes with hdrlen read from byte 1.
         */
        if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
            !pskb_may_pull(skb, (skb_transport_offset(skb) +
                                 ((skb_transport_header(skb)[1] + 1) << 3)))) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
                kfree_skb(skb);
                return -1;
        }

        hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb);

        /* Only packets addressed to this host with a non-multicast daddr. */
        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) ||
            skb->pkt_type != PACKET_HOST) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
                kfree_skb(skb);
                return -1;
        }

        switch (hdr->type) {
        case IPV6_SRCRT_TYPE_4:
                /* segment routing */
                return ipv6_srh_rcv(skb);
        case IPV6_SRCRT_TYPE_3:
                /* rpl segment routing */
                return ipv6_rpl_srh_rcv(skb);
        default:
                break;
        }

        /* Re-entered from below when the new destination is a loopback route. */
looped_back:
        if (hdr->segments_left == 0) {
                switch (hdr->type) {
#if IS_ENABLED(CONFIG_IPV6_MIP6)
                case IPV6_SRCRT_TYPE_2:
                        /* Silently discard type 2 header unless it was
                         * processed by own
                         */
                        if (!addr) {
                                __IP6_INC_STATS(net, idev,
                                                IPSTATS_MIB_INADDRERRORS);
                                kfree_skb(skb);
                                return -1;
                        }
                        break;
#endif
                default:
                        break;
                }

                /* Record the header's position and step over it. */
                opt->lastopt = opt->srcrt = skb_network_header_len(skb);
                skb->transport_header += (hdr->hdrlen + 1) << 3;
                opt->dst0 = opt->dst1;
                opt->dst1 = 0;
                opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb);
                return 1;
        }

        switch (hdr->type) {
#if IS_ENABLED(CONFIG_IPV6_MIP6)
        case IPV6_SRCRT_TYPE_2:
                if (accept_source_route < 0)
                        goto unknown_rh;
                /* Silently discard invalid RTH type 2 */
                if (hdr->hdrlen != 2 || hdr->segments_left != 1) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
                        kfree_skb(skb);
                        return -1;
                }
                break;
#endif
        default:
                goto unknown_rh;
        }

        /*
         *        This is the routing header forwarding algorithm from
         *        RFC 2460, page 16.
         */

        /* hdrlen counts 8-byte units; each address occupies two of them. */
        n = hdr->hdrlen >> 1;

        if (hdr->segments_left > n) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
                icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
                                  ((&hdr->segments_left) -
                                   skb_network_header(skb)));
                return -1;
        }

        /* We are about to mangle packet header. Be careful!
           Do not damage packets queued somewhere.
         */
        if (skb_cloned(skb)) {
                /* the copy is a forwarded packet */
                if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
                        __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                        IPSTATS_MIB_OUTDISCARDS);
                        kfree_skb(skb);
                        return -1;
                }
                /* The head was reallocated, so re-derive the header pointer. */
                hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb);
        }

        /*
         * The header bytes are modified below.
         * NOTE(review): presumably a CHECKSUM_COMPLETE value would no longer
         * match after that, hence the downgrade — confirm.
         */
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->ip_summed = CHECKSUM_NONE;

        /* Consume one segment; i is the 1-based index of the next address. */
        i = n - --hdr->segments_left;

        rthdr = (struct rt0_hdr *) hdr;
        addr = rthdr->addr;
        addr += i - 1;

        switch (hdr->type) {
#if IS_ENABLED(CONFIG_IPV6_MIP6)
        case IPV6_SRCRT_TYPE_2:
                if (xfrm6_input_addr(skb, (xfrm_address_t *)addr,
                                     (xfrm_address_t *)&ipv6_hdr(skb)->saddr,
                                     IPPROTO_ROUTING) < 0) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
                        kfree_skb(skb);
                        return -1;
                }
                if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
                        kfree_skb(skb);
                        return -1;
                }
                break;
#endif
        default:
                break;
        }

        if (ipv6_addr_is_multicast(addr)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
                kfree_skb(skb);
                return -1;
        }

        /* Exchange the selected address with the packet's destination. */
        swap(*addr, ipv6_hdr(skb)->daddr);

        /* Route again for the new destination address. */
        ip6_route_input(skb);
        if (skb_dst(skb)->error) {
                /*
                 * NOTE(review): pushing by the negated network offset
                 * presumably moves skb->data back to the network header
                 * before the dst's input handler runs — confirm.
                 */
                skb_push(skb, -skb_network_offset(skb));
                dst_input(skb);
                return -1;
        }

        /* New destination is local: charge one hop and process again. */
        if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) {
                if (ipv6_hdr(skb)->hop_limit <= 1) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
                        icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
                                    0);
                        kfree_skb(skb);
                        return -1;
                }
                ipv6_hdr(skb)->hop_limit--;
                goto looped_back;
        }

        skb_push(skb, -skb_network_offset(skb));
        dst_input(skb);
        return -1;

unknown_rh:
        /* Report the offset of the type field as the offending location. */
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
        icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
                          (&hdr->type) - skb_network_header(skb));
        return -1;
}

/* Routing header handler; registered for IPPROTO_ROUTING in ipv6_exthdrs_init(). */
static const struct inet6_protocol rthdr_protocol = {
        .handler        =        ipv6_rthdr_rcv,
        .flags                =        INET6_PROTO_NOPOLICY,
};

/* Destination options handler; registered for IPPROTO_DSTOPTS in ipv6_exthdrs_init(). */
static const struct inet6_protocol destopt_protocol = {
        .handler        =        ipv6_destopt_rcv,
        .flags                =        INET6_PROTO_NOPOLICY,
};

/*
 * Handler for IPPROTO_NONE (registered in ipv6_exthdrs_init()): such packets
 * are passed to dst_discard().
 */
static const struct inet6_protocol nodata_protocol = {
        .handler        =        dst_discard,
        .flags                =        INET6_PROTO_NOPOLICY,
};

/**
 * ipv6_exthdrs_init - register the extension header protocol handlers
 *
 * Registers the routing header, destination options and "no next header"
 * handlers, in that order. If a registration fails, the handlers that were
 * already registered are removed again in reverse order.
 *
 * Return: 0 on success, otherwise the non-zero value returned by the failing
 * inet6_add_protocol() call.
 */
int __init ipv6_exthdrs_init(void)
{
        int ret;

        ret = inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING);
        if (ret)
                return ret;

        ret = inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS);
        if (ret)
                goto out_rthdr;

        ret = inet6_add_protocol(&nodata_protocol, IPPROTO_NONE);
        if (ret)
                goto out_destopt;

        return 0;

        /* Error unwinding: undo the successful registrations, newest first. */
out_destopt:
        inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS);
out_rthdr:
        inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING);
        return ret;
}

/*
 * Unregister the three handlers added by ipv6_exthdrs_init(), in the
 * reverse order of their registration.
 */
void ipv6_exthdrs_exit(void)
{
        inet6_del_protocol(&nodata_protocol, IPPROTO_NONE);
        inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS);
        inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING);
}

/**********************************
  Hop-by-hop options.
 **********************************/

/* Router Alert as of RFC 2711 */

/*
 * Parse a Router Alert option located @optoff bytes into the network header.
 *
 * The option data must be exactly 2 bytes long; it is copied into the skb
 * control block and the ROUTERALERT flag is set. Returns true on success.
 * On a wrong length the skb is freed and false is returned.
 */
static bool ipv6_hop_ra(struct sk_buff *skb, int optoff)
{
        const unsigned char *tlv = skb_network_header(skb) + optoff;

        /* tlv[0] is the option type, tlv[1] the length of the option data */
        if (tlv[1] != 2) {
                net_dbg_ratelimited("ipv6_hop_ra: wrong RA length %d\n",
                                    tlv[1]);
                kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
                return false;
        }

        IP6CB(skb)->flags |= IP6SKB_ROUTERALERT;
        memcpy(&IP6CB(skb)->ra, tlv + 2, sizeof(IP6CB(skb)->ra));
        return true;
}

/* IOAM */

/*
 * Process an IOAM hop-by-hop option located @optoff bytes into the network
 * header.
 *
 * Returns true when processing of the packet may continue (option handled or
 * ignored), false when the packet was malformed; in that case the skb has
 * been freed.
 */
static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff)
{
        struct ioam6_trace_hdr *trace;
        struct ioam6_namespace *ns;
        struct ioam6_hdr *hdr;

        /* Bad alignment (must be 4n-aligned) */
        if (optoff & 3)
                goto drop;

        /* Ignore if IOAM is not enabled on ingress */
        if (!READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_enabled))
                goto ignore;

        /* Truncated Option header */
        hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff);
        if (hdr->opt_len < 2)
                goto drop;

        switch (hdr->type) {
        case IOAM6_TYPE_PREALLOC:
                /* Truncated Pre-allocated Trace header */
                if (hdr->opt_len < 2 + sizeof(*trace))
                        goto drop;

                /* Malformed Pre-allocated Trace header */
                trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr));
                if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4)
                        goto drop;

                /* Ignore if the IOAM namespace is unknown */
                ns = ioam6_namespace(dev_net(skb->dev), trace->namespace_id);
                if (!ns)
                        goto ignore;

                if (!skb_valid_dst(skb))
                        ip6_route_input(skb);

                /* About to mangle packet header */
                if (skb_ensure_writable(skb, optoff + 2 + hdr->opt_len))
                        goto drop;

                /*
                 * The header may have been reallocated above, so both the
                 * option and the trace pointer must be derived again; the
                 * option header is still read further down.
                 */
                hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff);
                trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr));

                ioam6_fill_trace_data(skb, ns, trace, true);

                ioam6_event(IOAM6_EVENT_TRACE, dev_net(skb->dev),
                            GFP_ATOMIC, (void *)trace, hdr->opt_len - 2);
                break;
        default:
                break;
        }

ignore:
        return true;

drop:
        kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
        return false;
}

/* Jumbo payload */

/*
 * Validate a Jumbo Payload option located @optoff bytes into the network
 * header and trim the skb to the announced length.
 *
 * Returns true and sets IP6SKB_JUMBOGRAM on success. Returns false when the
 * option is invalid; the skb is then either freed here or handed to
 * icmpv6_param_prob_reason().
 */
static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
{
        const unsigned char *nh = skb_network_header(skb);
        SKB_DR(reason);
        int bad_off;
        u32 pkt_len;

        /* Option data must be 4 bytes long and start at a 4n + 2 offset. */
        if (!(nh[optoff + 1] == 4 && (optoff & 3) == 2)) {
                net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
                                    nh[optoff+1]);
                SKB_DR_SET(reason, IP_INHDR);
                goto drop;
        }

        pkt_len = ntohl(*(__be32 *)(nh + optoff + 2));

        /* A jumbo length must exceed what the base header can express ... */
        if (pkt_len <= IPV6_MAXPLEN) {
                bad_off = optoff + 2;
                goto param_prob;
        }
        /* ... and the base header's payload length must then be zero. */
        if (ipv6_hdr(skb)->payload_len) {
                bad_off = optoff;
                goto param_prob;
        }

        if (pkt_len > skb->len - sizeof(struct ipv6hdr)) {
                SKB_DR_SET(reason, PKT_TOO_SMALL);
                goto drop;
        }

        if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
                goto drop;

        IP6CB(skb)->flags |= IP6SKB_JUMBOGRAM;
        return true;

param_prob:
        icmpv6_param_prob_reason(skb, ICMPV6_HDR_FIELD, bad_off,
                                 SKB_DROP_REASON_IP_INHDR);
        return false;

drop:
        kfree_skb_reason(skb, reason);
        return false;
}

/* CALIPSO RFC 5570 */

/*
 * Check a CALIPSO option located @optoff bytes into the network header.
 *
 * The option is accepted when its data length is at least 8, the length
 * derived from byte 6 (4 * value + 8) fits into the data length, and
 * calipso_validate() approves it. Otherwise the skb is freed and false is
 * returned.
 */
static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff)
{
        const unsigned char *tlv = skb_network_header(skb) + optoff;

        /* Evaluated left to right; tlv[6] is only read once tlv[1] >= 8. */
        if (tlv[1] >= 8 &&
            tlv[6] * 4 + 8 <= tlv[1] &&
            calipso_validate(skb, tlv))
                return true;

        kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
        return false;
}

int ipv6_parse_hopopts(struct sk_buff *skb)
{
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(skb->dev);
        int extlen;

        /*
         * skb_network_header(skb) is equal to skb->data, and
         * skb_network_header_len(skb) is always equal to
         * sizeof(struct ipv6hdr) by definition of
         * hop-by-hop options.
         */
        if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) ||
            !pskb_may_pull(skb, (sizeof(struct ipv6hdr) +
                                 ((skb_transport_header(skb)[1] + 1) << 3)))) {
fail_and_free:
                kfree_skb(skb);
                return -1;
        }

        extlen = (skb_transport_header(skb)[1] + 1) << 3;
        if (extlen > net->ipv6.sysctl.max_hbh_opts_len)
                goto fail_and_free;

        opt->flags |= IP6SKB_HOPBYHOP;
        if (ip6_parse_tlv(true, skb, net->ipv6.sysctl.max_hbh_opts_cnt)) {
                skb->transport_header += extlen;
                opt = IP6CB(skb);
                opt->nhoff = sizeof(struct ipv6hdr);
                return 1;
        }
        return -1;
}

/*
 *        Creating outbound headers.
 *
 *        "build" functions work when skb is filled from head to tail (datagram)
 *        "push"        functions work when headers are added from tail to head (tcp)
 *
 *        In both cases we assume, that caller reserved enough room
 *        for headers.
 */

/*
 * Push a routing header laid out like struct rt0_hdr in front of the
 * current skb data.
 *
 * The address list is shifted by one: addresses 1..n-1 of @opt become
 * addresses 0..n-2 of the pushed header, and the current destination
 * (**@addr_p) becomes the last entry. *@addr_p is redirected to the first
 * address of @opt, and *@proto is chained through the new header.
 * @saddr is not used.
 */
static void ipv6_push_rthdr0(struct sk_buff *skb, u8 *proto,
                             struct ipv6_rt_hdr *opt,
                             struct in6_addr **addr_p, struct in6_addr *saddr)
{
        struct rt0_hdr *tmpl = (struct rt0_hdr *) opt;
        int nr_addrs = tmpl->rt_hdr.hdrlen >> 1;
        struct rt0_hdr *pushed;

        pushed = skb_push(skb, (tmpl->rt_hdr.hdrlen + 1) << 3);
        memcpy(pushed, tmpl, sizeof(struct rt0_hdr));

        if (nr_addrs > 1)
                memcpy(pushed->addr, tmpl->addr + 1,
                       (nr_addrs - 1) * sizeof(struct in6_addr));

        /* The final destination goes last; the first hop becomes daddr. */
        pushed->addr[nr_addrs - 1] = **addr_p;
        *addr_p = tmpl->addr;

        pushed->rt_hdr.nexthdr = *proto;
        *proto = NEXTHDR_ROUTING;
}

/*
 * Push a segment routing header (struct ipv6_sr_hdr) in front of the
 * current skb data.
 *
 * Segments 1..first_segment are copied from @opt, segment 0 is set to the
 * current destination (**@addr_p), and *@addr_p is redirected to the segment
 * selected by segments_left. Bytes following the segment list (TLVs) are
 * copied as well. With CONFIG_IPV6_SEG6_HMAC, an HMAC is pushed when the
 * header carries one. *@proto is chained through the new header.
 */
static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto,
                             struct ipv6_rt_hdr *opt,
                             struct in6_addr **addr_p, struct in6_addr *saddr)
{
        struct ipv6_sr_hdr *tmpl = (struct ipv6_sr_hdr *)opt;
        int nr_segs = tmpl->first_segment + 1;
        int hdr_bytes = (tmpl->hdrlen + 1) << 3;
        struct ipv6_sr_hdr *pushed;

        pushed = skb_push(skb, hdr_bytes);
        memcpy(pushed, tmpl, sizeof(struct ipv6_sr_hdr));

        memcpy(pushed->segments + 1, tmpl->segments + 1,
               (nr_segs - 1) * sizeof(struct in6_addr));

        pushed->segments[0] = **addr_p;
        *addr_p = &tmpl->segments[tmpl->segments_left];

        /* hdrlen is in 8-byte units; anything beyond the segments is TLVs. */
        if (tmpl->hdrlen > nr_segs * 2) {
                int tlv_off = (1 + nr_segs * 2) << 3;
                int tlv_len = (tmpl->hdrlen - nr_segs * 2) << 3;

                memcpy((char *)pushed + tlv_off,
                       (char *)tmpl + tlv_off, tlv_len);
        }

#ifdef CONFIG_IPV6_SEG6_HMAC
        if (sr_has_hmac(pushed)) {
                struct net *net;

                /* Take the namespace from the device, else from the socket. */
                net = skb->dev ? dev_net(skb->dev) :
                      (skb->sk ? sock_net(skb->sk) : NULL);

                WARN_ON(!net);

                if (net)
                        seg6_push_hmac(net, saddr, pushed);
        }
#endif

        pushed->nexthdr = *proto;
        *proto = NEXTHDR_ROUTING;
}

/*
 * Push routing header @opt using the helper that matches its type:
 * ipv6_push_rthdr4() for type 4, ipv6_push_rthdr0() for type 0, the
 * "strict" type and type 2. Other types are left alone.
 */
static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
                            struct ipv6_rt_hdr *opt,
                            struct in6_addr **addr_p, struct in6_addr *saddr)
{
        if (opt->type == IPV6_SRCRT_TYPE_4) {
                ipv6_push_rthdr4(skb, proto, opt, addr_p, saddr);
                return;
        }

        if (opt->type == IPV6_SRCRT_TYPE_0 ||
            opt->type == IPV6_SRCRT_STRICT ||
            opt->type == IPV6_SRCRT_TYPE_2)
                ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr);
}

/*
 * Push a copy of extension header @opt in front of the current skb data.
 * The copy's next-header field takes the old *@proto, and *@proto becomes
 * @type so that the preceding header points at the new one.
 */
static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt)
{
        const int len = ipv6_optlen(opt);
        struct ipv6_opt_hdr *pushed;

        pushed = skb_push(skb, len);
        memcpy(pushed, opt, len);

        pushed->nexthdr = *proto;
        *proto = type;
}

/*
 * Push the routing header, the destination options that precede it
 * (dst0opt) and the hop-by-hop options of @opt, in that order, so that
 * the hop-by-hop header ends up first in the packet.
 */
void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
                          u8 *proto,
                          struct in6_addr **daddr, struct in6_addr *saddr)
{
        if (opt->srcrt)
                ipv6_push_rthdr(skb, proto, opt->srcrt, daddr, saddr);

        /*
         * IPV6_RTHDRDSTOPTS is ignored
         * unless IPV6_RTHDR is set (RFC3542).
         */
        if (opt->srcrt && opt->dst0opt)
                ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt);

        if (opt->hopopt)
                ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt);
}

/* Push the dst1opt destination options header of @opt, if there is one. */
void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto)
{
        if (!opt->dst1opt)
                return;

        ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt);
}
EXPORT_SYMBOL(ipv6_push_frag_opts);

struct ipv6_txoptions *
ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
{
        struct ipv6_txoptions *opt2;

        opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC);
        if (opt2) {
                long dif = (char *)opt2 - (char *)opt;
                memcpy(opt2, opt, opt->tot_len);
                if (opt2->hopopt)
                        *((char **)&opt2->hopopt) += dif;
                if (opt2->dst0opt)
                        *((char **)&opt2->dst0opt) += dif;
                if (opt2->dst1opt)
                        *((char **)&opt2->dst1opt) += dif;
                if (opt2->srcrt)
                        *((char **)&opt2->srcrt) += dif;
                refcount_set(&opt2->refcnt, 1);
        }
        return opt2;
}
EXPORT_SYMBOL_GPL(ipv6_dup_options);

/*
 * Copy one option slot into the buffer at *@p.
 *
 * The source is @new when this slot (@renewtype) is the one being replaced
 * (@newtype), otherwise @old. If the source is NULL nothing happens.
 * Otherwise *@dest is pointed at the copy and *@p is advanced by the
 * CMSG_ALIGN()ed option length.
 */
static void ipv6_renew_option(int renewtype,
                              struct ipv6_opt_hdr **dest,
                              struct ipv6_opt_hdr *old,
                              struct ipv6_opt_hdr *new,
                              int newtype, char **p)
{
        struct ipv6_opt_hdr *from = old;

        if (renewtype == newtype)
                from = new;

        if (from) {
                memcpy(*p, from, ipv6_optlen(from));
                *dest = (struct ipv6_opt_hdr *)*p;
                *p += CMSG_ALIGN(ipv6_optlen(*dest));
        }
}

/**
 * ipv6_renew_options - build a copy of @opt with one ext hdr replaced.
 *
 * @sk: sock whose option memory the result is allocated from
 * @opt: original options, may be NULL
 * @newtype: option type that is replaced
 * @newopt: replacement for the option of type @newtype, may be NULL
 *
 * The result holds every option of @opt except the one of type @newtype,
 * plus @newopt in that slot. With @opt == NULL the result contains only
 * @newopt; with @newopt == NULL the slot @newtype stays empty.
 *
 * Return: the new option set, NULL when it would contain no option at all,
 * or ERR_PTR(-ENOBUFS) when the allocation fails.
 */
struct ipv6_txoptions *
ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
                   int newtype, struct ipv6_opt_hdr *newopt)
{
        struct ipv6_opt_hdr *old_hop = NULL, *old_dst0 = NULL;
        struct ipv6_opt_hdr *old_rt = NULL, *old_dst1 = NULL;
        struct ipv6_txoptions *fresh;
        int total = 0;
        int nflen = 0;
        char *cursor;

        if (opt) {
                old_hop = opt->hopopt;
                old_dst0 = opt->dst0opt;
                old_rt = (struct ipv6_opt_hdr *)opt->srcrt;
                old_dst1 = opt->dst1opt;
        }

        /* Space for the options that are carried over ... */
        if (old_hop && newtype != IPV6_HOPOPTS)
                total += CMSG_ALIGN(ipv6_optlen(old_hop));
        if (old_dst0 && newtype != IPV6_RTHDRDSTOPTS)
                total += CMSG_ALIGN(ipv6_optlen(old_dst0));
        if (old_rt && newtype != IPV6_RTHDR)
                total += CMSG_ALIGN(ipv6_optlen(old_rt));
        if (old_dst1 && newtype != IPV6_DSTOPTS)
                total += CMSG_ALIGN(ipv6_optlen(old_dst1));

        /* ... and for the replacement. */
        if (newopt)
                total += CMSG_ALIGN(ipv6_optlen(newopt));

        if (!total)
                return NULL;

        total += sizeof(*fresh);
        fresh = sock_kmalloc(sk, total, GFP_ATOMIC);
        if (!fresh)
                return ERR_PTR(-ENOBUFS);

        memset(fresh, 0, total);
        refcount_set(&fresh->refcnt, 1);
        fresh->tot_len = total;

        /* Option data is packed right behind the structure itself. */
        cursor = (char *)(fresh + 1);

        ipv6_renew_option(IPV6_HOPOPTS, &fresh->hopopt, old_hop,
                          newopt, newtype, &cursor);
        ipv6_renew_option(IPV6_RTHDRDSTOPTS, &fresh->dst0opt, old_dst0,
                          newopt, newtype, &cursor);
        ipv6_renew_option(IPV6_RTHDR,
                          (struct ipv6_opt_hdr **)&fresh->srcrt, old_rt,
                          newopt, newtype, &cursor);
        ipv6_renew_option(IPV6_DSTOPTS, &fresh->dst1opt, old_dst1,
                          newopt, newtype, &cursor);

        if (fresh->hopopt)
                nflen += ipv6_optlen(fresh->hopopt);
        if (fresh->dst0opt)
                nflen += ipv6_optlen(fresh->dst0opt);
        if (fresh->srcrt)
                nflen += ipv6_optlen(fresh->srcrt);
        fresh->opt_nflen = nflen;

        fresh->opt_flen = fresh->dst1opt ? ipv6_optlen(fresh->dst1opt) : 0;

        return fresh;
}

/*
 * Drop dst0opt from @opt when no routing header accompanies it.
 *
 * If a change is needed and @opt_space is a different object, the structure
 * is first copied into @opt_space and the copy is modified instead.
 * Returns the option set to use: @opt unchanged, or the adjusted one.
 */
struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space,
                                            struct ipv6_txoptions *opt)
{
        /*
         * ignore the dest before srcrt unless srcrt is being included.
         * --yoshfuji
         */
        if (!opt->dst0opt || opt->srcrt)
                return opt;

        if (opt_space != opt) {
                memcpy(opt_space, opt, sizeof(*opt_space));
                opt = opt_space;
        }

        opt->opt_nflen -= ipv6_optlen(opt->dst0opt);
        opt->dst0opt = NULL;

        return opt;
}
EXPORT_SYMBOL_GPL(__ipv6_fixup_options);

/**
 * fl6_update_dst - replace the flow's destination with the first hop of
 *                  the srcrt option, if there is one.
 *
 * @fl6: flowi6 whose daddr is updated
 * @opt: tx options that may carry a routing header, may be NULL
 * @orig: receives the daddr that @fl6 had on entry
 *
 * Return: NULL when there are no options or no routing header, and also
 * when the routing header type is not handled (in that case @orig has
 * already been written but @fl6 is unchanged); otherwise @orig.
 */
struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
                                const struct ipv6_txoptions *opt,
                                struct in6_addr *orig)
{
        if (!opt)
                return NULL;
        if (!opt->srcrt)
                return NULL;

        *orig = fl6->daddr;

        switch (opt->srcrt->type) {
        case IPV6_SRCRT_TYPE_4:
        {
                struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt;

                /* Segment routing: the active segment is the next hop. */
                fl6->daddr = srh->segments[srh->segments_left];
                return orig;
        }
        case IPV6_SRCRT_TYPE_0:
        case IPV6_SRCRT_STRICT:
        case IPV6_SRCRT_TYPE_2:
                /* First address of the list. */
                fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr;
                return orig;
        default:
                return NULL;
        }
}
EXPORT_SYMBOL_GPL(fl6_update_dst);









































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2020 Christoph Hellwig.
 *
 * Support for "universal" pointers that can point to either kernel or userspace
 * memory.
 */
#ifndef _LINUX_SOCKPTR_H
#define _LINUX_SOCKPTR_H

#include <linux/slab.h>
#include <linux/uaccess.h>

/*
 * A pointer to either kernel or userspace memory. Both pointer members share
 * storage; @is_kernel tells which one is valid.
 */
typedef struct {
        union {
                void                *kernel;
                void __user        *user;
        };
        bool                is_kernel : 1;
} sockptr_t;

/* Returns true if @sockptr holds a kernel pointer, false for a user pointer. */
static inline bool sockptr_is_kernel(sockptr_t sockptr)
{
        return sockptr.is_kernel;
}

static inline sockptr_t KERNEL_SOCKPTR(void *p)
{
        return (sockptr_t) { .kernel = p, .is_kernel = true };
}

/* Wrap the userspace pointer @p in a sockptr_t (is_kernel stays false). */
static inline sockptr_t USER_SOCKPTR(void __user *p)
{
        sockptr_t sp = { .user = p };

        return sp;
}

/* Returns true if the pointer held by @sockptr (kernel or user) is NULL. */
static inline bool sockptr_is_null(sockptr_t sockptr)
{
        return sockptr_is_kernel(sockptr) ? !sockptr.kernel : !sockptr.user;
}

/*
 * Copy @size bytes, starting @offset bytes into @src, to the kernel buffer
 * @dst. A kernel source is copied with memcpy() and 0 is returned; for a
 * user source the result of copy_from_user() is returned.
 */
static inline int copy_from_sockptr_offset(void *dst, sockptr_t src,
                size_t offset, size_t size)
{
        if (sockptr_is_kernel(src)) {
                memcpy(dst, src.kernel + offset, size);
                return 0;
        }

        return copy_from_user(dst, src.user + offset, size);
}

/* Deprecated.
 * This is unsafe, unless caller checked user provided optlen.
 * Prefer copy_safe_from_sockptr() instead.
 *
 * Copies @size bytes from the start of @src to @dst by calling
 * copy_from_sockptr_offset() with an offset of 0 and returns its result.
 */
static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size)
{
        return copy_from_sockptr_offset(dst, src, 0, size);
}

/**
 * copy_safe_from_sockptr: copy a fixed-size struct from a sockptr
 * @dst:   Kernel destination buffer, which must be @ksize bytes long.
 * @ksize: Number of bytes to copy, i.e. the size of the @dst struct.
 * @optval: Source pointer (user or kernel space).
 * @optlen: Number of bytes available at @optval.
 *
 * Returns:
 *  * -EINVAL: @optlen is smaller than @ksize
 *  * otherwise the result of copy_from_sockptr() for @ksize bytes
 *    (0 when the bytes were copied)
 */
static inline int copy_safe_from_sockptr(void *dst, size_t ksize,
                                         sockptr_t optval, unsigned int optlen)
{
        /* Refuse sources that are too short to fill @dst. */
        return optlen < ksize ? -EINVAL
                              : copy_from_sockptr(dst, optval, ksize);
}

/*
 * Copy a struct of caller-supplied size @usize from @src into the kernel
 * struct @dst of size @ksize.
 *
 *  - @usize < @ksize: the bytes of @dst beyond @usize are zero-filled.
 *  - @usize > @ksize: the source bytes beyond @ksize must all be zero,
 *    otherwise -E2BIG is returned.
 *
 * A user source is delegated to copy_struct_from_user() and its result is
 * returned. A kernel source is handled here and yields 0 on success.
 */
static inline int copy_struct_from_sockptr(void *dst, size_t ksize,
                sockptr_t src, size_t usize)
{
        size_t size = min(ksize, usize);
        size_t rest = max(ksize, usize) - size;

        /*
         * Pass the real @usize so that copy_struct_from_user() sees the
         * trailing bytes and handles them the same way as the kernel path.
         */
        if (!sockptr_is_kernel(src))
                return copy_struct_from_user(dst, ksize, src.user, usize);

        if (usize < ksize) {
                memset(dst + size, 0, rest);
        } else if (usize > ksize) {
                /* Check the bytes beyond @ksize, not the start of the buffer. */
                const char *p = (const char *)src.kernel + size;

                while (rest--) {
                        if (*p++)
                                return -E2BIG;
                }
        }
        memcpy(dst, src.kernel, size);
        return 0;
}

/*
 * Copy @size bytes from the kernel buffer @src to @dst, starting @offset
 * bytes into @dst. A kernel destination is written with memcpy() and 0 is
 * returned; for a user destination the result of copy_to_user() is returned.
 */
static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset,
                const void *src, size_t size)
{
        if (sockptr_is_kernel(dst)) {
                memcpy(dst.kernel + offset, src, size);
                return 0;
        }

        return copy_to_user(dst.user + offset, src, size);
}

/*
 * Copy @size bytes from @src to the start of @dst by calling
 * copy_to_sockptr_offset() with an offset of 0; returns its result.
 */
static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size)
{
        return copy_to_sockptr_offset(dst, 0, src, size);
}

/*
 * Allocate @len bytes and fill them from @src.
 *
 * Returns the new buffer, ERR_PTR(-ENOMEM) if the allocation fails, or
 * ERR_PTR(-EFAULT) if the copy fails (the buffer is freed in that case).
 */
static inline void *memdup_sockptr_noprof(sockptr_t src, size_t len)
{
        void *buf;

        buf = kmalloc_track_caller_noprof(len, GFP_USER | __GFP_NOWARN);
        if (!buf)
                return ERR_PTR(-ENOMEM);

        if (!copy_from_sockptr(buf, src, len))
                return buf;

        kfree(buf);
        return ERR_PTR(-EFAULT);
}
#define memdup_sockptr(...)        alloc_hooks(memdup_sockptr_noprof(__VA_ARGS__))

/*
 * Allocate @len + 1 bytes, fill the first @len from @src and terminate the
 * result with a NUL byte.
 *
 * Returns the new buffer, ERR_PTR(-ENOMEM) if the allocation fails, or
 * ERR_PTR(-EFAULT) if the copy fails (the buffer is freed in that case).
 */
static inline void *memdup_sockptr_nul_noprof(sockptr_t src, size_t len)
{
        char *buf;

        buf = kmalloc_track_caller_noprof(len + 1, GFP_KERNEL);
        if (!buf)
                return ERR_PTR(-ENOMEM);

        if (!copy_from_sockptr(buf, src, len)) {
                buf[len] = '\0';
                return buf;
        }

        kfree(buf);
        return ERR_PTR(-EFAULT);
}
#define memdup_sockptr_nul(...)        alloc_hooks(memdup_sockptr_nul_noprof(__VA_ARGS__))

/*
 * Copy a string of at most @count bytes from @src to @dst.
 *
 * A user source is passed to strncpy_from_user() and its result is returned.
 * For a kernel source, min(strnlen(src, count - 1) + 1, count) bytes are
 * copied and that number is returned.
 */
static inline long strncpy_from_sockptr(char *dst, sockptr_t src, size_t count)
{
        size_t nbytes;

        if (!sockptr_is_kernel(src))
                return strncpy_from_user(dst, src.user, count);

        nbytes = min(strnlen(src.kernel, count - 1) + 1, count);
        memcpy(dst, src.kernel, nbytes);
        return nbytes;
}

/*
 * Check whether the @size bytes at @offset into @src are all zero.
 *
 * For a kernel source, returns 1 if every byte is zero and 0 otherwise.
 * For a user source, returns the result of check_zeroed_user().
 */
static inline int check_zeroed_sockptr(sockptr_t src, size_t offset,
                                       size_t size)
{
        if (sockptr_is_kernel(src))
                return memchr_inv(src.kernel + offset, 0, size) == NULL;

        return check_zeroed_user(src.user + offset, size);
}

#endif /* _LINUX_SOCKPTR_H */





















































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MSR_H
#define _ASM_X86_MSR_H

#include "msr-index.h"

#ifndef __ASSEMBLY__

#include <asm/asm.h>
#include <asm/errno.h>
#include <asm/cpumask.h>
#include <uapi/asm/msr.h>
#include <asm/shared/msr.h>

#include <linux/percpu.h>

/*
 * Bundles an MSR number with a value slot, a per-CPU value pointer and an
 * error code.
 * NOTE(review): presumably the argument block for helpers that access an MSR
 * on other CPUs — confirm against the users of this struct.
 */
struct msr_info {
        u32                        msr_no;
        struct msr                reg;
        struct msr __percpu        *msrs;
        int                        err;
};

/*
 * Pairs a pointer to an array of 32-bit register values with an error code.
 * NOTE(review): the expected array length is not visible here — confirm
 * against the users of this struct.
 */
struct msr_regs_info {
        u32 *regs;
        int err;
};

/* One saved MSR entry: a struct msr_info plus a flag telling whether it is valid. */
struct saved_msr {
        bool valid;
        struct msr_info info;
};

/*
 * A counted collection of struct saved_msr entries.
 * NOTE(review): @num is presumably the number of elements in @array - confirm.
 */
struct saved_msrs {
        unsigned int num;
        struct saved_msr *array;
};

/*
 * Both i386 and x86_64 return a 64-bit value in edx:eax, but gcc's "A"
 * constraint has different meanings on the two. For i386, "A" means exactly
 * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead,
 * it means rax *or* rdx.
 *
 * The three helpers below hide that difference:
 *   DECLARE_ARGS() declares the local(s) that receive the result,
 *   EAX_EDX_RET()  expands to the matching asm output operand(s),
 *   EAX_EDX_VAL()  yields the combined 64-bit value.
 * On x86_64 two separate 'low'/'high' variables are used and merged with a
 * shift; on i386 a single 64-bit 'val' is bound via "=A".
 */
#ifdef CONFIG_X86_64
/* Using 64-bit values saves one instruction clearing the high half of low */
#define DECLARE_ARGS(val, low, high)        unsigned long low, high
#define EAX_EDX_VAL(val, low, high)        ((low) | (high) << 32)
#define EAX_EDX_RET(val, low, high)        "=a" (low), "=d" (high)
#else
#define DECLARE_ARGS(val, low, high)        unsigned long long val
#define EAX_EDX_VAL(val, low, high)        (val)
#define EAX_EDX_RET(val, low, high)        "=A" (val)
#endif

/*
 * Be very careful with includes. This header is prone to include loops.
 */
#include <asm/atomic.h>
#include <linux/tracepoint-defs.h>

/*
 * Tracing hooks for MSR reads, MSR writes and RDPMC.
 *
 * With CONFIG_TRACEPOINTS the tracepoints are declared here and the
 * do_trace_*() helpers are defined out of line; without it they collapse to
 * empty inline stubs so that callers need no #ifdefs of their own.
 * Each helper takes the register number, the 64-bit value and a 'failed'
 * indicator.
 */
#ifdef CONFIG_TRACEPOINTS
DECLARE_TRACEPOINT(read_msr);
DECLARE_TRACEPOINT(write_msr);
DECLARE_TRACEPOINT(rdpmc);
extern void do_trace_write_msr(unsigned int msr, u64 val, int failed);
extern void do_trace_read_msr(unsigned int msr, u64 val, int failed);
extern void do_trace_rdpmc(unsigned int msr, u64 val, int failed);
#else
static inline void do_trace_write_msr(unsigned int msr, u64 val, int failed) {}
static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {}
static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {}
#endif

/*
 * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR
 * accessors and should not have any tracing or other functionality piggybacking
 * on them - those are *purely* for accessing MSRs and nothing more. So don't even
 * think of extending them - you will be slapped with a stinking trout or a frozen
 * shark will reach you, wherever you are! You've been warned.
 */
static __always_inline unsigned long long __rdmsr(unsigned int msr)
{
        DECLARE_ARGS(val, low, high);

        /*
         * RDMSR takes the MSR index in ECX and returns the value in EDX:EAX.
         * The instruction at label 1 has an exception table entry of type
         * EX_TYPE_RDMSR that resumes at label 2; what the fixup does with the
         * result is decided by that handler, which is not part of this file.
         */
        asm volatile("1: rdmsr\n"
                     "2:\n"
                     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR)
                     : EAX_EDX_RET(val, low, high) : "c" (msr));

        /* Combine the two halves into one 64-bit return value. */
        return EAX_EDX_VAL(val, low, high);
}

/*
 * Bare WRMSR: write @high:@low to the MSR selected by @msr, with no tracing.
 */
static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high)
{
        /*
         * ECX = MSR index, EAX = low half, EDX = high half. The instruction
         * is covered by an EX_TYPE_WRMSR exception table entry resuming at
         * label 2. The "memory" clobber stops the compiler from caching
         * memory values across, or reordering memory accesses around, the
         * write.
         */
        asm volatile("1: wrmsr\n"
                     "2:\n"
                     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
                     : : "c" (msr), "a"(low), "d" (high) : "memory");
}

/*
 * WRMSRNS behaves exactly like WRMSR with the only difference being
 * that it is not a serializing instruction by default.
 *
 * Operands are passed the same way as for __wrmsr() (ECX = index,
 * EAX = low half, EDX = high half) and the same EX_TYPE_WRMSR exception
 * table type is used. Note that, unlike __wrmsr(), this asm statement
 * carries no "memory" clobber.
 */
static __always_inline void __wrmsrns(u32 msr, u32 low, u32 high)
{
        /* Instruction opcode for WRMSRNS; supported in binutils >= 2.40. */
        asm volatile("1: .byte 0x0f,0x01,0xc6\n"
                     "2:\n"
                     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
                     : : "c" (msr), "a"(low), "d" (high));
}

/*
 * Read @msr through the untraced __rdmsr() primitive and split the result:
 * @val1 receives the low 32 bits, @val2 the high 32 bits. Both are assigned
 * directly, so they must be lvalues.
 */
#define native_rdmsr(msr, val1, val2)                        \
do {                                                        \
        u64 __val = __rdmsr((msr));                        \
        (void)((val1) = (u32)__val);                        \
        (void)((val2) = (u32)(__val >> 32));                \
} while (0)

/* Untraced write of @high:@low to @msr; a plain alias for __wrmsr(). */
#define native_wrmsr(msr, low, high)                        \
        __wrmsr(msr, low, high)

/*
 * Untraced 64-bit write: @val is converted to u64 and handed to __wrmsr()
 * as its low and high 32-bit halves. Note that @val is expanded twice.
 */
#define native_wrmsrl(msr, val)                                \
        __wrmsr((msr), (u32)((u64)(val)),                \
                       (u32)((u64)(val) >> 32))

/*
 * Traced MSR read: fetch @msr via __rdmsr() and, if the read_msr tracepoint
 * is enabled, report the value (with 'failed' == 0) before returning it.
 */
static inline unsigned long long native_read_msr(unsigned int msr)
{
        unsigned long long msr_value = __rdmsr(msr);

        if (tracepoint_enabled(read_msr)) {
                do_trace_read_msr(msr, msr_value, 0);
        }

        return msr_value;
}

/*
 * Read @msr and report success or failure through @err instead of relying
 * on the plain EX_TYPE_RDMSR handling used by __rdmsr().
 *
 * Returns the 64-bit value assembled from EDX:EAX.
 */
static inline unsigned long long native_read_msr_safe(unsigned int msr,
                                                      int *err)
{
        DECLARE_ARGS(val, low, high);

        /*
         * If RDMSR completes, the following XOR zeroes *err. If it faults,
         * the EX_TYPE_RDMSR_SAFE fixup (which is handed the %[err] register)
         * resumes at label 2 and the XOR is skipped.
         * NOTE(review): the value the fixup leaves in *err (and in EDX:EAX)
         * is defined by the exception handler, not visible here.
         */
        asm volatile("1: rdmsr ; xor %[err],%[err]\n"
                     "2:\n\t"
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err])
                     : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
                     : "c" (msr));
        /* Trace the access together with the error indication. */
        if (tracepoint_enabled(read_msr))
                do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err);
        return EAX_EDX_VAL(val, low, high);
}

/*
 * Traced MSR write: store @high:@low into @msr via __wrmsr() and, when the
 * write_msr tracepoint is enabled, report the combined 64-bit value.
 *
 * Can be uninlined because referenced by paravirt.
 */
static inline void notrace
native_write_msr(unsigned int msr, u32 low, u32 high)
{
        __wrmsr(msr, low, high);

        if (tracepoint_enabled(write_msr)) {
                u64 written = ((u64)high << 32) | low;

                do_trace_write_msr(msr, written, 0);
        }
}

/*
 * Write @high:@low to @msr and return an error indication instead of
 * relying on the plain EX_TYPE_WRMSR handling used by __wrmsr().
 *
 * Can be uninlined because referenced by paravirt
 */
static inline int notrace
native_write_msr_safe(unsigned int msr, u32 low, u32 high)
{
        int err;

        /*
         * 'err' lives in EAX and shares that register with the 'low' input
         * (the "0" constraint ties 'low' to output operand 0). If WRMSR
         * completes, the XOR zeroes it; on a fault the EX_TYPE_WRMSR_SAFE
         * fixup resumes at label 2, skipping the XOR.
         * NOTE(review): the error value stored by the fixup is defined by
         * the exception handler, not visible here.
         */
        asm volatile("1: wrmsr ; xor %[err],%[err]\n"
                     "2:\n\t"
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err])
                     : [err] "=a" (err)
                     : "c" (msr), "0" (low), "d" (high)
                     : "memory");
        if (tracepoint_enabled(write_msr))
                do_trace_write_msr(msr, ((u64)high << 32 | low), err);
        return err;
}

/*
 * Out-of-line MSR accessors operating on an array of eight 32-bit register
 * values.
 * NOTE(review): the register-to-index mapping and the meaning of the return
 * value are defined by the implementation, which is not visible here.
 */
extern int rdmsr_safe_regs(u32 regs[8]);
extern int wrmsr_safe_regs(u32 regs[8]);

/**
 * rdtsc() - returns the current TSC without ordering constraints
 *
 * rdtsc() returns the result of RDTSC as a 64-bit integer.  The
 * only ordering constraint it supplies is the ordering implied by
 * "asm volatile": it will put the RDTSC in the place you expect.  The
 * CPU can and will speculatively execute that RDTSC, though, so the
 * results can be non-monotonic if compared on different CPUs.
 *
 * Return: the 64-bit counter value produced by RDTSC.
 */
static __always_inline unsigned long long rdtsc(void)
{
        DECLARE_ARGS(val, low, high);

        /* No inputs and no barrier: just the instruction and its outputs. */
        asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));

        return EAX_EDX_VAL(val, low, high);
}

/**
 * rdtsc_ordered() - read the current TSC in program order
 *
 * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer.
 * It is ordered like a load to a global in-memory counter.  It should
 * be impossible to observe non-monotonic rdtsc_ordered() behavior
 * across multiple CPUs as long as the TSC is synced.
 *
 * Return: the 64-bit counter value.
 */
static __always_inline unsigned long long rdtsc_ordered(void)
{
        DECLARE_ARGS(val, low, high);

        /*
         * The RDTSC instruction is not ordered relative to memory
         * access.  The Intel SDM and the AMD APM are both vague on this
         * point, but empirically an RDTSC instruction can be
         * speculatively executed before prior loads.  An RDTSC
         * immediately after an appropriate barrier appears to be
         * ordered as a normal load, that is, it provides the same
         * ordering guarantees as reading from a global memory location
         * that some other imaginary CPU is updating continuously with a
         * time stamp.
         *
         * Thus, use the preferred barrier on the respective CPU, aiming for
         * RDTSCP as the default.
         *
         * The alternatives are: plain "rdtsc" as the baseline,
         * "lfence; rdtsc" when X86_FEATURE_LFENCE_RDTSC is set, and
         * "rdtscp" when X86_FEATURE_RDTSCP is set.
         */
        asm volatile(ALTERNATIVE_2("rdtsc",
                                   "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC,
                                   "rdtscp", X86_FEATURE_RDTSCP)
                        : EAX_EDX_RET(val, low, high)
                        /* RDTSCP clobbers ECX with MSR_TSC_AUX. */
                        :: "ecx");

        return EAX_EDX_VAL(val, low, high);
}

/*
 * Read performance-monitoring counter @counter with RDPMC and return the
 * 64-bit result. The access is reported through the rdpmc tracepoint (with
 * 'failed' == 0) when that tracepoint is enabled.
 */
static inline unsigned long long native_read_pmc(int counter)
{
        DECLARE_ARGS(val, low, high);

        /* RDPMC takes the counter index in ECX and returns EDX:EAX. */
        asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
        if (tracepoint_enabled(rdpmc))
                do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0);
        return EAX_EDX_VAL(val, low, high);
}

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#include <linux/errno.h>
/*
 * Access to machine-specific registers (available on 586 and better only)
 * Note: the rd* operations modify the parameters directly (without using
 * pointer indirection), this allows gcc to optimize better
 */

/*
 * Traced read of @msr via native_read_msr(); @low receives bits 0-31 and
 * @high bits 32-63. Both are assigned directly and must be lvalues.
 */
#define rdmsr(msr, low, high)                                        \
do {                                                                \
        u64 __val = native_read_msr((msr));                        \
        (void)((low) = (u32)__val);                                \
        (void)((high) = (u32)(__val >> 32));                        \
} while (0)

/* Traced write of @high:@low to @msr; forwards to native_write_msr(). */
static inline void wrmsr(unsigned int msr, u32 low, u32 high)
{
        native_write_msr(msr, low, high);
}

/* Traced 64-bit read: assigns the full value of @msr to the lvalue @val. */
#define rdmsrl(msr, val)                        \
        ((val) = native_read_msr((msr)))

/*
 * Traced 64-bit write: split @val into its 32-bit halves and hand them to
 * native_write_msr().
 */
static inline void wrmsrl(unsigned int msr, u64 val)
{
        u32 lo = (u32)(val & 0xffffffffULL);
        u32 hi = (u32)(val >> 32);

        native_write_msr(msr, lo, hi);
}

/*
 * wrmsr with exception handling: forwards to native_write_msr_safe() and
 * returns its error indication.
 */
static inline int wrmsr_safe(unsigned int msr, u32 low, u32 high)
{
        return native_write_msr_safe(msr, low, high);
}

/*
 * rdmsr with exception handling.
 *
 * Reads @msr via native_read_msr_safe() and stores the low 32 bits through
 * the pointer @low and the high 32 bits through the pointer @high. The
 * statement expression evaluates to the error indication produced by
 * native_read_msr_safe().
 *
 * The pointer arguments are parenthesised before being dereferenced so that
 * arbitrary pointer expressions (e.g. "p + 1" or "c ? a : b") are handled
 * correctly instead of being misparsed as "*p + 1".
 */
#define rdmsr_safe(msr, low, high)                                \
({                                                                \
        int __err;                                                \
        u64 __val = native_read_msr_safe((msr), &__err);        \
        (*(low)) = (u32)__val;                                        \
        (*(high)) = (u32)(__val >> 32);                                \
        __err;                                                        \
})

/*
 * 64-bit rdmsr with exception handling: the value read from @msr is stored
 * in *@p and the error indication from native_read_msr_safe() is returned.
 */
static inline int rdmsrl_safe(unsigned int msr, unsigned long long *p)
{
        int rc;
        unsigned long long value = native_read_msr_safe(msr, &rc);

        *p = value;

        return rc;
}

/*
 * Read performance counter @counter via native_read_pmc(); @low receives
 * bits 0-31 and @high bits 32-63. Both must be lvalues.
 */
#define rdpmc(counter, low, high)                        \
do {                                                        \
        u64 _l = native_read_pmc((counter));                \
        (low)  = (u32)_l;                                \
        (high) = (u32)(_l >> 32);                        \
} while (0)

/* 64-bit variant of rdpmc(): assigns the whole counter value to the lvalue @val. */
#define rdpmcl(counter, val) ((val) = native_read_pmc(counter))

#endif        /* !CONFIG_PARAVIRT_XXL */

/*
 * 64-bit front end for __wrmsrns(): write @val to @msr using the
 * non-serializing WRMSRNS instruction, passing the value as two 32-bit
 * halves.
 */
static __always_inline void wrmsrns(u32 msr, u64 val)
{
        u32 lo = (u32)val;
        u32 hi = (u32)(val >> 32);

        __wrmsrns(msr, lo, hi);
}

/*
 * 64-bit version of wrmsr_safe(): splits @val into its low and high 32-bit
 * halves and returns whatever wrmsr_safe() reports.
 */
static inline int wrmsrl_safe(u32 msr, u64 val)
{
        u32 lo = (u32)val;
        u32 hi = (u32)(val >> 32);

        return wrmsr_safe(msr, lo, hi);
}

/*
 * Out-of-line helpers.
 * msrs_alloc()/msrs_free() hand out and release a per-CPU struct msr object
 * of the kind taken by rdmsr_on_cpus()/wrmsr_on_cpus().
 * msr_set_bit()/msr_clear_bit() take an MSR index and a bit number.
 * NOTE(review): return-value conventions are defined by the implementations,
 * which are not visible in this header.
 */
struct msr __percpu *msrs_alloc(void);
void msrs_free(struct msr __percpu *msrs);
int msr_set_bit(u32 msr, u8 bit);
int msr_clear_bit(u32 msr, u8 bit);

#ifdef CONFIG_SMP
/*
 * CONFIG_SMP: the *_on_cpu()/*_on_cpus() accessors are implemented out of
 * line. They take the target CPU (or a cpumask) in addition to the usual
 * MSR arguments; the !CONFIG_SMP branch below provides local inline
 * equivalents with the same signatures.
 */
int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q);
void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs);
void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs);
int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q);
int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
#else  /*  CONFIG_SMP  */
/*
 * !CONFIG_SMP: @cpu is ignored; the MSR is read locally with rdmsr() into
 * *@l (low half) and *@h (high half). Always returns 0.
 */
static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
        rdmsr(msr_no, *l, *h);
        return 0;
}
/*
 * !CONFIG_SMP: @cpu is ignored; @h:@l is written locally with wrmsr().
 * Always returns 0.
 */
static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
        wrmsr(msr_no, l, h);
        return 0;
}
/*
 * !CONFIG_SMP: @cpu is ignored; the 64-bit MSR value is read locally with
 * rdmsrl() into *@q. Always returns 0.
 */
static inline int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
{
        rdmsrl(msr_no, *q);
        return 0;
}
/*
 * !CONFIG_SMP: @cpu is ignored; the 64-bit value @q is written locally with
 * wrmsrl(). Always returns 0.
 */
static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
{
        wrmsrl(msr_no, q);
        return 0;
}
/*
 * !CONFIG_SMP: the mask @m is ignored. The MSR is read once, via
 * rdmsr_on_cpu(0, ...), into the l/h members of the current CPU's instance
 * of the per-CPU @msrs object.
 */
static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no,
                                struct msr __percpu *msrs)
{
        rdmsr_on_cpu(0, msr_no, raw_cpu_ptr(&msrs->l), raw_cpu_ptr(&msrs->h));
}
/*
 * !CONFIG_SMP: the mask @m is ignored. The l/h members of the current CPU's
 * instance of the per-CPU @msrs object are written to the MSR once, via
 * wrmsr_on_cpu(0, ...).
 */
static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no,
                                struct msr __percpu *msrs)
{
        wrmsr_on_cpu(0, msr_no, raw_cpu_read(msrs->l), raw_cpu_read(msrs->h));
}
/*
 * !CONFIG_SMP: @cpu is ignored; performs a local rdmsr_safe() into *@l and
 * *@h and returns its error indication.
 */
static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
                                    u32 *l, u32 *h)
{
        return rdmsr_safe(msr_no, l, h);
}
/*
 * !CONFIG_SMP: @cpu is ignored; performs a local wrmsr_safe() and returns
 * its error indication.
 */
static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
        return wrmsr_safe(msr_no, l, h);
}
/*
 * !CONFIG_SMP: @cpu is ignored; performs a local rdmsrl_safe() into *@q and
 * returns its error indication.
 */
static inline int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
{
        return rdmsrl_safe(msr_no, q);
}
/*
 * !CONFIG_SMP: @cpu is ignored; performs a local wrmsrl_safe() of @q and
 * returns its error indication.
 */
static inline int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
{
        return wrmsrl_safe(msr_no, q);
}
/*
 * !CONFIG_SMP: @cpu is ignored; forwards @regs to rdmsr_safe_regs() and
 * returns its result.
 */
static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
{
        return rdmsr_safe_regs(regs);
}
/*
 * !CONFIG_SMP: @cpu is ignored; forwards @regs to wrmsr_safe_regs() and
 * returns its result.
 */
static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
{
        return wrmsr_safe_regs(regs);
}
#endif  /* CONFIG_SMP */
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_MSR_H */











































































































































































































































































    1 

    1 


    1 





























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        Intel SMP support routines.
 *
 *        (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
 *        (c) 1998-99, 2000, 2009 Ingo Molnar <mingo@redhat.com>
 *      (c) 2002,2003 Andi Kleen, SuSE Labs.
 *
 *        i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
 */

#include <linux/init.h>

#include <linux/mm.h>
#include <linux/delay.h>
#include <linux/spinlock.h>
#include <linux/export.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/cache.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/gfp.h>
#include <linux/kexec.h>

#include <asm/mtrr.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/apic.h>
#include <asm/cpu.h>
#include <asm/idtentry.h>
#include <asm/nmi.h>
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
#include <asm/kexec.h>
#include <asm/reboot.h>

/*
 *        Some notes on x86 processor bugs affecting SMP operation:
 *
 *        Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
 *        The Linux implications for SMP are handled as follows:
 *
 *        Pentium III / [Xeon]
 *                None of the E1AP-E3AP errata are visible to the user.
 *
 *        E1AP.        see PII A1AP
 *        E2AP.        see PII A2AP
 *        E3AP.        see PII A3AP
 *
 *        Pentium II / [Xeon]
 *                None of the A1AP-A3AP errata are visible to the user.
 *
 *        A1AP.        see PPro 1AP
 *        A2AP.        see PPro 2AP
 *        A3AP.        see PPro 7AP
 *
 *        Pentium Pro
 *                None of 1AP-9AP errata are visible to the normal user,
 *        except occasional delivery of 'spurious interrupt' as trap #15.
 *        This is very rare and a non-problem.
 *
 *        1AP.        Linux maps APIC as non-cacheable
 *        2AP.        worked around in hardware
 *        3AP.        fixed in C0 and above steppings microcode update.
 *                Linux does not use excessive STARTUP_IPIs.
 *        4AP.        worked around in hardware
 *        5AP.        symmetric IO mode (normal Linux operation) not affected.
 *                'noapic' mode has vector 0xf filled out properly.
 *        6AP.        'noapic' mode might be affected - fixed in later steppings
 *        7AP.        We do not assume writes to the LVT deasserting IRQs
 *        8AP.        We do not enable low power mode (deep sleep) during MP bootup
 *        9AP.        We do not use mixed mode
 *
 *        Pentium
 *                There is a marginal case where REP MOVS on 100MHz SMP
 *        machines with B stepping processors can fail. XXX should provide
 *        an L1cache=Writethrough or L1cache=off option.
 *
 *                B stepping CPUs may hang. There are hardware work arounds
 *        for this. We warn about it in case your board doesn't have the work
 *        arounds. Basically that's so I can tell anyone with a B stepping
 *        CPU and SMP problems "tough".
 *
 *        Specific items [From Pentium Processor Specification Update]
 *
 *        1AP.        Linux doesn't use remote read
 *        2AP.        Linux doesn't trust APIC errors
 *        3AP.        We work around this
 *        4AP.        Linux never generated 3 interrupts of the same priority
 *                to cause a lost local interrupt.
 *        5AP.        Remote read is never used
 *        6AP.        not affected - worked around in hardware
 *        7AP.        not affected - worked around in hardware
 *        8AP.        worked around in hardware - we get explicit CS errors if not
 *        9AP.        only 'noapic' mode affected. Might generate spurious
 *                interrupts, we log only the first one and count the
 *                rest silently.
 *        10AP.        not affected - worked around in hardware
 *        11AP.        Linux reads the APIC between writes to avoid this, as per
 *                the documentation. Make sure you preserve this as it affects
 *                the C stepping chips too.
 *        12AP.        not affected - worked around in hardware
 *        13AP.        not affected - worked around in hardware
 *        14AP.        we always deassert INIT during bootup
 *        15AP.        not affected - worked around in hardware
 *        16AP.        not affected - worked around in hardware
 *        17AP.        not affected - worked around in hardware
 *        18AP.        not affected - worked around in hardware
 *        19AP.        not affected - worked around in BIOS
 *
 *        If this sounds worrying believe me these bugs are either ___RARE___,
 *        or are signal timing bugs worked around in hardware and there's
 *        about nothing of note with C stepping upwards.
 */

/*
 * stopping_cpu: id of the CPU that claimed the stop sequence in
 * native_stop_other_cpus(), or -1 while nobody has.
 * smp_no_nmi_ipi: set by the "nonmi_ipi" boot parameter; when true the NMI
 * fallback in native_stop_other_cpus() is not attempted.
 */
static atomic_t stopping_cpu = ATOMIC_INIT(-1);
static bool smp_no_nmi_ipi = false;

/*
 * NMI handler used for the NMI-based shutdown of other CPUs.
 *
 * Every CPU except the one recorded in stopping_cpu disables virtualization
 * and stops itself. The NMI is always reported as handled.
 */
static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
{
        /*
         * The handler is registered on the stopping CPU as well; a spurious
         * NMI there must not take that CPU down.
         */
        if (raw_smp_processor_id() != atomic_read(&stopping_cpu)) {
                cpu_emergency_disable_virtualization();
                stop_this_cpu(NULL);
        }

        return NMI_HANDLED;
}

/*
 * Handler for the reboot system vector: it acknowledges the interrupt,
 * disables virtualization on the current CPU and then stops the current CPU.
 * NOTE(review): presumably this is what the REBOOT_VECTOR IPI sent by
 * native_stop_other_cpus() ends up invoking on the other CPUs - confirm
 * against the vector setup.
 */
DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
{
        apic_eoi();
        cpu_emergency_disable_virtualization();
        stop_this_cpu(NULL);
}

/*
 * Register smp_stop_nmi_callback() as a local NMI handler named "smp_stop",
 * using NMI_FLAG_FIRST. Returns the result of register_nmi_handler(); the
 * caller treats zero as success.
 */
static int register_stop_handler(void)
{
        return register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
                                    NMI_FLAG_FIRST, "smp_stop");
}

/*
 * Bring all other online CPUs to a halt.
 *
 * @wait: if non-zero, the final (NMI) wait phase has no timeout and spins
 *        until every CPU has cleared itself from cpus_stop_mask.
 *
 * Returns immediately when reboot_force is set, or when another CPU has
 * already claimed stopping_cpu. Otherwise the other CPUs are first asked to
 * stop via the REBOOT_VECTOR IPI and, if some do not report within a second,
 * via NMI (unless disabled with "nonmi_ipi" or the handler cannot be
 * registered). Finally the local APIC is disabled and machine check
 * reporting is cleared on this CPU.
 */
static void native_stop_other_cpus(int wait)
{
        unsigned int this_cpu;
        unsigned long flags, timeout;
        /*
         * atomic_try_cmpxchg() takes an 'int *' for the expected value, so
         * keep old_cpu signed to match instead of passing 'unsigned int *'.
         */
        int old_cpu;

        if (reboot_force)
                return;

        /* Only proceed if this is the first CPU to reach this code */
        old_cpu = -1;
        this_cpu = smp_processor_id();
        if (!atomic_try_cmpxchg(&stopping_cpu, &old_cpu, this_cpu))
                return;

        /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */
        if (kexec_in_progress)
                smp_kick_mwait_play_dead();

        /*
         * 1) Send an IPI on the reboot vector to all other CPUs.
         *
         *    The other CPUs should react on it after leaving critical
         *    sections and re-enabling interrupts. They might still hold
         *    locks, but there is nothing which can be done about that.
         *
         * 2) Wait for all other CPUs to report that they reached the
         *    HLT loop in stop_this_cpu()
         *
         * 3) If #2 timed out send an NMI to the CPUs which did not
         *    yet report
         *
         * 4) Wait for all other CPUs to report that they reached the
         *    HLT loop in stop_this_cpu()
         *
         * #3 can obviously race against a CPU reaching the HLT loop late.
         * That CPU will have reported already and the "have all CPUs
         * reached HLT" condition will be true despite the fact that the
         * other CPU is still handling the NMI. Again, there is no
         * protection against that as "disabled" APICs still respond to
         * NMIs.
         */
        cpumask_copy(&cpus_stop_mask, cpu_online_mask);
        cpumask_clear_cpu(this_cpu, &cpus_stop_mask);

        if (!cpumask_empty(&cpus_stop_mask)) {
                apic_send_IPI_allbutself(REBOOT_VECTOR);

                /*
                 * Don't wait longer than a second for IPI completion. The
                 * wait request is not checked here because that would
                 * prevent an NMI shutdown attempt in case that not all
                 * CPUs reach shutdown state.
                 */
                timeout = USEC_PER_SEC;
                while (!cpumask_empty(&cpus_stop_mask) && timeout--)
                        udelay(1);
        }

        /* if the REBOOT_VECTOR didn't work, try with the NMI */
        if (!cpumask_empty(&cpus_stop_mask)) {
                /*
                 * If NMI IPI is enabled, try to register the stop handler
                 * and send the IPI. In any case try to wait for the other
                 * CPUs to stop.
                 */
                if (!smp_no_nmi_ipi && !register_stop_handler()) {
                        unsigned int cpu;

                        pr_emerg("Shutting down cpus with NMI\n");

                        for_each_cpu(cpu, &cpus_stop_mask)
                                __apic_send_IPI(cpu, NMI_VECTOR);
                }
                /*
                 * Don't wait longer than 10 ms if the caller didn't
                 * request it. If wait is true, the machine hangs here if
                 * one or more CPUs do not reach shutdown state.
                 */
                timeout = USEC_PER_MSEC * 10;
                while (!cpumask_empty(&cpus_stop_mask) && (wait || timeout--))
                        udelay(1);
        }

        local_irq_save(flags);
        disable_local_APIC();
        mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
        local_irq_restore(flags);

        /*
         * Ensure that the cpus_stop_mask cache lines are invalidated on
         * the other CPUs. See comment vs. SME in stop_this_cpu().
         */
        cpumask_clear(&cpus_stop_mask);
}

/*
 * Reschedule call back. KVM uses this interrupt to force a cpu out of
 * guest mode.
 *
 * The handler acknowledges the interrupt, bumps the per-CPU
 * irq_resched_count statistic and calls scheduler_ipi(), bracketed by the
 * reschedule entry/exit trace events.
 */
DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi)
{
        apic_eoi();
        trace_reschedule_entry(RESCHEDULE_VECTOR);
        inc_irq_stat(irq_resched_count);
        scheduler_ipi();
        trace_reschedule_exit(RESCHEDULE_VECTOR);
}

/*
 * Handler for CALL_FUNCTION_VECTOR: acknowledges the interrupt, accounts it
 * in irq_call_count and runs generic_smp_call_function_interrupt(),
 * bracketed by the call_function entry/exit trace events.
 */
DEFINE_IDTENTRY_SYSVEC(sysvec_call_function)
{
        apic_eoi();
        trace_call_function_entry(CALL_FUNCTION_VECTOR);
        inc_irq_stat(irq_call_count);
        generic_smp_call_function_interrupt();
        trace_call_function_exit(CALL_FUNCTION_VECTOR);
}

/*
 * Handler for CALL_FUNCTION_SINGLE_VECTOR: acknowledges the interrupt,
 * accounts it in irq_call_count (the same counter sysvec_call_function
 * uses) and runs generic_smp_call_function_single_interrupt(), bracketed by
 * the call_function_single entry/exit trace events.
 */
DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single)
{
        apic_eoi();
        trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
        inc_irq_stat(irq_call_count);
        generic_smp_call_function_single_interrupt();
        trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
}

/*
 * Handler for the "nonmi_ipi" boot parameter: sets smp_no_nmi_ipi so that
 * native_stop_other_cpus() does not fall back to NMIs. The option's
 * argument string @str is not examined.
 * NOTE(review): returning 1 presumably marks the option as handled, per the
 * __setup() convention - confirm.
 */
static int __init nonmi_ipi_setup(char *str)
{
        smp_no_nmi_ipi = true;
        return 1;
}

__setup("nonmi_ipi", nonmi_ipi_setup);

/*
 * SMP operations table, populated here with the native_* implementations
 * (native_stop_other_cpus() is the one defined in this file). The
 * crash_stop_other_cpus hook is only present with CONFIG_CRASH_DUMP.
 * The table is exported to GPL modules.
 */
struct smp_ops smp_ops = {
        .smp_prepare_boot_cpu        = native_smp_prepare_boot_cpu,
        .smp_prepare_cpus        = native_smp_prepare_cpus,
        .smp_cpus_done                = native_smp_cpus_done,

        .stop_other_cpus        = native_stop_other_cpus,
#if defined(CONFIG_CRASH_DUMP)
        .crash_stop_other_cpus        = kdump_nmi_shootdown_cpus,
#endif
        .smp_send_reschedule        = native_smp_send_reschedule,

        .kick_ap_alive                = native_kick_ap,
        .cpu_disable                = native_cpu_disable,
        .play_dead                = native_play_dead,

        .send_call_func_ipi        = native_send_call_func_ipi,
        .send_call_func_single_ipi = native_send_call_func_single_ipi,
};
EXPORT_SYMBOL_GPL(smp_ops);

















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
/* Copyright (c) 2002-2007 Volkswagen Group Electronic Research
 * Copyright (c) 2017 Pengutronix, Marc Kleine-Budde <kernel@pengutronix.de>
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of Volkswagen nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * Alternatively, provided that this notice is retained in full, this
 * software may be distributed under the terms of the GNU General
 * Public License ("GPL") version 2, in which case the provisions of the
 * GPL apply INSTEAD OF those given above.
 *
 * The provided data structures and external interfaces from this code
 * are not restricted to be used by modules with a GPL compatible license.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 */

#ifndef CAN_ML_H
#define CAN_ML_H

#include <linux/can.h>
#include <linux/list.h>
#include <linux/netdevice.h>

/*
 * Sizes of the receive-list arrays in struct can_dev_rcv_lists:
 * the SFF array has 2^CAN_SFF_ID_BITS slots, the EFF array has
 * 2^CAN_EFF_RCV_HASH_BITS (= 1024) slots.
 */
#define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS)
#define CAN_EFF_RCV_HASH_BITS 10
#define CAN_EFF_RCV_ARRAY_SZ (1 << CAN_EFF_RCV_HASH_BITS)

/*
 * Indices into can_dev_rcv_lists::rx[]; RX_MAX is the number of lists.
 * NOTE(review): the matching rule behind each index (error frames, all
 * frames, filtered, inverted filter) is inferred from the names - confirm.
 */
enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_MAX };

/*
 * Per-device receive lists: RX_MAX general hlists indexed by the RX_* enum,
 * one hlist per SFF array slot, one per EFF array slot, and an entry
 * counter.
 * NOTE(review): how @entries is maintained is not visible in this header.
 */
struct can_dev_rcv_lists {
        struct hlist_head rx[RX_MAX];
        struct hlist_head rx_sff[CAN_SFF_RCV_ARRAY_SZ];
        struct hlist_head rx_eff[CAN_EFF_RCV_ARRAY_SZ];
        int entries;
};

/*
 * CAN mid-layer private data attached to a net_device (see
 * can_get_ml_priv()/can_set_ml_priv()): the device's receive lists and,
 * when CAN_J1939 is defined, a pointer to the J1939 private data.
 * NOTE(review): the guard tests the plain macro CAN_J1939 rather than a
 * CONFIG_ symbol - confirm that this is intended.
 */
struct can_ml_priv {
        struct can_dev_rcv_lists dev_rcv_lists;
#ifdef CAN_J1939
        struct j1939_priv *j1939_priv;
#endif
};

/*
 * Return the CAN mid-layer private data of @dev, looked up through
 * netdev_get_ml_priv() with type ML_PRIV_CAN.
 * NOTE(review): presumably NULL when @dev carries no CAN ml_priv - confirm
 * against netdev_get_ml_priv().
 */
static inline struct can_ml_priv *can_get_ml_priv(struct net_device *dev)
{
        return netdev_get_ml_priv(dev, ML_PRIV_CAN);
}

/*
 * Attach @ml_priv to @dev as its CAN mid-layer private data, via
 * netdev_set_ml_priv() with type ML_PRIV_CAN.
 */
static inline void can_set_ml_priv(struct net_device *dev,
                                   struct can_ml_priv *ml_priv)
{
        netdev_set_ml_priv(dev, ml_priv, ML_PRIV_CAN);
}

#endif /* CAN_ML_H */
























































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli <andrea@suse.de>
  (C) 2002  David Woodhouse <dwmw2@infradead.org>
  (C) 2012  Michel Lespinasse <walken@google.com>


  linux/include/linux/rbtree_augmented.h
*/

#ifndef _LINUX_RBTREE_AUGMENTED_H
#define _LINUX_RBTREE_AUGMENTED_H

#include <linux/compiler.h>
#include <linux/rbtree.h>
#include <linux/rcupdate.h>

/*
 * Please note - only struct rb_augment_callbacks and the prototypes for
 * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
 * The rest are implementation details you are not expected to depend on.
 *
 * See Documentation/core-api/rbtree.rst for documentation and samples.
 */

/*
 * Callbacks used to keep per-subtree ("augmented") data consistent while
 * the tree is modified.  The RB_DECLARE_CALLBACKS() template in this header
 * generates one instance of this structure together with its three
 * functions.
 */
struct rb_augment_callbacks {
        /*
         * Recompute augmented data starting at @node and walking towards
         * the root; @stop is the node at which the walk ends (not
         * processed).  Callers in this header pass NULL as @stop to walk
         * all the way up.
         */
        void (*propagate)(struct rb_node *node, struct rb_node *stop);
        /*
         * Transfer the augmented data of @old to @new.  Called by
         * __rb_erase_augmented() when a successor takes the place of the
         * erased node.
         */
        void (*copy)(struct rb_node *old, struct rb_node *new);
        /*
         * Rotation hook; this is the callback handed to
         * __rb_insert_augmented() and __rb_erase_color().  The template
         * implementation copies @old's augmented data to @new and then
         * recomputes @old.
         */
        void (*rotate)(struct rb_node *old, struct rb_node *new);
};

/*
 * Insertion fixup, implemented outside this header.  rb_insert_augmented()
 * is the inline wrapper that calls it, passing augment->rotate as
 * @augment_rotate.
 */
extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
        void (*augment_rotate)(struct rb_node *old, struct rb_node *new));

/*
 * rb_insert_augmented - rebalance an augmented rbtree after linking @node
 * @node:    node that has just been linked into the tree
 * @root:    root of the tree @node was linked into
 * @augment: augmentation callbacks of the tree
 *
 * Insertion protocol: the caller first updates the augmented information
 * on the path leading to the new node, links the node with rb_link_node()
 * as usual, and then calls this function in place of rb_insert_color().
 * Whenever rebalancing is needed, the user supplied rotate callback is
 * invoked so that the augmented information of the affected subtrees can
 * be brought up to date.
 */
static inline void
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
                    const struct rb_augment_callbacks *augment)
{
        void (*on_rotate)(struct rb_node *, struct rb_node *) = augment->rotate;

        __rb_insert_augmented(node, root, on_rotate);
}

/*
 * rb_insert_augmented_cached - rb_insert_augmented() for a cached root
 * @node:    node that has just been linked into the tree
 * @root:    cached root (plain root plus leftmost pointer)
 * @newleft: true when @node is the new leftmost node of the tree
 * @augment: augmentation callbacks of the tree
 *
 * Records @node as the cached leftmost node when @newleft is set, then
 * rebalances the embedded plain root.
 */
static inline void
rb_insert_augmented_cached(struct rb_node *node,
                           struct rb_root_cached *root, bool newleft,
                           const struct rb_augment_callbacks *augment)
{
        struct rb_root *plain_root = &root->rb_root;

        if (newleft) {
                root->rb_leftmost = node;
        }

        rb_insert_augmented(node, plain_root, augment);
}

/*
 * rb_add_augmented_cached - insert @node into an augmented, cached rbtree
 * @node:    node to insert
 * @tree:    cached root to insert into
 * @less:    ordering predicate; a true result sends @node into the left
 *           subtree of the node it was compared against
 * @augment: augmentation callbacks of the tree
 *
 * Returns @node when it became the leftmost node of the tree, NULL
 * otherwise.
 */
static __always_inline struct rb_node *
rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree,
                        bool (*less)(struct rb_node *, const struct rb_node *),
                        const struct rb_augment_callbacks *augment)
{
        struct rb_node **slot = &tree->rb_root.rb_node;
        struct rb_node *above = NULL;
        bool is_leftmost = true;

        /* Walk down until an empty child slot is reached. */
        for (;;) {
                struct rb_node *cur = *slot;

                if (!cur)
                        break;

                above = cur;
                if (less(node, cur)) {
                        slot = &cur->rb_left;
                        continue;
                }

                /* A single right turn rules @node out as new leftmost. */
                slot = &cur->rb_right;
                is_leftmost = false;
        }

        rb_link_node(node, above, slot);
        augment->propagate(above, NULL); /* suboptimal */
        rb_insert_augmented_cached(node, tree, is_leftmost, augment);

        if (!is_leftmost)
                return NULL;
        return node;
}

/*
 * Template for declaring augmented rbtree callbacks (generic case)
 *
 * RBSTATIC:    'static' or empty
 * RBNAME:      name of the rb_augment_callbacks structure
 * RBSTRUCT:    struct type of the tree nodes
 * RBFIELD:     name of struct rb_node field within RBSTRUCT
 * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
 * RBCOMPUTE:   name of function that recomputes the RBAUGMENTED data
 *
 * RBCOMPUTE is invoked as RBCOMPUTE(node, exit) with an RBSTRUCT pointer
 * and a bool; a true return value ends the upward walk in the generated
 * propagate function.
 *
 * The expansion defines three functions and one structure:
 *
 * RBNAME_propagate(rb, stop): starting at @rb, call RBCOMPUTE(node, true)
 *   on each node and move to its rb_parent() until @stop is reached or
 *   RBCOMPUTE returns true.
 * RBNAME_copy(rb_old, rb_new): assign the RBAUGMENTED field of the old
 *   node to the new node.
 * RBNAME_rotate(rb_old, rb_new): assign the RBAUGMENTED field of the old
 *   node to the new node, then call RBCOMPUTE(old, false).
 * RBNAME: a const struct rb_augment_callbacks (with RBSTATIC linkage)
 *   pointing at the three functions above.
 */

#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,                                \
                             RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE)        \
static inline void                                                        \
RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop)                \
{                                                                        \
        while (rb != stop) {                                                \
                RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD);        \
                if (RBCOMPUTE(node, true))                                \
                        break;                                                \
                rb = rb_parent(&node->RBFIELD);                                \
        }                                                                \
}                                                                        \
static inline void                                                        \
RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)                \
{                                                                        \
        RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);                \
        RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);                \
        new->RBAUGMENTED = old->RBAUGMENTED;                                \
}                                                                        \
static void                                                                \
RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)        \
{                                                                        \
        RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);                \
        RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);                \
        new->RBAUGMENTED = old->RBAUGMENTED;                                \
        RBCOMPUTE(old, false);                                                \
}                                                                        \
RBSTATIC const struct rb_augment_callbacks RBNAME = {                        \
        .propagate = RBNAME ## _propagate,                                \
        .copy = RBNAME ## _copy,                                        \
        .rotate = RBNAME ## _rotate                                        \
};

/*
 * Template for declaring augmented rbtree callbacks,
 * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
 *
 * RBSTATIC:    'static' or empty
 * RBNAME:      name of the rb_augment_callbacks structure
 * RBSTRUCT:    struct type of the tree nodes
 * RBFIELD:     name of struct rb_node field within RBSTRUCT
 * RBTYPE:      type of the RBAUGMENTED field
 * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
 * RBCOMPUTE:   name of function that returns the per-node RBTYPE scalar
 *
 * The expansion first defines RBNAME_compute_max(node, exit), which takes
 * the maximum of RBCOMPUTE(node) and the RBAUGMENTED values of the left
 * and right children (when present).  If @exit is true and the node
 * already stores that maximum, it returns true without writing; otherwise
 * it stores the maximum in node->RBAUGMENTED and returns false.
 *
 * RBNAME_compute_max is then handed to RB_DECLARE_CALLBACKS() as its
 * RBCOMPUTE argument, so the values are compared with '>' and '==' on
 * RBTYPE.
 */

#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD,              \
                                 RBTYPE, RBAUGMENTED, RBCOMPUTE)              \
static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit)              \
{                                                                              \
        RBSTRUCT *child;                                                      \
        RBTYPE max = RBCOMPUTE(node);                                              \
        if (node->RBFIELD.rb_left) {                                              \
                child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD);   \
                if (child->RBAUGMENTED > max)                                      \
                        max = child->RBAUGMENTED;                              \
        }                                                                      \
        if (node->RBFIELD.rb_right) {                                              \
                child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD);  \
                if (child->RBAUGMENTED > max)                                      \
                        max = child->RBAUGMENTED;                              \
        }                                                                      \
        if (exit && node->RBAUGMENTED == max)                                      \
                return true;                                                      \
        node->RBAUGMENTED = max;                                              \
        return false;                                                              \
}                                                                              \
RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,                                              \
                     RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)


/*
 * Node colours.  The colour lives in bit 0 of the packed
 * __rb_parent_color word of a node; the parent pointer is that same word
 * with its two low bits cleared.
 */
#define        RB_RED                0
#define        RB_BLACK        1

/*
 * Helpers operating on a packed parent/colour word @pc.
 *
 * The argument is parenthesised in every expansion so that an expression
 * argument containing a lower-precedence operator (e.g. 'a | b' or
 * 'c ? x : y') is masked as a whole instead of only its last operand.
 */
#define __rb_parent(pc)    ((struct rb_node *)((pc) & ~3))

#define __rb_color(pc)     ((pc) & 1)
#define __rb_is_black(pc)  __rb_color(pc)
#define __rb_is_red(pc)    (!__rb_color(pc))

/* Same queries, taking a node pointer instead of the packed word. */
#define rb_color(rb)       __rb_color((rb)->__rb_parent_color)
#define rb_is_red(rb)      __rb_is_red((rb)->__rb_parent_color)
#define rb_is_black(rb)    __rb_is_black((rb)->__rb_parent_color)

/*
 * Make @p the parent of @rb while keeping the colour bit currently stored
 * in @rb's packed parent/colour word.
 */
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
        const unsigned long kept_colour = rb_color(rb);

        rb->__rb_parent_color = (unsigned long)p + kept_colour;
}

/*
 * Overwrite @rb's packed parent/colour word with parent @p and colour
 * @color (RB_RED or RB_BLACK) in a single store.
 */
static inline void rb_set_parent_color(struct rb_node *rb,
                                       struct rb_node *p, int color)
{
        unsigned long packed = (unsigned long)p;

        packed += color;
        rb->__rb_parent_color = packed;
}

/*
 * Replace @old by @new in the link that points down to it: the matching
 * child pointer of @parent, or the root pointer of @root when @parent is
 * NULL.  The store is done with WRITE_ONCE().
 */
static inline void
__rb_change_child(struct rb_node *old, struct rb_node *new,
                  struct rb_node *parent, struct rb_root *root)
{
        /* No parent: @old was the root of the tree. */
        if (!parent) {
                WRITE_ONCE(root->rb_node, new);
                return;
        }

        if (parent->rb_left == old) {
                WRITE_ONCE(parent->rb_left, new);
        } else {
                WRITE_ONCE(parent->rb_right, new);
        }
}

/*
 * Same as __rb_change_child(), except that the link is updated with
 * rcu_assign_pointer() instead of WRITE_ONCE().
 */
static inline void
__rb_change_child_rcu(struct rb_node *old, struct rb_node *new,
                      struct rb_node *parent, struct rb_root *root)
{
        /* No parent: @old was the root of the tree. */
        if (!parent) {
                rcu_assign_pointer(root->rb_node, new);
                return;
        }

        if (parent->rb_left == old) {
                rcu_assign_pointer(parent->rb_left, new);
        } else {
                rcu_assign_pointer(parent->rb_right, new);
        }
}

/*
 * Rebalancing step of an erase, implemented outside this header.
 * rb_erase_augmented() calls it with the node returned by
 * __rb_erase_augmented() as @parent and augment->rotate as @augment_rotate,
 * and only when that returned node is non-NULL.
 */
extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
        void (*augment_rotate)(struct rb_node *old, struct rb_node *new));

/*
 * __rb_erase_augmented - unlink @node from the tree rooted at @root
 * @node:    node to remove
 * @root:    root of the tree containing @node
 * @augment: augmentation callbacks; ->copy and ->propagate are used here
 *
 * Rewires the tree so that @node is no longer reachable and updates the
 * augmented data through @augment.  No rebalancing is done here: the
 * return value is the node that the caller must hand to
 * __rb_erase_color(), or NULL when no rebalancing is required.
 */
static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
                     const struct rb_augment_callbacks *augment)
{
        struct rb_node *child = node->rb_right;
        struct rb_node *tmp = node->rb_left;
        struct rb_node *parent, *rebalance;
        unsigned long pc;

        if (!tmp) {
                /*
                 * Case 1: node to erase has no more than 1 child (easy!)
                 *
                 * Note that if there is one child it must be red due to 5)
                 * and node must be black due to 4). We adjust colors locally
                 * so as to bypass __rb_erase_color() later on.
                 */
                pc = node->__rb_parent_color;
                parent = __rb_parent(pc);
                __rb_change_child(node, child, parent, root);
                if (child) {
                        /* The child inherits node's parent and colour. */
                        child->__rb_parent_color = pc;
                        rebalance = NULL;
                } else
                        rebalance = __rb_is_black(pc) ? parent : NULL;
                /* Augmented data is recomputed from node's old parent up. */
                tmp = parent;
        } else if (!child) {
                /* Still case 1, but this time the child is node->rb_left */
                tmp->__rb_parent_color = pc = node->__rb_parent_color;
                parent = __rb_parent(pc);
                __rb_change_child(node, tmp, parent, root);
                rebalance = NULL;
                tmp = parent;
        } else {
                struct rb_node *successor = child, *child2;

                tmp = child->rb_left;
                if (!tmp) {
                        /*
                         * Case 2: node's successor is its right child
                         *
                         *    (n)          (s)
                         *    / \          / \
                         *  (x) (s)  ->  (x) (c)
                         *        \
                         *        (c)
                         */
                        parent = successor;
                        child2 = successor->rb_right;

                        augment->copy(node, successor);
                } else {
                        /*
                         * Case 3: node's successor is leftmost under
                         * node's right child subtree
                         *
                         *    (n)          (s)
                         *    / \          / \
                         *  (x) (y)  ->  (x) (y)
                         *      /            /
                         *    (p)          (p)
                         *    /            /
                         *  (s)          (c)
                         *    \
                         *    (c)
                         */
                        do {
                                parent = successor;
                                successor = tmp;
                                tmp = tmp->rb_left;
                        } while (tmp);
                        /*
                         * Detach the successor: its right child takes its
                         * place under parent, and node's right subtree
                         * becomes the successor's right subtree.
                         */
                        child2 = successor->rb_right;
                        WRITE_ONCE(parent->rb_left, child2);
                        WRITE_ONCE(successor->rb_right, child);
                        rb_set_parent(child, successor);

                        augment->copy(node, successor);
                        /* Recompute from the old parent up to the successor. */
                        augment->propagate(parent, successor);
                }

                /* node->rb_left is non-NULL in this branch. */
                tmp = node->rb_left;
                WRITE_ONCE(successor->rb_left, tmp);
                rb_set_parent(tmp, successor);

                /* Hook the successor in where node used to be. */
                pc = node->__rb_parent_color;
                tmp = __rb_parent(pc);
                __rb_change_child(node, successor, tmp, root);

                if (child2) {
                        rb_set_parent_color(child2, parent, RB_BLACK);
                        rebalance = NULL;
                } else {
                        /*
                         * Must be evaluated before the successor's own
                         * parent/colour word is overwritten below.
                         */
                        rebalance = rb_is_black(successor) ? parent : NULL;
                }
                successor->__rb_parent_color = pc;
                tmp = successor;
        }

        /* Final upward recomputation, all the way to the root. */
        augment->propagate(tmp, NULL);
        return rebalance;
}

/*
 * rb_erase_augmented - remove @node from an augmented rbtree
 * @node:    node to remove
 * @root:    root of the tree containing @node
 * @augment: augmentation callbacks of the tree
 *
 * Unlinks the node and, when the unlink step reports a node to rebalance
 * from, runs the colour fixup with the tree's rotate callback.
 */
static __always_inline void
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
                   const struct rb_augment_callbacks *augment)
{
        struct rb_node *fixup_from;

        fixup_from = __rb_erase_augmented(node, root, augment);
        if (!fixup_from)
                return;

        __rb_erase_color(fixup_from, root, augment->rotate);
}

/*
 * rb_erase_augmented_cached - rb_erase_augmented() for a cached root
 * @node:    node to remove
 * @root:    cached root containing @node
 * @augment: augmentation callbacks of the tree
 *
 * When @node is the cached leftmost node, the cache is advanced to
 * rb_next(@node) before the node is erased from the embedded plain root.
 */
static __always_inline void
rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
                          const struct rb_augment_callbacks *augment)
{
        struct rb_root *plain_root = &root->rb_root;

        if (node == root->rb_leftmost) {
                root->rb_leftmost = rb_next(node);
        }

        rb_erase_augmented(node, plain_root, augment);
}

#endif        /* _LINUX_RBTREE_AUGMENTED_H */














































    1 





























































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM block

#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_BLOCK_H

#include <linux/blktrace_api.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/tracepoint.h>

/*
 * Size, in chars, of the rwbs[] array recorded by the events in this file;
 * the array is filled by blk_fill_rwbs().
 */
#define RWBS_LEN        8

#ifdef CONFIG_BUFFER_HEAD
/*
 * Event class shared by the buffer_head tracepoints below.
 *
 * Records the device number of the buffer's block device
 * (bh->b_bdev->bd_dev), bh->b_blocknr (stored in the field named "sector")
 * and bh->b_size.  Output format: "major,minor sector=<n> size=<n>".
 */
DECLARE_EVENT_CLASS(block_buffer,

        TP_PROTO(struct buffer_head *bh),

        TP_ARGS(bh),

        TP_STRUCT__entry (
                __field(  dev_t,        dev                        )
                __field(  sector_t,        sector                        )
                __field(  size_t,        size                        )
        ),

        TP_fast_assign(
                __entry->dev                = bh->b_bdev->bd_dev;
                __entry->sector                = bh->b_blocknr;
                __entry->size                = bh->b_size;
        ),

        TP_printk("%d,%d sector=%llu size=%zu",
                MAJOR(__entry->dev), MINOR(__entry->dev),
                (unsigned long long)__entry->sector, __entry->size
        )
);

/**
 * block_touch_buffer - mark a buffer accessed
 * @bh: buffer_head being touched
 *
 * Called from touch_buffer().
 *
 * Instance of the block_buffer event class: records the device, block
 * number and size of @bh.
 */
DEFINE_EVENT(block_buffer, block_touch_buffer,

        TP_PROTO(struct buffer_head *bh),

        TP_ARGS(bh)
);

/**
 * block_dirty_buffer - mark a buffer dirty
 * @bh: buffer_head being dirtied
 *
 * Called from mark_buffer_dirty().
 *
 * Instance of the block_buffer event class: records the device, block
 * number and size of @bh.
 */
DEFINE_EVENT(block_buffer, block_dirty_buffer,

        TP_PROTO(struct buffer_head *bh),

        TP_ARGS(bh)
);
#endif /* CONFIG_BUFFER_HEAD */

/**
 * block_rq_requeue - place block IO request back on a queue
 * @rq: block IO operation request
 *
 * The block operation request @rq is being placed back into its request
 * queue (@rq->q).  For some reason the request was not completed and needs
 * to be put back in the queue.
 *
 * Recorded: device number of @rq->q->disk (0 when the queue has no disk),
 * the trace sector and trace sector count of @rq, and the rwbs string
 * derived from @rq->cmd_flags.  The "cmd" field is a one-char dynamic array
 * that is always set to the empty string, and the trailing "[%d]" of the
 * output is always printed as 0.
 */
TRACE_EVENT(block_rq_requeue,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq),

        TP_STRUCT__entry(
                __field(  dev_t,        dev                        )
                __field(  sector_t,        sector                        )
                __field(  unsigned int,        nr_sector                )
                __array(  char,                rwbs,        RWBS_LEN        )
                __dynamic_array( char,        cmd,        1                )
        ),

        TP_fast_assign(
                __entry->dev           = rq->q->disk ? disk_devt(rq->q->disk) : 0;
                __entry->sector    = blk_rq_trace_sector(rq);
                __entry->nr_sector = blk_rq_trace_nr_sectors(rq);

                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
                __get_str(cmd)[0] = '\0';
        ),

        TP_printk("%d,%d %s (%s) %llu + %u [%d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->rwbs, __get_str(cmd),
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, 0)
);

/*
 * Event class shared by block_rq_complete and block_rq_error.
 *
 * Records the device number of rq->q->disk (0 when the queue has no disk),
 * blk_rq_pos(rq) as the sector, nr_bytes >> 9 (i.e. nr_bytes / 512) as the
 * sector count, the errno translation of @error, and the rwbs string
 * derived from rq->cmd_flags.  The "cmd" field is a one-char dynamic array
 * that is always set to the empty string.
 */
DECLARE_EVENT_CLASS(block_rq_completion,

        TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes),

        TP_ARGS(rq, error, nr_bytes),

        TP_STRUCT__entry(
                __field(  dev_t,        dev                        )
                __field(  sector_t,        sector                        )
                __field(  unsigned int,        nr_sector                )
                __field(  int        ,        error                        )
                __array(  char,                rwbs,        RWBS_LEN        )
                __dynamic_array( char,        cmd,        1                )
        ),

        TP_fast_assign(
                __entry->dev           = rq->q->disk ? disk_devt(rq->q->disk) : 0;
                __entry->sector    = blk_rq_pos(rq);
                __entry->nr_sector = nr_bytes >> 9;
                __entry->error     = blk_status_to_errno(error);

                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
                __get_str(cmd)[0] = '\0';
        ),

        TP_printk("%d,%d %s (%s) %llu + %u [%d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->rwbs, __get_str(cmd),
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->error)
);

/**
 * block_rq_complete - block IO operation completed by device driver
 * @rq: block operations request
 * @error: status code
 * @nr_bytes: number of completed bytes
 *
 * The block_rq_complete tracepoint event indicates that some portion
 * of operation request has been completed by the device driver.  If
 * the @rq->bio is %NULL, then there is absolutely no additional work to
 * do for the request. If @rq->bio is non-NULL then there is
 * additional work required to complete the request.
 *
 * Instance of the block_rq_completion event class.
 */
DEFINE_EVENT(block_rq_completion, block_rq_complete,

        TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes),

        TP_ARGS(rq, error, nr_bytes)
);

/**
 * block_rq_error - block IO operation error reported by device driver
 * @rq: block operations request
 * @error: status code
 * @nr_bytes: number of completed bytes
 *
 * The block_rq_error tracepoint event indicates that some portion
 * of operation request has failed as reported by the device driver.
 *
 * Instance of the block_rq_completion event class, so it records the same
 * fields and prints in the same format as block_rq_complete.
 */
DEFINE_EVENT(block_rq_completion, block_rq_error,

        TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes),

        TP_ARGS(rq, error, nr_bytes)
);

/*
 * Event class shared by the request tracepoints below (block_rq_insert,
 * block_rq_issue, block_rq_merge, block_io_start, block_io_done).
 *
 * Records the device number of rq->q->disk (0 when the queue has no disk),
 * the trace sector and trace sector count of the request, blk_rq_bytes(rq),
 * the rwbs string derived from rq->cmd_flags, and the comm of the task that
 * is current when the event fires.  The "cmd" field is a one-char dynamic
 * array that is always set to the empty string.
 */
DECLARE_EVENT_CLASS(block_rq,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq),

        TP_STRUCT__entry(
                __field(  dev_t,        dev                        )
                __field(  sector_t,        sector                        )
                __field(  unsigned int,        nr_sector                )
                __field(  unsigned int,        bytes                        )
                __array(  char,                rwbs,        RWBS_LEN        )
                __array(  char,         comm,   TASK_COMM_LEN   )
                __dynamic_array( char,        cmd,        1                )
        ),

        TP_fast_assign(
                __entry->dev           = rq->q->disk ? disk_devt(rq->q->disk) : 0;
                __entry->sector    = blk_rq_trace_sector(rq);
                __entry->nr_sector = blk_rq_trace_nr_sectors(rq);
                __entry->bytes     = blk_rq_bytes(rq);

                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
                __get_str(cmd)[0] = '\0';
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->rwbs, __entry->bytes, __get_str(cmd),
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->comm)
);

/**
 * block_rq_insert - insert block operation request into queue
 * @rq: block IO operation request
 *
 * Called immediately before block operation request @rq is inserted
 * into its request queue.  The fields in the operation request @rq struct
 * can be examined to determine which device and sectors the pending
 * operation would access.
 *
 * Instance of the block_rq event class.
 */
DEFINE_EVENT(block_rq, block_rq_insert,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq)
);

/**
 * block_rq_issue - issue pending block IO request operation to device driver
 * @rq: block IO operation request
 *
 * Called when block operation request @rq from its request queue is sent
 * to a device driver for processing.
 *
 * Instance of the block_rq event class.
 */
DEFINE_EVENT(block_rq, block_rq_issue,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq)
);

/**
 * block_rq_merge - merge request with another one in the elevator
 * @rq: block IO operation request
 *
 * Called when block operation request @rq from its request queue is merged
 * to another request queued in the elevator.
 *
 * Instance of the block_rq event class.
 */
DEFINE_EVENT(block_rq, block_rq_merge,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq)
);

/**
 * block_io_start - insert a request for execution
 * @rq: block IO operation request
 *
 * Called when block operation request @rq is queued for execution.
 *
 * Instance of the block_rq event class.
 */
DEFINE_EVENT(block_rq, block_io_start,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq)
);

/**
 * block_io_done - block IO operation request completed
 * @rq: block IO operation request
 *
 * Called when block operation request @rq is completed.
 *
 * Instance of the block_rq event class.
 */
DEFINE_EVENT(block_rq, block_io_done,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq)
);

/**
 * block_bio_complete - completed all work on the block operation
 * @q: queue holding the block operation
 * @bio: block operation completed
 *
 * This tracepoint indicates there is no further work to do on this
 * block IO operation @bio.
 *
 * Only @bio is read when the event is recorded; @q is part of the
 * prototype but is not used by the assignment below.  Recorded: device of
 * @bio, its current start sector and sector count, the errno translation
 * of @bio->bi_status, and the rwbs string derived from @bio->bi_opf.
 */
TRACE_EVENT(block_bio_complete,

        TP_PROTO(struct request_queue *q, struct bio *bio),

        TP_ARGS(q, bio),

        TP_STRUCT__entry(
                __field( dev_t,                dev                )
                __field( sector_t,        sector                )
                __field( unsigned,        nr_sector        )
                __field( int,                error                )
                __array( char,                rwbs,        RWBS_LEN)
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->nr_sector        = bio_sectors(bio);
                __entry->error                = blk_status_to_errno(bio->bi_status);
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
        ),

        TP_printk("%d,%d %s %llu + %u [%d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->error)
);

/*
 * Event class shared by the bio tracepoints below (block_bio_bounce,
 * block_bio_backmerge, block_bio_frontmerge, block_bio_queue, block_getrq).
 *
 * Records the device of the bio, its current start sector and sector
 * count, the rwbs string derived from bio->bi_opf, and the comm of the
 * task that is current when the event fires.
 *
 * The assignment dereferences @bio unconditionally.
 */
DECLARE_EVENT_CLASS(block_bio,

        TP_PROTO(struct bio *bio),

        TP_ARGS(bio),

        TP_STRUCT__entry(
                __field( dev_t,                dev                        )
                __field( sector_t,        sector                        )
                __field( unsigned int,        nr_sector                )
                __array( char,                rwbs,        RWBS_LEN        )
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->nr_sector        = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %llu + %u [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->comm)
);

/**
 * block_bio_bounce - used bounce buffer when processing block operation
 * @bio: block operation
 *
 * A bounce buffer was used to handle the block operation @bio.
 * This occurs when hardware limitations prevent a direct transfer of
 * data between the @bio data memory area and the IO device.  Use of a
 * bounce buffer requires extra copying of data and decreases
 * performance.
 *
 * Instance of the block_bio event class.
 */
DEFINE_EVENT(block_bio, block_bio_bounce,
        TP_PROTO(struct bio *bio),
        TP_ARGS(bio)
);

/**
 * block_bio_backmerge - merging block operation to the end of an existing operation
 * @bio: new block operation to merge
 *
 * Merging block request @bio to the end of an existing block request.
 *
 * Instance of the block_bio event class.
 */
DEFINE_EVENT(block_bio, block_bio_backmerge,
        TP_PROTO(struct bio *bio),
        TP_ARGS(bio)
);

/**
 * block_bio_frontmerge - merging block operation to the beginning of an existing operation
 * @bio: new block operation to merge
 *
 * Merging block IO operation @bio to the beginning of an existing block request.
 *
 * Instance of the block_bio event class.
 */
DEFINE_EVENT(block_bio, block_bio_frontmerge,
        TP_PROTO(struct bio *bio),
        TP_ARGS(bio)
);

/**
 * block_bio_queue - putting new block IO operation in queue
 * @bio: new block operation
 *
 * About to place the block IO operation @bio into a request queue.
 *
 * Instance of the block_bio event class.
 */
DEFINE_EVENT(block_bio, block_bio_queue,
        TP_PROTO(struct bio *bio),
        TP_ARGS(bio)
);

/**
 * block_getrq - get a free request entry in queue for block IO operations
 * @bio: pending block IO operation (can be %NULL)
 *
 * A request struct has been allocated to handle the block IO operation @bio.
 *
 * Instance of the block_bio event class.
 *
 * NOTE(review): the block_bio class reads @bio->bi_iter and @bio->bi_opf
 * without a NULL check, which does not match the "can be %NULL" statement
 * above - confirm against the callers whether a NULL @bio can reach this
 * tracepoint.
 */
DEFINE_EVENT(block_bio, block_getrq,
        TP_PROTO(struct bio *bio),
        TP_ARGS(bio)
);

/**
 * block_plug - keep operations requests in request queue
 * @q: request queue to plug
 *
 * Plug the request queue @q.  Do not allow block operation requests
 * to be sent to the device driver. Instead, accumulate requests in
 * the queue to improve throughput performance of the block device.
 *
 * Only the comm of the current task is recorded; @q is part of the
 * prototype but is not used by the assignment below.
 */
TRACE_EVENT(block_plug,

        TP_PROTO(struct request_queue *q),

        TP_ARGS(q),

        TP_STRUCT__entry(
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("[%s]", __entry->comm)
);

/*
 * Event class for unplug events.
 *
 * Records @depth (stored in the int field nr_rq) and the comm of the
 * current task.  @q and @explicit are part of the prototype but are not
 * used by the assignment below.  Output format: "[comm] nr_rq".
 */
DECLARE_EVENT_CLASS(block_unplug,

        TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit),

        TP_ARGS(q, depth, explicit),

        TP_STRUCT__entry(
                __field( int,                nr_rq                        )
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->nr_rq = depth;
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
);

/**
 * block_unplug - release of operations requests in request queue
 * @q: request queue to unplug
 * @depth: number of requests just added to the queue
 * @explicit: whether this was an explicit unplug, or one from schedule()
 *
 * Unplug request queue @q because device driver is scheduled to work
 * on elements in the request queue.
 *
 * Instance of the block_unplug event class; of the three arguments only
 * @depth is stored in the trace record.
 */
DEFINE_EVENT(block_unplug, block_unplug,

        TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit),

        TP_ARGS(q, depth, explicit)
);

/**
 * block_split - split a single bio struct into two bio structs
 * @bio: block operation being split
 * @new_sector: The starting sector for the new bio
 *
 * The bio request @bio needs to be split into two bio requests.  The newly
 * created @bio request starts at @new_sector. This split may be required due to
 * hardware limitations such as operation crossing device boundaries in a RAID
 * system.
 *
 * Recorded: device and current start sector of @bio, @new_sector, the rwbs
 * string derived from @bio->bi_opf, and the comm of the current task.
 * Output format: "major,minor rwbs sector / new_sector [comm]".
 *
 * NOTE(review): @new_sector is declared unsigned int in the prototype but
 * is stored in a sector_t field; if sector_t is wider than unsigned int,
 * a start sector that does not fit would already be truncated at the call
 * site - confirm against the callers and registered probes before
 * changing the prototype.
 */
TRACE_EVENT(block_split,

        TP_PROTO(struct bio *bio, unsigned int new_sector),

        TP_ARGS(bio, new_sector),

        TP_STRUCT__entry(
                __field( dev_t,                dev                                )
                __field( sector_t,        sector                                )
                __field( sector_t,        new_sector                        )
                __array( char,                rwbs,                RWBS_LEN        )
                __array( char,                comm,                TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->new_sector        = new_sector;
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %llu / %llu [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  (unsigned long long)__entry->new_sector,
                  __entry->comm)
);

/**
 * block_bio_remap - map request for a logical device to the raw device
 * @bio: revised operation
 * @dev: original device for the operation
 * @from: original sector for the operation
 *
 * An operation for a logical device has been mapped to the
 * raw block device.
 *
 * The event records the current device, starting sector and sector count
 * of @bio together with the original device (@dev) and sector (@from).
 * Output format:
 * "major,minor rwbs sector + nr_sector <- (old_major,old_minor) old_sector".
 */
TRACE_EVENT(block_bio_remap,

        TP_PROTO(struct bio *bio, dev_t dev, sector_t from),

        TP_ARGS(bio, dev, from),

        TP_STRUCT__entry(
                __field( dev_t,                dev                )
                __field( sector_t,        sector                )
                __field( unsigned int,        nr_sector        )
                __field( dev_t,                old_dev                )
                __field( sector_t,        old_sector        )
                __array( char,                rwbs,        RWBS_LEN)
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->nr_sector        = bio_sectors(bio);
                __entry->old_dev        = dev;
                __entry->old_sector        = from;
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
        ),

        TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector,
                  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
                  (unsigned long long)__entry->old_sector)
);

/**
 * block_rq_remap - map request for a block operation request
 * @rq: block IO operation request
 * @dev: device for the operation
 * @from: original sector for the operation
 *
 * The block operation request @rq has been remapped.  The block
 * operation request @rq holds the current information and @from holds
 * the original sector.
 *
 * The event records the device of the request's queue disk, the current
 * position and sector count of @rq, the device passed as @dev, the
 * original sector @from and the number of bios in @rq.  Output format:
 * "major,minor rwbs sector + nr_sector <- (old_major,old_minor) old_sector nr_bios".
 */
TRACE_EVENT(block_rq_remap,

        TP_PROTO(struct request *rq, dev_t dev, sector_t from),

        TP_ARGS(rq, dev, from),

        TP_STRUCT__entry(
                __field( dev_t,                dev                )
                __field( sector_t,        sector                )
                __field( unsigned int,        nr_sector        )
                __field( dev_t,                old_dev                )
                __field( sector_t,        old_sector        )
                __field( unsigned int,        nr_bios                )
                __array( char,                rwbs,        RWBS_LEN)
        ),

        TP_fast_assign(
                __entry->dev                = disk_devt(rq->q->disk);
                __entry->sector                = blk_rq_pos(rq);
                __entry->nr_sector        = blk_rq_sectors(rq);
                __entry->old_dev        = dev;
                __entry->old_sector        = from;
                __entry->nr_bios        = blk_rq_count_bios(rq);
                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
        ),

        TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector,
                  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
                  (unsigned long long)__entry->old_sector, __entry->nr_bios)
);

#endif /* _TRACE_BLOCK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>





















































































































































































































    1 


    1 
    1 
    1 

    1 





































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
// SPDX-License-Identifier: GPL-2.0
/*
 *        linux/mm/mlock.c
 *
 *  (C) Copyright 1995 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 */

#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/sched/user.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/pagewalk.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/secretmem.h>

#include "internal.h"

/*
 * Per-CPU batch of folios queued for mlock/munlock processing, paired with
 * the local lock that the local-CPU paths in this file (mlock_folio(),
 * mlock_new_folio(), munlock_folio(), mlock_drain_local()) take around
 * their accesses to the batch.
 */
struct mlock_fbatch {
        local_lock_t lock;
        struct folio_batch fbatch;
};

/* One batch per CPU; only the lock needs an explicit initializer. */
static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = {
        .lock = INIT_LOCAL_LOCK(lock),
};

/*
 * can_do_mlock - may the current task lock memory at all?
 *
 * Returns true when RLIMIT_MEMLOCK is non-zero or, failing that, when the
 * caller has CAP_IPC_LOCK; returns false otherwise.
 */
bool can_do_mlock(void)
{
        return rlimit(RLIMIT_MEMLOCK) != 0 || capable(CAP_IPC_LOCK);
}
EXPORT_SYMBOL(can_do_mlock);

/*
 * Mlocked folios are marked with the PG_mlocked flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked folio [folio_test_mlocked(folio)] is unevictable.  As such, it
 * will be ostensibly placed on the LRU "unevictable" list (actually no such
 * list exists), rather than the [in]active lists. PG_unevictable is set to
 * indicate the unevictable state.
 */

/*
 * __mlock_folio - process one queued mlock request for a folio that was
 * on (or temporarily off) the LRU.
 * @folio:  folio taken from the mlock batch
 * @lruvec: lruvec currently locked by the caller, or NULL if none
 *
 * Returns the lruvec that is locked on exit (unchanged when the folio
 * could not be isolated from the LRU), so the caller can keep using it
 * for the next folio and unlock it at the end.
 */
static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec)
{
        /* There is nothing more we can do while it's off LRU */
        if (!folio_test_clear_lru(folio))
                return lruvec;

        /* Switch to (and lock) the lruvec that holds this folio */
        lruvec = folio_lruvec_relock_irq(folio, lruvec);

        if (unlikely(folio_evictable(folio))) {
                /*
                 * This is a little surprising, but quite possible: PG_mlocked
                 * must have got cleared already by another CPU.  Could this
                 * folio be unevictable?  I'm not sure, but move it now if so.
                 */
                if (folio_test_unevictable(folio)) {
                        lruvec_del_folio(lruvec, folio);
                        folio_clear_unevictable(folio);
                        lruvec_add_folio(lruvec, folio);

                        __count_vm_events(UNEVICTABLE_PGRESCUED,
                                          folio_nr_pages(folio));
                }
                goto out;
        }

        /* Already unevictable: just account one more mlock if still mlocked */
        if (folio_test_unevictable(folio)) {
                if (folio_test_mlocked(folio))
                        folio->mlock_count++;
                goto out;
        }

        /*
         * First time this folio becomes unevictable: re-add it to the lruvec
         * with PG_unevictable set and start mlock_count at 1 if it is still
         * mlocked, 0 otherwise.
         */
        lruvec_del_folio(lruvec, folio);
        folio_clear_active(folio);
        folio_set_unevictable(folio);
        folio->mlock_count = !!folio_test_mlocked(folio);
        lruvec_add_folio(lruvec, folio);
        __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
out:
        /* Restore the LRU flag cleared by folio_test_clear_lru() above */
        folio_set_lru(folio);
        return lruvec;
}

/*
 * __mlock_new_folio - process one queued mlock request for a folio that
 * has not been put on the LRU yet.
 * @folio:  folio taken from the mlock batch; must not have the LRU flag set
 * @lruvec: lruvec currently locked by the caller, or NULL if none
 *
 * Adds the folio to its lruvec, marking it unevictable first unless it
 * has turned out to be evictable.  Returns the lruvec locked on exit.
 */
static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec)
{
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

        lruvec = folio_lruvec_relock_irq(folio, lruvec);

        /*
         * An evictable folio here is a little surprising, but possible;
         * in that case it goes onto the LRU without the unevictable state.
         */
        if (likely(!folio_evictable(folio))) {
                folio_set_unevictable(folio);
                folio->mlock_count = !!folio_test_mlocked(folio);
                __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
        }

        lruvec_add_folio(lruvec, folio);
        folio_set_lru(folio);
        return lruvec;
}

/*
 * __munlock_folio - process one queued munlock request.
 * @folio:  folio taken from the mlock batch
 * @lruvec: lruvec currently locked by the caller, or NULL if none
 *
 * If the folio can be isolated from the LRU and is unevictable, its
 * mlock_count is decremented and nothing else happens while the count
 * stays non-zero.  Otherwise PG_mlocked is cleared, statistics are
 * updated and, when isolated, an unevictable folio that has become
 * evictable is re-added to the lruvec without PG_unevictable.
 *
 * Returns the lruvec locked on exit (unchanged if the folio was off LRU).
 */
static struct lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec)
{
        int nr_pages = folio_nr_pages(folio);
        bool isolated = false;

        /* Off LRU: skip the mlock_count handling and go clear PG_mlocked */
        if (!folio_test_clear_lru(folio))
                goto munlock;

        isolated = true;
        lruvec = folio_lruvec_relock_irq(folio, lruvec);

        if (folio_test_unevictable(folio)) {
                /* Then mlock_count is maintained, but might undercount */
                if (folio->mlock_count)
                        folio->mlock_count--;
                if (folio->mlock_count)
                        goto out;
        }
        /* else assume that was the last mlock: reclaim will fix it if not */

munlock:
        if (folio_test_clear_mlocked(folio)) {
                __zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
                /*
                 * Counted as stranded when the folio could not be isolated
                 * and still carries PG_unevictable.
                 */
                if (isolated || !folio_test_unevictable(folio))
                        __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
                else
                        __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
        }

        /* folio_evictable() has to be checked *after* clearing Mlocked */
        if (isolated && folio_test_unevictable(folio) && folio_evictable(folio)) {
                lruvec_del_folio(lruvec, folio);
                folio_clear_unevictable(folio);
                lruvec_add_folio(lruvec, folio);
                __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
        }
out:
        /* Only restore the LRU flag if this function cleared it */
        if (isolated)
                folio_set_lru(folio);
        return lruvec;
}

/*
 * Tags carried in the low bits of a struct folio pointer while it sits on
 * the mlock_fbatch.  An entry without a tag is a munlock request.
 */
#define LRU_FOLIO 0x1
#define NEW_FOLIO 0x2

/* Tag @folio as an mlock request for a folio already on (or off) LRU. */
static inline struct folio *mlock_lru(struct folio *folio)
{
        unsigned long tagged = (unsigned long)folio;

        tagged += LRU_FOLIO;
        return (struct folio *)tagged;
}

/* Tag @folio as an mlock request for a folio not yet added to LRU. */
static inline struct folio *mlock_new(struct folio *folio)
{
        unsigned long tagged = (unsigned long)folio;

        tagged += NEW_FOLIO;
        return (struct folio *)tagged;
}

/*
 * mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that
 * could use such folio pointer flags too one day, but for now they are kept
 * for mlock only.  Three separate folio batches would also work, but a
 * single one feels better (munlocking a full folio batch does not need to
 * drain mlocking folio batches first).
 *
 * Each entry is dispatched on its low-bit tag: LRU_FOLIO to
 * __mlock_folio(), NEW_FOLIO to __mlock_new_folio(), and an untagged
 * entry to __munlock_folio().
 */
static void mlock_folio_batch(struct folio_batch *fbatch)
{
        const unsigned long tag_mask = LRU_FOLIO | NEW_FOLIO;
        struct lruvec *lruvec = NULL;
        int idx;

        for (idx = 0; idx < folio_batch_count(fbatch); idx++) {
                unsigned long entry = (unsigned long)fbatch->folios[idx];
                unsigned long tag = entry & tag_mask;
                struct folio *folio = (struct folio *)(entry & ~tag_mask);

                /* Store the untagged pointer back for folios_put() below */
                fbatch->folios[idx] = folio;

                if (tag & LRU_FOLIO)
                        lruvec = __mlock_folio(folio, lruvec);
                else if (tag & NEW_FOLIO)
                        lruvec = __mlock_new_folio(folio, lruvec);
                else
                        lruvec = __munlock_folio(folio, lruvec);
        }

        /* The helpers hand back whichever lruvec is still locked, if any */
        if (lruvec)
                unlock_page_lruvec_irq(lruvec);
        folios_put(fbatch);
}

void mlock_drain_local(void)
{
        struct folio_batch *fbatch;

        local_lock(&mlock_fbatch.lock);
        fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
        if (folio_batch_count(fbatch))
                mlock_folio_batch(fbatch);
        local_unlock(&mlock_fbatch.lock);
}

/*
 * mlock_drain_remote - process the pending mlock batch of another CPU.
 * @cpu: CPU whose batch is drained; a warning is issued once if that CPU
 *       is online.
 *
 * No local lock is taken here.
 */
void mlock_drain_remote(int cpu)
{
        struct folio_batch *remote_batch;

        WARN_ON_ONCE(cpu_online(cpu));
        remote_batch = &per_cpu(mlock_fbatch.fbatch, cpu);
        if (folio_batch_count(remote_batch) != 0)
                mlock_folio_batch(remote_batch);
}

/* Report whether @cpu's mlock batch currently holds any folios. */
bool need_mlock_drain(int cpu)
{
        struct folio_batch *batch = &per_cpu(mlock_fbatch.fbatch, cpu);

        return folio_batch_count(batch) != 0;
}

/**
 * mlock_folio - mlock a folio already on (or temporarily off) LRU
 * @folio: folio to be mlocked.
 *
 * Sets the mlocked flag, accounts the folio if the flag was not already
 * set, and queues it (tagged LRU_FOLIO) on this CPU's mlock batch.  The
 * batch is processed immediately when it is full, when @folio is large,
 * or when the LRU cache is disabled.
 */
void mlock_folio(struct folio *folio)
{
        struct folio_batch *fbatch;
        bool batch_full;

        local_lock(&mlock_fbatch.lock);
        fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);

        /* Account the folio only if the mlocked flag was not already set */
        if (!folio_test_set_mlocked(folio)) {
                int nr = folio_nr_pages(folio);

                zone_stat_mod_folio(folio, NR_MLOCK, nr);
                __count_vm_events(UNEVICTABLE_PGMLOCKED, nr);
        }

        /* Reference for the batch entry; dropped in mlock_folio_batch() */
        folio_get(folio);
        batch_full = !folio_batch_add(fbatch, mlock_lru(folio));
        if (batch_full || folio_test_large(folio) || lru_cache_disabled())
                mlock_folio_batch(fbatch);

        local_unlock(&mlock_fbatch.lock);
}

/**
 * mlock_new_folio - mlock a newly allocated folio not yet on LRU
 * @folio: folio to be mlocked, either normal or a THP head.
 *
 * Unconditionally sets the mlocked flag and accounts the folio, then
 * queues it (tagged NEW_FOLIO) on this CPU's mlock batch.  The batch is
 * processed immediately when it is full, when @folio is large, or when
 * the LRU cache is disabled.
 */
void mlock_new_folio(struct folio *folio)
{
        int nr = folio_nr_pages(folio);
        struct folio_batch *fbatch;
        bool batch_full;

        local_lock(&mlock_fbatch.lock);
        fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);

        folio_set_mlocked(folio);
        zone_stat_mod_folio(folio, NR_MLOCK, nr);
        __count_vm_events(UNEVICTABLE_PGMLOCKED, nr);

        /* Reference for the batch entry; dropped in mlock_folio_batch() */
        folio_get(folio);
        batch_full = !folio_batch_add(fbatch, mlock_new(folio));
        if (batch_full || folio_test_large(folio) || lru_cache_disabled())
                mlock_folio_batch(fbatch);

        local_unlock(&mlock_fbatch.lock);
}

/**
 * munlock_folio - munlock a folio
 * @folio: folio to be munlocked, either normal or a THP head.
 *
 * Queues @folio untagged on this CPU's mlock batch.  The batch is
 * processed immediately when it is full, when @folio is large, or when
 * the LRU cache is disabled.
 */
void munlock_folio(struct folio *folio)
{
        struct folio_batch *fbatch;
        bool batch_full;

        local_lock(&mlock_fbatch.lock);
        fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);

        /*
         * Clearing the mlocked flag is deliberately left to
         * __munlock_folio(), which checks whether the folio is multiply
         * mlocked before doing so.
         */
        folio_get(folio);
        batch_full = !folio_batch_add(fbatch, folio);
        if (batch_full || folio_test_large(folio) || lru_cache_disabled())
                mlock_folio_batch(fbatch);

        local_unlock(&mlock_fbatch.lock);
}

/*
 * folio_mlock_step - count consecutive PTEs, starting at @pte, that are
 * present and map a pfn inside @folio.
 * @folio: folio mapped by the PTE at @pte
 * @pte:   first PTE to examine
 * @addr:  virtual address corresponding to @pte
 * @end:   end of the range being walked
 *
 * Returns 1 for a small folio.  For a large folio the scan is bounded by
 * the pages left in the folio from the first PTE's pfn and by the pages
 * left in [@addr, @end).
 */
static inline unsigned int folio_mlock_step(struct folio *folio,
                pte_t *pte, unsigned long addr, unsigned long end)
{
        unsigned int nr = folio_nr_pages(folio);
        unsigned long first_pfn = folio_pfn(folio);
        pte_t first = ptep_get(pte);
        unsigned int limit, step;

        if (!folio_test_large(folio))
                return 1;

        limit = first_pfn + nr - pte_pfn(first);
        limit = min_t(unsigned int, limit, (end - addr) >> PAGE_SHIFT);

        for (step = 0; step < limit; step++) {
                pte_t entry = ptep_get(pte + step);

                /* Stop at the first PTE that is absent or outside the folio */
                if (!pte_present(entry) || pte_pfn(entry) - first_pfn >= nr)
                        break;
        }

        return step;
}

/*
 * allow_mlock_munlock - decide whether @folio should be (m)unlocked now.
 * @folio: folio found in the page table walk
 * @vma:   vma being walked
 * @start: start of the walked range
 * @end:   end of the walked range
 * @step:  number of PTEs found to map @folio consecutively
 *
 * Munlock (vma without VM_LOCKED) is always allowed, even for a large
 * folio only partially mapped to the vma: the folio may have been mlocked
 * before the vma was split.  Under memory pressure such a large folio can
 * be split, and the pages outside VM_LOCKED vmas can then be reclaimed.
 *
 * Mlock is allowed for any small folio (folio_within_range() cannot take
 * KSM), and for a large folio only when it lies within [@start, @end) and
 * is fully mapped, i.e. @step equals its page count.
 */
static inline bool allow_mlock_munlock(struct folio *folio,
                struct vm_area_struct *vma, unsigned long start,
                unsigned long end, unsigned int step)
{
        if (!(vma->vm_flags & VM_LOCKED) || !folio_test_large(folio))
                return true;

        return folio_within_range(folio, vma, start, end) &&
               step == folio_nr_pages(folio);
}

/*
 * mlock_pte_range - pmd_entry callback of the mlock page table walk.
 * @pmd:  PMD entry covering [@addr, @end)
 * @addr: start of the range to process
 * @end:  end of the range to process
 * @walk: walk state; walk->vma is the vma being (m)unlocked
 *
 * Calls mlock_folio() or munlock_folio() on the folios mapped in the
 * range, depending on whether the vma has VM_LOCKED set.  Always returns
 * 0; sets walk->action to ACTION_AGAIN when the PTE table could not be
 * mapped and locked.
 */
static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
                           unsigned long end, struct mm_walk *walk)

{
        struct vm_area_struct *vma = walk->vma;
        spinlock_t *ptl;
        pte_t *start_pte, *pte;
        pte_t ptent;
        struct folio *folio;
        unsigned int step = 1;
        unsigned long start = addr;

        /* A non-NULL ptl means the PMD is handled here as a single entry */
        ptl = pmd_trans_huge_lock(pmd, vma);
        if (ptl) {
                if (!pmd_present(*pmd))
                        goto out;
                if (is_huge_zero_pmd(*pmd))
                        goto out;
                folio = pmd_folio(*pmd);
                if (vma->vm_flags & VM_LOCKED)
                        mlock_folio(folio);
                else
                        munlock_folio(folio);
                goto out;
        }

        start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (!start_pte) {
                /* Ask the walker to retry this PMD */
                walk->action = ACTION_AGAIN;
                return 0;
        }

        for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
                ptent = ptep_get(pte);
                if (!pte_present(ptent))
                        continue;
                folio = vm_normal_folio(vma, addr, ptent);
                if (!folio || folio_is_zone_device(folio))
                        continue;

                /* Number of consecutive PTEs that map this folio */
                step = folio_mlock_step(folio, pte, addr, end);
                if (!allow_mlock_munlock(folio, vma, start, end, step))
                        goto next_entry;

                if (vma->vm_flags & VM_LOCKED)
                        mlock_folio(folio);
                else
                        munlock_folio(folio);

next_entry:
                /*
                 * Skip the rest of the folio's PTEs; the loop increment
                 * accounts for the remaining one.
                 */
                pte += step - 1;
                addr += (step - 1) << PAGE_SHIFT;
        }
        pte_unmap(start_pte);
out:
        spin_unlock(ptl);
        cond_resched();
        return 0;
}

/*
 * mlock_vma_pages_range() - mlock any pages already in the range,
 *                           or munlock all pages in the range.
 * @vma: vma containing range to be mlock()ed or munlock()ed
 * @start: start address in @vma of the range
 * @end: end of range in @vma
 * @newflags: the new set of flags for @vma.
 *
 * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED;
 * called for munlock() and munlockall(), to clear VM_LOCKED from @vma.
 *
 * The new flags are written to @vma before the page table walk, so the
 * walk callback sees the updated VM_LOCKED state.
 */
static void mlock_vma_pages_range(struct vm_area_struct *vma,
        unsigned long start, unsigned long end, vm_flags_t newflags)
{
        static const struct mm_walk_ops mlock_walk_ops = {
                .pmd_entry = mlock_pte_range,
                .walk_lock = PGWALK_WRLOCK_VERIFY,
        };

        /*
         * There is a slight chance that concurrent page migration,
         * or page reclaim finding a page of this now-VM_LOCKED vma,
         * will call mlock_vma_folio() and raise page's mlock_count:
         * double counting, leaving the page unevictable indefinitely.
         * Communicate this danger to mlock_vma_folio() with VM_IO,
         * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas.
         * mmap_lock is held in write mode here, so this weird
         * combination should not be visible to other mmap_lock users;
         * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED.
         */
        if (newflags & VM_LOCKED)
                newflags |= VM_IO;
        vma_start_write(vma);
        vm_flags_reset_once(vma, newflags);

        lru_add_drain();
        walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL);
        lru_add_drain();

        /* Drop the temporary VM_IO marker now that the walk is done */
        if (newflags & VM_IO) {
                newflags &= ~VM_IO;
                vm_flags_reset_once(vma, newflags);
        }
}

/*
 * mlock_fixup  - handle mlock[all]/munlock[all] requests.
 * @vmi: vma iterator positioned for @vma
 * @vma: vma to update
 * @prev: in/out: previous vma; set to the resulting vma on return
 * @start: start of the range within @vma
 * @end: end of the range within @vma
 * @newflags: flags @vma should have for [@start, @end)
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op.  However, for some special vmas, we go ahead and
 * populate the ptes.
 *
 * For vmas that pass the filters, merge/split as appropriate.
 *
 * Returns 0 on success (including the filtered-out case) or the error
 * reported by vma_modify_flags().
 */
static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
               struct vm_area_struct **prev, unsigned long start,
               unsigned long end, vm_flags_t newflags)
{
        struct mm_struct *mm = vma->vm_mm;
        int nr_pages;
        int ret = 0;
        vm_flags_t oldflags = vma->vm_flags;

        if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
            is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
            vma_is_dax(vma) || vma_is_secretmem(vma))
                /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
                goto out;

        vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto out;
        }

        /*
         * Keep track of amount of locked VM.
         *
         * Unlocking subtracts the range, locking a not-yet-locked range
         * adds it, and re-locking an already locked range changes nothing.
         */
        nr_pages = (end - start) >> PAGE_SHIFT;
        if (!(newflags & VM_LOCKED))
                nr_pages = -nr_pages;
        else if (oldflags & VM_LOCKED)
                nr_pages = 0;
        mm->locked_vm += nr_pages;

        /*
         * vm_flags is protected by the mmap_lock held in write mode.
         * It's okay if try_to_unmap_one unmaps a page just after we
         * set VM_LOCKED, populate_vma_page_range will bring it back.
         */
        if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
                /* No work to do, and mlocking twice would be wrong */
                vma_start_write(vma);
                vm_flags_reset(vma, newflags);
        } else {
                mlock_vma_pages_range(vma, start, end, newflags);
        }
out:
        /* NOTE(review): on the IS_ERR() path this stores the error pointer in *prev */
        *prev = vma;
        return ret;
}

/*
 * apply_vma_lock_flags - set the lock flags of every vma in a range.
 * @start: page-aligned start address
 * @len:   page-aligned length
 * @flags: VM_LOCKED_MASK bits to install (0 to unlock)
 *
 * Walks the vmas covering [@start, @start + @len) and calls mlock_fixup()
 * on each with its VM_LOCKED_MASK bits replaced by @flags.
 *
 * Returns 0 on success or for an empty range, -EINVAL if the range wraps,
 * -ENOMEM if no vma is found at @start or the range is not fully covered
 * by contiguous vmas, or the error returned by mlock_fixup().
 */
static int apply_vma_lock_flags(unsigned long start, size_t len,
                                vm_flags_t flags)
{
        unsigned long nstart, end, tmp;
        struct vm_area_struct *vma, *prev;
        VMA_ITERATOR(vmi, current->mm, start);

        VM_BUG_ON(offset_in_page(start));
        VM_BUG_ON(len != PAGE_ALIGN(len));
        end = start + len;
        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;
        vma = vma_iter_load(&vmi);
        if (!vma)
                return -ENOMEM;

        /* If the range starts inside the first vma, that vma is its own prev */
        prev = vma_prev(&vmi);
        if (start > vma->vm_start)
                prev = vma;

        nstart = start;
        tmp = vma->vm_start;
        for_each_vma_range(vmi, vma, end) {
                int error;
                vm_flags_t newflags;

                /* A gap between the previous end and this vma: fail */
                if (vma->vm_start != tmp)
                        return -ENOMEM;

                newflags = vma->vm_flags & ~VM_LOCKED_MASK;
                newflags |= flags;
                /* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
                tmp = vma->vm_end;
                if (tmp > end)
                        tmp = end;
                error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
                if (error)
                        return error;
                /* Continue from where the iterator now ends */
                tmp = vma_iter_end(&vmi);
                nstart = tmp;
        }

        /* The vmas ran out before the end of the requested range */
        if (tmp < end)
                return -ENOMEM;

        return 0;
}

/*
 * Walk the vmas overlapping [start, start + len) and add up the size of
 * the parts that belong to VM_LOCKED vmas.
 * Note that the deferred locking case (mlock2(,,MLOCK_ONFAULT)) is
 * counted as well.
 * Return value: number of previously mlocked pages in the range
 */
static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
                unsigned long start, size_t len)
{
        struct vm_area_struct *vma;
        unsigned long bytes = 0;
        unsigned long end;
        VMA_ITERATOR(vmi, mm, start);

        /* Clamp the end of the range so it cannot wrap past ULONG_MAX */
        end = unlikely(ULONG_MAX - len < start) ? ULONG_MAX : start + len;

        for_each_vma_range(vmi, vma, end) {
                if (!(vma->vm_flags & VM_LOCKED))
                        continue;

                /* Leave out the part of this vma that lies below start */
                if (start > vma->vm_start)
                        bytes -= start - vma->vm_start;

                /* The range ends inside this vma: count up to end and stop */
                if (end < vma->vm_end) {
                        bytes += end - vma->vm_start;
                        break;
                }

                bytes += vma->vm_end - vma->vm_start;
        }

        return bytes >> PAGE_SHIFT;
}

/*
 * Translate a get_user_pages() style return value into the error code
 * POSIX specifies for mlock(): -EFAULT becomes -ENOMEM and -ENOMEM
 * becomes -EAGAIN; every other value is passed through unchanged.
 */
static int __mlock_posix_error_return(long retval)
{
        switch (retval) {
        case -EFAULT:
                return -ENOMEM;
        case -ENOMEM:
                return -EAGAIN;
        default:
                return retval;
        }
}

/*
 * do_mlock - common implementation of mlock() and mlock2().
 * @start: user address of the range (need not be page aligned)
 * @len:   length of the range in bytes
 * @flags: VM_LOCKED, optionally with VM_LOCKONFAULT
 *
 * Expands the range to whole pages, checks it against RLIMIT_MEMLOCK
 * (unless the caller has CAP_IPC_LOCK), applies @flags to the vmas in the
 * range and then populates the range via __mm_populate().
 *
 * Returns 0 on success, -EPERM if the caller may not mlock at all, -EINTR
 * if taking mmap_lock was interrupted, -ENOMEM if the limit would be
 * exceeded, the error from apply_vma_lock_flags(), or the populate error
 * translated by __mlock_posix_error_return().
 */
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
        unsigned long locked;
        unsigned long lock_limit;
        int error = -ENOMEM;

        start = untagged_addr(start);

        if (!can_do_mlock())
                return -EPERM;

        /* Extend the range to cover whole pages */
        len = PAGE_ALIGN(len + (offset_in_page(start)));
        start &= PAGE_MASK;

        /* Both the limit and the request are compared in pages */
        lock_limit = rlimit(RLIMIT_MEMLOCK);
        lock_limit >>= PAGE_SHIFT;
        locked = len >> PAGE_SHIFT;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;

        locked += current->mm->locked_vm;
        if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
                /*
                 * It is possible that the regions requested intersect with
                 * previously mlocked areas, that part area in "mm->locked_vm"
                 * should not be counted to new mlock increment count. So check
                 * and adjust locked count if necessary.
                 */
                locked -= count_mm_mlocked_page_nr(current->mm,
                                start, len);
        }

        /* check against resource limits */
        if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
                error = apply_vma_lock_flags(start, len, flags);

        mmap_write_unlock(current->mm);
        if (error)
                return error;

        /* Populate after mmap_lock has been released */
        error = __mm_populate(start, len, 0);
        if (error)
                return __mlock_posix_error_return(error);
        return 0;
}

/*
 * mlock(start, len): lock the range with VM_LOCKED; all the work and the
 * return value come from do_mlock().
 */
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
        return do_mlock(start, len, VM_LOCKED);
}

/*
 * mlock2(start, len, flags): like mlock(), with MLOCK_ONFAULT as the only
 * accepted flag.  Any other flag bit yields -EINVAL; MLOCK_ONFAULT adds
 * VM_LOCKONFAULT to the VM_LOCKED passed to do_mlock().
 */
SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
{
        vm_flags_t vm_flags;

        if (flags & ~MLOCK_ONFAULT)
                return -EINVAL;

        vm_flags = VM_LOCKED;
        if (flags & MLOCK_ONFAULT)
                vm_flags = VM_LOCKED | VM_LOCKONFAULT;

        return do_mlock(start, len, vm_flags);
}

/*
 * munlock(start, len): clear the lock flags of the vmas covering the
 * page-expanded range.  Returns -EINTR if taking mmap_lock was
 * interrupted, otherwise the result of apply_vma_lock_flags().
 */
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
        int err;

        start = untagged_addr(start);

        /* Extend the range to cover whole pages */
        len = PAGE_ALIGN(len + offset_in_page(start));
        start &= PAGE_MASK;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;

        err = apply_vma_lock_flags(start, len, 0);
        mmap_write_unlock(current->mm);
        return err;
}

/*
 * Translate the MCL_* flags given to mlockall() (or 0 when called from
 * munlockall()) into updates of mm->def_flags and/or of the flags of all
 * current VMAs.
 *
 * Two subtleties: repeated mlockall() calls with different flags do not
 * necessarily stack, and a call without MCL_FUTURE after one with it
 * clears VM_LOCKED and VM_LOCKONFAULT from mm->def_flags again.
 *
 * Always returns 0; errors from mlock_fixup() are ignored.
 */
static int apply_mlockall_flags(int flags)
{
        VMA_ITERATOR(vmi, current->mm, 0);
        struct vm_area_struct *vma, *prev = NULL;
        vm_flags_t to_add = 0;

        /* Start from a clean slate for future mappings */
        current->mm->def_flags &= ~VM_LOCKED_MASK;

        if (flags & MCL_FUTURE) {
                current->mm->def_flags |= VM_LOCKED;

                if (flags & MCL_ONFAULT)
                        current->mm->def_flags |= VM_LOCKONFAULT;

                /* Only future mappings requested: leave current VMAs alone */
                if (!(flags & MCL_CURRENT))
                        return 0;
        }

        if (flags & MCL_CURRENT) {
                to_add = VM_LOCKED;
                if (flags & MCL_ONFAULT)
                        to_add |= VM_LOCKONFAULT;
        }

        for_each_vma(vmi, vma) {
                vm_flags_t newflags = (vma->vm_flags & ~VM_LOCKED_MASK) | to_add;

                /* Ignore errors */
                mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
                            newflags);
                cond_resched();
        }

        return 0;
}

/*
 * mlockall(2) entry point.
 *
 * Returns:
 *   -EINVAL  @flags is 0, contains unknown bits, or is MCL_ONFAULT alone
 *   -EPERM   can_do_mlock() refused
 *   -EINTR   taking the mmap write lock was interrupted by a fatal signal
 *   -ENOMEM  MCL_CURRENT was requested, total_vm exceeds RLIMIT_MEMLOCK
 *            (in pages) and the caller lacks CAP_IPC_LOCK
 *   otherwise the result of apply_mlockall_flags()
 */
SYSCALL_DEFINE1(mlockall, int, flags)
{
        const int known_flags = MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT;
        struct mm_struct *mm = current->mm;
        unsigned long limit_pages;
        int ret = -ENOMEM;

        if (!flags || (flags & ~known_flags) || flags == MCL_ONFAULT)
                return -EINVAL;

        if (!can_do_mlock())
                return -EPERM;

        /* The rlimit is in bytes; total_vm below is compared in pages. */
        limit_pages = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        /* The limit only matters when existing mappings are to be locked. */
        if (!(flags & MCL_CURRENT) || (mm->total_vm <= limit_pages) ||
            capable(CAP_IPC_LOCK))
                ret = apply_mlockall_flags(flags);
        mmap_write_unlock(mm);

        if (ret)
                return ret;

        /* Populate only after the mmap lock has been dropped. */
        if (flags & MCL_CURRENT)
                mm_populate(0, TASK_SIZE);

        return 0;
}

/*
 * munlockall(2) entry point: run apply_mlockall_flags() with no flags under
 * the mmap write lock, which clears the lock bits from mm->def_flags and from
 * every VMA.  Returns -EINTR if taking the lock was interrupted by a fatal
 * signal.
 */
SYSCALL_DEFINE0(munlockall)
{
        struct mm_struct *mm = current->mm;
        int err;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        err = apply_mlockall_flags(0);
        mmap_write_unlock(mm);

        return err;
}

/*
 * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the user_struct instead.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);

/*
 * Charge @size bytes (rounded up to whole pages) of locked memory to
 * @ucounts.
 *
 * The charge is kept, and a reference on @ucounts taken, only if the new
 * total stays within RLIMIT_MEMLOCK (or the caller has CAP_IPC_LOCK) and
 * get_ucounts() succeeds; otherwise the charge is rolled back.
 *
 * Returns 1 if the charge was accepted, 0 if it was refused.
 */
int user_shm_lock(size_t size, struct ucounts *ucounts)
{
        unsigned long nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        unsigned long limit = rlimit(RLIMIT_MEMLOCK);
        long charged;
        int allowed = 0;

        /* Convert a finite byte limit to pages; leave "infinity" as is. */
        if (limit != RLIM_INFINITY)
                limit >>= PAGE_SHIFT;

        spin_lock(&shmlock_user_lock);
        charged = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, nr_pages);

        if ((charged == LONG_MAX || charged > limit) && !capable(CAP_IPC_LOCK))
                goto uncharge;
        if (!get_ucounts(ucounts))
                goto uncharge;

        allowed = 1;
        goto unlock;

uncharge:
        dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, nr_pages);
unlock:
        spin_unlock(&shmlock_user_lock);
        return allowed;
}

/*
 * Counterpart of user_shm_lock(): remove the page-rounded charge for @size
 * bytes from @ucounts and drop a reference on @ucounts.
 */
void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
        unsigned long nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

        spin_lock(&shmlock_user_lock);
        dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, nr_pages);
        spin_unlock(&shmlock_user_lock);

        /* The reference is released outside the spinlock. */
        put_ucounts(ucounts);
}
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 












































    2 


























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * security/tomoyo/common.h
 *
 * Header file for TOMOYO.
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#ifndef _SECURITY_TOMOYO_COMMON_H
#define _SECURITY_TOMOYO_COMMON_H

#define pr_fmt(fmt) fmt

#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/kmod.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/list.h>
#include <linux/cred.h>
#include <linux/poll.h>
#include <linux/binfmts.h>
#include <linux/highmem.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/un.h>
#include <linux/lsm_hooks.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/udp.h>

/********** Constants definitions. **********/

/*
 * TOMOYO uses this hash only when appending a string into the string
 * table. Frequency of appending strings is very low. So we don't need
 * large (e.g. 64k) hash size. 256 will be sufficient.
 */
#define TOMOYO_HASH_BITS  8
#define TOMOYO_MAX_HASH (1u<<TOMOYO_HASH_BITS)

/*
 * TOMOYO checks only SOCK_STREAM, SOCK_DGRAM, SOCK_RAW, SOCK_SEQPACKET.
 * Therefore, we don't need SOCK_MAX.
 */
#define TOMOYO_SOCK_MAX 6

#define TOMOYO_EXEC_TMPSIZE     4096

/* Garbage collector is trying to kfree() this element. */
#define TOMOYO_GC_IN_PROGRESS -1

/* Profile number is an integer between 0 and 255. */
#define TOMOYO_MAX_PROFILES 256

/* Group number is an integer between 0 and 255. */
#define TOMOYO_MAX_ACL_GROUPS 256

/* Index numbers for "struct tomoyo_condition". */
enum tomoyo_conditions_index {
        TOMOYO_TASK_UID,             /* current_uid()   */
        TOMOYO_TASK_EUID,            /* current_euid()  */
        TOMOYO_TASK_SUID,            /* current_suid()  */
        TOMOYO_TASK_FSUID,           /* current_fsuid() */
        TOMOYO_TASK_GID,             /* current_gid()   */
        TOMOYO_TASK_EGID,            /* current_egid()  */
        TOMOYO_TASK_SGID,            /* current_sgid()  */
        TOMOYO_TASK_FSGID,           /* current_fsgid() */
        TOMOYO_TASK_PID,             /* sys_getpid()   */
        TOMOYO_TASK_PPID,            /* sys_getppid()  */
        TOMOYO_EXEC_ARGC,            /* "struct linux_binprm *"->argc */
        TOMOYO_EXEC_ENVC,            /* "struct linux_binprm *"->envc */
        TOMOYO_TYPE_IS_SOCKET,       /* S_IFSOCK */
        TOMOYO_TYPE_IS_SYMLINK,      /* S_IFLNK */
        TOMOYO_TYPE_IS_FILE,         /* S_IFREG */
        TOMOYO_TYPE_IS_BLOCK_DEV,    /* S_IFBLK */
        TOMOYO_TYPE_IS_DIRECTORY,    /* S_IFDIR */
        TOMOYO_TYPE_IS_CHAR_DEV,     /* S_IFCHR */
        TOMOYO_TYPE_IS_FIFO,         /* S_IFIFO */
        TOMOYO_MODE_SETUID,          /* S_ISUID */
        TOMOYO_MODE_SETGID,          /* S_ISGID */
        TOMOYO_MODE_STICKY,          /* S_ISVTX */
        TOMOYO_MODE_OWNER_READ,      /* S_IRUSR */
        TOMOYO_MODE_OWNER_WRITE,     /* S_IWUSR */
        TOMOYO_MODE_OWNER_EXECUTE,   /* S_IXUSR */
        TOMOYO_MODE_GROUP_READ,      /* S_IRGRP */
        TOMOYO_MODE_GROUP_WRITE,     /* S_IWGRP */
        TOMOYO_MODE_GROUP_EXECUTE,   /* S_IXGRP */
        TOMOYO_MODE_OTHERS_READ,     /* S_IROTH */
        TOMOYO_MODE_OTHERS_WRITE,    /* S_IWOTH */
        TOMOYO_MODE_OTHERS_EXECUTE,  /* S_IXOTH */
        TOMOYO_EXEC_REALPATH,
        TOMOYO_SYMLINK_TARGET,
        TOMOYO_PATH1_UID,
        TOMOYO_PATH1_GID,
        TOMOYO_PATH1_INO,
        TOMOYO_PATH1_MAJOR,
        TOMOYO_PATH1_MINOR,
        TOMOYO_PATH1_PERM,
        TOMOYO_PATH1_TYPE,
        TOMOYO_PATH1_DEV_MAJOR,
        TOMOYO_PATH1_DEV_MINOR,
        TOMOYO_PATH2_UID,
        TOMOYO_PATH2_GID,
        TOMOYO_PATH2_INO,
        TOMOYO_PATH2_MAJOR,
        TOMOYO_PATH2_MINOR,
        TOMOYO_PATH2_PERM,
        TOMOYO_PATH2_TYPE,
        TOMOYO_PATH2_DEV_MAJOR,
        TOMOYO_PATH2_DEV_MINOR,
        TOMOYO_PATH1_PARENT_UID,
        TOMOYO_PATH1_PARENT_GID,
        TOMOYO_PATH1_PARENT_INO,
        TOMOYO_PATH1_PARENT_PERM,
        TOMOYO_PATH2_PARENT_UID,
        TOMOYO_PATH2_PARENT_GID,
        TOMOYO_PATH2_PARENT_INO,
        TOMOYO_PATH2_PARENT_PERM,
        TOMOYO_MAX_CONDITION_KEYWORD,
        TOMOYO_NUMBER_UNION,
        TOMOYO_NAME_UNION,
        TOMOYO_ARGV_ENTRY,
        TOMOYO_ENVP_ENTRY,
};


/*
 * Index numbers for stat().
 *
 * Used to size and index the "stat_valid" and "stat" arrays of
 * "struct tomoyo_obj_info".
 */
enum tomoyo_path_stat_index {
        /* Do not change this order. */
        TOMOYO_PATH1,
        TOMOYO_PATH1_PARENT,
        TOMOYO_PATH2,
        TOMOYO_PATH2_PARENT,
        TOMOYO_MAX_PATH_STAT /* Number of entries above (4). */
};

/*
 * Index numbers for operation mode.
 *
 * The first four entries are numbered 0..3 and TOMOYO_CONFIG_MAX_MODE (4) is
 * their count. The remaining entries carry explicit values; 64 and 128 are
 * single bits, presumably combined with a mode value - TODO confirm against
 * the users of these constants.
 */
enum tomoyo_mode_index {
        TOMOYO_CONFIG_DISABLED,
        TOMOYO_CONFIG_LEARNING,
        TOMOYO_CONFIG_PERMISSIVE,
        TOMOYO_CONFIG_ENFORCING,
        TOMOYO_CONFIG_MAX_MODE,
        TOMOYO_CONFIG_WANT_REJECT_LOG =  64,
        TOMOYO_CONFIG_WANT_GRANT_LOG  = 128,
        TOMOYO_CONFIG_USE_DEFAULT     = 255,
};

/*
 * Index numbers for entry type.
 *
 * Numbered implicitly from 0; TOMOYO_MAX_POLICY equals the number of entry
 * types listed before it.
 */
enum tomoyo_policy_id {
        TOMOYO_ID_GROUP,
        TOMOYO_ID_ADDRESS_GROUP,
        TOMOYO_ID_PATH_GROUP,
        TOMOYO_ID_NUMBER_GROUP,
        TOMOYO_ID_TRANSITION_CONTROL,
        TOMOYO_ID_AGGREGATOR,
        TOMOYO_ID_MANAGER,
        TOMOYO_ID_CONDITION,
        TOMOYO_ID_NAME,
        TOMOYO_ID_ACL,
        TOMOYO_ID_DOMAIN,
        TOMOYO_MAX_POLICY
};

/*
 * Index numbers for domain's attributes.
 *
 * Used to size and index the "flags" array of "struct tomoyo_domain_info".
 */
enum tomoyo_domain_info_flags_index {
        /* Quota warning flag.   */
        TOMOYO_DIF_QUOTA_WARNED,
        /*
         * This domain was unable to create a new domain at
         * tomoyo_find_next_domain() because the name of the domain to be
         * created was too long or it could not allocate memory.
         * More than one process continued execve() without domain transition.
         */
        TOMOYO_DIF_TRANSITION_FAILED,
        /* Number of flags above (2). */
        TOMOYO_MAX_DOMAIN_INFO_FLAGS
};

/*
 * Index numbers for audit type.
 *
 * One of these values is stored in "struct tomoyo_condition"->grant_log.
 */
enum tomoyo_grant_log {
        /* Follow profile's configuration. */
        TOMOYO_GRANTLOG_AUTO,
        /* Do not generate grant log. */
        TOMOYO_GRANTLOG_NO,
        /* Generate grant_log. */
        TOMOYO_GRANTLOG_YES,
};

/*
 * Index numbers for group entries.
 *
 * Note that the order here (path, number, address) differs from the order of
 * the TOMOYO_ID_*_GROUP entries in "enum tomoyo_policy_id" (address, path,
 * number); the two enums are not interchangeable.
 */
enum tomoyo_group_id {
        TOMOYO_PATH_GROUP,
        TOMOYO_NUMBER_GROUP,
        TOMOYO_ADDRESS_GROUP,
        TOMOYO_MAX_GROUP /* Number of group kinds above (3). */
};

/*
 * Index numbers for type of numeric values.
 *
 * One of these values is stored in each element of
 * "struct tomoyo_number_union"->value_type[]. TOMOYO_VALUE_TYPE_INVALID is 0.
 */
enum tomoyo_value_type {
        TOMOYO_VALUE_TYPE_INVALID,
        TOMOYO_VALUE_TYPE_DECIMAL,
        TOMOYO_VALUE_TYPE_OCTAL,
        TOMOYO_VALUE_TYPE_HEXADECIMAL,
};

/*
 * Index numbers for domain transition control keywords.
 *
 * The entries come in NO_xxx / xxx pairs, with each NO_xxx value immediately
 * before its positive counterpart.
 */
enum tomoyo_transition_type {
        /* Do not change this order. */
        TOMOYO_TRANSITION_CONTROL_NO_RESET,
        TOMOYO_TRANSITION_CONTROL_RESET,
        TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE,
        TOMOYO_TRANSITION_CONTROL_INITIALIZE,
        TOMOYO_TRANSITION_CONTROL_NO_KEEP,
        TOMOYO_TRANSITION_CONTROL_KEEP,
        TOMOYO_MAX_TRANSITION_TYPE /* Number of keywords above (6). */
};

/*
 * Index numbers for Access Controls.
 *
 * One of these values is stored in "struct tomoyo_acl_info"->type to tell
 * which kind of ACL entry the common header belongs to.
 */
enum tomoyo_acl_entry_type_index {
        TOMOYO_TYPE_PATH_ACL,
        TOMOYO_TYPE_PATH2_ACL,
        TOMOYO_TYPE_PATH_NUMBER_ACL,
        TOMOYO_TYPE_MKDEV_ACL,
        TOMOYO_TYPE_MOUNT_ACL,
        TOMOYO_TYPE_INET_ACL,
        TOMOYO_TYPE_UNIX_ACL,
        TOMOYO_TYPE_ENV_ACL,
        TOMOYO_TYPE_MANUAL_TASK_ACL,
};

/*
 * Index numbers for access controls with one pathname.
 *
 * One of these values is stored in
 * "struct tomoyo_request_info"->param.path.operation.
 */
enum tomoyo_path_acl_index {
        TOMOYO_TYPE_EXECUTE,
        TOMOYO_TYPE_READ,
        TOMOYO_TYPE_WRITE,
        TOMOYO_TYPE_APPEND,
        TOMOYO_TYPE_UNLINK,
        TOMOYO_TYPE_GETATTR,
        TOMOYO_TYPE_RMDIR,
        TOMOYO_TYPE_TRUNCATE,
        TOMOYO_TYPE_SYMLINK,
        TOMOYO_TYPE_CHROOT,
        TOMOYO_TYPE_UMOUNT,
        TOMOYO_MAX_PATH_OPERATION /* Number of operations above (11). */
};

/*
 * Index numbers for /sys/kernel/security/tomoyo/stat interface
 * (memory usage categories).
 */
enum tomoyo_memory_stat_type {
        TOMOYO_MEMORY_POLICY,
        TOMOYO_MEMORY_AUDIT,
        TOMOYO_MEMORY_QUERY,
        TOMOYO_MAX_MEMORY_STAT /* Number of categories above (3). */
};

/*
 * Index numbers for mkdev operations.
 *
 * One of these values is stored in
 * "struct tomoyo_request_info"->param.mkdev.operation.
 */
enum tomoyo_mkdev_acl_index {
        TOMOYO_TYPE_MKBLOCK,
        TOMOYO_TYPE_MKCHAR,
        TOMOYO_MAX_MKDEV_OPERATION /* Number of operations above (2). */
};

/*
 * Index numbers for socket operations.
 *
 * One of these values is stored in the "operation" member of both
 * "struct tomoyo_request_info"->param.inet_network and ->param.unix_network.
 */
enum tomoyo_network_acl_index {
        TOMOYO_NETWORK_BIND,    /* bind() operation. */
        TOMOYO_NETWORK_LISTEN,  /* listen() operation. */
        TOMOYO_NETWORK_CONNECT, /* connect() operation. */
        TOMOYO_NETWORK_SEND,    /* send() operation. */
        TOMOYO_MAX_NETWORK_OPERATION /* Number of operations above (4). */
};

/*
 * Index numbers for access controls with two pathnames.
 *
 * One of these values is stored in
 * "struct tomoyo_request_info"->param.path2.operation.
 */
enum tomoyo_path2_acl_index {
        TOMOYO_TYPE_LINK,
        TOMOYO_TYPE_RENAME,
        TOMOYO_TYPE_PIVOT_ROOT,
        TOMOYO_MAX_PATH2_OPERATION /* Number of operations above (3). */
};

/*
 * Index numbers for access controls with one pathname and one number.
 *
 * One of these values is stored in
 * "struct tomoyo_request_info"->param.path_number.operation.
 */
enum tomoyo_path_number_acl_index {
        TOMOYO_TYPE_CREATE,
        TOMOYO_TYPE_MKDIR,
        TOMOYO_TYPE_MKFIFO,
        TOMOYO_TYPE_MKSOCK,
        TOMOYO_TYPE_IOCTL,
        TOMOYO_TYPE_CHMOD,
        TOMOYO_TYPE_CHOWN,
        TOMOYO_TYPE_CHGRP,
        TOMOYO_MAX_PATH_NUMBER_OPERATION /* Number of operations above (8). */
};

/*
 * Index numbers for /sys/kernel/security/tomoyo/ interfaces.
 *
 * Numbered implicitly from 0. Unlike most enums in this header there is no
 * trailing MAX entry.
 */
enum tomoyo_securityfs_interface_index {
        TOMOYO_DOMAINPOLICY,
        TOMOYO_EXCEPTIONPOLICY,
        TOMOYO_PROCESS_STATUS,
        TOMOYO_STAT,
        TOMOYO_AUDIT,
        TOMOYO_VERSION,
        TOMOYO_PROFILE,
        TOMOYO_QUERY,
        TOMOYO_MANAGER
};

/*
 * Index numbers for special mount operations.
 *
 * The trailing comment on each entry shows the matching mount(8) invocation.
 */
enum tomoyo_special_mount {
        TOMOYO_MOUNT_BIND,            /* mount --bind /source /dest   */
        TOMOYO_MOUNT_MOVE,            /* mount --move /old /new       */
        TOMOYO_MOUNT_REMOUNT,         /* mount -o remount /dir        */
        TOMOYO_MOUNT_MAKE_UNBINDABLE, /* mount --make-unbindable /dir */
        TOMOYO_MOUNT_MAKE_PRIVATE,    /* mount --make-private /dir    */
        TOMOYO_MOUNT_MAKE_SLAVE,      /* mount --make-slave /dir      */
        TOMOYO_MOUNT_MAKE_SHARED,     /* mount --make-shared /dir     */
        TOMOYO_MAX_SPECIAL_MOUNT      /* Number of operations above (7). */
};

/*
 * Index numbers for functionality.
 *
 * Entries are grouped by name prefix: TOMOYO_MAC_FILE_*,
 * TOMOYO_MAC_NETWORK_* and TOMOYO_MAC_ENVIRON. These groups presumably
 * correspond to the file/network/misc categories of
 * "enum tomoyo_mac_category_index" - TODO confirm against the mapping table.
 */
enum tomoyo_mac_index {
        TOMOYO_MAC_FILE_EXECUTE,
        TOMOYO_MAC_FILE_OPEN,
        TOMOYO_MAC_FILE_CREATE,
        TOMOYO_MAC_FILE_UNLINK,
        TOMOYO_MAC_FILE_GETATTR,
        TOMOYO_MAC_FILE_MKDIR,
        TOMOYO_MAC_FILE_RMDIR,
        TOMOYO_MAC_FILE_MKFIFO,
        TOMOYO_MAC_FILE_MKSOCK,
        TOMOYO_MAC_FILE_TRUNCATE,
        TOMOYO_MAC_FILE_SYMLINK,
        TOMOYO_MAC_FILE_MKBLOCK,
        TOMOYO_MAC_FILE_MKCHAR,
        TOMOYO_MAC_FILE_LINK,
        TOMOYO_MAC_FILE_RENAME,
        TOMOYO_MAC_FILE_CHMOD,
        TOMOYO_MAC_FILE_CHOWN,
        TOMOYO_MAC_FILE_CHGRP,
        TOMOYO_MAC_FILE_IOCTL,
        TOMOYO_MAC_FILE_CHROOT,
        TOMOYO_MAC_FILE_MOUNT,
        TOMOYO_MAC_FILE_UMOUNT,
        TOMOYO_MAC_FILE_PIVOT_ROOT,
        TOMOYO_MAC_NETWORK_INET_STREAM_BIND,
        TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN,
        TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT,
        TOMOYO_MAC_NETWORK_INET_DGRAM_BIND,
        TOMOYO_MAC_NETWORK_INET_DGRAM_SEND,
        TOMOYO_MAC_NETWORK_INET_RAW_BIND,
        TOMOYO_MAC_NETWORK_INET_RAW_SEND,
        TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND,
        TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN,
        TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT,
        TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND,
        TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND,
        TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND,
        TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN,
        TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT,
        TOMOYO_MAC_ENVIRON,
        /* Number of functionality entries above. */
        TOMOYO_MAX_MAC_INDEX
};

/*
 * Index numbers for category of functionality.
 *
 * Numbered implicitly from 0; the order is therefore significant.
 */
enum tomoyo_mac_category_index {
        TOMOYO_MAC_CATEGORY_FILE,
        TOMOYO_MAC_CATEGORY_NETWORK,
        TOMOYO_MAC_CATEGORY_MISC,
        TOMOYO_MAX_MAC_CATEGORY_INDEX /* Number of categories above (3). */
};

/*
 * Retry this request. Returned by tomoyo_supervisor() if policy violation has
 * occurred in enforcing mode and the userspace daemon decided to retry.
 *
 * The value must be positive so that "retry" can be told apart from
 * "granted" (which is 0) and "rejected" (which is a negative value).
 */
#define TOMOYO_RETRY_REQUEST 1

/*
 * Index numbers for /sys/kernel/security/tomoyo/stat interface
 * (policy statistics).
 *
 * Entries 1..3 are kept numerically equal to the matching TOMOYO_CONFIG_*
 * values of "enum tomoyo_mode_index", as noted on each line; that is why the
 * order must not change.
 */
enum tomoyo_policy_stat_type {
        /* Do not change this order. */
        TOMOYO_STAT_POLICY_UPDATES,
        TOMOYO_STAT_POLICY_LEARNING,   /* == TOMOYO_CONFIG_LEARNING */
        TOMOYO_STAT_POLICY_PERMISSIVE, /* == TOMOYO_CONFIG_PERMISSIVE */
        TOMOYO_STAT_POLICY_ENFORCING,  /* == TOMOYO_CONFIG_ENFORCING */
        TOMOYO_MAX_POLICY_STAT         /* Number of entries above (4). */
};

/*
 * Index numbers for profile's PREFERENCE values.
 *
 * Numbered implicitly from 0.
 */
enum tomoyo_pref_index {
        TOMOYO_PREF_MAX_AUDIT_LOG,
        TOMOYO_PREF_MAX_LEARNING_ENTRY,
        TOMOYO_MAX_PREF /* Number of preference entries above (2). */
};

/********** Structure definitions. **********/

/*
 * Common header for holding ACL entries.
 *
 * Embedded as the first member ("head") of the per-directive group member
 * structures such as "struct tomoyo_path_group". Declared __packed.
 */
struct tomoyo_acl_head {
        struct list_head list; /* List linkage. */
        s8 is_deleted; /* true or false or TOMOYO_GC_IN_PROGRESS */
} __packed;

/*
 * Common header for shared entries.
 *
 * Embedded as the first member ("head") of "struct tomoyo_name",
 * "struct tomoyo_group" and "struct tomoyo_condition". Declared __packed.
 */
struct tomoyo_shared_acl_head {
        struct list_head list; /* List linkage. */
        atomic_t users; /* Presumably a reference count - TODO confirm. */
} __packed;

struct tomoyo_policy_namespace;

/*
 * Structure for request info.
 *
 * Describes one access request. The operation-specific arguments live in the
 * "param" union; each union member documents which enum its "operation"
 * field takes values from.
 */
struct tomoyo_request_info {
        /*
         * For holding parameters specific to operations which deal with files.
         * NULL if not dealing with files.
         */
        struct tomoyo_obj_info *obj;
        /*
         * For holding parameters specific to execve() request.
         * NULL if not dealing with execve().
         */
        struct tomoyo_execve *ee;
        struct tomoyo_domain_info *domain;
        /* For holding parameters. */
        union {
                /* Request with one pathname. */
                struct {
                        const struct tomoyo_path_info *filename;
                        /* For using wildcards at tomoyo_find_next_domain(). */
                        const struct tomoyo_path_info *matched_path;
                        /* One of values in "enum tomoyo_path_acl_index". */
                        u8 operation;
                } path;
                /* Request with two pathnames. */
                struct {
                        const struct tomoyo_path_info *filename1;
                        const struct tomoyo_path_info *filename2;
                        /* One of values in "enum tomoyo_path2_acl_index". */
                        u8 operation;
                } path2;
                /* Request with one pathname plus mode and device numbers. */
                struct {
                        const struct tomoyo_path_info *filename;
                        unsigned int mode;
                        unsigned int major;
                        unsigned int minor;
                        /* One of values in "enum tomoyo_mkdev_acl_index". */
                        u8 operation;
                } mkdev;
                /* Request with one pathname and one number. */
                struct {
                        const struct tomoyo_path_info *filename;
                        unsigned long number;
                        /*
                         * One of values in
                         * "enum tomoyo_path_number_acl_index".
                         */
                        u8 operation;
                } path_number;
                /* Request naming an environment variable. */
                struct {
                        const struct tomoyo_path_info *name;
                } environ;
                /* Request on an inet socket. */
                struct {
                        const __be32 *address;
                        u16 port;
                        /* One of values smaller than TOMOYO_SOCK_MAX. */
                        u8 protocol;
                        /* One of values in "enum tomoyo_network_acl_index". */
                        u8 operation;
                        bool is_ipv6;
                } inet_network;
                /* Request on a unix domain socket. */
                struct {
                        const struct tomoyo_path_info *address;
                        /* One of values smaller than TOMOYO_SOCK_MAX. */
                        u8 protocol;
                        /* One of values in "enum tomoyo_network_acl_index". */
                        u8 operation;
                } unix_network;
                /* Mount request. */
                struct {
                        const struct tomoyo_path_info *type;
                        const struct tomoyo_path_info *dir;
                        const struct tomoyo_path_info *dev;
                        unsigned long flags;
                        int need_dev;
                } mount;
                /* Request naming a domain. */
                struct {
                        const struct tomoyo_path_info *domainname;
                } task;
        } param;
        struct tomoyo_acl_info *matched_acl;
        /* NOTE(review): presumably selects the active "param" member - confirm. */
        u8 param_type;
        bool granted;
        u8 retry;
        u8 profile;
        u8 mode; /* One of values in "enum tomoyo_mode_index". */
        u8 type;
};

/*
 * Structure for holding a token.
 *
 * Every member other than "name" is derived from "name"; the trailing
 * comments give the expression each one caches.
 */
struct tomoyo_path_info {
        const char *name;
        u32 hash;          /* = full_name_hash(name, strlen(name)) */
        u16 const_len;     /* = tomoyo_const_part_length(name)     */
        bool is_dir;       /* = tomoyo_strendswith(name, "/")      */
        bool is_patterned; /* = tomoyo_path_contains_pattern(name) */
};

/*
 * Structure for holding string data: a shared-entry header followed by the
 * token itself.
 */
struct tomoyo_name {
        struct tomoyo_shared_acl_head head;
        struct tomoyo_path_info entry;
};

/*
 * Structure for holding a word: either a single name or a reference to a
 * group.
 */
struct tomoyo_name_union {
        /* Either @filename or @group is NULL. */
        const struct tomoyo_path_info *filename;
        struct tomoyo_group *group;
};

/*
 * Structure for holding a number.
 *
 * Holds a pair of values with a radix tag for each, or a reference to a
 * group. NOTE(review): values[0]/values[1] presumably form an inclusive
 * min/max range - confirm against the parser.
 */
struct tomoyo_number_union {
        unsigned long values[2];
        struct tomoyo_group *group; /* Maybe NULL. */
        /* One of values in "enum tomoyo_value_type". */
        u8 value_type[2];
};

/*
 * Structure for holding an IP address.
 *
 * Holds a pair of addresses or a reference to an address group.
 * NOTE(review): ip[0]/ip[1] presumably form an inclusive address range -
 * confirm against the parser.
 */
struct tomoyo_ipaddr_union {
        struct in6_addr ip[2]; /* Big endian. */
        struct tomoyo_group *group; /* Pointer to address group. */
        bool is_ipv6; /* Valid only if @group == NULL. */
};

/*
 * Structure for "path_group"/"number_group"/"address_group" directive.
 *
 * A named group; its members hang off @member_list.
 */
struct tomoyo_group {
        struct tomoyo_shared_acl_head head;
        const struct tomoyo_path_info *group_name; /* Name of this group. */
        struct list_head member_list; /* List of members of this group. */
};

/* Structure for "path_group" directive: one pathname member of a group. */
struct tomoyo_path_group {
        struct tomoyo_acl_head head;
        const struct tomoyo_path_info *member_name; /* The member's pathname. */
};

/* Structure for "number_group" directive: one numeric member of a group. */
struct tomoyo_number_group {
        struct tomoyo_acl_head head;
        struct tomoyo_number_union number; /* The member's number. */
};

/* Structure for "address_group" directive: one address member of a group. */
struct tomoyo_address_group {
        struct tomoyo_acl_head head;
        /* Structure for holding an IP address. */
        struct tomoyo_ipaddr_union address;
};

/*
 * Subset of "struct stat". Used by conditional ACL and audit logs.
 *
 * "struct tomoyo_obj_info" keeps one of these per
 * "enum tomoyo_path_stat_index" slot.
 */
struct tomoyo_mini_stat {
        kuid_t uid;
        kgid_t gid;
        ino_t ino;
        umode_t mode;
        dev_t dev;
        dev_t rdev;
};

/* Structure for dumping argv[] and envp[] of "struct linux_binprm". */
struct tomoyo_page_dump {
        struct page *page;    /* Previously dumped page. */
        char *data;           /* Contents of "page". Size is PAGE_SIZE. */
};

/*
 * Structure for attribute checks in addition to pathname checks.
 *
 * The "stat_valid" and "stat" arrays are indexed by
 * "enum tomoyo_path_stat_index".
 */
struct tomoyo_obj_info {
        /*
         * True if tomoyo_get_attributes() was already called, false otherwise.
         */
        bool validate_done;
        /* True if @stat[] is valid. */
        bool stat_valid[TOMOYO_MAX_PATH_STAT];
        /* First pathname. Initialized with { NULL, NULL } if no path. */
        struct path path1;
        /* Second pathname. Initialized with { NULL, NULL } if no path. */
        struct path path2;
        /*
         * Information on @path1, @path1's parent directory, @path2, @path2's
         * parent directory.
         */
        struct tomoyo_mini_stat stat[TOMOYO_MAX_PATH_STAT];
        /*
         * Content of symbolic link to be created. NULL for operations other
         * than symlink().
         */
        struct tomoyo_path_info *symlink_target;
};

/*
 * Structure for argv[].
 *
 * One condition on a program argument; attached to the tail of a
 * "struct tomoyo_condition" (see the layout comment there).
 */
struct tomoyo_argv {
        unsigned long index; /* Presumably the position in argv[] - confirm. */
        const struct tomoyo_path_info *value;
        bool is_not; /* Presumably negates the comparison - confirm. */
};

/*
 * Structure for envp[].
 *
 * One condition on an environment variable; attached to the tail of a
 * "struct tomoyo_condition" (see the layout comment there).
 */
struct tomoyo_envp {
        const struct tomoyo_path_info *name;
        const struct tomoyo_path_info *value;
        bool is_not; /* Presumably negates the comparison - confirm. */
};

/*
 * Structure for execve() operation.
 *
 * Bundles the request info, the object info and the scratch state needed
 * while handling one execve(). "struct tomoyo_request_info"->ee points to
 * this type.
 */
struct tomoyo_execve {
        struct tomoyo_request_info r;
        struct tomoyo_obj_info obj;
        struct linux_binprm *bprm;
        const struct tomoyo_path_info *transition;
        /* For dumping argv[] and envp[]. */
        struct tomoyo_page_dump dump;
        /* For temporary use. */
        char *tmp; /* Size is TOMOYO_EXEC_TMPSIZE bytes */
};

/*
 * Structure for entries which follow "struct tomoyo_condition".
 *
 * Each element describes one "left <op> right" comparison. @left and @right
 * are u8 fields whose comments below refer to values of
 * "enum tomoyo_conditions_index".
 */
struct tomoyo_condition_element {
        /*
         * Left hand operand. A "struct tomoyo_argv" for TOMOYO_ARGV_ENTRY, a
         * "struct tomoyo_envp" for TOMOYO_ENVP_ENTRY is attached to the tail
         * of the array of this struct.
         */
        u8 left;
        /*
         * Right hand operand. A "struct tomoyo_number_union" for
         * TOMOYO_NUMBER_UNION, a "struct tomoyo_name_union" for
         * TOMOYO_NAME_UNION is attached to the tail of the array of this
         * struct.
         */
        u8 right;
        /* Equation operator. True if equals or overlaps, false otherwise. */
        bool equals;
};

/*
 * Structure for optional arguments.
 *
 * This is only the fixed-size header. The variable-length arrays listed in
 * the comment at the end of the struct follow it in memory, in that order,
 * with the element counts given by the u16 members; @size is the memory
 * size allocated for the entry.
 */
struct tomoyo_condition {
        struct tomoyo_shared_acl_head head;
        u32 size; /* Memory size allocated for this entry. */
        u16 condc; /* Number of conditions in this struct. */
        u16 numbers_count; /* Number of "struct tomoyo_number_union values". */
        u16 names_count; /* Number of "struct tomoyo_name_union names". */
        u16 argc; /* Number of "struct tomoyo_argv". */
        u16 envc; /* Number of "struct tomoyo_envp". */
        u8 grant_log; /* One of values in "enum tomoyo_grant_log". */
        const struct tomoyo_path_info *transit; /* Maybe NULL. */
        /*
         * struct tomoyo_condition_element condition[condc];
         * struct tomoyo_number_union values[numbers_count];
         * struct tomoyo_name_union names[names_count];
         * struct tomoyo_argv argv[argc];
         * struct tomoyo_envp envp[envc];
         */
};

/*
 * Common header for individual entries. Embedded as the first member
 * ("head") of every per-type ACL structure declared below.
 *
 * NOTE(review): __packed presumably lets the small "perm"/"protocol" fields
 * that follow "head" in those structures use what would otherwise be tail
 * padding - confirm before changing the layout.
 */
struct tomoyo_acl_info {
        struct list_head list;
        struct tomoyo_condition *cond; /* Maybe NULL. */
        s8 is_deleted; /* true or false or TOMOYO_GC_IN_PROGRESS */
        u8 type; /* One of values in "enum tomoyo_acl_entry_type_index". */
} __packed;

/* Structure for domain information. */
struct tomoyo_domain_info {
        /* NOTE(review): presumably links into tomoyo_domain_list - confirm. */
        struct list_head list;
        /* NOTE(review): presumably the "struct tomoyo_acl_info" entries. */
        struct list_head acl_info_list;
        /* Name of this domain. Never NULL. */
        const struct tomoyo_path_info *domainname;
        /* Namespace for this domain. Never NULL. */
        struct tomoyo_policy_namespace *ns;
        /*
         * Group numbers to use. Sized as TOMOYO_MAX_ACL_GROUPS bits, i.e.
         * apparently a bitmap with one bit per ACL group.
         */
        unsigned long group[TOMOYO_MAX_ACL_GROUPS / BITS_PER_LONG];
        u8 profile;        /* Profile number to use. */
        bool is_deleted;   /* Delete flag.           */
        /* One flag per TOMOYO_MAX_DOMAIN_INFO_FLAGS index. */
        bool flags[TOMOYO_MAX_DOMAIN_INFO_FLAGS];
        atomic_t users; /* Number of referring tasks. */
};

/*
 * Structure for "task manual_domain_transition" directive.
 */
struct tomoyo_task_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MANUAL_TASK_ACL */
        /*
         * Pointer to domainname.
         * NOTE(review): presumably the transition's target domain - confirm.
         */
        const struct tomoyo_path_info *domainname;
};

/*
 * Structure for "file execute", "file read", "file write", "file append",
 * "file unlink", "file getattr", "file rmdir", "file truncate",
 * "file symlink", "file chroot" and "file unmount" directives.
 */
struct tomoyo_path_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH_ACL */
        u16 perm; /* Bitmask of values in "enum tomoyo_path_acl_index". */
        /* NOTE(review): presumably the pathname (or path group) to match. */
        struct tomoyo_name_union name;
};

/*
 * Structure for "file create", "file mkdir", "file mkfifo", "file mksock",
 * "file ioctl", "file chmod", "file chown" and "file chgrp" directives.
 */
struct tomoyo_path_number_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH_NUMBER_ACL */
        /* Bitmask of values in "enum tomoyo_path_number_acl_index". */
        u8 perm;
        /* NOTE(review): presumably the pathname (or path group) to match. */
        struct tomoyo_name_union name;
        /* NOTE(review): presumably the numeric argument to match. */
        struct tomoyo_number_union number;
};

/* Structure for "file mkblock" and "file mkchar" directives. */
struct tomoyo_mkdev_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MKDEV_ACL */
        u8 perm; /* Bitmask of values in "enum tomoyo_mkdev_acl_index". */
        /* NOTE(review): presumably the device node's pathname to match. */
        struct tomoyo_name_union name;
        /* NOTE(review): presumably the mode and device numbers to match. */
        struct tomoyo_number_union mode;
        struct tomoyo_number_union major;
        struct tomoyo_number_union minor;
};

/*
 * Structure for "file rename", "file link" and "file pivot_root" directives.
 */
struct tomoyo_path2_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH2_ACL */
        u8 perm; /* Bitmask of values in "enum tomoyo_path2_acl_index". */
        /* NOTE(review): presumably the first and second pathname to match. */
        struct tomoyo_name_union name1;
        struct tomoyo_name_union name2;
};

/*
 * Structure for "file mount" directive.
 * NOTE(review): the members look like they mirror the dev_name, path, type
 * and flags arguments of tomoyo_mount_permission() - confirm.
 */
struct tomoyo_mount_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MOUNT_ACL */
        struct tomoyo_name_union dev_name;
        struct tomoyo_name_union dir_name;
        struct tomoyo_name_union fs_type;
        struct tomoyo_number_union flags;
};

/* Structure for "misc env" directive in domain policy. */
struct tomoyo_env_acl {
        struct tomoyo_acl_info head;        /* type = TOMOYO_TYPE_ENV_ACL  */
        /* NOTE(review): presumably the variable's name (or a pattern). */
        const struct tomoyo_path_info *env; /* environment variable */
};

/* Structure for "network inet" directive. */
struct tomoyo_inet_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_INET_ACL */
        /* NOTE(review): presumably a TOMOYO_SOCK_* protocol index. */
        u8 protocol;
        u8 perm; /* Bitmask of values in "enum tomoyo_network_acl_index" */
        /* NOTE(review): presumably the address and port to match. */
        struct tomoyo_ipaddr_union address;
        struct tomoyo_number_union port;
};

/* Structure for "network unix" directive. */
struct tomoyo_unix_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_UNIX_ACL */
        /* NOTE(review): presumably a TOMOYO_SOCK_* protocol index. */
        u8 protocol;
        u8 perm; /* Bitmask of values in "enum tomoyo_network_acl_index" */
        /* NOTE(review): presumably the socket address name to match. */
        struct tomoyo_name_union name;
};

/*
 * Structure for holding a line from /sys/kernel/security/tomoyo/ interface.
 * Passed to the tomoyo_write_*(), tomoyo_parse_*() and tomoyo_update_*()
 * helpers declared below.
 */
struct tomoyo_acl_param {
        /* NOTE(review): presumably the not-yet-parsed rest of the line. */
        char *data;
        /* NOTE(review): presumably the list the entry is added to/removed from. */
        struct list_head *list;
        struct tomoyo_policy_namespace *ns;
        /* NOTE(review): presumably true when the line requests deletion. */
        bool is_delete;
};

/* Number of slots in the r.w[] array of "struct tomoyo_io_buffer". */
#define TOMOYO_MAX_IO_READ_QUEUE 64

/*
 * Structure for reading/writing policy via /sys/kernel/security/tomoyo
 * interfaces.
 */
struct tomoyo_io_buffer {
        /* Per-interface callbacks; each receives this structure or the file. */
        void (*read)(struct tomoyo_io_buffer *head);
        int (*write)(struct tomoyo_io_buffer *head);
        __poll_t (*poll)(struct file *file, poll_table *wait);
        /* Exclusive lock for this structure.   */
        struct mutex io_sem;
        /* NOTE(review): presumably the user buffer of the read in progress. */
        char __user *read_user_buf;
        size_t read_user_buf_avail;
        /*
         * State of the read side.
         * NOTE(review): ns/domain/group/acl look like resume cursors for
         * list_for_each_cookie() - confirm against the callers.
         */
        struct {
                struct list_head *ns;
                struct list_head *domain;
                struct list_head *group;
                struct list_head *acl;
                size_t avail;
                unsigned int step;
                unsigned int query_index;
                u16 index;
                u16 cond_index;
                u8 acl_group_index;
                u8 cond_step;
                u8 bit;
                u8 w_pos;
                bool eof;
                bool print_this_domain_only;
                bool print_transition_related_only;
                bool print_cond_part;
                /* NOTE(review): presumably queued strings, w_pos of them used. */
                const char *w[TOMOYO_MAX_IO_READ_QUEUE];
        } r;
        /* State of the write side. */
        struct {
                struct tomoyo_policy_namespace *ns;
                /* The position currently writing to.   */
                struct tomoyo_domain_info *domain;
                /* Bytes available for writing.         */
                size_t avail;
                bool is_delete;
        } w;
        /* Buffer for reading.                  */
        char *read_buf;
        /* Size of read buffer.                 */
        size_t readbuf_size;
        /* Buffer for writing.                  */
        char *write_buf;
        /* Size of write buffer.                */
        size_t writebuf_size;
        /* Type of this interface.              */
        enum tomoyo_securityfs_interface_index type;
        /* Users counter protected by tomoyo_io_buffer_list_lock. */
        u8 users;
        /* List for telling GC not to kfree() elements. */
        struct list_head list;
};

/*
 * Structure for "initialize_domain"/"no_initialize_domain"/"keep_domain"/
 * "no_keep_domain" keywords.
 */
struct tomoyo_transition_control {
        struct tomoyo_acl_head head;
        u8 type; /* One of values in "enum tomoyo_transition_type".  */
        /* True if the domainname is tomoyo_get_last_name(). */
        bool is_last_name;
        /* NOTE(review): NULL presumably means "any" for either - confirm. */
        const struct tomoyo_path_info *domainname; /* Maybe NULL */
        const struct tomoyo_path_info *program;    /* Maybe NULL */
};

/*
 * Structure for "aggregator" keyword.
 * NOTE(review): presumably maps original_name onto aggregated_name; the
 * direction is inferred from the member names only - confirm.
 */
struct tomoyo_aggregator {
        struct tomoyo_acl_head head;
        const struct tomoyo_path_info *original_name;
        const struct tomoyo_path_info *aggregated_name;
};

/* Structure for policy manager. One entry names one manager. */
struct tomoyo_manager {
        struct tomoyo_acl_head head;
        /* A path to program or a domainname. */
        const struct tomoyo_path_info *manager;
};

/*
 * Preference values. "struct tomoyo_profile" embeds one instance and also
 * keeps per-mode pointers (learning/permissive/enforcing) to this type.
 */
struct tomoyo_preference {
        /* NOTE(review): presumably a cap on learned entries - confirm. */
        unsigned int learning_max_entry;
        /* NOTE(review): presumably per-mode verbosity switches - confirm. */
        bool enforcing_verbose;
        bool learning_verbose;
        bool permissive_verbose;
};

/* Structure for /sys/kernel/security/tomoyo/profile interface. */
struct tomoyo_profile {
        const struct tomoyo_path_info *comment;
        struct tomoyo_preference *learning;
        struct tomoyo_preference *permissive;
        struct tomoyo_preference *enforcing;
        struct tomoyo_preference preference;
        /* NOTE(review): presumably the fallback for entries of config[]. */
        u8 default_config;
        /* One entry per MAC index, followed by one per MAC category index. */
        u8 config[TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX];
        unsigned int pref[TOMOYO_MAX_PREF];
};

/*
 * Structure for representing YYYY/MM/DD hh/mm/ss.
 * NOTE(review): presumably filled in by tomoyo_convert_time(), which takes
 * a pointer to this structure - confirm.
 */
struct tomoyo_time {
        u16 year;
        u8 month;
        u8 day;
        u8 hour;
        u8 min;
        u8 sec;
};

/* Structure for policy namespace. */
struct tomoyo_policy_namespace {
        /* Profile table. Memory is allocated as needed. */
        struct tomoyo_profile *profile_ptr[TOMOYO_MAX_PROFILES];
        /* Lists of "struct tomoyo_group", one list per group type. */
        struct list_head group_list[TOMOYO_MAX_GROUP];
        /* Lists of policy, one list per policy type. */
        struct list_head policy_list[TOMOYO_MAX_POLICY];
        /* The global ACL referred by "use_group" keyword. */
        struct list_head acl_group[TOMOYO_MAX_ACL_GROUPS];
        /* Node for connecting to the tomoyo_namespace_list list. */
        struct list_head namespace_list;
        /* Profile version. Currently only 20150505 is defined. */
        unsigned int profile_version;
        /* Name of this namespace (e.g. "<kernel>", "</usr/sbin/httpd>" ). */
        const char *name;
};

/* Structure for "struct task_struct"->security. */
struct tomoyo_task {
        struct tomoyo_domain_info *domain_info;
        struct tomoyo_domain_info *old_domain_info;
};

/********** Function prototypes. **********/

bool tomoyo_address_matches_group(const bool is_ipv6, const __be32 *address,
                                  const struct tomoyo_group *group);
bool tomoyo_compare_number_union(const unsigned long value,
                                 const struct tomoyo_number_union *ptr);
bool tomoyo_condition(struct tomoyo_request_info *r,
                      const struct tomoyo_condition *cond);
bool tomoyo_correct_domain(const unsigned char *domainname);
bool tomoyo_correct_path(const char *filename);
bool tomoyo_correct_word(const char *string);
bool tomoyo_domain_def(const unsigned char *buffer);
bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r);
bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
                      struct tomoyo_page_dump *dump);
bool tomoyo_memory_ok(void *ptr);
bool tomoyo_number_matches_group(const unsigned long min,
                                 const unsigned long max,
                                 const struct tomoyo_group *group);
bool tomoyo_parse_ipaddr_union(struct tomoyo_acl_param *param,
                               struct tomoyo_ipaddr_union *ptr);
bool tomoyo_parse_name_union(struct tomoyo_acl_param *param,
                             struct tomoyo_name_union *ptr);
bool tomoyo_parse_number_union(struct tomoyo_acl_param *param,
                               struct tomoyo_number_union *ptr);
bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename,
                                 const struct tomoyo_path_info *pattern);
bool tomoyo_permstr(const char *string, const char *keyword);
bool tomoyo_str_starts(char **src, const char *find);
char *tomoyo_encode(const char *str);
char *tomoyo_encode2(const char *str, int str_len);
char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt,
                      va_list args) __printf(3, 0);
char *tomoyo_read_token(struct tomoyo_acl_param *param);
char *tomoyo_realpath_from_path(const struct path *path);
char *tomoyo_realpath_nofollow(const char *pathname);
const char *tomoyo_get_exe(void);
const struct tomoyo_path_info *tomoyo_compare_name_union
(const struct tomoyo_path_info *name, const struct tomoyo_name_union *ptr);
const struct tomoyo_path_info *tomoyo_get_domainname
(struct tomoyo_acl_param *param);
const struct tomoyo_path_info *tomoyo_get_name(const char *name);
const struct tomoyo_path_info *tomoyo_path_matches_group
(const struct tomoyo_path_info *pathname, const struct tomoyo_group *group);
int tomoyo_check_open_permission(struct tomoyo_domain_info *domain,
                                 const struct path *path, const int flag);
void tomoyo_close_control(struct tomoyo_io_buffer *head);
int tomoyo_env_perm(struct tomoyo_request_info *r, const char *env);
int tomoyo_execute_permission(struct tomoyo_request_info *r,
                              const struct tomoyo_path_info *filename);
int tomoyo_find_next_domain(struct linux_binprm *bprm);
int tomoyo_get_mode(const struct tomoyo_policy_namespace *ns, const u8 profile,
                    const u8 index);
int tomoyo_init_request_info(struct tomoyo_request_info *r,
                             struct tomoyo_domain_info *domain,
                             const u8 index);
int tomoyo_mkdev_perm(const u8 operation, const struct path *path,
                      const unsigned int mode, unsigned int dev);
int tomoyo_mount_permission(const char *dev_name, const struct path *path,
                            const char *type, unsigned long flags,
                            void *data_page);
int tomoyo_open_control(const u8 type, struct file *file);
int tomoyo_path2_perm(const u8 operation, const struct path *path1,
                      const struct path *path2);
int tomoyo_path_number_perm(const u8 operation, const struct path *path,
                            unsigned long number);
int tomoyo_path_perm(const u8 operation, const struct path *path,
                     const char *target);
__poll_t tomoyo_poll_control(struct file *file, poll_table *wait);
__poll_t tomoyo_poll_log(struct file *file, poll_table *wait);
int tomoyo_socket_bind_permission(struct socket *sock, struct sockaddr *addr,
                                  int addr_len);
int tomoyo_socket_connect_permission(struct socket *sock,
                                     struct sockaddr *addr, int addr_len);
int tomoyo_socket_listen_permission(struct socket *sock);
int tomoyo_socket_sendmsg_permission(struct socket *sock, struct msghdr *msg,
                                     int size);
int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...)
        __printf(2, 3);
int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size,
                         struct tomoyo_acl_param *param,
                         bool (*check_duplicate)
                         (const struct tomoyo_acl_info *,
                          const struct tomoyo_acl_info *),
                         bool (*merge_duplicate)
                         (struct tomoyo_acl_info *, struct tomoyo_acl_info *,
                          const bool));
int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size,
                         struct tomoyo_acl_param *param,
                         bool (*check_duplicate)
                         (const struct tomoyo_acl_head *,
                          const struct tomoyo_acl_head *));
int tomoyo_write_aggregator(struct tomoyo_acl_param *param);
int tomoyo_write_file(struct tomoyo_acl_param *param);
int tomoyo_write_group(struct tomoyo_acl_param *param, const u8 type);
int tomoyo_write_misc(struct tomoyo_acl_param *param);
int tomoyo_write_inet_network(struct tomoyo_acl_param *param);
int tomoyo_write_transition_control(struct tomoyo_acl_param *param,
                                    const u8 type);
int tomoyo_write_unix_network(struct tomoyo_acl_param *param);
ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer,
                            const int buffer_len);
ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head,
                             const char __user *buffer, const int buffer_len);
struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param);
struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname,
                                                const bool transit);
struct tomoyo_domain_info *tomoyo_domain(void);
struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname);
struct tomoyo_group *tomoyo_get_group(struct tomoyo_acl_param *param,
                                      const u8 idx);
struct tomoyo_policy_namespace *tomoyo_assign_namespace
(const char *domainname);
struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns,
                                      const u8 profile);
u8 tomoyo_parse_ulong(unsigned long *result, char **str);
void *tomoyo_commit_ok(void *data, const unsigned int size);
void __init tomoyo_load_builtin_policy(void);
void __init tomoyo_mm_init(void);
void tomoyo_check_acl(struct tomoyo_request_info *r,
                      bool (*check_entry)(struct tomoyo_request_info *,
                                          const struct tomoyo_acl_info *));
void tomoyo_check_profile(void);
void tomoyo_convert_time(time64_t time, struct tomoyo_time *stamp);
void tomoyo_del_condition(struct list_head *element);
void tomoyo_fill_path_info(struct tomoyo_path_info *ptr);
void tomoyo_get_attributes(struct tomoyo_obj_info *obj);
void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns);
void tomoyo_load_policy(const char *filename);
void tomoyo_normalize_line(unsigned char *buffer);
void tomoyo_notify_gc(struct tomoyo_io_buffer *head, const bool is_register);
void tomoyo_print_ip(char *buf, const unsigned int size,
                     const struct tomoyo_ipaddr_union *ptr);
void tomoyo_print_ulong(char *buffer, const int buffer_len,
                        const unsigned long value, const u8 type);
void tomoyo_put_name_union(struct tomoyo_name_union *ptr);
void tomoyo_put_number_union(struct tomoyo_number_union *ptr);
void tomoyo_read_log(struct tomoyo_io_buffer *head);
void tomoyo_update_stat(const u8 index);
void tomoyo_warn_oom(const char *function);
void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...)
        __printf(2, 3);
void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt,
                       va_list args) __printf(3, 0);

/********** External variable definitions. **********/

extern bool tomoyo_policy_loaded;
extern int tomoyo_enabled;
extern const char * const tomoyo_condition_keyword
[TOMOYO_MAX_CONDITION_KEYWORD];
extern const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS];
extern const char * const tomoyo_mac_keywords[TOMOYO_MAX_MAC_INDEX
                                              + TOMOYO_MAX_MAC_CATEGORY_INDEX];
extern const char * const tomoyo_mode[TOMOYO_CONFIG_MAX_MODE];
extern const char * const tomoyo_path_keyword[TOMOYO_MAX_PATH_OPERATION];
extern const char * const tomoyo_proto_keyword[TOMOYO_SOCK_MAX];
extern const char * const tomoyo_socket_keyword[TOMOYO_MAX_NETWORK_OPERATION];
extern const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX];
extern const u8 tomoyo_pn2mac[TOMOYO_MAX_PATH_NUMBER_OPERATION];
extern const u8 tomoyo_pnnn2mac[TOMOYO_MAX_MKDEV_OPERATION];
extern const u8 tomoyo_pp2mac[TOMOYO_MAX_PATH2_OPERATION];
extern struct list_head tomoyo_condition_list;
extern struct list_head tomoyo_domain_list;
extern struct list_head tomoyo_name_list[TOMOYO_MAX_HASH];
extern struct list_head tomoyo_namespace_list;
extern struct mutex tomoyo_policy_lock;
extern struct srcu_struct tomoyo_ss;
extern struct tomoyo_domain_info tomoyo_kernel_domain;
extern struct tomoyo_policy_namespace tomoyo_kernel_namespace;
extern unsigned int tomoyo_memory_quota[TOMOYO_MAX_MEMORY_STAT];
extern unsigned int tomoyo_memory_used[TOMOYO_MAX_MEMORY_STAT];
extern struct lsm_blob_sizes tomoyo_blob_sizes;

/********** Inlined functions. **********/

/**
 * tomoyo_read_lock - Enter the SRCU read-side section protecting policy.
 *
 * Calls srcu_read_lock() on tomoyo_ss.
 *
 * Returns the index number that must be passed to tomoyo_read_unlock().
 */
static inline int tomoyo_read_lock(void)
{
        const int idx = srcu_read_lock(&tomoyo_ss);

        return idx;
}

/**
 * tomoyo_read_unlock - Release lock for protecting policy.
 *
 * @idx: Index number returned by tomoyo_read_lock().
 *
 * Counterpart of tomoyo_read_lock(): calls srcu_read_unlock() on tomoyo_ss.
 *
 * Returns nothing.
 */
static inline void tomoyo_read_unlock(int idx)
{
        srcu_read_unlock(&tomoyo_ss, idx);
}

/**
 * tomoyo_sys_getppid - Copy of getppid().
 *
 * Looks up current->real_parent under rcu_read_lock() and converts it with
 * task_tgid_vnr().
 *
 * Returns parent process's PID.
 *
 * Per the original author's note, Alpha does not have getppid() defined, so
 * this helper carries its own copy (taken from kernel/timer.c at the time).
 */
static inline pid_t tomoyo_sys_getppid(void)
{
        struct task_struct *parent;
        pid_t ppid;

        rcu_read_lock();
        parent = rcu_dereference(current->real_parent);
        ppid = task_tgid_vnr(parent);
        rcu_read_unlock();

        return ppid;
}

/**
 * tomoyo_sys_getpid - Copy of getpid().
 *
 * Converts the current task with task_tgid_vnr().
 *
 * Returns current thread's PID.
 *
 * Per the original author's note, Alpha does not have getpid() defined, so
 * this helper carries its own copy (taken from kernel/timer.c at the time).
 */
static inline pid_t tomoyo_sys_getpid(void)
{
        const pid_t self = task_tgid_vnr(current);

        return self;
}

/**
 * tomoyo_pathcmp - strcmp() for "struct tomoyo_path_info" structure.
 *
 * @a: Pointer to "struct tomoyo_path_info".
 * @b: Pointer to "struct tomoyo_path_info".
 *
 * Like strcmp(), the result is "non-zero" when the operands DIFFER: the
 * cheap hash comparison is tried first and the names are only compared
 * when the hashes match.
 *
 * Returns true if @a != @b, false if they are equal.
 */
static inline bool tomoyo_pathcmp(const struct tomoyo_path_info *a,
                                  const struct tomoyo_path_info *b)
{
        return a->hash != b->hash || strcmp(a->name, b->name) != 0;
}

/**
 * tomoyo_put_name - Drop reference on "struct tomoyo_name".
 *
 * @name: Pointer to "struct tomoyo_path_info". Maybe NULL.
 *
 * Recovers the enclosing "struct tomoyo_name" from its "entry" member and
 * decrements that structure's users counter. A NULL @name is ignored.
 *
 * Returns nothing.
 */
static inline void tomoyo_put_name(const struct tomoyo_path_info *name)
{
        struct tomoyo_name *owner;

        if (!name)
                return;

        owner = container_of(name, typeof(*owner), entry);
        atomic_dec(&owner->head.users);
}

/**
 * tomoyo_put_condition - Drop reference on "struct tomoyo_condition".
 *
 * @cond: Pointer to "struct tomoyo_condition". Maybe NULL.
 *
 * Decrements @cond's users counter; a NULL @cond is ignored.
 *
 * Returns nothing.
 */
static inline void tomoyo_put_condition(struct tomoyo_condition *cond)
{
        if (!cond)
                return;

        atomic_dec(&cond->head.users);
}

/**
 * tomoyo_put_group - Drop reference on "struct tomoyo_group".
 *
 * @group: Pointer to "struct tomoyo_group". Maybe NULL.
 *
 * Decrements @group's users counter; a NULL @group is ignored.
 *
 * Returns nothing.
 */
static inline void tomoyo_put_group(struct tomoyo_group *group)
{
        if (!group)
                return;

        atomic_dec(&group->head.users);
}

/**
 * tomoyo_task - Get "struct tomoyo_task" for specified thread.
 *
 * @task: Pointer to "struct task_struct".
 *
 * The structure lives inside @task's security blob, at the offset recorded
 * in tomoyo_blob_sizes.lbs_task.
 *
 * Returns pointer to "struct tomoyo_task" for specified thread.
 */
static inline struct tomoyo_task *tomoyo_task(struct task_struct *task)
{
        void *blob = task->security + tomoyo_blob_sizes.lbs_task;

        return blob;
}

/**
 * tomoyo_same_name_union - Check for duplicated "struct tomoyo_name_union" entry.
 *
 * @a: Pointer to "struct tomoyo_name_union".
 * @b: Pointer to "struct tomoyo_name_union".
 *
 * Two entries are the same when both their filename pointers and their
 * group pointers are identical (pointer comparison only).
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_name_union
(const struct tomoyo_name_union *a, const struct tomoyo_name_union *b)
{
        if (a->filename != b->filename)
                return false;

        return a->group == b->group;
}

/**
 * tomoyo_same_number_union - Check for duplicated "struct tomoyo_number_union" entry.
 *
 * @a: Pointer to "struct tomoyo_number_union".
 * @b: Pointer to "struct tomoyo_number_union".
 *
 * Two entries are the same when their group pointers match and both
 * value/value_type pairs (indexes 0 and 1) are equal.
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_number_union
(const struct tomoyo_number_union *a, const struct tomoyo_number_union *b)
{
        int i;

        if (a->group != b->group)
                return false;

        for (i = 0; i < 2; i++) {
                if (a->values[i] != b->values[i] ||
                    a->value_type[i] != b->value_type[i])
                        return false;
        }

        return true;
}

/**
 * tomoyo_same_ipaddr_union - Check for duplicated "struct tomoyo_ipaddr_union" entry.
 *
 * @a: Pointer to "struct tomoyo_ipaddr_union".
 * @b: Pointer to "struct tomoyo_ipaddr_union".
 *
 * Two entries are the same when the is_ipv6 flags and group pointers match
 * and the whole ip member compares equal byte for byte.
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_ipaddr_union
(const struct tomoyo_ipaddr_union *a, const struct tomoyo_ipaddr_union *b)
{
        if (a->is_ipv6 != b->is_ipv6 || a->group != b->group)
                return false;

        return memcmp(a->ip, b->ip, sizeof(a->ip)) == 0;
}

/**
 * tomoyo_current_namespace - Get "struct tomoyo_policy_namespace" for current thread.
 *
 * Returns pointer to "struct tomoyo_policy_namespace" for current thread.
 */
static inline struct tomoyo_policy_namespace *tomoyo_current_namespace(void)
{
        return tomoyo_domain()->ns;
}

/**
 * list_for_each_cookie - iterate over a list with cookie.
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:       the head for your list.
 *
 * @pos doubles as the cookie: when it is NULL the walk starts at the first
 * element of @head, otherwise it resumes at the element @pos points to.
 * When the loop runs to completion @pos is equal to @head.
 *
 * The macro expands to one single for statement, so it can be used as the
 * unbraced body of an if/else. @pos and @head are evaluated several times
 * and must not have side effects.
 */
#define list_for_each_cookie(pos, head)                                        \
        for ((pos) = (pos) ? (pos) :                                        \
                     srcu_dereference((head)->next, &tomoyo_ss);        \
             (pos) != (head);                                                \
             (pos) = srcu_dereference((pos)->next, &tomoyo_ss))

#endif /* !defined(_SECURITY_TOMOYO_COMMON_H) */




















































































































    2 







    2 





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Kernel Electric-Fence (KFENCE). For more info please see
 * Documentation/dev-tools/kfence.rst.
 *
 * Copyright (C) 2020, Google LLC.
 */

#ifndef MM_KFENCE_KFENCE_H
#define MM_KFENCE_KFENCE_H

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

#include "../slab.h" /* for struct kmem_cache */

/*
 * Get the canary byte pattern for @addr. Use a pattern that varies based on the
 * lower 3 bits of the address, to detect memory corruptions with higher
 * probability, where similar constants are used.
 *
 * The result is 0xaa XORed with (@addr & 0x7), e.g. 0xaa for an 8-byte aligned
 * address and 0xad for an address ending in 7.
 */
#define KFENCE_CANARY_PATTERN_U8(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7))

/*
 * Define a continuous 8-byte canary starting from a multiple of 8. The canary
 * of each byte is only related to the lowest three bits of its address, so the
 * canary of every 8 bytes is the same. 8 bytes of memory can be filled and
 * checked at a time instead of byte by byte to improve performance.
 *
 * NOTE(review): le64_to_cpu() presumably keeps the byte at the lowest address
 * equal to KFENCE_CANARY_PATTERN_U8() of an aligned address on big-endian
 * CPUs as well - confirm.
 */
#define KFENCE_CANARY_PATTERN_U64 ((u64)0xaaaaaaaaaaaaaaaa ^ (u64)(le64_to_cpu(0x0706050403020100)))

/* Maximum stack depth for reports; sizes kfence_track.stack_entries[]. */
#define KFENCE_STACK_DEPTH 64

/* KFENCE object states; stored in the state member of struct kfence_metadata. */
enum kfence_object_state {
        KFENCE_OBJECT_UNUSED,                /* Object is unused. */
        KFENCE_OBJECT_ALLOCATED,        /* Object is currently allocated. */
        KFENCE_OBJECT_FREED,                /* Object was allocated, and then freed. */
};

/*
 * Alloc/free tracking information. struct kfence_metadata keeps one of these
 * for the allocation and one for the free.
 */
struct kfence_track {
        /* NOTE(review): presumably the task and CPU that did the alloc/free. */
        pid_t pid;
        int cpu;
        /* NOTE(review): presumably a timestamp in nanoseconds - confirm clock. */
        u64 ts_nsec;
        /* NOTE(review): presumably the number of valid stack_entries[] slots. */
        int num_stack_entries;
        unsigned long stack_entries[KFENCE_STACK_DEPTH];
};

/*
 * KFENCE metadata per guarded allocation. Entries live in the kfence_metadata
 * array and are looked up from an address by addr_to_metadata().
 */
struct kfence_metadata {
        struct list_head list;                /* Freelist node; access under kfence_freelist_lock. */
        struct rcu_head rcu_head;        /* For delayed freeing. */

        /*
         * Lock protecting below data; to ensure consistency of the below data,
         * since the following may execute concurrently: __kfence_alloc(),
         * __kfence_free(), kfence_handle_page_fault(). However, note that we
         * cannot grab the same metadata off the freelist twice, and multiple
         * __kfence_alloc() cannot run concurrently on the same metadata.
         */
        raw_spinlock_t lock;

        /* The current state of the object; see enum kfence_object_state. */
        enum kfence_object_state state;

        /*
         * Allocated object address; cannot be calculated from size, because of
         * alignment requirements.
         *
         * Invariant: ALIGN_DOWN(addr, PAGE_SIZE) is constant.
         */
        unsigned long addr;

        /*
         * The size of the original allocation.
         */
        size_t size;

        /*
         * The kmem_cache cache of the last allocation; NULL if never allocated
         * or the cache has already been destroyed.
         */
        struct kmem_cache *cache;

        /*
         * In case of an invalid access, the page that was unprotected; we
         * optimistically only store one address.
         */
        unsigned long unprotected_page;

        /* Allocation and free stack information. */
        struct kfence_track alloc_track;
        struct kfence_track free_track;
        /* For updating alloc_covered on frees. */
        u32 alloc_stack_hash;
#ifdef CONFIG_MEMCG_KMEM
        /* Only present with CONFIG_MEMCG_KMEM. NOTE(review): purpose not shown here. */
        struct slabobj_ext obj_exts;
#endif
};

/*
 * Size in bytes of CONFIG_KFENCE_NUM_OBJECTS metadata entries, rounded up to
 * a multiple of the page size.
 */
#define KFENCE_METADATA_SIZE PAGE_ALIGN(sizeof(struct kfence_metadata) * \
                                        CONFIG_KFENCE_NUM_OBJECTS)

/* The metadata array; indexed by addr_to_metadata(). Defined elsewhere. */
extern struct kfence_metadata *kfence_metadata;

/*
 * Map @addr to its metadata entry.
 *
 * Every 2 * PAGE_SIZE stride of __kfence_pool maps to one entry, except the
 * first stride, which has none (it yields index -1).
 *
 * Returns NULL if @addr is not a KFENCE address or if the computed index
 * falls outside [0, CONFIG_KFENCE_NUM_OBJECTS).
 */
static inline struct kfence_metadata *addr_to_metadata(unsigned long addr)
{
        long slot;

        /* The checks do not affect performance; only called from slow-paths. */
        if (!is_kfence_address((void *)addr))
                return NULL;

        /*
         * The slot may be out of range for an address at the edge of
         * __kfence_pool; the caller then reports an "invalid access" error.
         */
        slot = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1;
        if (slot >= 0 && slot < CONFIG_KFENCE_NUM_OBJECTS)
                return &kfence_metadata[slot];

        return NULL;
}

/* KFENCE error types for report generation; see kfence_report_error(). */
enum kfence_error_type {
        KFENCE_ERROR_OOB,                /* Detected an out-of-bounds access. */
        KFENCE_ERROR_UAF,                /* Detected a use-after-free access. */
        KFENCE_ERROR_CORRUPTION,        /* Detected a memory corruption on free. */
        KFENCE_ERROR_INVALID,                /* Invalid access of unknown type. */
        KFENCE_ERROR_INVALID_FREE,        /* Invalid free. */
};

void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs,
                         const struct kfence_metadata *meta, enum kfence_error_type type);

void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta);

#endif /* MM_KFENCE_KFENCE_H */


































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CPUSET_H
#define _LINUX_CPUSET_H
/*
 *  cpuset interface
 *
 *  Copyright (C) 2003 BULL SA
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 */

#include <linux/sched.h>
#include <linux/sched/topology.h>
#include <linux/sched/task.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/jump_label.h>

#ifdef CONFIG_CPUSETS

/*
 * Static branch rewrites can happen in an arbitrary order for a given
 * key. In code paths where we need to loop with read_mems_allowed_begin() and
 * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need
 * to ensure that begin() always gets rewritten before retry() in the
 * disabled -> enabled transition. If not, then if local irqs are disabled
 * around the loop, we can deadlock since retry() would always be
 * comparing the latest value of the mems_allowed seqcount against 0 as
 * begin() still would see cpusets_enabled() as false. The enabled -> disabled
 * transition should happen in reverse order for the same reasons (want to stop
 * looking at real value of mems_allowed.sequence in retry() first).
 */
extern struct static_key_false cpusets_pre_enable_key;
extern struct static_key_false cpusets_enabled_key;
extern struct static_key_false cpusets_insane_config_key;

/* True when the cpusets_enabled_key static branch is switched on. */
static inline bool cpusets_enabled(void)
{
        return static_branch_unlikely(&cpusets_enabled_key);
}

/*
 * Bump both static keys using the _cpuslocked variants. The pre-enable key
 * goes first so that read_mems_allowed_begin(), which tests
 * cpusets_pre_enable_key, is switched on before read_mems_allowed_retry(),
 * which tests cpusets_enabled_key.
 */
static inline void cpuset_inc(void)
{
        static_branch_inc_cpuslocked(&cpusets_pre_enable_key);
        static_branch_inc_cpuslocked(&cpusets_enabled_key);
}

/*
 * Drop both static keys using the _cpuslocked variants, in the reverse order
 * of cpuset_inc(): cpusets_enabled_key first, so read_mems_allowed_retry()
 * stops looking at the real sequence count before read_mems_allowed_begin()
 * goes back to returning 0.
 */
static inline void cpuset_dec(void)
{
        static_branch_dec_cpuslocked(&cpusets_enabled_key);
        static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
}

/*
 * This static key gets enabled whenever a cpuset configuration is considered
 * unsupportable in general, e.g. a movable-only node which cannot satisfy
 * any non-movable allocations (see update_nodemask). The page allocator
 * needs to make additional checks for those configurations, and this
 * helper is meant to guard those checks without any overhead for sane
 * configurations.
 */
static inline bool cpusets_insane_config(void)
{
        return static_branch_unlikely(&cpusets_insane_config_key);
}

extern int cpuset_init(void);
extern void cpuset_init_smp(void);
extern void cpuset_force_rebuild(void);
extern void cpuset_update_active_cpus(void);
extern void inc_dl_tasks_cs(struct task_struct *task);
extern void dec_dl_tasks_cs(struct task_struct *task);
extern void cpuset_lock(void);
extern void cpuset_unlock(void);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
extern bool cpuset_cpu_is_isolated(int cpu);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
#define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void);
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);

extern bool cpuset_node_allowed(int node, gfp_t gfp_mask);

/*
 * Ask cpuset_node_allowed() about the node id of zone @z, without first
 * checking whether cpusets are enabled.
 */
static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
        int nid = zone_to_nid(z);

        return cpuset_node_allowed(nid, gfp_mask);
}

/*
 * Zone-level cpuset check. While cpusets are disabled every zone is allowed
 * and __cpuset_zone_allowed() is never called.
 */
static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
        if (!cpusets_enabled())
                return true;

        return __cpuset_zone_allowed(z, gfp_mask);
}

extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
                                          const struct task_struct *tsk2);

/*
 * Call __cpuset_memory_pressure_bump() only while
 * cpuset_memory_pressure_enabled is non-zero.
 */
#define cpuset_memory_pressure_bump()                                 \
        do {                                                        \
                if (cpuset_memory_pressure_enabled)                \
                        __cpuset_memory_pressure_bump();        \
        } while (0)
extern int cpuset_memory_pressure_enabled;
extern void __cpuset_memory_pressure_bump(void);

extern void cpuset_task_status_allowed(struct seq_file *m,
                                        struct task_struct *task);
extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
                            struct pid *pid, struct task_struct *tsk);

extern int cpuset_mem_spread_node(void);
extern int cpuset_slab_spread_node(void);

/* Return the result of task_spread_page() for the current task. */
static inline int cpuset_do_page_mem_spread(void)
{
        struct task_struct *tsk = current;

        return task_spread_page(tsk);
}

extern bool current_cpuset_is_being_rebound(void);

extern void rebuild_sched_domains(void);

extern void cpuset_print_current_mems_allowed(void);

/*
 * Open a read-side section on current->mems_allowed_seq.
 *
 * Decisions that depend on mems_allowed (page allocation, for instance) can
 * fail spuriously when mems_allowed is updated in parallel, which may in turn
 * make the process fail. Wrapping the decision in a
 * read_mems_allowed_begin() / read_mems_allowed_retry() loop prevents these
 * artificial failures.
 *
 * Returns 0 while cpusets_pre_enable_key is off, otherwise the value handed
 * back by read_seqcount_begin().
 */
static inline unsigned int read_mems_allowed_begin(void)
{
        if (static_branch_unlikely(&cpusets_pre_enable_key))
                return read_seqcount_begin(&current->mems_allowed_seq);

        return 0;
}

/*
 * Close the read-side section opened by read_mems_allowed_begin().
 *
 * A true result means mems_allowed may have been updated concurrently, so
 * whatever ran since read_mems_allowed_begin() may have failed artificially;
 * the caller decides whether retrying is appropriate. Always false while
 * cpusets_enabled_key is off.
 */
static inline bool read_mems_allowed_retry(unsigned int seq)
{
        if (static_branch_unlikely(&cpusets_enabled_key))
                return read_seqcount_retry(&current->mems_allowed_seq, seq);

        return false;
}

/*
 * Replace current->mems_allowed with @nodemask.
 *
 * The store is done under task_lock(current) with local interrupts disabled
 * and is bracketed by write_seqcount_begin()/write_seqcount_end() on
 * current->mems_allowed_seq, the same sequence count that
 * read_mems_allowed_begin() and read_mems_allowed_retry() sample.
 */
static inline void set_mems_allowed(nodemask_t nodemask)
{
        unsigned long flags;

        task_lock(current);
        local_irq_save(flags);
        write_seqcount_begin(&current->mems_allowed_seq);
        current->mems_allowed = nodemask;
        write_seqcount_end(&current->mems_allowed_seq);
        local_irq_restore(flags);
        task_unlock(current);
}

#else /* !CONFIG_CPUSETS */

/* Without CONFIG_CPUSETS, cpusets are never enabled. */
static inline bool cpusets_enabled(void)
{
        return false;
}

/* Without CONFIG_CPUSETS, no configuration is ever reported as insane. */
static inline bool cpusets_insane_config(void)
{
        return false;
}

/* Without CONFIG_CPUSETS there is nothing to set up; always returns 0. */
static inline int cpuset_init(void)
{
        return 0;
}
/* No-op when CONFIG_CPUSETS is not set. */
static inline void cpuset_init_smp(void) {}

/* No-op when CONFIG_CPUSETS is not set. */
static inline void cpuset_force_rebuild(void) { }

/*
 * Without CONFIG_CPUSETS this just calls partition_sched_domains(1, NULL, NULL).
 * NOTE(review): presumably that resets scheduling to a single default domain;
 * confirm against partition_sched_domains().
 */
static inline void cpuset_update_active_cpus(void)
{
        partition_sched_domains(1, NULL, NULL);
}

/* The four helpers below do nothing when CONFIG_CPUSETS is not set. */
static inline void inc_dl_tasks_cs(struct task_struct *task) { }
static inline void dec_dl_tasks_cs(struct task_struct *task) { }
static inline void cpuset_lock(void) { }
static inline void cpuset_unlock(void) { }

/*
 * Without CONFIG_CPUSETS there is no cpuset restriction to apply: @mask is
 * filled with a copy of task_cpu_possible_mask(@p).
 */
static inline void cpuset_cpus_allowed(struct task_struct *p,
                                       struct cpumask *mask)
{
        const struct cpumask *possible = task_cpu_possible_mask(p);

        cpumask_copy(mask, possible);
}

/* Without CONFIG_CPUSETS this always returns false and leaves @p untouched. */
static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p)
{
        return false;
}

/* Without CONFIG_CPUSETS no CPU is ever reported as isolated. */
static inline bool cpuset_cpu_is_isolated(int cpu)
{
        return false;
}

/* Without CONFIG_CPUSETS every task gets node_possible_map, whatever @p is. */
static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
{
        return node_possible_map;
}

/*
 * Without CONFIG_CPUSETS the "current" allowed set is node_states[N_MEMORY],
 * and there is nothing to initialise.
 */
#define cpuset_current_mems_allowed (node_states[N_MEMORY])
static inline void cpuset_init_current_mems_allowed(void) {}

/* Without CONFIG_CPUSETS every @nodemask is reported as valid (returns 1). */
static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
        return 1;
}

/* Without CONFIG_CPUSETS every zone is allowed. */
static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
        return true;
}

/* Without CONFIG_CPUSETS every zone is allowed. */
static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
        return true;
}

/* Without CONFIG_CPUSETS any two tasks are reported as intersecting (returns 1). */
static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
                                                 const struct task_struct *tsk2)
{
        return 1;
}

/* No-op when CONFIG_CPUSETS is not set. */
static inline void cpuset_memory_pressure_bump(void) {}

/* Without CONFIG_CPUSETS nothing is written to @m. */
static inline void cpuset_task_status_allowed(struct seq_file *m,
                                                struct task_struct *task)
{
}

/* Without CONFIG_CPUSETS this always returns 0. */
static inline int cpuset_mem_spread_node(void)
{
        return 0;
}

/* Without CONFIG_CPUSETS this always returns 0. */
static inline int cpuset_slab_spread_node(void)
{
        return 0;
}

/* Without CONFIG_CPUSETS this always returns 0. */
static inline int cpuset_do_page_mem_spread(void)
{
        return 0;
}

/* Without CONFIG_CPUSETS this always returns false. */
static inline bool current_cpuset_is_being_rebound(void)
{
        return false;
}

/*
 * Without CONFIG_CPUSETS this just calls partition_sched_domains(1, NULL, NULL),
 * exactly like the cpuset_update_active_cpus() stub.
 */
static inline void rebuild_sched_domains(void)
{
        partition_sched_domains(1, NULL, NULL);
}

/* Without CONFIG_CPUSETS nothing is printed. */
static inline void cpuset_print_current_mems_allowed(void)
{
}

/* Without CONFIG_CPUSETS @nodemask is ignored. */
static inline void set_mems_allowed(nodemask_t nodemask)
{
}

/* Without CONFIG_CPUSETS the returned sequence value is always 0. */
static inline unsigned int read_mems_allowed_begin(void)
{
        return 0;
}

/* Without CONFIG_CPUSETS a retry is never requested; @seq is ignored. */
static inline bool read_mems_allowed_retry(unsigned int seq)
{
        return false;
}

#endif /* !CONFIG_CPUSETS */

#endif /* _LINUX_CPUSET_H */


























































































































































































































































































































































































































































    1 

    1 



    1 

    1 





    1 






























    1 
    1 


    1 








    1 


    1 






































































































    1 


    1 






















    1 
    1 

































































































































































































































































































































































































































































    1 





























































































































































    1 






















































    1 



    1 




    1 
    1 
    1 




























    1 

















































    1 



    1 









    1 












    1 



    1 
    1 



































































































































































































































































































































































    1 
    1 
    1 

    1 








    1 




    1 

    1 
    1 
    1 
    1 










    1 


    1 




    1 











































































































































































































    1 







    1 
    1 








































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/sha2.h>
#include <crypto/utils.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>

#include "protocol.h"
#include "mib.h"

#include <trace/events/mptcp.h>
#include <trace/events/sock.h>

static void mptcp_subflow_ops_undo_override(struct sock *ssk);

/* Bump MPTCP MIB counter @field in the network namespace of request @req. */
static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
                                  enum linux_mptcp_mib_field field)
{
        struct net *net = sock_net(req_to_sk(req));

        MPTCP_INC_STATS(net, field);
}

/* Tear down the MPTCP-specific part of request socket @req: drop the
 * reference on the msk recorded in the request, if there is one, then hand
 * the request to mptcp_token_destroy_request().
 */
static void subflow_req_destructor(struct request_sock *req)
{
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

        /* Terminate the message with '\n': this file never continues a
         * line with pr_cont(), and an unterminated message is only flushed
         * once something else gets printed.
         */
        pr_debug("subflow_req=%p\n", subflow_req);

        if (subflow_req->msk)
                sock_put((struct sock *)subflow_req->msk);

        mptcp_token_destroy_request(req);
}

/* Compute the HMAC over the two nonces, keyed by @key1 and @key2.
 *
 * The message is @nonce1 followed by @nonce2, each stored as a big-endian
 * 32-bit value; the result is written to @hmac by mptcp_crypto_hmac_sha().
 */
static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
                                  void *hmac)
{
        u8 nonces[2 * sizeof(u32)];

        put_unaligned_be32(nonce1, nonces);
        put_unaligned_be32(nonce2, nonces + sizeof(u32));

        mptcp_crypto_hmac_sha(key1, key2, nonces, sizeof(nonces), hmac);
}

static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
{
        return mptcp_is_fully_established((void *)msk) &&
                ((mptcp_pm_is_userspace(msk) &&
                  mptcp_userspace_pm_active(msk)) ||
                 READ_ONCE(msk->pm.accept_subflow));
}

/* validate received token and create truncated hmac and nonce for SYN-ACK */
static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req)
{
        struct mptcp_sock *msk = subflow_req->msk;
        u8 hmac[SHA256_DIGEST_SIZE];

        get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

        subflow_generate_hmac(READ_ONCE(msk->local_key),
                              READ_ONCE(msk->remote_key),
                              subflow_req->local_nonce,
                              subflow_req->remote_nonce, hmac);

        subflow_req->thmac = get_unaligned_be64(hmac);
}

/* Resolve the token stored in @req to its MPTCP socket and pick a local id.
 *
 * Returns the msk with the reference obtained from mptcp_token_get_sock()
 * still held, or NULL when the token is unknown (MPTCP_MIB_JOINNOTOKEN is
 * bumped) or when no local id could be obtained (the reference is dropped
 * again). On success the local id is saved in the request.
 */
static struct mptcp_sock *subflow_token_join_request(struct request_sock *req)
{
        struct mptcp_subflow_request_sock *sreq = mptcp_subflow_rsk(req);
        struct mptcp_sock *msk;
        int id;

        msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), sreq->token);
        if (!msk) {
                SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
                return NULL;
        }

        id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
        if (id >= 0) {
                sreq->local_id = id;
                return msk;
        }

        sock_put((struct sock *)msk);
        return NULL;
}

/* Reset the MPTCP part of @req to its defaults.
 *
 * mp_capable, mp_join and msk are cleared; csum_reqd and allow_join_id0 are
 * taken from the settings of @sk_listener's network namespace; the token
 * state is initialised last.
 */
static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
{
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
        struct net *net = sock_net(sk_listener);

        subflow_req->mp_capable = 0;
        subflow_req->mp_join = 0;
        subflow_req->csum_reqd = mptcp_is_checksum_enabled(net);
        subflow_req->allow_join_id0 = mptcp_allow_join_id0(net);
        subflow_req->msk = NULL;
        mptcp_token_init_request(req);
}

static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk)
{
        return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport;
}

/* Attach an MPTCP extension to @skb carrying only @reason as reset reason.
 *
 * The extension is zeroed before the reason is stored. If the extension
 * cannot be added, the skb is left as it is.
 */
static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason)
{
        struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP);

        if (!mpext)
                return;

        memset(mpext, 0, sizeof(*mpext));
        mpext->reset_reason = reason;
}

/* Init mptcp request socket.
 *
 * Parses the MPTCP options carried by the incoming SYN @skb and fills in the
 * MPTCP part of @req; both cases below require request_mptcp to be set on
 * the listener's subflow context:
 *  - MP_CAPABLE SYN: pick a non-zero random local key and obtain a token
 *    for it, retrying up to MPTCP_TOKEN_MAX_RETRIES times. If no usable
 *    token is found, mp_capable stays clear, MPTCP_MIB_TOKENFALLBACKINIT is
 *    bumped and 0 is still returned.
 *  - MP_JOIN SYN: resolve the token to its msk, check the listening port
 *    and compute the truncated HMAC for the SYN-ACK.
 * A SYN carrying both options is left untouched apart from the MIB counter.
 *
 * Returns 0 on success, or a negative error code when a TCP reset should be
 * sent: -EINVAL if TCP MD5SIG is configured on the listener, -EPERM if a
 * JOIN has failed. On each error path a reset reason is attached to @skb
 * through subflow_add_reset_reason() before returning.
 */
static int subflow_check_req(struct request_sock *req,
                             const struct sock *sk_listener,
                             struct sk_buff *skb)
{
        struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
        struct mptcp_options_received mp_opt;
        bool opt_mp_capable, opt_mp_join;

        /* Every pr_debug() here ends with '\n': this file never continues a
         * line with pr_cont(), and an unterminated message is only flushed
         * once something else gets printed.
         */
        pr_debug("subflow_req=%p, listener=%p\n", subflow_req, listener);

#ifdef CONFIG_TCP_MD5SIG
        /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
         * TCP option space.
         */
        if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) {
                subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
                return -EINVAL;
        }
#endif

        mptcp_get_options(skb, &mp_opt);

        opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYN);
        opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYN);
        if (opt_mp_capable) {
                SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

                /* MP_CAPABLE and MP_JOIN together: leave both flags clear */
                if (opt_mp_join)
                        return 0;
        } else if (opt_mp_join) {
                SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
        }

        if (opt_mp_capable && listener->request_mptcp) {
                int err, retries = MPTCP_TOKEN_MAX_RETRIES;

                subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
again:
                /* a zero key is never used: draw until it is non-zero */
                do {
                        get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
                } while (subflow_req->local_key == 0);

                if (unlikely(req->syncookie)) {
                        /* syncookie mode: only derive the token and check
                         * that it is not already in use, nothing is
                         * registered for this request here.
                         */
                        mptcp_crypto_key_sha(subflow_req->local_key,
                                             &subflow_req->token,
                                             &subflow_req->idsn);
                        if (mptcp_token_exists(subflow_req->token)) {
                                if (retries-- > 0)
                                        goto again;
                                SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT);
                        } else {
                                subflow_req->mp_capable = 1;
                        }
                        return 0;
                }

                err = mptcp_token_new_request(req);
                if (err == 0)
                        subflow_req->mp_capable = 1;
                else if (retries-- > 0)
                        goto again;
                else
                        SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT);

        } else if (opt_mp_join && listener->request_mptcp) {
                subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
                subflow_req->mp_join = 1;
                subflow_req->backup = mp_opt.backup;
                subflow_req->remote_id = mp_opt.join_id;
                subflow_req->token = mp_opt.token;
                subflow_req->remote_nonce = mp_opt.nonce;
                subflow_req->msk = subflow_token_join_request(req);

                /* Can't fall back to TCP in this case. */
                if (!subflow_req->msk) {
                        subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
                        return -EPERM;
                }

                /* a JOIN on a port other than the msk's own is accepted
                 * only if mptcp_pm_sport_in_anno_list() approves it
                 */
                if (subflow_use_different_sport(subflow_req->msk, sk_listener)) {
                        pr_debug("syn inet_sport=%d %d\n",
                                 ntohs(inet_sk(sk_listener)->inet_sport),
                                 ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport));
                        if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) {
                                SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX);
                                subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
                                return -EPERM;
                        }
                        SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX);
                }

                subflow_req_create_thmac(subflow_req);

                if (unlikely(req->syncookie)) {
                        if (!mptcp_can_accept_new_subflow(subflow_req->msk)) {
                                subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
                                return -EPERM;
                        }

                        subflow_init_req_cookie_join_save(subflow_req, skb);
                }

                pr_debug("token=%u, remote_nonce=%u msk=%p\n", subflow_req->token,
                         subflow_req->remote_nonce, subflow_req->msk);
        }

        return 0;
}

/* Rebuild the MPTCP part of request socket @req from @skb in the syncookie
 * case, looking at the MP_CAPABLE / MP_JOIN ACK options it carries.
 *
 * Returns 0 on success, which includes the cases where no such option is
 * present or request_mptcp is not set on the listener's subflow context.
 * Returns -EINVAL when both options are present, when the MP_CAPABLE sender
 * key is zero, or when the saved join state cannot be restored, and the
 * error from mptcp_token_new_request() when that call fails.
 */
int mptcp_subflow_init_cookie_req(struct request_sock *req,
                                  const struct sock *sk_listener,
                                  struct sk_buff *skb)
{
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
        struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
        struct mptcp_options_received mp_opt;
        bool has_mpc_ack, has_mpj_ack;

        subflow_init_req(req, sk_listener);
        mptcp_get_options(skb, &mp_opt);

        has_mpc_ack = (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) != 0;
        has_mpj_ack = (mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK) != 0;

        /* the two options must not show up together */
        if (has_mpc_ack && has_mpj_ack)
                return -EINVAL;

        /* nothing MPTCP-specific to restore */
        if (!listener->request_mptcp || (!has_mpc_ack && !has_mpj_ack))
                return 0;

        if (has_mpc_ack) {
                int err;

                if (mp_opt.sndr_key == 0)
                        return -EINVAL;

                subflow_req->local_key = mp_opt.rcvr_key;
                err = mptcp_token_new_request(req);
                if (err)
                        return err;

                subflow_req->mp_capable = 1;
        } else {
                if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
                        return -EINVAL;

                subflow_req->mp_join = 1;
        }

        /* both cases record the sequence number just before @skb's own */
        subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
        return 0;
}
EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);

/* Map the MPTCP reset reason stored in @skb's MPTCP extension to a socket
 * reset reason; SK_RST_REASON_NOT_SPECIFIED when @skb has no such extension.
 */
static enum sk_rst_reason mptcp_get_rst_reason(const struct sk_buff *skb)
{
        const struct mptcp_ext *mpext = mptcp_get_ext(skb);

        return mpext ? sk_rst_convert_mptcp_reason(mpext->reset_reason)
                     : SK_RST_REASON_NOT_SPECIFIED;
}

/* IPv4 route_req hook for MPTCP subflow requests.
 *
 * Marks @req as MPTCP, resets its MPTCP state, obtains the route from the
 * plain TCP IPv4 request ops and then runs subflow_check_req(). If that
 * check fails the route is released, a reset is sent unless the request is
 * a syncookie one, and NULL is returned. NULL is also returned when no
 * route could be obtained.
 */
static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
                                              struct sk_buff *skb,
                                              struct flowi *fl,
                                              struct request_sock *req,
                                              u32 tw_isn)
{
        struct dst_entry *dst;

        tcp_rsk(req)->is_mptcp = 1;
        subflow_init_req(req, sk);

        dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req, tw_isn);
        if (!dst)
                return NULL;

        if (!subflow_check_req(req, sk, skb))
                return dst;

        /* the MPTCP checks rejected the request */
        dst_release(dst);
        if (!req->syncookie)
                tcp_request_sock_ops.send_reset(sk, skb,
                                                mptcp_get_rst_reason(skb));
        return NULL;
}

static void subflow_prep_synack(const struct sock *sk, struct request_sock *req,
                                struct tcp_fastopen_cookie *foc,
                                enum tcp_synack_type synack_type)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct inet_request_sock *ireq = inet_rsk(req);

        /* clear tstamp_ok, as needed depending on cookie */
        if (foc && foc->len > -1)
                ireq->tstamp_ok = 0;

        if (synack_type == TCP_SYNACK_FASTOPEN)
                mptcp_fastopen_subflow_synack_set_params(subflow, req);
}

/* IPv4 send_synack hook: run the MPTCP specific preparation, then hand the
 * actual transmission over to the plain TCP implementation and return its
 * result.
 */
static int subflow_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                                  struct flowi *fl,
                                  struct request_sock *req,
                                  struct tcp_fastopen_cookie *foc,
                                  enum tcp_synack_type synack_type,
                                  struct sk_buff *syn_skb)
{
        int ret;

        subflow_prep_synack(sk, req, foc, synack_type);

        ret = tcp_request_sock_ipv4_ops.send_synack(sk, dst, fl, req, foc,
                                                    synack_type, syn_skb);
        return ret;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* IPv6 send_synack hook: run the MPTCP specific preparation, then hand the
 * actual transmission over to the plain TCPv6 implementation and return its
 * result.
 */
static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
                                  struct flowi *fl,
                                  struct request_sock *req,
                                  struct tcp_fastopen_cookie *foc,
                                  enum tcp_synack_type synack_type,
                                  struct sk_buff *syn_skb)
{
        int ret;

        subflow_prep_synack(sk, req, foc, synack_type);

        ret = tcp_request_sock_ipv6_ops.send_synack(sk, dst, fl, req, foc,
                                                    synack_type, syn_skb);
        return ret;
}

/* IPv6 route_req hook for subflow requests: mark the request as MPTCP,
 * initialise the subflow request state, let plain TCPv6 perform the route
 * lookup and finally validate the request via subflow_check_req().
 *
 * Returns the route on success, NULL if the lookup fails or the request is
 * rejected. On rejection the route is released and a reset is sent, unless
 * the request is a syncookie one.
 */
static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
                                              struct sk_buff *skb,
                                              struct flowi *fl,
                                              struct request_sock *req,
                                              u32 tw_isn)
{
        struct dst_entry *route;

        tcp_rsk(req)->is_mptcp = 1;
        subflow_init_req(req, sk);

        route = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req, tw_isn);
        if (!route)
                return NULL;

        if (subflow_check_req(req, sk, skb) != 0) {
                /* request refused: drop the route and notify the peer */
                dst_release(route);
                if (!req->syncookie)
                        tcp6_request_sock_ops.send_reset(sk, skb,
                                                         mptcp_get_rst_reason(skb));
                return NULL;
        }

        return route;
}
#endif

/* Check the truncated HMAC stored in @subflow->thmac against the value
 * computed locally from the remote/local keys and nonces.
 */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
        u8 digest[SHA256_DIGEST_SIZE];
        u64 expected;

        subflow_generate_hmac(subflow->remote_key, subflow->local_key,
                              subflow->remote_nonce, subflow->local_nonce,
                              digest);

        /* only the leading 64 bits of the digest take part in the check */
        expected = get_unaligned_be64(digest);
        pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
                 subflow, subflow->token, expected, subflow->thmac);

        return subflow->thmac == expected;
}

/* Actively reset the subflow @ssk: send a reset, mark the subflow as done
 * and let the MPTCP worker close it. Nothing happens when @ssk is already
 * in TCP_CLOSE state.
 */
void mptcp_subflow_reset(struct sock *ssk)
{
        struct sock *parent = mptcp_subflow_ctx(ssk)->conn;

        /* mptcp_mp_fail_no_response() may get here with a subflow that has
         * been closed already
         */
        if (ssk->sk_state == TCP_CLOSE)
                return;

        /* tcp_done() could release the last reference to the parent:
         * keep it alive until we are done with it
         */
        sock_hold(parent);

        mptcp_send_active_reset_reason(ssk);
        tcp_done(ssk);

        /* schedule the worker only if the close request was not pending */
        if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW,
                              &mptcp_sk(parent)->flags))
                mptcp_schedule_work(parent);

        sock_put(parent);
}

static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk)
{
        return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport;
}

/* Align the MPTCP socket @sk with its first subflow: propagate the send
 * buffer, initialise the receive space if still needed and, when @sk is
 * still in TCP_SYN_SENT, set up the write sequence and move to @state.
 */
void __mptcp_sync_state(struct sock *sk, int state)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct sock *first = msk->first;
        struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(first);

        __mptcp_propagate_sndbuf(sk, first);
        if (!msk->rcvspace_init)
                mptcp_rcv_space_init(msk, first);

        if (sk->sk_state != TCP_SYN_SENT)
                return;

        /* subflow->idsn is always available in TCP_SYN_SENT state,
         * even for the FASTOPEN scenarios
         */
        WRITE_ONCE(msk->write_seq, ctx->idsn + 1);
        WRITE_ONCE(msk->snd_nxt, msk->write_seq);
        mptcp_set_state(sk, state);
        sk->sk_state_change(sk);
}

static void subflow_set_remote_key(struct mptcp_sock *msk,
                                   struct mptcp_subflow_context *subflow,
                                   const struct mptcp_options_received *mp_opt)
{
        /* active MPC subflow will reach here multiple times:
         * at subflow_finish_connect() time and at 4th ack time
         */
        if (subflow->remote_key_valid)
                return;

        subflow->remote_key_valid = 1;
        subflow->remote_key = mp_opt->sndr_key;
        mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn);
        subflow->iasn++;

        WRITE_ONCE(msk->remote_key, subflow->remote_key);
        WRITE_ONCE(msk->ack_seq, subflow->iasn);
        WRITE_ONCE(msk->can_ack, true);
        atomic64_set(&msk->rcv_wnd_sent, subflow->iasn);
}

/* Propagate the state of the subflow @ssk to the MPTCP socket @sk, under the
 * msk data lock. When @mp_opt is provided the send-side fields and the
 * remote key are updated as well. If @sk is owned by the user the state
 * sync is deferred via the MPTCP_SYNC_STATE callback flag.
 */
static void mptcp_propagate_state(struct sock *sk, struct sock *ssk,
                                  struct mptcp_subflow_context *subflow,
                                  const struct mptcp_options_received *mp_opt)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        mptcp_data_lock(sk);

        /* options are available only in the non fallback cases: do not
         * touch the fields below otherwise
         */
        if (mp_opt) {
                WRITE_ONCE(msk->snd_una, subflow->idsn + 1);
                WRITE_ONCE(msk->wnd_end, subflow->idsn + 1 + tcp_sk(ssk)->snd_wnd);
                subflow_set_remote_key(msk, subflow, mp_opt);
        }

        if (sock_owned_by_user(sk)) {
                /* remember the target state for the deferred sync */
                msk->pending_state = ssk->sk_state;
                __set_bit(MPTCP_SYNC_STATE, &msk->cb_flags);
        } else {
                __mptcp_sync_state(sk, ssk->sk_state);
        }

        mptcp_data_unlock(sk);
}

static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct mptcp_options_received mp_opt;
        struct sock *parent = subflow->conn;
        struct mptcp_sock *msk;

        subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

        /* be sure no special action on any packet other than syn-ack */
        if (subflow->conn_finished)
                return;

        msk = mptcp_sk(parent);
        subflow->rel_write_seq = 1;
        subflow->conn_finished = 1;
        subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
        pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);

        mptcp_get_options(skb, &mp_opt);
        if (subflow->request_mptcp) {
                if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) {
                        MPTCP_INC_STATS(sock_net(sk),
                                        MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
                        mptcp_do_fallback(sk);
                        pr_fallback(msk);
                        goto fallback;
                }

                if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD)
                        WRITE_ONCE(msk->csum_enabled, true);
                if (mp_opt.deny_join_id0)
                        WRITE_ONCE(msk->pm.remote_deny_join_id0, true);
                subflow->mp_capable = 1;
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK);
                mptcp_finish_connect(sk);
                mptcp_propagate_state(parent, sk, subflow, &mp_opt);
        } else if (subflow->request_join) {
                u8 hmac[SHA256_DIGEST_SIZE];

                if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYNACK)) {
                        subflow->reset_reason = MPTCP_RST_EMPTCP;
                        goto do_reset;
                }

                subflow->backup = mp_opt.backup;
                subflow->thmac = mp_opt.thmac;
                subflow->remote_nonce = mp_opt.nonce;
                WRITE_ONCE(subflow->remote_id, mp_opt.join_id);
                pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d",
                         subflow, subflow->thmac, subflow->remote_nonce,
                         subflow->backup);

                if (!subflow_thmac_valid(subflow)) {
                        MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
                        subflow->reset_reason = MPTCP_RST_EMPTCP;
                        goto do_reset;
                }

                if (!mptcp_finish_join(sk))
                        goto do_reset;

                subflow_generate_hmac(subflow->local_key, subflow->remote_key,
                                      subflow->local_nonce,
                                      subflow->remote_nonce,
                                      hmac);
                memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);

                subflow->mp_join = 1;
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);

                if (subflow_use_different_dport(msk, sk)) {
                        pr_debug("synack inet_dport=%d %d",
                                 ntohs(inet_sk(sk)->inet_dport),
                                 ntohs(inet_sk(parent)->inet_dport));
                        MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX);
                }
        } else if (mptcp_check_fallback(sk)) {
fallback:
                mptcp_propagate_state(parent, sk, subflow, NULL);
        }
        return;

do_reset:
        subflow->reset_transient = 0;
        mptcp_subflow_reset(sk);
}

/* Store @local_id into @subflow. A value outside the 0..255 range triggers
 * a one-time warning but is stored anyway.
 */
static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id)
{
        WARN_ON_ONCE(local_id < 0 || local_id > 255);
        WRITE_ONCE(subflow->local_id, local_id);
}

/* Make sure the subflow @sk has a local id assigned, asking the path manager
 * for one when it is still unset (negative).
 *
 * Returns 0 on success or the negative error reported by the path manager.
 */
static int subflow_chk_local_id(struct sock *sk)
{
        struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
        int id;

        /* common case: the id has been assigned already */
        if (likely(ctx->local_id >= 0))
                return 0;

        id = mptcp_pm_get_local_id(mptcp_sk(ctx->conn),
                                   (struct sock_common *)sk);
        if (id < 0)
                return id;

        subflow_set_local_id(ctx, id);
        return 0;
}

/* IPv4 rebuild_header hook: ensure the subflow local id is available before
 * deferring to inet_sk_rebuild_header(). Returns the local id lookup error,
 * if any, or the result of the plain rebuild.
 */
static int subflow_rebuild_header(struct sock *sk)
{
        int rc;

        rc = subflow_chk_local_id(sk);
        if (likely(rc >= 0))
                return inet_sk_rebuild_header(sk);

        return rc;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* IPv6 rebuild_header hook: ensure the subflow local id is available before
 * deferring to inet6_sk_rebuild_header(). Returns the local id lookup error,
 * if any, or the result of the plain rebuild.
 */
static int subflow_v6_rebuild_header(struct sock *sk)
{
        int rc;

        rc = subflow_chk_local_id(sk);
        if (likely(rc >= 0))
                return inet6_sk_rebuild_header(sk);

        return rc;
}
#endif

/* IPv4 request-sock operation tables for MPTCP subflows: both are handed to
 * tcp_conn_request() by subflow_v4_conn_request().
 * NOTE(review): the tables are filled in outside this part of the file -
 * confirm the initialisation site before changing them.
 */
static struct request_sock_ops mptcp_subflow_v4_request_sock_ops __ro_after_init;
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init;

/* conn_request hook for IPv4 MPTCP listeners.
 *
 * SYNs addressed to a broadcast or multicast destination are accounted as
 * listen drops and ignored; anything else is processed by tcp_conn_request()
 * using the MPTCP subflow request-sock operations.
 *
 * Returns 0 for dropped SYNs, otherwise the tcp_conn_request() result.
 */
static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

        /* newline terminated, so that the message is emitted right away */
        pr_debug("subflow=%p\n", subflow);

        /* Never answer to SYNs sent to broadcast or multicast */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;

        return tcp_conn_request(&mptcp_subflow_v4_request_sock_ops,
                                &subflow_request_sock_ipv4_ops,
                                sk, skb);
drop:
        tcp_listendrop(sk);
        return 0;
}

/* Destructor for IPv4 subflow request sockets: release the MPTCP specific
 * state first, then run the plain TCP request destructor.
 */
static void subflow_v4_req_destructor(struct request_sock *req)
{
        subflow_req_destructor(req);
        tcp_request_sock_ops.destructor(req);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* IPv6 counterparts of the subflow operation tables. The two request-sock
 * tables are handed to tcp_conn_request() by subflow_v6_conn_request().
 * NOTE(review): the af_ops and proto overrides below are not referenced in
 * this part of the file; presumably they are set up at init time - confirm.
 */
static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init;
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init;
static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init;
static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init;
static struct proto tcpv6_prot_override __ro_after_init;

/* conn_request hook for IPv6 MPTCP listeners.
 *
 * IPv4 packets are redirected to subflow_v4_conn_request(). SYNs with a
 * non-unicast destination are accounted as listen drops; SYNs with a
 * v4-mapped source address bump the IPv6 header error counter. Both are
 * ignored. Anything else is processed by tcp_conn_request() using the
 * MPTCP subflow request-sock operations.
 *
 * Returns 0 for ignored SYNs, otherwise the result of the selected handler.
 */
static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

        /* newline terminated, so that the message is emitted right away */
        pr_debug("subflow=%p\n", subflow);

        if (skb->protocol == htons(ETH_P_IP))
                return subflow_v4_conn_request(sk, skb);

        if (!ipv6_unicast_destination(skb))
                goto drop;

        if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) {
                __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS);
                return 0;
        }

        return tcp_conn_request(&mptcp_subflow_v6_request_sock_ops,
                                &subflow_request_sock_ipv6_ops, sk, skb);

drop:
        tcp_listendrop(sk);
        return 0; /* don't send reset */
}

/* Destructor for IPv6 subflow request sockets: release the MPTCP specific
 * state first, then run the plain TCPv6 request destructor.
 */
static void subflow_v6_req_destructor(struct request_sock *req)
{
        subflow_req_destructor(req);
        tcp6_request_sock_ops.destructor(req);
}
#endif

/* Allocate a request socket on behalf of @sk_listener, replacing the plain
 * @ops with the MPTCP subflow ones matching the address family. Families
 * without an MPTCP override keep the @ops provided by the caller.
 *
 * Returns whatever inet_reqsk_alloc() returns.
 */
struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops,
                                               struct sock *sk_listener,
                                               bool attach_listener)
{
        switch (ops->family) {
        case AF_INET:
                ops = &mptcp_subflow_v4_request_sock_ops;
                break;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        case AF_INET6:
                ops = &mptcp_subflow_v6_request_sock_ops;
                break;
#endif
        default:
                break;
        }

        return inet_reqsk_alloc(ops, sk_listener, attach_listener);
}
EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc);

/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
                               const struct mptcp_options_received *mp_opt)
{
        const struct mptcp_subflow_request_sock *subflow_req;
        u8 hmac[SHA256_DIGEST_SIZE];
        struct mptcp_sock *msk;

        subflow_req = mptcp_subflow_rsk(req);
        msk = subflow_req->msk;
        if (!msk)
                return false;

        subflow_generate_hmac(READ_ONCE(msk->remote_key),
                              READ_ONCE(msk->local_key),
                              subflow_req->remote_nonce,
                              subflow_req->local_nonce, hmac);

        return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
}

/* Turn @sk back into a plain TCP socket: run the TCP fallback with the old
 * subflow context, detach the ULP ops and data, clear the is_mptcp flag and
 * undo the subflow ops override.
 */
static void subflow_ulp_fallback(struct sock *sk,
                                 struct mptcp_subflow_context *old_ctx)
{
        struct inet_connection_sock *csk = inet_csk(sk);

        mptcp_subflow_tcp_fallback(sk, old_ctx);

        /* detach the ULP from the socket */
        csk->icsk_ulp_ops = NULL;
        rcu_assign_pointer(csk->icsk_ulp_data, NULL);
        tcp_sk(sk)->is_mptcp = 0;

        mptcp_subflow_ops_undo_override(sk);
}

/* Dispose of the subflow context attached to @ssk, if any: unlink it from
 * its list, fall back to plain TCP when the ULP ops are still in place
 * (dropping the reference to the owning connection) and free the context
 * after an RCU grace period.
 */
void mptcp_subflow_drop_ctx(struct sock *ssk)
{
        struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);

        if (!ctx)
                return;

        list_del(&ctx->node);

        if (inet_csk(ssk)->icsk_ulp_ops) {
                struct sock *conn;

                subflow_ulp_fallback(ssk, ctx);

                conn = ctx->conn;
                if (conn)
                        sock_put(conn);
        }

        kfree_rcu(ctx, rcu);
}

/* Mark both @subflow and @msk as fully established, recording the remote
 * key carried by @mp_opt first. For MPTFO subflows the msk ack sequence is
 * additionally generated via __mptcp_fastopen_gen_msk_ackseq().
 */
void __mptcp_subflow_fully_established(struct mptcp_sock *msk,
                                       struct mptcp_subflow_context *subflow,
                                       const struct mptcp_options_received *mp_opt)
{
        /* no-op when the remote key has been recorded already */
        subflow_set_remote_key(msk, subflow, mp_opt);
        subflow->fully_established = 1;
        WRITE_ONCE(msk->fully_established, true);

        if (subflow->is_mptfo)
                __mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt);
}

/* syn_recv_sock hook for MPTCP listeners.
 *
 * The child socket is always created through the listener's original af_ops;
 * the MPTCP options of @skb are parsed beforehand to decide whether the
 * child can be an MPTCP subflow:
 * - MP_CAPABLE requests missing the MPC option fall back to plain TCP;
 * - MP_JOIN requests need a valid MPJ ack option and HMAC, and an msk able
 *   to accept new subflows; for them falling back is fatal: the child is
 *   disposed and a reset is sent.
 *
 * Returns the child socket, possibly NULL when the underlying hook returns
 * none. Note that a disposed child is still returned: the last reference is
 * released by the caller.
 */
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
                                          struct sk_buff *skb,
                                          struct request_sock *req,
                                          struct dst_entry *dst,
                                          struct request_sock *req_unhash,
                                          bool *own_req)
{
        struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
        struct mptcp_subflow_request_sock *subflow_req;
        struct mptcp_options_received mp_opt;
        bool fallback, fallback_is_fatal;
        enum sk_rst_reason reason;
        struct mptcp_sock *owner;
        struct sock *child;

        /* debug messages are newline terminated: nothing here uses
         * pr_cont(), and an unterminated message is emitted with a delay
         */
        pr_debug("listener=%p, req=%p, conn=%p\n", listener, req, listener->conn);

        /* After child creation we must look for MPC even when options
         * are not parsed
         */
        mp_opt.suboptions = 0;

        /* hopefully temporary handling for MP_JOIN+syncookie */
        subflow_req = mptcp_subflow_rsk(req);
        fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
        fallback = !tcp_rsk(req)->is_mptcp;
        if (fallback)
                goto create_child;

        /* if the sk is MP_CAPABLE, we try to fetch the client key */
        if (subflow_req->mp_capable) {
                /* we can receive and accept an in-window, out-of-order pkt,
                 * which may not carry the MP_CAPABLE opt even on mptcp enabled
                 * paths: always try to extract the peer key, and fallback
                 * for packets missing it.
                 * Even OoO DSS packets coming legitly after dropped or
                 * reordered MPC will cause fallback, but we don't have other
                 * options.
                 */
                mptcp_get_options(skb, &mp_opt);
                if (!(mp_opt.suboptions &
                      (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_ACK)))
                        fallback = true;

        } else if (subflow_req->mp_join) {
                mptcp_get_options(skb, &mp_opt);
                if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK) ||
                    !subflow_hmac_valid(req, &mp_opt) ||
                    !mptcp_can_accept_new_subflow(subflow_req->msk)) {
                        SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
                        fallback = true;
                }
        }

create_child:
        child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
                                                     req_unhash, own_req);

        if (child && *own_req) {
                struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

                tcp_rsk(req)->drop_req = false;

                /* we need to fallback on ctx allocation failure and on pre-reqs
                 * checking above. In the latter scenario we additionally need
                 * to reset the context to non MPTCP status.
                 */
                if (!ctx || fallback) {
                        if (fallback_is_fatal) {
                                subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
                                goto dispose_child;
                        }
                        goto fallback;
                }

                /* ssk inherits options of listener sk */
                ctx->setsockopt_seq = listener->setsockopt_seq;

                if (ctx->mp_capable) {
                        ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req);
                        if (!ctx->conn)
                                goto fallback;

                        ctx->subflow_id = 1;
                        owner = mptcp_sk(ctx->conn);
                        mptcp_pm_new_connection(owner, child, 1);

                        /* with OoO packets we can reach here without ingress
                         * mpc option
                         */
                        if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) {
                                mptcp_pm_fully_established(owner, child);
                                ctx->pm_notified = 1;
                        }
                } else if (ctx->mp_join) {
                        owner = subflow_req->msk;
                        if (!owner) {
                                subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
                                goto dispose_child;
                        }

                        /* move the msk reference ownership to the subflow */
                        subflow_req->msk = NULL;
                        ctx->conn = (struct sock *)owner;

                        if (subflow_use_different_sport(owner, sk)) {
                                pr_debug("ack inet_sport=%d %d\n",
                                         ntohs(inet_sk(sk)->inet_sport),
                                         ntohs(inet_sk((struct sock *)owner)->inet_sport));
                                if (!mptcp_pm_sport_in_anno_list(owner, sk)) {
                                        SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX);
                                        subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
                                        goto dispose_child;
                                }
                                SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX);
                        }

                        if (!mptcp_finish_join(child)) {
                                struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(child);

                                subflow_add_reset_reason(skb, subflow->reset_reason);
                                goto dispose_child;
                        }

                        SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
                        tcp_rsk(req)->drop_req = true;
                }
        }

        /* check for expected invariant - should never trigger, just help
         * catching earlier subtle bugs
         */
        WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
                     (!mptcp_subflow_ctx(child) ||
                      !mptcp_subflow_ctx(child)->conn));
        return child;

dispose_child:
        /* fatal failure: drop the MPTCP context, close the child and send
         * a reset carrying the reason recorded on the skb
         */
        mptcp_subflow_drop_ctx(child);
        tcp_rsk(req)->drop_req = true;
        inet_csk_prepare_for_destroy_sock(child);
        tcp_done(child);
        reason = mptcp_get_rst_reason(skb);
        req->rsk_ops->send_reset(sk, skb, reason);

        /* The last child reference will be released by the caller */
        return child;

fallback:
        /* the fallback stat is bumped only when the pre-req checks asked
         * for it, not when the MPTCP context or the msk clone is missing
         */
        if (fallback)
                SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
        mptcp_subflow_drop_ctx(child);
        return child;
}

/* NOTE(review): af_ops and proto tables for subflows; neither is referenced
 * in this part of the file - presumably the IPv4 overrides, set up at init
 * time. Confirm against their users.
 */
static struct inet_connection_sock_af_ops subflow_specific __ro_after_init;
static struct proto tcp_prot_override __ro_after_init;

/* Outcome of the DSS mapping lookup for the skb at the head of a subflow
 * receive queue, as produced by get_mapping_status() and
 * validate_data_csum().
 */
enum mapping_status {
        MAPPING_OK,             /* a usable mapping is in place */
        MAPPING_INVALID,        /* missing, mismatching or infinite mapping */
        MAPPING_EMPTY,          /* nothing to process or not enough data yet */
        MAPPING_DATA_FIN,       /* DATA_FIN carrying no payload */
        MAPPING_DUMMY,          /* the subflow fell back */
        MAPPING_BAD_CSUM        /* DSS checksum verification failed */
};

/* Debug helper: log the subflow sequence number @ssn that does not fit the
 * current mapping of @subflow, along with the mapping start and length.
 *
 * All values are unsigned, so they are printed with %u: with %d sequence
 * numbers above 2^31 would show up as negative. The message is newline
 * terminated so that it is emitted right away.
 */
static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
        pr_debug("Bad mapping: ssn=%u map_seq=%u map_data_len=%u\n",
                 ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

/* Tell whether the not yet consumed part of @skb fits entirely into what is
 * left of the current mapping of @ssk. An skb that looks fully consumed
 * already triggers a one-time warning and is reported as fully mapped.
 */
static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
        struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
        unsigned int skb_consumed = tcp_sk(ssk)->copied_seq -
                                    TCP_SKB_CB(skb)->seq;

        if (WARN_ON_ONCE(skb_consumed >= skb->len))
                return true;

        /* bytes left in the skb vs bytes left in the mapping */
        return skb->len - skb_consumed <=
               ctx->map_data_len - mptcp_subflow_get_map_offset(ctx);
}

/* Check that the current read position of @ssk, expressed as subflow
 * sequence number, falls inside the current mapping, i.e. in the range
 * [map_subflow_seq, map_subflow_seq + map_data_len). @skb is not used.
 */
static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
        struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
        u32 ssn = tcp_sk(ssk)->copied_seq - ctx->ssn_offset;
        u32 map_start = ctx->map_subflow_seq;
        u32 map_end = map_start + ctx->map_data_len;

        /* Reject both a mapping covering only data later in the subflow
         * stream (currently unsupported) and a mapping that ends before
         * the current position (invalid).
         */
        if (unlikely(before(ssn, map_start)) ||
            unlikely(!before(ssn, map_end))) {
                dbg_bad_map(ctx, ssn);
                return false;
        }

        return true;
}

/* Verify the DSS checksum of the current mapping of @ssk, walking the
 * receive queue starting from @skb.
 *
 * Returns:
 * - MAPPING_OK when checksums are not required (@csum_reqd is false), when
 *   the whole mapping has been accounted for by a previous call, or when
 *   the computed checksum is valid;
 * - MAPPING_EMPTY when the queue does not hold the whole mapping yet; if
 *   the subflow is closed the queue is flushed in that case;
 * - MAPPING_BAD_CSUM when the verification fails.
 *
 * The partial checksum and the number of bytes already accounted are kept
 * in subflow->map_data_csum and subflow->map_csum_len across calls.
 */
static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb,
                                              bool csum_reqd)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        u32 offset, seq, delta;
        __sum16 csum;
        int len;

        if (!csum_reqd)
                return MAPPING_OK;

        /* mapping already validated on previous traversal */
        if (subflow->map_csum_len == subflow->map_data_len)
                return MAPPING_OK;

        /* traverse the receive queue, ensuring it contains a full
         * DSS mapping and accumulating the related csum.
         * Preserve the accumulated csum across multiple calls, to compute
         * the csum only once
         */
        delta = subflow->map_data_len - subflow->map_csum_len;
        for (;;) {
                /* first byte not yet covered by the accumulated csum */
                seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len;
                offset = seq - TCP_SKB_CB(skb)->seq;

                /* if the current skb has not been accounted yet, csum its contents
                 * up to the amount covered by the current DSS
                 */
                if (offset < skb->len) {
                        /* note: shadows the function level 'csum' */
                        __wsum csum;

                        len = min(skb->len - offset, delta);
                        csum = skb_checksum(skb, offset, len, 0);
                        subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum,
                                                                subflow->map_csum_len);

                        delta -= len;
                        subflow->map_csum_len += len;
                }
                /* the whole mapping has been accounted */
                if (delta == 0)
                        break;

                if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) {
                        /* if this subflow is closed, the partial mapping
                         * will be never completed; flush the pending skbs, so
                         * that subflow_sched_work_if_closed() can kick in
                         */
                        if (unlikely(ssk->sk_state == TCP_CLOSE))
                                while ((skb = skb_peek(&ssk->sk_receive_queue)))
                                        sk_eat_skb(ssk, skb);

                        /* not enough data to validate the csum */
                        return MAPPING_EMPTY;
                }

                /* the DSS mapping for next skbs will be validated later,
                 * when a get_mapping_status call will process such skb
                 */
                skb = skb->next;
        }

        /* note that 'map_data_len' accounts only for the carried data, does
         * not include the eventual seq increment due to the data fin,
         * while the pseudo header requires the original DSS data len,
         * including that
         */
        csum = __mptcp_make_csum(subflow->map_seq,
                                 subflow->map_subflow_seq,
                                 subflow->map_data_len + subflow->map_data_fin,
                                 subflow->map_data_csum);
        /* any non-zero result means a checksum mismatch */
        if (unlikely(csum)) {
                MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR);
                return MAPPING_BAD_CSUM;
        }

        subflow->valid_csum_seen = 1;
        return MAPPING_OK;
}

/* Look up the DSS mapping status for the skb at the head of the receive
 * queue of @ssk, installing a new mapping in the subflow context when the
 * skb carries one and none is currently valid.
 *
 * Returns:
 * - MAPPING_EMPTY when the queue is empty, or when a zero length skb with
 *   no mapping has been dropped from the queue;
 * - MAPPING_DUMMY when the subflow fell back;
 * - MAPPING_DATA_FIN for a DATA_FIN carrying no payload while no mapping is
 *   valid;
 * - MAPPING_INVALID for data with no mapping at all, for an infinite
 *   mapping, for a new mapping differing from the current one while the skb
 *   is fully covered by the latter, for a checksum presence mismatch and
 *   for a mapping not covering the current subflow sequence;
 * - otherwise the validate_data_csum() result.
 */
static enum mapping_status get_mapping_status(struct sock *ssk,
                                              struct mptcp_sock *msk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        bool csum_reqd = READ_ONCE(msk->csum_enabled);
        struct mptcp_ext *mpext;
        struct sk_buff *skb;
        u16 data_len;
        u64 map_seq;

        skb = skb_peek(&ssk->sk_receive_queue);
        if (!skb)
                return MAPPING_EMPTY;

        if (mptcp_check_fallback(ssk))
                return MAPPING_DUMMY;

        mpext = mptcp_get_ext(skb);
        if (!mpext || !mpext->use_map) {
                if (!subflow->map_valid && !skb->len) {
                        /* the TCP stack deliver 0 len FIN pkt to the receive
                         * queue, that is the only 0len pkts ever expected here,
                         * and we can admit no mapping only for 0 len pkts
                         */
                        if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
                                WARN_ONCE(1, "0len seq %d:%d flags %x",
                                          TCP_SKB_CB(skb)->seq,
                                          TCP_SKB_CB(skb)->end_seq,
                                          TCP_SKB_CB(skb)->tcp_flags);
                        sk_eat_skb(ssk, skb);
                        return MAPPING_EMPTY;
                }

                if (!subflow->map_valid)
                        return MAPPING_INVALID;

                goto validate_seq;
        }

        trace_get_mapping_status(mpext);

        /* debug messages below are newline terminated: nothing here uses
         * pr_cont(), and an unterminated message is emitted with a delay
         */
        data_len = mpext->data_len;
        if (data_len == 0) {
                pr_debug("infinite mapping received\n");
                MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
                subflow->map_data_len = 0;
                return MAPPING_INVALID;
        }

        if (mpext->data_fin == 1) {
                u64 data_fin_seq;

                if (data_len == 1) {
                        bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
                                                                 mpext->dsn64);
                        pr_debug("DATA_FIN with no payload seq=%llu\n", mpext->data_seq);
                        if (subflow->map_valid) {
                                /* A DATA_FIN might arrive in a DSS
                                 * option before the previous mapping
                                 * has been fully consumed. Continue
                                 * handling the existing mapping.
                                 */
                                skb_ext_del(skb, SKB_EXT_MPTCP);
                                return MAPPING_OK;
                        }

                        if (updated)
                                mptcp_schedule_work((struct sock *)msk);

                        return MAPPING_DATA_FIN;
                }

                data_fin_seq = mpext->data_seq + data_len - 1;

                /* If mpext->data_seq is a 32-bit value, data_fin_seq must also
                 * be limited to 32 bits.
                 */
                if (!mpext->dsn64)
                        data_fin_seq &= GENMASK_ULL(31, 0);

                mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
                pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d\n",
                         data_fin_seq, mpext->dsn64);

                /* Adjust for DATA_FIN using 1 byte of sequence space */
                data_len--;
        }

        map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64);
        WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);

        if (subflow->map_valid) {
                /* Allow replacing only with an identical map */
                if (subflow->map_seq == map_seq &&
                    subflow->map_subflow_seq == mpext->subflow_seq &&
                    subflow->map_data_len == data_len &&
                    subflow->map_csum_reqd == mpext->csum_reqd) {
                        skb_ext_del(skb, SKB_EXT_MPTCP);
                        goto validate_csum;
                }

                /* If this skb data are fully covered by the current mapping,
                 * the new map would need caching, which is not supported
                 */
                if (skb_is_fully_mapped(ssk, skb)) {
                        MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
                        return MAPPING_INVALID;
                }

                /* will validate the next map after consuming the current one */
                goto validate_csum;
        }

        /* no mapping in place: install the one carried by this skb */
        subflow->map_seq = map_seq;
        subflow->map_subflow_seq = mpext->subflow_seq;
        subflow->map_data_len = data_len;
        subflow->map_valid = 1;
        subflow->map_data_fin = mpext->data_fin;
        subflow->mpc_map = mpext->mpc_map;
        subflow->map_csum_reqd = mpext->csum_reqd;
        subflow->map_csum_len = 0;
        subflow->map_data_csum = csum_unfold(mpext->csum);

        /* Cfr RFC 8684 Section 3.3.0 */
        if (unlikely(subflow->map_csum_reqd != csum_reqd))
                return MAPPING_INVALID;

        pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n",
                 subflow->map_seq, subflow->map_subflow_seq,
                 subflow->map_data_len, subflow->map_csum_reqd,
                 subflow->map_data_csum);

validate_seq:
        /* we revalidate valid mapping on new skb, because we must ensure
         * the current skb is completely covered by the available mapping
         */
        if (!validate_mapping(ssk, skb)) {
                MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH);
                return MAPPING_INVALID;
        }

        skb_ext_del(skb, SKB_EXT_MPTCP);

validate_csum:
        return validate_data_csum(ssk, skb, csum_reqd);
}

/* Discard duplicate data sitting at the head of the subflow receive queue.
 *
 * @ssk:   subflow socket owning @skb
 * @skb:   skb currently at the head of @ssk's receive queue
 * @limit: upper bound, in bytes, on the amount of data to discard
 *
 * Advances the subflow copied_seq past the discarded bytes, frees @skb once
 * it has been fully consumed and invalidates the current mapping when the
 * map offset reaches the mapping length. Bumps the DUPDATA MIB counter.
 */
static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
                                       u64 limit)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
        u32 incr;

        /* when the whole skb is dropped, a FIN flag accounts for one
         * additional sequence number
         */
        incr = limit >= skb->len ? skb->len + fin : limit;

        /* all the printed values are unsigned: use %u so that large
         * sequence numbers are not reported as negative
         */
        pr_debug("discarding=%u len=%u seq=%u\n", incr, skb->len,
                 subflow->map_subflow_seq);
        MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
        tcp_sk(ssk)->copied_seq += incr;
        /* release the skb only once copied_seq reached its end */
        if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
                sk_eat_skb(ssk, skb);
        if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
                subflow->map_valid = 0;
}

/* Ask the MPTCP worker to remove a closed subflow, but only once no more
 * data is pending on the subflow receive queue.
 */
static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk)
{
        /* common case: the subflow is not closed, nothing to do */
        if (likely(ssk->sk_state != TCP_CLOSE))
                return;

        /* queued data must be consumed before the subflow can go away */
        if (!skb_queue_empty(&ssk->sk_receive_queue))
                return;

        /* the close work has already been requested */
        if (test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
                return;

        mptcp_schedule_work((struct sock *)msk);
}

/* Tell whether the given subflow is still allowed to fall back.
 *
 * MP_JOIN subflows never can. Otherwise the answer depends on the msk
 * checksum setting: with checksum enabled, fallback is allowed until a valid
 * checksum has been seen; without it, until the subflow is fully established.
 */
static bool subflow_can_fallback(struct mptcp_subflow_context *subflow)
{
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);

        if (subflow->mp_join)
                return false;

        if (READ_ONCE(msk->csum_enabled))
                return !subflow->valid_csum_seen;

        return !subflow->fully_established;
}

/* Start the graceful failure handling on @ssk: arm the fail timeout, send an
 * ack on the subflow and reset the msk timeout timer accordingly.
 */
static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        struct sock *sk = (struct sock *)msk;
        unsigned long deadline;

        /* only the MPC subflow can fail gracefully */
        if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
                return;

        /* the close timeout takes precedence over the fail one: when the
         * former is already set there is no need to start the latter
         */
        if (sock_flag(sk, SOCK_DEAD))
                return;

        /* a zero fail_tout is the special value meaning "no fail timeout",
         * and extreme accuracy is not needed here: nudge a zero deadline
         * to 1
         */
        deadline = jiffies + TCP_RTO_MAX;
        if (deadline == 0)
                deadline = 1;
        WRITE_ONCE(subflow->fail_tout, deadline);
        tcp_send_ack(ssk);

        mptcp_reset_tout_timer(msk, subflow->fail_tout);
}

/* Check whether the subflow has in-sequence, correctly mapped data ready to
 * be moved to the msk.
 *
 * Loops over the mapping status of the skb at the head of the subflow receive
 * queue, discarding data already acked at the MPTCP level. On mapping errors
 * it either falls back, triggers the MP_FAIL handling or resets the subflow.
 *
 * Returns true, with subflow->data_avail set, when data can be consumed;
 * false otherwise.
 */
static bool subflow_check_data_avail(struct sock *ssk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        enum mapping_status status;
        struct mptcp_sock *msk;
        struct sk_buff *skb;

        /* a stale data_avail flag is cleared when the queue is empty */
        if (!skb_peek(&ssk->sk_receive_queue))
                WRITE_ONCE(subflow->data_avail, false);
        if (subflow->data_avail)
                return true;

        msk = mptcp_sk(subflow->conn);
        for (;;) {
                u64 ack_seq;
                u64 old_ack;

                status = get_mapping_status(ssk, msk);
                trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue));
                if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY ||
                             status == MAPPING_BAD_CSUM))
                        goto fallback;

                if (status != MAPPING_OK)
                        goto no_data;

                skb = skb_peek(&ssk->sk_receive_queue);
                if (WARN_ON_ONCE(!skb))
                        goto no_data;

                if (unlikely(!READ_ONCE(msk->can_ack)))
                        goto fallback;

                old_ack = READ_ONCE(msk->ack_seq);
                ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
                pr_debug("msk ack_seq=%llx subflow ack_seq=%llx\n", old_ack,
                         ack_seq);
                /* the mapped data starts before the msk-level ack: drop the
                 * already received part and re-evaluate
                 */
                if (unlikely(before64(ack_seq, old_ack))) {
                        mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
                        continue;
                }

                WRITE_ONCE(subflow->data_avail, true);
                break;
        }
        return true;

no_data:
        subflow_sched_work_if_closed(msk, ssk);
        return false;

fallback:
        if (!__mptcp_check_fallback(msk)) {
                /* RFC 8684 section 3.7. */
                if (status == MAPPING_BAD_CSUM &&
                    (subflow->mp_join || subflow->valid_csum_seen)) {
                        subflow->send_mp_fail = 1;

                        if (!READ_ONCE(msk->allow_infinite_fallback)) {
                                subflow->reset_transient = 0;
                                subflow->reset_reason = MPTCP_RST_EMIDDLEBOX;
                                goto reset;
                        }
                        mptcp_subflow_fail(msk, ssk);
                        WRITE_ONCE(subflow->data_avail, true);
                        return true;
                }

                if (!subflow_can_fallback(subflow) && subflow->map_data_len) {
                        /* fatal protocol error, close the socket.
                         * subflow_error_report() will introduce the appropriate barriers
                         */
                        subflow->reset_transient = 0;
                        subflow->reset_reason = MPTCP_RST_EMPTCP;

reset:
                        /* also reached via goto from the bad checksum branch
                         * above: flag the error, close the subflow and flush
                         * its receive queue
                         */
                        WRITE_ONCE(ssk->sk_err, EBADMSG);
                        tcp_set_state(ssk, TCP_CLOSE);
                        while ((skb = skb_peek(&ssk->sk_receive_queue)))
                                sk_eat_skb(ssk, skb);
                        mptcp_send_active_reset_reason(ssk);
                        WRITE_ONCE(subflow->data_avail, false);
                        return false;
                }

                mptcp_do_fallback(ssk);
        }

        /* fallback mode: build a mapping covering the whole head skb.
         * NOTE(review): skb is dereferenced without a NULL check; this relies
         * on every path reaching here having a queued skb - confirm against
         * get_mapping_status().
         */
        skb = skb_peek(&ssk->sk_receive_queue);
        subflow->map_valid = 1;
        subflow->map_seq = READ_ONCE(msk->ack_seq);
        subflow->map_data_len = skb->len;
        subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
        WRITE_ONCE(subflow->data_avail, true);
        return true;
}

/* Report whether the subflow @sk has data that can be moved to the msk.
 *
 * Retires the current mapping when it has been fully consumed, then runs the
 * data availability check. Returns the result of subflow_check_data_avail().
 */
bool mptcp_subflow_data_available(struct sock *sk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

        /* check if current mapping is still valid */
        if (subflow->map_valid &&
            mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
                subflow->map_valid = 0;
                WRITE_ONCE(subflow->data_avail, false);

                pr_debug("Done with mapping: seq=%u data_len=%u\n",
                         subflow->map_subflow_seq,
                         subflow->map_data_len);
        }

        return subflow_check_data_avail(sk);
}

/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
 * not the ssk one.
 *
 * In mptcp, rwin is about the mptcp-level connection data.
 *
 * Data that is still on the ssk rx queue can thus be ignored,
 * as far as mptcp peer is concerned that data is still inflight.
 * DSS ACK is updated when skb is moved to the mptcp rx queue.
 */
void mptcp_space(const struct sock *ssk, int *space, int *full_space)
{
        const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        const struct sock *sk = subflow->conn;

        *space = __mptcp_space(sk);
        *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
}

/* Propagate a subflow error to the owning msk: report it right away when the
 * msk is not owned by a user context, otherwise flag it in cb_flags.
 */
static void subflow_error_report(struct sock *ssk)
{
        struct sock *sk = mptcp_subflow_ctx(ssk)->conn;

        /* without an attached socket this is a no-op: bail early, avoiding
         * a problematic lockdep dependency between the TCP accept queue
         * lock and the msk socket spinlock
         */
        if (!sk->sk_socket)
                return;

        mptcp_data_lock(sk);
        if (sock_owned_by_user(sk))
                __set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->cb_flags);
        else
                __mptcp_error_report(sk);
        mptcp_data_unlock(sk);
}

/* sk_data_ready callback installed on subflow sockets.
 *
 * Listener subflows forward the wakeup to the parent socket; other subflows
 * run the MPTCP data availability check and notify the msk, or report a
 * pending subflow error.
 */
static void subflow_data_ready(struct sock *sk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        u16 state = 1 << inet_sk_state_load(sk);
        struct sock *parent = subflow->conn;
        struct mptcp_sock *msk;

        trace_sk_data_ready(sk);

        msk = mptcp_sk(parent);
        if (state & TCPF_LISTEN) {
                /* MPJ subflows are removed from the accept queue before
                 * reaching here: skip the wakeup when the queue is empty to
                 * avoid stray notifications
                 */
                if (!reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue))
                        parent->sk_data_ready(parent);
                return;
        }

        WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
                     !subflow->mp_join && !(state & TCPF_CLOSE));

        if (!mptcp_subflow_data_available(sk)) {
                if (unlikely(sk->sk_err))
                        subflow_error_report(sk);
                return;
        }

        mptcp_data_ready(parent, sk);

        /* subflow-level lowat tests are not relevant: honor the msk-level
         * threshold, possibly mandating an immediate ack
         */
        if (mptcp_data_avail(msk) < parent->sk_rcvlowat &&
            (tcp_sk(sk)->rcv_nxt - tcp_sk(sk)->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss)
                inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}

static void subflow_write_space(struct sock *ssk)
{
        struct sock *sk = mptcp_subflow_ctx(ssk)->conn;

        mptcp_propagate_sndbuf(sk, ssk);
        mptcp_write_space(sk);
}

/* Select the subflow af_ops matching the socket family: the v6 variant for
 * AF_INET6 sockets (when IPv6 support is built), the v4 one otherwise.
 */
static const struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
        const struct inet_connection_sock_af_ops *ops = &subflow_specific;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        if (sk->sk_family == AF_INET6)
                ops = &subflow_v6_specific;
#endif
        return ops;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* Switch the subflow af_ops between the v4-mapped variant and the default
 * one for the socket family.
 *
 * @sk:     subflow socket
 * @mapped: true to install the v4-mapped ops, false to restore the default
 *
 * The ops previously installed are saved in the subflow context. Nothing is
 * done when the target ops are already in place.
 */
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        const struct inet_connection_sock_af_ops *target;

        target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

        pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d\n",
                 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

        if (likely(icsk->icsk_af_ops == target))
                return;

        subflow->icsk_af_ops = icsk->icsk_af_ops;
        icsk->icsk_af_ops = target;
}
#endif

/* Fill a zeroed sockaddr_storage of the requested family from an MPTCP
 * address info.
 *
 * @info:   source address and port
 * @addr:   destination storage, cleared before being filled
 * @family: address family to use for @addr
 *
 * When @info and @family differ, v4-mapped conversion is applied where
 * possible. Families other than AF_INET (and AF_INET6 when IPv6 support is
 * built) leave only ss_family set.
 */
void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
                         struct sockaddr_storage *addr,
                         unsigned short family)
{
        memset(addr, 0, sizeof(*addr));
        addr->ss_family = family;

        switch (addr->ss_family) {
        case AF_INET: {
                struct sockaddr_in *sin = (struct sockaddr_in *)addr;

                if (info->family == AF_INET)
                        sin->sin_addr = info->addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
                else if (ipv6_addr_v4mapped(&info->addr6))
                        sin->sin_addr.s_addr = info->addr6.s6_addr32[3];
#endif
                sin->sin_port = info->port;
                break;
        }
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        case AF_INET6: {
                struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;

                if (info->family == AF_INET)
                        ipv6_addr_set_v4mapped(info->addr.s_addr,
                                               &sin6->sin6_addr);
                else
                        sin6->sin6_addr = info->addr6;
                sin6->sin6_port = info->port;
                break;
        }
#endif
        default:
                break;
        }
}

/* Create an additional subflow for the msk @sk and start an MP_JOIN
 * connection from @loc towards @remote.
 *
 * @sk:     the MPTCP socket
 * @loc:    local address to bind the new subflow to
 * @remote: remote address to connect to
 *
 * Returns 0 when the non-blocking connect has been started (or completed),
 * -ENOTCONN when the msk is not fully established, or the negative error
 * returned by the socket creation, bind or connect steps. On every failure
 * mptcp_pm_close_subflow() is invoked on the msk.
 */
int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
                            const struct mptcp_addr_info *remote)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct mptcp_subflow_context *subflow;
        struct sockaddr_storage addr;
        int remote_id = remote->id;
        int local_id = loc->id;
        int err = -ENOTCONN;
        struct socket *sf;
        struct sock *ssk;
        u32 remote_token;
        int addrlen;
        int ifindex;
        u8 flags;

        if (!mptcp_is_fully_established(sk))
                goto err_out;

        err = mptcp_subflow_create_socket(sk, loc->family, &sf);
        if (err)
                goto err_out;

        ssk = sf->sk;
        subflow = mptcp_subflow_ctx(ssk);
        /* pick a non-zero random local nonce */
        do {
                get_random_bytes(&subflow->local_nonce, sizeof(u32));
        } while (!subflow->local_nonce);

        if (local_id)
                subflow_set_local_id(subflow, local_id);

        mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id,
                                             &flags, &ifindex);
        subflow->remote_key_valid = 1;
        subflow->remote_key = READ_ONCE(msk->remote_key);
        subflow->local_key = READ_ONCE(msk->local_key);
        subflow->token = msk->token;
        mptcp_info2sockaddr(loc, &addr, ssk->sk_family);

        addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        if (addr.ss_family == AF_INET6)
                addrlen = sizeof(struct sockaddr_in6);
#endif
        ssk->sk_bound_dev_if = ifindex;
        err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
        if (err)
                goto failed;

        mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
        pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d\n", msk,
                 remote_token, local_id, remote_id);
        subflow->remote_token = remote_token;
        WRITE_ONCE(subflow->remote_id, remote_id);
        subflow->request_join = 1;
        subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP);
        subflow->subflow_id = msk->subflow_id++;
        /* both addresses use the ssk family, so addrlen is still valid */
        mptcp_info2sockaddr(remote, &addr, ssk->sk_family);

        /* the conn_list entry holds a reference on the subflow socket */
        sock_hold(ssk);
        list_add_tail(&subflow->node, &msk->conn_list);
        err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
        if (err && err != -EINPROGRESS)
                goto failed_unlink;

        /* discard the subflow socket */
        mptcp_sock_graft(ssk, sk->sk_socket);
        iput(SOCK_INODE(sf));
        WRITE_ONCE(msk->allow_infinite_fallback, false);
        mptcp_stop_tout_timer(sk);
        return 0;

failed_unlink:
        list_del(&subflow->node);
        sock_put(mptcp_subflow_tcp_sock(subflow));

failed:
        subflow->disposable = 1;
        sock_release(sf);

err_out:
        /* we account subflows before the creation, and these failures will
         * not be caught by sk_state_change()
         */
        mptcp_pm_close_subflow(msk);
        return err;
}

/* Move @child into the cgroup of @parent when the two differ.
 *
 * The whole body is compiled only with CONFIG_SOCK_CGROUP_DATA; the memcg
 * handling additionally requires CONFIG_MEMCG.
 */
static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
        struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
                                *child_skcd = &child->sk_cgrp_data;

        /* only the additional subflows created by kworkers have to be modified */
        if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
            cgroup_id(sock_cgroup_ptr(child_skcd))) {
#ifdef CONFIG_MEMCG
                struct mem_cgroup *memcg = parent->sk_memcg;

                /* drop the child's own memcg, then adopt the parent's one
                 * only if a css reference can be acquired
                 */
                mem_cgroup_sk_free(child);
                if (memcg && css_tryget(&memcg->css))
                        child->sk_memcg = memcg;
#endif /* CONFIG_MEMCG */

                /* release the child's cgroup data, copy the parent's and
                 * run the clone helper on the copy
                 */
                cgroup_sk_free(child_skcd);
                *child_skcd = *parent_skcd;
                cgroup_sk_clone(child_skcd);
        }
#endif /* CONFIG_SOCK_CGROUP_DATA */
}

/* Install the MPTCP proto overrides on the subflow: the v6 one when the
 * socket currently uses tcpv6_prot, the v4 one in every other case.
 */
static void mptcp_subflow_ops_override(struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        if (ssk->sk_prot == &tcpv6_prot) {
                ssk->sk_prot = &tcpv6_prot_override;
                return;
        }
#endif
        ssk->sk_prot = &tcp_prot_override;
}

/* Restore the plain TCP proto on the subflow: tcpv6_prot when the v6
 * override is in place, tcp_prot in every other case.
 */
static void mptcp_subflow_ops_undo_override(struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        if (ssk->sk_prot == &tcpv6_prot_override) {
                ssk->sk_prot = &tcpv6_prot;
                return;
        }
#endif
        ssk->sk_prot = &tcp_prot;
}

/* Create a kernel TCP socket to be used as a subflow of the msk @sk.
 *
 * @sk:       the owning MPTCP socket
 * @family:   address family of the new subflow
 * @new_sock: filled with the new socket on success
 *
 * The new socket is placed in the parent's cgroup, takes a netns reference,
 * gets the "mptcp" ULP attached and inherits the msk socket options. On
 * success a reference on @sk is held by the subflow context.
 *
 * Returns 0 on success, -EINVAL when @sk has no attached socket, or the
 * negative error reported by the failing step.
 */
int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
                                struct socket **new_sock)
{
        struct mptcp_subflow_context *subflow;
        struct net *net = sock_net(sk);
        struct socket *sf;
        int err;

        /* un-accepted server sockets can reach here - on bad configuration
         * bail early to avoid greater trouble later
         */
        if (unlikely(!sk->sk_socket))
                return -EINVAL;

        err = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &sf);
        if (err)
                return err;

        lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING);

        err = security_mptcp_add_subflow(sk, sf->sk);
        if (err)
                goto err_free;

        /* the newly created socket has to be in the same cgroup as its parent */
        mptcp_attach_cgroup(sk, sf->sk);

        /* kernel sockets do not by default acquire net ref, but TCP timer
         * needs it.
         * Update ns_tracker to current stack trace and refcounted tracker.
         */
        __netns_tracker_free(net, &sf->sk->ns_tracker, false);
        sf->sk->sk_net_refcnt = 1;
        get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL);
        sock_inuse_add(net, 1);
        err = tcp_set_ulp(sf->sk, "mptcp");
        if (err)
                goto err_free;

        mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk);
        release_sock(sf->sk);

        /* the newly created socket really belongs to the owning MPTCP
         * socket, even if for additional subflows the allocation is performed
         * by a kernel workqueue. Adjust inode references, so that the
         * procfs/diag interfaces really show this one belonging to the correct
         * user.
         */
        SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
        SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
        SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;

        subflow = mptcp_subflow_ctx(sf->sk);
        pr_debug("subflow=%p\n", subflow);

        *new_sock = sf;
        sock_hold(sk);
        subflow->conn = sk;
        mptcp_subflow_ops_override(sf->sk);

        return 0;

err_free:
        /* the subflow socket is still locked on both error paths */
        release_sock(sf->sk);
        sock_release(sf);
        return err;
}

/* Allocate a zeroed subflow context and attach it to @sk as ULP data.
 *
 * @sk:       the TCP socket the context belongs to
 * @priority: allocation flags
 *
 * Returns the new context, with its list heads initialized, tcp_sock set to
 * @sk and local_id set to -1, or NULL on allocation failure.
 */
static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
                                                        gfp_t priority)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct mptcp_subflow_context *ctx;

        ctx = kzalloc(sizeof(*ctx), priority);
        if (!ctx)
                return NULL;

        rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
        INIT_LIST_HEAD(&ctx->node);
        INIT_LIST_HEAD(&ctx->delegated_node);

        pr_debug("subflow=%p\n", ctx);

        ctx->tcp_sock = sk;
        WRITE_ONCE(ctx->local_id, -1);

        return ctx;
}

/* Wake up every interruptible sleeper waiting on the socket wait queue, if
 * there is any. The wait queue is accessed under the RCU read lock.
 */
static void __subflow_state_change(struct sock *sk)
{
        struct socket_wq *waitq;

        rcu_read_lock();

        waitq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(waitq))
                wake_up_interruptible_all(&waitq->wait);

        rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
        return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}

/* sk_state_change callback installed on subflow sockets.
 *
 * Wakes up the subflow waiters, handles the simultaneous connect fallback,
 * runs the data availability machinery and schedules the msk worker when the
 * subflow is closed or a fallback subflow shut down its receive side.
 */
static void subflow_state_change(struct sock *sk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct sock *parent = subflow->conn;
        struct mptcp_sock *msk = mptcp_sk(parent);

        __subflow_state_change(sk);

        if (subflow_simultaneous_connect(sk)) {
                mptcp_do_fallback(sk);
                pr_fallback(msk);
                subflow->conn_finished = 1;
                mptcp_propagate_state(parent, sk, subflow, NULL);
        }

        /* recvmsg() does not acquire the subflow socket for ssk selection,
         * so a fin packet carrying a DSS could go unnoticed unless the data
         * available machinery is triggered here
         */
        if (!mptcp_subflow_data_available(sk)) {
                if (unlikely(sk->sk_err))
                        subflow_error_report(sk);
        } else {
                mptcp_data_ready(parent, sk);
        }

        subflow_sched_work_if_closed(msk, sk);

        /* when the fallback subflow closes the rx side, trigger a 'dummy'
         * ingress data fin, so that the msk state will follow along
         */
        if (!__mptcp_check_fallback(msk) || !subflow_is_done(sk) ||
            msk->first != sk)
                return;

        if (mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true))
                mptcp_schedule_work(parent);
}

/* Force-close every not yet accepted MPTCP socket pending on the accept
 * queue of the listener subflow.
 *
 * @listener_sk:  the listener msk; only its lockdep map is touched here
 * @listener_ssk: the listener subflow owning the accept queue
 *
 * @listener_ssk's socket lock is released while the pending sockets are
 * processed and re-acquired before returning; the early return on an empty
 * queue leaves it untouched.
 */
void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
{
        struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
        struct request_sock *req, *head, *tail;
        struct mptcp_subflow_context *subflow;
        struct sock *sk, *ssk;

        /* Due to lock dependencies no relevant lock can be acquired under rskq_lock.
         * Splice the req list, so that accept() can not reach the pending ssk after
         * the listener socket is released below.
         */
        spin_lock_bh(&queue->rskq_lock);
        head = queue->rskq_accept_head;
        tail = queue->rskq_accept_tail;
        queue->rskq_accept_head = NULL;
        queue->rskq_accept_tail = NULL;
        spin_unlock_bh(&queue->rskq_lock);

        if (!head)
                return;

        /* can't acquire the msk socket lock under the subflow one,
         * or will cause ABBA deadlock
         */
        release_sock(listener_ssk);

        for (req = head; req; req = req->dl_next) {
                ssk = req->sk;
                /* skip the entries that are not MPTCP subflows */
                if (!sk_is_mptcp(ssk))
                        continue;

                subflow = mptcp_subflow_ctx(ssk);
                if (!subflow || !subflow->conn)
                        continue;

                /* keep the msk alive while it is closed and its work is
                 * cancelled
                 */
                sk = subflow->conn;
                sock_hold(sk);

                lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
                __mptcp_unaccepted_force_close(sk);
                release_sock(sk);

                /* lockdep will report a false positive ABBA deadlock
                 * between cancel_work_sync and the listener socket.
                 * The involved locks belong to different sockets WRT
                 * the existing AB chain.
                 * Using a per socket key is problematic as key
                 * deregistration requires process context and must be
                 * performed at socket disposal time, in atomic
                 * context.
                 * Just tell lockdep to consider the listener socket
                 * released here.
                 */
                mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_);
                mptcp_cancel_work(sk);
                mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_);

                sock_put(sk);
        }

        /* we are still under the listener msk socket lock */
        lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);

        /* restore the listener queue, to let the TCP code clean it up */
        spin_lock_bh(&queue->rskq_lock);
        WARN_ON_ONCE(queue->rskq_accept_head);
        queue->rskq_accept_head = head;
        queue->rskq_accept_tail = tail;
        spin_unlock_bh(&queue->rskq_lock);
}

/* ULP init hook: turn the kernel TCP socket @sk into an MPTCP subflow.
 *
 * Allocates the subflow context, saves the original af_ops, state_change and
 * error_report callbacks into it and installs the subflow ones.
 *
 * Returns 0 on success, -EOPNOTSUPP for sockets not created by the kernel,
 * -ENOMEM when the context allocation fails.
 */
static int subflow_ulp_init(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct mptcp_subflow_context *ctx;
        struct tcp_sock *tp = tcp_sk(sk);

        /* disallow attaching ULP to a socket unless it has been
         * created with sock_create_kern()
         */
        if (!sk->sk_kern_sock)
                return -EOPNOTSUPP;

        ctx = subflow_create_ctx(sk, GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        pr_debug("subflow=%p, family=%d\n", ctx, sk->sk_family);

        tp->is_mptcp = 1;
        ctx->icsk_af_ops = icsk->icsk_af_ops;
        icsk->icsk_af_ops = subflow_default_af_ops(sk);
        ctx->tcp_state_change = sk->sk_state_change;
        ctx->tcp_error_report = sk->sk_error_report;

        /* data_ready and write_space are not saved in the context: warn if
         * they are not the expected defaults
         */
        WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable);
        WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space);

        sk->sk_data_ready = subflow_data_ready;
        sk->sk_write_space = subflow_write_space;
        sk->sk_state_change = subflow_state_change;
        sk->sk_error_report = subflow_error_report;

        return 0;
}

/* ULP release hook for subflow sockets.
 *
 * Drops a reference on the owning msk (when there is one), restores the
 * plain TCP proto and frees the subflow context via RCU, unless the context
 * must be kept alive for a later disposal.
 */
static void subflow_ulp_release(struct sock *ssk)
{
        struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
        bool release = true;
        struct sock *sk;

        if (!ctx)
                return;

        sk = ctx->conn;
        if (sk) {
                /* if the msk has been orphaned, keep the ctx
                 * alive, will be freed by __mptcp_close_ssk(),
                 * when the subflow is still unaccepted
                 */
                release = ctx->disposable || list_empty(&ctx->node);

                /* inet_child_forget() does not call sk_state_change(),
                 * explicitly trigger the socket close machinery
                 */
                if (!release && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW,
                                                  &mptcp_sk(sk)->flags))
                        mptcp_schedule_work(sk);
                sock_put(sk);
        }

        mptcp_subflow_ops_undo_override(ssk);
        if (release)
                kfree_rcu(ctx, rcu);
}

/* ULP clone hook: set up the subflow context of a child socket.
 *
 * @req:      the request socket the child was created from
 * @newsk:    the child socket
 * @priority: allocation flags for the new context
 *
 * Falls back via subflow_ulp_fallback() when the request is neither
 * MP_CAPABLE nor MP_JOIN, or when the context allocation fails. Otherwise a
 * new context is allocated and filled from the old context and the request.
 */
static void subflow_ulp_clone(const struct request_sock *req,
                              struct sock *newsk,
                              const gfp_t priority)
{
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
        struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
        struct mptcp_subflow_context *new_ctx;

        if (!tcp_rsk(req)->is_mptcp ||
            (!subflow_req->mp_capable && !subflow_req->mp_join)) {
                subflow_ulp_fallback(newsk, old_ctx);
                return;
        }

        new_ctx = subflow_create_ctx(newsk, priority);
        if (!new_ctx) {
                subflow_ulp_fallback(newsk, old_ctx);
                return;
        }

        /* carry over the callbacks saved in the old context */
        new_ctx->conn_finished = 1;
        new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
        new_ctx->tcp_state_change = old_ctx->tcp_state_change;
        new_ctx->tcp_error_report = old_ctx->tcp_error_report;
        new_ctx->rel_write_seq = 1;
        new_ctx->tcp_sock = newsk;

        if (subflow_req->mp_capable) {
                /* see comments in subflow_syn_recv_sock(), MPTCP connection
                 * is fully established only after we receive the remote key
                 */
                new_ctx->mp_capable = 1;
                new_ctx->local_key = subflow_req->local_key;
                new_ctx->token = subflow_req->token;
                new_ctx->ssn_offset = subflow_req->ssn_offset;
                new_ctx->idsn = subflow_req->idsn;

                /* this is the first subflow, id is always 0 */
                subflow_set_local_id(new_ctx, 0);
        } else if (subflow_req->mp_join) {
                new_ctx->ssn_offset = subflow_req->ssn_offset;
                new_ctx->mp_join = 1;
                new_ctx->fully_established = 1;
                new_ctx->remote_key_valid = 1;
                new_ctx->backup = subflow_req->backup;
                WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id);
                new_ctx->token = subflow_req->token;
                new_ctx->thmac = subflow_req->thmac;

                /* the subflow req id is valid, fetched via subflow_check_req()
                 * and subflow_token_join_request()
                 */
                subflow_set_local_id(new_ctx, subflow_req->local_id);
        }
}

/* release_cb override for subflows: run the pending delegated actions, then
 * the regular TCP release callback.
 */
static void tcp_release_cb_override(struct sock *ssk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        long pending;

        /* process and clear all the pending actions, but leave the subflow
         * in the napi queue. To respect locking, only the same CPU that
         * originated the action can touch the list: mptcp_napi_poll will
         * take care of it.
         */
        pending = set_mask_bits(&subflow->delegated_status, MPTCP_DELEGATE_ACTIONS_MASK, 0);
        if (pending != 0)
                mptcp_subflow_process_delegated(ssk, pending);

        tcp_release_cb(ssk);
}

/* diag_destroy override for subflows.
 *
 * Closing a listener subflow requires a great deal of care: keep it simple
 * and refuse the operation with -EINVAL. Any other subflow is handed to
 * tcp_abort().
 */
static int tcp_abort_override(struct sock *ssk, int err)
{
        return inet_sk_state_load(ssk) == TCP_LISTEN ? -EINVAL :
                                                       tcp_abort(ssk, err);
}

/* ULP operations registered under the "mptcp" name for subflow sockets */
static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
        .name    = "mptcp",
        .owner   = THIS_MODULE,
        .init    = subflow_ulp_init,
        .release = subflow_ulp_release,
        .clone   = subflow_ulp_clone,
};

/* Set the object size of the given request sock ops to the MPTCP subflow
 * request sock and create the matching slab cache.
 *
 * Returns 0 on success, -ENOMEM when the cache can't be created.
 */
static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
        subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);

        subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
                                              subflow_ops->obj_size, 0,
                                              SLAB_ACCOUNT |
                                              SLAB_TYPESAFE_BY_RCU,
                                              NULL);

        return subflow_ops->slab ? 0 : -ENOMEM;
}

/* Boot-time initialization of the subflow layer.
 *
 * Builds the MPTCP variants of the TCP request sock ops, af_ops and proto
 * structures by copying the TCP ones and overriding selected callbacks, then
 * registers the "mptcp" ULP. Any failure is fatal and panics.
 */
void __init mptcp_subflow_init(void)
{
        /* v4 request sock ops: TCP ones with a dedicated slab and destructor */
        mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops;
        mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4";
        mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor;

        if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0)
                panic("MPTCP: failed to init subflow v4 request sock ops\n");

        subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
        subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;
        subflow_request_sock_ipv4_ops.send_synack = subflow_v4_send_synack;

        /* v4 af_ops: ipv4 ones with the subflow hooks installed */
        subflow_specific = ipv4_specific;
        subflow_specific.conn_request = subflow_v4_conn_request;
        subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
        subflow_specific.sk_rx_dst_set = subflow_finish_connect;
        subflow_specific.rebuild_header = subflow_rebuild_header;

        /* v4 proto override: only release_cb and diag_destroy differ */
        tcp_prot_override = tcp_prot;
        tcp_prot_override.release_cb = tcp_release_cb_override;
        tcp_prot_override.diag_destroy = tcp_abort_override;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        /* In struct mptcp_subflow_request_sock, we assume the TCP request sock
         * structures for v4 and v6 have the same size. It should not change in
         * the future, but it is better to be warned at build time if it is no
         * longer the case.
         */
        BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock));

        mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops;
        mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6";
        mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor;

        if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0)
                panic("MPTCP: failed to init subflow v6 request sock ops\n");

        subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
        subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;
        subflow_request_sock_ipv6_ops.send_synack = subflow_v6_send_synack;

        subflow_v6_specific = ipv6_specific;
        subflow_v6_specific.conn_request = subflow_v6_conn_request;
        subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
        subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
        subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header;

        /* v4-mapped af_ops: v6 subflow ops using the ipv4 transmit helpers */
        subflow_v6m_specific = subflow_v6_specific;
        subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
        subflow_v6m_specific.send_check = ipv4_specific.send_check;
        subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
        subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
        subflow_v6m_specific.rebuild_header = subflow_rebuild_header;

        tcpv6_prot_override = tcpv6_prot;
        tcpv6_prot_override.release_cb = tcp_release_cb_override;
        tcpv6_prot_override.diag_destroy = tcp_abort_override;
#endif

        mptcp_diag_subflow_init(&subflow_ulp_ops);

        if (tcp_register_ulp(&subflow_ulp_ops) != 0)
                panic("MPTCP: failed to register subflows to ULP\n");
}



















































































    1 
















































































































































    1 


















    1 








    1 














    1 









    1 

















































    1 























    1 



    1 






    1 






    1 




















    1 


































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
// SPDX-License-Identifier: GPL-2.0-only
/*
 * spectrum management
 *
 * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi>
 * Copyright 2002-2005, Instant802 Networks, Inc.
 * Copyright 2005-2006, Devicescape Software, Inc.
 * Copyright 2006-2007  Jiri Benc <jbenc@suse.cz>
 * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
 * Copyright 2007-2008, Intel Corporation
 * Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
 * Copyright (C) 2018, 2020, 2022-2024 Intel Corporation
 */

#include <linux/ieee80211.h>
#include <net/cfg80211.h>
#include <net/mac80211.h>
#include "ieee80211_i.h"
#include "sta_info.h"
#include "wme.h"

static bool
wbcs_elem_to_chandef(const struct ieee80211_wide_bw_chansw_ie *wbcs_elem,
                     struct cfg80211_chan_def *chandef)
{
        u8 ccfs0 = wbcs_elem->new_center_freq_seg0;
        u8 ccfs1 = wbcs_elem->new_center_freq_seg1;
        u32 cf0 = ieee80211_channel_to_frequency(ccfs0, chandef->chan->band);
        u32 cf1 = ieee80211_channel_to_frequency(ccfs1, chandef->chan->band);

        switch (wbcs_elem->new_channel_width) {
        case IEEE80211_VHT_CHANWIDTH_160MHZ:
                /* deprecated encoding */
                chandef->width = NL80211_CHAN_WIDTH_160;
                chandef->center_freq1 = cf0;
                break;
        case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
                /* deprecated encoding */
                chandef->width = NL80211_CHAN_WIDTH_80P80;
                chandef->center_freq1 = cf0;
                chandef->center_freq2 = cf1;
                break;
        case IEEE80211_VHT_CHANWIDTH_80MHZ:
                chandef->width = NL80211_CHAN_WIDTH_80;
                chandef->center_freq1 = cf0;

                if (ccfs1) {
                        u8 diff = abs(ccfs0 - ccfs1);

                        if (diff == 8) {
                                chandef->width = NL80211_CHAN_WIDTH_160;
                                chandef->center_freq1 = cf1;
                        } else if (diff > 8) {
                                chandef->width = NL80211_CHAN_WIDTH_80P80;
                                chandef->center_freq2 = cf1;
                        }
                }
                break;
        case IEEE80211_VHT_CHANWIDTH_USE_HT:
        default:
                /* If the WBCS Element is present, new channel bandwidth is
                 * at least 40 MHz.
                 */
                chandef->width = NL80211_CHAN_WIDTH_40;
                chandef->center_freq1 = cf0;
                break;
        }

        return cfg80211_chandef_valid(chandef);
}

/*
 * Check a channel definition on a non-6 GHz band against the connection
 * limits by describing it as HT (and, for VHT-capable connections, VHT)
 * operation elements and handing those to ieee80211_chandef_ht_oper() /
 * ieee80211_chandef_vht_oper().
 *
 * @sdata: interface; only sdata->local->hw is used (passed to the VHT helper)
 * @conn: connection settings (mode and bandwidth limit)
 * @vht_cap_info: forwarded unchanged to ieee80211_chandef_vht_oper()
 * @chandef: in/out channel definition. chandef->chan is set to NULL when the
 *        connection is below HT or limited to less than 40 MHz, or when
 *        ieee80211_chandef_vht_oper() returns false.
 */
static void
validate_chandef_by_ht_vht_oper(struct ieee80211_sub_if_data *sdata,
                                struct ieee80211_conn_settings *conn,
                                u32 vht_cap_info,
                                struct cfg80211_chan_def *chandef)
{
        u32 control_freq, center_freq1, center_freq2;
        enum nl80211_chan_width chan_width;
        /*
         * Zero-initialise both synthetic elements: only a few of their
         * fields are filled in below, and in the NL80211_CHAN_WIDTH_320
         * branch vht_oper.chan_width is not written at all. Without this
         * the helpers would be handed indeterminate stack bytes.
         */
        struct ieee80211_ht_operation ht_oper = {};
        struct ieee80211_vht_operation vht_oper = {};

        if (conn->mode < IEEE80211_CONN_MODE_HT ||
            conn->bw_limit < IEEE80211_CONN_BW_LIMIT_40) {
                chandef->chan = NULL;
                return;
        }

        /* Snapshot the input before the helpers get to touch @chandef. */
        control_freq = chandef->chan->center_freq;
        center_freq1 = chandef->center_freq1;
        center_freq2 = chandef->center_freq2;
        chan_width = chandef->width;

        /*
         * HT view: primary channel plus the side on which the secondary
         * channel lies, derived from where the control frequency sits
         * relative to the centre frequency.
         */
        ht_oper.primary_chan = ieee80211_frequency_to_channel(control_freq);
        if (control_freq != center_freq1)
                ht_oper.ht_param = control_freq > center_freq1 ?
                        IEEE80211_HT_PARAM_CHA_SEC_BELOW :
                        IEEE80211_HT_PARAM_CHA_SEC_ABOVE;
        else
                ht_oper.ht_param = IEEE80211_HT_PARAM_CHA_SEC_NONE;

        ieee80211_chandef_ht_oper(&ht_oper, chandef);

        if (conn->mode < IEEE80211_CONN_MODE_VHT)
                return;

        /* VHT view: centre frequency segments as channel numbers. */
        vht_oper.center_freq_seg0_idx =
                ieee80211_frequency_to_channel(center_freq1);
        vht_oper.center_freq_seg1_idx = center_freq2 ?
                ieee80211_frequency_to_channel(center_freq2) : 0;

        switch (chan_width) {
        case NL80211_CHAN_WIDTH_320:
                /* not expected on this path; vht_oper keeps its zeroed width */
                WARN_ON(1);
                break;
        case NL80211_CHAN_WIDTH_160:
                /*
                 * Same encoding wbcs_elem_to_chandef() accepts for 160 MHz:
                 * width field says 80 MHz, seg1 carries the 160 MHz centre
                 * and seg0 sits 8 channels away, on the control channel's
                 * side of that centre.
                 */
                vht_oper.chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ;
                vht_oper.center_freq_seg1_idx = vht_oper.center_freq_seg0_idx;
                vht_oper.center_freq_seg0_idx +=
                        control_freq < center_freq1 ? -8 : 8;
                break;
        case NL80211_CHAN_WIDTH_80P80:
                vht_oper.chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ;
                break;
        case NL80211_CHAN_WIDTH_80:
                vht_oper.chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ;
                break;
        default:
                vht_oper.chan_width = IEEE80211_VHT_CHANWIDTH_USE_HT;
                break;
        }

        /* The second segment index is also published in the HT operation mode. */
        ht_oper.operation_mode =
                le16_encode_bits(vht_oper.center_freq_seg1_idx,
                                 IEEE80211_HT_OP_MODE_CCFS2_MASK);

        if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info,
                                        &vht_oper, &ht_oper, chandef))
                chandef->chan = NULL;
}

static void
validate_chandef_by_6ghz_he_eht_oper(struct ieee80211_sub_if_data *sdata,
                                     struct ieee80211_conn_settings *conn,
                                     struct cfg80211_chan_def *chandef)
{
        struct ieee80211_local *local = sdata->local;
        u32 control_freq, center_freq1, center_freq2;
        enum nl80211_chan_width chan_width;
        struct {
                struct ieee80211_he_operation _oper;
                struct ieee80211_he_6ghz_oper _6ghz_oper;
        } __packed he;
        struct {
                struct ieee80211_eht_operation _oper;
                struct ieee80211_eht_operation_info _oper_info;
        } __packed eht;
        const struct ieee80211_eht_operation *eht_oper;

        if (conn->mode < IEEE80211_CONN_MODE_HE) {
                chandef->chan = NULL;
                return;
        }

        control_freq = chandef->chan->center_freq;
        center_freq1 = chandef->center_freq1;
        center_freq2 = chandef->center_freq2;
        chan_width = chandef->width;

        he._oper.he_oper_params =
                le32_encode_bits(1, IEEE80211_HE_OPERATION_6GHZ_OP_INFO);
        he._6ghz_oper.primary =
                ieee80211_frequency_to_channel(control_freq);
        he._6ghz_oper.ccfs0 = ieee80211_frequency_to_channel(center_freq1);
        he._6ghz_oper.ccfs1 = center_freq2 ?
                ieee80211_frequency_to_channel(center_freq2) : 0;

        switch (chan_width) {
        case NL80211_CHAN_WIDTH_320:
                he._6ghz_oper.ccfs1 = he._6ghz_oper.ccfs0;
                he._6ghz_oper.ccfs0 += control_freq < center_freq1 ? -16 : 16;
                he._6ghz_oper.control = IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ;
                break;
        case NL80211_CHAN_WIDTH_160:
                he._6ghz_oper.ccfs1 = he._6ghz_oper.ccfs0;
                he._6ghz_oper.ccfs0 += control_freq < center_freq1 ? -8 : 8;
                fallthrough;
        case NL80211_CHAN_WIDTH_80P80:
                he._6ghz_oper.control =
                        IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ;
                break;
        case NL80211_CHAN_WIDTH_80:
                he._6ghz_oper.control =
                        IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ;
                break;
        case NL80211_CHAN_WIDTH_40:
                he._6ghz_oper.control =
                        IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ;
                break;
        default:
                he._6ghz_oper.control =
                        IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ;
                break;
        }

        if (conn->mode < IEEE80211_CONN_MODE_EHT) {
                eht_oper = NULL;
        } else {
                eht._oper.params = IEEE80211_EHT_OPER_INFO_PRESENT;
                eht._oper_info.control = he._6ghz_oper.control;
                eht._oper_info.ccfs0 = he._6ghz_oper.ccfs0;
                eht._oper_info.ccfs1 = he._6ghz_oper.ccfs1;
                eht_oper = &eht._oper;
        }

        if (!ieee80211_chandef_he_6ghz_oper(local, &he._oper,
                                            eht_oper, chandef))
                chandef->chan = NULL;
}

/*
 * Parse the channel switch related elements in @elems into @csa_ie.
 *
 * @sdata: interface the elements were received on
 * @elems: parsed elements; the (extended) channel switch, secondary channel
 *        offset, wide bandwidth channel switch, bandwidth indication, mesh
 *        channel switch parameters and max channel switch time elements are
 *        looked at
 * @current_band: band used for the new channel unless the ECSA operating
 *        class is resolved by ieee80211_operating_class_to_band()
 * @vht_cap_info: forwarded to validate_chandef_by_ht_vht_oper()
 * @conn: connection settings (mode and bandwidth limit) restricting which
 *        elements are honoured and how wide the result may be
 * @bssid: only used in log messages
 * @csa_ie: output; zeroed first, then filled with count, mode, mesh
 *        parameters, the AP's and our operating channel definitions and the
 *        max switch time
 *
 * Return: 0 on success, 1 if no usable new channel number was found,
 * -EINVAL if the new channel is unknown/disabled or the elements describe
 * inconsistent channel data.
 */
int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
                                 struct ieee802_11_elems *elems,
                                 enum nl80211_band current_band,
                                 u32 vht_cap_info,
                                 struct ieee80211_conn_settings *conn,
                                 u8 *bssid,
                                 struct ieee80211_csa_ie *csa_ie)
{
        enum nl80211_band new_band = current_band;
        int new_freq;
        u8 new_chan_no = 0, new_op_class = 0;
        struct ieee80211_channel *new_chan;
        struct cfg80211_chan_def new_chandef = {};
        const struct ieee80211_sec_chan_offs_ie *sec_chan_offs;
        const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie;
        const struct ieee80211_bandwidth_indication *bwi;
        const struct ieee80211_ext_chansw_ie *ext_chansw_elem;
        /* -1: no secondary channel offset information at all (non-HT) */
        int secondary_channel_offset = -1;

        memset(csa_ie, 0, sizeof(*csa_ie));

        sec_chan_offs = elems->sec_chan_offs;
        wide_bw_chansw_ie = elems->wide_bw_chansw_ie;
        bwi = elems->bandwidth_indication;
        ext_chansw_elem = elems->ext_chansw_ie;

        /* ignore elements describing widths this connection cannot use */
        if (conn->mode < IEEE80211_CONN_MODE_HT ||
            conn->bw_limit < IEEE80211_CONN_BW_LIMIT_40) {
                sec_chan_offs = NULL;
                wide_bw_chansw_ie = NULL;
        }

        if (conn->mode < IEEE80211_CONN_MODE_VHT)
                wide_bw_chansw_ie = NULL;

        /* the extended element is used when its operating class is understood */
        if (ext_chansw_elem) {
                new_op_class = ext_chansw_elem->new_operating_class;

                if (!ieee80211_operating_class_to_band(new_op_class, &new_band)) {
                        new_op_class = 0;
                        sdata_info(sdata, "cannot understand ECSA IE operating class, %d, ignoring\n",
                                   ext_chansw_elem->new_operating_class);
                } else {
                        new_chan_no = ext_chansw_elem->new_ch_num;
                        csa_ie->count = ext_chansw_elem->count;
                        csa_ie->mode = ext_chansw_elem->mode;
                }
        }

        /* otherwise fall back to the plain channel switch element */
        if (!new_op_class && elems->ch_switch_ie) {
                new_chan_no = elems->ch_switch_ie->new_ch_num;
                csa_ie->count = elems->ch_switch_ie->count;
                csa_ie->mode = elems->ch_switch_ie->mode;
        }

        /* nothing here we understand */
        if (!new_chan_no)
                return 1;

        /* Mesh Channel Switch Parameters Element */
        if (elems->mesh_chansw_params_ie) {
                csa_ie->ttl = elems->mesh_chansw_params_ie->mesh_ttl;
                /* note: replaces the mode taken from the (E)CSA element */
                csa_ie->mode = elems->mesh_chansw_params_ie->mesh_flags;
                csa_ie->pre_value = le16_to_cpu(
                                elems->mesh_chansw_params_ie->mesh_pre_value);

                if (elems->mesh_chansw_params_ie->mesh_flags &
                                WLAN_EID_CHAN_SWITCH_PARAM_REASON)
                        csa_ie->reason_code = le16_to_cpu(
                                elems->mesh_chansw_params_ie->mesh_reason);
        }

        new_freq = ieee80211_channel_to_frequency(new_chan_no, new_band);
        new_chan = ieee80211_get_channel(sdata->local->hw.wiphy, new_freq);
        if (!new_chan || new_chan->flags & IEEE80211_CHAN_DISABLED) {
                sdata_info(sdata,
                           "BSS %pM switches to unsupported channel (%d MHz), disconnecting\n",
                           bssid, new_freq);
                return -EINVAL;
        }

        if (sec_chan_offs) {
                secondary_channel_offset = sec_chan_offs->sec_chan_offs;
        } else if (conn->mode >= IEEE80211_CONN_MODE_HT) {
                /* If the secondary channel offset IE is not present,
                 * we can't know what's the post-CSA offset, so the
                 * best we can do is use 20MHz.
                 */
                secondary_channel_offset = IEEE80211_HT_PARAM_CHA_SEC_NONE;
        }

        /* first pass: 20/40 MHz chandef from the secondary channel offset */
        switch (secondary_channel_offset) {
        default:
                /* secondary_channel_offset was present but is invalid */
        case IEEE80211_HT_PARAM_CHA_SEC_NONE:
                cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan,
                                        NL80211_CHAN_HT20);
                break;
        case IEEE80211_HT_PARAM_CHA_SEC_ABOVE:
                cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan,
                                        NL80211_CHAN_HT40PLUS);
                break;
        case IEEE80211_HT_PARAM_CHA_SEC_BELOW:
                cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan,
                                        NL80211_CHAN_HT40MINUS);
                break;
        case -1:
                cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan,
                                        NL80211_CHAN_NO_HT);
                /* keep width for 5/10 MHz channels */
                switch (sdata->vif.bss_conf.chanreq.oper.width) {
                case NL80211_CHAN_WIDTH_5:
                case NL80211_CHAN_WIDTH_10:
                        csa_ie->chanreq.oper.width =
                                sdata->vif.bss_conf.chanreq.oper.width;
                        break;
                default:
                        break;
                }
                break;
        }

        /*
         * parse one of the Elements to build a new chandef; order of
         * preference: bandwidth indication, wide bandwidth channel switch,
         * operating class, and finally the chandef built above
         */
        memset(&new_chandef, 0, sizeof(new_chandef));
        new_chandef.chan = new_chan;
        if (bwi) {
                /* start with the CSA one */
                new_chandef = csa_ie->chanreq.oper;
                /* and update the width accordingly */
                ieee80211_chandef_eht_oper(&bwi->info, &new_chandef);

                if (bwi->params & IEEE80211_BW_IND_DIS_SUBCH_PRESENT)
                        new_chandef.punctured =
                                get_unaligned_le16(bwi->info.optional);
        } else if (!wide_bw_chansw_ie || !wbcs_elem_to_chandef(wide_bw_chansw_ie,
                                                               &new_chandef)) {
                if (!ieee80211_operating_class_to_chandef(new_op_class, new_chan,
                                                          &new_chandef))
                        new_chandef = csa_ie->chanreq.oper;
        }

        /*
         * check if the new chandef fits the capabilities; the validators
         * set new_chandef.chan to NULL if it does not
         */
        if (new_band == NL80211_BAND_6GHZ)
                validate_chandef_by_6ghz_he_eht_oper(sdata, conn, &new_chandef);
        else
                validate_chandef_by_ht_vht_oper(sdata, conn, vht_cap_info,
                                                &new_chandef);

        /* capture the AP chandef before (potential) downgrading */
        csa_ie->chanreq.ap = new_chandef;

        /* if data is there validate the bandwidth & use it */
        if (new_chandef.chan) {
                if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_320 &&
                    new_chandef.width == NL80211_CHAN_WIDTH_320)
                        ieee80211_chandef_downgrade(&new_chandef, NULL);

                if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160 &&
                    (new_chandef.width == NL80211_CHAN_WIDTH_80P80 ||
                     new_chandef.width == NL80211_CHAN_WIDTH_160))
                        ieee80211_chandef_downgrade(&new_chandef, NULL);

                /* the wide chandef must agree with the 20/40 MHz one */
                if (!cfg80211_chandef_compatible(&new_chandef,
                                                 &csa_ie->chanreq.oper)) {
                        sdata_info(sdata,
                                   "BSS %pM: CSA has inconsistent channel data, disconnecting\n",
                                   bssid);
                        return -EINVAL;
                }

                csa_ie->chanreq.oper = new_chandef;
        }

        /* three bytes, least significant byte first */
        if (elems->max_channel_switch_time)
                csa_ie->max_switch_time =
                        (elems->max_channel_switch_time[0] << 0) |
                        (elems->max_channel_switch_time[1] <<  8) |
                        (elems->max_channel_switch_time[2] << 16);

        return 0;
}

/*
 * Build and transmit a spectrum management Measurement Report action frame
 * that refuses the measurement described by @request_ie.
 *
 * @da and @bssid fill the corresponding header addresses, the source
 * address is the interface's own address, and @dialog_token as well as the
 * request's token and type are echoed back. Nothing is sent if the skb
 * cannot be allocated.
 */
static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_data *sdata,
                                        struct ieee80211_msrment_ie *request_ie,
                                        const u8 *da, const u8 *bssid,
                                        u8 dialog_token)
{
        struct ieee80211_hw *hw = &sdata->local->hw;
        struct ieee80211_mgmt *report;
        struct sk_buff *frame;

        frame = dev_alloc_skb(hw->extra_tx_headroom + sizeof(*report) +
                              sizeof(*request_ie));
        if (!frame)
                return;

        skb_reserve(frame, hw->extra_tx_headroom);

        /* Zeroed leading part of the management frame, then the addresses. */
        report = skb_put_zero(frame, 24);
        report->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                            IEEE80211_STYPE_ACTION);
        memcpy(report->bssid, bssid, ETH_ALEN);
        memcpy(report->sa, sdata->vif.addr, ETH_ALEN);
        memcpy(report->da, da, ETH_ALEN);

        /* Category byte followed by the measurement report body. */
        skb_put(frame, 1 + sizeof(report->u.action.u.measurement));
        report->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT;

        report->u.action.u.measurement.action_code = WLAN_ACTION_SPCT_MSR_RPRT;
        report->u.action.u.measurement.dialog_token = dialog_token;
        report->u.action.u.measurement.element_id = WLAN_EID_MEASURE_REPORT;
        report->u.action.u.measurement.length = sizeof(*request_ie);

        /*
         * This part was added with skb_put(), not skb_put_zero(), so clear
         * the element before filling in the fields that matter.
         */
        memset(&report->u.action.u.measurement.msr_elem, 0,
               sizeof(*request_ie));
        report->u.action.u.measurement.msr_elem.type = request_ie->type;
        report->u.action.u.measurement.msr_elem.token = request_ie->token;
        report->u.action.u.measurement.msr_elem.mode =
                IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED;

        ieee80211_tx_skb(sdata, frame);
}

/*
 * Handle a received measurement request action frame by refusing it.
 *
 * The reply goes to the frame's sender, using the frame's BSSID and echoing
 * its dialog token. @len is not used here.
 */
void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
                                       struct ieee80211_mgmt *mgmt,
                                       size_t len)
{
        struct ieee80211_msrment_ie *request =
                &mgmt->u.action.u.measurement.msr_elem;
        u8 token = mgmt->u.action.u.measurement.dialog_token;

        /*
         * Ignoring a measurement request is a spec violation: mandatory
         * measurements must be reported, while optional measurements may
         * be refused or reported as incapable. For now every request is
         * simply refused.
         * TODO: Answer basic measurement as unmeasured
         */
        ieee80211_send_refuse_measurement_request(sdata, request,
                                                  mgmt->sa, mgmt->bssid,
                                                  token);
}









































































































































   14 






   11 







   11 















   11 


















































































































































































































































































































































































































































































































































    3 






    3 



































    3 
    3 












    3 




    3 





















   13 



   15 




























   33 


   11 


   28 































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1994 Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *  General FPU state handling cleanups
 *        Gareth Hughes <gareth@valinux.com>, May 2000
 */
#include <asm/fpu/api.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/sched.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/types.h>
#include <asm/traps.h>
#include <asm/irq_regs.h>

#include <uapi/asm/kvm.h>

#include <linux/hardirq.h>
#include <linux/pkeys.h>
#include <linux/vmalloc.h>

#include "context.h"
#include "internal.h"
#include "legacy.h"
#include "xstate.h"

#define CREATE_TRACE_POINTS
#include <asm/trace/fpu.h>

#ifdef CONFIG_X86_64
/*
 * Static key, false by default.
 * NOTE(review): going by the name it is enabled once dynamically sized FPU
 * state is in use - confirm where it gets flipped.
 */
DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);
/*
 * Per-CPU 64-bit value.
 * NOTE(review): presumably tracks the XFD state most recently set on this
 * CPU - confirm against xfd_update_state().
 */
DEFINE_PER_CPU(u64, xfd_state);
#endif

/*
 * The FPU state configuration data for kernel and user space.
 * fpu_kernel_cfg.max_features bounds the feature mask used by
 * restore_fpregs_from_fpstate().
 */
struct fpu_state_config        fpu_kernel_cfg __ro_after_init;
struct fpu_state_config fpu_user_cfg __ro_after_init;

/*
 * Represents the initial FPU state. It's mostly (but not completely) zeroes,
 * depending on the FPU hardware format. fpu_reset_from_exception_fixup()
 * restores the registers from it.
 */
struct fpstate init_fpstate __ro_after_init;

/*
 * Track in-kernel FPU usage; irq_fpu_usable() reports "not usable" while
 * this is set on the current CPU.
 */
static DEFINE_PER_CPU(bool, in_kernel_fpu);

/*
 * Track which context is using the FPU on the CPU:
 */
DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);

/*
 * Tell the caller whether the FPU may be used in kernel mode right now,
 * i.e. whether a complete kernel_fpu_begin()/kernel_fpu_end() sequence is
 * allowed in the current context.
 */
bool irq_fpu_usable(void)
{
        /*
         * Never from NMI context (asking from there also triggers a one-time
         * warning), and never while in-kernel FPU usage is already active
         * on this CPU.
         */
        if (WARN_ON_ONCE(in_nmi()) || this_cpu_read(in_kernel_fpu))
                return false;

        /*
         * Outside of hard interrupt context the FPU can be used:
         *
         * - in task context, except inside fpregs_lock()'ed critical
         *   regions;
         *
         * - in soft interrupt processing context, which cannot happen
         *   inside a fpregs_lock()'ed critical region.
         *
         * In hard interrupt context it is safe only when soft interrupts
         * are enabled, because that means the interrupt did not hit in a
         * fpregs_lock()'ed critical region.
         */
        return !in_hardirq() || !softirq_count();
}
EXPORT_SYMBOL(irq_fpu_usable);

/*
 * Track AVX512 state use because it is known to slow the max clock
 * speed of the core.
 */
static void update_avx_timestamp(struct fpu *fpu)
{

#define AVX512_TRACKING_MASK        (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM)

        if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK)
                fpu->avx512_timestamp = jiffies;
}

/*
 * Save the FPU register state in fpu->fpstate->regs. The register state is
 * preserved.
 *
 * Must be called with fpregs_lock() held.
 *
 * The legacy FNSAVE instruction clears all FPU state unconditionally, so
 * register state has to be reloaded. That might be a pointless exercise
 * when the FPU is going to be used by another task right after that. But
 * this only affects 20+ years old 32bit systems and avoids conditionals all
 * over the place.
 *
 * FXSAVE and all XSAVE variants preserve the FPU register state.
 */
void save_fpregs_to_fpstate(struct fpu *fpu)
{
        if (likely(use_xsave())) {
                os_xsave(fpu->fpstate);
                update_avx_timestamp(fpu);
                return;
        }

        if (likely(use_fxsr())) {
                fxsave(&fpu->fpstate->regs.fxsave);
                return;
        }

        /*
         * Legacy FPU register saving, FNSAVE always clears FPU registers,
         * so we have to reload them from the memory state.
         */
        asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave));
        frstor(&fpu->fpstate->regs.fsave);
}

/*
 * Load the FPU registers from @fpstate.
 *
 * @mask selects the xfeatures to restore on XSAVE capable systems; it is
 * limited to fpu_kernel_cfg.max_features and not used on the FXSR/FSAVE
 * paths.
 */
void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask)
{
        /*
         * AMD K7/K8 and later CPUs up to Zen don't save/restore
         * FDP/FIP/FOP unless an exception is pending. Force them to fixed
         * values first. The "m" operand is just some variable that should
         * be in L1.
         */
        if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
                asm volatile(
                        "fnclex\n\t"
                        "emms\n\t"
                        "fildl %[addr]"        /* set F?P to defined value */
                        : : [addr] "m" (*fpstate));
        }

        /* Pre-XSAVE hardware: a plain FXRSTOR or FRSTOR is all there is. */
        if (!use_xsave()) {
                if (use_fxsr())
                        fxrstor(&fpstate->regs.fxsave);
                else
                        frstor(&fpstate->regs.fsave);
                return;
        }

        /*
         * Dynamically enabled features are enabled in XCR0, but using them
         * also requires the matching XFD bits to be clear; with a bit set,
         * a related instruction raises #NM. That allows the larger FPU
         * buffer to be allocated lazily from #NM, or a task without
         * permission to be killed there (which would otherwise happen via
         * #UD if the feature were disabled in XCR0).
         *
         * XFD state follows the same life time rules as XSTATE, and it has
         * to be updated before XRSTORS: otherwise the component would stay
         * in or go into init state even if the bits are set in
         * fpstate::regs::xsave::xfeatures.
         */
        xfd_update_state(fpstate);

        /*
         * Every feature in @mask must be written, even ones the current
         * task cannot use. Using fpstate->xfeatures instead would leave a
         * feature that the previous task used, but this task has no
         * permission for, out of init state.
         */
        os_xrstor(fpstate, fpu_kernel_cfg.max_features & mask);
}

/*
 * Reload the FPU registers from init_fpstate, restricted to the features
 * in XFEATURE_MASK_FPSTATE.
 *
 * NOTE(review): judging by the name this is the recovery step after a
 * faulting FPU restore was fixed up via the exception table - confirm
 * against the callers.
 */
void fpu_reset_from_exception_fixup(void)
{
        restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE);
}

#if IS_ENABLED(CONFIG_KVM)
static void __fpstate_reset(struct fpstate *fpstate, u64 xfd);

/*
 * Snapshot the guest permissions of current's thread group leader into
 * @gfpu and mark them as locked in the group leader.
 *
 * Only done on 64-bit kernels; otherwise @gfpu->perm is left untouched.
 */
static void fpu_init_guest_permissions(struct fpu_guest *gfpu)
{
        struct fpu_state_perm *guest_perm;
        u64 state_perm;

        if (!IS_ENABLED(CONFIG_X86_64))
                return;

        spin_lock_irq(&current->sighand->siglock);

        guest_perm = &current->group_leader->thread.fpu.guest_perm;
        state_perm = guest_perm->__state_perm;

        /* The first guest fpstate allocation locks down the permissions. */
        WRITE_ONCE(guest_perm->__state_perm,
                   state_perm | FPU_GUEST_PERM_LOCKED);

        spin_unlock_irq(&current->sighand->siglock);

        /* Hand the snapshot to the guest container without the lock marker. */
        gfpu->perm = state_perm & ~FPU_GUEST_PERM_LOCKED;
}

/*
 * Allocate a zeroed fpstate for a guest FPU container and initialize both
 * the fpstate and the container fields of @gfpu.
 *
 * Returns true on success, false when the buffer could not be allocated.
 */
bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
{
        unsigned int alloc_size;
        struct fpstate *gfps;

        /*
         * The register area of the default user size follows the struct
         * fpstate header, whose size is rounded up to 64 bytes.
         */
        alloc_size = fpu_user_cfg.default_size +
                     ALIGN(offsetof(struct fpstate, regs), 64);

        gfps = vzalloc(alloc_size);
        if (!gfps)
                return false;

        /* XFD is left at 0, the reset value defined by the spec. */
        __fpstate_reset(gfps, 0);
        fpstate_init_user(gfps);
        gfps->is_valloc = true;
        gfps->is_guest = true;

        gfpu->fpstate = gfps;
        gfpu->xfeatures = fpu_user_cfg.default_features;
        gfpu->perm = fpu_user_cfg.default_features;

        /*
         * When copying FPU state to userspace KVM sets the FP+SSE bits in
         * the XSAVE header even if XSAVE is unsupported, so that a
         * different CPU which does support XSAVE can cleanly load the
         * incoming state using its natural XSAVE.  KVM's uABI size may
         * therefore be larger than this host's default size.  The reverse
         * must never happen: the default size should never exceed KVM's
         * base uABI size because every feature which can expand the uABI
         * size must be opt-in.
         */
        gfpu->uabi_size = sizeof(struct kvm_xsave);
        if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size))
                gfpu->uabi_size = fpu_user_cfg.default_size;

        fpu_init_guest_permissions(gfpu);

        return true;
}
EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate);

/*
 * Release the fpstate attached to @gfpu, if there is one.
 *
 * A buffer which is not a vmalloc'ed guest fpstate, or which is still in
 * use, triggers a one-time warning and is left in place.
 */
void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
{
        struct fpstate *gfps = gfpu->fpstate;

        /* The sanity check is only evaluated when a buffer is attached. */
        if (!gfps ||
            WARN_ON_ONCE(!gfps->is_valloc || !gfps->is_guest || gfps->in_use))
                return;

        /* Detach the buffer from the container before freeing it. */
        gfpu->fpstate = NULL;
        vfree(gfps);
}
EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate);

/**
 * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable
 * @guest_fpu:          Pointer to the guest FPU container
 * @xfeatures:          Features requested by guest CPUID
 *
 * Enable all dynamic xfeatures according to guest perm and requested CPUID.
 * Features which are already set in @guest_fpu->xfeatures are filtered out
 * before the request is handed on.
 *
 * Return: 0 on success, error code otherwise
 */
int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures)
{
        u64 missing;

        lockdep_assert_preemption_enabled();

        /* Only the features which are not enabled yet need any work. */
        missing = xfeatures & ~guest_fpu->xfeatures;

        return missing ? __xfd_enable_feature(missing, guest_fpu) : 0;
}
EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features);

#ifdef CONFIG_X86_64
/*
 * Store @xfd in the guest fpstate of @guest_fpu. While that fpstate is
 * marked in use, xfd_update_state() is invoked for it as well. Everything
 * happens under fpregs_lock().
 */
void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
{
        struct fpstate *gfps;

        fpregs_lock();

        gfps = guest_fpu->fpstate;
        gfps->xfd = xfd;
        if (gfps->in_use)
                xfd_update_state(gfps);

        fpregs_unlock();
}
EXPORT_SYMBOL_GPL(fpu_update_guest_xfd);

/**
 * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state
 *
 * Must be invoked from KVM after a VMEXIT before enabling interrupts when
 * XFD write emulation is disabled. This is required because the guest can
 * freely modify XFD and the state at VMEXIT is not guaranteed to be the
 * same as the state on VMENTER. So software state has to be updated before
 * any operation which depends on it can take place.
 *
 * Note: It can be invoked unconditionally even when write emulation is
 * enabled for the price of a then pointless MSR read.
 */
void fpu_sync_guest_vmexit_xfd_state(void)
{
        struct fpstate *cur_fps = current->thread.fpu.fpstate;

        lockdep_assert_irqs_disabled();

        /* Nothing to synchronize unless fpu_state_size_dynamic() is true. */
        if (!fpu_state_size_dynamic())
                return;

        /*
         * Pull the MSR value into current's fpstate first, then mirror
         * that value into the per CPU xfd_state.
         */
        rdmsrl(MSR_IA32_XFD, cur_fps->xfd);
        __this_cpu_write(xfd_state, cur_fps->xfd);
}
EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state);
#endif /* CONFIG_X86_64 */

/*
 * Switch current's active fpstate between the task's own fpstate and the
 * guest fpstate of @guest_fpu.
 *
 * @guest_fpu:   Guest FPU container whose fpstate is swapped in or out
 * @enter_guest: true installs the guest fpstate and remembers the task
 *               fpstate in fpu->__task_fpstate; false switches back to
 *               the remembered task fpstate
 *
 * The whole operation runs under fpregs_lock(). Always returns 0.
 */
int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
{
        struct fpstate *guest_fps = guest_fpu->fpstate;
        struct fpu *fpu = &current->thread.fpu;
        struct fpstate *cur_fps = fpu->fpstate;

        fpregs_lock();
        /*
         * Save the registers into the outgoing fpstate, unless that
         * fpstate is confidential or TIF_NEED_FPU_LOAD is already set.
         */
        if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD))
                save_fpregs_to_fpstate(fpu);

        /* Swap fpstate */
        if (enter_guest) {
                fpu->__task_fpstate = cur_fps;
                fpu->fpstate = guest_fps;
                guest_fps->in_use = true;
        } else {
                guest_fps->in_use = false;
                fpu->fpstate = fpu->__task_fpstate;
                fpu->__task_fpstate = NULL;
        }

        /* From here on cur_fps refers to the incoming fpstate. */
        cur_fps = fpu->fpstate;

        if (!cur_fps->is_confidential) {
                /* Includes XFD update */
                restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE);
        } else {
                /*
                 * XSTATE is restored by firmware from encrypted
                 * memory. Make sure XFD state is correct while
                 * running with guest fpstate
                 */
                xfd_update_state(cur_fps);
        }

        /* Records this CPU as holding current's registers, see fpregs_mark_activate(). */
        fpregs_mark_activate();
        fpregs_unlock();
        return 0;
}
EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate);

/*
 * Copy the guest fpstate of @gfpu into the uABI buffer @buf.
 *
 * @gfpu:      Guest FPU container to copy from
 * @buf:       Destination buffer
 * @size:      Size of @buf in bytes
 * @xfeatures: Feature mask handed to __copy_xstate_to_uabi_buf()
 * @pkru:      PKRU value handed to __copy_xstate_to_uabi_buf()
 *
 * @size, @xfeatures and @pkru are only consumed on XSAVE enabled hosts.
 */
void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
                                    unsigned int size, u64 xfeatures, u32 pkru)
{
        struct fpstate *src = gfpu->fpstate;
        union fpregs_state *dst = buf;
        struct membuf mb = { .p = buf, .left = size };

        if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
                /* No XSAVE: hand out the plain FXSAVE image ... */
                memcpy(&dst->fxsave, &src->regs.fxsave, sizeof(dst->fxsave));
                /* ... and make it restorable on a XSAVE enabled host */
                dst->xsave.header.xfeatures = XFEATURE_MASK_FPSSE;
                return;
        }

        __copy_xstate_to_uabi_buf(mb, src, xfeatures, pkru, XSTATE_COPY_XSAVE);
}
EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi);

/*
 * Copy the uABI formatted state in @buf into the guest fpstate of @gfpu.
 *
 * @gfpu:  Guest FPU container to copy into
 * @buf:   Source buffer in uABI layout
 * @xcr0:  Features the buffer is allowed to carry (XSAVE hosts only)
 * @vpkru: Forwarded to copy_uabi_from_kernel_to_xstate(), or replaced by
 *         NULL when the header in @buf does not have the PKRU bit set
 *
 * Returns -EINVAL when @buf carries features or MXCSR bits which are not
 * acceptable, otherwise 0 or the result of
 * copy_uabi_from_kernel_to_xstate().
 */
int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
                                   u64 xcr0, u32 *vpkru)
{
        const union fpregs_state *src = buf;
        struct fpstate *dst = gfpu->fpstate;

        if (cpu_feature_enabled(X86_FEATURE_XSAVE)) {
                if (src->xsave.header.xfeatures & ~xcr0)
                        return -EINVAL;

                /*
                 * Nullify @vpkru to preserve its current value if PKRU's
                 * bit isn't set in the header.  KVM's odd ABI is to leave
                 * PKRU untouched in this case (all other components are
                 * eventually re-initialized).
                 */
                if (!(src->xsave.header.xfeatures & XFEATURE_MASK_PKRU))
                        vpkru = NULL;

                return copy_uabi_from_kernel_to_xstate(dst, src, vpkru);
        }

        /* Without XSAVE only FP/SSE state with a valid MXCSR is accepted. */
        if (src->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE)
                return -EINVAL;
        if (src->fxsave.mxcsr & ~mxcsr_feature_mask)
                return -EINVAL;

        memcpy(&dst->regs.fxsave, &src->fxsave, sizeof(src->fxsave));
        return 0;
}
EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
#endif /* CONFIG_KVM */

/*
 * Start a section in which kernel code may use the FPU registers.
 *
 * @kfpu_mask: KFPU_MXCSR requests loading MXCSR_DEFAULT (only done when
 *             the CPU has X86_FEATURE_XMM), KFPU_387 requests FNINIT
 *             (only done when the CPU has X86_FEATURE_FPU)
 *
 * Preemption is disabled here and stays disabled on return; the matching
 * preempt_enable() is in kernel_fpu_end().
 */
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
        preempt_disable();

        /* Debug checks: FPU must be usable here and sections must not nest. */
        WARN_ON_FPU(!irq_fpu_usable());
        WARN_ON_FPU(this_cpu_read(in_kernel_fpu));

        this_cpu_write(in_kernel_fpu, true);

        /*
         * Tasks which are neither PF_KTHREAD nor PF_USER_WORKER and whose
         * TIF_NEED_FPU_LOAD is still clear get the flag set and their
         * registers saved to memory before the kernel uses them.
         */
        if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) &&
            !test_thread_flag(TIF_NEED_FPU_LOAD)) {
                set_thread_flag(TIF_NEED_FPU_LOAD);
                save_fpregs_to_fpstate(&current->thread.fpu);
        }
        __cpu_invalidate_fpregs_state();

        /* Put sane initial values into the control registers. */
        if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
                ldmxcsr(MXCSR_DEFAULT);

        if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
                asm volatile ("fninit");
}
EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);

/*
 * End a kernel FPU section: clear the per CPU in_kernel_fpu marker and
 * re-enable preemption. Warns (WARN_ON_FPU) when the marker was not set,
 * i.e. when no section is open on this CPU.
 */
void kernel_fpu_end(void)
{
        WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));

        this_cpu_write(in_kernel_fpu, false);
        preempt_enable();
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);

/*
 * Sync the FPU register state to current's memory register state when the
 * current task owns the FPU. The hardware register state is preserved.
 *
 * @fpu must be current's FPU; anything else triggers WARN_ON_FPU(). The
 * save runs under fpregs_lock() and is bracketed by the x86_fpu
 * before_save/after_save tracepoints.
 */
void fpu_sync_fpstate(struct fpu *fpu)
{
        WARN_ON_FPU(fpu != &current->thread.fpu);

        fpregs_lock();
        trace_x86_fpu_before_save(fpu);

        /* With TIF_NEED_FPU_LOAD set the save is skipped. */
        if (!test_thread_flag(TIF_NEED_FPU_LOAD))
                save_fpregs_to_fpstate(fpu);

        trace_x86_fpu_after_save(fpu);
        fpregs_unlock();
}

/*
 * Number of bytes which have to be copied from init_fpstate.regs to put a
 * register buffer into init state.
 */
static inline unsigned int init_fpstate_copy_size(void)
{
        /* XSAVE(S) just needs the legacy and the xstate header part */
        if (use_xsave())
                return sizeof(init_fpstate.regs.xsave);

        /* Without XSAVE the default kernel buffer size is copied. */
        return fpu_kernel_cfg.default_size;
}

/*
 * FXSAVE format fpstate init: set the control word and MXCSR of the
 * fxsave image in @fpstate. All other fields are left untouched.
 *
 * NOTE(review): 0x37f looks like the x87 control word which FNINIT loads
 * according to the Intel SDM - confirm.
 */
static inline void fpstate_init_fxstate(struct fpstate *fpstate)
{
        fpstate->regs.fxsave.cwd = 0x37f;
        fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT;
}

/*
 * Legacy x87 fpstate state init: set the control, status and tag words
 * and the fos field of the fsave image in @fpstate. All other fields are
 * left untouched.
 *
 * NOTE(review): the low 16 bits of cwd are 0x037f, the same control word
 * which the FXSAVE variant uses; the meaning of the 0xffff upper halves
 * is not visible here - confirm against the FSAVE image layout.
 */
static inline void fpstate_init_fstate(struct fpstate *fpstate)
{
        fpstate->regs.fsave.cwd = 0xffff037fu;
        fpstate->regs.fsave.swd = 0xffff0000u;
        fpstate->regs.fsave.twd = 0xffffffffu;
        fpstate->regs.fsave.fos = 0xffff0000u;
}

/*
 * Put @fpstate into its initial user state.
 *
 * Used in two places:
 * 1) Early boot to setup init_fpstate for non XSAVE systems
 * 2) fpu_alloc_guest_fpstate() which sets up guest fpstates for KVM
 */
void fpstate_init_user(struct fpstate *fpstate)
{
        /* Without a hardware FPU only the soft (emulation) state is set up. */
        if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
                fpstate_init_soft(&fpstate->regs.soft);
                return;
        }

        xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures);

        /* Pick the legacy area initializer which matches the save format. */
        if (cpu_feature_enabled(X86_FEATURE_FXSR))
                fpstate_init_fxstate(fpstate);
        else
                fpstate_init_fstate(fpstate);
}

/*
 * Set the size and feature fields of @fpstate to the kernel and user
 * defaults and set its XFD value to @xfd. The register contents are not
 * touched.
 */
static void __fpstate_reset(struct fpstate *fpstate, u64 xfd)
{
        /* Kernel view: buffer size and feature mask */
        fpstate->size = fpu_kernel_cfg.default_size;
        fpstate->xfeatures = fpu_kernel_cfg.default_features;

        /* User view: buffer size and feature mask */
        fpstate->user_size = fpu_user_cfg.default_size;
        fpstate->user_xfeatures = fpu_user_cfg.default_features;

        fpstate->xfd = xfd;
}

void fpstate_reset(struct fpu *fpu)
{
        /* Set the fpstate pointer to the default fpstate */
        fpu->fpstate = &fpu->__fpstate;
        __fpstate_reset(fpu->fpstate, init_fpstate.xfd);

        /* Initialize the permission related info in fpu */
        fpu->perm.__state_perm                = fpu_kernel_cfg.default_features;
        fpu->perm.__state_size                = fpu_kernel_cfg.default_size;
        fpu->perm.__user_state_size        = fpu_user_cfg.default_size;
        /* Same defaults for guests */
        fpu->guest_perm = fpu->perm;
}

/*
 * Copy the task and guest permissions of current's thread group leader
 * into @dst_fpu. Does nothing unless fpu_state_size_dynamic() is true.
 */
static inline void fpu_inherit_perms(struct fpu *dst_fpu)
{
        struct fpu *leader_fpu;

        if (!fpu_state_size_dynamic())
                return;

        leader_fpu = &current->group_leader->thread.fpu;

        /* Fork also inherits the permissions of the parent */
        spin_lock_irq(&current->sighand->siglock);
        dst_fpu->perm = leader_fpu->perm;
        dst_fpu->guest_perm = leader_fpu->guest_perm;
        spin_unlock_irq(&current->sighand->siglock);
}

/*
 * Store @ssp as the user shadow stack pointer in the CET user state of
 * @dst's fpstate. A passed ssp of zero will not cause any update.
 *
 * Returns 0 on success or when there was nothing to do, 1 when the CET
 * user state could not be located. Without CONFIG_X86_USER_SHADOW_STACK
 * this is a no-op which returns 0.
 */
static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp)
{
#ifdef CONFIG_X86_USER_SHADOW_STACK
        struct cet_user_state *cet_state;

        if (ssp) {
                cet_state = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave,
                                           XFEATURE_CET_USER);

                /*
                 * A non-zero ssp means that 'dst' must be configured with
                 * a shadow stack, and its fpu state should be up to date
                 * because fpu_clone() just copied it from the parent.
                 * Hence the buffer must contain a valid non-init CET
                 * state location.
                 */
                if (WARN_ON_ONCE(!cet_state))
                        return 1;

                cet_state->user_ssp = (u64)ssp;
        }
#endif
        return 0;
}

/*
 * Clone current's FPU state on fork
 *
 * @dst:         The new task
 * @clone_flags: Clone flags; unless CLONE_THREAD is set the permissions
 *               are inherited via fpu_inherit_perms()
 * @minimal:     true to give @dst only the init state instead of a copy
 *               of current's registers
 * @ssp:         Shadow stack pointer to store in @dst's state; zero means
 *               no update
 *
 * Returns 0 on success and 1 when updating the shadow stack pointer
 * failed.
 */
int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal,
              unsigned long ssp)
{
        struct fpu *src_fpu = &current->thread.fpu;
        struct fpu *dst_fpu = &dst->thread.fpu;

        /* The new task's FPU state cannot be valid in the hardware. */
        dst_fpu->last_cpu = -1;

        fpstate_reset(dst_fpu);

        /* Without a hardware FPU there is no register state to set up. */
        if (!cpu_feature_enabled(X86_FEATURE_FPU))
                return 0;

        /*
         * Enforce reload for user space tasks and prevent kernel threads
         * from trying to save the FPU registers on context switch.
         */
        set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);

        /*
         * No FPU state inheritance for kernel threads and IO
         * worker threads.
         */
        if (minimal) {
                /* Clear out the minimal state */
                memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs,
                       init_fpstate_copy_size());
                return 0;
        }

        /*
         * If a new feature is added, ensure all dynamic features are
         * caller-saved from here!
         */
        BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA);

        /*
         * Save the default portion of the current FPU state into the
         * clone. Assume all dynamic features to be defined as caller-
         * saved, which enables skipping both the expansion of fpstate
         * and the copying of any dynamic state.
         *
         * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because
         * copying is not valid when current uses non-default states.
         */
        fpregs_lock();
        /* Make sure the registers hold current's state before saving them. */
        if (test_thread_flag(TIF_NEED_FPU_LOAD))
                fpregs_restore_userregs();
        save_fpregs_to_fpstate(dst_fpu);
        fpregs_unlock();
        if (!(clone_flags & CLONE_THREAD))
                fpu_inherit_perms(dst_fpu);

        /*
         * Children never inherit PASID state.
         * Force it to have its init value:
         */
        if (use_xsave())
                dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID;

        /*
         * Update shadow stack pointer, in case it changed during clone.
         */
        if (update_fpu_shstk(dst, ssp))
                return 1;

        trace_x86_fpu_copy_src(src_fpu);
        trace_x86_fpu_copy_dst(dst_fpu);

        return 0;
}

/*
 * Whitelist the FPU register state embedded into task_struct for hardened
 * usercopy.
 *
 * @offset: Receives the offset of fpu.__fpstate.regs within struct
 *          thread_struct
 * @size:   Receives the default kernel FPU buffer size
 */
void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size)
{
        *offset = offsetof(struct thread_struct, fpu.__fpstate.regs);
        *size = fpu_kernel_cfg.default_size;
}

/*
 * Drops current FPU state: deactivates the fpregs and
 * the fpstate. NOTE: it still leaves previous contents
 * in the fpregs in the eager-FPU case.
 *
 * This function can be used in cases where we know that
 * a state-restore is coming: either an explicit one,
 * or a reschedule.
 *
 * The registers are only touched when @fpu is current's FPU; for any
 * other @fpu just the tracepoint fires. Runs with preemption disabled.
 */
void fpu__drop(struct fpu *fpu)
{
        preempt_disable();

        if (fpu == &current->thread.fpu) {
                /* Ignore delayed exceptions from user space */
                /* The exception table entry resumes at 2: if fwait faults. */
                asm volatile("1: fwait\n"
                             "2:\n"
                             _ASM_EXTABLE(1b, 2b));
                fpregs_deactivate(fpu);
        }

        trace_x86_fpu_dropped(fpu);

        preempt_enable();
}

/*
 * Clear FPU registers by setting them up from the init fpstate.
 * Caller must do fpregs_[un]lock() around it.
 *
 * @features_mask is only consumed by the XSAVE based restore. PKRU is
 * written with its default value in all cases.
 */
static inline void restore_fpregs_from_init_fpstate(u64 features_mask)
{
        if (!use_xsave()) {
                /* Legacy restore: FXRSTOR when available, FRSTOR otherwise */
                if (use_fxsr())
                        fxrstor(&init_fpstate.regs.fxsave);
                else
                        frstor(&init_fpstate.regs.fsave);
        } else {
                os_xrstor(&init_fpstate, features_mask);
        }

        pkru_write_default();
}

/*
 * Reset current->fpu memory state to the init values.
 *
 * Runs under fpregs_lock(): the register state tracking of current's FPU
 * is invalidated, the init state is copied into current's fpstate and
 * TIF_NEED_FPU_LOAD is set.
 */
static void fpu_reset_fpregs(void)
{
        struct fpu *fpu = &current->thread.fpu;

        fpregs_lock();
        __fpu_invalidate_fpregs_state(fpu);
        /*
         * This does not change the actual hardware registers. It just
         * resets the memory image and sets TIF_NEED_FPU_LOAD so a
         * subsequent return to usermode will reload the registers from the
         * task's memory image.
         *
         * Do not use fpstate_init() here. Just copy init_fpstate which has
         * the correct content already except for PKRU.
         *
         * PKRU handling does not rely on the xstate when restoring for
         * user space as PKRU is eagerly written in switch_to() and
         * flush_thread().
         */
        memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size());
        set_thread_flag(TIF_NEED_FPU_LOAD);
        fpregs_unlock();
}

/*
 * Reset current's user FPU states to the init states.  current's
 * supervisor states, if any, are not modified by this function.  The
 * caller guarantees that the XSTATE header in memory is intact.
 *
 * @fpu must be current's FPU; anything else triggers WARN_ON_FPU().
 */
void fpu__clear_user_states(struct fpu *fpu)
{
        WARN_ON_FPU(fpu != &current->thread.fpu);

        fpregs_lock();
        /*
         * Without a hardware FPU only the memory image is reset.
         * fpu_reset_fpregs() takes fpregs_lock() again on its own, so the
         * lock is held nested for the duration of that call.
         */
        if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
                fpu_reset_fpregs();
                fpregs_unlock();
                return;
        }

        /*
         * Ensure that current's supervisor states are loaded into their
         * corresponding registers.
         */
        if (xfeatures_mask_supervisor() &&
            !fpregs_state_valid(fpu, smp_processor_id()))
                os_xrstor_supervisor(fpu->fpstate);

        /* Reset user states in registers. */
        restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE);

        /*
         * Now all FPU registers have their desired values.  Inform the FPU
         * state machine that current's FPU registers are in the hardware
         * registers. The memory image does not need to be updated because
         * any operation relying on it has to save the registers first when
         * current's FPU is marked active.
         */
        fpregs_mark_activate();
        fpregs_unlock();
}

/*
 * Return current's FPU to its defaults: reset the fpstate pointer, sizes,
 * features and permissions via fpstate_reset() and then reset the memory
 * image of the registers via fpu_reset_fpregs().
 */
void fpu_flush_thread(void)
{
        fpstate_reset(&current->thread.fpu);
        fpu_reset_fpregs();
}
/*
 * Load FPU context before returning to userspace.
 *
 * Does nothing on CPUs without X86_FEATURE_FPU.
 */
void switch_fpu_return(void)
{
        if (static_cpu_has(X86_FEATURE_FPU))
                fpregs_restore_userregs();
}
EXPORT_SYMBOL_GPL(switch_fpu_return);

/*
 * Take fpregs_lock() and restore current's user register state when
 * TIF_NEED_FPU_LOAD is set.
 *
 * Returns with the fpregs lock held; there is no fpregs_unlock() in here,
 * so the caller has to drop the lock. Warns once when the FPU is not
 * usable in the calling context or when invoked from a PF_KTHREAD task.
 */
void fpregs_lock_and_load(void)
{
        /*
         * fpregs_lock() only disables preemption (mostly). So modifying state
         * in an interrupt could screw up some in progress fpregs operation.
         * Warn about it.
         */
        WARN_ON_ONCE(!irq_fpu_usable());
        WARN_ON_ONCE(current->flags & PF_KTHREAD);

        fpregs_lock();

        fpregs_assert_state_consistent();

        if (test_thread_flag(TIF_NEED_FPU_LOAD))
                fpregs_restore_userregs();
}

#ifdef CONFIG_X86_DEBUG_FPU
/*
 * Debug check of the FPU state tracking: when TIF_NEED_FPU_LOAD is clear,
 * current's FPU context must be the one which is loaded on this CPU.
 * Otherwise the flag has to be set so that the context is loaded on
 * return to userland.
 */
void fpregs_assert_state_consistent(void)
{
        struct fpu *cur_fpu = &current->thread.fpu;

        /* A pending reload makes the register contents irrelevant. */
        if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
                WARN_ON_FPU(!fpregs_state_valid(cur_fpu, smp_processor_id()));
        }
}
EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent);
#endif

/*
 * Record that current's FPU state is held in the registers of this CPU:
 * activate current's fpregs, remember this CPU in fpu->last_cpu and clear
 * TIF_NEED_FPU_LOAD.
 *
 * NOTE(review): uses smp_processor_id(), so callers presumably have to
 * keep preemption disabled (e.g. hold fpregs_lock()) - confirm.
 */
void fpregs_mark_activate(void)
{
        struct fpu *fpu = &current->thread.fpu;

        fpregs_activate(fpu);
        fpu->last_cpu = smp_processor_id();
        clear_thread_flag(TIF_NEED_FPU_LOAD);
}

/*
 * x87 math exception handling:
 */

/*
 * Translate the pending unmasked FPU exception in the saved state of @fpu
 * into a FPE_* signal code.
 *
 * @fpu:     FPU whose saved fpstate is inspected
 * @trap_nr: X86_TRAP_MF selects the x87 control/status words, any other
 *           value selects MXCSR
 *
 * Returns the FPE_* code of the first matching exception, or 0 when no
 * unmasked exception is pending.
 */
int fpu__exception_code(struct fpu *fpu, int trap_nr)
{
        int pending;

        if (trap_nr != X86_TRAP_MF) {
                /*
                 * The SIMD FPU exceptions are handled a little differently, as there
                 * is only a single status/control register.  Thus, to determine which
                 * unmasked exception was caught we must mask the exception mask bits
                 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
                 */
                unsigned short mxcsr = MXCSR_DEFAULT;

                if (boot_cpu_has(X86_FEATURE_XMM))
                        mxcsr = fpu->fpstate->regs.fxsave.mxcsr;

                pending = ~(mxcsr >> 7) & mxcsr;
        } else {
                unsigned short cwd, swd;

                /*
                 * (~cwd & swd) will mask out exceptions that are not set to unmasked
                 * status.  0x3f is the exception bits in these regs, 0x200 is the
                 * C1 reg you need in case of a stack fault, 0x040 is the stack
                 * fault bit.  We should only be taking one exception at a time,
                 * so if this combination doesn't produce any single exception,
                 * then we have a bad program that isn't synchronizing its FPU usage
                 * and it will suffer the consequences since we won't be able to
                 * fully reproduce the context of the exception.
                 */
                if (boot_cpu_has(X86_FEATURE_FXSR)) {
                        cwd = fpu->fpstate->regs.fxsave.cwd;
                        swd = fpu->fpstate->regs.fxsave.swd;
                } else {
                        cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd;
                        swd = (unsigned short)fpu->fpstate->regs.fsave.swd;
                }

                pending = swd & ~cwd;
        }

        /*
         * Invalid op
         *
         * swd & 0x240 == 0x040: Stack Underflow
         * swd & 0x240 == 0x240: Stack Overflow
         * User must clear the SF bit (0x40) if set
         */
        if (pending & 0x001)
                return FPE_FLTINV;

        /* Divide by Zero */
        if (pending & 0x004)
                return FPE_FLTDIV;

        /* Overflow */
        if (pending & 0x008)
                return FPE_FLTOVF;

        /* Denormal, Underflow */
        if (pending & 0x012)
                return FPE_FLTUND;

        /* Precision */
        if (pending & 0x020)
                return FPE_FLTRES;

        /*
         * If we're using IRQ 13, or supposedly even some trap
         * X86_TRAP_MF implementations, it's possible
         * we get a spurious trap, which is not an error.
         */
        return 0;
}

/*
 * Initialize register state that may prevent from entering low-power idle.
 * This function will be invoked from the cpuidle driver only when needed.
 *
 * Only acts when AMX TILE is enabled and XTILE state is in use: the tile
 * state is released and the per CPU fpregs owner is cleared.
 */
noinstr void fpu_idle_fpregs(void)
{
        /* Note: AMX_TILE being enabled implies XGETBV1 support */
        if (!cpu_feature_enabled(X86_FEATURE_AMX_TILE))
                return;

        if (!(xfeatures_in_use() & XFEATURE_MASK_XTILE))
                return;

        tile_release();
        __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
}




















































































































































































































































































    1 























    1 


























    1 










    1 


























































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Mark Evans, <evansmp@uhura.aston.ac.uk>
 *                Corey Minyard <wf-rch!minyard@relay.EU.net>
 *                Florian La Roche, <flla@stud.uni-sb.de>
 *                Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *                Linus Torvalds, <torvalds@cs.helsinki.fi>
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *                Matthew Dillon, <dillon@apollo.west.oic.com>
 *                Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *                Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>

/*
 * Check whether the segment [@seq, @end_seq) is acceptable for the window
 * [@s_win, @e_win). It is accepted when
 *  - it starts exactly at the left window edge, or
 *  - it ends after the left edge and starts before the right edge, or
 *  - it is an empty segment sitting exactly on the right edge.
 */
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
        return seq == s_win ||
               (after(end_seq, s_win) && before(seq, e_win)) ||
               (seq == e_win && seq == end_seq);
}

/*
 * Decide whether an out-of-window segment received on a timewait socket
 * is answered with an ACK or dropped due to rate limiting.
 *
 * Returns TCP_TW_ACK with the reference on @tw still held, or
 * TCP_TW_SUCCESS after the reference on @tw has been dropped here.
 */
static enum tcp_tw_status
tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
                                  const struct sk_buff *skb, int mib_idx)
{
        struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

        if (tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
                                 &tcptw->tw_last_oow_ack_time)) {
                /* We are rate-limiting, so just release the tw sock and drop skb. */
                inet_twsk_put(tw);
                return TCP_TW_SUCCESS;
        }

        /*
         * Send ACK. Note, we do not put the bucket,
         * it will be released by caller.
         */
        return TCP_TW_ACK;
}

/*
 * Set tw_rcv_nxt of @tcptw to @seq. With CONFIG_TCP_AO and AO info
 * attached, rcv_sne is incremented first when @seq is numerically smaller
 * than the previous tw_rcv_nxt.
 */
static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq)
{
#ifdef CONFIG_TCP_AO
        struct tcp_ao_info *ao_info = rcu_dereference(tcptw->ao_info);

        if (unlikely(ao_info && seq < tcptw->tw_rcv_nxt))
                WRITE_ONCE(ao_info->rcv_sne, ao_info->rcv_sne + 1);
#endif
        tcptw->tw_rcv_nxt = seq;
}

/*
 * * Main purpose of TIME-WAIT state is to close connection gracefully,
 *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
 *   (and, probably, tail of data) and one or more our ACKs are lost.
 * * What is TIME-WAIT timeout? It is associated with maximal packet
 *   lifetime in the internet, which results in wrong conclusion, that
 *   it is set to catch "old duplicate segments" wandering out of their path.
 *   It is not quite correct. This timeout is calculated so that it exceeds
 *   maximal retransmission timeout enough to allow to lose one (or more)
 *   segments sent by peer and our ACKs. This time may be calculated from RTO.
 * * When TIME-WAIT socket receives RST, it means that another end
 *   finally closed and we are allowed to kill TIME-WAIT too.
 * * Second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
 *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
 * * If we invented some more clever way to catch duplicates
 *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
 * When you compare it to RFCs, please, read section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) TW bucket
 * is _not_ stateless. It means, that strictly speaking we must
 * spinlock it. I do not want! Well, probability of misbehaviour
 * is ridiculously low and, seems, we could use some mb() tricks
 * to avoid misread sequence numbers, states etc.  --ANK
 *
 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
 */
/* Process a segment that matched a timewait bucket, which is either in
 * the dead FIN-WAIT-2 substate or in real TIME-WAIT.
 *
 * @tw:     the timewait bucket the segment belongs to
 * @skb:    the received segment
 * @th:     TCP header of @skb
 * @tw_isn: written only on the TCP_TW_SYN path, with a non-zero initial
 *          sequence number derived from tw_snd_nxt
 *
 * Return values, as produced by the body below:
 *   TCP_TW_SUCCESS - nothing more to do; the reference on @tw has been
 *                    dropped here (inet_twsk_put() or
 *                    inet_twsk_deschedule_put()).
 *   TCP_TW_ACK     - returned without dropping the reference on @tw here.
 *   TCP_TW_RST     - returned without dropping the reference on @tw here.
 *   TCP_TW_SYN     - acceptable new SYN; *@tw_isn is set and the reference
 *                    on @tw is not dropped here.
 */
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
                           const struct tcphdr *th, u32 *tw_isn)
{
        struct tcp_options_received tmp_opt;
        struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
        bool paws_reject = false;
        int ts_recent_stamp;

        /* saw_tstamp is the only tmp_opt field read below when options are
         * not parsed; every other use of tmp_opt is guarded by it.
         */
        tmp_opt.saw_tstamp = 0;
        ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
        /* Parse options only if the header is longer than the bare TCP
         * header and the bucket has a recorded timestamp.
         */
        if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) {
                tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);

                if (tmp_opt.saw_tstamp) {
                        /* Undo the per-connection timestamp offset on a
                         * non-zero echo reply before the PAWS check.
                         */
                        if (tmp_opt.rcv_tsecr)
                                tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
                        tmp_opt.ts_recent        = READ_ONCE(tcptw->tw_ts_recent);
                        tmp_opt.ts_recent_stamp        = ts_recent_stamp;
                        paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
                }
        }

        if (tw->tw_substate == TCP_FIN_WAIT2) {
                /* Just repeat all the checks of tcp_rcv_state_process() */

                /* Out of window, send ACK */
                if (paws_reject ||
                    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
                                   tcptw->tw_rcv_nxt,
                                   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
                        return tcp_timewait_check_oow_rate_limit(
                                tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);

                /* In-window RST: jump to the shared "kill" label inside the
                 * TIME-WAIT branch below, which deschedules and puts @tw.
                 */
                if (th->rst)
                        goto kill;

                /* SYN whose sequence is not before tw_rcv_nxt: reset. */
                if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
                        return TCP_TW_RST;

                /* Dup ACK? */
                if (!th->ack ||
                    !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
                    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
                        inet_twsk_put(tw);
                        return TCP_TW_SUCCESS;
                }

                /* New data or FIN. If new data arrive after half-duplex close,
                 * reset.
                 */
                if (!th->fin ||
                    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1)
                        return TCP_TW_RST;

                /* FIN arrived, enter true time-wait state. */
                tw->tw_substate          = TCP_TIME_WAIT;
                twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq);

                /* Remember the peer's timestamp and when it was seen. */
                if (tmp_opt.saw_tstamp) {
                        WRITE_ONCE(tcptw->tw_ts_recent_stamp,
                                  ktime_get_seconds());
                        WRITE_ONCE(tcptw->tw_ts_recent,
                                   tmp_opt.rcv_tsval);
                }

                /* Restart the timer with the full TIME-WAIT length and ACK
                 * the FIN; the reference on @tw is not dropped here.
                 */
                inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
                return TCP_TW_ACK;
        }

        /*
         *        Now real TIME-WAIT state.
         *
         *        RFC 1122:
         *        "When a connection is [...] on TIME-WAIT state [...]
         *        [a TCP] MAY accept a new SYN from the remote TCP to
         *        reopen the connection directly, if it:
         *
         *        (1)  assigns its initial sequence number for the new
         *        connection to be larger than the largest sequence
         *        number it used on the previous connection incarnation,
         *        and
         *
         *        (2)  returns to TIME-WAIT state if the SYN turns out
         *        to be an old duplicate".
         */

        /* Segment starts exactly at tw_rcv_nxt and is either empty or
         * carries RST, and PAWS did not reject it.
         */
        if (!paws_reject &&
            (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
             (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
                /* In window segment, it may be only reset or bare ack. */

                if (th->rst) {
                        /* This is TIME_WAIT assassination, in two flavors.
                         * Oh well... nobody has a sufficient solution to this
                         * protocol bug yet.
                         */
                        /* With sysctl_tcp_rfc1337 set the RST does not kill
                         * the bucket and the timer is left untouched; control
                         * falls through to the timestamp update below.
                         */
                        if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) {
                                /* Also reached via goto from the FIN-WAIT-2
                                 * branch above on an in-window RST.
                                 */
kill:
                                inet_twsk_deschedule_put(tw);
                                return TCP_TW_SUCCESS;
                        }
                } else {
                        /* Bare ACK: restart the TIME-WAIT timer. */
                        inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
                }

                if (tmp_opt.saw_tstamp) {
                        WRITE_ONCE(tcptw->tw_ts_recent,
                                   tmp_opt.rcv_tsval);
                        WRITE_ONCE(tcptw->tw_ts_recent_stamp,
                                   ktime_get_seconds());
                }

                inet_twsk_put(tw);
                return TCP_TW_SUCCESS;
        }

        /* Out of window segment.

           All the segments are ACKed immediately.

           The only exception is new SYN. We accept it, if it is
           not old duplicate and we are not in danger to be killed
           by delayed old duplicates. RFC check is that it has
           newer sequence number works at rates <40Mbit/sec.
           However, if paws works, it is reliable AND even more,
           we even may relax silly seq space cutoff.

           RED-PEN: we violate main RFC requirement, if this SYN will appear
           old duplicate (i.e. we receive RST in reply to SYN-ACK),
           we must return socket to time-wait state. It is not good,
           but not fatal yet.
         */

        /* Pure SYN (no RST, no ACK) that passed PAWS and is newer either by
         * sequence number or by timestamp value.
         */
        if (th->syn && !th->rst && !th->ack && !paws_reject &&
            (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
             (tmp_opt.saw_tstamp &&
              (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) {
                /* Place the new ISN beyond tw_snd_nxt by 65535 + 2. */
                u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
                /* NOTE(review): 0 is skipped, presumably because callers
                 * treat a zero ISN as "none chosen" - confirm.
                 */
                if (isn == 0)
                        isn++;
                *tw_isn = isn;
                return TCP_TW_SYN;
        }

        if (paws_reject)
                __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);

        if (!th->rst) {
                /* In this case we must reset the TIMEWAIT timer.
                 *
                 * If it is ACKless SYN it may be both old duplicate
                 * and new good SYN with random sequence number <rcv_nxt.
                 * Do not reschedule in the last case.
                 */
                if (paws_reject || th->ack)
                        inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);

                return tcp_timewait_check_oow_rate_limit(
                        tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
        }
        /* Out-of-window RST: dropped silently, bucket left scheduled. */
        inet_twsk_put(tw);
        return TCP_TW_SUCCESS;
}
EXPORT_SYMBOL(tcp_timewait_state_process);

/* Give the timewait bucket its own copy of the MD5 key in use on @sk.
 *
 * Only does anything with CONFIG_TCP_MD5SIG. tw_md5_key is always reset to
 * NULL first; it ends up non-NULL only if a key was found for @sk, the
 * atomic copy succeeded and a reference on the tcp_md5_needed static key
 * could be taken (in which case a sigpool reference is added as well).
 */
static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw)
{
#ifdef CONFIG_TCP_MD5SIG
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *src;

        tcptw->tw_md5_key = NULL;
        if (!static_branch_unlikely(&tcp_md5_needed.key))
                return;

        /* The bucket has no key database of its own, so look the key up on
         * the full socket and duplicate it for the timewait ACK path.
         */
        src = tp->af_specific->md5_lookup(sk, sk);
        if (!src)
                return;

        tcptw->tw_md5_key = kmemdup(src, sizeof(*src), GFP_ATOMIC);
        if (!tcptw->tw_md5_key)
                return;

        if (static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
                tcp_md5_add_sigpool();
                return;
        }

        /* Could not pin the static key: warn once and undo the copy. */
        WARN_ON_ONCE(1);
        kfree(tcptw->tw_md5_key);
        tcptw->tw_md5_key = NULL;
#endif
}

/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        struct inet_timewait_sock *tw;

        tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state);

        if (tw) {
                struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
                const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);

                tw->tw_transparent        = inet_test_bit(TRANSPARENT, sk);
                tw->tw_mark                = sk->sk_mark;
                tw->tw_priority                = READ_ONCE(sk->sk_priority);
                tw->tw_rcv_wscale        = tp->rx_opt.rcv_wscale;
                tcptw->tw_rcv_nxt        = tp->rcv_nxt;
                tcptw->tw_snd_nxt        = tp->snd_nxt;
                tcptw->tw_rcv_wnd        = tcp_receive_window(tp);
                tcptw->tw_ts_recent        = tp->rx_opt.ts_recent;
                tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
                tcptw->tw_ts_offset        = tp->tsoffset;
                tw->tw_usec_ts                = tp->tcp_usec_ts;
                tcptw->tw_last_oow_ack_time = 0;
                tcptw->tw_tx_delay        = tp->tcp_tx_delay;
                tw->tw_txhash                = sk->sk_txhash;
#if IS_ENABLED(CONFIG_IPV6)
                if (tw->tw_family == PF_INET6) {
                        struct ipv6_pinfo *np = inet6_sk(sk);

                        tw->tw_v6_daddr = sk->sk_v6_daddr;
                        tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
                        tw->tw_tclass = np->tclass;
                        tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
                        tw->tw_ipv6only = sk->sk_ipv6only;
                }
#endif

                tcp_time_wait_init(sk, tcptw);
                tcp_ao_time_wait(tcptw, tp);

                /* Get the TIME_WAIT timeout firing. */
                if (timeo < rto)
                        timeo = rto;

                if (state == TCP_TIME_WAIT)
                        timeo = TCP_TIMEWAIT_LEN;

                /* Linkage updates.
                 * Note that access to tw after this point is illegal.
                 */
                inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo);
        } else {
                /* Sorry, if we're out of memory, just CLOSE this
                 * socket up.  We've got bigger problems than
                 * non-graceful socket closings.
                 */
                NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW);
        }

        tcp_update_metrics(sk);
        tcp_done(sk);
}
EXPORT_SYMBOL(tcp_time_wait);

#ifdef CONFIG_TCP_MD5SIG
/* RCU callback: free the MD5 key that embeds @head, then drop the
 * tcp_md5_needed static key (deferred) and the sigpool reference.
 */
static void tcp_md5_twsk_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct tcp_md5sig_key, rcu));
        static_branch_slow_dec_deferred(&tcp_md5_needed);
        tcp_md5_release_sigpool();
}
#endif

/* Release the TCP specific resources of a timewait sock.
 *
 * With CONFIG_TCP_MD5SIG, a copied MD5 key (if any) is handed to
 * call_rcu() for freeing. tcp_ao_destroy_sock() is always called.
 */
void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
        if (static_branch_unlikely(&tcp_md5_needed.key)) {
                struct tcp_md5sig_key *key = tcp_twsk(sk)->tw_md5_key;

                if (key)
                        call_rcu(&key->rcu, tcp_md5_twsk_free_rcu);
        }
#endif
        tcp_ao_destroy_sock(sk, true);
}
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);

/* Purge timewait sockets for every netns on @net_exit_list.
 *
 * A netns whose hashinfo is marked pernet gets its own table purged.
 * For all the others the global tcp_hashinfo is purged, at most once per
 * call.
 */
void tcp_twsk_purge(struct list_head *net_exit_list)
{
        bool global_purged = false;
        struct net *net;

        list_for_each_entry(net, net_exit_list, exit_list) {
                struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;

                if (hinfo->pernet) {
                        /* Even if tw_refcount == 1, we must clean up kernel reqsk */
                        inet_twsk_purge(hinfo);
                        continue;
                }

                if (global_purged)
                        continue;

                inet_twsk_purge(&tcp_hashinfo);
                global_purged = true;
        }
}

/* Fill in the receive window parameters of a request socket:
 * rsk_window_clamp, rsk_rcv_wnd and the receive window scale.
 *
 * Warning: this runs without sk_listener being locked. Listener fields
 * must be read once, as their value could change under us.
 */
void tcp_openreq_init_rwin(struct request_sock *req,
                           const struct sock *sk_listener,
                           const struct dst_entry *dst)
{
        struct inet_request_sock *ireq = inet_rsk(req);
        const struct tcp_sock *tp = tcp_sk(sk_listener);
        int full_space = tcp_full_space(sk_listener);
        u32 listener_clamp;
        u32 init_wnd;
        __u8 wscale;
        int tstamp_len;
        int mss;

        mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
        listener_clamp = READ_ONCE(tp->window_clamp);

        /* Use the listener's clamp when it is non-zero, otherwise the
         * route's RTAX_WINDOW metric.
         */
        if (listener_clamp)
                req->rsk_window_clamp = listener_clamp;
        else
                req->rsk_window_clamp = dst_metric(dst, RTAX_WINDOW);

        /* A user-locked receive buffer caps the clamp at full_space. */
        if ((sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK) &&
            (req->rsk_window_clamp == 0 || req->rsk_window_clamp > full_space))
                req->rsk_window_clamp = full_space;

        /* Initial window: BPF supplied value if non-zero (growing
         * full_space to fit it if needed), else the route metric.
         */
        init_wnd = tcp_rwnd_init_bpf((struct sock *)req);
        if (init_wnd == 0) {
                init_wnd = dst_metric(dst, RTAX_INITRWND);
        } else if (full_space < init_wnd * mss) {
                full_space = init_wnd * mss;
        }

        tstamp_len = ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0;

        /* tcp_full_space because it is guaranteed to be the first packet */
        tcp_select_initial_window(sk_listener, full_space,
                                  mss - tstamp_len,
                                  &req->rsk_rcv_wnd,
                                  &req->rsk_window_clamp,
                                  ireq->wscale_ok,
                                  &wscale,
                                  init_wnd);
        ireq->rcv_wscale = wscale;
}
EXPORT_SYMBOL(tcp_openreq_init_rwin);

/* Set the child's ecn_flags from the request: TCP_ECN_OK when the request
 * has ecn_ok set, 0 otherwise.
 */
static void tcp_ecn_openreq_child(struct tcp_sock *tp,
                                  const struct request_sock *req)
{
        if (inet_rsk(req)->ecn_ok)
                tp->ecn_flags = TCP_ECN_OK;
        else
                tp->ecn_flags = 0;
}

/* Choose the congestion control ops for a freshly created child socket.
 *
 * Order of preference, as implemented below:
 *  1. the algorithm named by the route's RTAX_CC_ALGO metric, if it can be
 *     found and its module reference taken;
 *  2. the ops already present on @sk, if icsk_ca_setsockopt is set and a
 *     module reference can be taken;
 *  3. whatever tcp_assign_congestion_control() picks.
 * The CA state is then set to TCP_CA_Open.
 */
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        u32 dst_ca_key = dst_metric(dst, RTAX_CC_ALGO);
        bool from_dst = false;

        if (dst_ca_key != TCP_CA_UNSPEC) {
                const struct tcp_congestion_ops *dst_ca;

                rcu_read_lock();
                dst_ca = tcp_ca_find_key(dst_ca_key);
                if (likely(dst_ca && bpf_try_module_get(dst_ca, dst_ca->owner))) {
                        icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
                        icsk->icsk_ca_ops = dst_ca;
                        from_dst = true;
                }
                rcu_read_unlock();
        }

        /* If no valid choice made yet, assign current system default ca. */
        if (!from_dst) {
                bool keep_current;

                keep_current = icsk->icsk_ca_setsockopt &&
                               bpf_try_module_get(icsk->icsk_ca_ops,
                                                  icsk->icsk_ca_ops->owner);
                if (!keep_current)
                        tcp_assign_congestion_control(sk);
        }

        tcp_set_ca_state(sk, TCP_CA_Open);
}
EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);

/* With CONFIG_SMC and the tcp_have_smc static branch enabled, clear
 * syn_smc on the new socket when the old socket had it set but the
 * request does not have smc_ok. A no-op otherwise.
 */
static void smc_check_reset_syn_req(const struct tcp_sock *oldtp,
                                    struct request_sock *req,
                                    struct tcp_sock *newtp)
{
#if IS_ENABLED(CONFIG_SMC)
        if (!static_branch_unlikely(&tcp_have_smc))
                return;

        if (oldtp->syn_smc && !inet_rsk(req)->smc_ok)
                newtp->syn_smc = 0;
#endif
}

/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could avoid lots of memory writes here. tp of listening
 * socket contains all necessary default parameters.
 *
 * Clones @sk with inet_csk_clone_lock() and initialises the TCP state of
 * the clone from @req and from the TCP header of @skb.
 *
 * Returns the new socket, or NULL if the clone could not be created.
 */
struct sock *tcp_create_openreq_child(const struct sock *sk,
                                      struct request_sock *req,
                                      struct sk_buff *skb)
{
        struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct tcp_request_sock *treq = tcp_rsk(req);
        struct inet_connection_sock *newicsk;
        const struct tcp_sock *oldtp;
        struct tcp_sock *newtp;
        u32 seq;
#ifdef CONFIG_TCP_AO
        struct tcp_ao_key *ao_key;
#endif

        if (!newsk)
                return NULL;

        newicsk = inet_csk(newsk);
        newtp = tcp_sk(newsk);
        oldtp = tcp_sk(sk);

        smc_check_reset_syn_req(oldtp, req, newtp);

        /* Now setup tcp_sock */
        newtp->pred_flags = 0;

        /* Receive side starts one past the peer's initial sequence number. */
        seq = treq->rcv_isn + 1;
        newtp->rcv_wup = seq;
        WRITE_ONCE(newtp->copied_seq, seq);
        WRITE_ONCE(newtp->rcv_nxt, seq);
        newtp->segs_in = 1;

        /* Send side starts one past our own initial sequence number. */
        seq = treq->snt_isn + 1;
        newtp->snd_sml = newtp->snd_una = seq;
        WRITE_ONCE(newtp->snd_nxt, seq);
        newtp->snd_up = seq;

        INIT_LIST_HEAD(&newtp->tsq_node);
        INIT_LIST_HEAD(&newtp->tsorted_sent_queue);

        tcp_init_wl(newtp, treq->rcv_isn);

        minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
        newicsk->icsk_ack.lrcvtime = tcp_jiffies32;

        newtp->lsndtime = tcp_jiffies32;
        newsk->sk_txhash = READ_ONCE(treq->txhash);
        newtp->total_retrans = req->num_retrans;

        tcp_init_xmit_timers(newsk);
        WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1);

        if (sock_flag(newsk, SOCK_KEEPOPEN))
                inet_csk_reset_keepalive_timer(newsk,
                                               keepalive_time_when(newtp));

        /* Copy the options negotiated on the request. */
        newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
        newtp->rx_opt.sack_ok = ireq->sack_ok;
        newtp->window_clamp = req->rsk_window_clamp;
        newtp->rcv_ssthresh = req->rsk_rcv_wnd;
        newtp->rcv_wnd = req->rsk_rcv_wnd;
        newtp->rx_opt.wscale_ok = ireq->wscale_ok;
        if (newtp->rx_opt.wscale_ok) {
                newtp->rx_opt.snd_wscale = ireq->snd_wscale;
                newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
        } else {
                /* No window scaling: both shifts are zero and the clamp
                 * cannot exceed 65535.
                 */
                newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
                newtp->window_clamp = min(newtp->window_clamp, 65535U);
        }
        /* Send window comes from the window field of @skb's TCP header,
         * scaled by snd_wscale.
         */
        newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale;
        newtp->max_window = newtp->snd_wnd;

        if (newtp->rx_opt.tstamp_ok) {
                newtp->tcp_usec_ts = treq->req_usec_ts;
                newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent);
                newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
                newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
        } else {
                newtp->tcp_usec_ts = 0;
                newtp->rx_opt.ts_recent_stamp = 0;
                newtp->tcp_header_len = sizeof(struct tcphdr);
        }
        /* The request timed out at least once: carry the retransmission
         * accounting over to the new socket.
         */
        if (req->num_timeout) {
                newtp->total_rto = req->num_timeout;
                newtp->undo_marker = treq->snt_isn;
                if (newtp->tcp_usec_ts) {
                        newtp->retrans_stamp = treq->snt_synack;
                        newtp->total_rto_time = (u32)(tcp_clock_us() -
                                                      newtp->retrans_stamp) / USEC_PER_MSEC;
                } else {
                        newtp->retrans_stamp = div_u64(treq->snt_synack,
                                                       USEC_PER_SEC / TCP_TS_HZ);
                        newtp->total_rto_time = tcp_clock_ms() -
                                                newtp->retrans_stamp;
                }
                newtp->total_rto_recoveries = 1;
        }
        newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
        newtp->md5sig_info = NULL;        /*XXX*/
#endif
#ifdef CONFIG_TCP_AO
        newtp->ao_info = NULL;
        /* If an AO key matches the request, account for its option length
         * in the header length.
         */
        ao_key = treq->af_specific->ao_lookup(sk, req,
                                tcp_rsk(req)->ao_keyid, -1);
        if (ao_key)
                newtp->tcp_header_len += tcp_ao_len_aligned(ao_key);
 #endif
        /* Record the segment's payload size only if @skb is at least
         * TCP_MSS_DEFAULT plus the header length.
         */
        if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
                newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
        newtp->rx_opt.mss_clamp = req->mss;
        tcp_ecn_openreq_child(newtp, req);
        newtp->fastopen_req = NULL;
        RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);

        newtp->bpf_chg_cc_inprogress = 0;
        tcp_bpf_clone(sk, newsk);

        __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);

        return newsk;
}
EXPORT_SYMBOL(tcp_create_openreq_child);

/*
 * Process an incoming packet for SYN_RECV sockets represented as a
 * request_sock. Normally sk is the listener socket but for TFO it
 * points to the child socket.
 *
 * XXX (TFO) - The current impl contains a special check for ack
 * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
 *
 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
 *
 * Note: If @fastopen is true, this can be called from process context.
 *       Otherwise, this is from BH context.
 */

/* @sk:         listener socket, or the child socket when @fastopen is true
 * @skb:        the received segment
 * @req:        the request sock in SYN_RECV the segment matched
 * @fastopen:   true when handling a Fast Open request
 * @req_stolen: written on two paths only: after a child was created
 *              (set to !own_req) and on the non-fastopen reset path
 *              (set when @req could not be unlinked here)
 *
 * Returns, as implemented below:
 *   @sk   - the ACK number is invalid (non-fastopen), or the segment is an
 *           acceptable ACK for a Fast Open child;
 *   child - result of inet_csk_complete_hashdance(), or the new child
 *           directly when rsk_drop_req() requests dropping @req;
 *   NULL  - the segment was consumed or dropped here.
 */
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                           struct request_sock *req,
                           bool fastopen, bool *req_stolen)
{
        struct tcp_options_received tmp_opt;
        struct sock *child;
        const struct tcphdr *th = tcp_hdr(skb);
        /* Only the RST, SYN and ACK bits matter for the checks below. */
        __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
        bool paws_reject = false;
        bool own_req;

        tmp_opt.saw_tstamp = 0;
        /* Parse options only when the header is longer than the bare
         * TCP header.
         */
        if (th->doff > (sizeof(struct tcphdr)>>2)) {
                tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);

                if (tmp_opt.saw_tstamp) {
                        tmp_opt.ts_recent = READ_ONCE(req->ts_recent);
                        if (tmp_opt.rcv_tsecr)
                                tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
                        /* We do not store true stamp, but it is not required,
                         * it can be estimated (approximately)
                         * from another data.
                         */
                        tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ;
                        paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
                }
        }

        /* Check for pure retransmitted SYN. */
        if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
            flg == TCP_FLAG_SYN &&
            !paws_reject) {
                /*
                 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
                 * this case on figure 6 and figure 8, but formal
                 * protocol description says NOTHING.
                 * To be more exact, it says that we should send ACK,
                 * because this segment (at least, if it has no data)
                 * is out of window.
                 *
                 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
                 *  describe SYN-RECV state. All the description
                 *  is wrong, we cannot believe to it and should
                 *  rely only on common sense and implementation
                 *  experience.
                 *
                 * Enforce "SYN-ACK" according to figure 8, figure 6
                 * of RFC793, fixed by RFC1122.
                 *
                 * Note that even if there is new data in the SYN packet
                 * they will be thrown away too.
                 *
                 * Reset timer after retransmitting SYNACK, similar to
                 * the idea of fast retransmit in recovery.
                 */
                /* The SYN-ACK is resent only if not rate limited; the
                 * timer is pushed out only if the resend reported success
                 * (inet_rtx_syn_ack() returned 0).
                 */
                if (!tcp_oow_rate_limited(sock_net(sk), skb,
                                          LINUX_MIB_TCPACKSKIPPEDSYNRECV,
                                          &tcp_rsk(req)->last_oow_ack_time) &&

                    !inet_rtx_syn_ack(sk, req)) {
                        unsigned long expires = jiffies;

                        expires += reqsk_timeout(req, TCP_RTO_MAX);
                        /* For Fast Open only the expiry field is updated;
                         * otherwise the pending timer is modified.
                         */
                        if (!fastopen)
                                mod_timer_pending(&req->rsk_timer, expires);
                        else
                                req->rsk_timer.expires = expires;
                }
                return NULL;
        }

        /* Further reproduces section "SEGMENT ARRIVES"
           for state SYN-RECEIVED of RFC793.
           It is broken, however, it does not work only
           when SYNs are crossed.

           You would think that SYN crossing is impossible here, since
           we should have a SYN_SENT socket (from connect()) on our end,
           but this is not true if the crossed SYNs were sent to both
           ends by a malicious third party.  We must defend against this,
           and to do that we first verify the ACK (as per RFC793, page
           36) and reset if it is invalid.  Is this a true full defense?
           To convince ourselves, let us consider a way in which the ACK
           test can still pass in this 'malicious crossed SYNs' case.
           Malicious sender sends identical SYNs (and thus identical sequence
           numbers) to both A and B:

                A: gets SYN, seq=7
                B: gets SYN, seq=7

           By our good fortune, both A and B select the same initial
           send sequence number of seven :-)

                A: sends SYN|ACK, seq=7, ack_seq=8
                B: sends SYN|ACK, seq=7, ack_seq=8

           So we are now A eating this SYN|ACK, ACK test passes.  So
           does sequence test, SYN is truncated, and thus we consider
           it a bare ACK.

           If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
           bare ACK.  Otherwise, we create an established connection.  Both
           ends (listening sockets) accept the new incoming connection and try
           to talk to each other. 8-)

           Note: This case is both harmless, and rare.  Possibility is about the
           same as us discovering intelligent life on another planet tomorrow.

           But generally, we should (RFC lies!) to accept ACK
           from SYNACK both here and in tcp_rcv_state_process().
           tcp_rcv_state_process() does not, hence, we do not too.

           Note that the case is absolutely generic:
           we cannot optimize anything here without
           violating protocol. All the checks must be made
           before attempt to create socket.
         */

        /* RFC793 page 36: "If the connection is in any non-synchronized state ...
         *                  and the incoming segment acknowledges something not yet
         *                  sent (the segment carries an unacceptable ACK) ...
         *                  a reset is sent."
         *
         * Invalid ACK: reset will be sent by listening socket.
         * Note that the ACK validity check for a Fast Open socket is done
         * elsewhere and is checked directly against the child socket rather
         * than req because user data may have been sent out.
         */
        if ((flg & TCP_FLAG_ACK) && !fastopen &&
            (TCP_SKB_CB(skb)->ack_seq !=
             tcp_rsk(req)->snt_isn + 1))
                return sk;

        /* Also, it would be not so bad idea to check rcv_tsecr, which
         * is essentially ACK extension and too early or too late values
         * should cause reset in unsynchronized states.
         */

        /* RFC793: "first check sequence number". */

        if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq,
                                          TCP_SKB_CB(skb)->end_seq,
                                          tcp_rsk(req)->rcv_nxt,
                                          tcp_rsk(req)->rcv_nxt +
                                          tcp_synack_window(req))) {
                /* Out of window: send ACK and drop. */
                /* No ACK is sent in reply to an RST or when rate limited. */
                if (!(flg & TCP_FLAG_RST) &&
                    !tcp_oow_rate_limited(sock_net(sk), skb,
                                          LINUX_MIB_TCPACKSKIPPEDSYNRECV,
                                          &tcp_rsk(req)->last_oow_ack_time))
                        req->rsk_ops->send_ack(sk, skb, req);
                if (paws_reject)
                        NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
                return NULL;
        }

        /* In sequence, PAWS is OK. */

        /* TODO: We probably should defer ts_recent change once
         * we take ownership of @req.
         */
        if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
                WRITE_ONCE(req->ts_recent, tmp_opt.rcv_tsval);

        if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
                /* Truncate SYN, it is out of window starting
                   at tcp_rsk(req)->rcv_isn + 1. */
                flg &= ~TCP_FLAG_SYN;
        }

        /* RFC793: "second check the RST bit" and
         *           "fourth, check the SYN bit"
         */
        if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
                TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
                goto embryonic_reset;
        }

        /* ACK sequence verified above, just make sure ACK is
         * set.  If ACK not set, just silently drop the packet.
         *
         * XXX (TFO) - if we ever allow "data after SYN", the
         * following check needs to be removed.
         */
        if (!(flg & TCP_FLAG_ACK))
                return NULL;

        /* For Fast Open no more processing is needed (sk is the
         * child socket).
         */
        if (fastopen)
                return sk;

        /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
        /* "Bare" here means the segment ends at rcv_isn + 1, i.e. it
         * carries nothing beyond the SYN. The request is marked acked.
         */
        if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) &&
            TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
                inet_rsk(req)->acked = 1;
                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
                return NULL;
        }

        /* OK, ACK is valid, create big socket and
         * feed this segment to it. It will repeat all
         * the tests. THIS SEGMENT MUST MOVE SOCKET TO
         * ESTABLISHED STATE. If it will be dropped after
         * socket is created, wait for troubles.
         */
        child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
                                                         req, &own_req);
        if (!child)
                goto listen_overflow;

        /* We own @req and rsk_drop_req() asks for it to be dropped: remove
         * it from the listener's accept queue and return the child without
         * going through inet_csk_complete_hashdance().
         */
        if (own_req && rsk_drop_req(req)) {
                reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
                inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req);
                return child;
        }

        sock_rps_save_rxhash(child, skb);
        tcp_synack_rtt_meas(child, req);
        *req_stolen = !own_req;
        return inet_csk_complete_hashdance(sk, child, req, own_req);

listen_overflow:
        /* Child creation failed. @sk differing from the request's original
         * listener is counted as a migration failure.
         */
        if (sk != req->rsk_listener)
                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);

        /* Without tcp_abort_on_overflow, mark the request acked and drop
         * the segment; otherwise fall through and reset.
         */
        if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) {
                inet_rsk(req)->acked = 1;
                return NULL;
        }

embryonic_reset:
        if (!(flg & TCP_FLAG_RST)) {
                /* Received a bad SYN pkt - for TFO We try not to reset
                 * the local connection unless it's really necessary to
                 * avoid becoming vulnerable to outside attack aiming at
                 * resetting legit local connections.
                 */
                req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_INVALID_SYN);
        } else if (fastopen) { /* received a valid RST pkt */
                reqsk_fastopen_remove(sk, req, true);
                tcp_reset(sk, skb);
        }
        /* Non-fastopen: drop @req from the queue. If it was no longer
         * linked, report it to the caller through *req_stolen.
         */
        if (!fastopen) {
                bool unlinked = inet_csk_reqsk_queue_drop(sk, req);

                if (unlinked)
                        __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
                *req_stolen = !unlinked;
        }
        return NULL;
}
EXPORT_SYMBOL(tcp_check_req);

/*
 * Deliver @skb to the freshly created @child socket.
 *
 * If nobody owns the child socket, the segment is run through
 * tcp_rcv_state_process() right away; otherwise it is parked on the
 * child's backlog.  In both cases the child's bh lock is released and
 * one reference on the child is dropped before returning.
 *
 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
 * when entering. But other states are possible due to a race condition
 * where after __inet_lookup_established() fails but before the listener
 * lock is obtained, other packets cause the same connection to
 * be created.
 *
 * Returns the drop reason reported by tcp_rcv_state_process(), or
 * SKB_NOT_DROPPED_YET when the segment was queued on the backlog.
 */

enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child,
                                       struct sk_buff *skb)
        __releases(&((child)->sk_lock.slock))
{
        /* Snapshot the state on entry so a transition can be detected. */
        const int entry_state = child->sk_state;
        enum skb_drop_reason verdict = SKB_NOT_DROPPED_YET;

        /* record sk_napi_id and sk_rx_queue_mapping of child. */
        sk_mark_napi_id_set(child, skb);
        tcp_segs_in(tcp_sk(child), skb);

        if (sock_owned_by_user(child)) {
                /* The child is looked up in the main socket hash table,
                 * so the listener's lock does not keep a user context
                 * from owning it; defer the segment to the backlog.
                 */
                __sk_add_backlog(child, skb);
        } else {
                verdict = tcp_rcv_state_process(child, skb);

                /* Child left SYN_RECV: wake up the parent, send SIGIO */
                if (entry_state == TCP_SYN_RECV &&
                    child->sk_state != entry_state)
                        parent->sk_data_ready(parent);
        }

        bh_unlock_sock(child);
        sock_put(child);

        return verdict;
}
EXPORT_SYMBOL(tcp_child_process);



















    1 
    1 










    1 
    1 








    1 














    1 
    1 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Skb ref helpers.
 *
 */

#ifndef _LINUX_SKBUFF_REF_H
#define _LINUX_SKBUFF_REF_H

#include <linux/skbuff.h>

/**
 * __skb_frag_ref - take an additional reference on a paged fragment.
 * @frag: the paged fragment
 *
 * Calls get_page() on the page returned by skb_frag_page() for @frag.
 */
static inline void __skb_frag_ref(skb_frag_t *frag)
{
        struct page *page = skb_frag_page(frag);

        get_page(page);
}

/**
 * skb_frag_ref - take an addition reference on a paged fragment of an skb.
 * @skb: the buffer
 * @f: the fragment offset.
 *
 * Takes an additional reference on the @f'th paged fragment of @skb.
 */
static inline void skb_frag_ref(struct sk_buff *skb, int f)
{
        __skb_frag_ref(&skb_shinfo(skb)->frags[f]);
}

/* Defined outside this header; a true result means no put_page() is needed. */
bool napi_pp_put_page(struct page *page);

/*
 * Drop one reference on @page.
 *
 * With CONFIG_PAGE_POOL and @recycle set, the page is first offered to
 * napi_pp_put_page(); only if that returns false (or recycling was not
 * requested) is the reference released with put_page().
 */
static inline void skb_page_unref(struct page *page, bool recycle)
{
#ifdef CONFIG_PAGE_POOL
        if (recycle) {
                if (napi_pp_put_page(page))
                        return;
        }
#endif
        put_page(page);
}

/**
 * __skb_frag_unref - release a reference on a paged fragment.
 * @frag: the paged fragment
 * @recycle: recycle the page if allocated via page_pool
 *
 * Hands the page behind @frag to skb_page_unref(), which either recycles
 * it through the page_pool API or drops a plain page reference.
 */
static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
{
        struct page *page = skb_frag_page(frag);

        skb_page_unref(page, recycle);
}

/**
 * skb_frag_unref - release a reference on a paged fragment of an skb.
 * @skb: the buffer
 * @f: index of the fragment in the skb's shared-info frags[] array
 *
 * Releases a reference on the @f'th paged fragment of @skb, passing
 * skb->pp_recycle as the recycle hint.  Does nothing when
 * skb_zcopy_managed() reports true for @skb.
 */
static inline void skb_frag_unref(struct sk_buff *skb, int f)
{
        struct skb_shared_info *shinfo = skb_shinfo(skb);

        if (skb_zcopy_managed(skb))
                return;

        __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
}

#endif        /* _LINUX_SKBUFF_REF_H */






























































































    1 


    1 



    1 

















    1 

    1 







    1 


    1 


    1 



    1 





























    1 
























    1 



    1 


    1 






























    1 






    1 

    1 





    1 









    1 



    1 




























































































































































































    1 









    1 






    1 
    1 





    1 
    1 






    1 
    1 
    1 















































































































































































































































































    1 








    1 









    1 
    1 













    1 





















































    1 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001-2003 International Business Machines Corp.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 *
 * This file is part of the SCTP kernel implementation
 *
 * This module provides the abstraction for an SCTP transport representing
 * a remote transport address.  For local transport addresses, we just use
 * union sctp_addr.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson          <karl@athena.chicago.il.us>
 *    Jon Grimm             <jgrimm@us.ibm.com>
 *    Xingang Guo           <xingang.guo@intel.com>
 *    Hui Huang             <hui.huang@nokia.com>
 *    Sridhar Samudrala            <sri@us.ibm.com>
 *    Ardelle Fan            <ardelle.fan@intel.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/slab.h>
#include <linux/types.h>
#include <linux/random.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

/* 1st Level Abstractions.  */

/* Initialize a new transport from provided memory.
 *
 * @net:  network namespace supplying the defaults read here (rto_initial,
 *        max_retrans_path, pf_retrans).
 * @peer: caller-provided storage to initialize.
 * @addr: remote address; copied into peer->ipaddr.
 * @gfp:  allocation flags; not used by this function.
 *
 * Returns @peer on success.  Returns NULL when no address-family
 * operations are found for @addr's family; in that case only
 * peer->af_specific has been written.
 */
static struct sctp_transport *sctp_transport_init(struct net *net,
                                                  struct sctp_transport *peer,
                                                  const union sctp_addr *addr,
                                                  gfp_t gfp)
{
        /* Copy in the address.  The copy length comes from the af-specific
         * ops, so fail instead of dereferencing a NULL lookup result; the
         * caller already handles a NULL return from this function.
         */
        peer->af_specific = sctp_get_af_specific(addr->sa.sa_family);
        if (!peer->af_specific)
                return NULL;

        memcpy(&peer->ipaddr, addr, peer->af_specific->sockaddr_len);
        memset(&peer->saddr, 0, sizeof(union sctp_addr));

        peer->sack_generation = 0;

        /* From 6.3.1 RTO Calculation:
         *
         * C1) Until an RTT measurement has been made for a packet sent to the
         * given destination transport address, set RTO to the protocol
         * parameter 'RTO.Initial'.
         */
        peer->rto = msecs_to_jiffies(net->sctp.rto_initial);

        peer->last_time_heard = 0;
        peer->last_time_ecne_reduced = jiffies;

        peer->param_flags = SPP_HB_DISABLE |
                            SPP_PMTUD_ENABLE |
                            SPP_SACKDELAY_ENABLE;

        /* Initialize the default path max_retrans.  */
        peer->pathmaxrxt  = net->sctp.max_retrans_path;
        peer->pf_retrans  = net->sctp.pf_retrans;

        INIT_LIST_HEAD(&peer->transmitted);
        INIT_LIST_HEAD(&peer->send_ready);
        INIT_LIST_HEAD(&peer->transports);

        /* Timers are only set up here; none of them is armed yet. */
        timer_setup(&peer->T3_rtx_timer, sctp_generate_t3_rtx_event, 0);
        timer_setup(&peer->hb_timer, sctp_generate_heartbeat_event, 0);
        timer_setup(&peer->reconf_timer, sctp_generate_reconf_event, 0);
        timer_setup(&peer->probe_timer, sctp_generate_probe_event, 0);
        timer_setup(&peer->proto_unreach_timer,
                    sctp_generate_proto_unreach_event, 0);

        /* Initialize the random nonce sent with heartbeat. */
        get_random_bytes(&peer->hb_nonce, sizeof(peer->hb_nonce));

        /* The initial reference belongs to whoever asked for the transport. */
        refcount_set(&peer->refcnt, 1);

        return peer;
}

/* Allocate a zeroed transport with @gfp and initialize it for @addr.
 *
 * Returns the new transport, or NULL if either the allocation or
 * sctp_transport_init() fails (the allocation is freed in the latter case).
 */
struct sctp_transport *sctp_transport_new(struct net *net,
                                          const union sctp_addr *addr,
                                          gfp_t gfp)
{
        struct sctp_transport *transport = kzalloc(sizeof(*transport), gfp);

        if (!transport)
                return NULL;

        if (!sctp_transport_init(net, transport, addr, gfp)) {
                kfree(transport);
                return NULL;
        }

        SCTP_DBG_OBJCNT_INC(transport);

        return transport;
}

/* This transport is no longer needed.  Free it now if possible, or
 * defer the free until its last reference is dropped.
 *
 * For every timer whose del_timer() call returns nonzero, one reference
 * is released; this pairs with the sctp_transport_hold() the timer reset
 * helpers in this file take when they arm a timer.  The final put then
 * drops one more reference on behalf of the caller.
 */
void sctp_transport_free(struct sctp_transport *transport)
{
        /* Try to delete the heartbeat timer.  */
        if (del_timer(&transport->hb_timer))
                sctp_transport_put(transport);

        /* Delete the T3_rtx timer if it's active.
         * There is no point in not doing this now and letting
         * structure hang around in memory since we know
         * the transport is going away.
         */
        if (del_timer(&transport->T3_rtx_timer))
                sctp_transport_put(transport);

        /* Same treatment for the reconf timer. */
        if (del_timer(&transport->reconf_timer))
                sctp_transport_put(transport);

        /* Same treatment for the probe timer. */
        if (del_timer(&transport->probe_timer))
                sctp_transport_put(transport);

        /* Delete the ICMP proto unreachable timer if it's active. */
        if (del_timer(&transport->proto_unreach_timer))
                sctp_transport_put(transport);

        /* Drop the caller's reference; destroys the transport if it was
         * the last one.
         */
        sctp_transport_put(transport);
}

/* RCU callback queued by sctp_transport_destroy(): releases the cached
 * dst and frees the transport that embeds @head.
 */
static void sctp_transport_destroy_rcu(struct rcu_head *head)
{
        struct sctp_transport *transport =
                container_of(head, struct sctp_transport, rcu);

        dst_release(transport->dst);
        kfree(transport);
        /* NOTE(review): presumably this macro uses "transport" only as a
         * counter name and does not touch the freed pointer - confirm.
         */
        SCTP_DBG_OBJCNT_DEC(transport);
}

/* Tear down a transport whose reference count has reached zero.
 *
 * Warns and does nothing if the count is still nonzero.  Otherwise the
 * embedded packet is freed, the association reference (if any) is
 * dropped, and the final free is deferred to an RCU callback.
 */
static void sctp_transport_destroy(struct sctp_transport *transport)
{
        /* WARN() evaluates to its condition, so this both reports and
         * bails out when somebody still holds a reference.
         */
        if (WARN(refcount_read(&transport->refcnt),
                 "Attempt to destroy undead transport %p!\n", transport))
                return;

        sctp_packet_free(&transport->packet);

        if (transport->asoc)
                sctp_association_put(transport->asoc);

        call_rcu(&transport->rcu, sctp_transport_destroy_rcu);
}

/* Start the T3_rtx timer if it is not already running.
 *
 * RFC 2960 6.3.2 Retransmission Timer Rules
 *
 * R1) Every time a DATA chunk is sent to any address(including a
 * retransmission), if the T3-rtx timer of that address is not running
 * start it running so that it will expire after the RTO of that
 * address.
 */
void sctp_transport_reset_t3_rtx(struct sctp_transport *transport)
{
        if (timer_pending(&transport->T3_rtx_timer))
                return;

        /* A zero return from mod_timer() is treated as "newly armed":
         * take a reference on the transport for the timer.
         */
        if (!mod_timer(&transport->T3_rtx_timer, jiffies + transport->rto))
                sctp_transport_hold(transport);
}

/* Re-arm the heartbeat timer.
 *
 * The expiry is now + sctp_transport_timeout() plus a random offset below
 * the transport's current RTO.  When mod_timer() returns zero a reference
 * is taken on the transport for the timer.
 */
void sctp_transport_reset_hb_timer(struct sctp_transport *transport)
{
        /* When a data chunk is sent, reset the heartbeat interval.  */
        unsigned long expires = jiffies + sctp_transport_timeout(transport);

        expires += get_random_u32_below(transport->rto);

        if (mod_timer(&transport->hb_timer, expires))
                return;

        sctp_transport_hold(transport);
}

/* Start the reconf timer, one RTO from now, unless it is already pending.
 * A reference is taken on the transport when mod_timer() returns zero.
 */
void sctp_transport_reset_reconf_timer(struct sctp_transport *transport)
{
        if (timer_pending(&transport->reconf_timer))
                return;

        if (!mod_timer(&transport->reconf_timer, jiffies + transport->rto))
                sctp_transport_hold(transport);
}

/* (Re)arm the probe timer to fire one probe_interval from now.
 * A reference is taken on the transport when mod_timer() returns zero.
 */
void sctp_transport_reset_probe_timer(struct sctp_transport *transport)
{
        if (mod_timer(&transport->probe_timer,
                      jiffies + transport->probe_interval))
                return;

        sctp_transport_hold(transport);
}

/* (Re)arm the probe timer to fire 30 probe intervals from now.
 * A reference is taken on the transport when mod_timer() returns zero.
 */
void sctp_transport_reset_raise_timer(struct sctp_transport *transport)
{
        if (mod_timer(&transport->probe_timer,
                      jiffies + transport->probe_interval * 30))
                return;

        sctp_transport_hold(transport);
}

/* Attach @transport to @asoc.
 *
 * Takes a reference on the association and records it in
 * transport->asoc.
 */
void sctp_transport_set_owner(struct sctp_transport *transport,
                              struct sctp_association *asoc)
{
        sctp_association_hold(asoc);
        transport->asoc = asoc;
}

/* Initialize the pmtu of a transport. */
void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
{
        /* If we don't have a fresh route, look one up */
        if (!transport->dst || transport->dst->obsolete) {
                sctp_transport_dst_release(transport);
                transport->af_specific->get_dst(transport, &transport->saddr,
                                                &transport->fl, sk);
        }

        if (transport->param_flags & SPP_PMTUD_DISABLE) {
                struct sctp_association *asoc = transport->asoc;

                if (!transport->pathmtu && asoc && asoc->pathmtu)
                        transport->pathmtu = asoc->pathmtu;
                if (transport->pathmtu)
                        return;
        }

        if (transport->dst)
                transport->pathmtu = sctp_dst_mtu(transport->dst);
        else
                transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;

        sctp_transport_pl_update(transport);
}

/* Account for one more PLPMTUD probe being sent on @t.
 *
 * Once probe_count has reached SCTP_MAX_PROBES the counter is reset and
 * the probe state machine is advanced according to the current state.
 * In every case the state is logged and probe_count is incremented.
 */
void sctp_transport_pl_send(struct sctp_transport *t)
{
        if (t->pl.probe_count >= SCTP_MAX_PROBES) {
                t->pl.probe_count = 0;

                switch (t->pl.state) {
                case SCTP_PL_BASE:
                        if (t->pl.probe_size != SCTP_BASE_PLPMTU)
                                break;

                        /* BASE_PLPMTU Confirmation Failed: Base -> Error */
                        t->pl.state = SCTP_PL_ERROR;

                        t->pl.pmtu = SCTP_BASE_PLPMTU;
                        t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t);
                        sctp_assoc_sync_pmtu(t->asoc);
                        break;

                case SCTP_PL_SEARCH:
                        if (t->pl.pmtu != t->pl.probe_size) {
                                /* Normal probe failure. */
                                t->pl.probe_high = t->pl.probe_size;
                                t->pl.probe_size = t->pl.pmtu;
                                break;
                        }

                        /* Black Hole Detected: Search -> Base */
                        t->pl.state = SCTP_PL_BASE;
                        t->pl.probe_size = SCTP_BASE_PLPMTU;
                        t->pl.probe_high = 0;

                        t->pl.pmtu = SCTP_BASE_PLPMTU;
                        t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t);
                        sctp_assoc_sync_pmtu(t->asoc);
                        break;

                case SCTP_PL_COMPLETE:
                        if (t->pl.pmtu != t->pl.probe_size)
                                break;

                        /* Black Hole Detected: Search Complete -> Base */
                        t->pl.state = SCTP_PL_BASE;
                        t->pl.probe_size = SCTP_BASE_PLPMTU;

                        t->pl.pmtu = SCTP_BASE_PLPMTU;
                        t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t);
                        sctp_assoc_sync_pmtu(t->asoc);
                        break;

                default:
                        break;
                }
        }

        pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n",
                 __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high);
        t->pl.probe_count++;
}

bool sctp_transport_pl_recv(struct sctp_transport *t)
{
        pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n",
                 __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high);

        t->pl.pmtu = t->pl.probe_size;
        t->pl.probe_count = 0;
        if (t->pl.state == SCTP_PL_BASE) {
                t->pl.state = SCTP_PL_SEARCH; /* Base -> Search */
                t->pl.probe_size += SCTP_PL_BIG_STEP;
        } else if (t->pl.state == SCTP_PL_ERROR) {
                t->pl.state = SCTP_PL_SEARCH; /* Error -> Search */

                t->pl.pmtu = t->pl.probe_size;
                t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t);
                sctp_assoc_sync_pmtu(t->asoc);
                t->pl.probe_size += SCTP_PL_BIG_STEP;
        } else if (t->pl.state == SCTP_PL_SEARCH) {
                if (!t->pl.probe_high) {
                        if (t->pl.probe_size < SCTP_MAX_PLPMTU) {
                                t->pl.probe_size = min(t->pl.probe_size + SCTP_PL_BIG_STEP,
                                                       SCTP_MAX_PLPMTU);
                                return false;
                        }
                        t->pl.probe_high = SCTP_MAX_PLPMTU;
                }
                t->pl.probe_size += SCTP_PL_MIN_STEP;
                if (t->pl.probe_size >= t->pl.probe_high) {
                        t->pl.probe_high = 0;
                        t->pl.state = SCTP_PL_COMPLETE; /* Search -> Search Complete */

                        t->pl.probe_size = t->pl.pmtu;
                        t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t);
                        sctp_assoc_sync_pmtu(t->asoc);
                        sctp_transport_reset_raise_timer(t);
                }
        } else if (t->pl.state == SCTP_PL_COMPLETE) {
                /* Raise probe_size again after 30 * interval in Search Complete */
                t->pl.state = SCTP_PL_SEARCH; /* Search Complete -> Search */
                t->pl.probe_size = min(t->pl.probe_size + SCTP_PL_MIN_STEP, SCTP_MAX_PLPMTU);
        }

        return t->pl.state == SCTP_PL_COMPLETE;
}

/* Feed a reported "too big" MTU into the PLPMTUD state machine of @t.
 *
 * @pmtu values below SCTP_MIN_PLPMTU or not below the current probe size
 * are ignored.  Returns true only when t->pathmtu was rewritten (the
 * state fell back to Error or Base); false otherwise.
 */
static bool sctp_transport_pl_toobig(struct sctp_transport *t, u32 pmtu)
{
        pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, ptb: %d\n",
                 __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, pmtu);

        if (pmtu < SCTP_MIN_PLPMTU || pmtu >= t->pl.probe_size)
                return false;

        switch (t->pl.state) {
        case SCTP_PL_BASE:
                if (pmtu >= SCTP_MIN_PLPMTU && pmtu < SCTP_BASE_PLPMTU) {
                        /* Base -> Error */
                        t->pl.state = SCTP_PL_ERROR;

                        t->pl.pmtu = SCTP_BASE_PLPMTU;
                        t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t);
                        return true;
                }
                break;

        case SCTP_PL_SEARCH:
                if (pmtu >= SCTP_BASE_PLPMTU && pmtu < t->pl.pmtu) {
                        /* Search -> Base */
                        t->pl.state = SCTP_PL_BASE;
                        t->pl.probe_size = SCTP_BASE_PLPMTU;
                        t->pl.probe_count = 0;

                        t->pl.probe_high = 0;
                        t->pl.pmtu = SCTP_BASE_PLPMTU;
                        t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t);
                        return true;
                }

                /* Between the confirmed pmtu and the probe: probe that
                 * size next, without leaving Search.
                 */
                if (pmtu > t->pl.pmtu && pmtu < t->pl.probe_size) {
                        t->pl.probe_size = pmtu;
                        t->pl.probe_count = 0;
                }
                break;

        case SCTP_PL_COMPLETE:
                if (pmtu >= SCTP_BASE_PLPMTU && pmtu < t->pl.pmtu) {
                        /* Complete -> Base */
                        t->pl.state = SCTP_PL_BASE;
                        t->pl.probe_size = SCTP_BASE_PLPMTU;
                        t->pl.probe_count = 0;

                        t->pl.probe_high = 0;
                        t->pl.pmtu = SCTP_BASE_PLPMTU;
                        t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t);
                        sctp_transport_reset_probe_timer(t);
                        return true;
                }
                break;

        default:
                break;
        }

        return false;
}

/* Apply a newly reported path MTU to transport @t.
 *
 * @t:    transport to update; t->asoc is dereferenced to find the socket.
 * @pmtu: reported path MTU.
 *
 * Values below SCTP_DEFAULT_MINSEGMENT are raised to that minimum (with a
 * rate-limited warning) and the result is passed through SCTP_TRUNC4().
 * When sctp_transport_pl_enabled() is true, the value minus
 * sctp_transport_pl_hlen() is handed to sctp_transport_pl_toobig() and
 * its result is returned.  Otherwise the route is updated and t->pathmtu
 * is rewritten.
 *
 * Returns true when t->pathmtu changed, and also when no route was
 * available to compare against.
 */
bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
{
        struct sock *sk = t->asoc->base.sk;
        struct dst_entry *dst;
        bool change = true;

        if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
                pr_warn_ratelimited("%s: Reported pmtu %d too low, using default minimum of %d\n",
                                    __func__, pmtu, SCTP_DEFAULT_MINSEGMENT);
                /* Use default minimum segment instead */
                pmtu = SCTP_DEFAULT_MINSEGMENT;
        }
        pmtu = SCTP_TRUNC4(pmtu);

        /* With PLPMTUD enabled the probe state machine owns the path MTU. */
        if (sctp_transport_pl_enabled(t))
                return sctp_transport_pl_toobig(t, pmtu - sctp_transport_pl_hlen(t));

        dst = sctp_transport_dst_check(t);
        if (dst) {
                struct sctp_pf *pf = sctp_get_pf_specific(dst->ops->family);
                union sctp_addr addr;

                /* NOTE(review): presumably this saves the socket's address
                 * in addr, points the socket at this transport's peer for
                 * the duration of update_pmtu(), then restores it - confirm
                 * against the from_sk/to_sk_daddr implementations.
                 */
                pf->af->from_sk(&addr, sk);
                pf->to_sk_daddr(&t->ipaddr, sk);
                dst->ops->update_pmtu(dst, sk, NULL, pmtu, true);
                pf->to_sk_daddr(&addr, sk);

                /* Check the route again after the update. */
                dst = sctp_transport_dst_check(t);
        }

        /* No usable route: look one up. */
        if (!dst) {
                t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
                dst = t->dst;
        }

        if (dst) {
                /* Re-fetch, as under layers may have a higher minimum size */
                pmtu = sctp_dst_mtu(dst);
                change = t->pathmtu != pmtu;
        }
        t->pathmtu = pmtu;

        return change;
}

/* Cache the dst entry and source address for a transport's destination
 * address.
 *
 * @transport: transport to (re)route.
 * @saddr:     source address to record, or NULL to let the address
 *             family's get_saddr() choose one.
 * @opt:       SCTP socket the route lookup is made for.
 *
 * The previously cached dst is released first, and the path MTU is
 * re-initialized after the lookup.
 */
void sctp_transport_route(struct sctp_transport *transport,
                          union sctp_addr *saddr, struct sctp_sock *opt)
{
        struct sctp_association *asoc = transport->asoc;
        struct sctp_af *af = transport->af_specific;
        struct sock *sk = sctp_opt2sk(opt);

        sctp_transport_dst_release(transport);
        af->get_dst(transport, saddr, &transport->fl, sk);

        if (!saddr)
                af->get_saddr(opt, transport, &transport->fl);
        else
                memcpy(&transport->saddr, saddr, sizeof(union sctp_addr));

        sctp_transport_pmtu(transport, sk);

        if (!transport->dst || !asoc)
                return;

        /* Initialize sk->sk_rcv_saddr, if the transport is the
         * association's active path for getsockname().
         */
        if (!asoc->peer.primary_path || transport == asoc->peer.active_path)
                opt->pf->to_sk_saddr(&transport->saddr, asoc->base.sk);
}

/* Take a reference on a transport unless its count is already zero.
 * Returns 1 if the reference was taken, 0 otherwise.
 */
int sctp_transport_hold(struct sctp_transport *transport)
{
        return refcount_inc_not_zero(&transport->refcnt) ? 1 : 0;
}

/* Drop one reference on a transport; the transport is destroyed when
 * this was the last one.
 */
void sctp_transport_put(struct sctp_transport *transport)
{
        if (!refcount_dec_and_test(&transport->refcnt))
                return;

        sctp_transport_destroy(transport);
}

/* Update transport's RTO based on the newly calculated RTT.
 *
 * @tp:  transport being updated; tp->asoc is dereferenced for the
 *       rto_min/rto_max bounds (and for the netns on later measurements).
 * @rtt: the new RTT measurement.
 *
 * Recomputes srtt and rttvar, derives rto from them, clamps rto to
 * [asoc->rto_min, asoc->rto_max], records @rtt in tp->rtt and clears
 * tp->rto_pending.
 */
void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt)
{
        /* Only logged: the update below still runs in this case. */
        if (unlikely(!tp->rto_pending))
                /* We should not be doing any RTO updates unless rto_pending is set.  */
                pr_debug("%s: rto_pending not set on transport %p!\n", __func__, tp);

        /* A nonzero rttvar or srtt means an earlier measurement exists. */
        if (tp->rttvar || tp->srtt) {
                struct net *net = tp->asoc->base.net;
                /* 6.3.1 C3) When a new RTT measurement R' is made, set
                 * RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'|
                 * SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R'
                 */

                /* Note:  The above algorithm has been rewritten to
                 * express rto_beta and rto_alpha as inverse powers
                 * of two.
                 * For example, assuming the default value of RTO.Alpha of
                 * 1/8, rto_alpha would be expressed as 3.
                 */
                /* rttvar is updated first so it sees the previous srtt. */
                tp->rttvar = tp->rttvar - (tp->rttvar >> net->sctp.rto_beta)
                        + (((__u32)abs((__s64)tp->srtt - (__s64)rtt)) >> net->sctp.rto_beta);
                tp->srtt = tp->srtt - (tp->srtt >> net->sctp.rto_alpha)
                        + (rtt >> net->sctp.rto_alpha);
        } else {
                /* 6.3.1 C2) When the first RTT measurement R is made, set
                 * SRTT <- R, RTTVAR <- R/2.
                 */
                tp->srtt = rtt;
                tp->rttvar = rtt >> 1;
        }

        /* 6.3.1 G1) Whenever RTTVAR is computed, if RTTVAR = 0, then
         * adjust RTTVAR <- G, where G is the CLOCK GRANULARITY.
         */
        if (tp->rttvar == 0)
                tp->rttvar = SCTP_CLOCK_GRANULARITY;

        /* 6.3.1 C3) After the computation, update RTO <- SRTT + 4 * RTTVAR. */
        tp->rto = tp->srtt + (tp->rttvar << 2);

        /* 6.3.1 C6) Whenever RTO is computed, if it is less than RTO.Min
         * seconds then it is rounded up to RTO.Min seconds.
         */
        if (tp->rto < tp->asoc->rto_min)
                tp->rto = tp->asoc->rto_min;

        /* 6.3.1 C7) A maximum value may be placed on RTO provided it is
         * at least RTO.max seconds.
         */
        if (tp->rto > tp->asoc->rto_max)
                tp->rto = tp->asoc->rto_max;

        sctp_max_rto(tp->asoc, tp);
        tp->rtt = rtt;

        /* Reset rto_pending so that a new RTT measurement is started when a
         * new data chunk is sent.
         */
        tp->rto_pending = 0;

        pr_debug("%s: transport:%p, rtt:%d, srtt:%d rttvar:%d, rto:%ld\n",
                 __func__, tp, rtt, tp->srtt, tp->rttvar, tp->rto);
}

/* Grow the transport's cwnd and maintain partial_bytes_acked from the
 * bytes acknowledged by a received SACK.
 *
 * @transport:   transport the SACK applies to.
 * @sack_ctsn:   cumulative TSN carried by the SACK; used only to decide
 *               whether Fast Recovery can be left.
 * @bytes_acked: number of bytes newly acknowledged.
 *
 * In slow start (cwnd <= ssthresh) nothing is written back when the
 * association is in Fast Recovery or the window was not fully used.
 */
void sctp_transport_raise_cwnd(struct sctp_transport *transport,
                               __u32 sack_ctsn, __u32 bytes_acked)
{
        struct sctp_association *asoc = transport->asoc;
        __u32 cwnd = transport->cwnd;
        __u32 flight_size = transport->flight_size;
        __u32 ssthresh = transport->ssthresh;
        __u32 pba = transport->partial_bytes_acked;
        __u32 pmtu = asoc->pathmtu;

        /* See if we need to exit Fast Recovery first */
        if (asoc->fast_recovery &&
            TSN_lte(asoc->fast_recovery_exit, sack_ctsn))
                asoc->fast_recovery = 0;

        if (cwnd <= ssthresh) {
                /* RFC 4960 7.2.1: slow start.  cwnd may only grow when
                 * the sender is not in Fast Recovery and the congestion
                 * window is being fully utilized (RFC4960 Errata 3.22
                 * removed the other condition on ctsn moving).
                 */
                if (asoc->fast_recovery || flight_size < cwnd)
                        return;

                /* Grow by the lesser of the acked bytes and the path MTU;
                 * the bound protects against the ACK-Splitting attack
                 * outlined in [SAVAGE99].
                 */
                cwnd += (bytes_acked > pmtu) ? pmtu : bytes_acked;

                pr_debug("%s: slow start: transport:%p, bytes_acked:%d, "
                         "cwnd:%d, ssthresh:%d, flight_size:%d, pba:%d\n",
                         __func__, transport, bytes_acked, cwnd, ssthresh,
                         flight_size, pba);
        } else {
                /* RFC 2960 7.2.2: congestion avoidance.  Every SACK adds
                 * the newly acknowledged bytes to partial_bytes_acked
                 * (updated by RFC4960 Errata 3.22).
                 */
                pba += bytes_acked;

                if (flight_size < cwnd) {
                        /* Window was not full before this SACK: cap
                         * partial_bytes_acked at cwnd (RFC 4960 Errata
                         * 3.26).
                         */
                        if (pba > cwnd)
                                pba = cwnd;
                } else if (pba >= cwnd) {
                        /* Window was full and a cwnd's worth has been
                         * acked: consume it and grow cwnd by one MTU
                         * (RFC 4960 Errata 3.12).
                         */
                        pba -= cwnd;
                        cwnd += pmtu;
                }

                pr_debug("%s: congestion avoidance: transport:%p, "
                         "bytes_acked:%d, cwnd:%d, ssthresh:%d, "
                         "flight_size:%d, pba:%d\n", __func__,
                         transport, bytes_acked, cwnd, ssthresh,
                         flight_size, pba);
        }

        transport->cwnd = cwnd;
        transport->partial_bytes_acked = pba;
}

/* Shrink the congestion window of @transport after congestion has been
 * detected.  @reason selects which of the reductions below is applied.
 *
 * Every reason works from the same reduced window, max(cwnd/2, 4*MTU), and
 * partial_bytes_acked is cleared afterwards.  The one exception is a fast
 * retransmit that arrives while the association is already in fast
 * recovery: that call returns without changing anything.
 */
void sctp_transport_lower_cwnd(struct sctp_transport *transport,
                               enum sctp_lower_cwnd reason)
{
        struct sctp_association *asoc = transport->asoc;
        /* max(cwnd/2, 4*MTU), shared by all four reductions. */
        unsigned int reduced = max(transport->cwnd / 2, 4 * asoc->pathmtu);

        switch (reason) {
        case SCTP_LOWER_CWND_T3_RTX:
                /* RFC 2960 Section 7.2.3, sctpimpguide
                 * On expiry of the T3-rtx timer for an address, fall back
                 * to slow start:
                 *      ssthresh = max(cwnd/2, 4*MTU)
                 *      cwnd = 1*MTU
                 *      partial_bytes_acked = 0
                 */
                transport->ssthresh = reduced;
                transport->cwnd = asoc->pathmtu;

                /* A T3-rtx expiry also ends fast recovery. */
                asoc->fast_recovery = 0;
                break;

        case SCTP_LOWER_CWND_FAST_RTX:
                /* RFC 2960 7.2.4: adjust ssthresh and cwnd of the
                 * destination address(es) the missing DATA chunks were
                 * last sent to, using the formula of Section 7.2.3.
                 *
                 * RFC 2960 7.2.3, sctpimpguide: upon detection of packet
                 * losses from SACK (see Section 7.2.4):
                 *      ssthresh = max(cwnd/2, 4*MTU)
                 *      cwnd = ssthresh
                 *      partial_bytes_acked = 0
                 */
                if (asoc->fast_recovery)
                        return;

                /* Enter fast recovery and remember the TSN that ends it. */
                asoc->fast_recovery = 1;
                asoc->fast_recovery_exit = asoc->next_tsn - 1;

                transport->ssthresh = reduced;
                transport->cwnd = transport->ssthresh;
                break;

        case SCTP_LOWER_CWND_ECNE:
                /* RFC 2481 Section 6.1.2.
                 * An ECN-Echo tells the sender that congestion was met on
                 * the path to the receiver.  It is to be treated like a
                 * congestion loss in non-ECN capable TCP: the congestion
                 * window is halved and the slow start threshold reduced.
                 * Crucially, the sender must not react to congestion
                 * indications more than once per window of data (or, more
                 * loosely, more than once per round-trip time), hence the
                 * rate limit on last_time_ecne_reduced + rtt.
                 */
                if (!time_after(jiffies, transport->last_time_ecne_reduced +
                                         transport->rtt))
                        break;

                transport->ssthresh = reduced;
                transport->cwnd = transport->ssthresh;
                transport->last_time_ecne_reduced = jiffies;
                break;

        case SCTP_LOWER_CWND_INACTIVE:
                /* RFC 2960 Section 7.2.1, sctpimpguide
                 * While the endpoint transmits no data on a transport
                 * address, the cwnd of that address should be adjusted to
                 * max(cwnd/2, 4*MTU) per RTO.
                 * NOTE: the draft recommends doing this check every RTO
                 * interval; we do it every heartbeat interval instead.
                 */
                transport->cwnd = reduced;
                /* RFC 4960 Errata 3.27.2: also adjust ssthresh */
                transport->ssthresh = transport->cwnd;
                break;
        }

        transport->partial_bytes_acked = 0;

        pr_debug("%s: transport:%p, reason:%d, cwnd:%d, ssthresh:%d\n",
                 __func__, transport, reason, transport->cwnd,
                 transport->ssthresh);
}

/* Clamp the congestion window to the Max.Burst limit (sctpimpguide-05
 * 2.14.2, rule D):
 *
 *         if ((flightsize + Max.Burst * MTU) < cwnd)
 *                 cwnd = flightsize + Max.Burst * MTU
 *
 * Nothing happens when a limit is already in force on this transport or
 * when the association's max_burst is 0.  When the window is clamped, the
 * previous cwnd is parked in t->burst_limited so that
 * sctp_transport_burst_reset() can put it back.
 */
void sctp_transport_burst_limited(struct sctp_transport *t)
{
        struct sctp_association *asoc = t->asoc;
        u32 saved_cwnd = t->cwnd;
        u32 burst_cap;

        if (t->burst_limited)
                return;
        if (!asoc->max_burst)
                return;

        burst_cap = t->flight_size + (asoc->max_burst * asoc->pathmtu);
        if (burst_cap >= saved_cwnd)
                return;

        t->cwnd = burst_cap;
        t->burst_limited = saved_cwnd;
}

/* Undo sctp_transport_burst_limited(): once the burst limit has had its
 * desired effect, put back the cwnd that was saved in t->burst_limited and
 * clear the saved value.  A transport with no saved value is left alone.
 */
void sctp_transport_burst_reset(struct sctp_transport *t)
{
        if (!t->burst_limited)
                return;

        t->cwnd = t->burst_limited;
        t->burst_limited = 0;
}

/* Compute the next timeout value for this transport.
 *
 * The base is half of the transport's RTO (the original note reads
 * "RTO + timer slack +/- 50% of RTO").  The heartbeat interval is added on
 * top unless the transport is in the SCTP_UNCONFIRMED or SCTP_PF state.
 * The result is never smaller than HZ / 5.
 */
unsigned long sctp_transport_timeout(struct sctp_transport *trans)
{
        unsigned long half_rto = trans->rto >> 1;

        if (trans->state == SCTP_UNCONFIRMED || trans->state == SCTP_PF)
                return max_t(unsigned long, half_rto, HZ / 5);

        return max_t(unsigned long, half_rto + trans->hbinterval, HZ / 5);
}

/* Put a transport's congestion, RTT and bookkeeping state back to its
 * initial values.
 *
 * RFC 2960 (bis), Section 5.2.4: all the congestion control parameters
 * (e.g., cwnd, ssthresh) related to this peer MUST be reset to their
 * initial values (see Section 6.2.1).
 */
void sctp_transport_reset(struct sctp_transport *t)
{
        struct sctp_association *asoc = t->asoc;

        /* Congestion control: initial window min(4*MTU, max(2*MTU, 4380)),
         * ssthresh taken from the peer's advertised receive window.
         */
        t->ssthresh = asoc->peer.i.a_rwnd;
        t->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380));
        t->burst_limited = 0;
        t->partial_bytes_acked = 0;
        t->flight_size = 0;

        /* Retransmission timer: back to the initial RTO.  sctp_max_rto()
         * is called only after t->rto has been stored.
         */
        t->rto = asoc->rto_initial;
        sctp_max_rto(asoc, t);
        t->rto_pending = 0;

        /* No RTT samples yet. */
        t->rttvar = 0;
        t->srtt = 0;
        t->rtt = 0;

        /* Error and heartbeat bookkeeping starts from a clean slate. */
        t->error_count = 0;
        t->hb_sent = 0;

        /* State information for SFR-CACC. */
        t->cacc.cacc_saw_newack = 0;
        t->cacc.next_tsn_at_change = 0;
        t->cacc.cycling_changeover = 0;
        t->cacc.changeover_active = 0;
}

/* Retransmit on the given transport right away.
 *
 * Timer and transport reference move together: the reference is dropped
 * when a pending T3-rtx timer is deleted, and taken again when the timer
 * is armed from the inactive state.
 */
void sctp_transport_immediate_rtx(struct sctp_transport *t)
{
        /* Cancel a pending T3-rtx timer and drop the reference tied to it. */
        if (del_timer(&t->T3_rtx_timer))
                sctp_transport_put(t);

        sctp_retransmit(&t->asoc->outqueue, t, SCTP_RTXR_T3_RTX);

        /* Arm the timer only if it is not pending by now; hold the
         * transport when mod_timer() reports the timer was inactive.
         */
        if (!timer_pending(&t->T3_rtx_timer) &&
            !mod_timer(&t->T3_rtx_timer, jiffies + t->rto))
                sctp_transport_hold(t);
}

/* Drop the transport's dst.
 *
 * Hands t->dst to dst_release(), then clears both the pointer and the
 * dst_pending_confirm flag so that no stale route state stays on the
 * transport.
 */
void sctp_transport_dst_release(struct sctp_transport *t)
{
        dst_release(t->dst);
        t->dst = NULL;
        t->dst_pending_confirm = 0;
}

/* Schedule neighbour confirm.
 *
 * This only raises the dst_pending_confirm flag on the transport; the
 * confirmation itself is not performed here.  The flag is cleared again by
 * sctp_transport_dst_release().
 */
void sctp_transport_dst_confirm(struct sctp_transport *t)
{
        t->dst_pending_confirm = 1;
}





















































































































    1 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_NAMEI_H
#define _LINUX_NAMEI_H

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/path.h>
#include <linux/fcntl.h>
#include <linux/errno.h>

/* NOTE(review): presumably the cap on nested symlink depth -- confirm at the users. */
enum { MAX_NESTED_LINKS = 8 };

/* NOTE(review): presumably the cap on symlinks followed during one lookup -- confirm at the users. */
#define MAXSYMLINKS 40

/*
 * Type of the last component on LOOKUP_PARENT
 */
enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};

/*
 * LOOKUP_* flags.  Every value below is a distinct single bit, so the flags
 * can be OR-ed together.  The list is grouped by purpose rather than sorted
 * by value; bits 0x0008, 0x1000 and 0x2000 are not assigned in this header.
 */

/* pathwalk mode */
#define LOOKUP_FOLLOW                0x0001        /* follow links at the end */
#define LOOKUP_DIRECTORY        0x0002        /* require a directory */
#define LOOKUP_AUTOMOUNT        0x0004  /* force terminal automount */
#define LOOKUP_EMPTY                0x4000        /* accept empty path [user_... only] */
#define LOOKUP_DOWN                0x8000        /* follow mounts in the starting point */
#define LOOKUP_MOUNTPOINT        0x0080        /* follow mounts in the end */

#define LOOKUP_REVAL                0x0020        /* tell ->d_revalidate() to trust no cache */
#define LOOKUP_RCU                0x0040        /* RCU pathwalk mode; semi-internal */

/* These tell filesystem methods that we are dealing with the final component... */
#define LOOKUP_OPEN                0x0100        /* ... in open */
#define LOOKUP_CREATE                0x0200        /* ... in object creation */
#define LOOKUP_EXCL                0x0400        /* ... in exclusive creation */
#define LOOKUP_RENAME_TARGET        0x0800        /* ... in destination of rename() */

/* internal use only */
#define LOOKUP_PARENT                0x0010

/* Scoping flags for lookup. */
#define LOOKUP_NO_SYMLINKS        0x010000 /* No symlink crossing. */
#define LOOKUP_NO_MAGICLINKS        0x020000 /* No nd_jump_link() crossing. */
#define LOOKUP_NO_XDEV                0x040000 /* No mountpoint crossing. */
#define LOOKUP_BENEATH                0x080000 /* No escaping from starting point. */
#define LOOKUP_IN_ROOT                0x100000 /* Treat dirfd as fs root. */
#define LOOKUP_CACHED                0x200000 /* Only do cached lookup */
#define LOOKUP_LINKAT_EMPTY        0x400000 /* Linkat request with empty path. */
/* LOOKUP_* flags which do scope-related checks based on the dirfd. */
#define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT)

/*
 * Path lookup entry points.  Apart from the inline wrapper below, these are
 * declarations only; the bodies live outside this header.
 */
extern int path_pts(struct path *path);

extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty);

/*
 * Convenience wrapper: identical to user_path_at_empty() with a NULL @empty
 * argument, i.e. the caller does not ask for the "empty" result.  The return
 * value of user_path_at_empty() is passed through unchanged.
 */
static inline int user_path_at(int dfd, const char __user *name, unsigned flags,
                 struct path *path)
{
        return user_path_at_empty(dfd, name, flags, path, NULL);
}

struct dentry *lookup_one_qstr_excl(const struct qstr *name,
                                    struct dentry *base,
                                    unsigned int flags);
extern int kern_path(const char *, unsigned, struct path *);

/*
 * NOTE(review): the *_path_create() lookups look like they are meant to be
 * paired with done_path_create() -- confirm against the implementation.
 */
extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int);
extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int);
extern void done_path_create(struct path *, struct dentry *);
extern struct dentry *kern_path_locked(const char *, struct path *);
extern struct dentry *user_path_locked_at(int , const char __user *, struct path *);
int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
                           struct path *parent, struct qstr *last, int *type,
                           const struct path *root);
int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *,
                    unsigned int, struct path *);

/*
 * Lookups that return a dentry.  The lookup_one*() prototypes with named
 * parameters take (idmap, name, base, len); the unnamed
 * (const char *, struct dentry *, int) ones presumably follow the same
 * name/base/len order -- confirm against the implementation.
 */
extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int);
extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int);
struct dentry *lookup_one(struct mnt_idmap *, const char *, struct dentry *, int);
struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap,
                                   const char *name, struct dentry *base,
                                   int len);
struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
                                            const char *name,
                                            struct dentry *base, int len);

/* Mount traversal helpers operating on a struct path. */
extern int follow_down_one(struct path *);
extern int follow_down(struct path *path, unsigned int flags);
extern int follow_up(struct path *);

/* Rename locking: each lock_rename*() call has unlock_rename() as its declared counterpart. */
extern struct dentry *lock_rename(struct dentry *, struct dentry *);
extern struct dentry *lock_rename_child(struct dentry *, struct dentry *);
extern void unlock_rename(struct dentry *, struct dentry *);

/**
 * mode_strip_umask - apply the caller's umask on behalf of the VFS
 * @dir:        parent directory of the new inode
 * @mode:        mode of the new inode to be created in @dir
 *
 * Whether the VFS strips the umask usually depends on POSIX ACL support in
 * the filesystem.  Without POSIX ACLs the umask is stripped right here.
 * With POSIX ACLs the mode is returned untouched and stripping is left to
 * the filesystem's call to posix_acl_create().
 *
 * Filesystems that want the VFS to leave the mode alone without supporting
 * POSIX ACLs (NFSv4, for example) can set SB_I_NOUMASK on the superblock;
 * the mode is then returned untouched as well.
 *
 * Returns: mode
 */
static inline umode_t __must_check mode_strip_umask(const struct inode *dir, umode_t mode)
{
        /* Stripping is deferred to the filesystem. */
        if (IS_POSIXACL(dir))
                return mode;

        /* The filesystem opted out of VFS umask stripping. */
        if (dir->i_sb->s_iflags & SB_I_NOUMASK)
                return mode;

        return mode & ~current_umask();
}

/* Declaration only; marked __must_check, so callers may not ignore the result. */
extern int __must_check nd_jump_link(const struct path *path);

/*
 * NUL-terminate the buffer at @name: the terminator is written at offset
 * @len, or at offset @maxlen when @len is larger.  The buffer must therefore
 * have room for a byte at that offset.
 */
static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
{
        char *buf = name;
        size_t end = min(len, maxlen);

        buf[end] = '\0';
}

/**
 * retry_estale - determine whether the caller should retry an operation
 * @error: the error that would currently be returned
 * @flags: flags being used for next lookup attempt
 *
 * Check to see if the error code was -ESTALE, and then determine whether
 * to retry the call based on whether "flags" already has LOOKUP_REVAL set.
 *
 * Returns true if the caller should try the operation again.
 */
static inline bool
retry_estale(const long error, const unsigned int flags)
{
        return unlikely(error == -ESTALE && !(flags & LOOKUP_REVAL));
}

#endif /* _LINUX_NAMEI_H */

























































































































































    1 






    1 



    1 


    1 

































































    1 
    1 






    1 
    1 































    1 











    1 

    1 





    1 








    1 









    1 
    1 












































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001-2002 International Business Machines, Corp.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 Nokia, Inc.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 *
 * This file is part of the SCTP kernel implementation
 *
 * This abstraction represents an SCTP endpoint.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson <karl@athena.chicago.il.us>
 *    Jon Grimm <jgrimm@austin.ibm.com>
 *    Daisy Chang <daisyc@us.ibm.com>
 *    Dajiang Zhang <dajiang.zhang@nokia.com>
 */

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/in.h>
#include <linux/random.h>        /* get_random_bytes() */
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

/* Forward declaration: sctp_endpoint_init() installs this handler on the
 * endpoint's input queue, but the function is defined at the end of the file.
 */
static void sctp_endpoint_bh_rcv(struct work_struct *work);

/*
 * Set up an endpoint structure for the socket @sk.
 *
 * Copies the per-netns SCTP settings into @ep, initialises the base object
 * (one reference, input queue, bind address area), installs the SCTP
 * callbacks on @sk and takes a reference on @sk.
 *
 * Returns @ep on success.  Returns NULL when an allocation fails: the digest
 * buffer has been freed again by then (after sctp_auth_free() if it was the
 * null key that could not be created), while @ep itself stays with the
 * caller.
 */
static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
                                                struct sock *sk,
                                                gfp_t gfp)
{
        struct sctp_shared_key *null_key;
        struct net *net = sock_net(sk);

        ep->digest = kzalloc(SCTP_SIGNATURE_SIZE, gfp);
        if (!ep->digest)
                return NULL;

        /* ASCONF and AUTH switches come first: the AUTH setup depends on
         * both of them.
         */
        ep->asconf_enable = net->sctp.addip_enable;
        ep->auth_enable = net->sctp.auth_enable;
        if (ep->auth_enable) {
                if (sctp_auth_init(ep, gfp))
                        goto err_free_digest;
                if (ep->asconf_enable) {
                        sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF);
                        sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK);
                }
        }

        /* Base object: a socket-type endpoint, alive, with one reference. */
        ep->base.type = SCTP_EP_TYPE_SOCKET;
        ep->base.dead = false;
        refcount_set(&ep->base.refcnt, 1);

        /* Input queue, with sctp_endpoint_bh_rcv() as its handler. */
        sctp_inq_init(&ep->base.inqueue);
        sctp_inq_set_th_handler(&ep->base.inqueue, sctp_endpoint_bh_rcv);

        /* Bind address area and the (empty) lists hanging off the endpoint. */
        sctp_bind_addr_init(&ep->base.bind_addr, 0);
        INIT_LIST_HEAD(&ep->asocs);
        INIT_LIST_HEAD(&ep->endpoint_shared_keys);

        /* Buffer policies and remaining feature switches of this netns. */
        ep->sndbuf_policy = net->sctp.sndbuf_policy;
        ep->rcvbuf_policy = net->sctp.rcvbuf_policy;
        ep->prsctp_enable = net->sctp.prsctp_enable;
        ep->reconf_enable = net->sctp.reconf_enable;
        ep->ecn_enable = net->sctp.ecn_enable;

        /* Use SCTP specific send buffer space queues and callbacks. */
        sk->sk_data_ready = sctp_data_ready;
        sk->sk_write_space = sctp_write_space;
        sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

        /* Secret key used with the cookie. */
        get_random_bytes(ep->secret_key, sizeof(ep->secret_key));

        /* SCTP-AUTH: the shared keys list starts out holding the null key. */
        null_key = sctp_auth_shkey_create(0, gfp);
        if (!null_key)
                goto err_free_auth;
        list_add(&null_key->key_list, &ep->endpoint_shared_keys);

        /* Remember who we are attached to. */
        ep->base.sk = sk;
        ep->base.net = net;
        sock_hold(sk);

        return ep;

err_free_auth:
        sctp_auth_free(ep);
err_free_digest:
        kfree(ep->digest);
        return NULL;
}

/* Allocate an endpoint for @sk and run sctp_endpoint_init() on it.
 * Returns the new endpoint, or NULL if the allocation or the
 * initialisation fails (nothing is leaked in either case).
 */
struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, gfp_t gfp)
{
        struct sctp_endpoint *ep = kzalloc(sizeof(*ep), gfp);

        if (!ep)
                return NULL;

        if (!sctp_endpoint_init(ep, sk, gfp)) {
                kfree(ep);
                return NULL;
        }

        SCTP_DBG_OBJCNT_INC(ep);
        return ep;
}

/* Link an association into the endpoint's list of associations. */
void sctp_endpoint_add_asoc(struct sctp_endpoint *ep,
                            struct sctp_association *asoc)
{
        struct sock *sk = ep->base.sk;

        /* Temporary associations stay off the list: they are about to be
         * removed again and nobody is supposed to find them meanwhile.
         */
        if (asoc->temp)
                return;

        list_add_tail(&asoc->asocs, &ep->asocs);

        /* Only a TCP-style socket that is listening accounts the new
         * association in its accept backlog.
         */
        if (!sctp_style(sk, TCP))
                return;
        if (sctp_sstate(sk, LISTENING))
                sk_acceptq_added(sk);
}

/* Free the endpoint structure.  Delay cleanup until
 * all users have released their reference count on this structure.
 *
 * The endpoint is marked dead, its socket is moved to SCTP_SS_CLOSED and it
 * is unhashed; the actual teardown only runs once the last reference goes
 * away in sctp_endpoint_put().
 */
void sctp_endpoint_free(struct sctp_endpoint *ep)
{
        /* Must be set before the put below: sctp_endpoint_destroy() refuses
         * (with a WARN) to tear down an endpoint that is not marked dead.
         */
        ep->base.dead = true;

        inet_sk_set_state(ep->base.sk, SCTP_SS_CLOSED);

        /* Unlink this endpoint, so we can't find it again! */
        sctp_unhash_endpoint(ep);

        /* Drop one reference; if it was the last one, sctp_endpoint_put()
         * goes on to call sctp_endpoint_destroy().
         */
        sctp_endpoint_put(ep);
}

/* Last stage of endpoint destruction, run as the RCU callback queued by
 * sctp_endpoint_destroy(): detach the endpoint from its socket, drop the
 * socket reference and free the endpoint memory.
 */
static void sctp_endpoint_destroy_rcu(struct rcu_head *head)
{
        struct sctp_endpoint *ep;
        struct sock *sk;

        ep = container_of(head, struct sctp_endpoint, rcu);
        sk = ep->base.sk;

        /* The socket stops pointing at us before its reference is dropped. */
        sctp_sk(sk)->ep = NULL;
        sock_put(sk);

        kfree(ep);
        /* NOTE(review): 'ep' is passed after kfree(); presumably the macro
         * only uses the token as a counter name and never dereferences the
         * pointer -- confirm against the SCTP_DBG_OBJCNT_DEC definition.
         */
        SCTP_DBG_OBJCNT_DEC(ep);
}

/* Tear down an endpoint whose last reference has been dropped.
 *
 * Only endpoints that sctp_endpoint_free() has marked dead are torn down;
 * anything else triggers a warning and is left alone.  The endpoint memory
 * itself is released later, from the RCU callback queued at the end.
 */
static void sctp_endpoint_destroy(struct sctp_endpoint *ep)
{
        struct sock *sk = ep->base.sk;

        if (WARN(!ep->base.dead,
                 "Attempt to destroy undead endpoint %p!\n", ep))
                return;

        /* Digest buffer. */
        kfree(ep->digest);

        /* SCTP-AUTH: free up AUTH related data such as shared keys,
         * chunks and hmacs arrays that were allocated.
         */
        sctp_auth_destroy_keys(&ep->endpoint_shared_keys);
        sctp_auth_free(ep);

        /* Input queue and bind address area. */
        sctp_inq_free(&ep->base.inqueue);
        sctp_bind_addr_free(&ep->base.bind_addr);

        /* Wipe the cookie secret before the memory goes away. */
        memset(ep->secret_key, 0, sizeof(ep->secret_key));

        /* Remove and free the port, if the socket is still bound to one. */
        if (sctp_sk(sk)->bind_hash)
                sctp_put_port(sk);

        call_rcu(&ep->rcu, sctp_endpoint_destroy_rcu);
}

/* Hold a reference to an endpoint.
 *
 * Returns the result of refcount_inc_not_zero() on the endpoint's reference
 * count: non-zero when a reference was taken, zero when no reference could
 * be taken, in which case the caller must not use the endpoint.
 */
int sctp_endpoint_hold(struct sctp_endpoint *ep)
{
        return refcount_inc_not_zero(&ep->base.refcnt);
}

/* Drop one reference to an endpoint.  Whoever drops the last reference
 * also runs sctp_endpoint_destroy().
 */
void sctp_endpoint_put(struct sctp_endpoint *ep)
{
        if (!refcount_dec_and_test(&ep->base.refcnt))
                return;

        sctp_endpoint_destroy(ep);
}

/* Check whether @ep is the endpoint a lookup is after.
 *
 * The endpoint matches when it lives in @net, its socket's bound device is
 * compatible with @dif/@sdif, its bound port equals the port in @laddr and
 * @laddr is among its bound addresses.  Returns @ep on a match, otherwise
 * NULL.
 */
struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep,
                                               struct net *net,
                                               const union sctp_addr *laddr,
                                               int dif, int sdif)
{
        int bound_dev_if = READ_ONCE(ep->base.sk->sk_bound_dev_if);

        if (!net_eq(ep->base.net, net))
                return NULL;

        if (!sctp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif))
                return NULL;

        if (htons(ep->base.bind_addr.port) != laddr->v4.sin_port)
                return NULL;

        if (!sctp_bind_addr_match(&ep->base.bind_addr, laddr,
                                  sctp_sk(ep->base.sk)))
                return NULL;

        return ep;
}

/* Find the association that goes with a peer address.
 *
 * The transport is looked up first (under rcu_read_lock()); the association
 * is then taken from t->asoc.  On success the transport is stored through
 * @transport and the association is returned.  Otherwise NULL is returned
 * and *@transport is NULL.
 */
struct sctp_association *sctp_endpoint_lookup_assoc(
        const struct sctp_endpoint *ep,
        const union sctp_addr *paddr,
        struct sctp_transport **transport)
{
        struct sctp_association *found = NULL;
        struct sctp_transport *t;

        *transport = NULL;

        /* Without a local port there cannot be any associations on this
         * endpoint.
         */
        if (!ep->base.bind_addr.port)
                return NULL;

        rcu_read_lock();
        t = sctp_epaddr_lookup_transport(ep, paddr);
        if (t) {
                *transport = t;
                found = t->asoc;
        }
        rcu_read_unlock();

        return found;
}

/* Look for any peeled off association from the endpoint that matches the
 * given peer address.
 */
bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep,
                                 const union sctp_addr *paddr)
{
        int bound_dev_if = READ_ONCE(ep->base.sk->sk_bound_dev_if);
        struct sctp_sockaddr_entry *addr;
        struct net *net = ep->base.net;
        struct sctp_bind_addr *bp;

        bp = &ep->base.bind_addr;
        /* This function is called with the socket lock held,
         * so the address_list can not change.
         */
        list_for_each_entry(addr, &bp->address_list, list) {
                if (sctp_has_association(net, &addr->a, paddr,
                                         bound_dev_if, bound_dev_if))
                        return true;
        }

        return false;
}

/* Do delayed input processing.  This is scheduled by sctp_rcv().
 * This may be called on BH or task time.
 *
 * Pops every chunk off the endpoint's input queue and feeds it to the state
 * machine through sctp_do_sm().  @work is the work item embedded in the
 * endpoint's input queue (base.inqueue.immediate), which is how the endpoint
 * is recovered.  Nothing is processed when the endpoint is already dead, and
 * the loop stops early if handling a chunk detached the endpoint from its
 * socket.
 */
static void sctp_endpoint_bh_rcv(struct work_struct *work)
{
        struct sctp_endpoint *ep =
                container_of(work, struct sctp_endpoint,
                             base.inqueue.immediate);
        struct sctp_association *asoc;
        struct sock *sk;
        struct net *net;
        struct sctp_transport *transport;
        struct sctp_chunk *chunk;
        struct sctp_inq *inqueue;
        union sctp_subtype subtype;
        enum sctp_state state;
        int error = 0;
        int first_time = 1;        /* is this the first time through the loop */

        if (ep->base.dead)
                return;

        asoc = NULL;
        inqueue = &ep->base.inqueue;
        sk = ep->base.sk;
        net = sock_net(sk);

        while (NULL != (chunk = sctp_inq_pop(inqueue))) {
                subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type);

                /* If the first chunk in the packet is AUTH, do special
                 * processing specified in Section 6.3 of SCTP-AUTH spec
                 */
                if (first_time && (subtype.chunk == SCTP_CID_AUTH)) {
                        struct sctp_chunkhdr *next_hdr;

                        /* AUTH with nothing behind it is handled like any
                         * other chunk.
                         */
                        next_hdr = sctp_inq_peek(inqueue);
                        if (!next_hdr)
                                goto normal;

                        /* If the next chunk is COOKIE-ECHO, skip the AUTH
                         * chunk while saving a pointer to it so we can do
                         * Authentication later (during cookie-echo
                         * processing).
                         *
                         * This 'continue' also skips the first_time update
                         * at the bottom of the loop, so first_time is still
                         * set when the COOKIE-ECHO is processed.
                         */
                        if (next_hdr->type == SCTP_CID_COOKIE_ECHO) {
                                chunk->auth_chunk = skb_clone(chunk->skb,
                                                                GFP_ATOMIC);
                                chunk->auth = 1;
                                continue;
                        }
                }
normal:
                /* We might have grown an association since last we
                 * looked, so try again.
                 *
                 * This happens when we've just processed our
                 * COOKIE-ECHO chunk.
                 */
                if (NULL == chunk->asoc) {
                        asoc = sctp_endpoint_lookup_assoc(ep,
                                                          sctp_source(chunk),
                                                          &transport);
                        chunk->asoc = asoc;
                        chunk->transport = transport;
                }

                /* NOTE(review): 'asoc' is the local left by the most recent
                 * lookup in this function, not chunk->asoc; when chunk->asoc
                 * was already set on entry the two are presumably expected
                 * to agree -- confirm.
                 */
                state = asoc ? asoc->state : SCTP_STATE_CLOSED;
                /* Skip the chunk when sctp_auth_recv_cid() flags its type
                 * and the chunk has not been marked authenticated
                 * (presumably: the type requires AUTH -- confirm).
                 */
                if (sctp_auth_recv_cid(subtype.chunk, asoc) && !chunk->auth)
                        continue;

                /* Remember where the last DATA chunk came from so we
                 * know where to send the SACK.
                 *
                 * Everything else is counted as a control chunk; note that
                 * the else branch is also taken for a DATA chunk when there
                 * is no association.
                 */
                if (asoc && sctp_chunk_is_data(chunk))
                        asoc->peer.last_data_from = chunk->transport;
                else {
                        SCTP_INC_STATS(ep->base.net, SCTP_MIB_INCTRLCHUNKS);
                        if (asoc)
                                asoc->stats.ictrlchunks++;
                }

                if (chunk->transport)
                        chunk->transport->last_time_heard = ktime_get();

                error = sctp_do_sm(net, SCTP_EVENT_T_CHUNK, subtype, state,
                                   ep, asoc, chunk, GFP_ATOMIC);

                /* 'chunk' cannot be NULL here (the loop condition already
                 * guarantees that), so this marks the chunk for discard on
                 * any state machine error.
                 */
                if (error && chunk)
                        chunk->pdiscard = 1;

                /* Check to see if the endpoint is freed in response to
                 * the incoming chunk. If so, get out of the while loop.
                 */
                if (!sctp_sk(sk)->ep)
                        break;

                if (first_time)
                        first_time = 0;
        }
}













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





















    4 










































































































































































    6 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
// SPDX-License-Identifier: GPL-2.0

// Generated by scripts/atomic/gen-atomic-instrumented.sh
// DO NOT MODIFY THIS FILE DIRECTLY

/*
 * This file provides atomic operations with explicit instrumentation (e.g.
 * KASAN, KCSAN), which should be used unless it is necessary to avoid
 * instrumentation. Where it is necessary to avoid instrumentation, the
 * raw_atomic*() operations should be used.
 */
#ifndef _LINUX_ATOMIC_INSTRUMENTED_H
#define _LINUX_ATOMIC_INSTRUMENTED_H

#include <linux/build_bug.h>
#include <linux/compiler.h>
#include <linux/instrumented.h>

/**
 * atomic_read() - atomic load with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically loads the value of @v with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_read() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline int
atomic_read(const atomic_t *v)
{
        instrument_atomic_read(v, sizeof(*v));
        return raw_atomic_read(v);
}

/**
 * atomic_read_acquire() - atomic load with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically loads the value of @v with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_read_acquire() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline int
atomic_read_acquire(const atomic_t *v)
{
        instrument_atomic_read(v, sizeof(*v));
        return raw_atomic_read_acquire(v);
}

/**
 * atomic_set() - instrumented atomic store with relaxed ordering
 * @v: pointer to atomic_t
 * @i: int value to assign
 *
 * Atomically sets @v to @i with relaxed ordering. The write is reported
 * through instrument_atomic_write() before raw_atomic_set() runs.
 *
 * Unsafe to use in noinstr code; use raw_atomic_set() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_set(atomic_t *v, int i)
{
        /* Report the access first, then perform the raw store. */
        instrument_atomic_write(v, sizeof(atomic_t));

        raw_atomic_set(v, i);
}

/**
 * atomic_set_release() - instrumented atomic store with release ordering
 * @v: pointer to atomic_t
 * @i: int value to assign
 *
 * Atomically sets @v to @i with release ordering. kcsan_release() is
 * issued first, then the write is reported through
 * instrument_atomic_write(), and finally raw_atomic_set_release() runs.
 *
 * Unsafe to use in noinstr code; use raw_atomic_set_release() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_set_release(atomic_t *v, int i)
{
        /* Keep this order: KCSAN release, access report, raw store. */
        kcsan_release();
        instrument_atomic_write(v, sizeof(atomic_t));

        raw_atomic_set_release(v, i);
}

/**
 * atomic_add() - instrumented atomic add with relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering. The
 * read-modify-write is reported through instrument_atomic_read_write()
 * before raw_atomic_add() runs.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_add(int i, atomic_t *v)
{
        /* Report the access first, then perform the raw update. */
        instrument_atomic_read_write(v, sizeof(atomic_t));

        raw_atomic_add(i, v);
}

/**
 * atomic_add_return() - instrumented atomic add with full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with full ordering. kcsan_mb() is
 * issued first, then the read-modify-write is reported through
 * instrument_atomic_read_write(), and finally raw_atomic_add_return() runs.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int
atomic_add_return(int i, atomic_t *v)
{
        int updated;

        /* Keep this order: KCSAN barrier, access report, raw update. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_add_return(i, v);

        return updated;
}

/**
 * atomic_add_return_acquire() - instrumented atomic add with acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering. The
 * read-modify-write is reported through instrument_atomic_read_write()
 * before raw_atomic_add_return_acquire() runs.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int
atomic_add_return_acquire(int i, atomic_t *v)
{
        int updated;

        /* Report the access first, then perform the raw update. */
        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_add_return_acquire(i, v);

        return updated;
}

/**
 * atomic_add_return_release() - instrumented atomic add with release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 * kcsan_release() is issued first, then the read-modify-write is reported
 * through instrument_atomic_read_write(), and finally
 * raw_atomic_add_return_release() runs.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int
atomic_add_return_release(int i, atomic_t *v)
{
        int updated;

        /* Keep this order: KCSAN release, access report, raw update. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_add_return_release(i, v);

        return updated;
}

/**
 * atomic_add_return_relaxed() - instrumented atomic add with relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering. The
 * read-modify-write is reported through instrument_atomic_read_write()
 * before raw_atomic_add_return_relaxed() runs.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int
atomic_add_return_relaxed(int i, atomic_t *v)
{
        int updated;

        /* Report the access first, then perform the raw update. */
        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_add_return_relaxed(i, v);

        return updated;
}

/**
 * atomic_fetch_add() - atomic add returning the old value, full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v + @i) atomically, with full ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_add() is the
 * variant to use there.
 *
 * Return: The value @v held before the addition.
 */
static __always_inline int
atomic_fetch_add(int i, atomic_t *v)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_add(i, v);

        return prev;
}

/**
 * atomic_fetch_add_acquire() - atomic add returning the old value, acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v + @i) atomically, with acquire ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_add_acquire() is
 * the variant to use there.
 *
 * Return: The value @v held before the addition.
 */
static __always_inline int
atomic_fetch_add_acquire(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_add_acquire(i, v);

        return prev;
}

/**
 * atomic_fetch_add_release() - atomic add returning the old value, release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v + @i) atomically, with release ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_add_release() is
 * the variant to use there.
 *
 * Return: The value @v held before the addition.
 */
static __always_inline int
atomic_fetch_add_release(int i, atomic_t *v)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_add_release(i, v);

        return prev;
}

/**
 * atomic_fetch_add_relaxed() - atomic add returning the old value, relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v + @i) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_add_relaxed() is
 * the variant to use there.
 *
 * Return: The value @v held before the addition.
 */
static __always_inline int
atomic_fetch_add_relaxed(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_add_relaxed(i, v);

        return prev;
}

/**
 * atomic_sub() - atomic subtract, relaxed ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - @i) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_sub() is the variant to
 * use there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_sub(int i, atomic_t *v)
{
        /* Instrument the access to @v before handing off to the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));

        raw_atomic_sub(i, v);
}

/**
 * atomic_sub_return() - atomic subtract returning the new value, full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - @i) atomically, with full ordering.
 *
 * Must not be called from noinstr code; raw_atomic_sub_return() is the
 * variant to use there.
 *
 * Return: The value of @v after the subtraction.
 */
static __always_inline int
atomic_sub_return(int i, atomic_t *v)
{
        int updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_sub_return(i, v);

        return updated;
}

/**
 * atomic_sub_return_acquire() - atomic subtract returning the new value, acquire ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - @i) atomically, with acquire ordering.
 *
 * Must not be called from noinstr code; raw_atomic_sub_return_acquire() is
 * the variant to use there.
 *
 * Return: The value of @v after the subtraction.
 */
static __always_inline int
atomic_sub_return_acquire(int i, atomic_t *v)
{
        int updated;

        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_sub_return_acquire(i, v);

        return updated;
}

/**
 * atomic_sub_return_release() - atomic subtract returning the new value, release ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - @i) atomically, with release ordering.
 *
 * Must not be called from noinstr code; raw_atomic_sub_return_release() is
 * the variant to use there.
 *
 * Return: The value of @v after the subtraction.
 */
static __always_inline int
atomic_sub_return_release(int i, atomic_t *v)
{
        int updated;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_sub_return_release(i, v);

        return updated;
}

/**
 * atomic_sub_return_relaxed() - atomic subtract returning the new value, relaxed ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - @i) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_sub_return_relaxed() is
 * the variant to use there.
 *
 * Return: The value of @v after the subtraction.
 */
static __always_inline int
atomic_sub_return_relaxed(int i, atomic_t *v)
{
        int updated;

        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_sub_return_relaxed(i, v);

        return updated;
}

/**
 * atomic_fetch_sub() - atomic subtract returning the old value, full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - @i) atomically, with full ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_sub() is the
 * variant to use there.
 *
 * Return: The value @v held before the subtraction.
 */
static __always_inline int
atomic_fetch_sub(int i, atomic_t *v)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_sub(i, v);

        return prev;
}

/**
 * atomic_fetch_sub_acquire() - atomic subtract returning the old value, acquire ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - @i) atomically, with acquire ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_sub_acquire() is
 * the variant to use there.
 *
 * Return: The value @v held before the subtraction.
 */
static __always_inline int
atomic_fetch_sub_acquire(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_sub_acquire(i, v);

        return prev;
}

/**
 * atomic_fetch_sub_release() - atomic subtract returning the old value, release ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - @i) atomically, with release ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_sub_release() is
 * the variant to use there.
 *
 * Return: The value @v held before the subtraction.
 */
static __always_inline int
atomic_fetch_sub_release(int i, atomic_t *v)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_sub_release(i, v);

        return prev;
}

/**
 * atomic_fetch_sub_relaxed() - atomic subtract returning the old value, relaxed ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - @i) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_sub_relaxed() is
 * the variant to use there.
 *
 * Return: The value @v held before the subtraction.
 */
static __always_inline int
atomic_fetch_sub_relaxed(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_sub_relaxed(i, v);

        return prev;
}

/**
 * atomic_inc() - atomic increment, relaxed ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v + 1) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_inc() is the variant to
 * use there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_inc(atomic_t *v)
{
        /* Instrument the access to @v before handing off to the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));

        raw_atomic_inc(v);
}

/**
 * atomic_inc_return() - atomic increment returning the new value, full ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v + 1) atomically, with full ordering.
 *
 * Must not be called from noinstr code; raw_atomic_inc_return() is the
 * variant to use there.
 *
 * Return: The value of @v after the increment.
 */
static __always_inline int
atomic_inc_return(atomic_t *v)
{
        int updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_inc_return(v);

        return updated;
}

/**
 * atomic_inc_return_acquire() - atomic increment returning the new value, acquire ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v + 1) atomically, with acquire ordering.
 *
 * Must not be called from noinstr code; raw_atomic_inc_return_acquire() is
 * the variant to use there.
 *
 * Return: The value of @v after the increment.
 */
static __always_inline int
atomic_inc_return_acquire(atomic_t *v)
{
        int updated;

        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_inc_return_acquire(v);

        return updated;
}

/**
 * atomic_inc_return_release() - atomic increment returning the new value, release ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v + 1) atomically, with release ordering.
 *
 * Must not be called from noinstr code; raw_atomic_inc_return_release() is
 * the variant to use there.
 *
 * Return: The value of @v after the increment.
 */
static __always_inline int
atomic_inc_return_release(atomic_t *v)
{
        int updated;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_inc_return_release(v);

        return updated;
}

/**
 * atomic_inc_return_relaxed() - atomic increment returning the new value, relaxed ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v + 1) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_inc_return_relaxed() is
 * the variant to use there.
 *
 * Return: The value of @v after the increment.
 */
static __always_inline int
atomic_inc_return_relaxed(atomic_t *v)
{
        int updated;

        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_inc_return_relaxed(v);

        return updated;
}

/**
 * atomic_fetch_inc() - atomic increment returning the old value, full ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v + 1) atomically, with full ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_inc() is the
 * variant to use there.
 *
 * Return: The value @v held before the increment.
 */
static __always_inline int
atomic_fetch_inc(atomic_t *v)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_inc(v);

        return prev;
}

/**
 * atomic_fetch_inc_acquire() - atomic increment returning the old value, acquire ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v + 1) atomically, with acquire ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_inc_acquire() is
 * the variant to use there.
 *
 * Return: The value @v held before the increment.
 */
static __always_inline int
atomic_fetch_inc_acquire(atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_inc_acquire(v);

        return prev;
}

/**
 * atomic_fetch_inc_release() - atomic increment returning the old value, release ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v + 1) atomically, with release ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_inc_release() is
 * the variant to use there.
 *
 * Return: The value @v held before the increment.
 */
static __always_inline int
atomic_fetch_inc_release(atomic_t *v)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_inc_release(v);

        return prev;
}

/**
 * atomic_fetch_inc_relaxed() - atomic increment returning the old value, relaxed ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v + 1) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_inc_relaxed() is
 * the variant to use there.
 *
 * Return: The value @v held before the increment.
 */
static __always_inline int
atomic_fetch_inc_relaxed(atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_inc_relaxed(v);

        return prev;
}

/**
 * atomic_dec() - atomic decrement, relaxed ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - 1) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_dec() is the variant to
 * use there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_dec(atomic_t *v)
{
        /* Instrument the access to @v before handing off to the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));

        raw_atomic_dec(v);
}

/**
 * atomic_dec_return() - atomic decrement returning the new value, full ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - 1) atomically, with full ordering.
 *
 * Must not be called from noinstr code; raw_atomic_dec_return() is the
 * variant to use there.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline int
atomic_dec_return(atomic_t *v)
{
        int updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_dec_return(v);

        return updated;
}

/**
 * atomic_dec_return_acquire() - atomic decrement returning the new value, acquire ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - 1) atomically, with acquire ordering.
 *
 * Must not be called from noinstr code; raw_atomic_dec_return_acquire() is
 * the variant to use there.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline int
atomic_dec_return_acquire(atomic_t *v)
{
        int updated;

        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_dec_return_acquire(v);

        return updated;
}

/**
 * atomic_dec_return_release() - atomic decrement returning the new value, release ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - 1) atomically, with release ordering.
 *
 * Must not be called from noinstr code; raw_atomic_dec_return_release() is
 * the variant to use there.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline int
atomic_dec_return_release(atomic_t *v)
{
        int updated;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_dec_return_release(v);

        return updated;
}

/**
 * atomic_dec_return_relaxed() - atomic decrement returning the new value, relaxed ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - 1) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_dec_return_relaxed() is
 * the variant to use there.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline int
atomic_dec_return_relaxed(atomic_t *v)
{
        int updated;

        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_dec_return_relaxed(v);

        return updated;
}

/**
 * atomic_fetch_dec() - atomic decrement returning the old value, full ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - 1) atomically, with full ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_dec() is the
 * variant to use there.
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline int
atomic_fetch_dec(atomic_t *v)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_dec(v);

        return prev;
}

/**
 * atomic_fetch_dec_acquire() - atomic decrement returning the old value, acquire ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - 1) atomically, with acquire ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_dec_acquire() is
 * the variant to use there.
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline int
atomic_fetch_dec_acquire(atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_dec_acquire(v);

        return prev;
}

/**
 * atomic_fetch_dec_release() - atomic decrement returning the old value, release ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - 1) atomically, with release ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_dec_release() is
 * the variant to use there.
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline int
atomic_fetch_dec_release(atomic_t *v)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_dec_release(v);

        return prev;
}

/**
 * atomic_fetch_dec_relaxed() - atomic decrement returning the old value, relaxed ordering
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - 1) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_dec_relaxed() is
 * the variant to use there.
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline int
atomic_fetch_dec_relaxed(atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_dec_relaxed(v);

        return prev;
}

/**
 * atomic_and() - atomic bitwise AND, relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v & @i) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_and() is the variant to
 * use there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_and(int i, atomic_t *v)
{
        /* Instrument the access to @v before handing off to the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));

        raw_atomic_and(i, v);
}

/**
 * atomic_fetch_and() - atomic bitwise AND returning the old value, full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v & @i) atomically, with full ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_and() is the
 * variant to use there.
 *
 * Return: The value @v held before the AND.
 */
static __always_inline int
atomic_fetch_and(int i, atomic_t *v)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_and(i, v);

        return prev;
}

/**
 * atomic_fetch_and_acquire() - atomic bitwise AND returning the old value, acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v & @i) atomically, with acquire ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_and_acquire() is
 * the variant to use there.
 *
 * Return: The value @v held before the AND.
 */
static __always_inline int
atomic_fetch_and_acquire(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_and_acquire(i, v);

        return prev;
}

/**
 * atomic_fetch_and_release() - atomic bitwise AND returning the old value, release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v & @i) atomically, with release ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_and_release() is
 * the variant to use there.
 *
 * Return: The value @v held before the AND.
 */
static __always_inline int
atomic_fetch_and_release(int i, atomic_t *v)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_and_release(i, v);

        return prev;
}

/**
 * atomic_fetch_and_relaxed() - atomic bitwise AND returning the old value, relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v & @i) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_and_relaxed() is
 * the variant to use there.
 *
 * Return: The value @v held before the AND.
 */
static __always_inline int
atomic_fetch_and_relaxed(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_and_relaxed(i, v);

        return prev;
}

/**
 * atomic_andnot() - atomic bitwise AND NOT, relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v & ~@i) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_andnot() is the variant
 * to use there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_andnot(int i, atomic_t *v)
{
        /* Instrument the access to @v before handing off to the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));

        raw_atomic_andnot(i, v);
}

/**
 * atomic_fetch_andnot() - atomic bitwise AND NOT returning the old value, full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v & ~@i) atomically, with full ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_andnot() is the
 * variant to use there.
 *
 * Return: The value @v held before the AND NOT.
 */
static __always_inline int
atomic_fetch_andnot(int i, atomic_t *v)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_andnot(i, v);

        return prev;
}

/**
 * atomic_fetch_andnot_acquire() - atomic bitwise AND NOT returning the old value, acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v & ~@i) atomically, with acquire ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_andnot_acquire() is
 * the variant to use there.
 *
 * Return: The value @v held before the AND NOT.
 */
static __always_inline int
atomic_fetch_andnot_acquire(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_andnot_acquire(i, v);

        return prev;
}

/**
 * atomic_fetch_andnot_release() - atomic bitwise AND NOT returning the old value, release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v & ~@i) atomically, with release ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_andnot_release() is
 * the variant to use there.
 *
 * Return: The value @v held before the AND NOT.
 */
static __always_inline int
atomic_fetch_andnot_release(int i, atomic_t *v)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_andnot_release(i, v);

        return prev;
}

/**
 * atomic_fetch_andnot_relaxed() - atomic bitwise AND NOT returning the old value, relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v & ~@i) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_andnot_relaxed() is
 * the variant to use there.
 *
 * Return: The value @v held before the AND NOT.
 */
static __always_inline int
atomic_fetch_andnot_relaxed(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_andnot_relaxed(i, v);

        return prev;
}

/**
 * atomic_or() - atomic bitwise OR, relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v | @i) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_or() is the variant to
 * use there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_or(int i, atomic_t *v)
{
        /* Instrument the access to @v before handing off to the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));

        raw_atomic_or(i, v);
}

/**
 * atomic_fetch_or() - atomic bitwise OR returning the old value, full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v | @i) atomically, with full ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_or() is the
 * variant to use there.
 *
 * Return: The value @v held before the OR.
 */
static __always_inline int
atomic_fetch_or(int i, atomic_t *v)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_or(i, v);

        return prev;
}

/**
 * atomic_fetch_or_acquire() - atomic bitwise OR returning the old value, acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v | @i) atomically, with acquire ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_or_acquire() is
 * the variant to use there.
 *
 * Return: The value @v held before the OR.
 */
static __always_inline int
atomic_fetch_or_acquire(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_or_acquire(i, v);

        return prev;
}

/**
 * atomic_fetch_or_release() - atomic bitwise OR returning the old value, release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v | @i) atomically, with release ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_or_release() is
 * the variant to use there.
 *
 * Return: The value @v held before the OR.
 */
static __always_inline int
atomic_fetch_or_release(int i, atomic_t *v)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_or_release(i, v);

        return prev;
}

/**
 * atomic_fetch_or_relaxed() - atomic bitwise OR returning the old value, relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v | @i) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_or_relaxed() is
 * the variant to use there.
 *
 * Return: The value @v held before the OR.
 */
static __always_inline int
atomic_fetch_or_relaxed(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_or_relaxed(i, v);

        return prev;
}

/**
 * atomic_xor() - atomic bitwise XOR, relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v ^ @i) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_xor() is the variant to
 * use there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_xor(int i, atomic_t *v)
{
        /* Instrument the access to @v before handing off to the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));

        raw_atomic_xor(i, v);
}

/**
 * atomic_fetch_xor() - atomic bitwise XOR returning the old value, full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v ^ @i) atomically, with full ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_xor() is the
 * variant to use there.
 *
 * Return: The value @v held before the XOR.
 */
static __always_inline int
atomic_fetch_xor(int i, atomic_t *v)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_xor(i, v);

        return prev;
}

/**
 * atomic_fetch_xor_acquire() - atomic bitwise XOR returning the old value, acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v ^ @i) atomically, with acquire ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_xor_acquire() is
 * the variant to use there.
 *
 * Return: The value @v held before the XOR.
 */
static __always_inline int
atomic_fetch_xor_acquire(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_xor_acquire(i, v);

        return prev;
}

/**
 * atomic_fetch_xor_release() - atomic bitwise XOR returning the old value, release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v ^ @i) atomically, with release ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_xor_release() is
 * the variant to use there.
 *
 * Return: The value @v held before the XOR.
 */
static __always_inline int
atomic_fetch_xor_release(int i, atomic_t *v)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_xor_release(i, v);

        return prev;
}

/**
 * atomic_fetch_xor_relaxed() - atomic bitwise XOR returning the old value, relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v ^ @i) atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_fetch_xor_relaxed() is
 * the variant to use there.
 *
 * Return: The value @v held before the XOR.
 */
static __always_inline int
atomic_fetch_xor_relaxed(int i, atomic_t *v)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_xor_relaxed(i, v);

        return prev;
}

/**
 * atomic_xchg() - atomic exchange, full ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Stores @new into @v atomically, with full ordering.
 *
 * Must not be called from noinstr code; raw_atomic_xchg() is the variant to
 * use there.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline int
atomic_xchg(atomic_t *v, int new)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_xchg(v, new);

        return prev;
}

/**
 * atomic_xchg_acquire() - atomic exchange, acquire ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Stores @new into @v atomically, with acquire ordering.
 *
 * Must not be called from noinstr code; raw_atomic_xchg_acquire() is the
 * variant to use there.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline int
atomic_xchg_acquire(atomic_t *v, int new)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_xchg_acquire(v, new);

        return prev;
}

/**
 * atomic_xchg_release() - atomic exchange, release ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Stores @new into @v atomically, with release ordering.
 *
 * Must not be called from noinstr code; raw_atomic_xchg_release() is the
 * variant to use there.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline int
atomic_xchg_release(atomic_t *v, int new)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_xchg_release(v, new);

        return prev;
}

/**
 * atomic_xchg_relaxed() - atomic exchange, relaxed ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Stores @new into @v atomically, with relaxed ordering.
 *
 * Must not be called from noinstr code; raw_atomic_xchg_relaxed() is the
 * variant to use there.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline int
atomic_xchg_relaxed(atomic_t *v, int new)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_xchg_relaxed(v, new);

        return prev;
}

/**
 * atomic_cmpxchg() - atomic compare and exchange, full ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * When (@v == @old), @v is atomically set to @new with full ordering.
 * When the comparison fails, @v is left untouched and only relaxed ordering
 * is provided.
 *
 * Must not be called from noinstr code; raw_atomic_cmpxchg() is the variant
 * to use there.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline int
atomic_cmpxchg(atomic_t *v, int old, int new)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_cmpxchg(v, old, new);

        return prev;
}

/**
 * atomic_cmpxchg_acquire() - atomic compare and exchange, acquire ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * When (@v == @old), @v is atomically set to @new with acquire ordering.
 * When the comparison fails, @v is left untouched and only relaxed ordering
 * is provided.
 *
 * Must not be called from noinstr code; raw_atomic_cmpxchg_acquire() is the
 * variant to use there.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline int
atomic_cmpxchg_acquire(atomic_t *v, int old, int new)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_cmpxchg_acquire(v, old, new);

        return prev;
}

/**
 * atomic_cmpxchg_release() - atomic compare and exchange, release ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * When (@v == @old), @v is atomically set to @new with release ordering.
 * When the comparison fails, @v is left untouched and only relaxed ordering
 * is provided.
 *
 * Must not be called from noinstr code; raw_atomic_cmpxchg_release() is the
 * variant to use there.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline int
atomic_cmpxchg_release(atomic_t *v, int old, int new)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_cmpxchg_release(v, old, new);

        return prev;
}

/**
 * atomic_cmpxchg_relaxed() - atomic compare and exchange, relaxed ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * When (@v == @old), @v is atomically set to @new with relaxed ordering.
 * When the comparison fails, @v is left untouched and only relaxed ordering
 * is provided.
 *
 * Must not be called from noinstr code; raw_atomic_cmpxchg_relaxed() is the
 * variant to use there.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline int
atomic_cmpxchg_relaxed(atomic_t *v, int old, int new)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_cmpxchg_relaxed(v, old, new);

        return prev;
}

/**
 * atomic_try_cmpxchg() - atomic compare and exchange, full ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * When (@v == @old), @v is atomically set to @new with full ordering.
 * When the comparison fails, @v is left untouched, @old is overwritten with
 * the current value of @v, and only relaxed ordering is provided.
 *
 * Must not be called from noinstr code; raw_atomic_try_cmpxchg() is the
 * variant to use there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
        bool swapped;

        kcsan_mb();
        /* Both @v and @old are instrumented as read-write accesses. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_try_cmpxchg(v, old, new);

        return swapped;
}

/**
 * atomic_try_cmpxchg_acquire() - atomic compare and exchange, acquire ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * When (@v == @old), @v is atomically set to @new with acquire ordering.
 * When the comparison fails, @v is left untouched, @old is overwritten with
 * the current value of @v, and only relaxed ordering is provided.
 *
 * Must not be called from noinstr code; raw_atomic_try_cmpxchg_acquire() is
 * the variant to use there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
{
        bool swapped;

        /* Both @v and @old are instrumented as read-write accesses. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_try_cmpxchg_acquire(v, old, new);

        return swapped;
}

/**
 * atomic_try_cmpxchg_release() - atomic compare and exchange, release ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * When (@v == @old), @v is atomically set to @new with release ordering.
 * When the comparison fails, @v is left untouched, @old is overwritten with
 * the current value of @v, and only relaxed ordering is provided.
 *
 * Must not be called from noinstr code; raw_atomic_try_cmpxchg_release() is
 * the variant to use there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
{
        bool swapped;

        kcsan_release();
        /* Both @v and @old are instrumented as read-write accesses. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_try_cmpxchg_release(v, old, new);

        return swapped;
}

/**
 * atomic_try_cmpxchg_relaxed() - atomic compare and exchange, relaxed ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * When (@v == @old), @v is atomically set to @new with relaxed ordering.
 * When the comparison fails, @v is left untouched, @old is overwritten with
 * the current value of @v, and only relaxed ordering is provided.
 *
 * Must not be called from noinstr code; raw_atomic_try_cmpxchg_relaxed() is
 * the variant to use there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
{
        bool swapped;

        /* Both @v and @old are instrumented as read-write accesses. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_try_cmpxchg_relaxed(v, old, new);

        return swapped;
}

/**
 * atomic_sub_and_test() - atomic subtract, then test for zero, full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Performs @v = (@v - @i) atomically, with full ordering.
 *
 * Must not be called from noinstr code; raw_atomic_sub_and_test() is the
 * variant to use there.
 *
 * Return: @true if @v is zero after the subtraction, @false otherwise.
 */
static __always_inline bool
atomic_sub_and_test(int i, atomic_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_sub_and_test(i, v);

        return is_zero;
}

/**
 * atomic_dec_and_test() - fully ordered atomic decrement, then test for zero
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1), with full ordering.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic_dec_and_test().
 *
 * Unsafe to use in noinstr code; use raw_atomic_dec_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic_dec_and_test(atomic_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_dec_and_test(v);

        return is_zero;
}

/**
 * atomic_inc_and_test() - fully ordered atomic increment, then test for zero
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1), with full ordering.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic_inc_and_test().
 *
 * Unsafe to use in noinstr code; use raw_atomic_inc_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic_inc_and_test(atomic_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_inc_and_test(v);

        return is_zero;
}

/**
 * atomic_add_negative() - fully ordered atomic add, then test for negative
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i), with full ordering.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic_add_negative().
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_negative() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_add_negative(int i, atomic_t *v)
{
        bool is_negative;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_add_negative(i, v);

        return is_negative;
}

/**
 * atomic_add_negative_acquire() - acquire-ordered atomic add, then test for negative
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i), with acquire ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic_add_negative_acquire().
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_negative_acquire() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_add_negative_acquire(int i, atomic_t *v)
{
        bool is_negative;

        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_add_negative_acquire(i, v);

        return is_negative;
}

/**
 * atomic_add_negative_release() - release-ordered atomic add, then test for negative
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i), with release ordering.
 *
 * As the instrumented wrapper, this calls kcsan_release() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic_add_negative_release().
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_negative_release() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_add_negative_release(int i, atomic_t *v)
{
        bool is_negative;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_add_negative_release(i, v);

        return is_negative;
}

/**
 * atomic_add_negative_relaxed() - relaxed atomic add, then test for negative
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i), with relaxed ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic_add_negative_relaxed().
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_negative_relaxed() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_add_negative_relaxed(int i, atomic_t *v)
{
        bool is_negative;

        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_add_negative_relaxed(i, v);

        return is_negative;
}

/**
 * atomic_fetch_add_unless() - fully ordered conditional atomic add returning the prior value
 * @v: pointer to atomic_t
 * @a: int value to add
 * @u: int value to compare with
 *
 * Provided @v differs from @u, @v is atomically replaced by (@v + @a) with
 * full ordering. When @v equals @u it is not modified and only relaxed
 * ordering is provided.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic_fetch_add_unless().
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_add_unless() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int
atomic_fetch_add_unless(atomic_t *v, int a, int u)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_fetch_add_unless(v, a, u);

        return prev;
}

/**
 * atomic_add_unless() - fully ordered conditional atomic add
 * @v: pointer to atomic_t
 * @a: int value to add
 * @u: int value to compare with
 *
 * Provided @v differs from @u, @v is atomically replaced by (@v + @a) with
 * full ordering. When @v equals @u it is not modified and only relaxed
 * ordering is provided.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic_add_unless().
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_unless() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_add_unless(atomic_t *v, int a, int u)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_add_unless(v, a, u);

        return updated;
}

/**
 * atomic_inc_not_zero() - fully ordered atomic increment, skipped when zero
 * @v: pointer to atomic_t
 *
 * Provided @v is non-zero, @v is atomically replaced by (@v + 1) with full
 * ordering. When @v is zero it is not modified and only relaxed ordering is
 * provided.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic_inc_not_zero().
 *
 * Unsafe to use in noinstr code; use raw_atomic_inc_not_zero() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_inc_not_zero(atomic_t *v)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_inc_not_zero(v);

        return updated;
}

/**
 * atomic_inc_unless_negative() - fully ordered atomic increment, skipped when negative
 * @v: pointer to atomic_t
 *
 * Provided (@v >= 0), @v is atomically replaced by (@v + 1) with full
 * ordering. Otherwise @v is not modified and only relaxed ordering is
 * provided.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic_inc_unless_negative().
 *
 * Unsafe to use in noinstr code; use raw_atomic_inc_unless_negative() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_inc_unless_negative(atomic_t *v)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_inc_unless_negative(v);

        return updated;
}

/**
 * atomic_dec_unless_positive() - fully ordered atomic decrement, skipped when positive
 * @v: pointer to atomic_t
 *
 * Provided (@v <= 0), @v is atomically replaced by (@v - 1) with full
 * ordering. Otherwise @v is not modified and only relaxed ordering is
 * provided.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic_dec_unless_positive().
 *
 * Unsafe to use in noinstr code; use raw_atomic_dec_unless_positive() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_dec_unless_positive(atomic_t *v)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_dec_unless_positive(v);

        return updated;
}

/**
 * atomic_dec_if_positive() - fully ordered atomic decrement, performed only when positive
 * @v: pointer to atomic_t
 *
 * Provided (@v > 0), @v is atomically replaced by (@v - 1) with full
 * ordering. Otherwise @v is not modified and only relaxed ordering is
 * provided.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic_dec_if_positive().
 *
 * Unsafe to use in noinstr code; use raw_atomic_dec_if_positive() there.
 *
 * Return: The old value of (@v - 1), regardless of whether @v was updated.
 */
static __always_inline int
atomic_dec_if_positive(atomic_t *v)
{
        int dec;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        dec = raw_atomic_dec_if_positive(v);

        return dec;
}

/**
 * atomic64_read() - relaxed atomic load of an atomic64_t
 * @v: pointer to atomic64_t
 *
 * Atomically loads the value of @v with relaxed ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read() on @v and
 * then hands off to raw_atomic64_read().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_read() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64
atomic64_read(const atomic64_t *v)
{
        s64 val;

        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic64_read(v);

        return val;
}

/**
 * atomic64_read_acquire() - acquire-ordered atomic load of an atomic64_t
 * @v: pointer to atomic64_t
 *
 * Atomically loads the value of @v with acquire ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read() on @v and
 * then hands off to raw_atomic64_read_acquire().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_read_acquire() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64
atomic64_read_acquire(const atomic64_t *v)
{
        s64 val;

        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic64_read_acquire(v);

        return val;
}

/**
 * atomic64_set() - atomic set with relaxed ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to assign
 *
 * Atomically sets @v to @i with relaxed ordering.
 *
 * This is the instrumented wrapper; it calls instrument_atomic_write() on @v
 * and then raw_atomic64_set().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_set() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_set(atomic64_t *v, s64 i)
{
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic64_set(v, i);
}

/**
 * atomic64_set_release() - atomic set with release ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to assign
 *
 * Atomically sets @v to @i with release ordering.
 *
 * This is the instrumented wrapper; it calls kcsan_release() and
 * instrument_atomic_write() on @v, and then raw_atomic64_set_release().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_set_release() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_set_release(atomic64_t *v, s64 i)
{
        kcsan_release();
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic64_set_release(v, i);
}

/**
 * atomic64_add() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * This is the instrumented wrapper; it calls instrument_atomic_read_write()
 * on @v and then raw_atomic64_add().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_add(s64 i, atomic64_t *v)
{
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_add(i, v);
}

/**
 * atomic64_add_return() - fully ordered atomic add returning the new value
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + @i), with full ordering.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_add_return().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_add_return(s64 i, atomic64_t *v)
{
        s64 result;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_add_return(i, v);

        return result;
}

/**
 * atomic64_add_return_acquire() - acquire-ordered atomic add returning the new value
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + @i), with acquire ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_add_return_acquire().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_add_return_acquire(s64 i, atomic64_t *v)
{
        s64 result;

        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_add_return_acquire(i, v);

        return result;
}

/**
 * atomic64_add_return_release() - release-ordered atomic add returning the new value
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + @i), with release ordering.
 *
 * As the instrumented wrapper, this calls kcsan_release() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_add_return_release().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_add_return_release(s64 i, atomic64_t *v)
{
        s64 result;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_add_return_release(i, v);

        return result;
}

/**
 * atomic64_add_return_relaxed() - relaxed atomic add returning the new value
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + @i), with relaxed ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_add_return_relaxed().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_add_return_relaxed(s64 i, atomic64_t *v)
{
        s64 result;

        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_add_return_relaxed(i, v);

        return result;
}

/**
 * atomic64_fetch_add() - fully ordered atomic add returning the prior value
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + @i), with full ordering.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_fetch_add().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_add(s64 i, atomic64_t *v)
{
        s64 prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_add(i, v);

        return prev;
}

/**
 * atomic64_fetch_add_acquire() - acquire-ordered atomic add returning the prior value
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + @i), with acquire ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_fetch_add_acquire().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_add_acquire(s64 i, atomic64_t *v)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_add_acquire(i, v);

        return prev;
}

/**
 * atomic64_fetch_add_release() - release-ordered atomic add returning the prior value
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + @i), with release ordering.
 *
 * As the instrumented wrapper, this calls kcsan_release() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_fetch_add_release().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_add_release(s64 i, atomic64_t *v)
{
        s64 prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_add_release(i, v);

        return prev;
}

/**
 * atomic64_fetch_add_relaxed() - relaxed atomic add returning the prior value
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + @i), with relaxed ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_fetch_add_relaxed().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_add_relaxed(s64 i, atomic64_t *v)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_add_relaxed(i, v);

        return prev;
}

/**
 * atomic64_sub() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * This is the instrumented wrapper; it calls instrument_atomic_read_write()
 * on @v and then raw_atomic64_sub().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_sub(s64 i, atomic64_t *v)
{
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_sub(i, v);
}

/**
 * atomic64_sub_return() - fully ordered atomic subtract returning the new value
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - @i), with full ordering.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_sub_return().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_sub_return(s64 i, atomic64_t *v)
{
        s64 result;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_sub_return(i, v);

        return result;
}

/**
 * atomic64_sub_return_acquire() - acquire-ordered atomic subtract returning the new value
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - @i), with acquire ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_sub_return_acquire().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_sub_return_acquire(s64 i, atomic64_t *v)
{
        s64 result;

        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_sub_return_acquire(i, v);

        return result;
}

/**
 * atomic64_sub_return_release() - release-ordered atomic subtract returning the new value
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - @i), with release ordering.
 *
 * As the instrumented wrapper, this calls kcsan_release() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_sub_return_release().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_sub_return_release(s64 i, atomic64_t *v)
{
        s64 result;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_sub_return_release(i, v);

        return result;
}

/**
 * atomic64_sub_return_relaxed() - relaxed atomic subtract returning the new value
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - @i), with relaxed ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_sub_return_relaxed().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_sub_return_relaxed(s64 i, atomic64_t *v)
{
        s64 result;

        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_sub_return_relaxed(i, v);

        return result;
}

/**
 * atomic64_fetch_sub() - fully ordered atomic subtract returning the prior value
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - @i), with full ordering.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_fetch_sub().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_sub(s64 i, atomic64_t *v)
{
        s64 prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_sub(i, v);

        return prev;
}

/**
 * atomic64_fetch_sub_acquire() - acquire-ordered atomic subtract returning the prior value
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - @i), with acquire ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_fetch_sub_acquire().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_sub_acquire(s64 i, atomic64_t *v)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_sub_acquire(i, v);

        return prev;
}

/**
 * atomic64_fetch_sub_release() - release-ordered atomic subtract returning the prior value
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - @i), with release ordering.
 *
 * As the instrumented wrapper, this calls kcsan_release() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_fetch_sub_release().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_sub_release(s64 i, atomic64_t *v)
{
        s64 prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_sub_release(i, v);

        return prev;
}

/**
 * atomic64_fetch_sub_relaxed() - relaxed atomic subtract returning the prior value
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - @i), with relaxed ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_fetch_sub_relaxed().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_sub_relaxed(i, v);

        return prev;
}

/**
 * atomic64_inc() - atomic increment with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * This is the instrumented wrapper; it calls instrument_atomic_read_write()
 * on @v and then raw_atomic64_inc().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_inc(atomic64_t *v)
{
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_inc(v);
}

/**
 * atomic64_inc_return() - fully ordered atomic increment returning the new value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + 1), with full ordering.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_inc_return().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_inc_return(atomic64_t *v)
{
        s64 result;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_inc_return(v);

        return result;
}

/**
 * atomic64_inc_return_acquire() - acquire-ordered atomic increment returning the new value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + 1), with acquire ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_inc_return_acquire().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_inc_return_acquire(atomic64_t *v)
{
        s64 result;

        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_inc_return_acquire(v);

        return result;
}

/**
 * atomic64_inc_return_release() - release-ordered atomic increment returning the new value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + 1), with release ordering.
 *
 * As the instrumented wrapper, this calls kcsan_release() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_inc_return_release().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_inc_return_release(atomic64_t *v)
{
        s64 result;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_inc_return_release(v);

        return result;
}

/**
 * atomic64_inc_return_relaxed() - relaxed atomic increment returning the new value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + 1), with relaxed ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_inc_return_relaxed().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_inc_return_relaxed(atomic64_t *v)
{
        s64 result;

        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_inc_return_relaxed(v);

        return result;
}

/**
 * atomic64_fetch_inc() - fully ordered atomic increment returning the prior value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + 1), with full ordering.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_fetch_inc().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_inc(atomic64_t *v)
{
        s64 prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_inc(v);

        return prev;
}

/**
 * atomic64_fetch_inc_acquire() - acquire-ordered atomic increment returning the prior value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + 1), with acquire ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_fetch_inc_acquire().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_inc_acquire(atomic64_t *v)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_inc_acquire(v);

        return prev;
}

/**
 * atomic64_fetch_inc_release() - release-ordered atomic increment returning the prior value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + 1), with release ordering.
 *
 * As the instrumented wrapper, this calls kcsan_release() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_fetch_inc_release().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_inc_release(atomic64_t *v)
{
        s64 prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_inc_release(v);

        return prev;
}

/**
 * atomic64_fetch_inc_relaxed() - relaxed atomic increment returning the prior value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + 1), with relaxed ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_fetch_inc_relaxed().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_inc_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_inc_relaxed(atomic64_t *v)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_inc_relaxed(v);

        return prev;
}

/**
 * atomic64_dec() - atomic decrement with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * This is the instrumented wrapper; it calls instrument_atomic_read_write()
 * on @v and then raw_atomic64_dec().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_dec(atomic64_t *v)
{
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_dec(v);
}

/**
 * atomic64_dec_return() - fully ordered atomic decrement returning the new value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), with full ordering.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_dec_return().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_dec_return(atomic64_t *v)
{
        s64 result;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_dec_return(v);

        return result;
}

/**
 * atomic64_dec_return_acquire() - acquire-ordered atomic decrement returning the new value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), with acquire ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_dec_return_acquire().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_dec_return_acquire(atomic64_t *v)
{
        s64 result;

        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_dec_return_acquire(v);

        return result;
}

/**
 * atomic64_dec_return_release() - release-ordered atomic decrement returning the new value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), with release ordering.
 *
 * As the instrumented wrapper, this calls kcsan_release() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_dec_return_release().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_dec_return_release(atomic64_t *v)
{
        s64 result;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_dec_return_release(v);

        return result;
}

/**
 * atomic64_dec_return_relaxed() - relaxed atomic decrement returning the new value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), with relaxed ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_dec_return_relaxed().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
atomic64_dec_return_relaxed(atomic64_t *v)
{
        s64 result;

        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_dec_return_relaxed(v);

        return result;
}

/**
 * atomic64_fetch_dec() - fully ordered atomic decrement returning the prior value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), with full ordering.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_fetch_dec().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_dec(atomic64_t *v)
{
        s64 prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_dec(v);

        return prev;
}

/**
 * atomic64_fetch_dec_acquire() - acquire-ordered atomic decrement returning the prior value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), with acquire ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_fetch_dec_acquire().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_dec_acquire(atomic64_t *v)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_dec_acquire(v);

        return prev;
}

/**
 * atomic64_fetch_dec_release() - release-ordered atomic decrement returning the prior value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), with release ordering.
 *
 * As the instrumented wrapper, this calls kcsan_release() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_fetch_dec_release().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_dec_release(atomic64_t *v)
{
        s64 prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_dec_release(v);

        return prev;
}

/**
 * atomic64_fetch_dec_relaxed() - relaxed atomic decrement returning the prior value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), with relaxed ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_fetch_dec_relaxed().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_dec_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_dec_relaxed(atomic64_t *v)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_dec_relaxed(v);

        return prev;
}

/**
 * atomic64_and() - atomic bitwise AND with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering.
 *
 * This is the instrumented wrapper; it calls instrument_atomic_read_write()
 * on @v and then raw_atomic64_and().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_and() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_and(s64 i, atomic64_t *v)
{
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_and(i, v);
}

/**
 * atomic64_fetch_and() - fully ordered atomic bitwise AND returning the prior value
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & @i), with full ordering.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_fetch_and().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_and() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_and(s64 i, atomic64_t *v)
{
        s64 prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_and(i, v);

        return prev;
}

/**
 * atomic64_fetch_and_acquire() - acquire-ordered atomic bitwise AND returning the prior value
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & @i), with acquire ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_fetch_and_acquire().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_and_acquire(s64 i, atomic64_t *v)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_and_acquire(i, v);

        return prev;
}

/**
 * atomic64_fetch_and_release() - release-ordered atomic bitwise AND returning the prior value
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & @i), with release ordering.
 *
 * As the instrumented wrapper, this calls kcsan_release() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_fetch_and_release().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_and_release(s64 i, atomic64_t *v)
{
        s64 prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_and_release(i, v);

        return prev;
}

/**
 * atomic64_fetch_and_relaxed() - relaxed atomic bitwise AND returning the prior value
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & @i), with relaxed ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_fetch_and_relaxed().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_and_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_and_relaxed(s64 i, atomic64_t *v)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_and_relaxed(i, v);

        return prev;
}

/**
 * atomic64_andnot() - atomic bitwise AND NOT with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & ~@i) with relaxed ordering.
 *
 * This is the instrumented wrapper; it calls instrument_atomic_read_write()
 * on @v and then raw_atomic64_andnot().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_andnot() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_andnot(s64 i, atomic64_t *v)
{
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_andnot(i, v);
}

/**
 * atomic64_fetch_andnot() - fully ordered atomic bitwise AND NOT returning the prior value
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & ~@i), with full ordering.
 *
 * As the instrumented wrapper, this calls kcsan_mb() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_fetch_andnot().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_andnot(s64 i, atomic64_t *v)
{
        s64 prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_andnot(i, v);

        return prev;
}

/**
 * atomic64_fetch_andnot_acquire() - acquire-ordered atomic bitwise AND NOT returning the prior value
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & ~@i), with acquire ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_fetch_andnot_acquire().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_andnot_acquire(i, v);

        return prev;
}

/**
 * atomic64_fetch_andnot_release() - release-ordered atomic bitwise AND NOT returning the prior value
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & ~@i), with release ordering.
 *
 * As the instrumented wrapper, this calls kcsan_release() and
 * instrument_atomic_read_write() on @v, then hands off to
 * raw_atomic64_fetch_andnot_release().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_andnot_release(s64 i, atomic64_t *v)
{
        s64 prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_andnot_release(i, v);

        return prev;
}

/**
 * atomic64_fetch_andnot_relaxed() - relaxed atomic bitwise AND NOT returning the prior value
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & ~@i), with relaxed ordering.
 *
 * As the instrumented wrapper, this calls instrument_atomic_read_write() on
 * @v and then hands off to raw_atomic64_fetch_andnot_relaxed().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_andnot_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_andnot_relaxed(i, v);

        return prev;
}

/**
 * atomic64_or() - atomic bitwise OR with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering.
 *
 * This is the instrumented wrapper; it calls instrument_atomic_read_write()
 * on @v and then raw_atomic64_or().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_or() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_or(s64 i, atomic64_t *v)
{
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_or(i, v);
}

/**
 * atomic64_fetch_or() - atomic bitwise OR with full ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_or() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_or(s64 i, atomic64_t *v)
{
        s64 orig;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_or(i, v);

        return orig;
}

/**
 * atomic64_fetch_or_acquire() - atomic bitwise OR with acquire ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_or_acquire(s64 i, atomic64_t *v)
{
        s64 orig;

        /* Instrument the read-write of @v, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_or_acquire(i, v);

        return orig;
}

/**
 * atomic64_fetch_or_release() - atomic bitwise OR with release ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_or_release(s64 i, atomic64_t *v)
{
        s64 orig;

        /* Release ordering: issue kcsan_release() ahead of the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_or_release(i, v);

        return orig;
}

/**
 * atomic64_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_or_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_or_relaxed(s64 i, atomic64_t *v)
{
        s64 orig;

        /* Instrument the read-write of @v, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_or_relaxed(i, v);

        return orig;
}

/**
 * atomic64_xor() - atomic bitwise XOR with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_xor() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_xor(s64 i, atomic64_t *v)
{
        /* Instrument the read-write of @v before handing off to the raw XOR. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_xor(i, v);
}

/**
 * atomic64_fetch_xor() - atomic bitwise XOR with full ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_xor(s64 i, atomic64_t *v)
{
        s64 orig;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_xor(i, v);

        return orig;
}

/**
 * atomic64_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_xor_acquire(s64 i, atomic64_t *v)
{
        s64 orig;

        /* Instrument the read-write of @v, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_xor_acquire(i, v);

        return orig;
}

/**
 * atomic64_fetch_xor_release() - atomic bitwise XOR with release ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_xor_release(s64 i, atomic64_t *v)
{
        s64 orig;

        /* Release ordering: issue kcsan_release() ahead of the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_xor_release(i, v);

        return orig;
}

/**
 * atomic64_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_xor_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v)
{
        s64 orig;

        /* Instrument the read-write of @v, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_xor_relaxed(i, v);

        return orig;
}

/**
 * atomic64_xchg() - atomic exchange with full ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically updates @v to @new with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_xchg() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_xchg(atomic64_t *v, s64 new)
{
        s64 orig;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_xchg(v, new);

        return orig;
}

/**
 * atomic64_xchg_acquire() - atomic exchange with acquire ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically updates @v to @new with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_xchg_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_xchg_acquire(atomic64_t *v, s64 new)
{
        s64 orig;

        /* Instrument the read-write of @v, then run the raw exchange. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_xchg_acquire(v, new);

        return orig;
}

/**
 * atomic64_xchg_release() - atomic exchange with release ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically updates @v to @new with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_xchg_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_xchg_release(atomic64_t *v, s64 new)
{
        s64 orig;

        /* Release ordering: issue kcsan_release() ahead of the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_xchg_release(v, new);

        return orig;
}

/**
 * atomic64_xchg_relaxed() - atomic exchange with relaxed ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically updates @v to @new with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_xchg_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_xchg_relaxed(atomic64_t *v, s64 new)
{
        s64 orig;

        /* Instrument the read-write of @v, then run the raw exchange. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_xchg_relaxed(v, new);

        return orig;
}

/**
 * atomic64_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
        s64 orig;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_cmpxchg(v, old, new);

        return orig;
}

/**
 * atomic64_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
{
        s64 orig;

        /* Instrument the read-write of @v, then run the raw compare-exchange. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_cmpxchg_acquire(v, old, new);

        return orig;
}

/**
 * atomic64_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new)
{
        s64 orig;

        /* Release ordering: issue kcsan_release() ahead of the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_cmpxchg_release(v, old, new);

        return orig;
}

/**
 * atomic64_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new)
{
        s64 orig;

        /* Instrument the read-write of @v, then run the raw compare-exchange. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_cmpxchg_relaxed(v, old, new);

        return orig;
}

/**
 * atomic64_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic64_t
 * @old: pointer to s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with full ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented accesses. */
        kcsan_mb();
        /* Both @v and the caller's @old slot are instrumented as read-write. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic64_try_cmpxchg(v, old, new);

        return swapped;
}

/**
 * atomic64_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic64_t
 * @old: pointer to s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_acquire() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        /* Both @v and the caller's @old slot are instrumented as read-write. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic64_try_cmpxchg_acquire(v, old, new);

        return swapped;
}

/**
 * atomic64_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic64_t
 * @old: pointer to s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_release() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        /* Release ordering: issue kcsan_release() ahead of the instrumented accesses. */
        kcsan_release();
        /* Both @v and the caller's @old slot are instrumented as read-write. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic64_try_cmpxchg_release(v, old, new);

        return swapped;
}

/**
 * atomic64_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic64_t
 * @old: pointer to s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_relaxed() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        /* Both @v and the caller's @old slot are instrumented as read-write. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic64_try_cmpxchg_relaxed(v, old, new);

        return swapped;
}

/**
 * atomic64_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic64_sub_and_test(s64 i, atomic64_t *v)
{
        bool is_zero;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic64_sub_and_test(i, v);

        return is_zero;
}

/**
 * atomic64_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic64_dec_and_test(atomic64_t *v)
{
        bool is_zero;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic64_dec_and_test(v);

        return is_zero;
}

/**
 * atomic64_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic64_inc_and_test(atomic64_t *v)
{
        bool is_zero;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic64_inc_and_test(v);

        return is_zero;
}

/**
 * atomic64_add_negative() - atomic add and test if negative with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_negative() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic64_add_negative(s64 i, atomic64_t *v)
{
        bool is_negative;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic64_add_negative(i, v);

        return is_negative;
}

/**
 * atomic64_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_negative_acquire() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic64_add_negative_acquire(s64 i, atomic64_t *v)
{
        bool is_negative;

        /* Instrument the read-write of @v, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic64_add_negative_acquire(i, v);

        return is_negative;
}

/**
 * atomic64_add_negative_release() - atomic add and test if negative with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_negative_release() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic64_add_negative_release(s64 i, atomic64_t *v)
{
        bool is_negative;

        /* Release ordering: issue kcsan_release() ahead of the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic64_add_negative_release(i, v);

        return is_negative;
}

/**
 * atomic64_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_negative_relaxed() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
{
        bool is_negative;

        /* Instrument the read-write of @v, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic64_add_negative_relaxed(i, v);

        return is_negative;
}

/**
 * atomic64_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic64_t
 * @a: s64 value to add
 * @u: s64 value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_unless() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
{
        s64 orig;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_add_unless(v, a, u);

        return orig;
}

/**
 * atomic64_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic64_t
 * @a: s64 value to add
 * @u: s64 value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_unless() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
{
        bool updated;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_add_unless(v, a, u);

        return updated;
}

/**
 * atomic64_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic64_t
 *
 * If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_not_zero() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic64_inc_not_zero(atomic64_t *v)
{
        bool updated;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_inc_not_zero(v);

        return updated;
}

/**
 * atomic64_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic64_t
 *
 * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc_unless_negative() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic64_inc_unless_negative(atomic64_t *v)
{
        bool updated;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_inc_unless_negative(v);

        return updated;
}

/**
 * atomic64_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic64_t
 *
 * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_unless_positive() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic64_dec_unless_positive(atomic64_t *v)
{
        bool updated;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_dec_unless_positive(v);

        return updated;
}

/**
 * atomic64_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic64_t
 *
 * If (@v > 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec_if_positive() there.
 *
 * Return: The old value of (@v - 1), regardless of whether @v was updated.
 */
static __always_inline s64
atomic64_dec_if_positive(atomic64_t *v)
{
        s64 ret;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic64_dec_if_positive(v);

        return ret;
}

/**
 * atomic_long_read() - atomic load with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically loads the value of @v with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_read() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline long
atomic_long_read(const atomic_long_t *v)
{
        long val;

        /* Instrument the load of @v, then perform the raw read. */
        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic_long_read(v);

        return val;
}

/**
 * atomic_long_read_acquire() - atomic load with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically loads the value of @v with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_read_acquire() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline long
atomic_long_read_acquire(const atomic_long_t *v)
{
        long val;

        /* Instrument the load of @v, then perform the raw acquire read. */
        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic_long_read_acquire(v);

        return val;
}

/**
 * atomic_long_set() - atomic set with relaxed ordering
 * @v: pointer to atomic_long_t
 * @i: long value to assign
 *
 * Atomically sets @v to @i with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_set() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_set(atomic_long_t *v, long i)
{
        /* Instrument the store to @v before handing off to the raw set. */
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic_long_set(v, i);
}

/**
 * atomic_long_set_release() - atomic set with release ordering
 * @v: pointer to atomic_long_t
 * @i: long value to assign
 *
 * Atomically sets @v to @i with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_set_release() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_set_release(atomic_long_t *v, long i)
{
        /* Release ordering: kcsan_release() is issued ahead of the instrumented store. */
        kcsan_release();
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic_long_set_release(v, i);
}

/**
 * atomic_long_add() - atomic add with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_add(long i, atomic_long_t *v)
{
        /* Instrument the read-write of @v before handing off to the raw add. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_add(i, v);
}

/**
 * atomic_long_add_return() - atomic add with full ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_add_return(long i, atomic_long_t *v)
{
        long ret;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_long_add_return(i, v);

        return ret;
}

/**
 * atomic_long_add_return_acquire() - atomic add with acquire ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_add_return_acquire(long i, atomic_long_t *v)
{
        long ret;

        /* Instrument the read-write of @v, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_long_add_return_acquire(i, v);

        return ret;
}

/**
 * atomic_long_add_return_release() - atomic add with release ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_add_return_release(long i, atomic_long_t *v)
{
        long ret;

        /* Release ordering: issue kcsan_release() ahead of the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_long_add_return_release(i, v);

        return ret;
}

/**
 * atomic_long_add_return_relaxed() - atomic add with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_add_return_relaxed(long i, atomic_long_t *v)
{
        long ret;

        /* Instrument the read-write of @v, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_long_add_return_relaxed(i, v);

        return ret;
}

/**
 * atomic_long_fetch_add() - atomic add with full ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add(long i, atomic_long_t *v)
{
        long orig;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_long_fetch_add(i, v);

        return orig;
}

/**
 * atomic_long_fetch_add_acquire() - atomic add with acquire ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add_acquire(long i, atomic_long_t *v)
{
        long orig;

        /* Instrument the read-write of @v, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_long_fetch_add_acquire(i, v);

        return orig;
}

/**
 * atomic_long_fetch_add_release() - atomic add with release ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add_release(long i, atomic_long_t *v)
{
        long orig;

        /* Release ordering: issue kcsan_release() ahead of the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_long_fetch_add_release(i, v);

        return orig;
}

/**
 * atomic_long_fetch_add_relaxed() - atomic add with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add_relaxed(long i, atomic_long_t *v)
{
        long orig;

        /* Instrument the read-write of @v, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_long_fetch_add_relaxed(i, v);

        return orig;
}

/**
 * atomic_long_sub() - atomic subtract with relaxed ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_sub() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_sub(long i, atomic_long_t *v)
{
        /* Instrument the read-write of @v before handing off to the raw subtract. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_sub(i, v);
}

/**
 * atomic_long_sub_return() - atomic subtract with full ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_sub_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_sub_return(long i, atomic_long_t *v)
{
        long ret;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_long_sub_return(i, v);

        return ret;
}

/**
 * atomic_long_sub_return_acquire() - atomic subtract with acquire ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_sub_return_acquire(long i, atomic_long_t *v)
{
        long ret;

        /* Instrument the read-write of @v, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_long_sub_return_acquire(i, v);

        return ret;
}

/**
 * atomic_long_sub_return_release() - atomic subtract with release ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_sub_return_release(long i, atomic_long_t *v)
{
        long ret;

        /* Release ordering: issue kcsan_release() ahead of the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_long_sub_return_release(i, v);

        return ret;
}

/**
 * atomic_long_sub_return_relaxed() - atomic subtract with relaxed ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_sub_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_sub_return_relaxed(long i, atomic_long_t *v)
{
        long ret;

        /* Instrument the read-write of @v, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_long_sub_return_relaxed(i, v);

        return ret;
}

/**
 * atomic_long_fetch_sub() - atomic subtract with full ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_sub(long i, atomic_long_t *v)
{
        long orig;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_long_fetch_sub(i, v);

        return orig;
}

/**
 * atomic_long_fetch_sub_acquire() - atomic subtract with acquire ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_sub_acquire(long i, atomic_long_t *v)
{
        long orig;

        /* Instrument the read-write of @v, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_long_fetch_sub_acquire(i, v);

        return orig;
}

/**
 * atomic_long_fetch_sub_release() - atomic subtract with release ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_sub_release(long i, atomic_long_t *v)
{
        long orig;

        /* Release ordering: issue kcsan_release() ahead of the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_long_fetch_sub_release(i, v);

        return orig;
}

/**
 * atomic_long_fetch_sub_relaxed() - atomic subtract with relaxed ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_sub_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v)
{
        long orig;

        /* Instrument the read-write of @v, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_long_fetch_sub_relaxed(i, v);

        return orig;
}

/**
 * atomic_long_inc() - atomic increment with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_inc() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_inc(atomic_long_t *v)
{
        /* Instrument the read-write of @v before handing off to the raw increment. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_inc(v);
}

/**
 * atomic_long_inc_return() - atomic increment with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_inc_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_inc_return(atomic_long_t *v)
{
        long ret;

        /* Full ordering: issue kcsan_mb() ahead of the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_long_inc_return(v);

        return ret;
}

/**
 * atomic_long_inc_return_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_inc_return_acquire(atomic_long_t *v)
{
        long result;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_inc_return_acquire(v);

        return result;
}

/**
 * atomic_long_inc_return_release() - atomic increment with release ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_inc_return_release(atomic_long_t *v)
{
        long result;

        /* Release ordering: KCSAN release first, then report the RMW on @v. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_inc_return_release(v);

        return result;
}

/**
 * atomic_long_inc_return_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_inc_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_inc_return_relaxed(atomic_long_t *v)
{
        long result;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_inc_return_relaxed(v);

        return result;
}

/**
 * atomic_long_fetch_inc() - atomic increment with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_inc(atomic_long_t *v)
{
        long prev;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_inc(v);

        return prev;
}

/**
 * atomic_long_fetch_inc_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_inc_acquire(atomic_long_t *v)
{
        long prev;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_inc_acquire(v);

        return prev;
}

/**
 * atomic_long_fetch_inc_release() - atomic increment with release ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_inc_release(atomic_long_t *v)
{
        long prev;

        /* Release ordering: KCSAN release first, then report the RMW on @v. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_inc_release(v);

        return prev;
}

/**
 * atomic_long_fetch_inc_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_inc_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_inc_relaxed(atomic_long_t *v)
{
        long prev;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_inc_relaxed(v);

        return prev;
}

/**
 * atomic_long_dec() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_dec() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_dec(atomic_long_t *v)
{
        /* Report the RMW access on @v, then defer to the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_dec(v);
}

/**
 * atomic_long_dec_return() - atomic decrement with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_dec_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_dec_return(atomic_long_t *v)
{
        long result;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_dec_return(v);

        return result;
}

/**
 * atomic_long_dec_return_acquire() - atomic decrement with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_dec_return_acquire(atomic_long_t *v)
{
        long result;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_dec_return_acquire(v);

        return result;
}

/**
 * atomic_long_dec_return_release() - atomic decrement with release ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_dec_return_release(atomic_long_t *v)
{
        long result;

        /* Release ordering: KCSAN release first, then report the RMW on @v. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_dec_return_release(v);

        return result;
}

/**
 * atomic_long_dec_return_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_dec_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_dec_return_relaxed(atomic_long_t *v)
{
        long result;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_dec_return_relaxed(v);

        return result;
}

/**
 * atomic_long_fetch_dec() - atomic decrement with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_dec(atomic_long_t *v)
{
        long prev;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_dec(v);

        return prev;
}

/**
 * atomic_long_fetch_dec_acquire() - atomic decrement with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_dec_acquire(atomic_long_t *v)
{
        long prev;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_dec_acquire(v);

        return prev;
}

/**
 * atomic_long_fetch_dec_release() - atomic decrement with release ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_dec_release(atomic_long_t *v)
{
        long prev;

        /* Release ordering: KCSAN release first, then report the RMW on @v. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_dec_release(v);

        return prev;
}

/**
 * atomic_long_fetch_dec_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_dec_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_dec_relaxed(atomic_long_t *v)
{
        long prev;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_dec_relaxed(v);

        return prev;
}

/**
 * atomic_long_and() - atomic bitwise AND with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_and() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_and(long i, atomic_long_t *v)
{
        /* Report the RMW access on @v, then defer to the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_and(i, v);
}

/**
 * atomic_long_fetch_and() - atomic bitwise AND with full ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v & @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_and(long i, atomic_long_t *v)
{
        long prev;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_and(i, v);

        return prev;
}

/**
 * atomic_long_fetch_and_acquire() - atomic bitwise AND with acquire ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v & @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_and_acquire(long i, atomic_long_t *v)
{
        long prev;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_and_acquire(i, v);

        return prev;
}

/**
 * atomic_long_fetch_and_release() - atomic bitwise AND with release ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v & @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_and_release(long i, atomic_long_t *v)
{
        long prev;

        /* Release ordering: KCSAN release first, then report the RMW on @v. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_and_release(i, v);

        return prev;
}

/**
 * atomic_long_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_and_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_and_relaxed(long i, atomic_long_t *v)
{
        long prev;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_and_relaxed(i, v);

        return prev;
}

/**
 * atomic_long_andnot() - atomic bitwise AND NOT with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v & ~@i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_andnot() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_andnot(long i, atomic_long_t *v)
{
        /* Report the RMW access on @v, then defer to the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_andnot(i, v);
}

/**
 * atomic_long_fetch_andnot() - atomic bitwise AND NOT with full ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v & ~@i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_andnot(long i, atomic_long_t *v)
{
        long prev;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_andnot(i, v);

        return prev;
}

/**
 * atomic_long_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v & ~@i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v)
{
        long prev;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_andnot_acquire(i, v);

        return prev;
}

/**
 * atomic_long_fetch_andnot_release() - atomic bitwise AND NOT with release ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v & ~@i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_andnot_release(long i, atomic_long_t *v)
{
        long prev;

        /* Release ordering: KCSAN release first, then report the RMW on @v. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_andnot_release(i, v);

        return prev;
}

/**
 * atomic_long_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v & ~@i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_andnot_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v)
{
        long prev;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_andnot_relaxed(i, v);

        return prev;
}

/**
 * atomic_long_or() - atomic bitwise OR with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_or() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_or(long i, atomic_long_t *v)
{
        /* Report the RMW access on @v, then defer to the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_or(i, v);
}

/**
 * atomic_long_fetch_or() - atomic bitwise OR with full ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v | @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_or(long i, atomic_long_t *v)
{
        long prev;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_or(i, v);

        return prev;
}

/**
 * atomic_long_fetch_or_acquire() - atomic bitwise OR with acquire ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v | @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_or_acquire(long i, atomic_long_t *v)
{
        long prev;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_or_acquire(i, v);

        return prev;
}

/**
 * atomic_long_fetch_or_release() - atomic bitwise OR with release ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v | @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_or_release(long i, atomic_long_t *v)
{
        long prev;

        /* Release ordering: KCSAN release first, then report the RMW on @v. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_or_release(i, v);

        return prev;
}

/**
 * atomic_long_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_or_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_or_relaxed(long i, atomic_long_t *v)
{
        long prev;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_or_relaxed(i, v);

        return prev;
}

/**
 * atomic_long_xor() - atomic bitwise XOR with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_xor() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_xor(long i, atomic_long_t *v)
{
        /* Report the RMW access on @v, then defer to the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_xor(i, v);
}

/**
 * atomic_long_fetch_xor() - atomic bitwise XOR with full ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v ^ @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_xor(long i, atomic_long_t *v)
{
        long prev;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_xor(i, v);

        return prev;
}

/**
 * atomic_long_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v ^ @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_xor_acquire(long i, atomic_long_t *v)
{
        long prev;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_xor_acquire(i, v);

        return prev;
}

/**
 * atomic_long_fetch_xor_release() - atomic bitwise XOR with release ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v ^ @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_xor_release(long i, atomic_long_t *v)
{
        long prev;

        /* Release ordering: KCSAN release first, then report the RMW on @v. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_xor_release(i, v);

        return prev;
}

/**
 * atomic_long_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_xor_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v)
{
        long prev;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_xor_relaxed(i, v);

        return prev;
}

/**
 * atomic_long_xchg() - atomic exchange with full ordering
 * @v: pointer to atomic_long_t
 * @new: long value to assign
 *
 * Atomically updates @v to @new with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_xchg() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_xchg(atomic_long_t *v, long new)
{
        long prev;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_xchg(v, new);

        return prev;
}

/**
 * atomic_long_xchg_acquire() - atomic exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @new: long value to assign
 *
 * Atomically updates @v to @new with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_xchg_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_xchg_acquire(atomic_long_t *v, long new)
{
        long prev;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_xchg_acquire(v, new);

        return prev;
}

/**
 * atomic_long_xchg_release() - atomic exchange with release ordering
 * @v: pointer to atomic_long_t
 * @new: long value to assign
 *
 * Atomically updates @v to @new with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_xchg_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_xchg_release(atomic_long_t *v, long new)
{
        long prev;

        /* Release ordering: KCSAN release first, then report the RMW on @v. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_xchg_release(v, new);

        return prev;
}

/**
 * atomic_long_xchg_relaxed() - atomic exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @new: long value to assign
 *
 * Atomically updates @v to @new with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_xchg_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_xchg_relaxed(atomic_long_t *v, long new)
{
        long prev;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_xchg_relaxed(v, new);

        return prev;
}

/**
 * atomic_long_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_cmpxchg(atomic_long_t *v, long old, long new)
{
        long prev;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_cmpxchg(v, old, new);

        return prev;
}

/**
 * atomic_long_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new)
{
        long prev;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_cmpxchg_acquire(v, old, new);

        return prev;
}

/**
 * atomic_long_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new)
{
        long prev;

        /* Release ordering: KCSAN release first, then report the RMW on @v. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_cmpxchg_release(v, old, new);

        return prev;
}

/**
 * atomic_long_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new)
{
        long prev;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_cmpxchg_relaxed(v, old, new);

        return prev;
}

/**
 * atomic_long_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with full ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        /* @old is written back on failure, so report it as an RMW as well. */
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_long_try_cmpxchg(v, old, new);

        return swapped;
}

/**
 * atomic_long_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_acquire() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        /* @old is written back on failure, so report it as an RMW as well. */
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_long_try_cmpxchg_acquire(v, old, new);

        return swapped;
}

/**
 * atomic_long_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_release() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        /* Release ordering: KCSAN release first, then report the RMW on @v. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        /* @old is written back on failure, so report it as an RMW as well. */
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_long_try_cmpxchg_release(v, old, new);

        return swapped;
}

/**
 * atomic_long_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_relaxed() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        /* @old is written back on failure, so report it as an RMW as well. */
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_long_try_cmpxchg_relaxed(v, old, new);

        return swapped;
}

/**
 * atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_sub_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic_long_sub_and_test(long i, atomic_long_t *v)
{
        bool is_zero;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_long_sub_and_test(i, v);

        return is_zero;
}

/**
 * atomic_long_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_dec_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic_long_dec_and_test(atomic_long_t *v)
{
        bool is_zero;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_long_dec_and_test(v);

        return is_zero;
}

/**
 * atomic_long_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_inc_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic_long_inc_and_test(atomic_long_t *v)
{
        bool is_zero;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_long_inc_and_test(v);

        return is_zero;
}

/**
 * atomic_long_add_negative() - atomic add and test if negative with full ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_negative() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_long_add_negative(long i, atomic_long_t *v)
{
        bool is_negative;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_long_add_negative(i, v);

        return is_negative;
}

/**
 * atomic_long_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_acquire() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_long_add_negative_acquire(long i, atomic_long_t *v)
{
        bool is_negative;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_long_add_negative_acquire(i, v);

        return is_negative;
}

/**
 * atomic_long_add_negative_release() - atomic add and test if negative with release ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_release() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_long_add_negative_release(long i, atomic_long_t *v)
{
        bool is_negative;

        /* Release ordering: KCSAN release first, then report the RMW on @v. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_long_add_negative_release(i, v);

        return is_negative;
}

/**
 * atomic_long_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_relaxed() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
{
        bool is_negative;

        /* No KCSAN barrier annotation here; only report the RMW on @v. */
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_long_add_negative_relaxed(i, v);

        return is_negative;
}

/**
 * atomic_long_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_long_t
 * @a: long value to add
 * @u: long value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_unless() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
{
        long prev;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_add_unless(v, a, u);

        return prev;
}

/**
 * atomic_long_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_long_t
 * @a: long value to add
 * @u: long value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_unless() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_long_add_unless(atomic_long_t *v, long a, long u)
{
        bool updated;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_add_unless(v, a, u);

        return updated;
}

/**
 * atomic_long_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_inc_not_zero() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_long_inc_not_zero(atomic_long_t *v)
{
        bool updated;

        /* Full ordering: KCSAN barrier first, then report the RMW on @v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_inc_not_zero(v);

        return updated;
}

/**
 * atomic_long_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1), fully ordered, provided that
 * (@v >= 0). When @v is negative nothing is written and only relaxed
 * ordering is provided.
 *
 * This is the instrumented variant and is unsafe in noinstr code; use
 * raw_atomic_long_inc_unless_negative() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_long_inc_unless_negative(atomic_long_t *v)
{
        bool updated;

        /* KCSAN barrier annotation first, then report the access to *v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));

        updated = raw_atomic_long_inc_unless_negative(v);

        return updated;
}

/**
 * atomic_long_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1), fully ordered, provided that
 * (@v <= 0). When @v is positive nothing is written and only relaxed
 * ordering is provided.
 *
 * This is the instrumented variant and is unsafe in noinstr code; use
 * raw_atomic_long_dec_unless_positive() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_long_dec_unless_positive(atomic_long_t *v)
{
        bool updated;

        /* KCSAN barrier annotation first, then report the access to *v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));

        updated = raw_atomic_long_dec_unless_positive(v);

        return updated;
}

/**
 * atomic_long_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1), fully ordered, provided that
 * (@v > 0). When @v is zero or negative nothing is written and only relaxed
 * ordering is provided.
 *
 * This is the instrumented variant and is unsafe in noinstr code; use
 * raw_atomic_long_dec_if_positive() there.
 *
 * Return: The old value of (@v - 1), regardless of whether @v was updated.
 */
static __always_inline long
atomic_long_dec_if_positive(atomic_long_t *v)
{
        long ret;

        /* KCSAN barrier annotation first, then report the access to *v. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));

        ret = raw_atomic_long_dec_if_positive(v);

        return ret;
}

/*
 * Instrumented xchg(): evaluates @ptr once, issues kcsan_mb(), reports an
 * atomic read-write of *@ptr to the instrumentation hooks, and evaluates to
 * the result of raw_xchg() on that pointer with the remaining arguments.
 */
#define xchg(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_xchg(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented xchg_acquire(): evaluates @ptr once, reports an atomic
 * read-write of *@ptr, and evaluates to the result of raw_xchg_acquire().
 * No KCSAN barrier annotation is issued by this wrapper.
 */
#define xchg_acquire(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_xchg_acquire(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented xchg_release(): evaluates @ptr once, issues kcsan_release(),
 * reports an atomic read-write of *@ptr, and evaluates to the result of
 * raw_xchg_release().
 */
#define xchg_release(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_xchg_release(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented xchg_relaxed(): evaluates @ptr once, reports an atomic
 * read-write of *@ptr, and evaluates to the result of raw_xchg_relaxed().
 * No KCSAN barrier annotation is issued by this wrapper.
 */
#define xchg_relaxed(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_xchg_relaxed(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented cmpxchg(): evaluates @ptr once, issues kcsan_mb(), reports an
 * atomic read-write of *@ptr, and evaluates to the result of raw_cmpxchg()
 * on that pointer with the remaining arguments.
 */
#define cmpxchg(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented cmpxchg_acquire(): evaluates @ptr once, reports an atomic
 * read-write of *@ptr, and evaluates to the result of raw_cmpxchg_acquire().
 * No KCSAN barrier annotation is issued by this wrapper.
 */
#define cmpxchg_acquire(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg_acquire(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented cmpxchg_release(): evaluates @ptr once, issues
 * kcsan_release(), reports an atomic read-write of *@ptr, and evaluates to
 * the result of raw_cmpxchg_release().
 */
#define cmpxchg_release(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg_release(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented cmpxchg_relaxed(): evaluates @ptr once, reports an atomic
 * read-write of *@ptr, and evaluates to the result of raw_cmpxchg_relaxed().
 * No KCSAN barrier annotation is issued by this wrapper.
 */
#define cmpxchg_relaxed(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg_relaxed(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented cmpxchg64(): evaluates @ptr once, issues kcsan_mb(), reports
 * an atomic read-write of *@ptr, and evaluates to the result of
 * raw_cmpxchg64() on that pointer with the remaining arguments.
 */
#define cmpxchg64(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented cmpxchg64_acquire(): evaluates @ptr once, reports an atomic
 * read-write of *@ptr, and evaluates to the result of
 * raw_cmpxchg64_acquire(). No KCSAN barrier annotation is issued here.
 */
#define cmpxchg64_acquire(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64_acquire(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented cmpxchg64_release(): evaluates @ptr once, issues
 * kcsan_release(), reports an atomic read-write of *@ptr, and evaluates to
 * the result of raw_cmpxchg64_release().
 */
#define cmpxchg64_release(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64_release(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented cmpxchg64_relaxed(): evaluates @ptr once, reports an atomic
 * read-write of *@ptr, and evaluates to the result of
 * raw_cmpxchg64_relaxed(). No KCSAN barrier annotation is issued here.
 */
#define cmpxchg64_relaxed(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64_relaxed(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented cmpxchg128(): evaluates @ptr once, issues kcsan_mb(), reports
 * an atomic read-write of *@ptr, and evaluates to the result of
 * raw_cmpxchg128() on that pointer with the remaining arguments.
 */
#define cmpxchg128(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented cmpxchg128_acquire(): evaluates @ptr once, reports an atomic
 * read-write of *@ptr, and evaluates to the result of
 * raw_cmpxchg128_acquire(). No KCSAN barrier annotation is issued here.
 */
#define cmpxchg128_acquire(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128_acquire(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented cmpxchg128_release(): evaluates @ptr once, issues
 * kcsan_release(), reports an atomic read-write of *@ptr, and evaluates to
 * the result of raw_cmpxchg128_release().
 */
#define cmpxchg128_release(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128_release(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented cmpxchg128_relaxed(): evaluates @ptr once, reports an atomic
 * read-write of *@ptr, and evaluates to the result of
 * raw_cmpxchg128_relaxed(). No KCSAN barrier annotation is issued here.
 */
#define cmpxchg128_relaxed(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128_relaxed(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented try_cmpxchg(): evaluates @ptr and @oldp once each (in that
 * order), issues kcsan_mb(), reports an atomic read-write of *@ptr and, via
 * instrument_read_write(), a read-write of *@oldp, then evaluates to the
 * result of raw_try_cmpxchg().
 */
#define try_cmpxchg(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * Instrumented try_cmpxchg_acquire(): evaluates @ptr and @oldp once each,
 * reports an atomic read-write of *@ptr and, via instrument_read_write(), a
 * read-write of *@oldp, then evaluates to the result of
 * raw_try_cmpxchg_acquire(). No KCSAN barrier annotation is issued here.
 */
#define try_cmpxchg_acquire(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * Instrumented try_cmpxchg_release(): evaluates @ptr and @oldp once each,
 * issues kcsan_release(), reports an atomic read-write of *@ptr and, via
 * instrument_read_write(), a read-write of *@oldp, then evaluates to the
 * result of raw_try_cmpxchg_release().
 */
#define try_cmpxchg_release(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * Instrumented try_cmpxchg_relaxed(): evaluates @ptr and @oldp once each,
 * reports an atomic read-write of *@ptr and, via instrument_read_write(), a
 * read-write of *@oldp, then evaluates to the result of
 * raw_try_cmpxchg_relaxed(). No KCSAN barrier annotation is issued here.
 */
#define try_cmpxchg_relaxed(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * Instrumented try_cmpxchg64(): evaluates @ptr and @oldp once each, issues
 * kcsan_mb(), reports an atomic read-write of *@ptr and, via
 * instrument_read_write(), a read-write of *@oldp, then evaluates to the
 * result of raw_try_cmpxchg64().
 */
#define try_cmpxchg64(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * Instrumented try_cmpxchg64_acquire(): evaluates @ptr and @oldp once each,
 * reports an atomic read-write of *@ptr and, via instrument_read_write(), a
 * read-write of *@oldp, then evaluates to the result of
 * raw_try_cmpxchg64_acquire(). No KCSAN barrier annotation is issued here.
 */
#define try_cmpxchg64_acquire(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * Instrumented try_cmpxchg64_release(): evaluates @ptr and @oldp once each,
 * issues kcsan_release(), reports an atomic read-write of *@ptr and, via
 * instrument_read_write(), a read-write of *@oldp, then evaluates to the
 * result of raw_try_cmpxchg64_release().
 */
#define try_cmpxchg64_release(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * Instrumented try_cmpxchg64_relaxed(): evaluates @ptr and @oldp once each,
 * reports an atomic read-write of *@ptr and, via instrument_read_write(), a
 * read-write of *@oldp, then evaluates to the result of
 * raw_try_cmpxchg64_relaxed(). No KCSAN barrier annotation is issued here.
 */
#define try_cmpxchg64_relaxed(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * Instrumented try_cmpxchg128(): evaluates @ptr and @oldp once each, issues
 * kcsan_mb(), reports an atomic read-write of *@ptr and, via
 * instrument_read_write(), a read-write of *@oldp, then evaluates to the
 * result of raw_try_cmpxchg128().
 */
#define try_cmpxchg128(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * Instrumented try_cmpxchg128_acquire(): evaluates @ptr and @oldp once each,
 * reports an atomic read-write of *@ptr and, via instrument_read_write(), a
 * read-write of *@oldp, then evaluates to the result of
 * raw_try_cmpxchg128_acquire(). No KCSAN barrier annotation is issued here.
 */
#define try_cmpxchg128_acquire(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * Instrumented try_cmpxchg128_release(): evaluates @ptr and @oldp once each,
 * issues kcsan_release(), reports an atomic read-write of *@ptr and, via
 * instrument_read_write(), a read-write of *@oldp, then evaluates to the
 * result of raw_try_cmpxchg128_release().
 */
#define try_cmpxchg128_release(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * Instrumented try_cmpxchg128_relaxed(): evaluates @ptr and @oldp once each,
 * reports an atomic read-write of *@ptr and, via instrument_read_write(), a
 * read-write of *@oldp, then evaluates to the result of
 * raw_try_cmpxchg128_relaxed(). No KCSAN barrier annotation is issued here.
 */
#define try_cmpxchg128_relaxed(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * Instrumented cmpxchg_local(): evaluates @ptr once, reports an atomic
 * read-write of *@ptr, and evaluates to the result of raw_cmpxchg_local().
 * No KCSAN barrier annotation is issued by this wrapper.
 */
#define cmpxchg_local(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg_local(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented cmpxchg64_local(): evaluates @ptr once, reports an atomic
 * read-write of *@ptr, and evaluates to the result of raw_cmpxchg64_local().
 * No KCSAN barrier annotation is issued by this wrapper.
 */
#define cmpxchg64_local(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64_local(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented cmpxchg128_local(): evaluates @ptr once, reports an atomic
 * read-write of *@ptr, and evaluates to the result of
 * raw_cmpxchg128_local(). No KCSAN barrier annotation is issued here.
 */
#define cmpxchg128_local(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128_local(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented sync_cmpxchg(): evaluates @ptr once, issues kcsan_mb(),
 * reports an atomic read-write of *@ptr, and evaluates to the result of
 * raw_sync_cmpxchg() on that pointer with the remaining arguments.
 */
#define sync_cmpxchg(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_sync_cmpxchg(__ai_ptr, __VA_ARGS__); \
})

/*
 * Instrumented try_cmpxchg_local(): evaluates @ptr and @oldp once each,
 * reports an atomic read-write of *@ptr and, via instrument_read_write(), a
 * read-write of *@oldp, then evaluates to the result of
 * raw_try_cmpxchg_local(). No KCSAN barrier annotation is issued here.
 */
#define try_cmpxchg_local(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * Instrumented try_cmpxchg64_local(): evaluates @ptr and @oldp once each,
 * reports an atomic read-write of *@ptr and, via instrument_read_write(), a
 * read-write of *@oldp, then evaluates to the result of
 * raw_try_cmpxchg64_local(). No KCSAN barrier annotation is issued here.
 */
#define try_cmpxchg64_local(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * Instrumented try_cmpxchg128_local(): evaluates @ptr and @oldp once each,
 * reports an atomic read-write of *@ptr and, via instrument_read_write(), a
 * read-write of *@oldp, then evaluates to the result of
 * raw_try_cmpxchg128_local(). No KCSAN barrier annotation is issued here.
 */
#define try_cmpxchg128_local(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * Instrumented sync_try_cmpxchg(): evaluates @ptr once, issues kcsan_mb(),
 * reports an atomic read-write of *@ptr, and evaluates to the result of
 * raw_sync_try_cmpxchg(). Only @ptr is named; everything else is forwarded
 * through __VA_ARGS__ as-is, so this wrapper makes no instrument_read_write()
 * call for any argument passed there.
 */
#define sync_try_cmpxchg(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_sync_try_cmpxchg(__ai_ptr, __VA_ARGS__); \
})


#endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
// 8829b337928e9508259079d32581775ececd415b



























    2 
    1 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _DELAYED_CALL_H
#define _DELAYED_CALL_H

/*
 * Poor man's closures; I wish we could've done them sanely polymorphic,
 * but...
 */

/* A deferred callback: a function pointer plus the argument to hand to it. */
struct delayed_call {
        /* Callback to run; do_delayed_call() does nothing while this is NULL. */
        void (*fn)(void *);
        /* Opaque argument passed to fn when it is invoked. */
        void *arg;
};

/* Define a struct delayed_call variable @name with no callback armed. */
#define DEFINE_DELAYED_CALL(name) \
        struct delayed_call name = { .fn = NULL, .arg = NULL }

/* I really wish we had closures with sane typechecking... */
static inline void set_delayed_call(struct delayed_call *call,
                void (*fn)(void *), void *arg)
{
        call->fn = fn;
        call->arg = arg;
}

/* Run the callback stored in @call, if one is set; otherwise do nothing. */
static inline void do_delayed_call(struct delayed_call *call)
{
        if (!call->fn)
                return;

        call->fn(call->arg);
}

/*
 * Disarm @call by resetting its callback pointer to NULL, which makes a
 * subsequent do_delayed_call() a no-op. The stored argument is left as is.
 */
static inline void clear_delayed_call(struct delayed_call *call)
{
        call->fn = NULL;
}
#endif






































































































    2 

    3 

















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _NET_RPS_H
#define _NET_RPS_H

#include <linux/types.h>
#include <linux/static_key.h>
#include <net/sock.h>
#include <net/hotdata.h>

#ifdef CONFIG_RPS

extern struct static_key_false rps_needed;
extern struct static_key_false rfs_needed;

/*
 * This structure holds an RPS map which can be of variable length.  The
 * map is an array of CPUs, stored in the trailing flexible array @cpus.
 *
 * NOTE(review): @len is presumably the number of valid entries in @cpus and
 * @rcu is presumably used to defer freeing the map; neither is shown in
 * this header, so confirm against the users.
 */
struct rps_map {
        unsigned int        len;
        struct rcu_head        rcu;
        u16                cpus[];
};
/*
 * Size in bytes of a struct rps_map followed by _num u16 entries. No
 * overflow checking is performed on the multiplication or the sum.
 */
#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16)))

/*
 * The rps_dev_flow structure contains the mapping of a flow to a CPU
 * (@cpu), the tail pointer for that CPU's input queue at the time of last
 * enqueue (@last_qtail), and a hardware filter index (@filter).
 */
struct rps_dev_flow {
        u16                cpu;
        u16                filter;
        unsigned int        last_qtail;
};
/*
 * NOTE(review): presumably the rps_dev_flow.filter value meaning "no
 * hardware filter assigned" (all ones in a u16) — confirm against users.
 */
#define RPS_NO_FILTER 0xffff

/*
 * The rps_dev_flow_table structure contains a table of flow mappings,
 * stored in the trailing flexible array @flows.
 *
 * NOTE(review): @mask is presumably (number of entries - 1), used to reduce
 * a flow hash to an index into @flows — not shown here, confirm against
 * users.
 */
struct rps_dev_flow_table {
        unsigned int                mask;
        struct rcu_head                rcu;
        struct rps_dev_flow        flows[];
};
/*
 * Size in bytes of a struct rps_dev_flow_table followed by _num
 * struct rps_dev_flow entries. No overflow checking is performed.
 */
#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
    ((_num) * sizeof(struct rps_dev_flow)))

/*
 * The rps_sock_flow_table contains mappings of flows to the last CPU
 * on which they were processed by the application (set in recvmsg).
 * Each entry is a 32bit value. Upper part is the high-order bits
 * of flow hash, lower part is CPU number.
 * rps_cpu_mask is used to partition the space, depending on number of
 * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
 * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f,
 * meaning we use 32-6=26 bits for the hash.
 *
 * @mask is ANDed with the flow hash to pick the slot in @ents (see
 * rps_record_sock_flow()).
 */
struct rps_sock_flow_table {
        u32        mask;

        u32        ents[] ____cacheline_aligned_in_smp;
};
/*
 * Size in bytes of a struct rps_sock_flow_table holding _num entries,
 * computed as the offset of ents[_num] so that any padding placed before
 * the ents[] array is included.
 */
#define        RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))

/*
 * NOTE(review): presumably the CPU value meaning "no CPU recorded" — its
 * users are not visible in this header, confirm against callers.
 */
#define RPS_NO_CPU 0xffff

/*
 * Record in @table that the flow identified by @hash was last handled on
 * the CPU this code is currently running on.
 *
 * The slot is selected by (@hash & table->mask). The stored value keeps the
 * hash bits outside net_hotdata.rps_cpu_mask and ORs in the CPU number.
 */
static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
                                        u32 hash)
{
        u32 *slot = &table->ents[hash & table->mask];
        u32 entry;

        /* This is only a hint: preemption may move us to another CPU. */
        entry = (hash & ~net_hotdata.rps_cpu_mask) | raw_smp_processor_id();

        /* Skip the store when the slot already holds this value.
         * The WRITE_ONCE() pairs with the READ_ONCE() right here and with
         * another one in get_rps_cpu().
         */
        if (READ_ONCE(*slot) != entry)
                WRITE_ONCE(*slot, entry);
}

#endif /* CONFIG_RPS */

/*
 * Record @hash in the global socket flow table, if there is one.
 *
 * A zero @hash is ignored. The table pointer is looked up and used inside
 * an RCU read-side critical section. Compiles to an empty function when
 * CONFIG_RPS is not set.
 */
static inline void sock_rps_record_flow_hash(__u32 hash)
{
#ifdef CONFIG_RPS
        struct rps_sock_flow_table *tbl;

        if (hash) {
                rcu_read_lock();
                tbl = rcu_dereference(net_hotdata.rps_sock_flow_table);
                if (tbl)
                        rps_record_sock_flow(tbl, hash);
                rcu_read_unlock();
        }
#endif
}

/*
 * Record the receive hash of @sk in the socket flow table, but only when
 * the rfs_needed static key is enabled and the socket is in
 * TCP_ESTABLISHED state. Compiles to an empty function when CONFIG_RPS is
 * not set.
 */
static inline void sock_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
        if (!static_branch_unlikely(&rfs_needed))
                return;

        /* Reading sk->sk_rxhash might incur an expensive cache line miss,
         * so filter on the socket state first.
         *
         * TCP_ESTABLISHED covers almost all states where RFS might be
         * useful, and is cheaper [1] than testing:
         *        IPv4: inet_sk(sk)->inet_daddr
         *        IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
         * or an additional socket flag.
         * [1]: sk_state and sk_prot are in the same cache line.
         */
        if (sk->sk_state != TCP_ESTABLISHED)
                return;

        /* This READ_ONCE() is paired with the WRITE_ONCE() from
         * sock_rps_save_rxhash() and sock_rps_reset_rxhash().
         */
        sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash));
#endif
}

/*
 * Pre-increment sd->input_queue_tail and return the new value.
 * Without CONFIG_RPS nothing is touched and 0 is returned.
 */
static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd)
{
        u32 tail = 0;

#ifdef CONFIG_RPS
        tail = ++sd->input_queue_tail;
#endif
        return tail;
}

/*
 * Store @tail into *@dest using WRITE_ONCE(). Does nothing when CONFIG_RPS
 * is not set.
 */
static inline void rps_input_queue_tail_save(u32 *dest, u32 tail)
{
#ifdef CONFIG_RPS
        WRITE_ONCE(*dest, tail);
#endif
}

/*
 * Advance sd->input_queue_head by @val. The new value is published with
 * WRITE_ONCE(); the read of the current value is a plain load. Does
 * nothing when CONFIG_RPS is not set.
 */
static inline void rps_input_queue_head_add(struct softnet_data *sd, int val)
{
#ifdef CONFIG_RPS
        WRITE_ONCE(sd->input_queue_head, sd->input_queue_head + val);
#endif
}

/* Advance sd->input_queue_head by one; see rps_input_queue_head_add(). */
static inline void rps_input_queue_head_incr(struct softnet_data *sd)
{
        rps_input_queue_head_add(sd, 1);
}

#endif /* _NET_RPS_H */

































































































































































































   22 
   25 













   22 




   22 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 */
#include <linux/sched/debug.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <linux/kdebug.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/kexec.h>
#include <linux/sysfs.h>
#include <linux/bug.h>
#include <linux/nmi.h>

#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>

/*
 * Printable names of the exception stacks, indexed by ESTACK_* constant.
 * stack_type_name() looks entries up with (type - STACK_TYPE_EXCEPTION).
 */
static const char * const exception_stack_names[] = {
                [ ESTACK_DF        ]        = "#DF",
                [ ESTACK_NMI        ]        = "NMI",
                [ ESTACK_DB        ]        = "#DB",
                [ ESTACK_MCE        ]        = "#MC",
                [ ESTACK_VC        ]        = "#VC",
                [ ESTACK_VC2        ]        = "#VC2",
};

/*
 * Map a stack type to a printable name.
 *
 * Returns a string literal for the task, IRQ, softirq and entry stacks, the
 * matching exception_stack_names[] entry for exception stack types, and
 * NULL for anything else.
 */
const char *stack_type_name(enum stack_type type)
{
        /* exception_stack_names[] must be revisited if this count changes. */
        BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);

        switch (type) {
        case STACK_TYPE_TASK:
                return "TASK";
        case STACK_TYPE_IRQ:
                return "IRQ";
        case STACK_TYPE_SOFTIRQ:
                return "SOFTIRQ";
        case STACK_TYPE_ENTRY:
                /*
                 * On 64-bit there is one generic entry stack shared by all
                 * kernel entry points, SYSENTER included.
                 */
                return "ENTRY_TRAMPOLINE";
        default:
                break;
        }

        if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
                return exception_stack_names[type - STACK_TYPE_EXCEPTION];

        return NULL;
}

/**
 * struct estack_pages - Page descriptor for exception stacks
 * @offs:        Offset from the start of the exception stack area
 * @size:        Size of the exception stack; zero for a page that belongs to
 *                no stack (in_exception_stack() treats it as a guard page)
 * @type:        Type to store in the stack_info struct
 */
struct estack_pages {
        u32        offs;
        u16        size;
        u16        type;
};

/*
 * Designated range initializer for the estack_pages[] array: fills every
 * page index from the first to the last page covered by exception stack
 * @st with that stack's offset, size and stack type.
 */
#define EPAGERANGE(st)                                                        \
        [PFN_DOWN(CEA_ESTACK_OFFS(st)) ...                                \
         PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = {        \
                .offs        = CEA_ESTACK_OFFS(st),                                \
                .size        = CEA_ESTACK_SIZE(st),                                \
                .type        = STACK_TYPE_EXCEPTION + ESTACK_ ##st, }

/*
 * Array of exception stack page descriptors. If the stack is larger than
 * PAGE_SIZE, all pages covering a particular stack will have the same
 * info. The guard pages including the not mapped DB2 stack are zeroed
 * out.
 *
 * The array is indexed by page number relative to the start of the per-CPU
 * exception stack area (see the lookup in in_exception_stack()).
 */
static const
struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = {
        EPAGERANGE(DF),
        EPAGERANGE(NMI),
        EPAGERANGE(DB),
        EPAGERANGE(MCE),
        EPAGERANGE(VC),
        EPAGERANGE(VC2),
};

/*
 * Check whether @stack points into one of this CPU's exception stacks.
 *
 * On a hit, @info is filled with the stack's type, its begin/end bounds and
 * the next stack pointer, which is read from the sp field of the pt_regs
 * located at the very top of that stack, and true is returned. Returns
 * false (leaving @info untouched) if the exception stack area is not set up
 * yet, @stack is outside it, or @stack lies on a guard page.
 */
static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info)
{
        unsigned long addr = (unsigned long)stack;
        unsigned long area_start, area_end, stk_begin, stk_end;
        const struct estack_pages *desc;
        struct pt_regs *regs;

        BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);

        /*
         * A stack trace may be collected before cea_exception_stacks has
         * been initialized; there is nothing to match against in that case.
         */
        area_start = (unsigned long)__this_cpu_read(cea_exception_stacks);
        if (!area_start)
                return false;

        /* Reject addresses outside the exception stack area. */
        area_end = area_start + sizeof(struct cea_exception_stacks);
        if (addr < area_start || addr >= area_end)
                return false;

        /* Descriptor of the page @stack falls into, by page offset. */
        desc = &estack_pages[(addr - area_start) >> PAGE_SHIFT];

        /* A zero size marks a guard page. */
        if (!desc->size)
                return false;

        stk_begin = area_start + (unsigned long)desc->offs;
        stk_end = stk_begin + (unsigned long)desc->size;
        regs = (struct pt_regs *)stk_end - 1;

        info->type        = desc->type;
        info->begin        = (unsigned long *)stk_begin;
        info->end        = (unsigned long *)stk_end;
        info->next_sp        = (unsigned long *)regs->sp;

        return true;
}

/*
 * Check whether @stack points into this CPU's hard IRQ stack.
 *
 * On a hit, @info is filled with STACK_TYPE_IRQ, the stack bounds and the
 * next stack pointer taken from the top-most stack slot, and true is
 * returned. Otherwise @info is left untouched and false is returned.
 */
static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{
        /*
         * hardirq_stack_ptr points directly at the top-most stack entry to
         * avoid a -8 adjustment in the stack switch hotpath, so the real end
         * of the stack is one slot above it.
         */
        unsigned long *top = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr);
        unsigned long *end = top + 1;
        unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long));

        /*
         * RSP can never equal @end: the stack switch finishes with
         * 'popq %rsp', after which RSP is back on the original stack.
         */
        if (stack < begin || stack >= end)
                return false;

        info->type        = STACK_TYPE_IRQ;
        info->begin        = begin;
        info->end        = end;

        /*
         * The previous stack pointer is saved in the top-most slot before
         * switching to the irq stack; real stack entries are all below it.
         */
        info->next_sp = (unsigned long *)*top;

        return true;
}

/*
 * Work out which stack @stack belongs to and describe it in @info.
 *
 * The task stack of @task is tried first. The exception, IRQ and entry
 * stacks are only tried, in that order, when @task is the current task.
 * Returns true when one of the checks matched, false otherwise.
 */
bool noinstr get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
                                    struct stack_info *info)
{
        if (in_task_stack(stack, task, info))
                return true;

        /* The remaining stacks are only checked for the current task. */
        if (task != current)
                return false;

        return in_exception_stack(stack, info) ||
               in_irq_stack(stack, info) ||
               in_entry_stack(stack, info);
}

/*
 * Identify the stack that @stack points into and fill in @info.
 *
 * @task defaults to current when NULL. If @visit_mask is given, it holds
 * one bit per stack type already visited; seeing the same type twice is
 * treated as a failure.
 *
 * Returns 0 on success. On failure info->type is set to STACK_TYPE_UNKNOWN
 * and -EINVAL is returned.
 */
int get_stack_info(unsigned long *stack, struct task_struct *task,
                   struct stack_info *info, unsigned long *visit_mask)
{
        unsigned long type_bit;

        if (!task)
                task = current;

        if (!stack || !get_stack_info_noinstr(stack, task, info))
                goto unknown;

        /*
         * No stack may be walked more than once. Hitting the same stack type
         * a second time means something is wrong, so give up and report an
         * unknown stack type.
         */
        if (visit_mask) {
                type_bit = 1UL << info->type;

                if (*visit_mask & type_bit) {
                        if (task == current)
                                printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
                        goto unknown;
                }
                *visit_mask |= type_bit;
        }

        return 0;

unknown:
        info->type = STACK_TYPE_UNKNOWN;
        return -EINVAL;
}




























































































































































    3 







    3 
    4 







    4 


    4 














































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Ethernet-type device handling.
 *
 * Version:        @(#)eth.c        1.0.7        05/25/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Mark Evans, <evansmp@uhura.aston.ac.uk>
 *                Florian  La Roche, <rzsfl@rz.uni-sb.de>
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *                Mr Linux        : Arp problems
 *                Alan Cox        : Generic queue tidyup (very tiny here)
 *                Alan Cox        : eth_header ntohs should be htons
 *                Alan Cox        : eth_rebuild_header missing an htons and
 *                                  minor other things.
 *                Tegge                : Arp bug fixes.
 *                Florian                : Removed many unnecessary functions, code cleanup
 *                                  and changes for new arp and skbuff.
 *                Alan Cox        : Redid header building to reflect new format.
 *                Alan Cox        : ARP only when compiled with CONFIG_INET
 *                Greg Page        : 802.2 and SNAP stuff.
 *                Alan Cox        : MAC layer pointers/new format.
 *                Paul Gortmaker        : eth_copy_and_sum shouldn't csum padding.
 *                Alan Cox        : Protect against forwarding explosions with
 *                                  older network drivers and IFF_ALLMULTI.
 *        Christer Weinigel        : Better rebuild header message.
 *             Andrew Morton    : 26Feb01: kill ether_setup() - use netdev_boot_setup().
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/nvmem-consumer.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <linux/of_net.h>
#include <linux/pci.h>
#include <linux/property.h>
#include <net/dst.h>
#include <net/arp.h>
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/ip.h>
#include <net/dsa.h>
#include <net/flow_dissector.h>
#include <net/gro.h>
#include <linux/uaccess.h>
#include <net/pkt_sched.h>

/**
 * eth_header - create the Ethernet header
 * @skb:        buffer to alter
 * @dev:        source device
 * @type:        Ethernet type field
 * @daddr: destination address (NULL: leave it to be resolved, see below)
 * @saddr: source address (NULL: use the device address)
 * @len:   packet length (<= skb->len)
 *
 * Pushes ETH_HLEN bytes onto @skb and fills in the header. The protocol
 * field is set to @type, except for ETH_P_802_3 and ETH_P_802_2 where @len
 * is stored instead.
 *
 * Return: ETH_HLEN when the destination address was filled in (from @daddr,
 * or zeroed for IFF_LOOPBACK / IFF_NOARP devices), -ETH_HLEN when it was
 * left unset.
 */
int eth_header(struct sk_buff *skb, struct net_device *dev,
               unsigned short type,
               const void *daddr, const void *saddr, unsigned int len)
{
        struct ethhdr *eth = skb_push(skb, ETH_HLEN);

        if (type == ETH_P_802_3 || type == ETH_P_802_2)
                eth->h_proto = htons(len);
        else
                eth->h_proto = htons(type);

        /* Source hardware address: caller supplied, else the device's own. */
        memcpy(eth->h_source, saddr ? saddr : dev->dev_addr, ETH_ALEN);

        if (daddr)
                memcpy(eth->h_dest, daddr, ETH_ALEN);
        else if (dev->flags & (IFF_LOOPBACK | IFF_NOARP))
                /* Anyway, the loopback-device should never use this function... */
                eth_zero_addr(eth->h_dest);
        else
                return -ETH_HLEN;

        return ETH_HLEN;
}
EXPORT_SYMBOL(eth_header);

/**
 * eth_get_headlen - determine the length of header for an ethernet frame
 * @dev: pointer to network device
 * @data: pointer to start of frame
 * @len: total length of frame
 *
 * Best-effort computation of the combined length of all headers of a frame
 * held in a linear buffer.
 *
 * Return: @len if the frame is shorter than an Ethernet header.  If flow
 * dissection fails, the larger of the dissected transport offset and the
 * Ethernet header size.  Otherwise the payload offset, capped at @len.
 */
u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len)
{
        const struct ethhdr *hdr = data;
        struct flow_keys_basic keys;
        u32 poff;

        /* this should never happen, but better safe than sorry */
        if (unlikely(len < sizeof(*hdr)))
                return len;

        /* parse any remaining L2/L3 headers, check for L4 */
        if (skb_flow_dissect_flow_keys_basic(dev_net(dev), NULL, &keys, data,
                                             hdr->h_proto, sizeof(*hdr), len,
                                             FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) {
                /* dissection succeeded: account for any L4 headers too */
                poff = __skb_get_poff(NULL, data, &keys, len);
                return min_t(u32, poff, len);
        }

        return max_t(u32, keys.control.thoff, sizeof(*hdr));
}
EXPORT_SYMBOL(eth_get_headlen);

/**
 * eth_type_trans - determine the packet's protocol ID.
 * @skb: received socket data
 * @dev: receiving network device
 *
 * Records @dev in the skb, marks the MAC header, pulls the Ethernet header
 * and classifies the packet type.  A type field short enough to be a length
 * is treated as 802.3; this is normal practice and works for any
 * 'now in use' protocol.
 *
 * Return: the protocol ID in network byte order.
 */
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
{
        const struct ethhdr *hdr;
        const unsigned short *llc;
        unsigned short llc_buf;

        skb->dev = dev;
        skb_reset_mac_header(skb);

        hdr = eth_skb_pull_mac(skb);
        eth_skb_pkt_type(skb, dev);

        /* Some DSA tagging variants have no ethertype field at all, so when
         * one is configured on the receiving interface the protocol is set
         * without looking at the packet.
         */
        if (unlikely(netdev_uses_dsa(dev)))
                return htons(ETH_P_XDSA);

        if (likely(eth_proto_is_802_3(hdr->h_proto)))
                return hdr->h_proto;

        /* Magic hack to spot IPX packets: older Novell breaks the protocol
         * design and runs IPX over 802.3 without an 802.2 LLC layer.  FFFF
         * is not a used 802.2 SSAP/DSAP, so it identifies those frames.
         * This won't work for fault tolerant netware but does for the rest.
         */
        llc = skb_header_pointer(skb, 0, sizeof(*llc), &llc_buf);
        if (!llc || *llc != 0xFFFF)
                return htons(ETH_P_802_2);      /* real 802.2 LLC */

        return htons(ETH_P_802_3);
}
EXPORT_SYMBOL(eth_type_trans);

/**
 * eth_header_parse - extract hardware address from packet
 * @skb: packet to extract header from
 * @haddr: destination buffer
 */
int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
        const struct ethhdr *eth = eth_hdr(skb);
        memcpy(haddr, eth->h_source, ETH_ALEN);
        return ETH_ALEN;
}
EXPORT_SYMBOL(eth_header_parse);

/**
 * eth_header_cache - fill cache entry from neighbour
 * @neigh: source neighbour
 * @hh: destination cache entry
 * @type: Ethernet type field
 *
 * Create an Ethernet header template from the neighbour: destination is the
 * neighbour's hardware address, source is the device address.
 *
 * Return: 0 on success, -1 for ETH_P_802_3, which is never cached.
 */
int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type)
{
        const struct net_device *dev = neigh->dev;
        struct ethhdr *eth;

        if (type == htons(ETH_P_802_3))
                return -1;

        eth = (struct ethhdr *)((u8 *)hh->hh_data + HH_DATA_OFF(sizeof(*eth)));

        memcpy(eth->h_dest, neigh->ha, ETH_ALEN);
        memcpy(eth->h_source, dev->dev_addr, ETH_ALEN);
        eth->h_proto = type;

        /* Publish the length only after the template is fully written.
         * Pairs with READ_ONCE() in neigh_resolve_output(),
         * neigh_hh_output() and neigh_update_hhs().
         */
        smp_store_release(&hh->hh_len, ETH_HLEN);

        return 0;
}
EXPORT_SYMBOL(eth_header_cache);

/**
 * eth_header_cache_update - update cache entry
 * @hh: destination cache entry
 * @dev: network device
 * @haddr: new hardware address
 *
 * Called by Address Resolution module to notify changes in address.
 */
void eth_header_cache_update(struct hh_cache *hh,
                             const struct net_device *dev,
                             const unsigned char *haddr)
{
        memcpy(((u8 *) hh->hh_data) + HH_DATA_OFF(sizeof(struct ethhdr)),
               haddr, ETH_ALEN);
}
EXPORT_SYMBOL(eth_header_cache_update);

/**
 * eth_header_parse_protocol - extract protocol from L2 header
 * @skb: packet to extract protocol from
 */
__be16 eth_header_parse_protocol(const struct sk_buff *skb)
{
        const struct ethhdr *eth = eth_hdr(skb);

        return eth->h_proto;
}
EXPORT_SYMBOL(eth_header_parse_protocol);

/**
 * eth_prepare_mac_addr_change - prepare for mac change
 * @dev: network device
 * @p: socket address (a struct sockaddr carrying the new address)
 *
 * Validation only; nothing is modified.
 *
 * Return: 0 if the change may proceed, -EBUSY if the device is running and
 * does not allow live address changes, -EADDRNOTAVAIL if the requested
 * address is not a valid Ethernet address.
 */
int eth_prepare_mac_addr_change(struct net_device *dev, void *p)
{
        struct sockaddr *sa = p;

        if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
                return -EBUSY;

        return is_valid_ether_addr(sa->sa_data) ? 0 : -EADDRNOTAVAIL;
}
EXPORT_SYMBOL(eth_prepare_mac_addr_change);

/**
 * eth_commit_mac_addr_change - commit mac change
 * @dev: network device
 * @p: socket address (a struct sockaddr carrying the new address)
 *
 * Installs the address from @p as the device hardware address.  No
 * validation is done here.
 */
void eth_commit_mac_addr_change(struct net_device *dev, void *p)
{
        eth_hw_addr_set(dev, ((struct sockaddr *)p)->sa_data);
}
EXPORT_SYMBOL(eth_commit_mac_addr_change);

/**
 * eth_mac_addr - set new Ethernet hardware address
 * @dev: network device
 * @p: socket address
 *
 * Change hardware address of device: validate with
 * eth_prepare_mac_addr_change(), then apply with
 * eth_commit_mac_addr_change().
 *
 * This doesn't change hardware matching, so needs to be overridden
 * for most real devices.
 *
 * Return: 0 on success or the negative error from the prepare step.
 */
int eth_mac_addr(struct net_device *dev, void *p)
{
        int err = eth_prepare_mac_addr_change(dev, p);

        if (err >= 0) {
                eth_commit_mac_addr_change(dev, p);
                err = 0;
        }

        return err;
}
EXPORT_SYMBOL(eth_mac_addr);

/**
 * eth_validate_addr - check the device's current hardware address
 * @dev: network device
 *
 * Return: 0 if dev->dev_addr is a valid Ethernet address,
 * -EADDRNOTAVAIL otherwise.
 */
int eth_validate_addr(struct net_device *dev)
{
        return is_valid_ether_addr(dev->dev_addr) ? 0 : -EADDRNOTAVAIL;
}
EXPORT_SYMBOL(eth_validate_addr);

/* Generic Ethernet link-layer header operations; installed on a device by
 * ether_setup().
 */
const struct header_ops eth_header_ops ____cacheline_aligned = {
        .create                = eth_header,
        .parse                = eth_header_parse,
        .cache                = eth_header_cache,
        .cache_update        = eth_header_cache_update,
        .parse_protocol        = eth_header_parse_protocol,
};

/**
 * ether_setup - setup Ethernet network device
 * @dev: network device
 *
 * Fill in the fields of the device structure with Ethernet-generic values:
 * header operations, link type, header and address lengths, MTU limits,
 * queue length, flags and the broadcast address.
 */
void ether_setup(struct net_device *dev)
{
        /* Link-layer identity and header handling */
        dev->type                = ARPHRD_ETHER;
        dev->header_ops          = &eth_header_ops;
        dev->hard_header_len     = ETH_HLEN;
        dev->min_header_len      = ETH_HLEN;
        dev->addr_len            = ETH_ALEN;

        /* MTU defaults and limits */
        dev->mtu                 = ETH_DATA_LEN;
        dev->min_mtu             = ETH_MIN_MTU;
        dev->max_mtu             = ETH_DATA_LEN;

        /* Queueing and flags; priv_flags is OR-ed so earlier bits survive */
        dev->tx_queue_len        = DEFAULT_TX_QUEUE_LEN;
        dev->flags               = IFF_BROADCAST | IFF_MULTICAST;
        dev->priv_flags         |= IFF_TX_SKB_SHARING;

        eth_broadcast_addr(dev->broadcast);
}
EXPORT_SYMBOL(ether_setup);

/**
 * alloc_etherdev_mqs - Allocates and sets up an Ethernet device
 * @sizeof_priv: Size of additional driver-private structure to be allocated
 *        for this Ethernet device
 * @txqs: The number of TX queues this device has.
 * @rxqs: The number of RX queues this device has.
 *
 * Fill in the fields of the device structure with Ethernet-generic
 * values. Basically does everything except registering the device.
 *
 * Constructs a new net device, complete with a private data area of
 * size (sizeof_priv).  A 32-byte (not bit) alignment is enforced for
 * this private data area.
 *
 * Return: whatever alloc_netdev_mqs() returns for an "eth%d" device
 * initialised by ether_setup().
 */
struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
                                      unsigned int rxqs)
{
        struct net_device *ndev;

        ndev = alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_ENUM,
                                ether_setup, txqs, rxqs);

        return ndev;
}
EXPORT_SYMBOL(alloc_etherdev_mqs);

/*
 * sysfs_format_mac - emit a hardware address into a sysfs buffer
 * @buf:  sysfs output buffer
 * @addr: address bytes to print
 * @len:  number of bytes in @addr
 *
 * Formats @len bytes of @addr with the "%*phC" specifier followed by a
 * newline.  Returns the value produced by sysfs_emit().
 */
ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
{
        return sysfs_emit(buf, "%*phC\n", len, addr);
}
EXPORT_SYMBOL(sysfs_format_mac);

/*
 * GRO receive handler for an Ethernet header found at the current GRO offset.
 *
 * Packets held on @head whose Ethernet header differs from @skb's are taken
 * out of the same-flow set.  The header is then pulled and the receive
 * callback registered for the header's protocol gets the packet.  If the
 * header cannot be obtained or no handler is registered for the protocol,
 * the packet is marked for flush.
 */
struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb)
{
        const struct packet_offload *ptype;
        unsigned int eth_off, needed;
        struct sk_buff *pp = NULL;
        struct ethhdr *eh, *held_eh;
        struct sk_buff *p;
        int flush = 1;

        eth_off = skb_gro_offset(skb);
        needed = eth_off + sizeof(*eh);
        eh = skb_gro_header(skb, needed, eth_off);
        if (unlikely(!eh))
                goto out;

        list_for_each_entry(p, head, list) {
                if (!NAPI_GRO_CB(p)->same_flow)
                        continue;

                held_eh = (struct ethhdr *)(p->data + eth_off);
                if (compare_ether_header(eh, held_eh))
                        NAPI_GRO_CB(p)->same_flow = 0;
        }

        ptype = gro_find_receive_by_type(eh->h_proto);
        if (!ptype)
                goto out;

        /* A handler exists: no forced flush from this layer. */
        flush = 0;

        skb_gro_pull(skb, sizeof(*eh));
        skb_gro_postpull_rcsum(skb, eh, sizeof(*eh));

        pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive,
                                            ipv6_gro_receive, inet_gro_receive,
                                            head, skb);

out:
        skb_gro_flush_final(skb, pp, flush);

        return pp;
}
EXPORT_SYMBOL(eth_gro_receive);

/*
 * GRO completion handler for an Ethernet header located at @nhoff.
 *
 * For encapsulated packets the inner MAC header offset is recorded first.
 * Completion is then delegated to the handler registered for the header's
 * protocol, which is given the offset just past the Ethernet header.
 *
 * Returns the handler's result, or -ENOSYS when no handler is registered.
 */
int eth_gro_complete(struct sk_buff *skb, int nhoff)
{
        struct ethhdr *eh = (struct ethhdr *)(skb->data + nhoff);
        __be16 type = eh->h_proto;
        struct packet_offload *ptype;

        if (skb->encapsulation)
                skb_set_inner_mac_header(skb, nhoff);

        ptype = gro_find_complete_by_type(type);
        if (!ptype)
                return -ENOSYS;

        return INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
                                  ipv6_gro_complete, inet_gro_complete,
                                  skb, nhoff + sizeof(*eh));
}
EXPORT_SYMBOL(eth_gro_complete);

/* Offload descriptor registering the Ethernet GRO handlers for protocol
 * type ETH_P_TEB; added to the offload list by eth_offload_init().
 */
static struct packet_offload eth_packet_offload __read_mostly = {
        .type = cpu_to_be16(ETH_P_TEB),
        .priority = 10,
        .callbacks = {
                .gro_receive = eth_gro_receive,
                .gro_complete = eth_gro_complete,
        },
};

/* Boot-time initcall: register eth_packet_offload. Always returns 0. */
static int __init eth_offload_init(void)
{
        dev_add_offload(&eth_packet_offload);

        return 0;
}

fs_initcall(eth_offload_init);

/* Weak default: reports that there is no platform-provided MAC address by
 * returning NULL.  Being __weak, another definition of this symbol takes
 * precedence when one is linked in.
 */
unsigned char * __weak arch_get_platform_mac_address(void)
{
        return NULL;
}

/*
 * eth_platform_get_mac_address - look up a MAC address for a device
 * @dev:      device whose of_node is queried first
 * @mac_addr: output buffer for the ETH_ALEN-byte address
 *
 * Tries of_get_mac_address() first and falls back to
 * arch_get_platform_mac_address().
 *
 * Returns 0 on success, -ENODEV when neither source provides an address.
 */
int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr)
{
        unsigned char *addr;

        if (!of_get_mac_address(dev->of_node, mac_addr))
                return 0;

        addr = arch_get_platform_mac_address();
        if (!addr)
                return -ENODEV;

        ether_addr_copy(mac_addr, addr);

        return 0;
}
EXPORT_SYMBOL(eth_platform_get_mac_address);

/**
 * platform_get_ethdev_address - Set netdev's MAC address from a given device
 * @dev:        Pointer to the device
 * @netdev:        Pointer to netdev to write the address to
 *
 * Wrapper around eth_platform_get_mac_address() which writes the address
 * directly to netdev->dev_addr.  @netdev is left untouched on failure.
 *
 * Return: 0 on success, or the error from eth_platform_get_mac_address().
 */
int platform_get_ethdev_address(struct device *dev, struct net_device *netdev)
{
        u8 addr[ETH_ALEN] __aligned(2);
        int err = eth_platform_get_mac_address(dev, addr);

        if (err)
                return err;

        eth_hw_addr_set(netdev, addr);
        return 0;
}
EXPORT_SYMBOL(platform_get_ethdev_address);

/**
 * nvmem_get_mac_address - Obtain the MAC address from an nvmem cell named
 * 'mac-address' associated with given device.
 *
 * @dev:        Device with which the mac-address cell is associated.
 * @addrbuf:        Buffer to which the MAC address will be copied on success.
 *
 * The cell content is accepted only if it is exactly ETH_ALEN bytes long
 * and a valid Ethernet address; otherwise -EINVAL is returned.
 *
 * Returns 0 on success or a negative error number on failure.
 */
int nvmem_get_mac_address(struct device *dev, void *addrbuf)
{
        struct nvmem_cell *cell;
        const void *mac;
        size_t len;
        int ret = 0;

        cell = nvmem_cell_get(dev, "mac-address");
        if (IS_ERR(cell))
                return PTR_ERR(cell);

        /* The cell handle is not needed once its content has been read. */
        mac = nvmem_cell_read(cell, &len);
        nvmem_cell_put(cell);
        if (IS_ERR(mac))
                return PTR_ERR(mac);

        if (len == ETH_ALEN && is_valid_ether_addr(mac))
                ether_addr_copy(addrbuf, mac);
        else
                ret = -EINVAL;

        /* The buffer returned by the read is freed on both outcomes. */
        kfree(mac);

        return ret;
}

/* Read ETH_ALEN bytes from firmware property @name into @addr.
 * Returns 0 if the property could be read and holds a valid Ethernet
 * address, the read error if reading failed, or -EINVAL if the bytes are
 * not a valid address.
 */
static int fwnode_get_mac_addr(struct fwnode_handle *fwnode,
                               const char *name, char *addr)
{
        int err = fwnode_property_read_u8_array(fwnode, name, addr, ETH_ALEN);

        if (err)
                return err;

        return is_valid_ether_addr(addr) ? 0 : -EINVAL;
}

/**
 * fwnode_get_mac_address - Get the MAC from the firmware node
 * @fwnode:        Pointer to the firmware node
 * @addr:        Address of buffer to store the MAC in
 *
 * Search the firmware node for the best MAC address to use.  'mac-address' is
 * checked first, because that is supposed to contain the "most recent" MAC
 * address. If that isn't set, then 'local-mac-address' is checked next,
 * because that is the default address.  If that isn't set, then the obsolete
 * 'address' is checked, just in case we're using an old device tree.
 *
 * Note that the 'address' property is supposed to contain a virtual address of
 * the register set, but some DTS files have redefined that property to be the
 * MAC address.
 *
 * All-zero MAC addresses are rejected, because those could be properties that
 * exist in the firmware tables, but were not updated by the firmware.  For
 * example, the DTS could define 'mac-address' and 'local-mac-address', with
 * zero MAC addresses.  Some older U-Boots only initialized 'local-mac-address'.
 * In this case, the real MAC is in 'local-mac-address', and 'mac-address'
 * exists but is all zeros.
 *
 * Return: 0 if one of the properties yielded a valid address, -ENOENT
 * otherwise.
 */
int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr)
{
        /* Properties are tried in order of preference; first hit wins. */
        if (!fwnode_get_mac_addr(fwnode, "mac-address", addr))
                return 0;

        if (!fwnode_get_mac_addr(fwnode, "local-mac-address", addr))
                return 0;

        if (!fwnode_get_mac_addr(fwnode, "address", addr))
                return 0;

        return -ENOENT;
}
EXPORT_SYMBOL(fwnode_get_mac_address);

/**
 * device_get_mac_address - Get the MAC for a given device
 * @dev:        Pointer to the device
 * @addr:        Address of buffer to store the MAC in
 *
 * Looks the address up on the firmware node of @dev.
 *
 * Return: the result of fwnode_get_mac_address().
 */
int device_get_mac_address(struct device *dev, char *addr)
{
        struct fwnode_handle *fwnode = dev_fwnode(dev);

        return fwnode_get_mac_address(fwnode, addr);
}
EXPORT_SYMBOL(device_get_mac_address);

/**
 * device_get_ethdev_address - Set netdev's MAC address from a given device
 * @dev:        Pointer to the device
 * @netdev:        Pointer to netdev to write the address to
 *
 * Wrapper around device_get_mac_address() which writes the address
 * directly to netdev->dev_addr.
 */
int device_get_ethdev_address(struct device *dev, struct net_device *netdev)
{
        u8 addr[ETH_ALEN];
        int ret;

        ret = device_get_mac_address(dev, addr);
        if (!ret)
                eth_hw_addr_set(netdev, addr);
        return ret;
}
EXPORT_SYMBOL(device_get_ethdev_address);




























    1 















    1 














    1 
















    1 





    1 




















































































    1 


































    1 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
// SPDX-License-Identifier: GPL-2.0
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/ktime.h>
#include <linux/seq_file.h>
#include <linux/user_namespace.h>
#include <linux/nsfs.h>
#include <linux/uaccess.h>

#include "internal.h"

static struct vfsmount *nsfs_mnt;

/* Forward declaration: ns_ioctl() is defined further down but is needed by
 * the file operations table below.
 */
static long ns_ioctl(struct file *filp, unsigned int ioctl,
                        unsigned long arg);
/* File operations installed on nsfs inodes (see nsfs_init_inode()); also
 * used by proc_ns_file() to recognise namespace files.
 */
static const struct file_operations ns_file_operations = {
        .llseek                = no_llseek,
        .unlocked_ioctl = ns_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
};

/* ->d_dname hook: render the dentry name as "<ns type name>:[<inode number>]". */
static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
{
        struct inode *inode = d_inode(dentry);
        struct ns_common *ns = inode->i_private;

        return dynamic_dname(buffer, buflen, "%s:[%lu]",
                             ns->ops->name, inode->i_ino);
}

/* Dentry operations for nsfs; installed through nsfs_init_fs_context(). */
const struct dentry_operations ns_dentry_operations = {
        .d_delete        = always_delete_dentry,
        .d_dname        = ns_dname,
        .d_prune        = stashed_dentry_prune,
};

/* ->evict_inode hook (see nsfs_ops): clear the inode, then call the ->put()
 * operation of the namespace that was stored in i_private.
 */
static void nsfs_evict(struct inode *inode)
{
        /* Fetch the namespace before the inode is cleared. */
        struct ns_common *ns = inode->i_private;
        clear_inode(inode);
        ns->ops->put(ns);
}

/*
 * Resolve a namespace to an nsfs path.
 *
 * @ns_get_cb is called with @private_data to obtain the namespace.  When it
 * yields one, @path is filled in via path_from_stashed() on the nsfs mount.
 *
 * Returns -ENOENT if the callback yields no namespace, otherwise the result
 * of path_from_stashed().
 */
int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
                     void *private_data)
{
        struct ns_common *ns = ns_get_cb(private_data);

        if (ns)
                return path_from_stashed(&ns->stashed, nsfs_mnt, ns, path);

        return -ENOENT;
}

/* Argument bundle passed by ns_get_path() to ns_get_path_task(). */
struct ns_get_path_task_args {
        /* namespace operations whose ->get() is called */
        const struct proc_ns_operations *ns_ops;
        /* task handed to ->get() */
        struct task_struct *task;
};

static struct ns_common *ns_get_path_task(void *private_data)
{
        struct ns_get_path_task_args *args = private_data;

        return args->ns_ops->get(args->task);
}

/*
 * Fill @path with the nsfs path of the namespace that @ns_ops->get()
 * returns for @task.  Returns whatever ns_get_path_cb() returns.
 */
int ns_get_path(struct path *path, struct task_struct *task,
                  const struct proc_ns_operations *ns_ops)
{
        struct ns_get_path_task_args args;

        args.ns_ops = ns_ops;
        args.task = task;

        return ns_get_path_cb(path, ns_get_path_task, &args);
}

/*
 * Open a file descriptor for a namespace related to @ns.
 *
 * @get_ns maps @ns to the related namespace (or an ERR_PTR).  The related
 * namespace is opened read-only on nsfs and installed on a new O_CLOEXEC
 * descriptor.
 *
 * Returns the new descriptor, or a negative error code; on every failure
 * after the descriptor was reserved it is released again.
 */
int open_related_ns(struct ns_common *ns,
                   struct ns_common *(*get_ns)(struct ns_common *ns))
{
        struct path path = {};
        struct ns_common *relative;
        struct file *f;
        int err;
        int fd;

        fd = get_unused_fd_flags(O_CLOEXEC);
        if (fd < 0)
                return fd;

        relative = get_ns(ns);
        if (IS_ERR(relative)) {
                err = PTR_ERR(relative);
                goto out_put_fd;
        }

        err = path_from_stashed(&relative->stashed, nsfs_mnt, relative, &path);
        if (err < 0)
                goto out_put_fd;

        /* The path reference is dropped whether or not the open worked. */
        f = dentry_open(&path, O_RDONLY, current_cred());
        path_put(&path);
        if (IS_ERR(f)) {
                err = PTR_ERR(f);
                goto out_put_fd;
        }

        fd_install(fd, f);
        return fd;

out_put_fd:
        put_unused_fd(fd);
        return err;
}
EXPORT_SYMBOL_GPL(open_related_ns);

/*
 * ioctl handler for namespace files.
 *
 * NS_GET_USERNS:    open the namespace returned by ns_get_owner()
 * NS_GET_PARENT:    open the parent namespace; -EINVAL if the namespace type
 *                   has no ->get_parent operation
 * NS_GET_NSTYPE:    return the namespace type value
 * NS_GET_OWNER_UID: user namespaces only (-EINVAL otherwise); store the
 *                   owner uid, mapped into the caller's user namespace, at
 *                   the user pointer in @arg
 *
 * Any other request yields -ENOTTY.
 */
static long ns_ioctl(struct file *filp, unsigned int ioctl,
                        unsigned long arg)
{
        struct ns_common *ns = get_proc_ns(file_inode(filp));
        struct user_namespace *user_ns;
        uid_t __user *argp;
        uid_t uid;

        switch (ioctl) {
        case NS_GET_USERNS:
                return open_related_ns(ns, ns_get_owner);

        case NS_GET_PARENT:
                if (ns->ops->get_parent)
                        return open_related_ns(ns, ns->ops->get_parent);
                return -EINVAL;

        case NS_GET_NSTYPE:
                return ns->ops->type;

        case NS_GET_OWNER_UID:
                if (ns->ops->type != CLONE_NEWUSER)
                        return -EINVAL;

                user_ns = container_of(ns, struct user_namespace, ns);
                uid = from_kuid_munged(current_user_ns(), user_ns->owner);
                argp = (uid_t __user *) arg;
                return put_user(uid, argp);

        default:
                return -ENOTTY;
        }
}

/*
 * Format the name of @task's namespace as "<name>:[<inum>]" into @buf.
 *
 * The name is @ns_ops->real_ns_name when set, else @ns_ops->name.  The
 * namespace obtained through ->get() is released with ->put() before
 * returning.
 *
 * Returns the snprintf() result, or -ENOENT if the task has no such
 * namespace.
 */
int ns_get_name(char *buf, size_t size, struct task_struct *task,
                        const struct proc_ns_operations *ns_ops)
{
        struct ns_common *ns = ns_ops->get(task);
        const char *name;
        int res;

        if (!ns)
                return -ENOENT;

        name = ns_ops->real_ns_name;
        if (!name)
                name = ns_ops->name;

        res = snprintf(buf, size, "%s:[%u]", name, ns->inum);
        ns_ops->put(ns);

        return res;
}

/* Returns true if @file uses the nsfs file operations, i.e. it refers to a
 * namespace file.
 */
bool proc_ns_file(const struct file *file)
{
        return file->f_op == &ns_file_operations;
}

/**
 * ns_match() - Returns true if the namespace matches the dev/ino provided.
 * @ns: namespace to check
 * @dev: dev_t to compare against the device number of the nsfs superblock
 * @ino: ino_t to compare against the inode number (inum) of @ns
 *
 * Return: true if @ino equals the namespace's inum and @dev equals the
 * device number of the nsfs mount's superblock.
 */
bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino)
{
        return (ns->inum == ino) && (nsfs_mnt->mnt_sb->s_dev == dev);
}


/* ->show_path hook: print "<ns type name>:[<inode number>]" for @dentry.
 * Always returns 0.
 */
static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        const struct ns_common *ns = inode->i_private;

        seq_printf(seq, "%s:[%lu]", ns->ops->name, inode->i_ino);

        return 0;
}

/* Superblock operations for nsfs; installed by nsfs_init_fs_context(). */
static const struct super_operations nsfs_ops = {
        .statfs = simple_statfs,
        .evict_inode = nsfs_evict,
        .show_path = nsfs_show_path,
};

/* stashed_operations ->init_inode hook: bind a new nsfs inode to the
 * namespace passed in @data.  The inode number is the namespace's inum,
 * read permission bits are added and the nsfs file operations installed.
 * Always returns 0.
 */
static int nsfs_init_inode(struct inode *inode, void *data)
{
        struct ns_common *ns = data;

        inode->i_ino = ns->inum;
        inode->i_mode |= S_IRUGO;
        inode->i_fop = &ns_file_operations;
        inode->i_private = ns;

        return 0;
}

/* stashed_operations ->put_data hook: call the namespace's own ->put()
 * operation on the namespace passed in @data.
 */
static void nsfs_put_data(void *data)
{
        struct ns_common *ns = data;
        ns->ops->put(ns);
}

/* Stashed-dentry callbacks for nsfs; stored in the fs context's s_fs_info
 * by nsfs_init_fs_context().
 */
static const struct stashed_operations nsfs_stashed_ops = {
        .init_inode = nsfs_init_inode,
        .put_data = nsfs_put_data,
};

/* ->init_fs_context hook: set up nsfs as a pseudo filesystem with the nsfs
 * superblock, dentry and stashed operations.
 * Returns 0 on success, -ENOMEM if init_pseudo() fails.
 */
static int nsfs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx;

        ctx = init_pseudo(fc, NSFS_MAGIC);
        if (!ctx)
                return -ENOMEM;

        fc->s_fs_info = (void *)&nsfs_stashed_ops;
        ctx->dops = &ns_dentry_operations;
        ctx->ops = &nsfs_ops;

        return 0;
}

/* Filesystem type descriptor for nsfs; mounted internally by nsfs_init(). */
static struct file_system_type nsfs = {
        .name = "nsfs",
        .init_fs_context = nsfs_init_fs_context,
        .kill_sb = kill_anon_super,
};

/* Boot-time setup: create the internal nsfs mount and keep it in nsfs_mnt.
 * A failure to mount is fatal (panic).  SB_NOUSER is cleared on the
 * resulting superblock.
 */
void __init nsfs_init(void)
{
        nsfs_mnt = kern_mount(&nsfs);
        if (IS_ERR(nsfs_mnt))
                panic("can't set nsfs up\n");
        nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER;
}







































    1 








    1 

    1 


    1 



    1 

    1 






























    1 


    1 












    1 



















    1 

















    1 










    1 
























    1 

















































    1 























    1 
    1 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/skbuff.h>
#include <linux/sctp.h>
#include <net/gso.h>
#include <net/gro.h>

/**
 *        skb_eth_gso_segment - segmentation handler for ethernet protocols.
 *        @skb: buffer to segment
 *        @features: features for the output path (see dev->features)
 *        @type: Ethernet Protocol ID
 *
 *        Walks the registered packet offloads under RCU and hands @skb to
 *        the first one that matches @type and has a gso_segment callback.
 *
 *        Return: the callback's result, or ERR_PTR(-EPROTONOSUPPORT) if no
 *        offload matches.
 */
struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb,
                                    netdev_features_t features, __be16 type)
{
        struct packet_offload *ptype;
        struct sk_buff *segs;

        segs = ERR_PTR(-EPROTONOSUPPORT);

        rcu_read_lock();
        list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) {
                if (ptype->type != type || !ptype->callbacks.gso_segment)
                        continue;

                segs = ptype->callbacks.gso_segment(skb, features);
                break;
        }
        rcu_read_unlock();

        return segs;
}
EXPORT_SYMBOL(skb_eth_gso_segment);

/**
 *        skb_mac_gso_segment - mac layer segmentation handler.
 *        @skb: buffer to segment
 *        @features: features for the output path (see dev->features)
 *
 *        Determines the network protocol of @skb, pulls the link-layer
 *        headers (depth as reported by skb_network_protocol()), and hands
 *        the skb to the first registered offload for that protocol that has
 *        a gso_segment callback.  Afterwards the skb data pointer is pushed
 *        back to the MAC header.
 *
 *        Return: the callback's result, ERR_PTR(-EINVAL) if the protocol
 *        cannot be determined, or ERR_PTR(-EPROTONOSUPPORT) if no offload
 *        matches.
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
                                    netdev_features_t features)
{
        struct packet_offload *ptype;
        struct sk_buff *segs;
        __be16 proto;
        int depth;

        depth = skb->mac_len;
        proto = skb_network_protocol(skb, &depth);
        if (unlikely(!proto))
                return ERR_PTR(-EINVAL);

        __skb_pull(skb, depth);

        segs = ERR_PTR(-EPROTONOSUPPORT);

        rcu_read_lock();
        list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) {
                if (ptype->type != proto || !ptype->callbacks.gso_segment)
                        continue;

                segs = ptype->callbacks.gso_segment(skb, features);
                break;
        }
        rcu_read_unlock();

        /* Undo the pull: move data back to the start of the MAC header. */
        __skb_push(skb, skb->data - skb_mac_header(skb));

        return segs;
}
EXPORT_SYMBOL(skb_mac_gso_segment);
/* Decide from the checksum state whether @skb needs the extra handling done
 * in __skb_gso_segment().
 *
 * openvswitch calls this on rx path, so we need a different check there:
 * on RX only CHECKSUM_NONE qualifies, on TX anything other than
 * CHECKSUM_PARTIAL or CHECKSUM_UNNECESSARY does.
 */
static bool skb_needs_check(const struct sk_buff *skb, bool tx_path)
{
        if (!tx_path)
                return skb->ip_summed == CHECKSUM_NONE;

        return skb->ip_summed != CHECKSUM_PARTIAL &&
               skb->ip_summed != CHECKSUM_UNNECESSARY;
}

/**
 *        __skb_gso_segment - Perform segmentation on skb.
 *        @skb: buffer to segment
 *        @features: features for the output path (see dev->features)
 *        @tx_path: whether it is called in TX path
 *
 *        This function segments the given skb and returns a list of segments.
 *
 *        It may return NULL if the skb requires no segmentation.  This is
 *        only possible when GSO is used for verifying header integrity.
 *
 *        Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
 *
 *        Return: the segment list, NULL, or an ERR_PTR() on failure.
 */
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
                                  netdev_features_t features, bool tx_path)
{
        struct sk_buff *segs;

        if (unlikely(skb_needs_check(skb, tx_path))) {
                /* We're going to init ->check field in TCP or UDP header */
                int err = skb_cow_head(skb, 0);

                if (err < 0)
                        return ERR_PTR(err);
        }

        /* Only report GSO partial support if it will enable us to
         * support segmentation on this frame without needing additional
         * work.
         */
        if (features & NETIF_F_GSO_PARTIAL) {
                struct net_device *dev = skb->dev;
                netdev_features_t partial_features;

                partial_features = NETIF_F_GSO_ROBUST |
                                   (dev->features & dev->gso_partial_features);
                if (!skb_gso_ok(skb, features | partial_features))
                        features &= ~NETIF_F_GSO_PARTIAL;
        }

        BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
                     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));

        SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
        SKB_GSO_CB(skb)->encap_level = 0;

        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        segs = skb_mac_gso_segment(skb, features);
        if (segs == skb)
                return segs;

        /* Segmentation produced something else (and did not fail) although
         * the skb's checksum state still calls for a check: warn about it.
         */
        if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
                skb_warn_bad_offload(skb);

        return segs;
}
EXPORT_SYMBOL(__skb_gso_segment);

/**
 * skb_gso_transport_seglen - Return length of individual segments of a gso packet
 *
 * @skb: GSO skb
 *
 * skb_gso_transport_seglen is used to determine the real size of the
 * individual segments, including Layer4 headers (TCP/UDP).
 *
 * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
 *
 * Return: gso_size plus the length of the transport header(s) counted for
 * the packet's GSO type.
 */
static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
{
        const struct skb_shared_info *shinfo = skb_shinfo(skb);
        unsigned int hdr = 0;

        if (skb->encapsulation) {
                /* Everything between outer and inner transport header ... */
                hdr = skb_inner_transport_header(skb) -
                      skb_transport_header(skb);

                /* ... plus the inner TCP header for TCP GSO types. */
                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
                        hdr += inner_tcp_hdrlen(skb);

                return hdr + shinfo->gso_size;
        }

        if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
                hdr = tcp_hdrlen(skb);
        else if (unlikely(skb_is_gso_sctp(skb)))
                hdr = sizeof(struct sctphdr);
        else if (shinfo->gso_type & SKB_GSO_UDP_L4)
                hdr = sizeof(struct udphdr);

        /* UFO sets gso_size to the size of the fragmentation
         * payload, i.e. the size of the L4 (UDP) header is already
         * accounted for.
         */
        return hdr + shinfo->gso_size;
}

/**
 * skb_gso_network_seglen - Return length of individual segments of a gso packet
 *
 * @skb: GSO skb
 *
 * Size of one segment counted from the network header: the Layer3
 * (IP, IPv6) header plus whatever skb_gso_transport_seglen() reports
 * for the L4 header and payload.
 *
 * The MAC/L2 header is not accounted for.
 */
static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
{
        const unsigned int l3_hdr_len = skb_transport_header(skb) -
                                        skb_network_header(skb);
        const unsigned int l4_seg_len = skb_gso_transport_seglen(skb);

        return l3_hdr_len + l4_seg_len;
}

/**
 * skb_gso_mac_seglen - Return length of individual segments of a gso packet
 *
 * @skb: GSO skb
 *
 * Size of one segment counted from the MAC header: everything between
 * the MAC header and the transport header (MAC/L2 and Layer3 headers)
 * plus whatever skb_gso_transport_seglen() reports for the L4 header
 * and payload.
 */
static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
{
        const unsigned int l2_l3_hdr_len = skb_transport_header(skb) -
                                           skb_mac_header(skb);
        const unsigned int l4_seg_len = skb_gso_transport_seglen(skb);

        return l2_l3_hdr_len + l4_seg_len;
}

/**
 * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
 *
 * There are a couple of instances where we have a GSO skb, and we
 * want to determine what size it would be after it is segmented.
 *
 * We might want to check:
 * -    L3+L4+payload size (e.g. IP forwarding)
 * - L2+L3+L4+payload size (e.g. sanity check before passing to driver)
 *
 * This is a helper to do that correctly considering GSO_BY_FRAGS.
 *
 * @skb: GSO skb
 *
 * @seg_len: The segmented length (from skb_gso_*_seglen). In the
 *           GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
 *
 * @max_len: The maximum permissible length.
 *
 * Returns true if the segmented length <= max length.
 */
static inline bool skb_gso_size_check(const struct sk_buff *skb,
                                      unsigned int seg_len,
                                      unsigned int max_len) {
        const struct skb_shared_info *shinfo = skb_shinfo(skb);
        const struct sk_buff *iter;

        if (shinfo->gso_size != GSO_BY_FRAGS)
                return seg_len <= max_len;

        /* Undo this so we can re-use header sizes */
        seg_len -= GSO_BY_FRAGS;

        skb_walk_frags(skb, iter) {
                if (seg_len + skb_headlen(iter) > max_len)
                        return false;
        }

        return true;
}

/**
 * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
 *
 * @skb: GSO skb
 * @mtu: MTU to validate against
 *
 * Checks whether the segments of @skb, measured from the network
 * header (L3 headers, L4 headers and payload), stay within @mtu.
 * Returns true if they do.
 */
bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)
{
        const unsigned int seg_len = skb_gso_network_seglen(skb);

        return skb_gso_size_check(skb, seg_len, mtu);
}
EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);

/**
 * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
 *
 * @skb: GSO skb
 * @len: length to validate against
 *
 * Checks whether the segments of @skb, measured from the MAC header
 * (L2, L3 and L4 headers plus payload), stay within @len.
 * Returns true if they do.
 */
bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len)
{
        const unsigned int seg_len = skb_gso_mac_seglen(skb);

        return skb_gso_size_check(skb, seg_len, len);
}
EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);










































































































































































































































    1 




























































































































































































































































































































































    1 





    1 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commits routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record delta in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK                - records directory entry unlink
 * - EXT4_FC_TAG_LINK                - records directory entry link
 * - EXT4_FC_TAG_CREAT                - records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE        - records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE        - records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE                - record the inode that should be replayed
 *                                  during recovery. Note that iblocks field is
 *                                  not replayed and instead derived during
 *                                  replay.
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(), which makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
 * tag contains CRC of the contents and TID of the transaction after which
 * this fast commit should be applied. Recovery code replays fast commit
 * logs only if there's at least 1 valid tail present. For every fast commit
 * operation, there is 1 tail. This means, we may end up with multiple tails
 * in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which was
 * actually created as a result of "mv B A" operation) would get deleted. Thus,
 * file named A would be absent when we try to read A. So, this sequence of
 * operations is not idempotent. However, as mentioned above, instead of storing
 * the procedure fast commits store the outcome of each procedure. Thus the fast
 * commit log for above procedure would be as follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 * (w)          (x)                    (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the second
 * replay, we will remove file A (inode 11). But we will create it back and make
 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 * similarly. Thus, by converting a non-idempotent procedure into a series of
 * idempotent outcomes, fast commits ensured idempotence during the replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that if we crash during fast commit replay, after
 *    trying to do recovery again, we will find a file system where fast commit
 *    area is invalid (because new full commit would be found). In order to deal
 *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 *    superblock state is persisted before starting the replay, so that after
 *    the crash, fast commit recovery code can look at that flag and perform
 *    fast commit recovery even if that area is invalidated by later full
 *    commits.
 *
 * 1) Fast commit's commit path locks the entire file system during fast
 *    commit. This has significant performance penalty. Instead of that, we
 *    should use ext4_fc_start/stop_update functions to start inode level
 *    updates from ext4_journal_start/stop. Once we do that we can drop file
 *    system locking during commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
/*
 * Slab cache from which struct ext4_fc_dentry_update nodes are allocated
 * (kmem_cache_alloc()) and to which they are returned (kmem_cache_free()).
 */
static struct kmem_cache *ext4_fc_dentry_cachep;

/*
 * Record the outcome of an I/O on @bh and unlock the buffer.
 *
 * A non-zero @uptodate marks the buffer up to date, zero clears that
 * state. The buffer is unlocked in both cases.
 */
static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");

        if (!uptodate) {
                ext4_debug("%s: Block %lld not up-to-date",
                           __func__, bh->b_blocknr);
                clear_buffer_uptodate(bh);
        } else {
                ext4_debug("%s: Block %lld up-to-date",
                           __func__, bh->b_blocknr);
                set_buffer_uptodate(bh);
        }

        unlock_buffer(bh);
}

/* Forget the logical block range tracked for @inode (start and length). */
static inline void ext4_fc_reset_inode(struct inode *inode)
{
        struct ext4_inode_info *info = EXT4_I(inode);

        info->i_fc_lblk_len = 0;
        info->i_fc_lblk_start = 0;
}

/*
 * Initialise the fast commit state kept in @inode's ext4_inode_info:
 * the tracked block range, the FC_COMMITTING state bit, the update
 * counter, the wait queue and both list links.
 */
void ext4_fc_init_inode(struct inode *inode)
{
        struct ext4_inode_info *info = EXT4_I(inode);

        ext4_fc_reset_inode(inode);
        ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);

        /* Not on any fast commit list yet. */
        INIT_LIST_HEAD(&info->i_fc_list);
        INIT_LIST_HEAD(&info->i_fc_dilist);

        /* No update in flight, nobody waiting. */
        init_waitqueue_head(&info->i_fc_wait);
        atomic_set(&info->i_fc_updates, 0);
}

/*
 * Sleep once on the bit wait queue of @inode's EXT4_STATE_FC_COMMITTING
 * flag.
 *
 * This function must be called with sbi->s_fc_lock held. The lock is
 * dropped before sleeping and is NOT taken again before returning, and
 * the flag is not re-tested here: callers are expected to re-acquire the
 * lock and re-check the state themselves (the callers visible in this
 * file do so by jumping back to their "restart" label).
 */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
        wait_queue_head_t *wq;
        struct ext4_inode_info *ei = EXT4_I(inode);

        /*
         * The word handed to the bit-wait machinery depends on the word
         * size: i_state_flags on builds with BITS_PER_LONG < 64, i_flags
         * otherwise.
         * NOTE(review): presumably this mirrors where the inode state bits
         * are stored on each configuration - confirm against the
         * ext4_*_inode_state() helpers.
         */
#if (BITS_PER_LONG < 64)
        DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_state_flags,
                                EXT4_STATE_FC_COMMITTING);
#else
        DEFINE_WAIT_BIT(wait, &ei->i_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_flags,
                                EXT4_STATE_FC_COMMITTING);
#endif
        lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
        /*
         * Get onto the wait queue while s_fc_lock is still held and only
         * then drop the lock - presumably so that a wake-up issued right
         * after the unlock cannot be missed.
         */
        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        schedule();
        finish_wait(wq, &wait.wq_entry);
}

static bool ext4_fc_disabled(struct super_block *sb)
{
        return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
                (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
}

/*
 * Inform Ext4's fast commit code about the start of an inode update
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. It blocks for as long as the inode is on a
 * fast commit list and is being committed (EXT4_STATE_FC_COMMITTING set),
 * then bumps the inode's count of in-flight updates.
 */
void ext4_fc_start_update(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (ext4_fc_disabled(inode->i_sb))
                return;

        for (;;) {
                spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
                if (list_empty(&ei->i_fc_list) ||
                    !ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
                        break;

                /* Drops s_fc_lock and sleeps; take the lock again and
                 * re-evaluate from scratch.
                 */
                ext4_fc_wait_committing_inode(inode);
        }

        /* Still under s_fc_lock here. */
        atomic_inc(&ei->i_fc_updates);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Mark the end of an inode update. When the count of in-flight updates
 * drops to zero, wake up everyone waiting on the inode's i_fc_wait queue.
 */
void ext4_fc_stop_update(struct inode *inode)
{
        struct ext4_inode_info *info = EXT4_I(inode);

        if (ext4_fc_disabled(inode->i_sb))
                return;

        if (!atomic_dec_and_test(&info->i_fc_updates))
                return;

        wake_up_all(&info->i_fc_wait);
}

/*
 * Remove inode from the fast commit lists. If the inode is being committed
 * we wait until the inode commit is done.
 *
 * Besides unlinking the inode from i_fc_list, the dentry update hanging
 * off i_fc_dilist (if any) is unlinked and freed as well.
 */
void ext4_fc_del(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_fc_dentry_update *creat;

        if (ext4_fc_disabled(inode->i_sb))
                return;

        for (;;) {
                spin_lock(&sbi->s_fc_lock);

                /* Nothing tracked for this inode: nothing to remove. */
                if (list_empty(&ei->i_fc_list) &&
                    list_empty(&ei->i_fc_dilist)) {
                        spin_unlock(&sbi->s_fc_lock);
                        return;
                }

                if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
                        break;

                /* Drops s_fc_lock and sleeps; start over afterwards. */
                ext4_fc_wait_committing_inode(inode);
        }

        if (!list_empty(&ei->i_fc_list))
                list_del_init(&ei->i_fc_list);

        /*
         * Since this inode is getting removed, let's also remove all FC
         * dentry create references, since it is not needed to log it anyways.
         */
        if (list_empty(&ei->i_fc_dilist)) {
                spin_unlock(&sbi->s_fc_lock);
                return;
        }

        creat = list_first_entry(&ei->i_fc_dilist,
                                 struct ext4_fc_dentry_update, fcd_dilist);
        WARN_ON(creat->fcd_op != EXT4_FC_TAG_CREAT);
        list_del_init(&creat->fcd_list);
        list_del_init(&creat->fcd_dilist);

        /* Only a single entry is expected on the per-inode list. */
        WARN_ON(!list_empty(&ei->i_fc_dilist));
        spin_unlock(&sbi->s_fc_lock);

        /* Names longer than DNAME_INLINE_LEN live in their own allocation. */
        if (creat->fcd_name.name && creat->fcd_name.len > DNAME_INLINE_LEN)
                kfree(creat->fcd_name.name);
        kmem_cache_free(ext4_fc_dentry_cachep, creat);
}

/*
 * Mark file system as fast commit ineligible, and record latest
 * ineligible transaction tid. This means until the recorded
 * transaction, commit operation would result in a full jbd2 commit.
 *
 * @sb:     super block of the file system
 * @reason: index into the per-reason ineligibility statistics; must be
 *          in [0, EXT4_FC_REASON_MAX)
 * @handle: journal handle whose transaction tid is recorded. When it is
 *          NULL or an error pointer, the tid of the journal's running
 *          transaction is used instead, or 0 if there is none.
 *
 * Does nothing when fast commits are disabled for @sb. An out-of-range
 * @reason triggers a warning; the file system is still marked ineligible
 * but no statistics counter is touched.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        tid_t tid;

        if (ext4_fc_disabled(sb))
                return;

        ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
        if (handle && !IS_ERR(handle)) {
                tid = handle->h_transaction->t_tid;
        } else {
                /* No usable handle: fall back to the running transaction. */
                read_lock(&sbi->s_journal->j_state_lock);
                tid = sbi->s_journal->j_running_transaction ?
                                sbi->s_journal->j_running_transaction->t_tid : 0;
                read_unlock(&sbi->s_journal->j_state_lock);
        }

        /*
         * Only ever move the recorded tid forward.
         * NOTE(review): this is a plain '<' on tid_t values and so does not
         * account for tid wraparound; jbd2 has tid_gt() for that. Left as is
         * because the reset handling of s_fc_ineligible_tid is outside this
         * view - confirm before changing.
         */
        spin_lock(&sbi->s_fc_lock);
        if (sbi->s_fc_ineligible_tid < tid)
                sbi->s_fc_ineligible_tid = tid;
        spin_unlock(&sbi->s_fc_lock);

        /*
         * Never index the statistics array with a bogus reason: warn and
         * skip the accounting instead of writing out of bounds.
         */
        if (WARN_ON(reason < 0 || reason >= EXT4_FC_REASON_MAX))
                return;
        sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function.
 *
 * The tid of @handle's transaction is compared with the inode's
 * i_sync_tid. If they differ, this is the first time the inode is tracked
 * in this transaction: the inode's fast commit fields are reset,
 * i_sync_tid is updated and __fc_track_fn() is called with update = false.
 * If they are equal, __fc_track_fn() is called with update = true. Based
 * on that, the track function can determine if it needs to track a field
 * for the first time or if it needs to just update the previously tracked
 * value. __fc_track_fn() is called with ei->i_fc_lock held.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list
 * (the staging queue while a full or fast commit is ongoing, the main
 * queue otherwise), unless it is already on one.
 *
 * Returns whatever __fc_track_fn() returned.
 */
static int ext4_fc_track_template(
        handle_t *handle, struct inode *inode,
        int (*__fc_track_fn)(struct inode *, void *, bool),
        void *args, int enqueue)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        const tid_t tid = handle->h_transaction->t_tid;
        bool update;
        int ret;

        mutex_lock(&ei->i_fc_lock);
        update = (ei->i_sync_tid == tid);
        if (!update) {
                /* First sighting in this transaction: start afresh. */
                ext4_fc_reset_inode(inode);
                ei->i_sync_tid = tid;
        }
        ret = __fc_track_fn(inode, args, update);
        mutex_unlock(&ei->i_fc_lock);

        if (enqueue) {
                spin_lock(&sbi->s_fc_lock);
                if (list_empty(&ei->i_fc_list)) {
                        if (sbi->s_journal->j_flags &
                            (JBD2_FULL_COMMIT_ONGOING |
                             JBD2_FAST_COMMIT_ONGOING))
                                list_add_tail(&ei->i_fc_list,
                                              &sbi->s_fc_q[FC_Q_STAGING]);
                        else
                                list_add_tail(&ei->i_fc_list,
                                              &sbi->s_fc_q[FC_Q_MAIN]);
                }
                spin_unlock(&sbi->s_fc_lock);
        }

        return ret;
}

/* Arguments handed to __track_dentry_update() through its void *arg. */
struct __track_dentry_update_args {
        struct dentry *dentry;  /* dentry whose name and parent get recorded */
        int op;                 /* copied into fcd_op; EXT4_FC_TAG_{UNLINK,LINK,CREAT} */
};

/*
 * __track_fn for directory entry updates.
 *
 * Entered with ei->i_fc_lock held. The lock is dropped for the duration of
 * the function and re-taken before every return, so the caller always gets
 * it back. The update argument is not used.
 *
 * A new ext4_fc_dentry_update node describing (op, parent ino, ino, name) is
 * allocated and queued on the staging dentry queue while a full or fast
 * commit is ongoing, and on the main dentry queue otherwise.
 *
 * Returns 0 on success, -EOPNOTSUPP if the parent directory is encrypted and
 * -ENOMEM if an allocation fails. In every failure case the file system is
 * marked fast commit ineligible.
 */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
        struct __track_dentry_update_args *dentry_update = arg;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct dentry *dentry = dentry_update->dentry;
        struct inode *dir = dentry->d_parent->d_inode;
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_dentry_update *node;
        int reason, err, queue;

        mutex_unlock(&ei->i_fc_lock);

        if (IS_ENCRYPTED(dir)) {
                reason = EXT4_FC_REASON_ENCRYPTED_FILENAME;
                err = -EOPNOTSUPP;
                goto ineligible;
        }

        node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
        if (!node) {
                reason = EXT4_FC_REASON_NOMEM;
                err = -ENOMEM;
                goto ineligible;
        }

        node->fcd_op = dentry_update->op;
        node->fcd_parent = dir->i_ino;
        node->fcd_ino = inode->i_ino;

        /* Short names live inside the node, longer ones get their own buffer. */
        if (dentry->d_name.len <= DNAME_INLINE_LEN) {
                memcpy(node->fcd_iname, dentry->d_name.name,
                       dentry->d_name.len);
                node->fcd_name.name = node->fcd_iname;
        } else {
                node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
                if (!node->fcd_name.name) {
                        kmem_cache_free(ext4_fc_dentry_cachep, node);
                        reason = EXT4_FC_REASON_NOMEM;
                        err = -ENOMEM;
                        goto ineligible;
                }
                memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
                       dentry->d_name.len);
        }
        node->fcd_name.len = dentry->d_name.len;
        INIT_LIST_HEAD(&node->fcd_dilist);

        spin_lock(&sbi->s_fc_lock);
        queue = (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
                 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
                FC_Q_STAGING : FC_Q_MAIN;
        list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[queue]);

        /*
         * For creates, also hang the node off the inode's i_fc_dilist. This
         * lets ext4_fc_del() drop the fc_dentry references if the inode is
         * unlinked and evicted before it is ever fsynced, and it lets
         * ext4_fc_commit_dentry_updates() find the inode without walking
         * sbi->s_fc_q.
         */
        if (dentry_update->op == EXT4_FC_TAG_CREAT) {
                WARN_ON(!list_empty(&ei->i_fc_dilist));
                list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
        }
        spin_unlock(&sbi->s_fc_lock);
        mutex_lock(&ei->i_fc_lock);

        return 0;

ineligible:
        ext4_fc_mark_ineligible(sb, reason, NULL);
        mutex_lock(&ei->i_fc_lock);
        return err;
}

/*
 * Record an unlink of @dentry for @inode in the fast commit dentry queue and
 * emit the matching tracepoint. No eligibility checks are done here.
 */
void __ext4_fc_track_unlink(handle_t *handle,
                struct inode *inode, struct dentry *dentry)
{
        struct __track_dentry_update_args args = {
                .dentry = dentry,
                .op = EXT4_FC_TAG_UNLINK,
        };
        int ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                         &args, 0);

        trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
}

/*
 * Track an unlink of @dentry, unless fast commit is disabled or the file
 * system is currently marked fast commit ineligible.
 */
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        if (ext4_fc_disabled(inode->i_sb) ||
            ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
                return;

        __ext4_fc_track_unlink(handle, inode, dentry);
}

/*
 * Record a link of @dentry to @inode in the fast commit dentry queue and
 * emit the matching tracepoint. No eligibility checks are done here.
 */
void __ext4_fc_track_link(handle_t *handle,
        struct inode *inode, struct dentry *dentry)
{
        struct __track_dentry_update_args args = {
                .dentry = dentry,
                .op = EXT4_FC_TAG_LINK,
        };
        int ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                         &args, 0);

        trace_ext4_fc_track_link(handle, inode, dentry, ret);
}

/*
 * Track a link of @dentry, unless fast commit is disabled or the file system
 * is currently marked fast commit ineligible.
 */
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        if (ext4_fc_disabled(inode->i_sb) ||
            ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
                return;

        __ext4_fc_track_link(handle, inode, dentry);
}

/*
 * Record the creation of @dentry for @inode in the fast commit dentry queue
 * and emit the matching tracepoint. No eligibility checks are done here.
 */
void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
                          struct dentry *dentry)
{
        struct __track_dentry_update_args args = {
                .dentry = dentry,
                .op = EXT4_FC_TAG_CREAT,
        };
        int ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                         &args, 0);

        trace_ext4_fc_track_create(handle, inode, dentry, ret);
}

/*
 * Track the creation of @dentry, unless fast commit is disabled or the file
 * system is currently marked fast commit ineligible.
 */
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        if (ext4_fc_disabled(inode->i_sb) ||
            ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
                return;

        __ext4_fc_track_create(handle, inode, dentry);
}

/*
 * __track_fn for inode tracking. On the first call for a transaction
 * (update == false) the tracked data range length is cleared and 0 is
 * returned; on later calls nothing is changed and -EEXIST is returned.
 */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
        if (!update) {
                EXT4_I(inode)->i_fc_lblk_len = 0;
                return 0;
        }

        return -EEXIST;
}

/*
 * Track a modification of @inode and enqueue it for fast commit.
 *
 * Directories are never tracked here, and nothing is done when fast commit
 * is disabled. An inode that journals its data makes the file system fast
 * commit ineligible instead of being tracked. If the file system is already
 * marked ineligible, tracking is skipped.
 */
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
        int ret;

        if (S_ISDIR(inode->i_mode) || ext4_fc_disabled(inode->i_sb))
                return;

        if (ext4_should_journal_data(inode)) {
                ext4_fc_mark_ineligible(inode->i_sb,
                                        EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
                return;
        }

        if (!ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) {
                ret = ext4_fc_track_template(handle, inode, __track_inode,
                                             NULL, 1);
                trace_ext4_fc_track_inode(handle, inode, ret);
        }
}

/*
 * Arguments handed to __track_range() through its void *arg: the first and
 * last logical block of the modified range (end is inclusive, the tracked
 * length is computed as end - start + 1).
 */
struct __track_range_args {
        ext4_lblk_t start, end;
};

/*
 * __track_fn for tracking data updates.
 *
 * Inodes numbered below EXT4_FIRST_INO() are rejected with -ECANCELED.
 * Otherwise, if a non-empty range is already being tracked for this
 * transaction (update is true and i_fc_lblk_len > 0), the tracked range is
 * widened to the smallest range covering both the old range and
 * [start, end]. If not, the tracked range simply becomes [start, end].
 * Returns 0 in both of these cases.
 */
static int __track_range(struct inode *inode, void *arg, bool update)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct __track_range_args *range = arg;
        ext4_lblk_t old_last, new_start, new_last;

        if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
                ext4_debug("Special inode %ld being modified\n", inode->i_ino);
                return -ECANCELED;
        }

        if (update && ei->i_fc_lblk_len > 0) {
                /* Merge with what is already tracked. */
                old_last = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
                new_start = min(ei->i_fc_lblk_start, range->start);
                new_last = max(old_last, range->end);

                ei->i_fc_lblk_start = new_start;
                ei->i_fc_lblk_len = new_last - new_start + 1;
                return 0;
        }

        ei->i_fc_lblk_start = range->start;
        ei->i_fc_lblk_len = range->end - range->start + 1;

        return 0;
}

/*
 * Track a modification of logical blocks [start, end] of @inode and enqueue
 * the inode for fast commit. Directories are skipped, as is everything when
 * fast commit is disabled or the file system is marked fast commit
 * ineligible.
 */
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
                         ext4_lblk_t end)
{
        struct __track_range_args args = {
                .start = start,
                .end = end,
        };
        int ret;

        if (S_ISDIR(inode->i_mode) || ext4_fc_disabled(inode->i_sb))
                return;

        if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
                return;

        ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
        trace_ext4_fc_track_range(handle, inode, start, end, ret);
}

/*
 * Submit the current fast commit buffer (sbi->s_fc_bh) as a synchronous
 * write and forget about it by clearing sbi->s_fc_bh. REQ_FUA and
 * REQ_PREFLUSH are added only for the tail block, and only when the BARRIER
 * mount option is set.
 */
static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
        struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
        blk_opf_t write_flags = REQ_SYNC;

        if (is_tail && test_opt(sb, BARRIER))
                write_flags |= REQ_FUA | REQ_PREFLUSH;

        lock_buffer(bh);
        set_buffer_dirty(bh);
        set_buffer_uptodate(bh);
        bh->b_end_io = ext4_end_buffer_io_sync;
        submit_bh(REQ_OP_WRITE | write_flags, bh);

        EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/*
 * Reserve len bytes in the fast commit area and return where to write them.
 *
 * A fast commit tlv is never split across blocks. Every block always keeps
 * room for one PAD tlv header, so a request fits in the current block only
 * if it leaves EXT4_FC_TAG_BASE_LEN bytes free behind it. When it does not
 * fit, the rest of the current block is covered by an EXT4_FC_TAG_PAD tlv,
 * *crc is updated over the whole block, the block is submitted, and the
 * bytes are reserved at the start of a fresh block obtained from jbd2.
 *
 * Returns a pointer to the reserved bytes, or NULL if len can never fit in a
 * block or if jbd2_fc_get_buf() fails.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int bsize = sbi->s_journal->j_blocksize;
        int off = sbi->s_fc_bytes % bsize;
        /* Bytes usable in the current block with a PAD header still fitting. */
        int remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
        struct buffer_head *bh;
        struct ext4_fc_tl pad;
        u8 *dst;

        /* Too long to fit in any block alongside a PAD tlv. */
        if (len > bsize - EXT4_FC_TAG_BASE_LEN)
                return NULL;

        if (!sbi->s_fc_bh) {
                if (jbd2_fc_get_buf(sbi->s_journal, &bh))
                        return NULL;
                sbi->s_fc_bh = bh;
        }
        dst = sbi->s_fc_bh->b_data + off;

        /* Fast path: the request fits in the current block. */
        if (len <= remaining) {
                sbi->s_fc_bytes += len;
                return dst;
        }

        /*
         * Slow path: close the current block with a PAD tlv and hand out the
         * beginning of a new block.
         */
        pad.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
        pad.fc_len = cpu_to_le16(remaining);
        memcpy(dst, &pad, EXT4_FC_TAG_BASE_LEN);
        memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
        *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);

        ext4_fc_submit_bh(sb, false);

        if (jbd2_fc_get_buf(sbi->s_journal, &bh))
                return NULL;
        sbi->s_fc_bh = bh;
        /* Account for the padded remainder of the old block plus len. */
        sbi->s_fc_bytes += bsize - off + len;

        return sbi->s_fc_bh->b_data;
}

/*
 * Finish a fast commit by writing the tail tag and submitting the block.
 *
 * The tail tag marks the end of a fast commit. To keep commits atomic, the
 * next commit must not reuse whatever space is left in this block, so the
 * tail tlv's length is made to cover the rest of the block and s_fc_bytes is
 * rounded up to the next block boundary.
 *
 * The CRC stored in the tail covers the block up to and including the
 * tail's fc_tid field. Returns 0 on success, -ENOSPC if no space could be
 * reserved.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int bsize = sbi->s_journal->j_blocksize;
        struct ext4_fc_tail tail;
        struct ext4_fc_tl tl;
        u8 *cursor;
        int off;

        /*
         * ext4_fc_reserve_space() moves on to a new block by itself if the
         * tail does not fit in the current one.
         */
        cursor = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail),
                                       &crc);
        if (!cursor)
                return -ENOSPC;

        /* Offset just past the reserved tail within its block. */
        off = sbi->s_fc_bytes % bsize;

        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
        tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
        sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

        memcpy(cursor, &tl, EXT4_FC_TAG_BASE_LEN);
        cursor += EXT4_FC_TAG_BASE_LEN;

        tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
        memcpy(cursor, &tail.fc_tid, sizeof(tail.fc_tid));
        cursor += sizeof(tail.fc_tid);

        crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
                          cursor - (u8 *)sbi->s_fc_bh->b_data);
        tail.fc_crc = cpu_to_le32(crc);
        memcpy(cursor, &tail.fc_crc, sizeof(tail.fc_crc));
        cursor += sizeof(tail.fc_crc);

        /* Don't leak uninitialized memory. */
        memset(cursor, 0, bsize - off);

        ext4_fc_submit_bh(sb, true);

        return 0;
}

/*
 * Append one tlv (tag, len, then len bytes from val) to the fast commit
 * area. Space is obtained from ext4_fc_reserve_space(), which may update
 * *crc when it has to move on to a new block.
 *
 * Returns true if the tlv was added, false if no space could be reserved.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
                           u32 *crc)
{
        struct ext4_fc_tl hdr = {
                .fc_tag = cpu_to_le16(tag),
                .fc_len = cpu_to_le16(len),
        };
        u8 *slot = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);

        if (!slot)
                return false;

        memcpy(slot, &hdr, EXT4_FC_TAG_BASE_LEN);
        memcpy(slot + EXT4_FC_TAG_BASE_LEN, val, len);

        return true;
}

/*
 * Append a dentry tlv for fc_dentry to the fast commit area. The tag is the
 * recorded operation (fcd_op); the value is a struct ext4_fc_dentry_info
 * (parent inode number, inode number) followed by the name bytes.
 *
 * Returns true if the tlv was added, false if no space could be reserved.
 */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
                                   struct ext4_fc_dentry_update *fc_dentry)
{
        struct ext4_fc_dentry_info fcd;
        struct ext4_fc_tl tl;
        int name_len = fc_dentry->fcd_name.len;
        size_t val_len = sizeof(fcd) + name_len;
        u8 *cursor;

        cursor = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + val_len, crc);
        if (!cursor)
                return false;

        tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
        tl.fc_len = cpu_to_le16(val_len);
        fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
        fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);

        memcpy(cursor, &tl, EXT4_FC_TAG_BASE_LEN);
        cursor += EXT4_FC_TAG_BASE_LEN;
        memcpy(cursor, &fcd, sizeof(fcd));
        cursor += sizeof(fcd);
        memcpy(cursor, fc_dentry->fcd_name.name, name_len);

        return true;
}

/*
 * Write the raw on-disk inode to the fast commit area as an
 * EXT4_FC_TAG_INODE tlv. The value is the inode number followed by the raw
 * inode bytes: the full on-disk inode size for inline-data inodes, otherwise
 * EXT4_GOOD_OLD_INODE_SIZE plus i_extra_isize when the file system uses
 * inodes larger than EXT4_GOOD_OLD_INODE_SIZE.
 *
 * Returns 0 on success, the error from ext4_get_inode_loc(), or -ECANCELED
 * if no space could be reserved.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_fc_inode fc_inode;
        struct ext4_fc_tl tl;
        struct ext4_iloc iloc;
        int inode_len;
        int ret;
        u8 *cursor;

        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                return ret;

        if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
                inode_len = EXT4_INODE_SIZE(inode->i_sb);
        else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
                inode_len = EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
        else
                inode_len = EXT4_GOOD_OLD_INODE_SIZE;

        fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
        tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

        cursor = ext4_fc_reserve_space(inode->i_sb,
                EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
        if (!cursor) {
                ret = -ECANCELED;
                goto out;
        }

        memcpy(cursor, &tl, EXT4_FC_TAG_BASE_LEN);
        cursor += EXT4_FC_TAG_BASE_LEN;
        memcpy(cursor, &fc_inode, sizeof(fc_inode));
        cursor += sizeof(fc_inode);
        memcpy(cursor, (u8 *)ext4_raw_inode(&iloc), inode_len);
        ret = 0;
out:
        /* Drop the buffer reference obtained by ext4_get_inode_loc(). */
        brelse(iloc.bh);
        return ret;
}

/*
 * Write the tracked data range of the inode to the fast commit area and
 * update *crc along the way.
 *
 * The tracked range is read and cleared under ei->i_fc_lock. It is then
 * walked with ext4_map_blocks(): unmapped stretches are logged as
 * EXT4_FC_TAG_DEL_RANGE tlvs and mapped ones as EXT4_FC_TAG_ADD_RANGE tlvs
 * carrying an extent.
 *
 * Returns 0 on success (including when nothing is tracked), -ECANCELED if
 * mapping fails and -ENOSPC if a tlv could not be added.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        ext4_lblk_t first_lblk, last_lblk, cur_lblk;
        struct ext4_fc_add_range add_range;
        struct ext4_fc_del_range del_range;
        struct ext4_map_blocks map;
        struct ext4_extent *ex;
        unsigned int max_len;
        int ret;

        /* Snapshot and clear the tracked range. */
        mutex_lock(&ei->i_fc_lock);
        if (ei->i_fc_lblk_len == 0) {
                mutex_unlock(&ei->i_fc_lock);
                return 0;
        }
        first_lblk = ei->i_fc_lblk_start;
        last_lblk = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
        ei->i_fc_lblk_len = 0;
        mutex_unlock(&ei->i_fc_lock);

        cur_lblk = first_lblk;
        ext4_debug("will try writing %d to %d for inode %ld\n",
                   cur_lblk, last_lblk, inode->i_ino);

        while (cur_lblk <= last_lblk) {
                map.m_lblk = cur_lblk;
                map.m_len = last_lblk - cur_lblk + 1;
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret < 0)
                        return -ECANCELED;

                /* Nothing reported for this block, step over it. */
                if (map.m_len == 0) {
                        cur_lblk++;
                        continue;
                }

                if (ret > 0) {
                        /* Limit the number of blocks in one extent */
                        max_len = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
                                EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
                        map.m_len = min(max_len, map.m_len);

                        add_range.fc_ino = cpu_to_le32(inode->i_ino);
                        ex = (struct ext4_extent *)&add_range.fc_ex;
                        ex->ee_block = cpu_to_le32(map.m_lblk);
                        ex->ee_len = cpu_to_le16(map.m_len);
                        ext4_ext_store_pblock(ex, map.m_pblk);
                        if (map.m_flags & EXT4_MAP_UNWRITTEN)
                                ext4_ext_mark_unwritten(ex);
                        else
                                ext4_ext_mark_initialized(ex);
                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
                                             sizeof(add_range),
                                             (u8 *)&add_range, crc))
                                return -ENOSPC;
                } else {
                        /* ret == 0: the stretch is not mapped. */
                        del_range.fc_ino = cpu_to_le32(inode->i_ino);
                        del_range.fc_lblk = cpu_to_le32(map.m_lblk);
                        del_range.fc_len = cpu_to_le32(map.m_len);
                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
                                             sizeof(del_range),
                                             (u8 *)&del_range, crc))
                                return -ENOSPC;
                }

                cur_lblk += map.m_len;
        }

        return 0;
}


/*
 * Submit data for all the fast commit inodes.
 *
 * Walks the main fast commit queue under sbi->s_fc_lock. Each inode is first
 * flagged EXT4_STATE_FC_COMMITTING, then we wait until its i_fc_updates
 * counter reads zero, and finally its data is submitted with
 * jbd2_submit_inode_data().
 *
 * Returns 0 on success or the first error from jbd2_submit_inode_data().
 * s_fc_lock is not held on return in either case.
 */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei;
        int ret = 0;

        spin_lock(&sbi->s_fc_lock);
        list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
                /* Wait for in-flight updates of this inode to drain. */
                while (atomic_read(&ei->i_fc_updates)) {
                        DEFINE_WAIT(wait);

                        prepare_to_wait(&ei->i_fc_wait, &wait,
                                                TASK_UNINTERRUPTIBLE);
                        /*
                         * Re-check after queueing ourselves on i_fc_wait, and
                         * only sleep (with the spinlock dropped) if updates
                         * are still pending.
                         */
                        if (atomic_read(&ei->i_fc_updates)) {
                                spin_unlock(&sbi->s_fc_lock);
                                schedule();
                                spin_lock(&sbi->s_fc_lock);
                        }
                        finish_wait(&ei->i_fc_wait, &wait);
                }
                /* The submission itself runs without s_fc_lock held. */
                spin_unlock(&sbi->s_fc_lock);
                ret = jbd2_submit_inode_data(journal, ei->jinode);
                if (ret)
                        return ret;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        return ret;
}

/*
 * Wait for data writeout to finish for every inode on the main fast commit
 * queue that is flagged EXT4_STATE_FC_COMMITTING. sbi->s_fc_lock protects
 * the queue walk and is dropped around each jbd2_wait_inode_data() call.
 *
 * Returns 0 on success or the first error from jbd2_wait_inode_data().
 * s_fc_lock is not held on return in either case.
 */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei, *next;
        int err;

        spin_lock(&sbi->s_fc_lock);
        list_for_each_entry_safe(ei, next, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                if (ext4_test_inode_state(&ei->vfs_inode,
                                          EXT4_STATE_FC_COMMITTING)) {
                        spin_unlock(&sbi->s_fc_lock);

                        err = jbd2_wait_inode_data(journal, ei->jinode);
                        if (err)
                                return err;
                        spin_lock(&sbi->s_fc_lock);
                }
        }
        spin_unlock(&sbi->s_fc_lock);

        return 0;
}

/*
 * Commit all the directory entry updates.
 *
 * Writes a dentry tlv for every entry on the main dentry queue. For create
 * entries, the inode and its tracked data ranges are written first.
 *
 * Must be called with sbi->s_fc_lock held. The lock is dropped while tlvs
 * are written and is held again on every return path, including errors.
 *
 * Returns 0 on success, -ENOSPC if a dentry tlv could not be added, or the
 * error from ext4_fc_write_inode() / ext4_fc_write_inode_data().
 */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
        struct inode *inode;
        struct ext4_inode_info *ei;
        int ret;

        if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
                return 0;
        list_for_each_entry_safe(fc_dentry, fc_dentry_n,
                                 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
                /* Link and unlink entries need nothing but the dentry tlv. */
                if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
                        spin_unlock(&sbi->s_fc_lock);
                        if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
                                ret = -ENOSPC;
                                goto lock_and_exit;
                        }
                        spin_lock(&sbi->s_fc_lock);
                        continue;
                }
                /*
                 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
                 * corresponding inode pointer
                 */
                WARN_ON(list_empty(&fc_dentry->fcd_dilist));
                ei = list_first_entry(&fc_dentry->fcd_dilist,
                                struct ext4_inode_info, i_fc_dilist);
                inode = &ei->vfs_inode;
                WARN_ON(inode->i_ino != fc_dentry->fcd_ino);

                spin_unlock(&sbi->s_fc_lock);

                /*
                 * We first write the inode and then the create dirent. This
                 * allows the recovery code to create an unnamed inode first
                 * and then link it to a directory entry. This allows us
                 * to use namei.c routines almost as is and simplifies
                 * the recovery code.
                 */
                ret = ext4_fc_write_inode(inode, crc);
                if (ret)
                        goto lock_and_exit;

                ret = ext4_fc_write_inode_data(inode, crc);
                if (ret)
                        goto lock_and_exit;

                if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
                        ret = -ENOSPC;
                        goto lock_and_exit;
                }

                spin_lock(&sbi->s_fc_lock);
        }
        return 0;
lock_and_exit:
        /* Errors are raised with the lock dropped; retake it for the caller. */
        spin_lock(&sbi->s_fc_lock);
        return ret;
}

/*
 * Write out one fast commit.
 *
 * Steps, in order: submit and wait for the data of all queued inodes, flush
 * the file system device if it differs from the journal device, write a
 * HEAD tlv if nothing has been logged yet (s_fc_bytes == 0), write all
 * dentry updates, write data ranges and inode for every inode flagged
 * EXT4_STATE_FC_COMMITTING, and finally write the tail.
 *
 * Returns 0 on success or a negative error from one of the steps.
 */
static int ext4_fc_perform_commit(journal_t *journal)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *iter;
        struct ext4_fc_head head;
        struct inode *inode;
        struct blk_plug plug;
        int ret = 0;
        u32 crc = 0;

        ret = ext4_fc_submit_inode_data_all(journal);
        if (ret)
                return ret;

        ret = ext4_fc_wait_inode_data_all(journal);
        if (ret)
                return ret;

        /*
         * If file system device is different from journal device, issue a cache
         * flush before we start writing fast commit blocks.
         */
        if (journal->j_fs_dev != journal->j_dev)
                blkdev_issue_flush(journal->j_fs_dev);

        blk_start_plug(&plug);
        if (sbi->s_fc_bytes == 0) {
                /*
                 * Add a head tag only if this is the first fast commit
                 * in this TID.
                 */
                head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
                head.fc_tid = cpu_to_le32(
                        sbi->s_journal->j_running_transaction->t_tid);
                if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
                        (u8 *)&head, &crc)) {
                        ret = -ENOSPC;
                        goto out;
                }
        }

        spin_lock(&sbi->s_fc_lock);
        /* Returns with s_fc_lock held, on success and on failure alike. */
        ret = ext4_fc_commit_dentry_updates(journal, &crc);
        if (ret) {
                spin_unlock(&sbi->s_fc_lock);
                goto out;
        }

        list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                inode = &iter->vfs_inode;
                if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
                        continue;

                /* Log writes happen with s_fc_lock dropped. */
                spin_unlock(&sbi->s_fc_lock);
                ret = ext4_fc_write_inode_data(inode, &crc);
                if (ret)
                        goto out;
                ret = ext4_fc_write_inode(inode, &crc);
                if (ret)
                        goto out;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        ret = ext4_fc_write_tail(sb, crc);

out:
        blk_finish_plug(&plug);
        return ret;
}

/*
 * Account the outcome of one fast commit attempt and emit the commit_stop
 * tracepoint.
 *
 * EXT4_FC_STATUS_OK bumps the commit and block counters and folds
 * commit_time into the running average (3/4 old value, 1/4 new sample; the
 * first sample is taken as is). EXT4_FC_STATUS_FAILED counts as both a
 * failed and an ineligible commit, EXT4_FC_STATUS_INELIGIBLE as an
 * ineligible commit, and any other status as a skipped commit.
 */
static void ext4_fc_update_stats(struct super_block *sb, int status,
                                 u64 commit_time, int nblks, tid_t commit_tid)
{
        struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;

        ext4_debug("Fast commit ended with status = %d for tid %u",
                        status, commit_tid);

        if (status == EXT4_FC_STATUS_OK) {
                stats->fc_num_commits++;
                stats->fc_numblks += nblks;
                stats->s_fc_avg_commit_time =
                        likely(stats->s_fc_avg_commit_time) ?
                        (commit_time + stats->s_fc_avg_commit_time * 3) / 4 :
                        commit_time;
        } else if (status == EXT4_FC_STATUS_FAILED) {
                stats->fc_failed_commits++;
                stats->fc_ineligible_commits++;
        } else if (status == EXT4_FC_STATUS_INELIGIBLE) {
                stats->fc_ineligible_commits++;
        } else {
                stats->fc_skipped_commits++;
        }

        trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 *
 * When the JOURNAL_FAST_COMMIT mount option is not set, or when the fast
 * commit cannot even be started, the transaction is completed with
 * jbd2_complete_transaction(). When the file system is marked fast commit
 * ineligible, or writing the fast commit fails part-way, the commit is
 * finished through jbd2_fc_end_commit_fallback().
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int nblks = 0, ret, bsize = journal->j_blocksize;
        int subtid = atomic_read(&sbi->s_fc_subtid);
        int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
        ktime_t start_time, commit_time;

        if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
                return jbd2_complete_transaction(journal, commit_tid);

        trace_ext4_fc_commit_start(sb, commit_tid);

        start_time = ktime_get();

restart_fc:
        ret = jbd2_fc_begin_commit(journal, commit_tid);
        if (ret == -EALREADY) {
                /*
                 * There was an ongoing commit, check if we need to restart.
                 *
                 * Transaction IDs wrap around, so "commit_tid is newer than
                 * j_commit_sequence" must be decided on the signed
                 * difference rather than with a plain '>'.
                 */
                if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
                    (int)(commit_tid - journal->j_commit_sequence) > 0)
                        goto restart_fc;
                ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
                                commit_tid);
                return 0;
        } else if (ret) {
                /*
                 * Commit couldn't start. Just update stats and perform a
                 * full commit.
                 */
                ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
                                commit_tid);
                return jbd2_complete_transaction(journal, commit_tid);
        }

        /*
         * After establishing journal barrier via jbd2_fc_begin_commit(), check
         * if we are fast commit ineligible.
         */
        if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
                status = EXT4_FC_STATUS_INELIGIBLE;
                goto fallback;
        }

        /* Number of fast commit blocks in use before this commit. */
        fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
        ret = ext4_fc_perform_commit(journal);
        if (ret < 0) {
                status = EXT4_FC_STATUS_FAILED;
                goto fallback;
        }
        /* Blocks written by this commit; these are the ones to wait for. */
        nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
        ret = jbd2_fc_wait_bufs(journal, nblks);
        if (ret < 0) {
                status = EXT4_FC_STATUS_FAILED;
                goto fallback;
        }
        atomic_inc(&sbi->s_fc_subtid);
        ret = jbd2_fc_end_commit(journal);
        /*
         * weight the commit time higher than the average time so we
         * don't react too strongly to vast changes in the commit time
         */
        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
        ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
        return ret;

fallback:
        ret = jbd2_fc_end_commit_fallback(journal);
        ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
        return ret;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 *
 * Under sbi->s_fc_lock this:
 *  - removes every inode from the main queue, clears its
 *    EXT4_STATE_FC_COMMITTING flag, wakes up waiters on that bit, and resets
 *    its fast commit state if its i_sync_tid is not newer than tid;
 *  - frees every entry on the main dentry queue;
 *  - moves the staging queues over to the main queues;
 *  - clears the ineligible mark once tid has reached s_fc_ineligible_tid;
 *  - after a full commit, resets s_fc_bytes to 0.
 *
 * Transaction IDs wrap around, so every TID ordering check below is done on
 * the signed difference of the two IDs rather than with '<=' / '>='.
 */
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *iter, *iter_n;
        struct ext4_fc_dentry_update *fc_dentry;

        if (full && sbi->s_fc_bh)
                sbi->s_fc_bh = NULL;

        trace_ext4_fc_cleanup(journal, full, tid);
        jbd2_fc_release_bufs(journal);

        spin_lock(&sbi->s_fc_lock);
        list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
                                 i_fc_list) {
                list_del_init(&iter->i_fc_list);
                ext4_clear_inode_state(&iter->vfs_inode,
                                       EXT4_STATE_FC_COMMITTING);
                /* Wrap-safe "tid >= iter->i_sync_tid". */
                if ((int)(tid - iter->i_sync_tid) >= 0)
                        ext4_fc_reset_inode(&iter->vfs_inode);
                /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
                smp_mb();
#if (BITS_PER_LONG < 64)
                wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
                wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
        }

        while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
                fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
                                             struct ext4_fc_dentry_update,
                                             fcd_list);
                list_del_init(&fc_dentry->fcd_list);
                list_del_init(&fc_dentry->fcd_dilist);
                /* Free the entry with the spinlock dropped. */
                spin_unlock(&sbi->s_fc_lock);

                /* Only names longer than DNAME_INLINE_LEN were kmalloc'ed. */
                if (fc_dentry->fcd_name.name &&
                        fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
                        kfree(fc_dentry->fcd_name.name);
                kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
                spin_lock(&sbi->s_fc_lock);
        }

        /* Whatever was staged during the commit becomes the new main queue. */
        list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
                                &sbi->s_fc_dentry_q[FC_Q_MAIN]);
        list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
                                &sbi->s_fc_q[FC_Q_MAIN]);

        /* Wrap-safe "tid >= sbi->s_fc_ineligible_tid". */
        if ((int)(tid - sbi->s_fc_ineligible_tid) >= 0) {
                sbi->s_fc_ineligible_tid = 0;
                ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
        }

        if (full)
                sbi->s_fc_bytes = 0;
        spin_unlock(&sbi->s_fc_lock);
        trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/*
 * Helper struct for dentry replay routines. Holds a dentry tlv decoded into
 * CPU byte order; filled in by tl_to_darg().
 */
struct dentry_info_args {
        /* NOTE(review): inode_len is not set by tl_to_darg() — confirm users. */
        int parent_ino, dname_len, ino, inode_len;
        char *dname;    /* points into the tlv value buffer, not a copy */
};

/*
 * Same as struct ext4_fc_tl, but uses native endianness fields.
 * ext4_fc_get_tl() fills it in from the little-endian header bytes.
 */
struct ext4_fc_tl_mem {
        u16 fc_tag;     /* tag type, CPU byte order */
        u16 fc_len;     /* length of the value following the header */
};

/*
 * Decode the value of a dentry tag into @darg.
 *
 * The fixed-size part of the value is copied into a local before its
 * fields are read. The name is not copied: darg->dname points straight
 * into @val, so @val has to stay valid for as long as @darg is used.
 * The name length is whatever remains of tl->fc_len after the fixed part.
 */
static inline void tl_to_darg(struct dentry_info_args *darg,
                              struct ext4_fc_tl_mem *tl, u8 *val)
{
        const size_t name_off = offsetof(struct ext4_fc_dentry_info, fc_dname);
        struct ext4_fc_dentry_info hdr;

        memcpy(&hdr, val, sizeof(hdr));

        darg->dname = val + name_off;
        darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
        darg->ino = le32_to_cpu(hdr.fc_ino);
        darg->parent_ino = le32_to_cpu(hdr.fc_parent_ino);
}

/*
 * Read the tag/length header located at @val and store it in @tl in CPU
 * byte order. The EXT4_FC_TAG_BASE_LEN header bytes are copied into a
 * local first rather than being read through a cast of @val.
 */
static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val)
{
        struct ext4_fc_tl raw;

        memcpy(&raw, val, EXT4_FC_TAG_BASE_LEN);
        tl->fc_tag = le16_to_cpu(raw.fc_tag);
        tl->fc_len = le16_to_cpu(raw.fc_len);
}

/*
 * Unlink replay function.
 *
 * Removes the directory entry described by the tag. If either the target
 * inode or its parent directory cannot be loaded there is nothing to undo
 * and 0 is returned. -ENOENT from __ext4_unlink() is also treated as
 * success; any other error is returned to the caller.
 */
static int ext4_fc_replay_unlink(struct super_block *sb,
                                 struct ext4_fc_tl_mem *tl, u8 *val)
{
        struct dentry_info_args darg;
        struct inode *inode, *old_parent;
        struct qstr entry;
        int ret = 0;

        tl_to_darg(&darg, tl, val);

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
                        darg.parent_ino, darg.dname_len);

        entry.len = darg.dname_len;
        entry.name = darg.dname;

        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                ext4_debug("Inode %d not found", darg.ino);
                return 0;
        }

        old_parent = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
        if (IS_ERR(old_parent)) {
                ext4_debug("Dir with inode %d not found", darg.parent_ino);
                goto put_inode;
        }

        ret = __ext4_unlink(old_parent, &entry, inode, NULL);
        /* The entry may already be gone, so -ENOENT is not a failure. */
        if (ret == -ENOENT)
                ret = 0;
        iput(old_parent);
put_inode:
        iput(inode);
        return ret;
}

/*
 * Link @inode into the directory darg->parent_ino under the name
 * darg->dname (darg->dname_len bytes).
 *
 * Shared by the LINK and CREAT replay handlers. A missing parent
 * directory, or a failure to obtain its dentry, is not reported as an
 * error (0 is returned). -EEXIST from __ext4_link() is treated as success.
 * Returns -ENOMEM if the new dentry cannot be allocated, or the error from
 * __ext4_link() otherwise.
 *
 * The caller keeps its own reference on @inode.
 */
static int ext4_fc_replay_link_internal(struct super_block *sb,
                                struct dentry_info_args *darg,
                                struct inode *inode)
{
        struct inode *dir = NULL;
        struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
        struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
        int ret = 0;

        dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
        if (IS_ERR(dir)) {
                ext4_debug("Dir with inode %d not found.", darg->parent_ino);
                dir = NULL;
                goto out;
        }

        /*
         * d_obtain_alias() consumes our reference on dir in every case: on
         * success the reference belongs to the returned dentry, on failure
         * it has already been dropped. dir must therefore never be passed
         * to iput() from here on.
         */
        dentry_dir = d_obtain_alias(dir);
        if (IS_ERR(dentry_dir)) {
                ext4_debug("Failed to obtain dentry");
                dentry_dir = NULL;
                dir = NULL;
                goto out;
        }

        dentry_inode = d_alloc(dentry_dir, &qstr_dname);
        if (!dentry_inode) {
                ext4_debug("Inode dentry not created.");
                ret = -ENOMEM;
                goto out;
        }

        ret = __ext4_link(dir, inode, dentry_inode);
        /*
         * It's possible that link already existed since data blocks
         * for the dir in question got persisted before we crashed OR
         * we replayed this tag and crashed before the entire replay
         * could complete.
         */
        if (ret && ret != -EEXIST) {
                ext4_debug("Failed to link\n");
                goto out;
        }

        ret = 0;
out:
        /* Dropping the directory dentry also drops the reference on dir. */
        if (dentry_dir) {
                d_drop(dentry_dir);
                dput(dentry_dir);
        }
        if (dentry_inode) {
                d_drop(dentry_inode);
                dput(dentry_inode);
        }

        return ret;
}

/*
 * Link replay function.
 *
 * Looks up the inode named by the tag and links it into its parent via
 * ext4_fc_replay_link_internal(). An inode that cannot be loaded is
 * skipped and 0 is returned.
 */
static int ext4_fc_replay_link(struct super_block *sb,
                               struct ext4_fc_tl_mem *tl, u8 *val)
{
        struct dentry_info_args darg;
        struct inode *inode;
        int ret = 0;

        tl_to_darg(&darg, tl, val);
        trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
                        darg.parent_ino, darg.dname_len);

        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
        if (!IS_ERR(inode)) {
                ret = ext4_fc_replay_link_internal(sb, &darg, inode);
                iput(inode);
        } else {
                ext4_debug("Inode not found.");
        }

        return ret;
}

/*
 * Remember that inode @ino was touched during replay. The resulting list
 * is used later to set up the block bitmaps correctly.
 *
 * Each inode number is stored at most once. The backing array grows by
 * EXT4_FC_REPLAY_REALLOC_INCREMENT entries whenever it is full.
 *
 * Returns 0 on success (including when @ino is already recorded) and
 * -ENOMEM if the array cannot be grown; the existing array is left
 * untouched in that case.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
        struct ext4_fc_replay_state *state = &EXT4_SB(sb)->s_fc_replay_state;
        int idx;

        for (idx = 0; idx < state->fc_modified_inodes_used; idx++) {
                if (state->fc_modified_inodes[idx] == ino)
                        return 0;
        }

        if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
                int *grown;

                grown = krealloc(state->fc_modified_inodes,
                                 sizeof(int) * (state->fc_modified_inodes_size +
                                 EXT4_FC_REPLAY_REALLOC_INCREMENT),
                                 GFP_KERNEL);
                if (!grown)
                        return -ENOMEM;
                state->fc_modified_inodes_size +=
                        EXT4_FC_REPLAY_REALLOC_INCREMENT;
                state->fc_modified_inodes = grown;
        }

        state->fc_modified_inodes[state->fc_modified_inodes_used] = ino;
        state->fc_modified_inodes_used++;
        return 0;
}

/*
 * Inode replay function
 *
 * Overwrites the on-disk inode named by the tag with the raw inode image
 * carried in the tag value, writes it out synchronously, marks the inode
 * as in use and then fixes up the block count, generation and checksum
 * from the freshly loaded in-core inode.
 *
 * i_block is deliberately not copied from the tag for extent-mapped
 * inodes: only the part before i_block and the part starting at
 * i_generation are taken, and an extent header is initialised if the
 * on-disk one does not carry the extent magic. Inline-data inodes get
 * i_block copied verbatim.
 *
 * Returns -EFSCORRUPTED if the inode cannot be loaded right after it was
 * written. Every other outcome, including intermediate failures, returns
 * 0; a cache flush is issued only when the last step succeeded.
 *
 * The buffer obtained from ext4_get_fc_inode_loc() is released on every
 * exit path taken after that call succeeded.
 */
static int ext4_fc_replay_inode(struct super_block *sb,
                                struct ext4_fc_tl_mem *tl, u8 *val)
{
        struct ext4_fc_inode fc_inode;
        struct ext4_inode *raw_inode;
        struct ext4_inode *raw_fc_inode;
        struct inode *inode = NULL;
        struct ext4_iloc iloc;
        int inode_len, ino, ret, tag = tl->fc_tag;
        struct ext4_extent_header *eh;
        size_t off_gen = offsetof(struct ext4_inode, i_generation);

        memcpy(&fc_inode, val, sizeof(fc_inode));

        ino = le32_to_cpu(fc_inode.fc_ino);
        trace_ext4_fc_replay(sb, tag, ino, 0, 0);

        /* If the inode currently exists, process its old blocks first. */
        inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
        if (!IS_ERR(inode)) {
                ext4_ext_clear_bb(inode);
                iput(inode);
        }
        /* Keep iput() at "out" safe: inode may hold an ERR_PTR here. */
        inode = NULL;

        ret = ext4_fc_record_modified_inode(sb, ino);
        if (ret)
                goto out;

        raw_fc_inode = (struct ext4_inode *)
                (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
        ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
        if (ret)
                goto out;

        /* From here on iloc.bh is held and must be released before exit. */
        inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
        raw_inode = ext4_raw_inode(&iloc);

        memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
        memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
               inode_len - off_gen);
        if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
                eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
                if (eh->eh_magic != EXT4_EXT_MAGIC) {
                        memset(eh, 0, sizeof(*eh));
                        eh->eh_magic = EXT4_EXT_MAGIC;
                        eh->eh_max = cpu_to_le16(
                                (sizeof(raw_inode->i_block) -
                                 sizeof(struct ext4_extent_header))
                                 / sizeof(struct ext4_extent));
                }
        } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
                memcpy(raw_inode->i_block, raw_fc_inode->i_block,
                        sizeof(raw_inode->i_block));
        }

        /* Immediately update the inode on disk. */
        ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
        if (ret)
                goto out_brelse;
        ret = sync_dirty_buffer(iloc.bh);
        if (ret)
                goto out_brelse;
        ret = ext4_mark_inode_used(sb, ino);
        if (ret)
                goto out_brelse;

        /* Given that we just wrote the inode on disk, this SHOULD succeed. */
        inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                ext4_debug("Inode not found.");
                brelse(iloc.bh);
                return -EFSCORRUPTED;
        }

        /*
         * Our allocator could have made different decisions than before
         * crashing. This should be fixed but until then, we recalculate
         * the number of blocks of the inode.
         */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
                ext4_ext_replay_set_iblocks(inode);

        inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
        ext4_reset_inode_seed(inode);

        ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
        ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
        sync_dirty_buffer(iloc.bh);
out_brelse:
        brelse(iloc.bh);
out:
        iput(inode);
        if (!ret)
                blkdev_issue_flush(sb->s_bdev);

        return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
 * inode for which we are trying to create a dentry here, should already have
 * been replayed before we start here.
 *
 * Marks the inode as in use, initialises "." and ".." when the inode is a
 * directory, links the inode into its parent and sets its link count to 1.
 *
 * Returns the error from ext4_mark_inode_used() or from
 * ext4_fc_replay_link_internal(), -EINVAL if the inode cannot be loaded,
 * and 0 otherwise. A missing parent directory or a failure to initialise
 * a new directory ends the replay of this tag early but still returns 0.
 */
static int ext4_fc_replay_create(struct super_block *sb,
                                 struct ext4_fc_tl_mem *tl, u8 *val)
{
        int ret = 0;
        struct inode *inode = NULL;
        struct inode *dir = NULL;
        struct dentry_info_args darg;

        tl_to_darg(&darg, tl, val);

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
                        darg.parent_ino, darg.dname_len);

        /* This takes care of update group descriptor and other metadata */
        ret = ext4_mark_inode_used(sb, darg.ino);
        if (ret)
                goto out;

        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                ext4_debug("inode %d not found.", darg.ino);
                /* iput() at "out" must not see an ERR_PTR. */
                inode = NULL;
                ret = -EINVAL;
                goto out;
        }

        if (S_ISDIR(inode->i_mode)) {
                /*
                 * If we are creating a directory, we need to make sure that the
                 * dot and dot dot dirents are setup properly.
                 */
                dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
                if (IS_ERR(dir)) {
                        /* Report the inode that was actually looked up. */
                        ext4_debug("Dir %d not found.", darg.parent_ino);
                        goto out;
                }
                ret = ext4_init_new_dir(NULL, dir, inode);
                iput(dir);
                if (ret) {
                        /* Not reported to the caller; skip linking. */
                        ret = 0;
                        goto out;
                }
        }
        ret = ext4_fc_replay_link_internal(sb, &darg, inode);
        if (ret)
                goto out;
        set_nlink(inode, 1);
        ext4_mark_inode_dirty(NULL, inode);
out:
        iput(inode);
        return ret;
}

/*
 * Record a physical disk region that is in use according to the fast
 * commit area, or that an inode uses during the replay phase. The simple
 * allocator used during replay keeps away from these regions.
 *
 * When @replay is non-zero the used count is first pulled back to the
 * valid count, because the two may differ during the replay phase, and
 * the valid count is bumped along with the new entry.
 *
 * Returns 0 on success or -ENOMEM if the region array cannot be grown.
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
                ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
        struct ext4_fc_replay_state *state = &EXT4_SB(sb)->s_fc_replay_state;
        struct ext4_fc_alloc_region *slot;

        if (replay && state->fc_regions_used != state->fc_regions_valid)
                state->fc_regions_used = state->fc_regions_valid;

        if (state->fc_regions_used == state->fc_regions_size) {
                struct ext4_fc_alloc_region *grown;

                grown = krealloc(state->fc_regions,
                                 sizeof(struct ext4_fc_alloc_region) *
                                 (state->fc_regions_size +
                                  EXT4_FC_REPLAY_REALLOC_INCREMENT),
                                 GFP_KERNEL);
                if (!grown)
                        return -ENOMEM;
                state->fc_regions = grown;
                state->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT;
        }

        slot = &state->fc_regions[state->fc_regions_used];
        state->fc_regions_used++;

        slot->len = len;
        slot->pblk = pblk;
        slot->lblk = lblk;
        slot->ino = ino;

        if (replay)
                state->fc_regions_valid++;

        return 0;
}

/*
 * Replay add range tag
 *
 * Makes the logical range described by the extent in the tag map to the
 * physical blocks recorded in that extent. The range is walked piece by
 * piece using ext4_map_blocks(); each piece falls into one of three cases:
 * not mapped at all (a new extent is inserted), mapped to different
 * physical blocks (the mapping is rewritten and the old blocks are marked
 * free), or mapped to the right blocks (only the written/unwritten state
 * is updated).
 *
 * Always returns 0: a missing inode or a failure in any helper ends the
 * replay of this tag early but is not reported to the caller.
 */
static int ext4_fc_replay_add_range(struct super_block *sb,
                                    struct ext4_fc_tl_mem *tl, u8 *val)
{
        struct ext4_fc_add_range fc_add_ex;
        struct ext4_extent newex, *ex;
        struct inode *inode;
        ext4_lblk_t start, cur;
        int remaining, len;
        ext4_fsblk_t start_pblk;
        struct ext4_map_blocks map;
        struct ext4_ext_path *path = NULL;
        int ret;

        /* Work on a local copy of the tag value rather than on val itself. */
        memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
        ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
                le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
                ext4_ext_get_actual_len(ex));

        inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                ext4_debug("Inode not found.");
                return 0;
        }

        ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
        if (ret)
                goto out;

        start = le32_to_cpu(ex->ee_block);
        start_pblk = ext4_ext_pblock(ex);
        len = ext4_ext_get_actual_len(ex);

        cur = start;
        remaining = len;
        ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
                  start, start_pblk, len, ext4_ext_is_unwritten(ex),
                  inode->i_ino);

        while (remaining > 0) {
                /* Ask how the rest of the range is currently mapped. */
                map.m_lblk = cur;
                map.m_len = remaining;
                map.m_pblk = 0;
                ret = ext4_map_blocks(NULL, inode, &map, 0);

                if (ret < 0)
                        goto out;

                if (ret == 0) {
                        /* Range is not mapped */
                        path = ext4_find_extent(inode, cur, NULL, 0);
                        if (IS_ERR(path))
                                goto out;
                        /*
                         * Build an extent covering map.m_len blocks at cur,
                         * at the matching offset into the recorded physical
                         * range, with the same unwritten state as the tag.
                         */
                        memset(&newex, 0, sizeof(newex));
                        newex.ee_block = cpu_to_le32(cur);
                        ext4_ext_store_pblock(
                                &newex, start_pblk + cur - start);
                        newex.ee_len = cpu_to_le16(map.m_len);
                        if (ext4_ext_is_unwritten(ex))
                                ext4_ext_mark_unwritten(&newex);
                        /* The extent tree is modified under i_data_sem. */
                        down_write(&EXT4_I(inode)->i_data_sem);
                        ret = ext4_ext_insert_extent(
                                NULL, inode, &path, &newex, 0);
                        up_write((&EXT4_I(inode)->i_data_sem));
                        ext4_free_ext_path(path);
                        if (ret)
                                goto out;
                        goto next;
                }

                if (start_pblk + cur - start != map.m_pblk) {
                        /*
                         * Logical to physical mapping changed. This can happen
                         * if this range was removed and then reallocated to
                         * map to new physical blocks during a fast commit.
                         */
                        ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
                                        ext4_ext_is_unwritten(ex),
                                        start_pblk + cur - start);
                        if (ret)
                                goto out;
                        /*
                         * Mark the old blocks as free since they aren't used
                         * anymore. We maintain an array of all the modified
                         * inodes. In case these blocks are still used at either
                         * a different logical range in the same inode or in
                         * some different inode, we will mark them as allocated
                         * at the end of the FC replay using our array of
                         * modified inodes.
                         */
                        ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
                        goto next;
                }

                /* Range is mapped and needs a state change */
                ext4_debug("Converting from %ld to %d %lld",
                                map.m_flags & EXT4_MAP_UNWRITTEN,
                        ext4_ext_is_unwritten(ex), map.m_pblk);
                ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
                                        ext4_ext_is_unwritten(ex), map.m_pblk);
                if (ret)
                        goto out;
                /*
                 * We may have split the extent tree while toggling the state.
                 * Try to shrink the extent tree now.
                 */
                ext4_ext_replay_shrink_inode(inode, start + len);
next:
                /* Advance by however many blocks this step handled. */
                cur += map.m_len;
                remaining -= map.m_len;
        }
        ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
                                        sb->s_blocksize_bits);
out:
        iput(inode);
        return 0;
}

/*
 * Replay DEL_RANGE tag.
 *
 * First walks the logical range and marks every currently mapped piece as
 * free in the block bitmaps, then removes the range from the extent tree
 * under i_data_sem and shrinks the inode to its current size.
 *
 * Always returns 0: a missing inode or a failing helper ends the replay of
 * this tag early without reporting an error.
 */
static int
ext4_fc_replay_del_range(struct super_block *sb,
                         struct ext4_fc_tl_mem *tl, u8 *val)
{
        struct ext4_fc_del_range lrange;
        struct ext4_map_blocks map;
        struct inode *inode;
        ext4_lblk_t first, count;
        ext4_lblk_t cur, remaining;
        int ret;

        memcpy(&lrange, val, sizeof(lrange));
        first = le32_to_cpu(lrange.fc_lblk);
        count = le32_to_cpu(lrange.fc_len);
        cur = first;
        remaining = count;

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
                le32_to_cpu(lrange.fc_ino), cur, remaining);

        inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
                return 0;
        }

        ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
        if (ret)
                goto out;

        ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
                        inode->i_ino, first, count);

        while (remaining > 0) {
                ext4_lblk_t step;

                map.m_lblk = cur;
                map.m_len = remaining;

                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret < 0)
                        goto out;

                if (ret > 0) {
                        /* Mapped piece: release its physical blocks. */
                        ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
                        step = ret;
                } else {
                        /* Unmapped piece: skip over it. */
                        step = map.m_len;
                }
                remaining -= step;
                cur += step;
        }

        down_write(&EXT4_I(inode)->i_data_sem);
        ret = ext4_ext_remove_space(inode, first, first + count - 1);
        up_write(&EXT4_I(inode)->i_data_sem);
        if (ret)
                goto out;

        ext4_ext_replay_shrink_inode(inode,
                i_size_read(inode) >> sb->s_blocksize_bits);
        ext4_mark_inode_dirty(NULL, inode);
out:
        iput(inode);
        return 0;
}

/*
 * Walk every inode recorded as modified during replay and mark all blocks
 * it currently maps as in use. For each mapped piece, the blocks recorded
 * along the extent path leading to it are marked as well.
 *
 * Inodes that cannot be loaded and inodes with inline data are skipped.
 * A mapping error stops the walk of the current inode only.
 */
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
        struct ext4_fc_replay_state *state = &EXT4_SB(sb)->s_fc_replay_state;
        struct ext4_ext_path *path = NULL;
        struct ext4_map_blocks map;
        struct inode *inode;
        ext4_lblk_t cur, end;
        int i, j, ret;

        for (i = 0; i < state->fc_modified_inodes_used; i++) {
                inode = ext4_iget(sb, state->fc_modified_inodes[i],
                        EXT4_IGET_NORMAL);
                if (IS_ERR(inode)) {
                        ext4_debug("Inode %d not found.",
                                state->fc_modified_inodes[i]);
                        continue;
                }
                if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
                        goto put_inode;

                end = EXT_MAX_BLOCKS;
                for (cur = 0; cur < end; ) {
                        map.m_lblk = cur;
                        map.m_len = end - cur;

                        ret = ext4_map_blocks(NULL, inode, &map, 0);
                        if (ret < 0)
                                break;

                        if (ret == 0) {
                                /* Hole: step over it, by at least one block. */
                                cur += map.m_len ? map.m_len : 1;
                                continue;
                        }

                        path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
                        if (!IS_ERR(path)) {
                                for (j = 0; j < path->p_depth; j++)
                                        ext4_mb_mark_bb(inode->i_sb,
                                                path[j].p_block, 1, true);
                                ext4_free_ext_path(path);
                        }
                        cur += ret;
                        ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
                                        map.m_len, true);
                }
put_inode:
                iput(inode);
        }
}

/*
 * Check whether @blk lies inside one of the regions excluded from block
 * allocation. The simple allocator that runs during the replay phase calls
 * this function to see if it is okay to use a block.
 *
 * Only the first fc_regions_valid entries are considered, and entries with
 * a zero inode number or zero length are ignored.
 *
 * Returns true if @blk must not be allocated, false otherwise.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
        struct ext4_fc_replay_state *state = &EXT4_SB(sb)->s_fc_replay_state;
        struct ext4_fc_alloc_region *region;
        int i;

        for (i = 0; i < state->fc_regions_valid; i++) {
                region = &state->fc_regions[i];
                if (region->ino != 0 && region->len != 0 &&
                    in_range(blk, region->pblk, region->len))
                        return true;
        }
        return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        sbi->s_mount_state &= ~EXT4_FC_REPLAY;
        kfree(sbi->s_fc_replay_state.fc_regions);
        kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

/*
 * Check that a tag's value length @len is acceptable for tag type @tag.
 *
 * Fixed-size tags must match their structure size exactly. Dentry tags
 * must carry 1..EXT4_NAME_LEN name bytes after the fixed part. Inode tags
 * must carry between EXT4_GOOD_OLD_INODE_SIZE and sbi->s_inode_size bytes
 * of raw inode. Tail tags need at least a full tail structure, and padding
 * may have any length. Unknown tags are never valid.
 */
static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
                                      int tag, int len)
{
        bool valid;

        switch (tag) {
        case EXT4_FC_TAG_ADD_RANGE:
                valid = (len == sizeof(struct ext4_fc_add_range));
                break;
        case EXT4_FC_TAG_DEL_RANGE:
                valid = (len == sizeof(struct ext4_fc_del_range));
                break;
        case EXT4_FC_TAG_CREAT:
        case EXT4_FC_TAG_LINK:
        case EXT4_FC_TAG_UNLINK:
                /* What is left after the fixed part is the entry name. */
                len -= sizeof(struct ext4_fc_dentry_info);
                valid = (len >= 1 && len <= EXT4_NAME_LEN);
                break;
        case EXT4_FC_TAG_INODE:
                /* What is left after the fixed part is the raw inode. */
                len -= sizeof(struct ext4_fc_inode);
                valid = (len >= EXT4_GOOD_OLD_INODE_SIZE &&
                         len <= sbi->s_inode_size);
                break;
        case EXT4_FC_TAG_PAD:
                valid = true; /* padding can have any length */
                break;
        case EXT4_FC_TAG_TAIL:
                valid = (len >= sizeof(struct ext4_fc_tail));
                break;
        case EXT4_FC_TAG_HEAD:
                valid = (len == sizeof(struct ext4_fc_head));
                break;
        default:
                valid = false;
                break;
        }

        return valid;
}

/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that scan has finished and JBD2 can now start replay phase.
 * It returns a negative error to indicate that there was an error. At the end
 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
 * to indicate the number of tags that need to be replayed during the replay
 * phase.
 *
 * @journal:      journal being recovered; its j_private is the super block
 * @bh:           buffer holding the fast commit block to scan
 * @off:          index of this block; must equal the expected offset kept
 *                in the replay state
 * @expected_tid: transaction id the head and tail tags must carry
 */
static int ext4_fc_replay_scan(journal_t *journal,
                                struct buffer_head *bh, int off,
                                tid_t expected_tid)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_replay_state *state;
        int ret = JBD2_FC_REPLAY_CONTINUE;
        struct ext4_fc_add_range ext;
        struct ext4_fc_tl_mem tl;
        struct ext4_fc_tail tail;
        __u8 *start, *end, *cur, *val;
        struct ext4_fc_head head;
        struct ext4_extent *ex;

        state = &sbi->s_fc_replay_state;

        start = (u8 *)bh->b_data;
        end = start + journal->j_blocksize;

        /* First block of the scan: start from a clean state. */
        if (state->fc_replay_expected_off == 0) {
                state->fc_cur_tag = 0;
                state->fc_replay_num_tags = 0;
                state->fc_crc = 0;
                state->fc_regions = NULL;
                state->fc_regions_valid = state->fc_regions_used =
                        state->fc_regions_size = 0;
                /* Check if we can stop early */
                if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
                        != EXT4_FC_TAG_HEAD)
                        return 0;
        }

        /* Blocks have to be presented in order. */
        if (off != state->fc_replay_expected_off) {
                ret = -EFSCORRUPTED;
                goto out_err;
        }

        state->fc_replay_expected_off++;
        /* Walk the tags in this block; each step skips header plus value. */
        for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
             cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
                ext4_fc_get_tl(&tl, cur);
                val = cur + EXT4_FC_TAG_BASE_LEN;
                /*
                 * A value that runs past the end of the block, or whose
                 * length does not fit its tag, ends the scan. If an earlier
                 * tail tag already set fc_replay_num_tags, replay what was
                 * counted so far; otherwise report -ECANCELED.
                 */
                if (tl.fc_len > end - val ||
                    !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
                        ret = state->fc_replay_num_tags ?
                                JBD2_FC_REPLAY_STOP : -ECANCELED;
                        goto out_err;
                }
                ext4_debug("Scan phase, tag:%s, blk %lld\n",
                           tag2str(tl.fc_tag), bh->b_blocknr);
                switch (tl.fc_tag) {
                case EXT4_FC_TAG_ADD_RANGE:
                        /* Record the extent's blocks as an excluded region. */
                        memcpy(&ext, val, sizeof(ext));
                        ex = (struct ext4_extent *)&ext.fc_ex;
                        ret = ext4_fc_record_regions(sb,
                                le32_to_cpu(ext.fc_ino),
                                le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
                                ext4_ext_get_actual_len(ex), 0);
                        if (ret < 0)
                                break;
                        ret = JBD2_FC_REPLAY_CONTINUE;
                        fallthrough;
                case EXT4_FC_TAG_DEL_RANGE:
                case EXT4_FC_TAG_LINK:
                case EXT4_FC_TAG_UNLINK:
                case EXT4_FC_TAG_CREAT:
                case EXT4_FC_TAG_INODE:
                case EXT4_FC_TAG_PAD:
                        /* Count the tag and fold header + value into the CRC. */
                        state->fc_cur_tag++;
                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
                                EXT4_FC_TAG_BASE_LEN + tl.fc_len);
                        break;
                case EXT4_FC_TAG_TAIL:
                        state->fc_cur_tag++;
                        memcpy(&tail, val, sizeof(tail));
                        /* The CRC covers the tail only up to its fc_crc field. */
                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
                                                EXT4_FC_TAG_BASE_LEN +
                                                offsetof(struct ext4_fc_tail,
                                                fc_crc));
                        if (le32_to_cpu(tail.fc_tid) == expected_tid &&
                                le32_to_cpu(tail.fc_crc) == state->fc_crc) {
                                /*
                                 * Tail checks out: everything scanned so far
                                 * becomes replayable and the regions recorded
                                 * so far become valid.
                                 */
                                state->fc_replay_num_tags = state->fc_cur_tag;
                                state->fc_regions_valid =
                                        state->fc_regions_used;
                        } else {
                                ret = state->fc_replay_num_tags ?
                                        JBD2_FC_REPLAY_STOP : -EFSBADCRC;
                        }
                        /* The running CRC restarts after every tail tag. */
                        state->fc_crc = 0;
                        break;
                case EXT4_FC_TAG_HEAD:
                        memcpy(&head, val, sizeof(head));
                        /* Refuse feature bits this code does not know. */
                        if (le32_to_cpu(head.fc_features) &
                                ~EXT4_FC_SUPPORTED_FEATURES) {
                                ret = -EOPNOTSUPP;
                                break;
                        }
                        /* A head for another transaction ends the scan. */
                        if (le32_to_cpu(head.fc_tid) != expected_tid) {
                                ret = JBD2_FC_REPLAY_STOP;
                                break;
                        }
                        state->fc_cur_tag++;
                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
                                EXT4_FC_TAG_BASE_LEN + tl.fc_len);
                        break;
                default:
                        /* Unknown tag: same policy as an invalid length. */
                        ret = state->fc_replay_num_tags ?
                                JBD2_FC_REPLAY_STOP : -ECANCELED;
                }
                if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
                        break;
        }

out_err:
        trace_ext4_fc_replay_scan(sb, ret, off);
        return ret;
}

/*
 * Main recovery path entry point.
 * The meaning of return codes is similar as above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
                                enum passtype pass, int off, tid_t expected_tid)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
        struct ext4_fc_tl_mem tl;
        struct ext4_fc_tail tail;
        __u8 *blk_start, *blk_end, *pos, *payload;
        int ret = JBD2_FC_REPLAY_CONTINUE;

        /* The scan pass is delegated wholesale to ext4_fc_replay_scan(). */
        if (pass == PASS_SCAN) {
                state->fc_current_pass = PASS_SCAN;
                return ext4_fc_replay_scan(journal, bh, off, expected_tid);
        }

        /* On a change of pass, remember it and set EXT4_FC_REPLAY. */
        if (pass != state->fc_current_pass) {
                state->fc_current_pass = pass;
                sbi->s_mount_state |= EXT4_FC_REPLAY;
        }

        /* No tags (left) to replay: fix up bitmaps and counters and leave. */
        if (state->fc_replay_num_tags == 0) {
                ext4_debug("Replay stops\n");
                ext4_fc_set_bitmaps_and_counters(sb);
                return 0;
        }

#ifdef CONFIG_EXT4_DEBUG
        if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
                pr_warn("Dropping fc block %d because max_replay set\n", off);
                return JBD2_FC_REPLAY_STOP;
        }
#endif

        blk_start = (u8 *)bh->b_data;
        blk_end = blk_start + journal->j_blocksize;
        pos = blk_start;

        /*
         * Walk the tag/length/value records in this block. A record is only
         * looked at while a full tag header still fits before the block end.
         */
        while (pos <= blk_end - EXT4_FC_TAG_BASE_LEN) {
                ext4_fc_get_tl(&tl, pos);
                payload = pos + EXT4_FC_TAG_BASE_LEN;

                /* The tag budget is used up: stop the replay here. */
                if (!state->fc_replay_num_tags) {
                        ret = JBD2_FC_REPLAY_STOP;
                        ext4_fc_set_bitmaps_and_counters(sb);
                        break;
                }

                ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
                state->fc_replay_num_tags--;

                switch (tl.fc_tag) {
                case EXT4_FC_TAG_HEAD:
                        /* Nothing to do for a head tag during replay. */
                        break;
                case EXT4_FC_TAG_PAD:
                        trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
                                             tl.fc_len, 0);
                        break;
                case EXT4_FC_TAG_TAIL:
                        trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
                                             0, tl.fc_len, 0);
                        /* Copy out the tail and sanity-check its tid. */
                        memcpy(&tail, payload, sizeof(tail));
                        WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
                        break;
                case EXT4_FC_TAG_INODE:
                        ret = ext4_fc_replay_inode(sb, &tl, payload);
                        break;
                case EXT4_FC_TAG_CREAT:
                        ret = ext4_fc_replay_create(sb, &tl, payload);
                        break;
                case EXT4_FC_TAG_LINK:
                        ret = ext4_fc_replay_link(sb, &tl, payload);
                        break;
                case EXT4_FC_TAG_UNLINK:
                        ret = ext4_fc_replay_unlink(sb, &tl, payload);
                        break;
                case EXT4_FC_TAG_ADD_RANGE:
                        ret = ext4_fc_replay_add_range(sb, &tl, payload);
                        break;
                case EXT4_FC_TAG_DEL_RANGE:
                        ret = ext4_fc_replay_del_range(sb, &tl, payload);
                        break;
                default:
                        /* Unknown tag: trace it and cancel the replay. */
                        trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
                        ret = -ECANCELED;
                        break;
                }

                if (ret < 0)
                        break;

                /* Non-negative handler results all collapse to "continue". */
                ret = JBD2_FC_REPLAY_CONTINUE;
                pos = payload + tl.fc_len;
        }

        return ret;
}

void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
        /*
         * The replay callback is installed unconditionally: the journal may
         * still contain fast commit blocks that need to be replayed even if
         * fast commit has since been turned off.
         */
        journal->j_fc_replay_callback = ext4_fc_replay;

        /* The cleanup callback is only installed when fast commit is on. */
        if (test_opt2(sb, JOURNAL_FAST_COMMIT))
                journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

/*
 * Human-readable label for each EXT4_FC_REASON_* value, indexed by that
 * value. ext4_fc_info_show() prints one row per index below
 * EXT4_FC_REASON_MAX.
 * NOTE(review): a reason value without an entry here would leave a NULL
 * slot that is handed to "%s" -- confirm the table covers every value.
 */
static const char * const fc_ineligible_reasons[] = {
        [EXT4_FC_REASON_XATTR] = "Extended attributes changed",
        [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
        [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
        [EXT4_FC_REASON_NOMEM] = "Insufficient memory",
        [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
        [EXT4_FC_REASON_RESIZE] = "Resize",
        [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
        [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
        [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
        [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
        struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
        struct ext4_fc_stats *stats = &sbi->s_fc_stats;
        int i;

        if (v != SEQ_START_TOKEN)
                return 0;

        seq_printf(seq,
                "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
                   stats->fc_num_commits, stats->fc_ineligible_commits,
                   stats->fc_numblks,
                   div_u64(stats->s_fc_avg_commit_time, 1000));
        seq_puts(seq, "Ineligible reasons:\n");
        for (i = 0; i < EXT4_FC_REASON_MAX; i++)
                seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
                        stats->fc_ineligible_reason_count[i]);

        return 0;
}

/*
 * Create the slab cache for struct ext4_fc_dentry_update objects.
 *
 * Returns 0 on success, or -ENOMEM if the cache could not be created.
 */
int __init ext4_fc_init_dentry_cache(void)
{
        ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
                                           SLAB_RECLAIM_ACCOUNT);

        return ext4_fc_dentry_cachep ? 0 : -ENOMEM;
}

/* Destroy the slab cache set up by ext4_fc_init_dentry_cache(). */
void ext4_fc_destroy_dentry_cache(void)
{
        kmem_cache_destroy(ext4_fc_dentry_cachep);
}



















































































































    2 
















    1 

    1 





























































    1 














































































































































    1 










    2 







    2 






















    2 
    2 












































    1 


















    1 






    1 


    1 


    1 




    1 







































    1 





    1 
    1 
    1 
    1 





    1 






























































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic helpers for smp ipi calls
 *
 * (C) Jens Axboe <jens.axboe@oracle.com> 2008
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/irq_work.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <linux/hypervisor.h>
#include <linux/sched/clock.h>
#include <linux/nmi.h>
#include <linux/sched/debug.h>
#include <linux/jump_label.h>

#include <trace/events/ipi.h>
#define CREATE_TRACE_POINTS
#include <trace/events/csd.h>
#undef CREATE_TRACE_POINTS

#include "smpboot.h"
#include "sched/smp.h"

/* Extract the type bits (CSD_FLAG_TYPE_MASK) from a csd's u_flags word. */
#define CSD_TYPE(_csd)        ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK)

/*
 * Per-CPU call-function state. All three members are allocated by
 * smpcfd_prepare_cpu() and released by smpcfd_dead_cpu().
 */
struct call_function_data {
        call_single_data_t        __percpu *csd;
        cpumask_var_t                cpumask;
        cpumask_var_t                cpumask_ipi;
};

static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);

/*
 * Per-CPU list of pending requests: __smp_call_single_queue() adds to it
 * and __flush_smp_call_function_queue() drains it on the owning CPU.
 */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);

/*
 * Per-CPU flag, initially 1. csd_lock_wait_toolong() tries to swap it from
 * 1 to 0 and dumps that CPU's task only when the old value was non-zero;
 * __flush_smp_call_function_queue() sets it back to 1.
 */
static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1);

/* Defined further down; declared here for smpcfd_dying_cpu(). */
static void __flush_smp_call_function_queue(bool warn_cpu_offline);

/*
 * Allocate the call_function_data members for @cpu: two zeroed cpumasks on
 * the CPU's node and a per-CPU array of call_single_data_t.
 *
 * Returns 0 on success. On any allocation failure everything allocated so
 * far is freed again and -ENOMEM is returned.
 */
int smpcfd_prepare_cpu(unsigned int cpu)
{
        struct call_function_data *cfd = &per_cpu(cfd_data, cpu);

        if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
                                     cpu_to_node(cpu)))
                goto fail;

        if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
                                     cpu_to_node(cpu)))
                goto fail_free_cpumask;

        cfd->csd = alloc_percpu(call_single_data_t);
        if (cfd->csd)
                return 0;

        /* csd allocation failed: release both masks. */
        free_cpumask_var(cfd->cpumask);
        free_cpumask_var(cfd->cpumask_ipi);
        return -ENOMEM;

fail_free_cpumask:
        free_cpumask_var(cfd->cpumask);
fail:
        return -ENOMEM;
}

/*
 * Release everything smpcfd_prepare_cpu() allocated for @cpu: both cpumasks
 * and the per-CPU csd array. Always returns 0.
 */
int smpcfd_dead_cpu(unsigned int cpu)
{
        struct call_function_data *cfd = &per_cpu(cfd_data, cpu);

        free_cpumask_var(cfd->cpumask);
        free_cpumask_var(cfd->cpumask_ipi);
        free_percpu(cfd->csd);
        return 0;
}

/*
 * Drain this CPU's pending call-function and irq_work items.
 *
 * @cpu is not used by the body: both calls below take no CPU argument.
 * The queue is flushed with warn_cpu_offline == false, so the
 * "IPI on offline CPU" warning is skipped. Always returns 0.
 */
int smpcfd_dying_cpu(unsigned int cpu)
{
        /*
         * The IPIs for the smp-call-function callbacks queued by other
         * CPUs might arrive late, either due to hardware latencies or
         * because this CPU disabled interrupts (inside stop-machine)
         * before the IPIs were sent. So flush out any pending callbacks
         * explicitly (without waiting for the IPIs to arrive), to
         * ensure that the outgoing CPU doesn't go offline with work
         * still pending.
         */
        __flush_smp_call_function_queue(false);
        irq_work_run();
        return 0;
}

/*
 * Boot-time setup: initialise the call_single_queue list head of every
 * possible CPU, then allocate the call-function data for the current CPU.
 */
void __init call_function_init(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                init_llist_head(&per_cpu(call_single_queue, cpu));
        }

        /* NOTE(review): the -ENOMEM result of this call is not checked. */
        smpcfd_prepare_cpu(smp_processor_id());
}

/*
 * Send the call-function-single IPI to @cpu, unless
 * call_function_single_prep_ipi() reports that no IPI is needed. The send
 * is traced first.
 */
static __always_inline void
send_call_function_single_ipi(int cpu)
{
        if (!call_function_single_prep_ipi(cpu))
                return;

        trace_ipi_send_cpu(cpu, _RET_IP_,
                           generic_smp_call_function_single_interrupt);
        arch_send_call_function_single_ipi(cpu);
}

/*
 * Trace and send the call-function IPI to every CPU in @mask. The trace
 * event names generic_smp_call_function_single_interrupt as the callback.
 */
static __always_inline void
send_call_function_ipi_mask(struct cpumask *mask)
{
        trace_ipi_send_cpumask(mask, _RET_IP_,
                               generic_smp_call_function_single_interrupt);
        arch_send_call_function_ipi_mask(mask);
}

/*
 * Invoke @func(@info), bracketed by the csd function entry/exit trace
 * events. @csd is only passed to the trace events; callers in this file
 * may pass NULL for it.
 */
static __always_inline void
csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd)
{
        trace_csd_function_entry(func, csd);
        func(info);
        trace_csd_function_exit(func, csd);
}

#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG

/*
 * Static key gating the CSD lock debug paths (csd_lock_record() and
 * csd_lock_wait()). Its initial state is selected by
 * CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT; csdlock_debug= can change it at boot.
 */
static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled);

/*
 * Parse the csdlock_debug= kernel boot parameter.
 *
 * A non-zero integer enables the csdlock_debug_enabled static key, zero
 * disables it. If no integer can be parsed the key is left unchanged.
 * Always returns 1.
 *
 * If you need to restore the old "ext" value that once provided
 * additional debugging information, reapply the following commits:
 *
 * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging")
 * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging")
 */
static int __init csdlock_debug(char *str)
{
        /* get_option() stores through an int pointer, so use a plain int. */
        int val = 0;

        /* A zero return from get_option() means no integer was found. */
        if (get_option(&str, &val)) {
                if (val)
                        static_branch_enable(&csdlock_debug_enabled);
                else
                        static_branch_disable(&csdlock_debug_enabled);
        }

        return 1;
}
__setup("csdlock_debug=", csdlock_debug);

/*
 * Debug snapshot of the request each CPU is currently running: written by
 * __csd_lock_record() on the owning CPU and read from other CPUs by
 * csd_lock_wait_toolong().
 */
static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
static DEFINE_PER_CPU(void *, cur_csd_info);

/* A csd_lock_timeout of 0 disables the stuck-lock reports altogether. */
static ulong csd_lock_timeout = 5000;  /* CSD lock timeout in milliseconds. */
module_param(csd_lock_timeout, ulong, 0444);
/* The BUG_ON() in csd_lock_wait_toolong() is armed only when this is > 0. */
static int panic_on_ipistall;  /* CSD panic timeout in milliseconds, 300000 for five minutes. */
module_param(panic_on_ipistall, int, 0444);

/* Counter behind the "(#N)" ids printed in the stuck-lock reports. */
static atomic_t csd_bug_count = ATOMIC_INIT(0);

/*
 * Record current CSD work for current CPU, NULL to erase.
 *
 * With a non-NULL @csd, the function and info pointers are published to
 * cur_csd_func/cur_csd_info first and cur_csd is written last, with a write
 * barrier in between; csd_lock_wait_toolong() reads cur_csd with
 * smp_load_acquire() before looking at the other two. With a NULL @csd only
 * cur_csd is cleared; cur_csd_func and cur_csd_info keep their old values.
 */
static void __csd_lock_record(call_single_data_t *csd)
{
        if (!csd) {
                smp_mb(); /* NULL cur_csd after unlock. */
                __this_cpu_write(cur_csd, NULL);
                return;
        }
        __this_cpu_write(cur_csd_func, csd->func);
        __this_cpu_write(cur_csd_info, csd->info);
        smp_wmb(); /* func and info before csd. */
        __this_cpu_write(cur_csd, csd);
        smp_mb(); /* Update cur_csd before function call. */
                  /* Or before unlock, as the case may be. */
}

/*
 * Record (or, with NULL, erase) the CSD this CPU is working on, but only
 * when CSD lock debugging is switched on.
 */
static __always_inline void csd_lock_record(call_single_data_t *csd)
{
        if (!static_branch_unlikely(&csdlock_debug_enabled))
                return;

        __csd_lock_record(csd);
}

/*
 * Return the destination CPU recorded in @csd, or -1 when the CSD type is
 * not one that is known to carry a destination.
 */
static int csd_lock_wait_getcpu(call_single_data_t *csd)
{
        switch (CSD_TYPE(csd)) {
        case CSD_TYPE_ASYNC:
        case CSD_TYPE_SYNC:
                /* Other CSD_TYPE_ values might not have ->dst. */
                return csd->node.dst;
        default:
                return -1;
        }
}

/*
 * Complain if too much time spent waiting.  Note that only
 * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
 * so waiting on other types gets much less information.
 *
 * @csd:    the CSD whose lock is being waited for
 * @ts0:    sched_clock() timestamp taken when the wait started
 * @ts1:    in/out, timestamp of the previous report (initially @ts0);
 *          updated each time a report is printed
 * @bug_id: in/out, 0 until the first report, then the id used in messages
 *
 * Returns true once CSD_FLAG_LOCK is clear (the caller stops waiting) and
 * false while the lock is still held. A csd_lock_timeout of 0 disables the
 * reports.
 */
static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
{
        int cpu = -1;
        int cpux;
        bool firsttime;
        u64 ts2, ts_delta;
        call_single_data_t *cpu_cur_csd;
        unsigned int flags = READ_ONCE(csd->node.u_flags);
        /*
         * Widen before multiplying: csd_lock_timeout is an unsigned long, so
         * where long is 32 bits the default 5000 ms would otherwise wrap
         * before being converted to nanoseconds.
         */
        unsigned long long csd_lock_timeout_ns =
                (unsigned long long)csd_lock_timeout * NSEC_PER_MSEC;

        if (!(flags & CSD_FLAG_LOCK)) {
                /* Common case: released without ever having been reported. */
                if (likely(!*bug_id))
                        return true;
                cpu = csd_lock_wait_getcpu(csd);
                pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n",
                         *bug_id, raw_smp_processor_id(), cpu);
                return true;
        }

        ts2 = sched_clock();
        /* How long since we last checked for a stuck CSD lock.*/
        ts_delta = ts2 - *ts1;
        if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
                return false;

        firsttime = !*bug_id;
        if (firsttime)
                *bug_id = atomic_inc_return(&csd_bug_count);
        cpu = csd_lock_wait_getcpu(csd);
        /* cpux is a safe index for the per-CPU reads below; cpu is reported as is. */
        if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu))
                cpux = 0;
        else
                cpux = cpu;
        cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
        /* How long since this CSD lock was stuck. */
        ts_delta = ts2 - ts0;
        pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
                 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta,
                 cpu, csd->func, csd->info);
        /*
         * If the CSD lock is still stuck after 5 minutes, it is unlikely
         * to become unstuck. Use a signed comparison to avoid triggering
         * on underflows when the TSC is out of sync between sockets.
         */
        BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC));
        if (cpu_cur_csd && csd != cpu_cur_csd) {
                pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
                         *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),
                         READ_ONCE(per_cpu(cur_csd_info, cpux)));
        } else {
                pr_alert("\tcsd: CSD lock (#%d) %s.\n",
                         *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request");
        }
        /*
         * NOTE(review): this guard only excludes negative values; a cpu at or
         * above nr_cpu_ids (already warned about above) would still be used
         * as a per-CPU index here -- confirm that cannot happen.
         */
        if (cpu >= 0) {
                /* Dump the target's task only if its backtrace flag was still set. */
                if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0))
                        dump_cpu_task(cpu);
                /* The target has no CSD recorded as in progress: resend the IPI. */
                if (!cpu_cur_csd) {
                        pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
                        arch_send_call_function_single_ipi(cpu);
                }
        }
        if (firsttime)
                dump_stack();
        *ts1 = ts2;

        return false;
}

/*
 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
 *
 * For non-synchronous ipi calls the csd can still be in use by the
 * previous function call. For multi-cpu calls it's even more interesting
 * as we'll have to ensure no other cpu is observing our csd.
 */
/*
 * Debug flavour of the CSD lock wait: spin until csd_lock_wait_toolong()
 * reports the lock as released, letting it print stall reports on the way.
 */
static void __csd_lock_wait(call_single_data_t *csd)
{
        u64 wait_start = sched_clock();
        u64 last_report = wait_start;
        int bug_id = 0;

        while (!csd_lock_wait_toolong(csd, wait_start, &last_report, &bug_id))
                cpu_relax();

        smp_acquire__after_ctrl_dep();
}

/*
 * Wait until CSD_FLAG_LOCK is clear in @csd. With CSD lock debugging
 * enabled the reporting variant is used, otherwise a plain acquire-ordered
 * conditional load.
 */
static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
        if (static_branch_unlikely(&csdlock_debug_enabled))
                __csd_lock_wait(csd);
        else
                smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
#else
/* No-op stand-in used when CONFIG_CSD_LOCK_WAIT_DEBUG is not set. */
static void csd_lock_record(call_single_data_t *csd)
{
}

/*
 * Non-debug variant: wait, with acquire ordering, until CSD_FLAG_LOCK is
 * clear in @csd's flag word.
 */
static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
        smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
#endif

/*
 * Take the CSD lock: wait until CSD_FLAG_LOCK is clear, then set it. The
 * flag is set with a plain read-modify-write, not an atomic operation.
 */
static __always_inline void csd_lock(call_single_data_t *csd)
{
        csd_lock_wait(csd);
        csd->node.u_flags |= CSD_FLAG_LOCK;

        /*
         * prevent CPU from reordering the above assignment
         * to ->flags with any subsequent assignments to other
         * fields of the specified call_single_data_t structure:
         */
        smp_wmb();
}

/*
 * Release the CSD lock. Warns if the lock was not held. The release store
 * writes 0 to the whole flag word, so the type bits are cleared together
 * with CSD_FLAG_LOCK.
 */
static __always_inline void csd_unlock(call_single_data_t *csd)
{
        WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));

        /*
         * ensure we're all done before releasing data:
         */
        smp_store_release(&csd->node.u_flags, 0);
}

/* Per-CPU csd that smp_call_function_single() uses for its !wait calls. */
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);

/*
 * Queue @node on @cpu's call_single_queue and, if llist_add() reports that
 * an IPI is needed, send the call-function-single IPI to @cpu.
 */
void __smp_call_single_queue(int cpu, struct llist_node *node)
{
        /*
         * The CSD type has to be looked at before the node is queued: once
         * it is on the list, flush_smp_call_function_queue() may clear its
         * flags even though the smp_call IPI has not been sent yet (e.g. the
         * stopper executes migration_cpu_stop() on the remote CPU).
         */
        if (trace_csd_queue_cpu_enabled()) {
                call_single_data_t *csd =
                        container_of(node, call_single_data_t, node.llist);
                smp_call_func_t func;

                /* TTWU entries are traced as sched_ttwu_pending. */
                if (CSD_TYPE(csd) == CSD_TYPE_TTWU)
                        func = sched_ttwu_pending;
                else
                        func = csd->func;

                trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
        }

        /*
         * Visibility of the new entry to the target CPU, when its IPI
         * handler pulls the list, relies on the normal cache coherency
         * rules implied by the underlying llist operations.
         *
         * An architecture whose IPIs can overtake the cache coherency
         * protocol must add enough synchronisation in its arch code to make
         * IPIs appear to obey cache coherency with respect to locking and
         * barrier primitives; generic code is not equipped to do that.
         */
        if (!llist_add(node, &per_cpu(call_single_queue, cpu)))
                return;

        send_call_function_single_ipi(cpu);
}

/*
 * Insert a previously allocated call_single_data_t element
 * for execution on the given CPU. data must already have
 * ->func, ->info, and ->flags set.
 */
static int generic_exec_single(int cpu, call_single_data_t *csd)
{
        smp_call_func_t func;
        unsigned long flags;
        void *info;

        if (cpu != smp_processor_id()) {
                /* Remote target: it must be a valid, online CPU. */
                if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
                        csd_unlock(csd);
                        return -ENXIO;
                }

                __smp_call_single_queue(cpu, &csd->node.llist);
                return 0;
        }

        /*
         * Local target: run the function right here with interrupts off.
         * Unlocking before the call is fine even for the synchronous
         * on-stack case, because everything happens on this same CPU.
         * func and info are read out of the csd before it is unlocked.
         */
        func = csd->func;
        info = csd->info;

        csd_lock_record(csd);
        csd_unlock(csd);
        local_irq_save(flags);
        csd_do_func(func, info, NULL);
        csd_lock_record(NULL);
        local_irq_restore(flags);

        return 0;
}

/**
 * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
 *
 * Invoked by arch to handle an IPI for call function single.
 * Must be called with interrupts disabled.
 */
void generic_smp_call_function_single_interrupt(void)
{
        /* true: warn if callbacks turn out to be queued on an offline CPU. */
        __flush_smp_call_function_queue(true);
}

/**
 * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks
 *
 * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
 *                      offline CPU. Skip this check if set to 'false'.
 *
 * Flush any pending smp-call-function callbacks queued on this CPU. This is
 * invoked by the generic IPI handler, as well as by a CPU about to go offline,
 * to ensure that all pending IPI callbacks are run before it goes completely
 * offline.
 *
 * Loop through the call_single_queue and run all the queued callbacks.
 * Must be called with interrupts disabled.
 */
static void __flush_smp_call_function_queue(bool warn_cpu_offline)
{
        call_single_data_t *csd, *csd_next;
        struct llist_node *entry, *prev;
        struct llist_head *head;
        /* One-shot latch: the offline-CPU warning below fires at most once. */
        static bool warned;
        atomic_t *tbt;

        lockdep_assert_irqs_disabled();

        /* Allow waiters to send backtrace NMI from here onwards */
        tbt = this_cpu_ptr(&trigger_backtrace);
        atomic_set_release(tbt, 1);

        /*
         * Detach the whole pending list in one operation and reverse it.
         * NOTE(review): the reversal presumably restores queueing order,
         * assuming llist_add() pushes new nodes at the head -- confirm.
         */
        head = this_cpu_ptr(&call_single_queue);
        entry = llist_del_all(head);
        entry = llist_reverse_order(entry);

        /* There shouldn't be any pending callbacks on an offline CPU. */
        if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
                     !warned && entry != NULL)) {
                warned = true;
                WARN(1, "IPI on offline CPU %d\n", smp_processor_id());

                /*
                 * We don't have to use the _safe() variant here
                 * because we are not invoking the IPI handlers yet.
                 */
                llist_for_each_entry(csd, entry, node.llist) {
                        switch (CSD_TYPE(csd)) {
                        case CSD_TYPE_ASYNC:
                        case CSD_TYPE_SYNC:
                        case CSD_TYPE_IRQ_WORK:
                                pr_warn("IPI callback %pS sent to offline CPU\n",
                                        csd->func);
                                break;

                        case CSD_TYPE_TTWU:
                                pr_warn("IPI task-wakeup sent to offline CPU\n");
                                break;

                        default:
                                pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
                                        CSD_TYPE(csd));
                                break;
                        }
                }
        }

        /*
         * First; run all SYNC callbacks, people are waiting for us.
         */
        prev = NULL;
        llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
                /* Do we wait until *after* callback? */
                if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
                        smp_call_func_t func = csd->func;
                        void *info = csd->info;

                        /*
                         * Unlink this entry: the last kept node (or the list
                         * start, if none was kept yet) now points past it.
                         */
                        if (prev) {
                                prev->next = &csd_next->node.llist;
                        } else {
                                entry = &csd_next->node.llist;
                        }

                        /* SYNC: the csd is unlocked only after func has run. */
                        csd_lock_record(csd);
                        csd_do_func(func, info, csd);
                        csd_unlock(csd);
                        csd_lock_record(NULL);
                } else {
                        /* Not SYNC: keep it on the list for the later passes. */
                        prev = &csd->node.llist;
                }
        }

        if (!entry)
                return;

        /*
         * Second; run all !SYNC callbacks.
         */
        prev = NULL;
        llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
                int type = CSD_TYPE(csd);

                if (type != CSD_TYPE_TTWU) {
                        /* Unlink, exactly as in the first pass. */
                        if (prev) {
                                prev->next = &csd_next->node.llist;
                        } else {
                                entry = &csd_next->node.llist;
                        }

                        if (type == CSD_TYPE_ASYNC) {
                                smp_call_func_t func = csd->func;
                                void *info = csd->info;

                                /*
                                 * ASYNC: the csd is unlocked before func runs,
                                 * so func and info were copied out above.
                                 */
                                csd_lock_record(csd);
                                csd_unlock(csd);
                                csd_do_func(func, info, csd);
                                csd_lock_record(NULL);
                        } else if (type == CSD_TYPE_IRQ_WORK) {
                                irq_work_single(csd);
                        }

                } else {
                        /* TTWU entries stay on the list for the third step. */
                        prev = &csd->node.llist;
                }
        }

        /*
         * Third; only CSD_TYPE_TTWU is left, issue those.
         */
        if (entry) {
                /*
                 * One call covers all of them: the head of the remaining list
                 * is the argument; csd (the first entry) is only for tracing.
                 */
                csd = llist_entry(entry, typeof(*csd), node.llist);
                csd_do_func(sched_ttwu_pending, entry, csd);
        }
}


/**
 * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
 *                                   from task context (idle, migration thread)
 *
 * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it
 * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by
 * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to
 * handle queued SMP function calls before scheduling.
 *
 * The migration thread has to ensure that an eventually pending wakeup has
 * been handled before it migrates a task.
 */
void flush_smp_call_function_queue(void)
{
        unsigned int pending_before;
        unsigned long irqflags;

        /* Nothing queued for this CPU: skip the interrupt-off section. */
        if (llist_empty(this_cpu_ptr(&call_single_queue)))
                return;

        local_irq_save(irqflags);

        /* Get the already pending soft interrupts for RT enabled kernels */
        pending_before = local_softirq_pending();

        __flush_smp_call_function_queue(true);

        /* Softirqs pending after the flush are handed to the post-flush hook. */
        if (local_softirq_pending())
                do_softirq_post_smp_call_flush(pending_before);

        local_irq_restore(irqflags);
}

/*
 * smp_call_function_single - Run a function on a specific CPU
 * @cpu: The CPU to run the function on.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait until function has completed on other CPUs.
 *
 * Returns 0 on success, else a negative status code.
 */
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
                             int wait)
{
        call_single_data_t csd_stack = {
                .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, },
        };
        call_single_data_t *csd;
        int this_cpu, err;

        /*
         * Disable preemption so that we are neither rescheduled onto
         * another processor nor have this CPU removed under us.
         */
        this_cpu = get_cpu();

        /*
         * Calling this with interrupts disabled can deadlock. CPUs that are
         * not yet online are exempt: nobody else can send them an smp call
         * function interrupt, so the deadlock cannot occur.
         */
        WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
                     && !oops_in_progress);

        /*
         * Outside task context this can deadlock either way: with @wait, by
         * interrupting between llist_add() and
         * arch_send_call_function_ipi*(); without @wait, in csd_lock(),
         * because the interrupt context uses the same csd storage.
         */
        WARN_ON_ONCE(!in_task());

        if (wait) {
                /* Synchronous call: the pre-locked on-stack csd is enough. */
                csd = &csd_stack;
        } else {
                /* Asynchronous call: use, and lock, this CPU's shared csd. */
                csd = this_cpu_ptr(&csd_data);
                csd_lock(csd);
        }

        csd->func = func;
        csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
        csd->node.src = smp_processor_id();
        csd->node.dst = cpu;
#endif

        err = generic_exec_single(cpu, csd);

        /* The on-stack csd must not go out of scope while still locked. */
        if (wait)
                csd_lock_wait(csd);

        put_cpu();

        return err;
}
EXPORT_SYMBOL(smp_call_function_single);

/**
 * smp_call_function_single_async() - Run an asynchronous function on a
 *                                  specific CPU.
 * @cpu: The CPU to run on.
 * @csd: Pre-allocated and setup data structure
 *
 * Like smp_call_function_single(), but the call is asynchronous and
 * can thus be done from contexts with disabled interrupts.
 *
 * The caller supplies its own pre-allocated @csd (for instance embedded
 * in a larger object) and is responsible for synchronizing it so that
 * the IPIs performed on the @csd are strictly serialized.
 *
 * If @csd is still locked, i.e. it has not yet been processed after a
 * previous call to smp_call_function_single_async(), the function
 * returns immediately with -EBUSY to show that the csd object is still
 * in progress.
 *
 * NOTE: Be careful, there is unfortunately no current debugging facility to
 * validate the correctness of this serialization.
 *
 * Return: %0 on success or negative errno value on error
 */
int smp_call_function_single_async(int cpu, call_single_data_t *csd)
{
        int err;

        preempt_disable();

        if (csd->node.u_flags & CSD_FLAG_LOCK) {
                /* A previous request on this csd is still pending. */
                err = -EBUSY;
        } else {
                /* Take the csd lock before the csd is handed out. */
                csd->node.u_flags = CSD_FLAG_LOCK;
                smp_wmb();

                err = generic_exec_single(cpu, csd);
        }

        preempt_enable();

        return err;
}
EXPORT_SYMBOL_GPL(smp_call_function_single_async);

/*
 * smp_call_function_any - Run a function on any of the given cpus
 * @mask: The mask of cpus it can run on.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait until function has completed.
 *
 * Returns 0 on success, else a negative status code (if no cpus were online).
 *
 * The target CPU is picked in this order of preference:
 *        1) the current cpu, if it is in @mask
 *        2) an online cpu of the current node that is in @mask
 *        3) any other online cpu in @mask
 */
int smp_call_function_any(const struct cpumask *mask,
                          smp_call_func_t func, void *info, int wait)
{
        unsigned int target = get_cpu();
        int ret;

        /* The local CPU is the cheapest choice; search only if it is excluded. */
        if (!cpumask_test_cpu(target, mask)) {
                const struct cpumask *node_cpus =
                        cpumask_of_node(cpu_to_node(target));

                /* Walk the CPUs of this node that are in @mask until one is online. */
                target = cpumask_first_and(node_cpus, mask);
                while (target < nr_cpu_ids && !cpu_online(target))
                        target = cpumask_next_and(target, node_cpus, mask);

                /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
                if (target >= nr_cpu_ids)
                        target = cpumask_any_and(mask, cpu_online_mask);
        }

        ret = smp_call_function_single(target, func, info, wait);
        put_cpu();
        return ret;
}
EXPORT_SYMBOL_GPL(smp_call_function_any);

/*
 * Flags to be used as scf_flags argument of smp_call_function_many_cond().
 * They are independent bits and may be OR-ed together.
 *
 * %SCF_WAIT:      Wait until function execution is completed
 * %SCF_RUN_LOCAL: Run also locally if local cpu is set in cpumask
 */
#define SCF_WAIT        (1U << 0)
#define SCF_RUN_LOCAL        (1U << 1)

/*
 * smp_call_function_many_cond - common worker for the "call on many CPUs"
 * helpers.
 * @mask: candidate CPUs; only the online subset is considered.
 * @func: function to run.
 * @info: argument passed to @func (and to @cond_func).
 * @scf_flags: combination of %SCF_WAIT and %SCF_RUN_LOCAL.
 * @cond_func: optional per-CPU filter; a CPU is skipped when it returns false.
 *
 * Queues a csd for every selected remote CPU, sends the IPI(s), optionally
 * runs @func locally, and with %SCF_WAIT waits for all remote csds to be
 * released. Must be called with preemption disabled.
 */
static void smp_call_function_many_cond(const struct cpumask *mask,
                                        smp_call_func_t func, void *info,
                                        unsigned int scf_flags,
                                        smp_cond_func_t cond_func)
{
        int cpu, last_cpu, this_cpu = smp_processor_id();
        struct call_function_data *cfd;
        bool wait = scf_flags & SCF_WAIT;
        int nr_cpus = 0;
        bool run_remote = false;
        bool run_local = false;

        lockdep_assert_preemption_disabled();

        /*
         * Can deadlock when called with interrupts disabled.
         * We allow cpu's that are not yet online though, as no one else can
         * send smp call function interrupt to this cpu and as such deadlocks
         * can't happen.
         */
        if (cpu_online(this_cpu) && !oops_in_progress &&
            !early_boot_irqs_disabled)
                lockdep_assert_irqs_enabled();

        /*
         * When @wait we can deadlock when we interrupt between llist_add() and
         * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
         * csd_lock() because the interrupt context uses the same csd
         * storage.
         */
        WARN_ON_ONCE(!in_task());

        /* Check if we need local execution. */
        if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask))
                run_local = true;

        /* Check if we need remote execution, i.e., any CPU excluding this one. */
        cpu = cpumask_first_and(mask, cpu_online_mask);
        if (cpu == this_cpu)
                cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
        if (cpu < nr_cpu_ids)
                run_remote = true;

        if (run_remote) {
                /* cfd->cpumask: online CPUs from @mask, minus the local one. */
                cfd = this_cpu_ptr(&cfd_data);
                cpumask_and(cfd->cpumask, mask, cpu_online_mask);
                __cpumask_clear_cpu(this_cpu, cfd->cpumask);

                /* cfd->cpumask_ipi collects the CPUs that need an IPI. */
                cpumask_clear(cfd->cpumask_ipi);
                for_each_cpu(cpu, cfd->cpumask) {
                        call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);

                        /* Filtered out: drop it so the wait loop skips it too. */
                        if (cond_func && !cond_func(cpu, info)) {
                                __cpumask_clear_cpu(cpu, cfd->cpumask);
                                continue;
                        }

                        csd_lock(csd);
                        if (wait)
                                csd->node.u_flags |= CSD_TYPE_SYNC;
                        csd->func = func;
                        csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
                        csd->node.src = smp_processor_id();
                        csd->node.dst = cpu;
#endif
                        trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);

                        /*
                         * llist_add() reports whether the target queue was
                         * empty before this entry; only then is the CPU
                         * recorded as needing an IPI.
                         */
                        if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
                                __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
                                nr_cpus++;
                                last_cpu = cpu;
                        }
                }

                /*
                 * Choose the most efficient way to send an IPI. Note that the
                 * number of CPUs might be zero due to concurrent changes to the
                 * provided mask.
                 *
                 * last_cpu is only read when nr_cpus == 1, in which case the
                 * loop above has assigned it.
                 */
                if (nr_cpus == 1)
                        send_call_function_single_ipi(last_cpu);
                else if (likely(nr_cpus > 1))
                        send_call_function_ipi_mask(cfd->cpumask_ipi);
        }

        /* Local execution happens with interrupts disabled around @func. */
        if (run_local && (!cond_func || cond_func(this_cpu, info))) {
                unsigned long flags;

                local_irq_save(flags);
                csd_do_func(func, info, NULL);
                local_irq_restore(flags);
        }

        /* With SCF_WAIT, wait on the csd of every remote CPU that was queued. */
        if (run_remote && wait) {
                for_each_cpu(cpu, cfd->cpumask) {
                        call_single_data_t *csd;

                        csd = per_cpu_ptr(cfd->csd, cpu);
                        csd_lock_wait(csd);
                }
        }
}

/**
 * smp_call_function_many(): Run a function on a set of CPUs.
 * @mask: The set of cpus to run on (only runs on online subset).
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait (atomically) until function has completed on
 *        other CPUs.
 *
 * The function is never run on the calling CPU by this helper, even if
 * the calling CPU is set in @mask.
 *
 * If @wait is true, then returns once @func has returned.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler. Preemption
 * must be disabled when calling this function.
 */
void smp_call_function_many(const struct cpumask *mask,
                            smp_call_func_t func, void *info, bool wait)
{
        /* Translate the boolean into the flag word; SCF_RUN_LOCAL is never set. */
        unsigned int scf_flags = wait ? SCF_WAIT : 0;

        smp_call_function_many_cond(mask, func, info, scf_flags, NULL);
}
EXPORT_SYMBOL(smp_call_function_many);

/**
 * smp_call_function(): Run a function on all other CPUs.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait (atomically) until function has completed
 *        on other CPUs.
 *
 * This function does not return a value.
 *
 * If @wait is true, then returns once @func has returned; otherwise
 * it may return before the target cpus have called @func.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler.
 */
void smp_call_function(smp_call_func_t func, void *info, int wait)
{
        /* smp_call_function_many() must be called with preemption disabled. */
        preempt_disable();
        smp_call_function_many(cpu_online_mask, func, info, wait);
        preempt_enable();
}
EXPORT_SYMBOL(smp_call_function);

/*
 * Configured maximum number of CPUs to activate. Defaults to NR_CPUS and is
 * overridden by the "nosmp" and "maxcpus=" early parameters; smp_init()
 * passes it to bringup_nonboot_cpus() and smp_cpus_done().
 */
unsigned int setup_max_cpus = NR_CPUS;
EXPORT_SYMBOL(setup_max_cpus);


/*
 * Setup routine for controlling SMP activation
 *
 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
 * activation entirely (the MPS table probe still happens, though).
 *
 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
 * greater than 0, limits the maximum number of CPUs activated in
 * SMP mode to <NUM>.
 */

/* Weak no-op default; called when "nosmp" or "maxcpus=0" is given. */
void __weak __init arch_disable_smp_support(void) { }

/*
 * Handler for the "nosmp" early parameter: activate no additional CPUs and
 * let the architecture disable its SMP support. @str is not used.
 */
static int __init nosmp(char *str)
{
        setup_max_cpus = 0;
        arch_disable_smp_support();

        return 0;
}

early_param("nosmp", nosmp);

/*
 * Handler for the "nr_cpus=" early parameter. This is a hard limit: it can
 * only lower nr_cpu_ids. Values that do not parse, are not positive, or
 * would not reduce nr_cpu_ids are silently ignored.
 */
static int __init nrcpus(char *str)
{
        int nr_cpus;

        if (!get_option(&str, &nr_cpus))
                return 0;

        if (nr_cpus <= 0 || nr_cpus >= nr_cpu_ids)
                return 0;

        set_nr_cpu_ids(nr_cpus);

        return 0;
}

early_param("nr_cpus", nrcpus);

/*
 * Handler for the "maxcpus=" early parameter: parse the limit into
 * setup_max_cpus; a value of 0 additionally disables arch SMP support.
 *
 * NOTE(review): the return value of get_option() is ignored, so a string
 * without a number presumably leaves setup_max_cpus unchanged - confirm.
 */
static int __init maxcpus(char *str)
{
        get_option(&str, &setup_max_cpus);
        if (setup_max_cpus == 0)
                arch_disable_smp_support();

        return 0;
}

early_param("maxcpus", maxcpus);

#if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS)
/*
 * Number of possible processor ids. Starts at NR_CPUS; only defined as a
 * variable when NR_CPUS > 1 and CONFIG_FORCE_NR_CPUS is not set.
 */
unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
#endif

/*
 * Derive nr_cpu_ids from the highest bit set in cpu_possible_mask.
 * An arch may set nr_cpu_ids earlier if needed, so this would be redundant.
 */
void __init setup_nr_cpu_ids(void)
{
        unsigned long highest_possible;

        highest_possible = find_last_bit(cpumask_bits(cpu_possible_mask),
                                         NR_CPUS);
        set_nr_cpu_ids(highest_possible + 1);
}

/*
 * Called by boot processor to activate the rest: sets up the idle and
 * hotplug threads, brings up the non-boot CPUs (limited by setup_max_cpus)
 * and logs how many nodes and CPUs ended up online.
 */
void __init smp_init(void)
{
        const char *node_plural, *cpu_plural;
        int num_nodes, num_cpus;

        idle_threads_init();
        cpuhp_threads_init();

        pr_info("Bringing up secondary CPUs ...\n");

        bringup_nonboot_cpus(setup_max_cpus);

        num_nodes = num_online_nodes();
        num_cpus  = num_online_cpus();

        /* Append a plural "s" only when more than one is online. */
        node_plural = num_nodes > 1 ? "s" : "";
        cpu_plural  = num_cpus  > 1 ? "s" : "";

        pr_info("Brought up %d node%s, %d CPU%s\n",
                num_nodes, node_plural, num_cpus, cpu_plural);

        /* Any cleanup work */
        smp_cpus_done(setup_max_cpus);
}

/*
 * on_each_cpu_cond_mask(): Call a function on each processor in @mask for
 * which the supplied function cond_func returns true, optionally waiting
 * for all the required CPUs to finish. This may include the local
 * processor.
 * @cond_func:        A callback function that is passed a cpu id and
 *                the info parameter. The function is called
 *                with preemption disabled. The function should
 *                return a boolean value indicating whether to IPI
 *                the specified CPU.
 * @func:        The function to run on all applicable CPUs.
 *                This must be fast and non-blocking.
 * @info:        An arbitrary pointer to pass to both functions.
 * @wait:        If true, wait (atomically) until function has
 *                completed on other CPUs.
 * @mask:        The set of CPUs to consider.
 *
 * Preemption is disabled to protect against CPUs going offline but not online.
 * CPUs going online during the call will not be seen or sent an IPI.
 *
 * You must not call this function with disabled interrupts or
 * from a hardware interrupt handler or from a bottom half handler.
 */
void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
                           void *info, bool wait, const struct cpumask *mask)
{
        /* Always allow local execution; waiting is optional. */
        unsigned int scf_flags = wait ? (SCF_RUN_LOCAL | SCF_WAIT)
                                      : SCF_RUN_LOCAL;

        preempt_disable();
        smp_call_function_many_cond(mask, func, info, scf_flags, cond_func);
        preempt_enable();
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);

/* Empty callback; kick_all_cpus_sync() uses it purely to make CPUs run an IPI. */
static void do_nothing(void *unused)
{
}

/**
 * kick_all_cpus_sync - Force all cpus out of idle
 *
 * Used to synchronize the update of pm_idle function pointer. It's
 * called after the pointer is updated and returns after the dummy
 * callback function has been executed on all cpus. The execution of
 * the function can only happen on the remote cpus after they have
 * left the idle function which had been called via pm_idle function
 * pointer. So it's guaranteed that nothing uses the previous pointer
 * anymore.
 */
void kick_all_cpus_sync(void)
{
        /* Make sure the change is visible before we kick the cpus */
        smp_mb();
        /* wait == 1: return only after do_nothing() ran on all other CPUs. */
        smp_call_function(do_nothing, NULL, 1);
}
EXPORT_SYMBOL_GPL(kick_all_cpus_sync);

/**
 * wake_up_all_idle_cpus - break all cpus out of idle
 *
 * Calls wake_up_if_idle() for every online CPU other than the calling
 * one. The intent is to break every idle CPU, including idle-polling
 * ones, out of idle while doing nothing for non-idle CPUs.
 */
void wake_up_all_idle_cpus(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                /* Keep smp_processor_id() stable for the comparison below. */
                preempt_disable();
                if (cpu != smp_processor_id()) {
                        if (cpu_online(cpu))
                                wake_up_if_idle(cpu);
                }
                preempt_enable();
        }
}
EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);

/**
 * struct smp_call_on_cpu_struct - Call a function on a specific CPU
 * @work: &work_struct queued on the target CPU
 * @done: &completion to signal once @func has returned
 * @func: function to call
 * @data: function's data argument
 * @ret: return value from @func
 * @cpu: CPU to pin the vcpu to while @func runs, or %-1 for no pinning
 *
 * Used to call a function on a specific cpu and wait for it to return.
 * Optionally make sure the call is done on a specified physical cpu via vcpu
 * pinning in order to support virtualized environments.
 */
struct smp_call_on_cpu_struct {
        struct work_struct        work;
        struct completion        done;
        int                        (*func)(void *);
        void                        *data;
        int                        ret;
        int                        cpu;
};

static void smp_call_on_cpu_callback(struct work_struct *work)
{
        struct smp_call_on_cpu_struct *sscs;

        sscs = container_of(work, struct smp_call_on_cpu_struct, work);
        if (sscs->cpu >= 0)
                hypervisor_pin_vcpu(sscs->cpu);
        sscs->ret = sscs->func(sscs->data);
        if (sscs->cpu >= 0)
                hypervisor_pin_vcpu(-1);

        complete(&sscs->done);
}

/**
 * smp_call_on_cpu - Call a function on a specific CPU and wait for it
 * @cpu: target CPU; must be below nr_cpu_ids and online
 * @func: function to call; its return value is handed back to the caller
 * @par: argument passed to @func
 * @phys: if true, have the vcpu pinned to @cpu while @func runs
 *
 * Queues an on-stack work item on @cpu via system_wq and waits for the
 * callback to complete.
 *
 * Return: -ENXIO if @cpu is out of range or not online, otherwise the
 * value returned by @func.
 */
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
{
        struct smp_call_on_cpu_struct sscs = {
                .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done),
                .func = func,
                .data = par,
                .cpu  = phys ? cpu : -1,
        };

        /*
         * Validate the target before initializing the on-stack work so that
         * the error path does not leave an initialized work item behind.
         */
        if (cpu >= nr_cpu_ids || !cpu_online(cpu))
                return -ENXIO;

        INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback);

        queue_work_on(cpu, system_wq, &sscs.work);
        wait_for_completion(&sscs.done);

        /* Pairs with INIT_WORK_ONSTACK(); the work lives on this stack frame. */
        destroy_work_on_stack(&sscs.work);

        return sscs.ret;
}
EXPORT_SYMBOL_GPL(smp_call_on_cpu);

































































    1 










    1 









    1 









    1 































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "%s: " fmt, __func__

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/percpu-refcount.h>

/*
 * A percpu refcount starts out as just a set of percpu counters. Initially, we
 * don't try to detect the ref hitting 0 - which means that get/put can just
 * increment or decrement the local counter. Note that the counter on a
 * particular cpu can (and will) wrap - this is fine, when we go to shutdown the
 * percpu counters will all sum to the correct value
 *
 * (More precisely: because modular arithmetic is commutative the sum of all the
 * percpu_count vars will be equal to what it would have been if all the gets
 * and puts were done to a single integer, even if some of the percpu integers
 * overflow or underflow).
 *
 * The real trick to implementing percpu refcounts is shutdown. We can't detect
 * the ref hitting 0 on every put - this would require global synchronization
 * and defeat the whole purpose of using percpu refs.
 *
 * What we do is require the user to keep track of the initial refcount; we know
 * the ref can't hit 0 before the user drops the initial ref, so as long as we
 * convert to non percpu mode before the initial ref is dropped everything
 * works.
 *
 * Converting to non percpu mode is done with some RCUish stuff in
 * percpu_ref_kill. Additionally, we need a bias value so that the
 * atomic_long_t can't hit 0 before we've added up all the percpu refs.
 */

/* Bias added to the atomic count while in percpu mode: the top bit of a long. */
#define PERCPU_COUNT_BIAS        (1LU << (BITS_PER_LONG - 1))

/* Global lock taken around every mode switch, kill and exit of a percpu_ref. */
static DEFINE_SPINLOCK(percpu_ref_switch_lock);
/* Woken once ->confirm_switch has been cleared after a switch to atomic. */
static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq);

/*
 * Return the percpu counter pointer stored in @ref->percpu_count_ptr with
 * the __PERCPU_REF_ATOMIC_DEAD flag bits masked off.
 */
static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
{
        unsigned long addr = ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD;

        return (unsigned long __percpu *)addr;
}

/**
 * percpu_ref_init - initialize a percpu refcount
 * @ref: percpu_ref to initialize
 * @release: function which will be called when refcount hits 0
 * @flags: PERCPU_REF_INIT_* flags
 * @gfp: allocation mask to use
 *
 * Initializes @ref.  @ref starts out in percpu mode with a refcount of 1 unless
 * @flags contains PERCPU_REF_INIT_ATOMIC or PERCPU_REF_INIT_DEAD.  These flags
 * change the start state to atomic with the latter setting the initial refcount
 * to 0.  See the definitions of PERCPU_REF_INIT_* flags for flag behaviors.
 *
 * Note that @release must not sleep - it may potentially be called from RCU
 * callback context by percpu_ref_kill().
 */
int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
                    unsigned int flags, gfp_t gfp)
{
        /*
         * Flag bits are OR-ed into ->percpu_count_ptr below, so the percpu
         * allocation is aligned to at least 1 << __PERCPU_REF_FLAG_BITS.
         */
        size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS,
                             __alignof__(unsigned long));
        unsigned long start_count = 0;
        struct percpu_ref_data *data;

        ref->percpu_count_ptr = (unsigned long)
                __alloc_percpu_gfp(sizeof(unsigned long), align, gfp);
        if (!ref->percpu_count_ptr)
                return -ENOMEM;

        data = kzalloc(sizeof(*ref->data), gfp);
        if (!data) {
                /* Undo the percpu allocation and leave @ref without counters. */
                free_percpu((void __percpu *)ref->percpu_count_ptr);
                ref->percpu_count_ptr = 0;
                return -ENOMEM;
        }

        data->force_atomic = flags & PERCPU_REF_INIT_ATOMIC;
        data->allow_reinit = flags & PERCPU_REF_ALLOW_REINIT;

        if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) {
                /* Start in atomic mode; reinit is implicitly allowed. */
                ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
                data->allow_reinit = true;
        } else {
                /* Percpu mode: the atomic count carries the bias. */
                start_count += PERCPU_COUNT_BIAS;
        }

        if (flags & PERCPU_REF_INIT_DEAD)
                ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
        else
                start_count++;        /* the initial reference */

        atomic_long_set(&data->count, start_count);

        data->release = release;
        data->confirm_switch = NULL;
        data->ref = ref;
        ref->data = data;
        return 0;
}
EXPORT_SYMBOL_GPL(percpu_ref_init);

static void __percpu_ref_exit(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count = percpu_count_ptr(ref);

        if (percpu_count) {
                /* non-NULL confirm_switch indicates switching in progress */
                WARN_ON_ONCE(ref->data && ref->data->confirm_switch);
                free_percpu(percpu_count);
                ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD;
        }
}

/**
 * percpu_ref_exit - undo percpu_ref_init()
 * @ref: percpu_ref to exit
 *
 * This function exits @ref.  The caller is responsible for ensuring that
 * @ref is no longer in active use.  The usual places to invoke this
 * function from are the @ref->release() callback or in init failure path
 * where percpu_ref_init() succeeded but other parts of the initialization
 * of the embedding object failed.
 */
void percpu_ref_exit(struct percpu_ref *ref)
{
        struct percpu_ref_data *data = ref->data;
        unsigned long flags;

        /* Release the percpu counters first. */
        __percpu_ref_exit(ref);

        /* Nothing more to free if @ref has no data attached. */
        if (!data)
                return;

        spin_lock_irqsave(&percpu_ref_switch_lock, flags);
        /*
         * Store the current atomic count in ->percpu_count_ptr, above the
         * flag bits, before ->data is detached.
         */
        ref->percpu_count_ptr |= atomic_long_read(&ref->data->count) <<
                __PERCPU_REF_FLAG_BITS;
        ref->data = NULL;
        spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);

        kfree(data);
}
EXPORT_SYMBOL_GPL(percpu_ref_exit);

/*
 * Final step of a switch to atomic mode: invoke the confirmation callback,
 * mark the switch as finished, wake waiters, and drop the reference taken
 * when the switch was started.
 */
static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
{
        struct percpu_ref_data *data = container_of(rcu,
                        struct percpu_ref_data, rcu);
        struct percpu_ref *ref = data->ref;

        data->confirm_switch(ref);
        /* NULL ->confirm_switch means no switch is in progress any more. */
        data->confirm_switch = NULL;
        wake_up_all(&percpu_ref_switch_waitq);

        /* Without reinit the percpu counters are freed right away. */
        if (!data->allow_reinit)
                __percpu_ref_exit(ref);

        /* drop ref from percpu_ref_switch_to_atomic() */
        percpu_ref_put(ref);
}

/*
 * RCU callback queued by __percpu_ref_switch_to_atomic(): fold the sum of
 * all percpu counters into the atomic count, remove the bias, warn on an
 * underflow, and then send out the switch confirmation.
 */
static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu)
{
        struct percpu_ref_data *data = container_of(rcu,
                        struct percpu_ref_data, rcu);
        struct percpu_ref *ref = data->ref;
        unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
        /* Shared across all refs; limits the extra underflow dumps below. */
        static atomic_t underflows;
        unsigned long count = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                count += *per_cpu_ptr(percpu_count, cpu);

        pr_debug("global %lu percpu %lu\n",
                 atomic_long_read(&data->count), count);

        /*
         * It's crucial that we sum the percpu counters _before_ adding the sum
         * to &ref->count; since gets could be happening on one cpu while puts
         * happen on another, adding a single cpu's count could cause
         * @ref->count to hit 0 before we've got a consistent value - but the
         * sum of all the counts will be consistent and correct.
         *
         * Subtracting the bias value then has to happen _after_ adding count to
         * &ref->count; we need the bias value to prevent &ref->count from
         * reaching 0 before we add the percpu counts. But doing it at the same
         * time is equivalent and saves us atomic operations:
         */
        atomic_long_add((long)count - PERCPU_COUNT_BIAS, &data->count);

        /* Warn once; dump the object only for the first few underflows. */
        if (WARN_ONCE(atomic_long_read(&data->count) <= 0,
                      "percpu ref (%ps) <= 0 (%ld) after switching to atomic",
                      data->release, atomic_long_read(&data->count)) &&
            atomic_inc_return(&underflows) < 4) {
                pr_err("%s(): percpu_ref underflow", __func__);
                mem_dump_obj(data);
        }

        /* @ref is viewed as dead on all CPUs, send out switch confirmation */
        percpu_ref_call_confirm_rcu(rcu);
}

/*
 * Placeholder confirmation callback, installed when the caller passes none
 * so that ->confirm_switch is still non-NULL while a switch is in progress.
 */
static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref)
{
}

/*
 * Start switching @ref to atomic mode. If it is already atomic, only the
 * optional @confirm_switch callback is invoked, synchronously. Otherwise
 * the ATOMIC flag is set and the actual counter folding is deferred to
 * percpu_ref_switch_to_atomic_rcu().
 */
static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref,
                                          percpu_ref_func_t *confirm_switch)
{
        if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) {
                if (confirm_switch)
                        confirm_switch(ref);
                return;
        }

        /* switching from percpu to atomic */
        ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;

        /*
         * Non-NULL ->confirm_switch is used to indicate that switching is
         * in progress.  Use noop one if unspecified.
         */
        ref->data->confirm_switch = confirm_switch ?:
                percpu_ref_noop_confirm_switch;

        percpu_ref_get(ref);        /* put after confirmation */
        call_rcu_hurry(&ref->data->rcu,
                       percpu_ref_switch_to_atomic_rcu);
}

/*
 * Switch @ref back to percpu mode: re-add the bias to the atomic count,
 * zero all percpu counters and clear the ATOMIC flag. Does nothing if @ref
 * is already in percpu mode; warns and does nothing if reinit is not allowed.
 */
static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
        int cpu;

        /* The percpu counters must still exist (not freed by exit). */
        BUG_ON(!percpu_count);

        if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
                return;

        if (WARN_ON_ONCE(!ref->data->allow_reinit))
                return;

        atomic_long_add(PERCPU_COUNT_BIAS, &ref->data->count);

        /*
         * Restore per-cpu operation.  smp_store_release() is paired
         * with READ_ONCE() in __ref_is_percpu() and guarantees that the
         * zeroing is visible to all percpu accesses which can see the
         * following __PERCPU_REF_ATOMIC clearing.
         */
        for_each_possible_cpu(cpu)
                *per_cpu_ptr(percpu_count, cpu) = 0;

        smp_store_release(&ref->percpu_count_ptr,
                          ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
}

/*
 * Bring @ref into the mode its current state asks for: atomic when
 * ->force_atomic is set or @ref is dying, percpu otherwise. Must be called
 * with percpu_ref_switch_lock held; the lock is passed to
 * wait_event_lock_irq() while waiting for a previous switch to finish.
 */
static void __percpu_ref_switch_mode(struct percpu_ref *ref,
                                     percpu_ref_func_t *confirm_switch)
{
        struct percpu_ref_data *data = ref->data;

        lockdep_assert_held(&percpu_ref_switch_lock);

        /*
         * If the previous ATOMIC switching hasn't finished yet, wait for
         * its completion.  If the caller ensures that ATOMIC switching
         * isn't in progress, this function can be called from any context.
         */
        wait_event_lock_irq(percpu_ref_switch_waitq, !data->confirm_switch,
                            percpu_ref_switch_lock);

        if (data->force_atomic || percpu_ref_is_dying(ref))
                __percpu_ref_switch_to_atomic(ref, confirm_switch);
        else
                __percpu_ref_switch_to_percpu(ref);
}

/**
 * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode
 * @ref: percpu_ref to switch to atomic mode
 * @confirm_switch: optional confirmation callback
 *
 * There's no reason to use this function for the usual reference counting.
 * Use percpu_ref_kill[_and_confirm]().
 *
 * Schedule switching of @ref to atomic mode.  All its percpu counts will
 * be collected to the main atomic counter.  On completion, when all CPUs
 * are guaranteed to be in atomic mode, @confirm_switch, which may not
 * block, is invoked.  This function may be invoked concurrently with all
 * the get/put operations and can safely be mixed with kill and reinit
 * operations.  Note that @ref will stay in atomic mode across kill/reinit
 * cycles until percpu_ref_switch_to_percpu() is called.
 *
 * This function may block if @ref is in the process of switching to atomic
 * mode.  If the caller ensures that @ref is not in the process of
 * switching to atomic mode, this function can be called from any context.
 */
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_switch)
{
        unsigned long flags;

        spin_lock_irqsave(&percpu_ref_switch_lock, flags);

        /* Sticky: __percpu_ref_switch_mode() keeps @ref atomic while set. */
        ref->data->force_atomic = true;
        __percpu_ref_switch_mode(ref, confirm_switch);

        spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic);

/**
 * percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode
 * @ref: percpu_ref to switch to atomic mode
 *
 * Schedule switching the ref to atomic mode, and wait for the
 * switch to complete.  Caller must ensure that no other thread
 * will switch back to percpu mode.
 */
void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref)
{
        percpu_ref_switch_to_atomic(ref, NULL);
        /* ->confirm_switch is reset to NULL once the switch has completed. */
        wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync);

/**
 * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode
 * @ref: percpu_ref to switch to percpu mode
 *
 * There's no reason to use this function for the usual reference counting.
 * To re-use an expired ref, use percpu_ref_reinit().
 *
 * Switch @ref to percpu mode.  This function may be invoked concurrently
 * with all the get/put operations and can safely be mixed with kill and
 * reinit operations.  This function reverses the sticky atomic state set
 * by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic().  If @ref is
 * dying or dead, the actual switching takes place on the following
 * percpu_ref_reinit().
 *
 * This function may block if @ref is in the process of switching to atomic
 * mode.  If the caller ensures that @ref is not in the process of
 * switching to atomic mode, this function can be called from any context.
 */
void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
        unsigned long irq_flags;

        /*
         * Drop the forced-atomic marking and re-run the mode switch (no
         * confirmation callback), both under percpu_ref_switch_lock.
         */
        spin_lock_irqsave(&percpu_ref_switch_lock, irq_flags);
        ref->data->force_atomic = false;
        __percpu_ref_switch_mode(ref, NULL);
        spin_unlock_irqrestore(&percpu_ref_switch_lock, irq_flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu);

/**
 * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation
 * @ref: percpu_ref to kill
 * @confirm_kill: optional confirmation callback
 *
 * Equivalent to percpu_ref_kill() but also schedules kill confirmation if
 * @confirm_kill is not NULL.  @confirm_kill, which may not block, will be
 * called after @ref is seen as dead from all CPUs at which point all
 * further invocations of percpu_ref_tryget_live() will fail.  See
 * percpu_ref_tryget_live() for details.
 *
 * This function normally doesn't block and can be called from any context
 * but it may block if @confirm_kill is specified and @ref is in the
 * process of switching to atomic mode by percpu_ref_switch_to_atomic().
 *
 * There are no implied RCU grace periods between kill and release.
 */
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_kill)
{
        unsigned long flags;

        spin_lock_irqsave(&percpu_ref_switch_lock, flags);

        /* Killing a ref that is already dying is reported once, naming its release callback. */
        WARN_ONCE(percpu_ref_is_dying(ref),
                  "%s called more than once on %ps!", __func__,
                  ref->data->release);

        /* Set the DEAD flag first, then run the mode switch with the confirmation callback. */
        ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
        __percpu_ref_switch_mode(ref, confirm_kill);
        /* Drop one reference (the initial ref, per the function description). */
        percpu_ref_put(ref);

        spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);

/**
 * percpu_ref_is_zero - test whether a percpu refcount reached zero
 * @ref: percpu_ref to test
 *
 * Returns %true if @ref reached zero.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
bool percpu_ref_is_zero(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count;
        unsigned long irq_flags;
        unsigned long remaining;

        /* A ref that is still in percpu mode is never reported as zero. */
        if (__ref_is_percpu(ref, &percpu_count))
                return false;

        /* protect us from being destroyed */
        spin_lock_irqsave(&percpu_ref_switch_lock, irq_flags);
        if (!ref->data)
                /* No data attached: the count is kept above the flag bits. */
                remaining = ref->percpu_count_ptr >> __PERCPU_REF_FLAG_BITS;
        else
                remaining = atomic_long_read(&ref->data->count);
        spin_unlock_irqrestore(&percpu_ref_switch_lock, irq_flags);

        return !remaining;
}
EXPORT_SYMBOL_GPL(percpu_ref_is_zero);

/**
 * percpu_ref_reinit - re-initialize a percpu refcount
 * @ref: percpu_ref to re-initialize
 *
 * Re-initialize @ref so that it's in the same state as when it finished
 * percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD.  @ref must have been
 * initialized successfully and reached 0 but not exited.
 *
 * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while
 * this function is in progress.
 */
void percpu_ref_reinit(struct percpu_ref *ref)
{
        /* Reinitializing a ref whose count has not reached zero is flagged once. */
        WARN_ON_ONCE(!percpu_ref_is_zero(ref));

        /* The actual work is shared with percpu_ref_resurrect(). */
        percpu_ref_resurrect(ref);
}
EXPORT_SYMBOL_GPL(percpu_ref_reinit);

/**
 * percpu_ref_resurrect - modify a percpu refcount from dead to live
 * @ref: percpu_ref to resurrect
 *
 * Modify @ref so that it's in the same state as before percpu_ref_kill() was
 * called. @ref must be dead but must not yet have exited.
 *
 * If @ref->release() frees @ref then the caller is responsible for
 * guaranteeing that @ref->release() does not get called while this
 * function is in progress.
 *
 * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while
 * this function is in progress.
 */
void percpu_ref_resurrect(struct percpu_ref *ref)
{
        /* Only used as the out-parameter of __ref_is_percpu() below. */
        unsigned long __percpu *percpu_count;
        unsigned long flags;

        spin_lock_irqsave(&percpu_ref_switch_lock, flags);

        /* Sanity checks: the ref is expected to be dying and not in percpu mode. */
        WARN_ON_ONCE(!percpu_ref_is_dying(ref));
        WARN_ON_ONCE(__ref_is_percpu(ref, &percpu_count));

        /* Clear the DEAD flag, take a reference, then re-run the mode switch. */
        ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD;
        percpu_ref_get(ref);
        __percpu_ref_switch_mode(ref, NULL);

        spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_resurrect);
















































































































































































































   16 































    7 







    8 
   14 


































    7 
























    2 


    7 
    1 







    5 






























































































































































































    2 


    1 




    6 

    5 

    2 














    3 

    3 






















    2 
















    1 
    1 













    6 
    1 
    1 


    3 










































































































    4 





























    2 























    1 














    2 






















    4 



































    6 




















    6 











































































































































































































   13 


















    3 



































    4 
    4 
    2 















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Macros for manipulating and testing page->flags
 */

#ifndef PAGE_FLAGS_H
#define PAGE_FLAGS_H

#include <linux/types.h>
#include <linux/bug.h>
#include <linux/mmdebug.h>
#ifndef __GENERATING_BOUNDS_H
#include <linux/mm_types.h>
#include <generated/bounds.h>
#endif /* !__GENERATING_BOUNDS_H */

/*
 * Various page->flags bits:
 *
 * PG_reserved is set for special pages. The "struct page" of such a page
 * should in general not be touched (e.g. set dirty) except by its owner.
 * Pages marked as PG_reserved include:
 * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS,
 *   initrd, HW tables)
 * - Pages reserved or allocated early during boot (before the page allocator
 *   was initialized). This includes (depending on the architecture) the
 *   initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much
 *   much more. Once (if ever) freed, PG_reserved is cleared and they will
 *   be given to the page allocator.
 * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying
 *   to read/write these pages might end badly. Don't touch!
 * - The zero page(s)
 * - Pages not added to the page allocator when onlining a section because
 *   they were excluded via the online_page_callback() or because they are
 *   PG_hwpoison.
 * - Pages allocated in the context of kexec/kdump (loaded kernel image,
 *   control pages, vmcoreinfo)
 * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are
 *   not marked PG_reserved (as they might be in use by somebody else who does
 *   not respect the caching strategy).
 * - Pages part of an offline section (struct pages of offline sections should
 *   not be trusted as they will be initialized when first onlined).
 * - MCA pages on ia64
 * - Pages holding CPU notes for POWER Firmware Assisted Dump
 * - Device memory (e.g. PMEM, DAX, HMM)
 * Some PG_reserved pages will be excluded from the hibernation image.
 * PG_reserved does in general not hinder anybody from dumping or swapping
 * and is no longer required for remap_pfn_range(). ioremap might require it.
 * Consequently, PG_reserved for a page mapped into user space can indicate
 * the zero page, the vDSO, MMIO pages or device memory.
 *
 * The PG_private bitflag is set on pagecache pages if they contain filesystem
 * specific data (which is normally at page->private). It can be used by
 * private allocations for its own usage.
 *
 * During initiation of disk I/O, PG_locked is set. This bit is set before I/O
 * and cleared when writeback _starts_ or when read _completes_. PG_writeback
 * is set before writeback starts and cleared when it finishes.
 *
 * PG_locked also pins a page in pagecache, and blocks truncation of the file
 * while it is held.
 *
 * page_waitqueue(page) is a wait queue of all tasks waiting for the page
 * to become unlocked.
 *
 * PG_swapbacked is set when a page uses swap as a backing storage.  These are
 * usually PageAnon or shmem pages but please note that even anonymous pages
 * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as
 * a result of MADV_FREE).
 *
 * PG_referenced, PG_reclaim are used for page reclaim for anonymous and
 * file-backed pagecache (see mm/vmscan.c).
 *
 * PG_error is set to indicate that an I/O error occurred on this page.
 *
 * PG_arch_1 is an architecture specific page state bit.  The generic code
 * guarantees that this bit is cleared for a page when it first is entered into
 * the page cache.
 *
 * PG_hwpoison indicates that a page got corrupted in hardware and contains
 * data with incorrect ECC bits that triggered a machine check. Accessing is
 * not safe since it may cause another machine check. Don't touch!
 */

/*
 * Don't use the pageflags directly.  Use the PageFoo macros.
 *
 * The page flags field is split into two parts, the main flags area
 * which extends from the low bits upwards, and the fields area which
 * extends from the high bits downwards.
 *
 *  | FIELD | ... | FLAGS |
 *  N-1           ^       0
 *               (NR_PAGEFLAGS)
 *
 * The fields area is reserved for fields mapping zone, node (for NUMA) and
 * SPARSEMEM section (for variants of SPARSEMEM that require section ids like
 * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP).
 */
enum pageflags {
        PG_locked,                /* Page is locked. Don't touch. */
        PG_writeback,                /* Page is under writeback */
        PG_referenced,
        PG_uptodate,
        PG_dirty,
        PG_lru,
        PG_head,                /* Must be in bit 6 */
        PG_waiters,                /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
        PG_active,
        PG_workingset,
        PG_error,
        PG_owner_priv_1,        /* Owner use. If pagecache, fs may use*/
        PG_arch_1,
        PG_reserved,
        PG_private,                /* If pagecache, has fs-private data */
        PG_private_2,                /* If pagecache, has fs aux data */
        PG_mappedtodisk,        /* Has blocks allocated on-disk */
        PG_reclaim,                /* To be reclaimed asap */
        PG_swapbacked,                /* Page is backed by RAM/swap */
        PG_unevictable,                /* Page is "unevictable"  */
#ifdef CONFIG_MMU
        PG_mlocked,                /* Page is vma mlocked */
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
        PG_uncached,                /* Page has been mapped as uncached */
#endif
#ifdef CONFIG_MEMORY_FAILURE
        PG_hwpoison,                /* hardware poisoned page. Don't touch */
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
        PG_young,
        PG_idle,
#endif
#ifdef CONFIG_ARCH_USES_PG_ARCH_X
        PG_arch_2,
        PG_arch_3,
#endif
        __NR_PAGEFLAGS,

        PG_readahead = PG_reclaim,

        /*
         * Depending on the way an anonymous folio can be mapped into a page
         * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped
         * THP), PG_anon_exclusive may be set only for the head page or for
         * tail pages of an anonymous folio. For now, we only expect it to be
         * set on tail pages for PTE-mapped THP.
         */
        PG_anon_exclusive = PG_mappedtodisk,

        /* Filesystems */
        PG_checked = PG_owner_priv_1,

        /* SwapBacked */
        PG_swapcache = PG_owner_priv_1,        /* Swap page: swp_entry_t in private */

        /* Two page bits are conscripted by FS-Cache to maintain local caching
         * state.  These bits are set on pages belonging to the netfs's inodes
         * when those inodes are being locally cached.
         */
        PG_fscache = PG_private_2,        /* page backed by cache */

        /* XEN */
        /* Pinned in Xen as a read-only pagetable page. */
        PG_pinned = PG_owner_priv_1,
        /* Pinned as part of domain save (see xen_mm_pin_all()). */
        PG_savepinned = PG_dirty,
        /* Has a grant mapping of another (foreign) domain's page. */
        PG_foreign = PG_owner_priv_1,
        /* Remapped by swiotlb-xen. */
        PG_xen_remapped = PG_owner_priv_1,

        /* non-lru isolated movable page */
        PG_isolated = PG_reclaim,

        /* Only valid for buddy pages. Used to track pages that are reported */
        PG_reported = PG_uptodate,

#ifdef CONFIG_MEMORY_HOTPLUG
        /* For self-hosted memmap pages */
        PG_vmemmap_self_hosted = PG_owner_priv_1,
#endif

        /*
         * Flags only valid for compound pages.  Stored in first tail page's
         * flags word.  Cannot use the first 8 flags or any flag marked as
         * PF_ANY.
         */

        /* At least one page in this folio has the hwpoison flag set */
        PG_has_hwpoisoned = PG_error,
        PG_large_rmappable = PG_workingset, /* anon or file-backed */
};

/*
 * Mask with the low NR_PAGEFLAGS bits set.  NR_PAGEFLAGS is not defined in
 * this header; presumably it comes from <generated/bounds.h> - confirm.
 */
#define PAGEFLAGS_MASK                ((1UL << NR_PAGEFLAGS) - 1)

#ifndef __GENERATING_BOUNDS_H

#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);

/*
 * Map a fake head page struct back to the real head page struct; any other
 * @page is returned unchanged. See Documentation/mm/vmemmap_dedup.rst.
 */
static __always_inline const struct page *page_fixed_fake_head(const struct page *page)
{
        unsigned long head;

        if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
                return page;

        /*
         * Only a struct page whose address is aligned to PAGE_SIZE may be a
         * fake head.  Checking the alignment first avoids accessing fields
         * of @page[1] (e.g. compound_head), which may sit in a cold
         * cacheline.
         */
        if (!IS_ALIGNED((unsigned long)page, PAGE_SIZE) ||
            !test_bit(PG_head, &page->flags))
                return page;

        /*
         * With PG_head set, @page belongs to a compound page made of at
         * least two contiguous pages, so reading @page[1] is safe.
         */
        head = READ_ONCE(page[1].compound_head);
        if (likely(head & 1))
                return (const struct page *)(head - 1);

        return page;
}
#else
/* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is off: @page is always returned as-is. */
static inline const struct page *page_fixed_fake_head(const struct page *page)
{
        return page;
}
#endif

/* Return non-zero when page_fixed_fake_head() maps @page to a different page. */
static __always_inline int page_is_fake_head(const struct page *page)
{
        const struct page *fixed = page_fixed_fake_head(page);

        return fixed != page;
}

/*
 * Return, as an unsigned long, the address of the head page for @page.
 * When bit 0 of ->compound_head is set, the remaining bits hold that
 * address; otherwise the result comes from page_fixed_fake_head().
 */
static inline unsigned long _compound_head(const struct page *page)
{
        unsigned long word = READ_ONCE(page->compound_head);

        if (likely(!(word & 1)))
                return (unsigned long)page_fixed_fake_head(page);

        return word - 1;
}

/* Typed wrapper: casts the _compound_head() result back to the type of @page. */
#define compound_head(page)        ((typeof(page))_compound_head(page))

/**
 * page_folio - Converts from page to folio.
 * @p: The page.
 *
 * Every page is part of a folio.  This function cannot be called on a
 * NULL pointer.
 *
 * Context: No reference, nor lock is required on @p.  If the caller
 * does not hold a reference, this call may race with a folio split, so
 * it should re-check the folio still contains this page after gaining
 * a reference on the folio.
 * Return: The folio which contains this page.  A const page pointer
 * yields a const folio pointer.
 */
#define page_folio(p)                (_Generic((p),                                \
        const struct page *:        (const struct folio *)_compound_head(p), \
        struct page *:                (struct folio *)_compound_head(p)))

/**
 * folio_page - Return a page from a folio.
 * @folio: The folio.
 * @n: The page number to return.
 *
 * @n is relative to the start of the folio.  This function does not
 * check that the page number lies within @folio; the caller is presumed
 * to have a reference to the page.
 *
 * Expands to nth_page() applied to the page embedded in @folio.
 */
#define folio_page(folio, n)        nth_page(&(folio)->page, n)

/*
 * Return non-zero if bit 0 of @page->compound_head is set or @page is a
 * fake head page.
 */
static __always_inline int PageTail(const struct page *page)
{
        if (READ_ONCE(page->compound_head) & 1)
                return 1;

        return page_is_fake_head(page);
}

/*
 * Return non-zero if @page has PG_head set or has bit 0 of its
 * compound_head word set.
 */
static __always_inline int PageCompound(const struct page *page)
{
        if (test_bit(PG_head, &page->flags))
                return 1;

        return (READ_ONCE(page->compound_head) & 1) != 0;
}

#define        PAGE_POISON_PATTERN        -1l
/* Return non-zero if the whole flags word of @page equals PAGE_POISON_PATTERN. */
static inline int PagePoisoned(const struct page *page)
{
        unsigned long word = READ_ONCE(page->flags);

        return word == PAGE_POISON_PATTERN;
}

/*
 * page_init_poison() is only declared here.  With CONFIG_DEBUG_VM the
 * implementation lives elsewhere; without it the call is an empty inline.
 */
#ifdef CONFIG_DEBUG_VM
void page_init_poison(struct page *page, size_t size);
#else
static inline void page_init_poison(struct page *page, size_t size)
{
}
#endif

/*
 * Return a read-only pointer to the flags word of page number @n of @folio.
 * NOTE(review): defined static but not inline in a header - confirm this is
 * intentional.
 */
static const unsigned long *const_folio_flags(const struct folio *folio,
                unsigned n)
{
        const struct page *page = &folio->page;

        /* A folio's first page must not be a tail page. */
        VM_BUG_ON_PGFLAGS(PageTail(page), page);
        /* Pages past the first are only addressable when PG_head is set. */
        VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);
        return &page[n].flags;
}

/*
 * Return a writable pointer to the flags word of page number @n of @folio.
 * Same checks as const_folio_flags().
 */
static unsigned long *folio_flags(struct folio *folio, unsigned n)
{
        struct page *page = &folio->page;

        /* A folio's first page must not be a tail page. */
        VM_BUG_ON_PGFLAGS(PageTail(page), page);
        /* Pages past the first are only addressable when PG_head is set. */
        VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);
        return &page[n].flags;
}

/*
 * Page flags policies wrt compound pages
 *
 * PF_POISONED_CHECK
 *     check if this struct page poisoned/uninitialized
 *
 * PF_ANY:
 *     the page flag is relevant for small, head and tail pages.
 *
 * PF_HEAD:
 *     for compound page all operations related to the page flag applied to
 *     head page.
 *
 * PF_NO_TAIL:
 *     modifications of the page flag must be done on small or head pages,
 *     checks can be done on tail pages too.
 *
 * PF_NO_COMPOUND:
 *     the page flag is not relevant for compound pages.
 *
 * PF_SECOND:
 *     the page flag is stored in the first tail page.
 */
/*
 * Each policy macro evaluates to the struct page whose ->flags word holds
 * the flag, after the VM_BUG_ON_PGFLAGS sanity checks for that policy.
 * @enforce gates the PageTail()/PageCompound() check in PF_NO_TAIL and
 * PF_NO_COMPOUND.
 *
 * @page is parenthesized wherever it is used as an operand, so any pointer
 * expression can be passed.  The VM_BUG_ON_PGFLAGS() conditions are left
 * exactly as they were, since that macro may stringify its condition.
 */
#define PF_POISONED_CHECK(page) ({                                        \
                VM_BUG_ON_PGFLAGS(PagePoisoned(page), page);                \
                (page); })
#define PF_ANY(page, enforce)        PF_POISONED_CHECK(page)
#define PF_HEAD(page, enforce)        PF_POISONED_CHECK(compound_head(page))
#define PF_NO_TAIL(page, enforce) ({                                        \
                VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page);        \
                PF_POISONED_CHECK(compound_head(page)); })
#define PF_NO_COMPOUND(page, enforce) ({                                \
                VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page);        \
                PF_POISONED_CHECK(page); })
#define PF_SECOND(page, enforce) ({                                        \
                VM_BUG_ON_PGFLAGS(!PageHead(page), page);                \
                PF_POISONED_CHECK(&(page)[1]); })

/*
 * Which page is the flag stored in
 *
 * Index, within the folio, of the page whose flags word holds the flag for
 * each PF_* policy.  The page-flag generator macros select one of these by
 * pasting FOLIO_##policy.
 */
#define FOLIO_PF_ANY                0
#define FOLIO_PF_HEAD                0
#define FOLIO_PF_NO_TAIL        0
#define FOLIO_PF_NO_COMPOUND        0
#define FOLIO_PF_SECOND                1

/* Named page indices for direct use with the FOLIO_*_FLAG() macros. */
#define FOLIO_HEAD_PAGE                0
#define FOLIO_SECOND_PAGE        1

/*
 * Macros to create function definitions for page flags
 *
 * Each FOLIO_*_FLAG(name, page) macro defines one always-inline accessor
 * for bit PG_<name>, operating on the flags word of page number @page of
 * the folio (obtained through folio_flags() or const_folio_flags()):
 *
 *   FOLIO_TEST_FLAG        folio_test_<name>()        test_bit()
 *   FOLIO_SET_FLAG         folio_set_<name>()         set_bit()
 *   FOLIO_CLEAR_FLAG       folio_clear_<name>()       clear_bit()
 *   __FOLIO_SET_FLAG       __folio_set_<name>()       __set_bit()
 *   __FOLIO_CLEAR_FLAG     __folio_clear_<name>()     __clear_bit()
 *   FOLIO_TEST_SET_FLAG    folio_test_set_<name>()    test_and_set_bit()
 *   FOLIO_TEST_CLEAR_FLAG  folio_test_clear_<name>()  test_and_clear_bit()
 *
 * FOLIO_FLAG() bundles the test, set and clear accessors.
 */
#define FOLIO_TEST_FLAG(name, page)                                        \
static __always_inline bool folio_test_##name(const struct folio *folio) \
{ return test_bit(PG_##name, const_folio_flags(folio, page)); }

#define FOLIO_SET_FLAG(name, page)                                        \
static __always_inline void folio_set_##name(struct folio *folio)        \
{ set_bit(PG_##name, folio_flags(folio, page)); }

#define FOLIO_CLEAR_FLAG(name, page)                                        \
static __always_inline void folio_clear_##name(struct folio *folio)        \
{ clear_bit(PG_##name, folio_flags(folio, page)); }

#define __FOLIO_SET_FLAG(name, page)                                        \
static __always_inline void __folio_set_##name(struct folio *folio)        \
{ __set_bit(PG_##name, folio_flags(folio, page)); }

#define __FOLIO_CLEAR_FLAG(name, page)                                        \
static __always_inline void __folio_clear_##name(struct folio *folio)        \
{ __clear_bit(PG_##name, folio_flags(folio, page)); }

#define FOLIO_TEST_SET_FLAG(name, page)                                        \
static __always_inline bool folio_test_set_##name(struct folio *folio)        \
{ return test_and_set_bit(PG_##name, folio_flags(folio, page)); }

#define FOLIO_TEST_CLEAR_FLAG(name, page)                                \
static __always_inline bool folio_test_clear_##name(struct folio *folio) \
{ return test_and_clear_bit(PG_##name, folio_flags(folio, page)); }

#define FOLIO_FLAG(name, page)                                                \
FOLIO_TEST_FLAG(name, page)                                                \
FOLIO_SET_FLAG(name, page)                                                \
FOLIO_CLEAR_FLAG(name, page)

/*
 * Page-level generators.  Each macro emits the matching folio accessor
 * (through FOLIO_##policy) plus a struct page accessor named after @uname:
 * Page<uname>(), SetPage<uname>(), ClearPage<uname>(), __SetPage<uname>(),
 * __ClearPage<uname>(), TestSetPage<uname>() and TestClearPage<uname>().
 * The page accessors pick their target page with policy(page, enforce):
 * the test-only accessor passes enforce = 0, all modifying ones pass 1.
 */
#define TESTPAGEFLAG(uname, lname, policy)                                \
FOLIO_TEST_FLAG(lname, FOLIO_##policy)                                        \
static __always_inline int Page##uname(const struct page *page)                \
{ return test_bit(PG_##lname, &policy(page, 0)->flags); }

#define SETPAGEFLAG(uname, lname, policy)                                \
FOLIO_SET_FLAG(lname, FOLIO_##policy)                                        \
static __always_inline void SetPage##uname(struct page *page)                \
{ set_bit(PG_##lname, &policy(page, 1)->flags); }

#define CLEARPAGEFLAG(uname, lname, policy)                                \
FOLIO_CLEAR_FLAG(lname, FOLIO_##policy)                                        \
static __always_inline void ClearPage##uname(struct page *page)                \
{ clear_bit(PG_##lname, &policy(page, 1)->flags); }

#define __SETPAGEFLAG(uname, lname, policy)                                \
__FOLIO_SET_FLAG(lname, FOLIO_##policy)                                        \
static __always_inline void __SetPage##uname(struct page *page)                \
{ __set_bit(PG_##lname, &policy(page, 1)->flags); }

#define __CLEARPAGEFLAG(uname, lname, policy)                                \
__FOLIO_CLEAR_FLAG(lname, FOLIO_##policy)                                \
static __always_inline void __ClearPage##uname(struct page *page)        \
{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }

#define TESTSETFLAG(uname, lname, policy)                                \
FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy)                                \
static __always_inline int TestSetPage##uname(struct page *page)        \
{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }

#define TESTCLEARFLAG(uname, lname, policy)                                \
FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy)                                \
static __always_inline int TestClearPage##uname(struct page *page)        \
{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }

/* Bundle: test + set + clear accessors. */
#define PAGEFLAG(uname, lname, policy)                                        \
        TESTPAGEFLAG(uname, lname, policy)                                \
        SETPAGEFLAG(uname, lname, policy)                                \
        CLEARPAGEFLAG(uname, lname, policy)

/* Bundle: test + the __set_bit()/__clear_bit() based set and clear accessors. */
#define __PAGEFLAG(uname, lname, policy)                                \
        TESTPAGEFLAG(uname, lname, policy)                                \
        __SETPAGEFLAG(uname, lname, policy)                                \
        __CLEARPAGEFLAG(uname, lname, policy)

/* Bundle: test-and-set + test-and-clear accessors. */
#define TESTSCFLAG(uname, lname, policy)                                \
        TESTSETFLAG(uname, lname, policy)                                \
        TESTCLEARFLAG(uname, lname, policy)

/*
 * Stub generators.  They emit accessors with the same names as the real
 * generators above, but the test variants always return false/0 and the
 * set/clear variants have empty bodies.
 */
#define FOLIO_TEST_FLAG_FALSE(name)                                        \
static inline bool folio_test_##name(const struct folio *folio)                \
{ return false; }
#define FOLIO_SET_FLAG_NOOP(name)                                        \
static inline void folio_set_##name(struct folio *folio) { }
#define FOLIO_CLEAR_FLAG_NOOP(name)                                        \
static inline void folio_clear_##name(struct folio *folio) { }
#define __FOLIO_SET_FLAG_NOOP(name)                                        \
static inline void __folio_set_##name(struct folio *folio) { }
#define __FOLIO_CLEAR_FLAG_NOOP(name)                                        \
static inline void __folio_clear_##name(struct folio *folio) { }
#define FOLIO_TEST_SET_FLAG_FALSE(name)                                        \
static inline bool folio_test_set_##name(struct folio *folio)                \
{ return false; }
#define FOLIO_TEST_CLEAR_FLAG_FALSE(name)                                \
static inline bool folio_test_clear_##name(struct folio *folio)                \
{ return false; }

/* Bundle: stub test + set + clear folio accessors. */
#define FOLIO_FLAG_FALSE(name)                                                \
FOLIO_TEST_FLAG_FALSE(name)                                                \
FOLIO_SET_FLAG_NOOP(name)                                                \
FOLIO_CLEAR_FLAG_NOOP(name)

/* Page-level stubs; each also emits the matching folio stub. */
#define TESTPAGEFLAG_FALSE(uname, lname)                                \
FOLIO_TEST_FLAG_FALSE(lname)                                                \
static inline int Page##uname(const struct page *page) { return 0; }

#define SETPAGEFLAG_NOOP(uname, lname)                                        \
FOLIO_SET_FLAG_NOOP(lname)                                                \
static inline void SetPage##uname(struct page *page) {  }

#define CLEARPAGEFLAG_NOOP(uname, lname)                                \
FOLIO_CLEAR_FLAG_NOOP(lname)                                                \
static inline void ClearPage##uname(struct page *page) {  }

#define __CLEARPAGEFLAG_NOOP(uname, lname)                                \
__FOLIO_CLEAR_FLAG_NOOP(lname)                                                \
static inline void __ClearPage##uname(struct page *page) {  }

#define TESTSETFLAG_FALSE(uname, lname)                                        \
FOLIO_TEST_SET_FLAG_FALSE(lname)                                        \
static inline int TestSetPage##uname(struct page *page) { return 0; }

#define TESTCLEARFLAG_FALSE(uname, lname)                                \
FOLIO_TEST_CLEAR_FLAG_FALSE(lname)                                        \
static inline int TestClearPage##uname(struct page *page) { return 0; }

/* Bundle: stub test + set + clear page accessors. */
#define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname)        \
        SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname)

/* Bundle: stub test-and-set + test-and-clear page accessors. */
#define TESTSCFLAG_FALSE(uname, lname)                                        \
        TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname)

/*
 * Accessor instantiations for the core page flags.  Each invocation expands
 * a generator macro with the Page* suffix, the folio_* suffix and a policy
 * argument (PF_* / FOLIO_*).
 * NOTE(review): the generator and policy macros are defined earlier in
 * this file; see their definitions for the exact helpers each one emits.
 */
__PAGEFLAG(Locked, locked, PF_NO_TAIL)
FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE)
PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)
FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE)
        FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE)
        __FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE)
PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
        __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
        TESTCLEARFLAG(LRU, lru, PF_HEAD)
PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
        TESTCLEARFLAG(Active, active, PF_HEAD)
PAGEFLAG(Workingset, workingset, PF_HEAD)
        TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND)           /* Used by some filesystems */

/*
 * Xen page flags.
 *
 * The generator macros expand to complete definitions, so the invocations
 * take no trailing semicolon (a stray ';' here is an empty file-scope
 * declaration, which strict ISO C rejects).
 */
PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
        TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND)
PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND)
PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)
        TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)

/*
 * Reserved and SwapBacked each get the regular accessors plus the
 * double-underscore __Clear / __Set variants.
 */
PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
        __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
        __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
        __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
        __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)

/*
 * Private page markings that may be used by the filesystem that owns the page
 * for its own purposes.
 * - PG_private and PG_private_2 cause release_folio() and co to be invoked
 */
/* All three owner-private flags use the PF_ANY policy. */
PAGEFLAG(Private, private, PF_ANY)
PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
        TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)

/*
 * Only the test, test-and-set and test-and-clear operators exist for
 * PG_writeback.  The unconditional set/clear operators are not generated
 * because they are risky: they bypass page accounting.
 */
TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL)
        TESTSCFLAG(Writeback, writeback, PF_NO_TAIL)
PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)

/*
 * PG_readahead is only used for reads; PG_reclaim is only for writes.
 * Both get the regular accessors plus a test-and-clear helper.
 */
PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
        TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL)
PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND)
        TESTCLEARFLAG(Readahead, readahead, PF_NO_COMPOUND)

#ifdef CONFIG_HIGHMEM
/*
 * Must use a macro here due to header dependency issues. page_zone() is not
 * available at this point.
 *
 * Both tests are derived from the zone number of the page/folio rather
 * than from a bit in the flags word.
 */
#define PageHighMem(__p) is_highmem_idx(page_zonenum(__p))
#define folio_test_highmem(__f)        is_highmem_idx(folio_zonenum(__f))
#else
/* Without CONFIG_HIGHMEM the test is constant-false and set/clear are no-ops. */
PAGEFLAG_FALSE(HighMem, highmem)
#endif

#ifdef CONFIG_SWAP
/*
 * A folio counts as being in the swap cache only when it is swap-backed
 * and PG_swapcache is set in its first flags word.
 */
static __always_inline bool folio_test_swapcache(const struct folio *folio)
{
        if (!folio_test_swapbacked(folio))
                return false;

        return test_bit(PG_swapcache, const_folio_flags(folio, 0));
}

/* Page-level wrapper: answers for the folio that contains @page. */
static __always_inline bool PageSwapCache(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_test_swapcache(folio);
}

/* The test is open-coded above; only the set/clear helpers are generated. */
SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
#else
/* !CONFIG_SWAP: constant-false test, no-op set/clear. */
PAGEFLAG_FALSE(SwapCache, swapcache)
#endif

PAGEFLAG(Unevictable, unevictable, PF_HEAD)
        __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
        TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)

/*
 * Mlocked accessors are real only with CONFIG_MMU; otherwise every helper
 * (test, set, clear, __clear, test-and-set, test-and-clear) is stubbed out.
 */
#ifdef CONFIG_MMU
PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
        __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
        TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
#else
PAGEFLAG_FALSE(Mlocked, mlocked) __CLEARPAGEFLAG_NOOP(Mlocked, mlocked)
        TESTSCFLAG_FALSE(Mlocked, mlocked)
#endif

/* Uncached exists only on architectures selecting ARCH_USES_PG_UNCACHED. */
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(Uncached, uncached)
#endif

/*
 * Hardware-poison flag.  __PG_HWPOISON is the flag as a mask; it collapses
 * to 0 when CONFIG_MEMORY_FAILURE is off so that masks built from it need
 * no #ifdef of their own.
 */
#ifdef CONFIG_MEMORY_FAILURE
PAGEFLAG(HWPoison, hwpoison, PF_ANY)
TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
#define __PG_HWPOISON (1UL << PG_hwpoison)
#define MAGIC_HWPOISON        0x48575053U        /* HWPS */
/* NOTE(review): the helpers below are implemented elsewhere; see their definitions. */
extern void SetPageHWPoisonTakenOff(struct page *page);
extern void ClearPageHWPoisonTakenOff(struct page *page);
extern bool take_page_off_buddy(struct page *page);
extern bool put_page_back_buddy(struct page *page);
#else
PAGEFLAG_FALSE(HWPoison, hwpoison)
#define __PG_HWPOISON 0
#endif

/*
 * young/idle folio flags.  They are generated here only when both
 * CONFIG_PAGE_IDLE_FLAG and CONFIG_64BIT are set; without
 * CONFIG_PAGE_IDLE_FLAG they are stubbed out.
 */
#ifdef CONFIG_PAGE_IDLE_FLAG
#ifdef CONFIG_64BIT
FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE)
FOLIO_SET_FLAG(young, FOLIO_HEAD_PAGE)
FOLIO_TEST_CLEAR_FLAG(young, FOLIO_HEAD_PAGE)
FOLIO_FLAG(idle, FOLIO_HEAD_PAGE)
#endif
/* See page_idle.h for !64BIT workaround */
#else /* !CONFIG_PAGE_IDLE_FLAG */
FOLIO_FLAG_FALSE(young)
FOLIO_TEST_CLEAR_FLAG_FALSE(young)
FOLIO_FLAG_FALSE(idle)
#endif

/*
 * PageReported() is used to track reported free pages within the Buddy
 * allocator. We can use the non-atomic version of the test and set
 * operations as both should be shielded with the zone lock to prevent
 * any possible races on the setting or clearing of the bit.
 */
__PAGEFLAG(Reported, reported, PF_NO_COMPOUND)

/* VmemmapSelfHosted is a real flag only with memory hotplug; else stubbed. */
#ifdef CONFIG_MEMORY_HOTPLUG
PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY)
#else
PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
#endif

/*
 * On an anonymous page mapped into a user virtual memory area,
 * page->mapping points to its anon_vma, not to a struct address_space;
 * with the PAGE_MAPPING_ANON bit set to distinguish it.  See rmap.h.
 *
 * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
 * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON
 * bit; and then page->mapping points, not to an anon_vma, but to a private
 * structure which KSM associates with that merged page.  See ksm.h.
 *
 * PAGE_MAPPING_MOVABLE without PAGE_MAPPING_ANON is used for a non-lru movable
 * page, and then page->mapping points to a struct movable_operations.
 *
 * Please note that, confusingly, "page_mapping" refers to the inode
 * address_space which maps the page from disk; whereas "page_mapped"
 * refers to user virtual address space into which the page is mapped.
 *
 * For slab pages, since slab reuses the bits in struct page to store its
 * internal states, the page->mapping does not exist as such, nor do these
 * flags below.  So in order to avoid testing non-existent bits, please
 * make sure that PageSlab(page) actually evaluates to false before calling
 * the following functions (e.g., PageAnon).  See mm/slab.h.
 */
/* Tag bits kept in the low two bits of the ->mapping pointer. */
#define PAGE_MAPPING_ANON        0x1
#define PAGE_MAPPING_MOVABLE        0x2
/* KSM is encoded as both tag bits set at once ... */
#define PAGE_MAPPING_KSM        (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
/* ... so the mask of all tag bits has the same numeric value. */
#define PAGE_MAPPING_FLAGS        (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)

/*
 * Unlike the flags above, this value is used only in fsdax mode.  It
 * indicates that this page->mapping is in the reflink (shared) case.
 * Note that it is a pointer-typed constant, not an integer mask.
 */
#define PAGE_MAPPING_DAX_SHARED        ((void *)0x1)

/* True when any PAGE_MAPPING_FLAGS tag bit is set in folio->mapping. */
static __always_inline bool folio_mapping_flags(const struct folio *folio)
{
        unsigned long tag_bits = (unsigned long)folio->mapping & PAGE_MAPPING_FLAGS;

        return tag_bits != 0;
}

/*
 * True when any PAGE_MAPPING_FLAGS tag bit is set in page->mapping.
 * Reads @page directly; it does not go through page_folio().
 */
static __always_inline bool PageMappingFlags(const struct page *page)
{
        unsigned long tag_bits = (unsigned long)page->mapping & PAGE_MAPPING_FLAGS;

        return !!tag_bits;
}

/* True when the PAGE_MAPPING_ANON tag bit is set in folio->mapping. */
static __always_inline bool folio_test_anon(const struct folio *folio)
{
        unsigned long anon_bit = (unsigned long)folio->mapping & PAGE_MAPPING_ANON;

        return anon_bit != 0;
}

/* Page-level wrapper: answers for the folio that contains @page. */
static __always_inline bool PageAnon(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_test_anon(folio);
}

/*
 * True when the tag bits of folio->mapping are exactly
 * PAGE_MAPPING_MOVABLE, i.e. MOVABLE set and ANON clear.
 */
static __always_inline bool __folio_test_movable(const struct folio *folio)
{
        unsigned long tag_bits = (unsigned long)folio->mapping & PAGE_MAPPING_FLAGS;

        return tag_bits == PAGE_MAPPING_MOVABLE;
}

/*
 * True when the tag bits of page->mapping are exactly
 * PAGE_MAPPING_MOVABLE.  Reads @page directly, not its folio.
 */
static __always_inline bool __PageMovable(const struct page *page)
{
        unsigned long tag_bits = (unsigned long)page->mapping & PAGE_MAPPING_FLAGS;

        return tag_bits == PAGE_MAPPING_MOVABLE;
}

#ifdef CONFIG_KSM
/*
 * KSM pages are the write-protected "shared"/"merged" pages that KSM maps
 * into several mms when it finds identical anonymous content in
 * VM_MERGEABLE vmas.  Such a page is PageAnon, but its mapping points at
 * the page's stable-tree node rather than at an anon_vma.
 *
 * The test is true when both tag bits of folio->mapping are set.
 */
static __always_inline bool folio_test_ksm(const struct folio *folio)
{
        unsigned long tag_bits = (unsigned long)folio->mapping & PAGE_MAPPING_FLAGS;

        return tag_bits == PAGE_MAPPING_KSM;
}

/* Page-level wrapper: answers for the folio that contains @page. */
static __always_inline bool PageKsm(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_test_ksm(folio);
}
#else
/* !CONFIG_KSM: PageKsm() and the folio-level test are constant-false stubs. */
TESTPAGEFLAG_FALSE(Ksm, ksm)
#endif

/* NOTE(review): implemented elsewhere; see its definition for the flag encoding returned. */
u64 stable_page_flags(const struct page *page);

/**
 * folio_xor_flags_has_waiters - Toggle folio flags and report waiters.
 * @folio: The folio.
 * @mask: Every bit set here is flipped in the folio's first flags word.
 *
 * Restricted to flags that are only ever changed under the folio lock;
 * PG_dirty, for example, can be set without the lock and is therefore
 * unsafe here.  The flags must also lie in bits 0-6, because some
 * implementations only affect those bits.
 *
 * Return: Whether there are tasks waiting on the folio.
 */
static inline bool folio_xor_flags_has_waiters(struct folio *folio,
                unsigned long mask)
{
        unsigned long *flags_word = folio_flags(folio, 0);

        return xor_unlock_is_negative_byte(mask, flags_word);
}

/**
 * folio_test_uptodate - Is this folio up to date?
 * @folio: The folio.
 *
 * The uptodate flag is set on a folio when every byte in the folio is
 * at least as new as the corresponding bytes on storage.  Anonymous
 * and CoW folios are always uptodate.  If the folio is not uptodate,
 * some of the bytes in it may be; see the is_partially_uptodate()
 * address_space operation.
 */
static inline bool folio_test_uptodate(const struct folio *folio)
{
        bool ret = test_bit(PG_uptodate, const_folio_flags(folio, 0));
        /*
         * Must ensure that the data we read out of the folio is loaded
         * _after_ we've loaded folio->flags to check the uptodate bit.
         * We can skip the barrier if the folio is not uptodate, because
         * we wouldn't be reading anything from it.
         *
         * See folio_mark_uptodate() for the other side of the story.
         */
        if (ret)
                smp_rmb();

        return ret;
}

/* Page-level wrapper: answers for the folio that contains @page. */
static inline bool PageUptodate(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_test_uptodate(folio);
}

/*
 * Set PG_uptodate with the non-atomic __set_bit().  The write barrier comes
 * first so that earlier stores to the folio's data are ordered before the
 * flag store (pairs with the smp_rmb() in folio_test_uptodate()).
 */
static __always_inline void __folio_mark_uptodate(struct folio *folio)
{
        smp_wmb();
        __set_bit(PG_uptodate, folio_flags(folio, 0));
}

/* Set PG_uptodate with the atomic set_bit(), after a write barrier. */
static __always_inline void folio_mark_uptodate(struct folio *folio)
{
        /*
         * Memory barrier must be issued before setting the PG_uptodate bit,
         * so that all previous stores issued in order to bring the folio
         * uptodate are actually visible before folio_test_uptodate becomes true.
         */
        smp_wmb();
        set_bit(PG_uptodate, folio_flags(folio, 0));
}

/*
 * Page-level form of __folio_mark_uptodate().  @page is cast straight to a
 * folio (no page_folio() lookup).
 */
static __always_inline void __SetPageUptodate(struct page *page)
{
        struct folio *folio = (struct folio *)page;

        __folio_mark_uptodate(folio);
}

/*
 * Page-level form of folio_mark_uptodate().  @page is cast straight to a
 * folio (no page_folio() lookup).
 */
static __always_inline void SetPageUptodate(struct page *page)
{
        struct folio *folio = (struct folio *)page;

        folio_mark_uptodate(folio);
}

/* Setting is open-coded above (it needs barriers); clearing is generated. */
CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)

/* NOTE(review): implemented elsewhere; see the definitions for the accounting they perform. */
void __folio_start_writeback(struct folio *folio, bool keep_write);
void set_page_writeback(struct page *page);

/* Convenience forms of __folio_start_writeback() with keep_write fixed. */
#define folio_start_writeback(folio)                        \
        __folio_start_writeback(folio, false)
#define folio_start_writeback_keepwrite(folio)        \
        __folio_start_writeback(folio, true)

/* True when PG_head is set in the flags word selected by FOLIO_PF_ANY. */
static __always_inline bool folio_test_head(const struct folio *folio)
{
        const unsigned long *flags_word = const_folio_flags(folio, FOLIO_PF_ANY);

        return test_bit(PG_head, flags_word);
}

/*
 * Returns 1 when @page has PG_head set and is not a fake head, 0 otherwise.
 * The poison check runs first, before the flags are looked at.
 */
static __always_inline int PageHead(const struct page *page)
{
        PF_POISONED_CHECK(page);

        if (!test_bit(PG_head, &page->flags))
                return 0;

        return !page_is_fake_head(page);
}

/* PG_head modifiers: __Set, __Clear and Clear, all with PF_ANY. */
__SETPAGEFLAG(Head, head, PF_ANY)
__CLEARPAGEFLAG(Head, head, PF_ANY)
CLEARPAGEFLAG(Head, head, PF_ANY)

/**
 * folio_test_large() - Does this folio contain more than one page?
 * @folio: The folio to test.
 *
 * Implemented as the head-flag test: see folio_test_head().
 *
 * Return: True if the folio is larger than one page.
 */
static inline bool folio_test_large(const struct folio *folio)
{
        bool has_head_flag = folio_test_head(folio);

        return has_head_flag;
}

/*
 * Record @head in page->compound_head.  The stored value is the head's
 * address plus one, i.e. with the lowest bit set as a tag.
 */
static __always_inline void set_compound_head(struct page *page, struct page *head)
{
        unsigned long tagged_head = (unsigned long)head + 1;

        WRITE_ONCE(page->compound_head, tagged_head);
}

/* Reset page->compound_head to 0, undoing set_compound_head(). */
static __always_inline void clear_compound_head(struct page *page)
{
        WRITE_ONCE(page->compound_head, 0);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Clear PG_head on @page; it is a bug to call this on a non-head page. */
static inline void ClearPageCompound(struct page *page)
{
        BUG_ON(!PageHead(page));
        ClearPageHead(page);
}
/* large_rmappable uses the FOLIO_SECOND_PAGE policy. */
FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
#else
/* !CONFIG_TRANSPARENT_HUGEPAGE: stubbed out. */
FOLIO_FLAG_FALSE(large_rmappable)
#endif

/* PG_head as a bit mask. */
#define PG_head_mask ((1UL << PG_head))

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * PageHuge() only returns true for hugetlbfs pages, but not for
 * normal or transparent huge pages.
 *
 * PageTransHuge() returns true for both transparent huge and
 * hugetlbfs pages, but not normal pages. PageTransHuge() can only be
 * called in the core VM paths where hugetlbfs pages can't exist.
 *
 * Must not be passed a tail page: that trips the VM_BUG_ON_PAGE() below.
 */
static inline int PageTransHuge(const struct page *page)
{
        VM_BUG_ON_PAGE(PageTail(page), page);
        return PageHead(page);
}

/*
 * PageTransCompound() is true for transparent huge pages and for
 * hugetlbfs pages alike, so only call it where hugetlbfs pages are
 * known not to be involved.  It is a plain PageCompound() test.
 */
static inline int PageTransCompound(const struct page *page)
{
        int is_compound = PageCompound(page);

        return is_compound;
}

/*
 * PageTransTail() is true for tail pages of transparent huge pages and
 * of hugetlbfs pages alike, so only call it where hugetlbfs pages are
 * known not to be involved.  It is a plain PageTail() test.
 */
static inline int PageTransTail(const struct page *page)
{
        int is_tail = PageTail(page);

        return is_tail;
}
#else
/* !CONFIG_TRANSPARENT_HUGEPAGE: every PageTrans*() test is constant 0. */
TESTPAGEFLAG_FALSE(TransHuge, transhuge)
TESTPAGEFLAG_FALSE(TransCompound, transcompound)
TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap)
TESTPAGEFLAG_FALSE(TransTail, transtail)
#endif

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the
 * compound page.
 *
 * This flag is set by hwpoison handler.  Cleared by THP split or free page.
 */
PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND)
        TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND)
#else
/* Needs both MEMORY_FAILURE and TRANSPARENT_HUGEPAGE; otherwise stubbed. */
PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
        TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
#endif

/*
 * For pages that are never mapped to userspace,
 * page_type may be used.  Because it is initialised to -1, we invert the
 * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and
 * __ClearPageFoo *sets* the bit used for PageFoo.  We reserve a few high and
 * low bits so that an underflow or overflow of _mapcount won't be
 * mistaken for a page type value.
 */

#define PAGE_TYPE_BASE        0xf0000000
/* Reserve                0x0000007f to catch underflows of _mapcount */
/* Parenthesized so the negative constant is safe in any expression. */
#define PAGE_MAPCOUNT_RESERVE        (-128)
#define PG_buddy        0x00000080
#define PG_offline        0x00000100
#define PG_table        0x00000200
#define PG_guard        0x00000400
#define PG_hugetlb        0x00000800
#define PG_slab                0x00001000

/*
 * PageType() / folio_test_type() - test an (inverted) page-type bit.
 * @page / @folio: pointer expression; evaluated once.
 * @flag: one PG_* type bit (or 0 to test only the base pattern).
 *
 * True when all PAGE_TYPE_BASE bits are set in page_type and @flag is
 * clear.  The arguments are parenthesized so that callers may pass
 * arbitrary expressions such as &pages[i] or a conditional.
 */
#define PageType(page, flag)                                                \
        (((page)->page_type & (PAGE_TYPE_BASE | (flag))) == PAGE_TYPE_BASE)
#define folio_test_type(folio, flag)                                        \
        (((folio)->page.page_type & (PAGE_TYPE_BASE | (flag))) == PAGE_TYPE_BASE)

/*
 * Returns non-zero when @page_type, reinterpreted as a signed int, lies
 * below PAGE_MAPCOUNT_RESERVE.
 */
static inline int page_type_has_type(unsigned int page_type)
{
        int as_signed = (int)page_type;

        return as_signed < PAGE_MAPCOUNT_RESERVE;
}

/* Apply page_type_has_type() to the page_type field of @page. */
static inline int page_has_type(const struct page *page)
{
        unsigned int type = page->page_type;

        return page_type_has_type(type);
}

/*
 * FOLIO_TYPE_OPS - generate the folio page-type helpers for PG_<lname>:
 *   folio_test_<fname>()    - true when the folio has that type;
 *   __folio_set_<fname>()   - asserts no type bit is in use yet, then
 *                             clears the bit (the sense is inverted);
 *   __folio_clear_<fname>() - asserts the type is set, then sets the bit.
 * The modifiers are plain read-modify-write operations on page_type.
 */
#define FOLIO_TYPE_OPS(lname, fname)                                        \
static __always_inline bool folio_test_##fname(const struct folio *folio)\
{                                                                        \
        return folio_test_type(folio, PG_##lname);                        \
}                                                                        \
static __always_inline void __folio_set_##fname(struct folio *folio)        \
{                                                                        \
        VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio);                \
        folio->page.page_type &= ~PG_##lname;                                \
}                                                                        \
static __always_inline void __folio_clear_##fname(struct folio *folio)        \
{                                                                        \
        VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio);                \
        folio->page.page_type |= PG_##lname;                                \
}

/*
 * PAGE_TYPE_OPS - generate both the folio helpers (via FOLIO_TYPE_OPS) and
 * the struct page helpers for PG_<lname>:
 *   Page<uname>()        - non-zero when the page has that type;
 *   __SetPage<uname>()   - asserts no type bit is in use yet, then clears
 *                          the bit (the sense is inverted);
 *   __ClearPage<uname>() - asserts the type is set, then sets the bit.
 */
#define PAGE_TYPE_OPS(uname, lname, fname)                                \
FOLIO_TYPE_OPS(lname, fname)                                                \
static __always_inline int Page##uname(const struct page *page)                \
{                                                                        \
        return PageType(page, PG_##lname);                                \
}                                                                        \
static __always_inline void __SetPage##uname(struct page *page)                \
{                                                                        \
        VM_BUG_ON_PAGE(!PageType(page, 0), page);                        \
        page->page_type &= ~PG_##lname;                                        \
}                                                                        \
static __always_inline void __ClearPage##uname(struct page *page)        \
{                                                                        \
        VM_BUG_ON_PAGE(!Page##uname(page), page);                        \
        page->page_type |= PG_##lname;                                        \
}

/*
 * PageBuddy() indicates that the page is free and in the buddy system
 * (see mm/page_alloc.c).
 *
 * Generates PageBuddy(), __SetPageBuddy(), __ClearPageBuddy() and the
 * matching folio_test_buddy() / __folio_set_buddy() / __folio_clear_buddy().
 */
PAGE_TYPE_OPS(Buddy, buddy, buddy)

/*
 * PageOffline() indicates that the page is logically offline although the
 * containing section is online. (e.g. inflated in a balloon driver or
 * not onlined when onlining the section).
 * The content of these pages is effectively stale. Such pages should not
 * be touched (read/write/dump/save) except by their owner.
 *
 * If a driver wants to allow unmovable PageOffline() pages to be offlined
 * without putting them back into the buddy, it can do so via the memory
 * notifier by decrementing the reference count in MEM_GOING_OFFLINE and
 * incrementing it again in MEM_CANCEL_OFFLINE. When offlining, the
 * PageOffline() pages (now with a reference count of zero) are treated like
 * free pages, allowing the containing memory block to get offlined. A driver
 * that relies on this feature must be aware that re-onlining the memory
 * block requires marking the pages PageOffline() again and not giving them
 * to the buddy via online_page_callback_t.
 *
 * There are drivers that mark a page PageOffline() and expect there won't be
 * any further access to page content. PFN walkers that read content of random
 * pages should check PageOffline() and synchronize with such drivers using
 * page_offline_freeze()/page_offline_thaw().
 */
PAGE_TYPE_OPS(Offline, offline, offline)

/*
 * Synchronization helpers for PageOffline() users, implemented elsewhere.
 * NOTE(review): freeze/thaw are presumably the PFN-walker side and
 * begin/end the driver side -- confirm against their definitions.
 */
extern void page_offline_freeze(void);
extern void page_offline_thaw(void);
extern void page_offline_begin(void);
extern void page_offline_end(void);

/*
 * Marks pages in use as page tables.
 * Note the folio helpers use the "pgtable" suffix (folio_test_pgtable()).
 */
PAGE_TYPE_OPS(Table, table, pgtable)

/*
 * Marks guardpages used with debug_pagealloc.
 */
PAGE_TYPE_OPS(Guard, guard, guard)

/*
 * Slab gets only the folio helpers (folio_test_slab(), __folio_set_slab(),
 * __folio_clear_slab()); PageSlab() is written by hand below.
 */
FOLIO_TYPE_OPS(slab, slab)

/**
 * PageSlab - Determine if the page belongs to the slab allocator
 * @page: The page to test.
 *
 * Looks up the folio containing @page and tests the slab type on it.
 *
 * Context: Any context.
 * Return: True for slab pages, false for any other kind of page.
 */
static inline bool PageSlab(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_test_slab(folio);
}

/*
 * hugetlb folio type helpers; without CONFIG_HUGETLB_PAGE only an
 * always-false folio_test_hugetlb() stub is generated.
 */
#ifdef CONFIG_HUGETLB_PAGE
FOLIO_TYPE_OPS(hugetlb, hugetlb)
#else
FOLIO_TEST_FLAG_FALSE(hugetlb)
#endif

/**
 * PageHuge - Determine if the page belongs to hugetlbfs
 * @page: The page to test.
 *
 * Looks up the folio containing @page and tests the hugetlb type on it.
 *
 * Context: Any context.
 * Return: True for hugetlbfs pages, false for anon pages or pages
 * belonging to other filesystems.
 */
static inline bool PageHuge(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_test_hugetlb(folio);
}

/*
 * Report whether @page is currently marked HWPoisoned.  The answer is
 * best effort and inherently racy, since there is no way to synchronize
 * with failing hardware.
 *
 * Either @page itself carries the flag, or it belongs to a hugetlb folio
 * whose first page carries it.
 */
static inline bool is_page_hwpoison(const struct page *page)
{
        if (!PageHWPoison(page)) {
                const struct folio *folio = page_folio(page);

                return folio_test_hugetlb(folio) && PageHWPoison(&folio->page);
        }

        return true;
}

/* NOTE(review): implemented elsewhere; see the definition for locking rules. */
bool is_free_buddy_page(const struct page *page);

PAGEFLAG(Isolated, isolated, PF_ANY);

/*
 * Test PG_anon_exclusive for an anonymous page.  HugeTLB keeps this bit on
 * the head page while THP keeps it per page, so for a hugetlb page the
 * head page is consulted instead of @page itself.
 */
static __always_inline int PageAnonExclusive(const struct page *page)
{
        const struct page *target;

        VM_BUG_ON_PGFLAGS(!PageAnon(page), page);

        target = PageHuge(page) ? compound_head(page) : page;
        return test_bit(PG_anon_exclusive, &PF_ANY(target, 1)->flags);
}

/*
 * Atomically set PG_anon_exclusive on @page.  Debug assertions require an
 * anonymous, non-KSM page and, for hugetlb, the head page.
 */
static __always_inline void SetPageAnonExclusive(struct page *page)
{
        VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page);
        VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
        set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
}

/*
 * Atomically clear PG_anon_exclusive on @page.  Debug assertions require an
 * anonymous, non-KSM page and, for hugetlb, the head page.
 */
static __always_inline void ClearPageAnonExclusive(struct page *page)
{
        VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page);
        VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
        clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
}

/*
 * Non-atomic variant of ClearPageAnonExclusive() (uses __clear_bit()).
 * Unlike the atomic variant, the assertions here do not check PageKsm().
 */
static __always_inline void __ClearPageAnonExclusive(struct page *page)
{
        VM_BUG_ON_PGFLAGS(!PageAnon(page), page);
        VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
        __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
}

/* PG_mlocked as a mask; 0 without CONFIG_MMU so the masks below need no #ifdef. */
#ifdef CONFIG_MMU
#define __PG_MLOCKED                (1UL << PG_mlocked)
#else
#define __PG_MLOCKED                0
#endif

/*
 * Flags checked when a page is freed.  Pages being freed should not have
 * these flags set.  If they are, there is a problem.
 */
#define PAGE_FLAGS_CHECK_AT_FREE                                \
        (1UL << PG_lru                | 1UL << PG_locked        |        \
         1UL << PG_private        | 1UL << PG_private_2        |        \
         1UL << PG_writeback        | 1UL << PG_reserved        |        \
         1UL << PG_active         |                                \
         1UL << PG_unevictable        | __PG_MLOCKED | LRU_GEN_MASK)

/*
 * Flags checked when a page is prepped for return by the page allocator.
 * Pages being prepped should not have these flags set.  If they are set,
 * there has been a kernel bug or struct page corruption.
 *
 * __PG_HWPOISON is exceptional because it needs to be kept beyond page's
 * alloc-free cycle to prevent from reusing the page.
 */
#define PAGE_FLAGS_CHECK_AT_PREP        \
        ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)

/*
 * Flags stored in the second page of a compound page.  They may overlap
 * the CHECK_AT_FREE flags above, so need to be cleared.
 */
#define PAGE_FLAGS_SECOND                                                \
        (0xffUL /* order */                | 1UL << PG_has_hwpoisoned |        \
         1UL << PG_large_rmappable)

/* Either of the two filesystem-private flags; see page_has_private(). */
#define PAGE_FLAGS_PRIVATE                                \
        (1UL << PG_private | 1UL << PG_private_2)
/**
 * page_has_private - Determine if page has private stuff
 * @page: The page to be checked
 *
 * A page has private stuff when PG_private or PG_private_2 is set in its
 * flags, which indicates that release routines should be invoked upon it.
 *
 * Return: 1 if either flag is set, 0 otherwise.
 */
static inline int page_has_private(const struct page *page)
{
        return (page->flags & PAGE_FLAGS_PRIVATE) != 0;
}

/* Folio form of page_has_private(), applied to the folio's first page. */
static inline bool folio_has_private(const struct folio *folio)
{
        const struct page *first_page = &folio->page;

        return page_has_private(first_page);
}

/* The PF_* policy macros are dropped here, so they are not visible past this point. */
#undef PF_ANY
#undef PF_HEAD
#undef PF_NO_TAIL
#undef PF_NO_COMPOUND
#undef PF_SECOND
#endif /* !__GENERATING_BOUNDS_H */

#endif        /* PAGE_FLAGS_H */
















   19 











   22 







    8 





    7 





   13 












   13 



   14 



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
// SPDX-License-Identifier: GPL-2.0
#include <linux/memblock.h>
#include <linux/mmdebug.h>
#include <linux/export.h>
#include <linux/mm.h>

#include <asm/page.h>
#include <linux/vmalloc.h>

#include "physaddr.h"

#ifdef CONFIG_X86_64

#ifdef CONFIG_DEBUG_VIRTUAL
/*
 * __phys_addr - checked virtual-to-physical translation (64-bit,
 * CONFIG_DEBUG_VIRTUAL builds only).
 * @x: virtual address to translate.
 *
 * y = x - __START_KERNEL_map is computed with unsigned wrap-around, so
 * "x > y" holds exactly when the subtraction did not wrap, i.e. when x is
 * at or above __START_KERNEL_map.  That case is translated through
 * phys_base; the other through PAGE_OFFSET.  Each branch asserts its own
 * validity condition with VIRTUAL_BUG_ON().
 *
 * Return: the translated address.
 */
unsigned long __phys_addr(unsigned long x)
{
        unsigned long y = x - __START_KERNEL_map;

        /* use the carry flag to determine if x was < __START_KERNEL_map */
        if (unlikely(x > y)) {
                x = y + phys_base;

                VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
        } else {
                x = y + (__START_KERNEL_map - PAGE_OFFSET);

                /* carry flag will be set if starting x was >= PAGE_OFFSET */
                VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
        }

        return x;
}
EXPORT_SYMBOL(__phys_addr);

/*
 * __phys_addr_symbol - checked translation for an address expected to lie
 * in the kernel image mapping (CONFIG_DEBUG_VIRTUAL builds only).
 * @x: virtual address to translate.
 *
 * An x below __START_KERNEL_map wraps y to a huge value, so the single
 * upper-bound assertion covers both ends of the range.
 *
 * Return: y + phys_base.
 */
unsigned long __phys_addr_symbol(unsigned long x)
{
        unsigned long y = x - __START_KERNEL_map;

        /* only check upper bounds since lower bounds will trigger carry */
        VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);

        return y + phys_base;
}
EXPORT_SYMBOL(__phys_addr_symbol);
#endif

/*
 * __virt_addr_valid - non-asserting counterpart of __phys_addr() (64-bit).
 * @x: virtual address to check.
 *
 * Performs the same two-way translation as __phys_addr(), but returns
 * false where that function would assert, and finally requires the
 * resulting page frame number to pass pfn_valid().
 */
bool __virt_addr_valid(unsigned long x)
{
        const unsigned long y = x - __START_KERNEL_map;
        unsigned long paddr;

        /* x > y means the subtraction did not wrap: x >= __START_KERNEL_map */
        if (unlikely(x > y)) {
                if (y >= KERNEL_IMAGE_SIZE)
                        return false;

                paddr = y + phys_base;
        } else {
                paddr = y + (__START_KERNEL_map - PAGE_OFFSET);

                /* carry flag will be set if starting x was >= PAGE_OFFSET */
                if (paddr > y || !phys_addr_valid(paddr))
                        return false;
        }

        return pfn_valid(paddr >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__virt_addr_valid);

#else

#ifdef CONFIG_DEBUG_VIRTUAL
/*
 * __phys_addr - checked virtual-to-physical translation (non-64-bit,
 * CONFIG_DEBUG_VIRTUAL builds only).
 * @x: virtual address to translate.
 *
 * Asserts that x is at or above PAGE_OFFSET and, once the vmalloc range is
 * known, not a vmalloc address.  When max_low_pfn has been set it also
 * asserts that the frame number does not exceed it and that a page-table
 * walk (slow_virt_to_phys()) gives the same answer.
 *
 * Return: x - PAGE_OFFSET.
 */
unsigned long __phys_addr(unsigned long x)
{
        unsigned long phys_addr = x - PAGE_OFFSET;
        /* VMALLOC_* aren't constants  */
        VIRTUAL_BUG_ON(x < PAGE_OFFSET);
        VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
        /* max_low_pfn is set early, but not _that_ early */
        if (max_low_pfn) {
                VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
                BUG_ON(slow_virt_to_phys((void *)x) != phys_addr);
        }
        return phys_addr;
}
EXPORT_SYMBOL(__phys_addr);
#endif

/*
 * __virt_addr_valid - check whether @x is a translatable virtual address
 * (non-64-bit).
 * @x: virtual address to check.
 *
 * Rejects addresses below PAGE_OFFSET, vmalloc addresses (once the vmalloc
 * range is known) and addresses at or above FIXADDR_START; anything left
 * must map to a page frame number accepted by pfn_valid().
 */
bool __virt_addr_valid(unsigned long x)
{
        bool in_vmalloc;

        if (x < PAGE_OFFSET)
                return false;

        in_vmalloc = __vmalloc_start_set && is_vmalloc_addr((void *) x);
        if (in_vmalloc || x >= FIXADDR_START)
                return false;

        return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__virt_addr_valid);

#endif        /* CONFIG_X86_64 */













































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NFNETLINK_H
#define _NFNETLINK_H

#include <linux/netlink.h>
#include <linux/capability.h>
#include <net/netlink.h>
#include <uapi/linux/netfilter/nfnetlink.h>

/*
 * Per-message context passed to an nfnetlink callback (see
 * struct nfnl_callback::call).
 * NOTE(review): field meanings below are inferred from names and types --
 * confirm against the code that fills this structure.
 */
struct nfnl_info {
        struct net                *net;        /* presumably the network namespace */
        struct sock                *sk;        /* presumably the netlink socket */
        const struct nlmsghdr        *nlh;        /* netlink message header */
        const struct nfgenmsg        *nfmsg;        /* nfnetlink generic header */
        struct netlink_ext_ack        *extack;        /* extended-ack reporting */
};

/*
 * Kind of a callback, stored in struct nfnl_callback::type.
 * NOTE(review): by their names, MUTEX/RCU/BATCH select the locking or
 * batching context the callback runs in -- confirm against the dispatcher.
 */
enum nfnl_callback_type {
        NFNL_CB_UNSPEC        = 0,
        NFNL_CB_MUTEX,
        NFNL_CB_RCU,
        NFNL_CB_BATCH,
};

/*
 * One message handler of an nfnetlink subsystem.
 * NOTE(review): the relation between @policy, @attr_count and the @cda
 * array passed to @call is not visible here -- confirm in the dispatcher.
 */
struct nfnl_callback {
        /* handler; receives the skb, the message context and attributes */
        int (*call)(struct sk_buff *skb, const struct nfnl_info *info,
                    const struct nlattr * const cda[]);
        const struct nla_policy        *policy;        /* attribute policy */
        enum nfnl_callback_type        type;                /* see enum nfnl_callback_type */
        __u16                        attr_count;        /* attribute count */
};

/*
 * Action argument handed to nfnetlink_subsystem::abort().
 * NOTE(review): the precise meaning of each value is defined by the
 * caller of ->abort(), which is not shown here.
 */
enum nfnl_abort_action {
        NFNL_ABORT_NONE                = 0,
        NFNL_ABORT_AUTOLOAD,
        NFNL_ABORT_VALIDATE,
};

/*
 * Descriptor of an nfnetlink subsystem, registered with
 * nfnetlink_subsys_register().
 * NOTE(review): when commit/abort/valid_genid are invoked is decided by the
 * nfnetlink core, which is not shown here.
 */
struct nfnetlink_subsystem {
        const char *name;
        __u8 subsys_id;                        /* nfnetlink subsystem ID */
        __u8 cb_count;                        /* number of callbacks */
        const struct nfnl_callback *cb;        /* callback for individual types */
        struct module *owner;
        int (*commit)(struct net *net, struct sk_buff *skb);
        int (*abort)(struct net *net, struct sk_buff *skb,
                     enum nfnl_abort_action action);
        bool (*valid_genid)(struct net *net, u32 genid);
};

/* Subsystem registration; implemented by the nfnetlink core. */
int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n);
int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n);

/*
 * Message delivery helpers; implemented by the nfnetlink core.
 * NOTE(review): return-value conventions are not visible from these
 * prototypes -- see the definitions.
 */
int nfnetlink_has_listeners(struct net *net, unsigned int group);
int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
                   unsigned int group, int echo, gfp_t flags);
int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error);
int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid);
void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid,
                         __u32 group, gfp_t allocation);

/*
 * Build a 16-bit message type: @subsys in the high byte, @msg_type in the
 * low byte.
 */
static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type)
{
        const int high_byte = subsys << 8;

        return high_byte | msg_type;
}

/*
 * Fill in the struct nfgenmsg located at the start of @nlh's payload with
 * @family, @version and @res_id.
 */
static inline void nfnl_fill_hdr(struct nlmsghdr *nlh, u8 family, u8 version,
                                 __be16 res_id)
{
        struct nfgenmsg *hdr = nlmsg_data(nlh);

        hdr->res_id = res_id;
        hdr->version = version;
        hdr->nfgen_family = family;
}

/*
 * Add a netlink message with room for a struct nfgenmsg to @skb and fill
 * in that nfnetlink header.
 *
 * Return: the new message header, or NULL if nlmsg_put() failed.
 */
static inline struct nlmsghdr *nfnl_msg_put(struct sk_buff *skb, u32 portid,
                                            u32 seq, int type, int flags,
                                            u8 family, u8 version,
                                            __be16 res_id)
{
        struct nlmsghdr *nlh = nlmsg_put(skb, portid, seq, type,
                                         sizeof(struct nfgenmsg), flags);

        if (nlh)
                nfnl_fill_hdr(nlh, family, version, res_id);

        return nlh;
}

/* Per-subsystem lock, keyed by subsystem ID; implemented by the nfnetlink core. */
void nfnl_lock(__u8 subsys_id);
void nfnl_unlock(__u8 subsys_id);
#ifdef CONFIG_PROVE_LOCKING
bool lockdep_nfnl_is_held(__u8 subsys_id);
#else
/* Without lock proving, the check always reports the lock as held. */
static inline bool lockdep_nfnl_is_held(__u8 subsys_id)
{
        return true;
}
#endif /* CONFIG_PROVE_LOCKING */

#define MODULE_ALIAS_NFNL_SUBSYS(subsys) \
        MODULE_ALIAS("nfnetlink-subsys-" __stringify(subsys))

#endif        /* _NFNETLINK_H */








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





    1 





















    1 











    1 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/kernel/sys.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/export.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/utsname.h>
#include <linux/mman.h>
#include <linux/reboot.h>
#include <linux/prctl.h>
#include <linux/highuid.h>
#include <linux/fs.h>
#include <linux/kmod.h>
#include <linux/ksm.h>
#include <linux/perf_event.h>
#include <linux/resource.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/key.h>
#include <linux/times.h>
#include <linux/posix-timers.h>
#include <linux/security.h>
#include <linux/random.h>
#include <linux/suspend.h>
#include <linux/tty.h>
#include <linux/signal.h>
#include <linux/cn_proc.h>
#include <linux/getcpu.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/seccomp.h>
#include <linux/cpu.h>
#include <linux/personality.h>
#include <linux/ptrace.h>
#include <linux/fs_struct.h>
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/gfp.h>
#include <linux/syscore_ops.h>
#include <linux/version.h>
#include <linux/ctype.h>
#include <linux/syscall_user_dispatch.h>

#include <linux/compat.h>
#include <linux/syscalls.h>
#include <linux/kprobes.h>
#include <linux/user_namespace.h>
#include <linux/time_namespace.h>
#include <linux/binfmts.h>

#include <linux/sched.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/stat.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/sched/cputime.h>
#include <linux/rcupdate.h>
#include <linux/uidgid.h>
#include <linux/cred.h>

#include <linux/nospec.h>

#include <linux/kmsg_dump.h>
/* Move somewhere else to avoid recompiling? */
#include <generated/utsrelease.h>

#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/unistd.h>

#include "uid16.h"

/*
 * Fallback definitions for the control hooks below.  Any hook macro that has
 * not already been defined by the time this point is reached (presumably by
 * an architecture header -- not visible from here) expands to (-EINVAL),
 * ignoring its arguments.
 */
#ifndef SET_UNALIGN_CTL
# define SET_UNALIGN_CTL(a, b)        (-EINVAL)
#endif
#ifndef GET_UNALIGN_CTL
# define GET_UNALIGN_CTL(a, b)        (-EINVAL)
#endif
#ifndef SET_FPEMU_CTL
# define SET_FPEMU_CTL(a, b)        (-EINVAL)
#endif
#ifndef GET_FPEMU_CTL
# define GET_FPEMU_CTL(a, b)        (-EINVAL)
#endif
#ifndef SET_FPEXC_CTL
# define SET_FPEXC_CTL(a, b)        (-EINVAL)
#endif
#ifndef GET_FPEXC_CTL
# define GET_FPEXC_CTL(a, b)        (-EINVAL)
#endif
#ifndef GET_ENDIAN
# define GET_ENDIAN(a, b)        (-EINVAL)
#endif
#ifndef SET_ENDIAN
# define SET_ENDIAN(a, b)        (-EINVAL)
#endif
#ifndef GET_TSC_CTL
# define GET_TSC_CTL(a)                (-EINVAL)
#endif
#ifndef SET_TSC_CTL
# define SET_TSC_CTL(a)                (-EINVAL)
#endif
#ifndef GET_FP_MODE
# define GET_FP_MODE(a)                (-EINVAL)
#endif
#ifndef SET_FP_MODE
# define SET_FP_MODE(a,b)        (-EINVAL)
#endif
#ifndef SVE_SET_VL
# define SVE_SET_VL(a)                (-EINVAL)
#endif
#ifndef SVE_GET_VL
# define SVE_GET_VL()                (-EINVAL)
#endif
#ifndef SME_SET_VL
# define SME_SET_VL(a)                (-EINVAL)
#endif
#ifndef SME_GET_VL
# define SME_GET_VL()                (-EINVAL)
#endif
#ifndef PAC_RESET_KEYS
# define PAC_RESET_KEYS(a, b)        (-EINVAL)
#endif
#ifndef PAC_SET_ENABLED_KEYS
# define PAC_SET_ENABLED_KEYS(a, b, c)        (-EINVAL)
#endif
#ifndef PAC_GET_ENABLED_KEYS
# define PAC_GET_ENABLED_KEYS(a)        (-EINVAL)
#endif
#ifndef SET_TAGGED_ADDR_CTRL
# define SET_TAGGED_ADDR_CTRL(a)        (-EINVAL)
#endif
#ifndef GET_TAGGED_ADDR_CTRL
# define GET_TAGGED_ADDR_CTRL()                (-EINVAL)
#endif
#ifndef RISCV_V_SET_CONTROL
# define RISCV_V_SET_CONTROL(a)                (-EINVAL)
#endif
#ifndef RISCV_V_GET_CONTROL
# define RISCV_V_GET_CONTROL()                (-EINVAL)
#endif
#ifndef RISCV_SET_ICACHE_FLUSH_CTX
# define RISCV_SET_ICACHE_FLUSH_CTX(a, b)        (-EINVAL)
#endif
#ifndef PPC_GET_DEXCR_ASPECT
# define PPC_GET_DEXCR_ASPECT(a, b)        (-EINVAL)
#endif
#ifndef PPC_SET_DEXCR_ASPECT
# define PPC_SET_DEXCR_ASPECT(a, b, c)        (-EINVAL)
#endif

/*
 * System-wide overflow UID and GID, for architectures that now have
 * 32-bit UID/GID but did not in the past.  Both are initialised from the
 * DEFAULT_OVERFLOW* constants and exported.
 */

int overflowuid = DEFAULT_OVERFLOWUID;
int overflowgid = DEFAULT_OVERFLOWGID;

EXPORT_SYMBOL(overflowuid);
EXPORT_SYMBOL(overflowgid);

/*
 * The same as above, but for filesystems which can only store a 16-bit
 * UID and GID.  As such, these are needed on all architectures.
 */

int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;

EXPORT_SYMBOL(fs_overflowuid);
EXPORT_SYMBOL(fs_overflowgid);

/*
 * May the current task change the priority of @p?
 *
 * Yes if current's euid equals @p's uid or euid, or if current has
 * CAP_SYS_NICE in @p's user namespace.  The capability is only consulted
 * when neither uid comparison matches.
 *
 * Called with rcu_read_lock held, so the creds are safe to dereference.
 */
static bool set_one_prio_perm(struct task_struct *p)
{
        const struct cred *mine = current_cred();
        const struct cred *theirs = __task_cred(p);

        return uid_eq(theirs->uid, mine->euid) ||
               uid_eq(theirs->euid, mine->euid) ||
               ns_capable(theirs->user_ns, CAP_SYS_NICE);
}

/*
 * Apply nice value @niceval to task @p.
 * - the caller must hold the RCU read lock
 *
 * @error is the result accumulated by the caller so far.  On success it is
 * handed back unchanged, except that -ESRCH is replaced by 0.  On failure
 * the return value is:
 *   -EPERM   when set_one_prio_perm() refuses,
 *   -EACCES  when @niceval is below the task's current nice value and
 *            can_nice() refuses,
 *   or whatever non-zero value security_task_setnice() returned.
 */
static int set_one_prio(struct task_struct *p, int niceval, int error)
{
        int lsm_ret;

        if (!set_one_prio_perm(p))
                return -EPERM;

        if (niceval < task_nice(p) && !can_nice(p, niceval))
                return -EACCES;

        lsm_ret = security_task_setnice(p, niceval);
        if (lsm_ret)
                return lsm_ret;

        set_user_nice(p, niceval);

        /* First task actually handled: drop the "nothing found" marker. */
        return error == -ESRCH ? 0 : error;
}

/*
 * Set the nice value of a single task, of every task in a process group, or
 * of every task owned by a user, depending on @which.
 *
 * @which:   PRIO_PROCESS, PRIO_PGRP or PRIO_USER; anything outside that
 *           range yields -EINVAL.
 * @who:     id interpreted according to @which; 0 selects the caller
 *           (current task, current's process group, or the caller's uid).
 * @niceval: requested nice value, clamped to [MIN_NICE, MAX_NICE].
 *
 * Returns -ESRCH if no task was handled; otherwise the value accumulated
 * through set_one_prio(), i.e. 0 if every handled task succeeded, else the
 * error of the most recent failing task.
 */
SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
{
        struct task_struct *g, *p;
        struct user_struct *user;
        const struct cred *cred = current_cred();
        int error = -EINVAL;
        struct pid *pgrp;
        kuid_t uid;

        if (which > PRIO_USER || which < PRIO_PROCESS)
                goto out;

        /*
         * From here on "no matching task" is the default result, and the
         * requested value is clamped into the valid nice range.
         */
        error = -ESRCH;
        if (niceval < MIN_NICE)
                niceval = MIN_NICE;
        if (niceval > MAX_NICE)
                niceval = MAX_NICE;

        /* All lookups and set_one_prio() calls below run under RCU. */
        rcu_read_lock();
        switch (which) {
        case PRIO_PROCESS:
                if (who)
                        p = find_task_by_vpid(who);
                else
                        p = current;
                if (p)
                        error = set_one_prio(p, niceval, error);
                break;
        case PRIO_PGRP:
                if (who)
                        pgrp = find_vpid(who);
                else
                        pgrp = task_pgrp(current);
                /* The process-group walk additionally holds tasklist_lock for reading. */
                read_lock(&tasklist_lock);
                do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
                        error = set_one_prio(p, niceval, error);
                } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
                read_unlock(&tasklist_lock);
                break;
        case PRIO_USER:
                uid = make_kuid(cred->user_ns, who);
                user = cred->user;
                if (!who)
                        uid = cred->uid;
                else if (!uid_eq(uid, cred->uid)) {
                        user = find_user(uid);
                        if (!user)
                                goto out_unlock;        /* No processes for this user */
                }
                /*
                 * Only tasks whose uid matches and for which task_pid_vnr()
                 * is non-zero are touched.
                 */
                for_each_process_thread(g, p) {
                        if (uid_eq(task_uid(p), uid) && task_pid_vnr(p))
                                error = set_one_prio(p, niceval, error);
                }
                if (!uid_eq(uid, cred->uid))
                        free_uid(user);                /* For find_user() */
                break;
        }
out_unlock:
        rcu_read_unlock();
out:
        return error;
}

/*
 * Ugh. To avoid negative return values, "getpriority()" will
 * not return the normal nice-value, but a negated value that
 * has been offset by 20 (ie it returns 40..1 instead of -20..19)
 * to stay compatible.
 *
 * @which: PRIO_PROCESS, PRIO_PGRP or PRIO_USER; anything outside that range
 *         yields -EINVAL.
 * @who:   id interpreted according to @which; 0 selects the caller (current
 *         task, current's process group, or the caller's uid).
 *
 * Returns the largest nice_to_rlimit(task_nice(p)) seen over all matching
 * tasks, or -ESRCH if none matched.
 */
SYSCALL_DEFINE2(getpriority, int, which, int, who)
{
        struct task_struct *g, *p;
        struct user_struct *user;
        const struct cred *cred = current_cred();
        long niceval, retval = -ESRCH;
        struct pid *pgrp;
        kuid_t uid;

        if (which > PRIO_USER || which < PRIO_PROCESS)
                return -EINVAL;

        /* All task lookups below run under RCU. */
        rcu_read_lock();
        switch (which) {
        case PRIO_PROCESS:
                if (who)
                        p = find_task_by_vpid(who);
                else
                        p = current;
                if (p) {
                        niceval = nice_to_rlimit(task_nice(p));
                        if (niceval > retval)
                                retval = niceval;
                }
                break;
        case PRIO_PGRP:
                if (who)
                        pgrp = find_vpid(who);
                else
                        pgrp = task_pgrp(current);
                /* The process-group walk additionally holds tasklist_lock for reading. */
                read_lock(&tasklist_lock);
                do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
                        niceval = nice_to_rlimit(task_nice(p));
                        if (niceval > retval)
                                retval = niceval;
                } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
                read_unlock(&tasklist_lock);
                break;
        case PRIO_USER:
                uid = make_kuid(cred->user_ns, who);
                user = cred->user;
                if (!who)
                        uid = cred->uid;
                else if (!uid_eq(uid, cred->uid)) {
                        user = find_user(uid);
                        if (!user)
                                goto out_unlock;        /* No processes for this user */
                }
                /*
                 * Only tasks whose uid matches and for which task_pid_vnr()
                 * is non-zero are considered.
                 */
                for_each_process_thread(g, p) {
                        if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) {
                                niceval = nice_to_rlimit(task_nice(p));
                                if (niceval > retval)
                                        retval = niceval;
                        }
                }
                if (!uid_eq(uid, cred->uid))
                        free_uid(user);                /* for find_user() */
                break;
        }
out_unlock:
        rcu_read_unlock();

        return retval;
}

/*
 * Unprivileged users may change the real gid to the effective gid
 * or vice versa.  (BSD-style)
 *
 * If you set the real gid at all, or set the effective gid to a value not
 * equal to the real gid, then the saved gid is set to the new effective gid.
 *
 * This makes it possible for a setgid program to completely drop its
 * privileges, which is often a useful assertion to make when you are doing
 * a security audit over a program.
 *
 * The general idea is that a program which uses just setregid() will be
 * 100% compatible with BSD.  A program which uses just setgid() will be
 * 100% compatible with POSIX with saved IDs.
 *
 * SMP: There are not races, the GIDs are checked only by filesystem
 *      operations (as far as semantic preservation is concerned).
 */
#ifdef CONFIG_MULTIUSER
/*
 * Change the real and/or effective gid of the current task.
 *
 * @rgid: new real gid, or (gid_t) -1 to leave it alone.
 * @egid: new effective gid, or (gid_t) -1 to leave it alone.
 *
 * Returns the result of commit_creds() on success, -EINVAL if a requested
 * id does not map to a valid kgid in the caller's user namespace, -ENOMEM
 * if prepare_creds() fails, -EPERM if a requested change is not permitted,
 * or the negative value returned by security_task_fix_setgid().
 */
long __sys_setregid(gid_t rgid, gid_t egid)
{
        const bool set_rgid = rgid != (gid_t) -1;
        const bool set_egid = egid != (gid_t) -1;
        struct user_namespace *ns = current_user_ns();
        kgid_t krgid = make_kgid(ns, rgid);
        kgid_t kegid = make_kgid(ns, egid);
        const struct cred *old;
        struct cred *new;
        int retval;

        if (set_rgid && !gid_valid(krgid))
                return -EINVAL;
        if (set_egid && !gid_valid(kegid))
                return -EINVAL;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;
        old = current_cred();

        /* Any refusal below is a permission failure. */
        retval = -EPERM;

        if (set_rgid) {
                /* New real gid must be the old real or effective gid, or we need CAP_SETGID. */
                if (!gid_eq(old->gid, krgid) &&
                    !gid_eq(old->egid, krgid) &&
                    !ns_capable_setid(old->user_ns, CAP_SETGID))
                        goto error;
                new->gid = krgid;
        }

        if (set_egid) {
                /* New effective gid must be the old real, effective or saved gid, or we need CAP_SETGID. */
                if (!gid_eq(old->gid, kegid) &&
                    !gid_eq(old->egid, kegid) &&
                    !gid_eq(old->sgid, kegid) &&
                    !ns_capable_setid(old->user_ns, CAP_SETGID))
                        goto error;
                new->egid = kegid;
        }

        /*
         * The saved gid follows the new effective gid whenever the real gid
         * was set at all, or the effective gid was set to something other
         * than the old real gid.
         */
        if (set_rgid || (set_egid && !gid_eq(kegid, old->gid)))
                new->sgid = new->egid;
        new->fsgid = new->egid;

        retval = security_task_fix_setgid(new, old, LSM_SETID_RE);
        if (retval < 0)
                goto error;

        return commit_creds(new);

error:
        abort_creds(new);
        return retval;
}

/* Syscall entry point: passes @rgid and @egid straight to __sys_setregid(). */
SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
{
        return __sys_setregid(rgid, egid);
}

/*
 * setgid() is implemented like SysV w/ SAVED_IDS
 *
 * With CAP_SETGID in the caller's user namespace the real, effective, saved
 * and filesystem gids are all set to @gid.  Without it, @gid must equal the
 * old real or saved gid, and only the effective and filesystem gids change.
 *
 * Returns the result of commit_creds() on success, -EINVAL if @gid does not
 * map to a valid kgid, -ENOMEM if prepare_creds() fails, -EPERM if the
 * change is not permitted, or the negative value returned by
 * security_task_fix_setgid().
 *
 * SMP: Same implicit races as above.
 */
long __sys_setgid(gid_t gid)
{
        struct user_namespace *ns = current_user_ns();
        kgid_t kgid = make_kgid(ns, gid);
        const struct cred *old;
        struct cred *new;
        int retval;

        if (!gid_valid(kgid))
                return -EINVAL;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;
        old = current_cred();

        if (ns_capable_setid(old->user_ns, CAP_SETGID)) {
                /* Privileged: every gid follows. */
                new->fsgid = kgid;
                new->sgid = kgid;
                new->egid = kgid;
                new->gid = kgid;
        } else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) {
                /* Unprivileged: may only switch to the real or saved gid. */
                new->fsgid = kgid;
                new->egid = kgid;
        } else {
                retval = -EPERM;
                goto error;
        }

        retval = security_task_fix_setgid(new, old, LSM_SETID_ID);
        if (retval < 0)
                goto error;

        return commit_creds(new);

error:
        abort_creds(new);
        return retval;
}

/* Syscall entry point: passes @gid straight to __sys_setgid(). */
SYSCALL_DEFINE1(setgid, gid_t, gid)
{
        return __sys_setgid(gid);
}

/*
 * Make the user struct of credential set @new match @new->uid.
 *
 * A user struct for the new uid is obtained with alloc_uid(); the one
 * previously referenced by @new is released with free_uid() and replaced.
 * Returns 0 on success, or -EAGAIN (leaving @new untouched) when
 * alloc_uid() fails.
 */
static int set_user(struct cred *new)
{
        struct user_struct *replacement = alloc_uid(new->uid);

        if (!replacement)
                return -EAGAIN;

        free_uid(new->user);
        new->user = replacement;

        return 0;
}

static void flag_nproc_exceeded(struct cred *new)
{
        if (new->ucounts == current_ucounts())
                return;

        /*
         * We don't fail in case of NPROC limit excess here because too many
         * poorly written programs don't check set*uid() return code, assuming
         * it never fails if called by root.  We may still enforce NPROC limit
         * for programs doing set*uid()+execve() by harmlessly deferring the
         * failure to the execve() stage.
         */
        if (is_rlimit_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
                        new->user != INIT_USER)
                current->flags |= PF_NPROC_EXCEEDED;
        else
                current->flags &= ~PF_NPROC_EXCEEDED;
}

/*
 * Unprivileged users may change the real uid to the effective uid
 * or vice versa.  (BSD-style)
 *
 * If you set the real uid at all, or set the effective uid to a value not
 * equal to the real uid, then the saved uid is set to the new effective uid.
 *
 * This makes it possible for a setuid program to completely drop its
 * privileges, which is often a useful assertion to make when you are doing
 * a security audit over a program.
 *
 * The general idea is that a program which uses just setreuid() will be
 * 100% compatible with BSD.  A program which uses just setuid() will be
 * 100% compatible with POSIX with saved IDs.
 *
 * An argument of (uid_t)-1 leaves the corresponding id unchanged.
 *
 * Returns the result of commit_creds() on success, -EINVAL when a requested
 * id does not yield a valid kuid in the caller's user namespace, -ENOMEM
 * when prepare_creds() fails, -EPERM when the change is not permitted, or
 * the negative error reported by set_user(), the LSM hook or
 * set_cred_ucounts().
 */
long __sys_setreuid(uid_t ruid, uid_t euid)
{
        struct user_namespace *ns = current_user_ns();
        const struct cred *old;
        struct cred *new;
        int retval;
        kuid_t kruid, keuid;

        kruid = make_kuid(ns, ruid);
        keuid = make_kuid(ns, euid);

        /* Only ids that were actually requested need to be valid. */
        if ((ruid != (uid_t) -1) && !uid_valid(kruid))
                return -EINVAL;
        if ((euid != (uid_t) -1) && !uid_valid(keuid))
                return -EINVAL;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;
        old = current_cred();

        retval = -EPERM;
        if (ruid != (uid_t) -1) {
                new->uid = kruid;
                /*
                 * Without CAP_SETUID the real uid may only become the old
                 * real or old effective uid.
                 */
                if (!uid_eq(old->uid, kruid) &&
                    !uid_eq(old->euid, kruid) &&
                    !ns_capable_setid(old->user_ns, CAP_SETUID))
                        goto error;
        }

        if (euid != (uid_t) -1) {
                new->euid = keuid;
                /*
                 * Without CAP_SETUID the effective uid may only become the
                 * old real, old effective or old saved uid.
                 */
                if (!uid_eq(old->uid, keuid) &&
                    !uid_eq(old->euid, keuid) &&
                    !uid_eq(old->suid, keuid) &&
                    !ns_capable_setid(old->user_ns, CAP_SETUID))
                        goto error;
        }

        /* A changed real uid needs the matching user struct. */
        if (!uid_eq(new->uid, old->uid)) {
                retval = set_user(new);
                if (retval < 0)
                        goto error;
        }
        /* Saved uid rule described in the comment above the function. */
        if (ruid != (uid_t) -1 ||
            (euid != (uid_t) -1 && !uid_eq(keuid, old->uid)))
                new->suid = new->euid;
        /* The fsuid always follows the effective uid here. */
        new->fsuid = new->euid;

        retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
        if (retval < 0)
                goto error;

        retval = set_cred_ucounts(new);
        if (retval < 0)
                goto error;

        flag_nproc_exceeded(new);
        return commit_creds(new);

error:
        /* Discard the prepared credentials; nothing was committed. */
        abort_creds(new);
        return retval;
}

/* setreuid system call entry point: hands both uids to __sys_setreuid(). */
SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
{
        return __sys_setreuid(ruid, euid);
}

/*
 * setuid() is implemented like SysV with SAVED_IDS
 *
 * Note that SAVED_ID's is deficient in that a setuid root program
 * like sendmail, for example, cannot set its uid to be a normal
 * user and then switch back, because if you're root, setuid() sets
 * the saved uid too.  If you don't like this, blame the bright people
 * in the POSIX committee and/or USG.  Note that the BSD-style setreuid()
 * will allow a root program to temporarily drop privileges and be able to
 * regain them by swapping the real and effective uid.
 *
 * Returns the result of commit_creds() on success, -EINVAL when @uid does
 * not yield a valid kuid in the caller's user namespace, -ENOMEM when
 * prepare_creds() fails, -EPERM when the change is not permitted, or the
 * negative error reported by set_user(), the LSM hook or
 * set_cred_ucounts().
 */
long __sys_setuid(uid_t uid)
{
        struct user_namespace *ns = current_user_ns();
        const struct cred *old;
        struct cred *new;
        int retval;
        kuid_t kuid;

        kuid = make_kuid(ns, uid);
        if (!uid_valid(kuid))
                return -EINVAL;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;
        old = current_cred();

        retval = -EPERM;
        if (ns_capable_setid(old->user_ns, CAP_SETUID)) {
                /* Privileged: real and saved uid change as well. */
                new->suid = new->uid = kuid;
                if (!uid_eq(kuid, old->uid)) {
                        retval = set_user(new);
                        if (retval < 0)
                                goto error;
                }
        } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) {
                /*
                 * Unprivileged: only the real uid or the saved uid is an
                 * acceptable target.  new->suid has not been written yet on
                 * this path, so it presumably still equals the current saved
                 * uid copied by prepare_creds() - confirm against cred.c.
                 */
                goto error;
        }

        /* Effective and filesystem uid change in every permitted case. */
        new->fsuid = new->euid = kuid;

        retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
        if (retval < 0)
                goto error;

        retval = set_cred_ucounts(new);
        if (retval < 0)
                goto error;

        flag_nproc_exceeded(new);
        return commit_creds(new);

error:
        /* Discard the prepared credentials; nothing was committed. */
        abort_creds(new);
        return retval;
}

/* setuid system call entry point: hands the uid to __sys_setuid(). */
SYSCALL_DEFINE1(setuid, uid_t, uid)
{
        return __sys_setuid(uid);
}


/*
 * This function implements a generic ability to update ruid, euid,
 * and suid.  This allows you to implement the 4.4 compatible seteuid().
 *
 * An argument of (uid_t)-1 leaves the corresponding id unchanged.
 *
 * Returns 0 when the request changes nothing, the result of commit_creds()
 * when credentials were updated, -EINVAL when a requested id does not yield
 * a valid kuid in the caller's user namespace, -EPERM when an id outside the
 * caller's current real/effective/saved set is requested without CAP_SETUID,
 * -ENOMEM when prepare_creds() fails, or the negative error reported by
 * set_user(), the LSM hook or set_cred_ucounts().
 */
long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
{
        struct user_namespace *ns = current_user_ns();
        const struct cred *old;
        struct cred *new;
        int retval;
        kuid_t kruid, keuid, ksuid;
        bool ruid_new, euid_new, suid_new;

        kruid = make_kuid(ns, ruid);
        keuid = make_kuid(ns, euid);
        ksuid = make_kuid(ns, suid);

        /* Only ids that were actually requested need to be valid. */
        if ((ruid != (uid_t) -1) && !uid_valid(kruid))
                return -EINVAL;

        if ((euid != (uid_t) -1) && !uid_valid(keuid))
                return -EINVAL;

        if ((suid != (uid_t) -1) && !uid_valid(ksuid))
                return -EINVAL;

        old = current_cred();

        /*
         * check for no-op
         *
         * The euid test also covers the fsuid, because the fsuid is set to
         * the effective uid further down.
         */
        if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) &&
            (euid == (uid_t) -1 || (uid_eq(keuid, old->euid) &&
                                    uid_eq(keuid, old->fsuid))) &&
            (suid == (uid_t) -1 || uid_eq(ksuid, old->suid)))
                return 0;

        /*
         * A requested id counts as "new" when it matches none of the
         * caller's current real, effective or saved uids; only such ids
         * require CAP_SETUID.
         */
        ruid_new = ruid != (uid_t) -1        && !uid_eq(kruid, old->uid) &&
                   !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid);
        euid_new = euid != (uid_t) -1        && !uid_eq(keuid, old->uid) &&
                   !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid);
        suid_new = suid != (uid_t) -1        && !uid_eq(ksuid, old->uid) &&
                   !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid);
        if ((ruid_new || euid_new || suid_new) &&
            !ns_capable_setid(old->user_ns, CAP_SETUID))
                return -EPERM;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        if (ruid != (uid_t) -1) {
                new->uid = kruid;
                /* A changed real uid needs the matching user struct. */
                if (!uid_eq(kruid, old->uid)) {
                        retval = set_user(new);
                        if (retval < 0)
                                goto error;
                }
        }
        if (euid != (uid_t) -1)
                new->euid = keuid;
        if (suid != (uid_t) -1)
                new->suid = ksuid;
        /* The fsuid always follows the effective uid here. */
        new->fsuid = new->euid;

        retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
        if (retval < 0)
                goto error;

        retval = set_cred_ucounts(new);
        if (retval < 0)
                goto error;

        flag_nproc_exceeded(new);
        return commit_creds(new);

error:
        /* Discard the prepared credentials; nothing was committed. */
        abort_creds(new);
        return retval;
}

/* setresuid system call entry point: hands all three uids to __sys_setresuid(). */
SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
{
        return __sys_setresuid(ruid, euid, suid);
}

/*
 * Store the caller's real, effective and saved uid, translated with
 * from_kuid_munged() for the credential's user namespace, through the three
 * user pointers.  The stores happen in that order and stop at the first
 * failing put_user(), whose result is returned; 0 means all three succeeded.
 */
SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
{
        const struct cred *cur = current_cred();
        uid_t ruid = from_kuid_munged(cur->user_ns, cur->uid);
        uid_t euid = from_kuid_munged(cur->user_ns, cur->euid);
        uid_t suid = from_kuid_munged(cur->user_ns, cur->suid);
        int err;

        err = put_user(ruid, ruidp);
        if (err)
                return err;

        err = put_user(euid, euidp);
        if (err)
                return err;

        return put_user(suid, suidp);
}

/*
 * Generic ability to update rgid, egid and sgid (the gid counterpart of
 * __sys_setresuid()).
 *
 * An argument of (gid_t)-1 leaves the corresponding id unchanged.
 *
 * Returns 0 when the request changes nothing, the result of commit_creds()
 * when credentials were updated, -EINVAL when a requested id does not yield
 * a valid kgid in the caller's user namespace, -EPERM when an id outside the
 * caller's current real/effective/saved set is requested without CAP_SETGID,
 * -ENOMEM when prepare_creds() fails, or the negative error reported by the
 * LSM hook.
 */
long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
{
        struct user_namespace *ns = current_user_ns();
        const struct cred *old;
        struct cred *new;
        int retval;
        kgid_t krgid, kegid, ksgid;
        bool rgid_new, egid_new, sgid_new;

        krgid = make_kgid(ns, rgid);
        kegid = make_kgid(ns, egid);
        ksgid = make_kgid(ns, sgid);

        /* Only ids that were actually requested need to be valid. */
        if ((rgid != (gid_t) -1) && !gid_valid(krgid))
                return -EINVAL;
        if ((egid != (gid_t) -1) && !gid_valid(kegid))
                return -EINVAL;
        if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
                return -EINVAL;

        old = current_cred();

        /*
         * check for no-op
         *
         * The egid test also covers the fsgid, because the fsgid is set to
         * the effective gid further down.
         */
        if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) &&
            (egid == (gid_t) -1 || (gid_eq(kegid, old->egid) &&
                                    gid_eq(kegid, old->fsgid))) &&
            (sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid)))
                return 0;

        /*
         * A requested id counts as "new" when it matches none of the
         * caller's current real, effective or saved gids; only such ids
         * require CAP_SETGID.
         */
        rgid_new = rgid != (gid_t) -1        && !gid_eq(krgid, old->gid) &&
                   !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid);
        egid_new = egid != (gid_t) -1        && !gid_eq(kegid, old->gid) &&
                   !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid);
        sgid_new = sgid != (gid_t) -1        && !gid_eq(ksgid, old->gid) &&
                   !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid);
        if ((rgid_new || egid_new || sgid_new) &&
            !ns_capable_setid(old->user_ns, CAP_SETGID))
                return -EPERM;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        if (rgid != (gid_t) -1)
                new->gid = krgid;
        if (egid != (gid_t) -1)
                new->egid = kegid;
        if (sgid != (gid_t) -1)
                new->sgid = ksgid;
        /* The fsgid always follows the effective gid here. */
        new->fsgid = new->egid;

        retval = security_task_fix_setgid(new, old, LSM_SETID_RES);
        if (retval < 0)
                goto error;

        return commit_creds(new);

error:
        /* Discard the prepared credentials; nothing was committed. */
        abort_creds(new);
        return retval;
}

/* setresgid system call entry point: hands all three gids to __sys_setresgid(). */
SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
{
        return __sys_setresgid(rgid, egid, sgid);
}

/*
 * Store the caller's real, effective and saved gid, translated with
 * from_kgid_munged() for the credential's user namespace, through the three
 * user pointers.  The stores happen in that order and stop at the first
 * failing put_user(), whose result is returned; 0 means all three succeeded.
 */
SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
{
        const struct cred *cur = current_cred();
        gid_t rgid = from_kgid_munged(cur->user_ns, cur->gid);
        gid_t egid = from_kgid_munged(cur->user_ns, cur->egid);
        gid_t sgid = from_kgid_munged(cur->user_ns, cur->sgid);
        int err;

        err = put_user(rgid, rgidp);
        if (err)
                return err;

        err = put_user(egid, egidp);
        if (err)
                return err;

        return put_user(sgid, sgidp);
}


/*
 * "setfsuid()" changes the fsuid - the uid used for filesystem checks.  It
 * is used for "access()" and for the NFS daemon (letting nfsd stay at
 * whatever uid it wants to).  The fsuid normally shadows the "euid", except
 * when it has been set explicitly by setfsuid() or for access.
 *
 * The change is applied when @uid equals the current real, effective, saved
 * or filesystem uid, or when the caller has CAP_SETUID, and the LSM hook
 * agrees.  The return value is always the previous fsuid, whether or not
 * anything was changed.
 */
long __sys_setfsuid(uid_t uid)
{
        const struct cred *cur = current_cred();
        uid_t prev_fsuid = from_kuid_munged(cur->user_ns, cur->fsuid);
        kuid_t target = make_kuid(cur->user_ns, uid);
        struct cred *updated;
        bool permitted;

        if (!uid_valid(target))
                return prev_fsuid;

        updated = prepare_creds();
        if (!updated)
                return prev_fsuid;

        /* The capability check only runs when no current uid matches. */
        permitted = uid_eq(target, cur->uid)  || uid_eq(target, cur->euid)  ||
                    uid_eq(target, cur->suid) || uid_eq(target, cur->fsuid) ||
                    ns_capable_setid(cur->user_ns, CAP_SETUID);

        if (permitted && !uid_eq(target, cur->fsuid)) {
                updated->fsuid = target;
                if (security_task_fix_setuid(updated, cur, LSM_SETID_FS) == 0) {
                        commit_creds(updated);
                        return prev_fsuid;
                }
        }

        /* Not permitted, nothing to change, or vetoed by the LSM. */
        abort_creds(updated);
        return prev_fsuid;
}

/* setfsuid system call entry point: hands the uid to __sys_setfsuid(). */
SYSCALL_DEFINE1(setfsuid, uid_t, uid)
{
        return __sys_setfsuid(uid);
}

/*
 * Same thing again, this time for the fsgid.
 *
 * The change is applied when @gid equals the current real, effective, saved
 * or filesystem gid, or when the caller has CAP_SETGID, and the LSM hook
 * agrees.  The return value is always the previous fsgid, whether or not
 * anything was changed.
 */
long __sys_setfsgid(gid_t gid)
{
        const struct cred *cur = current_cred();
        gid_t prev_fsgid = from_kgid_munged(cur->user_ns, cur->fsgid);
        kgid_t target = make_kgid(cur->user_ns, gid);
        struct cred *updated;
        bool permitted;

        if (!gid_valid(target))
                return prev_fsgid;

        updated = prepare_creds();
        if (!updated)
                return prev_fsgid;

        /* The capability check only runs when no current gid matches. */
        permitted = gid_eq(target, cur->gid)  || gid_eq(target, cur->egid)  ||
                    gid_eq(target, cur->sgid) || gid_eq(target, cur->fsgid) ||
                    ns_capable_setid(cur->user_ns, CAP_SETGID);

        if (permitted && !gid_eq(target, cur->fsgid)) {
                updated->fsgid = target;
                if (security_task_fix_setgid(updated, cur, LSM_SETID_FS) == 0) {
                        commit_creds(updated);
                        return prev_fsgid;
                }
        }

        /* Not permitted, nothing to change, or vetoed by the LSM. */
        abort_creds(updated);
        return prev_fsgid;
}

/* setfsgid system call entry point: hands the gid to __sys_setfsgid(). */
SYSCALL_DEFINE1(setfsgid, gid_t, gid)
{
        return __sys_setfsgid(gid);
}
#endif /* CONFIG_MULTIUSER */

/**
 * sys_getpid - return the thread group id of the current process
 *
 * Note, despite the name, this returns the tgid not the pid.  The tgid and
 * the pid are identical unless CLONE_THREAD was specified on clone() in
 * which case the tgid is the same in all threads of the same group.
 *
 * This is SMP safe as current->tgid does not change.
 *
 * Return: the value of task_tgid_vnr() for the current task.
 */
SYSCALL_DEFINE0(getpid)
{
        return task_tgid_vnr(current);
}

/*
 * Thread ID - the internal kernel "pid".
 *
 * Returns task_pid_vnr() of the current task, in contrast to getpid(),
 * which returns task_tgid_vnr().
 */
SYSCALL_DEFINE0(gettid)
{
        return task_pid_vnr(current);
}

/*
 * Return the thread group id of the current task's real parent.
 *
 * Accessing ->real_parent is not SMP-safe, it could
 * change from under us. However, we can use a stale
 * value of ->real_parent under rcu_read_lock(), see
 * release_task()->call_rcu(delayed_put_task_struct).
 */
SYSCALL_DEFINE0(getppid)
{
        int pid;

        /* The parent pointer is only dereferenced inside the RCU section. */
        rcu_read_lock();
        pid = task_tgid_vnr(rcu_dereference(current->real_parent));
        rcu_read_unlock();

        return pid;
}

/* Return the caller's real uid, translated for the current user namespace. */
SYSCALL_DEFINE0(getuid)
{
        /* Only we change this so SMP safe */
        return from_kuid_munged(current_user_ns(), current_uid());
}

/* Return the caller's effective uid, translated for the current user namespace. */
SYSCALL_DEFINE0(geteuid)
{
        /* Only we change this so SMP safe */
        return from_kuid_munged(current_user_ns(), current_euid());
}

/* Return the caller's real gid, translated for the current user namespace. */
SYSCALL_DEFINE0(getgid)
{
        /* Only we change this so SMP safe */
        return from_kgid_munged(current_user_ns(), current_gid());
}

/* Return the caller's effective gid, translated for the current user namespace. */
SYSCALL_DEFINE0(getegid)
{
        /* Only we change this so SMP safe */
        return from_kgid_munged(current_user_ns(), current_egid());
}

/*
 * Fill @tms with the CPU times of the current task: the adjusted thread
 * group user/system times, plus the cutime/cstime values stored in its
 * signal struct.  All four values are converted from nanoseconds with
 * nsec_to_clock_t().
 */
static void do_sys_times(struct tms *tms)
{
        u64 self_user, self_sys;
        u64 kids_user, kids_sys;

        thread_group_cputime_adjusted(current, &self_user, &self_sys);
        kids_user = current->signal->cutime;
        kids_sys = current->signal->cstime;

        tms->tms_utime = nsec_to_clock_t(self_user);
        tms->tms_stime = nsec_to_clock_t(self_sys);
        tms->tms_cutime = nsec_to_clock_t(kids_user);
        tms->tms_cstime = nsec_to_clock_t(kids_sys);
}

/*
 * times system call.  When @tbuf is non-NULL the caller's CPU times are
 * copied to it (-EFAULT if that copy fails).  The return value is the
 * current 64-bit jiffies count converted with jiffies_64_to_clock_t();
 * force_successful_syscall_return() is invoked before returning it.
 */
SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
{
        struct tms snapshot;

        if (tbuf) {
                do_sys_times(&snapshot);
                if (copy_to_user(tbuf, &snapshot, sizeof(snapshot)))
                        return -EFAULT;
        }

        force_successful_syscall_return();
        return (long) jiffies_64_to_clock_t(get_jiffies_64());
}

#ifdef CONFIG_COMPAT
/*
 * Convert a native clock_t into a compat_clock_t by going through jiffies:
 * clock_t -> jiffies -> compat clock_t.
 */
static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
{
        return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
}

/*
 * Compat variant of the times system call.  When @tbuf is non-NULL the
 * caller's CPU times are converted field by field to the compat layout and
 * copied out (-EFAULT if that copy fails).  The return value is the current
 * jiffies count converted with compat_jiffies_to_clock_t().
 */
COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
{
        struct compat_tms out;
        struct tms native;

        if (!tbuf)
                goto done;

        do_sys_times(&native);
        /* Translate the native struct tms into its compat counterpart. */
        out.tms_utime = clock_t_to_compat_clock_t(native.tms_utime);
        out.tms_stime = clock_t_to_compat_clock_t(native.tms_stime);
        out.tms_cutime = clock_t_to_compat_clock_t(native.tms_cutime);
        out.tms_cstime = clock_t_to_compat_clock_t(native.tms_cstime);
        if (copy_to_user(tbuf, &out, sizeof(out)))
                return -EFAULT;

done:
        force_successful_syscall_return();
        return compat_jiffies_to_clock_t(jiffies);
}
#endif

/*
 * setpgid system call: move the thread group leader identified by @pid into
 * the process group identified by @pgid.
 *
 * A @pid of 0 selects the caller's thread group leader; a @pgid of 0 means
 * "use @pid".
 *
 * This needs some heavy checking ...
 * I just haven't the stomach for it. I also don't fully
 * understand sessions/pgrp etc. Let somebody who does explain it.
 *
 * OK, I think I have the protection semantics right.... this is really
 * only important on a multi-user system anyway, to make sure one user
 * can't send a signal to a process owned by another.  -TYT, 12/12/91
 *
 * !PF_FORKNOEXEC check to conform completely to POSIX.
 *
 * Returns 0 on success, -EINVAL for a negative @pgid or when the target is
 * not a thread group leader, -ESRCH when no task matches @pid or the target
 * is neither the caller's thread group leader nor a child of the caller's
 * thread group, -EPERM or -EACCES when one of the checks below rejects the
 * request, or the error from security_task_setpgid().
 */
SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
{
        struct task_struct *p;
        struct task_struct *group_leader = current->group_leader;
        struct pid *pgrp;
        int err;

        if (!pid)
                pid = task_pid_vnr(group_leader);
        if (!pgid)
                pgid = pid;
        if (pgid < 0)
                return -EINVAL;
        rcu_read_lock();

        /* From this point forward we keep holding onto the tasklist lock
         * so that our parent does not change from under us. -DaveM
         */
        write_lock_irq(&tasklist_lock);

        err = -ESRCH;
        p = find_task_by_vpid(pid);
        if (!p)
                goto out;

        err = -EINVAL;
        if (!thread_group_leader(p))
                goto out;

        if (same_thread_group(p->real_parent, group_leader)) {
                /* Target is a child of the caller's thread group. */
                err = -EPERM;
                if (task_session(p) != task_session(group_leader))
                        goto out;
                /* A child without PF_FORKNOEXEC set is refused with -EACCES. */
                err = -EACCES;
                if (!(p->flags & PF_FORKNOEXEC))
                        goto out;
        } else {
                /* Otherwise the only acceptable target is the caller itself. */
                err = -ESRCH;
                if (p != group_leader)
                        goto out;
        }

        /* ->signal->leader is the session leader flag set by ksys_setsid(). */
        err = -EPERM;
        if (p->signal->leader)
                goto out;

        pgrp = task_pid(p);
        if (pgid != pid) {
                struct task_struct *g;

                /*
                 * Joining another group: it must contain a task and belong
                 * to the caller's session, otherwise fail with -EPERM.
                 */
                pgrp = find_vpid(pgid);
                g = pid_task(pgrp, PIDTYPE_PGID);
                if (!g || task_session(g) != task_session(group_leader))
                        goto out;
        }

        err = security_task_setpgid(p, pgid);
        if (err)
                goto out;

        /* Only touch the pid links when the group actually changes. */
        if (task_pgrp(p) != pgrp)
                change_pid(p, PIDTYPE_PGID, pgrp);

        err = 0;
out:
        /* All paths lead to here, thus we are safe. -DaveM */
        write_unlock_irq(&tasklist_lock);
        rcu_read_unlock();
        return err;
}

/*
 * Look up the process group id of the task identified by @pid, or of the
 * current task when @pid is 0.
 *
 * Returns pid_vnr() of the process group on success, -ESRCH when no task
 * matches @pid or the task has no process group, or the error from
 * security_task_getpgid().  The whole lookup runs under rcu_read_lock().
 */
static int do_getpgid(pid_t pid)
{
        struct task_struct *task;
        struct pid *pgrp;
        int retval = -ESRCH;

        rcu_read_lock();
        if (pid) {
                task = find_task_by_vpid(pid);
                if (!task)
                        goto out;

                pgrp = task_pgrp(task);
                if (!pgrp)
                        goto out;

                /* The LSM is only consulted when querying by explicit pid. */
                retval = security_task_getpgid(task);
                if (retval)
                        goto out;
        } else {
                pgrp = task_pgrp(current);
        }
        retval = pid_vnr(pgrp);
out:
        rcu_read_unlock();
        return retval;
}

/* getpgid system call entry point: hands the pid to do_getpgid(). */
SYSCALL_DEFINE1(getpgid, pid_t, pid)
{
        return do_getpgid(pid);
}

#ifdef __ARCH_WANT_SYS_GETPGRP

/* getpgrp system call: do_getpgid(0), i.e. the query for the current task. */
SYSCALL_DEFINE0(getpgrp)
{
        return do_getpgid(0);
}

#endif

/*
 * getsid system call: look up the session id of the task identified by
 * @pid, or of the current task when @pid is 0.
 *
 * Returns pid_vnr() of the session on success, -ESRCH when no task matches
 * @pid or the task has no session, or the error from security_task_getsid().
 * The whole lookup runs under rcu_read_lock().
 */
SYSCALL_DEFINE1(getsid, pid_t, pid)
{
        struct task_struct *task;
        struct pid *session;
        int retval = -ESRCH;

        rcu_read_lock();
        if (pid) {
                task = find_task_by_vpid(pid);
                if (!task)
                        goto out;

                session = task_session(task);
                if (!session)
                        goto out;

                /* The LSM is only consulted when querying by explicit pid. */
                retval = security_task_getsid(task);
                if (retval)
                        goto out;
        } else {
                session = task_session(current);
        }
        retval = pid_vnr(session);
out:
        rcu_read_unlock();
        return retval;
}

static void set_special_pids(struct pid *pid)
{
        struct task_struct *curr = current->group_leader;

        if (task_session(curr) != pid)
                change_pid(curr, PIDTYPE_SID, pid);

        if (task_pgrp(curr) != pid)
                change_pid(curr, PIDTYPE_PGID, pid);
}

/*
 * Make the caller's thread group leader the leader of a new session whose
 * id is the leader's own pid.
 *
 * Returns the new session id (pid_vnr() of the leader's pid) on success, or
 * -EPERM when the leader already is a session leader or when a process
 * group with that id already exists.
 */
int ksys_setsid(void)
{
        struct task_struct *group_leader = current->group_leader;
        struct pid *sid = task_pid(group_leader);
        pid_t session = pid_vnr(sid);
        int err = -EPERM;

        write_lock_irq(&tasklist_lock);
        /* Fail if I am already a session leader */
        if (group_leader->signal->leader)
                goto out;

        /* Fail if a process group id already exists that equals the
         * proposed session id.
         */
        if (pid_task(sid, PIDTYPE_PGID))
                goto out;

        group_leader->signal->leader = 1;
        set_special_pids(sid);

        proc_clear_tty(group_leader);

        err = session;
out:
        write_unlock_irq(&tasklist_lock);
        /* The follow-up notifications run only on success, after unlocking. */
        if (err > 0) {
                proc_sid_connector(group_leader);
                sched_autogroup_create_attach(group_leader);
        }
        return err;
}

/* setsid system call entry point: calls ksys_setsid(). */
SYSCALL_DEFINE0(setsid)
{
        return ksys_setsid();
}

/*
 * Read/write semaphore around the utsname() data: the uname-style syscalls
 * below take it for reading while copying, sethostname()/setdomainname()
 * take it for writing while updating.
 */
DECLARE_RWSEM(uts_sem);

/*
 * override_architecture(name) - patch the machine field already copied to
 * the user buffer @name.
 *
 * With COMPAT_UTS_MACHINE defined, a task whose personality is PER_LINUX32
 * gets name->machine overwritten with COMPAT_UTS_MACHINE; the expression is
 * non-zero when that copy_to_user() fails.  Without COMPAT_UTS_MACHINE the
 * macro is the constant 0.
 */
#ifdef COMPAT_UTS_MACHINE
#define override_architecture(name) \
        (personality(current->personality) == PER_LINUX32 && \
         copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
                      sizeof(COMPAT_UTS_MACHINE)))
#else
#define override_architecture(name)        0
#endif

/*
 * Work around broken programs that cannot handle "Linux 3.0".
 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
 * And we map 4.x and later versions to 2.6.60+x, so 4.0/5.0/6.0/... would be
 * 2.6.60.
 *
 * @release: user-space buffer holding the release string to overwrite
 * @len:     size of that buffer
 *
 * Only acts when the UNAME26 personality bit is set for the current task.
 * Returns 0 when nothing had to be done or the copy succeeded, otherwise the
 * non-zero result of copy_to_user() (callers turn that into -EFAULT).
 */
static int override_release(char __user *release, size_t len)
{
        int ret = 0;

        if (current->personality & UNAME26) {
                const char *rest = UTS_RELEASE;
                char buf[65] = { 0 };
                int ndots = 0;
                unsigned v;
                size_t copy;

                /*
                 * Skip the leading numeric version: stop at the third dot or
                 * at the first character that is neither a digit nor a dot,
                 * leaving @rest at the remainder of the release string.
                 */
                while (*rest) {
                        if (*rest == '.' && ++ndots >= 3)
                                break;
                        if (!isdigit(*rest) && *rest != '.')
                                break;
                        rest++;
                }
                v = LINUX_VERSION_PATCHLEVEL + 60;
                /* Never format more than fits in buf, and at least 1 byte. */
                copy = clamp_t(size_t, len, 1, sizeof(buf));
                copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
                /* copy + 1 sends the terminating NUL along with the text. */
                ret = copy_to_user(release, buf, copy + 1);
        }
        return ret;
}

/*
 * newuname system call: copy a snapshot of utsname(), taken under uts_sem,
 * to @name, then apply the release and architecture overrides directly to
 * the user buffer.  Returns 0 on success or -EFAULT when any of the user
 * copies fails.
 */
SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
{
        struct new_utsname snapshot;

        down_read(&uts_sem);
        memcpy(&snapshot, utsname(), sizeof(snapshot));
        up_read(&uts_sem);

        /* Evaluated left to right; the first failing step ends the chain. */
        if (copy_to_user(name, &snapshot, sizeof(snapshot)) ||
            override_release(name->release, sizeof(name->release)) ||
            override_architecture(name))
                return -EFAULT;

        return 0;
}

#ifdef __ARCH_WANT_SYS_OLD_UNAME
/*
 * Old cruft: uname system call working on struct old_utsname.
 *
 * Copies the leading sizeof(struct old_utsname) bytes of utsname(), taken
 * under uts_sem, to @name and then applies the release and architecture
 * overrides to the user buffer.  Returns 0 on success or -EFAULT when @name
 * is NULL or any of the user copies fails.
 */
SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
{
        struct old_utsname snapshot;

        if (!name)
                return -EFAULT;

        down_read(&uts_sem);
        memcpy(&snapshot, utsname(), sizeof(snapshot));
        up_read(&uts_sem);

        /* Evaluated left to right; the first failing step ends the chain. */
        if (copy_to_user(name, &snapshot, sizeof(snapshot)) ||
            override_release(name->release, sizeof(name->release)) ||
            override_architecture(name))
                return -EFAULT;

        return 0;
}

/*
 * olduname system call working on struct oldold_utsname.
 *
 * Builds a zeroed local copy, fills each field with the first __OLD_UTS_LEN
 * bytes of the matching utsname() field (under uts_sem), copies it to @name
 * and then applies the architecture and release overrides to the user
 * buffer.  Returns 0 on success or -EFAULT when @name is NULL or any of the
 * user copies fails.
 */
SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
{
        struct oldold_utsname snapshot;

        if (!name)
                return -EFAULT;

        /* Zero everything first: only __OLD_UTS_LEN bytes per field follow. */
        memset(&snapshot, 0, sizeof(snapshot));

        down_read(&uts_sem);
        memcpy(&snapshot.sysname, &utsname()->sysname, __OLD_UTS_LEN);
        memcpy(&snapshot.nodename, &utsname()->nodename, __OLD_UTS_LEN);
        memcpy(&snapshot.release, &utsname()->release, __OLD_UTS_LEN);
        memcpy(&snapshot.version, &utsname()->version, __OLD_UTS_LEN);
        memcpy(&snapshot.machine, &utsname()->machine, __OLD_UTS_LEN);
        up_read(&uts_sem);

        /* Evaluated left to right; the first failing step ends the chain. */
        if (copy_to_user(name, &snapshot, sizeof(snapshot)) ||
            override_architecture(name) ||
            override_release(name->release, sizeof(name->release)))
                return -EFAULT;

        return 0;
}
#endif

/*
 * sethostname system call: replace the nodename of the caller's UTS
 * namespace with the @len bytes found at @name.
 *
 * Returns 0 on success, -EPERM without CAP_SYS_ADMIN in the user namespace
 * owning the UTS namespace, -EINVAL when @len is negative or larger than
 * __NEW_UTS_LEN, or -EFAULT when the name cannot be read from user space.
 */
SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
{
        struct new_utsname *uts;
        char buf[__NEW_UTS_LEN];

        if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        if (len < 0 || len > __NEW_UTS_LEN)
                return -EINVAL;

        if (copy_from_user(buf, name, len))
                return -EFAULT;

        add_device_randomness(buf, len);

        down_write(&uts_sem);
        uts = utsname();
        memcpy(uts->nodename, buf, len);
        /* Zero the remainder of the field after the new name. */
        memset(uts->nodename + len, 0, sizeof(uts->nodename) - len);
        uts_proc_notify(UTS_PROC_HOSTNAME);
        up_write(&uts_sem);

        return 0;
}

#ifdef __ARCH_WANT_SYS_GETHOSTNAME

/*
 * gethostname system call: copy the nodename of the caller's UTS namespace
 * to @name.
 *
 * At most @len bytes are copied; the terminating NUL is included only when
 * it fits.  Returns 0 on success, -EINVAL for a negative @len, or -EFAULT
 * when the copy to user space fails.
 */
SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
{
        struct new_utsname *uts;
        char buf[__NEW_UTS_LEN + 1];
        int nbytes;

        if (len < 0)
                return -EINVAL;

        down_read(&uts_sem);
        uts = utsname();
        /* String length plus its NUL, capped at the caller's buffer size. */
        nbytes = 1 + strlen(uts->nodename);
        if (nbytes > len)
                nbytes = len;
        memcpy(buf, uts->nodename, nbytes);
        up_read(&uts_sem);

        return copy_to_user(name, buf, nbytes) ? -EFAULT : 0;
}

#endif

/*
 * Only setdomainname exists; getdomainname can be implemented by calling
 * uname().
 *
 * Replaces the domainname of the caller's UTS namespace with the @len bytes
 * found at @name.  Returns 0 on success, -EPERM without CAP_SYS_ADMIN in the
 * user namespace owning the UTS namespace, -EINVAL when @len is negative or
 * larger than __NEW_UTS_LEN, or -EFAULT when the name cannot be read from
 * user space.
 */
SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
{
        struct new_utsname *uts;
        char buf[__NEW_UTS_LEN];

        if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        if (len < 0 || len > __NEW_UTS_LEN)
                return -EINVAL;

        if (copy_from_user(buf, name, len))
                return -EFAULT;

        add_device_randomness(buf, len);

        down_write(&uts_sem);
        uts = utsname();
        memcpy(uts->domainname, buf, len);
        /* Zero the remainder of the field after the new name. */
        memset(uts->domainname + len, 0, sizeof(uts->domainname) - len);
        uts_proc_notify(UTS_PROC_DOMAINNAME);
        up_write(&uts_sem);

        return 0;
}

/*
 * Read and/or update one resource limit of @tsk's thread group.
 *
 * @tsk:      task whose signal->rlim[] entry is accessed
 * @resource: index into rlim[]; must be below RLIM_NLIMITS
 * @new_rlim: limit to install, or NULL to leave the limit unchanged
 * @old_rlim: where to store the limit found before any update, or NULL
 *
 * Make sure you are allowed to change @tsk limits before calling this.
 *
 * Returns 0 on success, -EINVAL for an out-of-range @resource or when
 * rlim_cur exceeds rlim_max, -EPERM when RLIMIT_NOFILE's rlim_max would
 * exceed sysctl_nr_open or when the hard limit is raised without
 * CAP_SYS_RESOURCE, or the error from security_task_setrlimit().
 */
static int do_prlimit(struct task_struct *tsk, unsigned int resource,
                      struct rlimit *new_rlim, struct rlimit *old_rlim)
{
        struct rlimit *rlim;
        int retval = 0;

        if (resource >= RLIM_NLIMITS)
                return -EINVAL;
        /* Clamp the index under speculation as well. */
        resource = array_index_nospec(resource, RLIM_NLIMITS);

        /* Checks that need no lock: internal consistency and nr_open cap. */
        if (new_rlim) {
                if (new_rlim->rlim_cur > new_rlim->rlim_max)
                        return -EINVAL;
                if (resource == RLIMIT_NOFILE &&
                                new_rlim->rlim_max > sysctl_nr_open)
                        return -EPERM;
        }

        /* Holding a refcount on tsk protects tsk->signal from disappearing. */
        rlim = tsk->signal->rlim + resource;
        task_lock(tsk->group_leader);
        if (new_rlim) {
                /*
                 * Keep the capable check against init_user_ns until cgroups can
                 * contain all limits.
                 */
                if (new_rlim->rlim_max > rlim->rlim_max &&
                                !capable(CAP_SYS_RESOURCE))
                        retval = -EPERM;
                if (!retval)
                        retval = security_task_setrlimit(tsk, resource, new_rlim);
        }
        /* Read the old value before overwriting it, both under the lock. */
        if (!retval) {
                if (old_rlim)
                        *old_rlim = *rlim;
                if (new_rlim)
                        *rlim = *new_rlim;
        }
        task_unlock(tsk->group_leader);

        /*
         * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
         * infinite. In case of RLIM_INFINITY the posix CPU timer code
         * ignores the rlimit.
         */
        if (!retval && new_rlim && resource == RLIMIT_CPU &&
            new_rlim->rlim_cur != RLIM_INFINITY &&
            IS_ENABLED(CONFIG_POSIX_TIMERS)) {
                /*
                 * update_rlimit_cpu can fail if the task is exiting, but there
                 * may be other tasks in the thread group that are not exiting,
                 * and they need their cpu timers adjusted.
                 *
                 * The group_leader is the last task to be released, so if we
                 * cannot update_rlimit_cpu on it, then the entire process is
                 * exiting and we do not need to update at all.
                 */
                update_rlimit_cpu(tsk->group_leader, new_rlim->rlim_cur);
        }

        return retval;
}

/*
 * getrlimit system call: read one resource limit of the current task via
 * do_prlimit() and copy it to @rlim.  Returns 0 on success, the error from
 * do_prlimit(), or -EFAULT when the copy to user space fails.
 */
SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
{
        struct rlimit value;
        int ret = do_prlimit(current, resource, NULL, &value);

        if (ret)
                return ret;

        if (copy_to_user(rlim, &value, sizeof(*rlim)))
                return -EFAULT;

        return 0;
}

#ifdef CONFIG_COMPAT

/*
 * Compat setrlimit system call: read a struct compat_rlimit from user space,
 * widen it to a native struct rlimit (COMPAT_RLIM_INFINITY becomes
 * RLIM_INFINITY) and apply it to the current task via do_prlimit().
 * Returns -EFAULT when the user copy fails, otherwise do_prlimit()'s result.
 */
COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
                       struct compat_rlimit __user *, rlim)
{
        struct compat_rlimit narrow;
        struct rlimit native;

        if (copy_from_user(&narrow, rlim, sizeof(struct compat_rlimit)))
                return -EFAULT;

        native.rlim_cur = (narrow.rlim_cur == COMPAT_RLIM_INFINITY) ?
                          RLIM_INFINITY : narrow.rlim_cur;
        native.rlim_max = (narrow.rlim_max == COMPAT_RLIM_INFINITY) ?
                          RLIM_INFINITY : narrow.rlim_max;

        return do_prlimit(current, resource, &native, NULL);
}

/*
 * Compat getrlimit system call: read one resource limit of the current task
 * via do_prlimit(), narrow it to a struct compat_rlimit (values above
 * COMPAT_RLIM_INFINITY are reported as COMPAT_RLIM_INFINITY) and copy it to
 * @rlim.  Returns 0 on success, the error from do_prlimit(), or -EFAULT when
 * the copy to user space fails.
 */
COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
                       struct compat_rlimit __user *, rlim)
{
        struct compat_rlimit narrow;
        struct rlimit native;
        int ret;

        ret = do_prlimit(current, resource, NULL, &native);
        if (ret)
                return ret;

        narrow.rlim_cur = (native.rlim_cur > COMPAT_RLIM_INFINITY) ?
                          COMPAT_RLIM_INFINITY : native.rlim_cur;
        narrow.rlim_max = (native.rlim_max > COMPAT_RLIM_INFINITY) ?
                          COMPAT_RLIM_INFINITY : native.rlim_max;

        if (copy_to_user(rlim, &narrow, sizeof(struct compat_rlimit)))
                return -EFAULT;

        return 0;
}

#endif

#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT

/*
 * Backwards-compatible getrlimit, needed by some applications.
 *
 * Reads the limit straight from current->signal->rlim[] under the group
 * leader's task lock and clamps both fields to 0x7FFFFFFF before copying
 * them to user space.
 *
 * Returns -EINVAL for an out-of-range @resource, -EFAULT if the copy to
 * user space fails, 0 otherwise.
 */
SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
                struct rlimit __user *, rlim)
{
        struct rlimit lim;

        if (resource >= RLIM_NLIMITS)
                return -EINVAL;

        /* Sanitize the index before it is used to address the array. */
        resource = array_index_nospec(resource, RLIM_NLIMITS);

        task_lock(current->group_leader);
        lim = current->signal->rlim[resource];
        task_unlock(current->group_leader);

        if (lim.rlim_cur > 0x7FFFFFFF)
                lim.rlim_cur = 0x7FFFFFFF;
        if (lim.rlim_max > 0x7FFFFFFF)
                lim.rlim_max = 0x7FFFFFFF;

        if (copy_to_user(rlim, &lim, sizeof(lim)))
                return -EFAULT;

        return 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat flavour of old_getrlimit: snapshot the limit under the group
 * leader's task lock, clamp both fields to 0x7FFFFFFF and store them
 * field by field into the user's struct compat_rlimit.
 *
 * Returns -EINVAL for an out-of-range @resource, -EFAULT if either store
 * to user space fails, 0 otherwise.
 */
COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
                       struct compat_rlimit __user *, rlim)
{
        struct rlimit lim;

        if (resource >= RLIM_NLIMITS)
                return -EINVAL;

        /* Sanitize the index before it is used to address the array. */
        resource = array_index_nospec(resource, RLIM_NLIMITS);

        task_lock(current->group_leader);
        lim = current->signal->rlim[resource];
        task_unlock(current->group_leader);

        if (lim.rlim_cur > 0x7FFFFFFF)
                lim.rlim_cur = 0x7FFFFFFF;
        if (lim.rlim_max > 0x7FFFFFFF)
                lim.rlim_max = 0x7FFFFFFF;

        /* rlim_max is only written once rlim_cur went through. */
        if (put_user(lim.rlim_cur, &rlim->rlim_cur))
                return -EFAULT;
        if (put_user(lim.rlim_max, &rlim->rlim_max))
                return -EFAULT;

        return 0;
}
#endif

#endif

/*
 * Tell whether a 64-bit limit has to be treated as "infinity".
 *
 * When BITS_PER_LONG is below 64, every value at or above ULONG_MAX
 * counts as infinity; otherwise only RLIM64_INFINITY itself does.
 */
static inline bool rlim64_is_infinity(__u64 rlim64)
{
        bool infinite;

#if BITS_PER_LONG >= 64
        infinite = (rlim64 == RLIM64_INFINITY);
#else
        infinite = (rlim64 >= ULONG_MAX);
#endif
        return infinite;
}

/*
 * Convert a native struct rlimit into a struct rlimit64, translating
 * RLIM_INFINITY into RLIM64_INFINITY for both fields.
 */
static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64)
{
        rlim64->rlim_cur = (rlim->rlim_cur == RLIM_INFINITY) ?
                           RLIM64_INFINITY : rlim->rlim_cur;
        rlim64->rlim_max = (rlim->rlim_max == RLIM_INFINITY) ?
                           RLIM64_INFINITY : rlim->rlim_max;
}

/*
 * Convert a struct rlimit64 into a native struct rlimit.  A field that
 * rlim64_is_infinity() reports as infinite becomes RLIM_INFINITY; any
 * other value is cast to unsigned long.
 */
static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
{
        rlim->rlim_cur = rlim64_is_infinity(rlim64->rlim_cur) ?
                         RLIM_INFINITY : (unsigned long)rlim64->rlim_cur;
        rlim->rlim_max = rlim64_is_infinity(rlim64->rlim_max) ?
                         RLIM_INFINITY : (unsigned long)rlim64->rlim_max;
}

/*
 * Decide whether current may read or change the limits of @task.
 *
 * The rcu lock must be held by the caller.
 *
 * A task may always operate on itself.  Otherwise the caller's uid must
 * equal the target's euid, suid and uid, and the caller's gid must equal
 * the target's egid, sgid and gid, unless the caller has CAP_SYS_RESOURCE
 * in the target's user namespace.  When that test passes, the final
 * answer comes from security_task_prlimit() with @flags.
 *
 * Returns 0 when allowed, -EPERM when the id/capability test fails, or
 * the value returned by security_task_prlimit().
 */
static int check_prlimit_permission(struct task_struct *task,
                                    unsigned int flags)
{
        const struct cred *cred = current_cred();
        const struct cred *tcred;
        bool uids_match, gids_match;

        if (task == current)
                return 0;

        tcred = __task_cred(task);

        uids_match = uid_eq(cred->uid, tcred->euid) &&
                     uid_eq(cred->uid, tcred->suid) &&
                     uid_eq(cred->uid, tcred->uid);
        gids_match = gid_eq(cred->gid, tcred->egid) &&
                     gid_eq(cred->gid, tcred->sgid) &&
                     gid_eq(cred->gid, tcred->gid);

        /* The capability is consulted only when the ids do not all match. */
        if (!(uids_match && gids_match) &&
            !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
                return -EPERM;

        return security_task_prlimit(cred, tcred, flags);
}

/*
 * prlimit64: get and/or set a resource limit of the task identified by
 * @pid (0 means the calling task).
 *
 * @new_rlim, when non-NULL, is read from user space and installed.
 * @old_rlim, when non-NULL, receives the previous limit.
 *
 * Returns 0 on success, -EFAULT on a failed user copy, -ESRCH when no
 * task matches @pid, or the error produced by
 * check_prlimit_permission() / do_prlimit().
 */
SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
                const struct rlimit64 __user *, new_rlim,
                struct rlimit64 __user *, old_rlim)
{
        struct rlimit64 old64, new64;
        struct rlimit old, new;
        struct task_struct *tsk;
        unsigned int checkflags = 0;
        int ret;

        if (old_rlim)
                checkflags |= LSM_PRLIMIT_READ;

        if (new_rlim) {
                if (copy_from_user(&new64, new_rlim, sizeof(new64)))
                        return -EFAULT;
                rlim64_to_rlim(&new64, &new);
                checkflags |= LSM_PRLIMIT_WRITE;
        }

        /*
         * Task lookup and the permission check both run under
         * rcu_read_lock(); a task reference is taken before it is dropped.
         */
        rcu_read_lock();
        tsk = pid ? find_task_by_vpid(pid) : current;
        if (!tsk) {
                ret = -ESRCH;
                goto out_unlock;
        }

        ret = check_prlimit_permission(tsk, checkflags);
        if (ret)
                goto out_unlock;

        get_task_struct(tsk);
        rcu_read_unlock();

        ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
                        old_rlim ? &old : NULL);

        if (!ret && old_rlim) {
                rlim_to_rlim64(&old, &old64);
                if (copy_to_user(old_rlim, &old64, sizeof(old64)))
                        ret = -EFAULT;
        }

        put_task_struct(tsk);
        return ret;

out_unlock:
        rcu_read_unlock();
        return ret;
}

/*
 * setrlimit: copy the requested limit from user space and apply it to
 * the calling task through do_prlimit().
 *
 * Returns -EFAULT if @rlim cannot be read, otherwise the result of
 * do_prlimit().
 */
SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
{
        struct rlimit requested;

        if (copy_from_user(&requested, rlim, sizeof(*rlim)))
                return -EFAULT;

        return do_prlimit(current, resource, &requested, NULL);
}

/*
 * It would make sense to put struct rusage in the task_struct,
 * except that would make the task_struct be *really big*.  After
 * task_struct gets moved into malloc'ed memory, it would
 * make sense to do this.  It will make moving the rest of the information
 * a lot simpler!  (Which we're not doing right now because we're not
 * measuring them yet).
 *
 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
 * races with threads incrementing their own counters.  But since word
 * reads are atomic, we either get new values or old values and we don't
 * care which for the sums.  We always take the siglock to protect reading
 * the c* fields from p->signal from races with exit.c updating those
 * fields when reaping, so a sample either gets all the additions of a
 * given child after it's reaped, or none so this sample is before reaping.
 *
 * Locking:
 * We need to take the siglock for CHILDREN, SELF and BOTH
 * for the cases current multithreaded, non-current single threaded and
 * non-current multithreaded.  Thread traversal is now safe with
 * the siglock held.
 * Strictly speaking, we do not need to take the siglock if we are current and
 * single threaded,  as no one else can take our signal_struct away, no one
 * else can  reap the  children to update signal->c* counters, and no one else
 * can race with the signal-> fields. If we do not take any lock, the
 * signal-> fields could be read out of order while another thread was just
 * exiting. So we should  place a read memory barrier when we avoid the lock.
 * On the writer side,  write memory barrier is implied in  __exit_signal
 * as __exit_signal releases  the siglock spinlock after updating the signal->
 * fields. But we don't do this yet to keep things simple.
 *
 */

/*
 * Add the per-task counters of @t (context switches, page faults and
 * block I/O counts) to the running totals in @r.
 */
static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
{
        /* block input/output operations */
        r->ru_inblock += task_io_get_inblock(t);
        r->ru_oublock += task_io_get_oublock(t);

        /* minor and major page faults */
        r->ru_minflt += t->min_flt;
        r->ru_majflt += t->maj_flt;

        /* voluntary and involuntary context switches */
        r->ru_nvcsw += t->nvcsw;
        r->ru_nivcsw += t->nivcsw;
}

/*
 * Collect resource usage for @p into @r.
 *
 * @who selects what is reported:
 *   RUSAGE_THREAD   - the per-task counters of @p only
 *   RUSAGE_SELF     - the signal_struct totals plus every thread in the group
 *   RUSAGE_CHILDREN - the c* (child) totals kept in the signal_struct
 *   RUSAGE_BOTH     - RUSAGE_CHILDREN and RUSAGE_SELF added together
 * Any other value reaches BUG().
 *
 * @r is zeroed first, so fields that are not filled in here read as 0.
 */
void getrusage(struct task_struct *p, int who, struct rusage *r)
{
        struct task_struct *t;
        unsigned long flags;
        u64 tgutime, tgstime, utime, stime;
        unsigned long maxrss;
        struct mm_struct *mm;
        struct signal_struct *sig = p->signal;
        unsigned int seq = 0;

retry:
        /* Start from scratch on every pass, including a seqlock retry. */
        memset(r, 0, sizeof(*r));
        utime = stime = 0;
        maxrss = 0;

        if (who == RUSAGE_THREAD) {
                /*
                 * CPU times are sampled from current while the counters
                 * are read from @p; the two syscall entry points in this
                 * file pass p == current.  No stats_lock is taken on this
                 * path.
                 */
                task_cputime_adjusted(current, &utime, &stime);
                accumulate_thread_rusage(p, r);
                maxrss = sig->maxrss;
                goto out_thread;
        }

        flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);

        switch (who) {
        case RUSAGE_BOTH:
        case RUSAGE_CHILDREN:
                /* Totals of the children, as stored in the signal_struct. */
                utime = sig->cutime;
                stime = sig->cstime;
                r->ru_nvcsw = sig->cnvcsw;
                r->ru_nivcsw = sig->cnivcsw;
                r->ru_minflt = sig->cmin_flt;
                r->ru_majflt = sig->cmaj_flt;
                r->ru_inblock = sig->cinblock;
                r->ru_oublock = sig->coublock;
                maxrss = sig->cmaxrss;

                if (who == RUSAGE_CHILDREN)
                        break;
                fallthrough;

        case RUSAGE_SELF:
                /* Group-wide totals stored in the signal_struct ... */
                r->ru_nvcsw += sig->nvcsw;
                r->ru_nivcsw += sig->nivcsw;
                r->ru_minflt += sig->min_flt;
                r->ru_majflt += sig->maj_flt;
                r->ru_inblock += sig->inblock;
                r->ru_oublock += sig->oublock;
                if (maxrss < sig->maxrss)
                        maxrss = sig->maxrss;

                /* ... plus the counters of each thread in the group. */
                rcu_read_lock();
                __for_each_thread(sig, t)
                        accumulate_thread_rusage(t, r);
                rcu_read_unlock();

                break;

        default:
                BUG();
        }

        /*
         * NOTE(review): seq = 1 presumably makes the next
         * read_seqbegin_or_lock_irqsave() take the lock instead of doing
         * another lockless pass - confirm against the seqlock API.
         */
        if (need_seqretry(&sig->stats_lock, seq)) {
                seq = 1;
                goto retry;
        }
        done_seqretry_irqrestore(&sig->stats_lock, seq, flags);

        /* RUSAGE_CHILDREN adds neither thread-group CPU time nor mm RSS. */
        if (who == RUSAGE_CHILDREN)
                goto out_children;

        thread_group_cputime_adjusted(p, &tgutime, &tgstime);
        utime += tgutime;
        stime += tgstime;

out_thread:
        /* Let the mm, if @p still has one, update the RSS maximum. */
        mm = get_task_mm(p);
        if (mm) {
                setmax_mm_hiwater_rss(&maxrss, mm);
                mmput(mm);
        }

out_children:
        r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
        r->ru_utime = ns_to_kernel_old_timeval(utime);
        r->ru_stime = ns_to_kernel_old_timeval(stime);
}

/*
 * getrusage syscall: only RUSAGE_SELF, RUSAGE_CHILDREN and RUSAGE_THREAD
 * are accepted from user space; anything else yields -EINVAL.
 *
 * Returns 0 on success or -EFAULT if the result cannot be copied to @ru.
 */
SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
{
        struct rusage usage;

        switch (who) {
        case RUSAGE_SELF:
        case RUSAGE_CHILDREN:
        case RUSAGE_THREAD:
                break;
        default:
                return -EINVAL;
        }

        getrusage(current, who, &usage);

        if (copy_to_user(ru, &usage, sizeof(usage)))
                return -EFAULT;

        return 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat getrusage: same @who validation as the native syscall; the
 * result is handed to put_compat_rusage() for delivery to @ru, and its
 * return value is passed back to the caller.
 */
COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
{
        struct rusage usage;
        bool who_ok;

        who_ok = (who == RUSAGE_SELF) ||
                 (who == RUSAGE_CHILDREN) ||
                 (who == RUSAGE_THREAD);
        if (!who_ok)
                return -EINVAL;

        getrusage(current, who, &usage);

        return put_compat_rusage(&usage, ru);
}
#endif

/*
 * umask: atomically replace current->fs->umask with the permission bits
 * of @mask (everything outside S_IRWXUGO is dropped) and return the
 * previous value.
 */
SYSCALL_DEFINE1(umask, int, mask)
{
        int previous;

        previous = xchg(&current->fs->umask, mask & S_IRWXUGO);

        return previous;
}

static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
{
        struct fd exe;
        struct inode *inode;
        int err;

        exe = fdget(fd);
        if (!exe.file)
                return -EBADF;

        inode = file_inode(exe.file);

        /*
         * Because the original mm->exe_file points to executable file, make
         * sure that this one is executable as well, to avoid breaking an
         * overall picture.
         */
        err = -EACCES;
        if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
                goto exit;

        err = file_permission(exe.file, MAY_EXEC);
        if (err)
                goto exit;

        err = replace_mm_exe_file(mm, exe.file);
exit:
        fdput(exe);
        return err;
}

/*
 * Check arithmetic relations of passed addresses.
 *
 * Every address member listed in offsets[] must lie in
 * [mmap_min_addr, TASK_SIZE), the start/end pairs must be ordered, and
 * the data segment described by the map must pass check_data_rlimit()
 * against RLIMIT_DATA.
 *
 * Returns 0 when all checks pass, -EINVAL otherwise.
 *
 * WARNING: we don't require any capability here so be very careful
 * in what is allowed for modification from userspace.
 */
static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map)
{
        unsigned long mmap_max_addr = TASK_SIZE;
        int error = -EINVAL, i;

        /* Byte offsets of the address members that get range-checked. */
        static const unsigned char offsets[] = {
                offsetof(struct prctl_mm_map, start_code),
                offsetof(struct prctl_mm_map, end_code),
                offsetof(struct prctl_mm_map, start_data),
                offsetof(struct prctl_mm_map, end_data),
                offsetof(struct prctl_mm_map, start_brk),
                offsetof(struct prctl_mm_map, brk),
                offsetof(struct prctl_mm_map, start_stack),
                offsetof(struct prctl_mm_map, arg_start),
                offsetof(struct prctl_mm_map, arg_end),
                offsetof(struct prctl_mm_map, env_start),
                offsetof(struct prctl_mm_map, env_end),
        };

        /*
         * Make sure the members are not somewhere outside
         * of allowed address space.
         */
        for (i = 0; i < ARRAY_SIZE(offsets); i++) {
                u64 val = *(u64 *)((char *)prctl_map + offsets[i]);

                if ((unsigned long)val >= mmap_max_addr ||
                    (unsigned long)val < mmap_min_addr)
                        goto out;
        }

        /*
         * Make sure the pairs are ordered.
         *
         * Each check yields 0 or -EINVAL; the results are OR-ed together
         * so a single failed comparison leaves error non-zero.
         */
#define __prctl_check_order(__m1, __op, __m2)                                \
        ((unsigned long)prctl_map->__m1 __op                                \
         (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
        error  = __prctl_check_order(start_code, <, end_code);
        error |= __prctl_check_order(start_data,<=, end_data);
        error |= __prctl_check_order(start_brk, <=, brk);
        error |= __prctl_check_order(arg_start, <=, arg_end);
        error |= __prctl_check_order(env_start, <=, env_end);
        if (error)
                goto out;
#undef __prctl_check_order

        error = -EINVAL;

        /*
         * Nor should we allow the limits to be overridden if they are set.
         */
        if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
                              prctl_map->start_brk, prctl_map->end_data,
                              prctl_map->start_data))
                        goto out;

        error = 0;
out:
        return error;
}

#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * Handle PR_SET_MM_MAP and PR_SET_MM_MAP_SIZE.
 *
 * @opt:       PR_SET_MM_MAP_SIZE only reports sizeof(struct prctl_mm_map)
 *             to user space through @addr; any other value is treated as
 *             a request to install a complete map.
 * @addr:      user pointer to a struct prctl_mm_map (or, for
 *             PR_SET_MM_MAP_SIZE, to an unsigned int that receives the size).
 * @data_size: must equal sizeof(struct prctl_mm_map).
 *
 * Returns 0 on success, -EINVAL for a bad size, bad addresses or a bad
 * auxv description, -EFAULT on a failed user copy, -EPERM when an exe_fd
 * is supplied without checkpoint/restore capability, or the error from
 * prctl_set_mm_exe_file().
 */
static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
{
        /* exe_fd == (u32)-1 means "no exe file change requested". */
        struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
        unsigned long user_auxv[AT_VECTOR_SIZE];
        struct mm_struct *mm = current->mm;
        int error;

        BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
        BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);

        if (opt == PR_SET_MM_MAP_SIZE)
                return put_user((unsigned int)sizeof(prctl_map),
                                (unsigned int __user *)addr);

        if (data_size != sizeof(prctl_map))
                return -EINVAL;

        if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
                return -EFAULT;

        error = validate_prctl_map_addr(&prctl_map);
        if (error)
                return error;

        if (prctl_map.auxv_size) {
                /*
                 * Someone is trying to cheat the auxv vector.
                 */
                if (!prctl_map.auxv ||
                                prctl_map.auxv_size > sizeof(mm->saved_auxv))
                        return -EINVAL;

                /* Zero first so a short copy leaves the tail cleared. */
                memset(user_auxv, 0, sizeof(user_auxv));
                if (copy_from_user(user_auxv,
                                   (const void __user *)prctl_map.auxv,
                                   prctl_map.auxv_size))
                        return -EFAULT;

                /* Last entry must be AT_NULL as specification requires */
                user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
                user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
        }

        if (prctl_map.exe_fd != (u32)-1) {
                /*
                 * Check if the current user is checkpoint/restore capable.
                 * At the time of this writing, it checks for CAP_SYS_ADMIN
                 * or CAP_CHECKPOINT_RESTORE.
                 * Note that a user with access to ptrace can masquerade an
                 * arbitrary program as any executable, even setuid ones.
                 * This may have implications in the tomoyo subsystem.
                 */
                if (!checkpoint_restore_ns_capable(current_user_ns()))
                        return -EPERM;

                error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
                if (error)
                        return error;
        }

        /*
         * arg_lock protects concurrent updates but we still need mmap_lock for
         * read to exclude races with sys_brk.
         */
        mmap_read_lock(mm);

        /*
         * We don't validate whether these members point to VMAs that are
         * really present, because the application may already have
         * unmapped the corresponding VMAs, and the kernel mostly uses
         * these members for statistics output in procfs, except
         *
         *  - @start_brk/@brk, which are used in do_brk_flags; but the
         *    kernel looks up VMAs when updating these members, so anything
         *    wrong written here causes the kernel to swear at the
         *    userspace program but won't lead to any problem in the
         *    kernel itself
         */

        spin_lock(&mm->arg_lock);
        mm->start_code        = prctl_map.start_code;
        mm->end_code        = prctl_map.end_code;
        mm->start_data        = prctl_map.start_data;
        mm->end_data        = prctl_map.end_data;
        mm->start_brk        = prctl_map.start_brk;
        mm->brk                = prctl_map.brk;
        mm->start_stack        = prctl_map.start_stack;
        mm->arg_start        = prctl_map.arg_start;
        mm->arg_end        = prctl_map.arg_end;
        mm->env_start        = prctl_map.env_start;
        mm->env_end        = prctl_map.env_end;
        spin_unlock(&mm->arg_lock);

        /*
         * Note this update of @saved_auxv is lockless thus
         * if someone reads this member in procfs while we're
         * updating -- it may get partly updated results. It's
         * known and acceptable trade off: we leave it as is to
         * not introduce additional locks here making the kernel
         * more complex.
         */
        if (prctl_map.auxv_size)
                memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));

        mmap_read_unlock(mm);
        return 0;
}
#endif /* CONFIG_CHECKPOINT_RESTORE */

/*
 * Overwrite the first @len bytes of mm->saved_auxv with data read from
 * the user address @addr.
 *
 * The auxiliary vector itself is not moved since it is pinned to the
 * mm_struct; this only permits filling it with new values.  Supplying
 * sane values is up to the caller, otherwise userspace tools which use
 * this vector might be unhappy.
 *
 * Returns -EINVAL when @len exceeds the vector size, -EFAULT when the
 * user copy fails, 0 otherwise.
 */
static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
                          unsigned long len)
{
        unsigned long auxv_buf[AT_VECTOR_SIZE] = {};

        BUILD_BUG_ON(sizeof(auxv_buf) != sizeof(mm->saved_auxv));

        if (len > sizeof(auxv_buf))
                return -EINVAL;

        if (copy_from_user(auxv_buf, (const void __user *)addr, len))
                return -EFAULT;

        /*
         * Force the last entry of the staging buffer to be AT_NULL.  Only
         * @len bytes are copied below, so these two words reach
         * mm->saved_auxv only when @len covers the whole vector.
         */
        auxv_buf[AT_VECTOR_SIZE - 1] = 0;
        auxv_buf[AT_VECTOR_SIZE - 2] = 0;

        task_lock(current);
        memcpy(mm->saved_auxv, auxv_buf, len);
        task_unlock(current);

        return 0;
}

/*
 * Handle PR_SET_MM: update one field of current->mm, or dispatch to the
 * helpers for the map, exe-file and auxv variants.
 *
 * @opt:  which PR_SET_MM_* operation to perform.
 * @addr: the new address (or the fd / user pointer for the variants that
 *        are dispatched to helpers).
 * @arg4: only allowed to be non-zero for PR_SET_MM_AUXV, PR_SET_MM_MAP and
 *        PR_SET_MM_MAP_SIZE, where it carries a length.
 * @arg5: must be zero.
 *
 * Returns 0 on success, -EINVAL for bad arguments, an unknown @opt or a
 * map that fails validate_prctl_map_addr(), -EPERM without
 * CAP_SYS_RESOURCE, -EFAULT when a stack/arg/env address has no VMA, or
 * the helper's return value.
 */
static int prctl_set_mm(int opt, unsigned long addr,
                        unsigned long arg4, unsigned long arg5)
{
        struct mm_struct *mm = current->mm;
        struct prctl_mm_map prctl_map = {
                .auxv = NULL,
                .auxv_size = 0,
                .exe_fd = -1,
        };
        struct vm_area_struct *vma;
        int error;

        if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
                              opt != PR_SET_MM_MAP &&
                              opt != PR_SET_MM_MAP_SIZE)))
                return -EINVAL;

        /* The map variants are dispatched before the capability check below. */
#ifdef CONFIG_CHECKPOINT_RESTORE
        if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
                return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
#endif

        if (!capable(CAP_SYS_RESOURCE))
                return -EPERM;

        if (opt == PR_SET_MM_EXE_FILE)
                return prctl_set_mm_exe_file(mm, (unsigned int)addr);

        if (opt == PR_SET_MM_AUXV)
                return prctl_set_auxv(mm, addr, arg4);

        if (addr >= TASK_SIZE || addr < mmap_min_addr)
                return -EINVAL;

        error = -EINVAL;

        /*
         * arg_lock protects concurrent updates of arg boundaries, we need
         * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr
         * validation.
         */
        mmap_read_lock(mm);
        vma = find_vma(mm, addr);

        /* Snapshot the current layout, then patch the one requested field. */
        spin_lock(&mm->arg_lock);
        prctl_map.start_code        = mm->start_code;
        prctl_map.end_code        = mm->end_code;
        prctl_map.start_data        = mm->start_data;
        prctl_map.end_data        = mm->end_data;
        prctl_map.start_brk        = mm->start_brk;
        prctl_map.brk                = mm->brk;
        prctl_map.start_stack        = mm->start_stack;
        prctl_map.arg_start        = mm->arg_start;
        prctl_map.arg_end        = mm->arg_end;
        prctl_map.env_start        = mm->env_start;
        prctl_map.env_end        = mm->env_end;

        switch (opt) {
        case PR_SET_MM_START_CODE:
                prctl_map.start_code = addr;
                break;
        case PR_SET_MM_END_CODE:
                prctl_map.end_code = addr;
                break;
        case PR_SET_MM_START_DATA:
                prctl_map.start_data = addr;
                break;
        case PR_SET_MM_END_DATA:
                prctl_map.end_data = addr;
                break;
        case PR_SET_MM_START_STACK:
                prctl_map.start_stack = addr;
                break;
        case PR_SET_MM_START_BRK:
                prctl_map.start_brk = addr;
                break;
        case PR_SET_MM_BRK:
                prctl_map.brk = addr;
                break;
        case PR_SET_MM_ARG_START:
                prctl_map.arg_start = addr;
                break;
        case PR_SET_MM_ARG_END:
                prctl_map.arg_end = addr;
                break;
        case PR_SET_MM_ENV_START:
                prctl_map.env_start = addr;
                break;
        case PR_SET_MM_ENV_END:
                prctl_map.env_end = addr;
                break;
        default:
                /* Unknown option: error is still -EINVAL here. */
                goto out;
        }

        /* Validate the whole patched layout, not just the changed field. */
        error = validate_prctl_map_addr(&prctl_map);
        if (error)
                goto out;

        switch (opt) {
        /*
         * If command line arguments and environment
         * are placed somewhere else on stack, we can
         * set them up here, ARG_START/END to setup
         * command line arguments and ENV_START/END
         * for environment.
         */
        case PR_SET_MM_START_STACK:
        case PR_SET_MM_ARG_START:
        case PR_SET_MM_ARG_END:
        case PR_SET_MM_ENV_START:
        case PR_SET_MM_ENV_END:
                /* These options require find_vma() to have found a VMA. */
                if (!vma) {
                        error = -EFAULT;
                        goto out;
                }
        }

        /* Commit the validated layout back to the mm. */
        mm->start_code        = prctl_map.start_code;
        mm->end_code        = prctl_map.end_code;
        mm->start_data        = prctl_map.start_data;
        mm->end_data        = prctl_map.end_data;
        mm->start_brk        = prctl_map.start_brk;
        mm->brk                = prctl_map.brk;
        mm->start_stack        = prctl_map.start_stack;
        mm->arg_start        = prctl_map.arg_start;
        mm->arg_end        = prctl_map.arg_end;
        mm->env_start        = prctl_map.env_start;
        mm->env_end        = prctl_map.env_end;

        error = 0;
out:
        spin_unlock(&mm->arg_lock);
        mmap_read_unlock(mm);
        return error;
}

#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * Store the clear_child_tid pointer of @me at the user location
 * @tid_addr.  Returns the result of put_user().
 */
static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr)
{
        return put_user(me->clear_child_tid, tid_addr);
}
#else
/* Without CONFIG_CHECKPOINT_RESTORE the request always fails with -EINVAL. */
static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr)
{
        return -EINVAL;
}
#endif

static int propagate_has_child_subreaper(struct task_struct *p, void *data)
{
        /*
         * If task has has_child_subreaper - all its descendants
         * already have these flag too and new descendants will
         * inherit it on fork, skip them.
         *
         * If we've found child_reaper - skip descendants in
         * it's subtree as they will never get out pidns.
         */
        if (p->signal->has_child_subreaper ||
            is_child_reaper(task_pid(p)))
                return 0;

        p->signal->has_child_subreaper = 1;
        return 1;
}

/*
 * Weak definition, used unless a non-weak definition of the same symbol
 * is linked in.  It rejects every request with -EINVAL.
 */
int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
{
        return -EINVAL;
}

/*
 * Weak definition, used unless a non-weak definition of the same symbol
 * is linked in.  It rejects every request with -EINVAL.
 */
int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
                                    unsigned long ctrl)
{
        return -EINVAL;
}

/*
 * Combination of the PF_MEMALLOC_NOIO and PF_LOCAL_THROTTLE task flags.
 * NOTE(review): presumably the flag set that the IO-flusher prctl options
 * set and test - confirm at the point of use.
 */
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)

#ifdef CONFIG_ANON_VMA_NAME

/* Length limit passed to strndup_user() when reading an anon VMA name. */
#define ANON_VMA_NAME_MAX_LEN                80
/* Printable characters that are nevertheless rejected in a name. */
#define ANON_VMA_NAME_INVALID_CHARS        "\\`$[]"

/*
 * A name character is valid when it is printable ASCII (0x20..0x7e) and
 * is not one of ANON_VMA_NAME_INVALID_CHARS.
 */
static inline bool is_valid_name_char(char ch)
{
        /* Reject control characters, DEL and anything beyond 7-bit ASCII. */
        if (ch <= 0x1f || ch >= 0x7f)
                return false;

        return strchr(ANON_VMA_NAME_INVALID_CHARS, ch) == NULL;
}

/*
 * Handle PR_SET_VMA.  The only supported @opt is PR_SET_VMA_ANON_NAME,
 * which applies a name to the range [@addr, @addr + @size) through
 * madvise_set_anon_name().
 *
 * @arg is a user pointer to the name; a NULL pointer passes a NULL name
 * on to madvise_set_anon_name().
 *
 * Returns -EINVAL for an unsupported @opt or a name containing a
 * character rejected by is_valid_name_char(), the strndup_user() error
 * if the name cannot be read, -ENOMEM if the name object cannot be
 * allocated, otherwise the result of madvise_set_anon_name().
 */
static int prctl_set_vma(unsigned long opt, unsigned long addr,
                         unsigned long size, unsigned long arg)
{
        const char __user *uname = (const char __user *)arg;
        struct anon_vma_name *anon_name = NULL;
        struct mm_struct *mm = current->mm;
        int error;

        if (opt != PR_SET_VMA_ANON_NAME)
                return -EINVAL;

        if (uname) {
                char *name, *cursor;

                name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
                if (IS_ERR(name))
                        return PTR_ERR(name);

                cursor = name;
                while (*cursor != '\0') {
                        if (!is_valid_name_char(*cursor)) {
                                kfree(name);
                                return -EINVAL;
                        }
                        cursor++;
                }

                /* anon_vma has its own copy, so ours can be freed at once */
                anon_name = anon_vma_name_alloc(name);
                kfree(name);
                if (!anon_name)
                        return -ENOMEM;
        }

        mmap_write_lock(mm);
        error = madvise_set_anon_name(mm, addr, size, anon_name);
        mmap_write_unlock(mm);

        /* Drop the reference held by this function. */
        anon_vma_name_put(anon_name);

        return error;
}

#else /* CONFIG_ANON_VMA_NAME */
/* Built when CONFIG_ANON_VMA_NAME is not set: every request gets -EINVAL. */
static int prctl_set_vma(unsigned long opt, unsigned long start,
                         unsigned long size, unsigned long arg)
{
        return -EINVAL;
}
#endif /* CONFIG_ANON_VMA_NAME */

static inline unsigned long get_current_mdwe(void)
{
        unsigned long ret = 0;

        if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
                ret |= PR_MDWE_REFUSE_EXEC_GAIN;
        if (test_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags))
                ret |= PR_MDWE_NO_INHERIT;

        return ret;
}

static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
                                 unsigned long arg4, unsigned long arg5)
{
        unsigned long current_bits;

        if (arg3 || arg4 || arg5)
                return -EINVAL;

        if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT))
                return -EINVAL;

        /* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */
        if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN))
                return -EINVAL;

        /*
         * EOPNOTSUPP might be more appropriate here in principle, but
         * existing userspace depends on EINVAL specifically.
         */
        if (!arch_memory_deny_write_exec_supported())
                return -EINVAL;

        current_bits = get_current_mdwe();
        if (current_bits && current_bits != bits)
                return -EPERM; /* Cannot unset the flags */

        if (bits & PR_MDWE_NO_INHERIT)
                set_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags);
        if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
                set_bit(MMF_HAS_MDWE, &current->mm->flags);

        return 0;
}

/*
 * Report the current MDWE flags as a PR_MDWE_* mask.  All four arguments
 * must be zero; otherwise -EINVAL is returned.
 */
static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
                                 unsigned long arg4, unsigned long arg5)
{
        if ((arg2 | arg3 | arg4 | arg5) != 0)
                return -EINVAL;

        return get_current_mdwe();
}

/*
 * Copy the saved auxiliary vector of the calling task to userspace.
 *
 * @addr: user buffer to fill
 * @len:  size of @addr in bytes; at most sizeof(mm->saved_auxv) bytes
 *        are copied, and nothing is copied when @len is zero
 *
 * Returns the full size of the saved vector (regardless of how much was
 * copied), or -EFAULT if the copy to userspace fails.
 */
static int prctl_get_auxv(void __user *addr, unsigned long len)
{
        struct mm_struct *mm = current->mm;
        unsigned long to_copy;

        to_copy = min_t(unsigned long, sizeof(mm->saved_auxv), len);
        if (to_copy != 0) {
                if (copy_to_user(addr, mm->saved_auxv, to_copy))
                        return -EFAULT;
        }

        return sizeof(mm->saved_auxv);
}

/*
 * prctl() system call: dispatch a process/thread control request.
 *
 * The request is first handed to security_task_prctl(); any result other
 * than -ENOSYS is returned to the caller as-is.  Otherwise @option selects
 * one of the cases below and arg2..arg5 are interpreted per option (many
 * options insist that their unused arguments are zero and fail with
 * -EINVAL otherwise).  Unknown options yield -EINVAL.
 *
 * The return value is option specific: "get" style options frequently
 * return the queried value itself, "set" style options return 0 on
 * success, and failures are reported as negative errno values.
 */
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                unsigned long, arg4, unsigned long, arg5)
{
        struct task_struct *me = current;
        unsigned char comm[sizeof(me->comm)];
        long error;

        /* Only -ENOSYS from the security hook lets the generic code run. */
        error = security_task_prctl(option, arg2, arg3, arg4, arg5);
        if (error != -ENOSYS)
                return error;

        error = 0;
        switch (option) {
        case PR_SET_PDEATHSIG:
                if (!valid_signal(arg2)) {
                        error = -EINVAL;
                        break;
                }
                me->pdeath_signal = arg2;
                break;
        case PR_GET_PDEATHSIG:
                error = put_user(me->pdeath_signal, (int __user *)arg2);
                break;
        case PR_GET_DUMPABLE:
                error = get_dumpable(me->mm);
                break;
        case PR_SET_DUMPABLE:
                /* Only the DISABLE and USER modes may be requested here. */
                if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
                        error = -EINVAL;
                        break;
                }
                set_dumpable(me->mm, arg2);
                break;

        case PR_SET_UNALIGN:
                error = SET_UNALIGN_CTL(me, arg2);
                break;
        case PR_GET_UNALIGN:
                error = GET_UNALIGN_CTL(me, arg2);
                break;
        case PR_SET_FPEMU:
                error = SET_FPEMU_CTL(me, arg2);
                break;
        case PR_GET_FPEMU:
                error = GET_FPEMU_CTL(me, arg2);
                break;
        case PR_SET_FPEXC:
                error = SET_FPEXC_CTL(me, arg2);
                break;
        case PR_GET_FPEXC:
                error = GET_FPEXC_CTL(me, arg2);
                break;
        case PR_GET_TIMING:
                error = PR_TIMING_STATISTICAL;
                break;
        case PR_SET_TIMING:
                /* PR_TIMING_STATISTICAL is the only accepted mode. */
                if (arg2 != PR_TIMING_STATISTICAL)
                        error = -EINVAL;
                break;
        case PR_SET_NAME:
                /*
                 * Terminate the buffer up front: at most sizeof(comm) - 1
                 * bytes are fetched from userspace, so the last byte stays 0.
                 */
                comm[sizeof(me->comm) - 1] = 0;
                if (strncpy_from_user(comm, (char __user *)arg2,
                                      sizeof(me->comm) - 1) < 0)
                        return -EFAULT;
                set_task_comm(me, comm);
                proc_comm_connector(me);
                break;
        case PR_GET_NAME:
                /* The whole comm buffer is copied out, not just the string. */
                get_task_comm(comm, me);
                if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
                        return -EFAULT;
                break;
        case PR_GET_ENDIAN:
                error = GET_ENDIAN(me, arg2);
                break;
        case PR_SET_ENDIAN:
                error = SET_ENDIAN(me, arg2);
                break;
        case PR_GET_SECCOMP:
                error = prctl_get_seccomp();
                break;
        case PR_SET_SECCOMP:
                error = prctl_set_seccomp(arg2, (char __user *)arg3);
                break;
        case PR_GET_TSC:
                error = GET_TSC_CTL(arg2);
                break;
        case PR_SET_TSC:
                error = SET_TSC_CTL(arg2);
                break;
        case PR_TASK_PERF_EVENTS_DISABLE:
                error = perf_event_task_disable();
                break;
        case PR_TASK_PERF_EVENTS_ENABLE:
                error = perf_event_task_enable();
                break;
        case PR_GET_TIMERSLACK:
                /* Clamp to ULONG_MAX when the stored value does not fit. */
                if (current->timer_slack_ns > ULONG_MAX)
                        error = ULONG_MAX;
                else
                        error = current->timer_slack_ns;
                break;
        case PR_SET_TIMERSLACK:
                /*
                 * arg2 is unsigned, so this test is effectively arg2 == 0:
                 * zero restores the task's default timer slack.
                 */
                if (arg2 <= 0)
                        current->timer_slack_ns =
                                        current->default_timer_slack_ns;
                else
                        current->timer_slack_ns = arg2;
                break;
        case PR_MCE_KILL:
                if (arg4 | arg5)
                        return -EINVAL;
                switch (arg2) {
                case PR_MCE_KILL_CLEAR:
                        if (arg3 != 0)
                                return -EINVAL;
                        current->flags &= ~PF_MCE_PROCESS;
                        break;
                case PR_MCE_KILL_SET:
                        /*
                         * NOTE: PF_MCE_PROCESS is set before arg3 is
                         * validated, so it stays set on the -EINVAL path.
                         */
                        current->flags |= PF_MCE_PROCESS;
                        if (arg3 == PR_MCE_KILL_EARLY)
                                current->flags |= PF_MCE_EARLY;
                        else if (arg3 == PR_MCE_KILL_LATE)
                                current->flags &= ~PF_MCE_EARLY;
                        else if (arg3 == PR_MCE_KILL_DEFAULT)
                                current->flags &=
                                                ~(PF_MCE_EARLY|PF_MCE_PROCESS);
                        else
                                return -EINVAL;
                        break;
                default:
                        return -EINVAL;
                }
                break;
        case PR_MCE_KILL_GET:
                if (arg2 | arg3 | arg4 | arg5)
                        return -EINVAL;
                /* Without PF_MCE_PROCESS the task follows the default policy. */
                if (current->flags & PF_MCE_PROCESS)
                        error = (current->flags & PF_MCE_EARLY) ?
                                PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
                else
                        error = PR_MCE_KILL_DEFAULT;
                break;
        case PR_SET_MM:
                error = prctl_set_mm(arg2, arg3, arg4, arg5);
                break;
        case PR_GET_TID_ADDRESS:
                error = prctl_get_tid_address(me, (int __user * __user *)arg2);
                break;
        case PR_SET_CHILD_SUBREAPER:
                me->signal->is_child_subreaper = !!arg2;
                /* Clearing the attribute needs no tree walk. */
                if (!arg2)
                        break;

                walk_process_tree(me, propagate_has_child_subreaper, NULL);
                break;
        case PR_GET_CHILD_SUBREAPER:
                error = put_user(me->signal->is_child_subreaper,
                                 (int __user *)arg2);
                break;
        case PR_SET_NO_NEW_PRIVS:
                /* The flag can only be set (arg2 == 1); there is no clear. */
                if (arg2 != 1 || arg3 || arg4 || arg5)
                        return -EINVAL;

                task_set_no_new_privs(current);
                break;
        case PR_GET_NO_NEW_PRIVS:
                if (arg2 || arg3 || arg4 || arg5)
                        return -EINVAL;
                return task_no_new_privs(current) ? 1 : 0;
        case PR_GET_THP_DISABLE:
                if (arg2 || arg3 || arg4 || arg5)
                        return -EINVAL;
                error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags);
                break;
        case PR_SET_THP_DISABLE:
                if (arg3 || arg4 || arg5)
                        return -EINVAL;
                /* The flag is flipped with the mmap lock held for writing. */
                if (mmap_write_lock_killable(me->mm))
                        return -EINTR;
                if (arg2)
                        set_bit(MMF_DISABLE_THP, &me->mm->flags);
                else
                        clear_bit(MMF_DISABLE_THP, &me->mm->flags);
                mmap_write_unlock(me->mm);
                break;
        case PR_MPX_ENABLE_MANAGEMENT:
        case PR_MPX_DISABLE_MANAGEMENT:
                /* No longer implemented: */
                return -EINVAL;
        case PR_SET_FP_MODE:
                error = SET_FP_MODE(me, arg2);
                break;
        case PR_GET_FP_MODE:
                error = GET_FP_MODE(me);
                break;
        case PR_SVE_SET_VL:
                error = SVE_SET_VL(arg2);
                break;
        case PR_SVE_GET_VL:
                error = SVE_GET_VL();
                break;
        case PR_SME_SET_VL:
                error = SME_SET_VL(arg2);
                break;
        case PR_SME_GET_VL:
                error = SME_GET_VL();
                break;
        case PR_GET_SPECULATION_CTRL:
                if (arg3 || arg4 || arg5)
                        return -EINVAL;
                error = arch_prctl_spec_ctrl_get(me, arg2);
                break;
        case PR_SET_SPECULATION_CTRL:
                if (arg4 || arg5)
                        return -EINVAL;
                error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
                break;
        case PR_PAC_RESET_KEYS:
                if (arg3 || arg4 || arg5)
                        return -EINVAL;
                error = PAC_RESET_KEYS(me, arg2);
                break;
        case PR_PAC_SET_ENABLED_KEYS:
                if (arg4 || arg5)
                        return -EINVAL;
                error = PAC_SET_ENABLED_KEYS(me, arg2, arg3);
                break;
        case PR_PAC_GET_ENABLED_KEYS:
                if (arg2 || arg3 || arg4 || arg5)
                        return -EINVAL;
                error = PAC_GET_ENABLED_KEYS(me);
                break;
        case PR_SET_TAGGED_ADDR_CTRL:
                if (arg3 || arg4 || arg5)
                        return -EINVAL;
                error = SET_TAGGED_ADDR_CTRL(arg2);
                break;
        case PR_GET_TAGGED_ADDR_CTRL:
                if (arg2 || arg3 || arg4 || arg5)
                        return -EINVAL;
                error = GET_TAGGED_ADDR_CTRL();
                break;
        case PR_SET_IO_FLUSHER:
                /* The capability check comes before argument validation. */
                if (!capable(CAP_SYS_RESOURCE))
                        return -EPERM;

                if (arg3 || arg4 || arg5)
                        return -EINVAL;

                if (arg2 == 1)
                        current->flags |= PR_IO_FLUSHER;
                else if (!arg2)
                        current->flags &= ~PR_IO_FLUSHER;
                else
                        return -EINVAL;
                break;
        case PR_GET_IO_FLUSHER:
                if (!capable(CAP_SYS_RESOURCE))
                        return -EPERM;

                if (arg2 || arg3 || arg4 || arg5)
                        return -EINVAL;

                /* Reports 1 only when every bit of PR_IO_FLUSHER is set. */
                error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
                break;
        case PR_SET_SYSCALL_USER_DISPATCH:
                error = set_syscall_user_dispatch(arg2, arg3, arg4,
                                                  (char __user *) arg5);
                break;
#ifdef CONFIG_SCHED_CORE
        case PR_SCHED_CORE:
                error = sched_core_share_pid(arg2, arg3, arg4, arg5);
                break;
#endif
        case PR_SET_MDWE:
                error = prctl_set_mdwe(arg2, arg3, arg4, arg5);
                break;
        case PR_GET_MDWE:
                error = prctl_get_mdwe(arg2, arg3, arg4, arg5);
                break;
        case PR_PPC_GET_DEXCR:
                if (arg3 || arg4 || arg5)
                        return -EINVAL;
                error = PPC_GET_DEXCR_ASPECT(me, arg2);
                break;
        case PR_PPC_SET_DEXCR:
                if (arg4 || arg5)
                        return -EINVAL;
                error = PPC_SET_DEXCR_ASPECT(me, arg2, arg3);
                break;
        case PR_SET_VMA:
                error = prctl_set_vma(arg2, arg3, arg4, arg5);
                break;
        case PR_GET_AUXV:
                if (arg4 || arg5)
                        return -EINVAL;
                error = prctl_get_auxv((void __user *)arg2, arg3);
                break;
#ifdef CONFIG_KSM
        case PR_SET_MEMORY_MERGE:
                if (arg3 || arg4 || arg5)
                        return -EINVAL;
                /* KSM merge-any is toggled under the mmap write lock. */
                if (mmap_write_lock_killable(me->mm))
                        return -EINTR;

                if (arg2)
                        error = ksm_enable_merge_any(me->mm);
                else
                        error = ksm_disable_merge_any(me->mm);
                mmap_write_unlock(me->mm);
                break;
        case PR_GET_MEMORY_MERGE:
                if (arg2 || arg3 || arg4 || arg5)
                        return -EINVAL;

                error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
                break;
#endif
        case PR_RISCV_V_SET_CONTROL:
                error = RISCV_V_SET_CONTROL(arg2);
                break;
        case PR_RISCV_V_GET_CONTROL:
                error = RISCV_V_GET_CONTROL();
                break;
        case PR_RISCV_SET_ICACHE_FLUSH_CTX:
                error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
                break;
        default:
                error = -EINVAL;
                break;
        }
        return error;
}

/*
 * getcpu() system call: report the CPU the caller is running on and the
 * node that CPU belongs to.
 *
 * @cpup:   where to store the CPU number, may be NULL
 * @nodep:  where to store the node number, may be NULL
 * @unused: ignored
 *
 * Both stores are attempted even if the first one faults.  Returns 0 on
 * success or -EFAULT if either store failed.
 */
SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
                struct getcpu_cache __user *, unused)
{
        int cpu = raw_smp_processor_id();
        int failed = 0;

        if (cpup)
                failed |= put_user(cpu, cpup);

        if (nodep)
                failed |= put_user(cpu_to_node(cpu), nodep);

        if (failed)
                return -EFAULT;

        return 0;
}

/**
 * do_sysinfo - fill in sysinfo struct
 * @info: pointer to buffer to fill
 */
static int do_sysinfo(struct sysinfo *info)
{
        unsigned long mem_total, sav_total;
        unsigned int mem_unit, bitcount;
        struct timespec64 tp;

        memset(info, 0, sizeof(struct sysinfo));

        ktime_get_boottime_ts64(&tp);
        timens_add_boottime(&tp);
        info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);

        get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);

        info->procs = nr_threads;

        si_meminfo(info);
        si_swapinfo(info);

        /*
         * If the sum of all the available memory (i.e. ram + swap)
         * is less than can be stored in a 32 bit unsigned long then
         * we can be binary compatible with 2.2.x kernels.  If not,
         * well, in that case 2.2.x was broken anyways...
         *
         *  -Erik Andersen <andersee@debian.org>
         */

        mem_total = info->totalram + info->totalswap;
        if (mem_total < info->totalram || mem_total < info->totalswap)
                goto out;
        bitcount = 0;
        mem_unit = info->mem_unit;
        while (mem_unit > 1) {
                bitcount++;
                mem_unit >>= 1;
                sav_total = mem_total;
                mem_total <<= 1;
                if (mem_total < sav_total)
                        goto out;
        }

        /*
         * If mem_total did not overflow, multiply all memory values by
         * info->mem_unit and set it to 1.  This leaves things compatible
         * with 2.2.x, and also retains compatibility with earlier 2.4.x
         * kernels...
         */

        info->mem_unit = 1;
        info->totalram <<= bitcount;
        info->freeram <<= bitcount;
        info->sharedram <<= bitcount;
        info->bufferram <<= bitcount;
        info->totalswap <<= bitcount;
        info->freeswap <<= bitcount;
        info->totalhigh <<= bitcount;
        info->freehigh <<= bitcount;

out:
        return 0;
}

/*
 * sysinfo() system call: gather the statistics into a kernel-side copy
 * and hand them to userspace.  Returns 0, or -EFAULT when @info cannot
 * be written.
 */
SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
{
        struct sysinfo snapshot;

        do_sysinfo(&snapshot);

        return copy_to_user(info, &snapshot, sizeof(snapshot)) ? -EFAULT : 0;
}

#ifdef CONFIG_COMPAT
/*
 * Layout of the sysinfo result handed to compat callers by the compat
 * sysinfo() syscall.  Each field is filled from the same-named field of
 * the native struct sysinfo, narrowed to the fixed-width type below.
 */
struct compat_sysinfo {
        s32 uptime;
        u32 loads[3];
        u32 totalram;
        u32 freeram;
        u32 sharedram;
        u32 bufferram;
        u32 totalswap;
        u32 freeswap;
        u16 procs;
        u16 pad;        /* never assigned by the compat syscall; left zero */
        u32 totalhigh;
        u32 freehigh;
        u32 mem_unit;
        /* NOTE(review): presumably mirrors the native struct's tail padding - confirm */
        char _f[20-2*sizeof(u32)-sizeof(int)];
};

/*
 * Compat sysinfo() system call: collect the native statistics, scale
 * them down if they do not fit into 32 bits, and copy the narrowed
 * structure to userspace.  Returns 0, or -EFAULT when @info cannot be
 * written.
 */
COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
{
        struct compat_sysinfo out;
        struct sysinfo native;

        do_sysinfo(&native);

        /*
         * When a total is too large for 32 bits, grow mem_unit up to
         * PAGE_SIZE and shrink every memory value by the same factor.
         */
        if (upper_32_bits(native.totalram) ||
            upper_32_bits(native.totalswap)) {
                int shift = 0;

                for (; native.mem_unit < PAGE_SIZE; native.mem_unit <<= 1)
                        shift++;

                native.totalram >>= shift;
                native.freeram >>= shift;
                native.sharedram >>= shift;
                native.bufferram >>= shift;
                native.totalswap >>= shift;
                native.freeswap >>= shift;
                native.totalhigh >>= shift;
                native.freehigh >>= shift;
        }

        /* Zero the whole object, padding included, before filling it in. */
        memset(&out, 0, sizeof(out));

        out.uptime = native.uptime;
        out.loads[0] = native.loads[0];
        out.loads[1] = native.loads[1];
        out.loads[2] = native.loads[2];
        out.procs = native.procs;
        out.mem_unit = native.mem_unit;

        out.totalram = native.totalram;
        out.freeram = native.freeram;
        out.sharedram = native.sharedram;
        out.bufferram = native.bufferram;
        out.totalswap = native.totalswap;
        out.freeswap = native.freeswap;
        out.totalhigh = native.totalhigh;
        out.freehigh = native.freehigh;

        return copy_to_user(info, &out, sizeof(out)) ? -EFAULT : 0;
}
#endif /* CONFIG_COMPAT */

































































































    1 



    1 





    1 

















    1 












    1 




    1 









    1 
























    1 





    1 


    1 




    1 




    1 





    1 




    1 





















    1 




    1 





    1 























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        IPV4 GSO/GRO offload support
 *        Linux INET implementation
 *
 *        TCPv4 GSO/GRO support
 */

#include <linux/indirect_call_wrapper.h>
#include <linux/skbuff.h>
#include <net/gro.h>
#include <net/gso.h>
#include <net/tcp.h>
#include <net/protocol.h>

/*
 * Walk the segment list starting at @skb and tag the first segment whose
 * sequence range [@seq, @seq + @mss) ends after @ts_seq: that segment
 * gets SKBTX_SW_TSTAMP and @ts_seq as its tskey.  If no segment
 * qualifies, nothing is changed.
 */
static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq,
                           unsigned int seq, unsigned int mss)
{
        for (; skb; skb = skb->next, seq += mss) {
                if (!before(ts_seq, seq + mss))
                        continue;

                skb_shinfo(skb)->tx_flags |= SKBTX_SW_TSTAMP;
                skb_shinfo(skb)->tskey = ts_seq;
                return;
        }
}

/*
 * Rewrite one address/port pair of @seg to @newip/@newport and patch the
 * TCP and IPv4 checksums accordingly.
 *
 * @oldip and @oldport point at the fields to rewrite; they are updated
 * in place.  Nothing is touched when both already hold the new values.
 */
static void __tcpv4_gso_segment_csum(struct sk_buff *seg,
                                     __be32 *oldip, __be32 newip,
                                     __be16 *oldport, __be16 newport)
{
        const __be16 prev_port = *oldport;
        const __be32 prev_ip = *oldip;
        struct tcphdr *tcph;
        struct iphdr *ip4h;

        if (prev_ip == newip && prev_port == newport)
                return;

        tcph = tcp_hdr(seg);
        ip4h = ip_hdr(seg);

        /* TCP checksum: the address change is flagged as pseudo-header. */
        inet_proto_csum_replace4(&tcph->check, seg, prev_ip, newip, true);
        inet_proto_csum_replace2(&tcph->check, seg, prev_port, newport, false);
        *oldport = newport;

        /* IPv4 header checksum only depends on the address. */
        csum_replace4(&ip4h->check, prev_ip, newip);
        *oldip = newip;
}

/*
 * Make every segment after the first carry the first segment's
 * addresses and ports, fixing checksums along the way.
 *
 * Only the second segment is compared with the first: if its ports and
 * addresses already match, the list is returned untouched.
 *
 * Returns @segs.
 */
static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs)
{
        const struct tcphdr *head_th = tcp_hdr(segs);
        const struct iphdr *head_iph = ip_hdr(segs);
        struct tcphdr *cur_th = tcp_hdr(segs->next);
        struct iphdr *cur_iph = ip_hdr(segs->next);
        struct sk_buff *seg;

        /* Source and destination port are compared as one 32-bit word. */
        if (*(const u32 *)&head_th->source == *(const u32 *)&cur_th->source &&
            head_iph->daddr == cur_iph->daddr &&
            head_iph->saddr == cur_iph->saddr)
                return segs;

        for (seg = segs->next; seg; seg = seg->next) {
                cur_th = tcp_hdr(seg);
                cur_iph = ip_hdr(seg);

                __tcpv4_gso_segment_csum(seg,
                                         &cur_iph->saddr, head_iph->saddr,
                                         &cur_th->source, head_th->source);
                __tcpv4_gso_segment_csum(seg,
                                         &cur_iph->daddr, head_iph->daddr,
                                         &cur_th->dest, head_th->dest);
        }

        return segs;
}

/*
 * Segment @skb with skb_segment_list() and then bring the headers of the
 * resulting segments in line with the first one.  An ERR_PTR() from
 * skb_segment_list() is passed through unchanged.
 */
static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb,
                                              netdev_features_t features)
{
        struct sk_buff *segs;

        segs = skb_segment_list(skb, features, skb_mac_header_len(skb));

        return IS_ERR(segs) ? segs : __tcpv4_gso_segment_list_csum(segs);
}

/*
 * Segment a TCPv4 GSO packet.
 *
 * Returns ERR_PTR(-EINVAL) when @skb is not marked SKB_GSO_TCPV4 or a
 * basic TCP header cannot be pulled.  Frag-list GSO packets are handled
 * by __tcp4_gso_segment_list(); everything else goes to
 * tcp_gso_segment().
 */
static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
                                        netdev_features_t features)
{
        const struct iphdr *iph;
        struct tcphdr *th;

        if (!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) ||
            !pskb_may_pull(skb, sizeof(struct tcphdr)))
                return ERR_PTR(-EINVAL);

        if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
                return __tcp4_gso_segment_list(skb, features);

        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
                /*
                 * The stack normally prepares the pseudo-header checksum
                 * already; do it here for packets where it has not.
                 */
                iph = ip_hdr(skb);
                th = tcp_hdr(skb);

                th->check = 0;
                skb->ip_summed = CHECKSUM_PARTIAL;
                __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
        }

        return tcp_gso_segment(skb, features);
}

/*
 * tcp_gso_segment - split a TCP GSO packet into segments
 * @skb: packet to segment; its TCP header is pulled before segmentation
 * @features: feature mask handed to skb_gso_ok() and skb_segment()
 *
 * Returns:
 *  - the head of the segment list produced by skb_segment(), with the
 *    per-segment TCP sequence numbers, flags and checksums fixed up;
 *  - NULL when skb_gso_ok() accepts the packet as is (only gso_segs is
 *    recomputed in that case);
 *  - ERR_PTR(-EINVAL) when the TCP header length is bogus, the header
 *    cannot be pulled, or the payload is not larger than one MSS;
 *  - the ERR_PTR() returned by skb_segment() if segmentation fails.
 */
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
                                netdev_features_t features)
{
        struct sk_buff *segs = ERR_PTR(-EINVAL);
        unsigned int sum_truesize = 0;
        struct tcphdr *th;
        unsigned int thlen;
        unsigned int seq;
        unsigned int oldlen;
        unsigned int mss;
        struct sk_buff *gso_skb = skb;
        __sum16 newcheck;
        bool ooo_okay, copy_destructor;
        __wsum delta;

        th = tcp_hdr(skb);
        thlen = th->doff * 4;
        if (thlen < sizeof(*th))
                goto out;

        if (!pskb_may_pull(skb, thlen))
                goto out;

        /*
         * Complement of the pre-pull length; it is folded into the
         * checksum deltas computed below.
         */
        oldlen = ~skb->len;
        __skb_pull(skb, thlen);

        mss = skb_shinfo(skb)->gso_size;
        if (unlikely(skb->len <= mss))
                goto out;

        if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
                /* Packet is from an untrusted source, reset gso_segs. */

                skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);

                segs = NULL;
                goto out;
        }

        /* Remember these before skb_segment(); they are applied to the result. */
        copy_destructor = gso_skb->destructor == tcp_wfree;
        ooo_okay = gso_skb->ooo_okay;
        /* All segments but the first should have ooo_okay cleared */
        skb->ooo_okay = 0;

        segs = skb_segment(skb, features);
        if (IS_ERR(segs))
                goto out;

        /* Only first segment might have ooo_okay set */
        segs->ooo_okay = ooo_okay;

        /* GSO partial and frag_list segmentation only requires splitting
         * the frame into an MSS multiple and possibly a remainder, both
         * cases return a GSO skb. So update the mss now.
         */
        if (skb_is_gso(segs))
                mss *= skb_shinfo(segs)->gso_segs;

        /* Checksum adjustment for a segment of thlen + mss bytes. */
        delta = (__force __wsum)htonl(oldlen + thlen + mss);

        skb = segs;
        th = tcp_hdr(skb);
        seq = ntohl(th->seq);

        if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
                tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);

        /* Computed once and reused for every segment except the last. */
        newcheck = ~csum_fold(csum_add(csum_unfold(th->check), delta));

        /*
         * Every segment but the last: clear FIN/PSH and install the shared
         * checksum.  The sequence number and CWR clearing are applied to
         * the *next* segment, so the first one keeps its seq and CWR.
         */
        while (skb->next) {
                th->fin = th->psh = 0;
                th->check = newcheck;

                if (skb->ip_summed == CHECKSUM_PARTIAL)
                        gso_reset_checksum(skb, ~th->check);
                else
                        th->check = gso_make_checksum(skb, ~th->check);

                seq += mss;
                if (copy_destructor) {
                        skb->destructor = gso_skb->destructor;
                        skb->sk = gso_skb->sk;
                        sum_truesize += skb->truesize;
                }
                skb = skb->next;
                th = tcp_hdr(skb);

                th->seq = htonl(seq);
                th->cwr = 0;
        }

        /* Following permits TCP Small Queues to work well with GSO :
         * The callback to TCP stack will be called at the time last frag
         * is freed at TX completion, and not right now when gso_skb
         * is freed by GSO engine
         */
        if (copy_destructor) {
                /* Note: shadows the outer __wsum delta within this block. */
                int delta;

                swap(gso_skb->sk, skb->sk);
                swap(gso_skb->destructor, skb->destructor);
                sum_truesize += skb->truesize;
                delta = sum_truesize - gso_skb->truesize;
                /* In some pathological cases, delta can be negative.
                 * We need to either use refcount_add() or refcount_sub_and_test()
                 */
                if (likely(delta >= 0))
                        refcount_add(delta, &skb->sk->sk_wmem_alloc);
                else
                        WARN_ON_ONCE(refcount_sub_and_test(-delta, &skb->sk->sk_wmem_alloc));
        }

        /*
         * The last segment may be shorter, so its checksum delta is derived
         * from its actual length (linear part from the transport header
         * plus data_len) rather than from mss.
         */
        delta = (__force __wsum)htonl(oldlen +
                                      (skb_tail_pointer(skb) -
                                       skb_transport_header(skb)) +
                                      skb->data_len);
        th->check = ~csum_fold(csum_add(csum_unfold(th->check), delta));
        if (skb->ip_summed == CHECKSUM_PARTIAL)
                gso_reset_checksum(skb, ~th->check);
        else
                th->check = gso_make_checksum(skb, ~th->check);
out:
        return segs;
}

/*
 * Find the first packet on @head that is still marked same_flow and whose
 * TCP source/destination ports match those in @th.
 *
 * Candidates whose ports differ get their same_flow mark cleared as a
 * side effect.  Returns the matching packet, or NULL if there is none.
 */
struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
{
        struct sk_buff *cand;
        struct tcphdr *cand_th;

        list_for_each_entry(cand, head, list) {
                if (!NAPI_GRO_CB(cand)->same_flow)
                        continue;

                cand_th = tcp_hdr(cand);

                /* Both ports are compared at once as a single 32-bit word. */
                if (*(u32 *)&th->source == *(u32 *)&cand_th->source)
                        return cand;

                NAPI_GRO_CB(cand)->same_flow = 0;
        }

        return NULL;
}

/*
 * Make the complete TCP header of @skb (including any bytes beyond the
 * fixed header, as given by the data offset field) accessible at the
 * current GRO offset and advance the GRO offset past it.
 *
 * Returns a pointer to the header, or NULL if it cannot be obtained or
 * its data offset is smaller than the fixed header.
 */
struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
{
        const unsigned int off = skb_gro_offset(skb);
        unsigned int need, thlen;
        struct tcphdr *th;

        /* First make sure the fixed part is there so doff can be read. */
        need = off + sizeof(*th);
        th = skb_gro_header(skb, need, off);
        if (unlikely(!th))
                return NULL;

        thlen = th->doff * 4;
        if (thlen < sizeof(*th))
                return NULL;

        /* Then the full header; fall back to the slow path if needed. */
        need = off + thlen;
        if (!skb_gro_may_pull(skb, need)) {
                th = skb_gro_header_slow(skb, need, off);
                if (unlikely(!th))
                        return NULL;
        }

        skb_gro_pull(skb, thlen);

        return th;
}

/*
 * tcp_gro_receive - try to merge an incoming TCP segment into a held packet
 * @head: list searched by tcp_gro_lookup() for a packet of the same flow
 * @skb: incoming packet
 * @th: TCP header of @skb
 *
 * A non-zero "flush" value accumulated below means @skb must not be merged
 * into the matching packet @p.  Independently of merging, the final flush
 * decision for @skb itself is OR-ed into NAPI_GRO_CB(skb)->flush.
 *
 * Returns @p when a matching packet exists and either @skb is not marked
 * same_flow or the final flush condition holds; NULL otherwise.
 */
struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
                                struct tcphdr *th)
{
        unsigned int thlen = th->doff * 4;
        struct sk_buff *pp = NULL;
        struct sk_buff *p;
        struct tcphdr *th2;
        unsigned int len;
        __be32 flags;
        unsigned int mss = 1;
        int flush = 1;
        int i;

        len = skb_gro_len(skb);
        flags = tcp_flag_word(th);

        p = tcp_gro_lookup(head, th);
        if (!p)
                goto out_check_final;

        th2 = tcp_hdr(p);
        /* A segment carrying CWR is never merged. */
        flush = (__force int)(flags & TCP_FLAG_CWR);
        /* Flag words must agree, except for CWR, FIN and PSH. */
        flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
                  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
        flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
        /* Header bytes beyond the fixed header must be identical, word by word. */
        for (i = sizeof(*th); i < thlen; i += 4)
                flush |= *(u32 *)((u8 *)th + i) ^
                         *(u32 *)((u8 *)th2 + i);

        flush |= gro_receive_network_flush(th, th2, p);

        mss = skb_shinfo(p)->gso_size;

        /* If skb is a GRO packet, make sure its gso_size matches prior packet mss.
         * If it is a single frame, do not aggregate it if its length
         * is bigger than our mss.
         */
        if (unlikely(skb_is_gso(skb)))
                flush |= (mss != skb_shinfo(skb)->gso_size);
        else
                /* Unsigned arithmetic: also true when len is 0 (len - 1 wraps). */
                flush |= (len - 1) >= mss;

        /* @skb must start exactly where the held packet's data ends. */
        flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
        flush |= skb_cmp_decrypted(p, skb);

        if (unlikely(NAPI_GRO_CB(p)->is_flist)) {
                /* Frag-list merging is stricter: all flags must match exactly. */
                flush |= (__force int)(flags ^ tcp_flag_word(th2));
                flush |= skb->ip_summed != p->ip_summed;
                flush |= skb->csum_level != p->csum_level;
                flush |= NAPI_GRO_CB(p)->count >= 64;

                /* mss = 1 keeps the "len < mss" test below from triggering. */
                if (flush || skb_gro_receive_list(p, skb))
                        mss = 1;

                goto out_check_final;
        }

        if (flush || skb_gro_receive(p, skb)) {
                mss = 1;
                goto out_check_final;
        }

        /* Merged: carry FIN/PSH of the new segment over to the held header. */
        tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);

out_check_final:
        /* Force a flush if last segment is smaller than mss. */
        if (unlikely(skb_is_gso(skb)))
                flush = len != NAPI_GRO_CB(skb)->count * skb_shinfo(skb)->gso_size;
        else
                flush = len < mss;

        flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
                                        TCP_FLAG_RST | TCP_FLAG_SYN |
                                        TCP_FLAG_FIN));

        if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
                pp = p;

        NAPI_GRO_CB(skb)->flush |= (flush != 0);

        return pp;
}

/* Set up checksum-offload and GSO bookkeeping on @skb at GRO completion.
 *
 * For an encapsulated skb the inner transport header offset is made equal
 * to the transport header offset.  The skb is then marked CHECKSUM_PARTIAL
 * with csum_start/csum_offset pointing at the TCP header's check field,
 * gso_segs is taken from the GRO control block's count, and
 * SKB_GSO_TCP_ECN is added to gso_type when the TCP header has CWR set.
 */
void tcp_gro_complete(struct sk_buff *skb)
{
        struct skb_shared_info *si = skb_shinfo(skb);
        struct tcphdr *tcph = tcp_hdr(skb);

        if (skb->encapsulation)
                skb->inner_transport_header = skb->transport_header;

        /* Checksum is to be completed later, starting at the TCP header. */
        skb->ip_summed = CHECKSUM_PARTIAL;
        skb->csum_offset = offsetof(struct tcphdr, check);
        skb->csum_start = (unsigned char *)tcph - skb->head;

        si->gso_segs = NAPI_GRO_CB(skb)->count;
        if (tcph->cwr)
                si->gso_type |= SKB_GSO_TCP_ECN;
}
EXPORT_SYMBOL(tcp_gro_complete);

/* Choose NAPI_GRO_CB(skb)->is_flist for an incoming IPv4 TCP segment.
 *
 * Nothing is touched unless the receiving device has NETIF_F_GRO_FRAGLIST
 * set.  When tcp_gro_lookup() finds a held packet for @th, its is_flist
 * value is inherited.  Otherwise an established-socket lookup is done on
 * the segment's addresses and ports, and is_flist is set exactly when no
 * socket is found.
 */
static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
                                    struct tcphdr *th)
{
        const struct iphdr *iph;
        struct sk_buff *held;
        struct net *net;
        struct sock *sk;
        int iif, sdif;

        if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST)))
                return;

        held = tcp_gro_lookup(head, th);
        if (held) {
                NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(held)->is_flist;
                return;
        }

        net = dev_net(skb->dev);
        iph = skb_gro_network_header(skb);
        inet_get_iif_sdif(skb, &iif, &sdif);

        /* The lookup takes the destination port in host byte order. */
        sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
                                       iph->saddr, th->source,
                                       iph->daddr, ntohs(th->dest),
                                       iif, sdif);
        if (!sk) {
                NAPI_GRO_CB(skb)->is_flist = 1;
                return;
        }

        NAPI_GRO_CB(skb)->is_flist = 0;
        /* Drop the reference held on the socket the lookup returned. */
        sock_put(sk);
}

/* IPv4 TCP gro_receive callback (installed by tcpv4_offload_init()).
 *
 * Returns whatever tcp_gro_receive() returns once the checksum step and
 * the TCP header pull have succeeded.  If either fails, the skb is marked
 * for flush and NULL is returned.
 */
INDIRECT_CALLABLE_SCOPE
struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
        struct tcphdr *th = NULL;

        /* A packet already marked for flush skips checksum validation;
         * otherwise a non-zero validation result leaves th NULL so the
         * packet takes the flush path below.
         */
        if (NAPI_GRO_CB(skb)->flush ||
            !skb_gro_checksum_validate(skb, IPPROTO_TCP,
                                       inet_gro_compute_pseudo))
                th = tcp_gro_pull_header(skb);

        if (!th) {
                NAPI_GRO_CB(skb)->flush = 1;
                return NULL;
        }

        tcp4_check_fraglist_gro(head, skb, th);

        return tcp_gro_receive(head, skb, th);
}

INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
{
        const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
        const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
        struct tcphdr *th = tcp_hdr(skb);

        if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
                skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4;
                skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;

                __skb_incr_checksum_unnecessary(skb);

                return 0;
        }

        th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
                                  iph->daddr, 0);

        skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 |
                        (NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID);

        tcp_gro_complete(skb);
        return 0;
}

/* Fill net_hotdata.tcpv4_offload with the IPv4 TCP GSO/GRO callbacks and
 * register it for IPPROTO_TCP.  Members not named in the initializer are
 * zeroed by the compound literal.  Returns inet_add_offload()'s result.
 */
int __init tcpv4_offload_init(void)
{
        struct net_offload *ops = &net_hotdata.tcpv4_offload;

        *ops = (struct net_offload) {
                .callbacks.gso_segment  = tcp4_gso_segment,
                .callbacks.gro_receive  = tcp4_gro_receive,
                .callbacks.gro_complete = tcp4_gro_complete,
        };

        return inet_add_offload(ops, IPPROTO_TCP);
}



































































































































































































































































































































































































    1 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001 Intel Corp.
 *
 * This file is part of the SCTP kernel implementation
 *
 * These are definitions needed by the state machine.
 *
 * Please send any bug reports or fixes you make to the
 * email addresses:
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson <karl@athena.chicago.il.us>
 *    Xingang Guo <xingang.guo@intel.com>
 *    Jon Grimm <jgrimm@us.ibm.com>
 *    Dajiang Zhang <dajiang.zhang@nokia.com>
 *    Sridhar Samudrala <sri@us.ibm.com>
 *    Daisy Chang <daisyc@us.ibm.com>
 *    Ardelle Fan <ardelle.fan@intel.com>
 *    Kevin Gao <kevin.gao@intel.com>
 */

#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/slab.h>
#include <linux/in.h>
#include <net/sctp/command.h>
#include <net/sctp/sctp.h>

#ifndef __sctp_sm_h__
#define __sctp_sm_h__

/*
 * Dispositions a state function can report; this is the return type of
 * sctp_state_fn_t.  Values are numbered consecutively from zero.
 */
enum sctp_disposition {
        SCTP_DISPOSITION_DISCARD = 0,   /* Stop here, no further processing. */
        SCTP_DISPOSITION_CONSUME,       /* Handle return values normally. */
        SCTP_DISPOSITION_NOMEM,         /* Out of memory; needs recovery. */
        SCTP_DISPOSITION_DELETE_TCB,    /* Close the association. */
        SCTP_DISPOSITION_ABORT,         /* Close the association NOW. */
        SCTP_DISPOSITION_VIOLATION,     /* The peer is misbehaving. */
        SCTP_DISPOSITION_NOT_IMPL,      /* This entry is not implemented. */
        SCTP_DISPOSITION_ERROR,         /* Plain old user error. */
        SCTP_DISPOSITION_BUG,           /* This is a bug. */
};

/* Function type shared by all state functions declared in this header.
 * A state function receives a struct net, the endpoint, the association,
 * the event subtype, an opaque argument and a struct sctp_cmd_seq, and
 * returns an enum sctp_disposition.
 */
typedef enum sctp_disposition (sctp_state_fn_t) (
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands);
/* Function type for timer callbacks: takes a struct timer_list pointer and
 * returns nothing.  sctp_timer_events[] holds pointers of this type.
 */
typedef void (sctp_timer_event_t) (struct timer_list *);
/* Pairs a state function with a name string.  sctp_sm_lookup_event()
 * returns a pointer to one of these.
 */
struct sctp_sm_table_entry {
        sctp_state_fn_t *fn;    /* state function to invoke */
        const char *name;       /* name string stored alongside fn */
};

/* A naming convention of "sctp_sf_xxx" applies to all the state functions
 * currently in use.
 *
 * Each state function below is declared with the sctp_state_fn_t function
 * type, so they all share one signature and return an
 * enum sctp_disposition.
 */

/* Prototypes for generic state functions. */
sctp_state_fn_t sctp_sf_not_impl;
sctp_state_fn_t sctp_sf_bug;

/* Prototypes for generic timer state functions. */
sctp_state_fn_t sctp_sf_timer_ignore;

/* Prototypes for chunk state functions.
 *
 * Every name below is declared with the sctp_state_fn_t function type.
 * NOTE(review): the digits embedded in names such as sctp_sf_do_5_1B_init
 * look like SCTP specification section numbers -- confirm before relying
 * on that reading.
 */
sctp_state_fn_t sctp_sf_do_9_1_abort;
sctp_state_fn_t sctp_sf_cookie_wait_abort;
sctp_state_fn_t sctp_sf_cookie_echoed_abort;
sctp_state_fn_t sctp_sf_shutdown_pending_abort;
sctp_state_fn_t sctp_sf_shutdown_sent_abort;
sctp_state_fn_t sctp_sf_shutdown_ack_sent_abort;
sctp_state_fn_t sctp_sf_do_5_1B_init;
sctp_state_fn_t sctp_sf_do_5_1C_ack;
sctp_state_fn_t sctp_sf_do_5_1D_ce;
sctp_state_fn_t sctp_sf_do_5_1E_ca;
sctp_state_fn_t sctp_sf_do_4_C;
sctp_state_fn_t sctp_sf_eat_data_6_2;
sctp_state_fn_t sctp_sf_eat_data_fast_4_4;
sctp_state_fn_t sctp_sf_eat_sack_6_2;
sctp_state_fn_t sctp_sf_operr_notify;
sctp_state_fn_t sctp_sf_t1_init_timer_expire;
sctp_state_fn_t sctp_sf_t1_cookie_timer_expire;
sctp_state_fn_t sctp_sf_t2_timer_expire;
sctp_state_fn_t sctp_sf_t4_timer_expire;
sctp_state_fn_t sctp_sf_t5_timer_expire;
sctp_state_fn_t sctp_sf_sendbeat_8_3;
sctp_state_fn_t sctp_sf_beat_8_3;
sctp_state_fn_t sctp_sf_backbeat_8_3;
sctp_state_fn_t sctp_sf_do_9_2_final;
sctp_state_fn_t sctp_sf_do_9_2_shutdown;
sctp_state_fn_t sctp_sf_do_9_2_shut_ctsn;
sctp_state_fn_t sctp_sf_do_ecn_cwr;
sctp_state_fn_t sctp_sf_do_ecne;
sctp_state_fn_t sctp_sf_ootb;
sctp_state_fn_t sctp_sf_pdiscard;
sctp_state_fn_t sctp_sf_violation;
sctp_state_fn_t sctp_sf_discard_chunk;
sctp_state_fn_t sctp_sf_do_5_2_1_siminit;
sctp_state_fn_t sctp_sf_do_5_2_2_dupinit;
sctp_state_fn_t sctp_sf_do_5_2_3_initack;
sctp_state_fn_t sctp_sf_do_5_2_4_dupcook;
sctp_state_fn_t sctp_sf_unk_chunk;
sctp_state_fn_t sctp_sf_do_8_5_1_E_sa;
sctp_state_fn_t sctp_sf_cookie_echoed_err;
sctp_state_fn_t sctp_sf_do_asconf;
sctp_state_fn_t sctp_sf_do_asconf_ack;
sctp_state_fn_t sctp_sf_do_reconf;
sctp_state_fn_t sctp_sf_do_9_2_reshutack;
sctp_state_fn_t sctp_sf_eat_fwd_tsn;
sctp_state_fn_t sctp_sf_eat_fwd_tsn_fast;
sctp_state_fn_t sctp_sf_eat_auth;

/* Prototypes for primitive event state functions.
 *
 * All are declared with the sctp_state_fn_t function type.
 */
sctp_state_fn_t sctp_sf_do_prm_asoc;
sctp_state_fn_t sctp_sf_do_prm_send;
sctp_state_fn_t sctp_sf_do_9_2_prm_shutdown;
sctp_state_fn_t sctp_sf_cookie_wait_prm_shutdown;
sctp_state_fn_t sctp_sf_cookie_echoed_prm_shutdown;
sctp_state_fn_t sctp_sf_do_9_1_prm_abort;
sctp_state_fn_t sctp_sf_cookie_wait_prm_abort;
sctp_state_fn_t sctp_sf_cookie_echoed_prm_abort;
sctp_state_fn_t sctp_sf_shutdown_pending_prm_abort;
sctp_state_fn_t sctp_sf_shutdown_sent_prm_abort;
sctp_state_fn_t sctp_sf_shutdown_ack_sent_prm_abort;
sctp_state_fn_t sctp_sf_error_closed;
sctp_state_fn_t sctp_sf_error_shutdown;
sctp_state_fn_t sctp_sf_ignore_primitive;
sctp_state_fn_t sctp_sf_do_prm_requestheartbeat;
sctp_state_fn_t sctp_sf_do_prm_asconf;
sctp_state_fn_t sctp_sf_do_prm_reconf;

/* Prototypes for other event state functions.
 *
 * All are declared with the sctp_state_fn_t function type.
 */
sctp_state_fn_t sctp_sf_do_no_pending_tsn;
sctp_state_fn_t sctp_sf_do_9_2_start_shutdown;
sctp_state_fn_t sctp_sf_do_9_2_shutdown_ack;
sctp_state_fn_t sctp_sf_ignore_other;
sctp_state_fn_t sctp_sf_cookie_wait_icmp_abort;

/* Prototypes for timeout event state functions.
 *
 * All are declared with the sctp_state_fn_t function type.
 */
sctp_state_fn_t sctp_sf_do_6_3_3_rtx;
sctp_state_fn_t sctp_sf_send_reconf;
sctp_state_fn_t sctp_sf_send_probe;
sctp_state_fn_t sctp_sf_do_6_2_sack;
sctp_state_fn_t sctp_sf_autoclose_timer_expire;

/* Prototypes for utility support functions.  */

/* Returns a pointer to a const state-table entry (function pointer plus
 * name) for the given event type, state and event subtype.
 */
const struct sctp_sm_table_entry *sctp_sm_lookup_event(
                                        struct net *net,
                                        enum sctp_event_type event_type,
                                        enum sctp_state state,
                                        union sctp_subtype event_subtype);
/* NOTE(review): presumably yields the interface index the chunk arrived
 * on -- confirm against the definition.
 */
int sctp_chunk_iif(const struct sctp_chunk *);
/* Takes an endpoint, a chunk and allocation flags; returns an association
 * pointer.
 */
struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *,
                                             struct sctp_chunk *,
                                             gfp_t gfp);

/* Prototypes for chunk-building functions.
 *
 * Every function in this group returns a struct sctp_chunk pointer except
 * sctp_init_cause(), which returns an int and takes an existing chunk, a
 * cause code in network byte order (__be16) and a payload length.
 * NOTE(review): the failure convention (presumably NULL for the pointer
 * returns) is not visible in this header -- confirm against the
 * definitions.
 */
struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
                                  const struct sctp_bind_addr *bp,
                                  gfp_t gfp, int vparam_len);
struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
                                      const struct sctp_chunk *chunk,
                                      const gfp_t gfp, const int unkparam_len);
struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc,
                                         const struct sctp_chunk *chunk);
struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc,
                                        const struct sctp_chunk *chunk);
struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc,
                                 const __u32 lowest_tsn,
                                 const struct sctp_chunk *chunk);
struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc,
                                   __u8 flags, int paylen, gfp_t gfp);
struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc,
                                     __u32 new_cum_tsn, size_t nstreams,
                                     struct sctp_ifwdtsn_skip *skiplist);
struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc,
                                            const struct sctp_sndrcvinfo *sinfo,
                                            int len, __u8 flags, gfp_t gfp);
struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc,
                                  const __u32 lowest_tsn);
struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc);
struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc,
                                      const struct sctp_chunk *chunk);
struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc,
                                          const struct sctp_chunk *chunk);
struct sctp_chunk *sctp_make_shutdown_complete(
                                        const struct sctp_association *asoc,
                                        const struct sctp_chunk *chunk);
int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause, size_t paylen);
struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc,
                                   const struct sctp_chunk *chunk,
                                   const size_t hint);
struct sctp_chunk *sctp_make_abort_no_data(const struct sctp_association *asoc,
                                           const struct sctp_chunk *chunk,
                                           __u32 tsn);
struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc,
                                        struct msghdr *msg, size_t msg_len);
struct sctp_chunk *sctp_make_abort_violation(
                                        const struct sctp_association *asoc,
                                        const struct sctp_chunk *chunk,
                                        const __u8 *payload,
                                        const size_t paylen);
struct sctp_chunk *sctp_make_violation_paramlen(
                                        const struct sctp_association *asoc,
                                        const struct sctp_chunk *chunk,
                                        struct sctp_paramhdr *param);
struct sctp_chunk *sctp_make_violation_max_retrans(
                                        const struct sctp_association *asoc,
                                        const struct sctp_chunk *chunk);
struct sctp_chunk *sctp_make_new_encap_port(
                                        const struct sctp_association *asoc,
                                        const struct sctp_chunk *chunk);
struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc,
                                       const struct sctp_transport *transport,
                                       __u32 probe_size);
struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc,
                                           const struct sctp_chunk *chunk,
                                           const void *payload,
                                           const size_t paylen);
struct sctp_chunk *sctp_make_pad(const struct sctp_association *asoc, int len);
struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc,
                                      const struct sctp_chunk *chunk,
                                      __be16 cause_code, const void *payload,
                                      size_t paylen, size_t reserve_tail);

/* Further chunk helpers for ASCONF, FORWARD-TSN, AUTH and stream reset
 * (strreset).  The sctp_make_*() and sctp_process_asconf() functions return
 * a struct sctp_chunk pointer.  The sctp_verify_*() functions return bool
 * and take a struct sctp_paramhdr ** out-parameter (@errp).
 * NOTE(review): presumably @errp reports the offending parameter on
 * failure -- confirm against the definitions.
 */
struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc,
                                              union sctp_addr *laddr,
                                              struct sockaddr *addrs,
                                              int addrcnt, __be16 flags);
struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc,
                                             union sctp_addr *addr);
bool sctp_verify_asconf(const struct sctp_association *asoc,
                        struct sctp_chunk *chunk, bool addr_param_needed,
                        struct sctp_paramhdr **errp);
struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
                                       struct sctp_chunk *asconf);
int sctp_process_asconf_ack(struct sctp_association *asoc,
                            struct sctp_chunk *asconf_ack);
struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
                                    __u32 new_cum_tsn, size_t nstreams,
                                    struct sctp_fwdtsn_skip *skiplist);
struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc,
                                  __u16 key_id);
struct sctp_chunk *sctp_make_strreset_req(const struct sctp_association *asoc,
                                          __u16 stream_num, __be16 *stream_list,
                                          bool out, bool in);
struct sctp_chunk *sctp_make_strreset_tsnreq(
                                        const struct sctp_association *asoc);
struct sctp_chunk *sctp_make_strreset_addstrm(
                                        const struct sctp_association *asoc,
                                        __u16 out, __u16 in);
struct sctp_chunk *sctp_make_strreset_resp(const struct sctp_association *asoc,
                                           __u32 result, __u32 sn);
struct sctp_chunk *sctp_make_strreset_tsnresp(struct sctp_association *asoc,
                                              __u32 result, __u32 sn,
                                              __u32 sender_tsn,
                                              __u32 receiver_tsn);
bool sctp_verify_reconf(const struct sctp_association *asoc,
                        struct sctp_chunk *chunk,
                        struct sctp_paramhdr **errp);
/* Both take only the chunk and return nothing. */
void sctp_chunk_assign_tsn(struct sctp_chunk *chunk);
void sctp_chunk_assign_ssn(struct sctp_chunk *chunk);

/* Prototypes for stream-processing functions.
 *
 * All six share one shape: they take the association, a union sctp_params
 * passed by value and a struct sctp_ulpevent ** out-parameter (@evp), and
 * return a struct sctp_chunk pointer.
 * NOTE(review): presumably the returned chunk is the reply to send and
 * *evp an event for the upper layer -- confirm against the definitions.
 */
struct sctp_chunk *sctp_process_strreset_outreq(
                                struct sctp_association *asoc,
                                union sctp_params param,
                                struct sctp_ulpevent **evp);
struct sctp_chunk *sctp_process_strreset_inreq(
                                struct sctp_association *asoc,
                                union sctp_params param,
                                struct sctp_ulpevent **evp);
struct sctp_chunk *sctp_process_strreset_tsnreq(
                                struct sctp_association *asoc,
                                union sctp_params param,
                                struct sctp_ulpevent **evp);
struct sctp_chunk *sctp_process_strreset_addstrm_out(
                                struct sctp_association *asoc,
                                union sctp_params param,
                                struct sctp_ulpevent **evp);
struct sctp_chunk *sctp_process_strreset_addstrm_in(
                                struct sctp_association *asoc,
                                union sctp_params param,
                                struct sctp_ulpevent **evp);
struct sctp_chunk *sctp_process_strreset_resp(
                                struct sctp_association *asoc,
                                union sctp_params param,
                                struct sctp_ulpevent **evp);

/* Prototypes for statetable processing. */

/* Takes an event (type plus subtype), the current state, the endpoint and
 * association, an opaque event argument and allocation flags; returns an
 * int.
 * NOTE(review): presumably this is the state machine's entry point and the
 * int is an error code -- confirm against the definition.
 */
int sctp_do_sm(struct net *net, enum sctp_event_type event_type,
               union sctp_subtype subtype, enum sctp_state state,
               struct sctp_endpoint *ep, struct sctp_association *asoc,
               void *event_arg, gfp_t gfp);

/* 2nd level prototypes */

/* The sctp_generate_*_event() functions take a struct timer_list pointer
 * and return nothing, matching the sctp_timer_event_t function type.
 */
void sctp_generate_t3_rtx_event(struct timer_list *t);
void sctp_generate_heartbeat_event(struct timer_list *t);
void sctp_generate_reconf_event(struct timer_list *t);
void sctp_generate_probe_event(struct timer_list *t);
void sctp_generate_proto_unreach_event(struct timer_list *t);

/* Takes a struct sctp_packet pointer and returns nothing. */
void sctp_ootb_pkt_free(struct sctp_packet *packet);

/* Takes the endpoint, an association, a chunk and allocation flags, and
 * returns an association pointer.  @err and @err_chk_p are out-parameters
 * for an int and a chunk pointer respectively.
 * NOTE(review): presumably a NULL return signals failure, with details in
 * *err and *err_chk_p -- confirm against the definition.
 */
struct sctp_association *sctp_unpack_cookie(
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        struct sctp_chunk *chunk,
                                        gfp_t gfp, int *err,
                                        struct sctp_chunk **err_chk_p);

/* 3rd level prototypes */

/* Both take the endpoint and return a 32-bit value. */
__u32 sctp_generate_tag(const struct sctp_endpoint *ep);
__u32 sctp_generate_tsn(const struct sctp_endpoint *ep);

/* Extern declarations for major data structures.  */

/* Array of SCTP_NUM_TIMEOUT_TYPES timer callback pointers; defined
 * elsewhere.
 * NOTE(review): presumably indexed by timeout type -- confirm.
 */
extern sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES];


/* Payload size of a DATA chunk: the chunk header's length field (converted
 * from network byte order) minus the data chunk header length for the
 * association's stream.  The result is truncated to 16 bits.
 */
static inline __u16 sctp_data_size(struct sctp_chunk *chunk)
{
        __u16 chunk_len = ntohs(chunk->chunk_hdr->length);

        return chunk_len - sctp_datachk_len(&chunk->asoc->stream);
}

/* Compare two TSNs.
 *
 * TSN_lt(a, b) is true when the 32-bit difference a - b, reinterpreted as
 * signed, is negative.  Testing the signed difference instead of a < b
 * keeps the ordering correct when the counter wraps around, provided the
 * two values are less than 2^31 apart.  Both operands go through
 * typecheck(__u32, ...).
 */
#define TSN_lt(a,b)        \
        (typecheck(__u32, a) && \
         typecheck(__u32, b) && \
         ((__s32)((a) - (b)) < 0))

/* TSN_lte(a, b): like TSN_lt() but also true when a equals b (the signed
 * 32-bit difference a - b is zero or negative).
 */
#define TSN_lte(a,b)        \
        (typecheck(__u32, a) && \
         typecheck(__u32, b) && \
         ((__s32)((a) - (b)) <= 0))

/* Compare two MIDs.
 *
 * True when the 32-bit difference a - b, reinterpreted as signed, is
 * negative.  This is the same wraparound-tolerant comparison as TSN_lt().
 */
#define MID_lt(a, b)        \
        (typecheck(__u32, a) && \
         typecheck(__u32, b) && \
         ((__s32)((a) - (b)) < 0))

/* Compare two SSNs.
 *
 * 16-bit variant of the wraparound-tolerant comparison: both operands are
 * type-checked as __u16 and the difference a - b is reinterpreted as a
 * signed 16-bit value before being tested for negativity.
 */
#define SSN_lt(a,b)                \
        (typecheck(__u16, a) && \
         typecheck(__u16, b) && \
         ((__s16)((a) - (b)) < 0))

/* ADDIP 3.1.1
 *
 * ADDIP_SERIAL_gte(a, b) is true when a is at or after b.  The operands
 * are swapped relative to TSN_lte(): the signed 32-bit difference b - a
 * must be zero or negative.
 */
#define ADDIP_SERIAL_gte(a,b)        \
        (typecheck(__u32, a) && \
         typecheck(__u32, b) && \
         ((__s32)((b) - (a)) <= 0))

/* Verify the packet's verification tag against our own tag.
 *
 * Returns 1 when the tag in the SCTP header (converted to host byte order)
 * equals asoc->c.my_vtag, otherwise 0.  On a match, the encap_port stored
 * in the skb's input control block is also copied into chunk->transport.
 */
static inline int
sctp_vtag_verify(const struct sctp_chunk *chunk,
                 const struct sctp_association *asoc)
{
        /* RFC 2960 Sec 8.5: a received packet whose Verification Tag does
         * not match the receiver's own tag is silently discarded, so a
         * mismatch is reported as 0.
         */
        if (ntohl(chunk->sctp_hdr->vtag) == asoc->c.my_vtag) {
                chunk->transport->encap_port =
                        SCTP_INPUT_CB(chunk->skb)->encap_port;
                return 1;
        }

        return 0;
}

/* Verification-tag check for chunks that may carry the T bit.
 *
 * Returns 1 when either
 *   - the T bit is clear and the packet's tag equals our own tag, or
 *   - the T bit is set, a peer tag is known (non-zero) and the packet's
 *     tag equals the peer's tag;
 * returns 0 in every other case.
 */
static inline int
sctp_vtag_verify_either(const struct sctp_chunk *chunk,
                        const struct sctp_association *asoc)
{
        /* RFC 2960 Section 8.5.1, sctpimpguide Section 2.41, rules B and C:
         * the receiver of an ABORT or a SHUTDOWN COMPLETE accepts the
         * packet if the Verification Tag matches its own tag with the T
         * bit clear, or matches its peer's tag with the T bit set.
         * Anything else MUST be silently discarded.  Rule C also says a
         * SHUTDOWN COMPLETE is ignored outside the SHUTDOWN-ACK-SENT
         * state; that state check is not made here.
         */
        if (sctp_test_T_bit(chunk)) {
                if (!asoc->c.peer_vtag)
                        return 0;

                return ntohl(chunk->sctp_hdr->vtag) == asoc->c.peer_vtag;
        }

        return ntohl(chunk->sctp_hdr->vtag) == asoc->c.my_vtag;
}

#endif /* __sctp_sm_h__ */



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    7 






    8 








































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *  2008 Rework of the scheduler domains and CPU hotplug handling
 *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/security.h>
#include <linux/spinlock.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

/*
 * Static branch keys for cpusets; both are defined in the "false" state.
 * NOTE(review): the code that flips and tests these keys is not in this
 * part of the file -- confirm their exact meaning against <linux/cpuset.h>.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/*
 * Abnormal cpuset configurations for CPU or memory node binding can
 * exist. This key (initially false) provides a quick, low-cost way to
 * test for that situation.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);

/*
 * Frequency meter state. One instance is embedded in each struct cpuset
 * as its memory_pressure filter. See the "Frequency meter" comments, below.
 */

struct fmeter {
        int cnt;                /* unprocessed events count */
        int val;                /* most recent output value */
        time64_t time;                /* clock (secs) when val computed */
        spinlock_t lock;        /* guards read or write of above */
};

/*
 * Reason codes explaining why a partition root is invalid.
 *
 * Every code other than PERR_NONE is used as an index into perr_strings[],
 * so the numbering is written out explicitly.
 */
enum prs_errcode {
        PERR_NONE      = 0,        /* no error recorded */
        PERR_INVCPUS   = 1,        /* invalid cpu list in cpuset.cpus.exclusive */
        PERR_INVPARENT = 2,        /* parent is an invalid partition root */
        PERR_NOTPART   = 3,        /* parent is not a partition root */
        PERR_NOTEXCL   = 4,        /* cpu list in cpuset.cpus not exclusive */
        PERR_NOCPUS    = 5,        /* parent unable to distribute cpu downstream */
        PERR_HOTPLUG   = 6,        /* no cpu available due to hotplug */
        PERR_CPUSEMPTY = 7,        /* cpuset.cpus is empty */
        PERR_HKEEPING  = 8,        /* conflicts with housekeeping setup */
};

/*
 * Human-readable text for each enum prs_errcode value, indexed by the
 * error code. There is no entry for PERR_NONE, so that slot is a NULL
 * pointer; callers must not print perr_strings[PERR_NONE] as a string.
 */
static const char * const perr_strings[] = {
        [PERR_INVCPUS]   = "Invalid cpu list in cpuset.cpus.exclusive",
        [PERR_INVPARENT] = "Parent is an invalid partition root",
        [PERR_NOTPART]   = "Parent is not a partition root",
        [PERR_NOTEXCL]   = "Cpu list in cpuset.cpus not exclusive",
        [PERR_NOCPUS]    = "Parent unable to distribute cpu downstream",
        [PERR_HOTPLUG]   = "No cpu available due to hotplug",
        [PERR_CPUSEMPTY] = "cpuset.cpus is empty",
        [PERR_HKEEPING]  = "partition config conflicts with housekeeping setup",
};

/*
 * Per-cgroup cpuset state. The cgroup_subsys_state is embedded as the
 * "css" member, which lets css_cs() map a css pointer back to its cpuset
 * with container_of().
 */
struct cpuset {
        struct cgroup_subsys_state css;

        /* CS_* flag bits (cpuset_flagbits_t), accessed with bitops */
        unsigned long flags;                /* "unsigned long" so bitops work */

        /*
         * On default hierarchy:
         *
         * The user-configured masks can only be changed by writing to
         * cpuset.cpus and cpuset.mems, and won't be limited by the
         * parent masks.
         *
         * The effective masks are the real masks that apply to the tasks
         * in the cpuset. They may be changed if the configured masks are
         * changed or hotplug happens.
         *
         * effective_mask == configured_mask & parent's effective_mask,
         * and if it ends up empty, it will inherit the parent's mask.
         *
         *
         * On legacy hierarchy:
         *
         * The user-configured masks are always the same as the effective
         * masks.
         */

        /* user-configured CPUs and Memory Nodes allowed to tasks */
        cpumask_var_t cpus_allowed;
        nodemask_t mems_allowed;

        /* effective CPUs and Memory Nodes allowed to tasks */
        cpumask_var_t effective_cpus;
        nodemask_t effective_mems;

        /*
         * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
         *
         * These exclusive CPUs must be a subset of cpus_allowed. A parent
         * cgroup can only grant exclusive CPUs to one of its children.
         *
         * When the cgroup becomes a valid partition root, effective_xcpus
         * defaults to cpus_allowed if not set. The effective_cpus of a valid
         * partition root comes solely from its effective_xcpus and some of the
         * effective_xcpus may be distributed to sub-partitions below & hence
         * excluded from its effective_cpus.
         */
        cpumask_var_t effective_xcpus;

        /*
         * Exclusive CPUs as requested by the user (default hierarchy only)
         */
        cpumask_var_t exclusive_cpus;

        /*
         * These are the old Memory Nodes that tasks took on.
         *
         * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
         * - A new cpuset's old_mems_allowed is initialized when some
         *   task is moved into it.
         * - old_mems_allowed is used in cpuset_migrate_mm() when we change
         *   cpuset.mems_allowed and have tasks' nodemask updated, and
         *   then old_mems_allowed is updated to mems_allowed.
         */
        nodemask_t old_mems_allowed;

        struct fmeter fmeter;                /* memory_pressure filter */

        /*
         * Tasks are being attached to this cpuset.  Used to prevent
         * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
         */
        int attach_in_progress;

        /* partition number for rebuild_sched_domains() */
        int pn;

        /* for custom sched domain */
        int relax_domain_level;

        /* number of valid sub-partitions */
        int nr_subparts;

        /* partition root state (one of the PRS_* values) */
        int partition_root_state;

        /*
         * Default hierarchy only:
         * use_parent_ecpus - set if using parent's effective_cpus
         * child_ecpus_count - # of children with use_parent_ecpus set
         */
        int use_parent_ecpus;
        int child_ecpus_count;

        /*
         * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
         * know when to rebuild associated root domain bandwidth information.
         */
        int nr_deadline_tasks;
        int nr_migrate_dl_tasks;
        u64 sum_migrate_dl_bw;

        /* Invalid partition error code, not lock protected */
        enum prs_errcode prs_err;

        /* Handle for cpuset.cpus.partition */
        struct cgroup_file partition_file;

        /* Remote partition sibling list anchored at remote_children */
        struct list_head remote_sibling;
};

/*
 * On the legacy hierarchy the call to cgroup_transfer_tasks() is handled
 * asynchronously; this structure pairs the work item with the cpuset it
 * applies to.
 */
struct cpuset_remove_tasks_struct {
        struct work_struct work;
        struct cpuset *cs;
};

/*
 * Exclusive CPUs distributed out to sub-partitions of top_cpuset
 */
static cpumask_var_t        subpartitions_cpus;

/*
 * Exclusive CPUs in isolated partitions
 */
static cpumask_var_t        isolated_cpus;

/*
 * List of remote partition root children. Cpusets are linked onto this
 * list through their remote_sibling member.
 */
static struct list_head remote_children;

/*
 * Partition root states:
 *
 *   0 - member (not a partition root)
 *   1 - partition root
 *   2 - partition root without load balancing (isolated)
 *  -1 - invalid partition root
 *  -2 - invalid isolated partition root
 *
 * Each invalid state is the arithmetic negation of the matching valid
 * state. make_partition_invalid() negates the state to invalidate it, and
 * is_prs_invalid(), is_partition_valid() and is_partition_invalid() only
 * test the sign, so this encoding must be preserved.
 */
#define PRS_MEMBER                0
#define PRS_ROOT                1
#define PRS_ISOLATED                2
#define PRS_INVALID_ROOT        -1
#define PRS_INVALID_ISOLATED        -2

/*
 * Tell whether a raw partition root state value denotes an invalid
 * partition root, i.e. whether @prs_state is negative.
 */
static inline bool is_prs_invalid(int prs_state)
{
        const bool negative = (prs_state < 0);

        return negative;
}

/*
 * Scratch cpumasks used while working with partitions. They are passed
 * from function to function so that the inner functions do not have to
 * allocate memory themselves.
 */
struct tmpmasks {
        cpumask_var_t addmask, delmask;        /* For partition root */
        cpumask_var_t new_cpus;                /* For update_cpumasks_hier() */
};

/*
 * Map a cgroup_subsys_state back to the cpuset that embeds it.
 * A NULL @css yields a NULL cpuset.
 */
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
        if (!css)
                return NULL;

        return container_of(css, struct cpuset, css);
}

/* Look up the cpuset that @task currently belongs to. */
static inline struct cpuset *task_cs(struct task_struct *task)
{
        struct cgroup_subsys_state *task_state;

        task_state = task_css(task, cpuset_cgrp_id);
        return css_cs(task_state);
}

/*
 * Return the cpuset of @cs's parent css, or NULL when @cs has no parent
 * (css_cs() passes a NULL css through as NULL).
 */
static inline struct cpuset *parent_cs(struct cpuset *cs)
{
        struct cgroup_subsys_state *parent_state = cs->css.parent;

        return css_cs(parent_state);
}

void inc_dl_tasks_cs(struct task_struct *p)
{
        struct cpuset *cs = task_cs(p);

        cs->nr_deadline_tasks++;
}

void dec_dl_tasks_cs(struct task_struct *p)
{
        struct cpuset *cs = task_cs(p);

        cs->nr_deadline_tasks--;
}

/*
 * Bit numbers within the struct cpuset flags field. They are used as bit
 * positions with test_bit() and BIT(), so the numbering is written out
 * explicitly.
 */
typedef enum {
        CS_ONLINE             = 0,
        CS_CPU_EXCLUSIVE      = 1,
        CS_MEM_EXCLUSIVE      = 2,
        CS_MEM_HARDWALL       = 3,
        CS_MEMORY_MIGRATE     = 4,
        CS_SCHED_LOAD_BALANCE = 5,
        CS_SPREAD_PAGE        = 6,
        CS_SPREAD_SLAB        = 7,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
        return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}

/* Return non-zero if the CS_CPU_EXCLUSIVE bit is set in cs->flags. */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
        return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

/* Return non-zero if the CS_MEM_EXCLUSIVE bit is set in cs->flags. */
static inline int is_mem_exclusive(const struct cpuset *cs)
{
        return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

/* Return non-zero if the CS_MEM_HARDWALL bit is set in cs->flags. */
static inline int is_mem_hardwall(const struct cpuset *cs)
{
        return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

/* Return non-zero if the CS_SCHED_LOAD_BALANCE bit is set in cs->flags. */
static inline int is_sched_load_balance(const struct cpuset *cs)
{
        return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

/* Return non-zero if the CS_MEMORY_MIGRATE bit is set in cs->flags. */
static inline int is_memory_migrate(const struct cpuset *cs)
{
        return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

/* Return non-zero if the CS_SPREAD_PAGE bit is set in cs->flags. */
static inline int is_spread_page(const struct cpuset *cs)
{
        return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

/* Return non-zero if the CS_SPREAD_SLAB bit is set in cs->flags. */
static inline int is_spread_slab(const struct cpuset *cs)
{
        return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

/*
 * Return non-zero if @cs is a valid partition root, i.e. its
 * partition_root_state is positive (PRS_ROOT or PRS_ISOLATED).
 */
static inline int is_partition_valid(const struct cpuset *cs)
{
        return cs->partition_root_state > 0;
}

/*
 * Return non-zero if @cs is an invalid partition root, i.e. its
 * partition_root_state is negative.  A plain member (state 0) is neither
 * valid nor invalid.
 */
static inline int is_partition_invalid(const struct cpuset *cs)
{
        return cs->partition_root_state < 0;
}

/*
 * Flip a valid partition root into its invalid counterpart by negating the
 * state value.  Members and already-invalid roots are left untouched.
 *
 * Callers should hold callback_lock to modify partition_root_state.
 */
static inline void make_partition_invalid(struct cpuset *cs)
{
        int prs = cs->partition_root_state;

        if (prs <= 0)
                return;

        cs->partition_root_state = -prs;
}

/*
 * Send a notification event whenever partition_root_state differs from
 * @old_prs.  When the new state is a valid partition root, the recorded
 * error reason (prs_err) is cleared as well.
 */
static inline void notify_partition_change(struct cpuset *cs, int old_prs)
{
        if (old_prs != cs->partition_root_state) {
                cgroup_file_notify(&cs->partition_file);

                /* A valid partition root carries no error reason */
                if (is_partition_valid(cs))
                        WRITE_ONCE(cs->prs_err, PERR_NONE);
        }
}

/*
 * The root cpuset.  It is statically initialized as online, CPU- and
 * memory-exclusive, load balanced, and as a valid partition root
 * (PRS_ROOT), with an empty remote_sibling list.
 */
static struct cpuset top_cpuset = {
        .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
                 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
        .partition_root_state = PRS_ROOT,
        .relax_domain_level = -1,
        .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
};

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 *
 * The macro expands to css_for_each_child() followed by an unbraced "if"
 * that filters out offline cpusets, so the loop body becomes the body of
 * that "if".  Do not follow the loop body with an "else".
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)                \
        css_for_each_child((pos_css), &(parent_cs)->css)                \
                if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
 * iteration and the first node to be visited.
 *
 * The macro expands to css_for_each_descendant_pre() followed by an
 * unbraced "if" that filters out offline cpusets, so the loop body becomes
 * the body of that "if".  Do not follow the loop body with an "else".
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)        \
        css_for_each_descendant_pre((pos_css), &(root_cs)->css)                \
                if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock. We also require taking task_lock() when dereferencing a
 * task's cpuset pointer. See "The task_lock() exception", at the end of this
 * comment.  The cpuset code uses only cpuset_mutex. Other kernel subsystems
 * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
 * structures. Note that cpuset_mutex needs to be a mutex as it is used in
 * paths that rely on priority inheritance (e.g. scheduler - on RT) for
 * correctness.
 *
 * A task must hold both locks to modify cpusets.  If a task holds
 * cpuset_mutex, it blocks others, ensuring that it is the only task able to
 * also acquire callback_lock and be able to modify cpusets.  It can perform
 * various checks on the cpuset structure first, knowing nothing will change.
 * It can also allocate memory while just holding cpuset_mutex.  While it is
 * performing these checks, various callback routines can briefly acquire
 * callback_lock to query cpusets.  Once it is ready to make the changes, it
 * takes callback_lock, blocking everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 * by another task, so we use alloc_lock in the task_struct to protect
 * them.
 *
 * The cpuset_common_file_read() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */

static DEFINE_MUTEX(cpuset_mutex);

/*
 * Acquire cpuset_mutex.  Exported so that other kernel subsystems can
 * prevent changes to cpuset structures; pair with cpuset_unlock().
 */
void cpuset_lock(void)
{
        mutex_lock(&cpuset_mutex);
}

/* Release cpuset_mutex previously taken with cpuset_lock(). */
void cpuset_unlock(void)
{
        mutex_unlock(&cpuset_mutex);
}

/*
 * Spinlock half of the cpuset locking scheme: holding it alone gives
 * read-only access to cpusets; modifications also need cpuset_mutex.
 */
static DEFINE_SPINLOCK(callback_lock);

/* NOTE(review): presumably the workqueue used to migrate mm's - confirm at the users. */
static struct workqueue_struct *cpuset_migrate_mm_wq;

/* NOTE(review): presumably waited on until in-flight attaches finish - confirm at the users. */
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * Flag (once) a cpuset memory configuration that consists of movable-only
 * nodes.  The first time such a @nodes mask is seen, the
 * cpusets_insane_config_key static branch is enabled and a message is
 * logged; subsequent calls return immediately.
 */
static inline void check_insane_mems_config(nodemask_t *nodes)
{
        /* Already flagged, or this configuration is fine */
        if (cpusets_insane_config() || !movable_only_nodes(nodes))
                return;

        static_branch_enable(&cpusets_insane_config_key);
        pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
                "Cpuset allocations might fail even with a lot of memory available.\n",
                nodemask_pr_args(nodes));
}

/*
 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
 * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
 * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
 * With v2 behavior, "cpus" and "mems" are always what the users have
 * requested and won't be changed by hotplug events. Only the effective
 * cpus or mems will be affected.
 */
static inline bool is_in_v2_mode(void)
{
        return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
              (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}

/**
 * partition_is_populated - check if partition has tasks
 * @cs: partition root to be checked
 * @excluded_child: a child cpuset to be excluded in task checking
 * Return: true if there are tasks, false otherwise
 *
 * It is assumed that @cs is a valid partition root. @excluded_child should
 * be non-NULL when this cpuset is going to become a partition itself.
 *
 * Tasks attached directly to @cs always count.  Otherwise, children that
 * are valid partition roots themselves, and @excluded_child, are ignored
 * when looking for populated child cgroups.
 */
static inline bool partition_is_populated(struct cpuset *cs,
                                          struct cpuset *excluded_child)
{
        struct cgroup_subsys_state *pos;
        struct cpuset *child;
        bool populated = false;

        if (cs->css.cgroup->nr_populated_csets)
                return true;

        /* Nothing to exclude: the cgroup-wide answer is good enough */
        if (!excluded_child && !cs->nr_subparts)
                return cgroup_is_populated(cs->css.cgroup);

        rcu_read_lock();
        cpuset_for_each_child(child, pos, cs) {
                if (child != excluded_child &&
                    !is_partition_valid(child) &&
                    cgroup_is_populated(child->css.cgroup)) {
                        populated = true;
                        break;
                }
        }
        rcu_read_unlock();

        return populated;
}

/*
 * Return in pmask the portion of a task's cpusets's cpus_allowed that
 * are online and are capable of running the task.  If none are found,
 * walk up the cpuset hierarchy until we find one that does have some
 * appropriate cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_cpus(struct task_struct *tsk,
                                  struct cpumask *pmask)
{
        const struct cpumask *task_possible = task_cpu_possible_mask(tsk);
        struct cpuset *cs;

        /*
         * Start from the online CPUs the task can run on; warn and fall
         * back to all online CPUs if that intersection is empty.
         */
        if (WARN_ON(!cpumask_and(pmask, task_possible, cpu_online_mask)))
                cpumask_copy(pmask, cpu_online_mask);

        rcu_read_lock();

        /* Climb towards the root until some effective CPU is usable */
        for (cs = task_cs(tsk);
             !cpumask_intersects(cs->effective_cpus, pmask);
             cs = parent_cs(cs))
                ;

        cpumask_and(pmask, pmask, cs->effective_cpus);
        rcu_read_unlock();
}

/*
 * Return in *pmask the portion of a cpusets's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
        /* Climb towards the root until a node with memory is found */
        for (;
             !nodes_intersects(cs->effective_mems, node_states[N_MEMORY]);
             cs = parent_cs(cs))
                ;

        nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

/*
 * Make a task's page/slab spread flags mirror those of its cpuset: each
 * flag is set on @tsk when set on @cs and cleared otherwise.
 *
 * Call with callback_lock or cpuset_mutex held. The check can be skipped
 * if on default hierarchy.
 */
static void cpuset_update_task_spread_flags(struct cpuset *cs,
                                        struct task_struct *tsk)
{
        if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
                return;

        if (!is_spread_page(cs))
                task_clear_spread_page(tsk);
        else
                task_set_spread_page(tsk);

        if (!is_spread_slab(cs))
                task_clear_spread_slab(tsk);
        else
                task_set_spread_slab(tsk);
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 *
 * Returns 1 if p is a subset of q, 0 otherwise.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
        if (!cpumask_subset(p->cpus_allowed, q->cpus_allowed))
                return 0;

        if (!nodes_subset(p->mems_allowed, q->mems_allowed))
                return 0;

        /* p may be exclusive only if q is */
        if (is_cpu_exclusive(p) > is_cpu_exclusive(q))
                return 0;

        return is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_cpumasks - allocate the cpumasks of a cpuset or a tmpmasks structure
 * @cs:  the cpuset whose cpumasks are to be allocated.
 * @tmp: the tmpmasks structure pointer
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * Only one of the two input arguments should be non-NULL.  For a cpuset,
 * four zeroed masks are allocated (cpus_allowed, effective_cpus,
 * effective_xcpus and exclusive_cpus); for a tmpmasks structure, three
 * (new_cpus, addmask and delmask).  On failure, every mask allocated so
 * far is freed again.
 */
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
        cpumask_var_t *masks[4];
        int nr_masks, i;

        if (cs) {
                masks[0] = &cs->cpus_allowed;
                masks[1] = &cs->effective_cpus;
                masks[2] = &cs->effective_xcpus;
                masks[3] = &cs->exclusive_cpus;
                nr_masks = 4;
        } else {
                masks[0] = &tmp->new_cpus;
                masks[1] = &tmp->addmask;
                masks[2] = &tmp->delmask;
                nr_masks = 3;
        }

        for (i = 0; i < nr_masks; i++) {
                if (!zalloc_cpumask_var(masks[i], GFP_KERNEL))
                        goto unwind;
        }

        return 0;

unwind:
        /* Release, in reverse order, the masks that were allocated */
        while (--i >= 0)
                free_cpumask_var(*masks[i]);
        return -ENOMEM;
}

/**
 * free_cpumasks - free cpumasks in a cpuset and/or a tmpmasks structure
 * @cs:  the cpuset whose cpumasks are to be freed (may be NULL).
 * @tmp: the tmpmasks structure pointer (may be NULL)
 *
 * Frees the masks that alloc_cpumasks() allocates for each non-NULL
 * argument.  The @cs and @tmp structures themselves are not freed.
 */
static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
        if (cs) {
                free_cpumask_var(cs->cpus_allowed);
                free_cpumask_var(cs->effective_cpus);
                free_cpumask_var(cs->effective_xcpus);
                free_cpumask_var(cs->exclusive_cpus);
        }
        if (tmp) {
                free_cpumask_var(tmp->new_cpus);
                free_cpumask_var(tmp->addmask);
                free_cpumask_var(tmp->delmask);
        }
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 *
 * The trial cpuset is a byte copy of @cs that owns freshly allocated
 * cpumasks holding copies of @cs's masks.
 *
 * Return: the trial cpuset, or NULL if memory allocation failed.
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
        struct cpuset *copy;

        copy = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
        if (copy && alloc_cpumasks(copy, NULL)) {
                kfree(copy);
                copy = NULL;
        }

        if (copy) {
                /* kmemdup() copied the mask pointers, not their contents */
                cpumask_copy(copy->cpus_allowed, cs->cpus_allowed);
                cpumask_copy(copy->effective_cpus, cs->effective_cpus);
                cpumask_copy(copy->effective_xcpus, cs->effective_xcpus);
                cpumask_copy(copy->exclusive_cpus, cs->exclusive_cpus);
        }

        return copy;
}

/**
 * free_cpuset - free the cpuset
 * @cs: the cpuset to be freed
 *
 * Frees the cpumasks owned by @cs and then the cpuset structure itself.
 */
static inline void free_cpuset(struct cpuset *cs)
{
        free_cpumasks(cs, NULL);
        kfree(cs);
}

/*
 * Pick the cpumask used for exclusivity checks on @cs: exclusive_cpus if
 * non-empty, else effective_xcpus if non-empty, else cpus_allowed.
 */
static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
{
        if (!cpumask_empty(cs->exclusive_cpus))
                return cs->exclusive_cpus;

        if (!cpumask_empty(cs->effective_xcpus))
                return cs->effective_xcpus;

        return cs->cpus_allowed;
}

/*
 * cpusets_are_exclusive() - check if two cpusets are exclusive
 *
 * Two cpusets are exclusive of each other when the masks selected by
 * fetch_xcpus() for each of them do not intersect.
 *
 * Return true if exclusive, false if not
 */
static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
{
        return !cpumask_intersects(fetch_xcpus(cs1), fetch_xcpus(cs2));
}

/*
 * validate_change_legacy() - Validate conditions specific to legacy (v1)
 *                            behavior.
 */
static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
{
        struct cgroup_subsys_state *css;
        struct cpuset *c, *par;
        int ret;

        WARN_ON_ONCE(!rcu_read_lock_held());

        /* Each of our child cpusets must be a subset of us */
        ret = -EBUSY;
        cpuset_for_each_child(c, css, cur)
                if (!is_cpuset_subset(c, trial))
                        goto out;

        /* On legacy hierarchy, we must be a subset of our parent cpuset. */
        ret = -EACCES;
        par = parent_cs(cur);
        if (par && !is_cpuset_subset(trial, par))
                goto out;

        ret = 0;
out:
        return ret;
}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *                       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */

static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
        struct cgroup_subsys_state *css;
        struct cpuset *c, *par;
        int ret = 0;

        rcu_read_lock();

        if (!is_in_v2_mode())
                ret = validate_change_legacy(cur, trial);
        if (ret)
                goto out;

        /* Remaining checks don't apply to root cpuset */
        if (cur == &top_cpuset)
                goto out;

        par = parent_cs(cur);

        /*
         * Cpusets with tasks - existing or newly being attached - can't
         * be changed to have empty cpus_allowed or mems_allowed.
         */
        ret = -ENOSPC;
        if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
                if (!cpumask_empty(cur->cpus_allowed) &&
                    cpumask_empty(trial->cpus_allowed))
                        goto out;
                if (!nodes_empty(cur->mems_allowed) &&
                    nodes_empty(trial->mems_allowed))
                        goto out;
        }

        /*
         * We can't shrink if we won't have enough room for SCHED_DEADLINE
         * tasks.
         */
        ret = -EBUSY;
        if (is_cpu_exclusive(cur) &&
            !cpuset_cpumask_can_shrink(cur->cpus_allowed,
                                       trial->cpus_allowed))
                goto out;

        /*
         * If either I or some sibling (!= me) is exclusive, we can't
         * overlap
         */
        ret = -EINVAL;
        cpuset_for_each_child(c, css, par) {
                if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
                    c != cur) {
                        if (!cpusets_are_exclusive(trial, c))
                                goto out;
                }
                if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
                    c != cur &&
                    nodes_intersects(trial->mems_allowed, c->mems_allowed))
                        goto out;
        }

        ret = 0;
out:
        rcu_read_unlock();
        return ret;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 *
 * Returns non-zero if the effective_cpus masks of @a and @b intersect.
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
        return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}

/*
 * Raise @dattr's relax_domain_level to that of cpuset @c if the cpuset's
 * level is higher; otherwise leave @dattr unchanged.
 */
static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
        if (c->relax_domain_level > dattr->relax_domain_level)
                dattr->relax_domain_level = c->relax_domain_level;
}

/*
 * Fold the relax_domain_level of every load-balanced cpuset in the subtree
 * rooted at @root_cs (root included) into @dattr.  Subtrees below a cpuset
 * with empty cpus_allowed are skipped.
 */
static void update_domain_attr_tree(struct sched_domain_attr *dattr,
                                    struct cpuset *root_cs)
{
        struct cgroup_subsys_state *pos;
        struct cpuset *des;

        rcu_read_lock();
        cpuset_for_each_descendant_pre(des, pos, root_cs) {
                if (cpumask_empty(des->cpus_allowed)) {
                        /* skip the whole subtree if @des doesn't have any CPU */
                        pos = css_rightmost_descendant(pos);
                } else if (is_sched_load_balance(des)) {
                        update_domain_attr(dattr, des);
                }
        }
        rcu_read_unlock();
}

/*
 * Return the number of cpusets, derived from the cpusets_enabled_key
 * reference count plus one for the top-level cpuset.
 * generate_sched_domains() uses it to size its cpuset pointer array.
 *
 * Must be called with cpuset_mutex held.
 */
static inline int nr_cpusets(void)
{
        /* jump label reference count + the top-level cpuset */
        return static_key_count(&cpusets_enabled_key.key) + 1;
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the systems CPUs
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to kernel/sched/core.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cpuset_mutex held.
 *
 * The three key local variables below are:
 *    cp - cpuset pointer, used (together with pos_css) to perform a
 *           top-down scan of all cpusets. For our purposes, rebuilding
 *           the schedulers sched domains, we can ignore !is_sched_load_
 *           balance cpusets.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *           that need to be load balanced, for convenient iterative
 *           access by the subsequent code that finds the best partition,
 *           i.e the set of domains (subsets) of CPUs such that the
 *           cpus_allowed of every cpuset marked is_sched_load_balance
 *           is a subset of one of these domains, while there are as
 *           many such domains as possible, each as small as possible.
 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 *           the kernel/sched/core.c routine partition_sched_domains() in a
 *           convenient format, that can be easily compared to the prior
 *           value to determine what partition elements (sched domains)
 *           were changed (added or removed.)
 *
 * Finding the best partition (set of domains):
 *        The triple nested loops below over i, j, k scan over the
 *        load balanced cpusets (using the array of cpuset pointers in
 *        csa[]) looking for pairs of cpusets that have overlapping
 *        cpus_allowed, but which don't have the same 'pn' partition
 *        number, and gives them the same partition number.  It keeps
 *        looping on the 'restart' label until it can no longer find
 *        any such pairs.
 *
 *        The union of the cpus_allowed masks from the set of
 *        all cpusets having the same 'pn' value then form the one
 *        element of the partition (one sched domain) to be passed to
 *        partition_sched_domains().
 */
static int generate_sched_domains(cpumask_var_t **domains,
                        struct sched_domain_attr **attributes)
{
        struct cpuset *cp;        /* top-down scan of cpusets */
        struct cpuset **csa;        /* array of all cpuset ptrs */
        int csn;                /* how many cpuset ptrs in csa so far */
        int i, j, k;                /* indices for partition finding loops */
        cpumask_var_t *doms;        /* resulting partition; i.e. sched domains */
        struct sched_domain_attr *dattr;  /* attributes for custom domains */
        int ndoms = 0;                /* number of sched domains in result */
        int nslot;                /* next empty doms[] struct cpumask slot */
        struct cgroup_subsys_state *pos_css;
        bool root_load_balance = is_sched_load_balance(&top_cpuset);

        doms = NULL;
        dattr = NULL;
        csa = NULL;

        /* Special case for the 99% of systems with one, full, sched domain */
        if (root_load_balance && !top_cpuset.nr_subparts) {
                ndoms = 1;
                doms = alloc_sched_domains(ndoms);
                if (!doms)
                        goto done;

                dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
                if (dattr) {
                        *dattr = SD_ATTR_INIT;
                        update_domain_attr_tree(dattr, &top_cpuset);
                }
                cpumask_and(doms[0], top_cpuset.effective_cpus,
                            housekeeping_cpumask(HK_TYPE_DOMAIN));

                goto done;
        }

        csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
        if (!csa)
                goto done;
        csn = 0;

        rcu_read_lock();
        if (root_load_balance)
                csa[csn++] = &top_cpuset;
        cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
                if (cp == &top_cpuset)
                        continue;
                /*
                 * Continue traversing beyond @cp iff @cp has some CPUs and
                 * isn't load balancing.  The former is obvious.  The
                 * latter: All child cpusets contain a subset of the
                 * parent's cpus, so just skip them, and then we call
                 * update_domain_attr_tree() to calc relax_domain_level of
                 * the corresponding sched domain.
                 *
                 * If root is load-balancing, we can skip @cp if it
                 * is a subset of the root's effective_cpus.
                 */
                if (!cpumask_empty(cp->cpus_allowed) &&
                    !(is_sched_load_balance(cp) &&
                      cpumask_intersects(cp->cpus_allowed,
                                         housekeeping_cpumask(HK_TYPE_DOMAIN))))
                        continue;

                if (root_load_balance &&
                    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
                        continue;

                if (is_sched_load_balance(cp) &&
                    !cpumask_empty(cp->effective_cpus))
                        csa[csn++] = cp;

                /* skip @cp's subtree if not a partition root */
                if (!is_partition_valid(cp))
                        pos_css = css_rightmost_descendant(pos_css);
        }
        rcu_read_unlock();

        /* Start with every collected cpuset in its own partition */
        for (i = 0; i < csn; i++)
                csa[i]->pn = i;
        ndoms = csn;

restart:
        /* Find the best partition (set of sched domains) */
        for (i = 0; i < csn; i++) {
                struct cpuset *a = csa[i];
                int apn = a->pn;

                for (j = 0; j < csn; j++) {
                        struct cpuset *b = csa[j];
                        int bpn = b->pn;

                        if (apn != bpn && cpusets_overlap(a, b)) {
                                /* Merge b's partition into a's */
                                for (k = 0; k < csn; k++) {
                                        struct cpuset *c = csa[k];

                                        if (c->pn == bpn)
                                                c->pn = apn;
                                }
                                ndoms--;        /* one less element */
                                goto restart;
                        }
                }
        }

        /*
         * Now we know how many domains to create.
         * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
         */
        doms = alloc_sched_domains(ndoms);
        if (!doms)
                goto done;

        /*
         * The rest of the code, including the scheduler, can deal with
         * dattr==NULL case. No need to abort if alloc fails.
         */
        dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
                              GFP_KERNEL);

        for (nslot = 0, i = 0; i < csn; i++) {
                struct cpuset *a = csa[i];
                struct cpumask *dp;
                int apn = a->pn;

                if (apn < 0) {
                        /* Skip completed partitions */
                        continue;
                }

                /*
                 * Validate the slot index before touching doms[]: the
                 * array has exactly ndoms entries, so doms[ndoms] must
                 * not be read.
                 */
                if (nslot == ndoms) {
                        static int warnings = 10;
                        if (warnings) {
                                pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
                                        nslot, ndoms, csn, i, apn);
                                warnings--;
                        }
                        continue;
                }

                dp = doms[nslot];

                cpumask_clear(dp);
                if (dattr)
                        *(dattr + nslot) = SD_ATTR_INIT;
                for (j = i; j < csn; j++) {
                        struct cpuset *b = csa[j];

                        if (apn == b->pn) {
                                cpumask_or(dp, dp, b->effective_cpus);
                                cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
                                if (dattr)
                                        update_domain_attr_tree(dattr + nslot, b);

                                /* Done with this partition */
                                b->pn = -1;
                        }
                }
                nslot++;
        }
        BUG_ON(nslot != ndoms);

done:
        kfree(csa);

        /*
         * Fallback to the default domain if kmalloc() failed.
         * See comments in partition_sched_domains().
         */
        if (doms == NULL)
                ndoms = 1;

        *domains    = doms;
        *attributes = dattr;
        return ndoms;
}

static void dl_update_tasks_root_domain(struct cpuset *cs)
{
        struct css_task_iter it;
        struct task_struct *task;

        if (cs->nr_deadline_tasks == 0)
                return;

        css_task_iter_start(&cs->css, 0, &it);

        while ((task = css_task_iter_next(&it)))
                dl_add_task_root_domain(task);

        css_task_iter_end(&it);
}

/*
 * Recompute the root-domain accounting of deadline tasks: clear the
 * default root domain's accounting, then walk every online cpuset that has
 * effective CPUs and re-add its tasks via dl_update_tasks_root_domain().
 *
 * Must be called with cpuset_mutex, the CPU hotplug lock and
 * sched_domains_mutex held.
 */
static void dl_rebuild_rd_accounting(void)
{
        struct cpuset *cs = NULL;
        struct cgroup_subsys_state *pos_css;

        lockdep_assert_held(&cpuset_mutex);
        lockdep_assert_cpus_held();
        lockdep_assert_held(&sched_domains_mutex);

        rcu_read_lock();

        /*
         * Clear default root domain DL accounting, it will be computed again
         * if a task belongs to it.
         */
        dl_clear_root_domain(&def_root_domain);

        cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {

                /* No effective CPUs: skip this cpuset and its whole subtree */
                if (cpumask_empty(cs->effective_cpus)) {
                        pos_css = css_rightmost_descendant(pos_css);
                        continue;
                }

                /*
                 * Hold a reference on the css across the window where the
                 * RCU read lock is dropped for the task walk.
                 * NOTE(review): presumably the task iteration cannot run
                 * inside an RCU read-side section - confirm.
                 */
                css_get(&cs->css);

                rcu_read_unlock();

                dl_update_tasks_root_domain(cs);

                rcu_read_lock();
                css_put(&cs->css);
        }
        rcu_read_unlock();
}

/*
 * Under sched_domains_mutex, hand the new domain layout to
 * partition_sched_domains_locked() and then rebuild the deadline
 * root-domain accounting.  The caller must hold cpuset_mutex and the CPU
 * hotplug lock, which dl_rebuild_rd_accounting() asserts.
 */
static void
partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
                                    struct sched_domain_attr *dattr_new)
{
        mutex_lock(&sched_domains_mutex);
        partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
        dl_rebuild_rd_accounting();
        mutex_unlock(&sched_domains_mutex);
}

/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_mutex held.  Takes cpus_read_lock().
 */
static void rebuild_sched_domains_locked(void)
{
        struct cgroup_subsys_state *pos_css;
        struct sched_domain_attr *attr;
        cpumask_var_t *doms;
        struct cpuset *cs;
        int ndoms;

        lockdep_assert_cpus_held();
        lockdep_assert_held(&cpuset_mutex);

        /*
         * If we have raced with CPU hotplug, return early to avoid
         * passing doms with offlined cpu to partition_sched_domains().
         * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
         *
         * With no CPUs in any subpartitions, top_cpuset's effective CPUs
         * should be the same as the active CPUs, so checking only top_cpuset
         * is enough to detect racing CPU offlines.
         */
        if (cpumask_empty(subpartitions_cpus) &&
            !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
                return;

        /*
         * With subpartition CPUs, however, the effective CPUs of a partition
         * root should be only a subset of the active CPUs.  Since a CPU in any
         * partition root could be offlined, all must be checked.
         */
        if (top_cpuset.nr_subparts) {
                rcu_read_lock();
                cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
                        if (!is_partition_valid(cs)) {
                                pos_css = css_rightmost_descendant(pos_css);
                                continue;
                        }
                        if (!cpumask_subset(cs->effective_cpus,
                                            cpu_active_mask)) {
                                rcu_read_unlock();
                                return;
                        }
                }
                rcu_read_unlock();
        }

        /* Generate domain masks and attrs */
        ndoms = generate_sched_domains(&doms, &attr);

        /* Have scheduler rebuild the domains */
        partition_and_rebuild_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
/* !CONFIG_SMP stub: intentionally does nothing. */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

/*
 * Rebuild the sched domains while holding cpuset_mutex.
 *
 * Only cpuset_mutex is taken here; the public wrapper
 * rebuild_sched_domains() takes the CPU hotplug read lock around this call.
 */
static void rebuild_sched_domains_cpuslocked(void)
{
        mutex_lock(&cpuset_mutex);
        rebuild_sched_domains_locked();
        mutex_unlock(&cpuset_mutex);
}

/*
 * rebuild_sched_domains - rebuild the sched domains from scratch
 *
 * Takes the CPU hotplug read lock and then, through
 * rebuild_sched_domains_cpuslocked(), cpuset_mutex before doing the rebuild.
 */
void rebuild_sched_domains(void)
{
        cpus_read_lock();
        rebuild_sched_domains_cpuslocked();
        cpus_read_unlock();
}

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 * @new_cpus: the temp variable for the new effective_cpus mask
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask()
 * is used instead of effective_cpus to make sure all offline CPUs are also
 * included as hotplug code won't update cpumasks for tasks in top_cpuset.
 */
static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{
        struct css_task_iter it;
        struct task_struct *task;
        bool top_cs = cs == &top_cpuset;

        css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it))) {
                const struct cpumask *possible_mask = task_cpu_possible_mask(task);

                if (top_cs) {
                        /*
                         * Percpu kthreads in top_cpuset are ignored
                         */
                        if (kthread_is_per_cpu(task))
                                continue;
                        cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
                } else {
                        cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
                }
                set_cpus_allowed_ptr(task, new_cpus);
        }
        css_task_iter_end(&it);
}

/**
 * compute_effective_cpumask - Compute the effective cpumask of the cpuset
 * @new_cpus: output mask receiving the new effective_cpus value
 * @cs: the cpuset whose effective_cpus mask needs to be recomputed
 * @parent: the parent cpuset
 *
 * @new_cpus is set to the intersection of @cs->cpus_allowed and
 * @parent->effective_cpus.
 *
 * The result is valid only if the given cpuset isn't a partition root.
 */
static void compute_effective_cpumask(struct cpumask *new_cpus,
                                      struct cpuset *cs, struct cpuset *parent)
{
        cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
}

/*
 * Commands for update_parent_effective_cpumask()
 *
 * partcmd_enable/partcmd_enablei turn a non-partition root into a (regular
 * or isolated) partition root, partcmd_disable turns it back into a member,
 * partcmd_update re-evaluates an existing valid or invalid partition root
 * and partcmd_invalidate forces the partition to become invalid.
 */
enum partition_cmd {
        partcmd_enable,                /* Enable partition root          */
        partcmd_enablei,        /* Enable isolated partition root */
        partcmd_disable,        /* Disable partition root          */
        partcmd_update,                /* Update parent's effective_cpus */
        partcmd_invalidate,        /* Make partition invalid          */
};

/*
 * Forward declarations of static helpers that the partition code below
 * calls before their definitions appear.
 */
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
                       int turning_on);
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
                                    struct tmpmasks *tmp);

/*
 * Bring CS_CPU_EXCLUSIVE in line with the partition root state
 *
 * A valid partition root (@new_prs > 0) must be CPU exclusive; any other
 * state must not carry the flag.  Nothing is done when the flag already has
 * the desired value.
 *
 * Return: 0 if successful, PERR_NOTEXCL if the flag could not be turned on
 */
static int update_partition_exclusive(struct cpuset *cs, int new_prs)
{
        bool want_excl = (new_prs > 0);
        bool have_excl = is_cpu_exclusive(cs);

        if (want_excl == have_excl)
                return 0;

        if (!want_excl) {
                /* Turning off CS_CPU_EXCLUSIVE will not return error */
                update_flag(CS_CPU_EXCLUSIVE, cs, 0);
                return 0;
        }

        return update_flag(CS_CPU_EXCLUSIVE, cs, 1) ? PERR_NOTEXCL : 0;
}

/*
 * Update partition load balance flag and/or rebuild sched domain
 *
 * @cs:      cpuset whose partition_root_state has already been updated
 * @old_prs: partition root state of @cs before the update
 *
 * The sched domains are rebuilt via rebuild_sched_domains_locked() if @cs
 * is, or used to be, a valid partition root, or if its load balance flag
 * had to be flipped.
 * This function is for cgroup v2 only.
 */
static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
{
        int cur_prs = cs->partition_root_state;
        bool need_rebuild = (cur_prs > 0) || (old_prs > 0);
        bool want_lb;

        /*
         * A valid partition root balances load unless it is isolated.
         * Anything else simply follows its parent.
         */
        if (cur_prs > 0)
                want_lb = (cur_prs != PRS_ISOLATED);
        else
                want_lb = !!is_sched_load_balance(parent_cs(cs));

        if (want_lb != !!is_sched_load_balance(cs)) {
                if (want_lb)
                        set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
                else
                        clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
                need_rebuild = true;
        }

        if (!need_rebuild)
                return;

        rebuild_sched_domains_locked();
}

/*
 * tasks_nocpu_error - Return true if tasks will have no effective_cpus
 */
static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
                              struct cpumask *xcpus)
{
        /*
         * A populated partition (cs or parent) can't have empty effective_cpus
         */
        return (cpumask_subset(parent->effective_cpus, xcpus) &&
                partition_is_populated(parent, cs)) ||
               (!cpumask_intersects(xcpus, cpu_active_mask) &&
                partition_is_populated(cs, NULL));
}

/*
 * reset_partition_data - reset partition related fields of a cpuset
 * @cs: cpuset that is no longer a valid partition root
 *
 * Does nothing unless the cpuset controller is on the default hierarchy.
 * Otherwise callback_lock must be held.
 *
 * nr_subparts is cleared.  If exclusive_cpus is not set, effective_xcpus and
 * CS_CPU_EXCLUSIVE are cleared as well.  effective_cpus is recomputed from
 * the parent; if that leaves it empty, the parent's effective_cpus is
 * inherited and accounted for in child_ecpus_count.
 */
static void reset_partition_data(struct cpuset *cs)
{
        struct cpuset *parent = parent_cs(cs);

        if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
                return;

        lockdep_assert_held(&callback_lock);

        cs->nr_subparts = 0;

        if (cpumask_empty(cs->exclusive_cpus)) {
                cpumask_clear(cs->effective_xcpus);
                if (is_cpu_exclusive(cs))
                        clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
        }

        if (cpumask_and(cs->effective_cpus,
                        parent->effective_cpus, cs->cpus_allowed))
                return;

        /* Nothing left of our own: fall back to the parent's CPUs */
        cpumask_copy(cs->effective_cpus, parent->effective_cpus);
        cs->use_parent_ecpus = true;
        parent->child_ecpus_count++;
}

/*
 * partition_xcpus_newstate - Exclusive CPUs state change
 * @old_prs: old partition_root_state
 * @new_prs: new partition_root_state
 * @xcpus: exclusive CPUs with state change
 *
 * @xcpus are added to isolated_cpus when the new state is PRS_ISOLATED and
 * removed from it for any other new state.  Callers are expected to pass
 * different old and new states.
 */
static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus)
{
        WARN_ON_ONCE(old_prs == new_prs);

        if (new_prs != PRS_ISOLATED) {
                cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
                return;
        }

        cpumask_or(isolated_cpus, isolated_cpus, xcpus);
}

/*
 * partition_xcpus_add - Add new exclusive CPUs to partition
 * @new_prs: new partition_root_state
 * @parent: parent cpuset
 * @xcpus: exclusive CPUs to be added
 * Return: true if isolated_cpus modified, false otherwise
 *
 * Remote partition if parent == NULL, in which case the CPUs are taken
 * from top_cpuset.  @xcpus are removed from the parent's effective_cpus
 * and, when the parent is top_cpuset, recorded in subpartitions_cpus.
 * callback_lock must be held.
 */
static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
                                struct cpumask *xcpus)
{
        int parent_prs;
        bool state_differs;

        WARN_ON_ONCE(new_prs < 0);
        lockdep_assert_held(&callback_lock);

        if (!parent)
                parent = &top_cpuset;

        if (parent == &top_cpuset)
                cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);

        /* isolated_cpus only changes if the state differs from the parent's */
        parent_prs = parent->partition_root_state;
        state_differs = (new_prs != parent_prs);
        if (state_differs)
                partition_xcpus_newstate(parent_prs, new_prs, xcpus);

        cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
        return state_differs;
}

/*
 * partition_xcpus_del - Remove exclusive CPUs from partition
 * @old_prs: old partition_root_state
 * @parent: parent cpuset
 * @xcpus: exclusive CPUs to be removed
 * Return: true if isolated_cpus modified, false otherwise
 *
 * Remote partition if parent == NULL, in which case the CPUs are given
 * back to top_cpuset.  Only the active CPUs of @xcpus are returned to the
 * parent's effective_cpus; note that @xcpus itself is trimmed to those
 * active CPUs as a side effect.  callback_lock must be held.
 */
static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
                                struct cpumask *xcpus)
{
        int parent_prs;
        bool state_differs;

        WARN_ON_ONCE(old_prs < 0);
        lockdep_assert_held(&callback_lock);

        if (!parent)
                parent = &top_cpuset;

        if (parent == &top_cpuset)
                cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);

        /* The CPUs take over the parent's state once they are returned */
        parent_prs = parent->partition_root_state;
        state_differs = (old_prs != parent_prs);
        if (state_differs)
                partition_xcpus_newstate(old_prs, parent_prs, xcpus);

        cpumask_and(xcpus, xcpus, cpu_active_mask);
        cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
        return state_differs;
}

/*
 * update_unbound_workqueue_cpumask - exclude isolated CPUs from unbound wqs
 * @isolcpus_updated: true if isolated_cpus has been modified
 *
 * If isolated_cpus changed, pass the current mask to
 * workqueue_unbound_exclude_cpumask() and warn if that fails.  The CPU
 * hotplug lock must be held.
 */
static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
{
        lockdep_assert_cpus_held();

        if (isolcpus_updated) {
                int err = workqueue_unbound_exclude_cpumask(isolated_cpus);

                WARN_ON_ONCE(err < 0);
        }
}

/**
 * cpuset_cpu_is_isolated - Check if the given CPU is isolated
 * @cpu: the CPU number to be checked
 * Return: true if CPU is used in an isolated partition, false otherwise
 *
 * This is a plain bit test on isolated_cpus; no lock is taken here.
 */
bool cpuset_cpu_is_isolated(int cpu)
{
        return cpumask_test_cpu(cpu, isolated_cpus);
}
EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);

/*
 * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
 * @cs: cpuset
 * @xcpus: effective exclusive CPUs value to be set; if NULL, the result is
 *         written to @cs->effective_xcpus instead
 * Return: true if xcpus is not empty, false otherwise.
 *
 * The starting point is exclusive_cpus restricted to cpus_allowed, or just
 * cpus_allowed if exclusive_cpus is not set.  The result is then limited to
 * the parent's effective_xcpus.
 */
static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
                                                struct cpumask *xcpus)
{
        struct cpumask *dst = xcpus ? xcpus : cs->effective_xcpus;
        struct cpuset *parent = parent_cs(cs);

        if (cpumask_empty(cs->exclusive_cpus))
                cpumask_copy(dst, cs->cpus_allowed);
        else
                cpumask_and(dst, cs->exclusive_cpus, cs->cpus_allowed);

        return cpumask_and(dst, dst, parent->effective_xcpus);
}

/*
 * A cpuset is a remote partition while its remote_sibling node is linked
 * into a list (remote_partition_enable() adds it to remote_children).
 */
static inline bool is_remote_partition(struct cpuset *cs)
{
        return !list_empty(&cs->remote_sibling);
}

/* A local partition is a valid partition root that is not a remote one. */
static inline bool is_local_partition(struct cpuset *cs)
{
        return is_partition_valid(cs) && !is_remote_partition(cs);
}

/*
 * remote_partition_enable - Enable current cpuset as a remote partition root
 * @cs: the cpuset to update
 * @new_prs: new partition_root_state
 * @tmp: temparary masks
 * Return: 1 if successful, 0 if error
 *
 * Enable the current cpuset to become a remote partition root taking CPUs
 * directly from the top cpuset. cpuset_mutex must be held by the caller.
 */
static int remote_partition_enable(struct cpuset *cs, int new_prs,
                                   struct tmpmasks *tmp)
{
        bool isolcpus_updated;

        /*
         * The user must have sysadmin privilege.
         */
        if (!capable(CAP_SYS_ADMIN))
                return 0;

        /*
         * The requested exclusive_cpus must not be allocated to other
         * partitions and it can't use up all the root's effective_cpus.
         *
         * Note that if there is any local partition root above it or
         * remote partition root underneath it, its exclusive_cpus must
         * have overlapped with subpartitions_cpus.
         */
        compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
        if (cpumask_empty(tmp->new_cpus) ||
            cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
            cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
                return 0;

        spin_lock_irq(&callback_lock);
        isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
        list_add(&cs->remote_sibling, &remote_children);
        if (cs->use_parent_ecpus) {
                struct cpuset *parent = parent_cs(cs);

                cs->use_parent_ecpus = false;
                parent->child_ecpus_count--;
        }
        spin_unlock_irq(&callback_lock);
        update_unbound_workqueue_cpumask(isolcpus_updated);

        /*
         * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
         */
        update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
        update_sibling_cpumasks(&top_cpuset, NULL, tmp);
        return 1;
}

/*
 * remote_partition_disable - Remove current cpuset from remote partition list
 * @cs: the cpuset to update
 * @tmp: temporary masks
 *
 * The exclusive CPUs of @cs are given back to the top cpuset, the partition
 * root state is negated (i.e. made invalid), prs_err is set to PERR_INVCPUS
 * unless an error is already recorded, and the effective_cpus is also
 * updated.
 *
 * cpuset_mutex must be held by the caller.
 */
static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
{
        struct cpumask *excl = tmp->new_cpus;
        bool isolcpus_updated;
        int old_prs;

        compute_effective_exclusive_cpumask(cs, excl);
        WARN_ON_ONCE(!is_remote_partition(cs));
        WARN_ON_ONCE(!cpumask_subset(excl, subpartitions_cpus));

        spin_lock_irq(&callback_lock);
        list_del_init(&cs->remote_sibling);
        old_prs = cs->partition_root_state;
        isolcpus_updated = partition_xcpus_del(old_prs, NULL, excl);
        cs->partition_root_state = -old_prs;
        if (!cs->prs_err)
                cs->prs_err = PERR_INVCPUS;
        reset_partition_data(cs);
        spin_unlock_irq(&callback_lock);
        update_unbound_workqueue_cpumask(isolcpus_updated);

        /*
         * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
         */
        update_tasks_cpumask(&top_cpuset, excl);
        update_sibling_cpumasks(&top_cpuset, NULL, tmp);
}

/*
 * remote_cpus_update - cpus_exclusive change of remote partition
 * @cs: the cpuset to be updated
 * @newmask: the new effective_xcpus mask
 * @tmp: temporary masks
 *
 * top_cpuset and subpartitions_cpus will be updated or partition can be
 * invalidated.  The partition is invalidated through
 * remote_partition_disable() if @newmask is empty or if the CPUs to be
 * added cannot be granted.
 */
static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
                               struct tmpmasks *tmp)
{
        int prs = cs->partition_root_state;
        int isolcpus_updated = 0;
        bool adding, deleting;

        if (WARN_ON_ONCE(!is_remote_partition(cs)))
                return;

        WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));

        if (cpumask_empty(newmask)) {
                remote_partition_disable(cs, tmp);
                return;
        }

        adding   = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
        deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);

        /*
         * Additions of remote CPUs is only allowed if those CPUs are
         * not allocated to other partitions and there are effective_cpus
         * left in the top cpuset.
         */
        if (adding) {
                bool denied = !capable(CAP_SYS_ADMIN) ||
                              cpumask_intersects(tmp->addmask,
                                                 subpartitions_cpus) ||
                              cpumask_subset(top_cpuset.effective_cpus,
                                             tmp->addmask);

                if (denied) {
                        remote_partition_disable(cs, tmp);
                        return;
                }
        }

        spin_lock_irq(&callback_lock);
        if (adding)
                isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
        if (deleting)
                isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
        spin_unlock_irq(&callback_lock);
        update_unbound_workqueue_cpumask(isolcpus_updated);

        /*
         * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
         */
        update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
        update_sibling_cpumasks(&top_cpuset, NULL, tmp);
}

/*
 * remote_partition_check - check if a child remote partition needs update
 * @cs: the cpuset to be updated
 * @newmask: the new effective_xcpus mask
 * @delmask: temporary mask for deletion (not in tmp)
 * @tmp: temparary masks
 *
 * This should be called before the given cs has updated its cpus_allowed
 * and/or effective_xcpus.
 */
static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
                                   struct cpumask *delmask, struct tmpmasks *tmp)
{
        struct cpuset *child, *next;
        int disable_cnt = 0;

        /*
         * Compute the effective exclusive CPUs that will be deleted.
         */
        if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) ||
            !cpumask_intersects(delmask, subpartitions_cpus))
                return;        /* No deletion of exclusive CPUs in partitions */

        /*
         * Searching the remote children list to look for those that will
         * be impacted by the deletion of exclusive CPUs.
         *
         * Since a cpuset must be removed from the remote children list
         * before it can go offline and holding cpuset_mutex will prevent
         * any change in cpuset status. RCU read lock isn't needed.
         */
        lockdep_assert_held(&cpuset_mutex);
        list_for_each_entry_safe(child, next, &remote_children, remote_sibling)
                if (cpumask_intersects(child->effective_cpus, delmask)) {
                        remote_partition_disable(child, tmp);
                        disable_cnt++;
                }
        if (disable_cnt)
                rebuild_sched_domains_locked();
}

/*
 * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
 * @prstate: partition root state to be checked
 * @new_cpus: cpu mask
 * Return: true if there is conflict, false otherwise
 *
 * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in
 * an isolated partition.  A conflict is therefore reported when @new_cpus
 * has such a CPU and @prstate is anything other than PRS_ISOLATED.
 */
static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
{
        const struct cpumask *hk_cpus = housekeeping_cpumask(HK_TYPE_DOMAIN);

        return !cpumask_subset(new_cpus, hk_cpus) && (prstate != PRS_ISOLATED);
}

/**
 * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
 * @cs:      The cpuset that requests change in partition root state
 * @cmd:     Partition root state change command
 * @newmask: Optional new cpumask for partcmd_update
 * @tmp:     Temporary addmask and delmask
 * Return:   0 or a partition root state error code
 *
 * For partcmd_enable*, the cpuset is being transformed from a non-partition
 * root to a partition root. The effective_xcpus (cpus_allowed if
 * effective_xcpus not set) mask of the given cpuset will be taken away from
 * parent's effective_cpus. The function will return 0 if all the CPUs listed
 * in effective_xcpus can be granted or an error code will be returned.
 *
 * For partcmd_disable, the cpuset is being transformed from a partition
 * root back to a non-partition root. Any CPUs in effective_xcpus will be
 * given back to parent's effective_cpus. 0 will always be returned.
 *
 * For partcmd_update, if the optional newmask is specified, the cpu list is
 * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is
 * assumed to remain the same. The cpuset should either be a valid or invalid
 * partition root. The partition root state may change from valid to invalid
 * or vice versa. An error code will be returned if transitioning from
 * invalid to valid violates the exclusivity rule.
 *
 * For partcmd_invalidate, the current partition will be made invalid.
 *
 * The partcmd_enable* and partcmd_disable commands are used by
 * update_prstate(). An error code may be returned and the caller will check
 * for error.
 *
 * The partcmd_update command is used by update_cpumasks_hier() with newmask
 * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
 * by update_cpumask() with NULL newmask. In both cases, the callers won't
 * check for error and so partition_root_state and prs_error will be updated
 * directly.
 */
static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
                                           struct cpumask *newmask,
                                           struct tmpmasks *tmp)
{
        struct cpuset *parent = parent_cs(cs);
        int adding;        /* Adding cpus to parent's effective_cpus        */
        int deleting;        /* Deleting cpus from parent's effective_cpus        */
        int old_prs, new_prs;
        int part_error = PERR_NONE;        /* Partition error? */
        int subparts_delta = 0;
        struct cpumask *xcpus;                /* cs effective_xcpus */
        int isolcpus_updated = 0;
        bool nocpu;

        lockdep_assert_held(&cpuset_mutex);

        /*
         * new_prs will only be changed for the partcmd_update and
         * partcmd_invalidate commands.
         */
        adding = deleting = false;
        old_prs = new_prs = cs->partition_root_state;
        xcpus = !cpumask_empty(cs->exclusive_cpus)
                ? cs->effective_xcpus : cs->cpus_allowed;

        if (cmd == partcmd_invalidate) {
                if (is_prs_invalid(old_prs))
                        return 0;

                /*
                 * Make the current partition invalid.
                 */
                if (is_partition_valid(parent))
                        adding = cpumask_and(tmp->addmask,
                                             xcpus, parent->effective_xcpus);
                if (old_prs > 0) {
                        new_prs = -old_prs;
                        subparts_delta--;
                }
                goto write_error;
        }

        /*
         * The parent must be a partition root.
         * The new cpumask, if present, or the current cpus_allowed must
         * not be empty.
         */
        if (!is_partition_valid(parent)) {
                return is_partition_invalid(parent)
                       ? PERR_INVPARENT : PERR_NOTPART;
        }
        if (!newmask && cpumask_empty(cs->cpus_allowed))
                return PERR_CPUSEMPTY;

        nocpu = tasks_nocpu_error(parent, cs, xcpus);

        if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
                /*
                 * Enabling partition root is not allowed if its
                 * effective_xcpus is empty or doesn't overlap with
                 * parent's effective_xcpus.
                 */
                if (cpumask_empty(xcpus) ||
                    !cpumask_intersects(xcpus, parent->effective_xcpus))
                        return PERR_INVCPUS;

                if (prstate_housekeeping_conflict(new_prs, xcpus))
                        return PERR_HKEEPING;

                /*
                 * A parent can be left with no CPU as long as there is no
                 * task directly associated with the parent partition.
                 */
                if (nocpu)
                        return PERR_NOCPUS;

                cpumask_copy(tmp->delmask, xcpus);
                deleting = true;
                subparts_delta++;
                new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
        } else if (cmd == partcmd_disable) {
                /*
                 * May need to add cpus to parent's effective_cpus for
                 * valid partition root.
                 */
                adding = !is_prs_invalid(old_prs) &&
                          cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
                if (adding)
                        subparts_delta--;
                new_prs = PRS_MEMBER;
        } else if (newmask) {
                /*
                 * Empty cpumask is not allowed
                 */
                if (cpumask_empty(newmask)) {
                        part_error = PERR_CPUSEMPTY;
                        goto write_error;
                }

                /*
                 * partcmd_update with newmask:
                 *
                 * Compute add/delete mask to/from effective_cpus
                 *
                 * For valid partition:
                 *   addmask = exclusive_cpus & ~newmask
                 *                              & parent->effective_xcpus
                 *   delmask = newmask & ~exclusive_cpus
                 *                       & parent->effective_xcpus
                 *
                 * For invalid partition:
                 *   delmask = newmask & parent->effective_xcpus
                 */
                if (is_prs_invalid(old_prs)) {
                        adding = false;
                        deleting = cpumask_and(tmp->delmask,
                                        newmask, parent->effective_xcpus);
                } else {
                        cpumask_andnot(tmp->addmask, xcpus, newmask);
                        adding = cpumask_and(tmp->addmask, tmp->addmask,
                                             parent->effective_xcpus);

                        cpumask_andnot(tmp->delmask, newmask, xcpus);
                        deleting = cpumask_and(tmp->delmask, tmp->delmask,
                                               parent->effective_xcpus);
                }
                /*
                 * Make partition invalid if parent's effective_cpus could
                 * become empty and there are tasks in the parent.
                 */
                if (nocpu && (!adding ||
                    !cpumask_intersects(tmp->addmask, cpu_active_mask))) {
                        part_error = PERR_NOCPUS;
                        deleting = false;
                        adding = cpumask_and(tmp->addmask,
                                             xcpus, parent->effective_xcpus);
                }
        } else {
                /*
                 * partcmd_update w/o newmask
                 *
                 * delmask = effective_xcpus & parent->effective_cpus
                 *
                 * This can be called from:
                 * 1) update_cpumasks_hier()
                 * 2) cpuset_hotplug_update_tasks()
                 *
                 * Check to see if it can be transitioned from valid to
                 * invalid partition or vice versa.
                 *
                 * A partition error happens when parent has tasks and all
                 * its effective CPUs will have to be distributed out.
                 */
                WARN_ON_ONCE(!is_partition_valid(parent));
                if (nocpu) {
                        part_error = PERR_NOCPUS;
                        if (is_partition_valid(cs))
                                adding = cpumask_and(tmp->addmask,
                                                xcpus, parent->effective_xcpus);
                } else if (is_partition_invalid(cs) &&
                           cpumask_subset(xcpus, parent->effective_xcpus)) {
                        struct cgroup_subsys_state *css;
                        struct cpuset *child;
                        bool exclusive = true;

                        /*
                         * Convert invalid partition to valid has to
                         * pass the cpu exclusivity test.
                         */
                        rcu_read_lock();
                        cpuset_for_each_child(child, css, parent) {
                                if (child == cs)
                                        continue;
                                if (!cpusets_are_exclusive(cs, child)) {
                                        exclusive = false;
                                        break;
                                }
                        }
                        rcu_read_unlock();
                        if (exclusive)
                                deleting = cpumask_and(tmp->delmask,
                                                xcpus, parent->effective_cpus);
                        else
                                part_error = PERR_NOTEXCL;
                }
        }

write_error:
        if (part_error)
                WRITE_ONCE(cs->prs_err, part_error);

        if (cmd == partcmd_update) {
                /*
                 * Check for possible transition between valid and invalid
                 * partition root.
                 */
                switch (cs->partition_root_state) {
                case PRS_ROOT:
                case PRS_ISOLATED:
                        if (part_error) {
                                new_prs = -old_prs;
                                subparts_delta--;
                        }
                        break;
                case PRS_INVALID_ROOT:
                case PRS_INVALID_ISOLATED:
                        if (!part_error) {
                                new_prs = -old_prs;
                                subparts_delta++;
                        }
                        break;
                }
        }

        if (!adding && !deleting && (new_prs == old_prs))
                return 0;

        /*
         * Transitioning between invalid to valid or vice versa may require
         * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update,
         * validate_change() has already been successfully called and
         * CPU lists in cs haven't been updated yet. So defer it to later.
         */
        if ((old_prs != new_prs) && (cmd != partcmd_update))  {
                int err = update_partition_exclusive(cs, new_prs);

                if (err)
                        return err;
        }

        /*
         * Change the parent's effective_cpus & effective_xcpus (top cpuset
         * only).
         *
         * Newly added CPUs will be removed from effective_cpus and
         * newly deleted ones will be added back to effective_cpus.
         */
        spin_lock_irq(&callback_lock);
        if (old_prs != new_prs) {
                cs->partition_root_state = new_prs;
                if (new_prs <= 0)
                        cs->nr_subparts = 0;
        }
        /*
         * Adding to parent's effective_cpus means deletion CPUs from cs
         * and vice versa.
         */
        if (adding)
                isolcpus_updated += partition_xcpus_del(old_prs, parent,
                                                        tmp->addmask);
        if (deleting)
                isolcpus_updated += partition_xcpus_add(new_prs, parent,
                                                        tmp->delmask);

        if (is_partition_valid(parent)) {
                parent->nr_subparts += subparts_delta;
                WARN_ON_ONCE(parent->nr_subparts < 0);
        }
        spin_unlock_irq(&callback_lock);
        update_unbound_workqueue_cpumask(isolcpus_updated);

        if ((old_prs != new_prs) && (cmd == partcmd_update))
                update_partition_exclusive(cs, new_prs);

        if (adding || deleting) {
                update_tasks_cpumask(parent, tmp->addmask);
                update_sibling_cpumasks(parent, cs, tmp);
        }

        /*
         * For partcmd_update without newmask, it is being called from
         * cpuset_handle_hotplug(). Update the load balance flag and
         * scheduling domain accordingly.
         */
        if ((cmd == partcmd_update) && !newmask)
                update_partition_sd_lb(cs, old_prs);

        notify_partition_change(cs, old_prs);
        return 0;
}

/**
 * compute_partition_effective_cpumask - compute effective_cpus for partition
 * @cs: partition root cpuset
 * @new_ecpus: previously computed effective_cpus to be updated
 *
 * Compute the effective_cpus of a partition root by scanning effective_xcpus
 * of child partition roots and excluding their effective_xcpus.
 *
 * This has the side effect of invalidating valid child partition roots,
 * if necessary. Since it is called from either cpuset_hotplug_update_tasks()
 * or update_cpumasks_hier() where parent and children are modified
 * successively, we don't need to call update_parent_effective_cpumask()
 * and the child's effective_cpus will be updated in later iterations.
 *
 * The walk over the children is done under an RCU read lock taken here.
 */
static void compute_partition_effective_cpumask(struct cpuset *cs,
                                                struct cpumask *new_ecpus)
{
        bool has_tasks = partition_is_populated(cs, NULL);
        struct cgroup_subsys_state *pos;
        struct cpuset *kid;

        /* Start from the active CPUs among cs's effective exclusive CPUs */
        compute_effective_exclusive_cpumask(cs, new_ecpus);
        cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);

        /*
         * Check child partition roots to see if they should be
         * invalidated when
         *  1) child effective_xcpus not a subset of new
         *     exclusive_cpus
         *  2) All the effective_cpus will be used up and cs
         *     has tasks
         */
        rcu_read_lock();
        cpuset_for_each_child(kid, pos, cs) {
                int kid_old_prs;

                if (!is_partition_valid(kid))
                        continue;

                kid->prs_err = 0;
                if (!cpumask_subset(kid->effective_xcpus,
                                    cs->effective_xcpus))
                        kid->prs_err = PERR_INVCPUS;
                else if (has_tasks &&
                         cpumask_subset(new_ecpus, kid->effective_xcpus))
                        kid->prs_err = PERR_NOCPUS;

                if (!kid->prs_err) {
                        /* The child stays valid and keeps its CPUs */
                        cpumask_andnot(new_ecpus, new_ecpus,
                                       kid->effective_xcpus);
                        continue;
                }

                /*
                 * Invalidate child partition
                 */
                kid_old_prs = kid->partition_root_state;
                spin_lock_irq(&callback_lock);
                make_partition_invalid(kid);
                cs->nr_subparts--;
                kid->nr_subparts = 0;
                spin_unlock_irq(&callback_lock);
                notify_partition_change(kid, kid_old_prs);
        }
        rcu_read_unlock();
}

/*
 * update_cpumasks_hier() flags
 *
 * Bit values for the @flags argument of update_cpumasks_hier(); each one
 * is tested there with a bitwise AND, so they may be OR-ed together.
 */
#define HIER_CHECKALL                0x01        /* Check all cpusets with no skipping */
#define HIER_NO_SD_REBUILD        0x02        /* Don't rebuild sched domains */

/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs:  the cpuset to consider
 * @tmp: temp variables for calculating effective_cpus & partition setup
 * @flags: HIER_* bits; HIER_CHECKALL disables the unchanged-subtree skip,
 *         HIER_NO_SD_REBUILD suppresses the final sched domain rebuild
 *
 * When configured cpumask is changed, the effective cpumasks of this cpuset
 * and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
 *
 * The descendants are walked in pre-order under rcu_read_lock(). For each
 * cpuset that needs updating, a css reference is taken and the RCU read
 * lock is dropped around the actual update, then re-taken before the
 * reference is put and the walk continues.
 *
 * Called with cpuset_mutex held
 */
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
                                 int flags)
{
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;
        bool need_rebuild_sched_domains = false;
        int old_prs, new_prs;

        rcu_read_lock();
        cpuset_for_each_descendant_pre(cp, pos_css, cs) {
                struct cpuset *parent = parent_cs(cp);
                bool remote = is_remote_partition(cp);
                bool update_parent = false;

                /*
                 * Skip descendent remote partition that acquires CPUs
                 * directly from top cpuset unless it is cs.
                 */
                if (remote && (cp != cs)) {
                        pos_css = css_rightmost_descendant(pos_css);
                        continue;
                }

                /*
                 * Update effective_xcpus if exclusive_cpus set.
                 * The case when exclusive_cpus isn't set is handled later.
                 */
                if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) {
                        spin_lock_irq(&callback_lock);
                        compute_effective_exclusive_cpumask(cp, NULL);
                        spin_unlock_irq(&callback_lock);
                }

                /*
                 * Compute the candidate effective_cpus into tmp->new_cpus:
                 * the partition variant for a remote partition or a valid
                 * partition under a valid parent partition, the plain
                 * variant otherwise.
                 */
                old_prs = new_prs = cp->partition_root_state;
                if (remote || (is_partition_valid(parent) &&
                               is_partition_valid(cp)))
                        compute_partition_effective_cpumask(cp, tmp->new_cpus);
                else
                        compute_effective_cpumask(tmp->new_cpus, cp, parent);

                /*
                 * A partition with no effective_cpus is allowed as long as
                 * there is no task associated with it. Call
                 * update_parent_effective_cpumask() to check it.
                 */
                if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {
                        update_parent = true;
                        goto update_parent_effective;
                }

                /*
                 * If it becomes empty, inherit the effective mask of the
                 * parent, which is guaranteed to have some CPUs unless
                 * it is a partition root that has explicitly distributed
                 * out all its CPUs.
                 *
                 * use_parent_ecpus and the parent's child_ecpus_count are
                 * kept in step here: the count goes up when cp starts
                 * inheriting and down when it stops.
                 */
                if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) {
                        cpumask_copy(tmp->new_cpus, parent->effective_cpus);
                        if (!cp->use_parent_ecpus) {
                                cp->use_parent_ecpus = true;
                                parent->child_ecpus_count++;
                        }
                } else if (cp->use_parent_ecpus) {
                        cp->use_parent_ecpus = false;
                        WARN_ON_ONCE(!parent->child_ecpus_count);
                        parent->child_ecpus_count--;
                }

                /*
                 * A remote partition reaching this point can only be cs
                 * itself (remote descendants were skipped above), so the
                 * subtree-skip and parent-state checks do not apply.
                 */
                if (remote)
                        goto get_css;

                /*
                 * Skip the whole subtree if
                 * 1) the cpumask remains the same,
                 * 2) has no partition root state,
                 * 3) HIER_CHECKALL flag not set, and
                 * 4) for v2 load balance state same as its parent.
                 */
                if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
                    cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
                    (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
                    (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
                        pos_css = css_rightmost_descendant(pos_css);
                        continue;
                }

update_parent_effective:
                /*
                 * update_parent_effective_cpumask() should have been called
                 * for cs already in update_cpumask(). We should also call
                 * update_tasks_cpumask() again for tasks in the parent
                 * cpuset if the parent's effective_cpus changes.
                 */
                if ((cp != cs) && old_prs) {
                        switch (parent->partition_root_state) {
                        case PRS_ROOT:
                        case PRS_ISOLATED:
                                update_parent = true;
                                break;

                        default:
                                /*
                                 * When parent is not a partition root or is
                                 * invalid, child partition roots become
                                 * invalid too.
                                 */
                                if (is_partition_valid(cp))
                                        new_prs = -cp->partition_root_state;
                                WRITE_ONCE(cp->prs_err,
                                           is_partition_invalid(parent)
                                           ? PERR_INVPARENT : PERR_NOTPART);
                                break;
                        }
                }
get_css:
                /*
                 * Pin cp before leaving the RCU read-side section; a cpuset
                 * that is no longer online is simply skipped.
                 */
                if (!css_tryget_online(&cp->css))
                        continue;
                rcu_read_unlock();

                if (update_parent) {
                        update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);
                        /*
                         * The cpuset partition_root_state may become
                         * invalid. Capture it.
                         */
                        new_prs = cp->partition_root_state;
                }

                /* Publish the new masks and state under callback_lock. */
                spin_lock_irq(&callback_lock);
                cpumask_copy(cp->effective_cpus, tmp->new_cpus);
                cp->partition_root_state = new_prs;
                /*
                 * Make sure effective_xcpus is properly set for a valid
                 * partition root.
                 */
                if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
                        cpumask_and(cp->effective_xcpus,
                                    cp->cpus_allowed, parent->effective_xcpus);
                else if (new_prs < 0)
                        reset_partition_data(cp);
                spin_unlock_irq(&callback_lock);

                notify_partition_change(cp, old_prs);

                WARN_ON(!is_in_v2_mode() &&
                        !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));

                update_tasks_cpumask(cp, cp->effective_cpus);

                /*
                 * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
                 * from parent if current cpuset isn't a valid partition root
                 * and their load balance states differ.
                 */
                if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
                    !is_partition_valid(cp) &&
                    (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
                        if (is_sched_load_balance(parent))
                                set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
                        else
                                clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
                }

                /*
                 * On legacy hierarchy, if the effective cpumask of any non-
                 * empty cpuset is changed, we need to rebuild sched domains.
                 * On default hierarchy, the cpuset needs to be a partition
                 * root as well.
                 */
                if (!cpumask_empty(cp->cpus_allowed) &&
                    is_sched_load_balance(cp) &&
                   (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
                    is_partition_valid(cp)))
                        need_rebuild_sched_domains = true;

                /* Re-enter the RCU section before dropping the css pin. */
                rcu_read_lock();
                css_put(&cp->css);
        }
        rcu_read_unlock();

        /* One rebuild for the whole walk, unless the caller opted out. */
        if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD))
                rebuild_sched_domains_locked();
}

/**
 * update_sibling_cpumasks - Update siblings cpumasks
 * @parent:  Parent cpuset
 * @cs:      Current cpuset
 * @tmp:     Temp variables
 *
 * Walk every child of @parent other than @cs and run
 * update_cpumasks_hier() on those whose effective_cpus may need to change.
 * Must be called with cpuset_mutex held.
 */
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
                                    struct tmpmasks *tmp)
{
        struct cgroup_subsys_state *css;
        struct cpuset *sib;

        lockdep_assert_held(&cpuset_mutex);

        /*
         * With the addition of effective_xcpus which is a subset of
         * cpus_allowed, a change in parent's effective_cpus caused by a
         * child partition's effective_xcpus can affect siblings even when
         * they do not inherit parent's effective_cpus directly.
         *
         * update_cpumasks_hier() may sleep, so the RCU read lock is
         * dropped around the call while a css reference keeps the sibling
         * pinned. HIER_NO_SD_REBUILD suppresses the sched domain rebuild
         * because the callers will take care of that.
         */
        rcu_read_lock();
        cpuset_for_each_child(sib, css, parent) {
                bool can_precheck;

                if (sib == cs)
                        continue;

                /*
                 * Only a sibling that neither inherits the parent's
                 * effective_cpus nor is a valid partition can be skipped
                 * early when its recomputed mask is unchanged.
                 */
                can_precheck = !sib->use_parent_ecpus &&
                               !is_partition_valid(sib);
                if (can_precheck) {
                        compute_effective_cpumask(tmp->new_cpus, sib, parent);
                        if (cpumask_equal(tmp->new_cpus, sib->effective_cpus))
                                continue;
                }

                if (!css_tryget_online(&sib->css))
                        continue;

                rcu_read_unlock();
                update_cpumasks_hier(sib, tmp, HIER_NO_SD_REBUILD);
                rcu_read_lock();
                css_put(&sib->css);
        }
        rcu_read_unlock();
}

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 *
 * Parses @buf into @trialcs->cpus_allowed (an empty string clears it),
 * validates the change, updates the partition state of @cs where needed,
 * copies the new masks into @cs under callback_lock and finally propagates
 * the change through the subtree with update_cpumasks_hier().
 *
 * Return: 0 if nothing changed or the update was applied; -EACCES when @cs
 * is top_cpuset; the cpulist_parse() error for an unparsable @buf; -EINVAL
 * when the parsed CPUs are not a subset of top_cpuset.cpus_allowed;
 * -ENOMEM when the temporary masks cannot be allocated; otherwise the
 * negative value returned by validate_change() (on the default hierarchy
 * an -EINVAL from validate_change() is not returned but turned into a
 * partition invalidation).
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
                          const char *buf)
{
        int retval;
        struct tmpmasks tmp;
        struct cpuset *parent = parent_cs(cs);
        bool invalidate = false;
        int hier_flags = 0;
        int old_prs = cs->partition_root_state;

        /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
        if (cs == &top_cpuset)
                return -EACCES;

        /*
         * An empty cpus_allowed is ok only if the cpuset has no tasks.
         * Since cpulist_parse() fails on an empty mask, we special case
         * that parsing.  The validate_change() call ensures that cpusets
         * with tasks have cpus.
         */
        if (!*buf) {
                cpumask_clear(trialcs->cpus_allowed);
                cpumask_clear(trialcs->effective_xcpus);
        } else {
                retval = cpulist_parse(buf, trialcs->cpus_allowed);
                if (retval < 0)
                        return retval;

                if (!cpumask_subset(trialcs->cpus_allowed,
                                    top_cpuset.cpus_allowed))
                        return -EINVAL;

                /*
                 * When exclusive_cpus isn't explicitly set, it is constrained
                 * by cpus_allowed and parent's effective_xcpus. Otherwise,
                 * trialcs->effective_xcpus is used as a temporary cpumask
                 * for checking validity of the partition root.
                 */
                if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
                        compute_effective_exclusive_cpumask(trialcs, NULL);
        }

        /* Nothing to do if the cpus didn't change */
        if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
                return 0;

        /* From here on every exit must go through out_free. */
        if (alloc_cpumasks(NULL, &tmp))
                return -ENOMEM;

        /*
         * For a cpuset with a partition root state (valid or invalid),
         * decide up front whether the new CPU list forces the partition
         * to be invalidated and record the reason in prs_err.
         */
        if (old_prs) {
                if (is_partition_valid(cs) &&
                    cpumask_empty(trialcs->effective_xcpus)) {
                        invalidate = true;
                        cs->prs_err = PERR_INVCPUS;
                } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
                        invalidate = true;
                        cs->prs_err = PERR_HKEEPING;
                } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
                        invalidate = true;
                        cs->prs_err = PERR_NOCPUS;
                }
        }

        /*
         * Check all the descendants in update_cpumasks_hier() if
         * effective_xcpus is to be changed.
         */
        if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
                hier_flags = HIER_CHECKALL;

        retval = validate_change(cs, trialcs);

        if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
                struct cgroup_subsys_state *css;
                struct cpuset *cp;

                /*
                 * The -EINVAL error code indicates that partition sibling
                 * CPU exclusivity rule has been violated. We still allow
                 * the cpumask change to proceed while invalidating the
                 * partition. However, any conflicting sibling partitions
                 * have to be marked as invalid too.
                 */
                invalidate = true;
                rcu_read_lock();
                cpuset_for_each_child(cp, css, parent) {
                        struct cpumask *xcpus = fetch_xcpus(trialcs);

                        /*
                         * The RCU read lock is dropped around
                         * update_parent_effective_cpumask() and re-taken
                         * before the child walk continues.
                         */
                        if (is_partition_valid(cp) &&
                            cpumask_intersects(xcpus, cp->effective_xcpus)) {
                                rcu_read_unlock();
                                update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp);
                                rcu_read_lock();
                        }
                }
                rcu_read_unlock();
                retval = 0;
        }

        if (retval < 0)
                goto out_free;

        /*
         * A valid partition is always updated (or invalidated); an invalid
         * one is only re-evaluated when nothing above asked for
         * invalidation.
         */
        if (is_partition_valid(cs) ||
           (is_partition_invalid(cs) && !invalidate)) {
                struct cpumask *xcpus = trialcs->effective_xcpus;

                if (cpumask_empty(xcpus) && is_partition_invalid(cs))
                        xcpus = trialcs->cpus_allowed;

                /*
                 * Call remote_cpus_update() to handle valid remote partition
                 */
                if (is_remote_partition(cs))
                        remote_cpus_update(cs, xcpus, &tmp);
                else if (invalidate)
                        update_parent_effective_cpumask(cs, partcmd_invalidate,
                                                        NULL, &tmp);
                else
                        update_parent_effective_cpumask(cs, partcmd_update,
                                                        xcpus, &tmp);
        } else if (!cpumask_empty(cs->exclusive_cpus)) {
                /*
                 * Use trialcs->effective_cpus as a temp cpumask
                 */
                remote_partition_check(cs, trialcs->effective_xcpus,
                                       trialcs->effective_cpus, &tmp);
        }

        /* Commit the new masks; readers are serialised by callback_lock. */
        spin_lock_irq(&callback_lock);
        cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
        cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
        if ((old_prs > 0) && !is_partition_valid(cs))
                reset_partition_data(cs);
        spin_unlock_irq(&callback_lock);

        /* effective_cpus/effective_xcpus will be updated here */
        update_cpumasks_hier(cs, &tmp, hier_flags);

        /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
        if (cs->partition_root_state)
                update_partition_sd_lb(cs, old_prs);
out_free:
        free_cpumasks(NULL, &tmp);
        return retval;
}

/**
 * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 *
 * The tasks' cpumask will be updated if cs is a valid partition root.
 *
 * Parses @buf into @trialcs->exclusive_cpus (an empty string clears both
 * exclusive_cpus and effective_xcpus of the trial cpuset), validates the
 * change, updates the partition state of @cs where needed and copies the
 * new masks into @cs under callback_lock.
 *
 * Return: 0 if nothing changed or the update was applied; the
 * cpulist_parse() error for an unparsable @buf; any non-zero value
 * returned by validate_change(); -ENOMEM when the temporary masks cannot
 * be allocated.
 */
static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
                                    const char *buf)
{
        int retval;
        struct tmpmasks tmp;
        struct cpuset *parent = parent_cs(cs);
        bool invalidate = false;
        int hier_flags = 0;
        int old_prs = cs->partition_root_state;

        if (!*buf) {
                cpumask_clear(trialcs->exclusive_cpus);
                cpumask_clear(trialcs->effective_xcpus);
        } else {
                retval = cpulist_parse(buf, trialcs->exclusive_cpus);
                if (retval < 0)
                        return retval;
                /* A non-empty exclusive_cpus makes the trial cpuset CPU-exclusive. */
                if (!is_cpu_exclusive(cs))
                        set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
        }

        /* Nothing to do if the CPUs didn't change */
        if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
                return 0;

        if (*buf)
                compute_effective_exclusive_cpumask(trialcs, NULL);

        /*
         * Check all the descendants in update_cpumasks_hier() if
         * effective_xcpus is to be changed.
         */
        if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
                hier_flags = HIER_CHECKALL;

        /*
         * Validate before allocating the temporary masks so that this
         * early return has nothing to free.
         */
        retval = validate_change(cs, trialcs);
        if (retval)
                return retval;

        if (alloc_cpumasks(NULL, &tmp))
                return -ENOMEM;

        if (old_prs) {
                /*
                 * Decide whether the new exclusive CPUs force the
                 * partition to be invalidated and record the reason.
                 */
                if (cpumask_empty(trialcs->effective_xcpus)) {
                        invalidate = true;
                        cs->prs_err = PERR_INVCPUS;
                } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
                        invalidate = true;
                        cs->prs_err = PERR_HKEEPING;
                } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
                        invalidate = true;
                        cs->prs_err = PERR_NOCPUS;
                }

                /* Remote and local partitions use different update helpers. */
                if (is_remote_partition(cs)) {
                        if (invalidate)
                                remote_partition_disable(cs, &tmp);
                        else
                                remote_cpus_update(cs, trialcs->effective_xcpus,
                                                   &tmp);
                } else if (invalidate) {
                        update_parent_effective_cpumask(cs, partcmd_invalidate,
                                                        NULL, &tmp);
                } else {
                        update_parent_effective_cpumask(cs, partcmd_update,
                                                trialcs->effective_xcpus, &tmp);
                }
        } else if (!cpumask_empty(trialcs->exclusive_cpus)) {
                /*
                 * Use trialcs->effective_cpus as a temp cpumask
                 */
                remote_partition_check(cs, trialcs->effective_xcpus,
                                       trialcs->effective_cpus, &tmp);
        }
        /* Commit the new masks; readers are serialised by callback_lock. */
        spin_lock_irq(&callback_lock);
        cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
        cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
        if ((old_prs > 0) && !is_partition_valid(cs))
                reset_partition_data(cs);
        spin_unlock_irq(&callback_lock);

        /*
         * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus
         * of the subtree when it is a valid partition root or effective_xcpus
         * is updated.
         */
        if (is_partition_valid(cs) || hier_flags)
                update_cpumasks_hier(cs, &tmp, hier_flags);

        /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
        if (cs->partition_root_state)
                update_partition_sd_lb(cs, old_prs);

        free_cpumasks(NULL, &tmp);
        return 0;
}

/*
 * Migrate memory region from one set of nodes to another.  This is
 * performed asynchronously as it can be called from process migration path
 * holding locks involved in process management.  All mm migrations are
 * performed in the queued order and can be waited for by flushing
 * cpuset_migrate_mm_wq.
 */

/* One queued migration request; allocated in cpuset_migrate_mm(). */
struct cpuset_migrate_mm_work {
        struct work_struct        work;        /* queued on cpuset_migrate_mm_wq */
        struct mm_struct        *mm;        /* released with mmput() by the work function */
        nodemask_t                from;        /* copy of the source node set */
        nodemask_t                to;        /* copy of the destination node set */
};

/*
 * Work function for a queued cpuset_migrate_mm_work item: migrate the
 * pages of the recorded mm from the "from" nodes to the "to" nodes, then
 * drop the mm reference and free the request.
 */
static void cpuset_migrate_mm_workfn(struct work_struct *work)
{
        struct cpuset_migrate_mm_work *req;

        req = container_of(work, struct cpuset_migrate_mm_work, work);

        /* on a wq worker, no need to worry about %current's mems_allowed */
        do_migrate_pages(req->mm, &req->from, &req->to, MPOL_MF_MOVE_ALL);

        mmput(req->mm);
        kfree(req);
}

static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
                                                        const nodemask_t *to)
{
        struct cpuset_migrate_mm_work *mwork;

        if (nodes_equal(*from, *to)) {
                mmput(mm);
                return;
        }

        mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
        if (mwork) {
                mwork->mm = mm;
                mwork->from = *from;
                mwork->to = *to;
                INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
                queue_work(cpuset_migrate_mm_wq, &mwork->work);
        } else {
                mmput(mm);
        }
}

/*
 * Wait for every mm migration queued on cpuset_migrate_mm_wq (see
 * cpuset_migrate_mm()) to finish.
 */
static void cpuset_post_attach(void)
{
        flush_workqueue(cpuset_migrate_mm_wq);
}

/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * We use the mems_allowed_seq seqcount to safely update both
 * tsk->mems_allowed and rebind the task's mempolicy, if any. If the task is
 * allocating in parallel, it might temporarily see an empty intersection,
 * which results in a seqcount check and retry before OOM or allocation
 * failure.
 *
 * The whole update runs under task_lock() with local interrupts disabled.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
                                        nodemask_t *newmems)
{
        task_lock(tsk);

        local_irq_disable();
        write_seqcount_begin(&tsk->mems_allowed_seq);

        /*
         * Three steps, in this order: widen mems_allowed to the union of
         * the old and new nodes, rebind the mempolicy to the new nodes,
         * then narrow mems_allowed down to exactly the new nodes.
         */
        nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
        mpol_rebind_task(tsk, newmems);
        tsk->mems_allowed = *newmems;

        write_seqcount_end(&tsk->mems_allowed_seq);
        local_irq_enable();

        task_unlock(tsk);
}

/*
 * The cpuset whose tasks are currently being rebound by
 * update_tasks_nodemask(), or NULL when no rebind is in progress. Read by
 * current_cpuset_is_being_rebound().
 */
static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its mems_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_nodemask(struct cpuset *cs)
{
        static nodemask_t newmems;        /* protected by cpuset_mutex */
        struct css_task_iter it;
        struct task_struct *task;

        cpuset_being_rebound = cs;                /* causes mpol_dup() rebind */

        guarantee_online_mems(cs, &newmems);

        /*
         * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
         * take while holding tasklist_lock.  Forks can happen - the
         * mpol_dup() cpuset_being_rebound check will catch such forks,
         * and rebind their vma mempolicies too.  Because we still hold
         * the global cpuset_mutex, we know that no other rebind effort
         * will be contending for the global variable cpuset_being_rebound.
         * It's ok if we rebind the same mm twice; mpol_rebind_mm()
         * is idempotent.  Also migrate pages in each mm to new nodes.
         */
        css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it))) {
                struct mm_struct *mm;
                bool migrate;

                cpuset_change_task_nodemask(task, &newmems);

                mm = get_task_mm(task);
                if (!mm)
                        continue;

                migrate = is_memory_migrate(cs);

                mpol_rebind_mm(mm, &cs->mems_allowed);
                if (migrate)
                        cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
                else
                        mmput(mm);
        }
        css_task_iter_end(&it);

        /*
         * All the tasks' nodemasks have been updated, update
         * cs->old_mems_allowed.
         */
        cs->old_mems_allowed = newmems;

        /* We're done rebinding vmas to this cpuset's new mems_allowed. */
        cpuset_being_rebound = NULL;
}

/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * After a change of the configured nodemask, recompute effective_mems for
 * @cs and each of its descendants and update the tasks of every cpuset
 * whose effective_mems actually changed.
 *
 * On legacy hierarchy, effective_mems will be the same with mems_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
        struct cgroup_subsys_state *pos;
        struct cpuset *cur;

        rcu_read_lock();
        cpuset_for_each_descendant_pre(cur, pos, cs) {
                struct cpuset *up = parent_cs(cur);

                nodes_and(*new_mems, cur->mems_allowed, up->effective_mems);

                /*
                 * An empty result in v2 mode falls back to the parent's
                 * effective mask, which is guaranteed to have some MEMs.
                 */
                if (is_in_v2_mode() && nodes_empty(*new_mems))
                        *new_mems = up->effective_mems;

                /* Nothing changed here, so nothing changes below either. */
                if (nodes_equal(*new_mems, cur->effective_mems)) {
                        pos = css_rightmost_descendant(pos);
                        continue;
                }

                /* Pin the cpuset so the RCU read lock can be dropped. */
                if (!css_tryget_online(&cur->css))
                        continue;
                rcu_read_unlock();

                spin_lock_irq(&callback_lock);
                cur->effective_mems = *new_mems;
                spin_unlock_irq(&callback_lock);

                WARN_ON(!is_in_v2_mode() &&
                        !nodes_equal(cur->mems_allowed, cur->effective_mems));

                update_tasks_nodemask(cur);

                rcu_read_lock();
                css_put(&cur->css);
        }
        rcu_read_unlock();
}

/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpuset's mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind the task's mempolicy and any vma
 * mempolicies and, if the cpuset is marked 'memory_migrate',
 * migrate the task's pages to the new memory.
 *
 * Call with cpuset_mutex held. May take callback_lock during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such task's mm->mmap_lock, scan its vma's and rebind
 * their mempolicies to the cpuset's new mems_allowed.
 *
 * Returns -EACCES for top_cpuset, the nodelist_parse() error for an
 * unparsable @buf, -EINVAL when the requested nodes are not a subset of
 * top_cpuset.mems_allowed, a negative validate_change() result, and
 * otherwise 0 when nothing changed or validate_change()'s result after
 * the update has been applied.
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
                           const char *buf)
{
        int err;

        /*
         * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
         * it's read-only
         */
        if (cs == &top_cpuset)
                return -EACCES;

        /*
         * An empty mems_allowed is ok iff there are no tasks in the cpuset.
         * Since nodelist_parse() fails on an empty mask, we special case
         * that parsing.  The validate_change() call ensures that cpusets
         * with tasks have memory.
         */
        if (*buf) {
                err = nodelist_parse(buf, trialcs->mems_allowed);
                if (err < 0)
                        return err;

                if (!nodes_subset(trialcs->mems_allowed,
                                  top_cpuset.mems_allowed))
                        return -EINVAL;
        } else {
                nodes_clear(trialcs->mems_allowed);
        }

        /* Too easy - nothing to do */
        if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed))
                return 0;

        err = validate_change(cs, trialcs);
        if (err < 0)
                return err;

        check_insane_mems_config(&trialcs->mems_allowed);

        spin_lock_irq(&callback_lock);
        cs->mems_allowed = trialcs->mems_allowed;
        spin_unlock_irq(&callback_lock);

        /* use trialcs->mems_allowed as a temp variable */
        update_nodemasks_hier(cs, &trialcs->mems_allowed);

        return err;
}

/*
 * Report whether the cpuset of the current task is the one recorded in
 * cpuset_being_rebound.  The comparison is made inside an RCU read-side
 * critical section.
 */
bool current_cpuset_is_being_rebound(void)
{
        bool rebinding;

        rcu_read_lock();
        rebinding = (task_cs(current) == cpuset_being_rebound);
        rcu_read_unlock();

        return rebinding;
}

/*
 * Set the relax domain level of @cs to @val.
 *
 * On CONFIG_SMP builds @val must lie in [-1, sched_domain_level_max + 1],
 * otherwise -EINVAL is returned.  If the level really changes and @cs has
 * CPUs with load balancing enabled, the sched domains are rebuilt.
 * Returns 0 on success.
 */
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
        if (val < -1 || val > sched_domain_level_max + 1)
                return -EINVAL;
#endif

        /* Same level as before: nothing to update. */
        if (val == cs->relax_domain_level)
                return 0;

        cs->relax_domain_level = val;

        /* A rebuild only matters for a balanced cpuset that owns CPUs. */
        if (cpumask_empty(cs->cpus_allowed) || !is_sched_load_balance(cs))
                return 0;

        rebuild_sched_domains_locked();
        return 0;
}

/**
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
static void update_tasks_flags(struct cpuset *cs)
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it)))
                cpuset_update_task_spread_flags(cs, task);
        css_task_iter_end(&it);
}

/*
 * update_flag - set or clear a single cpuset flag bit
 * bit:                the bit to update (see cpuset_flagbits_t)
 * cs:                the cpuset to update
 * turning_on:         non-zero to set the flag, zero to clear it
 *
 * The new flag word is first built in a trial copy of @cs and checked
 * with validate_change(); only then is it copied into @cs.  Returns 0 on
 * success, -ENOMEM if the trial cpuset cannot be allocated, or the error
 * reported by validate_change().
 *
 * Call with cpuset_mutex held.
 */

static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
                       int turning_on)
{
        struct cpuset *trial;
        bool lb_changed;
        bool spread_changed;
        int ret;

        trial = alloc_trial_cpuset(cs);
        if (!trial)
                return -ENOMEM;

        if (!turning_on)
                clear_bit(bit, &trial->flags);
        else
                set_bit(bit, &trial->flags);

        ret = validate_change(cs, trial);
        if (ret < 0)
                goto free_trial;

        /* Work out the follow-up actions before @cs is overwritten. */
        lb_changed = is_sched_load_balance(cs) != is_sched_load_balance(trial);
        spread_changed = is_spread_slab(cs) != is_spread_slab(trial) ||
                         is_spread_page(cs) != is_spread_page(trial);

        spin_lock_irq(&callback_lock);
        cs->flags = trial->flags;
        spin_unlock_irq(&callback_lock);

        /* Load balancing toggled on a cpuset that has CPUs. */
        if (lb_changed && !cpumask_empty(trial->cpus_allowed))
                rebuild_sched_domains_locked();

        /* Push the new spread settings down to the member tasks. */
        if (spread_changed)
                update_tasks_flags(cs);

free_trial:
        free_cpuset(trial);
        return ret;
}

/**
 * update_prstate - update partition_root_state
 * @cs: the cpuset to update
 * @new_prs: new partition root state
 * Return: 0 when the state is unchanged or once the transition has been
 *         processed; -ENOMEM if the temporary cpumasks cannot be allocated.
 *
 * Note that a transition that fails after the temporary masks have been
 * allocated still returns 0: the error code is stored in @cs->prs_err and
 * the partition is recorded in the negated (invalid) form of @new_prs.
 *
 * Call with cpuset_mutex held.
 */
static int update_prstate(struct cpuset *cs, int new_prs)
{
        int err = PERR_NONE, old_prs = cs->partition_root_state;
        struct cpuset *parent = parent_cs(cs);
        struct tmpmasks tmpmask;
        bool new_xcpus_state = false;

        /* Nothing to do when the requested state is already in place. */
        if (old_prs == new_prs)
                return 0;

        /*
         * Treat a previously invalid partition root as if it is a "member".
         */
        if (new_prs && is_prs_invalid(old_prs))
                old_prs = PRS_MEMBER;

        if (alloc_cpumasks(NULL, &tmpmask))
                return -ENOMEM;

        /*
         * Setup effective_xcpus if not properly set yet, it will be cleared
         * later if partition becomes invalid.
         */
        if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) {
                spin_lock_irq(&callback_lock);
                cpumask_and(cs->effective_xcpus,
                            cs->cpus_allowed, parent->effective_xcpus);
                spin_unlock_irq(&callback_lock);
        }

        err = update_partition_exclusive(cs, new_prs);
        if (err)
                goto out;

        if (!old_prs) {
                /* member -> partition root (regular or isolated) */
                enum partition_cmd cmd = (new_prs == PRS_ROOT)
                                       ? partcmd_enable : partcmd_enablei;

                /*
                 * cpus_allowed cannot be empty.
                 */
                if (cpumask_empty(cs->cpus_allowed)) {
                        err = PERR_CPUSEMPTY;
                        goto out;
                }

                err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
                /*
                 * If an attempt to become local partition root fails,
                 * try to become a remote partition root instead.
                 */
                if (err && remote_partition_enable(cs, new_prs, &tmpmask))
                        err = 0;
        } else if (old_prs && new_prs) {
                /*
                 * A change in load balance state only, no change in cpumasks.
                 */
                new_xcpus_state = true;
        } else {
                /*
                 * Switching back to member is always allowed even if it
                 * disables child partitions.
                 */
                if (is_remote_partition(cs))
                        remote_partition_disable(cs, &tmpmask);
                else
                        update_parent_effective_cpumask(cs, partcmd_disable,
                                                        NULL, &tmpmask);

                /*
                 * Invalidation of child partitions will be done in
                 * update_cpumasks_hier().
                 */
        }
out:
        /*
         * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
         * happens.
         */
        if (err) {
                new_prs = -new_prs;
                update_partition_exclusive(cs, new_prs);
        }

        /* Publish the new state and its error code under callback_lock. */
        spin_lock_irq(&callback_lock);
        cs->partition_root_state = new_prs;
        WRITE_ONCE(cs->prs_err, err);
        if (!is_partition_valid(cs))
                reset_partition_data(cs);
        else if (new_xcpus_state)
                partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
        spin_unlock_irq(&callback_lock);
        update_unbound_workqueue_cpumask(new_xcpus_state);

        /* Force update if switching back to member */
        update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);

        /* Update sched domains and load balance flag */
        update_partition_sd_lb(cs, old_prs);

        notify_partition_change(cs, old_prs);
        free_cpumasks(NULL, &tmpmask);
        /* Errors are reported through cs->prs_err, not the return value. */
        return 0;
}

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */

/*
 * Tuning constants for the frequency meter.  FM_COEF is expressed in units
 * of 1/FM_SCALE: fmeter_update() multiplies the filtered value by
 * FM_COEF / FM_SCALE once per elapsed tick, for at most FM_MAXTICKS ticks,
 * and fmeter_markevent() caps the pending event count at FM_MAXCNT.
 */
#define FM_COEF 933                /* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
#define FM_MAXCNT 1000000        /* limit cnt to avoid overflow */
#define FM_SCALE 1000                /* faux fixed point scale */

/* Put a frequency meter into its idle state: no events, zero value. */
static void fmeter_init(struct fmeter *fmp)
{
        spin_lock_init(&fmp->lock);
        fmp->time = 0;
        fmp->val = 0;
        fmp->cnt = 0;
}

/*
 * Internal meter update - age the filtered value by the number of whole
 * seconds that passed since the previous update, then fold in the events
 * counted so far.  The callers in this file invoke it with fmp->lock held.
 */
static void fmeter_update(struct fmeter *fmp)
{
        time64_t now = ktime_get_seconds();
        u32 elapsed = now - fmp->time;

        /* Still inside the same one-second tick: nothing to age or fold. */
        if (!elapsed)
                return;

        /* Decay once per elapsed tick, never more than FM_MAXTICKS times. */
        for (elapsed = min(FM_MAXTICKS, elapsed); elapsed > 0; elapsed--)
                fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
        fmp->time = now;

        /* Add the filtered contribution of the pending events. */
        fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
        fmp->cnt = 0;
}

/*
 * Process any previous ticks, then bump cnt by one (times scale).
 *
 * The pending count saturates at FM_MAXCNT.  Both the update and the bump
 * are done with fmp->lock held.
 */
static void fmeter_markevent(struct fmeter *fmp)
{
        spin_lock(&fmp->lock);
        fmeter_update(fmp);
        /* one event is recorded as FM_SCALE so it matches the scaled value */
        fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
        spin_unlock(&fmp->lock);
}

/* Bring the meter up to date, then report its current filtered value. */
static int fmeter_getrate(struct fmeter *fmp)
{
        int rate;

        spin_lock(&fmp->lock);
        fmeter_update(fmp);
        rate = fmp->val;
        spin_unlock(&fmp->lock);

        return rate;
}

/*
 * Cpuset of the first task in the taskset being migrated.  It is recorded
 * by cpuset_can_attach() and read back by cpuset_attach() as the source
 * cpuset of the move.
 */
static struct cpuset *cpuset_attach_old_cs;

/*
 * Decide whether a cpuset is able to take in a new task.
 *
 * v1: neither cpus_allowed nor mems_allowed may be empty.
 * v2: effective_cpus may not be empty.
 * (In v1, effective_cpus equals cpus_allowed.)
 *
 * Returns 0 if the cpuset is usable, -ENOSPC otherwise.
 */
static int cpuset_can_attach_check(struct cpuset *cs)
{
        if (cpumask_empty(cs->effective_cpus))
                return -ENOSPC;

        if (!is_in_v2_mode() && nodes_empty(cs->mems_allowed))
                return -ENOSPC;

        return 0;
}

/*
 * Forget the deadline-task count and bandwidth sum that were accumulated
 * in @cs for a migration.
 */
static void reset_migrate_dl_data(struct cpuset *cs)
{
        cs->sum_migrate_dl_bw = 0;
        cs->nr_migrate_dl_tasks = 0;
}

/*
 * Called by cgroups to determine if a cpuset is usable.  cpuset_mutex is
 * taken and released inside this function.
 *
 * Records the source cpuset in cpuset_attach_old_cs, checks that the
 * destination cpuset can accept tasks and that every task in @tset may be
 * moved, and accumulates the number and bandwidth of deadline tasks in the
 * destination.  If deadline tasks are moved between cpusets whose
 * effective_cpus do not intersect, bandwidth is reserved with
 * dl_bw_alloc().
 *
 * Returns 0 on success, in which case attach_in_progress of the
 * destination is incremented, or a non-zero error.  On every failure path
 * the deadline accounting gathered so far is discarded, so that an aborted
 * attempt cannot leave stale counts behind for a later attach.
 */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
        struct cgroup_subsys_state *css;
        struct cpuset *cs, *oldcs;
        struct task_struct *task;
        bool cpus_updated, mems_updated;
        int ret;

        /* used later by cpuset_attach() */
        cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
        oldcs = cpuset_attach_old_cs;
        cs = css_cs(css);

        mutex_lock(&cpuset_mutex);

        /* Check to see if task is allowed in the cpuset */
        ret = cpuset_can_attach_check(cs);
        if (ret)
                goto out_unlock;

        cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
        mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);

        cgroup_taskset_for_each(task, css, tset) {
                ret = task_can_attach(task);
                if (ret)
                        goto out_unlock;

                /*
                 * Skip rights over task check in v2 when nothing changes,
                 * migration permission derives from hierarchy ownership in
                 * cgroup_procs_write_permission()).
                 */
                if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
                    (cpus_updated || mems_updated)) {
                        ret = security_task_setscheduler(task);
                        if (ret)
                                goto out_unlock;
                }

                if (dl_task(task)) {
                        cs->nr_migrate_dl_tasks++;
                        cs->sum_migrate_dl_bw += task->dl.dl_bw;
                }
        }

        /* No deadline tasks involved: no bandwidth to reserve. */
        if (!cs->nr_migrate_dl_tasks)
                goto out_success;

        if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
                int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);

                if (unlikely(cpu >= nr_cpu_ids)) {
                        ret = -EINVAL;
                        goto out_unlock;
                }

                ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
                if (ret)
                        goto out_unlock;
        }

out_success:
        /*
         * Mark attach is in progress.  This makes validate_change() fail
         * changes which zero cpus/mems_allowed.
         */
        cs->attach_in_progress++;
out_unlock:
        /*
         * Any failure, including one in the middle of the task loop above,
         * must drop the partially accumulated deadline accounting.
         */
        if (ret)
                reset_migrate_dl_data(cs);
        mutex_unlock(&cpuset_mutex);
        return ret;
}

/*
 * Undo the bookkeeping of a successful cpuset_can_attach() when the
 * migration is abandoned: drop the in-progress marker, wake anybody
 * waiting on cpuset_attach_wq once no attach is pending, and release the
 * deadline bandwidth accounted for the move.
 *
 * NOTE(review): bandwidth is freed whenever deadline tasks were counted,
 * whereas cpuset_can_attach() only reserves it when the old and new
 * effective_cpus do not intersect - confirm the two stay balanced.
 */
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
        struct cgroup_subsys_state *dst_css;
        struct cpuset *dst;

        cgroup_taskset_first(tset, &dst_css);
        dst = css_cs(dst_css);

        mutex_lock(&cpuset_mutex);

        if (--dst->attach_in_progress == 0)
                wake_up(&cpuset_attach_wq);

        if (dst->nr_migrate_dl_tasks) {
                int cpu = cpumask_any(dst->effective_cpus);

                dl_bw_free(cpu, dst->sum_migrate_dl_bw);
                reset_migrate_dl_data(dst);
        }

        mutex_unlock(&cpuset_mutex);
}

/*
 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
 * but we can't allocate it dynamically there.  Define it global and
 * allocate from cpuset_init().
 *
 * cpuset_attach_nodemask_to is the companion scratch nodemask: it is filled
 * in by cpuset_attach() and applied to each task by cpuset_attach_task().
 */
static cpumask_var_t cpus_attach;
static nodemask_t cpuset_attach_nodemask_to;

/*
 * Apply the settings of cpuset @cs to a single @task: its allowed CPUs,
 * its nodemask (taken from cpuset_attach_nodemask_to) and its spread
 * flags.  Must be called with cpuset_mutex held.
 */
static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
{
        lockdep_assert_held(&cpuset_mutex);

        /* Compute the CPU mask for the task in the shared cpus_attach. */
        if (cs == &top_cpuset)
                cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
                               subpartitions_cpus);
        else
                guarantee_online_cpus(task, cpus_attach);

        /*
         * can_attach beforehand should guarantee that this doesn't
         * fail.  TODO: have a better way to handle failure here
         */
        WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

        cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
        cpuset_update_task_spread_flags(cs, task);
}

/*
 * Complete the migration of the tasks in @tset into their destination
 * cpuset.  The source cpuset is the one cpuset_can_attach() stored in
 * cpuset_attach_old_cs.
 *
 * Under cpuset_mutex this applies the destination's CPU/memory settings to
 * every task, rebinds (and optionally migrates) the memory of each
 * threadgroup leader, moves the deadline-task accounting from the source to
 * the destination, and finally drops the attach_in_progress marker, waking
 * waiters on cpuset_attach_wq when it reaches zero.
 */
static void cpuset_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct task_struct *leader;
        struct cgroup_subsys_state *css;
        struct cpuset *cs;
        struct cpuset *oldcs = cpuset_attach_old_cs;
        bool cpus_updated, mems_updated;

        cgroup_taskset_first(tset, &css);
        cs = css_cs(css);

        lockdep_assert_cpus_held();        /* see cgroup_attach_lock() */
        mutex_lock(&cpuset_mutex);
        cpus_updated = !cpumask_equal(cs->effective_cpus,
                                      oldcs->effective_cpus);
        mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);

        /*
         * In the default hierarchy, enabling cpuset in the child cgroups
         * will trigger a number of cpuset_attach() calls with no change
         * in effective cpus and mems. In that case, we can optimize out
         * by skipping the task iteration and update.
         */
        if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
            !cpus_updated && !mems_updated) {
                /* still needed at "out" to refresh cs->old_mems_allowed */
                cpuset_attach_nodemask_to = cs->effective_mems;
                goto out;
        }

        /* nodemask handed to every task by cpuset_attach_task() */
        guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

        cgroup_taskset_for_each(task, css, tset)
                cpuset_attach_task(cs, task);

        /*
         * Change mm for all threadgroup leaders. This is expensive and may
         * sleep and should be moved outside migration path proper. Skip it
         * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
         * not set.
         */
        cpuset_attach_nodemask_to = cs->effective_mems;
        if (!is_memory_migrate(cs) && !mems_updated)
                goto out;

        cgroup_taskset_for_each_leader(leader, css, tset) {
                struct mm_struct *mm = get_task_mm(leader);

                if (mm) {
                        mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

                        /*
                         * old_mems_allowed is the same with mems_allowed
                         * here, except if this task is being moved
                         * automatically due to hotplug.  In that case
                         * @mems_allowed has been updated and is empty, so
                         * @old_mems_allowed is the right nodesets that we
                         * migrate mm from.
                         */
                        /*
                         * NOTE(review): the mm reference is only dropped
                         * here in the non-migrate branch; presumably
                         * cpuset_migrate_mm() releases it itself - confirm.
                         */
                        if (is_memory_migrate(cs))
                                cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
                                                  &cpuset_attach_nodemask_to);
                        else
                                mmput(mm);
                }
        }

out:
        cs->old_mems_allowed = cpuset_attach_nodemask_to;

        /* Transfer the deadline-task accounting from source to destination. */
        if (cs->nr_migrate_dl_tasks) {
                cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
                oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
                reset_migrate_dl_data(cs);
        }

        /* Pairs with the increment done by cpuset_can_attach(). */
        cs->attach_in_progress--;
        if (!cs->attach_in_progress)
                wake_up(&cpuset_attach_wq);

        mutex_unlock(&cpuset_mutex);
}

/*
 * The various types of files and directories in a cpuset file system.
 *
 * One of these values is stored in the 'private' field of each cftype
 * entry, and the shared read/write/show handlers switch on it to find out
 * which control file they are servicing.
 */

typedef enum {
        FILE_MEMORY_MIGRATE,
        FILE_CPULIST,
        FILE_MEMLIST,
        FILE_EFFECTIVE_CPULIST,
        FILE_EFFECTIVE_MEMLIST,
        FILE_SUBPARTS_CPULIST,
        FILE_EXCLUSIVE_CPULIST,
        FILE_EFFECTIVE_XCPULIST,
        FILE_ISOLATED_CPULIST,
        FILE_CPU_EXCLUSIVE,
        FILE_MEM_EXCLUSIVE,
        FILE_MEM_HARDWALL,
        FILE_SCHED_LOAD_BALANCE,
        FILE_PARTITION_ROOT,
        FILE_SCHED_RELAX_DOMAIN_LEVEL,
        FILE_MEMORY_PRESSURE_ENABLED,
        FILE_MEMORY_PRESSURE,
        FILE_SPREAD_PAGE,
        FILE_SPREAD_SLAB,
} cpuset_filetype_t;

/*
 * Write handler for the boolean-style cpuset control files.
 *
 * @css: css of the cpuset being written
 * @cft: control file descriptor; cft->private selects the flag
 * @val: value written by the user; zero clears, any non-zero value sets
 *
 * Runs with cpus_read_lock() and cpuset_mutex held for the duration of the
 * update.  Returns 0 on success, -ENODEV if the cpuset is no longer online,
 * -EINVAL for a file type this handler does not serve, or the error
 * returned by update_flag().
 */
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
                            u64 val)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;
        /*
         * Collapse the value to 0/1 before handing it to update_flag():
         * that function takes an int, and a plain u64 -> int conversion
         * would turn a value such as 1ULL << 32 into 0, clearing the flag
         * instead of setting it.
         */
        int turning_on = !!val;
        int retval = 0;

        cpus_read_lock();
        mutex_lock(&cpuset_mutex);
        if (!is_cpuset_online(cs)) {
                retval = -ENODEV;
                goto out_unlock;
        }

        switch (type) {
        case FILE_CPU_EXCLUSIVE:
                retval = update_flag(CS_CPU_EXCLUSIVE, cs, turning_on);
                break;
        case FILE_MEM_EXCLUSIVE:
                retval = update_flag(CS_MEM_EXCLUSIVE, cs, turning_on);
                break;
        case FILE_MEM_HARDWALL:
                retval = update_flag(CS_MEM_HARDWALL, cs, turning_on);
                break;
        case FILE_SCHED_LOAD_BALANCE:
                retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, turning_on);
                break;
        case FILE_MEMORY_MIGRATE:
                retval = update_flag(CS_MEMORY_MIGRATE, cs, turning_on);
                break;
        case FILE_MEMORY_PRESSURE_ENABLED:
                /* global switch, not a per-cpuset flag bit */
                cpuset_memory_pressure_enabled = turning_on;
                break;
        case FILE_SPREAD_PAGE:
                retval = update_flag(CS_SPREAD_PAGE, cs, turning_on);
                break;
        case FILE_SPREAD_SLAB:
                retval = update_flag(CS_SPREAD_SLAB, cs, turning_on);
                break;
        default:
                retval = -EINVAL;
                break;
        }
out_unlock:
        mutex_unlock(&cpuset_mutex);
        cpus_read_unlock();
        return retval;
}

/*
 * Write handler for signed 64-bit cpuset control files.  The only file
 * type served is FILE_SCHED_RELAX_DOMAIN_LEVEL.
 *
 * Returns -ENODEV if the cpuset is not online, -EINVAL for any other file
 * type, otherwise the result of update_relax_domain_level().
 */
static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
                            s64 val)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;
        int retval;

        cpus_read_lock();
        mutex_lock(&cpuset_mutex);

        if (!is_cpuset_online(cs))
                retval = -ENODEV;
        else if (type == FILE_SCHED_RELAX_DOMAIN_LEVEL)
                retval = update_relax_domain_level(cs, val);
        else
                retval = -EINVAL;

        mutex_unlock(&cpuset_mutex);
        cpus_read_unlock();
        return retval;
}

/*
 * Common handling for a write to a "cpus" or "mems" file.
 *
 * @of:     open kernfs file; of_cft(of)->private selects which mask is
 *          updated (FILE_CPULIST, FILE_EXCLUSIVE_CPULIST or FILE_MEMLIST)
 * @buf:    user supplied list, stripped of surrounding whitespace here
 * @nbytes: number of bytes written
 * @off:    unused
 *
 * Returns @nbytes on success.  Otherwise -ENODEV if the cpuset is not
 * online, -ENOMEM if the trial cpuset cannot be allocated, -EINVAL for an
 * unexpected file type, or the error from the selected update function.
 */
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off)
{
        struct cpuset *cs = css_cs(of_css(of));
        struct cpuset *trialcs;
        int retval = -ENODEV;

        buf = strstrip(buf);

        /*
         * CPU or memory hotunplug may leave @cs w/o any execution
         * resources, in which case the hotplug code asynchronously updates
         * configuration and transfers all tasks to the nearest ancestor
         * which can execute.
         *
         * As writes to "cpus" or "mems" may restore @cs's execution
         * resources, wait for the previously scheduled operations before
         * proceeding, so that we don't end up keep removing tasks added
         * after execution capability is restored.
         *
         * cpuset_handle_hotplug may call back into cgroup core asynchronously
         * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
         * operation like this one can lead to a deadlock through kernfs
         * active_ref protection.  Let's break the protection.  Losing the
         * protection is okay as we check whether @cs is online after
         * grabbing cpuset_mutex anyway.  This only happens on the legacy
         * hierarchies.
         */
        /* hold a css reference while the kernfs protection is broken */
        css_get(&cs->css);
        kernfs_break_active_protection(of->kn);

        cpus_read_lock();
        mutex_lock(&cpuset_mutex);
        /* retval is still -ENODEV here */
        if (!is_cpuset_online(cs))
                goto out_unlock;

        trialcs = alloc_trial_cpuset(cs);
        if (!trialcs) {
                retval = -ENOMEM;
                goto out_unlock;
        }

        switch (of_cft(of)->private) {
        case FILE_CPULIST:
                retval = update_cpumask(cs, trialcs, buf);
                break;
        case FILE_EXCLUSIVE_CPULIST:
                retval = update_exclusive_cpumask(cs, trialcs, buf);
                break;
        case FILE_MEMLIST:
                retval = update_nodemask(cs, trialcs, buf);
                break;
        default:
                retval = -EINVAL;
                break;
        }

        free_cpuset(trialcs);
out_unlock:
        /* undo everything in the reverse order of acquisition */
        mutex_unlock(&cpuset_mutex);
        cpus_read_unlock();
        kernfs_unbreak_active_protection(of->kn);
        css_put(&cs->css);
        /*
         * NOTE(review): presumably waits for mm migrations queued by the
         * update above; done after all locks are dropped - confirm.
         */
        flush_workqueue(cpuset_migrate_mm_wq);
        /* a zero retval means success: report the bytes consumed */
        return retval ?: nbytes;
}

/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map.  If read in smaller
 * chunks, there is no guarantee of atomicity.  Since the display format
 * used, list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
 */
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
        struct cpuset *cs = css_cs(seq_css(sf));
        cpuset_filetype_t type = seq_cft(sf)->private;
        int ret = 0;

        spin_lock_irq(&callback_lock);

        switch (type) {
        case FILE_CPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
                break;
        case FILE_MEMLIST:
                seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
                break;
        case FILE_EFFECTIVE_CPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
                break;
        case FILE_EFFECTIVE_MEMLIST:
                seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
                break;
        case FILE_EXCLUSIVE_CPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
                break;
        case FILE_EFFECTIVE_XCPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
                break;
        case FILE_SUBPARTS_CPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
                break;
        case FILE_ISOLATED_CPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
                break;
        default:
                ret = -EINVAL;
        }

        spin_unlock_irq(&callback_lock);
        return ret;
}

/*
 * Read handler for the unsigned cpuset control files.  Returns the current
 * value of the flag, switch or meter selected by cft->private.  Being
 * called for any other file type is a kernel bug.
 */
static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;
        u64 val = 0;

        switch (type) {
        case FILE_CPU_EXCLUSIVE:
                val = is_cpu_exclusive(cs);
                break;
        case FILE_MEM_EXCLUSIVE:
                val = is_mem_exclusive(cs);
                break;
        case FILE_MEM_HARDWALL:
                val = is_mem_hardwall(cs);
                break;
        case FILE_SCHED_LOAD_BALANCE:
                val = is_sched_load_balance(cs);
                break;
        case FILE_MEMORY_MIGRATE:
                val = is_memory_migrate(cs);
                break;
        case FILE_MEMORY_PRESSURE_ENABLED:
                val = cpuset_memory_pressure_enabled;
                break;
        case FILE_MEMORY_PRESSURE:
                val = fmeter_getrate(&cs->fmeter);
                break;
        case FILE_SPREAD_PAGE:
                val = is_spread_page(cs);
                break;
        case FILE_SPREAD_SLAB:
                val = is_spread_slab(cs);
                break;
        default:
                BUG();
        }

        return val;
}

/*
 * Read handler for signed cpuset control files.  Only
 * FILE_SCHED_RELAX_DOMAIN_LEVEL is served; being called for any other file
 * type is a kernel bug.
 */
static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;

        if (type != FILE_SCHED_RELAX_DOMAIN_LEVEL)
                BUG();

        return cs->relax_domain_level;
}

/*
 * Show handler for the partition state file.  Prints "root", "isolated" or
 * "member" for the valid states.  For an invalid partition it prints
 * "<type> invalid", followed by the reason in parentheses when
 * perr_strings[] has a string for the recorded error.  Always returns 0.
 */
static int sched_partition_show(struct seq_file *seq, void *v)
{
        struct cpuset *cs = css_cs(seq_css(seq));
        int prs = cs->partition_root_state;
        const char *reason, *kind;

        switch (prs) {
        case PRS_MEMBER:
                seq_puts(seq, "member\n");
                break;
        case PRS_ROOT:
                seq_puts(seq, "root\n");
                break;
        case PRS_ISOLATED:
                seq_puts(seq, "isolated\n");
                break;
        case PRS_INVALID_ROOT:
        case PRS_INVALID_ISOLATED:
                kind = (prs == PRS_INVALID_ROOT) ? "root" : "isolated";
                reason = perr_strings[READ_ONCE(cs->prs_err)];
                if (!reason)
                        seq_printf(seq, "%s invalid\n", kind);
                else
                        seq_printf(seq, "%s invalid (%s)\n", kind, reason);
                break;
        }

        return 0;
}

/*
 * Write handler for the partition state file.  Accepts exactly "root",
 * "member" or "isolated" (surrounding whitespace is stripped) and asks
 * update_prstate() to switch the cpuset to that state.
 *
 * Returns @nbytes when update_prstate() reports 0.  Otherwise -EINVAL for
 * an unrecognised keyword, -ENODEV if the cpuset is not online, or the
 * non-zero value returned by update_prstate().
 */
static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
                                     size_t nbytes, loff_t off)
{
        struct cpuset *cs = css_cs(of_css(of));
        int new_prs;
        int retval;

        buf = strstrip(buf);

        if (strcmp(buf, "root") == 0)
                new_prs = PRS_ROOT;
        else if (strcmp(buf, "member") == 0)
                new_prs = PRS_MEMBER;
        else if (strcmp(buf, "isolated") == 0)
                new_prs = PRS_ISOLATED;
        else
                return -EINVAL;

        css_get(&cs->css);
        cpus_read_lock();
        mutex_lock(&cpuset_mutex);

        if (is_cpuset_online(cs))
                retval = update_prstate(cs, new_prs);
        else
                retval = -ENODEV;

        mutex_unlock(&cpuset_mutex);
        cpus_read_unlock();
        css_put(&cs->css);
        return retval ?: nbytes;
}

/*
 * Control files for the legacy hierarchy (presumably the cgroup v1
 * interface - confirm against where this table is registered).
 *
 * for the common functions, 'private' gives the type of file
 * (a cpuset_filetype_t value).  List files are shown by
 * cpuset_common_seq_show() and written through cpuset_write_resmask();
 * flag files go through cpuset_read_u64()/cpuset_write_u64().  Entries
 * without a write handler are read-only.
 */

static struct cftype legacy_files[] = {
        {
                .name = "cpus",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * NR_CPUS),
                .private = FILE_CPULIST,
        },

        {
                .name = "mems",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * MAX_NUMNODES),
                .private = FILE_MEMLIST,
        },

        {
                .name = "effective_cpus",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_CPULIST,
        },

        {
                .name = "effective_mems",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_MEMLIST,
        },

        {
                .name = "cpu_exclusive",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_CPU_EXCLUSIVE,
        },

        {
                .name = "mem_exclusive",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEM_EXCLUSIVE,
        },

        {
                .name = "mem_hardwall",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEM_HARDWALL,
        },

        {
                .name = "sched_load_balance",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SCHED_LOAD_BALANCE,
        },

        {
                /* the only signed file: handled by the s64 read/write pair */
                .name = "sched_relax_domain_level",
                .read_s64 = cpuset_read_s64,
                .write_s64 = cpuset_write_s64,
                .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
        },

        {
                .name = "memory_migrate",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEMORY_MIGRATE,
        },

        {
                /* read-only: reports the frequency meter rate */
                .name = "memory_pressure",
                .read_u64 = cpuset_read_u64,
                .private = FILE_MEMORY_PRESSURE,
        },

        {
                .name = "memory_spread_page",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SPREAD_PAGE,
        },

        {
                /* obsolete, may be removed in the future */
                .name = "memory_spread_slab",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SPREAD_SLAB,
        },

        {
                .name = "memory_pressure_enabled",
                .flags = CFTYPE_ONLY_ON_ROOT,
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEMORY_PRESSURE_ENABLED,
        },

        { }        /* terminate */
};

/*
 * Control files for the default (v2) hierarchy.  This is currently a
 * minimal set; it can be expanded later on by migrating more features and
 * control files from v1.
 *
 * Entries that provide no .write handler only have a show method.
 */
static struct cftype dfl_files[] = {
        {
                .name = "cpus",
                .flags = CFTYPE_NOT_ON_ROOT,
                .private = FILE_CPULIST,
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * NR_CPUS),
        },

        {
                .name = "mems",
                .flags = CFTYPE_NOT_ON_ROOT,
                .private = FILE_MEMLIST,
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * MAX_NUMNODES),
        },

        {
                .name = "cpus.effective",
                .private = FILE_EFFECTIVE_CPULIST,
                .seq_show = cpuset_common_seq_show,
        },

        {
                .name = "mems.effective",
                .private = FILE_EFFECTIVE_MEMLIST,
                .seq_show = cpuset_common_seq_show,
        },

        {
                .name = "cpus.partition",
                .flags = CFTYPE_NOT_ON_ROOT,
                .private = FILE_PARTITION_ROOT,
                .seq_show = sched_partition_show,
                .write = sched_partition_write,
                .file_offset = offsetof(struct cpuset, partition_file),
        },

        {
                .name = "cpus.exclusive",
                .flags = CFTYPE_NOT_ON_ROOT,
                .private = FILE_EXCLUSIVE_CPULIST,
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * NR_CPUS),
        },

        {
                .name = "cpus.exclusive.effective",
                .flags = CFTYPE_NOT_ON_ROOT,
                .private = FILE_EFFECTIVE_XCPULIST,
                .seq_show = cpuset_common_seq_show,
        },

        {
                .name = "cpus.subpartitions",
                .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
                .private = FILE_SUBPARTS_CPULIST,
                .seq_show = cpuset_common_seq_show,
        },

        {
                .name = "cpus.isolated",
                .flags = CFTYPE_ONLY_ON_ROOT,
                .private = FILE_ISOLATED_CPULIST,
                .seq_show = cpuset_common_seq_show,
        },

        { }        /* terminate */
};


/**
 * cpuset_css_alloc - Allocate a cpuset css
 * @parent_css: Parent css of the control group that the new cpuset will be
 *              part of
 *
 * A NULL @parent_css yields the css of the statically defined top_cpuset.
 * Otherwise a zeroed cpuset is allocated together with its cpumasks and
 * initialized: sched load balancing enabled, empty memory node masks,
 * relax_domain_level of -1 and, on the default hierarchy, memory migration
 * enabled.
 *
 * Return: cpuset css on success, ERR_PTR(-ENOMEM) if an allocation fails.
 */
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct cpuset *new_cs;

        if (!parent_css)
                return &top_cpuset.css;

        new_cs = kzalloc(sizeof(*new_cs), GFP_KERNEL);
        if (!new_cs)
                goto err_nomem;

        if (alloc_cpumasks(new_cs, NULL))
                goto err_free_cs;

        INIT_LIST_HEAD(&new_cs->remote_sibling);
        fmeter_init(&new_cs->fmeter);
        new_cs->relax_domain_level = -1;
        nodes_clear(new_cs->mems_allowed);
        nodes_clear(new_cs->effective_mems);

        __set_bit(CS_SCHED_LOAD_BALANCE, &new_cs->flags);
        /* Set CS_MEMORY_MIGRATE for default hierarchy */
        if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
                __set_bit(CS_MEMORY_MIGRATE, &new_cs->flags);

        return &new_cs->css;

err_free_cs:
        kfree(new_cs);
err_nomem:
        return ERR_PTR(-ENOMEM);
}

static int cpuset_css_online(struct cgroup_subsys_state *css)
{
        struct cpuset *cs = css_cs(css);
        struct cpuset *parent = parent_cs(cs);
        struct cpuset *tmp_cs;
        struct cgroup_subsys_state *pos_css;

        if (!parent)
                return 0;

        cpus_read_lock();
        mutex_lock(&cpuset_mutex);

        set_bit(CS_ONLINE, &cs->flags);
        if (is_spread_page(parent))
                set_bit(CS_SPREAD_PAGE, &cs->flags);
        if (is_spread_slab(parent))
                set_bit(CS_SPREAD_SLAB, &cs->flags);

        cpuset_inc();

        spin_lock_irq(&callback_lock);
        if (is_in_v2_mode()) {
                cpumask_copy(cs->effective_cpus, parent->effective_cpus);
                cs->effective_mems = parent->effective_mems;
                cs->use_parent_ecpus = true;
                parent->child_ecpus_count++;
        }

        /*
         * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
         */
        if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
            !is_sched_load_balance(parent))
                clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);

        spin_unlock_irq(&callback_lock);

        if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
                goto out_unlock;

        /*
         * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
         * set.  This flag handling is implemented in cgroup core for
         * historical reasons - the flag may be specified during mount.
         *
         * Currently, if any sibling cpusets have exclusive cpus or mem, we
         * refuse to clone the configuration - thereby refusing the task to
         * be entered, and as a result refusing the sys_unshare() or
         * clone() which initiated it.  If this becomes a problem for some
         * users who wish to allow that scenario, then this could be
         * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
         * (and likewise for mems) to the new cgroup.
         */
        rcu_read_lock();
        cpuset_for_each_child(tmp_cs, pos_css, parent) {
                if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
                        rcu_read_unlock();
                        goto out_unlock;
                }
        }
        rcu_read_unlock();

        spin_lock_irq(&callback_lock);
        cs->mems_allowed = parent->mems_allowed;
        cs->effective_mems = parent->mems_allowed;
        cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
        cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
        spin_unlock_irq(&callback_lock);
out_unlock:
        mutex_unlock(&cpuset_mutex);
        cpus_read_unlock();
        return 0;
}

/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call rebuild_sched_domains_locked(). That is not needed
 * in the default hierarchy where only changes in partition
 * will cause repartitioning.
 *
 * If the cpuset has the 'sched.partition' flag enabled, simulate
 * turning 'sched.partition" off.
 */

static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
        struct cpuset *cs = css_cs(css);

        cpus_read_lock();
        mutex_lock(&cpuset_mutex);

        if (is_partition_valid(cs))
                update_prstate(cs, 0);

        if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
            is_sched_load_balance(cs))
                update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

        if (cs->use_parent_ecpus) {
                struct cpuset *parent = parent_cs(cs);

                cs->use_parent_ecpus = false;
                parent->child_ecpus_count--;
        }

        cpuset_dec();
        clear_bit(CS_ONLINE, &cs->flags);

        mutex_unlock(&cpuset_mutex);
        cpus_read_unlock();
}

/* Release the cpuset that embeds @css by handing it to free_cpuset(). */
static void cpuset_css_free(struct cgroup_subsys_state *css)
{
        free_cpuset(css_cs(css));
}

/*
 * cpuset_bind - reset the top cpuset's configured masks
 * @root_css: not used
 *
 * In v2 mode top_cpuset's cpus_allowed and effective_xcpus are set to all
 * possible cpus and mems_allowed to all possible nodes.  Otherwise the
 * configured masks are made equal to the current effective ones.
 */
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
        mutex_lock(&cpuset_mutex);
        spin_lock_irq(&callback_lock);

        if (!is_in_v2_mode()) {
                top_cpuset.mems_allowed = top_cpuset.effective_mems;
                cpumask_copy(top_cpuset.cpus_allowed,
                             top_cpuset.effective_cpus);
        } else {
                top_cpuset.mems_allowed = node_possible_map;
                cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
                cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);
        }

        spin_unlock_irq(&callback_lock);
        mutex_unlock(&cpuset_mutex);
}

/*
 * cpuset_can_fork - may @task be forked into the cpuset of @cset?
 *
 * Forking into the cpuset of the current task is always allowed.  In case
 * the child is cloned into a different cpuset, additional checks are done
 * to see if the move is allowed; on success the target cpuset's
 * attach_in_progress count is raised.
 *
 * Returns 0 if the fork may proceed, otherwise the error reported by the
 * first failing check.
 */
static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
{
        struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
        bool into_other_cs;
        int err;

        rcu_read_lock();
        into_other_cs = (task_cs(current) != cs);
        rcu_read_unlock();

        if (!into_other_cs)
                return 0;

        lockdep_assert_held(&cgroup_mutex);
        mutex_lock(&cpuset_mutex);

        /*
         * Check to see if task is allowed in the cpuset.  Each check is
         * only run when all the previous ones have passed.
         */
        err = cpuset_can_attach_check(cs);
        if (!err)
                err = task_can_attach(task);
        if (!err)
                err = security_task_setscheduler(task);

        /*
         * Mark attach is in progress.  This makes validate_change() fail
         * changes which zero cpus/mems_allowed.
         */
        if (!err)
                cs->attach_in_progress++;

        mutex_unlock(&cpuset_mutex);
        return err;
}

/*
 * cpuset_cancel_fork - back out of a fork accepted by cpuset_can_fork()
 *
 * Nothing was accounted when the child stays in the current task's cpuset.
 * Otherwise drop the target cpuset's attach_in_progress count and wake up
 * waiters on cpuset_attach_wq once it reaches zero.
 */
static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
{
        struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
        bool into_other_cs;

        rcu_read_lock();
        into_other_cs = (task_cs(current) != cs);
        rcu_read_unlock();

        if (!into_other_cs)
                return;

        mutex_lock(&cpuset_mutex);
        if (--cs->attach_in_progress == 0)
                wake_up(&cpuset_attach_wq);
        mutex_unlock(&cpuset_mutex);
}

/*
 * Make sure the new task conform to the current state of its parent,
 * which could have been changed by cpuset just after it inherits the
 * state from the parent and before it sits on the cgroup's task list.
 *
 * A task forked into a different cpuset (CLONE_INTO_CGROUP) is attached
 * to that cpuset instead, which also completes the attach_in_progress
 * accounting started in cpuset_can_fork().
 */
static void cpuset_fork(struct task_struct *task)
{
        struct cpuset *cs;
        bool into_other_cs;

        rcu_read_lock();
        cs = task_cs(task);
        into_other_cs = (task_cs(current) != cs);
        rcu_read_unlock();

        if (into_other_cs) {
                /* CLONE_INTO_CGROUP */
                mutex_lock(&cpuset_mutex);
                guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
                cpuset_attach_task(cs, task);

                if (--cs->attach_in_progress == 0)
                        wake_up(&cpuset_attach_wq);

                mutex_unlock(&cpuset_mutex);
                return;
        }

        /* Same cpuset as the parent: nothing is copied in the top cpuset. */
        if (cs != &top_cpuset) {
                set_cpus_allowed_ptr(task, current->cpus_ptr);
                task->mems_allowed = current->mems_allowed;
        }
}

/* cgroup subsystem descriptor wiring up the cpuset callbacks and files. */
struct cgroup_subsys cpuset_cgrp_subsys = {
        /* css lifecycle */
        .css_alloc = cpuset_css_alloc,
        .css_online = cpuset_css_online,
        .css_offline = cpuset_css_offline,
        .css_free = cpuset_css_free,

        /* task migration */
        .can_attach = cpuset_can_attach,
        .cancel_attach = cpuset_cancel_attach,
        .attach = cpuset_attach,
        .post_attach = cpuset_post_attach,

        .bind = cpuset_bind,

        /* fork hooks */
        .can_fork = cpuset_can_fork,
        .cancel_fork = cpuset_cancel_fork,
        .fork = cpuset_fork,

        /* control files for the v1 and default hierarchies */
        .legacy_cftypes = legacy_files,
        .dfl_cftypes = dfl_files,

        .early_init = true,
        .threaded = true,
};

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset.  All cpumasks are allocated first
 * (any allocation failure is fatal), then the top cpuset is set up to
 * contain every cpu and every memory node.
 *
 * Return: always 0.
 **/

int __init cpuset_init(void)
{
        /* cpumasks of the top cpuset */
        BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
        BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
        BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
        BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));

        /* file-wide cpumasks; the zalloc'ed ones start out empty */
        BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
        BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
        BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));

        cpumask_setall(top_cpuset.cpus_allowed);
        cpumask_setall(top_cpuset.effective_cpus);
        cpumask_setall(top_cpuset.effective_xcpus);
        cpumask_setall(top_cpuset.exclusive_cpus);

        nodes_setall(top_cpuset.mems_allowed);
        nodes_setall(top_cpuset.effective_mems);

        INIT_LIST_HEAD(&remote_children);
        fmeter_init(&top_cpuset.fmeter);

        return 0;
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 *
 * A failed transfer is only reported with pr_err().
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
        struct cpuset *dest;

        /*
         * Walk up to the next-highest ancestor that has both cpus and
         * mems (top cpuset has online cpus, so can't be empty).
         */
        for (dest = parent_cs(cs);
             cpumask_empty(dest->cpus_allowed) ||
             nodes_empty(dest->mems_allowed);
             dest = parent_cs(dest))
                ;

        if (!cgroup_transfer_tasks(dest->css.cgroup, cs->css.cgroup))
                return;

        pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
        pr_cont_cgroup_name(cs->css.cgroup);
        pr_cont("\n");
}

static void cpuset_migrate_tasks_workfn(struct work_struct *work)
{
        struct cpuset_remove_tasks_struct *s;

        s = container_of(work, struct cpuset_remove_tasks_struct, work);
        remove_tasks_in_empty_cpuset(s->cs);
        css_put(&s->cs->css);
        kfree(s);
}

/*
 * hotplug_update_tasks_legacy - apply hotplug-adjusted masks outside v2 mode
 * @cs: cpuset being updated
 * @new_cpus: new cpu mask, stored as both cpus_allowed and effective_cpus
 * @new_mems: new node mask, stored as both mems_allowed and effective_mems
 * @cpus_updated: the cpu mask changed
 * @mems_updated: the node mask changed
 *
 * If @cs is left without cpus or without mems while its cgroup still has
 * populated csets, the migration of its tasks is deferred to a work item.
 */
static void
hotplug_update_tasks_legacy(struct cpuset *cs,
                            struct cpumask *new_cpus, nodemask_t *new_mems,
                            bool cpus_updated, bool mems_updated)
{
        struct cpuset_remove_tasks_struct *req;
        bool is_empty;

        spin_lock_irq(&callback_lock);
        cpumask_copy(cs->cpus_allowed, new_cpus);
        cpumask_copy(cs->effective_cpus, new_cpus);
        cs->mems_allowed = *new_mems;
        cs->effective_mems = *new_mems;
        spin_unlock_irq(&callback_lock);

        /*
         * Don't call update_tasks_cpumask() if the cpuset becomes empty,
         * as the tasks will be migrated to an ancestor.
         */
        if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
                update_tasks_cpumask(cs, new_cpus);
        if (mems_updated && !nodes_empty(cs->mems_allowed))
                update_tasks_nodemask(cs);

        is_empty = cpumask_empty(cs->cpus_allowed) ||
                   nodes_empty(cs->mems_allowed);

        /* Done unless a populated cpuset ran empty and can be pinned. */
        if (!is_empty || !cs->css.cgroup->nr_populated_csets ||
            !css_tryget_online(&cs->css))
                return;

        /*
         * Move tasks to the nearest ancestor with execution resources,
         * This is full cgroup operation which will also call back into
         * cpuset. Execute it asynchronously using workqueue.
         */
        req = kzalloc(sizeof(*req), GFP_KERNEL);
        if (WARN_ON_ONCE(!req)) {
                css_put(&cs->css);
                return;
        }

        req->cs = cs;
        INIT_WORK(&req->work, cpuset_migrate_tasks_workfn);
        schedule_work(&req->work);
}

/*
 * hotplug_update_tasks - apply hotplug-adjusted effective masks in v2 mode
 * @cs: cpuset being updated
 * @new_cpus: new effective cpus; may be overwritten with the parent's
 * @new_mems: new effective mems; may be overwritten with the parent's
 * @cpus_updated: the effective cpus changed
 * @mems_updated: the effective mems changed
 *
 * An empty mask is replaced by the parent's effective mask before being
 * stored, except that a valid partition root keeps empty effective cpus.
 */
static void
hotplug_update_tasks(struct cpuset *cs,
                     struct cpumask *new_cpus, nodemask_t *new_mems,
                     bool cpus_updated, bool mems_updated)
{
        struct cpuset *parent = parent_cs(cs);

        /* A partition root is allowed to have empty effective cpus */
        if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
                cpumask_copy(new_cpus, parent->effective_cpus);

        if (nodes_empty(*new_mems))
                *new_mems = parent->effective_mems;

        spin_lock_irq(&callback_lock);
        cs->effective_mems = *new_mems;
        cpumask_copy(cs->effective_cpus, new_cpus);
        spin_unlock_irq(&callback_lock);

        if (cpus_updated)
                update_tasks_cpumask(cs, new_cpus);
        if (mems_updated)
                update_tasks_nodemask(cs);
}

/*
 * When set, cpuset_handle_hotplug() rebuilds the sched domains even if it
 * did not detect a cpu change itself; it clears the flag when doing so.
 */
static bool force_rebuild;

/* Request a sched domain rebuild on the next hotplug handling pass. */
void cpuset_force_rebuild(void)
{
        force_rebuild = true;
}

/**
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
 * @cs: cpuset in interest
 * @tmp: the tmpmasks structure pointer; when NULL the partition handling
 *       below is skipped
 *
 * Recompute @cs's effective cpu and mem masks from @cs and its parent and,
 * if they differ from the current ones, update @cs and its tasks
 * accordingly.  Outside v2 mode, if @cs ends up with no CPU or memory,
 * all its tasks are moved to the nearest ancestor with both resources.
 */
static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{
        /*
         * Static scratch masks; they are only accessed between the
         * mutex_lock() and mutex_unlock() of cpuset_mutex below.
         */
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated;
        bool mems_updated;
        bool remote;
        int partcmd = -1;        /* negative: no partition command to run */
        struct cpuset *parent;
retry:
        /* Sleep until no attach into @cs is in flight. */
        wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);

        mutex_lock(&cpuset_mutex);

        /*
         * We have raced with task attaching. We wait until attaching
         * is finished, so we won't attach a task to an empty cpuset.
         */
        if (cs->attach_in_progress) {
                mutex_unlock(&cpuset_mutex);
                goto retry;
        }

        parent = parent_cs(cs);
        compute_effective_cpumask(&new_cpus, cs, parent);
        nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);

        /* No scratch masks, or partition_root_state is 0: plain update. */
        if (!tmp || !cs->partition_root_state)
                goto update_tasks;

        /*
         * Compute effective_cpus for valid partition root, may invalidate
         * child partition roots if necessary.
         */
        remote = is_remote_partition(cs);
        if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))
                compute_partition_effective_cpumask(cs, &new_cpus);

        /*
         * A populated remote partition left without any cpu is disabled
         * and its effective cpus are recomputed as for a regular cpuset.
         */
        if (remote && cpumask_empty(&new_cpus) &&
            partition_is_populated(cs, NULL)) {
                remote_partition_disable(cs, tmp);
                compute_effective_cpumask(&new_cpus, cs, parent);
                remote = false;
                cpuset_force_rebuild();
        }

        /*
         * Force the partition to become invalid if either one of
         * the following conditions hold:
         * 1) empty effective cpus but not valid empty partition.
         * 2) parent is invalid or doesn't grant any cpus to child
         *    partitions.
         */
        if (is_local_partition(cs) && (!is_partition_valid(parent) ||
                                tasks_nocpu_error(parent, cs, &new_cpus)))
                partcmd = partcmd_invalidate;
        /*
         * On the other hand, an invalid partition root may be transitioned
         * back to a regular one.
         */
        else if (is_partition_valid(parent) && is_partition_invalid(cs))
                partcmd = partcmd_update;

        if (partcmd >= 0) {
                update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
                if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
                        compute_partition_effective_cpumask(cs, &new_cpus);
                        cpuset_force_rebuild();
                }
        }

update_tasks:
        cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
        mems_updated = !nodes_equal(new_mems, cs->effective_mems);
        if (!cpus_updated && !mems_updated)
                goto unlock;        /* Hotplug doesn't affect this cpuset */

        if (mems_updated)
                check_insane_mems_config(&new_mems);

        if (is_in_v2_mode())
                hotplug_update_tasks(cs, &new_cpus, &new_mems,
                                     cpus_updated, mems_updated);
        else
                hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
                                            cpus_updated, mems_updated);

unlock:
        mutex_unlock(&cpuset_mutex);
}

/**
 * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no affect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 *
 * CPU / memory hotplug is handled synchronously.
 *
 * Must be called with the cpu hotplug lock held (see
 * lockdep_assert_cpus_held() below).
 */
static void cpuset_handle_hotplug(void)
{
        /* Static scratch masks, filled in under cpuset_mutex below. */
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated, mems_updated;
        bool on_dfl = is_in_v2_mode();
        struct tmpmasks tmp, *ptmp = NULL;

        /*
         * Scratch masks are only set up in v2 mode.  If the allocation
         * fails, ptmp stays NULL and cpuset_hotplug_update_tasks() skips
         * its partition handling.
         */
        if (on_dfl && !alloc_cpumasks(NULL, &tmp))
                ptmp = &tmp;

        lockdep_assert_cpus_held();
        mutex_lock(&cpuset_mutex);

        /* fetch the available cpus/mems and find out which changed how */
        cpumask_copy(&new_cpus, cpu_active_mask);
        new_mems = node_states[N_MEMORY];

        /*
         * If subpartitions_cpus is populated, it is likely that the check
         * below will produce a false positive on cpus_updated when the cpu
         * list isn't changed. It is extra work, but it is better to be safe.
         */
        cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||
                       !cpumask_empty(subpartitions_cpus);
        mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);

        /*
         * In the rare case that hotplug removes all the cpus in
         * subpartitions_cpus, we assumed that cpus are updated.
         */
        if (!cpus_updated && top_cpuset.nr_subparts)
                cpus_updated = true;

        /* For v1, synchronize cpus_allowed to cpu_active_mask */
        if (cpus_updated) {
                spin_lock_irq(&callback_lock);
                if (!on_dfl)
                        cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
                /*
                 * Make sure that CPUs allocated to child partitions
                 * do not show up in effective_cpus. If no CPU is left,
                 * we clear the subpartitions_cpus & let the child partitions
                 * fight for the CPUs again.
                 */
                if (!cpumask_empty(subpartitions_cpus)) {
                        if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
                                top_cpuset.nr_subparts = 0;
                                cpumask_clear(subpartitions_cpus);
                        } else {
                                cpumask_andnot(&new_cpus, &new_cpus,
                                               subpartitions_cpus);
                        }
                }
                cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
                spin_unlock_irq(&callback_lock);
                /* we don't mess with cpumasks of tasks in top_cpuset */
        }

        /* synchronize mems_allowed to N_MEMORY */
        if (mems_updated) {
                spin_lock_irq(&callback_lock);
                if (!on_dfl)
                        top_cpuset.mems_allowed = new_mems;
                top_cpuset.effective_mems = new_mems;
                spin_unlock_irq(&callback_lock);
                update_tasks_nodemask(&top_cpuset);
        }

        mutex_unlock(&cpuset_mutex);

        /* if cpus or mems changed, we need to propagate to descendants */
        if (cpus_updated || mems_updated) {
                struct cpuset *cs;
                struct cgroup_subsys_state *pos_css;

                rcu_read_lock();
                cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
                        if (cs == &top_cpuset || !css_tryget_online(&cs->css))
                                continue;
                        /*
                         * cpuset_hotplug_update_tasks() sleeps (mutex,
                         * wait_event), so leave the RCU read section; the
                         * reference taken above is held across the call.
                         */
                        rcu_read_unlock();

                        cpuset_hotplug_update_tasks(cs, ptmp);

                        rcu_read_lock();
                        css_put(&cs->css);
                }
                rcu_read_unlock();
        }

        /* rebuild sched domains if cpus_allowed has changed */
        if (cpus_updated || force_rebuild) {
                force_rebuild = false;
                rebuild_sched_domains_cpuslocked();
        }

        /* ptmp may be NULL here (not v2 mode, or allocation failed) */
        free_cpumasks(NULL, ptmp);
}

/*
 * cpuset_update_active_cpus - propagate a cpu hotplug change to cpusets
 *
 * Thin wrapper around cpuset_handle_hotplug().
 */
void cpuset_update_active_cpus(void)
{
        /*
         * We're inside cpu hotplug critical region, which is what
         * cpuset_handle_hotplug() asserts.  The hotplug processing is
         * done synchronously here; it is not deferred to a work item.
         */
        cpuset_handle_hotplug();
}

/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 *
 * This is the memory hotplug notifier callback registered in
 * cpuset_init_smp().  @self, @action and @arg are not examined: every
 * notification triggers cpuset_handle_hotplug().  Always returns NOTIFY_OK.
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
                                unsigned long action, void *arg)
{
        cpuset_handle_hotplug();
        return NOTIFY_OK;
}

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 */
void __init cpuset_init_smp(void)
{
        /*
         * cpus_allowed/mems_allowed set to v2 values in the initial
         * cpuset_bind() call will be reset to v1 values in another
         * cpuset_bind() call when v1 cpuset is mounted.
         */
        top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;

        /* Effective masks start from the active cpus and N_MEMORY nodes. */
        cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
        top_cpuset.effective_mems = node_states[N_MEMORY];

        /* From now on memory hotplug events reach cpuset_handle_hotplug(). */
        hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);

        /* Failing to create the ordered workqueue is fatal. */
        cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
        BUG_ON(!cpuset_migrate_mm_wq);
}

/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * tasks cpuset, except when the task is in the top cpuset.
 **/

void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
        unsigned long flags;
        struct cpuset *cs;

        spin_lock_irqsave(&callback_lock, flags);
        rcu_read_lock();

        cs = task_cs(tsk);
        if (cs != &top_cpuset)
                guarantee_online_cpus(tsk, pmask);
        /*
         * Tasks in the top cpuset won't get update to their cpumasks
         * when a hotplug online/offline event happens. So we include all
         * offline cpus in the allowed cpu list.
         */
        if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
                const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);

                /*
                 * We first exclude cpus allocated to partitions. If there is no
                 * allowable online cpu left, we fall back to all possible cpus.
                 */
                cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
                if (!cpumask_intersects(pmask, cpu_online_mask))
                        cpumask_copy(pmask, possible_mask);
        }

        rcu_read_unlock();
        spin_unlock_irqrestore(&callback_lock, flags);
}

/**
 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
 * @tsk: pointer to task_struct with which the scheduler is struggling
 *
 * Description: In the case that the scheduler cannot find an allowed cpu in
 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
 * which will not contain a sane cpumask during cases such as cpu hotplugging.
 * This is the absolute last resort for the scheduler and it is only used if
 * _every_ other avenue has been traveled.
 *
 * The fallback is therefore only applied in v2 mode, and only when the
 * cpuset's cpus_allowed is a subset of the task's possible cpus.
 *
 * Returns true if the affinity of @tsk was changed, false otherwise.
 **/

bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
        const struct cpumask *possible = task_cpu_possible_mask(tsk);
        const struct cpumask *allowed;
        bool changed;

        rcu_read_lock();
        allowed = task_cs(tsk)->cpus_allowed;
        changed = is_in_v2_mode() && cpumask_subset(allowed, possible);
        if (changed)
                do_set_cpus_allowed(tsk, allowed);
        rcu_read_unlock();

        /*
         * We own tsk->cpus_allowed, nobody can change it under us.
         *
         * But we used cs && cs->cpus_allowed lockless and thus can
         * race with cgroup_attach_task() or update_cpumask() and get
         * the wrong tsk->cpus_allowed. However, both cases imply the
         * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
         * which takes task_rq_lock().
         *
         * If we are called after it dropped the lock we must see all
         * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
         * set any mask even if it is not right from task_cs() pov,
         * the pending set_cpus_allowed_ptr() will fix things.
         *
         * select_fallback_rq() will fix things up and set cpu_possible_mask
         * if required.
         */
        return changed;
}

/* Boot-time helper: allow the current task to use every memory node. */
void __init cpuset_init_current_mems_allowed(void)
{
        nodes_setall(current->mems_allowed);
}

/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * tasks cpuset.
 *
 * The mask is computed by guarantee_online_mems() with callback_lock held
 * and returned by value.
 **/

nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
        unsigned long irqflags;
        nodemask_t allowed;

        spin_lock_irqsave(&callback_lock, irqflags);
        rcu_read_lock();
        guarantee_online_mems(task_cs(tsk), &allowed);
        rcu_read_unlock();
        spin_unlock_irqrestore(&callback_lock, irqflags);

        return allowed;
}

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 *
 * Return: the result of nodes_intersects(), i.e. non-zero if @nodemask and
 * current->mems_allowed have at least one node in common, zero otherwise.
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
        return nodes_intersects(*nodemask, current->mems_allowed);
}

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 *
 * The search starts at @cs itself, so @cs is returned when it is already
 * mem_exclusive or mem_hardwall.
 */
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
        struct cpuset *up;

        while (!is_mem_exclusive(cs) && !is_mem_hardwall(cs)) {
                up = parent_cs(cs);
                if (!up)
                        break;        /* reached the root */
                cs = up;
        }

        return cs;
}

/*
 * cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * Decision order, first match wins:
 *   - in interrupt context: yes;
 *   - @node is set in current's mems_allowed: yes;
 *   - current is an OOM victim (has access to memory reserves): yes;
 *   - the request carries __GFP_HARDWALL: no;
 *   - current is exiting (PF_EXITING): yes;
 *   - otherwise: yes only if @node is set in the mems_allowed of the
 *     nearest hardwalled ancestor of current's cpuset.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current task's cpuset
 * unless the task has been OOM killed.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_lock.  The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current task's mems_allowed came up empty on the first pass over
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_lock.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * The second pass through get_page_from_freelist() doesn't even call
 * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
 * in alloc_flags.  That logic and the checks below have the combined
 * effect that:
 *        in_interrupt - any node ok (current task context irrelevant)
 *        GFP_ATOMIC   - any node ok
 *        tsk_is_oom_victim   - any node ok
 *        GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *        GFP_USER     - only nodes in current task's mems_allowed ok.
 */
bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
        struct cpuset *wall;        /* nearest hardwalled ancestor */
        unsigned long irq_flags;
        bool ok;

        /* Cheap checks that need no locking. */
        if (in_interrupt() || node_isset(node, current->mems_allowed))
                return true;

        /*
         * Tasks that have been OOM killed have access to memory reserves
         * and may take memory from anywhere.
         */
        if (unlikely(tsk_is_oom_victim(current)))
                return true;

        /* A hardwall request never looks beyond current's mems_allowed. */
        if (gfp_mask & __GFP_HARDWALL)
                return false;

        /* Let a dying task have memory. */
        if (current->flags & PF_EXITING)
                return true;

        /* Not hardwall and node outside mems_allowed: scan up cpusets. */
        spin_lock_irqsave(&callback_lock, irq_flags);
        rcu_read_lock();

        wall = nearest_hardwall_ancestor(task_cs(current));
        ok = node_isset(node, wall->mems_allowed);

        rcu_read_unlock();
        spin_unlock_irqrestore(&callback_lock, irq_flags);

        return ok;
}

/**
 * cpuset_spread_node() - On which node to begin search for a page
 * @rotor: round robin rotor; read, advanced and written back
 *
 * Advances *@rotor to the node that next_node_in() yields for the
 * current task's mems_allowed, stores it back and returns it.
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the task's mems_allowed nodes.
 *
 * We don't have to worry about the returned node being offline
 * because "it can't happen", and even if it did, it would be ok.
 *
 * The routines calling guarantee_online_mems() are careful to
 * only set nodes in task->mems_allowed that are online.  So it
 * should not be possible for the following code to return an
 * offline node.  But if it did, that would be ok, as this routine
 * is not returning the node where the allocation must be, only
 * the node where the search should start.  The zonelist passed to
 * __alloc_pages() will include all nodes.  If the slab allocator
 * is passed an offline node, it will fall back to the local node.
 * See kmem_cache_alloc_node().
 */
static int cpuset_spread_node(int *rotor)
{
        int next = next_node_in(*rotor, current->mems_allowed);

        *rotor = next;
        return next;
}

/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 *
 * Uses current's cpuset_mem_spread_rotor.  If the rotor is still
 * NUMA_NO_NODE it is first seeded with node_random() over current's
 * mems_allowed; the rotor is then advanced by cpuset_spread_node().
 *
 * Return: the node cpuset_spread_node() selected.
 */
int cpuset_mem_spread_node(void)
{
        int *rotor = &current->cpuset_mem_spread_rotor;

        if (*rotor == NUMA_NO_NODE)
                *rotor = node_random(&current->mems_allowed);

        return cpuset_spread_node(rotor);
}

/**
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 *
 * Uses current's cpuset_slab_spread_rotor.  If the rotor is still
 * NUMA_NO_NODE it is first seeded with node_random() over current's
 * mems_allowed; the rotor is then advanced by cpuset_spread_node().
 *
 * Return: the node cpuset_spread_node() selected.
 */
int cpuset_slab_spread_node(void)
{
        int *rotor = &current->cpuset_slab_spread_rotor;

        if (*rotor == NUMA_NO_NODE)
                *rotor = node_random(&current->mems_allowed);

        return cpuset_spread_node(rotor);
}
/* Note: this exports cpuset_mem_spread_node(), not the slab variant above. */
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);

/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Compares the two tasks' own mems_allowed masks with
 * nodes_intersects().  Used by the OOM killer to determine if
 * one of the tasks' memory usage might impact the memory available
 * to the other.
 *
 * Return: the result of nodes_intersects() on the two masks.
 **/

int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
                                   const struct task_struct *tsk2)
{
        return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}

/**
 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
 *
 * Description: Continues the current kernel log line (pr_cont) with
 * ",cpuset=<cgroup name>,mems_allowed=<node list>", using the name of
 * current's cpuset cgroup and current's cached mems_allowed mask.
 * The whole sequence runs inside an RCU read-side section.
 */
void cpuset_print_current_mems_allowed(void)
{
        struct cgroup *cs_cgroup;

        rcu_read_lock();

        pr_cont(",cpuset=");
        cs_cgroup = task_cs(current)->css.cgroup;
        pr_cont_cgroup_name(cs_cgroup);

        pr_cont(",mems_allowed=%*pbl",
                nodemask_pr_args(&current->mems_allowed));

        rcu_read_unlock();
}

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 *
 * Marked __read_mostly: the flag is expected to be read far more
 * often than it is written.
 */

int cpuset_memory_pressure_enabled __read_mostly;

/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 *
 * This function records one event on the fmeter of current's cpuset;
 * the cpuset is looked up inside an RCU read-side section.
 */

void __cpuset_memory_pressure_bump(void)
{
        rcu_read_lock();
        fmeter_markevent(&task_cs(current)->fmeter);
        rcu_read_unlock();
}

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print @tsk's cpuset cgroup path, followed by a newline, into @m.
 *  - Used for /proc/<pid>/cpuset.
 *  - The path is resolved relative to the cgroup namespace of the
 *    reading task (current), not of @tsk.
 *  - The cpuset css of @tsk is pinned with task_get_css() for the
 *    duration of the path lookup and released with css_put().
 *  - @ns and @pid are not used here.
 *
 * Returns 0 on success, -ENOMEM if the path buffer cannot be allocated,
 * -ENAMETOOLONG if cgroup_path_ns() reports -E2BIG, or any other negative
 * value cgroup_path_ns() returns.
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk)
{
        struct cgroup_subsys_state *css;
        char *path;
        int ret;

        path = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!path)
                return -ENOMEM;

        css = task_get_css(tsk, cpuset_cgrp_id);
        ret = cgroup_path_ns(css->cgroup, path, PATH_MAX,
                             current->nsproxy->cgroup_ns);
        css_put(css);

        /* Report an over-long path with the conventional errno. */
        if (ret == -E2BIG)
                ret = -ENAMETOOLONG;

        if (ret >= 0) {
                seq_puts(m, path);
                seq_putc(m, '\n');
                ret = 0;
        }

        kfree(path);
        return ret;
}
#endif /* CONFIG_PROC_PID_CPUSET */

/*
 * Display task mems_allowed in /proc/<pid>/status file.
 *
 * The mask is written to @m twice: as "Mems_allowed" using the "%*pb"
 * bitmap format and as "Mems_allowed_list" using the "%*pbl" bitmap-list
 * format.
 */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
        seq_printf(m, "Mems_allowed:\t%*pb\n",
                   nodemask_pr_args(&task->mems_allowed));
        seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
                   nodemask_pr_args(&task->mems_allowed));
}



































































































   25 































































































































   25 









































































































































   24 



   26 























   27 








   20 
   25 














   25 

































   23 


   23 













   26 












   20 











   25 
   24 









   23 











   23 



   24 































































   24 










   25 














































   27 




   24 
















   25 






   25 






    1 







   23 

























   24 















   25 



























   26 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/objtool.h>
#include <linux/module.h>
#include <linux/sort.h>
#include <asm/ptrace.h>
#include <asm/stacktrace.h>
#include <asm/unwind.h>
#include <asm/orc_types.h>
#include <asm/orc_lookup.h>
#include <asm/orc_header.h>

ORC_HEADER;

/*
 * Print a KERN_WARNING message via printk_deferred_once().  The text
 * "WARNING: " is prepended to @fmt here.
 */
#define orc_warn(fmt, ...) \
        printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__)

/*
 * Like orc_warn(), but only when unwinding the current task and no unwind
 * error has been recorded yet.  Expects a variable named 'state'
 * (struct unwind_state *) in the expanding scope.  When unwind_debug is
 * set, the first warning at each expansion site also calls unwind_dump().
 */
#define orc_warn_current(args...)                                        \
({                                                                        \
        static bool dumped_before;                                        \
        if (state->task == current && !state->error) {                        \
                orc_warn(args);                                                \
                if (unwind_debug && !dumped_before) {                        \
                        dumped_before = true;                                \
                        unwind_dump(state);                                \
                }                                                        \
        }                                                                \
})

/*
 * Start/stop bounds of the built-in ORC tables: the IP table holds one
 * int per entry (decoded by orc_ip()), the unwind table one struct
 * orc_entry per entry.  Defined outside this file (presumably by the
 * linker script -- TODO confirm).
 */
extern int __start_orc_unwind_ip[];
extern int __stop_orc_unwind_ip[];
extern struct orc_entry __start_orc_unwind[];
extern struct orc_entry __stop_orc_unwind[];

/* Set at the end of a successful unwind_init(). */
static bool orc_init __ro_after_init;
/* Set by the "unwind_debug" early parameter. */
static bool unwind_debug __ro_after_init;
/* Number of entries in orc_lookup[]; computed in unwind_init(). */
static unsigned int lookup_num_blocks __ro_after_init;

/*
 * Handler for the "unwind_debug" early parameter: turns on unwind_debug.
 * The parameter value in @str is ignored.  Always returns 0.
 */
static int __init unwind_debug_cmdline(char *str)
{
        unwind_debug = true;

        return 0;
}
early_param("unwind_debug", unwind_debug_cmdline);

static void unwind_dump(struct unwind_state *state)
{
        static bool dumped_before;
        unsigned long word, *sp;
        struct stack_info stack_info = {0};
        unsigned long visit_mask = 0;

        if (dumped_before)
                return;

        dumped_before = true;

        printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n",
                        state->stack_info.type, state->stack_info.next_sp,
                        state->stack_mask, state->graph_idx);

        for (sp = __builtin_frame_address(0); sp;
             sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
                if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
                        break;

                for (; sp < stack_info.end; sp++) {

                        word = READ_ONCE_NOCHECK(*sp);

                        printk_deferred("%0*lx: %0*lx (%pB)\n", BITS_PER_LONG/4,
                                        (unsigned long)sp, BITS_PER_LONG/4,
                                        word, (void *)word);
                }
        }
}

/*
 * Decode one IP-table entry.  Each entry stores a signed offset relative
 * to its own address; the decoded value is that address plus the offset.
 */
static inline unsigned long orc_ip(const int *ip)
{
        unsigned long self = (unsigned long)ip;

        return self + *ip;
}

/*
 * __orc_find() - binary search of an ORC IP table.
 * @ip_table:    table of self-relative IP entries (see orc_ip())
 * @u_table:     table of ORC entries parallel to @ip_table
 * @num_entries: number of entries in both tables
 * @ip:          address to look up
 *
 * Returns the @u_table entry paired with the rightmost @ip_table entry
 * whose decoded address is <= @ip.  If no entry qualifies, the first
 * entry of @u_table is returned.  Returns NULL only for an empty table.
 */
static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table,
                                    unsigned int num_entries, unsigned long ip)
{
        int *lo = ip_table;
        int *hi = ip_table + num_entries - 1;
        int *best = ip_table;

        if (!num_entries)
                return NULL;

        /*
         * Do a binary range search to find the rightmost duplicate of a given
         * starting address.  Some entries are section terminators which are
         * "weak" entries for ensuring there are no gaps.  They should be
         * ignored when they conflict with a real entry.
         */
        while (lo <= hi) {
                int *mid = lo + ((hi - lo) / 2);

                if (orc_ip(mid) > ip) {
                        hi = mid - 1;
                        continue;
                }

                /* Candidate; keep looking to the right for a later one. */
                best = mid;
                lo = mid + 1;
        }

        return u_table + (best - ip_table);
}

#ifdef CONFIG_MODULES
/*
 * Look up @ip in the ORC tables of the module containing it.  Returns NULL
 * when @ip is not in a module or the module has no ORC tables recorded.
 */
static struct orc_entry *orc_module_find(unsigned long ip)
{
        struct module *owner = __module_address(ip);

        if (!owner)
                return NULL;
        if (!owner->arch.orc_unwind || !owner->arch.orc_unwind_ip)
                return NULL;

        return __orc_find(owner->arch.orc_unwind_ip, owner->arch.orc_unwind,
                          owner->arch.num_orcs, ip);
}
#else
/* Without module support there is nothing to search. */
static struct orc_entry *orc_module_find(unsigned long ip)
{
        return NULL;
}
#endif

#ifdef CONFIG_DYNAMIC_FTRACE
static struct orc_entry *orc_find(unsigned long ip);

/*
 * Ftrace dynamic trampolines do not have orc entries of their own.
 * But they are copies of the ftrace entries that are static and
 * defined in ftrace_*.S, which do have orc entries.
 *
 * If the unwinder comes across a ftrace trampoline, then find the
 * ftrace function that was used to create it, and use that ftrace
 * function's orc entry, as the placement of the return code in
 * the stack will be identical.
 *
 * Returns NULL if @ip is not inside an ftrace trampoline, or if the
 * translated address equals @ip.
 */
static struct orc_entry *orc_ftrace_find(unsigned long ip)
{
        struct ftrace_ops *ops = ftrace_ops_trampoline(ip);
        unsigned long template_start, mapped_ip;

        if (!ops)
                return NULL;

        /* Start of the static code that the trampoline was copied from. */
        template_start = (ops->flags & FTRACE_OPS_FL_SAVE_REGS) ?
                         (unsigned long)ftrace_regs_caller :
                         (unsigned long)ftrace_caller;

        /* Same offset into the static code as ip has into the trampoline. */
        mapped_ip = template_start + (ip - ops->trampoline);

        /* Prevent unlikely recursion */
        if (mapped_ip == ip)
                return NULL;

        return orc_find(mapped_ip);
}
#else
/* No dynamic ftrace: there are no trampolines to translate. */
static struct orc_entry *orc_ftrace_find(unsigned long ip)
{
        return NULL;
}
#endif

/*
 * If we crash with IP==0, the last successfully executed instruction
 * was probably an indirect function call with a NULL function pointer,
 * and we don't have unwind information for NULL.
 * This hardcoded ORC entry for IP==0 allows us to unwind from a NULL function
 * pointer into its parent and then continue normally from there.
 *
 * The entry places the previous frame's SP one word above the current SP
 * (presumably skipping just the return address pushed by the call).
 */
static struct orc_entry null_orc_entry = {
        .sp_offset = sizeof(long),
        .sp_reg = ORC_REG_SP,
        .bp_reg = ORC_REG_UNDEFINED,
        .type = ORC_TYPE_CALL
};

/*
 * Fake frame pointer entry -- used as a fallback for generated code.
 *
 * Previous SP is taken as BP + 16; the previous BP is described relative
 * to that previous SP at offset -16 (presumably the slot where the
 * caller's BP was pushed -- TODO confirm against the ORC format docs).
 */
static struct orc_entry orc_fp_entry = {
        .type                = ORC_TYPE_CALL,
        .sp_reg                = ORC_REG_BP,
        .sp_offset        = 16,
        .bp_reg                = ORC_REG_PREV_SP,
        .bp_offset        = -16,
};

/*
 * orc_find() - find the ORC entry describing the stack layout at @ip.
 *
 * Lookup order:
 *   1. @ip == 0: the hardcoded null_orc_entry.
 *   2. @ip in [LOOKUP_START_IP, LOOKUP_STOP_IP): the orc_lookup[] fast table
 *      narrows the binary search to the entries of a single block.
 *   3. vmlinux init text: binary search over the whole built-in table.
 *   4. Module text: the owning module's tables.
 *   5. Ftrace trampolines: see orc_ftrace_find().
 *
 * Returns NULL when nothing matches or when the fast lookup table yields
 * an out-of-range index or value (a one-time warning is printed then).
 */
static struct orc_entry *orc_find(unsigned long ip)
{
        /*
         * Must be an automatic variable: unwinders can run concurrently on
         * several CPUs and from NMI context, so a function-static pointer
         * could be overwritten between the assignment and the test below.
         */
        struct orc_entry *orc;

        if (ip == 0)
                return &null_orc_entry;

        /* For non-init vmlinux addresses, use the fast lookup table: */
        if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
                unsigned int idx, start, stop;

                idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE;

                if (unlikely((idx >= lookup_num_blocks-1))) {
                        orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%pB\n",
                                 idx, lookup_num_blocks, (void *)ip);
                        return NULL;
                }

                /*
                 * Search from this block's first entry up to and including
                 * the entry recorded for the start of the next block.
                 */
                start = orc_lookup[idx];
                stop = orc_lookup[idx + 1] + 1;

                if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) ||
                             (__start_orc_unwind + stop > __stop_orc_unwind))) {
                        orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%pB\n",
                                 idx, lookup_num_blocks, start, stop, (void *)ip);
                        return NULL;
                }

                return __orc_find(__start_orc_unwind_ip + start,
                                  __start_orc_unwind + start, stop - start, ip);
        }

        /* vmlinux .init slow lookup: */
        if (is_kernel_inittext(ip))
                return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
                                  __stop_orc_unwind_ip - __start_orc_unwind_ip, ip);

        /* Module lookup: */
        orc = orc_module_find(ip);
        if (orc)
                return orc;

        return orc_ftrace_find(ip);
}

#ifdef CONFIG_MODULES

/*
 * Tables currently being sorted.  The sort() callbacks take no context
 * argument, so unwind_module_init() points these at the module's tables
 * while holding sort_mutex.  They initially refer to the built-in tables.
 */
static DEFINE_MUTEX(sort_mutex);
static int *cur_orc_ip_table = __start_orc_unwind_ip;
static struct orc_entry *cur_orc_table = __start_orc_unwind;

/*
 * sort() swap callback for the IP table.  @_a and @_b point at two entries
 * of cur_orc_ip_table; @size is unused.
 *
 * IP entries are self-relative (see orc_ip(): entry address + stored
 * value), so a value that moves to a different slot must be adjusted by
 * the byte distance between the slots to keep decoding to the same
 * address.  The matching struct orc_entry records are swapped as well so
 * the two tables stay parallel.
 */
static void orc_sort_swap(void *_a, void *_b, int size)
{
        struct orc_entry *orc_a, *orc_b;
        int *a = _a, *b = _b, tmp;
        /* Byte distance from slot a to slot b. */
        int delta = _b - _a;

        /* Swap the .orc_unwind_ip entries: */
        tmp = *a;
        /* b's value moves down to a: its base shrinks by delta, so add it. */
        *a = *b + delta;
        /* a's old value moves up to b: its base grows by delta, so subtract. */
        *b = tmp - delta;

        /* Swap the corresponding .orc_unwind entries: */
        orc_a = cur_orc_table + (a - cur_orc_ip_table);
        orc_b = cur_orc_table + (b - cur_orc_ip_table);
        swap(*orc_a, *orc_b);
}

/*
 * sort() comparison callback for the IP table: orders entries by decoded
 * address.  For equal addresses the result depends only on the type of
 * the ORC entry paired with @_a; this function never returns 0.
 */
static int orc_sort_cmp(const void *_a, const void *_b)
{
        const int *lhs = _a, *rhs = _b;
        unsigned long lhs_ip = orc_ip(lhs);
        unsigned long rhs_ip = orc_ip(rhs);
        struct orc_entry *lhs_orc;

        if (lhs_ip != rhs_ip)
                return lhs_ip > rhs_ip ? 1 : -1;

        /*
         * The "weak" section terminator entries need to always be first
         * to ensure the lookup code skips them in favor of real entries.
         * These terminator entries exist to handle any gaps created by
         * whitelisted .o files which didn't get objtool generation.
         */
        lhs_orc = cur_orc_table + (lhs - cur_orc_ip_table);
        if (lhs_orc->type == ORC_TYPE_UNDEFINED)
                return -1;

        return 1;
}

/*
 * unwind_module_init() - sort and register a module's ORC tables.
 * @mod:         module the tables belong to
 * @_orc_ip:     the module's IP table (array of int)
 * @orc_ip_size: size of @_orc_ip in bytes
 * @_orc:        the module's ORC table (array of struct orc_entry)
 * @orc_size:    size of @_orc in bytes
 *
 * Warns once if the sizes are not whole multiples of the entry sizes or
 * the two tables disagree on the entry count, then sorts both tables in
 * place by address and records them in @mod->arch.
 */
void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
                        void *_orc, size_t orc_size)
{
        struct orc_entry *orc_table = _orc;
        int *ip_table = _orc_ip;
        unsigned int count = orc_ip_size / sizeof(int);

        WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 ||
                     orc_size % sizeof(*orc_table) != 0 ||
                     count != orc_size / sizeof(*orc_table));

        /*
         * The 'cur_orc_*' globals allow the orc_sort_swap() callback to
         * associate an .orc_unwind_ip table entry with its corresponding
         * .orc_unwind entry so they can both be swapped.
         */
        mutex_lock(&sort_mutex);
        cur_orc_ip_table = ip_table;
        cur_orc_table = orc_table;
        sort(ip_table, count, sizeof(int), orc_sort_cmp, orc_sort_swap);
        mutex_unlock(&sort_mutex);

        mod->arch.orc_unwind_ip = ip_table;
        mod->arch.orc_unwind = orc_table;
        mod->arch.num_orcs = count;
}
#endif

/*
 * unwind_init() - validate the built-in ORC tables and fill in the
 * orc_lookup[] fast lookup table.
 *
 * orc_init is set only when everything succeeds; on any inconsistency a
 * warning is printed and the function returns with orc_init left unset.
 */
void __init unwind_init(void)
{
        size_t ip_bytes = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip;
        size_t orc_bytes = (void *)__stop_orc_unwind - (void *)__start_orc_unwind;
        size_t count = ip_bytes / sizeof(int);
        struct orc_entry *entry;
        int i;

        /* Both tables must be non-empty, whole, and the same length. */
        if (!count || ip_bytes % sizeof(int) != 0 ||
            orc_bytes % sizeof(struct orc_entry) != 0 ||
            count != orc_bytes / sizeof(struct orc_entry)) {
                orc_warn("WARNING: Bad or missing .orc_unwind table.  Disabling unwinder.\n");
                return;
        }

        /*
         * Note, the orc_unwind and orc_unwind_ip tables were already
         * sorted at build time via the 'sorttable' tool.
         * It's ready for binary search straight away, no need to sort it.
         */

        /* Initialize the fast lookup table: */
        lookup_num_blocks = orc_lookup_end - orc_lookup;
        for (i = 0; i < lookup_num_blocks-1; i++) {
                unsigned long block_ip = LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i);

                entry = __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
                                   count, block_ip);
                if (!entry) {
                        orc_warn("WARNING: Corrupt .orc_unwind table.  Disabling unwinder.\n");
                        return;
                }

                /* Index of the ORC entry covering the start of block i. */
                orc_lookup[i] = entry - __start_orc_unwind;
        }

        /* Initialize the ending block: */
        entry = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, count,
                           LOOKUP_STOP_IP);
        if (!entry) {
                orc_warn("WARNING: Corrupt .orc_unwind table.  Disabling unwinder.\n");
                return;
        }
        orc_lookup[lookup_num_blocks-1] = entry - __start_orc_unwind;

        orc_init = true;
}

/*
 * Return the text address of the current frame in @state, or 0 when the
 * unwind is finished or state->ip is not a kernel text address.
 */
unsigned long unwind_get_return_address(struct unwind_state *state)
{
        if (unwind_done(state))
                return 0;

        if (!__kernel_text_address(state->ip))
                return 0;

        return state->ip;
}
EXPORT_SYMBOL_GPL(unwind_get_return_address);

/*
 * Return a pointer to the location holding the current frame's return
 * address: the ip slot of state->regs when regs are present, otherwise
 * the word just below state->sp.  Returns NULL when the unwind is
 * finished or neither regs nor sp is available.
 */
unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
{
        if (unwind_done(state))
                return NULL;

        if (state->regs)
                return &state->regs->ip;

        return state->sp ? (unsigned long *)state->sp - 1 : NULL;
}

/*
 * Check that [@_addr, @_addr + @len) lies on a known stack.
 *
 * The stack currently described by state->stack_info is tried first.  If
 * the range is not on it, get_stack_info() is asked to identify the stack
 * containing @_addr (updating state->stack_info and state->stack_mask),
 * and the range is re-checked against that result.
 */
static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
                            size_t len)
{
        struct stack_info *cur_stack = &state->stack_info;
        void *ptr = (void *)_addr;

        if (on_stack(cur_stack, ptr, len))
                return true;

        if (get_stack_info(ptr, state->task, cur_stack, &state->stack_mask))
                return false;

        return on_stack(cur_stack, ptr, len);
}

/*
 * Read one word from the stack at @addr into *@val.  Returns false,
 * leaving *@val untouched, if the word is not on a valid stack.
 */
static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
                            unsigned long *val)
{
        unsigned long *slot = (unsigned long *)addr;

        if (stack_access_ok(state, addr, sizeof(long))) {
                *val = READ_ONCE_NOCHECK(*slot);
                return true;
        }

        return false;
}

/*
 * Treat @addr as a full struct pt_regs on the stack and fetch its ip and
 * sp fields into *@ip and *@sp.  Returns false, leaving the outputs
 * untouched, if the whole structure is not on a valid stack.
 */
static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
                             unsigned long *ip, unsigned long *sp)
{
        struct pt_regs *frame = (struct pt_regs *)addr;

        /* x86-32 support will be more complicated due to the &regs->sp hack */
        BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));

        if (stack_access_ok(state, addr, sizeof(struct pt_regs))) {
                *ip = READ_ONCE_NOCHECK(frame->ip);
                *sp = READ_ONCE_NOCHECK(frame->sp);
                return true;
        }

        return false;
}

/*
 * Like deref_stack_regs(), but @addr points at an IRET frame only.  A
 * pt_regs pointer is formed IRET_FRAME_OFFSET bytes below @addr so the
 * ip and sp fields can be read through it; only the IRET_FRAME_SIZE
 * bytes at @addr are validated as being on a stack.
 */
static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
                                  unsigned long *ip, unsigned long *sp)
{
        struct pt_regs *frame = (void *)addr - IRET_FRAME_OFFSET;

        if (stack_access_ok(state, addr, IRET_FRAME_SIZE)) {
                *ip = READ_ONCE_NOCHECK(frame->ip);
                *sp = READ_ONCE_NOCHECK(frame->sp);
                return true;
        }

        return false;
}

/*
 * Fetch the register stored at byte offset @reg_off of a pt_regs into *@val.
 *
 * If state->regs is non-NULL, and points to a full pt_regs, just get the reg
 * value from state->regs.
 *
 * Otherwise, if state->regs just points to IRET regs, and the previous frame
 * had full regs, it's safe to get the value from the previous regs.  This can
 * happen when early/late IRQ entry code gets interrupted by an NMI.
 *
 * Returns false, leaving *@val untouched, when no usable regs are available.
 */
static bool get_reg(struct unwind_state *state, unsigned int reg_off,
                    unsigned long *val)
{
        /* Index of the register when pt_regs is viewed as an array of words. */
        unsigned int word_idx = reg_off/8;
        unsigned long *words;

        if (!state->regs)
                return false;

        if (state->full_regs)
                words = (unsigned long *)state->regs;
        else if (state->prev_regs)
                words = (unsigned long *)state->prev_regs;
        else
                return false;

        *val = READ_ONCE_NOCHECK(words[word_idx]);
        return true;
}

/*
 * unwind_next_frame - advance @state to the caller's frame using ORC data
 * @state: unwind cursor; its ip/sp/bp/regs fields are updated in place
 *
 * Looks up the ORC entry for the current text address, uses it to locate the
 * previous frame's stack pointer, then recovers the return address (or the
 * saved register frame) and the frame pointer belonging to that frame.
 *
 * Return: true if @state now describes the previous frame.  false once the
 * unwind is finished (unwind_done(), user-mode regs, or an end-of-stack ORC
 * entry) or on failure; on failure state->error is additionally set.  Every
 * false return taken after the initial unwind_done() check also sets
 * state->stack_info.type to STACK_TYPE_UNKNOWN.
 */
bool unwind_next_frame(struct unwind_state *state)
{
        unsigned long ip_p, sp, tmp, orig_ip = state->ip, prev_sp = state->sp;
        enum stack_type prev_type = state->stack_info.type;
        struct orc_entry *orc;
        bool indirect = false;

        if (unwind_done(state))
                return false;

        /* Don't let modules unload while we're reading their ORC data. */
        preempt_disable();

        /* End-of-stack check for user tasks: */
        if (state->regs && user_mode(state->regs))
                goto the_end;

        /*
         * Find the orc_entry associated with the text address.
         *
         * For a call frame (as opposed to a signal frame), state->ip points to
         * the instruction after the call.  That instruction's stack layout
         * could be different from the call instruction's layout, for example
         * if the call was to a noreturn function.  So get the ORC data for the
         * call instruction itself.
         */
        orc = orc_find(state->signal ? state->ip : state->ip - 1);
        if (!orc) {
                /*
                 * As a fallback, try to assume this code uses a frame pointer.
                 * This is useful for generated code, like BPF, which ORC
                 * doesn't know about.  This is just a guess, so the rest of
                 * the unwind is no longer considered reliable.
                 */
                orc = &orc_fp_entry;
                state->error = true;
        } else {
                if (orc->type == ORC_TYPE_UNDEFINED)
                        goto err;

                if (orc->type == ORC_TYPE_END_OF_STACK)
                        goto the_end;
        }

        state->signal = orc->signal;

        /* Find the previous frame's stack: */
        switch (orc->sp_reg) {
        case ORC_REG_SP:
                sp = state->sp + orc->sp_offset;
                break;

        case ORC_REG_BP:
                sp = state->bp + orc->sp_offset;
                break;

        case ORC_REG_SP_INDIRECT:
                /* Offset is applied after the load, see "if (indirect)" below. */
                sp = state->sp;
                indirect = true;
                break;

        case ORC_REG_BP_INDIRECT:
                /* Offset is applied before the load. */
                sp = state->bp + orc->sp_offset;
                indirect = true;
                break;

        case ORC_REG_R10:
                if (!get_reg(state, offsetof(struct pt_regs, r10), &sp)) {
                        orc_warn_current("missing R10 value at %pB\n",
                                         (void *)state->ip);
                        goto err;
                }
                break;

        case ORC_REG_R13:
                if (!get_reg(state, offsetof(struct pt_regs, r13), &sp)) {
                        orc_warn_current("missing R13 value at %pB\n",
                                         (void *)state->ip);
                        goto err;
                }
                break;

        case ORC_REG_DI:
                if (!get_reg(state, offsetof(struct pt_regs, di), &sp)) {
                        orc_warn_current("missing RDI value at %pB\n",
                                         (void *)state->ip);
                        goto err;
                }
                break;

        case ORC_REG_DX:
                if (!get_reg(state, offsetof(struct pt_regs, dx), &sp)) {
                        orc_warn_current("missing DX value at %pB\n",
                                         (void *)state->ip);
                        goto err;
                }
                break;

        default:
                orc_warn("unknown SP base reg %d at %pB\n",
                         orc->sp_reg, (void *)state->ip);
                goto err;
        }

        /*
         * For the *_INDIRECT bases, 'sp' so far is the address of a stack slot
         * that holds the real value: load it, then add the offset in the
         * SP_INDIRECT case (BP_INDIRECT already added it above).
         */
        if (indirect) {
                if (!deref_stack_reg(state, sp, &sp))
                        goto err;

                if (orc->sp_reg == ORC_REG_SP_INDIRECT)
                        sp += orc->sp_offset;
        }

        /* Find IP, SP and possibly regs: */
        switch (orc->type) {
        case ORC_TYPE_CALL:
                /* The return address sits in the word just below 'sp'. */
                ip_p = sp - sizeof(long);

                if (!deref_stack_reg(state, ip_p, &state->ip))
                        goto err;

                state->ip = unwind_recover_ret_addr(state, state->ip,
                                                    (unsigned long *)ip_p);
                state->sp = sp;
                state->regs = NULL;
                state->prev_regs = NULL;
                break;

        case ORC_TYPE_REGS:
                if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
                        orc_warn_current("can't access registers at %pB\n",
                                         (void *)orig_ip);
                        goto err;
                }
                /*
                 * There is a small chance to interrupt at the entry of
                 * arch_rethook_trampoline() where the ORC info doesn't exist.
                 * That point is right after the RET to arch_rethook_trampoline()
                 * which was modified return address.
                 * At that point, the @addr_p of the unwind_recover_rethook()
                 * (this has to point the address of the stack entry storing
                 * the modified return address) must be "SP - (a stack entry)"
                 * because SP is incremented by the RET.
                 */
                state->ip = unwind_recover_rethook(state, state->ip,
                                (unsigned long *)(state->sp - sizeof(long)));
                state->regs = (struct pt_regs *)sp;
                state->prev_regs = NULL;
                state->full_regs = true;
                break;

        case ORC_TYPE_REGS_PARTIAL:
                if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
                        orc_warn_current("can't access iret registers at %pB\n",
                                         (void *)orig_ip);
                        goto err;
                }
                /* See ORC_TYPE_REGS case comment. */
                state->ip = unwind_recover_rethook(state, state->ip,
                                (unsigned long *)(state->sp - sizeof(long)));

                /*
                 * Keep the last full register set around so that get_reg()
                 * can still fall back on it while only IRET regs are current.
                 */
                if (state->full_regs)
                        state->prev_regs = state->regs;
                state->regs = (void *)sp - IRET_FRAME_OFFSET;
                state->full_regs = false;
                break;

        default:
                orc_warn("unknown .orc_unwind entry type %d at %pB\n",
                         orc->type, (void *)orig_ip);
                goto err;
        }

        /* Find BP: */
        switch (orc->bp_reg) {
        case ORC_REG_UNDEFINED:
                /* Best effort: keep the old BP if no saved regs provide one. */
                if (get_reg(state, offsetof(struct pt_regs, bp), &tmp))
                        state->bp = tmp;
                break;

        case ORC_REG_PREV_SP:
                if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp))
                        goto err;
                break;

        case ORC_REG_BP:
                if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp))
                        goto err;
                break;

        default:
                orc_warn("unknown BP base reg %d for ip %pB\n",
                         orc->bp_reg, (void *)orig_ip);
                goto err;
        }

        /* Prevent a recursive loop due to bad ORC data: */
        if (state->stack_info.type == prev_type &&
            on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) &&
            state->sp <= prev_sp) {
                orc_warn_current("stack going in the wrong direction? at %pB\n",
                                 (void *)orig_ip);
                goto err;
        }

        preempt_enable();
        return true;

        /* Failure: flag the unwind as unreliable, then finish like the_end. */
err:
        state->error = true;

        /* Normal termination: marking the stack type unknown ends the walk. */
the_end:
        preempt_enable();
        state->stack_info.type = STACK_TYPE_UNKNOWN;
        return false;
}
EXPORT_SYMBOL_GPL(unwind_next_frame);

/*
 * __unwind_start - initialise @state for walking a task's stack
 * @state:       unwind cursor to set up; it is zeroed first
 * @task:        task whose stack is to be walked
 * @regs:        optional register snapshot to start from, may be NULL
 * @first_frame: when @regs is NULL, stack address of the first frame the
 *               caller wants reported; earlier frames are skipped
 *
 * The starting ip/sp/bp come from @regs when given, from the live CPU
 * registers when @task is current, and otherwise from the inactive task frame
 * found at task->thread.sp.
 *
 * Nothing is returned.  When unwinding cannot start, state->stack_info.type
 * is left as or set to STACK_TYPE_UNKNOWN; state->error is also set unless the
 * reason was simply that @regs are user-mode registers.
 */
void __unwind_start(struct unwind_state *state, struct task_struct *task,
                    struct pt_regs *regs, unsigned long *first_frame)
{
        memset(state, 0, sizeof(*state));
        state->task = task;

        if (!orc_init)
                goto err;

        /*
         * Refuse to unwind the stack of a task while it's executing on another
         * CPU.  This check is racy, but that's ok: the unwinder has other
         * checks to prevent it from going off the rails.
         */
        if (task_on_another_cpu(task))
                goto err;

        if (regs) {
                /* User-mode regs: nothing to unwind, and not an error. */
                if (user_mode(regs))
                        goto the_end;

                state->ip = regs->ip;
                state->sp = regs->sp;
                state->bp = regs->bp;
                state->regs = regs;
                state->full_regs = true;
                state->signal = true;

        } else if (task == current) {
                /* Snapshot the current instruction, stack and frame pointers. */
                asm volatile("lea (%%rip), %0\n\t"
                             "mov %%rsp, %1\n\t"
                             "mov %%rbp, %2\n\t"
                             : "=r" (state->ip), "=r" (state->sp),
                               "=r" (state->bp));

        } else {
                /* Other task: start from the frame stored at its saved SP. */
                struct inactive_task_frame *frame = (void *)task->thread.sp;

                state->sp = task->thread.sp + sizeof(*frame);
                state->bp = READ_ONCE_NOCHECK(frame->bp);
                state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
                state->signal = (void *)state->ip == ret_from_fork;
        }

        if (get_stack_info((unsigned long *)state->sp, state->task,
                           &state->stack_info, &state->stack_mask)) {
                /*
                 * We weren't on a valid stack.  It's possible that
                 * we overflowed a valid stack into a guard page.
                 * See if the next page up is valid so that we can
                 * generate some kind of backtrace if this happens.
                 */
                void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
                state->error = true;
                if (get_stack_info(next_page, state->task, &state->stack_info,
                                   &state->stack_mask))
                        return;
        }

        /*
         * The caller can provide the address of the first frame directly
         * (first_frame) or indirectly (regs->sp) to indicate which stack frame
         * to start unwinding at.  Skip ahead until we reach it.
         */

        /* When starting from regs, skip the regs frame: */
        if (regs) {
                unwind_next_frame(state);
                return;
        }

        /* Otherwise, skip ahead to the user-specified starting frame: */
        while (!unwind_done(state) &&
               (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
                        state->sp <= (unsigned long)first_frame))
                unwind_next_frame(state);

        return;

        /* Failure: record the error, then mark the unwind as finished. */
err:
        state->error = true;
the_end:
        state->stack_info.type = STACK_TYPE_UNKNOWN;
}
EXPORT_SYMBOL_GPL(__unwind_start);

















































































































































































































































































































































































































    3 


























































































































    1 




    1 










































    6 





    5 










    2 


    3 
























































































































































































































































































    1 










    1 



















    1 
























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  This file contains the interface functions for the various time related
 *  system calls: time, stime, gettimeofday, settimeofday, adjtime
 *
 * Modification history:
 *
 * 1993-09-02    Philip Gladstone
 *      Created file with time related functions from sched/core.c and adjtimex()
 * 1993-10-08    Torsten Duwe
 *      adjtime interface update and CMOS clock write code
 * 1995-08-13    Torsten Duwe
 *      kernel PLL updated to 1994-12-13 specs (rfc-1589)
 * 1999-01-16    Ulrich Windl
 *        Introduced error checking for many cases in adjtimex().
 *        Updated NTP code according to technical memorandum Jan '96
 *        "A Kernel Model for Precision Timekeeping" by Dave Mills
 *        Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10)
 *        (Even though the technical memorandum forbids it)
 * 2004-07-14         Christoph Lameter
 *        Added getnstimeofday to allow the posix timer functions to return
 *        with nanosecond accuracy
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/timex.h>
#include <linux/capability.h>
#include <linux/timekeeper_internal.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/fs.h>
#include <linux/math64.h>
#include <linux/ptrace.h>

#include <linux/uaccess.h>
#include <linux/compat.h>
#include <asm/unistd.h>

#include <generated/timeconst.h>
#include "timekeeping.h"

/*
 * The timezone where the local system is located.  Used as a default by some
 * programs that obtain this value by using gettimeofday.
 */
struct timezone sys_tz;

EXPORT_SYMBOL(sys_tz);

#ifdef __ARCH_WANT_SYS_TIME

/*
 * time(2): return the current real time in whole seconds and, when @tloc is
 * non-NULL, also store that value at @tloc.
 *
 * Returns -EFAULT if the store to @tloc fails.
 *
 * Historical note: sys_time() could be implemented at user level on top of
 * sys_gettimeofday().  It is presumably kept for backwards compatibility;
 * moving it into the arch directories of the architectures that need it has
 * been suggested.
 */
SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc)
{
        __kernel_old_time_t now = (__kernel_old_time_t)ktime_get_real_seconds();

        if (tloc && put_user(now, tloc))
                return -EFAULT;

        /*
         * NOTE(review): presumably needed so that the returned seconds value
         * is never mistaken for an error code - confirm.
         */
        force_successful_syscall_return();
        return now;
}

/*
 * stime(2): set the real-time clock to the whole-second value read from
 * @tptr (the nanosecond part is set to zero).
 *
 * Returns 0 on success, -EFAULT if @tptr cannot be read, the error from
 * security_settime64() if the security check refuses the change, or the
 * error from do_settimeofday64() if the clock could not be set.
 *
 * Historical note: sys_stime() could be implemented at user level on top of
 * sys_settimeofday().  It is presumably kept for backwards compatibility;
 * moving it into the arch directories of the architectures that need it has
 * been suggested.
 */

SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr)
{
        struct timespec64 tv;
        int err;

        if (get_user(tv.tv_sec, tptr))
                return -EFAULT;

        tv.tv_nsec = 0;

        err = security_settime64(&tv, NULL);
        if (err)
                return err;

        /*
         * Hand back do_settimeofday64()'s result rather than an unconditional
         * 0, so a rejected time value is not reported to the caller as success.
         */
        return do_settimeofday64(&tv);
}

#endif /* __ARCH_WANT_SYS_TIME */

#ifdef CONFIG_COMPAT_32BIT_TIME
#ifdef __ARCH_WANT_SYS_TIME32

/*
 * 32-bit time_t flavour of time(2).
 *
 * old_time32_t is a 32 bit "long", so the 64-bit seconds value is converted
 * (truncated by the cast) before being stored at @tloc and returned.
 *
 * Returns -EFAULT if @tloc is non-NULL and the store to it fails.
 */
SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc)
{
        old_time32_t now = (old_time32_t)ktime_get_real_seconds();

        if (tloc && put_user(now, tloc))
                return -EFAULT;

        force_successful_syscall_return();
        return now;
}

/*
 * 32-bit time_t flavour of stime(2): set the real-time clock to the
 * whole-second value read from @tptr (the nanosecond part is set to zero).
 *
 * Returns 0 on success, -EFAULT if @tptr cannot be read, the error from
 * security_settime64() if the security check refuses the change, or the
 * error from do_settimeofday64() if the clock could not be set.
 */
SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr)
{
        struct timespec64 tv;
        int err;

        if (get_user(tv.tv_sec, tptr))
                return -EFAULT;

        tv.tv_nsec = 0;

        err = security_settime64(&tv, NULL);
        if (err)
                return err;

        /*
         * Hand back do_settimeofday64()'s result rather than an unconditional
         * 0, so a rejected time value is not reported to the caller as success.
         */
        return do_settimeofday64(&tv);
}

#endif /* __ARCH_WANT_SYS_TIME32 */
#endif

/*
 * gettimeofday(2): report the current real time and/or the system timezone.
 *
 * @tv: if non-NULL, receives the current time as seconds + microseconds
 * @tz: if non-NULL, receives a copy of sys_tz
 *
 * Returns 0 on success or -EFAULT if a user-space store fails.
 */
SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv,
                struct timezone __user *, tz)
{
        if (likely(tv != NULL)) {
                struct timespec64 now;

                ktime_get_real_ts64(&now);

                if (put_user(now.tv_sec, &tv->tv_sec))
                        return -EFAULT;

                /* The user-visible structure carries microseconds. */
                if (put_user(now.tv_nsec / 1000, &tv->tv_usec))
                        return -EFAULT;
        }

        if (unlikely(tz != NULL) && copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
                return -EFAULT;

        return 0;
}

/*
 * do_sys_settimeofday64 - common backend for setting the time and/or timezone
 * @tv: new time, or NULL to leave the clock alone
 * @tz: new timezone, or NULL to leave sys_tz alone
 *
 * Returns 0 on success, -EINVAL for an invalid time or a timezone more than
 * 15 hours from UTC, the error from security_settime64() if the security
 * check refuses the change, or the result of do_settimeofday64().
 *
 * About the clock warp: in case for some reason the CMOS clock has not
 * already been running in UTC, but in some local time, the first time we set
 * the timezone (without also setting the time) we warp the clock so that it
 * is ticking UTC time instead of local time.  Presumably, if someone is
 * setting the timezone then we are running in an environment where the
 * programs understand about timezones.  This should be done at boot time in
 * the /etc/rc script, as soon as possible, so that the clock can be set
 * right.  Otherwise, various programs will get confused when the clock gets
 * warped.
 */

int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz)
{
        static int firsttime = 1;
        int error;

        if (tv && !timespec64_valid_settod(tv))
                return -EINVAL;

        error = security_settime64(tv, tz);
        if (error)
                return error;

        if (tz) {
                const int west = tz->tz_minuteswest;

                /* Verify we're within the +-15 hrs range */
                if (west < -15*60 || west > 15*60)
                        return -EINVAL;

                sys_tz = *tz;
                update_vsyscall_tz();

                /* Only the very first timezone update may warp the clock. */
                if (firsttime) {
                        firsttime = 0;
                        if (!tv)
                                timekeeping_warp_clock();
                }
        }

        return tv ? do_settimeofday64(tv) : 0;
}

/*
 * settimeofday(2): set the real-time clock and/or the system timezone.
 *
 * @tv: if non-NULL, new time as seconds + microseconds
 * @tz: if non-NULL, new timezone
 *
 * Returns -EFAULT if a user-space read fails, -EINVAL if tv_usec is outside
 * [0, USEC_PER_SEC), and otherwise the result of do_sys_settimeofday64().
 */
SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv,
                struct timezone __user *, tz)
{
        struct timespec64 new_ts;
        struct timezone new_tz;

        if (tv) {
                /* tv_nsec temporarily holds the user's microsecond value. */
                if (get_user(new_ts.tv_sec, &tv->tv_sec) ||
                    get_user(new_ts.tv_nsec, &tv->tv_usec))
                        return -EFAULT;

                /*
                 * A valid microsecond count is strictly less than
                 * USEC_PER_SEC; reject exactly USEC_PER_SEC here as well
                 * instead of letting it through this check.
                 */
                if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0)
                        return -EINVAL;

                new_ts.tv_nsec *= NSEC_PER_USEC;
        }
        if (tz) {
                if (copy_from_user(&new_tz, tz, sizeof(*tz)))
                        return -EFAULT;
        }

        return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}

#ifdef CONFIG_COMPAT
/*
 * Compat gettimeofday(2): same as the native call, but the time is written
 * through the 32-bit struct old_timeval32 layout.
 *
 * Returns 0 on success or -EFAULT if a user-space store fails.
 */
COMPAT_SYSCALL_DEFINE2(gettimeofday, struct old_timeval32 __user *, tv,
                       struct timezone __user *, tz)
{
        if (tv) {
                struct timespec64 now;

                ktime_get_real_ts64(&now);

                if (put_user(now.tv_sec, &tv->tv_sec))
                        return -EFAULT;

                /* The user-visible structure carries microseconds. */
                if (put_user(now.tv_nsec / 1000, &tv->tv_usec))
                        return -EFAULT;
        }

        if (tz && copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
                return -EFAULT;

        return 0;
}

/*
 * Compat settimeofday(2): same as the native call, but the time is read
 * through the 32-bit struct old_timeval32 layout.
 *
 * Returns -EFAULT if a user-space read fails, -EINVAL if tv_usec is outside
 * [0, USEC_PER_SEC), and otherwise the result of do_sys_settimeofday64().
 */
COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,
                       struct timezone __user *, tz)
{
        struct timespec64 new_ts;
        struct timezone new_tz;

        if (tv) {
                /* tv_nsec temporarily holds the user's microsecond value. */
                if (get_user(new_ts.tv_sec, &tv->tv_sec) ||
                    get_user(new_ts.tv_nsec, &tv->tv_usec))
                        return -EFAULT;

                /*
                 * A valid microsecond count is strictly less than
                 * USEC_PER_SEC; reject exactly USEC_PER_SEC here as well
                 * instead of letting it through this check.
                 */
                if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0)
                        return -EINVAL;

                new_ts.tv_nsec *= NSEC_PER_USEC;
        }
        if (tz) {
                if (copy_from_user(&new_tz, tz, sizeof(*tz)))
                        return -EFAULT;
        }

        return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
#endif

#ifdef CONFIG_64BIT
/*
 * adjtimex(2): copy the caller's struct __kernel_timex in, run
 * do_adjtimex() on the kernel copy, and copy the (possibly updated)
 * structure back out.
 *
 * Returns the value of do_adjtimex(), or -EFAULT if either user-space copy
 * fails.  Note that do_adjtimex() has already run by the time the copy-out
 * can fail.
 */
SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p)
{
        struct __kernel_timex ktx;        /* Kernel-side copy of *txc_p */
        int ret;

        if (copy_from_user(&ktx, txc_p, sizeof(ktx)))
                return -EFAULT;

        ret = do_adjtimex(&ktx);

        if (copy_to_user(txc_p, &ktx, sizeof(ktx)))
                return -EFAULT;

        return ret;
}
#endif

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * get_old_timex32 - read a 32-bit struct old_timex32 from user space
 * @txc: kernel structure to fill
 * @utp: user-space source
 *
 * @txc is zeroed before anything else, so it is all-zero on failure and any
 * member not assigned below stays zero on success.
 *
 * Returns 0 on success or -EFAULT if the user copy fails.
 */
int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp)
{
        struct old_timex32 src;

        memset(txc, 0, sizeof(*txc));

        if (copy_from_user(&src, utp, sizeof(src)))
                return -EFAULT;

        /* Member-by-member copy: the two layouts differ in field widths. */
        txc->modes        = src.modes;
        txc->offset       = src.offset;
        txc->freq         = src.freq;
        txc->maxerror     = src.maxerror;
        txc->esterror     = src.esterror;
        txc->status       = src.status;
        txc->constant     = src.constant;
        txc->precision    = src.precision;
        txc->tolerance    = src.tolerance;
        txc->time.tv_sec  = src.time.tv_sec;
        txc->time.tv_usec = src.time.tv_usec;
        txc->tick         = src.tick;
        txc->ppsfreq      = src.ppsfreq;
        txc->jitter       = src.jitter;
        txc->shift        = src.shift;
        txc->stabil       = src.stabil;
        txc->jitcnt       = src.jitcnt;
        txc->calcnt       = src.calcnt;
        txc->errcnt       = src.errcnt;
        txc->stbcnt       = src.stbcnt;

        return 0;
}

/*
 * put_old_timex32 - write a struct __kernel_timex to user space in the
 *                   32-bit struct old_timex32 layout
 * @utp: user-space destination
 * @txc: kernel structure to convert
 *
 * The temporary is cleared with memset() first, so every byte that is not
 * explicitly assigned below is zero when the structure is copied out.
 *
 * Returns 0 on success or -EFAULT if the user copy fails.
 */
int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc)
{
        struct old_timex32 out;

        memset(&out, 0, sizeof(out));

        out.modes        = txc->modes;
        out.offset       = txc->offset;
        out.freq         = txc->freq;
        out.maxerror     = txc->maxerror;
        out.esterror     = txc->esterror;
        out.status       = txc->status;
        out.constant     = txc->constant;
        out.precision    = txc->precision;
        out.tolerance    = txc->tolerance;
        out.time.tv_sec  = txc->time.tv_sec;
        out.time.tv_usec = txc->time.tv_usec;
        out.tick         = txc->tick;
        out.ppsfreq      = txc->ppsfreq;
        out.jitter       = txc->jitter;
        out.shift        = txc->shift;
        out.stabil       = txc->stabil;
        out.jitcnt       = txc->jitcnt;
        out.calcnt       = txc->calcnt;
        out.errcnt       = txc->errcnt;
        out.stbcnt       = txc->stbcnt;
        out.tai          = txc->tai;

        return copy_to_user(utp, &out, sizeof(out)) ? -EFAULT : 0;
}

/*
 * adjtimex(2) for callers using the 32-bit struct old_timex32: convert the
 * user structure in, run do_adjtimex(), and convert the result back out.
 *
 * Returns the value of do_adjtimex(), or the error from either conversion
 * helper.  Note that do_adjtimex() has already run by the time the
 * conversion back to user space can fail.
 */
SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
{
        struct __kernel_timex ktx;
        int ret, err;

        err = get_old_timex32(&ktx, utp);
        if (err)
                return err;

        ret = do_adjtimex(&ktx);

        err = put_old_timex32(utp, &ktx);
        return err ? err : ret;
}
#endif

/**
 * jiffies_to_msecs - Convert jiffies to milliseconds
 * @j: jiffies value
 *
 * Avoid unnecessary multiplications/divisions in the
 * two most common HZ cases.
 *
 * The conversion is selected at compile time from the value of HZ; see the
 * comment on each branch.  Note that @j is an unsigned long while the result
 * is an unsigned int.
 *
 * Return: milliseconds value
 */
unsigned int jiffies_to_msecs(const unsigned long j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
        /* HZ divides MSEC_PER_SEC evenly: each jiffy is a whole number of ms. */
        return (MSEC_PER_SEC / HZ) * j;
#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
        /* HZ is a whole multiple of MSEC_PER_SEC: divide, rounding up. */
        return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
#else
        /*
         * General case: scale by the HZ_TO_MSEC_* constants (presumably
         * provided by <generated/timeconst.h>), rounding up.
         */
# if BITS_PER_LONG == 32
        return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >>
               HZ_TO_MSEC_SHR32;
# else
        return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN);
# endif
#endif
}
EXPORT_SYMBOL(jiffies_to_msecs);

/**
 * jiffies_to_usecs - Convert jiffies to microseconds
 * @j: jiffies value
 *
 * The conversion is selected at compile time from the value of HZ; see the
 * comment on each branch.  Note that @j is an unsigned long while the result
 * is an unsigned int.
 *
 * Return: microseconds value
 */
unsigned int jiffies_to_usecs(const unsigned long j)
{
        /*
         * HZ usually doesn't go much beyond MSEC_PER_SEC.
         * jiffies_to_usecs() and usecs_to_jiffies() depend on that.
         */
        BUILD_BUG_ON(HZ > USEC_PER_SEC);

#if !(USEC_PER_SEC % HZ)
        /* HZ divides USEC_PER_SEC evenly: each jiffy is a whole number of us. */
        return (USEC_PER_SEC / HZ) * j;
#else
        /*
         * General case: scale by the HZ_TO_USEC_* constants (presumably
         * provided by <generated/timeconst.h>).  Unlike jiffies_to_msecs(),
         * these forms truncate rather than round up.
         */
# if BITS_PER_LONG == 32
        return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
# else
        return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
# endif
#endif
}
EXPORT_SYMBOL(jiffies_to_usecs);

/**
 * mktime64 - Converts date to seconds.
 * @year0: year to convert
 * @mon0: month to convert
 * @day: day to convert
 * @hour: hour to convert
 * @min: minute to convert
 * @sec: second to convert
 *
 * Converts a Gregorian date to seconds since 1970-01-01 00:00:00.
 * Input is expected in normal date format, i.e. 1980-12-31 23:59:59
 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
 *
 * [For the Julian calendar (which was used in Russia before 1917,
 * Britain & colonies before 1752, anywhere else before 1582,
 * and is still in use by some communities) leave out the
 * -year/100+year/400 terms, and add 10.]
 *
 * This algorithm was first published by Gauss (I think).
 *
 * A leap second can be indicated by passing @sec as 60 (allowable under
 * ISO 8601).  It is treated the same as the following second, since leap
 * seconds don't exist in UNIX time.
 *
 * Midnight at the end of the day encoded as 24:00:00 - i.e. midnight
 * tomorrow - (allowable under ISO 8601) is supported.
 *
 * Return: seconds since the epoch time for the given input date
 */
time64_t mktime64(const unsigned int year0, const unsigned int mon0,
                const unsigned int day, const unsigned int hour,
                const unsigned int min, const unsigned int sec)
{
        unsigned int year = year0;
        unsigned int mon = mon0 - 2;
        time64_t days, total;

        /*
         * Shift the month numbering 1..12 -> 11,12,1..10 so that February
         * comes last, since it is the month with the leap day.  January and
         * February are counted as part of the previous year.
         */
        if ((int)mon <= 0) {
                mon += 12;
                year -= 1;
        }

        /* Days since the epoch; the first term is computed in unsigned int. */
        days = (time64_t)(year/4 - year/100 + year/400 + 367*mon/12 + day);
        days += year*365;
        days -= 719499;

        total = days * 24 + hour;        /* hours - midnight tomorrow handled here */
        total = total * 60 + min;        /* minutes */
        total = total * 60 + sec;        /* seconds */

        return total;
}
EXPORT_SYMBOL(mktime64);

/*
 * ns_to_kernel_old_timeval - convert nanoseconds to a seconds/microseconds pair
 * @nsec: the nanoseconds value to be converted
 *
 * The value is first split by ns_to_timespec64(); the nanosecond remainder
 * is then divided down to microseconds.
 *
 * Return: the struct __kernel_old_timeval representation of @nsec.
 */
struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec)
{
        struct __kernel_old_timeval result;
        struct timespec64 split = ns_to_timespec64(nsec);

        result.tv_usec = (suseconds_t)split.tv_nsec / 1000;
        result.tv_sec = split.tv_sec;

        return result;
}
EXPORT_SYMBOL(ns_to_kernel_old_timeval);

/**
 * set_normalized_timespec64 - set timespec sec and nsec parts and normalize
 *
 * @ts:   pointer to timespec variable to be set
 * @sec:  seconds to set
 * @nsec: nanoseconds to set
 *
 * Set seconds and nanoseconds field of a timespec variable and
 * normalize to the timespec storage format
 *
 * Normalization is done by repeatedly adding or subtracting one second's
 * worth of nanoseconds, so the work done grows with how far @nsec lies
 * outside the normalized range.
 *
 * Note: The tv_nsec part is always in the range of 0 <= tv_nsec < NSEC_PER_SEC.
 * For negative values only the tv_sec field is negative !
 */
void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
{
        /* Carry whole seconds out of an over-large nanosecond count. */
        while (nsec >= NSEC_PER_SEC) {
                /*
                 * The following asm() prevents the compiler from
                 * optimising this loop into a modulo operation. See
                 * also __iter_div_u64_rem() in include/linux/time.h
                 */
                asm("" : "+rm"(nsec));
                nsec -= NSEC_PER_SEC;
                ++sec;
        }
        /* Borrow whole seconds until the nanosecond count is non-negative. */
        while (nsec < 0) {
                /* Same compiler barrier as in the loop above. */
                asm("" : "+rm"(nsec));
                nsec += NSEC_PER_SEC;
                --sec;
        }
        ts->tv_sec = sec;
        ts->tv_nsec = nsec;
}
EXPORT_SYMBOL(set_normalized_timespec64);

/**
 * ns_to_timespec64 - Convert nanoseconds to timespec64
 * @nsec:       the nanoseconds value to be converted
 *
 * A zero input yields { 0, 0 }.  For negative inputs tv_sec is negative
 * while tv_nsec is still a non-negative count from the start of that second.
 *
 * Return: the timespec64 representation of the nsec parameter.
 */
struct timespec64 ns_to_timespec64(s64 nsec)
{
        struct timespec64 result = { 0, 0 };
        s32 rem;

        if (likely(nsec > 0)) {
                result.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
                result.tv_nsec = rem;
                return result;
        }

        if (nsec == 0)
                return result;

        /*
         * With negative times, tv_sec points to the earlier
         * second, and tv_nsec counts the nanoseconds since
         * then, so tv_nsec is always a positive number.
         */
        result.tv_sec = -div_u64_rem(-nsec - 1, NSEC_PER_SEC, &rem) - 1;
        result.tv_nsec = NSEC_PER_SEC - rem - 1;

        return result;
}
EXPORT_SYMBOL(ns_to_timespec64);

/**
 * __msecs_to_jiffies: - convert milliseconds to jiffies
 * @m:        time in milliseconds
 *
 * conversion is done as follows:
 *
 * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
 *
 * - 'too large' values [that would result in larger than
 *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
 *
 * - all other values are converted to jiffies by either multiplying
 *   the input value by a factor or dividing it with a factor and
 *   handling any 32-bit overflows.
 *   for the details see __msecs_to_jiffies()
 *
 * __msecs_to_jiffies() checks for the passed in value being a constant
 * via __builtin_constant_p() allowing gcc to eliminate most of the
 * code, __msecs_to_jiffies() is called if the value passed does not
 * allow constant folding and the actual conversion must be done at
 * runtime.
 * The _msecs_to_jiffies helpers are the HZ dependent conversion
 * routines found in include/linux/jiffies.h
 *
 * Return: jiffies value
 */
unsigned long __msecs_to_jiffies(const unsigned int m)
{
        /*
         * Negative value, means infinite timeout:
         */
        if ((int)m < 0)
                return MAX_JIFFY_OFFSET;
        return _msecs_to_jiffies(m);
}
EXPORT_SYMBOL(__msecs_to_jiffies);

/**
 * __usecs_to_jiffies: - convert microseconds to jiffies
 * @u:        time in milliseconds
 *
 * Return: jiffies value
 */
unsigned long __usecs_to_jiffies(const unsigned int u)
{
        if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
                return MAX_JIFFY_OFFSET;
        return _usecs_to_jiffies(u);
}
EXPORT_SYMBOL(__usecs_to_jiffies);

/**
 * timespec64_to_jiffies - convert a timespec64 value to jiffies
 * @value: pointer to &struct timespec64
 *
 * The TICK_NSEC - 1 rounds up the value to the next resolution.  Note
 * that a remainder subtract here would not do the right thing as the
 * resolution values don't fall on second boundaries.  I.e. the line:
 * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
 * Note that due to the small error in the multiplier here, this
 * rounding is incorrect for sufficiently large values of tv_nsec, but
 * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're
 * OK.
 *
 * Rather, we just shift the bits off the right.
 *
 * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
 * value to a scaled second value.
 *
 * Return: jiffies value
 */
unsigned long
timespec64_to_jiffies(const struct timespec64 *value)
{
        u64 sec = value->tv_sec;
        long nsec = value->tv_nsec + TICK_NSEC - 1;

        if (sec >= MAX_SEC_IN_JIFFIES){
                sec = MAX_SEC_IN_JIFFIES;
                nsec = 0;
        }
        return ((sec * SEC_CONVERSION) +
                (((u64)nsec * NSEC_CONVERSION) >>
                 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;

}
EXPORT_SYMBOL(timespec64_to_jiffies);

/**
 * jiffies_to_timespec64 - convert jiffies value to &struct timespec64
 * @jiffies: jiffies value
 * @value: pointer to &struct timespec64
 */
void
jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
{
        /*
         * Convert jiffies to nanoseconds and separate with
         * one divide.
         */
        u32 rem;
        value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
                                    NSEC_PER_SEC, &rem);
        value->tv_nsec = rem;
}
EXPORT_SYMBOL(jiffies_to_timespec64);

/*
 * Convert jiffies/jiffies_64 to clock_t and back.
 */

/**
 * jiffies_to_clock_t - Convert jiffies to clock_t
 * @x: jiffies value
 *
 * The conversion is picked at compile time: when TICK_NSEC is not a
 * multiple of the clock_t period (NSEC_PER_SEC / USER_HZ) the value goes
 * through nanoseconds with a 64-bit division; otherwise a plain integer
 * multiply (HZ < USER_HZ) or divide is used.
 *
 * Return: jiffies converted to clock_t (CLOCKS_PER_SEC)
 */
clock_t jiffies_to_clock_t(unsigned long x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) != 0
        return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
#elif HZ < USER_HZ
        return x * (USER_HZ / HZ);
#else
        return x / (HZ / USER_HZ);
#endif
}
EXPORT_SYMBOL(jiffies_to_clock_t);

/**
 * clock_t_to_jiffies - Convert clock_t to jiffies
 * @x: clock_t value
 *
 * Inputs at or beyond the computed bound saturate to ~0UL instead of
 * being converted.
 *
 * Return: clock_t value converted to jiffies
 */
unsigned long clock_t_to_jiffies(unsigned long x)
{
#if (HZ % USER_HZ) == 0
        /* HZ is a whole multiple of USER_HZ: one integer multiply. */
        return (x >= ~0UL / (HZ / USER_HZ)) ? ~0UL : x * (HZ / USER_HZ);
#else
        /* Don't worry about loss of precision in the bound .. */
        if (x < ~0UL / HZ * USER_HZ) {
                /* .. but do try to contain it in the conversion */
                return div_u64((u64)x * HZ, USER_HZ);
        }

        return ~0UL;
#endif
}
EXPORT_SYMBOL(clock_t_to_jiffies);

/**
 * jiffies_64_to_clock_t - Convert jiffies_64 to clock_t
 * @x: jiffies_64 value
 *
 * The conversion is chosen at compile time from HZ, USER_HZ and
 * TICK_NSEC; when HZ equals USER_HZ the value is returned unchanged.
 *
 * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
 */
u64 jiffies_64_to_clock_t(u64 x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) != 0
        /*
         * There are better ways that don't overflow early,
         * but even this doesn't overflow in hundreds of years
         * in 64 bits, so..
         */
        return div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
#elif HZ < USER_HZ
        return div_u64(x * USER_HZ, HZ);
#elif HZ > USER_HZ
        return div_u64(x, HZ / USER_HZ);
#else
        /* HZ == USER_HZ: nothing to do */
        return x;
#endif
}
EXPORT_SYMBOL(jiffies_64_to_clock_t);

/**
 * nsec_to_clock_t - Convert nsec value to clock_t
 * @x: nsec value
 *
 * One of three conversions is selected at compile time depending on how
 * USER_HZ relates to NSEC_PER_SEC; see the comment on each branch.
 *
 * Return: nsec value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
 */
u64 nsec_to_clock_t(u64 x)
{
#if (NSEC_PER_SEC % USER_HZ) == 0
        /* USER_HZ divides NSEC_PER_SEC evenly: a single division. */
        return div_u64(x, NSEC_PER_SEC / USER_HZ);
#elif (USER_HZ % 512) == 0
        /* USER_HZ is a multiple of 512: scale both sides down by 512. */
        return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
#else
        /*
         * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
         * overflow after 64.99 years.
         * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
         */
        return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
#endif
}

/**
 * jiffies64_to_nsecs - Convert jiffies64 to nanoseconds
 * @j: jiffies64 value
 *
 * When HZ divides NSEC_PER_SEC evenly a single multiplication is used;
 * otherwise the HZ_TO_NSEC_NUM / HZ_TO_NSEC_DEN ratio is applied.
 *
 * Return: nanoseconds value
 */
u64 jiffies64_to_nsecs(u64 j)
{
#if (NSEC_PER_SEC % HZ) == 0
        return j * (NSEC_PER_SEC / HZ);
#else
        return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN);
#endif
}
EXPORT_SYMBOL(jiffies64_to_nsecs);

/**
 * jiffies64_to_msecs - Convert jiffies64 to milliseconds
 * @j: jiffies64 value
 *
 * When HZ is no larger than MSEC_PER_SEC and divides it evenly a single
 * multiplication is used; otherwise the HZ_TO_MSEC_NUM / HZ_TO_MSEC_DEN
 * ratio is applied.
 *
 * Return: milliseconds value
 */
u64 jiffies64_to_msecs(const u64 j)
{
#if HZ <= MSEC_PER_SEC && (MSEC_PER_SEC % HZ) == 0
        return j * (MSEC_PER_SEC / HZ);
#else
        return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN);
#endif
}
EXPORT_SYMBOL(jiffies64_to_msecs);

/**
 * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
 *
 * @n:        nsecs in u64
 *
 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
 * for scheduler, not for use in device drivers to calculate timeout value.
 *
 * One of three conversions is selected at compile time depending on how
 * HZ relates to NSEC_PER_SEC; see the comment on each branch.
 *
 * note:
 *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
 *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
 *
 * Return: nsecs converted to jiffies64 value
 */
u64 nsecs_to_jiffies64(u64 n)
{
#if (NSEC_PER_SEC % HZ) == 0
        /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
        return div_u64(n, NSEC_PER_SEC / HZ);
#elif (HZ % 512) == 0
        /* overflow after 292 years if HZ = 1024 */
        return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
#else
        /*
         * Generic case - optimized for cases where HZ is a multiple of 3.
         * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
         */
        return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
#endif
}
EXPORT_SYMBOL(nsecs_to_jiffies64);

/**
 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
 *
 * @n:        nsecs in u64
 *
 * Thin wrapper that narrows the result of nsecs_to_jiffies64() to
 * unsigned long.
 *
 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
 * for scheduler, not for use in device drivers to calculate timeout value.
 *
 * note:
 *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
 *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
 *
 * Return: nsecs converted to jiffies value
 */
unsigned long nsecs_to_jiffies(u64 n)
{
        const u64 ticks = nsecs_to_jiffies64(n);

        return (unsigned long)ticks;
}
EXPORT_SYMBOL_GPL(nsecs_to_jiffies);

/**
 * timespec64_add_safe - Add two timespec64 values and do a safety check
 * for overflow.
 * @lhs: first (left) timespec64 to add
 * @rhs: second (right) timespec64 to add
 *
 * It's assumed that both values are valid (>= 0).
 * And, each timespec64 is in normalized form.
 *
 * Return: sum of @lhs + @rhs
 */
struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
                                const struct timespec64 rhs)
{
        struct timespec64 res;

        set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec,
                        lhs.tv_nsec + rhs.tv_nsec);

        if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) {
                res.tv_sec = TIME64_MAX;
                res.tv_nsec = 0;
        }

        return res;
}

/**
 * get_timespec64 - get user's time value into kernel space
 * @ts: destination &struct timespec64
 * @uts: user's time value as &struct __kernel_timespec
 *
 * Handles compat or 32-bit modes: in a compat syscall only the low
 * 32 bits of the user supplied tv_nsec are kept.
 *
 * Return: %0 on success or negative errno on error
 */
int get_timespec64(struct timespec64 *ts,
                   const struct __kernel_timespec __user *uts)
{
        struct __kernel_timespec kts;

        if (copy_from_user(&kts, uts, sizeof(kts)))
                return -EFAULT;

        /* Zero out the padding in compat mode */
        if (in_compat_syscall())
                kts.tv_nsec &= 0xFFFFFFFFUL;

        ts->tv_sec = kts.tv_sec;
        /* In 32-bit mode, this drops the padding */
        ts->tv_nsec = kts.tv_nsec;

        return 0;
}
EXPORT_SYMBOL_GPL(get_timespec64);

/**
 * put_timespec64 - convert timespec64 value to __kernel_timespec format and
 *                     copy the latter to userspace
 * @ts: input &struct timespec64
 * @uts: user's &struct __kernel_timespec
 *
 * Return: %0 on success or negative errno on error
 */
int put_timespec64(const struct timespec64 *ts,
                   struct __kernel_timespec __user *uts)
{
        /* Built with an initializer so the whole struct is defined. */
        struct __kernel_timespec out = {
                .tv_sec = ts->tv_sec,
                .tv_nsec = ts->tv_nsec
        };

        if (copy_to_user(uts, &out, sizeof(out)))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(put_timespec64);

/*
 * Read a struct old_timespec32 from user space and widen its two fields
 * into *ts64.  Returns 0 on success or -EFAULT if the copy fails.
 */
static int __get_old_timespec32(struct timespec64 *ts64,
                                   const struct old_timespec32 __user *cts)
{
        struct old_timespec32 narrow;

        if (copy_from_user(&narrow, cts, sizeof(narrow)))
                return -EFAULT;

        ts64->tv_sec = narrow.tv_sec;
        ts64->tv_nsec = narrow.tv_nsec;

        return 0;
}

/*
 * Narrow *ts64 into a struct old_timespec32 and copy it to user space.
 * Returns 0 on success or -EFAULT if the copy fails.
 */
static int __put_old_timespec32(const struct timespec64 *ts64,
                                   struct old_timespec32 __user *cts)
{
        struct old_timespec32 narrow = {
                .tv_sec = ts64->tv_sec,
                .tv_nsec = ts64->tv_nsec
        };

        if (copy_to_user(cts, &narrow, sizeof(narrow)))
                return -EFAULT;

        return 0;
}

/**
 * get_old_timespec32 - get user's old-format time value into kernel space
 * @ts: destination &struct timespec64
 * @uts: user's old-format time value (&struct old_timespec32)
 *
 * Handles X86_X32_ABI compatibility conversion: when
 * COMPAT_USE_64BIT_TIME is true the user buffer is copied verbatim as a
 * &struct timespec64, otherwise it is read as &struct old_timespec32 and
 * widened.
 *
 * Return: %0 on success or negative errno on error
 */
int get_old_timespec32(struct timespec64 *ts, const void __user *uts)
{
        if (!COMPAT_USE_64BIT_TIME)
                return __get_old_timespec32(ts, uts);

        if (copy_from_user(ts, uts, sizeof(*ts)))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(get_old_timespec32);

/**
 * put_old_timespec32 - convert timespec64 value to &struct old_timespec32 and
 *                         copy the latter to userspace
 * @ts: input &struct timespec64
 * @uts: user's &struct old_timespec32
 *
 * Handles X86_X32_ABI compatibility conversion: when
 * COMPAT_USE_64BIT_TIME is true @ts is copied out verbatim, otherwise it
 * is narrowed to &struct old_timespec32 first.
 *
 * Return: %0 on success or negative errno on error
 */
int put_old_timespec32(const struct timespec64 *ts, void __user *uts)
{
        if (!COMPAT_USE_64BIT_TIME)
                return __put_old_timespec32(ts, uts);

        if (copy_to_user(uts, ts, sizeof(*ts)))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(put_old_timespec32);

/**
 * get_itimerspec64 - get user's &struct __kernel_itimerspec into kernel space
 * @it: destination &struct itimerspec64
 * @uit: user's &struct __kernel_itimerspec
 *
 * Fetches it_interval first and it_value second; it_value is not read if
 * fetching it_interval fails.
 *
 * Return: %0 on success or negative errno on error
 */
int get_itimerspec64(struct itimerspec64 *it,
                        const struct __kernel_itimerspec __user *uit)
{
        int err = get_timespec64(&it->it_interval, &uit->it_interval);

        if (!err)
                err = get_timespec64(&it->it_value, &uit->it_value);

        return err;
}
EXPORT_SYMBOL_GPL(get_itimerspec64);

/**
 * put_itimerspec64 - convert &struct itimerspec64 to __kernel_itimerspec format
 *                       and copy the latter to userspace
 * @it: input &struct itimerspec64
 * @uit: user's &struct __kernel_itimerspec
 *
 * Stores it_interval first and it_value second; it_value is not written
 * if storing it_interval fails.
 *
 * Return: %0 on success or negative errno on error
 */
int put_itimerspec64(const struct itimerspec64 *it,
                        struct __kernel_itimerspec __user *uit)
{
        int err = put_timespec64(&it->it_interval, &uit->it_interval);

        if (!err)
                err = put_timespec64(&it->it_value, &uit->it_value);

        return err;
}
EXPORT_SYMBOL_GPL(put_itimerspec64);

/**
 * get_old_itimerspec32 - get user's &struct old_itimerspec32 into kernel space
 * @its: destination &struct itimerspec64
 * @uits: user's &struct old_itimerspec32
 *
 * Reads it_interval, then it_value; the second read is skipped when the
 * first one fails.
 *
 * Return: %0 on success or negative errno on error
 */
int get_old_itimerspec32(struct itimerspec64 *its,
                        const struct old_itimerspec32 __user *uits)
{
        if (__get_old_timespec32(&its->it_interval, &uits->it_interval))
                return -EFAULT;

        if (__get_old_timespec32(&its->it_value, &uits->it_value))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(get_old_itimerspec32);

/**
 * put_old_itimerspec32 - convert &struct itimerspec64 to &struct
 *                          old_itimerspec32 and copy the latter to userspace
 * @its: input &struct itimerspec64
 * @uits: user's &struct old_itimerspec32
 *
 * Writes it_interval, then it_value; the second write is skipped when the
 * first one fails.
 *
 * Return: %0 on success or negative errno on error
 */
int put_old_itimerspec32(const struct itimerspec64 *its,
                        struct old_itimerspec32 __user *uits)
{
        if (__put_old_timespec32(&its->it_interval, &uits->it_interval))
                return -EFAULT;

        if (__put_old_timespec32(&its->it_value, &uits->it_value))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(put_old_itimerspec32);












































































































































































































































































































































































































































































































































































































































































    1 








    1 



































































































































































    1 
    1 




    1 














































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        PF_INET6 socket protocol family
 *        Linux INET6 implementation
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 *
 *        Adapted from linux/net/ipv4/af_inet.c
 *
 *        Fixes:
 *        piggy, Karl Knutson        :        Socket protocol table
 *        Hideaki YOSHIFUJI        :        sin6_scope_id support
 *        Arnaldo Melo                :        check proc_net_create return, cleanups
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/module.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>
#include <linux/slab.h>

#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/icmpv6.h>
#include <linux/netfilter_ipv6.h>

#include <net/ip.h>
#include <net/ipv6.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <net/tcp.h>
#include <net/ping.h>
#include <net/protocol.h>
#include <net/inet_common.h>
#include <net/route.h>
#include <net/transp_v6.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/ipv6_stubs.h>
#include <net/ndisc.h>
#ifdef CONFIG_IPV6_TUNNEL
#include <net/ip6_tunnel.h>
#endif
#include <net/calipso.h>
#include <net/seg6.h>
#include <net/rpl.h>
#include <net/compat.h>
#include <net/xfrm.h>
#include <net/ioam6.h>
#include <net/rawv6.h>
#include <net/rps.h>

#include <linux/uaccess.h>
#include <linux/mroute6.h>

#include "ip6_offload.h"

MODULE_AUTHOR("Cast of dozens");
MODULE_DESCRIPTION("IPv6 protocol stack for Linux");
MODULE_LICENSE("GPL");

/* The inetsw6 table contains everything that inet6_create needs to
 * build a new socket.  It holds one list head per socket type;
 * inet6_create() walks inetsw6[sock->type] under rcu_read_lock().
 */
static struct list_head inetsw6[SOCK_MAX];
/* NOTE(review): presumably serializes updates to inetsw6 - confirm
 * against the register/unregister code, which is not visible here.
 */
static DEFINE_SPINLOCK(inetsw6_lock);

/* Initial values for the "disable_ipv6" and "autoconf" module parameters
 * declared below.
 */
struct ipv6_params ipv6_defaults = {
        .disable_ipv6 = 0,
        .autoconf = 1,
};

/* Backing store of the "disable" module parameter; ipv6_mod_enabled()
 * reports true while this is zero.
 */
static int disable_ipv6_mod;

/* All three parameters are registered with mode 0444. */
module_param_named(disable, disable_ipv6_mod, int, 0444);
MODULE_PARM_DESC(disable, "Disable IPv6 module such that it is non-functional");

module_param_named(disable_ipv6, ipv6_defaults.disable_ipv6, int, 0444);
MODULE_PARM_DESC(disable_ipv6, "Disable IPv6 on all interfaces");

module_param_named(autoconf, ipv6_defaults.autoconf, int, 0444);
MODULE_PARM_DESC(autoconf, "Enable IPv6 address autoconfiguration on all interfaces");

/* Report whether the "disable" module parameter was left at zero. */
bool ipv6_mod_enabled(void)
{
        return !disable_ipv6_mod;
}
EXPORT_SYMBOL_GPL(ipv6_mod_enabled);

static struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
{
        const int offset = sk->sk_prot->ipv6_pinfo_offset;

        return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}

/*
 * Destructor installed as sk->sk_destruct by inet6_create(): runs the
 * IPv6-specific cleanup first, then the common inet destructor.
 */
void inet6_sock_destruct(struct sock *sk)
{
        inet6_cleanup_sock(sk);
        inet_sock_destruct(sk);
}
EXPORT_SYMBOL_GPL(inet6_sock_destruct);

/**
 * inet6_create - create and initialise a PF_INET6 socket
 * @net: network namespace the socket is created in
 * @sock: socket being set up; sock->type selects the inetsw6 list searched
 * @protocol: requested protocol number; IPPROTO_IP acts as a wildcard
 * @kern: non-zero for sockets created on behalf of the kernel; skips the
 *        CAP_NET_RAW check and the cgroup BPF hook
 *
 * Looks up the (type, protocol) pair in inetsw6, asking for a protocol
 * module to be loaded up to two times if nothing matches, then allocates
 * the sock and fills in the IPv6 and IPv4 per-socket defaults.
 *
 * If a step fails after the sock has been attached to @sock, the sock is
 * released and sock->sk is cleared so the caller is not left with a
 * dangling pointer.
 *
 * Return: 0 on success or a negative errno
 */
static int inet6_create(struct net *net, struct socket *sock, int protocol,
                        int kern)
{
        struct inet_sock *inet;
        struct ipv6_pinfo *np;
        struct sock *sk;
        struct inet_protosw *answer;
        struct proto *answer_prot;
        unsigned char answer_flags;
        int try_loading_module = 0;
        int err;

        if (protocol < 0 || protocol >= IPPROTO_MAX)
                return -EINVAL;

        /* Look for the requested type/protocol pair. */
lookup_protocol:
        err = -ESOCKTNOSUPPORT;
        rcu_read_lock();
        list_for_each_entry_rcu(answer, &inetsw6[sock->type], list) {

                err = 0;
                /* Check the non-wild match. */
                if (protocol == answer->protocol) {
                        if (protocol != IPPROTO_IP)
                                break;
                } else {
                        /* Check for the two wild cases. */
                        if (IPPROTO_IP == protocol) {
                                protocol = answer->protocol;
                                break;
                        }
                        if (IPPROTO_IP == answer->protocol)
                                break;
                }
                err = -EPROTONOSUPPORT;
        }

        if (err) {
                if (try_loading_module >= 2)
                        goto out_rcu_unlock;

                rcu_read_unlock();
                /*
                 * Be more specific, e.g. net-pf-10-proto-132-type-1
                 * (net-pf-PF_INET6-proto-IPPROTO_SCTP-type-SOCK_STREAM)
                 */
                if (++try_loading_module == 1)
                        request_module("net-pf-%d-proto-%d-type-%d",
                                        PF_INET6, protocol, sock->type);
                /*
                 * Fall back to generic, e.g. net-pf-10-proto-132
                 * (net-pf-PF_INET6-proto-IPPROTO_SCTP)
                 */
                else
                        request_module("net-pf-%d-proto-%d",
                                        PF_INET6, protocol);
                goto lookup_protocol;
        }

        err = -EPERM;
        if (sock->type == SOCK_RAW && !kern &&
            !ns_capable(net->user_ns, CAP_NET_RAW))
                goto out_rcu_unlock;

        /*
         * Copy everything needed from the protosw entry while still inside
         * the RCU read-side section; 'answer' must not be dereferenced
         * after rcu_read_unlock().
         */
        sock->ops = answer->ops;
        answer_prot = answer->prot;
        answer_flags = answer->flags;
        rcu_read_unlock();

        WARN_ON(!answer_prot->slab);

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot, kern);
        if (!sk)
                goto out;

        sock_init_data(sock, sk);

        err = 0;
        if (INET_PROTOSW_REUSE & answer_flags)
                sk->sk_reuse = SK_CAN_REUSE;

        if (INET_PROTOSW_ICSK & answer_flags)
                inet_init_csk_locks(sk);

        inet = inet_sk(sk);
        inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags);

        if (SOCK_RAW == sock->type) {
                inet->inet_num = protocol;
                if (IPPROTO_RAW == protocol)
                        inet_set_bit(HDRINCL, sk);
        }

        sk->sk_destruct                = inet6_sock_destruct;
        sk->sk_family                = PF_INET6;
        sk->sk_protocol                = protocol;

        /* Use the copy taken under RCU rather than answer->prot. */
        sk->sk_backlog_rcv        = answer_prot->backlog_rcv;

        inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk);
        np->hop_limit        = -1;
        np->mcast_hops        = IPV6_DEFAULT_MCASTHOPS;
        inet6_set_bit(MC6_LOOP, sk);
        inet6_set_bit(MC6_ALL, sk);
        np->pmtudisc        = IPV6_PMTUDISC_WANT;
        inet6_assign_bit(REPFLOW, sk, net->ipv6.sysctl.flowlabel_reflect &
                                     FLOWLABEL_REFLECT_ESTABLISHED);
        sk->sk_ipv6only        = net->ipv6.sysctl.bindv6only;
        sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash);

        /* Init the ipv4 part of the socket since we can have sockets
         * using v6 API for ipv4.
         */
        inet->uc_ttl        = -1;

        inet_set_bit(MC_LOOP, sk);
        inet->mc_ttl        = 1;
        inet->mc_index        = 0;
        RCU_INIT_POINTER(inet->mc_list, NULL);
        inet->rcv_tos        = 0;

        if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc))
                inet->pmtudisc = IP_PMTUDISC_DONT;
        else
                inet->pmtudisc = IP_PMTUDISC_WANT;

        if (inet->inet_num) {
                /* It assumes that any protocol which allows
                 * the user to assign a number at socket
                 * creation time automatically shares.
                 */
                inet->inet_sport = htons(inet->inet_num);
                err = sk->sk_prot->hash(sk);
                if (err)
                        goto out_sk_release;
        }
        if (sk->sk_prot->init) {
                err = sk->sk_prot->init(sk);
                if (err)
                        goto out_sk_release;
        }

        if (!kern) {
                err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
                if (err)
                        goto out_sk_release;
        }
out:
        return err;
out_rcu_unlock:
        rcu_read_unlock();
        goto out;
out_sk_release:
        /* Drop the sock and do not leave sock->sk pointing at it. */
        sk_common_release(sk);
        sock->sk = NULL;
        goto out;
}

/*
 * __inet6_bind - bind an AF_INET6 socket to a local address and/or port.
 * @sk:       socket being bound
 * @uaddr:    requested address, interpreted as a struct sockaddr_in6
 * @addr_len: length of @uaddr; in this function it is only consulted to
 *            decide whether sin6_scope_id may be read, so callers are
 *            expected to have validated the minimum length already
 * @flags:    BIND_* modifiers:
 *            BIND_WITH_LOCK               - lock_sock()/release_sock() here
 *            BIND_NO_CAP_NET_BIND_SERVICE - skip the CAP_NET_BIND_SERVICE test
 *            BIND_FORCE_ADDRESS_NO_PORT   - skip ->get_port() when port is 0
 *            BIND_FROM_BPF                - skip the cgroup post-bind program
 *
 * Returns 0 on success, or a negative errno: -EAFNOSUPPORT (wrong family),
 * -EINVAL (multicast on a stream socket, socket not closed or already bound,
 * v4-mapped address on a v6-only socket, scoped address without a device),
 * -EACCES (port needs a capability the caller lacks), -ENODEV (bound device
 * no longer exists), -EADDRNOTAVAIL (address not usable for binding), or
 * whatever ->get_port() / the post-bind BPF program returned.
 */
static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
                        u32 flags)
{
        struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct net *net = sock_net(sk);
        __be32 v4addr = 0;
        unsigned short snum;
        bool saved_ipv6only;
        int addr_type = 0;
        int err = 0;

        /* The checks below run before the socket lock is taken, so they
         * return directly instead of going through the "out" label.
         */
        if (addr->sin6_family != AF_INET6)
                return -EAFNOSUPPORT;

        addr_type = ipv6_addr_type(&addr->sin6_addr);
        if ((addr_type & IPV6_ADDR_MULTICAST) && sk->sk_type == SOCK_STREAM)
                return -EINVAL;

        /* A non-zero port for which inet_port_requires_bind_service() is
         * true needs CAP_NET_BIND_SERVICE in the netns' user namespace,
         * unless the caller asked for the check to be skipped.
         */
        snum = ntohs(addr->sin6_port);
        if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
            snum && inet_port_requires_bind_service(net, snum) &&
            !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
                return -EACCES;

        if (flags & BIND_WITH_LOCK)
                lock_sock(sk);

        /* Check these errors (active socket, double bind). */
        if (sk->sk_state != TCP_CLOSE || inet->inet_num) {
                err = -EINVAL;
                goto out;
        }

        /* Check if the address belongs to the host. */
        if (addr_type == IPV6_ADDR_MAPPED) {
                /* dev stays NULL when the socket is not bound to a device. */
                struct net_device *dev = NULL;
                int chk_addr_ret;

                /* Binding to v4-mapped address on a v6-only socket
                 * makes no sense
                 */
                if (ipv6_only_sock(sk)) {
                        err = -EINVAL;
                        goto out;
                }

                rcu_read_lock();
                if (sk->sk_bound_dev_if) {
                        dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
                        if (!dev) {
                                err = -ENODEV;
                                goto out_unlock;
                        }
                }

                /* Reproduce AF_INET checks to make the bindings consistent */
                /* The last 32-bit word of the mapped address is used as the
                 * socket's IPv4 address.
                 */
                v4addr = addr->sin6_addr.s6_addr32[3];
                chk_addr_ret = inet_addr_type_dev_table(net, dev, v4addr);
                /* dev is not used past this point. */
                rcu_read_unlock();

                if (!inet_addr_valid_or_nonlocal(net, inet, v4addr,
                                                 chk_addr_ret)) {
                        err = -EADDRNOTAVAIL;
                        goto out;
                }
        } else {
                /* The unspecified address needs no validation and keeps
                 * v4addr at 0.
                 */
                if (addr_type != IPV6_ADDR_ANY) {
                        struct net_device *dev = NULL;

                        rcu_read_lock();
                        if (__ipv6_addr_needs_scope_id(addr_type)) {
                                /* sin6_scope_id is only read when the caller
                                 * passed a full-size sockaddr_in6.
                                 */
                                if (addr_len >= sizeof(struct sockaddr_in6) &&
                                    addr->sin6_scope_id) {
                                        /* Override any existing binding, if another one
                                         * is supplied by user.
                                         */
                                        sk->sk_bound_dev_if = addr->sin6_scope_id;
                                }

                                /* Binding to link-local address requires an interface */
                                if (!sk->sk_bound_dev_if) {
                                        err = -EINVAL;
                                        goto out_unlock;
                                }
                        }

                        if (sk->sk_bound_dev_if) {
                                dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
                                if (!dev) {
                                        err = -ENODEV;
                                        goto out_unlock;
                                }
                        }

                        /* ipv4 addr of the socket is invalid.  Only the
                         * unspecified and mapped address have a v4 equivalent.
                         */
                        v4addr = LOOPBACK4_IPV6;
                        /* Multicast addresses are not checked against the
                         * host's address list.
                         */
                        if (!(addr_type & IPV6_ADDR_MULTICAST))        {
                                if (!ipv6_can_nonlocal_bind(net, inet) &&
                                    !ipv6_chk_addr(net, &addr->sin6_addr,
                                                   dev, 0)) {
                                        err = -EADDRNOTAVAIL;
                                        goto out_unlock;
                                }
                        }
                        rcu_read_unlock();
                }
        }

        /* Record the chosen addresses on the socket. */
        inet->inet_rcv_saddr = v4addr;
        inet->inet_saddr = v4addr;

        sk->sk_v6_rcv_saddr = addr->sin6_addr;

        /* A multicast address is stored as the receive address only, never
         * as the socket's source address.
         */
        if (!(addr_type & IPV6_ADDR_MULTICAST))
                np->saddr = addr->sin6_addr;

        /* Binding to a specific, non-mapped IPv6 address makes the socket
         * v6-only; the previous value is kept so a failed bind can restore it.
         */
        saved_ipv6only = sk->sk_ipv6only;
        if (addr_type != IPV6_ADDR_ANY && addr_type != IPV6_ADDR_MAPPED)
                sk->sk_ipv6only = 1;

        /* Make sure we are allowed to bind here. */
        /* ->get_port() is skipped only when no port was requested and either
         * the socket has BIND_ADDRESS_NO_PORT set or the caller forces it.
         */
        if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) ||
                      (flags & BIND_FORCE_ADDRESS_NO_PORT))) {
                err = sk->sk_prot->get_port(sk, snum);
                if (err) {
                        /* Roll back; inet_reset_saddr() presumably clears the
                         * addresses stored above - confirm against its
                         * definition.
                         */
                        sk->sk_ipv6only = saved_ipv6only;
                        inet_reset_saddr(sk);
                        goto out;
                }
                if (!(flags & BIND_FROM_BPF)) {
                        err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk);
                        if (err) {
                                /* Same rollback, plus handing the port back
                                 * when the protocol provides ->put_port().
                                 */
                                sk->sk_ipv6only = saved_ipv6only;
                                inet_reset_saddr(sk);
                                if (sk->sk_prot->put_port)
                                        sk->sk_prot->put_port(sk);
                                goto out;
                        }
                }
        }

        /* Note which parts of the binding were explicitly requested. */
        if (addr_type != IPV6_ADDR_ANY)
                sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
        if (snum)
                sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
        inet->inet_sport = htons(inet->inet_num);
        inet->inet_dport = 0;
        inet->inet_daddr = 0;
out:
        if (flags & BIND_WITH_LOCK)
                release_sock(sk);
        return err;
out_unlock:
        /* Error exit for failures raised inside an RCU read-side section. */
        rcu_read_unlock();
        goto out;
}

/*
 * Bind @sk to the address described by @uaddr / @addr_len.
 *
 * A protocol that supplies its own ->bind() handler gets the request
 * unmodified.  Otherwise the length is validated, the cgroup BPF bind hook
 * runs, and __inet6_bind() performs the bind with BIND_WITH_LOCK set (plus
 * whatever the BPF hook added to the flags).
 *
 * Returns 0 on success or a negative errno.
 */
int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        /* IPV6_ADDRFORM can change sk->sk_prot under us: sample it once. */
        const struct proto *proto = READ_ONCE(sk->sk_prot);
        u32 flags = BIND_WITH_LOCK;
        int ret;

        /* A protocol-private bind handler takes precedence. */
        if (proto->bind)
                return proto->bind(sk, uaddr, addr_len);

        if (addr_len < SIN6_LEN_RFC2133)
                return -EINVAL;

        /* The BPF program runs ahead of the checks in __inet6_bind(), so a
         * program that rewrites the context wrongly is caught by them.
         */
        ret = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
                                                 CGROUP_INET6_BIND, &flags);
        if (ret)
                return ret;

        return __inet6_bind(sk, uaddr, addr_len, flags);
}

/* proto_ops ->bind() entry point for AF_INET6: unwrap the struct socket and
 * let inet6_bind_sk() do the work on the underlying struct sock.
 */
int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sock *sk = sock->sk;

        return inet6_bind_sk(sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet6_bind);

/*
 * proto_ops ->release() for AF_INET6 sockets.
 *
 * Drops the socket's IPv6 multicast and anycast lists, then hands over to
 * inet_release() and returns its result.  Returns -EINVAL when the struct
 * socket has no struct sock attached.
 */
int inet6_release(struct socket *sock)
{
        struct sock *sk = sock->sk;

        if (sk) {
                ipv6_sock_mc_close(sk);        /* multicast lists */
                ipv6_sock_ac_close(sk);        /* anycast lists */

                return inet_release(sock);
        }

        return -EINVAL;
}
EXPORT_SYMBOL(inet6_release);

/*
 * Release the per-socket IPv6 state hanging off @sk: queued rx option and
 * rx PMTU skbs, the flow label list, and the tx options.
 *
 * Each pointer is detached with xchg() before the object it referred to is
 * released.
 */
void inet6_cleanup_sock(struct sock *sk)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6_txoptions *txopt;

        /* Rx side: pending packet options and rx PMTU notification. */
        kfree_skb(xchg(&np->pktoptions, NULL));
        kfree_skb(xchg(&np->rxpmtu, NULL));

        /* Flow labels attached to this socket. */
        fl6_free_socklist(sk);

        /* Tx options: give back their sk_omem_alloc charge, then drop our
         * reference.
         */
        txopt = unrcu_pointer(xchg(&np->opt, NULL));
        if (!txopt)
                return;

        atomic_sub(txopt->tot_len, &sk->sk_omem_alloc);
        txopt_put(txopt);
}
EXPORT_SYMBOL_GPL(inet6_cleanup_sock);

/*
 * proto_ops ->getname() for AF_INET6; serves both getsockname() (@peer == 0)
 * and getpeername() (@peer != 0).
 *
 * Fills @uaddr as a struct sockaddr_in6 under the socket lock and gives the
 * matching cgroup BPF program a chance to run on the result.
 *
 * Returns the address length on success.  For a peer lookup, returns
 * -ENOTCONN when there is no destination port, or when @peer == 1 and the
 * socket is in the CLOSE or SYN_SENT state.
 */
int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
                  int peer)
{
        struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr;
        struct sock *sk = sock->sk;
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct inet_sock *inet = inet_sk(sk);
        int sin_addr_len = sizeof(*sin);
        int ret;

        sin->sin6_family = AF_INET6;
        sin->sin6_flowinfo = 0;
        sin->sin6_scope_id = 0;

        lock_sock(sk);

        if (!peer) {
                /* Local name: fall back to np->saddr while the receive
                 * address is still the unspecified one.
                 */
                if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr))
                        sin->sin6_addr = sk->sk_v6_rcv_saddr;
                else
                        sin->sin6_addr = np->saddr;
                sin->sin6_port = inet->inet_sport;
                BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
                                       CGROUP_INET6_GETSOCKNAME);
        } else {
                if (!inet->inet_dport ||
                    (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
                    peer == 1)) {
                        ret = -ENOTCONN;
                        goto unlock;
                }

                sin->sin6_port = inet->inet_dport;
                sin->sin6_addr = sk->sk_v6_daddr;
                /* The flow label is only reported when SNDFLOW is set. */
                if (inet6_test_bit(SNDFLOW, sk))
                        sin->sin6_flowinfo = np->flow_label;
                BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
                                       CGROUP_INET6_GETPEERNAME);
        }

        /* Computed last, from the address as it stands after the BPF hook. */
        sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
                                                 sk->sk_bound_dev_if);
        ret = sin_addr_len;
unlock:
        release_sock(sk);
        return ret;
}
EXPORT_SYMBOL(inet6_getname);

/*
 * proto_ops ->ioctl() for AF_INET6 sockets.
 *
 * Route add/delete and interface address requests are handled here; any
 * other command goes to the protocol through sk_ioctl(), or fails with
 * -ENOIOCTLCMD when the protocol has no ioctl handler.
 */
int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        const struct proto *prot;
        struct in6_rtmsg rtmsg;

        switch (cmd) {
        case SIOCADDRT:
        case SIOCDELRT:
                if (copy_from_user(&rtmsg, argp, sizeof(rtmsg)))
                        return -EFAULT;
                return ipv6_route_ioctl(net, cmd, &rtmsg);
        case SIOCSIFADDR:
                return addrconf_add_ifaddr(net, argp);
        case SIOCDIFADDR:
                return addrconf_del_ifaddr(net, argp);
        case SIOCSIFDSTADDR:
                return addrconf_set_dstaddr(net, argp);
        default:
                break;
        }

        /* Not one of ours: defer to the protocol.
         * IPV6_ADDRFORM can change sk->sk_prot under us.
         */
        prot = READ_ONCE(sk->sk_prot);
        if (!prot->ioctl)
                return -ENOIOCTLCMD;

        return sk_ioctl(sk, cmd, argp);
}
EXPORT_SYMBOL(inet6_ioctl);

#ifdef CONFIG_COMPAT
/*
 * Fixed-width layout of the routing ioctl argument as read from compat
 * callers.  inet6_compat_routing_ioctl() copies it field by field into a
 * native struct in6_rtmsg; the three leading addresses are copied as one
 * contiguous run, so their order and adjacency matter.
 */
struct compat_in6_rtmsg {
        struct in6_addr                rtmsg_dst;
        struct in6_addr                rtmsg_src;
        struct in6_addr                rtmsg_gateway;
        u32                        rtmsg_type;
        u16                        rtmsg_dst_len;
        u16                        rtmsg_src_len;
        u32                        rtmsg_metric;
        u32                        rtmsg_info;
        u32                        rtmsg_flags;
        s32                        rtmsg_ifindex;
};

/*
 * Handle SIOCADDRT/SIOCDELRT issued by a compat task.
 *
 * Converts the user's struct compat_in6_rtmsg into a native struct in6_rtmsg
 * and passes it to ipv6_route_ioctl().  Returns -EFAULT as soon as any user
 * access fails, otherwise the result of ipv6_route_ioctl().
 */
static int inet6_compat_routing_ioctl(struct sock *sk, unsigned int cmd,
                struct compat_in6_rtmsg __user *ur)
{
        struct in6_rtmsg rt;

        /* dst, src and gateway are fetched with a single copy; this relies
         * on struct in6_rtmsg keeping the three addresses contiguous as well.
         */
        if (copy_from_user(&rt.rtmsg_dst, &ur->rtmsg_dst,
                           3 * sizeof(struct in6_addr)))
                return -EFAULT;

        /* The remaining scalars are converted one at a time. */
        if (get_user(rt.rtmsg_type, &ur->rtmsg_type))
                return -EFAULT;
        if (get_user(rt.rtmsg_dst_len, &ur->rtmsg_dst_len))
                return -EFAULT;
        if (get_user(rt.rtmsg_src_len, &ur->rtmsg_src_len))
                return -EFAULT;
        if (get_user(rt.rtmsg_metric, &ur->rtmsg_metric))
                return -EFAULT;
        if (get_user(rt.rtmsg_info, &ur->rtmsg_info))
                return -EFAULT;
        if (get_user(rt.rtmsg_flags, &ur->rtmsg_flags))
                return -EFAULT;
        if (get_user(rt.rtmsg_ifindex, &ur->rtmsg_ifindex))
                return -EFAULT;

        return ipv6_route_ioctl(sock_net(sk), cmd, &rt);
}

/*
 * proto_ops ->compat_ioctl() for AF_INET6 sockets.
 *
 * Only the routing commands get a compat translation here; every other
 * command returns -ENOIOCTLCMD.
 */
int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        if (cmd == SIOCADDRT || cmd == SIOCDELRT)
                return inet6_compat_routing_ioctl(sock->sk, cmd,
                                                  compat_ptr(arg));

        return -ENOIOCTLCMD;
}
EXPORT_SYMBOL_GPL(inet6_compat_ioctl);
#endif /* CONFIG_COMPAT */

INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *,
                                            size_t));
/*
 * proto_ops ->sendmsg() for AF_INET6 sockets.
 *
 * Returns -EAGAIN when inet_send_prepare() reports failure; otherwise
 * forwards to the protocol's ->sendmsg().  INDIRECT_CALL_2 names
 * tcp_sendmsg and udpv6_sendmsg as the expected targets.
 */
int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
        struct sock *sk = sock->sk;
        const struct proto *prot;
        int ret;

        if (unlikely(inet_send_prepare(sk)))
                return -EAGAIN;

        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
        prot = READ_ONCE(sk->sk_prot);
        ret = INDIRECT_CALL_2(prot->sendmsg, tcp_sendmsg, udpv6_sendmsg,
                              sk, msg, size);
        return ret;
}

INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *, struct msghdr *,
                                            size_t, int, int *));
/*
 * proto_ops ->recvmsg() for AF_INET6 sockets.
 *
 * Forwards to the protocol's ->recvmsg() (tcp_recvmsg and udpv6_recvmsg are
 * the targets named to INDIRECT_CALL_2).  On success the address length
 * reported by the protocol is stored in msg->msg_namelen.
 *
 * Returns the protocol handler's result.
 */
int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                  int flags)
{
        struct sock *sk = sock->sk;
        const struct proto *prot;
        int addr_len = 0;
        int ret;

        /* The RPS flow is not recorded for error-queue reads. */
        if (likely(!(flags & MSG_ERRQUEUE)))
                sock_rps_record_flow(sk);

        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
        prot = READ_ONCE(sk->sk_prot);
        ret = INDIRECT_CALL_2(prot->recvmsg, tcp_recvmsg, udpv6_recvmsg,
                              sk, msg, size, flags, &addr_len);
        if (ret < 0)
                return ret;

        msg->msg_namelen = addr_len;
        return ret;
}

/*
 * Socket-layer operations for PF_INET6 stream sockets.
 *
 * release, bind, getname, ioctl, sendmsg, recvmsg and compat_ioctl use the
 * inet6_* handlers defined in this file; the remaining slots point at the
 * shared inet_*, tcp_* and sock_* handlers.  mmap is only provided with
 * CONFIG_MMU and compat_ioctl only with CONFIG_COMPAT.
 */
const struct proto_ops inet6_stream_ops = {
        .family                   = PF_INET6,
        .owner                   = THIS_MODULE,
        .release           = inet6_release,
        .bind                   = inet6_bind,
        .connect           = inet_stream_connect,        /* ok                */
        .socketpair           = sock_no_socketpair,        /* a do nothing        */
        .accept                   = inet_accept,                /* ok                */
        .getname           = inet6_getname,
        .poll                   = tcp_poll,                        /* ok                */
        .ioctl                   = inet6_ioctl,                /* must change  */
        .gettstamp           = sock_gettstamp,
        .listen                   = inet_listen,                /* ok                */
        .shutdown           = inet_shutdown,                /* ok                */
        .setsockopt           = sock_common_setsockopt,        /* ok                */
        .getsockopt           = sock_common_getsockopt,        /* ok                */
        .sendmsg           = inet6_sendmsg,                /* retpoline's sake */
        .recvmsg           = inet6_recvmsg,                /* retpoline's sake */
#ifdef CONFIG_MMU
        .mmap                   = tcp_mmap,
#endif
        .splice_eof           = inet_splice_eof,
        .sendmsg_locked    = tcp_sendmsg_locked,
        .splice_read           = tcp_splice_read,
        .read_sock           = tcp_read_sock,
        .read_skb           = tcp_read_skb,
        .peek_len           = tcp_peek_len,
#ifdef CONFIG_COMPAT
        .compat_ioctl           = inet6_compat_ioctl,
#endif
        .set_rcvlowat           = tcp_set_rcvlowat,
};

/*
 * Socket-layer operations for PF_INET6 datagram sockets.
 *
 * Uses the same inet6_* handlers as the stream table for release, bind,
 * getname, ioctl, sendmsg, recvmsg and compat_ioctl.  accept, listen and
 * mmap are wired to the sock_no_* stubs, and poll/read_skb/set_peek_off
 * use the udp_* handlers.
 */
const struct proto_ops inet6_dgram_ops = {
        .family                   = PF_INET6,
        .owner                   = THIS_MODULE,
        .release           = inet6_release,
        .bind                   = inet6_bind,
        .connect           = inet_dgram_connect,        /* ok                */
        .socketpair           = sock_no_socketpair,        /* a do nothing        */
        .accept                   = sock_no_accept,                /* a do nothing        */
        .getname           = inet6_getname,
        .poll                   = udp_poll,                        /* ok                */
        .ioctl                   = inet6_ioctl,                /* must change  */
        .gettstamp           = sock_gettstamp,
        .listen                   = sock_no_listen,                /* ok                */
        .shutdown           = inet_shutdown,                /* ok                */
        .setsockopt           = sock_common_setsockopt,        /* ok                */
        .getsockopt           = sock_common_getsockopt,        /* ok                */
        .sendmsg           = inet6_sendmsg,                /* retpoline's sake */
        .recvmsg           = inet6_recvmsg,                /* retpoline's sake */
        .read_skb           = udp_read_skb,
        .mmap                   = sock_no_mmap,
        .set_peek_off           = udp_set_peek_off,
#ifdef CONFIG_COMPAT
        .compat_ioctl           = inet6_compat_ioctl,
#endif
};

/* Address-family descriptor for PF_INET6; inet6_init() hands it to
 * sock_register(), after which inet6_create() is the family's socket
 * creation callback.
 */
static const struct net_proto_family inet6_family_ops = {
        .family = PF_INET6,
        .create = inet6_create,
        .owner        = THIS_MODULE,
};

/*
 * Add @p to the inetsw6[] list for its socket type.
 *
 * Returns 0 on success, -EINVAL when p->type is not below SOCK_MAX, and
 * -EPERM when a permanent entry for the same protocol number already exists
 * on that list.  The whole operation, including the error messages, runs
 * under inetsw6_lock.
 */
int inet6_register_protosw(struct inet_protosw *p)
{
        struct inet_protosw *conflict = NULL;
        struct list_head *insert_after;
        struct list_head *pos;
        int protocol = p->protocol;
        int ret;

        spin_lock_bh(&inetsw6_lock);

        if (p->type >= SOCK_MAX) {
                pr_err("Ignoring attempt to register invalid socket type %d\n",
                       p->type);
                ret = -EINVAL;
                goto unlock;
        }

        /* Walk the permanent entries only: stop at one that claims the same
         * protocol, otherwise remember the last one seen as insertion point.
         */
        insert_after = &inetsw6[p->type];
        list_for_each(pos, &inetsw6[p->type]) {
                struct inet_protosw *cur;

                cur = list_entry(pos, struct inet_protosw, list);
                if (!(cur->flags & INET_PROTOSW_PERMANENT))
                        continue;

                if (cur->protocol == protocol) {
                        conflict = cur;
                        break;
                }
                insert_after = pos;
        }

        /* If we are trying to override a permanent protocol, bail. */
        if (conflict) {
                pr_err("Attempt to override permanent protocol %d\n", protocol);
                ret = -EPERM;
                goto unlock;
        }

        /* Insert after the last permanent entry, if any, so that the new
         * entry cannot shadow a permanent one on a wild-card protocol match.
         * It may still shadow existing non-permanent entries; removing it
         * later restores the previous behaviour automatically.
         */
        list_add_rcu(&p->list, insert_after);
        ret = 0;

unlock:
        spin_unlock_bh(&inetsw6_lock);
        return ret;
}
EXPORT_SYMBOL(inet6_register_protosw);

/*
 * Remove @p from its inetsw6[] list.
 *
 * Entries flagged INET_PROTOSW_PERMANENT are refused with an error message.
 * For any other entry, the list removal is followed by synchronize_net()
 * before returning.
 */
void inet6_unregister_protosw(struct inet_protosw *p)
{
        if (p->flags & INET_PROTOSW_PERMANENT) {
                pr_err("Attempt to unregister permanent protocol %d\n",
                       p->protocol);
                return;
        }

        spin_lock_bh(&inetsw6_lock);
        list_del_rcu(&p->list);
        spin_unlock_bh(&inetsw6_lock);

        synchronize_net();
}
EXPORT_SYMBOL(inet6_unregister_protosw);

/*
 * Make sure @sk has a usable cached route.
 *
 * Nothing is done when __sk_dst_check() still returns a dst for the stored
 * cookie.  Otherwise a flow is built from the socket's addresses, ports,
 * flow label, bound device, mark and uid, a new route is looked up and
 * stored on the socket.
 *
 * Returns 0 on success.  When the lookup fails, sk_route_caps is cleared,
 * the positive error is recorded in sk_err_soft and the negative errno is
 * returned.
 */
int inet6_sk_rebuild_header(struct sock *sk)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct inet_sock *inet = inet_sk(sk);
        struct in6_addr final, *final_p;
        struct dst_entry *dst;
        struct flowi6 fl6;

        /* Cached route still valid: keep it. */
        if (__sk_dst_check(sk, np->dst_cookie))
                return 0;

        memset(&fl6, 0, sizeof(fl6));
        fl6.flowi6_proto = sk->sk_protocol;
        fl6.flowi6_oif = sk->sk_bound_dev_if;
        fl6.flowi6_mark = sk->sk_mark;
        fl6.flowi6_uid = sk->sk_uid;
        fl6.flowlabel = np->flow_label;
        fl6.saddr = np->saddr;
        fl6.daddr = sk->sk_v6_daddr;
        fl6.fl6_sport = inet->inet_sport;
        fl6.fl6_dport = inet->inet_dport;
        security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));

        /* np->opt is RCU-protected; only the dereference needs the lock. */
        rcu_read_lock();
        final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final);
        rcu_read_unlock();

        dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
        if (IS_ERR(dst)) {
                int err = PTR_ERR(dst);

                sk->sk_route_caps = 0;
                WRITE_ONCE(sk->sk_err_soft, -err);
                return err;
        }

        ip6_dst_store(sk, dst, NULL, NULL);
        return 0;
}
EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header);

/*
 * Tell whether @skb carries something the socket asked to receive.
 *
 * Returns true when at least one receive option is enabled on @sk
 * (np->rxopt.all non-zero) and the packet contains matching data: a
 * hop-by-hop header, a non-zero flow info field, a routing header, or a
 * destination options header.  Returns false otherwise.
 */
bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
                       const struct inet6_skb_parm *opt)
{
        const struct ipv6_pinfo *np = inet6_sk(sk);

        /* No receive option requested at all. */
        if (!np->rxopt.all)
                return false;

        /* Hop-by-hop options */
        if ((opt->flags & IP6SKB_HOPBYHOP) &&
            (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts))
                return true;

        /* Flow information */
        if (ip6_flowinfo((struct ipv6hdr *) skb_network_header(skb)) &&
            np->rxopt.bits.rxflow)
                return true;

        /* Routing header */
        if (opt->srcrt &&
            (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt))
                return true;

        /* Destination options, either position */
        if ((opt->dst1 || opt->dst0) &&
            (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))
                return true;

        return false;
}
EXPORT_SYMBOL_GPL(ipv6_opt_accepted);

/* Packet handler registration for ETH_P_IPV6: ipv6_rcv is the .func
 * handler and ipv6_list_rcv the .list_func handler.  Added and removed by
 * ipv6_packet_init() / ipv6_packet_cleanup().
 */
static struct packet_type ipv6_packet_type __read_mostly = {
        .type = cpu_to_be16(ETH_P_IPV6),
        .func = ipv6_rcv,
        .list_func = ipv6_list_rcv,
};

/* Register the IPv6 packet handler with the device layer.  Always returns
 * 0; the int return type lets inet6_init() treat it like its other init
 * steps.
 */
static int __init ipv6_packet_init(void)
{
        dev_add_pack(&ipv6_packet_type);
        return 0;
}

/* Counterpart of ipv6_packet_init(): unregister the IPv6 packet handler. */
static void ipv6_packet_cleanup(void)
{
        dev_remove_pack(&ipv6_packet_type);
}

/*
 * Allocate the IPv6 SNMP counter blocks of @net: UDP, UDP-Lite, IP and
 * ICMPv6 per-cpu counters, plus the ICMPv6 per-message counters.
 *
 * Returns 0 on success.  On allocation failure everything allocated so far
 * is freed again and -ENOMEM is returned.
 */
static int __net_init ipv6_init_mibs(struct net *net)
{
        int cpu;

        net->mib.udp_stats_in6 = alloc_percpu(struct udp_mib);
        if (!net->mib.udp_stats_in6)
                return -ENOMEM;

        net->mib.udplite_stats_in6 = alloc_percpu(struct udp_mib);
        if (!net->mib.udplite_stats_in6)
                goto free_udp;

        net->mib.ipv6_statistics = alloc_percpu(struct ipstats_mib);
        if (!net->mib.ipv6_statistics)
                goto free_udplite;

        /* Each cpu's copy of the IP counters carries its own syncp. */
        for_each_possible_cpu(cpu) {
                struct ipstats_mib *stats;

                stats = per_cpu_ptr(net->mib.ipv6_statistics, cpu);
                u64_stats_init(&stats->syncp);
        }

        net->mib.icmpv6_statistics = alloc_percpu(struct icmpv6_mib);
        if (!net->mib.icmpv6_statistics)
                goto free_ip;

        /* The per-message ICMPv6 counters are a plain zeroed allocation,
         * not per-cpu.
         */
        net->mib.icmpv6msg_statistics = kzalloc(sizeof(struct icmpv6msg_mib),
                                                GFP_KERNEL);
        if (!net->mib.icmpv6msg_statistics)
                goto free_icmp;

        return 0;

        /* Unwind in reverse order of allocation. */
free_icmp:
        free_percpu(net->mib.icmpv6_statistics);
free_ip:
        free_percpu(net->mib.ipv6_statistics);
free_udplite:
        free_percpu(net->mib.udplite_stats_in6);
free_udp:
        free_percpu(net->mib.udp_stats_in6);
        return -ENOMEM;
}

/* Free every counter block set up by ipv6_init_mibs(): the four per-cpu
 * blocks with free_percpu() and the per-message ICMPv6 block, which was
 * allocated with kzalloc(), with kfree().
 */
static void ipv6_cleanup_mibs(struct net *net)
{
        free_percpu(net->mib.udp_stats_in6);
        free_percpu(net->mib.udplite_stats_in6);
        free_percpu(net->mib.ipv6_statistics);
        free_percpu(net->mib.icmpv6_statistics);
        kfree(net->mib.icmpv6msg_statistics);
}

/*
 * Per-network-namespace initialisation (the .init hook of inet6_net_ops).
 *
 * Sets the default values of the namespace's IPv6 sysctls, allocates the
 * SNMP counters and, with CONFIG_PROC_FS, creates the udp6, tcp6 and ac6
 * proc entries.
 *
 * Returns 0 on success or the error of the step that failed.  When a proc
 * step fails, the proc entries created before it are removed and the
 * counters are freed.
 */
static int __net_init inet6_net_init(struct net *net)
{
        int err = 0;

        net->ipv6.sysctl.bindv6only = 0;
        net->ipv6.sysctl.icmpv6_time = 1*HZ;
        net->ipv6.sysctl.icmpv6_echo_ignore_all = 0;
        net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0;
        net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0;
        net->ipv6.sysctl.icmpv6_error_anycast_as_unicast = 0;

        /* By default, rate limit error messages.
         * Except for pmtu discovery, it would break it.
         * proc_do_large_bitmap needs pointer to the bitmap.
         */
        bitmap_set(net->ipv6.sysctl.icmpv6_ratemask, 0, ICMPV6_ERRMSG_MAX + 1);
        bitmap_clear(net->ipv6.sysctl.icmpv6_ratemask, ICMPV6_PKT_TOOBIG, 1);
        net->ipv6.sysctl.icmpv6_ratemask_ptr = net->ipv6.sysctl.icmpv6_ratemask;

        net->ipv6.sysctl.flowlabel_consistency = 1;
        net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS;
        net->ipv6.sysctl.idgen_retries = 3;
        net->ipv6.sysctl.idgen_delay = 1 * HZ;
        net->ipv6.sysctl.flowlabel_state_ranges = 0;
        net->ipv6.sysctl.max_dst_opts_cnt = IP6_DEFAULT_MAX_DST_OPTS_CNT;
        net->ipv6.sysctl.max_hbh_opts_cnt = IP6_DEFAULT_MAX_HBH_OPTS_CNT;
        net->ipv6.sysctl.max_dst_opts_len = IP6_DEFAULT_MAX_DST_OPTS_LEN;
        net->ipv6.sysctl.max_hbh_opts_len = IP6_DEFAULT_MAX_HBH_OPTS_LEN;
        net->ipv6.sysctl.fib_notify_on_flag_change = 0;
        atomic_set(&net->ipv6.fib6_sernum, 1);

        net->ipv6.sysctl.ioam6_id = IOAM6_DEFAULT_ID;
        net->ipv6.sysctl.ioam6_id_wide = IOAM6_DEFAULT_ID_WIDE;

        /* Nothing to undo if the counters cannot be allocated. */
        err = ipv6_init_mibs(net);
        if (err)
                return err;
#ifdef CONFIG_PROC_FS
        err = udp6_proc_init(net);
        if (err)
                goto out;
        err = tcp6_proc_init(net);
        if (err)
                goto proc_tcp6_fail;
        err = ac6_proc_init(net);
        if (err)
                goto proc_ac6_fail;
#endif
        /* Success: err is 0 here, with or without CONFIG_PROC_FS. */
        return err;

#ifdef CONFIG_PROC_FS
        /* Error unwinding, in reverse order of the proc setup above. */
proc_ac6_fail:
        tcp6_proc_exit(net);
proc_tcp6_fail:
        udp6_proc_exit(net);
out:
        ipv6_cleanup_mibs(net);
        return err;
#endif
}

/* Per-network-namespace teardown (the .exit hook of inet6_net_ops): remove
 * the proc entries created by inet6_net_init() when CONFIG_PROC_FS is set,
 * then free the SNMP counters.
 */
static void __net_exit inet6_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
        udp6_proc_exit(net);
        tcp6_proc_exit(net);
        ac6_proc_exit(net);
#endif
        ipv6_cleanup_mibs(net);
}

/* Per-namespace init/exit hooks; registered from inet6_init() with
 * register_pernet_subsys().
 */
static struct pernet_operations inet6_net_ops = {
        .init = inet6_net_init,
        .exit = inet6_net_exit,
};

static int ipv6_route_input(struct sk_buff *skb)
{
        ip6_route_input(skb);
        return skb_dst(skb)->error;
}

/*
 * Table of IPv6 entry points that inet6_init() publishes through the
 * ipv6_stub pointer once initialisation has completed.  The xfrm6 entries
 * are only present when CONFIG_XFRM is enabled.
 */
static const struct ipv6_stub ipv6_stub_impl = {
        .ipv6_sock_mc_join = ipv6_sock_mc_join,
        .ipv6_sock_mc_drop = ipv6_sock_mc_drop,
        .ipv6_dst_lookup_flow = ip6_dst_lookup_flow,
        .ipv6_route_input  = ipv6_route_input,
        .fib6_get_table           = fib6_get_table,
        .fib6_table_lookup = fib6_table_lookup,
        .fib6_lookup       = fib6_lookup,
        .fib6_select_path  = fib6_select_path,
        .ip6_mtu_from_fib6 = ip6_mtu_from_fib6,
        .fib6_nh_init           = fib6_nh_init,
        .fib6_nh_release   = fib6_nh_release,
        .fib6_nh_release_dsts = fib6_nh_release_dsts,
        .fib6_update_sernum = fib6_update_sernum_stub,
        .fib6_rt_update           = fib6_rt_update,
        .ip6_del_rt           = ip6_del_rt,
        .udpv6_encap_enable = udpv6_encap_enable,
        .ndisc_send_na = ndisc_send_na,
#if IS_ENABLED(CONFIG_XFRM)
        .xfrm6_local_rxpmtu = xfrm6_local_rxpmtu,
        .xfrm6_udp_encap_rcv = xfrm6_udp_encap_rcv,
        .xfrm6_gro_udp_encap_rcv = xfrm6_gro_udp_encap_rcv,
        .xfrm6_rcv_encap = xfrm6_rcv_encap,
#endif
        .nd_tbl        = &nd_tbl,
        .ipv6_fragment = ip6_fragment,
        .ipv6_dev_find = ipv6_dev_find,
};

/* IPv6 entry points published through the ipv6_bpf_stub pointer by
 * inet6_init().  Note that .inet6_bind is the lock-optional
 * __inet6_bind(), not the socket-layer inet6_bind().
 */
static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = {
        .inet6_bind = __inet6_bind,
        .udp6_lib_lookup = __udp6_lib_lookup,
        .ipv6_setsockopt = do_ipv6_setsockopt,
        .ipv6_getsockopt = do_ipv6_getsockopt,
        .ipv6_dev_get_saddr = ipv6_dev_get_saddr,
};

static int __init inet6_init(void)
{
        struct list_head *r;
        int err = 0;

        sock_skb_cb_check_size(sizeof(struct inet6_skb_parm));

        /* Register the socket-side information for inet6_create.  */
        for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r)
                INIT_LIST_HEAD(r);

        raw_hashinfo_init(&raw_v6_hashinfo);

        if (disable_ipv6_mod) {
                pr_info("Loaded, but administratively disabled, reboot required to enable\n");
                goto out;
        }

        err = proto_register(&tcpv6_prot, 1);
        if (err)
                goto out;

        err = proto_register(&udpv6_prot, 1);
        if (err)
                goto out_unregister_tcp_proto;

        err = proto_register(&udplitev6_prot, 1);
        if (err)
                goto out_unregister_udp_proto;

        err = proto_register(&rawv6_prot, 1);
        if (err)
                goto out_unregister_udplite_proto;

        err = proto_register(&pingv6_prot, 1);
        if (err)
                goto out_unregister_raw_proto;

        /* We MUST register RAW sockets before we create the ICMP6,
         * IGMP6, or NDISC control sockets.
         */
        err = rawv6_init();
        if (err)
                goto out_unregister_ping_proto;

        /* Register the family here so that the init calls below will
         * be able to create sockets. (?? is this dangerous ??)
         */
        err = sock_register(&inet6_family_ops);
        if (err)
                goto out_sock_register_fail;

        /*
         *        ipngwg API draft makes clear that the correct semantics
         *        for TCP and UDP is to consider one TCP and UDP instance
         *        in a host available by both INET and INET6 APIs and
         *        able to communicate via both network protocols.
         */

        err = register_pernet_subsys(&inet6_net_ops);
        if (err)
                goto register_pernet_fail;
        err = ip6_mr_init();
        if (err)
                goto ipmr_fail;
        err = icmpv6_init();
        if (err)
                goto icmp_fail;
        err = ndisc_init();
        if (err)
                goto ndisc_fail;
        err = igmp6_init();
        if (err)
                goto igmp_fail;

        err = ipv6_netfilter_init();
        if (err)
                goto netfilter_fail;
        /* Create /proc/foo6 entries. */
#ifdef CONFIG_PROC_FS
        err = -ENOMEM;
        if (raw6_proc_init())
                goto proc_raw6_fail;
        if (udplite6_proc_init())
                goto proc_udplite6_fail;
        if (ipv6_misc_proc_init())
                goto proc_misc6_fail;
        if (if6_proc_init())
                goto proc_if6_fail;
#endif
        err = ip6_route_init();
        if (err)
                goto ip6_route_fail;
        err = ndisc_late_init();
        if (err)
                goto ndisc_late_fail;
        err = ip6_flowlabel_init();
        if (err)
                goto ip6_flowlabel_fail;
        err = ipv6_anycast_init();
        if (err)
                goto ipv6_anycast_fail;
        err = addrconf_init();
        if (err)
                goto addrconf_fail;

        /* Init v6 extension headers. */
        err = ipv6_exthdrs_init();
        if (err)
                goto ipv6_exthdrs_fail;

        err = ipv6_frag_init();
        if (err)
                goto ipv6_frag_fail;

        /* Init v6 transport protocols. */
        err = udpv6_init();
        if (err)
                goto udpv6_fail;

        err = udplitev6_init();
        if (err)
                goto udplitev6_fail;

        err = udpv6_offload_init();
        if (err)
                goto udpv6_offload_fail;

        err = tcpv6_init();
        if (err)
                goto tcpv6_fail;

        err = ipv6_packet_init();
        if (err)
                goto ipv6_packet_fail;

        err = pingv6_init();
        if (err)
                goto pingv6_fail;

        err = calipso_init();
        if (err)
                goto calipso_fail;

        err = seg6_init();
        if (err)
                goto seg6_fail;

        err = rpl_init();
        if (err)
                goto rpl_fail;

        err = ioam6_init();
        if (err)
                goto ioam6_fail;

        err = igmp6_late_init();
        if (err)
                goto igmp6_late_err;

#ifdef CONFIG_SYSCTL
        err = ipv6_sysctl_register();
        if (err)
                goto sysctl_fail;
#endif

        /* ensure that ipv6 stubs are visible only after ipv6 is ready */
        wmb();
        ipv6_stub = &ipv6_stub_impl;
        ipv6_bpf_stub = &ipv6_bpf_stub_impl;
out:
        return err;

#ifdef CONFIG_SYSCTL
sysctl_fail:
        igmp6_late_cleanup();
#endif
igmp6_late_err:
        ioam6_exit();
ioam6_fail:
        rpl_exit();
rpl_fail:
        seg6_exit();
seg6_fail:
        calipso_exit();
calipso_fail:
        pingv6_exit();
pingv6_fail:
        ipv6_packet_cleanup();
ipv6_packet_fail:
        tcpv6_exit();
tcpv6_fail:
        udpv6_offload_exit();
udpv6_offload_fail:
        udplitev6_exit();
udplitev6_fail:
        udpv6_exit();
udpv6_fail:
        ipv6_frag_exit();
ipv6_frag_fail:
        ipv6_exthdrs_exit();
ipv6_exthdrs_fail:
        addrconf_cleanup();
addrconf_fail:
        ipv6_anycast_cleanup();
ipv6_anycast_fail:
        ip6_flowlabel_cleanup();
ip6_flowlabel_fail:
        ndisc_late_cleanup();
ndisc_late_fail:
        ip6_route_cleanup();
ip6_route_fail:
#ifdef CONFIG_PROC_FS
        if6_proc_exit();
proc_if6_fail:
        ipv6_misc_proc_exit();
proc_misc6_fail:
        udplite6_proc_exit();
proc_udplite6_fail:
        raw6_proc_exit();
proc_raw6_fail:
#endif
        ipv6_netfilter_fini();
netfilter_fail:
        igmp6_cleanup();
igmp_fail:
        ndisc_cleanup();
ndisc_fail:
        icmpv6_cleanup();
icmp_fail:
        ip6_mr_cleanup();
ipmr_fail:
        unregister_pernet_subsys(&inet6_net_ops);
register_pernet_fail:
        sock_unregister(PF_INET6);
        rtnl_unregister_all(PF_INET6);
out_sock_register_fail:
        rawv6_exit();
out_unregister_ping_proto:
        proto_unregister(&pingv6_prot);
out_unregister_raw_proto:
        proto_unregister(&rawv6_prot);
out_unregister_udplite_proto:
        proto_unregister(&udplitev6_prot);
out_unregister_udp_proto:
        proto_unregister(&udpv6_prot);
out_unregister_tcp_proto:
        proto_unregister(&tcpv6_prot);
        goto out;
}
module_init(inet6_init);

MODULE_ALIAS_NETPROTO(PF_INET6);































































































































































































































    2 
    3 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/types.h>
#include <linux/netfilter.h>
#include <net/tcp.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_seqadj.h>

/* Set the initial sequence number offset for one direction of @ct.
 *
 * @ct:     conntrack entry carrying the seqadj extension
 * @ctinfo: conntrack info; only its direction (CTINFO2DIR) is used
 * @off:    offset stored as both offset_before and offset_after
 *
 * A zero @off is a no-op.  If the seqadj extension is missing, warn once
 * and leave @ct untouched (same handling as nf_ct_seqadj_set()) instead of
 * dereferencing a NULL pointer.  IPS_SEQ_ADJUST_BIT is only set once the
 * extension is known to exist.
 *
 * Always returns 0.
 */
int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                      s32 off)
{
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        struct nf_conn_seqadj *seqadj;
        struct nf_ct_seqadj *this_way;

        if (off == 0)
                return 0;

        seqadj = nfct_seqadj(ct);
        if (unlikely(!seqadj)) {
                WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n");
                return 0;
        }

        set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);

        this_way = &seqadj->seq[dir];
        this_way->offset_before = off;
        this_way->offset_after = off;
        return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_seqadj_init);

/* Record an additional sequence offset for the direction given by @ctinfo.
 *
 * @seq is the (network byte order) sequence number at which the change
 * takes effect and @off the amount to add to the running offset.
 *
 * Nothing happens for a zero @off.  A missing seqadj extension triggers a
 * one-time warning and is otherwise ignored.  The per-direction state is
 * updated under ct->lock, and only when no correction is pending
 * (offset_before == offset_after) or the stored correction position lies
 * before @seq.
 *
 * Always returns 0.
 */
int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                     __be32 seq, s32 off)
{
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        struct nf_conn_seqadj *ext = nfct_seqadj(ct);
        struct nf_ct_seqadj *adj;
        u32 pos;

        if (!off)
                return 0;

        if (unlikely(ext == NULL)) {
                WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n");
                return 0;
        }

        set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);

        pos = ntohl(seq);
        adj = &ext->seq[dir];

        spin_lock_bh(&ct->lock);
        if (adj->offset_before == adj->offset_after ||
            before(adj->correction_pos, pos)) {
                /* Start a new correction at @seq on top of the old offset. */
                adj->correction_pos = pos;
                adj->offset_before = adj->offset_after;
                adj->offset_after += off;
        }
        spin_unlock_bh(&ct->lock);

        return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_seqadj_set);

/* Convenience wrapper around nf_ct_seqadj_set() for TCP packets.
 *
 * Does nothing unless @ct tracks a TCP flow.  Otherwise the TCP header is
 * located right behind the network header (its length taken from
 * ip_hdrlen()) and the header's sequence number is passed on together
 * with @off.
 *
 * NOTE(review): ip_hdrlen() presumably assumes an IPv4 network header -
 * confirm for other callers.
 */
void nf_ct_tcp_seqadj_set(struct sk_buff *skb,
                          struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                          s32 off)
{
        const struct tcphdr *th;

        if (nf_ct_protonum(ct) == IPPROTO_TCP) {
                th = (struct tcphdr *)(skb_network_header(skb) +
                                       ip_hdrlen(skb));
                nf_ct_seqadj_set(ct, ctinfo, th->seq, off);
        }
}
EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set);

/* Adjust one found SACK option including checksum correction */
static void nf_ct_sack_block_adjust(struct sk_buff *skb,
                                    struct tcphdr *tcph,
                                    unsigned int sackoff,
                                    unsigned int sackend,
                                    struct nf_ct_seqadj *seq)
{
        while (sackoff < sackend) {
                struct tcp_sack_block_wire *sack;
                __be32 new_start_seq, new_end_seq;

                sack = (void *)skb->data + sackoff;
                if (after(ntohl(sack->start_seq) - seq->offset_before,
                          seq->correction_pos))
                        new_start_seq = htonl(ntohl(sack->start_seq) -
                                        seq->offset_after);
                else
                        new_start_seq = htonl(ntohl(sack->start_seq) -
                                        seq->offset_before);

                if (after(ntohl(sack->end_seq) - seq->offset_before,
                          seq->correction_pos))
                        new_end_seq = htonl(ntohl(sack->end_seq) -
                                      seq->offset_after);
                else
                        new_end_seq = htonl(ntohl(sack->end_seq) -
                                      seq->offset_before);

                pr_debug("sack_adjust: start_seq: %u->%u, end_seq: %u->%u\n",
                         ntohl(sack->start_seq), ntohl(new_start_seq),
                         ntohl(sack->end_seq), ntohl(new_end_seq));

                inet_proto_csum_replace4(&tcph->check, skb,
                                         sack->start_seq, new_start_seq, false);
                inet_proto_csum_replace4(&tcph->check, skb,
                                         sack->end_seq, new_end_seq, false);
                sack->start_seq = new_start_seq;
                sack->end_seq = new_end_seq;
                sackoff += sizeof(*sack);
        }
}

/* TCP SACK sequence number adjustment.
 *
 * Scans the TCP options of the header at @protoff and hands every
 * well-formed SACK option to nf_ct_sack_block_adjust(), using the state of
 * the direction opposite to the packet's.
 *
 * Returns 1 on success (including an early end-of-option-list) and 0 if
 * the header cannot be made writable or an option is truncated/malformed.
 */
static unsigned int nf_ct_sack_adjust(struct sk_buff *skb,
                                      unsigned int protoff,
                                      struct nf_conn *ct,
                                      enum ip_conntrack_info ctinfo)
{
        struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
        struct tcphdr *tcph = (void *)skb->data + protoff;
        unsigned int optoff = protoff + sizeof(struct tcphdr);
        unsigned int optend = protoff + tcph->doff * 4;
        unsigned int dir;

        if (skb_ensure_writable(skb, optend) != 0)
                return 0;

        /* Re-derive the header pointer from skb->data after the call above. */
        tcph = (void *)skb->data + protoff;
        dir = CTINFO2DIR(ctinfo);

        while (optoff < optend) {
                /* Usually: option, length. */
                unsigned char *op = skb->data + optoff;

                if (op[0] == TCPOPT_EOL)
                        break;

                if (op[0] == TCPOPT_NOP) {
                        optoff++;
                        continue;
                }

                /* no partial options: the length byte must exist ... */
                if (optoff + 1 == optend)
                        return 0;
                /* ... and describe an option that fits and is >= 2 bytes */
                if (optoff + op[1] > optend || op[1] < 2)
                        return 0;

                if (op[0] == TCPOPT_SACK &&
                    op[1] >= 2 + TCPOLEN_SACK_PERBLOCK &&
                    (op[1] - 2) % TCPOLEN_SACK_PERBLOCK == 0)
                        nf_ct_sack_block_adjust(skb, tcph, optoff + 2,
                                                optoff + op[1],
                                                &seqadj->seq[!dir]);
                optoff += op[1];
        }

        return 1;
}

/* TCP sequence number adjustment.  Returns 1 on success, 0 on failure
 *
 * @skb:     packet whose TCP header is rewritten in place
 * @ct:      conntrack entry providing the seqadj state and the lock
 * @ctinfo:  conntrack info; its direction selects "this way"/"other way"
 * @protoff: byte offset of the TCP header from skb->data
 *
 * The sequence number is shifted by this direction's offset (added); if
 * the ACK flag is set, the acknowledgment number is shifted by the other
 * direction's offset (subtracted) and SACK options are adjusted as well.
 * The TCP checksum is updated for each rewritten field.  Returns 0 if the
 * header cannot be made writable, otherwise 1 or, for ACK segments, the
 * result of nf_ct_sack_adjust().
 *
 * NOTE(review): the seqadj extension pointer is not NULL-checked here
 * (nf_ct_seq_offset() does check it) - presumably callers only get here
 * when the extension exists; confirm.
 */
int nf_ct_seq_adjust(struct sk_buff *skb,
                     struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                     unsigned int protoff)
{
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        struct tcphdr *tcph;
        __be32 newseq, newack;
        s32 seqoff, ackoff;
        struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
        struct nf_ct_seqadj *this_way, *other_way;
        int res = 1;

        this_way  = &seqadj->seq[dir];
        other_way = &seqadj->seq[!dir];

        if (skb_ensure_writable(skb, protoff + sizeof(*tcph)))
                return 0;

        /* Header pointer is taken only after the skb was made writable. */
        tcph = (void *)skb->data + protoff;
        spin_lock_bh(&ct->lock);
        /* Segments past the correction position get the newer offset. */
        if (after(ntohl(tcph->seq), this_way->correction_pos))
                seqoff = this_way->offset_after;
        else
                seqoff = this_way->offset_before;

        newseq = htonl(ntohl(tcph->seq) + seqoff);
        /* Fix the checksum while tcph->seq still holds the old value. */
        inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false);
        pr_debug("Adjusting sequence number from %u->%u\n",
                 ntohl(tcph->seq), ntohl(newseq));
        tcph->seq = newseq;

        /* Without the ACK flag there is no ack number or SACK to fix up. */
        if (!tcph->ack)
                goto out;

        /* The ack number is first mapped back by offset_before before it
         * is compared with the other direction's correction position.
         */
        if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
                  other_way->correction_pos))
                ackoff = other_way->offset_after;
        else
                ackoff = other_way->offset_before;

        newack = htonl(ntohl(tcph->ack_seq) - ackoff);
        inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack,
                                 false);
        /* NOTE(review): tcph->seq was already overwritten with newseq above,
         * so the first two values printed here are identical.
         */
        pr_debug("Adjusting ack number from %u->%u, ack from %u->%u\n",
                 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
                 ntohl(newack));
        tcph->ack_seq = newack;

        /* Called with ct->lock still held. */
        res = nf_ct_sack_adjust(skb, protoff, ct, ctinfo);
out:
        spin_unlock_bh(&ct->lock);

        return res;
}
EXPORT_SYMBOL_GPL(nf_ct_seq_adjust);

/* Look up the sequence offset that applies to @seq in direction @dir.
 *
 * Returns 0 when @ct has no seqadj extension.  Otherwise returns
 * offset_after if @seq lies after the recorded correction position and
 * offset_before if it does not.
 */
s32 nf_ct_seq_offset(const struct nf_conn *ct,
                     enum ip_conntrack_dir dir,
                     u32 seq)
{
        struct nf_conn_seqadj *ext = nfct_seqadj(ct);
        struct nf_ct_seqadj *adj;

        if (ext == NULL)
                return 0;

        adj = &ext->seq[dir];
        if (after(seq, adj->correction_pos))
                return adj->offset_after;

        return adj->offset_before;
}
EXPORT_SYMBOL_GPL(nf_ct_seq_offset);




















    3 











    2 















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NF_CONNTRACK_TSTAMP_H
#define _NF_CONNTRACK_TSTAMP_H

#include <net/net_namespace.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_conntrack_tuple_common.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_extend.h>

/* Data of the conntrack timestamp extension; this is the object type
 * returned by the NF_CT_EXT_TSTAMP lookup/add helpers in this header.
 */
struct nf_conn_tstamp {
        /* presumably the time the connection was first seen; unit and clock
         * are not visible here - TODO confirm
         */
        u_int64_t start;
        /* presumably the time the connection ended - TODO confirm */
        u_int64_t stop;
};

/* Return the timestamp extension attached to @ct.
 *
 * With CONFIG_NF_CONNTRACK_TIMESTAMP the result of the NF_CT_EXT_TSTAMP
 * extension lookup is returned; without it the result is always NULL.
 */
static inline
struct nf_conn_tstamp *nf_conn_tstamp_find(const struct nf_conn *ct)
{
        struct nf_conn_tstamp *tstamp = NULL;

#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
        tstamp = nf_ct_ext_find(ct, NF_CT_EXT_TSTAMP);
#endif
        return tstamp;
}

/* Attach the timestamp extension to @ct.
 *
 * @ct:  conntrack entry to extend
 * @gfp: allocation flags passed through to nf_ct_ext_add()
 *
 * With CONFIG_NF_CONNTRACK_TIMESTAMP, returns NULL when the per-netns
 * sysctl_tstamp switch is off, otherwise the result of adding the
 * NF_CT_EXT_TSTAMP extension.  Without that option, always returns NULL.
 */
static inline
struct nf_conn_tstamp *nf_ct_tstamp_ext_add(struct nf_conn *ct, gfp_t gfp)
{
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
        struct net *net = nf_ct_net(ct);

        if (!net->ct.sysctl_tstamp)
                return NULL;

        return nf_ct_ext_add(ct, NF_CT_EXT_TSTAMP, gfp);
#else
        return NULL;
#endif
}

/* Per-netns init hook: an empty inline stub when timestamp support is
 * compiled out, so callers need no #ifdef of their own; otherwise only
 * declared here and defined elsewhere.
 */
#ifndef CONFIG_NF_CONNTRACK_TIMESTAMP
static inline void nf_conntrack_tstamp_pernet_init(struct net *net)
{
}
#else
void nf_conntrack_tstamp_pernet_init(struct net *net);
#endif /* !CONFIG_NF_CONNTRACK_TIMESTAMP */

#endif /* _NF_CONNTRACK_TSTAMP_H */






































    2 
































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ipi

#if !defined(_TRACE_IPI_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_IPI_H

#include <linux/tracepoint.h>

/**
 * ipi_raise - called when an SMP cross call is made
 *
 * @mask: mask of recipient CPUs for the IPI
 * @reason: string identifying the IPI purpose
 *
 * It is necessary for @reason to be a static string declared with
 * __tracepoint_string.
 *
 * The event records nr_cpumask_bits bits copied from @mask and the
 * @reason pointer itself (the string contents are not copied into the
 * event), and prints them as "target_mask=<bitmask> (<reason>)".
 */
TRACE_EVENT(ipi_raise,

        TP_PROTO(const struct cpumask *mask, const char *reason),

        TP_ARGS(mask, reason),

        TP_STRUCT__entry(
                __bitmask(target_cpus, nr_cpumask_bits)
                __field(const char *, reason)
        ),

        TP_fast_assign(
                __assign_bitmask(target_cpus, cpumask_bits(mask), nr_cpumask_bits);
                __entry->reason = reason;
        ),

        TP_printk("target_mask=%s (%s)", __get_bitmask(target_cpus), __entry->reason)
);

/**
 * ipi_send_cpu - trace event carrying one CPU number, a call site and a
 *                callback
 *
 * @cpu: CPU number, recorded as-is and printed as "cpu=%u"
 * @callsite: address passed as unsigned long, stored as a pointer and
 *            printed with %pS
 * @callback: pointer recorded as-is and printed with %pS
 *
 * NOTE(review): presumably emitted when an IPI is sent to a single CPU
 * (inferred from the event name; the call sites are not visible here).
 */
TRACE_EVENT(ipi_send_cpu,

        TP_PROTO(const unsigned int cpu, unsigned long callsite, void *callback),

        TP_ARGS(cpu, callsite, callback),

        TP_STRUCT__entry(
                __field(unsigned int, cpu)
                __field(void *, callsite)
                __field(void *, callback)
        ),

        TP_fast_assign(
                __entry->cpu = cpu;
                __entry->callsite = (void *)callsite;
                __entry->callback = callback;
        ),

        TP_printk("cpu=%u callsite=%pS callback=%pS",
                  __entry->cpu, __entry->callsite, __entry->callback)
);

/**
 * ipi_send_cpumask - trace event carrying a CPU mask, a call site and a
 *                    callback
 *
 * @cpumask: CPU mask whose bits are recorded in the event and printed as
 *           "cpumask=%s"
 * @callsite: address passed as unsigned long, stored as a pointer and
 *            printed with %pS
 * @callback: pointer recorded as-is and printed with %pS
 *
 * NOTE(review): presumably emitted when an IPI is sent to a set of CPUs
 * (inferred from the event name; the call sites are not visible here).
 */
TRACE_EVENT(ipi_send_cpumask,

        TP_PROTO(const struct cpumask *cpumask, unsigned long callsite, void *callback),

        TP_ARGS(cpumask, callsite, callback),

        TP_STRUCT__entry(
                __cpumask(cpumask)
                __field(void *, callsite)
                __field(void *, callback)
        ),

        TP_fast_assign(
                __assign_cpumask(cpumask, cpumask_bits(cpumask));
                __entry->callsite = (void *)callsite;
                __entry->callback = callback;
        ),

        TP_printk("cpumask=%s callsite=%pS callback=%pS",
                  __get_cpumask(cpumask), __entry->callsite, __entry->callback)
);

/*
 * ipi_handler - event class shared by the events defined from it below
 *
 * @reason: string identifying the IPI purpose
 *
 * Only the @reason pointer is recorded (the string is not copied into the
 * event); it is printed as "(<reason>)".
 */
DECLARE_EVENT_CLASS(ipi_handler,

        TP_PROTO(const char *reason),

        TP_ARGS(reason),

        TP_STRUCT__entry(
                __field(const char *, reason)
        ),

        TP_fast_assign(
                __entry->reason = reason;
        ),

        TP_printk("(%s)", __entry->reason)
);

/**
 * ipi_entry - called immediately before the IPI handler
 *
 * @reason: string identifying the IPI purpose
 *
 * It is necessary for @reason to be a static string declared with
 * __tracepoint_string, ideally the same as used with trace_ipi_raise
 * for that IPI.
 *
 * Defined from the ipi_handler event class, so its recorded fields and
 * output format are those of that class.
 */
DEFINE_EVENT(ipi_handler, ipi_entry,

        TP_PROTO(const char *reason),

        TP_ARGS(reason)
);

/**
 * ipi_exit - called immediately after the IPI handler returns
 *
 * @reason: string identifying the IPI purpose
 *
 * It is necessary for @reason to be a static string declared with
 * __tracepoint_string, ideally the same as used with trace_ipi_raise for
 * that IPI.
 *
 * Defined from the ipi_handler event class, so its recorded fields and
 * output format are those of that class.
 */
DEFINE_EVENT(ipi_handler, ipi_exit,

        TP_PROTO(const char *reason),

        TP_ARGS(reason)
);

#endif /* _TRACE_IPI_H */

/* This part must be outside protection */
#include <trace/define_trace.h>


















































































































































































































































































































































































































































































































































    5 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
/* CPU control.
 * (C) 2001, 2002, 2003, 2004 Rusty Russell
 *
 * This code is licenced under the GPL.
 */
#include <linux/sched/mm.h>
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/sched/signal.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/isolation.h>
#include <linux/sched/task.h>
#include <linux/sched/smt.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/bug.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/suspend.h>
#include <linux/lockdep.h>
#include <linux/tick.h>
#include <linux/irq.h>
#include <linux/nmi.h>
#include <linux/smpboot.h>
#include <linux/relay.h>
#include <linux/slab.h>
#include <linux/scs.h>
#include <linux/percpu-rwsem.h>
#include <linux/cpuset.h>
#include <linux/random.h>
#include <linux/cc_platform.h>

#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
#include <trace/events/cpuhp.h>

#include "smpboot.h"

/**
 * struct cpuhp_cpu_state - Per cpu hotplug state storage
 * @state:        The current cpu state
 * @target:        The target state
 * @fail:        State whose next callback invocation is forced to fail with
 *                -EAGAIN; reset to CPUHP_INVALID once it has triggered
 * @thread:        Pointer to the hotplug thread
 * @should_run:        Thread should execute
 * @rollback:        Perform a rollback
 * @single:        Single callback invocation
 * @bringup:        Direction selector: true for bringup, false for teardown
 * @node:        Remote CPU node; for multi-instance, do a
 *                single entry callback for install/remove
 * @last:        For multi-instance rollback, remember how far we got
 * @cb_state:        The state for a single callback (install/uninstall)
 * @result:        Result of the operation
 * @ap_sync_state:        State for AP synchronization
 *                        (holds an enum cpuhp_sync_state value)
 * @done_up:        Signal completion to the issuer of the task for cpu-up
 * @done_down:        Signal completion to the issuer of the task for cpu-down
 *
 * Everything from @thread onwards only exists on CONFIG_SMP builds.
 */
struct cpuhp_cpu_state {
        enum cpuhp_state        state;
        enum cpuhp_state        target;
        enum cpuhp_state        fail;
#ifdef CONFIG_SMP
        struct task_struct        *thread;
        bool                        should_run;
        bool                        rollback;
        bool                        single;
        bool                        bringup;
        struct hlist_node        *node;
        struct hlist_node        *last;
        enum cpuhp_state        cb_state;
        int                        result;
        atomic_t                ap_sync_state;
        struct completion        done_up;
        struct completion        done_down;
#endif
};

/*
 * Per-CPU hotplug state. @fail starts out as CPUHP_INVALID, i.e. no state is
 * armed to fail in cpuhp_invoke_callback().
 */
static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
        .fail = CPUHP_INVALID,
};

#ifdef CONFIG_SMP
/* CPUs which have been booted at least once; consulted by cpu_bootable(). */
cpumask_t cpus_booted_once_mask;
#endif

#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
/*
 * Lockdep maps for the two hotplug directions. cpuhp_lock_acquire() and
 * cpuhp_lock_release() pick one of them based on their @bringup argument.
 */
static struct lockdep_map cpuhp_state_up_map =
        STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
static struct lockdep_map cpuhp_state_down_map =
        STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);


/* Acquire the lockdep map matching the hotplug direction. */
static inline void cpuhp_lock_acquire(bool bringup)
{
        if (bringup)
                lock_map_acquire(&cpuhp_state_up_map);
        else
                lock_map_acquire(&cpuhp_state_down_map);
}

/* Release the lockdep map matching the hotplug direction. */
static inline void cpuhp_lock_release(bool bringup)
{
        if (bringup)
                lock_map_release(&cpuhp_state_up_map);
        else
                lock_map_release(&cpuhp_state_down_map);
}
#else

/* No-op variants used unless both CONFIG_LOCKDEP and CONFIG_SMP are set. */
static inline void cpuhp_lock_acquire(bool bringup) { }
static inline void cpuhp_lock_release(bool bringup) { }

#endif

/**
 * struct cpuhp_step - Hotplug state machine step
 * @name:        Name of the step
 * @startup:        Startup function of the step; the @single member is used
 *                for regular states, the @multi member (which additionally
 *                takes the instance node) for multi-instance states
 * @teardown:        Teardown function of the step; same @single/@multi split
 *                as @startup
 * @cant_stop:        Bringup/teardown can't be stopped at this step
 * @multi_instance:        State has multiple instances which get added afterwards
 *
 * A step counts as empty for a direction when the corresponding callback
 * pointer is NULL, see cpuhp_step_empty().
 */
struct cpuhp_step {
        const char                *name;
        union {
                int                (*single)(unsigned int cpu);
                int                (*multi)(unsigned int cpu,
                                         struct hlist_node *node);
        } startup;
        union {
                int                (*single)(unsigned int cpu);
                int                (*multi)(unsigned int cpu,
                                         struct hlist_node *node);
        } teardown;
        /* private: */
        /* Instances of a multi-instance state, walked by cpuhp_invoke_callback() */
        struct hlist_head        list;
        /* public: */
        bool                        cant_stop;
        bool                        multi_instance;
};

static DEFINE_MUTEX(cpuhp_state_mutex);
/* Hotplug step table, indexed by enum cpuhp_state via cpuhp_get_step(). */
static struct cpuhp_step cpuhp_hp_states[];

/* Look up the step descriptor for @state in the hotplug step table. */
static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
{
        return &cpuhp_hp_states[state];
}

/*
 * A step is empty for the given direction when it has no callback installed
 * for that direction.
 */
static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
{
        if (bringup)
                return !step->startup.single;

        return !step->teardown.single;
}

/**
 * cpuhp_invoke_callback - Invoke the callbacks for a given state
 * @cpu:        The cpu for which the callback should be invoked
 * @state:        The state to do callbacks for
 * @bringup:        True if the bringup callback should be invoked
 * @node:        For multi-instance, do a single entry callback for install/remove
 * @lastp:        For multi-instance rollback, remember how far we got
 *
 * Called from cpu hotplug and from the state register machinery.
 *
 * For a multi-instance state transition (@node == NULL) the callback is
 * invoked for every instance on the step's list. If an instance fails and
 * @lastp is provided, the failing node is stored in *@lastp and the error is
 * returned so that the caller can resume/undo from there. If @lastp is NULL,
 * the instances which already succeeded are rolled back here with the
 * opposite callback and the error of the failing instance is returned.
 *
 * Return: %0 on success or a negative errno code
 */
static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
                                 bool bringup, struct hlist_node *node,
                                 struct hlist_node **lastp)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        struct cpuhp_step *step = cpuhp_get_step(state);
        int (*cbm)(unsigned int cpu, struct hlist_node *node);
        int (*cb)(unsigned int cpu);
        int ret, cnt;

        /* One-shot forced failure: disarm it and fail this invocation */
        if (st->fail == state) {
                st->fail = CPUHP_INVALID;
                return -EAGAIN;
        }

        /* Callers are expected to skip empty steps */
        if (cpuhp_step_empty(bringup, step)) {
                WARN_ON_ONCE(1);
                return 0;
        }

        if (!step->multi_instance) {
                WARN_ON_ONCE(lastp && *lastp);
                cb = bringup ? step->startup.single : step->teardown.single;

                trace_cpuhp_enter(cpu, st->target, state, cb);
                ret = cb(cpu);
                trace_cpuhp_exit(cpu, st->state, state, ret);
                return ret;
        }
        cbm = bringup ? step->startup.multi : step->teardown.multi;

        /* Single invocation for instance add/remove */
        if (node) {
                WARN_ON_ONCE(lastp && *lastp);
                trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
                ret = cbm(cpu, node);
                trace_cpuhp_exit(cpu, st->state, state, ret);
                return ret;
        }

        /* State transition. Invoke on all instances */
        cnt = 0;
        hlist_for_each(node, &step->list) {
                /* Stop where a previous partial run recorded its position */
                if (lastp && node == *lastp)
                        break;

                trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
                ret = cbm(cpu, node);
                trace_cpuhp_exit(cpu, st->state, state, ret);
                if (ret) {
                        if (!lastp)
                                goto err;

                        *lastp = node;
                        return ret;
                }
                cnt++;
        }
        if (lastp)
                *lastp = NULL;
        return 0;
err:
        /* Rollback the instances if one failed */
        cbm = !bringup ? step->startup.multi : step->teardown.multi;
        if (!cbm)
                return ret;

        /* Undo exactly the @cnt instances which succeeded before the failure */
        hlist_for_each(node, &step->list) {
                int rollback_ret;

                if (!cnt--)
                        break;

                trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
                rollback_ret = cbm(cpu, node);
                trace_cpuhp_exit(cpu, st->state, state, rollback_ret);
                /*
                 * Rollback must not fail,
                 */
                WARN_ON_ONCE(rollback_ret);
        }
        /*
         * Report the error which triggered the rollback. The rollback results
         * must not overwrite it, otherwise a successful rollback would make
         * the failed transition look like a success to the caller.
         */
        return ret;
}

#ifdef CONFIG_SMP
/* True for states above CPUHP_BRINGUP_CPU, except CPUHP_TEARDOWN_CPU. */
static bool cpuhp_is_ap_state(enum cpuhp_state state)
{
        /*
         * CPUHP_TEARDOWN_CPU is excluded for documentation purposes only;
         * cpu_down handles that state explicitly.
         */
        return state != CPUHP_TEARDOWN_CPU && state > CPUHP_BRINGUP_CPU;
}

/* Block until the completion for the given direction has been signalled. */
static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
{
        if (bringup)
                wait_for_completion(&st->done_up);
        else
                wait_for_completion(&st->done_down);
}

/* Signal the completion for the given direction, see wait_for_ap_thread(). */
static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
{
        if (bringup)
                complete(&st->done_up);
        else
                complete(&st->done_down);
}

/*
 * States in the range [CPUHP_AP_IDLE_DEAD, CPUHP_AP_ONLINE) are the former
 * STARTING/DYING states. They run with IRQs disabled and must not fail.
 */
static bool cpuhp_is_atomic_state(enum cpuhp_state state)
{
        return state >= CPUHP_AP_IDLE_DEAD && state < CPUHP_AP_ONLINE;
}

/*
 * Synchronization state management
 *
 * Values stored in cpuhp_cpu_state::ap_sync_state for the handshake between
 * the control CPU and the AP.
 */
enum cpuhp_sync_state {
        /* Reported by the AP via cpuhp_ap_report_dead() */
        SYNC_STATE_DEAD,
        /* Set by cpuhp_can_boot_ap() when preparing the AP for booting */
        SYNC_STATE_KICKED,
        /* Set by cpuhp_bp_sync_dead() while it waits for SYNC_STATE_DEAD */
        SYNC_STATE_SHOULD_DIE,
        /* Set by the AP in cpuhp_ap_sync_alive() */
        SYNC_STATE_ALIVE,
        /* Set by cpuhp_bp_sync_alive() to release the waiting AP */
        SYNC_STATE_SHOULD_ONLINE,
        /* NOTE(review): the writer of this state is not visible here - confirm */
        SYNC_STATE_ONLINE,
};

#ifdef CONFIG_HOTPLUG_CORE_SYNC
/**
 * cpuhp_ap_update_sync_state - Update synchronization state during bringup/teardown
 * @state:        The synchronization state to set
 *
 * Stores @state into the ap_sync_state of the CPU this runs on. This is not a
 * synchronization point, but the update implies a full barrier so that the AP
 * changes are visible before the control CPU proceeds.
 */
static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state)
{
        /* The previous value is of no interest, hence the (void) cast */
        (void)atomic_xchg(this_cpu_ptr(&cpuhp_state.ap_sync_state), state);
}

/*
 * Poll step used by cpuhp_wait_for_sync_state() during its initial busy-wait
 * phase. Weak default which just relaxes the CPU; can be overridden.
 */
void __weak arch_cpuhp_sync_state_poll(void) { cpu_relax(); }

/*
 * Wait until the ap_sync_state of @cpu equals @state and then atomically
 * replace it with @next_state.
 *
 * The first millisecond is spent polling via arch_cpuhp_sync_state_poll(),
 * afterwards each iteration sleeps for 1-2 milliseconds. The wait gives up
 * after 10 seconds.
 *
 * Return: true if the @state -> @next_state transition was performed, false
 * on timeout, in which case the state is left unchanged.
 */
static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state state,
                                      enum cpuhp_sync_state next_state)
{
        atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
        ktime_t now, end, start = ktime_get();
        int sync;

        /* Overall deadline: 10 seconds from now */
        end = start + 10ULL * NSEC_PER_SEC;

        sync = atomic_read(st);
        while (1) {
                if (sync == state) {
                        /*
                         * On failure atomic_try_cmpxchg() stores the value it
                         * found in @sync, so re-evaluate without re-reading.
                         */
                        if (!atomic_try_cmpxchg(st, &sync, next_state))
                                continue;
                        return true;
                }

                now = ktime_get();
                if (now > end) {
                        /* Timeout. Leave the state unchanged */
                        return false;
                } else if (now - start < NSEC_PER_MSEC) {
                        /* Poll for one millisecond */
                        arch_cpuhp_sync_state_poll();
                } else {
                        usleep_range_state(USEC_PER_MSEC, 2 * USEC_PER_MSEC, TASK_UNINTERRUPTIBLE);
                }
                sync = atomic_read(st);
        }
        /* Not reached: the loop above only exits via return */
        return true;
}
#else  /* CONFIG_HOTPLUG_CORE_SYNC */
/* Without CONFIG_HOTPLUG_CORE_SYNC there is no sync state to update. */
static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { }
#endif /* !CONFIG_HOTPLUG_CORE_SYNC */

#ifdef CONFIG_HOTPLUG_CORE_SYNC_DEAD
/**
 * cpuhp_ap_report_dead - Update synchronization state to DEAD
 *
 * Sets the ap_sync_state of the CPU this runs on to SYNC_STATE_DEAD, which is
 * the value cpuhp_bp_sync_dead() waits for.
 *
 * No synchronization point. Just update of the synchronization state.
 */
void cpuhp_ap_report_dead(void)
{
        cpuhp_ap_update_sync_state(SYNC_STATE_DEAD);
}

/*
 * Invoked by cpuhp_bp_sync_dead() once @cpu has reported SYNC_STATE_DEAD.
 * Weak default which does nothing; can be overridden.
 */
void __weak arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { }

/*
 * Late CPU shutdown synchronization point. Cannot use cpuhp_state::done_down
 * because the AP cannot issue complete() at this stage.
 *
 * Marks @cpu as SYNC_STATE_SHOULD_DIE (unless it already reported dead) and
 * waits for it to reach SYNC_STATE_DEAD. On success the architecture cleanup
 * hook is invoked, on timeout only an error message is emitted.
 */
static void cpuhp_bp_sync_dead(unsigned int cpu)
{
        atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
        int sync = atomic_read(st);

        /* Retry the cmpxchg until it succeeds or the CPU is seen as dead */
        do {
                /* CPU can have reported dead already. Don't overwrite that! */
                if (sync == SYNC_STATE_DEAD)
                        break;
        } while (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_SHOULD_DIE));

        /* DEAD -> DEAD: wait for the state without modifying it */
        if (cpuhp_wait_for_sync_state(cpu, SYNC_STATE_DEAD, SYNC_STATE_DEAD)) {
                /* CPU reached dead state. Invoke the cleanup function */
                arch_cpuhp_cleanup_dead_cpu(cpu);
                return;
        }

        /* No further action possible. Emit message and give up. */
        pr_err("CPU%u failed to report dead state\n", cpu);
}
#else /* CONFIG_HOTPLUG_CORE_SYNC_DEAD */
/* Without CONFIG_HOTPLUG_CORE_SYNC_DEAD there is no dead-state handshake. */
static inline void cpuhp_bp_sync_dead(unsigned int cpu) { }
#endif /* !CONFIG_HOTPLUG_CORE_SYNC_DEAD */

#ifdef CONFIG_HOTPLUG_CORE_SYNC_FULL
/**
 * cpuhp_ap_sync_alive - Synchronize AP with the control CPU once it is alive
 *
 * Announces SYNC_STATE_ALIVE for the CPU this runs on and then spins until
 * the control CPU has moved the state on to SYNC_STATE_SHOULD_ONLINE.
 */
void cpuhp_ap_sync_alive(void)
{
        atomic_t *sync = this_cpu_ptr(&cpuhp_state.ap_sync_state);

        cpuhp_ap_update_sync_state(SYNC_STATE_ALIVE);

        /* Spin until the control CPU releases this CPU. */
        for (;;) {
                if (atomic_read(sync) == SYNC_STATE_SHOULD_ONLINE)
                        break;
                cpu_relax();
        }
}

/*
 * Check whether @cpu is in a sync state from which it can be booted and, if
 * so, move it to SYNC_STATE_KICKED.
 *
 * Return: true if the state was switched to SYNC_STATE_KICKED, false if the
 * CPU is in a state which does not allow booting it.
 */
static bool cpuhp_can_boot_ap(unsigned int cpu)
{
        atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
        int sync = atomic_read(st);

        /*
         * A failed cmpxchg refreshes @sync with the current value, which is
         * then validated again before the next attempt.
         */
        do {
                switch (sync) {
                case SYNC_STATE_DEAD:        /* CPU is properly dead */
                case SYNC_STATE_KICKED:        /* CPU did not come up in previous attempt */
                case SYNC_STATE_ALIVE:        /* CPU is stuck cpuhp_ap_sync_alive(). */
                        break;
                default:
                        /* CPU failed to report online or dead and is in limbo state. */
                        return false;
                }
                /* Prepare for booting */
        } while (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_KICKED));

        return true;
}

/*
 * Invoked by cpuhp_bp_sync_alive() after the wait for the alive state, on
 * success as well as on timeout. Weak default which does nothing; can be
 * overridden.
 */
void __weak arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) { }

/*
 * Early CPU bringup synchronization point. Cannot use cpuhp_state::done_up
 * because the AP cannot issue complete() so early in the bringup.
 *
 * Waits for @cpu to report SYNC_STATE_ALIVE and releases it by switching the
 * state to SYNC_STATE_SHOULD_ONLINE.
 *
 * Return: 0 on success, -EIO if the CPU did not report alive in time.
 */
static int cpuhp_bp_sync_alive(unsigned int cpu)
{
        int ret;

        if (!IS_ENABLED(CONFIG_HOTPLUG_CORE_SYNC_FULL))
                return 0;

        ret = cpuhp_wait_for_sync_state(cpu, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE) ? 0 : -EIO;
        if (ret)
                pr_err("CPU%u failed to report alive state\n", cpu);

        /* The architecture cleans up the kick alive mechanics in both cases. */
        arch_cpuhp_cleanup_kick_cpu(cpu);
        return ret;
}
#else /* CONFIG_HOTPLUG_CORE_SYNC_FULL */
/*
 * Without CONFIG_HOTPLUG_CORE_SYNC_FULL there is no alive handshake: the
 * sync always succeeds and every CPU may be booted.
 */
static inline int cpuhp_bp_sync_alive(unsigned int cpu) { return 0; }
static inline bool cpuhp_can_boot_ap(unsigned int cpu) { return true; }
#endif /* !CONFIG_HOTPLUG_CORE_SYNC_FULL */

/*
 * Serializes the updates to cpu_online_mask, cpu_present_mask. Also protects
 * cpu_hotplug_disabled. Taken and released via cpu_maps_update_begin() and
 * cpu_maps_update_done().
 */
static DEFINE_MUTEX(cpu_add_remove_lock);
bool cpuhp_tasks_frozen;
EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen);

/*
 * The following two APIs (cpu_maps_update_begin/done) must be used when
 * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
 *
 * cpu_maps_update_begin() takes cpu_add_remove_lock.
 */
void cpu_maps_update_begin(void)
{
        mutex_lock(&cpu_add_remove_lock);
}

/* Counterpart of cpu_maps_update_begin(): drops cpu_add_remove_lock. */
void cpu_maps_update_done(void)
{
        mutex_unlock(&cpu_add_remove_lock);
}

/*
 * If non-zero, cpu_up and cpu_down will return -EBUSY and do nothing.
 * This is a nesting counter: cpu_hotplug_disable() increments it and
 * cpu_hotplug_enable() decrements it.
 * Should always be manipulated under cpu_add_remove_lock
 */
static int cpu_hotplug_disabled;

#ifdef CONFIG_HOTPLUG_CPU

/* The percpu rwsem behind cpus_read_lock()/cpus_write_lock() and friends. */
DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);

/* Take cpu_hotplug_lock for reading. Pair with cpus_read_unlock(). */
void cpus_read_lock(void)
{
        percpu_down_read(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(cpus_read_lock);

/*
 * Try to take cpu_hotplug_lock for reading.
 *
 * Return: the result of percpu_down_read_trylock(); presumably non-zero when
 * the lock was acquired - TODO confirm against the percpu-rwsem API.
 */
int cpus_read_trylock(void)
{
        return percpu_down_read_trylock(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(cpus_read_trylock);

/* Drop the read side of cpu_hotplug_lock taken by cpus_read_lock(). */
void cpus_read_unlock(void)
{
        percpu_up_read(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(cpus_read_unlock);

/* Take cpu_hotplug_lock for writing. Pair with cpus_write_unlock(). */
void cpus_write_lock(void)
{
        percpu_down_write(&cpu_hotplug_lock);
}

/* Drop the write side of cpu_hotplug_lock taken by cpus_write_lock(). */
void cpus_write_unlock(void)
{
        percpu_up_write(&cpu_hotplug_lock);
}

/* Assert that cpu_hotplug_lock is held, once the system is running. */
void lockdep_assert_cpus_held(void)
{
        /*
         * Hotplug operations cannot happen before userspace starts running,
         * and some init codepaths knowingly do not take the hotplug lock.
         * That is all valid, so lockdep stays muted until it makes sense to
         * report unheld locks.
         */
        if (system_state >= SYSTEM_RUNNING)
                percpu_rwsem_assert_held(&cpu_hotplug_lock);
}

#ifdef CONFIG_LOCKDEP
/* Report whether cpu_hotplug_lock is held, as seen by percpu_rwsem_is_held(). */
int lockdep_is_cpus_held(void)
{
        return percpu_rwsem_is_held(&cpu_hotplug_lock);
}
#endif

/*
 * Annotate the lockdep map of cpu_hotplug_lock as acquired. Only the
 * dep_map is passed on; the rwsem itself is not taken here.
 */
static void lockdep_acquire_cpus_lock(void)
{
        rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
}

/* Counterpart of lockdep_acquire_cpus_lock(): drop the lockdep annotation. */
static void lockdep_release_cpus_lock(void)
{
        rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
}

/*
 * Wait for currently running CPU hotplug operations to complete (if any) and
 * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
 * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
 * hotplug path before performing hotplug operations. So acquiring that lock
 * guarantees mutual exclusion from any currently running hotplug operations.
 *
 * Calls nest: 'cpu_hotplug_disabled' is a counter and every call needs a
 * matching cpu_hotplug_enable().
 */
void cpu_hotplug_disable(void)
{
        cpu_maps_update_begin();
        cpu_hotplug_disabled++;
        cpu_maps_update_done();
}
EXPORT_SYMBOL_GPL(cpu_hotplug_disable);

/*
 * Drop one disable reference. Warns once and leaves the counter alone when
 * there is no outstanding cpu_hotplug_disable() to balance.
 */
static void __cpu_hotplug_enable(void)
{
        if (!WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n"))
                cpu_hotplug_disabled--;
}

/*
 * Undo one cpu_hotplug_disable() call. The counter update is done under
 * cpu_add_remove_lock.
 */
void cpu_hotplug_enable(void)
{
        cpu_maps_update_begin();
        __cpu_hotplug_enable();
        cpu_maps_update_done();
}
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);

#else

/* Without CONFIG_HOTPLUG_CPU there is no cpu_hotplug_lock to annotate. */
static void lockdep_acquire_cpus_lock(void)
{
}

/* See lockdep_acquire_cpus_lock() above: nothing to release either. */
static void lockdep_release_cpus_lock(void)
{
}

#endif        /* CONFIG_HOTPLUG_CPU */

/*
 * Architectures that need SMT-specific errata handling during SMT hotplug
 * should override this. The weak default does nothing.
 */
void __weak arch_smt_update(void) { }

#ifdef CONFIG_HOTPLUG_SMT

/* Current SMT control state; SMT is treated as enabled until told otherwise */
enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
/* Maximum threads, as passed to cpu_smt_set_num_threads() */
static unsigned int cpu_smt_max_threads __ro_after_init;
/*
 * Number of SMT threads to bring up. Starts at UINT_MAX (no limit) and is
 * lowered by cpu_smt_disable() and cpu_smt_set_num_threads().
 */
unsigned int cpu_smt_num_threads __read_mostly = UINT_MAX;

/*
 * Disable SMT at init time.
 * @force: true selects CPU_SMT_FORCE_DISABLED, false selects CPU_SMT_DISABLED
 *
 * Does nothing when cpu_smt_possible() says SMT cannot be controlled. In both
 * disable modes the number of threads to bring up is limited to one.
 */
void __init cpu_smt_disable(bool force)
{
        if (!cpu_smt_possible())
                return;

        cpu_smt_num_threads = 1;

        if (!force) {
                pr_info("SMT: disabled\n");
                cpu_smt_control = CPU_SMT_DISABLED;
        } else {
                pr_info("SMT: Force disabled\n");
                cpu_smt_control = CPU_SMT_FORCE_DISABLED;
        }
}

/*
 * Whether SMT is supported can only be decided after the full CPU
 * identification, so this is called from architecture code.
 *
 * @num_threads: number of threads the architecture wants to bring up
 * @max_threads: maximum number of threads; 1 marks SMT as not supported
 */
void __init cpu_smt_set_num_threads(unsigned int num_threads,
                                    unsigned int max_threads)
{
        WARN_ON(num_threads == 0 || num_threads > max_threads);

        cpu_smt_max_threads = max_threads;

        if (max_threads == 1)
                cpu_smt_control = CPU_SMT_NOT_SUPPORTED;

        /*
         * With SMT disabled on the kernel command line, or not supported at
         * all, cpu_smt_num_threads is set to 1 for consistency.
         */
        if (cpu_smt_control != CPU_SMT_ENABLED) {
                cpu_smt_num_threads = 1;
                return;
        }

        /* SMT is enabled: honour the architecture requested thread count. */
        if (cpu_smt_num_threads > num_threads)
                cpu_smt_num_threads = num_threads;
}

/*
 * Handler for the "nosmt" early parameter. An argument of "force" requests
 * the forced disable, anything else (or no argument) the regular one.
 */
static int __init smt_cmdline_disable(char *str)
{
        int force = str && strcmp(str, "force") == 0;

        cpu_smt_disable(force);
        return 0;
}
early_param("nosmt", smt_cmdline_disable);

/*
 * For architectures supporting partial SMT states, check whether the thread
 * is allowed. Otherwise this has already been checked through
 * cpu_smt_max_threads when setting the SMT level.
 *
 * Without CONFIG_SMT_NUM_THREADS_DYNAMIC every thread is allowed.
 */
static inline bool cpu_smt_thread_allowed(unsigned int cpu)
{
#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC
        return topology_smt_thread_allowed(cpu);
#else
        return true;
#endif
}

/*
 * Decide whether @cpu may be brought up under the current SMT control state.
 */
static inline bool cpu_bootable(unsigned int cpu)
{
        switch (cpu_smt_control) {
        case CPU_SMT_ENABLED:
                if (cpu_smt_thread_allowed(cpu))
                        return true;
                break;
        case CPU_SMT_NOT_IMPLEMENTED:
                /* All CPUs are bootable if controls are not configured */
                return true;
        case CPU_SMT_NOT_SUPPORTED:
                /* All CPUs are bootable if CPU is not SMT capable */
                return true;
        default:
                break;
        }

        if (topology_is_primary_thread(cpu))
                return true;

        /*
         * x86 requires every logical CPU to be booted at least once so the
         * init code gets a chance to set CR4.MCE on each of them. Otherwise
         * a broadcasted MCE observing CR4.MCE=0b on any core will shutdown
         * the machine. So a CPU which never booted is still bootable.
         */
        return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
}

/*
 * Returns true unless SMT is either forcefully (irreversibly) disabled or
 * not supported at all.
 */
bool cpu_smt_possible(void)
{
        return !(cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
                 cpu_smt_control == CPU_SMT_NOT_SUPPORTED);
}
EXPORT_SYMBOL_GPL(cpu_smt_possible);

#else
/* Without CONFIG_HOTPLUG_SMT there are no SMT restrictions on booting. */
static inline bool cpu_bootable(unsigned int cpu) { return true; }
#endif

static inline enum cpuhp_state
cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
        enum cpuhp_state prev_state = st->state;
        bool bringup = st->state < target;

        st->rollback = false;
        st->last = NULL;

        st->target = target;
        st->single = false;
        st->bringup = bringup;
        if (cpu_dying(cpu) != !bringup)
                set_cpu_dying(cpu, !bringup);

        return prev_state;
}

/*
 * Turn a failed transition around: the target becomes @prev_state and,
 * unless a rollback is already in progress, the direction is inverted.
 */
static inline void
cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st,
                  enum cpuhp_state prev_state)
{
        bool was_bringup = st->bringup;

        st->target = prev_state;

        /*
         * Already rolling back. No need to invert the bringup value or to
         * change the current state.
         */
        if (st->rollback)
                return;

        st->rollback = true;

        /*
         * With st->last set, the partially handled multi_instance state has
         * to be undone first. Otherwise the undo starts at the previous state.
         */
        if (!st->last) {
                if (was_bringup)
                        st->state--;
                else
                        st->state++;
        }

        /* Reverse direction; a former bringup now means the CPU is dying */
        st->bringup = !was_bringup;
        if (cpu_dying(cpu) != was_bringup)
                set_cpu_dying(cpu, was_bringup);
}

/*
 * Regular hotplug invocation of the AP hotplug thread
 *
 * Wakes the hotplug thread of the CPU described by @st and waits for the
 * completion matching st->bringup. Returns right away if this is not a single
 * callback invocation and the CPU is already at its target state. The outcome
 * is left in st->result for the caller.
 */
static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
{
        if (!st->single && st->state == st->target)
                return;

        /* Start from a clean result; the caller reads it after the wait */
        st->result = 0;
        /*
         * Make sure the above stores are visible before should_run becomes
         * true. Paired with the mb() above in cpuhp_thread_fun()
         */
        smp_mb();
        st->should_run = true;
        wake_up_process(st->thread);
        wait_for_ap_thread(st, st->bringup);
}

static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st,
                         enum cpuhp_state target)
{
        enum cpuhp_state prev_state;
        int ret;

        prev_state = cpuhp_set_state(cpu, st, target);
        __cpuhp_kick_ap(st);
        if ((ret = st->result)) {
                cpuhp_reset_state(cpu, st, prev_state);
                __cpuhp_kick_ap(st);
        }

        return ret;
}

/*
 * Wait for @cpu to finish its early bringup and unpark its hotplug thread.
 *
 * Returns 0 on success, -ECANCELED if the CPU is not online after the wait
 * or if it turns out not to be bootable.
 */
static int bringup_wait_for_ap_online(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);

        /* The AP signals done_up when it reached CPUHP_AP_ONLINE_IDLE */
        wait_for_ap_thread(st, true);
        if (WARN_ON_ONCE((!cpu_online(cpu))))
                return -ECANCELED;

        /* Unpark the hotplug thread of the target cpu */
        kthread_unpark(st->thread);

        /*
         * SMT soft disabling on X86 requires to bring the CPU out of the
         * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit.  The
         * CPU marked itself as booted_once in notify_cpu_starting() so the
         * cpu_bootable() check will now return false if this is not the
         * primary sibling.
         */
        return cpu_bootable(cpu) ? 0 : -ECANCELED;
}

#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
/*
 * Kick @cpu alive through the architecture code, provided its sync state
 * allows booting. Returns -EAGAIN if it does not.
 */
static int cpuhp_kick_ap_alive(unsigned int cpu)
{
        if (cpuhp_can_boot_ap(cpu))
                return arch_cpuhp_kick_ap_alive(cpu, idle_thread_get(cpu));

        return -EAGAIN;
}

/*
 * Second half of the split startup: synchronize with the kicked AP, wait for
 * it to come online and, if the target lies beyond CPUHP_AP_ONLINE_IDLE, let
 * its hotplug thread continue to the target.
 */
static int cpuhp_bringup_ap(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        int ret;

        /*
         * Some architectures have to walk the irq descriptors to
         * setup the vector space for the cpu which comes online.
         * Prevent irq alloc/free across the bringup.
         */
        irq_lock_sparse();

        ret = cpuhp_bp_sync_alive(cpu);
        if (!ret)
                ret = bringup_wait_for_ap_online(cpu);

        /* The sparse irq lock is dropped on success and failure alike */
        irq_unlock_sparse();

        if (ret)
                return ret;

        if (st->target <= CPUHP_AP_ONLINE_IDLE)
                return 0;

        return cpuhp_kick_ap(cpu, st, st->target);
}
#else
/*
 * Boot @cpu via __cpu_up(), synchronize with it, wait for it to come online
 * and, if the target lies beyond CPUHP_AP_ONLINE_IDLE, let its hotplug thread
 * continue to the target. Returns -EAGAIN if the sync state of the CPU does
 * not allow booting it.
 */
static int bringup_cpu(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        struct task_struct *idle = idle_thread_get(cpu);
        int ret;

        if (!cpuhp_can_boot_ap(cpu))
                return -EAGAIN;

        /*
         * Some architectures have to walk the irq descriptors to
         * setup the vector space for the cpu which comes online.
         *
         * Prevent irq alloc/free across the bringup by acquiring the
         * sparse irq lock. Hold it until the upcoming CPU completes the
         * startup in cpuhp_online_idle() which allows to avoid
         * intermediate synchronization points in the architecture code.
         */
        irq_lock_sparse();

        /* Each step only runs if all previous ones succeeded */
        ret = __cpu_up(cpu, idle);
        if (!ret)
                ret = cpuhp_bp_sync_alive(cpu);
        if (!ret)
                ret = bringup_wait_for_ap_online(cpu);

        irq_unlock_sparse();

        if (ret)
                return ret;

        if (st->target <= CPUHP_AP_ONLINE_IDLE)
                return 0;

        return cpuhp_kick_ap(cpu, st, st->target);
}
#endif

/*
 * Point the idle task of @cpu back at init_mm if it is still using another
 * mm, and drop the lazy-TLB reference on the mm it was using. Always
 * returns 0.
 */
static int finish_cpu(unsigned int cpu)
{
        struct task_struct *idle_task = idle_thread_get(cpu);
        struct mm_struct *lazy_mm = idle_task->active_mm;

        /*
         * idle_task_exit() will have switched to &init_mm; clean up
         * whatever active_mm state is left over.
         */
        if (lazy_mm != &init_mm)
                idle_task->active_mm = &init_mm;

        mmdrop_lazy_tlb(lazy_mm);

        return 0;
}

/*
 * Hotplug state machine related functions
 */

/*
 * Pick the next state whose callback has to run, skipping empty steps.
 *
 * On bringup st->state is advanced before it is reported; on teardown the
 * current st->state is reported and then decremented. Either way st->state
 * is updated ahead of time, as if the reported state had already run.
 *
 * Returns true with *state_to_run set when a state must be run, false once
 * st->state has reached @target.
 */
static bool cpuhp_next_state(bool bringup,
                             enum cpuhp_state *state_to_run,
                             struct cpuhp_cpu_state *st,
                             enum cpuhp_state target)
{
        for (;;) {
                if (bringup) {
                        if (st->state >= target)
                                return false;

                        st->state++;
                        *state_to_run = st->state;
                } else {
                        if (st->state <= target)
                                return false;

                        *state_to_run = st->state;
                        st->state--;
                }

                /* Only stop on a step which actually has a callback. */
                if (!cpuhp_step_empty(bringup, cpuhp_get_step(*state_to_run)))
                        return true;
        }
}

/*
 * Invoke the callbacks of every non-empty state between st->state and
 * @target, walking up when @bringup is true and down otherwise.
 *
 * @nofail: when true, a failing callback is logged and the walk continues
 *          to @target; the function then returns -1. When false, the walk
 *          stops at the first failure and that error code is returned.
 *
 * Returns 0 when every invoked callback succeeded.
 */
static int __cpuhp_invoke_callback_range(bool bringup,
                                         unsigned int cpu,
                                         struct cpuhp_cpu_state *st,
                                         enum cpuhp_state target,
                                         bool nofail)
{
        enum cpuhp_state state;
        int ret = 0;

        while (cpuhp_next_state(bringup, &state, st, target)) {
                int err;

                err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
                if (!err)
                        continue;

                if (nofail) {
                        /*
                         * Report the state whose callback was invoked. On
                         * teardown cpuhp_next_state() has already moved
                         * st->state one step below it, so st->state would
                         * name the wrong step here.
                         */
                        pr_warn("CPU %u %s state %s (%d) failed (%d)\n",
                                cpu, bringup ? "UP" : "DOWN",
                                cpuhp_get_step(state)->name,
                                state, err);
                        ret = -1;
                } else {
                        ret = err;
                        break;
                }
        }

        return ret;
}

/*
 * Walk the states from st->state to @target and stop at the first failing
 * callback. Returns 0 or the error code of that callback.
 */
static inline int cpuhp_invoke_callback_range(bool bringup,
                                              unsigned int cpu,
                                              struct cpuhp_cpu_state *st,
                                              enum cpuhp_state target)
{
        const bool nofail = false;

        return __cpuhp_invoke_callback_range(bringup, cpu, st, target, nofail);
}

/*
 * Walk the states from st->state to @target without stopping on failures.
 * Failing callbacks are only logged; the result is discarded.
 */
static inline void cpuhp_invoke_callback_range_nofail(bool bringup,
                                                      unsigned int cpu,
                                                      struct cpuhp_cpu_state *st,
                                                      enum cpuhp_state target)
{
        const bool nofail = true;

        __cpuhp_invoke_callback_range(bringup, cpu, st, target, nofail);
}

/* Tell whether a failed bringup of the CPU described by @st may be undone. */
static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
{
        /*
         * With CPU hotplug disabled a CPU cannot be taken down again,
         * because takedown_cpu() and the architecture and subsystem
         * specific mechanisms are not available. A CPU which would have to
         * be completely unplugged therefore stays in its current state;
         * only states up to CPUHP_BRINGUP_CPU can be rolled back.
         */
        if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
                return st->state <= CPUHP_BRINGUP_CPU;

        return true;
}

static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
                              enum cpuhp_state target)
{
        enum cpuhp_state prev_state = st->state;
        int ret = 0;

        ret = cpuhp_invoke_callback_range(true, cpu, st, target);
        if (ret) {
                pr_debug("CPU UP failed (%d) CPU %u state %s (%d)\n",
                         ret, cpu, cpuhp_get_step(st->state)->name,
                         st->state);

                cpuhp_reset_state(cpu, st, prev_state);
                if (can_rollback_cpu(st))
                        WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
                                                            prev_state));
        }
        return ret;
}

/*
 * The cpu hotplug threads manage the bringup and teardown of the cpus
 */
static int cpuhp_should_run(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);

        return st->should_run;
}

/*
 * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
 * callbacks when a state gets [un]installed at runtime.
 *
 * Each invocation of this function by the smpboot thread does a single AP
 * state callback.
 *
 * It has 3 modes of operation:
 *  - single: runs st->cb_state
 *  - up:     runs ++st->state, while st->state < st->target
 *  - down:   runs st->state--, while st->state > st->target
 *
 * When complete or on error, should_run is cleared and the completion is fired.
 *
 * @cpu: CPU number passed through to the invoked callback.
 */
static void cpuhp_thread_fun(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
        bool bringup = st->bringup;
        enum cpuhp_state state;

        /* Being invoked without pending work is unexpected; bail out. */
        if (WARN_ON_ONCE(!st->should_run))
                return;

        /*
         * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
         * that if we see ->should_run we also see the rest of the state.
         */
        smp_mb();

        /*
         * The BP holds the hotplug lock, but we're now running on the AP,
         * ensure that anybody asserting the lock is held, will actually find
         * it so.
         */
        lockdep_acquire_cpus_lock();
        cpuhp_lock_acquire(bringup);

        if (st->single) {
                /* Single mode: exactly one callback, then we are done. */
                state = st->cb_state;
                st->should_run = false;
        } else {
                /* Up/down mode: keep running until the target is reached. */
                st->should_run = cpuhp_next_state(bringup, &state, st, st->target);
                if (!st->should_run)
                        goto end;
        }

        WARN_ON_ONCE(!cpuhp_is_ap_state(state));

        if (cpuhp_is_atomic_state(state)) {
                /* Atomic states run their callback with interrupts disabled. */
                local_irq_disable();
                st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
                local_irq_enable();

                /*
                 * STARTING/DYING must not fail!
                 */
                WARN_ON_ONCE(st->result);
        } else {
                st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
        }

        if (st->result) {
                /*
                 * If we fail on a rollback, we're up a creek without a
                 * paddle, no way forward, no way back. We lose, thanks for
                 * playing.
                 */
                WARN_ON_ONCE(st->rollback);
                st->should_run = false;
        }

end:
        cpuhp_lock_release(bringup);
        lockdep_release_cpus_lock();

        /* Nothing left to run: signal whoever waits for this thread. */
        if (!st->should_run)
                complete_ap_thread(st, bringup);
}

/*
 * Invoke a single callback on a remote cpu.
 *
 * @cpu:     CPU on which the callback has to run
 * @state:   hotplug state whose callback is invoked
 * @bringup: true for the startup callback, false for the teardown callback
 * @node:    instance node handed to the callback (stored in st->node)
 *
 * Returns 0 when @cpu is not online or the callback succeeded, otherwise
 * the callback's error code. If the failing callback made partial progress
 * (st->last is set), the hotplug thread is kicked a second time in the
 * opposite direction to roll it back.
 */
static int
cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
                         struct hlist_node *node)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        int ret;

        if (!cpu_online(cpu))
                return 0;

        /*
         * Acquire and immediately release both hotplug lock annotations.
         * NOTE(review): presumably lockdep dependency tracking only; the
         * helpers are defined outside this chunk - confirm.
         */
        cpuhp_lock_acquire(false);
        cpuhp_lock_release(false);

        cpuhp_lock_acquire(true);
        cpuhp_lock_release(true);

        /*
         * If we are up and running, use the hotplug thread. For early calls
         * we invoke the thread function directly.
         */
        if (!st->thread)
                return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);

        st->rollback = false;
        st->last = NULL;

        /* Describe the single callback for cpuhp_thread_fun(). */
        st->node = node;
        st->bringup = bringup;
        st->cb_state = state;
        st->single = true;

        __cpuhp_kick_ap(st);

        /*
         * If we failed and did a partial, do a rollback.
         */
        if ((ret = st->result) && st->last) {
                st->rollback = true;
                st->bringup = !bringup;

                __cpuhp_kick_ap(st);
        }

        /*
         * Clean up the leftovers so the next hotplug operation won't use
         * stale data.
         */
        st->node = st->last = NULL;
        return ret;
}

static int cpuhp_kick_ap_work(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        enum cpuhp_state prev_state = st->state;
        int ret;

        cpuhp_lock_acquire(false);
        cpuhp_lock_release(false);

        cpuhp_lock_acquire(true);
        cpuhp_lock_release(true);

        trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
        ret = cpuhp_kick_ap(cpu, st, st->target);
        trace_cpuhp_exit(cpu, st->state, prev_state, ret);

        return ret;
}

/*
 * Descriptor of the per-CPU "cpuhp/%u" hotplug threads. It is registered
 * by cpuhp_threads_init(); each thread is stored in cpuhp_state.thread,
 * polls cpuhp_should_run() and executes cpuhp_thread_fun().
 */
static struct smp_hotplug_thread cpuhp_threads = {
        .store                        = &cpuhp_state.thread,
        .thread_should_run        = cpuhp_should_run,
        .thread_fn                = cpuhp_thread_fun,
        .thread_comm                = "cpuhp/%u",
        .selfparking                = true,
};

/* Initialize the up and down completions of every possible CPU. */
static __init void cpuhp_init_state(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);

                init_completion(&st->done_up);
                init_completion(&st->done_down);
        }
}

/*
 * Set up the per-CPU hotplug state, register the hotplug threads and
 * unpark the thread of the CPU this runs on. A registration failure is
 * fatal.
 */
void __init cpuhp_threads_init(void)
{
        int err;

        cpuhp_init_state();

        err = smpboot_register_percpu_thread(&cpuhp_threads);
        BUG_ON(err);

        kthread_unpark(this_cpu_read(cpuhp_state.thread));
}

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Default used by clear_tasks_mm_cpumask() unless the architecture already
 * defines its own arch_clear_mm_cpumask_cpu(): clear @cpu in the cpumask
 * of @mm.
 */
#ifndef arch_clear_mm_cpumask_cpu
#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
#endif

/**
 * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
 * @cpu: a CPU id
 *
 * Walk all processes, look up a thread with a valid mm for each of them and
 * clear the bit of @cpu in that mm's cpumask. This sounds trivial, but
 * there are various non-obvious corner cases which this function tries to
 * handle in a safe manner.
 *
 * The locking scheme is somewhat relaxed, so this may only be called for a
 * CPU which is already offline.
 */
void clear_tasks_mm_cpumask(int cpu)
{
        struct task_struct *proc;

        /*
         * This function is called after the cpu is taken down and marked
         * offline, so it's not like new tasks will ever get this cpu set in
         * their mm mask. -- Peter Zijlstra
         * Thus rcu_read_lock() is sufficient here and the full-fledged
         * tasklist_lock is not needed.
         */
        WARN_ON(cpu_online(cpu));

        rcu_read_lock();
        for_each_process(proc) {
                /*
                 * The main thread might have exited while other threads
                 * still have a valid mm. Find one of them.
                 */
                struct task_struct *holder = find_lock_task_mm(proc);

                if (holder) {
                        arch_clear_mm_cpumask_cpu(cpu, holder->mm);
                        task_unlock(holder);
                }
        }
        rcu_read_unlock();
}

/*
 * Take this CPU down. Passed to stop_machine_cpuslocked() by
 * takedown_cpu() and executed on the CPU going down; @_param is unused.
 *
 * Returns 0 on success or the negative error code of __cpu_disable().
 */
static int take_cpu_down(void *_param)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
        enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
        int cpu = smp_processor_id();
        int ret;

        /* Make sure this CPU does not handle any more interrupts. */
        ret = __cpu_disable();
        if (ret < 0)
                return ret;

        /*
         * This runs from CPUHP_TEARDOWN_CPU. As the CPU is on its way
         * down, the current state has to be CPUHP_TEARDOWN_CPU - 1.
         */
        WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));

        /* Run the former CPU_DYING callbacks. DYING must not fail! */
        cpuhp_invoke_callback_range_nofail(false, cpu, st, target);

        /* Park the stopper thread. */
        stop_machine_park(cpu);

        return 0;
}

/*
 * Take @cpu down from the controlling CPU: park its hotplug thread, run
 * take_cpu_down() on it via stop_machine, wait until it reports
 * CPUHP_AP_IDLE_DEAD and then kill it and clean up after it.
 *
 * Returns 0 on success. If stop_machine_cpuslocked() reports an error, the
 * hotplug thread is unparked again and that error is returned.
 */
static int takedown_cpu(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        int err;

        /* Park the smpboot threads */
        kthread_park(st->thread);

        /*
         * Prevent irq alloc/free while the dying cpu reorganizes the
         * interrupt affinities.
         */
        irq_lock_sparse();

        /*
         * So now all preempt/rcu users must observe !cpu_active().
         */
        err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
        if (err) {
                /* CPU refused to die */
                irq_unlock_sparse();
                /* Unpark the hotplug thread so we can rollback there */
                kthread_unpark(st->thread);
                return err;
        }
        BUG_ON(cpu_online(cpu));

        /*
         * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed
         * all runnable tasks from the CPU, there's only the idle task left now
         * that the migration thread is done doing the stop_machine thing.
         *
         * Wait for the stop thread to go away.
         */
        wait_for_ap_thread(st, false);
        BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);

        /* Interrupts are moved away from the dying cpu, reenable alloc/free */
        irq_unlock_sparse();

        hotplug_cpu__broadcast_tick_pull(cpu);
        /* This actually kills the CPU. */
        __cpu_die(cpu);

        cpuhp_bp_sync_dead(cpu);

        tick_cleanup_dead_cpu(cpu);

        /*
         * Callbacks must be re-integrated right away to the RCU state machine.
         * Otherwise an RCU callback could block a further teardown function
         * waiting for its completion.
         */
        rcutree_migrate_callbacks(cpu);

        return 0;
}

static void cpuhp_complete_idle_dead(void *arg)
{
        struct cpuhp_cpu_state *st = arg;

        complete_ap_thread(st, false);
}

/*
 * Executed on the CPU going down: mark this CPU as CPUHP_AP_IDLE_DEAD and
 * have an online CPU fire the teardown completion on its behalf.
 *
 * The CPU must be in state CPUHP_AP_OFFLINE when this is called.
 */
void cpuhp_report_idle_dead(void)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);

        BUG_ON(st->state != CPUHP_AP_OFFLINE);
        tick_assert_timekeeping_handover();
        rcutree_report_cpu_dead();
        st->state = CPUHP_AP_IDLE_DEAD;
        /*
         * We cannot call complete after rcutree_report_cpu_dead() so we delegate it
         * to an online cpu.
         */
        smp_call_function_single(cpumask_first(cpu_online_mask),
                                 cpuhp_complete_idle_dead, st, 0);
}

static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
                                enum cpuhp_state target)
{
        enum cpuhp_state prev_state = st->state;
        int ret = 0;

        ret = cpuhp_invoke_callback_range(false, cpu, st, target);
        if (ret) {
                pr_debug("CPU DOWN failed (%d) CPU %u state %s (%d)\n",
                         ret, cpu, cpuhp_get_step(st->state)->name,
                         st->state);

                cpuhp_reset_state(cpu, st, prev_state);

                if (st->state < prev_state)
                        WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
                                                            prev_state));
        }

        return ret;
}

/*
 * Take @cpu down to hotplug state @target.
 *
 * Requires cpu_add_remove_lock to be held.
 *
 * @tasks_frozen: stored in cpuhp_tasks_frozen for the duration of the
 *                operation
 *
 * Returns 0 on success, -EBUSY when only one CPU is online, -EINVAL when
 * @cpu is not present, otherwise the error of the failing teardown step.
 */
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
                           enum cpuhp_state target)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        int prev_state, ret = 0;

        /* The last online CPU cannot be taken down. */
        if (num_online_cpus() == 1)
                return -EBUSY;

        if (!cpu_present(cpu))
                return -EINVAL;

        cpus_write_lock();

        cpuhp_tasks_frozen = tasks_frozen;

        prev_state = cpuhp_set_state(cpu, st, target);
        /*
         * If the current CPU state is in the range of the AP hotplug thread,
         * then we need to kick the thread.
         */
        if (st->state > CPUHP_TEARDOWN_CPU) {
                /* The AP thread goes no lower than CPUHP_TEARDOWN_CPU. */
                st->target = max((int)target, CPUHP_TEARDOWN_CPU);
                ret = cpuhp_kick_ap_work(cpu);
                /*
                 * The AP side has done the error rollback already. Just
                 * return the error code..
                 */
                if (ret)
                        goto out;

                /*
                 * We might have stopped still in the range of the AP hotplug
                 * thread. Nothing to do anymore.
                 */
                if (st->state > CPUHP_TEARDOWN_CPU)
                        goto out;

                st->target = target;
        }
        /*
         * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
         * to do the further cleanups.
         */
        ret = cpuhp_down_callbacks(cpu, st, target);
        if (ret && st->state < prev_state) {
                if (st->state == CPUHP_TEARDOWN_CPU) {
                        /* Let the AP thread bring the CPU back to prev_state. */
                        cpuhp_reset_state(cpu, st, prev_state);
                        __cpuhp_kick_ap(st);
                } else {
                        WARN(1, "DEAD callback error for CPU%d", cpu);
                }
        }

out:
        cpus_write_unlock();
        /*
         * Do post unplug cleanup. This is still protected against
         * concurrent CPU hotplug via cpu_add_remove_lock.
         */
        lockup_detector_cleanup();
        arch_smt_update();
        return ret;
}

/*
 * Arguments handed through work_on_cpu() to __cpu_down_maps_locked():
 * the CPU to take down and the hotplug state to take it down to.
 */
struct cpu_down_work {
        unsigned int                cpu;
        enum cpuhp_state        target;
};

/*
 * work_on_cpu() callback: unpack the struct cpu_down_work in @arg and take
 * the CPU down with tasks not frozen. Returns the result of _cpu_down().
 */
static long __cpu_down_maps_locked(void *arg)
{
        struct cpu_down_work *req = arg;
        const int tasks_frozen = 0;

        return _cpu_down(req->cpu, tasks_frozen, req->target);
}

/*
 * Take @cpu down to @target by running _cpu_down() on another online
 * housekeeping CPU.
 *
 * Returns -EOPNOTSUPP when the platform has hotplug disabled, -EBUSY when
 * CPU hotplug is currently disabled or no other online housekeeping CPU
 * exists, otherwise the result of the offlining work.
 */
static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
{
        struct cpu_down_work work = {
                .cpu = cpu,
                .target = target,
        };
        unsigned int runner;

        /*
         * A platform without hotplug support is reported explicitly so
         * that it can be told apart from a transient offlining failure.
         */
        if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED))
                return -EOPNOTSUPP;

        if (cpu_hotplug_disabled)
                return -EBUSY;

        /*
         * The control task must not run on the CPU which is going offline,
         * to prevent a deadlock against cfs_b->period_timer. At least one
         * housekeeping CPU also has to stay online to avoid generating an
         * empty sched_domain span.
         */
        for_each_cpu_and(runner, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) {
                if (runner == work.cpu)
                        continue;

                return work_on_cpu(runner, __cpu_down_maps_locked, &work);
        }

        return -EBUSY;
}

/*
 * Take @cpu down to @target with the CPU maps update section held.
 * Returns the result of cpu_down_maps_locked().
 */
static int cpu_down(unsigned int cpu, enum cpuhp_state target)
{
        int ret;

        cpu_maps_update_begin();
        ret = cpu_down_maps_locked(cpu, target);
        cpu_maps_update_done();

        return ret;
}

/**
 * cpu_device_down - Bring down a cpu device
 * @dev: Pointer to the cpu device to offline
 *
 * Only the cpu subsystem of the device core is meant to call this.
 * Other subsystems should use remove_cpu() instead.
 *
 * Return: %0 on success or a negative errno code
 */
int cpu_device_down(struct device *dev)
{
        unsigned int cpu = dev->id;

        return cpu_down(cpu, CPUHP_OFFLINE);
}

/*
 * Offline the device of @cpu under the device hotplug lock.
 * Returns the result of device_offline().
 */
int remove_cpu(unsigned int cpu)
{
        struct device *dev;
        int ret;

        lock_device_hotplug();
        dev = get_cpu_device(cpu);
        ret = device_offline(dev);
        unlock_device_hotplug();

        return ret;
}
EXPORT_SYMBOL_GPL(remove_cpu);

/*
 * Offline every online CPU except @primary_cpu and then disable CPU
 * hotplug. If @primary_cpu is not online, the first online CPU is kept
 * instead. It is a fatal error (BUG) if more than one CPU is still online
 * afterwards.
 */
void smp_shutdown_nonboot_cpus(unsigned int primary_cpu)
{
        unsigned int cpu;
        int error;

        cpu_maps_update_begin();

        /*
         * Make certain the cpu I'm about to reboot on is online.
         *
         * This is in line with what migrate_to_reboot_cpu() already does.
         */
        if (!cpu_online(primary_cpu))
                primary_cpu = cpumask_first(cpu_online_mask);

        for_each_online_cpu(cpu) {
                if (cpu == primary_cpu)
                        continue;

                error = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
                if (error) {
                        /* Terminate the record; cpu is unsigned, hence %u. */
                        pr_err("Failed to offline CPU%u - error=%d\n",
                                cpu, error);
                        break;
                }
        }

        /*
         * Ensure all but the reboot CPU are offline.
         */
        BUG_ON(num_online_cpus() > 1);

        /*
         * Make sure the CPUs won't be enabled by someone else after this
         * point. Kexec will reboot to a new kernel shortly resetting
         * everything along the way.
         */
        cpu_hotplug_disabled++;

        cpu_maps_update_done();
}

#else
/*
 * Without CONFIG_HOTPLUG_CPU there is no takedown_cpu() function; the name
 * expands to NULL instead.
 * NOTE(review): presumably so that the hotplug state table can still
 * reference it - the user is outside this chunk, confirm.
 */
#define takedown_cpu                NULL
#endif /*CONFIG_HOTPLUG_CPU*/

/**
 * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU
 * @cpu: cpu that just started
 *
 * It must be called by the arch code on the new cpu, before the new cpu
 * enables interrupts and before the "boot" cpu returns from __cpu_up().
 */
void notify_cpu_starting(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);

        rcutree_report_cpu_starting(cpu);        /* Enables RCU usage on this CPU. */
        cpumask_set_cpu(cpu, &cpus_booted_once_mask);

        /*
         * STARTING must not fail!
         */
        cpuhp_invoke_callback_range_nofail(true, cpu, st, target);
}

/*
 * Called from the idle task. Wakes up the controlling task, which brings
 * up the hotplug thread of the upcoming CPU and then delegates the rest of
 * the online bringup to that thread.
 *
 * Any @state other than CPUHP_AP_ONLINE_IDLE is ignored; this happens for
 * the boot cpu.
 */
void cpuhp_online_idle(enum cpuhp_state state)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);

        if (state == CPUHP_AP_ONLINE_IDLE) {
                cpuhp_ap_update_sync_state(SYNC_STATE_ONLINE);

                /*
                 * The stopper thread is unparked before the idle loop (and
                 * with it scheduling) starts, so that the stopper task is
                 * always available.
                 */
                stop_machine_unpark(smp_processor_id());

                st->state = CPUHP_AP_ONLINE_IDLE;
                complete_ap_thread(st, true);
        }
}

/*
 * Bring @cpu up to hotplug state @target.
 *
 * Requires cpu_add_remove_lock to be held.
 *
 * @tasks_frozen: stored in cpuhp_tasks_frozen for the duration of the
 *                operation
 *
 * Returns 0 on success or when @cpu already is at or beyond @target,
 * -EINVAL when @cpu is not present, otherwise the error of the failing
 * step.
 */
static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        struct task_struct *idle;
        int ret = 0;

        cpus_write_lock();

        if (!cpu_present(cpu)) {
                ret = -EINVAL;
                goto out;
        }

        /*
         * The caller of cpu_up() might have raced with another
         * caller. Nothing to do.
         */
        if (st->state >= target)
                goto out;

        if (st->state == CPUHP_OFFLINE) {
                /* Let it fail before we try to bring the cpu up */
                idle = idle_thread_get(cpu);
                if (IS_ERR(idle)) {
                        ret = PTR_ERR(idle);
                        goto out;
                }

                /*
                 * Reset stale stack state from the last time this CPU was online.
                 */
                scs_task_reset(idle);
                kasan_unpoison_task_stack(idle);
        }

        cpuhp_tasks_frozen = tasks_frozen;

        cpuhp_set_state(cpu, st, target);
        /*
         * If the current CPU state is in the range of the AP hotplug thread,
         * then we need to kick the thread once more.
         */
        if (st->state > CPUHP_BRINGUP_CPU) {
                ret = cpuhp_kick_ap_work(cpu);
                /*
                 * The AP side has done the error rollback already. Just
                 * return the error code..
                 */
                if (ret)
                        goto out;
        }

        /*
         * Try to reach the target state. We max out on the BP at
         * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is
         * responsible for bringing it up to the target state.
         */
        target = min((int)target, CPUHP_BRINGUP_CPU);
        ret = cpuhp_up_callbacks(cpu, st, target);
out:
        cpus_write_unlock();
        arch_smt_update();
        return ret;
}

/*
 * Bring @cpu up to @target.
 *
 * Returns -EINVAL when @cpu is not a possible CPU, the error of
 * try_online_node() when that fails, -EBUSY while CPU hotplug is
 * disabled, -EPERM when @cpu is not bootable, otherwise the result of
 * _cpu_up().
 */
static int cpu_up(unsigned int cpu, enum cpuhp_state target)
{
        int ret;

        if (!cpu_possible(cpu)) {
                pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
                       cpu);
                return -EINVAL;
        }

        ret = try_online_node(cpu_to_node(cpu));
        if (ret)
                return ret;

        cpu_maps_update_begin();

        if (cpu_hotplug_disabled)
                ret = -EBUSY;
        else if (!cpu_bootable(cpu))
                ret = -EPERM;
        else
                ret = _cpu_up(cpu, 0, target);

        cpu_maps_update_done();

        return ret;
}

/**
 * cpu_device_up - Bring up a cpu device
 * @dev: Pointer to the cpu device to online
 *
 * Only the cpu subsystem of the device core is meant to call this.
 * Other subsystems should use add_cpu() instead.
 *
 * Return: %0 on success or a negative errno code
 */
int cpu_device_up(struct device *dev)
{
        unsigned int cpu = dev->id;

        return cpu_up(cpu, CPUHP_ONLINE);
}

/*
 * Online the device of @cpu under the device hotplug lock.
 * Returns the result of device_online().
 */
int add_cpu(unsigned int cpu)
{
        struct device *dev;
        int ret;

        lock_device_hotplug();
        dev = get_cpu_device(cpu);
        ret = device_online(dev);
        unlock_device_hotplug();

        return ret;
}
EXPORT_SYMBOL_GPL(add_cpu);

/**
 * bringup_hibernate_cpu - Bring up the CPU that we hibernated on
 * @sleep_cpu: The cpu we hibernated on and should be brought up.
 *
 * Some architectures, like arm64, can hibernate on any CPU. On wake up
 * that CPU might be offline, for example as a side effect of using
 * maxcpus=. Nothing is done when @sleep_cpu is already online.
 *
 * Return: %0 on success or a negative errno code
 */
int bringup_hibernate_cpu(unsigned int sleep_cpu)
{
        int ret;

        if (cpu_online(sleep_cpu))
                return 0;

        pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n");

        ret = cpu_up(sleep_cpu, CPUHP_ONLINE);
        if (ret)
                pr_err("Failed to bring hibernate-CPU up!\n");

        return ret;
}

/*
 * Bring the CPUs in @mask up to @target, visiting at most @ncpus of them.
 * Failed CPUs count against @ncpus as well. A CPU which failed and can be
 * rolled back is taken all the way down to CPUHP_OFFLINE.
 */
static void __init cpuhp_bringup_mask(const struct cpumask *mask, unsigned int ncpus,
                                      enum cpuhp_state target)
{
        unsigned int cpu;

        for_each_cpu(cpu, mask) {
                struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                int err = cpu_up(cpu, target);

                /*
                 * On failure cpu_up() might have only rolled back to
                 * CPUHP_BP_KICK_AP for the final online. Clean that up.
                 * This is a NOOP if the CPU is already rolled back.
                 */
                if (err && can_rollback_cpu(st))
                        WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, CPUHP_OFFLINE));

                ncpus--;
                if (ncpus == 0)
                        break;
        }
}

#ifdef CONFIG_HOTPLUG_PARALLEL
/* Parallel bringup is attempted unless "cpuhp.parallel" says otherwise. */
static bool __cpuhp_parallel_bringup __ro_after_init = true;

/*
 * Parse the boolean value of the "cpuhp.parallel" early parameter into
 * __cpuhp_parallel_bringup. Returns the result of kstrtobool().
 */
static int __init parallel_bringup_parse_param(char *arg)
{
        int err = kstrtobool(arg, &__cpuhp_parallel_bringup);

        return err;
}
early_param("cpuhp.parallel", parallel_bringup_parse_param);

static inline bool cpuhp_smt_aware(void)
{
        return cpu_smt_max_threads > 1;
}

/* Accessor for cpu_primary_thread_mask. */
static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
{
        const struct cpumask *primary = cpu_primary_thread_mask;

        return primary;
}

/*
 * On architectures which have enabled parallel bringup this invokes all BP
 * prepare states for each of the to be onlined APs first. The last state
 * sends the startup IPI to the APs. The APs proceed through the low level
 * bringup code in parallel and then wait for the control CPU to release
 * them one by one for the final onlining procedure.
 *
 * This avoids waiting for each AP to respond to the startup IPI in
 * CPUHP_BRINGUP_CPU.
 *
 * @ncpus: maximum number of CPUs to bring up
 *
 * Returns false when parallel bringup was disabled on the command line or
 * arch_cpuhp_init_parallel_bringup() declined it, in which case nothing
 * has been brought up. Returns true after the bringup passes were issued.
 */
static bool __init cpuhp_bringup_cpus_parallel(unsigned int ncpus)
{
        const struct cpumask *mask = cpu_present_mask;

        /* Ask the architecture only if the command line did not opt out. */
        if (__cpuhp_parallel_bringup)
                __cpuhp_parallel_bringup = arch_cpuhp_init_parallel_bringup();
        if (!__cpuhp_parallel_bringup)
                return false;

        if (cpuhp_smt_aware()) {
                const struct cpumask *pmask = cpuhp_get_primary_thread_mask();
                static struct cpumask tmp_mask __initdata;

                /*
                 * X86 requires that SMT siblings are not stopped while the
                 * primary thread does a microcode update, for various
                 * reasons. Bring the primary threads up first.
                 */
                cpumask_and(&tmp_mask, mask, pmask);
                cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_BP_KICK_AP);
                cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_ONLINE);
                /* Account for the online CPUs */
                ncpus -= num_online_cpus();
                if (!ncpus)
                        return true;
                /* Create the mask for secondary CPUs */
                cpumask_andnot(&tmp_mask, mask, pmask);
                mask = &tmp_mask;
        }

        /* Bring the not-yet started CPUs up */
        cpuhp_bringup_mask(mask, ncpus, CPUHP_BP_KICK_AP);
        cpuhp_bringup_mask(mask, ncpus, CPUHP_ONLINE);
        return true;
}
#else
/* Without CONFIG_HOTPLUG_PARALLEL nothing is brought up in parallel. */
static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus)
{
        return false;
}
#endif /* CONFIG_HOTPLUG_PARALLEL */

/*
 * Bring up to @max_cpus present CPUs online. The parallel bringup
 * optimization is tried first; if it is not used, every CPU goes through
 * the full serialized bringup.
 */
void __init bringup_nonboot_cpus(unsigned int max_cpus)
{
        if (!cpuhp_bringup_cpus_parallel(max_cpus))
                cpuhp_bringup_mask(cpu_present_mask, max_cpus, CPUHP_ONLINE);
}

#ifdef CONFIG_PM_SLEEP_SMP
/*
 * CPUs taken down by freeze_secondary_cpus(), to be brought back up by
 * thaw_secondary_cpus().
 */
static cpumask_var_t frozen_cpus;

/*
 * Take every online CPU except @primary down and disable CPU hotplug.
 *
 * @primary: CPU to keep online. -1 selects the first online CPU, or a
 *           HK_TYPE_TIMER housekeeping CPU if that one is not a
 *           housekeeping CPU. If @primary is given but not online, the
 *           first online CPU is used instead.
 *
 * CPU hotplug is disabled on return even on failure; callers are expected
 * to call thaw_secondary_cpus() on their failure path.
 *
 * Returns 0 on success, -EBUSY when a wakeup is pending, otherwise the
 * error of the failing _cpu_down().
 */
int freeze_secondary_cpus(int primary)
{
        int cpu, error = 0;

        cpu_maps_update_begin();
        if (primary == -1) {
                primary = cpumask_first(cpu_online_mask);
                if (!housekeeping_cpu(primary, HK_TYPE_TIMER))
                        primary = housekeeping_any_cpu(HK_TYPE_TIMER);
        } else {
                if (!cpu_online(primary))
                        primary = cpumask_first(cpu_online_mask);
        }

        /*
         * We take down all of the non-boot CPUs in one shot to avoid races
         * with the userspace trying to use the CPU hotplug at the same time
         */
        cpumask_clear(frozen_cpus);

        pr_info("Disabling non-boot CPUs ...\n");
        for_each_online_cpu(cpu) {
                if (cpu == primary)
                        continue;

                if (pm_wakeup_pending()) {
                        pr_info("Wakeup pending. Abort CPU freeze\n");
                        error = -EBUSY;
                        break;
                }

                trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
                error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
                trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
                /* Remember the CPU so that it can be thawed later. */
                if (!error)
                        cpumask_set_cpu(cpu, frozen_cpus);
                else {
                        pr_err("Error taking CPU%d down: %d\n", cpu, error);
                        break;
                }
        }

        if (!error)
                BUG_ON(num_online_cpus() > 1);
        else
                pr_err("Non-boot CPUs are not disabled\n");

        /*
         * Make sure the CPUs won't be enabled by someone else. We need to do
         * this even in case of failure as all freeze_secondary_cpus() users are
         * supposed to do thaw_secondary_cpus() on the failure path.
         */
        cpu_hotplug_disabled++;

        cpu_maps_update_done();
        return error;
}

/*
 * Default (empty) hook that thaw_secondary_cpus() calls right before it starts
 * bringing the frozen CPUs back up. Declared __weak, so another definition of
 * the same symbol takes precedence over this one.
 */
void __weak arch_thaw_secondary_cpus_begin(void)
{
}

/*
 * Default (empty) hook that thaw_secondary_cpus() calls once it has attempted
 * to bring up every frozen CPU. Declared __weak, so another definition of the
 * same symbol takes precedence over this one.
 */
void __weak arch_thaw_secondary_cpus_end(void)
{
}

/*
 * Re-enable CPU hotplug and bring every CPU recorded in frozen_cpus back
 * online. A CPU that fails to come up is only reported; the remaining CPUs
 * are still attempted. frozen_cpus is cleared afterwards.
 */
void thaw_secondary_cpus(void)
{
        int cpu, error;

        cpu_maps_update_begin();

        /* CPU hotplug may be used by everyone again */
        __cpu_hotplug_enable();

        if (!cpumask_empty(frozen_cpus)) {
                pr_info("Enabling non-boot CPUs ...\n");

                arch_thaw_secondary_cpus_begin();

                for_each_cpu(cpu, frozen_cpus) {
                        trace_suspend_resume(TPS("CPU_ON"), cpu, true);
                        error = _cpu_up(cpu, 1, CPUHP_ONLINE);
                        trace_suspend_resume(TPS("CPU_ON"), cpu, false);
                        if (error) {
                                pr_warn("Error taking CPU%d up: %d\n", cpu, error);
                        } else {
                                pr_info("CPU%d is up\n", cpu);
                        }
                }

                arch_thaw_secondary_cpus_end();

                cpumask_clear(frozen_cpus);
        }

        cpu_maps_update_done();
}

/*
 * Allocate the zero-initialised frozen_cpus mask at core_initcall time.
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int __init alloc_frozen_cpus(void)
{
        bool allocated = alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO);

        return allocated ? 0 : -ENOMEM;
}
core_initcall(alloc_frozen_cpus);

/*
 * While CPU hotplug notification callbacks run, the state of the system with
 * respect to tasks being frozen or not (as reported by the notification) must
 * stay the same for the *whole duration* of those callbacks. The freezer
 * therefore must not race with regular CPU hotplug.
 *
 * This is achieved by hooking the Suspend/Hibernate notifications: regular
 * CPU hotplug is disabled when a suspend or hibernation is being prepared and
 * enabled again once it has finished.
 *
 * Returns NOTIFY_OK for the four handled events and NOTIFY_DONE for anything
 * else.
 */
static int
cpu_hotplug_pm_callback(struct notifier_block *nb,
                        unsigned long action, void *ptr)
{
        if (action == PM_SUSPEND_PREPARE || action == PM_HIBERNATION_PREPARE) {
                cpu_hotplug_disable();
                return NOTIFY_OK;
        }

        if (action == PM_POST_SUSPEND || action == PM_POST_HIBERNATION) {
                cpu_hotplug_enable();
                return NOTIFY_OK;
        }

        return NOTIFY_DONE;
}


/*
 * Register cpu_hotplug_pm_callback() through pm_notifier() at core_initcall
 * time. Always returns 0.
 */
static int __init cpu_hotplug_pm_sync_init(void)
{
        /*
         * cpu_hotplug_pm_callback has higher priority than x86
         * bsp_pm_callback which depends on cpu_hotplug_pm_callback
         * to disable cpu hotplug to avoid cpu hotplug race.
         *
         * NOTE(review): the second argument (0) is presumably the notifier
         * priority; bsp_pm_callback's priority is not visible here - confirm.
         */
        pm_notifier(cpu_hotplug_pm_callback, 0);
        return 0;
}
core_initcall(cpu_hotplug_pm_sync_init);

#endif /* CONFIG_PM_SLEEP_SMP */

/*
 * NOTE(review): presumably the id of the boot CPU; nothing in this part of
 * the file reads or writes it - confirm against its users.
 */
int __boot_cpu_id;

#endif /* CONFIG_SMP */

/*
 * Boot processor state steps.
 *
 * Table of hotplug steps, indexed by enum cpuhp_state value. Each populated
 * entry carries the step name and its optional startup/teardown callbacks.
 * Entries marked .cant_stop are rejected as a target by target_store().
 * Entries without a name are treated as unused; cpuhp_reserve_state() hands
 * such slots out for the dynamic ranges.
 */
static struct cpuhp_step cpuhp_hp_states[] = {
        [CPUHP_OFFLINE] = {
                .name                        = "offline",
                .startup.single                = NULL,
                .teardown.single        = NULL,
        },
#ifdef CONFIG_SMP
        [CPUHP_CREATE_THREADS]= {
                .name                        = "threads:prepare",
                .startup.single                = smpboot_create_threads,
                .teardown.single        = NULL,
                .cant_stop                = true,
        },
        [CPUHP_PERF_PREPARE] = {
                .name                        = "perf:prepare",
                .startup.single                = perf_event_init_cpu,
                .teardown.single        = perf_event_exit_cpu,
        },
        [CPUHP_RANDOM_PREPARE] = {
                .name                        = "random:prepare",
                .startup.single                = random_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_WORKQUEUE_PREP] = {
                .name                        = "workqueue:prepare",
                .startup.single                = workqueue_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_HRTIMERS_PREPARE] = {
                .name                        = "hrtimers:prepare",
                .startup.single                = hrtimers_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_SMPCFD_PREPARE] = {
                .name                        = "smpcfd:prepare",
                .startup.single                = smpcfd_prepare_cpu,
                .teardown.single        = smpcfd_dead_cpu,
        },
        [CPUHP_RELAY_PREPARE] = {
                .name                        = "relay:prepare",
                .startup.single                = relay_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_RCUTREE_PREP] = {
                .name                        = "RCU/tree:prepare",
                .startup.single                = rcutree_prepare_cpu,
                .teardown.single        = rcutree_dead_cpu,
        },
        /*
         * On the tear-down path, timers_dead_cpu() must be invoked
         * before blk_mq_queue_reinit_notify() from notify_dead(),
         * otherwise a RCU stall occurs.
         */
        [CPUHP_TIMERS_PREPARE] = {
                .name                        = "timers:prepare",
                .startup.single                = timers_prepare_cpu,
                .teardown.single        = timers_dead_cpu,
        },

#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
        /*
         * Kicks the AP alive. AP will wait in cpuhp_ap_sync_alive() until
         * the next step will release it.
         */
        [CPUHP_BP_KICK_AP] = {
                .name                        = "cpu:kick_ap",
                .startup.single                = cpuhp_kick_ap_alive,
        },

        /*
         * Waits for the AP to reach cpuhp_ap_sync_alive() and then
         * releases it for the complete bringup.
         */
        [CPUHP_BRINGUP_CPU] = {
                .name                        = "cpu:bringup",
                .startup.single                = cpuhp_bringup_ap,
                .teardown.single        = finish_cpu,
                .cant_stop                = true,
        },
#else
        /*
         * All-in-one CPU bringup state which includes the kick alive.
         */
        [CPUHP_BRINGUP_CPU] = {
                .name                        = "cpu:bringup",
                .startup.single                = bringup_cpu,
                .teardown.single        = finish_cpu,
                .cant_stop                = true,
        },
#endif
        /* Final state before CPU kills itself */
        [CPUHP_AP_IDLE_DEAD] = {
                .name                        = "idle:dead",
        },
        /*
         * Last state before CPU enters the idle loop to die. Transient state
         * for synchronization.
         */
        [CPUHP_AP_OFFLINE] = {
                .name                        = "ap:offline",
                .cant_stop                = true,
        },
        /* First state is scheduler control. Interrupts are disabled */
        [CPUHP_AP_SCHED_STARTING] = {
                .name                        = "sched:starting",
                .startup.single                = sched_cpu_starting,
                .teardown.single        = sched_cpu_dying,
        },
        [CPUHP_AP_RCUTREE_DYING] = {
                .name                        = "RCU/tree:dying",
                .startup.single                = NULL,
                .teardown.single        = rcutree_dying_cpu,
        },
        [CPUHP_AP_SMPCFD_DYING] = {
                .name                        = "smpcfd:dying",
                .startup.single                = NULL,
                .teardown.single        = smpcfd_dying_cpu,
        },
        [CPUHP_AP_HRTIMERS_DYING] = {
                .name                        = "hrtimers:dying",
                .startup.single                = NULL,
                .teardown.single        = hrtimers_cpu_dying,
        },
        [CPUHP_AP_TICK_DYING] = {
                .name                        = "tick:dying",
                .startup.single                = NULL,
                .teardown.single        = tick_cpu_dying,
        },
        /*
         * Entry state on starting. Interrupts enabled from here on. Transient
         * state for synchronization.
         */
        [CPUHP_AP_ONLINE] = {
                .name                        = "ap:online",
        },
        /*
         * Handled on control processor until the plugged processor manages
         * this itself.
         */
        [CPUHP_TEARDOWN_CPU] = {
                .name                        = "cpu:teardown",
                .startup.single                = NULL,
                .teardown.single        = takedown_cpu,
                .cant_stop                = true,
        },

        [CPUHP_AP_SCHED_WAIT_EMPTY] = {
                .name                        = "sched:waitempty",
                .startup.single                = NULL,
                .teardown.single        = sched_cpu_wait_empty,
        },

        /* Handle smpboot threads park/unpark */
        [CPUHP_AP_SMPBOOT_THREADS] = {
                .name                        = "smpboot/threads:online",
                .startup.single                = smpboot_unpark_threads,
                .teardown.single        = smpboot_park_threads,
        },
        [CPUHP_AP_IRQ_AFFINITY_ONLINE] = {
                .name                        = "irq/affinity:online",
                .startup.single                = irq_affinity_online_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_AP_PERF_ONLINE] = {
                .name                        = "perf:online",
                .startup.single                = perf_event_init_cpu,
                .teardown.single        = perf_event_exit_cpu,
        },
        [CPUHP_AP_WATCHDOG_ONLINE] = {
                .name                        = "lockup_detector:online",
                .startup.single                = lockup_detector_online_cpu,
                .teardown.single        = lockup_detector_offline_cpu,
        },
        [CPUHP_AP_WORKQUEUE_ONLINE] = {
                .name                        = "workqueue:online",
                .startup.single                = workqueue_online_cpu,
                .teardown.single        = workqueue_offline_cpu,
        },
        [CPUHP_AP_RANDOM_ONLINE] = {
                .name                        = "random:online",
                .startup.single                = random_online_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_AP_RCUTREE_ONLINE] = {
                .name                        = "RCU/tree:online",
                .startup.single                = rcutree_online_cpu,
                .teardown.single        = rcutree_offline_cpu,
        },
#endif
        /*
         * The dynamically registered state space is here
         */

#ifdef CONFIG_SMP
        /* Last state is scheduler control setting the cpu active */
        [CPUHP_AP_ACTIVE] = {
                .name                        = "sched:active",
                .startup.single                = sched_cpu_activate,
                .teardown.single        = sched_cpu_deactivate,
        },
#endif

        /* CPU is fully up and running. */
        [CPUHP_ONLINE] = {
                .name                        = "online",
                .startup.single                = NULL,
                .teardown.single        = NULL,
        },
};

/* Sanity check for callbacks */
static int cpuhp_cb_check(enum cpuhp_state state)
{
        if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE)
                return -EINVAL;
        return 0;
}

/*
 * Returns a free for dynamic slot assignment of the Online state. The states
 * are protected by the cpuhp_slot_states mutex and an empty slot is identified
 * by having no name assigned.
 */
static int cpuhp_reserve_state(enum cpuhp_state state)
{
        enum cpuhp_state i, end;
        struct cpuhp_step *step;

        switch (state) {
        case CPUHP_AP_ONLINE_DYN:
                step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN;
                end = CPUHP_AP_ONLINE_DYN_END;
                break;
        case CPUHP_BP_PREPARE_DYN:
                step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN;
                end = CPUHP_BP_PREPARE_DYN_END;
                break;
        default:
                return -EINVAL;
        }

        for (i = state; i <= end; i++, step++) {
                if (!step->name)
                        return i;
        }
        WARN(1, "No more dynamic states available for CPU hotplug\n");
        return -ENOSPC;
}

/*
 * Install or remove the callbacks of a hotplug state for later hotplug
 * operations.
 *
 * @state:          state to modify, or one of the dynamic range identifiers
 * @name:           step name; NULL removes the state
 * @startup:        startup callback (may be NULL)
 * @teardown:       teardown callback (may be NULL)
 * @multi_instance: whether the state is a multi-instance state
 *
 * Returns the allocated state number when a dynamic slot was reserved, 0 for
 * a fixed state, -EBUSY when installing over a state that already has a name,
 * or the error from cpuhp_reserve_state().
 */
static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
                                 int (*startup)(unsigned int cpu),
                                 int (*teardown)(unsigned int cpu),
                                 bool multi_instance)
{
        bool dyn_request = state == CPUHP_AP_ONLINE_DYN ||
                           state == CPUHP_BP_PREPARE_DYN;
        struct cpuhp_step *step;
        int ret = 0;

        /*
         * A NULL name means removal.
         *
         * CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are also the first
         * states handed out from their dynamic ranges. Allocating on removal
         * would therefore pick a fresh (already empty) slot and clear that
         * one, leaving the callbacks of the state that should be removed
         * dangling and wrecking the next hotplug operation. So only allocate
         * when installing.
         */
        if (dyn_request && name) {
                ret = cpuhp_reserve_state(state);
                if (ret < 0)
                        return ret;
                state = ret;
        }

        step = cpuhp_get_step(state);
        if (name && step->name)
                return -EBUSY;

        step->startup.single = startup;
        step->teardown.single = teardown;
        step->name = name;
        step->multi_instance = multi_instance;
        INIT_HLIST_HEAD(&step->list);
        return ret;
}

static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
{
        return cpuhp_get_step(state)->teardown.single;
}

/*
 * Run the startup (@bringup true) or teardown (@bringup false) callback of
 * @state for @cpu, either on the AP or on the current CPU.
 *
 * Returns 0 when the step has nothing to do, otherwise the callback result.
 * A failing teardown triggers BUG_ON().
 */
static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
                            struct hlist_node *node)
{
        struct cpuhp_step *step = cpuhp_get_step(state);
        int err;

        /*
         * Nothing to do for an empty step. The check relies on the union
         * used for multi_instance callbacks.
         */
        if (cpuhp_step_empty(bringup, step))
                return 0;

        /*
         * Callbacks that are not bound to the AP may fail on bringup. A
         * failure on teardown (e.g. module removal) crashes for now.
         */
#ifdef CONFIG_SMP
        if (cpuhp_is_ap_state(state))
                err = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
        else
#endif
                err = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);

        BUG_ON(err && !bringup);
        return err;
}

/*
 * Undo a partially completed state installation after a recoverable failure
 * on @failedcpu: the teardown callback is issued on every present CPU that
 * comes before @failedcpu and has already reached @state.
 *
 * Note: The teardown callbacks used for rollback must not fail!
 */
static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
                                   struct hlist_node *node)
{
        int cpu;

        for_each_present_cpu(cpu) {
                struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                int cpustate = st->state;

                /* Only CPUs handled before the failing one need a rollback */
                if (cpu >= failedcpu)
                        break;

                /* Skip CPUs on which the startup call was never invoked */
                if (cpustate < state)
                        continue;

                cpuhp_issue_call(cpu, state, false, node);
        }
}

/*
 * Add @node as an instance of the multi-instance state @state. The caller
 * must hold the cpus lock (asserted via lockdep).
 *
 * When @invoke is set and the state has a startup callback, that callback is
 * run for the new instance on every present CPU that has reached @state. If
 * one of those calls fails, the CPUs handled so far are rolled back (provided
 * a teardown callback exists), the instance is not added and the error is
 * returned.
 *
 * Returns 0 on success, -EINVAL if @state is not a multi-instance state, or
 * the startup callback's error.
 */
int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
                                          struct hlist_node *node,
                                          bool invoke)
{
        struct cpuhp_step *step;
        int ret = 0;
        int cpu;

        lockdep_assert_cpus_held();

        step = cpuhp_get_step(state);
        if (!step->multi_instance)
                return -EINVAL;

        mutex_lock(&cpuhp_state_mutex);

        if (invoke && step->startup.multi) {
                /*
                 * Invoke the startup callback on each present CPU whose
                 * hotplug state is at or beyond the requested state.
                 */
                for_each_present_cpu(cpu) {
                        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                        int cpustate = st->state;

                        if (cpustate < state)
                                continue;

                        ret = cpuhp_issue_call(cpu, state, true, node);
                        if (!ret)
                                continue;

                        if (step->teardown.multi)
                                cpuhp_rollback_install(cpu, state, node);
                        goto unlock;
                }
        }

        hlist_add_head(node, &step->list);
unlock:
        mutex_unlock(&cpuhp_state_mutex);
        return ret;
}

/*
 * Same as __cpuhp_state_add_instance_cpuslocked(), but takes and releases the
 * cpus read lock around the call. Returns that function's result.
 */
int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
                               bool invoke)
{
        int err;

        cpus_read_lock();
        err = __cpuhp_state_add_instance_cpuslocked(state, node, invoke);
        cpus_read_unlock();

        return err;
}
EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);

/**
 * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state
 * @state:                The state to setup
 * @name:                Name of the step
 * @invoke:                If true, the startup function is invoked for cpus where
 *                        cpu state >= @state
 * @startup:                startup callback function
 * @teardown:                teardown callback function
 * @multi_instance:        State is set up for multiple instances which get
 *                        added afterwards.
 *
 * The caller needs to hold cpus read locked while calling this function.
 * Return:
 *   On success:
 *      Positive state number if @state is CPUHP_AP_ONLINE_DYN or
 *      CPUHP_BP_PREPARE_DYN;
 *      0 for all other states
 *   On failure: proper (negative) error code
 */
int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
                                   const char *name, bool invoke,
                                   int (*startup)(unsigned int cpu),
                                   int (*teardown)(unsigned int cpu),
                                   bool multi_instance)
{
        int cpu, ret = 0;
        bool dynstate;

        lockdep_assert_cpus_held();

        if (cpuhp_cb_check(state) || !name)
                return -EINVAL;

        mutex_lock(&cpuhp_state_mutex);

        ret = cpuhp_store_callbacks(state, name, startup, teardown,
                                    multi_instance);

        /*
         * Both dynamic ranges make cpuhp_store_callbacks() return the
         * allocated (positive) state number. Switch over to that state and
         * reset ret, otherwise the positive value would be mistaken for a
         * failure below and the startup callbacks would never be invoked.
         */
        dynstate = state == CPUHP_AP_ONLINE_DYN ||
                   state == CPUHP_BP_PREPARE_DYN;
        if (ret > 0 && dynstate) {
                state = ret;
                ret = 0;
        }

        if (ret || !invoke || !startup)
                goto out;

        /*
         * Try to call the startup callback for each present cpu
         * depending on the hotplug state of the cpu.
         */
        for_each_present_cpu(cpu) {
                struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                int cpustate = st->state;

                if (cpustate < state)
                        continue;

                ret = cpuhp_issue_call(cpu, state, true, NULL);
                if (ret) {
                        /* Undo the CPUs done so far and drop the state again */
                        if (teardown)
                                cpuhp_rollback_install(cpu, state, NULL);
                        cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
                        goto out;
                }
        }
out:
        mutex_unlock(&cpuhp_state_mutex);
        /*
         * If the requested state is CPUHP_AP_ONLINE_DYN or
         * CPUHP_BP_PREPARE_DYN, return the dynamically allocated state in
         * case of success.
         */
        if (!ret && dynstate)
                return state;
        return ret;
}
EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked);

/*
 * Same as __cpuhp_setup_state_cpuslocked(), but takes and releases the cpus
 * read lock around the call. Returns that function's result.
 */
int __cpuhp_setup_state(enum cpuhp_state state,
                        const char *name, bool invoke,
                        int (*startup)(unsigned int cpu),
                        int (*teardown)(unsigned int cpu),
                        bool multi_instance)
{
        int result;

        cpus_read_lock();
        result = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup,
                                                teardown, multi_instance);
        cpus_read_unlock();

        return result;
}
EXPORT_SYMBOL(__cpuhp_setup_state);

/*
 * Remove instance @node from the multi-instance state @state.
 *
 * When @invoke is set and the state has a teardown callback, that callback is
 * run for the instance on every present CPU that has reached @state before
 * the node is unlinked. An invalid @state triggers BUG_ON().
 *
 * Returns 0 on success or -EINVAL if @state is not a multi-instance state.
 */
int __cpuhp_state_remove_instance(enum cpuhp_state state,
                                  struct hlist_node *node, bool invoke)
{
        struct cpuhp_step *step = cpuhp_get_step(state);
        int cpu;

        BUG_ON(cpuhp_cb_check(state));

        if (!step->multi_instance)
                return -EINVAL;

        cpus_read_lock();
        mutex_lock(&cpuhp_state_mutex);

        if (invoke && cpuhp_get_teardown_cb(state)) {
                /*
                 * Run the teardown callback on each present CPU that is at
                 * or beyond this state. It is currently not allowed to fail!
                 */
                for_each_present_cpu(cpu) {
                        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                        int cpustate = st->state;

                        if (cpustate < state)
                                continue;

                        cpuhp_issue_call(cpu, state, false, node);
                }
        }

        hlist_del(node);
        mutex_unlock(&cpuhp_state_mutex);
        cpus_read_unlock();

        return 0;
}
EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);

/**
 * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state
 * @state:        The state to remove
 * @invoke:        If true, the teardown function is invoked for cpus where
 *                cpu state >= @state
 *
 * The caller needs to hold cpus read locked while calling this function.
 * For a multi-instance state no teardown is invoked here; a warning is
 * printed if instances are still registered. The teardown callback is
 * currently not allowed to fail. Think about module removal!
 */
void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke)
{
        struct cpuhp_step *step = cpuhp_get_step(state);
        int cpu;

        BUG_ON(cpuhp_cb_check(state));

        lockdep_assert_cpus_held();

        mutex_lock(&cpuhp_state_mutex);

        if (step->multi_instance) {
                WARN(!hlist_empty(&step->list),
                     "Error: Removing state %d which has instances left.\n",
                     state);
        } else if (invoke && cpuhp_get_teardown_cb(state)) {
                /*
                 * Run the teardown callback on each present CPU that is at
                 * or beyond this state. It is currently not allowed to fail!
                 */
                for_each_present_cpu(cpu) {
                        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                        int cpustate = st->state;

                        if (cpustate < state)
                                continue;

                        cpuhp_issue_call(cpu, state, false, NULL);
                }
        }

        /* Clear name and callbacks so that the state becomes unused */
        cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
        mutex_unlock(&cpuhp_state_mutex);
}
EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked);

/*
 * Same as __cpuhp_remove_state_cpuslocked(), but takes and releases the cpus
 * read lock around the call.
 */
void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
{
        cpus_read_lock();
        __cpuhp_remove_state_cpuslocked(state, invoke);
        cpus_read_unlock();
}
EXPORT_SYMBOL(__cpuhp_remove_state);

#ifdef CONFIG_HOTPLUG_SMT
/*
 * Mark the device of @cpu as offline and send a KOBJ_OFFLINE uevent so that
 * user space learns about the state change.
 */
static void cpuhp_offline_cpu_device(unsigned int cpu)
{
        struct device *cpu_dev = get_cpu_device(cpu);

        cpu_dev->offline = true;
        kobject_uevent(&cpu_dev->kobj, KOBJ_OFFLINE);
}

/*
 * Mark the device of @cpu as online and send a KOBJ_ONLINE uevent so that
 * user space learns about the state change.
 */
static void cpuhp_online_cpu_device(unsigned int cpu)
{
        struct device *cpu_dev = get_cpu_device(cpu);

        cpu_dev->offline = false;
        kobject_uevent(&cpu_dev->kobj, KOBJ_ONLINE);
}

/*
 * Take online non-primary SMT threads offline and, on success, record
 * @ctrlval in cpu_smt_control.
 *
 * Primary threads are always kept. With @ctrlval == CPU_SMT_ENABLED, threads
 * that cpu_smt_thread_allowed() still permits are kept as well.
 *
 * Returns 0 on success or the first error from cpu_down_maps_locked(), in
 * which case the walk stops and cpu_smt_control is left untouched.
 */
int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
{
        int cpu, ret = 0;

        cpu_maps_update_begin();
        for_each_online_cpu(cpu) {
                /*
                 * Disable may be called with CPU_SMT_ENABLED when going from
                 * a higher to a lower number of SMT threads per core, hence
                 * the second half of the test.
                 */
                bool keep = topology_is_primary_thread(cpu) ||
                            (ctrlval == CPU_SMT_ENABLED &&
                             cpu_smt_thread_allowed(cpu));

                if (keep)
                        continue;

                ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
                if (ret)
                        break;

                /*
                 * The cpu maps lock must be held here because this might race
                 * against in-kernel abusers of the hotplug machinery (thermal
                 * management). That rules out device_offline(), which ends up
                 * in cpu_down() and takes the cpu maps lock itself.
                 *
                 * Nothing else would update device:offline, which would leave
                 * the sysfs entry stale and prevent onlining after smt control
                 * has been changed to 'off' again. So update it by hand. This
                 * is called under the sysfs hotplug lock and thus properly
                 * serialized against the regular offline usage.
                 */
                cpuhp_offline_cpu_device(cpu);
        }

        if (ret == 0)
                cpu_smt_control = ctrlval;

        cpu_maps_update_done();
        return ret;
}

/*
 * Set cpu_smt_control to CPU_SMT_ENABLED and bring up every present CPU that
 * is offline, sits on an online node and is permitted by
 * cpu_smt_thread_allowed().
 *
 * Returns 0 on success or the first error from _cpu_up(), at which point the
 * walk stops.
 */
int cpuhp_smt_enable(void)
{
        int cpu, ret = 0;

        cpu_maps_update_begin();
        cpu_smt_control = CPU_SMT_ENABLED;

        for_each_present_cpu(cpu) {
                /*
                 * Nothing to do for CPUs that are already online, that live
                 * on an offline node, or whose thread is not allowed.
                 */
                if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)) ||
                    !cpu_smt_thread_allowed(cpu))
                        continue;

                ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
                if (ret)
                        break;

                /* Same reasoning as for the offline case in cpuhp_smt_disable() */
                cpuhp_online_cpu_device(cpu);
        }

        cpu_maps_update_done();
        return ret;
}
#endif

#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
/*
 * sysfs show handler for the per-CPU "state" attribute: prints the current
 * hotplug state number of the CPU identified by dev->id, followed by a
 * newline. Uses sysfs_emit() as required for sysfs show() callbacks.
 */
static ssize_t state_show(struct device *dev,
                          struct device_attribute *attr, char *buf)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);

        return sysfs_emit(buf, "%d\n", st->state);
}
static DEVICE_ATTR_RO(state);

/*
 * sysfs store handler for the per-CPU "target" attribute.
 *
 * Parses a decimal state number from @buf and moves the CPU identified by
 * dev->id towards it with cpu_up() or cpu_down(). With
 * CONFIG_CPU_HOTPLUG_STATE_CONTROL any state from CPUHP_OFFLINE to
 * CPUHP_ONLINE may be requested, otherwise only those two end points.
 * States without a name or marked cant_stop are refused.
 *
 * Returns @count on success or a negative error code.
 */
static ssize_t target_store(struct device *dev, struct device_attribute *attr,
                            const char *buf, size_t count)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
        struct cpuhp_step *step;
        int target, err;

        err = kstrtoint(buf, 10, &target);
        if (err)
                return err;

#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL
        if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE)
                return -EINVAL;
#else
        if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE)
                return -EINVAL;
#endif

        err = lock_device_hotplug_sysfs();
        if (err)
                return err;

        /* Only named states that may be stopped at are valid targets */
        mutex_lock(&cpuhp_state_mutex);
        step = cpuhp_get_step(target);
        if (!step->name || step->cant_stop)
                err = -EINVAL;
        mutex_unlock(&cpuhp_state_mutex);

        if (!err) {
                if (st->state < target)
                        err = cpu_up(dev->id, target);
                else if (st->state > target)
                        err = cpu_down(dev->id, target);
                else if (WARN_ON(st->target != target))
                        st->target = target;
        }

        unlock_device_hotplug();
        return err ? err : count;
}

/*
 * sysfs show handler for the per-CPU "target" attribute: prints the target
 * hotplug state number of the CPU identified by dev->id, followed by a
 * newline. Uses sysfs_emit() as required for sysfs show() callbacks.
 */
static ssize_t target_show(struct device *dev,
                           struct device_attribute *attr, char *buf)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);

        return sysfs_emit(buf, "%d\n", st->target);
}
static DEVICE_ATTR_RW(target);

/*
 * sysfs store handler for the per-CPU "fail" attribute.
 *
 * Parses a decimal state number from @buf and stores it in the fail field of
 * the CPU identified by dev->id. CPUHP_INVALID is always accepted. Any other
 * value must be within CPUHP_OFFLINE..CPUHP_ONLINE, must not be an atomic
 * state, must have at least one callback, and must not be at or below
 * CPUHP_BRINGUP_CPU while the CPU is currently above it.
 *
 * Returns @count on success or a negative error code.
 */
static ssize_t fail_store(struct device *dev, struct device_attribute *attr,
                          const char *buf, size_t count)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
        struct cpuhp_step *step;
        int fail, err;

        err = kstrtoint(buf, 10, &fail);
        if (err)
                return err;

        if (fail != CPUHP_INVALID) {
                if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
                        return -EINVAL;

                /* STARTING/DYING callbacks cannot be made to fail */
                if (cpuhp_is_atomic_state(fail))
                        return -EINVAL;

                /*
                 * DEAD callbacks cannot fail, and neither can
                 * CPUHP_BRINGUP_CPU during hotunplug: the latter triggers
                 * STARTING callbacks, so failing there would hinder rollback.
                 */
                if (fail <= CPUHP_BRINGUP_CPU && st->state > CPUHP_BRINGUP_CPU)
                        return -EINVAL;

                /* A state without any callback has nothing that could fail */
                mutex_lock(&cpuhp_state_mutex);
                step = cpuhp_get_step(fail);
                if (!step->startup.single && !step->teardown.single)
                        err = -EINVAL;
                mutex_unlock(&cpuhp_state_mutex);
                if (err)
                        return err;
        }

        st->fail = fail;
        return count;
}

/*
 * sysfs show handler for the per-CPU "fail" attribute: prints the state
 * number stored in the fail field of the CPU identified by dev->id, followed
 * by a newline. Uses sysfs_emit() as required for sysfs show() callbacks.
 */
static ssize_t fail_show(struct device *dev,
                         struct device_attribute *attr, char *buf)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);

        return sysfs_emit(buf, "%d\n", st->fail);
}

static DEVICE_ATTR_RW(fail);

/*
 * NULL-terminated list of the per-CPU hotplug attributes ("state", "target"
 * and "fail"), referenced by cpuhp_cpu_attr_group.
 */
static struct attribute *cpuhp_cpu_attrs[] = {
        &dev_attr_state.attr,
        &dev_attr_target.attr,
        &dev_attr_fail.attr,
        NULL
};

/*
 * Attribute group named "hotplug" that bundles the per-CPU attributes listed
 * in cpuhp_cpu_attrs. All members not named here are zero-initialised.
 */
static const struct attribute_group cpuhp_cpu_attr_group = {
        .name = "hotplug",
        .attrs = cpuhp_cpu_attrs,
};

/*
 * sysfs show handler for the "states" attribute: emits one "<number>: <name>"
 * line for every hotplug state from CPUHP_OFFLINE to CPUHP_ONLINE that has a
 * name assigned.
 *
 * The output is built with sysfs_emit_at(), which never writes past the
 * PAGE_SIZE sysfs buffer; if the list does not fit it is truncated instead of
 * overrunning @buf. @buf must stay pointing at the start of the buffer, the
 * running length @res is passed as the offset.
 *
 * Returns the number of bytes written.
 */
static ssize_t states_show(struct device *dev,
                                 struct device_attribute *attr, char *buf)
{
        ssize_t res = 0;
        int i;

        mutex_lock(&cpuhp_state_mutex);
        for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) {
                struct cpuhp_step *sp = cpuhp_get_step(i);

                /* Unnamed entries are unused states; skip them */
                if (sp->name)
                        res += sysfs_emit_at(buf, res, "%3d: %s\n", i, sp->name);
        }
        mutex_unlock(&cpuhp_state_mutex);
        return res;
}
static DEVICE_ATTR_RO(states);

/*
 * NULL-terminated list holding the single "states" attribute, referenced by
 * cpuhp_cpu_root_attr_group.
 */
static struct attribute *cpuhp_cpu_root_attrs[] = {
        &dev_attr_states.attr,
        NULL
};

/*
 * Attribute group named "hotplug" that bundles the attributes listed in
 * cpuhp_cpu_root_attrs. All members not named here are zero-initialised.
 */
static const struct attribute_group cpuhp_cpu_root_attr_group = {
        .name = "hotplug",
        .attrs = cpuhp_cpu_root_attrs,
};

#ifdef CONFIG_HOTPLUG_SMT

/*
 * Check whether @threads is an acceptable number of SMT threads.
 *
 * Without CONFIG_SMT_NUM_THREADS_DYNAMIC only 1 and cpu_smt_max_threads are
 * valid; with it, every value from 1 up to cpu_smt_max_threads is.
 */
static bool cpu_smt_num_threads_valid(unsigned int threads)
{
        if (!IS_ENABLED(CONFIG_SMT_NUM_THREADS_DYNAMIC))
                return threads == 1 || threads == cpu_smt_max_threads;

        return threads >= 1 && threads <= cpu_smt_max_threads;
}

/*
 * Handle a write to the SMT "control" attribute.
 *
 * Accepts "on", "off", "forceoff" or a decimal thread count, translates it
 * into a control value plus a target thread count, and then enables or
 * disables SMT siblings under the device hotplug lock.
 *
 * Return: @count on success, -EPERM if SMT is force-disabled, -ENODEV if SMT
 * is not supported, -EINVAL for unparsable or out-of-range input, or the
 * error reported by the lock/enable/disable helpers.
 */
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
                    const char *buf, size_t count)
{
        int ctrlval, ret, num_threads, orig_threads;
        bool force_off;

        /* Two control states refuse any further change. */
        switch (cpu_smt_control) {
        case CPU_SMT_FORCE_DISABLED:
                return -EPERM;
        case CPU_SMT_NOT_SUPPORTED:
                return -ENODEV;
        default:
                break;
        }

        if (sysfs_streq(buf, "on")) {
                ctrlval = CPU_SMT_ENABLED;
                num_threads = cpu_smt_max_threads;
        } else if (sysfs_streq(buf, "off")) {
                ctrlval = CPU_SMT_DISABLED;
                num_threads = 1;
        } else if (sysfs_streq(buf, "forceoff")) {
                ctrlval = CPU_SMT_FORCE_DISABLED;
                num_threads = 1;
        } else {
                /* Not a keyword: it must be a decimal thread count. */
                if (kstrtoint(buf, 10, &num_threads) != 0)
                        return -EINVAL;

                if (num_threads == 1)
                        ctrlval = CPU_SMT_DISABLED;
                else if (!cpu_smt_num_threads_valid(num_threads))
                        return -EINVAL;
                else
                        ctrlval = CPU_SMT_ENABLED;
        }

        ret = lock_device_hotplug_sysfs();
        if (ret)
                return ret;

        /* Publish the new thread count before bringing CPUs up or down. */
        orig_threads = cpu_smt_num_threads;
        cpu_smt_num_threads = num_threads;

        /* "forceoff" must still be applied when the thread count is unchanged. */
        force_off = ctrlval == CPU_SMT_FORCE_DISABLED && ctrlval != cpu_smt_control;

        if (num_threads > orig_threads)
                ret = cpuhp_smt_enable();
        else if (num_threads < orig_threads || force_off)
                ret = cpuhp_smt_disable(ctrlval);

        unlock_device_hotplug();

        if (ret)
                return ret;
        return count;
}

#else /* !CONFIG_HOTPLUG_SMT */
/*
 * Stub used when CONFIG_HOTPLUG_SMT is not set: every write to the SMT
 * "control" attribute is rejected with -ENODEV and no argument is examined.
 */
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
                    const char *buf, size_t count)
{
        return -ENODEV;
}
#endif /* CONFIG_HOTPLUG_SMT */

/*
 * Strings reported by control_show(), indexed by the CPU_SMT_* control
 * state. Both the strings and the pointer table itself are constant, so
 * the whole array can live in read-only data.
 */
static const char * const smt_states[] = {
        [CPU_SMT_ENABLED]                = "on",
        [CPU_SMT_DISABLED]                = "off",
        [CPU_SMT_FORCE_DISABLED]        = "forceoff",
        [CPU_SMT_NOT_SUPPORTED]                = "notsupported",
        [CPU_SMT_NOT_IMPLEMENTED]        = "notimplemented",
};

/*
 * sysfs show handler for the SMT "control" attribute.
 *
 * Prints the name of the current SMT control state. With CONFIG_HOTPLUG_SMT,
 * an enabled SMT running with fewer than the maximum number of threads is
 * reported as that thread count instead.
 */
static ssize_t control_show(struct device *dev,
                            struct device_attribute *attr, char *buf)
{
#ifdef CONFIG_HOTPLUG_SMT
        /*
         * The state name is the right answer unless SMT is enabled with only
         * some of the threads online; in that case show the number of
         * threads rather than "on".
         */
        bool show_name = cpu_smt_control != CPU_SMT_ENABLED ||
                         cpu_smt_num_threads == cpu_smt_max_threads;

        if (!show_name)
                return sysfs_emit(buf, "%d\n", cpu_smt_num_threads);
#endif

        return sysfs_emit(buf, "%s\n", smt_states[cpu_smt_control]);
}

/*
 * sysfs store handler for the SMT "control" attribute. Forwards all
 * arguments unchanged to __store_smt_control() and returns its result.
 */
static ssize_t control_store(struct device *dev, struct device_attribute *attr,
                             const char *buf, size_t count)
{
        return __store_smt_control(dev, attr, buf, count);
}
/* Read/write "control" attribute; DEVICE_ATTR_RW() binds it to control_show()/control_store(). */
static DEVICE_ATTR_RW(control);

/*
 * sysfs show handler for the SMT "active" attribute: prints the value
 * returned by sched_smt_active() as a decimal integer and a newline.
 */
static ssize_t active_show(struct device *dev,
                           struct device_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%d\n", sched_smt_active());
}
/* Read-only "active" attribute; DEVICE_ATTR_RO() binds it to active_show(). */
static DEVICE_ATTR_RO(active);

/* NULL-terminated list of attributes exposed in the "smt" group. */
static struct attribute *cpuhp_smt_attrs[] = {
        &dev_attr_control.attr,
        &dev_attr_active.attr,
        NULL
};

/* Named attribute group that cpu_smt_sysfs_init() creates on the CPU subsystem root device. */
static const struct attribute_group cpuhp_smt_attr_group = {
        .attrs = cpuhp_smt_attrs,
        .name = "smt",
        NULL
};

/*
 * Create the "smt" attribute group on the CPU subsystem root device.
 *
 * Return: 0 on success, -ENODEV if the subsystem has no root device, or the
 * error returned by sysfs_create_group().
 */
static int __init cpu_smt_sysfs_init(void)
{
        struct device *root = bus_get_dev_root(&cpu_subsys);
        int err;

        if (!root)
                return -ENODEV;

        err = sysfs_create_group(&root->kobj, &cpuhp_smt_attr_group);
        /* Drop the reference taken by bus_get_dev_root(). */
        put_device(root);

        return err;
}

/*
 * Register all CPU hotplug sysfs groups: the "smt" group and the root
 * "hotplug" group on the CPU subsystem root device, then a "hotplug" group
 * on every possible CPU that has a device.
 *
 * Return: 0 on success, or the first error encountered. Groups created
 * before a failure are left in place.
 */
static int __init cpuhp_sysfs_init(void)
{
        struct device *root;
        int cpu, err;

        err = cpu_smt_sysfs_init();
        if (err)
                return err;

        /* A missing root device is tolerated here; only the group is skipped. */
        root = bus_get_dev_root(&cpu_subsys);
        if (root) {
                err = sysfs_create_group(&root->kobj, &cpuhp_cpu_root_attr_group);
                put_device(root);
                if (err)
                        return err;
        }

        for_each_possible_cpu(cpu) {
                struct device *cpu_dev = get_cpu_device(cpu);

                /* CPUs without a device are silently skipped. */
                if (cpu_dev) {
                        err = sysfs_create_group(&cpu_dev->kobj,
                                                 &cpuhp_cpu_attr_group);
                        if (err)
                                return err;
                }
        }

        return 0;
}
device_initcall(cpuhp_sysfs_init);
#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */

/*
 * cpu_bit_bitmap[] is a special, "compressed" data structure that
 * represents the single-bit binary values 1<<nr for all NR_CPUS bits.
 *
 * It is used by cpumask_of() to get a constant address to a CPU
 * mask value that has a single bit set only.
 */

/* cpu_bit_bitmap[0] is empty - so we can back into it */
/*
 * MASK_DECLARE_1(x) initializes word 0 of row x+1 to the single bit
 * 1UL << x. The _2, _4 and _8 variants expand to that many consecutive
 * MASK_DECLARE_1() entries starting at x.
 */
#define MASK_DECLARE_1(x)        [x+1][0] = (1UL << (x))
#define MASK_DECLARE_2(x)        MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
#define MASK_DECLARE_4(x)        MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
#define MASK_DECLARE_8(x)        MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)

/*
 * BITS_PER_LONG+1 rows: row 0 is left zeroed, rows 1..BITS_PER_LONG each
 * hold one set bit in their first word.
 */
const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {

        MASK_DECLARE_8(0),        MASK_DECLARE_8(8),
        MASK_DECLARE_8(16),        MASK_DECLARE_8(24),
#if BITS_PER_LONG > 32
        /* Bits 32..63 only exist when a long is wider than 32 bits. */
        MASK_DECLARE_8(32),        MASK_DECLARE_8(40),
        MASK_DECLARE_8(48),        MASK_DECLARE_8(56),
#endif
};
EXPORT_SYMBOL_GPL(cpu_bit_bitmap);

/* Constant bitmap of NR_CPUS bits initialized with CPU_BITS_ALL. */
const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);

/*
 * Possible-CPU mask. With CONFIG_INIT_ALL_POSSIBLE it starts out as
 * CPU_BITS_ALL; otherwise it starts empty and is filled in during init.
 */
#ifdef CONFIG_INIT_ALL_POSSIBLE
struct cpumask __cpu_possible_mask __ro_after_init
        = {CPU_BITS_ALL};
#else
struct cpumask __cpu_possible_mask __ro_after_init;
#endif
EXPORT_SYMBOL(__cpu_possible_mask);

/* Online-CPU mask; updated by init_cpu_online() and set_cpu_online(). */
struct cpumask __cpu_online_mask __read_mostly;
EXPORT_SYMBOL(__cpu_online_mask);

/* Present-CPU mask; initialized by init_cpu_present(). */
struct cpumask __cpu_present_mask __read_mostly;
EXPORT_SYMBOL(__cpu_present_mask);

/* Active-CPU mask. */
struct cpumask __cpu_active_mask __read_mostly;
EXPORT_SYMBOL(__cpu_active_mask);

/* Dying-CPU mask. */
struct cpumask __cpu_dying_mask __read_mostly;
EXPORT_SYMBOL(__cpu_dying_mask);

/* Count of online CPUs, kept in step with __cpu_online_mask by set_cpu_online(). */
atomic_t __num_online_cpus __read_mostly;
EXPORT_SYMBOL(__num_online_cpus);

/* Overwrite the present-CPU mask with a copy of @src. */
void init_cpu_present(const struct cpumask *src)
{
        cpumask_copy(&__cpu_present_mask, src);
}

/* Overwrite the possible-CPU mask with a copy of @src. */
void init_cpu_possible(const struct cpumask *src)
{
        cpumask_copy(&__cpu_possible_mask, src);
}

/*
 * Overwrite the online-CPU mask with a copy of @src.
 * NOTE(review): __num_online_cpus is not adjusted here - confirm callers
 * do not rely on the counter after calling this.
 */
void init_cpu_online(const struct cpumask *src)
{
        cpumask_copy(&__cpu_online_mask, src);
}

/*
 * Mark @cpu online or offline and keep __num_online_cpus in step: the
 * counter only moves when the mask bit actually changes.
 */
void set_cpu_online(unsigned int cpu, bool online)
{
        /*
         * The counter is adjusted with atomic_inc/dec() because the reboot
         * and kexec code abuse this function by calling it from IPI/NMI
         * broadcasts while shutting CPUs down. Callers from regular CPU
         * hotplug are properly serialized.
         *
         * Note that __num_online_cpus being an atomic_t does nothing for
         * readers that are not serialized against concurrent hotplug
         * operations.
         */
        if (!online) {
                if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask))
                        atomic_dec(&__num_online_cpus);
                return;
        }

        if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask))
                atomic_inc(&__num_online_cpus);
}

/*
 * Activate the first processor.
 *
 * Sets the online, active, present and possible bits for the CPU this code
 * is running on and, on SMP builds, records that CPU's id in __boot_cpu_id.
 */
void __init boot_cpu_init(void)
{
        int cpu = smp_processor_id();

        /* Mark the boot cpu "present", "online" etc for SMP and UP case */
        set_cpu_online(cpu, true);
        set_cpu_active(cpu, true);
        set_cpu_present(cpu, true);
        set_cpu_possible(cpu, true);

#ifdef CONFIG_SMP
        __boot_cpu_id = cpu;
#endif
}

/*
 * Must be called _AFTER_ setting up the per_cpu areas
 *
 * Puts the running CPU's hotplug state machine into its final state: both
 * the current and the target state are set to CPUHP_ONLINE. On SMP builds
 * the CPU is also added to cpus_booted_once_mask and its ap_sync_state is
 * set to SYNC_STATE_ONLINE.
 */
void __init boot_cpu_hotplug_init(void)
{
#ifdef CONFIG_SMP
        cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
        atomic_set(this_cpu_ptr(&cpuhp_state.ap_sync_state), SYNC_STATE_ONLINE);
#endif
        this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
        this_cpu_write(cpuhp_state.target, CPUHP_ONLINE);
}

#ifdef CONFIG_CPU_MITIGATIONS
/*
 * These are used for a global "mitigations=" cmdline option for toggling
 * optional CPU mitigations.
 */
enum cpu_mitigations {
        CPU_MITIGATIONS_OFF,            /* selected by "mitigations=off" */
        CPU_MITIGATIONS_AUTO,           /* selected by "mitigations=auto"; the default */
        CPU_MITIGATIONS_AUTO_NOSMT,     /* selected by "mitigations=auto,nosmt" */
};

/* Current selection; written only by mitigations_parse_cmdline(). */
static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;

/*
 * Parse the value of the "mitigations=" early parameter.
 *
 * Recognized values are "off", "auto" and "auto,nosmt". Any other value
 * leaves the current selection untouched and logs a critical message.
 *
 * Return: 0 when a value was supplied (recognized or not), -EINVAL when the
 * parameter was given without a value.
 */
static int __init mitigations_parse_cmdline(char *arg)
{
        /*
         * A bare "mitigations" (no "=value") reaches early-param handlers
         * with a NULL argument; reject it rather than pass NULL to strcmp().
         */
        if (!arg)
                return -EINVAL;

        if (!strcmp(arg, "off"))
                cpu_mitigations = CPU_MITIGATIONS_OFF;
        else if (!strcmp(arg, "auto"))
                cpu_mitigations = CPU_MITIGATIONS_AUTO;
        else if (!strcmp(arg, "auto,nosmt"))
                cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
        else
                pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
                        arg);

        return 0;
}

/* mitigations=off: returns true when the selected mode is CPU_MITIGATIONS_OFF. */
bool cpu_mitigations_off(void)
{
        return cpu_mitigations == CPU_MITIGATIONS_OFF;
}
EXPORT_SYMBOL_GPL(cpu_mitigations_off);

/* mitigations=auto,nosmt: returns true when the selected mode is CPU_MITIGATIONS_AUTO_NOSMT. */
bool cpu_mitigations_auto_nosmt(void)
{
        return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
}
EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);
#else
/*
 * Stub used when CONFIG_CPU_MITIGATIONS is not set: the argument is ignored
 * and a critical message is logged to say so.
 */
static int __init mitigations_parse_cmdline(char *arg)
{
        pr_crit("Kernel compiled without mitigations, ignoring 'mitigations'; system may still be vulnerable\n");
        return 0;
}
#endif
/* Register whichever mitigations_parse_cmdline() was compiled in as the "mitigations" early parameter. */
early_param("mitigations", mitigations_parse_cmdline);







































    1 

    1 












    2 









    2 
















    2 
    1 


    2 










































    1 




    1 













































































    1 






    1 
























































































































































































































































































































































    1 







    1 



    1 

    1 



































    1 










    1 















    2 



































    1 



    1 












































































































    1 






    1 
















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
#include <linux/elf.h>
#include <linux/elf-randomize.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include "internal.h"
#include "swap.h"

/**
 * kfree_const - conditionally free memory
 * @x: pointer to the memory
 *
 * Function calls kfree only if @x is not in .rodata section.
 */
void kfree_const(const void *x)
{
        /* Pointers into .rodata are left alone; everything else goes to kfree(). */
        if (is_kernel_rodata((unsigned long)x))
                return;

        kfree(x);
}
EXPORT_SYMBOL(kfree_const);

/**
 * kstrdup - allocate space for and copy an existing string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s or %NULL in case of error
 */
noinline
char *kstrdup(const char *s, gfp_t gfp)
{
        size_t size;
        char *copy;

        /* A NULL source yields a NULL result without allocating. */
        if (s == NULL)
                return NULL;

        /* The size includes the terminating NUL, so memcpy() copies it too. */
        size = strlen(s) + 1;
        copy = kmalloc_track_caller(size, gfp);
        if (!copy)
                return NULL;

        memcpy(copy, s, size);
        return copy;
}
EXPORT_SYMBOL(kstrdup);

/**
 * kstrdup_const - conditionally duplicate an existing const string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
 * must not be passed to krealloc().
 *
 * Return: source string if it is in .rodata section otherwise
 * fallback to kstrdup.
 */
const char *kstrdup_const(const char *s, gfp_t gfp)
{
        /* Strings in .rodata are returned as-is; anything else is duplicated. */
        return is_kernel_rodata((unsigned long)s) ? s : kstrdup(s, gfp);
}
EXPORT_SYMBOL(kstrdup_const);

/**
 * kstrndup - allocate space for and copy an existing string
 * @s: the string to duplicate
 * @max: read at most @max chars from @s
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Use kmemdup_nul() instead if the size is known exactly.
 *
 * Return: newly allocated copy of @s or %NULL in case of error
 */
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
        size_t n;
        char *copy;

        if (s == NULL)
                return NULL;

        /* Copy at most @max characters, then add the terminator ourselves. */
        n = strnlen(s, max);
        copy = kmalloc_track_caller(n + 1, gfp);
        if (!copy)
                return NULL;

        memcpy(copy, s, n);
        copy[n] = '\0';
        return copy;
}
EXPORT_SYMBOL(kstrndup);

/**
 * kmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp)
{
        /* _RET_IP_ attributes the allocation to our caller. */
        void *copy = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE,
                                                      _RET_IP_);

        if (!copy)
                return NULL;

        memcpy(copy, src, len);
        return copy;
}
EXPORT_SYMBOL(kmemdup_noprof);

/**
 * kmemdup_array - duplicate a given array.
 *
 * @src: array to duplicate.
 * @count: number of elements to duplicate from array.
 * @element_size: size of each element of array.
 * @gfp: GFP mask to use.
 *
 * Return: duplicated array of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp)
{
        /*
         * The byte count is element_size * count, computed with size_mul().
         * NOTE(review): size_mul() is expected to saturate on overflow so
         * that the allocation fails instead of being undersized - confirm.
         */
        return kmemdup(src, size_mul(element_size, count), gfp);
}
EXPORT_SYMBOL(kmemdup_array);

/**
 * kvmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result may be not physically contiguous. Use kvfree() to free.
 */
void *kvmemdup(const void *src, size_t len, gfp_t gfp)
{
        void *copy = kvmalloc(len, gfp);

        if (!copy)
                return NULL;

        memcpy(copy, src, len);
        return copy;
}
EXPORT_SYMBOL(kvmemdup);

/**
 * kmemdup_nul - Create a NUL-terminated string from unterminated data
 * @s: The data to stringify
 * @len: The size of the data
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s with NUL-termination or %NULL in
 * case of error
 */
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
        char *copy;

        if (s == NULL)
                return NULL;

        /* One extra byte for the terminator appended after the data. */
        copy = kmalloc_track_caller(len + 1, gfp);
        if (!copy)
                return NULL;

        memcpy(copy, s, len);
        copy[len] = '\0';
        return copy;
}
EXPORT_SYMBOL(kmemdup_nul);

/**
 * memdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure.  Result is physically
 * contiguous, to be freed by kfree().
 */
void *memdup_user(const void __user *src, size_t len)
{
        void *copy = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);

        if (!copy)
                return ERR_PTR(-ENOMEM);

        if (copy_from_user(copy, src, len) == 0)
                return copy;

        /* The user copy was incomplete: release the buffer and report a fault. */
        kfree(copy);
        return ERR_PTR(-EFAULT);
}
EXPORT_SYMBOL(memdup_user);

/**
 * vmemdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure.  Result may be not
 * physically contiguous.  Use kvfree() to free.
 */
void *vmemdup_user(const void __user *src, size_t len)
{
        void *copy = kvmalloc(len, GFP_USER);

        if (!copy)
                return ERR_PTR(-ENOMEM);

        if (copy_from_user(copy, src, len) == 0)
                return copy;

        /* The user copy was incomplete: release the buffer and report a fault. */
        kvfree(copy);
        return ERR_PTR(-EFAULT);
}
EXPORT_SYMBOL(vmemdup_user);

/**
 * strndup_user - duplicate an existing string from user space
 * @s: The string to duplicate
 * @n: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Return: newly allocated copy of @s or an ERR_PTR() in case of error
 */
char *strndup_user(const char __user *s, long n)
{
        long len = strnlen_user(s, n);
        char *copy;

        /* A zero length from strnlen_user() is reported as a fault. */
        if (len == 0)
                return ERR_PTR(-EFAULT);

        /* The string, terminator included, does not fit in @n bytes. */
        if (len > n)
                return ERR_PTR(-EINVAL);

        copy = memdup_user(s, len);

        /* Force termination: the copied data is not trusted to end in NUL. */
        if (!IS_ERR(copy))
                copy[len - 1] = '\0';

        return copy;
}
EXPORT_SYMBOL(strndup_user);

/**
 * memdup_user_nul - duplicate memory region from user space and NUL-terminate
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure.
 */
void *memdup_user_nul(const void __user *src, size_t len)
{
        char *copy;

        /*
         * GFP_KERNEL is used unconditionally: copy_from_user() may sleep
         * and take a page fault, so GFP_NOFS or GFP_ATOMIC would be
         * pointless here.
         */
        copy = kmalloc_track_caller(len + 1, GFP_KERNEL);
        if (!copy)
                return ERR_PTR(-ENOMEM);

        if (copy_from_user(copy, src, len) != 0) {
                kfree(copy);
                return ERR_PTR(-EFAULT);
        }

        copy[len] = '\0';
        return copy;
}
EXPORT_SYMBOL(memdup_user_nul);

/*
 * Report whether the current task's stack pointer, as given by KSTK_ESP(),
 * lies within [vma->vm_start, vma->vm_end]. Returns 1 if so, 0 otherwise.
 */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
        /* KSTK_ESP() may not use its argument on every architecture. */
        struct task_struct * __maybe_unused t = current;

        if (vma->vm_start > KSTK_ESP(t))
                return 0;

        return vma->vm_end >= KSTK_ESP(t);
}

/*
 * Change backing file, only valid to use during initial VMA setup.
 *
 * Takes a reference on @file, installs it as vma->vm_file and drops the
 * reference on the file that was installed before.
 */
void vma_set_file(struct vm_area_struct *vma, struct file *file)
{
        /* Changing an anonymous vma with this is illegal */
        get_file(file);
        /* After the swap, the local 'file' holds the previous vm_file. */
        swap(vma->vm_file, file);
        fput(file);
}
EXPORT_SYMBOL(vma_set_file);

/* Fallback stack randomization mask, in pages, used only when none is defined already. */
#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
#endif

/*
 * Return the page-aligned @stack_top displaced by a random number of pages
 * (limited by STACK_RND_MASK) when the current task has PF_RANDOMIZE set.
 * The displacement is added for upward-growing stacks and subtracted
 * otherwise. Without PF_RANDOMIZE only the page alignment is applied.
 */
unsigned long randomize_stack_top(unsigned long stack_top)
{
        unsigned long offset = 0;

        if (current->flags & PF_RANDOMIZE)
                offset = (get_random_long() & (STACK_RND_MASK)) << PAGE_SHIFT;

#ifdef CONFIG_STACK_GROWSUP
        return PAGE_ALIGN(stack_top) + offset;
#else
        return PAGE_ALIGN(stack_top) - offset;
#endif
}

/**
 * randomize_page - Generate a random, page aligned address
 * @start:        The smallest acceptable address the caller will take.
 * @range:        The size of the area, starting at @start, within which the
 *                random address must fall.
 *
 * If @start + @range would overflow, @range is capped.
 *
 * NOTE: Historical use of randomize_range, which this replaces, presumed that
 * @start was already page aligned.  We now align it regardless.
 *
 * Return: A page aligned address within [start, start + range).  On error,
 * @start is returned.
 */
unsigned long randomize_page(unsigned long start, unsigned long range)
{
        unsigned long aligned = PAGE_ALIGN(start);
        unsigned long nr_pages;

        /* Round @start up to a page and shrink the range by the same amount. */
        if (aligned != start) {
                range -= aligned - start;
                start = aligned;
        }

        /* Cap the range so that start + range cannot wrap. */
        if (range > ULONG_MAX - start)
                range = ULONG_MAX - start;

        nr_pages = range >> PAGE_SHIFT;
        if (!nr_pages)
                return start;

        /* Pick a page index in [0, nr_pages) and convert it back to bytes. */
        return start + ((get_random_long() % nr_pages) << PAGE_SHIFT);
}

#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
/*
 * Default (weak) brk randomization: pick a random page within 32MB of
 * mm->brk for 32-bit tasks, or within 1GB otherwise.
 */
unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
{
        unsigned long range = SZ_1G;

        /* Is the current task 32bit ? */
        if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
                range = SZ_32M;

        return randomize_page(mm->brk, range);
}

/*
 * Return a random mmap offset in bytes: a random value masked to
 * mmap_rnd_bits bits (or mmap_rnd_compat_bits for compat tasks when
 * CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS is set), shifted left by PAGE_SHIFT.
 */
unsigned long arch_mmap_rnd(void)
{
        unsigned long rnd;

        /*
         * The #ifdef wraps an if/else: when it is compiled out, only the
         * statement after #endif remains and runs unconditionally.
         */
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
        if (is_compat_task())
                rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
        else
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
                rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);

        return rnd << PAGE_SHIFT;
}

static int mmap_is_legacy(struct rlimit *rlim_stack)
{
        if (current->personality & ADDR_COMPAT_LAYOUT)
                return 1;

        /* On parisc the stack always grows up - so a unlimited stack should
         * not be an indicator to use the legacy memory layout. */
        if (rlim_stack->rlim_cur == RLIM_INFINITY &&
                !IS_ENABLED(CONFIG_STACK_GROWSUP))
                return 1;

        return sysctl_legacy_va_layout;
}

/*
 * Leave enough space between the mmap area and the stack to honour ulimit in
 * the face of randomisation.
 *
 * mmap_base() clamps the stack gap to the range [MIN_GAP, MAX_GAP].
 */
#define MIN_GAP                (SZ_128M)
#define MAX_GAP                (STACK_TOP / 6 * 5)

/*
 * Compute the base address of the top-down mmap area.
 * @rnd:        random offset in bytes, subtracted from the result
 * @rlim_stack: stack resource limit used to size the gap below the stack
 */
static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
#ifdef CONFIG_STACK_GROWSUP
        /*
         * For an upwards growing stack the calculation is much simpler.
         * Memory for the maximum stack size is reserved at the top of the
         * task. mmap_base starts directly below the stack and grows
         * downwards.
         */
        return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
#else
        /* Start from the stack size limit and pad it with the guard gap. */
        unsigned long gap = rlim_stack->rlim_cur;
        unsigned long pad = stack_guard_gap;

        /* Account for stack randomization if necessary */
        if (current->flags & PF_RANDOMIZE)
                pad += (STACK_RND_MASK << PAGE_SHIFT);

        /* Values close to RLIM_INFINITY can overflow. */
        if (gap + pad > gap)
                gap += pad;

        /* Clamp the gap to [MIN_GAP, MAX_GAP]. */
        if (gap < MIN_GAP)
                gap = MIN_GAP;
        else if (gap > MAX_GAP)
                gap = MAX_GAP;

        return PAGE_ALIGN(STACK_TOP - gap - rnd);
#endif
}

/*
 * Choose the mmap layout for @mm. The legacy layout starts at
 * TASK_UNMAPPED_BASE and clears MMF_TOPDOWN; otherwise the base comes from
 * mmap_base() and MMF_TOPDOWN is set. A random offset is applied in both
 * cases when the current task has PF_RANDOMIZE set.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
        unsigned long rnd = 0UL;

        if (current->flags & PF_RANDOMIZE)
                rnd = arch_mmap_rnd();

        if (!mmap_is_legacy(rlim_stack)) {
                mm->mmap_base = mmap_base(rnd, rlim_stack);
                set_bit(MMF_TOPDOWN, &mm->flags);
                return;
        }

        mm->mmap_base = TASK_UNMAPPED_BASE + rnd;
        clear_bit(MMF_TOPDOWN, &mm->flags);
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
/*
 * Fallback layout for MMU builds without their own layout picker: always
 * start at TASK_UNMAPPED_BASE with MMF_TOPDOWN cleared. @rlim_stack is
 * not used.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
        mm->mmap_base = TASK_UNMAPPED_BASE;
        clear_bit(MMF_TOPDOWN, &mm->flags);
}
#endif

/**
 * __account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be considered positive, %false if not
 * @task:        task used to check RLIMIT_MEMLOCK
 * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
 *
 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
 * that mmap_lock is held as writer.
 *
 * Return:
 * * 0       on success
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
                        struct task_struct *task, bool bypass_rlim)
{
        unsigned long locked_vm;
        int ret = 0;

        mmap_assert_write_locked(mm);

        locked_vm = mm->locked_vm;
        if (!inc) {
                /* Warn (once) when asked to remove more pages than are accounted. */
                WARN_ON_ONCE(pages > locked_vm);
                mm->locked_vm = locked_vm - pages;
        } else if (!bypass_rlim &&
                   locked_vm + pages >
                   (task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT)) {
                /* Over the limit: report it and leave the counter untouched. */
                ret = -ENOMEM;
        } else {
                mm->locked_vm = locked_vm + pages;
        }

        /* locked_vm below is the value from before this call's update. */
        pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
                 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
                 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
                 ret ? " - exceeded" : "");

        return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);

/**
 * account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against, may be NULL
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be considered positive, %false if not
 *
 * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
 *
 * Return:
 * * 0       on success, or if mm is NULL
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
{
        int err;

        /* Nothing to account without an mm or with zero pages. */
        if (!mm || !pages)
                return 0;

        mmap_write_lock(mm);
        /* Holders of CAP_IPC_LOCK bypass the RLIMIT_MEMLOCK check. */
        err = __account_locked_vm(mm, pages, inc, current,
                                  capable(CAP_IPC_LOCK));
        mmap_write_unlock(mm);

        return err;
}
EXPORT_SYMBOL_GPL(account_locked_vm);

/*
 * Map @file into the current task's address space via do_mmap(), after the
 * security_mmap_file() check, holding mmap_lock for writing around the call.
 *
 * Returns the value produced by do_mmap(), the non-zero result of
 * security_mmap_file(), or -EINTR if taking the lock was interrupted.
 */
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot,
        unsigned long flag, unsigned long pgoff)
{
        unsigned long ret;
        struct mm_struct *mm = current->mm;
        unsigned long populate;
        LIST_HEAD(uf);

        ret = security_mmap_file(file, prot, flag);
        if (ret)
                return ret;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
                      &uf);
        mmap_write_unlock(mm);

        /* Both follow-up steps run only after mmap_lock has been dropped. */
        userfaultfd_unmap_complete(mm, &uf);
        if (populate)
                mm_populate(ret, populate);

        return ret;
}

/*
 * Byte-offset front end for vm_mmap_pgoff().
 *
 * Rejects with -EINVAL an @offset that is not page aligned, or for which
 * @offset plus the page-aligned @len wraps around; otherwise converts
 * @offset to a page offset and forwards the request.
 */
unsigned long vm_mmap(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot,
        unsigned long flag, unsigned long offset)
{
        unsigned long pgoff;

        if (unlikely(offset + PAGE_ALIGN(len) < offset) ||
            unlikely(offset_in_page(offset)))
                return -EINVAL;

        pgoff = offset >> PAGE_SHIFT;

        return vm_mmap_pgoff(file, addr, len, prot, flag, pgoff);
}
EXPORT_SYMBOL(vm_mmap);

/**
 * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
 * failure, fall back to non-contiguous (vmalloc) allocation.
 * @size: size of the request.
 * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
 * @node: numa node to allocate from
 *
 * Uses kmalloc to get the memory but if the allocation fails then falls back
 * to the vmalloc allocator. Use kvfree for freeing the memory.
 *
 * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
 * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
 * preferable to the vmalloc fallback, due to visible performance drawbacks.
 *
 * Requests of at most PAGE_SIZE never fall back to vmalloc: the kmalloc
 * result is returned as is, and @flags is passed to kmalloc unmodified.
 *
 * Return: pointer to the allocated memory or %NULL in case of failure
 */
void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node)
{
        /* Flags for the kmalloc attempt only; the fallback uses @flags. */
        gfp_t kmalloc_flags = flags;
        void *ret;

        /*
         * We want to attempt a large physically contiguous block first because
         * it is less likely to fragment multiple larger blocks and therefore
         * contribute to a long term fragmentation less than vmalloc fallback.
         * However make sure that larger requests are not too disruptive - no
         * OOM killer and no allocation failure warnings as we have a fallback.
         */
        if (size > PAGE_SIZE) {
                kmalloc_flags |= __GFP_NOWARN;

                /* Give up early unless the caller explicitly asked to retry. */
                if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
                        kmalloc_flags |= __GFP_NORETRY;

                /* nofail semantic is implemented by the vmalloc fallback */
                kmalloc_flags &= ~__GFP_NOFAIL;
        }

        ret = kmalloc_node_noprof(size, kmalloc_flags, node);

        /*
         * It doesn't really make sense to fallback to vmalloc for sub page
         * requests
         */
        if (ret || size <= PAGE_SIZE)
                return ret;

        /* non-sleeping allocations are not supported by vmalloc */
        if (!gfpflags_allow_blocking(flags))
                return NULL;

        /*
         * Don't even allow crazy sizes: anything above INT_MAX fails, with a
         * one-time warning unless the caller passed __GFP_NOWARN.
         */
        if (unlikely(size > INT_MAX)) {
                WARN_ON_ONCE(!(flags & __GFP_NOWARN));
                return NULL;
        }

        /*
         * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
         * since the callers already cannot assume anything
         * about the resulting pointer, and cannot play
         * protection games.
         */
        return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
                        flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
                        node, __builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node_noprof);

/**
 * kvfree() - Free memory.
 * @addr: Pointer to allocated memory.
 *
 * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
 * The address is classified with is_vmalloc_addr() and handed to vfree() or
 * kfree() accordingly.  It is slightly more efficient to use kfree() or
 * vfree() directly if you are certain that you know which one to use.
 *
 * Context: Either preemptible task context or not-NMI interrupt.
 */
void kvfree(const void *addr)
{
        if (!is_vmalloc_addr(addr)) {
                kfree(addr);
                return;
        }

        vfree(addr);
}
EXPORT_SYMBOL(kvfree);

/**
 * kvfree_sensitive - Free a data object containing sensitive information.
 * @addr: address of the data object to be freed.
 * @len: length of the data object.
 *
 * Wipes @len bytes at @addr with memzero_explicit(), which the compiler
 * won't optimize out, and then releases the object with kvfree().
 * Nothing is done when @addr is NULL or the zero-size pointer.
 */
void kvfree_sensitive(const void *addr, size_t len)
{
        if (unlikely(ZERO_OR_NULL_PTR(addr)))
                return;

        memzero_explicit((void *)addr, len);
        kvfree(addr);
}
EXPORT_SYMBOL(kvfree_sensitive);

/*
 * Grow a kvmalloc'ed buffer.
 *
 * @p:       existing buffer
 * @oldsize: current size of @p in bytes
 * @newsize: requested size in bytes
 * @flags:   gfp mask handed to kvmalloc_noprof()
 *
 * When @newsize does not exceed @oldsize the original pointer is returned
 * untouched.  Otherwise a new buffer is allocated, the first @oldsize bytes
 * are copied over and @p is released with kvfree().
 *
 * Returns the (possibly new) buffer, or NULL if the allocation failed, in
 * which case @p is left intact and still owned by the caller.
 */
void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
{
        void *grown;

        if (newsize <= oldsize)
                return (void *)p;

        grown = kvmalloc_noprof(newsize, flags);
        if (grown) {
                memcpy(grown, p, oldsize);
                kvfree(p);
        }

        return grown;
}
EXPORT_SYMBOL(kvrealloc_noprof);

/**
 * __vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Return: the result of __vmalloc_noprof() for @n * @size bytes, or %NULL
 * if that product overflows.
 */
void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
{
        size_t total;

        if (likely(!check_mul_overflow(n, size, &total)))
                return __vmalloc_noprof(total, flags);

        return NULL;
}
EXPORT_SYMBOL(__vmalloc_array_noprof);

/**
 * vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 *
 * Same as __vmalloc_array() with %GFP_KERNEL as the allocation flags.
 */
void *vmalloc_array_noprof(size_t n, size_t size)
{
        const gfp_t gfp = GFP_KERNEL;

        return __vmalloc_array_noprof(n, size, gfp);
}
EXPORT_SYMBOL(vmalloc_array_noprof);

/**
 * __vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Same as __vmalloc_array() with %__GFP_ZERO added to @flags.
 */
void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags)
{
        const gfp_t zeroing = flags | __GFP_ZERO;

        return __vmalloc_array_noprof(n, size, zeroing);
}
EXPORT_SYMBOL(__vcalloc_noprof);

/**
 * vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 *
 * Same as __vmalloc_array() with %GFP_KERNEL | %__GFP_ZERO as the flags.
 */
void *vcalloc_noprof(size_t n, size_t size)
{
        const gfp_t gfp = GFP_KERNEL | __GFP_ZERO;

        return __vmalloc_array_noprof(n, size, gfp);
}
EXPORT_SYMBOL(vcalloc_noprof);

/*
 * Decode the anon_vma stored in folio->mapping.
 *
 * Returns NULL unless the PAGE_MAPPING_FLAGS bits of the mapping word are
 * exactly PAGE_MAPPING_ANON; otherwise returns the mapping word with the
 * PAGE_MAPPING_ANON tag subtracted.
 */
struct anon_vma *folio_anon_vma(struct folio *folio)
{
        unsigned long raw = (unsigned long)folio->mapping;

        if ((raw & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_ANON)
                return (void *)(raw - PAGE_MAPPING_ANON);

        return NULL;
}

/**
 * folio_mapping - Find the mapping where this folio is stored.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the mapping that this
 * page belongs to.  Folios in the swap cache return the swap mapping
 * this page is stored in (which is different from the mapping for the
 * swap file or swap device where the data is stored).
 *
 * You can call this for folios which aren't in the swap cache or page
 * cache and it will return NULL.  NULL is also returned for slab folios
 * and for folios whose mapping word has any PAGE_MAPPING_FLAGS bit set.
 */
struct address_space *folio_mapping(struct folio *folio)
{
        struct address_space *as;

        /* This happens if someone calls flush_dcache_page on slab page */
        if (unlikely(folio_test_slab(folio)))
                return NULL;

        if (unlikely(folio_test_swapcache(folio)))
                return swap_address_space(folio->swap);

        as = folio->mapping;

        /* A tagged mapping word is not a plain address_space pointer. */
        return ((unsigned long)as & PAGE_MAPPING_FLAGS) ? NULL : as;
}
EXPORT_SYMBOL(folio_mapping);

/**
 * folio_copy - Copy the contents of one folio to another.
 * @dst: Folio to copy to.
 * @src: Folio to copy from.
 *
 * The bytes in the folio represented by @src are copied to @dst.
 * Assumes the caller has validated that @dst is at least as large as @src.
 * Can be called in atomic context for order-0 folios, but if the folio is
 * larger, it may sleep.
 */
void folio_copy(struct folio *dst, struct folio *src)
{
        long i = 0;
        long nr = folio_nr_pages(src);

        for (;;) {
                copy_highpage(folio_page(dst, i), folio_page(src, i));
                if (++i == nr)
                        break;
                cond_resched();
        }
}
EXPORT_SYMBOL(folio_copy);

/* Overcommit policy checked by __vm_enough_memory(); starts as OVERCOMMIT_GUESS. */
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
/* Percentage applied by vm_commit_limit() when sysctl_overcommit_kbytes is 0. */
int sysctl_overcommit_ratio __read_mostly = 50;
/* Absolute commit limit in kbytes; when non-zero it is used instead of the ratio. */
unsigned long sysctl_overcommit_kbytes __read_mostly;
/* NOTE(review): presumably the per-mm cap on the number of mappings - not used in this chunk, confirm. */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
/* Upper bound, in kbytes, of the per-mm reserve subtracted in __vm_enough_memory(). */
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
/* Reserve, in kbytes, subtracted in __vm_enough_memory() for callers without cap_sys_admin. */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */

/*
 * sysctl handler for the overcommit ratio.
 *
 * Defers the actual parsing/formatting to proc_dointvec().  A successful
 * write additionally clears sysctl_overcommit_kbytes, so that only one of
 * the ratio and the kbytes limit is in effect.
 *
 * Returns the result of proc_dointvec().
 */
int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        int err = proc_dointvec(table, write, buffer, lenp, ppos);

        if (write && err == 0)
                sysctl_overcommit_kbytes = 0;

        return err;
}

/*
 * Work callback (the work_struct argument is unused) that calls
 * percpu_counter_sync() on vm_committed_as.  overcommit_policy_handler()
 * runs it on every CPU via schedule_on_each_cpu().
 */
static void sync_overcommit_as(struct work_struct *dummy)
{
        percpu_counter_sync(&vm_committed_as);
}

/*
 * sysctl handler for the overcommit policy.
 *
 * Reads are passed straight to proc_dointvec_minmax().  Writes are parsed
 * into a local variable through a copy of @table, so that the global policy
 * is only updated once the rest of the switch has been carried out.
 *
 * Returns the result of proc_dointvec_minmax().
 */
int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        struct ctl_table shadow;
        int requested = -1;
        int err;

        if (!write)
                return proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        /*
         * The per-CPU deviation of vm_committed_as could be big with a loose
         * policy like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing the
         * policy to strict OVERCOMMIT_NEVER, we need to reduce the deviation
         * to comply with the strict "NEVER", and to avoid a possible race
         * condition (even though users usually won't switch to
         * OVERCOMMIT_NEVER too frequently), the switch is done in the
         * following order:
         *        1. changing the batch
         *        2. sync percpu count on each CPU
         *        3. switch the policy
         */
        shadow = *table;
        shadow.data = &requested;
        err = proc_dointvec_minmax(&shadow, write, buffer, lenp, ppos);

        /* Bail out on a parse error or when no value was stored. */
        if (err || requested == -1)
                return err;

        mm_compute_batch(requested);
        if (requested == OVERCOMMIT_NEVER)
                schedule_on_each_cpu(sync_overcommit_as);
        sysctl_overcommit_memory = requested;

        return err;
}

/*
 * sysctl handler for the overcommit kbytes limit.
 *
 * Defers the actual parsing/formatting to proc_doulongvec_minmax().  A
 * successful write additionally clears sysctl_overcommit_ratio, so that
 * only one of the ratio and the kbytes limit is in effect.
 *
 * Returns the result of proc_doulongvec_minmax().
 */
int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        int err = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

        if (write && err == 0)
                sysctl_overcommit_ratio = 0;

        return err;
}

/*
 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used.
 *
 * The base is either sysctl_overcommit_kbytes converted to pages, when that
 * is non-zero, or sysctl_overcommit_ratio percent of the RAM pages that are
 * not hugetlb pages.  total_swap_pages is added on top.  Result in pages.
 */
unsigned long vm_commit_limit(void)
{
        unsigned long limit;

        if (!sysctl_overcommit_kbytes) {
                limit = ((totalram_pages() - hugetlb_total_pages())
                         * sysctl_overcommit_ratio / 100);
        } else {
                limit = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
        }

        return limit + total_swap_pages;
}

/*
 * Make sure vm_committed_as is in one cacheline and does not share a
 * cacheline with other variables. It can be updated by several CPUs
 * frequently.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;

/*
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 *
 * The value is the positive sum of the vm_committed_as percpu counter.
 *
 * The time cost of this is very low for small platforms, and for a big
 * platform like a 2S/36C/72T Skylake server, in the worst case where
 * vm_committed_as's spinlock is under severe contention, the time cost
 * could be about 30~40 microseconds.
 */
unsigned long vm_memory_committed(void)
{
        unsigned long committed = percpu_counter_sum_positive(&vm_committed_as);

        return committed;
}
EXPORT_SYMBOL_GPL(vm_memory_committed);

/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * @mm:            mm whose total_vm bounds the per-process reserve; may be
 *                 NULL, in which case that reserve is not applied.
 * @pages:         number of pages being requested.
 * @cap_sys_admin: 1 if the process has admin privileges, 0 otherwise.
 *
 * @pages is accounted with vm_acct_memory() before any check is made and
 * stays accounted on success; the failure path reverts it with
 * vm_unacct_memory() and emits a rate-limited warning.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/mm/overcommit-accounting.rst
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
        long allowed;
        unsigned long bytes_failed;

        /* Account first; undone at the error label if the request is refused. */
        vm_acct_memory(pages);

        /*
         * Sometimes we want to use more memory than we have
         */
        if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
                return 0;

        /* Heuristic mode: only refuse requests larger than RAM plus swap. */
        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
                if (pages > totalram_pages() + total_swap_pages)
                        goto error;
                return 0;
        }

        /* Remaining policy: strict accounting against the commit limit. */
        allowed = vm_commit_limit();
        /*
         * Reserve some for root
         */
        if (!cap_sys_admin)
                allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);

        /*
         * Don't let a single process grow so big a user can't recover
         */
        if (mm) {
                /* Reserve is 1/32 of the mm's total_vm, capped by the sysctl (in pages). */
                long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);

                allowed -= min_t(long, mm->total_vm / 32, reserve);
        }

        /* The counter already includes @pages accounted above. */
        if (percpu_counter_read_positive(&vm_committed_as) < allowed)
                return 0;
error:
        bytes_failed = pages << PAGE_SHIFT;
        pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n",
                            __func__, current->pid, current->comm, bytes_failed);
        vm_unacct_memory(pages);

        return -ENOMEM;
}

/**
 * get_cmdline() - copy the cmdline value to a buffer.
 * @task:     the task whose cmdline value to copy.
 * @buffer:   the buffer to copy to.
 * @buflen:   the length of the buffer. Larger cmdline values are truncated
 *            to this length.
 *
 * Return: the size of the cmdline field copied, or 0 if @task has no mm or
 * its arg_end is not set yet. Note that the copy does not guarantee an
 * ending NUL byte.
 *
 * NOTE(review): the length is tracked in an unsigned int while @buflen is
 * an int, so a negative @buflen would not clamp the copy; callers are
 * expected to pass a non-negative length.
 */
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
        int res = 0;
        unsigned int len;
        struct mm_struct *mm = get_task_mm(task);
        unsigned long arg_start, arg_end, env_start, env_end;
        if (!mm)
                goto out;
        if (!mm->arg_end)
                goto out_mm;        /* Shh! No looking before we're done */

        /* Take a consistent snapshot of the arg/env boundaries. */
        spin_lock(&mm->arg_lock);
        arg_start = mm->arg_start;
        arg_end = mm->arg_end;
        env_start = mm->env_start;
        env_end = mm->env_end;
        spin_unlock(&mm->arg_lock);

        len = arg_end - arg_start;

        /* Truncate to what the caller's buffer can hold. */
        if (len > buflen)
                len = buflen;

        res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);

        /*
         * If the nul at the end of args has been overwritten, then
         * assume application is using setproctitle(3).
         */
        if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
                len = strnlen(buffer, res);
                if (len < res) {
                        /* A NUL was found inside the args: cut the result there. */
                        res = len;
                } else {
                        /*
                         * No NUL at all: keep reading from the environment
                         * area into the remaining buffer space, then cut the
                         * result at the first NUL, if any.
                         */
                        len = env_end - env_start;
                        if (len > buflen - res)
                                len = buflen - res;
                        res += access_process_vm(task, env_start,
                                                 buffer+res, len,
                                                 FOLL_FORCE);
                        res = strnlen(buffer, res);
                }
        }
out_mm:
        /* Drop the reference taken by get_task_mm(). */
        mmput(mm);
out:
        return res;
}

/*
 * Compare the full contents (PAGE_SIZE bytes) of two pages.
 *
 * Both pages are mapped with kmap_local_page() for the duration of the
 * comparison and unmapped in the reverse order.  Returns the memcmp()
 * result.  Declared __weak, so another definition may replace this one.
 */
int __weak memcmp_pages(struct page *page1, struct page *page2)
{
        char *first = kmap_local_page(page1);
        char *second = kmap_local_page(page2);
        int diff = memcmp(first, second, PAGE_SIZE);

        kunmap_local(second);
        kunmap_local(first);

        return diff;
}

#ifdef CONFIG_PRINTK
/**
 * mem_dump_obj - Print available provenance information
 * @object: object for which to find provenance information.
 *
 * This function uses pr_cont(), so that the caller is expected to have
 * printed out whatever preamble is appropriate.  The provenance information
 * depends on the type of object and on how much debugging is enabled.
 * For example, for a slab-cache object, the slab name is printed, and,
 * if available, the return address and stack trace from the allocation
 * and last free path of that object.
 *
 * If neither kmem_dump_obj() nor vmalloc_dump_obj() handles @object, a
 * short classification of the pointer is printed instead.
 */
void mem_dump_obj(void *object)
{
        /* Fallback label when none of the checks below matches. */
        const char *type = "non-paged memory";

        if (kmem_dump_obj(object) || vmalloc_dump_obj(object))
                return;

        if (is_vmalloc_addr(object))
                type = "vmalloc memory";
        else if (virt_addr_valid(object))
                type = "non-slab/vmalloc memory";
        else if (!object)
                type = "NULL pointer";
        else if (object == ZERO_SIZE_PTR)
                type = "zero-size pointer";

        pr_cont(" %s\n", type);
}
EXPORT_SYMBOL_GPL(mem_dump_obj);
#endif

/*
 * A driver might set a page logically offline -- PageOffline() -- and
 * turn the page inaccessible in the hypervisor; after that, access to page
 * content can be fatal.
 *
 * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
 * pages after checking PageOffline(); however, these PFN walkers can race
 * with drivers that set PageOffline().
 *
 * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
 * synchronize with such drivers, achieving that a page cannot be set
 * PageOffline() while frozen. They hold page_offline_rwsem for reading.
 *
 * page_offline_begin()/page_offline_end() is used by drivers that care about
 * such races when setting a page PageOffline(). They hold
 * page_offline_rwsem for writing.
 */
static DECLARE_RWSEM(page_offline_rwsem);

/* Enter a frozen section: take page_offline_rwsem for reading. */
void page_offline_freeze(void)
{
        down_read(&page_offline_rwsem);
}

/* Leave a frozen section: release the read side taken by page_offline_freeze(). */
void page_offline_thaw(void)
{
        up_read(&page_offline_rwsem);
}

/* Start an offlining section: take page_offline_rwsem for writing. */
void page_offline_begin(void)
{
        down_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_begin);

/* End an offlining section: release the write side taken by page_offline_begin(). */
void page_offline_end(void)
{
        up_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_end);
EXPORT_SYMBOL(page_offline_end);

#ifndef flush_dcache_folio
/*
 * Generic flush_dcache_folio(), used when no flush_dcache_folio macro is
 * defined: calls flush_dcache_page() on every page of @folio in index order.
 */
void flush_dcache_folio(struct folio *folio)
{
        const long total = folio_nr_pages(folio);
        long idx = 0;

        while (idx < total) {
                flush_dcache_page(folio_page(folio, idx));
                idx++;
        }
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif



































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HIGHMEM_INTERNAL_H
#define _LINUX_HIGHMEM_INTERNAL_H

/*
 * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft.
 */
#ifdef CONFIG_KMAP_LOCAL
/* Out-of-line kmap_local primitives; their definitions are not in this header. */
void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
void *__kmap_local_page_prot(struct page *page, pgprot_t prot);
void kunmap_local_indexed(const void *vaddr);
void kmap_local_fork(struct task_struct *tsk);
void __kmap_local_sched_out(void);
void __kmap_local_sched_in(void);
/* Debug check: warns via DEBUG_LOCKS_WARN_ON() if current's kmap_ctrl.idx is non-zero. */
static inline void kmap_assert_nomap(void)
{
        DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx);
}
#else
/* Without CONFIG_KMAP_LOCAL these hooks are empty. */
static inline void kmap_local_fork(struct task_struct *tsk) { }
static inline void kmap_assert_nomap(void) { }
#endif

#ifdef CONFIG_HIGHMEM
#include <asm/highmem.h>

/* Default no-op, used unless the architecture defines ARCH_HAS_KMAP_FLUSH_TLB. */
#ifndef ARCH_HAS_KMAP_FLUSH_TLB
static inline void kmap_flush_tlb(unsigned long addr) { }
#endif

/* Default protection for kmap mappings unless kmap_prot is already defined. */
#ifndef kmap_prot
#define kmap_prot PAGE_KERNEL
#endif

/* Out-of-line highmem helpers wrapped by the inlines below. */
void *kmap_high(struct page *page);
void kunmap_high(struct page *page);
void __kmap_flush_unused(void);
struct page *__kmap_to_page(void *addr);

/*
 * Map @page and return its kernel virtual address (CONFIG_HIGHMEM variant).
 *
 * May sleep.  Pages that are not highmem are resolved with page_address();
 * highmem pages go through kmap_high().  kmap_flush_tlb() is called on the
 * resulting address in both cases.
 */
static inline void *kmap(struct page *page)
{
        void *vaddr;

        might_sleep();
        vaddr = PageHighMem(page) ? kmap_high(page) : page_address(page);
        kmap_flush_tlb((unsigned long)vaddr);

        return vaddr;
}

/*
 * Undo kmap() on @page (CONFIG_HIGHMEM variant).  May sleep.  Only highmem
 * pages need work, which is delegated to kunmap_high().
 */
static inline void kunmap(struct page *page)
{
        might_sleep();
        if (PageHighMem(page))
                kunmap_high(page);
}

/* Resolve a mapped address to its page via the out-of-line __kmap_to_page(). */
static inline struct page *kmap_to_page(void *addr)
{
        return __kmap_to_page(addr);
}

/* Thin wrapper around the out-of-line __kmap_flush_unused(). */
static inline void kmap_flush_unused(void)
{
        __kmap_flush_unused();
}

/* Create a local mapping of @page using the default kmap_prot protection. */
static inline void *kmap_local_page(struct page *page)
{
        return __kmap_local_page_prot(page, kmap_prot);
}

/*
 * Locally map the page of @folio that contains byte @offset and return a
 * pointer to that byte inside the mapping.
 */
static inline void *kmap_local_folio(struct folio *folio, size_t offset)
{
        struct page *page = folio_page(folio, offset / PAGE_SIZE);
        return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE;
}

/* Create a local mapping of @page with the caller-supplied protection @prot. */
static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
{
        return __kmap_local_page_prot(page, prot);
}

/* Create a local mapping of page frame @pfn using kmap_prot. */
static inline void *kmap_local_pfn(unsigned long pfn)
{
        return __kmap_local_pfn_prot(pfn, kmap_prot);
}

/* Backend of the kunmap_local() macro: hands @vaddr to kunmap_local_indexed(). */
static inline void __kunmap_local(const void *vaddr)
{
        kunmap_local_indexed(vaddr);
}

/*
 * Atomically map @page with protection @prot (CONFIG_HIGHMEM variant).
 *
 * Disables preemption, or only migration when CONFIG_PREEMPT_RT is enabled,
 * then disables pagefaults before creating the local mapping.  Undone by
 * __kunmap_atomic().
 */
static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        else
                migrate_disable();
        pagefault_disable();

        return __kmap_local_page_prot(page, prot);
}

/* Atomically map @page with the default kmap_prot; see kmap_atomic_prot(). */
static inline void *kmap_atomic(struct page *page)
{
        return kmap_atomic_prot(page, kmap_prot);
}

/*
 * Atomically map page frame @pfn using kmap_prot (CONFIG_HIGHMEM variant).
 *
 * Disables preemption, or only migration when CONFIG_PREEMPT_RT is enabled,
 * then disables pagefaults before creating the local mapping.
 */
static inline void *kmap_atomic_pfn(unsigned long pfn)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        else
                migrate_disable();
        pagefault_disable();

        return __kmap_local_pfn_prot(pfn, kmap_prot);
}

/*
 * Backend of the kunmap_atomic() macro (CONFIG_HIGHMEM variant).
 *
 * Removes the local mapping for @addr, then re-enables pagefaults and
 * finally preemption, or migration when CONFIG_PREEMPT_RT is enabled.
 */
static inline void __kunmap_atomic(const void *addr)
{
        kunmap_local_indexed(addr);
        pagefault_enable();
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable();
        else
                migrate_enable();
}

/* Out-of-line implementation behind nr_free_highpages(). */
unsigned int __nr_free_highpages(void);
/* Counter read by totalhigh_pages() and adjusted by totalhigh_pages_add(). */
extern atomic_long_t _totalhigh_pages;

/* Thin wrapper around the out-of-line __nr_free_highpages(). */
static inline unsigned int nr_free_highpages(void)
{
        return __nr_free_highpages();
}

/* Current value of _totalhigh_pages, read atomically. */
static inline unsigned long totalhigh_pages(void)
{
        return (unsigned long)atomic_long_read(&_totalhigh_pages);
}

/* Atomically add @count (which may be negative) to _totalhigh_pages. */
static inline void totalhigh_pages_add(long count)
{
        atomic_long_add(count, &_totalhigh_pages);
}

/*
 * Tell whether @x lies in one of the two kmap address windows:
 * [PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)) or
 * [__fix_to_virt(FIX_KMAP_END), __fix_to_virt(FIX_KMAP_BEGIN)).
 */
static inline bool is_kmap_addr(const void *x)
{
        unsigned long addr = (unsigned long)x;

        if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP))
                return true;

        return addr >= __fix_to_virt(FIX_KMAP_END) &&
               addr < __fix_to_virt(FIX_KMAP_BEGIN);
}
#else /* CONFIG_HIGHMEM */

/* !CONFIG_HIGHMEM: the page is found directly with virt_to_page(). */
static inline struct page *kmap_to_page(void *addr)
{
        return virt_to_page(addr);
}

/* !CONFIG_HIGHMEM: no mapping is created; returns page_address(). May sleep. */
static inline void *kmap(struct page *page)
{
        might_sleep();
        return page_address(page);
}

/* !CONFIG_HIGHMEM: nothing to unmap or flush. */
static inline void kunmap_high(struct page *page) { }
static inline void kmap_flush_unused(void) { }

/*
 * !CONFIG_HIGHMEM: only does work when ARCH_HAS_FLUSH_ON_KUNMAP is defined,
 * in which case kunmap_flush_on_unmap() is called on the page's address.
 */
static inline void kunmap(struct page *page)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
        kunmap_flush_on_unmap(page_address(page));
#endif
}

/* !CONFIG_HIGHMEM: a "local mapping" is simply the page's page_address(). */
static inline void *kmap_local_page(struct page *page)
{
        return page_address(page);
}

/* !CONFIG_HIGHMEM: address of the folio's first page plus @offset bytes. */
static inline void *kmap_local_folio(struct folio *folio, size_t offset)
{
        return page_address(&folio->page) + offset;
}

/* !CONFIG_HIGHMEM: @prot is ignored; same as kmap_local_page(). */
static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
{
        return kmap_local_page(page);
}

/* !CONFIG_HIGHMEM: converts @pfn with pfn_to_page() and maps that page. */
static inline void *kmap_local_pfn(unsigned long pfn)
{
        return kmap_local_page(pfn_to_page(pfn));
}

/*
 * !CONFIG_HIGHMEM backend of kunmap_local(): only does work when
 * ARCH_HAS_FLUSH_ON_KUNMAP is defined, in which case kunmap_flush_on_unmap()
 * is called on @addr rounded down to a page boundary.
 */
static inline void __kunmap_local(const void *addr)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
        kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE));
#endif
}

/*
 * Atomically "map" @page (!CONFIG_HIGHMEM variant).
 *
 * No mapping is created; the function disables preemption, or only
 * migration when CONFIG_PREEMPT_RT is enabled, then disables pagefaults
 * and returns page_address() of @page.
 */
static inline void *kmap_atomic(struct page *page)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        else
                migrate_disable();
        pagefault_disable();

        return page_address(page);
}

/* !CONFIG_HIGHMEM: @prot is ignored; same as kmap_atomic(). */
static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
        return kmap_atomic(page);
}

/* !CONFIG_HIGHMEM: converts @pfn with pfn_to_page() and calls kmap_atomic(). */
static inline void *kmap_atomic_pfn(unsigned long pfn)
{
        return kmap_atomic(pfn_to_page(pfn));
}

/*
 * Backend of the kunmap_atomic() macro (!CONFIG_HIGHMEM variant).
 *
 * When ARCH_HAS_FLUSH_ON_KUNMAP is defined, kunmap_flush_on_unmap() is
 * called on @addr rounded down to a page boundary.  Then pagefaults are
 * re-enabled, followed by preemption, or migration when CONFIG_PREEMPT_RT
 * is enabled.
 */
static inline void __kunmap_atomic(const void *addr)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
        kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE));
#endif
        pagefault_enable();
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable();
        else
                migrate_enable();
}

/* !CONFIG_HIGHMEM: the number of free highmem pages is always zero. */
static inline unsigned int nr_free_highpages(void)
{
        return 0;
}

/* !CONFIG_HIGHMEM: the total number of highmem pages is always zero. */
static inline unsigned long totalhigh_pages(void)
{
        return 0UL;
}

/* !CONFIG_HIGHMEM: no address is ever reported as a kmap address. */
static inline bool is_kmap_addr(const void *x)
{
        return false;
}

#endif /* CONFIG_HIGHMEM */

/**
 * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() - deprecated!
 * @__addr:       Virtual address to be unmapped
 *
 * Unmaps an address previously mapped by kmap_atomic() and re-enables
 * pagefaults. Depending on the PREEMPT_RT configuration, also re-enables
 * migration or preemption. Users should not count on these side effects.
 *
 * Mappings should be unmapped in the reverse order that they were mapped.
 * See kmap_local_page() for details on nesting.
 *
 * @__addr can be any address within the mapped page, so there is no need
 * to subtract any offset that has been added. In contrast to kunmap(),
 * this function takes the address returned from kmap_atomic(), not the
 * page passed to it. The build fails (BUILD_BUG_ON) if you pass the page.
 */
#define kunmap_atomic(__addr)                                        \
do {                                                                \
        BUILD_BUG_ON(__same_type((__addr), struct page *));        \
        __kunmap_atomic(__addr);                                \
} while (0)

/**
 * kunmap_local - Unmap a page mapped via kmap_local_page().
 * @__addr: An address within the page mapped
 *
 * @__addr can be any address within the mapped page.  Commonly it is the
 * address returned from kmap_local_page(), but it can also include offsets.
 * Passing a struct page pointer instead of an address fails the build
 * (BUILD_BUG_ON).
 *
 * Unmapping should be done in the reverse order of the mapping.  See
 * kmap_local_page() for details.
 */
#define kunmap_local(__addr)                                        \
do {                                                                \
        BUILD_BUG_ON(__same_type((__addr), struct page *));        \
        __kunmap_local(__addr);                                        \
} while (0)

#endif


































    1 












    3 
   19 
































    4 

    5 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Wrapper functions for accessing the file_struct fd array.
 */

#ifndef __LINUX_FILE_H
#define __LINUX_FILE_H

#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/posix_types.h>
#include <linux/errno.h>
#include <linux/cleanup.h>

/* Opaque here; only pointers to struct file are used in this header. */
struct file;

/* Releases a file reference; called by fput_light() and fdput() below. */
extern void fput(struct file *);

/* Forward declarations for types only used through pointers below. */
struct file_operations;
struct task_struct;
struct vfsmount;
struct dentry;
struct inode;
struct path;
/*
 * struct file constructors, defined elsewhere.
 * NOTE(review): the unnamed const char * is presumably the file's name - confirm.
 */
extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *,
        const char *, int flags, const struct file_operations *);
extern struct file *alloc_file_pseudo_noaccount(struct inode *, struct vfsmount *,
        const char *, int flags, const struct file_operations *);
extern struct file *alloc_file_clone(struct file *, int flags,
        const struct file_operations *);

/*
 * Conditionally release a file reference: fput() is called on @file only
 * when @fput_needed is non-zero.
 */
static inline void fput_light(struct file *file, int fput_needed)
{
        if (!fput_needed)
                return;

        fput(file);
}

/*
 * Result of an fdget*() lookup: the file plus flags telling the matching
 * fdput*() call what needs to be undone.
 */
struct fd {
        struct file *file;
        unsigned int flags;     /* combination of the FDPUT_* bits below */
};
/* fdput() must call fput() on the file. */
#define FDPUT_FPUT       1
/* fdput_pos() must call __f_unlock_pos() on the file. */
#define FDPUT_POS_UNLOCK 2

/*
 * Finish using an fd obtained from fdget()/fdget_raw(): the file reference
 * is dropped with fput() only when FDPUT_FPUT is set in @fd.flags.
 */
static inline void fdput(struct fd fd)
{
        if (!(fd.flags & FDPUT_FPUT))
                return;

        fput(fd.file);
}

/* File lookups by descriptor number, defined elsewhere. */
extern struct file *fget(unsigned int fd);
extern struct file *fget_raw(unsigned int fd);
extern struct file *fget_task(struct task_struct *task, unsigned int fd);
/*
 * Backends of fdget(), fdget_raw() and fdget_pos(): the returned word is
 * decoded by __to_fd(), i.e. a file pointer with FDPUT_* flags in its two
 * low bits.
 */
extern unsigned long __fdget(unsigned int fd);
extern unsigned long __fdget_raw(unsigned int fd);
extern unsigned long __fdget_pos(unsigned int fd);
/* Called by fdput_pos() when FDPUT_POS_UNLOCK is set. */
extern void __f_unlock_pos(struct file *);

static inline struct fd __to_fd(unsigned long v)
{
        return (struct fd){(struct file *)(v & ~3),v & 3};
}

/* Look up @fd through __fdget() and decode the result into a struct fd. */
static inline struct fd fdget(unsigned int fd)
{
        return __to_fd(__fdget(fd));
}

/* Same as fdget(), but the lookup goes through __fdget_raw(). */
static inline struct fd fdget_raw(unsigned int fd)
{
        return __to_fd(__fdget_raw(fd));
}

/*
 * Same as fdget(), but the lookup goes through __fdget_pos(); release the
 * result with fdput_pos().  Note that @fd is a signed int here and is
 * converted to the unsigned int that __fdget_pos() takes.
 */
static inline struct fd fdget_pos(int fd)
{
        return __to_fd(__fdget_pos(fd));
}

/*
 * Counterpart of fdget_pos(): calls __f_unlock_pos() on the file first when
 * FDPUT_POS_UNLOCK is set in @f.flags, then finishes with fdput().
 */
static inline void fdput_pos(struct fd f)
{
        unsigned int need_unlock = f.flags & FDPUT_POS_UNLOCK;

        if (need_unlock)
                __f_unlock_pos(f.file);

        fdput(f);
}

/*
 * Cleanup classes (see <linux/cleanup.h>) for struct fd: the value is
 * obtained with fdget()/fdget_raw() and released with fdput().
 */
DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd)
DEFINE_CLASS(fd_raw, struct fd, fdput(_T), fdget_raw(fd), int fd)

/* Descriptor-table helpers, defined elsewhere. */
extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
extern void set_close_on_exec(unsigned int fd, int flag);
extern bool get_close_on_exec(unsigned int fd);
extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile);
extern int get_unused_fd_flags(unsigned flags);
extern void put_unused_fd(unsigned int fd);

/*
 * Cleanup class (see <linux/cleanup.h>) for a reserved descriptor number:
 * obtained with get_unused_fd_flags() and, if it is non-negative, given
 * back with put_unused_fd().
 */
DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T),
             get_unused_fd_flags(flags), unsigned flags)

extern void fd_install(unsigned int fd, struct file *file);

/* NOTE(review): presumably install a received @file as a new descriptor - confirm. */
int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags);

int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags);

extern void flush_delayed_fput(void);
extern void __fput_sync(struct file *);

/* NOTE(review): presumably the bounds of the nr_open sysctl - confirm. */
extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;

#endif /* __LINUX_FILE_H */



















   18 





   20 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * printk_safe.c - Safe printk for printk-deadlock-prone contexts
 */

#include <linux/preempt.h>
#include <linux/kdb.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/printk.h>
#include <linux/kprobes.h>

#include "internal.h"

/*
 * Per-CPU counter: incremented by __printk_safe_enter(), decremented by
 * __printk_safe_exit(). vprintk() takes the deferred path while it is
 * non-zero on the current CPU.
 */
static DEFINE_PER_CPU(int, printk_context);

/*
 * Enter a printk-safe section: bump this CPU's printk_context so that
 * vprintk() defers its output until the matching __printk_safe_exit().
 *
 * Can be preempted by NMI.
 */
void __printk_safe_enter(void)
{
        this_cpu_inc(printk_context);
}

/*
 * Leave a printk-safe section: drop this CPU's printk_context by one,
 * undoing a previous __printk_safe_enter().
 *
 * Can be preempted by NMI.
 */
void __printk_safe_exit(void)
{
        this_cpu_dec(printk_context);
}

/*
 * vprintk - pick the printk backend for the current context
 * @fmt:  format string
 * @args: arguments for @fmt
 *
 * With CONFIG_KGDB_KDB, messages go to vkdb_printf() while kdb traps printk
 * and kdb_printf_cpu is negative. Otherwise the message is handed to
 * vprintk_deferred() inside a printk-safe section or in NMI context, and to
 * vprintk_default() in every other case. Returns the backend's result.
 */
asmlinkage int vprintk(const char *fmt, va_list args)
{
        bool must_defer;

#ifdef CONFIG_KGDB_KDB
        /* Allow to pass printk() to kdb but avoid a recursion. */
        if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
                return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
#endif

        /*
         * The main logbuf is used even in NMI, but console drivers that
         * might have their own locks must not be called from here.
         */
        must_defer = this_cpu_read(printk_context) || in_nmi();
        if (!must_defer) {
                /* No obstacles. */
                return vprintk_default(fmt, args);
        }

        return vprintk_deferred(fmt, args);
}
EXPORT_SYMBOL(vprintk);

























    2 










    2 

    2 

    2 
    1 





























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Access kernel or user memory without faulting.
 */
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>

/*
 * Default (weak) policy hook consulted by copy_from_kernel_nofault() and
 * strncpy_from_kernel_nofault() before they touch @unsafe_src: this version
 * permits every range. Being __weak, it can be replaced by another
 * definition of the same symbol.
 */
bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src,
                size_t size)
{
        return true;
}

/*
 * Copy whole @type-sized units while at least sizeof(type) bytes remain in
 * @len. @dst, @src and @len are updated in place, so they must be modifiable
 * lvalues. @err_label is passed through to __get_kernel_nofault()
 * (presumably its jump target when an access faults - confirm against the
 * arch definition). Expands to a bare while statement.
 */
#define copy_from_kernel_nofault_loop(dst, src, len, type, err_label)        \
        while (len >= sizeof(type)) {                                        \
                __get_kernel_nofault(dst, src, type, err_label);                \
                dst += sizeof(type);                                        \
                src += sizeof(type);                                        \
                len -= sizeof(type);                                        \
        }

/*
 * copy_from_kernel_nofault - copy @size bytes from @src to @dst with page
 * faults disabled
 *
 * Returns -ERANGE when copy_from_kernel_nofault_allowed() rejects the source
 * range, -EFAULT when the copy bails out through the Efault label, and 0 on
 * success. The widest access size allowed by the combined alignment of @dst
 * and @src is tried first; without CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 * misaligned pointers fall back to narrower accesses.
 */
long copy_from_kernel_nofault(void *dst, const void *src, size_t size)
{
        unsigned long misalign;
        long err = 0;

        if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
                misalign = 0;
        else
                misalign = (unsigned long)dst | (unsigned long)src;

        if (!copy_from_kernel_nofault_allowed(src, size))
                return -ERANGE;

        pagefault_disable();
        if ((misalign & 7) == 0)
                copy_from_kernel_nofault_loop(dst, src, size, u64, Efault);
        if ((misalign & 3) == 0)
                copy_from_kernel_nofault_loop(dst, src, size, u32, Efault);
        if ((misalign & 1) == 0)
                copy_from_kernel_nofault_loop(dst, src, size, u16, Efault);
        /* Whatever is left is copied one byte at a time. */
        copy_from_kernel_nofault_loop(dst, src, size, u8, Efault);
        goto out;

Efault:
        err = -EFAULT;
out:
        pagefault_enable();
        return err;
}
EXPORT_SYMBOL_GPL(copy_from_kernel_nofault);

/*
 * Write-side counterpart of copy_from_kernel_nofault_loop(): store whole
 * @type-sized units through __put_kernel_nofault() while at least
 * sizeof(type) bytes remain in @len. @dst, @src and @len are updated in
 * place, so they must be modifiable lvalues. @err_label is passed through to
 * __put_kernel_nofault() (presumably its jump target on a faulting access -
 * confirm against the arch definition). Expands to a bare while statement.
 */
#define copy_to_kernel_nofault_loop(dst, src, len, type, err_label)        \
        while (len >= sizeof(type)) {                                        \
                __put_kernel_nofault(dst, src, type, err_label);                \
                dst += sizeof(type);                                        \
                src += sizeof(type);                                        \
                len -= sizeof(type);                                        \
        }

/*
 * copy_to_kernel_nofault - copy @size bytes from @src to @dst with page
 * faults disabled
 *
 * Returns 0 on success and -EFAULT when the copy bails out through the
 * Efault label. The widest access size allowed by the combined alignment of
 * @dst and @src is tried first; without
 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS misaligned pointers fall back to
 * narrower accesses.
 */
long copy_to_kernel_nofault(void *dst, const void *src, size_t size)
{
        unsigned long misalign;
        long err = 0;

        if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
                misalign = 0;
        else
                misalign = (unsigned long)dst | (unsigned long)src;

        pagefault_disable();
        if ((misalign & 7) == 0)
                copy_to_kernel_nofault_loop(dst, src, size, u64, Efault);
        if ((misalign & 3) == 0)
                copy_to_kernel_nofault_loop(dst, src, size, u32, Efault);
        if ((misalign & 1) == 0)
                copy_to_kernel_nofault_loop(dst, src, size, u16, Efault);
        /* Whatever is left is copied one byte at a time. */
        copy_to_kernel_nofault_loop(dst, src, size, u8, Efault);
        goto out;

Efault:
        err = -EFAULT;
out:
        pagefault_enable();
        return err;
}

/*
 * strncpy_from_kernel_nofault - copy a NUL-terminated string from
 * @unsafe_addr into @dst with page faults disabled
 *
 * At most @count bytes are read, one byte at a time. Returns 0 when @count
 * is not positive, -ERANGE when copy_from_kernel_nofault_allowed() rejects
 * the source, and -EFAULT (with a NUL stored at the current write position)
 * when the copy bails out through the Efault label. Otherwise the last byte
 * written is forced to NUL and the number of bytes consumed from the source
 * is returned.
 */
long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
{
        const void *cur = unsafe_addr;

        if (unlikely(count <= 0))
                return 0;
        if (!copy_from_kernel_nofault_allowed(unsafe_addr, count))
                return -ERANGE;

        pagefault_disable();
        for (;;) {
                __get_kernel_nofault(dst, cur, u8, Efault);
                dst++;
                cur++;

                /* Stop after the terminator or once @count bytes were read. */
                if (dst[-1] == '\0')
                        break;
                if (cur - unsafe_addr >= count)
                        break;
        }
        pagefault_enable();

        /* Terminates the output even when the @count limit cut it short. */
        dst[-1] = '\0';
        return cur - unsafe_addr;

Efault:
        pagefault_enable();
        dst[0] = '\0';
        return -EFAULT;
}

/**
 * copy_from_user_nofault(): read from a user-space address without faulting
 * @dst: kernel buffer that receives the data
 * @src: address to read from. This must be a user address.
 * @size: number of bytes to read
 *
 * Reads @size bytes from user address @src into @dst with page faults
 * disabled. Returns 0 on success and -EFAULT when @src fails __access_ok(),
 * when nmi_uaccess_okay() says no, or when the copy itself reports a
 * non-zero result.
 */
long copy_from_user_nofault(void *dst, const void __user *src, size_t size)
{
        long rc;

        if (!__access_ok(src, size) || !nmi_uaccess_okay())
                return -EFAULT;

        pagefault_disable();
        rc = __copy_from_user_inatomic(dst, src, size);
        pagefault_enable();

        return rc ? -EFAULT : 0;
}
EXPORT_SYMBOL_GPL(copy_from_user_nofault);

/**
 * copy_to_user_nofault(): write to a user-space address without faulting
 * @dst: address to write to
 * @src: kernel buffer holding the data to write
 * @size: number of bytes to write
 *
 * Writes @size bytes from @src to @dst with page faults disabled. Returns 0
 * on success and -EFAULT when @dst fails access_ok() or the copy itself
 * reports a non-zero result.
 */
long copy_to_user_nofault(void __user *dst, const void *src, size_t size)
{
        long rc;

        if (!access_ok(dst, size))
                return -EFAULT;

        pagefault_disable();
        rc = __copy_to_user_inatomic(dst, src, size);
        pagefault_enable();

        return rc ? -EFAULT : 0;
}
EXPORT_SYMBOL_GPL(copy_to_user_nofault);

/**
 * strncpy_from_user_nofault: - Copy a NUL terminated string from unsafe user
 *                                address.
 * @dst:   Kernel-space destination. This buffer must be at least @count
 *         bytes long.
 * @unsafe_addr: Unsafe user address.
 * @count: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Copies a NUL-terminated string from an unsafe user address into a kernel
 * buffer with page faults disabled.
 *
 * On success, returns the length of the string INCLUDING the trailing NUL.
 *
 * If access fails, returns -EFAULT (some data may have been copied
 * and the trailing NUL added).
 *
 * If @count is smaller than the length of the string, copies @count-1 bytes,
 * sets the last byte of @dst buffer to NUL and returns @count.
 *
 * Returns 0 without touching @dst when @count is not positive.
 */
long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
                              long count)
{
        long copied;

        if (unlikely(count <= 0))
                return 0;

        pagefault_disable();
        copied = strncpy_from_user(dst, unsafe_addr, count);
        pagefault_enable();

        /* Limit reached: force a terminator into the last slot of @dst. */
        if (copied >= count) {
                dst[count - 1] = '\0';
                return count;
        }

        /* Positive results get one added to account for the NUL. */
        return copied > 0 ? copied + 1 : copied;
}

/**
 * strnlen_user_nofault: - Get the size of a user string INCLUDING final NUL.
 * @unsafe_addr: The string to measure.
 * @count: Maximum count (including NUL)
 *
 * Get the size of a NUL-terminated string in user space without pagefault.
 *
 * Returns the size of the string INCLUDING the terminating NUL.
 *
 * If the string is too long, returns a number larger than @count. User
 * has to check the return value against "> count".
 * On exception (or invalid count), returns 0.
 *
 * Unlike strnlen_user, this can be used from IRQ handler etc. because
 * it disables pagefaults.
 */
long strnlen_user_nofault(const void __user *unsafe_addr, long count)
{
        /*
         * Keep the result in a long: this function returns long and @count
         * is a long, so an int temporary could truncate a large result
         * before it reaches the caller.
         */
        long ret;

        pagefault_disable();
        ret = strnlen_user(unsafe_addr, count);
        pagefault_enable();

        return ret;
}

/*
 * Unconditionally emit a WARN reporting a detected buffer overflow.
 * @size and @count are only printed, as "(size < count)".
 * NOTE(review): presumably @size is the object size and @count the
 * requested copy length - confirm against the callers.
 */
void __copy_overflow(int size, unsigned long count)
{
        WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
}
EXPORT_SYMBOL(__copy_overflow);










































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

















    1 








    1 












    1 







    1 





    1 


    1 



    1 
    1 







    1 






    1 



























    1 











































































    1 













































































































    1 



    1 




    1 



































    1 













    1 










    1 






























    1 






















    1 





































































    1 


















    1 








    1 











    1 




    1 




































































    2 








    2 



    2 





















    2 

















    2 














































































    6 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/open.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/string.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fsnotify.h>
#include <linux/module.h>
#include <linux/tty.h>
#include <linux/namei.h>
#include <linux/backing-dev.h>
#include <linux/capability.h>
#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/rcupdate.h>
#include <linux/audit.h>
#include <linux/falloc.h>
#include <linux/fs_struct.h>
#include <linux/dnotify.h>
#include <linux/compat.h>
#include <linux/mnt_idmapping.h>
#include <linux/filelock.h>

#include "internal.h"

/*
 * do_truncate - set the size of @dentry's inode to @length
 * @idmap:      idmap passed on to the privilege check and notify_change()
 * @dentry:     dentry whose inode is resized
 * @length:     new size; negative values are rejected with -EINVAL
 * @time_attrs: extra ATTR_* bits OR-ed into the request
 * @filp:       optional open file; when set it is recorded in the request
 *              together with ATTR_FILE
 *
 * The attribute change is applied through notify_change() with the inode
 * lock held. Returns 0 or a negative errno.
 */
int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry,
                loff_t length, unsigned int time_attrs, struct file *filp)
{
        struct iattr attr;
        int killpriv;
        int err;

        /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
        if (length < 0)
                return -EINVAL;

        attr.ia_valid = ATTR_SIZE | time_attrs;
        attr.ia_size = length;
        if (filp) {
                attr.ia_valid |= ATTR_FILE;
                attr.ia_file = filp;
        }

        /* Remove suid, sgid, and file capabilities on truncate too */
        killpriv = dentry_needs_remove_privs(idmap, dentry);
        if (killpriv < 0)
                return killpriv;
        if (killpriv > 0)
                attr.ia_valid |= killpriv | ATTR_FORCE;

        inode_lock(dentry->d_inode);
        /* Note any delegations or leases have already been broken: */
        err = notify_change(idmap, dentry, &attr, NULL);
        inode_unlock(dentry->d_inode);

        return err;
}

/*
 * vfs_truncate - truncate the regular file at @path to @length bytes
 *
 * Returns -EISDIR for directories and -EINVAL for any other non-regular
 * file. Otherwise takes mount write access, checks MAY_WRITE permission,
 * refuses append-only inodes with -EPERM, takes inode write access, breaks
 * leases, asks the security hook and finally calls do_truncate(). Every
 * resource taken is released again before returning. Returns 0 or a
 * negative errno.
 */
long vfs_truncate(const struct path *path, loff_t length)
{
        struct inode *inode = path->dentry->d_inode;
        struct mnt_idmap *idmap;
        long error;

        /* For directories it's -EISDIR, for other non-regulars - -EINVAL */
        if (S_ISDIR(inode->i_mode))
                return -EISDIR;
        if (!S_ISREG(inode->i_mode))
                return -EINVAL;

        error = mnt_want_write(path->mnt);
        if (error)
                return error;

        idmap = mnt_idmap(path->mnt);
        error = inode_permission(idmap, inode, MAY_WRITE);
        if (error)
                goto drop_mnt_write;

        if (IS_APPEND(inode)) {
                error = -EPERM;
                goto drop_mnt_write;
        }

        error = get_write_access(inode);
        if (error)
                goto drop_mnt_write;

        /*
         * Make sure that there are no leases.  get_write_access() protects
         * against the truncate racing with a lease-granting setlease().
         */
        error = break_lease(inode, O_WRONLY);
        if (error)
                goto drop_write_access;

        error = security_path_truncate(path);
        if (error)
                goto drop_write_access;

        error = do_truncate(idmap, path->dentry, length, 0, NULL);

drop_write_access:
        put_write_access(inode);
drop_mnt_write:
        mnt_drop_write(path->mnt);
        return error;
}
EXPORT_SYMBOL_GPL(vfs_truncate);

/*
 * do_sys_truncate - resolve a user-supplied path and truncate it
 *
 * Rejects negative lengths with -EINVAL. The path is looked up relative to
 * AT_FDCWD, following symlinks, and handed to vfs_truncate(). Whenever
 * retry_estale() asks for it, the lookup is repeated with LOOKUP_REVAL
 * added to the lookup flags.
 */
long do_sys_truncate(const char __user *pathname, loff_t length)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW;
        struct path path;
        int error;

        if (length < 0)        /* sorry, but loff_t says... */
                return -EINVAL;

        for (;;) {
                error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
                if (!error) {
                        error = vfs_truncate(&path, length);
                        path_put(&path);
                }

                if (!retry_estale(error, lookup_flags))
                        break;
                lookup_flags |= LOOKUP_REVAL;
        }

        return error;
}

/* truncate system call: forwards @path and @length to do_sys_truncate(). */
SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
{
        return do_sys_truncate(path, length);
}

#ifdef CONFIG_COMPAT
/*
 * Compat variant of the truncate system call: identical forwarding to
 * do_sys_truncate(), but @length arrives as a compat_off_t.
 */
COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
{
        return do_sys_truncate(path, length);
}
#endif

/*
 * do_ftruncate - truncate an already-open file to @length bytes
 * @file:   open file; must refer to a regular file opened for writing
 * @length: new size
 * @small:  non-zero when the caller is limited to MAX_NON_LFS; ignored if
 *          the file was opened with O_LARGEFILE
 *
 * Returns -EINVAL for non-regular or non-writable files and for lengths
 * over the non-LFS limit, -EPERM for append-only inodes, otherwise the
 * result of the security hook or of do_truncate(). The truncate itself
 * runs between sb_start_write() and sb_end_write().
 */
long do_ftruncate(struct file *file, loff_t length, int small)
{
        struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = dentry->d_inode;
        int error;

        /* explicitly opened as large or we are on 64-bit box */
        if (file->f_flags & O_LARGEFILE)
                small = 0;

        if (!S_ISREG(inode->i_mode))
                return -EINVAL;
        if (!(file->f_mode & FMODE_WRITE))
                return -EINVAL;

        /* Cannot ftruncate over 2^31 bytes without large file support */
        if (small && length > MAX_NON_LFS)
                return -EINVAL;

        /* Check IS_APPEND on real upper inode */
        if (IS_APPEND(file_inode(file)))
                return -EPERM;

        sb_start_write(inode->i_sb);
        error = security_file_truncate(file);
        if (error == 0) {
                error = do_truncate(file_mnt_idmap(file), dentry, length,
                                    ATTR_MTIME | ATTR_CTIME, file);
        }
        sb_end_write(inode->i_sb);

        return error;
}

/*
 * do_sys_ftruncate - truncate the file behind descriptor @fd
 *
 * Rejects negative lengths with -EINVAL and unknown descriptors with
 * -EBADF; otherwise returns the result of do_ftruncate(). The descriptor
 * reference taken by fdget() is dropped before returning.
 */
long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
        int error = -EBADF;
        struct fd f;

        if (length < 0)
                return -EINVAL;

        f = fdget(fd);
        if (f.file) {
                error = do_ftruncate(f.file, length, small);
                fdput(f);
        }

        return error;
}

/*
 * ftruncate system call. Passes small=1, so do_ftruncate() applies the
 * MAX_NON_LFS limit unless the file was opened with O_LARGEFILE.
 */
SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
{
        return do_sys_ftruncate(fd, length, 1);
}

#ifdef CONFIG_COMPAT
/*
 * Compat variant of the ftruncate system call: @length arrives as a
 * compat_ulong_t and small=1 is passed, as in the native entry point.
 */
COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
{
        return do_sys_ftruncate(fd, length, 1);
}
#endif

/*
 * LFS versions of truncate are only needed on 32 bit machines.
 * Both take a full loff_t length; ftruncate64 passes small=0, so
 * do_ftruncate() does not apply the MAX_NON_LFS limit.
 */
#if BITS_PER_LONG == 32
SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
{
        return do_sys_truncate(path, length);
}

SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
{
        return do_sys_ftruncate(fd, length, 0);
}
#endif /* BITS_PER_LONG == 32 */

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64)
/*
 * Compat truncate64: the length is declared with compat_arg_u64_dual() and
 * rebuilt with compat_arg_u64_glue() before being passed on.
 * NOTE(review): presumably the 64-bit value arrives as two 32-bit halves;
 * ksys_truncate() is not defined in this file - confirm both in the headers.
 */
COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname,
                       compat_arg_u64_dual(length))
{
        return ksys_truncate(pathname, compat_arg_u64_glue(length));
}
#endif

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64)
/*
 * Compat ftruncate64: the length is declared with compat_arg_u64_dual() and
 * rebuilt with compat_arg_u64_glue() before being passed on.
 * NOTE(review): presumably the 64-bit value arrives as two 32-bit halves;
 * ksys_ftruncate() is not defined in this file - confirm both in the
 * headers.
 */
COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd,
                       compat_arg_u64_dual(length))
{
        return ksys_ftruncate(fd, compat_arg_u64_glue(length));
}
#endif

/*
 * vfs_fallocate - validate a fallocate request and pass it to the filesystem
 * @file:   open file to operate on
 * @mode:   FALLOC_FL_* flags
 * @offset: start of the range; must not be negative
 * @len:    length of the range; must be positive
 *
 * The checks run in a fixed order, and the first one that fails decides the
 * errno: argument sanity and flag combinations first, then the state of the
 * file and inode, then the security and fsnotify permission hooks, then the
 * file type and the range limit, and last whether the file provides a
 * ->fallocate method. On success a modify notification is generated.
 */
int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
        const int punch_and_zero = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE;
        struct inode *inode = file_inode(file);
        loff_t end;
        long ret;

        if (offset < 0 || len <= 0)
                return -EINVAL;

        /* Return error if mode is not supported */
        if ((mode & ~FALLOC_FL_SUPPORTED_MASK) != 0)
                return -EOPNOTSUPP;

        /* Punch hole and zero range are mutually exclusive */
        if ((mode & punch_and_zero) == punch_and_zero)
                return -EOPNOTSUPP;

        /* Punch hole must have keep size set */
        if ((mode & FALLOC_FL_PUNCH_HOLE) != 0 &&
            (mode & FALLOC_FL_KEEP_SIZE) == 0)
                return -EOPNOTSUPP;

        /*
         * Collapse range and insert range may only be used on their own;
         * unshare range may only be combined with keep size.
         */
        if (((mode & FALLOC_FL_COLLAPSE_RANGE) &&
             (mode & ~FALLOC_FL_COLLAPSE_RANGE)) ||
            ((mode & FALLOC_FL_INSERT_RANGE) &&
             (mode & ~FALLOC_FL_INSERT_RANGE)) ||
            ((mode & FALLOC_FL_UNSHARE_RANGE) &&
             (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE))))
                return -EINVAL;

        if ((file->f_mode & FMODE_WRITE) == 0)
                return -EBADF;

        /*
         * We can only allow pure fallocate on append only files, and
         * nothing at all on immutable ones.
         */
        if (((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) ||
            IS_IMMUTABLE(inode))
                return -EPERM;

        /*
         * We cannot allow any fallocate operation on an active swapfile
         */
        if (IS_SWAPFILE(inode))
                return -ETXTBSY;

        /*
         * Revalidate the write permissions, in case security policy has
         * changed since the files were opened.
         */
        ret = security_file_permission(file, MAY_WRITE);
        if (ret != 0)
                return ret;

        ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len);
        if (ret != 0)
                return ret;

        if (S_ISFIFO(inode->i_mode))
                return -ESPIPE;

        if (S_ISDIR(inode->i_mode))
                return -EISDIR;

        if (!(S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)))
                return -ENODEV;

        /* Check for wrap through zero too */
        end = offset + len;
        if (end > inode->i_sb->s_maxbytes || end < 0)
                return -EFBIG;

        if (file->f_op->fallocate == NULL)
                return -EOPNOTSUPP;

        file_start_write(file);
        ret = file->f_op->fallocate(file, mode, offset, len);

        /*
         * Create inotify and fanotify events.
         *
         * To keep the logic simple always create events if fallocate succeeds.
         * This implies that events are even created if the file size remains
         * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
         */
        if (!ret)
                fsnotify_modify(file);

        file_end_write(file);
        return ret;
}
EXPORT_SYMBOL_GPL(vfs_fallocate);

/*
 * ksys_fallocate - fallocate on a file descriptor
 *
 * Returns -EBADF when @fd does not resolve to an open file, otherwise the
 * result of vfs_fallocate(). The reference taken by fdget() is dropped
 * before returning.
 */
int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
{
        struct fd f = fdget(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = vfs_fallocate(f.file, mode, offset, len);
        fdput(f);

        return error;
}

/* fallocate system call: forwards all four arguments to ksys_fallocate(). */
SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
{
        return ksys_fallocate(fd, mode, offset, len);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE)
/*
 * Compat fallocate: @offset and @len are declared with
 * compat_arg_u64_dual() and rebuilt with compat_arg_u64_glue() before
 * calling ksys_fallocate().
 * NOTE(review): presumably each 64-bit value arrives as two 32-bit halves,
 * which would explain the six-argument definition - confirm in the compat
 * headers.
 */
COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset),
                       compat_arg_u64_dual(len))
{
        return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset),
                              compat_arg_u64_glue(len));
}
#endif

/*
 * access() needs to use the real uid/gid, not the effective uid/gid.
 * We do this by temporarily clearing all FS-related capabilities and
 * switching the fsuid/fsgid around to the real ones.
 *
 * Creating new credentials is expensive, so we try to skip doing it,
 * which we can if the result would match what we already got.
 *
 * Returns true when the credentials built by access_override_creds() would
 * differ from the current ones, false when the override can be skipped
 * (always the case with AT_EACCESS).
 */
static bool access_need_override_creds(int flags)
{
        const struct cred *cred;
        kuid_t root_uid;

        if (flags & AT_EACCESS)
                return false;

        cred = current_cred();

        /* The fs ids would be switched to the real ids. */
        if (!uid_eq(cred->fsuid, cred->uid))
                return true;
        if (!gid_eq(cred->fsgid, cred->gid))
                return true;

        /* With setuid fixup disabled the capability sets are left alone. */
        if (issecure(SECURE_NO_SETUID_FIXUP))
                return false;

        root_uid = make_kuid(cred->user_ns, 0);

        /* Root: effective would become a copy of permitted. */
        if (uid_eq(cred->uid, root_uid))
                return !cap_isidentical(cred->cap_effective,
                                        cred->cap_permitted);

        /* Non-root: effective would be cleared. */
        return !cap_isclear(cred->cap_effective);
}

/*
 * Build a copy of the current credentials with fsuid/fsgid set to the real
 * uid/gid and the effective capabilities adjusted, then install it with
 * override_creds().
 *
 * Returns the credentials that were in effect before the override, or NULL
 * when prepare_creds() fails.
 */
static const struct cred *access_override_creds(void)
{
        struct cred *new_cred = prepare_creds();
        const struct cred *saved_cred;

        if (!new_cred)
                return NULL;

        /*
         * XXX access_need_override_creds performs checks in hopes of skipping
         * this work. Make sure it stays in sync if making any changes in this
         * routine.
         */

        new_cred->fsuid = new_cred->uid;
        new_cred->fsgid = new_cred->gid;

        if (!issecure(SECURE_NO_SETUID_FIXUP)) {
                kuid_t root_uid = make_kuid(new_cred->user_ns, 0);

                if (uid_eq(new_cred->uid, root_uid)) {
                        new_cred->cap_effective = new_cred->cap_permitted;
                } else {
                        /* Clear the capabilities if we switch to a non-root user */
                        cap_clear(new_cred->cap_effective);
                }
        }

        /*
         * These credentials are only ever used task-synchronously, so they
         * do not need RCU freeing unless somebody takes a separate
         * reference to them.
         *
         * NOTE! That only holds because the credential is used purely for
         * override_creds(), which installs it as the subjective cred. Other
         * threads access ->real_cred, not the subjective cred.
         *
         * If somebody _does_ make a copy (using 'get_current_cred()'), that
         * clears the non_rcu field, because that other user may then expect
         * RCU freeing. Normal thread-synchronous cred accesses keep things
         * non-racy, so RCU freeing is avoided.
         */
        new_cred->non_rcu = 1;

        saved_cred = override_creds(new_cred);

        /* override_cred() gets its own ref */
        put_cred(new_cred);

        return saved_cred;
}

/*
 * Common implementation of access(2), faccessat(2) and faccessat2(2).
 *
 * @dfd:      directory fd the lookup is relative to
 * @filename: user-space path
 * @mode:     requested access; only bits within S_IRWXO are accepted
 * @flags:    AT_EACCESS, AT_SYMLINK_NOFOLLOW and/or AT_EMPTY_PATH
 *
 * Returns 0 if access is permitted, otherwise a negative errno.
 */
static long do_faccessat(int dfd, const char __user *filename, int mode, int flags)
{
        const struct cred *saved_cred = NULL;
        unsigned int lookup_flags;
        struct inode *inode;
        struct path path;
        int res;

        if (mode & ~S_IRWXO)        /* where's F_OK, X_OK, W_OK, R_OK? */
                return -EINVAL;

        if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
                return -EINVAL;

        lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

        if (access_need_override_creds(flags)) {
                saved_cred = access_override_creds();
                if (!saved_cred)
                        return -ENOMEM;
        }

retry:
        res = user_path_at(dfd, filename, lookup_flags, &path);
        if (res)
                goto out;

        inode = d_backing_inode(path.dentry);

        /*
         * MAY_EXEC on regular files is denied if the fs is mounted
         * with the "noexec" flag.
         */
        if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode) &&
            path_noexec(&path)) {
                res = -EACCES;
                goto out_path_release;
        }

        res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS);

        /*
         * SuS v2 requires we report a read only fs too.
         *
         * This is a rare case where using __mnt_is_readonly()
         * is OK without a mnt_want/drop_write() pair.  Since
         * no actual write to the fs is performed here, we do
         * not need to telegraph that to anyone.
         *
         * By doing this, we accept that this access is
         * inherently racy and know that the fs may change
         * state before we even see this result.
         */
        if (!res && (mode & S_IWOTH) && !special_file(inode->i_mode) &&
            __mnt_is_readonly(path.mnt))
                res = -EROFS;

out_path_release:
        path_put(&path);
        if (retry_estale(res, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out:
        /* Undo the credential override, if one was installed. */
        if (saved_cred)
                revert_creds(saved_cred);

        return res;
}

/* faccessat(2): this entry point has no flags argument, so pass none. */
SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
{
        const int flags = 0;

        return do_faccessat(dfd, filename, mode, flags);
}

/* faccessat2(2): like faccessat(2), with caller-supplied flags. */
SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename,
                int, mode, int, flags)
{
        long ret = do_faccessat(dfd, filename, mode, flags);

        return ret;
}

/* access(2): lookup relative to AT_FDCWD, no flags. */
SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
{
        const int flags = 0;

        return do_faccessat(AT_FDCWD, filename, mode, flags);
}

/*
 * chdir(2): resolve @filename as a directory, check MAY_EXEC | MAY_CHDIR
 * on it and, if permitted, make it the pwd of current->fs.
 */
SYSCALL_DEFINE1(chdir, const char __user *, filename)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
        struct path path;
        int error;

retry:
        error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
        if (error)
                return error;

        error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
        if (!error)
                set_fs_pwd(current->fs, &path);

        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                /* Redo the lookup with revalidation requested. */
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
        return error;
}

/*
 * fchdir(2): make the directory referred to by @fd the pwd of current->fs.
 *
 * Returns -EBADF if @fd has no file, -ENOTDIR if its dentry cannot be
 * looked up in, or the result of the MAY_EXEC | MAY_CHDIR permission check.
 */
SYSCALL_DEFINE1(fchdir, unsigned int, fd)
{
        struct fd f = fdget_raw(fd);
        int error;

        if (!f.file)
                return -EBADF;

        if (d_can_lookup(f.file->f_path.dentry)) {
                error = file_permission(f.file, MAY_EXEC | MAY_CHDIR);
                if (!error)
                        set_fs_pwd(current->fs, &f.file->f_path);
        } else {
                error = -ENOTDIR;
        }

        fdput(f);
        return error;
}

/*
 * chroot(2): resolve @filename as a directory and make it the root of
 * current->fs.
 *
 * The caller needs MAY_EXEC | MAY_CHDIR permission on the directory and
 * CAP_SYS_CHROOT in its user namespace (else -EPERM); the
 * security_path_chroot() hook can also veto the change.
 */
SYSCALL_DEFINE1(chroot, const char __user *, filename)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
        struct path path;
        int error;

retry:
        error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
        if (error)
                return error;

        error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
        if (error)
                goto dput_and_out;

        if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) {
                error = -EPERM;
                goto dput_and_out;
        }

        error = security_path_chroot(&path);
        if (!error)
                set_fs_root(current->fs, &path);

dput_and_out:
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                /* Redo the lookup with revalidation requested. */
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
        return error;
}

/*
 * Change the permission bits of the inode behind @path to @mode.
 *
 * Only the S_IALLUGO bits of @mode are applied; all other bits of the
 * inode's current i_mode are preserved. Write access to the mount is held
 * for the duration of the call. If notify_change() reports a delegated
 * inode, the delegation is broken and the change retried.
 *
 * Returns 0 on success or a negative errno.
 */
int chmod_common(const struct path *path, umode_t mode)
{
        struct inode *delegated_inode = NULL;
        struct inode *inode = path->dentry->d_inode;
        struct iattr attrs;
        int error;

        error = mnt_want_write(path->mnt);
        if (error)
                return error;

        for (;;) {
                inode_lock(inode);
                error = security_path_chmod(path, mode);
                if (!error) {
                        attrs.ia_mode = (mode & S_IALLUGO) |
                                        (inode->i_mode & ~S_IALLUGO);
                        attrs.ia_valid = ATTR_MODE | ATTR_CTIME;
                        error = notify_change(mnt_idmap(path->mnt),
                                              path->dentry, &attrs,
                                              &delegated_inode);
                }
                inode_unlock(inode);

                if (!delegated_inode)
                        break;

                /* Wait for the delegation to be broken, then try again. */
                error = break_deleg_wait(&delegated_inode);
                if (error)
                        break;
        }

        mnt_drop_write(path->mnt);
        return error;
}

/* Audit @file, then change the mode of the object at its path to @mode. */
int vfs_fchmod(struct file *file, umode_t mode)
{
        const struct path *path = &file->f_path;

        audit_file(file);

        return chmod_common(path, mode);
}

/* fchmod(2): change the mode of the file open on @fd; -EBADF if none. */
SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
{
        struct fd f = fdget(fd);
        int err;

        if (!f.file)
                return -EBADF;

        err = vfs_fchmod(f.file, mode);
        fdput(f);

        return err;
}

/*
 * Common implementation of chmod(2), fchmodat(2) and fchmodat2(2).
 *
 * @flags may contain AT_SYMLINK_NOFOLLOW and AT_EMPTY_PATH only; anything
 * else yields -EINVAL. Returns 0 on success or a negative errno.
 */
static int do_fchmodat(int dfd, const char __user *filename, umode_t mode,
                       unsigned int flags)
{
        unsigned int lookup_flags = 0;
        struct path path;
        int error;

        if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)))
                return -EINVAL;

        if (!(flags & AT_SYMLINK_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

retry:
        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (error)
                return error;

        error = chmod_common(&path, mode);
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                /* Redo the lookup with revalidation requested. */
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }

        return error;
}

/* fchmodat2(2): fchmodat(2) with a caller-supplied flags argument. */
SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename,
                umode_t, mode, unsigned int, flags)
{
        int ret = do_fchmodat(dfd, filename, mode, flags);

        return ret;
}

/* fchmodat(2): this entry point has no flags argument, so pass none. */
SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
                umode_t, mode)
{
        const unsigned int flags = 0;

        return do_fchmodat(dfd, filename, mode, flags);
}

/* chmod(2): lookup relative to AT_FDCWD, no flags. */
SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
{
        const unsigned int flags = 0;

        return do_fchmodat(AT_FDCWD, filename, mode, flags);
}

/*
 * Record @kuid as the new owner in @attr, provided it is a valid kuid.
 *
 * On success ATTR_UID is set in attr->ia_valid and attr->ia_vfsuid is
 * initialised from @kuid; on failure @attr is left untouched.
 *
 * Return: true if @kuid is valid, false if not.
 */
static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid)
{
        const bool valid = uid_valid(kuid);

        if (valid) {
                attr->ia_valid |= ATTR_UID;
                attr->ia_vfsuid = VFSUIDT_INIT(kuid);
        }
        return valid;
}

/*
 * Record @kgid as the new group in @attr, provided it is a valid kgid.
 *
 * On success ATTR_GID is set in attr->ia_valid and attr->ia_vfsgid is
 * initialised from @kgid; on failure @attr is left untouched.
 *
 * Return: true if @kgid is valid, false if not.
 */
static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid)
{
        const bool valid = gid_valid(kgid);

        if (valid) {
                attr->ia_valid |= ATTR_GID;
                attr->ia_vfsgid = VFSGIDT_INIT(kgid);
        }
        return valid;
}

/*
 * Change the owner and/or group of the inode behind @path.
 *
 * @user / @group are ids in the caller's user namespace; a value of
 * (uid_t)-1 / (gid_t)-1 leaves the respective id unchanged. An id that
 * does not map to a valid kuid/kgid yields -EINVAL.
 *
 * This function does not take mount write access itself. If
 * notify_change() reports a delegated inode, the delegation is broken and
 * the whole attribute setup is redone.
 *
 * Returns 0 on success or a negative errno.
 */
int chown_common(const struct path *path, uid_t user, gid_t group)
{
        struct inode *inode = path->dentry->d_inode;
        struct inode *delegated_inode = NULL;
        struct user_namespace *fs_userns;
        struct mnt_idmap *idmap;
        struct iattr newattrs;
        kuid_t uid;
        kgid_t gid;
        int error;

        uid = make_kuid(current_user_ns(), user);
        gid = make_kgid(current_user_ns(), group);

        idmap = mnt_idmap(path->mnt);
        fs_userns = i_user_ns(inode);

        for (;;) {
                newattrs.ia_vfsuid = INVALID_VFSUID;
                newattrs.ia_vfsgid = INVALID_VFSGID;
                newattrs.ia_valid = ATTR_CTIME;

                if (user != (uid_t)-1 && !setattr_vfsuid(&newattrs, uid))
                        return -EINVAL;
                if (group != (gid_t)-1 && !setattr_vfsgid(&newattrs, gid))
                        return -EINVAL;

                inode_lock(inode);

                /* Non-directories also get their setuid/setgid/priv state killed. */
                if (!S_ISDIR(inode->i_mode))
                        newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
                                             setattr_should_drop_sgid(idmap, inode);

                /* Continue to send actual fs values, not the mount values. */
                error = security_path_chown(
                        path,
                        from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid),
                        from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid));
                if (!error)
                        error = notify_change(idmap, path->dentry, &newattrs,
                                              &delegated_inode);

                inode_unlock(inode);

                if (!delegated_inode)
                        break;

                /* Wait for the delegation to be broken, then try again. */
                error = break_deleg_wait(&delegated_inode);
                if (error)
                        break;
        }

        return error;
}

/*
 * Common implementation of chown(2), lchown(2) and fchownat(2).
 *
 * @flag may contain AT_SYMLINK_NOFOLLOW and AT_EMPTY_PATH only; anything
 * else yields -EINVAL. Mount write access is held around the ownership
 * change. Returns 0 on success or a negative errno.
 */
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
                int flag)
{
        int lookup_flags = 0;
        struct path path;
        int error;

        if (flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
                return -EINVAL;

        if (!(flag & AT_SYMLINK_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
        if (flag & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

retry:
        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (error)
                return error;

        error = mnt_want_write(path.mnt);
        if (!error) {
                error = chown_common(&path, user, group);
                mnt_drop_write(path.mnt);
        }

        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                /* Redo the lookup with revalidation requested. */
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }

        return error;
}

/* fchownat(2): thin wrapper around do_fchownat(). */
SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
                gid_t, group, int, flag)
{
        int ret = do_fchownat(dfd, filename, user, group, flag);

        return ret;
}

/* chown(2): lookup relative to AT_FDCWD, no flags. */
SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
{
        const int flag = 0;

        return do_fchownat(AT_FDCWD, filename, user, group, flag);
}

/* lchown(2): like chown(2), but passes AT_SYMLINK_NOFOLLOW. */
SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
{
        const int flag = AT_SYMLINK_NOFOLLOW;

        return do_fchownat(AT_FDCWD, filename, user, group, flag);
}

/*
 * Change owner/group of the object behind an open @file.
 *
 * Takes write access via mnt_want_write_file() for the duration of the
 * change and audits the file. Returns 0 on success or a negative errno.
 */
int vfs_fchown(struct file *file, uid_t user, gid_t group)
{
        int error = mnt_want_write_file(file);

        if (!error) {
                audit_file(file);
                error = chown_common(&file->f_path, user, group);
                mnt_drop_write_file(file);
        }

        return error;
}

/* Change owner/group of the file open on @fd; -EBADF if there is none. */
int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
{
        struct fd f = fdget(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = vfs_fchown(f.file, user, group);
        fdput(f);

        return error;
}

/* fchown(2): thin wrapper around ksys_fchown(). */
SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
{
        int ret = ksys_fchown(fd, user, group);

        return ret;
}

/*
 * Acquire everything needed to open @f for writing: write access on the
 * inode, write access on the file's mount and, for FMODE_BACKING files,
 * write access on the mount of the backing file's user path.
 *
 * On failure everything acquired so far is released again.
 * Returns 0 on success or a negative errno.
 */
static inline int file_get_write_access(struct file *f)
{
        int error;

        error = get_write_access(f->f_inode);
        if (unlikely(error))
                return error;

        error = mnt_get_write_access(f->f_path.mnt);
        if (unlikely(error))
                goto drop_inode_access;

        if (likely(!(f->f_mode & FMODE_BACKING)))
                return 0;

        error = mnt_get_write_access(backing_file_user_path(f)->mnt);
        if (likely(!error))
                return 0;

        /* Unwind in reverse order of acquisition. */
        mnt_put_write_access(f->f_path.mnt);
drop_inode_access:
        put_write_access(f->f_inode);
        return error;
}

/*
 * Finish setting up @f for the dentry already stored in f->f_path.
 *
 * @f:    file whose f_path, f_flags and f_mode have been filled in
 * @open: optional open callback; when NULL, f->f_op->open is used if set
 *
 * Takes a reference on f->f_path, binds the inode and mapping to the file,
 * acquires read/write accounting as dictated by f_mode, runs the security
 * and lease checks, calls the open callback and derives the FMODE_CAN_*
 * capability bits from the file operations.
 *
 * Returns 0 on success or a negative errno. On the cleanup_all/cleanup_file
 * paths the path reference is dropped and f_path/f_inode are cleared.
 */
static int do_dentry_open(struct file *f,
                          int (*open)(struct inode *, struct file *))
{
        static const struct file_operations empty_fops = {};
        struct inode *inode = f->f_path.dentry->d_inode;
        int error;

        path_get(&f->f_path);
        f->f_inode = inode;
        f->f_mapping = inode->i_mapping;
        /* Sample the current writeback/superblock error state for this file. */
        f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
        f->f_sb_err = file_sample_sb_err(f);

        /*
         * O_PATH opens stop here: no access accounting, no security hook,
         * no ->open() call, and an empty set of file operations.
         */
        if (unlikely(f->f_flags & O_PATH)) {
                f->f_mode = FMODE_PATH | FMODE_OPENED;
                f->f_op = &empty_fops;
                return 0;
        }

        /*
         * Read-only opens bump the inode's read count; writable opens of
         * non-special files take write access and are marked FMODE_WRITER.
         */
        if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
                i_readcount_inc(inode);
        } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
                error = file_get_write_access(f);
                if (unlikely(error))
                        goto cleanup_file;
                f->f_mode |= FMODE_WRITER;
        }

        /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
        if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
                f->f_mode |= FMODE_ATOMIC_POS;

        f->f_op = fops_get(inode->i_fop);
        if (WARN_ON(!f->f_op)) {
                error = -ENODEV;
                goto cleanup_all;
        }

        error = security_file_open(f);
        if (error)
                goto cleanup_all;

        error = break_lease(file_inode(f), f->f_flags);
        if (error)
                goto cleanup_all;

        /* normally all 3 are set; ->open() can clear them if needed */
        f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
        /* Fall back to the filesystem's ->open() if the caller gave none. */
        if (!open)
                open = f->f_op->open;
        if (open) {
                error = open(inode, f);
                if (error)
                        goto cleanup_all;
        }
        f->f_mode |= FMODE_OPENED;
        /* Advertise only the operations the file_operations can back. */
        if ((f->f_mode & FMODE_READ) &&
             likely(f->f_op->read || f->f_op->read_iter))
                f->f_mode |= FMODE_CAN_READ;
        if ((f->f_mode & FMODE_WRITE) &&
             likely(f->f_op->write || f->f_op->write_iter))
                f->f_mode |= FMODE_CAN_WRITE;
        if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek)
                f->f_mode &= ~FMODE_LSEEK;
        if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
                f->f_mode |= FMODE_CAN_ODIRECT;

        /* These flags only matter while opening; drop them from f_flags. */
        f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
        f->f_iocb_flags = iocb_flags(f);

        file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);

        /*
         * NOTE(review): this returns with FMODE_OPENED set and without
         * going through the cleanup labels; presumably the caller's fput()
         * performs the teardown - confirm against callers.
         */
        if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
                return -EINVAL;

        /*
         * XXX: Huge page cache doesn't support writing yet. Drop all page
         * cache for this file before processing writes.
         */
        if (f->f_mode & FMODE_WRITE) {
                /*
                 * Paired with smp_mb() in collapse_file() to ensure nr_thps
                 * is up to date and the update to i_writecount by
                 * get_write_access() is visible. Ensures subsequent insertion
                 * of THPs into the page cache will fail.
                 */
                smp_mb();
                if (filemap_nr_thps(inode->i_mapping)) {
                        struct address_space *mapping = inode->i_mapping;

                        filemap_invalidate_lock(inode->i_mapping);
                        /*
                         * unmap_mapping_range() only needs to be called once
                         * here, because private pages do not need to be
                         * unmapped from the mapping (e.g. the data segment
                         * of dynamic shared libraries).
                         */
                        unmap_mapping_range(mapping, 0, 0, 0);
                        truncate_inode_pages(mapping, 0);
                        filemap_invalidate_unlock(inode->i_mapping);
                }
        }

        /*
         * Once we return a file with FMODE_OPENED, __fput() will call
         * fsnotify_close(), so we need fsnotify_open() here for symmetry.
         */
        fsnotify_open(f);
        return 0;

cleanup_all:
        /* A positive value is not a valid error; warn once and use -EINVAL. */
        if (WARN_ON_ONCE(error > 0))
                error = -EINVAL;
        fops_put(f->f_op);
        put_file_access(f);
cleanup_file:
        /* Drop the path reference taken on entry and detach the file from it. */
        path_put(&f->f_path);
        f->f_path.mnt = NULL;
        f->f_path.dentry = NULL;
        f->f_inode = NULL;
        return error;
}

/**
 * finish_open - finish opening a file
 * @file: file pointer
 * @dentry: pointer to dentry
 * @open: open callback
 *
 * Helper for i_op->atomic_open() implementations that want to complete the
 * open of @file on @dentry.
 *
 * Passing NULL for @open selects the standard f_op->open() filesystem
 * callback instead.
 *
 * NB: the dentry reference is _not_ consumed.  If, for example, the dentry is
 * the return value of d_splice_alias(), then the caller needs to perform dput()
 * on it after finish_open().
 *
 * It is a bug to call this on a file that already has FMODE_OPENED set.
 *
 * Returns zero on success or -errno if the open failed.
 */
int finish_open(struct file *file, struct dentry *dentry,
                int (*open)(struct inode *, struct file *))
{
        /* once it's opened, it's opened */
        BUG_ON(file->f_mode & FMODE_OPENED);

        file->f_path.dentry = dentry;

        return do_dentry_open(file, open);
}
EXPORT_SYMBOL(finish_open);

/**
 * finish_no_open - finish ->atomic_open() without opening the file
 *
 * @file: file pointer
 * @dentry: dentry or NULL (as returned from ->lookup())
 *
 * This can be used to set the result of a successful lookup in ->atomic_open().
 *
 * NB: unlike finish_open() this function does consume the dentry reference and
 * the caller need not dput() it.
 *
 * Returns "0" which must be the return value of ->atomic_open() after having
 * called this function.
 */
int finish_no_open(struct file *file, struct dentry *dentry)
{
        file->f_path.dentry = dentry;
        return 0;
}
EXPORT_SYMBOL(finish_no_open);

/*
 * Render the path of @filp into @buf (of size @buflen) via d_path() and
 * return whatever d_path() returns.
 */
char *file_path(struct file *filp, char *buf, int buflen)
{
        const struct path *path = &filp->f_path;

        return d_path(path, buf, buflen);
}
EXPORT_SYMBOL(file_path);

/**
 * vfs_open - open the file at the given path
 * @path: path to open
 * @file: newly allocated file with f_flag initialized
 */
int vfs_open(const struct path *path, struct file *file)
{
        file->f_path = *path;
        return do_dentry_open(file, NULL);
}

struct file *dentry_open(const struct path *path, int flags,
                         const struct cred *cred)
{
        int error;
        struct file *f;

        /* We must always pass in a valid mount pointer. */
        BUG_ON(!path->mnt);

        f = alloc_empty_file(flags, cred);
        if (!IS_ERR(f)) {
                error = vfs_open(path, f);
                if (error) {
                        fput(f);
                        f = ERR_PTR(error);
                }
        }
        return f;
}
EXPORT_SYMBOL(dentry_open);

/**
 * dentry_create - Create and open a file
 * @path: path to create
 * @flags: O_ flags
 * @mode: mode bits for new file
 * @cred: credentials to use
 *
 * Caller must hold the parent directory's lock, and have prepared
 * a negative dentry, placed in @path->dentry, for the new file.
 *
 * Caller sets @path->mnt to the vfsmount of the filesystem where
 * the new file is to be created. The parent directory and the
 * negative dentry must reside on the same filesystem instance.
 *
 * The file is created with vfs_create() in the dentry's parent directory
 * and then opened with vfs_open().
 *
 * On success, returns a "struct file *". Otherwise a ERR_PTR
 * is returned.
 */
struct file *dentry_create(const struct path *path, int flags, umode_t mode,
                           const struct cred *cred)
{
        struct inode *dir;
        struct file *f;
        int error;

        f = alloc_empty_file(flags, cred);
        if (IS_ERR(f))
                return f;

        dir = d_inode(path->dentry->d_parent);
        error = vfs_create(mnt_idmap(path->mnt), dir, path->dentry, mode, true);
        if (!error)
                error = vfs_open(path, f);

        if (likely(!error))
                return f;

        /* Creation or open failed: release the file we allocated. */
        fput(f);
        return ERR_PTR(error);
}
EXPORT_SYMBOL(dentry_create);

/**
 * kernel_file_open - open a file for kernel internal use
 * @path:        path of the file to open
 * @flags:        open flags
 * @cred:        credentials for open
 *
 * Open a file for use by in-kernel consumers. The file is not accounted
 * against nr_files and must not be installed into the file descriptor
 * table.
 *
 * Return: Opened file on success, an error pointer on failure.
 */
struct file *kernel_file_open(const struct path *path, int flags,
                                const struct cred *cred)
{
        struct file *f;
        int error;

        f = alloc_empty_file_noaccount(flags, cred);
        if (IS_ERR(f))
                return f;

        f->f_path = *path;
        error = do_dentry_open(f, NULL);
        if (error) {
                fput(f);
                f = ERR_PTR(error);
        }
        return f;
}
EXPORT_SYMBOL_GPL(kernel_file_open);

/*
 * Non-zero when @flags ask for a file to be created (O_CREAT or
 * __O_TMPFILE). The argument is parenthesized so that expression arguments
 * containing lower-precedence operators are masked as a whole.
 */
#define WILL_CREATE(flags)        ((flags) & (O_CREAT | __O_TMPFILE))
/* Mask applied to open flags when O_PATH is set; other flags are dropped or rejected. */
#define O_PATH_FLAGS                (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)

/*
 * Build a struct open_how from legacy open(2)-style @flags and @mode.
 *
 * Unknown flag bits and mode bits outside S_IALLUGO are dropped. With
 * O_PATH only the O_PATH_FLAGS subset survives, and the mode is kept only
 * when the resulting flags would create a file.
 */
inline struct open_how build_open_how(int flags, umode_t mode)
{
        struct open_how how = { };

        how.flags = flags & VALID_OPEN_FLAGS;

        /* O_PATH beats everything else. */
        if (how.flags & O_PATH)
                how.flags &= O_PATH_FLAGS;

        /* Modes should only be set for create-like flags. */
        if (WILL_CREATE(how.flags))
                how.mode = mode & S_IALLUGO;

        return how;
}

/*
 * Validate @how and translate it into the internal struct open_flags @op.
 *
 * @how: open request (flags, mode, resolve) to validate
 * @op:  output; receives mode, open_flag, acc_mode, intent and lookup_flags
 *
 * Returns 0 on success, -EINVAL for an invalid combination of flags, mode
 * or resolve bits, and -EAGAIN when RESOLVE_CACHED is combined with
 * O_TRUNC, O_CREAT or __O_TMPFILE.
 *
 * Note that @op may have been partially written when an error is returned.
 */
inline int build_open_flags(const struct open_how *how, struct open_flags *op)
{
        u64 flags = how->flags;
        u64 strip = __FMODE_NONOTIFY | O_CLOEXEC;
        int lookup_flags = 0;
        /* Computed from the unstripped flags; adjusted further below. */
        int acc_mode = ACC_MODE(flags);

        BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
                         "struct open_flags doesn't yet handle flags > 32 bits");

        /*
         * Strip flags that either shouldn't be set by userspace like
         * FMODE_NONOTIFY or that aren't relevant in determining struct
         * open_flags like O_CLOEXEC.
         */
        flags &= ~strip;

        /*
         * Older syscalls implicitly clear all of the invalid flags or argument
         * values before calling build_open_flags(), but openat2(2) checks all
         * of its arguments.
         */
        if (flags & ~VALID_OPEN_FLAGS)
                return -EINVAL;
        if (how->resolve & ~VALID_RESOLVE_FLAGS)
                return -EINVAL;

        /* Scoping flags are mutually exclusive. */
        if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT))
                return -EINVAL;

        /* Deal with the mode. */
        if (WILL_CREATE(flags)) {
                if (how->mode & ~S_IALLUGO)
                        return -EINVAL;
                /* Files created through this path are regular files. */
                op->mode = how->mode | S_IFREG;
        } else {
                /* A mode without a create-like flag is rejected. */
                if (how->mode != 0)
                        return -EINVAL;
                op->mode = 0;
        }

        /*
         * Block bugs where O_DIRECTORY | O_CREAT created regular files.
         * Note, that blocking O_DIRECTORY | O_CREAT here also protects
         * O_TMPFILE below which requires O_DIRECTORY being raised.
         */
        if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT))
                return -EINVAL;

        /* Now handle the creative implementation of O_TMPFILE. */
        if (flags & __O_TMPFILE) {
                /*
                 * In order to ensure programs get explicit errors when trying
                 * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY
                 * is raised alongside __O_TMPFILE.
                 */
                if (!(flags & O_DIRECTORY))
                        return -EINVAL;
                /* A temporary file must be opened with write access. */
                if (!(acc_mode & MAY_WRITE))
                        return -EINVAL;
        }
        if (flags & O_PATH) {
                /* O_PATH only permits certain other flags to be set. */
                if (flags & ~O_PATH_FLAGS)
                        return -EINVAL;
                /* No access checks are requested for O_PATH opens. */
                acc_mode = 0;
        }

        /*
         * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
         * check for O_DSYNC if they need any syncing at all we enforce it's
         * always set instead of having to deal with possibly weird behaviour
         * for malicious applications setting only __O_SYNC.
         */
        if (flags & __O_SYNC)
                flags |= O_DSYNC;

        /*
         * Snapshot the flags here: changes made to the local "flags" below
         * (O_NOFOLLOW for O_CREAT | O_EXCL) are not reflected in open_flag.
         */
        op->open_flag = flags;

        /* O_TRUNC implies we need access checks for write permissions */
        if (flags & O_TRUNC)
                acc_mode |= MAY_WRITE;

        /* Allow the LSM permission hook to distinguish append
           access from general write access. */
        if (flags & O_APPEND)
                acc_mode |= MAY_APPEND;

        op->acc_mode = acc_mode;

        /* O_PATH lookups carry no open intent. */
        op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;

        if (flags & O_CREAT) {
                op->intent |= LOOKUP_CREATE;
                if (flags & O_EXCL) {
                        op->intent |= LOOKUP_EXCL;
                        /* Exclusive create: do not follow a trailing symlink. */
                        flags |= O_NOFOLLOW;
                }
        }

        if (flags & O_DIRECTORY)
                lookup_flags |= LOOKUP_DIRECTORY;
        if (!(flags & O_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;

        /* Map each RESOLVE_* bit onto its LOOKUP_* counterpart. */
        if (how->resolve & RESOLVE_NO_XDEV)
                lookup_flags |= LOOKUP_NO_XDEV;
        if (how->resolve & RESOLVE_NO_MAGICLINKS)
                lookup_flags |= LOOKUP_NO_MAGICLINKS;
        if (how->resolve & RESOLVE_NO_SYMLINKS)
                lookup_flags |= LOOKUP_NO_SYMLINKS;
        if (how->resolve & RESOLVE_BENEATH)
                lookup_flags |= LOOKUP_BENEATH;
        if (how->resolve & RESOLVE_IN_ROOT)
                lookup_flags |= LOOKUP_IN_ROOT;
        if (how->resolve & RESOLVE_CACHED) {
                /* Don't bother even trying for create/truncate/tmpfile open */
                if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE))
                        return -EAGAIN;
                lookup_flags |= LOOKUP_CACHED;
        }

        op->lookup_flags = lookup_flags;
        return 0;
}

/**
 * file_open_name - open file and return file pointer
 *
 * @name:        struct filename containing path to open
 * @flags:        open flags as per the open(2) second argument
 * @mode:        mode for the new file if O_CREAT is set, else ignored
 *
 * Helper for opening a file from kernelspace when there really is no
 * alternative.  In general you should not do this, so please move
 * along, nothing to see here..
 *
 * The lookup is done relative to AT_FDCWD. Returns the opened file or an
 * ERR_PTR() on failure.
 */
struct file *file_open_name(struct filename *name, int flags, umode_t mode)
{
        struct open_how how = build_open_how(flags, mode);
        struct open_flags op;
        int err;

        err = build_open_flags(&how, &op);
        if (err)
                return ERR_PTR(err);

        return do_filp_open(AT_FDCWD, name, &op);
}

/**
 * filp_open - open file and return file pointer
 *
 * @filename:        path to open
 * @flags:        open flags as per the open(2) second argument
 * @mode:        mode for the new file if O_CREAT is set, else ignored
 *
 * Helper for opening a file from kernelspace when there really is no
 * alternative.  In general you should not do this, so please move
 * along, nothing to see here..
 *
 * Returns the opened file or an ERR_PTR() on failure, including a failure
 * to set up the kernel filename.
 */
struct file *filp_open(const char *filename, int flags, umode_t mode)
{
        struct filename *name = getname_kernel(filename);
        struct file *file;

        if (IS_ERR(name))
                return ERR_CAST(name);

        file = file_open_name(name, flags, mode);
        putname(name);

        return file;
}
EXPORT_SYMBOL(filp_open);

/*
 * Open @filename with do_file_open_root(), using @root as the root for the
 * lookup. @flags and @mode are interpreted as for open(2).
 *
 * Returns the opened file or an ERR_PTR() on failure.
 */
struct file *file_open_root(const struct path *root,
                            const char *filename, int flags, umode_t mode)
{
        struct open_how how = build_open_how(flags, mode);
        struct open_flags op;
        int err;

        err = build_open_flags(&how, &op);
        if (err)
                return ERR_PTR(err);

        return do_file_open_root(root, filename, &op);
}
EXPORT_SYMBOL(file_open_root);

/*
 * Core of the open(2) family: validate @how, copy in the user path,
 * reserve a file descriptor, open the file and install it.
 *
 * Returns the new file descriptor on success or a negative errno.
 */
static long do_sys_openat2(int dfd, const char __user *filename,
                           struct open_how *how)
{
        struct open_flags op;
        struct filename *name;
        struct file *f;
        int fd;

        fd = build_open_flags(how, &op);
        if (fd)
                return fd;

        name = getname(filename);
        if (IS_ERR(name))
                return PTR_ERR(name);

        fd = get_unused_fd_flags(how->flags);
        if (fd < 0)
                goto out_putname;

        f = do_filp_open(dfd, name, &op);
        if (IS_ERR(f)) {
                /* Give the reserved descriptor back and report the error. */
                put_unused_fd(fd);
                fd = PTR_ERR(f);
        } else {
                fd_install(fd, f);
        }

out_putname:
        putname(name);
        return fd;
}

/* Legacy-style open: convert @flags/@mode to an open_how and open. */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
        struct open_how how;

        how = build_open_how(flags, mode);

        return do_sys_openat2(dfd, filename, &how);
}


/* open(2): adds O_LARGEFILE when force_o_largefile() says so. */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
        const int largefile = force_o_largefile() ? O_LARGEFILE : 0;

        return do_sys_open(AT_FDCWD, filename, flags | largefile, mode);
}

/* openat(2): adds O_LARGEFILE when force_o_largefile() says so. */
SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
                umode_t, mode)
{
        const int largefile = force_o_largefile() ? O_LARGEFILE : 0;

        return do_sys_open(dfd, filename, flags | largefile, mode);
}

/*
 * openat2(2): open with an extensible struct open_how.
 *
 * @usize is the size of the structure user space passed; sizes smaller
 * than OPEN_HOW_SIZE_VER0 are rejected with -EINVAL, and the copy itself
 * is delegated to copy_struct_from_user().
 */
SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
                struct open_how __user *, how, size_t, usize)
{
        struct open_how tmp;
        int err;

        BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0);
        BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST);

        if (unlikely(usize < OPEN_HOW_SIZE_VER0))
                return -EINVAL;

        err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize);
        if (err)
                return err;

        audit_openat2_how(&tmp);

        /* O_LARGEFILE is only allowed for non-O_PATH. */
        if (!(tmp.flags & O_PATH)) {
                if (force_o_largefile())
                        tmp.flags |= O_LARGEFILE;
        }

        return do_sys_openat2(dfd, filename, &tmp);
}

#ifdef CONFIG_COMPAT
/*
 * Compat variant of sys_open(): identical, except that O_LARGEFILE is
 * never added to the flags.
 */
COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
        long ret = do_sys_open(AT_FDCWD, filename, flags, mode);

        return ret;
}

/*
 * Compat variant of sys_openat(): identical, except that O_LARGEFILE is
 * never added to the flags.
 */
COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
{
        long ret = do_sys_open(dfd, filename, flags, mode);

        return ret;
}
#endif

#ifndef __alpha__

/*
 * For backward compatibility?  Maybe this should be moved
 * into arch/i386 instead?
 *
 * creat(2) is open(2) with O_CREAT | O_WRONLY | O_TRUNC, plus O_LARGEFILE
 * when force_o_largefile() says so.
 */
SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
{
        const int base = O_CREAT | O_WRONLY | O_TRUNC;
        const int flags = force_o_largefile() ? (base | O_LARGEFILE) : base;

        return do_sys_open(AT_FDCWD, pathname, flags, mode);
}
#endif

/*
 * "id" is the POSIX thread ID. We use the
 * files pointer for this..
 */
static int filp_flush(struct file *filp, fl_owner_t id)
{
        int err = 0;

        /* A reference count of zero is reported as corruption; do nothing. */
        if (CHECK_DATA_CORRUPTION(file_count(filp) == 0,
                        "VFS: Close: file count is 0 (f_op=%ps)",
                        filp->f_op))
                return 0;

        /* Only the ->flush() result (if the hook exists) reaches the caller. */
        if (filp->f_op->flush)
                err = filp->f_op->flush(filp, id);

        /* FMODE_PATH files skip the dnotify and POSIX lock teardown. */
        if (unlikely(filp->f_mode & FMODE_PATH))
                return err;

        dnotify_flush(filp, id);
        locks_remove_posix(filp, id);

        return err;
}

/*
 * Flush @filp on behalf of owner @id, then drop one reference to it.
 * Returns whatever filp_flush() returned.
 */
int filp_close(struct file *filp, fl_owner_t id)
{
        const int status = filp_flush(filp, id);

        fput(filp);
        return status;
}
EXPORT_SYMBOL(filp_close);

/*
 * Careful here! We test whether the file pointer is NULL before
 * releasing the fd. This ensures that one clone task can't release
 * an fd while another clone is opening it.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
        struct file *file = file_close_fd(fd);
        int status;

        /* Nothing was installed at @fd. */
        if (!file)
                return -EBADF;

        status = filp_flush(file, current->files);

        /*
         * We're returning to user space. Don't bother
         * with any delayed fput() cases.
         */
        __fput_sync(file);

        /* can't restart close syscall because file table entry was cleared */
        switch (status) {
        case -ERESTARTSYS:
        case -ERESTARTNOINTR:
        case -ERESTARTNOHAND:
        case -ERESTART_RESTARTBLOCK:
                return -EINTR;
        default:
                return status;
        }
}

/**
 * sys_close_range() - Close all file descriptors in a given range.
 *
 * @fd:     starting file descriptor to close
 * @max_fd: last file descriptor to close
 * @flags:  forwarded unmodified to __close_range()
 *          (NOTE(review): historically described as "reserved for future
 *          extensions" - confirm current flag bits against close_range(2))
 *
 * This closes a range of file descriptors. All file descriptors
 * from @fd up to and including @max_fd are closed.
 * Currently, errors from closing a given file descriptor are ignored.
 *
 * The syscall itself does no checking: all three arguments are handed to
 * __close_range() and its return value is returned as-is.
 */
SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
                unsigned int, flags)
{
        return __close_range(fd, max_fd, flags);
}

/*
 * This routine simulates a hangup on the tty, to arrange that users
 * are given clean terminals at login time.
 */
SYSCALL_DEFINE0(vhangup)
{
        /* Callers lacking CAP_SYS_TTY_CONFIG are refused. */
        if (!capable(CAP_SYS_TTY_CONFIG))
                return -EPERM;

        tty_vhangup_self();
        return 0;
}

/*
 * Called when an inode is about to be open.
 * We use this to disallow opening large files on 32bit systems if
 * the caller didn't specify O_LARGEFILE.  On 64bit systems we force
 * on this flag in sys_open.
 */
int generic_file_open(struct inode * inode, struct file * filp)
{
        /* With O_LARGEFILE set the size is not even looked at. */
        if (filp->f_flags & O_LARGEFILE)
                return 0;

        /* Without it, files larger than MAX_NON_LFS are refused. */
        if (i_size_read(inode) > MAX_NON_LFS)
                return -EOVERFLOW;

        return 0;
}

EXPORT_SYMBOL(generic_file_open);

/*
 * This is used by subsystems that don't want seekable
 * file descriptors. The function is not supposed to ever fail, the only
 * reason it returns an 'int' and not 'void' is so that it can be plugged
 * directly into file_operations structure.
 */
int nonseekable_open(struct inode *inode, struct file *filp)
{
        /* Drop the seek and positional read/write mode bits; @inode is unused. */
        filp->f_mode = filp->f_mode &
                       ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

        return 0;
}

EXPORT_SYMBOL(nonseekable_open);

/*
 * stream_open is used by subsystems that want stream-like file descriptors.
 * Such file descriptors are not seekable and don't have notion of position
 * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL).
 * Contrary to file descriptors of other regular files, .read() and .write()
 * can run simultaneously.
 *
 * stream_open never fails and is marked to return int so that it could be
 * directly used as file_operations.open .
 */
int stream_open(struct inode *inode, struct file *filp)
{
        /*
         * Clear the seek, positional read/write and atomic-position mode bits,
         * then mark the file as a stream. @inode is unused.
         */
        filp->f_mode = (filp->f_mode &
                        ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS)) |
                       FMODE_STREAM;

        return 0;
}

EXPORT_SYMBOL(stream_open);




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * Copyright (C) 2020 Google LLC.
 */

#ifndef _LINUX_BPF_LSM_H
#define _LINUX_BPF_LSM_H

#include <linux/sched.h>
#include <linux/bpf.h>
#include <linux/lsm_hooks.h>

#ifdef CONFIG_BPF_LSM

/*
 * Declare one bpf_lsm_<NAME>() prototype for every LSM_HOOK() entry expanded
 * from <linux/lsm_hook_defs.h>: RET becomes the return type and the variadic
 * arguments become the parameter list. The DEFAULT argument is not used by
 * this expansion. LSM_HOOK is undefined again right after the include.
 */
#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
        RET bpf_lsm_##NAME(__VA_ARGS__);
#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK

/*
 * Blob holding a single RCU-annotated pointer to a bpf_local_storage.
 * bpf_inode() returns a pointer of this type into an inode's i_security area.
 */
struct bpf_storage_blob {
        struct bpf_local_storage __rcu *storage;
};

extern struct lsm_blob_sizes bpf_lsm_blob_sizes;

int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
                        const struct bpf_prog *prog);

bool bpf_lsm_is_sleepable_hook(u32 btf_id);
bool bpf_lsm_is_trusted(const struct bpf_prog *prog);

/*
 * Locate the BPF storage blob of @inode: it sits bpf_lsm_blob_sizes.lbs_inode
 * bytes into inode->i_security. Returns NULL when i_security is not set.
 */
static inline struct bpf_storage_blob *bpf_inode(
        const struct inode *inode)
{
        return unlikely(!inode->i_security) ?
                NULL : inode->i_security + bpf_lsm_blob_sizes.lbs_inode;
}

extern const struct bpf_func_proto bpf_inode_storage_get_proto;
extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
void bpf_inode_storage_free(struct inode *inode);

void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func);

#else /* !CONFIG_BPF_LSM */

/* !CONFIG_BPF_LSM stub: ignores @btf_id and always returns false. */
static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id)
{
        return false;
}

/* !CONFIG_BPF_LSM stub: ignores @prog and always returns false. */
static inline bool bpf_lsm_is_trusted(const struct bpf_prog *prog)
{
        return false;
}

/*
 * !CONFIG_BPF_LSM stub: no verification is performed; every call fails
 * with -EOPNOTSUPP regardless of @vlog and @prog.
 */
static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
                                      const struct bpf_prog *prog)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_BPF_LSM stub: always returns NULL, whatever @inode is. */
static inline struct bpf_storage_blob *bpf_inode(
        const struct inode *inode)
{
        return NULL;
}

/* !CONFIG_BPF_LSM stub: intentionally empty, does nothing with @inode. */
static inline void bpf_inode_storage_free(struct inode *inode)
{
}

/*
 * !CONFIG_BPF_LSM stub: intentionally empty; note that *@bpf_func is left
 * unwritten.
 */
static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
                                           bpf_func_t *bpf_func)
{
}

#endif /* CONFIG_BPF_LSM */

#endif /* _LINUX_BPF_LSM_H */

















































































































































































































































    2 
    2 




































    2 



    2 









    2 



















































    2 













    2 









    2 





    2 



    2 
    2 














































































































    2 
























    2 







    2 












    2 









    2 
    2 
























































































































































































































































































































































































































































































































































































    1 
    2 




















    2 
    2 


    2 







    2 













































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
// SPDX-License-Identifier: GPL-2.0
/*
 * KFENCE guarded object allocator and fault handling.
 *
 * Copyright (C) 2020, Google LLC.
 */

#define pr_fmt(fmt) "kfence: " fmt

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/debugfs.h>
#include <linux/hash.h>
#include <linux/irq_work.h>
#include <linux/jhash.h>
#include <linux/kcsan-checks.h>
#include <linux/kfence.h>
#include <linux/kmemleak.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/log2.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/panic_notifier.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/sched/clock.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>

#include <asm/kfence.h>

#include "kfence.h"

/*
 * Disables KFENCE on the first warning assuming an irrecoverable error.
 *
 * Evaluates WARN_ON(@cond) once. If it fired, kfence_enabled is cleared and
 * disabled_by_warn is set. The expression yields the WARN_ON() result, so it
 * can be used directly as a condition.
 */
#define KFENCE_WARN_ON(cond)                                                   \
        ({                                                                     \
                const bool __cond = WARN_ON(cond);                             \
                if (unlikely(__cond)) {                                        \
                        WRITE_ONCE(kfence_enabled, false);                     \
                        disabled_by_warn = true;                               \
                }                                                              \
                __cond;                                                        \
        })

/* === Data ================================================================= */

static bool kfence_enabled __read_mostly;
static bool disabled_by_warn __read_mostly;

unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */

#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "kfence."

static int kfence_enable_late(void);
/*
 * Setter for the sample_interval parameter: parses @val as an unsigned long
 * and stores it through @kp->arg. A value of 0 disables KFENCE; a non-zero
 * value written after boot while KFENCE is disabled tries to enable it late.
 */
static int param_set_sample_interval(const char *val, const struct kernel_param *kp)
{
        unsigned long interval;
        const int err = kstrtoul(val, 0, &interval);

        if (err < 0)
                return err;

        /* Using 0 to indicate KFENCE is disabled. */
        if (interval == 0 && READ_ONCE(kfence_enabled)) {
                pr_info("disabled\n");
                WRITE_ONCE(kfence_enabled, false);
        }

        *((unsigned long *)kp->arg) = interval;

        /* Nothing more to do unless a late enable is being requested. */
        if (interval == 0 || READ_ONCE(kfence_enabled) || system_state == SYSTEM_BOOTING)
                return 0;

        /* Once a warning has disabled KFENCE it cannot be turned back on. */
        if (disabled_by_warn)
                return -EINVAL;

        return kfence_enable_late();
}

/*
 * Getter for the sample_interval parameter: reports "0" while KFENCE is
 * disabled, otherwise the stored value via param_get_ulong().
 */
static int param_get_sample_interval(char *buffer, const struct kernel_param *kp)
{
        if (READ_ONCE(kfence_enabled))
                return param_get_ulong(buffer, kp);

        return sprintf(buffer, "0\n");
}

/* Custom set/get handlers backing the "sample_interval" parameter. */
static const struct kernel_param_ops sample_interval_param_ops = {
        .set = param_set_sample_interval,
        .get = param_get_sample_interval,
};
module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);

/* Pool usage% threshold when currently covered allocations are skipped. */
static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);

/* If true, use a deferrable timer. */
static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE);
module_param_named(deferrable, kfence_deferrable, bool, 0444);

/* If true, check all canary bytes on panic. */
static bool kfence_check_on_panic __read_mostly;
module_param_named(check_on_panic, kfence_check_on_panic, bool, 0444);

/* The pool of pages used for guard pages and objects. */
char *__kfence_pool __read_mostly;
EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */

/*
 * Per-object metadata, with one-to-one mapping of object metadata to
 * backing pages (in __kfence_pool).
 */
static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
struct kfence_metadata *kfence_metadata __read_mostly;

/*
 * If kfence_metadata is not NULL, it may be accessed by kfence_shutdown_cache().
 * So introduce kfence_metadata_init to initialize metadata, and then make
 * kfence_metadata visible after initialization is successful. This prevents
 * potential UAF or access to uninitialized metadata.
 */
static struct kfence_metadata *kfence_metadata_init __read_mostly;

/* Freelist with available objects. */
static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */

/*
 * The static key to set up a KFENCE allocation; or if static keys are not used
 * to gate allocations, to avoid a load and compare if KFENCE is disabled.
 */
DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);

/* Gates the allocation, ensuring only one succeeds in a given period. */
atomic_t kfence_allocation_gate = ATOMIC_INIT(1);

/*
 * A Counting Bloom filter of allocation coverage: limits currently covered
 * allocations of the same source filling up the pool.
 *
 * Assuming a range of 15%-85% unique allocations in the pool at any point in
 * time, the below parameters provide a probability of 0.02-0.33 for false
 * positive hits respectively:
 *
 *        P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE))) ^ HNUM
 */
#define ALLOC_COVERED_HNUM        2
#define ALLOC_COVERED_ORDER        (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2)
#define ALLOC_COVERED_SIZE        (1 << ALLOC_COVERED_ORDER)
#define ALLOC_COVERED_HNEXT(h)        hash_32(h, ALLOC_COVERED_ORDER)
#define ALLOC_COVERED_MASK        (ALLOC_COVERED_SIZE - 1)
static atomic_t alloc_covered[ALLOC_COVERED_SIZE];

/* Stack depth used to determine uniqueness of an allocation. */
#define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8)

/*
 * Randomness for stack hashes, making the same collisions across reboots and
 * different machines less likely.
 */
static u32 stack_hash_seed __ro_after_init;

/*
 * Statistics counters for debugfs. Each value indexes both counters[] and
 * counter_names[]; see counter_names[] for the meaning of each entry.
 */
enum kfence_counter_id {
        KFENCE_COUNTER_ALLOCATED,
        KFENCE_COUNTER_ALLOCS,
        KFENCE_COUNTER_FREES,
        KFENCE_COUNTER_ZOMBIES,
        KFENCE_COUNTER_BUGS,
        KFENCE_COUNTER_SKIP_INCOMPAT,
        KFENCE_COUNTER_SKIP_CAPACITY,
        KFENCE_COUNTER_SKIP_COVERED,
        /* Must stay last: equals the number of counters above. */
        KFENCE_COUNTER_COUNT,
};
static atomic_long_t counters[KFENCE_COUNTER_COUNT];
/* Human-readable label for each counter, indexed by enum kfence_counter_id. */
static const char *const counter_names[] = {
        [KFENCE_COUNTER_ALLOCATED]        = "currently allocated",
        [KFENCE_COUNTER_ALLOCS]                = "total allocations",
        [KFENCE_COUNTER_FREES]                = "total frees",
        [KFENCE_COUNTER_ZOMBIES]        = "zombie allocations",
        [KFENCE_COUNTER_BUGS]                = "total bugs",
        [KFENCE_COUNTER_SKIP_INCOMPAT]        = "skipped allocations (incompatible)",
        [KFENCE_COUNTER_SKIP_CAPACITY]        = "skipped allocations (capacity)",
        [KFENCE_COUNTER_SKIP_COVERED]        = "skipped allocations (covered)",
};
static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);

/* === Internals ============================================================ */

/*
 * Returns true once the number of currently allocated objects exceeds
 * kfence_skip_covered_thresh percent of CONFIG_KFENCE_NUM_OBJECTS.
 */
static inline bool should_skip_covered(void)
{
        const unsigned long limit =
                (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;

        return limit < atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]);
}

/*
 * Hash an allocation stack trace: at most UNIQUE_ALLOC_STACK_DEPTH entries are
 * considered, further trimmed by filter_irq_stacks(), and hashed with jhash()
 * seeded by stack_hash_seed.
 */
static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
{
        size_t depth = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);

        depth = filter_irq_stacks(stack_entries, depth);

        return jhash(stack_entries, depth * sizeof(*stack_entries), stack_hash_seed);
}

/*
 * Adds (or subtracts) count @val for allocation stack trace hash
 * @alloc_stack_hash from Counting Bloom filter.
 */
static void alloc_covered_add(u32 alloc_stack_hash, int val)
{
        u32 hash = alloc_stack_hash;
        int round = 0;

        /* Touch ALLOC_COVERED_HNUM slots, re-hashing between each one. */
        while (round < ALLOC_COVERED_HNUM) {
                atomic_add(val, &alloc_covered[hash & ALLOC_COVERED_MASK]);
                hash = ALLOC_COVERED_HNEXT(hash);
                round++;
        }
}

/*
 * Returns true if the allocation stack trace hash @alloc_stack_hash is
 * currently contained (non-zero count) in Counting Bloom filter.
 */
static bool alloc_covered_contains(u32 alloc_stack_hash)
{
        u32 hash = alloc_stack_hash;
        int remaining;

        /* Every probed slot must be non-zero for the hash to count as present. */
        for (remaining = ALLOC_COVERED_HNUM; remaining > 0; remaining--) {
                if (atomic_read(&alloc_covered[hash & ALLOC_COVERED_MASK]) == 0)
                        return false;
                hash = ALLOC_COVERED_HNEXT(hash);
        }

        return true;
}

/*
 * Protect the page containing @addr. Returns true on success; a failure of
 * kfence_protect_page() triggers KFENCE_WARN_ON() and yields false.
 */
static bool kfence_protect(unsigned long addr)
{
        const unsigned long page = ALIGN_DOWN(addr, PAGE_SIZE);
        const bool failed = KFENCE_WARN_ON(!kfence_protect_page(page, true));

        return !failed;
}

/*
 * Unprotect the page containing @addr. Returns true on success; a failure of
 * kfence_protect_page() triggers KFENCE_WARN_ON() and yields false.
 */
static bool kfence_unprotect(unsigned long addr)
{
        const unsigned long page = ALIGN_DOWN(addr, PAGE_SIZE);
        const bool failed = KFENCE_WARN_ON(!kfence_protect_page(page, false));

        return !failed;
}

/*
 * Map a metadata entry to the address of its object page in __kfence_pool.
 * The entry at index i maps to pool offset (i + 1) * 2 pages. Returns 0 (and
 * warns) if @meta is outside kfence_metadata[] or if meta->addr does not lie
 * in the computed page.
 */
static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta)
{
        const unsigned long pool_offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2;
        const unsigned long expected = (unsigned long)(__kfence_pool + pool_offset);

        /* The checks do not affect performance; only called from slow-paths. */

        /* Only call with a pointer into kfence_metadata. */
        if (KFENCE_WARN_ON(meta < kfence_metadata ||
                           meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS))
                return 0;

        /*
         * This metadata object only ever maps to 1 page; verify that the stored
         * address is in the expected range.
         */
        return KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != expected) ?
                0 : expected;
}

/*
 * Update the object's metadata state, including updating the alloc/free stacks
 * depending on the state transition.
 *
 * @meta:              metadata entry to update; meta->lock must be held.
 * @next:              new state. KFENCE_OBJECT_FREED records into the free
 *                     track, any other state into the alloc track.
 * @stack_entries:     stack trace to record, or NULL to capture one here.
 * @num_stack_entries: number of entries in @stack_entries (ignored and
 *                     recomputed when @stack_entries is NULL).
 *
 * Besides the stack trace, the current pid, CPU and local_clock() timestamp
 * are stored in the selected track before the state is published.
 */
static noinline void
metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next,
                      unsigned long *stack_entries, size_t num_stack_entries)
{
        struct kfence_track *track =
                next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;

        lockdep_assert_held(&meta->lock);

        if (stack_entries) {
                /* NOTE(review): no bound check here; callers must not pass more entries than the track can hold. */
                memcpy(track->stack_entries, stack_entries,
                       num_stack_entries * sizeof(stack_entries[0]));
        } else {
                /*
                 * Skip over 1 (this) function; noinline ensures we do not
                 * accidentally skip over the caller by never inlining.
                 */
                num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
        }
        track->num_stack_entries = num_stack_entries;
        track->pid = task_pid_nr(current);
        track->cpu = raw_smp_processor_id();
        track->ts_nsec = local_clock(); /* Same source as printk timestamps. */

        /*
         * Pairs with READ_ONCE() in
         *        kfence_shutdown_cache(),
         *        kfence_handle_page_fault().
         */
        WRITE_ONCE(meta->state, next);
}

/* Check canary byte at @addr. */
static inline bool check_canary_byte(u8 *addr)
{
        struct kfence_metadata *meta;
        unsigned long flags;

        if (unlikely(*addr != KFENCE_CANARY_PATTERN_U8(addr))) {
                /* Corrupted byte: count it and report under the object's lock. */
                atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);

                meta = addr_to_metadata((unsigned long)addr);
                raw_spin_lock_irqsave(&meta->lock, flags);
                kfence_report_error((unsigned long)addr, false, NULL, meta,
                                    KFENCE_ERROR_CORRUPTION);
                raw_spin_unlock_irqrestore(&meta->lock, flags);

                return false;
        }

        return true;
}

/*
 * Fill the parts of the object's page that lie outside the object with the
 * canary pattern, one u64 at a time.
 */
static inline void set_canary(const struct kfence_metadata *meta)
{
        const unsigned long page_start = ALIGN_DOWN(meta->addr, PAGE_SIZE);
        unsigned long pos;

        /*
         * The canary may be written to part of the object memory, but it does
         * not affect it. The user should initialize the object before using it.
         */

        /* Left side: from the start of the page up to the object. */
        pos = page_start;
        while (pos < meta->addr) {
                *((u64 *)pos) = KFENCE_CANARY_PATTERN_U64;
                pos += sizeof(u64);
        }

        /* Right side: from the u64 containing the object's end to the page end. */
        pos = ALIGN_DOWN(meta->addr + meta->size, sizeof(u64));
        while (pos - page_start < PAGE_SIZE) {
                *((u64 *)pos) = KFENCE_CANARY_PATTERN_U64;
                pos += sizeof(u64);
        }
}

/*
 * Verify the canary on both sides of @meta's object within its page.
 * Whole u64 words are compared first; individual bytes are handed to
 * check_canary_byte(), which reports a mismatch.
 */
static inline void check_canary(const struct kfence_metadata *meta)
{
        const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
        unsigned long addr = pageaddr;

        /*
         * We'll iterate over each canary byte per-side until a corrupted byte
         * is found. However, we'll still iterate over the canary bytes to the
         * right of the object even if there was an error in the canary bytes to
         * the left of the object. Specifically, if check_canary_byte()
         * generates an error, showing both sides might give more clues as to
         * what the error is about when displaying which bytes were corrupted.
         */

        /* Apply to left of object. */
        for (; meta->addr - addr >= sizeof(u64); addr += sizeof(u64)) {
                if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64))
                        break;
        }

        /*
         * If a u64 word of the canary is corrupted, or the canary memory is
         * not a whole number of u64 words, the remaining bytes need to be
         * checked one by one.
         */
        for (; addr < meta->addr; addr++) {
                if (unlikely(!check_canary_byte((u8 *)addr)))
                        break;
        }

        /* Apply to right of object. */
        /* Byte-wise until the address is u64-aligned ... */
        for (addr = meta->addr + meta->size; addr % sizeof(u64) != 0; addr++) {
                if (unlikely(!check_canary_byte((u8 *)addr)))
                        return;
        }
        /* ... then word-wise, dropping to byte-wise from the first bad word. */
        for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64)) {
                if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64)) {

                        for (; addr - pageaddr < PAGE_SIZE; addr++) {
                                if (!check_canary_byte((u8 *)addr))
                                        return;
                        }
                }
        }
}

/*
 * Try to hand out one guarded object of @size bytes on behalf of @cache.
 *
 * The first entry of kfence_freelist is taken. If the list is empty,
 * KFENCE_COUNTER_SKIP_CAPACITY is incremented and NULL is returned. NULL is
 * also returned, after the entry has been put back on the freelist, when the
 * entry's lock cannot be acquired with a trylock.
 *
 * On success the metadata records the new state, @cache, @size and
 * @alloc_stack_hash; the canary is written around the object; the object is
 * zeroed if slab_want_init_on_alloc() asks for it and constructed if the cache
 * has a ctor; and the object's address is returned.
 *
 * @stack_entries and @num_stack_entries are passed on to
 * metadata_update_state() to be recorded as the allocation stack trace.
 */
static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
                                  unsigned long *stack_entries, size_t num_stack_entries,
                                  u32 alloc_stack_hash)
{
        struct kfence_metadata *meta = NULL;
        unsigned long flags;
        struct slab *slab;
        void *addr;
        /* Both random decisions are drawn up front, before any lock is taken. */
        const bool random_right_allocate = get_random_u32_below(2);
        const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS &&
                                  !get_random_u32_below(CONFIG_KFENCE_STRESS_TEST_FAULTS);

        /* Try to obtain a free object. */
        raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
        if (!list_empty(&kfence_freelist)) {
                meta = list_entry(kfence_freelist.next, struct kfence_metadata, list);
                list_del_init(&meta->list);
        }
        raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
        if (!meta) {
                atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
                return NULL;
        }

        if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
                /*
                 * This is extremely unlikely -- we are reporting on a
                 * use-after-free, which locked meta->lock, and the reporting
                 * code via printk calls kmalloc() which ends up in
                 * kfence_alloc() and tries to grab the same object that we're
                 * reporting on. While it has never been observed, lockdep does
                 * report that there is a possibility of deadlock. Fix it by
                 * using trylock and bailing out gracefully.
                 */
                raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
                /* Put the object back on the freelist. */
                list_add_tail(&meta->list, &kfence_freelist);
                raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);

                return NULL;
        }

        /* From here on meta->lock is held until the unlock below. */
        meta->addr = metadata_to_pageaddr(meta);
        /* Unprotect if we're reusing this page. */
        if (meta->state == KFENCE_OBJECT_FREED)
                kfence_unprotect(meta->addr);

        /*
         * Note: for allocations made before RNG initialization, will always
         * return zero. We still benefit from enabling KFENCE as early as
         * possible, even when the RNG is not yet available, as this will allow
         * KFENCE to detect bugs due to earlier allocations. The only downside
         * is that the out-of-bounds accesses detected are deterministic for
         * such allocations.
         */
        if (random_right_allocate) {
                /* Allocate on the "right" side, re-calculate address. */
                meta->addr += PAGE_SIZE - size;
                meta->addr = ALIGN_DOWN(meta->addr, cache->align);
        }

        addr = (void *)meta->addr;

        /* Update remaining metadata. */
        metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries);
        /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
        WRITE_ONCE(meta->cache, cache);
        meta->size = size;
        meta->alloc_stack_hash = alloc_stack_hash;
        raw_spin_unlock_irqrestore(&meta->lock, flags);

        /* Account this stack hash in the coverage filter. */
        alloc_covered_add(alloc_stack_hash, 1);

        /* Set required slab fields. */
        slab = virt_to_slab((void *)meta->addr);
        slab->slab_cache = cache;
        slab->objects = 1;

        /* Memory initialization. */
        set_canary(meta);

        /*
         * We check slab_want_init_on_alloc() ourselves, rather than letting
         * SL*B do the initialization, as otherwise we might overwrite KFENCE's
         * redzone.
         */
        if (unlikely(slab_want_init_on_alloc(gfp, cache)))
                memzero_explicit(addr, size);
        if (cache->ctor)
                cache->ctor(addr);

        if (random_fault)
                kfence_protect(meta->addr); /* Random "faults" by protecting the object. */

        atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]);
        atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);

        return addr;
}

/*
 * Release the KFENCE object at @addr that is described by @meta.
 *
 * Under meta->lock the object must be in state KFENCE_OBJECT_ALLOCATED and
 * @addr must equal meta->addr; otherwise the call is counted as a bug,
 * reported as KFENCE_ERROR_INVALID_FREE, and nothing else is modified.
 *
 * @zombie: when true (see kfence_shutdown_cache()), the object is still
 * marked freed and its page protected, but the memory is not zeroed for
 * init-on-free and the metadata is not returned to the freelist; only the
 * ZOMBIES counter is incremented.
 */
static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
{
        struct kcsan_scoped_access assert_page_exclusive;
        unsigned long flags;
        bool init;

        raw_spin_lock_irqsave(&meta->lock, flags);

        if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
                /* Invalid or double-free, bail out. */
                atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
                kfence_report_error((unsigned long)addr, false, NULL, meta,
                                    KFENCE_ERROR_INVALID_FREE);
                raw_spin_unlock_irqrestore(&meta->lock, flags);
                return;
        }

        /* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */
        kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE,
                                  KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT,
                                  &assert_page_exclusive);

        if (CONFIG_KFENCE_STRESS_TEST_FAULTS)
                kfence_unprotect((unsigned long)addr); /* To check canary bytes. */

        /* Restore page protection if there was an OOB access. */
        if (meta->unprotected_page) {
                /* Wipe the whole previously-unprotected page before re-protecting it. */
                memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE);
                kfence_protect(meta->unprotected_page);
                meta->unprotected_page = 0;
        }

        /* Mark the object as freed. */
        metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0);
        /* Sample the init-on-free decision while meta->lock is still held. */
        init = slab_want_init_on_free(meta->cache);
        raw_spin_unlock_irqrestore(&meta->lock, flags);

        /* Everything below runs without meta->lock. */
        alloc_covered_add(meta->alloc_stack_hash, -1);

        /* Check canary bytes for memory corruption. */
        check_canary(meta);

        /*
         * Clear memory if init-on-free is set. While we protect the page, the
         * data is still there, and after a use-after-free is detected, we
         * unprotect the page, so the data is still accessible.
         */
        if (!zombie && unlikely(init))
                memzero_explicit(addr, meta->size);

        /* Protect to detect use-after-frees. */
        kfence_protect((unsigned long)addr);

        kcsan_end_scoped_access(&assert_page_exclusive);
        if (!zombie) {
                /* Add it to the tail of the freelist for reuse. */
                raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
                KFENCE_WARN_ON(!list_empty(&meta->list));
                list_add_tail(&meta->list, &kfence_freelist);
                raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);

                atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]);
                atomic_long_inc(&counters[KFENCE_COUNTER_FREES]);
        } else {
                /* See kfence_shutdown_cache(). */
                atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]);
        }
}

/*
 * RCU callback used for deferred frees: recovers the metadata that embeds
 * @h and releases the object it describes as a regular (non-zombie) free.
 */
static void rcu_guarded_free(struct rcu_head *h)
{
        struct kfence_metadata *obj;

        obj = container_of(h, struct kfence_metadata, rcu_head);
        kfence_guarded_free((void *)obj->addr, obj, false);
}

/*
 * Initialization of the KFENCE pool after its allocation.
 * Returns 0 on success; otherwise returns the address up to
 * which partial initialization succeeded.
 *
 * Once the object pages have been marked as slab pages, every failure path
 * clears those markers again before returning: the callers release the
 * pool from the returned address onwards, and those pages must not still
 * carry PG_slab (or obj_exts) at that point.
 */
static unsigned long kfence_init_pool(void)
{
        unsigned long addr;
        struct page *pages;
        int i;

        /* Nothing has been touched yet, so the whole pool can be released. */
        if (!arch_kfence_init_pool())
                return (unsigned long)__kfence_pool;

        addr = (unsigned long)__kfence_pool;
        pages = virt_to_page(__kfence_pool);

        /*
         * Set up object pages: they must have PG_slab set, to avoid freeing
         * these as real pages.
         *
         * We also want to avoid inserting kfence_free() in the kfree()
         * fast-path in SLUB, and therefore need to ensure kfree() correctly
         * enters __slab_free() slow-path.
         */
        for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
                struct slab *slab = page_slab(nth_page(pages, i));

                /* Page 0 and all odd pages are guard pages, not object pages. */
                if (!i || (i % 2))
                        continue;

                __folio_set_slab(slab_folio(slab));
#ifdef CONFIG_MEMCG_KMEM
                slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts |
                                 MEMCG_DATA_OBJEXTS;
#endif
        }

        /*
         * Protect the first 2 pages. The first page is mostly unnecessary, and
         * merely serves as an extended guard page. However, adding one
         * additional page in the beginning gives us an even number of pages,
         * which simplifies the mapping of address to metadata index.
         */
        for (i = 0; i < 2; i++) {
                /*
                 * The object pages were already marked as slab pages above,
                 * so undo that before reporting the failure address.
                 */
                if (unlikely(!kfence_protect(addr)))
                        goto reset_slab;

                addr += PAGE_SIZE;
        }

        for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
                struct kfence_metadata *meta = &kfence_metadata_init[i];

                /* Initialize metadata. */
                INIT_LIST_HEAD(&meta->list);
                raw_spin_lock_init(&meta->lock);
                meta->state = KFENCE_OBJECT_UNUSED;
                meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */
                list_add_tail(&meta->list, &kfence_freelist);

                /* Protect the right redzone. */
                if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
                        goto reset_slab;

                addr += 2 * PAGE_SIZE;
        }

        /*
         * Make kfence_metadata visible only when initialization is successful.
         * Otherwise, if the initialization fails and kfence_metadata is freed,
         * it may cause UAF in kfence_shutdown_cache().
         */
        smp_store_release(&kfence_metadata, kfence_metadata_init);
        return 0;

reset_slab:
        /* Undo the slab marking of every object page set up above. */
        for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
                struct slab *slab = page_slab(nth_page(pages, i));

                if (!i || (i % 2))
                        continue;
#ifdef CONFIG_MEMCG_KMEM
                slab->obj_exts = 0;
#endif
                __folio_clear_slab(slab_folio(slab));
        }

        return addr;
}

/*
 * Boot-time pool initialization. Returns true when the pool is live; returns
 * false when there is no pool or when kfence_init_pool() failed, in which
 * case the unused remainder of the pool and the metadata are released.
 */
static bool __init kfence_init_pool_early(void)
{
        unsigned long fail_addr;

        if (!__kfence_pool)
                return false;

        fail_addr = kfence_init_pool();
        if (fail_addr) {
                /*
                 * Only release unprotected pages, and do not try to go back
                 * and change page attributes due to risk of failing to do so
                 * as well. If changing page attributes for some pages fails,
                 * it is very likely that it also fails for the first page,
                 * and therefore expect fail_addr==__kfence_pool in most
                 * failure cases.
                 */
                memblock_free_late(__pa(fail_addr),
                                   KFENCE_POOL_SIZE - (fail_addr - (unsigned long)__kfence_pool));
                __kfence_pool = NULL;

                memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE);
                kfence_metadata_init = NULL;

                return false;
        }

        /*
         * The pool is live and will never be deallocated from this point on.
         * Ignore the pool object from the kmemleak phys object tree, as it
         * would otherwise overlap with allocations returned by
         * kfence_alloc(), which are registered with kmemleak through the
         * slab post-alloc hook.
         */
        kmemleak_ignore_phys(__pa(__kfence_pool));
        return true;
}

/* === DebugFS Interface ==================================================== */

/*
 * debugfs "stats" show handler: prints whether KFENCE is enabled, followed
 * by one "<name>: <value>" line per counter. @v is unused. Returns 0.
 */
static int stats_show(struct seq_file *seq, void *v)
{
        int idx = 0;

        seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled));
        while (idx < KFENCE_COUNTER_COUNT) {
                seq_printf(seq, "%s: %ld\n", counter_names[idx],
                           atomic_long_read(&counters[idx]));
                idx++;
        }

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(stats);

/*
 * debugfs seq_file operations for /sys/kernel/debug/kfence/objects.
 * start_object() and next_object() return the object index + 1, because NULL is used
 * to stop iteration.
 */
static void *start_object(struct seq_file *seq, loff_t *pos)
{
        const loff_t idx = *pos;

        /* Past the last object: nothing to iterate. */
        if (idx >= CONFIG_KFENCE_NUM_OBJECTS)
                return NULL;

        return (void *)((long)idx + 1);
}

/*
 * seq_file .stop callback. Intentionally empty: start_object() and
 * next_object() only compute an index and acquire nothing to release.
 */
static void stop_object(struct seq_file *seq, void *v)
{
}

/*
 * seq_file .next callback: advances *pos by one and returns the new object
 * index + 1, or NULL once the index reaches CONFIG_KFENCE_NUM_OBJECTS.
 */
static void *next_object(struct seq_file *seq, void *v, loff_t *pos)
{
        const loff_t idx = *pos + 1;

        *pos = idx;
        if (idx >= CONFIG_KFENCE_NUM_OBJECTS)
                return NULL;

        return (void *)((long)idx + 1);
}

/*
 * seq_file .show callback: @v carries the object index + 1. Prints that
 * object's metadata while holding its lock, then a separator line.
 * Returns 0.
 */
static int show_object(struct seq_file *seq, void *v)
{
        const long idx = (long)v - 1;
        struct kfence_metadata *obj = kfence_metadata + idx;
        unsigned long irq_flags;

        raw_spin_lock_irqsave(&obj->lock, irq_flags);
        kfence_print_object(seq, obj);
        raw_spin_unlock_irqrestore(&obj->lock, irq_flags);

        seq_puts(seq, "---------------------------------\n");
        return 0;
}

/* Iterator callbacks backing the debugfs "objects" file. */
static const struct seq_operations objects_sops = {
        .start = start_object,
        .stop = stop_object,
        .next = next_object,
        .show = show_object,
};
DEFINE_SEQ_ATTRIBUTE(objects);

/*
 * Create the "kfence" debugfs directory with its "stats" and "objects"
 * files, but only if KFENCE is currently enabled. Always returns 0.
 */
static int kfence_debugfs_init(void)
{
        if (READ_ONCE(kfence_enabled)) {
                struct dentry *dir = debugfs_create_dir("kfence", NULL);

                debugfs_create_file("stats", 0444, dir, NULL, &stats_fops);
                debugfs_create_file("objects", 0400, dir, NULL, &objects_fops);
        }

        return 0;
}

late_initcall(kfence_debugfs_init);

/* === Panic Notifier ====================================================== */

/* Run check_canary() on every object that is currently in allocated state. */
static void kfence_check_all_canary(void)
{
        int idx;

        for (idx = 0; idx < CONFIG_KFENCE_NUM_OBJECTS; idx++) {
                struct kfence_metadata *obj = kfence_metadata + idx;

                if (obj->state != KFENCE_OBJECT_ALLOCATED)
                        continue;

                check_canary(obj);
        }
}

/*
 * Notifier callback: checks the canaries of all allocated KFENCE objects.
 * @nb, @reason and @arg are unused. Always returns NOTIFY_OK.
 */
static int kfence_check_canary_callback(struct notifier_block *nb,
                                        unsigned long reason, void *arg)
{
        kfence_check_all_canary();
        return NOTIFY_OK;
}

/*
 * Registered on panic_notifier_list by kfence_init_enable() when
 * kfence_check_on_panic is set.
 */
static struct notifier_block kfence_check_canary_notifier = {
        .notifier_call = kfence_check_canary_callback,
};

/* === Allocation Gate Timer ================================================ */

/* Delayed work running toggle_allocation_gate(); set up in kfence_init_enable(). */
static struct delayed_work kfence_timer;

#ifdef CONFIG_KFENCE_STATIC_KEYS
/* Wait queue to wake up allocation-gate timer task. */
static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);

/* irq_work handler: wakes up whoever is waiting on allocation_wait. @work is unused. */
static void wake_up_kfence_timer(struct irq_work *work)
{
        wake_up(&allocation_wait);
}
/* Queued by __kfence_alloc() so that wake_up() is not called from the allocation path. */
static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer);
#endif

/*
 * Set up delayed work, which will enable and disable the static key. We need to
 * use a work queue (rather than a simple timer), since enabling and disabling a
 * static key cannot be done from an interrupt.
 *
 * Note: Toggling a static branch currently causes IPIs, and here we'll end up
 * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with
 * more aggressive sampling intervals), we could get away with a variant that
 * avoids IPIs, at the cost of not immediately capturing allocations if the
 * instructions remain cached.
 *
 * @work is unused. The function returns without re-arming itself when KFENCE
 * is disabled; otherwise it resets kfence_allocation_gate to 0 and re-queues
 * kfence_timer to run again after kfence_sample_interval milliseconds.
 */
static void toggle_allocation_gate(struct work_struct *work)
{
        if (!READ_ONCE(kfence_enabled))
                return;

        /* Open the gate: the next __kfence_alloc() to increment it may allocate. */
        atomic_set(&kfence_allocation_gate, 0);
#ifdef CONFIG_KFENCE_STATIC_KEYS
        /* Enable static key, and await allocation to happen. */
        static_branch_enable(&kfence_allocation_key);

        /* Sleeps until some allocation has made the gate non-zero. */
        wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate));

        /* Disable static key and reset timer. */
        static_branch_disable(&kfence_allocation_key);
#endif
        queue_delayed_work(system_unbound_wq, &kfence_timer,
                           msecs_to_jiffies(kfence_sample_interval));
}

/* === Public interface ===================================================== */

/*
 * Boot-time allocation of the KFENCE pool and its metadata from memblock.
 * Does nothing when kfence_sample_interval is 0. If the metadata cannot be
 * allocated, the pool is released again and __kfence_pool is reset to NULL.
 */
void __init kfence_alloc_pool_and_metadata(void)
{
        if (!kfence_sample_interval)
                return;

        /*
         * If the pool has already been initialized by arch, there is no need to
         * re-allocate the memory pool.
         */
        if (!__kfence_pool) {
                __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
                if (!__kfence_pool) {
                        pr_err("failed to allocate pool\n");
                        return;
                }
        }

        /* The memory allocated by memblock has been zeroed out. */
        kfence_metadata_init = memblock_alloc(KFENCE_METADATA_SIZE, PAGE_SIZE);
        if (kfence_metadata_init)
                return;

        pr_err("failed to allocate metadata\n");
        memblock_free(__kfence_pool, KFENCE_POOL_SIZE);
        __kfence_pool = NULL;
}

/*
 * Switch KFENCE on after the pool has been initialized: prepare the
 * allocation-gate work item, optionally register the panic notifier, mark
 * KFENCE enabled, kick the work item immediately and log the pool range.
 */
static void kfence_init_enable(void)
{
        void *pool_start;
        void *pool_end;

        /* Without static keys the branch is simply left enabled for good. */
        if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS))
                static_branch_enable(&kfence_allocation_key);

        if (!kfence_deferrable)
                INIT_DELAYED_WORK(&kfence_timer, toggle_allocation_gate);
        else
                INIT_DEFERRABLE_WORK(&kfence_timer, toggle_allocation_gate);

        if (kfence_check_on_panic)
                atomic_notifier_chain_register(&panic_notifier_list,
                                               &kfence_check_canary_notifier);

        WRITE_ONCE(kfence_enabled, true);
        queue_delayed_work(system_unbound_wq, &kfence_timer, 0);

        pool_start = (void *)__kfence_pool;
        pool_end = (void *)(__kfence_pool + KFENCE_POOL_SIZE);
        pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
                CONFIG_KFENCE_NUM_OBJECTS, pool_start, pool_end);
}

/*
 * Boot-time entry point: seeds the stack hash, then initializes the early
 * pool and enables KFENCE unless sampling is disabled or pool setup fails.
 */
void __init kfence_init(void)
{
        stack_hash_seed = get_random_u32();

        /* Setting kfence_sample_interval to 0 on boot disables KFENCE. */
        if (!kfence_sample_interval)
                return;

        if (kfence_init_pool_early()) {
                kfence_init_enable();
                return;
        }

        pr_err("%s failed\n", __func__);
}

/*
 * Allocate and initialize the KFENCE pool and metadata at runtime, then
 * enable KFENCE and create its debugfs files.
 *
 * Returns 0 on success, -ENOMEM if the pool or the metadata cannot be
 * allocated, -EINVAL if (without CONFIG_CONTIG_ALLOC) the requested sizes
 * exceed what the buddy allocator can provide, and -EBUSY if
 * kfence_init_pool() fails. On failure everything that may be released is
 * freed, and both __kfence_pool and kfence_metadata_init are reset to NULL.
 */
static int kfence_init_late(void)
{
        const unsigned long nr_pages_pool = KFENCE_POOL_SIZE / PAGE_SIZE;
        const unsigned long nr_pages_meta = KFENCE_METADATA_SIZE / PAGE_SIZE;
        unsigned long addr;
        unsigned long free_size = KFENCE_POOL_SIZE;
        int err = -ENOMEM;

#ifdef CONFIG_CONTIG_ALLOC
        struct page *pages;

        pages = alloc_contig_pages(nr_pages_pool, GFP_KERNEL, first_online_node,
                                   NULL);
        if (!pages)
                return -ENOMEM;

        __kfence_pool = page_to_virt(pages);
        pages = alloc_contig_pages(nr_pages_meta, GFP_KERNEL, first_online_node,
                                   NULL);
        if (pages)
                kfence_metadata_init = page_to_virt(pages);
#else
        if (nr_pages_pool > MAX_ORDER_NR_PAGES ||
            nr_pages_meta > MAX_ORDER_NR_PAGES) {
                pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n");
                return -EINVAL;
        }

        __kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL);
        if (!__kfence_pool)
                return -ENOMEM;

        kfence_metadata_init = alloc_pages_exact(KFENCE_METADATA_SIZE, GFP_KERNEL);
#endif

        /*
         * Record the start of the pool only now that it has been allocated,
         * so that the free_pool path below releases the pool we just
         * obtained rather than whatever __kfence_pool held on entry.
         */
        addr = (unsigned long)__kfence_pool;

        if (!kfence_metadata_init)
                goto free_pool;

        memzero_explicit(kfence_metadata_init, KFENCE_METADATA_SIZE);
        addr = kfence_init_pool();
        if (!addr) {
                kfence_init_enable();
                kfence_debugfs_init();
                return 0;
        }

        /* Only the part of the pool from the failure address onwards is freed. */
        pr_err("%s failed\n", __func__);
        free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
        err = -EBUSY;

#ifdef CONFIG_CONTIG_ALLOC
        free_contig_range(page_to_pfn(virt_to_page((void *)kfence_metadata_init)),
                          nr_pages_meta);
free_pool:
        free_contig_range(page_to_pfn(virt_to_page((void *)addr)),
                          free_size / PAGE_SIZE);
#else
        free_pages_exact((void *)kfence_metadata_init, KFENCE_METADATA_SIZE);
free_pool:
        free_pages_exact((void *)addr, free_size);
#endif

        kfence_metadata_init = NULL;
        __kfence_pool = NULL;
        return err;
}

/*
 * Enable KFENCE at runtime. If a pool already exists, KFENCE is simply
 * re-enabled and 0 is returned; otherwise the result of kfence_init_late()
 * is returned.
 */
static int kfence_enable_late(void)
{
        if (__kfence_pool) {
                WRITE_ONCE(kfence_enabled, true);
                queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
                pr_info("re-enabled\n");
                return 0;
        }

        return kfence_init_late();
}

/*
 * Detach all KFENCE objects from cache @s.
 *
 * First pass: every object still allocated from @s is freed as a "zombie"
 * (marked freed, but not returned to the freelist). Second pass: every freed
 * object still pointing at @s gets its cache pointer cleared. Does nothing
 * if kfence_metadata has not been published yet.
 */
void kfence_shutdown_cache(struct kmem_cache *s)
{
        unsigned long flags;
        struct kfence_metadata *meta;
        int i;

        /* Pairs with release in kfence_init_pool(). */
        if (!smp_load_acquire(&kfence_metadata))
                return;

        for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
                bool in_use;

                meta = &kfence_metadata[i];

                /*
                 * If we observe some inconsistent cache and state pair where we
                 * should have returned false here, cache destruction is racing
                 * with either kmem_cache_alloc() or kmem_cache_free(). Taking
                 * the lock will not help, as different critical section
                 * serialization will have the same outcome.
                 */
                if (READ_ONCE(meta->cache) != s ||
                    READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED)
                        continue;

                /* Re-check under the lock; it is dropped again before freeing. */
                raw_spin_lock_irqsave(&meta->lock, flags);
                in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED;
                raw_spin_unlock_irqrestore(&meta->lock, flags);

                if (in_use) {
                        /*
                         * This cache still has allocations, and we should not
                         * release them back into the freelist so they can still
                         * safely be used and retain the kernel's default
                         * behaviour of keeping the allocations alive (leak the
                         * cache); however, they effectively become "zombie
                         * allocations" as the KFENCE objects are the only ones
                         * still in use and the owning cache is being destroyed.
                         *
                         * We mark them freed, so that any subsequent use shows
                         * more useful error messages that will include stack
                         * traces of the user of the object, the original
                         * allocation, and caller to shutdown_cache().
                         */
                        kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true);
                }
        }

        /* Second pass: drop the reference to @s from objects that are already freed. */
        for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
                meta = &kfence_metadata[i];

                /* See above. */
                if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED)
                        continue;

                raw_spin_lock_irqsave(&meta->lock, flags);
                if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED)
                        meta->cache = NULL;
                raw_spin_unlock_irqrestore(&meta->lock, flags);
        }
}

/*
 * Try to serve an allocation of @size bytes for cache @s from the KFENCE
 * pool.
 *
 * Returns NULL (so that the caller can fall back to the regular allocator)
 * when the request is larger than a page, targets a non-default zone or a
 * DMA cache, the cache opted out via SLAB_SKIP_KFENCE, the allocation gate
 * was already taken, KFENCE is disabled, or the allocation stack is already
 * covered. Otherwise returns the result of kfence_guarded_alloc().
 */
void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
{
        unsigned long stack_entries[KFENCE_STACK_DEPTH];
        size_t num_stack_entries;
        u32 alloc_stack_hash;

        /*
         * Perform size check before switching kfence_allocation_gate, so that
         * we don't disable KFENCE without making an allocation.
         */
        if (size > PAGE_SIZE) {
                atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
                return NULL;
        }

        /*
         * Skip allocations from non-default zones, including DMA. We cannot
         * guarantee that pages in the KFENCE pool will have the requested
         * properties (e.g. reside in DMAable memory).
         */
        if ((flags & GFP_ZONEMASK) ||
            (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
                atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
                return NULL;
        }

        /*
         * Skip allocations for this slab, if KFENCE has been disabled for
         * this slab.
         */
        if (s->flags & SLAB_SKIP_KFENCE)
                return NULL;

        /* Only the caller that moves the gate from 0 to 1 may proceed. */
        if (atomic_inc_return(&kfence_allocation_gate) > 1)
                return NULL;
#ifdef CONFIG_KFENCE_STATIC_KEYS
        /*
         * waitqueue_active() is fully ordered after the update of
         * kfence_allocation_gate per atomic_inc_return().
         */
        if (waitqueue_active(&allocation_wait)) {
                /*
                 * Calling wake_up() here may deadlock when allocations happen
                 * from within timer code. Use an irq_work to defer it.
                 */
                irq_work_queue(&wake_up_kfence_timer_work);
        }
#endif

        if (!READ_ONCE(kfence_enabled))
                return NULL;

        num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);

        /*
         * Do expensive check for coverage of allocation in slow-path after
         * allocation_gate has already become non-zero, even though it might
         * mean not making any allocation within a given sample interval.
         *
         * This ensures reasonable allocation coverage when the pool is almost
         * full, including avoiding long-lived allocations of the same source
         * filling up the pool (e.g. pagecache allocations).
         */
        alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
        if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
                atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
                return NULL;
        }

        return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
                                    alloc_stack_hash);
}

/*
 * Return the recorded size of the KFENCE object that @addr maps to, or 0 if
 * @addr does not map to any object metadata.
 */
size_t kfence_ksize(const void *addr)
{
        const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);

        if (!meta)
                return 0;

        /*
         * Read locklessly -- if there is a race with __kfence_alloc(), this is
         * either a use-after-free or invalid access.
         */
        return meta->size;
}

/*
 * Return the recorded start address of the KFENCE object that @addr maps
 * to, or NULL if @addr does not map to any object metadata.
 */
void *kfence_object_start(const void *addr)
{
        const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);

        if (!meta)
                return NULL;

        /*
         * Read locklessly -- if there is a race with __kfence_alloc(), this is
         * either a use-after-free or invalid access.
         */
        return (void *)meta->addr;
}

/*
 * Free the KFENCE object at @addr, either immediately or, for objects of a
 * SLAB_TYPESAFE_BY_RCU cache, through an RCU callback.
 */
void __kfence_free(void *addr)
{
        struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);

#ifdef CONFIG_MEMCG_KMEM
        KFENCE_WARN_ON(meta->obj_exts.objcg);
#endif
        /*
         * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
         * the object, as the object page may be recycled for other-typed
         * objects once it has been freed. meta->cache may be NULL if the cache
         * was destroyed.
         */
        if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) {
                call_rcu(&meta->rcu_head, rcu_guarded_free);
                return;
        }

        kfence_guarded_free(addr, meta, false);
}

/*
 * Handle a page fault at @addr.
 *
 * Returns false if @addr is not a KFENCE address. If KFENCE is disabled, the
 * page is simply unprotected. Otherwise the fault is counted as a bug and
 * reported: as an out-of-bounds access when it hit a redzone page (odd page
 * index) next to an allocated object, as a use-after-free when it hit an
 * object page with metadata, or as an invalid access when no metadata could
 * be associated. In all handled cases the return value is that of
 * kfence_unprotect(@addr).
 */
bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs)
{
        const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
        struct kfence_metadata *to_report = NULL;
        enum kfence_error_type error_type;
        unsigned long flags;

        if (!is_kfence_address((void *)addr))
                return false;

        if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */
                return kfence_unprotect(addr); /* ... unprotect and proceed. */

        atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);

        if (page_index % 2) {
                /* This is a redzone, report a buffer overflow. */
                struct kfence_metadata *meta;
                int distance = 0;

                /* Candidate 1: the object on the page to the left of the redzone. */
                meta = addr_to_metadata(addr - PAGE_SIZE);
                if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
                        to_report = meta;
                        /* Data race ok; distance calculation approximate. */
                        distance = addr - data_race(meta->addr + meta->size);
                }

                /* Candidate 2: the object on the page to the right; pick the closer one. */
                meta = addr_to_metadata(addr + PAGE_SIZE);
                if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
                        /* Data race ok; distance calculation approximate. */
                        if (!to_report || distance > data_race(meta->addr) - addr)
                                to_report = meta;
                }

                if (!to_report)
                        goto out;

                /* The lock taken here is released after reporting, at "out". */
                raw_spin_lock_irqsave(&to_report->lock, flags);
                to_report->unprotected_page = addr;
                error_type = KFENCE_ERROR_OOB;

                /*
                 * If the object was freed before we took the lock we can still
                 * report this as an OOB -- the report will simply show the
                 * stacktrace of the free as well.
                 */
        } else {
                to_report = addr_to_metadata(addr);
                if (!to_report)
                        goto out;

                /* The lock taken here is released after reporting, at "out". */
                raw_spin_lock_irqsave(&to_report->lock, flags);
                error_type = KFENCE_ERROR_UAF;
                /*
                 * We may race with __kfence_alloc(), and it is possible that a
                 * freed object may be reallocated. We simply report this as a
                 * use-after-free, with the stack trace showing the place where
                 * the object was re-allocated.
                 */
        }

out:
        if (to_report) {
                kfence_report_error(addr, is_write, regs, to_report, error_type);
                raw_spin_unlock_irqrestore(&to_report->lock, flags);
        } else {
                /* This may be a UAF or OOB access, but we can't be sure. */
                kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID);
        }

        return kfence_unprotect(addr); /* Unprotect and let access proceed. */
}






















































    2 









    2 













    2 
    2 


    2 




    2 



















    2 
    2 



















































    2 













1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                IP/TCP/UDP checksumming routines
 *
 * Authors:        Jorge Cwik, <jorge@laser.satlink.net>
 *                Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *                Tom May, <ftom@netcom.com>
 *                Andreas Schwab, <schwab@issan.informatik.uni-dortmund.de>
 *                Lots of code moved from tcp.c and ip.c; see those files
 *                for more names.
 *
 * 03/02/96        Jes Sorensen, Andreas Schwab, Roman Hodek:
 *                Fixed some nasty bugs, causing some horrible crashes.
 *                A: At some points, the sum (%0) was used as
 *                length-counter instead of the length counter
 *                (%1). Thanks to Roman Hodek for pointing this out.
 *                B: GCC seems to mess up if one uses too many
 *                data-registers to hold input values and one tries to
 *                specify d0 and d1 as scratch registers. Letting gcc
 *                choose these registers itself solves the problem.
 */

/* Revised by Kenneth Albanowski for m68knommu. Basic problem: unaligned access
 kills, so most of the assembly has to go. */

#include <linux/export.h>
#include <net/checksum.h>

#include <asm/byteorder.h>

#ifndef do_csum
/* Fold a 32-bit one's-complement accumulator down to 16 bits. */
static inline unsigned short from32to16(unsigned int x)
{
        /* high half + low half: up to 16 bits plus a carry */
        unsigned int folded = (x >> 16) + (x & 0xffff);

        /* absorb that carry as well */
        folded = (folded >> 16) + (folded & 0xffff);
        return folded;
}

/*
 * Sum @len bytes starting at @buff with end-around carry and return the
 * result folded to 16 bits. Returns 0 when @len is not positive.
 *
 * The buffer is consumed as: an optional leading byte (odd start address),
 * an optional 16-bit word to reach 4-byte alignment, a run of 32-bit words,
 * an optional trailing 16-bit word and an optional trailing byte.
 */
static unsigned int do_csum(const unsigned char *buff, int len)
{
        unsigned int acc = 0;
        int misaligned;

        if (len <= 0)
                return acc;

        misaligned = (unsigned long)buff & 1;
        if (misaligned) {
#ifdef __LITTLE_ENDIAN
                acc += (*buff << 8);
#else
                acc = *buff;
#endif
                buff++;
                len--;
        }

        if (len >= 2) {
                if ((unsigned long)buff & 2) {
                        acc += *(unsigned short *)buff;
                        buff += 2;
                        len -= 2;
                }
                if (len >= 4) {
                        /* len >= 4 guarantees at least one pass through the loop */
                        const unsigned char *stop = buff + ((unsigned)len & ~3);
                        unsigned int cy = 0;

                        while (buff < stop) {
                                unsigned int word = *(unsigned int *)buff;

                                buff += 4;
                                acc += cy;
                                acc += word;
                                /* the addition wrapped iff the sum is below the addend */
                                cy = (word > acc);
                        }
                        acc += cy;
                        acc = (acc >> 16) + (acc & 0xffff);
                }
                if (len & 2) {
                        acc += *(unsigned short *)buff;
                        buff += 2;
                }
        }

        if (len & 1) {
#ifdef __LITTLE_ENDIAN
                acc += *buff;
#else
                acc += (*buff << 8);
#endif
        }

        acc = from32to16(acc);
        /* an odd start address shifted every byte by one lane: swap them back */
        if (misaligned)
                acc = ((acc & 0xff) << 8) | ((acc >> 8) & 0xff);

        return acc;
}
#endif

#ifndef ip_fast_csum
/*
 *        Checksum of an IP header: @ihl is the header length in 32-bit
 *        words, so ihl*4 bytes starting at @iph are summed and the
 *        complemented result is returned.
 */
__sum16 ip_fast_csum(const void *iph, unsigned int ihl)
{
        unsigned int sum = do_csum(iph, ihl * 4);

        return (__force __sum16)~sum;
}
EXPORT_SYMBOL(ip_fast_csum);
#endif

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * returns a 32-bit number suitable for feeding into itself
 * or csum_tcpudp_magic
 *
 * this function must be called with even lengths, except
 * for the last fragment, which may be odd
 *
 * it's best to have buff aligned on a 32-bit boundary
 */
__wsum csum_partial(const void *buff, int len, __wsum wsum)
{
        const unsigned int prev = (__force unsigned int)wsum;
        unsigned int total = do_csum(buff, len) + prev;

        /* a wrapped addition leaves total below prev; feed the carry back in */
        total += (total < prev);
        return (__force __wsum)total;
}
EXPORT_SYMBOL(csum_partial);

/*
 * this routine is used for miscellaneous IP-like checksums, mainly
 * in icmp.c: it returns the complemented sum of len bytes at buff
 */
__sum16 ip_compute_csum(const void *buff, int len)
{
        unsigned int sum = do_csum(buff, len);

        return (__force __sum16)~sum;
}
EXPORT_SYMBOL(ip_compute_csum);

#ifndef csum_tcpudp_nofold
/* Fold a 64-bit accumulator into 32 bits with end-around carry. */
static inline u32 from64to32(u64 x)
{
        int pass;

        /*
         * The first pass adds the two 32-bit halves, which may produce a
         * 33-bit value; the second pass folds that carry back in.
         */
        for (pass = 0; pass < 2; pass++)
                x = (x >> 32) + (x & 0xffffffff);

        return (u32)x;
}

/*
 * Add the pseudo-header words (source, destination, protocol and length)
 * to @sum and return the result folded to 32 bits, without the final
 * 16-bit fold.
 */
__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
                          __u32 len, __u8 proto, __wsum sum)
{
        /* accumulate in 64 bits so that no carry is lost before folding */
        unsigned long long acc = (__force u32)sum;

        acc += (unsigned long long)(__force u32)saddr + (__force u32)daddr;
        /*
         * NOTE(review): proto + len (and the shift below) is evaluated in
         * 32 bits before being widened, exactly as before.
         */
#ifdef __BIG_ENDIAN
        acc += proto + len;
#else
        acc += (proto + len) << 8;
#endif
        return (__force __wsum)from64to32(acc);
}
EXPORT_SYMBOL(csum_tcpudp_nofold);
#endif


























































































































































































































































































































































































































































































































































































































    4 











































    4 

    1 
    4 

    2 































    1 








































































    5 

























    5 


































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * VLAN                An implementation of 802.1Q VLAN tagging.
 *
 * Authors:        Ben Greear <greearb@candelatech.com>
 */
#ifndef _LINUX_IF_VLAN_H_
#define _LINUX_IF_VLAN_H_

#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/bug.h>
#include <uapi/linux/if_vlan.h>

#define VLAN_HLEN        4                /* The additional bytes required by VLAN
                                         * (in addition to the Ethernet header)
                                         */
#define VLAN_ETH_HLEN        18                /* Total octets in header.         */
#define VLAN_ETH_ZLEN        64                /* Min. octets in frame sans FCS */

/*
 * According to 802.3ac, the packet can be 4 bytes longer. --Klika Jan
 */
#define VLAN_ETH_DATA_LEN        1500        /* Max. octets in payload         */
#define VLAN_ETH_FRAME_LEN        1518        /* Max. octets in frame sans FCS */

#define VLAN_MAX_DEPTH        8                /* Max. number of nested VLAN tags parsed */

/**
 *        struct vlan_hdr - vlan header
 *        @h_vlan_TCI: priority and VLAN ID
 *        @h_vlan_encapsulated_proto: packet type ID or len
 *
 *        Both fields are stored as __be16.
 */
struct vlan_hdr {
        __be16        h_vlan_TCI;
        __be16        h_vlan_encapsulated_proto;
};

/**
 *        struct vlan_ethhdr - vlan ethernet header (ethhdr + vlan_hdr)
 *        @h_dest: destination ethernet address
 *        @h_source: source ethernet address
 *        @h_vlan_proto: ethernet protocol
 *        @h_vlan_TCI: priority and VLAN ID
 *        @h_vlan_encapsulated_proto: packet type ID or len
 */
struct vlan_ethhdr {
        /* destination and source addresses, wrapped together as "addrs" */
        struct_group(addrs,
                unsigned char        h_dest[ETH_ALEN];
                unsigned char        h_source[ETH_ALEN];
        );
        /* the three __be16 fields below follow the two addresses */
        __be16                h_vlan_proto;
        __be16                h_vlan_TCI;
        __be16                h_vlan_encapsulated_proto;
};

#include <linux/skbuff.h>

/* View the skb's MAC header (as given by skb_mac_header()) as a VLAN
 * ethernet header. No check is made that a VLAN tag is actually there.
 */
static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb)
{
        return (struct vlan_ethhdr *)skb_mac_header(skb);
}

/* Prefer this version in TX path, instead of
 * skb_reset_mac_header() + vlan_eth_hdr()
 */
static inline struct vlan_ethhdr *skb_vlan_eth_hdr(const struct sk_buff *skb)
{
        /* reads skb->data directly, so the MAC header offset is not consulted */
        return (struct vlan_ethhdr *)skb->data;
}

#define VLAN_PRIO_MASK                0xe000 /* Priority Code Point */
#define VLAN_PRIO_SHIFT                13
#define VLAN_CFI_MASK                0x1000 /* Canonical Format Indicator / Drop Eligible Indicator */
#define VLAN_VID_MASK                0x0fff /* VLAN Identifier */
#define VLAN_N_VID                4096

/* found in socket.c */
extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *));

/* True when @dev has the IFF_802_1Q_VLAN private flag set. */
static inline bool is_vlan_dev(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_802_1Q_VLAN) != 0;
}

/*
 * Accessors for the VLAN tag stored in the skb itself:
 *  - skb_vlan_tag_present(): non-zero when skb->vlan_all is non-zero
 *  - skb_vlan_tag_get():     the raw skb->vlan_tci value
 *  - skb_vlan_tag_get_id():  vlan_tci masked with VLAN_VID_MASK
 *  - skb_vlan_tag_get_cfi(): 1 if the VLAN_CFI_MASK bit is set, else 0
 *  - skb_vlan_tag_get_prio(): the VLAN_PRIO_MASK bits shifted down by
 *                             VLAN_PRIO_SHIFT
 */
#define skb_vlan_tag_present(__skb)        (!!(__skb)->vlan_all)
#define skb_vlan_tag_get(__skb)                ((__skb)->vlan_tci)
#define skb_vlan_tag_get_id(__skb)        ((__skb)->vlan_tci & VLAN_VID_MASK)
#define skb_vlan_tag_get_cfi(__skb)        (!!((__skb)->vlan_tci & VLAN_CFI_MASK))
#define skb_vlan_tag_get_prio(__skb)        (((__skb)->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT)

/* Fire NETDEV_CVLAN_FILTER_PUSH_INFO for @dev and return the notifier
 * result converted to an errno. Asserts that RTNL is held.
 */
static inline int vlan_get_rx_ctag_filter_info(struct net_device *dev)
{
        int rc;

        ASSERT_RTNL();
        rc = call_netdevice_notifiers(NETDEV_CVLAN_FILTER_PUSH_INFO, dev);

        return notifier_to_errno(rc);
}

/* Fire NETDEV_CVLAN_FILTER_DROP_INFO for @dev. Asserts that RTNL is held;
 * the notifier result is not propagated.
 */
static inline void vlan_drop_rx_ctag_filter_info(struct net_device *dev)
{
        ASSERT_RTNL();
        call_netdevice_notifiers(NETDEV_CVLAN_FILTER_DROP_INFO, dev);
}

/* Fire NETDEV_SVLAN_FILTER_PUSH_INFO for @dev and return the notifier
 * result converted to an errno. Asserts that RTNL is held.
 */
static inline int vlan_get_rx_stag_filter_info(struct net_device *dev)
{
        int rc;

        ASSERT_RTNL();
        rc = call_netdevice_notifiers(NETDEV_SVLAN_FILTER_PUSH_INFO, dev);

        return notifier_to_errno(rc);
}

/* Fire NETDEV_SVLAN_FILTER_DROP_INFO for @dev. Asserts that RTNL is held;
 * the notifier result is not propagated.
 */
static inline void vlan_drop_rx_stag_filter_info(struct net_device *dev)
{
        ASSERT_RTNL();
        call_netdevice_notifiers(NETDEV_SVLAN_FILTER_DROP_INFO, dev);
}

/**
 *        struct vlan_pcpu_stats - VLAN percpu rx/tx stats
 *        @rx_packets: number of received packets
 *        @rx_bytes: number of received bytes
 *        @rx_multicast: number of received multicast packets
 *        @tx_packets: number of transmitted packets
 *        @tx_bytes: number of transmitted bytes
 *        @syncp: synchronization point for 64bit counters
 *        @rx_errors: number of rx errors
 *        @tx_dropped: number of tx drops
 *
 *        The packet/byte/multicast counters are u64_stats_t; @rx_errors and
 *        @tx_dropped are plain u32 values.
 */
struct vlan_pcpu_stats {
        u64_stats_t                rx_packets;
        u64_stats_t                rx_bytes;
        u64_stats_t                rx_multicast;
        u64_stats_t                tx_packets;
        u64_stats_t                tx_bytes;
        struct u64_stats_sync        syncp;
        u32                        rx_errors;
        u32                        tx_dropped;
};

#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)

extern struct net_device *__vlan_find_dev_deep_rcu(struct net_device *real_dev,
                                               __be16 vlan_proto, u16 vlan_id);
extern int vlan_for_each(struct net_device *dev,
                         int (*action)(struct net_device *dev, int vid,
                                       void *arg), void *arg);
extern struct net_device *vlan_dev_real_dev(const struct net_device *dev);
extern u16 vlan_dev_vlan_id(const struct net_device *dev);
extern __be16 vlan_dev_vlan_proto(const struct net_device *dev);

/**
 *        struct vlan_priority_tci_mapping - vlan egress priority mappings
 *        @priority: skb priority
 *        @vlan_qos: vlan priority: (skb->priority << 13) & 0xE000
 *        @next: next mapping in the same chain, or NULL at the end; the
 *               chains hang off vlan_dev_priv::egress_priority_map[]
 */
struct vlan_priority_tci_mapping {
        u32                                        priority;
        u16                                        vlan_qos;
        struct vlan_priority_tci_mapping        *next;
};

struct proc_dir_entry;
struct netpoll;

/**
 *        struct vlan_dev_priv - VLAN private device data
 *        @nr_ingress_mappings: number of ingress priority mappings
 *        @ingress_priority_map: ingress priority mappings
 *        @nr_egress_mappings: number of egress priority mappings
 *        @egress_priority_map: hash of egress priority mappings
 *        @vlan_proto: VLAN encapsulation protocol
 *        @vlan_id: VLAN identifier
 *        @flags: device flags
 *        @real_dev: underlying netdevice
 *        @dev_tracker: refcount tracker for @real_dev reference
 *        @real_dev_addr: address of underlying netdevice
 *        @dent: proc dir entry
 *        @vlan_pcpu_stats: ptr to percpu rx/tx stats
 *        @netpoll: netpoll pointer, present only with
 *                  CONFIG_NET_POLL_CONTROLLER
 */
struct vlan_dev_priv {
        unsigned int                                nr_ingress_mappings;
        u32                                        ingress_priority_map[8];
        unsigned int                                nr_egress_mappings;
        /* 16 chains, selected by (skb priority & 0xF) on lookup */
        struct vlan_priority_tci_mapping        *egress_priority_map[16];

        __be16                                        vlan_proto;
        u16                                        vlan_id;
        u16                                        flags;

        struct net_device                        *real_dev;
        netdevice_tracker                        dev_tracker;

        unsigned char                                real_dev_addr[ETH_ALEN];

        struct proc_dir_entry                        *dent;
        struct vlan_pcpu_stats __percpu                *vlan_pcpu_stats;
#ifdef CONFIG_NET_POLL_CONTROLLER
        struct netpoll                                *netpoll;
#endif
};

/* Return @dev's private area (netdev_priv()) as a struct vlan_dev_priv.
 * NOTE(review): presumably only meaningful for VLAN devices - confirm
 * callers check is_vlan_dev() first.
 */
static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev)
{
        return netdev_priv(dev);
}

/* Look up the egress QoS bits configured for @skprio on VLAN device @dev.
 * Returns the mapping's vlan_qos value, or 0 when no mapping matches.
 */
static inline u16
vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio)
{
        struct vlan_priority_tci_mapping *entry;

        smp_rmb(); /* coupled with smp_wmb() in vlan_dev_set_egress_priority() */

        /* walk the chain selected by the low four bits of the priority */
        for (entry = vlan_dev_priv(dev)->egress_priority_map[(skprio & 0xF)];
             entry; entry = entry->next) {
                /* vlan_qos should already be shifted to mask correctly
                 * with the VLAN's TCI
                 */
                if (entry->priority == skprio)
                        return entry->vlan_qos;
        }

        return 0;
}

extern bool vlan_do_receive(struct sk_buff **skb);

extern int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid);
extern void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid);

extern int vlan_vids_add_by_dev(struct net_device *dev,
                                const struct net_device *by_dev);
extern void vlan_vids_del_by_dev(struct net_device *dev,
                                 const struct net_device *by_dev);

extern bool vlan_uses_dev(const struct net_device *dev);

#else
/* Stub used when neither CONFIG_VLAN_8021Q nor CONFIG_VLAN_8021Q_MODULE
 * is defined: never finds a device.
 */
static inline struct net_device *
__vlan_find_dev_deep_rcu(struct net_device *real_dev,
                     __be16 vlan_proto, u16 vlan_id)
{
        return NULL;
}

/* Stub used when VLAN support is not configured: @action is never called
 * and 0 is returned.
 */
static inline int
vlan_for_each(struct net_device *dev,
              int (*action)(struct net_device *dev, int vid, void *arg),
              void *arg)
{
        return 0;
}

/* Stub used when VLAN support is not configured: calling it is treated
 * as a bug (BUG()).
 */
static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev)
{
        BUG();
        return NULL;
}

/* Stub used when VLAN support is not configured: calling it is treated
 * as a bug (BUG()).
 */
static inline u16 vlan_dev_vlan_id(const struct net_device *dev)
{
        BUG();
        return 0;
}

/* Stub used when VLAN support is not configured: calling it is treated
 * as a bug (BUG()).
 */
static inline __be16 vlan_dev_vlan_proto(const struct net_device *dev)
{
        BUG();
        return 0;
}

/* Stub used when VLAN support is not configured: no QoS bits, always 0. */
static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev,
                                               u32 skprio)
{
        return 0;
}

/* Stub used when VLAN support is not configured: always returns false
 * and leaves *skb untouched.
 */
static inline bool vlan_do_receive(struct sk_buff **skb)
{
        return false;
}

/* Stub used when VLAN support is not configured: does nothing, returns 0. */
static inline int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid)
{
        return 0;
}

/* Stub used when VLAN support is not configured: intentionally empty. */
static inline void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid)
{
}

/* Stub used when VLAN support is not configured: does nothing, returns 0. */
static inline int vlan_vids_add_by_dev(struct net_device *dev,
                                       const struct net_device *by_dev)
{
        return 0;
}

/* Stub used when VLAN support is not configured: intentionally empty. */
static inline void vlan_vids_del_by_dev(struct net_device *dev,
                                        const struct net_device *by_dev)
{
}

/* Stub used when VLAN support is not configured: always false. */
static inline bool vlan_uses_dev(const struct net_device *dev)
{
        return false;
}
#endif

/**
 * eth_type_vlan - check for valid vlan ether type.
 * @ethertype: ether type to check
 *
 * Returns true if the ether type is a vlan ether type.
 */
static inline bool eth_type_vlan(__be16 ethertype)
{
        /* only the 802.1Q and 802.1ad ether types count as VLAN */
        return ethertype == htons(ETH_P_8021Q) ||
               ethertype == htons(ETH_P_8021AD);
}

/* True when @features contains the TX VLAN offload bit matching @proto:
 * NETIF_F_HW_VLAN_CTAG_TX for 802.1Q, NETIF_F_HW_VLAN_STAG_TX for 802.1ad.
 */
static inline bool vlan_hw_offload_capable(netdev_features_t features,
                                           __be16 proto)
{
        bool ctag_ok = proto == htons(ETH_P_8021Q) &&
                       (features & NETIF_F_HW_VLAN_CTAG_TX);
        bool stag_ok = proto == htons(ETH_P_8021AD) &&
                       (features & NETIF_F_HW_VLAN_STAG_TX);

        return ctag_ok || stag_ok;
}

/**
 * __vlan_insert_inner_tag - inner VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 * @mac_len: MAC header length including outer vlan headers
 *
 * Inserts the VLAN tag into @skb as part of the payload at offset mac_len
 * Returns error if skb_cow_head fails.
 *
 * Does not change skb->protocol so this function can be used during receive.
 */
static inline int __vlan_insert_inner_tag(struct sk_buff *skb,
                                          __be16 vlan_proto, u16 vlan_tci,
                                          unsigned int mac_len)
{
        struct vlan_ethhdr *veth;

        /* any skb_cow_head() failure is reported to the caller as -ENOMEM */
        if (skb_cow_head(skb, VLAN_HLEN) < 0)
                return -ENOMEM;

        /* open a VLAN_HLEN byte gap at the front of the packet */
        skb_push(skb, VLAN_HLEN);

        /* Move the mac header sans proto to the beginning of the new header. */
        /* (for mac_len == ETH_TLEN the copy length would be 0, so skipping
         * it here is equivalent)
         */
        if (likely(mac_len > ETH_TLEN))
                memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN);
        /* keep the recorded MAC header offset pointing at the moved header */
        if (skb_mac_header_was_set(skb))
                skb->mac_header -= VLAN_HLEN;

        /* overlay a vlan_ethhdr so that its tag fields land at the insertion
         * point; with mac_len < ETH_HLEN this pointer lies before skb->data
         * and only the trailing fields are written through it
         */
        veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN);

        /* first, the ethernet type */
        if (likely(mac_len >= ETH_TLEN)) {
                /* h_vlan_encapsulated_proto should already be populated, and
                 * skb->data has space for h_vlan_proto
                 */
                veth->h_vlan_proto = vlan_proto;
        } else {
                /* h_vlan_encapsulated_proto should not be populated, and
                 * skb->data has no space for h_vlan_proto
                 */
                veth->h_vlan_encapsulated_proto = skb->protocol;
        }

        /* now, the TCI */
        veth->h_vlan_TCI = htons(vlan_tci);

        return 0;
}

/**
 * __vlan_insert_tag - regular VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 *
 * Inserts the VLAN tag into @skb as part of the payload
 * Returns error if skb_cow_head fails.
 *
 * Does not change skb->protocol so this function can be used during receive.
 */
static inline int __vlan_insert_tag(struct sk_buff *skb,
                                    __be16 vlan_proto, u16 vlan_tci)
{
        /* same as the inner-tag variant with mac_len fixed to ETH_HLEN */
        return __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN);
}

/**
 * vlan_insert_inner_tag - inner VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 * @mac_len: MAC header length including outer vlan headers
 *
 * Inserts the VLAN tag into @skb as part of the payload at offset mac_len
 * Returns a VLAN tagged skb. This might change skb->head.
 *
 * Following the skb_unshare() example, in case of error, the calling function
 * doesn't have to worry about freeing the original skb.
 *
 * Does not change skb->protocol so this function can be used during receive.
 */
static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb,
                                                    __be16 vlan_proto,
                                                    u16 vlan_tci,
                                                    unsigned int mac_len)
{
        if (!__vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, mac_len))
                return skb;

        /* insertion failed: the skb is freed here so the caller need not */
        dev_kfree_skb_any(skb);
        return NULL;
}

/**
 * vlan_insert_tag - regular VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 *
 * Inserts the VLAN tag into @skb as part of the payload
 * Returns a VLAN tagged skb. This might change skb->head.
 *
 * Following the skb_unshare() example, in case of error, the calling function
 * doesn't have to worry about freeing the original skb.
 *
 * Does not change skb->protocol so this function can be used during receive.
 */
static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
                                              __be16 vlan_proto, u16 vlan_tci)
{
        /* same as vlan_insert_inner_tag() with mac_len fixed to ETH_HLEN */
        return vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN);
}

/**
 * vlan_insert_tag_set_proto - regular VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 *
 * Inserts the VLAN tag into @skb as part of the payload
 * Returns a VLAN tagged skb. This might change skb->head.
 *
 * Following the skb_unshare() example, in case of error, the calling function
 * doesn't have to worry about freeing the original skb.
 */
static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb,
                                                        __be16 vlan_proto,
                                                        u16 vlan_tci)
{
        struct sk_buff *tagged = vlan_insert_tag(skb, vlan_proto, vlan_tci);

        /* on failure vlan_insert_tag() has already disposed of the skb */
        if (!tagged)
                return NULL;

        tagged->protocol = vlan_proto;
        return tagged;
}

/**
 * __vlan_hwaccel_clear_tag - clear hardware accelerated VLAN info
 * @skb: skbuff to clear
 *
 * Clears the VLAN information from @skb
 */
static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb)
{
        /* zeroing vlan_all makes skb_vlan_tag_present() report no tag */
        skb->vlan_all = 0;
}

/**
 * __vlan_hwaccel_copy_tag - copy hardware accelerated VLAN info from another skb
 * @dst: skbuff to copy to
 * @src: skbuff to copy from
 *
 * Copies VLAN information from @src to @dst (for branchless code)
 */
static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src)
{
        /* unconditional copy of vlan_all, whether or not @src carries a tag */
        dst->vlan_all = src->vlan_all;
}

/*
 * __vlan_hwaccel_push_inside - pushes vlan tag to the payload
 * @skb: skbuff to tag
 *
 * Pushes the VLAN tag from @skb->vlan_tci inside to the payload.
 *
 * Following the skb_unshare() example, in case of error, the calling function
 * doesn't have to worry about freeing the original skb.
 */
static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb)
{
        skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
                                        skb_vlan_tag_get(skb));
        if (likely(skb))
                __vlan_hwaccel_clear_tag(skb);
        return skb;
}

/**
 * __vlan_hwaccel_put_tag - hardware accelerated VLAN inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 *
 * Puts the VLAN TCI in @skb->vlan_tci and lets the device do the rest
 */
static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb,
                                          __be16 vlan_proto, u16 vlan_tci)
{
        /* only the skb's tag fields are set; packet data is not modified */
        skb->vlan_proto = vlan_proto;
        skb->vlan_tci = vlan_tci;
}

/**
 * __vlan_get_tag - get the VLAN ID that is part of the payload
 * @skb: skbuff to query
 * @vlan_tci: buffer to store value
 *
 * Returns error if the skb is not of VLAN type
 */
static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
{
        const struct vlan_ethhdr *hdr = skb_vlan_eth_hdr(skb);

        /* @vlan_tci is written only when the header carries a VLAN ethertype */
        if (eth_type_vlan(hdr->h_vlan_proto)) {
                *vlan_tci = ntohs(hdr->h_vlan_TCI);
                return 0;
        }

        return -ENODATA;
}

/**
 * __vlan_hwaccel_get_tag - get the VLAN TCI stored in @skb->vlan_tci
 * @skb: skbuff to query
 * @vlan_tci: buffer to store value
 *
 * Returns 0 and stores the TCI if the skb has a tag present; otherwise
 * stores 0 in @vlan_tci and returns -ENODATA.
 */
static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb,
                                         u16 *vlan_tci)
{
        bool tagged = skb_vlan_tag_present(skb);

        *vlan_tci = tagged ? skb_vlan_tag_get(skb) : 0;

        return tagged ? 0 : -ENODATA;
}

/**
 * vlan_get_tag - get the VLAN ID from the skb
 * @skb: skbuff to query
 * @vlan_tci: buffer to store value
 *
 * Returns error if the skb is not VLAN tagged
 */
static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
{
        /* without CTAG TX offload on skb->dev, read the tag from packet data */
        if (!(skb->dev->features & NETIF_F_HW_VLAN_CTAG_TX))
                return __vlan_get_tag(skb, vlan_tci);

        return __vlan_hwaccel_get_tag(skb, vlan_tci);
}

/**
 * __vlan_get_protocol - get protocol EtherType.
 * @skb: skbuff to query
 * @type: first vlan protocol
 * @depth: buffer to store length of eth and vlan tags in bytes; may be
 *         NULL, and is left untouched when 0 is returned
 *
 * Returns the EtherType of the packet, regardless of whether it is
 * vlan encapsulated (normal or hardware accelerated) or not.
 * Returns 0 if a VLAN header cannot be read from the skb, if the nesting
 * limit derived from VLAN_MAX_DEPTH is reached, or if skb->mac_len is
 * non-zero but smaller than VLAN_HLEN (which also triggers a warning).
 */
static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type,
                                         int *depth)
{
        unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH;

        /* if type is 802.1Q/AD then the header should already be
         * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
         * ETH_HLEN otherwise
         */
        if (eth_type_vlan(type)) {
                if (vlan_depth) {
                        if (WARN_ON(vlan_depth < VLAN_HLEN))
                                return 0;
                        vlan_depth -= VLAN_HLEN;
                } else {
                        vlan_depth = ETH_HLEN;
                }
                /* step over one VLAN header per iteration until the
                 * encapsulated type is no longer a VLAN ethertype
                 */
                do {
                        struct vlan_hdr vhdr, *vh;

                        /* vhdr is the fallback buffer handed to
                         * skb_header_pointer()
                         */
                        vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr);
                        /* parse_depth is only decremented when vh is valid */
                        if (unlikely(!vh || !--parse_depth))
                                return 0;

                        type = vh->h_vlan_encapsulated_proto;
                        vlan_depth += VLAN_HLEN;
                } while (eth_type_vlan(type));
        }

        if (depth)
                *depth = vlan_depth;

        return type;
}

/**
 * vlan_get_protocol - get protocol EtherType.
 * @skb: skbuff to query
 *
 * Returns the EtherType of the packet, regardless of whether it is
 * vlan encapsulated (normal or hardware accelerated) or not.
 */
static inline __be16 vlan_get_protocol(const struct sk_buff *skb)
{
        /* start from skb->protocol; the header depth is not requested */
        return __vlan_get_protocol(skb, skb->protocol, NULL);
}

/* This version of __vlan_get_protocol() also pulls mac header in skb->head */
static inline __be16 vlan_get_protocol_and_depth(struct sk_buff *skb,
                                                 __be16 type, int *depth)
{
        int maclen;

        type = __vlan_get_protocol(skb, type, &maclen);
        if (!type)
                return 0;

        /* the headers that were parsed must be pullable, else report failure */
        if (!pskb_may_pull(skb, maclen))
                return 0;

        if (depth)
                *depth = maclen;

        return type;
}

/* A getter for the SKB protocol field which will handle VLAN tags consistently
 * whether VLAN acceleration is enabled or not.
 */
static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan)
{
        if (skip_vlan)
                return vlan_get_protocol(skb);

        /* VLAN acceleration strips the VLAN header from the skb and
         * moves it to skb->vlan_proto
         */
        if (skb_vlan_tag_present(skb))
                return skb->vlan_proto;

        return skb->protocol;
}

/* Set skb->protocol from the protocol encapsulated by VLAN header @vhdr. */
static inline void vlan_set_encap_proto(struct sk_buff *skb,
                                        struct vlan_hdr *vhdr)
{
        /*
         * Was a VLAN packet, grab the encapsulated protocol, which the layer
         * three protocols care about.
         */
        __be16 inner = vhdr->h_vlan_encapsulated_proto;
        const unsigned short *after_tag;

        if (eth_proto_is_802_3(inner)) {
                skb->protocol = inner;
                return;
        }

        /* otherwise inspect the 16 bits that follow the VLAN header */
        after_tag = (const unsigned short *)(vhdr + 1);

        /*
         * This is a magic hack to spot IPX packets. Older Novell
         * breaks the protocol design and runs IPX over 802.3 without
         * an 802.2 LLC layer. We look for FFFF which isn't a used
         * 802.2 SSAP/DSAP. This won't work for fault tolerant netware
         * but does for the rest. Anything else is real 802.2 LLC.
         */
        skb->protocol = (*after_tag == 0xFFFF) ? htons(ETH_P_802_3)
                                               : htons(ETH_P_802_2);
}

/**
 * vlan_remove_tag - remove outer VLAN tag from payload
 * @skb: skbuff to remove tag from
 * @vlan_tci: buffer to store value
 *
 * Expects the skb to contain a VLAN tag in the payload, and to have skb->data
 * pointing at the MAC header.
 *
 * Returns a new pointer to skb->data, or NULL on failure to pull.
 */
static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci)
{
        /* the VLAN header sits ETH_HLEN bytes into the packet data */
        struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);

        /* read the TCI before the addresses are shifted below.
         * NOTE(review): with the usual ETH_ALEN/ETH_HLEN values the memmove
         * destination overlaps the TCI bytes - confirm.
         */
        *vlan_tci = ntohs(vhdr->h_vlan_TCI);

        /* slide both MAC addresses forward by VLAN_HLEN bytes */
        memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
        /* derive skb->protocol from the encapsulated protocol field */
        vlan_set_encap_proto(skb, vhdr);
        /* drop the now-unused leading VLAN_HLEN bytes */
        return __skb_pull(skb, VLAN_HLEN);
}

/**
 * skb_vlan_tagged - check if skb is vlan tagged.
 * @skb: skbuff to query
 *
 * Returns true if the skb is tagged, regardless of whether it is hardware
 * accelerated or not.
 */
static inline bool skb_vlan_tagged(const struct sk_buff *skb)
{
        /* tagged if the skb carries a tag itself, or (the unlikely case)
         * if skb->protocol is a VLAN ethertype
         */
        return skb_vlan_tag_present(skb) ||
               unlikely(eth_type_vlan(skb->protocol));
}

/**
 * skb_vlan_tagged_multi - check if skb is vlan tagged with multiple headers.
 * @skb: skbuff to query
 *
 * Returns true if the skb is tagged with multiple vlan headers, regardless
 * of whether it is hardware accelerated or not.
 */
static inline bool skb_vlan_tagged_multi(struct sk_buff *skb)
{
        __be16 protocol = skb->protocol;

        if (!skb_vlan_tag_present(skb)) {
                struct vlan_ethhdr *veh;

                if (likely(!eth_type_vlan(protocol)))
                        return false;

                if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
                        return false;

                veh = skb_vlan_eth_hdr(skb);
                protocol = veh->h_vlan_encapsulated_proto;
        }

        if (!eth_type_vlan(protocol))
                return false;

        return true;
}

/**
 * vlan_features_check - drop unsafe features for skb with multiple tags.
 * @skb: skbuff to query
 * @features: features to be checked
 *
 * Returns features without unsafe ones if the skb has multiple tags.
 */
static inline netdev_features_t vlan_features_check(struct sk_buff *skb,
                                                    netdev_features_t features)
{
        if (!skb_vlan_tagged_multi(skb))
                return features;

        /* In the case of multi-tagged packets, use a direct mask
         * instead of using netdev_intersect_features(), to make
         * sure that only devices supporting NETIF_F_HW_CSUM will
         * have checksum offloading support.
         */
        return features & (NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM |
                           NETIF_F_FRAGLIST | NETIF_F_HW_VLAN_CTAG_TX |
                           NETIF_F_HW_VLAN_STAG_TX);
}

/**
 * compare_vlan_header - Compare two vlan headers
 * @h1: Pointer to vlan header
 * @h2: Pointer to vlan header
 *
 * Compare two vlan headers, returns 0 if equal.
 *
 * Please note that alignment of h1 & h2 are only guaranteed to be 16 bits.
 */
static inline unsigned long compare_vlan_header(const struct vlan_hdr *h1,
                                                const struct vlan_hdr *h2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        return *(u32 *)h1 ^ *(u32 *)h2;
#else
        return ((__force u32)h1->h_vlan_TCI ^ (__force u32)h2->h_vlan_TCI) |
               ((__force u32)h1->h_vlan_encapsulated_proto ^
                (__force u32)h2->h_vlan_encapsulated_proto);
#endif
}
#endif /* !(_LINUX_IF_VLAN_H_) */




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PATH_H
#define _LINUX_PATH_H

struct dentry;
struct vfsmount;

/*
 * struct path - a (vfsmount, dentry) pair.
 * Two paths are equal when both pointers match (see path_equal()).
 */
struct path {
        struct vfsmount *mnt;
        struct dentry *dentry;
} __randomize_layout;

extern void path_get(const struct path *);
extern void path_put(const struct path *);

/* Returns 1 when both paths have the same mnt and dentry, 0 otherwise. */
static inline int path_equal(const struct path *path1, const struct path *path2)
{
        if (path1->mnt != path2->mnt)
                return 0;

        return path1->dentry == path2->dentry;
}

static inline void path_put_init(struct path *path)
{
        path_put(path);
        *path = (struct path) { };
}

#endif  /* _LINUX_PATH_H */








































    1 
    1 













    1 




    1 






































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2019, Tessares SA.
 */

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#include <net/net_namespace.h>
#include <net/netns/generic.h>

#include "protocol.h"

#define MPTCP_SYSCTL_PATH "net/mptcp"

static int mptcp_pernet_id;

#ifdef CONFIG_SYSCTL
static int mptcp_pm_type_max = __MPTCP_PM_TYPE_MAX;
#endif

/* Per-network-namespace MPTCP settings, looked up through mptcp_pernet_id.
 * Defaults are filled in by mptcp_pernet_set_defaults(); with CONFIG_SYSCTL
 * the fields are exposed under MPTCP_SYSCTL_PATH.
 */
struct mptcp_pernet {
#ifdef CONFIG_SYSCTL
        /* sysctl registration handle, needed to unregister the table */
        struct ctl_table_header *ctl_table_hdr;
#endif

        unsigned int add_addr_timeout;          /* "add_addr_timeout" sysctl */
        unsigned int close_timeout;             /* "close_timeout" sysctl */
        unsigned int stale_loss_cnt;            /* "stale_loss_cnt" sysctl */
        u8 mptcp_enabled;                       /* "enabled" sysctl, 0 or 1 */
        u8 checksum_enabled;                    /* "checksum_enabled" sysctl, 0 or 1 */
        u8 allow_join_initial_addr_port;        /* 0 or 1 */
        u8 pm_type;                             /* "pm_type" sysctl */
        char scheduler[MPTCP_SCHED_NAME_MAX];   /* selected scheduler name */
};

/* Look up the MPTCP per-netns data attached to @net. */
static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
{
        struct mptcp_pernet *pernet = net_generic(net, mptcp_pernet_id);

        return pernet;
}

int mptcp_is_enabled(const struct net *net)
{
        return mptcp_get_pernet(net)->mptcp_enabled;
}

unsigned int mptcp_get_add_addr_timeout(const struct net *net)
{
        return mptcp_get_pernet(net)->add_addr_timeout;
}

int mptcp_is_checksum_enabled(const struct net *net)
{
        return mptcp_get_pernet(net)->checksum_enabled;
}

int mptcp_allow_join_id0(const struct net *net)
{
        return mptcp_get_pernet(net)->allow_join_initial_addr_port;
}

unsigned int mptcp_stale_loss_cnt(const struct net *net)
{
        return mptcp_get_pernet(net)->stale_loss_cnt;
}

unsigned int mptcp_close_timeout(const struct sock *sk)
{
        if (sock_flag(sk, SOCK_DEAD))
                return TCP_TIMEWAIT_LEN;
        return mptcp_get_pernet(sock_net(sk))->close_timeout;
}

int mptcp_get_pm_type(const struct net *net)
{
        return mptcp_get_pernet(net)->pm_type;
}

const char *mptcp_get_scheduler(const struct net *net)
{
        return mptcp_get_pernet(net)->scheduler;
}

/* Initialise every tunable in @pernet to its built-in default value. */
static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
        /* Timers and counters. */
        pernet->add_addr_timeout = TCP_RTO_MAX;
        pernet->close_timeout = TCP_TIMEWAIT_LEN;
        pernet->stale_loss_cnt = 4;

        /* Boolean switches. */
        pernet->mptcp_enabled = 1;
        pernet->checksum_enabled = 0;
        pernet->allow_join_initial_addr_port = 1;

        /* Path manager and packet scheduler selection. */
        pernet->pm_type = MPTCP_PM_TYPE_KERNEL;
        strscpy(pernet->scheduler, "default", sizeof(pernet->scheduler));
}

#ifdef CONFIG_SYSCTL
/* Store @name into the per-netns @scheduler buffer (MPTCP_SCHED_NAME_MAX
 * bytes) if mptcp_sched_find() knows a scheduler with that name.
 *
 * Returns 0 on success, or -ENOENT when no such scheduler is found, in which
 * case @scheduler is left untouched.
 */
static int mptcp_set_scheduler(char *scheduler, const char *name)
{
        struct mptcp_sched_ops *sched;
        int ret = 0;

        /* The lookup is done under the RCU read lock. */
        rcu_read_lock();
        sched = mptcp_sched_find(name);
        if (sched)
                strscpy(scheduler, name, MPTCP_SCHED_NAME_MAX);
        else
                ret = -ENOENT;
        rcu_read_unlock();

        return ret;
}

/* sysctl handler for "scheduler".
 *
 * The per-netns buffer is taken from @ctl->data, which
 * mptcp_pernet_new_table() points at the owning netns's scheduler field.
 * It is deliberately not derived from current->nsproxy: that pointer can be
 * NULL (e.g. for an exiting task) and may refer to a different netns than
 * the one the sysctl file belongs to.
 *
 * Reads and writes go through a local copy so that a written name is only
 * committed after mptcp_set_scheduler() has validated it.
 *
 * Returns 0 on success or a negative error code.
 */
static int proc_scheduler(struct ctl_table *ctl, int write,
                          void *buffer, size_t *lenp, loff_t *ppos)
{
        char (*scheduler)[MPTCP_SCHED_NAME_MAX] = ctl->data;
        char val[MPTCP_SCHED_NAME_MAX];
        struct ctl_table tbl = {
                .data = val,
                .maxlen = MPTCP_SCHED_NAME_MAX,
        };
        int ret;

        strscpy(val, *scheduler, MPTCP_SCHED_NAME_MAX);

        ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
        if (write && ret == 0)
                ret = mptcp_set_scheduler(*scheduler, val);

        return ret;
}

/* sysctl handler for "available_schedulers".
 *
 * Fills a temporary MPTCP_SCHED_BUF_MAX-byte buffer through
 * mptcp_get_available_schedulers() and hands it to proc_dostring(); the
 * buffer is freed before returning.
 *
 * Returns -ENOMEM if the buffer cannot be allocated, otherwise the
 * proc_dostring() result.
 */
static int proc_available_schedulers(struct ctl_table *ctl,
                                     int write, void *buffer,
                                     size_t *lenp, loff_t *ppos)
{
        struct ctl_table tbl = { .maxlen = MPTCP_SCHED_BUF_MAX, };
        char *names;
        int ret;

        names = kmalloc(MPTCP_SCHED_BUF_MAX, GFP_USER);
        if (!names)
                return -ENOMEM;
        tbl.data = names;

        mptcp_get_available_schedulers(names, MPTCP_SCHED_BUF_MAX);
        ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
        kfree(names);

        return ret;
}

/* Template for the per-netns MPTCP sysctl entries.
 *
 * The entry order matters: mptcp_pernet_new_table() fills in the .data
 * pointers by array index, so keep both in sync when adding entries.
 */
static struct ctl_table mptcp_sysctl_table[] = {
        {       /* [0] */
                .procname = "enabled",
                .maxlen = sizeof(u8),
                .mode = 0644,
                /* users with CAP_NET_ADMIN or root (not and) can change this
                 * value, same as other sysctls of the 'net' tree.
                 */
                .proc_handler = proc_dou8vec_minmax,
                .extra1       = SYSCTL_ZERO,
                .extra2       = SYSCTL_ONE
        },
        {       /* [1] */
                .procname = "add_addr_timeout",
                .maxlen = sizeof(unsigned int),
                .mode = 0644,
                .proc_handler = proc_dointvec_jiffies,
        },
        {       /* [2] */
                .procname = "checksum_enabled",
                .maxlen = sizeof(u8),
                .mode = 0644,
                .proc_handler = proc_dou8vec_minmax,
                .extra1       = SYSCTL_ZERO,
                .extra2       = SYSCTL_ONE
        },
        {       /* [3] */
                .procname = "allow_join_initial_addr_port",
                .maxlen = sizeof(u8),
                .mode = 0644,
                .proc_handler = proc_dou8vec_minmax,
                .extra1       = SYSCTL_ZERO,
                .extra2       = SYSCTL_ONE
        },
        {       /* [4] */
                .procname = "stale_loss_cnt",
                .maxlen = sizeof(unsigned int),
                .mode = 0644,
                .proc_handler = proc_douintvec_minmax,
        },
        {       /* [5] */
                .procname = "pm_type",
                .maxlen = sizeof(u8),
                .mode = 0644,
                .proc_handler = proc_dou8vec_minmax,
                .extra1       = SYSCTL_ZERO,
                .extra2       = &mptcp_pm_type_max
        },
        {       /* [6] */
                .procname = "scheduler",
                .maxlen = MPTCP_SCHED_NAME_MAX,
                .mode = 0644,
                .proc_handler = proc_scheduler,
        },
        {       /* [7] */
                .procname = "available_schedulers",
                .maxlen = MPTCP_SCHED_BUF_MAX,
                /* Read-only info: the handler builds the list in a temporary
                 * buffer, so a write could never take effect. Do not
                 * advertise write permission.
                 */
                .mode = 0444,
                .proc_handler = proc_available_schedulers,
        },
        {       /* [8] */
                .procname = "close_timeout",
                .maxlen = sizeof(unsigned int),
                .mode = 0644,
                .proc_handler = proc_dointvec_jiffies,
        },
};

static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
{
        struct ctl_table_header *hdr;
        struct ctl_table *table;

        table = mptcp_sysctl_table;
        if (!net_eq(net, &init_net)) {
                table = kmemdup(table, sizeof(mptcp_sysctl_table), GFP_KERNEL);
                if (!table)
                        goto err_alloc;
        }

        table[0].data = &pernet->mptcp_enabled;
        table[1].data = &pernet->add_addr_timeout;
        table[2].data = &pernet->checksum_enabled;
        table[3].data = &pernet->allow_join_initial_addr_port;
        table[4].data = &pernet->stale_loss_cnt;
        table[5].data = &pernet->pm_type;
        table[6].data = &pernet->scheduler;
        /* table[7] is for available_schedulers which is read-only info */
        table[8].data = &pernet->close_timeout;

        hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table,
                                     ARRAY_SIZE(mptcp_sysctl_table));
        if (!hdr)
                goto err_reg;

        pernet->ctl_table_hdr = hdr;

        return 0;

err_reg:
        if (!net_eq(net, &init_net))
                kfree(table);
err_alloc:
        return -ENOMEM;
}

static void mptcp_pernet_del_table(struct mptcp_pernet *pernet)
{
        const struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg;

        unregister_net_sysctl_table(pernet->ctl_table_hdr);

        kfree(table);
}

#else

/* !CONFIG_SYSCTL stub: there is no sysctl table to register, always succeeds. */
static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
{
        return 0;
}

/* !CONFIG_SYSCTL stub: nothing was registered, so nothing to tear down. */
static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) {}

#endif /* CONFIG_SYSCTL */

/* Per-netns init: load the default settings, then set up the sysctl table.
 * Returns the result of mptcp_pernet_new_table().
 */
static int __net_init mptcp_net_init(struct net *net)
{
        struct mptcp_pernet *pernet = mptcp_get_pernet(net);
        int err;

        mptcp_pernet_set_defaults(pernet);
        err = mptcp_pernet_new_table(net, pernet);

        return err;
}

/* Per-netns exit: tear down the sysctl table of @net.
 * Note: the callback will only be called per extra netns.
 */
static void __net_exit mptcp_net_exit(struct net *net)
{
        mptcp_pernet_del_table(mptcp_get_pernet(net));
}

/* Per-netns operations registered from mptcp_init(): init/exit hooks plus
 * the id slot and size used for the struct mptcp_pernet storage.
 */
static struct pernet_operations mptcp_pernet_ops = {
        .init = mptcp_net_init,
        .exit = mptcp_net_exit,
        .id = &mptcp_pernet_id,
        .size = sizeof(struct mptcp_pernet),
};

/* Boot-time MPTCP initialisation. Failing to register the pernet subsystem
 * is treated as fatal.
 */
void __init mptcp_init(void)
{
        int err;

        mptcp_join_cookie_init();
        mptcp_proto_init();

        err = register_pernet_subsys(&mptcp_pernet_ops);
        if (err < 0)
                panic("Failed to register MPTCP pernet subsystem.\n");
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* IPv6 part of the MPTCP initialisation; returns mptcp_proto_v6_init()'s result. */
int __init mptcpv6_init(void)
{
        return mptcp_proto_v6_init();
}
#endif






















































    1 







    1 

    1 






    1 















    1 


































    1 





    1 






    1 








































    1 



    1 






    1 

















    1 










    1 




























    1 
    1 












































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
// SPDX-License-Identifier: GPL-2.0
/*
 * fs/sysfs/group.c - Operations for adding/removing multiple files at once.
 *
 * Copyright (c) 2003 Patrick Mochel
 * Copyright (c) 2003 Open Source Development Lab
 * Copyright (c) 2013 Greg Kroah-Hartman
 * Copyright (c) 2013 The Linux Foundation
 */

#include <linux/kobject.h>
#include <linux/module.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/err.h>
#include <linux/fs.h>
#include "sysfs.h"


/* Remove, by name, every attribute and binary attribute of @grp from @parent. */
static void remove_files(struct kernfs_node *parent,
                         const struct attribute_group *grp)
{
        struct attribute *const *a = grp->attrs;
        struct bin_attribute *const *b = grp->bin_attrs;

        /* Both arrays are optional and NULL-terminated. */
        if (a) {
                while (*a) {
                        kernfs_remove_by_name(parent, (*a)->name);
                        a++;
                }
        }

        if (b) {
                while (*b) {
                        kernfs_remove_by_name(parent, (*b)->attr.name);
                        b++;
                }
        }
}

/* Return the visibility callback result for the first entry of @grp: the
 * first attribute via is_visible() if both exist, else the first binary
 * attribute via is_bin_visible() if both exist, else 0.
 */
static umode_t __first_visible(const struct attribute_group *grp, struct kobject *kobj)
{
        const int has_attr = grp->attrs && grp->attrs[0];
        int has_bin_attr;

        if (has_attr && grp->is_visible)
                return grp->is_visible(kobj, grp->attrs[0], 0);

        has_bin_attr = grp->bin_attrs && grp->bin_attrs[0];
        if (has_bin_attr && grp->is_bin_visible)
                return grp->is_bin_visible(kobj, grp->bin_attrs[0], 0);

        return 0;
}

/*
 * Create the files of @grp (attributes first, then binary attributes) under
 * @parent, owned by @uid/@gid.
 *
 * When @update is non-zero each file is removed by name before it is
 * (possibly) re-added. Entries for which the group's visibility callback
 * yields a zero mode (after masking SYSFS_GROUP_INVISIBLE) are skipped.
 *
 * Returns 0 on success. On the first failure, remove_files() is called for
 * the whole group and the error is returned.
 */
static int create_files(struct kernfs_node *parent, struct kobject *kobj,
                        kuid_t uid, kgid_t gid,
                        const struct attribute_group *grp, int update)
{
        struct attribute *const *attr;
        struct bin_attribute *const *bin_attr;
        int error = 0, i;

        if (grp->attrs) {
                /* i is the attribute index handed to is_visible() */
                for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
                        umode_t mode = (*attr)->mode;

                        /*
                         * In update mode, we're changing the permissions or
                         * visibility.  Do this by first removing then
                         * re-adding (if required) the file.
                         */
                        if (update)
                                kernfs_remove_by_name(parent, (*attr)->name);
                        if (grp->is_visible) {
                                /* the callback's mode replaces the static one */
                                mode = grp->is_visible(kobj, *attr, i);
                                mode &= ~SYSFS_GROUP_INVISIBLE;
                                if (!mode)
                                        continue;
                        }

                        /* warn about mode bits outside the allowed set ... */
                        WARN(mode & ~(SYSFS_PREALLOC | 0664),
                             "Attribute %s: Invalid permissions 0%o\n",
                             (*attr)->name, mode);

                        /* ... and drop them before creating the file */
                        mode &= SYSFS_PREALLOC | 0664;
                        error = sysfs_add_file_mode_ns(parent, *attr, mode, uid,
                                                       gid, NULL);
                        if (unlikely(error))
                                break;
                }
                if (error) {
                        remove_files(parent, grp);
                        goto exit;
                }
        }

        if (grp->bin_attrs) {
                /* same procedure for the binary attributes */
                for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) {
                        umode_t mode = (*bin_attr)->attr.mode;

                        if (update)
                                kernfs_remove_by_name(parent,
                                                (*bin_attr)->attr.name);
                        if (grp->is_bin_visible) {
                                mode = grp->is_bin_visible(kobj, *bin_attr, i);
                                mode &= ~SYSFS_GROUP_INVISIBLE;
                                if (!mode)
                                        continue;
                        }

                        WARN(mode & ~(SYSFS_PREALLOC | 0664),
                             "Attribute %s: Invalid permissions 0%o\n",
                             (*bin_attr)->attr.name, mode);

                        mode &= SYSFS_PREALLOC | 0664;
                        error = sysfs_add_bin_file_mode_ns(parent, *bin_attr,
                                                           mode, uid, gid,
                                                           NULL);
                        if (error)
                                break;
                }
                /* a failure here also removes the plain attributes added above */
                if (error)
                        remove_files(parent, grp);
        }
exit:
        return error;
}


/*
 * Create (@update == 0) or update (@update != 0) attribute group @grp on
 * @kobj.
 *
 * For a named group the files live in a sub-directory called grp->name; for
 * an unnamed group they are placed directly in kobj->sd. A named group whose
 * first entry is reported as SYSFS_GROUP_INVISIBLE gets no directory at all:
 * it is skipped on create and removed on update.
 *
 * Returns 0 on success (including the "nothing to do" cases), -EINVAL when
 * @kobj is NULL or has no sysfs directory, or the error from directory/file
 * creation.
 */
static int internal_create_group(struct kobject *kobj, int update,
                                 const struct attribute_group *grp)
{
        struct kernfs_node *kn;
        kuid_t uid;
        kgid_t gid;
        int error;

        /* creating on a kobject without a sysfs directory is a caller bug */
        if (WARN_ON(!kobj || (!update && !kobj->sd)))
                return -EINVAL;

        /* Updates may happen before the object has been instantiated */
        if (unlikely(update && !kobj->sd))
                return -EINVAL;

        if (!grp->attrs && !grp->bin_attrs) {
                pr_debug("sysfs: (bin_)attrs not set by subsystem for group: %s/%s, skipping\n",
                         kobj->name, grp->name ?: "");
                return 0;
        }

        kobject_get_ownership(kobj, &uid, &gid);
        if (grp->name) {
                umode_t mode = __first_visible(grp, kobj);

                /* mode == 0 from here on means "group directory is hidden" */
                if (mode & SYSFS_GROUP_INVISIBLE)
                        mode = 0;
                else
                        mode = S_IRWXU | S_IRUGO | S_IXUGO;

                if (update) {
                        /* takes a reference on kn when the directory exists */
                        kn = kernfs_find_and_get(kobj->sd, grp->name);
                        if (!kn) {
                                pr_debug("attr grp %s/%s not created yet\n",
                                         kobj->name, grp->name);
                                /* may have been invisible prior to this update */
                                update = 0;
                        } else if (!mode) {
                                /* group became invisible: drop it entirely */
                                sysfs_remove_group(kobj, grp);
                                kernfs_put(kn);
                                return 0;
                        }
                }

                if (!update) {
                        if (!mode)
                                return 0;
                        kn = kernfs_create_dir_ns(kobj->sd, grp->name, mode,
                                                  uid, gid, kobj, NULL);
                        if (IS_ERR(kn)) {
                                if (PTR_ERR(kn) == -EEXIST)
                                        sysfs_warn_dup(kobj->sd, grp->name);
                                return PTR_ERR(kn);
                        }
                }
        } else {
                kn = kobj->sd;
        }

        /* hold kn across file creation and the possible removal below */
        kernfs_get(kn);
        error = create_files(kn, kobj, uid, gid, grp, update);
        if (error) {
                if (grp->name)
                        kernfs_remove(kn);
        }
        kernfs_put(kn);

        /* drop the reference taken by kernfs_find_and_get() in update mode */
        if (grp->name && update)
                kernfs_put(kn);

        return error;
}

/**
 * sysfs_create_group - given a directory kobject, create an attribute group
 * @kobj:        The kobject to create the group on
 * @grp:        The attribute group to create
 *
 * Creates the group for the first time (non-update mode).  It will
 * explicitly warn and error if any of the attribute files being created
 * already exist.
 *
 * Returns 0 on success or error code on failure.
 */
int sysfs_create_group(struct kobject *kobj,
                       const struct attribute_group *grp)
{
        int error = internal_create_group(kobj, 0, grp);

        return error;
}
EXPORT_SYMBOL_GPL(sysfs_create_group);

/*
 * Create or update every group of the NULL-terminated @groups array.
 * A NULL @groups is a no-op. If one group fails, all groups handled before
 * it are removed again (in reverse order) and the error is returned.
 */
static int internal_create_groups(struct kobject *kobj, int update,
                                  const struct attribute_group **groups)
{
        int error;
        int i;

        if (!groups)
                return 0;

        for (i = 0; groups[i]; i++) {
                error = internal_create_group(kobj, update, groups[i]);
                if (!error)
                        continue;

                /* unwind groups[i - 1] .. groups[0] */
                while (--i >= 0)
                        sysfs_remove_group(kobj, groups[i]);
                return error;
        }

        return 0;
}

/**
 * sysfs_create_groups - given a directory kobject, create a bunch of attribute groups
 * @kobj:        The kobject to create the group on
 * @groups:        The attribute groups to create, NULL terminated
 *
 * Creates each attribute group in turn.  If creating a group fails, all
 * previously created groups are removed again, unwinding everything back to
 * the state when this function was called.  It will explicitly warn and
 * error if any of the attribute files being created already exist.
 *
 * Returns 0 on success or error code from sysfs_create_group on failure.
 */
int sysfs_create_groups(struct kobject *kobj,
                        const struct attribute_group **groups)
{
        int error = internal_create_groups(kobj, 0, groups);

        return error;
}
EXPORT_SYMBOL_GPL(sysfs_create_groups);

/**
 * sysfs_update_groups - given a directory kobject, update a bunch of attribute groups
 * @kobj:        The kobject to update the group on
 * @groups:        The attribute groups to update, NULL terminated
 *
 * Updates each attribute group in turn.  If updating a group fails, all
 * previously updated groups are removed, together with their already
 * existing (not updated) attributes.
 *
 * Returns 0 on success or error code from sysfs_update_group on failure.
 */
int sysfs_update_groups(struct kobject *kobj,
                        const struct attribute_group **groups)
{
        int error = internal_create_groups(kobj, 1, groups);

        return error;
}
EXPORT_SYMBOL_GPL(sysfs_update_groups);

/**
 * sysfs_update_group - given a directory kobject, update an attribute group
 * @kobj:        The kobject to update the group on
 * @grp:        The attribute group to update
 *
 * Updates an attribute group.  Unlike sysfs_create_group(), it will
 * explicitly not warn or error if any of the attribute files being created
 * already exist.  Furthermore, if the visibility of the files has changed
 * through the is_visible() callback, it will update the permissions and add
 * or remove the relevant files.  Changing a group's name (subdirectory name
 * under kobj's directory in sysfs) is not allowed.
 *
 * The primary use of this function is to call it after making a change that
 * affects group visibility.
 *
 * Returns 0 on success or error code on failure.
 */
int sysfs_update_group(struct kobject *kobj,
                       const struct attribute_group *grp)
{
        int error = internal_create_group(kobj, 1, grp);

        return error;
}
EXPORT_SYMBOL_GPL(sysfs_update_group);

/**
 * sysfs_remove_group - remove a group from a kobject
 * @kobj:        kobject to remove the group from
 * @grp:        group to remove
 *
 * Removes the attribute files of @grp from @kobj and, for a named group,
 * the group directory itself.  A named group whose directory cannot be
 * found is only reported via pr_debug() and otherwise ignored.
 */
void sysfs_remove_group(struct kobject *kobj,
                        const struct attribute_group *grp)
{
        struct kernfs_node *parent = kobj->sd;
        struct kernfs_node *kn;

        if (!grp->name) {
                /* unnamed group: the files sit directly in the kobject dir */
                kn = parent;
                kernfs_get(kn);
        } else {
                kn = kernfs_find_and_get(parent, grp->name);
                if (!kn) {
                        pr_debug("sysfs group '%s' not found for kobject '%s'\n",
                                 grp->name, kobject_name(kobj));
                        return;
                }
        }

        remove_files(kn, grp);
        if (grp->name)
                kernfs_remove(kn);

        /* balance the reference taken in either branch above */
        kernfs_put(kn);
}
EXPORT_SYMBOL_GPL(sysfs_remove_group);

/**
 * sysfs_remove_groups - remove a list of groups
 *
 * @kobj:        The kobject for the groups to be removed from
 * @groups:        NULL terminated list of groups to be removed
 *
 * If groups is not NULL, remove the specified groups from the kobject.
 */
void sysfs_remove_groups(struct kobject *kobj,
                         const struct attribute_group **groups)
{
        const struct attribute_group **grp;

        if (!groups)
                return;

        for (grp = groups; *grp; grp++)
                sysfs_remove_group(kobj, *grp);
}
EXPORT_SYMBOL_GPL(sysfs_remove_groups);

/**
 * sysfs_merge_group - merge files into a pre-existing named attribute group.
 * @kobj:        The kobject containing the group.
 * @grp:        The files to create and the attribute group they belong to.
 *
 * Returns 0 on success, -ENOENT if the named group directory does not exist,
 * or the error from adding a file (e.g. because it already exists in that
 * group).  On failure, the files added by this call are removed again, so
 * none of the new files remain; files that existed before the call are left
 * untouched.
 */
int sysfs_merge_group(struct kobject *kobj,
                       const struct attribute_group *grp)
{
        struct kernfs_node *parent;
        kuid_t uid;
        kgid_t gid;
        int error = 0;
        struct attribute *const *attr;
        int i;

        parent = kernfs_find_and_get(kobj->sd, grp->name);
        if (!parent)
                return -ENOENT;

        kobject_get_ownership(kobj, &uid, &gid);

        for (i = 0, attr = grp->attrs; *attr; ++i, ++attr) {
                error = sysfs_add_file_mode_ns(parent, *attr, (*attr)->mode,
                                               uid, gid, NULL);
                /* Stop with i/attr still designating the entry that failed. */
                if (error)
                        break;
        }
        if (error) {
                /*
                 * Roll back entries [0, i) only.  The failing entry was not
                 * added by this call, so removing it by name could delete a
                 * pre-existing file of the same name (the -EEXIST case).
                 */
                while (--i >= 0)
                        kernfs_remove_by_name(parent, (*--attr)->name);
        }
        kernfs_put(parent);

        return error;
}
EXPORT_SYMBOL_GPL(sysfs_merge_group);

/**
 * sysfs_unmerge_group - remove files from a pre-existing named attribute group.
 * @kobj:        The kobject containing the group.
 * @grp:        The files to remove and the attribute group they belong to.
 *
 * Does nothing if the named group directory cannot be found.
 */
void sysfs_unmerge_group(struct kobject *kobj,
                       const struct attribute_group *grp)
{
        struct attribute *const *attr = grp->attrs;
        struct kernfs_node *parent;

        parent = kernfs_find_and_get(kobj->sd, grp->name);
        if (!parent)
                return;

        while (*attr) {
                kernfs_remove_by_name(parent, (*attr)->name);
                ++attr;
        }
        kernfs_put(parent);
}
EXPORT_SYMBOL_GPL(sysfs_unmerge_group);

/**
 * sysfs_add_link_to_group - add a symlink to an attribute group.
 * @kobj:        The kobject containing the group.
 * @group_name:        The name of the group.
 * @target:        The target kobject of the symlink to create.
 * @link_name:        The name of the symlink to create.
 *
 * Returns -ENOENT if the group directory cannot be found, otherwise the
 * result of sysfs_create_link_sd().
 */
int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
                            struct kobject *target, const char *link_name)
{
        struct kernfs_node *group_kn = kernfs_find_and_get(kobj->sd, group_name);
        int ret;

        if (!group_kn)
                return -ENOENT;

        ret = sysfs_create_link_sd(group_kn, target, link_name);
        kernfs_put(group_kn);

        return ret;
}
EXPORT_SYMBOL_GPL(sysfs_add_link_to_group);

/**
 * sysfs_remove_link_from_group - remove a symlink from an attribute group.
 * @kobj:        The kobject containing the group.
 * @group_name:        The name of the group.
 * @link_name:        The name of the symlink to remove.
 *
 * Does nothing if the group directory cannot be found.
 */
void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
                                  const char *link_name)
{
        struct kernfs_node *group_kn = kernfs_find_and_get(kobj->sd, group_name);

        if (!group_kn)
                return;

        kernfs_remove_by_name(group_kn, link_name);
        kernfs_put(group_kn);
}
EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);

/**
 * compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing
 * to a group or an attribute
 * @kobj:                The kobject containing the group.
 * @target_kobj:        The target kobject.
 * @target_name:        The name of the target group or attribute.
 * @symlink_name:        The name of the symlink file (target_name will be
 *                        considered if symlink_name is NULL).
 *
 * Returns 0 on success, -ENOENT if @target_kobj has no sysfs directory or
 * @target_name is not found in it, or the error from kernfs_create_link().
 */
int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
                                         struct kobject *target_kobj,
                                         const char *target_name,
                                         const char *symlink_name)
{
        struct kernfs_node *target;
        struct kernfs_node *entry;
        struct kernfs_node *link;
        int ret;

        /*
         * We don't own @target_kobj and it may be removed at any time.
         * Synchronize using sysfs_symlink_target_lock. See sysfs_remove_dir()
         * for details.
         */
        spin_lock(&sysfs_symlink_target_lock);
        target = target_kobj->sd;
        if (target)
                kernfs_get(target);
        spin_unlock(&sysfs_symlink_target_lock);
        if (!target)
                return -ENOENT;

        entry = kernfs_find_and_get(target, target_name);
        if (!entry) {
                ret = -ENOENT;
                goto out_put_target;
        }

        /* Default the link name to the name of the entry it points to. */
        if (!symlink_name)
                symlink_name = target_name;

        link = kernfs_create_link(kobj->sd, symlink_name, entry);
        if (PTR_ERR(link) == -EEXIST)
                sysfs_warn_dup(kobj->sd, symlink_name);
        ret = PTR_ERR_OR_ZERO(link);

        kernfs_put(entry);
out_put_target:
        kernfs_put(target);
        return ret;
}
EXPORT_SYMBOL_GPL(compat_only_sysfs_link_entry_to_kobj);

/*
 * Look up the child called @name below @parent and apply @newattrs to it.
 * Returns -ENOENT when there is no such child, otherwise whatever
 * kernfs_setattr() returned. The lookup reference is always dropped.
 */
static int sysfs_group_chown_one(struct kernfs_node *parent, const char *name,
                                 struct iattr *newattrs)
{
        struct kernfs_node *child = kernfs_find_and_get(parent, name);
        int ret;

        if (!child)
                return -ENOENT;

        ret = kernfs_setattr(child, newattrs);
        kernfs_put(child);
        return ret;
}

/*
 * Apply @newattrs to the kernfs node of every attribute, and then of every
 * binary attribute, that @grp lists below @grp_kn. Processing stops at the
 * first failure and that error code is returned; 0 means every listed entry
 * was updated.
 */
static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn,
                                          const struct attribute_group *grp,
                                          struct iattr *newattrs)
{
        int ret;
        int i;

        /* Both arrays are optional and NULL-terminated. */
        for (i = 0; grp->attrs && grp->attrs[i]; i++) {
                ret = sysfs_group_chown_one(grp_kn, grp->attrs[i]->name,
                                            newattrs);
                if (ret)
                        return ret;
        }

        for (i = 0; grp->bin_attrs && grp->bin_attrs[i]; i++) {
                ret = sysfs_group_chown_one(grp_kn,
                                            grp->bin_attrs[i]->attr.name,
                                            newattrs);
                if (ret)
                        return ret;
        }

        return 0;
}

/**
 * sysfs_group_change_owner - hand an attribute group over to a new owner.
 * @kobj:        Kobject the group is attached to.
 * @grp:        Attribute group whose node and entries are re-owned.
 * @kuid:        uid of the new owner
 * @kgid:        gid of the new owner
 *
 * The ownership of the group's own kernfs node is changed first, followed by
 * every attribute and binary attribute that @grp lists. A group without a
 * name lives directly in the directory of @kobj, so that directory node is
 * the one being updated.
 *
 * Return: 0 on success, -EINVAL when @kobj is not in sysfs, -ENOENT when the
 * group node cannot be found, or the first error hit while updating.
 */
int sysfs_group_change_owner(struct kobject *kobj,
                             const struct attribute_group *grp, kuid_t kuid,
                             kgid_t kgid)
{
        struct iattr newattrs = {
                .ia_valid = ATTR_UID | ATTR_GID,
                .ia_uid = kuid,
                .ia_gid = kgid,
        };
        struct kernfs_node *grp_kn;
        int error;

        if (!kobj->state_in_sysfs)
                return -EINVAL;

        /* Either way we end up holding a reference on the node we use. */
        if (!grp->name) {
                grp_kn = kobj->sd;
                kernfs_get(grp_kn);
        } else {
                grp_kn = kernfs_find_and_get(kobj->sd, grp->name);
        }

        if (!grp_kn)
                return -ENOENT;

        error = kernfs_setattr(grp_kn, &newattrs);
        if (error)
                goto out_put;

        error = sysfs_group_attrs_change_owner(grp_kn, grp, &newattrs);

out_put:
        kernfs_put(grp_kn);
        return error;
}
EXPORT_SYMBOL_GPL(sysfs_group_change_owner);

/**
 * sysfs_groups_change_owner - hand a set of attribute groups to a new owner.
 * @kobj:        Kobject the groups are attached to.
 * @groups:        NULL-terminated array of attribute groups; may itself be NULL.
 * @kuid:        uid of the new owner
 * @kgid:        gid of the new owner
 *
 * Each group is passed to sysfs_group_change_owner() in array order; the
 * walk ends at the first group that fails.
 *
 * Return: 0 on success (including when @groups is NULL), -EINVAL when @kobj
 * is not in sysfs, or the error returned for the first failing group.
 */
int sysfs_groups_change_owner(struct kobject *kobj,
                              const struct attribute_group **groups,
                              kuid_t kuid, kgid_t kgid)
{
        const struct attribute_group **cursor;
        int error;

        if (!kobj->state_in_sysfs)
                return -EINVAL;

        if (!groups)
                return 0;

        for (cursor = groups; *cursor; cursor++) {
                error = sysfs_group_change_owner(kobj, *cursor, kuid, kgid);
                if (error)
                        return error;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(sysfs_groups_change_owner);





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 













































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
/*
 * Performance events:
 *
 *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
 *    Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar
 *    Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra
 *
 * Data type definitions, declarations, prototypes.
 *
 *    Started by: Thomas Gleixner and Ingo Molnar
 *
 * For licencing details see kernel-base/COPYING
 */
#ifndef _LINUX_PERF_EVENT_H
#define _LINUX_PERF_EVENT_H

#include <uapi/linux/perf_event.h>
#include <uapi/linux/bpf_perf_event.h>

/*
 * Kernel-internal data types and definitions:
 */

#ifdef CONFIG_PERF_EVENTS
# include <asm/perf_event.h>
# include <asm/local64.h>
#endif

/*
 * Guest state bits; each occupies its own bit so they can be combined.
 * NOTE(review): presumably the value reported by
 * perf_guest_info_callbacks::state() -- confirm against the callers.
 */
#define PERF_GUEST_ACTIVE        0x01
#define PERF_GUEST_USER        0x02

/*
 * Table of three argument-less callbacks for querying guest information.
 * NOTE(review): going by the names, ->state() reports the PERF_GUEST_* bits,
 * ->get_ip() the guest instruction pointer and ->handle_intel_pt_intr()
 * services an Intel PT interrupt -- confirm against the implementers.
 */
struct perf_guest_info_callbacks {
        unsigned int                        (*state)(void);
        unsigned long                        (*get_ip)(void);
        unsigned int                        (*handle_intel_pt_intr)(void);
};

#ifdef CONFIG_HAVE_HW_BREAKPOINT
#include <linux/rhashtable-types.h>
#include <asm/hw_breakpoint.h>
#endif

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/hrtimer.h>
#include <linux/fs.h>
#include <linux/pid_namespace.h>
#include <linux/workqueue.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/irq_work.h>
#include <linux/static_key.h>
#include <linux/jump_label_ratelimit.h>
#include <linux/atomic.h>
#include <linux/sysfs.h>
#include <linux/perf_regs.h>
#include <linux/cgroup.h>
#include <linux/refcount.h>
#include <linux/security.h>
#include <linux/static_call.h>
#include <linux/lockdep.h>
#include <asm/local.h>

/*
 * A callchain record: a 64-bit count followed by a flexible array of 64-bit
 * entries, so the storage for ip[] must be allocated together with the
 * structure.
 * NOTE(review): presumably @nr is the number of valid slots in ip[] and the
 * sysctl named below bounds the array length -- confirm.
 */
struct perf_callchain_entry {
        __u64                                nr;
        __u64                                ip[]; /* /proc/sys/kernel/perf_event_max_stack */
};

/*
 * Bookkeeping that accompanies a struct perf_callchain_entry while it is
 * being filled in.
 * NOTE(review): field meanings below are inferred from the names only --
 * @max_stack looks like the limit for @nr, and @contexts/@contexts_maxed
 * look like a counter with its "limit reached" flag; confirm against the
 * code that stores into the entry.
 */
struct perf_callchain_entry_ctx {
        struct perf_callchain_entry *entry;
        u32                            max_stack;
        u32                            nr;
        short                            contexts;
        bool                            contexts_maxed;
};

/*
 * Copy callback type used by struct perf_raw_frag::copy.
 * NOTE(review): presumably copies @len bytes from @src, starting at offset
 * @off, into @dst; the meaning of the return value is not visible here --
 * confirm against an implementation.
 */
typedef unsigned long (*perf_copy_f)(void *dst, const void *src,
                                     unsigned long off, unsigned long len);

/*
 * One fragment of raw sample data. The structure is __packed, so no padding
 * is inserted between or after the members.
 *
 * The leading union lets the link to the next fragment also be read as a
 * plain integer through @pad; perf_raw_frag_last() relies on this and treats
 * a @pad value below sizeof(u64) as "no further fragment".
 */
struct perf_raw_frag {
        union {
                struct perf_raw_frag        *next;
                unsigned long                pad;
        };
        perf_copy_f                        copy;
        void                                *data;
        u32                                size;
} __packed;

/*
 * A raw record: the first fragment is embedded directly, followed by a size.
 * NOTE(review): @size is presumably the total over the whole fragment chain
 * rather than of @frag alone -- confirm against the producers.
 */
struct perf_raw_record {
        struct perf_raw_frag                frag;
        u32                                size;
};

/*
 * Report whether @frag ends its fragment chain: the union slot shared with
 * ->next is read as an integer, and any value below sizeof(u64) counts as
 * "last".
 */
static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag)
{
        const unsigned long link = frag->pad;

        return !(link >= sizeof(u64));
}

/*
 * branch stack layout:
 *  nr: number of taken branches stored in entries[]
 *  hw_idx: The low level index of raw branch records
 *          for the most recent branch.
 *          -1ULL means invalid/unknown.
 *
 * Note that nr can vary from sample to sample.
 * Branches (to, from) are stored from most recent
 * to least recent, i.e., entries[0] contains the most
 * recent branch.
 *
 * entries[] is an abstraction of the raw branch records,
 * which may not be stored in age order in HW, e.g. Intel LBR.
 * hw_idx exposes the low level index of the raw branch
 * record for the most recent branch, aka entries[0].
 * hw_idx lies between -1 (unknown) and the max depth,
 * which can be retrieved from /sys/devices/cpu/caps/branches.
 * For architectures whose raw branch records are already
 * stored in age order, hw_idx should be 0.
 */
struct perf_branch_stack {
        __u64                                nr;
        __u64                                hw_idx;
        struct perf_branch_entry        entries[];
};

struct task_struct;

/*
 * Extra PMU register associated with an event.
 *
 * struct hw_perf_event embeds two of these in its hardware view, as
 * extra_reg and branch_reg.
 */
struct hw_perf_event_extra {
        u64                config;        /* register value */
        unsigned int        reg;        /* register address or index */
        int                alloc;        /* extra register already allocated */
        int                idx;        /* index in shared_regs->regs[] */
};

/**
 * hw_perf_event::flag values
 *
 * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific
 * usage.
 *
 * PERF_EVENT_FLAG_ARCH is a mask covering the low 20 bits, while
 * PERF_EVENT_FLAG_USER_READ_CNT is the single bit 31.
 */
#define PERF_EVENT_FLAG_ARCH                        0x000fffff
#define PERF_EVENT_FLAG_USER_READ_CNT                0x80000000

/* Build-time check that the generic flag does not land in the arch mask. */
static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0);

/**
 * struct hw_perf_event - performance event hardware details:
 *
 * Every member is compiled in only when CONFIG_PERF_EVENTS is set; without
 * it the structure has no members.
 *
 * The leading anonymous union overlays one private view per kind of event
 * source (hardware, software, tracepoint, amd_power, breakpoint, amd_iommu),
 * so these views share storage. The members after the union are common to
 * all of them.
 */
struct hw_perf_event {
#ifdef CONFIG_PERF_EVENTS
        union {
                struct { /* hardware */
                        u64                config;
                        u64                last_tag;
                        unsigned long        config_base;
                        unsigned long        event_base;
                        int                event_base_rdpmc;
                        int                idx;
                        int                last_cpu;
                        int                flags;

                        struct hw_perf_event_extra extra_reg;
                        struct hw_perf_event_extra branch_reg;
                };
                struct { /* software */
                        struct hrtimer        hrtimer;
                };
                struct { /* tracepoint */
                        /* for tp_event->class */
                        struct list_head        tp_list;
                };
                struct { /* amd_power */
                        u64        pwr_acc;
                        u64        ptsc;
                };
#ifdef CONFIG_HAVE_HW_BREAKPOINT
                struct { /* breakpoint */
                        /*
                         * Crufty hack to avoid the chicken and egg
                         * problem hw_breakpoint has with context
                         * creation and event initialization.
                         */
                        struct arch_hw_breakpoint        info;
                        struct rhlist_head                bp_list;
                };
#endif
                struct { /* amd_iommu */
                        u8        iommu_bank;
                        u8        iommu_cntr;
                        u16        padding;
                        u64        conf;
                        u64        conf1;
                };
        };
        /*
         * If the event is a per task event, this will point to the task in
         * question. See the comment in perf_event_alloc().
         */
        struct task_struct                *target;

        /*
         * PMU would store hardware filter configuration
         * here.
         */
        void                                *addr_filters;

        /* Last sync'ed generation of filters */
        unsigned long                        addr_filters_gen;

/*
 * hw_perf_event::state flags; used to track the PERF_EF_* state.
 */
#define PERF_HES_STOPPED        0x01 /* the counter is stopped */
#define PERF_HES_UPTODATE        0x02 /* event->count up-to-date */
#define PERF_HES_ARCH                0x04

        int                                state;

        /*
         * The last observed hardware counter value, updated with a
         * local64_cmpxchg() such that pmu::read() can be called nested.
         */
        local64_t                        prev_count;

        /*
         * The period to start the next sample with.
         */
        u64                                sample_period;

        /*
         * The sampling view and the topdown view below share storage.
         */
        union {
                struct { /* Sampling */
                        /*
                         * The period we started this sample with.
                         */
                        u64                                last_period;

                        /*
                         * However much is left of the current period;
                         * note that this is a full 64bit value and
                         * allows for generation of periods longer
                         * than hardware might allow.
                         */
                        local64_t                        period_left;
                };
                struct { /* Topdown events counting for context switch */
                        u64                                saved_metric;
                        u64                                saved_slots;
                };
        };

        /*
         * State for throttling the event, see __perf_event_overflow() and
         * perf_adjust_freq_unthr_context().
         */
        u64                             interrupts_seq;
        u64                                interrupts;

        /*
         * State for freq target events, see __perf_event_overflow() and
         * perf_adjust_freq_unthr_context().
         */
        u64                                freq_time_stamp;
        u64                                freq_count_stamp;
#endif
};

struct perf_event;
struct perf_event_pmu_context;

/*
 * Common implementation detail of pmu::{start,commit,cancel}_txn
 *
 * NOTE(review): presumably the values carried by the txn_flags argument of
 * pmu::start_txn() -- confirm against the callers.
 */
#define PERF_PMU_TXN_ADD  0x1                /* txn to add/schedule event on PMU */
#define PERF_PMU_TXN_READ 0x2                /* txn to read event group from PMU */

/**
 * pmu::capabilities flags
 *
 * Each flag occupies a bit of its own (0x0001 through 0x0100), so any
 * combination can be OR-ed together into the int-sized capabilities field.
 */
#define PERF_PMU_CAP_NO_INTERRUPT                0x0001
#define PERF_PMU_CAP_NO_NMI                        0x0002
#define PERF_PMU_CAP_AUX_NO_SG                        0x0004
#define PERF_PMU_CAP_EXTENDED_REGS                0x0008
#define PERF_PMU_CAP_EXCLUSIVE                        0x0010
#define PERF_PMU_CAP_ITRACE                        0x0020
#define PERF_PMU_CAP_NO_EXCLUDE                        0x0040
#define PERF_PMU_CAP_AUX_OUTPUT                        0x0080
#define PERF_PMU_CAP_EXTENDED_HW_TYPE                0x0100

struct perf_output_handle;

/*
 * An all-ones pointer constant.
 * NOTE(review): looks like a "no device" sentinel for a PMU's device
 * pointer that must never be dereferenced -- confirm against its users.
 */
#define PMU_NULL_DEV        ((void *)(~0UL))

/**
 * struct pmu - generic performance monitoring unit
 *
 * Holds a PMU's identification (name, type), its capability flags and
 * bookkeeping fields, and the table of callbacks through which events are
 * initialized, added, started, stopped, read and removed. Callbacks that an
 * implementation may leave out are tagged "optional" in the comments below.
 */
struct pmu {
        struct list_head                entry;

        struct module                        *module;
        struct device                        *dev;
        struct device                        *parent;
        const struct attribute_group        **attr_groups;
        const struct attribute_group        **attr_update;
        const char                        *name;
        int                                type;

        /*
         * various common per-pmu feature flags
         */
        int                                capabilities;

        int __percpu                        *pmu_disable_count;
        struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
        atomic_t                        exclusive_cnt; /* < 0: cpu; > 0: tsk */
        int                                task_ctx_nr;
        int                                hrtimer_interval_ms;

        /* number of address filters this PMU can do */
        unsigned int                        nr_addr_filters;

        /*
         * Fully disable/enable this PMU, can be used to protect from the PMI
         * as well as for lazy/batch writing of the MSRs.
         */
        void (*pmu_enable)                (struct pmu *pmu); /* optional */
        void (*pmu_disable)                (struct pmu *pmu); /* optional */

        /*
         * Try and initialize the event for this PMU.
         *
         * Returns:
         *  -ENOENT        -- @event is not for this PMU
         *
         *  -ENODEV        -- @event is for this PMU but PMU not present
         *  -EBUSY        -- @event is for this PMU but PMU temporarily unavailable
         *  -EINVAL        -- @event is for this PMU but @event is not valid
         *  -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported
         *  -EACCES        -- @event is for this PMU, @event is valid, but no privileges
         *
         *  0                -- @event is for this PMU and valid
         *
         * Other error return values are allowed.
         */
        int (*event_init)                (struct perf_event *event);

        /*
         * Notification that the event was mapped or unmapped.  Called
         * in the context of the mapping task.
         */
        void (*event_mapped)                (struct perf_event *event, struct mm_struct *mm); /* optional */
        void (*event_unmapped)                (struct perf_event *event, struct mm_struct *mm); /* optional */

        /*
         * Flags for ->add()/->del()/ ->start()/->stop(). There are
         * matching hw_perf_event::state flags.
         */
#define PERF_EF_START        0x01                /* start the counter when adding    */
#define PERF_EF_RELOAD        0x02                /* reload the counter when starting */
#define PERF_EF_UPDATE        0x04                /* update the counter when stopping */

        /*
         * Adds/Removes a counter to/from the PMU, can be done inside a
         * transaction, see the ->*_txn() methods.
         *
         * The add/del callbacks will reserve all hardware resources required
         * to service the event, this includes any counter constraint
         * scheduling etc.
         *
         * Called with IRQs disabled and the PMU disabled on the CPU the event
         * is on.
         *
         * ->add() called without PERF_EF_START should result in the same state
         *  as ->add() followed by ->stop().
         *
         * ->del() must always PERF_EF_UPDATE stop an event. If it calls
         *  ->stop() that must deal with already being stopped without
         *  PERF_EF_UPDATE.
         */
        int  (*add)                        (struct perf_event *event, int flags);
        void (*del)                        (struct perf_event *event, int flags);

        /*
         * Starts/Stops a counter present on the PMU.
         *
         * The PMI handler should stop the counter when perf_event_overflow()
         * returns !0. ->start() will be used to continue.
         *
         * Also used to change the sample period.
         *
         * Called with IRQs disabled and the PMU disabled on the CPU the event
         * is on -- will be called from NMI context when the PMU generates
         * NMIs.
         *
         * ->stop() with PERF_EF_UPDATE will read the counter and update
         *  period/count values like ->read() would.
         *
         * ->start() with PERF_EF_RELOAD will reprogram the counter
         *  value, must be preceded by a ->stop() with PERF_EF_UPDATE.
         */
        void (*start)                        (struct perf_event *event, int flags);
        void (*stop)                        (struct perf_event *event, int flags);

        /*
         * Updates the counter value of the event.
         *
         * For sampling capable PMUs this will also update the software period
         * hw_perf_event::period_left field.
         */
        void (*read)                        (struct perf_event *event);

        /*
         * Group events scheduling is treated as a transaction, add
         * group events as a whole and perform one schedulability test.
         * If the test fails, roll back the whole group
         *
         * Start the transaction, after this ->add() doesn't need to
         * do schedulability tests.
         *
         * Optional.
         */
        void (*start_txn)                (struct pmu *pmu, unsigned int txn_flags);
        /*
         * If ->start_txn() disabled the ->add() schedulability test
         * then ->commit_txn() is required to perform one. On success
         * the transaction is closed. On error the transaction is kept
         * open until ->cancel_txn() is called.
         *
         * Optional.
         */
        int  (*commit_txn)                (struct pmu *pmu);
        /*
         * Will cancel the transaction, assumes ->del() is called
         * for each successful ->add() during the transaction.
         *
         * Optional.
         */
        void (*cancel_txn)                (struct pmu *pmu);

        /*
         * Will return the value for perf_event_mmap_page::index for this event,
         * if no implementation is provided it will default to 0 (see
         * perf_event_idx_default).
         */
        int (*event_idx)                (struct perf_event *event); /* optional */

        /*
         * context-switches callback
         */
        void (*sched_task)                (struct perf_event_pmu_context *pmu_ctx,
                                        bool sched_in);

        /*
         * Kmem cache of PMU specific data
         */
        struct kmem_cache                *task_ctx_cache;

        /*
         * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data)
         * can be synchronized using this function. See Intel LBR callstack support
         * implementation and Perf core context switch handling callbacks for usage
         * examples.
         */
        void (*swap_task_ctx)                (struct perf_event_pmu_context *prev_epc,
                                         struct perf_event_pmu_context *next_epc);
                                        /* optional */

        /*
         * Set up pmu-private data structures for an AUX area
         */
        void *(*setup_aux)                (struct perf_event *event, void **pages,
                                         int nr_pages, bool overwrite);
                                        /* optional */

        /*
         * Free pmu-private AUX data structures
         */
        void (*free_aux)                (void *aux); /* optional */

        /*
         * Take a snapshot of the AUX buffer without touching the event
         * state, so that preempting ->start()/->stop() callbacks does
         * not interfere with their logic. Called in PMI context.
         *
         * Returns the size of AUX data copied to the output handle.
         *
         * Optional.
         */
        long (*snapshot_aux)                (struct perf_event *event,
                                         struct perf_output_handle *handle,
                                         unsigned long size);

        /*
         * Validate address range filters: make sure the HW supports the
         * requested configuration and number of filters; return 0 if the
         * supplied filters are valid, -errno otherwise.
         *
         * Runs in the context of the ioctl()ing process and is not serialized
         * with the rest of the PMU callbacks.
         */
        int (*addr_filters_validate)        (struct list_head *filters);
                                        /* optional */

        /*
         * Synchronize address range filter configuration:
         * translate hw-agnostic filters into hardware configuration in
         * event::hw::addr_filters.
         *
         * Runs as a part of filter sync sequence that is done in ->start()
         * callback by calling perf_event_addr_filters_sync().
         *
         * May (and should) traverse event::addr_filters::list, for which its
         * caller provides necessary serialization.
         */
        void (*addr_filters_sync)        (struct perf_event *event);
                                        /* optional */

        /*
         * Check if event can be used for aux_output purposes for
         * events of this PMU.
         *
         * Runs from perf_event_open(). Should return 0 for "no match"
         * or non-zero for "match".
         */
        int (*aux_output_match)                (struct perf_event *event);
                                        /* optional */

        /*
         * Skip programming this PMU on the given CPU. Typically needed for
         * big.LITTLE things.
         */
        bool (*filter)                        (struct pmu *pmu, int cpu); /* optional */

        /*
         * Check period value for PERF_EVENT_IOC_PERIOD ioctl.
         */
        int (*check_period)                (struct perf_event *event, u64 value); /* optional */
};

/*
 * Action attached to an address range filter; this is the type of
 * perf_addr_filter::action (documented there as filter/start/stop).
 * STOP is explicitly 0, the others follow in declaration order.
 */
enum perf_addr_filter_action_t {
        PERF_ADDR_FILTER_ACTION_STOP = 0,
        PERF_ADDR_FILTER_ACTION_START,
        PERF_ADDR_FILTER_ACTION_FILTER,
};

/**
 * struct perf_addr_filter - address range filter definition
 * @entry:        event's filter list linkage
 * @path:        object file's path for file-based filters
 * @offset:        filter range offset
 * @size:        filter range size (size==0 means single address trigger)
 * @action:        one of enum perf_addr_filter_action_t (filter/start/stop)
 *
 * This is a hardware-agnostic filter configuration as specified by the user.
 * Translating it into a hardware configuration is the job of the PMU's
 * ->addr_filters_sync() callback.
 */
struct perf_addr_filter {
        struct list_head        entry;
        struct path                path;
        unsigned long                offset;
        unsigned long                size;
        enum perf_addr_filter_action_t        action;
};

/**
 * struct perf_addr_filters_head - container for address range filters
 * @list:        list of filters for this event
 * @lock:        raw spinlock that serializes accesses to the @list and event's
 *                (and its children's) filter generations.
 * @nr_file_filters:        number of file-based filters
 *
 * A child event will use parent's @list (and therefore @lock), so they are
 * bundled together; see perf_event_addr_filters().
 *
 * An instance is embedded in struct perf_event as perf_event::addr_filters.
 */
struct perf_addr_filters_head {
        struct list_head        list;
        raw_spinlock_t                lock;
        unsigned int                nr_file_filters;
};

/*
 * An address range described by a start and a size.
 * perf_event::addr_filter_ranges points to an array of these (described there
 * as the vma address array for file-based filters).
 */
struct perf_addr_filter_range {
        unsigned long                start;
        unsigned long                size;
};

/**
 * enum perf_event_state - the states of an event:
 *
 * The value is kept in perf_event::state. DEAD, EXIT, ERROR and OFF are
 * negative, INACTIVE is zero and ACTIVE is positive.
 *
 * NOTE(review): the numeric ordering looks deliberate (callers may compare
 * states with relational operators) - confirm before renumbering.
 */
enum perf_event_state {
        PERF_EVENT_STATE_DEAD                = -4,
        PERF_EVENT_STATE_EXIT                = -3,
        PERF_EVENT_STATE_ERROR                = -2,
        PERF_EVENT_STATE_OFF                = -1,
        PERF_EVENT_STATE_INACTIVE        =  0,
        PERF_EVENT_STATE_ACTIVE                =  1,
};

struct file;
struct perf_sample_data;

/*
 * Signature of an overflow handler, as stored in perf_event::overflow_handler.
 * It receives the event, the sample data and a pt_regs pointer;
 * perf_event_output_forward() and perf_event_output_backward() have this
 * signature.
 */
typedef void (*perf_overflow_handler_t)(struct perf_event *,
                                        struct perf_sample_data *,
                                        struct pt_regs *regs);

/*
 * Event capabilities. For perf_event::event_caps and perf_event::group_caps.
 *
 * PERF_EV_CAP_SOFTWARE: Is a software event.
 * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read
 * from any CPU in the package where it is active.
 * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and
 * cannot be a group leader. If an event with this flag is detached from the
 * group it is scheduled out and moved into an unrecoverable ERROR state.
 */
#define PERF_EV_CAP_SOFTWARE                BIT(0)
#define PERF_EV_CAP_READ_ACTIVE_PKG        BIT(1)
#define PERF_EV_CAP_SIBLING                BIT(2)

/*
 * Hash table with SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS, i.e. 256)
 * hlist heads.
 *
 * NOTE(review): presumably software events are hashed into @heads through
 * perf_event::hlist_entry and the table is released via @rcu_head after an
 * RCU grace period - confirm against the users of this structure.
 */
#define SWEVENT_HLIST_BITS                8
#define SWEVENT_HLIST_SIZE                (1 << SWEVENT_HLIST_BITS)

struct swevent_hlist {
        struct hlist_head                heads[SWEVENT_HLIST_SIZE];
        struct rcu_head                        rcu_head;
};

/*
 * Attach flags; every value is a distinct single bit so they can be OR-ed.
 * NOTE(review): presumably stored in perf_event::attach_state - confirm.
 */
#define PERF_ATTACH_CONTEXT        0x01
#define PERF_ATTACH_GROUP        0x02
#define PERF_ATTACH_TASK        0x04
#define PERF_ATTACH_TASK_DATA        0x08
#define PERF_ATTACH_ITRACE        0x10
#define PERF_ATTACH_SCHED_CB        0x20
#define PERF_ATTACH_CHILD        0x40

struct bpf_prog;
struct perf_cgroup;
struct perf_buffer;

/*
 * A list head bundled with a raw spinlock.
 * NOTE(review): presumably @lock serializes access to @list - confirm with
 * the code that uses this structure.
 */
struct pmu_event_list {
        raw_spinlock_t                lock;
        struct list_head        list;
};

/*
 * event->sibling_list is modified while holding both ctx->lock and ctx->mutex
 * as such iteration must hold either lock. However, since ctx->lock is an IRQ
 * safe lock, and is only held by the CPU doing the modification, having IRQs
 * disabled is sufficient since it will hold-off the IPIs.
 *
 * With CONFIG_PROVE_LOCKING the macro warns once when lockdep is enabled,
 * hard IRQs are enabled on this CPU and event->ctx->mutex is not held.
 * Without CONFIG_PROVE_LOCKING it expands to nothing.
 */
#ifdef CONFIG_PROVE_LOCKING
#define lockdep_assert_event_ctx(event)                                \
        WARN_ON_ONCE(__lockdep_enabled &&                        \
                     (this_cpu_read(hardirqs_enabled) &&        \
                      lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD))
#else
#define lockdep_assert_event_ctx(event)
#endif

/*
 * Iterate @sibling over the sibling_list of @event, but only when @event is
 * its own group leader; for a non-leader the loop body never runs.
 *
 * Caution: this expands to a statement (the lockdep assertion) followed by an
 * unbraced `if`, not to a single statement. Do not use it as the sole body of
 * an unbraced if/else/loop, and note that an `else` written directly after
 * the loop body would bind to the hidden `if`. @event is evaluated more than
 * once.
 */
#define for_each_sibling_event(sibling, event)                        \
        lockdep_assert_event_ctx(event);                        \
        if ((event)->group_leader == (event))                        \
                list_for_each_entry((sibling), &(event)->sibling_list, sibling_list)

/**
 * struct perf_event - performance event kernel representation:
 *
 * All members are compiled in only when CONFIG_PERF_EVENTS is set; without
 * it the structure has no members.
 */
struct perf_event {
#ifdef CONFIG_PERF_EVENTS
        /*
         * entry onto perf_event_context::event_list;
         *   modifications require ctx->lock
         *   RCU safe iterations.
         */
        struct list_head                event_entry;

        /*
         * Locked for modification by both ctx->mutex and ctx->lock; holding
         * either suffices for read.
         */
        struct list_head                sibling_list;
        struct list_head                active_list;
        /*
         * Node on the pinned or flexible tree located at the event context;
         */
        struct rb_node                        group_node;
        u64                                group_index;
        /*
         * We need storage to track the entries in perf_pmu_migrate_context; we
         * cannot use the event_entry because of RCU and we want to keep the
         * group intact which avoids us using the other two entries.
         */
        struct list_head                migrate_entry;

        struct hlist_node                hlist_entry;
        struct list_head                active_entry;
        int                                nr_siblings;

        /* Not serialized. Only written during event initialization. */
        int                                event_caps;
        /* The cumulative AND of all event_caps for events in this group. */
        int                                group_caps;

        unsigned int                        group_generation;
        struct perf_event                *group_leader;
        /*
         * event->pmu will always point to pmu in which this event belongs.
         * Whereas event->pmu_ctx->pmu may point to other pmu when group of
         * different pmu events is created.
         */
        struct pmu                        *pmu;
        void                                *pmu_private;

        enum perf_event_state                state;
        unsigned int                        attach_state;
        local64_t                        count;
        atomic64_t                        child_count;

        /*
         * These are the total time in nanoseconds that the event
         * has been enabled (i.e. eligible to run, and the task has
         * been scheduled in, if this is a per-task event)
         * and running (scheduled onto the CPU), respectively.
         */
        u64                                total_time_enabled;
        u64                                total_time_running;
        u64                                tstamp;

        struct perf_event_attr                attr;
        u16                                header_size;
        u16                                id_header_size;
        u16                                read_size;
        struct hw_perf_event                hw;

        struct perf_event_context        *ctx;
        /*
         * event->pmu_ctx points to perf_event_pmu_context in which the event
         * is added. This pmu_ctx can be of other pmu for sw event when that
         * sw event is part of a group which also contains non-sw events.
         */
        struct perf_event_pmu_context        *pmu_ctx;
        atomic_long_t                        refcount;

        /*
         * These accumulate total time (in nanoseconds) that children
         * events have been enabled and running, respectively.
         */
        atomic64_t                        child_total_time_enabled;
        atomic64_t                        child_total_time_running;

        /*
         * Protect attach/detach and child_list:
         */
        struct mutex                        child_mutex;
        struct list_head                child_list;
        struct perf_event                *parent;

        int                                oncpu;
        int                                cpu;

        struct list_head                owner_entry;
        struct task_struct                *owner;

        /* mmap bits */
        struct mutex                        mmap_mutex;
        atomic_t                        mmap_count;

        struct perf_buffer                *rb;
        struct list_head                rb_entry;
        unsigned long                        rcu_batches;
        int                                rcu_pending;

        /* poll related */
        wait_queue_head_t                waitq;
        struct fasync_struct                *fasync;

        /* delayed work for NMIs and such */
        unsigned int                        pending_wakeup;
        unsigned int                        pending_kill;
        unsigned int                        pending_disable;
        unsigned int                        pending_sigtrap;
        unsigned long                        pending_addr;        /* SIGTRAP */
        struct irq_work                        pending_irq;
        struct callback_head                pending_task;
        unsigned int                        pending_work;

        atomic_t                        event_limit;

        /* address range filters */
        struct perf_addr_filters_head        addr_filters;
        /* vma address array for file-based filters */
        struct perf_addr_filter_range        *addr_filter_ranges;
        unsigned long                        addr_filters_gen;

        /* for aux_output events */
        struct perf_event                *aux_event;

        void (*destroy)(struct perf_event *);
        struct rcu_head                        rcu_head;

        struct pid_namespace                *ns;
        u64                                id;

        atomic64_t                        lost_samples;

        u64                                (*clock)(void);
        perf_overflow_handler_t                overflow_handler;
        void                                *overflow_handler_context;
        struct bpf_prog                        *prog;
        u64                                bpf_cookie;

#ifdef CONFIG_EVENT_TRACING
        struct trace_event_call                *tp_event;
        struct event_filter                *filter;
#ifdef CONFIG_FUNCTION_TRACER
        struct ftrace_ops               ftrace_ops;
#endif
#endif

#ifdef CONFIG_CGROUP_PERF
        struct perf_cgroup                *cgrp; /* cgroup event is attached to */
#endif

#ifdef CONFIG_SECURITY
        void *security;
#endif
        struct list_head                sb_list;

        /*
         * Certain events get forwarded to another pmu internally by over-
         * writing kernel copy of event->attr.type without user being aware
         * of it. event->orig_type contains original 'type' requested by
         * user.
         */
        __u32                                orig_type;
#endif /* CONFIG_PERF_EVENTS */
};

/*
 *           ,-----------------------[1:n]------------------------.
 *           V                                                    V
 * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event
 *                                        |                       |
 *                                        `--[n:1]-> pmu <-[1:n]--'
 *
 *
 * struct perf_event_pmu_context  lifetime is refcount based and RCU freed
 * (similar to perf_event_context). Locking is as if it were a member of
 * perf_event_context; specifically:
 *
 *   modification, both: ctx->mutex && ctx->lock
 *   reading, either:    ctx->mutex || ctx->lock
 *
 * There is one exception to this; namely put_pmu_ctx() isn't always called
 * with ctx->mutex held; this means that as long as we can guarantee the epc
 * has events the above rules hold.
 *
 * Specifically, sys_perf_event_open()'s group_leader case depends on
 * ctx->mutex pinning the configuration. Since we hold a reference on
 * group_leader (through the filedesc) it can't go away, therefore its
 * associated pmu_ctx must exist and cannot change due to ctx->mutex.
 *
 * perf_event holds a refcount on perf_event_context
 * perf_event holds a refcount on perf_event_pmu_context
 */
struct perf_event_pmu_context {
        struct pmu                        *pmu;
        struct perf_event_context       *ctx;

        struct list_head                pmu_ctx_entry;

        struct list_head                pinned_active;
        struct list_head                flexible_active;

        /* Used to avoid freeing per-cpu perf_event_pmu_context */
        unsigned int                        embedded : 1;

        unsigned int                        nr_events;
        unsigned int                        nr_cgroups;
        unsigned int                        nr_freq;

        atomic_t                        refcount; /* event <-> epc */
        struct rcu_head                        rcu_head;

        void                                *task_ctx_data; /* pmu specific data */
        /*
         * Set when one or more (plausibly active) event can't be scheduled
         * due to pmu overcommit or pmu constraints, except tolerant to
         * events not necessary to be active due to scheduling constraints,
         * such as cgroups.
         */
        int                                rotate_necessary;
};

static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc)
{
        return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active);
}

/*
 * An rb-tree of event groups plus a 64-bit index. Used for
 * perf_event_context::pinned_groups and ::flexible_groups; events hang off
 * the tree through perf_event::group_node.
 * NOTE(review): @index is presumably the source of perf_event::group_index -
 * confirm.
 */
struct perf_event_groups {
        struct rb_root        tree;
        u64                index;
};


/**
 * struct perf_event_context - event context structure
 *
 * Used as a container for both task events and CPU events.
 */
struct perf_event_context {
        /*
         * Protect the states of the events in the list,
         * nr_active, and the list:
         */
        raw_spinlock_t                        lock;
        /*
         * Protect the list of events.  Locking either mutex or lock
         * is sufficient to ensure the list doesn't change; to change
         * the list you need to lock both the mutex and the spinlock.
         */
        struct mutex                        mutex;

        struct list_head                pmu_ctx_list;
        struct perf_event_groups        pinned_groups;
        struct perf_event_groups        flexible_groups;
        struct list_head                event_list;

        int                                nr_events;
        int                                nr_user;
        int                                is_active;

        int                                nr_task_data;
        int                                nr_stat;
        int                                nr_freq;
        int                                rotate_disable;

        refcount_t                        refcount; /* event <-> ctx */
        struct task_struct                *task;

        /*
         * Context clock, runs when context enabled.
         */
        u64                                time;
        u64                                timestamp;
        u64                                timeoffset;

        /*
         * These fields let us detect when two contexts have both
         * been cloned (inherited) from a common ancestor.
         */
        struct perf_event_context        *parent_ctx;
        u64                                parent_gen;
        u64                                generation;
        int                                pin_count;
#ifdef CONFIG_CGROUP_PERF
        int                                nr_cgroups;         /* cgroup evts */
#endif
        struct rcu_head                        rcu_head;

        /*
         * Sum (event->pending_sigtrap + event->pending_work)
         *
         * The SIGTRAP is targeted at ctx->task, as such it won't do changing
         * that until the signal is delivered.
         */
        local_t                                nr_pending;
};

/*
 * Number of distinct execution contexts in which an event can trigger:
 *        task, softirq, hardirq, nmi.
 */
#define PERF_NR_CONTEXTS        4

/*
 * Embeds one perf_event_pmu_context (@epc) and points at a second one
 * (@task_epc), followed by scheduler-callback bookkeeping and hrtimer state.
 *
 * NOTE(review): going by the name and by the "embedded" bit documented in
 * struct perf_event_pmu_context, this is presumably the per-CPU, per-PMU
 * context - confirm.
 */
struct perf_cpu_pmu_context {
        struct perf_event_pmu_context        epc;
        struct perf_event_pmu_context        *task_epc;

        struct list_head                sched_cb_entry;
        int                                sched_cb_usage;

        int                                active_oncpu;
        int                                exclusive;

        raw_spinlock_t                        hrtimer_lock;
        struct hrtimer                        hrtimer;
        ktime_t                                hrtimer_interval;
        unsigned int                        hrtimer_active;
};

/**
 * struct perf_cpu_context - per cpu event context structure
 *
 * Embeds a perf_event_context (@ctx) and points at a second one (@task_ctx).
 */
struct perf_cpu_context {
        struct perf_event_context        ctx;
        struct perf_event_context        *task_ctx;
        int                                online;

#ifdef CONFIG_CGROUP_PERF
        struct perf_cgroup                *cgrp;
#endif

        /*
         * Per-CPU storage for iterators used in visit_groups_merge. The default
         * storage is of size 2 to hold the CPU and any CPU event iterators.
         */
        int                                heap_size;
        struct perf_event                **heap;
        struct perf_event                *heap_default[2];
};

/*
 * State for one output operation. It is passed by pointer to the
 * perf_aux_output_*() helpers and to perf_output_sample() declared in this
 * header.
 *
 * @addr and @head share storage (anonymous union); only one of the two
 * interpretations is meaningful at a time.
 */
struct perf_output_handle {
        struct perf_event                *event;
        struct perf_buffer                *rb;
        unsigned long                        wakeup;
        unsigned long                        size;
        u64                                aux_flags;
        union {
                void                        *addr;
                unsigned long                head;
        };
        int                                page;
};

/*
 * Bundle of three pointers: user-style registers, sample data and the event.
 * NOTE(review): presumably the kernel-side context handed to BPF programs
 * attached to perf events - confirm.
 */
struct bpf_perf_event_data_kern {
        bpf_user_pt_regs_t *regs;
        struct perf_sample_data *data;
        struct perf_event *event;
};

#ifdef CONFIG_CGROUP_PERF

/*
 * perf_cgroup_info keeps track of time_enabled for a cgroup.
 * This is a per-cpu dynamically allocated data structure; it is reached
 * through the __percpu pointer perf_cgroup::info.
 */
struct perf_cgroup_info {
        u64                                time;
        u64                                timestamp;
        u64                                timeoffset;
        int                                active;
};

/*
 * perf's per-cgroup state. @css is embedded so that perf_cgroup_from_task()
 * can get back to the perf_cgroup with container_of(); @info is a per-cpu
 * pointer to struct perf_cgroup_info.
 */
struct perf_cgroup {
        struct cgroup_subsys_state        css;
        struct perf_cgroup_info        __percpu *info;
};

/*
 * Map @task to its perf_cgroup.
 *
 * The caller must have the cgroup pinned (css_get) before calling this; in
 * other words it must not be called when there is no cgroup event for the
 * current CPU context.
 *
 * When @ctx is non-NULL the check condition handed to task_css_check() is
 * "ctx->lock is held"; with a NULL @ctx the condition is simply true.
 */
static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
{
        struct cgroup_subsys_state *task_css;

        /* Keep the lockdep expression inside the macro argument. */
        task_css = task_css_check(task, perf_event_cgrp_id,
                                  ctx ? lockdep_is_held(&ctx->lock) : true);

        return container_of(task_css, struct perf_cgroup, css);
}
#endif /* CONFIG_CGROUP_PERF */

#ifdef CONFIG_PERF_EVENTS

/*
 * Prototypes only; all of the functions below are defined outside this
 * header. The group headings follow the function names.
 */
extern struct perf_event_context *perf_cpu_task_ctx(void);

/* AUX output interface, operating on a struct perf_output_handle. */
extern void *perf_aux_output_begin(struct perf_output_handle *handle,
                                   struct perf_event *event);
extern void perf_aux_output_end(struct perf_output_handle *handle,
                                unsigned long size);
extern int perf_aux_output_skip(struct perf_output_handle *handle,
                                unsigned long size);
extern void *perf_get_aux(struct perf_output_handle *handle);
extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
extern void perf_event_itrace_started(struct perf_event *event);

/* PMU registration. */
extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
extern void perf_pmu_unregister(struct pmu *pmu);

/* Task scheduling and task lifetime hooks. */
extern void __perf_event_task_sched_in(struct task_struct *prev,
                                       struct task_struct *task);
extern void __perf_event_task_sched_out(struct task_struct *prev,
                                        struct task_struct *next);
extern int perf_event_init_task(struct task_struct *child, u64 clone_flags);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
extern struct file *perf_event_get(unsigned int fd);
extern const struct perf_event *perf_get_event(struct file *file);
extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
extern void perf_sched_cb_dec(struct pmu *pmu);
extern void perf_sched_cb_inc(struct pmu *pmu);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);

extern void perf_pmu_resched(struct pmu *pmu);

/* Operations on an individual event, including in-kernel counters. */
extern int perf_event_refresh(struct perf_event *event, int refresh);
extern void perf_event_update_userpage(struct perf_event *event);
extern int perf_event_release_kernel(struct perf_event *event);
extern struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr,
                                int cpu,
                                struct task_struct *task,
                                perf_overflow_handler_t callback,
                                void *context);
extern void perf_pmu_migrate_context(struct pmu *pmu,
                                int src_cpu, int dst_cpu);
int perf_event_read_local(struct perf_event *event, u64 *value,
                          u64 *enabled, u64 *running);
extern u64 perf_event_read_value(struct perf_event *event,
                                 u64 *enabled, u64 *running);

/* Used by perf_sample_save_callchain() to obtain the callchain for a sample. */
extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs);

/* True when @event's attr.branch_sample_type has PERF_SAMPLE_BRANCH_NO_FLAGS set. */
static inline bool branch_sample_no_flags(const struct perf_event *event)
{
        return (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS) != 0;
}

/* True when @event's attr.branch_sample_type has PERF_SAMPLE_BRANCH_NO_CYCLES set. */
static inline bool branch_sample_no_cycles(const struct perf_event *event)
{
        return (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES) != 0;
}

/* True when @event's attr.branch_sample_type has PERF_SAMPLE_BRANCH_TYPE_SAVE set. */
static inline bool branch_sample_type(const struct perf_event *event)
{
        return (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE) != 0;
}

/* True when @event's attr.branch_sample_type has PERF_SAMPLE_BRANCH_HW_INDEX set. */
static inline bool branch_sample_hw_index(const struct perf_event *event)
{
        return (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX) != 0;
}

/* True when @event's attr.branch_sample_type has PERF_SAMPLE_BRANCH_PRIV_SAVE set. */
static inline bool branch_sample_priv(const struct perf_event *event)
{
        return (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE) != 0;
}

/* True when @event's attr.branch_sample_type has PERF_SAMPLE_BRANCH_COUNTERS set. */
static inline bool branch_sample_counters(const struct perf_event *event)
{
        return (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS) != 0;
}

/* True when @event's attr.branch_sample_type has PERF_SAMPLE_BRANCH_CALL_STACK set. */
static inline bool branch_sample_call_stack(const struct perf_event *event)
{
        return (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK) != 0;
}

/*
 * Data gathered for one sample.
 *
 * @sample_flags records which fields have been filled in: the
 * perf_sample_save_*() helpers set the matching PERF_SAMPLE_* bit after
 * storing their data. @dyn_size is the running total of variable-size parts;
 * those helpers add to it and perf_sample_data_size() reads it.
 */
struct perf_sample_data {
        /*
         * Fields set by perf_sample_data_init() unconditionally,
         * group so as to minimize the cachelines touched.
         */
        u64                                sample_flags;
        u64                                period;
        u64                                dyn_size;

        /*
         * Fields commonly set by __perf_event_header__init_id(),
         * group so as to minimize the cachelines touched.
         */
        u64                                type;
        struct {
                u32        pid;
                u32        tid;
        }                                tid_entry;
        u64                                time;
        u64                                id;
        struct {
                u32        cpu;
                u32        reserved;
        }                                cpu_entry;

        /*
         * The other fields, optionally {set,used} by
         * perf_{prepare,output}_sample().
         */
        u64                                ip;
        struct perf_callchain_entry        *callchain;
        struct perf_raw_record                *raw;
        struct perf_branch_stack        *br_stack;
        u64                                *br_stack_cntr;
        union perf_sample_weight        weight;
        union  perf_mem_data_src        data_src;
        u64                                txn;

        struct perf_regs                regs_user;
        struct perf_regs                regs_intr;
        u64                                stack_user_size;

        u64                                stream_id;
        u64                                cgroup;
        u64                                addr;
        u64                                phys_addr;
        u64                                data_page_size;
        u64                                code_page_size;
        u64                                aux_size;
} ____cacheline_aligned;

/*
 * Default value for data source: the OP, LVL, SNOOP, LOCK, TLB and LVLNUM
 * fields are each set to their NA encoding and OR-ed together.
 * NOTE(review): presumably meant for perf_sample_data::data_src - confirm.
 */
#define PERF_MEM_NA (PERF_MEM_S(OP, NA)   |\
                    PERF_MEM_S(LVL, NA)   |\
                    PERF_MEM_S(SNOOP, NA) |\
                    PERF_MEM_S(LOCK, NA)  |\
                    PERF_MEM_S(TLB, NA)   |\
                    PERF_MEM_S(LVLNUM, NA))

/*
 * Initialize the unconditionally-set part of @data: record @period, reset
 * the dynamic size to 0 and start sample_flags with PERF_SAMPLE_PERIOD.
 * A non-zero @addr is stored as well and flagged with PERF_SAMPLE_ADDR; a
 * zero @addr leaves data->addr untouched.
 */
static inline void perf_sample_data_init(struct perf_sample_data *data,
                                         u64 addr, u64 period)
{
        u64 flags = PERF_SAMPLE_PERIOD;

        /* remaining struct members initialized in perf_prepare_sample() */
        data->period = period;
        data->dyn_size = 0;

        if (addr) {
                data->addr = addr;
                flags |= PERF_SAMPLE_ADDR;
        }

        data->sample_flags = flags;
}

static inline void perf_sample_save_callchain(struct perf_sample_data *data,
                                              struct perf_event *event,
                                              struct pt_regs *regs)
{
        int size = 1;

        data->callchain = perf_callchain(event, regs);
        size += data->callchain->nr;

        data->dyn_size += size * sizeof(u64);
        data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
}

/*
 * Attach the raw record @raw to @data.
 *
 * Walks the fragment chain starting at raw->frag to total the fragment
 * sizes, rounds (total + sizeof(u32)) up to a multiple of sizeof(u64), stores
 * the resulting payload size in raw->size and the padding in the last
 * fragment's ->pad, then accounts the rounded size in data->dyn_size and
 * flags PERF_SAMPLE_RAW.
 */
static inline void perf_sample_save_raw_data(struct perf_sample_data *data,
                                             struct perf_raw_record *raw)
{
        struct perf_raw_frag *cur = &raw->frag;
        u32 payload = cur->size;
        int total;

        /* Accumulate sizes until the last fragment; @cur ends up on it. */
        while (!perf_raw_frag_last(cur)) {
                cur = cur->next;
                payload += cur->size;
        }

        total = round_up(payload + sizeof(u32), sizeof(u64));
        raw->size = total - sizeof(u32);
        cur->pad = raw->size - payload;

        data->raw = raw;
        data->dyn_size += total;
        data->sample_flags |= PERF_SAMPLE_RAW;
}

/*
 * Attach the branch stack @brs (and the optional counter array @brs_cntr)
 * to @data, add its size to data->dyn_size and flag
 * PERF_SAMPLE_BRANCH_STACK.
 */
static inline void perf_sample_save_brstack(struct perf_sample_data *data,
                                            struct perf_event *event,
                                            struct perf_branch_stack *brs,
                                            u64 *brs_cntr)
{
        int bytes = sizeof(u64);        /* nr */

        /* one more u64 when PERF_SAMPLE_BRANCH_HW_INDEX was requested */
        if (branch_sample_hw_index(event))
                bytes += sizeof(u64);

        bytes += brs->nr * sizeof(struct perf_branch_entry);

        /*
         * The counters live in extension space appended after the
         * struct perf_branch_stack: one u64 per branch holding the
         * occurrences of events for that branch.
         */
        if (brs_cntr)
                bytes += brs->nr * sizeof(u64);

        data->br_stack_cntr = brs_cntr;
        data->br_stack = brs;
        data->dyn_size += bytes;
        data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
}

/*
 * Total sample size as a u32: sizeof(struct perf_event_header) plus the
 * event's header_size and id_header_size plus data->dyn_size.
 */
static inline u32 perf_sample_data_size(struct perf_sample_data *data,
                                        struct perf_event *event)
{
        u32 fixed = sizeof(struct perf_event_header) +
                    event->header_size + event->id_header_size;

        return fixed + data->dyn_size;
}

/*
 * Reset every bitfield of @br. The from/to addresses are deliberately left
 * alone because callers always overwrite them.
 */
static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br)
{
        /* prediction outcome */
        br->predicted = 0;
        br->mispred = 0;

        /* transaction state */
        br->abort = 0;
        br->in_tx = 0;

        /* remaining fields */
        br->type = 0;
        br->cycles = 0;
        br->reserved = 0;
        br->spec = PERF_BR_SPEC_NA;
}

/*
 * Sample preparation and output entry points (prototypes only; defined
 * outside this header).
 */
extern void perf_output_sample(struct perf_output_handle *handle,
                               struct perf_event_header *header,
                               struct perf_sample_data *data,
                               struct perf_event *event);
extern void perf_prepare_sample(struct perf_sample_data *data,
                                struct perf_event *event,
                                struct pt_regs *regs);
extern void perf_prepare_header(struct perf_event_header *header,
                                struct perf_sample_data *data,
                                struct perf_event *event,
                                struct pt_regs *regs);

extern int perf_event_overflow(struct perf_event *event,
                                 struct perf_sample_data *data,
                                 struct pt_regs *regs);

/*
 * perf_event_output_forward() and perf_event_output_backward() are the two
 * handlers that is_default_overflow_handler() recognizes as defaults.
 */
extern void perf_event_output_forward(struct perf_event *event,
                                     struct perf_sample_data *data,
                                     struct pt_regs *regs);
extern void perf_event_output_backward(struct perf_event *event,
                                       struct perf_sample_data *data,
                                       struct pt_regs *regs);
extern int perf_event_output(struct perf_event *event,
                             struct perf_sample_data *data,
                             struct pt_regs *regs);

/*
 * True when @event's overflow handler is one of the two default output
 * routines: perf_event_output_forward() (the expected case) or
 * perf_event_output_backward().
 */
static inline bool
is_default_overflow_handler(struct perf_event *event)
{
        perf_overflow_handler_t handler = event->overflow_handler;

        return likely(handler == perf_event_output_forward) ||
               unlikely(handler == perf_event_output_backward);
}

/*
 * Helpers for the id sample part of a record and for logging lost samples
 * (prototypes only; defined outside this header).
 */
extern void
perf_event_header__init_id(struct perf_event_header *header,
                           struct perf_sample_data *data,
                           struct perf_event *event);
extern void
perf_event__output_id_sample(struct perf_event *event,
                             struct perf_output_handle *handle,
                             struct perf_sample_data *sample);

extern void
perf_log_lost_samples(struct perf_event *event, u64 lost);

static inline bool event_has_any_exclude_flag(struct perf_event *event)
{
        struct perf_event_attr *attr = &event->attr;

        return attr->exclude_idle || attr->exclude_user ||
               attr->exclude_kernel || attr->exclude_hv ||
               attr->exclude_guest || attr->exclude_host;
}

static inline bool is_sampling_event(struct perf_event *event)
{
        return event->attr.sample_period != 0;
}

/*
 * Test the PERF_EV_CAP_SOFTWARE bit of @event's event_caps:
 * 1 for a software event, 0 for a hardware event.
 */
static inline int is_software_event(struct perf_event *event)
{
        const int caps = event->event_caps;

        return caps & PERF_EV_CAP_SOFTWARE;
}

/*
 * Return 1 for event in sw context, 0 for event in hw context
 */
static inline int in_software_context(struct perf_event *event)
{
        return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context;
}

/*
 * Return non-zero when @pmu has PERF_PMU_CAP_EXCLUSIVE set in its
 * capabilities, 0 otherwise. The value is the masked bit, not necessarily 1.
 */
static inline int is_exclusive_pmu(struct pmu *pmu)
{
        return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE;
}

extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];

extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64);
extern void __perf_sw_event(u32, u64, struct pt_regs *, u64);

/*
 * Fallback used when no perf_arch_fetch_caller_regs has been defined earlier
 * (presumably by architecture code): an empty function that leaves @regs
 * untouched and ignores @ip.
 */
#ifndef perf_arch_fetch_caller_regs
static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { }
#endif

/*
 * When generating a perf sample in-line, instead of from an interrupt /
 * exception, we lack a pt_regs. This is typically used from software events
 * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints.
 *
 * We typically don't need a full set, but (for x86) do require:
 * - ip for PERF_SAMPLE_IP
 * - cs for user_mode() tests
 * - sp for PERF_SAMPLE_CALLCHAIN
 * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs())
 *
 * NOTE: assumes @regs is otherwise already 0 filled; this is important for
 * things like PERF_SAMPLE_REGS_INTR.
 *
 * The actual capture is delegated to perf_arch_fetch_caller_regs(), which is
 * handed @regs together with CALLER_ADDR0 as the instruction pointer.
 */
static inline void perf_fetch_caller_regs(struct pt_regs *regs)
{
        perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
}

/*
 * Forward a software event to __perf_sw_event(), but only while the static
 * key for @event_id in perf_swevent_enabled[] is switched on; otherwise
 * nothing happens.
 */
static __always_inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
        if (!static_key_false(&perf_swevent_enabled[event_id]))
                return;

        __perf_sw_event(event_id, nr, regs, addr);
}

DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);

/*
 * 'Special' version for the scheduler, it hard assumes no recursion,
 * which is guaranteed by us not actually scheduling inside other swevents
 * because those disable preemption.
 *
 * It always uses slot 0 of this CPU's __perf_regs[] array, fills it via
 * perf_fetch_caller_regs() and passes it straight to ___perf_sw_event().
 */
static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
{
        struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);

        perf_fetch_caller_regs(regs);
        ___perf_sw_event(event_id, nr, regs, addr);
}

extern struct static_key_false perf_sched_events;

/*
 * Test the static key for software event @swevt in perf_swevent_enabled[].
 * @swevt is used as an array index without any range check.
 */
static __always_inline bool __perf_sw_enabled(int swevt)
{
        return static_key_false(&perf_swevent_enabled[swevt]);
}

/*
 * Flag @task as migrated (task->sched_migrated = 1) while the
 * PERF_COUNT_SW_CPU_MIGRATIONS software event is enabled. The flag is
 * consumed and cleared by perf_event_task_sched_in().
 */
static inline void perf_event_task_migrate(struct task_struct *task)
{
        if (!__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS))
                return;

        task->sched_migrated = 1;
}

/*
 * Hook for @task being switched in after @prev.
 *
 * Calls __perf_event_task_sched_in() when perf_sched_events is switched on,
 * then, if the CPU-migrations software event is enabled and @task carries a
 * pending sched_migrated flag, emits one migration event and clears the flag.
 */
static inline void perf_event_task_sched_in(struct task_struct *prev,
                                            struct task_struct *task)
{
        if (static_branch_unlikely(&perf_sched_events))
                __perf_event_task_sched_in(prev, task);

        if (!__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS))
                return;
        if (!task->sched_migrated)
                return;

        __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
        task->sched_migrated = 0;
}

/*
 * Hook for a switch away from @prev towards @next.
 *
 * In order: emits the context-switch software event when it is enabled;
 * with CONFIG_CGROUP_PERF, emits the cgroup-switch software event when it is
 * enabled and the two tasks resolve to different perf cgroups; finally calls
 * __perf_event_task_sched_out() when perf_sched_events is switched on.
 */
static inline void perf_event_task_sched_out(struct task_struct *prev,
                                             struct task_struct *next)
{
        if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES))
                __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);

#ifdef CONFIG_CGROUP_PERF
        /* Only compare the cgroups when the event is actually enabled. */
        if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) &&
            perf_cgroup_from_task(prev, NULL) !=
            perf_cgroup_from_task(next, NULL))
                __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
#endif

        if (static_branch_unlikely(&perf_sched_events))
                __perf_event_task_sched_out(prev, next);
}

extern void perf_event_mmap(struct vm_area_struct *vma);

extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
                               bool unregister, const char *sym);
extern void perf_event_bpf_event(struct bpf_prog *prog,
                                 enum perf_bpf_event_type type,
                                 u16 flags);

/*
 * Guest callback accessors. With CONFIG_GUEST_PERF_EVENTS each helper invokes
 * the matching static call; without it each helper is a stub returning 0.
 */
#ifdef CONFIG_GUEST_PERF_EVENTS
extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs;

/*
 * Static calls typed after the ->state, ->get_ip and ->handle_intel_pt_intr
 * members of the guest callback structure.
 */
DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state);
DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);

/* Result of the __perf_guest_state static call. */
static inline unsigned int perf_guest_state(void)
{
        return static_call(__perf_guest_state)();
}
/* Result of the __perf_guest_get_ip static call. */
static inline unsigned long perf_guest_get_ip(void)
{
        return static_call(__perf_guest_get_ip)();
}
/* Result of the __perf_guest_handle_intel_pt_intr static call. */
static inline unsigned int perf_guest_handle_intel_pt_intr(void)
{
        return static_call(__perf_guest_handle_intel_pt_intr)();
}
extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
#else
/* No guest support configured: every query reports 0. */
static inline unsigned int perf_guest_state(void)                 { return 0; }
static inline unsigned long perf_guest_get_ip(void)                 { return 0; }
static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; }
#endif /* CONFIG_GUEST_PERF_EVENTS */

extern void perf_event_exec(void);
extern void perf_event_comm(struct task_struct *tsk, bool exec);
extern void perf_event_namespaces(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);
extern void perf_event_text_poke(const void *addr,
                                 const void *old_bytes, size_t old_len,
                                 const void *new_bytes, size_t new_len);

/* Callchains */
DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);

extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
                   u32 max_stack, bool crosstask, bool add_mark);
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
extern void put_callchain_entry(int rctx);

extern int sysctl_perf_event_max_stack;
extern int sysctl_perf_event_max_contexts_per_stack;

/*
 * Append the context marker @ip to the callchain behind @ctx.
 *
 * Returns 0 after storing the marker and bumping ctx->contexts. Once
 * ctx->contexts has reached sysctl_perf_event_max_contexts_per_stack it sets
 * ctx->contexts_maxed, stores nothing and returns -1.
 */
static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip)
{
        struct perf_callchain_entry *entry;

        if (ctx->contexts >= sysctl_perf_event_max_contexts_per_stack) {
                ctx->contexts_maxed = true;
                return -1; /* no more room, stop walking the stack */
        }

        entry = ctx->entry;
        entry->ip[entry->nr] = ip;
        entry->nr++;
        ctx->contexts++;
        return 0;
}

/*
 * Append the address @ip to the callchain behind @ctx.
 *
 * Returns 0 after storing it and bumping ctx->nr. Returns -1 without storing
 * anything when ctx->nr has reached ctx->max_stack or when the context-marker
 * limit was already hit (ctx->contexts_maxed).
 */
static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
{
        struct perf_callchain_entry *entry;

        if (ctx->nr >= ctx->max_stack || ctx->contexts_maxed)
                return -1; /* no more room, stop walking the stack */

        entry = ctx->entry;
        entry->ip[entry->nr] = ip;
        entry->nr++;
        ctx->nr++;
        return 0;
}

extern int sysctl_perf_event_paranoid;
extern int sysctl_perf_event_mlock;
extern int sysctl_perf_event_sample_rate;
extern int sysctl_perf_cpu_time_max_percent;

extern void perf_sample_event_took(u64 sample_len_ns);

int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);
int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);
int perf_event_max_stack_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);

/* Access to perf_event_open(2) syscall. */
#define PERF_SECURITY_OPEN                0

/* Finer grained perf_event_open(2) access control. */
#define PERF_SECURITY_CPU                1
#define PERF_SECURITY_KERNEL                2
#define PERF_SECURITY_TRACEPOINT        3

/* Return 1 when sysctl_perf_event_paranoid is 0 or higher, 0 when negative. */
static inline int perf_is_paranoid(void)
{
        return sysctl_perf_event_paranoid >= 0;
}

/*
 * Permission check for the PERF_SECURITY_KERNEL access class.
 *
 * Returns -EACCES when sysctl_perf_event_paranoid is above 1 and the caller
 * is not perfmon_capable(); otherwise returns whatever
 * security_perf_event_open() decides for @attr.
 */
static inline int perf_allow_kernel(struct perf_event_attr *attr)
{
        /* perfmon_capable() is only consulted when the sysctl restricts access. */
        if (sysctl_perf_event_paranoid > 1) {
                if (!perfmon_capable())
                        return -EACCES;
        }

        return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
}

/*
 * Permission check for the PERF_SECURITY_CPU access class.
 *
 * Returns -EACCES when sysctl_perf_event_paranoid is above 0 and the caller
 * is not perfmon_capable(); otherwise returns whatever
 * security_perf_event_open() decides for @attr.
 */
static inline int perf_allow_cpu(struct perf_event_attr *attr)
{
        /* perfmon_capable() is only consulted when the sysctl restricts access. */
        if (sysctl_perf_event_paranoid > 0) {
                if (!perfmon_capable())
                        return -EACCES;
        }

        return security_perf_event_open(attr, PERF_SECURITY_CPU);
}

/*
 * Permission check for the PERF_SECURITY_TRACEPOINT access class.
 *
 * Returns -EPERM (not -EACCES as in the sibling checks) when
 * sysctl_perf_event_paranoid is above -1 and the caller is not
 * perfmon_capable(); otherwise returns whatever security_perf_event_open()
 * decides for @attr.
 */
static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
{
        /* perfmon_capable() is only consulted when the sysctl restricts access. */
        if (sysctl_perf_event_paranoid > -1) {
                if (!perfmon_capable())
                        return -EPERM;
        }

        return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);
}

extern void perf_event_init(void);
extern void perf_tp_event(u16 event_type, u64 count, void *record,
                          int entry_size, struct pt_regs *regs,
                          struct hlist_head *head, int rctx,
                          struct task_struct *task);
extern void perf_bp_event(struct perf_event *event, void *data);

/*
 * Generic fallbacks, used only when perf_misc_flags has not been defined
 * earlier: classify the sample as user or kernel via user_mode(regs) and take
 * the instruction pointer from instruction_pointer(regs).
 */
#ifndef perf_misc_flags
# define perf_misc_flags(regs) \
                (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL)
# define perf_instruction_pointer(regs)        instruction_pointer(regs)
#endif
/* Fallback: hand @regs through unchanged when no override is defined. */
#ifndef perf_arch_bpf_user_pt_regs
# define perf_arch_bpf_user_pt_regs(regs) regs
#endif

/* True when PERF_SAMPLE_BRANCH_STACK is set in @event's attr.sample_type. */
static inline bool has_branch_stack(struct perf_event *event)
{
        return (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) != 0;
}

static inline bool needs_branch_stack(struct perf_event *event)
{
        return event->attr.branch_sample_type != 0;
}

static inline bool has_aux(struct perf_event *event)
{
        return event->pmu->setup_aux;
}

/* True when @event's attr.write_backward flag is set. */
static inline bool is_write_backward(struct perf_event *event)
{
        return event->attr.write_backward != 0;
}

/* True when the event's pmu reports a non-zero nr_addr_filters. */
static inline bool has_addr_filter(struct perf_event *event)
{
        return event->pmu->nr_addr_filters != 0;
}

/*
 * Return the address-filter list head that applies to @event: the parent's
 * when the event is inherited (event->parent set), its own otherwise.
 */
static inline struct perf_addr_filters_head *
perf_event_addr_filters(struct perf_event *event)
{
        struct perf_event *owner = event;

        if (event->parent)
                owner = event->parent;

        return &owner->addr_filters;
}

/*
 * Return the address of the fasync pointer used for @event. Only the parent
 * has fasync state, so an inherited event resolves to its parent's field.
 */
static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
{
        struct perf_event *owner = event;

        if (event->parent)
                owner = event->parent;

        return &owner->fasync;
}

extern void perf_event_addr_filters_sync(struct perf_event *event);
extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id);

extern int perf_output_begin(struct perf_output_handle *handle,
                             struct perf_sample_data *data,
                             struct perf_event *event, unsigned int size);
extern int perf_output_begin_forward(struct perf_output_handle *handle,
                                     struct perf_sample_data *data,
                                     struct perf_event *event,
                                     unsigned int size);
extern int perf_output_begin_backward(struct perf_output_handle *handle,
                                      struct perf_sample_data *data,
                                      struct perf_event *event,
                                      unsigned int size);

extern void perf_output_end(struct perf_output_handle *handle);
extern unsigned int perf_output_copy(struct perf_output_handle *handle,
                             const void *buf, unsigned int len);
extern unsigned int perf_output_skip(struct perf_output_handle *handle,
                                     unsigned int len);
extern long perf_output_copy_aux(struct perf_output_handle *aux_handle,
                                 struct perf_output_handle *handle,
                                 unsigned long from, unsigned long to);
extern int perf_swevent_get_recursion_context(void);
extern void perf_swevent_put_recursion_context(int rctx);
extern u64 perf_swevent_set_period(struct perf_event *event);
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
extern void perf_event_disable_local(struct perf_event *event);
extern void perf_event_disable_inatomic(struct perf_event *event);
extern void perf_event_task_tick(void);
extern int perf_event_account_interrupt(struct perf_event *event);
extern int perf_event_period(struct perf_event *event, u64 value);
extern u64 perf_event_pause(struct perf_event *event, bool reset);
#else /* !CONFIG_PERF_EVENTS: */
/*
 * Stubs for builds without CONFIG_PERF_EVENTS (this is the #else branch).
 * Each one does nothing and returns a fixed value: void functions are empty,
 * pointer-returning ones yield NULL or ERR_PTR(-EINVAL), and int-returning
 * ones yield 0 or -EINVAL as spelled out on each line.
 */
static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
                      struct perf_event *event)                                { return NULL; }
static inline void
perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
                                                                        { }
static inline int
perf_aux_output_skip(struct perf_output_handle *handle,
                     unsigned long size)                                { return -EINVAL; }
static inline void *
perf_get_aux(struct perf_output_handle *handle)                                { return NULL; }
static inline void
perf_event_task_migrate(struct task_struct *task)                        { }
static inline void
perf_event_task_sched_in(struct task_struct *prev,
                         struct task_struct *task)                        { }
static inline void
perf_event_task_sched_out(struct task_struct *prev,
                          struct task_struct *next)                        { }
/* Task setup reports success (0) so callers proceed normally. */
static inline int perf_event_init_task(struct task_struct *child,
                                       u64 clone_flags)                        { return 0; }
static inline void perf_event_exit_task(struct task_struct *child)        { }
static inline void perf_event_free_task(struct task_struct *task)        { }
static inline void perf_event_delayed_put(struct task_struct *task)        { }
/* Lookups and reads fail with -EINVAL: there are no events to find. */
static inline struct file *perf_event_get(unsigned int fd)        { return ERR_PTR(-EINVAL); }
static inline const struct perf_event *perf_get_event(struct file *file)
{
        return ERR_PTR(-EINVAL);
}
static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
        return ERR_PTR(-EINVAL);
}
static inline int perf_event_read_local(struct perf_event *event, u64 *value,
                                        u64 *enabled, u64 *running)
{
        return -EINVAL;
}
static inline void perf_event_print_debug(void)                                { }
static inline int perf_event_task_disable(void)                                { return -EINVAL; }
static inline int perf_event_task_enable(void)                                { return -EINVAL; }
static inline int perf_event_refresh(struct perf_event *event, int refresh)
{
        return -EINVAL;
}

/*
 * More stubs for builds without CONFIG_PERF_EVENTS: event emission and
 * notification hooks are empty, and the remaining helpers return the fixed
 * values shown on each line.
 */
static inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)        { }
static inline void
perf_bp_event(struct perf_event *event, void *data)                        { }

static inline void perf_event_mmap(struct vm_area_struct *vma)                { }

typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
                                      bool unregister, const char *sym)        { }
static inline void perf_event_bpf_event(struct bpf_prog *prog,
                                        enum perf_bpf_event_type type,
                                        u16 flags)                        { }
static inline void perf_event_exec(void)                                { }
static inline void perf_event_comm(struct task_struct *tsk, bool exec)        { }
static inline void perf_event_namespaces(struct task_struct *tsk)        { }
static inline void perf_event_fork(struct task_struct *tsk)                { }
static inline void perf_event_text_poke(const void *addr,
                                        const void *old_bytes,
                                        size_t old_len,
                                        const void *new_bytes,
                                        size_t new_len)                        { }
static inline void perf_event_init(void)                                { }
/* Always -1 here: no recursion context is ever handed out. */
static inline int  perf_swevent_get_recursion_context(void)                { return -1; }
static inline void perf_swevent_put_recursion_context(int rctx)                { }
static inline u64 perf_swevent_set_period(struct perf_event *event)        { return 0; }
static inline void perf_event_enable(struct perf_event *event)                { }
static inline void perf_event_disable(struct perf_event *event)                { }
static inline int __perf_event_disable(void *info)                        { return -1; }
static inline void perf_event_task_tick(void)                                { }
static inline int perf_event_release_kernel(struct perf_event *event)        { return 0; }
static inline int perf_event_period(struct perf_event *event, u64 value)
{
        return -EINVAL;
}
static inline u64 perf_event_pause(struct perf_event *event, bool reset)
{
        return 0;
}
#endif

/*
 * perf_restore_debug_store() is only a real function when both
 * CONFIG_PERF_EVENTS and CONFIG_CPU_SUP_INTEL are set; otherwise it is an
 * empty inline stub.
 */
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
extern void perf_restore_debug_store(void);
#else
static inline void perf_restore_debug_store(void)                        { }
#endif

/*
 * Copy the object @x into the output @handle via perf_output_copy(), using
 * sizeof(x) as the length. @x must be an lvalue because its address is taken.
 */
#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))

/*
 * Device attribute extended with an event id and an optional event string.
 * Instances are created by the PMU_EVENT_ATTR*() macros in this header.
 */
struct perf_pmu_events_attr {
        struct device_attribute attr;
        u64 id;                         /* set from _id; 0 in PMU_EVENT_ATTR_STRING */
        const char *event_str;          /* set only by PMU_EVENT_ATTR_STRING */
};

/*
 * Device attribute carrying an id and two alternative event strings.
 * NOTE(review): "ht"/"noht" presumably select between hyper-threading on and
 * off; the code choosing between them is not in this part of the file.
 */
struct perf_pmu_events_ht_attr {
        struct device_attribute                        attr;
        u64                                        id;
        const char                                *event_str_ht;
        const char                                *event_str_noht;
};

/*
 * Device attribute carrying an id, an event string and a pmu_type value.
 * NOTE(review): pmu_type presumably identifies which hybrid PMU(s) the event
 * applies to; its consumers are not in this part of the file.
 */
struct perf_pmu_events_hybrid_attr {
        struct device_attribute                        attr;
        u64                                        id;
        const char                                *event_str;
        u64                                        pmu_type;
};

/*
 * Device attribute paired with a pmu_type value.
 * NOTE(review): presumably the format-attribute counterpart of
 * perf_pmu_events_hybrid_attr; confirm against its users.
 */
struct perf_pmu_format_hybrid_attr {
        struct device_attribute                        attr;
        u64                                        pmu_type;
};

ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
                              char *page);

/*
 * Define a static struct perf_pmu_events_attr named @_var: a read-only (0444)
 * attribute called @_name, shown by @_show, with .id set to @_id.
 * The expansion already ends in ';'.
 */
#define PMU_EVENT_ATTR(_name, _var, _id, _show)                                \
static struct perf_pmu_events_attr _var = {                                \
        .attr = __ATTR(_name, 0444, _show, NULL),                        \
        .id   =  _id,                                                        \
};

/*
 * Define a static struct perf_pmu_events_attr named @_var: a read-only (0444)
 * attribute called @_name, shown by perf_event_sysfs_show(), with .id 0 and
 * .event_str set to @_str. The expansion already ends in ';'.
 */
#define PMU_EVENT_ATTR_STRING(_name, _var, _str)                            \
static struct perf_pmu_events_attr _var = {                                    \
        .attr                = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
        .id                = 0,                                                    \
        .event_str        = _str,                                                    \
};

/*
 * Expression form: build an anonymous one-element perf_pmu_events_attr array
 * (read-only attribute @_name, shown by @_show, .id = @_id) as a compound
 * literal and evaluate to the address of its innermost .attr.attr member.
 */
#define PMU_EVENT_ATTR_ID(_name, _show, _id)                                \
        (&((struct perf_pmu_events_attr[]) {                                \
                { .attr = __ATTR(_name, 0444, _show, NULL),                \
                  .id = _id, }                                                \
        })[0].attr.attr)

/*
 * Generate a static show function <_name>_show() that writes the string
 * literal @_format followed by a newline into @page and returns sprintf()'s
 * result. The build fails unless sizeof(_format) is below PAGE_SIZE.
 *
 * NOTE(review): @_format is pasted into the sprintf() format string itself,
 * so it must be a string literal and must not contain '%' conversions.
 */
#define PMU_FORMAT_ATTR_SHOW(_name, _format)                                \
static ssize_t                                                                \
_name##_show(struct device *dev,                                        \
                               struct device_attribute *attr,                \
                               char *page)                                \
{                                                                        \
        BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);                        \
        return sprintf(page, _format "\n");                                \
}                                                                        \

/*
 * Emit the <_name>_show() function via PMU_FORMAT_ATTR_SHOW() and a static
 * read-only device attribute named format_attr_<_name> built with
 * __ATTR_RO(_name). The caller supplies the trailing ';'.
 */
#define PMU_FORMAT_ATTR(_name, _format)                                        \
        PMU_FORMAT_ATTR_SHOW(_name, _format)                                \
                                                                        \
static struct device_attribute format_attr_##_name = __ATTR_RO(_name)

/* Performance counter hotplug functions */
#ifdef CONFIG_PERF_EVENTS
int perf_event_init_cpu(unsigned int cpu);
int perf_event_exit_cpu(unsigned int cpu);
#else
#define perf_event_init_cpu        NULL
#define perf_event_exit_cpu        NULL
#endif

extern void arch_perf_update_userpage(struct perf_event *event,
                                      struct perf_event_mmap_page *userpg,
                                      u64 now);

/*
 * Snapshot branch stack on software events.
 *
 * Branch stack can be very useful in understanding software events. For
 * example, when a long function, e.g. sys_perf_event_open, returns an
 * errno, it is not obvious why the function failed. Branch stack could
 * provide very helpful information in this type of scenarios.
 *
 * On software event, it is necessary to stop the hardware branch recorder
 * fast. Otherwise, the hardware register/buffer will be flushed with
 * entries of the triggering event. Therefore, static call is used to
 * stop the hardware recorder.
 */

/*
 * cnt is the number of entries allocated for @entries.
 * Return the number of entries copied to @entries.
 */
typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries,
                                           unsigned int cnt);
DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);

/*
 * Unless PERF_NEEDS_LOPWR_CB is defined, perf_lopwr_cb() is an empty inline
 * function that ignores @mode.
 */
#ifndef PERF_NEEDS_LOPWR_CB
static inline void perf_lopwr_cb(bool mode)
{
}
#endif

#endif /* _LINUX_PERF_EVENT_H */






















    2 






    2 







































































    2 





    2 







    2 






    2 










    2 

























    2 




    2 

    2 

    2 


    2 
    2 

    2 
    2 






    2 
    2 





    2 


    2 




































    2 



    2 


    2 

































































    2 








    2 









































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/fs_struct.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include "mount.h"
#include "internal.h"

/*
 * Cursor for building a string back-to-front inside a caller-supplied buffer.
 */
struct prepend_buffer {
        char *buf;      /* start of the text written so far; moves down on each prepend */
        int len;        /* bytes still free in front of buf; negative once overflowed */
};
/*
 * Declare a prepend_buffer @__name positioned at the END of @__buf
 * (__buf + __len) with all @__len bytes still available.
 */
#define DECLARE_BUFFER(__name, __buf, __len) \
        struct prepend_buffer __name = {.buf = __buf + __len, .len = __len}

/*
 * Finish a prepend sequence: return the start of the assembled string, or
 * ERR_PTR(-ENAMETOOLONG) when the buffer was marked as overflowed (len < 0).
 */
static char *extract_string(struct prepend_buffer *p)
{
        if (unlikely(p->len < 0))
                return ERR_PTR(-ENAMETOOLONG);

        return p->buf;
}

/*
 * Prepend the single byte @c. Returns true on success; when no room is left
 * the buffer is marked as overflowed (len = -1) and false is returned.
 */
static bool prepend_char(struct prepend_buffer *p, unsigned char c)
{
        if (unlikely(p->len <= 0)) {
                p->len = -1;
                return false;
        }

        p->len--;
        p->buf--;
        *p->buf = c;
        return true;
}

/*
 * Copy @len bytes from @src to @dst with a fault-tolerant kernel copy.
 *
 * The source may come from an optimistic, lockless load of a dentry name and
 * its length. A concurrent rename can leave the pointer and length out of
 * sync, so the copy may fault. The caller's rename sequence check corrects
 * the final result; here the fault only has to be survived: the destination
 * is filled with 'x' and false is returned. Returns true on a clean copy.
 */
static bool prepend_copy(void *dst, const void *src, int len)
{
        if (likely(!copy_from_kernel_nofault(dst, src, len)))
                return true;

        memset(dst, 'x', len);
        return false;
}

/*
 * Prepend @namelen bytes of @str.
 *
 * Returns the result of the copy when the whole string fits. When it does
 * not fit, the remaining space is filled with the tail of the string, the
 * buffer is marked as overflowed (len = -1) and false is returned. An
 * already overflowed buffer is left alone and false is returned.
 */
static bool prepend(struct prepend_buffer *p, const char *str, int namelen)
{
        int room = p->len;

        // Overflowed on an earlier call: nothing more to do
        if (room < 0)
                return false;

        if (namelen <= room) {
                // Whole name fits
                p->len = room - namelen;
                p->buf -= namelen;
                return prepend_copy(p->buf, str, namelen);
        }

        // Too long: keep only as much of the end of the name as fits
        p->buf -= room;
        prepend_copy(p->buf, str + (namelen - room), room);
        p->len = -1;
        return false;
}

/**
 * prepend_name - prepend a pathname in front of current buffer pointer
 * @p: prepend buffer which contains buffer pointer and allocated length
 * @name: name string and length qstr structure
 *
 * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to
 * make sure that either the old or the new name pointer and length are
 * fetched. However, there may be mismatch between length and pointer.
 * But since the length cannot be trusted, we need to copy the name very
 * carefully when doing the prepend_copy(). It also prepends "/" at
 * the beginning of the name. The sequence number check at the caller will
 * retry it again when a d_move() does happen. So any garbage in the buffer
 * due to mismatched pointer and length will be discarded.
 *
 * Load acquire is needed to make sure that we see the new name data even
 * if we might get the length wrong.
 *
 * Return: true when both the name and the leading "/" were prepended,
 * false when either step failed.
 */
static bool prepend_name(struct prepend_buffer *p, const struct qstr *name)
{
        const char *dname = smp_load_acquire(&name->name); /* ^^^ */
        u32 dlen = READ_ONCE(name->len);

        /* The name goes in first, so the "/" ends up in front of it. */
        return prepend(p, dname, dlen) && prepend_char(p, '/');
}

/*
 * Walk from @dentry/@mnt up towards @root, prepending "/name" for every
 * component passed on the way. Pointers are sampled with READ_ONCE(); the
 * caller is responsible for validating the walk and retrying.
 *
 * Returns:
 *   0 - @root was reached, or the walk stopped because prepend_name() failed
 *   1 - ran into the global root of a mounted, non-anonymous namespace
 *   2 - ran into the global root of a detached / not yet attached mount
 *   3 - a dentry that is its own parent was hit away from a mount root
 */
static int __prepend_path(const struct dentry *dentry, const struct mount *mnt,
                          const struct path *root, struct prepend_buffer *p)
{
        while (dentry != root->dentry || &mnt->mnt != root->mnt) {
                const struct dentry *parent = READ_ONCE(dentry->d_parent);

                /* At the root of this mount: continue at the mountpoint in the parent mount. */
                if (dentry == mnt->mnt.mnt_root) {
                        struct mount *m = READ_ONCE(mnt->mnt_parent);
                        struct mnt_namespace *mnt_ns;

                        if (likely(mnt != m)) {
                                dentry = READ_ONCE(mnt->mnt_mountpoint);
                                mnt = m;
                                continue;
                        }
                        /* Global root */
                        mnt_ns = READ_ONCE(mnt->mnt_ns);
                        /* open-coded is_mounted() to use local mnt_ns */
                        if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
                                return 1;        // absolute root
                        else
                                return 2;        // detached or not attached yet
                }

                if (unlikely(dentry == parent))
                        /* Escaped? */
                        return 3;

                prefetch(parent);
                /* Stop early (still returning 0) once a component cannot be prepended. */
                if (!prepend_name(p, &dentry->d_name))
                        break;
                dentry = parent;
        }
        return 0;
}

/**
 * prepend_path - Prepend path string to a buffer
 * @path: the dentry/vfsmount to report
 * @root: root vfsmnt/dentry
 * @p: prepend buffer which contains buffer pointer and allocated length
 *
 * The function will first try to write out the pathname without taking any
 * lock other than the RCU read lock to make sure that dentries won't go away.
 * It only checks the sequence number of the global rename_lock as any change
 * in the dentry's d_seq will be preceded by changes in the rename_lock
 * sequence number. If the sequence number had been changed, it will restart
 * the whole pathname back-tracing sequence again by taking the rename_lock.
 * In this case, there is no need to take the RCU read lock as the recursive
 * parent pointer references will keep the dentry chain alive as long as no
 * rename operation is performed.
 *
 * Return: the result of the last __prepend_path() pass (0, 1, 2 or 3).
 * @p is updated with the resulting buffer state in every case.
 */
static int prepend_path(const struct path *path,
                        const struct path *root,
                        struct prepend_buffer *p)
{
        unsigned seq, m_seq = 0;
        struct prepend_buffer b;
        int error;

        rcu_read_lock();
restart_mnt:
        read_seqbegin_or_lock(&mount_lock, &m_seq);
        seq = 0;
        rcu_read_lock();
restart:
        /* Every attempt works on a fresh copy of the caller's buffer state. */
        b = *p;
        read_seqbegin_or_lock(&rename_lock, &seq);
        error = __prepend_path(path->dentry, real_mount(path->mnt), root, &b);
        if (!(seq & 1))
                rcu_read_unlock();
        if (need_seqretry(&rename_lock, seq)) {
                /* A rename raced with the walk: redo it, this time with seq odd. */
                seq = 1;
                goto restart;
        }
        done_seqretry(&rename_lock, seq);

        if (!(m_seq & 1))
                rcu_read_unlock();
        if (need_seqretry(&mount_lock, m_seq)) {
                /* The mount sequence changed: redo everything, this time with m_seq odd. */
                m_seq = 1;
                goto restart_mnt;
        }
        done_seqretry(&mount_lock, m_seq);

        /* Result 3 ("escaped"): throw away whatever the walk prepended. */
        if (unlikely(error == 3))
                b = *p;

        /* Nothing was prepended at all: the path is just "/". */
        if (b.len == p->len)
                prepend_char(&b, '/');

        *p = b;
        return error;
}

/**
 * __d_path - return the path of a dentry
 * @path: the dentry/vfsmount to report
 * @root: root vfsmnt/dentry
 * @buf: buffer to return value in
 * @buflen: buffer length
 *
 * Build the path name of @path relative to @root, back-to-front inside @buf.
 * The terminating NUL is placed first, then the components are prepended.
 *
 * Returns a pointer into the buffer, or an error pointer if the path did
 * not fit. "buflen" should be positive.
 *
 * Returns %NULL when prepend_path() reports a positive result, i.e. the path
 * is not reachable from the supplied root.
 */
char *__d_path(const struct path *path,
               const struct path *root,
               char *buf, int buflen)
{
        DECLARE_BUFFER(b, buf, buflen);
        int res;

        prepend_char(&b, 0);
        res = prepend_path(path, root, &b);
        if (unlikely(res > 0))
                return NULL;

        return extract_string(&b);
}

/*
 * Build the path of @path inside @buf against an empty (all-zero) root, so
 * the walk only ends at a global root.
 *
 * Returns a pointer into the buffer, an error pointer if the path did not
 * fit, or ERR_PTR(-EINVAL) when prepend_path() reports a result above 1.
 */
char *d_absolute_path(const struct path *path,
               char *buf, int buflen)
{
        struct path root = {};
        DECLARE_BUFFER(b, buf, buflen);
        int res;

        prepend_char(&b, 0);
        res = prepend_path(path, &root, &b);
        if (unlikely(res > 1))
                return ERR_PTR(-EINVAL);

        return extract_string(&b);
}

/*
 * Copy fs->root into *root.  The copy is repeated until
 * read_seqcount_retry() reports that fs->seq did not change while it was
 * being made.  Only the struct is copied; no reference is taken here.
 */
static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
{
        unsigned seq;

        for (;;) {
                seq = read_seqcount_begin(&fs->seq);
                *root = fs->root;
                if (!read_seqcount_retry(&fs->seq, seq))
                        break;
        }
}

/**
 * d_path - return the path of a dentry
 * @path: path to report
 * @buf: buffer to return value in
 * @buflen: buffer length
 *
 * Convert a dentry into an ASCII path name. If the entry has been deleted
 * the string " (deleted)" is appended. Note that this is ambiguous.
 *
 * Returns a pointer into the buffer or an error code if the path was
 * too long. Note: Callers should use the returned pointer, not the passed
 * in buffer, to use the name! The implementation often starts at an offset
 * into the buffer, and may leave 0 bytes at the start.
 *
 * "buflen" should be positive.
 */
char *d_path(const struct path *path, char *buf, int buflen)
{
        DECLARE_BUFFER(b, buf, buflen);
        struct path root;

        /*
         * We have various synthetic filesystems that never get mounted.  On
         * these filesystems dentries are never used for lookup purposes, and
         * thus don't need to be hashed.  They also don't need a name until a
         * user wants to identify the object in /proc/pid/fd/.  The little hack
         * below allows us to generate a name for these objects on demand:
         *
         * Some pseudo inodes are mountable.  When they are mounted
         * path->dentry == path->mnt->mnt_root.  In that case don't call d_dname
         * and instead have d_path return the mounted path.
         */
        if (path->dentry->d_op && path->dentry->d_op->d_dname &&
            (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
                return path->dentry->d_op->d_dname(path->dentry, buf, buflen);

        rcu_read_lock();
        get_fs_root_rcu(current->fs, &root);
        if (unlikely(d_unlinked(path->dentry)))
                prepend(&b, " (deleted)", 11);
        else
                prepend_char(&b, 0);
        prepend_path(path, &root, &b);
        rcu_read_unlock();

        return extract_string(&b);
}
EXPORT_SYMBOL(d_path);

/*
 * Helper function for dentry_operations.d_dname() members.
 *
 * Formats the name into a 64-byte scratch buffer, then copies it (with its
 * terminating NUL) to the very end of @buffer.  Returns a pointer to the
 * copied string, or ERR_PTR(-ENAMETOOLONG) if the formatted name does not
 * fit in the scratch buffer or in @buflen bytes.
 */
char *dynamic_dname(char *buffer, int buflen, const char *fmt, ...)
{
        char scratch[64];
        va_list ap;
        int needed;

        va_start(ap, fmt);
        /* +1 accounts for the terminating NUL. */
        needed = vsnprintf(scratch, sizeof(scratch), fmt, ap) + 1;
        va_end(ap);

        if (needed > sizeof(scratch) || needed > buflen)
                return ERR_PTR(-ENAMETOOLONG);

        /* Right-align the result in the caller's buffer. */
        return memcpy(buffer + (buflen - needed), scratch, needed);
}

char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
{
        DECLARE_BUFFER(b, buffer, buflen);
        /* these dentries are never renamed, so d_lock is not needed */
        prepend(&b, " (deleted)", 11);
        prepend(&b, dentry->d_name.name, dentry->d_name.len);
        prepend_char(&b, '/');
        return extract_string(&b);
}

/*
 * Write full pathname from the root of the filesystem into the buffer.
 *
 * Walks from @d up through d_parent until IS_ROOT() is true, prepending each
 * component name to a private copy of *@p.  The caller's *@p is never
 * modified, so a retry can simply start again from it.  No mount handling
 * is done in this function.
 *
 * Returns whatever extract_string() yields for the finished buffer.
 */
static char *__dentry_path(const struct dentry *d, struct prepend_buffer *p)
{
        const struct dentry *dentry;
        struct prepend_buffer b;
        /* even seq: lockless attempt; set to 1 (odd) below to force a retry */
        int seq = 0;

        rcu_read_lock();
restart:
        /* (re)start from the original dentry and the caller's buffer state */
        dentry = d;
        b = *p;
        read_seqbegin_or_lock(&rename_lock, &seq);
        while (!IS_ROOT(dentry)) {
                const struct dentry *parent = dentry->d_parent;

                prefetch(parent);
                /* presumably fails when the buffer is exhausted - confirm */
                if (!prepend_name(&b, &dentry->d_name))
                        break;
                dentry = parent;
        }
        /* RCU read lock is dropped after the first (even seq) pass only */
        if (!(seq & 1))
                rcu_read_unlock();
        if (need_seqretry(&rename_lock, seq)) {
                seq = 1;
                goto restart;
        }
        done_seqretry(&rename_lock, seq);
        /* nothing was prepended (@d is itself a root): emit a lone "/" */
        if (b.len == p->len)
                prepend_char(&b, '/');
        return extract_string(&b);
}

char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen)
{
        DECLARE_BUFFER(b, buf, buflen);

        prepend_char(&b, 0);
        return __dentry_path(dentry, &b);
}
EXPORT_SYMBOL(dentry_path_raw);

char *dentry_path(const struct dentry *dentry, char *buf, int buflen)
{
        DECLARE_BUFFER(b, buf, buflen);

        if (unlikely(d_unlinked(dentry)))
                prepend(&b, "//deleted", 10);
        else
                prepend_char(&b, 0);
        return __dentry_path(dentry, &b);
}

/*
 * Copy fs->root and fs->pwd into *root and *pwd as one consistent pair.
 * Both copies are repeated until read_seqcount_retry() reports that
 * fs->seq did not change meanwhile.  No references are taken here.
 */
static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
                                    struct path *pwd)
{
        unsigned seq;

        for (;;) {
                seq = read_seqcount_begin(&fs->seq);
                *root = fs->root;
                *pwd = fs->pwd;
                if (!read_seqcount_retry(&fs->seq, seq))
                        break;
        }
}

/*
 * NOTE! The user-level library version returns a
 * character pointer. The kernel system call just
 * returns the length of the buffer filled (which
 * includes the ending '\0' character), or a negative
 * error value. So libc would do something like
 *
 *        char *getcwd(char * buf, size_t size)
 *        {
 *                int retval;
 *
 *                retval = sys_getcwd(buf, size);
 *                if (retval >= 0)
 *                        return buf;
 *                errno = -retval;
 *                return NULL;
 *        }
 *
 * Errors returned below: -ENOMEM (no scratch page), -ENOENT (the working
 * directory is unlinked), -ENAMETOOLONG (path does not fit in PATH_MAX),
 * -ERANGE (path does not fit in @size), -EFAULT (copy to @buf failed).
 */
SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
{
        int error;
        struct path pwd, root;
        /* scratch buffer the path is assembled in; released on every path */
        char *page = __getname();

        if (!page)
                return -ENOMEM;

        rcu_read_lock();
        get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);

        if (unlikely(d_unlinked(pwd.dentry))) {
                rcu_read_unlock();
                error = -ENOENT;
        } else {
                unsigned len;
                DECLARE_BUFFER(b, page, PATH_MAX);

                prepend_char(&b, 0);
                /*
                 * pwd not reachable from root: prefix the path with the
                 * 13-character marker (no NUL, the string continues).
                 */
                if (unlikely(prepend_path(&pwd, &root, &b) > 0))
                        prepend(&b, "(unreachable)", 13);
                rcu_read_unlock();

                /* bytes used, including the terminating NUL */
                len = PATH_MAX - b.len;
                /*
                 * NOTE(review): this can only trigger if b.len is negative,
                 * making the unsigned len wrap past PATH_MAX - presumably
                 * how the prepend helpers signal overflow; confirm.
                 */
                if (unlikely(len > PATH_MAX))
                        error = -ENAMETOOLONG;
                else if (unlikely(len > size))
                        error = -ERANGE;
                else if (copy_to_user(buf, b.buf, len))
                        error = -EFAULT;
                else
                        error = len;
        }
        __putname(page);
        return error;
}




























































































































































































































    8 














    8 






























    3 













    3 











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Credentials management - see Documentation/security/credentials.rst
 *
 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_CRED_H
#define _LINUX_CRED_H

#include <linux/capability.h>
#include <linux/init.h>
#include <linux/key.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/uidgid.h>
#include <linux/sched.h>
#include <linux/sched/user.h>

struct cred;
struct inode;

/*
 * COW Supplementary groups list
 *
 * Reference counted through ->usage: get_group_info() takes a reference,
 * put_group_info() drops one and calls groups_free() when it was the last.
 */
struct group_info {
        refcount_t        usage;        /* reference count */
        int                ngroups;        /* presumably the number of entries in gid[] - confirm */
        kgid_t                gid[];        /* flexible array member holding the group IDs */
} __randomize_layout;

/**
 * get_group_info - Get a reference to a group info structure
 * @gi: The group info to reference
 *
 * This gets a reference to a set of supplementary groups.
 *
 * If the caller is accessing a task's credentials, they must hold the RCU read
 * lock when reading.
 *
 * Return: @gi, with its usage count incremented.
 */
static inline struct group_info *get_group_info(struct group_info *gi)
{
        refcount_inc(&gi->usage);
        return gi;
}

/**
 * put_group_info - Release a reference to a group info structure
 * @gi: The group info to release
 *
 * Drops one reference on @gi and calls groups_free() on it when that was the
 * last reference.
 *
 * groups_free() is only declared further down in this header, which a macro
 * tolerates because it is expanded at the point of use.  The argument is
 * captured in a local first so that it is evaluated exactly once, even when
 * the caller passes an expression with side effects.  The macro parameter is
 * deliberately not called "group_info", since that would also rewrite the
 * struct tag in the local's declaration.
 */
#define put_group_info(gi)                                \
do {                                                        \
        struct group_info *__put_gi = (gi);                \
                                                        \
        if (refcount_dec_and_test(&__put_gi->usage))        \
                groups_free(__put_gi);                        \
} while (0)

#ifdef CONFIG_MULTIUSER
extern struct group_info *groups_alloc(int);
extern void groups_free(struct group_info *);

extern int in_group_p(kgid_t);
extern int in_egroup_p(kgid_t);
extern int groups_search(const struct group_info *, kgid_t);

extern int set_current_groups(struct group_info *);
extern void set_groups(struct cred *, struct group_info *);
extern bool may_setgroups(void);
extern void groups_sort(struct group_info *);
#else
/*
 * Stubs for the #else branch of CONFIG_MULTIUSER (i.e. it is not set).
 */

/* Nothing to release in this configuration. */
static inline void groups_free(struct group_info *group_info)
{
}

/* Group membership checks unconditionally report success (1). */
static inline int in_group_p(kgid_t grp)
{
        return 1;
}
static inline int in_egroup_p(kgid_t grp)
{
        return 1;
}
/* Searching any group list for any gid unconditionally reports 1. */
static inline int groups_search(const struct group_info *group_info, kgid_t grp)
{
        return 1;
}
#endif

/*
 * The security context of a task
 *
 * The parts of the context break down into two categories:
 *
 *  (1) The objective context of a task.  These parts are used when some other
 *        task is attempting to affect this one.
 *
 *  (2) The subjective context.  These details are used when the task is acting
 *        upon another object, be that a file, a task, a key or whatever.
 *
 * Note that some members of this structure belong to both categories - the
 * LSM security pointer for instance.
 *
 * A task has two security pointers.  task->real_cred points to the objective
 * context that defines that task's actual details.  The objective part of this
 * context is used whenever that task is acted upon.
 *
 * task->cred points to the subjective context that defines the details of how
 * that task is going to act upon another object.  This may be overridden
 * temporarily to point to another security context, but normally points to the
 * same context as task->real_cred.
 */
struct cred {
        /* reference count: raised by get_cred*(), dropped by put_cred*() */
        atomic_long_t        usage;
        kuid_t                uid;                /* real UID of the task */
        kgid_t                gid;                /* real GID of the task */
        kuid_t                suid;                /* saved UID of the task */
        kgid_t                sgid;                /* saved GID of the task */
        kuid_t                euid;                /* effective UID of the task */
        kgid_t                egid;                /* effective GID of the task */
        kuid_t                fsuid;                /* UID for VFS ops */
        kgid_t                fsgid;                /* GID for VFS ops */
        unsigned        securebits;        /* SUID-less security management */
        kernel_cap_t        cap_inheritable; /* caps our children can inherit */
        kernel_cap_t        cap_permitted;        /* caps we're permitted */
        kernel_cap_t        cap_effective;        /* caps we can actually use */
        kernel_cap_t        cap_bset;        /* capability bounding set */
        /* must stay a subset of permitted & inheritable, see cap_ambient_invariant_ok() */
        kernel_cap_t        cap_ambient;        /* Ambient capability set */
#ifdef CONFIG_KEYS
        unsigned char        jit_keyring;        /* default keyring to attach requested
                                         * keys to */
        struct key        *session_keyring; /* keyring inherited over fork */
        struct key        *process_keyring; /* keyring private to this process */
        struct key        *thread_keyring; /* keyring private to this thread */
        struct key        *request_key_auth; /* assumed request_key authority */
#endif
#ifdef CONFIG_SECURITY
        void                *security;        /* LSM security */
#endif
        struct user_struct *user;        /* real user ID subscription */
        struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
        /* NOTE(review): undocumented here; presumably per-user resource counters - confirm */
        struct ucounts *ucounts;
        struct group_info *group_info;        /* supplementary groups for euid/fsgid */
        /* RCU deletion */
        union {
                /* cleared to 0 by get_cred_many() and get_cred_rcu() */
                int non_rcu;                        /* Can we skip RCU deletion? */
                struct rcu_head        rcu;                /* RCU deletion hook */
        };
} __randomize_layout;

extern void __put_cred(struct cred *);
extern void exit_creds(struct task_struct *);
extern int copy_creds(struct task_struct *, unsigned long);
extern const struct cred *get_task_cred(struct task_struct *);
extern struct cred *cred_alloc_blank(void);
extern struct cred *prepare_creds(void);
extern struct cred *prepare_exec_creds(void);
extern int commit_creds(struct cred *);
extern void abort_creds(struct cred *);
extern const struct cred *override_creds(const struct cred *);
extern void revert_creds(const struct cred *);
extern struct cred *prepare_kernel_cred(struct task_struct *);
extern int set_security_override(struct cred *, u32);
extern int set_security_override_from_ctx(struct cred *, const char *);
extern int set_create_files_as(struct cred *, struct inode *);
extern int cred_fscmp(const struct cred *, const struct cred *);
extern void __init cred_init(void);
extern int set_cred_ucounts(struct cred *);

static inline bool cap_ambient_invariant_ok(const struct cred *cred)
{
        return cap_issubset(cred->cap_ambient,
                            cap_intersect(cred->cap_permitted,
                                          cred->cap_inheritable));
}

/**
 * get_new_cred_many - Get references on a new set of credentials
 * @cred: The new credentials to reference
 * @nr: Number of references to acquire
 *
 * Get references on the specified set of new credentials.  The caller must
 * release all acquired references.
 *
 * Unlike get_cred_many(), @cred is not checked for %NULL and ->non_rcu is
 * left untouched.
 *
 * Return: @cred, with its usage count raised by @nr.
 */
static inline struct cred *get_new_cred_many(struct cred *cred, int nr)
{
        atomic_long_add(nr, &cred->usage);
        return cred;
}

/**
 * get_new_cred - Get a reference on a new set of credentials
 * @cred: The new credentials to reference
 *
 * Get a reference on the specified set of new credentials.  The caller must
 * release the reference.
 *
 * Return: @cred, with its usage count raised by one.
 */
static inline struct cred *get_new_cred(struct cred *cred)
{
        return get_new_cred_many(cred, 1);
}

/**
 * get_cred_many - Get references on a set of credentials
 * @cred: The credentials to reference
 * @nr: Number of references to acquire
 *
 * Get references on the specified set of credentials.  The caller must release
 * all acquired reference.  If %NULL is passed, it is returned with no action.
 *
 * This is used to deal with a committed set of credentials.  Although the
 * pointer is const, this will temporarily discard the const and increment the
 * usage count.  The purpose of this is to attempt to catch at compile time the
 * accidental alteration of a set of credentials that should be considered
 * immutable.
 */
static inline const struct cred *get_cred_many(const struct cred *cred, int nr)
{
        struct cred *nonconst_cred = (struct cred *) cred;
        if (!cred)
                return cred;
        nonconst_cred->non_rcu = 0;
        return get_new_cred_many(nonconst_cred, nr);
}

/**
 * get_cred - Get a reference on a set of credentials
 * @cred: The credentials to reference
 *
 * Get a reference on the specified set of credentials.  The caller must
 * release the reference.  If %NULL is passed, it is returned with no action.
 *
 * This is used to deal with a committed set of credentials.
 *
 * Return: @cred.
 */
static inline const struct cred *get_cred(const struct cred *cred)
{
        return get_cred_many(cred, 1);
}

/**
 * get_cred_rcu - Try to get a reference on a set of credentials
 * @cred: The credentials to reference
 *
 * Takes a reference only if the usage count is not already zero.  On success
 * ->non_rcu is cleared, as in get_cred_many().
 *
 * Return: @cred on success; %NULL if @cred is %NULL or its usage count was
 * zero.
 */
static inline const struct cred *get_cred_rcu(const struct cred *cred)
{
        struct cred *writable = (struct cred *)cred;

        if (!cred || !atomic_long_inc_not_zero(&writable->usage))
                return NULL;

        writable->non_rcu = 0;
        return cred;
}

/**
 * put_cred_many - Release references to a set of credentials
 * @_cred: The credentials to release
 * @nr: Number of references to release
 *
 * Release @nr references to a set of credentials, deleting them via
 * __put_cred() when the last ref is released.  If %NULL is passed, nothing
 * is done.
 *
 * This takes a const pointer to a set of credentials because the credentials
 * on task_struct are attached by const pointers to prevent accidental
 * alteration of otherwise immutable credential sets.
 */
static inline void put_cred_many(const struct cred *_cred, int nr)
{
        struct cred *cred = (struct cred *) _cred;

        if (!cred)
                return;

        if (atomic_long_sub_and_test(nr, &cred->usage))
                __put_cred(cred);
}

/**
 * put_cred - Release a reference to a set of credentials
 * @cred: The credentials to release
 *
 * Release a reference to a set of credentials, deleting them when the last ref
 * is released.  If %NULL is passed, nothing is done.
 *
 * Equivalent to put_cred_many(@cred, 1).
 */
static inline void put_cred(const struct cred *cred)
{
        put_cred_many(cred, 1);
}

/**
 * current_cred - Access the current task's subjective credentials
 *
 * Access the subjective credentials of the current task.  RCU-safe,
 * since nobody else can modify it.
 *
 * The constant 1 is the "protected" condition given to
 * rcu_dereference_protected(); it is always true, for the reason above.
 * No reference is taken; use get_current_cred() to pin the result.
 */
#define current_cred() \
        rcu_dereference_protected(current->cred, 1)

/**
 * current_real_cred - Access the current task's objective credentials
 *
 * Access the objective credentials of the current task.  RCU-safe,
 * since nobody else can modify it.
 *
 * Same pattern as current_cred(), applied to current->real_cred.
 */
#define current_real_cred() \
        rcu_dereference_protected(current->real_cred, 1)

/**
 * __task_cred - Access a task's objective credentials
 * @task: The task to query
 *
 * Access the objective credentials of a task.  The caller must hold the RCU
 * readlock.
 *
 * The result of this function should not be passed directly to get_cred();
 * rather get_task_cred() should be used instead.
 *
 * This is a plain rcu_dereference() of @task->real_cred; no reference is
 * taken.
 */
#define __task_cred(task)        \
        rcu_dereference((task)->real_cred)

/**
 * get_current_cred - Get the current task's subjective credentials
 *
 * Get the subjective credentials of the current task, pinning them so that
 * they can't go away.  Accessing the current task's credentials directly is
 * not permitted.
 *
 * This is get_cred() applied to current_cred(); the reference it takes is
 * released with put_cred().
 */
#define get_current_cred()                                \
        (get_cred(current_cred()))

/**
 * get_current_user - Get the current task's user_struct
 *
 * Get the user record of the current task, pinning it so that it can't go
 * away.
 *
 * Evaluates to the result of get_uid() on current_cred()->user.
 */
#define get_current_user()                                \
({                                                        \
        struct user_struct *__u;                        \
        const struct cred *__cred;                        \
        __cred = current_cred();                        \
        __u = get_uid(__cred->user);                        \
        __u;                                                \
})

/**
 * get_current_groups - Get the current task's supplementary group list
 *
 * Get the supplementary group list of the current task, pinning it so that it
 * can't go away.
 *
 * Evaluates to current_cred()->group_info after get_group_info() has taken a
 * reference on it; that reference is dropped with put_group_info().
 */
#define get_current_groups()                                \
({                                                        \
        struct group_info *__groups;                        \
        const struct cred *__cred;                        \
        __cred = current_cred();                        \
        __groups = get_group_info(__cred->group_info);        \
        __groups;                                        \
})

/*
 * task_cred_xxx - read one member of another task's objective credentials
 *
 * Takes the RCU read lock, copies member @xxx of __task_cred(@task) into a
 * local of the member's own type, drops the lock and evaluates to the copy.
 * Only the value is copied: for a pointer member such as ucounts, nothing is
 * pinned once the RCU read lock has been released.
 */
#define task_cred_xxx(task, xxx)                        \
({                                                        \
        __typeof__(((struct cred *)NULL)->xxx) ___val;        \
        rcu_read_lock();                                \
        ___val = __task_cred((task))->xxx;                \
        rcu_read_unlock();                                \
        ___val;                                                \
})

/* Convenience wrappers around task_cred_xxx() for individual members. */
#define task_uid(task)                (task_cred_xxx((task), uid))
#define task_euid(task)                (task_cred_xxx((task), euid))
#define task_ucounts(task)        (task_cred_xxx((task), ucounts))

/*
 * current_cred_xxx - read one member of the current task's subjective
 * credentials
 *
 * Evaluates to member @xxx of current_cred().  No locking is done and no
 * reference is taken.
 */
#define current_cred_xxx(xxx)                        \
({                                                \
        current_cred()->xxx;                        \
})

/* Convenience wrappers around current_cred_xxx() for individual members. */
#define current_uid()                (current_cred_xxx(uid))
#define current_gid()                (current_cred_xxx(gid))
#define current_euid()                (current_cred_xxx(euid))
#define current_egid()                (current_cred_xxx(egid))
#define current_suid()                (current_cred_xxx(suid))
#define current_sgid()                (current_cred_xxx(sgid))
#define current_fsuid()         (current_cred_xxx(fsuid))
#define current_fsgid()         (current_cred_xxx(fsgid))
#define current_cap()                (current_cred_xxx(cap_effective))
#define current_user()                (current_cred_xxx(user))
#define current_ucounts()        (current_cred_xxx(ucounts))

/*
 * current_user_ns - user namespace of the current task
 *
 * With CONFIG_USER_NS this is the user_ns member of the current task's
 * subjective credentials; without it, it is always &init_user_ns.
 */
extern struct user_namespace init_user_ns;
#ifdef CONFIG_USER_NS
#define current_user_ns()        (current_cred_xxx(user_ns))
#else
static inline struct user_namespace *current_user_ns(void)
{
        return &init_user_ns;
}
#endif


/*
 * The three macros below each fetch a UID/GID pair from the current task's
 * subjective credentials with a single current_cred() lookup and store the
 * two values through the pointers passed as arguments.
 */

/* Store the real UID in *(_uid) and the real GID in *(_gid). */
#define current_uid_gid(_uid, _gid)                \
do {                                                \
        const struct cred *__cred;                \
        __cred = current_cred();                \
        *(_uid) = __cred->uid;                        \
        *(_gid) = __cred->gid;                        \
} while(0)

/* Store the effective UID in *(_euid) and the effective GID in *(_egid). */
#define current_euid_egid(_euid, _egid)                \
do {                                                \
        const struct cred *__cred;                \
        __cred = current_cred();                \
        *(_euid) = __cred->euid;                \
        *(_egid) = __cred->egid;                \
} while(0)

/* Store the VFS UID in *(_fsuid) and the VFS GID in *(_fsgid). */
#define current_fsuid_fsgid(_fsuid, _fsgid)        \
do {                                                \
        const struct cred *__cred;                \
        __cred = current_cred();                \
        *(_fsuid) = __cred->fsuid;                \
        *(_fsgid) = __cred->fsgid;                \
} while(0)

#endif /* _LINUX_CRED_H */
































































































































































































































    3 





























































































































    1 
















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * NET                Generic infrastructure for INET connection oriented protocols.
 *
 *                Definitions for inet_connection_sock 
 *
 * Authors:        Many people, see the TCP sources
 *
 *                 From code originally in TCP
 */
#ifndef _INET_CONNECTION_SOCK_H
#define _INET_CONNECTION_SOCK_H

#include <linux/compiler.h>
#include <linux/string.h>
#include <linux/timer.h>
#include <linux/poll.h>
#include <linux/kernel.h>
#include <linux/sockptr.h>

#include <net/inet_sock.h>
#include <net/request_sock.h>

/* Cancel timers, when they are not required. */
#undef INET_CSK_CLEAR_TIMERS

struct inet_bind_bucket;
struct inet_bind2_bucket;
struct tcp_congestion_ops;

/*
 * Pointers to address related TCP functions
 * (i.e. things that depend on the address family)
 *
 * A connection socket refers to its table through the icsk_af_ops member of
 * struct inet_connection_sock, which is a pointer to const.
 */
struct inet_connection_sock_af_ops {
        int            (*queue_xmit)(struct sock *sk, struct sk_buff *skb, struct flowi *fl);
        void            (*send_check)(struct sock *sk, struct sk_buff *skb);
        int            (*rebuild_header)(struct sock *sk);
        void            (*sk_rx_dst_set)(struct sock *sk, const struct sk_buff *skb);
        int            (*conn_request)(struct sock *sk, struct sk_buff *skb);
        struct sock *(*syn_recv_sock)(const struct sock *sk, struct sk_buff *skb,
                                      struct request_sock *req,
                                      struct dst_entry *dst,
                                      struct request_sock *req_unhash,
                                      bool *own_req);
        /* NOTE(review): presumably sizes in bytes for this address family - confirm */
        u16            net_header_len;
        u16            sockaddr_len;
        int            (*setsockopt)(struct sock *sk, int level, int optname,
                                  sockptr_t optval, unsigned int optlen);
        int            (*getsockopt)(struct sock *sk, int level, int optname,
                                  char __user *optval, int __user *optlen);
        void            (*addr2sockaddr)(struct sock *sk, struct sockaddr *);
        void            (*mtu_reduced)(struct sock *sk);
};

/**
 * struct inet_connection_sock - INET connection oriented sock
 *
 * @icsk_inet:             Embedded inet_sock; has to be the first member
 * @icsk_accept_queue:     FIFO of established children
 * @icsk_bind_hash:        Bind node
 * @icsk_bind2_hash:       Bind node in the bhash2 table
 * @icsk_timeout:          Timeout
 * @icsk_retransmit_timer: Resend (no ack)
 * @icsk_delack_timer:     Presumably the delayed ACK timer (cf. ICSK_TIME_DACK)
 * @icsk_rto:              Retransmit timeout
 * @icsk_rto_min:          Presumably the lower bound for @icsk_rto - confirm
 * @icsk_delack_max:       Presumably the upper bound for the delayed ACK
 *                         timeout - confirm
 * @icsk_pmtu_cookie:      Last pmtu seen by socket
 * @icsk_ca_ops:           Pluggable congestion control hook
 * @icsk_af_ops:           Operations which are AF_INET{4,6} specific
 * @icsk_ulp_ops:          Pluggable ULP control hook
 * @icsk_ulp_data:         ULP private data
 * @icsk_clean_acked:      Clean acked data hook
 * @icsk_sync_mss:         Hook taking the socket and a pmtu; presumably
 *                         recomputes the MSS - confirm
 * @icsk_ca_state:         Congestion control state
 * @icsk_ca_initialized:   NOTE(review): undocumented flag bit - confirm meaning
 * @icsk_ca_setsockopt:    NOTE(review): undocumented flag bit - confirm meaning
 * @icsk_ca_dst_locked:    NOTE(review): undocumented flag bit - confirm meaning
 * @icsk_retransmits:      Number of unrecovered [RTO] timeouts
 * @icsk_pending:          Scheduled timer event
 * @icsk_backoff:          Backoff
 * @icsk_syn_retries:      Number of allowed SYN (or equivalent) retries
 * @icsk_probes_out:       unanswered 0 window probes
 * @icsk_ext_hdr_len:      Network protocol overhead (IP/IPv6 options)
 * @icsk_ack:              Delayed ACK control data
 * @icsk_mtup:             MTU probing control data
 * @icsk_probes_tstamp:    Probe timestamp (cleared by non-zero window ack)
 * @icsk_user_timeout:     TCP_USER_TIMEOUT value
 * @icsk_ca_priv:          Private area handed out by inet_csk_ca();
 *                         ICSK_CA_PRIV_SIZE bytes
 */
struct inet_connection_sock {
        /* inet_sock has to be the first member! */
        struct inet_sock          icsk_inet;
        struct request_sock_queue icsk_accept_queue;
        struct inet_bind_bucket          *icsk_bind_hash;
        struct inet_bind2_bucket  *icsk_bind2_hash;
        unsigned long                  icsk_timeout;
         struct timer_list          icsk_retransmit_timer;
         struct timer_list          icsk_delack_timer;
        __u32                          icsk_rto;
        __u32                     icsk_rto_min;
        __u32                     icsk_delack_max;
        __u32                          icsk_pmtu_cookie;
        const struct tcp_congestion_ops *icsk_ca_ops;
        const struct inet_connection_sock_af_ops *icsk_af_ops;
        const struct tcp_ulp_ops  *icsk_ulp_ops;
        void __rcu                  *icsk_ulp_data;
        void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
        unsigned int                  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
        /* 5 + 1 + 1 + 1 bits: exactly fills one __u8 */
        __u8                          icsk_ca_state:5,
                                  icsk_ca_initialized:1,
                                  icsk_ca_setsockopt:1,
                                  icsk_ca_dst_locked:1;
        __u8                          icsk_retransmits;
        __u8                          icsk_pending;
        __u8                          icsk_backoff;
        __u8                          icsk_syn_retries;
        __u8                          icsk_probes_out;
        __u16                          icsk_ext_hdr_len;
        /* zeroed as a whole by inet_csk_delack_init() */
        struct {
                __u8                  pending;         /* ACK is pending                           */
                __u8                  quick;         /* Scheduled number of quick acks           */
                __u8                  pingpong;         /* The session is interactive                   */
                __u8                  retry;         /* Number of attempts                           */
                /* ato + lrcv_flowlabel + unused = 8 + 20 + 4 = 32 bits */
                #define ATO_BITS 8
                __u32                  ato:ATO_BITS,         /* Predicted tick of soft clock           */
                                  lrcv_flowlabel:20, /* last received ipv6 flowlabel           */
                                  unused:4;
                unsigned long          timeout;         /* Currently scheduled timeout                   */
                __u32                  lrcvtime;         /* timestamp of last received data packet */
                __u16                  last_seg_size; /* Size of last incoming segment           */
                __u16                  rcv_mss;         /* MSS used for delayed ACK decisions           */
        } icsk_ack;
        struct {
                /* Range of MTUs to search */
                int                  search_high;
                int                  search_low;

                /* Information on the current probe. */
                u32                  probe_size:31,
                /* Is the MTUP feature enabled for this connection? */
                                  enabled:1;

                u32                  probe_timestamp;
        } icsk_mtup;
        u32                          icsk_probes_tstamp;
        u32                          icsk_user_timeout;

        /* 104 bytes, stored as u64 elements (13 of them) */
        u64                          icsk_ca_priv[104 / sizeof(u64)];
#define ICSK_CA_PRIV_SIZE          sizeof_field(struct inet_connection_sock, icsk_ca_priv)
};

/*
 * Timer selectors passed as the 'what' argument of
 * inet_csk_reset_xmit_timer() and inet_csk_clear_xmit_timer().
 * inet_csk_reset_xmit_timer() arms icsk_retransmit_timer for RETRANS,
 * PROBE0, LOSS_PROBE and REO_TIMEOUT, and icsk_delack_timer for DACK.
 * The value 4 is not defined in this list.
 */
#define ICSK_TIME_RETRANS        1        /* Retransmit timer */
#define ICSK_TIME_DACK                2        /* Delayed ack timer */
#define ICSK_TIME_PROBE0        3        /* Zero window probe timer */
#define ICSK_TIME_LOSS_PROBE        5        /* Tail loss probe timer */
#define ICSK_TIME_REO_TIMEOUT        6        /* Reordering timer */

/*
 * Map a pointer to the embedded icsk_inet.sk member back to the
 * struct inet_connection_sock that contains it.
 * NOTE(review): container_of_const() presumably keeps the constness of
 * @ptr in the returned pointer - confirm against its definition.
 */
#define inet_csk(ptr) container_of_const(ptr, struct inet_connection_sock, icsk_inet.sk)

static inline void *inet_csk_ca(const struct sock *sk)
{
        return (void *)inet_csk(sk)->icsk_ca_priv;
}

struct sock *inet_csk_clone_lock(const struct sock *sk,
                                 const struct request_sock *req,
                                 const gfp_t priority);

/*
 * Single-bit flags; ICSK_ACK_SCHED and ICSK_ACK_TIMER are OR-ed into
 * icsk_ack.pending by the helpers in this header.
 */
enum inet_csk_ack_state_t {
        ICSK_ACK_SCHED   = 1 << 0,
        ICSK_ACK_TIMER   = 1 << 1,
        ICSK_ACK_PUSHED  = 1 << 2,
        ICSK_ACK_PUSHED2 = 1 << 3,
        ICSK_ACK_NOW     = 1 << 4,        /* Send the next ACK immediately (once) */
        ICSK_ACK_NOMEM   = 1 << 5,
};

void inet_csk_init_xmit_timers(struct sock *sk,
                               void (*retransmit_handler)(struct timer_list *),
                               void (*delack_handler)(struct timer_list *),
                               void (*keepalive_handler)(struct timer_list *));
void inet_csk_clear_xmit_timers(struct sock *sk);
void inet_csk_clear_xmit_timers_sync(struct sock *sk);

static inline void inet_csk_schedule_ack(struct sock *sk)
{
        inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_SCHED;
}

static inline int inet_csk_ack_scheduled(const struct sock *sk)
{
        return inet_csk(sk)->icsk_ack.pending & ICSK_ACK_SCHED;
}

static inline void inet_csk_delack_init(struct sock *sk)
{
        memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack));
}

void inet_csk_delete_keepalive_timer(struct sock *sk);
void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout);

/*
 * Clear the bookkeeping for one transmit-side timer of @sk.
 *
 * @what selects the timer: ICSK_TIME_RETRANS and ICSK_TIME_PROBE0 reset
 * icsk_pending, ICSK_TIME_DACK resets the delayed-ACK pending flags and
 * retry counter.  The timers themselves are only stopped when
 * INET_CSK_CLEAR_TIMERS is defined.  Any other value is reported via
 * pr_debug() and otherwise ignored.
 */
static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        switch (what) {
        case ICSK_TIME_RETRANS:
        case ICSK_TIME_PROBE0:
                icsk->icsk_pending = 0;
#ifdef INET_CSK_CLEAR_TIMERS
                sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
#endif
                break;
        case ICSK_TIME_DACK:
                icsk->icsk_ack.pending = 0;
                icsk->icsk_ack.retry = 0;
#ifdef INET_CSK_CLEAR_TIMERS
                sk_stop_timer(sk, &icsk->icsk_delack_timer);
#endif
                break;
        default:
                pr_debug("inet_csk BUG: unknown timer value\n");
                break;
        }
}

/*
 *        (Re)arm one of the transmit-side timers of @sk.
 *
 *        @what selects the timer, @when is the delay in jiffies relative to
 *        now, and @max_when caps that delay (an over-long request is logged
 *        with pr_debug() and clamped).  Retransmit, zero-window probe, loss
 *        probe and reordering all use icsk_retransmit_timer; the delayed-ACK
 *        timer is separate.  Unknown selectors are only logged.
 */
static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
                                             unsigned long when,
                                             const unsigned long max_when)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        if (when > max_when) {
                pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n",
                         sk, what, when, (void *)_THIS_IP_);
                when = max_when;
        }

        switch (what) {
        case ICSK_TIME_RETRANS:
        case ICSK_TIME_PROBE0:
        case ICSK_TIME_LOSS_PROBE:
        case ICSK_TIME_REO_TIMEOUT:
                /* Remember which event the shared retransmit timer stands for. */
                icsk->icsk_pending = what;
                icsk->icsk_timeout = jiffies + when;
                sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
                break;
        case ICSK_TIME_DACK:
                icsk->icsk_ack.pending |= ICSK_ACK_TIMER;
                icsk->icsk_ack.timeout = jiffies + when;
                sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
                break;
        default:
                pr_debug("inet_csk BUG: unknown timer value\n");
                break;
        }
}

static inline unsigned long
inet_csk_rto_backoff(const struct inet_connection_sock *icsk,
                     unsigned long max_when)
{
        u64 when = (u64)icsk->icsk_rto << icsk->icsk_backoff;

        return (unsigned long)min_t(u64, when, max_when);
}

struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg);

int inet_csk_get_port(struct sock *sk, unsigned short snum);

struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4,
                                     const struct request_sock *req);
struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
                                            struct sock *newsk,
                                            const struct request_sock *req);

struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
                                      struct request_sock *req,
                                      struct sock *child);
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
                                   unsigned long timeout);
struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
                                         struct request_sock *req,
                                         bool own_req);

static inline void inet_csk_reqsk_queue_added(struct sock *sk)
{
        reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);
}

static inline int inet_csk_reqsk_queue_len(const struct sock *sk)
{
        return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
}

/*
 * Return non-zero when the request queue length of @sk has reached
 * sk_max_ack_backlog (read once, without further synchronisation here).
 */
static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
{
        const int queued = inet_csk_reqsk_queue_len(sk);

        return queued >= READ_ONCE(sk->sk_max_ack_backlog);
}

bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req);
void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req);

static inline unsigned long
reqsk_timeout(struct request_sock *req, unsigned long max_timeout)
{
        u64 timeout = (u64)req->timeout << req->num_timeout;

        return (unsigned long)min_t(u64, timeout, max_timeout);
}

/*
 * Put @sk into the state inet_csk_destroy_sock() requires: flag it
 * SOCK_DEAD and bump the per-CPU orphan counter of its protocol.
 */
static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk)
{
        /* The below has to be done to allow calling inet_csk_destroy_sock */
        sock_set_flag(sk, SOCK_DEAD);
        /* orphan_count is reached through sk->sk_prot and updated on this CPU. */
        this_cpu_inc(*sk->sk_prot->orphan_count);
}

void inet_csk_destroy_sock(struct sock *sk);
void inet_csk_prepare_forced_close(struct sock *sk);

/*
 * LISTEN is a special case for poll..
 */
static inline __poll_t inet_csk_listen_poll(const struct sock *sk)
{
        return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ?
                        (EPOLLIN | EPOLLRDNORM) : 0;
}

int inet_csk_listen_start(struct sock *sk);
void inet_csk_listen_stop(struct sock *sk);

void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr);

/* update the fast reuse flag when adding a socket */
void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
                               struct sock *sk);

struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);

static inline void inet_csk_enter_pingpong_mode(struct sock *sk)
{
        inet_csk(sk)->icsk_ack.pingpong =
                READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh);
}

static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
{
        inet_csk(sk)->icsk_ack.pingpong = 0;
}

static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
{
        return inet_csk(sk)->icsk_ack.pingpong >=
               READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh);
}

/* Increment the pingpong counter of @sk, saturating at U8_MAX. */
static inline void inet_csk_inc_pingpong_cnt(struct sock *sk)
{
        struct inet_connection_sock *csk = inet_csk(sk);

        /* Already saturated: leave the counter alone instead of wrapping. */
        if (csk->icsk_ack.pingpong >= U8_MAX)
                return;

        csk->icsk_ack.pingpong++;
}

/*
 * True when @sk has the IS_ICSK bit set and a non-NULL icsk_ulp_ops.
 * The ULP pointer is only looked at after the IS_ICSK test succeeded.
 */
static inline bool inet_csk_has_ulp(const struct sock *sk)
{
        if (!inet_test_bit(IS_ICSK, sk))
                return false;

        return inet_csk(sk)->icsk_ulp_ops != NULL;
}

/*
 * Initialise the two spinlocks embedded in the accept queue of @sk:
 * the queue's own rskq_lock and the lock of its fastopenq member.
 */
static inline void inet_init_csk_locks(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        spin_lock_init(&icsk->icsk_accept_queue.rskq_lock);
        spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock);
}

#endif /* _INET_CONNECTION_SOCK_H */













    1 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BSEARCH_H
#define _LINUX_BSEARCH_H

#include <linux/types.h>

/*
 * Binary search over @num elements of @size bytes starting at @base.
 *
 * @cmp is called as cmp(key, element); a zero result ends the search and
 * the matching element is returned, a positive result continues in the
 * upper half, a negative one in the lower half.  Returns NULL when no
 * element compares equal.
 */
static __always_inline
void *__inline_bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
        for (; num > 0; num >>= 1) {
                const char *mid = base + (num >> 1) * size;
                int order = cmp(key, mid);

                if (!order)
                        return (void *)mid;

                /* Key sorts before mid: keep base, halve the window. */
                if (order < 0)
                        continue;

                /* Key sorts after mid: skip past it before halving. */
                base = mid + size;
                num--;
        }

        return NULL;
}

extern void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp);

#endif /* _LINUX_BSEARCH_H */

























































































































































   14 



   16 
   16 

   15 












   13 



   15 
   15 


   18 

   16 



















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
// SPDX-License-Identifier: GPL-2.0-only
/*
 * AppArmor security module
 *
 * This file contains AppArmor network mediation
 *
 * Copyright (C) 1998-2008 Novell/SUSE
 * Copyright 2009-2017 Canonical Ltd.
 */

#include "include/apparmor.h"
#include "include/audit.h"
#include "include/cred.h"
#include "include/label.h"
#include "include/net.h"
#include "include/policy.h"
#include "include/secid.h"

#include "net_names.h"


/*
 * Table of network-related aa_sfs entries: a single "af_mask" string
 * entry carrying AA_SFS_AF_MASK, followed by an empty terminating entry.
 */
struct aa_sfs_entry aa_sfs_entry_network[] = {
        AA_SFS_FILE_STRING("af_mask",        AA_SFS_AF_MASK),
        { }
};

/*
 * Permission names handed to aa_audit_perm_mask() together with
 * NET_PERMS_MASK by audit_net_cb().  The table has 32 slots.
 * NOTE(review): the slot index presumably corresponds to the permission
 * bit number - confirm against aa_audit_perm_mask().
 */
static const char * const net_mask_names[] = {
        [0]  = "unknown",
        [1]  = "send",
        [2]  = "receive",
        [3]  = "unknown",

        [4]  = "create",
        [5]  = "shutdown",
        [6]  = "connect",
        [7]  = "unknown",

        [8]  = "setattr",
        [9]  = "getattr",
        [10] = "setcred",
        [11] = "getcred",

        [12] = "chmod",
        [13] = "chown",
        [14] = "chgrp",
        [15] = "lock",

        [16] = "mmap",
        [17] = "mprot",
        [18] = "unknown",
        [19] = "unknown",

        [20] = "accept",
        [21] = "bind",
        [22] = "listen",
        [23] = "unknown",

        [24] = "setopt",
        [25] = "getopt",
        [26] = "unknown",
        [27] = "unknown",

        [28] = "unknown",
        [29] = "unknown",
        [30] = "unknown",
        [31] = "unknown",
};


/*
 * Audit callback emitting the network-specific fields of a record:
 * address family, socket type, protocol, the requested/denied permission
 * masks (only when they intersect NET_PERMS_MASK) and the peer label.
 */
void audit_net_cb(struct audit_buffer *ab, void *va)
{
        struct common_audit_data *sa = va;
        struct apparmor_audit_data *ad = aad(sa);
        const char *name;

        /* Families without a name entry are reported numerically. */
        name = address_family_names[sa->u.net->family];
        if (name)
                audit_log_format(ab, " family=\"%s\"", name);
        else
                audit_log_format(ab, " family=\"unknown(%d)\"",
                                 sa->u.net->family);

        /* Same fallback for socket types. */
        name = sock_type_names[ad->net.type];
        if (name)
                audit_log_format(ab, " sock_type=\"%s\"", name);
        else
                audit_log_format(ab, " sock_type=\"unknown(%d)\"",
                                 ad->net.type);

        audit_log_format(ab, " protocol=%d", ad->net.protocol);

        if (ad->request & NET_PERMS_MASK) {
                audit_log_format(ab, " requested_mask=");
                aa_audit_perm_mask(ab, ad->request, NULL, 0,
                                   net_mask_names, NET_PERMS_MASK);

                /* The denied mask is only reported alongside a requested mask. */
                if (ad->denied & NET_PERMS_MASK) {
                        audit_log_format(ab, " denied_mask=");
                        aa_audit_perm_mask(ab, ad->denied, NULL, 0,
                                           net_mask_names, NET_PERMS_MASK);
                }
        }

        if (!ad->peer)
                return;

        audit_log_format(ab, " peer=");
        aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->peer,
                        FLAGS_NONE, GFP_ATOMIC);
}

/*
 * Generic address-family permission check for one profile.
 *
 * Unconfined profiles, and profiles whose first ruleset does not mediate
 * AA_CLASS_NET, are allowed (0).  Otherwise the policy DFA is walked over
 * the big-endian 16-bit @family followed by the big-endian 16-bit @type,
 * and the resulting permissions are checked against @request, with
 * audit_net_cb() as the audit callback.
 */
int aa_profile_af_perm(struct aa_profile *profile,
                       struct apparmor_audit_data *ad, u32 request, u16 family,
                       int type)
{
        struct aa_ruleset *ruleset = list_first_entry(&profile->rules,
                                                      typeof(*ruleset), list);
        struct aa_perms perms = { };
        aa_state_t start, matched;
        __be16 key[2];

        AA_BUG(family >= AF_MAX);
        AA_BUG(type < 0 || type >= SOCK_MAX);

        if (profile_unconfined(profile))
                return 0;

        start = RULE_MEDIATES(ruleset, AA_CLASS_NET);
        if (!start)
                return 0;

        /* Match key: family then type, two bytes each, big-endian. */
        key[0] = cpu_to_be16(family);
        key[1] = cpu_to_be16((u16) type);
        matched = aa_dfa_match_len(ruleset->policy->dfa, start, (char *) &key,
                                   sizeof(key));

        perms = *aa_lookup_perms(ruleset->policy, matched);
        aa_apply_modes_to_perms(profile, &perms);

        return aa_check_perms(profile, &perms, request, ad, audit_net_cb);
}

/*
 * Check @request for a socket described by @family/@type/@protocol
 * against every confined profile in @label.
 *
 * @subj_cred: credentials of the subject, recorded in the audit data
 * @label:     label whose confined profiles are checked
 * @op:        operation name used in audit records
 *
 * Returns the result of fn_for_each_confined() over
 * aa_profile_af_perm().
 */
int aa_af_perm(const struct cred *subj_cred, struct aa_label *label,
               const char *op, u32 request, u16 family, int type, int protocol)
{
        struct aa_profile *profile;
        DEFINE_AUDIT_NET(ad, op, NULL, family, type, protocol);

        /*
         * Record the subject credentials the same way aa_label_sk_perm()
         * does; previously the parameter was accepted but never used.
         */
        ad.subj_cred = subj_cred;

        return fn_for_each_confined(label, profile,
                        aa_profile_af_perm(profile, &ad, request, family,
                                           type));
}

/*
 * Check @request on @sk against every confined profile in @label.
 *
 * Nothing is checked (0 is returned) when the socket's own label is
 * kernel_t or when @label is unconfined.  Otherwise returns the result
 * of fn_for_each_confined() over aa_profile_af_sk_perm().
 */
static int aa_label_sk_perm(const struct cred *subj_cred,
                            struct aa_label *label,
                            const char *op, u32 request,
                            struct sock *sk)
{
        struct aa_sk_ctx *ctx;
        int error = 0;

        AA_BUG(!label);
        AA_BUG(!sk);

        /* Fetch the socket context only after @sk has been sanity-checked. */
        ctx = SK_CTX(sk);

        if (ctx->label != kernel_t && !unconfined(label)) {
                struct aa_profile *profile;
                DEFINE_AUDIT_SK(ad, op, sk);

                ad.subj_cred = subj_cred;
                error = fn_for_each_confined(label, profile,
                            aa_profile_af_sk_perm(profile, &ad, request, sk));
        }

        return error;
}

/*
 * Check @request on @sk for the current task: the check runs with the
 * current task's credentials and label, inside a current-label critical
 * section.  Must not be called from interrupt context (asserted).
 */
int aa_sk_perm(const char *op, u32 request, struct sock *sk)
{
        struct aa_label *cur;
        int rc;

        AA_BUG(!sk);
        AA_BUG(in_interrupt());

        /* TODO: switch to begin_current_label ???? */
        cur = begin_current_label_crit_section();
        rc = aa_label_sk_perm(current_cred(), cur, op, request, sk);
        end_current_label_crit_section(cur);

        return rc;
}


/*
 * Socket-file flavour of the sk permission check: validates its
 * arguments and forwards to aa_label_sk_perm() with sock->sk.
 */
int aa_sock_file_perm(const struct cred *subj_cred, struct aa_label *label,
                      const char *op, u32 request, struct socket *sock)
{
        struct sock *sk;

        AA_BUG(!label);
        AA_BUG(!sock);
        AA_BUG(!sock->sk);

        sk = sock->sk;

        return aa_label_sk_perm(subj_cred, label, op, request, sk);
}

#ifdef CONFIG_NETWORK_SECMARK
/*
 * Resolve the textual label of @secmark into a secid.
 *
 * A label starting with '*' maps to AA_SECID_WILDCARD.  Anything else is
 * parsed relative to the root namespace's unconfined label; a parse
 * failure is returned as a negative errno, success as 0.
 */
static int apparmor_secmark_init(struct aa_secmark *secmark)
{
        struct aa_label *parsed;

        if (secmark->label[0] == '*') {
                secmark->secid = AA_SECID_WILDCARD;
                return 0;
        }

        parsed = aa_label_strn_parse(&root_ns->unconfined->label,
                                     secmark->label, strlen(secmark->label),
                                     GFP_ATOMIC, false, false);
        if (IS_ERR(parsed))
                return PTR_ERR(parsed);

        secmark->secid = parsed->secid;

        return 0;
}

/*
 * Check @request for a packet carrying @secid against the secmark rules
 * of @profile's first ruleset.
 *
 * With no secmark rules the request is allowed (0).  Each rule whose
 * secid is still unset is resolved first; a resolution error is returned
 * as is.  Every rule matching @secid, or being a wildcard, contributes
 * allow or deny (and optionally audit) for all permissions, and the
 * accumulated result is then checked with audit_net_cb() as callback.
 */
static int aa_secmark_perm(struct aa_profile *profile, u32 request, u32 secid,
                           struct apparmor_audit_data *ad)
{
        struct aa_ruleset *ruleset = list_first_entry(&profile->rules,
                                                      typeof(*ruleset), list);
        struct aa_perms perms = { };
        int idx, err;

        if (!ruleset->secmark_count)
                return 0;

        for (idx = 0; idx < ruleset->secmark_count; idx++) {
                struct aa_secmark *mark = &ruleset->secmark[idx];

                /* Resolve the rule's label on first use. */
                if (!mark->secid) {
                        err = apparmor_secmark_init(mark);
                        if (err)
                                return err;
                }

                if (mark->secid != secid &&
                    mark->secid != AA_SECID_WILDCARD)
                        continue;

                if (mark->deny)
                        perms.deny = ALL_PERMS_MASK;
                else
                        perms.allow = ALL_PERMS_MASK;

                if (mark->audit)
                        perms.audit = ALL_PERMS_MASK;
        }

        aa_apply_modes_to_perms(profile, &perms);

        return aa_check_perms(profile, &perms, request, ad, audit_net_cb);
}

/*
 * Run aa_secmark_perm() for @request/@secid on every confined profile in
 * @label, using audit data built from @op and @sk, and return the
 * result of fn_for_each_confined().
 */
int apparmor_secmark_check(struct aa_label *label, char *op, u32 request,
                           u32 secid, const struct sock *sk)
{
        struct aa_profile *profile;
        DEFINE_AUDIT_SK(ad, op, sk);

        return fn_for_each_confined(label, profile,
                                    aa_secmark_perm(profile, request, secid,
                                                    &ad));
}
#endif
















































































































































   24 






   24 



































































































































































































































































































































































































































   24 

































































































































































































































































































































































































































































































































































































































   24 



















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_CPUMASK_H
#define __LINUX_CPUMASK_H

/*
 * Cpumasks provide a bitmap suitable for representing the
 * set of CPUs in a system, one bit position per CPU number.  In general,
 * only nr_cpu_ids (<= NR_CPUS) bits are valid.
 */
#include <linux/cleanup.h>
#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/bitmap.h>
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/gfp_types.h>
#include <linux/numa.h>

/* Don't assign or return these: may not be this big! */
typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;

/**
 * cpumask_bits - get the bits in a cpumask
 * @maskp: the struct cpumask *
 *
 * You should only assume nr_cpu_ids bits of this mask are valid.  This is
 * a macro so it's const-correct.
 */
#define cpumask_bits(maskp) ((maskp)->bits)

/**
 * cpumask_pr_args - printf args to output a cpumask
 * @maskp: cpumask to be printed
 *
 * Can be used to provide arguments for '%*pb[l]' when printing a cpumask.
 */
#define cpumask_pr_args(maskp)                nr_cpu_ids, cpumask_bits(maskp)

/*
 * nr_cpu_ids is the compile-time constant NR_CPUS on single-CPU builds
 * and when CONFIG_FORCE_NR_CPUS is set; otherwise it is a variable
 * defined elsewhere.
 */
#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS)
#define nr_cpu_ids ((unsigned int)NR_CPUS)
#else
extern unsigned int nr_cpu_ids;
#endif

/*
 * Set nr_cpu_ids to @nr.  When nr_cpu_ids is a compile-time constant
 * (single-CPU build or CONFIG_FORCE_NR_CPUS) nothing can be stored, so a
 * mismatching @nr only triggers a warning.
 */
static inline void set_nr_cpu_ids(unsigned int nr)
{
#if (NR_CPUS != 1) && !defined(CONFIG_FORCE_NR_CPUS)
        nr_cpu_ids = nr;
#else
        WARN_ON(nr != nr_cpu_ids);
#endif
}

/*
 * We have several different "preferred sizes" for the cpumask
 * operations, depending on operation.
 *
 * For example, the bitmap scanning and operating operations have
 * optimized routines that work for the single-word case, but only when
 * the size is constant. So if NR_CPUS fits in one single word, we are
 * better off using that small constant, in order to trigger the
 * optimized bit finding. That is 'small_cpumask_size'.
 *
 * The clearing and copying operations will similarly perform better
 * with a constant size, but we limit that size arbitrarily to four
 * words. We call this 'large_cpumask_size'.
 *
 * Finally, some operations just want the exact limit, either because
 * they set bits or just don't have any faster fixed-sized versions. We
 * call this just 'nr_cpumask_bits'.
 *
 * Note that these optional constants are always guaranteed to be at
 * least as big as 'nr_cpu_ids' itself is, and all our cpumask
 * allocations are at least that size (see cpumask_size()). The
 * optimization comes from being able to potentially use a compile-time
 * constant instead of a run-time generated exact number of CPUs.
 */
#if NR_CPUS <= BITS_PER_LONG
  #define small_cpumask_bits ((unsigned int)NR_CPUS)
  #define large_cpumask_bits ((unsigned int)NR_CPUS)
#elif NR_CPUS <= 4*BITS_PER_LONG
  #define small_cpumask_bits nr_cpu_ids
  #define large_cpumask_bits ((unsigned int)NR_CPUS)
#else
  #define small_cpumask_bits nr_cpu_ids
  #define large_cpumask_bits nr_cpu_ids
#endif
#define nr_cpumask_bits nr_cpu_ids

/*
 * The following particular system cpumasks and operations manage
 * possible, present, active and online cpus.
 *
 *     cpu_possible_mask- has bit 'cpu' set iff cpu is populatable
 *     cpu_present_mask - has bit 'cpu' set iff cpu is populated
 *     cpu_online_mask  - has bit 'cpu' set iff cpu available to scheduler
 *     cpu_active_mask  - has bit 'cpu' set iff cpu available to migration
 *
 *  If !CONFIG_HOTPLUG_CPU, present == possible, and active == online.
 *
 *  The cpu_possible_mask is fixed at boot time, as the set of CPU IDs
 *  that could possibly ever be plugged in at any time during the
 *  life of that system boot.  The cpu_present_mask is dynamic(*),
 *  representing which CPUs are currently plugged in.  And
 *  cpu_online_mask is the dynamic subset of cpu_present_mask,
 *  indicating those CPUs available for scheduling.
 *
 *  If HOTPLUG is enabled, then cpu_present_mask varies dynamically,
 *  depending on what ACPI reports as currently plugged in, otherwise
 *  cpu_present_mask is just a copy of cpu_possible_mask.
 *
 *  (*) Well, cpu_present_mask is dynamic in the hotplug case.  If not
 *      hotplug, it's a copy of cpu_possible_mask, hence fixed at boot.
 *
 * Subtleties:
 * 1) UP ARCHes (NR_CPUS == 1, CONFIG_SMP not defined) hardcode
 *    assumption that their single CPU is online.  The UP
 *    cpu_{online,possible,present}_masks are placebos.  Changing them
 *    will have no useful effect on the following num_*_cpus()
 *    and cpu_*() macros in the UP case.  This ugliness is a UP
 *    optimization - don't waste any instructions or memory references
 *    asking if you're online or how many CPUs there are if there is
 *    only one CPU.
 */

extern struct cpumask __cpu_possible_mask;
extern struct cpumask __cpu_online_mask;
extern struct cpumask __cpu_present_mask;
extern struct cpumask __cpu_active_mask;
extern struct cpumask __cpu_dying_mask;
#define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
#define cpu_online_mask   ((const struct cpumask *)&__cpu_online_mask)
#define cpu_present_mask  ((const struct cpumask *)&__cpu_present_mask)
#define cpu_active_mask   ((const struct cpumask *)&__cpu_active_mask)
#define cpu_dying_mask    ((const struct cpumask *)&__cpu_dying_mask)

extern atomic_t __num_online_cpus;

extern cpumask_t cpus_booted_once_mask;

/*
 * Warn once if @cpu is not below @bits.  Compiles to nothing unless
 * CONFIG_DEBUG_PER_CPU_MAPS is enabled.
 */
static __always_inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
{
#if defined(CONFIG_DEBUG_PER_CPU_MAPS)
        WARN_ON_ONCE(cpu >= bits);
#endif
}

/*
 * Range-check a cpu number handed to the cpumask_* operators against
 * small_cpumask_bits (debug builds only) and pass it through unchanged.
 */
static __always_inline unsigned int cpumask_check(unsigned int cpu)
{
        const unsigned int limit = small_cpumask_bits;

        cpu_max_bits_warn(cpu, limit);

        return cpu;
}

/**
 * cpumask_first - find the lowest-numbered cpu set in a cpumask
 * @srcp: the cpumask to scan
 *
 * Return: the cpu number, or a value >= nr_cpu_ids if the mask is empty.
 */
static inline unsigned int cpumask_first(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return find_first_bit(bits, small_cpumask_bits);
}

/**
 * cpumask_first_zero - find the lowest-numbered cpu not set in a cpumask
 * @srcp: the cpumask to scan
 *
 * Return: the cpu number, or a value >= nr_cpu_ids if every cpu is set.
 */
static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return find_first_zero_bit(bits, small_cpumask_bits);
}

/**
 * cpumask_first_and - find the lowest-numbered cpu set in both masks
 * @srcp1: the first cpumask
 * @srcp2: the second cpumask
 *
 * Return: the first cpu of *srcp1 & *srcp2, or a value >= nr_cpu_ids if
 * the intersection is empty.  See also cpumask_next_and().
 */
static inline
unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2)
{
        const unsigned long *lhs = cpumask_bits(srcp1);
        const unsigned long *rhs = cpumask_bits(srcp2);

        return find_first_and_bit(lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_first_and_and - find the first cpu set in all three masks
 * @srcp1: the first cpumask
 * @srcp2: the second cpumask
 * @srcp3: the third cpumask
 *
 * Return: the cpu number, or a value >= nr_cpu_ids if no cpu is set in
 * all of *srcp1, *srcp2 and *srcp3.
 */
static inline
unsigned int cpumask_first_and_and(const struct cpumask *srcp1,
                                   const struct cpumask *srcp2,
                                   const struct cpumask *srcp3)
{
        const unsigned long *bits1 = cpumask_bits(srcp1);
        const unsigned long *bits2 = cpumask_bits(srcp2);
        const unsigned long *bits3 = cpumask_bits(srcp3);

        return find_first_and_and_bit(bits1, bits2, bits3, small_cpumask_bits);
}

/**
 * cpumask_last - find the highest-numbered cpu set in a cpumask
 * @srcp: the cpumask to search
 *
 * Return: the cpu number, or a value >= nr_cpumask_bits if no cpus are set.
 */
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return find_last_bit(bits, small_cpumask_bits);
}

/**
 * cpumask_next - find the next cpu set in a cpumask after a given one
 * @n: the cpu before the search start (the result is always > @n)
 * @srcp: the cpumask to search
 *
 * Return: the cpu number, or a value >= nr_cpu_ids if no further cpus are set.
 */
static inline
unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        /* -1 is a legal argument, so only range-check other values. */
        if (n != -1)
                cpumask_check(n);

        return find_next_bit(bits, small_cpumask_bits, n + 1);
}

/**
 * cpumask_next_zero - find the next cpu that is clear in a cpumask
 * @n: the cpu before the search start (the result is always > @n)
 * @srcp: the cpumask to search
 *
 * Return: the cpu number, or a value >= nr_cpu_ids if no further cpus
 * are unset.
 */
static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        /* -1 is a legal argument, so only range-check other values. */
        if (n != -1)
                cpumask_check(n);

        return find_next_zero_bit(bits, small_cpumask_bits, n + 1);
}

#if NR_CPUS == 1
/*
 * Uniprocessor build: cpu 0 is the only valid answer, so both arguments
 * are ignored.
 */
static inline unsigned int cpumask_local_spread(unsigned int i, int node)
{
        (void)i;
        (void)node;

        return 0;
}

/* Uniprocessor variant: returns the first cpu set in both masks. */
static inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
                                                      const struct cpumask *src2p)
{
        unsigned int cpu = cpumask_first_and(src1p, src2p);

        return cpu;
}

/* Uniprocessor variant: returns the first cpu set in the mask. */
static inline unsigned int cpumask_any_distribute(const struct cpumask *srcp)
{
        unsigned int cpu = cpumask_first(srcp);

        return cpu;
}
#else
/*
 * NR_CPUS != 1: these three are declared here only; the inline versions
 * in the other branch of this #if cover the uniprocessor case.
 */
unsigned int cpumask_local_spread(unsigned int i, int node);
unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
                               const struct cpumask *src2p);
unsigned int cpumask_any_distribute(const struct cpumask *srcp);
#endif /* NR_CPUS */

/**
 * cpumask_next_and - find the next cpu set in both *src1p and *src2p
 * @n: the cpu before the search start (the result is always > @n)
 * @src1p: the first cpumask
 * @src2p: the second cpumask
 *
 * Return: the cpu number, or a value >= nr_cpu_ids if no further cpus are
 * set in both masks.
 */
static inline
unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
                     const struct cpumask *src2p)
{
        const unsigned long *bits1 = cpumask_bits(src1p);
        const unsigned long *bits2 = cpumask_bits(src2p);

        /* -1 is a legal argument, so only range-check other values. */
        if (n != -1)
                cpumask_check(n);

        return find_next_and_bit(bits1, bits2, small_cpumask_bits, n + 1);
}

/**
 * for_each_cpu - iterate over every cpu in a mask
 * @cpu: the (optionally unsigned) integer iterator
 * @mask: the cpumask pointer
 *
 * Expands to for_each_set_bit() over the bits of @mask, bounded by
 * small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu(cpu, mask)                                \
        for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)

#if NR_CPUS == 1
/*
 * Uniprocessor variant of cpumask_next_wrap().  @start and @n (unless
 * it is -1) are only range-checked.
 */
static inline
unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
{
        cpumask_check(start);
        if (n != -1)
                cpumask_check(n);

        /*
         * There is only one valid cpu.  Unless the caller has already
         * wrapped (@wrap set) and @n is at or past cpu0, the answer is
         * whatever cpumask_first() finds in @mask.
         */
        if (!wrap || n < 0)
                return cpumask_first(mask);

        /* Wrapped and nothing left to visit: report end of iteration. */
        return nr_cpumask_bits;
}
#else
/* NR_CPUS != 1: declaration only; marked __pure. */
unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
#endif

/**
 * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
 * @cpu: the (optionally unsigned) integer iterator
 * @mask: the cpumask pointer
 * @start: the start location
 *
 * Expands to for_each_set_bit_wrap() over the bits of @mask, bounded by
 * small_cpumask_bits.
 *
 * The implementation does not assume any bit in @mask is set (including @start).
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_wrap(cpu, mask, start)                                \
        for_each_set_bit_wrap(cpu, cpumask_bits(mask), small_cpumask_bits, start)

/**
 * for_each_cpu_and - iterate over every cpu in both masks
 * @cpu: the (optionally unsigned) integer iterator
 * @mask1: the first cpumask pointer
 * @mask2: the second cpumask pointer
 *
 * This saves a temporary CPU mask in many places.  It is equivalent to
 * (note that @mask1 and @mask2 are already pointers):
 *        struct cpumask tmp;
 *        cpumask_and(&tmp, mask1, mask2);
 *        for_each_cpu(cpu, &tmp)
 *                ...
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_and(cpu, mask1, mask2)                                \
        for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)

/**
 * for_each_cpu_andnot - iterate over every cpu present in one mask, excluding
 *                         those present in another.
 * @cpu: the (optionally unsigned) integer iterator
 * @mask1: the first cpumask pointer
 * @mask2: the second cpumask pointer
 *
 * This saves a temporary CPU mask in many places.  It is equivalent to
 * (note that @mask1 and @mask2 are already pointers):
 *        struct cpumask tmp;
 *        cpumask_andnot(&tmp, mask1, mask2);
 *        for_each_cpu(cpu, &tmp)
 *                ...
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_andnot(cpu, mask1, mask2)                                \
        for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)

/**
 * for_each_cpu_or - iterate over every cpu present in either mask
 * @cpu: the (optionally unsigned) integer iterator
 * @mask1: the first cpumask pointer
 * @mask2: the second cpumask pointer
 *
 * This saves a temporary CPU mask in many places.  It is equivalent to
 * (note that @mask1 and @mask2 are already pointers):
 *        struct cpumask tmp;
 *        cpumask_or(&tmp, mask1, mask2);
 *        for_each_cpu(cpu, &tmp)
 *                ...
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_or(cpu, mask1, mask2)                                \
        for_each_or_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)

/**
 * for_each_cpu_from - iterate over CPUs present in @mask, from @cpu to the end of @mask.
 * @cpu: the (optionally unsigned) integer iterator
 * @mask: the cpumask pointer
 *
 * Expands to for_each_set_bit_from() over the bits of @mask, bounded by
 * small_cpumask_bits.
 * NOTE(review): presumably @cpu must hold the starting position before the
 * loop is entered - confirm against for_each_set_bit_from().
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_from(cpu, mask)                                \
        for_each_set_bit_from(cpu, cpumask_bits(mask), small_cpumask_bits)

/**
 * cpumask_any_but - return a "random" in a cpumask, but not this one.
 * @mask: the cpumask to search
 * @cpu: the cpu to ignore.
 *
 * Often used to find any cpu but smp_processor_id() in a mask.
 * Return: >= nr_cpu_ids if no cpus set.
 */
static inline
unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
{
        unsigned int i;

        cpumask_check(cpu);
        for_each_cpu(i, mask)
                if (i != cpu)
                        break;
        return i;
}

/**
 * cpumask_any_and_but - pick a "random" cpu from *mask1 & *mask2, skipping one cpu
 * @mask1: the first input cpumask
 * @mask2: the second input cpumask
 * @cpu: the cpu to ignore
 *
 * Return: the cpu number, or a value >= nr_cpu_ids if no such cpu is set.
 */
static inline
unsigned int cpumask_any_and_but(const struct cpumask *mask1,
                                 const struct cpumask *mask2,
                                 unsigned int cpu)
{
        unsigned int first;

        cpumask_check(cpu);
        first = cpumask_first_and(mask1, mask2);

        /* If the first hit is the excluded cpu, continue searching after it. */
        return first == cpu ? cpumask_next_and(cpu, mask1, mask2) : first;
}

/**
 * cpumask_nth - get the Nth cpu set in a cpumask
 * @cpu: which set cpu to find, counting from 0
 * @srcp: the cpumask to search
 *
 * Return: the cpu number, or a value >= nr_cpu_ids if there is no such cpu.
 */
static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);
        unsigned int nth = cpumask_check(cpu);

        return find_nth_bit(bits, small_cpumask_bits, nth);
}

/**
 * cpumask_nth_and - get the Nth cpu set in both of two cpumasks
 * @cpu: which matching cpu to find, counting from 0
 * @srcp1: the first cpumask
 * @srcp2: the second cpumask
 *
 * Return: the cpu number, or a value >= nr_cpu_ids if there is no such cpu.
 */
static inline
unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1,
                                                        const struct cpumask *srcp2)
{
        const unsigned long *bits1 = cpumask_bits(srcp1);
        const unsigned long *bits2 = cpumask_bits(srcp2);
        unsigned int nth = cpumask_check(cpu);

        return find_nth_and_bit(bits1, bits2, small_cpumask_bits, nth);
}

/**
 * cpumask_nth_andnot - get the Nth cpu set in the 1st cpumask and clear in the 2nd
 * @cpu: which matching cpu to find, counting from 0
 * @srcp1: the cpumask whose set cpus are considered
 * @srcp2: the cpumask whose set cpus are excluded
 *
 * Return: the cpu number, or a value >= nr_cpu_ids if there is no such cpu.
 */
static inline
unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
                                                        const struct cpumask *srcp2)
{
        const unsigned long *bits1 = cpumask_bits(srcp1);
        const unsigned long *bits2 = cpumask_bits(srcp2);
        unsigned int nth = cpumask_check(cpu);

        return find_nth_andnot_bit(bits1, bits2, small_cpumask_bits, nth);
}

/**
 * cpumask_nth_and_andnot - get the Nth cpu set in the 1st and 2nd cpumask and clear in the 3rd
 * @cpu: which matching cpu to find, counting from 0
 * @srcp1: the first cpumask whose set cpus are considered
 * @srcp2: the second cpumask whose set cpus are considered
 * @srcp3: the cpumask whose set cpus are excluded
 *
 * Return: the cpu number, or a value >= nr_cpu_ids if there is no such cpu.
 */
static __always_inline
unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1,
                                                        const struct cpumask *srcp2,
                                                        const struct cpumask *srcp3)
{
        const unsigned long *bits1 = cpumask_bits(srcp1);
        const unsigned long *bits2 = cpumask_bits(srcp2);
        const unsigned long *bits3 = cpumask_bits(srcp3);
        unsigned int nth = cpumask_check(cpu);

        return find_nth_and_andnot_bit(bits1, bits2, bits3,
                                       small_cpumask_bits, nth);
}

/* Bitmap initializer: every one of the BITS_TO_LONGS(NR_CPUS) words is zero. */
#define CPU_BITS_NONE                                                \
{                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL                        \
}

/* Bitmap initializer: word 0 is 1UL (only bit 0 set); other words are left to default zero-initialization. */
#define CPU_BITS_CPU0                                                \
{                                                                \
        [0] =  1UL                                                \
}

/**
 * cpumask_set_cpu - set a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @dstp: the cpumask to modify
 *
 * Wrapper around set_bit() on the bits of @dstp.
 */
static __always_inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
{
        unsigned int bit = cpumask_check(cpu);

        set_bit(bit, cpumask_bits(dstp));
}

/*
 * Same as cpumask_set_cpu() but built on __set_bit().
 * NOTE(review): presumably the non-atomic variant - confirm against the
 * bitops documentation.
 */
static __always_inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
{
        unsigned int bit = cpumask_check(cpu);

        __set_bit(bit, cpumask_bits(dstp));
}


/**
 * cpumask_clear_cpu - clear a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @dstp: the cpumask to modify
 *
 * Wrapper around clear_bit() on the bits of @dstp.
 */
static __always_inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp)
{
        unsigned int bit = cpumask_check(cpu);

        clear_bit(bit, cpumask_bits(dstp));
}

/*
 * Same as cpumask_clear_cpu() but built on __clear_bit().
 * NOTE(review): presumably the non-atomic variant - confirm against the
 * bitops documentation.
 */
static __always_inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
{
        unsigned int bit = cpumask_check(cpu);

        __clear_bit(bit, cpumask_bits(dstp));
}

/**
 * cpumask_assign_cpu - set or clear a cpu in a cpumask according to a flag
 * @cpu: cpu number (< nr_cpu_ids)
 * @dstp: the cpumask to modify
 * @value: the value to assign to the cpu's bit
 *
 * Wrapper around assign_bit() on the bits of @dstp.
 */
static __always_inline void cpumask_assign_cpu(int cpu, struct cpumask *dstp, bool value)
{
        unsigned int bit = cpumask_check(cpu);

        assign_bit(bit, cpumask_bits(dstp), value);
}

/*
 * Same as cpumask_assign_cpu() but built on __assign_bit().
 * NOTE(review): presumably the non-atomic variant - confirm against the
 * bitops documentation.
 */
static __always_inline void __cpumask_assign_cpu(int cpu, struct cpumask *dstp, bool value)
{
        unsigned int bit = cpumask_check(cpu);

        __assign_bit(bit, cpumask_bits(dstp), value);
}

/**
 * cpumask_test_cpu - check whether a cpu is set in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask to inspect
 *
 * Return: true if @cpu is set in @cpumask, false otherwise
 */
static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask)
{
        unsigned int bit = cpumask_check(cpu);

        return test_bit(bit, cpumask_bits(cpumask));
}

/**
 * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask to modify
 *
 * Wrapper around test_and_set_bit() for cpumasks.
 *
 * Return: true if @cpu was already set in @cpumask before the call,
 * false otherwise
 */
static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
{
        unsigned int bit = cpumask_check(cpu);

        return test_and_set_bit(bit, cpumask_bits(cpumask));
}

/**
 * cpumask_test_and_clear_cpu - atomically test and clear a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask to modify
 *
 * Wrapper around test_and_clear_bit() for cpumasks.
 *
 * Return: true if @cpu was set in @cpumask before the call, false otherwise
 */
static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
{
        unsigned int bit = cpumask_check(cpu);

        return test_and_clear_bit(bit, cpumask_bits(cpumask));
}

/**
 * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask
 * @dstp: the cpumask to fill
 */
static inline void cpumask_setall(struct cpumask *dstp)
{
        if (!small_const_nbits(small_cpumask_bits)) {
                bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits);
                return;
        }

        /* The small-constant case stores a single word directly. */
        cpumask_bits(dstp)[0] = BITMAP_LAST_WORD_MASK(nr_cpumask_bits);
}

/**
 * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask
 * @dstp: the cpumask to zero
 *
 * Zeroes large_cpumask_bits bits of @dstp.
 */
static inline void cpumask_clear(struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        bitmap_zero(bits, large_cpumask_bits);
}

/**
 * cpumask_and - *dstp = *src1p & *src2p
 * @dstp: where the intersection is stored
 * @src1p: the first input
 * @src2p: the second input
 *
 * Return: false if *@dstp is empty, else returns true
 */
static inline bool cpumask_and(struct cpumask *dstp,
                               const struct cpumask *src1p,
                               const struct cpumask *src2p)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        return bitmap_and(dst, lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_or - *dstp = *src1p | *src2p
 * @dstp: where the union is stored
 * @src1p: the first input
 * @src2p: the second input
 */
static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
                              const struct cpumask *src2p)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        bitmap_or(dst, lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_xor - *dstp = *src1p ^ *src2p
 * @dstp: where the result is stored
 * @src1p: the first input
 * @src2p: the second input
 */
static inline void cpumask_xor(struct cpumask *dstp,
                               const struct cpumask *src1p,
                               const struct cpumask *src2p)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        bitmap_xor(dst, lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_andnot - *dstp = *src1p & ~*src2p
 * @dstp: where the result is stored
 * @src1p: the first input
 * @src2p: the input whose cpus are removed
 *
 * Return: false if *@dstp is empty, else returns true
 */
static inline bool cpumask_andnot(struct cpumask *dstp,
                                  const struct cpumask *src1p,
                                  const struct cpumask *src2p)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        return bitmap_andnot(dst, lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_equal - *src1p == *src2p
 * @src1p: the first input
 * @src2p: the second input
 *
 * Return: true if the cpumasks are equal, false if not
 */
static inline bool cpumask_equal(const struct cpumask *src1p,
                                const struct cpumask *src2p)
{
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        return bitmap_equal(lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_or_equal - (*src1p | *src2p) == *src3p
 * @src1p: the first input
 * @src2p: the second input
 * @src3p: the mask to compare the union against
 *
 * Return: true if first cpumask ORed with second cpumask == third cpumask,
 *           otherwise false
 */
static inline bool cpumask_or_equal(const struct cpumask *src1p,
                                    const struct cpumask *src2p,
                                    const struct cpumask *src3p)
{
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);
        const unsigned long *want = cpumask_bits(src3p);

        return bitmap_or_equal(lhs, rhs, want, small_cpumask_bits);
}

/**
 * cpumask_intersects - (*src1p & *src2p) != 0
 * @src1p: the first input
 * @src2p: the second input
 *
 * Return: true if first cpumask ANDed with second cpumask is non-empty,
 *           otherwise false
 */
static inline bool cpumask_intersects(const struct cpumask *src1p,
                                     const struct cpumask *src2p)
{
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        return bitmap_intersects(lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_subset - (*src1p & ~*src2p) == 0
 * @src1p: the candidate subset
 * @src2p: the candidate superset
 *
 * Return: true if *@src1p is a subset of *@src2p, else returns false
 */
static inline bool cpumask_subset(const struct cpumask *src1p,
                                 const struct cpumask *src2p)
{
        const unsigned long *sub = cpumask_bits(src1p);
        const unsigned long *super = cpumask_bits(src2p);

        return bitmap_subset(sub, super, small_cpumask_bits);
}

/**
 * cpumask_empty - *srcp == 0
 * @srcp: the cpumask to check; tests that all cpus < nr_cpu_ids are clear.
 *
 * Return: true if srcp is empty (has no bits set), else false
 */
static inline bool cpumask_empty(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return bitmap_empty(bits, small_cpumask_bits);
}

/**
 * cpumask_full - *srcp == 0xFFFFFFFF...
 * @srcp: the cpumask to check; tests that all cpus < nr_cpu_ids are set.
 *
 * Return: true if srcp is full (has all bits set), else false
 */
static inline bool cpumask_full(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return bitmap_full(bits, nr_cpumask_bits);
}

/**
 * cpumask_weight - count the cpus set in *srcp
 * @srcp: the cpumask to count bits (< nr_cpu_ids) in.
 *
 * Return: count of bits set in *srcp
 */
static inline unsigned int cpumask_weight(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return bitmap_weight(bits, small_cpumask_bits);
}

/**
 * cpumask_weight_and - count the cpus set in (*srcp1 & *srcp2)
 * @srcp1: the first cpumask to count bits (< nr_cpu_ids) in.
 * @srcp2: the second cpumask to count bits (< nr_cpu_ids) in.
 *
 * Return: count of bits set in both *srcp1 and *srcp2
 */
static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1,
                                                const struct cpumask *srcp2)
{
        const unsigned long *bits1 = cpumask_bits(srcp1);
        const unsigned long *bits2 = cpumask_bits(srcp2);

        return bitmap_weight_and(bits1, bits2, small_cpumask_bits);
}

/**
 * cpumask_weight_andnot - count the cpus set in (*srcp1 & ~*srcp2)
 * @srcp1: the cpumask whose set bits (< nr_cpu_ids) are counted.
 * @srcp2: the cpumask whose set bits (< nr_cpu_ids) are excluded.
 *
 * Return: count of bits set in *srcp1 and clear in *srcp2
 */
static inline unsigned int cpumask_weight_andnot(const struct cpumask *srcp1,
                                                const struct cpumask *srcp2)
{
        const unsigned long *bits1 = cpumask_bits(srcp1);
        const unsigned long *bits2 = cpumask_bits(srcp2);

        return bitmap_weight_andnot(bits1, bits2, small_cpumask_bits);
}

/**
 * cpumask_shift_right - *dstp = *srcp >> n
 * @dstp: where the shifted mask is stored
 * @srcp: the input to shift
 * @n: the number of bits to shift by
 */
static inline void cpumask_shift_right(struct cpumask *dstp,
                                       const struct cpumask *srcp, int n)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *src = cpumask_bits(srcp);

        bitmap_shift_right(dst, src, n, small_cpumask_bits);
}

/**
 * cpumask_shift_left - *dstp = *srcp << n
 * @dstp: where the shifted mask is stored
 * @srcp: the input to shift
 * @n: the number of bits to shift by
 */
static inline void cpumask_shift_left(struct cpumask *dstp,
                                      const struct cpumask *srcp, int n)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *src = cpumask_bits(srcp);

        bitmap_shift_left(dst, src, n, nr_cpumask_bits);
}

/**
 * cpumask_copy - *dstp = *srcp
 * @dstp: the destination cpumask
 * @srcp: the cpumask to copy from
 *
 * Copies large_cpumask_bits bits.
 */
static inline void cpumask_copy(struct cpumask *dstp,
                                const struct cpumask *srcp)
{
        unsigned long *dst = cpumask_bits(dstp);
        const unsigned long *src = cpumask_bits(srcp);

        bitmap_copy(dst, src, large_cpumask_bits);
}

/**
 * cpumask_any - pick a "random" cpu from *srcp
 * @srcp: the input cpumask
 *
 * Implemented as cpumask_first(), so the pick is the first cpu set in
 * the mask.
 *
 * Return: >= nr_cpu_ids if no cpus set.
 */
#define cpumask_any(srcp) cpumask_first(srcp)

/**
 * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2
 * @mask1: the first input cpumask
 * @mask2: the second input cpumask
 *
 * Implemented as cpumask_first_and(), so the pick is the first cpu set
 * in both masks.
 *
 * Return: >= nr_cpu_ids if no cpus set.
 */
#define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2))

/**
 * cpumask_of - the cpumask containing just a given cpu
 * @cpu: the cpu (<= nr_cpu_ids)
 *
 * Expands to get_cpu_mask(@cpu).
 * NOTE(review): the bound above reads "<=" while the other helpers in
 * this file document "< nr_cpu_ids" - confirm which is intended.
 */
#define cpumask_of(cpu) (get_cpu_mask(cpu))

/**
 * cpumask_parse_user - extract a cpumask from a user string
 * @buf: the user-space buffer to extract from
 * @len: the length of the buffer
 * @dstp: the cpumask to fill in
 *
 * Return: -errno, or 0 for success.
 */
static inline int cpumask_parse_user(const char __user *buf, int len,
                                     struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parse_user(buf, len, bits, nr_cpumask_bits);
}

/**
 * cpumask_parselist_user - extract a cpumask from a user string
 * @buf: the user-space buffer to extract from
 * @len: the length of the buffer
 * @dstp: the cpumask to fill in
 *
 * Same contract as cpumask_parse_user(), but the parsing is delegated to
 * bitmap_parselist_user().
 *
 * Return: -errno, or 0 for success.
 */
static inline int cpumask_parselist_user(const char __user *buf, int len,
                                     struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parselist_user(buf, len, bits, nr_cpumask_bits);
}

/**
 * cpumask_parse - extract a cpumask from a string
 * @buf: the buffer to extract from
 * @dstp: the cpumask to fill in
 *
 * Passes UINT_MAX to bitmap_parse() as the buffer length.
 *
 * Return: -errno, or 0 for success.
 */
static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parse(buf, UINT_MAX, bits, nr_cpumask_bits);
}

/**
 * cpulist_parse - extract a cpumask from a string of ranges
 * @buf: the buffer to extract from
 * @dstp: the cpumask to fill in
 *
 * Return: -errno, or 0 for success.
 */
static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parselist(buf, bits, nr_cpumask_bits);
}

/**
 * cpumask_size - calculate size to allocate for a 'struct cpumask' in bytes
 *
 * The size is that of a bitmap holding large_cpumask_bits bits.
 *
 * Return: size to allocate for a &struct cpumask in bytes
 */
static inline unsigned int cpumask_size(void)
{
        unsigned int bytes = bitmap_size(large_cpumask_bits);

        return bytes;
}

/*
 * cpumask_var_t: struct cpumask for stack usage.
 *
 * Oh, the wicked games we play!  In order to make kernel coding a
 * little more difficult, we typedef cpumask_var_t to an array or a
 * pointer: doing &mask on an array is a noop, so it still works.
 *
 * i.e.
 *        cpumask_var_t tmpmask;
 *        if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
 *                return -ENOMEM;
 *
 *          ... use 'tmpmask' like a normal struct cpumask * ...
 *
 *        free_cpumask_var(tmpmask);
 *
 *
 * However, there is one notable exception. alloc_cpumask_var() allocates
 * only nr_cpumask_bits bits (on the other hand, a real cpumask_t always has
 * NR_CPUS bits). Therefore you must not dereference cpumask_var_t.
 *
 *        cpumask_var_t tmpmask;
 *        if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
 *                return -ENOMEM;
 *
 *        var = *tmpmask;
 *
 * This code performs an NR_CPUS-length memcpy and leads to memory corruption.
 * cpumask_copy() provides safe copy functionality.
 *
 * Note that there is another evil here: If you define a cpumask_var_t
 * as a percpu variable then the way to obtain the address of the cpumask
 * structure differs between the two implementations, which influences what
 * this_cpu_* operation needs to be used. Please use
 * this_cpu_cpumask_var_ptr() in those cases. The direct use
 * of this_cpu_ptr() or this_cpu_read() will lead to failures when the
 * other type of cpumask_var_t implementation is configured.
 *
 * Please also note that __cpumask_var_read_mostly can be used to declare
 * a cpumask_var_t variable itself (not its content) as read mostly.
 */
#ifdef CONFIG_CPUMASK_OFFSTACK
/* CONFIG_CPUMASK_OFFSTACK: cpumask_var_t is a pointer to a struct cpumask. */
typedef struct cpumask *cpumask_var_t;

/* A per-cpu cpumask_var_t is itself the pointer here, so its value is read. */
#define this_cpu_cpumask_var_ptr(x)        this_cpu_read(x)
#define __cpumask_var_read_mostly        __read_mostly

/* Declaration only; the allocation helpers below are built on top of it. */
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);

/* Like alloc_cpumask_var_node(), with __GFP_ZERO added to @flags. */
static inline
bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
        gfp_t zeroing_flags = flags | __GFP_ZERO;

        return alloc_cpumask_var_node(mask, zeroing_flags, node);
}

/**
 * alloc_cpumask_var - allocate a struct cpumask
 * @mask: pointer to cpumask_var_t where the cpumask is returned
 * @flags: GFP_ flags
 *
 * This variant exists only when CONFIG_CPUMASK_OFFSTACK=y; otherwise
 * alloc_cpumask_var() is a nop that always reports success.
 *
 * Forwards to alloc_cpumask_var_node() with NUMA_NO_NODE.
 *
 * Return: %true if allocation succeeded, %false if not
 */
static inline
bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        bool allocated = alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE);

        return allocated;
}

/* Like alloc_cpumask_var(), with __GFP_ZERO added to @flags. */
static inline
bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        gfp_t zeroing_flags = flags | __GFP_ZERO;

        return alloc_cpumask_var(mask, zeroing_flags);
}

/*
 * Declarations only (CONFIG_CPUMASK_OFFSTACK); the other branch of this
 * #ifdef provides empty inline versions of the same three functions.
 */
void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
void free_cpumask_var(cpumask_var_t mask);
void free_bootmem_cpumask_var(cpumask_var_t mask);

/* Off-stack masks are pointers: report whether @mask is non-NULL. */
static inline bool cpumask_available(cpumask_var_t mask)
{
        if (mask == NULL)
                return false;

        return true;
}

#else
/* !CONFIG_CPUMASK_OFFSTACK: cpumask_var_t is a one-element array of struct cpumask. */
typedef struct cpumask cpumask_var_t[1];

/* A per-cpu cpumask_var_t is the storage itself here, so its address is taken. */
#define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x)
#define __cpumask_var_read_mostly

/* On-stack configuration: there is nothing to allocate, so always succeed. */
static inline
bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        return true;
}

/* On-stack configuration: there is nothing to allocate; @node is ignored. */
static inline
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
        return true;
}

/* On-stack configuration: clear the caller's mask in place and succeed. */
static inline
bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        struct cpumask *storage = *mask;

        cpumask_clear(storage);
        return true;
}

/* On-stack configuration: clear the caller's mask in place; @node is ignored. */
static inline
bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
        struct cpumask *storage = *mask;

        cpumask_clear(storage);
        return true;
}

/* On-stack configuration: no storage to allocate, so this is a no-op. */
static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask)
{
}

/* On-stack configuration: nothing was allocated, so this is a no-op. */
static inline void free_cpumask_var(cpumask_var_t mask)
{
}

/* On-stack configuration: nothing was allocated, so this is a no-op. */
static inline void free_bootmem_cpumask_var(cpumask_var_t mask)
{
}

/* On-stack masks are arrays, not pointers, so they are always available. */
static inline
bool cpumask_available(cpumask_var_t mask)
{
        return true;
}
#endif /* CONFIG_CPUMASK_OFFSTACK */

/*
 * Cleanup action named free_cpumask_var for 'struct cpumask *': calls
 * free_cpumask_var() on the pointer when it is non-NULL.
 * NOTE(review): presumably consumed via the __free() scope-based cleanup
 * attribute - confirm against DEFINE_FREE's definition.
 */
DEFINE_FREE(free_cpumask_var, struct cpumask *, if (_T) free_cpumask_var(_T));

/*
 * It's common to want to use cpu_all_mask in struct member initializers,
 * so it has to refer to an address rather than a pointer: cpu_all_bits
 * is an NR_CPUS-bit bitmap object, and cpu_all_mask converts it with
 * to_cpumask().
 */
extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
#define cpu_all_mask to_cpumask(cpu_all_bits)

/* Row 0 of cpu_bit_bitmap has no bits set, so it serves as the empty mask. */
#define cpu_none_mask to_cpumask(cpu_bit_bitmap[0])

#if NR_CPUS == 1
/* Uniprocessor: the possible/online/present masks are always "1" */
#define for_each_possible_cpu(cpu)        for ((cpu) = 0; (cpu) < 1; (cpu)++)
#define for_each_online_cpu(cpu)        for ((cpu) = 0; (cpu) < 1; (cpu)++)
#define for_each_present_cpu(cpu)        for ((cpu) = 0; (cpu) < 1; (cpu)++)
#else
/* NR_CPUS != 1: iterate with for_each_cpu() over the matching global mask. */
#define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask)
#define for_each_online_cpu(cpu)   for_each_cpu((cpu), cpu_online_mask)
#define for_each_present_cpu(cpu)  for_each_cpu((cpu), cpu_present_mask)
#endif

/*
 * Wrappers for arch boot code to manipulate normally-constant masks.
 * Declarations only; each takes the source mask to initialise from.
 */
void init_cpu_present(const struct cpumask *src);
void init_cpu_possible(const struct cpumask *src);
void init_cpu_online(const struct cpumask *src);

/* Set (@possible true) or clear (@possible false) @cpu in __cpu_possible_mask. */
static inline void
set_cpu_possible(unsigned int cpu, bool possible)
{
        if (!possible) {
                cpumask_clear_cpu(cpu, &__cpu_possible_mask);
                return;
        }

        cpumask_set_cpu(cpu, &__cpu_possible_mask);
}

/* Set (@present true) or clear (@present false) @cpu in __cpu_present_mask. */
static inline void
set_cpu_present(unsigned int cpu, bool present)
{
        if (!present) {
                cpumask_clear_cpu(cpu, &__cpu_present_mask);
                return;
        }

        cpumask_set_cpu(cpu, &__cpu_present_mask);
}

/* Declaration only: unlike its set_cpu_*() siblings, this one is not defined inline here. */
void set_cpu_online(unsigned int cpu, bool online);

/* Set (@active true) or clear (@active false) @cpu in __cpu_active_mask. */
static inline void
set_cpu_active(unsigned int cpu, bool active)
{
        if (!active) {
                cpumask_clear_cpu(cpu, &__cpu_active_mask);
                return;
        }

        cpumask_set_cpu(cpu, &__cpu_active_mask);
}

/* Set (@dying true) or clear (@dying false) @cpu in __cpu_dying_mask. */
static inline void
set_cpu_dying(unsigned int cpu, bool dying)
{
        if (!dying) {
                cpumask_clear_cpu(cpu, &__cpu_dying_mask);
                return;
        }

        cpumask_set_cpu(cpu, &__cpu_dying_mask);
}

/**
 * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask *
 * @bitmap: the bitmap
 *
 * There are a few places where cpumask_var_t isn't appropriate and
 * static cpumasks must be used (eg. very early boot), yet we don't
 * expose the definition of 'struct cpumask'.
 *
 * This does the conversion, and can be used as a constant initializer.
 *
 * The conditional always selects @bitmap.  The other arm is never
 * evaluated; it only names __check_is_bitmap(@bitmap) inside sizeof() so
 * that the compiler checks @bitmap against 'const unsigned long *'.
 */
#define to_cpumask(bitmap)                                                \
        ((struct cpumask *)(1 ? (bitmap)                                \
                            : (void *)sizeof(__check_is_bitmap(bitmap))))

/*
 * Type-checking aid for to_cpumask(): the prototype forces the argument
 * to be compatible with 'const unsigned long *'.  The bitmap itself is
 * never read and the result is always 1.
 */
static inline int __check_is_bitmap(const unsigned long *bitmap)
{
        (void)bitmap;

        return 1;
}

/*
 * Special-case data structure for "single bit set only" constant CPU masks.
 *
 * We pre-generate all the 64 (or 32) possible bit positions, with enough
 * padding to the left and the right, and return the constant pointer
 * appropriately offset.
 *
 * The table has BITS_PER_LONG + 1 rows of BITS_TO_LONGS(NR_CPUS) words.
 * Row 0 backs cpu_none_mask; get_cpu_mask() indexes rows 1..BITS_PER_LONG.
 */
extern const unsigned long
        cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)];

/*
 * get_cpu_mask - constant cpumask for a single cpu
 *
 * Selects row (1 + cpu % BITS_PER_LONG) of cpu_bit_bitmap, then steps the
 * pointer back by cpu / BITS_PER_LONG words before converting it with
 * to_cpumask().
 */
static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
{
        const unsigned long *row = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG];

        return to_cpumask(row - cpu / BITS_PER_LONG);
}

#if NR_CPUS > 1
/**
 * num_online_cpus() - Read the number of online CPUs
 *
 * __num_online_cpus is an atomic_t, but the value read here is still
 * only a momentary snapshot: it is not protected against concurrent CPU
 * hotplug operations unless the caller is inside a cpuhp_lock held
 * region.
 *
 * Return: momentary snapshot of the number of online CPUs
 */
static __always_inline unsigned int num_online_cpus(void)
{
        int online = raw_atomic_read(&__num_online_cpus);

        return online;
}
/* These counts are computed on each use as the weight of the matching global mask. */
#define num_possible_cpus()        cpumask_weight(cpu_possible_mask)
#define num_present_cpus()        cpumask_weight(cpu_present_mask)
#define num_active_cpus()        cpumask_weight(cpu_active_mask)

/* Return true if @cpu is set in cpu_online_mask. */
static inline bool cpu_online(unsigned int cpu)
{
        const struct cpumask *mask = cpu_online_mask;

        return cpumask_test_cpu(cpu, mask);
}

/* Return true if @cpu is set in cpu_possible_mask. */
static inline bool cpu_possible(unsigned int cpu)
{
        const struct cpumask *mask = cpu_possible_mask;

        return cpumask_test_cpu(cpu, mask);
}

/* Return true if @cpu is set in cpu_present_mask. */
static inline bool cpu_present(unsigned int cpu)
{
        const struct cpumask *mask = cpu_present_mask;

        return cpumask_test_cpu(cpu, mask);
}

/* Return true if @cpu is set in cpu_active_mask. */
static inline bool cpu_active(unsigned int cpu)
{
        const struct cpumask *mask = cpu_active_mask;

        return cpumask_test_cpu(cpu, mask);
}

/* Return true if @cpu is set in cpu_dying_mask. */
static inline bool cpu_dying(unsigned int cpu)
{
        const struct cpumask *mask = cpu_dying_mask;

        return cpumask_test_cpu(cpu, mask);
}

#else

/* NR_CPUS == 1: every cpu count is the constant 1. */
#define num_online_cpus()        1U
#define num_possible_cpus()        1U
#define num_present_cpus()        1U
#define num_active_cpus()        1U

/* NR_CPUS <= 1 variant: CPU number 0 is the only one reported online. */
static inline bool cpu_online(unsigned int cpu)
{
        return !cpu;
}

/* NR_CPUS <= 1 variant: CPU number 0 is the only one reported possible. */
static inline bool cpu_possible(unsigned int cpu)
{
        return !cpu;
}

/* NR_CPUS <= 1 variant: CPU number 0 is the only one reported present. */
static inline bool cpu_present(unsigned int cpu)
{
        return !cpu;
}

/* NR_CPUS <= 1 variant: CPU number 0 is the only one reported active. */
static inline bool cpu_active(unsigned int cpu)
{
        return !cpu;
}

/* NR_CPUS <= 1 variant: no CPU number is ever reported as dying. */
static inline bool cpu_dying(unsigned int cpu)
{
        return false;
}

#endif /* NR_CPUS > 1 */

/* Negation of cpu_online(@cpu), wrapped in unlikely(). */
#define cpu_is_offline(cpu)        unlikely(!cpu_online(cpu))

/*
 * CPU_BITS_ALL - brace initializer for an array of unsigned long words.
 *
 * The word at index BITS_TO_LONGS(NR_CPUS)-1 is set to
 * BITMAP_LAST_WORD_MASK(NR_CPUS).  When NR_CPUS exceeds BITS_PER_LONG,
 * every earlier word (indices 0 .. BITS_TO_LONGS(NR_CPUS)-2) is set to
 * ~0UL using a range designator.
 */
#if NR_CPUS <= BITS_PER_LONG
#define CPU_BITS_ALL                                                \
{                                                                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
}

#else /* NR_CPUS > BITS_PER_LONG */

#define CPU_BITS_ALL                                                \
{                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
}
#endif /* NR_CPUS > BITS_PER_LONG */

/**
 * cpumap_print_to_pagebuf  - copies the cpumask into the buffer either
 *        as comma-separated list of cpus or hex values of cpumask
 * @list: indicates whether the cpumap must be list
 * @buf: the buffer to copy into
 * @mask: the cpumask to copy
 *
 * Passes the bits of @mask and nr_cpu_ids as the bit count to
 * bitmap_print_to_pagebuf().
 *
 * Return: the length of the (null-terminated) @buf string, zero if
 * nothing is copied.
 */
static inline ssize_t
cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask)
{
        return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask),
                                      nr_cpu_ids);
}

/**
 * cpumap_print_bitmask_to_buf  - copies the cpumask into the buffer as
 *        hex values of cpumask
 *
 * @buf: the buffer to copy into
 * @mask: the cpumask to copy
 * @off: offset into the string being copied from; the output from that
 *        point on is copied to @buf
 * @count: the maximum number of bytes to print
 *
 * The function prints the cpumask into the buffer as hex values of
 * cpumask. Typically used by bin_attribute to export cpumask bitmask
 * ABI.
 *
 * Return: the length of how many bytes have been copied, excluding
 * terminating '\0'.
 */
static inline ssize_t
cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask,
                loff_t off, size_t count)
{
        /*
         * The helper's result is reduced by one, matching the documented
         * return value that excludes the terminating '\0'.
         * NOTE(review): presumably the helper counts that terminator -
         * confirm against bitmap_print_bitmask_to_buf().
         */
        return bitmap_print_bitmask_to_buf(buf, cpumask_bits(mask),
                                   nr_cpu_ids, off, count) - 1;
}

/**
 * cpumap_print_list_to_buf  - copies the cpumask into the buffer as
 *        comma-separated list of cpus
 * @buf: the buffer to copy into
 * @mask: the cpumask to copy
 * @off: offset into the string being copied from; the output from that
 *        point on is copied to @buf
 * @count: the maximum number of bytes to print
 *
 * Everything is the same as cpumap_print_bitmask_to_buf() above except
 * the print format.
 *
 * Return: the length of how many bytes have been copied, excluding
 * terminating '\0'.
 */
static inline ssize_t
cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
                loff_t off, size_t count)
{
        /*
         * The helper's result is reduced by one, matching the documented
         * return value that excludes the terminating '\0'.
         */
        return bitmap_print_list_to_buf(buf, cpumask_bits(mask),
                                   nr_cpu_ids, off, count) - 1;
}

/*
 * CPU_MASK_ALL - cpumask_t compound literal built from the same word
 * pattern as CPU_BITS_ALL: the last word is BITMAP_LAST_WORD_MASK(NR_CPUS)
 * and, when NR_CPUS exceeds BITS_PER_LONG, all earlier words are ~0UL.
 */
#if NR_CPUS <= BITS_PER_LONG
#define CPU_MASK_ALL                                                        \
(cpumask_t) { {                                                                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
} }
#else /* NR_CPUS > BITS_PER_LONG */
#define CPU_MASK_ALL                                                        \
(cpumask_t) { {                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,                        \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
} }
#endif /* NR_CPUS > BITS_PER_LONG */

/* cpumask_t compound literal with all BITS_TO_LONGS(NR_CPUS) words zero. */
#define CPU_MASK_NONE                                                        \
(cpumask_t) { {                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-1] =  0UL                                \
} }

/*
 * cpumask_t compound literal whose first word is 1UL (bit 0 only); words
 * not named in the initializer are zero-initialized.
 */
#define CPU_MASK_CPU0                                                        \
(cpumask_t) { {                                                                \
        [0] =  1UL                                                        \
} }

/*
 * Provide a valid theoretical max size for cpumap and cpulist sysfs files
 * to avoid breaking userspace which may allocate a buffer based on the size
 * reported by e.g. fstat.
 *
 * For cpumap, NR_CPUS * 9/32 - 1 should be an exact length.
 *
 * For cpulist, 7 is (ceil(log10(NR_CPUS)) + 1), allowing for NR_CPUS to be
 * up to 2 orders of magnitude larger than 8192. We then divide by 2 to
 * cover a worst case of every other cpu being on one of two nodes for a
 * very large NR_CPUS.
 *
 * PAGE_SIZE is used as a minimum for smaller configurations. The size
 * comparison is done without the "- 1" so that it avoids an unsigned
 * comparison to -1.
 */
#define CPUMAP_FILE_MAX_BYTES  (((NR_CPUS * 9)/32 > PAGE_SIZE) \
                                        ? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE)
#define CPULIST_FILE_MAX_BYTES  (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE)

#endif /* __LINUX_CPUMASK_H */






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



















































































































































































































































































































    1 





























































































































































































































































































































































    1 
























































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_NETLINK_H
#define __NET_NETLINK_H

#include <linux/types.h>
#include <linux/netlink.h>
#include <linux/jiffies.h>
#include <linux/in6.h>

/* ========================================================================
 *         Netlink Messages and Attributes Interface (As Seen On TV)
 * ------------------------------------------------------------------------
 *                          Messages Interface
 * ------------------------------------------------------------------------
 *
 * Message Format:
 *    <--- nlmsg_total_size(payload)  --->
 *    <-- nlmsg_msg_size(payload) ->
 *   +----------+- - -+-------------+- - -+-------- - -
 *   | nlmsghdr | Pad |   Payload   | Pad | nlmsghdr
 *   +----------+- - -+-------------+- - -+-------- - -
 *   nlmsg_data(nlh)---^                   ^
 *   nlmsg_next(nlh)-----------------------+
 *
 * Payload Format:
 *    <---------------------- nlmsg_len(nlh) --------------------->
 *    <------ hdrlen ------>       <- nlmsg_attrlen(nlh, hdrlen) ->
 *   +----------------------+- - -+--------------------------------+
 *   |     Family Header    | Pad |           Attributes           |
 *   +----------------------+- - -+--------------------------------+
 *   nlmsg_attrdata(nlh, hdrlen)---^
 *
 * Data Structures:
 *   struct nlmsghdr                        netlink message header
 *
 * Message Construction:
 *   nlmsg_new()                        create a new netlink message
 *   nlmsg_put()                        add a netlink message to an skb
 *   nlmsg_put_answer()                        callback based nlmsg_put()
 *   nlmsg_end()                        finalize netlink message
 *   nlmsg_get_pos()                        return current position in message
 *   nlmsg_trim()                        trim part of message
 *   nlmsg_cancel()                        cancel message construction
 *   nlmsg_consume()                        free a netlink message (expected)
 *   nlmsg_free()                        free a netlink message (drop)
 *
 * Message Sending:
 *   nlmsg_multicast()                        multicast message to several groups
 *   nlmsg_unicast()                        unicast a message to a single socket
 *   nlmsg_notify()                        send notification message
 *
 * Message Length Calculations:
 *   nlmsg_msg_size(payload)                length of message w/o padding
 *   nlmsg_total_size(payload)                length of message w/ padding
 *   nlmsg_padlen(payload)                length of padding at tail
 *
 * Message Payload Access:
 *   nlmsg_data(nlh)                        head of message payload
 *   nlmsg_len(nlh)                        length of message payload
 *   nlmsg_attrdata(nlh, hdrlen)        head of attributes data
 *   nlmsg_attrlen(nlh, hdrlen)                length of attributes data
 *
 * Message Parsing:
 *   nlmsg_ok(nlh, remaining)                does nlh fit into remaining bytes?
 *   nlmsg_next(nlh, remaining)                get next netlink message
 *   nlmsg_parse()                        parse attributes of a message
 *   nlmsg_find_attr()                        find an attribute in a message
 *   nlmsg_for_each_msg()                loop over all messages
 *   nlmsg_validate()                        validate netlink message incl. attrs
 *   nlmsg_for_each_attr()                loop over all attributes
 *
 * Misc:
 *   nlmsg_report()                        report back to application?
 *
 * ------------------------------------------------------------------------
 *                          Attributes Interface
 * ------------------------------------------------------------------------
 *
 * Attribute Format:
 *    <------- nla_total_size(payload) ------->
 *    <---- nla_attr_size(payload) ----->
 *   +----------+- - -+- - - - - - - - - +- - -+-------- - -
 *   |  Header  | Pad |     Payload      | Pad |  Header
 *   +----------+- - -+- - - - - - - - - +- - -+-------- - -
 *                     <- nla_len(nla) ->      ^
 *   nla_data(nla)----^                        |
 *   nla_next(nla)-----------------------------'
 *
 * Data Structures:
 *   struct nlattr                        netlink attribute header
 *
 * Attribute Construction:
 *   nla_reserve(skb, type, len)        reserve room for an attribute
 *   nla_reserve_nohdr(skb, len)        reserve room for an attribute w/o hdr
 *   nla_put(skb, type, len, data)        add attribute to skb
 *   nla_put_nohdr(skb, len, data)        add attribute w/o hdr
 *   nla_append(skb, len, data)                append data to skb
 *
 * Attribute Construction for Basic Types:
 *   nla_put_u8(skb, type, value)        add u8 attribute to skb
 *   nla_put_u16(skb, type, value)        add u16 attribute to skb
 *   nla_put_u32(skb, type, value)        add u32 attribute to skb
 *   nla_put_u64_64bit(skb, type,
 *                     value, padattr)        add u64 attribute to skb
 *   nla_put_s8(skb, type, value)        add s8 attribute to skb
 *   nla_put_s16(skb, type, value)        add s16 attribute to skb
 *   nla_put_s32(skb, type, value)        add s32 attribute to skb
 *   nla_put_s64(skb, type, value,
 *               padattr)                add s64 attribute to skb
 *   nla_put_string(skb, type, str)        add string attribute to skb
 *   nla_put_flag(skb, type)                add flag attribute to skb
 *   nla_put_msecs(skb, type, jiffies,
 *                 padattr)                add msecs attribute to skb
 *   nla_put_in_addr(skb, type, addr)        add IPv4 address attribute to skb
 *   nla_put_in6_addr(skb, type, addr)        add IPv6 address attribute to skb
 *
 * Nested Attributes Construction:
 *   nla_nest_start(skb, type)                start a nested attribute
 *   nla_nest_end(skb, nla)                finalize a nested attribute
 *   nla_nest_cancel(skb, nla)                cancel nested attribute construction
 *
 * Attribute Length Calculations:
 *   nla_attr_size(payload)                length of attribute w/o padding
 *   nla_total_size(payload)                length of attribute w/ padding
 *   nla_padlen(payload)                length of padding
 *
 * Attribute Payload Access:
 *   nla_data(nla)                        head of attribute payload
 *   nla_len(nla)                        length of attribute payload
 *
 * Attribute Payload Access for Basic Types:
 *   nla_get_uint(nla)                        get payload for a uint attribute
 *   nla_get_sint(nla)                        get payload for a sint attribute
 *   nla_get_u8(nla)                        get payload for a u8 attribute
 *   nla_get_u16(nla)                        get payload for a u16 attribute
 *   nla_get_u32(nla)                        get payload for a u32 attribute
 *   nla_get_u64(nla)                        get payload for a u64 attribute
 *   nla_get_s8(nla)                        get payload for a s8 attribute
 *   nla_get_s16(nla)                        get payload for a s16 attribute
 *   nla_get_s32(nla)                        get payload for a s32 attribute
 *   nla_get_s64(nla)                        get payload for a s64 attribute
 *   nla_get_flag(nla)                        return 1 if flag is true
 *   nla_get_msecs(nla)                        get payload for a msecs attribute
 *
 * Attribute Misc:
 *   nla_memcpy(dest, nla, count)        copy attribute into memory
 *   nla_memcmp(nla, data, size)        compare attribute with memory area
 *   nla_strscpy(dst, nla, size)        copy attribute to a sized string
 *   nla_strcmp(nla, str)                compare attribute with string
 *
 * Attribute Parsing:
 *   nla_ok(nla, remaining)                does nla fit into remaining bytes?
 *   nla_next(nla, remaining)                get next netlink attribute
 *   nla_validate()                        validate a stream of attributes
 *   nla_validate_nested()                validate a stream of nested attributes
 *   nla_find()                                find attribute in stream of attributes
 *   nla_find_nested()                        find attribute in nested attributes
 *   nla_parse()                        parse and validate stream of attrs
 *   nla_parse_nested()                        parse nested attributes
 *   nla_for_each_attr()                loop over all attributes
 *   nla_for_each_attr_type()                loop over all attributes with the
 *                                        given type
 *   nla_for_each_nested()                loop over the nested attributes
 *   nla_for_each_nested_type()                loop over the nested attributes with
 *                                        the given type
 *=========================================================================
 */

/*
 * Standard attribute types to specify validation policy
 */
enum {
        NLA_UNSPEC              = 0,
        NLA_U8                  = 1,
        NLA_U16                 = 2,
        NLA_U32                 = 3,
        NLA_U64                 = 4,
        NLA_STRING              = 5,
        NLA_FLAG                = 6,
        NLA_MSECS               = 7,
        NLA_NESTED              = 8,
        NLA_NESTED_ARRAY        = 9,
        NLA_NUL_STRING          = 10,
        NLA_BINARY              = 11,
        NLA_S8                  = 12,
        NLA_S16                 = 13,
        NLA_S32                 = 14,
        NLA_S64                 = 15,
        NLA_BITFIELD32          = 16,        /* 32-bit bitmap/bitselector */
        NLA_REJECT              = 17,        /* attribute is always rejected */
        NLA_BE16                = 18,
        NLA_BE32                = 19,
        NLA_SINT                = 20,
        NLA_UINT                = 21,
        /* Not a type: one past the last one, NLA_TYPE_MAX is derived from it. */
        __NLA_TYPE_MAX,
};

/* Highest attribute type value: the enumerator just before __NLA_TYPE_MAX. */
#define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1)

/**
 * struct netlink_range_validation - min/max values for an unsigned range
 * @min: minimum value
 * @max: maximum value
 *
 * Pointed to by the `range' member of struct nla_policy when its
 * validation_type is NLA_VALIDATE_RANGE_PTR.
 */
struct netlink_range_validation {
        u64 min;
        u64 max;
};

/**
 * struct netlink_range_validation_signed - min/max values for a signed range
 * @min: minimum value
 * @max: maximum value
 *
 * Pointed to by the `range_signed' member of struct nla_policy when its
 * validation_type is NLA_VALIDATE_RANGE_PTR.
 */
struct netlink_range_validation_signed {
        s64 min;
        s64 max;
};

/*
 * Values for the validation_type field of struct nla_policy: the kind of
 * validation done in addition to the type-specific validation.
 */
enum nla_policy_validation {
        NLA_VALIDATE_NONE                       = 0,
        NLA_VALIDATE_RANGE                      = 1,        /* both min and max are used */
        NLA_VALIDATE_RANGE_WARN_TOO_LONG        = 2,
        NLA_VALIDATE_MIN                        = 3,        /* only min is used */
        NLA_VALIDATE_MAX                        = 4,        /* only max is used */
        NLA_VALIDATE_MASK                       = 5,
        NLA_VALIDATE_RANGE_PTR                  = 6,        /* min/max come via a range pointer */
        NLA_VALIDATE_FUNCTION                   = 7,        /* a validation function is called */
};

/**
 * struct nla_policy - attribute validation policy
 * @type: Type of attribute or NLA_UNSPEC
 * @validation_type: type of attribute validation done in addition to
 *        type-specific validation (e.g. range, function call), see
 *        &enum nla_policy_validation
 * @len: Type specific length of payload
 *
 * Policies are defined as arrays of this struct, the array must be
 * accessible by attribute type up to the highest identifier to be expected.
 *
 * Meaning of `len' field:
 *    NLA_STRING           Maximum length of string
 *    NLA_NUL_STRING       Maximum length of string (excluding NUL)
 *    NLA_FLAG             Unused
 *    NLA_BINARY           Maximum length of attribute payload
 *                         (but see also below with the validation type)
 *    NLA_NESTED,
 *    NLA_NESTED_ARRAY     Length verification is done by checking len of
 *                         nested header (or empty); len field is used if
 *                         nested_policy is also used, for the max attr
 *                         number in the nested policy.
 *    NLA_SINT, NLA_UINT,
 *    NLA_U8, NLA_U16,
 *    NLA_U32, NLA_U64,
 *    NLA_S8, NLA_S16,
 *    NLA_S32, NLA_S64,
 *    NLA_BE16, NLA_BE32,
 *    NLA_MSECS            Leaving the length field zero will verify the
 *                         given type fits, using it verifies minimum length
 *                         just like "All other"
 *    NLA_BITFIELD32       Unused
 *    NLA_REJECT           Unused
 *    All other            Minimum length of attribute payload
 *
 * Meaning of validation union:
 *    NLA_BITFIELD32       This is a 32-bit bitmap/bitselector attribute and
 *                         `bitfield32_valid' is the u32 value of valid flags
 *    NLA_REJECT           This attribute is always rejected and `reject_message'
 *                         may point to a string to report as the error instead
 *                         of the generic one in extended ACK.
 *    NLA_NESTED           `nested_policy' points to a nested policy to validate,
 *                         must also set `len' to the max attribute number. Use
 *                         the provided NLA_POLICY_NESTED() macro.
 *                         Note that nla_parse() will validate, but of course not
 *                         parse, the nested sub-policies.
 *    NLA_NESTED_ARRAY     `nested_policy' points to a nested policy to validate,
 *                         must also set `len' to the max attribute number. Use
 *                         the provided NLA_POLICY_NESTED_ARRAY() macro.
 *                         The difference to NLA_NESTED is the structure:
 *                         NLA_NESTED has the nested attributes directly inside
 *                         while an array has the nested attributes at another
 *                         level down and the attribute types directly in the
 *                         nesting don't matter.
 *    NLA_UINT,
 *    NLA_U8,
 *    NLA_U16,
 *    NLA_U32,
 *    NLA_U64,
 *    NLA_BE16,
 *    NLA_BE32,
 *    NLA_SINT,
 *    NLA_S8,
 *    NLA_S16,
 *    NLA_S32,
 *    NLA_S64              The `min' and `max' fields are used depending on the
 *                         validation_type field, if that is min/max/range then
 *                         the min, max or both are used (respectively) to check
 *                         the value of the integer attribute.
 *                         Note that in the interest of code simplicity and
 *                         struct size both limits are s16, so you cannot
 *                         enforce a range that doesn't fall within the range
 *                         of s16 - do that using the NLA_POLICY_FULL_RANGE()
 *                         or NLA_POLICY_FULL_RANGE_SIGNED() macros instead.
 *                         Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and
 *                         NLA_POLICY_RANGE() macros.
 *    NLA_UINT,
 *    NLA_U8,
 *    NLA_U16,
 *    NLA_U32,
 *    NLA_U64              If the validation_type field instead is set to
 *                         NLA_VALIDATE_RANGE_PTR, `range' must be a pointer
 *                         to a struct netlink_range_validation that indicates
 *                         the min/max values.
 *                         Use NLA_POLICY_FULL_RANGE().
 *    NLA_SINT,
 *    NLA_S8,
 *    NLA_S16,
 *    NLA_S32,
 *    NLA_S64              If the validation_type field instead is set to
 *                         NLA_VALIDATE_RANGE_PTR, `range_signed' must be a
 *                         pointer to a struct netlink_range_validation_signed
 *                         that indicates the min/max values.
 *                         Use NLA_POLICY_FULL_RANGE_SIGNED().
 *
 *    NLA_BINARY           If the validation type is like the ones for integers
 *                         above, then the min/max length (not value like for
 *                         integers) of the attribute is enforced.
 *
 *    All other            Unused - but note that it's a union
 *
 * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN:
 *    NLA_BINARY           Validation function called for the attribute.
 *    All other            Unused - but note that it's a union
 *
 * Example:
 *
 * static const u32 myvalidflags = 0xff231023;
 *
 * static const struct nla_policy my_policy[ATTR_MAX+1] = {
 *         [ATTR_FOO] = { .type = NLA_U16 },
 *        [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ },
 *        [ATTR_BAZ] = NLA_POLICY_EXACT_LEN(sizeof(struct mystruct)),
 *        [ATTR_GOO] = NLA_POLICY_BITFIELD32(myvalidflags),
 * };
 */
struct nla_policy {
        /* Attribute type; the NLA_POLICY_*() macros store an NLA_* value here. */
        u8                type;
        /*
         * Validation mode; the NLA_POLICY_*() macros store an NLA_VALIDATE_*
         * value here, which decides which member of the union below applies.
         */
        u8                validation_type;
        /*
         * Type-dependent length value (e.g. the nested-policy macros store
         * the highest attribute index of the nested policy here).
         */
        u16                len;
        union {
                /**
                 * @strict_start_type: first attribute to validate strictly
                 *
                 * This entry is special, and used for the attribute at index 0
                 * only, and specifies special data about the policy, namely it
                 * specifies the "boundary type" where strict length validation
                 * starts for any attribute types >= this value, also, strict
                 * nesting validation starts here.
                 *
                 * Additionally, it means that NLA_UNSPEC is actually NLA_REJECT
                 * for any types >= this, so need to use NLA_POLICY_MIN_LEN() to
                 * get the previous pure { .len = xyz } behaviour. The advantage
                 * of this is that types not specified in the policy will be
                 * rejected.
                 *
                 * For completely new families it should be set to 1 so that the
                 * validation is enforced for all attributes. For existing ones
                 * it should be set at least when new attributes are added to
                 * the enum used by the policy, and be set to the new value that
                 * was added to enforce strict validation from thereon.
                 */
                u16 strict_start_type;

                /* private: use NLA_POLICY_*() to set */
                /* Set by NLA_POLICY_BITFIELD32(). */
                const u32 bitfield32_valid;
                /* Set by NLA_POLICY_MASK(). */
                const u32 mask;
                const char *reject_message;
                /* Set by NLA_POLICY_NESTED() / NLA_POLICY_NESTED_ARRAY(). */
                const struct nla_policy *nested_policy;
                /* Set by NLA_POLICY_FULL_RANGE(). */
                const struct netlink_range_validation *range;
                /* Set by NLA_POLICY_FULL_RANGE_SIGNED(). */
                const struct netlink_range_validation_signed *range_signed;
                /*
                 * Inline limits set by NLA_POLICY_MIN()/MAX()/RANGE(); being
                 * s16 they cannot express bounds outside the s16 range.
                 */
                struct {
                        s16 min, max;
                };
                /* Set by NLA_POLICY_VALIDATE_FN(). */
                int (*validate)(const struct nlattr *attr,
                                struct netlink_ext_ack *extack);
        };
};

/*
 * Policies for attributes that must be exactly ETH_ALEN bytes long. The
 * _COMPAT form expands to NLA_POLICY_EXACT_LEN_WARN() instead of
 * NLA_POLICY_EXACT_LEN().
 */
#define NLA_POLICY_ETH_ADDR                NLA_POLICY_EXACT_LEN(ETH_ALEN)
#define NLA_POLICY_ETH_ADDR_COMPAT        NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN)

/*
 * Nested-policy initialisers. The underscore variants take the highest
 * attribute index explicitly and store it in .len; the plain variants derive
 * it as ARRAY_SIZE(policy) - 1, so @policy must be a real array there, not a
 * pointer.
 */
#define _NLA_POLICY_NESTED(maxattr, policy) \
        { .type = NLA_NESTED, .nested_policy = policy, .len = maxattr }
#define _NLA_POLICY_NESTED_ARRAY(maxattr, policy) \
        { .type = NLA_NESTED_ARRAY, .nested_policy = policy, .len = maxattr }
#define NLA_POLICY_NESTED(policy) \
        _NLA_POLICY_NESTED(ARRAY_SIZE(policy) - 1, policy)
#define NLA_POLICY_NESTED_ARRAY(policy) \
        _NLA_POLICY_NESTED_ARRAY(ARRAY_SIZE(policy) - 1, policy)
/* NLA_BITFIELD32 policy; @valid is stored in .bitfield32_valid. */
#define NLA_POLICY_BITFIELD32(valid) \
        { .type = NLA_BITFIELD32, .bitfield32_valid = valid }

/*
 * Compile-time type classification used by the NLA_POLICY_*() initialisers.
 *
 * __NLA_IS_UINT_TYPE()/__NLA_IS_SINT_TYPE() evaluate to non-zero when @tp is
 * one of the listed unsigned/signed integer attribute types.
 *
 * The NLA_ENSURE_*() macros evaluate to @tp itself, but go through
 * BUILD_BUG_ON_ZERO() so that the build breaks when @tp is not of the
 * permitted kind.
 *
 * @tp is parenthesised at every use so that an expression argument (for
 * example a conditional expression) is classified as a whole instead of
 * being torn apart by operator precedence. @tp is expanded several times,
 * so it must be free of side effects.
 */
#define __NLA_IS_UINT_TYPE(tp)                                        \
        ((tp) == NLA_U8 || (tp) == NLA_U16 || (tp) == NLA_U32 ||        \
         (tp) == NLA_U64 || (tp) == NLA_UINT ||                        \
         (tp) == NLA_BE16 || (tp) == NLA_BE32)
#define __NLA_IS_SINT_TYPE(tp)                                                \
        ((tp) == NLA_S8 || (tp) == NLA_S16 || (tp) == NLA_S32 ||        \
         (tp) == NLA_S64 || (tp) == NLA_SINT)

#define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition))
#define NLA_ENSURE_UINT_TYPE(tp)                        \
        (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp)) + (tp))
#define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp)                \
        (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) ||        \
                      (tp) == NLA_MSECS ||                \
                      (tp) == NLA_BINARY) + (tp))
#define NLA_ENSURE_SINT_TYPE(tp)                        \
        (__NLA_ENSURE(__NLA_IS_SINT_TYPE(tp)) + (tp))
#define NLA_ENSURE_INT_OR_BINARY_TYPE(tp)                \
        (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) ||                \
                      __NLA_IS_SINT_TYPE(tp) ||                \
                      (tp) == NLA_MSECS ||                \
                      (tp) == NLA_BINARY) + (tp))
/* Types whose union member is already claimed, so no validate pointer fits. */
#define NLA_ENSURE_NO_VALIDATION_PTR(tp)                \
        (__NLA_ENSURE((tp) != NLA_BITFIELD32 &&                \
                      (tp) != NLA_REJECT &&                \
                      (tp) != NLA_NESTED &&                \
                      (tp) != NLA_NESTED_ARRAY) + (tp))

/*
 * Inline [min, max] check. @_min/@_max land in the s16 .min/.max members,
 * so they must fit in s16. @tp must be an integer, NLA_MSECS or NLA_BINARY
 * type, otherwise the build fails.
 */
#define NLA_POLICY_RANGE(tp, _min, _max) {                \
        .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_RANGE,                \
        .min = _min,                                        \
        .max = _max                                        \
}

/*
 * Range check through a pointer to struct netlink_range_validation; @tp must
 * be an unsigned integer, NLA_MSECS or NLA_BINARY type.
 */
#define NLA_POLICY_FULL_RANGE(tp, _range) {                \
        .type = NLA_ENSURE_UINT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_RANGE_PTR,        \
        .range = _range,                                \
}

/*
 * Signed counterpart of NLA_POLICY_FULL_RANGE(); @tp must be a signed
 * integer type and @_range is stored in .range_signed.
 */
#define NLA_POLICY_FULL_RANGE_SIGNED(tp, _range) {        \
        .type = NLA_ENSURE_SINT_TYPE(tp),                \
        .validation_type = NLA_VALIDATE_RANGE_PTR,        \
        .range_signed = _range,                                \
}

/* Lower-bound-only check; @_min is stored in the s16 .min member. */
#define NLA_POLICY_MIN(tp, _min) {                        \
        .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_MIN,                \
        .min = _min,                                        \
}

/* Upper-bound-only check; @_max is stored in the s16 .max member. */
#define NLA_POLICY_MAX(tp, _max) {                        \
        .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_MAX,                \
        .max = _max,                                        \
}

/* Mask check for unsigned integer types; @_mask is stored in .mask. */
#define NLA_POLICY_MASK(tp, _mask) {                        \
        .type = NLA_ENSURE_UINT_TYPE(tp),                \
        .validation_type = NLA_VALIDATE_MASK,                \
        .mask = _mask,                                        \
}

/*
 * Validation through callback @fn. An optional extra argument becomes .len;
 * the "+ 0" keeps the initialiser valid (and .len zero) when it is omitted.
 * Not usable with types that already occupy the union (see
 * NLA_ENSURE_NO_VALIDATION_PTR()).
 */
#define NLA_POLICY_VALIDATE_FN(tp, fn, ...) {                \
        .type = NLA_ENSURE_NO_VALIDATION_PTR(tp),        \
        .validation_type = NLA_VALIDATE_FUNCTION,        \
        .validate = fn,                                        \
        .len = __VA_ARGS__ + 0,                                \
}

/* NLA_BINARY length policies: min == max == @_len, or only a minimum. */
#define NLA_POLICY_EXACT_LEN(_len)        NLA_POLICY_RANGE(NLA_BINARY, _len, _len)
/* As NLA_POLICY_EXACT_LEN() but with NLA_VALIDATE_RANGE_WARN_TOO_LONG. */
#define NLA_POLICY_EXACT_LEN_WARN(_len) {                        \
        .type = NLA_BINARY,                                        \
        .validation_type = NLA_VALIDATE_RANGE_WARN_TOO_LONG,        \
        .min = _len,                                                \
        .max = _len                                                \
}
#define NLA_POLICY_MIN_LEN(_len)        NLA_POLICY_MIN(NLA_BINARY, _len)

/**
 * struct nl_info - netlink source information
 * @nlh: Netlink message header of original request
 * @nl_net: Network namespace
 * @portid: Netlink PORTID of requesting application
 * @skip_notify: Skip netlink notifications to user space
 * @skip_notify_kernel: Skip selected in-kernel notifications
 *
 * Both skip flags are single-bit fields sharing one u8.
 */
struct nl_info {
        struct nlmsghdr                *nlh;
        struct net                *nl_net;
        u32                        portid;
        u8                        skip_notify:1,
                                skip_notify_kernel:1;
};

/**
 * enum netlink_validation - netlink message/attribute validation levels
 * @NL_VALIDATE_LIBERAL: Old-style "be liberal" validation, not caring about
 *        extra data at the end of the message, attributes being longer than
 *        they should be, or unknown attributes being present.
 * @NL_VALIDATE_TRAILING: Reject junk data encountered after attribute parsing.
 * @NL_VALIDATE_MAXTYPE: Reject attributes > max type; Together with _TRAILING
 *        this is equivalent to the old nla_parse_strict()/nlmsg_parse_strict().
 * @NL_VALIDATE_UNSPEC: Reject attributes with NLA_UNSPEC in the policy.
 *        This can safely be set by the kernel when the given policy has no
 *        NLA_UNSPEC anymore, and can thus be used to ensure policy entries
 *        are enforced going forward.
 * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g.
 *        U8, U16, U32 must have exact size, etc.)
 * @NL_VALIDATE_NESTED: Check that NLA_F_NESTED is set for NLA_NESTED(_ARRAY)
 *        and unset for other policies.
 *
 * Apart from NL_VALIDATE_LIBERAL (zero), every value is a distinct bit so
 * the levels can be OR-ed together into a validation mask.
 */
enum netlink_validation {
        NL_VALIDATE_LIBERAL = 0,
        NL_VALIDATE_TRAILING = BIT(0),
        NL_VALIDATE_MAXTYPE = BIT(1),
        NL_VALIDATE_UNSPEC = BIT(2),
        NL_VALIDATE_STRICT_ATTRS = BIT(3),
        NL_VALIDATE_NESTED = BIT(4),
};

/*
 * Pre-combined validation masks.
 * NL_VALIDATE_DEPRECATED_STRICT: reject trailing data and attributes above
 * the maximum type only.
 * NL_VALIDATE_STRICT: all enum netlink_validation bits defined above.
 */
#define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\
                                       NL_VALIDATE_MAXTYPE)
#define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\
                            NL_VALIDATE_MAXTYPE |\
                            NL_VALIDATE_UNSPEC |\
                            NL_VALIDATE_STRICT_ATTRS |\
                            NL_VALIDATE_NESTED)

/*
 * Out-of-line helpers: only declared in this part of the file.
 */

/* Message receive dispatch (per-message callback @cb) and notification. */
int netlink_rcv_skb(struct sk_buff *skb,
                    int (*cb)(struct sk_buff *, struct nlmsghdr *,
                              struct netlink_ext_ack *));
int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
                 unsigned int group, int report, gfp_t flags);

/*
 * Core validate/parse primitives; @validate is a mask of
 * enum netlink_validation bits. The inline nla_parse*(), nla_validate*() and
 * nlmsg_parse*() wrappers below call these with a fixed mask.
 */
int __nla_validate(const struct nlattr *head, int len, int maxtype,
                   const struct nla_policy *policy, unsigned int validate,
                   struct netlink_ext_ack *extack);
int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
                int len, const struct nla_policy *policy, unsigned int validate,
                struct netlink_ext_ack *extack);
int nla_policy_len(const struct nla_policy *, int);
/* Attribute lookup and payload copy/compare helpers. */
struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype);
ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize);
char *nla_strdup(const struct nlattr *nla, gfp_t flags);
int nla_memcpy(void *dest, const struct nlattr *src, int count);
int nla_memcmp(const struct nlattr *nla, const void *data, size_t size);
int nla_strcmp(const struct nlattr *nla, const char *str);
/* Attribute construction: reserve space in, or put data into, an skb. */
struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen);
struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype,
                                   int attrlen, int padattr);
void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen);
struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen);
struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype,
                                 int attrlen, int padattr);
void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen);
void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
               const void *data);
void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                     const void *data, int padattr);
void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data);
int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data);
int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                  const void *data, int padattr);
int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data);
int nla_append(struct sk_buff *skb, int attrlen, const void *data);

/**************************************************************************
 * Netlink Messages
 **************************************************************************/

/**
 * nlmsg_msg_size - size of a netlink message without trailing padding
 * @payload: number of payload bytes
 *
 * Returns NLMSG_HDRLEN plus @payload.
 */
static inline int nlmsg_msg_size(int payload)
{
        return payload + NLMSG_HDRLEN;
}

/**
 * nlmsg_total_size - size of a netlink message with trailing padding
 * @payload: number of payload bytes
 *
 * Returns nlmsg_msg_size(@payload) rounded up with NLMSG_ALIGN().
 */
static inline int nlmsg_total_size(int payload)
{
        int unpadded = nlmsg_msg_size(payload);

        return NLMSG_ALIGN(unpadded);
}

/**
 * nlmsg_padlen - number of padding bytes at the end of a message
 * @payload: number of payload bytes
 *
 * Returns the difference between the padded and the unpadded message size.
 */
static inline int nlmsg_padlen(int payload)
{
        int padded = nlmsg_total_size(payload);
        int unpadded = nlmsg_msg_size(payload);

        return padded - unpadded;
}

/**
 * nlmsg_data - start of the message payload
 * @nlh: netlink message header
 *
 * Returns the address NLMSG_HDRLEN bytes past @nlh.
 */
static inline void *nlmsg_data(const struct nlmsghdr *nlh)
{
        unsigned char *start = (unsigned char *) nlh;

        return start + NLMSG_HDRLEN;
}

/**
 * nlmsg_len - payload length of a message
 * @nlh: netlink message header
 *
 * Returns @nlh->nlmsg_len minus NLMSG_HDRLEN.
 */
static inline int nlmsg_len(const struct nlmsghdr *nlh)
{
        /* Subtract in the unsigned domain, exactly like the header field. */
        unsigned int payload = nlh->nlmsg_len - NLMSG_HDRLEN;

        return payload;
}

/**
 * nlmsg_attrdata - start of the attribute stream
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 *
 * Returns the address that follows the payload start by
 * NLMSG_ALIGN(@hdrlen) bytes.
 */
static inline struct nlattr *nlmsg_attrdata(const struct nlmsghdr *nlh,
                                            int hdrlen)
{
        unsigned char *payload = nlmsg_data(nlh);

        payload += NLMSG_ALIGN(hdrlen);
        return (struct nlattr *) payload;
}

/**
 * nlmsg_attrlen - length of the attribute stream
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 *
 * Returns the payload length minus NLMSG_ALIGN(@hdrlen).
 */
static inline int nlmsg_attrlen(const struct nlmsghdr *nlh, int hdrlen)
{
        int payload = nlmsg_len(nlh);

        return payload - NLMSG_ALIGN(hdrlen);
}

/**
 * nlmsg_ok - check whether a netlink message fits into the remaining bytes
 * @nlh: netlink message header
 * @remaining: number of bytes remaining in message stream
 *
 * Returns non-zero only if @remaining can hold a header, the length recorded
 * in the header is at least a header, and that length does not exceed
 * @remaining.
 */
static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining)
{
        /* Not even room for a header: do not look at @nlh at all. */
        if (remaining < (int) sizeof(struct nlmsghdr))
                return 0;

        if (nlh->nlmsg_len < sizeof(struct nlmsghdr))
                return 0;

        return nlh->nlmsg_len <= remaining;
}

/**
 * nlmsg_next - advance to the following message in a message stream
 * @nlh: netlink message header
 * @remaining: number of bytes remaining in message stream
 *
 * Subtracts the aligned length of the current message from *@remaining and
 * returns the address that many bytes past @nlh.
 */
static inline struct nlmsghdr *
nlmsg_next(const struct nlmsghdr *nlh, int *remaining)
{
        unsigned char *cursor = (unsigned char *) nlh;
        int step = NLMSG_ALIGN(nlh->nlmsg_len);

        *remaining -= step;
        cursor += step;

        return (struct nlmsghdr *) cursor;
}

/**
 * nla_parse - Parse a stream of attributes into a tb buffer
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @extack: extended ACK pointer
 *
 * Stores a pointer to each parsed attribute in @tb, indexed by attribute
 * type. Runs with NL_VALIDATE_STRICT: attributes with a type above @maxtype
 * are rejected, a policy must be given, and attributes are validated in the
 * strictest way possible.
 *
 * Returns 0 on success or a negative error code.
 */
static inline int nla_parse(struct nlattr **tb, int maxtype,
                            const struct nlattr *head, int len,
                            const struct nla_policy *policy,
                            struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_STRICT;

        return __nla_parse(tb, maxtype, head, len, policy, validate, extack);
}

/**
 * nla_parse_deprecated - Parse a stream of attributes into a tb buffer
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @extack: extended ACK pointer
 *
 * Stores a pointer to each parsed attribute in @tb, indexed by attribute
 * type. Runs with NL_VALIDATE_LIBERAL: attributes with a type above @maxtype
 * are ignored, and policy entries are not always strictly validated (only
 * for new attributes).
 *
 * Returns 0 on success or a negative error code.
 */
static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype,
                                       const struct nlattr *head, int len,
                                       const struct nla_policy *policy,
                                       struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_LIBERAL;

        return __nla_parse(tb, maxtype, head, len, policy, validate, extack);
}

/**
 * nla_parse_deprecated_strict - Parse a stream of attributes into a tb buffer
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @extack: extended ACK pointer
 *
 * Stores a pointer to each parsed attribute in @tb, indexed by attribute
 * type. Runs with NL_VALIDATE_DEPRECATED_STRICT: attributes with a type
 * above @maxtype and trailing data are rejected, but the policy itself is
 * not completely strictly validated (only for new attributes).
 *
 * Returns 0 on success or a negative error code.
 */
static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype,
                                              const struct nlattr *head,
                                              int len,
                                              const struct nla_policy *policy,
                                              struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_DEPRECATED_STRICT;

        return __nla_parse(tb, maxtype, head, len, policy, validate, extack);
}

/**
 * __nlmsg_parse - parse attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @validate: validation strictness
 * @extack: extended ACK report struct
 *
 * Fails with -EINVAL (and an extack message) when the message is too short
 * to contain the netlink header plus @hdrlen bytes; otherwise hands the
 * attribute stream to __nla_parse().
 *
 * See nla_parse()
 */
static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
                                struct nlattr *tb[], int maxtype,
                                const struct nla_policy *policy,
                                unsigned int validate,
                                struct netlink_ext_ack *extack)
{
        const struct nlattr *attrs;
        int attrs_len;

        if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) {
                NL_SET_ERR_MSG(extack, "Invalid header length");
                return -EINVAL;
        }

        attrs = nlmsg_attrdata(nlh, hdrlen);
        attrs_len = nlmsg_attrlen(nlh, hdrlen);

        return __nla_parse(tb, maxtype, attrs, attrs_len, policy, validate,
                           extack);
}

/**
 * nlmsg_parse - parse attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nlmsg_parse() with NL_VALIDATE_STRICT.
 *
 * See nla_parse()
 */
static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
                              struct nlattr *tb[], int maxtype,
                              const struct nla_policy *policy,
                              struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_STRICT;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, validate,
                             extack);
}

/**
 * nlmsg_parse_deprecated - parse attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nlmsg_parse() with NL_VALIDATE_LIBERAL.
 *
 * See nla_parse_deprecated()
 */
static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen,
                                         struct nlattr *tb[], int maxtype,
                                         const struct nla_policy *policy,
                                         struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_LIBERAL;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, validate,
                             extack);
}

/**
 * nlmsg_parse_deprecated_strict - parse attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Calls __nlmsg_parse() with NL_VALIDATE_DEPRECATED_STRICT.
 *
 * See nla_parse_deprecated_strict()
 */
static inline int
nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen,
                              struct nlattr *tb[], int maxtype,
                              const struct nla_policy *policy,
                              struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_DEPRECATED_STRICT;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, validate,
                             extack);
}

/**
 * nlmsg_find_attr - find a specific attribute in a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @attrtype: type of attribute to look for
 *
 * Returns the first attribute which matches the specified type.
 */
static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh,
                                             int hdrlen, int attrtype)
{
        const struct nlattr *attrs = nlmsg_attrdata(nlh, hdrlen);
        int attrs_len = nlmsg_attrlen(nlh, hdrlen);

        return nla_find(attrs, attrs_len, attrtype);
}

/**
 * nla_validate_deprecated - Validate a stream of attributes
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Checks every attribute in the stream against @policy in liberal mode
 * (NL_VALIDATE_LIBERAL).
 * See documentation of struct nla_policy for more details.
 *
 * Returns 0 on success or a negative error code.
 */
static inline int nla_validate_deprecated(const struct nlattr *head, int len,
                                          int maxtype,
                                          const struct nla_policy *policy,
                                          struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_LIBERAL;

        return __nla_validate(head, len, maxtype, policy, validate, extack);
}

/**
 * nla_validate - Validate a stream of attributes
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Checks every attribute in the stream against @policy in strict mode
 * (NL_VALIDATE_STRICT).
 * See documentation of struct nla_policy for more details.
 *
 * Returns 0 on success or a negative error code.
 */
static inline int nla_validate(const struct nlattr *head, int len, int maxtype,
                               const struct nla_policy *policy,
                               struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_STRICT;

        return __nla_validate(head, len, maxtype, policy, validate, extack);
}

/**
 * nlmsg_validate_deprecated - validate a netlink message including attributes
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Returns -EINVAL when the message is too short to contain the netlink
 * header plus @hdrlen bytes; otherwise the result of validating the
 * attribute stream in liberal mode (NL_VALIDATE_LIBERAL).
 */
static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh,
                                            int hdrlen, int maxtype,
                                            const struct nla_policy *policy,
                                            struct netlink_ext_ack *extack)
{
        const struct nlattr *attrs;
        int attrs_len;

        if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
                return -EINVAL;

        attrs = nlmsg_attrdata(nlh, hdrlen);
        attrs_len = nlmsg_attrlen(nlh, hdrlen);

        return __nla_validate(attrs, attrs_len, maxtype, policy,
                              NL_VALIDATE_LIBERAL, extack);
}



/**
 * nlmsg_report - need to report back to application?
 * @nlh: netlink message header (may be NULL)
 *
 * Returns 1 if @nlh is non-NULL and has NLM_F_ECHO set in its flags,
 * 0 otherwise.
 */
static inline int nlmsg_report(const struct nlmsghdr *nlh)
{
        if (!nlh)
                return 0;

        return (nlh->nlmsg_flags & NLM_F_ECHO) != 0;
}

/**
 * nlmsg_seq - return the seq number of netlink message
 * @nlh: netlink message header (may be NULL)
 *
 * Returns the nlmsg_seq field of @nlh, or 0 if @nlh is NULL.
 */
static inline u32 nlmsg_seq(const struct nlmsghdr *nlh)
{
        if (!nlh)
                return 0;

        return nlh->nlmsg_seq;
}

/**
 * nlmsg_for_each_attr - iterate over a stream of attributes
 * @pos: loop counter, set to current attribute
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to nla_for_each_attr() over the attribute stream of @nlh, i.e.
 * starting at nlmsg_attrdata() with a length of nlmsg_attrlen().
 */
#define nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \
        nla_for_each_attr(pos, nlmsg_attrdata(nlh, hdrlen), \
                          nlmsg_attrlen(nlh, hdrlen), rem)

/**
 * nlmsg_put - Add a new netlink message to an skb
 * @skb: socket buffer to store message in
 * @portid: netlink PORTID of requesting application
 * @seq: sequence number of message
 * @type: message type
 * @payload: length of message payload
 * @flags: message flags
 *
 * Returns NULL if the tailroom of the skb is smaller than the padded
 * message size for @payload; otherwise the result of __nlmsg_put().
 */
static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
                                         int type, int payload, int flags)
{
        int needed = nlmsg_total_size(payload);

        if (unlikely(skb_tailroom(skb) < needed))
                return NULL;

        return __nlmsg_put(skb, portid, seq, type, payload, flags);
}

/**
 * nlmsg_append - Add more data to a nlmsg in a skb
 * @skb: socket buffer to store message in
 * @size: length of message payload
 *
 * Extends an existing nlmsg by NLMSG_ALIGN(@size) bytes; intended for
 * messages built from multiple fixed-format headers (which is rare). Any
 * alignment bytes following the first @size bytes are zeroed.
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the extra payload.
 */
static inline void *nlmsg_append(struct sk_buff *skb, u32 size)
{
        u32 aligned = NLMSG_ALIGN(size);
        u32 pad = aligned - size;

        if (unlikely(skb_tailroom(skb) < aligned))
                return NULL;

        /* Clear the padding before the tail is moved over it. */
        if (pad)
                memset(skb_tail_pointer(skb) + size, 0, pad);
        return __skb_put(skb, aligned);
}

/**
 * nlmsg_put_answer - Add a new callback based netlink message to an skb
 * @skb: socket buffer to store message in
 * @cb: netlink callback
 * @type: message type
 * @payload: length of message payload
 * @flags: message flags
 *
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the message header and payload.
 */
static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb,
                                                struct netlink_callback *cb,
                                                int type, int payload,
                                                int flags)
{
        return nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
                         type, payload, flags);
}

/**
 * nlmsg_new - Allocate a new netlink message
 * @payload: size of the message payload
 * @flags: the type of memory to allocate.
 *
 * Allocates an skb of nlmsg_total_size(@payload) bytes via alloc_skb().
 *
 * Use NLMSG_DEFAULT_SIZE if the size of the payload isn't known
 * and a good default is needed.
 */
static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags)
{
        int total = nlmsg_total_size(payload);

        return alloc_skb(total, flags);
}

/**
 * nlmsg_new_large - Allocate a new netlink message with non-contiguous
 * physical memory
 * @payload: size of the message payload
 *
 * Allocates nlmsg_total_size(@payload) bytes via netlink_alloc_large_skb().
 *
 * The allocated skb is unable to have frag page for shinfo->frags*,
 * as the NULL setting for skb->head in netlink_skb_destructor() will
 * bypass most of the handling in skb_release_data()
 */
static inline struct sk_buff *nlmsg_new_large(size_t payload)
{
        int total = nlmsg_total_size(payload);

        return netlink_alloc_large_skb(total, 0);
}

/**
 * nlmsg_end - Finalize a netlink message
 * @skb: socket buffer the message is stored in
 * @nlh: netlink message header
 *
 * Sets @nlh->nlmsg_len to the distance from @nlh to the current skb tail,
 * so that the header covers the appended attributes. Only necessary if
 * attributes have been added to the message.
 */
static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        unsigned char *start = (unsigned char *)nlh;
        unsigned char *tail = skb_tail_pointer(skb);

        nlh->nlmsg_len = tail - start;
}

/**
 * nlmsg_get_pos - return current position in netlink message
 * @skb: socket buffer the message is stored in
 *
 * Returns the skb tail pointer, i.e. the current end of the message.
 */
static inline void *nlmsg_get_pos(struct sk_buff *skb)
{
        void *tail = skb_tail_pointer(skb);

        return tail;
}

/**
 * nlmsg_trim - Trim message to a mark
 * @skb: socket buffer the message is stored in
 * @mark: mark to trim to
 *
 * Trims the skb so that it ends at @mark. Does nothing when @mark is NULL;
 * warns when @mark lies before the start of the skb data.
 */
static inline void nlmsg_trim(struct sk_buff *skb, const void *mark)
{
        if (!mark)
                return;

        WARN_ON((unsigned char *) mark < skb->data);
        skb_trim(skb, (unsigned char *) mark - skb->data);
}

/**
 * nlmsg_cancel - Cancel construction of a netlink message
 * @skb: socket buffer the message is stored in
 * @nlh: netlink message header
 *
 * Trims the skb back to @nlh, which removes the complete netlink message
 * including all attributes from the socket buffer again.
 */
static inline void nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        const void *msg_start = nlh;

        nlmsg_trim(skb, msg_start);
}

/**
 * nlmsg_free - drop a netlink message
 * @skb: socket buffer of netlink message
 *
 * Releases @skb with kfree_skb(); compare nlmsg_consume(), which uses
 * consume_skb() instead.
 */
static inline void nlmsg_free(struct sk_buff *skb)
{
        kfree_skb(skb);
}

/**
 * nlmsg_consume - free a netlink message
 * @skb: socket buffer of netlink message
 *
 * Releases @skb with consume_skb(); compare nlmsg_free(), which uses
 * kfree_skb() instead.
 */
static inline void nlmsg_consume(struct sk_buff *skb)
{
        consume_skb(skb);
}

/**
 * nlmsg_multicast_filtered - multicast a netlink message with filter function
 * @sk: netlink socket to spread messages to
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: multicast group id
 * @flags: allocation flags
 * @filter: filter function
 * @filter_data: filter function private data
 *
 * Records @group as the destination group of @skb and broadcasts it via
 * netlink_broadcast_filtered(). A positive result from the broadcast is
 * reported as 0.
 *
 * Return: 0 on success, negative error code for failure.
 */
static inline int nlmsg_multicast_filtered(struct sock *sk, struct sk_buff *skb,
                                           u32 portid, unsigned int group,
                                           gfp_t flags,
                                           netlink_filter_fn filter,
                                           void *filter_data)
{
        int rc;

        NETLINK_CB(skb).dst_group = group;

        rc = netlink_broadcast_filtered(sk, skb, portid, group, flags,
                                        filter, filter_data);

        return rc > 0 ? 0 : rc;
}

/**
 * nlmsg_multicast - multicast a netlink message
 * @sk: netlink socket to spread messages to
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: multicast group id
 * @flags: allocation flags
 *
 * Same as nlmsg_multicast_filtered() with no filter function and no filter
 * data.
 */
static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb,
                                  u32 portid, unsigned int group, gfp_t flags)
{
        int rc = nlmsg_multicast_filtered(sk, skb, portid, group, flags,
                                          NULL, NULL);

        return rc;
}

/**
 * nlmsg_unicast - unicast a netlink message
 * @sk: netlink socket to spread message to
 * @skb: netlink message as socket buffer
 * @portid: netlink portid of the destination socket
 *
 * Sends @skb via netlink_unicast() with MSG_DONTWAIT. A positive result is
 * reported as 0; a negative result is returned unchanged.
 */
static inline int nlmsg_unicast(struct sock *sk, struct sk_buff *skb, u32 portid)
{
        int rc = netlink_unicast(sk, skb, portid, MSG_DONTWAIT);

        return rc > 0 ? 0 : rc;
}

/**
 * nlmsg_for_each_msg - iterate over a stream of messages
 * @pos: loop counter, set to current message
 * @head: head of message stream
 * @len: length of message stream
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * The loop runs while nlmsg_ok(@pos, @rem) holds and advances with
 * nlmsg_next(), which updates @rem through its address; @rem must therefore
 * be an int lvalue. @pos and @rem are expanded more than once.
 */
#define nlmsg_for_each_msg(pos, head, len, rem) \
        for (pos = head, rem = len; \
             nlmsg_ok(pos, rem); \
             pos = nlmsg_next(pos, &(rem)))

/**
 * nl_dump_check_consistent - check if sequence is consistent and advertise if not
 * @cb: netlink callback structure that stores the sequence number
 * @nlh: netlink message header to write the flag to
 *
 * Compares the current sequence (generation) number in @cb with the one
 * remembered from the previous call. If a previous value exists and differs,
 * NLM_F_DUMP_INTR is set in the flags of @nlh. The current number is then
 * remembered for the next call.
 *
 * Intended usage: set cb->seq to the generation counter once all locks for
 * dumping have been acquired, then call this function for each message that
 * is generated.
 *
 * Because a remembered value of 0 is treated as "nothing seen yet", 0 is an
 * invalid sequence number and must not be used by code relying on this
 * functionality.
 */
static inline void
nl_dump_check_consistent(struct netlink_callback *cb,
                         struct nlmsghdr *nlh)
{
        if (cb->prev_seq) {
                if (cb->prev_seq != cb->seq)
                        nlh->nlmsg_flags |= NLM_F_DUMP_INTR;
        }

        cb->prev_seq = cb->seq;
}

/**************************************************************************
 * Netlink Attributes
 **************************************************************************/

/**
 * nla_attr_size - length of attribute not including padding
 * @payload: length of payload
 *
 * Return: @payload plus NLA_HDRLEN.
 */
static inline int nla_attr_size(int payload)
{
        int hdr = NLA_HDRLEN;

        return hdr + payload;
}

/**
 * nla_total_size - total length of attribute including padding
 * @payload: length of payload
 *
 * Return: nla_attr_size(@payload) rounded with NLA_ALIGN().
 */
static inline int nla_total_size(int payload)
{
        int unpadded = nla_attr_size(payload);

        return NLA_ALIGN(unpadded);
}

/**
 * nla_padlen - length of padding at the tail of attribute
 * @payload: length of payload
 *
 * Return: the difference between nla_total_size(@payload) and
 * nla_attr_size(@payload).
 */
static inline int nla_padlen(int payload)
{
        int padded = nla_total_size(payload);
        int unpadded = nla_attr_size(payload);

        return padded - unpadded;
}

/**
 * nla_type - attribute type
 * @nla: netlink attribute
 *
 * Return: the nla_type field of @nla masked with NLA_TYPE_MASK.
 */
static inline int nla_type(const struct nlattr *nla)
{
        return NLA_TYPE_MASK & nla->nla_type;
}

/**
 * nla_data - head of payload
 * @nla: netlink attribute
 *
 * Return: pointer located NLA_HDRLEN bytes past the start of @nla.
 */
static inline void *nla_data(const struct nlattr *nla)
{
        char *base = (char *) nla;

        return base + NLA_HDRLEN;
}

/**
 * nla_len - length of payload
 * @nla: netlink attribute
 *
 * Return: the nla_len field of @nla minus NLA_HDRLEN, as a u16.
 */
static inline u16 nla_len(const struct nlattr *nla)
{
        int hdr = NLA_HDRLEN;

        return nla->nla_len - hdr;
}

/**
 * nla_ok - check if the netlink attribute fits into the remaining bytes
 * @nla: netlink attribute
 * @remaining: number of bytes remaining in attribute stream
 *
 * Return: non-zero when @remaining can hold an attribute header, the
 * attribute's own length field covers at least the header, and that length
 * does not exceed @remaining; 0 otherwise.
 */
static inline int nla_ok(const struct nlattr *nla, int remaining)
{
        /* The header must fit before its length field may be read. */
        if (remaining < (int) sizeof(*nla))
                return 0;

        if (nla->nla_len < sizeof(*nla))
                return 0;

        return nla->nla_len <= remaining;
}

/**
 * nla_next - next netlink attribute in attribute stream
 * @nla: netlink attribute
 * @remaining: number of bytes remaining in attribute stream
 *
 * Steps over the current attribute using its NLA_ALIGN()ed length and
 * subtracts that same amount from *@remaining.
 *
 * Return: pointer to the position right after the current attribute.
 */
static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
{
        unsigned int step = NLA_ALIGN(nla->nla_len);
        char *following = (char *) nla + step;

        *remaining -= step;

        return (struct nlattr *) following;
}

/**
 * nla_find_nested - find attribute in a set of nested attributes
 * @nla: attribute containing the nested attributes
 * @attrtype: type of attribute to look for
 *
 * Runs nla_find() over the payload of @nla.
 *
 * Return: the first attribute which matches the specified type.
 */
static inline struct nlattr *
nla_find_nested(const struct nlattr *nla, int attrtype)
{
        const struct nlattr *head = nla_data(nla);
        int len = nla_len(nla);

        return nla_find(head, len, attrtype);
}

/**
 * nla_parse_nested - parse nested attributes
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @nla: attribute containing the nested attributes
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Requires NLA_F_NESTED to be set in the type field of @nla; the payload is
 * then parsed by __nla_parse() with NL_VALIDATE_STRICT.
 *
 * See nla_parse()
 *
 * Return: -EINVAL (with an extack message) when NLA_F_NESTED is missing,
 * otherwise the result of __nla_parse().
 */
static inline int nla_parse_nested(struct nlattr *tb[], int maxtype,
                                   const struct nlattr *nla,
                                   const struct nla_policy *policy,
                                   struct netlink_ext_ack *extack)
{
        if (nla->nla_type & NLA_F_NESTED)
                return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla),
                                   policy, NL_VALIDATE_STRICT, extack);

        NL_SET_ERR_MSG_ATTR(extack, nla, "NLA_F_NESTED is missing");
        return -EINVAL;
}

/**
 * nla_parse_nested_deprecated - parse nested attributes
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @nla: attribute containing the nested attributes
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Parses the payload of @nla through __nla_parse() with NL_VALIDATE_LIBERAL.
 * Unlike nla_parse_nested(), the NLA_F_NESTED flag is not checked here.
 *
 * See nla_parse_deprecated()
 *
 * Return: the result of __nla_parse().
 */
static inline int nla_parse_nested_deprecated(struct nlattr *tb[], int maxtype,
                                              const struct nlattr *nla,
                                              const struct nla_policy *policy,
                                              struct netlink_ext_ack *extack)
{
        const struct nlattr *head = nla_data(nla);
        int len = nla_len(nla);

        return __nla_parse(tb, maxtype, head, len, policy,
                           NL_VALIDATE_LIBERAL, extack);
}

/**
 * nla_put_u8 - Add a u8 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the result of nla_put().
 */
static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value)
{
        /*
         * Hand nla_put() a local copy rather than the parameter itself:
         * this works around GCC PR81715 with asan-stack=1.
         */
        u8 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_u16 - Add a u16 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the result of nla_put().
 */
static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value)
{
        u16 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_be16 - Add a __be16 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the result of nla_put().
 */
static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value)
{
        __be16 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_net16 - Add 16-bit network byte order netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Same as nla_put_be16() with NLA_F_NET_BYTEORDER OR-ed into @attrtype.
 *
 * Return: the result of nla_put_be16().
 */
static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value)
{
        int flagged_type = attrtype | NLA_F_NET_BYTEORDER;
        __be16 copy = value;

        return nla_put_be16(skb, flagged_type, copy);
}

/**
 * nla_put_le16 - Add a __le16 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the result of nla_put().
 */
static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value)
{
        __le16 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_u32 - Add a u32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the result of nla_put().
 */
static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value)
{
        u32 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_uint - Add a variable-size unsigned int to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * A value that survives truncation to u32 unchanged is emitted through
 * nla_put_u32(); any other value is emitted as a u64 payload via nla_put().
 *
 * Return: the result of nla_put_u32() or nla_put().
 */
static inline int nla_put_uint(struct sk_buff *skb, int attrtype, u64 value)
{
        u64 wide = value;
        u32 narrow = value;

        if (wide != narrow)
                return nla_put(skb, attrtype, sizeof(wide), &wide);

        return nla_put_u32(skb, attrtype, narrow);
}

/**
 * nla_put_be32 - Add a __be32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the result of nla_put().
 */
static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value)
{
        __be32 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_net32 - Add 32-bit network byte order netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Same as nla_put_be32() with NLA_F_NET_BYTEORDER OR-ed into @attrtype.
 *
 * Return: the result of nla_put_be32().
 */
static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value)
{
        int flagged_type = attrtype | NLA_F_NET_BYTEORDER;
        __be32 copy = value;

        return nla_put_be32(skb, flagged_type, copy);
}

/**
 * nla_put_le32 - Add a __le32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the result of nla_put().
 */
static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value)
{
        __le32 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_u64_64bit - Add a u64 netlink attribute to a skb and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Return: the result of nla_put_64bit().
 */
static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype,
                                    u64 value, int padattr)
{
        u64 copy = value;

        return nla_put_64bit(skb, attrtype, sizeof(copy), &copy, padattr);
}

/**
 * nla_put_be64 - Add a __be64 netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Return: the result of nla_put_64bit().
 */
static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value,
                               int padattr)
{
        __be64 copy = value;

        return nla_put_64bit(skb, attrtype, sizeof(copy), &copy, padattr);
}

/**
 * nla_put_net64 - Add 64-bit network byte order nlattr to a skb and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Same as nla_put_be64() with NLA_F_NET_BYTEORDER OR-ed into @attrtype.
 *
 * Return: the result of nla_put_be64().
 */
static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value,
                                int padattr)
{
        int flagged_type = attrtype | NLA_F_NET_BYTEORDER;
        __be64 copy = value;

        return nla_put_be64(skb, flagged_type, copy, padattr);
}

/**
 * nla_put_le64 - Add a __le64 netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Return: the result of nla_put_64bit().
 */
static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value,
                               int padattr)
{
        __le64 copy = value;

        return nla_put_64bit(skb, attrtype, sizeof(copy), &copy, padattr);
}

/**
 * nla_put_s8 - Add a s8 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the result of nla_put().
 */
static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value)
{
        s8 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_s16 - Add a s16 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the result of nla_put().
 */
static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value)
{
        s16 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_s32 - Add a s32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Return: the result of nla_put().
 */
static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value)
{
        s32 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_s64 - Add a s64 netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Return: the result of nla_put_64bit().
 */
static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value,
                              int padattr)
{
        s64 copy = value;

        return nla_put_64bit(skb, attrtype, sizeof(copy), &copy, padattr);
}

/**
 * nla_put_sint - Add a variable-size signed int to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * A value that survives conversion to s32 unchanged is emitted through
 * nla_put_s32(); any other value is emitted as an s64 payload via nla_put().
 *
 * Return: the result of nla_put_s32() or nla_put().
 */
static inline int nla_put_sint(struct sk_buff *skb, int attrtype, s64 value)
{
        s64 wide = value;
        s32 narrow = value;

        if (wide != narrow)
                return nla_put(skb, attrtype, sizeof(wide), &wide);

        return nla_put_s32(skb, attrtype, narrow);
}

/**
 * nla_put_string - Add a string netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @str: NUL terminated string
 *
 * The payload length is strlen(@str) + 1, i.e. the terminating NUL is
 * included.
 *
 * Return: the result of nla_put().
 */
static inline int nla_put_string(struct sk_buff *skb, int attrtype,
                                 const char *str)
{
        size_t len_with_nul = strlen(str) + 1;

        return nla_put(skb, attrtype, len_with_nul, str);
}

/**
 * nla_put_flag - Add a flag netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 *
 * The attribute is emitted with a zero-length payload and no data pointer.
 *
 * Return: the result of nla_put().
 */
static inline int nla_put_flag(struct sk_buff *skb, int attrtype)
{
        int err = nla_put(skb, attrtype, 0, NULL);

        return err;
}

/**
 * nla_put_msecs - Add a msecs netlink attribute to a skb and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @njiffies: number of jiffies to convert to msecs
 * @padattr: attribute type for the padding
 *
 * @njiffies is converted with jiffies_to_msecs() and stored as a u64
 * payload.
 *
 * Return: the result of nla_put_64bit().
 */
static inline int nla_put_msecs(struct sk_buff *skb, int attrtype,
                                unsigned long njiffies, int padattr)
{
        u64 msecs = jiffies_to_msecs(njiffies);

        return nla_put_64bit(skb, attrtype, sizeof(msecs), &msecs, padattr);
}

/**
 * nla_put_in_addr - Add an IPv4 address netlink attribute to a socket
 * buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @addr: IPv4 address
 *
 * Return: the result of nla_put_be32().
 */
static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype,
                                  __be32 addr)
{
        __be32 copy = addr;

        return nla_put_be32(skb, attrtype, copy);
}

/**
 * nla_put_in6_addr - Add an IPv6 address netlink attribute to a socket
 * buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @addr: IPv6 address
 */
static inline int nla_put_in6_addr(struct sk_buff *skb, int attrtype,
                                   const struct in6_addr *addr)
{
        return nla_put(skb, attrtype, sizeof(*addr), addr);
}

/**
 * nla_put_bitfield32 - Add a bitfield32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: value carrying bits
 * @selector: selector of valid bits
 *
 * Return: the result of nla_put().
 */
static inline int nla_put_bitfield32(struct sk_buff *skb, int attrtype,
                                     __u32 value, __u32 selector)
{
        struct nla_bitfield32 bf = {
                .value = value,
                .selector = selector,
        };

        return nla_put(skb, attrtype, sizeof(bf), &bf);
}

/**
 * nla_get_u32 - return payload of u32 attribute
 * @nla: u32 netlink attribute
 *
 * Return: the u32 read directly from the start of the payload.
 */
static inline u32 nla_get_u32(const struct nlattr *nla)
{
        u32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_be32 - return payload of __be32 attribute
 * @nla: __be32 netlink attribute
 *
 * Return: the __be32 read directly from the start of the payload.
 */
static inline __be32 nla_get_be32(const struct nlattr *nla)
{
        __be32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_le32 - return payload of __le32 attribute
 * @nla: __le32 netlink attribute
 *
 * Return: the __le32 read directly from the start of the payload.
 */
static inline __le32 nla_get_le32(const struct nlattr *nla)
{
        __le32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_u16 - return payload of u16 attribute
 * @nla: u16 netlink attribute
 *
 * Return: the u16 read directly from the start of the payload.
 */
static inline u16 nla_get_u16(const struct nlattr *nla)
{
        u16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_be16 - return payload of __be16 attribute
 * @nla: __be16 netlink attribute
 *
 * Return: the __be16 read directly from the start of the payload.
 */
static inline __be16 nla_get_be16(const struct nlattr *nla)
{
        __be16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_le16 - return payload of __le16 attribute
 * @nla: __le16 netlink attribute
 *
 * Return: the __le16 read directly from the start of the payload.
 */
static inline __le16 nla_get_le16(const struct nlattr *nla)
{
        __le16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_u8 - return payload of u8 attribute
 * @nla: u8 netlink attribute
 *
 * Return: the first payload byte.
 */
static inline u8 nla_get_u8(const struct nlattr *nla)
{
        u8 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_u64 - return payload of u64 attribute
 * @nla: u64 netlink attribute
 */
static inline u64 nla_get_u64(const struct nlattr *nla)
{
        u64 tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));

        return tmp;
}

/**
 * nla_get_uint - return payload of uint attribute
 * @nla: uint netlink attribute
 *
 * A payload whose length equals sizeof(u32) is read with nla_get_u32();
 * every other length is read with nla_get_u64().
 *
 * Return: the payload widened to u64.
 */
static inline u64 nla_get_uint(const struct nlattr *nla)
{
        if (nla_len(nla) != sizeof(u32))
                return nla_get_u64(nla);

        return nla_get_u32(nla);
}

/**
 * nla_get_be64 - return payload of __be64 attribute
 * @nla: __be64 netlink attribute
 */
static inline __be64 nla_get_be64(const struct nlattr *nla)
{
        __be64 tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));

        return tmp;
}

/**
 * nla_get_le64 - return payload of __le64 attribute
 * @nla: __le64 netlink attribute
 *
 * The payload is copied out with nla_memcpy() instead of being loaded
 * through a casted __le64 pointer, in line with nla_get_u64(),
 * nla_get_be64() and nla_get_s64(). This way the read does not rely on the
 * payload being 8-byte aligned.
 *
 * Return: the copied __le64 value.
 */
static inline __le64 nla_get_le64(const struct nlattr *nla)
{
        __le64 tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));

        return tmp;
}

/**
 * nla_get_s32 - return payload of s32 attribute
 * @nla: s32 netlink attribute
 *
 * Return: the s32 read directly from the start of the payload.
 */
static inline s32 nla_get_s32(const struct nlattr *nla)
{
        s32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_s16 - return payload of s16 attribute
 * @nla: s16 netlink attribute
 *
 * Return: the s16 read directly from the start of the payload.
 */
static inline s16 nla_get_s16(const struct nlattr *nla)
{
        s16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_s8 - return payload of s8 attribute
 * @nla: s8 netlink attribute
 *
 * Return: the first payload byte as an s8.
 */
static inline s8 nla_get_s8(const struct nlattr *nla)
{
        s8 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_s64 - return payload of s64 attribute
 * @nla: s64 netlink attribute
 */
static inline s64 nla_get_s64(const struct nlattr *nla)
{
        s64 tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));

        return tmp;
}

/**
 * nla_get_sint - return payload of sint attribute
 * @nla: sint netlink attribute
 *
 * A payload whose length equals sizeof(s32) is read with nla_get_s32();
 * every other length is read with nla_get_s64().
 *
 * Return: the payload widened to s64.
 */
static inline s64 nla_get_sint(const struct nlattr *nla)
{
        if (nla_len(nla) != sizeof(s32))
                return nla_get_s64(nla);

        return nla_get_s32(nla);
}

/**
 * nla_get_flag - return payload of flag attribute
 * @nla: flag netlink attribute
 *
 * Only the pointer itself is inspected; @nla is never dereferenced.
 *
 * Return: 1 if @nla is non-NULL, 0 otherwise.
 */
static inline int nla_get_flag(const struct nlattr *nla)
{
        return nla != NULL;
}

/**
 * nla_get_msecs - return payload of msecs attribute
 * @nla: msecs netlink attribute
 *
 * Reads the payload with nla_get_u64(), casts it to unsigned long and
 * converts it with msecs_to_jiffies().
 *
 * Return: the number of milliseconds in jiffies.
 */
static inline unsigned long nla_get_msecs(const struct nlattr *nla)
{
        unsigned long ms = (unsigned long) nla_get_u64(nla);

        return msecs_to_jiffies(ms);
}

/**
 * nla_get_in_addr - return payload of IPv4 address attribute
 * @nla: IPv4 address netlink attribute
 *
 * Return: the __be32 read directly from the start of the payload.
 */
static inline __be32 nla_get_in_addr(const struct nlattr *nla)
{
        __be32 *addr = nla_data(nla);

        return *addr;
}

/**
 * nla_get_in6_addr - return payload of IPv6 address attribute
 * @nla: IPv6 address netlink attribute
 */
static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla)
{
        struct in6_addr tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));
        return tmp;
}

/**
 * nla_get_bitfield32 - return payload of 32 bitfield attribute
 * @nla: nla_bitfield32 attribute
 */
static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla)
{
        struct nla_bitfield32 tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));
        return tmp;
}

/**
 * nla_memdup - duplicate attribute memory (kmemdup)
 * @src: netlink attribute to duplicate from
 * @gfp: GFP mask
 *
 * Passes the payload of @src (nla_data() / nla_len()) to kmemdup_noprof().
 * The nla_memdup() macro wraps this helper in alloc_hooks().
 *
 * Return: the result of kmemdup_noprof().
 */
static inline void *nla_memdup_noprof(const struct nlattr *src, gfp_t gfp)
{
        const void *payload = nla_data(src);
        size_t len = nla_len(src);

        return kmemdup_noprof(payload, len, gfp);
}
#define nla_memdup(...)        alloc_hooks(nla_memdup_noprof(__VA_ARGS__))

/**
 * nla_nest_start_noflag - Start a new level of nested attributes
 * @skb: socket buffer to add attributes to
 * @attrtype: attribute type of container
 *
 * Kept for backward compatibility with APIs that never marked their nest
 * attributes with the NLA_F_NESTED flag. New APIs should use
 * nla_nest_start(), which sets the flag.
 *
 * Return: the container attribute or NULL on error
 */
static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb,
                                                   int attrtype)
{
        /* Taken before nla_put(): the container header is written here. */
        struct nlattr *container = (struct nlattr *)skb_tail_pointer(skb);

        return nla_put(skb, attrtype, 0, NULL) < 0 ? NULL : container;
}

/**
 * nla_nest_start - Start a new level of nested attributes, with NLA_F_NESTED
 * @skb: socket buffer to add attributes to
 * @attrtype: attribute type of container
 *
 * Calls nla_nest_start_noflag() with NLA_F_NESTED OR-ed into @attrtype.
 * This is the preferred function to use in new code.
 *
 * Return: the container attribute or NULL on error
 */
static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype)
{
        int nested_type = attrtype | NLA_F_NESTED;

        return nla_nest_start_noflag(skb, nested_type);
}

/**
 * nla_nest_end - Finalize nesting of attributes
 * @skb: socket buffer the attributes are stored in
 * @start: container attribute
 *
 * Rewrites the length field of the container attribute so that it spans
 * everything from @start up to the current tail of @skb, i.e. all
 * attributes appended since the nest was started.
 *
 * Return: the total data length of the skb.
 */
static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start)
{
        unsigned char *tail = skb_tail_pointer(skb);
        unsigned char *head = (unsigned char *)start;

        start->nla_len = tail - head;

        return skb->len;
}

/**
 * nla_nest_cancel - Cancel nesting of attributes
 * @skb: socket buffer the message is stored in
 * @start: container attribute
 *
 * Removes the container attribute together with all nested attributes by
 * trimming @skb back to @start with nlmsg_trim(). This function returns
 * nothing.
 */
static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start)
{
        nlmsg_trim(skb, start);
}

/**
 * __nla_validate_nested - Validate a stream of nested attributes
 * @start: container attribute
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @validate: validation strictness
 * @extack: extended ACK report struct
 *
 * Validates all attributes in the nested attribute stream against the
 * specified policy. Attributes with a type exceeding maxtype will be
 * ignored. See documentation of struct nla_policy for more details.
 *
 * Return: 0 on success or a negative error code.
 */
static inline int __nla_validate_nested(const struct nlattr *start, int maxtype,
                                        const struct nla_policy *policy,
                                        unsigned int validate,
                                        struct netlink_ext_ack *extack)
{
        const struct nlattr *head = nla_data(start);
        int len = nla_len(start);

        return __nla_validate(head, len, maxtype, policy, validate, extack);
}

/* Validate nested attributes under @start with NL_VALIDATE_STRICT. */
static inline int
nla_validate_nested(const struct nlattr *start, int maxtype,
                    const struct nla_policy *policy,
                    struct netlink_ext_ack *extack)
{
        unsigned int strictness = NL_VALIDATE_STRICT;

        return __nla_validate_nested(start, maxtype, policy, strictness,
                                     extack);
}

/* Validate nested attributes under @start with NL_VALIDATE_LIBERAL. */
static inline int
nla_validate_nested_deprecated(const struct nlattr *start, int maxtype,
                               const struct nla_policy *policy,
                               struct netlink_ext_ack *extack)
{
        unsigned int strictness = NL_VALIDATE_LIBERAL;

        return __nla_validate_nested(start, maxtype, policy, strictness,
                                     extack);
}

/**
 * nla_need_padding_for_64bit - test 64-bit alignment of the next attribute
 * @skb: socket buffer the message is stored in
 *
 * Return: true if padding is needed to align the next attribute (nla_data())
 * to a 64-bit aligned area. Always false when
 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined.
 */
static inline bool nla_need_padding_for_64bit(struct sk_buff *skb)
{
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
        return false;
#else
        /*
         * The nlattr header is 4 bytes in size, so padding is wanted
         * exactly when the skb tail pointer _is_ 8-byte aligned: a NOP
         * attribute plus the nlattr header of the next attribute will
         * then make nla_data() 8-byte aligned.
         */
        return IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8);
#endif
}

/**
 * nla_align_64bit - 64-bit align the nla_data() of next attribute
 * @skb: socket buffer the message is stored in
 * @padattr: attribute type for the padding
 *
 * Conditionally emit a padding netlink attribute so that the next attribute
 * emitted has a 64-bit aligned nla_data() area. The padding attribute is an
 * empty attribute of type @padattr reserved with nla_reserve(). This is only
 * done on architectures which do not have
 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS defined.
 *
 * Return: zero on success, or -EMSGSIZE when the padding attribute could not
 * be reserved.
 */
static inline int nla_align_64bit(struct sk_buff *skb, int padattr)
{
        if (!nla_need_padding_for_64bit(skb))
                return 0;

        return nla_reserve(skb, padattr, 0) ? 0 : -EMSGSIZE;
}

/**
 * nla_total_size_64bit - total length of attribute including padding
 * @payload: length of payload
 *
 * Return: the aligned attribute size for @payload; unless
 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined, the aligned size of an
 * empty attribute is added on top.
 */
static inline int nla_total_size_64bit(int payload)
{
        int total = NLA_ALIGN(nla_attr_size(payload));

#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
        total += NLA_ALIGN(nla_attr_size(0));
#endif

        return total;
}

/**
 * nla_for_each_attr - iterate over a stream of attributes
 * @pos: loop counter, set to current attribute
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to a for-loop header: @pos starts at @head and @rem at @len, the
 * loop runs while nla_ok(@pos, @rem) holds, and each step assigns
 * nla_next(@pos, &@rem) to @pos.
 */
#define nla_for_each_attr(pos, head, len, rem) \
        for (pos = head, rem = len; \
             nla_ok(pos, rem); \
             pos = nla_next(pos, &(rem)))

/**
 * nla_for_each_attr_type - iterate over a stream of attributes
 * @pos: loop counter, set to current attribute
 * @type: required attribute type for @pos
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Like nla_for_each_attr(), but the loop body only runs for attributes
 * whose nla_type() equals @type. @type is parenthesized in the expansion so
 * that an expression argument is compared as a whole.
 *
 * The expansion ends in an unbraced ``if``; do not follow the loop body
 * with an ``else``, as it would attach to that ``if``.
 */
#define nla_for_each_attr_type(pos, type, head, len, rem) \
        nla_for_each_attr(pos, head, len, rem) \
                if (nla_type(pos) == (type))

/**
 * nla_for_each_nested - iterate over nested attributes
 * @pos: loop counter, set to current attribute
 * @nla: attribute containing the nested attributes
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to nla_for_each_attr() over the payload of @nla, using
 * nla_data(@nla) as the head and nla_len(@nla) as the length.
 */
#define nla_for_each_nested(pos, nla, rem) \
        nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem)

/**
 * nla_for_each_nested_type - iterate over nested attributes
 * @pos: loop counter, set to current attribute
 * @type: required attribute type for @pos
 * @nla: attribute containing the nested attributes
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Like nla_for_each_nested(), but the loop body only runs for attributes
 * whose nla_type() equals @type. @type is parenthesized in the expansion so
 * that an expression argument is compared as a whole.
 *
 * The expansion ends in an unbraced ``if``; do not follow the loop body
 * with an ``else``, as it would attach to that ``if``.
 */
#define nla_for_each_nested_type(pos, type, nla, rem) \
        nla_for_each_nested(pos, nla, rem) \
                if (nla_type(pos) == (type))

/**
 * nla_is_last - Test if attribute is last in stream
 * @nla: attribute to test
 * @rem: bytes remaining in stream
 *
 * Return: true when the length field of @nla equals @rem.
 */
static inline bool nla_is_last(const struct nlattr *nla, int rem)
{
        int attr_len = nla->nla_len;

        return rem == attr_len;
}

/*
 * Range accessors for a policy entry: each takes a policy entry @pt and an
 * output @range structure (unsigned and signed variants). Only declared
 * here, without a body.
 */
void nla_get_range_unsigned(const struct nla_policy *pt,
                            struct netlink_range_validation *range);
void nla_get_range_signed(const struct nla_policy *pt,
                          struct netlink_range_validation_signed *range);

/* Opaque to users of this header: only forward-declared here. */
struct netlink_policy_dump_state;

/*
 * Policy dump interface. All of these operate on a
 * struct netlink_policy_dump_state and/or struct nla_policy entries and are
 * only declared here, without a body.
 */
int netlink_policy_dump_add_policy(struct netlink_policy_dump_state **pstate,
                                   const struct nla_policy *policy,
                                   unsigned int maxtype);
int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state,
                                       const struct nla_policy *policy,
                                       unsigned int maxtype);
bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state);
int netlink_policy_dump_write(struct sk_buff *skb,
                              struct netlink_policy_dump_state *state);
int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt);
int netlink_policy_dump_write_attr(struct sk_buff *skb,
                                   const struct nla_policy *pt,
                                   int nestattr);
void netlink_policy_dump_free(struct netlink_policy_dump_state *state);

#endif


























































































































    1 

    1 




    1 























































































































































































































































































































































































































































































































































    1 












    1 











    1 


























    1 











































































































































    1 





    1 





    1 





























    1 


    1 




















































































































































    1 




















    1 




































    1 


    1 










    1 
    1 

    1 

    1 






























    1 


    1 










    1 












    1 





















































    1 







    1 

    1 











    1 

















    1 










    1 





    1 








    1 









    1 








    1 



















































































































































































    1 



    1 




















    1 


























































































































    1 



    1 

    1 

































    1 
















































































    1 




    1 












    1 










    1 
    1 
    1 













































































































































































    1 














































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/export.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_vlan.h>
#include <linux/filter.h>
#include <net/dsa.h>
#include <net/dst_metadata.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/gre.h>
#include <net/pptp.h>
#include <net/tipc.h>
#include <linux/igmp.h>
#include <linux/icmp.h>
#include <linux/sctp.h>
#include <linux/dccp.h>
#include <linux/if_tunnel.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <linux/stddef.h>
#include <linux/if_ether.h>
#include <linux/if_hsr.h>
#include <linux/mpls.h>
#include <linux/tcp.h>
#include <linux/ptp_classify.h>
#include <net/flow_dissector.h>
#include <net/pkt_cls.h>
#include <scsi/fc/fc_fcoe.h>
#include <uapi/linux/batadv_packet.h>
#include <linux/bpf.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_labels.h>
#endif
#include <linux/bpf-netns.h>

/* Flag @key_id as used by setting its bit in @flow_dissector->used_keys. */
static void dissector_set_key(struct flow_dissector *flow_dissector,
                              enum flow_dissector_key_id key_id)
{
        const unsigned long long key_bit = 1ULL << key_id;

        flow_dissector->used_keys |= key_bit;
}

/**
 * skb_flow_dissector_init - build a flow dissector from a table of keys
 * @flow_dissector: dissector to set up; it is zeroed before use
 * @key: first entry of the key table (key id plus target offset)
 * @key_count: number of entries in @key
 *
 * Every entry is recorded as a used key together with its offset.
 * The function BUGs when an offset exceeds USHRT_MAX, when a key id appears
 * twice, or when the table lacks the CONTROL or the BASIC key.
 */
void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
                             const struct flow_dissector_key *key,
                             unsigned int key_count)
{
        const struct flow_dissector_key *entry = key;
        unsigned int remaining;

        memset(flow_dissector, 0, sizeof(*flow_dissector));

        for (remaining = key_count; remaining; remaining--, entry++) {
                /* Callers must keep every target offset within the range
                 * of an unsigned short.
                 */
                BUG_ON(entry->offset > USHRT_MAX);
                /* A key id may be listed only once. */
                BUG_ON(dissector_uses_key(flow_dissector, entry->key_id));

                flow_dissector->offset[entry->key_id] = entry->offset;
                dissector_set_key(flow_dissector, entry->key_id);
        }

        /* The control and basic keys are mandatory, so that the fast path
         * never has to cope with their absence.
         */
        BUG_ON(!dissector_uses_key(flow_dissector,
                                   FLOW_DISSECTOR_KEY_CONTROL));
        BUG_ON(!dissector_uses_key(flow_dissector,
                                   FLOW_DISSECTOR_KEY_BASIC));
}
EXPORT_SYMBOL(skb_flow_dissector_init);

#ifdef CONFIG_BPF_SYSCALL
/*
 * Check whether a BPF flow dissector may be attached to @net.
 *
 * A program attached to the root namespace (init_net) overrides any
 * per-namespace one, so the two kinds of attachment exclude each other:
 *  - attaching to init_net fails if any other namespace has a program;
 *  - attaching to any other namespace fails if init_net has a program.
 *
 * @prog is not inspected here.
 *
 * Returns 0 when attaching is allowed, -EEXIST otherwise.
 */
int flow_dissector_bpf_prog_attach_check(struct net *net,
                                         struct bpf_prog *prog)
{
        enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
        struct net *ns;

        if (net != &init_net) {
                /* Non-root namespace: only the root dissector conflicts. */
                if (rcu_access_pointer(init_net.bpf.run_array[type]))
                        return -EEXIST;

                return 0;
        }

        /* Root namespace: no other namespace may have a program attached. */
        for_each_net(ns) {
                if (ns != &init_net &&
                    rcu_access_pointer(ns->bpf.run_array[type]))
                        return -EEXIST;
        }

        return 0;
}
#endif /* CONFIG_BPF_SYSCALL */

/**
 * __skb_flow_get_ports - read the upper layer ports of a packet
 * @skb: sk_buff to extract the ports from
 * @thoff: transport header offset
 * @ip_proto: protocol for which to get port offset
 * @data: raw buffer pointer to the packet, if NULL use skb->data
 * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
 *
 * The ports are looked up at offset @thoff + poff, where poff is the
 * protocol port offset returned by proto_ports_offset().
 *
 * Return: the 32 bits found at that offset, or 0 when poff is negative or
 * the bytes cannot be obtained.
 */
__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
                            const void *data, int hlen)
{
        __be32 *found, scratch;
        int poff;

        poff = proto_ports_offset(ip_proto);

        if (!data) {
                data = skb->data;
                hlen = skb_headlen(skb);
        }

        if (poff < 0)
                return 0;

        found = __skb_header_pointer(skb, thoff + poff, sizeof(scratch),
                                     data, hlen, &scratch);

        return found ? *found : 0;
}
EXPORT_SYMBOL(__skb_flow_get_ports);

/* Tell whether @type is one of the ICMP or ICMPv6 echo/timestamp message
 * types listed below, i.e. the ones whose Identifier field is extracted by
 * the dissector.  Both the ICMP_* and the ICMPV6_* values are checked
 * against the same @type.
 */
static bool icmp_has_id(u8 type)
{
        return type == ICMP_ECHO ||
               type == ICMP_ECHOREPLY ||
               type == ICMP_TIMESTAMP ||
               type == ICMP_TIMESTAMPREPLY ||
               type == ICMPV6_ECHO_REQUEST ||
               type == ICMPV6_ECHO_REPLY;
}

/**
 * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields
 * @skb: sk_buff to extract from
 * @key_icmp: struct flow_dissector_key_icmp to fill
 * @data: raw buffer pointer to the packet
 * @thoff: offset to extract at
 * @hlen: packet header length
 */
void skb_flow_get_icmp_tci(const struct sk_buff *skb,
                           struct flow_dissector_key_icmp *key_icmp,
                           const void *data, int thoff, int hlen)
{
        struct icmphdr *ih, _ih;

        ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih);
        if (!ih)
                return;

        key_icmp->type = ih->type;
        key_icmp->code = ih->code;

        /* As we use 0 to signal that the Id field is not present,
         * avoid confusion with packets without such field
         */
        if (icmp_has_id(ih->type))
                key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1;
        else
                key_icmp->id = 0;
}
EXPORT_SYMBOL(skb_flow_get_icmp_tci);

/* Fill the ICMP key through skb_flow_get_icmp_tci(), but only when the
 * dissector has FLOW_DISSECTOR_KEY_ICMP in use.
 */
static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
                                    struct flow_dissector *flow_dissector,
                                    void *target_container, const void *data,
                                    int thoff, int hlen)
{
        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) {
                struct flow_dissector_key_icmp *icmp_key;

                icmp_key = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_ICMP,
                                                     target_container);

                skb_flow_get_icmp_tci(skb, icmp_key, data, thoff, hlen);
        }
}

static void __skb_flow_dissect_ah(const struct sk_buff *skb,
                                  struct flow_dissector *flow_dissector,
                                  void *target_container, const void *data,
                                  int nhoff, int hlen)
{
        struct flow_dissector_key_ipsec *key_ah;
        struct ip_auth_hdr _hdr, *hdr;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC))
                return;

        hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
        if (!hdr)
                return;

        key_ah = skb_flow_dissector_target(flow_dissector,
                                           FLOW_DISSECTOR_KEY_IPSEC,
                                           target_container);

        key_ah->spi = hdr->spi;
}

static void __skb_flow_dissect_esp(const struct sk_buff *skb,
                                   struct flow_dissector *flow_dissector,
                                   void *target_container, const void *data,
                                   int nhoff, int hlen)
{
        struct flow_dissector_key_ipsec *key_esp;
        struct ip_esp_hdr _hdr, *hdr;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC))
                return;

        hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
        if (!hdr)
                return;

        key_esp = skb_flow_dissector_target(flow_dissector,
                                            FLOW_DISSECTOR_KEY_IPSEC,
                                            target_container);

        key_esp->spi = hdr->spi;
}

/* Copy the 32-bit session id found at @nhoff into the L2TPV3 key.
 * Nothing is written when the L2TPV3 key is not in use or the bytes cannot
 * be obtained.
 */
static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb,
                                      struct flow_dissector *flow_dissector,
                                      void *target_container, const void *data,
                                      int nhoff, int hlen)
{
        /* Only the leading session id of the header is read. */
        struct {
                __be32 session_id;
        } *l2tp, l2tp_buf;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3))
                return;

        l2tp = __skb_header_pointer(skb, nhoff, sizeof(l2tp_buf), data, hlen,
                                    &l2tp_buf);
        if (l2tp) {
                struct flow_dissector_key_l2tpv3 *l2tp_key;

                l2tp_key = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_L2TPV3,
                                                     target_container);
                l2tp_key->session_id = l2tp->session_id;
        }
}

/**
 * skb_flow_dissect_meta - fill the META key from skb metadata
 * @skb: packet to read the metadata from
 * @flow_dissector: dissector describing which keys are wanted
 * @target_container: container the META key is written into
 *
 * Stores skb->skb_iif as the ingress ifindex.  With CONFIG_NET_TC_SKB_EXT,
 * and when tc_skb_ext_tc_enabled() is true and the skb has a TC_SKB_EXT
 * extension, the extension's l2_miss value is stored as well.
 * Does nothing when FLOW_DISSECTOR_KEY_META is not in use.
 */
void skb_flow_dissect_meta(const struct sk_buff *skb,
                           struct flow_dissector *flow_dissector,
                           void *target_container)
{
        struct flow_dissector_key_meta *meta_key;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META))
                return;

        meta_key = skb_flow_dissector_target(flow_dissector,
                                             FLOW_DISSECTOR_KEY_META,
                                             target_container);
        meta_key->ingress_ifindex = skb->skb_iif;
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
        if (tc_skb_ext_tc_enabled()) {
                struct tc_skb_ext *tc_ext = skb_ext_find(skb, TC_SKB_EXT);

                if (tc_ext)
                        meta_key->l2_miss = tc_ext->l2_miss;
        }
#endif
}
EXPORT_SYMBOL(skb_flow_dissect_meta);

/* Record @type as the address type of the ENC_CONTROL key, provided the
 * dissector has that key in use.
 */
static void
skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
                                   struct flow_dissector *flow_dissector,
                                   void *target_container)
{
        if (dissector_uses_key(flow_dissector,
                               FLOW_DISSECTOR_KEY_ENC_CONTROL)) {
                struct flow_dissector_key_control *enc_ctrl;

                enc_ctrl = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_ENC_CONTROL,
                                                     target_container);
                enc_ctrl->addr_type = type;
        }
}

/**
 * skb_flow_dissect_ct - fill the CT key from the skb's conntrack entry
 * @skb: packet whose conntrack entry (nf_ct_get()) is examined
 * @flow_dissector: dissector describing which keys are wanted
 * @target_container: container the CT key is written into
 * @ctinfo_map: table translating an ip_conntrack_info value to a ct_state
 * @mapsize: number of entries in @ctinfo_map
 * @post_ct: when the skb has no conntrack entry, report it as
 *           TRACKED | INVALID in @zone instead of leaving the key untouched
 * @zone: zone stored in the key in the @post_ct case above
 *
 * Compiles to an empty function without CONFIG_NF_CONNTRACK.  The zone and
 * mark of an existing entry are only copied when CONFIG_NF_CONNTRACK_ZONES
 * and CONFIG_NF_CONNTRACK_MARK, respectively, are enabled.
 */
void
skb_flow_dissect_ct(const struct sk_buff *skb,
                    struct flow_dissector *flow_dissector,
                    void *target_container, u16 *ctinfo_map,
                    size_t mapsize, bool post_ct, u16 zone)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        struct flow_dissector_key_ct *ct_key;
        struct nf_conn_labels *labels;
        enum ip_conntrack_info ctinfo;
        struct nf_conn *ct;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT))
                return;

        ct = nf_ct_get(skb, &ctinfo);
        if (!ct) {
                if (!post_ct)
                        return;

                ct_key = skb_flow_dissector_target(flow_dissector,
                                                   FLOW_DISSECTOR_KEY_CT,
                                                   target_container);
                ct_key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
                                   TCA_FLOWER_KEY_CT_FLAGS_INVALID;
                ct_key->ct_zone = zone;
                return;
        }

        ct_key = skb_flow_dissector_target(flow_dissector,
                                           FLOW_DISSECTOR_KEY_CT,
                                           target_container);

        /* ctinfo values outside the map leave ct_state as it was */
        if (ctinfo < mapsize)
                ct_key->ct_state = ctinfo_map[ctinfo];
#if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)
        ct_key->ct_zone = ct->zone.id;
#endif
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
        ct_key->ct_mark = READ_ONCE(ct->mark);
#endif

        labels = nf_ct_labels_find(ct);
        if (labels)
                memcpy(ct_key->ct_labels, labels->bits,
                       sizeof(ct_key->ct_labels));
#endif /* CONFIG_NF_CONNTRACK */
}
EXPORT_SYMBOL(skb_flow_dissect_ct);

/**
 * skb_flow_dissect_tunnel_info - copy tunnel metadata into the ENC_* keys
 * @skb: packet whose tunnel info (skb_tunnel_info()) is examined
 * @flow_dissector: dissector describing which ENC_* keys are wanted
 * @target_container: container the requested keys are written into
 *
 * Does nothing when none of the ENC_* keys is in use or when @skb carries no
 * tunnel info.  Each requested key is filled independently of the others; in
 * particular, a tunnel without options still gets its ENC_FLAGS key filled.
 */
void
skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
                             struct flow_dissector *flow_dissector,
                             void *target_container)
{
        struct ip_tunnel_info *info;
        struct ip_tunnel_key *key;

        /* A quick check to see if there might be something to do. */
        if (!dissector_uses_key(flow_dissector,
                                FLOW_DISSECTOR_KEY_ENC_KEYID) &&
            !dissector_uses_key(flow_dissector,
                                FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) &&
            !dissector_uses_key(flow_dissector,
                                FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) &&
            !dissector_uses_key(flow_dissector,
                                FLOW_DISSECTOR_KEY_ENC_CONTROL) &&
            !dissector_uses_key(flow_dissector,
                                FLOW_DISSECTOR_KEY_ENC_PORTS) &&
            !dissector_uses_key(flow_dissector,
                                FLOW_DISSECTOR_KEY_ENC_IP) &&
            !dissector_uses_key(flow_dissector,
                                FLOW_DISSECTOR_KEY_ENC_OPTS) &&
            !dissector_uses_key(flow_dissector,
                                FLOW_DISSECTOR_KEY_ENC_FLAGS))
                return;

        info = skb_tunnel_info(skb);
        if (!info)
                return;

        key = &info->key;

        /* Outer addresses: the address type goes into ENC_CONTROL, the
         * addresses themselves into the matching ENC_IPV{4,6}_ADDRS key.
         * Any other address family leaves these keys untouched.
         */
        switch (ip_tunnel_info_af(info)) {
        case AF_INET:
                skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                                                   flow_dissector,
                                                   target_container);
                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) {
                        struct flow_dissector_key_ipv4_addrs *ipv4;

                        ipv4 = skb_flow_dissector_target(flow_dissector,
                                                         FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
                                                         target_container);
                        ipv4->src = key->u.ipv4.src;
                        ipv4->dst = key->u.ipv4.dst;
                }
                break;
        case AF_INET6:
                skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                                                   flow_dissector,
                                                   target_container);
                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) {
                        struct flow_dissector_key_ipv6_addrs *ipv6;

                        ipv6 = skb_flow_dissector_target(flow_dissector,
                                                         FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS,
                                                         target_container);
                        ipv6->src = key->u.ipv6.src;
                        ipv6->dst = key->u.ipv6.dst;
                }
                break;
        }

        /* Tunnel id, converted to its 32-bit key form. */
        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) {
                struct flow_dissector_key_keyid *keyid;

                keyid = skb_flow_dissector_target(flow_dissector,
                                                  FLOW_DISSECTOR_KEY_ENC_KEYID,
                                                  target_container);
                keyid->keyid = tunnel_id_to_key32(key->tun_id);
        }

        /* Outer transport ports. */
        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) {
                struct flow_dissector_key_ports *tp;

                tp = skb_flow_dissector_target(flow_dissector,
                                               FLOW_DISSECTOR_KEY_ENC_PORTS,
                                               target_container);
                tp->src = key->tp_src;
                tp->dst = key->tp_dst;
        }

        /* Outer TOS and TTL. */
        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) {
                struct flow_dissector_key_ip *ip;

                ip = skb_flow_dissector_target(flow_dissector,
                                               FLOW_DISSECTOR_KEY_ENC_IP,
                                               target_container);
                ip->tos = key->tos;
                ip->ttl = key->ttl;
        }

        /* Tunnel options are only reported when the tunnel carries some.
         * A tunnel without options must not make the function return here:
         * the ENC_FLAGS key below is filled whether or not options exist.
         */
        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS) &&
            info->options_len) {
                struct flow_dissector_key_enc_opts *enc_opt;
                IP_TUNNEL_DECLARE_FLAGS(flags) = { };
                u32 val;

                enc_opt = skb_flow_dissector_target(flow_dissector,
                                                    FLOW_DISSECTOR_KEY_ENC_OPTS,
                                                    target_container);

                enc_opt->len = info->options_len;
                ip_tunnel_info_opts_get(enc_opt->data, info);

                /* Keep only the "options present" bits of the tunnel flags
                 * and report the first one found at or after the GENEVE
                 * option bit; 0 when none is set.
                 */
                ip_tunnel_set_options_present(flags);
                ip_tunnel_flags_and(flags, info->key.tun_flags, flags);

                val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM,
                                    IP_TUNNEL_GENEVE_OPT_BIT);
                enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0;
        }

        /* Encapsulation flags: the tunnel flags masked by the set built by
         * ip_tunnel_set_encflags_present(), read as a 32-bit value starting
         * at IP_TUNNEL_CSUM_BIT.
         */
        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_FLAGS)) {
                struct flow_dissector_key_enc_flags *enc_flags;
                IP_TUNNEL_DECLARE_FLAGS(flags) = {};

                enc_flags = skb_flow_dissector_target(flow_dissector,
                                                      FLOW_DISSECTOR_KEY_ENC_FLAGS,
                                                      target_container);
                ip_tunnel_set_encflags_present(flags);
                ip_tunnel_flags_and(flags, flags, info->key.tun_flags);
                enc_flags->flags = bitmap_read(flags, IP_TUNNEL_CSUM_BIT, 32);
        }
}
EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);

/**
 * skb_flow_dissect_hash - store the skb's raw hash in the HASH key
 * @skb: packet whose hash is read with skb_get_hash_raw()
 * @flow_dissector: dissector describing which keys are wanted
 * @target_container: container the HASH key is written into
 *
 * Does nothing when FLOW_DISSECTOR_KEY_HASH is not in use.
 */
void skb_flow_dissect_hash(const struct sk_buff *skb,
                           struct flow_dissector *flow_dissector,
                           void *target_container)
{
        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_HASH)) {
                struct flow_dissector_key_hash *hash_key;

                hash_key = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_HASH,
                                                     target_container);
                hash_key->hash = skb_get_hash_raw(skb);
        }
}
EXPORT_SYMBOL(skb_flow_dissect_hash);

static enum flow_dissect_ret
__skb_flow_dissect_mpls(const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container, const void *data, int nhoff,
                        int hlen, int lse_index, bool *entropy_label)
{
        struct mpls_label *hdr, _hdr;
        u32 entry, label, bos;

        if (!dissector_uses_key(flow_dissector,
                                FLOW_DISSECTOR_KEY_MPLS_ENTROPY) &&
            !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS))
                return FLOW_DISSECT_RET_OUT_GOOD;

        if (lse_index >= FLOW_DIS_MPLS_MAX)
                return FLOW_DISSECT_RET_OUT_GOOD;

        hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
                                   hlen, &_hdr);
        if (!hdr)
                return FLOW_DISSECT_RET_OUT_BAD;

        entry = ntohl(hdr->entry);
        label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT;
        bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT;

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) {
                struct flow_dissector_key_mpls *key_mpls;
                struct flow_dissector_mpls_lse *lse;

                key_mpls = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_MPLS,
                                                     target_container);
                lse = &key_mpls->ls[lse_index];

                lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
                lse->mpls_bos = bos;
                lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT;
                lse->mpls_label = label;
                dissector_set_mpls_lse(key_mpls, lse_index);
        }

        if (*entropy_label &&
            dissector_uses_key(flow_dissector,
                               FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) {
                struct flow_dissector_key_keyid *key_keyid;

                key_keyid = skb_flow_dissector_target(flow_dissector,
                                                      FLOW_DISSECTOR_KEY_MPLS_ENTROPY,
                                                      target_container);
                key_keyid->keyid = cpu_to_be32(label);
        }

        *entropy_label = label == MPLS_LABEL_ENTROPY;

        return bos ? FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN;
}

static enum flow_dissect_ret
__skb_flow_dissect_arp(const struct sk_buff *skb,
                       struct flow_dissector *flow_dissector,
                       void *target_container, const void *data,
                       int nhoff, int hlen)
{
        struct flow_dissector_key_arp *key_arp;
        struct {
                unsigned char ar_sha[ETH_ALEN];
                unsigned char ar_sip[4];
                unsigned char ar_tha[ETH_ALEN];
                unsigned char ar_tip[4];
        } *arp_eth, _arp_eth;
        const struct arphdr *arp;
        struct arphdr _arp;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP))
                return FLOW_DISSECT_RET_OUT_GOOD;

        arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data,
                                   hlen, &_arp);
        if (!arp)
                return FLOW_DISSECT_RET_OUT_BAD;

        if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
            arp->ar_pro != htons(ETH_P_IP) ||
            arp->ar_hln != ETH_ALEN ||
            arp->ar_pln != 4 ||
            (arp->ar_op != htons(ARPOP_REPLY) &&
             arp->ar_op != htons(ARPOP_REQUEST)))
                return FLOW_DISSECT_RET_OUT_BAD;

        arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
                                       sizeof(_arp_eth), data,
                                       hlen, &_arp_eth);
        if (!arp_eth)
                return FLOW_DISSECT_RET_OUT_BAD;

        key_arp = skb_flow_dissector_target(flow_dissector,
                                            FLOW_DISSECTOR_KEY_ARP,
                                            target_container);

        memcpy(&key_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip));
        memcpy(&key_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip));

        /* Only store the lower byte of the opcode;
         * this covers ARPOP_REPLY and ARPOP_REQUEST.
         */
        key_arp->op = ntohs(arp->ar_op) & 0xff;

        ether_addr_copy(key_arp->sha, arp_eth->ar_sha);
        ether_addr_copy(key_arp->tha, arp_eth->ar_tha);

        return FLOW_DISSECT_RET_OUT_GOOD;
}

static enum flow_dissect_ret
__skb_flow_dissect_cfm(const struct sk_buff *skb,
                       struct flow_dissector *flow_dissector,
                       void *target_container, const void *data,
                       int nhoff, int hlen)
{
        struct flow_dissector_key_cfm *key, *hdr, _hdr;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CFM))
                return FLOW_DISSECT_RET_OUT_GOOD;

        hdr = __skb_header_pointer(skb, nhoff, sizeof(*key), data, hlen, &_hdr);
        if (!hdr)
                return FLOW_DISSECT_RET_OUT_BAD;

        key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CFM,
                                        target_container);

        key->mdl_ver = hdr->mdl_ver;
        key->opcode = hdr->opcode;

        return FLOW_DISSECT_RET_OUT_GOOD;
}

/**
 * __skb_flow_dissect_gre() - dissect a GRE header (version 0 or 1)
 * @skb: sk_buff handed to __skb_header_pointer()
 * @key_control: flow dissectors control key; gets FLOW_DIS_ENCAPSULATION set
 *               once the GRE header has been walked successfully
 * @flow_dissector: list of keys to dissect
 * @target_container: target structure to put dissected values into
 * @data: raw buffer pointer handed to __skb_header_pointer()
 * @p_proto: pointer used to update the protocol to process next
 * @p_nhoff: on entry the offset of the GRE header; advanced past every header
 *           consumed here when the walk succeeds
 * @p_hlen: packet header length; may be lowered in the ETH_P_TEB case when
 *          NET_IP_ALIGN is non-zero
 * @flags: any combination of FLOW_DISSECTOR_F_*
 *
 * Walks the base header and the optional checksum, key, sequence and (for
 * version 1) acknowledgment fields, accumulating their sizes in a local
 * offset. For version 0 with ETH_P_TEB the inner Ethernet header is consumed
 * as well; for version 1 the PPP header is consumed and PPP_IP / PPP_IPV6 are
 * translated to the matching ETH_P_* value.
 *
 * Return: FLOW_DISSECT_RET_OUT_BAD when __skb_header_pointer() cannot provide
 *  a needed header, FLOW_DISSECT_RET_OUT_GOOD when the header is not looked
 *  into (routing present, version > 1, version 1 that is not PPP with a key)
 *  or when FLOW_DISSECTOR_F_STOP_AT_ENCAP is set, otherwise
 *  FLOW_DISSECT_RET_PROTO_AGAIN
 */
static enum flow_dissect_ret
__skb_flow_dissect_gre(const struct sk_buff *skb,
                       struct flow_dissector_key_control *key_control,
                       struct flow_dissector *flow_dissector,
                       void *target_container, const void *data,
                       __be16 *p_proto, int *p_nhoff, int *p_hlen,
                       unsigned int flags)
{
        struct flow_dissector_key_keyid *key_keyid;
        struct gre_base_hdr *hdr, _hdr;
        int offset = 0;
        u16 gre_ver;

        hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr),
                                   data, *p_hlen, &_hdr);
        if (!hdr)
                return FLOW_DISSECT_RET_OUT_BAD;

        /* Only look inside GRE without routing */
        if (hdr->flags & GRE_ROUTING)
                return FLOW_DISSECT_RET_OUT_GOOD;

        /* Only look inside GRE for version 0 and 1 */
        gre_ver = ntohs(hdr->flags & GRE_VERSION);
        if (gre_ver > 1)
                return FLOW_DISSECT_RET_OUT_GOOD;

        /* Note: *p_proto is overwritten before the version 1 check below, so
         * it stays modified even when that check returns early.
         */
        *p_proto = hdr->protocol;
        if (gre_ver) {
                /* Version1 must be PPTP, and check the flags */
                if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY)))
                        return FLOW_DISSECT_RET_OUT_GOOD;
        }

        /* From here on, offset counts the bytes consumed after *p_nhoff. */
        offset += sizeof(struct gre_base_hdr);

        /* Skip the csum and reserved1 fields when the checksum flag is set. */
        if (hdr->flags & GRE_CSUM)
                offset += sizeof_field(struct gre_full_hdr, csum) +
                          sizeof_field(struct gre_full_hdr, reserved1);

        if (hdr->flags & GRE_KEY) {
                const __be32 *keyid;
                __be32 _keyid;

                /* The key is fetched (and a failure reported) even when the
                 * dissector does not use FLOW_DISSECTOR_KEY_GRE_KEYID.
                 */
                keyid = __skb_header_pointer(skb, *p_nhoff + offset,
                                             sizeof(_keyid),
                                             data, *p_hlen, &_keyid);
                if (!keyid)
                        return FLOW_DISSECT_RET_OUT_BAD;

                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_GRE_KEYID)) {
                        key_keyid = skb_flow_dissector_target(flow_dissector,
                                                              FLOW_DISSECTOR_KEY_GRE_KEYID,
                                                              target_container);
                        /* Version 0 stores the whole key; version 1 stores
                         * only the bits selected by GRE_PPTP_KEY_MASK.
                         */
                        if (gre_ver == 0)
                                key_keyid->keyid = *keyid;
                        else
                                key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK;
                }
                offset += sizeof_field(struct gre_full_hdr, key);
        }

        /* The sequence field size is taken from struct pptp_gre_header for
         * both versions.
         */
        if (hdr->flags & GRE_SEQ)
                offset += sizeof_field(struct pptp_gre_header, seq);

        if (gre_ver == 0) {
                /* Transparent Ethernet bridging: continue with the protocol
                 * of the inner Ethernet header.
                 */
                if (*p_proto == htons(ETH_P_TEB)) {
                        const struct ethhdr *eth;
                        struct ethhdr _eth;

                        eth = __skb_header_pointer(skb, *p_nhoff + offset,
                                                   sizeof(_eth),
                                                   data, *p_hlen, &_eth);
                        if (!eth)
                                return FLOW_DISSECT_RET_OUT_BAD;
                        *p_proto = eth->h_proto;
                        offset += sizeof(*eth);

                        /* Cap headers that we access via pointers at the
                         * end of the Ethernet header as our maximum alignment
                         * at that point is only 2 bytes.
                         */
                        if (NET_IP_ALIGN)
                                *p_hlen = *p_nhoff + offset;
                }
        } else { /* version 1, must be PPTP */
                u8 _ppp_hdr[PPP_HDRLEN];
                u8 *ppp_hdr;

                if (hdr->flags & GRE_ACK)
                        offset += sizeof_field(struct pptp_gre_header, ack);

                ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset,
                                               sizeof(_ppp_hdr),
                                               data, *p_hlen, _ppp_hdr);
                if (!ppp_hdr)
                        return FLOW_DISSECT_RET_OUT_BAD;

                /* Unrecognised PPP protocols leave *p_proto at the value
                 * copied from hdr->protocol above.
                 */
                switch (PPP_PROTOCOL(ppp_hdr)) {
                case PPP_IP:
                        *p_proto = htons(ETH_P_IP);
                        break;
                case PPP_IPV6:
                        *p_proto = htons(ETH_P_IPV6);
                        break;
                default:
                        /* Could probably catch some more like MPLS */
                        break;
                }

                offset += PPP_HDRLEN;
        }

        /* Publish the new header offset only after every read succeeded. */
        *p_nhoff += offset;
        key_control->flags |= FLOW_DIS_ENCAPSULATION;
        if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
                return FLOW_DISSECT_RET_OUT_GOOD;

        return FLOW_DISSECT_RET_PROTO_AGAIN;
}

/**
 * __skb_flow_dissect_batadv() - dissect batman-adv header
 * @skb: sk_buff carrying the batman-adv header
 * @key_control: flow dissector control key
 * @data: raw buffer pointer to the packet, if NULL use skb->data
 * @p_proto: pointer used to update the protocol to process next
 * @p_nhoff: pointer used to update inner network header offset
 * @hlen: packet header length
 * @flags: any combination of FLOW_DISSECTOR_F_*
 *
 * Tries to dissect an ETH_P_BATMAN packet. Only &struct batadv_unicast
 * packets are processed: they carry an inner ethernet header which is usually
 * followed by the actual network header, so the flow dissector can continue
 * with the inner packet.
 *
 * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found,
 *  FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation,
 *  otherwise FLOW_DISSECT_RET_OUT_BAD
 */
static enum flow_dissect_ret
__skb_flow_dissect_batadv(const struct sk_buff *skb,
                          struct flow_dissector_key_control *key_control,
                          const void *data, __be16 *p_proto, int *p_nhoff,
                          int hlen, unsigned int flags)
{
        /* batman-adv unicast header immediately followed by the inner
         * ethernet header; both are fetched in one go.
         */
        struct {
                struct batadv_unicast_packet unicast;
                struct ethhdr inner_eth;
        } *pkt, _pkt;

        pkt = __skb_header_pointer(skb, *p_nhoff, sizeof(_pkt), data, hlen,
                                   &_pkt);
        if (!pkt)
                return FLOW_DISSECT_RET_OUT_BAD;

        if (pkt->unicast.version != BATADV_COMPAT_VERSION ||
            pkt->unicast.packet_type != BATADV_UNICAST)
                return FLOW_DISSECT_RET_OUT_BAD;

        /* Continue with whatever the inner ethernet header announces. */
        *p_proto = pkt->inner_eth.h_proto;
        *p_nhoff += sizeof(_pkt);

        key_control->flags |= FLOW_DIS_ENCAPSULATION;

        return (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) ?
                FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN;
}

static void
__skb_flow_dissect_tcp(const struct sk_buff *skb,
                       struct flow_dissector *flow_dissector,
                       void *target_container, const void *data,
                       int thoff, int hlen)
{
        struct flow_dissector_key_tcp *key_tcp;
        struct tcphdr *th, _th;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP))
                return;

        th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th);
        if (!th)
                return;

        if (unlikely(__tcp_hdrlen(th) < sizeof(_th)))
                return;

        key_tcp = skb_flow_dissector_target(flow_dissector,
                                            FLOW_DISSECTOR_KEY_TCP,
                                            target_container);
        key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF));
}

/* Store the transport ports returned by __skb_flow_get_ports() in the ports
 * target. FLOW_DISSECTOR_KEY_PORTS takes precedence; the
 * FLOW_DISSECTOR_KEY_PORTS_RANGE target is used only when the former is not
 * in use. Does nothing when the dissector uses neither key.
 */
static void
__skb_flow_dissect_ports(const struct sk_buff *skb,
                         struct flow_dissector *flow_dissector,
                         void *target_container, const void *data,
                         int nhoff, u8 ip_proto, int hlen)
{
        struct flow_dissector_key_ports *ports_key;
        enum flow_dissector_key_id ports_id;

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS))
                ports_id = FLOW_DISSECTOR_KEY_PORTS;
        else if (dissector_uses_key(flow_dissector,
                                    FLOW_DISSECTOR_KEY_PORTS_RANGE))
                ports_id = FLOW_DISSECTOR_KEY_PORTS_RANGE;
        else
                return;

        ports_key = skb_flow_dissector_target(flow_dissector, ports_id,
                                              target_container);
        ports_key->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
                                                data, hlen);
}

/* Copy the TOS and TTL fields of @iph into the FLOW_DISSECTOR_KEY_IP target,
 * provided the dissector uses that key.
 */
static void
__skb_flow_dissect_ipv4(const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container, const void *data,
                        const struct iphdr *iph)
{
        struct flow_dissector_key_ip *ip_key;

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) {
                ip_key = skb_flow_dissector_target(flow_dissector,
                                                   FLOW_DISSECTOR_KEY_IP,
                                                   target_container);
                ip_key->tos = iph->tos;
                ip_key->ttl = iph->ttl;
        }
}

/* IPv6 counterpart of the IPv4 helper: the value of ipv6_get_dsfield() is
 * stored as tos and the hop limit as ttl in the FLOW_DISSECTOR_KEY_IP target,
 * provided the dissector uses that key.
 */
static void
__skb_flow_dissect_ipv6(const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container, const void *data,
                        const struct ipv6hdr *iph)
{
        struct flow_dissector_key_ip *ip_key;

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) {
                ip_key = skb_flow_dissector_target(flow_dissector,
                                                   FLOW_DISSECTOR_KEY_IP,
                                                   target_container);
                ip_key->tos = ipv6_get_dsfield(iph);
                ip_key->ttl = iph->hop_limit;
        }
}

/* Upper bound on the number of protocol headers __skb_flow_dissect may
 * parse for a single packet.
 */
#define MAX_FLOW_DISSECT_HDRS        15

/* Account for one more parsed header in *@num_hdrs and report whether the
 * updated count is still within MAX_FLOW_DISSECT_HDRS.
 */
static bool skb_flow_dissect_allowed(int *num_hdrs)
{
        int parsed = *num_hdrs + 1;

        *num_hdrs = parsed;

        return parsed <= MAX_FLOW_DISSECT_HDRS;
}

static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
                                     struct flow_dissector *flow_dissector,
                                     void *target_container)
{
        struct flow_dissector_key_ports *key_ports = NULL;
        struct flow_dissector_key_control *key_control;
        struct flow_dissector_key_basic *key_basic;
        struct flow_dissector_key_addrs *key_addrs;
        struct flow_dissector_key_tags *key_tags;

        key_control = skb_flow_dissector_target(flow_dissector,
                                                FLOW_DISSECTOR_KEY_CONTROL,
                                                target_container);
        key_control->thoff = flow_keys->thoff;
        if (flow_keys->is_frag)
                key_control->flags |= FLOW_DIS_IS_FRAGMENT;
        if (flow_keys->is_first_frag)
                key_control->flags |= FLOW_DIS_FIRST_FRAG;
        if (flow_keys->is_encap)
                key_control->flags |= FLOW_DIS_ENCAPSULATION;

        key_basic = skb_flow_dissector_target(flow_dissector,
                                              FLOW_DISSECTOR_KEY_BASIC,
                                              target_container);
        key_basic->n_proto = flow_keys->n_proto;
        key_basic->ip_proto = flow_keys->ip_proto;

        if (flow_keys->addr_proto == ETH_P_IP &&
            dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
                key_addrs = skb_flow_dissector_target(flow_dissector,
                                                      FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                                                      target_container);
                key_addrs->v4addrs.src = flow_keys->ipv4_src;
                key_addrs->v4addrs.dst = flow_keys->ipv4_dst;
                key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
        } else if (flow_keys->addr_proto == ETH_P_IPV6 &&
                   dissector_uses_key(flow_dissector,
                                      FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
                key_addrs = skb_flow_dissector_target(flow_dissector,
                                                      FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                                                      target_container);
                memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src,
                       sizeof(key_addrs->v6addrs.src));
                memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst,
                       sizeof(key_addrs->v6addrs.dst));
                key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
        }

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS))
                key_ports = skb_flow_dissector_target(flow_dissector,
                                                      FLOW_DISSECTOR_KEY_PORTS,
                                                      target_container);
        else if (dissector_uses_key(flow_dissector,
                                    FLOW_DISSECTOR_KEY_PORTS_RANGE))
                key_ports = skb_flow_dissector_target(flow_dissector,
                                                      FLOW_DISSECTOR_KEY_PORTS_RANGE,
                                                      target_container);

        if (key_ports) {
                key_ports->src = flow_keys->sport;
                key_ports->dst = flow_keys->dport;
        }

        if (dissector_uses_key(flow_dissector,
                               FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
                key_tags = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_FLOW_LABEL,
                                                     target_container);
                key_tags->flow_label = ntohl(flow_keys->flow_label);
        }
}

/* Run the BPF flow dissector program @prog on @ctx.
 *
 * ctx->flow_keys is zeroed and seeded with @proto, @nhoff (also used as the
 * initial thoff) and @flags before the program runs. Afterwards the offsets
 * left in the keys are clamped, as u16 values, so that nhoff lies within
 * [@nhoff, @hlen] and thoff within [clamped nhoff, @hlen].
 *
 * Returns the value returned by bpf_prog_run_pin_on_cpu().
 */
u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
                     __be16 proto, int nhoff, int hlen, unsigned int flags)
{
        struct bpf_flow_keys *keys = ctx->flow_keys;
        u32 prog_ret;

        /* @flags is handed to the program unchanged, so the BPF-visible flag
         * values must equal the in-kernel ones.
         */
        BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG !=
                     (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG);
        BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL !=
                     (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
        BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP !=
                     (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP);

        /* Pass parameters to the BPF program */
        memset(keys, 0, sizeof(*keys));
        keys->n_proto = proto;
        keys->nhoff = nhoff;
        keys->thoff = keys->nhoff;
        keys->flags = flags;

        prog_ret = bpf_prog_run_pin_on_cpu(prog, ctx);

        /* Do not trust the offsets written by the program: bound them. */
        keys->nhoff = clamp_t(u16, keys->nhoff, nhoff, hlen);
        keys->thoff = clamp_t(u16, keys->thoff, keys->nhoff, hlen);

        return prog_ret;
}

/* A PPPoE session header is accepted only with ver 1, type 1 and code 0. */
static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr)
{
        if (hdr->ver != 1)
                return false;
        if (hdr->type != 1)
                return false;

        return hdr->code == 0;
}

/**
 * __skb_flow_dissect - extract the flow_keys struct and return it
 * @net: associated network namespace, derived from @skb if NULL
 * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
 * @flow_dissector: list of keys to dissect
 * @target_container: target structure to put dissected values into
 * @data: raw buffer pointer to the packet, if NULL use skb->data
 * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
 * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
 * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
 * @flags: flags that control the dissection process, e.g.
 *         FLOW_DISSECTOR_F_STOP_AT_ENCAP.
 *
 * The function will try to retrieve individual keys into the target specified
 * by @flow_dissector, from either the skbuff or a raw buffer specified by the
 * remaining parameters.
 *
 * Caller must take care of zeroing target container memory.
 */
bool __skb_flow_dissect(const struct net *net,
                        const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container, const void *data,
                        __be16 proto, int nhoff, int hlen, unsigned int flags)
{
        struct flow_dissector_key_control *key_control;
        struct flow_dissector_key_basic *key_basic;
        struct flow_dissector_key_addrs *key_addrs;
        struct flow_dissector_key_tags *key_tags;
        struct flow_dissector_key_vlan *key_vlan;
        enum flow_dissect_ret fdret;
        enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
        bool mpls_el = false;
        int mpls_lse = 0;
        int num_hdrs = 0;
        u8 ip_proto = 0;
        bool ret;

        if (!data) {
                data = skb->data;
                proto = skb_vlan_tag_present(skb) ?
                         skb->vlan_proto : skb->protocol;
                nhoff = skb_network_offset(skb);
                hlen = skb_headlen(skb);
#if IS_ENABLED(CONFIG_NET_DSA)
                if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) &&
                             proto == htons(ETH_P_XDSA))) {
                        struct metadata_dst *md_dst = skb_metadata_dst(skb);
                        const struct dsa_device_ops *ops;
                        int offset = 0;

                        ops = skb->dev->dsa_ptr->tag_ops;
                        /* Only DSA header taggers break flow dissection */
                        if (ops->needed_headroom &&
                            (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)) {
                                if (ops->flow_dissect)
                                        ops->flow_dissect(skb, &proto, &offset);
                                else
                                        dsa_tag_generic_flow_dissect(skb,
                                                                     &proto,
                                                                     &offset);
                                hlen -= offset;
                                nhoff += offset;
                        }
                }
#endif
        }

        /* It is ensured by skb_flow_dissector_init() that control key will
         * be always present.
         */
        key_control = skb_flow_dissector_target(flow_dissector,
                                                FLOW_DISSECTOR_KEY_CONTROL,
                                                target_container);

        /* It is ensured by skb_flow_dissector_init() that basic key will
         * be always present.
         */
        key_basic = skb_flow_dissector_target(flow_dissector,
                                              FLOW_DISSECTOR_KEY_BASIC,
                                              target_container);

        if (skb) {
                if (!net) {
                        if (skb->dev)
                                net = dev_net(skb->dev);
                        else if (skb->sk)
                                net = sock_net(skb->sk);
                }
        }

        WARN_ON_ONCE(!net);
        if (net) {
                enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
                struct bpf_prog_array *run_array;

                rcu_read_lock();
                run_array = rcu_dereference(init_net.bpf.run_array[type]);
                if (!run_array)
                        run_array = rcu_dereference(net->bpf.run_array[type]);

                if (run_array) {
                        struct bpf_flow_keys flow_keys;
                        struct bpf_flow_dissector ctx = {
                                .flow_keys = &flow_keys,
                                .data = data,
                                .data_end = data + hlen,
                        };
                        __be16 n_proto = proto;
                        struct bpf_prog *prog;
                        u32 result;

                        if (skb) {
                                ctx.skb = skb;
                                /* we can't use 'proto' in the skb case
                                 * because it might be set to skb->vlan_proto
                                 * which has been pulled from the data
                                 */
                                n_proto = skb->protocol;
                        }

                        prog = READ_ONCE(run_array->items[0].prog);
                        result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
                                                  hlen, flags);
                        if (result == BPF_FLOW_DISSECTOR_CONTINUE)
                                goto dissect_continue;
                        __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
                                                 target_container);
                        rcu_read_unlock();
                        return result == BPF_OK;
                }
dissect_continue:
                rcu_read_unlock();
        }

        if (dissector_uses_key(flow_dissector,
                               FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
                struct ethhdr *eth = eth_hdr(skb);
                struct flow_dissector_key_eth_addrs *key_eth_addrs;

                key_eth_addrs = skb_flow_dissector_target(flow_dissector,
                                                          FLOW_DISSECTOR_KEY_ETH_ADDRS,
                                                          target_container);
                memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs));
        }

        if (dissector_uses_key(flow_dissector,
                               FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) {
                struct flow_dissector_key_num_of_vlans *key_num_of_vlans;

                key_num_of_vlans = skb_flow_dissector_target(flow_dissector,
                                                             FLOW_DISSECTOR_KEY_NUM_OF_VLANS,
                                                             target_container);
                key_num_of_vlans->num_of_vlans = 0;
        }

proto_again:
        fdret = FLOW_DISSECT_RET_CONTINUE;

        switch (proto) {
        case htons(ETH_P_IP): {
                const struct iphdr *iph;
                struct iphdr _iph;

                iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
                if (!iph || iph->ihl < 5) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                nhoff += iph->ihl * 4;

                ip_proto = iph->protocol;

                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
                        key_addrs = skb_flow_dissector_target(flow_dissector,
                                                              FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                                                              target_container);

                        memcpy(&key_addrs->v4addrs.src, &iph->saddr,
                               sizeof(key_addrs->v4addrs.src));
                        memcpy(&key_addrs->v4addrs.dst, &iph->daddr,
                               sizeof(key_addrs->v4addrs.dst));
                        key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                }

                __skb_flow_dissect_ipv4(skb, flow_dissector,
                                        target_container, data, iph);

                if (ip_is_fragment(iph)) {
                        key_control->flags |= FLOW_DIS_IS_FRAGMENT;

                        if (iph->frag_off & htons(IP_OFFSET)) {
                                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                                break;
                        } else {
                                key_control->flags |= FLOW_DIS_FIRST_FRAG;
                                if (!(flags &
                                      FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) {
                                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                                        break;
                                }
                        }
                }

                break;
        }
        case htons(ETH_P_IPV6): {
                const struct ipv6hdr *iph;
                struct ipv6hdr _iph;

                iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
                if (!iph) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                ip_proto = iph->nexthdr;
                nhoff += sizeof(struct ipv6hdr);

                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
                        key_addrs = skb_flow_dissector_target(flow_dissector,
                                                              FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                                                              target_container);

                        memcpy(&key_addrs->v6addrs.src, &iph->saddr,
                               sizeof(key_addrs->v6addrs.src));
                        memcpy(&key_addrs->v6addrs.dst, &iph->daddr,
                               sizeof(key_addrs->v6addrs.dst));
                        key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                }

                if ((dissector_uses_key(flow_dissector,
                                        FLOW_DISSECTOR_KEY_FLOW_LABEL) ||
                     (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) &&
                    ip6_flowlabel(iph)) {
                        __be32 flow_label = ip6_flowlabel(iph);

                        if (dissector_uses_key(flow_dissector,
                                               FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
                                key_tags = skb_flow_dissector_target(flow_dissector,
                                                                     FLOW_DISSECTOR_KEY_FLOW_LABEL,
                                                                     target_container);
                                key_tags->flow_label = ntohl(flow_label);
                        }
                        if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) {
                                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                                break;
                        }
                }

                __skb_flow_dissect_ipv6(skb, flow_dissector,
                                        target_container, data, iph);

                break;
        }
        case htons(ETH_P_8021AD):
        case htons(ETH_P_8021Q): {
                const struct vlan_hdr *vlan = NULL;
                struct vlan_hdr _vlan;
                __be16 saved_vlan_tpid = proto;

                if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX &&
                    skb && skb_vlan_tag_present(skb)) {
                        proto = skb->protocol;
                } else {
                        vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan),
                                                    data, hlen, &_vlan);
                        if (!vlan) {
                                fdret = FLOW_DISSECT_RET_OUT_BAD;
                                break;
                        }

                        proto = vlan->h_vlan_encapsulated_proto;
                        nhoff += sizeof(*vlan);
                }

                if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) &&
                    !(key_control->flags & FLOW_DIS_ENCAPSULATION)) {
                        struct flow_dissector_key_num_of_vlans *key_nvs;

                        key_nvs = skb_flow_dissector_target(flow_dissector,
                                                            FLOW_DISSECTOR_KEY_NUM_OF_VLANS,
                                                            target_container);
                        key_nvs->num_of_vlans++;
                }

                if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) {
                        dissector_vlan = FLOW_DISSECTOR_KEY_VLAN;
                } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) {
                        dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN;
                } else {
                        fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                        break;
                }

                if (dissector_uses_key(flow_dissector, dissector_vlan)) {
                        key_vlan = skb_flow_dissector_target(flow_dissector,
                                                             dissector_vlan,
                                                             target_container);

                        if (!vlan) {
                                key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
                                key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb);
                        } else {
                                key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) &
                                        VLAN_VID_MASK;
                                key_vlan->vlan_priority =
                                        (ntohs(vlan->h_vlan_TCI) &
                                         VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
                        }
                        key_vlan->vlan_tpid = saved_vlan_tpid;
                        key_vlan->vlan_eth_type = proto;
                }

                fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                break;
        }
        case htons(ETH_P_PPP_SES): {
                struct {
                        struct pppoe_hdr hdr;
                        __be16 proto;
                } *hdr, _hdr;
                u16 ppp_proto;

                hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
                if (!hdr) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                if (!is_pppoe_ses_hdr_valid(&hdr->hdr)) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                /* least significant bit of the most significant octet
                 * indicates if protocol field was compressed
                 */
                ppp_proto = ntohs(hdr->proto);
                if (ppp_proto & 0x0100) {
                        ppp_proto = ppp_proto >> 8;
                        nhoff += PPPOE_SES_HLEN - 1;
                } else {
                        nhoff += PPPOE_SES_HLEN;
                }

                if (ppp_proto == PPP_IP) {
                        proto = htons(ETH_P_IP);
                        fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                } else if (ppp_proto == PPP_IPV6) {
                        proto = htons(ETH_P_IPV6);
                        fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                } else if (ppp_proto == PPP_MPLS_UC) {
                        proto = htons(ETH_P_MPLS_UC);
                        fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                } else if (ppp_proto == PPP_MPLS_MC) {
                        proto = htons(ETH_P_MPLS_MC);
                        fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                } else if (ppp_proto_is_valid(ppp_proto)) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                } else {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_PPPOE)) {
                        struct flow_dissector_key_pppoe *key_pppoe;

                        key_pppoe = skb_flow_dissector_target(flow_dissector,
                                                              FLOW_DISSECTOR_KEY_PPPOE,
                                                              target_container);
                        key_pppoe->session_id = hdr->hdr.sid;
                        key_pppoe->ppp_proto = htons(ppp_proto);
                        key_pppoe->type = htons(ETH_P_PPP_SES);
                }
                break;
        }
        case htons(ETH_P_TIPC): {
                struct tipc_basic_hdr *hdr, _hdr;

                hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr),
                                           data, hlen, &_hdr);
                if (!hdr) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_TIPC)) {
                        key_addrs = skb_flow_dissector_target(flow_dissector,
                                                              FLOW_DISSECTOR_KEY_TIPC,
                                                              target_container);
                        key_addrs->tipckey.key = tipc_hdr_rps_key(hdr);
                        key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC;
                }
                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                break;
        }

        case htons(ETH_P_MPLS_UC):
        case htons(ETH_P_MPLS_MC):
                fdret = __skb_flow_dissect_mpls(skb, flow_dissector,
                                                target_container, data,
                                                nhoff, hlen, mpls_lse,
                                                &mpls_el);
                nhoff += sizeof(struct mpls_label);
                mpls_lse++;
                break;
        case htons(ETH_P_FCOE):
                if ((hlen - nhoff) < FCOE_HEADER_LEN) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                nhoff += FCOE_HEADER_LEN;
                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                break;

        case htons(ETH_P_ARP):
        case htons(ETH_P_RARP):
                fdret = __skb_flow_dissect_arp(skb, flow_dissector,
                                               target_container, data,
                                               nhoff, hlen);
                break;

        case htons(ETH_P_BATMAN):
                fdret = __skb_flow_dissect_batadv(skb, key_control, data,
                                                  &proto, &nhoff, hlen, flags);
                break;

        case htons(ETH_P_1588): {
                struct ptp_header *hdr, _hdr;

                hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
                                           hlen, &_hdr);
                if (!hdr) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                nhoff += sizeof(struct ptp_header);
                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                break;
        }

        case htons(ETH_P_PRP):
        case htons(ETH_P_HSR): {
                struct hsr_tag *hdr, _hdr;

                hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen,
                                           &_hdr);
                if (!hdr) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                proto = hdr->encap_proto;
                nhoff += HSR_HLEN;
                fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                break;
        }

        case htons(ETH_P_CFM):
                fdret = __skb_flow_dissect_cfm(skb, flow_dissector,
                                               target_container, data,
                                               nhoff, hlen);
                break;

        default:
                fdret = FLOW_DISSECT_RET_OUT_BAD;
                break;
        }

        /* Process result of proto processing */
        switch (fdret) {
        case FLOW_DISSECT_RET_OUT_GOOD:
                goto out_good;
        case FLOW_DISSECT_RET_PROTO_AGAIN:
                if (skb_flow_dissect_allowed(&num_hdrs))
                        goto proto_again;
                goto out_good;
        case FLOW_DISSECT_RET_CONTINUE:
        case FLOW_DISSECT_RET_IPPROTO_AGAIN:
                break;
        case FLOW_DISSECT_RET_OUT_BAD:
        default:
                goto out_bad;
        }

ip_proto_again:
        fdret = FLOW_DISSECT_RET_CONTINUE;

        switch (ip_proto) {
        case IPPROTO_GRE:
                if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                        break;
                }

                fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector,
                                               target_container, data,
                                               &proto, &nhoff, &hlen, flags);
                break;

        case NEXTHDR_HOP:
        case NEXTHDR_ROUTING:
        case NEXTHDR_DEST: {
                u8 _opthdr[2], *opthdr;

                if (proto != htons(ETH_P_IPV6))
                        break;

                opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr),
                                              data, hlen, &_opthdr);
                if (!opthdr) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                ip_proto = opthdr[0];
                nhoff += (opthdr[1] + 1) << 3;

                fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN;
                break;
        }
        case NEXTHDR_FRAGMENT: {
                struct frag_hdr _fh, *fh;

                if (proto != htons(ETH_P_IPV6))
                        break;

                fh = __skb_header_pointer(skb, nhoff, sizeof(_fh),
                                          data, hlen, &_fh);

                if (!fh) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                key_control->flags |= FLOW_DIS_IS_FRAGMENT;

                nhoff += sizeof(_fh);
                ip_proto = fh->nexthdr;

                if (!(fh->frag_off & htons(IP6_OFFSET))) {
                        key_control->flags |= FLOW_DIS_FIRST_FRAG;
                        if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) {
                                fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN;
                                break;
                        }
                }

                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                break;
        }
        case IPPROTO_IPIP:
                if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                        break;
                }

                proto = htons(ETH_P_IP);

                key_control->flags |= FLOW_DIS_ENCAPSULATION;
                if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                        break;
                }

                fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                break;

        case IPPROTO_IPV6:
                if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                        break;
                }

                proto = htons(ETH_P_IPV6);

                key_control->flags |= FLOW_DIS_ENCAPSULATION;
                if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                        break;
                }

                fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                break;


        case IPPROTO_MPLS:
                proto = htons(ETH_P_MPLS_UC);
                fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                break;

        case IPPROTO_TCP:
                __skb_flow_dissect_tcp(skb, flow_dissector, target_container,
                                       data, nhoff, hlen);
                break;

        case IPPROTO_ICMP:
        case IPPROTO_ICMPV6:
                __skb_flow_dissect_icmp(skb, flow_dissector, target_container,
                                        data, nhoff, hlen);
                break;
        case IPPROTO_L2TP:
                __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container,
                                          data, nhoff, hlen);
                break;
        case IPPROTO_ESP:
                __skb_flow_dissect_esp(skb, flow_dissector, target_container,
                                       data, nhoff, hlen);
                break;
        case IPPROTO_AH:
                __skb_flow_dissect_ah(skb, flow_dissector, target_container,
                                      data, nhoff, hlen);
                break;
        default:
                break;
        }

        if (!(key_control->flags & FLOW_DIS_IS_FRAGMENT))
                __skb_flow_dissect_ports(skb, flow_dissector, target_container,
                                         data, nhoff, ip_proto, hlen);

        /* Process result of IP proto processing */
        switch (fdret) {
        case FLOW_DISSECT_RET_PROTO_AGAIN:
                if (skb_flow_dissect_allowed(&num_hdrs))
                        goto proto_again;
                break;
        case FLOW_DISSECT_RET_IPPROTO_AGAIN:
                if (skb_flow_dissect_allowed(&num_hdrs))
                        goto ip_proto_again;
                break;
        case FLOW_DISSECT_RET_OUT_GOOD:
        case FLOW_DISSECT_RET_CONTINUE:
                break;
        case FLOW_DISSECT_RET_OUT_BAD:
        default:
                goto out_bad;
        }

out_good:
        ret = true;

out:
        key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen);
        key_basic->n_proto = proto;
        key_basic->ip_proto = ip_proto;

        return ret;

out_bad:
        ret = false;
        goto out;
}
EXPORT_SYMBOL(__skb_flow_dissect);

/* File-scope siphash key used by the flow hash helpers in this file that are
 * not handed a key by their caller.
 */
static siphash_aligned_key_t hashrnd;
/* Seed hashrnd through net_get_random_once().  The helpers that hash with
 * &hashrnd call this first.
 * NOTE(review): presumably the random fill happens only on the first call,
 * as the helper's name suggests - confirm against net_get_random_once().
 */
static __always_inline void __flow_hash_secret_init(void)
{
        net_get_random_once(&hashrnd, sizeof(hashrnd));
}

/* Return the address inside @flow at which hashing begins.
 *
 * The build-time check requires FLOW_KEYS_HASH_OFFSET to be a multiple of
 * SIPHASH_ALIGNMENT.
 */
static const void *flow_keys_hash_start(const struct flow_keys *flow)
{
        const void *first_hashed = &flow->FLOW_KEYS_HASH_START_FIELD;

        BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT);

        return first_hashed;
}

/* Return how many bytes of @flow are fed to the hash.
 *
 * The result is sizeof(*flow) minus FLOW_KEYS_HASH_OFFSET minus the part of
 * the addrs member that the current addr_type does not use.  For an
 * addr_type that is not listed below the whole addrs member is left out.
 */
static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
{
        size_t used_addr_bytes = 0;

        BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));

        switch (flow->control.addr_type) {
        case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
                used_addr_bytes = sizeof(flow->addrs.v4addrs);
                break;
        case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
                used_addr_bytes = sizeof(flow->addrs.v6addrs);
                break;
        case FLOW_DISSECTOR_KEY_TIPC:
                used_addr_bytes = sizeof(flow->addrs.tipckey);
                break;
        default:
                break;
        }

        return sizeof(*flow) - FLOW_KEYS_HASH_OFFSET - sizeof(flow->addrs) +
               used_addr_bytes;
}

/* Reduce the source address stored in @flow to a single 32-bit value.
 *
 * IPv4 returns the address itself, IPv6 returns ipv6_addr_hash() of the
 * address, TIPC returns the TIPC key, and any other addr_type returns 0.
 */
__be32 flow_get_u32_src(const struct flow_keys *flow)
{
        if (flow->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS)
                return flow->addrs.v4addrs.src;

        if (flow->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS)
                return (__force __be32)ipv6_addr_hash(
                        &flow->addrs.v6addrs.src);

        if (flow->control.addr_type == FLOW_DISSECTOR_KEY_TIPC)
                return flow->addrs.tipckey.key;

        return 0;
}
EXPORT_SYMBOL(flow_get_u32_src);

/* Reduce the destination address stored in @flow to a single 32-bit value.
 *
 * IPv4 returns the address itself, IPv6 returns ipv6_addr_hash() of the
 * address, and any other addr_type returns 0.
 */
__be32 flow_get_u32_dst(const struct flow_keys *flow)
{
        if (flow->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS)
                return flow->addrs.v4addrs.dst;

        if (flow->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS)
                return (__force __be32)ipv6_addr_hash(
                        &flow->addrs.v6addrs.dst);

        return 0;
}
EXPORT_SYMBOL(flow_get_u32_dst);

/* Put addresses and ports into a canonical order so that both directions of
 * a flow produce the same key bytes.
 *
 * For IPv4 and IPv6 keys the address pair is swapped when dst compares lower
 * than src, and the port pair is swapped on the same condition.  Addresses
 * and ports are ordered independently of each other.  Keys of any other
 * addr_type are left untouched.
 */
static inline void __flow_hash_consistentify(struct flow_keys *keys)
{
        int word;

        switch (keys->control.addr_type) {
        case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
                if ((__force u32)keys->addrs.v4addrs.src >
                    (__force u32)keys->addrs.v4addrs.dst)
                        swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst);
                break;
        case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
                /* Byte-wise comparison; exchange the addresses word by word. */
                if (memcmp(&keys->addrs.v6addrs.dst,
                           &keys->addrs.v6addrs.src,
                           sizeof(keys->addrs.v6addrs.dst)) < 0) {
                        for (word = 0; word < 4; word++)
                                swap(keys->addrs.v6addrs.src.s6_addr32[word],
                                     keys->addrs.v6addrs.dst.s6_addr32[word]);
                }
                break;
        default:
                /* No ordering is applied for other address types. */
                return;
        }

        if ((__force u16)keys->ports.src > (__force u16)keys->ports.dst)
                swap(keys->ports.src, keys->ports.dst);
}

/* Hash @keys with siphash under @keyval.
 *
 * @keys is first rewritten into its canonical src/dst order (so it is
 * modified).  The siphash result is truncated to 32 bits, and a truncated
 * value of 0 is replaced by 1, so this function never returns 0.
 */
static inline u32 __flow_hash_from_keys(struct flow_keys *keys,
                                        const siphash_key_t *keyval)
{
        u32 digest;

        __flow_hash_consistentify(keys);

        digest = siphash(flow_keys_hash_start(keys),
                         flow_keys_hash_length(keys), keyval);

        return digest ? digest : 1;
}

u32 flow_hash_from_keys(struct flow_keys *keys)
{
        __flow_hash_secret_init();
        return __flow_hash_from_keys(keys, &hashrnd);
}
EXPORT_SYMBOL(flow_hash_from_keys);

/* Hash @keys with the caller-supplied siphash key @keyval instead of the
 * file-scope one.  @keys may be reordered by the hashing helper.
 */
u32 flow_hash_from_keys_seed(struct flow_keys *keys,
                             const siphash_key_t *keyval)
{
        u32 hash = __flow_hash_from_keys(keys, keyval);

        return hash;
}
EXPORT_SYMBOL(flow_hash_from_keys_seed);

/* Dissect @skb into @keys, stopping at the flow label, and return the hash
 * of the resulting keys under @keyval.
 */
static inline u32 ___skb_get_hash(const struct sk_buff *skb,
                                  struct flow_keys *keys,
                                  const siphash_key_t *keyval)
{
        const unsigned int dissect_flags = FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL;

        skb_flow_dissect_flow_keys(skb, keys, dissect_flags);

        return __flow_hash_from_keys(keys, keyval);
}

/* Field layout that make_flow_keys_digest() writes into the storage of a
 * struct flow_keys_digest.
 */
struct _flow_keys_digest_data {
        __be16        n_proto;        /* copied from flow->basic.n_proto */
        u8        ip_proto;        /* copied from flow->basic.ip_proto */
        u8        padding;        /* never assigned; left zero by the memset */
        __be32        ports;                /* copied from flow->ports.ports */
        __be32        src;                /* copied from flow->addrs.v4addrs.src */
        __be32        dst;                /* copied from flow->addrs.v4addrs.dst */
};

/* Fill @digest with a compact summary of @flow.
 *
 * @digest is zeroed in full, then the protocol numbers, the combined ports
 * word and the v4addrs src/dst fields are stored at the start of it using the
 * struct _flow_keys_digest_data layout.  The v4addrs fields are read whatever
 * the flow's addr_type is.
 */
void make_flow_keys_digest(struct flow_keys_digest *digest,
                           const struct flow_keys *flow)
{
        struct _flow_keys_digest_data *out;

        BUILD_BUG_ON(sizeof(*out) > sizeof(*digest));

        /* Clear first so the padding byte and any unused tail are zero. */
        memset(digest, 0, sizeof(*digest));

        out = (struct _flow_keys_digest_data *)digest;
        out->src = flow->addrs.v4addrs.src;
        out->dst = flow->addrs.v4addrs.dst;
        out->ports = flow->ports.ports;
        out->ip_proto = flow->basic.ip_proto;
        out->n_proto = flow->basic.n_proto;
}
EXPORT_SYMBOL(make_flow_keys_digest);

static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;

u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb)
{
        struct flow_keys keys;

        __flow_hash_secret_init();

        memset(&keys, 0, sizeof(keys));
        __skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric,
                           &keys, NULL, 0, 0, 0, 0);

        return __flow_hash_from_keys(&keys, &hashrnd);
}
EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net);

/**
 * __skb_get_hash_net: calculate a flow hash
 * @net: associated network namespace, derived from @skb if NULL
 * @skb: sk_buff to calculate flow hash from
 *
 * This function calculates a flow hash based on src/dst addresses
 * and src/dst port numbers.  Sets hash in skb to non-zero hash value
 * on success, zero indicates no valid hash.  Also, sets l4_hash in skb
 * if hash is a canonical 4-tuple hash over transport ports.
 */
void __skb_get_hash_net(const struct net *net, struct sk_buff *skb)
{
        struct flow_keys keys;
        u32 hash;

        memset(&keys, 0, sizeof(keys));

        __skb_flow_dissect(net, skb, &flow_keys_dissector,
                           &keys, NULL, 0, 0, 0,
                           FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);

        __flow_hash_secret_init();

        hash = __flow_hash_from_keys(&keys, &hashrnd);

        __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
}
EXPORT_SYMBOL(__skb_get_hash_net);

__u32 skb_get_hash_perturb(const struct sk_buff *skb,
                           const siphash_key_t *perturb)
{
        struct flow_keys keys;

        return ___skb_get_hash(skb, &keys, perturb);
}
EXPORT_SYMBOL(skb_get_hash_perturb);

/* Return the payload offset for already-dissected @keys.
 *
 * Starts from keys->control.thoff and adds the length of the L4 header for
 * the protocols listed below.  For TCP the length comes from the byte at
 * offset 12 of the header, but is never taken as less than
 * sizeof(struct tcphdr).  If that byte cannot be read, or the packet is a
 * fragment other than the first, thoff is returned unchanged.  Unlisted
 * protocols also return thoff unchanged.
 */
u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
                   const struct flow_keys_basic *keys, int hlen)
{
        const u32 thoff = keys->control.thoff;
        u32 l4_len = 0;

        /* fragments after the first: no L4 header is skipped */
        if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) &&
            !(keys->control.flags & FLOW_DIS_FIRST_FRAG))
                return thoff;

        switch (keys->basic.ip_proto) {
        case IPPROTO_TCP: {
                /* read doff as a single byte to avoid unaligned access */
                const u8 *doff;
                u8 _doff;

                doff = __skb_header_pointer(skb, thoff + 12, sizeof(_doff),
                                            data, hlen, &_doff);
                if (!doff)
                        return thoff;

                l4_len = max_t(u32, sizeof(struct tcphdr),
                               (*doff & 0xF0) >> 2);
                break;
        }
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
                l4_len = sizeof(struct udphdr);
                break;
        /* Header extensions of the remaining protocols are not
         * taken into account for now.
         */
        case IPPROTO_ICMP:
                l4_len = sizeof(struct icmphdr);
                break;
        case IPPROTO_ICMPV6:
                l4_len = sizeof(struct icmp6hdr);
                break;
        case IPPROTO_IGMP:
                l4_len = sizeof(struct igmphdr);
                break;
        case IPPROTO_DCCP:
                l4_len = sizeof(struct dccp_hdr);
                break;
        case IPPROTO_SCTP:
                l4_len = sizeof(struct sctphdr);
                break;
        default:
                break;
        }

        return thoff + l4_len;
}

/**
 * skb_get_poff - get the offset to the payload
 * @skb: sk_buff to get the payload offset from
 *
 * The function will get the offset to the payload as far as it could
 * be dissected.  The main user is currently BPF, so that we can dynamically
 * truncate packets without needing to push actual payload to the user
 * space and can analyze headers only, instead.
 */
u32 skb_get_poff(const struct sk_buff *skb)
{
        struct flow_keys_basic keys;

        if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
                                              NULL, 0, 0, 0, 0))
                return 0;

        return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
}

/* Build flow keys from the IPv6 flow description @fl6 and hash them.
 *
 * @keys is fully overwritten: it is zeroed, then filled with the addresses,
 * ports, GRE key, flow label and protocol taken from @fl6, with addr_type
 * set to IPv6.  The hash is computed with the file-scope key, and @keys may
 * end up with src/dst reordered by the hashing helper.
 */
__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
{
        memset(keys, 0, sizeof(*keys));

        keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
        keys->basic.ip_proto = fl6->flowi6_proto;

        keys->addrs.v6addrs.src = fl6->saddr;
        keys->addrs.v6addrs.dst = fl6->daddr;

        keys->ports.src = fl6->fl6_sport;
        keys->ports.dst = fl6->fl6_dport;

        keys->keyid.keyid = fl6->fl6_gre_key;
        keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);

        return flow_hash_from_keys(keys);
}
EXPORT_SYMBOL(__get_hash_from_flowi6);

/*
 * Key table used by init_default_flow_dissectors() to set up
 * flow_keys_dissector.  Each entry pairs a dissector key id with the
 * offset of the struct flow_keys member that holds that key's data.
 */
static const struct flow_dissector_key flow_keys_dissector_keys[] = {
        {
                .key_id = FLOW_DISSECTOR_KEY_CONTROL,
                .offset = offsetof(struct flow_keys, control),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_BASIC,
                .offset = offsetof(struct flow_keys, basic),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                .offset = offsetof(struct flow_keys, addrs.v4addrs),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                .offset = offsetof(struct flow_keys, addrs.v6addrs),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_TIPC,
                .offset = offsetof(struct flow_keys, addrs.tipckey),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_PORTS,
                .offset = offsetof(struct flow_keys, ports),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_VLAN,
                .offset = offsetof(struct flow_keys, vlan),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
                .offset = offsetof(struct flow_keys, tags),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
                .offset = offsetof(struct flow_keys, keyid),
        },
};

/*
 * Key table used by init_default_flow_dissectors() to set up
 * flow_keys_dissector_symmetric.  It lists only the control, basic,
 * IPv4/IPv6 address and port keys, each mapped to its member offset in
 * struct flow_keys.
 */
static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = {
        {
                .key_id = FLOW_DISSECTOR_KEY_CONTROL,
                .offset = offsetof(struct flow_keys, control),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_BASIC,
                .offset = offsetof(struct flow_keys, basic),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                .offset = offsetof(struct flow_keys, addrs.v4addrs),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                .offset = offsetof(struct flow_keys, addrs.v6addrs),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_PORTS,
                .offset = offsetof(struct flow_keys, ports),
        },
};

/*
 * Key table used by init_default_flow_dissectors() to set up
 * flow_keys_basic_dissector: only the control and basic keys.
 * NOTE(review): the offsets are taken from struct flow_keys, not
 * struct flow_keys_basic; presumably both structs start with the same
 * control/basic layout - confirm against their definitions.
 */
static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = {
        {
                .key_id = FLOW_DISSECTOR_KEY_CONTROL,
                .offset = offsetof(struct flow_keys, control),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_BASIC,
                .offset = offsetof(struct flow_keys, basic),
        },
};

/*
 * Exported dissector initialised from flow_keys_dissector_keys by
 * init_default_flow_dissectors().
 */
struct flow_dissector flow_keys_dissector __read_mostly;
EXPORT_SYMBOL(flow_keys_dissector);

/*
 * Exported dissector initialised from flow_keys_basic_dissector_keys
 * (control and basic keys only) by init_default_flow_dissectors().
 */
struct flow_dissector flow_keys_basic_dissector __read_mostly;
EXPORT_SYMBOL(flow_keys_basic_dissector);

/*
 * Initialise the three default flow dissectors (full, symmetric and
 * basic) from their respective key tables.  Registered through
 * core_initcall(); always returns 0.
 */
static int __init init_default_flow_dissectors(void)
{
        skb_flow_dissector_init(&flow_keys_dissector,
                                flow_keys_dissector_keys,
                                ARRAY_SIZE(flow_keys_dissector_keys));
        skb_flow_dissector_init(&flow_keys_dissector_symmetric,
                                flow_keys_dissector_symmetric_keys,
                                ARRAY_SIZE(flow_keys_dissector_symmetric_keys));
        skb_flow_dissector_init(&flow_keys_basic_dissector,
                                flow_keys_basic_dissector_keys,
                                ARRAY_SIZE(flow_keys_basic_dissector_keys));
        return 0;
}
core_initcall(init_default_flow_dissectors);
































































































    1 



























































    1 






























    1 




    1 









    1 











    1 







    1 

    1 
































    1 























    1 



    1 


























    1 

    1 


















































































    1 







    1 

















    1 








    1 






    1 













































































































































































































































































































































































    1 














    1 











































































































































































































































































































































    1 
    2 
















































































































































































































































    4 
    4 

    4 


    3 


    3 


    4 


































    2 

    2 



    2 



    2 
    2 















    2 
    2 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_rwsem        (while writing or truncating, not reading or faulting)
 *   mm->mmap_lock
 *     mapping->invalidate_lock (in filemap_fault)
 *       folio_lock
 *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
 *           vma_start_write
 *             mapping->i_mmap_rwsem
 *               anon_vma->rwsem
 *                 mm->page_table_lock or pte_lock
 *                   swap_lock (in swap_duplicate, swap_info_get)
 *                     mmlist_lock (in mmput, drain_mmlist and others)
 *                     mapping->private_lock (in block_dirty_folio)
 *                       folio_lock_memcg move_lock (in block_dirty_folio)
 *                         i_pages lock (widely used)
 *                           lruvec->lru_lock (in folio_lruvec_lock_irq)
 *                     inode->i_lock (in set_page_dirty's __mark_inode_dirty)
 *                     bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
 *                       sb_lock (within inode_lock in fs/fs-writeback.c)
 *                       i_pages lock (widely used, in set_page_dirty,
 *                                 in arch-dependent flush_dcache_mmap_lock,
 *                                 within bdi.wb->list_lock in __sync_single_inode)
 *
 * anon_vma->rwsem, mapping->i_mmap_rwsem  (memory_failure, collect_procs_anon)
 *   ->tasklist_lock
 *     pte map lock
 *
 * hugetlbfs PageHuge() pages take locks in this order:
 *   hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
 *     vma_lock (hugetlb specific lock for pmd_sharing)
 *       mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
 *         folio_lock
 */

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/mm_inline.h>

#include <asm/tlbflush.h>

#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>
#include <trace/events/migrate.h>

#include "internal.h"

/* Slab cache that anon_vma_alloc()/anon_vma_free() allocate from and free to. */
static struct kmem_cache *anon_vma_cachep;
/* Slab cache backing anon_vma_chain_alloc()/anon_vma_chain_free(). */
static struct kmem_cache *anon_vma_chain_cachep;

static inline struct anon_vma *anon_vma_alloc(void)
{
        struct anon_vma *anon_vma;

        anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
        if (anon_vma) {
                atomic_set(&anon_vma->refcount, 1);
                anon_vma->num_children = 0;
                anon_vma->num_active_vmas = 0;
                anon_vma->parent = anon_vma;
                /*
                 * Initialise the anon_vma root to point to itself. If called
                 * from fork, the root will be reset to the parents anon_vma.
                 */
                anon_vma->root = anon_vma;
        }

        return anon_vma;
}

/*
 * Return an anon_vma whose refcount has already reached zero to
 * anon_vma_cachep.  May sleep: if the root rwsem is observed locked, the
 * anon_vma write lock is taken and released once before the object is
 * freed.
 */
static inline void anon_vma_free(struct anon_vma *anon_vma)
{
        /* Freeing with references still outstanding is a bug. */
        VM_BUG_ON(atomic_read(&anon_vma->refcount));

        /*
         * Synchronize against folio_lock_anon_vma_read() such that
         * we can safely hold the lock without the anon_vma getting
         * freed.
         *
         * Relies on the full mb implied by the atomic_dec_and_test() from
         * put_anon_vma() against the acquire barrier implied by
         * down_read_trylock() from folio_lock_anon_vma_read(). This orders:
         *
         * folio_lock_anon_vma_read()        VS        put_anon_vma()
         *   down_read_trylock()                  atomic_dec_and_test()
         *   LOCK                                  MB
         *   atomic_read()                          rwsem_is_locked()
         *
         * LOCK should suffice since the actual taking of the lock must
         * happen _before_ what follows.
         */
        might_sleep();
        if (rwsem_is_locked(&anon_vma->root->rwsem)) {
                /* Lock/unlock pair only waits for the current holder to finish. */
                anon_vma_lock_write(anon_vma);
                anon_vma_unlock_write(anon_vma);
        }

        kmem_cache_free(anon_vma_cachep, anon_vma);
}

/*
 * Allocate an anon_vma_chain from anon_vma_chain_cachep using the caller's
 * @gfp flags.  Returns whatever kmem_cache_alloc() returns; callers in this
 * file treat NULL as allocation failure.
 */
static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
        return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
}

/* Give @anon_vma_chain back to anon_vma_chain_cachep. */
static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
        kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}

/*
 * Tie @vma and @anon_vma together through @avc: record both ends in the
 * chain entry, put it on @vma's anon_vma_chain list and insert it into
 * @anon_vma's interval tree.
 * NOTE(review): callers in this file hold the anon_vma (root) write lock
 * around this call - presumably required for the tree insert; confirm.
 */
static void anon_vma_chain_link(struct vm_area_struct *vma,
                                struct anon_vma_chain *avc,
                                struct anon_vma *anon_vma)
{
        avc->vma = vma;
        avc->anon_vma = anon_vma;
        list_add(&avc->same_vma, &vma->anon_vma_chain);
        anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}

/**
 * __anon_vma_prepare - attach an anon_vma to a memory region
 * @vma: the memory region in question
 *
 * This makes sure the memory mapping described by 'vma' has
 * an 'anon_vma' attached to it, so that we can associate the
 * anonymous pages mapped into it with that anon_vma.
 *
 * The common case will be that we already have one, which
 * is handled inline by anon_vma_prepare(). But if
 * not we either need to find an adjacent mapping that we
 * can re-use the anon_vma from (very common when the only
 * reason for splitting a vma has been mprotect()), or we
 * allocate a new one.
 *
 * Anon-vma allocations are very subtle, because we may have
 * optimistically looked up an anon_vma in folio_lock_anon_vma_read()
 * and that may actually touch the rwsem even in the newly
 * allocated vma (it depends on RCU to make sure that the
 * anon_vma isn't actually destroyed).
 *
 * As a result, we need to do proper anon_vma locking even
 * for the new allocation. At the same time, we do not want
 * to do any locking for the common case of already having
 * an anon_vma.
 *
 * Return: 0 on success (including the case where @vma turned out to
 * already have an anon_vma once the locks were taken), -ENOMEM if the
 * anon_vma_chain or a new anon_vma could not be allocated.
 */
int __anon_vma_prepare(struct vm_area_struct *vma)
{
        struct mm_struct *mm = vma->vm_mm;
        struct anon_vma *anon_vma, *allocated;
        struct anon_vma_chain *avc;

        mmap_assert_locked(mm);
        might_sleep();

        /* Allocate the chain entry up front, before any lock is taken. */
        avc = anon_vma_chain_alloc(GFP_KERNEL);
        if (!avc)
                goto out_enomem;

        /* Prefer re-using a mergeable anon_vma; otherwise allocate a new one. */
        anon_vma = find_mergeable_anon_vma(vma);
        allocated = NULL;
        if (!anon_vma) {
                anon_vma = anon_vma_alloc();
                if (unlikely(!anon_vma))
                        goto out_enomem_free_avc;
                anon_vma->num_children++; /* self-parent link for new root */
                /* Remember it so it can be dropped if we end up not using it. */
                allocated = anon_vma;
        }

        anon_vma_lock_write(anon_vma);
        /* page_table_lock to protect against threads */
        spin_lock(&mm->page_table_lock);
        if (likely(!vma->anon_vma)) {
                vma->anon_vma = anon_vma;
                anon_vma_chain_link(vma, avc, anon_vma);
                anon_vma->num_active_vmas++;
                /* Both objects are now in use; nothing left to clean up. */
                allocated = NULL;
                avc = NULL;
        }
        spin_unlock(&mm->page_table_lock);
        anon_vma_unlock_write(anon_vma);

        /* vma->anon_vma was already set: release what we did not use. */
        if (unlikely(allocated))
                put_anon_vma(allocated);
        if (unlikely(avc))
                anon_vma_chain_free(avc);

        return 0;

 out_enomem_free_avc:
        anon_vma_chain_free(avc);
 out_enomem:
        return -ENOMEM;
}

/*
 * Helper for walking a vma->anon_vma_chain while holding the write lock of
 * the root that the current anon_vma belongs to.
 *
 * @root is the root currently locked by the caller (NULL if none yet) and
 * @anon_vma is the next anon_vma on the chain.  If @anon_vma->root is
 * already the locked root nothing happens; otherwise the new root's rwsem
 * is write-locked.  All anon_vmas chained to one vma are expected to share
 * a root, so you'd expect a single down_write for the whole traversal;
 * switching away from a non-NULL root triggers a one-time warning before
 * the old root is unlocked.
 *
 * Returns the root that is locked on exit; pass it to the next call and
 * finally to unlock_anon_vma_root().
 */
static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
{
        struct anon_vma *wanted = anon_vma->root;

        /* Common case: the right root is already held. */
        if (wanted == root)
                return root;

        /* Unexpected root change: warn once and release the old one. */
        if (WARN_ON_ONCE(root))
                up_write(&root->rwsem);

        down_write(&wanted->rwsem);
        return wanted;
}

/*
 * Release the write lock on @root's rwsem, as taken by
 * lock_anon_vma_root().  A NULL @root (nothing locked) is a no-op.
 */
static inline void unlock_anon_vma_root(struct anon_vma *root)
{
        if (!root)
                return;

        up_write(&root->rwsem);
}

/*
 * Attach the anon_vmas from src to dst.
 * Returns 0 on success, -ENOMEM on failure.
 *
 * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
 * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
 * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
 * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
 * call, we can identify this case by checking (!dst->anon_vma &&
 * src->anon_vma).
 *
 * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
 * and reuse existing anon_vma which has no vmas and only one child anon_vma.
 * This prevents degradation of anon_vma hierarchy to endless linear chain in
 * case of constantly forking task. On the other hand, an anon_vma with more
 * than one child isn't reused even if there was no alive vma, thus rmap
 * walker has a good chance of avoiding scanning the whole hierarchy when it
 * searches where page is mapped.
 *
 * On failure everything linked to dst so far is unlinked again and
 * dst->anon_vma is left NULL.
 */
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
        struct anon_vma_chain *avc, *pavc;
        struct anon_vma *root = NULL;

        /* Walk src's chain in reverse list order, mirroring each link on dst. */
        list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
                struct anon_vma *anon_vma;

                /*
                 * Try a non-waiting allocation first.  If it fails, drop
                 * the root lock (if held) before retrying with the
                 * potentially blocking GFP_KERNEL; the lock is re-taken
                 * by lock_anon_vma_root() below.
                 */
                avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
                if (unlikely(!avc)) {
                        unlock_anon_vma_root(root);
                        root = NULL;
                        avc = anon_vma_chain_alloc(GFP_KERNEL);
                        if (!avc)
                                goto enomem_failure;
                }
                anon_vma = pavc->anon_vma;
                root = lock_anon_vma_root(root, anon_vma);
                anon_vma_chain_link(dst, avc, anon_vma);

                /*
                 * Reuse existing anon_vma if it has no vma and only one
                 * anon_vma child.
                 *
                 * Root anon_vma is never reused:
                 * it has self-parent reference and at least one child.
                 */
                if (!dst->anon_vma && src->anon_vma &&
                    anon_vma->num_children < 2 &&
                    anon_vma->num_active_vmas == 0)
                        dst->anon_vma = anon_vma;
        }
        /* Account dst as an active vma of whichever anon_vma it ended up with. */
        if (dst->anon_vma)
                dst->anon_vma->num_active_vmas++;
        unlock_anon_vma_root(root);
        return 0;

 enomem_failure:
        /*
         * dst->anon_vma is dropped here otherwise its num_active_vmas can
         * be incorrectly decremented in unlink_anon_vmas().
         * We can safely do this because callers of anon_vma_clone() don't care
         * about dst->anon_vma if anon_vma_clone() failed.
         *
         * No unlock is needed on this path: the only jump here happens
         * right after the root lock was released and root reset to NULL.
         */
        dst->anon_vma = NULL;
        unlink_anon_vmas(dst);
        return -ENOMEM;
}

/*
 * Attach vma to its own anon_vma, as well as to the anon_vmas that
 * the corresponding VMA in the parent process is attached to.
 * Returns 0 on success, non-zero on failure: the error from
 * anon_vma_clone(), or -ENOMEM if the child's own anon_vma or its
 * chain entry cannot be allocated (in which case vma is unlinked from
 * all anon_vmas again).
 */
int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
        struct anon_vma_chain *avc;
        struct anon_vma *anon_vma;
        int error;

        /* Don't bother if the parent process has no anon_vma here. */
        if (!pvma->anon_vma)
                return 0;

        /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
        vma->anon_vma = NULL;

        /*
         * First, attach the new VMA to the parent VMA's anon_vmas,
         * so rmap can find non-COWed pages in child processes.
         */
        error = anon_vma_clone(vma, pvma);
        if (error)
                return error;

        /* An existing anon_vma has been reused, all done then. */
        if (vma->anon_vma)
                return 0;

        /* Then add our own anon_vma. */
        anon_vma = anon_vma_alloc();
        if (!anon_vma)
                goto out_error;
        /* vma will be the first active vma of the new anon_vma. */
        anon_vma->num_active_vmas++;
        avc = anon_vma_chain_alloc(GFP_KERNEL);
        if (!avc)
                goto out_error_free_anon_vma;

        /*
         * The root anon_vma's rwsem is the lock actually used when we
         * lock any of the anon_vmas in this anon_vma tree.
         */
        anon_vma->root = pvma->anon_vma->root;
        anon_vma->parent = pvma->anon_vma;
        /*
         * With refcounts, an anon_vma can stay around longer than the
         * process it belongs to. The root anon_vma needs to be pinned until
         * this anon_vma is freed, because the lock lives in the root.
         */
        get_anon_vma(anon_vma->root);
        /* Mark this anon_vma as the one where our new (COWed) pages go. */
        vma->anon_vma = anon_vma;
        anon_vma_lock_write(anon_vma);
        anon_vma_chain_link(vma, avc, anon_vma);
        /* Account the new anon_vma as a child of the parent's anon_vma. */
        anon_vma->parent->num_children++;
        anon_vma_unlock_write(anon_vma);

        return 0;

 out_error_free_anon_vma:
        put_anon_vma(anon_vma);
 out_error:
        /* Undo the links to the parent's anon_vmas made by anon_vma_clone(). */
        unlink_anon_vmas(vma);
        return -ENOMEM;
}

/*
 * unlink_anon_vmas - detach @vma from every anon_vma on its anon_vma_chain.
 *
 * Pass 1 (under the anon_vma root lock): remove each chain entry from its
 * anon_vma's interval tree. Entries whose anon_vma still has other tree
 * members are taken off the VMA's list and freed right away; entries whose
 * anon_vma became empty stay on the list after the parent's num_children is
 * decremented. If vma->anon_vma is set, its num_active_vmas is decremented
 * and the pointer is cleared.
 *
 * Pass 2 (lock dropped): for each entry left on the list, put the now-empty
 * anon_vma and free the chain entry.
 */
void unlink_anon_vmas(struct vm_area_struct *vma)
{
        struct anon_vma_chain *avc, *next;
        struct anon_vma *root = NULL;

        /*
         * Unlink each anon_vma chained to the VMA.  This list is ordered
         * from newest to oldest, ensuring the root anon_vma gets freed last.
         */
        list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
                struct anon_vma *anon_vma = avc->anon_vma;

                /* Returns the root whose lock is now held; reused across iterations. */
                root = lock_anon_vma_root(root, anon_vma);
                anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);

                /*
                 * Leave empty anon_vmas on the list - we'll need
                 * to free them outside the lock.
                 */
                if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
                        anon_vma->parent->num_children--;
                        continue;
                }

                list_del(&avc->same_vma);
                anon_vma_chain_free(avc);
        }
        if (vma->anon_vma) {
                vma->anon_vma->num_active_vmas--;

                /*
                 * vma would still be needed after unlink, and anon_vma will be prepared
                 * when handle fault.
                 */
                vma->anon_vma = NULL;
        }
        unlock_anon_vma_root(root);

        /*
         * Iterate the list once more, it now only contains empty and unlinked
         * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
         * needing to write-acquire the anon_vma->root->rwsem.
         */
        list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
                struct anon_vma *anon_vma = avc->anon_vma;

                /* An empty anon_vma is expected to have no children or active VMAs left. */
                VM_WARN_ON(anon_vma->num_children);
                VM_WARN_ON(anon_vma->num_active_vmas);
                put_anon_vma(anon_vma);

                list_del(&avc->same_vma);
                anon_vma_chain_free(avc);
        }
}

/*
 * Constructor handed to kmem_cache_create() for the anon_vma cache:
 * initialises the rwsem, sets the refcount to zero and resets rb_root to
 * RB_ROOT_CACHED.
 */
static void anon_vma_ctor(void *data)
{
        struct anon_vma *av = data;

        init_rwsem(&av->rwsem);
        atomic_set(&av->refcount, 0);
        av->rb_root = RB_ROOT_CACHED;
}

/*
 * Init-time creation of the two slab caches used for anon_vma bookkeeping:
 * one for struct anon_vma (constructed by anon_vma_ctor()) and one for
 * struct anon_vma_chain.
 */
void __init anon_vma_init(void)
{
        anon_vma_cachep =
                kmem_cache_create("anon_vma", sizeof(struct anon_vma), 0,
                                  SLAB_TYPESAFE_BY_RCU | SLAB_PANIC | SLAB_ACCOUNT,
                                  anon_vma_ctor);

        anon_vma_chain_cachep =
                KMEM_CACHE(anon_vma_chain, SLAB_PANIC | SLAB_ACCOUNT);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
 *
 * Since there is no serialization whatsoever against folio_remove_rmap_*()
 * the best this function can do is return a refcount increased anon_vma
 * that might have been relevant to this page.
 *
 * The page might have been remapped to a different anon_vma or the anon_vma
 * returned may already be freed (and even reused).
 *
 * In case it was remapped to a different anon_vma, the new anon_vma will be a
 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
 * ensure that any anon_vma obtained from the page will still be valid for as
 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
 *
 * All users of this function must be very careful when walking the anon_vma
 * chain and verify that the page in question is indeed mapped in it
 * [ something equivalent to page_mapped_in_vma() ].
 *
 * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
 * folio_remove_rmap_*() that the anon_vma pointer from page->mapping is valid
 * if there is a mapcount, we can dereference the anon_vma after observing
 * those.
 *
 * NOTE: the caller should normally hold folio lock when calling this.  If
 * not, the caller needs to double check the anon_vma didn't change after
 * taking the anon_vma lock for either read or write (UFFDIO_MOVE can modify it
 * concurrently without folio lock protection). See folio_lock_anon_vma_read()
 * which has already covered that, and comment above remap_pages().
 */
struct anon_vma *folio_get_anon_vma(struct folio *folio)
{
        struct anon_vma *pinned = NULL;
        unsigned long mapping;

        rcu_read_lock();

        /* Sample ->mapping once; only a mapped anonymous folio qualifies. */
        mapping = (unsigned long)READ_ONCE(folio->mapping);
        if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON ||
            !folio_mapped(folio))
                goto unlock;

        /* Strip the type tag to recover the anon_vma pointer, then pin it. */
        pinned = (struct anon_vma *)(mapping - PAGE_MAPPING_ANON);
        if (!atomic_inc_not_zero(&pinned->refcount)) {
                pinned = NULL;
                goto unlock;
        }

        /*
         * A folio that is still mapped keeps its anon_vma alive.  Once it
         * has been unmapped nothing stops the anon_vma from being freed and
         * reused (as another anon_vma: SLAB_TYPESAFE_BY_RCU guarantees
         * that, so the atomic_inc_not_zero() above cannot corrupt), so the
         * reference just taken must be handed back.
         */
        if (folio_mapped(folio))
                goto unlock;

        rcu_read_unlock();
        put_anon_vma(pinned);
        return NULL;

unlock:
        rcu_read_unlock();
        return pinned;
}

/*
 * Similar to folio_get_anon_vma() except it locks the anon_vma.
 *
 * It's a little more complex as it tries to keep the fast path to a single
 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
 * reference like with folio_get_anon_vma() and then block on the rwsem
 * in the !rwc->try_lock case.
 *
 * Returns the folio's anon_vma with its lock held for read, or NULL if the
 * folio is not a mapped anonymous folio, the anon_vma could not be pinned,
 * or @rwc requested try_lock and the lock was contended (in which case
 * rwc->contended is set).
 */
struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
                                          struct rmap_walk_control *rwc)
{
        struct anon_vma *anon_vma = NULL;
        struct anon_vma *root_anon_vma;
        unsigned long anon_mapping;

retry:
        rcu_read_lock();
        anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
        if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
                goto out;
        if (!folio_mapped(folio))
                goto out;

        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
        root_anon_vma = READ_ONCE(anon_vma->root);
        /* Fast path: take the root's rwsem for read without sleeping. */
        if (down_read_trylock(&root_anon_vma->rwsem)) {
                /*
                 * folio_move_anon_rmap() might have changed the anon_vma as we
                 * might not hold the folio lock here.
                 */
                if (unlikely((unsigned long)READ_ONCE(folio->mapping) !=
                             anon_mapping)) {
                        up_read(&root_anon_vma->rwsem);
                        rcu_read_unlock();
                        goto retry;
                }

                /*
                 * If the folio is still mapped, then this anon_vma is still
                 * its anon_vma, and holding the mutex ensures that it will
                 * not go away, see anon_vma_free().
                 */
                if (!folio_mapped(folio)) {
                        up_read(&root_anon_vma->rwsem);
                        anon_vma = NULL;
                }
                goto out;
        }

        /* Caller asked not to block: flag the contention and give up. */
        if (rwc && rwc->try_lock) {
                anon_vma = NULL;
                rwc->contended = true;
                goto out;
        }

        /* trylock failed, we got to sleep */
        if (!atomic_inc_not_zero(&anon_vma->refcount)) {
                anon_vma = NULL;
                goto out;
        }

        /* Unmapped after pinning: hand the reference back and bail. */
        if (!folio_mapped(folio)) {
                rcu_read_unlock();
                put_anon_vma(anon_vma);
                return NULL;
        }

        /* we pinned the anon_vma, its safe to sleep */
        rcu_read_unlock();
        anon_vma_lock_read(anon_vma);

        /*
         * folio_move_anon_rmap() might have changed the anon_vma as we might
         * not hold the folio lock here.
         */
        if (unlikely((unsigned long)READ_ONCE(folio->mapping) !=
                     anon_mapping)) {
                anon_vma_unlock_read(anon_vma);
                put_anon_vma(anon_vma);
                anon_vma = NULL;
                goto retry;
        }

        /* The lock is held now, so the temporary reference is dropped. */
        if (atomic_dec_and_test(&anon_vma->refcount)) {
                /*
                 * Oops, we held the last refcount, release the lock
                 * and bail -- can't simply use put_anon_vma() because
                 * we'll deadlock on the anon_vma_lock_write() recursion.
                 */
                anon_vma_unlock_read(anon_vma);
                __put_anon_vma(anon_vma);
                anon_vma = NULL;
        }

        return anon_vma;

out:
        rcu_read_unlock();
        return anon_vma;
}

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
 * Perform the TLB flush that the current task has batched up for recently
 * unmapped pages, so that remote CPUs drop their stale entries. If a PTE was
 * dirty when it was unmapped, this must happen before any IO is initiated on
 * the page to prevent lost writes; it must also happen before the page is
 * freed to prevent data leakage.
 */
void try_to_unmap_flush(void)
{
        struct tlbflush_unmap_batch *ubc = &current->tlb_ubc;

        if (ubc->flush_required) {
                arch_tlbbatch_flush(&ubc->arch);
                ubc->flush_required = false;
                ubc->writable = false;
        }
}

/*
 * Flush only if the current batch may contain writable TLB entries, i.e.
 * entries that could race with IO.
 */
void try_to_unmap_flush_dirty(void)
{
        if (!current->tlb_ubc.writable)
                return;

        try_to_unmap_flush();
}

/*
 * mm->tlb_flush_batched packs two generation counters into one atomic int:
 * bits 0-14 record pending generations and bits 16-30 record flushed
 * generations.
 */
#define TLB_FLUSH_BATCH_FLUSHED_SHIFT        16
/* Mask selecting the pending counter in the low bits (0-14). */
#define TLB_FLUSH_BATCH_PENDING_MASK                        \
        ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
/*
 * Half of the pending range; once `pending' exceeds this,
 * set_tlb_ubc_flush_pending() resets the counters to avoid overflow.
 */
#define TLB_FLUSH_BATCH_PENDING_LARGE                        \
        (TLB_FLUSH_BATCH_PENDING_MASK / 2)

/*
 * Record a deferred TLB flush for @uaddr in @mm in the current task's unmap
 * batch and advance the `pending' generation in mm->tlb_flush_batched.
 *
 * @pteval is the PTE value that was mapped at @uaddr. Nothing is recorded if
 * pte_accessible() says it was not accessible. If it was dirty, the batch is
 * additionally marked writable.
 */
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
                                      unsigned long uaddr)
{
        struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
        int batch;
        bool writable = pte_dirty(pteval);

        if (!pte_accessible(mm, pteval))
                return;

        arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr);
        tlb_ubc->flush_required = true;

        /*
         * Ensure compiler does not re-order the setting of tlb_flush_batched
         * before the PTE is cleared.
         */
        barrier();
        batch = atomic_read(&mm->tlb_flush_batched);
retry:
        if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
                /*
                 * Prevent `pending' from catching up with `flushed' because of
                 * overflow.  Reset `pending' and `flushed' to be 1 and 0 if
                 * `pending' becomes large.
                 */
                /* On failure @batch holds the value just observed; re-evaluate with it. */
                if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1))
                        goto retry;
        } else {
                atomic_inc(&mm->tlb_flush_batched);
        }

        /*
         * If the PTE was dirty then it's best to assume it's writable. The
         * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
         * before the page is queued for IO.
         */
        if (writable)
                tlb_ubc->writable = true;
}

/*
 * Decide whether the TLB flush may be postponed to the end of a batch of
 * unmap operations (to reduce IPIs): the caller must have passed
 * TTU_BATCH_FLUSH and the architecture must agree for this @mm.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
        return (flags & TTU_BATCH_FLUSH) && arch_tlbbatch_should_defer(mm);
}

/*
 * Reclaim unmaps pages under the PTL but does not flush the TLB before
 * releasing the PTL when TLB flushes are batched. A parallel operation such
 * as mprotect or munmap can therefore race between reclaim unmapping the
 * page and flushing it, which potentially allows access to data via a stale
 * TLB entry. Tracking every mm with TLB batching in flight would be
 * expensive during reclaim, so instead we track whether batching occurred in
 * the past and, if so, flush here when required. This costs one additional
 * flush per reclaim cycle, paid by the first operation at risk such as
 * mprotect and munmap.
 *
 * This must be called under the PTL so that an access to tlb_flush_batched
 * that is potentially a "reclaim vs mprotect/munmap/etc" race synchronises
 * via the PTL.
 */
void flush_tlb_batched_pending(struct mm_struct *mm)
{
        int snapshot = atomic_read(&mm->tlb_flush_batched);
        int pending = snapshot & TLB_FLUSH_BATCH_PENDING_MASK;
        int flushed = snapshot >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
        int caught_up;

        /* Nothing outstanding: flushed generation already matches pending. */
        if (pending == flushed)
                return;

        arch_flush_tlb_batched_pending(mm);

        /*
         * Publish flushed == pending, but only if no new flush became
         * pending meanwhile; otherwise leave mm->tlb_flush_batched as is
         * so that the newer request is not lost.
         */
        caught_up = pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT);
        atomic_cmpxchg(&mm->tlb_flush_batched, snapshot, caught_up);
}
#else
/*
 * Stub used when CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is not set:
 * there is no batch to record into, so this does nothing.
 */
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
                                      unsigned long uaddr)
{
}

/*
 * Stub used when CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is not set:
 * TLB flushes are never deferred.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
        return false;
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

/*
 * Compute the user virtual address at which @page is expected inside @vma.
 * The caller should check that the page is actually part of the vma.
 *
 * Returns -EFAULT when @vma cannot be mapping the page's folio: for an
 * anonymous folio when the anon_vma roots differ (or either anon_vma is
 * missing), for any other folio when @vma has no file or its file's mapping
 * differs from the folio's mapping.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
        struct folio *folio = page_folio(page);
        pgoff_t index;

        if (!folio_test_anon(folio)) {
                if (!vma->vm_file ||
                    vma->vm_file->f_mapping != folio->mapping)
                        return -EFAULT;
        } else {
                struct anon_vma *folio_av = folio_anon_vma(folio);

                /*
                 * Note: swapoff's unuse_vma() is more efficient with this
                 * check, and needs it to match anon_vma when KSM is active.
                 * A NULL folio_av is how KSM folios get rejected here.
                 */
                if (!vma->anon_vma || !folio_av ||
                    vma->anon_vma->root != folio_av->root)
                        return -EFAULT;
        }

        index = folio->index + folio_page_idx(folio, page);
        return vma_address(vma, index, 1);
}

/*
 * Walk @mm's page tables down to the PMD level for @address and return the
 * pmd_t* where that address would be mapped from, or NULL if any upper level
 * (pgd, p4d, pud) is not present.  No guarantees / checks are made about
 * what the returned pmd_t* represents.
 */
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
        pgd_t *pgd = pgd_offset(mm, address);
        p4d_t *p4d;
        pud_t *pud;

        if (!pgd_present(*pgd))
                return NULL;

        p4d = p4d_offset(pgd, address);
        if (!p4d_present(*p4d))
                return NULL;

        pud = pud_offset(p4d, address);
        if (!pud_present(*pud))
                return NULL;

        return pmd_offset(pud, address);
}

/* State shared by the callbacks of one folio_referenced() rmap walk. */
struct folio_referenced_arg {
        /*
         * Mappings not yet visited: seeded from folio_mapcount() and
         * decremented for each mapping walked; the walk stops at zero.
         */
        int mapcount;
        /* Number of VMAs in which the folio was found referenced. */
        int referenced;
        /*
         * Accumulated vm_flags of the referencing VMAs (with VM_LOCKED
         * masked out), plus VM_LOCKED when a missed mlock was restored.
         */
        unsigned long vm_flags;
        /* If non-NULL, VMAs whose mm does not match this memcg are skipped. */
        struct mem_cgroup *memcg;
};

/*
 * rmap_one callback of folio_referenced(): test and clear the young
 * (accessed) state of every mapping of @folio found in @vma, starting the
 * walk at @address.
 *
 * @arg points to the struct folio_referenced_arg of the walk; its mapcount
 * is decremented per mapping visited, and referenced/vm_flags are updated
 * when a reference is found.
 *
 * Returns false to stop the rmap walk (a VM_LOCKED mapping had its mlock
 * restored, or all mappings have been accounted for), true to continue.
 */
static bool folio_referenced_one(struct folio *folio,
                struct vm_area_struct *vma, unsigned long address, void *arg)
{
        struct folio_referenced_arg *pra = arg;
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        int referenced = 0;
        /* @ptes counts PTE mappings of a large folio seen in a VM_LOCKED vma. */
        unsigned long start = address, ptes = 0;

        while (page_vma_mapped_walk(&pvmw)) {
                address = pvmw.address;

                if (vma->vm_flags & VM_LOCKED) {
                        if (!folio_test_large(folio) || !pvmw.pte) {
                                /* Restore the mlock which got missed */
                                mlock_vma_folio(folio, vma);
                                page_vma_mapped_walk_done(&pvmw);
                                pra->vm_flags |= VM_LOCKED;
                                return false; /* To break the loop */
                        }
                        /*
                         * For large folio fully mapped to VMA, will
                         * be handled after the pvmw loop.
                         *
                         * For large folio cross VMA boundaries, it's
                         * expected to be picked  by page reclaim. But
                         * should skip reference of pages which are in
                         * the range of VM_LOCKED vma. As page reclaim
                         * should just count the reference of pages out
                         * the range of VM_LOCKED vma.
                         */
                        ptes++;
                        pra->mapcount--;
                        continue;
                }

                if (pvmw.pte) {
                        /* PTE mapping: with lru_gen, a young PTE also triggers look-around. */
                        if (lru_gen_enabled() &&
                            pte_young(ptep_get(pvmw.pte))) {
                                lru_gen_look_around(&pvmw);
                                referenced++;
                        }

                        if (ptep_clear_flush_young_notify(vma, address,
                                                pvmw.pte))
                                referenced++;
                } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
                        /* No PTE: the folio is mapped by a PMD here. */
                        if (pmdp_clear_flush_young_notify(vma, address,
                                                pvmw.pmd))
                                referenced++;
                } else {
                        /* unexpected pmd-mapped folio? */
                        WARN_ON_ONCE(1);
                }

                pra->mapcount--;
        }

        if ((vma->vm_flags & VM_LOCKED) &&
                        folio_test_large(folio) &&
                        folio_within_vma(folio, vma)) {
                unsigned long s_align, e_align;

                /* PMD-aligned addresses of the folio's first and last byte. */
                s_align = ALIGN_DOWN(start, PMD_SIZE);
                e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE);

                /* folio doesn't cross page table boundary and fully mapped */
                if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) {
                        /* Restore the mlock which got missed */
                        mlock_vma_folio(folio, vma);
                        pra->vm_flags |= VM_LOCKED;
                        return false; /* To break the loop */
                }
        }

        if (referenced)
                folio_clear_idle(folio);
        /* The folio's own young flag counts as a reference too. */
        if (folio_test_clear_young(folio))
                referenced++;

        if (referenced) {
                /* Counted once per VMA, however many mappings were young. */
                pra->referenced++;
                pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
        }

        if (!pra->mapcount)
                return false; /* To break the loop */

        return true;
}

/*
 * invalid_vma callback of folio_referenced(): returns true for VMAs whose
 * references must not be counted, false for VMAs that should be walked.
 */
static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
{
        struct folio_referenced_arg *pra = arg;

        /*
         * A mapping without recency contributes nothing.  If the folio has
         * been used through another mapping we will catch it there; if that
         * other mapping is already gone, the unmap path will have set the
         * referenced flag or activated the folio in zap_pte_range().
         */
        if (!vma_has_recency(vma))
                return true;

        /*
         * When reclaiming on behalf of a cgroup, references made from other
         * cgroups are skipped.
         */
        return pra->memcg && !mm_match_cgroup(vma->vm_mm, pra->memcg);
}

/**
 * folio_referenced() - Test if the folio was referenced.
 * @folio: The folio to test.
 * @is_locked: Caller holds lock on the folio.
 * @memcg: target memory cgroup
 * @vm_flags: A combination of all the vma->vm_flags which referenced the folio.
 *
 * Quick test_and_clear_referenced for all mappings of a folio.
 * For a file or KSM folio that the caller has not locked, the folio lock is
 * taken with a trylock for the duration of the walk; if that trylock fails
 * the function returns 1 without walking.
 *
 * Return: The number of mappings which referenced the folio. Return -1 if
 * the function bailed out due to rmap lock contention.
 */
int folio_referenced(struct folio *folio, int is_locked,
                     struct mem_cgroup *memcg, unsigned long *vm_flags)
{
        struct folio_referenced_arg pra = {
                .memcg = memcg,
                .mapcount = folio_mapcount(folio),
        };
        struct rmap_walk_control rwc = {
                .arg = &pra,
                .rmap_one = folio_referenced_one,
                .invalid_vma = invalid_folio_referenced_vma,
                .anon_lock = folio_lock_anon_vma_read,
                .try_lock = true,
        };
        bool we_locked = false;

        *vm_flags = 0;

        /* Unmapped, or no rmap to walk: nothing can have referenced it. */
        if (!pra.mapcount || !folio_raw_mapping(folio))
                return 0;

        if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) {
                if (!folio_trylock(folio))
                        return 1;
                we_locked = true;
        }

        rmap_walk(folio, &rwc);
        *vm_flags = pra.vm_flags;

        if (we_locked)
                folio_unlock(folio);

        if (rwc.contended)
                return -1;

        return pra.referenced;
}

/*
 * Write-protect and clean every dirty or writable PTE (or PMD) found by the
 * page_vma_mapped_walk() described by @pvmw.  The whole walk is bracketed by
 * an MMU_NOTIFY_PROTECTION_PAGE mmu-notifier invalidation covering
 * [pvmw->address, vma_address_end(pvmw)).
 *
 * Returns the number of page table entries that were changed.
 */
static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
{
        int cleaned = 0;
        struct vm_area_struct *vma = pvmw->vma;
        struct mmu_notifier_range range;
        unsigned long address = pvmw->address;

        /*
         * We have to assume the worse case ie pmd for invalidation. Note that
         * the folio can not be freed from this function.
         */
        mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
                                vma->vm_mm, address, vma_address_end(pvmw));
        mmu_notifier_invalidate_range_start(&range);

        while (page_vma_mapped_walk(pvmw)) {
                int ret = 0;

                address = pvmw->address;
                if (pvmw->pte) {
                        pte_t *pte = pvmw->pte;
                        pte_t entry = ptep_get(pte);

                        /* Already clean and read-only: nothing to do. */
                        if (!pte_dirty(entry) && !pte_write(entry))
                                continue;

                        /* Clear and flush first, then install the clean, read-only entry. */
                        flush_cache_page(vma, address, pte_pfn(entry));
                        entry = ptep_clear_flush(vma, address, pte);
                        entry = pte_wrprotect(entry);
                        entry = pte_mkclean(entry);
                        set_pte_at(vma->vm_mm, address, pte, entry);
                        ret = 1;
                } else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
                        pmd_t *pmd = pvmw->pmd;
                        pmd_t entry;

                        /* Same treatment for a PMD mapping. */
                        if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
                                continue;

                        flush_cache_range(vma, address,
                                          address + HPAGE_PMD_SIZE);
                        entry = pmdp_invalidate(vma, address, pmd);
                        entry = pmd_wrprotect(entry);
                        entry = pmd_mkclean(entry);
                        set_pmd_at(vma->vm_mm, address, pmd, entry);
                        ret = 1;
#else
                        /* unexpected pmd-mapped folio? */
                        WARN_ON_ONCE(1);
#endif
                }

                if (ret)
                        cleaned++;
        }

        mmu_notifier_invalidate_range_end(&range);

        return cleaned;
}

static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
                             unsigned long address, void *arg)
{
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
        int *cleaned = arg;

        *cleaned += page_vma_mkclean_one(&pvmw);

        return true;
}

/*
 * invalid_vma callback for the mkclean walks: only VM_SHARED VMAs are
 * processed, every other VMA is reported as invalid.  @arg is unused.
 */
static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
        return !(vma->vm_flags & VM_SHARED);
}

/*
 * Write-protect and clean the mappings of @folio in all VM_SHARED VMAs.
 * The folio must be locked by the caller (enforced with BUG_ON).
 *
 * Returns the number of page table entries cleaned; 0 if the folio is not
 * mapped or folio_mapping() returns NULL for it.
 */
int folio_mkclean(struct folio *folio)
{
        int cleaned = 0;
        struct rmap_walk_control rwc = {
                .rmap_one = page_mkclean_one,
                .invalid_vma = invalid_mkclean_vma,
                .arg = &cleaned,
        };

        BUG_ON(!folio_test_locked(folio));

        if (folio_mapped(folio) && folio_mapping(folio))
                rmap_walk(folio, &rwc);

        return cleaned;
}
EXPORT_SYMBOL_GPL(folio_mkclean);

/**
 * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
 *                     [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
 *                     within the @vma of shared mappings. And since clean PTEs
 *                     should also be readonly, write protects them too.
 * @pfn: start pfn.
 * @nr_pages: number of physically contiguous pages starting with @pfn.
 * @pgoff: page offset that the @pfn mapped with.
 * @vma: vma that @pfn mapped within.
 *
 * Returns the number of cleaned PTEs (including PMDs); 0 if @vma is not a
 * shared mapping.
 */
int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
                      struct vm_area_struct *vma)
{
        struct page_vma_mapped_walk pvmw = {
                .vma = vma,
                .pgoff = pgoff,
                .pfn = pfn,
                .nr_pages = nr_pages,
                .flags = PVMW_SYNC,
        };

        /* Same VMA filter as the folio_mkclean() walk. */
        if (invalid_mkclean_vma(vma, NULL))
                return 0;

        /* The range is expected to lie inside @vma. */
        pvmw.address = vma_address(vma, pgoff, nr_pages);
        VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);

        return page_vma_mkclean_one(&pvmw);
}

/*
 * Account a new mapping of @nr_pages pages of @folio, starting at @page, at
 * the given rmap @level, by updating the per-page, per-folio and "entire"
 * mapcounts.
 *
 * For RMAP_LEVEL_PMD, *@nr_pmdmapped is set to folio_nr_pages() when this is
 * the first entire mapping and the ENTIRELY_MAPPED race check passes;
 * otherwise it is left untouched.
 *
 * Returns nr, which callers in this file feed into the *_MAPPED statistics
 * (presumably the number of pages that just became mapped -- confirm against
 * the ENTIRELY_MAPPED / FOLIO_PAGES_MAPPED definitions).
 */
static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
                struct page *page, int nr_pages, enum rmap_level level,
                int *nr_pmdmapped)
{
        atomic_t *mapped = &folio->_nr_pages_mapped;
        const int orig_nr_pages = nr_pages;
        int first, nr = 0;

        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        switch (level) {
        case RMAP_LEVEL_PTE:
                /* Small folio: only the single page's mapcount is involved. */
                if (!folio_test_large(folio)) {
                        nr = atomic_inc_and_test(&page->_mapcount);
                        break;
                }

                /*
                 * Large folio: bump each page's mapcount; for a page whose
                 * increment tested true, also bump the folio-wide counter
                 * and count it unless the folio is already entirely mapped.
                 */
                do {
                        first = atomic_inc_and_test(&page->_mapcount);
                        if (first) {
                                first = atomic_inc_return_relaxed(mapped);
                                if (first < ENTIRELY_MAPPED)
                                        nr++;
                        }
                } while (page++, --nr_pages > 0);
                atomic_add(orig_nr_pages, &folio->_large_mapcount);
                break;
        case RMAP_LEVEL_PMD:
                first = atomic_inc_and_test(&folio->_entire_mapcount);
                if (first) {
                        nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
                        if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) {
                                *nr_pmdmapped = folio_nr_pages(folio);
                                /* Pages in the FOLIO_PAGES_MAPPED part of the counter are subtracted. */
                                nr = *nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
                                /* Raced ahead of a remove and another add? */
                                if (unlikely(nr < 0))
                                        nr = 0;
                        } else {
                                /* Raced ahead of a remove of ENTIRELY_MAPPED */
                                nr = 0;
                        }
                }
                atomic_inc(&folio->_large_mapcount);
                break;
        }
        return nr;
}

/**
 * folio_move_anon_rmap - move a folio to our anon_vma
 * @folio:        The folio to move to our anon_vma
 * @vma:        The vma the folio belongs to
 *
 * Once a COW event has left a folio belonging exclusively to one process,
 * the folio can be re-pointed at the anon_vma that belongs to just that
 * process, so the rmap code no longer searches the parent or sibling
 * processes. The folio must be locked.
 */
void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma)
{
        void *anon_vma = vma->anon_vma;
        void *tagged;

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_VMA(!anon_vma, vma);

        /*
         * The pointer and its PAGE_MAPPING_ANON tag are combined up front
         * and stored with a single WRITE_ONCE(), so that a concurrent reader
         * (eg folio_referenced()'s folio_test_anon()) never observes one
         * without the other.
         */
        tagged = anon_vma + PAGE_MAPPING_ANON;
        WRITE_ONCE(folio->mapping, tagged);
}

/**
 * __folio_set_anon - set up a new anonymous rmap for a folio
 * @folio:        The folio to set up the new anonymous rmap for.
 * @vma:        VM area to add the folio to.
 * @address:        User virtual address of the mapping
 * @exclusive:        Whether the folio is exclusive to the process.
 *
 * Stores the tagged anon_vma pointer in folio->mapping and sets folio->index
 * from @address.  @vma must already have an anon_vma.
 */
static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
                             unsigned long address, bool exclusive)
{
        struct anon_vma *anon_vma = vma->anon_vma;
        void *tagged;

        BUG_ON(!anon_vma);

        /*
         * A folio that is not exclusive to this vma must be filed under the
         * _oldest_ possible anon_vma, i.e. the root.
         *
         * page_idle does a lockless/optimistic rmap scan on folio->mapping,
         * so the anon_vma pointer and the PAGE_MAPPING_ANON type identifier
         * are combined first and written with one WRITE_ONCE(); a split
         * store could make the rmap code mistake the mapping for a
         * struct address_space and crash.
         */
        tagged = (void *)(exclusive ? anon_vma : anon_vma->root) +
                 PAGE_MAPPING_ANON;
        WRITE_ONCE(folio->mapping, (struct address_space *)tagged);

        folio->index = linear_page_index(vma, address);
}

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @folio:        The folio containing @page.
 * @page:        the page to check the mapping of
 * @vma:        the vm area in which the mapping is added
 * @address:        the user virtual address mapped
 *
 * Debug-only consistency check: the folio's anon_vma must share its root
 * with @vma's anon_vma, and @page's offset must match the linear page index
 * of @address in @vma.
 */
static void __page_check_anon_rmap(struct folio *folio, struct page *page,
        struct vm_area_struct *vma, unsigned long address)
{
        /*
         * The page's anon-rmap details (mapping and index) are guaranteed to
         * be set up correctly at this point.
         *
         * We have exclusion against folio_add_anon_rmap_*() because the caller
         * always holds the page locked.
         *
         * We have exclusion against folio_add_new_anon_rmap because those pages
         * are initially only visible via the pagetables, and the pte is locked
         * over the call to folio_add_new_anon_rmap.
         */
        VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
                        folio);
        VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
                       page);
}

/*
 * Common implementation behind folio_add_anon_rmap_ptes() and
 * folio_add_anon_rmap_pmd(): add mappings for @nr_pages pages of @folio,
 * starting at @page, at the given @level.
 *
 * In order, it:
 *  - accounts the new mapping(s) through __folio_add_rmap() and applies the
 *    counts it reports to NR_ANON_THPS and NR_ANON_MAPPED;
 *  - sets up the anon mapping with __folio_set_anon() if the folio is not
 *    anon yet, otherwise (unless it is a KSM folio) sanity checks the
 *    existing mapping with __page_check_anon_rmap();
 *  - marks the mapped page(s) PageAnonExclusive when RMAP_EXCLUSIVE is set;
 *  - hands small folios to mlock_vma_folio().
 */
static __always_inline void __folio_add_anon_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *vma,
                unsigned long address, rmap_t flags, enum rmap_level level)
{
        int i, nr, nr_pmdmapped = 0;

        nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
        if (nr_pmdmapped)
                __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped);
        if (nr)
                __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);

        if (unlikely(!folio_test_anon(folio))) {
                /* First anon mapping of this folio: it is expected locked. */
                VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
                /*
                 * For a PTE-mapped large folio, we only know that the single
                 * PTE is exclusive. Further, __folio_set_anon() might not get
                 * folio->index right when not given the address of the head
                 * page.
                 */
                VM_WARN_ON_FOLIO(folio_test_large(folio) &&
                                 level != RMAP_LEVEL_PMD, folio);
                __folio_set_anon(folio, vma, address,
                                 !!(flags & RMAP_EXCLUSIVE));
        } else if (likely(!folio_test_ksm(folio))) {
                __page_check_anon_rmap(folio, page, vma, address);
        }

        if (flags & RMAP_EXCLUSIVE) {
                /*
                 * PTE level marks every page of the range; PMD level marks
                 * only @page.
                 */
                switch (level) {
                case RMAP_LEVEL_PTE:
                        for (i = 0; i < nr_pages; i++)
                                SetPageAnonExclusive(page + i);
                        break;
                case RMAP_LEVEL_PMD:
                        SetPageAnonExclusive(page);
                        break;
                }
        }
        /*
         * Debug check only: warn if a page is marked exclusive although its
         * mapcounts indicate more than one mapping.
         */
        for (i = 0; i < nr_pages; i++) {
                struct page *cur_page = page + i;

                /* While PTE-mapping a THP we have a PMD and a PTE mapping. */
                VM_WARN_ON_FOLIO((atomic_read(&cur_page->_mapcount) > 0 ||
                                  (folio_test_large(folio) &&
                                   folio_entire_mapcount(folio) > 1)) &&
                                 PageAnonExclusive(cur_page), folio);
        }

        /*
         * For large folio, only mlock it if it's fully mapped to VMA. It's
         * not easy to check whether the large folio is fully mapped to VMA
         * here. Only mlock normal 4K folio and leave page reclaim to handle
         * large folio.
         */
        if (!folio_test_large(folio))
                mlock_vma_folio(folio, vma);
}

/**
 * folio_add_anon_rmap_ptes - add PTE mappings to a page range of an anon folio
 * @folio:        The folio to add the mappings to
 * @page:        The first page to add
 * @nr_pages:        The number of pages which will be mapped
 * @vma:        The vm area in which the mappings are added
 * @address:        The user virtual address of the first page to map
 * @flags:        The rmap flags
 *
 * The page range of folio is defined by [page, page + nr_pages)
 *
 * This is a thin wrapper that forwards all arguments to
 * __folio_add_anon_rmap() with RMAP_LEVEL_PTE.
 *
 * The caller needs to hold the page table lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting,
 * and to ensure that an anon folio is not being upgraded racily to a KSM folio
 * (but KSM folios are never downgraded).
 */
void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page,
                int nr_pages, struct vm_area_struct *vma, unsigned long address,
                rmap_t flags)
{
        __folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags,
                              RMAP_LEVEL_PTE);
}

/**
 * folio_add_anon_rmap_pmd - add a PMD mapping to a page range of an anon folio
 * @folio:        The folio to add the mapping to
 * @page:        The first page to add
 * @vma:        The vm area in which the mapping is added
 * @address:        The user virtual address of the first page to map
 * @flags:        The rmap flags
 *
 * Adds a PMD-level mapping covering the page range
 * [page, page + HPAGE_PMD_NR) of @folio by calling __folio_add_anon_rmap()
 * with RMAP_LEVEL_PMD. When the kernel is built without
 * CONFIG_TRANSPARENT_HUGEPAGE, the call does nothing except trigger
 * WARN_ON_ONCE().
 *
 * Locking: the caller holds the page table lock; in the anon_vma case the
 * page must be locked as well, to serialize mapping,index checking after
 * setting.
 */
void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
                struct vm_area_struct *vma, unsigned long address, rmap_t flags)
{
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
        /* PMD mappings are not expected in this configuration. */
        WARN_ON_ONCE(true);
#else
        __folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address,
                              flags, RMAP_LEVEL_PMD);
#endif
}

/**
 * folio_add_new_anon_rmap - Add mapping to a new anonymous folio.
 * @folio:        The folio to add the mapping to.
 * @vma:        the vm area in which the mapping is added
 * @address:        the user virtual address mapped
 *
 * Variant of folio_add_anon_rmap_*() that must only be used on *new* folios.
 * Because nobody else can be updating the mapcounts yet, they are written
 * with plain atomic_set() instead of inc-and-test, and the folio does not
 * have to be locked.
 *
 * Three cases are handled:
 *  - small folio: the single page is mapped once and marked exclusive;
 *  - pmd-mappable folio: mapped entirely, accounted as a THP, and the head
 *    page is marked exclusive;
 *  - any other large folio: every page is mapped once and marked exclusive.
 *
 * As the folio is new, it's assumed to be mapped exclusively by a single
 * process.
 */
void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
                unsigned long address)
{
        int nr_pages = folio_nr_pages(folio);

        VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
        /* The whole folio has to fit inside the vma. */
        VM_BUG_ON_VMA(address < vma->vm_start ||
                        address + (nr_pages << PAGE_SHIFT) > vma->vm_end, vma);
        __folio_set_swapbacked(folio);
        __folio_set_anon(folio, vma, address, true);

        if (likely(!folio_test_large(folio))) {
                /* mapcount starts at -1, so 0 means "mapped once" */
                atomic_set(&folio->_mapcount, 0);
                SetPageAnonExclusive(&folio->page);
        } else if (folio_test_pmd_mappable(folio)) {
                /* entire mapcount starts at -1, so 0 means "mapped once" */
                atomic_set(&folio->_entire_mapcount, 0);
                /* large mapcount starts at -1 as well */
                atomic_set(&folio->_large_mapcount, 0);
                atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
                SetPageAnonExclusive(&folio->page);
                __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pages);
        } else {
                int idx;

                for (idx = 0; idx < nr_pages; idx++) {
                        struct page *subpage = folio_page(folio, idx);

                        /* per-page mapcount starts at -1 */
                        atomic_set(&subpage->_mapcount, 0);
                        SetPageAnonExclusive(subpage);
                }

                /* large mapcount starts at -1: nr_pages mappings in total */
                atomic_set(&folio->_large_mapcount, nr_pages - 1);
                atomic_set(&folio->_nr_pages_mapped, nr_pages);
        }

        __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr_pages);
}

/*
 * Common implementation behind folio_add_file_rmap_ptes() and
 * folio_add_file_rmap_pmd(): account @nr_pages new mappings of the non-anon
 * @folio, starting at @page, at the given @level, then update the mapped
 * statistics with the counts reported by __folio_add_rmap().
 */
static __always_inline void __folio_add_file_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *vma,
                enum rmap_level level)
{
        pg_data_t *pgdat = folio_pgdat(folio);
        int nr_mapped, nr_pmdmapped = 0;

        VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

        nr_mapped = __folio_add_rmap(folio, page, nr_pages, level,
                                     &nr_pmdmapped);

        if (nr_pmdmapped) {
                /* Swap-backed folios are counted as shmem, the rest as file. */
                if (folio_test_swapbacked(folio))
                        __mod_node_page_state(pgdat, NR_SHMEM_PMDMAPPED,
                                              nr_pmdmapped);
                else
                        __mod_node_page_state(pgdat, NR_FILE_PMDMAPPED,
                                              nr_pmdmapped);
        }

        if (nr_mapped)
                __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr_mapped);

        /* See comments in folio_add_anon_rmap_*() */
        if (!folio_test_large(folio))
                mlock_vma_folio(folio, vma);
}

/**
 * folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio
 * @folio:        The folio to add the mappings to
 * @page:        The first page to add
 * @nr_pages:        The number of pages that will be mapped using PTEs
 * @vma:        The vm area in which the mappings are added
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * This is a thin wrapper that forwards all arguments to
 * __folio_add_file_rmap() with RMAP_LEVEL_PTE.
 *
 * The caller needs to hold the page table lock.
 */
void folio_add_file_rmap_ptes(struct folio *folio, struct page *page,
                int nr_pages, struct vm_area_struct *vma)
{
        __folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE);
}

/**
 * folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio
 * @folio:        The folio to add the mapping to
 * @page:        The first page to add
 * @vma:        The vm area in which the mapping is added
 *
 * Adds a PMD-level mapping covering the page range
 * [page, page + HPAGE_PMD_NR) of @folio by calling __folio_add_file_rmap()
 * with RMAP_LEVEL_PMD. When the kernel is built without
 * CONFIG_TRANSPARENT_HUGEPAGE, the call does nothing except trigger
 * WARN_ON_ONCE().
 *
 * The caller needs to hold the page table lock.
 */
void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
                struct vm_area_struct *vma)
{
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
        /* PMD mappings are not expected in this configuration. */
        WARN_ON_ONCE(true);
#else
        __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma,
                              RMAP_LEVEL_PMD);
#endif
}

/*
 * Common implementation behind folio_remove_rmap_ptes() and
 * folio_remove_rmap_pmd(): remove mappings for @nr_pages pages of @folio,
 * starting at @page, at the given @level.
 *
 * After the mapcounts have been adjusted, 'nr' holds the number of pages to
 * subtract from NR_ANON_MAPPED/NR_FILE_MAPPED and 'nr_pmdmapped' the number
 * to subtract from the PMD-mapped statistics. An anon folio that is found to
 * be partially mapped is queued for deferred split, and finally the folio is
 * passed to munlock_vma_folio().
 */
static __always_inline void __folio_remove_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *vma,
                enum rmap_level level)
{
        atomic_t *mapped = &folio->_nr_pages_mapped;
        pg_data_t *pgdat = folio_pgdat(folio);
        int last, nr = 0, nr_pmdmapped = 0;
        bool partially_mapped = false;
        enum node_stat_item idx;

        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        switch (level) {
        case RMAP_LEVEL_PTE:
                if (!folio_test_large(folio)) {
                        /*
                         * Small folio: nr becomes nonzero only if the
                         * mapcount went negative, i.e. this was the last
                         * mapping of the page.
                         */
                        nr = atomic_add_negative(-1, &page->_mapcount);
                        break;
                }

                atomic_sub(nr_pages, &folio->_large_mapcount);
                /*
                 * Drop one mapping from each page of the range. A page is
                 * counted in nr only when its own mapcount went negative and
                 * the decremented folio-wide count is below ENTIRELY_MAPPED.
                 */
                do {
                        last = atomic_add_negative(-1, &page->_mapcount);
                        if (last) {
                                last = atomic_dec_return_relaxed(mapped);
                                if (last < ENTIRELY_MAPPED)
                                        nr++;
                        }
                } while (page++, --nr_pages > 0);

                /* Pages were unmapped while the folio-wide count is nonzero. */
                partially_mapped = nr && atomic_read(mapped);
                break;
        case RMAP_LEVEL_PMD:
                atomic_dec(&folio->_large_mapcount);
                last = atomic_add_negative(-1, &folio->_entire_mapcount);
                if (last) {
                        /*
                         * The entire mapcount went negative: subtract
                         * ENTIRELY_MAPPED from the folio-wide count and
                         * derive how many pages are no longer mapped at all.
                         */
                        nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
                        if (likely(nr < ENTIRELY_MAPPED)) {
                                nr_pmdmapped = folio_nr_pages(folio);
                                /*
                                 * Exclude the pages still counted in the
                                 * FOLIO_PAGES_MAPPED bits of the result.
                                 */
                                nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
                                /* Raced ahead of another remove and an add? */
                                if (unlikely(nr < 0))
                                        nr = 0;
                        } else {
                                /* An add of ENTIRELY_MAPPED raced ahead */
                                nr = 0;
                        }
                }

                /* Fewer pages became unmapped than the PMD mapping covered. */
                partially_mapped = nr < nr_pmdmapped;
                break;
        }

        if (nr_pmdmapped) {
                /* NR_{FILE/SHMEM}_PMDMAPPED are not maintained per-memcg */
                if (folio_test_anon(folio))
                        __lruvec_stat_mod_folio(folio, NR_ANON_THPS, -nr_pmdmapped);
                else
                        __mod_node_page_state(pgdat,
                                        folio_test_swapbacked(folio) ?
                                        NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED,
                                        -nr_pmdmapped);
        }
        if (nr) {
                idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
                __lruvec_stat_mod_folio(folio, idx, -nr);

                /*
                 * Queue anon large folio for deferred split if at least one
                 * page of the folio is unmapped and at least one page
                 * is still mapped.
                 *
                 * Check partially_mapped first to ensure it is a large folio.
                 */
                if (folio_test_anon(folio) && partially_mapped &&
                    list_empty(&folio->_deferred_list))
                        deferred_split_folio(folio);
        }

        /*
         * It would be tidy to reset folio_test_anon mapping when fully
         * unmapped, but that might overwrite a racing folio_add_anon_rmap_*()
         * which increments mapcount after us but sets mapping before us:
         * so leave the reset to free_pages_prepare, and remember that
         * it's only reliable while mapped.
         */

        munlock_vma_folio(folio, vma);
}

/**
 * folio_remove_rmap_ptes - remove PTE mappings from a page range of a folio
 * @folio:        The folio to remove the mappings from
 * @page:        The first page to remove
 * @nr_pages:        The number of pages that will be removed from the mapping
 * @vma:        The vm area from which the mappings are removed
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * This is a thin wrapper that forwards all arguments to
 * __folio_remove_rmap() with RMAP_LEVEL_PTE.
 *
 * The caller needs to hold the page table lock.
 */
void folio_remove_rmap_ptes(struct folio *folio, struct page *page,
                int nr_pages, struct vm_area_struct *vma)
{
        __folio_remove_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE);
}

/**
 * folio_remove_rmap_pmd - remove a PMD mapping from a page range of a folio
 * @folio:        The folio to remove the mapping from
 * @page:        The first page to remove
 * @vma:        The vm area from which the mapping is removed
 *
 * Removes the PMD-level mapping covering the page range
 * [page, page + HPAGE_PMD_NR) of @folio by calling __folio_remove_rmap()
 * with RMAP_LEVEL_PMD. When the kernel is built without
 * CONFIG_TRANSPARENT_HUGEPAGE, the call does nothing except trigger
 * WARN_ON_ONCE().
 *
 * The caller needs to hold the page table lock.
 */
void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
                struct vm_area_struct *vma)
{
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
        /* PMD mappings are not expected in this configuration. */
        WARN_ON_ONCE(true);
#else
        __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma,
                            RMAP_LEVEL_PMD);
#endif
}

/*
 * try_to_unmap_one - unmap @folio from the page tables of a single vma.
 *
 * Installed by try_to_unmap() as the rmap_one callback of its rmap walk.
 *
 * @folio: the folio being unmapped
 * @vma: the vma whose page tables are walked
 * @address: start address handed to the page table walk
 * @arg: enum ttu_flags will be passed to this argument
 *
 * For every page table entry in @vma that maps @folio, the entry is cleared
 * and, depending on the folio and the flags, replaced by a hwpoison entry or
 * a swap entry, or simply dropped; the rmap and a folio reference are then
 * released.
 *
 * Returns true unless the walk had to be abandoned (mlocked vma, failed
 * hugetlb vma trylock, or a PTE that had to be restored), in which case
 * false is returned.
 */
static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                     unsigned long address, void *arg)
{
        struct mm_struct *mm = vma->vm_mm;
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        pte_t pteval;
        struct page *subpage;
        bool anon_exclusive, ret = true;
        struct mmu_notifier_range range;
        enum ttu_flags flags = (enum ttu_flags)(long)arg;
        unsigned long pfn;
        unsigned long hsz = 0;

        /*
         * When racing against e.g. zap_pte_range() on another cpu,
         * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
         * try_to_unmap() may return before page_mapped() has become false,
         * if page table locking is skipped: use TTU_SYNC to wait for that.
         */
        if (flags & TTU_SYNC)
                pvmw.flags = PVMW_SYNC;

        if (flags & TTU_SPLIT_HUGE_PMD)
                split_huge_pmd_address(vma, address, false, folio);

        /*
         * For THP, we have to assume the worse case ie pmd for invalidation.
         * For hugetlb, it could be much worse if we need to do pud
         * invalidation in the case of pmd sharing.
         *
         * Note that the folio can not be freed in this function as call of
         * try_to_unmap() must hold a reference on the folio.
         */
        range.end = vma_address_end(&pvmw);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                address, range.end);
        if (folio_test_hugetlb(folio)) {
                /*
                 * If sharing is possible, start and end will be adjusted
                 * accordingly.
                 */
                adjust_range_if_pmd_sharing_possible(vma, &range.start,
                                                     &range.end);

                /* We need the huge page size for set_huge_pte_at() */
                hsz = huge_page_size(hstate_vma(vma));
        }
        mmu_notifier_invalidate_range_start(&range);

        while (page_vma_mapped_walk(&pvmw)) {
                /* Unexpected PMD-mapped THP? */
                VM_BUG_ON_FOLIO(!pvmw.pte, folio);

                /*
                 * If the folio is in an mlock()d vma, we must not swap it out.
                 */
                if (!(flags & TTU_IGNORE_MLOCK) &&
                    (vma->vm_flags & VM_LOCKED)) {
                        /* Restore the mlock which got missed */
                        if (!folio_test_large(folio))
                                mlock_vma_folio(folio, vma);
                        page_vma_mapped_walk_done(&pvmw);
                        ret = false;
                        break;
                }

                /* Find the page of the folio that this PTE maps. */
                pfn = pte_pfn(ptep_get(pvmw.pte));
                subpage = folio_page(folio, pfn - folio_pfn(folio));
                address = pvmw.address;
                anon_exclusive = folio_test_anon(folio) &&
                                 PageAnonExclusive(subpage);

                if (folio_test_hugetlb(folio)) {
                        bool anon = folio_test_anon(folio);

                        /*
                         * The try_to_unmap() is only passed a hugetlb page
                         * in the case where the hugetlb page is poisoned.
                         */
                        VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
                        /*
                         * huge_pmd_unshare may unmap an entire PMD page.
                         * There is no way of knowing exactly which PMDs may
                         * be cached for this mm, so we must flush them all.
                         * start/end were already adjusted above to cover this
                         * range.
                         */
                        flush_cache_range(vma, range.start, range.end);

                        /*
                         * To call huge_pmd_unshare, i_mmap_rwsem must be
                         * held in write mode.  Caller needs to explicitly
                         * do this outside rmap routines.
                         *
                         * We also must hold hugetlb vma_lock in write mode.
                         * Lock order dictates acquiring vma_lock BEFORE
                         * i_mmap_rwsem.  We can only try lock here and fail
                         * if unsuccessful.
                         */
                        if (!anon) {
                                VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
                                if (!hugetlb_vma_trylock_write(vma)) {
                                        page_vma_mapped_walk_done(&pvmw);
                                        ret = false;
                                        break;
                                }
                                if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
                                        hugetlb_vma_unlock_write(vma);
                                        flush_tlb_range(vma,
                                                range.start, range.end);
                                        /*
                                         * The ref count of the PMD page was
                                         * dropped which is part of the way map
                                         * counting is done for shared PMDs.
                                         * Return 'true' here.  When there is
                                         * no other sharing, huge_pmd_unshare
                                         * returns false and we will unmap the
                                         * actual page and drop map count
                                         * to zero.
                                         */
                                        page_vma_mapped_walk_done(&pvmw);
                                        break;
                                }
                                hugetlb_vma_unlock_write(vma);
                        }
                        pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
                } else {
                        flush_cache_page(vma, address, pfn);
                        /* Nuke the page table entry. */
                        if (should_defer_flush(mm, flags)) {
                                /*
                                 * We clear the PTE but do not flush so potentially
                                 * a remote CPU could still be writing to the folio.
                                 * If the entry was previously clean then the
                                 * architecture must guarantee that a clear->dirty
                                 * transition on a cached TLB entry is written through
                                 * and traps if the PTE is unmapped.
                                 */
                                pteval = ptep_get_and_clear(mm, address, pvmw.pte);

                                set_tlb_ubc_flush_pending(mm, pteval, address);
                        } else {
                                pteval = ptep_clear_flush(vma, address, pvmw.pte);
                        }
                }

                /*
                 * Now the pte is cleared. If this pte was uffd-wp armed,
                 * we may want to replace a none pte with a marker pte if
                 * it's file-backed, so we don't lose the tracking info.
                 */
                pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);

                /* Set the dirty flag on the folio now the pte is gone. */
                if (pte_dirty(pteval))
                        folio_mark_dirty(folio);

                /* Update high watermark before we lower rss */
                update_hiwater_rss(mm);

                if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) {
                        /* Leave a hwpoison entry in place of the mapping. */
                        pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
                        if (folio_test_hugetlb(folio)) {
                                hugetlb_count_sub(folio_nr_pages(folio), mm);
                                set_huge_pte_at(mm, address, pvmw.pte, pteval,
                                                hsz);
                        } else {
                                dec_mm_counter(mm, mm_counter(folio));
                                set_pte_at(mm, address, pvmw.pte, pteval);
                        }

                } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
                        /*
                         * The guest indicated that the page content is of no
                         * interest anymore. Simply discard the pte, vmscan
                         * will take care of the rest.
                         * A future reference will then fault in a new zero
                         * page. When userfaultfd is active, we must not drop
                         * this page though, as its main user (postcopy
                         * migration) will not expect userfaults on already
                         * copied pages.
                         */
                        dec_mm_counter(mm, mm_counter(folio));
                } else if (folio_test_anon(folio)) {
                        swp_entry_t entry = page_swap_entry(subpage);
                        pte_t swp_pte;
                        /*
                         * Store the swap location in the pte.
                         * See handle_pte_fault() ...
                         */
                        if (unlikely(folio_test_swapbacked(folio) !=
                                        folio_test_swapcache(folio))) {
                                WARN_ON_ONCE(1);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }

                        /* MADV_FREE page check */
                        if (!folio_test_swapbacked(folio)) {
                                int ref_count, map_count;

                                /*
                                 * Synchronize with gup_pte_range():
                                 * - clear PTE; barrier; read refcount
                                 * - inc refcount; barrier; read PTE
                                 */
                                smp_mb();

                                ref_count = folio_ref_count(folio);
                                map_count = folio_mapcount(folio);

                                /*
                                 * Order reads for page refcount and dirty flag
                                 * (see comments in __remove_mapping()).
                                 */
                                smp_rmb();

                                /*
                                 * The only page refs must be one from isolation
                                 * plus the rmap(s) (dropped by discard:).
                                 */
                                if (ref_count == 1 + map_count &&
                                    !folio_test_dirty(folio)) {
                                        dec_mm_counter(mm, MM_ANONPAGES);
                                        goto discard;
                                }

                                /*
                                 * If the folio was redirtied, it cannot be
                                 * discarded. Remap the page to page table.
                                 */
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                folio_set_swapbacked(folio);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }

                        /*
                         * Each failure below restores the original PTE and
                         * abandons the walk; once swap_duplicate() has
                         * succeeded, the swap entry is released with
                         * swap_free() as well.
                         */
                        if (swap_duplicate(entry) < 0) {
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                        if (arch_unmap_one(mm, vma, address, pteval) < 0) {
                                swap_free(entry);
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }

                        /* See folio_try_share_anon_rmap(): clear PTE first. */
                        if (anon_exclusive &&
                            folio_try_share_anon_rmap_pte(folio, subpage)) {
                                swap_free(entry);
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                        /*
                         * Put the mm on init_mm's mmlist if it is not on a
                         * list yet; the emptiness test is repeated under
                         * mmlist_lock.
                         */
                        if (list_empty(&mm->mmlist)) {
                                spin_lock(&mmlist_lock);
                                if (list_empty(&mm->mmlist))
                                        list_add(&mm->mmlist, &init_mm.mmlist);
                                spin_unlock(&mmlist_lock);
                        }
                        dec_mm_counter(mm, MM_ANONPAGES);
                        inc_mm_counter(mm, MM_SWAPENTS);
                        /* Carry the relevant PTE bits over to the swap PTE. */
                        swp_pte = swp_entry_to_pte(entry);
                        if (anon_exclusive)
                                swp_pte = pte_swp_mkexclusive(swp_pte);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
                        if (pte_uffd_wp(pteval))
                                swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, address, pvmw.pte, swp_pte);
                } else {
                        /*
                         * This is a locked file-backed folio,
                         * so it cannot be removed from the page
                         * cache and replaced by a new folio before
                         * mmu_notifier_invalidate_range_end, so no
                         * concurrent thread might update its page table
                         * to point at a new folio while a device is
                         * still using this folio.
                         *
                         * See Documentation/mm/mmu_notifier.rst
                         */
                        dec_mm_counter(mm, mm_counter_file(folio));
                }
                /* Release this mapping's rmap and its folio reference. */
discard:
                if (unlikely(folio_test_hugetlb(folio)))
                        hugetlb_remove_rmap(folio);
                else
                        folio_remove_rmap_pte(folio, subpage, vma);
                if (vma->vm_flags & VM_LOCKED)
                        mlock_drain_local();
                folio_put(folio);
        }

        mmu_notifier_invalidate_range_end(&range);

        return ret;
}

/*
 * Report whether @vma is a temporary stack vma, as determined by
 * vma_is_temporary_stack(). @arg is not used.
 */
static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
        return vma_is_temporary_stack(vma);
}

/*
 * Return nonzero once @folio is no longer mapped. try_to_unmap() installs
 * this as the 'done' callback of its rmap walk.
 */
static int folio_not_mapped(struct folio *folio)
{
        return !folio_mapped(folio);
}

/**
 * try_to_unmap - Try to remove all page table mappings to a folio.
 * @folio: The folio to unmap.
 * @flags: action and flags
 *
 * Runs an rmap walk over @folio with try_to_unmap_one() as the per-vma
 * callback, attempting to remove every page table entry that maps the
 * folio. Nothing is returned: it is the caller's responsibility to check if
 * the folio is still mapped if needed (use TTU_SYNC to prevent accounting
 * races).
 *
 * With TTU_RMAP_LOCKED set in @flags the walk is done with
 * rmap_walk_locked(), otherwise with rmap_walk().
 *
 * Context: Caller must hold the folio lock.
 */
void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
        struct rmap_walk_control rwc = {
                .arg = (void *)flags,
                .rmap_one = try_to_unmap_one,
                .done = folio_not_mapped,
                .anon_lock = folio_lock_anon_vma_read,
        };

        if (!(flags & TTU_RMAP_LOCKED)) {
                rmap_walk(folio, &rwc);
                return;
        }

        rmap_walk_locked(folio, &rwc);
}

/*
 * @arg: enum ttu_flags will be passed to this argument.
 *
 * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
 * containing migration entries.
 */
static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                     unsigned long address, void *arg)
{
        struct mm_struct *mm = vma->vm_mm;
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        pte_t pteval;
        struct page *subpage;
        bool anon_exclusive, ret = true;
        struct mmu_notifier_range range;
        enum ttu_flags flags = (enum ttu_flags)(long)arg;
        unsigned long pfn;
        unsigned long hsz = 0;

        /*
         * When racing against e.g. zap_pte_range() on another cpu,
         * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
         * try_to_migrate() may return before page_mapped() has become false,
         * if page table locking is skipped: use TTU_SYNC to wait for that.
         */
        if (flags & TTU_SYNC)
                pvmw.flags = PVMW_SYNC;

        /*
         * unmap_page() in mm/huge_memory.c is the only user of migration with
         * TTU_SPLIT_HUGE_PMD and it wants to freeze.
         */
        if (flags & TTU_SPLIT_HUGE_PMD)
                split_huge_pmd_address(vma, address, true, folio);

        /*
         * For THP, we have to assume the worse case ie pmd for invalidation.
         * For hugetlb, it could be much worse if we need to do pud
         * invalidation in the case of pmd sharing.
         *
         * Note that the folio cannot be freed in this function, as the
         * caller of try_to_migrate() must hold a reference on the folio.
         */
        range.end = vma_address_end(&pvmw);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                address, range.end);
        if (folio_test_hugetlb(folio)) {
                /*
                 * If sharing is possible, start and end will be adjusted
                 * accordingly.
                 */
                adjust_range_if_pmd_sharing_possible(vma, &range.start,
                                                     &range.end);

                /* We need the huge page size for set_huge_pte_at() */
                hsz = huge_page_size(hstate_vma(vma));
        }
        mmu_notifier_invalidate_range_start(&range);

        while (page_vma_mapped_walk(&pvmw)) {
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
                /* PMD-mapped THP migration entry */
                if (!pvmw.pte) {
                        subpage = folio_page(folio,
                                pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
                        VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
                                        !folio_test_pmd_mappable(folio), folio);

                        if (set_pmd_migration_entry(&pvmw, subpage)) {
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                        continue;
                }
#endif

                /* Unexpected PMD-mapped THP? */
                VM_BUG_ON_FOLIO(!pvmw.pte, folio);

                pfn = pte_pfn(ptep_get(pvmw.pte));

                if (folio_is_zone_device(folio)) {
                        /*
                         * Our PTE is a non-present device exclusive entry and
                         * calculating the subpage as for the common case would
                         * result in an invalid pointer.
                         *
                         * Since only PAGE_SIZE pages can currently be
                         * migrated, just set it to page. This will need to be
                         * changed when hugepage migrations to device private
                         * memory are supported.
                         */
                        VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
                        subpage = &folio->page;
                } else {
                        subpage = folio_page(folio, pfn - folio_pfn(folio));
                }
                address = pvmw.address;
                anon_exclusive = folio_test_anon(folio) &&
                                 PageAnonExclusive(subpage);

                if (folio_test_hugetlb(folio)) {
                        bool anon = folio_test_anon(folio);

                        /*
                         * huge_pmd_unshare may unmap an entire PMD page.
                         * There is no way of knowing exactly which PMDs may
                         * be cached for this mm, so we must flush them all.
                         * start/end were already adjusted above to cover this
                         * range.
                         */
                        flush_cache_range(vma, range.start, range.end);

                        /*
                         * To call huge_pmd_unshare, i_mmap_rwsem must be
                         * held in write mode.  Caller needs to explicitly
                         * do this outside rmap routines.
                         *
                         * We also must hold hugetlb vma_lock in write mode.
                         * Lock order dictates acquiring vma_lock BEFORE
                         * i_mmap_rwsem.  We can only try lock here and
                         * fail if unsuccessful.
                         */
                        if (!anon) {
                                VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
                                if (!hugetlb_vma_trylock_write(vma)) {
                                        page_vma_mapped_walk_done(&pvmw);
                                        ret = false;
                                        break;
                                }
                                if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
                                        hugetlb_vma_unlock_write(vma);
                                        flush_tlb_range(vma,
                                                range.start, range.end);

                                        /*
                                         * The ref count of the PMD page was
                                         * dropped which is part of the way map
                                         * counting is done for shared PMDs.
                                         * Return 'true' here.  When there is
                                         * no other sharing, huge_pmd_unshare
                                         * returns false and we will unmap the
                                         * actual page and drop map count
                                         * to zero.
                                         */
                                        page_vma_mapped_walk_done(&pvmw);
                                        break;
                                }
                                hugetlb_vma_unlock_write(vma);
                        }
                        /* Nuke the hugetlb page table entry */
                        pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
                } else {
                        flush_cache_page(vma, address, pfn);
                        /* Nuke the page table entry. */
                        if (should_defer_flush(mm, flags)) {
                                /*
                                 * We clear the PTE but do not flush so potentially
                                 * a remote CPU could still be writing to the folio.
                                 * If the entry was previously clean then the
                                 * architecture must guarantee that a clear->dirty
                                 * transition on a cached TLB entry is written through
                                 * and traps if the PTE is unmapped.
                                 */
                                pteval = ptep_get_and_clear(mm, address, pvmw.pte);

                                set_tlb_ubc_flush_pending(mm, pteval, address);
                        } else {
                                pteval = ptep_clear_flush(vma, address, pvmw.pte);
                        }
                }

                /* Set the dirty flag on the folio now the pte is gone. */
                if (pte_dirty(pteval))
                        folio_mark_dirty(folio);

                /* Update high watermark before we lower rss */
                update_hiwater_rss(mm);

                if (folio_is_device_private(folio)) {
                        unsigned long pfn = folio_pfn(folio);
                        swp_entry_t entry;
                        pte_t swp_pte;

                        if (anon_exclusive)
                                WARN_ON_ONCE(folio_try_share_anon_rmap_pte(folio,
                                                                           subpage));

                        /*
                         * Store the pfn of the page in a special migration
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
                        entry = pte_to_swp_entry(pteval);
                        if (is_writable_device_private_entry(entry))
                                entry = make_writable_migration_entry(pfn);
                        else if (anon_exclusive)
                                entry = make_readable_exclusive_migration_entry(pfn);
                        else
                                entry = make_readable_migration_entry(pfn);
                        swp_pte = swp_entry_to_pte(entry);

                        /*
                         * pteval maps a zone device page and is therefore
                         * a swap pte.
                         */
                        if (pte_swp_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
                        if (pte_swp_uffd_wp(pteval))
                                swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
                        trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
                                                folio_order(folio));
                        /*
                         * No need to invalidate here it will synchronize on
                         * against the special swap migration pte.
                         */
                } else if (PageHWPoison(subpage)) {
                        pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
                        if (folio_test_hugetlb(folio)) {
                                hugetlb_count_sub(folio_nr_pages(folio), mm);
                                set_huge_pte_at(mm, address, pvmw.pte, pteval,
                                                hsz);
                        } else {
                                dec_mm_counter(mm, mm_counter(folio));
                                set_pte_at(mm, address, pvmw.pte, pteval);
                        }

                } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
                        /*
                         * The guest indicated that the page content is of no
                         * interest anymore. Simply discard the pte, vmscan
                         * will take care of the rest.
                         * A future reference will then fault in a new zero
                         * page. When userfaultfd is active, we must not drop
                         * this page though, as its main user (postcopy
                         * migration) will not expect userfaults on already
                         * copied pages.
                         */
                        dec_mm_counter(mm, mm_counter(folio));
                } else {
                        swp_entry_t entry;
                        pte_t swp_pte;

                        if (arch_unmap_one(mm, vma, address, pteval) < 0) {
                                if (folio_test_hugetlb(folio))
                                        set_huge_pte_at(mm, address, pvmw.pte,
                                                        pteval, hsz);
                                else
                                        set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                        VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
                                       !anon_exclusive, subpage);

                        /* See folio_try_share_anon_rmap_pte(): clear PTE first. */
                        if (folio_test_hugetlb(folio)) {
                                if (anon_exclusive &&
                                    hugetlb_try_share_anon_rmap(folio)) {
                                        set_huge_pte_at(mm, address, pvmw.pte,
                                                        pteval, hsz);
                                        ret = false;
                                        page_vma_mapped_walk_done(&pvmw);
                                        break;
                                }
                        } else if (anon_exclusive &&
                                   folio_try_share_anon_rmap_pte(folio, subpage)) {
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }

                        /*
                         * Store the pfn of the page in a special migration
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
                        if (pte_write(pteval))
                                entry = make_writable_migration_entry(
                                                        page_to_pfn(subpage));
                        else if (anon_exclusive)
                                entry = make_readable_exclusive_migration_entry(
                                                        page_to_pfn(subpage));
                        else
                                entry = make_readable_migration_entry(
                                                        page_to_pfn(subpage));
                        if (pte_young(pteval))
                                entry = make_migration_entry_young(entry);
                        if (pte_dirty(pteval))
                                entry = make_migration_entry_dirty(entry);
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
                        if (pte_uffd_wp(pteval))
                                swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        if (folio_test_hugetlb(folio))
                                set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
                                                hsz);
                        else
                                set_pte_at(mm, address, pvmw.pte, swp_pte);
                        trace_set_migration_pte(address, pte_val(swp_pte),
                                                folio_order(folio));
                        /*
                         * No need to invalidate here it will synchronize on
                         * against the special swap migration pte.
                         */
                }

                if (unlikely(folio_test_hugetlb(folio)))
                        hugetlb_remove_rmap(folio);
                else
                        folio_remove_rmap_pte(folio, subpage, vma);
                if (vma->vm_flags & VM_LOCKED)
                        mlock_drain_local();
                folio_put(folio);
        }

        mmu_notifier_invalidate_range_end(&range);

        return ret;
}

/**
 * try_to_migrate - swap every mapping of a folio for a special swap entry
 * @folio: folio whose page table entries are to be replaced
 * @flags: TTU_* modifiers; only TTU_RMAP_LOCKED, TTU_SPLIT_HUGE_PMD,
 *         TTU_SYNC and TTU_BATCH_FLUSH are accepted
 *
 * Walks the reverse mappings of @folio, handing each one to
 * try_to_migrate_one() with @flags as its argument. Unsupported flags
 * trigger a one-time warning and nothing is done.
 *
 * Context: Caller must hold the folio lock.
 */
void try_to_migrate(struct folio *folio, enum ttu_flags flags)
{
        struct rmap_walk_control rwc = {
                .arg = (void *)flags,
                .anon_lock = folio_lock_anon_vma_read,
                .rmap_one = try_to_migrate_one,
                .done = folio_not_mapped,
        };

        /*
         * Mlock is always ignored for migration, and the only flags that
         * make sense here are TTU_RMAP_LOCKED, TTU_SPLIT_HUGE_PMD, TTU_SYNC
         * and TTU_BATCH_FLUSH.
         */
        if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
                                        TTU_SYNC | TTU_BATCH_FLUSH)))
                return;

        /*
         * Among zone-device folios only the device-private and
         * device-coherent kinds are processed; all others are left alone.
         */
        if (folio_is_zone_device(folio) &&
            !(folio_is_device_private(folio) || folio_is_device_coherent(folio)))
                return;

        /*
         * exec() sets up a temporary VMA and moves it later. The move is
         * done under the anon_vma lock but without the page tables, so
         * migration could fail to find its migration ptes. Instead of
         * tightening exec()'s locking, such temporary VMAs are skipped
         * until exec() has finished.
         */
        if (!folio_test_ksm(folio) && folio_test_anon(folio))
                rwc.invalid_vma = invalid_migration_vma;

        if (!(flags & TTU_RMAP_LOCKED)) {
                rmap_walk(folio, &rwc);
                return;
        }

        /* The caller already holds the rmap lock. */
        rmap_walk_locked(folio, &rwc);
}

#ifdef CONFIG_DEVICE_PRIVATE
/*
 * State shared between folio_make_device_exclusive() and its per-mapping
 * rmap callback page_make_device_exclusive_one().
 */
struct make_exclusive_args {
        /* mm in which the folio is expected to be mapped */
        struct mm_struct *mm;
        /* virtual address at which the folio is expected to be mapped */
        unsigned long address;
        /* forwarded to the MMU_NOTIFY_EXCLUSIVE notifier range */
        void *owner;
        /* set by the callback once a writable pte at (mm, address) was replaced */
        bool valid;
};

/*
 * page_make_device_exclusive_one - rmap callback converting one VMA's ptes
 * @folio: folio being made device exclusive
 * @vma: VMA currently being visited by the rmap walk
 * @address: address at which the walk of @folio within @vma starts
 * @priv: pointer to the caller's struct make_exclusive_args
 *
 * Every present pte in @vma that maps @folio is cleared, flushed and replaced
 * with a device-exclusive swap entry, and the corresponding rmap is removed.
 * The whole operation is bracketed by an MMU_NOTIFY_EXCLUSIVE notifier range
 * tagged with args->owner.
 *
 * Returns false, which stops the rmap walk, if a non-present pte is
 * encountered; true otherwise.
 */
static bool page_make_device_exclusive_one(struct folio *folio,
                struct vm_area_struct *vma, unsigned long address, void *priv)
{
        struct mm_struct *mm = vma->vm_mm;
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        struct make_exclusive_args *args = priv;
        pte_t pteval;
        struct page *subpage;
        bool ret = true;
        struct mmu_notifier_range range;
        swp_entry_t entry;
        pte_t swp_pte;
        pte_t ptent;

        /* The notified range covers the folio, clamped to the end of the VMA. */
        mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
                                      vma->vm_mm, address, min(vma->vm_end,
                                      address + folio_size(folio)),
                                      args->owner);
        mmu_notifier_invalidate_range_start(&range);

        while (page_vma_mapped_walk(&pvmw)) {
                /* Unexpected PMD-mapped THP? */
                VM_BUG_ON_FOLIO(!pvmw.pte, folio);

                /* Only present ptes can be converted; anything else aborts. */
                ptent = ptep_get(pvmw.pte);
                if (!pte_present(ptent)) {
                        ret = false;
                        page_vma_mapped_walk_done(&pvmw);
                        break;
                }

                /* Locate the page of the folio that this pte maps. */
                subpage = folio_page(folio,
                                pte_pfn(ptent) - folio_pfn(folio));
                address = pvmw.address;

                /* Nuke the page table entry. */
                flush_cache_page(vma, address, pte_pfn(ptent));
                pteval = ptep_clear_flush(vma, address, pvmw.pte);

                /* Set the dirty flag on the folio now the pte is gone. */
                if (pte_dirty(pteval))
                        folio_mark_dirty(folio);

                /*
                 * Check that our target page is still mapped at the expected
                 * address. Only a writable mapping at exactly (args->mm,
                 * args->address) marks the request as valid.
                 */
                if (args->mm == mm && args->address == address &&
                    pte_write(pteval))
                        args->valid = true;

                /*
                 * Store the pfn of the page in a special device-exclusive
                 * swap pte, writable or read-only to match the pte it
                 * replaces.
                 */
                if (pte_write(pteval))
                        entry = make_writable_device_exclusive_entry(
                                                        page_to_pfn(subpage));
                else
                        entry = make_readable_device_exclusive_entry(
                                                        page_to_pfn(subpage));
                swp_pte = swp_entry_to_pte(entry);
                /* Carry the soft-dirty and uffd-wp state over to the swap pte. */
                if (pte_soft_dirty(pteval))
                        swp_pte = pte_swp_mksoft_dirty(swp_pte);
                if (pte_uffd_wp(pteval))
                        swp_pte = pte_swp_mkuffd_wp(swp_pte);

                set_pte_at(mm, address, pvmw.pte, swp_pte);

                /*
                 * There is a reference on the page for the swap entry which has
                 * been removed, so shouldn't take another.
                 */
                folio_remove_rmap_pte(folio, subpage, vma);
        }

        mmu_notifier_invalidate_range_end(&range);

        return ret;
}

/**
 * folio_make_device_exclusive - Mark the folio exclusively owned by a device.
 * @folio: The folio to replace page table entries for.
 * @mm: The mm_struct where the folio is expected to be mapped.
 * @address: Address where the folio is expected to be mapped.
 * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
 *
 * Tries to remove all the page table entries which are mapping this
 * folio and replace them with special device exclusive swap entries to
 * grant a device exclusive access to the folio.
 *
 * Context: Caller must hold the folio lock.
 * Return: false if the page is still mapped, or if it could not be unmapped
 * from the expected address. Otherwise returns true (success).
 */
static bool folio_make_device_exclusive(struct folio *folio,
                struct mm_struct *mm, unsigned long address, void *owner)
{
        struct make_exclusive_args args = {
                .mm = mm,
                .address = address,
                .owner = owner,
                .valid = false,
        };
        struct rmap_walk_control rwc = {
                .rmap_one = page_make_device_exclusive_one,
                .done = folio_not_mapped,
                .anon_lock = folio_lock_anon_vma_read,
                .arg = &args,
        };

        /*
         * Restrict to anonymous folios for now to avoid potential writeback
         * issues.
         */
        if (!folio_test_anon(folio))
                return false;

        rmap_walk(folio, &rwc);

        return args.valid && !folio_mapcount(folio);
}

/**
 * make_device_exclusive_range() - Mark a range for exclusive use by a device
 * @mm: mm_struct of associated target process
 * @start: start of the region to mark for exclusive device access
 * @end: end address of region
 * @pages: returns the pages which were successfully marked for exclusive access
 * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
 *
 * Returns: number of pages found in the range by GUP, or a negative value if
 * GUP failed. A page is marked for exclusive access only if the page pointer
 * is non-NULL.
 *
 * The pages backing the range are looked up with GUP. Each one that can be
 * locked has its mappings replaced with special swap entries preventing
 * userspace CPU access. On fault these entries are replaced with the original
 * mapping after calling MMU notifiers. Pages that cannot be converted have
 * their reference dropped and their @pages slot set to NULL.
 *
 * A driver using this to program access from a device must use a mmu notifier
 * critical section to hold a device specific lock during programming. Once
 * programming is complete it should drop the page lock and reference after
 * which point CPU access to the page will revoke the exclusive access.
 */
int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
                                unsigned long end, struct page **pages,
                                void *owner)
{
        unsigned long addr = start;
        long nr_found;
        long idx;

        nr_found = get_user_pages_remote(mm, start,
                                         (end - start) >> PAGE_SHIFT,
                                         FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
                                         pages, NULL);
        if (nr_found < 0)
                return nr_found;

        for (idx = 0; idx < nr_found; idx++, addr += PAGE_SIZE) {
                struct page *page = pages[idx];
                struct folio *folio = page_folio(page);

                /* Tail pages are rejected; head pages must be lockable now. */
                if (!PageTail(page) && folio_trylock(folio)) {
                        /* Success: keep the lock and reference for the caller. */
                        if (folio_make_device_exclusive(folio, mm, addr, owner))
                                continue;
                        folio_unlock(folio);
                }

                /* Failure: drop the GUP reference and clear the slot. */
                folio_put(folio);
                pages[idx] = NULL;
        }

        return nr_found;
}
EXPORT_SYMBOL_GPL(make_device_exclusive_range);
#endif

/*
 * Free @anon_vma and, when it is not its own root, drop one reference on the
 * root, freeing the root as well once that reference count reaches zero.
 */
void __put_anon_vma(struct anon_vma *anon_vma)
{
        /* Fetch the root up front: @anon_vma is handed to anon_vma_free() next. */
        struct anon_vma *top = anon_vma->root;

        anon_vma_free(anon_vma);

        if (top == anon_vma)
                return;

        if (!atomic_dec_and_test(&top->refcount))
                return;

        anon_vma_free(top);
}

/*
 * Take the anon_vma read lock on behalf of an rmap walk over @folio.
 *
 * A walk-specific ->anon_lock callback, when present, is used instead of the
 * default scheme. Returns the locked anon_vma, or NULL when the folio has no
 * anon_vma or when the lock is contended and the walk asked for try_lock
 * (rwc->contended is set in the latter case).
 */
static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
                                            struct rmap_walk_control *rwc)
{
        struct anon_vma *av;

        if (rwc->anon_lock)
                return rwc->anon_lock(folio, rwc);

        /*
         * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
         * because that depends on page_mapped(); but not all its usages
         * are holding mmap_lock. Users without mmap_lock are required to
         * take a reference count to prevent the anon_vma disappearing
         */
        av = folio_anon_vma(folio);
        if (!av)
                return NULL;

        /* Fast path: the lock was free. */
        if (anon_vma_trylock_read(av))
                return av;

        /* Contended: either report it or block until the lock is ours. */
        if (rwc->try_lock) {
                rwc->contended = true;
                return NULL;
        }

        anon_vma_lock_read(av);
        return av;
}

/*
 * rmap_walk_anon - visit every mapping of an anonymous folio
 * @folio: the folio whose mappings are visited
 * @rwc: callbacks and options controlling the walk
 * @locked: true when the caller already holds the relevant rmap lock
 *
 * Looks up the folio's anon_vma and iterates over the VMAs in its interval
 * tree that overlap the folio's page offset range, calling rwc->rmap_one()
 * for each VMA not filtered out by rwc->invalid_vma(). The walk ends early
 * when rmap_one() returns false or rwc->done() returns true.
 */
static void rmap_walk_anon(struct folio *folio,
                struct rmap_walk_control *rwc, bool locked)
{
        struct anon_vma_chain *avc;
        struct anon_vma *anon_vma;
        pgoff_t first, last;

        if (!locked) {
                anon_vma = rmap_walk_anon_lock(folio, rwc);
        } else {
                anon_vma = folio_anon_vma(folio);
                /* anon_vma disappear under us? */
                VM_BUG_ON_FOLIO(!anon_vma, folio);
        }
        if (!anon_vma)
                return;

        first = folio_pgoff(folio);
        last = first + folio_nr_pages(folio) - 1;
        anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, first, last) {
                struct vm_area_struct *vma = avc->vma;
                unsigned long address = vma_address(vma, first,
                                folio_nr_pages(folio));

                VM_BUG_ON_VMA(address == -EFAULT, vma);
                cond_resched();

                /* Let the walker veto VMAs it is not interested in. */
                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;

                /* Stop on callback failure or once the walker is satisfied. */
                if (!rwc->rmap_one(folio, vma, address, rwc->arg) ||
                    (rwc->done && rwc->done(folio)))
                        break;
        }

        /* Only release a lock that was taken here. */
        if (!locked)
                anon_vma_unlock_read(anon_vma);
}

/*
 * rmap_walk_file - visit every mapping of a file-backed folio
 * @folio: the folio whose mappings are visited
 * @rwc: callbacks and options controlling the walk
 * @locked: true when the caller already holds the relevant rmap lock
 *
 * Iterates over the VMAs in the i_mmap interval tree of the folio's
 * address_space that overlap the folio's page offset range, calling
 * rwc->rmap_one() for each VMA not filtered out by rwc->invalid_vma().
 * The walk ends early when rmap_one() returns false or rwc->done() returns
 * true. If the lock is contended and rwc->try_lock is set, rwc->contended
 * is set and no VMA is visited.
 */
static void rmap_walk_file(struct folio *folio,
                struct rmap_walk_control *rwc, bool locked)
{
        struct address_space *mapping = folio_mapping(folio);
        struct vm_area_struct *vma;
        pgoff_t first, last;

        /*
         * Holding the page lock guarantees both that page->mapping cannot
         * be NULLified by truncation under us and that the structure at
         * mapping cannot be freed and reused yet, which is what makes
         * taking mapping->i_mmap_rwsem safe.
         */
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        if (!mapping)
                return;

        first = folio_pgoff(folio);
        last = first + folio_nr_pages(folio) - 1;

        /* Try the lock first; block only if the walker permits it. */
        if (!locked && !i_mmap_trylock_read(mapping)) {
                if (rwc->try_lock) {
                        rwc->contended = true;
                        return;
                }

                i_mmap_lock_read(mapping);
        }

        vma_interval_tree_foreach(vma, &mapping->i_mmap, first, last) {
                unsigned long address = vma_address(vma, first,
                               folio_nr_pages(folio));

                VM_BUG_ON_VMA(address == -EFAULT, vma);
                cond_resched();

                /* Let the walker veto VMAs it is not interested in. */
                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;

                /* Stop on callback failure or once the walker is satisfied. */
                if (!rwc->rmap_one(folio, vma, address, rwc->arg) ||
                    (rwc->done && rwc->done(folio)))
                        break;
        }

        /* Only release a lock that was taken here. */
        if (!locked)
                i_mmap_unlock_read(mapping);
}

/*
 * Walk all mappings of @folio, picking the KSM, anonymous or file walker
 * according to the folio type. The anon and file walkers are told that the
 * rmap lock is not yet held.
 */
void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
{
        if (unlikely(folio_test_ksm(folio))) {
                rmap_walk_ksm(folio, rwc);
                return;
        }

        if (folio_test_anon(folio)) {
                rmap_walk_anon(folio, rwc, false);
                return;
        }

        rmap_walk_file(folio, rwc, false);
}

/*
 * Variant of rmap_walk() for callers that already hold the relevant rmap
 * lock. KSM folios are not supported here.
 */
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
{
        /* no ksm support for now */
        VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);

        if (!folio_test_anon(folio)) {
                rmap_walk_file(folio, rwc, true);
                return;
        }

        rmap_walk_anon(folio, rwc, true);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * The following two functions are for anonymous (private mapped) hugepages.
 * Unlike common anonymous pages, anonymous hugepages have no accounting code
 * and no lru code, because we handle hugepages differently from common pages.
 */
/*
 * hugetlb_add_anon_rmap - account one more mapping of an anonymous hugetlb folio
 * @folio: the hugetlb folio being mapped; must be hugetlb and anonymous
 * @vma: not used by this function
 * @address: not used by this function
 * @flags: rmap flags; RMAP_EXCLUSIVE marks the folio's head page AnonExclusive
 *
 * Increments both the entire and the large mapcount of @folio. A warning is
 * raised if the folio ends up mapped more than once while still being marked
 * AnonExclusive.
 */
void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
                unsigned long address, rmap_t flags)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

        atomic_inc(&folio->_entire_mapcount);
        atomic_inc(&folio->_large_mapcount);
        if (flags & RMAP_EXCLUSIVE)
                SetPageAnonExclusive(&folio->page);
        /* An exclusive folio must not be mapped more than once. */
        VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 &&
                         PageAnonExclusive(&folio->page), folio);
}

/*
 * hugetlb_add_new_anon_rmap - set up the first anonymous mapping of a hugetlb folio
 * @folio: the hugetlb folio being mapped
 * @vma: the VMA the folio is mapped into
 * @address: the mapping address; must lie within [vma->vm_start, vma->vm_end)
 *
 * Sets both mapcounts to reflect a single mapping, clears the folio's
 * restore-reserve flag, makes the folio anonymous for @vma at @address and
 * marks its head page AnonExclusive.
 */
void hugetlb_add_new_anon_rmap(struct folio *folio,
                struct vm_area_struct *vma, unsigned long address)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);

        /* The address must belong to the VMA. */
        BUG_ON(address < vma->vm_start || address >= vma->vm_end);
        /* increment count (starts at -1) */
        atomic_set(&folio->_entire_mapcount, 0);
        atomic_set(&folio->_large_mapcount, 0);
        folio_clear_hugetlb_restore_reserve(folio);
        __folio_set_anon(folio, vma, address, true);
        SetPageAnonExclusive(&folio->page);
}
#endif /* CONFIG_HUGETLB_PAGE */

































    4 



















    3 






    3 
    5 










    3 
    4 
















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NF_CONNTRACK_ZONES_H
#define _NF_CONNTRACK_ZONES_H

#include <linux/netfilter/nf_conntrack_zones_common.h>
#include <net/netfilter/nf_conntrack.h>

/*
 * Return the zone of conntrack @ct. When CONFIG_NF_CONNTRACK_ZONES is not
 * set, @ct is not dereferenced and the default zone is returned instead.
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone(const struct nf_conn *ct)
{
        const struct nf_conntrack_zone *zone;

#ifdef CONFIG_NF_CONNTRACK_ZONES
        zone = &ct->zone;
#else
        zone = &nf_ct_zone_dflt;
#endif
        return zone;
}

/*
 * Fill in @zone with the given @id, @dir and @flags and hand the same
 * pointer back so the call can be used directly as a return value.
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone_init(struct nf_conntrack_zone *zone, u16 id, u8 dir, u8 flags)
{
        zone->dir = dir;
        zone->flags = flags;
        zone->id = id;
        return zone;
}

/*
 * Pick the zone to use for @skb given the optional template conntrack @tmpl.
 *
 * With CONFIG_NF_CONNTRACK_ZONES: no template yields the default zone; a
 * template whose zone carries NF_CT_FLAG_MARK yields @tmp, initialised with
 * skb->mark as zone id and the template's direction. In every other case
 * the result is nf_ct_zone(@tmpl).
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone_tmpl(const struct nf_conn *tmpl, const struct sk_buff *skb,
                struct nf_conntrack_zone *tmp)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
        if (tmpl == NULL)
                return &nf_ct_zone_dflt;

        /* Mark-derived zone: build it in the caller-provided scratch zone. */
        if ((tmpl->zone.flags & NF_CT_FLAG_MARK) != 0)
                return nf_ct_zone_init(tmp, skb->mark, tmpl->zone.dir, 0);
#endif
        return nf_ct_zone(tmpl);
}

/*
 * Copy *@zone into @ct. This is a no-op when CONFIG_NF_CONNTRACK_ZONES is
 * not set.
 */
static inline void nf_ct_zone_add(struct nf_conn *ct,
                                  const struct nf_conntrack_zone *zone)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
        ct->zone = *zone;
#endif
}

/*
 * Tell whether @zone applies to direction @dir, i.e. whether bit number
 * @dir is set in the zone's direction mask.
 */
static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone,
                                          enum ip_conntrack_dir dir)
{
        return (zone->dir & (1 << dir)) != 0;
}

/*
 * Return the zone id that is effective for direction @dir: the id of @zone
 * when the zone covers that direction, NF_CT_DEFAULT_ZONE_ID otherwise.
 * Without CONFIG_NF_CONNTRACK_ZONES the default id is always returned.
 */
static inline u16 nf_ct_zone_id(const struct nf_conntrack_zone *zone,
                                enum ip_conntrack_dir dir)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
        if (nf_ct_zone_matches_dir(zone, dir))
                return zone->id;
#endif
        return NF_CT_DEFAULT_ZONE_ID;
}

/*
 * Compare the zone of conntrack entry @a with zone @b for direction @dir.
 *
 * Both sides are reduced to their per-direction id via nf_ct_zone_id()
 * and the ids are compared. Always true without
 * CONFIG_NF_CONNTRACK_ZONES.
 */
static inline bool nf_ct_zone_equal(const struct nf_conn *a,
                                    const struct nf_conntrack_zone *b,
                                    enum ip_conntrack_dir dir)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
        const u16 id_a = nf_ct_zone_id(nf_ct_zone(a), dir);
        const u16 id_b = nf_ct_zone_id(b, dir);

        return id_a == id_b;
#else
        return true;
#endif
}

/*
 * Compare the raw zone id of conntrack entry @a with that of zone @b,
 * without taking any direction into account.
 *
 * Always true without CONFIG_NF_CONNTRACK_ZONES.
 */
static inline bool nf_ct_zone_equal_any(const struct nf_conn *a,
                                        const struct nf_conntrack_zone *b)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
        const struct nf_conntrack_zone *zone_a = nf_ct_zone(a);

        return zone_a->id == b->id;
#else
        return true;
#endif
}

#endif /* _NF_CONNTRACK_ZONES_H */

















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NF_CONNTRACK_SYNPROXY_H
#define _NF_CONNTRACK_SYNPROXY_H

#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netns/generic.h>

/*
 * Per-conntrack synproxy data. Instances are looked up and added through
 * nfct_synproxy() and nfct_synproxy_ext_add() under the NF_CT_EXT_SYNPROXY
 * extension id.
 *
 * NOTE(review): the field names suggest an initial sequence number (isn),
 * an initial timestamp (its) and a timestamp offset (tsoff); this header
 * does not show how they are used -- confirm against the synproxy code.
 */
struct nf_conn_synproxy {
        u32        isn;
        u32        its;
        u32        tsoff;
};

/*
 * Look up the synproxy extension (NF_CT_EXT_SYNPROXY) of conntrack @ct.
 *
 * Returns the result of nf_ct_ext_find() when synproxy support is built
 * (CONFIG_NETFILTER_SYNPROXY enabled), and NULL unconditionally otherwise.
 */
static inline struct nf_conn_synproxy *nfct_synproxy(const struct nf_conn *ct)
{
        struct nf_conn_synproxy *synproxy = NULL;

#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
        synproxy = nf_ct_ext_find(ct, NF_CT_EXT_SYNPROXY);
#endif
        return synproxy;
}

/*
 * Add the synproxy extension (NF_CT_EXT_SYNPROXY) to conntrack @ct.
 *
 * The extension is requested with GFP_ATOMIC. Returns whatever
 * nf_ct_ext_add() returns when synproxy support is built, and NULL
 * unconditionally otherwise.
 */
static inline struct nf_conn_synproxy *nfct_synproxy_ext_add(struct nf_conn *ct)
{
        struct nf_conn_synproxy *synproxy = NULL;

#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
        synproxy = nf_ct_ext_add(ct, NF_CT_EXT_SYNPROXY, GFP_ATOMIC);
#endif
        return synproxy;
}

/*
 * Give @ct the extensions synproxy needs when its template uses synproxy.
 *
 * If @tmpl is non-NULL and has a synproxy extension, a seqadj extension
 * and then a synproxy extension are added to @ct, in that order. The
 * second add is not attempted when the first one fails.
 *
 * Returns false if either add fails. Returns true when nothing had to be
 * added, when both adds succeeded, or when synproxy support is not built.
 */
static inline bool nf_ct_add_synproxy(struct nf_conn *ct,
                                      const struct nf_conn *tmpl)
{
#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
        /* No template, or template without synproxy: nothing to do. */
        if (!tmpl || !nfct_synproxy(tmpl))
                return true;

        /* Short-circuit keeps the original "stop at first failure" order. */
        return nfct_seqadj_ext_add(ct) && nfct_synproxy_ext_add(ct);
#else
        return true;
#endif
}

#endif /* _NF_CONNTRACK_SYNPROXY_H */



































































































































































    1 

























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cryptographic API.
 *
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
 */
#ifndef _CRYPTO_INTERNAL_H
#define _CRYPTO_INTERNAL_H

#include <crypto/algapi.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/jump_label.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/numa.h>
#include <linux/refcount.h>
#include <linux/rwsem.h>
#include <linux/scatterlist.h>
#include <linux/sched.h>
#include <linux/types.h>

struct akcipher_request;
struct crypto_akcipher;
struct crypto_instance;
struct crypto_template;

/*
 * A "larval" algorithm entry: a struct crypto_alg (@alg) bundled with a
 * pointer to another algorithm (@adult), a completion, a mask and a
 * test_started flag.
 *
 * crypto_is_test_larval() below inspects alg.cra_driver_name[0] of such
 * an entry.
 *
 * NOTE(review): presumably this is a placeholder that stands in for an
 * algorithm while it is being instantiated or tested, with @completion
 * signalled once @adult is available -- confirm against the crypto core.
 */
struct crypto_larval {
        struct crypto_alg alg;
        struct crypto_alg *adult;
        struct completion completion;
        u32 mask;
        bool test_started;
};

/*
 * Argument/state bundle passed to crypto_akcipher_sync_prep() and
 * crypto_akcipher_sync_post().
 *
 * The first group (tfm, src, dst, slen, dlen) names a transform plus a
 * source and destination buffer with their lengths; the second group
 * (req, cwait, sg, buf) holds a request, a wait object, a scatterlist
 * and a byte buffer.
 *
 * NOTE(review): presumably the first group is filled in by the caller and
 * the second group is working state owned by the prep/post helpers --
 * confirm against their implementation.
 */
struct crypto_akcipher_sync_data {
        struct crypto_akcipher *tfm;
        const void *src;
        void *dst;
        unsigned int slen;
        unsigned int dlen;

        struct akcipher_request *req;
        struct crypto_wait cwait;
        struct scatterlist sg;
        u8 *buf;
};

/*
 * CRYPTOA_* attribute identifiers. Values are assigned sequentially from
 * 0 (CRYPTOA_UNSPEC); __CRYPTOA_MAX is a terminator that is always one
 * past the last real identifier.
 */
enum {
        CRYPTOA_UNSPEC,
        CRYPTOA_ALG,
        CRYPTOA_TYPE,
        __CRYPTOA_MAX,
};

/* Highest valid CRYPTOA_* identifier. */
#define CRYPTOA_MAX (__CRYPTOA_MAX - 1)

/* Maximum number of (rtattr) parameters for each template. */
#define CRYPTO_MAX_ATTRS 32

/*
 * Shared objects of the crypto core. They are only declared here; the
 * definitions live outside this header. crypto_chain is the notifier
 * chain that crypto_notify() below invokes.
 *
 * NOTE(review): crypto_alg_sem presumably protects crypto_alg_list --
 * confirm against the users of both.
 */
extern struct list_head crypto_alg_list;
extern struct rw_semaphore crypto_alg_sem;
extern struct blocking_notifier_head crypto_chain;

/* Prototype only; see the implementation for the meaning of the result. */
int alg_test(const char *driver, const char *alg, u32 type, u32 mask);

/*
 * Boot-time self-test state.
 *
 * When the self-tests are compiled in, the state lives in the static key
 * __crypto_boot_test_finished (declared as a default-false key): the
 * setter enables the key and the getter reports it.
 *
 * With CONFIG_CRYPTO_MANAGER_DISABLE_TESTS the getter always returns true
 * and the setter does nothing.
 */
#ifndef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS
DECLARE_STATIC_KEY_FALSE(__crypto_boot_test_finished);

static inline bool crypto_boot_test_finished(void)
{
        return static_branch_likely(&__crypto_boot_test_finished);
}

static inline void set_crypto_boot_test_finished(void)
{
        static_branch_enable(&__crypto_boot_test_finished);
}
#else
static inline bool crypto_boot_test_finished(void)
{
        return true;
}

static inline void set_crypto_boot_test_finished(void)
{
}
#endif /* CONFIG_CRYPTO_MANAGER_DISABLE_TESTS */

/*
 * procfs init/exit hooks of the crypto core.
 *
 * With CONFIG_PROC_FS both functions are provided elsewhere and only
 * declared here (as __init / __exit). Without it they are empty inline
 * stubs, so callers need no conditional compilation of their own.
 */
#ifndef CONFIG_PROC_FS
static inline void crypto_init_proc(void)
{
}

static inline void crypto_exit_proc(void)
{
}
#else
void __init crypto_init_proc(void);
void __exit crypto_exit_proc(void);
#endif

/* Context size for a cipher transform of @alg: its cra_ctxsize field. */
static inline unsigned int crypto_cipher_ctxsize(struct crypto_alg *alg)
{
        const unsigned int ctxsize = alg->cra_ctxsize;

        return ctxsize;
}

/*
 * Context size for a compression transform of @alg: its cra_ctxsize
 * field, exactly as for crypto_cipher_ctxsize().
 */
static inline unsigned int crypto_compress_ctxsize(struct crypto_alg *alg)
{
        const unsigned int ctxsize = alg->cra_ctxsize;

        return ctxsize;
}

/*
 * Crypto core internals: only the prototypes are visible in this header,
 * so consult the implementations for exact semantics, locking rules and
 * error values.
 *
 * crypto_create_tfm_node() is the NUMA-aware form behind the
 * crypto_create_tfm() wrapper below, which passes NUMA_NO_NODE.
 */
struct crypto_alg *crypto_mod_get(struct crypto_alg *alg);
struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask);

/* Helpers operating on struct crypto_larval entries. */
struct crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask);
void crypto_larval_kill(struct crypto_alg *alg);
void crypto_wait_for_test(struct crypto_larval *larval);
void crypto_alg_tested(const char *name, int err);

void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list,
                          struct crypto_alg *nalg);
void crypto_remove_final(struct list_head *list);
void crypto_shoot_alg(struct crypto_alg *alg);
struct crypto_tfm *__crypto_alloc_tfmgfp(struct crypto_alg *alg, u32 type,
                                         u32 mask, gfp_t gfp);
struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
                                      u32 mask);
void *crypto_create_tfm_node(struct crypto_alg *alg,
                        const struct crypto_type *frontend, int node);
void *crypto_clone_tfm(const struct crypto_type *frontend,
                       struct crypto_tfm *otfm);

/* Helpers taking the struct crypto_akcipher_sync_data bundle defined above. */
int crypto_akcipher_sync_prep(struct crypto_akcipher_sync_data *data);
int crypto_akcipher_sync_post(struct crypto_akcipher_sync_data *data, int err);
int crypto_init_akcipher_ops_sig(struct crypto_tfm *tfm);

/*
 * Create a transform for @alg through @frontend without a NUMA node
 * preference: forwards to crypto_create_tfm_node() with NUMA_NO_NODE and
 * returns its result unchanged.
 */
static inline void *crypto_create_tfm(struct crypto_alg *alg,
                                      const struct crypto_type *frontend)
{
        void *tfm = crypto_create_tfm_node(alg, frontend, NUMA_NO_NODE);

        return tfm;
}

/* Prototype only; implemented outside this header. */
struct crypto_alg *crypto_find_alg(const char *alg_name,
                                   const struct crypto_type *frontend,
                                   u32 type, u32 mask);

/*
 * NUMA-aware form behind the crypto_alloc_tfm() wrapper below, which
 * passes NUMA_NO_NODE as @node. Prototype only.
 */
void *crypto_alloc_tfm_node(const char *alg_name,
                       const struct crypto_type *frontend, u32 type, u32 mask,
                       int node);

/*
 * Allocate a transform for the algorithm named @alg_name without a NUMA
 * node preference: forwards to crypto_alloc_tfm_node() with NUMA_NO_NODE
 * and returns its result unchanged.
 */
static inline void *crypto_alloc_tfm(const char *alg_name,
                                     const struct crypto_type *frontend,
                                     u32 type, u32 mask)
{
        void *tfm;

        tfm = crypto_alloc_tfm_node(alg_name, frontend, type, mask,
                                    NUMA_NO_NODE);
        return tfm;
}

/*
 * Prototypes only; implemented outside this header.
 *
 * NOTE(review): crypto_probing_notify() takes the same (val, v) pair as
 * crypto_notify() below but returns an int -- presumably a variant whose
 * outcome the caller inspects; confirm against the implementation.
 */
int crypto_probing_notify(unsigned long val, void *v);

unsigned int crypto_alg_extsize(struct crypto_alg *alg);

int crypto_type_has_alg(const char *name, const struct crypto_type *frontend,
                        u32 type, u32 mask);

/*
 * Take an additional reference on @alg (increments cra_refcnt) and
 * return @alg so the call can be chained.
 */
static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg)
{
        struct crypto_alg *held = alg;

        refcount_inc(&held->cra_refcnt);
        return held;
}

/*
 * Drop one reference on @alg (decrements cra_refcnt).
 *
 * When this was the last reference and @alg provides a cra_destroy
 * callback, the callback is invoked with @alg.
 */
static inline void crypto_alg_put(struct crypto_alg *alg)
{
        /* Other references remain: nothing more to do. */
        if (!refcount_dec_and_test(&alg->cra_refcnt))
                return;

        if (alg->cra_destroy)
                alg->cra_destroy(alg);
}

/*
 * Try to take a reference on the module that owns template @tmpl.
 * Returns the result of try_module_get() on tmpl->module.
 */
static inline int crypto_tmpl_get(struct crypto_template *tmpl)
{
        struct module *owner = tmpl->module;

        return try_module_get(owner);
}

/* Release a module reference on the owner of template @tmpl. */
static inline void crypto_tmpl_put(struct crypto_template *tmpl)
{
        struct module *owner = tmpl->module;

        module_put(owner);
}

static inline int crypto_is_larval(struct crypto_alg *alg)
{
        return alg->cra_flags & CRYPTO_ALG_LARVAL;
}

static inline int crypto_is_dead(struct crypto_alg *alg)
{
        return alg->cra_flags & CRYPTO_ALG_DEAD;
}

static inline int crypto_is_moribund(struct crypto_alg *alg)
{
        return alg->cra_flags & (CRYPTO_ALG_DEAD | CRYPTO_ALG_DYING);
}

/*
 * Run the crypto_chain blocking notifier chain with event @val and
 * payload @v. The chain's return value is deliberately discarded.
 */
static inline void crypto_notify(unsigned long val, void *v)
{
        (void)blocking_notifier_call_chain(&crypto_chain, val, v);
}

/*
 * Offer a reschedule point via cond_resched(), but only when @flags
 * contains CRYPTO_TFM_REQ_MAY_SLEEP. Otherwise this is a no-op.
 */
static inline void crypto_yield(u32 flags)
{
        if (!(flags & CRYPTO_TFM_REQ_MAY_SLEEP))
                return;

        cond_resched();
}

/*
 * A larval counts as a "test larval" when its embedded algorithm has a
 * non-empty driver name.
 *
 * Returns the first character of alg.cra_driver_name, so the result is
 * non-zero exactly when that name is not the empty string.
 */
static inline int crypto_is_test_larval(struct crypto_larval *larval)
{
        const char *driver_name = larval->alg.cra_driver_name;

        return driver_name[0];
}

/*
 * Take an additional reference on @tfm unless its refcount is already 0.
 *
 * Returns @tfm when the reference was taken, and ERR_PTR(-EOVERFLOW)
 * when refcount_inc_not_zero() refused to take it.
 */
static inline struct crypto_tfm *crypto_tfm_get(struct crypto_tfm *tfm)
{
        if (!refcount_inc_not_zero(&tfm->refcnt))
                return ERR_PTR(-EOVERFLOW);

        return tfm;
}

#endif        /* _CRYPTO_INTERNAL_H */
























































































































    2 



































































































































































































































    1 












    1 
























    1 



    1 















    1 




    1 


























































    1 


    1 












    1 
    1 



















































































































































    2 








    2 



















































    1 
    2 



    2 





















    1 




    1 


    1 


    1 










































































































































































































































































































































































































































































































































































    4 



    4 











    3 



















































    4 
    1 


    3 







   10 

    9 






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 










    2 
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/idr.h>
#include <linux/init.h>                /* init_rootfs */
#include <linux/fs_struct.h>        /* get_fs_root et.al. */
#include <linux/fsnotify.h>        /* fsnotify_vfsmount_delete */
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/task_work.h>
#include <linux/sched/task.h>
#include <uapi/linux/mount.h>
#include <linux/fs_context.h>
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/nospec.h>

#include "pnode.h"
#include "internal.h"

/* Maximum number of mounts in a mount namespace (default: 100000). */
static unsigned int sysctl_mount_max __read_mostly = 100000;

/*
 * Bucket-selection parameters for the two hash tables in this file:
 * m_hash() folds its key with m_hash_shift and masks it with m_hash_mask,
 * mp_hash() does the same with mp_hash_shift and mp_hash_mask.
 * NOTE(review): the code that assigns these values is outside this chunk.
 */
static unsigned int m_hash_mask __ro_after_init;
static unsigned int m_hash_shift __ro_after_init;
static unsigned int mp_hash_mask __ro_after_init;
static unsigned int mp_hash_shift __ro_after_init;

/*
 * Value of the "mhash_entries=" boot parameter, parsed by
 * set_mhash_entries() with base 0.
 * NOTE(review): the consumer of this value is outside this chunk.
 */
static __initdata unsigned long mhash_entries;

/*
 * Handler for "mhash_entries=": returns 1 after storing the parsed number,
 * or 0 when no string was supplied.
 */
static int __init set_mhash_entries(char *str)
{
        if (str) {
                mhash_entries = simple_strtoul(str, &str, 0);
                return 1;
        }
        return 0;
}
__setup("mhash_entries=", set_mhash_entries);

/*
 * Value of the "mphash_entries=" boot parameter, parsed by
 * set_mphash_entries() with base 0.
 * NOTE(review): the consumer of this value is outside this chunk.
 */
static __initdata unsigned long mphash_entries;

/*
 * Handler for "mphash_entries=": returns 1 after storing the parsed number,
 * or 0 when no string was supplied.
 */
static int __init set_mphash_entries(char *str)
{
        if (str) {
                mphash_entries = simple_strtoul(str, &str, 0);
                return 1;
        }
        return 0;
}
__setup("mphash_entries=", set_mphash_entries);

/* NOTE(review): users of this counter are outside this chunk - confirm its role. */
static u64 event;
/* Allocator for mnt_id values; see mnt_alloc_id() and mnt_free_id(). */
static DEFINE_IDA(mnt_id_ida);
/* Allocator for peer group IDs; see mnt_alloc_group_id() and mnt_release_group_id(). */
static DEFINE_IDA(mnt_group_ida);

/*
 * Don't allow confusion with old 32bit mount ID: the counter that feeds
 * mnt_id_unique in mnt_alloc_id() starts at 1 << 32.
 */
static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);

/* Bucket array indexed by m_hash(). */
static struct hlist_head *mount_hashtable __ro_after_init;
/* Bucket array indexed by mp_hash(). */
static struct hlist_head *mountpoint_hashtable __ro_after_init;
/* Slab cache from which alloc_vfsmnt() allocates struct mount. */
static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted);        /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */

/*
 * In-kernel description of a mount attribute change.
 * NOTE(review): the code that fills in and consumes this structure is
 * outside this chunk; going by the field names, attr_set/attr_clr look
 * like flag masks to set and clear, and recurse selects whole-subtree
 * application - confirm against the users.
 */
struct mount_kattr {
        unsigned int attr_set;
        unsigned int attr_clr;
        unsigned int propagation;
        unsigned int lookup_flags;
        bool recurse;
        struct user_namespace *mnt_userns;
        struct mnt_idmap *mnt_idmap;
};

/*
 * kobject for /sys/fs.  Marked __ro_after_init and exported to GPL modules.
 */
struct kobject *fs_kobj __ro_after_init;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, i.e. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 *
 * The write side is wrapped by lock_mount_hash()/unlock_mount_hash().
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

/* Take mount_lock for writing (write side of the seqlock). */
static inline void lock_mount_hash(void)
{
        write_seqlock(&mount_lock);
}

/* Release the write side of mount_lock taken by lock_mount_hash(). */
static inline void unlock_mount_hash(void)
{
        write_sequnlock(&mount_lock);
}

/*
 * Select the mount_hashtable bucket for the (parent mount, mountpoint
 * dentry) pair.  Both pointers are scaled down by L1_CACHE_BYTES, summed,
 * folded with m_hash_shift and masked with m_hash_mask.
 */
static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
        unsigned long key;

        key = (unsigned long)mnt / L1_CACHE_BYTES +
              (unsigned long)dentry / L1_CACHE_BYTES;
        key += key >> m_hash_shift;
        return mount_hashtable + (key & m_hash_mask);
}

/*
 * Select the mountpoint_hashtable bucket for @dentry.  The pointer is
 * scaled down by L1_CACHE_BYTES, folded with mp_hash_shift and masked
 * with mp_hash_mask.
 */
static inline struct hlist_head *mp_hash(struct dentry *dentry)
{
        unsigned long key = (unsigned long)dentry / L1_CACHE_BYTES;

        key += key >> mp_hash_shift;
        return mountpoint_hashtable + (key & mp_hash_mask);
}

/*
 * Assign identifiers to a new mount: mnt_id comes from mnt_id_ida and
 * mnt_id_unique from the next value of mnt_id_ctr.
 *
 * Returns 0 on success, or the negative value reported by ida_alloc(), in
 * which case @mnt is left unmodified.
 */
static int mnt_alloc_id(struct mount *mnt)
{
        int id = ida_alloc(&mnt_id_ida, GFP_KERNEL);

        if (id >= 0) {
                mnt->mnt_id = id;
                mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr);
                return 0;
        }
        return id;
}

/*
 * Hand @mnt's mnt_id back to mnt_id_ida.  Nothing is done here for
 * mnt_id_unique.
 */
static void mnt_free_id(struct mount *mnt)
{
        ida_free(&mnt_id_ida, mnt->mnt_id);
}

/*
 * Allocate a new peer group ID for @mnt.
 *
 * IDs are taken from mnt_group_ida starting at 1.  Returns 0 on success or
 * the negative value reported by ida_alloc_min(), leaving mnt_group_id
 * untouched in that case.
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
        int id = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);

        if (id >= 0) {
                mnt->mnt_group_id = id;
                return 0;
        }
        return id;
}

/*
 * Release a peer group ID: give @mnt's ID back to mnt_group_ida and reset
 * mnt_group_id to 0.
 */
void mnt_release_group_id(struct mount *mnt)
{
        const int id = mnt->mnt_group_id;

        ida_free(&mnt_group_ida, id);
        mnt->mnt_group_id = 0;
}

/*
 * Adjust the reference count of @mnt by @n; @n may be negative (callers in
 * this file pass both 1 and -1).
 *
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
        /* SMP: the count is per CPU; only the current CPU's slot changes. */
        this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
        /* !SMP: a single counter, updated with preemption disabled. */
        preempt_disable();
        mnt->mnt_count += n;
        preempt_enable();
#endif
}

/*
 * Return the current reference count of @mnt.  On SMP this is the sum of
 * the per-CPU counters over all possible CPUs.
 *
 * vfsmount lock must be held for write
 */
int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
        int total = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                total += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;

        return total;
#else
        return mnt->mnt_count;
#endif
}

/*
 * Allocate and initialise a new struct mount.
 * @name: device name to duplicate into mnt_devname, or NULL for none
 *
 * On success the mount has its IDs assigned, a reference count of one, all
 * list and hlist linkage initialised, and mnt_idmap pointing at
 * nop_mnt_idmap.  Returns NULL if any allocation fails; whatever was
 * obtained before the failure is released again.
 */
static struct mount *alloc_vfsmnt(const char *name)
{
        struct mount *mnt;

        mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
        if (!mnt)
                return NULL;

        if (mnt_alloc_id(mnt))
                goto out_free_cache;

        if (name) {
                mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL_ACCOUNT);
                if (!mnt->mnt_devname)
                        goto out_free_id;
        }

#ifdef CONFIG_SMP
        mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
        if (!mnt->mnt_pcp)
                goto out_free_devname;

        /* The creator holds the initial reference. */
        this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
        mnt->mnt_count = 1;
        mnt->mnt_writers = 0;
#endif

        INIT_HLIST_NODE(&mnt->mnt_hash);
        INIT_LIST_HEAD(&mnt->mnt_child);
        INIT_LIST_HEAD(&mnt->mnt_mounts);
        INIT_LIST_HEAD(&mnt->mnt_list);
        INIT_LIST_HEAD(&mnt->mnt_expire);
        INIT_LIST_HEAD(&mnt->mnt_share);
        INIT_LIST_HEAD(&mnt->mnt_slave_list);
        INIT_LIST_HEAD(&mnt->mnt_slave);
        INIT_HLIST_NODE(&mnt->mnt_mp_list);
        INIT_LIST_HEAD(&mnt->mnt_umounting);
        INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
        mnt->mnt.mnt_idmap = &nop_mnt_idmap;

        return mnt;

        /* Error unwinding, in reverse order of acquisition. */
#ifdef CONFIG_SMP
out_free_devname:
        kfree_const(mnt->mnt_devname);
#endif
out_free_id:
        mnt_free_id(mnt);
out_free_cache:
        kmem_cache_free(mnt_cache, mnt);
        return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly ouside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*.  This can not and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
bool __mnt_is_readonly(struct vfsmount *mnt)
{
        return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

/*
 * Increment the writer count of @mnt: the current CPU's per-CPU counter on
 * SMP, the plain mnt_writers field otherwise.  The caller in this file,
 * mnt_get_write_access(), invokes it with preemption disabled.
 */
static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
        this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
        mnt->mnt_writers++;
#endif
}

/*
 * Decrement the writer count of @mnt: the current CPU's per-CPU counter on
 * SMP, the plain mnt_writers field otherwise.  The decrement may land on a
 * different CPU than the matching increment; mnt_get_writers() sums all
 * CPUs, so the total still balances.
 */
static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
        this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
        mnt->mnt_writers--;
#endif
}

/*
 * Return the number of writers currently accounted on @mnt.  On SMP this
 * is the sum of the per-CPU mnt_writers counters over all possible CPUs.
 */
static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
        unsigned int total = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                total += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;

        return total;
#else
        return mnt->mnt_writers;
#endif
}

/*
 * Read-only check used on the write-access path.
 *
 * Returns 1 while the superblock has s_readonly_remount set; otherwise
 * returns the result of __mnt_is_readonly() evaluated after a read barrier.
 */
static int mnt_is_readonly(struct vfsmount *mnt)
{
        if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
                return 1;
        /*
         * The barrier pairs with the barrier in sb_start_ro_state_change()
         * making sure if we don't see s_readonly_remount set yet, we also will
         * not see any superblock / mount flag changes done by remount.
         * It also pairs with the barrier in sb_end_ro_state_change()
         * assuring that if we see s_readonly_remount already cleared, we will
         * see the values of superblock / mount flags updated by remount.
         */
        smp_rmb();
        return __mnt_is_readonly(mnt);
}

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink().  We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * mnt_get_write_access - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write) before
 * returning success. This operation does not protect against filesystem being
 * frozen. When the write operation is finished, mnt_put_write_access() must be
 * called. This is effectively a refcount.
 *
 * Return: 0 on success, -EROFS if the mount is read-only (the writer count
 * taken here is dropped again in that case).
 */
int mnt_get_write_access(struct vfsmount *m)
{
        struct mount *mnt = real_mount(m);
        int ret = 0;

        preempt_disable();
        mnt_inc_writers(mnt);
        /*
         * The store to mnt_inc_writers must be visible before we pass
         * MNT_WRITE_HOLD loop below, so that the slowpath can see our
         * incremented count after it has set MNT_WRITE_HOLD.
         */
        smp_mb();
        might_lock(&mount_lock.lock);
        /* Wait until whoever set MNT_WRITE_HOLD has cleared it again. */
        while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
                if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
                        cpu_relax();
                } else {
                        /*
                         * This prevents priority inversion, if the task
                         * setting MNT_WRITE_HOLD got preempted on a remote
                         * CPU, and it prevents livelock if the task setting
                         * MNT_WRITE_HOLD has a lower priority and is bound to
                         * the same CPU as the task that is spinning here.
                         */
                        preempt_enable();
                        lock_mount_hash();
                        unlock_mount_hash();
                        preempt_disable();
                }
        }
        /*
         * The barrier pairs with the barrier sb_start_ro_state_change() making
         * sure that if we see MNT_WRITE_HOLD cleared, we will also see
         * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
         * mnt_is_readonly() and bail in case we are racing with remount
         * read-only.
         */
        smp_rmb();
        if (mnt_is_readonly(m)) {
                /* Lost the race with a read-only transition: undo our count. */
                mnt_dec_writers(mnt);
                ret = -EROFS;
        }
        preempt_enable();

        return ret;
}
EXPORT_SYMBOL_GPL(mnt_get_write_access);

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * Announces an upcoming write to the low-level filesystem and verifies that
 * writing is permitted (mount is read-write, filesystem is not frozen).
 * Every successful call must be balanced by mnt_drop_write() once the write
 * has finished.  This is effectively a refcount.
 *
 * Return: 0 on success, otherwise the error from mnt_get_write_access(); on
 * error the superblock write section opened here is closed again.
 */
int mnt_want_write(struct vfsmount *m)
{
        int err;

        sb_start_write(m->mnt_sb);
        err = mnt_get_write_access(m);
        if (!err)
                return 0;

        sb_end_write(m->mnt_sb);
        return err;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

/**
 * mnt_get_write_access_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * Like mnt_get_write_access(), except that a @file already opened for
 * writing (FMODE_WRITER) does not bump mnt_writers again, since the open
 * file already holds a reference; only the check for emergency r/o remounts
 * is performed.  Must be paired with mnt_put_write_access_file().
 *
 * Return: 0 on success, -EROFS if the mount is read-only.
 */
int mnt_get_write_access_file(struct file *file)
{
        if (!(file->f_mode & FMODE_WRITER))
                return mnt_get_write_access(file->f_path.mnt);

        /*
         * Superblock may have become readonly while there are still
         * writable fd's, e.g. due to a fs error with errors=remount-ro
         */
        return __mnt_is_readonly(file->f_path.mnt) ? -EROFS : 0;
}

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * Like mnt_want_write(), except that a @file already opened for writing
 * does not bump mnt_writers again (the open file already holds a
 * reference); only freeze protection and the check for emergency r/o
 * remounts are performed.  Must be paired with mnt_drop_write_file().
 *
 * Return: 0 on success, otherwise the error from
 * mnt_get_write_access_file(); on error the superblock write section opened
 * here is closed again.
 */
int mnt_want_write_file(struct file *file)
{
        struct super_block *sb = file_inode(file)->i_sb;
        int err;

        sb_start_write(sb);
        err = mnt_get_write_access_file(file);
        if (!err)
                return 0;

        sb_end_write(sb);
        return err;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);

/**
 * mnt_put_write_access - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Signals to the low-level filesystem that the write is complete by
 * dropping one writer count, with preemption disabled around the decrement.
 * Must be matched with an earlier mnt_get_write_access() call.
 */
void mnt_put_write_access(struct vfsmount *mnt)
{
        struct mount *m = real_mount(mnt);

        preempt_disable();
        mnt_dec_writers(m);
        preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_put_write_access);

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Signals to the low-level filesystem that the write is complete and lets
 * the filesystem be frozen again: the writer count is dropped first, then
 * the superblock write section is ended.  Must be matched with an earlier
 * mnt_want_write() call.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
        struct super_block *sb = mnt->mnt_sb;

        mnt_put_write_access(mnt);
        sb_end_write(sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

/*
 * Counterpart of mnt_get_write_access_file(): a file opened for writing
 * (FMODE_WRITER) took no extra writer count there, so nothing is dropped
 * for it here; any other file gives its write access back.
 */
void mnt_put_write_access_file(struct file *file)
{
        if (file->f_mode & FMODE_WRITER)
                return;

        mnt_put_write_access(file->f_path.mnt);
}

/*
 * Counterpart of mnt_want_write_file(): gives up the write access via
 * mnt_put_write_access_file() and then ends the write section on the
 * superblock of the file's inode.
 */
void mnt_drop_write_file(struct file *file)
{
        mnt_put_write_access_file(file);
        sb_end_write(file_inode(file)->i_sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);

/**
 * mnt_hold_writers - prevent write access to the given mount
 * @mnt: mnt to prevent write access to
 *
 * Prevents write access to @mnt if there are no active writers for @mnt.
 * This function needs to be called and return successfully before changing
 * properties of @mnt that need to remain stable for callers with write access
 * to @mnt.
 *
 * After this function has been called successfully callers must pair it with
 * a call to mnt_unhold_writers() in order to stop preventing write access to
 * @mnt.
 *
 * Note that MNT_WRITE_HOLD is set unconditionally, so it remains set even
 * when -EBUSY is returned; callers in this file clear it on both paths.
 *
 * Context: This function expects lock_mount_hash() to be held serializing
 *          setting MNT_WRITE_HOLD.
 * Return: On success 0 is returned.
 *           On error, -EBUSY is returned.
 */
static inline int mnt_hold_writers(struct mount *mnt)
{
        mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
        /*
         * After storing MNT_WRITE_HOLD, we'll read the counters. This store
         * should be visible before we do.
         */
        smp_mb();

        /*
         * With writers on hold, if this value is zero, then there are
         * definitely no active writers (although held writers may subsequently
         * increment the count, they'll have to wait, and decrement it after
         * seeing MNT_READONLY).
         *
         * It is OK to have counter incremented on one CPU and decremented on
         * another: the sum will add up correctly. The danger would be when we
         * sum up each counter, if we read a counter before it is incremented,
         * but then read another CPU's count which it has been subsequently
         * decremented from -- we would see more decrements than we should.
         * MNT_WRITE_HOLD protects against this scenario, because
         * mnt_want_write first increments count, then smp_mb, then spins on
         * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
         * we're counting up here.
         */
        if (mnt_get_writers(mnt) > 0)
                return -EBUSY;

        return 0;
}

/**
 * mnt_unhold_writers - stop preventing write access to the given mount
 * @mnt: mnt to stop preventing write access to
 *
 * Stop preventing write access to @mnt allowing callers to gain write access
 * to @mnt again.
 *
 * This function can only be called after a successful call to
 * mnt_hold_writers().
 *
 * Context: This function expects lock_mount_hash() to be held.
 */
static inline void mnt_unhold_writers(struct mount *mnt)
{
        /*
         * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
         * that become unheld will see MNT_READONLY.
         */
        smp_wmb();
        /* Writers spinning in mnt_get_write_access() may proceed from here. */
        mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
}

/*
 * Try to flip @mnt to read-only.
 *
 * Writers are put on hold first; MNT_READONLY is set only if no writer is
 * active.  The hold is lifted again on both paths.  Returns 0 on success or
 * the error from mnt_hold_writers() (-EBUSY while writers are active).
 */
static int mnt_make_readonly(struct mount *mnt)
{
        int err = mnt_hold_writers(mnt);

        if (err == 0)
                mnt->mnt.mnt_flags |= MNT_READONLY;

        mnt_unhold_writers(mnt);
        return err;
}

/*
 * Prepare superblock @sb for a read-only remount.
 *
 * Under mount_lock, writers are put on hold on every mount of @sb that is
 * not already MNT_READONLY.  If none of them has active writers and
 * s_remove_count is still zero, sb_start_ro_state_change() is called.
 * MNT_WRITE_HOLD is cleared again on all mounts before the lock is dropped,
 * on success and on failure alike.
 *
 * Returns 0 on success, or -EBUSY if s_remove_count is non-zero or any of
 * the mounts has active writers.
 */
int sb_prepare_remount_readonly(struct super_block *sb)
{
        struct mount *mnt;
        int err = 0;

        /* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
        if (atomic_long_read(&sb->s_remove_count))
                return -EBUSY;

        lock_mount_hash();
        list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
                if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
                        err = mnt_hold_writers(mnt);
                        if (err)
                                break;
                }
        }
        /* The recheck announced above, now with writers held off. */
        if (!err && atomic_long_read(&sb->s_remove_count))
                err = -EBUSY;

        if (!err)
                sb_start_ro_state_change(sb);
        /* Release every hold taken above, whether or not we succeeded. */
        list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
                if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
                        mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
        }
        unlock_mount_hash();

        return err;
}

/*
 * Release the memory and references owned by @mnt: drop the idmap
 * reference, free the device name, free the per-CPU counters (SMP only)
 * and return the structure to mnt_cache.  The mount ID is not freed here.
 */
static void free_vfsmnt(struct mount *mnt)
{
        mnt_idmap_put(mnt_idmap(&mnt->mnt));
        kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
        free_percpu(mnt->mnt_pcp);
#endif
        kmem_cache_free(mnt_cache, mnt);
}

/*
 * rcu_head-based wrapper around free_vfsmnt(): recovers the mount that
 * embeds @head as its mnt_rcu member and frees it.
 */
static void delayed_free_vfsmnt(struct rcu_head *head)
{
        struct mount *mnt = container_of(head, struct mount, mnt_rcu);

        free_vfsmnt(mnt);
}

/*
 * Try to turn a mount pointer obtained under RCU into a real reference.
 * @bastard: candidate mount, may be NULL
 * @seq: mount_lock sequence sampled before @bastard was looked up
 *
 * Returns:
 *    0 - mount_lock did not change; if @bastard is non-NULL a reference
 *        has been taken on it.
 *    1 - failure, no reference is held (either the sequence had already
 *        changed, or the reference taken here was dropped again because
 *        the mount is MNT_SYNC_UMOUNT or MNT_DOOMED).
 *   -1 - failure, but the reference taken here is still held and the
 *        caller has to drop it with mntput().
 *
 * call under rcu_read_lock
 */
int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
        struct mount *mnt;
        if (read_seqretry(&mount_lock, seq))
                return 1;
        if (bastard == NULL)
                return 0;
        mnt = real_mount(bastard);
        mnt_add_count(mnt, 1);
        smp_mb();                        // see mntput_no_expire()
        if (likely(!read_seqretry(&mount_lock, seq)))
                return 0;
        if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
                mnt_add_count(mnt, -1);
                return 1;
        }
        /* Recheck MNT_DOOMED with mount_lock held for writing. */
        lock_mount_hash();
        if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
                mnt_add_count(mnt, -1);
                unlock_mount_hash();
                return 1;
        }
        unlock_mount_hash();
        /* caller will mntput() */
        return -1;
}

/*
 * Boolean wrapper around __legitimize_mnt().
 *
 * Returns true when __legitimize_mnt() reports success.  When it reports
 * that a reference is still held (negative result), that reference is
 * dropped with mntput() outside the RCU read-side section, which is then
 * re-entered before returning false.
 *
 * call under rcu_read_lock
 */
static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
        int status = __legitimize_mnt(bastard, seq);

        if (unlikely(status)) {
                if (unlikely(status < 0)) {
                        rcu_read_unlock();
                        mntput(bastard);
                        rcu_read_lock();
                }
                return false;
        }
        return true;
}

/**
 * __lookup_mnt - find first child mount
 * @mnt:        parent mount
 * @dentry:        mountpoint
 *
 * If @mnt has a child mount @c mounted @dentry find and return it.
 *
 * Note that the child mount @c need not be unique. There are cases
 * where shadow mounts are created. For example, during mount
 * propagation when a source mount @mnt whose root got overmounted by a
 * mount @o after path lookup but before @namespace_sem could be
 * acquired gets copied and propagated. So @mnt gets copied including
 * @o. When @mnt is propagated to a destination mount @d that already
 * has another mount @n mounted at the same mountpoint then the source
 * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on
 * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt
 * on @dentry.
 *
 * Return: The first child of @mnt mounted @dentry or NULL.
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
        struct hlist_head *head = m_hash(mnt, dentry);
        struct mount *p;

        hlist_for_each_entry_rcu(p, head, mnt_hash)
                if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
                        return p;
        return NULL;
}

/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically.  If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(const struct path *path)
{
        struct vfsmount *found;
        unsigned seq;

        rcu_read_lock();
        /* Repeat the lookup until legitimize_mnt() accepts the result. */
        for (;;) {
                struct mount *child;

                seq = read_seqbegin(&mount_lock);
                child = __lookup_mnt(path->mnt, path->dentry);
                found = child ? &child->mnt : NULL;
                if (legitimize_mnt(found, seq))
                        break;
        }
        rcu_read_unlock();
        return found;
}

/*
 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 *                         current mount namespace.
 *
 * The common case is dentries are not mountpoints at all and that
 * test is handled inline.  For the slow case when we are actually
 * dealing with a mountpoint of some kind, walk through all of the
 * mounts in the current mount namespace and test to see if the dentry
 * is a mountpoint.
 *
 * The mount_hashtable is not usable in this context because we
 * need to identify all mounts that may be in the current mount
 * namespace not just a mount that happens to have some specified
 * parent mount.
 */
bool __is_local_mountpoint(struct dentry *dentry)
{
        struct mnt_namespace *ns = current->nsproxy->mnt_ns;
        struct mount *mnt, *n;
        bool found = false;

        down_read(&namespace_sem);
        /* Stop at the first mount in this namespace that sits on @dentry. */
        rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
                if (mnt->mnt_mountpoint == dentry) {
                        found = true;
                        break;
                }
        }
        up_read(&namespace_sem);

        return found;
}

/*
 * Search the mountpoint hash chain for @dentry.  On a hit the entry's
 * m_count is incremented before it is returned; NULL is returned when no
 * entry matches.
 */
static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
        struct hlist_head *bucket = mp_hash(dentry);
        struct mountpoint *entry;

        hlist_for_each_entry(entry, bucket, m_hash) {
                if (entry->m_dentry != dentry)
                        continue;
                entry->m_count++;
                return entry;
        }
        return NULL;
}

/*
 * Find or create the mountpoint entry for @dentry.
 *
 * Returns the entry found by lookup_mountpoint() (which bumps its m_count)
 * if one exists, otherwise a freshly allocated entry with m_count set to 1
 * that has been added to the mountpoint hash.  Returns ERR_PTR(-ENOENT) if
 * @dentry is already a mountpoint but is unlinked, ERR_PTR(-ENOMEM) if the
 * allocation fails, or ERR_PTR() of any other error d_set_mounted() returns.
 */
static struct mountpoint *get_mountpoint(struct dentry *dentry)
{
        struct mountpoint *mp, *new = NULL;
        int ret;

        if (d_mountpoint(dentry)) {
                /* might be worth a WARN_ON() */
                if (d_unlinked(dentry))
                        return ERR_PTR(-ENOENT);
mountpoint:
                /* Also entered via goto when d_set_mounted() reports -EBUSY. */
                read_seqlock_excl(&mount_lock);
                mp = lookup_mountpoint(dentry);
                read_sequnlock_excl(&mount_lock);
                if (mp)
                        goto done;
        }

        /* @new may already be allocated if we came back through the retry path */
        if (!new)
                new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
        if (!new)
                return ERR_PTR(-ENOMEM);


        /* Exactly one process may set d_mounted */
        ret = d_set_mounted(dentry);

        /* Someone else set d_mounted? */
        if (ret == -EBUSY)
                goto mountpoint;

        /* The dentry is not available as a mountpoint? */
        mp = ERR_PTR(ret);
        if (ret)
                goto done;

        /* Add the new mountpoint to the hash table */
        read_seqlock_excl(&mount_lock);
        new->m_dentry = dget(dentry);
        new->m_count = 1;
        hlist_add_head(&new->m_hash, mp_hash(dentry));
        INIT_HLIST_HEAD(&new->m_list);
        read_sequnlock_excl(&mount_lock);

        mp = new;
        /* Ownership moved to the hash; make the kfree() below a no-op. */
        new = NULL;
done:
        /* Frees the spare allocation when an existing entry or an error won. */
        kfree(new);
        return mp;
}

/*
 * vfsmount lock must be held.  Additionally, the caller is responsible
 * for serializing calls for given disposal list.
 */
static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
{
        struct dentry *dentry;

        /* Nothing more to do while other references remain. */
        if (--mp->m_count)
                return;

        dentry = mp->m_dentry;
        BUG_ON(!hlist_empty(&mp->m_list));

        /* Last user gone: clear the mounted flag under d_lock. */
        spin_lock(&dentry->d_lock);
        dentry->d_flags &= ~DCACHE_MOUNTED;
        spin_unlock(&dentry->d_lock);

        /* Pass the dentry to dput_to_list() with the caller's disposal list. */
        dput_to_list(dentry, list);
        hlist_del(&mp->m_hash);
        kfree(mp);
}

/* called with namespace_lock and vfsmount lock */
static void put_mountpoint(struct mountpoint *mp)
{
        /*
         * Drop one reference to @mp, using the global ex_mountpoints list
         * as the disposal list handed to __put_mountpoint().
         */
        __put_mountpoint(mp, &ex_mountpoints);
}

/*
 * Return non-zero if @mnt belongs to the mount namespace of the current
 * task, zero otherwise.
 */
static inline int check_mnt(struct mount *mnt)
{
        return mnt->mnt_ns == current->nsproxy->mnt_ns;
}

/*
 * vfsmount lock must be held for write
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
        if (!ns)
                return;

        /* Advance the global event counter, record it in @ns, wake pollers. */
        ns->event = ++event;
        wake_up_interruptible(&ns->poll);
}

/*
 * vfsmount lock must be held for write
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
        /* Skip when there is no namespace or it is already up to date. */
        if (!ns || ns->event == event)
                return;

        /* Catch up with the global counter (without advancing it), wake pollers. */
        ns->event = event;
        wake_up_interruptible(&ns->poll);
}

/*
 * vfsmount lock must be held for write
 */
static struct mountpoint *unhash_mnt(struct mount *mnt)
{
        struct mountpoint *old_mp;

        /* Make @mnt its own parent, mounted on its own root. */
        mnt->mnt_parent = mnt;
        mnt->mnt_mountpoint = mnt->mnt.mnt_root;

        /* Unlink from the child list, the mount hash and the mountpoint's list. */
        list_del_init(&mnt->mnt_child);
        hlist_del_init_rcu(&mnt->mnt_hash);
        hlist_del_init(&mnt->mnt_mp_list);

        /* Detach the mountpoint and return it; its reference is not dropped here. */
        old_mp = mnt->mnt_mp;
        mnt->mnt_mp = NULL;
        return old_mp;
}

/*
 * vfsmount lock must be held for write
 */
static void umount_mnt(struct mount *mnt)
{
        /* Unhash @mnt, then drop the mountpoint reference unhash_mnt() returned. */
        put_mountpoint(unhash_mnt(mnt));
}

/*
 * vfsmount lock must be held for write
 */
void mnt_set_mountpoint(struct mount *mnt,
                        struct mountpoint *mp,
                        struct mount *child_mnt)
{
        /* @child_mnt takes a reference on both the mountpoint and the parent. */
        mp->m_count++;
        mnt_add_count(mnt, 1);        /* essentially, that's mntget */
        /* Record where @child_mnt sits: on @mp's dentry, with @mnt as parent. */
        child_mnt->mnt_mountpoint = mp->m_dentry;
        child_mnt->mnt_parent = mnt;
        child_mnt->mnt_mp = mp;
        /* Add @child_mnt to the mountpoint's list of mounts. */
        hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}

/**
 * mnt_set_mountpoint_beneath - mount a mount beneath another one
 *
 * @new_parent: the source mount
 * @top_mnt:    the mount beneath which @new_parent is mounted
 * @new_mp:     the new mountpoint of @top_mnt on @new_parent
 *
 * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and
 * parent @top_mnt->mnt_parent and mount it on top of @new_parent at
 * @new_mp. And mount @new_parent on the old parent and old
 * mountpoint of @top_mnt.
 *
 * Context: This function expects namespace_lock() and lock_mount_hash()
 *          to have been acquired in that order.
 */
static void mnt_set_mountpoint_beneath(struct mount *new_parent,
                                       struct mount *top_mnt,
                                       struct mountpoint *new_mp)
{
        /* Capture @top_mnt's current location before it is moved below. */
        struct mount *old_top_parent = top_mnt->mnt_parent;
        struct mountpoint *old_top_mp = top_mnt->mnt_mp;

        /* First put @new_parent where @top_mnt currently sits ... */
        mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent);
        /* ... then move @top_mnt onto @new_mp of @new_parent. */
        mnt_change_mountpoint(new_parent, new_mp, top_mnt);
}


/*
 * Make @mnt reachable: add it to the mount hash chain selected by @parent
 * and @mnt's mountpoint, and append it to @parent's list of children.
 */
static void __attach_mnt(struct mount *mnt, struct mount *parent)
{
        struct hlist_head *bucket = m_hash(&parent->mnt, mnt->mnt_mountpoint);

        hlist_add_head_rcu(&mnt->mnt_hash, bucket);
        list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}

/**
 * attach_mnt - mount a mount, attach to @mount_hashtable and parent's
 *              list of child mounts
 * @parent:  the parent
 * @mnt:     the new mount
 * @mp:      the new mountpoint
 * @beneath: whether to mount @mnt beneath or on top of @parent
 *
 * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt
 * to @parent's child mount list and to @mount_hashtable.
 *
 * If @beneath is true, remove @mnt from its current parent and
 * mountpoint and mount it on @mp on @parent, and mount @parent on the
 * old parent and old mountpoint of @mnt. Finally, attach @parent to
 * @mnt_hashtable and @parent->mnt_parent->mnt_mounts.
 *
 * Note, when __attach_mnt() is called @mnt->mnt_parent already points
 * to the correct parent.
 *
 * Context: This function expects namespace_lock() and lock_mount_hash()
 *          to have been acquired in that order.
 */
static void attach_mnt(struct mount *mnt, struct mount *parent,
                       struct mountpoint *mp, bool beneath)
{
        if (!beneath)
                mnt_set_mountpoint(parent, mp, mnt);
        else
                mnt_set_mountpoint_beneath(mnt, parent, mp);

        /*
         * Attach using @mnt->mnt_parent rather than @parent: in the
         * beneath case @mnt ends up on @parent's old parent, so the two
         * are not the same mount.
         */
        __attach_mnt(mnt, mnt->mnt_parent);
}

/*
 * Move @mnt from its current parent and mountpoint to @mp on @parent.
 *
 * @mnt is unlinked from its old child list, mountpoint list and hash chain,
 * attached at the new location with attach_mnt() (not beneath), and then the
 * references it held on the old mountpoint and old parent are dropped.
 */
void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
        /* Remember the old location; attach_mnt() below overwrites both fields. */
        struct mountpoint *old_mp = mnt->mnt_mp;
        struct mount *old_parent = mnt->mnt_parent;

        list_del_init(&mnt->mnt_child);
        hlist_del_init(&mnt->mnt_mp_list);
        hlist_del_init_rcu(&mnt->mnt_hash);

        attach_mnt(mnt, parent, mp, false);

        /* Drop the old references only after the new ones have been taken. */
        put_mountpoint(old_mp);
        mnt_add_count(old_parent, -1);
}

/*
 * Convert an rb_node embedded in a struct mount (its mnt_node member) back
 * to the containing mount.  A NULL @node yields NULL.
 */
static inline struct mount *node_to_mount(struct rb_node *node)
{
        return node ? rb_entry(node, struct mount, mnt_node) : NULL;
}

/*
 * Insert @mnt into @ns's rbtree of mounts, ordered by mnt_id_unique, set
 * @mnt->mnt_ns to @ns and flag the mount MNT_ONRB.  Warns if the mount is
 * already flagged MNT_ONRB on entry.
 */
static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
{
        struct rb_node **slot = &ns->mounts.rb_node;
        struct rb_node *above = NULL;

        WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB);
        mnt->mnt_ns = ns;

        /* Descend to the empty slot: smaller ids go left, everything else right. */
        while (*slot) {
                above = *slot;
                slot = (mnt->mnt_id_unique < node_to_mount(above)->mnt_id_unique) ?
                        &above->rb_left : &above->rb_right;
        }

        rb_link_node(&mnt->mnt_node, above, slot);
        rb_insert_color(&mnt->mnt_node, &ns->mounts);
        mnt->mnt.mnt_flags |= MNT_ONRB;
}

/*
 * vfsmount lock must be held for write
 */
static void commit_tree(struct mount *mnt)
{
        struct mount *parent = mnt->mnt_parent;
        struct mount *m;
        LIST_HEAD(head);
        /* The new mounts join the namespace of the mount they are attached to. */
        struct mnt_namespace *n = parent->mnt_ns;

        BUG_ON(parent == mnt);

        /*
         * Link the on-stack head into the list @mnt is on, so that every
         * mount on that list, starting with @mnt, can be popped through it.
         */
        list_add_tail(&head, &mnt->mnt_list);
        while (!list_empty(&head)) {
                m = list_first_entry(&head, typeof(*m), mnt_list);
                list_del(&m->mnt_list);

                mnt_add_to_ns(n, m);
        }
        /* Fold the pending mount count into the namespace's total. */
        n->nr_mounts += n->pending_mounts;
        n->pending_mounts = 0;

        __attach_mnt(mnt, parent);
        touch_mnt_namespace(n);
}

/*
 * Return the mount that follows @p in a walk of the tree rooted at @root
 * that visits a mount before its children, or NULL when the walk is done.
 */
static struct mount *next_mnt(struct mount *p, struct mount *root)
{
        struct list_head *pos = p->mnt_mounts.next;

        /* Descend to the first child when there is one. */
        if (pos != &p->mnt_mounts)
                return list_entry(pos, struct mount, mnt_child);

        /* Otherwise climb until some ancestor has a next sibling, or we hit @root. */
        for (; p != root; p = p->mnt_parent) {
                pos = p->mnt_child.next;
                if (pos != &p->mnt_parent->mnt_mounts)
                        return list_entry(pos, struct mount, mnt_child);
        }
        return NULL;
}

/*
 * Starting at @p, repeatedly step to the last child until a mount with no
 * children is reached, and return that mount (@p itself if it has none).
 */
static struct mount *skip_mnt_tree(struct mount *p)
{
        struct list_head *tail;

        for (tail = p->mnt_mounts.prev; tail != &p->mnt_mounts;
             tail = p->mnt_mounts.prev)
                p = list_entry(tail, struct mount, mnt_child);

        return p;
}

/**
 * vfs_create_mount - Create a mount for a configured superblock
 * @fc: The configuration context with the superblock attached
 *
 * Create a mount to an already configured superblock.  If necessary, the
 * caller should invoke vfs_get_tree() before calling this.
 *
 * Note that this does not attach the mount to anything.
 */
struct vfsmount *vfs_create_mount(struct fs_context *fc)
{
        struct mount *mnt;

        if (!fc->root)
                return ERR_PTR(-EINVAL);

        mnt = alloc_vfsmnt(fc->source ?: "none");
        if (!mnt)
                return ERR_PTR(-ENOMEM);

        if (fc->sb_flags & SB_KERNMOUNT)
                mnt->mnt.mnt_flags = MNT_INTERNAL;

        atomic_inc(&fc->root->d_sb->s_active);
        mnt->mnt.mnt_sb                = fc->root->d_sb;
        mnt->mnt.mnt_root        = dget(fc->root);
        mnt->mnt_mountpoint        = mnt->mnt.mnt_root;
        mnt->mnt_parent                = mnt;

        lock_mount_hash();
        list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
        unlock_mount_hash();
        return &mnt->mnt;
}
EXPORT_SYMBOL(vfs_create_mount);

/*
 * Call vfs_get_tree() on @fc and, if that succeeds, release the new
 * superblock's s_umount and return a mount created by vfs_create_mount().
 * A vfs_get_tree() failure is returned as an ERR_PTR().
 */
struct vfsmount *fc_mount(struct fs_context *fc)
{
        int err = vfs_get_tree(fc);

        if (err)
                return ERR_PTR(err);

        up_write(&fc->root->d_sb->s_umount);
        return vfs_create_mount(fc);
}
EXPORT_SYMBOL(fc_mount);

/*
 * Create a mount of filesystem @type: set up a mount context with @flags,
 * pass @name (when non-NULL) as its "source" parameter, parse @data as
 * monolithic mount data, and call fc_mount().  The context is released
 * before returning.  Returns the mount or an ERR_PTR().
 */
struct vfsmount *vfs_kern_mount(struct file_system_type *type,
                                int flags, const char *name,
                                void *data)
{
        struct fs_context *fc;
        struct vfsmount *mnt;
        int ret = 0;

        if (!type)
                return ERR_PTR(-EINVAL);

        fc = fs_context_for_mount(type, flags);
        if (IS_ERR(fc))
                return ERR_CAST(fc);

        if (name)
                ret = vfs_parse_fs_string(fc, "source", name, strlen(name));
        if (!ret)
                ret = parse_monolithic_mount_data(fc, data);

        /* Only mount when every configuration step succeeded. */
        if (ret)
                mnt = ERR_PTR(ret);
        else
                mnt = fc_mount(fc);

        put_fs_context(fc);
        return mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);

struct vfsmount *
vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
             const char *name, void *data)
{
        /*
         * Submounts are only supported when the superblock of @mountpoint
         * belongs to the initial user namespace: until it is worked out how
         * to pass the user namespace through from the parent mount to the
         * submount, unprivileged mounts with submounts are refused.
         */
        if (mountpoint->d_sb->s_user_ns == &init_user_ns)
                return vfs_kern_mount(type, SB_SUBMOUNT, name, data);

        return ERR_PTR(-EPERM);
}
EXPORT_SYMBOL_GPL(vfs_submount);

/*
 * Allocate a new mount on the same superblock as @old, rooted at @root.
 *
 * The CL_* bits in @flag select how the copy relates to @old for
 * propagation (peer, slave, private, newly shared) and whether it joins
 * @old's expiry list.  The copy is not attached anywhere: it is its own
 * parent, mounted on its own root.
 *
 * Returns the new mount, ERR_PTR(-ENOMEM) if allocation fails, or an
 * ERR_PTR() carrying the error from mnt_alloc_group_id().
 */
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
                                        int flag)
{
        struct super_block *sb = old->mnt.mnt_sb;
        struct mount *mnt;
        int err;

        mnt = alloc_vfsmnt(old->mnt_devname);
        if (!mnt)
                return ERR_PTR(-ENOMEM);

        if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
                mnt->mnt_group_id = 0; /* not a peer of original */
        else
                mnt->mnt_group_id = old->mnt_group_id;

        /* A copy that must be shared but has no peer group yet needs a fresh id. */
        if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
                err = mnt_alloc_group_id(mnt);
                if (err)
                        goto out_free;
        }

        /* Inherit @old's flags, minus the ones masked out below. */
        mnt->mnt.mnt_flags = old->mnt.mnt_flags;
        mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB);

        atomic_inc(&sb->s_active);
        mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));

        mnt->mnt.mnt_sb = sb;
        mnt->mnt.mnt_root = dget(root);
        mnt->mnt_mountpoint = mnt->mnt.mnt_root;
        mnt->mnt_parent = mnt;
        lock_mount_hash();
        list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
        unlock_mount_hash();

        if ((flag & CL_SLAVE) ||
            ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
                /* The copy becomes a slave of @old. */
                list_add(&mnt->mnt_slave, &old->mnt_slave_list);
                mnt->mnt_master = old;
                CLEAR_MNT_SHARED(mnt);
        } else if (!(flag & CL_PRIVATE)) {
                /* The copy takes the same sharing and slave relationships as @old. */
                if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
                        list_add(&mnt->mnt_share, &old->mnt_share);
                if (IS_MNT_SLAVE(old))
                        list_add(&mnt->mnt_slave, &old->mnt_slave);
                mnt->mnt_master = old->mnt_master;
        } else {
                CLEAR_MNT_SHARED(mnt);
        }
        if (flag & CL_MAKE_SHARED)
                set_mnt_shared(mnt);

        /* stick the duplicate mount on the same expiry list
         * as the original if that was on one */
        if (flag & CL_EXPIRE) {
                if (!list_empty(&old->mnt_expire))
                        list_add(&mnt->mnt_expire, &old->mnt_expire);
        }

        return mnt;

 out_free:
        /* Only reached before any superblock, idmap or dentry reference is taken. */
        mnt_free_id(mnt);
        free_vfsmnt(mnt);
        return ERR_PTR(err);
}

/*
 * Final teardown of @mnt: kill any pins, drop the references held on
 * children parked on mnt_stuck_children, notify fsnotify, release the root
 * dentry and the superblock, free the mount id, and free the structure
 * after an RCU grace period.
 */
static void cleanup_mnt(struct mount *mnt)
{
        struct hlist_node *p;
        struct mount *m;
        /*
         * The warning here probably indicates that somebody messed
         * up a mnt_want/drop_write() pair.  If this happens, the
         * filesystem was probably unable to make r/w->r/o transitions.
         * The locking used to deal with mnt_count decrement provides barriers,
         * so mnt_get_writers() below is safe.
         */
        WARN_ON(mnt_get_writers(mnt));
        if (unlikely(mnt->mnt_pins.first))
                mnt_pin_kill(mnt);
        /* Children moved here by mntput_no_expire() are released now. */
        hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
                hlist_del(&m->mnt_umount);
                mntput(&m->mnt);
        }
        fsnotify_vfsmount_delete(&mnt->mnt);
        dput(mnt->mnt.mnt_root);
        deactivate_super(mnt->mnt.mnt_sb);
        mnt_free_id(mnt);
        /* The memory itself is freed from an RCU callback. */
        call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
}

/*
 * Callback form of cleanup_mnt(): recovers the mount from its embedded
 * mnt_rcu head.  mntput_no_expire() installs it as task work.
 */
static void __cleanup_mnt(struct rcu_head *head)
{
        cleanup_mnt(container_of(head, struct mount, mnt_rcu));
}

/* Mounts queued by mntput_no_expire() for cleanup from the delayed work item. */
static LLIST_HEAD(delayed_mntput_list);
/* Work handler: take everything queued so far and run cleanup_mnt() on each. */
static void delayed_mntput(struct work_struct *unused)
{
        struct llist_node *node = llist_del_all(&delayed_mntput_list);
        struct mount *m, *t;

        llist_for_each_entry_safe(m, t, node, mnt_llist)
                cleanup_mnt(m);
}
static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);

/*
 * Drop one reference to @mnt.
 *
 * Fast path: if @mnt->mnt_ns is non-NULL the count is simply decremented.
 * Slow path: under the mount hash lock the count is decremented and, if it
 * reached zero and the mount is not already MNT_DOOMED, the mount is marked
 * doomed, removed from its superblock's list, its children are unhashed
 * and parked on mnt_stuck_children, and cleanup_mnt() is arranged:
 *  - directly, for MNT_INTERNAL mounts;
 *  - as task work on the current task when it is not a kernel thread;
 *  - otherwise, or if task_work_add() fails, via the delayed work list.
 */
static void mntput_no_expire(struct mount *mnt)
{
        LIST_HEAD(list);
        int count;

        rcu_read_lock();
        if (likely(READ_ONCE(mnt->mnt_ns))) {
                /*
                 * Since we don't do lock_mount_hash() here,
                 * ->mnt_ns can change under us.  However, if it's
                 * non-NULL, then there's a reference that won't
                 * be dropped until after an RCU delay done after
                 * turning ->mnt_ns NULL.  So if we observe it
                 * non-NULL under rcu_read_lock(), the reference
                 * we are dropping is not the final one.
                 */
                mnt_add_count(mnt, -1);
                rcu_read_unlock();
                return;
        }
        lock_mount_hash();
        /*
         * make sure that if __legitimize_mnt() has not seen us grab
         * mount_lock, we'll see their refcount increment here.
         */
        smp_mb();
        mnt_add_count(mnt, -1);
        count = mnt_get_count(mnt);
        if (count != 0) {
                /* Not the last reference; a negative count is a refcounting bug. */
                WARN_ON(count < 0);
                rcu_read_unlock();
                unlock_mount_hash();
                return;
        }
        /* Someone else already started tearing this mount down. */
        if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
                rcu_read_unlock();
                unlock_mount_hash();
                return;
        }
        mnt->mnt.mnt_flags |= MNT_DOOMED;
        rcu_read_unlock();

        list_del(&mnt->mnt_instance);

        if (unlikely(!list_empty(&mnt->mnt_mounts))) {
                struct mount *p, *tmp;
                /* Unhash each child and park it for cleanup_mnt() to release. */
                list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) {
                        __put_mountpoint(unhash_mnt(p), &list);
                        hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
                }
        }
        unlock_mount_hash();
        /* Dispose of the dentries collected on the local list above. */
        shrink_dentry_list(&list);

        if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
                struct task_struct *task = current;
                if (likely(!(task->flags & PF_KTHREAD))) {
                        init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
                        if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
                                return;
                }
                /* llist_add() is true when the list was empty: schedule the work. */
                if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
                        schedule_delayed_work(&delayed_mntput_work, 1);
                return;
        }
        cleanup_mnt(mnt);
}

/*
 * Drop a reference to @mnt, clearing its expiry mark first if one is set.
 * A NULL @mnt is ignored.
 */
void mntput(struct vfsmount *mnt)
{
        struct mount *m;

        if (!mnt)
                return;

        m = real_mount(mnt);
        /* avoid cacheline pingpong: only write the mark when it is set */
        if (unlikely(m->mnt_expiry_mark))
                WRITE_ONCE(m->mnt_expiry_mark, 0);
        mntput_no_expire(m);
}
EXPORT_SYMBOL(mntput);

/*
 * Take an additional reference on @mnt and return it.  A NULL @mnt is
 * returned unchanged.
 */
struct vfsmount *mntget(struct vfsmount *mnt)
{
        if (!mnt)
                return mnt;

        mnt_add_count(real_mount(mnt), 1);
        return mnt;
}
EXPORT_SYMBOL(mntget);

/*
 * Make a mount point inaccessible to new lookups.
 * Because there may still be current users, the caller MUST WAIT
 * for an RCU grace period before destroying the mount point.
 */
void mnt_make_shortterm(struct vfsmount *mnt)
{
        if (!mnt)
                return;

        /* Clearing ->mnt_ns sends later mntput() calls down the locked slow path. */
        real_mount(mnt)->mnt_ns = NULL;
}

/**
 * path_is_mountpoint() - Check if path is a mount in the current namespace.
 * @path: path to check
 *
 *  d_mountpoint() can only be used reliably to establish if a dentry is
 *  not mounted in any namespace and that common case is handled inline.
 *  d_mountpoint() isn't aware of the possibility there may be multiple
 *  mounts using a given dentry in a different namespace. This function
 *  checks if the passed in path is a mountpoint rather than the dentry
 *  alone.
 */
bool path_is_mountpoint(const struct path *path)
{
        bool mounted;
        unsigned seq;

        /* Cheap rejection: the dentry is not a mountpoint anywhere. */
        if (!d_mountpoint(path->dentry))
                return false;

        rcu_read_lock();
        /* Redo the check until it ran without a concurrent mount_lock writer. */
        for (;;) {
                seq = read_seqbegin(&mount_lock);
                mounted = __path_is_mountpoint(path);
                if (!read_seqretry(&mount_lock, seq))
                        break;
        }
        rcu_read_unlock();

        return mounted;
}
EXPORT_SYMBOL(path_is_mountpoint);

/*
 * Make a private (CL_PRIVATE) clone of the mount in @path, rooted at
 * @path->dentry, and flag it MNT_INTERNAL.  Returns the new vfsmount or
 * the ERR_PTR() produced by clone_mnt().
 */
struct vfsmount *mnt_clone_internal(const struct path *path)
{
        struct mount *clone = clone_mnt(real_mount(path->mnt), path->dentry,
                                        CL_PRIVATE);

        if (IS_ERR(clone))
                return ERR_CAST(clone);

        clone->mnt.mnt_flags |= MNT_INTERNAL;
        return &clone->mnt;
}

/*
 * Returns the mount which either has the specified mnt_id or, failing that,
 * has the smallest id greater than the specified one.  Returns NULL if
 * there is no such mount.
 */
static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
{
        struct rb_node *node = ns->mounts.rb_node;
        struct mount *best = NULL;

        while (node) {
                struct mount *cur = node_to_mount(node);

                /* Everything at or left of this node is too small. */
                if (mnt_id > cur->mnt_id_unique) {
                        node = node->rb_right;
                        continue;
                }

                /* Candidate found; keep looking left for a closer one. */
                best = cur;
                if (mnt_id == cur->mnt_id_unique)
                        break;
                node = node->rb_left;
        }
        return best;
}

#ifdef CONFIG_PROC_FS

/* iterator; we want it to have access to namespace_sem, thus here... */
static void *m_start(struct seq_file *m, loff_t *pos)
{
        struct proc_mounts *priv = m->private;
        struct mount *first;

        /* Taken here, released in m_stop(). */
        down_read(&namespace_sem);

        /* *pos is a unique mount id: resume at it or at the next larger one. */
        first = mnt_find_id_at(priv->ns, *pos);
        return first;
}

/*
 * seq_file ->next: advance from mount @v to its successor in the rbtree.
 * *pos becomes the successor's unique id, or is just incremented when
 * there is no successor (in which case NULL is returned).
 */
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct mount *cur = v;
        struct rb_node *node = rb_next(&cur->mnt_node);
        struct mount *next;

        if (!node) {
                ++*pos;
                return NULL;
        }

        next = node_to_mount(node);
        *pos = next->mnt_id_unique;
        return next;
}

/* seq_file ->stop: release the namespace_sem read lock taken in m_start(). */
static void m_stop(struct seq_file *m, void *v)
{
        up_read(&namespace_sem);
}

/*
 * seq_file ->show: @v is the struct mount returned by m_start()/m_next();
 * hand its vfsmount to the ->show callback stored in the proc_mounts
 * private data.
 */
static int m_show(struct seq_file *m, void *v)
{
        struct proc_mounts *p = m->private;
        struct mount *r = v;
        return p->show(m, &r->mnt);
}

/* seq_file iterator over the mounts of a namespace, built from the m_* helpers. */
const struct seq_operations mounts_op = {
        .start        = m_start,
        .next        = m_next,
        .stop        = m_stop,
        .show        = m_show,
};

#endif  /* CONFIG_PROC_FS */

/**
 * may_umount_tree - check if a mount tree is busy
 * @m: root of mount tree
 *
 * This is called to check if a tree of mounts has any
 * open files, pwds, chroots or sub mounts that are
 * busy.
 */
int may_umount_tree(struct vfsmount *m)
{
        struct mount *root = real_mount(m);
        struct mount *cur;
        int actual_refs = 0;
        int minimum_refs = 0;

        BUG_ON(!m);

        /* write lock needed for mnt_get_count */
        lock_mount_hash();
        for (cur = root; cur; cur = next_mnt(cur, root)) {
                actual_refs += mnt_get_count(cur);
                minimum_refs += 2;
        }
        unlock_mount_hash();

        /* Returns 0 when the tree holds more than two references per mount. */
        return actual_refs > minimum_refs ? 0 : 1;
}

EXPORT_SYMBOL(may_umount_tree);

/**
 * may_umount - check if a mount point is busy
 * @mnt: root of mount
 *
 * This is called to check if a mount point has any
 * open files, pwds, chroots or sub mounts. If the
 * mount has sub mounts this will return busy
 * regardless of whether the sub mounts are busy.
 *
 * Doesn't take quota and stuff into account. IOW, in some cases it will
 * give false negatives. The main reason why it's here is that we need
 * a non-destructive way to look for easily umountable filesystems.
 */
int may_umount(struct vfsmount *mnt)
{
        int busy;

        down_read(&namespace_sem);
        lock_mount_hash();
        /* The answer comes from propagate_mount_busy() with a reference count of 2. */
        busy = propagate_mount_busy(real_mount(mnt), 2);
        unlock_mount_hash();
        up_read(&namespace_sem);

        return busy ? 0 : 1;
}

EXPORT_SYMBOL(may_umount);

/*
 * Release namespace_sem (held for write) and finish the work that was
 * queued while it was held: dispose of the dentries on ex_mountpoints and,
 * after an expedited RCU grace period, drop the references to the mounts
 * on the unmounted list.
 */
static void namespace_unlock(void)
{
        struct hlist_head head;
        struct hlist_node *p;
        struct mount *m;
        LIST_HEAD(list);

        /* Take private copies of both pending lists before dropping the lock. */
        hlist_move_list(&unmounted, &head);
        list_splice_init(&ex_mountpoints, &list);

        up_write(&namespace_sem);

        shrink_dentry_list(&list);

        if (likely(hlist_empty(&head)))
                return;

        /* Wait for a grace period before the mntput() calls below. */
        synchronize_rcu_expedited();

        hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
                hlist_del(&m->mnt_umount);
                mntput(&m->mnt);
        }
}

/* Take namespace_sem for write; paired with namespace_unlock(). */
static inline void namespace_lock(void)
{
        down_write(&namespace_sem);
}

/* Bit flags controlling umount_tree(); may be OR-ed together. */
enum umount_tree_flags {
        /* Mark each mount MNT_SYNC_UMOUNT; mounts are always disconnected. */
        UMOUNT_SYNC = 1,
        /* Also call propagate_mount_unlock() and propagate_umount(). */
        UMOUNT_PROPAGATE = 2,
        /* Ask for mounts to stay connected to an already-unmounted parent. */
        UMOUNT_CONNECTED = 4,
};

/*
 * Decide whether umount_tree() should detach @mnt from its parent (true)
 * or leave it connected (false).
 */
static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
{
        bool must_disconnect;

        /*
         * Disconnecting is mandatory when:
         *  - the umount is synchronous (staying connected is only valid
         *    for lazy umounts);
         *  - the mount has no parent, so there is nothing to stay
         *    connected to;
         *  - the parent is not itself being unmounted (the reference
         *    counting rules differ, so umounted mounts may not be
         *    connected to mounted mounts).
         */
        must_disconnect = (how & UMOUNT_SYNC) ||
                          !mnt_has_parent(mnt) ||
                          !(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT);
        if (must_disconnect)
                return true;

        /*
         * Otherwise stay connected if that was requested or the mount is
         * locked; disconnect by default.
         */
        return !((how & UMOUNT_CONNECTED) || IS_MNT_LOCKED(mnt));
}

/*
 * mount_lock must be held
 * namespace_sem must be held for write
 */
static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
{
        LIST_HEAD(tmp_list);
        struct mount *p;

        if (how & UMOUNT_PROPAGATE)
                propagate_mount_unlock(mnt);

        /* Gather the mounts to umount */
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                p->mnt.mnt_flags |= MNT_UMOUNT;
                /* Mounts on the namespace rbtree are moved off it, others off their list. */
                if (p->mnt.mnt_flags & MNT_ONRB)
                        move_from_ns(p, &tmp_list);
                else
                        list_move(&p->mnt_list, &tmp_list);
        }

        /* Hide the mounts from mnt_mounts */
        list_for_each_entry(p, &tmp_list, mnt_list) {
                list_del_init(&p->mnt_child);
        }

        /* Add propagated mounts to the tmp_list */
        if (how & UMOUNT_PROPAGATE)
                propagate_umount(&tmp_list);

        /* Process every gathered mount, emptying tmp_list as we go. */
        while (!list_empty(&tmp_list)) {
                struct mnt_namespace *ns;
                bool disconnect;
                p = list_first_entry(&tmp_list, struct mount, mnt_list);
                list_del_init(&p->mnt_expire);
                list_del_init(&p->mnt_list);
                /* Take the mount out of its namespace's accounting, if it has one. */
                ns = p->mnt_ns;
                if (ns) {
                        ns->nr_mounts--;
                        __touch_mnt_namespace(ns);
                }
                p->mnt_ns = NULL;
                if (how & UMOUNT_SYNC)
                        p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;

                disconnect = disconnect_mount(p, how);
                if (mnt_has_parent(p)) {
                        /* The parent's count is dropped whether or not p stays connected. */
                        mnt_add_count(p->mnt_parent, -1);
                        if (!disconnect) {
                                /* Don't forget about p */
                                list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
                        } else {
                                umount_mnt(p);
                        }
                }
                change_mnt_propagation(p, MS_PRIVATE);
                /* Only disconnected mounts are queued on the unmounted list. */
                if (disconnect)
                        hlist_add_head(&p->mnt_umount, &unmounted);
        }
}

static void shrink_submounts(struct mount *mnt);

/*
 * Reconfigure @sb read-only, under s_umount held for write.  Nothing is
 * done, and 0 is returned, if @sb is already read-only; otherwise the
 * result of setting up, parsing, or applying the reconfiguration is
 * returned.
 */
static int do_umount_root(struct super_block *sb)
{
        struct fs_context *fc;
        int ret = 0;

        down_write(&sb->s_umount);
        if (sb_rdonly(sb))
                goto out;

        fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, SB_RDONLY);
        if (IS_ERR(fc)) {
                ret = PTR_ERR(fc);
                goto out;
        }

        ret = parse_monolithic_mount_data(fc, NULL);
        if (!ret)
                ret = reconfigure_super(fc);
        put_fs_context(fc);
out:
        up_write(&sb->s_umount);
        return ret;
}

/*
 * do_umount - carry out an unmount of @mnt as requested by @flags
 *
 * The security hook is consulted first.  Three special cases are then
 * handled before the generic teardown: MNT_EXPIRE (two-step mark-and-retry
 * expiry), MNT_FORCE (the filesystem's ->umount_begin is invoked if it has
 * one) and an attempt to unmount the caller's root without MNT_DETACH
 * (converted into a read-only remount).  Everything else tears the tree
 * down under namespace_lock() plus the mount hash lock.
 *
 * Returns 0 on success or a negative errno: the security hook's error,
 * -EINVAL, -EBUSY, -EAGAIN, -EPERM, or the result of do_umount_root().
 */
static int do_umount(struct mount *mnt, int flags)
{
        struct super_block *sb = mnt->mnt.mnt_sb;
        int retval;

        retval = security_sb_umount(&mnt->mnt, flags);
        if (retval)
                return retval;

        /*
         * Allow userspace to request a mountpoint be expired rather than
         * unmounting unconditionally. Unmount only happens if:
         *  (1) the mark is already set (the mark is cleared by mntput())
         *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
         */
        if (flags & MNT_EXPIRE) {
                /* Expiry cannot target the caller's root or be combined
                 * with MNT_FORCE / MNT_DETACH. */
                if (&mnt->mnt == current->fs->root.mnt ||
                    flags & (MNT_FORCE | MNT_DETACH))
                        return -EINVAL;

                /*
                 * probably don't strictly need the lock here if we examined
                 * all race cases, but it's a slowpath.
                 */
                lock_mount_hash();
                if (mnt_get_count(mnt) != 2) {
                        unlock_mount_hash();
                        return -EBUSY;
                }
                unlock_mount_hash();

                /* First request only sets the mark and reports -EAGAIN;
                 * a later request that finds it still set proceeds. */
                if (!xchg(&mnt->mnt_expiry_mark, 1))
                        return -EAGAIN;
        }

        /*
         * If we may have to abort operations to get out of this
         * mount, and they will themselves hold resources we must
         * allow the fs to do things. In the Unix tradition of
         * 'Gee that's tricky let's do it in userspace' the umount_begin
         * might fail to complete on the first run through as other tasks
         * must return, and the like. That's for the mount program to worry
         * about for the moment.
         */

        if (flags & MNT_FORCE && sb->s_op->umount_begin) {
                sb->s_op->umount_begin(sb);
        }

        /*
         * No sense to grab the lock for this test, but test itself looks
         * somewhat bogus. Suggestions for better replacement?
         * Ho-hum... In principle, we might treat that as umount + switch
         * to rootfs. GC would eventually take care of the old vfsmount.
         * Actually it makes sense, especially if rootfs would contain a
         * /reboot - static binary that would close all descriptors and
         * call reboot(9). Then init(8) could umount root and exec /reboot.
         */
        if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
                /*
                 * Special case for "unmounting" root ...
                 * we just try to remount it readonly.
                 */
                if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                        return -EPERM;
                return do_umount_root(sb);
        }

        namespace_lock();
        lock_mount_hash();

        /* Recheck MNT_LOCKED with the locks held */
        retval = -EINVAL;
        if (mnt->mnt.mnt_flags & MNT_LOCKED)
                goto out;

        event++;
        if (flags & MNT_DETACH) {
                /*
                 * Lazy unmount: always reports success.  The tree is only
                 * torn down if @mnt is flagged MNT_ONRB or is still linked
                 * on a mnt_list.
                 */
                if (mnt->mnt.mnt_flags & MNT_ONRB ||
                    !list_empty(&mnt->mnt_list))
                        umount_tree(mnt, UMOUNT_PROPAGATE);
                retval = 0;
        } else {
                /* Regular unmount: get rid of submounts where possible,
                 * then fail with -EBUSY if the mount is still in use. */
                shrink_submounts(mnt);
                retval = -EBUSY;
                if (!propagate_mount_busy(mnt, 2)) {
                        if (mnt->mnt.mnt_flags & MNT_ONRB ||
                            !list_empty(&mnt->mnt_list))
                                umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
                        retval = 0;
                }
        }
out:
        unlock_mount_hash();
        namespace_unlock();
        return retval;
}

/*
 * __detach_mounts - lazily unmount all mounts on the specified dentry
 *
 * During unlink, rmdir, and d_drop it is possible to lose the path
 * to an existing mountpoint, and wind up leaking the mount.
 * detach_mounts allows lazily unmounting those mounts instead of
 * leaking them.
 *
 * The caller may hold dentry->d_inode->i_mutex.
 */
void __detach_mounts(struct dentry *dentry)
{
        struct mountpoint *mp;

        namespace_lock();
        lock_mount_hash();

        mp = lookup_mountpoint(dentry);
        if (mp) {
                event++;

                /* Keep peeling off the first mount until none are left. */
                while (!hlist_empty(&mp->m_list)) {
                        struct mount *victim;

                        victim = hlist_entry(mp->m_list.first, struct mount,
                                             mnt_mp_list);
                        if (!(victim->mnt.mnt_flags & MNT_UMOUNT)) {
                                umount_tree(victim, UMOUNT_CONNECTED);
                                continue;
                        }

                        /*
                         * Already flagged MNT_UMOUNT: just detach it and
                         * queue it on the unmounted list.
                         */
                        umount_mnt(victim);
                        hlist_add_head(&victim->mnt_umount, &unmounted);
                }
                put_mountpoint(mp);
        }

        unlock_mount_hash();
        namespace_unlock();
}

/*
 * Is the caller allowed to modify his namespace?
 */
bool may_mount(void)
{
        /* CAP_SYS_ADMIN is checked against the user namespace that owns
         * the caller's mount namespace. */
        struct user_namespace *owner = current->nsproxy->mnt_ns->user_ns;

        return ns_capable(owner, CAP_SYS_ADMIN);
}

/**
 * path_mounted - check whether path is mounted
 * @path: path to check
 *
 * Determine whether @path refers to the root of a mount.
 *
 * Return: true if @path is the root of a mount, false if not.
 */
static inline bool path_mounted(const struct path *path)
{
        /* The path is a mount root iff its dentry is that mount's root dentry. */
        return path->dentry == path->mnt->mnt_root;
}

/*
 * Print, at most once, a banner telling the user that the "mand" mount
 * option is deprecated and ignored.
 */
static void warn_mandlock(void)
{
        /* The second line used to start with a duplicated "and". */
        pr_warn_once("=======================================================\n"
                     "WARNING: The mand mount option has been deprecated and\n"
                     "         is ignored by this kernel. Remove the mand\n"
                     "         option from the mount to silence this warning.\n"
                     "=======================================================\n");
}

/*
 * Permission and sanity checks for unmounting @path with @flags.
 * Returns 0 if the unmount may proceed, -EPERM or -EINVAL otherwise.
 */
static int can_umount(const struct path *path, int flags)
{
        struct mount *m = real_mount(path->mnt);

        if (!may_mount())
                return -EPERM;

        /* Must name the root of a mount, and that mount must pass check_mnt(). */
        if (!path_mounted(path) || !check_mnt(m))
                return -EINVAL;

        /* Optimistic, lockless look at MNT_LOCKED. */
        if (m->mnt.mnt_flags & MNT_LOCKED)
                return -EINVAL;

        /* Forced unmounts additionally need CAP_SYS_ADMIN via capable(). */
        if ((flags & MNT_FORCE) && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        return 0;
}

// caller is responsible for flags being sane
int path_umount(struct path *path, int flags)
{
        struct mount *mnt = real_mount(path->mnt);
        int ret = can_umount(path, flags);

        if (ret == 0)
                ret = do_umount(mnt, flags);

        /*
         * Drop the references held in @path by hand: path_put() must not
         * be used here because it would clear mnt_expiry_mark.
         */
        dput(path->dentry);
        mntput_no_expire(mnt);
        return ret;
}

/*
 * Common body of the umount syscalls: validate @flags, resolve the
 * user-supplied @name and hand the resulting path to path_umount().
 */
static int ksys_umount(char __user *name, int flags)
{
        const int known_flags = MNT_FORCE | MNT_DETACH | MNT_EXPIRE |
                                UMOUNT_NOFOLLOW;
        struct path path;
        int lookup_flags;
        int ret;

        /* Reject unknown flag bits before doing any lookup. */
        if (flags & ~known_flags)
                return -EINVAL;

        /* Follow a trailing symlink unless the caller opted out. */
        lookup_flags = LOOKUP_MOUNTPOINT;
        if (!(flags & UMOUNT_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;

        ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
        if (ret)
                return ret;

        return path_umount(&path, flags);
}

/* umount syscall: @name is a user-space path, @flags is passed through unchanged. */
SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
        return ksys_umount(name, flags);
}

#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 *        The 2.0 compatible umount. No flags.
 */
SYSCALL_DEFINE1(oldumount, char __user *, name)
{
        /* Same code path as the two-argument syscall, with flags fixed at 0. */
        return ksys_umount(name, 0);
}

#endif

static bool is_mnt_ns_file(struct dentry *dentry)
{
        /* Is this a proxy for a mount namespace? */
        return dentry->d_op == &ns_dentry_operations &&
               dentry->d_fsdata == &mntns_operations;
}

/* Map an embedded ns_common back to the mnt_namespace that contains it. */
static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
        return container_of(ns, struct mnt_namespace, ns);
}

/*
 * Return the ns_common embedded in a mount namespace.  Note that @mnt is a
 * mount namespace here, not a mount.
 */
struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
{
        return &mnt->ns;
}

/*
 * Could bind mounting the mount namespace inode behind @dentry cause a
 * mount namespace loop?
 */
static bool mnt_ns_loop(struct dentry *dentry)
{
        struct mnt_namespace *target, *mine;

        if (!is_mnt_ns_file(dentry))
                return false;

        target = to_mnt_ns(get_proc_ns(dentry->d_inode));
        mine = current->nsproxy->mnt_ns;

        /* Flag a loop unless the target's seq is strictly greater than ours. */
        return mine->seq >= target->seq;
}

/*
 * copy_tree - clone @mnt (rooted at @dentry) together with the mounts below it
 *
 * The top mount is cloned first; then every direct child of @mnt whose
 * mountpoint lies under @dentry is walked with next_mnt() and each visited
 * mount is cloned and attached under the clone of its parent.
 *
 * @flag filters what gets copied:
 *  - without CL_COPY_UNBINDABLE an unbindable @mnt yields -EINVAL, and
 *    unbindable mounts met during the walk are skipped via skip_mnt_tree()
 *    (or abort the copy with -EPERM if they are also MNT_LOCKED);
 *  - without CL_COPY_MNT_NS_FILE a mount-namespace file as @dentry yields
 *    -EINVAL, and mounts rooted at such a file are skipped during the walk.
 *
 * Returns the root of the new tree or an ERR_PTR().  On a failure after the
 * root clone exists, the partial copy is torn down with umount_tree().
 */
struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
                                        int flag)
{
        struct mount *res, *p, *q, *r, *parent;

        if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
                return ERR_PTR(-EINVAL);

        if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
                return ERR_PTR(-EINVAL);

        /* res remembers the root of the copy; q tracks the latest clone. */
        res = q = clone_mnt(mnt, dentry, flag);
        if (IS_ERR(q))
                return q;

        q->mnt_mountpoint = mnt->mnt_mountpoint;

        /* p walks the original tree in step with q in the copy. */
        p = mnt;
        list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
                struct mount *s;
                /* Only children mounted at or below @dentry are copied. */
                if (!is_subdir(r->mnt_mountpoint, dentry))
                        continue;

                for (s = r; s; s = next_mnt(s, r)) {
                        if (!(flag & CL_COPY_UNBINDABLE) &&
                            IS_MNT_UNBINDABLE(s)) {
                                if (s->mnt.mnt_flags & MNT_LOCKED) {
                                        /* Both unbindable and locked. */
                                        q = ERR_PTR(-EPERM);
                                        goto out;
                                } else {
                                        s = skip_mnt_tree(s);
                                        continue;
                                }
                        }
                        if (!(flag & CL_COPY_MNT_NS_FILE) &&
                            is_mnt_ns_file(s->mnt.mnt_root)) {
                                s = skip_mnt_tree(s);
                                continue;
                        }
                        /*
                         * Climb both trees in lockstep until p is s's parent;
                         * q is then the clone that the copy of s hangs under.
                         */
                        while (p != s->mnt_parent) {
                                p = p->mnt_parent;
                                q = q->mnt_parent;
                        }
                        p = s;
                        parent = q;
                        q = clone_mnt(p, p->mnt.mnt_root, flag);
                        if (IS_ERR(q))
                                goto out;
                        /* Link the clone into the copy under the hash lock. */
                        lock_mount_hash();
                        list_add_tail(&q->mnt_list, &res->mnt_list);
                        attach_mnt(q, parent, p->mnt_mp, false);
                        unlock_mount_hash();
                }
        }
        return res;
out:
        /* Undo the partial copy; q holds the error to report. */
        if (res) {
                lock_mount_hash();
                umount_tree(res, UMOUNT_SYNC);
                unlock_mount_hash();
        }
        return q;
}

/* Caller should check returned pointer for errors */

/*
 * Make a private copy (CL_COPY_ALL | CL_PRIVATE) of the mount tree at
 * @path, under namespace_lock().  Returns the copy's root vfsmount, or an
 * ERR_PTR(): -EINVAL if the mount fails check_mnt(), otherwise whatever
 * copy_tree() reported.
 */
struct vfsmount *collect_mounts(const struct path *path)
{
        struct mount *copy = ERR_PTR(-EINVAL);

        namespace_lock();
        if (check_mnt(real_mount(path->mnt)))
                copy = copy_tree(real_mount(path->mnt), path->dentry,
                                 CL_COPY_ALL | CL_PRIVATE);
        namespace_unlock();

        if (IS_ERR(copy))
                return ERR_CAST(copy);

        return &copy->mnt;
}

static void free_mnt_ns(struct mnt_namespace *);
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);

void dissolve_on_fput(struct vfsmount *mnt)
{
        struct mnt_namespace *ns;
        namespace_lock();
        lock_mount_hash();
        ns = real_mount(mnt)->mnt_ns;
        if (ns) {
                if (is_anon_ns(ns))
                        umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
                else
                        ns = NULL;
        }
        unlock_mount_hash();
        namespace_unlock();
        if (ns)
                free_mnt_ns(ns);
}

/* Tear down the mount tree rooted at @mnt with no umount_tree() flags set. */
void drop_collected_mounts(struct vfsmount *mnt)
{
        struct mount *tree = real_mount(mnt);

        namespace_lock();
        lock_mount_hash();

        umount_tree(tree, 0);

        unlock_mount_hash();
        namespace_unlock();
}

/*
 * Does @mnt have a direct child that is MNT_LOCKED and whose mountpoint
 * lies under @dentry?
 */
static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
{
        struct mount *kid;

        list_for_each_entry(kid, &mnt->mnt_mounts, mnt_child) {
                if (is_subdir(kid->mnt_mountpoint, dentry) &&
                    (kid->mnt.mnt_flags & MNT_LOCKED))
                        return true;
        }

        return false;
}

/**
 * clone_private_mount - create a private clone of a path
 * @path: path to clone
 *
 * This creates a new vfsmount, which will be the clone of @path.  The new mount
 * will not be attached anywhere in the namespace and will be private (i.e.
 * changes to the originating mount won't be propagated into this).
 *
 * Release with mntput().
 */
struct vfsmount *clone_private_mount(const struct path *path)
{
        struct mount *old_mnt = real_mount(path->mnt);
        struct mount *new_mnt;

        down_read(&namespace_sem);

        /*
         * Refuse unbindable mounts, mounts that fail check_mnt(), and
         * mounts with locked children under the requested dentry.
         */
        if (IS_MNT_UNBINDABLE(old_mnt) || !check_mnt(old_mnt) ||
            has_locked_children(old_mnt, path->dentry)) {
                up_read(&namespace_sem);
                return ERR_PTR(-EINVAL);
        }

        new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
        up_read(&namespace_sem);

        if (IS_ERR(new_mnt))
                return ERR_CAST(new_mnt);

        /* Longterm mount to be removed by kern_unmount*() */
        new_mnt->mnt_ns = MNT_NS_INTERNAL;

        return &new_mnt->mnt;
}
EXPORT_SYMBOL_GPL(clone_private_mount);

/*
 * Call @f(mount, @arg) on @root and then on every mount linked on @root's
 * mnt_list.  Iteration stops at the first non-zero return, which is passed
 * back to the caller; 0 means every callback returned 0.
 */
int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
                   struct vfsmount *root)
{
        struct mount *cur;
        int res;

        res = f(root, arg);
        if (res)
                return res;

        list_for_each_entry(cur, &real_mount(root)->mnt_list, mnt_list) {
                res = f(&cur->mnt, arg);
                if (res)
                        break;
        }

        return res;
}

/*
 * Walk the tree rooted at @mnt and pin down each mount's flags: atime
 * handling is always locked, and each of read-only / nodev / nosuid /
 * noexec is locked if currently set.  Mounts not on an expiry list are
 * also marked MNT_LOCKED.
 */
static void lock_mnt_tree(struct mount *mnt)
{
        struct mount *m = mnt;

        while (m) {
                /* Don't allow unprivileged users to change mount flags */
                int fl = m->mnt.mnt_flags | MNT_LOCK_ATIME;

                if (fl & MNT_READONLY)
                        fl |= MNT_LOCK_READONLY;
                if (fl & MNT_NODEV)
                        fl |= MNT_LOCK_NODEV;
                if (fl & MNT_NOSUID)
                        fl |= MNT_LOCK_NOSUID;
                if (fl & MNT_NOEXEC)
                        fl |= MNT_LOCK_NOEXEC;

                /* Don't allow unprivileged users to reveal what is under a mount */
                if (list_empty(&m->mnt_expire))
                        fl |= MNT_LOCKED;

                m->mnt.mnt_flags = fl;
                m = next_mnt(m, mnt);
        }
}

/*
 * Release the group id of every non-shared mount that has one, walking
 * from @mnt with next_mnt() and stopping when @end is reached (a NULL @end
 * means the walk runs to completion).
 */
static void cleanup_group_ids(struct mount *mnt, struct mount *end)
{
        struct mount *cur;

        for (cur = mnt; cur != end; cur = next_mnt(cur, mnt)) {
                if (!cur->mnt_group_id || IS_MNT_SHARED(cur))
                        continue;
                mnt_release_group_id(cur);
        }
}

/*
 * Give a group id to @mnt (and, if @recurse, to each mount reached from it
 * via next_mnt()) wherever a mount has none and is not shared.  If an
 * allocation fails, the ids handed out so far are released again and the
 * error is returned; otherwise 0.
 */
static int invent_group_ids(struct mount *mnt, bool recurse)
{
        struct mount *cur = mnt;

        while (cur) {
                if (!cur->mnt_group_id && !IS_MNT_SHARED(cur)) {
                        int err = mnt_alloc_group_id(cur);

                        if (err) {
                                /* Roll back everything before the failing mount. */
                                cleanup_group_ids(mnt, cur);
                                return err;
                        }
                }

                if (!recurse)
                        break;
                cur = next_mnt(cur, mnt);
        }

        return 0;
}

/*
 * Check that the tree rooted at @mnt still fits into @ns under the
 * sysctl_mount_max limit, counting both existing and pending mounts.
 * On success the tree's size is added to ns->pending_mounts and 0 is
 * returned; otherwise -ENOSPC.
 */
int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
{
        unsigned int room = READ_ONCE(sysctl_mount_max);
        unsigned int wanted = 0;
        struct mount *cur;

        /* Subtract what is already there; bail out before underflowing. */
        if (ns->nr_mounts >= room)
                return -ENOSPC;
        room -= ns->nr_mounts;

        if (ns->pending_mounts >= room)
                return -ENOSPC;
        room -= ns->pending_mounts;

        cur = mnt;
        while (cur) {
                wanted++;
                cur = next_mnt(cur, mnt);
        }

        if (wanted > room)
                return -ENOSPC;

        ns->pending_mounts += wanted;
        return 0;
}

/* Flags that modify how attach_recursive_mnt() attaches a source tree. */
enum mnt_tree_flags_t {
        /* The source tree is being moved; the count_mounts() check is skipped. */
        MNT_TREE_MOVE = BIT(0),
        /* Attach beneath the top mount rather than on top of it. */
        MNT_TREE_BENEATH = BIT(1),
};

/**
 * attach_recursive_mnt - attach a source mount tree
 * @source_mnt: mount tree to be attached
 * @top_mnt:    mount that @source_mnt will be mounted on or mounted beneath
 * @dest_mp:    the mountpoint @source_mnt will be mounted at
 * @flags:      modify how @source_mnt is supposed to be attached
 *
 *  NOTE: the table below explains the semantics when a source mount
 *  of a given type is attached to a destination mount of a given type.
 * ---------------------------------------------------------------------------
 * |         BIND MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
 * ***************************************************************************
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++)  the cloned mount is propagated to all the mounts in the propagation
 *          tree of the destination mount and the cloned mount is added to
 *          the peer group of the source mount.
 * (+)   the cloned mount is created under the destination mount and is marked
 *       as shared. The cloned mount is added to the peer group of the source
 *       mount.
 * (+++) the mount is propagated to all the mounts in the propagation tree
 *       of the destination mount and the cloned mount is made slave
 *       of the same master as that of the source mount. The cloned mount
 *       is marked as 'shared and slave'.
 * (*)   the cloned mount is made a slave of the same master as that of the
 *          source mount.
 *
 * ---------------------------------------------------------------------------
 * |                         MOVE MOUNT OPERATION                                 |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
 * ***************************************************************************
 *
 * (+)  the mount is moved to the destination. And is then propagated to
 *         all the mounts in the propagation tree of the destination mount.
 * (+*)  the mount is moved to the destination.
 * (+++)  the mount is moved to the destination and is then propagated to
 *         all the mounts belonging to the destination mount's propagation tree.
 *         the mount is marked as 'shared and slave'.
 * (*)        the mount continues to be a slave at the new location.
 *
 * If the source mount is a tree, the operations explained above are
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
 *
 * Context: The function expects namespace_lock() to be held.
 * Return: If @source_mnt was successfully attached 0 is returned.
 *         Otherwise a negative error code is returned.
 */
static int attach_recursive_mnt(struct mount *source_mnt,
                                struct mount *top_mnt,
                                struct mountpoint *dest_mp,
                                enum mnt_tree_flags_t flags)
{
        struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
        /* Copies created by propagate_mnt(), chained through mnt_hash. */
        HLIST_HEAD(tree_list);
        struct mnt_namespace *ns = top_mnt->mnt_ns;
        struct mountpoint *smp;
        struct mount *child, *dest_mnt, *p;
        struct hlist_node *n;
        int err = 0;
        bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH;

        /*
         * Preallocate a mountpoint in case the new mounts need to be
         * mounted beneath mounts on the same mountpoint.
         */
        smp = get_mountpoint(source_mnt->mnt.mnt_root);
        if (IS_ERR(smp))
                return PTR_ERR(smp);

        /* Is there space to add these mounts to the mount namespace? */
        if (!moving) {
                err = count_mounts(ns, source_mnt);
                if (err)
                        goto out;
        }

        /* When mounting beneath, the effective destination is top_mnt's parent. */
        if (beneath)
                dest_mnt = top_mnt->mnt_parent;
        else
                dest_mnt = top_mnt;

        /*
         * A shared destination means the source tree needs group ids and
         * must be propagated.  This is done before taking the mount hash
         * lock.
         */
        if (IS_MNT_SHARED(dest_mnt)) {
                err = invent_group_ids(source_mnt, true);
                if (err)
                        goto out;
                err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
        }
        lock_mount_hash();
        /* A propagate_mnt() failure is handled with the hash lock held. */
        if (err)
                goto out_cleanup_ids;

        if (IS_MNT_SHARED(dest_mnt)) {
                for (p = source_mnt; p; p = next_mnt(p, source_mnt))
                        set_mnt_shared(p);
        }

        if (moving) {
                /* Move: unhash from the old location, attach at the new one. */
                if (beneath)
                        dest_mp = smp;
                unhash_mnt(source_mnt);
                attach_mnt(source_mnt, top_mnt, dest_mp, beneath);
                touch_mnt_namespace(source_mnt->mnt_ns);
        } else {
                if (source_mnt->mnt_ns) {
                        LIST_HEAD(head);

                        /* move from anon - the caller will destroy */
                        for (p = source_mnt; p; p = next_mnt(p, source_mnt))
                                move_from_ns(p, &head);
                        list_del_init(&head);
                }
                if (beneath)
                        mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
                else
                        mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
                commit_tree(source_mnt);
        }

        /* Commit every propagated copy collected on tree_list. */
        hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
                struct mount *q;
                hlist_del_init(&child->mnt_hash);
                /*
                 * If a mount q is already found at the copy's mountpoint,
                 * re-point it via mnt_change_mountpoint() using the
                 * preallocated smp.
                 */
                q = __lookup_mnt(&child->mnt_parent->mnt,
                                 child->mnt_mountpoint);
                if (q)
                        mnt_change_mountpoint(child, smp, q);
                /* Notice when we are propagating across user namespaces */
                if (child->mnt_parent->mnt_ns->user_ns != user_ns)
                        lock_mnt_tree(child);
                /* The root of each propagated copy itself is left unlocked. */
                child->mnt.mnt_flags &= ~MNT_LOCKED;
                commit_tree(child);
        }
        put_mountpoint(smp);
        unlock_mount_hash();

        return 0;

 out_cleanup_ids:
        /* Reached with the mount hash lock held: discard propagated copies. */
        while (!hlist_empty(&tree_list)) {
                child = hlist_entry(tree_list.first, struct mount, mnt_hash);
                child->mnt_parent->mnt_ns->pending_mounts = 0;
                umount_tree(child, UMOUNT_SYNC);
        }
        unlock_mount_hash();
        cleanup_group_ids(source_mnt, NULL);
 out:
        /* Reached without the hash lock: reset the pending count, drop smp. */
        ns->pending_mounts = 0;

        read_seqlock_excl(&mount_lock);
        put_mountpoint(smp);
        read_sequnlock_excl(&mount_lock);

        return err;
}

/**
 * do_lock_mount - lock mount and mountpoint
 * @path:    target path
 * @beneath: whether the intention is to mount beneath @path
 *
 * Follow the mount stack on @path until the top mount @mnt is found. If
 * the initial @path->{mnt,dentry} is a mountpoint lookup the first
 * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root}
 * until nothing is stacked on top of it anymore.
 *
 * Acquire the inode_lock() on the top mount's ->mnt_root to protect
 * against concurrent removal of the new mountpoint from another mount
 * namespace.
 *
 * If @beneath is requested, the inode_lock() on @mnt's mountpoint @mp
 * on @mnt->mnt_parent must be acquired instead. This protects against
 * a concurrent unlink of @mp->m_dentry from another mount namespace
 * where @mnt doesn't have a child mount mounted @mp. A concurrent
 * removal of @mnt->mnt_root doesn't matter as nothing will be mounted
 * on top of it for @beneath.
 *
 * In addition, @beneath needs to make sure that @mnt hasn't been
 * unmounted or moved from its current mountpoint in between dropping
 * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt
 * being unmounted would be detected later by e.g., calling
 * check_mnt(mnt) in the function it's called from. For the @beneath
 * case however, it's useful to detect it directly in do_lock_mount().
 * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points
 * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will
 * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL.
 *
 * Return: Either the target mountpoint on the top mount or the top
 *         mount's mountpoint.
 */
static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
{
        struct vfsmount *mnt = path->mnt;
        struct dentry *dentry;
        /* Default result for the early "goto out" exits below. */
        struct mountpoint *mp = ERR_PTR(-ENOENT);

        for (;;) {
                struct mount *m;

                if (beneath) {
                        /*
                         * Lock target is @mnt's own mountpoint; take a
                         * reference on it while holding mount_lock.
                         */
                        m = real_mount(mnt);
                        read_seqlock_excl(&mount_lock);
                        dentry = dget(m->mnt_mountpoint);
                        read_sequnlock_excl(&mount_lock);
                } else {
                        /* No extra reference: @path already holds this dentry. */
                        dentry = path->dentry;
                }

                inode_lock(dentry->d_inode);
                if (unlikely(cant_mount(dentry))) {
                        inode_unlock(dentry->d_inode);
                        goto out;
                }

                namespace_lock();

                /*
                 * For @beneath, recheck under namespace_lock() that @mnt is
                 * still mounted and still sits on the dentry we locked.
                 */
                if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
                        namespace_unlock();
                        inode_unlock(dentry->d_inode);
                        goto out;
                }

                /* Nothing stacked on @path: this is the top mount, stop here. */
                mnt = lookup_mnt(path);
                if (likely(!mnt))
                        break;

                /*
                 * Something is mounted on @path: drop both locks, step @path
                 * onto that mount's root and try again.
                 */
                namespace_unlock();
                inode_unlock(dentry->d_inode);
                if (beneath)
                        dput(dentry);
                path_put(path);
                path->mnt = mnt;
                path->dentry = dget(mnt->mnt_root);
        }

        /*
         * On success the namespace lock and the inode lock stay held on
         * return; they are dropped here only if get_mountpoint() fails.
         */
        mp = get_mountpoint(dentry);
        if (IS_ERR(mp)) {
                namespace_unlock();
                inode_unlock(dentry->d_inode);
        }

out:
        /* Balance the dget() taken for the @beneath case. */
        if (beneath)
                dput(dentry);

        return mp;
}

/* do_lock_mount() for the ordinary case, i.e. with @beneath set to false. */
static inline struct mountpoint *lock_mount(struct path *path)
{
        return do_lock_mount(path, false);
}

/*
 * Drop the mountpoint reference on @where (under mount_lock), then release
 * the namespace lock and the inode lock of the mountpoint's dentry.
 */
static void unlock_mount(struct mountpoint *where)
{
        /* Remember the dentry before the mountpoint reference is dropped. */
        struct dentry *mp_dentry = where->m_dentry;

        read_seqlock_excl(&mount_lock);
        put_mountpoint(where);
        read_sequnlock_excl(&mount_lock);

        namespace_unlock();
        inode_unlock(mp_dentry->d_inode);
}

static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
{
        if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
                return -EINVAL;

        if (d_is_dir(mp->m_dentry) !=
              d_is_dir(mnt->mnt.mnt_root))
                return -ENOTDIR;

        return attach_recursive_mnt(mnt, p, mp, 0);
}

/*
 * Sanity check the flags to change_mnt_propagation.
 */

/*
 * Extract the propagation type from @ms_flags, ignoring MS_REC and
 * MS_SILENT.  Returns the single propagation flag that is set, or 0 if any
 * other flag is present or if not exactly one propagation flag is set.
 */
static int flags_to_propagation_type(int ms_flags)
{
        const int prop_mask = MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE;
        int type = ms_flags & ~(MS_REC | MS_SILENT);

        /* Non-propagation bits, or zero/multiple propagation bits: invalid. */
        if ((type & ~prop_mask) || !is_power_of_2(type))
                return 0;

        return type;
}

/*
 * recursively change the type of the mountpoint.
 */
/*
 * Change the propagation type of the mount at @path, and of the mounts
 * below it when MS_REC is set.  @path must be the root of a mount and
 * @ms_flags must encode exactly one propagation type, else -EINVAL.
 */
static int do_change_type(struct path *path, int ms_flags)
{
        struct mount *mnt = real_mount(path->mnt);
        int recurse = ms_flags & MS_REC;
        struct mount *cur;
        int type;
        int err = 0;

        if (!path_mounted(path))
                return -EINVAL;

        type = flags_to_propagation_type(ms_flags);
        if (!type)
                return -EINVAL;

        namespace_lock();

        /* Becoming shared requires group ids; allocate them up front. */
        if (type == MS_SHARED)
                err = invent_group_ids(mnt, recurse);

        if (!err) {
                lock_mount_hash();
                for (cur = mnt; cur; cur = (recurse ? next_mnt(cur, mnt) : NULL))
                        change_mnt_propagation(cur, type);
                unlock_mount_hash();
        }

        namespace_unlock();
        return err;
}

/*
 * Produce the copy used for a bind mount of @old_path: the whole tree if
 * @recurse, a single mount otherwise.  The copy's top mount has MNT_LOCKED
 * cleared.  Returns ERR_PTR(-EINVAL) if the source is unbindable, fails
 * check_mnt() without being a namespace-file dentry, or (non-recursive
 * case) has locked children; errors from the copy itself are passed on.
 */
static struct mount *__do_loopback(struct path *old_path, int recurse)
{
        struct mount *old = real_mount(old_path->mnt);
        struct mount *copy;

        if (IS_MNT_UNBINDABLE(old))
                return ERR_PTR(-EINVAL);

        if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
                return ERR_PTR(-EINVAL);

        if (!recurse && has_locked_children(old, old_path->dentry))
                return ERR_PTR(-EINVAL);

        if (recurse)
                copy = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
        else
                copy = clone_mnt(old, old_path->dentry, 0);

        if (IS_ERR(copy))
                return copy;

        copy->mnt.mnt_flags &= ~MNT_LOCKED;
        return copy;
}

/*
 * do loopback mount.
 */
/*
 * Bind mount: look up @old_name, copy the mount (tree, if @recurse) found
 * there and graft the copy onto @path.  Returns 0 or a negative errno.
 */
static int do_loopback(struct path *path, const char *old_name,
                                int recurse)
{
        struct path old_path;
        struct mountpoint *mp;
        struct mount *parent, *copy;
        int err;

        if (!old_name || !*old_name)
                return -EINVAL;

        err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
        if (err)
                return err;

        /* Binding this namespace file could create a namespace loop. */
        if (mnt_ns_loop(old_path.dentry)) {
                err = -EINVAL;
                goto out_put;
        }

        mp = lock_mount(path);
        if (IS_ERR(mp)) {
                err = PTR_ERR(mp);
                goto out_put;
        }

        /* Read path->mnt only now: lock_mount() may have updated @path. */
        parent = real_mount(path->mnt);
        if (!check_mnt(parent)) {
                err = -EINVAL;
                goto out_unlock;
        }

        copy = __do_loopback(&old_path, recurse);
        if (IS_ERR(copy)) {
                err = PTR_ERR(copy);
                goto out_unlock;
        }

        err = graft_tree(copy, parent, mp);
        if (err) {
                /* Grafting failed: dispose of the copy we just made. */
                lock_mount_hash();
                umount_tree(copy, UMOUNT_SYNC);
                unlock_mount_hash();
        }
out_unlock:
        unlock_mount(mp);
out_put:
        path_put(&old_path);
        return err;
}

/*
 * Create a detached copy of the mount at @path and open it with O_PATH.
 *
 * @path:      location to copy.  Once the copy has been made, @path->mnt is
 *             switched over to the root mount of the copy (the reference to
 *             the original mount is dropped here).
 * @recursive: true to copy the whole subtree, false for a single mount
 *
 * The copy is placed in a freshly allocated anonymous mount namespace that
 * uses the user namespace of the caller's current mount namespace.
 *
 * Returns the opened file, or an ERR_PTR() if the namespace allocation, the
 * copy or the open fails.
 */
static struct file *open_detached_copy(struct path *path, bool recursive)
{
        struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
        /* "true" requests an anonymous namespace from alloc_mnt_ns() */
        struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
        struct mount *mnt, *p;
        struct file *file;

        if (IS_ERR(ns))
                return ERR_CAST(ns);

        namespace_lock();
        mnt = __do_loopback(path, recursive);
        if (IS_ERR(mnt)) {
                namespace_unlock();
                free_mnt_ns(ns);
                return ERR_CAST(mnt);
        }

        /* Hand every mount of the copied tree over to the new namespace. */
        lock_mount_hash();
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                mnt_add_to_ns(ns, p);
                ns->nr_mounts++;
        }
        ns->root = mnt;
        /* extra reference for @path->mnt, which is repointed below */
        mntget(&mnt->mnt);
        unlock_mount_hash();
        namespace_unlock();

        mntput(path->mnt);
        path->mnt = &mnt->mnt;
        file = dentry_open(path, O_PATH, current_cred());
        if (IS_ERR(file))
                /* open failed: dispose of the detached copy right away */
                dissolve_on_fput(path->mnt);
        else
                file->f_mode |= FMODE_NEED_UNMOUNT;
        return file;
}

/*
 * open_tree(2): return an O_PATH file descriptor for the object found at
 * @dfd/@filename.  With OPEN_TREE_CLONE the descriptor refers to a detached
 * copy of the mount (of the whole subtree if AT_RECURSIVE is given as well);
 * this requires may_mount().
 *
 * Returns the new file descriptor or a negative errno.
 */
SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
{
        const unsigned int valid_flags = AT_EMPTY_PATH | AT_NO_AUTOMOUNT |
                                         AT_RECURSIVE | AT_SYMLINK_NOFOLLOW |
                                         OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC;
        const bool detached = flags & OPEN_TREE_CLONE;
        int lookup_flags = 0;
        struct path path;
        struct file *file;
        int error;
        int fd;

        BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);

        if (flags & ~valid_flags)
                return -EINVAL;

        /* AT_RECURSIVE on its own, without OPEN_TREE_CLONE, is rejected. */
        if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
                return -EINVAL;

        /* Automount triggering and symlink following are on by default. */
        if (!(flags & AT_NO_AUTOMOUNT))
                lookup_flags |= LOOKUP_AUTOMOUNT;
        if (!(flags & AT_SYMLINK_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

        if (detached && !may_mount())
                return -EPERM;

        fd = get_unused_fd_flags(flags & O_CLOEXEC);
        if (fd < 0)
                return fd;

        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (unlikely(error)) {
                file = ERR_PTR(error);
        } else {
                file = detached ?
                        open_detached_copy(&path, flags & AT_RECURSIVE) :
                        dentry_open(&path, O_PATH, current_cred());
                path_put(&path);
        }

        if (!IS_ERR(file)) {
                fd_install(fd, file);
                return fd;
        }

        /* Lookup or open failed: give the reserved descriptor back. */
        put_unused_fd(fd);
        return PTR_ERR(file);
}

/*
 * Don't allow locked mount flags to be cleared.
 *
 * @mnt:       mount whose current flags carry the MNT_LOCK_* bits
 * @mnt_flags: the flag set the caller wants to apply
 *
 * Returns true if every locked restriction is still present in @mnt_flags
 * (and, when atime is locked, the atime mode is unchanged), false otherwise.
 *
 * No locks need to be held here while testing the various MNT_LOCK
 * flags because those flags can never be cleared once they are set.
 */
static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
{
        const unsigned int cur = mnt->mnt.mnt_flags;

        const bool keeps_ro = !(cur & MNT_LOCK_READONLY) ||
                              (mnt_flags & MNT_READONLY);
        const bool keeps_nodev = !(cur & MNT_LOCK_NODEV) ||
                                 (mnt_flags & MNT_NODEV);
        const bool keeps_nosuid = !(cur & MNT_LOCK_NOSUID) ||
                                  (mnt_flags & MNT_NOSUID);
        const bool keeps_noexec = !(cur & MNT_LOCK_NOEXEC) ||
                                  (mnt_flags & MNT_NOEXEC);
        /* a locked atime mode must be carried over unchanged */
        const bool keeps_atime = !(cur & MNT_LOCK_ATIME) ||
                                 ((cur & MNT_ATIME_MASK) ==
                                  (mnt_flags & MNT_ATIME_MASK));

        return keeps_ro && keeps_nodev && keeps_nosuid && keeps_noexec &&
               keeps_atime;
}

/*
 * Bring the read-only state of @mnt in line with MNT_READONLY in @mnt_flags.
 *
 * Nothing is done when the state already matches.  Switching to read-only
 * goes through mnt_make_readonly() and returns its result; switching to
 * read-write just clears MNT_READONLY and returns 0.
 */
static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
{
        const bool want_ro = (mnt_flags & MNT_READONLY);

        if (want_ro != __mnt_is_readonly(&mnt->mnt)) {
                if (want_ro)
                        return mnt_make_readonly(mnt);

                mnt->mnt.mnt_flags &= ~MNT_READONLY;
        }
        return 0;
}

/*
 * Install @mnt_flags on @mnt while keeping every currently set flag that lies
 * outside MNT_USER_SETTABLE_MASK, then touch the mount's namespace.
 */
static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
{
        const unsigned int preserved =
                mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;

        mnt->mnt.mnt_flags = mnt_flags | preserved;
        touch_mnt_namespace(mnt->mnt_ns);
}

/*
 * Warn, once per superblock, when a writable mount sits on a filesystem
 * whose largest supported timestamp (sb->s_time_max) lies less than
 * TIME_UPTIME_SEC_MAX seconds after the current wall-clock time.
 *
 * @mountpoint: path used to render the mount location in the message
 * @mnt:        mount that is being mounted or remounted
 *
 * Rendering the path is best effort: if the scratch page cannot be allocated
 * or d_path() fails, the location is printed as "(unknown)" instead of
 * passing an error pointer as the %s argument.
 */
static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
{
        struct super_block *sb = mnt->mnt_sb;
        const char *mntpath = "(unknown)";
        char *buf;

        if (__mnt_is_readonly(mnt))
                return;
        if (sb->s_iflags & SB_I_TS_EXPIRY_WARNED)
                return;
        if (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX <= sb->s_time_max)
                return;

        buf = (char *)__get_free_page(GFP_KERNEL);
        if (buf) {
                char *p = d_path(mountpoint, buf, PAGE_SIZE);

                /* d_path() reports failure through an error pointer */
                if (!IS_ERR(p))
                        mntpath = p;
        }

        pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
                sb->s_type->name,
                is_mounted(mnt) ? "remounted" : "mounted",
                mntpath, &sb->s_time_max,
                (unsigned long long)sb->s_time_max);

        /* Remember that this superblock has been warned about. */
        sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
        if (buf)
                free_page((unsigned long)buf);
}

/*
 * Handle reconfiguration of the mountpoint only without alteration of the
 * superblock it refers to.  This is triggered by specifying MS_REMOUNT|MS_BIND
 * to mount(2).
 *
 * Returns 0 on success, -EINVAL if @path fails check_mnt() or
 * path_mounted(), -EPERM if a locked flag would be dropped, or the error
 * from change_mount_ro_state().
 */
static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
{
        struct mount *mnt = real_mount(path->mnt);
        struct super_block *sb = path->mnt->mnt_sb;
        int ret;

        if (!check_mnt(mnt) || !path_mounted(path))
                return -EINVAL;

        if (!can_change_locked_flags(mnt, mnt_flags))
                return -EPERM;

        /*
         * The superblock's read-only state is only inspected, never
         * modified, so a read lock on s_umount is sufficient.
         */
        down_read(&sb->s_umount);
        lock_mount_hash();

        ret = change_mount_ro_state(mnt, mnt_flags);
        if (!ret)
                set_mount_attributes(mnt, mnt_flags);

        unlock_mount_hash();
        up_read(&sb->s_umount);

        mnt_warn_timestamp_expiry(path, &mnt->mnt);

        return ret;
}

/*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
 * on it - tough luck.
 *
 * @ms_flags is not used by this function.  Returns 0 on success or a
 * negative errno; -EPERM is returned both when a locked mount flag would be
 * dropped and when the caller lacks CAP_SYS_ADMIN in the superblock's user
 * namespace.
 */
static int do_remount(struct path *path, int ms_flags, int sb_flags,
                      int mnt_flags, void *data)
{
        struct mount *mnt = real_mount(path->mnt);
        struct super_block *sb = path->mnt->mnt_sb;
        struct fs_context *fc;
        int err;

        if (!check_mnt(mnt) || !path_mounted(path))
                return -EINVAL;

        if (!can_change_locked_flags(mnt, mnt_flags))
                return -EPERM;

        fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
        if (IS_ERR(fc))
                return PTR_ERR(fc);

        /*
         * Let the filesystem know that this remount request arrived via
         * the legacy mount system call.
         */
        fc->oldapi = true;

        err = parse_monolithic_mount_data(fc, data);
        if (err)
                goto out;

        down_write(&sb->s_umount);
        if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
                err = -EPERM;
        } else {
                err = reconfigure_super(fc);
                if (!err) {
                        lock_mount_hash();
                        set_mount_attributes(mnt, mnt_flags);
                        unlock_mount_hash();
                }
        }
        up_write(&sb->s_umount);
out:
        /* runs on the failure paths as well */
        mnt_warn_timestamp_expiry(path, &mnt->mnt);

        put_fs_context(fc);
        return err;
}

/*
 * Return 1 if @mnt or any mount reached from it via next_mnt() is
 * unbindable, 0 otherwise.
 */
static inline int tree_contains_unbindable(struct mount *mnt)
{
        struct mount *cur = mnt;

        while (cur) {
                if (IS_MNT_UNBINDABLE(cur))
                        return 1;
                cur = next_mnt(cur, mnt);
        }

        return 0;
}

/*
 * Check that there aren't references to earlier/same mount namespaces in the
 * specified subtree.  Such references can act as pins for mount namespaces
 * that aren't checked by the mount-cycle checking code, thereby allowing
 * cycles to be made.
 *
 * Returns true when no mount in @subtree has a root for which
 * mnt_ns_loop() reports a loop, false as soon as one is found.
 */
static bool check_for_nsfs_mounts(struct mount *subtree)
{
        struct mount *cur;
        bool clean = true;

        lock_mount_hash();
        for (cur = subtree; cur; cur = next_mnt(cur, subtree)) {
                if (mnt_ns_loop(cur->mnt.mnt_root)) {
                        clean = false;
                        break;
                }
        }
        unlock_mount_hash();

        return clean;
}

/*
 * Give the mount at @to_path the same propagation membership as the mount at
 * @from_path.
 *
 * If @from is a slave, @to is added to the slave list of @from's master.  If
 * @from is shared, @to takes over @from's group id, joins its share list and
 * is marked shared.
 *
 * Returns 0 on success, -EPERM if the caller lacks CAP_SYS_ADMIN in the user
 * namespace of either mount's namespace, and -EINVAL if any of the other
 * preconditions checked below is not met.
 */
static int do_set_group(struct path *from_path, struct path *to_path)
{
        struct mount *from, *to;
        int err;

        from = real_mount(from_path->mnt);
        to = real_mount(to_path->mnt);

        namespace_lock();

        err = -EINVAL;
        /* To and From must be mounted */
        if (!is_mounted(&from->mnt))
                goto out;
        if (!is_mounted(&to->mnt))
                goto out;

        err = -EPERM;
        /* We should be allowed to modify mount namespaces of both mounts */
        if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
                goto out;
        if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
                goto out;

        err = -EINVAL;
        /* To and From paths should be mount roots */
        if (!path_mounted(from_path))
                goto out;
        if (!path_mounted(to_path))
                goto out;

        /* Setting sharing groups is only allowed across same superblock */
        if (from->mnt.mnt_sb != to->mnt.mnt_sb)
                goto out;

        /* From mount root should be wider than To mount root */
        if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
                goto out;

        /* From mount should not have locked children in place of To's root */
        if (has_locked_children(from, to->mnt.mnt_root))
                goto out;

        /* Setting sharing groups is only allowed on private mounts */
        if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
                goto out;

        /* From should not be private */
        if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
                goto out;

        /* Become a slave of the same master as From */
        if (IS_MNT_SLAVE(from)) {
                struct mount *m = from->mnt_master;

                list_add(&to->mnt_slave, &m->mnt_slave_list);
                to->mnt_master = m;
        }

        /* Join From's peer group */
        if (IS_MNT_SHARED(from)) {
                to->mnt_group_id = from->mnt_group_id;
                list_add(&to->mnt_share, &from->mnt_share);
                lock_mount_hash();
                set_mnt_shared(to);
                unlock_mount_hash();
        }

        err = 0;
out:
        namespace_unlock();
        return err;
}

/**
 * path_overmounted - check if path is overmounted
 * @path: path to check
 *
 * A path counts as overmounted when __lookup_mnt() finds a mount for
 * @path->mnt with @path->dentry as mountpoint.  The lookup is done under
 * rcu_read_lock().
 *
 * Context: This function expects namespace_lock() to be held.
 * Return: If path is overmounted true is returned, false if not.
 */
static inline bool path_overmounted(const struct path *path)
{
        bool covered;

        rcu_read_lock();
        covered = unlikely(__lookup_mnt(path->mnt, path->dentry));
        rcu_read_unlock();

        return covered;
}

/**
 * can_move_mount_beneath - check that we can mount beneath the top mount
 * @from: mount to mount beneath
 * @to:   mount under which to mount
 * @mp:   mountpoint of @to
 *
 * - Make sure that @to->dentry is actually the root of a mount under
 *   which we can mount another mount.
 * - Make sure that nothing can be mounted beneath the caller's current
 *   root or the rootfs of the namespace.
 * - Make sure that the caller can unmount the topmost mount ensuring
 *   that the caller could reveal the underlying mountpoint.
 * - Ensure that nothing has been mounted on top of @from before we
 *   grabbed @namespace_sem to avoid creating pointless shadow mounts.
 * - Make sure that the mount of @to is neither the mount of @from nor one
 *   of its ancestors.
 * - Prevent mounting beneath a mount if the propagation relationship
 *   between the source mount, parent mount, and top mount would lead to
 *   nonsensical mount trees.
 *
 * Context: This function expects namespace_lock() to be held.
 * Return: On success 0, and on error a negative error code is returned.
 *         Every failing check in this function yields -EINVAL.
 */
static int can_move_mount_beneath(const struct path *from,
                                  const struct path *to,
                                  const struct mountpoint *mp)
{
        struct mount *mnt_from = real_mount(from->mnt),
                     *mnt_to = real_mount(to->mnt),
                     *parent_mnt_to = mnt_to->mnt_parent;

        /* @mnt_to must itself be mounted on something. */
        if (!mnt_has_parent(mnt_to))
                return -EINVAL;

        /* @to must refer to the root of its mount. */
        if (!path_mounted(to))
                return -EINVAL;

        if (IS_MNT_LOCKED(mnt_to))
                return -EINVAL;

        /* Avoid creating shadow mounts during mount propagation. */
        if (path_overmounted(from))
                return -EINVAL;

        /*
         * Mounting beneath the rootfs only makes sense when the
         * semantics of pivot_root(".", ".") are used.
         */
        if (&mnt_to->mnt == current->fs->root.mnt)
                return -EINVAL;
        if (parent_mnt_to == current->nsproxy->mnt_ns->root)
                return -EINVAL;

        /* Walk up from @mnt_from: @mnt_to must not be it or an ancestor. */
        for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent)
                if (p == mnt_to)
                        return -EINVAL;

        /*
         * If the parent mount propagates to the child mount this would
         * mean mounting @mnt_from on @mnt_to->mnt_parent and then
         * propagating a copy @c of @mnt_from on top of @mnt_to. This
         * defeats the whole purpose of mounting beneath another mount.
         */
        if (propagation_would_overmount(parent_mnt_to, mnt_to, mp))
                return -EINVAL;

        /*
         * If @mnt_to->mnt_parent propagates to @mnt_from this would
         * mean propagating a copy @c of @mnt_from on top of @mnt_from.
         * Afterwards @mnt_from would be mounted on top of
         * @mnt_to->mnt_parent and @mnt_to would be unmounted from
         * @mnt_to->mnt_parent and remounted on @mnt_from. But since @c is
         * already mounted on @mnt_from, @mnt_to would ultimately be
         * remounted on top of @c. Afterwards, @mnt_from would be
         * covered by a copy @c of @mnt_from and @c would be covered by
         * @mnt_from itself. This defeats the whole purpose of mounting
         * @mnt_from beneath @mnt_to.
         */
        if (propagation_would_overmount(parent_mnt_to, mnt_from, mp))
                return -EINVAL;

        return 0;
}

/*
 * Move the mount at @old_path to @new_path.
 *
 * @old_path: root of the mount to move.  It is either attached (has a
 *            parent) and passes check_mnt(), or it is the parentless root of
 *            an anonymous namespace.
 * @new_path: destination
 * @beneath:  if true, mount beneath the mount at @new_path instead of on top
 *            of it (see can_move_mount_beneath())
 *
 * Returns 0 on success or a negative errno: -EINVAL when a precondition
 * fails, -ELOOP when the nsfs check or the ancestry check fails, or the
 * error from do_lock_mount() / attach_recursive_mnt().
 */
static int do_move_mount(struct path *old_path, struct path *new_path,
                         bool beneath)
{
        struct mnt_namespace *ns;
        struct mount *p;
        struct mount *old;
        struct mount *parent;
        struct mountpoint *mp, *old_mp;
        int err;
        bool attached;
        enum mnt_tree_flags_t flags = 0;

        mp = do_lock_mount(new_path, beneath);
        if (IS_ERR(mp))
                return PTR_ERR(mp);

        /*
         * Sample the old parent, mountpoint and namespace up front; they are
         * needed for the cleanup after attach_recursive_mnt() has run.
         */
        old = real_mount(old_path->mnt);
        p = real_mount(new_path->mnt);
        parent = old->mnt_parent;
        attached = mnt_has_parent(old);
        if (attached)
                flags |= MNT_TREE_MOVE;
        old_mp = old->mnt_mp;
        ns = old->mnt_ns;

        err = -EINVAL;
        /* The mountpoint must be in our namespace. */
        if (!check_mnt(p))
                goto out;

        /* The thing moved must be mounted... */
        if (!is_mounted(&old->mnt))
                goto out;

        /* ... and either ours or the root of anon namespace */
        if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
                goto out;

        if (old->mnt.mnt_flags & MNT_LOCKED)
                goto out;

        /* @old_path must be the root of the mount being moved. */
        if (!path_mounted(old_path))
                goto out;

        /* Source and destination must both be directories or both not. */
        if (d_is_dir(new_path->dentry) !=
            d_is_dir(old_path->dentry))
                goto out;
        /*
         * Don't move a mount residing in a shared parent.
         */
        if (attached && IS_MNT_SHARED(parent))
                goto out;

        if (beneath) {
                err = can_move_mount_beneath(old_path, new_path, mp);
                if (err)
                        goto out;

                err = -EINVAL;
                /* the remaining checks apply to the parent of the top mount */
                p = p->mnt_parent;
                flags |= MNT_TREE_BENEATH;
        }

        /*
         * Don't move a mount tree containing unbindable mounts to a destination
         * mount which is shared.
         */
        if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
                goto out;
        err = -ELOOP;
        if (!check_for_nsfs_mounts(old))
                goto out;
        /* Refuse to move a mount underneath itself. */
        for (; mnt_has_parent(p); p = p->mnt_parent)
                if (p == old)
                        goto out;

        err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags);
        if (err)
                goto out;

        /* if the mount is moved, it should no longer be expired
         * automatically */
        list_del_init(&old->mnt_expire);
        if (attached)
                put_mountpoint(old_mp);
out:
        unlock_mount(mp);
        if (!err) {
                /*
                 * Success: drop the reference on the old parent, or free the
                 * anonymous namespace the mount used to be the root of.
                 */
                if (attached)
                        mntput_no_expire(parent);
                else
                        free_mnt_ns(ns);
        }
        return err;
}

/*
 * Legacy entry point for moving a mount: resolve the kernel-space pathname
 * @old_name (following symlinks) and move that mount to @path.
 *
 * Returns -EINVAL for a NULL or empty @old_name, the kern_path() error if
 * the lookup fails, and otherwise the result of do_move_mount().
 */
static int do_move_mount_old(struct path *path, const char *old_name)
{
        struct path old_path;
        int err;

        if (!old_name || old_name[0] == '\0')
                return -EINVAL;

        err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
        if (!err) {
                err = do_move_mount(&old_path, path, false);
                path_put(&old_path);
        }

        return err;
}

/*
 * add a mount into a namespace's mount tree
 *
 * @newmnt:    mount to add
 * @mp:        mountpoint to attach it to
 * @path:      location being mounted on
 * @mnt_flags: flags for @newmnt; MNT_INTERNAL_FLAGS are stripped first
 *
 * Returns the result of graft_tree(), -EBUSY if @newmnt has the same
 * superblock as @path->mnt and @path is that mount's root, or -EINVAL if
 * the parent is not acceptable or the root of @newmnt is a symlink.
 */
static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
                        const struct path *path, int mnt_flags)
{
        struct mount *parent = real_mount(path->mnt);

        mnt_flags &= ~MNT_INTERNAL_FLAGS;

        if (unlikely(!check_mnt(parent))) {
                /*
                 * Only acceptable for automounts done in a private ns, and
                 * even then the mountpoint had better still be alive.
                 */
                if (!(mnt_flags & MNT_SHRINKABLE) || !parent->mnt_ns)
                        return -EINVAL;
        }

        /* Refuse the same filesystem on the same mount point */
        if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path))
                return -EBUSY;

        if (d_is_symlink(newmnt->mnt.mnt_root))
                return -EINVAL;

        newmnt->mnt.mnt_flags = mnt_flags;

        return graft_tree(newmnt, parent, mp);
}

static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);

/*
 * Create a new mount using a superblock configuration and request it
 * be added to the namespace tree.
 *
 * @fc:         filesystem context whose root (fc->root) has been set up
 * @mountpoint: where the new mount is to be attached
 * @mnt_flags:  mount flags; may be adjusted by mount_too_revealing()
 *
 * NOTE(review): sb->s_umount appears to be held for write on entry - it is
 * released here either through fc_drop_locked() or the explicit up_write().
 * Confirm against the caller's vfs_get_tree() contract.
 *
 * Returns 0 on success or a negative errno.
 */
static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
                           unsigned int mnt_flags)
{
        struct vfsmount *mnt;
        struct mountpoint *mp;
        struct super_block *sb = fc->root->d_sb;
        int error;

        error = security_sb_kern_mount(sb);
        if (!error && mount_too_revealing(sb, &mnt_flags))
                error = -EPERM;

        if (unlikely(error)) {
                fc_drop_locked(fc);
                return error;
        }

        up_write(&sb->s_umount);

        mnt = vfs_create_mount(fc);
        if (IS_ERR(mnt))
                return PTR_ERR(mnt);

        mnt_warn_timestamp_expiry(mountpoint, mnt);

        mp = lock_mount(mountpoint);
        if (IS_ERR(mp)) {
                /* could not lock the mountpoint: drop the new mount again */
                mntput(mnt);
                return PTR_ERR(mp);
        }
        error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
        unlock_mount(mp);
        if (error < 0)
                mntput(mnt);
        return error;
}

/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 *
 * @path:      where to mount
 * @fstype:    filesystem type name; for types with FS_HAS_SUBTYPE the part
 *             after the first '.' is passed on as the "subtype" option
 * @sb_flags:  superblock flags for the new filesystem context
 * @mnt_flags: per-mount flags
 * @name:      optional source, passed on as the "source" option
 * @data:      monolithic mount option data
 *
 * Returns 0 on success or a negative errno (-EINVAL without @fstype or with
 * an empty subtype, -ENODEV for an unknown type, -EPERM without mount
 * capability).
 */
static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
                        int mnt_flags, const char *name, void *data)
{
        struct file_system_type *type;
        struct fs_context *fc;
        const char *subtype = NULL;
        int err = 0;

        if (!fstype)
                return -EINVAL;

        type = get_fs_type(fstype);
        if (!type)
                return -ENODEV;

        if (type->fs_flags & FS_HAS_SUBTYPE) {
                const char *dot = strchr(fstype, '.');

                if (dot) {
                        /* "type." with nothing after the dot is invalid */
                        if (dot[1] == '\0') {
                                put_filesystem(type);
                                return -EINVAL;
                        }
                        subtype = dot + 1;
                }
        }

        fc = fs_context_for_mount(type, sb_flags);
        put_filesystem(type);
        if (IS_ERR(fc))
                return PTR_ERR(fc);

        /*
         * Let the filesystem know that this mount request arrived via the
         * legacy mount system call.
         */
        fc->oldapi = true;

        if (subtype) {
                err = vfs_parse_fs_string(fc, "subtype",
                                          subtype, strlen(subtype));
                if (err)
                        goto out;
        }
        if (name) {
                err = vfs_parse_fs_string(fc, "source", name, strlen(name));
                if (err)
                        goto out;
        }

        err = parse_monolithic_mount_data(fc, data);
        if (err)
                goto out;

        if (!mount_capable(fc)) {
                err = -EPERM;
                goto out;
        }

        err = vfs_get_tree(fc);
        if (err)
                goto out;

        err = do_new_mount_fc(fc, path, mnt_flags);
out:
        put_fs_context(fc);
        return err;
}

/*
 * Attach the automounted mount @m at @path.
 *
 * @m:    the new mount; NULL means there is nothing to do (0 is returned),
 *        an ERR_PTR() is converted to its errno and returned
 * @path: the automount point
 *
 * @m must arrive with at least two references (enforced by BUG_ON below).
 * On success one of them is dropped here.  On every discard path @m is
 * removed from any expiry list and both references are dropped.
 *
 * Returns 0 on success and also when @path turns out to be overmounted
 * already (in which case @m is quietly discarded), -ELOOP if @m would be
 * mounted on itself, -ENOENT if the dentry cannot be mounted on, or the
 * error from get_mountpoint() / do_add_mount().
 */
int finish_automount(struct vfsmount *m, const struct path *path)
{
        struct dentry *dentry = path->dentry;
        struct mountpoint *mp;
        struct mount *mnt;
        int err;

        if (!m)
                return 0;
        if (IS_ERR(m))
                return PTR_ERR(m);

        mnt = real_mount(m);
        /* The new mount record should have at least 2 refs to prevent it being
         * expired before we get a chance to add it
         */
        BUG_ON(mnt_get_count(mnt) < 2);

        /* Same superblock and same root as the mountpoint: mounting on itself. */
        if (m->mnt_sb == path->mnt->mnt_sb &&
            m->mnt_root == dentry) {
                err = -ELOOP;
                goto discard;
        }

        /*
         * we don't want to use lock_mount() - in this case finding something
         * that already overmounts our mountpoint means "quietly drop what
         * we've got", not "try to mount it on top".
         */
        inode_lock(dentry->d_inode);
        namespace_lock();
        if (unlikely(cant_mount(dentry))) {
                err = -ENOENT;
                goto discard_locked;
        }
        if (path_overmounted(path)) {
                err = 0;
                goto discard_locked;
        }
        mp = get_mountpoint(dentry);
        if (IS_ERR(mp)) {
                err = PTR_ERR(mp);
                goto discard_locked;
        }

        err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
        /*
         * NOTE(review): unlock_mount() is presumably what releases the
         * namespace lock and the inode lock taken above - there is no
         * explicit unlock on this path.  Confirm against unlock_mount().
         */
        unlock_mount(mp);
        if (unlikely(err))
                goto discard;
        mntput(m);
        return 0;

discard_locked:
        namespace_unlock();
        inode_unlock(dentry->d_inode);
discard:
        /* remove m from any expiration list it may be on */
        if (!list_empty(&mnt->mnt_expire)) {
                namespace_lock();
                list_del_init(&mnt->mnt_expire);
                namespace_unlock();
        }
        /* drop both references the mount arrived with */
        mntput(m);
        mntput(m);
        return err;
}

/**
 * mnt_set_expiry - Put a mount on an expiration list
 * @mnt: The mount to list.
 * @expiry_list: The list to add the mount to.
 */
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
        namespace_lock();

        list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);

        namespace_unlock();
}
EXPORT_SYMBOL(mnt_set_expiry);

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
 * here
 *
 * @mounts: expiry list, linked through mount->mnt_expire
 *
 * Each call sets mnt_expiry_mark on every mount on the list.  A mount is
 * unmounted only if the mark was already set when this call looked at it
 * and propagate_mount_busy() does not report it busy.
 */
void mark_mounts_for_expiry(struct list_head *mounts)
{
        struct mount *mnt, *next;
        LIST_HEAD(graveyard);

        if (list_empty(mounts))
                return;

        namespace_lock();
        lock_mount_hash();

        /* extract from the expiration list every vfsmount that matches the
         * following criteria:
         * - only referenced by its parent vfsmount
         * - still marked for expiry (marked on the last call here; marks are
         *   cleared by mntput())
         */
        list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
                /* xchg() sets the mark and tells us whether it was set before */
                if (!xchg(&mnt->mnt_expiry_mark, 1) ||
                        propagate_mount_busy(mnt, 1))
                        continue;
                list_move(&mnt->mnt_expire, &graveyard);
        }
        /*
         * NOTE(review): this loop relies on umount_tree() taking the mount
         * off the graveyard list - confirm against umount_tree().
         */
        while (!list_empty(&graveyard)) {
                mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
                touch_mnt_namespace(mnt->mnt_ns);
                umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
        }
        unlock_mount_hash();
        namespace_unlock();
}

EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);

/*
 * Ripoff of 'select_parent()'
 *
 * search the list of submounts for a given mountpoint, and move any
 * shrinkable submounts to the 'graveyard' list.
 *
 * The walk is iterative.  Mounts without MNT_SHRINKABLE are skipped together
 * with everything below them.  A shrinkable mount that has children is
 * descended into rather than moved; only shrinkable mounts without children
 * that propagate_mount_busy() does not report busy are moved (via their
 * mnt_expire link).
 *
 * Returns the number of mounts moved to @graveyard.
 */
static int select_submounts(struct mount *parent, struct list_head *graveyard)
{
        struct mount *this_parent = parent;
        struct list_head *next;
        int found = 0;

repeat:
        next = this_parent->mnt_mounts.next;
resume:
        while (next != &this_parent->mnt_mounts) {
                struct list_head *tmp = next;
                struct mount *mnt = list_entry(tmp, struct mount, mnt_child);

                /* advance first so the walk survives moving/descending */
                next = tmp->next;
                if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
                        continue;
                /*
                 * Descend a level if the mnt_mounts list is non-empty.
                 */
                if (!list_empty(&mnt->mnt_mounts)) {
                        this_parent = mnt;
                        goto repeat;
                }

                if (!propagate_mount_busy(mnt, 1)) {
                        list_move_tail(&mnt->mnt_expire, graveyard);
                        found++;
                }
        }
        /*
         * All done at this level ... ascend and resume the search
         */
        if (this_parent != parent) {
                /* continue with the sibling after the subtree just finished */
                next = this_parent->mnt_child.next;
                this_parent = this_parent->mnt_parent;
                goto resume;
        }
        return found;
}

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 *
 * select_submounts() is called repeatedly, and everything it collected is
 * unmounted, until a pass finds nothing more to collect.
 *
 * mount_lock must be held for write
 */
static void shrink_submounts(struct mount *mnt)
{
        LIST_HEAD(graveyard);

        for (;;) {
                /* extract submounts of 'mountpoint' from the expiration list */
                if (!select_submounts(mnt, &graveyard))
                        break;

                while (!list_empty(&graveyard)) {
                        struct mount *victim;

                        victim = list_first_entry(&graveyard, struct mount,
                                                  mnt_expire);
                        touch_mnt_namespace(victim->mnt_ns);
                        umount_tree(victim, UMOUNT_PROPAGATE|UMOUNT_SYNC);
                }
        }
}

/*
 * Copy up to one page of mount option data from user space.
 *
 * Returns NULL if @data is NULL, ERR_PTR(-ENOMEM) if the buffer cannot be
 * allocated, ERR_PTR(-EFAULT) if not a single byte could be read, and
 * otherwise a kmalloc()ed PAGE_SIZE buffer holding whatever could be copied.
 */
static void *copy_mount_options(const void __user * data)
{
        const char __user *src = data;
        unsigned left, offset;
        char *copy;

        if (!data)
                return NULL;

        copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (!copy)
                return ERR_PTR(-ENOMEM);

        left = copy_from_user(copy, data, PAGE_SIZE);

        /*
         * Not all architectures have an exact copy_from_user(), so pick up
         * whatever is left one byte at a time until a byte faults.
         */
        for (offset = PAGE_SIZE - left; left; left--, offset++) {
                char c;

                if (get_user(c, src + offset))
                        break;
                copy[offset] = c;
        }

        /* nothing at all was readable */
        if (left == PAGE_SIZE) {
                kfree(copy);
                return ERR_PTR(-EFAULT);
        }

        return copy;
}

/*
 * Duplicate a user-space string of at most PATH_MAX bytes via
 * strndup_user().  A NULL @data yields NULL.
 */
static char *copy_mount_string(const void __user *data)
{
        if (!data)
                return NULL;

        return strndup_user(data, PATH_MAX);
}

/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 *
 * @dev_name:  source name, handed to the bind, move and new-mount handlers
 * @path:      resolved mount target
 * @type_page: filesystem type name (used for new mounts)
 * @flags:     MS_* flags from mount(2)
 * @data_page: fs-dependent option data or NULL; its last byte is forced to
 *             0 here
 *
 * The MS_* flags are split into per-mount MNT_* flags and superblock SB_*
 * flags, then the request is dispatched to exactly one handler.
 *
 * Returns 0 on success or a negative errno.
 */
int path_mount(const char *dev_name, struct path *path,
                const char *type_page, unsigned long flags, void *data_page)
{
        unsigned int mnt_flags = 0, sb_flags;
        int ret;

        /* Discard magic */
        if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
                flags &= ~MS_MGC_MSK;

        /* Basic sanity checks */
        if (data_page)
                ((char *)data_page)[PAGE_SIZE - 1] = 0;

        if (flags & MS_NOUSER)
                return -EINVAL;

        ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
        if (ret)
                return ret;
        if (!may_mount())
                return -EPERM;
        if (flags & SB_MANDLOCK)
                warn_mandlock();

        /* Default to relatime unless overridden */
        if (!(flags & MS_NOATIME))
                mnt_flags |= MNT_RELATIME;

        /* Separate the per-mountpoint flags */
        if (flags & MS_NOSUID)
                mnt_flags |= MNT_NOSUID;
        if (flags & MS_NODEV)
                mnt_flags |= MNT_NODEV;
        if (flags & MS_NOEXEC)
                mnt_flags |= MNT_NOEXEC;
        if (flags & MS_NOATIME)
                mnt_flags |= MNT_NOATIME;
        if (flags & MS_NODIRATIME)
                mnt_flags |= MNT_NODIRATIME;
        /* strictatime wins over both the relatime default and noatime */
        if (flags & MS_STRICTATIME)
                mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
        if (flags & MS_RDONLY)
                mnt_flags |= MNT_READONLY;
        if (flags & MS_NOSYMFOLLOW)
                mnt_flags |= MNT_NOSYMFOLLOW;

        /* The default atime for remount is preservation */
        if ((flags & MS_REMOUNT) &&
            ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
                       MS_STRICTATIME)) == 0)) {
                mnt_flags &= ~MNT_ATIME_MASK;
                mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
        }

        /* Superblock flags that are passed through to the filesystem */
        sb_flags = flags & (SB_RDONLY |
                            SB_SYNCHRONOUS |
                            SB_MANDLOCK |
                            SB_DIRSYNC |
                            SB_SILENT |
                            SB_POSIXACL |
                            SB_LAZYTIME |
                            SB_I_VERSION);

        /*
         * Dispatch.  The order matters: remount (with or without MS_BIND)
         * is tested before bind, propagation changes and move; anything
         * left over is a new mount.
         */
        if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
                return do_reconfigure_mnt(path, mnt_flags);
        if (flags & MS_REMOUNT)
                return do_remount(path, flags, sb_flags, mnt_flags, data_page);
        if (flags & MS_BIND)
                return do_loopback(path, dev_name, flags & MS_REC);
        if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
                return do_change_type(path, flags);
        if (flags & MS_MOVE)
                return do_move_mount_old(path, dev_name);

        return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
                            data_page);
}

/*
 * Resolve the user-space pathname @dir_name (relative to the current working
 * directory, following symlinks) and perform the mount on it via
 * path_mount().
 *
 * Returns the lookup error if the path cannot be resolved, otherwise the
 * result of path_mount().
 */
long do_mount(const char *dev_name, const char __user *dir_name,
                const char *type_page, unsigned long flags, void *data_page)
{
        struct path path;
        int ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);

        if (!ret) {
                ret = path_mount(dev_name, &path, type_page, flags, data_page);
                path_put(&path);
        }

        return ret;
}

/*
 * Bump the UCOUNT_MNT_NAMESPACES counter in @ns for the caller's effective
 * uid.  The inc_ucount() result is returned unchanged; alloc_mnt_ns()
 * treats NULL as "limit reached".
 */
static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
{
        struct ucounts *counted;

        counted = inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
        return counted;
}

/*
 * Undo inc_mnt_namespaces(): drop one UCOUNT_MNT_NAMESPACES count from
 * @ucounts.
 */
static void dec_mnt_namespaces(struct ucounts *ucounts)
{
        dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
}

/*
 * Tear down a namespace object created by alloc_mnt_ns(): release its inode
 * number (non-anonymous namespaces only), give back the ucount and the
 * user namespace reference, then free the structure itself.
 */
static void free_mnt_ns(struct mnt_namespace *ns)
{
        struct ucounts *uc = ns->ucounts;
        struct user_namespace *owner = ns->user_ns;

        /* alloc_mnt_ns() only allocates an inum when the ns is not anonymous. */
        if (!is_anon_ns(ns))
                ns_free_inum(&ns->ns);

        dec_mnt_namespaces(uc);
        put_user_ns(owner);
        kfree(ns);
}

/*
 * Assign a sequence number so we can detect when we attempt to bind
 * mount a reference to an older mount namespace into the current
 * mount namespace, preventing reference counting loops.  Even a 64bit
 * number incrementing at 10GHz would take roughly 58 years to wrap
 * (2^64 / 10^10 per second is about 1.8 * 10^9 seconds), and the
 * increment in alloc_mnt_ns() is paired with allocating a whole
 * namespace, so in practice we can ignore the possibility.
 */
static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);

/*
 * Allocate and initialise an empty mount namespace owned by @user_ns.
 *
 * @anon: when true, no inode number is allocated and no sequence number
 *        is assigned (->seq stays 0 from kzalloc()).
 *
 * Returns the new namespace with a reference count of 1, or
 * ERR_PTR(-ENOSPC) if the ucount could not be taken, ERR_PTR(-ENOMEM) on
 * allocation failure, or the ns_alloc_inum() error.
 */
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
{
        struct mnt_namespace *ns;
        struct ucounts *uc;
        int err;

        uc = inc_mnt_namespaces(user_ns);
        if (!uc)
                return ERR_PTR(-ENOSPC);

        ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
        if (!ns) {
                err = -ENOMEM;
                goto out_ucount;
        }

        if (!anon) {
                err = ns_alloc_inum(&ns->ns);
                if (err)
                        goto out_free;
        }

        ns->ns.ops = &mntns_operations;
        if (!anon)
                ns->seq = atomic64_add_return(1, &mnt_ns_seq);
        refcount_set(&ns->ns.count, 1);
        ns->mounts = RB_ROOT;
        init_waitqueue_head(&ns->poll);
        ns->user_ns = get_user_ns(user_ns);
        ns->ucounts = uc;
        return ns;

out_free:
        kfree(ns);
out_ucount:
        dec_mnt_namespaces(uc);
        return ERR_PTR(err);
}

/*
 * copy_mnt_ns - share or duplicate a mount namespace
 * @flags:   clone flags; only CLONE_NEWNS is examined here
 * @ns:      the namespace to share or copy (must not be NULL)
 * @user_ns: user namespace that will own a newly created namespace
 * @new_fs:  optional fs_struct whose root/pwd mounts are switched over to
 *           the corresponding mounts in the copy
 *
 * Without CLONE_NEWNS this just takes a reference on @ns and returns it.
 * With CLONE_NEWNS a new, non-anonymous namespace is allocated and the
 * mount tree rooted at @ns->root is copied into it under namespace_lock().
 *
 * Returns the namespace to use, or an ERR_PTR() from alloc_mnt_ns() or
 * copy_tree().
 */
__latent_entropy
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
                struct user_namespace *user_ns, struct fs_struct *new_fs)
{
        struct mnt_namespace *new_ns;
        /* Old root/pwd mounts whose references are dropped after unlocking. */
        struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
        struct mount *p, *q;
        struct mount *old;
        struct mount *new;
        int copy_flags;

        BUG_ON(!ns);

        /* Common case: no new namespace requested, share the existing one. */
        if (likely(!(flags & CLONE_NEWNS))) {
                get_mnt_ns(ns);
                return ns;
        }

        old = ns->root;

        new_ns = alloc_mnt_ns(user_ns, false);
        if (IS_ERR(new_ns))
                return new_ns;

        namespace_lock();
        /* First pass: copy the tree topology */
        copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
        /* Crossing into a different user namespace: request shared -> slave. */
        if (user_ns != ns->user_ns)
                copy_flags |= CL_SHARED_TO_SLAVE;
        new = copy_tree(old, old->mnt.mnt_root, copy_flags);
        if (IS_ERR(new)) {
                namespace_unlock();
                free_mnt_ns(new_ns);
                return ERR_CAST(new);
        }
        /* Same user-namespace test as above: lock the copied tree as well. */
        if (user_ns != ns->user_ns) {
                lock_mount_hash();
                lock_mnt_tree(new);
                unlock_mount_hash();
        }
        new_ns->root = new;

        /*
         * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
         * as belonging to new namespace.  We have already acquired a private
         * fs_struct, so tsk->fs->lock is not needed.
         *
         * p walks the original tree and q walks the copy in step.
         */
        p = old;
        q = new;
        while (p) {
                mnt_add_to_ns(new_ns, q);
                new_ns->nr_mounts++;
                if (new_fs) {
                        /*
                         * Point root/pwd at the copied mount (taking a
                         * reference on it) and remember the original mount
                         * so its reference can be dropped below.
                         */
                        if (&p->mnt == new_fs->root.mnt) {
                                new_fs->root.mnt = mntget(&q->mnt);
                                rootmnt = &p->mnt;
                        }
                        if (&p->mnt == new_fs->pwd.mnt) {
                                new_fs->pwd.mnt = mntget(&q->mnt);
                                pwdmnt = &p->mnt;
                        }
                }
                p = next_mnt(p, old);
                q = next_mnt(q, new);
                /* The copy is exhausted: nothing left to add. */
                if (!q)
                        break;
                /*
                 * An mntns binding we'd skipped?  The copy can lack mounts
                 * that exist in the original, so skip whole subtrees of the
                 * original until p and q refer to the same mnt_root again.
                 */
                while (p->mnt.mnt_root != q->mnt.mnt_root)
                        p = next_mnt(skip_mnt_tree(p), old);
        }
        namespace_unlock();

        /* Drop the references new_fs used to hold on the original mounts. */
        if (rootmnt)
                mntput(rootmnt);
        if (pwdmnt)
                mntput(pwdmnt);

        return new_ns;
}

/*
 * mount_subtree - look up @name inside mount @m and return its dentry
 * @m:    mount to search; if allocating the temporary namespace fails,
 *        the reference on @m is dropped here
 * @name: path to resolve relative to the root of @m
 *
 * @m is temporarily made the root of an anonymous mount namespace owned by
 * init_user_ns so the lookup can run, and that namespace is put again
 * right after the lookup.
 *
 * On success the mount reference obtained by the lookup is traded for an
 * active superblock reference, the superblock's s_umount is taken for
 * writing and the resulting dentry is returned with s_umount still held.
 * On failure an ERR_PTR() is returned.
 */
struct dentry *mount_subtree(struct vfsmount *m, const char *name)
{
        struct mount *mnt = real_mount(m);
        struct mnt_namespace *ns;
        struct super_block *s;
        struct path path;
        int err;

        ns = alloc_mnt_ns(&init_user_ns, true);
        if (IS_ERR(ns)) {
                mntput(m);
                return ERR_CAST(ns);
        }
        /* Make @m the sole mount of the temporary namespace. */
        ns->root = mnt;
        ns->nr_mounts++;
        mnt_add_to_ns(ns, mnt);

        err = vfs_path_lookup(m->mnt_root, m,
                        name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);

        /*
         * The temporary namespace is not needed past the lookup.
         * NOTE(review): presumably this also releases the caller's
         * reference on @m, since no explicit mntput(m) follows - confirm
         * against put_mnt_ns().
         */
        put_mnt_ns(ns);

        if (err)
                return ERR_PTR(err);

        /* trade a vfsmount reference for active sb one */
        s = path.mnt->mnt_sb;
        atomic_inc(&s->s_active);
        mntput(path.mnt);
        /* lock the sucker */
        down_write(&s->s_umount);
        /* ... and return the root of (sub)tree on it */
        return path.dentry;
}
EXPORT_SYMBOL(mount_subtree);

/*
 * mount(2) entry point: copy the filesystem type, device name and option
 * data in from userspace, then let do_mount() do the work.  The mount
 * point @dir_name stays a user pointer and is resolved by do_mount().
 *
 * Every kernel copy is freed again before returning, regardless of the
 * outcome.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
                char __user *, type, unsigned long, flags, void __user *, data)
{
        char *ktype, *kdev;
        void *kdata;
        int ret;

        ktype = copy_mount_string(type);
        if (IS_ERR(ktype)) {
                ret = PTR_ERR(ktype);
                goto out_type;
        }

        kdev = copy_mount_string(dev_name);
        if (IS_ERR(kdev)) {
                ret = PTR_ERR(kdev);
                goto out_dev;
        }

        kdata = copy_mount_options(data);
        if (IS_ERR(kdata)) {
                ret = PTR_ERR(kdata);
                goto out_data;
        }

        ret = do_mount(kdev, dir_name, ktype, flags, kdata);

        /* Unwind in reverse order of acquisition. */
        kfree(kdata);
out_data:
        kfree(kdev);
out_dev:
        kfree(ktype);
out_type:
        return ret;
}

/* MOUNT_ATTR_* bits that fsmount() accepts in its attr_flags argument. */
#define FSMOUNT_VALID_FLAGS                                                    \
        (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |            \
         MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME |       \
         MOUNT_ATTR_NOSYMFOLLOW)

/*
 * MOUNT_ATTR_* bits that build_mount_kattr() accepts in attr_set/attr_clr:
 * everything fsmount() takes plus MOUNT_ATTR_IDMAP.
 */
#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)

/*
 * Propagation types accepted in mount_attr.propagation; build_mount_kattr()
 * rejects any other bit and more than one of these at a time.
 */
#define MOUNT_SETATTR_PROPAGATION_FLAGS \
        (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)

/*
 * Translate the simple on/off MOUNT_ATTR_* bits in @attr_flags into the
 * matching MNT_* flags.  The atime mode (MOUNT_ATTR__ATIME) and
 * MOUNT_ATTR_IDMAP are not handled here; callers deal with them
 * separately.
 */
static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
{
        unsigned int mnt_flags = 0;

        mnt_flags |= (attr_flags & MOUNT_ATTR_RDONLY) ? MNT_READONLY : 0;
        mnt_flags |= (attr_flags & MOUNT_ATTR_NOSUID) ? MNT_NOSUID : 0;
        mnt_flags |= (attr_flags & MOUNT_ATTR_NODEV) ? MNT_NODEV : 0;
        mnt_flags |= (attr_flags & MOUNT_ATTR_NOEXEC) ? MNT_NOEXEC : 0;
        mnt_flags |= (attr_flags & MOUNT_ATTR_NODIRATIME) ? MNT_NODIRATIME : 0;
        mnt_flags |= (attr_flags & MOUNT_ATTR_NOSYMFOLLOW) ? MNT_NOSYMFOLLOW : 0;

        return mnt_flags;
}

/*
 * Create a kernel mount representation for a new, prepared superblock
 * (specified by fs_fd) and attach to an open_tree-like file descriptor.
 *
 * @fs_fd:      file descriptor that must refer to a filesystem context
 *              (its f_op is checked against fscontext_fops)
 * @flags:      only FSMOUNT_CLOEXEC is accepted
 * @attr_flags: MOUNT_ATTR_* bits limited to FSMOUNT_VALID_FLAGS
 *
 * Returns the new file descriptor on success or a negative errno.
 */
SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
                unsigned int, attr_flags)
{
        struct mnt_namespace *ns;
        struct fs_context *fc;
        struct file *file;
        struct path newmount;
        struct mount *mnt;
        struct fd f;
        unsigned int mnt_flags = 0;
        long ret;

        if (!may_mount())
                return -EPERM;

        if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
                return -EINVAL;

        if (attr_flags & ~FSMOUNT_VALID_FLAGS)
                return -EINVAL;

        mnt_flags = attr_flags_to_mnt_flags(attr_flags);

        /*
         * The atime setting is a multi-bit value rather than independent
         * flags: strictatime maps to no MNT_* bit, and any value other than
         * the three listed here is rejected.
         */
        switch (attr_flags & MOUNT_ATTR__ATIME) {
        case MOUNT_ATTR_STRICTATIME:
                break;
        case MOUNT_ATTR_NOATIME:
                mnt_flags |= MNT_NOATIME;
                break;
        case MOUNT_ATTR_RELATIME:
                mnt_flags |= MNT_RELATIME;
                break;
        default:
                return -EINVAL;
        }

        f = fdget(fs_fd);
        if (!f.file)
                return -EBADF;

        /* Only accept files backed by fscontext_fops. */
        ret = -EINVAL;
        if (f.file->f_op != &fscontext_fops)
                goto err_fsfd;

        fc = f.file->private_data;

        ret = mutex_lock_interruptible(&fc->uapi_mutex);
        if (ret < 0)
                goto err_fsfd;

        /* There must be a valid superblock or we can't mount it */
        ret = -EINVAL;
        if (!fc->root)
                goto err_unlock;

        ret = -EPERM;
        if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
                pr_warn("VFS: Mount too revealing\n");
                goto err_unlock;
        }

        /* The context must be in the "awaiting mount" phase. */
        ret = -EBUSY;
        if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
                goto err_unlock;

        if (fc->sb_flags & SB_MANDLOCK)
                warn_mandlock();

        newmount.mnt = vfs_create_mount(fc);
        if (IS_ERR(newmount.mnt)) {
                ret = PTR_ERR(newmount.mnt);
                goto err_unlock;
        }
        newmount.dentry = dget(fc->root);
        newmount.mnt->mnt_flags = mnt_flags;

        /* We've done the mount bit - now move the file context into more or
         * less the same state as if we'd done an fspick().  We don't want to
         * do any memory allocation or anything like that at this point as we
         * don't want to have to handle any errors incurred.
         */
        vfs_clean_context(fc);

        /*
         * The new mount becomes the only member of a fresh anonymous
         * namespace owned by the caller's mount namespace's user namespace.
         */
        ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
        if (IS_ERR(ns)) {
                ret = PTR_ERR(ns);
                goto err_path;
        }
        mnt = real_mount(newmount.mnt);
        ns->root = mnt;
        ns->nr_mounts = 1;
        mnt_add_to_ns(ns, mnt);
        /*
         * Extra mount reference in addition to the one held in newmount,
         * which path_put() below releases on every path.
         */
        mntget(newmount.mnt);

        /* Attach to an apparent O_PATH fd with a note that we need to unmount
         * it, not just simply put it.
         */
        file = dentry_open(&newmount, O_PATH, fc->cred);
        if (IS_ERR(file)) {
                dissolve_on_fput(newmount.mnt);
                ret = PTR_ERR(file);
                goto err_path;
        }
        file->f_mode |= FMODE_NEED_UNMOUNT;

        ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
        if (ret >= 0)
                fd_install(ret, file);
        else
                fput(file);

        /* Success falls through the labels below as well. */
err_path:
        path_put(&newmount);
err_unlock:
        mutex_unlock(&fc->uapi_mutex);
err_fsfd:
        fdput(f);
        return ret;
}

/*
 * Move a mount from one place to another.  Combined with
 * fsopen()/fsmount() this installs a new mount; combined with
 * open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy a
 * mount subtree.
 *
 * @flags is a combination of MOVE_MOUNT_* values.  MOVE_MOUNT_BENEATH and
 * MOVE_MOUNT_SET_GROUP cannot be combined.
 *
 * Returns 0 on success or a negative errno.
 */
SYSCALL_DEFINE5(move_mount,
                int, from_dfd, const char __user *, from_pathname,
                int, to_dfd, const char __user *, to_pathname,
                unsigned int, flags)
{
        const unsigned int exclusive = MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP;
        struct path from_path, to_path;
        unsigned int from_lookup = 0, to_lookup = 0;
        int ret;

        if (!may_mount())
                return -EPERM;

        if (flags & ~MOVE_MOUNT__MASK)
                return -EINVAL;

        if ((flags & exclusive) == exclusive)
                return -EINVAL;

        /*
         * Note carried over from the original code: if someone gives a
         * pathname, they aren't permitted to move from an fd that requires
         * unmount as we can't get at the flag to clear it afterwards.
         */
        if (flags & MOVE_MOUNT_F_SYMLINKS)
                from_lookup |= LOOKUP_FOLLOW;
        if (flags & MOVE_MOUNT_F_AUTOMOUNTS)
                from_lookup |= LOOKUP_AUTOMOUNT;
        if (flags & MOVE_MOUNT_F_EMPTY_PATH)
                from_lookup |= LOOKUP_EMPTY;

        if (flags & MOVE_MOUNT_T_SYMLINKS)
                to_lookup |= LOOKUP_FOLLOW;
        if (flags & MOVE_MOUNT_T_AUTOMOUNTS)
                to_lookup |= LOOKUP_AUTOMOUNT;
        if (flags & MOVE_MOUNT_T_EMPTY_PATH)
                to_lookup |= LOOKUP_EMPTY;

        ret = user_path_at(from_dfd, from_pathname, from_lookup, &from_path);
        if (ret < 0)
                return ret;

        ret = user_path_at(to_dfd, to_pathname, to_lookup, &to_path);
        if (ret < 0)
                goto out_from;

        ret = security_move_mount(&from_path, &to_path);
        if (ret < 0)
                goto out_to;

        if (flags & MOVE_MOUNT_SET_GROUP)
                ret = do_set_group(&from_path, &to_path);
        else
                ret = do_move_mount(&from_path, &to_path,
                                    (flags & MOVE_MOUNT_BENEATH));

out_to:
        path_put(&to_path);
out_from:
        path_put(&from_path);
        return ret;
}

/*
 * Return true if path is reachable from root
 *
 * namespace_sem or mount_lock is held
 */
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
                         const struct path *root)
{
        while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
                dentry = mnt->mnt_mountpoint;
                mnt = mnt->mnt_parent;
        }
        return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
}

/*
 * Return true if @path1 is reachable from @path2, as decided by
 * is_path_reachable() while holding mount_lock exclusively.
 */
bool path_is_under(const struct path *path1, const struct path *path2)
{
        struct mount *start = real_mount(path1->mnt);
        bool reachable;

        read_seqlock_excl(&mount_lock);
        reachable = is_path_reachable(start, path1->dentry, path2);
        read_sequnlock_excl(&mount_lock);

        return reachable;
}
EXPORT_SYMBOL(path_is_under);

/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root as the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root. The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 *
 * Returns 0 on success or a negative errno; the checks below yield -EPERM,
 * -EINVAL, -ENOENT or -EBUSY, and lookup/LSM/lock_mount() errors are passed
 * through.
 */
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
                const char __user *, put_old)
{
        struct path new, old, root;
        struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
        struct mountpoint *old_mp, *root_mp;
        int error;

        if (!may_mount())
                return -EPERM;

        /* Both arguments must resolve to directories. */
        error = user_path_at(AT_FDCWD, new_root,
                             LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
        if (error)
                goto out0;

        error = user_path_at(AT_FDCWD, put_old,
                             LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
        if (error)
                goto out1;

        error = security_sb_pivotroot(&old, &new);
        if (error)
                goto out2;

        get_fs_root(current->fs, &root);
        old_mp = lock_mount(&old);
        error = PTR_ERR(old_mp);
        if (IS_ERR(old_mp))
                goto out3;

        error = -EINVAL;
        new_mnt = real_mount(new.mnt);
        root_mnt = real_mount(root.mnt);
        old_mnt = real_mount(old.mnt);
        /* Remember both parents before the tree is rearranged below. */
        ex_parent = new_mnt->mnt_parent;
        root_parent = root_mnt->mnt_parent;
        /* None of the mounts involved may be shared. */
        if (IS_MNT_SHARED(old_mnt) ||
                IS_MNT_SHARED(ex_parent) ||
                IS_MNT_SHARED(root_parent))
                goto out4;
        if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
                goto out4;
        if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
                goto out4;
        error = -ENOENT;
        if (d_unlinked(new.dentry))
                goto out4;
        error = -EBUSY;
        if (new_mnt == root_mnt || old_mnt == root_mnt)
                goto out4; /* loop, on the same file system  */
        error = -EINVAL;
        if (!path_mounted(&root))
                goto out4; /* not a mountpoint */
        if (!mnt_has_parent(root_mnt))
                goto out4; /* not attached */
        if (!path_mounted(&new))
                goto out4; /* not a mountpoint */
        if (!mnt_has_parent(new_mnt))
                goto out4; /* not attached */
        /* make sure we can reach put_old from new_root */
        if (!is_path_reachable(old_mnt, old.dentry, &new))
                goto out4;
        /* make certain new is below the root */
        if (!is_path_reachable(new_mnt, new.dentry, &root))
                goto out4;
        /* All checks passed: rearrange the tree under the mount hash lock. */
        lock_mount_hash();
        umount_mnt(new_mnt);
        root_mp = unhash_mnt(root_mnt);  /* we'll need its mountpoint */
        /* A locked old root hands its MNT_LOCKED flag over to the new root. */
        if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
                new_mnt->mnt.mnt_flags |= MNT_LOCKED;
                root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
        }
        /* mount old root on put_old */
        attach_mnt(root_mnt, old_mnt, old_mp, false);
        /* mount new_root on / */
        attach_mnt(new_mnt, root_parent, root_mp, false);
        mnt_add_count(root_parent, -1);
        touch_mnt_namespace(current->nsproxy->mnt_ns);
        /* A moved mount should not expire automatically */
        list_del_init(&new_mnt->mnt_expire);
        put_mountpoint(root_mp);
        unlock_mount_hash();
        /* Switch fs root/cwd references from the old root to the new one. */
        chroot_fs_refs(&root, &new);
        error = 0;
out4:
        unlock_mount(old_mp);
        /* Only on success: drop a reference on new_root's former parent. */
        if (!error)
                mntput_no_expire(ex_parent);
out3:
        path_put(&root);
out2:
        path_put(&old);
out1:
        path_put(&new);
out0:
        return error;
}

/*
 * Compute the mount flags @mnt would have after applying @kattr: the bits
 * in attr_clr are cleared first and the bits in attr_set are raised
 * afterwards, so a bit present in both ends up set.  @mnt is not modified.
 */
static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
{
        unsigned int current_flags = mnt->mnt.mnt_flags;

        return (current_flags & ~kattr->attr_clr) | kattr->attr_set;
}

/*
 * Decide whether @mnt may receive the idmapping requested in @kattr.
 *
 * Returns 0 if no idmapping was requested or if it is permitted, -EINVAL
 * or -EPERM otherwise.  The checks run in a fixed order, so the first
 * failing one determines the errno.
 */
static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
        struct vfsmount *vmnt = &mnt->mnt;
        struct user_namespace *fs_userns = vmnt->mnt_sb->s_user_ns;

        /* Nothing to validate when no idmapping is being applied. */
        if (!kattr->mnt_idmap)
                return 0;

        /*
         * An idmapped mount that uses the filesystem-wide idmapping makes
         * no sense, so refuse it rather than allow mushy semantics.
         */
        if (kattr->mnt_userns == fs_userns)
                return -EINVAL;

        /*
         * A mount's idmapping can be set only once.  That keeps things
         * simple; callers can create another bind-mount and idmap that.
         */
        if (is_idmapped_mnt(vmnt))
                return -EPERM;

        /* The filesystem type has to opt in to idmapped mounts. */
        if (!(vmnt->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
                return -EINVAL;

        /* The caller needs CAP_SYS_ADMIN in the superblock's user namespace. */
        if (!ns_capable(fs_userns, CAP_SYS_ADMIN))
                return -EPERM;

        /* Only mounts still in an anonymous namespace may be idmapped. */
        if (!is_anon_ns(mnt->mnt_ns))
                return -EINVAL;

        return 0;
}

/**
 * mnt_allow_writers() - check whether the attribute change allows writers
 * @kattr: the new mount attributes
 * @mnt: the mount to which @kattr will be applied
 *
 * Check whether thew new mount attributes in @kattr allow concurrent writers.
 *
 * Return: true if writers need to be held, false if not
 */
static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
                                     const struct mount *mnt)
{
        return (!(kattr->attr_set & MNT_READONLY) ||
                (mnt->mnt.mnt_flags & MNT_READONLY)) &&
               !kattr->mnt_idmap;
}

/*
 * First phase of a mount attribute change: check that @kattr can be applied
 * to @mnt and, if @kattr->recurse is set, to every mount next_mnt() yields
 * below it, holding writers on each mount where mnt_allow_writers() says
 * they must be held.
 *
 * Returns 0 when every visited mount passed, otherwise the first error;
 * in that case MNT_WRITE_HOLD is dropped again on the mounts visited so far.
 */
static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
{
        struct mount *m;
        int err;

        /* err is assigned on every iteration, and the loop runs at least once. */
        for (m = mnt; m; m = next_mnt(m, mnt)) {
                if (!can_change_locked_flags(m, recalc_flags(kattr, m))) {
                        err = -EPERM;
                        break;
                }

                err = can_idmap_mount(kattr, m);
                if (err)
                        break;

                if (!mnt_allow_writers(kattr, m)) {
                        err = mnt_hold_writers(m);
                        if (err)
                                break;
                }

                /* Non-recursive request: only @mnt itself is checked. */
                if (!kattr->recurse)
                        return 0;
        }

        if (err) {
                struct mount *p;

                /*
                 * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
                 * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all
                 * mounts and needs to take care to include the first mount.
                 */
                for (p = mnt; p; p = next_mnt(p, mnt)) {
                        /* If we had to hold writers unblock them. */
                        if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
                                mnt_unhold_writers(p);

                        /*
                         * Stop at the mount where preparation failed (@m);
                         * mounts after it were never visited.
                         */
                        if (p == m)
                                break;
                }
        }
        return err;
}

/*
 * Install the idmapping from @kattr on @mnt, if one was requested.  A
 * reference on the idmapping is taken for the mount.
 */
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
        if (kattr->mnt_idmap) {
                /*
                 * Pairs with smp_load_acquire() in mnt_idmap().
                 *
                 * A mount's idmapping can be changed only once, which
                 * can_idmap_mount() has verified, so the mount still has
                 * @nop_mnt_idmap attached and there is no old reference
                 * to drop.
                 */
                smp_store_release(&mnt->mnt.mnt_idmap,
                                  mnt_idmap_get(kattr->mnt_idmap));
        }
}

/*
 * Second phase of a mount attribute change: apply @kattr to @mnt and, if
 * @kattr->recurse is set, to every mount next_mnt() yields below it.
 * This function has no failure path; it runs after
 * mount_setattr_prepare() has succeeded.
 */
static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
{
        struct mount *m;

        for (m = mnt; m; m = next_mnt(m, mnt)) {
                unsigned int flags;

                /* Install the idmapping (if any) before publishing the flags. */
                do_idmap_mount(kattr, m);
                flags = recalc_flags(kattr, m);
                WRITE_ONCE(m->mnt.mnt_flags, flags);

                /* If we had to hold writers unblock them. */
                if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
                        mnt_unhold_writers(m);

                if (kattr->propagation)
                        change_mnt_propagation(m, kattr->propagation);
                /* Non-recursive request: only @mnt itself is changed. */
                if (!kattr->recurse)
                        break;
        }
        touch_mnt_namespace(mnt->mnt_ns);
}

/*
 * Apply the attribute change described by @kattr to the mount at @path.
 *
 * @path must refer to the root of a mount (path_mounted()).  If a user
 * namespace was supplied in @kattr, the idmapping is allocated here and
 * stored in @kattr->mnt_idmap; it is not released in this function
 * (finish_mount_kattr() puts it).
 *
 * Returns 0 on success or a negative errno.
 */
static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
{
        struct mount *mnt = real_mount(path->mnt);
        int err = 0;

        if (!path_mounted(path))
                return -EINVAL;

        if (kattr->mnt_userns) {
                struct mnt_idmap *mnt_idmap;

                mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns);
                if (IS_ERR(mnt_idmap))
                        return PTR_ERR(mnt_idmap);
                kattr->mnt_idmap = mnt_idmap;
        }

        if (kattr->propagation) {
                /*
                 * Only take namespace_lock() if we're actually changing
                 * propagation.
                 */
                namespace_lock();
                /* Making mounts shared needs group ids allocated up front. */
                if (kattr->propagation == MS_SHARED) {
                        err = invent_group_ids(mnt, kattr->recurse);
                        if (err) {
                                namespace_unlock();
                                return err;
                        }
                }
        }

        /* Default result for the sanity checks below. */
        err = -EINVAL;
        lock_mount_hash();

        /* Ensure that this isn't anything purely vfs internal. */
        if (!is_mounted(&mnt->mnt))
                goto out;

        /*
         * If this is an attached mount make sure it's located in the callers
         * mount namespace. If it's not don't let the caller interact with it.
         *
         * If this mount doesn't have a parent it's most often simply a
         * detached mount with an anonymous mount namespace. IOW, something
         * that's simply not attached yet. But there are apparently also users
         * that do change mount properties on the rootfs itself. That obviously
         * neither has a parent nor is it a detached mount so we cannot
         * unconditionally check for detached mounts.
         */
        if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt))
                goto out;

        /*
         * First, we get the mount tree in a shape where we can change mount
         * properties without failure. If we succeeded to do so we commit all
         * changes and if we failed we clean up.
         */
        err = mount_setattr_prepare(kattr, mnt);
        if (!err)
                mount_setattr_commit(kattr, mnt);

out:
        unlock_mount_hash();

        /* Mirrors the conditional namespace_lock() taken above. */
        if (kattr->propagation) {
                if (err)
                        cleanup_group_ids(mnt, NULL);
                namespace_unlock();
        }

        return err;
}

/*
 * Handle the MOUNT_ATTR_IDMAP part of a mount_setattr() request.
 *
 * If MOUNT_ATTR_IDMAP is not present in @attr this is a no-op returning 0.
 * Otherwise @attr->userns_fd must refer to a user namespace file; on
 * success a reference to that namespace is stored in @kattr->mnt_userns.
 *
 * Returns 0 on success, -EINVAL for a malformed request, -EBADF for a bad
 * file descriptor and -EPERM if the namespace may not be used.
 */
static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
                                struct mount_kattr *kattr, unsigned int flags)
{
        struct user_namespace *mnt_userns;
        struct ns_common *ns;
        struct fd f;
        int err;

        if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
                return 0;

        /*
         * Clearing an idmapping is currently unsupported.  Should a use-case
         * turn up this can be revisited; until then keep it simple.
         */
        if (attr->attr_clr & MOUNT_ATTR_IDMAP)
                return -EINVAL;

        if (attr->userns_fd > INT_MAX)
                return -EINVAL;

        f = fdget(attr->userns_fd);
        if (!f.file)
                return -EBADF;

        /* The fd has to be a namespace file for a user namespace. */
        err = -EINVAL;
        if (!proc_ns_file(f.file))
                goto out_fput;

        ns = get_proc_ns(file_inode(f.file));
        if (ns->ops->type != CLONE_NEWUSER)
                goto out_fput;

        /*
         * The initial idmapping cannot be used to create an idmapped
         * mount: it serves as the indicator of a mount that is not
         * idmapped and can be passed to idmap-aware helpers as a
         * convenient shortcut.  A user who wants the same effect can
         * create a dedicated identity mapping instead.
         */
        err = -EPERM;
        mnt_userns = container_of(ns, struct user_namespace, ns);
        if (mnt_userns == &init_user_ns)
                goto out_fput;

        /* The caller needs CAP_SYS_ADMIN in the target namespace. */
        if (!ns_capable(mnt_userns, CAP_SYS_ADMIN))
                goto out_fput;

        kattr->mnt_userns = get_user_ns(mnt_userns);
        err = 0;

out_fput:
        fdput(f);
        return err;
}

/*
 * Validate the userspace request in @attr/@flags and translate it into the
 * kernel-internal form in @kattr.  @kattr is fully overwritten.
 *
 * Returns 0 on success, -EINVAL for invalid flag combinations, or the
 * result of build_mount_idmapped().
 */
static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
                             struct mount_kattr *kattr, unsigned int flags)
{
        unsigned int lookup_flags = 0;

        /* Automounts and symlinks are followed unless the caller opts out. */
        if (!(flags & AT_NO_AUTOMOUNT))
                lookup_flags |= LOOKUP_AUTOMOUNT;
        if (!(flags & AT_SYMLINK_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

        *kattr = (struct mount_kattr) {
                .lookup_flags        = lookup_flags,
                .recurse        = !!(flags & AT_RECURSIVE),
        };

        /* At most one known propagation type, and nothing unknown. */
        if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
                return -EINVAL;
        if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
                return -EINVAL;
        kattr->propagation = attr->propagation;

        if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
                return -EINVAL;

        kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
        kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);

        /*
         * The MOUNT_ATTR_<atime> values form an enum, not a bitmap.  To
         * switch to a different atime setting the caller therefore has to
         * put the new setting in @attr_set and the complete
         * MOUNT_ATTR__ATIME mask in @attr_clr.  Reject atime bits in
         * @attr_set without that mask, and reject a partial mask.
         */
        if (!(attr->attr_clr & MOUNT_ATTR__ATIME)) {
                if (attr->attr_set & MOUNT_ATTR__ATIME)
                        return -EINVAL;
                return build_mount_idmapped(attr, usize, kattr, flags);
        }

        if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
                return -EINVAL;

        /* The atime modes are mutually exclusive: drop any previous one. */
        kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME;
        switch (attr->attr_set & MOUNT_ATTR__ATIME) {
        case MOUNT_ATTR_RELATIME:
                kattr->attr_set |= MNT_RELATIME;
                break;
        case MOUNT_ATTR_NOATIME:
                kattr->attr_set |= MNT_NOATIME;
                break;
        case MOUNT_ATTR_STRICTATIME:
                break;
        default:
                return -EINVAL;
        }

        return build_mount_idmapped(attr, usize, kattr, flags);
}

/*
 * Release what build_mount_kattr() and do_mount_setattr() attached to
 * @kattr: the user namespace reference and, if one was allocated, the
 * idmapping.
 */
static void finish_mount_kattr(struct mount_kattr *kattr)
{
        put_user_ns(kattr->mnt_userns);
        kattr->mnt_userns = NULL;

        if (!kattr->mnt_idmap)
                return;

        mnt_idmap_put(kattr->mnt_idmap);
}

/*
 * mount_setattr(2) entry point.
 *
 * Validates @flags and @usize, copies the extensible struct mount_attr in
 * from userspace, converts it with build_mount_kattr() and applies it to
 * the mount found at (@dfd, @path).  A request that neither sets nor
 * clears attributes and leaves propagation untouched returns 0 without
 * looking up the path.
 *
 * Returns 0 on success or a negative errno.
 */
SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
                unsigned int, flags, struct mount_attr __user *, uattr,
                size_t, usize)
{
        struct mount_kattr kattr;
        struct mount_attr attr;
        struct path target;
        int err;

        BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);

        if (flags & ~(AT_EMPTY_PATH |
                      AT_RECURSIVE |
                      AT_SYMLINK_NOFOLLOW |
                      AT_NO_AUTOMOUNT))
                return -EINVAL;

        if (unlikely(usize > PAGE_SIZE))
                return -E2BIG;
        if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
                return -EINVAL;

        if (!may_mount())
                return -EPERM;

        err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
        if (err)
                return err;

        /* A request that changes nothing needs no mount walk. */
        if (!attr.attr_set && !attr.attr_clr && !attr.propagation)
                return 0;

        err = build_mount_kattr(&attr, usize, &kattr, flags);
        if (err)
                return err;

        err = user_path_at(dfd, path, kattr.lookup_flags, &target);
        if (err)
                goto out_kattr;

        err = do_mount_setattr(&target, &kattr);
        path_put(&target);

out_kattr:
        finish_mount_kattr(&kattr);
        return err;
}

/*
 * Write the path of @root into @m.
 *
 * If the superblock provides a ->show_path() operation, it is used and its
 * return value is passed through. Otherwise the dentry is printed with
 * seq_dentry(), escaping space, tab, newline and backslash, and 0 is
 * returned.
 */
int show_path(struct seq_file *m, struct dentry *root)
{
        if (!root->d_sb->s_op->show_path) {
                seq_dentry(m, root, " \t\n\\");
                return 0;
        }

        return root->d_sb->s_op->show_path(m, root);
}

/*
 * Look up the mount whose unique id is exactly @id in @ns.
 *
 * The result of mnt_find_id_at() is only accepted when its mnt_id_unique
 * equals @id; anything else is treated as "not found".
 *
 * Returns the vfsmount embedded in the matching mount, or NULL.
 */
static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
{
        struct mount *found = mnt_find_id_at(ns, id);

        if (found && found->mnt_id_unique == id)
                return &found->mnt;

        return NULL;
}

/*
 * Per-call state for statmount(): the request, the userspace destination
 * and the kernel-side reply that is assembled before being copied out.
 */
struct kstatmount {
        struct statmount __user *buf;   /* userspace destination buffer */
        size_t bufsize;                 /* size of @buf as passed by the caller */
        struct vfsmount *mnt;           /* mount being reported on */
        u64 mask;                       /* STATMOUNT_* items that were requested */
        struct path root;               /* caller's fs root, from get_fs_root() */
        struct statmount sm;            /* fixed-size part of the reply */
        struct seq_file seq;            /* collects the string part of the reply */
};

/*
 * Translate the MNT_* flags of @mnt into the MOUNT_ATTR_* representation.
 *
 * Exactly one access-time attribute is reported: NOATIME wins over
 * RELATIME, and STRICTATIME is reported when neither is set.
 * MOUNT_ATTR_IDMAP is added when is_idmapped_mnt() says so.
 */
static u64 mnt_to_attr_flags(struct vfsmount *mnt)
{
        const unsigned int flags = READ_ONCE(mnt->mnt_flags);
        u64 attrs;

        /* Access-time policy: pick the single matching attribute. */
        if (flags & MNT_NOATIME)
                attrs = MOUNT_ATTR_NOATIME;
        else if (flags & MNT_RELATIME)
                attrs = MOUNT_ATTR_RELATIME;
        else
                attrs = MOUNT_ATTR_STRICTATIME;

        /* Independent one-to-one flag translations. */
        attrs |= (flags & MNT_READONLY) ? MOUNT_ATTR_RDONLY : 0;
        attrs |= (flags & MNT_NOSUID) ? MOUNT_ATTR_NOSUID : 0;
        attrs |= (flags & MNT_NODEV) ? MOUNT_ATTR_NODEV : 0;
        attrs |= (flags & MNT_NOEXEC) ? MOUNT_ATTR_NOEXEC : 0;
        attrs |= (flags & MNT_NODIRATIME) ? MOUNT_ATTR_NODIRATIME : 0;
        attrs |= (flags & MNT_NOSYMFOLLOW) ? MOUNT_ATTR_NOSYMFOLLOW : 0;

        if (is_idmapped_mnt(mnt))
                attrs |= MOUNT_ATTR_IDMAP;

        return attrs;
}

/*
 * Describe the propagation state of @m as a combination of MS_SHARED,
 * MS_SLAVE and MS_UNBINDABLE. When none of those apply the mount is
 * reported as MS_PRIVATE.
 */
static u64 mnt_to_propagation_flags(struct mount *m)
{
        u64 flags = 0;

        if (IS_MNT_SHARED(m))
                flags |= MS_SHARED;
        if (IS_MNT_SLAVE(m))
                flags |= MS_SLAVE;
        if (IS_MNT_UNBINDABLE(m))
                flags |= MS_UNBINDABLE;

        return flags ? flags : MS_PRIVATE;
}

static void statmount_sb_basic(struct kstatmount *s)
{
        struct super_block *sb = s->mnt->mnt_sb;

        s->sm.mask |= STATMOUNT_SB_BASIC;
        s->sm.sb_dev_major = MAJOR(sb->s_dev);
        s->sm.sb_dev_minor = MINOR(sb->s_dev);
        s->sm.sb_magic = sb->s_magic;
        s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
}

static void statmount_mnt_basic(struct kstatmount *s)
{
        struct mount *m = real_mount(s->mnt);

        s->sm.mask |= STATMOUNT_MNT_BASIC;
        s->sm.mnt_id = m->mnt_id_unique;
        s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
        s->sm.mnt_id_old = m->mnt_id;
        s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
        s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
        s->sm.mnt_propagation = mnt_to_propagation_flags(m);
        s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
        s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
}

/*
 * Fill in the STATMOUNT_PROPAGATE_FROM part of the reply.
 *
 * The mask bit is always set; propagate_from itself is only written for
 * slave mounts, using get_dominating_id() relative to the current task's
 * fs root.
 */
static void statmount_propagate_from(struct kstatmount *s)
{
        struct mount *m = real_mount(s->mnt);

        s->sm.mask |= STATMOUNT_PROPAGATE_FROM;

        if (!IS_MNT_SLAVE(m))
                return;

        s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
}

/*
 * Append the mount's root path, as produced by show_path(), to @seq and
 * unescape it in place.
 *
 * Returns 0 on success, the error returned by show_path(), or -EAGAIN when
 * @seq has overflowed.
 */
static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq)
{
        int ret;
        size_t start = seq->count;      /* offset at which this string begins */

        ret = show_path(seq, s->mnt->mnt_root);
        if (ret)
                return ret;

        if (unlikely(seq_has_overflowed(seq)))
                return -EAGAIN;

        /*
         * Unescape the result. It would be better if supplied string was not
         * escaped in the first place, but that's a pretty invasive change.
         *
         * The escaped text is NUL-terminated first, then the write position
         * is rewound to the start of the string and advanced again by the
         * length string_unescape_inplace() returns.
         */
        seq->buf[seq->count] = '\0';
        seq->count = start;
        seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
        return 0;
}

/*
 * Append the mount point of the mount, relative to s->root, to @seq using
 * seq_path_root() with an empty escape set.
 *
 * A SEQ_SKIP result is reported as success (0); every other return value
 * of seq_path_root() is passed through.
 */
static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq)
{
        struct path mnt_path = {
                .mnt = s->mnt,
                .dentry = s->mnt->mnt_root,
        };
        int err = seq_path_root(seq, &mnt_path, &s->root, "");

        if (err == SEQ_SKIP)
                return 0;

        return err;
}

/*
 * Append the filesystem type name of the mount's superblock to @seq.
 * Always returns 0.
 */
static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
{
        seq_puts(seq, s->mnt->mnt_sb->s_type->name);
        return 0;
}

/*
 * Emit the string item selected by @flag into the seq buffer and record the
 * offset at which it starts in the matching struct statmount field.
 *
 * Returns 0 on success (the string is NUL-terminated and @flag is set in
 * sm->mask), -EINVAL for an unknown @flag, -EOVERFLOW when the fixed part
 * plus the strings collected so far do not fit into the user buffer,
 * -EAGAIN when the seq buffer overflowed, or the emitter's own error.
 */
static int statmount_string(struct kstatmount *s, u64 flag)
{
        int ret;
        size_t kbufsize;
        struct seq_file *seq = &s->seq;
        struct statmount *sm = &s->sm;

        /* The offset is recorded before the emitter advances seq->count. */
        switch (flag) {
        case STATMOUNT_FS_TYPE:
                sm->fs_type = seq->count;
                ret = statmount_fs_type(s, seq);
                break;
        case STATMOUNT_MNT_ROOT:
                sm->mnt_root = seq->count;
                ret = statmount_mnt_root(s, seq);
                break;
        case STATMOUNT_MNT_POINT:
                sm->mnt_point = seq->count;
                ret = statmount_mnt_point(s, seq);
                break;
        default:
                WARN_ON_ONCE(true);
                return -EINVAL;
        }

        /*
         * The user buffer size is checked before the emitter's result and
         * before the seq overflow test, so a too small user buffer yields
         * -EOVERFLOW even if one of those would also have failed. ">=" keeps
         * one byte free for the NUL terminator appended below.
         */
        if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize)))
                return -EOVERFLOW;
        if (kbufsize >= s->bufsize)
                return -EOVERFLOW;

        /* signal a retry */
        if (unlikely(seq_has_overflowed(seq)))
                return -EAGAIN;

        if (ret)
                return ret;

        /* Terminate the string and mark the item as present in the reply. */
        seq->buf[seq->count++] = '\0';
        sm->mask |= flag;
        return 0;
}

static int copy_statmount_to_user(struct kstatmount *s)
{
        struct statmount *sm = &s->sm;
        struct seq_file *seq = &s->seq;
        char __user *str = ((char __user *)s->buf) + sizeof(*sm);
        size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));

        if (seq->count && copy_to_user(str, seq->buf, seq->count))
                return -EFAULT;

        /* Return the number of bytes copied to the buffer */
        sm->size = copysize + seq->count;
        if (copy_to_user(s->buf, sm, copysize))
                return -EFAULT;

        return 0;
}

/*
 * Collect everything requested in s->mask for s->mnt into s->sm / s->seq.
 *
 * Returns -EPERM when the mount is not reachable from s->root and the
 * caller lacks CAP_SYS_ADMIN in the initial user namespace, the error of
 * security_sb_statfs(), the first error of a string item, or 0.
 */
static int do_statmount(struct kstatmount *s)
{
        struct mount *m = real_mount(s->mnt);
        int err;

        /*
         * Don't trigger audit denials. We just want to determine what
         * mounts to show users.
         */
        if (!is_path_reachable(m, m->mnt.mnt_root, &s->root)) {
                if (!ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
                        return -EPERM;
        }

        err = security_sb_statfs(s->mnt->mnt_root);
        if (err)
                return err;

        /* Fixed-size items cannot fail. */
        if (s->mask & STATMOUNT_SB_BASIC)
                statmount_sb_basic(s);
        if (s->mask & STATMOUNT_MNT_BASIC)
                statmount_mnt_basic(s);
        if (s->mask & STATMOUNT_PROPAGATE_FROM)
                statmount_propagate_from(s);

        /* String items: stop at the first failure. */
        if (s->mask & STATMOUNT_FS_TYPE) {
                err = statmount_string(s, STATMOUNT_FS_TYPE);
                if (err)
                        return err;
        }

        if (s->mask & STATMOUNT_MNT_ROOT) {
                err = statmount_string(s, STATMOUNT_MNT_ROOT);
                if (err)
                        return err;
        }

        if (s->mask & STATMOUNT_MNT_POINT) {
                err = statmount_string(s, STATMOUNT_MNT_POINT);
                if (err)
                        return err;
        }

        return 0;
}

/*
 * Decide whether statmount() should be retried with a larger seq buffer.
 *
 * Only -EAGAIN triggers a retry. In that case *seq_size is doubled in
 * place; the retry is refused if the doubling overflows or the new size
 * exceeds MAX_RW_COUNT.
 */
static inline bool retry_statmount(const long ret, size_t *seq_size)
{
        if (likely(ret != -EAGAIN))
                return false;

        /* Note: *seq_size is updated even when the result is rejected. */
        if (unlikely(check_mul_overflow(*seq_size, 2, seq_size)))
                return false;

        return *seq_size <= MAX_RW_COUNT;
}

/*
 * Initialise @ks for one statmount() attempt.
 *
 * The user buffer is range-checked with access_ok(), @ks is zeroed, the
 * request mask and buffer description are recorded, and a seq buffer of
 * @seq_size bytes is allocated. The caller must kvfree() ks->seq.buf.
 *
 * Returns 0, -EFAULT if the user buffer range is invalid, or -ENOMEM.
 */
static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
                              struct statmount __user *buf, size_t bufsize,
                              size_t seq_size)
{
        if (!access_ok(buf, bufsize))
                return -EFAULT;

        memset(ks, 0, sizeof(*ks));

        ks->buf = buf;
        ks->bufsize = bufsize;
        ks->mask = kreq->param;

        ks->seq.size = seq_size;
        ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);

        return ks->seq.buf ? 0 : -ENOMEM;
}

/*
 * Copy a struct mnt_id_req from userspace into @kreq.
 *
 * The size the caller claims is read from req->size first and validated,
 * then the structure is copied with copy_struct_from_user().
 *
 * Returns 0, -EFAULT if the size cannot be read, -E2BIG if it exceeds
 * PAGE_SIZE, -EINVAL if it is below MNT_ID_REQ_SIZE_VER0 or the spare field
 * is non-zero, or the error of copy_struct_from_user().
 */
static int copy_mnt_id_req(const struct mnt_id_req __user *req,
                           struct mnt_id_req *kreq)
{
        size_t usize;
        int ret;

        BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0);

        if (get_user(usize, &req->size))
                return -EFAULT;

        if (unlikely(usize > PAGE_SIZE))
                return -E2BIG;
        if (unlikely(usize < MNT_ID_REQ_SIZE_VER0))
                return -EINVAL;

        memset(kreq, 0, sizeof(*kreq));
        ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
        if (ret)
                return ret;

        /* The spare field must be zero. */
        return kreq->spare ? -EINVAL : 0;
}

/*
 * statmount(2): report information about one mount in the caller's mount
 * namespace.
 *
 * @req selects the mount (mnt_id) and the items wanted (param); the reply
 * is written to @buf, which is @bufsize bytes long. @flags must be 0.
 *
 * The lookup and data collection run under namespace_sem (read). When the
 * string buffer turns out too small (-EAGAIN) the whole operation is
 * retried with a doubled buffer, as decided by retry_statmount().
 *
 * Returns 0 on success, -EINVAL for non-zero @flags, -ENOENT if no mount
 * with that id exists in the namespace, or the error of the request copy,
 * buffer preparation, do_statmount() or the copy to userspace.
 */
SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
                struct statmount __user *, buf, size_t, bufsize,
                unsigned int, flags)
{
        struct vfsmount *mnt;
        struct mnt_id_req kreq;
        struct kstatmount ks;
        /* We currently support retrieval of 3 strings. */
        size_t seq_size = 3 * PATH_MAX;
        int ret;

        if (flags)
                return -EINVAL;

        ret = copy_mnt_id_req(req, &kreq);
        if (ret)
                return ret;

retry:
        ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size);
        if (ret)
                return ret;

        down_read(&namespace_sem);
        mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
        if (!mnt) {
                up_read(&namespace_sem);
                kvfree(ks.seq.buf);
                return -ENOENT;
        }

        ks.mnt = mnt;
        get_fs_root(current->fs, &ks.root);
        ret = do_statmount(&ks);
        up_read(&namespace_sem);

        /*
         * Drop the root reference only after namespace_sem is released:
         * if this happens to be the last reference, path_put() may start
         * mount/dentry teardown, which must not run with the semaphore
         * held. ks.root is not used past this point.
         */
        path_put(&ks.root);

        if (!ret)
                ret = copy_statmount_to_user(&ks);
        kvfree(ks.seq.buf);
        if (retry_statmount(ret, &seq_size))
                goto retry;
        return ret;
}

/*
 * Return the mount that follows @curr in the rb-tree it is linked into via
 * mnt_node, as converted by node_to_mount().
 */
static struct mount *listmnt_next(struct mount *curr)
{
        struct rb_node *next = rb_next(&curr->mnt_node);

        return node_to_mount(next);
}

/*
 * Write the unique ids of the mounts below @orig to the user array.
 *
 * Iteration starts at @first and follows listmnt_next(). A mount is
 * skipped when its id equals @mnt_parent_id or when it is not reachable
 * from @orig. At most @nr_mnt_ids ids are written to @mnt_ids.
 *
 * Returns the number of ids written, -EPERM when @orig is not reachable
 * from @root and the caller lacks CAP_SYS_ADMIN in the initial user
 * namespace, the error of security_sb_statfs(), or -EFAULT if writing to
 * userspace fails.
 */
static ssize_t do_listmount(struct mount *first, struct path *orig,
                            u64 mnt_parent_id, u64 __user *mnt_ids,
                            size_t nr_mnt_ids, const struct path *root)
{
        ssize_t copied = 0;
        struct mount *r;
        int err;

        /*
         * Don't trigger audit denials. We just want to determine what
         * mounts to show users.
         */
        if (!is_path_reachable(real_mount(orig->mnt), orig->dentry, root)) {
                if (!ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
                        return -EPERM;
        }

        err = security_sb_statfs(orig->dentry);
        if (err)
                return err;

        for (r = first; r && nr_mnt_ids; r = listmnt_next(r)) {
                /* The parent itself is not part of its own listing. */
                if (r->mnt_id_unique == mnt_parent_id)
                        continue;
                if (!is_path_reachable(r, r->mnt.mnt_root, orig))
                        continue;

                if (put_user(r->mnt_id_unique, mnt_ids))
                        return -EFAULT;

                mnt_ids++;
                nr_mnt_ids--;
                copied++;
        }

        return copied;
}

/*
 * listmount(2): list the unique ids of mounts below a given mount.
 *
 * @req carries the parent mount id (mnt_id; LSMT_ROOT selects the caller's
 * root) and, in param, the last mount id already seen: when it is
 * non-zero, listing starts at the position mnt_find_id_at() returns for
 * that id plus one, otherwise at the first mount of the namespace. Up to
 * @nr_mnt_ids ids are stored in @mnt_ids. @flags must be 0.
 *
 * Returns the number of ids stored, -EINVAL for non-zero @flags, -EFAULT
 * for an over-large or inaccessible id array, -ENOENT if the parent mount
 * does not exist in the namespace, or the error of the request copy or of
 * do_listmount().
 */
SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *,
                mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
{
        struct mnt_namespace *ns = current->nsproxy->mnt_ns;
        struct mnt_id_req kreq;
        struct mount *first;
        struct path root, orig;
        u64 mnt_parent_id, last_mnt_id;
        /* Keeps nr_mnt_ids * sizeof(u64) below from overflowing size_t. */
        const size_t maxcount = (size_t)-1 >> 3;
        ssize_t ret;

        if (flags)
                return -EINVAL;

        if (unlikely(nr_mnt_ids > maxcount))
                return -EFAULT;

        if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
                return -EFAULT;

        ret = copy_mnt_id_req(req, &kreq);
        if (ret)
                return ret;
        mnt_parent_id = kreq.mnt_id;
        last_mnt_id = kreq.param;

        down_read(&namespace_sem);
        get_fs_root(current->fs, &root);
        if (mnt_parent_id == LSMT_ROOT) {
                /* Borrows root's reference; orig is not put separately. */
                orig = root;
        } else {
                ret = -ENOENT;
                orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
                if (!orig.mnt)
                        goto err;
                orig.dentry = orig.mnt->mnt_root;
        }
        if (!last_mnt_id)
                first = node_to_mount(rb_first(&ns->mounts));
        else
                first = mnt_find_id_at(ns, last_mnt_id + 1);

        ret = do_listmount(first, &orig, mnt_parent_id, mnt_ids, nr_mnt_ids, &root);
err:
        up_read(&namespace_sem);

        /*
         * Drop the root reference only after namespace_sem is released:
         * if this happens to be the last reference, path_put() may start
         * mount/dentry teardown, which must not run with the semaphore
         * held. Neither root nor orig is used past this point.
         */
        path_put(&root);
        return ret;
}


/*
 * Boot-time setup of the initial mount tree: mount rootfs, create the
 * initial mount namespace around it, and make that mount the root and
 * working directory of the current task.
 *
 * Failure to mount rootfs or to allocate the namespace panics.
 */
static void __init init_mount_tree(void)
{
        struct vfsmount *mnt;
        struct mount *m;
        struct mnt_namespace *ns;
        struct path root;

        mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
        if (IS_ERR(mnt))
                panic("Can't create rootfs");

        ns = alloc_mnt_ns(&init_user_ns, false);
        if (IS_ERR(ns))
                panic("Can't allocate initial namespace");
        /* The rootfs mount is the namespace's root and its only mount. */
        m = real_mount(mnt);
        ns->root = m;
        ns->nr_mounts = 1;
        mnt_add_to_ns(ns, m);
        /* Publish the namespace through init_task and take a reference. */
        init_task.nsproxy->mnt_ns = ns;
        get_mnt_ns(ns);

        root.mnt = mnt;
        root.dentry = mnt->mnt_root;
        mnt->mnt_flags |= MNT_LOCKED;

        /* Both pwd and root of the current task start at the rootfs root. */
        set_fs_pwd(current->fs, &root);
        set_fs_root(current->fs, &root);
}

/*
 * Boot-time initialisation of the mount layer.
 *
 * Creates the slab cache for struct mount and the mount / mountpoint hash
 * tables, initialises kernfs and sysfs, creates the "fs" kobject, then
 * sets up shmem, rootfs and finally the initial mount tree.
 *
 * Failing to allocate either hash table panics; a sysfs_init() error or a
 * failed kobject creation is only logged as a warning.
 */
void __init mnt_init(void)
{
        int err;

        /* SLAB_PANIC is passed, so the result is not checked here. */
        mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

        mount_hashtable = alloc_large_system_hash("Mount-cache",
                                sizeof(struct hlist_head),
                                mhash_entries, 19,
                                HASH_ZERO,
                                &m_hash_shift, &m_hash_mask, 0, 0);
        mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
                                sizeof(struct hlist_head),
                                mphash_entries, 19,
                                HASH_ZERO,
                                &mp_hash_shift, &mp_hash_mask, 0, 0);

        if (!mount_hashtable || !mountpoint_hashtable)
                panic("Failed to allocate mount hash table\n");

        kernfs_init();

        err = sysfs_init();
        if (err)
                printk(KERN_WARNING "%s: sysfs_init error: %d\n",
                        __func__, err);
        fs_kobj = kobject_create_and_add("fs", NULL);
        if (!fs_kobj)
                printk(KERN_WARNING "%s: kobj create error\n", __func__);
        shmem_init();
        init_rootfs();
        init_mount_tree();
}

/*
 * Drop a reference on @ns. When the count reaches zero the mounts hanging
 * off the namespace root are dropped and the namespace is freed.
 */
void put_mnt_ns(struct mnt_namespace *ns)
{
        if (refcount_dec_and_test(&ns->ns.count)) {
                drop_collected_mounts(&ns->root->mnt);
                free_mnt_ns(ns);
        }
}

/*
 * Create a kernel-internal mount of @type (SB_KERNMOUNT, named after the
 * filesystem type).
 *
 * On success the mount is marked as belonging to MNT_NS_INTERNAL: it is a
 * long-term mount that is kept until it is unmounted before the filesystem
 * gets unregistered. On failure the ERR_PTR from vfs_kern_mount() is
 * returned unchanged.
 */
struct vfsmount *kern_mount(struct file_system_type *type)
{
        struct vfsmount *mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);

        if (IS_ERR(mnt))
                return mnt;

        real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
        return mnt;
}
EXPORT_SYMBOL_GPL(kern_mount);

/*
 * Release a long-term kernel mount so its mount point can be released.
 *
 * An ERR_PTR @mnt is ignored. Otherwise the mount is made short-term, an
 * RCU grace period is waited for, and the reference is dropped.
 */
void kern_unmount(struct vfsmount *mnt)
{
        if (IS_ERR(mnt))
                return;

        mnt_make_shortterm(mnt);
        synchronize_rcu();        /* yecchhh... */
        mntput(mnt);
}
EXPORT_SYMBOL(kern_unmount);

/*
 * Release @num long-term kernel mounts at once.
 *
 * All mounts are made short-term first, so that a single expedited RCU
 * grace period covers the whole array, and only then are the references
 * dropped. Unlike kern_unmount(), entries are not checked with IS_ERR().
 */
void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
{
        unsigned int idx;

        /* Pass 1: demote every mount. */
        for (idx = 0; idx < num; idx++)
                mnt_make_shortterm(mnt[idx]);

        synchronize_rcu_expedited();

        /* Pass 2: drop the references. */
        for (idx = 0; idx < num; idx++)
                mntput(mnt[idx]);
}
EXPORT_SYMBOL(kern_unmount_array);

/*
 * Report whether @mnt passes check_mnt(), i.e. the result of that check on
 * the struct mount that embeds @mnt.
 */
bool our_mnt(struct vfsmount *mnt)
{
        struct mount *m = real_mount(mnt);

        return check_mnt(m);
}

/*
 * Does the current process have a non-standard root?
 *
 * The root of the current mount namespace is followed down through any
 * mounts stacked on top of it and then compared with the task's fs root.
 * Returns true when the two differ.
 */
bool current_chrooted(void)
{
        struct path fs_root, ns_root;
        bool same;

        /* Start at the namespace root and pin it. */
        ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
        ns_root.dentry = ns_root.mnt->mnt_root;
        path_get(&ns_root);

        /* Descend for as long as something is mounted on top. */
        for (;;) {
                if (!d_mountpoint(ns_root.dentry))
                        break;
                if (!follow_down_one(&ns_root))
                        break;
        }

        get_fs_root(current->fs, &fs_root);
        same = path_equal(&fs_root, &ns_root);

        path_put(&fs_root);
        path_put(&ns_root);

        return !same;
}

/*
 * Check whether @ns already contains a fully visible mount of the same
 * filesystem type as @sb whose locked flags are compatible with the flags
 * proposed in *@new_mnt_flags.
 *
 * On a match the locked read-only / atime bits of the existing mount are
 * OR-ed into *@new_mnt_flags and true is returned; otherwise false is
 * returned and *@new_mnt_flags is left untouched. The namespace's mounts
 * are walked under namespace_sem (read).
 */
static bool mnt_already_visible(struct mnt_namespace *ns,
                                const struct super_block *sb,
                                int *new_mnt_flags)
{
        int new_flags = *new_mnt_flags;
        struct mount *mnt, *n;
        bool visible = false;

        down_read(&namespace_sem);
        rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
                struct mount *child;
                int mnt_flags;

                /* Only mounts of the same filesystem type are candidates. */
                if (mnt->mnt.mnt_sb->s_type != sb->s_type)
                        continue;

                /* This mount is not fully visible if its root directory
                 * is not the root directory of the filesystem.
                 */
                if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
                        continue;

                /* A local view of the mount flags */
                mnt_flags = mnt->mnt.mnt_flags;

                /* Don't miss readonly hidden in the superblock flags */
                if (sb_rdonly(mnt->mnt.mnt_sb))
                        mnt_flags |= MNT_LOCK_READONLY;

                /* Verify the mount flags are equal to or more permissive
                 * than the proposed new mount.
                 */
                if ((mnt_flags & MNT_LOCK_READONLY) &&
                    !(new_flags & MNT_READONLY))
                        continue;
                if ((mnt_flags & MNT_LOCK_ATIME) &&
                    ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
                        continue;

                /* This mount is not fully visible if there are any
                 * locked child mounts that cover anything except for
                 * empty directories.
                 */
                list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
                        struct inode *inode = child->mnt_mountpoint->d_inode;
                        /* Only worry about locked mounts */
                        if (!(child->mnt.mnt_flags & MNT_LOCKED))
                                continue;
                        /* Is the directory permanently empty? */
                        if (!is_empty_dir_inode(inode))
                                goto next;
                }
                /* Preserve the locked attributes */
                *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
                                               MNT_LOCK_ATIME);
                visible = true;
                goto found;
        next:        ;
        }
found:
        up_read(&namespace_sem);
        return visible;
}

/*
 * Decide whether mounting @sb in the current mount namespace would reveal
 * more than is already visible there.
 *
 * Namespaces owned by the initial user namespace and filesystems without
 * SB_I_USERNS_VISIBLE are never too revealing. A filesystem that has
 * SB_I_USERNS_VISIBLE but lacks SB_I_NOEXEC or SB_I_NODEV triggers a
 * one-time warning and is refused. Otherwise the answer is the negation of
 * mnt_already_visible(), which may also update *@new_mnt_flags.
 */
static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
{
        const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
        struct mnt_namespace *ns = current->nsproxy->mnt_ns;
        unsigned long s_iflags;

        if (ns->user_ns == &init_user_ns)
                return false;

        /* Can this filesystem be too revealing? */
        s_iflags = sb->s_iflags;
        if (!(s_iflags & SB_I_USERNS_VISIBLE))
                return false;

        /* WARN_ONCE() evaluates to the truth value of its condition. */
        if (WARN_ONCE((s_iflags & required_iflags) != required_iflags,
                      "Expected s_iflags to contain 0x%lx\n",
                      required_iflags))
                return true;

        return !mnt_already_visible(ns, sb, new_mnt_flags);
}

bool mnt_may_suid(struct vfsmount *mnt)
{
        /*
         * Foreign mounts (accessed via fchdir or through /proc
         * symlinks) are always treated as if they are nosuid.  This
         * prevents namespaces from trusting potentially unsafe
         * suid/sgid bits, file caps, or security labels that originate
         * in other namespaces.
         */
        return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
               current_in_userns(mnt->mnt_sb->s_user_ns);
}

static struct ns_common *mntns_get(struct task_struct *task)
{
        struct ns_common *ns = NULL;
        struct nsproxy *nsproxy;

        task_lock(task);
        nsproxy = task->nsproxy;
        if (nsproxy) {
                ns = &nsproxy->mnt_ns->ns;
                get_mnt_ns(to_mnt_ns(ns));
        }
        task_unlock(task);

        return ns;
}

/*
 * proc_ns_operations ->put(): drop a reference on the mount namespace that
 * embeds @ns.
 */
static void mntns_put(struct ns_common *ns)
{
        struct mnt_namespace *mnt_ns = to_mnt_ns(ns);

        put_mnt_ns(mnt_ns);
}

/*
 * proc_ns_operations ->install(): switch @nsset over to the mount namespace
 * that embeds @ns and move its root and working directory to the root of
 * that namespace.
 *
 * Returns 0 on success, -EPERM when a required capability is missing,
 * -EINVAL for an anonymous namespace or when nsset->fs does not have
 * exactly one user, or the error of the root lookup (in which case the old
 * namespace is restored).
 */
static int mntns_install(struct nsset *nsset, struct ns_common *ns)
{
        struct nsproxy *nsproxy = nsset->nsproxy;
        struct fs_struct *fs = nsset->fs;
        struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
        struct user_namespace *user_ns = nsset->cred->user_ns;
        struct path root;
        int err;

        /*
         * Needs CAP_SYS_ADMIN in the user namespace of the target mount
         * namespace, plus CAP_SYS_CHROOT and CAP_SYS_ADMIN in the user
         * namespace of nsset's credentials.
         */
        if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
            !ns_capable(user_ns, CAP_SYS_CHROOT) ||
            !ns_capable(user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        if (is_anon_ns(mnt_ns))
                return -EINVAL;

        /* root and pwd are rewritten below; require a single user of fs. */
        if (fs->users != 1)
                return -EINVAL;

        /* Install the new namespace first; the old one is kept for revert. */
        get_mnt_ns(mnt_ns);
        old_mnt_ns = nsproxy->mnt_ns;
        nsproxy->mnt_ns = mnt_ns;

        /* Find the root */
        err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
                                "/", LOOKUP_DOWN, &root);
        if (err) {
                /* revert to old namespace */
                nsproxy->mnt_ns = old_mnt_ns;
                put_mnt_ns(mnt_ns);
                return err;
        }

        /* The switch is final: release the reference on the old namespace. */
        put_mnt_ns(old_mnt_ns);

        /* Update the pwd and root */
        set_fs_pwd(fs, &root);
        set_fs_root(fs, &root);

        path_put(&root);
        return 0;
}

static struct user_namespace *mntns_owner(struct ns_common *ns)
{
        return to_mnt_ns(ns)->user_ns;
}

/*
 * Namespace operations for mount namespaces ("mnt", CLONE_NEWNS), wiring
 * up the get/put/install/owner callbacks defined in this file.
 */
const struct proc_ns_operations mntns_operations = {
        .name                = "mnt",
        .type                = CLONE_NEWNS,
        .get                = mntns_get,
        .put                = mntns_put,
        .install        = mntns_install,
        .owner                = mntns_owner,
};

#ifdef CONFIG_SYSCTL
/*
 * Sysctl table for this file. The single entry, "mount-max", exposes
 * sysctl_mount_max with mode 0644; writes go through proc_dointvec_minmax
 * with a lower bound of SYSCTL_ONE and no upper bound set.
 */
static struct ctl_table fs_namespace_sysctls[] = {
        {
                .procname        = "mount-max",
                .data                = &sysctl_mount_max,
                .maxlen                = sizeof(unsigned int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ONE,
        },
};

/*
 * Register fs_namespace_sysctls under "fs" at fs_initcall time.
 * Always returns 0.
 */
static int __init init_fs_namespace_sysctls(void)
{
        register_sysctl_init("fs", fs_namespace_sysctls);
        return 0;
}
fs_initcall(init_fs_namespace_sysctls);

#endif /* CONFIG_SYSCTL */




































































































































































































    1 


























    1 















    1 



































































































































































































































    1 
































































    1 












    1 
















































    1 


    1 
















    1 





















    1 

    1 















    6 

    5 












































    1 

















    2 








































    6 

    5 












    1 





    6 






    6 





    1 












    1 













    2 










    6 




    1 









































































































































































    1 







    1 


    1 

    1 
    1 






























    1 




















    1 
    1 









    1 























































    1 

    1 
    1 







    1 

















    1 




















































    1 

    1 
    1 







    1 









    1 
















    1 











































    1 















    1 















    1 
















    5 
    1 







    6 






    3 







    1 






    1 

































    1 

    6 

























    1 
    1 





    1 

    1 














    1 










    1 























    1 






    1 




    1 








    1 

    1 


    1 















































    1 







    1 





































































    1 













    1 


    1 














































    1 

















    1 
    1 
    1 












































































































































































































































    1 












































































































































































































































    1 
    1 



    1 



    1 















































































































































































































































































































































































































































































































































    5 

















    6 
    6 


    4 






    5 
    3 








    5 
    6 


    5 

    5 


















































































































































































































































































































































































































































































































































































































































































































































































    1 

















    1 

    1 
    1 



















    1 




    1 


    1 

    1 




































































    1 
























































































































































































































    1 



    1 




    1 





    1 



    1 






    1 
    1 





    1 






    1 


















    1 


    1 











































































    1 
















    1 










    1 
    1 

    1 


    1 



    1 
    1 




































    1 


























































    1 




    1 


    1 








    1 



    1 


    1 


    1 

















    1 




    1 











    1 



















































































































































    1 








    1 
























































    1 














    1 











    1 




    2 




    1 







    2 



















    1 
    2 




    1 



    1 

























    1 









    1 















    1 






    1 

    1 



    1 













    1 


    1 


    1 
    1 



    1 







    1 















    2 











    1 



    1 

    1 






    1 






    1 


    1 
    1 




















    1 



    1 
    1 



    1 




    1 
    1 











































































































































































































    3 





    1 
    5 





    1 




    1 





























































































































































































































































































































































































































































    1 
























    1 


    1 




























































    1 


    1 






    1 













    1 


    1 


    1 













    1 






    1 


    1 













    1 


    1 



    1 








    1 










    1 




    1 

    1 




















    1 









    1 












    1 

















































































    1 



    1 
































































    1 



    1 






















































    1 





    1 

































































    1 





    2 
















    1 





    1 

























































    1 















































    1 

    1 


    1 
    1 















    1 



















    1 


    1 

























































































































































































































    1 

    1 


















































































































































































































































































































































































































































































































































































































    1 
    1 


    1 





































    4 











    4 


    4 


    4 





    1 

    1 



    2 


    4 
    1 


    3 

































































































    1 
    1 











    1 


    1 











    1 























































































































































































































    1 
















    1 



    1 
    1 




    1 

    1 










    1 
    1 




    1 
    1 

    1 




    1 





    1 
    1 







    1 


    1 

    1 
    1 















    1 





    1 
    1 



    1 
    1 

    1 


    1 
    1 










    1 




    1 


    1 
    1 

    1 





    1 






    1 






    1 














    1 


    1 




    1 





    1 





    1 




    1 









    1 





    1 
    1 

    1 





    1 


    1 




















    1 


    1 

    1 




    1 

    1 

    1 


    1 


    1 


    1 








    1 



    1 




    1 


    1 
    1 
    1 
    1 




    1 
    1 
    1 

    1 
    1 

    1 

    1 






















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
// SPDX-License-Identifier: GPL-2.0+
/*
 * Maple Tree implementation
 * Copyright (c) 2018-2022 Oracle Corporation
 * Authors: Liam R. Howlett <Liam.Howlett@oracle.com>
 *            Matthew Wilcox <willy@infradead.org>
 * Copyright (c) 2023 ByteDance
 * Author: Peng Zhang <zhangpeng.00@bytedance.com>
 */

/*
 * DOC: Interesting implementation details of the Maple Tree
 *
 * Each node type has a number of slots for entries and a number of slots for
 * pivots.  In the case of dense nodes, the pivots are implied by the position
 * and are simply the slot index + the minimum of the node.
 *
 * In regular B-Tree terms, pivots are called keys.  The term pivot is used to
 * indicate that the tree is specifying ranges.  Pivots may appear in the
 * subtree with an entry attached to the value whereas keys are unique to a
 * specific position of a B-tree.  Pivot values are inclusive of the slot with
 * the same index.
 *
 *
 * The following illustrates the layout of a range64 node's slots and pivots.
 *
 *
 *  Slots -> | 0 | 1 | 2 | ... | 12 | 13 | 14 | 15 |
 *           ┬   ┬   ┬   ┬     ┬    ┬    ┬    ┬    ┬
 *           │   │   │   │     │    │    │    │    └─ Implied maximum
 *           │   │   │   │     │    │    │    └─ Pivot 14
 *           │   │   │   │     │    │    └─ Pivot 13
 *           │   │   │   │     │    └─ Pivot 12
 *           │   │   │   │     └─ Pivot 11
 *           │   │   │   └─ Pivot 2
 *           │   │   └─ Pivot 1
 *           │   └─ Pivot 0
 *           └─  Implied minimum
 *
 * Slot contents:
 *  Internal (non-leaf) nodes contain pointers to other nodes.
 *  Leaf nodes contain entries.
 *
 * The location of interest is often referred to as an offset.  All offsets have
 * a slot, but the last offset has an implied pivot from the node above (or
 * ULONG_MAX for the root node).
 *
 * Ranges complicate certain write activities.  When modifying any of
 * the B-tree variants, it is known that one entry will either be added or
 * deleted.  When modifying the Maple Tree, one store operation may overwrite
 * the entire data set, or one half of the tree, or the middle half of the tree.
 *
 */


#include <linux/maple_tree.h>
#include <linux/xarray.h>
#include <linux/types.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/limits.h>
#include <asm/barrier.h>

#define CREATE_TRACE_POINTS
#include <trace/events/maple_tree.h>

/* Low bit of a node's parent pointer; set when the node is the root. */
#define MA_ROOT_PARENT 1

/*
 * Maple state flags
 * * MA_STATE_BULK		- Bulk insert mode
 * * MA_STATE_REBALANCE		- Indicate a rebalance during bulk insert
 * * MA_STATE_PREALLOC		- Preallocated nodes, WARN_ON allocation
 */
#define MA_STATE_BULK		1
#define MA_STATE_REBALANCE	2
#define MA_STATE_PREALLOC	4

/*
 * Pointer conversion helpers.
 *
 * ma_parent_ptr(), ma_mnode_ptr() and ma_enode_ptr() only cast their argument.
 * mas_tree_parent() builds the parent value of a root node: the address of
 * the tree with MA_ROOT_PARENT set.  The argument is parenthesised before
 * the member access so that expressions such as "p + 1" or "&state" expand
 * with the intended precedence.
 */
#define ma_parent_ptr(x) ((struct maple_pnode *)(x))
#define mas_tree_parent(x) ((unsigned long)((x)->tree) | MA_ROOT_PARENT)
#define ma_mnode_ptr(x) ((struct maple_node *)(x))
#define ma_enode_ptr(x) ((struct maple_enode *)(x))
/* Slab cache that the mt_alloc_*() / mt_free_*() helpers draw maple nodes from. */
static struct kmem_cache *maple_node_cache;

/*
 * Debug-only table indexed by node type; mt_node_max() looks up the entry for
 * an encoded node.  Dense nodes map to MAPLE_NODE_SLOTS, every other type to
 * ULONG_MAX.
 * NOTE(review): presumably the largest index a node of that type may cover -
 * confirm against the debug code that uses it.
 */
#ifdef CONFIG_DEBUG_MAPLE_TREE
static const unsigned long mt_max[] = {
        [maple_dense]                = MAPLE_NODE_SLOTS,
        [maple_leaf_64]                = ULONG_MAX,
        [maple_range_64]        = ULONG_MAX,
        [maple_arange_64]        = ULONG_MAX,
};
#define mt_node_max(x) mt_max[mte_node_type(x)]
#endif

/*
 * Slot count for each node type, indexed by enum maple_type.
 * mt_slot_count() looks up the count for an encoded node.
 */
static const unsigned char mt_slots[] = {
        [maple_dense]                = MAPLE_NODE_SLOTS,
        [maple_leaf_64]                = MAPLE_RANGE64_SLOTS,
        [maple_range_64]        = MAPLE_RANGE64_SLOTS,
        [maple_arange_64]        = MAPLE_ARANGE64_SLOTS,
};
#define mt_slot_count(x) mt_slots[mte_node_type(x)]

/*
 * Pivot count for each node type, indexed by enum maple_type: one fewer than
 * the slot count for the range types and zero for dense nodes, whose pivots
 * are implied by position.  mt_pivot_count() looks up the count for an
 * encoded node.
 */
static const unsigned char mt_pivots[] = {
        [maple_dense]                = 0,
        [maple_leaf_64]                = MAPLE_RANGE64_SLOTS - 1,
        [maple_range_64]        = MAPLE_RANGE64_SLOTS - 1,
        [maple_arange_64]        = MAPLE_ARANGE64_SLOTS - 1,
};
#define mt_pivot_count(x) mt_pivots[mte_node_type(x)]

/*
 * Minimum slot count for each node type, indexed by enum maple_type.
 * mt_min_slot_count() looks up the value for an encoded node.
 * NOTE(review): presumably the occupancy below which a node is considered
 * under-filled - confirm against the callers.
 */
static const unsigned char mt_min_slots[] = {
        [maple_dense]                = MAPLE_NODE_SLOTS / 2,
        [maple_leaf_64]                = (MAPLE_RANGE64_SLOTS / 2) - 2,
        [maple_range_64]        = (MAPLE_RANGE64_SLOTS / 2) - 2,
        [maple_arange_64]        = (MAPLE_ARANGE64_SLOTS / 2) - 1,
};
#define mt_min_slot_count(x) mt_min_slots[mte_node_type(x)]

/*
 * Capacity of a maple_big_node: twice the range64 slot count plus two slots,
 * and twice the arange64 slot count plus one gap.
 */
#define MAPLE_BIG_NODE_SLOTS        (MAPLE_RANGE64_SLOTS * 2 + 2)
#define MAPLE_BIG_NODE_GAPS        (MAPLE_ARANGE64_SLOTS * 2 + 1)

/*
 * struct maple_big_node - An oversized node image.
 * NOTE(review): presumably a scratch buffer used while node contents are
 * combined or split - confirm against the write paths.
 *
 * The gap array shares storage with the slot array: it begins
 * MAPLE_BIG_NODE_GAPS entries into the union, after the padding member.
 */
struct maple_big_node {
        struct maple_pnode *parent;
        unsigned long pivot[MAPLE_BIG_NODE_SLOTS - 1];
        union {
                struct maple_enode *slot[MAPLE_BIG_NODE_SLOTS];
                struct {
                        unsigned long padding[MAPLE_BIG_NODE_GAPS];
                        unsigned long gap[MAPLE_BIG_NODE_GAPS];
                };
        };
        unsigned char b_end;
        enum maple_type type;
};

/*
 * The maple_subtree_state is used to build a tree to replace a segment of an
 * existing tree in a more atomic way.  Any walkers of the older tree will hit a
 * dead node and restart on updates.
 *
 * It bundles the maple states describing the original and the replacement
 * subtree together with the lists of nodes to dispose of afterwards.
 */
struct maple_subtree_state {
        struct ma_state *orig_l;        /* Original left side of subtree */
        struct ma_state *orig_r;        /* Original right side of subtree */
        struct ma_state *l;                /* New left side of subtree */
        struct ma_state *m;                /* New middle of subtree (rare) */
        struct ma_state *r;                /* New right side of subtree */
        struct ma_topiary *free;        /* nodes to be freed */
        struct ma_topiary *destroy;        /* Nodes to be destroyed (walked and freed) */
        /* NOTE(review): presumably the staging big node for the rebuild - confirm */
        struct maple_big_node *bn;
};

/*
 * noinline_for_kasan - function qualifier that expands to noinline_for_stack
 * when CONFIG_KASAN_STACK is enabled and to plain inline otherwise.
 */
#ifdef CONFIG_KASAN_STACK
/* Prevent mas_wr_bnode() from exceeding the stack frame limit */
#define noinline_for_kasan noinline_for_stack
#else
#define noinline_for_kasan inline
#endif

/* Functions */
/*
 * mt_alloc_one() - Allocate a single maple node from maple_node_cache.
 * @gfp: The allocation flags passed through to kmem_cache_alloc()
 *
 * Return: The result of kmem_cache_alloc().
 */
static inline struct maple_node *mt_alloc_one(gfp_t gfp)
{
        struct maple_node *node = kmem_cache_alloc(maple_node_cache, gfp);

        return node;
}

/*
 * mt_alloc_bulk() - Request several maple nodes from maple_node_cache at once.
 * @gfp: The allocation flags passed through to the slab allocator
 * @size: The number of nodes requested
 * @nodes: The array that receives the allocated node pointers
 *
 * Return: The result of kmem_cache_alloc_bulk().
 */
static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes)
{
        int allocated;

        allocated = kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes);
        return allocated;
}

/*
 * mt_free_one() - Hand a single maple node straight back to maple_node_cache.
 * @node: The node to free
 */
static inline void mt_free_one(struct maple_node *node)
{
        struct kmem_cache *cache = maple_node_cache;

        kmem_cache_free(cache, node);
}

/*
 * mt_free_bulk() - Return an array of maple nodes to maple_node_cache.
 * @size: The number of entries in @nodes
 * @nodes: The array of node pointers to free
 */
static inline void mt_free_bulk(size_t size, void __rcu **nodes)
{
        /* The slab API takes a plain pointer array; drop the __rcu marker. */
        void **plain = (void **)nodes;

        kmem_cache_free_bulk(maple_node_cache, size, plain);
}

static void mt_free_rcu(struct rcu_head *head)
{
        struct maple_node *node = container_of(head, struct maple_node, rcu);

        kmem_cache_free(maple_node_cache, node);
}

/*
 * ma_free_rcu() - Queue a maple node to be freed by an RCU callback.
 * @node: The node to free
 *
 * A node that is no longer in use has its parent pointer aimed at itself.
 * Warn when @node was not marked that way before being queued for freeing.
 */
static void ma_free_rcu(struct maple_node *node)
{
        struct maple_pnode *self = ma_parent_ptr(node);

        WARN_ON(self != node->parent);
        call_rcu(&node->rcu, mt_free_rcu);
}

static void mas_set_height(struct ma_state *mas)
{
        unsigned int new_flags = mas->tree->ma_flags;

        new_flags &= ~MT_FLAGS_HEIGHT_MASK;
        MAS_BUG_ON(mas, mas->depth > MAPLE_HEIGHT_MAX);
        new_flags |= mas->depth << MT_FLAGS_HEIGHT_OFFSET;
        mas->tree->ma_flags = new_flags;
}

/*
 * mas_mt_height() - Get the height of the tree a maple state operates on.
 * @mas: The maple state
 *
 * Return: The result of mt_height() for @mas->tree.
 */
static unsigned int mas_mt_height(struct ma_state *mas)
{
        struct maple_tree *mt = mas->tree;

        return mt_height(mt);
}

/*
 * mt_attr() - Get the tree flags without the height.
 * @mt: The maple tree
 *
 * Return: @mt->ma_flags with the MT_FLAGS_HEIGHT_MASK bits cleared.
 */
static inline unsigned int mt_attr(struct maple_tree *mt)
{
        unsigned int attrs = mt->ma_flags;

        attrs &= ~MT_FLAGS_HEIGHT_MASK;
        return attrs;
}

/*
 * mte_node_type() - Extract the node type from an encoded node pointer.
 * @entry: The encoded maple node
 *
 * Return: The type bits, i.e. the pointer value shifted right by
 * MAPLE_NODE_TYPE_SHIFT and masked with MAPLE_NODE_TYPE_MASK.
 */
static __always_inline enum maple_type mte_node_type(
                const struct maple_enode *entry)
{
        unsigned long bits = (unsigned long)entry;

        bits >>= MAPLE_NODE_TYPE_SHIFT;
        return bits & MAPLE_NODE_TYPE_MASK;
}

/*
 * ma_is_dense() - Check whether a node type is a dense type.
 * @type: The maple node type
 *
 * Return: true if @type orders before maple_leaf_64, false otherwise.
 */
static __always_inline bool ma_is_dense(const enum maple_type type)
{
        return !(type >= maple_leaf_64);
}

/*
 * ma_is_leaf() - Check whether a node type is a leaf type.
 * @type: The maple node type
 *
 * Return: true if @type orders before maple_range_64, false otherwise.
 */
static __always_inline bool ma_is_leaf(const enum maple_type type)
{
        return !(type >= maple_range_64);
}

static __always_inline bool mte_is_leaf(const struct maple_enode *entry)
{
        return ma_is_leaf(mte_node_type(entry));
}

/*
 * mt_is_reserved() - Check whether an entry falls in the reserved range.
 * @entry: The entry to test
 *
 * Besides the usual internal entries, values with the bottom two bits set to
 * '10' that are below 4096 are reserved.
 *
 * Return: true if @entry is below MAPLE_RESERVED_RANGE and xa_is_internal()
 * reports it as internal, false otherwise.
 */
static __always_inline bool mt_is_reserved(const void *entry)
{
        if ((unsigned long)entry >= MAPLE_RESERVED_RANGE)
                return false;

        return xa_is_internal(entry);
}

/*
 * mas_set_err() - Put a maple state into the error status.
 * @mas: The maple state
 * @err: The error code, stored in @mas->node via MA_ERROR()
 */
static __always_inline void mas_set_err(struct ma_state *mas, long err)
{
        mas->status = ma_error;
        mas->node = MA_ERROR(err);
}

/*
 * mas_is_ptr() - Check whether the maple state status is ma_root.
 * @mas: The maple state
 *
 * Return: true if @mas->status is ma_root, false otherwise.
 */
static __always_inline bool mas_is_ptr(const struct ma_state *mas)
{
        if (mas->status != ma_root)
                return false;

        return true;
}

/*
 * mas_is_start() - Check whether the maple state status is ma_start.
 * @mas: The maple state
 *
 * Return: true if @mas->status is ma_start, false otherwise.
 */
static __always_inline bool mas_is_start(const struct ma_state *mas)
{
        if (mas->status != ma_start)
                return false;

        return true;
}

/*
 * mas_is_none() - Check whether the maple state status is ma_none.
 * @mas: The maple state
 *
 * Return: true if @mas->status is ma_none, false otherwise.
 */
static __always_inline bool mas_is_none(const struct ma_state *mas)
{
        return !(mas->status != ma_none);
}

/*
 * mas_is_paused() - Check whether the maple state status is ma_pause.
 * @mas: The maple state
 *
 * Return: true if @mas->status is ma_pause, false otherwise.
 */
static __always_inline bool mas_is_paused(const struct ma_state *mas)
{
        return !(mas->status != ma_pause);
}

static __always_inline bool mas_is_overflow(struct ma_state *mas)
{
        return mas->status == ma_overflow;
}

static inline bool mas_is_underflow(struct ma_state *mas)
{
        return mas->status == ma_underflow;
}

/*
 * mte_to_node() - Convert an encoded node into a bare node pointer.
 * @entry: The encoded maple node
 *
 * Return: @entry with the MAPLE_NODE_MASK bits cleared.
 */
static __always_inline struct maple_node *mte_to_node(
                const struct maple_enode *entry)
{
        unsigned long addr = (unsigned long)entry;

        addr &= ~MAPLE_NODE_MASK;
        return (struct maple_node *)addr;
}

/*
 * mte_to_mat() - View an encoded maple node as a maple topiary node.
 * @entry: The maple encoded node
 *
 * Return: @entry with the MAPLE_NODE_MASK bits cleared, as a maple topiary
 * pointer.
 */
static inline struct maple_topiary *mte_to_mat(const struct maple_enode *entry)
{
        unsigned long addr = (unsigned long)entry;

        addr &= ~MAPLE_NODE_MASK;
        return (struct maple_topiary *)addr;
}

/*
 * mas_mn() - Get the node the maple state currently points at.
 * @mas: The maple state
 *
 * Return: @mas->node decoded by mte_to_node(), i.e. a bare (not encoded)
 * maple node pointer.
 */
static inline struct maple_node *mas_mn(const struct ma_state *mas)
{
        const struct maple_enode *encoded = mas->node;

        return mte_to_node(encoded);
}

/*
 * mte_set_node_dead() - Set a maple encoded node as dead.
 * @mn: The maple encoded node.
 *
 * A node is marked dead by aiming its parent pointer at the node itself.  The
 * write barrier that follows orders that store before any later stores.
 */
static inline void mte_set_node_dead(struct maple_enode *mn)
{
        /* A self-referencing parent pointer is the "dead" marker */
        mte_to_node(mn)->parent = ma_parent_ptr(mte_to_node(mn));
        smp_wmb(); /* Needed for RCU */
}

/*
 * Bits carried in the low part of an encoded node pointer; they are set and
 * cleared by mt_mk_node(), mte_mk_root(), mte_safe_root(), mte_set_full() and
 * mte_clear_full().
 */
/* Bit 1 indicates the root is a node */
#define MAPLE_ROOT_NODE                        0x02
/* maple_type stored bit 3-6 */
#define MAPLE_ENODE_TYPE_SHIFT                0x03
/* Bit 2 means a NULL somewhere below */
#define MAPLE_ENODE_NULL                0x04

/*
 * mt_mk_node() - Encode a bare node pointer together with its type.
 * @node: The maple node
 * @type: The maple node type
 *
 * Return: @node with @type stored at MAPLE_ENODE_TYPE_SHIFT and the
 * MAPLE_ENODE_NULL bit set.
 */
static inline struct maple_enode *mt_mk_node(const struct maple_node *node,
                                             enum maple_type type)
{
        unsigned long encoded = (unsigned long)node;

        encoded |= type << MAPLE_ENODE_TYPE_SHIFT;
        encoded |= MAPLE_ENODE_NULL;
        return (void *)encoded;
}

/*
 * mte_mk_root() - Mark an encoded node as the root node.
 * @node: The encoded maple node
 *
 * Return: @node with the MAPLE_ROOT_NODE bit set.
 */
static inline void *mte_mk_root(const struct maple_enode *node)
{
        unsigned long encoded = (unsigned long)node;

        encoded |= MAPLE_ROOT_NODE;
        return (void *)encoded;
}

/*
 * mte_safe_root() - Strip the root marker from an encoded node.
 * @node: The encoded maple node
 *
 * Return: @node with the MAPLE_ROOT_NODE bit cleared.
 */
static inline void *mte_safe_root(const struct maple_enode *node)
{
        unsigned long encoded = (unsigned long)node;

        encoded &= ~MAPLE_ROOT_NODE;
        return (void *)encoded;
}

/*
 * mte_set_full() - Flag an encoded node as having no NULL below it.
 * @node: The encoded maple node
 *
 * Return: @node with the MAPLE_ENODE_NULL bit cleared.
 */
static inline void *mte_set_full(const struct maple_enode *node)
{
        unsigned long encoded = (unsigned long)node;

        encoded &= ~MAPLE_ENODE_NULL;
        return (void *)encoded;
}

/*
 * mte_clear_full() - Flag an encoded node as having a NULL somewhere below.
 * @node: The encoded maple node
 *
 * Return: @node with the MAPLE_ENODE_NULL bit set.
 */
static inline void *mte_clear_full(const struct maple_enode *node)
{
        unsigned long encoded = (unsigned long)node;

        encoded |= MAPLE_ENODE_NULL;
        return (void *)encoded;
}

/*
 * mte_has_null() - Check the NULL marker of an encoded node.
 * @node: The encoded maple node
 *
 * Return: true if the MAPLE_ENODE_NULL bit is set in @node, false otherwise.
 */
static inline bool mte_has_null(const struct maple_enode *node)
{
        const unsigned long encoded = (unsigned long)node;

        return (encoded & MAPLE_ENODE_NULL) != 0;
}

/*
 * ma_is_root() - Check whether a node is the root of its tree.
 * @node: The maple node
 *
 * Return: true if the MA_ROOT_PARENT bit is set in @node->parent, false
 * otherwise.
 */
static __always_inline bool ma_is_root(struct maple_node *node)
{
        const unsigned long parent = (unsigned long)node->parent;

        return (parent & MA_ROOT_PARENT) != 0;
}

/*
 * mte_is_root() - Check whether an encoded node is the root of its tree.
 * @node: The encoded maple node
 *
 * Return: The result of ma_is_root() for the decoded node.
 */
static __always_inline bool mte_is_root(const struct maple_enode *node)
{
        struct maple_node *bare = mte_to_node(node);

        return ma_is_root(bare);
}

/*
 * mas_is_root_limits() - Check whether the maple state spans the full range.
 * @mas: The maple state
 *
 * Return: true if @mas->min is 0 and @mas->max is ULONG_MAX, false otherwise.
 */
static inline bool mas_is_root_limits(const struct ma_state *mas)
{
        if (mas->min)
                return false;

        return mas->max == ULONG_MAX;
}

/*
 * mt_is_alloc() - Check whether a tree was created as an allocation tree.
 * @mt: The maple tree
 *
 * Return: true if MT_FLAGS_ALLOC_RANGE is set in @mt->ma_flags, false
 * otherwise.
 */
static __always_inline bool mt_is_alloc(struct maple_tree *mt)
{
        return (mt->ma_flags & MT_FLAGS_ALLOC_RANGE) != 0;
}

/*
 * The Parent Pointer
 * Excluding root, the parent pointer is 256B aligned like all other tree nodes.
 * When storing 32 or 64 bit values, the offset can fit into 5 bits.  The 16
 * bit values need an extra bit to store the offset.  This extra bit comes from
 * a reuse of the last bit in the node type.  This is possible by using bit 1 to
 * indicate if bit 2 is part of the type or the slot.
 *
 * Node types (low three bits of the parent pointer, in binary):
 *  0b??1 = Root
 *  0b?00 = 16 bit nodes
 *  0b010 = 32 bit nodes
 *  0b110 = 64 bit nodes
 *
 * Slot size and alignment
 *  0b??1 : Root
 *  0b?00 : 16 bit values, type in 0-1, slot in 2-7
 *  0b010 : 32 bit values, type in 0-2, slot in 3-7
 *  0b110 : 64 bit values, type in 0-2, slot in 3-7
 */

/* Bit 0 of the parent pointer: the node is the root */
#define MAPLE_PARENT_ROOT                0x01

/* Slot field of the parent pointer for everything but 16 bit parents */
#define MAPLE_PARENT_SLOT_SHIFT                0x03
#define MAPLE_PARENT_SLOT_MASK                0xF8

/* Slot field of the parent pointer for 16 bit parents (one bit wider) */
#define MAPLE_PARENT_16B_SLOT_SHIFT        0x02
#define MAPLE_PARENT_16B_SLOT_MASK        0xFC

/* Parent type encodings; bit 1 clear (MAPLE_PARENT_NOT_RANGE16) means 16 bit */
#define MAPLE_PARENT_RANGE64                0x06
#define MAPLE_PARENT_RANGE32                0x04
#define MAPLE_PARENT_NOT_RANGE16        0x02

/*
 * mte_parent_shift() - Get the parent shift for the slot storage.
 * @parent: The parent pointer cast as an unsigned long
 *
 * Bit 1 (MAPLE_PARENT_NOT_RANGE16) being clear means a 16 bit parent, which
 * stores the slot one bit lower than every other parent type.
 *
 * Return: The shift into that pointer to the start of the slot
 */
static inline unsigned long mte_parent_shift(unsigned long parent)
{
        return likely(parent & MAPLE_PARENT_NOT_RANGE16) ?
                MAPLE_PARENT_SLOT_SHIFT : MAPLE_PARENT_16B_SLOT_SHIFT;
}

/*
 * mte_parent_slot_mask() - Get the slot mask for the parent.
 * @parent: The parent pointer cast as an unsigned long.
 *
 * Bit 1 (MAPLE_PARENT_NOT_RANGE16) being clear means a 16 bit parent, which
 * uses the wider MAPLE_PARENT_16B_SLOT_MASK.
 *
 * Return: The slot mask for that parent.
 */
static inline unsigned long mte_parent_slot_mask(unsigned long parent)
{
        return likely(parent & MAPLE_PARENT_NOT_RANGE16) ?
                MAPLE_PARENT_SLOT_MASK : MAPLE_PARENT_16B_SLOT_MASK;
}

/*
 * mas_parent_type() - Decode the maple_type of a node's parent from the type
 * bits stored in the node's parent pointer.
 * @mas: The maple state
 * @enode: The maple_enode whose parent type is wanted
 *
 * Return: maple_arange_64 or maple_range_64 (depending on whether the tree is
 * an allocation tree) when the stored type is MAPLE_PARENT_RANGE64.  0 when
 * @enode is the root, which also triggers a warning, or when the stored type
 * is anything else.
 */
static inline
enum maple_type mas_parent_type(struct ma_state *mas, struct maple_enode *enode)
{
        unsigned long parent = (unsigned long)mte_to_node(enode)->parent;
        unsigned long type_bits;

        if (WARN_ON(parent & MAPLE_PARENT_ROOT))
                return 0;

        /* Keep only the metadata bits, then drop the slot field from them */
        type_bits = parent & MAPLE_NODE_MASK;
        type_bits &= ~mte_parent_slot_mask(type_bits);

        /* MAPLE_PARENT_RANGE64 doubles as the encoding for arange64 parents */
        if (type_bits != MAPLE_PARENT_RANGE64)
                return 0;

        return mt_is_alloc(mas->tree) ? maple_arange_64 : maple_range_64;
}

/*
 * mas_set_parent() - Set the parent node and encode the slot
 * @mas: The maple state (used for bug reporting only)
 * @enode: The encoded maple node.
 * @parent: The encoded maple node that is the parent of @enode.
 * @slot: The slot that @enode resides in @parent.
 *
 * The parent pointer stored in @enode is the bare address of @parent with the
 * slot number and the parent type packed into its low bits.  Only range64 and
 * arange64 parents are valid; a dense or leaf parent is a bug.
 */
static inline
void mas_set_parent(struct ma_state *mas, struct maple_enode *enode,
                    const struct maple_enode *parent, unsigned char slot)
{
        const enum maple_type p_type = mte_node_type(parent);
        unsigned long encoded = (unsigned long)parent;
        unsigned long shift = 0;
        unsigned long type = 0;

        MAS_BUG_ON(mas, p_type == maple_dense);
        MAS_BUG_ON(mas, p_type == maple_leaf_64);

        if (p_type == maple_range_64 || p_type == maple_arange_64) {
                shift = MAPLE_PARENT_SLOT_SHIFT;
                type = MAPLE_PARENT_RANGE64;
        }

        encoded &= ~MAPLE_NODE_MASK; /* Clear all node metadata in parent */
        encoded |= (slot << shift) | type;
        mte_to_node(enode)->parent = ma_parent_ptr(encoded);
}

/*
 * mte_parent_slot() - get the parent slot of @enode.
 * @enode: The encoded maple node.
 *
 * Return: The slot in the parent node where @enode resides, or 0 if @enode is
 * the root.
 */
static __always_inline
unsigned int mte_parent_slot(const struct maple_enode *enode)
{
        const unsigned long parent = (unsigned long)mte_to_node(enode)->parent;
        unsigned long slot_bits;

        if (unlikely(parent & MA_ROOT_PARENT))
                return 0;

        /*
         * Masking with MAPLE_PARENT_16B_SLOT_MASK is fine for every parent
         * type: the extra low bit it keeps is shifted out again whenever the
         * parent shift is MAPLE_PARENT_SLOT_SHIFT.
         */
        slot_bits = parent & MAPLE_PARENT_16B_SLOT_MASK;
        return slot_bits >> mte_parent_shift(parent);
}

/*
 * mte_parent() - Get the parent of @enode.
 * @enode: The encoded maple node.
 *
 * Return: The parent pointer stored in the node with the MAPLE_NODE_MASK bits
 * cleared.
 */
static __always_inline
struct maple_node *mte_parent(const struct maple_enode *enode)
{
        unsigned long parent = (unsigned long)(mte_to_node(enode)->parent);

        parent &= ~MAPLE_NODE_MASK;
        return (void *)parent;
}

/*
 * ma_dead_node() - check if the @node is dead.
 * @node: The (bare, not encoded) maple node
 *
 * A node is dead when its parent pointer, stripped of the metadata bits,
 * points back at the node itself.
 *
 * Return: true if dead, false otherwise.
 */
static __always_inline bool ma_dead_node(const struct maple_node *node)
{
        struct maple_node *parent;

        /* Do not reorder reads from the node prior to the parent check */
        smp_rmb();
        parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK);
        return (parent == node);
}

/*
 * mte_dead_node() - check if the @enode is dead.
 * @enode: The encoded maple node
 *
 * A node is dead when the parent returned by mte_parent() is the node itself.
 *
 * Return: true if dead, false otherwise.
 */
static __always_inline bool mte_dead_node(const struct maple_enode *enode)
{
        struct maple_node *parent, *node;

        node = mte_to_node(enode);
        /* Do not reorder reads from the node prior to the parent check */
        smp_rmb();
        parent = mte_parent(enode);
        return (parent == node);
}

/*
 * mas_allocated() - Get the number of nodes allocated in a maple state.
 * @mas: The maple state
 *
 * @mas->alloc does double duty: it is either a pointer to the first allocated
 * node, or, when bit 0 is set, an encoded count of requested nodes.  Only a
 * real node carries the running total of allocated nodes.
 *
 * Return: The total number of nodes allocated, 0 if no node is allocated.
 */
static inline unsigned long mas_allocated(const struct ma_state *mas)
{
        if (!mas->alloc)
                return 0;

        /* Bit 0 set: this is an encoded request count, not a node */
        if ((unsigned long)mas->alloc & 0x1)
                return 0;

        return mas->alloc->total;
}

/*
 * mas_set_alloc_req() - Set the requested number of allocations.
 * @mas: the maple state
 * @count: the number of allocations.
 *
 * When a node is already allocated, the request is recorded in that node's
 * request_count.  Otherwise the request is encoded into @mas->alloc itself as
 * (@count << 1) | 1, or @mas->alloc is reset to NULL for a request of zero.
 */
static inline void mas_set_alloc_req(struct ma_state *mas, unsigned long count)
{
        if (mas->alloc && !((unsigned long)mas->alloc & 0x1)) {
                mas->alloc->request_count = count;
                return;
        }

        mas->alloc = count ? (struct maple_alloc *)((count << 1U) | 1U) : NULL;
}

/*
 * mas_alloc_req() - get the requested number of allocations.
 * @mas: The maple state
 *
 * The request count is encoded directly in @mas->alloc when bit 0 is set, and
 * held in @mas->alloc->request_count when a node has been allocated.
 *
 * Return: The allocation request count, 0 if @mas->alloc is NULL.
 */
static inline unsigned int mas_alloc_req(const struct ma_state *mas)
{
        const unsigned long raw = (unsigned long)mas->alloc;

        if (raw & 0x1)
                return raw >> 1;

        if (mas->alloc)
                return mas->alloc->request_count;

        return 0;
}

/*
 * ma_pivots() - Get a pointer to the maple node pivots.
 * @node: The maple node
 * @type: The node type
 *
 * In the event of a dead node, this array may be %NULL
 *
 * Return: The pivot array matching @type, or NULL for dense nodes and for
 * any unrecognised type.
 */
static inline unsigned long *ma_pivots(struct maple_node *node,
                                           enum maple_type type)
{
        if (type == maple_arange_64)
                return node->ma64.pivot;

        if (type == maple_range_64 || type == maple_leaf_64)
                return node->mr64.pivot;

        return NULL;
}

/*
 * ma_gaps() - Get a pointer to the maple node gaps.
 * @node: The maple node
 * @type: The node type
 *
 * Return: The gap array for maple_arange_64 nodes, NULL for every other type.
 */
static inline unsigned long *ma_gaps(struct maple_node *node,
                                     enum maple_type type)
{
        return type == maple_arange_64 ? node->ma64.gap : NULL;
}

/*
 * mas_safe_pivot() - get the pivot at @piv or mas->max.
 * @mas: The maple state
 * @pivots: The pointer to the maple node pivots
 * @piv: The pivot to fetch
 * @type: The maple node type
 *
 * Return: @pivots[@piv] while @piv is below the pivot count of @type,
 * @mas->max otherwise.
 */
static __always_inline unsigned long
mas_safe_pivot(const struct ma_state *mas, unsigned long *pivots,
               unsigned char piv, enum maple_type type)
{
        if (piv < mt_pivots[type])
                return pivots[piv];

        return mas->max;
}

/*
 * mas_safe_min() - Return the minimum for a given offset.
 * @mas: The maple state
 * @pivots: The pointer to the maple node pivots
 * @offset: The offset into the pivot array
 *
 * Return: One more than the previous pivot, or @mas->min for offset 0, which
 * has no previous pivot.
 */
static inline unsigned long
mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset)
{
        return likely(offset) ? pivots[offset - 1] + 1 : mas->min;
}

/*
 * mte_set_pivot() - Set a pivot to a value in an encoded maple node.
 * @mn: The encoded maple node
 * @piv: The pivot offset
 * @val: The value of the pivot
 *
 * @piv must be below the pivot count of the node type; anything else is a
 * bug.  Node types without a pivot array are left untouched.
 */
static inline void mte_set_pivot(struct maple_enode *mn, unsigned char piv,
                                unsigned long val)
{
        const enum maple_type type = mte_node_type(mn);
        struct maple_node *node = mte_to_node(mn);

        BUG_ON(piv >= mt_pivots[type]);

        if (type == maple_arange_64)
                node->ma64.pivot[piv] = val;
        else if (type == maple_range_64 || type == maple_leaf_64)
                node->mr64.pivot[piv] = val;
}

/*
 * ma_slots() - Get a pointer to the maple node slots.
 * @mn: The maple node
 * @mt: The maple node type
 *
 * Return: The slot array matching @mt, or NULL for an unrecognised type.
 */
static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt)
{
        if (mt == maple_arange_64)
                return mn->ma64.slot;

        if (mt == maple_range_64 || mt == maple_leaf_64)
                return mn->mr64.slot;

        if (mt == maple_dense)
                return mn->slot;

        return NULL;
}

/*
 * mt_write_locked() - Check that the tree is locked for writing.
 * @mt: The maple tree
 *
 * Return: For a tree with an external lock, the result of
 * mt_write_lock_is_held(); otherwise whether lockdep sees @mt->ma_lock held.
 */
static inline bool mt_write_locked(const struct maple_tree *mt)
{
        if (mt_external_lock(mt))
                return mt_write_lock_is_held(mt);

        return lockdep_is_held(&mt->ma_lock);
}

/*
 * mt_locked() - Check that the tree is locked.
 * @mt: The maple tree
 *
 * Return: For a tree with an external lock, the result of mt_lock_is_held();
 * otherwise whether lockdep sees @mt->ma_lock held.
 */
static __always_inline bool mt_locked(const struct maple_tree *mt)
{
        if (mt_external_lock(mt))
                return mt_lock_is_held(mt);

        return lockdep_is_held(&mt->ma_lock);
}

/*
 * mt_slot() - Read a slot, under RCU or with the tree locked.
 * @mt: The maple tree
 * @slots: The pointer to the slots
 * @offset: The offset into the slots array to fetch
 *
 * Return: The entry stored in @slots at the @offset, read with
 * rcu_dereference_check() using mt_locked() as the lock condition.
 */
static __always_inline void *mt_slot(const struct maple_tree *mt,
                void __rcu **slots, unsigned char offset)
{
        void *entry = rcu_dereference_check(slots[offset], mt_locked(mt));

        return entry;
}

/*
 * mt_slot_locked() - Read a slot while holding the tree's write lock.
 * @mt: The maple tree
 * @slots: The pointer to the slots
 * @offset: The offset into the slots array to fetch
 *
 * Return: The entry stored in @slots at the @offset, read with
 * rcu_dereference_protected() using mt_write_locked() as the lock condition.
 */
static __always_inline void *mt_slot_locked(struct maple_tree *mt,
                void __rcu **slots, unsigned char offset)
{
        void *entry;

        entry = rcu_dereference_protected(slots[offset], mt_write_locked(mt));
        return entry;
}
/*
 * mas_slot_locked() - Get the slot value when holding the maple tree lock.
 * @mas: The maple state
 * @slots: The pointer to the slots
 * @offset: The offset into the slots array to fetch
 *
 * Thin wrapper around mt_slot_locked() for the tree of @mas.
 *
 * Return: The entry stored in @slots at the @offset.
 */
static __always_inline void *mas_slot_locked(struct ma_state *mas,
                void __rcu **slots, unsigned char offset)
{
        struct maple_tree *mt = mas->tree;

        return mt_slot_locked(mt, slots, offset);
}

/*
 * mas_slot() - Get the slot value when not holding the maple tree lock.
 * @mas: The maple state
 * @slots: The pointer to the slots
 * @offset: The offset into the slots array to fetch
 *
 * Thin wrapper around mt_slot() for the tree of @mas.
 *
 * Return: The entry stored in @slots at the @offset
 */
static __always_inline void *mas_slot(struct ma_state *mas, void __rcu **slots,
                unsigned char offset)
{
        const struct maple_tree *mt = mas->tree;

        return mt_slot(mt, slots, offset);
}

/*
 * mas_root() - Get the maple tree root.
 * @mas: The maple state.
 *
 * Return: The pointer to the root of the tree, read with
 * rcu_dereference_check() using mt_locked() as the lock condition.
 */
static __always_inline void *mas_root(struct ma_state *mas)
{
        void *root;

        root = rcu_dereference_check(mas->tree->ma_root, mt_locked(mas->tree));
        return root;
}

/*
 * mt_root_locked() - Get the tree root while holding the tree's write lock.
 * @mt: The maple tree
 *
 * Return: The pointer to the root of the tree, read with
 * rcu_dereference_protected() using mt_write_locked() as the lock condition.
 */
static inline void *mt_root_locked(struct maple_tree *mt)
{
        void *root = rcu_dereference_protected(mt->ma_root, mt_write_locked(mt));

        return root;
}

/*
 * mas_root_locked() - Get the maple tree root when holding the maple tree lock.
 * @mas: The maple state.
 *
 * Thin wrapper around mt_root_locked() for the tree of @mas.
 *
 * Return: The pointer to the root of the tree
 */
static inline void *mas_root_locked(struct ma_state *mas)
{
        struct maple_tree *mt = mas->tree;

        return mt_root_locked(mt);
}

/*
 * ma_meta() - Get the metadata structure of a node.
 * @mn: The maple node
 * @mt: The maple node type
 *
 * Return: The metadata of the ma64 layout for maple_arange_64 nodes, the
 * metadata of the mr64 layout for every other type.
 */
static inline struct maple_metadata *ma_meta(struct maple_node *mn,
                                             enum maple_type mt)
{
        if (mt == maple_arange_64)
                return &mn->ma64.meta;

        return &mn->mr64.meta;
}

/*
 * ma_set_meta() - Store both metadata fields of a node.
 * @mn: The maple node
 * @mt: The maple node type
 * @offset: The offset of the highest sub-gap in this node.
 * @end: The end of the data in this node.
 */
static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt,
                               unsigned char offset, unsigned char end)
{
        struct maple_metadata *md;

        md = ma_meta(mn, mt);
        md->gap = offset;
        md->end = end;
}

/*
 * mt_clear_meta() - clear the metadata information of a node, if it exists
 * @mt: The maple tree
 * @mn: The maple node
 * @type: The maple node type
 *
 * Only maple_range_64 and maple_arange_64 nodes are touched.  A
 * maple_range_64 node whose second to last pivot is set is left alone when
 * its last slot decodes to a node with a non-zero node type.
 */
static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn,
                                  enum maple_type type)
{
        struct maple_metadata *meta;
        void *last;

        if (type != maple_range_64 && type != maple_arange_64)
                return;

        if (type == maple_range_64 &&
            unlikely(mn->mr64.pivot[MAPLE_RANGE64_SLOTS - 2])) {
                last = mt_slot_locked(mt, mn->mr64.slot,
                                      MAPLE_RANGE64_SLOTS - 1);
                if (unlikely(mte_to_node(last) && mte_node_type(last)))
                        return; /* no metadata, could be node */
        }

        meta = ma_meta(mn, type);
        meta->gap = 0;
        meta->end = 0;
}

/*
 * ma_meta_end() - Get the data end of a node from the metadata
 * @mn: The maple node
 * @mt: The maple node type
 *
 * Return: The end value stored in the node metadata.
 */
static inline unsigned char ma_meta_end(struct maple_node *mn,
                                        enum maple_type mt)
{
        return ma_meta(mn, mt)->end;
}

/*
 * ma_meta_gap() - Get the largest gap location of a node from the metadata
 * @mn: The maple node
 */
static inline unsigned char ma_meta_gap(struct maple_node *mn)
{
        return mn->ma64.meta.gap;
}

/*
 * ma_set_meta_gap() - Set the largest gap location in a node's metadata
 * @mn: The maple node
 * @mt: The maple node type
 * @offset: The location of the largest gap.
 */
static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt,
                                   unsigned char offset)
{
        ma_meta(mn, mt)->gap = offset;
}

/*
 * mat_add() - Mark @dead_enode dead and append it to a ma_topiary list.
 * @mat: the ma_topiary, a linked list of dead nodes.
 * @dead_enode: the node to be marked as dead and added to the tail of the list
 *
 * The node becomes both head and tail when the list is empty; otherwise it is
 * linked behind the current tail.
 */
static inline void mat_add(struct ma_topiary *mat,
                           struct maple_enode *dead_enode)
{
        mte_set_node_dead(dead_enode);
        mte_to_mat(dead_enode)->next = NULL;

        if (mat->tail)
                mte_to_mat(mat->tail)->next = dead_enode;
        else
                mat->head = dead_enode;

        mat->tail = dead_enode;
}

/* Forward declarations for the helpers used by mas_mat_destroy() below. */
static void mt_free_walk(struct rcu_head *head);
static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt,
                            bool free);
/*
 * mas_mat_destroy() - Free all nodes and subtrees in a dead list.
 * @mas - the maple state
 * @mat - the ma_topiary linked list of dead nodes to free.
 *
 * Destroy walk a dead list.
 */
static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat)
{
        struct maple_enode *next;
        struct maple_node *node;
        bool in_rcu = mt_in_rcu(mas->tree);

        while (mat->head) {
                next = mte_to_mat(mat->head)->next;
                node = mte_to_node(mat->head);
                mt_destroy_walk(mat->head, mas->tree, !in_rcu);
                if (in_rcu)
                        call_rcu(&node->rcu, mt_free_walk);
                mat->head = next;
        }
}
/*
 * mas_descend() - Descend into the slot stored in the ma_state.
 * @mas - the maple state.
 *
 * Note: Not RCU safe, only use in write side or debug code.
 */
static inline void mas_descend(struct ma_state *mas)
{
        enum maple_type type;
        unsigned long *pivots;
        struct maple_node *node;
        void __rcu **slots;

        node = mas_mn(mas);
        type = mte_node_type(mas->node);
        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);

        if (mas->offset)
                mas->min = pivots[mas->offset - 1] + 1;
        mas->max = mas_safe_pivot(mas, pivots, mas->offset, type);
        mas->node = mas_slot(mas, slots, mas->offset);
}

/*
 * mte_set_gap() - Set a maple node gap.
 * @mn: The encoded maple node
 * @gap: The offset of the gap to set
 * @val: The gap value
 *
 * Only maple_arange_64 nodes are written; any other node type is ignored.
 */
static inline void mte_set_gap(const struct maple_enode *mn,
                                 unsigned char gap, unsigned long val)
{
        if (mte_node_type(mn) != maple_arange_64)
                return;

        mte_to_node(mn)->ma64.gap[gap] = val;
}

/*
 * mas_ascend() - Walk up a level of the tree.
 * @mas: The maple state
 *
 * Sets the @mas->max and @mas->min to the correct values when walking up.  This
 * may cause several levels of walking up to find the correct min and max.
 * May find a dead node which will cause a premature return.
 *
 * If @mas is already at the root node, only @mas->offset is reset to 0.
 * Otherwise @mas->offset is set to the slot the old node had in its parent and
 * @mas->node becomes the parent.
 *
 * Return: 1 on dead node, 0 otherwise
 */
static int mas_ascend(struct ma_state *mas)
{
        struct maple_enode *p_enode; /* parent enode. */
        struct maple_enode *a_enode; /* ancestor enode. */
        struct maple_node *a_node; /* ancestor node. */
        struct maple_node *p_node; /* parent node. */
        unsigned char a_slot;
        enum maple_type a_type;
        unsigned long min, max;
        unsigned long *pivots;
        bool set_max = false, set_min = false;

        a_node = mas_mn(mas);
        if (ma_is_root(a_node)) {
                /* Nothing above the root; min and max are left untouched. */
                mas->offset = 0;
                return 0;
        }

        p_node = mte_parent(mas->node);
        /* A node that is its own parent is reported as dead. */
        if (unlikely(a_node == p_node))
                return 1;

        a_type = mas_parent_type(mas, mas->node);
        mas->offset = mte_parent_slot(mas->node);
        a_enode = mt_mk_node(p_node, a_type);

        /* Check to make sure all parent information is still accurate */
        if (p_node != mte_parent(mas->node))
                return 1;

        mas->node = a_enode;

        if (mte_is_root(a_enode)) {
                /* The root always spans the whole index range. */
                mas->max = ULONG_MAX;
                mas->min = 0;
                return 0;
        }

        min = 0;
        max = ULONG_MAX;
        if (!mas->offset) {
                /* The child sat in slot 0: reuse its minimum for the parent. */
                min = mas->min;
                set_min = true;
        }

        /* A maximum of ULONG_MAX is kept as is; no pivot lookup is needed. */
        if (mas->max == ULONG_MAX)
                set_max = true;

        /*
         * Climb the ancestors until both limits have been found in a pivot
         * or the root is reached, in which case the defaults above are used
         * for whatever is still unset.
         */
        do {
                p_enode = a_enode;
                a_type = mas_parent_type(mas, p_enode);
                a_node = mte_parent(p_enode);
                a_slot = mte_parent_slot(p_enode);
                a_enode = mt_mk_node(a_node, a_type);
                pivots = ma_pivots(a_node, a_type);

                if (unlikely(ma_dead_node(a_node)))
                        return 1;

                if (!set_min && a_slot) {
                        set_min = true;
                        min = pivots[a_slot - 1] + 1;
                }

                if (!set_max && a_slot < mt_pivots[a_type]) {
                        set_max = true;
                        max = pivots[a_slot];
                }

                /*
                 * NOTE(review): the dead check is repeated after the pivot
                 * reads, presumably so values read from a node that died in
                 * the meantime are discarded - confirm.
                 */
                if (unlikely(ma_dead_node(a_node)))
                        return 1;

                if (unlikely(ma_is_root(a_node)))
                        break;

        } while (!set_min || !set_max);

        mas->max = max;
        mas->min = min;
        return 0;
}

/*
 * mas_pop_node() - Get a previously allocated maple node from the maple state.
 * @mas: The maple state
 *
 * Warns and returns NULL if the maple state holds no allocated nodes.  A
 * non-zero outstanding allocation request is incremented by one.
 *
 * Return: A pointer to a zeroed maple node, or NULL if none are allocated.
 */
static inline struct maple_node *mas_pop_node(struct ma_state *mas)
{
        struct maple_alloc *head = mas->alloc;
        struct maple_alloc *popped;
        unsigned long total = mas_allocated(mas);
        unsigned int req = mas_alloc_req(mas);

        /* Nothing allocated. */
        if (WARN_ON(!total))
                return NULL;

        if (total == 1) {
                /* single allocation in this ma_state */
                mas->alloc = NULL;
                popped = head;
        } else if (head->node_count == 1) {
                /* Single allocation in this node: slot 0 is the new head. */
                mas->alloc = head->slot[0];
                mas->alloc->total = head->total - 1;
                popped = head;
        } else {
                /* Hand out the last node stored in the head's slots. */
                head->total--;
                popped = head->slot[--head->node_count];
                head->slot[head->node_count] = NULL;
        }

        if (req) {
                req++;
                mas_set_alloc_req(mas, req);
        }

        memset(popped, 0, sizeof(*popped));
        return (struct maple_node *)popped;
}

/*
 * mas_push_node() - Push a node back on the maple state allocation.
 * @mas: The maple state
 * @used: The used maple node
 *
 * Stores the maple node back into @mas->alloc for reuse.  When the current
 * head has a free slot the node is stored there; otherwise @used becomes the
 * new head and links to the previous head through slot 0.  An outstanding
 * request count greater than one is decremented.
 */
static inline void mas_push_node(struct ma_state *mas, struct maple_node *used)
{
        struct maple_alloc *reuse = (struct maple_alloc *)used;
        struct maple_alloc *head = mas->alloc;
        unsigned int requested = mas_alloc_req(mas);
        unsigned long count = mas_allocated(mas);

        reuse->request_count = 0;
        reuse->node_count = 0;

        if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) {
                head->slot[head->node_count++] = reuse;
                head->total++;
        } else {
                reuse->total = 1;
                /*
                 * NOTE(review): a head value with the low bit set is not
                 * followed as a pointer; presumably it encodes a request
                 * count - confirm.
                 */
                if ((head) && !((unsigned long)head & 0x1)) {
                        reuse->slot[0] = head;
                        reuse->node_count = 1;
                        reuse->total += head->total;
                }

                mas->alloc = reuse;
        }

        if (requested > 1)
                mas_set_alloc_req(mas, requested - 1);
}

/*
 * mas_alloc_nodes() - Allocate nodes into a maple state
 * @mas: The maple state
 * @gfp: The GFP Flags
 *
 * Allocates the number of nodes currently requested in @mas.  Does nothing
 * when no request is pending.  On allocation failure the remaining request
 * count is stored back in @mas and the error is set to -ENOMEM.
 */
static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
{
        struct maple_alloc *node;
        unsigned long allocated = mas_allocated(mas);
        unsigned int requested = mas_alloc_req(mas);
        unsigned int count;
        void **slots = NULL;
        unsigned int max_req = 0;

        if (!requested)
                return;

        /* Clear the request; the failure paths below store it back. */
        mas_set_alloc_req(mas, 0);
        if (mas->mas_flags & MA_STATE_PREALLOC) {
                /* Preallocated state: use what is there, never allocate. */
                if (allocated)
                        return;
                BUG_ON(!allocated);
                WARN_ON(!allocated);
        }

        /* A new head is needed if nothing is allocated or the head is full. */
        if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) {
                node = (struct maple_alloc *)mt_alloc_one(gfp);
                if (!node)
                        goto nomem_one;

                if (allocated) {
                        /* Chain the previous head behind the new one. */
                        node->slot[0] = mas->alloc;
                        node->node_count = 1;
                } else {
                        node->node_count = 0;
                }

                mas->alloc = node;
                node->total = ++allocated;
                requested--;
        }

        node = mas->alloc;
        node->request_count = 0;
        while (requested) {
                /* Bulk allocate into the free slots of the current node. */
                max_req = MAPLE_ALLOC_SLOTS - node->node_count;
                slots = (void **)&node->slot[node->node_count];
                max_req = min(requested, max_req);
                count = mt_alloc_bulk(gfp, max_req, slots);
                if (!count)
                        goto nomem_bulk;

                /*
                 * Slot 0 was just filled by the bulk allocation; initialise
                 * its counters as it is the node filled on the next pass.
                 */
                if (node->node_count == 0) {
                        node->slot[0]->node_count = 0;
                        node->slot[0]->request_count = 0;
                }

                node->node_count += count;
                allocated += count;
                node = node->slot[0];
                requested -= count;
        }
        mas->alloc->total = allocated;
        return;

nomem_bulk:
        /* Clean up potential freed allocations on bulk failure */
        memset(slots, 0, max_req * sizeof(unsigned long));
nomem_one:
        /* Record what is still outstanding and report the failure. */
        mas_set_alloc_req(mas, requested);
        if (mas->alloc && !(((unsigned long)mas->alloc & 0x1)))
                mas->alloc->total = allocated;
        mas_set_err(mas, -ENOMEM);
}

/*
 * mas_free() - Free an encoded maple node
 * @mas: The maple state
 * @used: The encoded maple node to free.
 *
 * Frees through ma_free_rcu() when the tree is in RCU mode; otherwise the
 * node is pushed back on the maple state allocations for reuse.
 */
static inline void mas_free(struct ma_state *mas, struct maple_enode *used)
{
        struct maple_node *node = mte_to_node(used);

        if (!mt_in_rcu(mas->tree)) {
                mas_push_node(mas, node);
                return;
        }

        ma_free_rcu(node);
}

/*
 * mas_node_count_gfp() - Ensure at least @count nodes are allocated, requesting
 * the missing ones when there are not enough.
 * @mas: The maple state
 * @count: The number of nodes needed
 * @gfp: the gfp flags
 */
static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp)
{
        unsigned long have = mas_allocated(mas);

        if (have >= count)
                return;

        mas_set_alloc_req(mas, count - have);
        mas_alloc_nodes(mas, gfp);
}

/*
 * mas_node_count() - Check if enough nodes are allocated and request more if
 * there are not enough nodes.
 * @mas: The maple state
 * @count: The number of nodes needed
 *
 * Note: Uses GFP_NOWAIT | __GFP_NOWARN for gfp flags.
 */
static void mas_node_count(struct ma_state *mas, int count)
{
        /* void function: call the helper, do not "return" its void result. */
        mas_node_count_gfp(mas, count, GFP_NOWAIT | __GFP_NOWARN);
}

/*
 * mas_start() - Sets up maple state for operations.
 * @mas: The maple state.
 *
 * If @mas is in the start state (mas_is_start()), set the min, max and depth
 * to defaults and load the root.  A dead root node causes the root to be
 * read again.
 *
 * Return:
 * - If @mas is not in the start state: NULL, @mas is left untouched.
 * - If it's an empty tree:     NULL & mas->status == ma_none
 * - If it's a single entry:    The entry (NULL if mas->index > 0) &
 *                              mas->status == ma_root
 * - If it's a tree:            NULL & mas->status == ma_active with mas->node
 *                              set to the safe root node.
 */
static inline struct maple_enode *mas_start(struct ma_state *mas)
{
        struct maple_enode *root;

        if (unlikely(!mas_is_start(mas)))
                return NULL;

        mas->min = 0;
        mas->max = ULONG_MAX;

retry:
        mas->depth = 0;
        root = mas_root(mas);

        /* Tree with nodes */
        if (likely(xa_is_node(root))) {
                mas->depth = 1;
                mas->status = ma_active;
                mas->node = mte_safe_root(root);
                mas->offset = 0;
                if (mte_dead_node(mas->node))
                        goto retry;

                return NULL;
        }

        /* empty tree */
        if (unlikely(!root)) {
                mas->node = NULL;
                mas->status = ma_none;
                mas->offset = MAPLE_NODE_SLOTS;
                return NULL;
        }

        /* Single entry tree: the entry is only returned for index 0. */
        mas->status = ma_root;
        mas->offset = MAPLE_NODE_SLOTS;

        return mas->index > 0 ? NULL : root;
}

/*
 * ma_data_end() - Find the end of the data in a node.
 * @node: The maple node
 * @type: The maple node type
 * @pivots: The array of pivots in the node
 * @max: The maximum value in the node
 *
 * Uses metadata to find the end of the data when possible: always for
 * maple_arange_64 nodes, and for other nodes when the last pivot is zero.
 * Returns 0 when @pivots is NULL.
 *
 * Return: The zero indexed last slot with data (may be null).
 */
static __always_inline unsigned char ma_data_end(struct maple_node *node,
                enum maple_type type, unsigned long *pivots, unsigned long max)
{
        unsigned char last_piv;

        if (!pivots)
                return 0;

        if (type != maple_arange_64) {
                last_piv = mt_pivots[type] - 1;
                /* A set last pivot means the metadata is not consulted. */
                if (unlikely(pivots[last_piv]))
                        return likely(pivots[last_piv] == max) ?
                                last_piv : mt_pivots[type];
        }

        return ma_meta_end(node, type);
}

/*
 * mas_data_end() - Find the end of the data (slot).
 * @mas: the maple state
 *
 * This method is optimized to check the metadata of a node if the node type
 * supports data end metadata.
 *
 * Return: The zero indexed last slot with data (may be null).
 */
static inline unsigned char mas_data_end(struct ma_state *mas)
{
        enum maple_type type;
        struct maple_node *node;
        unsigned char offset;
        unsigned long *pivots;

        type = mte_node_type(mas->node);
        node = mas_mn(mas);
        if (type == maple_arange_64)
                return ma_meta_end(node, type);

        pivots = ma_pivots(node, type);
        if (unlikely(ma_dead_node(node)))
                return 0;

        offset = mt_pivots[type] - 1;
        if (likely(!pivots[offset]))
                return ma_meta_end(node, type);

        if (likely(pivots[offset] == mas->max))
                return offset;

        return mt_pivots[type];
}

/*
 * mas_leaf_max_gap() - Returns the largest gap in a leaf node
 * @mas: the maple state
 *
 * A gap is a range of indices whose slot is NULL.  Dense nodes are scanned
 * slot by slot; other leaves derive gap sizes from the pivots.
 *
 * Return: The maximum gap in the leaf.
 */
static unsigned long mas_leaf_max_gap(struct ma_state *mas)
{
        enum maple_type mt;
        unsigned long pstart, gap, max_gap;
        struct maple_node *mn;
        unsigned long *pivots;
        void __rcu **slots;
        unsigned char i;
        unsigned char max_piv;

        mt = mte_node_type(mas->node);
        mn = mas_mn(mas);
        slots = ma_slots(mn, mt);
        max_gap = 0;
        if (unlikely(ma_is_dense(mt))) {
                /* Dense node: the gap is the longest run of empty slots. */
                gap = 0;
                for (i = 0; i < mt_slots[mt]; i++) {
                        if (slots[i]) {
                                if (gap > max_gap)
                                        max_gap = gap;
                                gap = 0;
                        } else {
                                gap++;
                        }
                }
                /* Account for a run that reaches the end of the node. */
                if (gap > max_gap)
                        max_gap = gap;
                return max_gap;
        }

        /*
         * Check the first implied pivot optimizes the loop below and slot 1 may
         * be skipped if there is a gap in slot 0.
         */
        pivots = ma_pivots(mn, mt);
        if (likely(!slots[0])) {
                max_gap = pivots[0] - mas->min + 1;
                i = 2;
        } else {
                i = 1;
        }

        /* reduce max_piv as the special case is checked before the loop */
        max_piv = ma_data_end(mn, mt, pivots, mas->max) - 1;
        /*
         * Check end implied pivot which can only be a gap on the right most
         * node.
         */
        if (unlikely(mas->max == ULONG_MAX) && !slots[max_piv + 1]) {
                gap = ULONG_MAX - pivots[max_piv];
                if (gap > max_gap)
                        max_gap = gap;

                /* No gap in the remaining range can be larger: stop early. */
                if (max_gap > pivots[max_piv] - mas->min)
                        return max_gap;
        }

        for (; i <= max_piv; i++) {
                /* data == no gap. */
                if (likely(slots[i]))
                        continue;

                /* The gap spans from the previous pivot to this pivot. */
                pstart = pivots[i - 1];
                gap = pivots[i] - pstart;
                if (gap > max_gap)
                        max_gap = gap;

                /* There cannot be two gaps in a row. */
                i++;
        }
        return max_gap;
}

/*
 * ma_max_gap() - Get the maximum gap in a maple node (non-leaf)
 * @node: The maple node
 * @gaps: The pointer to the gaps
 * @mt: The maple node type
 * @off: Pointer to store the offset location of the gap.
 *
 * Uses the metadata data end to scan backwards across set gaps.  If every
 * gap is zero, @off receives the data end offset.
 *
 * Return: The maximum gap value
 */
static inline unsigned long
ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt,
            unsigned char *off)
{
        unsigned long max_gap = 0;
        unsigned char best, i;

        best = ma_meta_end(node, mt);
        for (i = best; ; i--) {
                if (gaps[i] > max_gap) {
                        max_gap = gaps[i];
                        best = i;
                }

                if (!i)
                        break;
        }

        *off = best;
        return max_gap;
}

/*
 * mas_max_gap() - find the largest gap in a non-leaf node and set the slot.
 * @mas: The maple state.
 *
 * Return: The gap value.
 */
static inline unsigned long mas_max_gap(struct ma_state *mas)
{
        unsigned long *gaps;
        unsigned char offset;
        enum maple_type mt;
        struct maple_node *node;

        mt = mte_node_type(mas->node);
        if (ma_is_leaf(mt))
                return mas_leaf_max_gap(mas);

        node = mas_mn(mas);
        MAS_BUG_ON(mas, mt != maple_arange_64);
        offset = ma_meta_gap(node);
        gaps = ma_gaps(node, mt);
        return gaps[offset];
}

/*
 * mas_parent_gap() - Set the parent gap and any gaps above, as needed
 * @mas: The maple state
 * @offset: The gap offset in the parent to set
 * @new: The new gap value.
 *
 * Set the parent gap then continue to set the gap upwards, using the metadata
 * of the parent to see if it is necessary to check the node above.
 */
static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset,
                unsigned long new)
{
        unsigned long meta_gap = 0;
        struct maple_node *pnode;
        struct maple_enode *penode;
        unsigned long *pgaps;
        unsigned char meta_offset;
        enum maple_type pmt;

        pnode = mte_parent(mas->node);
        pmt = mas_parent_type(mas, mas->node);
        penode = mt_mk_node(pnode, pmt);
        pgaps = ma_gaps(pnode, pmt);

ascend:
        MAS_BUG_ON(mas, pmt != maple_arange_64);
        meta_offset = ma_meta_gap(pnode);
        meta_gap = pgaps[meta_offset];

        pgaps[offset] = new;

        if (meta_gap == new)
                return;

        if (offset != meta_offset) {
                if (meta_gap > new)
                        return;

                ma_set_meta_gap(pnode, pmt, offset);
        } else if (new < meta_gap) {
                new = ma_max_gap(pnode, pgaps, pmt, &meta_offset);
                ma_set_meta_gap(pnode, pmt, meta_offset);
        }

        if (ma_is_root(pnode))
                return;

        /* Go to the parent node. */
        pnode = mte_parent(penode);
        pmt = mas_parent_type(mas, penode);
        pgaps = ma_gaps(pnode, pmt);
        offset = mte_parent_slot(penode);
        penode = mt_mk_node(pnode, pmt);
        goto ascend;
}

/*
 * mas_update_gap() - Update a nodes gaps and propagate up if necessary.
 * @mas - the maple state.
 */
static inline void mas_update_gap(struct ma_state *mas)
{
        unsigned char pslot;
        unsigned long p_gap;
        unsigned long max_gap;

        if (!mt_is_alloc(mas->tree))
                return;

        if (mte_is_root(mas->node))
                return;

        max_gap = mas_max_gap(mas);

        pslot = mte_parent_slot(mas->node);
        p_gap = ma_gaps(mte_parent(mas->node),
                        mas_parent_type(mas, mas->node))[pslot];

        if (p_gap != max_gap)
                mas_parent_gap(mas, pslot, max_gap);
}

/*
 * mas_adopt_children() - Set the parent pointer of all nodes in @parent to
 * @parent with the slot encoded.
 * @mas: the maple state (for the tree)
 * @parent: the maple encoded node containing the children.
 *
 * Children are visited from the data end of @parent down to slot 0.
 */
static inline void mas_adopt_children(struct ma_state *mas,
                struct maple_enode *parent)
{
        enum maple_type type = mte_node_type(parent);
        struct maple_node *node = mte_to_node(parent);
        void __rcu **slots = ma_slots(node, type);
        unsigned long *pivots = ma_pivots(node, type);
        unsigned char offset = ma_data_end(node, type, pivots, mas->max);

        for (;;) {
                struct maple_enode *child;

                child = mas_slot_locked(mas, slots, offset);
                mas_set_parent(mas, child, parent, offset);
                if (!offset)
                        break;

                offset--;
        }
}

/*
 * mas_put_in_tree() - Put a new node in the tree and mark the old node as
 * dead.
 * @mas: the maple state with the new node
 * @old_enode: The old maple encoded node to replace.
 *
 * A new root is published through the tree's root pointer and the height is
 * updated; any other node is published into its parent's slot.  The new node
 * is made visible with rcu_assign_pointer() before @old_enode is marked dead.
 */
static inline void mas_put_in_tree(struct ma_state *mas,
                struct maple_enode *old_enode)
        __must_hold(mas->tree->ma_lock)
{
        struct maple_enode *new_enode = mas->node;

        if (!mte_is_root(new_enode)) {
                unsigned char offset = mte_parent_slot(new_enode);
                void __rcu **slots = ma_slots(mte_parent(new_enode),
                                              mas_parent_type(mas, new_enode));

                rcu_assign_pointer(slots[offset], new_enode);
        } else {
                mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas));
                rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(new_enode));
                mas_set_height(mas);
        }

        mte_set_node_dead(old_enode);
}

/*
 * mas_replace_node() - Replace a node by putting the new node in the tree,
 * marking the old node dead, and freeing the old node.
 * @mas: the ma_state with @mas->node pointing to the new node.
 * @old_enode: The old maple encoded node.
 *
 * The new node is placed with mas_put_in_tree(), which also marks @old_enode
 * dead; @old_enode is then released through mas_free().
 */
static inline void mas_replace_node(struct ma_state *mas,
                struct maple_enode *old_enode)
        __must_hold(mas->tree->ma_lock)
{
        mas_put_in_tree(mas, old_enode);
        mas_free(mas, old_enode);
}

/*
 * mas_find_child() - Find a child who has the parent @mas->node.
 * @mas: the maple state with the parent.
 * @child: the maple state to store the child.
 *
 * The search starts at @mas->offset.  On success @mas->offset is advanced past
 * the child that was found, and @child is a copy of @mas descended into that
 * child with its offset reset to 0.
 *
 * Return: true if a child was found, false otherwise.
 */
static inline bool mas_find_child(struct ma_state *mas, struct ma_state *child)
        __must_hold(mas->tree->ma_lock)
{
        enum maple_type mt = mte_node_type(mas->node);
        struct maple_node *node = mas_mn(mas);
        void __rcu **slots = ma_slots(node, mt);
        unsigned long *pivots = ma_pivots(node, mt);
        unsigned char end = ma_data_end(node, mt, pivots, mas->max);
        unsigned char offset;

        for (offset = mas->offset; offset <= end; offset++) {
                struct maple_enode *entry;

                entry = mas_slot_locked(mas, slots, offset);
                if (mte_parent(entry) != node)
                        continue;

                *child = *mas;
                mas->offset = offset + 1;
                child->offset = offset;
                mas_descend(child);
                child->offset = 0;
                return true;
        }

        return false;
}

/*
 * mab_shift_right() - Shift the data in mab right. Note, does not clean out the
 * old data or set b_node->b_end.
 * @b_node: the maple_big_node
 * @shift: the shift count
 *
 * Pivots and slots are always moved; gaps are only moved for maple_arange_64.
 */
static inline void mab_shift_right(struct maple_big_node *b_node,
                                 unsigned char shift)
{
        unsigned long bytes = b_node->b_end * sizeof(unsigned long);

        memmove(b_node->pivot + shift, b_node->pivot, bytes);
        memmove(b_node->slot + shift, b_node->slot, bytes);

        if (b_node->type != maple_arange_64)
                return;

        memmove(b_node->gap + shift, b_node->gap, bytes);
}

/*
 * mab_middle_node() - Check if a middle node is needed (unlikely)
 * @b_node: the maple_big_node that contains the data.
 * @split: the potential split location
 * @slot_count: the size that can be stored in a single node being considered.
 *
 * A middle node is needed when @b_node->b_end is at least twice @slot_count,
 * or is one less than that while the entry at @split is NULL.
 *
 * Return: true if a middle node is required.
 */
static inline bool mab_middle_node(struct maple_big_node *b_node, int split,
                                   unsigned char slot_count)
{
        unsigned char size = b_node->b_end;

        return (size >= 2 * slot_count) ||
               (!b_node->slot[split] && (size >= 2 * slot_count - 1));
}

/*
 * mab_no_null_split() - ensure the split doesn't fall on a NULL
 * @b_node: the maple_big_node with the data
 * @split: the suggested split location
 * @slot_count: the number of slots in the node being considered.
 *
 * A split that lands on a non-NULL entry is returned unchanged.
 *
 * Return: the split location.
 */
static inline int mab_no_null_split(struct maple_big_node *b_node,
                                    unsigned char split, unsigned char slot_count)
{
        if (b_node->slot[split])
                return split;

        /*
         * If the split is less than the max slot && the right side will
         * still be sufficient, then increment the split on NULL.
         * Otherwise move the split back by one.
         */
        if ((split < slot_count - 1) &&
            (b_node->b_end - split) > (mt_min_slots[b_node->type]))
                split++;
        else
                split--;

        return split;
}

/*
 * mab_calc_split() - Calculate the split location and if there needs to be two
 * splits.
 * @mas: The maple state; its flags select the bulk path, which also sets
 * MA_STATE_REBALANCE for leaf nodes.
 * @bn: The maple_big_node with the data
 * @mid_split: The second split, if required.  0 otherwise.
 * @min: The value subtracted from the pivot at the split to measure the range
 * covered by the left side.
 *
 * Return: The first split location.  The middle split is set in @mid_split.
 */
static inline int mab_calc_split(struct ma_state *mas,
         struct maple_big_node *bn, unsigned char *mid_split, unsigned long min)
{
        unsigned char b_end = bn->b_end;
        int split = b_end / 2; /* Assume equal split. */
        unsigned char slot_min, slot_count = mt_slots[bn->type];

        /*
         * To support gap tracking, all NULL entries are kept together and a node cannot
         * end on a NULL entry, with the exception of the left-most leaf.  The
         * limitation means that the split of a node must be checked for this condition
         * and be able to put more data in one direction or the other.
         */
        if (unlikely((mas->mas_flags & MA_STATE_BULK))) {
                /* Bulk mode: leave only the minimum slot count on the right. */
                *mid_split = 0;
                split = b_end - mt_min_slots[bn->type];

                if (!ma_is_leaf(bn->type))
                        return split;

                mas->mas_flags |= MA_STATE_REBALANCE;
                /* Do not end the left node on a NULL entry. */
                if (!bn->slot[split])
                        split--;
                return split;
        }

        /*
         * Although extremely rare, it is possible to enter what is known as the 3-way
         * split scenario.  The 3-way split comes about by means of a store of a range
         * that overwrites the end and beginning of two full nodes.  The result is a set
         * of entries that cannot be stored in 2 nodes.  Sometimes, these two nodes can
         * also be located in different parent nodes which are also full.  This can
         * carry upwards all the way to the root in the worst case.
         */
        if (unlikely(mab_middle_node(bn, split, slot_count))) {
                split = b_end / 3;
                *mid_split = split * 2;
        } else {
                slot_min = mt_min_slots[bn->type];

                *mid_split = 0;
                /*
                 * Avoid having a range less than the slot count unless it
                 * causes one node to be deficient.
                 * NOTE: mt_min_slots is 1 based, b_end and split are zero.
                 */
                while ((split < slot_count - 1) &&
                       ((bn->pivot[split] - min) < slot_count - 1) &&
                       (b_end - split > slot_min))
                        split++;
        }

        /* Avoid ending a node on a NULL entry */
        split = mab_no_null_split(bn, split, slot_count);

        if (unlikely(*mid_split))
                *mid_split = mab_no_null_split(bn, *mid_split, slot_count);

        return split;
}

/*
 * mas_mab_cp() - Copy data from a maple state inclusively to a maple_big_node
 * and set @b_node->b_end to the next free slot.
 * @mas: The maple state
 * @mas_start: The starting slot to copy
 * @mas_end: The end slot to copy (inclusively)
 * @b_node: The maple_big_node to place the data
 * @mab_start: The starting location in maple_big_node to store the data.
 *
 * Pivots are copied one by one and the copy may stop before @mas_end when a
 * zero pivot or a pivot equal to @mas->max is found.  Slots are then copied in
 * one block, as are the gaps for non-leaf nodes of an allocation tree.
 */
static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start,
                        unsigned char mas_end, struct maple_big_node *b_node,
                        unsigned char mab_start)
{
        enum maple_type mt;
        struct maple_node *node;
        void __rcu **slots;
        unsigned long *pivots, *gaps;
        int i = mas_start, j = mab_start;
        unsigned char piv_end;

        node = mas_mn(mas);
        mt = mte_node_type(mas->node);
        pivots = ma_pivots(node, mt);
        if (!i) {
                /* Pivot 0 is copied up front; done if it was the only one. */
                b_node->pivot[j] = pivots[i++];
                if (unlikely(i > mas_end))
                        goto complete;
                j++;
        }

        /* Never read past the pivots that are stored in this node type. */
        piv_end = min(mas_end, mt_pivots[mt]);
        for (; i < piv_end; i++, j++) {
                b_node->pivot[j] = pivots[i];
                /* A zero pivot ends the direct copy. */
                if (unlikely(!b_node->pivot[j]))
                        break;

                /* Reached the maximum of the node: nothing more to copy. */
                if (unlikely(mas->max == b_node->pivot[j]))
                        goto complete;
        }

        /* The final pivot is obtained through mas_safe_pivot(). */
        if (likely(i <= mas_end))
                b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt);

complete:
        /* j indexes the last pivot written; b_end is the next free slot. */
        b_node->b_end = ++j;
        /* From here on j is the number of entries copied. */
        j -= mab_start;
        slots = ma_slots(node, mt);
        memcpy(b_node->slot + mab_start, slots + mas_start, sizeof(void *) * j);
        if (!ma_is_leaf(mt) && mt_is_alloc(mas->tree)) {
                gaps = ma_gaps(node, mt);
                memcpy(b_node->gap + mab_start, gaps + mas_start,
                       sizeof(unsigned long) * j);
        }
}

/*
 * mas_leaf_set_meta() - Set the metadata of a leaf if possible.
 * @node: The maple node
 * @mt: The maple type
 * @end: The node end
 */
static inline void mas_leaf_set_meta(struct maple_node *node,
                enum maple_type mt, unsigned char end)
{
        /* Metadata is only written when @end is below the last slot. */
        if (end >= mt_slots[mt] - 1)
                return;

        ma_set_meta(node, mt, 0, end);
}

/*
 * mab_mas_cp() - Copy data from maple_big_node to a maple encoded node.
 * @b_node: the maple_big_node that has the data
 * @mab_start: the start location in @b_node.
 * @mab_end: The end location in @b_node (inclusively)
 * @mas: The maple state with the maple encoded node.
 * @new_max: If true, set @mas->max to the last pivot copied.
 */
static inline void mab_mas_cp(struct maple_big_node *b_node,
                              unsigned char mab_start, unsigned char mab_end,
                              struct ma_state *mas, bool new_max)
{
        enum maple_type mt = mte_node_type(mas->node);
        struct maple_node *node = mte_to_node(mas->node);
        void __rcu **slots = ma_slots(node, mt);
        unsigned long *pivots = ma_pivots(node, mt);
        unsigned long *gaps;
        unsigned long max_gap;
        unsigned char offset;
        unsigned char end;
        int src = mab_start;    /* cursor in the big node */
        int dst = 0;            /* cursor in the destination node */

        if (mab_end - mab_start > mt_pivots[mt])
                mab_end--;

        /* Checked before the destination pivots are overwritten below. */
        if (!pivots[mt_pivots[mt] - 1])
                slots[mt_pivots[mt]] = NULL;

        /* Copy pivots up to @mab_end or until the next pivot is zero. */
        for (;;) {
                pivots[dst] = b_node->pivot[src];
                dst++;
                src++;
                if (src > mab_end)
                        break;
                if (unlikely(!b_node->pivot[src]))
                        break;
        }

        memcpy(slots, b_node->slot + mab_start,
               sizeof(void *) * (src - mab_start));

        if (new_max)
                mas->max = b_node->pivot[src - 1];

        end = dst - 1;
        if (unlikely(ma_is_leaf(mt) || !mt_is_alloc(mas->tree))) {
                mas_leaf_set_meta(node, mt, end);
                return;
        }

        /*
         * Copy the gaps from the last entry back to the first and record the
         * offset of the largest one.
         */
        gaps = ma_gaps(node, mt);
        max_gap = 0;
        offset = 0;
        while (dst) {
                dst--;
                src--;
                gaps[dst] = b_node->gap[src];
                if (gaps[dst] > max_gap) {
                        offset = dst;
                        max_gap = gaps[dst];
                }
        }

        ma_set_meta(node, mt, offset, end);
}

/*
 * mas_bulk_rebalance() - Rebalance the end of a tree after a bulk insert.
 * @mas: The maple state
 * @end: The maple node end
 * @mt: The maple node type
 */
static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end,
                                      enum maple_type mt)
{
        /* Only applies to a non-root node while in bulk mode. */
        if (!(mas->mas_flags & MA_STATE_BULK) || mte_is_root(mas->node))
                return;

        /* More than the minimum number of slots: drop the rebalance flag. */
        if (end > mt_min_slots[mt])
                mas->mas_flags &= ~MA_STATE_REBALANCE;
}

/*
 * mas_store_b_node() - Store an @entry into the b_node while also copying the
 * data from a maple encoded node.
 * @wr_mas: the maple write state
 * @b_node: the maple_big_node to fill with data
 * @offset_end: the offset to end copying
 *
 * The offset of the last entry stored in @b_node is written to
 * @b_node->b_end; nothing is returned.
 */
static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas,
                struct maple_big_node *b_node, unsigned char offset_end)
{
        struct ma_state *mas = wr_mas->mas;
        unsigned char wr_offset;
        unsigned char next;
        unsigned char bn_end = 0;
        /* Possible underflow of piv will wrap back to 0 before use. */
        unsigned long piv;

        b_node->type = wr_mas->type;
        wr_offset = mas->offset;
        if (!wr_offset) {
                piv = mas->min - 1;
        } else {
                /* Copy the entries that precede the write offset. */
                mas_mab_cp(mas, 0, wr_offset - 1, b_node, 0);
                bn_end = b_node->b_end;
                piv = b_node->pivot[bn_end - 1];
        }

        /* The new range starts after the old range does: keep the head. */
        if (piv + 1 < mas->index) {
                b_node->slot[bn_end] = wr_mas->content;
                if (!wr_mas->content)
                        b_node->gap[bn_end] = mas->index - 1 - piv;
                b_node->pivot[bn_end] = mas->index - 1;
                bn_end++;
        }

        /* Store the new entry and record its offset in the big node. */
        mas->offset = bn_end;
        b_node->slot[bn_end] = wr_mas->entry;
        b_node->pivot[bn_end] = mas->last;

        /* The write reaches the node maximum, so nothing follows it. */
        if (mas->last >= mas->max)
                goto done;

        /* The new range ends before the old range does: keep the tail. */
        piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type);
        if (piv > mas->last) {
                if (piv == ULONG_MAX)
                        mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type);

                if (offset_end != wr_offset)
                        wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
                                                          offset_end);

                bn_end++;
                b_node->slot[bn_end] = wr_mas->content;
                if (!wr_mas->content)
                        b_node->gap[bn_end] = piv - mas->last + 1;
                b_node->pivot[bn_end] = piv;
        }

        next = offset_end + 1;
        if (next > mas->end)
                goto done;

        /*
         * Copy the rest of the node.  mas_mab_cp() leaves b_end one past the
         * last entry, so step it back to the last entry.
         */
        bn_end++;
        mas_mab_cp(mas, next, mas->end + 1, b_node, bn_end);
        b_node->b_end--;
        return;

done:
        b_node->b_end = bn_end;
}

/*
 * mas_prev_sibling() - Find the previous node with the same parent.
 * @mas: the maple state
 *
 * Return: True if there is a previous sibling, false otherwise.
 */
static inline bool mas_prev_sibling(struct ma_state *mas)
{
        unsigned int parent_slot = mte_parent_slot(mas->node);

        /* No previous sibling for the root or for the node in slot 0. */
        if (mte_is_root(mas->node) || !parent_slot)
                return false;

        /* Go up, step one slot to the left, and come back down. */
        mas_ascend(mas);
        mas->offset = parent_slot - 1;
        mas_descend(mas);
        return true;
}

/*
 * mas_next_sibling() - Find the next node with the same parent.
 * @mas: the maple state
 *
 * Return: true if there is a next sibling, false otherwise.
 */
static inline bool mas_next_sibling(struct ma_state *mas)
{
        MA_STATE(parent, mas->tree, mas->index, mas->last);

        if (mte_is_root(mas->node))
                return false;

        parent = *mas;
        mas_ascend(&parent);
        parent.offset = mte_parent_slot(mas->node) + 1;
        if (parent.offset > mas_data_end(&parent))
                return false;

        *mas = parent;
        mas_descend(mas);
        return true;
}

/*
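 * mas_node_or_none() - Set the enode and state.
 * @mas: The maple state.
 * @enode: The encoded maple node.
 *
 * Set the node to @enode and the status to ma_active, or clear the node and
 * set the status to ma_none if @enode is NULL.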
 */
static inline void mas_node_or_none(struct ma_state *mas,
                struct maple_enode *enode)
{
        /* A NULL @enode leaves the state with no node and status ma_none. */
        mas->node = enode;
        mas->status = enode ? ma_active : ma_none;
}

/*
 * mas_wr_node_walk() - Find the correct offset for the index in the @mas.
 * @wr_mas: The maple write state
 *
 * Uses mas_slot_locked() and does not need to worry about dead nodes.
 */
static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        unsigned char end, pos;

        if (unlikely(ma_is_dense(wr_mas->type))) {
                /* The range is read from the index before it is reset. */
                wr_mas->r_min = mas->index;
                wr_mas->r_max = wr_mas->r_min;
                mas->index = mas->min;
                mas->offset = mas->index;
                return;
        }

        wr_mas->node = mas_mn(mas);
        wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type);
        mas->end = ma_data_end(wr_mas->node, wr_mas->type, wr_mas->pivots,
                               mas->max);
        end = mas->end;

        /* Advance to the first pivot that is not below the index. */
        for (pos = mas->offset; pos < end; pos++) {
                if (mas->index <= wr_mas->pivots[pos])
                        break;
        }

        /* The last slot has no pivot of its own; its limit is the node max. */
        if (pos < end)
                wr_mas->r_max = wr_mas->pivots[pos];
        else
                wr_mas->r_max = mas->max;

        wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, pos);
        mas->offset = pos;
        wr_mas->offset_end = pos;
}

/*
 * mast_rebalance_next() - Rebalance against the next node
 * @mast: The maple subtree state
 *
 * Appends the data from @mast->orig_r (the next node) to @mast->bn.
 */
static inline void mast_rebalance_next(struct maple_subtree_state *mast)
{
        unsigned char b_end = mast->bn->b_end;

        mas_mab_cp(mast->orig_r, 0, mt_slot_count(mast->orig_r->node),
                   mast->bn, b_end);
        mast->orig_r->last = mast->orig_r->max;
}

/*
 * mast_rebalance_prev() - Rebalance against the previous node
 * @mast: The maple subtree state
 *
 * Calls mab_shift_right() on @mast->bn, then copies the data from
 * @mast->orig_l (the previous node) to the start of @mast->bn.
 */
static inline void mast_rebalance_prev(struct maple_subtree_state *mast)
{
        unsigned char end = mas_data_end(mast->orig_l) + 1;
        unsigned char b_end = mast->bn->b_end;

        mab_shift_right(mast->bn, end);
        mas_mab_cp(mast->orig_l, 0, end - 1, mast->bn, 0);
        mast->l->min = mast->orig_l->min;
        mast->orig_l->index = mast->orig_l->min;
        mast->bn->b_end = end + b_end;
        mast->l->offset += end;
}

/*
 * mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring
 * the node to the right.  Checking the nodes to the right then the left at each
 * level upwards until root is reached.
 * Data is copied into the @mast->bn.
 * @mast: The maple_subtree_state.
 */
static inline
bool mast_spanning_rebalance(struct maple_subtree_state *mast)
{
        struct ma_state r_tmp = *mast->orig_r;
        struct ma_state l_tmp = *mast->orig_l;
        unsigned char depth = 0;

        do {
                mas_ascend(mast->orig_r);
                mas_ascend(mast->orig_l);
                depth++;
                if (mast->orig_r->offset < mas_data_end(mast->orig_r)) {
                        mast->orig_r->offset++;
                        do {
                                mas_descend(mast->orig_r);
                                mast->orig_r->offset = 0;
                        } while (--depth);

                        mast_rebalance_next(mast);
                        *mast->orig_l = l_tmp;
                        return true;
                } else if (mast->orig_l->offset != 0) {
                        mast->orig_l->offset--;
                        do {
                                mas_descend(mast->orig_l);
                                mast->orig_l->offset =
                                        mas_data_end(mast->orig_l);
                        } while (--depth);

                        mast_rebalance_prev(mast);
                        *mast->orig_r = r_tmp;
                        return true;
                }
        } while (!mte_is_root(mast->orig_r->node));

        *mast->orig_r = r_tmp;
        *mast->orig_l = l_tmp;
        return false;
}

/*
 * mast_ascend() - Ascend the original left and right maple states.
 * @mast: the maple subtree state.
 *
 * Ascend the original left and right sides.  Set the offsets to point to the
 * data already in the new tree (@mast->l and @mast->r).
 */
static inline void mast_ascend(struct maple_subtree_state *mast)
{
        MA_WR_STATE(wr_mas, mast->orig_r,  NULL);
        mas_ascend(mast->orig_l);
        mas_ascend(mast->orig_r);

        mast->orig_r->offset = 0;
        mast->orig_r->index = mast->r->max;
        /* last should be larger than or equal to index */
        if (mast->orig_r->last < mast->orig_r->index)
                mast->orig_r->last = mast->orig_r->index;

        wr_mas.type = mte_node_type(mast->orig_r->node);
        mas_wr_node_walk(&wr_mas);
        /* Set up the left side of things */
        mast->orig_l->offset = 0;
        mast->orig_l->index = mast->l->min;
        wr_mas.mas = mast->orig_l;
        wr_mas.type = mte_node_type(mast->orig_l->node);
        mas_wr_node_walk(&wr_mas);

        mast->bn->type = wr_mas.type;
}

/*
 * mas_new_ma_node() - Create and return a new maple node.  Helper function.
 * @mas: the maple state with the allocations.
 * @b_node: the maple_big_node with the type encoding.
 *
 * Use the node type from the maple_big_node to allocate a new node from the
 * ma_state.  This function exists mainly for code readability.
 *
 * Return: A new maple encoded node
 */
static inline struct maple_enode
*mas_new_ma_node(struct ma_state *mas, struct maple_big_node *b_node)
{
        struct maple_enode *enode;

        /* Pop a node from @mas and encode it with the big node's type. */
        enode = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), b_node->type);

        return enode;
}

/*
 * mas_mab_to_node() - Set up right and middle nodes
 *
 * @mas: the maple state that contains the allocations.
 * @b_node: the node which contains the data.
 * @left: The pointer which will have the left node
 * @right: The pointer which may have the right node
 * @middle: the pointer which may have the middle node (rare)
 * @mid_split: the split location for the middle node
 * @min: the minimum, passed on to mab_calc_split()
 *
 * Return: the split of left.
 */
static inline unsigned char mas_mab_to_node(struct ma_state *mas,
        struct maple_big_node *b_node, struct maple_enode **left,
        struct maple_enode **right, struct maple_enode **middle,
        unsigned char *mid_split, unsigned long min)
{
        unsigned char slot_count = mt_slots[b_node->type];
        unsigned char split;

        /* A left node is always needed; right and middle are optional. */
        *left = mas_new_ma_node(mas, b_node);
        *right = NULL;
        *middle = NULL;
        *mid_split = 0;

        /* The data ends below the slot count: only the left node is used. */
        if (b_node->b_end < slot_count)
                return b_node->b_end;

        split = mab_calc_split(mas, b_node, mid_split, min);
        *right = mas_new_ma_node(mas, b_node);

        /* A non-zero mid_split means a middle node is needed as well. */
        if (*mid_split)
                *middle = mas_new_ma_node(mas, b_node);

        return split;
}

/*
 * mab_set_b_end() - Add entry to b_node at b_node->b_end and increment the end
 * pointer.
 * @b_node - the big node to add the entry
 * @mas - the maple state to get the pivot (mas->max)
 * @entry - the entry to add, if NULL nothing happens.
 */
static inline void mab_set_b_end(struct maple_big_node *b_node,
                                 struct ma_state *mas,
                                 void *entry)
{
        unsigned char end;

        if (!entry)
                return;

        /* Fill the slot at b_end, then move b_end past it. */
        end = b_node->b_end;
        b_node->slot[end] = entry;
        if (mt_is_alloc(mas->tree))
                b_node->gap[end] = mas_max_gap(mas);
        b_node->pivot[end] = mas->max;
        b_node->b_end = end + 1;
}

/*
 * mas_set_split_parent() - combine_then_separate helper function.  Sets the parent
 * of @mas->node to either @left or @right, depending on @slot and @split
 *
 * @mas - the maple state with the node that needs a parent
 * @left - possible parent 1
 * @right - possible parent 2
 * @slot - the slot the mas->node was placed
 * @split - the split location between @left and @right
 */
static inline void mas_set_split_parent(struct ma_state *mas,
                                        struct maple_enode *left,
                                        struct maple_enode *right,
                                        unsigned char *slot, unsigned char split)
{
        unsigned char pos;

        if (mas_is_none(mas))
                return;

        pos = *slot;
        if (pos <= split)
                mas_set_parent(mas, mas->node, left, pos);
        else if (right)
                /* Slots past the split are renumbered from 0 in @right. */
                mas_set_parent(mas, mas->node, right, pos - split - 1);

        /* Advance the caller's slot for the next node. */
        *slot = pos + 1;
}

/*
 * mte_mid_split_check() - Check if the next node passes the mid-split
 * @l: Pointer to left encoded maple node.
 * @r: Pointer to the encoded maple node currently right of @l.
 * @right: The right encoded maple node.
 * @slot: The offset
 * @split: Pointer to the split location.
 * @mid_split: The middle split.
 */
static inline void mte_mid_split_check(struct maple_enode **l,
                                       struct maple_enode **r,
                                       struct maple_enode *right,
                                       unsigned char slot,
                                       unsigned char *split,
                                       unsigned char mid_split)
{
        /* Nothing to do if @r is already @right or @slot is below mid_split. */
        if (*r == right || slot < mid_split)
                return;

        /* Shift the pair over by one: (*l, *r) becomes (*r, right). */
        *l = *r;
        *r = right;
        *split = mid_split;
}

/*
 * mast_set_split_parents() - Helper function to set three nodes parents.  Slot
 * is taken from @mast->l.
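 * @mast - the maple subtree state
 * @left - the left node
 * @middle - the middle node, may be NULL
 * @right - the right node
 * @split - the split location.
 * @mid_split - the split location between @middle and @right.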
 */
static inline void mast_set_split_parents(struct maple_subtree_state *mast,
                                          struct maple_enode *left,
                                          struct maple_enode *middle,
                                          struct maple_enode *right,
                                          unsigned char split,
                                          unsigned char mid_split)
{
        struct ma_state *children[3];
        struct maple_enode *l = left;
        /* Start with the middle node as the right-hand candidate, if any. */
        struct maple_enode *r = middle ? middle : right;
        unsigned char slot;
        int i;

        if (mas_is_none(mast->l))
                return;

        children[0] = mast->l;
        children[1] = mast->m;
        children[2] = mast->r;
        slot = mast->l->offset;

        /* Each child takes the next slot; mas_set_split_parent() advances it. */
        for (i = 0; i < 3; i++) {
                mte_mid_split_check(&l, &r, right, slot, &split, mid_split);
                mas_set_split_parent(children[i], l, r, &slot, split);
        }
}

/*
 * mas_topiary_node() - Dispose of a single node
 * @mas: The maple state for pushing nodes
 * @tmp_mas: The maple state whose node is disposed of; ignored if none
 * @in_rcu: If the tree is in rcu mode
 *
 * The node will either be RCU freed or pushed back on the maple state.
 */
static inline void mas_topiary_node(struct ma_state *mas,
                struct ma_state *tmp_mas, bool in_rcu)
{
        struct maple_enode *dead;
        struct maple_node *node;

        if (mas_is_none(tmp_mas))
                return;

        dead = tmp_mas->node;
        node = mte_to_node(dead);
        mte_set_node_dead(dead);

        /* Outside of RCU mode the node is pushed back onto @mas. */
        if (!in_rcu) {
                mas_push_node(mas, node);
                return;
        }

        ma_free_rcu(node);
}

/*
 * mas_topiary_replace() - Replace the data with new data, then repair the
 * parent links within the new tree.  Iterate over the dead sub-tree and collect
 * the dead subtrees and topiary the nodes that are no longer of use.
 *
 * The new tree will have up to three children with the correct parent.  Keep
 * track of the new entries as they need to be followed to find the next level
 * of new entries.
 *
 * The old tree will have up to three children with the old parent.  Keep track
 * of the old entries as they may have more nodes below replaced.  Nodes within
 * [index, last] are dead subtrees, others need to be freed and followed.
 *
 * @mas: The maple state pointing at the new data
 * @old_enode: The maple encoded node being replaced
 *
 */
static inline void mas_topiary_replace(struct ma_state *mas,
                struct maple_enode *old_enode)
{
        /*
         * tmp[] holds the (up to three) nodes handled at the current level;
         * tmp_next[] collects their children for the next level.
         */
        struct ma_state tmp[3], tmp_next[3];
        MA_TOPIARY(subtrees, mas->tree);
        bool in_rcu;
        int i, n;

        /* Place data in tree & then mark node as old */
        mas_put_in_tree(mas, old_enode);

        /* Update the parent pointers in the tree */
        tmp[0] = *mas;
        tmp[0].offset = 0;
        tmp[1].status = ma_none;
        tmp[2].status = ma_none;
        while (!mte_is_leaf(tmp[0].node)) {
                n = 0;
                for (i = 0; i < 3; i++) {
                        if (mas_is_none(&tmp[i]))
                                continue;

                        /* Gather children until none is found or three are held. */
                        while (n < 3) {
                                if (!mas_find_child(&tmp[i], &tmp_next[n]))
                                        break;
                                n++;
                        }

                        mas_adopt_children(&tmp[i], tmp[i].node);
                }

                if (MAS_WARN_ON(mas, n == 0))
                        break;

                /* Mark the unused entries so the next pass skips them. */
                while (n < 3)
                        tmp_next[n++].status = ma_none;

                for (i = 0; i < 3; i++)
                        tmp[i] = tmp_next[i];
        }

        /* Collect the old nodes that need to be discarded */
        if (mte_is_leaf(old_enode)) {
                /*
                 * ISO C does not allow "return expr;" in a void function, so
                 * the call and the return are separate statements.
                 */
                mas_free(mas, old_enode);
                return;
        }

        tmp[0] = *mas;
        tmp[0].offset = 0;
        tmp[0].node = old_enode;
        tmp[1].status = ma_none;
        tmp[2].status = ma_none;
        in_rcu = mt_in_rcu(mas->tree);
        do {
                n = 0;
                for (i = 0; i < 3; i++) {
                        if (mas_is_none(&tmp[i]))
                                continue;

                        while (n < 3) {
                                if (!mas_find_child(&tmp[i], &tmp_next[n]))
                                        break;

                                /*
                                 * A child entirely within [index, last] is
                                 * added to the subtrees list and its entry in
                                 * tmp_next[] is reused; any other child is
                                 * kept for the next level.
                                 */
                                if ((tmp_next[n].min >= tmp_next->index) &&
                                    (tmp_next[n].max <= tmp_next->last)) {
                                        mat_add(&subtrees, tmp_next[n].node);
                                        tmp_next[n].status = ma_none;
                                } else {
                                        n++;
                                }
                        }
                }

                if (MAS_WARN_ON(mas, n == 0))
                        break;

                while (n < 3)
                        tmp_next[n++].status = ma_none;

                /* Dispose of this level's nodes, then move down a level. */
                for (i = 0; i < 3; i++) {
                        mas_topiary_node(mas, &tmp[i], in_rcu);
                        tmp[i] = tmp_next[i];
                }
        } while (!mte_is_leaf(tmp[0].node));

        /* Dispose of the nodes left in tmp[] after the loop. */
        for (i = 0; i < 3; i++)
                mas_topiary_node(mas, &tmp[i], in_rcu);

        mas_mat_destroy(mas, &subtrees);
}

/*
 * mas_wmb_replace() - Write memory barrier and replace
 * @mas: The maple state
 * @old: The old maple encoded node that is being replaced.
 *
 * Updates gap as necessary.
 */
static inline void mas_wmb_replace(struct ma_state *mas,
                struct maple_enode *old_enode)
{
        /* Insert the new data in the tree */
        mas_topiary_replace(mas, old_enode);

        /* The gap update is skipped when the node is a leaf. */
        if (!mte_is_leaf(mas->node))
                mas_update_gap(mas);
}

/*
 * mast_cp_to_nodes() - Copy data out to nodes.
 * @mast: The maple subtree state
 * @left: The left encoded maple node
 * @middle: The middle encoded maple node
 * @right: The right encoded maple node
 * @split: The location to split between left and (middle ? middle : right)
 * @mid_split: The location to split between middle and right.
 */
static inline void mast_cp_to_nodes(struct maple_subtree_state *mast,
        struct maple_enode *left, struct maple_enode *middle,
        struct maple_enode *right, unsigned char split, unsigned char mid_split)
{
        bool new_lmax = true;

        mas_node_or_none(mast->l, left);
        mas_node_or_none(mast->m, middle);
        mas_node_or_none(mast->r, right);

        mast->l->min = mast->orig_l->min;
        if (split == mast->bn->b_end) {
                mast->l->max = mast->orig_r->max;
                new_lmax = false;
        }

        mab_mas_cp(mast->bn, 0, split, mast->l, new_lmax);

        if (middle) {
                mab_mas_cp(mast->bn, 1 + split, mid_split, mast->m, true);
                mast->m->min = mast->bn->pivot[split] + 1;
                split = mid_split;
        }

        mast->r->max = mast->orig_r->max;
        if (right) {
                mab_mas_cp(mast->bn, 1 + split, mast->bn->b_end, mast->r, false);
                mast->r->min = mast->bn->pivot[split] + 1;
        }
}

/*
 * mast_combine_cp_left - Copy in the original left side of the tree into the
 * combined data set in the maple subtree state big node.
 * @mast: The maple subtree state
 */
static inline void mast_combine_cp_left(struct maple_subtree_state *mast)
{
        unsigned char l_slot = mast->orig_l->offset;

        if (!l_slot)
                return;

        mas_mab_cp(mast->orig_l, 0, l_slot - 1, mast->bn, 0);
}

/*
 * mast_combine_cp_right: Copy in the original right side of the tree into the
 * combined data set in the maple subtree state big node.
 * @mast: The maple subtree state
 */
static inline void mast_combine_cp_right(struct maple_subtree_state *mast)
{
        if (mast->bn->pivot[mast->bn->b_end - 1] >= mast->orig_r->max)
                return;

        mas_mab_cp(mast->orig_r, mast->orig_r->offset + 1,
                   mt_slot_count(mast->orig_r->node), mast->bn,
                   mast->bn->b_end);
        mast->orig_r->last = mast->orig_r->max;
}

/*
 * mast_sufficient: Check if the maple subtree state has enough data in the big
 * node to create at least one sufficient node
 * @mast: the maple subtree state
 */
static inline bool mast_sufficient(struct maple_subtree_state *mast)
{
        /* Sufficient means strictly more than the minimum slot count. */
        return mast->bn->b_end > mt_min_slot_count(mast->orig_l->node);
}

/*
 * mast_overflow: Check if there is too much data in the subtree state for a
 * single node.
 * @mast: The maple subtree state
 */
static inline bool mast_overflow(struct maple_subtree_state *mast)
{
        /* Overflow means b_end has reached the slot count of the node type. */
        return mast->bn->b_end >= mt_slot_count(mast->orig_l->node);
}

/*
 * mtree_range_walk() - Walk down from @mas->node to the leaf slot that
 * contains @mas->index.
 * @mas: The maple state
 *
 * On success @mas->index and @mas->last are set to the range of the slot
 * found, @mas->min and @mas->max to the range of the leaf, and @mas->node,
 * @mas->offset and @mas->end to the leaf and the position within it.
 *
 * Return: The content of the slot, or NULL after resetting @mas if a dead
 * node was encountered.
 */
static inline void *mtree_range_walk(struct ma_state *mas)
{
        struct maple_enode *child = mas->node;
        struct maple_enode *parent;
        struct maple_node *node;
        enum maple_type type;
        unsigned long *pivots;
        void __rcu **slots;
        unsigned char offset, end;
        unsigned long lo = mas->min;
        unsigned long hi = mas->max;
        unsigned long node_lo, node_hi;

        do {
                parent = child;
                node = mte_to_node(child);
                type = mte_node_type(child);
                pivots = ma_pivots(node, type);
                end = ma_data_end(node, type, pivots, hi);
                /* Remember the limits of the node before narrowing them. */
                node_lo = lo;
                node_hi = hi;

                if (pivots[0] >= mas->index) {
                        /* First slot: the lower limit is unchanged. */
                        offset = 0;
                        hi = pivots[0];
                } else {
                        /*
                         * Find the first pivot at or above the index.  If
                         * none is found, offset ends at @end and the upper
                         * limit is unchanged.
                         */
                        for (offset = 1; offset < end; offset++) {
                                if (pivots[offset] >= mas->index) {
                                        hi = pivots[offset];
                                        break;
                                }
                        }

                        lo = pivots[offset - 1] + 1;
                }

                slots = ma_slots(node, type);
                child = mt_slot(mas->tree, slots, offset);
                /* Checked after the slot has been read. */
                if (unlikely(ma_dead_node(node))) {
                        mas_reset(mas);
                        return NULL;
                }
        } while (!ma_is_leaf(type));

        mas->end = end;
        mas->offset = offset;
        mas->index = lo;
        mas->last = hi;
        mas->min = node_lo;
        mas->max = node_hi;
        mas->node = parent;
        return (void *)child;
}

/*
 * mas_spanning_rebalance() - Rebalance across two nodes which may not be peers.
 * @mas: The starting maple state
 * @mast: The maple_subtree_state, keeps track of 4 maple states.
 * @count: The estimated count of iterations needed.
 *
 * Follow the tree upwards from @l_mas and @r_mas for @count, or until the root
 * is hit.  First @b_node is split into two entries which are inserted into the
 * next iteration of the loop.  @b_node is returned populated with the final
 * iteration. @mas is used to obtain allocations.  orig_l_mas keeps track of the
 * nodes that will remain active by using orig_l_mas->index and orig_l_mas->last
 * to account for what has been copied into the new sub-tree.  The update of
 * orig_l_mas->last is used in mas_consume to find the slots that will need to
 * be either freed or destroyed.  orig_l_mas->depth keeps track of the height of
 * the new sub-tree in case the sub-tree becomes the full tree.
 *
 * Return: the number of elements in b_node during the last loop.
 */
static int mas_spanning_rebalance(struct ma_state *mas,
                struct maple_subtree_state *mast, unsigned char count)
{
        unsigned char split, mid_split;
        unsigned char slot = 0;
        struct maple_enode *left = NULL, *middle = NULL, *right = NULL;
        struct maple_enode *old_enode;

        MA_STATE(l_mas, mas->tree, mas->index, mas->index);
        MA_STATE(r_mas, mas->tree, mas->index, mas->last);
        MA_STATE(m_mas, mas->tree, mas->index, mas->index);

        /*
         * The tree needs to be rebalanced and leaves need to be kept at the same level.
         * Rebalancing is done by use of the ``struct maple_topiary``.
         */
        mast->l = &l_mas;
        mast->m = &m_mas;
        mast->r = &r_mas;
        l_mas.status = r_mas.status = m_mas.status = ma_none;

        /* Check if this is not root and has sufficient data.  */
        if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) &&
            unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type]))
                mast_spanning_rebalance(mast);

        l_mas.depth = 0;

        /*
         * Each level of the tree is examined and balanced, pushing data to the left or
         * right, or rebalancing against left or right nodes is employed to avoid
         * rippling up the tree to limit the amount of churn.  Once a new sub-section of
         * the tree is created, there may be a mix of new and old nodes.  The old nodes
         * will have the incorrect parent pointers and currently be in two trees: the
         * original tree and the partially new tree.  To remedy the parent pointers in
         * the old tree, the new data is swapped into the active tree and a walk down
         * the tree is performed and the parent pointers are updated.
         * See mas_topiary_replace() for more information.
         */
        while (count--) {
                mast->bn->b_end--;
                mast->bn->type = mte_node_type(mast->orig_l->node);
                split = mas_mab_to_node(mas, mast->bn, &left, &right, &middle,
                                        &mid_split, mast->orig_l->min);
                mast_set_split_parents(mast, left, middle, right, split,
                                       mid_split);
                mast_cp_to_nodes(mast, left, middle, right, split, mid_split);

                /*
                 * Copy data from next level in the tree to mast->bn from next
                 * iteration
                 */
                memset(mast->bn, 0, sizeof(struct maple_big_node));
                mast->bn->type = mte_node_type(left);
                l_mas.depth++;

                /* Root already stored in l->node. */
                if (mas_is_root_limits(mast->l))
                        goto new_root;

                mast_ascend(mast);
                mast_combine_cp_left(mast);
                l_mas.offset = mast->bn->b_end;
                mab_set_b_end(mast->bn, &l_mas, left);
                mab_set_b_end(mast->bn, &m_mas, middle);
                mab_set_b_end(mast->bn, &r_mas, right);

                /* Copy anything necessary out of the right node. */
                mast_combine_cp_right(mast);
                mast->orig_l->last = mast->orig_l->max;

                if (mast_sufficient(mast))
                        continue;

                if (mast_overflow(mast))
                        continue;

                /* May be a new root stored in mast->bn */
                if (mas_is_root_limits(mast->orig_l))
                        break;

                mast_spanning_rebalance(mast);

                /* rebalancing from other nodes may require another loop. */
                if (!count)
                        count++;
        }

        l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)),
                                mte_node_type(mast->orig_l->node));
        l_mas.depth++;
        mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true);
        mas_set_parent(mas, left, l_mas.node, slot);
        if (middle)
                mas_set_parent(mas, middle, l_mas.node, ++slot);

        if (right)
                mas_set_parent(mas, right, l_mas.node, ++slot);

        if (mas_is_root_limits(mast->l)) {
new_root:
                mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas));
                while (!mte_is_root(mast->orig_l->node))
                        mast_ascend(mast);
        } else {
                mas_mn(&l_mas)->parent = mas_mn(mast->orig_l)->parent;
        }

        old_enode = mast->orig_l->node;
        mas->depth = l_mas.depth;
        mas->node = l_mas.node;
        mas->min = l_mas.min;
        mas->max = l_mas.max;
        mas->offset = l_mas.offset;
        mas_wmb_replace(mas, old_enode);
        mtree_range_walk(mas);
        return mast->bn->b_end;
}

/*
 * mas_rebalance() - Rebalance a given node.
 * @mas: The maple state
 * @b_node: The big maple node.
 *
 * Rebalance two nodes into a single node or two new nodes that are sufficient.
 * Continue upwards until tree is sufficient.
 *
 * Return: the number of elements in b_node during the last loop.
 */
static inline int mas_rebalance(struct ma_state *mas,
                                struct maple_big_node *b_node)
{
        char empty_count = mas_mt_height(mas);
        struct maple_subtree_state mast;
        unsigned char shift, b_end = ++b_node->b_end;

        MA_STATE(l_mas, mas->tree, mas->index, mas->last);
        MA_STATE(r_mas, mas->tree, mas->index, mas->last);

        trace_ma_op(__func__, mas);

        /*
         * Rebalancing occurs if a node is insufficient.  Data is rebalanced
         * against the node to the right if it exists, otherwise the node to the
         * left of this node is rebalanced against this node.  If rebalancing
         * causes just one node to be produced instead of two, then the parent
         * is also examined and rebalanced if it is insufficient.  Every level
         * tries to combine the data in the same way.  If one node contains the
         * entire range of the tree, then that node is used as a new root node.
         */
        mas_node_count(mas, empty_count * 2 - 1);
        if (mas_is_err(mas))
                return 0;

        mast.orig_l = &l_mas;
        mast.orig_r = &r_mas;
        mast.bn = b_node;
        mast.bn->type = mte_node_type(mas->node);

        l_mas = r_mas = *mas;

        if (mas_next_sibling(&r_mas)) {
                mas_mab_cp(&r_mas, 0, mt_slot_count(r_mas.node), b_node, b_end);
                r_mas.last = r_mas.index = r_mas.max;
        } else {
                mas_prev_sibling(&l_mas);
                shift = mas_data_end(&l_mas) + 1;
                mab_shift_right(b_node, shift);
                mas->offset += shift;
                mas_mab_cp(&l_mas, 0, shift - 1, b_node, 0);
                b_node->b_end = shift + b_end;
                l_mas.index = l_mas.last = l_mas.min;
        }

        return mas_spanning_rebalance(mas, &mast, empty_count);
}

/*
 * mas_destroy_rebalance() - Rebalance left-most node while destroying the maple
 * state.
 * @mas: The maple state
 * @end: The end of the left-most node.
 *
 * During a mass-insert event (such as forking), it may be necessary to
 * rebalance the left-most node when it is not sufficient.
 *
 * Entries are moved from the previous (left) sibling of @mas->node into
 * @mas->node.  When the tree is in RCU mode, new copies of both nodes and of
 * their parent are built and swapped in; otherwise the nodes are modified in
 * place.  On allocation failure in RCU mode the function returns with @mas in
 * an error state and the tree untouched.
 *
 * NOTE(review): the code pulls data from the previous sibling, so @mas->node
 * is the right-hand node of the pair; confirm the "left-most" wording above
 * against the caller.
 */
static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end)
{
        enum maple_type mt = mte_node_type(mas->node);
        struct maple_node reuse, *newnode, *parent, *new_left, *left, *node;
        struct maple_enode *eparent, *old_eparent;
        unsigned char offset, tmp, split = mt_slots[mt] / 2;
        void __rcu **l_slots, **slots;
        unsigned long *l_pivs, *pivs, gap;
        bool in_rcu = mt_in_rcu(mas->tree);

        MA_STATE(l_mas, mas->tree, mas->index, mas->last);

        /* l_mas describes the previous sibling of mas. */
        /* NOTE(review): the result of mas_prev_sibling() is not checked. */
        l_mas = *mas;
        mas_prev_sibling(&l_mas);

        /* set up node. */
        if (in_rcu) {
                /* Allocate for both left and right as well as parent. */
                mas_node_count(mas, 3);
                if (mas_is_err(mas))
                        return;

                newnode = mas_pop_node(mas);
        } else {
                /* Not in RCU mode: stage the data in an on-stack scratch node. */
                newnode = &reuse;
        }

        node = mas_mn(mas);
        newnode->parent = node->parent;
        slots = ma_slots(newnode, mt);
        pivs = ma_pivots(newnode, mt);
        left = mas_mn(&l_mas);
        l_slots = ma_slots(left, mt);
        l_pivs = ma_pivots(left, mt);
        /* Move the split point right by one if the slot at split is empty. */
        if (!l_slots[split])
                split++;
        /* Number of entries after the split point in the left node. */
        tmp = mas_data_end(&l_mas) - split;

        /* newnode = tail of the left node followed by @end entries of node. */
        memcpy(slots, l_slots + split + 1, sizeof(void *) * tmp);
        memcpy(pivs, l_pivs + split + 1, sizeof(unsigned long) * tmp);
        pivs[tmp] = l_mas.max;
        memcpy(slots + tmp, ma_slots(node, mt), sizeof(void *) * end);
        memcpy(pivs + tmp, ma_pivots(node, mt), sizeof(unsigned long) * end);

        /* The left node now ends at the split pivot; mas starts right after it. */
        l_mas.max = l_pivs[split];
        mas->min = l_mas.max + 1;
        old_eparent = mt_mk_node(mte_parent(l_mas.node),
                             mas_parent_type(&l_mas, l_mas.node));
        tmp += end;
        if (!in_rcu) {
                unsigned char max_p = mt_pivots[mt];
                unsigned char max_s = mt_slots[mt];

                /* Zero the unused tail of the scratch node before copying it. */
                if (tmp < max_p)
                        memset(pivs + tmp, 0,
                               sizeof(unsigned long) * (max_p - tmp));

                if (tmp < mt_slots[mt])
                        memset(slots + tmp, 0, sizeof(void *) * (max_s - tmp));

                /* Overwrite the live node in place and fix the parent pivot. */
                memcpy(node, newnode, sizeof(struct maple_node));
                ma_set_meta(node, mt, 0, tmp - 1);
                mte_set_pivot(old_eparent, mte_parent_slot(l_mas.node),
                              l_pivs[split]);

                /* Remove data from l_pivs. */
                tmp = split + 1;
                memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp));
                memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp));
                ma_set_meta(left, mt, 0, split);
                eparent = old_eparent;

                goto done;
        }

        /* RCU requires replacing both l_mas, mas, and parent. */
        mas->node = mt_mk_node(newnode, mt);
        /*
         * NOTE(review): the in-place path above records tmp - 1 as the end
         * while this path records tmp - confirm which is intended.
         */
        ma_set_meta(newnode, mt, 0, tmp);

        /* Build the replacement for the left node from its first entries. */
        new_left = mas_pop_node(mas);
        new_left->parent = left->parent;
        mt = mte_node_type(l_mas.node);
        slots = ma_slots(new_left, mt);
        pivs = ma_pivots(new_left, mt);
        /*
         * NOTE(review): split entries are copied but the end is recorded as
         * split - confirm whether split + 1 entries were intended.
         */
        memcpy(slots, l_slots, sizeof(void *) * split);
        memcpy(pivs, l_pivs, sizeof(unsigned long) * split);
        ma_set_meta(new_left, mt, 0, split);
        l_mas.node = mt_mk_node(new_left, mt);

        /* replace parent. */
        offset = mte_parent_slot(mas->node);
        mt = mas_parent_type(&l_mas, l_mas.node);
        parent = mas_pop_node(mas);
        slots = ma_slots(parent, mt);
        pivs = ma_pivots(parent, mt);
        /* Start from a copy of the old parent, then patch the two children. */
        memcpy(parent, mte_to_node(old_eparent), sizeof(struct maple_node));
        rcu_assign_pointer(slots[offset], mas->node);
        rcu_assign_pointer(slots[offset - 1], l_mas.node);
        pivs[offset - 1] = l_mas.max;
        eparent = mt_mk_node(parent, mt);
done:
        /* Record the largest gap of each child in the (new or old) parent. */
        gap = mas_leaf_max_gap(mas);
        mte_set_gap(eparent, mte_parent_slot(mas->node), gap);
        gap = mas_leaf_max_gap(&l_mas);
        mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap);
        mas_ascend(mas);

        if (in_rcu) {
                /* Swap the new parent in and re-parent its children. */
                mas_replace_node(mas, old_eparent);
                mas_adopt_children(mas, mas->node);
        }

        mas_update_gap(mas);
}

/*
 * mas_split_final_node() - Split the final node in a subtree operation.
 * @mast: the maple subtree state
 * @mas: The maple state
 * @height: The height of the tree in case it's a new root.
 */
static inline void mas_split_final_node(struct maple_subtree_state *mast,
                                        struct ma_state *mas, int height)
{
        struct maple_enode *ancestor;

        if (mte_is_root(mas->node)) {
                if (mt_is_alloc(mas->tree))
                        mast->bn->type = maple_arange_64;
                else
                        mast->bn->type = maple_range_64;
                mas->depth = height;
        }
        /*
         * Only a single node is used here, could be root.
         * The Big_node data should just fit in a single node.
         */
        ancestor = mas_new_ma_node(mas, mast->bn);
        mas_set_parent(mas, mast->l->node, ancestor, mast->l->offset);
        mas_set_parent(mas, mast->r->node, ancestor, mast->r->offset);
        mte_to_node(ancestor)->parent = mas_mn(mas)->parent;

        mast->l->node = ancestor;
        mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, mast->l, true);
        mas->offset = mast->bn->b_end - 1;
}

/*
 * mast_fill_bnode() - Copy data into the big node in the subtree state
 * @mast: The maple subtree state
 * @mas: the maple state
 * @skip: The number of entries to skip for new nodes insertion.
 */
static inline void mast_fill_bnode(struct maple_subtree_state *mast,
                                         struct ma_state *mas,
                                         unsigned char skip)
{
        bool cp = true;
        unsigned char split;

        memset(mast->bn->gap, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->gap));
        memset(mast->bn->slot, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->slot));
        memset(mast->bn->pivot, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->pivot));
        mast->bn->b_end = 0;

        if (mte_is_root(mas->node)) {
                cp = false;
        } else {
                mas_ascend(mas);
                mas->offset = mte_parent_slot(mas->node);
        }

        if (cp && mast->l->offset)
                mas_mab_cp(mas, 0, mast->l->offset - 1, mast->bn, 0);

        split = mast->bn->b_end;
        mab_set_b_end(mast->bn, mast->l, mast->l->node);
        mast->r->offset = mast->bn->b_end;
        mab_set_b_end(mast->bn, mast->r, mast->r->node);
        if (mast->bn->pivot[mast->bn->b_end - 1] == mas->max)
                cp = false;

        if (cp)
                mas_mab_cp(mas, split + skip, mt_slot_count(mas->node) - 1,
                           mast->bn, mast->bn->b_end);

        mast->bn->b_end--;
        mast->bn->type = mte_node_type(mas->node);
}

/*
 * mast_split_data() - Split the data in the subtree state big node into regular
 * nodes.
 * @mast: The maple subtree state
 * @mas: The maple state
 * @split: The location to split the big node
 */
static inline void mast_split_data(struct maple_subtree_state *mast,
           struct ma_state *mas, unsigned char split)
{
        unsigned char p_slot;

        mab_mas_cp(mast->bn, 0, split, mast->l, true);
        mte_set_pivot(mast->r->node, 0, mast->r->max);
        mab_mas_cp(mast->bn, split + 1, mast->bn->b_end, mast->r, false);
        mast->l->offset = mte_parent_slot(mas->node);
        mast->l->max = mast->bn->pivot[split];
        mast->r->min = mast->l->max + 1;
        if (mte_is_leaf(mas->node))
                return;

        p_slot = mast->orig_l->offset;
        mas_set_split_parent(mast->orig_l, mast->l->node, mast->r->node,
                             &p_slot, split);
        mas_set_split_parent(mast->orig_r, mast->l->node, mast->r->node,
                             &p_slot, split);
}

/*
 * mas_push_data() - Instead of splitting a node, it is beneficial to push the
 * data to the right or left node if there is room.
 * @mas: The maple state
 * @height: The current height of the maple state
 * @mast: The maple subtree state
 * @left: Push left or not.
 *
 * Keeping the height of the tree low means faster lookups.
 *
 * The sibling's data is merged into mast->bn, which is then split across the
 * two new nodes in mast->l and mast->r; the parent is rebuilt as a single
 * final node.  When pushing left, @mas is switched to the previous sibling.
 * Nothing is modified when false is returned.
 *
 * Return: True if pushed, false otherwise.
 */
static inline bool mas_push_data(struct ma_state *mas, int height,
                                 struct maple_subtree_state *mast, bool left)
{
        unsigned char slot_total = mast->bn->b_end;
        unsigned char end, space, split;

        MA_STATE(tmp_mas, mas->tree, mas->index, mas->last);
        /* Work on a copy so that @mas is untouched if the push is refused. */
        tmp_mas = *mas;
        tmp_mas.depth = mast->l->depth;

        /* Without a sibling on the requested side there is nowhere to push. */
        if (left && !mas_prev_sibling(&tmp_mas))
                return false;
        else if (!left && !mas_next_sibling(&tmp_mas))
                return false;

        /* Combined amount of data: the big node plus the sibling. */
        end = mas_data_end(&tmp_mas);
        slot_total += end;
        space = 2 * mt_slot_count(mas->node) - 2;
        /* -2 instead of -1 to ensure there isn't a triple split */
        if (ma_is_leaf(mast->bn->type))
                space--;

        if (mas->max == ULONG_MAX)
                space--;

        /* The combined data would not fit into two nodes. */
        if (slot_total >= space)
                return false;

        /* Get the data; Fill mast->bn */
        mast->bn->b_end++;
        if (left) {
                /* Make room at the front, then prepend the sibling's data. */
                mab_shift_right(mast->bn, end + 1);
                mas_mab_cp(&tmp_mas, 0, end, mast->bn, 0);
                mast->bn->b_end = slot_total + 1;
        } else {
                /* Append the sibling's data after the existing data. */
                mas_mab_cp(&tmp_mas, 0, end, mast->bn, mast->bn->b_end);
        }

        /* Configure mast for splitting of mast->bn */
        split = mt_slots[mast->bn->type] - 2;
        if (left) {
                /*  Switch mas to prev node  */
                *mas = tmp_mas;
                /* Start using mast->l for the left side. */
                tmp_mas.node = mast->l->node;
                *mast->l = tmp_mas;
        } else {
                /* Keep the new right node but adopt the sibling's position. */
                tmp_mas.node = mast->r->node;
                *mast->r = tmp_mas;
                split = slot_total - split;
        }
        split = mab_no_null_split(mast->bn, split, mt_slots[mast->bn->type]);
        /* Update parent slot for split calculation. */
        if (left)
                mast->orig_l->offset += end + 1;

        /* Distribute the data, rebuild the parent, and finish in one node. */
        mast_split_data(mast, mas, split);
        mast_fill_bnode(mast, mas, 2);
        mas_split_final_node(mast, mas, height + 1);
        return true;
}

/*
 * mas_split() - Split data that is too big for one node into two.
 * @mas: The maple state
 * @b_node: The maple big node
 *
 * All nodes that may be needed are allocated before the tree is touched, so
 * an allocation failure leaves the tree unmodified and @mas in an error
 * state.
 *
 * Return: 1 on success, 0 on failure.
 */
static int mas_split(struct ma_state *mas, struct maple_big_node *b_node)
{
        struct maple_subtree_state mast;
        int height = 0;
        unsigned char mid_split, split = 0;
        struct maple_enode *old;

        /*
         * Splitting is handled differently from any other B-tree; the Maple
         * Tree splits upwards.  Splitting up means that the split operation
         * occurs when the walk of the tree hits the leaves and not on the way
         * down.  The reason for splitting up is that it is impossible to know
         * how much space will be needed until the leaf is (or leaves are)
         * reached.  Since overwriting data is allowed and a range could
         * overwrite more than one range or result in changing one entry into 3
         * entries, it is impossible to know if a split is required until the
         * data is examined.
         *
         * Splitting is a balancing act between keeping allocations to a minimum
         * and avoiding a 'jitter' event where a tree is expanded to make room
         * for an entry followed by a contraction when the entry is removed.  To
         * accomplish the balance, there are empty slots remaining in both left
         * and right nodes after a split.
         */
        MA_STATE(l_mas, mas->tree, mas->index, mas->last);
        MA_STATE(r_mas, mas->tree, mas->index, mas->last);
        MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last);
        MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last);

        trace_ma_op(__func__, mas);
        mas->depth = mas_mt_height(mas);
        /* Allocation failures will happen early. */
        mas_node_count(mas, 1 + mas->depth * 2);
        if (mas_is_err(mas))
                return 0;

        /*
         * l/r are the nodes being built at the current level; orig_l/orig_r
         * remember the pair built one level below.
         */
        mast.l = &l_mas;
        mast.r = &r_mas;
        mast.orig_l = &prev_l_mas;
        mast.orig_r = &prev_r_mas;
        mast.bn = b_node;

        /* One iteration per level, working from the leaf towards the root. */
        while (height++ <= mas->depth) {
                /* The data now fits in one node: write it out and stop. */
                if (mt_slots[b_node->type] > b_node->b_end) {
                        mas_split_final_node(&mast, mas, height);
                        break;
                }

                l_mas = r_mas = *mas;
                l_mas.node = mas_new_ma_node(mas, b_node);
                r_mas.node = mas_new_ma_node(mas, b_node);
                /*
                 * Another way that 'jitter' is avoided is to terminate a split up early if the
                 * left or right node has space to spare.  This is referred to as "pushing left"
                 * or "pushing right" and is similar to the B* tree, except the nodes left or
                 * right can rarely be reused due to RCU, but the ripple upwards is halted which
                 * is a significant savings.
                 */
                /* Try to push left. */
                if (mas_push_data(mas, height, &mast, true))
                        break;
                /* Try to push right. */
                if (mas_push_data(mas, height, &mast, false))
                        break;

                /* No room in either sibling: split into the two new nodes. */
                split = mab_calc_split(mas, b_node, &mid_split, prev_l_mas.min);
                mast_split_data(&mast, mas, split);
                /*
                 * Usually correct, mab_mas_cp in the above call overwrites
                 * r->max.
                 */
                mast.r->max = mas->max;
                /* Rebuild b_node for the parent level; this also ascends mas. */
                mast_fill_bnode(&mast, mas, 1);
                prev_l_mas = *mast.l;
                prev_r_mas = *mast.r;
        }

        /* Set the original node as dead */
        old = mas->node;
        /*
         * On the break paths above, mas_split_final_node() has stored the
         * top node of the new sub-tree in l_mas.node.
         */
        mas->node = l_mas.node;
        mas_wmb_replace(mas, old);
        mtree_range_walk(mas);
        return 1;
}

/*
 * mas_reuse_node() - Reuse the node to store the data.
 * @wr_mas: The maple write state
 * @bn: The maple big node
 * @end: The end of the data.
 *
 * Will always return false in RCU mode.
 *
 * Return: True if node was reused, false otherwise.
 */
static inline bool mas_reuse_node(struct ma_wr_state *wr_mas,
                          struct maple_big_node *bn, unsigned char end)
{
        /* Need to be rcu safe. */
        if (mt_in_rcu(wr_mas->mas->tree))
                return false;

        if (end > bn->b_end) {
                int clear = mt_slots[wr_mas->type] - bn->b_end;

                memset(wr_mas->slots + bn->b_end, 0, sizeof(void *) * clear--);
                memset(wr_mas->pivots + bn->b_end, 0, sizeof(void *) * clear);
        }
        mab_mas_cp(bn, 0, bn->b_end, wr_mas->mas, false);
        return true;
}

/*
 * mas_commit_b_node() - Commit the big node into the tree.
 * @wr_mas: The maple write state
 * @b_node: The maple big node
 * @end: The end of the data.
 */
static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas,
                            struct maple_big_node *b_node, unsigned char end)
{
        struct maple_node *node;
        struct maple_enode *old_enode;
        unsigned char b_end = b_node->b_end;
        enum maple_type b_type = b_node->type;

        old_enode = wr_mas->mas->node;
        if ((b_end < mt_min_slots[b_type]) &&
            (!mte_is_root(old_enode)) &&
            (mas_mt_height(wr_mas->mas) > 1))
                return mas_rebalance(wr_mas->mas, b_node);

        if (b_end >= mt_slots[b_type])
                return mas_split(wr_mas->mas, b_node);

        if (mas_reuse_node(wr_mas, b_node, end))
                goto reuse_node;

        mas_node_count(wr_mas->mas, 1);
        if (mas_is_err(wr_mas->mas))
                return 0;

        node = mas_pop_node(wr_mas->mas);
        node->parent = mas_mn(wr_mas->mas)->parent;
        wr_mas->mas->node = mt_mk_node(node, b_type);
        mab_mas_cp(b_node, 0, b_end, wr_mas->mas, false);
        mas_replace_node(wr_mas->mas, old_enode);
reuse_node:
        mas_update_gap(wr_mas->mas);
        wr_mas->mas->end = b_end;
        return 1;
}

/*
 * mas_root_expand() - Expand a root to a node
 * @mas: The maple state
 * @entry: The entry to store into the tree
 *
 * Replaces the current root with a new maple_leaf_64 node that holds the
 * previous root contents (only when @mas->index is non-zero and the contents
 * are non-NULL), @entry over [@mas->index, @mas->last], and a trailing range
 * up to ULONG_MAX when @mas->last is below it.
 *
 * Return: 0 if the node allocation failed, otherwise the index of the last
 * slot used in the new node (which may also be 0).
 */
static inline int mas_root_expand(struct ma_state *mas, void *entry)
{
        void *old_root = mas_root_locked(mas);
        enum maple_type type = maple_leaf_64;
        struct maple_node *node;
        unsigned long *pivots;
        void __rcu **slots;
        int idx = 0;

        mas_node_count(mas, 1);
        if (unlikely(mas_is_err(mas)))
                return 0;

        node = mas_pop_node(mas);
        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);
        node->parent = ma_parent_ptr(mas_tree_parent(mas));
        mas->node = mt_mk_node(node, type);
        mas->status = ma_active;

        if (mas->index) {
                if (old_root) {
                        /*
                         * Keep the old root contents in slot 0.
                         * NOTE(review): pivots[0] is not written when
                         * index > 1; this presumably relies on the popped
                         * node being zeroed - confirm.
                         */
                        rcu_assign_pointer(slots[0], old_root);
                        if (likely(mas->index > 1))
                                idx = 1;
                }
                /* Close the range that ends just before the new entry. */
                pivots[idx] = mas->index - 1;
                idx++;
        }

        rcu_assign_pointer(slots[idx], entry);
        mas->offset = idx;
        pivots[idx] = mas->last;
        if (mas->last != ULONG_MAX) {
                /* Trailing range after the entry, up to the end of the tree. */
                idx++;
                pivots[idx] = ULONG_MAX;
        }

        mas->depth = 1;
        mas_set_height(mas);
        ma_set_meta(node, type, 0, idx);
        /* Publish the new node as the root of the tree. */
        rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node));
        return idx;
}

/*
 * mas_store_root() - Store an entry into the root of the tree.
 * @mas: The maple state
 * @entry: The entry to store
 *
 * The entry is stored directly in the root pointer only when the range is
 * exactly [0, 0] and the low two bits of @entry are not 0b10; every other
 * case expands the root into a node.  The result of mas_root_expand() is not
 * checked here.
 *
 * NOTE(review): the 0b10 bit pattern presumably marks values that cannot be
 * told apart from a node pointer - confirm against the entry encoding.
 */
static inline void mas_store_root(struct ma_state *mas, void *entry)
{
        if (unlikely(!mas->last && !mas->index) &&
            ((unsigned long)entry & 3) != 2) {
                rcu_assign_pointer(mas->tree->ma_root, entry);
                mas->status = ma_start;
                return;
        }

        mas_root_expand(mas, entry);
}

/*
 * mas_is_span_wr() - Check if the write needs to be treated as a write that
 * spans the node.
 * @mas: The maple state
 * @piv: The pivot value being written
 * @type: The maple node type
 * @entry: The data to write
 *
 * Spanning writes are writes that start in one node and end in another OR if
 * the write of a %NULL will cause the node to end with a %NULL.
 *
 * Return: True if this is a spanning write, false otherwise.
 */
static bool mas_is_span_wr(struct ma_wr_state *wr_mas)
{
        unsigned long max = wr_mas->r_max;
        unsigned long last = wr_mas->mas->last;
        enum maple_type type = wr_mas->type;
        void *entry = wr_mas->entry;

        /* Contained in this pivot, fast path */
        if (last < max)
                return false;

        if (ma_is_leaf(type)) {
                max = wr_mas->mas->max;
                if (last < max)
                        return false;
        }

        if (last == max) {
                /*
                 * The last entry of leaf node cannot be NULL unless it is the
                 * rightmost node (writing ULONG_MAX), otherwise it spans slots.
                 */
                if (entry || last == ULONG_MAX)
                        return false;
        }

        trace_ma_write(__func__, wr_mas->mas, wr_mas->r_max, entry);
        return true;
}

static inline void mas_wr_walk_descend(struct ma_wr_state *wr_mas)
{
        wr_mas->type = mte_node_type(wr_mas->mas->node);
        mas_wr_node_walk(wr_mas);
        wr_mas->slots = ma_slots(wr_mas->node, wr_mas->type);
}

static inline void mas_wr_walk_traverse(struct ma_wr_state *wr_mas)
{
        wr_mas->mas->max = wr_mas->r_max;
        wr_mas->mas->min = wr_mas->r_min;
        wr_mas->mas->node = wr_mas->content;
        wr_mas->mas->offset = 0;
        wr_mas->mas->depth++;
}
/*
 * mas_wr_walk() - Walk the tree for a write.
 * @wr_mas: The maple write state
 *
 * Descends one level at a time until a leaf is reached, or stops as soon as
 * a level reports that the write spans beyond the node.
 *
 * Uses mas_slot_locked() and does not need to worry about dead nodes.
 *
 * Return: True if it's contained in a node, false on spanning write.
 */
static bool mas_wr_walk(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;

        for (;;) {
                mas_wr_walk_descend(wr_mas);
                if (unlikely(mas_is_span_wr(wr_mas)))
                        return false;

                wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
                                                  mas->offset);
                if (ma_is_leaf(wr_mas->type))
                        break;

                mas_wr_walk_traverse(wr_mas);
        }

        return true;
}

/*
 * mas_wr_walk_index() - Walk the tree down to a leaf for a write, without
 * checking for a spanning write.
 * @wr_mas: The maple write state
 *
 * Return: Always true; the walk only ends once a leaf has been reached.
 */
static bool mas_wr_walk_index(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        bool at_leaf;

        do {
                mas_wr_walk_descend(wr_mas);
                wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
                                                  mas->offset);
                at_leaf = ma_is_leaf(wr_mas->type);
                if (!at_leaf)
                        mas_wr_walk_traverse(wr_mas);
        } while (!at_leaf);

        return true;
}
/*
 * mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs.
 * @l_wr_mas: The left maple write state
 * @r_wr_mas: The right maple write state
 *
 * On the left, the index is pulled back to the start of the current range
 * when that range is empty, and further back over the preceding slot when
 * that is empty too.  On the right, the last value is pushed forward to the
 * end of the current range when it is empty, or over the following slot when
 * the store ends exactly on the range and the next slot is empty.  Offsets
 * are adjusted to match.
 */
static inline void mas_extend_spanning_null(struct ma_wr_state *l_wr_mas,
                                            struct ma_wr_state *r_wr_mas)
{
        struct ma_state *r_mas = r_wr_mas->mas;
        struct ma_state *l_mas = l_wr_mas->mas;
        unsigned char l_off = l_mas->offset;

        /* Left side: absorb the empty range being written into. */
        if (!l_wr_mas->content)
                l_mas->index = l_wr_mas->r_min;

        /* Left side: absorb the previous slot as well if it is empty. */
        if (l_mas->index == l_wr_mas->r_min && l_off &&
            !mas_slot_locked(l_mas, l_wr_mas->slots, l_off - 1)) {
                l_mas->index = (l_off > 1) ?
                               l_wr_mas->pivots[l_off - 2] + 1 : l_mas->min;
                l_mas->offset = l_off - 1;
        }

        /* Right side: the range being written into is empty. */
        if (!r_wr_mas->content) {
                if (r_mas->last < r_wr_mas->r_max)
                        r_mas->last = r_wr_mas->r_max;
                r_mas->offset++;
                return;
        }

        /* Right side: only look ahead when ending exactly on the range. */
        if (r_mas->last != r_wr_mas->r_max || r_mas->last >= r_mas->max)
                return;

        if (mas_slot_locked(r_mas, r_wr_mas->slots, r_mas->offset + 1))
                return;

        /* The next slot is empty: extend the store over it. */
        r_mas->last = mas_safe_pivot(r_mas, r_wr_mas->pivots,
                                     r_wr_mas->type, r_mas->offset + 1);
        r_mas->offset++;
}

/*
 * mas_state_walk() - Start the maple state and walk to the entry for its index.
 * @mas: The maple state
 *
 * Return: %NULL when the state ends up as none, the value returned by
 * mas_start() when the state is a pointer state, otherwise the result of
 * mtree_range_walk().
 */
static inline void *mas_state_walk(struct ma_state *mas)
{
        void *root_entry = mas_start(mas);

        if (mas_is_none(mas))
                return NULL;

        return mas_is_ptr(mas) ? root_entry : mtree_range_walk(mas);
}

/*
 * mtree_lookup_walk() - Internal quick lookup that does not keep maple state up
 * to date.
 *
 * @mas: The maple state.
 *
 * Starting at @mas->node, pick at every level the first slot whose pivot is
 * not below @mas->index (or the slot after the last pivot) and follow it
 * until a leaf has been processed.  Each node is checked for being dead after
 * its slot has been read; on a dead node the state is reset.
 *
 * Note: Leaves mas in undesirable state.
 * Return: The entry for @mas->index or %NULL on dead node.
 */
static inline void *mtree_lookup_walk(struct ma_state *mas)
{
        struct maple_enode *cur = mas->node;
        struct maple_node *node;
        enum maple_type type;
        unsigned long *pivots;
        void __rcu **slots;
        unsigned char offset;
        unsigned char limit;

        do {
                node = mte_to_node(cur);
                type = mte_node_type(cur);
                pivots = ma_pivots(node, type);
                limit = mt_pivots[type];

                /* Find the first pivot that is >= index, if there is one. */
                offset = 0;
                while (pivots[offset] < mas->index) {
                        if (++offset >= limit)
                                break;
                }

                slots = ma_slots(node, type);
                cur = mt_slot(mas->tree, slots, offset);
                if (unlikely(ma_dead_node(node))) {
                        mas_reset(mas);
                        return NULL;
                }
        } while (!ma_is_leaf(type));

        return (void *)cur;
}

static void mte_destroy_walk(struct maple_enode *, struct maple_tree *);
/*
 * mas_new_root() - Create a new root node that only contains the entry passed
 * in.
 * @mas: The maple state
 * @entry: The entry to store.
 *
 * Only valid when the index == 0 and the last == ULONG_MAX
 *
 * Return 0 on error, 1 on success.
 */
static inline int mas_new_root(struct ma_state *mas, void *entry)
{
        struct maple_enode *root = mas_root_locked(mas);
        enum maple_type type = maple_leaf_64;
        struct maple_node *node;
        void __rcu **slots;
        unsigned long *pivots;

        if (!entry && !mas->index && mas->last == ULONG_MAX) {
                mas->depth = 0;
                mas_set_height(mas);
                rcu_assign_pointer(mas->tree->ma_root, entry);
                mas->status = ma_start;
                goto done;
        }

        mas_node_count(mas, 1);
        if (mas_is_err(mas))
                return 0;

        node = mas_pop_node(mas);
        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);
        node->parent = ma_parent_ptr(mas_tree_parent(mas));
        mas->node = mt_mk_node(node, type);
        mas->status = ma_active;
        rcu_assign_pointer(slots[0], entry);
        pivots[0] = mas->last;
        mas->depth = 1;
        mas_set_height(mas);
        rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node));

done:
        if (xa_is_node(root))
                mte_destroy_walk(root, mas->tree);

        return 1;
}
/*
 * mas_wr_spanning_store() - Create a subtree with the store operation completed
 * and new nodes where necessary, then place the sub-tree in the actual tree.
 * Note that mas is expected to point to the node which caused the store to
 * span.
 * @wr_mas: The maple write state
 *
 * A store that covers 0 - ULONG_MAX, either as requested or once a NULL store
 * has been extended over neighbouring ranges, is handed to mas_new_root()
 * instead of being rebalanced.
 *
 * Return: 0 if the nodes needed for the operation could not be reserved,
 * otherwise the return value of mas_new_root() or mas_spanning_rebalance().
 */
static inline int mas_wr_spanning_store(struct ma_wr_state *wr_mas)
{
        struct maple_subtree_state mast;
        struct maple_big_node b_node;
        struct ma_state *mas;
        unsigned char height;

        /* Left and Right side of spanning store */
        MA_STATE(l_mas, NULL, 0, 0);
        MA_STATE(r_mas, NULL, 0, 0);
        MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry);
        MA_WR_STATE(l_wr_mas, &l_mas, wr_mas->entry);

        /*
         * A store operation that spans multiple nodes is called a spanning
         * store and is handled early in the store call stack by the function
         * mas_is_span_wr().  When a spanning store is identified, the maple
         * state is duplicated.  The first maple state walks the left tree path
         * to ``index``, the duplicate walks the right tree path to ``last``.
         * The data in the two nodes are combined into a single node, two nodes,
         * or possibly three nodes (see the 3-way split above).  A ``NULL``
         * written to the last entry of a node is considered a spanning store as
         * a rebalance is required for the operation to complete and an overflow
         * of data may happen.
         */
        mas = wr_mas->mas;
        trace_ma_op(__func__, mas);

        /* Overwriting the whole range needs no rebalance, just a new root. */
        if (unlikely(!mas->index && mas->last == ULONG_MAX))
                return mas_new_root(mas, wr_mas->entry);
        /*
         * Node rebalancing may occur due to this store, so there may be three new
         * entries per level plus a new root.
         */
        height = mas_mt_height(mas);
        mas_node_count(mas, 1 + height * 3);
        if (mas_is_err(mas))
                return 0;

        /*
         * Set up right side.  Need to get to the next offset after the spanning
         * store to ensure it's not NULL and to combine both the next node and
         * the node with the start together.
         */
        r_mas = *mas;
        /* Avoid overflow, walk to next slot in the tree. */
        if (r_mas.last + 1)
                r_mas.last++;

        r_mas.index = r_mas.last;
        mas_wr_walk_index(&r_wr_mas);
        /* The walk is done; restore the right state to the end of the store. */
        r_mas.last = r_mas.index = mas->last;

        /* Set up left side. */
        l_mas = *mas;
        mas_wr_walk_index(&l_wr_mas);

        if (!wr_mas->entry) {
                /* Storing NULL: take over the ranges picked by the helper. */
                mas_extend_spanning_null(&l_wr_mas, &r_wr_mas);
                mas->offset = l_mas.offset;
                mas->index = l_mas.index;
                mas->last = l_mas.last = r_mas.last;
        }

        /* expanding NULLs may make this cover the entire range */
        if (!l_mas.index && r_mas.last == ULONG_MAX) {
                mas_set_range(mas, 0, ULONG_MAX);
                return mas_new_root(mas, wr_mas->entry);
        }

        memset(&b_node, 0, sizeof(struct maple_big_node));
        /* Copy l_mas and store the value in b_node. */
        mas_store_b_node(&l_wr_mas, &b_node, l_mas.end);
        /* Copy r_mas into b_node. */
        if (r_mas.offset <= r_mas.end)
                mas_mab_cp(&r_mas, r_mas.offset, r_mas.end,
                           &b_node, b_node.b_end + 1);
        else
                /* Nothing is copied from the right node; only b_end advances. */
                b_node.b_end++;

        /* Stop spanning searches by searching for just index. */
        l_mas.index = l_mas.last = mas->index;

        mast.bn = &b_node;
        mast.orig_l = &l_mas;
        mast.orig_r = &r_mas;
        /* Combine l_mas and r_mas and split them up evenly again. */
        return mas_spanning_rebalance(mas, &mast, height + 1);
}

/*
 * mas_wr_node_store() - Attempt to store the value in a node
 * @wr_mas: The maple write state
 * @new_end: The offset of the last slot in use once the store is complete
 *
 * Builds the modified node contents in a separate node and then swaps it in.
 * In RCU mode the contents are built in a newly allocated node which replaces
 * the old one; otherwise they are built in an on-stack node which is copied
 * over the existing node.
 *
 * Attempts to reuse the node, but may allocate.
 *
 * Return: True if stored, false otherwise.  False is returned both when the
 * resulting node would hold too little data and when the node allocation
 * fails; in the latter case @wr_mas->mas is in an error state.
 */
static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas,
                                     unsigned char new_end)
{
        struct ma_state *mas = wr_mas->mas;
        void __rcu **dst_slots;
        unsigned long *dst_pivots;
        unsigned char dst_offset, offset_end = wr_mas->offset_end;
        struct maple_node reuse, *newnode;
        unsigned char copy_size, node_pivots = mt_pivots[wr_mas->type];
        bool in_rcu = mt_in_rcu(mas->tree);

        /* Check if there is enough data. The room is enough. */
        if (!mte_is_root(mas->node) && (new_end <= mt_min_slots[wr_mas->type]) &&
            !(mas->mas_flags & MA_STATE_BULK))
                return false;

        if (mas->last == wr_mas->end_piv)
                offset_end++; /* don't copy this offset */
        else if (unlikely(wr_mas->r_max == ULONG_MAX))
                mas_bulk_rebalance(mas, mas->end, wr_mas->type);

        /* set up node. */
        if (in_rcu) {
                mas_node_count(mas, 1);
                if (mas_is_err(mas))
                        return false;

                newnode = mas_pop_node(mas);
        } else {
                /* Scratch copy on the stack; written back over the node below. */
                memset(&reuse, 0, sizeof(struct maple_node));
                newnode = &reuse;
        }

        newnode->parent = mas_mn(mas)->parent;
        dst_pivots = ma_pivots(newnode, wr_mas->type);
        dst_slots = ma_slots(newnode, wr_mas->type);
        /* Copy from start to insert point */
        memcpy(dst_pivots, wr_mas->pivots, sizeof(unsigned long) * mas->offset);
        memcpy(dst_slots, wr_mas->slots, sizeof(void *) * mas->offset);

        /* Handle insert of new range starting after old range */
        if (wr_mas->r_min < mas->index) {
                /* Keep the old content for the part below the new range. */
                rcu_assign_pointer(dst_slots[mas->offset], wr_mas->content);
                dst_pivots[mas->offset++] = mas->index - 1;
        }

        /* Store the new entry and range end. */
        if (mas->offset < node_pivots)
                dst_pivots[mas->offset] = mas->last;
        rcu_assign_pointer(dst_slots[mas->offset], wr_mas->entry);

        /*
         * this range wrote to the end of the node or it overwrote the rest of
         * the data
         */
        if (offset_end > mas->end)
                goto done;

        dst_offset = mas->offset + 1;
        /* Copy to the end of node if necessary. */
        copy_size = mas->end - offset_end + 1;
        memcpy(dst_slots + dst_offset, wr_mas->slots + offset_end,
               sizeof(void *) * copy_size);
        /* One pivot fewer than slots is copied; the last one is set below. */
        memcpy(dst_pivots + dst_offset, wr_mas->pivots + offset_end,
               sizeof(unsigned long) * (copy_size - 1));

        if (new_end < node_pivots)
                dst_pivots[new_end] = mas->max;

done:
        /*
         * NOTE(review): the metadata is written with a hard-coded
         * maple_leaf_64 type rather than wr_mas->type - confirm this is
         * intended for every node type that can reach this point.
         */
        mas_leaf_set_meta(newnode, maple_leaf_64, new_end);
        if (in_rcu) {
                struct maple_enode *old_enode = mas->node;

                mas->node = mt_mk_node(newnode, wr_mas->type);
                mas_replace_node(mas, old_enode);
        } else {
                memcpy(wr_mas->node, newnode, sizeof(struct maple_node));
        }
        trace_ma_write(__func__, mas, 0, wr_mas->entry);
        mas_update_gap(mas);
        mas->end = new_end;
        return true;
}

/*
 * mas_wr_slot_store: Attempt to store a value in a slot.
 * @wr_mas: the maple write state
 *
 * Rewrites slots and pivots of the existing node in place.  Two shapes are
 * handled: a write that touches exactly two adjacent ranges, and (outside of
 * RCU mode only) a write that partially overwrites the previous and the next
 * range around a middle one.
 *
 * Return: True if stored, false otherwise
 */
static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        void __rcu **slots = wr_mas->slots;
        unsigned long *pivots = wr_mas->pivots;
        unsigned char slot = mas->offset;
        bool has_null;

        /* Remember whether any of the slots involved was empty. */
        has_null = !mt_slot_locked(mas->tree, slots, slot);
        if (!mt_slot_locked(mas->tree, slots, slot + 1))
                has_null = true;

        if (wr_mas->offset_end - slot == 1) {
                if (mas->index != wr_mas->r_min) {
                        /* Overwriting a part of the range and the next one */
                        rcu_assign_pointer(slots[slot + 1], wr_mas->entry);
                        pivots[slot] = mas->index - 1;
                        mas->offset++; /* Keep mas accurate. */
                } else {
                        /* Overwriting the range and a part of the next one */
                        rcu_assign_pointer(slots[slot], wr_mas->entry);
                        pivots[slot] = mas->last;
                }
        } else {
                /* The three-range form is not attempted in RCU mode. */
                if (mt_in_rcu(mas->tree))
                        return false;

                /*
                 * Expand the range, only partially overwriting the previous and
                 * next ranges
                 */
                if (!mt_slot_locked(mas->tree, slots, slot + 2))
                        has_null = true;
                rcu_assign_pointer(slots[slot + 1], wr_mas->entry);
                pivots[slot] = mas->index - 1;
                pivots[slot + 1] = mas->last;
                mas->offset++; /* Keep mas accurate. */
        }

        trace_ma_write(__func__, mas, 0, wr_mas->entry);
        /*
         * Only update gap when the new entry is empty or there is an empty
         * entry in the original ranges.
         */
        if (has_null || !wr_mas->entry)
                mas_update_gap(mas);

        return true;
}

static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;

        if (!wr_mas->slots[wr_mas->offset_end]) {
                /* If this one is null, the next and prev are not */
                mas->last = wr_mas->end_piv;
        } else {
                /* Check next slot(s) if we are overwriting the end */
                if ((mas->last == wr_mas->end_piv) &&
                    (mas->end != wr_mas->offset_end) &&
                    !wr_mas->slots[wr_mas->offset_end + 1]) {
                        wr_mas->offset_end++;
                        if (wr_mas->offset_end == mas->end)
                                mas->last = mas->max;
                        else
                                mas->last = wr_mas->pivots[wr_mas->offset_end];
                        wr_mas->end_piv = mas->last;
                }
        }

        if (!wr_mas->content) {
                /* If this one is null, the next and prev are not */
                mas->index = wr_mas->r_min;
        } else {
                /* Check prev slot if we are overwriting the start */
                if (mas->index == wr_mas->r_min && mas->offset &&
                    !wr_mas->slots[mas->offset - 1]) {
                        mas->offset--;
                        wr_mas->r_min = mas->index =
                                mas_safe_min(mas, wr_mas->pivots, mas->offset);
                        wr_mas->r_max = wr_mas->pivots[mas->offset];
                }
        }
}

static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas)
{
        while ((wr_mas->offset_end < wr_mas->mas->end) &&
               (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end]))
                wr_mas->offset_end++;

        if (wr_mas->offset_end < wr_mas->mas->end)
                wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end];
        else
                wr_mas->end_piv = wr_mas->mas->max;

        if (!wr_mas->entry)
                mas_wr_extend_null(wr_mas);
}

/*
 * mas_wr_new_end() - Compute the offset of the last slot after the write.
 * @wr_mas: The maple write state
 *
 * Starts from the current end plus two (one new slot on each side of the
 * written range), removes the slots the write spans, and removes one more
 * for each side of the write that lines up with an existing boundary.
 *
 * Return: The new end offset.
 */
static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        unsigned char new_end;

        new_end = mas->end + 2 - (wr_mas->offset_end - mas->offset);
        /* Start lines up with an existing range start: no slot before. */
        new_end -= (wr_mas->r_min == mas->index);
        /* End lines up with an existing pivot: no slot after. */
        new_end -= (wr_mas->end_piv == mas->last);

        return new_end;
}

/*
 * mas_wr_append: Attempt to append
 * @wr_mas: the maple write state
 * @new_end: The end of the node after the modification
 *
 * This is currently unsafe in rcu mode since the end of the node may be cached
 * by readers while the node contents may be updated which could result in
 * inaccurate information.
 *
 * Return: True if appended, false otherwise
 */
static inline bool mas_wr_append(struct ma_wr_state *wr_mas,
                unsigned char new_end)
{
        struct ma_state *mas;
        void __rcu **slots;
        unsigned char end;

        mas = wr_mas->mas;
        if (mt_in_rcu(mas->tree))
                return false;

        end = mas->end;
        if (mas->offset != end)
                return false;

        if (new_end < mt_pivots[wr_mas->type]) {
                wr_mas->pivots[new_end] = wr_mas->pivots[end];
                ma_set_meta(wr_mas->node, wr_mas->type, 0, new_end);
        }

        slots = wr_mas->slots;
        if (new_end == end + 1) {
                if (mas->last == wr_mas->r_max) {
                        /* Append to end of range */
                        rcu_assign_pointer(slots[new_end], wr_mas->entry);
                        wr_mas->pivots[end] = mas->index - 1;
                        mas->offset = new_end;
                } else {
                        /* Append to start of range */
                        rcu_assign_pointer(slots[new_end], wr_mas->content);
                        wr_mas->pivots[end] = mas->last;
                        rcu_assign_pointer(slots[end], wr_mas->entry);
                }
        } else {
                /* Append to the range without touching any boundaries. */
                rcu_assign_pointer(slots[new_end], wr_mas->content);
                wr_mas->pivots[end + 1] = mas->last;
                rcu_assign_pointer(slots[end + 1], wr_mas->entry);
                wr_mas->pivots[end] = mas->index - 1;
                mas->offset = end + 1;
        }

        if (!wr_mas->content || !wr_mas->entry)
                mas_update_gap(mas);

        mas->end = new_end;
        trace_ma_write(__func__, mas, new_end, wr_mas->entry);
        return  true;
}

/*
 * mas_wr_bnode() - Slow path for a modification.
 * @wr_mas: The write maple state
 *
 * This is where split, rebalance end up.
 */
static void mas_wr_bnode(struct ma_wr_state *wr_mas)
{
        struct maple_big_node b_node;

        trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry);
        memset(&b_node, 0, sizeof(struct maple_big_node));
        mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end);
        mas_commit_b_node(wr_mas, &b_node, wr_mas->mas->end);
}

/*
 * mas_wr_modify() - Apply the write to the leaf, trying the cheapest method
 * first.
 * @wr_mas: The maple write state
 *
 * Order of attempts: direct slot replacement, append, slot store, node store
 * and finally the big-node slow path.  If mas_wr_node_store() fails and left
 * the maple state in an error state, the slow path is not attempted.
 */
static inline void mas_wr_modify(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        unsigned char new_end;

        /* Direct replacement */
        if (wr_mas->r_min == mas->index && wr_mas->r_max == mas->last) {
                rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry);
                /* The gap changes only when NULL-ness of the slot flips. */
                if (!wr_mas->entry != !wr_mas->content)
                        mas_update_gap(mas);
                return;
        }

        /*
         * The fast paths only apply while new_end still fits in the node;
         * otherwise go straight to the slow path.
         */
        new_end = mas_wr_new_end(wr_mas);
        if (new_end < mt_slots[wr_mas->type]) {
                /* Attempt to append */
                if (mas_wr_append(wr_mas, new_end))
                        return;

                if (new_end == mas->end && mas_wr_slot_store(wr_mas))
                        return;

                if (mas_wr_node_store(wr_mas, new_end))
                        return;

                if (mas_is_err(mas))
                        return;
        }

        mas_wr_bnode(wr_mas);
}

/*
 * mas_wr_store_entry() - Internal call to store a value
 * @wr_mas: The maple write state; the entry to store is @wr_mas->entry and
 * the maple state is @wr_mas->mas.
 *
 * Dispatches to a root store, a spanning store, a new root, or an in-leaf
 * modification depending on the state of the tree and the range written.
 *
 * Return: @wr_mas->content as it stands once the store has been attempted.
 */
static inline void *mas_wr_store_entry(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;

        wr_mas->content = mas_start(mas);
        if (mas_is_none(mas) || mas_is_ptr(mas)) {
                mas_store_root(mas, wr_mas->entry);
        } else if (unlikely(!mas_wr_walk(wr_mas))) {
                mas_wr_spanning_store(wr_mas);
        } else {
                /* At this point, we are at the leaf node that needs to be altered. */
                mas_wr_end_piv(wr_mas);
                /* New root for a single pointer */
                if (unlikely(!mas->index && mas->last == ULONG_MAX))
                        mas_new_root(mas, wr_mas->entry);
                else
                        mas_wr_modify(wr_mas);
        }

        return wr_mas->content;
}

/**
 * mas_insert() - Internal call to insert a value
 * @mas: The maple state
 * @entry: The entry to store
 *
 * Unlike a store, an insert refuses to overwrite: if anything already
 * occupies part of the requested range, @mas is set to -EEXIST.
 *
 * Return: %NULL or the contents that already exists at the requested index
 * otherwise.  The maple state needs to be checked for error conditions.
 */
static inline void *mas_insert(struct ma_state *mas, void *entry)
{
        MA_WR_STATE(wr_mas, mas, entry);
        bool occupied;

        /*
         * Inserting a new range inserts either 0, 1, or 2 pivots within the
         * tree.  If the insert fits exactly into an existing gap with a value
         * of NULL, then the slot only needs to be written with the new value.
         * If the range being inserted is adjacent to another range, then only a
         * single pivot needs to be inserted (as well as writing the entry).  If
         * the new range is within a gap but does not touch any other ranges,
         * then two pivots need to be inserted: the start - 1, and the end.  As
         * usual, the entry must be written.  Most operations require a new node
         * to be allocated and replace an existing node to ensure RCU safety,
         * when in RCU mode.  The exception to requiring a newly allocated node
         * is when inserting at the end of a node (appending).  When done
         * carefully, appending can reuse the node in place.
         */
        wr_mas.content = mas_start(mas);
        occupied = wr_mas.content != NULL;

        if (!occupied) {
                if (mas_is_none(mas) || mas_is_ptr(mas)) {
                        mas_store_root(mas, entry);
                        return NULL;
                }

                /* spanning writes always overwrite something */
                occupied = !mas_wr_walk(&wr_mas);
        }

        if (!occupied) {
                /* At this point, we are at the leaf node that needs to be altered. */
                wr_mas.offset_end = mas->offset;
                wr_mas.end_piv = wr_mas.r_max;

                /* The range must sit entirely inside one empty slot. */
                occupied = wr_mas.content || (mas->last > wr_mas.r_max);
        }

        if (occupied) {
                mas_set_err(mas, -EEXIST);
                return wr_mas.content;
        }

        /* Inserting NULL into an empty range changes nothing. */
        if (!entry)
                return NULL;

        mas_wr_modify(&wr_mas);
        return wr_mas.content;
}

/**
 * mas_alloc_cyclic() - Internal call to find somewhere to store an entry
 * @mas: The maple state.
 * @startp: Pointer to ID.  Written with the allocated index on success.
 * @entry: The entry to store.
 * @range_lo: Lower bound of range to search.
 * @range_hi: Upper bound of range to search.
 * @next: Pointer to next ID to allocate.  Read as the preferred start of the
 * search and updated to one past the allocated index on success.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * The search first runs from max(@range_lo, *@next) up to @range_hi.  If
 * that fails and the search did not already start at @range_lo, it is retried
 * from @range_lo, which counts as wrapping.
 *
 * Return: 0 if the allocation succeeded without wrapping, 1 if the
 * allocation succeeded after wrapping, or -EBUSY if there are no
 * free entries.  Other negative values from the search or from the insert
 * are passed through.
 */
int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp,
                void *entry, unsigned long range_lo, unsigned long range_hi,
                unsigned long *next, gfp_t gfp)
{
        unsigned long min = range_lo;
        int ret = 0;

        range_lo = max(min, *next);
        ret = mas_empty_area(mas, range_lo, range_hi, 1);
        if ((mas->tree->ma_flags & MT_FLAGS_ALLOC_WRAPPED) && ret == 0) {
                /* *next overflowed to 0 on an earlier call: report the wrap. */
                mas->tree->ma_flags &= ~MT_FLAGS_ALLOC_WRAPPED;
                ret = 1;
        }
        if (ret < 0 && range_lo > min) {
                /*
                 * The failed search leaves the maple state unusable for a
                 * new search; reset it so the second search starts from a
                 * clean state.
                 */
                mas_reset(mas);
                ret = mas_empty_area(mas, min, range_hi, 1);
                if (ret == 0)
                        ret = 1;
        }
        if (ret < 0)
                return ret;

        /* Retry the insert for as long as mas_nomem() could allocate. */
        do {
                mas_insert(mas, entry);
        } while (mas_nomem(mas, gfp));
        if (mas_is_err(mas))
                return xa_err(mas->node);

        *startp = mas->index;
        *next = *startp + 1;
        /* Record the overflow so that the next allocation reports a wrap. */
        if (*next == 0)
                mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED;

        return ret;
}
EXPORT_SYMBOL(mas_alloc_cyclic);

/*
 * mas_rewalk() - Walk the tree to @index again from the top.
 * @mas: The maple state
 * @index: The index to walk to
 *
 * Repeats the walk for as long as it leaves @mas in the start state.
 */
static __always_inline void mas_rewalk(struct ma_state *mas, unsigned long index)
{
        do {
                mas_set(mas, index);
                mas_state_walk(mas);
        } while (mas_is_start(mas));
}

/*
 * mas_rewalk_if_dead() - Re-walk to @index when @node has been marked dead.
 * @mas: The maple state
 * @node: The node to check
 * @index: The index to walk back to
 *
 * Return: True if @node was dead and @mas was re-walked, false otherwise.
 */
static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas,
                struct maple_node *node, const unsigned long index)
{
        bool dead = ma_dead_node(node);

        if (unlikely(dead))
                mas_rewalk(mas, index);

        return dead;
}

/*
 * mas_prev_node() - Move the maple state to the previous node at the same
 * level in the tree.
 * @mas: The maple state
 * @min: The lower limit to search
 *
 * On success @mas->node is the previous node, @mas->min and @mas->max are its
 * limits, and @mas->offset and @mas->end are both set to its last used slot.
 * If there is no previous node at or above @min, the status is set to
 * ma_underflow instead.
 *
 * Return: 1 if the node is dead, 0 otherwise.
 */
static int mas_prev_node(struct ma_state *mas, unsigned long min)
{
        enum maple_type mt;
        int offset, level;
        void __rcu **slots;
        struct maple_node *node;
        unsigned long *pivots;
        unsigned long max;

        node = mas_mn(mas);
        /* Nothing can precede a node that starts at 0. */
        if (!mas->min)
                goto no_entry;

        /* The previous node ends just below this one. */
        max = mas->min - 1;
        if (max < min)
                goto no_entry;

        /* Ascend until a parent is found in which we are not in slot 0. */
        level = 0;
        do {
                if (ma_is_root(node))
                        goto no_entry;

                /* Walk up. */
                if (unlikely(mas_ascend(mas)))
                        return 1;
                offset = mas->offset;
                level++;
                node = mas_mn(mas);
        } while (!offset);

        /* Step to the previous slot, then descend along the last slots. */
        offset--;
        mt = mte_node_type(mas->node);
        while (level > 1) {
                level--;
                slots = ma_slots(node, mt);
                mas->node = mas_slot(mas, slots, offset);
                /* The slot just read is only valid if its node is alive. */
                if (unlikely(ma_dead_node(node)))
                        return 1;

                mt = mte_node_type(mas->node);
                node = mas_mn(mas);
                pivots = ma_pivots(node, mt);
                offset = ma_data_end(node, mt, pivots, max);
                if (unlikely(ma_dead_node(node)))
                        return 1;
        }

        slots = ma_slots(node, mt);
        mas->node = mas_slot(mas, slots, offset);
        pivots = ma_pivots(node, mt);
        if (unlikely(ma_dead_node(node)))
                return 1;

        /* In slot 0 the minimum is left as it was. */
        if (likely(offset))
                mas->min = pivots[offset - 1] + 1;
        mas->max = max;
        mas->offset = mas_data_end(mas);
        if (unlikely(mte_dead_node(mas->node)))
                return 1;

        mas->end = mas->offset;
        return 0;

no_entry:
        if (unlikely(ma_dead_node(node)))
                return 1;

        mas->status = ma_underflow;
        return 0;
}

/*
 * mas_prev_slot() - Get the entry in the previous slot
 *
 * @mas: The maple state
 * @min: The lower limit; the search does not go below this index.
 * @empty: If true, a %NULL slot is returned as-is; if false, %NULL slots are
 * skipped and the search continues with the slot before.
 *
 * Updates @mas->index and @mas->last to the range of the slot moved to.  The
 * status is set to ma_underflow when @min is reached without a result.  If a
 * dead node is encountered, the state is re-walked to the index @mas held on
 * entry and the operation starts over.
 *
 * Return: The entry in the previous slot which is possibly NULL
 */
static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty)
{
        void *entry;
        void __rcu **slots;
        unsigned long pivot;
        enum maple_type type;
        unsigned long *pivots;
        struct maple_node *node;
        /* Where to restart from if a dead node is encountered. */
        unsigned long save_point = mas->index;

retry:
        node = mas_mn(mas);
        type = mte_node_type(mas->node);
        pivots = ma_pivots(node, type);
        if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                goto retry;

        /* The limit is inside this node; check the current slot against it. */
        if (mas->min <= min) {
                pivot = mas_safe_min(mas, pivots, mas->offset);

                if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                        goto retry;

                if (pivot <= min)
                        goto underflow;
        }

again:
        if (likely(mas->offset)) {
                /* The previous slot is in the same node. */
                mas->offset--;
                mas->last = mas->index - 1;
                mas->index = mas_safe_min(mas, pivots, mas->offset);
        } else  {
                if (mas->index <= min)
                        goto underflow;

                /* Non-zero means a dead node was seen: start over. */
                if (mas_prev_node(mas, min)) {
                        mas_rewalk(mas, save_point);
                        goto retry;
                }

                if (WARN_ON_ONCE(mas_is_underflow(mas)))
                        return NULL;

                /* mas_prev_node() left the offset at the node's last slot. */
                mas->last = mas->max;
                node = mas_mn(mas);
                type = mte_node_type(mas->node);
                pivots = ma_pivots(node, type);
                mas->index = pivots[mas->offset - 1] + 1;
        }

        slots = ma_slots(node, type);
        entry = mas_slot(mas, slots, mas->offset);
        if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                goto retry;


        if (likely(entry))
                return entry;

        if (!empty) {
                if (mas->index <= min) {
                        mas->status = ma_underflow;
                        return NULL;
                }

                goto again;
        }

        return entry;

underflow:
        mas->status = ma_underflow;
        return NULL;
}

/*
 * mas_next_node() - Get the next node at the same level in the tree.
 * @mas: The maple state
 * @node: The node to start from; the caller in mas_next_slot() passes the
 * node of @mas (mas_mn(mas)).
 * @max: The maximum pivot value to check.
 *
 * On success @mas->node is the next node, @mas->min and @mas->max are its
 * limits and @mas->end is its last used slot.  @mas->offset is not reset
 * for every path here; the caller sets it.  If the current node already
 * reaches @max, or the root is reached while ascending, the status is set to
 * ma_overflow instead.
 *
 * Return: 1 on dead node, 0 otherwise.
 */
static int mas_next_node(struct ma_state *mas, struct maple_node *node,
                unsigned long max)
{
        unsigned long min;
        unsigned long *pivots;
        struct maple_enode *enode;
        struct maple_node *tmp;
        int level = 0;
        unsigned char node_end;
        enum maple_type mt;
        void __rcu **slots;

        if (mas->max >= max)
                goto overflow;

        /* The next node starts right after this one ends. */
        min = mas->max + 1;
        level = 0;
        /* Ascend until a parent is found in which we are not in the last slot. */
        do {
                if (ma_is_root(node))
                        goto overflow;

                /* Walk up. */
                if (unlikely(mas_ascend(mas)))
                        return 1;

                level++;
                node = mas_mn(mas);
                mt = mte_node_type(mas->node);
                pivots = ma_pivots(node, mt);
                node_end = ma_data_end(node, mt, pivots, mas->max);
                if (unlikely(ma_dead_node(node)))
                        return 1;

        } while (unlikely(mas->offset == node_end));

        /* Step to the next slot, then descend along the first slots. */
        slots = ma_slots(node, mt);
        mas->offset++;
        enode = mas_slot(mas, slots, mas->offset);
        if (unlikely(ma_dead_node(node)))
                return 1;

        if (level > 1)
                mas->offset = 0;

        while (unlikely(level > 1)) {
                level--;
                mas->node = enode;
                node = mas_mn(mas);
                mt = mte_node_type(mas->node);
                slots = ma_slots(node, mt);
                enode = mas_slot(mas, slots, 0);
                if (unlikely(ma_dead_node(node)))
                        return 1;
        }

        /* After a descent, the pivots must come from the node descended to. */
        if (!mas->offset)
                pivots = ma_pivots(node, mt);

        mas->max = mas_safe_pivot(mas, pivots, mas->offset, mt);
        tmp = mte_to_node(enode);
        mt = mte_node_type(enode);
        pivots = ma_pivots(tmp, mt);
        mas->end = ma_data_end(tmp, mt, pivots, mas->max);
        /* @node is the parent that enode was read from. */
        if (unlikely(ma_dead_node(node)))
                return 1;

        mas->node = enode;
        mas->min = min;
        return 0;

overflow:
        if (unlikely(ma_dead_node(node)))
                return 1;

        mas->status = ma_overflow;
        return 0;
}

/*
 * mas_next_slot() - Get the entry in the next slot
 *
 * @mas: The maple state
 * @max: The upper limit; the search does not go above this index.
 * @empty: If true, a %NULL slot is returned as-is; if false, %NULL slots are
 * skipped and the search continues with the slot after.
 *
 * Updates @mas->index and @mas->last to the range of the slot moved to.  The
 * status is set to ma_overflow when @max is reached without a result.  If a
 * dead node is encountered, the state is re-walked to the last index @mas
 * held on entry and the operation starts over.
 *
 * Return: The entry in the next slot which is possibly NULL
 */
static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty)
{
        void __rcu **slots;
        unsigned long *pivots;
        unsigned long pivot;
        enum maple_type type;
        struct maple_node *node;
        /* Where to restart from if a dead node is encountered. */
        unsigned long save_point = mas->last;
        void *entry;

retry:
        node = mas_mn(mas);
        type = mte_node_type(mas->node);
        pivots = ma_pivots(node, type);
        if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                goto retry;

        /* The limit is inside this node; check the current slot against it. */
        if (mas->max >= max) {
                if (likely(mas->offset < mas->end))
                        pivot = pivots[mas->offset];
                else
                        pivot = mas->max;

                if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                        goto retry;

                if (pivot >= max) { /* Was at the limit, next will extend beyond */
                        mas->status = ma_overflow;
                        return NULL;
                }
        }

        if (likely(mas->offset < mas->end)) {
                /* The next slot is in the same node. */
                mas->index = pivots[mas->offset] + 1;
again:
                mas->offset++;
                /* The last slot takes the node maximum as its upper limit. */
                if (likely(mas->offset < mas->end))
                        mas->last = pivots[mas->offset];
                else
                        mas->last = mas->max;
        } else  {
                if (mas->last >= max) {
                        mas->status = ma_overflow;
                        return NULL;
                }

                /* Non-zero means a dead node was seen: start over. */
                if (mas_next_node(mas, node, max)) {
                        mas_rewalk(mas, save_point);
                        goto retry;
                }

                if (WARN_ON_ONCE(mas_is_overflow(mas)))
                        return NULL;

                /* Start at the first slot of the new node. */
                mas->offset = 0;
                mas->index = mas->min;
                node = mas_mn(mas);
                type = mte_node_type(mas->node);
                pivots = ma_pivots(node, type);
                mas->last = pivots[0];
        }

        slots = ma_slots(node, type);
        entry = mt_slot(mas->tree, slots, mas->offset);
        if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                goto retry;

        if (entry)
                return entry;


        if (!empty) {
                if (mas->last >= max) {
                        mas->status = ma_overflow;
                        return NULL;
                }

                mas->index = mas->last + 1;
                goto again;
        }

        return entry;
}

/*
 * mas_next_entry() - Internal function to get the next entry.
 * @mas: The maple state
 * @limit: The maximum range start.
 *
 * Set the @mas->node to the next entry and the range_start to
 * the beginning value for the entry.  Does not check beyond @limit.
 * Sets @mas->index and @mas->last to the range, Does not update @mas->index and
 * @mas->last on overflow.
 * Restarts on dead nodes.
 *
 * Return: the next entry or %NULL.
 */
static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit)
{
        /* Advance only while the current range ends below @limit. */
        if (mas->last < limit)
                return mas_next_slot(mas, limit, false);

        /* The current range already reaches @limit: report overflow. */
        mas->status = ma_overflow;
        return NULL;
}

/*
 * mas_rev_awalk() - Internal function.  Reverse allocation walk.  Find the
 * highest gap address of a given size in a given node and descend.
 * @mas: The maple state
 * @size: The needed size.
 * @gap_min: Written with the start of the selected slot's range when a gap is
 * found in a leaf.
 * @gap_max: Written with @gap_min + gap - 1 when a gap is found in a leaf.
 *
 * Scans the slots of the current node downwards from @mas->offset, looking for
 * a gap of at least @size that fits in the window [@mas->index, @mas->last].
 *
 * Return: True if found in a leaf, if @mas is already in an error state, or if
 * the node is dense; false otherwise.  When false is returned the walk has
 * either descended into a child, left @mas untouched so the caller can move
 * up from a non-root node, or set the error to -EBUSY.
 */
static bool mas_rev_awalk(struct ma_state *mas, unsigned long size,
                unsigned long *gap_min, unsigned long *gap_max)
{
        enum maple_type type = mte_node_type(mas->node);
        struct maple_node *node = mas_mn(mas);
        unsigned long *pivots, *gaps;
        void __rcu **slots;
        unsigned long gap = 0;
        unsigned long max, min;
        unsigned char offset;

        if (unlikely(mas_is_err(mas)))
                return true;

        if (ma_is_dense(type)) {
                /* dense nodes. */
                mas->offset = (unsigned char)(mas->index - mas->min);
                return true;
        }

        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);
        gaps = ma_gaps(node, type);
        offset = mas->offset;
        min = mas_safe_min(mas, pivots, offset);
        /* Skip out of bounds. */
        while (mas->last < min)
                min = mas_safe_min(mas, pivots, --offset);

        max = mas_safe_pivot(mas, pivots, offset, type);
        while (mas->index <= max) {
                /*
                 * With a gap array the gap is read per slot; without one, an
                 * empty slot is a gap covering its whole range.
                 */
                gap = 0;
                if (gaps)
                        gap = gaps[offset];
                else if (!mas_slot(mas, slots, offset))
                        gap = max - min + 1;

                if (gap) {
                        /*
                         * Big enough, and enough room between the slot start
                         * and the upper limit of the window.
                         */
                        if ((size <= gap) && (size <= mas->last - min + 1))
                                break;

                        if (!gaps) {
                                /* Skip the next slot, it cannot be a gap. */
                                if (offset < 2)
                                        goto ascend;

                                offset -= 2;
                                max = pivots[offset];
                                min = mas_safe_min(mas, pivots, offset);
                                continue;
                        }
                }

                if (!offset)
                        goto ascend;

                /* Step to the previous slot; it ends just below this one. */
                offset--;
                max = min - 1;
                min = mas_safe_min(mas, pivots, offset);
        }

        /* The candidate must leave @size entries at or above mas->index. */
        if (unlikely((mas->index > max) || (size - 1 > max - mas->index)))
                goto no_space;

        if (unlikely(ma_is_leaf(type))) {
                mas->offset = offset;
                *gap_min = min;
                *gap_max = min + gap - 1;
                return true;
        }

        /* descend, only happens under lock. */
        mas->node = mas_slot(mas, slots, offset);
        mas->min = min;
        mas->max = max;
        mas->offset = mas_data_end(mas);
        return false;

ascend:
        /* Nothing usable here; only the root turns this into an error. */
        if (!mte_is_root(mas->node))
                return false;

no_space:
        mas_set_err(mas, -EBUSY);
        return false;
}

/*
 * mas_anode_descend() - Internal function.  One step of the forward
 * allocation walk.
 * @mas: The maple state
 * @size: The needed size.
 *
 * Scans the current node upwards from @mas->offset for a gap of at least @size
 * within [@mas->index, @mas->last].  In a non-leaf node a matching slot is
 * descended into and @mas->offset is reset to 0.
 *
 * Return: True when the walk should stop: the node is dense, a gap was found
 * in a leaf, the scan passed @mas->last (error set to -EBUSY), or the scan
 * finished while @mas->node is the root.  False otherwise.
 */
static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size)
{
        enum maple_type type = mte_node_type(mas->node);
        unsigned long pivot, min, gap = 0;
        unsigned char offset, data_end;
        unsigned long *gaps, *pivots;
        void __rcu **slots;
        struct maple_node *node;
        bool found = false;

        if (ma_is_dense(type)) {
                mas->offset = (unsigned char)(mas->index - mas->min);
                return true;
        }

        node = mas_mn(mas);
        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);
        gaps = ma_gaps(node, type);
        offset = mas->offset;
        min = mas_safe_min(mas, pivots, offset);
        data_end = ma_data_end(node, type, pivots, mas->max);
        for (; offset <= data_end; offset++) {
                pivot = mas_safe_pivot(mas, pivots, offset, type);

                /* Not within lower bounds */
                if (mas->index > pivot)
                        goto next_slot;

                /*
                 * With a gap array the gap is read per slot; otherwise only an
                 * empty slot counts, clamped to the search window.
                 */
                if (gaps)
                        gap = gaps[offset];
                else if (!mas_slot(mas, slots, offset))
                        gap = min(pivot, mas->last) - max(mas->index, min) + 1;
                else
                        goto next_slot;

                if (gap >= size) {
                        if (ma_is_leaf(type)) {
                                found = true;
                                goto done;
                        }
                        if (mas->index <= pivot) {
                                /* Descend into the child covering this gap. */
                                mas->node = mas_slot(mas, slots, offset);
                                mas->min = min;
                                mas->max = pivot;
                                offset = 0;
                                break;
                        }
                }
next_slot:
                min = pivot + 1;
                /* The rest of the node lies beyond the search window. */
                if (mas->last <= pivot) {
                        mas_set_err(mas, -EBUSY);
                        return true;
                }
        }

        if (mte_is_root(mas->node))
                found = true;
done:
        mas->offset = offset;
        return found;
}

/**
 * mas_walk() - Search for @mas->index in the tree.
 * @mas: The maple state.
 *
 * mas->index and mas->last will be set to the range if there is a value.  If
 * mas->status is ma_none, reset to ma_start
 *
 * Return: the entry at the location or %NULL.
 */
void *mas_walk(struct ma_state *mas)
{
        void *entry;

        /*
         * NOTE(review): active and start look mutually exclusive, so this
         * appears to restart the walk unconditionally - confirm.
         */
        if (!(mas_is_active(mas) && mas_is_start(mas)))
                mas->status = ma_start;

        /* Walk again for as long as the state is left at ma_start. */
        do {
                entry = mas_state_walk(mas);
        } while (mas_is_start(mas));

        if (mas_is_none(mas)) {
                /* Report the full range alongside the walk result. */
                mas->index = 0;
                mas->last = ULONG_MAX;
                return entry;
        }

        if (!mas_is_ptr(mas))
                return entry;

        /* Pointer state: only index 0 yields the entry. */
        if (!mas->index) {
                mas->last = 0;
                return entry;
        }

        /* Any other index falls in the empty range [1, ULONG_MAX]. */
        mas->index = 1;
        mas->last = ULONG_MAX;
        mas->status = ma_none;
        return NULL;
}
EXPORT_SYMBOL_GPL(mas_walk);

/*
 * mas_rewind_node() - Internal function.  Move the maple state back by one
 * slot at the nearest level where that is possible.
 * @mas: The maple state
 *
 * A non-root node is always ascended from first; ascending repeats while the
 * resulting offset is zero.  At the root the current offset is used as is.
 *
 * Return: false if the root is reached with an offset of zero, otherwise true
 * with @mas->offset decremented by one.
 */
static inline bool mas_rewind_node(struct ma_state *mas)
{
        unsigned char offset;

        for (;;) {
                bool at_root = mte_is_root(mas->node);

                if (!at_root)
                        mas_ascend(mas);

                offset = mas->offset;
                if (offset)
                        break;

                /* Offset zero at the root: there is nothing before it. */
                if (at_root)
                        return false;
        }

        mas->offset = offset - 1;
        return true;
}

/*
 * mas_skip_node() - Internal function.  Skip over a node.
 * @mas: The maple state.
 *
 * Return: true if there is another node, false otherwise.
 */
static inline bool mas_skip_node(struct ma_state *mas)
{
        if (mas_is_err(mas))
                return false;

        for (;;) {
                if (!mte_is_root(mas->node)) {
                        mas_ascend(mas);
                } else if (mas->offset >= mas_data_end(mas)) {
                        /* Last slot of the root: nothing left to skip to. */
                        mas_set_err(mas, -EBUSY);
                        return false;
                }

                /* Stop at the first level that still has a following slot. */
                if (mas->offset < mas_data_end(mas))
                        break;
        }

        mas->offset += 1;
        return true;
}

/*
 * mas_awalk() - Allocation walk.  Search from low address to high, for a gap of
 * @size
 * @mas: The maple state
 * @size: The size of the gap required
 *
 * Search between @mas->index and @mas->last for a gap of @size.
 */
static inline void mas_awalk(struct ma_state *mas, unsigned long size)
{
        struct maple_enode *prev = NULL;

        /*
         * Each step ends in one of four ways:
         * - descended into a child,
         * - needs to go back towards the parent,
         * - no gap found (return, slot == MAPLE_NODE_SLOTS),
         * - gap found (return, slot != MAPLE_NODE_SLOTS).
         */
        for (;;) {
                if (mas_is_err(mas))
                        return;

                if (mas_anode_descend(mas, size))
                        return;

                /* The node changed since the last step: try it next. */
                if (prev != mas->node) {
                        prev = mas->node;
                        continue;
                }

                /* Same node twice in a row: move past it. */
                mas_skip_node(mas);
        }
}

/*
 * mas_sparse_area() - Internal function.  Return upper or lower limit when
 * searching for a gap in an empty tree.
 * @mas: The maple state
 * @min: the minimum range
 * @max: The maximum range
 * @size: The size of the gap
 * @fwd: Searching forward or back
 */
static inline int mas_sparse_area(struct ma_state *mas, unsigned long min,
                                unsigned long max, unsigned long size, bool fwd)
{
        /* mas_is_ptr */
        if (!unlikely(mas_is_none(mas)) && !min) {
                /* The lower bound moves from 0 to 1 in this case. */
                min = 1;
                /*
                 * The window shrank by one, so check again that @size still
                 * fits.
                 */
                if (min > max || max - min + 1 < size)
                        return -EBUSY;
        }

        if (!fwd) {
                /* Searching backwards: place the area at the top. */
                mas->last = max;
                mas->index = max - size + 1;
                return 0;
        }

        /* Searching forwards: place the area at the bottom. */
        mas->index = min;
        mas->last = min + size - 1;
        return 0;
}

/*
 * mas_empty_area() - Get the lowest address within the range that is
 * sufficient for the size requested.
 * @mas: The maple state
 * @min: The lowest value of the range
 * @max: The highest value of the range
 * @size: The size needed
 */
int mas_empty_area(struct ma_state *mas, unsigned long min,
                unsigned long max, unsigned long size)
{
        unsigned char offset;
        unsigned long *pivots;
        enum maple_type mt;
        struct maple_node *node;

        if (min > max)
                return -EINVAL;

        if (size == 0 || max - min < size - 1)
                return -EINVAL;

        if (mas_is_start(mas))
                mas_start(mas);
        else if (mas->offset >= 2)
                mas->offset -= 2;
        else if (!mas_skip_node(mas))
                return -EBUSY;

        /* Empty set */
        if (mas_is_none(mas) || mas_is_ptr(mas))
                return mas_sparse_area(mas, min, max, size, true);

        /* The start of the window can only be within these values */
        mas->index = min;
        mas->last = max;
        mas_awalk(mas, size);

        if (unlikely(mas_is_err(mas)))
                return xa_err(mas->node);

        offset = mas->offset;
        if (unlikely(offset == MAPLE_NODE_SLOTS))
                return -EBUSY;

        node = mas_mn(mas);
        mt = mte_node_type(mas->node);
        pivots = ma_pivots(node, mt);
        min = mas_safe_min(mas, pivots, offset);
        if (mas->index < min)
                mas->index = min;
        mas->last = mas->index + size - 1;
        mas->end = ma_data_end(node, mt, pivots, mas->max);
        return 0;
}
EXPORT_SYMBOL_GPL(mas_empty_area);

/*
 * mas_empty_area_rev() - Get the highest address within the range that is
 * sufficient for the size requested.
 * @mas: The maple state
 * @min: The lowest value of the range
 * @max: The highest value of the range
 * @size: The size needed
 */
int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
                unsigned long max, unsigned long size)
{
        struct maple_enode *prev = mas->node;

        /* Reject an inverted window, a zero size, or a window below @size. */
        if (min > max || !size || max - min < size - 1)
                return -EINVAL;

        if (mas_is_start(mas)) {
                mas_start(mas);
        } else if (mas->offset < 2) {
                if (!mas_rewind_node(mas))
                        return -EBUSY;
        }

        if (unlikely(mas_is_none(mas) || mas_is_ptr(mas)))
                return mas_sparse_area(mas, min, max, size, false);

        if (mas->offset < 2)
                mas->offset = mas_data_end(mas);
        else
                mas->offset -= 2;

        /* The start of the window can only be within these values. */
        mas->index = min;
        mas->last = max;

        while (!mas_rev_awalk(mas, size, &min, &max)) {
                /* The node changed since the last step: try it next. */
                if (prev != mas->node) {
                        prev = mas->node;
                        continue;
                }

                /* Same node twice in a row: step back past it. */
                if (!mas_rewind_node(mas))
                        return -EBUSY;
        }

        if (mas_is_err(mas))
                return xa_err(mas->node);

        if (unlikely(mas->offset == MAPLE_NODE_SLOTS))
                return -EBUSY;

        /* Trim the upper limit to the max. */
        if (mas->last > max)
                mas->last = max;

        mas->index = mas->last - size + 1;
        mas->end = mas_data_end(mas);
        return 0;
}
EXPORT_SYMBOL_GPL(mas_empty_area_rev);

/*
 * mte_dead_leaves() - Mark all leaves of a node as dead.
 * @enode: The encoded maple node whose children are marked
 * @mt: The maple tree
 * @slots: Pointer to the slot array of @enode
 *
 * Each marked child has its type stored in the node itself, and its slot is
 * rewritten to hold the plain node pointer.
 *
 * Must hold the write lock.
 *
 * Return: The number of leaves marked as dead.
 */
static inline
unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt,
                              void __rcu **slots)
{
        int i = 0;

        while (i < mt_slot_count(enode)) {
                void *child = mt_slot(mt, slots, i);
                enum maple_type child_type = mte_node_type(child);
                struct maple_node *child_node = mte_to_node(child);

                /* Use both node and type to catch LE & BE metadata */
                if (!child_node || !child_type)
                        break;

                mte_set_node_dead(child);
                child_node->type = child_type;
                rcu_assign_pointer(slots[i], child_node);
                i++;
        }

        return i;
}

/**
 * mte_dead_walk() - Walk down a dead tree to just before the leaves
 * @enode: The maple encoded node
 * @offset: The starting offset
 *
 * Note: This can only be used from the RCU callback context.
 */
static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset)
{
        struct maple_node *cur, *child;
        void __rcu **slots = NULL;

        child = mte_to_node(*enode);
        for (;;) {
                *enode = ma_enode_ptr(child);
                cur = mte_to_node(*enode);
                slots = ma_slots(cur, cur->type);
                child = rcu_dereference_protected(slots[offset],
                                        lock_is_held(&rcu_callback_map));
                /* Only the first step uses the caller's offset. */
                offset = 0;

                /* Stop at the node whose child is a leaf. */
                if (ma_is_leaf(child->type))
                        break;
        }

        return slots;
}

/**
 * mt_free_walk() - Walk & free a tree in the RCU callback context
 * @head: The RCU head that's within the node.
 *
 * Note: This can only be used from the RCU callback context.
 */
static void mt_free_walk(struct rcu_head *head)
{
        void __rcu **slots;
        struct maple_node *node, *start;
        struct maple_enode *enode;
        unsigned char offset;
        enum maple_type type;

        /* Recover the node from the rcu_head embedded in it. */
        node = container_of(head, struct maple_node, rcu);

        /* A leaf has no children to walk. */
        if (ma_is_leaf(node->type))
                goto free_leaf;

        start = node;
        enode = mt_mk_node(node, node->type);
        /* Follow slot 0 down to the node whose children are leaves. */
        slots = mte_dead_walk(&enode, 0);
        node = mte_to_node(enode);
        do {
                /* Bulk-free node->slot_len entries of this node's slots. */
                mt_free_bulk(node->slot_len, slots);
                /*
                 * Continue from the slot after this node in its recorded
                 * parent (parent_slot and piv_parent are stored in the node).
                 */
                offset = node->parent_slot + 1;
                enode = node->piv_parent;
                /* A node recorded as its own parent is the top of the walk. */
                if (mte_to_node(enode) == node)
                        goto free_leaf;

                type = mte_node_type(enode);
                slots = ma_slots(mte_to_node(enode), type);
                /* Descend again if the parent has another populated slot. */
                if ((offset < mt_slots[type]) &&
                    rcu_dereference_protected(slots[offset],
                                              lock_is_held(&rcu_callback_map)))
                        slots = mte_dead_walk(&enode, offset);
                node = mte_to_node(enode);
                /*
                 * Loop until the walk is back at the starting node and
                 * node->slot_len >= offset.
                 */
        } while ((node != start) || (node->slot_len < offset));

        /* Free the entries recorded for the starting node itself. */
        slots = ma_slots(node, node->type);
        mt_free_bulk(node->slot_len, slots);

free_leaf:
        mt_free_rcu(&node->rcu);
}

/*
 * mte_destroy_descend() - Walk down from @enode to the node whose child is a
 * leaf, marking each visited node dead on the way.
 * @enode: In: the encoded node to start from.  Out: the last node visited.
 * @mt: The maple tree
 * @prev: The encoded parent recorded in the first visited node
 * @offset: The parent slot recorded in the first visited node
 *
 * Every visited node has its type, its parent (piv_parent) and its slot in
 * that parent (parent_slot) stored in the node itself.
 *
 * Return: The slot array of the last node visited.
 */
static inline void __rcu **mte_destroy_descend(struct maple_enode **enode,
        struct maple_tree *mt, struct maple_enode *prev, unsigned char offset)
{
        struct maple_node *node;
        struct maple_enode *next = *enode;
        void __rcu **slots = NULL;
        enum maple_type type;
        unsigned char next_offset = 0;

        do {
                *enode = next;
                node = mte_to_node(*enode);
                type = mte_node_type(*enode);
                slots = ma_slots(node, type);
                next = mt_slot_locked(mt, slots, next_offset);
                /* If the first child is already dead, take the next slot. */
                if ((mte_dead_node(next)))
                        next = mt_slot_locked(mt, slots, ++next_offset);

                /* Mark dead, then store what is needed to walk back up. */
                mte_set_node_dead(*enode);
                node->type = type;
                node->piv_parent = prev;
                node->parent_slot = offset;
                /* The child taken next sits at next_offset in this node. */
                offset = next_offset;
                next_offset = 0;
                prev = *enode;
        } while (!mte_is_leaf(next));

        return slots;
}

/*
 * mt_destroy_walk() - Mark the nodes of a tree or sub-tree dead and optionally
 * free them.
 * @enode: The encoded node at the top of the tree or sub-tree
 * @mt: The maple tree
 * @free: True to free nodes during the walk.  False to only mark them; the
 *        final node then has its metadata cleared instead of being freed.
 */
static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt,
                            bool free)
{
        void __rcu **slots;
        struct maple_node *node = mte_to_node(enode);
        struct maple_enode *start;

        if (mte_is_leaf(enode)) {
                /* Store the type in the node; it is read at free_leaf. */
                node->type = mte_node_type(enode);
                goto free_leaf;
        }

        start = enode;
        /* The top node is recorded as its own parent. */
        slots = mte_destroy_descend(&enode, mt, start, 0);
        node = mte_to_node(enode); // Updated in the above call.
        do {
                enum maple_type type;
                unsigned char offset;
                struct maple_enode *parent, *tmp;

                /* Mark this node's leaves dead and record how many. */
                node->slot_len = mte_dead_leaves(enode, mt, slots);
                if (free)
                        mt_free_bulk(node->slot_len, slots);
                /* Move to the slot after this node in its recorded parent. */
                offset = node->parent_slot + 1;
                enode = node->piv_parent;
                /* A node recorded as its own parent is the top of the walk. */
                if (mte_to_node(enode) == node)
                        goto free_leaf;

                type = mte_node_type(enode);
                slots = ma_slots(mte_to_node(enode), type);
                if (offset >= mt_slots[type])
                        goto next;

                /* Descend into the next sibling if the slot holds a node. */
                tmp = mt_slot_locked(mt, slots, offset);
                if (mte_node_type(tmp) && mte_to_node(tmp)) {
                        parent = enode;
                        enode = tmp;
                        slots = mte_destroy_descend(&enode, mt, parent, offset);
                }
next:
                node = mte_to_node(enode);
        } while (start != enode);

        /* Back at the top node: handle its own children. */
        node = mte_to_node(enode);
        node->slot_len = mte_dead_leaves(enode, mt, slots);
        if (free)
                mt_free_bulk(node->slot_len, slots);

free_leaf:
        if (free)
                mt_free_rcu(&node->rcu);
        else
                mt_clear_meta(mt, node, node->type);
}

/*
 * mte_destroy_walk() - Free a tree or sub-tree.
 * @enode: the encoded maple node (maple_enode) to start
 * @mt: the tree to free - needed for node types.
 *
 * Must hold the write lock.
 */
static inline void mte_destroy_walk(struct maple_enode *enode,
                                    struct maple_tree *mt)
{
        struct maple_node *node = mte_to_node(enode);

        /* Not in RCU mode: free the nodes during the walk itself. */
        if (!mt_in_rcu(mt)) {
                mt_destroy_walk(enode, mt, true);
                return;
        }

        /* RCU mode: only mark now, free later from the RCU callback. */
        mt_destroy_walk(enode, mt, false);
        call_rcu(&node->rcu, mt_free_walk);
}

static void mas_wr_store_setup(struct ma_wr_state *wr_mas)
{
        if (!mas_is_active(wr_mas->mas)) {
                if (mas_is_start(wr_mas->mas))
                        return;

                if (unlikely(mas_is_paused(wr_mas->mas)))
                        goto reset;

                if (unlikely(mas_is_none(wr_mas->mas)))
                        goto reset;

                if (unlikely(mas_is_overflow(wr_mas->mas)))
                        goto reset;

                if (unlikely(mas_is_underflow(wr_mas->mas)))
                        goto reset;
        }

        /*
         * A less strict version of mas_is_span_wr() where we allow spanning
         * writes within this node.  This is to stop partial walks in
         * mas_prealloc() from being reset.
         */
        if (wr_mas->mas->last > wr_mas->mas->max)
                goto reset;

        if (wr_mas->entry)
                return;

        if (mte_is_leaf(wr_mas->mas->node) &&
            wr_mas->mas->last == wr_mas->mas->max)
                goto reset;

        return;

reset:
        mas_reset(wr_mas->mas);
}

/* Interface */

/**
 * mas_store() - Store an @entry.
 * @mas: The maple state.
 * @entry: The entry to store.
 *
 * The @mas->index and @mas->last are used to set the range for the @entry.
 * Note: The @mas should have pre-allocated entries to ensure there is memory to
 * store the entry.  Please see mas_expected_entries()/mas_destroy() for more details.
 *
 * Return: the first entry between mas->index and mas->last or %NULL.
 */
void *mas_store(struct ma_state *mas, void *entry)
{
        MA_WR_STATE(wr_mas, mas, entry);

        trace_ma_write(__func__, mas, 0, entry);
#ifdef CONFIG_DEBUG_MAPLE_TREE
        /* Debug builds only: warn about and reject an inverted range. */
        if (MAS_WARN_ON(mas, mas->index > mas->last))
                pr_err("Error %lX > %lX %p\n", mas->index, mas->last, entry);

        if (mas->index > mas->last) {
                mas_set_err(mas, -EINVAL);
                return NULL;
        }

#endif

        /*
         * Storing is the same operation as insert with the added caveat that it
         * can overwrite entries.  Although this seems simple enough, one may
         * want to examine what happens if a single store operation was to
         * overwrite multiple entries within a self-balancing B-Tree.
         */
        mas_wr_store_setup(&wr_mas);
        mas_wr_store_entry(&wr_mas);
        /* The return value is whatever the write left in wr_mas.content. */
        return wr_mas.content;
}
EXPORT_SYMBOL_GPL(mas_store);

/**
 * mas_store_gfp() - Store a value into the tree.
 * @mas: The maple state
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations if necessary.
 *
 * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not
 * be allocated.
 */
int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp)
{
        MA_WR_STATE(wr_mas, mas, entry);

        mas_wr_store_setup(&wr_mas);
        trace_ma_write(__func__, mas, 0, entry);

        /* Repeat the store for as long as mas_nomem() asks for a retry. */
        do {
                mas_wr_store_entry(&wr_mas);
        } while (unlikely(mas_nomem(mas, gfp)));

        if (likely(!mas_is_err(mas)))
                return 0;

        return xa_err(mas->node);
}
EXPORT_SYMBOL_GPL(mas_store_gfp);

/**
 * mas_store_prealloc() - Store a value into the tree using memory
 * preallocated in the maple state.
 * @mas: The maple state
 * @entry: The entry to store.
 */
void mas_store_prealloc(struct ma_state *mas, void *entry)
{
        MA_WR_STATE(wr_mas, mas, entry);

        mas_wr_store_setup(&wr_mas);
        trace_ma_write(__func__, mas, 0, entry);
        mas_wr_store_entry(&wr_mas);
        /* No error is returned here; an error state trips the debug check. */
        MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas));
        /* Clears the prealloc flag and frees nodes still held by @mas. */
        mas_destroy(mas);
}

/**
 * mas_preallocate() - Preallocate enough nodes for a store operation
 * @mas: The maple state
 * @entry: The entry that will be stored
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Return: 0 on success, -ENOMEM if memory could not be allocated.
 */
int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp)
{
        MA_WR_STATE(wr_mas, mas, entry);
        unsigned char node_size;
        /* Default request; the cases below raise it or return early. */
        int request = 1;
        int ret;


        /* A store over the whole range [0, ULONG_MAX] skips the walk. */
        if (unlikely(!mas->index && mas->last == ULONG_MAX))
                goto ask_now;

        mas_wr_store_setup(&wr_mas);
        wr_mas.content = mas_start(mas);
        /* Root expand */
        if (unlikely(mas_is_none(mas) || mas_is_ptr(mas)))
                goto ask_now;

        if (unlikely(!mas_wr_walk(&wr_mas))) {
                /* Spanning store, use worst case for now */
                request = 1 + mas_mt_height(mas) * 3;
                goto ask_now;
        }

        /* At this point, we are at the leaf node that needs to be altered. */
        /* Exact fit, no nodes needed. */
        if (wr_mas.r_min == mas->index && wr_mas.r_max == mas->last)
                return 0;

        mas_wr_end_piv(&wr_mas);
        node_size = mas_wr_new_end(&wr_mas);

        /* Slot store, does not require additional nodes */
        if (node_size == mas->end) {
                /* reuse node */
                if (!mt_in_rcu(mas->tree))
                        return 0;
                /* shifting boundary */
                if (wr_mas.offset_end - mas->offset == 1)
                        return 0;
        }

        if (node_size >= mt_slots[wr_mas.type]) {
                /* Split, worst case for now. */
                request = 1 + mas_mt_height(mas) * 2;
                goto ask_now;
        }

        /* New root needs a single node */
        if (unlikely(mte_is_root(mas->node)))
                goto ask_now;

        /* Potential spanning rebalance collapsing a node, use worst-case */
        if (node_size  - 1 <= mt_min_slots[wr_mas.type])
                request = mas_mt_height(mas) * 2 - 1;

        /* node store, slot store needs one node */
ask_now:
        mas_node_count_gfp(mas, request, gfp);
        mas->mas_flags |= MA_STATE_PREALLOC;
        if (likely(!mas_is_err(mas)))
                return 0;

        /*
         * Allocation failed: clear the outstanding request and read the error
         * code before the state is reset, then release what was allocated.
         */
        mas_set_alloc_req(mas, 0);
        ret = xa_err(mas->node);
        mas_reset(mas);
        mas_destroy(mas);
        mas_reset(mas);
        return ret;
}
EXPORT_SYMBOL_GPL(mas_preallocate);

/*
 * mas_destroy() - destroy a maple state.
 * @mas: The maple state
 *
 * Upon completion, check the left-most node and rebalance against the node to
 * the right if necessary.  Frees any allocated nodes associated with this maple
 * state.
 */
void mas_destroy(struct ma_state *mas)
{
        struct maple_alloc *cur;
        unsigned long remaining;

        /*
         * A mas_for_each() bulk insert may add fewer elements than were
         * expected, which can leave the final node invalid.  When the
         * rebalance flag is set, walk to that node and rebalance it against
         * the previous node if it holds too few entries.
         */
        if (mas->mas_flags & MA_STATE_REBALANCE) {
                unsigned char end;

                mas_start(mas);
                mtree_range_walk(mas);
                end = mas->end + 1;
                if (end < mt_min_slot_count(mas->node) - 1)
                        mas_destroy_rebalance(mas, end);
        }

        /* Clearing the rebalance bit is a no-op when it was not set. */
        mas->mas_flags &= ~(MA_STATE_REBALANCE|MA_STATE_BULK|MA_STATE_PREALLOC);

        /* Free every node still held in the allocation list. */
        for (remaining = mas_allocated(mas); remaining; remaining--) {
                cur = mas->alloc;
                mas->alloc = cur->slot[0];

                /* slot[1] onwards holds the other nodes of this entry. */
                if (cur->node_count > 1) {
                        size_t extra = cur->node_count - 1;

                        mt_free_bulk(extra, (void __rcu **)&cur->slot[1]);
                        remaining -= extra;
                }

                /* The loop step accounts for the list entry itself. */
                mt_free_one(ma_mnode_ptr(cur));
        }

        mas->alloc = NULL;
}
EXPORT_SYMBOL_GPL(mas_destroy);

/*
 * mas_expected_entries() - Set the expected number of entries that will be inserted.
 * @mas: The maple state
 * @nr_entries: The number of expected entries.
 *
 * This will attempt to pre-allocate enough nodes to store the expected number
 * of entries.  The allocations will occur using the bulk allocator interface
 * for speed.  Please call mas_destroy() on the @mas after inserting the entries
 * to ensure any unused nodes are freed.
 *
 * Return: 0 on success, -ENOMEM if memory could not be allocated.
 */
int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries)
{
        struct maple_enode *saved_node = mas->node;
        int nonleaf_cap;
        int nr_nodes;
        int leaves;
        int parents;
        int ret;

        /*
         * Duplicating a whole tree into a new one (for example copying the
         * VMAs when a process forks) is a case where the new tree is not used
         * until it is fully populated.  A bulk load with RCU disabled is the
         * fastest way to do that: it allows optimistic splitting that favours
         * the left and reuse of nodes during the operation.
         */

        /* Optimize splitting for bulk insert in-order */
        mas->mas_flags |= MA_STATE_BULK;

        /*
         * Avoid overflow, assume a gap between each entry and a trailing null.
         * If this is wrong, it just means allocation can happen during
         * insertion of entries.
         */
        nr_nodes = max(nr_entries, nr_entries * 2 + 1);

        if (mt_is_alloc(mas->tree))
                nonleaf_cap = MAPLE_ARANGE64_SLOTS - 2;
        else
                nonleaf_cap = MAPLE_RANGE64_SLOTS - 2;

        /* Leaves; reduce slots to keep space for expansion */
        leaves = DIV_ROUND_UP(nr_nodes, MAPLE_RANGE64_SLOTS - 2);
        /* Internal nodes */
        parents = DIV_ROUND_UP(leaves, nonleaf_cap);
        /* Add working room for split (2 nodes) + new parents */
        mas_node_count_gfp(mas, leaves + parents + 3, GFP_KERNEL);

        /* Detect if allocations run out */
        mas->mas_flags |= MA_STATE_PREALLOC;

        if (mas_is_err(mas)) {
                /* Read the error, put the saved node back, then clean up. */
                ret = xa_err(mas->node);
                mas->node = saved_node;
                mas_destroy(mas);
                return ret;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(mas_expected_entries);

/*
 * mas_next_setup() - Internal function.  Prepare the maple state for a next
 * operation.
 * @mas: The maple state
 * @max: The maximum index to check
 * @entry: Written on some paths only; callers initialise it to NULL.
 *
 * Return: True if the caller should return *@entry without advancing, false
 * if the caller should go on to mas_next_slot().
 */
static bool mas_next_setup(struct ma_state *mas, unsigned long max,
                void **entry)
{
        /* Sampled before the switch below can change the status. */
        bool was_none = mas_is_none(mas);

        if (unlikely(mas->last >= max)) {
                mas->status = ma_overflow;
                return true;
        }

        switch (mas->status) {
        case ma_active:
                return false;
        case ma_none:
                fallthrough;
        case ma_pause:
                mas->status = ma_start;
                fallthrough;
        case ma_start:
                mas_walk(mas); /* Retries on dead nodes handled by mas_walk */
                break;
        case ma_overflow:
                /* Overflowed before, but the max changed */
                mas->status = ma_active;
                break;
        case ma_underflow:
                /* The user expects the mas to be one before where it is */
                mas->status = ma_active;
                *entry = mas_walk(mas);
                if (*entry)
                        return true;
                break;
        case ma_root:
                break;
        case ma_error:
                return true;
        }

        if (likely(mas_is_active(mas))) /* Fast path */
                return false;

        if (mas_is_ptr(mas)) {
                *entry = NULL;
                /* Started from ma_none at index 0: report the range [0, 0]. */
                if (was_none && mas->index == 0) {
                        mas->index = mas->last = 0;
                        return true;
                }
                /* Otherwise report the empty range [1, ULONG_MAX]. */
                mas->index = 1;
                mas->last = ULONG_MAX;
                mas->status = ma_none;
                return true;
        }

        if (mas_is_none(mas))
                return true;

        return false;
}

/**
 * mas_next() - Get the next entry.
 * @mas: The maple state
 * @max: The maximum index to check.
 *
 * Returns the next entry after @mas->index.
 * Must hold rcu_read_lock or the write lock.
 * Can return the zero entry.
 *
 * Return: The next entry or %NULL
 */
void *mas_next(struct ma_state *mas, unsigned long max)
{
        void *entry = NULL;

        if (mas_next_setup(mas, max, &entry))
                return entry;

        /* Retries on dead nodes handled by mas_next_slot */
        return mas_next_slot(mas, max, false);
}
EXPORT_SYMBOL_GPL(mas_next);

/**
 * mas_next_range() - Advance the maple state to the next range
 * @mas: The maple state
 * @max: The maximum index to check.
 *
 * Sets @mas->index and @mas->last to the range.
 * Must hold rcu_read_lock or the write lock.
 * Can return the zero entry.
 *
 * Return: The next entry or %NULL
 */
void *mas_next_range(struct ma_state *mas, unsigned long max)
{
        void *entry = NULL;

        if (mas_next_setup(mas, max, &entry))
                return entry;

        /* Retries on dead nodes handled by mas_next_slot */
        return mas_next_slot(mas, max, true);
}
EXPORT_SYMBOL_GPL(mas_next_range);

/**
 * mt_next() - get the next value in the maple tree
 * @mt: The maple tree
 * @index: The start index
 * @max: The maximum index to check
 *
 * Takes RCU read lock internally to protect the search, which does not
 * protect the returned pointer after dropping RCU read lock.
 * See also: Documentation/core-api/maple_tree.rst
 *
 * Return: The entry higher than @index or %NULL if nothing is found.
 */
void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max)
{
        MA_STATE(mas, mt, index, index);
        void *next;

        /* The RCU read section only covers the lookup itself. */
        rcu_read_lock();
        next = mas_next(&mas, max);
        rcu_read_unlock();

        return next;
}
EXPORT_SYMBOL_GPL(mt_next);

/*
 * mas_prev_setup() - Internal function to set up mas_prev() and
 * mas_prev_range().
 * @mas: The maple state
 * @min: The minimum index to check
 * @entry: Pointer to the entry; only written on some of the early-return paths
 *
 * Checks @mas->index against @min and then normalizes @mas->status so that the
 * caller can step back to the previous slot.
 *
 * Return: True if the caller should return *@entry without searching further,
 * false if the caller must continue on to the previous slot.
 */
static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry)
{
        /* Already at or below the limit: record the underflow and stop. */
        if (unlikely(mas->index <= min)) {
                mas->status = ma_underflow;
                return true;
        }

        switch (mas->status) {
        case ma_active:
                /* Already positioned; the caller steps back from here. */
                return false;
        case ma_start:
                break;
        case ma_none:
                fallthrough;
        case ma_pause:
                /* Both are restarted; the walk happens after the switch. */
                mas->status = ma_start;
                break;
        case ma_underflow:
                /* underflowed before but the min changed */
                mas->status = ma_active;
                break;
        case ma_overflow:
                /* User expects mas to be one after where it is */
                mas->status = ma_active;
                *entry = mas_walk(mas);
                /* A non-NULL entry at the current location is the answer. */
                if (*entry)
                        return true;
                break;
        case ma_root:
                break;
        case ma_error:
                return true;
        }

        /* Position the state; the walk result itself is not the answer. */
        if (mas_is_start(mas))
                mas_walk(mas);

        if (unlikely(mas_is_ptr(mas))) {
                /* At index 0 there is nothing before it. */
                if (!mas->index) {
                        mas->status = ma_none;
                        return true;
                }
                /* Otherwise the previous entry is the root at [0, 0]. */
                mas->index = mas->last = 0;
                *entry = mas_root(mas);
                return true;
        }

        if (mas_is_none(mas)) {
                if (mas->index) {
                        /* Walked to out-of-range pointer? */
                        mas->index = mas->last = 0;
                        mas->status = ma_root;
                        *entry = mas_root(mas);
                        return true;
                }
                return true;
        }

        return false;
}

/**
 * mas_prev() - Get the previous entry
 * @mas: The maple state
 * @min: The minimum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * Will reset mas to ma_start if the status is ma_none.  Will stop on not
 * searchable nodes.
 *
 * Return: the previous value or %NULL.
 */
void *mas_prev(struct ma_state *mas, unsigned long min)
{
        void *entry = NULL;

        if (mas_prev_setup(mas, min, &entry))
                return entry;

        return mas_prev_slot(mas, min, false);
}
EXPORT_SYMBOL_GPL(mas_prev);

/**
 * mas_prev_range() - Advance to the previous range
 * @mas: The maple state
 * @min: The minimum value to check.
 *
 * Sets @mas->index and @mas->last to the range.
 * Must hold rcu_read_lock or the write lock.
 * Will reset mas to ma_start if the node is ma_none.  Will stop on not
 * searchable nodes.
 *
 * Return: the previous value or %NULL.
 */
void *mas_prev_range(struct ma_state *mas, unsigned long min)
{
        void *entry = NULL;

        if (mas_prev_setup(mas, min, &entry))
                return entry;

        return mas_prev_slot(mas, min, true);
}
EXPORT_SYMBOL_GPL(mas_prev_range);

/**
 * mt_prev() - get the previous value in the maple tree
 * @mt: The maple tree
 * @index: The start index
 * @min: The minimum index to check
 *
 * Takes RCU read lock internally to protect the search, which does not
 * protect the returned pointer after dropping RCU read lock.
 * See also: Documentation/core-api/maple_tree.rst
 *
 * Return: The entry before @index or %NULL if nothing is found.
 */
void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min)
{
        MA_STATE(mas, mt, index, index);
        void *prev;

        /* The RCU read section only covers the lookup itself. */
        rcu_read_lock();
        prev = mas_prev(&mas, min);
        rcu_read_unlock();

        return prev;
}
EXPORT_SYMBOL_GPL(mt_prev);

/**
 * mas_pause() - Pause a mas_find/mas_for_each to drop the lock.
 * @mas: The maple state to pause
 *
 * Some users need to pause a walk and drop the lock they are holding, either
 * to yield to a higher priority thread or to carry out an operation on an
 * entry.  Such users should call this function before dropping the lock.  It
 * resets @mas so that it is suitable for the next iteration of the loop once
 * the lock has been reacquired.  If most entries found during a walk require
 * a call to mas_pause(), the mt_for_each() iterator may be more appropriate.
 */
void mas_pause(struct ma_state *mas)
{
        /* Forget the cached node and flag the state as paused. */
        mas->node = NULL;
        mas->status = ma_pause;
}
EXPORT_SYMBOL_GPL(mas_pause);

/**
 * mas_find_setup() - Internal function to set up mas_find*().
 * @mas: The maple state
 * @max: The maximum index
 * @entry: Pointer to the entry
 *
 * Normalizes @mas->status before a forward search.  *@entry is only written
 * on the paths that call mas_walk(); callers are expected to initialize it.
 *
 * Returns: True if entry is the answer, false otherwise.
 */
static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry)
{
        switch (mas->status) {
        case ma_active:
                /* Keep searching only while the current range ends below max. */
                if (mas->last < max)
                        return false;
                return true;
        case ma_start:
                break;
        case ma_pause:
                if (unlikely(mas->last >= max))
                        return true;

                /* Resume one past the end of the range seen before the pause. */
                mas->index = ++mas->last;
                mas->status = ma_start;
                break;
        case ma_none:
                if (unlikely(mas->last >= max))
                        return true;

                /* Restart the walk from the end of the current range. */
                mas->index = mas->last;
                mas->status = ma_start;
                break;
        case ma_underflow:
                /* mas is pointing at entry before unable to go lower */
                if (unlikely(mas->index >= max)) {
                        mas->status = ma_overflow;
                        return true;
                }

                mas->status = ma_active;
                *entry = mas_walk(mas);
                if (*entry)
                        return true;
                break;
        case ma_overflow:
                /* Still at or beyond the limit: nothing more to find. */
                if (unlikely(mas->last >= max))
                        return true;

                mas->status = ma_active;
                *entry = mas_walk(mas);
                if (*entry)
                        return true;
                break;
        case ma_root:
                break;
        case ma_error:
                return true;
        }

        if (mas_is_start(mas)) {
                /* First run or continue */
                if (mas->index > max)
                        return true;

                *entry = mas_walk(mas);
                if (*entry)
                        return true;

        }

        if (unlikely(mas_is_ptr(mas)))
                goto ptr_out_of_range;

        if (unlikely(mas_is_none(mas)))
                return true;

        /* Already at the limit, so there is no further slot to examine. */
        if (mas->index == max)
                return true;

        return false;

ptr_out_of_range:
        /* Report the range [1, ULONG_MAX] with status ma_none. */
        mas->status = ma_none;
        mas->index = 1;
        mas->last = ULONG_MAX;
        return true;
}

/**
 * mas_find() - On the first call, find the entry at or after mas->index up to
 * %max.  Otherwise, find the entry after mas->index.
 * @mas: The maple state
 * @max: The maximum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * If an entry exists, last and index are updated accordingly.
 * The setup step may set @mas->status to ma_overflow; after a slot search the
 * status is always forced back to ma_active.
 *
 * Return: The entry or %NULL.
 */
void *mas_find(struct ma_state *mas, unsigned long max)
{
        void *found = NULL;

        if (!mas_find_setup(mas, max, &found)) {
                /* Retries on dead nodes handled by mas_next_slot */
                found = mas_next_slot(mas, max, false);
                /* Ignore overflow */
                mas->status = ma_active;
        }

        return found;
}
EXPORT_SYMBOL_GPL(mas_find);

/**
 * mas_find_range() - On the first call, find the entry at or after
 * mas->index up to %max.  Otherwise, advance to the next slot mas->index.
 * @mas: The maple state
 * @max: The maximum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * If an entry exists, last and index are updated accordingly.
 * May set @mas->status to ma_overflow.
 *
 * Return: The entry or %NULL.
 */
void *mas_find_range(struct ma_state *mas, unsigned long max)
{
        void *entry = NULL;

        if (mas_find_setup(mas, max, &entry))
                return entry;

        /* Retries on dead nodes handled by mas_next_slot */
        return mas_next_slot(mas, max, true);
}
EXPORT_SYMBOL_GPL(mas_find_range);

/**
 * mas_find_rev_setup() - Internal function to set up mas_find_*_rev()
 * @mas: The maple state
 * @min: The minimum index
 * @entry: Pointer to the entry
 *
 * Normalizes @mas->status before a reverse search.  *@entry is only written
 * on the paths that call mas_walk() or mas_root(); callers are expected to
 * initialize it.
 *
 * Returns: True if entry is the answer, false otherwise.
 */
static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min,
                void **entry)
{

        switch (mas->status) {
        case ma_active:
                /* Only the lower-bound check is needed. */
                goto active;
        case ma_start:
                break;
        case ma_pause:
                if (unlikely(mas->index <= min)) {
                        mas->status = ma_underflow;
                        return true;
                }
                /* Resume one before the start of the range seen before the pause. */
                mas->last = --mas->index;
                mas->status = ma_start;
                break;
        case ma_none:
                if (mas->index <= min)
                        goto none;

                /* Restart the walk from the start of the current range. */
                mas->last = mas->index;
                mas->status = ma_start;
                break;
        case ma_overflow: /* user expects the mas to be one after where it is */
                if (unlikely(mas->index <= min)) {
                        mas->status = ma_underflow;
                        return true;
                }

                mas->status = ma_active;
                break;
        case ma_underflow: /* user expects the mas to be one before where it is */
                /* Still at or below the limit: nothing more to find. */
                if (unlikely(mas->index <= min))
                        return true;

                mas->status = ma_active;
                break;
        case ma_root:
                break;
        case ma_error:
                return true;
        }

        if (mas_is_start(mas)) {
                /* First run or continue */
                if (mas->index < min)
                        return true;

                *entry = mas_walk(mas);
                if (*entry)
                        return true;
        }

        if (unlikely(mas_is_ptr(mas)))
                goto none;

        if (unlikely(mas_is_none(mas))) {
                /*
                 * Walked to the location, and there was nothing so the previous
                 * location is 0.
                 */
                mas->last = mas->index = 0;
                mas->status = ma_root;
                *entry = mas_root(mas);
                return true;
        }

active:
        /* Below the lower bound there is nothing left to search. */
        if (mas->index < min)
                return true;

        return false;

none:
        mas->status = ma_none;
        return true;
}

/**
 * mas_find_rev: On the first call, find the first non-null entry at or below
 * mas->index down to %min.  Otherwise find the first non-null entry below
 * mas->index down to %min.
 * @mas: The maple state
 * @min: The minimum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * If an entry exists, last and index are updated accordingly.
 * May set @mas->status to ma_underflow.
 *
 * Return: The entry or %NULL.
 */
void *mas_find_rev(struct ma_state *mas, unsigned long min)
{
        void *entry = NULL;

        if (mas_find_rev_setup(mas, min, &entry))
                return entry;

        /* Retries on dead nodes handled by mas_prev_slot */
        return mas_prev_slot(mas, min, false);

}
EXPORT_SYMBOL_GPL(mas_find_rev);

/**
 * mas_find_range_rev: On the first call, find the first non-null entry at or
 * below mas->index down to %min.  Otherwise advance to the previous slot after
 * mas->index down to %min.
 * @mas: The maple state
 * @min: The minimum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * If an entry exists, last and index are updated accordingly.
 * May set @mas->status to ma_underflow.
 *
 * Return: The entry or %NULL.
 */
void *mas_find_range_rev(struct ma_state *mas, unsigned long min)
{
        void *entry = NULL;

        if (mas_find_rev_setup(mas, min, &entry))
                return entry;

        /* Retries on dead nodes handled by mas_prev_slot */
        return mas_prev_slot(mas, min, true);
}
EXPORT_SYMBOL_GPL(mas_find_range_rev);

/**
 * mas_erase() - Find the range in which index resides and erase the entire
 * range.
 * @mas: The maple state
 *
 * Must hold the write lock.
 * Searches for @mas->index, sets @mas->index and @mas->last to the range and
 * erases that range by storing %NULL over it.  Node allocations for the store
 * use GFP_KERNEL.
 *
 * Return: the entry that was erased or %NULL, @mas->index and @mas->last are updated.
 */
void *mas_erase(struct ma_state *mas)
{
        void *old;
        MA_WR_STATE(wr_mas, mas, NULL);

        /*
         * NOTE(review): a state is presumably never both active and start at
         * once, which would make this reset unconditional - confirm.
         */
        if (!(mas_is_active(mas) && mas_is_start(mas)))
                mas->status = ma_start;

        /* Retry unnecessary when holding the write lock. */
        old = mas_state_walk(mas);
        if (!old)
                return NULL;

        do {
                /* Must reset to ensure spanning writes of last slot are detected */
                mas_reset(mas);
                mas_wr_store_setup(&wr_mas);
                mas_wr_store_entry(&wr_mas);
        } while (mas_nomem(mas, GFP_KERNEL));

        return old;
}
EXPORT_SYMBOL_GPL(mas_erase);

/**
 * mas_nomem() - Check if there was an error allocating and do the allocation
 * if necessary.  If there was no allocation error, the maple state is
 * destroyed instead.
 * @mas: The maple state
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * When @gfp allows blocking and the tree does not use an external lock, the
 * tree lock is dropped around the allocation and retaken afterwards.
 *
 * Return: true on allocation, false otherwise.
 */
bool mas_nomem(struct ma_state *mas, gfp_t gfp)
        __must_hold(mas->tree->ma_lock)
{
        /* Common case: the last operation did not fail with -ENOMEM. */
        if (likely(mas->node != MA_ERROR(-ENOMEM))) {
                mas_destroy(mas);
                return false;
        }

        if (!gfpflags_allow_blocking(gfp) || mt_external_lock(mas->tree)) {
                /* Allocate with the lock held. */
                mas_alloc_nodes(mas, gfp);
        } else {
                /* Allocate with the tree lock released. */
                mtree_unlock(mas->tree);
                mas_alloc_nodes(mas, gfp);
                mtree_lock(mas->tree);
        }

        if (mas_allocated(mas)) {
                /* Nodes are available: have the caller retry from the start. */
                mas->status = ma_start;
                return true;
        }

        return false;
}

/*
 * maple_tree_init() - Create the slab cache used for maple nodes.
 *
 * The cache objects are sized and aligned to sizeof(struct maple_node).
 */
void __init maple_tree_init(void)
{
        const unsigned int node_size = sizeof(struct maple_node);

        maple_node_cache = kmem_cache_create("maple_node", node_size,
                        node_size, SLAB_PANIC, NULL);
}

/**
 * mtree_load() - Load a value stored in a maple tree
 * @mt: The maple tree
 * @index: The index to load
 *
 * Takes the RCU read lock internally for the duration of the lookup.  A zero
 * entry is reported as %NULL.
 *
 * Return: the entry or %NULL
 */
void *mtree_load(struct maple_tree *mt, unsigned long index)
{
        MA_STATE(mas, mt, index, index);
        void *found;

        trace_ma_read(__func__, &mas);
        rcu_read_lock();
        for (;;) {
                found = mas_start(&mas);
                if (unlikely(mas_is_none(&mas)))
                        break;

                if (unlikely(mas_is_ptr(&mas))) {
                        /* Only index 0 yields the value mas_start() returned. */
                        if (index)
                                found = NULL;

                        break;
                }

                found = mtree_lookup_walk(&mas);
                /* Retry only when nothing was found and the state was reset. */
                if (found || likely(!mas_is_start(&mas)))
                        break;
        }
        rcu_read_unlock();

        return xa_is_zero(found) ? NULL : found;
}
EXPORT_SYMBOL(mtree_load);

/**
 * mtree_store_range() - Store an entry at a given range.
 * @mt: The maple tree
 * @index: The start of the range
 * @last: The end of the range
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * Takes and releases the tree lock around the store.
 *
 * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not
 * be allocated.
 */
int mtree_store_range(struct maple_tree *mt, unsigned long index,
                unsigned long last, void *entry, gfp_t gfp)
{
        MA_STATE(mas, mt, index, last);
        MA_WR_STATE(wr_mas, &mas, entry);

        trace_ma_write(__func__, &mas, 0, entry);
        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return -EINVAL;

        if (index > last)
                return -EINVAL;

        mtree_lock(mt);
        /* Repeat the store for as long as mas_nomem() supplied more nodes. */
        do {
                mas_wr_store_entry(&wr_mas);
        } while (mas_nomem(&mas, gfp));
        mtree_unlock(mt);

        return mas_is_err(&mas) ? xa_err(mas.node) : 0;
}
EXPORT_SYMBOL(mtree_store_range);

/**
 * mtree_store() - Store an entry at a given index.
 * @mt: The maple tree
 * @index: The index to store the value
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * Convenience wrapper around mtree_store_range() for a single index.
 *
 * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not
 * be allocated.
 */
int mtree_store(struct maple_tree *mt, unsigned long index, void *entry,
                 gfp_t gfp)
{
        /* A single index is stored as the one-element range [index, index]. */
        return mtree_store_range(mt, index, index, entry, gfp);
}
EXPORT_SYMBOL(mtree_store);

/**
 * mtree_insert_range() - Insert an entry at a given range if there is no value.
 * @mt: The maple tree
 * @first: The start of the range
 * @last: The end of the range
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Takes and releases the tree lock around the insert.
 *
 * Return: 0 on success, -EEXIST if the range is occupied, -EINVAL on invalid
 * request, -ENOMEM if memory could not be allocated.
 */
int mtree_insert_range(struct maple_tree *mt, unsigned long first,
                unsigned long last, void *entry, gfp_t gfp)
{
        MA_STATE(ms, mt, first, last);

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return -EINVAL;

        if (first > last)
                return -EINVAL;

        mtree_lock(mt);
        /* Repeat the insert for as long as mas_nomem() supplied more nodes. */
        do {
                mas_insert(&ms, entry);
        } while (mas_nomem(&ms, gfp));
        mtree_unlock(mt);

        return mas_is_err(&ms) ? xa_err(ms.node) : 0;
}
EXPORT_SYMBOL(mtree_insert_range);

/**
 * mtree_insert() - Insert an entry at a given index if there is no value.
 * @mt: The maple tree
 * @index: The index to store the value
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Convenience wrapper around mtree_insert_range() for a single index.
 *
 * Return: 0 on success, -EEXIST if the range is occupied, -EINVAL on invalid
 * request, -ENOMEM if memory could not be allocated.
 */
int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry,
                 gfp_t gfp)
{
        /* A single index is inserted as the one-element range [index, index]. */
        return mtree_insert_range(mt, index, index, entry, gfp);
}
EXPORT_SYMBOL(mtree_insert);

/**
 * mtree_alloc_range() - Find an empty area and insert an entry there.
 * @mt: The maple tree; must be an allocation tree (see mt_is_alloc()).
 * @startp: Where the start index of the inserted range is written on success.
 * @entry: The entry to store; must not be a reserved entry.
 * @size: The size of the area to search for.
 * @min: The lower bound of the search.
 * @max: The upper bound of the search.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Takes and releases the tree lock.  The empty-area search uses
 * mas_empty_area() and is repeated whenever mas_nomem() had to allocate.
 *
 * Return: 0 on success, -EINVAL if @mt is not an allocation tree or @entry is
 * reserved, otherwise the error reported by the search or the insert.
 */
int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long size, unsigned long min,
                unsigned long max, gfp_t gfp)
{
        int ret;

        MA_STATE(mas, mt, 0, 0);
        if (!mt_is_alloc(mt))
                return -EINVAL;

        if (WARN_ON_ONCE(mt_is_reserved(entry)))
                return -EINVAL;

        mtree_lock(mt);
        for (;;) {
                ret = mas_empty_area(&mas, min, max, size);
                if (ret)
                        break;

                mas_insert(&mas, entry);
                /*
                 * mas_nomem() may release the lock, causing the allocated area
                 * to be unavailable, so try to allocate a free area again.
                 */
                if (mas_nomem(&mas, gfp))
                        continue;

                if (mas_is_err(&mas))
                        ret = xa_err(mas.node);
                else
                        *startp = mas.index;

                break;
        }
        mtree_unlock(mt);

        return ret;
}
EXPORT_SYMBOL(mtree_alloc_range);

/**
 * mtree_alloc_cyclic() - Find somewhere to store this entry in the tree.
 * @mt: The maple tree.
 * @startp: Pointer to ID.
 * @range_lo: Lower bound of range to search.
 * @range_hi: Upper bound of range to search.
 * @entry: The entry to store.
 * @next: Pointer to next ID to allocate.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Finds an empty entry in @mt after @next, stores the new index into
 * the @startp pointer, stores the entry at that index, then updates @next.
 *
 * @mt must be initialized with the MT_FLAGS_ALLOC_RANGE flag.
 *
 * Context: Any context.  Takes and releases the mt.lock.  May sleep if
 * the @gfp flags permit.
 *
 * Return: 0 if the allocation succeeded without wrapping, 1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated, -EINVAL if @mt cannot be used, or -EBUSY if there are no
 * free entries.
 */
int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long range_lo, unsigned long range_hi,
                unsigned long *next, gfp_t gfp)
{
        int status;

        MA_STATE(mas, mt, 0, 0);

        /* Reject non-allocation trees first, then reserved entries. */
        if (!mt_is_alloc(mt) || WARN_ON_ONCE(mt_is_reserved(entry)))
                return -EINVAL;

        mtree_lock(mt);
        status = mas_alloc_cyclic(&mas, startp, entry, range_lo, range_hi,
                                  next, gfp);
        mtree_unlock(mt);

        return status;
}
EXPORT_SYMBOL(mtree_alloc_cyclic);

/**
 * mtree_alloc_rrange() - Find an empty area with a reverse search and insert
 * an entry there.
 * @mt: The maple tree; must be an allocation tree (see mt_is_alloc()).
 * @startp: Where the start index of the inserted range is written on success.
 * @entry: The entry to store; must not be a reserved entry.
 * @size: The size of the area to search for.
 * @min: The lower bound of the search.
 * @max: The upper bound of the search.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Takes and releases the tree lock.  The empty-area search uses
 * mas_empty_area_rev() and is repeated whenever mas_nomem() had to allocate.
 *
 * Return: 0 on success, -EINVAL if @mt is not an allocation tree or @entry is
 * reserved, otherwise the error reported by the search or the insert.
 */
int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long size, unsigned long min,
                unsigned long max, gfp_t gfp)
{
        int ret;

        MA_STATE(mas, mt, 0, 0);
        if (!mt_is_alloc(mt))
                return -EINVAL;

        if (WARN_ON_ONCE(mt_is_reserved(entry)))
                return -EINVAL;

        mtree_lock(mt);
        for (;;) {
                ret = mas_empty_area_rev(&mas, min, max, size);
                if (ret)
                        break;

                mas_insert(&mas, entry);
                /*
                 * mas_nomem() may release the lock, causing the allocated area
                 * to be unavailable, so try to allocate a free area again.
                 */
                if (mas_nomem(&mas, gfp))
                        continue;

                if (mas_is_err(&mas))
                        ret = xa_err(mas.node);
                else
                        *startp = mas.index;

                break;
        }
        mtree_unlock(mt);

        return ret;
}
EXPORT_SYMBOL(mtree_alloc_rrange);

/**
 * mtree_erase() - Find an index and erase the entire range.
 * @mt: The maple tree
 * @index: The index to erase
 *
 * Erasing is the same as a walk to an entry then a store of a NULL to that
 * ENTIRE range.  In fact, it is implemented as such using the advanced API.
 * Takes and releases the tree lock around the erase.
 *
 * Return: The entry stored at the @index or %NULL
 */
void *mtree_erase(struct maple_tree *mt, unsigned long index)
{
        void *erased;

        MA_STATE(mas, mt, index, index);
        trace_ma_op(__func__, &mas);

        mtree_lock(mt);
        erased = mas_erase(&mas);
        mtree_unlock(mt);

        return erased;
}
EXPORT_SYMBOL(mtree_erase);

/*
 * mas_dup_free() - Free an incomplete duplication of a tree.
 * @mas: The maple state of an incomplete tree.
 *
 * The parameter @mas->node passed in indicates that the allocation failed on
 * this node. This function frees all nodes starting from @mas->node in the
 * reverse order of mas_dup_build(). There is no need to hold the source tree
 * lock at this time.
 */
static void mas_dup_free(struct ma_state *mas)
{
        struct maple_node *node;
        enum maple_type type;
        void __rcu **slots;
        unsigned char count, i;

        /* Maybe the first node allocation failed. */
        if (mas_is_none(mas))
                return;

        while (!mte_is_root(mas->node)) {
                mas_ascend(mas);
                if (mas->offset) {
                        /*
                         * There is an earlier sibling: step to it and follow
                         * the last slot at every level down to a leaf, then go
                         * back up one level to that leaf's parent.
                         */
                        mas->offset--;
                        do {
                                mas_descend(mas);
                                mas->offset = mas_data_end(mas);
                        } while (!mte_is_leaf(mas->node));

                        mas_ascend(mas);
                }

                node = mte_to_node(mas->node);
                type = mte_node_type(mas->node);
                slots = ma_slots(node, type);
                count = mas_data_end(mas) + 1;
                /* Clear the MAPLE_NODE_MASK bits so the slots hold plain pointers. */
                for (i = 0; i < count; i++)
                        ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK;
                /* Free all children of this node in one call. */
                mt_free_bulk(count, slots);
        }

        /* Only the root node is left at this point. */
        node = mte_to_node(mas->node);
        mt_free_one(node);
}

/*
 * mas_copy_node() - Copy a maple node and replace the parent.
 * @mas: The maple state of source tree.
 * @new_mas: The maple state of new tree.
 * @parent: The parent of the new node.
 *
 * Copy @mas->node to @new_mas->node, then set @parent to be the parent of
 * @new_mas->node while keeping the MAPLE_NODE_MASK bits of the source node's
 * parent pointer. This function does not allocate.
 */
static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas,
                struct maple_pnode *parent)
{
        struct maple_node *src = mte_to_node(mas->node);
        struct maple_node *dst = mte_to_node(new_mas->node);
        unsigned long parent_bits;

        /* Start from a byte-for-byte copy of the source node. */
        memcpy(dst, src, sizeof(*dst));

        /* Carry over the low bits of the old parent pointer to the new one. */
        parent_bits = (unsigned long)src->parent & MAPLE_NODE_MASK;
        dst->parent = ma_parent_ptr(parent_bits | (unsigned long)parent);
}

/*
 * mas_dup_alloc() - Allocate child nodes for a maple node.
 * @mas: The maple state of source tree.
 * @new_mas: The maple state of new tree.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * This function allocates child nodes for @new_mas->node during the duplication
 * process, one per occupied slot of @mas->node. If memory allocation fails,
 * the requested slots of the new node are zeroed and @mas is set to -ENOMEM.
 */
static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas,
                gfp_t gfp)
{
        struct maple_node *src = mte_to_node(mas->node);
        struct maple_node *dst = mte_to_node(new_mas->node);
        enum maple_type type = mte_node_type(mas->node);
        void __rcu **dst_slots = ma_slots(dst, type);
        void __rcu **src_slots;
        unsigned char wanted, got, slot;
        unsigned long type_bits;

        /* Allocate memory for child nodes, straight into the new slots. */
        wanted = mas_data_end(mas) + 1;
        got = mt_alloc_bulk(gfp, wanted, (void **)dst_slots);
        if (unlikely(got < wanted)) {
                memset(dst_slots, 0, wanted * sizeof(void *));
                mas_set_err(mas, -ENOMEM);
                return;
        }

        /* Restore node type information in slots. */
        src_slots = ma_slots(src, type);
        for (slot = 0; slot < got; slot++) {
                type_bits = (unsigned long)mt_slot_locked(mas->tree, src_slots,
                                                          slot);
                type_bits &= MAPLE_NODE_MASK;
                ((unsigned long *)dst_slots)[slot] |= type_bits;
        }
}

/*
 * mas_dup_build() - Build a new maple tree from a source tree
 * @mas: The maple state of source tree, need to be in ma_start state.
 * @new_mas: The maple state of new tree, need to be in ma_start state.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * This function builds a new tree in DFS preorder. If the memory allocation
 * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the
 * last node. mas_dup_free() will free the incomplete duplication of a tree.
 *
 * Note that the attributes of the two trees need to be exactly the same, and the
 * new tree needs to be empty, otherwise -EINVAL will be set in @mas.
 */
static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas,
                gfp_t gfp)
{
        struct maple_node *node;
        struct maple_pnode *parent = NULL;
        struct maple_enode *root;
        enum maple_type type;

        if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) ||
            unlikely(!mtree_empty(new_mas->tree))) {
                mas_set_err(mas, -EINVAL);
                return;
        }

        root = mas_start(mas);
        /* No nodes to copy: publish what mas_start() returned as the new root. */
        if (mas_is_ptr(mas) || mas_is_none(mas))
                goto set_new_tree;

        node = mt_alloc_one(gfp);
        if (!node) {
                /* ma_none makes mas_dup_free() return without freeing anything. */
                new_mas->status = ma_none;
                mas_set_err(mas, -ENOMEM);
                return;
        }

        /* The new root has the same node type as the source root. */
        type = mte_node_type(mas->node);
        root = mt_mk_node(node, type);
        new_mas->node = root;
        new_mas->min = 0;
        new_mas->max = ULONG_MAX;
        root = mte_mk_root(root);
        while (1) {
                mas_copy_node(mas, new_mas, parent);
                if (!mte_is_leaf(mas->node)) {
                        /* Only allocate child nodes for non-leaf nodes. */
                        mas_dup_alloc(mas, new_mas, gfp);
                        if (unlikely(mas_is_err(mas)))
                                return;
                } else {
                        /*
                         * This is the last leaf node and duplication is
                         * completed.
                         */
                        if (mas->max == ULONG_MAX)
                                goto done;

                        /* This is not the last leaf node and needs to go up. */
                        do {
                                mas_ascend(mas);
                                mas_ascend(new_mas);
                        } while (mas->offset == mas_data_end(mas));

                        /* Move to the next subtree. */
                        mas->offset++;
                        new_mas->offset++;
                }

                /* Both states descend in lockstep; remember the new parent first. */
                mas_descend(mas);
                parent = ma_parent_ptr(mte_to_node(new_mas->node));
                mas_descend(new_mas);
                mas->offset = 0;
                new_mas->offset = 0;
        }
done:
        /* Specially handle the parent of the root node. */
        mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas));
set_new_tree:
        /* Make them the same height */
        new_mas->tree->ma_flags = mas->tree->ma_flags;
        rcu_assign_pointer(new_mas->tree->ma_root, root);
}

/**
 * __mt_dup(): Duplicate an entire maple tree
 * @mt: The source maple tree
 * @new: The new maple tree
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * This function duplicates a maple tree in Depth-First Search (DFS) pre-order
 * traversal. It uses memcpy() to copy nodes in the source tree and allocate
 * new child nodes in non-leaf nodes. The new node is exactly the same as the
 * source node except for all the addresses stored in it. It will be faster than
 * traversing all elements in the source tree and inserting them one by one into
 * the new tree.
 * The user needs to ensure that the attributes of the source tree and the new
 * tree are the same, and the new tree needs to be an empty tree, otherwise
 * -EINVAL will be returned.
 * Note that the user needs to manually lock the source tree and the new tree.
 *
 * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If
 * the attributes of the two trees are different or the new tree is not an empty
 * tree.
 */
int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp)
{
        MA_STATE(mas, mt, 0, 0);
        MA_STATE(new_mas, new, 0, 0);

        /* No locking here: this variant leaves both locks to the caller. */
        mas_dup_build(&mas, &new_mas, gfp);

        /* Failure is checked on the source state, not on new_mas. */
        if (unlikely(mas_is_err(&mas))) {
                int err = xa_err(mas.node);

                /* Only -ENOMEM triggers mas_dup_free() on the new tree. */
                if (err == -ENOMEM)
                        mas_dup_free(&new_mas);

                return err;
        }

        return 0;
}
EXPORT_SYMBOL(__mt_dup);

/**
 * mtree_dup(): Duplicate an entire maple tree
 * @mt: The source maple tree
 * @new: The new maple tree
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * This function duplicates a maple tree in Depth-First Search (DFS) pre-order
 * traversal. It uses memcpy() to copy nodes in the source tree and allocate
 * new child nodes in non-leaf nodes. The new node is exactly the same as the
 * source node except for all the addresses stored in it. It will be faster than
 * traversing all elements in the source tree and inserting them one by one into
 * the new tree.
 * The user needs to ensure that the attributes of the source tree and the new
 * tree are the same, and the new tree needs to be an empty tree, otherwise
 * -EINVAL will be returned.
 *
 * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL if
 * the attributes of the two trees are different or the new tree is not an empty
 * tree.
 */
int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp)
{
        MA_STATE(mas, mt, 0, 0);
        MA_STATE(new_mas, new, 0, 0);
        int err = 0;

        /* Take the new tree's lock first, then the source lock nested in it. */
        mas_lock(&new_mas);
        mas_lock_nested(&mas, SINGLE_DEPTH_NESTING);

        mas_dup_build(&mas, &new_mas, gfp);

        /* The source lock is dropped as soon as the build attempt returns. */
        mas_unlock(&mas);

        /* Failure is checked on the source state, not on new_mas. */
        if (unlikely(mas_is_err(&mas))) {
                err = xa_err(mas.node);

                /* Cleanup runs while the new tree's lock is still held. */
                if (err == -ENOMEM)
                        mas_dup_free(&new_mas);
        }

        mas_unlock(&new_mas);

        return err;
}
EXPORT_SYMBOL(mtree_dup);

/**
 * __mt_destroy() - Walk and free all nodes of a locked maple tree.
 * @mt: The maple tree
 *
 * Note: Does not handle locking.
 */
void __mt_destroy(struct maple_tree *mt)
{
        void *old_root;

        /* Read the current root, then publish an empty tree. */
        old_root = mt_root_locked(mt);
        rcu_assign_pointer(mt->ma_root, NULL);

        /* Only a node root has a subtree that must be walked and freed. */
        if (xa_is_node(old_root))
                mte_destroy_walk(old_root, mt);

        /* Store back whatever mt_attr() reports for this tree's flags. */
        mt->ma_flags = mt_attr(mt);
}
EXPORT_SYMBOL_GPL(__mt_destroy);

/**
 * mtree_destroy() - Destroy a maple tree
 * @mt: The maple tree
 *
 * Frees all resources used by the tree.  Handles locking.
 */
void mtree_destroy(struct maple_tree *mt)
{
        /* Hold the tree lock for the whole teardown done by __mt_destroy(). */
        mtree_lock(mt);
        __mt_destroy(mt);
        mtree_unlock(mt);
}
EXPORT_SYMBOL(mtree_destroy);

/**
 * mt_find() - Search from the start up until an entry is found.
 * @mt: The maple tree
 * @index: Pointer which contains the start location of the search
 * @max: The maximum value of the search range
 *
 * Takes RCU read lock internally to protect the search, which does not
 * protect the returned pointer after dropping RCU read lock.
 * See also: Documentation/core-api/maple_tree.rst
 *
 * If an entry is found, @index is updated to point to the next possible
 * entry, regardless of whether the found entry occupies a single index or
 * a range of indices.
 *
 * Return: The entry at or after the @index or %NULL
 */
void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max)
{
        /* The maple state is initialised with both index and last = *index. */
        MA_STATE(mas, mt, *index, *index);
        void *entry;
#ifdef CONFIG_DEBUG_MAPLE_TREE
        /* Keep the starting index so forward progress can be checked below. */
        unsigned long copy = *index;
#endif

        trace_ma_read(__func__, &mas);

        /* A start beyond @max cannot match; bail out before taking RCU. */
        if ((*index) > max)
                return NULL;

        rcu_read_lock();
retry:
        entry = mas_state_walk(&mas);
        /*
         * Walk again for as long as mas_is_start() holds afterwards.
         * NOTE(review): presumably the walk had to be restarted -- confirm
         * against mas_state_walk().
         */
        if (mas_is_start(&mas))
                goto retry;

        /* Zero entries are reported to the caller as "nothing here". */
        if (unlikely(xa_is_zero(entry)))
                entry = NULL;

        if (entry)
                goto unlock;

        /* Nothing at *index: step forward while active and below @max. */
        while (mas_is_active(&mas) && (mas.last < max)) {
                entry = mas_next_entry(&mas, max);
                if (likely(entry && !xa_is_zero(entry)))
                        break;
        }

        /* The loop can exit holding a zero entry; hide it as well. */
        if (unlikely(xa_is_zero(entry)))
                entry = NULL;
unlock:
        rcu_read_unlock();
        if (likely(entry)) {
                /* Point the caller just past the end of the found range. */
                *index = mas.last + 1;
#ifdef CONFIG_DEBUG_MAPLE_TREE
                /*
                 * A new index of 0 is exempt from the check (mas.last + 1
                 * wrapped); otherwise the index must have moved forward.
                 */
                if (MT_WARN_ON(mt, (*index) && ((*index) <= copy)))
                        pr_err("index not increased! %lx <= %lx\n",
                               *index, copy);
#endif
        }

        return entry;
}
EXPORT_SYMBOL(mt_find);

/**
 * mt_find_after() - Search from the start up until an entry is found.
 * @mt: The maple tree
 * @index: Pointer which contains the start location of the search
 * @max: The maximum value to check
 *
 * Same as mt_find() except that it checks @index for 0 before
 * searching. If @index == 0, the search is aborted. This covers a wrap
 * around of @index to 0 in an iterator loop.
 *
 * Return: The entry at or after the @index or %NULL
 */
void *mt_find_after(struct maple_tree *mt, unsigned long *index,
                    unsigned long max)
{
        /* An @index of 0 means the caller's iteration wrapped: stop. */
        return *index ? mt_find(mt, index, max) : NULL;
}
EXPORT_SYMBOL(mt_find_after);

#ifdef CONFIG_DEBUG_MAPLE_TREE
/*
 * Global counters, exported (GPL) for use outside this file.
 * NOTE(review): presumably updated by the MT_BUG_ON()/MAS_WARN_ON() style
 * check macros (checks evaluated vs. checks that held) -- confirm against
 * the macro definitions, which are not visible in this part of the file.
 */
atomic_t maple_tree_tests_run;
EXPORT_SYMBOL_GPL(maple_tree_tests_run);
atomic_t maple_tree_tests_passed;
EXPORT_SYMBOL_GPL(maple_tree_tests_passed);

#ifndef __KERNEL__
/*
 * Userspace (!__KERNEL__) build only: forward @val to
 * kmem_cache_set_non_kernel() for the maple node cache. The meaning of @val
 * is defined by that external helper, which is only declared here.
 */
extern void kmem_cache_set_non_kernel(struct kmem_cache *, unsigned int);
void mt_set_non_kernel(unsigned int val)
{
        kmem_cache_set_non_kernel(maple_node_cache, val);
}

/*
 * Userspace (!__KERNEL__) build only: return whatever kmem_cache_get_alloc()
 * reports for the maple node cache.
 */
extern unsigned long kmem_cache_get_alloc(struct kmem_cache *);
unsigned long mt_get_alloc_size(void)
{
        return kmem_cache_get_alloc(maple_node_cache);
}

/*
 * Userspace (!__KERNEL__) build only: call kmem_cache_zero_nr_tallocated()
 * on the maple node cache.
 * NOTE(review): by its name this resets the counter read by
 * mt_nr_tallocated() -- confirm in the external helper.
 */
extern void kmem_cache_zero_nr_tallocated(struct kmem_cache *);
void mt_zero_nr_tallocated(void)
{
        kmem_cache_zero_nr_tallocated(maple_node_cache);
}

/*
 * Userspace (!__KERNEL__) build only: return the value of
 * kmem_cache_nr_tallocated() for the maple node cache.
 */
extern unsigned int kmem_cache_nr_tallocated(struct kmem_cache *);
unsigned int mt_nr_tallocated(void)
{
        return kmem_cache_nr_tallocated(maple_node_cache);
}

/*
 * Userspace (!__KERNEL__) build only: return the value of
 * kmem_cache_nr_allocated() for the maple node cache.
 */
extern unsigned int kmem_cache_nr_allocated(struct kmem_cache *);
unsigned int mt_nr_allocated(void)
{
        return kmem_cache_nr_allocated(maple_node_cache);
}

/* Userspace (!__KERNEL__) build: shrinking the cache is a no-op. */
void mt_cache_shrink(void)
{
}
#else
/*
 * mt_cache_shrink() - For testing, don't use this.
 *
 * Certain testcases can trigger an OOM when combined with other memory
 * debugging configuration options.  This function is used to reduce the
 * possibility of an out-of-memory event due to kmem_cache objects remaining
 * around for longer than usual.
 */
void mt_cache_shrink(void)
{
        /* Kernel build: hand the maple node cache to kmem_cache_shrink(). */
        kmem_cache_shrink(maple_node_cache);

}
EXPORT_SYMBOL_GPL(mt_cache_shrink);

#endif /* not defined __KERNEL__ */
/*
 * mas_get_slot() - Get the entry in the maple state node stored at @offset.
 * @mas: The maple state
 * @offset: The offset into the slot array to fetch.
 *
 * The slot array is looked up from the node mas->node refers to, using that
 * node's type, and then read through mas_slot(). No bounds check on @offset
 * is performed here.
 *
 * Return: The entry stored at @offset.
 */
static inline struct maple_enode *mas_get_slot(struct ma_state *mas,
                unsigned char offset)
{
        return mas_slot(mas, ma_slots(mas_mn(mas), mte_node_type(mas->node)),
                        offset);
}

/*
 * Depth first search, post-order
 *
 * Advance @mas to the next node to visit. First try mas_next_node(); if that
 * does not overflow, the state is already on the next node. On overflow (and
 * unless the starting node was the root) go back to the starting node, ascend
 * once, and then keep calling mas_prev_node() until it underflows, leaving
 * @mas on the last node that was reached before the underflow.
 *
 * Only node, min and max are restored at the end; the status left behind by
 * the final mas_prev_node() call is not reset here.
 */
static void mas_dfs_postorder(struct ma_state *mas, unsigned long max)
{

        /* mn remembers where we started; p/p_min/p_max track the walk back. */
        struct maple_enode *p, *mn = mas->node;
        unsigned long p_min, p_max;

        mas_next_node(mas, mas_mn(mas), max);
        if (!mas_is_overflow(mas))
                return;

        /* Overflowed from the root: nothing is left above it to visit. */
        if (mte_is_root(mn))
                return;

        /* Restart from the original node and move up one level. */
        mas->node = mn;
        mas_ascend(mas);
        do {
                /* Snapshot the position before each step backwards. */
                p = mas->node;
                p_min = mas->min;
                p_max = mas->max;
                mas_prev_node(mas, 0);
        } while (!mas_is_underflow(mas));

        /* Land on the last valid position seen before the underflow. */
        mas->node = p;
        mas->max = p_max;
        mas->min = p_min;
}

/* Tree validations */
/*
 * Forward declaration: mt_dump_range64() and mt_dump_arange64() call
 * mt_dump_node() for child nodes, and mt_dump_node() calls them back.
 */
static void mt_dump_node(const struct maple_tree *mt, void *entry,
                unsigned long min, unsigned long max, unsigned int depth,
                enum mt_dump_format format);
/*
 * Print the indented "min-max: " (or "min: " when min == max) prefix of a
 * dump line, in hex or decimal as selected by @format. The indent is two
 * characters per @depth level, taken from a fixed run of spaces. Any other
 * @format value prints nothing.
 */
static void mt_dump_range(unsigned long min, unsigned long max,
                          unsigned int depth, enum mt_dump_format format)
{
        static const char spaces[] = "                                ";

        if (format == mt_dump_hex) {
                if (min != max)
                        pr_info("%.*s%lx-%lx: ", depth * 2, spaces, min, max);
                else
                        pr_info("%.*s%lx: ", depth * 2, spaces, min);
        } else if (format == mt_dump_dec) {
                if (min != max)
                        pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max);
                else
                        pr_info("%.*s%lu: ", depth * 2, spaces, min);
        }
}

/*
 * Print one entry: the range prefix first, then a description chosen by the
 * kind of entry (value, zero, reserved, or a plain pointer).
 */
static void mt_dump_entry(void *entry, unsigned long min, unsigned long max,
                          unsigned int depth, enum mt_dump_format format)
{
        mt_dump_range(min, max, depth, format);

        /* The tests are ordered; the first one that matches wins. */
        if (xa_is_value(entry)) {
                pr_cont("value %ld (0x%lx) [%p]\n", xa_to_value(entry),
                                xa_to_value(entry), entry);
                return;
        }

        if (xa_is_zero(entry)) {
                pr_cont("zero (%ld)\n", xa_to_internal(entry));
                return;
        }

        if (mt_is_reserved(entry)) {
                pr_cont("UNKNOWN ENTRY (%p)\n", entry);
                return;
        }

        pr_cont("%p\n", entry);
}

/*
 * Dump a node laid out as struct maple_range_64 (mt_dump_node() sends both
 * maple_leaf_64 and maple_range_64 here). The raw slot/pivot contents are
 * printed first, then each slot is printed with the range it covers: entries
 * for a leaf, recursively dumped children otherwise.
 */
static void mt_dump_range64(const struct maple_tree *mt, void *entry,
                unsigned long min, unsigned long max, unsigned int depth,
                enum mt_dump_format format)
{
        struct maple_range_64 *node = &mte_to_node(entry)->mr64;
        bool leaf = mte_is_leaf(entry);
        unsigned long first = min;
        int i;

        /* Raw view: every slot/pivot pair, then the final pivot-less slot. */
        pr_cont(" contents: ");
        for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) {
                switch(format) {
                case mt_dump_hex:
                        pr_cont("%p %lX ", node->slot[i], node->pivot[i]);
                        break;
                case mt_dump_dec:
                        pr_cont("%p %lu ", node->slot[i], node->pivot[i]);
                }
        }
        pr_cont("%p\n", node->slot[i]);
        /* Decoded view: slot i covers [first, last]. */
        for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) {
                /* The last slot has no pivot; its upper bound is @max. */
                unsigned long last = max;

                if (i < (MAPLE_RANGE64_SLOTS - 1))
                        last = node->pivot[i];
                else if (!node->slot[i] && max != mt_node_max(entry))
                        break;
                /* A zero pivot after slot 0 marks the end of the data. */
                if (last == 0 && i > 0)
                        break;
                if (leaf)
                        mt_dump_entry(mt_slot(mt, node->slot, i),
                                        first, last, depth + 1, format);
                else if (node->slot[i])
                        mt_dump_node(mt, mt_slot(mt, node->slot, i),
                                        first, last, depth + 1, format);

                if (last == max)
                        break;
                /* Report a pivot beyond the node's limit, but keep going. */
                if (last > max) {
                        switch(format) {
                        case mt_dump_hex:
                                pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n",
                                        node, last, max, i);
                                break;
                        case mt_dump_dec:
                                pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n",
                                        node, last, max, i);
                        }
                }
                first = last + 1;
        }
}

/*
 * Dump a node laid out as struct maple_arange_64. The raw gap array, the
 * metadata bytes (end and gap offset) and the slot/pivot contents are printed
 * first, then each slot is printed with the range it covers: entries for a
 * leaf, recursively dumped children otherwise.
 *
 * All numeric output, including the "last > max" diagnostic, follows @format
 * so that the error values match the rest of the dump (as mt_dump_range64()
 * already does).
 */
static void mt_dump_arange64(const struct maple_tree *mt, void *entry,
        unsigned long min, unsigned long max, unsigned int depth,
        enum mt_dump_format format)
{
        struct maple_arange_64 *node = &mte_to_node(entry)->ma64;
        bool leaf = mte_is_leaf(entry);
        unsigned long first = min;
        int i;

        /* Raw view, part 1: the per-slot gap values. */
        pr_cont(" contents: ");
        for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) {
                switch (format) {
                case mt_dump_hex:
                        pr_cont("%lx ", node->gap[i]);
                        break;
                case mt_dump_dec:
                        pr_cont("%lu ", node->gap[i]);
                }
        }
        /* Raw view, part 2: metadata, then slot/pivot pairs and last slot. */
        pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap);
        for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) {
                switch (format) {
                case mt_dump_hex:
                        pr_cont("%p %lX ", node->slot[i], node->pivot[i]);
                        break;
                case mt_dump_dec:
                        pr_cont("%p %lu ", node->slot[i], node->pivot[i]);
                }
        }
        pr_cont("%p\n", node->slot[i]);
        /* Decoded view: slot i covers [first, last]. */
        for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) {
                /* The last slot has no pivot; its upper bound is @max. */
                unsigned long last = max;

                if (i < (MAPLE_ARANGE64_SLOTS - 1))
                        last = node->pivot[i];
                else if (!node->slot[i])
                        break;
                /* A zero pivot after slot 0 marks the end of the data. */
                if (last == 0 && i > 0)
                        break;
                if (leaf)
                        mt_dump_entry(mt_slot(mt, node->slot, i),
                                        first, last, depth + 1, format);
                else if (node->slot[i])
                        mt_dump_node(mt, mt_slot(mt, node->slot, i),
                                        first, last, depth + 1, format);

                if (last == max)
                        break;
                /* A pivot beyond the node's limit ends the dump of this node. */
                if (last > max) {
                        switch (format) {
                        case mt_dump_hex:
                                pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n",
                                        node, last, max, i);
                                break;
                        case mt_dump_dec:
                                pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n",
                                        node, last, max, i);
                        }
                        break;
                }
                first = last + 1;
        }
}

/*
 * Print the header line for one node (address, depth, type, parent) and then
 * hand the body to the dumper that matches the node type. Dense nodes are
 * printed here, one entry per slot; unrecognised types are flagged.
 */
static void mt_dump_node(const struct maple_tree *mt, void *entry,
                unsigned long min, unsigned long max, unsigned int depth,
                enum mt_dump_format format)
{
        struct maple_node *node = mte_to_node(entry);
        unsigned int type = mte_node_type(entry);
        unsigned int slot;

        mt_dump_range(min, max, depth, format);

        pr_cont("node %p depth %d type %d parent %p", node, depth, type,
                        node ? node->parent : NULL);

        /* Leaf and internal range nodes share one layout and one dumper. */
        if (type == maple_leaf_64 || type == maple_range_64) {
                mt_dump_range64(mt, entry, min, max, depth, format);
                return;
        }

        if (type == maple_arange_64) {
                mt_dump_arange64(mt, entry, min, max, depth, format);
                return;
        }

        if (type != maple_dense) {
                pr_cont(" UNKNOWN TYPE\n");
                return;
        }

        /* Dense node: slot N holds index min + N. */
        pr_cont("\n");
        for (slot = 0; slot < MAPLE_NODE_SLOTS; slot++) {
                if (min + slot > max)
                        pr_cont("OUT OF RANGE: ");
                mt_dump_entry(mt_slot(mt, node->slot, slot),
                                min + slot, min + slot, depth, format);
        }
}

/*
 * mt_dump() - Print a whole maple tree.
 * @mt: The maple tree
 * @format: Print numbers as hex or decimal
 *
 * Prints a summary line (flags, height, root) and then either the single
 * root entry or, when the root is a node, the tree below it.
 */
void mt_dump(const struct maple_tree *mt, enum mt_dump_format format)
{
        void *root = rcu_dereference_check(mt->ma_root, mt_locked(mt));

        pr_info("maple_tree(%p) flags %X, height %u root %p\n",
                 mt, mt->ma_flags, mt_height(mt), root);

        if (xa_is_node(root)) {
                /* A node root spans 0 up to the maximum for its node type. */
                if (root)
                        mt_dump_node(mt, root, 0, mt_node_max(root), 0,
                                     format);
        } else {
                /* Anything else is dumped as a single entry at index 0. */
                mt_dump_entry(root, 0, 0, 0, format);
        }
}
EXPORT_SYMBOL_GPL(mt_dump);

/*
 * Calculate the maximum gap in a node and check if that's what is reported in
 * the parent (unless root).
 *
 * For maple_arange_64 nodes the node's own metadata is checked as well: the
 * recorded gap offset must be in range and must name the largest gap, and
 * every gap value past the end of the data must be zero.
 */
static void mas_validate_gaps(struct ma_state *mas)
{
        struct maple_enode *mte = mas->node;
        struct maple_node *p_mn, *node = mte_to_node(mte);
        enum maple_type mt = mte_node_type(mas->node);
        unsigned long gap = 0, max_gap = 0;
        unsigned long p_end, p_start = mas->min;
        unsigned char p_slot, offset;
        unsigned long *gaps = NULL;
        unsigned long *pivots = ma_pivots(node, mt);
        unsigned int i;

        if (ma_is_dense(mt)) {
                /*
                 * Dense node: a gap is a run of consecutive empty slots.
                 * NOTE(review): a run is only folded into max_gap when an
                 * occupied slot follows it, so a run reaching the last slot
                 * is not counted -- confirm whether that is intended.
                 */
                for (i = 0; i < mt_slot_count(mte); i++) {
                        if (mas_get_slot(mas, i)) {
                                if (gap > max_gap)
                                        max_gap = gap;
                                gap = 0;
                                continue;
                        }
                        gap++;
                }
                goto counted;
        }

        gaps = ma_gaps(node, mt);
        /* Walk the slots; slot i covers [p_start, p_end]. */
        for (i = 0; i < mt_slot_count(mte); i++) {
                p_end = mas_safe_pivot(mas, pivots, i, mt);

                if (!gaps) {
                        /* No gap array: an empty slot is a gap of its size. */
                        if (!mas_get_slot(mas, i))
                                gap = p_end - p_start + 1;
                } else {
                        void *entry = mas_get_slot(mas, i);

                        /* Gap array present: each slot must be populated. */
                        gap = gaps[i];
                        MT_BUG_ON(mas->tree, !entry);

                        /* A recorded gap cannot exceed the slot's range. */
                        if (gap > p_end - p_start + 1) {
                                pr_err("%p[%u] %lu >= %lu - %lu + 1 (%lu)\n",
                                       mas_mn(mas), i, gap, p_end, p_start,
                                       p_end - p_start + 1);
                                MT_BUG_ON(mas->tree, gap > p_end - p_start + 1);
                        }
                }

                if (gap > max_gap)
                        max_gap = gap;

                p_start = p_end + 1;
                /* Stop at the slot that reaches the node's upper limit. */
                if (p_end >= mas->max)
                        break;
        }

counted:
        if (mt == maple_arange_64) {
                MT_BUG_ON(mas->tree, !gaps);
                /* i is the index at which the slot walk above stopped. */
                offset = ma_meta_gap(node);
                if (offset > i) {
                        pr_err("gap offset %p[%u] is invalid\n", node, offset);
                        MT_BUG_ON(mas->tree, 1);
                }

                if (gaps[offset] != max_gap) {
                        pr_err("gap %p[%u] is not the largest gap %lu\n",
                               node, offset, max_gap);
                        MT_BUG_ON(mas->tree, 1);
                }

                /* Gaps after the last slot in use must all be zero. */
                for (i++ ; i < mt_slot_count(mte); i++) {
                        if (gaps[i] != 0) {
                                pr_err("gap %p[%u] beyond node limit != 0\n",
                                       node, i);
                                MT_BUG_ON(mas->tree, 1);
                        }
                }
        }

        /* The root has no parent entry to compare against. */
        if (mte_is_root(mte))
                return;

        /* The parent's gap for this child must equal the largest gap found. */
        p_slot = mte_parent_slot(mas->node);
        p_mn = mte_parent(mte);
        MT_BUG_ON(mas->tree, max_gap > mas->max);
        if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) {
                pr_err("gap %p[%u] != %lu\n", p_mn, p_slot, max_gap);
                mt_dump(mas->tree, mt_dump_hex);
                MT_BUG_ON(mas->tree, 1);
        }
}

/*
 * Check that the parent of the current node points back at it from exactly
 * the slot the node claims to occupy, and from no other slot. The root is
 * skipped because it has no parent slot.
 */
static void mas_validate_parent_slot(struct ma_state *mas)
{
        struct maple_node *parent;
        struct maple_enode *node;
        enum maple_type p_type;
        unsigned char p_slot;
        void __rcu **slots;
        int i;

        if (mte_is_root(mas->node))
                return;

        parent = mte_parent(mas->node);
        p_type = mas_parent_type(mas, mas->node);
        p_slot = mte_parent_slot(mas->node);
        slots = ma_slots(parent, p_type);

        /* A node must never be its own parent. */
        MT_BUG_ON(mas->tree, mas_mn(mas) == parent);

        /* Check prev/next parent slot for duplicate node entry */
        for (i = 0; i < mt_slots[p_type]; i++) {
                node = mas_slot(mas, slots, i);

                if (i != p_slot) {
                        /* Any other slot must not hold this node. */
                        if (node != mas->node)
                                continue;

                        pr_err("Invalid child %p at parent %p[%u] p_slot %u\n",
                               mas_mn(mas), parent, i, p_slot);
                        MT_BUG_ON(mas->tree, node == mas->node);
                        continue;
                }

                /* The claimed slot must hold exactly this node. */
                if (node != mas->node)
                        pr_err("parent %p[%u] does not have %p\n",
                                parent, i, mas_mn(mas));
                MT_BUG_ON(mas->tree, node != mas->node);
        }
}

/*
 * For a non-leaf node, check every child up to the end of the data: the
 * child must exist, must record the slot it is actually stored in, and must
 * name this node as its parent. Leaf nodes are skipped.
 *
 * A missing child is reported but never dereferenced: the per-child parent
 * checks are skipped for it, because they read through the child pointer and
 * would fault on NULL if MT_BUG_ON() returns (assumption -- its definition is
 * not in view here; confirm).
 */
static void mas_validate_child_slot(struct ma_state *mas)
{
        enum maple_type type = mte_node_type(mas->node);
        void __rcu **slots = ma_slots(mte_to_node(mas->node), type);
        unsigned long *pivots = ma_pivots(mte_to_node(mas->node), type);
        struct maple_enode *child;
        unsigned char i;

        if (mte_is_leaf(mas->node))
                return;

        for (i = 0; i < mt_slots[type]; i++) {
                child = mas_slot(mas, slots, i);

                if (!child) {
                        pr_err("Non-leaf node lacks child at %p[%u]\n",
                               mas_mn(mas), i);
                        MT_BUG_ON(mas->tree, 1);
                } else {
                        /* The child must agree on which slot holds it. */
                        if (mte_parent_slot(child) != i) {
                                pr_err("Slot error at %p[%u]: child %p has pslot %u\n",
                                       mas_mn(mas), i, mte_to_node(child),
                                       mte_parent_slot(child));
                                MT_BUG_ON(mas->tree, 1);
                        }

                        /* The child must point back at this node. */
                        if (mte_parent(child) != mte_to_node(mas->node)) {
                                pr_err("child %p has parent %p not %p\n",
                                       mte_to_node(child), mte_parent(child),
                                       mte_to_node(mas->node));
                                MT_BUG_ON(mas->tree, 1);
                        }
                }

                /* The slot whose pivot equals the node max is the last one. */
                if (i < mt_pivots[type] && pivots[i] == mas->max)
                        break;
        }
}

/*
 * Validate that all pivots are within mas->min and mas->max, check that the
 * metadata end matches where the maximum ends, and ensure there are no slots
 * or pivots set beyond the end of the data.
 */
static void mas_validate_limits(struct ma_state *mas)
{
        int i;
        unsigned long prev_piv = 0;
        enum maple_type type = mte_node_type(mas->node);
        void __rcu **slots = ma_slots(mte_to_node(mas->node), type);
        unsigned long *pivots = ma_pivots(mas_mn(mas), type);

        /*
         * Pass 1: walk the pivots up to the one equal to mas->max. Each must
         * be non-zero (except at slot 0), not below the previous pivot, and
         * inside [mas->min, mas->max].
         */
        for (i = 0; i < mt_slots[type]; i++) {
                unsigned long piv;

                piv = mas_safe_pivot(mas, pivots, i, type);

                if (!piv && (i != 0)) {
                        /* Newline-terminated like every other report here. */
                        pr_err("Missing node limit pivot at %p[%u]\n",
                               mas_mn(mas), i);
                        MAS_WARN_ON(mas, 1);
                }

                if (prev_piv > piv) {
                        pr_err("%p[%u] piv %lu < prev_piv %lu\n",
                                mas_mn(mas), i, piv, prev_piv);
                        MAS_WARN_ON(mas, piv < prev_piv);
                }

                if (piv < mas->min) {
                        pr_err("%p[%u] %lu < %lu\n", mas_mn(mas), i,
                                piv, mas->min);
                        MAS_WARN_ON(mas, piv < mas->min);
                }
                if (piv > mas->max) {
                        pr_err("%p[%u] %lu > %lu\n", mas_mn(mas), i,
                                piv, mas->max);
                        MAS_WARN_ON(mas, piv > mas->max);
                }
                prev_piv = piv;
                if (piv == mas->max)
                        break;
        }

        /* The slot where the walk stopped must match the recorded data end. */
        if (mas_data_end(mas) != i) {
                pr_err("node%p: data_end %u != the last slot offset %u\n",
                       mas_mn(mas), mas_data_end(mas), i);
                MT_BUG_ON(mas->tree, 1);
        }

        /*
         * Pass 2: everything after the data end must be clear. The very last
         * slot is exempt from the entry check, and a non-zero value in the
         * very last pivot is reported but does not trip the warning.
         */
        for (i += 1; i < mt_slots[type]; i++) {
                void *entry = mas_slot(mas, slots, i);

                if (entry && (i != mt_slots[type] - 1)) {
                        pr_err("%p[%u] should not have entry %p\n", mas_mn(mas),
                               i, entry);
                        MT_BUG_ON(mas->tree, entry != NULL);
                }

                if (i < mt_pivots[type]) {
                        unsigned long piv = pivots[i];

                        if (!piv)
                                continue;

                        pr_err("%p[%u] should not have piv %lu\n",
                               mas_mn(mas), i, piv);
                        MAS_WARN_ON(mas, i < mt_pivots[type] - 1);
                }
        }
}

/*
 * Walk the leaf nodes slot by slot and flag two NULL entries in a row,
 * including a pair that straddles two neighbouring leaf nodes (@last is
 * carried across the move to the next node).
 */
static void mt_validate_nulls(struct maple_tree *mt)
{
        /* last starts non-NULL so the very first entry cannot be a repeat. */
        void *entry, *last = (void *)1;
        unsigned char offset = 0;
        void __rcu **slots;
        MA_STATE(mas, mt, 0, 0);

        mas_start(&mas);
        /* Nothing to walk when the tree is empty or a single root pointer. */
        if (mas_is_none(&mas) || (mas_is_ptr(&mas)))
                return;

        /* Go down until a leaf is reached. */
        while (!mte_is_leaf(mas.node))
                mas_descend(&mas);

        slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node));
        do {
                entry = mas_slot(&mas, slots, offset);
                if (!last && !entry) {
                        pr_err("Sequential nulls end at %p[%u]\n",
                                mas_mn(&mas), offset);
                }
                MT_BUG_ON(mt, !last && !entry);
                last = entry;
                if (offset == mas_data_end(&mas)) {
                        /* End of this node: continue in the next one. */
                        mas_next_node(&mas, mas_mn(&mas), ULONG_MAX);
                        if (mas_is_overflow(&mas))
                                return;
                        offset = 0;
                        slots = ma_slots(mte_to_node(mas.node),
                                         mte_node_type(mas.node));
                } else {
                        offset++;
                }

        } while (!mas_is_overflow(&mas));
}

/*
 * validate a maple tree by checking:
 * 1. No visited node is marked dead and, unless the node's max is ULONG_MAX,
 *    no node holds less than the minimum slot count
 * 2. Each node sits in exactly the parent slot it claims
 * 3. The limits (pivots are within mas->min to mas->max)
 * 4. Children of non-leaf nodes point back at their parent and slot
 * 5. The gap is correctly set in the parents (allocation trees only)
 * 6. No two sequential NULL entries exist in the leaves
 *
 * Nodes are visited through mas_dfs_postorder(), starting from the leaf
 * reached by descending from the root. The whole walk runs under
 * rcu_read_lock().
 */
void mt_validate(struct maple_tree *mt)
{
        unsigned char end;

        MA_STATE(mas, mt, 0, 0);
        rcu_read_lock();
        mas_start(&mas);
        /* Only a tree whose root is a node has anything to validate. */
        if (!mas_is_active(&mas))
                goto done;

        while (!mte_is_leaf(mas.node))
                mas_descend(&mas);

        while (!mas_is_overflow(&mas)) {
                MAS_WARN_ON(&mas, mte_dead_node(mas.node));
                end = mas_data_end(&mas);
                /* Undersized nodes are tolerated when max is ULONG_MAX. */
                if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) &&
                                (mas.max != ULONG_MAX))) {
                        pr_err("Invalid size %u of %p\n", end, mas_mn(&mas));
                }

                mas_validate_parent_slot(&mas);
                mas_validate_limits(&mas);
                mas_validate_child_slot(&mas);
                if (mt_is_alloc(mt))
                        mas_validate_gaps(&mas);
                mas_dfs_postorder(&mas, ULONG_MAX);
        }
        mt_validate_nulls(mt);
done:
        rcu_read_unlock();

}
EXPORT_SYMBOL_GPL(mt_validate);

/*
 * mas_dump() - Print the contents of a maple state at error level.
 * @mas: The maple state
 *
 * Prints the tree and node pointers, the status by name, the offset/end
 * pair, the index/last range, the min/max limits and the remaining fields.
 * A hint is added when index is greater than last.
 */
void mas_dump(const struct ma_state *mas)
{
        pr_err("MAS: tree=%p enode=%p ", mas->tree, mas->node);

        /* Name the status; an unrecognised value prints nothing here. */
        if (mas->status == ma_active)
                pr_err("(ma_active)");
        else if (mas->status == ma_none)
                pr_err("(ma_none)");
        else if (mas->status == ma_root)
                pr_err("(ma_root)");
        else if (mas->status == ma_start)
                pr_err("(ma_start) ");
        else if (mas->status == ma_pause)
                pr_err("(ma_pause) ");
        else if (mas->status == ma_overflow)
                pr_err("(ma_overflow) ");
        else if (mas->status == ma_underflow)
                pr_err("(ma_underflow) ");
        else if (mas->status == ma_error)
                pr_err("(ma_error) ");

        pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end,
               mas->index, mas->last);
        pr_err("     min=%lx max=%lx alloc=%p, depth=%u, flags=%x\n",
               mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags);

        /* An inverted range is worth pointing out explicitly. */
        if (mas->last < mas->index)
                pr_err("Check index & last\n");
}
EXPORT_SYMBOL_GPL(mas_dump);

/*
 * mas_wr_dump() - Print the contents of a maple write state at error level.
 * @wr_mas: The maple write state
 *
 * Prints the node pointer, r_min/r_max, the type, offset_end, end_piv and
 * the end value of the maple state the write state refers to.
 */
void mas_wr_dump(const struct ma_wr_state *wr_mas)
{
        pr_err("WR_MAS: node=%p r_min=%lx r_max=%lx\n",
               wr_mas->node, wr_mas->r_min, wr_mas->r_max);
        /* node_end comes from the embedded maple state, not the write state. */
        pr_err("        type=%u off_end=%u, node_end=%u, end_piv=%lx\n",
               wr_mas->type, wr_mas->offset_end, wr_mas->mas->end,
               wr_mas->end_piv);
}
EXPORT_SYMBOL_GPL(mas_wr_dump);

#endif /* CONFIG_DEBUG_MAPLE_TREE */



















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_WAIT_BIT_H
#define _LINUX_WAIT_BIT_H

/*
 * Linux wait-bit related types and methods:
 */
#include <linux/wait.h>

/*
 * Key describing what a bit-waiter waits for. __WAIT_BIT_KEY_INITIALIZER()
 * fills @flags with the word and @bit_nr with the bit; @timeout is left out
 * by that initializer.
 * NOTE(review): @timeout is presumably used by the *_timeout wait variants --
 * confirm in the implementation.
 */
struct wait_bit_key {
        void                        *flags;
        int                        bit_nr;
        unsigned long                timeout;
};

/*
 * A wait queue entry paired with the wait_bit_key it is waiting on.
 * DEFINE_WAIT_BIT() declares and initialises one of these.
 */
struct wait_bit_queue_entry {
        struct wait_bit_key        key;
        struct wait_queue_entry        wq_entry;
};

/*
 * Initializer for a struct wait_bit_key: stores @word in .flags and @bit in
 * .bit_nr. Members not named here (.timeout) are left to default
 * initialisation.
 */
#define __WAIT_BIT_KEY_INITIALIZER(word, bit)                                        \
        { .flags = word, .bit_nr = bit, }

/*
 * Type of the action callback handed to the wait functions below: it receives
 * the key and a mode and returns an int. bit_wait() and its variants declared
 * later in this header have this signature.
 */
typedef int wait_bit_action_f(struct wait_bit_key *key, int mode);

/*
 * Out-of-line wait/wake primitives. The __-prefixed functions take an
 * explicit wait queue head; the others take only the word and bit.
 * bit_waitqueue() returns a wait queue head for a given word/bit pair.
 */
void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit);
int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
void wake_up_bit(void *word, int bit);
int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode);
int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout);
int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode);
struct wait_queue_head *bit_waitqueue(void *word, int bit);
extern void __init wait_bit_init(void);

/* Wake function installed in .wq_entry.func by DEFINE_WAIT_BIT(). */
int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);

/*
 * Declare a struct wait_bit_queue_entry called @name, keyed on (@word, @bit).
 * The embedded wait queue entry belongs to the current task (.private =
 * current), uses wake_bit_function() as its wake callback, and has its list
 * node initialized with LIST_HEAD_INIT().
 */
#define DEFINE_WAIT_BIT(name, word, bit)                                        \
        struct wait_bit_queue_entry name = {                                        \
                .key = __WAIT_BIT_KEY_INITIALIZER(word, bit),                        \
                .wq_entry = {                                                        \
                        .private        = current,                                \
                        .func                = wake_bit_function,                        \
                        .entry                =                                        \
                                LIST_HEAD_INIT((name).wq_entry.entry),                \
                },                                                                \
        }

/*
 * Stock wait_bit_action_f implementations.  In this header, bit_wait,
 * bit_wait_io and bit_wait_timeout are the actions passed by the inline
 * wait_on_bit*() helpers; bit_wait_io_timeout is only declared here.
 */
extern int bit_wait(struct wait_bit_key *key, int mode);
extern int bit_wait_io(struct wait_bit_key *key, int mode);
extern int bit_wait_timeout(struct wait_bit_key *key, int mode);
extern int bit_wait_io_timeout(struct wait_bit_key *key, int mode);

/**
 * wait_on_bit - wait for a bit to be cleared
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @mode: the task state to sleep in
 *
 * There is a standard hashed waitqueue table for generic use. This
 * is the part of the hashtable's accessor API that waits on a bit.
 * For instance, if one were to have waiters on a bitflag, one would
 * call wait_on_bit() in threads waiting for the bit to clear.
 * One uses wait_on_bit() where one is waiting for the bit to clear,
 * but has no intention of setting it.
 * Returned value will be zero if the bit was cleared, or non-zero
 * if the process received a signal and the mode permitted wakeup
 * on that signal.
 */
static inline int
wait_on_bit(unsigned long *word, int bit, unsigned mode)
{
        might_sleep();

        /* Bit still set: take the out-of-line slow path with bit_wait. */
        if (test_bit_acquire(bit, word))
                return out_of_line_wait_on_bit(word, bit, bit_wait, mode);

        /* Bit already clear: nothing to wait for. */
        return 0;
}

/**
 * wait_on_bit_io - wait for a bit to be cleared
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @mode: the task state to sleep in
 *
 * Use the standard hashed waitqueue table to wait for a bit
 * to be cleared.  This is similar to wait_on_bit(), but calls
 * io_schedule() instead of schedule() for the actual waiting.
 *
 * Returned value will be zero if the bit was cleared, or non-zero
 * if the process received a signal and the mode permitted wakeup
 * on that signal.
 */
static inline int
wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
{
        might_sleep();

        /* Only wait (using the bit_wait_io action) while the bit is set. */
        return test_bit_acquire(bit, word) ?
                out_of_line_wait_on_bit(word, bit, bit_wait_io, mode) : 0;
}

/**
 * wait_on_bit_timeout - wait for a bit to be cleared or a timeout to elapse
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @mode: the task state to sleep in
 * @timeout: timeout, in jiffies
 *
 * Use the standard hashed waitqueue table to wait for a bit
 * to be cleared. This is similar to wait_on_bit(), except also takes a
 * timeout parameter.
 *
 * Returned value will be zero if the bit was cleared before the
 * @timeout elapsed, or non-zero if the @timeout elapsed or process
 * received a signal and the mode permitted wakeup on that signal.
 */
static inline int
wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
                    unsigned long timeout)
{
        int ret = 0;

        might_sleep();

        /* Fast path leaves ret at 0 when the bit is already clear. */
        if (test_bit_acquire(bit, word))
                ret = out_of_line_wait_on_bit_timeout(word, bit,
                                                      bit_wait_timeout,
                                                      mode, timeout);
        return ret;
}

/**
 * wait_on_bit_action - wait for a bit to be cleared
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @action: the function used to sleep, which may take special actions
 * @mode: the task state to sleep in
 *
 * Use the standard hashed waitqueue table to wait for a bit
 * to be cleared, and allow the waiting action to be specified.
 * This is like wait_on_bit() but allows fine control of how the waiting
 * is done.
 *
 * Returned value will be zero if the bit was cleared, or non-zero
 * if the process received a signal and the mode permitted wakeup
 * on that signal.
 */
static inline int
wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
                   unsigned mode)
{
        might_sleep();

        /* Caller-supplied @action is only invoked via the slow path. */
        if (test_bit_acquire(bit, word))
                return out_of_line_wait_on_bit(word, bit, action, mode);

        return 0;
}

/**
 * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @mode: the task state to sleep in
 *
 * There is a standard hashed waitqueue table for generic use. This
 * is the part of the hashtable's accessor API that waits on a bit
 * when one intends to set it, for instance, trying to lock bitflags.
 * For instance, if one were to have waiters trying to set bitflag
 * and waiting for it to clear before setting it, one would call
 * wait_on_bit_lock() in threads waiting to be able to set the bit.
 * One uses wait_on_bit_lock() where one is waiting for the bit to
 * clear with the intention of setting it, and when done, clearing it.
 *
 * Returns zero if the bit was (eventually) found to be clear and was
 * set.  Returns non-zero if a signal was delivered to the process and
 * the @mode allows that signal to wake the process.
 */
static inline int
wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
{
        might_sleep();

        /* Bit was already set by someone else: wait for our turn. */
        if (test_and_set_bit(bit, word))
                return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode);

        /* We set the bit ourselves on the first try. */
        return 0;
}

/**
 * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @mode: the task state to sleep in
 *
 * Use the standard hashed waitqueue table to wait for a bit
 * to be cleared and then to atomically set it.  This is similar
 * to wait_on_bit(), but calls io_schedule() instead of schedule()
 * for the actual waiting.
 *
 * Returns zero if the bit was (eventually) found to be clear and was
 * set.  Returns non-zero if a signal was delivered to the process and
 * the @mode allows that signal to wake the process.
 */
static inline int
wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode)
{
        might_sleep();

        /* Uncontended case returns 0; otherwise wait with bit_wait_io. */
        return test_and_set_bit(bit, word) ?
                out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode) : 0;
}

/**
 * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @action: the function used to sleep, which may take special actions
 * @mode: the task state to sleep in
 *
 * Use the standard hashed waitqueue table to wait for a bit
 * to be cleared and then to set it, and allow the waiting action
 * to be specified.
 * This is like wait_on_bit() but allows fine control of how the waiting
 * is done.
 *
 * Returns zero if the bit was (eventually) found to be clear and was
 * set.  Returns non-zero if a signal was delivered to the process and
 * the @mode allows that signal to wake the process.
 */
static inline int
wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
                        unsigned mode)
{
        int ret = 0;

        might_sleep();

        /* Only fall back to the out-of-line path when the bit was already set. */
        if (test_and_set_bit(bit, word))
                ret = out_of_line_wait_on_bit_lock(word, bit, action, mode);

        return ret;
}

/*
 * Helpers behind the wait_var_event*() macros, implemented outside this
 * header.  ___wait_var_event() uses __var_waitqueue() to find the wait queue
 * head for a variable and init_wait_var_entry() to set up its queue entry.
 */
extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags);
extern void wake_up_var(void *var);
extern wait_queue_head_t *__var_waitqueue(void *p);

/*
 * Core wait loop shared by the wait_var_event*() macros.
 *
 * Looks up the wait queue head for @var, initializes an on-stack entry
 * (exclusive if @exclusive is non-zero) and then loops:
 *   - prepare_to_wait_event() in task state @state;
 *   - leave the loop as soon as @condition is true;
 *   - if @state is interruptible and prepare_to_wait_event() returned
 *     non-zero, store that value in __ret and jump straight to __out,
 *     skipping finish_wait();
 *   - otherwise run @cmd (typically a schedule() variant).
 * On normal loop exit finish_wait() is called.  The statement expression
 * evaluates to __ret, which starts out as @ret.
 *
 * @condition is evaluated once per loop iteration, so it must be safe to
 * evaluate repeatedly.
 */
#define ___wait_var_event(var, condition, state, exclusive, ret, cmd)        \
({                                                                        \
        __label__ __out;                                                \
        struct wait_queue_head *__wq_head = __var_waitqueue(var);        \
        struct wait_bit_queue_entry __wbq_entry;                        \
        long __ret = ret; /* explicit shadow */                                \
                                                                        \
        init_wait_var_entry(&__wbq_entry, var,                                \
                            exclusive ? WQ_FLAG_EXCLUSIVE : 0);                \
        for (;;) {                                                        \
                long __int = prepare_to_wait_event(__wq_head,                \
                                                   &__wbq_entry.wq_entry, \
                                                   state);                \
                if (condition)                                                \
                        break;                                                \
                                                                        \
                if (___wait_is_interruptible(state) && __int) {                \
                        __ret = __int;                                        \
                        goto __out;                                        \
                }                                                        \
                                                                        \
                cmd;                                                        \
        }                                                                \
        finish_wait(__wq_head, &__wbq_entry.wq_entry);                        \
__out:        __ret;                                                                \
})

/*
 * Slow path of wait_var_event(): non-exclusive wait in TASK_UNINTERRUPTIBLE,
 * calling schedule() between checks of @condition.
 */
#define __wait_var_event(var, condition)                                \
        ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                          schedule())

/*
 * Wait (uninterruptibly) until @condition becomes true, using the wait queue
 * associated with @var.  Returns immediately, after the might_sleep() check,
 * if @condition already holds.  @condition may be evaluated several times.
 */
#define wait_var_event(var, condition)                                        \
do {                                                                        \
        might_sleep();                                                        \
        if (condition)                                                        \
                break;                                                        \
        __wait_var_event(var, condition);                                \
} while (0)

/*
 * Slow path of wait_var_event_killable(): same loop as __wait_var_event()
 * but sleeping in TASK_KILLABLE.
 */
#define __wait_var_event_killable(var, condition)                        \
        ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0,                \
                          schedule())

/*
 * Wait in TASK_KILLABLE until @condition becomes true.  Evaluates to 0 if
 * @condition already holds, otherwise to the value produced by
 * __wait_var_event_killable().
 */
#define wait_var_event_killable(var, condition)                                \
({                                                                        \
        int __ret = 0;                                                        \
        might_sleep();                                                        \
        if (!(condition))                                                \
                __ret = __wait_var_event_killable(var, condition);        \
        __ret;                                                                \
})

/*
 * Slow path of wait_var_event_timeout(): uninterruptible wait whose __ret
 * starts at @timeout and is replaced by the return value of
 * schedule_timeout(__ret) on every iteration.  The loop condition is
 * ___wait_cond_timeout(condition).
 * NOTE(review): ___wait_cond_timeout() is defined elsewhere and presumably
 * also inspects __ret - confirm.
 */
#define __wait_var_event_timeout(var, condition, timeout)                \
        ___wait_var_event(var, ___wait_cond_timeout(condition),                \
                          TASK_UNINTERRUPTIBLE, 0, timeout,                \
                          __ret = schedule_timeout(__ret))

/*
 * Wait uninterruptibly until @condition becomes true or @timeout runs out.
 * Evaluates to __ret, which starts as @timeout and is only replaced by the
 * result of __wait_var_event_timeout() when the initial
 * ___wait_cond_timeout(condition) check is false.
 */
#define wait_var_event_timeout(var, condition, timeout)                        \
({                                                                        \
        long __ret = timeout;                                                \
        might_sleep();                                                        \
        if (!___wait_cond_timeout(condition))                                \
                __ret = __wait_var_event_timeout(var, condition, timeout); \
        __ret;                                                                \
})

/*
 * Slow path of wait_var_event_interruptible(): same loop as
 * __wait_var_event() but sleeping in TASK_INTERRUPTIBLE.
 */
#define __wait_var_event_interruptible(var, condition)                        \
        ___wait_var_event(var, condition, TASK_INTERRUPTIBLE, 0, 0,        \
                          schedule())

/*
 * Wait in TASK_INTERRUPTIBLE until @condition becomes true.  Evaluates to 0
 * if @condition already holds, otherwise to the value produced by
 * __wait_var_event_interruptible().
 */
#define wait_var_event_interruptible(var, condition)                        \
({                                                                        \
        int __ret = 0;                                                        \
        might_sleep();                                                        \
        if (!(condition))                                                \
                __ret = __wait_var_event_interruptible(var, condition);        \
        __ret;                                                                \
})

/**
 * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit
 *
 * @bit: the bit of the word being waited on
 * @word: the word being waited on, a kernel virtual address
 *
 * You can use this helper if bitflags are manipulated atomically rather than
 * non-atomically under a lock.
 */
static inline void clear_and_wake_up_bit(int bit, void *word)
{
        /* Step 1: clear the bit. */
        clear_bit_unlock(bit, word);
        /* See wake_up_bit() for which memory barrier you need to use. */
        smp_mb__after_atomic();
        /* Step 2: wake waiters; note the (word, bit) argument order here. */
        wake_up_bit(word, bit);
}

#endif /* _LINUX_WAIT_BIT_H */



















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SPECIAL_INSNS_H
#define _ASM_X86_SPECIAL_INSNS_H

#ifdef __KERNEL__
#include <asm/nops.h>
#include <asm/processor-flags.h>

#include <linux/errno.h>
#include <linux/irqflags.h>
#include <linux/jump_label.h>

/*
 * The compiler should not reorder volatile asm statements with respect to each
 * other: they should execute in program order. However GCC 4.9.x and 5.x have
 * a bug (which was fixed in 8.1, 7.3 and 6.5) where they might reorder
 * volatile asm. The write functions are not affected since they have memory
 * clobbers preventing reordering. To prevent reads from being reordered with
 * respect to writes, use a dummy memory operand.
 */

/*
 * Dummy "m" input operand appended to the control-register read asm
 * statements below.  None of their asm templates reference it; it exists
 * only to give the reads a memory dependency, as explained above.
 */
#define __FORCE_ORDER "m"(*(unsigned int *)0x1000UL)

/* Write CR0; defined out of line, not in this header. */
void native_write_cr0(unsigned long val);

/* Read and return the current value of CR0. */
static inline unsigned long native_read_cr0(void)
{
        unsigned long val;
        /* __FORCE_ORDER keeps this read ordered against the CR writes. */
        asm volatile("mov %%cr0,%0\n\t" : "=r" (val) : __FORCE_ORDER);
        return val;
}

/* Read and return the current value of CR2. */
static __always_inline unsigned long native_read_cr2(void)
{
        unsigned long val;
        /* __FORCE_ORDER keeps this read ordered against the CR writes. */
        asm volatile("mov %%cr2,%0\n\t" : "=r" (val) : __FORCE_ORDER);
        return val;
}

/* Write @val to CR2.  The "memory" clobber stops the compiler reordering it. */
static __always_inline void native_write_cr2(unsigned long val)
{
        asm volatile("mov %0,%%cr2": : "r" (val) : "memory");
}

/* Read and return the raw value of CR3. */
static inline unsigned long __native_read_cr3(void)
{
        unsigned long val;
        /* __FORCE_ORDER keeps this read ordered against the CR writes. */
        asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER);
        return val;
}

/* Write @val to CR3.  The "memory" clobber stops the compiler reordering it. */
static inline void native_write_cr3(unsigned long val)
{
        asm volatile("mov %0,%%cr3": : "r" (val) : "memory");
}

/*
 * Read and return CR4.
 *
 * On CONFIG_X86_32 the read has an exception-table fixup (1b -> 2b) and the
 * output register is preloaded with 0 through the "0" (0) input constraint,
 * so a faulting read simply yields 0.
 */
static inline unsigned long native_read_cr4(void)
{
        unsigned long val;
#ifdef CONFIG_X86_32
        /*
         * This could fault if CR4 does not exist.  Non-existent CR4
         * is functionally equivalent to CR4 == 0.  Keep it simple and pretend
         * that CR4 == 0 on CPUs that don't have CR4.
         */
        asm volatile("1: mov %%cr4, %0\n"
                     "2:\n"
                     _ASM_EXTABLE(1b, 2b)
                     : "=r" (val) : "0" (0), __FORCE_ORDER);
#else
        /* CR4 always exists on x86_64. */
        asm volatile("mov %%cr4,%0\n\t" : "=r" (val) : __FORCE_ORDER);
#endif
        return val;
}

/* Write CR4; defined out of line, not in this header. */
void native_write_cr4(unsigned long val);

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
/* Read and return the PKRU register using a byte-encoded RDPKRU. */
static inline u32 rdpkru(void)
{
        u32 ecx = 0;
        u32 edx, pkru;

        /*
         * "rdpkru" instruction.  Places PKRU contents into EAX,
         * clears EDX and requires that ecx=0.
         */
        asm volatile(".byte 0x0f,0x01,0xee\n\t"
                     : "=a" (pkru), "=d" (edx)
                     : "c" (ecx));
        /* edx is only listed as an output because the instruction writes it. */
        return pkru;
}

/* Load @pkru into the PKRU register using a byte-encoded WRPKRU. */
static inline void wrpkru(u32 pkru)
{
        u32 ecx = 0, edx = 0;

        /*
         * "wrpkru" instruction.  Loads contents in EAX to PKRU,
         * requires that ecx = edx = 0.
         */
        asm volatile(".byte 0x0f,0x01,0xef\n\t"
                     : : "a" (pkru), "c"(ecx), "d"(edx));
}

#else
/* Stub for builds without CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS: always 0. */
static inline u32 rdpkru(void)
{
        return 0;
}

/* Stub for builds without CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS: no-op. */
static inline void wrpkru(u32 pkru)
{
}
#endif

/* Execute WBINVD; the "memory" clobber keeps the compiler from reordering it. */
static __always_inline void native_wbinvd(void)
{
        asm volatile("wbinvd": : :"memory");
}

/* Return CR4 as read by native_read_cr4(). */
static inline unsigned long __read_cr4(void)
{
        unsigned long cr4 = native_read_cr4();

        return cr4;
}

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else

/* Non-paravirt read_cr0(): forwards to native_read_cr0(). */
static inline unsigned long read_cr0(void)
{
        unsigned long cr0 = native_read_cr0();

        return cr0;
}

/* Non-paravirt write_cr0(): forwards @x to native_write_cr0(). */
static inline void write_cr0(unsigned long x)
{
        native_write_cr0(x);
}

/* Non-paravirt read_cr2(): forwards to native_read_cr2(). */
static __always_inline unsigned long read_cr2(void)
{
        unsigned long cr2 = native_read_cr2();

        return cr2;
}

/* Non-paravirt write_cr2(): forwards @x to native_write_cr2(). */
static __always_inline void write_cr2(unsigned long x)
{
        native_write_cr2(x);
}

/*
 * Careful!  CR3 contains more than just an address.  You probably want
 * read_cr3_pa() instead.
 */
static inline unsigned long __read_cr3(void)
{
        /* Raw CR3 value, exactly as __native_read_cr3() returns it. */
        unsigned long cr3 = __native_read_cr3();

        return cr3;
}

/* Non-paravirt write_cr3(): forwards @x to native_write_cr3(). */
static inline void write_cr3(unsigned long x)
{
        native_write_cr3(x);
}

/* Non-paravirt __write_cr4(): forwards @x to native_write_cr4(). */
static inline void __write_cr4(unsigned long x)
{
        native_write_cr4(x);
}

/* Non-paravirt wbinvd(): forwards to native_wbinvd(). */
static __always_inline void wbinvd(void)
{
        native_wbinvd();
}

#endif /* CONFIG_PARAVIRT_XXL */

/*
 * Execute CLFLUSH on the byte at @__p.  The "+m" operand tells the compiler
 * that the pointed-to byte is both read and written by the asm.
 */
static __always_inline void clflush(volatile void *__p)
{
        asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
}

/*
 * Flush the cache line containing @__p.  alternative_io() is given a
 * 0x3e-prefixed clflush as the default encoding and a 0x66-prefixed clflush
 * as the replacement for CPUs with X86_FEATURE_CLFLUSHOPT.
 */
static inline void clflushopt(volatile void *__p)
{
        alternative_io(".byte 0x3e; clflush %0",
                       ".byte 0x66; clflush %0",
                       X86_FEATURE_CLFLUSHOPT,
                       "+m" (*(volatile char __force *)__p));
}

/*
 * Write back the cache line at @__p.  ALTERNATIVE_2() is given three
 * encodings: a 0x3e-prefixed clflush as the default, the 0x66-prefixed form
 * (clflushopt) for X86_FEATURE_CLFLUSHOPT, and a byte-encoded clwb for
 * X86_FEATURE_CLWB.  The pointer is passed in the "a" register because the
 * hand-encoded clwb addresses memory through (%rax).
 */
static inline void clwb(volatile void *__p)
{
        /* View the target as a 64-byte object so "+m" covers that whole span. */
        volatile struct { char x[64]; } *p = __p;

        asm volatile(ALTERNATIVE_2(
                ".byte 0x3e; clflush (%[pax])",
                ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */
                X86_FEATURE_CLFLUSHOPT,
                ".byte 0x66, 0x0f, 0xae, 0x30",  /* clwb (%%rax) */
                X86_FEATURE_CLWB)
                : [p] "+m" (*p)
                : [pax] "a" (p));
}

#ifdef CONFIG_X86_USER_SHADOW_STACK
/*
 * Write @val to the user address @addr with the WRUSSQ instruction.
 *
 * Returns 0 on success.  If the instruction faults, the exception table
 * entry redirects execution to the 'fail' label and -EFAULT is returned.
 */
static inline int write_user_shstk_64(u64 __user *addr, u64 val)
{
        asm goto("1: wrussq %[val], %[addr]\n"
                          _ASM_EXTABLE(1b, %l[fail])
                          :: [addr] "m" (*addr), [val] "r" (val)
                          :: fail);
        return 0;
fail:
        return -EFAULT;
}
#endif /* CONFIG_X86_USER_SHADOW_STACK */

/* Emit a single NOP instruction. */
#define nop() asm volatile ("nop")

/*
 * Execute the SERIALIZE instruction, emitted as raw bytes so that older
 * assemblers can build it.  The "memory" clobber makes it a compiler barrier.
 */
static inline void serialize(void)
{
        /* Instruction opcode for SERIALIZE; supported in binutils >= 2.35. */
        asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory");
}

/* The dst parameter must be 64-byte aligned */
static inline void movdir64b(void *dst, const void *src)
{
        /* View both buffers as 64-byte objects so the "m" operands cover them. */
        const struct { char _[64]; } *__src = src;
        struct { char _[64]; } *__dst = dst;

        /*
         * MOVDIR64B %(rdx), rax.
         *
         * Both __src and __dst must be memory constraints in order to tell the
         * compiler that no other memory accesses should be reordered around
         * this one.
         *
         * Also, both must be supplied as lvalues because this tells
         * the compiler what the object is (its size) the instruction accesses.
         * I.e., not the pointers but what they point to, thus the deref'ing '*'.
         */
        /* The instruction is byte-encoded; __dst goes in rax, __src in rdx. */
        asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
                     : "+m" (*__dst)
                     :  "m" (*__src), "a" (__dst), "d" (__src));
}

/*
 * movdir64b() for an __iomem destination: strips the __iomem annotation
 * with a __force cast and forwards to movdir64b().
 */
static inline void movdir64b_io(void __iomem *dst, const void *src)
{
        movdir64b((void __force *)dst, src);
}

/**
 * enqcmds - Enqueue a command in supervisor (CPL0) mode
 * @dst: destination, in MMIO space (must be 512-bit aligned)
 * @src: 512 bits memory operand
 *
 * The ENQCMDS instruction allows software to write a 512-bit command to
 * a 512-bit-aligned special MMIO region that supports the instruction.
 * A return status is loaded into the ZF flag in the RFLAGS register.
 * ZF = 0 equates to success, and ZF = 1 indicates retry or error.
 *
 * This function issues the ENQCMDS instruction to submit data from
 * kernel space to MMIO space, in a unit of 512 bits. Order of data access
 * is not guaranteed, nor is a memory barrier performed afterwards. It
 * returns 0 on success and -EAGAIN on failure.
 *
 * Warning: Do not use this helper unless your driver has checked that the
 * ENQCMDS instruction is supported on the platform and the device accepts
 * ENQCMDS.
 */
static inline int enqcmds(void __iomem *dst, const void *src)
{
        const struct { char _[64]; } *__src = src;
        struct { char _[64]; } __iomem *__dst = dst;
        bool zf;

        /*
         * Byte-encoded ENQCMDS %(rdx), rax.  The operand constraints follow
         * the reasoning given in movdir64b()'s comment.
         */
        asm volatile(".byte 0xf3, 0x0f, 0x38, 0xf8, 0x02, 0x66, 0x90"
                     CC_SET(z)
                     : CC_OUT(z) (zf), "+m" (*__dst)
                     : "m" (*__src), "a" (__dst), "d" (__src));

        /* EFLAGS.ZF=1 signals a failed submission; report it as -EAGAIN. */
        return zf ? -EAGAIN : 0;
}

/* Execute the TILERELEASE instruction, emitted as raw bytes. */
static __always_inline void tile_release(void)
{
        /*
         * Instruction opcode for TILERELEASE; supported in binutils
         * version >= 2.36.
         */
        asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0");
}

#endif /* __KERNEL__ */

#endif /* _ASM_X86_SPECIAL_INSNS_H */






































































































































































































































































































































































































































































































































































































































































































































































































    1 








































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Support for INET connection oriented protocols.
 *
 * Authors:        See the TCP sources
 */

#include <linux/module.h>
#include <linux/jhash.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>
#include <net/tcp.h>
#include <net/sock_reuseport.h>
#include <net/addrconf.h>

#if IS_ENABLED(CONFIG_IPV6)
/* match_sk*_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses
 *                                if IPv6 only, and any IPv4 addresses
 *                                if not IPv6 only
 * match_sk*_wildcard == false: addresses must be exactly the same, i.e.
 *                                IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
 *                                and 0.0.0.0 equals to 0.0.0.0 only
 */
static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
                                 const struct in6_addr *sk2_rcv_saddr6,
                                 __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
                                 bool sk1_ipv6only, bool sk2_ipv6only,
                                 bool match_sk1_wildcard,
                                 bool match_sk2_wildcard)
{
        const int type1 = ipv6_addr_type(sk1_rcv_saddr6);
        int type2 = IPV6_ADDR_MAPPED;
        bool sk1_any, sk2_any;

        /* A NULL sk2 IPv6 address is classified as v4-mapped. */
        if (sk2_rcv_saddr6)
                type2 = ipv6_addr_type(sk2_rcv_saddr6);

        /* Two v4-mapped addresses are compared as plain IPv4 addresses. */
        if (type1 == IPV6_ADDR_MAPPED && type2 == IPV6_ADDR_MAPPED) {
                if (sk2_ipv6only)
                        return false;
                if (sk1_rcv_saddr == sk2_rcv_saddr)
                        return true;
                if (match_sk1_wildcard && !sk1_rcv_saddr)
                        return true;
                return match_sk2_wildcard && !sk2_rcv_saddr;
        }

        sk1_any = (type1 == IPV6_ADDR_ANY);
        sk2_any = (type2 == IPV6_ADDR_ANY);

        if (sk1_any && sk2_any)
                return true;

        /* A wildcard side matches the other one, unless that wildcard
         * socket is IPv6-only and the other address is v4-mapped.
         */
        if (sk2_any && match_sk2_wildcard &&
            (!sk2_ipv6only || type1 != IPV6_ADDR_MAPPED))
                return true;

        if (sk1_any && match_sk1_wildcard &&
            (!sk1_ipv6only || type2 != IPV6_ADDR_MAPPED))
                return true;

        /* Otherwise only an exact IPv6 address match counts. */
        return sk2_rcv_saddr6 &&
               ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6);
}
#endif

/* match_sk*_wildcard == true:  0.0.0.0 equals to any IPv4 addresses
 * match_sk*_wildcard == false: addresses must be exactly the same, i.e.
 *                                0.0.0.0 only equals to 0.0.0.0
 */
static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
                                 bool sk2_ipv6only, bool match_sk1_wildcard,
                                 bool match_sk2_wildcard)
{
        /* An IPv6-only sk2 never equals an IPv4 address. */
        if (sk2_ipv6only)
                return false;

        if (sk1_rcv_saddr == sk2_rcv_saddr)
                return true;

        /* A zero address matches anything only when its wildcard flag is set. */
        if (match_sk1_wildcard && !sk1_rcv_saddr)
                return true;

        return match_sk2_wildcard && !sk2_rcv_saddr;
}

/* Compare the receive addresses of @sk and @sk2.  The IPv6 comparison is used
 * when @sk is an AF_INET6 socket (and IPv6 is enabled), the IPv4 comparison
 * otherwise.  @match_wildcard is applied to both sockets.
 */
bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
                          bool match_wildcard)
{
        const bool sk2_v6only = ipv6_only_sock(sk2);

#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == AF_INET6) {
                return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr,
                                            inet6_rcv_saddr(sk2),
                                            sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
                                            ipv6_only_sock(sk), sk2_v6only,
                                            match_wildcard, match_wildcard);
        }
#endif
        return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
                                    sk2_v6only, match_wildcard,
                                    match_wildcard);
}
EXPORT_SYMBOL(inet_rcv_saddr_equal);

/* Report whether @sk's receive address is the wildcard: the IPv6 any-address
 * for AF_INET6 sockets (when IPv6 is enabled), a zero sk_rcv_saddr otherwise.
 */
bool inet_rcv_saddr_any(const struct sock *sk)
{
        bool is_any = !sk->sk_rcv_saddr;

#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == AF_INET6)
                is_any = ipv6_addr_any(&sk->sk_v6_rcv_saddr);
#endif
        return is_any;
}

/**
 *        inet_sk_get_local_port_range - fetch ephemeral ports range
 *        @sk: socket
 *        @low: pointer to low port
 *        @high: pointer to high port
 *
 *        Fetch netns port range (/proc/sys/net/ipv4/ip_local_port_range)
 *        Range can be overridden if socket got IP_LOCAL_PORT_RANGE option.
 *        Returns true if IP_LOCAL_PORT_RANGE was set on this socket.
 */
bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
{
        int range_lo, range_hi;
        u32 sk_range;

        /* Start from the netns-wide range. */
        inet_get_local_port_range(sock_net(sk), &range_lo, &range_hi);
        sk_range = READ_ONCE(inet_sk(sk)->local_port_range);

        if (unlikely(sk_range)) {
                /* Per-socket range: low bound in bits 0-15, high in bits 16-31. */
                const int want_lo = sk_range & 0xffff;
                const int want_hi = sk_range >> 16;

                /* Each bound is applied only if it lies inside the current
                 * range; the high bound is checked against the low bound
                 * that may just have been raised.
                 */
                if (want_lo >= range_lo && want_lo <= range_hi)
                        range_lo = want_lo;
                if (want_hi >= range_lo && want_hi <= range_hi)
                        range_hi = want_hi;
        }

        *low = range_lo;
        *high = range_hi;
        return sk_range != 0;
}
EXPORT_SYMBOL(inet_sk_get_local_port_range);

/* Return true when @sk is bound to a specific (non-wildcard) address:
 * an AF_INET6 socket with the any-address yields false, one with a
 * non-mapped address yields true, and a v4-mapped or IPv4 socket is
 * decided by whether sk_rcv_saddr differs from INADDR_ANY.
 */
static bool inet_use_bhash2_on_bind(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == AF_INET6) {
                switch (ipv6_addr_type(&sk->sk_v6_rcv_saddr)) {
                case IPV6_ADDR_ANY:
                        return false;
                case IPV6_ADDR_MAPPED:
                        /* decided by the IPv4 address below */
                        break;
                default:
                        return true;
                }
        }
#endif
        return sk->sk_rcv_saddr != htonl(INADDR_ANY);
}

/* True when the reuseport settings let @sk and @sk2 share the port: both
 * sockets have sk_reuseport set, @reuseport_cb_ok holds, and @sk2 is either
 * in TCP_TIME_WAIT or owned by @sk_uid.  The state is tested first, so
 * sock_i_uid(sk2) is only called for sockets that are not in TCP_TIME_WAIT.
 */
static bool inet_bind_reuseport_ok(const struct sock *sk, struct sock *sk2,
                                   kuid_t sk_uid, bool reuseport_cb_ok)
{
        if (!sk->sk_reuseport || !sk2->sk_reuseport || !reuseport_cb_ok)
                return false;

        return sk2->sk_state == TCP_TIME_WAIT ||
               uid_eq(sk_uid, sock_i_uid(sk2));
}

/* Decide whether binding @sk conflicts with the already bound @sk2, looking
 * only at the bound device and the reuse/reuseport settings (addresses are
 * not compared here).
 */
static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2,
                               kuid_t sk_uid, bool relax,
                               bool reuseport_cb_ok, bool reuseport_ok)
{
        int bound_dev_if2;

        if (sk == sk2)
                return false;

        bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if);

        /* Sockets bound to two different devices do not conflict. */
        if (sk->sk_bound_dev_if && bound_dev_if2 &&
            sk->sk_bound_dev_if != bound_dev_if2)
                return false;

        /* Both sockets have sk_reuse set and sk2 is not listening. */
        if (sk->sk_reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN)
                return !relax ||
                       (!reuseport_ok &&
                        inet_bind_reuseport_ok(sk, sk2, sk_uid,
                                               reuseport_cb_ok));

        /* Otherwise only an allowed reuseport match avoids the conflict. */
        return !reuseport_ok ||
               !inet_bind_reuseport_ok(sk, sk2, sk_uid, reuseport_cb_ok);
}

/* Conflict check between @sk and one bhash2 owner @sk2.  An IPv6-only @sk2
 * never conflicts with an AF_INET @sk or with a @sk bound to a v4-mapped
 * address; every other case is decided by inet_bind_conflict().
 */
static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2,
                                   kuid_t sk_uid, bool relax,
                                   bool reuseport_cb_ok, bool reuseport_ok)
{
        const bool sk2_v6only = ipv6_only_sock(sk2);

        if (sk2_v6only && sk->sk_family == AF_INET)
                return false;

#if IS_ENABLED(CONFIG_IPV6)
        if (sk2_v6only && ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
                return false;
#endif

        return inet_bind_conflict(sk, sk2, sk_uid, relax,
                                  reuseport_cb_ok, reuseport_ok);
}

/* Return true as soon as any socket on @tb2's owners list conflicts with @sk. */
static bool inet_bhash2_conflict(const struct sock *sk,
                                 const struct inet_bind2_bucket *tb2,
                                 kuid_t sk_uid,
                                 bool relax, bool reuseport_cb_ok,
                                 bool reuseport_ok)
{
        struct sock *owner;
        bool conflict = false;

        sk_for_each_bound(owner, &tb2->owners) {
                conflict = __inet_bhash2_conflict(sk, owner, sk_uid, relax,
                                                  reuseport_cb_ok,
                                                  reuseport_ok);
                if (conflict)
                        break;
        }

        return conflict;
}

/* Visit every socket bound under bind bucket @__tb: the outer iterator walks
 * the entries chained on (__tb)->bhash2 through their bhash_node member,
 * using @__tb2 as cursor; the inner iterator walks each entry's owners list,
 * using @__sk as cursor.  The statement following the macro is the loop body.
 *
 * The socket cursor is the @__sk argument (it used to be a hard-coded
 * identifier, which forced every caller to name its variable "sk2").
 */
#define sk_for_each_bound_bhash(__sk, __tb2, __tb)                        \
        hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node)        \
                sk_for_each_bound(__sk, &(__tb2)->owners)

/* This should be called only when the tb and tb2 hashbuckets' locks are held */
static int inet_csk_bind_conflict(const struct sock *sk,
                                  const struct inet_bind_bucket *tb,
                                  const struct inet_bind2_bucket *tb2, /* may be null */
                                  bool relax, bool reuseport_ok)
{
        kuid_t owner_uid = sock_i_uid((struct sock *)sk);
        struct sock_reuseport *cb;
        bool cb_ok = true;
        struct sock *sk2;

        rcu_read_lock();
        cb = rcu_dereference(sk->sk_reuseport_cb);
        /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */
        if (cb)
                cb_ok = !!READ_ONCE(cb->num_closed_socks);
        rcu_read_unlock();

        /* A socket bound to a specific address is checked only against the
         * owners of its own bhash2 bucket here.  Conflicts with the wildcard
         * address (IPV6_ADDR_ANY or INADDR_ANY) should have been checked
         * already: that check takes another bucket's spinlock, and the two
         * locks have to be acquired/released independently of each other to
         * prevent possible deadlocks.
         */
        if (inet_use_bhash2_on_bind(sk)) {
                if (!tb2)
                        return false;
                return inet_bhash2_conflict(sk, tb2, owner_uid, relax,
                                            cb_ok, reuseport_ok);
        }

        /* sk_net is not compared here, unlike other sk lookup places:
         * _all_ the socks on the tb->owners and tb2->owners lists belong
         * to the same net - the one this bucket belongs to.
         */
        sk_for_each_bound_bhash(sk2, tb2, tb) {
                if (inet_bind_conflict(sk, sk2, owner_uid, relax, cb_ok,
                                       reuseport_ok) &&
                    inet_rcv_saddr_equal(sk, sk2, true))
                        return true;
        }

        return false;
}

/* Determine if there is a bind conflict with an existing IPV6_ADDR_ANY (if ipv6) or
 * INADDR_ANY (if ipv4) socket.
 *
 * Caller must hold bhash hashbucket lock with local bh disabled, to protect
 * against concurrent binds on the port for addr any
 */
static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l3mdev,
                                          bool relax, bool reuseport_ok)
{
        kuid_t owner_uid = sock_i_uid((struct sock *)sk);
        const struct net *net = sock_net(sk);
        struct inet_bind_hashbucket *any_head;
        struct inet_bind2_bucket *bucket;
        struct sock_reuseport *cb;
        bool found = false;
        bool cb_ok = true;

        rcu_read_lock();
        cb = rcu_dereference(sk->sk_reuseport_cb);
        /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */
        if (cb)
                cb_ok = !!READ_ONCE(cb->num_closed_socks);
        rcu_read_unlock();

        any_head = inet_bhash2_addr_any_hashbucket(sk, net, port);

        /* The chain is walked with the addr-any bucket's own lock held. */
        spin_lock(&any_head->lock);

        inet_bind_bucket_for_each(bucket, &any_head->chain) {
                if (inet_bind2_bucket_match_addr_any(bucket, net, port,
                                                     l3mdev, sk) &&
                    inet_bhash2_conflict(sk, bucket, owner_uid, relax,
                                         cb_ok, reuseport_ok)) {
                        found = true;
                        break;
                }
        }

        spin_unlock(&any_head->lock);

        return found;
}

/*
 * Find an open port number for the socket.  Returns with the
 * inet_bind_hashbucket locks held if successful.
 *
 * On success the return value is the bhash bucket head for the chosen port,
 * with head->lock taken via spin_lock_bh() and the bhash2 bucket lock
 * (*head2_ret) also taken; neither is released here.  The outputs are:
 *   *port_ret  - the chosen port
 *   *tb_ret    - the matching bind bucket, or NULL if the chain has none
 *   *tb2_ret   - the result of inet_bind2_bucket_find() for that port
 *   *head2_ret - the bhash2 bucket head whose lock is held
 * Returns NULL, with no lock held, when every candidate port was rejected.
 */
static struct inet_bind_hashbucket *
inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret,
                        struct inet_bind2_bucket **tb2_ret,
                        struct inet_bind_hashbucket **head2_ret, int *port_ret)
{
        struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
        int i, low, high, attempt_half, port, l3mdev;
        struct inet_bind_hashbucket *head, *head2;
        struct net *net = sock_net(sk);
        struct inet_bind2_bucket *tb2;
        struct inet_bind_bucket *tb;
        u32 remaining, offset;
        bool relax = false;

        l3mdev = inet_sk_bound_l3mdev(sk);
ports_exhausted:
        /* attempt_half: 0 = scan the whole range, 1 = lower half first,
         * 2 = upper half.  Halving is only used for SK_CAN_REUSE sockets.
         */
        attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
        inet_sk_get_local_port_range(sk, &low, &high);
        high++; /* [32768, 60999] -> [32768, 61000[ */
        /* Ranges of fewer than 4 ports are not split. */
        if (high - low < 4)
                attempt_half = 0;
        if (attempt_half) {
                /* Split point: low plus an even offset of about half the range. */
                int half = low + (((high - low) >> 2) << 1);

                if (attempt_half == 1)
                        high = half;
                else
                        low = half;
        }
        remaining = high - low;
        /* Round the number of candidates down to an even count. */
        if (likely(remaining > 1))
                remaining &= ~1U;

        offset = get_random_u32_below(remaining);
        /* __inet_hash_connect() favors ports having @low parity
         * We do the opposite to not pollute connect() users.
         */
        offset |= 1U;

other_parity_scan:
        port = low + offset;
        /* Step by 2 so that one pass only visits ports of a single parity,
         * wrapping around to the start of the range when passing @high.
         */
        for (i = 0; i < remaining; i += 2, port += 2) {
                if (unlikely(port >= high))
                        port -= remaining;
                if (inet_is_local_reserved_port(net, port))
                        continue;
                head = &hinfo->bhash[inet_bhashfn(net, port,
                                                  hinfo->bhash_size)];
                spin_lock_bh(&head->lock);
                /* A socket bound to a specific address must first be checked
                 * against sockets bound to the wildcard address on this port.
                 */
                if (inet_use_bhash2_on_bind(sk)) {
                        if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, relax, false))
                                goto next_port;
                }

                head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
                spin_lock(&head2->lock);
                tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
                inet_bind_bucket_for_each(tb, &head->chain)
                        if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
                                if (!inet_csk_bind_conflict(sk, tb, tb2,
                                                            relax, false))
                                        goto success;
                                /* Conflict: drop head2's lock here, head's
                                 * lock is dropped at next_port.
                                 */
                                spin_unlock(&head2->lock);
                                goto next_port;
                        }
                /* No bind bucket exists for this port yet: it is free. */
                tb = NULL;
                goto success;
next_port:
                spin_unlock_bh(&head->lock);
                cond_resched();
        }

        /* The first pass used an odd offset; decrementing makes it even, so
         * the other parity is scanned exactly once before giving up.
         */
        offset--;
        if (!(offset & 1))
                goto other_parity_scan;

        if (attempt_half == 1) {
                /* OK we now try the upper half of the range */
                attempt_half = 2;
                goto other_half_scan;
        }

        /* Last resort: redo the whole search once with relax set, if the
         * ip_autobind_reuse sysctl allows it.
         */
        if (READ_ONCE(net->ipv4.sysctl_ip_autobind_reuse) && !relax) {
                /* We still have a chance to connect to different destinations */
                relax = true;
                goto ports_exhausted;
        }
        return NULL;
success:
        /* Both head->lock and head2->lock are still held at this point. */
        *port_ret = port;
        *tb_ret = tb;
        *tb2_ret = tb2;
        *head2_ret = head2;
        return head;
}

/* Return 1 when @sk qualifies for @tb's reuseport fast path, 0 otherwise. */
static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
                                     struct sock *sk)
{
        kuid_t owner = sock_i_uid(sk);

        /* No fast path unless the bucket has fastreuseport enabled, the
         * socket has sk_reuseport set and no reuseport group attached, and
         * the socket's uid equals the uid recorded in the bucket.
         */
        if (tb->fastreuseport <= 0 || !sk->sk_reuseport ||
            rcu_access_pointer(sk->sk_reuseport_cb) ||
            !uid_eq(tb->fastuid, owner))
                return 0;

        /* With FASTREUSEPORT_ANY no address comparison is needed.  The
         * rcv_saddr only has to be checked if this tb was once marked
         * without fastreuseport and then was reset, as we can only know
         * that the fast_*rcv_saddr doesn't have any conflicts with the
         * socks on the owners list.
         */
        if (tb->fastreuseport == FASTREUSEPORT_ANY)
                return 1;

        /* Compare the address saved in the bucket with the socket's one;
         * only the saved (first) side is allowed to act as a wildcard.
         */
#if IS_ENABLED(CONFIG_IPV6)
        if (tb->fast_sk_family == AF_INET6) {
                return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr,
                                            inet6_rcv_saddr(sk),
                                            tb->fast_rcv_saddr,
                                            sk->sk_rcv_saddr,
                                            tb->fast_ipv6_only,
                                            ipv6_only_sock(sk), true, false);
        }
#endif
        return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr,
                                    ipv6_only_sock(sk), true, false);
}

/* Store @mode in tb->fastreuseport and snapshot @sk's uid, receive address,
 * ipv6-only flag and family into the bucket's fast_* fields.
 */
static void inet_csk_save_fastreuseport(struct inet_bind_bucket *tb,
                                        const struct sock *sk,
                                        kuid_t uid, int mode)
{
        tb->fastreuseport = mode;
        tb->fastuid = uid;
        tb->fast_rcv_saddr = sk->sk_rcv_saddr;
        tb->fast_ipv6_only = ipv6_only_sock(sk);
        tb->fast_sk_family = sk->sk_family;
#if IS_ENABLED(CONFIG_IPV6)
        tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
#endif
}

/* Refresh the fastreuse / fastreuseport hints of @tb for socket @sk. */
void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
                               struct sock *sk)
{
        kuid_t uid = sock_i_uid(sk);
        bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
        bool bucket_empty = hlist_empty(&tb->bhash2);

        /* An empty bucket takes the socket's reuse setting as is; a
         * populated one can only have fastreuse cleared.
         */
        if (bucket_empty)
                tb->fastreuse = reuse;
        else if (!reuse)
                tb->fastreuse = 0;

        if (!sk->sk_reuseport) {
                tb->fastreuseport = 0;
                return;
        }

        if (bucket_empty) {
                inet_csk_save_fastreuseport(tb, sk, uid, FASTREUSEPORT_ANY);
                return;
        }

        /* We didn't match or we don't have fastreuseport set on the tb, but
         * we have sk_reuseport set on this socket and we know that there are
         * no bind conflicts with this socket in this tb, so reset our tb's
         * reuseport settings so that any subsequent sockets that match our
         * current socket will be put on the fast path.
         *
         * If we reset we need to set FASTREUSEPORT_STRICT so we do extra
         * checking for all subsequent sk_reuseport socks.
         */
        if (!sk_reuseport_match(tb, sk))
                inet_csk_save_fastreuseport(tb, sk, uid, FASTREUSEPORT_STRICT);
}

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 * We try to allocate an odd port (and leave even ports for connect())
 *
 * Returns 0 on success and -EADDRINUSE on every failure path: no open port
 * found, a bind conflict, or a bind bucket that could not be created (ret is
 * only ever changed from its initial -EADDRINUSE on the success path).
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
        struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
        bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
        bool found_port = false, check_bind_conflict = true;
        bool bhash_created = false, bhash2_created = false;
        int ret = -EADDRINUSE, port = snum, l3mdev;
        struct inet_bind_hashbucket *head, *head2;
        struct inet_bind2_bucket *tb2 = NULL;
        struct inet_bind_bucket *tb = NULL;
        bool head2_lock_acquired = false;
        struct net *net = sock_net(sk);

        l3mdev = inet_sk_bound_l3mdev(sk);

        if (!port) {
                /* No port requested: pick one.  On success both head->lock
                 * and head2->lock are held and conflicts were already checked.
                 */
                head = inet_csk_find_open_port(sk, &tb, &tb2, &head2, &port);
                if (!head)
                        return ret;

                head2_lock_acquired = true;

                /* Both buckets already exist: nothing left to create. */
                if (tb && tb2)
                        goto success;
                found_port = true;
        } else {
                /* Explicit port: lock its bhash bucket and look for an
                 * existing bind bucket; tb is NULL if the loop finds none.
                 */
                head = &hinfo->bhash[inet_bhashfn(net, port,
                                                  hinfo->bhash_size)];
                spin_lock_bh(&head->lock);
                inet_bind_bucket_for_each(tb, &head->chain)
                        if (inet_bind_bucket_match(tb, net, port, l3mdev))
                                break;
        }

        if (!tb) {
                tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net,
                                             head, port, l3mdev);
                if (!tb)
                        goto fail_unlock;
                /* Remember that we created it, so a later failure frees it. */
                bhash_created = true;
        }

        if (!found_port) {
                /* Explicit-port path only: the conflict checks can be skipped
                 * when the bucket already has owners and either the socket
                 * forces reuse, the bucket's fastreuse hint applies, or the
                 * reuseport fast path matches.
                 */
                if (!hlist_empty(&tb->bhash2)) {
                        if (sk->sk_reuse == SK_FORCE_REUSE ||
                            (tb->fastreuse > 0 && reuse) ||
                            sk_reuseport_match(tb, sk))
                                check_bind_conflict = false;
                }

                /* This check takes the addr-any bucket lock internally, so it
                 * runs before head2->lock is acquired below.
                 */
                if (check_bind_conflict && inet_use_bhash2_on_bind(sk)) {
                        if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, true, true))
                                goto fail_unlock;
                }

                head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
                spin_lock(&head2->lock);
                head2_lock_acquired = true;
                tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
        }

        if (!tb2) {
                tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep,
                                               net, head2, tb, sk);
                if (!tb2)
                        goto fail_unlock;
                bhash2_created = true;
        }

        if (!found_port && check_bind_conflict) {
                if (inet_csk_bind_conflict(sk, tb, tb2, true, true))
                        goto fail_unlock;
        }

success:
        inet_csk_update_fastreuse(tb, sk);

        /* Hash the socket into the buckets unless it is already bound. */
        if (!inet_csk(sk)->icsk_bind_hash)
                inet_bind_hash(sk, tb, tb2, port);
        WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
        WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2);
        ret = 0;

fail_unlock:
        /* Also reached on success (ret == 0); only a failure destroys the
         * buckets that this call created.
         */
        if (ret) {
                if (bhash2_created)
                        inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2);
                if (bhash_created)
                        inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
        }
        if (head2_lock_acquired)
                spin_unlock(&head2->lock);
        spin_unlock_bh(&head->lock);
        return ret;
}
EXPORT_SYMBOL_GPL(inet_csk_get_port);

/*
 * Wait for an incoming connection, avoid race conditions. This must be called
 * with the socket locked.
 *
 * @timeo is the remaining wait time; it is updated from schedule_timeout()
 * on each sleep.  Returns 0 once the accept queue is non-empty, -EINVAL if
 * the socket is no longer in TCP_LISTEN, sock_intr_errno(timeo) if a signal
 * is pending, or -EAGAIN once the timeout has run out.  The socket lock is
 * dropped while sleeping and held again on return.
 */
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        DEFINE_WAIT(wait);
        int err;

        /*
         * True wake-one mechanism for incoming connections: only
         * one process gets woken up, not the 'whole herd'.
         * Since we do not 'race & poll' for established sockets
         * anymore, the common case will execute the loop only once.
         *
         * Subtle issue: "add_wait_queue_exclusive()" will be added
         * after any current non-exclusive waiters, and we know that
         * it will always _stay_ after any new non-exclusive waiters
         * because all non-exclusive waiters are added at the
         * beginning of the wait-queue. As such, it's ok to "drop"
         * our exclusiveness temporarily when we get woken up without
         * having to remove and re-insert us on the wait queue.
         */
        for (;;) {
                prepare_to_wait_exclusive(sk_sleep(sk), &wait,
                                          TASK_INTERRUPTIBLE);
                release_sock(sk);
                /* Only sleep if nothing was queued in the meantime. */
                if (reqsk_queue_empty(&icsk->icsk_accept_queue))
                        timeo = schedule_timeout(timeo);
                sched_annotate_sleep();
                lock_sock(sk);
                /* The exit conditions are tested in priority order; err is
                 * set just before each test so that the matching break
                 * leaves the right value behind.
                 */
                err = 0;
                if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
                        break;
                err = -EINVAL;
                if (sk->sk_state != TCP_LISTEN)
                        break;
                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        break;
                err = -EAGAIN;
                if (!timeo)
                        break;
        }
        finish_wait(sk_sleep(sk), &wait);
        return err;
}

/*
 * This will accept the next outstanding connection.
 *
 * @sk:  listening socket; locked and released inside this function.
 * @arg: arg->flags is consulted for O_NONBLOCK; arg->is_empty is set to
 *       whether the accept queue is empty after the request was removed;
 *       arg->err is written only on failure.
 *
 * Returns the child socket, or NULL with arg->err set to -EINVAL (socket not
 * in TCP_LISTEN), -EAGAIN (queue empty and no wait time), or the error from
 * inet_csk_wait_for_connect().
 */
struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct request_sock_queue *queue = &icsk->icsk_accept_queue;
        struct request_sock *req;
        struct sock *newsk;
        int error;

        lock_sock(sk);

        /* We need to make sure that this socket is listening,
         * and that it has something pending.
         */
        error = -EINVAL;
        if (sk->sk_state != TCP_LISTEN)
                goto out_err;

        /* Find already established connection */
        if (reqsk_queue_empty(queue)) {
                long timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);

                /* If this is a non blocking socket don't sleep */
                error = -EAGAIN;
                if (!timeo)
                        goto out_err;

                error = inet_csk_wait_for_connect(sk, timeo);
                if (error)
                        goto out_err;
        }
        req = reqsk_queue_remove(queue, sk);
        arg->is_empty = reqsk_queue_empty(queue);
        newsk = req->sk;

        /* tfo_listener is tested once without the lock and re-tested under
         * fastopenq.lock before the request is handed over.
         */
        if (sk->sk_protocol == IPPROTO_TCP &&
            tcp_rsk(req)->tfo_listener) {
                spin_lock_bh(&queue->fastopenq.lock);
                if (tcp_rsk(req)->tfo_listener) {
                        /* We are still waiting for the final ACK from 3WHS
                         * so can't free req now. Instead, we set req->sk to
                         * NULL to signify that the child socket is taken
                         * so reqsk_fastopen_remove() will free the req
                         * when 3WHS finishes (or is aborted).
                         */
                        req->sk = NULL;
                        /* Clearing req skips the reqsk_put() below. */
                        req = NULL;
                }
                spin_unlock_bh(&queue->fastopenq.lock);
        }

out:
        release_sock(sk);
        if (newsk && mem_cgroup_sockets_enabled) {
                int amt = 0;

                /* atomically get the memory usage, set and charge the
                 * newsk->sk_memcg.
                 */
                lock_sock(newsk);

                mem_cgroup_sk_alloc(newsk);
                if (newsk->sk_memcg) {
                        /* The socket has not been accepted yet, no need
                         * to look at newsk->sk_wmem_queued.
                         */
                        amt = sk_mem_pages(newsk->sk_forward_alloc +
                                           atomic_read(&newsk->sk_rmem_alloc));
                }

                if (amt)
                        mem_cgroup_charge_skmem(newsk->sk_memcg, amt,
                                                GFP_KERNEL | __GFP_NOFAIL);

                release_sock(newsk);
        }
        /* Drop the request reference unless it was handed over above. */
        if (req)
                reqsk_put(req);

        if (newsk)
                inet_init_csk_locks(newsk);

        return newsk;
out_err:
        /* Failure: report the error through @arg and take the common exit
         * path with neither a child socket nor a request.
         */
        newsk = NULL;
        req = NULL;
        arg->err = error;
        goto out;
}
EXPORT_SYMBOL(inet_csk_accept);

/*
 * Using different timers for retransmit, delayed acks and probes
 * We may wish use just one timer maintaining a list of expire jiffies
 * to optimize.
 */
void inet_csk_init_xmit_timers(struct sock *sk,
                               void (*retransmit_handler)(struct timer_list *t),
                               void (*delack_handler)(struct timer_list *t),
                               void (*keepalive_handler)(struct timer_list *t))
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        /* Attach one handler to each of the three timers. */
        timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0);
        timer_setup(&icsk->icsk_delack_timer, delack_handler, 0);
        timer_setup(&sk->sk_timer, keepalive_handler, 0);

        /* Start with no pending flags set. */
        icsk->icsk_ack.pending = 0;
        icsk->icsk_pending = 0;
}
EXPORT_SYMBOL(inet_csk_init_xmit_timers);

/* Clear the pending flags, then stop the retransmit, delayed-ACK and
 * sk_timer timers of @sk.
 */
void inet_csk_clear_xmit_timers(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        icsk->icsk_ack.pending = 0;
        icsk->icsk_pending = 0;

        sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
        sk_stop_timer(sk, &icsk->icsk_delack_timer);
        sk_stop_timer(sk, &sk->sk_timer);
}
EXPORT_SYMBOL(inet_csk_clear_xmit_timers);

/* Same as inet_csk_clear_xmit_timers(), but stops the three timers with
 * sk_stop_timer_sync().
 */
void inet_csk_clear_xmit_timers_sync(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        /* ongoing timer handlers need to acquire socket lock. */
        sock_not_owned_by_me(sk);

        icsk->icsk_ack.pending = 0;
        icsk->icsk_pending = 0;

        sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer);
        sk_stop_timer_sync(sk, &icsk->icsk_delack_timer);
        sk_stop_timer_sync(sk, &sk->sk_timer);
}

/* Stop @sk's sk_timer, the timer that inet_csk_init_xmit_timers() wires to
 * the keepalive handler.
 */
void inet_csk_delete_keepalive_timer(struct sock *sk)
{
        struct timer_list *keepalive = &sk->sk_timer;

        sk_stop_timer(sk, keepalive);
}
EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);

/* Re-arm @sk's sk_timer to expire @len jiffies from now. */
void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
        const unsigned long expires = jiffies + len;

        sk_reset_timer(sk, &sk->sk_timer, expires);
}
EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);

struct dst_entry *inet_csk_route_req(const struct sock *sk,
                                     struct flowi4 *fl4,
                                     const struct request_sock *req)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct net *net = read_pnet(&ireq->ireq_net);
        struct ip_options_rcu *opt;
        struct rtable *rt;

        rcu_read_lock();
        opt = rcu_dereference(ireq->ireq_opt);

        flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
                           ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
                           sk->sk_protocol, inet_sk_flowi_flags(sk),
                           (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
                           ireq->ir_loc_addr, ireq->ir_rmt_port,
                           htons(ireq->ir_num), sk->sk_uid);
        security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
        rt = ip_route_output_flow(net, fl4, sk);
        if (IS_ERR(rt))
                goto no_route;
        if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
                goto route_err;
        rcu_read_unlock();
        return &rt->dst;

route_err:
        ip_rt_put(rt);
no_route:
        rcu_read_unlock();
        __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
        return NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_route_req);

struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
                                            struct sock *newsk,
                                            const struct request_sock *req)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct net *net = read_pnet(&ireq->ireq_net);
        struct inet_sock *newinet = inet_sk(newsk);
        struct ip_options_rcu *opt;
        struct flowi4 *fl4;
        struct rtable *rt;

        opt = rcu_dereference(ireq->ireq_opt);
        fl4 = &newinet->cork.fl.u.ip4;

        flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
                           ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
                           sk->sk_protocol, inet_sk_flowi_flags(sk),
                           (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
                           ireq->ir_loc_addr, ireq->ir_rmt_port,
                           htons(ireq->ir_num), sk->sk_uid);
        security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
        rt = ip_route_output_flow(net, fl4, sk);
        if (IS_ERR(rt))
                goto no_route;
        if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
                goto route_err;
        return &rt->dst;

route_err:
        ip_rt_put(rt);
no_route:
        __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
        return NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);

/* Decide when to expire the request and when to resend SYN-ACK */
static void syn_ack_recalc(struct request_sock *req,
                           const int max_syn_ack_retries,
                           const u8 rskq_defer_accept,
                           int *expire, int *resend)
{
        if (!rskq_defer_accept) {
                *expire = req->num_timeout >= max_syn_ack_retries;
                *resend = 1;
                return;
        }
        *expire = req->num_timeout >= max_syn_ack_retries &&
                  (!inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept);
        /* Do not resend while waiting for data after ACK,
         * start to resend on end of deferring period to give
         * last chance for data or ACK to create established socket.
         */
        *resend = !inet_rsk(req)->acked ||
                  req->num_timeout >= rskq_defer_accept - 1;
}

/*
 * Retransmit the SYN-ACK for @req through its rsk_ops->rtx_syn_ack hook.
 *
 * Returns the hook's result; req->num_retrans is only incremented when the
 * hook reports success (0).
 */
int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
{
        int err;

        err = req->rsk_ops->rtx_syn_ack(parent, req);
        if (err)
                return err;

        req->num_retrans++;
        return 0;
}
EXPORT_SYMBOL(inet_rtx_syn_ack);

/*
 * Allocate a request sock from @ops->slab and initialise its generic fields.
 *
 * @ops:             request sock operations; supplies the slab cache and is
 *                   stored in req->rsk_ops
 * @sk_listener:     listener whose sk_prot is copied into the new request
 * @attach_listener: when true, take a reference on @sk_listener and record
 *                   it in req->rsk_listener; otherwise rsk_listener is NULL
 *
 * Returns the new request with rsk_refcnt set to 0, or NULL when the
 * allocation fails or the listener's refcount has already reached zero.
 */
static struct request_sock *
reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener,
                   bool attach_listener)
{
        struct request_sock *req;

        req = kmem_cache_alloc_noprof(ops->slab, GFP_ATOMIC | __GFP_NOWARN);
        if (!req)
                return NULL;
        req->rsk_listener = NULL;
        if (attach_listener) {
                /* Do not resurrect a listener that is already being freed. */
                if (unlikely(!refcount_inc_not_zero(&sk_listener->sk_refcnt))) {
                        kmem_cache_free(ops->slab, req);
                        return NULL;
                }
                req->rsk_listener = sk_listener;
        }
        req->rsk_ops = ops;
        req_to_sk(req)->sk_prot = sk_listener->sk_prot;
        sk_node_init(&req_to_sk(req)->sk_node);
        sk_tx_queue_clear(req_to_sk(req));
        req->saved_syn = NULL;
        req->syncookie = 0;
        req->timeout = 0;
        req->num_timeout = 0;
        req->num_retrans = 0;
        req->sk = NULL;
        /* The refcount starts at 0; reqsk_queue_hash_req() sets the real
         * value once the request has been inserted into the hash table.
         */
        refcount_set(&req->rsk_refcnt, 0);

        return req;
}
/* Wraps reqsk_alloc_noprof() in alloc_hooks(). */
#define reqsk_alloc(...)        alloc_hooks(reqsk_alloc_noprof(__VA_ARGS__))

/*
 * Allocate a request sock and initialise its inet-specific part.
 *
 * On top of reqsk_alloc() this clears the option pointers and the cookie,
 * puts the request in TCP_NEW_SYN_RECV, inherits netns and address family
 * from @sk_listener and sets the initial timeout to TCP_TIMEOUT_INIT.
 *
 * Returns the request, or NULL when reqsk_alloc() fails.
 */
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
                                      struct sock *sk_listener,
                                      bool attach_listener)
{
        struct inet_request_sock *ireq;
        struct request_sock *req;

        req = reqsk_alloc(ops, sk_listener, attach_listener);
        if (!req)
                return NULL;

        ireq = inet_rsk(req);
        ireq->ireq_opt = NULL;
#if IS_ENABLED(CONFIG_IPV6)
        ireq->pktopts = NULL;
#endif
        atomic64_set(&ireq->ir_cookie, 0);
        ireq->ireq_state = TCP_NEW_SYN_RECV;
        write_pnet(&ireq->ireq_net, sock_net(sk_listener));
        ireq->ireq_family = sk_listener->sk_family;
        req->timeout = TCP_TIMEOUT_INIT;

        return req;
}
EXPORT_SYMBOL(inet_reqsk_alloc);

/*
 * Clone @req so that it can be attached to another listener @sk.
 *
 * The clone is allocated from the same slab as @req and receives a copy of
 * everything except the region between sk_dontcopy_begin and
 * sk_dontcopy_end of the embedded struct sock; the hash node is
 * re-initialised and the queue mappings / incoming cpu are carried over by
 * hand.  rsk_listener of the clone is set to @sk without taking a new
 * reference, so the caller must already hold one on @sk.
 *
 * The clone's rsk_refcnt is NOT initialised here; callers set it.
 *
 * Returns the clone, or NULL on allocation failure, in which case the
 * caller's reference on @sk has been dropped.
 */
static struct request_sock *inet_reqsk_clone(struct request_sock *req,
                                             struct sock *sk)
{
        struct sock *req_sk, *nreq_sk;
        struct request_sock *nreq;

        nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
        if (!nreq) {
                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);

                /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */
                sock_put(sk);
                return NULL;
        }

        req_sk = req_to_sk(req);
        nreq_sk = req_to_sk(nreq);

        /* Copy the head of struct sock, up to the do-not-copy region ... */
        memcpy(nreq_sk, req_sk,
               offsetof(struct sock, sk_dontcopy_begin));
        /* ... and everything after it, up to the full object size. */
        unsafe_memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,
                      req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end),
                      /* alloc is larger than struct, see above */);

        sk_node_init(&nreq_sk->sk_node);
        nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;
#endif
        nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;

        nreq->rsk_listener = sk;

        /* We need not acquire fastopenq->lock
         * because the child socket is locked in inet_csk_listen_stop().
         */
        if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener)
                rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);

        return nreq;
}

static void reqsk_queue_migrated(struct request_sock_queue *queue,
                                 const struct request_sock *req)
{
        if (req->num_timeout == 0)
                atomic_inc(&queue->young);
        atomic_inc(&queue->qlen);
}

/*
 * Clear the pointer fields of @req: saved_syn always, plus ipv6_opt and
 * pktopts when IPv6 is enabled, or ireq_opt otherwise.
 *
 * Callers in this file invoke it on whichever of the original / cloned
 * request is about to be released after a migration attempt - presumably
 * so that releasing it does not free data the surviving copy still points
 * to; confirm against the request destructor.
 */
static void reqsk_migrate_reset(struct request_sock *req)
{
        req->saved_syn = NULL;
#if IS_ENABLED(CONFIG_IPV6)
        inet_rsk(req)->ipv6_opt = NULL;
        inet_rsk(req)->pktopts = NULL;
#else
        inet_rsk(req)->ireq_opt = NULL;
#endif
}

/*
 * Remove @req from the ehash table and stop its timer.
 *
 * If the request is hashed it is deleted under the ehash bucket lock.  If
 * its timer is pending and could be deleted, the reference held on behalf
 * of the timer is dropped.
 *
 * NOTE(review): timer_pending() is false while the handler itself is
 * running, so del_timer_sync() is skipped in that window; confirm that all
 * callers tolerate a concurrently executing reqsk_timer_handler().
 *
 * Returns true if req was found in the ehash table.
 */
static bool reqsk_queue_unlink(struct request_sock *req)
{
        struct sock *sk = req_to_sk(req);
        bool found = false;

        if (sk_hashed(sk)) {
                struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
                spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);

                spin_lock(lock);
                found = __sk_nulls_del_node_init_rcu(sk);
                spin_unlock(lock);
        }
        if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
                reqsk_put(req);
        return found;
}

bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
{
        bool unlinked = reqsk_queue_unlink(req);

        if (unlinked) {
                reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
                reqsk_put(req);
        }
        return unlinked;
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);

/*
 * Drop @req from @sk's request queue via inet_csk_reqsk_queue_drop() and
 * then release one more reference on it, whether or not the drop found the
 * request still hashed.
 */
void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
{
        inet_csk_reqsk_queue_drop(sk, req);
        reqsk_put(req);
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);

/*
 * Timer callback installed on req->rsk_timer.
 *
 * If the listener has left TCP_LISTEN, try to migrate the request to
 * another listener picked by reuseport_migrate_sock(): the clone (nreq)
 * then stands in for req for the rest of the function, while oreq keeps
 * pointing at the original request.
 *
 * syn_ack_recalc() then decides whether the request expired and whether
 * the SYN-ACK should be retransmitted.  The request is either re-armed
 * (and, when migrating, the clone replaces the original in the ehash
 * table) or dropped.
 */
static void reqsk_timer_handler(struct timer_list *t)
{
        struct request_sock *req = from_timer(req, t, rsk_timer);
        struct request_sock *nreq = NULL, *oreq = req;
        struct sock *sk_listener = req->rsk_listener;
        struct inet_connection_sock *icsk;
        struct request_sock_queue *queue;
        struct net *net;
        int max_syn_ack_retries, qlen, expire = 0, resend = 0;

        if (inet_sk_state_load(sk_listener) != TCP_LISTEN) {
                struct sock *nsk;

                nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL);
                if (!nsk)
                        goto drop;

                /* On failure inet_reqsk_clone() has already released nsk. */
                nreq = inet_reqsk_clone(req, nsk);
                if (!nreq)
                        goto drop;

                /* The new timer for the cloned req can decrease the 2
                 * by calling inet_csk_reqsk_queue_drop_and_put(), so
                 * hold another count to prevent use-after-free and
                 * call reqsk_put() just before return.
                 */
                refcount_set(&nreq->rsk_refcnt, 2 + 1);
                timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
                reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req);

                /* From here on operate on the clone and the new listener. */
                req = nreq;
                sk_listener = nsk;
        }

        icsk = inet_csk(sk_listener);
        net = sock_net(sk_listener);
        /* A per-socket retry limit, when set, overrides the sysctl. */
        max_syn_ack_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
                READ_ONCE(net->ipv4.sysctl_tcp_synack_retries);
        /* Normally all the openreqs are young and become mature
         * (i.e. converted to established socket) for first timeout.
         * If synack was not acknowledged for 1 second, it means
         * one of the following things: synack was lost, ack was lost,
         * rtt is high or nobody planned to ack (i.e. synflood).
         * When server is a bit loaded, queue is populated with old
         * open requests, reducing effective size of queue.
         * When server is well loaded, queue size reduces to zero
         * after several minutes of work. It is not synflood,
         * it is normal operation. The solution is pruning
         * too old entries overriding normal timeout, when
         * situation becomes dangerous.
         *
         * Essentially, we reserve half of room for young
         * embryos; and abort old ones without pity, if old
         * ones are about to clog our table.
         */
        queue = &icsk->icsk_accept_queue;
        qlen = reqsk_queue_len(queue);
        if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) {
                int young = reqsk_queue_len_young(queue) << 1;

                /* Lower the retry budget (never below 2) for as long as
                 * the scaled young count does not exceed the queue length.
                 */
                while (max_syn_ack_retries > 2) {
                        if (qlen < young)
                                break;
                        max_syn_ack_retries--;
                        young <<= 1;
                }
        }
        syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept),
                       &expire, &resend);
        req->rsk_ops->syn_ack_timeout(req);
        /* Keep the request when it has not expired and either no resend is
         * due, the resend succeeded, or the request has been ACKed.
         */
        if (!expire &&
            (!resend ||
             !inet_rtx_syn_ack(sk_listener, req) ||
             inet_rsk(req)->acked)) {
                /* First timeout: the request stops counting as young. */
                if (req->num_timeout++ == 0)
                        atomic_dec(&queue->young);
                mod_timer(&req->rsk_timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX));

                if (!nreq)
                        return;

                /* Migration: swap the clone in for the original request. */
                if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) {
                        /* delete timer */
                        inet_csk_reqsk_queue_drop(sk_listener, nreq);
                        goto no_ownership;
                }

                __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQSUCCESS);
                reqsk_migrate_reset(oreq);
                reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq);
                reqsk_put(oreq);

                /* Drop the extra reference taken when nreq was set up. */
                reqsk_put(nreq);
                return;
        }

        /* Even if we can clone the req, we may need not retransmit any more
         * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another
         * CPU may win the "own_req" race so that inet_ehash_insert() fails.
         */
        if (nreq) {
                __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQFAILURE);
no_ownership:
                reqsk_migrate_reset(nreq);
                reqsk_queue_removed(queue, nreq);
                __reqsk_free(nreq);
        }

drop:
        /* Always drop the original request against its own listener. */
        inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq);
}

/*
 * Arm the request timer for @req (first expiry @timeout jiffies from now),
 * insert the request into the ehash table and only then publish its
 * reference count.
 *
 * The count is set to 2 + 1; NOTE(review): presumably one reference each
 * for the hash table, the timer and the caller - confirm.
 */
static void reqsk_queue_hash_req(struct request_sock *req,
                                 unsigned long timeout)
{
        timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
        mod_timer(&req->rsk_timer, jiffies + timeout);

        inet_ehash_insert(req_to_sk(req), NULL, NULL);
        /* before letting lookups find us, make sure all req fields
         * are committed to memory and refcnt initialized.
         */
        smp_wmb();
        refcount_set(&req->rsk_refcnt, 2 + 1);
}

/*
 * Hash @req with its timer armed for @timeout jiffies, then account it in
 * listener @sk's request queue via inet_csk_reqsk_queue_added().
 */
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
                                   unsigned long timeout)
{
        reqsk_queue_hash_req(req, timeout);
        inet_csk_reqsk_queue_added(sk);
}
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);

/*
 * Give the upper layer protocol attached to @newsk, if any, a chance to
 * set up its state for the clone by invoking its ->clone() hook.
 */
static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk,
                           const gfp_t priority)
{
        struct inet_connection_sock *icsk = inet_csk(newsk);

        if (icsk->icsk_ulp_ops)
                icsk->icsk_ulp_ops->clone(req, newsk, priority);
}

/**
 *        inet_csk_clone_lock - clone an inet socket, and lock its clone
 *        @sk: the socket to clone
 *        @req: request_sock
 *        @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *        Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *inet_csk_clone_lock(const struct sock *sk,
                                 const struct request_sock *req,
                                 const gfp_t priority)
{
        struct sock *newsk = sk_clone_lock(sk, priority);

        if (newsk) {
                struct inet_connection_sock *newicsk = inet_csk(newsk);

                inet_sk_set_state(newsk, TCP_SYN_RECV);
                newicsk->icsk_bind_hash = NULL;
                newicsk->icsk_bind2_hash = NULL;

                inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
                inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
                inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);

                /* listeners have SOCK_RCU_FREE, not the children */
                sock_reset_flag(newsk, SOCK_RCU_FREE);

                inet_sk(newsk)->mc_list = NULL;

                newsk->sk_mark = inet_rsk(req)->ir_mark;
                atomic64_set(&newsk->sk_cookie,
                             atomic64_read(&inet_rsk(req)->ir_cookie));

                newicsk->icsk_retransmits = 0;
                newicsk->icsk_backoff          = 0;
                newicsk->icsk_probes_out  = 0;
                newicsk->icsk_probes_tstamp = 0;

                /* Deinitialize accept_queue to trap illegal accesses. */
                memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));

                inet_clone_ulp(req, newsk, priority);

                security_inet_csk_clone(newsk, req);
        }
        return newsk;
}
EXPORT_SYMBOL_GPL(inet_csk_clone_lock);

/*
 * Final teardown of a connection socket.
 *
 * At this point, there should be no process reference to this
 * socket, and thus no user references at all.  Therefore we
 * can assume the socket waitqueue is inactive and nobody will
 * try to jump onto it.
 *
 * The socket is expected to be in TCP_CLOSE, flagged SOCK_DEAD and
 * unhashed; violations only trigger warnings.  The function runs the
 * protocol's destroy hook, purges the stream queues, frees the xfrm
 * policy, decrements the per-cpu orphan counter and drops one reference.
 */
void inet_csk_destroy_sock(struct sock *sk)
{
        WARN_ON(sk->sk_state != TCP_CLOSE);
        WARN_ON(!sock_flag(sk, SOCK_DEAD));

        /* It cannot be in hash table! */
        WARN_ON(!sk_unhashed(sk));

        /* If inet_sk(sk)->inet_num is non-zero, the socket must be bound */
        WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);

        sk->sk_prot->destroy(sk);

        sk_stream_kill_queues(sk);

        xfrm_sk_free_policy(sk);

        this_cpu_dec(*sk->sk_prot->orphan_count);

        sock_put(sk);
}
EXPORT_SYMBOL(inet_csk_destroy_sock);

/* This function allows forcing the closure of a socket after the call to
 * tcp/dccp_create_openreq_child().
 *
 * It unlocks @sk, drops one of the two references set up by
 * sk_clone_lock(), runs inet_csk_prepare_for_destroy_sock() and finally
 * clears inet_num.
 */
void inet_csk_prepare_forced_close(struct sock *sk)
        __releases(&sk->sk_lock.slock)
{
        /* sk_clone_lock locked the socket and set refcnt to 2 */
        bh_unlock_sock(sk);
        sock_put(sk);
        inet_csk_prepare_for_destroy_sock(sk);
        inet_sk(sk)->inet_num = 0;
}
EXPORT_SYMBOL(inet_csk_prepare_forced_close);

/*
 * Check whether @sk may become a listener with its current upper layer
 * protocol: a ULP that provides no ->clone() hook cannot be propagated to
 * child sockets, so listening is refused.
 *
 * Returns 0 when listening is allowed, -EINVAL otherwise.
 */
static int inet_ulp_can_listen(const struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);

        if (!icsk->icsk_ulp_ops)
                return 0;

        return icsk->icsk_ulp_ops->clone ? 0 : -EINVAL;
}

/*
 * Move @sk into the listening state.
 *
 * Allocates the accept queue, resets the backlog counter and delayed-ACK
 * state, marks the socket TCP_LISTEN, validates the local port through the
 * protocol's get_port hook and finally hashes the socket.
 *
 * Returns 0 on success.  On failure the error from the ULP check, get_port
 * or hash is returned; for the latter two the socket is put back into
 * TCP_CLOSE first.
 */
int inet_csk_listen_start(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct inet_sock *inet = inet_sk(sk);
        int err;

        err = inet_ulp_can_listen(sk);
        if (unlikely(err))
                return err;

        reqsk_queue_alloc(&icsk->icsk_accept_queue);

        sk->sk_ack_backlog = 0;
        inet_csk_delack_init(sk);

        /* There is race window here: we announce ourselves listening,
         * but this transition is still not validated by get_port().
         * It is OK, because this socket enters to hash table only
         * after validation is complete.
         */
        inet_sk_state_store(sk, TCP_LISTEN);

        err = sk->sk_prot->get_port(sk, inet->inet_num);
        if (err)
                goto fail;

        inet->inet_sport = htons(inet->inet_num);

        sk_dst_reset(sk);
        err = sk->sk_prot->hash(sk);
        if (unlikely(err))
                goto fail;

        return 0;

fail:
        inet_sk_set_state(sk, TCP_CLOSE);
        return err;
}
EXPORT_SYMBOL_GPL(inet_csk_listen_start);

/*
 * Dispose of a child socket that will never be accepted.
 *
 * @sk:    listener the child belongs to
 * @req:   request sock the child was created from
 * @child: the child socket to destroy
 *
 * The child is disconnected, orphaned, counted as an orphan and destroyed.
 * For a TCP Fast Open request the child's fastopen_rsk back-pointer is
 * cleared first.
 */
static void inet_child_forget(struct sock *sk, struct request_sock *req,
                              struct sock *child)
{
        sk->sk_prot->disconnect(child, O_NONBLOCK);

        sock_orphan(child);

        /* Balanced by the this_cpu_dec() in inet_csk_destroy_sock(). */
        this_cpu_inc(*sk->sk_prot->orphan_count);

        if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
                BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
                BUG_ON(sk != req->rsk_listener);

                /* Paranoid, to prevent race condition if
                 * an inbound pkt destined for child is
                 * blocked by sock lock in tcp_v4_rcv().
                 * Also to satisfy an assertion in
                 * tcp_v4_destroy_sock().
                 */
                RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL);
        }
        inet_csk_destroy_sock(child);
}

/*
 * Append @req, now carrying the established @child, to the tail of
 * listener @sk's accept queue.
 *
 * If @sk is no longer in TCP_LISTEN the child is destroyed through
 * inet_child_forget() instead.
 *
 * Returns @child when it was queued, NULL when it was discarded.
 */
struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
                                      struct request_sock *req,
                                      struct sock *child)
{
        struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;

        spin_lock(&queue->rskq_lock);
        if (likely(sk->sk_state == TCP_LISTEN)) {
                req->sk = child;
                req->dl_next = NULL;
                if (queue->rskq_accept_head)
                        queue->rskq_accept_tail->dl_next = req;
                else
                        WRITE_ONCE(queue->rskq_accept_head, req);
                queue->rskq_accept_tail = req;
                sk_acceptq_added(sk);
        } else {
                inet_child_forget(sk, req, child);
                child = NULL;
        }
        spin_unlock(&queue->rskq_lock);

        return child;
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_add);

/*
 * Finish the hand-over from request sock @req to the new @child socket.
 *
 * @sk:      listener that should receive the child
 * @child:   freshly created, still locked child socket
 * @req:     request the child was created from
 * @own_req: true when this context won ownership of @req
 *
 * With ownership, @req is dropped from its listener's request queue and
 * the child is queued for accept().  If @sk differs from req->rsk_listener
 * the request is first cloned for @sk.
 *
 * Returns @child on success.  Otherwise the child is unlocked, one
 * reference on it is dropped and NULL is returned.
 */
struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
                                         struct request_sock *req, bool own_req)
{
        if (own_req) {
                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
                reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);

                if (sk != req->rsk_listener) {
                        /* another listening sk has been selected,
                         * migrate the req to it.
                         */
                        struct request_sock *nreq;

                        /* hold a refcnt for the nreq->rsk_listener
                         * which is assigned in inet_reqsk_clone()
                         */
                        sock_hold(sk);
                        /* On failure inet_reqsk_clone() drops that refcnt. */
                        nreq = inet_reqsk_clone(req, sk);
                        if (!nreq) {
                                inet_child_forget(sk, req, child);
                                goto child_put;
                        }

                        refcount_set(&nreq->rsk_refcnt, 1);
                        if (inet_csk_reqsk_queue_add(sk, nreq, child)) {
                                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQSUCCESS);
                                reqsk_migrate_reset(req);
                                reqsk_put(req);
                                return child;
                        }

                        /* The new listener stopped listening meanwhile. */
                        __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
                        reqsk_migrate_reset(nreq);
                        __reqsk_free(nreq);
                } else if (inet_csk_reqsk_queue_add(sk, req, child)) {
                        return child;
                }
        }
        /* Too bad, another child took ownership of the request, undo. */
child_put:
        bh_unlock_sock(child);
        sock_put(child);
        return NULL;
}
EXPORT_SYMBOL(inet_csk_complete_hashdance);

/*
 *        This routine closes sockets which have been at least partially
 *        opened, but not yet accepted.
 *
 *        Every request still sitting in the accept queue of @sk is removed.
 *        Its child is either migrated to another listener chosen by
 *        reuseport_migrate_sock() or destroyed.  Afterwards any requests
 *        parked on the fastopen reset list are released.
 */
void inet_csk_listen_stop(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct request_sock_queue *queue = &icsk->icsk_accept_queue;
        struct request_sock *next, *req;

        /* Following specs, it would be better either to send FIN
         * (and enter FIN-WAIT-1, it is normal close)
         * or to send active reset (abort).
         * Certainly, it is pretty dangerous while synflood, but it is
         * bad justification for our negligence 8)
         * To be honest, we are not able to make either
         * of the variants now.                        --ANK
         */
        while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
                struct sock *child = req->sk, *nsk;
                struct request_sock *nreq;

                local_bh_disable();
                bh_lock_sock(child);
                WARN_ON(sock_owned_by_user(child));
                /* Keep the child alive until the sock_put() below. */
                sock_hold(child);

                nsk = reuseport_migrate_sock(sk, child, NULL);
                if (nsk) {
                        /* On failure inet_reqsk_clone() releases nsk. */
                        nreq = inet_reqsk_clone(req, nsk);
                        if (nreq) {
                                refcount_set(&nreq->rsk_refcnt, 1);

                                if (inet_csk_reqsk_queue_add(nsk, nreq, child)) {
                                        __NET_INC_STATS(sock_net(nsk),
                                                        LINUX_MIB_TCPMIGRATEREQSUCCESS);
                                        reqsk_migrate_reset(req);
                                } else {
                                        __NET_INC_STATS(sock_net(nsk),
                                                        LINUX_MIB_TCPMIGRATEREQFAILURE);
                                        reqsk_migrate_reset(nreq);
                                        __reqsk_free(nreq);
                                }

                                /* inet_csk_reqsk_queue_add() has already
                                 * called inet_child_forget() on failure case.
                                 */
                                goto skip_child_forget;
                        }
                }

                inet_child_forget(sk, req, child);
skip_child_forget:
                reqsk_put(req);
                bh_unlock_sock(child);
                local_bh_enable();
                sock_put(child);

                cond_resched();
        }
        if (queue->fastopenq.rskq_rst_head) {
                /* Free all the reqs queued in rskq_rst_head. */
                /* Detach the whole list under the lock, release it outside. */
                spin_lock_bh(&queue->fastopenq.lock);
                req = queue->fastopenq.rskq_rst_head;
                queue->fastopenq.rskq_rst_head = NULL;
                spin_unlock_bh(&queue->fastopenq.lock);
                while (req != NULL) {
                        next = req->dl_next;
                        reqsk_put(req);
                        req = next;
                }
        }
        WARN_ON_ONCE(sk->sk_ack_backlog);
}
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);

/*
 * Fill @uaddr, viewed as a struct sockaddr_in, with the AF_INET family and
 * the destination address and port of @sk.
 */
void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
{
        const struct inet_sock *inet = inet_sk(sk);
        struct sockaddr_in *sin;

        sin = (struct sockaddr_in *)uaddr;
        sin->sin_port = inet->inet_dport;
        sin->sin_addr.s_addr = inet->inet_daddr;
        sin->sin_family = AF_INET;
}
EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);

/*
 * Look up a fresh output route for @sk and install it on the socket.
 *
 * @sk: socket whose addresses, ports, protocol, tos and bound device are
 *      used for the lookup
 * @fl: flow storage; its IPv4 member is filled in by the lookup
 *
 * When the socket carries a source route option, the lookup is made
 * towards the option's first hop rather than inet_daddr.
 *
 * Returns the dst of the new route after handing it to sk_setup_caps(),
 * or NULL when the lookup fails.
 */
static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;
        struct dst_entry *dst = NULL;
        struct flowi4 *fl4;
        struct rtable *rt;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        fl4 = &fl->u.ip4;
        rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
                                   inet->inet_saddr, inet->inet_dport,
                                   inet->inet_sport, sk->sk_protocol,
                                   ip_sock_rt_tos(sk), sk->sk_bound_dev_if);
        if (IS_ERR(rt))
                rt = NULL;
        if (rt) {
                dst = &rt->dst;
                sk_setup_caps(sk, dst);
        }
        rcu_read_unlock();

        /* Return an explicit NULL on failure instead of &rt->dst with a
         * NULL rt, which is only NULL while dst is the first member of
         * struct rtable.
         */
        return dst;
}

/*
 * Apply a new path MTU to the route used by @sk.
 *
 * The socket's cached dst is used when still valid, otherwise the route is
 * rebuilt first.  After the dst's update_pmtu hook has run, the cached dst
 * is checked again and rebuilt once more if it is no longer valid.
 *
 * Returns the dst now in use, or NULL when no route could be obtained.
 */
struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
{
        struct inet_sock *inet = inet_sk(sk);
        struct dst_entry *dst;

        dst = __sk_dst_check(sk, 0);
        if (!dst)
                dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
        if (!dst)
                return NULL;

        dst->ops->update_pmtu(dst, sk, NULL, mtu, true);

        dst = __sk_dst_check(sk, 0);
        if (dst)
                return dst;

        return inet_csk_rebuild_route(sk, &inet->cork.fl);
}
EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);



































































































































































































    5 







    1 






























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
/* SPDX-License-Identifier: GPL-2.0 */
/* rwsem.h: R/W semaphores, public interface
 *
 * Written by David Howells (dhowells@redhat.com).
 * Derived from asm-i386/semaphore.h
 */

#ifndef _LINUX_RWSEM_H
#define _LINUX_RWSEM_H

#include <linux/linkage.h>

#include <linux/types.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/err.h>
#include <linux/cleanup.h>

/*
 * Designated-initializer fragment for the dep_map member of an rwsem.
 * Expands to nothing unless CONFIG_DEBUG_LOCK_ALLOC is set; when it is,
 * the map is named after the lock and marked as a sleeping wait type.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define __RWSEM_DEP_MAP_INIT(lockname)                        \
        .dep_map = {                                        \
                .name = #lockname,                        \
                .wait_type_inner = LD_WAIT_SLEEP,        \
        },
#else
# define __RWSEM_DEP_MAP_INIT(lockname)
#endif

#ifndef CONFIG_PREEMPT_RT

#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
#include <linux/osq_lock.h>
#endif

/*
 * For an uncontended rwsem, count and owner are the only fields a task
 * needs to touch when acquiring the rwsem. So they are put next to each
 * other to increase the chance that they will share the same cacheline.
 *
 * In a contended rwsem, the owner is likely the most frequently accessed
 * field in the structure as the optimistic waiter that holds the osq lock
 * will spin on owner. For an embedded rwsem, other hot fields in the
 * containing structure should be moved further away from the rwsem to
 * reduce the chance that they will share the same cacheline causing
 * cacheline bouncing problem.
 */
struct rw_semaphore {
        /*
         * Lock state word: RWSEM_UNLOCKED_VALUE when free; bit 0
         * (RWSEM_WRITER_LOCKED) is set while a writer holds the lock.
         */
        atomic_long_t count;
        /*
         * Write owner or one of the read owners, as well as flags regarding
         * the current state of the rwsem. Can be used as a speculative
         * check to see if the write owner is running on the cpu.
         */
        atomic_long_t owner;
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
        struct optimistic_spin_queue osq; /* spinner MCS lock */
#endif
        /* presumably protects wait_list - confirm against the lock code */
        raw_spinlock_t wait_lock;
        /* Non-empty is what rwsem_is_contended() reports as contention. */
        struct list_head wait_list;
#ifdef CONFIG_DEBUG_RWSEMS
        /* Set to the semaphore's own address by the static initializer. */
        void *magic;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* Lockdep tracking state, see __RWSEM_DEP_MAP_INIT(). */
        struct lockdep_map        dep_map;
#endif
};

/* Value of ->count when nobody holds the semaphore. */
#define RWSEM_UNLOCKED_VALUE                0UL
/* Bit of ->count that is set while a writer holds the semaphore. */
#define RWSEM_WRITER_LOCKED                (1UL << 0)
/* Initializer fragment putting ->count into the unlocked state. */
#define __RWSEM_COUNT_INIT(name)        .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE)

static inline int rwsem_is_locked(struct rw_semaphore *sem)
{
        return atomic_long_read(&sem->count) != RWSEM_UNLOCKED_VALUE;
}

/*
 * Lockdep-independent check that @sem is held in some mode: warns when
 * ->count still equals RWSEM_UNLOCKED_VALUE.
 */
static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem)
{
        WARN_ON(atomic_long_read(&sem->count) == RWSEM_UNLOCKED_VALUE);
}

/*
 * Lockdep-independent check that @sem is held for writing: warns when the
 * RWSEM_WRITER_LOCKED bit of ->count is clear.
 */
static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem)
{
        WARN_ON(!(atomic_long_read(&sem->count) & RWSEM_WRITER_LOCKED));
}

/* Common initializer macros and functions */

/*
 * Initializer fragment for ->magic: points it at the semaphore itself when
 * CONFIG_DEBUG_RWSEMS is set, expands to nothing otherwise.
 */
#ifdef CONFIG_DEBUG_RWSEMS
# define __RWSEM_DEBUG_INIT(lockname) .magic = &lockname,
#else
# define __RWSEM_DEBUG_INIT(lockname)
#endif

/*
 * Initializer fragment for ->osq: an unlocked spin queue when
 * CONFIG_RWSEM_SPIN_ON_OWNER is set, nothing otherwise.
 */
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
#define __RWSEM_OPT_INIT(lockname) .osq = OSQ_LOCK_UNLOCKED,
#else
#define __RWSEM_OPT_INIT(lockname)
#endif

/*
 * Static initializer for a struct rw_semaphore called @name: unlocked
 * count, no owner, unlocked wait_lock and an empty wait_list, plus the
 * optional osq, magic and dep_map members when they are configured in.
 */
#define __RWSEM_INITIALIZER(name)                                \
        { __RWSEM_COUNT_INIT(name),                                \
          .owner = ATOMIC_LONG_INIT(0),                                \
          __RWSEM_OPT_INIT(name)                                \
          .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\
          .wait_list = LIST_HEAD_INIT((name).wait_list),        \
          __RWSEM_DEBUG_INIT(name)                                \
          __RWSEM_DEP_MAP_INIT(name) }

/* Define a struct rw_semaphore called @name, statically initialized as unlocked. */
#define DECLARE_RWSEM(name) \
        struct rw_semaphore name = __RWSEM_INITIALIZER(name)

/* Runtime initializer behind init_rwsem(); defined outside this header. */
extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
                         struct lock_class_key *key);

/*
 * Initialize @sem at runtime.  Each expansion declares its own static
 * lock_class_key and passes the stringified argument as the lock name.
 */
#define init_rwsem(sem)                                                \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        __init_rwsem((sem), #sem, &__key);                        \
} while (0)

/*
 * This is the same regardless of which rwsem implementation is in use.
 * It is only a heuristic, meant to be called by somebody who already holds
 * the rwsem, to see whether somebody of an incompatible type is waiting for
 * access to the lock.
 *
 * Returns 1 when the wait list is not empty, 0 otherwise.
 */
static inline int rwsem_is_contended(struct rw_semaphore *sem)
{
        if (list_empty(&sem->wait_list))
                return 0;

        return 1;
}

#else /* !CONFIG_PREEMPT_RT */

#include <linux/rwbase_rt.h>

/*
 * PREEMPT_RT variant of the rw_semaphore: the lock state lives in the
 * embedded rwbase_rt object.
 */
struct rw_semaphore {
        struct rwbase_rt        rwbase;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* Only present with CONFIG_DEBUG_LOCK_ALLOC */
        struct lockdep_map        dep_map;
#endif
};

/*
 * Static initializer for the PREEMPT_RT rw_semaphore: initializes the
 * embedded rwbase and, when configured, the lockdep map.
 */
#define __RWSEM_INITIALIZER(name)                                \
        {                                                        \
                .rwbase = __RWBASE_INITIALIZER(name),                \
                __RWSEM_DEP_MAP_INIT(name)                        \
        }

/* Define a struct rw_semaphore variable @lockname, statically initialized. */
#define DECLARE_RWSEM(lockname) \
        struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)

/* Runtime initializer; normally reached through init_rwsem() below. */
extern void  __init_rwsem(struct rw_semaphore *rwsem, const char *name,
                          struct lock_class_key *key);

/*
 * Initialize @sem at runtime. Every expansion of this macro has its own
 * static lock_class_key, and the stringified argument is passed as the name.
 */
#define init_rwsem(sem)                                                \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        __init_rwsem((sem), #sem, &__key);                        \
} while (0)

/* Forward the "is locked" query to the embedded rwbase. */
static __always_inline int rwsem_is_locked(const struct rw_semaphore *sem)
{
        const struct rwbase_rt *base = &sem->rwbase;

        return rw_base_is_locked(base);
}

/* Warn when rwsem_is_locked() reports the semaphore as not locked. */
static __always_inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem)
{
        int held = rwsem_is_locked(sem);

        WARN_ON(!held);
}

/* Warn when the embedded rwbase does not report a write lock. */
static __always_inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem)
{
        const struct rwbase_rt *base = &sem->rwbase;

        WARN_ON(!rw_base_is_write_locked(base));
}

/* Forward the contention query to the embedded rwbase. */
static __always_inline int rwsem_is_contended(struct rw_semaphore *sem)
{
        struct rwbase_rt *base = &sem->rwbase;

        return rw_base_is_contended(base);
}

#endif /* CONFIG_PREEMPT_RT */

/*
 * The functions below are the same for all rwsem implementations including
 * the RT specific variant.
 */

/*
 * Assert that @sem is held. With CONFIG_LOCKDEP the check is delegated to
 * lockdep; otherwise the implementation-specific fallback is used.
 */
static inline void rwsem_assert_held(const struct rw_semaphore *sem)
{
        if (!IS_ENABLED(CONFIG_LOCKDEP)) {
                rwsem_assert_held_nolockdep(sem);
                return;
        }

        lockdep_assert_held(sem);
}

/*
 * Assert that @sem is held for writing. With CONFIG_LOCKDEP the check is
 * delegated to lockdep; otherwise the implementation-specific fallback is
 * used.
 */
static inline void rwsem_assert_held_write(const struct rw_semaphore *sem)
{
        if (!IS_ENABLED(CONFIG_LOCKDEP)) {
                rwsem_assert_held_write_nolockdep(sem);
                return;
        }

        lockdep_assert_held_write(sem);
}

/*
 * lock for reading
 *
 * The interruptible and killable variants are __must_check: their return
 * value has to be inspected. The rwsem_read "_intr" guard in this header
 * treats a return value of 0 as "lock acquired".
 */
extern void down_read(struct rw_semaphore *sem);
extern int __must_check down_read_interruptible(struct rw_semaphore *sem);
extern int __must_check down_read_killable(struct rw_semaphore *sem);

/*
 * trylock for reading -- returns 1 if successful, 0 if contention
 */
extern int down_read_trylock(struct rw_semaphore *sem);

/*
 * lock for writing
 *
 * The killable variant is __must_check: its return value has to be
 * inspected.
 */
extern void down_write(struct rw_semaphore *sem);
extern int __must_check down_write_killable(struct rw_semaphore *sem);

/*
 * trylock for writing -- returns 1 if successful, 0 if contention
 */
extern int down_write_trylock(struct rw_semaphore *sem);

/*
 * release a read lock
 */
extern void up_read(struct rw_semaphore *sem);

/*
 * release a write lock
 */
extern void up_write(struct rw_semaphore *sem);

/*
 * Guard definitions for read locking: acquired with down_read(), released
 * with up_read(). The conditional variants acquire with
 * down_read_trylock() ("_try") or down_read_interruptible() ("_intr",
 * which succeeds when that call returns 0).
 */
DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T))
DEFINE_GUARD_COND(rwsem_read, _try, down_read_trylock(_T))
DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T) == 0)

/*
 * Guard definitions for write locking: acquired with down_write(), released
 * with up_write(). The "_try" variant acquires with down_write_trylock().
 */
DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T))
DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))

/*
 * downgrade write lock to read lock
 */
extern void downgrade_write(struct rw_semaphore *sem);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * nested locking. NOTE: rwsems are not allowed to recurse
 * (which occurs if the same task tries to acquire the same
 * lock instance multiple times), but multiple locks of the
 * same lock class might be taken, if the order of the locks
 * is always the same. This ordering rule can be expressed
 * to lockdep via the _nested() APIs, by enumerating the
 * subclasses that are used. (If the nesting relationship is
 * static then another method for expressing nested locking is
 * the explicit definition of lock class keys and the use of
 * lockdep_set_class() at lock initialization time.
 * See Documentation/locking/lockdep-design.rst for more details.)
 */
extern void down_read_nested(struct rw_semaphore *sem, int subclass);
extern int __must_check down_read_killable_nested(struct rw_semaphore *sem, int subclass);
extern void down_write_nested(struct rw_semaphore *sem, int subclass);
extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass);
extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock);

/*
 * Write-lock @sem and pass the dep_map of @nest_lock to
 * _down_write_nest_lock(). The typecheck() rejects a @nest_lock whose
 * dep_map member is not a struct lockdep_map.
 */
# define down_write_nest_lock(sem, nest_lock)                        \
do {                                                                \
        typecheck(struct lockdep_map *, &(nest_lock)->dep_map);        \
        _down_write_nest_lock(sem, &(nest_lock)->dep_map);        \
} while (0)

/*
 * Take/release a lock that will be released by a task other than
 * the one that acquired it.
 *
 * [ This API should be avoided as much as possible - the
 *   proper abstraction for this case is completions. ]
 */
extern void down_read_non_owner(struct rw_semaphore *sem);
extern void up_read_non_owner(struct rw_semaphore *sem);
#else
/*
 * Without CONFIG_DEBUG_LOCK_ALLOC the nested and non-owner variants map to
 * the plain lock operations; the subclass / nest_lock argument is dropped.
 */
# define down_read_nested(sem, subclass)                down_read(sem)
# define down_read_killable_nested(sem, subclass)        down_read_killable(sem)
# define down_write_nest_lock(sem, nest_lock)        down_write(sem)
# define down_write_nested(sem, subclass)        down_write(sem)
# define down_write_killable_nested(sem, subclass)        down_write_killable(sem)
# define down_read_non_owner(sem)                down_read(sem)
# define up_read_non_owner(sem)                        up_read(sem)
#endif

#endif /* _LINUX_RWSEM_H */

















































































































































































































































































































    1 









    1 

















    1 


    1 


























































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
// SPDX-License-Identifier: GPL-2.0
/*
 * This file contains functions which manage clock event devices.
 *
 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
 */

#include <linux/clockchips.h>
#include <linux/hrtimer.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/smp.h>
#include <linux/device.h>

#include "tick-internal.h"

/* The registered clock event devices */
static LIST_HEAD(clockevent_devices);
/*
 * Devices handed back through clockevents_exchange_device(); they are moved
 * back onto clockevent_devices by clockevents_notify_released().
 */
static LIST_HEAD(clockevents_released);
/* Protection for the above */
static DEFINE_RAW_SPINLOCK(clockevents_lock);
/* Protection for unbind operations */
static DEFINE_MUTEX(clockevents_mutex);

/* Argument block for the __clockevents_unbind() SMP function call. */
struct ce_unbind {
        /* Device that should be unbound */
        struct clock_event_device *ce;
        /* Result written by __clockevents_unbind() */
        int res;
};

/*
 * cev_delta2ns - convert a device tick count to nanoseconds
 * @latch:        tick count to convert
 * @evt:        device providing mult and shift; a mult of 0 triggers a
 *                warning and is replaced by 1
 * @ismax:        true when an upper limit is converted; decides whether the
 *                round-up term is added (see the comment in the body)
 *
 * Computes (latch << shift) / mult, rounded up where that is safe, and
 * never returns less than 1000.
 */
static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
                        bool ismax)
{
        u64 clc = (u64) latch << evt->shift;
        u64 rnd;

        if (WARN_ON(!evt->mult))
                evt->mult = 1;
        rnd = (u64) evt->mult - 1;

        /*
         * Upper bound sanity check. If the backwards conversion is
         * not equal latch, we know that the above shift overflowed.
         */
        if ((clc >> evt->shift) != (u64)latch)
                clc = ~0ULL;

        /*
         * Scaled math oddities:
         *
         * For mult <= (1 << shift) we can safely add mult - 1 to
         * prevent integer rounding loss. So the backwards conversion
         * from nsec to device ticks will be correct.
         *
         * For mult > (1 << shift), i.e. device frequency is > 1GHz we
         * need to be careful. Adding mult - 1 will result in a value
         * which when converted back to device ticks can be larger
         * than latch by up to (mult - 1) >> shift. For the min_delta
         * calculation we still want to apply this in order to stay
         * above the minimum device ticks limit. For the upper limit
         * we would end up with a latch value larger than the upper
         * limit of the device, so we omit the add to stay below the
         * device upper boundary.
         *
         * Also omit the add if it would overflow the u64 boundary.
         */
        if ((~0ULL - clc > rnd) &&
            (!ismax || evt->mult <= (1ULL << evt->shift)))
                clc += rnd;

        /* do_div() divides clc in place */
        do_div(clc, evt->mult);

        /* Deltas less than 1usec are pointless noise */
        return clc > 1000 ? clc : 1000;
}

/**
 * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds
 * @latch:        number of device ticks
 * @evt:        clock event device whose mult/shift pair is used
 *
 * Exported wrapper around cev_delta2ns() with the "upper limit" handling
 * switched off. Returns the bound-checked nanosecond value.
 */
u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
{
        u64 ns = cev_delta2ns(latch, evt, false);

        return ns;
}
EXPORT_SYMBOL_GPL(clockevent_delta2ns);

/*
 * Invoke the device callback which belongs to @state.
 *
 * Returns the callback's result, 0 when the optional shutdown, periodic or
 * oneshot callback is absent, -ENOSYS for an unsupported or unknown state
 * and -EINVAL when ONESHOT_STOPPED is requested while not in oneshot state.
 */
static int __clockevents_switch_state(struct clock_event_device *dev,
                                      enum clock_event_state state)
{
        /* Devices flagged as dummy accept every transition as-is */
        if (dev->features & CLOCK_EVT_FEAT_DUMMY)
                return 0;

        switch (state) {
        /* A device which is getting replaced is simply shut down */
        case CLOCK_EVT_STATE_DETACHED:
        case CLOCK_EVT_STATE_SHUTDOWN:
                return dev->set_state_shutdown ?
                        dev->set_state_shutdown(dev) : 0;

        case CLOCK_EVT_STATE_PERIODIC:
                /* Requesting an unsupported mode is a core internal bug */
                if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC))
                        return -ENOSYS;
                return dev->set_state_periodic ?
                        dev->set_state_periodic(dev) : 0;

        case CLOCK_EVT_STATE_ONESHOT:
                /* Requesting an unsupported mode is a core internal bug */
                if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
                        return -ENOSYS;
                return dev->set_state_oneshot ?
                        dev->set_state_oneshot(dev) : 0;

        case CLOCK_EVT_STATE_ONESHOT_STOPPED:
                /* Only reachable from oneshot state; else core internal bug */
                if (WARN_ONCE(!clockevent_state_oneshot(dev),
                              "Current state: %d\n",
                              clockevent_get_state(dev)))
                        return -EINVAL;

                /* Unlike the other callbacks, this one is mandatory here */
                if (!dev->set_state_oneshot_stopped)
                        return -ENOSYS;
                return dev->set_state_oneshot_stopped(dev);

        default:
                return -ENOSYS;
        }
}

/**
 * clockevents_switch_state - set the operating state of a clock event device
 * @dev:        device to modify
 * @state:        new state
 *
 * Nothing happens when the device already is in @state or when the state
 * callback reports an error.
 *
 * Must be called with interrupts disabled !
 */
void clockevents_switch_state(struct clock_event_device *dev,
                              enum clock_event_state state)
{
        if (clockevent_get_state(dev) == state)
                return;

        if (__clockevents_switch_state(dev, state))
                return;

        clockevent_set_state(dev, state);

        /*
         * A nsec2cyc multiplicator of 0 is invalid and we'd crash
         * on it, so fix it up and emit a warning:
         */
        if (clockevent_state_oneshot(dev) && WARN_ON(!dev->mult))
                dev->mult = 1;
}

/**
 * clockevents_shutdown - shutdown the device and clear next_event
 * @dev:        device to shutdown
 *
 * Requests CLOCK_EVT_STATE_SHUTDOWN and then sets next_event to KTIME_MAX.
 * next_event is reset even if the state switch was refused.
 */
void clockevents_shutdown(struct clock_event_device *dev)
{
        clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
        dev->next_event = KTIME_MAX;
}

/**
 * clockevents_tick_resume -        Resume the tick device before using it again
 * @dev:                        device to resume
 *
 * Returns the result of the device's tick_resume() callback, or 0 when the
 * device does not provide one.
 */
int clockevents_tick_resume(struct clock_event_device *dev)
{
        if (!dev->tick_resume)
                return 0;

        return dev->tick_resume(dev);
}

#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST

/* Limit min_delta to a jiffie */
#define MIN_DELTA_LIMIT                (NSEC_PER_SEC / HZ)

/**
 * clockevents_increase_min_delta - raise minimum delta of a clock event device
 * @dev:       device to increase the minimum delta
 *
 * The minimum delta is raised to 5000ns if it was below that, otherwise it
 * grows by half of its current value; the result is capped at
 * MIN_DELTA_LIMIT.
 *
 * Returns 0 on success, -ETIME when the minimum delta reached the limit.
 */
static int clockevents_increase_min_delta(struct clock_event_device *dev)
{
        const char *label;

        /* Already at the limit: give up and drop the pending event */
        if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
                printk_deferred(KERN_WARNING
                                "CE: Reprogramming failure. Giving up\n");
                dev->next_event = KTIME_MAX;
                return -ETIME;
        }

        dev->min_delta_ns = dev->min_delta_ns < 5000 ?
                5000 : dev->min_delta_ns + (dev->min_delta_ns >> 1);

        if (dev->min_delta_ns > MIN_DELTA_LIMIT)
                dev->min_delta_ns = MIN_DELTA_LIMIT;

        label = dev->name ? dev->name : "?";
        printk_deferred(KERN_WARNING
                        "CE: %s increased min_delta_ns to %llu nsec\n",
                        label, (unsigned long long) dev->min_delta_ns);
        return 0;
}

/**
 * clockevents_program_min_delta - Set clock event device to the minimum delay.
 * @dev:        device to program
 *
 * Programs the device with its current min_delta_ns. After three failed
 * attempts the minimum delta is increased and the attempt counter restarts.
 *
 * Returns 0 on success, -ETIME when the retry loop failed.
 */
static int clockevents_program_min_delta(struct clock_event_device *dev)
{
        unsigned long long cycles;
        int64_t ns;
        int attempts = 0;

        for (;;) {
                ns = dev->min_delta_ns;
                dev->next_event = ktime_add_ns(ktime_get(), ns);

                /* A shut down device only records the expiry time */
                if (clockevent_state_shutdown(dev))
                        return 0;

                dev->retries++;
                cycles = ((unsigned long long) ns * dev->mult) >> dev->shift;
                if (!dev->set_next_event((unsigned long) cycles, dev))
                        return 0;

                if (++attempts <= 2)
                        continue;

                /*
                 * Three attempts with the given min_delta_ns failed. Try
                 * to increase the minimum delta; if that fails as well,
                 * get out of here.
                 */
                if (clockevents_increase_min_delta(dev))
                        return -ETIME;
                attempts = 0;
        }
}

#else  /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */

/**
 * clockevents_program_min_delta - Set clock event device to the minimum delay.
 * @dev:        device to program
 *
 * Makes up to ten attempts; each attempt extends the programmed delay by
 * another min_delta_ns.
 *
 * Returns 0 on success, -ETIME when the retry loop failed.
 */
static int clockevents_program_min_delta(struct clock_event_device *dev)
{
        unsigned long long cycles;
        int64_t ns = 0;
        int tries_left;

        for (tries_left = 10; tries_left > 0; tries_left--) {
                ns += dev->min_delta_ns;
                dev->next_event = ktime_add_ns(ktime_get(), ns);

                /* A shut down device only records the expiry time */
                if (clockevent_state_shutdown(dev))
                        return 0;

                dev->retries++;
                cycles = ((unsigned long long) ns * dev->mult) >> dev->shift;
                if (!dev->set_next_event((unsigned long) cycles, dev))
                        return 0;
        }

        return -ETIME;
}

#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */

/**
 * clockevents_program_event - Reprogram the clock event device.
 * @dev:        device to program
 * @expires:        absolute expiry time (monotonic clock)
 * @force:        program minimum delay if expires can not be set
 *
 * Returns 0 on success, -ETIME when the event is in the past. Devices with
 * CLOCK_EVT_FEAT_KTIME return whatever their set_next_ktime() callback
 * returns.
 */
int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
                              bool force)
{
        unsigned long long cycles;
        int64_t ns;
        int err;

        if (WARN_ON_ONCE(expires < 0))
                return -ETIME;

        dev->next_event = expires;

        /* A shut down device only records the expiry time */
        if (clockevent_state_shutdown(dev))
                return 0;

        /* We must be in ONESHOT state here */
        WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n",
                  clockevent_get_state(dev));

        /* Shortcut for clockevent devices that can deal with ktime. */
        if (dev->features & CLOCK_EVT_FEAT_KTIME)
                return dev->set_next_ktime(expires, dev);

        ns = ktime_to_ns(ktime_sub(expires, ktime_get()));
        if (ns <= 0) {
                if (!force)
                        return -ETIME;
                return clockevents_program_min_delta(dev);
        }

        /* Limit the delta to the range the device can be programmed with */
        ns = min(ns, (int64_t) dev->max_delta_ns);
        ns = max(ns, (int64_t) dev->min_delta_ns);

        cycles = ((unsigned long long) ns * dev->mult) >> dev->shift;
        err = dev->set_next_event((unsigned long) cycles, dev);
        if (err && force)
                return clockevents_program_min_delta(dev);

        return err;
}

/*
 * Called after a notify add to make devices available which were
 * released from the notifier call.
 */
static void clockevents_notify_released(void)
{
        struct clock_event_device *dev;

        while (!list_empty(&clockevents_released)) {
                dev = list_entry(clockevents_released.next,
                                 struct clock_event_device, list);
                list_move(&dev->list, &clockevent_devices);
                tick_check_new_device(dev);
        }
}

/*
 * Try to install a replacement clock event device
 */
static int clockevents_replace(struct clock_event_device *ced)
{
        struct clock_event_device *dev, *newdev = NULL;

        list_for_each_entry(dev, &clockevent_devices, list) {
                if (dev == ced || !clockevent_state_detached(dev))
                        continue;

                if (!tick_check_replacement(newdev, dev))
                        continue;

                if (!try_module_get(dev->owner))
                        continue;

                if (newdev)
                        module_put(newdev->owner);
                newdev = dev;
        }
        if (newdev) {
                tick_install_replacement(newdev);
                list_del_init(&ced->list);
        }
        return newdev ? 0 : -EBUSY;
}

/*
 * Called with clockevents_mutex and clockevents_lock held
 *
 * Returns 0 when the device was detached and has been removed from its
 * list, -EAGAIN when it is the tick device of @cpu, -EBUSY otherwise.
 */
static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
{
        if (!clockevent_state_detached(ced)) {
                if (ced == per_cpu(tick_cpu_device, cpu).evtdev)
                        return -EAGAIN;
                return -EBUSY;
        }

        /* Fast track. Device is unused */
        list_del_init(&ced->list);
        return 0;
}

/*
 * SMP function call to unbind a device. @arg points to a struct ce_unbind;
 * the outcome is stored in its res member.
 */
static void __clockevents_unbind(void *arg)
{
        struct ce_unbind *req = arg;
        int status;

        raw_spin_lock(&clockevents_lock);
        status = __clockevents_try_unbind(req->ce, smp_processor_id());
        /* -EAGAIN: the device is in use on this CPU, look for a replacement */
        req->res = (status == -EAGAIN) ? clockevents_replace(req->ce) : status;
        raw_spin_unlock(&clockevents_lock);
}

/*
 * Issues smp function call to unbind a per cpu device. Called with
 * clockevents_mutex held.
 */
static int clockevents_unbind(struct clock_event_device *ced, int cpu)
{
        struct ce_unbind cu = { .ce = ced, .res = -ENODEV };

        smp_call_function_single(cpu, __clockevents_unbind, &cu, 1);
        return cu.res;
}

/*
 * Unbind a clockevents device. Serializes against other unbind operations
 * with clockevents_mutex and returns the result of clockevents_unbind().
 */
int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
{
        int status;

        mutex_lock(&clockevents_mutex);
        status = clockevents_unbind(ced, cpu);
        mutex_unlock(&clockevents_mutex);

        return status;
}
EXPORT_SYMBOL_GPL(clockevents_unbind_device);

/**
 * clockevents_register_device - register a clock event device
 * @dev:        device to register
 *
 * Puts the device into DETACHED state, fixes up a missing or
 * cpu_all_mask cpumask, adds the device to the device list and offers it
 * to the tick layer. Devices on the released list are re-offered afterwards.
 */
void clockevents_register_device(struct clock_event_device *dev)
{
        unsigned long flags;

        /* Initialize state to DETACHED */
        clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);

        /* No cpumask given: use the current CPU, warn if there are several */
        if (!dev->cpumask) {
                WARN_ON(num_possible_cpus() > 1);
                dev->cpumask = cpumask_of(smp_processor_id());
        }

        if (dev->cpumask == cpu_all_mask) {
                WARN(1, "%s cpumask == cpu_all_mask, using cpu_possible_mask instead\n",
                     dev->name);
                dev->cpumask = cpu_possible_mask;
        }

        /* The device lists are protected by clockevents_lock */
        raw_spin_lock_irqsave(&clockevents_lock, flags);

        list_add(&dev->list, &clockevent_devices);
        tick_check_new_device(dev);
        clockevents_notify_released();

        raw_spin_unlock_irqrestore(&clockevents_lock, flags);
}
EXPORT_SYMBOL_GPL(clockevents_register_device);

/*
 * Compute mult/shift and the nanosecond delta limits of @dev for the
 * frequency @freq. Devices without CLOCK_EVT_FEAT_ONESHOT are left alone.
 */
static void clockevents_config(struct clock_event_device *dev, u32 freq)
{
        u64 max_sec;

        if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
                return;

        /*
         * Calculate the maximum number of seconds we can sleep. Limit
         * to 10 minutes for hardware which can program more than
         * 32bit ticks so we still get reasonable conversion values.
         */
        max_sec = dev->max_delta_ticks;
        do_div(max_sec, freq);

        if (max_sec == 0)
                max_sec = 1;
        if (max_sec > 600 && dev->max_delta_ticks > UINT_MAX)
                max_sec = 600;

        clockevents_calc_mult_shift(dev, freq, max_sec);
        dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false);
        dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true);
}

/**
 * clockevents_config_and_register - Configure and register a clock event device
 * @dev:        device to register
 * @freq:        The clock frequency
 * @min_delta:        The minimum clock ticks to program in oneshot mode
 * @max_delta:        The maximum clock ticks to program in oneshot mode
 *
 * min/max_delta can be 0 for devices which do not support oneshot mode.
 */
void clockevents_config_and_register(struct clock_event_device *dev,
                                     u32 freq, unsigned long min_delta,
                                     unsigned long max_delta)
{
        /* The tick limits must be set first: clockevents_config() reads them */
        dev->min_delta_ticks = min_delta;
        dev->max_delta_ticks = max_delta;
        clockevents_config(dev, freq);
        clockevents_register_device(dev);
}
EXPORT_SYMBOL_GPL(clockevents_config_and_register);

/*
 * Reconfigure @dev for @freq and reprogram it according to its current
 * state: a oneshot device is programmed with its pending next_event, a
 * periodic device has its periodic callback invoked again. Returns the
 * result of that step, or 0 in any other state.
 */
int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
{
        int ret = 0;

        clockevents_config(dev, freq);

        if (clockevent_state_oneshot(dev))
                ret = clockevents_program_event(dev, dev->next_event, false);
        else if (clockevent_state_periodic(dev))
                ret = __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);

        return ret;
}

/**
 * clockevents_update_freq - Update frequency and reprogram a clock event device.
 * @dev:        device to modify
 * @freq:        new device frequency
 *
 * Reconfigure and reprogram a clock event device in oneshot
 * mode. Must be called on the cpu for which the device delivers per
 * cpu timer events. If called for the broadcast device the core takes
 * care of serialization.
 *
 * Returns 0 on success, -ETIME when the event is in the past.
 */
int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
{
        unsigned long irqflags;
        int status;

        local_irq_save(irqflags);

        /* The broadcast code gets the first shot; -ENODEV means "not mine" */
        status = tick_broadcast_update_freq(dev, freq);
        if (status == -ENODEV)
                status = __clockevents_update_freq(dev, freq);

        local_irq_restore(irqflags);

        return status;
}

/*
 * Noop handler when we shut down an event device. Intentionally empty:
 * @dev is ignored.
 */
void clockevents_handle_noop(struct clock_event_device *dev)
{
}

/**
 * clockevents_exchange_device - release and request clock devices
 * @old:        device to release (can be NULL)
 * @new:        device to request (can be NULL)
 *
 * Called from various tick functions with clockevents_lock held and
 * interrupts disabled.
 */
void clockevents_exchange_device(struct clock_event_device *old,
                                 struct clock_event_device *new)
{
        /*
         * Caller releases a clock event device. We queue it into the
         * released list and do a notify add later.
         */
        if (old) {
                /* Drop the module reference before detaching the device */
                module_put(old->owner);
                clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED);
                list_move(&old->list, &clockevents_released);
        }

        /* A requested device must be detached; it starts out shut down */
        if (new) {
                BUG_ON(!clockevent_state_detached(new));
                clockevents_shutdown(new);
        }
}

/**
 * clockevents_suspend - suspend clock devices
 */
void clockevents_suspend(void)
{
        struct clock_event_device *dev;

        list_for_each_entry_reverse(dev, &clockevent_devices, list)
                if (dev->suspend && !clockevent_state_detached(dev))
                        dev->suspend(dev);
}

/**
 * clockevents_resume - resume clock devices
 */
void clockevents_resume(void)
{
        struct clock_event_device *dev;

        list_for_each_entry(dev, &clockevent_devices, list)
                if (dev->resume && !clockevent_state_detached(dev))
                        dev->resume(dev);
}

#ifdef CONFIG_HOTPLUG_CPU

# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
/**
 * tick_offline_cpu - Take CPU out of the broadcast mechanism
 * @cpu:        The outgoing CPU
 *
 * Called on the outgoing CPU after it took itself offline.
 */
void tick_offline_cpu(unsigned int cpu)
{
        /* tick_broadcast_offline() is called with clockevents_lock held */
        raw_spin_lock(&clockevents_lock);
        tick_broadcast_offline(cpu);
        raw_spin_unlock(&clockevents_lock);
}
# endif

/**
 * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
 * @cpu:        The dead CPU
 *
 * Shuts down the tick of @cpu, empties the released list and removes the
 * devices which served only @cpu from the device list.
 */
void tick_cleanup_dead_cpu(int cpu)
{
        struct clock_event_device *dev, *next;
        unsigned long irqflags;

        raw_spin_lock_irqsave(&clockevents_lock, irqflags);

        tick_shutdown(cpu);

        /*
         * Unregister the clock event devices which were
         * released from the users in the notify chain.
         */
        list_for_each_entry_safe(dev, next, &clockevents_released, list) {
                list_del(&dev->list);
        }

        /*
         * Now check whether the CPU has left unused per cpu devices
         */
        list_for_each_entry_safe(dev, next, &clockevent_devices, list) {
                if (!cpumask_test_cpu(cpu, dev->cpumask))
                        continue;
                if (cpumask_weight(dev->cpumask) != 1)
                        continue;
                if (tick_is_broadcast_device(dev))
                        continue;

                BUG_ON(!clockevent_state_detached(dev));
                list_del(&dev->list);
        }

        raw_spin_unlock_irqrestore(&clockevents_lock, irqflags);
}
#endif

#ifdef CONFIG_SYSFS
/* Bus type used for the clockevents sysfs devices */
static const struct bus_type clockevents_subsys = {
        .name                = "clockevents",
        .dev_name       = "clockevent",
};

/* One struct device per CPU */
static DEFINE_PER_CPU(struct device, tick_percpu_dev);
/* Defined below; the implementation depends on broadcast support */
static struct tick_device *tick_get_tick_dev(struct device *dev);

/*
 * sysfs read handler: emits the name of the clock event device currently
 * bound to the tick device behind @dev, or nothing when there is none.
 */
static ssize_t current_device_show(struct device *dev,
                                   struct device_attribute *attr,
                                   char *buf)
{
        struct tick_device *tick;
        ssize_t len = 0;

        raw_spin_lock_irq(&clockevents_lock);

        tick = tick_get_tick_dev(dev);
        if (tick) {
                if (tick->evtdev)
                        len = sysfs_emit(buf, "%s\n", tick->evtdev->name);
        }

        raw_spin_unlock_irq(&clockevents_lock);

        return len;
}
static DEVICE_ATTR_RO(current_device);

/*
 * sysfs write handler: unbind the clock event device whose name was
 * written. Returns @count on success, a negative error code otherwise
 * (-ENODEV when no device has that name).
 *
 * We don't support the abomination of removable broadcast devices
 */
static ssize_t unbind_device_store(struct device *dev,
                                   struct device_attribute *attr,
                                   const char *buf, size_t count)
{
        struct clock_event_device *found = NULL, *pos;
        char name[CS_NAME_LEN];
        ssize_t ret;

        ret = sysfs_get_uname(buf, name, count);
        if (ret < 0)
                return ret;

        ret = -ENODEV;
        mutex_lock(&clockevents_mutex);
        raw_spin_lock_irq(&clockevents_lock);
        list_for_each_entry(pos, &clockevent_devices, list) {
                if (strcmp(pos->name, name))
                        continue;

                ret = __clockevents_try_unbind(pos, dev->id);
                found = pos;
                break;
        }
        raw_spin_unlock_irq(&clockevents_lock);

        /*
         * We hold clockevents_mutex, so the device can't go away
         */
        if (ret == -EAGAIN)
                ret = clockevents_unbind(found, dev->id);
        mutex_unlock(&clockevents_mutex);

        if (ret)
                return ret;

        return count;
}
static DEVICE_ATTR_WO(unbind_device);

#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
/*
 * sysfs device for the broadcast tick device. It is registered on the
 * clockevents bus by tick_broadcast_init_sysfs() and is recognised by its
 * address in tick_get_tick_dev().
 */
static struct device tick_bc_dev = {
        .init_name        = "broadcast",
        .id                = 0,
        .bus                = &clockevents_subsys,
};

/*
 * Map a clockevents sysfs device to its tick device: the broadcast sysfs
 * device maps to the broadcast tick device, any other device maps to the
 * per-CPU tick device selected by dev->id.
 */
static struct tick_device *tick_get_tick_dev(struct device *dev)
{
        if (dev == &tick_bc_dev)
                return tick_get_broadcast_device();

        return &per_cpu(tick_cpu_device, dev->id);
}

/*
 * Register the broadcast sysfs device and create its "current_device"
 * attribute file. Returns 0 on success or the first error encountered.
 */
static __init int tick_broadcast_init_sysfs(void)
{
        int ret;

        ret = device_register(&tick_bc_dev);
        if (ret)
                return ret;

        return device_create_file(&tick_bc_dev, &dev_attr_current_device);
}
#else
/*
 * Without broadcast support every clockevents sysfs device maps to the
 * per-CPU tick device selected by dev->id.
 */
static struct tick_device *tick_get_tick_dev(struct device *dev)
{
        struct tick_device *td = &per_cpu(tick_cpu_device, dev->id);

        return td;
}
/* Stub for builds without broadcast support: nothing to set up, always 0. */
static inline int tick_broadcast_init_sysfs(void)
{
        return 0;
}
#endif

/*
 * Register one sysfs device per possible CPU on the clockevents bus and
 * create its "current_device" and "unbind_device" attribute files, then
 * set up the broadcast device.
 *
 * Stops at the first failure and returns that error; otherwise returns the
 * result of tick_broadcast_init_sysfs().
 */
static int __init tick_init_sysfs(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct device *cdev = &per_cpu(tick_percpu_dev, cpu);
                int ret;

                cdev->id = cpu;
                cdev->bus = &clockevents_subsys;

                ret = device_register(cdev);
                if (ret)
                        return ret;

                ret = device_create_file(cdev, &dev_attr_current_device);
                if (ret)
                        return ret;

                ret = device_create_file(cdev, &dev_attr_unbind_device);
                if (ret)
                        return ret;
        }

        return tick_broadcast_init_sysfs();
}

/*
 * Initcall: register the clockevents subsystem and, if that succeeds,
 * populate it through tick_init_sysfs(). Returns 0 or the first error.
 */
static int __init clockevents_init_sysfs(void)
{
        int ret;

        ret = subsys_system_register(&clockevents_subsys, NULL);
        if (ret)
                return ret;

        return tick_init_sysfs();
}
device_initcall(clockevents_init_sysfs);
#endif /* SYSFS */

































































































































































































































































    1 



























    1 




    1 





























































































    1 











    1 































































    1 









































    1 



























    1 





    1 












    1 








    1 



    1 

    1 



    1 










    1 


    1 




    1 





































    1 













    1 



    1 
    1 

    1 
















































































































































































































































































































































































































































































































































































































































































    1 



























































































































































































































































    1 


    1 















    1 













































    1 

    1 


    1 








    1 















    1 



    1 






    1 












    1 



    1 
    1 
    1 































































































































































































































































































































































































































































    1 


    1 

    1 
    1 
    1 


    1 



















































































































    1 




    1 


























































    1 

















    1 









    1 




    1 





    1 



    1 

    1 
    1 




















    1 
    1 
















































    1 








    1 
    1 



































    1 








    1 
    1 





    1 




    1 















































































































































































































































































































































































































































    1 




















    1 







































    1 



    1 
    1 

    1 
    1 

















    1 
    1 













    1 
    1 
    1 






































































    1 





    1 

























































































































































































































    1 
























































































































































































































































































































































































































































































































    1 
















    1 

































































    1 



























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
/*
 * net/tipc/socket.c: TIPC socket API
 *
 * Copyright (c) 2001-2007, 2012-2019, Ericsson AB
 * Copyright (c) 2004-2008, 2010-2013, Wind River Systems
 * Copyright (c) 2020-2021, Red Hat Inc
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <linux/rhashtable.h>
#include <linux/sched/signal.h>
#include <trace/events/sock.h>

#include "core.h"
#include "name_table.h"
#include "node.h"
#include "link.h"
#include "name_distr.h"
#include "socket.h"
#include "bcast.h"
#include "netlink.h"
#include "group.h"
#include "trace.h"

/*
 * Tunables for the TIPC socket layer.
 * NOTE(review): the users of these constants are outside this excerpt.
 * NAGLE_START_INIT/NAGLE_START_MAX presumably bound tipc_sock::nagle_start,
 * and TIPC_MIN_PORT/TIPC_MAX_PORT presumably bound the u32 port identity;
 * confirm against the code that uses them.
 */
#define NAGLE_START_INIT        4
#define NAGLE_START_MAX                1024
#define CONN_TIMEOUT_DEFAULT    8000    /* default connect timeout = 8s */
#define CONN_PROBING_INTV        msecs_to_jiffies(3600000)  /* [ms] => 1 h */
#define TIPC_MAX_PORT                0xffffffff
#define TIPC_MIN_PORT                1
#define TIPC_ACK_RATE                4       /* ACK at 1/4 of rcv window size */

/*
 * TIPC socket states. Each one is defined as an alias of a TCP state
 * constant, so the numeric values are those of the TCP_* enumerators.
 */
enum {
        TIPC_LISTEN = TCP_LISTEN,
        TIPC_ESTABLISHED = TCP_ESTABLISHED,
        TIPC_OPEN = TCP_CLOSE,
        TIPC_DISCONNECTING = TCP_CLOSE_WAIT,
        TIPC_CONNECTING = TCP_SYN_SENT,
};

/*
 * Two TIPC socket addresses kept side by side.
 * NOTE(review): the users are outside this excerpt; presumably @sock is the
 * socket address and @member a group member address - confirm.
 */
struct sockaddr_pair {
        struct sockaddr_tipc sock;
        struct sockaddr_tipc member;
};

/**
 * struct tipc_sock - TIPC socket structure
 * @sk: socket - interacts with 'port' and with user via the socket API
 *      (must stay embedded: tipc_sk() converts a struct sock pointer back
 *      to the enclosing tipc_sock with container_of())
 * @max_pkt: maximum packet size "hint" used when building messages sent by port
 * @maxnagle: maximum size of msg which can be subject to nagle
 * @portid: unique port identity in TIPC socket hash table
 * @phdr: preformatted message header used when sending messages
 * @cong_links: list of congested links
 * @publications: list of publications for port
 * @pub_count: total # of publications port has made during its lifetime
 * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue
 * @conn_timeout: the time we can wait for an unresponded setup request
 *      (CONN_TIMEOUT_DEFAULT suggests milliseconds - TODO confirm)
 * @probe_unacked: probe has not received ack yet
 * @cong_link_cnt: number of congested links
 * @snt_unacked: # messages sent by socket, and not yet acked by peer
 * @snd_win: send window size
 * @peer_caps: peer capabilities mask
 * @rcv_unacked: # messages read by user, but not yet acked back to peer
 * @rcv_win: receive window size
 * @peer: 'connected' peer for dgram/rdm
 * @node: hash table node
 * @mc_method: cookie for use between socket and broadcast layer
 * @rcu: rcu struct for tipc_sock
 * @group: TIPC communications group
 * @oneway: message count in one direction (FIXME)
 * @nagle_start: current nagle value
 * @snd_backlog: send backlog count
 * @msg_acc: messages accepted; used in managing backlog and nagle
 * @pkt_cnt: TIPC socket packet count
 * @expect_ack: whether this TIPC socket is expecting an ack
 * @nodelay: setsockopt() TIPC_NODELAY setting
 * @group_is_open: TIPC socket group is fully open (FIXME)
 * @published: true if port has one or more associated names
 * @conn_addrtype: address type used when establishing connection
 */
struct tipc_sock {
        struct sock sk;
        u32 max_pkt;
        u32 maxnagle;
        u32 portid;
        struct tipc_msg phdr;
        struct list_head cong_links;
        struct list_head publications;
        u32 pub_count;
        atomic_t dupl_rcvcnt;
        u16 conn_timeout;
        bool probe_unacked;
        u16 cong_link_cnt;
        u16 snt_unacked;
        u16 snd_win;
        u16 peer_caps;
        u16 rcv_unacked;
        u16 rcv_win;
        struct sockaddr_tipc peer;
        struct rhash_head node;
        struct tipc_mc_method mc_method;
        struct rcu_head rcu;
        struct tipc_group *group;
        u32 oneway;
        u32 nagle_start;
        u16 snd_backlog;
        u16 msg_acc;
        u16 pkt_cnt;
        bool expect_ack;
        bool nodelay;
        bool group_is_open;
        bool published;
        u8 conn_addrtype;
};

/* Forward declarations of file-local functions used before their definition */
static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
static void tipc_data_ready(struct sock *sk);
static void tipc_write_space(struct sock *sk);
static void tipc_sock_destruct(struct sock *sk);
static int tipc_release(struct socket *sock);
static void tipc_sk_timeout(struct timer_list *t);
static int tipc_sk_publish(struct tipc_sock *tsk, struct tipc_uaddr *ua);
static int tipc_sk_withdraw(struct tipc_sock *tsk, struct tipc_uaddr *ua);
static int tipc_sk_leave(struct tipc_sock *tsk);
static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid);
static int tipc_sk_insert(struct tipc_sock *tsk);
static void tipc_sk_remove(struct tipc_sock *tsk);
static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz);
static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz);
static void tipc_sk_push_backlog(struct tipc_sock *tsk, bool nagle_ack);
static int tipc_wait_for_connect(struct socket *sock, long *timeo_p);

/* Forward declarations of file-local objects; initializers are not in this excerpt */
static const struct proto_ops packet_ops;
static const struct proto_ops stream_ops;
static const struct proto_ops msg_ops;
static struct proto tipc_proto;
static const struct rhashtable_params tsk_rht_params;

/* tsk_own_node - read the "previous node" field of the socket's
 * preformatted header, used in this file as the socket's own node address
 */
static u32 tsk_own_node(struct tipc_sock *tsk)
{
        struct tipc_msg *hdr = &tsk->phdr;

        return msg_prevnode(hdr);
}

/* tsk_peer_node - read the destination node stored in the socket's
 * preformatted header
 */
static u32 tsk_peer_node(struct tipc_sock *tsk)
{
        struct tipc_msg *hdr = &tsk->phdr;

        return msg_destnode(hdr);
}

/* tsk_peer_port - read the destination port stored in the socket's
 * preformatted header
 */
static u32 tsk_peer_port(struct tipc_sock *tsk)
{
        struct tipc_msg *hdr = &tsk->phdr;

        return msg_destport(hdr);
}

/* tsk_unreliable - true if the "source droppable" bit is set in the
 * socket's preformatted header
 */
static bool tsk_unreliable(struct tipc_sock *tsk)
{
        struct tipc_msg *hdr = &tsk->phdr;

        return !!msg_src_droppable(hdr);
}

/* tsk_set_unreliable - set (1) or clear (0) the "source droppable" bit in
 * the socket's preformatted header
 */
static void tsk_set_unreliable(struct tipc_sock *tsk, bool unreliable)
{
        struct tipc_msg *hdr = &tsk->phdr;

        if (unreliable)
                msg_set_src_droppable(hdr, 1);
        else
                msg_set_src_droppable(hdr, 0);
}

/* tsk_unreturnable - true if the "destination droppable" bit is set in the
 * socket's preformatted header
 */
static bool tsk_unreturnable(struct tipc_sock *tsk)
{
        struct tipc_msg *hdr = &tsk->phdr;

        return !!msg_dest_droppable(hdr);
}

/* tsk_set_unreturnable - set (1) or clear (0) the "destination droppable"
 * bit in the socket's preformatted header
 */
static void tsk_set_unreturnable(struct tipc_sock *tsk, bool unreturnable)
{
        struct tipc_msg *hdr = &tsk->phdr;

        if (unreturnable)
                msg_set_dest_droppable(hdr, 1);
        else
                msg_set_dest_droppable(hdr, 0);
}

/* tsk_importance - read the importance level stored in the socket's
 * preformatted header
 */
static int tsk_importance(struct tipc_sock *tsk)
{
        struct tipc_msg *hdr = &tsk->phdr;

        return msg_importance(hdr);
}

/* tipc_sk - map a generic socket to the struct tipc_sock that embeds it */
static struct tipc_sock *tipc_sk(const struct sock *sk)
{
        struct tipc_sock *tsk = container_of(sk, struct tipc_sock, sk);

        return tsk;
}

/**
 * tsk_set_importance - set the importance level in a socket's message header
 * @sk: socket whose preformatted header is updated
 * @imp: requested importance level, 0..TIPC_CRITICAL_IMPORTANCE
 *
 * Negative values are rejected along with values above
 * TIPC_CRITICAL_IMPORTANCE: @imp is cast to u32 before being stored, so a
 * negative level would otherwise be passed on as a large out-of-range value.
 *
 * Return: 0 on success, -EINVAL if @imp is out of range
 */
int tsk_set_importance(struct sock *sk, int imp)
{
        if (imp < 0 || imp > TIPC_CRITICAL_IMPORTANCE)
                return -EINVAL;
        msg_set_importance(&tipc_sk(sk)->phdr, (u32)imp);
        return 0;
}

/* tsk_conn_cong - true when the amount sent but not yet acked exceeds the
 * send window
 */
static bool tsk_conn_cong(struct tipc_sock *tsk)
{
        return tsk->snd_win < tsk->snt_unacked;
}

/* tsk_blocks - number of flow control blocks charged for @len bytes:
 * the count of whole FLOWCTL_BLK_SZ units in @len, plus one
 */
static u16 tsk_blocks(int len)
{
        int whole = len / FLOWCTL_BLK_SZ;

        return whole + 1;
}

/* tsk_adv_blocks(): translate a buffer size in bytes to number of
 * advertisable blocks, taking into account the ratio truesize(len)/len
 * We can trust that this ratio is always < 4 for len >= FLOWCTL_BLK_SZ,
 * hence the block count is divided by four.
 */
static u16 tsk_adv_blocks(int len)
{
        int blocks = len / FLOWCTL_BLK_SZ;

        return blocks / 4;
}

/* tsk_inc(): amount by which a sent/received counter grows for one message
 * - With block based flow control (TIPC_BLOCK_FLOWCTL in peer_caps) the
 *   message is charged one block plus one per whole FLOWCTL_BLK_SZ in it
 * - Otherwise we fall back to message based counting and charge exactly 1
 */
static u16 tsk_inc(struct tipc_sock *tsk, int msglen)
{
        int units = 1;

        if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL))
                units += msglen / FLOWCTL_BLK_SZ;
        return units;
}

/* tsk_set_nagle - recompute maxnagle, which switches nagle on or off
 *
 * maxnagle stays 0 unless the socket is SOCK_STREAM, TIPC_NODELAY is not
 * set and the peer advertises TIPC_NAGLE.
 */
static void tsk_set_nagle(struct tipc_sock *tsk)
{
        struct sock *sk = &tsk->sk;

        tsk->maxnagle = 0;
        if (sk->sk_type != SOCK_STREAM || tsk->nodelay ||
            !(tsk->peer_caps & TIPC_NAGLE))
                return;

        /* Limit node local buffer size to avoid receive queue overflow */
        tsk->maxnagle = (tsk->max_pkt == MAX_MSG_SIZE) ? 1500 : tsk->max_pkt;
}

/**
 * tsk_advance_rx_queue - drop the buffer at the head of the receive queue
 * @sk: network socket
 *
 * Emits the advance_rx trace event, then dequeues and frees the first
 * buffer. Caller must hold socket lock.
 */
static void tsk_advance_rx_queue(struct sock *sk)
{
        struct sk_buff *head;

        trace_tipc_sk_advance_rx(sk, NULL, TIPC_DUMP_SK_RCVQ, " ");
        head = __skb_dequeue(&sk->sk_receive_queue);
        kfree_skb(head);
}

/* tipc_sk_respond() : turn a received buffer around and send it back
 *
 * The buffer is reversed with error code @err; if tipc_msg_reverse()
 * reports failure nothing is sent. skb is passed by reference, so the
 * pointer used afterwards is whatever the reversal left in it.
 */
static void tipc_sk_respond(struct sock *sk, struct sk_buff *skb, int err)
{
        u32 own_node = tipc_own_addr(sock_net(sk));
        u32 dest;
        u32 sel;

        if (!tipc_msg_reverse(own_node, &skb, err))
                return;

        trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_NONE, "@sk_respond!");

        /* Route by the reversed header; origin port doubles as selector */
        dest = msg_destnode(buf_msg(skb));
        sel = msg_origport(buf_msg(skb));
        tipc_node_xmit_skb(sock_net(sk), skb, dest, sel);
}

/**
 * tsk_rej_rx_queue - drain the receive queue, rejecting every buffer
 * @sk: network socket
 * @error: response error code passed to tipc_sk_respond()
 *
 * Caller must hold socket lock
 */
static void tsk_rej_rx_queue(struct sock *sk, int error)
{
        struct sk_buff *skb = __skb_dequeue(&sk->sk_receive_queue);

        while (skb) {
                tipc_sk_respond(sk, skb, error);
                skb = __skb_dequeue(&sk->sk_receive_queue);
        }
}

/* tipc_sk_connected - true if the socket state is TIPC_ESTABLISHED
 *
 * The state is sampled once with READ_ONCE().
 */
static bool tipc_sk_connected(const struct sock *sk)
{
        int state = READ_ONCE(sk->sk_state);

        return state == TIPC_ESTABLISHED;
}

/* tipc_sk_type_connectionless - check for a connectionless socket type
 * @sk: socket
 *
 * Returns true for SOCK_DGRAM and SOCK_RDM sockets, false otherwise
 */
static bool tipc_sk_type_connectionless(struct sock *sk)
{
        int type = sk->sk_type;

        return type == SOCK_DGRAM || type == SOCK_RDM;
}

/* tsk_peer_msg - check that a message originates from the connected peer
 *
 * The socket must be connected and the message's origin port must match
 * the peer port. The origin node must then either equal the peer node, or
 * one of the two must be zero while the other equals our own address; the
 * latter covers a node whose network address has changed from the default
 * of <0.0.0> to its configured setting.
 */
static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg)
{
        struct sock *sk = &tsk->sk;
        u32 own_addr = tipc_own_addr(sock_net(sk));
        u32 expected_port = tsk_peer_port(tsk);
        u32 orig_node, peer_node;

        if (unlikely(!tipc_sk_connected(sk)))
                return false;
        if (unlikely(msg_origport(msg) != expected_port))
                return false;

        orig_node = msg_orignode(msg);
        peer_node = tsk_peer_node(tsk);
        if (likely(orig_node == peer_node))
                return true;

        /* Address mismatch is tolerated only across an own-address change */
        return (!orig_node && peer_node == own_addr) ||
               (!peer_node && orig_node == own_addr);
}

/* tipc_set_sk_state - move the socket to a new state if the transition
 * is permitted
 * @sk: socket
 * @state: requested new state
 *
 * Permitted transitions:
 *   any state                  -> TIPC_OPEN
 *   TIPC_OPEN                  -> TIPC_LISTEN, TIPC_CONNECTING
 *   TIPC_OPEN, TIPC_CONNECTING -> TIPC_ESTABLISHED
 *   TIPC_CONNECTING, TIPC_ESTABLISHED -> TIPC_DISCONNECTING
 *
 * Caller must hold socket lock
 *
 * Returns 0 on success, -EINVAL (state left untouched) otherwise
 */
static int tipc_set_sk_state(struct sock *sk, int state)
{
        int cur = sk->sk_state;

        switch (state) {
        case TIPC_OPEN:
                break;
        case TIPC_LISTEN:
        case TIPC_CONNECTING:
                if (cur != TIPC_OPEN)
                        return -EINVAL;
                break;
        case TIPC_ESTABLISHED:
                if (cur != TIPC_CONNECTING && cur != TIPC_OPEN)
                        return -EINVAL;
                break;
        case TIPC_DISCONNECTING:
                if (cur != TIPC_CONNECTING && cur != TIPC_ESTABLISHED)
                        return -EINVAL;
                break;
        default:
                return -EINVAL;
        }

        sk->sk_state = state;
        return 0;
}

/* tipc_sk_sock_err - decide whether a waiter may go (back) to sleep
 * @sock: socket being waited on
 * @timeout: remaining wait time
 *
 * Checked in order: the error reported by sock_error(); for SOCK_STREAM
 * and SOCK_SEQPACKET a disconnecting (-EPIPE) or not connected (-ENOTCONN)
 * socket; an exhausted timeout (-EAGAIN); a pending signal.
 *
 * Returns 0 if waiting may continue, a negative errno otherwise
 */
static int tipc_sk_sock_err(struct socket *sock, long *timeout)
{
        struct sock *sk = sock->sk;
        int rc = sock_error(sk);

        if (rc)
                return rc;

        switch (sock->type) {
        case SOCK_STREAM:
        case SOCK_SEQPACKET:
                if (sk->sk_state == TIPC_DISCONNECTING)
                        return -EPIPE;
                if (!tipc_sk_connected(sk))
                        return -ENOTCONN;
                break;
        default:
                break;
        }

        if (!*timeout)
                return -EAGAIN;

        return signal_pending(current) ? sock_intr_errno(*timeout) : 0;
}

/* tipc_wait_for_cond - sleep until condition_ becomes true
 * @sock_: struct socket pointer being waited on
 * @timeo_: pointer to the remaining timeout; overwritten with the value
 *          returned by wait_woken() after every sleep
 * @condition_: expression re-evaluated before each sleep
 *
 * Evaluates to 0 once condition_ holds, or to the non-zero result of
 * tipc_sk_sock_err() if waiting has to stop first.
 *
 * The macro calls release_sock() before sleeping and lock_sock() after
 * waking, so it must be entered with the socket lock held, and state read
 * by condition_ may have changed across the sleep.
 */
#define tipc_wait_for_cond(sock_, timeo_, condition_)                               \
({                                                                             \
        DEFINE_WAIT_FUNC(wait_, woken_wake_function);                          \
        struct sock *sk_;                                                       \
        int rc_;                                                               \
                                                                               \
        while ((rc_ = !(condition_))) {                                               \
                /* coupled with smp_wmb() in tipc_sk_proto_rcv() */            \
                smp_rmb();                                                     \
                sk_ = (sock_)->sk;                                               \
                rc_ = tipc_sk_sock_err((sock_), timeo_);                       \
                if (rc_)                                                       \
                        break;                                                       \
                add_wait_queue(sk_sleep(sk_), &wait_);                         \
                release_sock(sk_);                                               \
                *(timeo_) = wait_woken(&wait_, TASK_INTERRUPTIBLE, *(timeo_)); \
                sched_annotate_sleep();                                               \
                lock_sock(sk_);                                                       \
                remove_wait_queue(sk_sleep(sk_), &wait_);                       \
        }                                                                       \
        rc_;                                                                       \
})

/**
 * tipc_sk_create - create a TIPC socket
 * @net: network namespace (must be default network)
 * @sock: pre-allocated socket structure
 * @protocol: protocol indicator (must be 0)
 * @kern: caused by kernel or by userspace?
 *
 * This routine creates additional data structures used by the TIPC socket,
 * initializes them, and links them together.
 *
 * Return: 0 on success, errno otherwise (-EPROTONOSUPPORT for a non-zero
 * protocol, -EPROTOTYPE for an unsupported socket type, -ENOMEM if the
 * sock cannot be allocated, -EINVAL if tipc_sk_insert() fails)
 */
static int tipc_sk_create(struct net *net, struct socket *sock,
                          int protocol, int kern)
{
        const struct proto_ops *ops;
        struct sock *sk;
        struct tipc_sock *tsk;
        struct tipc_msg *msg;

        /* Validate arguments */
        if (unlikely(protocol != 0))
                return -EPROTONOSUPPORT;

        /* Pick the proto_ops table matching the socket type */
        switch (sock->type) {
        case SOCK_STREAM:
                ops = &stream_ops;
                break;
        case SOCK_SEQPACKET:
                ops = &packet_ops;
                break;
        case SOCK_DGRAM:
        case SOCK_RDM:
                ops = &msg_ops;
                break;
        default:
                return -EPROTOTYPE;
        }

        /* Allocate socket's protocol area */
        sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto, kern);
        if (sk == NULL)
                return -ENOMEM;

        tsk = tipc_sk(sk);
        tsk->max_pkt = MAX_PKT_DEFAULT;
        tsk->maxnagle = 0;
        tsk->nagle_start = NAGLE_START_INIT;
        INIT_LIST_HEAD(&tsk->publications);
        INIT_LIST_HEAD(&tsk->cong_links);
        msg = &tsk->phdr;

        /* Finish initializing socket data structures */
        sock->ops = ops;
        sock_init_data(sock, sk);
        tipc_set_sk_state(sk, TIPC_OPEN);
        /* tsk->portid is read right below, after a successful insert */
        if (tipc_sk_insert(tsk)) {
                sk_free(sk);
                pr_warn("Socket create failed; port number exhausted\n");
                return -EINVAL;
        }

        /* Ensure tsk is visible before we read own_addr. */
        smp_mb();

        /* Preformat the header every message sent from this socket starts from */
        tipc_msg_init(tipc_own_addr(net), msg, TIPC_LOW_IMPORTANCE,
                      TIPC_NAMED_MSG, NAMED_H_SIZE, 0);

        msg_set_origport(msg, tsk->portid);
        timer_setup(&sk->sk_timer, tipc_sk_timeout, 0);
        sk->sk_shutdown = 0;
        sk->sk_backlog_rcv = tipc_sk_backlog_rcv;
        sk->sk_rcvbuf = READ_ONCE(sysctl_tipc_rmem[1]);
        sk->sk_data_ready = tipc_data_ready;
        sk->sk_write_space = tipc_write_space;
        sk->sk_destruct = tipc_sock_destruct;
        tsk->conn_timeout = CONN_TIMEOUT_DEFAULT;
        tsk->group_is_open = true;
        atomic_set(&tsk->dupl_rcvcnt, 0);

        /* Start out with safe limits until we receive an advertised window */
        tsk->snd_win = tsk_adv_blocks(RCVBUF_MIN);
        tsk->rcv_win = tsk->snd_win;

        /* Connectionless sockets set the dest-droppable header bit;
         * SOCK_DGRAM sockets additionally set the src-droppable bit
         */
        if (tipc_sk_type_connectionless(sk)) {
                tsk_set_unreturnable(tsk, true);
                if (sock->type == SOCK_DGRAM)
                        tsk_set_unreliable(tsk, true);
        }
        __skb_queue_head_init(&tsk->mc_method.deferredq);
        trace_tipc_sk_create(sk, NULL, TIPC_DUMP_NONE, " ");
        return 0;
}

/* tipc_sk_callback - RCU callback queued by tipc_release(); drops one
 * reference on the socket that embeds @head
 */
static void tipc_sk_callback(struct rcu_head *head)
{
        struct sock *sk = &container_of(head, struct tipc_sock, rcu)->sk;

        sock_put(sk);
}

/* __tipc_shutdown - flush queues and notify the peer that we are going away
 * @sock: socket being shut down
 * @error: TIPC error code placed in rejected/terminating messages
 *
 * Caller should hold socket lock for the socket. Note that the lock is
 * dropped and re-taken while waiting inside tipc_wait_for_cond().
 */
static void __tipc_shutdown(struct socket *sock, int error)
{
        struct sock *sk = sock->sk;
        struct tipc_sock *tsk = tipc_sk(sk);
        struct net *net = sock_net(sk);
        long timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT);
        u32 dnode = tsk_peer_node(tsk);
        struct sk_buff *skb;

        /* Avoid that hi-prio shutdown msgs bypass msgs in link wakeup queue */
        /* The result of the wait is not checked; shutdown proceeds either way */
        tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt &&
                                            !tsk_conn_cong(tsk)));

        /* Push out delayed messages if in Nagle mode */
        tipc_sk_push_backlog(tsk, false);
        /* Remove pending SYN */
        __skb_queue_purge(&sk->sk_write_queue);

        /* Remove partially received buffer if any */
        skb = skb_peek(&sk->sk_receive_queue);
        if (skb && TIPC_SKB_CB(skb)->bytes_read) {
                __skb_unlink(skb, &sk->sk_receive_queue);
                kfree_skb(skb);
        }

        /* Reject all unreceived messages if connectionless */
        if (tipc_sk_type_connectionless(sk)) {
                tsk_rej_rx_queue(sk, error);
                return;
        }

        switch (sk->sk_state) {
        case TIPC_CONNECTING:
        case TIPC_ESTABLISHED:
                tipc_set_sk_state(sk, TIPC_DISCONNECTING);
                tipc_node_remove_conn(net, dnode, tsk->portid);
                /* Send a FIN+/- to its peer */
                /* If a message is still queued, reject the first one with
                 * @error and drop the rest ...
                 */
                skb = __skb_dequeue(&sk->sk_receive_queue);
                if (skb) {
                        __skb_queue_purge(&sk->sk_receive_queue);
                        tipc_sk_respond(sk, skb, error);
                        break;
                }
                /* ... otherwise build a connection message carrying @error */
                skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
                                      TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode,
                                      tsk_own_node(tsk), tsk_peer_port(tsk),
                                      tsk->portid, error);
                if (skb)
                        tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
                break;
        case TIPC_LISTEN:
                /* Reject all SYN messages */
                tsk_rej_rx_queue(sk, error);
                break;
        default:
                /* Any other state: silently discard what is queued */
                __skb_queue_purge(&sk->sk_receive_queue);
                break;
        }
}

/**
 * tipc_release - destroy a TIPC socket
 * @sock: socket to destroy
 *
 * This routine cleans up any messages that are still queued on the socket.
 * For DGRAM and RDM socket types, all queued messages are rejected.
 * For SEQPACKET and STREAM socket types, the first message is rejected
 * and any others are discarded.  (If the first message on a STREAM socket
 * is partially-read, it is discarded and the next one is rejected instead.)
 *
 * NOTE: Rejected messages are not necessarily returned to the sender!  They
 * are returned or discarded according to the "destination droppable" setting
 * specified for the message by the sender.
 *
 * Return: 0 on success, errno otherwise (every path visible in this
 * routine returns 0)
 */
static int tipc_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct tipc_sock *tsk;

        /*
         * Exit if socket isn't fully initialized (occurs when a failed accept()
         * releases a pre-allocated child socket that was never used)
         */
        if (sk == NULL)
                return 0;

        tsk = tipc_sk(sk);
        lock_sock(sk);

        trace_tipc_sk_release(sk, NULL, TIPC_DUMP_ALL, " ");
        /* Flush queues and notify the peer; messages are rejected with
         * TIPC_ERR_NO_PORT
         */
        __tipc_shutdown(sock, TIPC_ERR_NO_PORT);
        sk->sk_shutdown = SHUTDOWN_MASK;
        /* Leave the group, withdraw all bound names, stop the timer and
         * remove the socket via tipc_sk_remove()
         */
        tipc_sk_leave(tsk);
        tipc_sk_withdraw(tsk, NULL);
        __skb_queue_purge(&tsk->mc_method.deferredq);
        sk_stop_timer(sk, &sk->sk_timer);
        tipc_sk_remove(tsk);

        sock_orphan(sk);
        /* Reject any messages that accumulated in backlog queue */
        release_sock(sk);
        tipc_dest_list_purge(&tsk->cong_links);
        tsk->cong_link_cnt = 0;
        /* Defer the sock_put() done by tipc_sk_callback() through RCU */
        call_rcu(&tsk->rcu, tipc_sk_callback);
        sock->sk = NULL;

        return 0;
}

/**
 * __tipc_bind - associate or disassociate TIPC name(s) with a socket
 * @sock: socket structure
 * @skaddr: socket address describing name(s) and desired operation
 * @alen: size of socket address data structure
 *
 * Name and name sequence binding are indicated using a positive scope value;
 * a negative scope value unbinds the specified name.  Specifying no name
 * (i.e. a socket address length of 0) unbinds all names from the socket.
 *
 * The address in @skaddr is normalized in place: a service address becomes
 * a one-element service range, a negative scope is made positive, and any
 * scope other than TIPC_NODE_SCOPE is replaced by TIPC_CLUSTER_SCOPE.
 *
 * Return: 0 on success, errno otherwise (-EACCES if the socket has joined
 * a group)
 *
 * NOTE: tipc_sk_bind() calls this routine with the socket lock held.
 */
static int __tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen)
{
        struct tipc_sock *tsk = tipc_sk(sock->sk);
        struct tipc_uaddr *ua = (struct tipc_uaddr *)skaddr;
        bool unbind;

        /* No address at all: withdraw every name bound to the socket */
        if (unlikely(!alen))
                return tipc_sk_withdraw(tsk, NULL);

        /* A single service address is handled as a range of one */
        if (ua->addrtype == TIPC_SERVICE_ADDR) {
                ua->addrtype = TIPC_SERVICE_RANGE;
                ua->sr.upper = ua->sr.lower;
        }

        /* The sign of the scope selects bind vs. unbind */
        unbind = ua->scope < 0;
        if (unbind)
                ua->scope = -ua->scope;

        /* Users may still use deprecated TIPC_ZONE_SCOPE */
        if (ua->scope != TIPC_NODE_SCOPE)
                ua->scope = TIPC_CLUSTER_SCOPE;

        if (tsk->group)
                return -EACCES;

        return unbind ? tipc_sk_withdraw(tsk, ua) : tipc_sk_publish(tsk, ua);
}

/* tipc_sk_bind - run __tipc_bind() with the socket lock held
 *
 * Performs no validation of @skaddr itself; tipc_bind() validates before
 * calling here.
 *
 * Returns the result of __tipc_bind()
 */
int tipc_sk_bind(struct socket *sock, struct sockaddr *skaddr, int alen)
{
        struct sock *sk = sock->sk;
        int rc;

        lock_sock(sk);
        rc = __tipc_bind(sock, skaddr, alen);
        release_sock(sk);

        return rc;
}

/* tipc_bind - validate a bind address, then bind/unbind under the lock
 * @sock: socket structure
 * @skaddr: socket address describing name(s) and desired operation
 * @alen: size of socket address data structure; 0 means "unbind all"
 *
 * The address is only inspected when @alen is non-zero, and its fields are
 * only read after tipc_uaddr_valid() has accepted it.
 *
 * Returns -EINVAL for an invalid address, -EAFNOSUPPORT for a socket
 * address (TIPC_SOCKET_ADDR), -EACCES for a reserved service type,
 * otherwise the result of tipc_sk_bind().
 */
static int tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen)
{
        struct tipc_uaddr *ua = (struct tipc_uaddr *)skaddr;

        if (alen) {
                if (!tipc_uaddr_valid(ua, alen))
                        return -EINVAL;
                if (ua->addrtype == TIPC_SOCKET_ADDR)
                        return -EAFNOSUPPORT;
                if (ua->sr.type < TIPC_RESERVED_TYPES) {
                        pr_warn_once("Can't bind to reserved service type %u\n",
                                     ua->sr.type);
                        return -EACCES;
                }
        }
        return tipc_sk_bind(sock, skaddr, alen);
}

/**
 * tipc_getname - get port ID of socket or peer socket
 * @sock: socket structure
 * @uaddr: area for returned socket address
 * @peer: 0 = own ID, 1 = current peer ID, 2 = current/former peer ID
 *
 * @uaddr is zeroed before anything else, including on the error path.
 *
 * Return: size of the returned address on success, -ENOTCONN if a peer ID
 * is requested but the socket is not connected (with @peer == 2 a socket
 * in state TIPC_DISCONNECTING is accepted as well)
 *
 * NOTE: This routine does not take the socket lock.
 */
static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
                        int peer)
{
        struct sock *sk = sock->sk;
        struct tipc_sock *tsk = tipc_sk(sk);
        struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr;

        memset(addr, 0, sizeof(*addr));

        if (!peer) {
                addr->addr.id.ref = tsk->portid;
                addr->addr.id.node = tipc_own_addr(sock_net(sk));
        } else {
                if (!tipc_sk_connected(sk) &&
                    (peer != 2 || sk->sk_state != TIPC_DISCONNECTING))
                        return -ENOTCONN;
                addr->addr.id.ref = tsk_peer_port(tsk);
                addr->addr.id.node = tsk_peer_node(tsk);
        }

        addr->addrtype = TIPC_SOCKET_ADDR;
        addr->family = AF_TIPC;
        addr->scope = 0;
        addr->addr.name.domain = 0;

        return sizeof(*addr);
}

/**
 * tipc_poll - read and possibly block on pollmask
 * @file: file structure associated with the socket
 * @sock: socket for which to calculate the poll bits
 * @wait: poll table, handed on to sock_poll_wait()
 *
 * Return: pollmask value
 *
 * COMMENTARY:
 * The socket lock is not taken here: the pollmask info is potentially
 * out-of-date the moment this routine exits anyway.  TCP and other
 * protocols seem to rely on higher level poll routines to handle any
 * preventable race conditions, so TIPC does the same.
 *
 * IMPORTANT: The fact that a read or write operation is indicated does NOT
 * imply that the operation will succeed, merely that it should be performed
 * and will not block.
 */
static __poll_t tipc_poll(struct file *file, struct socket *sock,
                              poll_table *wait)
{
        struct sock *sk = sock->sk;
        struct tipc_sock *tsk = tipc_sk(sk);
        __poll_t mask = 0;

        sock_poll_wait(file, sock, wait);
        trace_tipc_sk_poll(sk, NULL, TIPC_DUMP_ALL, " ");

        if (sk->sk_shutdown & RCV_SHUTDOWN)
                mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
        if (sk->sk_shutdown == SHUTDOWN_MASK)
                mask |= EPOLLHUP;

        switch (sk->sk_state) {
        case TIPC_ESTABLISHED:
                /* Writable unless a link or the connection is congested */
                if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))
                        mask |= EPOLLOUT;
                fallthrough;
        case TIPC_LISTEN:
        case TIPC_CONNECTING:
                if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
                        mask |= EPOLLIN | EPOLLRDNORM;
                break;
        case TIPC_OPEN:
                if (tsk->group_is_open && !tsk->cong_link_cnt)
                        mask |= EPOLLOUT;
                /* Only connectionless sockets can be readable while open */
                if (tipc_sk_type_connectionless(sk) &&
                    !skb_queue_empty_lockless(&sk->sk_receive_queue))
                        mask |= EPOLLIN | EPOLLRDNORM;
                break;
        case TIPC_DISCONNECTING:
                /* Replaces, rather than extends, the bits gathered so far */
                mask = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
                break;
        }
        return mask;
}

/**
 * tipc_sendmcast - send multicast message
 * @sock: socket structure
 * @ua: destination address struct
 * @msg: message to send
 * @dlen: length of data to send
 * @timeout: timeout to wait for wakeup
 *
 * Called from function tipc_sendmsg(), which has done all sanity checks
 * Return: the number of bytes sent on success, or errno (-EACCES if the
 * socket has joined a group, -EHOSTUNREACH if no destination node matches)
 */
static int tipc_sendmcast(struct  socket *sock, struct tipc_uaddr *ua,
                          struct msghdr *msg, size_t dlen, long timeout)
{
        struct sock *sk = sock->sk;
        struct tipc_sock *tsk = tipc_sk(sk);
        struct tipc_msg *hdr = &tsk->phdr;
        struct net *net = sock_net(sk);
        int mtu = tipc_bcast_get_mtu(net);
        struct sk_buff_head pkts;
        struct tipc_nlist dsts;
        int rc;

        if (tsk->group)
                return -EACCES;

        /* Block or return if any destination link is congested */
        rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt);
        if (unlikely(rc))
                return rc;

        /* Lookup destination nodes */
        tipc_nlist_init(&dsts, tipc_own_addr(net));
        tipc_nametbl_lookup_mcast_nodes(net, ua, &dsts);
        if (!dsts.local && !dsts.remote)
                return -EHOSTUNREACH;

        /* Build message header */
        msg_set_type(hdr, TIPC_MCAST_MSG);
        msg_set_hdr_sz(hdr, MCAST_H_SIZE);
        msg_set_lookup_scope(hdr, TIPC_CLUSTER_SCOPE);
        msg_set_destport(hdr, 0);
        msg_set_destnode(hdr, 0);
        msg_set_nametype(hdr, ua->sr.type);
        msg_set_namelower(hdr, ua->sr.lower);
        msg_set_nameupper(hdr, ua->sr.upper);

        /* Build message as chain of buffers */
        __skb_queue_head_init(&pkts);
        rc = tipc_msg_build(hdr, msg, 0, dlen, mtu, &pkts);

        /* A failed build is the unlikely case; skip transmission then */
        if (unlikely(rc != dlen))
                goto out;

        /* Send message */
        trace_tipc_sk_sendmcast(sk, skb_peek(&pkts),
                                TIPC_DUMP_SK_SNDQ, " ");
        rc = tipc_mcast_xmit(net, &pkts, &tsk->mc_method, &dsts,
                             &tsk->cong_link_cnt);
out:
        tipc_nlist_purge(&dsts);

        return rc ? rc : dlen;
}

/**
 * tipc_send_group_msg - send a message to a member in the group
 * @net: network namespace
 * @tsk: tipc socket
 * @m: message to send
 * @mb: group member
 * @dnode: destination node
 * @dport: destination port
 * @dlen: total length of message data
 *
 * Return: @dlen once the message has been built and handed to
 * tipc_node_xmit(), whatever that call returns; otherwise the result of
 * the failed tipc_msg_build()
 */
static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk,
                               struct msghdr *m, struct tipc_member *mb,
                               u32 dnode, u32 dport, int dlen)
{
        struct tipc_mc_method *method = &tsk->mc_method;
        struct tipc_msg *hdr = &tsk->phdr;
        u16 seqno = tipc_group_bc_snd_nxt(tsk->group);
        int blocks = tsk_blocks(GROUP_H_SIZE + dlen);
        struct sk_buff_head pkts;
        int mtu;
        int rc;

        /* Complete message header */
        msg_set_type(hdr, TIPC_GRP_UCAST_MSG);
        msg_set_hdr_sz(hdr, GROUP_H_SIZE);
        msg_set_destport(hdr, dport);
        msg_set_destnode(hdr, dnode);
        msg_set_grp_bc_seqno(hdr, seqno);

        /* Build message as chain of buffers */
        __skb_queue_head_init(&pkts);
        mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false);
        rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
        if (unlikely(rc != dlen))
                return rc;

        /* Send message; a congested link is recorded on the socket */
        rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
        if (unlikely(rc == -ELINKCONG)) {
                tipc_dest_push(&tsk->cong_links, dnode, 0);
                tsk->cong_link_cnt++;
        }

        /* Update send window */
        tipc_group_update_member(mb, blocks);

        /* A broadcast sent within next EXPIRE period must follow same path */
        method->rcast = true;
        method->mandatory = true;

        return dlen;
}

/**
 * tipc_send_group_unicast - send message to a member in the group
 * @sock: socket structure
 * @m: message to send; m->msg_name holds the destination socket address
 * @dlen: total length of message data
 * @timeout: timeout to wait for wakeup
 *
 * Called from function tipc_sendmsg(), which has done all sanity checks
 * Return: the number of bytes sent on success, or errno (-EHOSTUNREACH if
 * the address is all zero or no matching member was found)
 */
static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m,
                                   int dlen, long timeout)
{
        struct tipc_uaddr *ua = (struct tipc_uaddr *)m->msg_name;
        struct sock *sk = sock->sk;
        struct tipc_sock *tsk = tipc_sk(sk);
        struct net *net = sock_net(sk);
        int blocks = tsk_blocks(GROUP_H_SIZE + dlen);
        struct tipc_member *member = NULL;
        u32 node = ua->sk.node;
        u32 port = ua->sk.ref;
        int rc;

        if (!node && !port)
                return -EHOSTUNREACH;

        /* Block or return if destination link or member is congested */
        rc = tipc_wait_for_cond(sock, &timeout,
                                !tipc_dest_find(&tsk->cong_links, node, 0) &&
                                tsk->group &&
                                !tipc_group_cong(tsk->group, node, port,
                                                 blocks, &member));
        if (unlikely(rc))
                return rc;

        /* The congestion check also reports the member, if there is one */
        if (unlikely(!member))
                return -EHOSTUNREACH;

        rc = tipc_send_group_msg(net, tsk, m, member, node, port, dlen);
        if (rc)
                return rc;

        return dlen;
}

/**
 * tipc_send_group_anycast - send message to any member with given identity
 * @sock: socket structure
 * @m: message to send
 * @dlen: total length of message data
 * @timeout: timeout to wait for wakeup
 *
 * Makes at most three lookup rounds for a usable destination before giving
 * up with -EHOSTUNREACH.
 *
 * Called from function tipc_sendmsg(), which has done all sanity checks
 * Return: the number of bytes sent on success, or errno
 */
static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m,
                                   int dlen, long timeout)
{
        struct tipc_uaddr *ua = (struct tipc_uaddr *)m->msg_name;
        struct sock *sk = sock->sk;
        struct tipc_sock *tsk = tipc_sk(sk);
        struct list_head *cong_links = &tsk->cong_links;
        int blks = tsk_blocks(GROUP_H_SIZE + dlen);
        struct tipc_msg *hdr = &tsk->phdr;
        struct tipc_member *first = NULL;
        struct tipc_member *mbr = NULL;
        struct net *net = sock_net(sk);
        u32 node, port, exclude;
        struct list_head dsts;
        int lookups = 0;
        int dstcnt, rc;
        bool cong;

        INIT_LIST_HEAD(&dsts);
        /* Service type and lookup scope are taken from the socket's header */
        ua->sa.type = msg_nametype(hdr);
        ua->scope = msg_lookup_scope(hdr);

        /* lookups runs 1..3 inside the loop; reaching 4 means giving up */
        while (++lookups < 4) {
                exclude = tipc_group_exclude(tsk->group);

                first = NULL;

                /* Look for a non-congested destination member, if any */
                while (1) {
                        if (!tipc_nametbl_lookup_group(net, ua, &dsts, &dstcnt,
                                                       exclude, false))
                                return -EHOSTUNREACH;
                        tipc_dest_pop(&dsts, &node, &port);
                        cong = tipc_group_cong(tsk->group, node, port, blks,
                                               &mbr);
                        if (!cong)
                                break;
                        /* Seeing the first congested member again means
                         * every candidate has been tried
                         */
                        if (mbr == first)
                                break;
                        if (!first)
                                first = mbr;
                }

                /* Start over if destination was not in member list */
                if (unlikely(!mbr))
                        continue;

                if (likely(!cong && !tipc_dest_find(cong_links, node, 0)))
                        break;

                /* Block or return if destination link or member is congested */
                rc = tipc_wait_for_cond(sock, &timeout,
                                        !tipc_dest_find(cong_links, node, 0) &&
                                        tsk->group &&
                                        !tipc_group_cong(tsk->group, node, port,
                                                         blks, &mbr));
                if (unlikely(rc))
                        return rc;

                /* Send, unless destination disappeared while waiting */
                if (likely(mbr))
                        break;
        }

        /* The loop ran out of rounds without settling on a destination */
        if (unlikely(lookups >= 4))
                return -EHOSTUNREACH;

        rc = tipc_send_group_msg(net, tsk, m, mbr, node, port, dlen);

        return rc ? rc : dlen;
}

/**
 * tipc_send_group_bcast - send message to all members in communication group
 * @sock: socket structure
 * @m: message to send
 * @dlen: total length of message data
 * @timeout: timeout to wait for wakeup
 *
 * The message goes out as a group multicast when @m carries a destination
 * address and as a group broadcast when it does not.
 *
 * Called from function tipc_sendmsg(), which has done all sanity checks
 * Return: the number of bytes sent on success, or errno
 */
static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m,
                                 int dlen, long timeout)
{
        struct tipc_uaddr *ua = (struct tipc_uaddr *)m->msg_name;
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct tipc_sock *tsk = tipc_sk(sk);
        struct tipc_mc_method *method = &tsk->mc_method;
        struct tipc_msg *hdr = &tsk->phdr;
        bool ack = method->mandatory && method->rcast;
        int blks = tsk_blocks(MCAST_H_SIZE + dlen);
        int mtu = tipc_bcast_get_mtu(net);
        struct tipc_nlist *dsts;
        struct sk_buff_head pkts;
        int rc;

        /* Block or return if any destination link or member is congested */
        rc = tipc_wait_for_cond(sock, &timeout,
                                !tsk->cong_link_cnt && tsk->group &&
                                !tipc_group_bc_cong(tsk->group, blks));
        if (unlikely(rc))
                return rc;

        /* Nobody to send to, neither locally nor remotely */
        dsts = tipc_group_dests(tsk->group);
        if (!dsts->local && !dsts->remote)
                return -EHOSTUNREACH;

        /* Complete message header */
        msg_set_type(hdr, ua ? TIPC_GRP_MCAST_MSG : TIPC_GRP_BCAST_MSG);
        msg_set_nameinst(hdr, ua ? ua->sa.instance : 0);
        msg_set_hdr_sz(hdr, GROUP_H_SIZE);
        msg_set_destport(hdr, 0);
        msg_set_destnode(hdr, 0);
        msg_set_grp_bc_seqno(hdr, tipc_group_bc_snd_nxt(tsk->group));

        /* Avoid getting stuck with repeated forced replicasts */
        msg_set_grp_bc_ack_req(hdr, ack);

        /* Build message as chain of buffers */
        __skb_queue_head_init(&pkts);
        rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
        if (unlikely(rc != dlen))
                return rc;

        /* Send message */
        rc = tipc_mcast_xmit(net, &pkts, method, dsts, &tsk->cong_link_cnt);
        if (unlikely(rc))
                return rc;

        /* Update broadcast sequence number and send windows */
        tipc_group_update_bc_members(tsk->group, blks, ack);

        /* Broadcast link is now free to choose method for next broadcast */
        method->mandatory = false;
        method->expires = jiffies;

        return dlen;
}

/**
 * tipc_send_group_mcast - send message to all members with given identity
 * @sock: socket structure
 * @m: message to send
 * @dlen: total length of message data
 * @timeout: timeout to wait for wakeup
 *
 * Looks up the matching group members first. A single match is sent as a
 * group unicast to that member, anything else as a group broadcast.
 *
 * Called from function tipc_sendmsg(), which has done all sanity checks
 * Return: the number of bytes sent on success, or errno
 */
static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m,
                                 int dlen, long timeout)
{
        struct tipc_uaddr *ua = (struct tipc_uaddr *)m->msg_name;
        struct sock *sk = sock->sk;
        struct tipc_sock *tsk = tipc_sk(sk);
        struct tipc_msg *hdr = &tsk->phdr;
        struct list_head dsts;
        u32 dstcnt, exclude;

        INIT_LIST_HEAD(&dsts);

        /* Service type and lookup scope come from the socket's header */
        ua->sa.type = msg_nametype(hdr);
        ua->scope = msg_lookup_scope(hdr);
        exclude = tipc_group_exclude(tsk->group);

        if (!tipc_nametbl_lookup_group(sock_net(sk), ua, &dsts, &dstcnt,
                                       exclude, true))
                return -EHOSTUNREACH;

        if (dstcnt != 1) {
                tipc_dest_list_purge(&dsts);
                return tipc_send_group_bcast(sock, m, dlen, timeout);
        }

        /* Exactly one destination: address it directly */
        tipc_dest_pop(&dsts, &ua->sk.node, &ua->sk.ref);
        return tipc_send_group_unicast(sock, m, dlen, timeout);
}

/**
 * tipc_sk_mcast_rcv - Deliver multicast messages to all destination sockets
 * @net: the associated network namespace
 * @arrvq: queue with arriving messages, to be cloned after destination lookup
 * @inputq: queue with cloned messages, delivered to socket after dest lookup
 *
 * Each message at the head of @arrvq is either moved to @inputq unchanged
 * (group unicast and group protocol messages) or copied once per destination
 * port returned by the name table lookup. @inputq is finally passed to
 * tipc_sk_rcv(). Both queues are manipulated under @inputq->lock.
 *
 * Multi-threaded: parallel calls with reference to same queues may occur
 */
void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
                       struct sk_buff_head *inputq)
{
        u32 self = tipc_own_addr(net);
        struct sk_buff *skb, *_skb;
        u32 portid, onode;
        struct sk_buff_head tmpq;
        struct list_head dports;
        struct tipc_msg *hdr;
        struct tipc_uaddr ua;
        int user, mtyp, hlen;

        __skb_queue_head_init(&tmpq);
        INIT_LIST_HEAD(&dports);
        ua.addrtype = TIPC_SERVICE_RANGE;

        /* tipc_skb_peek() increments the head skb's reference counter */
        skb = tipc_skb_peek(arrvq, &inputq->lock);
        for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) {
                hdr = buf_msg(skb);
                user = msg_user(hdr);
                mtyp = msg_type(hdr);
                /* Passed to __pskb_copy() for every per-destination copy */
                hlen = skb_headroom(skb) + msg_hdr_sz(hdr);
                onode = msg_orignode(hdr);
                ua.sr.type = msg_nametype(hdr);
                ua.sr.lower = msg_namelower(hdr);
                ua.sr.upper = msg_nameupper(hdr);
                /* Messages from other nodes are looked up with cluster scope */
                if (onode == self)
                        ua.scope = TIPC_ANY_SCOPE;
                else
                        ua.scope = TIPC_CLUSTER_SCOPE;

                /* No destination lookup for these: move the buffer itself to
                 * inputq if it is still head of arrvq, then drop the reference
                 * taken by tipc_skb_peek()
                 */
                if (mtyp == TIPC_GRP_UCAST_MSG || user == GROUP_PROTOCOL) {
                        spin_lock_bh(&inputq->lock);
                        if (skb_peek(arrvq) == skb) {
                                __skb_dequeue(arrvq);
                                __skb_queue_tail(inputq, skb);
                        }
                        kfree_skb(skb);
                        spin_unlock_bh(&inputq->lock);
                        continue;
                }

                /* Group messages require exact scope match */
                if (msg_in_group(hdr)) {
                        ua.sr.lower = 0;
                        ua.sr.upper = ~0;
                        ua.scope = msg_lookup_scope(hdr);
                }

                /* Create destination port list: */
                tipc_nametbl_lookup_mcast_sockets(net, &ua, &dports);

                /* Clone message per destination */
                while (tipc_dest_pop(&dports, NULL, &portid)) {
                        _skb = __pskb_copy(skb, hlen, GFP_ATOMIC);
                        if (_skb) {
                                msg_set_destport(buf_msg(_skb), portid);
                                __skb_queue_tail(&tmpq, _skb);
                                continue;
                        }
                        /* Copy failed: this destination gets no message */
                        pr_warn("Failed to clone mcast rcv buffer\n");
                }
                /* Append clones to inputq only if skb is still head of arrvq */
                spin_lock_bh(&inputq->lock);
                if (skb_peek(arrvq) == skb) {
                        skb_queue_splice_tail_init(&tmpq, inputq);
                        /* Decrement the skb's refcnt */
                        kfree_skb(__skb_dequeue(arrvq));
                }
                spin_unlock_bh(&inputq->lock);
                /* Clones are left in tmpq only when the splice was skipped */
                __skb_queue_purge(&tmpq);
                /* Drop the reference taken by tipc_skb_peek() */
                kfree_skb(skb);
        }
        tipc_sk_rcv(net, inputq);
}

/* tipc_sk_push_backlog(): send accumulated buffers in socket write queue
 *                         when socket is in Nagle mode
 * @tsk: socket whose write queue is to be flushed
 * @nagle_ack: true when triggered by a Nagle ack; the accumulation
 *             statistics are then evaluated and reset before sending
 */
static void tipc_sk_push_backlog(struct tipc_sock *tsk, bool nagle_ack)
{
        struct sk_buff_head *txq = &tsk->sk.sk_write_queue;
        struct sk_buff *tail = skb_peek_tail(txq);
        struct net *net = sock_net(&tsk->sk);
        u32 dnode = tsk_peer_node(tsk);

        if (nagle_ack) {
                tsk->pkt_cnt += skb_queue_len(txq);
                if (tsk->pkt_cnt && tsk->msg_acc / tsk->pkt_cnt >= 2) {
                        /* At least two messages per packet on average */
                        tsk->nagle_start = NAGLE_START_INIT;
                        if (tail)
                                msg_set_ack_required(buf_msg(tail));
                        tsk->expect_ack = tail != NULL;
                } else {
                        /* Poor ratio: leave Nagle mode, back off next start */
                        tsk->oneway = 0;
                        if (tsk->nagle_start < NAGLE_START_MAX)
                                tsk->nagle_start *= 2;
                        tsk->expect_ack = false;
                        pr_debug("tsk %10u: bad nagle %u -> %u, next start %u!\n",
                                 tsk->portid, tsk->msg_acc, tsk->pkt_cnt,
                                 tsk->nagle_start);
                }
                tsk->msg_acc = 0;
                tsk->pkt_cnt = 0;
        }

        /* Nothing queued, link congested, or SYN which must not be sent
         * again after congestion
         */
        if (!tail || tsk->cong_link_cnt || msg_is_syn(buf_msg(tail)))
                return;

        if (tsk->msg_acc)
                tsk->pkt_cnt += skb_queue_len(txq);
        tsk->snt_unacked += tsk->snd_backlog;
        tsk->snd_backlog = 0;
        if (tipc_node_xmit(net, txq, dnode, tsk->portid) == -ELINKCONG)
                tsk->cong_link_cnt = 1;
}

/**
 * tipc_sk_conn_proto_rcv - receive a connection mng protocol message
 * @tsk: receiving socket
 * @skb: pointer to message buffer.
 * @inputq: buffer list containing the buffers
 * @xmitq: output message area
 *
 * @skb is freed here unless it has been queued on @inputq (errored message
 * converted to an abort message) or handed on as a probe reply.
 */
static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
                                   struct sk_buff_head *inputq,
                                   struct sk_buff_head *xmitq)
{
        struct tipc_msg *hdr = buf_msg(skb);
        u32 onode = tsk_own_node(tsk);
        struct sock *sk = &tsk->sk;
        int mtyp = msg_type(hdr);

        /* Ignore if connection cannot be validated: */
        if (!tsk_peer_msg(tsk, hdr)) {
                trace_tipc_sk_drop_msg(sk, skb, TIPC_DUMP_NONE, "@proto_rcv!");
                goto drop;
        }

        if (unlikely(msg_errcode(hdr))) {
                tipc_set_sk_state(sk, TIPC_DISCONNECTING);
                tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk),
                                      tsk_peer_port(tsk));
                sk->sk_state_change(sk);

                /* State change is ignored if socket already awake,
                 * - convert msg to abort msg and add to inqueue
                 */
                msg_set_user(hdr, TIPC_CRITICAL_IMPORTANCE);
                msg_set_type(hdr, TIPC_CONN_MSG);
                msg_set_size(hdr, BASIC_H_SIZE);
                msg_set_hdr_sz(hdr, BASIC_H_SIZE);
                __skb_queue_tail(inputq, skb);
                return;
        }

        tsk->probe_unacked = false;

        switch (mtyp) {
        case CONN_PROBE:
                /* Answer the probe: reverse the message and queue it on xmitq */
                msg_set_type(hdr, CONN_PROBE_REPLY);
                if (tipc_msg_reverse(onode, &skb, TIPC_OK))
                        __skb_queue_tail(xmitq, skb);
                return;
        case CONN_ACK: {
                bool was_cong = tsk_conn_cong(tsk);

                tipc_sk_push_backlog(tsk, msg_nagle_ack(hdr));
                tsk->snt_unacked -= msg_conn_ack(hdr);
                if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL)
                        tsk->snd_win = msg_adv_win(hdr);
                /* Wake up writers once the connection is no longer congested */
                if (was_cong && !tsk_conn_cong(tsk))
                        sk->sk_write_space(sk);
                break;
        }
        case CONN_PROBE_REPLY:
                break;
        default:
                pr_warn("Received unknown CONN_PROTO msg\n");
                break;
        }
drop:
        kfree_skb(skb);
}

/**
 * tipc_sendmsg - send message in connectionless manner
 * @sock: socket structure
 * @m: message to send
 * @dsz: amount of user data to be sent
 *
 * Message must have a destination specified explicitly.
 * Used for SOCK_RDM and SOCK_DGRAM messages,
 * and for 'SYN' messages on SOCK_SEQPACKET and SOCK_STREAM connections.
 * (Note: 'SYN+' is prohibited on SOCK_STREAM.)
 *
 * Takes the socket lock around __tipc_sendmsg(), which does the actual work.
 *
 * Return: the number of bytes sent on success, or errno otherwise
 */
static int tipc_sendmsg(struct socket *sock,
                        struct msghdr *m, size_t dsz)
{
        struct sock *sk = sock->sk;
        int sent;

        lock_sock(sk);
        sent = __tipc_sendmsg(sock, m, dsz);
        release_sock(sk);

        return sent;
}

/**
 * __tipc_sendmsg - send message in connectionless manner, socket lock held
 * @sock: socket structure
 * @m: message to send
 * @dlen: amount of user data to be sent
 *
 * Does the work for tipc_sendmsg(), which wraps this call in
 * lock_sock()/release_sock(). Also called from __tipc_sendstream() to set up
 * a connection implicitly.
 *
 * Sockets belonging to a communication group are dispatched to the
 * tipc_send_group_*() functions according to the destination address type.
 * For other sockets a missing destination address is replaced by the
 * socket's stored peer address.
 *
 * Return: the number of bytes sent on success, or errno otherwise
 */
static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct tipc_sock *tsk = tipc_sk(sk);
        struct tipc_uaddr *ua = (struct tipc_uaddr *)m->msg_name;
        long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
        struct list_head *clinks = &tsk->cong_links;
        /* On connection oriented socket types this message acts as SYN */
        bool syn = !tipc_sk_type_connectionless(sk);
        struct tipc_group *grp = tsk->group;
        struct tipc_msg *hdr = &tsk->phdr;
        struct tipc_socket_addr skaddr;
        struct sk_buff_head pkts;
        int atype, mtu, rc;

        if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE))
                return -EMSGSIZE;

        if (ua) {
                if (!tipc_uaddr_valid(ua, m->msg_namelen))
                        return -EINVAL;
                atype = ua->addrtype;
        }

        /* If socket belongs to a communication group follow other paths */
        if (grp) {
                if (!ua)
                        return tipc_send_group_bcast(sock, m, dlen, timeout);
                if (atype == TIPC_SERVICE_ADDR)
                        return tipc_send_group_anycast(sock, m, dlen, timeout);
                if (atype == TIPC_SOCKET_ADDR)
                        return tipc_send_group_unicast(sock, m, dlen, timeout);
                if (atype == TIPC_SERVICE_RANGE)
                        return tipc_send_group_mcast(sock, m, dlen, timeout);
                return -EINVAL;
        }

        /* No explicit destination: fall back to the socket's peer address */
        if (!ua) {
                ua = (struct tipc_uaddr *)&tsk->peer;
                if (!syn && ua->family != AF_TIPC)
                        return -EDESTADDRREQ;
                atype = ua->addrtype;
        }

        /* A SYN may only be sent from an open, unpublished socket */
        if (unlikely(syn)) {
                if (sk->sk_state == TIPC_LISTEN)
                        return -EPIPE;
                if (sk->sk_state != TIPC_OPEN)
                        return -EISCONN;
                if (tsk->published)
                        return -EOPNOTSUPP;
                if (atype == TIPC_SERVICE_ADDR)
                        tsk->conn_addrtype = atype;
                msg_set_syn(hdr, 1);
        }

        memset(&skaddr, 0, sizeof(skaddr));

        /* Determine destination */
        if (atype == TIPC_SERVICE_RANGE) {
                return tipc_sendmcast(sock, ua, m, dlen, timeout);
        } else if (atype == TIPC_SERVICE_ADDR) {
                skaddr.node = ua->lookup_node;
                ua->scope = tipc_node2scope(skaddr.node);
                if (!tipc_nametbl_lookup_anycast(net, ua, &skaddr))
                        return -EHOSTUNREACH;
        } else if (atype == TIPC_SOCKET_ADDR) {
                skaddr = ua->sk;
        } else {
                return -EINVAL;
        }

        /* Block or return if destination link is congested */
        rc = tipc_wait_for_cond(sock, &timeout,
                                !tipc_dest_find(clinks, skaddr.node, 0));
        if (unlikely(rc))
                return rc;

        /* Finally build message header */
        msg_set_destnode(hdr, skaddr.node);
        msg_set_destport(hdr, skaddr.ref);
        if (atype == TIPC_SERVICE_ADDR) {
                msg_set_type(hdr, TIPC_NAMED_MSG);
                msg_set_hdr_sz(hdr, NAMED_H_SIZE);
                msg_set_nametype(hdr, ua->sa.type);
                msg_set_nameinst(hdr, ua->sa.instance);
                msg_set_lookup_scope(hdr, ua->scope);
        } else { /* TIPC_SOCKET_ADDR */
                msg_set_type(hdr, TIPC_DIRECT_MSG);
                msg_set_lookup_scope(hdr, 0);
                msg_set_hdr_sz(hdr, BASIC_H_SIZE);
        }

        /* Add message body */
        __skb_queue_head_init(&pkts);
        mtu = tipc_node_get_mtu(net, skaddr.node, tsk->portid, true);
        rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
        if (unlikely(rc != dlen))
                return rc;
        /* For a SYN, tipc_msg_skb_clone() is given the socket write queue as
         * target; presumably this keeps a copy of the SYN there - confirm
         * against tipc_msg_skb_clone()
         */
        if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue))) {
                __skb_queue_purge(&pkts);
                return -ENOMEM;
        }

        /* Send message */
        trace_tipc_sk_sendmsg(sk, skb_peek(&pkts), TIPC_DUMP_SK_SNDQ, " ");
        rc = tipc_node_xmit(net, &pkts, skaddr.node, tsk->portid);
        /* Link congestion is not reported as an error: remember the congested
         * node so that later sends to it wait, and report success
         */
        if (unlikely(rc == -ELINKCONG)) {
                tipc_dest_push(clinks, skaddr.node, 0);
                tsk->cong_link_cnt++;
                rc = 0;
        }

        if (unlikely(syn && !rc)) {
                tipc_set_sk_state(sk, TIPC_CONNECTING);
                /* NOTE(review): timeout was obtained from sock_sndtimeo()
                 * above; confirm it really is in milliseconds before it is
                 * converted with msecs_to_jiffies()
                 */
                if (dlen && timeout) {
                        timeout = msecs_to_jiffies(timeout);
                        tipc_wait_for_connect(sock, &timeout);
                }
        }

        return rc ? rc : dlen;
}

/**
 * tipc_sendstream - send stream-oriented data
 * @sock: socket structure
 * @m: data to send
 * @dsz: total length of data to be transmitted
 *
 * Used for SOCK_STREAM data.
 *
 * Takes the socket lock around __tipc_sendstream(), which does the actual
 * work.
 *
 * Return: the number of bytes sent on success (or partial success),
 * or errno if no data sent
 */
static int tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz)
{
        struct sock *sk = sock->sk;
        int sent;

        lock_sock(sk);
        sent = __tipc_sendstream(sock, m, dsz);
        release_sock(sk);

        return sent;
}

/**
 * __tipc_sendstream - send stream-oriented data, socket lock held
 * @sock: socket structure
 * @m: data to send
 * @dlen: total length of data to be transmitted
 *
 * Does the work for tipc_sendstream(), which wraps this call in
 * lock_sock()/release_sock().
 *
 * The data is sent in pieces of at most TIPC_MAX_USER_MSG_SIZE bytes. A piece
 * is either appended to the write queue with tipc_msg_append() or built as
 * separate message(s) with tipc_msg_build(); the choice depends on
 * tsk->oneway, tsk->nagle_start and tsk->maxnagle.
 *
 * Return: the number of bytes sent on success (or partial success),
 * or errno if no data sent
 */
static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
{
        struct sock *sk = sock->sk;
        DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
        long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
        struct sk_buff_head *txq = &sk->sk_write_queue;
        struct tipc_sock *tsk = tipc_sk(sk);
        struct tipc_msg *hdr = &tsk->phdr;
        struct net *net = sock_net(sk);
        struct sk_buff *skb;
        u32 dnode = tsk_peer_node(tsk);
        int maxnagle = tsk->maxnagle;
        int maxpkt = tsk->max_pkt;
        int send, sent = 0;
        int blocks, rc = 0;

        /* The byte counters used below are of type int */
        if (unlikely(dlen > INT_MAX))
                return -EMSGSIZE;

        /* Handle implicit connection setup */
        if (unlikely(dest && sk->sk_state == TIPC_OPEN)) {
                rc = __tipc_sendmsg(sock, m, dlen);
                /* All data was sent along with the connection request */
                if (dlen && dlen == rc) {
                        tsk->peer_caps = tipc_node_get_capabilities(net, dnode);
                        tsk->snt_unacked = tsk_inc(tsk, dlen + msg_hdr_sz(hdr));
                }
                return rc;
        }

        do {
                /* Block or return until connected and free of congestion */
                rc = tipc_wait_for_cond(sock, &timeout,
                                        (!tsk->cong_link_cnt &&
                                         !tsk_conn_cong(tsk) &&
                                         tipc_sk_connected(sk)));
                if (unlikely(rc))
                        break;
                send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE);
                blocks = tsk->snd_backlog;
                if (tsk->oneway++ >= tsk->nagle_start && maxnagle &&
                    send <= maxnagle) {
                        rc = tipc_msg_append(hdr, m, send, maxnagle, txq);
                        if (unlikely(rc < 0))
                                break;
                        blocks += rc;
                        tsk->msg_acc++;
                        if (blocks <= 64 && tsk->expect_ack) {
                                /* An ack is still expected: leave the data in
                                 * the write queue and return without sending
                                 */
                                tsk->snd_backlog = blocks;
                                sent += send;
                                break;
                        } else if (blocks > 64) {
                                tsk->pkt_cnt += skb_queue_len(txq);
                        } else {
                                /* No ack pending: request one on the last
                                 * queued buffer and restart the accounting
                                 */
                                skb = skb_peek_tail(txq);
                                if (skb) {
                                        msg_set_ack_required(buf_msg(skb));
                                        tsk->expect_ack = true;
                                } else {
                                        tsk->expect_ack = false;
                                }
                                tsk->msg_acc = 0;
                                tsk->pkt_cnt = 0;
                        }
                } else {
                        rc = tipc_msg_build(hdr, m, sent, send, maxpkt, txq);
                        if (unlikely(rc != send))
                                break;
                        blocks += tsk_inc(tsk, send + MIN_H_SIZE);
                }
                trace_tipc_sk_sendstream(sk, skb_peek(txq),
                                         TIPC_DUMP_SK_SNDQ, " ");
                rc = tipc_node_xmit(net, txq, dnode, tsk->portid);
                /* Link congestion is not an error: flag it, so that the next
                 * round waits, and count this piece as sent
                 */
                if (unlikely(rc == -ELINKCONG)) {
                        tsk->cong_link_cnt = 1;
                        rc = 0;
                }
                if (likely(!rc)) {
                        tsk->snt_unacked += blocks;
                        tsk->snd_backlog = 0;
                        sent += send;
                }
        } while (sent < dlen && !rc);

        /* Partial success is reported as the number of bytes sent */
        return sent ? sent : rc;
}

/**
 * tipc_send_packet - send a connection-oriented message
 * @sock: socket structure
 * @m: message to send
 * @dsz: length of data to be transmitted
 *
 * Used for SOCK_SEQPACKET messages. Messages larger than
 * TIPC_MAX_USER_MSG_SIZE are rejected, everything else is passed on to
 * tipc_sendstream().
 *
 * Return: the number of bytes sent on success, or errno otherwise
 */
static int tipc_send_packet(struct socket *sock, struct msghdr *m, size_t dsz)
{
        if (dsz <= TIPC_MAX_USER_MSG_SIZE)
                return tipc_sendstream(sock, m, dsz);

        return -EMSGSIZE;
}

/* tipc_sk_finish_conn - complete the setup of a connection
 * @tsk: socket being connected
 * @peer_port: port of the peer socket
 * @peer_node: node of the peer socket
 *
 * Prepares the socket's message header for connection traffic, moves the
 * socket to TIPC_ESTABLISHED and reads packet size and capabilities for
 * the peer node.
 */
static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port,
                                u32 peer_node)
{
        struct tipc_msg *hdr = &tsk->phdr;
        struct sock *sk = &tsk->sk;
        struct net *net = sock_net(sk);

        /* Header now addresses the peer with short connection messages */
        msg_set_syn(hdr, 0);
        msg_set_destnode(hdr, peer_node);
        msg_set_destport(hdr, peer_port);
        msg_set_type(hdr, TIPC_CONN_MSG);
        msg_set_lookup_scope(hdr, 0);
        msg_set_hdr_sz(hdr, SHORT_H_SIZE);

        sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV);
        tipc_set_sk_state(sk, TIPC_ESTABLISHED);
        tipc_node_add_conn(net, peer_node, tsk->portid, peer_port);
        tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid, true);
        tsk->peer_caps = tipc_node_get_capabilities(net, peer_node);
        tsk_set_nagle(tsk);
        __skb_queue_purge(&sk->sk_write_queue);

        if (!(tsk->peer_caps & TIPC_BLOCK_FLOWCTL)) {
                /* Fall back to message based flow control */
                tsk->rcv_win = FLOWCTL_MSG_WIN;
                tsk->snd_win = FLOWCTL_MSG_WIN;
        }
}

/**
 * tipc_sk_set_orig_addr - capture sender's address for received message
 * @m: descriptor for message info
 * @skb: received message
 *
 * Note: Address is not captured if not requested by receiver.
 */
static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb)
{
        DECLARE_SOCKADDR(struct sockaddr_pair *, srcaddr, m->msg_name);
        struct tipc_msg *hdr = buf_msg(skb);

        if (!srcaddr)
                return;

        srcaddr->sock.family = AF_TIPC;
        srcaddr->sock.addrtype = TIPC_SOCKET_ADDR;
        srcaddr->sock.scope = 0;
        srcaddr->sock.addr.id.ref = msg_origport(hdr);
        srcaddr->sock.addr.id.node = msg_orignode(hdr);
        srcaddr->sock.addr.name.domain = 0;
        m->msg_namelen = sizeof(struct sockaddr_tipc);

        if (!msg_in_group(hdr))
                return;

        /* Group message users may also want to know sending member's id */
        srcaddr->member.family = AF_TIPC;
        srcaddr->member.addrtype = TIPC_SERVICE_ADDR;
        srcaddr->member.scope = 0;
        srcaddr->member.addr.name.name.type = msg_nametype(hdr);
        srcaddr->member.addr.name.name.instance = TIPC_SKB_CB(skb)->orig_member;
        srcaddr->member.addr.name.domain = 0;
        m->msg_namelen = sizeof(*srcaddr);
}

/**
 * tipc_sk_anc_data_recv - optionally capture ancillary data for received message
 * @m: descriptor for message info
 * @skb: received message buffer
 * @tsk: TIPC port associated with message
 *
 * Up to three objects are passed on with put_cmsg(): TIPC_ERRINFO (error
 * code and data length) and TIPC_RETDATA (the returned data) for an errored
 * message, and TIPC_DESTNAME (type, lower, upper) when a destination
 * service address is available.
 *
 * Note: Ancillary data is not captured if not requested by receiver.
 *
 * Return: 0 if successful, otherwise errno
 */
static int tipc_sk_anc_data_recv(struct msghdr *m, struct sk_buff *skb,
                                 struct tipc_sock *tsk)
{
        struct tipc_msg *hdr;
        u32 data[3] = {0};
        int dlen, rc;

        if (likely(m->msg_controllen == 0))
                return 0;

        hdr = buf_msg(skb);
        dlen = msg_data_sz(hdr);

        /* Capture errored message object, if any */
        if (msg_errcode(hdr)) {
                if (skb_linearize(skb))
                        return -ENOMEM;
                /* Header is looked up again after linearizing */
                hdr = buf_msg(skb);
                data[0] = msg_errcode(hdr);
                data[1] = dlen;
                rc = put_cmsg(m, SOL_TIPC, TIPC_ERRINFO,
                              2 * sizeof(data[0]), data);
                if (rc || !dlen)
                        return rc;
                rc = put_cmsg(m, SOL_TIPC, TIPC_RETDATA, dlen, msg_data(hdr));
                if (rc)
                        return rc;
        }

        /* Capture TIPC_SERVICE_ADDR/RANGE destination address, if any */
        switch (msg_type(hdr)) {
        case TIPC_NAMED_MSG:
                data[0] = msg_nametype(hdr);
                data[1] = msg_namelower(hdr);
                data[2] = data[1];
                break;
        case TIPC_MCAST_MSG:
                data[0] = msg_nametype(hdr);
                data[1] = msg_namelower(hdr);
                data[2] = msg_nameupper(hdr);
                break;
        case TIPC_CONN_MSG:
                /* Only when an address type was recorded for the connection */
                if (!tsk->conn_addrtype)
                        return 0;
                data[0] = msg_nametype(&tsk->phdr);
                data[1] = msg_nameinst(&tsk->phdr);
                data[2] = data[1];
                break;
        default:
                return 0;
        }

        return put_cmsg(m, SOL_TIPC, TIPC_DESTNAME, sizeof(data), data);
}

/* tipc_sk_build_ack - create a CONN_ACK message for the peer socket
 * @tsk: acknowledging socket
 *
 * The message acknowledges tsk->rcv_unacked, which is reset to zero. If the
 * peer has the TIPC_BLOCK_FLOWCTL capability the receive window is
 * recalculated from the socket receive buffer size and advertised as well.
 *
 * Return: the new buffer, or NULL if the socket is not connected or the
 * message could not be created
 */
static struct sk_buff *tipc_sk_build_ack(struct tipc_sock *tsk)
{
        u32 peer_port = tsk_peer_port(tsk);
        u32 peer_node = tsk_peer_node(tsk);
        struct sk_buff *ack;
        struct tipc_msg *hdr;

        if (!tipc_sk_connected(&tsk->sk))
                return NULL;

        ack = tipc_msg_create(CONN_MANAGER, CONN_ACK, INT_H_SIZE, 0,
                              peer_node, tsk_own_node(tsk), peer_port,
                              tsk->portid, TIPC_OK);
        if (!ack)
                return NULL;

        hdr = buf_msg(ack);
        msg_set_conn_ack(hdr, tsk->rcv_unacked);
        tsk->rcv_unacked = 0;

        /* Adjust to and advertize the correct window limit */
        if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) {
                tsk->rcv_win = tsk_adv_blocks(tsk->sk.sk_rcvbuf);
                msg_set_adv_win(hdr, tsk->rcv_win);
        }

        return ack;
}

/* tipc_sk_send_ack - build a connection ack and send it to the peer node
 * @tsk: acknowledging socket
 *
 * Nothing is sent when tipc_sk_build_ack() returns no buffer.
 */
static void tipc_sk_send_ack(struct tipc_sock *tsk)
{
        struct sk_buff *ack = tipc_sk_build_ack(tsk);

        if (ack)
                tipc_node_xmit_skb(sock_net(&tsk->sk), ack,
                                   tsk_peer_node(tsk),
                                   msg_link_selector(buf_msg(ack)));
}

/* tipc_wait_for_rcvmsg - wait until the socket receive queue is non-empty
 * @sock: network socket
 * @timeop: in: longest time to wait; out: time left when returning from loop
 *
 * The caller is expected to hold the socket lock: it is released while
 * sleeping and taken again afterwards.
 *
 * Return: 0 if the receive queue holds a message; -EAGAIN if no time is left;
 * -ENOTCONN if the queue is empty and the socket is shut down for receiving;
 * sock_intr_errno() if a signal is pending; otherwise any pending socket
 * error as returned by sock_error()
 */
static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop)
{
        struct sock *sk = sock->sk;
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        long timeo = *timeop;
        int err = sock_error(sk);

        /* A pending socket error is reported without waiting */
        if (err)
                return err;

        for (;;) {
                if (timeo && skb_queue_empty(&sk->sk_receive_queue)) {
                        if (sk->sk_shutdown & RCV_SHUTDOWN) {
                                err = -ENOTCONN;
                                break;
                        }
                        /* Sleep with the socket lock released */
                        add_wait_queue(sk_sleep(sk), &wait);
                        release_sock(sk);
                        timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
                        sched_annotate_sleep();
                        lock_sock(sk);
                        remove_wait_queue(sk_sleep(sk), &wait);
                }
                /* The checks below are ordered by priority: a queued message
                 * wins over timeout, timeout over signal, signal over error
                 */
                err = 0;
                if (!skb_queue_empty(&sk->sk_receive_queue))
                        break;
                err = -EAGAIN;
                if (!timeo)
                        break;
                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        break;

                err = sock_error(sk);
                if (err)
                        break;
        }
        /* Hand the remaining time back to the caller */
        *timeop = timeo;
        return err;
}

/**
 * tipc_recvmsg - receive packet-oriented message
 * @sock: network socket
 * @m: descriptor for message info
 * @buflen: length of user buffer area
 * @flags: receive flags
 *
 * Used for SOCK_DGRAM, SOCK_RDM, and SOCK_SEQPACKET messages.
 * If the complete message doesn't fit in user area, truncate it, unless
 * MSG_EOR is set in @flags: the message is then kept in the receive queue
 * and the next call continues where this one stopped.
 *
 * Return: size of returned message data, errno otherwise
 */
static int tipc_recvmsg(struct socket *sock, struct msghdr *m,
                        size_t buflen,        int flags)
{
        struct sock *sk = sock->sk;
        bool connected = !tipc_sk_type_connectionless(sk);
        struct tipc_sock *tsk = tipc_sk(sk);
        int rc, err, hlen, dlen, copy;
        struct tipc_skb_cb *skb_cb;
        struct sk_buff_head xmitq;
        struct tipc_msg *hdr;
        struct sk_buff *skb;
        bool grp_evt;
        long timeout;

        /* Catch invalid receive requests */
        if (unlikely(!buflen))
                return -EINVAL;

        lock_sock(sk);
        /* Connection oriented socket on which no connection was set up */
        if (unlikely(connected && sk->sk_state == TIPC_OPEN)) {
                rc = -ENOTCONN;
                goto exit;
        }
        timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

        /* Step rcv queue to first msg with data or error; wait if necessary */
        do {
                rc = tipc_wait_for_rcvmsg(sock, &timeout);
                if (unlikely(rc))
                        goto exit;
                skb = skb_peek(&sk->sk_receive_queue);
                skb_cb = TIPC_SKB_CB(skb);
                hdr = buf_msg(skb);
                dlen = msg_data_sz(hdr);
                hlen = msg_hdr_sz(hdr);
                err = msg_errcode(hdr);
                grp_evt = msg_is_grp_evt(hdr);
                if (likely(dlen || err))
                        break;
                tsk_advance_rx_queue(sk);
        } while (1);

        /* Collect msg meta data, including error code and rejected data */
        tipc_sk_set_orig_addr(m, skb);
        rc = tipc_sk_anc_data_recv(m, skb, tsk);
        if (unlikely(rc))
                goto exit;
        /* tipc_sk_anc_data_recv() may linearize skb: look up header again */
        hdr = buf_msg(skb);

        /* Capture data if non-error msg, otherwise just set return value */
        if (likely(!err)) {
                /* Continue after the part returned by an earlier call */
                int offset = skb_cb->bytes_read;

                copy = min_t(int, dlen - offset, buflen);
                rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy);
                if (unlikely(rc))
                        goto exit;
                if (unlikely(offset + copy < dlen)) {
                        /* User buffer too small for the rest of the message:
                         * with MSG_EOR remember how far we got (not when
                         * peeking), otherwise flag the message as truncated
                         */
                        if (flags & MSG_EOR) {
                                if (!(flags & MSG_PEEK))
                                        skb_cb->bytes_read = offset + copy;
                        } else {
                                m->msg_flags |= MSG_TRUNC;
                                skb_cb->bytes_read = 0;
                        }
                } else {
                        /* Message completely read */
                        if (flags & MSG_EOR)
                                m->msg_flags |= MSG_EOR;
                        skb_cb->bytes_read = 0;
                }
        } else {
                copy = 0;
                rc = 0;
                /* Error other than shutdown on a connection oriented socket,
                 * and no control buffer to report it in
                 */
                if (err != TIPC_CONN_SHUTDOWN && connected && !m->msg_control) {
                        rc = -ECONNRESET;
                        goto exit;
                }
        }

        /* Mark message as group event if applicable */
        if (unlikely(grp_evt)) {
                if (msg_grp_evt(hdr) == TIPC_WITHDRAWN)
                        m->msg_flags |= MSG_EOR;
                m->msg_flags |= MSG_OOB;
                copy = 0;
        }

        /* Capture of data or error code/rejected data was successful */
        if (unlikely(flags & MSG_PEEK))
                goto exit;

        /* Send group flow control advertisement when applicable */
        if (tsk->group && msg_in_group(hdr) && !grp_evt) {
                __skb_queue_head_init(&xmitq);
                tipc_group_update_rcv_win(tsk->group, tsk_blocks(hlen + dlen),
                                          msg_orignode(hdr), msg_origport(hdr),
                                          &xmitq);
                tipc_node_distr_xmit(sock_net(sk), &xmitq);
        }

        /* Message only partly read: leave it at the head of the queue */
        if (skb_cb->bytes_read)
                goto exit;

        tsk_advance_rx_queue(sk);

        if (likely(!connected))
                goto exit;

        /* Send connection flow control advertisement when applicable */
        tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen);
        if (tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE)
                tipc_sk_send_ack(tsk);
exit:
        release_sock(sk);
        return rc ? rc : copy;
}

/**
 * tipc_recvstream - receive stream-oriented data
 * @sock: network socket
 * @m: descriptor for message info
 * @buflen: total size of user buffer area
 * @flags: receive flags
 *
 * Used for SOCK_STREAM messages only.  If not enough data is available
 * will optionally wait for more; never truncates data.
 *
 * Walks the socket receive queue under the socket lock and copies payload
 * into @m until @buflen bytes have been delivered, an errored message is
 * reached, a copy/wait helper fails, or the queue is empty and at least the
 * low-water mark has been copied.  A buffer that is only partly consumed
 * stays at the head of the queue and its read offset is stored in its
 * control block (the offset is not stored when MSG_PEEK is set).
 *
 * Return: number of bytes copied if any were copied; otherwise 0 or a
 * negative errno: -EINVAL for a zero @buflen, -ENOTCONN when the socket is
 * in TIPC_OPEN state, -ECONNRESET for an errored message other than
 * TIPC_CONN_SHUTDOWN when @m carries no control buffer, or the error
 * reported by the wait/ancillary/copy helpers.
 */
static int tipc_recvstream(struct socket *sock, struct msghdr *m,
                           size_t buflen, int flags)
{
        struct sock *sk = sock->sk;
        struct tipc_sock *tsk = tipc_sk(sk);
        struct sk_buff *skb;
        struct tipc_msg *hdr;
        struct tipc_skb_cb *skb_cb;
        bool peek = flags & MSG_PEEK;
        int offset, required, copy, copied = 0;
        int hlen, dlen, err, rc;
        long timeout;

        /* Catch invalid receive attempts */
        if (unlikely(!buflen))
                return -EINVAL;

        lock_sock(sk);

        /* A socket still in TIPC_OPEN state has no stream to read from */
        if (unlikely(sk->sk_state == TIPC_OPEN)) {
                rc = -ENOTCONN;
                goto exit;
        }
        /* Low-water mark for the loop condition and the blocking timeout */
        required = sock_rcvlowat(sk, flags & MSG_WAITALL, buflen);
        timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

        do {
                /* Look at first msg in receive queue; wait if necessary */
                rc = tipc_wait_for_rcvmsg(sock, &timeout);
                if (unlikely(rc))
                        break;
                skb = skb_peek(&sk->sk_receive_queue);
                skb_cb = TIPC_SKB_CB(skb);
                hdr = buf_msg(skb);
                dlen = msg_data_sz(hdr);
                hlen = msg_hdr_sz(hdr);
                err = msg_errcode(hdr);

                /* Discard any empty non-errored (SYN-) message */
                if (unlikely(!dlen && !err)) {
                        tsk_advance_rx_queue(sk);
                        continue;
                }

                /* Collect msg meta data, incl. error code and rejected data */
                if (!copied) {
                        /* Only done for the first message of this call */
                        tipc_sk_set_orig_addr(m, skb);
                        rc = tipc_sk_anc_data_recv(m, skb, tsk);
                        if (rc)
                                break;
                        /* Header pointer is re-read after the helper ran;
                         * presumably the helper may touch the buffer -
                         * TODO confirm
                         */
                        hdr = buf_msg(skb);
                }

                /* Copy data if msg ok, otherwise return error/partial data */
                if (likely(!err)) {
                        /* Resume where a previous partial read stopped */
                        offset = skb_cb->bytes_read;
                        copy = min_t(int, dlen - offset, buflen - copied);
                        rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy);
                        if (unlikely(rc))
                                break;
                        copied += copy;
                        offset += copy;
                        /* User buffer is full but the message is not drained:
                         * leave it queued and remember the offset (unless
                         * peeking)
                         */
                        if (unlikely(offset < dlen)) {
                                if (!peek)
                                        skb_cb->bytes_read = offset;
                                break;
                        }
                } else {
                        rc = 0;
                        if ((err != TIPC_CONN_SHUTDOWN) && !m->msg_control)
                                rc = -ECONNRESET;
                        /* With data already copied the errored message is
                         * left on the queue for the next call
                         */
                        if (copied || rc)
                                break;
                }

                /* Peeking never consumes a message */
                if (unlikely(peek))
                        break;

                tsk_advance_rx_queue(sk);

                /* Send connection flow control advertisement when applicable */
                tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen);
                if (tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE)
                        tipc_sk_send_ack(tsk);

                /* Exit if all requested data or FIN/error received */
                if (copied == buflen || err)
                        break;

        } while (!skb_queue_empty(&sk->sk_receive_queue) || copied < required);
exit:
        release_sock(sk);
        /* Delivered data takes precedence over any pending error code */
        return copied ? copied : rc;
}

/**
 * tipc_write_space - wake up thread if port congestion is released
 * @sk: socket
 *
 * Looks up the socket wait queue under RCU and, if somebody is sleeping on
 * it, issues a synchronous interruptible wake-up flagged as writable
 * (EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND).
 */
static void tipc_write_space(struct sock *sk)
{
        struct socket_wq *waitq;

        rcu_read_lock();
        waitq = rcu_dereference(sk->sk_wq);
        if (!skwq_has_sleeper(waitq))
                goto unlock;

        wake_up_interruptible_sync_poll(&waitq->wait,
                                        EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
unlock:
        rcu_read_unlock();
}

/**
 * tipc_data_ready - wake up threads to indicate messages have been received
 * @sk: socket
 *
 * Emits the sk_data_ready trace event, then looks up the socket wait queue
 * under RCU and, if somebody is sleeping on it, issues a synchronous
 * interruptible wake-up flagged as readable
 * (EPOLLIN | EPOLLRDNORM | EPOLLRDBAND).
 */
static void tipc_data_ready(struct sock *sk)
{
        struct socket_wq *waitq;

        trace_sk_data_ready(sk);

        rcu_read_lock();
        waitq = rcu_dereference(sk->sk_wq);
        if (!skwq_has_sleeper(waitq))
                goto unlock;

        wake_up_interruptible_sync_poll(&waitq->wait,
                                        EPOLLIN | EPOLLRDNORM | EPOLLRDBAND);
unlock:
        rcu_read_unlock();
}

/* Socket destructor: drop every buffer still sitting on the receive queue */
static void tipc_sock_destruct(struct sock *sk)
{
        struct sk_buff_head *rxq = &sk->sk_receive_queue;

        __skb_queue_purge(rxq);
}

/*
 * tipc_sk_proto_rcv - handle one protocol message addressed to a socket
 *
 * Takes the first buffer off @inputq and dispatches on its message user:
 * CONN_MANAGER messages are handed over to the connection protocol handler,
 * SOCK_WAKEUP clears one congested link, GROUP_PROTOCOL and TOP_SRV are
 * passed to the group code.  Any other user is ignored.  Except for the
 * CONN_MANAGER case the buffer is freed here, and writers are woken through
 * sk->sk_write_space() when a handler asked for it.
 */
static void tipc_sk_proto_rcv(struct sock *sk,
                              struct sk_buff_head *inputq,
                              struct sk_buff_head *xmitq)
{
        struct tipc_sock *tsk = tipc_sk(sk);
        struct tipc_group *group = tsk->group;
        struct sk_buff *proto_skb = __skb_dequeue(inputq);
        struct tipc_msg *hdr = buf_msg(proto_skb);
        bool wake_writers = false;

        switch (msg_user(hdr)) {
        case CONN_MANAGER:
                /* No kfree_skb() on this path; presumably the handler
                 * takes over the buffer - TODO confirm
                 */
                tipc_sk_conn_proto_rcv(tsk, proto_skb, inputq, xmitq);
                return;
        case SOCK_WAKEUP:
                tipc_dest_del(&tsk->cong_links, msg_orignode(hdr), 0);
                /* coupled with smp_rmb() in tipc_wait_for_cond() */
                smp_wmb();
                tsk->cong_link_cnt--;
                wake_writers = true;
                tipc_sk_push_backlog(tsk, false);
                break;
        case GROUP_PROTOCOL:
                tipc_group_proto_rcv(group, &wake_writers, hdr, inputq, xmitq);
                break;
        case TOP_SRV:
                tipc_group_member_evt(group, &wake_writers, &sk->sk_rcvbuf,
                                      hdr, inputq, xmitq);
                break;
        default:
                break;
        }

        if (wake_writers)
                sk->sk_write_space(sk);

        kfree_skb(proto_skb);
}

/**
 * tipc_sk_filter_connect - check incoming message for a connection-based socket
 * @tsk: TIPC socket
 * @skb: pointer to message buffer.
 * @xmitq: for Nagle ACK if any
 * Return: true if message should be added to receive queue, false otherwise
 */
static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb,
                                   struct sk_buff_head *xmitq)
{
        struct sock *sk = &tsk->sk;
        struct net *net = sock_net(sk);
        struct tipc_msg *hdr = buf_msg(skb);
        bool con_msg = msg_connected(hdr);
        u32 pport = tsk_peer_port(tsk);
        u32 pnode = tsk_peer_node(tsk);
        u32 oport = msg_origport(hdr);
        u32 onode = msg_orignode(hdr);
        int err = msg_errcode(hdr);
        unsigned long delay;

        if (unlikely(msg_mcast(hdr)))
                return false;
        tsk->oneway = 0;

        switch (sk->sk_state) {
        case TIPC_CONNECTING:
                /* Setup ACK */
                if (likely(con_msg)) {
                        if (err)
                                break;
                        tipc_sk_finish_conn(tsk, oport, onode);
                        msg_set_importance(&tsk->phdr, msg_importance(hdr));
                        /* ACK+ message with data is added to receive queue */
                        if (msg_data_sz(hdr))
                                return true;
                        /* Empty ACK-, - wake up sleeping connect() and drop */
                        sk->sk_state_change(sk);
                        msg_set_dest_droppable(hdr, 1);
                        return false;
                }
                /* Ignore connectionless message if not from listening socket */
                if (oport != pport || onode != pnode)
                        return false;

                /* Rejected SYN */
                if (err != TIPC_ERR_OVERLOAD)
                        break;

                /* Prepare for new setup attempt if we have a SYN clone */
                if (skb_queue_empty(&sk->sk_write_queue))
                        break;
                get_random_bytes(&delay, 2);
                delay %= (tsk->conn_timeout / 4);
                delay = msecs_to_jiffies(delay + 100);
                sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
                return false;
        case TIPC_OPEN:
        case TIPC_DISCONNECTING:
                return false;
        case TIPC_LISTEN:
                /* Accept only SYN message */
                if (!msg_is_syn(hdr) &&
                    tipc_node_get_capabilities(net, onode) & TIPC_SYN_BIT)
                        return false;
                if (!con_msg && !err)
                        return true;
                return false;
        case TIPC_ESTABLISHED:
                if (!skb_queue_empty(&sk->sk_write_queue))
                        tipc_sk_push_backlog(tsk, false);
                /* Accept only connection-based messages sent by peer */
                if (likely(con_msg && !err && pport == oport &&
                           pnode == onode)) {
                        if (msg_ack_required(hdr)) {
                                struct sk_buff *skb;

                                skb = tipc_sk_build_ack(tsk);
                                if (skb) {
                                        msg_set_nagle_ack(buf_msg(skb));
                                        __skb_queue_tail(xmitq, skb);
                                }
                        }
                        return true;
                }
                if (!tsk_peer_msg(tsk, hdr))
                        return false;
                if (!err)
                        return true;
                tipc_set_sk_state(sk, TIPC_DISCONNECTING);
                tipc_node_remove_conn(net, pnode, tsk->portid);
                sk->sk_state_change(sk);
                return true;
        default:
                pr_err("Unknown sk_state %u\n", sk->sk_state);
        }
        /* Abort connection setup attempt */
        tipc_set_sk_state(sk, TIPC_DISCONNECTING);
        sk->sk_err = ECONNREFUSED;
        sk->sk_state_change(sk);
        return true;
}

/**
 * rcvbuf_limit - get proper overload limit of socket receive queue
 * @sk: socket
 * @skb: message
 *
 * The limit is derived from the message header:
 *
 * - group messages: the socket's sk_rcvbuf value;
 * - connectionless messages: sk_rcvbuf shifted left by the message
 *   importance, so each importance step doubles the limit;
 * - connection messages: sk_rcvbuf when the peer advertises
 *   TIPC_BLOCK_FLOWCTL, otherwise the fixed FLOWCTL_MSG_LIM.
 *
 * Return: overload limit according to corresponding message importance
 */
static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb)
{
        struct tipc_msg *hdr = buf_msg(skb);
        struct tipc_sock *tsk = tipc_sk(sk);
        unsigned int limit;

        if (unlikely(msg_in_group(hdr)))
                limit = READ_ONCE(sk->sk_rcvbuf);
        else if (unlikely(!msg_connected(hdr)))
                limit = READ_ONCE(sk->sk_rcvbuf) << msg_importance(hdr);
        else if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL))
                limit = READ_ONCE(sk->sk_rcvbuf);
        else
                limit = FLOWCTL_MSG_LIM;

        return limit;
}

/**
 * tipc_sk_filter_rcv - validate incoming message
 * @sk: socket
 * @skb: pointer to message.
 * @xmitq: queue collecting outgoing messages produced while filtering
 *         (rejected messages, protocol responses); the caller transmits it
 *
 * Enqueues message on receive queue if acceptable; optionally handles
 * disconnect indication for a connected socket.
 *
 * The message is first placed on a private input queue.  Protocol, group
 * and multicast pre-filters may consume it from, or add further messages
 * to, that queue.  Every message left on the queue is then validated: it is
 * rejected with TIPC_ERR_NO_PORT if it does not match the socket type or
 * connection state, rejected with TIPC_ERR_OVERLOAD if it would reach the
 * receive buffer limit, and otherwise appended to the socket receive queue.
 *
 * Called with socket lock already taken
 */
static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb,
                               struct sk_buff_head *xmitq)
{
        bool sk_conn = !tipc_sk_type_connectionless(sk);
        struct tipc_sock *tsk = tipc_sk(sk);
        struct tipc_group *grp = tsk->group;
        struct tipc_msg *hdr = buf_msg(skb);
        struct net *net = sock_net(sk);
        struct sk_buff_head inputq;
        int mtyp = msg_type(hdr);
        int limit, err = TIPC_OK;

        trace_tipc_sk_filter_rcv(sk, skb, TIPC_DUMP_ALL, " ");
        /* A freshly received buffer has not been read from yet */
        TIPC_SKB_CB(skb)->bytes_read = 0;
        __skb_queue_head_init(&inputq);
        __skb_queue_tail(&inputq, skb);

        /* Non-data messages are taken off inputq by the protocol handler */
        if (unlikely(!msg_isdata(hdr)))
                tipc_sk_proto_rcv(sk, &inputq, xmitq);

        if (unlikely(grp))
                tipc_group_filter_msg(grp, &inputq, xmitq);

        if (unlikely(!grp) && mtyp == TIPC_MCAST_MSG)
                tipc_mcast_filter_msg(net, &tsk->mc_method.deferredq, &inputq);

        /* Validate and add to receive buffer if there is space */
        while ((skb = __skb_dequeue(&inputq))) {
                hdr = buf_msg(skb);
                limit = rcvbuf_limit(sk, skb);
                if ((sk_conn && !tipc_sk_filter_connect(tsk, skb, xmitq)) ||
                    (!sk_conn && msg_connected(hdr)) ||
                    (!grp && msg_in_group(hdr)))
                        err = TIPC_ERR_NO_PORT;
                else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) {
                        trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL,
                                           "err_overload2!");
                        atomic_inc(&sk->sk_drops);
                        err = TIPC_ERR_OVERLOAD;
                }

                if (unlikely(err)) {
                        /* Queue the reversed message for return to sender
                         * when tipc_msg_reverse() reports success
                         */
                        if (tipc_msg_reverse(tipc_own_addr(net), &skb, err)) {
                                trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_NONE,
                                                      "@filter_rcv!");
                                __skb_queue_tail(xmitq, skb);
                        }
                        /* Reset so the verdict does not leak into the next
                         * message
                         */
                        err = TIPC_OK;
                        continue;
                }
                __skb_queue_tail(&sk->sk_receive_queue, skb);
                skb_set_owner_r(skb, sk);
                trace_tipc_sk_overlimit2(sk, skb, TIPC_DUMP_ALL,
                                         "rcvq >90% allocated!");
                sk->sk_data_ready(sk);
        }
}

/**
 * tipc_sk_backlog_rcv - handle incoming message from backlog queue
 * @sk: socket
 * @skb: message
 *
 * Caller must hold socket lock
 */
static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
        unsigned int before = sk_rmem_alloc_get(sk);
        struct sk_buff_head xmitq;
        unsigned int added;

        __skb_queue_head_init(&xmitq);

        tipc_sk_filter_rcv(sk, skb, &xmitq);
        added = sk_rmem_alloc_get(sk) - before;
        atomic_add(added, &tipc_sk(sk)->dupl_rcvcnt);

        /* Send pending response/rejected messages, if any */
        tipc_node_distr_xmit(sock_net(sk), &xmitq);
        return 0;
}

/**
 * tipc_sk_enqueue - extract all buffers with destination 'dport' from
 *                   inputq and try adding them to socket or backlog queue
 * @inputq: list of incoming buffers with potentially different destinations
 * @sk: socket where the buffers should be enqueued
 * @dport: port number for the socket
 * @xmitq: output queue
 *
 * Processing stops when no more buffers for @dport are found, when the
 * 20000 usec time budget taken at entry has expired, or after the first
 * buffer that had to be rejected because the backlog was full.
 *
 * Caller must hold socket lock (tipc_sk_rcv() calls this with
 * sk->sk_lock.slock held)
 */
static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
                            u32 dport, struct sk_buff_head *xmitq)
{
        unsigned long time_limit = jiffies + usecs_to_jiffies(20000);
        struct sk_buff *skb;
        unsigned int lim;
        atomic_t *dcnt;
        u32 onode;

        while (skb_queue_len(inputq)) {
                /* Bound the time spent here on behalf of one socket */
                if (unlikely(time_after_eq(jiffies, time_limit)))
                        return;

                skb = tipc_skb_dequeue(inputq, dport);
                if (unlikely(!skb))
                        return;

                /* Add message directly to receive queue if possible */
                if (!sock_owned_by_user(sk)) {
                        tipc_sk_filter_rcv(sk, skb, xmitq);
                        continue;
                }

                /* Try backlog, compensating for double-counted bytes */
                dcnt = &tipc_sk(sk)->dupl_rcvcnt;
                /* An empty backlog means nothing is double counted anymore */
                if (!sk->sk_backlog.len)
                        atomic_set(dcnt, 0);
                lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt);
                if (likely(!sk_add_backlog(sk, skb, lim))) {
                        trace_tipc_sk_overlimit1(sk, skb, TIPC_DUMP_ALL,
                                                 "bklg & rcvq >90% allocated!");
                        continue;
                }

                trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload!");
                /* Overload => reject message back to sender */
                onode = tipc_own_addr(sock_net(sk));
                atomic_inc(&sk->sk_drops);
                if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) {
                        trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_ALL,
                                              "@sk_enqueue!");
                        __skb_queue_tail(xmitq, skb);
                }
                /* Stop after the first overload rejection */
                break;
        }
}

/**
 * tipc_sk_rcv - handle a chain of incoming buffers
 * @net: the associated network namespace
 * @inputq: buffer list containing the buffers
 *
 * Consumes all buffers in list until inputq is empty
 * Note: may be called in multiple threads referring to the same queue
 *
 * For each destination port found in @inputq the matching socket is looked
 * up and, if its spinlock can be taken without waiting, the buffers for that
 * port are handed to tipc_sk_enqueue().  Buffers without a destination
 * socket get a secondary name lookup and are either forwarded or rejected
 * back to the sender.
 */
void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
{
        struct sk_buff_head xmitq;
        u32 dnode, dport = 0;
        int err;
        struct tipc_sock *tsk;
        struct sock *sk;
        struct sk_buff *skb;

        __skb_queue_head_init(&xmitq);
        while (skb_queue_len(inputq)) {
                /* Previous dport is passed back in as the second argument */
                dport = tipc_skb_peek_port(inputq, dport);
                tsk = tipc_sk_lookup(net, dport);

                if (likely(tsk)) {
                        sk = &tsk->sk;
                        /* If the lock is contended nothing is dequeued in
                         * this iteration; the loop just examines the queue
                         * again
                         */
                        if (likely(spin_trylock_bh(&sk->sk_lock.slock))) {
                                tipc_sk_enqueue(inputq, sk, dport, &xmitq);
                                spin_unlock_bh(&sk->sk_lock.slock);
                        }
                        /* Send pending response/rejected messages, if any */
                        tipc_node_distr_xmit(sock_net(sk), &xmitq);
                        /* Pairs with the reference presumably taken by
                         * tipc_sk_lookup() - TODO confirm
                         */
                        sock_put(sk);
                        continue;
                }
                /* No destination socket => dequeue skb if still there */
                skb = tipc_skb_dequeue(inputq, dport);
                if (!skb)
                        return;

                /* Try secondary lookup if unresolved named message */
                err = TIPC_ERR_NO_PORT;
                if (tipc_msg_lookup_dest(net, skb, &err))
                        goto xmit;

                /* Prepare for message rejection */
                if (!tipc_msg_reverse(tipc_own_addr(net), &skb, err))
                        continue;

                trace_tipc_sk_rej_msg(NULL, skb, TIPC_DUMP_NONE, "@sk_rcv!");
xmit:
                /* Reached with either a re-resolved or a reversed message */
                dnode = msg_destnode(buf_msg(skb));
                tipc_node_xmit_skb(net, skb, dnode, dport);
        }
}

/*
 * tipc_wait_for_connect - sleep until a connection attempt is resolved
 * @sock: socket being connected
 * @timeo_p: remaining timeout; updated by sk_wait_event()
 *
 * Each round first checks, in this order, for a pending socket error, an
 * exhausted timeout and a pending signal, then stops if the socket has
 * entered TIPC_DISCONNECTING, and otherwise waits for tipc_sk_connected()
 * to become true.
 *
 * Return: 0 when the socket got connected or is in TIPC_DISCONNECTING
 * state, the socket error if one is pending, -ETIMEDOUT when the timeout
 * has run out, or the value of sock_intr_errno() when a signal is pending
 */
static int tipc_wait_for_connect(struct socket *sock, long *timeo_p)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        struct sock *sk = sock->sk;
        int done;

        do {
                int err = sock_error(sk);
                if (err)
                        return err;
                if (!*timeo_p)
                        return -ETIMEDOUT;
                if (signal_pending(current))
                        return sock_intr_errno(*timeo_p);
                /* Caller still gets 0 here, not an error code */
                if (sk->sk_state == TIPC_DISCONNECTING)
                        break;

                add_wait_queue(sk_sleep(sk), &wait);
                done = sk_wait_event(sk, timeo_p, tipc_sk_connected(sk),
                                     &wait);
                remove_wait_queue(sk_sleep(sk), &wait);
        } while (!done);
        return 0;
}

static bool tipc_sockaddr_is_sane(struct sockaddr_tipc *addr)
{
        if (addr->family != AF_TIPC)
                return false;
        if (addr->addrtype == TIPC_SERVICE_RANGE)
                return (addr->addr.nameseq.lower <= addr->addr.nameseq.upper);
        return (addr->addrtype == TIPC_SERVICE_ADDR ||
                addr->addrtype == TIPC_SOCKET_ADDR);
}

/**
 * tipc_connect - establish a connection to another TIPC port
 * @sock: socket structure
 * @dest: socket address for destination port
 * @destlen: size of socket address data structure
 * @flags: file-related flags associated with socket
 *
 * For connectionless sockets the destination is only stored as the default
 * peer (an AF_UNSPEC address clears it).  For connection-oriented sockets
 * in TIPC_OPEN state an empty SYN is sent and, unless O_NONBLOCK is set,
 * the call waits up to the socket's connect timeout for the outcome.
 *
 * Return: 0 on success, errno otherwise.  Visible error cases:
 * -EINVAL (bad @destlen, socket is a group member, insane address,
 * AF_UNSPEC or a service range on a connection-oriented socket, or an
 * unexpected socket state), -EINPROGRESS (non-blocking connect just
 * started), -EALREADY (non-blocking connect already under way), -EISCONN
 * (already established), or whatever the send/wait helpers report.
 */
static int tipc_connect(struct socket *sock, struct sockaddr *dest,
                        int destlen, int flags)
{
        struct sock *sk = sock->sk;
        struct tipc_sock *tsk = tipc_sk(sk);
        struct sockaddr_tipc *dst = (struct sockaddr_tipc *)dest;
        struct msghdr m = {NULL,};
        /* Zero means "do not wait"; otherwise milliseconds, converted to
         * jiffies just before waiting
         */
        long timeout = (flags & O_NONBLOCK) ? 0 : tsk->conn_timeout;
        int previous;
        int res = 0;

        if (destlen != sizeof(struct sockaddr_tipc))
                return -EINVAL;

        lock_sock(sk);

        if (tsk->group) {
                res = -EINVAL;
                goto exit;
        }

        if (dst->family == AF_UNSPEC) {
                /* The stored peer is cleared for every socket type, also
                 * when the call then fails for a connection-oriented one
                 */
                memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc));
                if (!tipc_sk_type_connectionless(sk))
                        res = -EINVAL;
                goto exit;
        }
        if (!tipc_sockaddr_is_sane(dst)) {
                res = -EINVAL;
                goto exit;
        }
        /* DGRAM/RDM connect(), just save the destaddr */
        if (tipc_sk_type_connectionless(sk)) {
                memcpy(&tsk->peer, dest, destlen);
                goto exit;
        } else if (dst->addrtype == TIPC_SERVICE_RANGE) {
                res = -EINVAL;
                goto exit;
        }

        /* Remember the state at entry: it selects between -EINPROGRESS and
         * -EALREADY in the non-blocking case
         */
        previous = sk->sk_state;

        switch (sk->sk_state) {
        case TIPC_OPEN:
                /* Send a 'SYN-' to destination */
                m.msg_name = dest;
                m.msg_namelen = destlen;
                iov_iter_kvec(&m.msg_iter, ITER_SOURCE, NULL, 0, 0);

                /* If connect is in non-blocking case, set MSG_DONTWAIT to
                 * indicate send_msg() is never blocked.
                 */
                if (!timeout)
                        m.msg_flags = MSG_DONTWAIT;

                res = __tipc_sendmsg(sock, &m, 0);
                if ((res < 0) && (res != -EWOULDBLOCK))
                        goto exit;

                /* Just entered TIPC_CONNECTING state; the only
                 * difference is that return value in non-blocking
                 * case is EINPROGRESS, rather than EALREADY.
                 */
                res = -EINPROGRESS;
                fallthrough;
        case TIPC_CONNECTING:
                if (!timeout) {
                        if (previous == TIPC_CONNECTING)
                                res = -EALREADY;
                        goto exit;
                }
                timeout = msecs_to_jiffies(timeout);
                /* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */
                res = tipc_wait_for_connect(sock, &timeout);
                break;
        case TIPC_ESTABLISHED:
                res = -EISCONN;
                break;
        default:
                res = -EINVAL;
        }

exit:
        release_sock(sk);
        return res;
}

/**
 * tipc_listen - allow socket to listen for incoming connections
 * @sock: socket structure
 * @len: (unused)
 *
 * Requests the transition to TIPC_LISTEN under the socket lock and passes
 * the verdict of tipc_set_sk_state() back to the caller.
 *
 * Return: 0 on success, errno otherwise
 */
static int tipc_listen(struct socket *sock, int len)
{
        int rc;

        lock_sock(sock->sk);
        rc = tipc_set_sk_state(sock->sk, TIPC_LISTEN);
        release_sock(sock->sk);

        return rc;
}

/*
 * tipc_wait_for_accept - wait until the listening socket has a queued message
 * @sock: listening socket; its lock is held on entry and on return
 * @timeo: maximum time to wait, 0 for no waiting at all
 *
 * Return: 0 when the receive queue is non-empty, -EAGAIN when the timeout
 * is (or has become) zero, or the value of sock_intr_errno() when a signal
 * is pending
 */
static int tipc_wait_for_accept(struct socket *sock, long timeo)
{
        struct sock *sk = sock->sk;
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        int err;

        /* True wake-one mechanism for incoming connections: only
         * one process gets woken up, not the 'whole herd'.
         * Since we do not 'race & poll' for established sockets
         * anymore, the common case will execute the loop only once.
         *
         * NOTE(review): the waiter is registered with add_wait_queue(),
         * not an exclusive variant - confirm the wake-one statement above
         * still holds.
         */
        for (;;) {
                if (timeo && skb_queue_empty(&sk->sk_receive_queue)) {
                        add_wait_queue(sk_sleep(sk), &wait);
                        /* The socket lock is dropped while sleeping and
                         * taken again before the queue is re-checked
                         */
                        release_sock(sk);
                        timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
                        lock_sock(sk);
                        remove_wait_queue(sk_sleep(sk), &wait);
                }
                /* Exit checks in priority order: data, timeout, signal */
                err = 0;
                if (!skb_queue_empty(&sk->sk_receive_queue))
                        break;
                err = -EAGAIN;
                if (!timeo)
                        break;
                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        break;
        }
        return err;
}

/**
 * tipc_accept - wait for connection request
 * @sock: listening socket
 * @new_sock: new socket that is to be connected
 * @arg: arguments for accept
 *
 * Waits for a message on the listening socket's receive queue, creates the
 * new socket, connects it to the originator of that message and answers
 * with an empty stream message.  A SYN carrying data is moved to the new
 * socket's receive queue; an empty SYN is discarded.
 *
 * Return: 0 on success, errno otherwise (-EINVAL if @sock is not in
 * TIPC_LISTEN state, or the error from waiting or from socket creation)
 */
static int tipc_accept(struct socket *sock, struct socket *new_sock,
                       struct proto_accept_arg *arg)
{
        struct sock *new_sk, *sk = sock->sk;
        struct tipc_sock *new_tsock;
        struct msghdr m = {NULL,};
        struct tipc_msg *msg;
        struct sk_buff *buf;
        long timeo;
        int res;

        lock_sock(sk);

        if (sk->sk_state != TIPC_LISTEN) {
                res = -EINVAL;
                goto exit;
        }
        timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
        res = tipc_wait_for_accept(sock, timeo);
        if (res)
                goto exit;

        /* Only peeked: the request stays queued if socket creation fails */
        buf = skb_peek(&sk->sk_receive_queue);

        res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, arg->kern);
        if (res)
                goto exit;
        security_sk_clone(sock->sk, new_sock->sk);

        new_sk = new_sock->sk;
        new_tsock = tipc_sk(new_sk);
        msg = buf_msg(buf);

        /* we lock on new_sk; but lockdep sees the lock on sk */
        lock_sock_nested(new_sk, SINGLE_DEPTH_NESTING);

        /*
         * Reject any stray messages received by new socket
         * before the socket lock was taken (very, very unlikely)
         */
        tsk_rej_rx_queue(new_sk, TIPC_ERR_NO_PORT);

        /* Connect new socket to its peer */
        tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg));

        /* Inherit importance and, for named requests, the service address */
        tsk_set_importance(new_sk, msg_importance(msg));
        if (msg_named(msg)) {
                new_tsock->conn_addrtype = TIPC_SERVICE_ADDR;
                msg_set_nametype(&new_tsock->phdr, msg_nametype(msg));
                msg_set_nameinst(&new_tsock->phdr, msg_nameinst(msg));
        }

        /*
         * Respond to 'SYN-' by discarding it & returning 'ACK'.
         * Respond to 'SYN+' by queuing it on new socket & returning 'ACK'.
         */
        if (!msg_data_sz(msg)) {
                tsk_advance_rx_queue(sk);
        } else {
                __skb_dequeue(&sk->sk_receive_queue);
                __skb_queue_head(&new_sk->sk_receive_queue, buf);
                skb_set_owner_r(buf, new_sk);
        }
        /* Zero-length send on the new socket; its result is not checked */
        iov_iter_kvec(&m.msg_iter, ITER_SOURCE, NULL, 0, 0);
        __tipc_sendstream(new_sock, &m, 0);
        release_sock(new_sk);
exit:
        release_sock(sk);
        return res;
}

/**
 * tipc_shutdown - shutdown socket connection
 * @sock: socket structure
 * @how: direction to close (must be SHUT_RDWR)
 *
 * Terminates connection (if necessary), then purges socket's receive queue.
 * The receive queue is purged only when the socket ends up in
 * TIPC_DISCONNECTING state; sleepers are woken in either case.
 *
 * Return: 0 on success, -EINVAL if @how is not SHUT_RDWR, -ENOTCONN if the
 * socket is not in TIPC_DISCONNECTING state after the shutdown request
 */
static int tipc_shutdown(struct socket *sock, int how)
{
        struct sock *sk = sock->sk;
        int res = -ENOTCONN;

        if (how != SHUT_RDWR)
                return -EINVAL;

        lock_sock(sk);

        trace_tipc_sk_shutdown(sk, NULL, TIPC_DUMP_ALL, " ");
        __tipc_shutdown(sock, TIPC_CONN_SHUTDOWN);
        sk->sk_shutdown = SHUTDOWN_MASK;

        if (sk->sk_state == TIPC_DISCONNECTING) {
                /* Discard any unreceived messages */
                __skb_queue_purge(&sk->sk_receive_queue);
                res = 0;
        }

        /* Wake up anyone sleeping in poll. */
        sk->sk_state_change(sk);

        release_sock(sk);
        return res;
}

/*
 * tipc_sk_check_probing_state - connection supervision step for a socket
 * @sk: socket whose timer fired
 * @list: queue receiving the new probe message, if one could be created
 *
 * If the previous probe is still unacknowledged the connection is aborted
 * (state TIPC_DISCONNECTING, sk_err ECONNABORTED).  Otherwise a new
 * CONN_PROBE message is queued on @list, the probe is marked as
 * unacknowledged and the timer is re-armed CONN_PROBING_INTV from now.
 */
static void tipc_sk_check_probing_state(struct sock *sk,
                                        struct sk_buff_head *list)
{
        struct tipc_sock *tsk = tipc_sk(sk);
        u32 peer_node = tsk_peer_node(tsk);
        u32 peer_port = tsk_peer_port(tsk);
        struct sk_buff *probe;

        if (tsk->probe_unacked) {
                /* Peer never answered the last probe: give up */
                tipc_set_sk_state(sk, TIPC_DISCONNECTING);
                sk->sk_err = ECONNABORTED;
                /* NOTE(review): the peer port is passed here, whereas
                 * tipc_sk_filter_connect() passes tsk->portid to the same
                 * helper - confirm which port the helper expects.
                 */
                tipc_node_remove_conn(sock_net(sk), peer_node, peer_port);
                sk->sk_state_change(sk);
                return;
        }

        /* Prepare new probe */
        probe = tipc_msg_create(CONN_MANAGER, CONN_PROBE, INT_H_SIZE, 0,
                                peer_node, tsk_own_node(tsk), peer_port,
                                tsk->portid, TIPC_OK);
        if (probe)
                __skb_queue_tail(list, probe);

        /* Marked and re-armed even if no probe buffer could be created */
        tsk->probe_unacked = true;
        sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV);
}

/*
 * tipc_sk_retry_connect - prepare a SYN retransmission for a connecting socket
 * @sk: socket in connecting state
 * @list: queue receiving the clone of the pending write queue content
 *
 * While a link is marked congested nothing is sent; the timer is re-armed
 * for 100 ms instead.
 */
static void tipc_sk_retry_connect(struct sock *sk, struct sk_buff_head *list)
{
        struct tipc_sock *tsk = tipc_sk(sk);

        if (!tsk->cong_link_cnt) {
                /* Prepare SYN for retransmit */
                tipc_msg_skb_clone(&sk->sk_write_queue, list);
                return;
        }

        /* Try again later if dest link is congested */
        sk_reset_timer(sk, &sk->sk_timer, jiffies + msecs_to_jiffies(100));
}

static void tipc_sk_timeout(struct timer_list *t)
{
        struct sock *sk = from_timer(sk, t, sk_timer);
        struct tipc_sock *tsk = tipc_sk(sk);
        u32 pnode = tsk_peer_node(tsk);
        struct sk_buff_head list;
        int rc = 0;

        __skb_queue_head_init(&list);
        bh_lock_sock(sk);

        /* Try again later if socket is busy */
        if (sock_owned_by_user(sk)) {
                sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20);
                bh_unlock_sock(sk);
                sock_put(sk);
                return;
        }

        if (sk->sk_state == TIPC_ESTABLISHED)
                tipc_sk_check_probing_state(sk, &list);
        else if (sk->sk_state == TIPC_CONNECTING)
                tipc_sk_retry_connect(sk, &list);

        bh_unlock_sock(sk);

        if (!skb_queue_empty(&list))
                rc = tipc_node_xmit(sock_net(sk), &list, pnode, tsk->portid);

        /* SYN messages may cause link congestion */
        if (rc == -ELINKCONG) {
                tipc_dest_push(&tsk->cong_links, pnode, 0);
                tsk->cong_link_cnt = 1;
        }
        sock_put(sk);
}

static int tipc_sk_publish(struct tipc_sock *tsk, struct tipc_uaddr *ua)
{
        struct sock *sk = &tsk->sk;
        struct net *net = sock_net(sk);
        struct tipc_socket_addr skaddr;
        struct publication *p;
        u32 key;

        if (tipc_sk_connected(sk))
                return -EINVAL;
        key = tsk->portid + tsk->pub_count + 1;
        if (key == tsk->portid)
                return -EADDRINUSE;
        skaddr.ref = tsk->portid;
        skaddr.node = tipc_own_addr(net);
        p = tipc_nametbl_publish(net, ua, &skaddr, key);
        if (unlikely(!p))
                return -EINVAL;

        list_add(&p->binding_sock, &tsk->publications);
        tsk->pub_count++;
        tsk->published = true;
        return 0;
}

static int tipc_sk_withdraw(struct tipc_sock *tsk, struct tipc_uaddr *ua)
{
        struct net *net = sock_net(&tsk->sk);
        struct publication *safe, *p;
        struct tipc_uaddr _ua;
        int rc = -EINVAL;

        list_for_each_entry_safe(p, safe, &tsk->publications, binding_sock) {
                if (!ua) {
                        tipc_uaddr(&_ua, TIPC_SERVICE_RANGE, p->scope,
                                   p->sr.type, p->sr.lower, p->sr.upper);
                        tipc_nametbl_withdraw(net, &_ua, &p->sk, p->key);
                        continue;
                }
                /* Unbind specific publication */
                if (p->scope != ua->scope)
                        continue;
                if (p->sr.type != ua->sr.type)
                        continue;
                if (p->sr.lower != ua->sr.lower)
                        continue;
                if (p->sr.upper != ua->sr.upper)
                        break;
                tipc_nametbl_withdraw(net, ua, &p->sk, p->key);
                rc = 0;
                break;
        }
        if (list_empty(&tsk->publications)) {
                tsk->published = 0;
                rc = 0;
        }
        return rc;
}

/* tipc_sk_reinit: set non-zero address in all existing sockets
 *                 when we go from standalone to network mode.
 * @net: network namespace whose socket hash table is walked
 *
 * For every socket in tn->sk_rht, the "previous node" and "originating node"
 * fields of the socket's preformatted header (tsk->phdr) are set to the
 * current own address, under the socket lock.
 */
void tipc_sk_reinit(struct net *net)
{
        struct tipc_net *tn = net_generic(net, tipc_net_id);
        struct rhashtable_iter iter;
        struct tipc_sock *tsk;
        struct tipc_msg *msg;

        rhashtable_walk_enter(&tn->sk_rht, &iter);

        do {
                rhashtable_walk_start(&iter);

                while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) {
                        /* Pin the socket, then suspend the walk before
                         * taking the socket lock (lock_sock() presumably may
                         * sleep, which the walk section would not permit).
                         */
                        sock_hold(&tsk->sk);
                        rhashtable_walk_stop(&iter);
                        lock_sock(&tsk->sk);
                        msg = &tsk->phdr;
                        msg_set_prevnode(msg, tipc_own_addr(net));
                        msg_set_orignode(msg, tipc_own_addr(net));
                        release_sock(&tsk->sk);
                        /* Resume the walk before dropping our reference */
                        rhashtable_walk_start(&iter);
                        sock_put(&tsk->sk);
                }

                rhashtable_walk_stop(&iter);
                /* Start over if the iterator reported -EAGAIN */
        } while (tsk == ERR_PTR(-EAGAIN));

        rhashtable_walk_exit(&iter);
}

/* tipc_sk_lookup - find a socket by port id
 * @net: network namespace to search
 * @portid: port id used as the hash key
 *
 * Return: the socket with an extra reference held (the caller must release
 * it with sock_put()), or NULL if no socket has that port id
 */
static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid)
{
        struct tipc_net *tn = net_generic(net, tipc_net_id);
        struct tipc_sock *found;

        rcu_read_lock();
        found = rhashtable_lookup(&tn->sk_rht, &portid, tsk_rht_params);
        /* Take the reference while still inside the RCU read section */
        if (found)
                sock_hold(&found->sk);
        rcu_read_unlock();

        return found;
}

/* tipc_sk_insert - give the socket a free port id and hash it
 * @tsk: socket to insert
 *
 * Starts from a random port id and probes successive ids, wrapping from
 * TIPC_MAX_PORT back to TIPC_MIN_PORT, until an insertion succeeds or every
 * id in the range has been tried. On success the hash table keeps the socket
 * reference taken just before the insertion.
 *
 * Return: 0 on success, -1 if no port id could be inserted
 */
static int tipc_sk_insert(struct tipc_sock *tsk)
{
        struct sock *sk = &tsk->sk;
        struct net *net = sock_net(sk);
        struct tipc_net *tn = net_generic(net, tipc_net_id);
        u32 span = (TIPC_MAX_PORT - TIPC_MIN_PORT) + 1;
        u32 candidate = get_random_u32_below(span) + TIPC_MIN_PORT;
        u32 attempts;

        for (attempts = span; attempts; attempts--) {
                candidate++;
                if (candidate < TIPC_MIN_PORT || candidate > TIPC_MAX_PORT)
                        candidate = TIPC_MIN_PORT;

                tsk->portid = candidate;

                /* Reference owned by the table; dropped again on failure */
                sock_hold(&tsk->sk);
                if (!rhashtable_lookup_insert_fast(&tn->sk_rht, &tsk->node,
                                                   tsk_rht_params))
                        return 0;
                sock_put(&tsk->sk);
        }

        return -1;
}

/* tipc_sk_remove - unhash a socket
 * @tsk: socket to remove from the namespace's socket hash table
 *
 * If the socket was found and removed, the reference taken for the table in
 * tipc_sk_insert() is released. A warning is raised if that reference turns
 * out to be the only one left.
 */
static void tipc_sk_remove(struct tipc_sock *tsk)
{
        struct sock *sk = &tsk->sk;
        struct tipc_net *tn = net_generic(sock_net(sk), tipc_net_id);

        /* Non-zero result: nothing was removed, so nothing to release */
        if (rhashtable_remove_fast(&tn->sk_rht, &tsk->node, tsk_rht_params))
                return;

        WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
        __sock_put(sk);
}

/* Parameters of the per-namespace socket hash table (tn->sk_rht).
 * Entries are struct tipc_sock objects, linked through their 'node' member
 * and keyed by their 32-bit 'portid' member.
 */
static const struct rhashtable_params tsk_rht_params = {
        .nelem_hint = 192,
        .head_offset = offsetof(struct tipc_sock, node),
        .key_offset = offsetof(struct tipc_sock, portid),
        .key_len = sizeof(u32), /* portid */
        .max_size = 1048576,
        .min_size = 256,
        .automatic_shrinking = true,
};

/* tipc_sk_rht_init - set up the socket hash table of a namespace
 * @net: network namespace being initialized
 *
 * Return: the result of rhashtable_init()
 */
int tipc_sk_rht_init(struct net *net)
{
        struct tipc_net *tn;
        int err;

        tn = net_generic(net, tipc_net_id);
        err = rhashtable_init(&tn->sk_rht, &tsk_rht_params);

        return err;
}

/* tipc_sk_rht_destroy - tear down the socket hash table of a namespace
 * @net: network namespace being torn down
 */
void tipc_sk_rht_destroy(struct net *net)
{
        struct tipc_net *tn;

        tn = net_generic(net, tipc_net_id);

        /* Let in-flight readers of the table finish before destroying it */
        synchronize_net();
        rhashtable_destroy(&tn->sk_rht);
}

static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq)
{
        struct net *net = sock_net(&tsk->sk);
        struct tipc_group *grp = tsk->group;
        struct tipc_msg *hdr = &tsk->phdr;
        struct tipc_uaddr ua;
        int rc;

        if (mreq->type < TIPC_RESERVED_TYPES)
                return -EACCES;
        if (mreq->scope > TIPC_NODE_SCOPE)
                return -EINVAL;
        if (mreq->scope != TIPC_NODE_SCOPE)
                mreq->scope = TIPC_CLUSTER_SCOPE;
        if (grp)
                return -EACCES;
        grp = tipc_group_create(net, tsk->portid, mreq, &tsk->group_is_open);
        if (!grp)
                return -ENOMEM;
        tsk->group = grp;
        msg_set_lookup_scope(hdr, mreq->scope);
        msg_set_nametype(hdr, mreq->type);
        msg_set_dest_droppable(hdr, true);
        tipc_uaddr(&ua, TIPC_SERVICE_RANGE, mreq->scope,
                   mreq->type, mreq->instance, mreq->instance);
        tipc_nametbl_build_group(net, grp, &ua);
        rc = tipc_sk_publish(tsk, &ua);
        if (rc) {
                tipc_group_delete(net, grp);
                tsk->group = NULL;
                return rc;
        }
        /* Eliminate any risk that a broadcast overtakes sent JOINs */
        tsk->mc_method.rcast = true;
        tsk->mc_method.mandatory = true;
        tipc_group_join(net, grp, &tsk->sk.sk_rcvbuf);
        return rc;
}

static int tipc_sk_leave(struct tipc_sock *tsk)
{
        struct net *net = sock_net(&tsk->sk);
        struct tipc_group *grp = tsk->group;
        struct tipc_uaddr ua;
        int scope;

        if (!grp)
                return -EINVAL;
        ua.addrtype = TIPC_SERVICE_RANGE;
        tipc_group_self(grp, &ua.sr, &scope);
        ua.scope = scope;
        tipc_group_delete(net, grp);
        tsk->group = NULL;
        tipc_sk_withdraw(tsk, &ua);
        return 0;
}

/**
 * tipc_setsockopt - set socket option
 * @sock: socket structure
 * @lvl: option level
 * @opt: option identifier
 * @ov: pointer to new option value
 * @ol: length of option value
 *
 * For stream sockets only, accepts and ignores all IPPROTO_TCP options
 * (to ease compatibility).
 *
 * The option argument is copied in before the socket lock is taken; the
 * option itself is applied with the lock held.
 *
 * Return: 0 on success, errno otherwise
 */
static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
                           sockptr_t ov, unsigned int ol)
{
        struct sock *sk = sock->sk;
        struct tipc_sock *tsk = tipc_sk(sk);
        struct tipc_group_req grp_req;
        u32 val = 0;
        int rc = 0;

        if (lvl == IPPROTO_TCP && sock->type == SOCK_STREAM)
                return 0;
        if (lvl != SOL_TIPC)
                return -ENOPROTOOPT;

        /* Step 1: fetch and size-check the argument */
        switch (opt) {
        case TIPC_GROUP_JOIN:
                if (ol < sizeof(grp_req))
                        return -EINVAL;
                if (copy_from_sockptr(&grp_req, ov, sizeof(grp_req)))
                        return -EFAULT;
                break;
        case TIPC_IMPORTANCE:
        case TIPC_SRC_DROPPABLE:
        case TIPC_DEST_DROPPABLE:
        case TIPC_CONN_TIMEOUT:
        case TIPC_NODELAY:
                if (ol < sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, ov, sizeof(u32)))
                        return -EFAULT;
                break;
        default:
                /* Every other option must come without an argument */
                if (!sockptr_is_null(ov) || ol)
                        return -EINVAL;
        }

        /* Step 2: apply the option under the socket lock */
        lock_sock(sk);

        switch (opt) {
        case TIPC_IMPORTANCE:
                rc = tsk_set_importance(sk, val);
                break;
        case TIPC_SRC_DROPPABLE:
                /* Not available on stream sockets */
                if (sock->type == SOCK_STREAM)
                        rc = -ENOPROTOOPT;
                else
                        tsk_set_unreliable(tsk, val);
                break;
        case TIPC_DEST_DROPPABLE:
                tsk_set_unreturnable(tsk, val);
                break;
        case TIPC_CONN_TIMEOUT:
                tsk->conn_timeout = val;
                break;
        case TIPC_NODELAY:
                tsk->nodelay = !!val;
                tsk_set_nagle(tsk);
                break;
        case TIPC_MCAST_BROADCAST:
                tsk->mc_method.rcast = false;
                tsk->mc_method.mandatory = true;
                break;
        case TIPC_MCAST_REPLICAST:
                tsk->mc_method.rcast = true;
                tsk->mc_method.mandatory = true;
                break;
        case TIPC_GROUP_JOIN:
                rc = tipc_sk_join(tsk, &grp_req);
                break;
        case TIPC_GROUP_LEAVE:
                rc = tipc_sk_leave(tsk);
                break;
        default:
                rc = -EINVAL;
        }

        release_sock(sk);

        return rc;
}

/**
 * tipc_getsockopt - get socket option
 * @sock: socket structure
 * @lvl: option level
 * @opt: option identifier
 * @ov: receptacle for option value
 * @ol: receptacle for length of option value
 *
 * For stream sockets only, returns 0 length result for all IPPROTO_TCP options
 * (to ease compatibility).
 *
 * Every supported option yields a single u32, so the caller's buffer must be
 * at least that large; the reported length is always sizeof(u32).
 *
 * Return: 0 on success, errno otherwise
 */
static int tipc_getsockopt(struct socket *sock, int lvl, int opt,
                           char __user *ov, int __user *ol)
{
        struct sock *sk = sock->sk;
        struct tipc_sock *tsk = tipc_sk(sk);
        struct tipc_service_range self;
        int len, scope;
        u32 val;
        int rc;

        if (lvl == IPPROTO_TCP && sock->type == SOCK_STREAM)
                return put_user(0, ol);
        if (lvl != SOL_TIPC)
                return -ENOPROTOOPT;

        rc = get_user(len, ol);
        if (rc)
                return rc;

        /* rc is 0 from here on unless the option turns out to be unknown */
        lock_sock(sk);

        switch (opt) {
        case TIPC_IMPORTANCE:
                val = tsk_importance(tsk);
                break;
        case TIPC_SRC_DROPPABLE:
                val = tsk_unreliable(tsk);
                break;
        case TIPC_DEST_DROPPABLE:
                val = tsk_unreturnable(tsk);
                break;
        case TIPC_CONN_TIMEOUT:
                val = tsk->conn_timeout;
                break;
        case TIPC_NODE_RECVQ_DEPTH:
                val = 0; /* was tipc_queue_size, now obsolete */
                break;
        case TIPC_SOCK_RECVQ_DEPTH:
                val = skb_queue_len(&sk->sk_receive_queue);
                break;
        case TIPC_SOCK_RECVQ_USED:
                val = sk_rmem_alloc_get(sk);
                break;
        case TIPC_GROUP_JOIN:
                /* Report the group's service type, or 0 when not a member */
                self.type = 0;
                if (tsk->group)
                        tipc_group_self(tsk->group, &self, &scope);
                val = self.type;
                break;
        default:
                rc = -EINVAL;
        }

        release_sock(sk);

        if (rc)
                return rc;        /* "get" failed */
        if (len < sizeof(val))
                return -EINVAL;
        if (copy_to_user(ov, &val, sizeof(val)))
                return -EFAULT;

        return put_user(sizeof(val), ol);
}

static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        struct net *net = sock_net(sock->sk);
        struct tipc_sioc_nodeid_req nr = {0};
        struct tipc_sioc_ln_req lnr;
        void __user *argp = (void __user *)arg;

        switch (cmd) {
        case SIOCGETLINKNAME:
                if (copy_from_user(&lnr, argp, sizeof(lnr)))
                        return -EFAULT;
                if (!tipc_node_get_linkname(net,
                                            lnr.bearer_id & 0xffff, lnr.peer,
                                            lnr.linkname, TIPC_MAX_LINK_NAME)) {
                        if (copy_to_user(argp, &lnr, sizeof(lnr)))
                                return -EFAULT;
                        return 0;
                }
                return -EADDRNOTAVAIL;
        case SIOCGETNODEID:
                if (copy_from_user(&nr, argp, sizeof(nr)))
                        return -EFAULT;
                if (!tipc_node_get_id(net, nr.peer, nr.node_id))
                        return -EADDRNOTAVAIL;
                if (copy_to_user(argp, &nr, sizeof(nr)))
                        return -EFAULT;
                return 0;
        default:
                return -ENOIOCTLCMD;
        }
}

/* tipc_socketpair - connect two local sockets to each other
 * @sock1: first socket of the pair
 * @sock2: second socket of the pair
 *
 * Each socket records the other one as its peer (a node-scope socket address
 * on the own node) and is then moved to the connected state through
 * tipc_sk_finish_conn().
 *
 * Return: always 0
 */
static int tipc_socketpair(struct socket *sock1, struct socket *sock2)
{
        struct tipc_sock *second = tipc_sk(sock2->sk);
        struct tipc_sock *first = tipc_sk(sock1->sk);
        u32 self = tipc_own_addr(sock_net(sock1->sk));

        /* The first socket's peer is the second socket ... */
        first->peer.family = AF_TIPC;
        first->peer.addrtype = TIPC_SOCKET_ADDR;
        first->peer.scope = TIPC_NODE_SCOPE;
        first->peer.addr.id.ref = second->portid;
        first->peer.addr.id.node = self;

        /* ... and vice versa */
        second->peer.family = AF_TIPC;
        second->peer.addrtype = TIPC_SOCKET_ADDR;
        second->peer.scope = TIPC_NODE_SCOPE;
        second->peer.addr.id.ref = first->portid;
        second->peer.addr.id.node = self;

        tipc_sk_finish_conn(first, second->portid, self);
        tipc_sk_finish_conn(second, first->portid, self);

        return 0;
}

/* Protocol switches for the various types of TIPC sockets */

/* Socket operations without connection acceptance: accept() and listen() are
 * stubbed out with sock_no_accept/sock_no_listen, and data goes through
 * tipc_sendmsg()/tipc_recvmsg().
 * NOTE(review): presumably installed for the connectionless socket types; the
 * selection is made outside this part of the file - confirm in tipc_sk_create().
 */
static const struct proto_ops msg_ops = {
        .owner                = THIS_MODULE,
        .family                = AF_TIPC,
        .release        = tipc_release,
        .bind                = tipc_bind,
        .connect        = tipc_connect,
        .socketpair        = tipc_socketpair,
        .accept                = sock_no_accept,
        .getname        = tipc_getname,
        .poll                = tipc_poll,
        .ioctl                = tipc_ioctl,
        .listen                = sock_no_listen,
        .shutdown        = tipc_shutdown,
        .setsockopt        = tipc_setsockopt,
        .getsockopt        = tipc_getsockopt,
        .sendmsg        = tipc_sendmsg,
        .recvmsg        = tipc_recvmsg,
        .mmap                = sock_no_mmap,
};

/* Socket operations with listen()/accept() support where sending goes
 * through tipc_send_packet() and receiving through tipc_recvmsg().
 * NOTE(review): presumably installed for connection-oriented, message
 * preserving sockets; the selection is made outside this part of the file -
 * confirm in tipc_sk_create().
 */
static const struct proto_ops packet_ops = {
        .owner                = THIS_MODULE,
        .family                = AF_TIPC,
        .release        = tipc_release,
        .bind                = tipc_bind,
        .connect        = tipc_connect,
        .socketpair        = tipc_socketpair,
        .accept                = tipc_accept,
        .getname        = tipc_getname,
        .poll                = tipc_poll,
        .ioctl                = tipc_ioctl,
        .listen                = tipc_listen,
        .shutdown        = tipc_shutdown,
        .setsockopt        = tipc_setsockopt,
        .getsockopt        = tipc_getsockopt,
        .sendmsg        = tipc_send_packet,
        .recvmsg        = tipc_recvmsg,
        .mmap                = sock_no_mmap,
};

/* Socket operations with listen()/accept() support where data is handled as
 * a stream: tipc_sendstream()/tipc_recvstream().
 * NOTE(review): presumably installed for SOCK_STREAM sockets; the selection
 * is made outside this part of the file - confirm in tipc_sk_create().
 */
static const struct proto_ops stream_ops = {
        .owner                = THIS_MODULE,
        .family                = AF_TIPC,
        .release        = tipc_release,
        .bind                = tipc_bind,
        .connect        = tipc_connect,
        .socketpair        = tipc_socketpair,
        .accept                = tipc_accept,
        .getname        = tipc_getname,
        .poll                = tipc_poll,
        .ioctl                = tipc_ioctl,
        .listen                = tipc_listen,
        .shutdown        = tipc_shutdown,
        .setsockopt        = tipc_setsockopt,
        .getsockopt        = tipc_getsockopt,
        .sendmsg        = tipc_sendstream,
        .recvmsg        = tipc_recvstream,
        .mmap                = sock_no_mmap,
};

/* AF_TIPC address family descriptor: new sockets of this family are created
 * by tipc_sk_create(). Registered in tipc_socket_init() and unregistered in
 * tipc_socket_stop().
 */
static const struct net_proto_family tipc_family_ops = {
        .owner                = THIS_MODULE,
        .family                = AF_TIPC,
        .create                = tipc_sk_create
};

/* TIPC protocol descriptor: socket objects are sized as struct tipc_sock and
 * the receive buffer sysctl points at sysctl_tipc_rmem. Registered in
 * tipc_socket_init() and unregistered in tipc_socket_stop().
 */
static struct proto tipc_proto = {
        .name                = "TIPC",
        .owner                = THIS_MODULE,
        .obj_size        = sizeof(struct tipc_sock),
        .sysctl_rmem        = sysctl_tipc_rmem
};

/**
 * tipc_socket_init - initialize TIPC socket interface
 *
 * Registers the TIPC protocol first and the AF_TIPC socket family second;
 * if the second step fails the protocol registration is rolled back.
 *
 * Return: 0 on success, errno otherwise
 */
int tipc_socket_init(void)
{
        int err;

        err = proto_register(&tipc_proto, 1);
        if (err) {
                pr_err("Failed to register TIPC protocol type\n");
                return err;
        }

        err = sock_register(&tipc_family_ops);
        if (err) {
                pr_err("Failed to register TIPC socket type\n");
                proto_unregister(&tipc_proto);
        }

        return err;
}

/**
 * tipc_socket_stop - stop TIPC socket interface
 *
 * Undoes tipc_socket_init() in reverse order: the socket family is
 * unregistered first, the protocol second.
 */
void tipc_socket_stop(void)
{
        int family = tipc_family_ops.family;

        sock_unregister(family);
        proto_unregister(&tipc_proto);
}

/* __tipc_nl_add_sk_con - add a nested TIPC_NLA_SOCK_CON attribute describing
 *                        the socket's connection
 * @skb: netlink message being built
 * @tsk: connected socket
 *
 * Peer node and port are always emitted. The flag, service type and service
 * instance are added only when tsk->conn_addrtype is non-zero.
 *
 * Caller should hold socket lock for the passed tipc socket.
 *
 * Return: 0 on success, -EMSGSIZE if the message has no room left (the
 * partially written nest is cancelled in that case)
 */
static int __tipc_nl_add_sk_con(struct sk_buff *skb, struct tipc_sock *tsk)
{
        struct nlattr *con;

        con = nla_nest_start_noflag(skb, TIPC_NLA_SOCK_CON);
        if (!con)
                return -EMSGSIZE;

        if (nla_put_u32(skb, TIPC_NLA_CON_NODE, tsk_peer_node(tsk)) ||
            nla_put_u32(skb, TIPC_NLA_CON_SOCK, tsk_peer_port(tsk)))
                goto cancel;

        if (tsk->conn_addrtype != 0) {
                if (nla_put_flag(skb, TIPC_NLA_CON_FLAG) ||
                    nla_put_u32(skb, TIPC_NLA_CON_TYPE,
                                msg_nametype(&tsk->phdr)) ||
                    nla_put_u32(skb, TIPC_NLA_CON_INST,
                                msg_nameinst(&tsk->phdr)))
                        goto cancel;
        }

        nla_nest_end(skb, con);
        return 0;

cancel:
        nla_nest_cancel(skb, con);
        return -EMSGSIZE;
}

/* __tipc_nl_add_sk_info - add the basic socket attributes to a message
 * @skb: netlink message being built
 * @tsk: socket to describe
 *
 * Emits the socket's port id and the own node address, followed by either
 * the connection attributes (connected socket) or a "has publications" flag
 * (unconnected socket with at least one publication).
 *
 * Return: 0 on success, -EMSGSIZE if the message has no room left
 */
static int __tipc_nl_add_sk_info(struct sk_buff *skb, struct tipc_sock
                          *tsk)
{
        struct net *net = sock_net(skb->sk);
        struct sock *sk = &tsk->sk;

        if (nla_put_u32(skb, TIPC_NLA_SOCK_REF, tsk->portid))
                return -EMSGSIZE;
        if (nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tipc_own_addr(net)))
                return -EMSGSIZE;

        if (tipc_sk_connected(sk))
                return __tipc_nl_add_sk_con(skb, tsk) ? -EMSGSIZE : 0;

        if (list_empty(&tsk->publications))
                return 0;

        return nla_put_flag(skb, TIPC_NLA_SOCK_HAS_PUBL) ? -EMSGSIZE : 0;
}

/* __tipc_nl_add_sk - emit one TIPC_NL_SOCK_GET message for a socket
 * @skb: netlink dump buffer
 * @cb: netlink dump callback state
 * @tsk: socket to describe
 *
 * Caller should hold socket lock for the passed tipc socket.
 *
 * Return: 0 on success, -EMSGSIZE if the socket does not fit in @skb (any
 * partially written message is cancelled)
 */
static int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb,
                            struct tipc_sock *tsk)
{
        struct nlattr *sock_nest;
        void *msg_hdr;

        msg_hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
                              cb->nlh->nlmsg_seq, &tipc_genl_family,
                              NLM_F_MULTI, TIPC_NL_SOCK_GET);
        if (!msg_hdr)
                return -EMSGSIZE;

        sock_nest = nla_nest_start_noflag(skb, TIPC_NLA_SOCK);
        if (!sock_nest)
                goto drop_msg;

        if (__tipc_nl_add_sk_info(skb, tsk))
                goto drop_nest;

        nla_nest_end(skb, sock_nest);
        genlmsg_end(skb, msg_hdr);
        return 0;

drop_nest:
        nla_nest_cancel(skb, sock_nest);
drop_msg:
        genlmsg_cancel(skb, msg_hdr);
        return -EMSGSIZE;
}

/* tipc_nl_sk_walk - run a netlink fill handler over the sockets in the table
 * @skb: netlink message buffer being filled
 * @cb: netlink dump callback; cb->args[4] holds the rhashtable iterator
 *      (see __tipc_dump_start())
 * @skb_handler: called once per socket, with that socket's lock held
 *
 * The walk ends when the iterator is exhausted, when it reports an error
 * other than -EAGAIN, or as soon as @skb_handler returns non-zero.
 *
 * Return: skb->len, in all cases
 */
int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb,
                    int (*skb_handler)(struct sk_buff *skb,
                                       struct netlink_callback *cb,
                                       struct tipc_sock *tsk))
{
        struct rhashtable_iter *iter = (void *)cb->args[4];
        struct tipc_sock *tsk;
        int err;

        rhashtable_walk_start(iter);
        while ((tsk = rhashtable_walk_next(iter)) != NULL) {
                if (IS_ERR(tsk)) {
                        /* -EAGAIN: just ask the iterator again */
                        if (PTR_ERR(tsk) == -EAGAIN)
                                continue;
                        break;
                }

                /* Pin the socket and suspend the walk before taking the
                 * socket lock; the walk is resumed once the lock is released.
                 */
                sock_hold(&tsk->sk);
                rhashtable_walk_stop(iter);
                lock_sock(&tsk->sk);
                err = skb_handler(skb, cb, tsk);
                if (err) {
                        release_sock(&tsk->sk);
                        sock_put(&tsk->sk);
                        /* The walk is already stopped here, hence the jump
                         * past rhashtable_walk_stop() below.
                         */
                        goto out;
                }
                release_sock(&tsk->sk);
                rhashtable_walk_start(iter);
                sock_put(&tsk->sk);
        }
        rhashtable_walk_stop(iter);
out:
        return skb->len;
}
EXPORT_SYMBOL(tipc_nl_sk_walk);

/* tipc_dump_start - prepare a socket dump in the requester's namespace
 * @cb: netlink dump callback state
 *
 * Return: the result of __tipc_dump_start() for the namespace of the
 * requesting netlink socket
 */
int tipc_dump_start(struct netlink_callback *cb)
{
        struct net *net = sock_net(cb->skb->sk);

        return __tipc_dump_start(cb, net);
}
EXPORT_SYMBOL(tipc_dump_start);

int __tipc_dump_start(struct netlink_callback *cb, struct net *net)
{
        /* tipc_nl_name_table_dump() uses cb->args[0...3]. */
        struct rhashtable_iter *iter = (void *)cb->args[4];
        struct tipc_net *tn = tipc_net(net);

        if (!iter) {
                iter = kmalloc(sizeof(*iter), GFP_KERNEL);
                if (!iter)
                        return -ENOMEM;

                cb->args[4] = (long)iter;
        }

        rhashtable_walk_enter(&tn->sk_rht, iter);
        return 0;
}

/* tipc_dump_done - release the iterator of a finished socket dump
 * @cb: netlink dump callback state; the iterator is taken from cb->args[4]
 *
 * Return: always 0
 */
int tipc_dump_done(struct netlink_callback *cb)
{
        struct rhashtable_iter *walker = (void *)cb->args[4];

        /* Leave the table first, then free the iterator itself */
        rhashtable_walk_exit(walker);
        kfree(walker);

        return 0;
}
EXPORT_SYMBOL(tipc_dump_done);

/* tipc_sk_fill_sock_diag - add one socket's diagnostics to a netlink message
 * @skb: netlink message being built
 * @cb: netlink dump callback state (used to resolve the requester's user ns)
 * @tsk: socket to describe
 * @sk_filter_state: bitmask of socket states to report (bit n = state n)
 * @tipc_diag_gen_cookie: callback producing the socket cookie
 *
 * Return: 0 if the socket was added or was filtered out by its state,
 * -EMSGSIZE if the message has no room left (the partial nest is cancelled)
 */
int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb,
                           struct tipc_sock *tsk, u32 sk_filter_state,
                           u64 (*tipc_diag_gen_cookie)(struct sock *sk))
{
        struct sock *sk = &tsk->sk;
        struct nlattr *sock_nest;
        struct nlattr *stat_nest;

        /* Skip sockets whose state is not selected by the filter */
        if (!(sk_filter_state & (1 << sk->sk_state)))
                return 0;

        sock_nest = nla_nest_start_noflag(skb, TIPC_NLA_SOCK);
        if (!sock_nest)
                return -EMSGSIZE;

        if (__tipc_nl_add_sk_info(skb, tsk))
                goto cancel_sock;

        /* General socket properties */
        if (nla_put_u32(skb, TIPC_NLA_SOCK_TYPE, (u32)sk->sk_type))
                goto cancel_sock;
        if (nla_put_u32(skb, TIPC_NLA_SOCK_TIPC_STATE, (u32)sk->sk_state))
                goto cancel_sock;
        if (nla_put_u32(skb, TIPC_NLA_SOCK_INO, sock_i_ino(sk)))
                goto cancel_sock;
        if (nla_put_u32(skb, TIPC_NLA_SOCK_UID,
                        from_kuid_munged(sk_user_ns(NETLINK_CB(cb->skb).sk),
                                         sock_i_uid(sk))))
                goto cancel_sock;
        if (nla_put_u64_64bit(skb, TIPC_NLA_SOCK_COOKIE,
                              tipc_diag_gen_cookie(sk),
                              TIPC_NLA_SOCK_PAD))
                goto cancel_sock;

        /* Queue statistics and congestion flags */
        stat_nest = nla_nest_start_noflag(skb, TIPC_NLA_SOCK_STAT);
        if (!stat_nest)
                goto cancel_sock;

        if (nla_put_u32(skb, TIPC_NLA_SOCK_STAT_RCVQ,
                        skb_queue_len(&sk->sk_receive_queue)))
                goto cancel_stat;
        if (nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ,
                        skb_queue_len(&sk->sk_write_queue)))
                goto cancel_stat;
        if (nla_put_u32(skb, TIPC_NLA_SOCK_STAT_DROP,
                        atomic_read(&sk->sk_drops)))
                goto cancel_stat;

        if (tsk->cong_link_cnt &&
            nla_put_flag(skb, TIPC_NLA_SOCK_STAT_LINK_CONG))
                goto cancel_stat;

        if (tsk_conn_cong(tsk) &&
            nla_put_flag(skb, TIPC_NLA_SOCK_STAT_CONN_CONG))
                goto cancel_stat;

        nla_nest_end(skb, stat_nest);

        /* Group details, for group members only */
        if (tsk->group && tipc_group_fill_sock_diag(tsk->group, skb))
                goto cancel_stat;

        nla_nest_end(skb, sock_nest);

        return 0;

cancel_stat:
        nla_nest_cancel(skb, stat_nest);
cancel_sock:
        nla_nest_cancel(skb, sock_nest);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(tipc_sk_fill_sock_diag);

/* tipc_nl_sk_dump - netlink dump handler listing the TIPC sockets
 * @skb: netlink dump buffer
 * @cb: netlink dump callback state
 *
 * Return: the result of tipc_nl_sk_walk() with __tipc_nl_add_sk() as the
 * per-socket handler
 */
int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        int len;

        len = tipc_nl_sk_walk(skb, cb, __tipc_nl_add_sk);

        return len;
}

/* __tipc_nl_add_sk_publ - emit one TIPC_NL_PUBL_GET message for a publication
 * @skb: netlink dump buffer
 * @cb: netlink dump callback state
 * @publ: publication to describe (key, type, lower and upper bound)
 *
 * Caller should hold socket lock for the passed tipc socket.
 *
 * Return: 0 on success, -EMSGSIZE if the publication does not fit in @skb
 * (any partially written message is cancelled)
 */
static int __tipc_nl_add_sk_publ(struct sk_buff *skb,
                                 struct netlink_callback *cb,
                                 struct publication *publ)
{
        struct nlattr *publ_nest;
        void *msg_hdr;

        msg_hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
                              cb->nlh->nlmsg_seq, &tipc_genl_family,
                              NLM_F_MULTI, TIPC_NL_PUBL_GET);
        if (!msg_hdr)
                return -EMSGSIZE;

        publ_nest = nla_nest_start_noflag(skb, TIPC_NLA_PUBL);
        if (!publ_nest)
                goto drop_msg;

        if (nla_put_u32(skb, TIPC_NLA_PUBL_KEY, publ->key) ||
            nla_put_u32(skb, TIPC_NLA_PUBL_TYPE, publ->sr.type) ||
            nla_put_u32(skb, TIPC_NLA_PUBL_LOWER, publ->sr.lower) ||
            nla_put_u32(skb, TIPC_NLA_PUBL_UPPER, publ->sr.upper))
                goto drop_nest;

        nla_nest_end(skb, publ_nest);
        genlmsg_end(skb, msg_hdr);
        return 0;

drop_nest:
        nla_nest_cancel(skb, publ_nest);
drop_msg:
        genlmsg_cancel(skb, msg_hdr);
        return -EMSGSIZE;
}

/* __tipc_nl_list_sk_publ - dump a socket's publications, resumably
 * @skb: netlink dump buffer
 * @cb: netlink dump callback state
 * @tsk: socket whose publications are listed
 * @last_publ: in: key of the publication to resume from, or 0 to start at
 *             the beginning; out: key of the first publication that did not
 *             fit, or 0 when done or when the resume point is gone
 *
 * Caller should hold socket lock for the passed tipc socket.
 *
 * Return: 0 when all remaining publications were added; -EPIPE if the resume
 * point no longer exists; otherwise the error from __tipc_nl_add_sk_publ()
 */
static int __tipc_nl_list_sk_publ(struct sk_buff *skb,
                                  struct netlink_callback *cb,
                                  struct tipc_sock *tsk, u32 *last_publ)
{
        struct list_head *publs = &tsk->publications;
        struct publication *publ;
        int rc;

        if (!*last_publ) {
                publ = list_first_entry(publs, struct publication,
                                        binding_sock);
        } else {
                /* Locate the publication the previous pass stopped at */
                list_for_each_entry(publ, publs, binding_sock) {
                        if (publ->key == *last_publ)
                                break;
                }
                if (list_entry_is_head(publ, publs, binding_sock)) {
                        /* We never set seq or call nl_dump_check_consistent()
                         * this means that setting prev_seq here will cause the
                         * consistence check to fail in the netlink callback
                         * handler. Resulting in the last NLMSG_DONE message
                         * having the NLM_F_DUMP_INTR flag set.
                         */
                        cb->prev_seq = 1;
                        *last_publ = 0;
                        return -EPIPE;
                }
        }

        list_for_each_entry_from(publ, publs, binding_sock) {
                rc = __tipc_nl_add_sk_publ(skb, cb, publ);
                if (rc) {
                        /* Remember where to pick up on the next pass */
                        *last_publ = publ->key;
                        return rc;
                }
        }

        *last_publ = 0;
        return 0;
}

/* tipc_nl_publ_dump - netlink dump handler listing one socket's publications
 * @skb: netlink dump buffer
 * @cb: netlink dump callback state; args[0] = socket port id,
 *      args[1] = key of the publication to resume from, args[2] = done flag
 *
 * On the first pass the port id is taken from the TIPC_NLA_SOCK_REF attribute
 * nested in TIPC_NLA_SOCK; later passes reuse the value kept in cb->args[0].
 *
 * Return: skb->len while dumping; 0 once the dump is complete; -EINVAL if a
 * required attribute is missing or the socket is not found; otherwise the
 * attribute parsing error
 */
int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        u32 portid = cb->args[0];
        u32 resume_key = cb->args[1];
        u32 finished = cb->args[2];
        struct net *net = sock_net(skb->sk);
        struct tipc_sock *tsk;
        int rc;

        if (!portid) {
                struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs;
                struct nlattr *sock_attrs[TIPC_NLA_SOCK_MAX + 1];

                if (!attrs[TIPC_NLA_SOCK])
                        return -EINVAL;

                rc = nla_parse_nested_deprecated(sock_attrs,
                                                 TIPC_NLA_SOCK_MAX,
                                                 attrs[TIPC_NLA_SOCK],
                                                 tipc_nl_sock_policy, NULL);
                if (rc)
                        return rc;

                if (!sock_attrs[TIPC_NLA_SOCK_REF])
                        return -EINVAL;

                portid = nla_get_u32(sock_attrs[TIPC_NLA_SOCK_REF]);
        }

        if (finished)
                return 0;

        tsk = tipc_sk_lookup(net, portid);
        if (!tsk)
                return -EINVAL;

        lock_sock(&tsk->sk);
        rc = __tipc_nl_list_sk_publ(skb, cb, tsk, &resume_key);
        if (!rc)
                finished = 1;
        release_sock(&tsk->sk);
        /* Drop the reference obtained from tipc_sk_lookup() */
        sock_put(&tsk->sk);

        /* Save the progress for the next pass */
        cb->args[0] = portid;
        cb->args[1] = resume_key;
        cb->args[2] = finished;

        return skb->len;
}

/**
 * tipc_sk_filtering - check if a socket should be traced
 * @sk: the socket to be examined
 *
 * @sysctl_tipc_sk_filter is used as the socket tuple for filtering:
 * (portid, sock type, name type, name lower, name upper)
 *
 * A non-zero portid in the tuple decides the result on its own. Otherwise
 * the socket type is compared, followed by the name fields: these come from
 * the socket's first publication, or from its preformatted header when the
 * socket type is not connectionless.
 *
 * Return: true if the socket meets the socket tuple data
 * (value 0 = 'any') or when there is no tuple set (all = 0),
 * otherwise false. A NULL @sk always yields true.
 */
bool tipc_sk_filtering(struct sock *sk)
{
        u32 want_port, want_sktype, want_type, want_lower, want_upper;
        u32 type = 0, lower = 0, upper = 0;
        struct publication *publ;
        struct tipc_sock *tsk;

        if (!sk)
                return true;

        tsk = tipc_sk(sk);

        /* Snapshot the filter tuple once */
        want_port = sysctl_tipc_sk_filter[0];
        want_sktype = sysctl_tipc_sk_filter[1];
        want_type = sysctl_tipc_sk_filter[2];
        want_lower = sysctl_tipc_sk_filter[3];
        want_upper = sysctl_tipc_sk_filter[4];

        /* No filter configured at all */
        if (!(want_port || want_sktype || want_type || want_lower ||
              want_upper))
                return true;

        if (want_port)
                return want_port == tsk->portid;

        if (want_sktype && want_sktype != sk->sk_type)
                return false;

        if (tsk->published) {
                publ = list_first_entry_or_null(&tsk->publications,
                                                struct publication,
                                                binding_sock);
                if (publ) {
                        type = publ->sr.type;
                        lower = publ->sr.lower;
                        upper = publ->sr.upper;
                }
        }

        /* Header values take precedence over the publication's */
        if (!tipc_sk_type_connectionless(sk)) {
                type = msg_nametype(&tsk->phdr);
                lower = msg_nameinst(&tsk->phdr);
                upper = lower;
        }

        if (want_type && want_type != type)
                return false;
        if (want_lower && want_lower != lower)
                return false;
        if (want_upper && want_upper != upper)
                return false;

        return true;
}

/* tipc_sock_get_portid - get the port id of a socket
 * @sk: socket to query, may be NULL
 *
 * Return: the socket's port id, or 0 if @sk is NULL
 */
u32 tipc_sock_get_portid(struct sock *sk)
{
        if (!sk)
                return 0;

        return tipc_sk(sk)->portid;
}

/**
 * tipc_sk_overlimit1 - check if socket rx queue is about to be overloaded,
 *                        both the rcv and backlog queues are considered
 * @sk: tipc sk to be checked
 * @skb: tipc msg to be checked
 *
 * Return: true if the socket rx queue allocation is > 90%, otherwise false
 */

bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb)
{
        atomic_t *dcnt = &tipc_sk(sk)->dupl_rcvcnt;
        unsigned int lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt);
        unsigned int qsize = sk->sk_backlog.len + sk_rmem_alloc_get(sk);

        return (qsize > lim * 90 / 100);
}

/**
 * tipc_sk_overlimit2 - check if socket rx queue is about to be overloaded,
 *                        only the rcv queue is considered
 * @sk: tipc sk to be checked
 * @skb: tipc msg to be checked
 *
 * The limit is rcvbuf_limit() for @skb and the queue size is the allocated
 * receive memory. The 90% comparison is carried out as a 64-bit
 * cross-multiplication so a large limit cannot wrap the 32-bit intermediate
 * product.
 *
 * Return: true if the socket rx queue allocation is > 90%, otherwise false
 */
bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb)
{
        unsigned int lim = rcvbuf_limit(sk, skb);
        unsigned int qsize = sk_rmem_alloc_get(sk);

        /* For integers: qsize > lim * 90 / 100  <=>  qsize * 100 > lim * 90 */
        return (unsigned long long)qsize * 100 > (unsigned long long)lim * 90;
}

/**
 * tipc_sk_dump - dump TIPC socket
 * @sk: tipc sk to be dumped
 * @dqueues: bitmask to decide if any socket queue to be dumped?
 *           - TIPC_DUMP_NONE: don't dump socket queues
 *           - TIPC_DUMP_SK_SNDQ: dump socket send queue
 *           - TIPC_DUMP_SK_RCVQ: dump socket rcv queue
 *           - TIPC_DUMP_SK_BKLGQ: dump socket backlog queue
 *           - TIPC_DUMP_ALL: dump all the socket queues above
 * @buf: returned buffer of dump data in format
 *
 * The text written directly by this function is bounded by SK_LMAX when
 * @dqueues is non-zero and by SK_LMIN otherwise. Fields are separated by
 * spaces, with " | " between groups, in the order they are emitted below.
 *
 * Return: the sum of the values returned by the scnprintf() calls and by
 * the queue dump helpers.
 */
int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf)
{
        int i = 0;
        size_t sz = (dqueues) ? SK_LMAX : SK_LMIN;
        u32 conn_type, conn_instance;
        struct tipc_sock *tsk;
        struct publication *p;
        bool tsk_connected;

        if (!sk) {
                i += scnprintf(buf, sz, "sk data: (null)\n");
                return i;
        }

        tsk = tipc_sk(sk);
        tsk_connected = !tipc_sk_type_connectionless(sk);

        /* Socket identity: type, state, own node, port id. */
        i += scnprintf(buf, sz, "sk data: %u", sk->sk_type);
        i += scnprintf(buf + i, sz - i, " %d", sk->sk_state);
        i += scnprintf(buf + i, sz - i, " %x", tsk_own_node(tsk));
        i += scnprintf(buf + i, sz - i, " %u", tsk->portid);
        /* Connection group: flag, then peer and name info when set. */
        i += scnprintf(buf + i, sz - i, " | %u", tsk_connected);
        if (tsk_connected) {
                i += scnprintf(buf + i, sz - i, " %x", tsk_peer_node(tsk));
                i += scnprintf(buf + i, sz - i, " %u", tsk_peer_port(tsk));
                conn_type = msg_nametype(&tsk->phdr);
                conn_instance = msg_nameinst(&tsk->phdr);
                i += scnprintf(buf + i, sz - i, " %u", conn_type);
                i += scnprintf(buf + i, sz - i, " %u", conn_instance);
        }
        /* Publication group: flag, then the first publication (0s if none). */
        i += scnprintf(buf + i, sz - i, " | %u", tsk->published);
        if (tsk->published) {
                p = list_first_entry_or_null(&tsk->publications,
                                             struct publication, binding_sock);
                i += scnprintf(buf + i, sz - i, " %u", (p) ? p->sr.type : 0);
                i += scnprintf(buf + i, sz - i, " %u", (p) ? p->sr.lower : 0);
                i += scnprintf(buf + i, sz - i, " %u", (p) ? p->sr.upper : 0);
        }
        /* Flow-control and state counters. */
        i += scnprintf(buf + i, sz - i, " | %u", tsk->snd_win);
        i += scnprintf(buf + i, sz - i, " %u", tsk->rcv_win);
        i += scnprintf(buf + i, sz - i, " %u", tsk->max_pkt);
        i += scnprintf(buf + i, sz - i, " %x", tsk->peer_caps);
        i += scnprintf(buf + i, sz - i, " %u", tsk->cong_link_cnt);
        i += scnprintf(buf + i, sz - i, " %u", tsk->snt_unacked);
        i += scnprintf(buf + i, sz - i, " %u", tsk->rcv_unacked);
        i += scnprintf(buf + i, sz - i, " %u", atomic_read(&tsk->dupl_rcvcnt));
        i += scnprintf(buf + i, sz - i, " %u", sk->sk_shutdown);
        /* Memory accounting: send, receive, backlog. */
        i += scnprintf(buf + i, sz - i, " | %d", sk_wmem_alloc_get(sk));
        i += scnprintf(buf + i, sz - i, " %d", sk->sk_sndbuf);
        i += scnprintf(buf + i, sz - i, " | %d", sk_rmem_alloc_get(sk));
        i += scnprintf(buf + i, sz - i, " %d", sk->sk_rcvbuf);
        i += scnprintf(buf + i, sz - i, " | %d\n", READ_ONCE(sk->sk_backlog.len));

        /*
         * NOTE(review): the queue dump helpers below receive only buf + i,
         * not the remaining size; presumably they apply their own bound --
         * confirm against their definitions.
         */
        if (dqueues & TIPC_DUMP_SK_SNDQ) {
                i += scnprintf(buf + i, sz - i, "sk_write_queue: ");
                i += tipc_list_dump(&sk->sk_write_queue, false, buf + i);
        }

        if (dqueues & TIPC_DUMP_SK_RCVQ) {
                i += scnprintf(buf + i, sz - i, "sk_receive_queue: ");
                i += tipc_list_dump(&sk->sk_receive_queue, false, buf + i);
        }

        if (dqueues & TIPC_DUMP_SK_BKLGQ) {
                i += scnprintf(buf + i, sz - i, "sk_backlog:\n  head ");
                i += tipc_skb_dump(sk->sk_backlog.head, false, buf + i);
                /* Only print the tail when it differs from the head. */
                if (sk->sk_backlog.tail != sk->sk_backlog.head) {
                        i += scnprintf(buf + i, sz - i, "  tail ");
                        i += tipc_skb_dump(sk->sk_backlog.tail, false,
                                           buf + i);
                }
        }

        return i;
}

































































































    1 


































































    1 




    1 
































    1 







    1 





























































































































































    1 





    1 




















    1 












    1 







    1 



    1 









    1 















































































































    1 











    1 








































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
// SPDX-License-Identifier: GPL-2.0-only
/*
 * proc/fs/generic.c --- generic routines for the proc-fs
 *
 * This file contains generic proc-fs routines for handling
 * directories and files.
 * 
 * Copyright (C) 1991, 1992 Linus Torvalds.
 * Copyright (C) 1997 Theodore Ts'o
 */

#include <linux/cache.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/printk.h>
#include <linux/mount.h>
#include <linux/init.h>
#include <linux/idr.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>

#include "internal.h"

/*
 * Taken for reading around lookups and iteration of the per-directory
 * subdir rbtrees, and for writing when entries are inserted or erased.
 */
static DEFINE_RWLOCK(proc_subdir_lock);

/* Slab cache from which proc_dir_entry objects are allocated and freed. */
struct kmem_cache *proc_dir_entry_cache __ro_after_init;

/*
 * Free a proc_dir_entry together with the memory it owns: a separately
 * allocated name (one that is not the inline buffer) and, for symlinks,
 * the buffer kept in ->data.
 */
void pde_free(struct proc_dir_entry *pde)
{
        /* Names that fit are stored inline and must not be kfree()d. */
        if (pde->name != pde->inline_name)
                kfree(pde->name);

        if (S_ISLNK(pde->mode))
                kfree(pde->data);

        kmem_cache_free(proc_dir_entry_cache, pde);
}

/*
 * Order @name (of length @len) against the name of @de: shorter names sort
 * first, names of equal length are ordered by memcmp().
 * Returns <0, 0 or >0.
 */
static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len)
{
        if (len != de->namelen)
                return (len < de->namelen) ? -1 : 1;

        return memcmp(name, de->name, len);
}

/* Return the first entry in @dir's subdir rbtree, or NULL if it is empty. */
static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir)
{
        struct rb_node *first = rb_first(&dir->subdir);

        return rb_entry_safe(first, struct proc_dir_entry, subdir_node);
}

/* Return the rbtree successor of @dir among its siblings, or NULL at the end. */
static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir)
{
        struct rb_node *succ = rb_next(&dir->subdir_node);

        return rb_entry_safe(succ, struct proc_dir_entry, subdir_node);
}

/*
 * Search @dir's subdir rbtree for an entry whose name matches the first
 * @len bytes of @name. Returns the entry, or NULL when none matches.
 */
static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
                                              const char *name,
                                              unsigned int len)
{
        struct rb_node *cur;

        for (cur = dir->subdir.rb_node; cur; ) {
                struct proc_dir_entry *entry;
                int cmp;

                entry = rb_entry(cur, struct proc_dir_entry, subdir_node);
                cmp = proc_match(name, entry, len);
                if (cmp == 0)
                        return entry;

                cur = (cmp < 0) ? cur->rb_left : cur->rb_right;
        }

        return NULL;
}

/*
 * Link @de into @dir's subdir rbtree, keyed by name via proc_match().
 * Returns false, leaving the tree untouched, when an entry with the same
 * name is already present; true once @de has been inserted.
 */
static bool pde_subdir_insert(struct proc_dir_entry *dir,
                              struct proc_dir_entry *de)
{
        struct rb_root *root = &dir->subdir;
        struct rb_node **link = &root->rb_node;
        struct rb_node *parent = NULL;

        /* Descend to the empty slot where the new node belongs. */
        while (*link) {
                struct proc_dir_entry *cur;
                int cmp;

                parent = *link;
                cur = rb_entry(parent, struct proc_dir_entry, subdir_node);
                cmp = proc_match(de->name, cur, de->namelen);
                if (cmp == 0)
                        return false;

                link = (cmp < 0) ? &parent->rb_left : &parent->rb_right;
        }

        /* Hook the node in and let the rbtree rebalance. */
        rb_link_node(&de->subdir_node, parent, link);
        rb_insert_color(&de->subdir_node, root);
        return true;
}

/*
 * ->setattr handler: validate and apply @iattr to the inode, then mirror
 * the resulting owner and mode into the backing proc_dir_entry.
 * Returns 0, or the error from setattr_prepare().
 */
static int proc_notify_change(struct mnt_idmap *idmap,
                              struct dentry *dentry, struct iattr *iattr)
{
        struct inode *inode = d_inode(dentry);
        struct proc_dir_entry *de = PDE(inode);
        int err = setattr_prepare(&nop_mnt_idmap, dentry, iattr);

        if (err)
                return err;

        setattr_copy(&nop_mnt_idmap, inode, iattr);

        /* Keep the proc entry in sync with what was just applied. */
        de->mode = inode->i_mode;
        proc_set_user(de, inode->i_uid, inode->i_gid);
        return 0;
}

/*
 * ->getattr handler: refresh the inode's link count from the proc entry
 * (when the entry exists and its count is non-zero) and fill @stat.
 * Always returns 0.
 */
static int proc_getattr(struct mnt_idmap *idmap,
                        const struct path *path, struct kstat *stat,
                        u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct proc_dir_entry *de = PDE(inode);
        nlink_t nlink = de ? READ_ONCE(de->nlink) : 0;

        if (nlink > 0)
                set_nlink(inode, nlink);

        generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
        return 0;
}

/* Inode operations installed on regular entries by proc_create_reg(). */
static const struct inode_operations proc_file_inode_operations = {
        .setattr        = proc_notify_change,
};

/*
 * This function parses a name such as "tty/driver/serial", and
 * returns the struct proc_dir_entry for "/proc/tty/driver", and
 * returns "serial" in residual.
 */
/*
 * Walk every '/'-terminated component of @name starting at *@ret (or at
 * proc_root when *@ret is NULL). On success *@ret is the directory reached
 * and *@residual points at the last component. Returns -ENOENT, after a
 * WARN, when an intermediate component is not found.
 */
static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
                             const char **residual)
{
        struct proc_dir_entry *dir = *ret ? *ret : &proc_root;
        const char *component = name;
        const char *slash;

        for (;;) {
                slash = strchr(component, '/');
                if (!slash)
                        break;

                dir = pde_subdir_find(dir, component, slash - component);
                if (!dir) {
                        WARN(1, "name '%s'\n", name);
                        return -ENOENT;
                }
                component = slash + 1;
        }

        *residual = component;
        *ret = dir;
        return 0;
}

/* Locked wrapper: run __xlate_proc_name() with proc_subdir_lock read-held. */
static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
                           const char **residual)
{
        int err;

        read_lock(&proc_subdir_lock);
        err = __xlate_proc_name(name, ret, residual);
        read_unlock(&proc_subdir_lock);

        return err;
}

/* ID allocator behind proc_alloc_inum() / proc_free_inum(). */
static DEFINE_IDA(proc_inum_ida);

/* Base added to an allocated ID to form the inode number. */
#define PROC_DYNAMIC_FIRST 0xF0000000U

/*
 * Allocate an inode number between PROC_DYNAMIC_FIRST and 0xffffffff
 * and store it in *inum.  Returns zero on success, or the negative
 * value reported by the ID allocator on failure.
 */
int proc_alloc_inum(unsigned int *inum)
{
        int id = ida_simple_get(&proc_inum_ida, 0,
                                UINT_MAX - PROC_DYNAMIC_FIRST + 1,
                                GFP_KERNEL);

        /* On failure hand back the allocator's error; *inum is untouched. */
        if (id < 0)
                return id;

        *inum = PROC_DYNAMIC_FIRST + (unsigned int)id;
        return 0;
}

/* Give back an inode number previously obtained from proc_alloc_inum(). */
void proc_free_inum(unsigned int inum)
{
        unsigned int id = inum - PROC_DYNAMIC_FIRST;

        ida_simple_remove(&proc_inum_ida, id);
}

/*
 * ->d_revalidate: refuse RCU-walk with -ECHILD; otherwise report the dentry
 * valid (1) unless the entry's in_use counter is negative, in which case
 * 0 is returned so the dentry gets revalidated.
 */
static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags)
{
        if (flags & LOOKUP_RCU)
                return -ECHILD;

        return atomic_read(&PDE(d_inode(dentry))->in_use) >= 0;
}

/* ->d_delete: non-zero when the entry's in_use counter is negative. */
static int proc_misc_d_delete(const struct dentry *dentry)
{
        int users = atomic_read(&PDE(d_inode(dentry))->in_use);

        return users < 0;
}

/* Default dentry operations assigned to new entries by __proc_create(). */
static const struct dentry_operations proc_misc_dentry_ops = {
        .d_revalidate        = proc_misc_d_revalidate,
        .d_delete        = proc_misc_d_delete,
};

/*
 * Don't create negative dentries here, return -ENOENT by hand
 * instead.
 */
struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
                              struct proc_dir_entry *de)
{
        struct inode *inode;

        read_lock(&proc_subdir_lock);
        de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
        if (!de) {
                read_unlock(&proc_subdir_lock);
                return ERR_PTR(-ENOENT);
        }

        /* Take a reference before the lock is released. */
        pde_get(de);
        read_unlock(&proc_subdir_lock);

        inode = proc_get_inode(dir->i_sb, de);
        if (!inode)
                return ERR_PTR(-ENOMEM);

        d_set_d_op(dentry, de->proc_dops);
        return d_splice_alias(inode, dentry);
}

/*
 * ->lookup for generic proc directories. Returns -ENOENT when the
 * superblock's pidonly setting is PROC_PIDONLY_ON; otherwise the name is
 * resolved below the directory's proc entry.
 */
struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
                unsigned int flags)
{
        if (proc_sb_info(dir->i_sb)->pidonly == PROC_PIDONLY_ON)
                return ERR_PTR(-ENOENT);

        return proc_lookup_de(dir, dentry, PDE(dir));
}

/*
 * This returns non-zero if at EOF, so that the /proc
 * root directory can use this and check if it should
 * continue with the <pid> entries..
 *
 * Note that the VFS-layer doesn't care about the return
 * value of the readdir() call, as long as it's non-negative
 * for success..
 */
int proc_readdir_de(struct file *file, struct dir_context *ctx,
                    struct proc_dir_entry *de)
{
        int i;

        if (!dir_emit_dots(file, ctx))
                return 0;

        /*
         * Number of entries to skip before emitting; presumably the first
         * two positions are the dot entries handled above -- confirm
         * against dir_emit_dots().
         */
        i = ctx->pos - 2;
        read_lock(&proc_subdir_lock);
        de = pde_subdir_first(de);
        /* Advance to the i-th entry; give up if the directory is shorter. */
        for (;;) {
                if (!de) {
                        read_unlock(&proc_subdir_lock);
                        return 0;
                }
                if (!i)
                        break;
                de = pde_subdir_next(de);
                i--;
        }

        do {
                struct proc_dir_entry *next;
                /*
                 * Hold a reference on the entry so it can still be used
                 * after the lock is dropped around dir_emit().
                 */
                pde_get(de);
                read_unlock(&proc_subdir_lock);
                if (!dir_emit(ctx, de->name, de->namelen,
                            de->low_ino, de->mode >> 12)) {
                        /* Emit failed: stop here without advancing ctx->pos. */
                        pde_put(de);
                        return 0;
                }
                ctx->pos++;
                /* Retake the lock before following the tree to the successor. */
                read_lock(&proc_subdir_lock);
                next = pde_subdir_next(de);
                pde_put(de);
                de = next;
        } while (de);
        read_unlock(&proc_subdir_lock);
        /* Every remaining entry was emitted. */
        return 1;
}

/*
 * Directory iteration entry point for generic proc directories. Returns 1
 * without emitting anything when the superblock's pidonly setting is
 * PROC_PIDONLY_ON; otherwise defers to proc_readdir_de().
 */
int proc_readdir(struct file *file, struct dir_context *ctx)
{
        struct inode *inode = file_inode(file);

        if (proc_sb_info(inode->i_sb)->pidonly == PROC_PIDONLY_ON)
                return 1;

        return proc_readdir_de(file, ctx, PDE(inode));
}

/*
 * These are the generic /proc directory operations. They
 * use the in-memory "struct proc_dir_entry" tree to parse
 * the /proc directory.
 */
/* Installed as ->proc_dir_ops on directories created by _proc_mkdir(). */
static const struct file_operations proc_dir_operations = {
        .llseek                        = generic_file_llseek,
        .read                        = generic_read_dir,
        .iterate_shared                = proc_readdir,
};

/* ->d_revalidate for proc_net_dentry_ops: always returns 0. */
static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags)
{
        return 0;
}

/*
 * Dentry operations whose ->d_revalidate always returns 0. __proc_create()
 * checks for these on the parent to decide whether to force lookups on the
 * new entry.
 */
const struct dentry_operations proc_net_dentry_ops = {
        .d_revalidate        = proc_net_d_revalidate,
        .d_delete        = always_delete_dentry,
};

/*
 * proc directories can do almost nothing..
 */
/* Installed as ->proc_iops on directories created by _proc_mkdir(). */
static const struct inode_operations proc_dir_inode_operations = {
        .lookup                = proc_lookup,
        .getattr        = proc_getattr,
        .setattr        = proc_notify_change,
};

/*
 * Assign an inode number to @dp and insert it under @dir.
 * Returns @dp on success. On failure (no inode number available, or a
 * sibling with the same name already exists) @dp is freed and NULL is
 * returned.
 */
struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
                struct proc_dir_entry *dp)
{
        if (proc_alloc_inum(&dp->low_ino)) {
                pde_free(dp);
                return NULL;
        }

        write_lock(&proc_subdir_lock);
        dp->parent = dir;
        if (!pde_subdir_insert(dir, dp)) {
                WARN(1, "proc_dir_entry '%s/%s' already registered\n",
                     dir->name, dp->name);
                write_unlock(&proc_subdir_lock);

                /* Undo the inode number allocation before freeing. */
                proc_free_inum(dp->low_ino);
                pde_free(dp);
                return NULL;
        }
        dir->nlink++;
        write_unlock(&proc_subdir_lock);

        return dp;
}

/*
 * Allocate and initialise (but do not register) a proc entry.
 *
 * @name may contain '/' separated directories; they are resolved from
 * *@parent (proc_root when NULL) and *@parent is updated to the directory
 * that will hold the new entry. The final component is rejected, with a
 * WARN, when it is empty, 256 bytes or longer, "." or "..", accepted by
 * name_to_int() directly under proc_root, or when the parent is a
 * permanently empty directory.
 *
 * Returns the new entry with a reference count of 1, or NULL on failure.
 */
static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
                                          const char *name,
                                          umode_t mode,
                                          nlink_t nlink)
{
        struct proc_dir_entry *ent;
        struct qstr qstr;
        const char *fn;

        if (xlate_proc_name(name, parent, &fn) != 0)
                return NULL;

        qstr.name = fn;
        qstr.len = strlen(fn);

        if (qstr.len == 0 || qstr.len >= 256) {
                WARN(1, "name len %u\n", qstr.len);
                return NULL;
        }
        if (qstr.len == 1 && fn[0] == '.') {
                WARN(1, "name '.'\n");
                return NULL;
        }
        if (qstr.len == 2 && fn[0] == '.' && fn[1] == '.') {
                WARN(1, "name '..'\n");
                return NULL;
        }
        if (*parent == &proc_root && name_to_int(&qstr) != ~0U) {
                WARN(1, "create '/proc/%s' by hand\n", qstr.name);
                return NULL;
        }
        if (is_empty_pde(*parent)) {
                WARN(1, "attempt to add to permanently empty directory");
                return NULL;
        }

        ent = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
        if (!ent)
                return NULL;

        /* Use the inline buffer when the name (plus NUL) fits into it. */
        ent->name = ent->inline_name;
        if (qstr.len + 1 > SIZEOF_PDE_INLINE_NAME) {
                ent->name = kmalloc(qstr.len + 1, GFP_KERNEL);
                if (!ent->name) {
                        pde_free(ent);
                        return NULL;
                }
        }
        memcpy(ent->name, fn, qstr.len + 1);
        ent->namelen = qstr.len;

        ent->mode = mode;
        ent->nlink = nlink;
        ent->subdir = RB_ROOT;
        refcount_set(&ent->refcnt, 1);
        spin_lock_init(&ent->pde_unload_lock);
        INIT_LIST_HEAD(&ent->pde_openers);
        /* New entries start out owned by whoever owns the parent. */
        proc_set_user(ent, (*parent)->uid, (*parent)->gid);

        ent->proc_dops = &proc_misc_dentry_ops;
        /* Revalidate everything under /proc/${pid}/net */
        if ((*parent)->proc_dops == &proc_net_dentry_ops)
                pde_force_lookup(ent);

        return ent;
}

/*
 * Create and register a symlink entry @name under @parent pointing at
 * @dest. The target string is copied; its length is recorded in ->size.
 * Returns the registered entry or NULL on failure.
 */
struct proc_dir_entry *proc_symlink(const char *name,
                struct proc_dir_entry *parent, const char *dest)
{
        struct proc_dir_entry *ent;

        ent = __proc_create(&parent, name,
                          (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO), 1);
        if (!ent)
                return NULL;

        ent->size = strlen(dest);
        ent->data = kmalloc(ent->size + 1, GFP_KERNEL);
        if (!ent->data) {
                pde_free(ent);
                return NULL;
        }

        strcpy((char *)ent->data, dest);
        ent->proc_iops = &proc_link_inode_operations;
        return proc_register(parent, ent);
}
EXPORT_SYMBOL(proc_symlink);

/*
 * Create and register a directory @name under @parent. A @mode of 0 selects
 * S_IRUGO | S_IXUGO. @data is stored in the entry; @force_lookup marks the
 * entry with pde_force_lookup(). Returns the registered entry or NULL.
 */
struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode,
                struct proc_dir_entry *parent, void *data, bool force_lookup)
{
        struct proc_dir_entry *ent;

        if (!mode)
                mode = S_IRUGO | S_IXUGO;

        ent = __proc_create(&parent, name, S_IFDIR | mode, 2);
        if (!ent)
                return NULL;

        ent->data = data;
        ent->proc_dir_ops = &proc_dir_operations;
        ent->proc_iops = &proc_dir_inode_operations;
        if (force_lookup)
                pde_force_lookup(ent);

        return proc_register(parent, ent);
}
EXPORT_SYMBOL_GPL(_proc_mkdir);

/* Create a directory with private @data and without forced lookups. */
struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
                struct proc_dir_entry *parent, void *data)
{
        const bool force_lookup = false;

        return _proc_mkdir(name, mode, parent, data, force_lookup);
}
EXPORT_SYMBOL_GPL(proc_mkdir_data);

/* Create a directory with an explicit @mode and no private data. */
struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode,
                                       struct proc_dir_entry *parent)
{
        void *no_data = NULL;

        return proc_mkdir_data(name, mode, parent, no_data);
}
EXPORT_SYMBOL(proc_mkdir_mode);

/* Create a directory with the default mode (mode 0) and no private data. */
struct proc_dir_entry *proc_mkdir(const char *name,
                struct proc_dir_entry *parent)
{
        void *no_data = NULL;

        return proc_mkdir_data(name, 0, parent, no_data);
}
EXPORT_SYMBOL(proc_mkdir);

/*
 * Create and register a directory entry @name (resolved from proc_root)
 * that carries no data, no directory file operations and no inode
 * operations. Returns the registered entry or NULL.
 */
struct proc_dir_entry *proc_create_mount_point(const char *name)
{
        struct proc_dir_entry *parent = NULL;
        struct proc_dir_entry *ent;

        ent = __proc_create(&parent, name, S_IFDIR | S_IRUGO | S_IXUGO, 2);
        if (!ent)
                return NULL;

        ent->data = NULL;
        ent->proc_dir_ops = NULL;
        ent->proc_iops = NULL;
        return proc_register(parent, ent);
}
EXPORT_SYMBOL(proc_create_mount_point);

/*
 * Allocate (but do not register) an entry for a regular file. A @mode
 * without a file type defaults to S_IFREG, and one without any permission
 * bits defaults to S_IRUGO; any non-regular type is rejected with a
 * one-time warning. Returns the new entry or NULL.
 */
struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
                struct proc_dir_entry **parent, void *data)
{
        struct proc_dir_entry *p;

        if (!(mode & S_IFMT))
                mode |= S_IFREG;
        if (!(mode & S_IALLUGO))
                mode |= S_IRUGO;
        if (WARN_ON_ONCE(!S_ISREG(mode)))
                return NULL;

        p = __proc_create(parent, name, mode, 1);
        if (!p)
                return NULL;

        p->proc_iops = &proc_file_inode_operations;
        p->data = data;
        return p;
}

/* Copy PROC_ENTRY_PERMANENT from the entry's proc_ops flags to the entry. */
static inline void pde_set_flags(struct proc_dir_entry *pde)
{
        if (!(pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT))
                return;

        pde->flags |= PROC_ENTRY_PERMANENT;
}

/*
 * Create and register a regular file @name under @parent that is served by
 * @proc_ops and carries @data. Returns the registered entry or NULL.
 */
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
                struct proc_dir_entry *parent,
                const struct proc_ops *proc_ops, void *data)
{
        struct proc_dir_entry *ent = proc_create_reg(name, mode, &parent, data);

        if (ent) {
                ent->proc_ops = proc_ops;
                pde_set_flags(ent);
                ent = proc_register(parent, ent);
        }
        return ent;
}
EXPORT_SYMBOL(proc_create_data);
 
/* Same as proc_create_data() with no private data attached. */
struct proc_dir_entry *proc_create(const char *name, umode_t mode,
                                   struct proc_dir_entry *parent,
                                   const struct proc_ops *proc_ops)
{
        void *no_data = NULL;

        return proc_create_data(name, mode, parent, proc_ops, no_data);
}
EXPORT_SYMBOL(proc_create);

/*
 * Open handler for seq_file backed entries: uses seq_open_private() with
 * the entry's state_size when that is non-zero, plain seq_open() otherwise.
 */
static int proc_seq_open(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *de = PDE(inode);

        if (!de->state_size)
                return seq_open(file, de->seq_ops);

        return seq_open_private(file, de->seq_ops, de->state_size);
}

/* Release handler matching proc_seq_open(): picks the matching seq release. */
static int proc_seq_release(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *de = PDE(inode);

        if (!de->state_size)
                return seq_release(inode, file);

        return seq_release_private(inode, file);
}

/* proc_ops installed on entries created by proc_create_seq_private(). */
static const struct proc_ops proc_seq_ops = {
        /* not permanent -- can call into arbitrary seq_operations */
        .proc_open        = proc_seq_open,
        .proc_read_iter        = seq_read_iter,
        .proc_lseek        = seq_lseek,
        .proc_release        = proc_seq_release,
};

/*
 * Create and register a regular file served through the seq_file iterator
 * @ops. @state_size is recorded on the entry and later decides whether the
 * open handler uses seq_open_private(). Returns the entry or NULL.
 */
struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
                struct proc_dir_entry *parent, const struct seq_operations *ops,
                unsigned int state_size, void *data)
{
        struct proc_dir_entry *ent = proc_create_reg(name, mode, &parent, data);

        if (ent) {
                ent->proc_ops = &proc_seq_ops;
                ent->seq_ops = ops;
                ent->state_size = state_size;
                ent = proc_register(parent, ent);
        }
        return ent;
}
EXPORT_SYMBOL(proc_create_seq_private);

/* Open handler: single_open() with the entry's show callback and data. */
static int proc_single_open(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *entry = PDE(inode);

        return single_open(file, entry->single_show, entry->data);
}

/* proc_ops installed on entries created by proc_create_single_data(). */
static const struct proc_ops proc_single_ops = {
        /* not permanent -- can call into arbitrary ->single_show */
        .proc_open        = proc_single_open,
        .proc_read_iter = seq_read_iter,
        .proc_lseek        = seq_lseek,
        .proc_release        = single_release,
};

/*
 * Create and register a regular file whose content is produced by the
 * single @show callback, with @data stored on the entry.
 * Returns the registered entry or NULL.
 */
struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
                struct proc_dir_entry *parent,
                int (*show)(struct seq_file *, void *), void *data)
{
        struct proc_dir_entry *ent = proc_create_reg(name, mode, &parent, data);

        if (ent) {
                ent->proc_ops = &proc_single_ops;
                ent->single_show = show;
                ent = proc_register(parent, ent);
        }
        return ent;
}
EXPORT_SYMBOL(proc_create_single_data);

/* Record @size as the size of proc entry @de. */
void proc_set_size(struct proc_dir_entry *de, loff_t size)
{
        de->size = size;
}
EXPORT_SYMBOL(proc_set_size);

/* Record @uid and @gid as the owner of proc entry @de. */
void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid)
{
        de->gid = gid;
        de->uid = uid;
}
EXPORT_SYMBOL(proc_set_user);

/*
 * Drop one reference on @pde. When it was the last one, release the
 * entry's inode number and free the entry.
 */
void pde_put(struct proc_dir_entry *pde)
{
        if (!refcount_dec_and_test(&pde->refcnt))
                return;

        proc_free_inum(pde->low_ino);
        pde_free(pde);
}

/*
 * Remove a /proc entry and free it if it's not currently in use.
 */
void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
{
        struct proc_dir_entry *de = NULL;
        const char *fn = name;
        unsigned int len;

        /* Resolve the path and unlink the entry under the write lock. */
        write_lock(&proc_subdir_lock);
        if (__xlate_proc_name(name, &parent, &fn) != 0) {
                write_unlock(&proc_subdir_lock);
                return;
        }
        len = strlen(fn);

        de = pde_subdir_find(parent, fn, len);
        if (de) {
                if (unlikely(pde_is_permanent(de))) {
                        WARN(1, "removing permanent /proc entry '%s'", de->name);
                        /*
                         * Leave the entry in place; clearing de also sends
                         * this case through the "not found" WARN below.
                         */
                        de = NULL;
                } else {
                        rb_erase(&de->subdir_node, &parent->subdir);
                        /* Only directories are counted in the parent's nlink. */
                        if (S_ISDIR(de->mode))
                                parent->nlink--;
                }
        }
        write_unlock(&proc_subdir_lock);
        if (!de) {
                WARN(1, "name '%s'\n", name);
                return;
        }

        /* NOTE(review): presumably waits out current users -- see its definition. */
        proc_entry_rundown(de);

        /* Children of a removed directory are not cleaned up here. */
        WARN(pde_subdir_first(de),
             "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n",
             __func__, de->parent->name, de->name, pde_subdir_first(de)->name);
        pde_put(de);
}
EXPORT_SYMBOL(remove_proc_entry);

/*
 * Remove the entry @name (resolved from @parent) together with everything
 * below it.
 *
 * Returns 0 on success, -ENOENT when the path or the entry cannot be found,
 * and -EINVAL when a permanent entry is encountered.
 */
int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
{
        struct proc_dir_entry *root = NULL, *de, *next;
        const char *fn = name;
        unsigned int len;

        write_lock(&proc_subdir_lock);
        if (__xlate_proc_name(name, &parent, &fn) != 0) {
                write_unlock(&proc_subdir_lock);
                return -ENOENT;
        }
        len = strlen(fn);

        root = pde_subdir_find(parent, fn, len);
        if (!root) {
                write_unlock(&proc_subdir_lock);
                return -ENOENT;
        }
        if (unlikely(pde_is_permanent(root))) {
                write_unlock(&proc_subdir_lock);
                WARN(1, "removing permanent /proc entry '%s/%s'",
                        root->parent->name, root->name);
                return -EINVAL;
        }
        /* Detach the subtree root from its parent first. */
        rb_erase(&root->subdir_node, &parent->subdir);

        de = root;
        while (1) {
                /* Descend: unlink the first child and continue from it. */
                next = pde_subdir_first(de);
                if (next) {
                        /*
                         * NOTE(review): on this error path root (and any
                         * entries unlinked so far) have already been erased
                         * from the tree -- confirm this is intended.
                         */
                        if (unlikely(pde_is_permanent(next))) {
                                write_unlock(&proc_subdir_lock);
                                WARN(1, "removing permanent /proc entry '%s/%s'",
                                        next->parent->name, next->name);
                                return -EINVAL;
                        }
                        rb_erase(&next->subdir_node, &de->subdir);
                        de = next;
                        continue;
                }
                /* de has no children left: account for it in its parent. */
                next = de->parent;
                if (S_ISDIR(de->mode))
                        next->nlink--;
                write_unlock(&proc_subdir_lock);

                /* Rundown and the final put happen without the lock held. */
                proc_entry_rundown(de);
                if (de == root)
                        break;
                pde_put(de);

                /* Climb back to the parent and look for more children. */
                write_lock(&proc_subdir_lock);
                de = next;
        }
        pde_put(root);
        return 0;
}
EXPORT_SYMBOL(remove_proc_subtree);

void *proc_get_parent_data(const struct inode *inode)
{
        struct proc_dir_entry *de = PDE(inode);
        return de->parent->data;
}
EXPORT_SYMBOL_GPL(proc_get_parent_data);

/* Remove @de together with its subtree; a NULL @de is ignored. */
void proc_remove(struct proc_dir_entry *de)
{
        if (!de)
                return;

        remove_proc_subtree(de->name, de->parent);
}
EXPORT_SYMBOL(proc_remove);

/*
 * Bring the user's buffer into kernel memory with memdup_user_nul() and
 * forward it to the entry's ->write() handler, when one is supplied.  The
 * handler is allowed to modify the kernel-side copy.
 *
 * Returns @size when the handler returns 0, the handler's return value
 * otherwise, -EACCES without a handler and -EINVAL for an unacceptable size.
 */
ssize_t proc_simple_write(struct file *f, const char __user *ubuf, size_t size,
                          loff_t *_pos)
{
        struct proc_dir_entry *entry = PDE(file_inode(f));
        char *kbuf;
        int rc;

        if (!entry->write)
                return -EACCES;

        /* Only 1 .. PAGE_SIZE - 1 bytes are accepted. */
        if (!size || size >= PAGE_SIZE)
                return -EINVAL;

        kbuf = memdup_user_nul(ubuf, size);
        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        rc = entry->write(f, kbuf, size);
        kfree(kbuf);
        if (rc)
                return rc;

        return size;
}































































































    1 
    1 
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
// SPDX-License-Identifier: GPL-2.0
/*
 *        linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"
#include "swap.h"

/* Private state passed through walk->private to the cold/pageout walk callback. */
struct madvise_walk_private {
        /* TLB gather the callback batches its TLB invalidations into */
        struct mmu_gather *tlb;
        /* true: isolate folios and reclaim them; false: only deactivate them */
        bool pageout;
};

/*
 * A behaviour that ends up changing vma->vm_flags has to hold mmap_lock for
 * writing; behaviours that merely traverse vmas only need it for reading.
 *
 * Returns 1 when the write lock is required and 0 when the read lock is
 * enough.
 */
static int madvise_need_mmap_write(int behavior)
{
        int need_write;

        switch (behavior) {
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
        case MADV_DONTNEED_LOCKED:
        case MADV_COLD:
        case MADV_PAGEOUT:
        case MADV_FREE:
        case MADV_POPULATE_READ:
        case MADV_POPULATE_WRITE:
        case MADV_COLLAPSE:
                need_write = 0;
                break;
        default:
                /* Stay safe: anything not explicitly listed takes the write lock. */
                need_write = 1;
                break;
        }

        return need_write;
}

#ifdef CONFIG_ANON_VMA_NAME
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
        struct anon_vma_name *anon_name;
        size_t count;

        /* Add 1 for NUL terminator at the end of the anon_name->name */
        count = strlen(name) + 1;
        anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
        if (anon_name) {
                kref_init(&anon_name->kref);
                memcpy(anon_name->name, name, count);
        }

        return anon_name;
}

/* Free the anon_vma_name that embeds @kref. */
void anon_vma_name_free(struct kref *kref)
{
        kfree(container_of(kref, struct anon_vma_name, kref));
}

/* Return @vma's anon name after asserting that its mm's mmap_lock is held. */
struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
        struct mm_struct *mm = vma->vm_mm;

        mmap_assert_locked(mm);

        return vma->anon_name;
}

/*
 * Install @anon_name on @vma, or clear the vma's name when @anon_name is
 * NULL, and put the previously installed name.  Nothing is touched when the
 * new name equals the current one.  mmap_lock should be write-locked.
 * Always returns 0.
 */
static int replace_anon_vma_name(struct vm_area_struct *vma,
                                 struct anon_vma_name *anon_name)
{
        struct anon_vma_name *old_name = anon_vma_name(vma);

        if (anon_name) {
                if (anon_vma_name_eq(old_name, anon_name))
                        return 0;
                vma->anon_name = anon_vma_name_reuse(anon_name);
        } else {
                vma->anon_name = NULL;
        }
        anon_vma_name_put(old_name);

        return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
/* Stub: only a NULL @anon_name is accepted, anything else is -EINVAL. */
static int replace_anon_vma_name(struct vm_area_struct *vma,
                                 struct anon_vma_name *anon_name)
{
        return anon_name ? -EINVAL : 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * Apply @new_flags and @anon_name to the @start..@end region of @vma,
 * splitting or merging vmas as necessary.  mmap_lock must be held for
 * writing.  The caller has to keep @anon_name stable by holding a reference
 * on it, even when it belongs to a valid vma, because that vma might be
 * freed here.
 *
 * On success *@prev is set to the resulting vma and 0 is returned; otherwise
 * a negative error code is returned.
 */
static int madvise_update_vma(struct vm_area_struct *vma,
                              struct vm_area_struct **prev, unsigned long start,
                              unsigned long end, unsigned long new_flags,
                              struct anon_vma_name *anon_name)
{
        struct mm_struct *mm = vma->vm_mm;
        VMA_ITERATOR(vmi, mm, start);

        /* Neither flags nor name change: nothing to modify. */
        if (vma->vm_flags == new_flags &&
            anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
                *prev = vma;
                return 0;
        }

        vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
                                    anon_name);
        if (IS_ERR(vma))
                return PTR_ERR(vma);

        *prev = vma;

        /* vm_flags is protected by the mmap_lock held in write mode. */
        vma_start_write(vma);
        vm_flags_reset(vma, new_flags);

        /* The name is only replaced on vmas without a file or on anon shmem. */
        if (vma->vm_file && !vma_is_anon_shmem(vma))
                return 0;

        return replace_anon_vma_name(vma, anon_name);
}

#ifdef CONFIG_SWAP
/*
 * pmd_entry callback of swapin_walk_ops.  Steps through [start, end) one page
 * at a time and, for every PTE holding a real swap entry, calls
 * read_swap_cache_async() on it.  walk->private carries the vma.
 * Always returns 0.
 */
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
                unsigned long end, struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->private;
        struct swap_iocb *splug = NULL;
        pte_t *ptep = NULL;
        spinlock_t *ptl;
        unsigned long addr;

        for (addr = start; addr < end; addr += PAGE_SIZE) {
                pte_t pte;
                swp_entry_t entry;
                struct folio *folio;

                /*
                 * ptep is NULL on the first pass and after each unlock below:
                 * then the PTE for addr is (re)mapped and locked.  Otherwise
                 * the post-increment simply advances to the next PTE.
                 */
                if (!ptep++) {
                        ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
                        if (!ptep)
                                break;
                }

                pte = ptep_get(ptep);
                if (!is_swap_pte(pte))
                        continue;
                entry = pte_to_swp_entry(pte);
                if (unlikely(non_swap_entry(entry)))
                        continue;

                /*
                 * Release the PTE lock before the swap read; resetting ptep
                 * forces a fresh map + lock on the next iteration.
                 */
                pte_unmap_unlock(ptep, ptl);
                ptep = NULL;

                folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
                                             vma, addr, &splug);
                if (folio)
                        folio_put(folio);
        }

        /* Still mapped and locked only if the last PTE was not read in. */
        if (ptep)
                pte_unmap_unlock(ptep, ptl);
        swap_read_unplug(splug);
        cond_resched();

        return 0;
}

/* Page walk operations for the swap-in walk: a pmd_entry callback under PGWALK_RDLOCK. */
static const struct mm_walk_ops swapin_walk_ops = {
        .pmd_entry                = swapin_walk_pmd_entry,
        .walk_lock                = PGWALK_RDLOCK,
};

/*
 * Iterate @mapping's i_pages over the page indices that @start..@end of @vma
 * correspond to and call read_swap_cache_async() for every value entry that
 * decodes to a real swap entry.
 */
static void shmem_swapin_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct address_space *mapping)
{
        XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
        /* Last index to visit: one before the index that @end maps to. */
        pgoff_t end_index = linear_page_index(vma, end) - 1;
        struct folio *folio;
        struct swap_iocb *splug = NULL;

        rcu_read_lock();
        xas_for_each(&xas, folio, end_index) {
                unsigned long addr;
                swp_entry_t entry;

                /* Only value entries are of interest here. */
                if (!xa_is_value(folio))
                        continue;
                entry = radix_to_swp_entry(folio);
                /* There might be swapin error entries in shmem mapping. */
                if (non_swap_entry(entry))
                        continue;

                /* Translate the current index back into an address inside the vma. */
                addr = vma->vm_start +
                        ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
                /* Pause the iteration and leave the RCU read section for the read. */
                xas_pause(&xas);
                rcu_read_unlock();

                folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
                                             vma, addr, &splug);
                if (folio)
                        folio_put(folio);

                rcu_read_lock();
        }
        rcu_read_unlock();
        swap_read_unplug(splug);
}
#endif                /* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
{
        struct mm_struct *mm = vma->vm_mm;
        struct file *file = vma->vm_file;
        loff_t offset;

        *prev = vma;
#ifdef CONFIG_SWAP
        if (!file) {
                walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
                lru_add_drain(); /* Push any new pages onto the LRU now */
                return 0;
        }

        if (shmem_mapping(file->f_mapping)) {
                shmem_swapin_range(vma, start, end, file->f_mapping);
                lru_add_drain(); /* Push any new pages onto the LRU now */
                return 0;
        }
#else
        if (!file)
                return -EBADF;
#endif

        if (IS_DAX(file_inode(file))) {
                /* no bad return value, but ignore advice */
                return 0;
        }

        /*
         * Filesystem's fadvise may need to take various locks.  We need to
         * explicitly grab a reference because the vma (and hence the
         * vma's reference to the file) can go away as soon as we drop
         * mmap_lock.
         */
        *prev = NULL;        /* tell sys_madvise we drop mmap_lock */
        get_file(file);
        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
        mmap_read_unlock(mm);
        vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
        fput(file);
        mmap_read_lock(mm);
        return 0;
}

static inline bool can_do_file_pageout(struct vm_area_struct *vma)
{
        if (!vma->vm_file)
                return false;
        /*
         * paging out pagecache only for non-anonymous mappings that correspond
         * to the files the calling process could (if tried) open for writing;
         * otherwise we'd be including shared non-exclusive mappings, which
         * opens a side channel.
         */
        return inode_owner_or_capable(&nop_mnt_idmap,
                                      file_inode(vma->vm_file)) ||
               file_permission(vma->vm_file, MAY_WRITE) == 0;
}

/*
 * Thin wrapper around folio_pte_batch(): the batch is limited to the pages
 * left between @addr and @end, and dirty / soft-dirty differences between
 * PTEs are ignored.  Returns whatever folio_pte_batch() returns.
 */
static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
                                          struct folio *folio, pte_t *ptep,
                                          pte_t pte, bool *any_young,
                                          bool *any_dirty)
{
        int max_nr = (end - addr) / PAGE_SIZE;

        return folio_pte_batch(folio, addr, ptep, pte, max_nr,
                               FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY, NULL,
                               any_young, any_dirty);
}

/*
 * pmd_entry callback of cold_walk_ops, shared by the cold and pageout walks.
 *
 * walk->private is a struct madvise_walk_private.  For each eligible folio
 * mapped in [addr, end) the referenced/young state of the folio is cleared;
 * then, when ->pageout is set, the folio is isolated from the LRU and handed
 * to reclaim_pages(), otherwise it is only deactivated.
 *
 * Returns -EINTR when a fatal signal is pending for current, 0 otherwise.
 */
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct mm_walk *walk)
{
        struct madvise_walk_private *private = walk->private;
        struct mmu_gather *tlb = private->tlb;
        bool pageout = private->pageout;
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
        pte_t *start_pte, *pte, ptent;
        spinlock_t *ptl;
        struct folio *folio = NULL;
        LIST_HEAD(folio_list);
        bool pageout_anon_only_filter;
        unsigned int batch_count = 0;
        int nr;

        if (fatal_signal_pending(current))
                return -EINTR;

        /*
         * Pageout on a non-anonymous vma whose file the caller may not page
         * out: only anon folios are considered below.
         */
        pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
                                        !can_do_file_pageout(vma);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (pmd_trans_huge(*pmd)) {
                pmd_t orig_pmd;
                unsigned long next = pmd_addr_end(addr, end);

                tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
                ptl = pmd_trans_huge_lock(pmd, vma);
                if (!ptl)
                        return 0;

                orig_pmd = *pmd;
                if (is_huge_zero_pmd(orig_pmd))
                        goto huge_unlock;

                if (unlikely(!pmd_present(orig_pmd))) {
                        VM_BUG_ON(thp_migration_supported() &&
                                        !is_pmd_migration_entry(orig_pmd));
                        goto huge_unlock;
                }

                folio = pmd_folio(orig_pmd);

                /* Do not interfere with other mappings of this folio */
                if (folio_likely_mapped_shared(folio))
                        goto huge_unlock;

                if (pageout_anon_only_filter && !folio_test_anon(folio))
                        goto huge_unlock;

                /*
                 * The range covers only part of the huge pmd: try to split
                 * the folio and, on success, continue on the PTE path below.
                 */
                if (next - addr != HPAGE_PMD_SIZE) {
                        int err;

                        folio_get(folio);
                        spin_unlock(ptl);
                        folio_lock(folio);
                        err = split_folio(folio);
                        folio_unlock(folio);
                        folio_put(folio);
                        if (!err)
                                goto regular_folio;
                        return 0;
                }

                /* The pmd young bit is only cleared for the cold (non-pageout) case. */
                if (!pageout && pmd_young(orig_pmd)) {
                        pmdp_invalidate(vma, addr, pmd);
                        orig_pmd = pmd_mkold(orig_pmd);

                        set_pmd_at(mm, addr, pmd, orig_pmd);
                        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
                }

                folio_clear_referenced(folio);
                folio_test_clear_young(folio);
                if (folio_test_active(folio))
                        folio_set_workingset(folio);
                if (pageout) {
                        if (folio_isolate_lru(folio)) {
                                if (folio_test_unevictable(folio))
                                        folio_putback_lru(folio);
                                else
                                        list_add(&folio->lru, &folio_list);
                        }
                } else
                        folio_deactivate(folio);
huge_unlock:
                spin_unlock(ptl);
                if (pageout)
                        reclaim_pages(&folio_list);
                return 0;
        }

regular_folio:
#endif
        tlb_change_page_size(tlb, PAGE_SIZE);
restart:
        start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (!start_pte)
                return 0;
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
                nr = 1;
                ptent = ptep_get(pte);

                /*
                 * Every SWAP_CLUSTER_MAX PTEs: if a reschedule is due, drop
                 * the PTE lock, reschedule and resume the walk at addr.
                 */
                if (++batch_count == SWAP_CLUSTER_MAX) {
                        batch_count = 0;
                        if (need_resched()) {
                                arch_leave_lazy_mmu_mode();
                                pte_unmap_unlock(start_pte, ptl);
                                cond_resched();
                                goto restart;
                        }
                }

                if (pte_none(ptent))
                        continue;

                if (!pte_present(ptent))
                        continue;

                folio = vm_normal_folio(vma, addr, ptent);
                if (!folio || folio_is_zone_device(folio))
                        continue;

                /*
                 * If we encounter a large folio, only split it if it is not
                 * fully mapped within the range we are operating on. Otherwise
                 * leave it as is so that it can be swapped out whole. If we
                 * fail to split a folio, leave it in place and advance to the
                 * next pte in the range.
                 */
                if (folio_test_large(folio)) {
                        bool any_young;

                        nr = madvise_folio_pte_batch(addr, end, folio, pte,
                                                     ptent, &any_young, NULL);
                        if (any_young)
                                ptent = pte_mkyoung(ptent);

                        if (nr < folio_nr_pages(folio)) {
                                int err;

                                if (folio_likely_mapped_shared(folio))
                                        continue;
                                if (pageout_anon_only_filter && !folio_test_anon(folio))
                                        continue;
                                if (!folio_trylock(folio))
                                        continue;
                                folio_get(folio);
                                /* The split is attempted with the PTE lock dropped. */
                                arch_leave_lazy_mmu_mode();
                                pte_unmap_unlock(start_pte, ptl);
                                start_pte = NULL;
                                err = split_folio(folio);
                                folio_unlock(folio);
                                folio_put(folio);
                                start_pte = pte =
                                        pte_offset_map_lock(mm, pmd, addr, &ptl);
                                if (!start_pte)
                                        break;
                                arch_enter_lazy_mmu_mode();
                                /*
                                 * Split succeeded: nr = 0 keeps pte and addr
                                 * in place so the same PTE is looked at again.
                                 */
                                if (!err)
                                        nr = 0;
                                continue;
                        }
                }

                /*
                 * Do not interfere with other mappings of this folio and
                 * non-LRU folio. If we have a large folio at this point, we
                 * know it is fully mapped so if its mapcount is the same as its
                 * number of pages, it must be exclusive.
                 */
                if (!folio_test_lru(folio) ||
                    folio_mapcount(folio) != folio_nr_pages(folio))
                        continue;

                if (pageout_anon_only_filter && !folio_test_anon(folio))
                        continue;

                /* The PTE young bits are only cleared for the cold (non-pageout) case. */
                if (!pageout && pte_young(ptent)) {
                        clear_young_dirty_ptes(vma, addr, pte, nr,
                                               CYDP_CLEAR_YOUNG);
                        tlb_remove_tlb_entries(tlb, pte, nr, addr);
                }

                /*
                 * We are deactivating the folio to speed up its reclaim.
                 * The VM could not reclaim the folio unless PG_young is
                 * cleared.  As a side effect this confuses idle-page
                 * tracking, which will miss the recent reference history.
                 */
                folio_clear_referenced(folio);
                folio_test_clear_young(folio);
                if (folio_test_active(folio))
                        folio_set_workingset(folio);
                if (pageout) {
                        if (folio_isolate_lru(folio)) {
                                if (folio_test_unevictable(folio))
                                        folio_putback_lru(folio);
                                else
                                        list_add(&folio->lru, &folio_list);
                        }
                } else
                        folio_deactivate(folio);
        }

        /* start_pte is NULL when re-mapping after a split attempt failed. */
        if (start_pte) {
                arch_leave_lazy_mmu_mode();
                pte_unmap_unlock(start_pte, ptl);
        }
        if (pageout)
                reclaim_pages(&folio_list);
        cond_resched();

        return 0;
}

/* Page walk operations used by both the cold and the pageout page-range walks. */
static const struct mm_walk_ops cold_walk_ops = {
        .pmd_entry = madvise_cold_or_pageout_pte_range,
        .walk_lock = PGWALK_RDLOCK,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end)
{
        struct madvise_walk_private walk_private = {
                .pageout = false,
                .tlb = tlb,
        };

        tlb_start_vma(tlb, vma);
        walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
        tlb_end_vma(tlb, vma);
}

/* True unless @vma has VM_LOCKED, VM_PFNMAP or VM_HUGETLB set. */
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
        const unsigned long excluded = VM_LOCKED | VM_PFNMAP | VM_HUGETLB;

        return (vma->vm_flags & excluded) == 0;
}

/*
 * Apply the cold walk to @start_addr..@end_addr of @vma.  *@prev is set to
 * @vma.  Returns -EINVAL when can_madv_lru_vma() rejects the vma, else 0.
 */
static long madvise_cold(struct vm_area_struct *vma,
                        struct vm_area_struct **prev,
                        unsigned long start_addr, unsigned long end_addr)
{
        struct mmu_gather tlb;

        *prev = vma;
        if (can_madv_lru_vma(vma)) {
                lru_add_drain();
                tlb_gather_mmu(&tlb, vma->vm_mm);
                madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
                tlb_finish_mmu(&tlb);
                return 0;
        }

        return -EINVAL;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end)
{
        struct madvise_walk_private walk_private = {
                .pageout = true,
                .tlb = tlb,
        };

        tlb_start_vma(tlb, vma);
        walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
        tlb_end_vma(tlb, vma);
}

/*
 * Apply the pageout walk to @start_addr..@end_addr of @vma.  *@prev is set
 * to @vma.  Returns -EINVAL when can_madv_lru_vma() rejects the vma, else 0.
 */
static long madvise_pageout(struct vm_area_struct *vma,
                        struct vm_area_struct **prev,
                        unsigned long start_addr, unsigned long end_addr)
{
        struct mmu_gather tlb;

        *prev = vma;
        if (!can_madv_lru_vma(vma))
                return -EINVAL;

        /*
         * A private file mapping can hold private dirty pages that may be
         * paged out even when this process neither owns the file nor can
         * write to it, so private file mappings still go through the walk to
         * page out their dirty anon pages.  Only a shareable (VM_MAYSHARE)
         * file mapping the caller may not page out is skipped entirely.
         */
        if (!vma_is_anonymous(vma) && !can_do_file_pageout(vma) &&
            (vma->vm_flags & VM_MAYSHARE))
                return 0;

        lru_add_drain();
        tlb_gather_mmu(&tlb, vma->vm_mm);
        madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
        tlb_finish_mmu(&tlb);

        return 0;
}

/*
 * pmd_entry callback of madvise_free_walk_ops (MADV_FREE).
 *
 * Processes the PTEs covering [addr, end) below @pmd.  walk->private is the
 * caller's struct mmu_gather.  For each entry:
 *  - real swap entries are freed in batches and their PTEs cleared;
 *  - hwpoison / poisoned marker entries are cleared;
 *  - present folios get their young/dirty PTE bits cleared and are passed
 *    to folio_mark_lazyfree(); folios that are dirty or in the swap cache
 *    are only processed if they can be trylocked, their mapcount equals
 *    their page count, and any swap cache copy can be dropped.
 * Large folios that are not fully covered by the batch are split first
 * (best effort).
 *
 * Always returns 0, so the page walk is never aborted from here.
 */
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)

{
        const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
        struct mmu_gather *tlb = walk->private;
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
        spinlock_t *ptl;
        pte_t *start_pte, *pte, ptent;
        struct folio *folio;
        int nr_swap = 0;
        unsigned long next;
        int nr, max_nr;

        /*
         * A huge PMD is offered to madvise_free_huge_pmd() first; a non-zero
         * return ends processing of this PMD, otherwise fall through to the
         * PTE-level loop.
         */
        next = pmd_addr_end(addr, end);
        if (pmd_trans_huge(*pmd))
                if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
                        return 0;

        tlb_change_page_size(tlb, PAGE_SIZE);
        start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        /* No PTE table could be mapped and locked: nothing to do here. */
        if (!start_pte)
                return 0;
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        /* nr is the number of PTEs consumed by the current iteration. */
        for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
                nr = 1;
                ptent = ptep_get(pte);

                if (pte_none(ptent))
                        continue;
                /*
                 * If the pte has swp_entry, just clear page table to
                 * prevent swap-in which is more expensive rather than
                 * (page allocation + zeroing).
                 */
                if (!pte_present(ptent)) {
                        swp_entry_t entry;

                        entry = pte_to_swp_entry(ptent);
                        if (!non_swap_entry(entry)) {
                                max_nr = (end - addr) / PAGE_SIZE;
                                nr = swap_pte_batch(pte, max_nr, ptent);
                                /*
                                 * nr_swap goes negative: it is the delta
                                 * applied to MM_SWAPENTS after the loop.
                                 */
                                nr_swap -= nr;
                                free_swap_and_cache_nr(entry, nr);
                                clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
                        } else if (is_hwpoison_entry(entry) ||
                                   is_poisoned_swp_entry(entry)) {
                                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        }
                        continue;
                }

                folio = vm_normal_folio(vma, addr, ptent);
                if (!folio || folio_is_zone_device(folio))
                        continue;

                /*
                 * If we encounter a large folio, only split it if it is not
                 * fully mapped within the range we are operating on. Otherwise
                 * leave it as is so that it can be marked as lazyfree. If we
                 * fail to split a folio, leave it in place and advance to the
                 * next pte in the range.
                 */
                if (folio_test_large(folio)) {
                        bool any_young, any_dirty;

                        nr = madvise_folio_pte_batch(addr, end, folio, pte,
                                                     ptent, &any_young, &any_dirty);

                        if (nr < folio_nr_pages(folio)) {
                                int err;

                                if (folio_likely_mapped_shared(folio))
                                        continue;
                                if (!folio_trylock(folio))
                                        continue;
                                /*
                                 * Hold a reference and the folio lock across
                                 * the split; the PTE lock and lazy MMU mode
                                 * are dropped first and re-taken afterwards.
                                 */
                                folio_get(folio);
                                arch_leave_lazy_mmu_mode();
                                pte_unmap_unlock(start_pte, ptl);
                                start_pte = NULL;
                                err = split_folio(folio);
                                folio_unlock(folio);
                                folio_put(folio);
                                pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
                                start_pte = pte;
                                /*
                                 * Re-mapping failed: leave the loop with
                                 * start_pte == NULL so the cleanup below
                                 * skips the unlock.
                                 */
                                if (!start_pte)
                                        break;
                                arch_enter_lazy_mmu_mode();
                                /*
                                 * On a successful split, nr = 0 makes the
                                 * loop revisit the same PTE; on failure the
                                 * batch of nr PTEs is skipped.
                                 */
                                if (!err)
                                        nr = 0;
                                continue;
                        }

                        /* Fold the batch-wide young/dirty state into ptent. */
                        if (any_young)
                                ptent = pte_mkyoung(ptent);
                        if (any_dirty)
                                ptent = pte_mkdirty(ptent);
                }

                if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
                        if (!folio_trylock(folio))
                                continue;
                        /*
                         * If we have a large folio at this point, we know it is
                         * fully mapped so if its mapcount is the same as its
                         * number of pages, it must be exclusive.
                         */
                        if (folio_mapcount(folio) != folio_nr_pages(folio)) {
                                folio_unlock(folio);
                                continue;
                        }

                        if (folio_test_swapcache(folio) &&
                            !folio_free_swap(folio)) {
                                folio_unlock(folio);
                                continue;
                        }

                        folio_clear_dirty(folio);
                        folio_unlock(folio);
                }

                /*
                 * Only touch the PTEs (and queue a TLB invalidation) when a
                 * young or dirty bit is actually set.
                 */
                if (pte_young(ptent) || pte_dirty(ptent)) {
                        clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
                        tlb_remove_tlb_entries(tlb, pte, nr, addr);
                }
                folio_mark_lazyfree(folio);
        }

        if (nr_swap)
                add_mm_counter(mm, MM_SWAPENTS, nr_swap);
        if (start_pte) {
                arch_leave_lazy_mmu_mode();
                pte_unmap_unlock(start_pte, ptl);
        }
        cond_resched();

        return 0;
}

/*
 * Page-walk operations used for MADV_FREE: each PMD is handed to
 * madvise_free_pte_range(), and the walk uses the PGWALK_RDLOCK lock mode.
 */
static const struct mm_walk_ops madvise_free_walk_ops = {
        .walk_lock = PGWALK_RDLOCK,
        .pmd_entry = madvise_free_pte_range,
};

/*
 * Apply MADV_FREE to the part of [start_addr, end_addr) that lies inside
 * @vma.
 *
 * Returns 0 on success, or -EINVAL if the VMA is not anonymous or the
 * requested range does not intersect it.
 */
static int madvise_free_single_vma(struct vm_area_struct *vma,
                        unsigned long start_addr, unsigned long end_addr)
{
        struct mmu_notifier_range range;
        struct mmu_gather tlb;
        struct mm_struct *mm = vma->vm_mm;

        /* MADV_FREE currently only supports anonymous VMAs. */
        if (!vma_is_anonymous(vma))
                return -EINVAL;

        /* Clamp the request to the VMA and reject an empty intersection. */
        range.start = max(vma->vm_start, start_addr);
        range.end = min(vma->vm_end, end_addr);
        if (range.start >= vma->vm_end || range.end <= vma->vm_start)
                return -EINVAL;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                range.start, range.end);

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm);
        update_hiwater_rss(mm);

        /* The page walk runs between notifier start/end and VMA start/end. */
        mmu_notifier_invalidate_range_start(&range);
        tlb_start_vma(&tlb, vma);
        walk_page_range(mm, range.start, range.end,
                        &madvise_free_walk_ops, &tlb);
        tlb_end_vma(&tlb, vma);
        mmu_notifier_invalidate_range_end(&range);

        tlb_finish_mmu(&tlb);

        return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range_single call sets things up for shrink_active_list to actually
 * free these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
                                        unsigned long start, unsigned long end)
{
        /* zap_page_range_single() takes a length, not an end address. */
        unsigned long len = end - start;

        zap_page_range_single(vma, start, len, NULL);

        /* This path never reports failure. */
        return 0;
}

/*
 * Decide whether @vma may be the target of MADV_DONTNEED,
 * MADV_DONTNEED_LOCKED or MADV_FREE (@behavior).
 *
 * For hugetlb VMAs *end may be rounded down to a huge page boundary as a
 * side effect.  Returns true when the operation is allowed.
 */
static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
                                            unsigned long start,
                                            unsigned long *end,
                                            int behavior)
{
        unsigned int forbidden;

        if (is_vm_hugetlb_page(vma)) {
                /* Only the two DONTNEED flavours are accepted on hugetlb. */
                if (behavior != MADV_DONTNEED &&
                    behavior != MADV_DONTNEED_LOCKED)
                        return false;

                /* The start address must sit on a huge page boundary. */
                if (start & ~huge_page_mask(hstate_vma(vma)))
                        return false;

                /*
                 * Callers expect the length to be rounded up to PAGE_SIZE
                 * and may not know the VMA is backed by huge pages.  Round
                 * the end down so that no partially covered huge page is
                 * freed, which would be unexpected data loss.
                 */
                *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));
                return true;
        }

        /*
         * Ordinary VMAs: VM_PFNMAP is always refused; VM_LOCKED is refused
         * unless the caller explicitly asked for MADV_DONTNEED_LOCKED.
         */
        forbidden = (behavior == MADV_DONTNEED_LOCKED) ?
                        VM_PFNMAP : (VM_PFNMAP | VM_LOCKED);

        return (vma->vm_flags & forbidden) == 0;
}

/*
 * Common entry point for MADV_DONTNEED, MADV_DONTNEED_LOCKED and MADV_FREE
 * on one VMA.
 *
 * *prev is set to @vma, or to NULL when userfaultfd_remove() reported that
 * the mmap_lock was dropped (in which case it is re-taken for reading and
 * the VMA is looked up again).  Returns 0 on success, -EINVAL for an
 * unsuitable VMA or unknown behavior, -ENOMEM if no VMA covers @start
 * after re-locking.
 */
static long madvise_dontneed_free(struct vm_area_struct *vma,
                                  struct vm_area_struct **prev,
                                  unsigned long start, unsigned long end,
                                  int behavior)
{
        struct mm_struct *mm = vma->vm_mm;

        *prev = vma;
        if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
                return -EINVAL;

        /* The validity check may have shrunk the range to nothing. */
        if (start == end)
                return 0;

        if (!userfaultfd_remove(vma, start, end)) {
                /* mmap_lock has been dropped, so prev is stale. */
                *prev = NULL;

                mmap_read_lock(mm);
                vma = vma_lookup(mm, start);
                if (!vma)
                        return -ENOMEM;

                /*
                 * Re-validate the VMA found after re-locking.  A possible
                 * hugetlb adjustment of end is fine because end is clamped
                 * to the VMA right below.
                 */
                if (!madvise_dontneed_free_valid_vma(vma, start, &end,
                                                     behavior))
                        return -EINVAL;

                /*
                 * Do not fail when end lies past this VMA: the original
                 * VMA may have been split while the lock was released, and
                 * a concurrent operation must not make madvise() undefined.
                 * An adjacent VMA, if any, is visited next.  The resulting
                 * repeated UFFD_EVENT_REMOVE for the tail range is harmless
                 * to the userfaultfd manager.
                 */
                if (end > vma->vm_end)
                        end = vma->vm_end;

                VM_WARN_ON(start >= end);
        }

        switch (behavior) {
        case MADV_DONTNEED:
        case MADV_DONTNEED_LOCKED:
                return madvise_dontneed_single_vma(vma, start, end);
        case MADV_FREE:
                return madvise_free_single_vma(vma, start, end);
        default:
                return -EINVAL;
        }
}

/*
 * MADV_POPULATE_READ / MADV_POPULATE_WRITE: fault in [start, end) of @mm,
 * for writing when @behavior is MADV_POPULATE_WRITE.
 *
 * Returns 0 once the whole range has been faulted in, or a negative errno.
 * Error codes other than the ones listed below are logged once and
 * reported as -ENOMEM.
 */
static long madvise_populate(struct mm_struct *mm, unsigned long start,
                unsigned long end, int behavior)
{
        const bool write = behavior == MADV_POPULATE_WRITE;
        int locked = 1;
        long nr_pages;

        while (start < end) {
                /* Populate (prefault) page tables readable/writable. */
                nr_pages = faultin_page_range(mm, start, end, write, &locked);

                /* The helper may hand back with the lock dropped; retake it. */
                if (!locked) {
                        mmap_read_lock(mm);
                        locked = 1;
                }

                if (nr_pages >= 0) {
                        start += nr_pages * PAGE_SIZE;
                        continue;
                }

                switch (nr_pages) {
                case -EINTR:
                case -EINVAL:   /* Incompatible mappings / permissions. */
                case -EHWPOISON:
                case -EFAULT:   /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
                case -ENOMEM:   /* No VMA or out of memory. */
                        return nr_pages;
                default:
                        pr_warn_once("%s: unhandled return value: %ld\n",
                                     __func__, nr_pages);
                        return -ENOMEM;
                }
        }

        return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
                                struct vm_area_struct **prev,
                                unsigned long start, unsigned long end)
{
        loff_t offset;
        int error;
        struct file *f;
        struct mm_struct *mm = vma->vm_mm;

        *prev = NULL;        /* tell sys_madvise we drop mmap_lock */

        if (vma->vm_flags & VM_LOCKED)
                return -EINVAL;

        f = vma->vm_file;

        if (!f || !f->f_mapping || !f->f_mapping->host) {
                        return -EINVAL;
        }

        if (!vma_is_shared_maywrite(vma))
                return -EACCES;

        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

        /*
         * Filesystem's fallocate may need to take i_rwsem.  We need to
         * explicitly grab a reference because the vma (and hence the
         * vma's reference to the file) can go away as soon as we drop
         * mmap_lock.
         */
        get_file(f);
        if (userfaultfd_remove(vma, start, end)) {
                /* mmap_lock was not released by userfaultfd_remove() */
                mmap_read_unlock(mm);
        }
        error = vfs_fallocate(f,
                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                                offset, end - start);
        fput(f);
        mmap_read_lock(mm);
        return error;
}

/*
 * Apply an madvise behavior to a region of a vma.  madvise_update_vma
 * will handle splitting a vm area into separate areas, each area with its own
 * behavior.
 */
static int madvise_vma_behavior(struct vm_area_struct *vma,
                                struct vm_area_struct **prev,
                                unsigned long start, unsigned long end,
                                unsigned long behavior)
{
        int error;
        struct anon_vma_name *anon_name;
        unsigned long new_flags = vma->vm_flags;

        switch (behavior) {
        case MADV_REMOVE:
                return madvise_remove(vma, prev, start, end);
        case MADV_WILLNEED:
                return madvise_willneed(vma, prev, start, end);
        case MADV_COLD:
                return madvise_cold(vma, prev, start, end);
        case MADV_PAGEOUT:
                return madvise_pageout(vma, prev, start, end);
        case MADV_FREE:
        case MADV_DONTNEED:
        case MADV_DONTNEED_LOCKED:
                return madvise_dontneed_free(vma, prev, start, end, behavior);
        case MADV_NORMAL:
                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
                break;
        case MADV_SEQUENTIAL:
                new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
                break;
        case MADV_RANDOM:
                new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
                break;
        case MADV_DONTFORK:
                new_flags |= VM_DONTCOPY;
                break;
        case MADV_DOFORK:
                if (vma->vm_flags & VM_IO)
                        return -EINVAL;
                new_flags &= ~VM_DONTCOPY;
                break;
        case MADV_WIPEONFORK:
                /* MADV_WIPEONFORK is only supported on anonymous memory. */
                if (vma->vm_file || vma->vm_flags & VM_SHARED)
                        return -EINVAL;
                new_flags |= VM_WIPEONFORK;
                break;
        case MADV_KEEPONFORK:
                new_flags &= ~VM_WIPEONFORK;
                break;
        case MADV_DONTDUMP:
                new_flags |= VM_DONTDUMP;
                break;
        case MADV_DODUMP:
                if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
                        return -EINVAL;
                new_flags &= ~VM_DONTDUMP;
                break;
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
                error = ksm_madvise(vma, start, end, behavior, &new_flags);
                if (error)
                        goto out;
                break;
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
                error = hugepage_madvise(vma, &new_flags, behavior);
                if (error)
                        goto out;
                break;
        case MADV_COLLAPSE:
                return madvise_collapse(vma, prev, start, end);
        }

        anon_name = anon_vma_name(vma);
        anon_vma_name_get(anon_name);
        error = madvise_update_vma(vma, prev, start, end, new_flags,
                                   anon_name);
        anon_vma_name_put(anon_name);

out:
        /*
         * madvise() returns EAGAIN if kernel resources, such as
         * slab, are temporarily unavailable.
         */
        if (error == -ENOMEM)
                error = -EAGAIN;
        return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
                unsigned long start, unsigned long end)
{
        unsigned long step;

        /* Restricted to CAP_SYS_ADMIN. */
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        while (start < end) {
                struct page *page;
                unsigned long pfn;
                int ret;

                /* Pin the single page backing the current address. */
                ret = get_user_pages_fast(start, 1, 0, &page);
                if (ret != 1)
                        return ret;
                pfn = page_to_pfn(page);

                /*
                 * Compute the stride before acting on the page: soft
                 * offlining a hugepage migrates and then dissolves it, so
                 * on the next iteration "page" is no longer compound.
                 */
                step = page_size(compound_head(page));

                if (behavior != MADV_SOFT_OFFLINE) {
                        pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
                                 pfn, start);
                        ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED);
                        /* -EOPNOTSUPP from memory_failure() is not treated as an error. */
                        if (ret == -EOPNOTSUPP)
                                ret = 0;
                } else {
                        pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
                                 pfn, start);
                        ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
                }

                if (ret)
                        return ret;

                start += step;
        }

        return 0;
}
#endif

/*
 * Report whether @behavior is an madvise() advice value this build
 * accepts.  Advice values tied to optional features are only accepted
 * when the matching CONFIG_* option is enabled.
 */
static bool
madvise_behavior_valid(int behavior)
{
        bool known = false;

        switch (behavior) {
        /* Access pattern hints. */
        case MADV_NORMAL:
        case MADV_SEQUENTIAL:
        case MADV_RANDOM:
        case MADV_WILLNEED:
        /* Discarding, reclaiming and populating. */
        case MADV_DONTNEED:
        case MADV_DONTNEED_LOCKED:
        case MADV_FREE:
        case MADV_REMOVE:
        case MADV_COLD:
        case MADV_PAGEOUT:
        case MADV_POPULATE_READ:
        case MADV_POPULATE_WRITE:
        /* Fork and core dump handling. */
        case MADV_DOFORK:
        case MADV_DONTFORK:
        case MADV_WIPEONFORK:
        case MADV_KEEPONFORK:
        case MADV_DONTDUMP:
        case MADV_DODUMP:
#ifdef CONFIG_KSM
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
        case MADV_COLLAPSE:
#endif
#ifdef CONFIG_MEMORY_FAILURE
        case MADV_SOFT_OFFLINE:
        case MADV_HWPOISON:
#endif
                known = true;
                break;
        default:
                break;
        }

        return known;
}

/*
 * Report whether @behavior is one of the advice values accepted by this
 * check: MADV_COLD, MADV_PAGEOUT, MADV_WILLNEED or MADV_COLLAPSE.
 */
static bool process_madvise_behavior_valid(int behavior)
{
        return behavior == MADV_COLD ||
               behavior == MADV_PAGEOUT ||
               behavior == MADV_WILLNEED ||
               behavior == MADV_COLLAPSE;
}

/*
 * Walk the vmas in range [start,end), and call the visit function on each one.
 * The visit function will get start and end parameters that cover the overlap
 * between the current vma and the original range.  Any unmapped regions in the
 * original range will result in this function returning -ENOMEM while still
 * calling the visit function on all of the existing vmas in the range.
 * Must be called with the mmap_lock held for reading or writing.
 */
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
                      unsigned long end, unsigned long arg,
                      int (*visit)(struct vm_area_struct *vma,
                                   struct vm_area_struct **prev, unsigned long start,
                                   unsigned long end, unsigned long arg))
{
        struct vm_area_struct *vma;
        struct vm_area_struct *prev;
        unsigned long tmp;
        int unmapped_error = 0;

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -ENOMEM at the end.
         * - different from the way of handling in mlock etc.
         */
        vma = find_vma_prev(mm, start, &prev);
        if (vma && start > vma->vm_start)
                prev = vma;

        for (;;) {
                int error;

                /* Still start < end. */
                if (!vma)
                        return -ENOMEM;

                /* Here start < (end|vma->vm_end). */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;
                        if (start >= end)
                                break;
                }

                /* Here vma->vm_start <= start < (end|vma->vm_end) */
                tmp = vma->vm_end;
                if (end < tmp)
                        tmp = end;

                /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
                error = visit(vma, &prev, start, tmp, arg);
                if (error)
                        return error;
                start = tmp;
                if (prev && start < prev->vm_end)
                        start = prev->vm_end;
                if (start >= end)
                        break;
                if (prev)
                        vma = find_vma(mm, prev->vm_end);
                else        /* madvise_remove dropped mmap_lock */
                        vma = find_vma(mm, start);
        }

        return unmapped_error;
}

#ifdef CONFIG_ANON_VMA_NAME
/*
 * madvise_walk_vmas() visitor that sets the anonymous name of
 * [start, end) within @vma.  @anon_name carries a struct anon_vma_name
 * pointer cast to unsigned long.
 *
 * Returns 0 on success, -EBADF for file-backed VMAs that are not
 * anonymous shmem, -EAGAIN in place of -ENOMEM, or another error from
 * madvise_update_vma().
 */
static int madvise_vma_anon_name(struct vm_area_struct *vma,
                                 struct vm_area_struct **prev,
                                 unsigned long start, unsigned long end,
                                 unsigned long anon_name)
{
        struct anon_vma_name *name = (struct anon_vma_name *)anon_name;
        int ret;

        /* Only anonymous mappings can be named. */
        if (vma->vm_file) {
                if (!vma_is_anon_shmem(vma))
                        return -EBADF;
        }

        ret = madvise_update_vma(vma, prev, start, end, vma->vm_flags, name);

        /*
         * madvise() returns EAGAIN if kernel resources, such as
         * slab, are temporarily unavailable.
         */
        return (ret == -ENOMEM) ? -EAGAIN : ret;
}

/*
 * Set @anon_name on every VMA overlapping [start, start + len_in), with
 * the length rounded up to a whole number of pages.
 *
 * Returns 0 on success or for an empty range, -EINVAL for an unaligned
 * start or a range that overflows, otherwise the result of
 * madvise_walk_vmas().
 */
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
                          unsigned long len_in, struct anon_vma_name *anon_name)
{
        unsigned long rounded_len;
        unsigned long range_end;

        /* The start address must be page aligned. */
        if (start & ~PAGE_MASK)
                return -EINVAL;

        /* Round the length up to the next page boundary. */
        rounded_len = (len_in + ~PAGE_MASK) & PAGE_MASK;

        /* A non-zero length that rounds to zero wrapped around. */
        if (!rounded_len && len_in)
                return -EINVAL;

        range_end = start + rounded_len;
        if (range_end < start)
                return -EINVAL;

        if (range_end != start)
                return madvise_walk_vmas(mm, start, range_end,
                                         (unsigned long)anon_name,
                                         madvise_vma_anon_name);

        /* Empty range: nothing to do. */
        return 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *                results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *                on any access, since it is unlikely that the appli-
 *                cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *                once, so they can be aggressively read ahead, and
 *                can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *                some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *                so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *                where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *                pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *                typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *              range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *                were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *                this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *                huge pages in the future. Existing pages might be coalesced and
 *                new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *                transparent huge pages so the existing pages will not be
 *                coalesced into THP and new pages will not be allocated as THP.
 *  MADV_COLLAPSE - synchronously coalesce pages into new THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *                from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *                deactivate pages in this range so that they can be reclaimed
 *                easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *                page out the pages in this range immediately.
 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
 *                triggering read faults if required
 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
 *                triggering write faults if required
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *                "behavior" is not a valid value, or application
 *                is attempting to release locked or shared pages,
 *                or the specified address range includes file, Huge TLB,
 *                MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *                mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 *  -EPERM  - memory is sealed.
 */
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
        struct blk_plug plug;
        unsigned long end;
        size_t len;
        int need_write;
        int error;

        /* Reject unknown advice values and unaligned start addresses. */
        if (!madvise_behavior_valid(behavior) || !PAGE_ALIGNED(start))
                return -EINVAL;

        len = PAGE_ALIGN(len_in);

        /* A non-zero length that rounds to zero wrapped around. */
        if (!len && len_in)
                return -EINVAL;

        end = start + len;
        if (end < start)
                return -EINVAL;

        /* Empty range: nothing to do. */
        if (end == start)
                return 0;

#ifdef CONFIG_MEMORY_FAILURE
        /* Error injection is handled before any mmap_lock is taken here. */
        if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
                return madvise_inject_error(behavior, start, start + len_in);
#endif

        /* Take the mmap_lock in the mode this advice value needs. */
        need_write = madvise_need_mmap_write(behavior);
        if (!need_write)
                mmap_read_lock(mm);
        else if (mmap_write_lock_killable(mm))
                return -EINTR;

        /* Strip any address tag and recompute the end from the result. */
        start = untagged_addr_remote(mm, start);
        end = start + len;

        /*
         * Check if the address range is sealed for do_madvise().
         * can_modify_mm_madv assumes we have acquired the lock on MM.
         */
        if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) {
                error = -EPERM;
                goto unlock;
        }

        blk_start_plug(&plug);
        if (behavior == MADV_POPULATE_READ || behavior == MADV_POPULATE_WRITE)
                error = madvise_populate(mm, start, end, behavior);
        else
                error = madvise_walk_vmas(mm, start, end, behavior,
                                          madvise_vma_behavior);
        blk_finish_plug(&plug);

unlock:
        /* Release the lock in the same mode it was taken. */
        if (need_write)
                mmap_write_unlock(mm);
        else
                mmap_read_unlock(mm);

        return error;
}

/* madvise(2) entry point: apply the advice to the calling task's own mm. */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
        struct mm_struct *self_mm = current->mm;

        return do_madvise(self_mm, start, len_in, behavior);
}

/*
 * process_madvise(2) entry point: apply @behavior to each range described by
 * the user iovec @vec in the mm of the task referred to by @pidfd.
 *
 * Returns the number of bytes covered by the ranges that were advised before
 * the loop stopped when that number is non-zero; otherwise the last status
 * held in ret (a negative errno from a failed step, or the value left by the
 * last successful call).
 */
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
                size_t, vlen, int, behavior, unsigned int, flags)
{
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        struct task_struct *task;
        struct mm_struct *mm;
        unsigned int f_flags;
        size_t total_len;
        size_t advised;
        ssize_t ret;

        /* Nothing has been acquired yet, so plain returns are enough here. */
        if (flags != 0)
                return -EINVAL;

        ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
        if (ret < 0)
                return ret;

        task = pidfd_get_task(pidfd, &f_flags);
        if (IS_ERR(task)) {
                ret = PTR_ERR(task);
                goto free_iov;
        }

        if (!process_madvise_behavior_valid(behavior)) {
                ret = -EINVAL;
                goto release_task;
        }

        /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
        mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
        if (IS_ERR_OR_NULL(mm)) {
                if (IS_ERR(mm))
                        ret = PTR_ERR(mm);
                else
                        ret = -ESRCH;
                goto release_task;
        }

        /*
         * Require CAP_SYS_NICE for influencing process performance. Note that
         * only non-destructive hints are currently supported.
         */
        if (!capable(CAP_SYS_NICE)) {
                ret = -EPERM;
                goto release_mm;
        }

        total_len = iov_iter_count(&iter);

        /* Advise one iovec segment at a time; stop at the first failure. */
        while (iov_iter_count(&iter) != 0) {
                unsigned long seg_start = (unsigned long)iter_iov_addr(&iter);
                size_t seg_len = iter_iov_len(&iter);

                ret = do_madvise(mm, seg_start, seg_len, behavior);
                if (ret < 0)
                        break;
                iov_iter_advance(&iter, seg_len);
        }

        /* Any progress at all overrides the status of the last call. */
        advised = total_len - iov_iter_count(&iter);
        if (advised)
                ret = advised;

release_mm:
        mmput(mm);
release_task:
        put_task_struct(task);
free_iov:
        kfree(iov);
        return ret;
}








































































































































   16 
   17 






    3 
    3 































































   11 
   11 






    1 
    1 






    2 

























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
// SPDX-License-Identifier: GPL-2.0-or-later
/* bit search implementation
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * Copyright (C) 2008 IBM Corporation
 * 'find_last_bit' is written by Rusty Russell <rusty@rustcorp.com.au>
 * (Inspired by David Howell's find_next_bit implementation)
 *
 * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
 * size and improve performance, 2015.
 */

#include <linux/bitops.h>
#include <linux/bitmap.h>
#include <linux/export.h>
#include <linux/math.h>
#include <linux/minmax.h>
#include <linux/swab.h>

/*
 * Common helper for find_bit() function family
 * @FETCH: The expression that fetches and pre-processes each word of bitmap(s);
 *         it is evaluated with the macro-local word index 'idx' in scope
 * @MUNGE: The expression that post-processes a word containing found bit (may be empty)
 * @size: The bitmap size in bits
 *
 * Words are fetched in ascending 'idx' order while idx * BITS_PER_LONG < @size.
 * For the first non-zero word the macro evaluates to
 * idx * BITS_PER_LONG + __ffs(MUNGE(word)), clamped to @size by min().
 * If every fetched word is zero it evaluates to @size.
 */
#define FIND_FIRST_BIT(FETCH, MUNGE, size)                                        \
({                                                                                \
        unsigned long idx, val, sz = (size);                                        \
                                                                                \
        for (idx = 0; idx * BITS_PER_LONG < sz; idx++) {                        \
                val = (FETCH);                                                        \
                if (val) {                                                        \
                        sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(val)), sz);        \
                        break;                                                        \
                }                                                                \
        }                                                                        \
                                                                                \
        sz;                                                                        \
})

/*
 * Common helper for find_next_bit() function family
 * @FETCH: The expression that fetches and pre-processes each word of bitmap(s);
 *         it is evaluated with the macro-local word index 'idx' in scope
 * @MUNGE: The expression that post-processes a word containing found bit (may be empty)
 * @size: The bitmap size in bits
 * @start: The bitnumber to start searching at
 *
 * Evaluates to @size when @start >= @size, or when no non-zero word is found
 * before the end of the bitmap. Only the first fetched word is ANDed with
 * MUNGE(BITMAP_FIRST_WORD_MASK(@start)); subsequent words are used as fetched.
 * On a hit the result is idx * BITS_PER_LONG + __ffs(MUNGE(word)), clamped to
 * @size by min().
 *
 * NOTE: the expansion defines the label 'out', so the macro can be expanded
 * at most once per function.
 */
#define FIND_NEXT_BIT(FETCH, MUNGE, size, start)                                \
({                                                                                \
        unsigned long mask, idx, tmp, sz = (size), __start = (start);                \
                                                                                \
        if (unlikely(__start >= sz))                                                \
                goto out;                                                        \
                                                                                \
        mask = MUNGE(BITMAP_FIRST_WORD_MASK(__start));                                \
        idx = __start / BITS_PER_LONG;                                                \
                                                                                \
        for (tmp = (FETCH) & mask; !tmp; tmp = (FETCH)) {                        \
                if ((idx + 1) * BITS_PER_LONG >= sz)                                \
                        goto out;                                                \
                idx++;                                                                \
        }                                                                        \
                                                                                \
        sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(tmp)), sz);                        \
out:                                                                                \
        sz;                                                                        \
})

/*
 * Common helper for find_nth_bit() function family
 * @FETCH: The expression that fetches and pre-processes each word of bitmap(s);
 *         it is evaluated with the macro-local word index 'idx' in scope
 * @size: The bitmap size in bits
 * @num: How many set bits to skip before the wanted one (presumably counted
 *       from zero, so 0 asks for the first set bit - confirm against fns())
 *
 * Full words are consumed one at a time: hweight_long() of each word is
 * subtracted from the remaining count until a word holds more set bits than
 * remain to be skipped, at which point fns() selects the position inside that
 * word. A trailing partial word is masked with BITMAP_LAST_WORD_MASK(@size)
 * before fns() is applied.
 *
 * Evaluates to @size if idx * BITS_PER_LONG plus the remaining count reaches
 * @size during the scan.
 *
 * NOTE(review): on the fall-through path the result is
 * idx * BITS_PER_LONG + fns(tmp, nr) with no clamp to @size, so callers
 * presumably must treat any value >= @size as "not found" - confirm.
 *
 * NOTE: the expansion defines the labels 'found' and 'out', so the macro can
 * be expanded at most once per function.
 */
#define FIND_NTH_BIT(FETCH, size, num)                                                \
({                                                                                \
        unsigned long sz = (size), nr = (num), idx, w, tmp;                        \
                                                                                \
        for (idx = 0; (idx + 1) * BITS_PER_LONG <= sz; idx++) {                        \
                if (idx * BITS_PER_LONG + nr >= sz)                                \
                        goto out;                                                \
                                                                                \
                tmp = (FETCH);                                                        \
                w = hweight_long(tmp);                                                \
                if (w > nr)                                                        \
                        goto found;                                                \
                                                                                \
                nr -= w;                                                        \
        }                                                                        \
                                                                                \
        if (sz % BITS_PER_LONG)                                                        \
                tmp = (FETCH) & BITMAP_LAST_WORD_MASK(sz);                        \
found:                                                                                \
        sz = idx * BITS_PER_LONG + fns(tmp, nr);                                \
out:                                                                                \
        sz;                                                                        \
})

#ifndef find_first_bit
/*
 * Return the number of the lowest set bit among the first @size bits of
 * @addr, or @size if none of them is set.
 */
unsigned long _find_first_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long bit;

        bit = FIND_FIRST_BIT(addr[idx], /* nop */, size);

        return bit;
}
EXPORT_SYMBOL(_find_first_bit);
#endif

#ifndef find_first_and_bit
/*
 * Return the number of the lowest bit that is set in both @addr1 and @addr2
 * within the first @size bits, or @size if there is no such bit.
 */
unsigned long _find_first_and_bit(const unsigned long *addr1,
                                  const unsigned long *addr2,
                                  unsigned long size)
{
        unsigned long bit;

        bit = FIND_FIRST_BIT(addr1[idx] & addr2[idx], /* nop */, size);

        return bit;
}
EXPORT_SYMBOL(_find_first_and_bit);
#endif

/*
 * Return the number of the lowest bit that is set in all of @addr1, @addr2
 * and @addr3 within the first @size bits, or @size if there is no such bit.
 */
unsigned long _find_first_and_and_bit(const unsigned long *addr1,
                                      const unsigned long *addr2,
                                      const unsigned long *addr3,
                                      unsigned long size)
{
        unsigned long bit;

        bit = FIND_FIRST_BIT(addr1[idx] & addr2[idx] & addr3[idx], /* nop */, size);

        return bit;
}
EXPORT_SYMBOL(_find_first_and_and_bit);

#ifndef find_first_zero_bit
/*
 * Return the number of the lowest cleared bit among the first @size bits of
 * @addr, or @size if all of them are set. Each word is inverted before the
 * search, so the shared "first set bit" helper can be reused.
 */
unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long bit;

        bit = FIND_FIRST_BIT(~addr[idx], /* nop */, size);

        return bit;
}
EXPORT_SYMBOL(_find_first_zero_bit);
#endif

#ifndef find_next_bit
/*
 * Return the number of the lowest set bit of @addr at position @start or
 * above, or @nbits if there is none below @nbits.
 */
unsigned long _find_next_bit(const unsigned long *addr, unsigned long nbits, unsigned long start)
{
        unsigned long bit = FIND_NEXT_BIT(addr[idx], /* nop */, nbits, start);

        return bit;
}
EXPORT_SYMBOL(_find_next_bit);
#endif

/*
 * Locate a set bit of @addr by ordinal @n within the first @size bits; the
 * search itself is done by FIND_NTH_BIT().
 */
unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n)
{
        unsigned long bit = FIND_NTH_BIT(addr[idx], size, n);

        return bit;
}
EXPORT_SYMBOL(__find_nth_bit);

/*
 * Same search as FIND_NTH_BIT() performs on a single bitmap, applied to the
 * word-wise AND of @addr1 and @addr2.
 */
unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                 unsigned long size, unsigned long n)
{
        unsigned long bit = FIND_NTH_BIT(addr1[idx] & addr2[idx], size, n);

        return bit;
}
EXPORT_SYMBOL(__find_nth_and_bit);

/*
 * Same search as FIND_NTH_BIT() performs on a single bitmap, applied to the
 * bits that are set in @addr1 and cleared in @addr2.
 */
unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                 unsigned long size, unsigned long n)
{
        unsigned long bit = FIND_NTH_BIT(addr1[idx] & ~addr2[idx], size, n);

        return bit;
}
EXPORT_SYMBOL(__find_nth_andnot_bit);

/*
 * Same search as FIND_NTH_BIT() performs on a single bitmap, applied to the
 * bits that are set in both @addr1 and @addr2 and cleared in @addr3.
 */
unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1,
                                        const unsigned long *addr2,
                                        const unsigned long *addr3,
                                        unsigned long size, unsigned long n)
{
        unsigned long bit;

        bit = FIND_NTH_BIT(addr1[idx] & addr2[idx] & ~addr3[idx], size, n);

        return bit;
}
EXPORT_SYMBOL(__find_nth_and_andnot_bit);

#ifndef find_next_and_bit
/*
 * Return the number of the lowest bit at position @start or above that is
 * set in both @addr1 and @addr2, or @nbits if there is none below @nbits.
 */
unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start)
{
        unsigned long bit;

        bit = FIND_NEXT_BIT(addr1[idx] & addr2[idx], /* nop */, nbits, start);

        return bit;
}
EXPORT_SYMBOL(_find_next_and_bit);
#endif

#ifndef find_next_andnot_bit
/*
 * Return the number of the lowest bit at position @start or above that is
 * set in @addr1 and cleared in @addr2, or @nbits if there is none below
 * @nbits.
 */
unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start)
{
        unsigned long bit;

        bit = FIND_NEXT_BIT(addr1[idx] & ~addr2[idx], /* nop */, nbits, start);

        return bit;
}
EXPORT_SYMBOL(_find_next_andnot_bit);
#endif

#ifndef find_next_or_bit
/*
 * Return the number of the lowest bit at position @start or above that is
 * set in @addr1 or in @addr2, or @nbits if there is none below @nbits.
 */
unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start)
{
        unsigned long bit;

        bit = FIND_NEXT_BIT(addr1[idx] | addr2[idx], /* nop */, nbits, start);

        return bit;
}
EXPORT_SYMBOL(_find_next_or_bit);
#endif

#ifndef find_next_zero_bit
/*
 * Return the number of the lowest cleared bit of @addr at position @start or
 * above, or @nbits if there is none below @nbits. Each word is inverted
 * before the search.
 */
unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
                                         unsigned long start)
{
        unsigned long bit = FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start);

        return bit;
}
EXPORT_SYMBOL(_find_next_zero_bit);
#endif

#ifndef find_last_bit
/*
 * Return the number of the highest set bit among the first @size bits of
 * @addr, or @size if none of them is set (including when @size is 0).
 */
unsigned long _find_last_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long mask, word, i;

        if (!size)
                return size;

        /* Only the topmost word is partially masked; lower words are used whole. */
        mask = BITMAP_LAST_WORD_MASK(size);

        /* Walk from the topmost word down to word 0. */
        for (i = (size - 1) / BITS_PER_LONG; ; i--) {
                word = addr[i] & mask;
                if (word)
                        return i * BITS_PER_LONG + __fls(word);

                if (!i)
                        break;

                mask = ~0ul;
        }

        return size;
}
EXPORT_SYMBOL(_find_last_bit);
#endif

/*
 * Find the next set bit at or after @offset and report the 8-bit clump that
 * contains it.
 *
 * Returns @size, leaving *@clump untouched, when find_next_bit() reports
 * @size. Otherwise stores bitmap_get_value8() of the containing clump in
 * *@clump and returns the clump's starting bit (the found position rounded
 * down to a multiple of 8).
 */
unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr,
                               unsigned long size, unsigned long offset)
{
        unsigned long hit = find_next_bit(addr, size, offset);
        unsigned long clump_start;

        if (hit == size)
                return size;

        clump_start = round_down(hit, 8);
        *clump = bitmap_get_value8(addr, clump_start);

        return clump_start;
}
EXPORT_SYMBOL(find_next_clump8);

#ifdef __BIG_ENDIAN

#ifndef find_first_zero_bit_le
/*
 * Return the number of the lowest cleared bit in an LE memory region of
 * @size bits, or @size if none is cleared. The word containing the hit is
 * passed through swab() before its bit position is computed.
 */
unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size)
{
        unsigned long bit;

        bit = FIND_FIRST_BIT(~addr[idx], swab, size);

        return bit;
}
EXPORT_SYMBOL(_find_first_zero_bit_le);

#endif

#ifndef find_next_zero_bit_le
/*
 * Return the number of the lowest cleared bit at position @offset or above
 * in an LE memory region of @size bits, or @size if there is none. swab() is
 * applied both to the first-word mask and to the word containing the hit.
 */
unsigned long _find_next_zero_bit_le(const unsigned long *addr,
                                        unsigned long size, unsigned long offset)
{
        unsigned long bit = FIND_NEXT_BIT(~addr[idx], swab, size, offset);

        return bit;
}
EXPORT_SYMBOL(_find_next_zero_bit_le);
#endif

#ifndef find_next_bit_le
/*
 * Return the number of the lowest set bit at position @offset or above in an
 * LE memory region of @size bits, or @size if there is none. swab() is
 * applied both to the first-word mask and to the word containing the hit.
 */
unsigned long _find_next_bit_le(const unsigned long *addr,
                                unsigned long size, unsigned long offset)
{
        unsigned long bit = FIND_NEXT_BIT(addr[idx], swab, size, offset);

        return bit;
}
EXPORT_SYMBOL(_find_next_bit_le);

#endif

#endif /* __BIG_ENDIAN */

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


    1 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
// SPDX-License-Identifier: GPL-2.0
/*
 * cfg80211 scan result handling
 *
 * Copyright 2008 Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright 2016        Intel Deutschland GmbH
 * Copyright (C) 2018-2024 Intel Corporation
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/wireless.h>
#include <linux/nl80211.h>
#include <linux/etherdevice.h>
#include <linux/crc32.h>
#include <linux/bitfield.h>
#include <net/arp.h>
#include <net/cfg80211.h>
#include <net/cfg80211-wext.h>
#include <net/iw_handler.h>
#include <kunit/visibility.h>
#include "core.h"
#include "nl80211.h"
#include "wext-compat.h"
#include "rdev-ops.h"

/**
 * DOC: BSS tree/list structure
 *
 * At the top level, the BSS list is kept in both a list in each
 * registered device (@bss_list) as well as an RB-tree for faster
 * lookup. In the RB-tree, entries can be looked up using their
 * channel, MESHID, MESHCONF (for MBSSes) or channel, BSSID, SSID
 * for other BSSes.
 *
 * Due to the possibility of hidden SSIDs, there's a second level
 * structure, the "hidden_list" and "hidden_beacon_bss" pointer.
 * The hidden_list connects all BSSes belonging to a single AP
 * that has a hidden SSID, and connects beacon and probe response
 * entries. For a probe response entry for a hidden SSID, the
 * hidden_beacon_bss pointer points to the BSS struct holding the
 * beacon's information.
 *
 * Reference counting is done for all these references except for
 * the hidden_list, so that a beacon BSS struct that is otherwise
 * not referenced has one reference for being on the bss_list and
 * one for each probe response entry that points to it using the
 * hidden_beacon_bss pointer. When a BSS struct that has such a
 * pointer is get/put, the refcount update is also propagated to
 * the referenced struct; this ensures that it cannot get removed
 * while somebody is using the probe response version.
 *
 * Note that the hidden_beacon_bss pointer never changes, due to
 * the reference counting. Therefore, no locking is needed for
 * it.
 *
 * Also note that the hidden_beacon_bss pointer is only relevant
 * if the driver uses something other than the IEs, e.g. private
 * data stored in the BSS struct, since the beacon IEs are
 * also linked into the probe response struct.
 */

/*
 * Limit the number of scan BSS entries stored per wiphy. Each one is
 * a bit over 4k at most, so this limits to roughly 4-5M of memory.
 * If somebody wants to really attack this though, they'd likely
 * use small beacons, and only one type of frame, limiting each of
 * the entries to a much smaller size (in order to generate more
 * entries in total, so overhead is bigger.)
 *
 * The limit defaults to 1000 and is exposed as a module parameter
 * with mode 0644.
 */
static int bss_entries_limit = 1000;
module_param(bss_entries_limit, int, 0644);
MODULE_PARM_DESC(bss_entries_limit,
                 "limit to number of scan BSS entries (per wiphy, default 1000)");

/*
 * 30 * HZ jiffies; presumably the age after which a scan result is
 * treated as expired (the users of this constant are not in this section).
 */
#define IEEE80211_SCAN_RESULT_EXPIRE        (30 * HZ)

/*
 * Release a BSS entry and the IE buffers attached to it.
 *
 * Called from bss_ref_put() once a reference count has dropped to zero.
 * An entry that is still held is reported with a warning and left alone.
 */
static void bss_free(struct cfg80211_internal_bss *bss)
{
        struct cfg80211_bss_ies *beacon, *presp;

        if (WARN_ON(atomic_read(&bss->hold)))
                return;

        /*
         * The beacon IEs are only released here when this entry has no
         * hidden_beacon_bss pointer; otherwise they are left untouched.
         */
        beacon = (void *)rcu_access_pointer(bss->pub.beacon_ies);
        if (beacon && !bss->pub.hidden_beacon_bss)
                kfree_rcu(beacon, rcu_head);

        presp = (void *)rcu_access_pointer(bss->pub.proberesp_ies);
        if (presp)
                kfree_rcu(presp, rcu_head);

        /*
         * Still being linked on a hidden_list only happens when the
         * module is removed; unlinking is done merely for completeness.
         */
        if (!list_empty(&bss->hidden_list))
                list_del(&bss->hidden_list);

        kfree(bss);
}

/*
 * Take a reference on @bss. The reference is mirrored onto the entries
 * that @bss points to through hidden_beacon_bss and transmitted_bss, when
 * those pointers are set. Must be called with rdev->bss_lock held.
 */
static inline void bss_ref_get(struct cfg80211_registered_device *rdev,
                               struct cfg80211_internal_bss *bss)
{
        struct cfg80211_internal_bss *linked;

        lockdep_assert_held(&rdev->bss_lock);

        bss->refcount++;

        if (bss->pub.hidden_beacon_bss) {
                linked = bss_from_pub(bss->pub.hidden_beacon_bss);
                linked->refcount++;
        }

        if (bss->pub.transmitted_bss) {
                linked = bss_from_pub(bss->pub.transmitted_bss);
                linked->refcount++;
        }
}

/*
 * Drop a reference on @bss, and the mirrored references on the entries it
 * points to through hidden_beacon_bss and transmitted_bss. Any entry whose
 * count reaches zero is freed. Must be called with rdev->bss_lock held.
 */
static inline void bss_ref_put(struct cfg80211_registered_device *rdev,
                               struct cfg80211_internal_bss *bss)
{
        struct cfg80211_internal_bss *linked;

        lockdep_assert_held(&rdev->bss_lock);

        /* linked entries are released first, @bss itself last */
        if (bss->pub.hidden_beacon_bss) {
                linked = bss_from_pub(bss->pub.hidden_beacon_bss);
                if (--linked->refcount == 0)
                        bss_free(linked);
        }

        if (bss->pub.transmitted_bss) {
                linked = bss_from_pub(bss->pub.transmitted_bss);
                if (--linked->refcount == 0)
                        bss_free(linked);
        }

        if (--bss->refcount == 0)
                bss_free(bss);
}

/*
 * Remove @bss from the device's BSS list and RB-tree and drop the
 * reference that was held for being on the list.
 *
 * Returns false, without changing anything, for a beacon entry that
 * still has probe response entries linked to it; true otherwise.
 * Must be called with rdev->bss_lock held.
 */
static bool __cfg80211_unlink_bss(struct cfg80211_registered_device *rdev,
                                  struct cfg80211_internal_bss *bss)
{
        bool in_hidden_group;

        lockdep_assert_held(&rdev->bss_lock);

        in_hidden_group = !list_empty(&bss->hidden_list);

        /*
         * don't remove the beacon entry if it has
         * probe responses associated with it
         */
        if (in_hidden_group && !bss->pub.hidden_beacon_bss)
                return false;

        /*
         * a probe response entry gets its link to the other
         * entries in the group broken
         */
        if (in_hidden_group)
                list_del_init(&bss->hidden_list);

        list_del_init(&bss->list);
        list_del_init(&bss->pub.nontrans_list);
        rb_erase(&bss->rbn, &rdev->bss_tree);

        rdev->bss_entries--;
        /* the entry counter and the list must agree on emptiness */
        WARN_ONCE((rdev->bss_entries == 0) ^ list_empty(&rdev->bss_list),
                  "rdev bss entries[%d]/list[empty:%d] corruption\n",
                  rdev->bss_entries, list_empty(&rdev->bss_list));

        bss_ref_put(rdev, bss);

        return true;
}

/*
 * Decide whether @elem is inherited, given the (optional) non-inheritance
 * element @non_inherit_elem.
 *
 * Returns false for the multiple BSSID element, for the EHT multi-link
 * extension element, and for any element whose ID (or extension ID) is
 * listed in @non_inherit_elem. Returns true in every other case, including
 * when @non_inherit_elem is NULL or too short to be parsed.
 */
bool cfg80211_is_element_inherited(const struct element *elem,
                                   const struct element *non_inherit_elem)
{
        u8 id_len, ext_id_len, i, loop_len, id;
        const u8 *list;

        if (elem->id == WLAN_EID_MULTIPLE_BSSID)
                return false;

        if (elem->id == WLAN_EID_EXTENSION && elem->datalen > 1 &&
            elem->data[0] == WLAN_EID_EXT_EHT_MULTI_LINK)
                return false;

        if (!non_inherit_elem || non_inherit_elem->datalen < 2)
                return true;

        /*
         * non inheritance element format is:
         * ext ID (56) | IDs list len | list | extension IDs list len | list
         * Both lists are optional. Both lengths are mandatory.
         * This means valid length is:
         * elem_len = 1 (extension ID) + 2 (list len fields) + list lengths
         */
        id_len = non_inherit_elem->data[1];
        if (non_inherit_elem->datalen < 3 + id_len)
                return true;

        ext_id_len = non_inherit_elem->data[2 + id_len];
        if (non_inherit_elem->datalen < 3 + id_len + ext_id_len)
                return true;

        if (elem->id == WLAN_EID_EXTENSION) {
                /*
                 * An extension element without payload carries no
                 * extension ID, so it cannot match the list; bail out
                 * before data[0] is read past the end of the element.
                 */
                if (!ext_id_len || !elem->datalen)
                        return true;
                loop_len = ext_id_len;
                list = &non_inherit_elem->data[3 + id_len];
                id = elem->data[0];
        } else {
                if (!id_len)
                        return true;
                loop_len = id_len;
                list = &non_inherit_elem->data[2];
                id = elem->id;
        }

        /* listed in the applicable ID list means "not inherited" */
        for (i = 0; i < loop_len; i++) {
                if (list[i] == id)
                        return false;
        }

        return true;
}
EXPORT_SYMBOL(cfg80211_is_element_inherited);

/*
 * Copy @elem, which must lie inside the @ie/@ie_len buffer, to *@pos and
 * advance *@pos. If the element is 255 bytes long, the fragment elements
 * directly following it are copied as well.
 *
 * @buf/@buf_len describe the whole output buffer that *@pos points into.
 * Returns the number of bytes used in @buf so far (*@pos - @buf), or 0 if
 * @elem is outside of @ie or the output buffer is too small.
 */
static size_t cfg80211_copy_elem_with_frags(const struct element *elem,
                                            const u8 *ie, size_t ie_len,
                                            u8 **pos, u8 *buf, size_t buf_len)
{
        const u8 *ie_end = ie + ie_len;
        const u8 *buf_end = buf + buf_len;
        const struct element *frag;
        const u8 *rest;
        size_t rest_len;

        if (WARN_ON((u8 *)elem < ie || elem->data > ie_end ||
                    elem->data + elem->datalen > ie_end))
                return 0;

        /* header (2 bytes) plus payload must fit into the output */
        if (elem->datalen + 2 > buf_end - *pos)
                return 0;

        memcpy(*pos, elem, elem->datalen + 2);
        *pos += elem->datalen + 2;

        /* Only a maximum-length element can be followed by fragments */
        if (elem->datalen != 255)
                return *pos - buf;

        /* continue with whatever follows the element in the input */
        rest = (const u8 *)elem->data + elem->datalen;
        rest_len = ie_end - rest;

        for_each_element(frag, rest, rest_len) {
                if (frag->id != WLAN_EID_FRAGMENT)
                        break;

                if (frag->datalen + 2 > buf_end - *pos)
                        return 0;

                memcpy(*pos, frag, frag->datalen + 2);
                *pos += frag->datalen + 2;

                /* a short fragment is the last one */
                if (frag->datalen != 255)
                        break;
        }

        return *pos - buf;
}

/*
 * Build a merged element buffer in @new_ie from the parent elements
 * (@ie/@ielen) and the sub-elements (@subie/@subie_len).
 *
 * Parent elements are copied unless the same element type is present in
 * @subie or the element is excluded by the non-inheritance element found
 * in @subie; in that case all occurrences from @subie are used instead.
 * Elements only present in @subie are appended at the end.
 *
 * Returns the number of bytes written to @new_ie, or 0 as soon as one of
 * the element copies fails (e.g. @new_ie_len is too small).
 */
VISIBLE_IF_CFG80211_KUNIT size_t
cfg80211_gen_new_ie(const u8 *ie, size_t ielen,
                    const u8 *subie, size_t subie_len,
                    u8 *new_ie, size_t new_ie_len)
{
        const struct element *non_inherit_elem, *parent, *sub;
        u8 *pos = new_ie;
        u8 id, ext_id;
        unsigned int match_len;

        non_inherit_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE,
                                                  subie, subie_len);

        /* We copy the elements one by one from the parent to the generated
         * elements.
         * If they are not inherited (included in subie or in the non
         * inheritance element), then we copy all occurrences the first time
         * we see this element type.
         */
        for_each_element(parent, ie, ielen) {
                /* fragments are copied together with the element they follow */
                if (parent->id == WLAN_EID_FRAGMENT)
                        continue;

                if (parent->id == WLAN_EID_EXTENSION) {
                        /* no extension ID byte present, nothing to match on */
                        if (parent->datalen < 1)
                                continue;

                        id = WLAN_EID_EXTENSION;
                        ext_id = parent->data[0];
                        match_len = 1;
                } else {
                        /* match_len 0: ext_id is not compared by the lookups */
                        id = parent->id;
                        match_len = 0;
                }

                /* Find first occurrence in subie */
                sub = cfg80211_find_elem_match(id, subie, subie_len,
                                               &ext_id, match_len, 0);

                /* Copy from parent if not in subie and inherited */
                if (!sub &&
                    cfg80211_is_element_inherited(parent, non_inherit_elem)) {
                        if (!cfg80211_copy_elem_with_frags(parent,
                                                           ie, ielen,
                                                           &pos, new_ie,
                                                           new_ie_len))
                                return 0;

                        continue;
                }

                /* Already copied if an earlier element had the same type */
                if (cfg80211_find_elem_match(id, ie, (u8 *)parent - ie,
                                             &ext_id, match_len, 0))
                        continue;

                /* Not inheriting, copy all similar elements from subie */
                while (sub) {
                        if (!cfg80211_copy_elem_with_frags(sub,
                                                           subie, subie_len,
                                                           &pos, new_ie,
                                                           new_ie_len))
                                return 0;

                        /* resume the search right behind the element just copied */
                        sub = cfg80211_find_elem_match(id,
                                                       sub->data + sub->datalen,
                                                       subie_len + subie -
                                                       (sub->data +
                                                        sub->datalen),
                                                       &ext_id, match_len, 0);
                }
        }

        /* The above misses elements that are included in subie but not in the
         * parent, so do a pass over subie and append those.
         * Skip the non-tx BSSID caps and non-inheritance element.
         */
        for_each_element(sub, subie, subie_len) {
                if (sub->id == WLAN_EID_NON_TX_BSSID_CAP)
                        continue;

                if (sub->id == WLAN_EID_FRAGMENT)
                        continue;

                if (sub->id == WLAN_EID_EXTENSION) {
                        if (sub->datalen < 1)
                                continue;

                        id = WLAN_EID_EXTENSION;
                        ext_id = sub->data[0];
                        match_len = 1;

                        if (ext_id == WLAN_EID_EXT_NON_INHERITANCE)
                                continue;
                } else {
                        id = sub->id;
                        match_len = 0;
                }

                /* Processed if one was included in the parent */
                if (cfg80211_find_elem_match(id, ie, ielen,
                                             &ext_id, match_len, 0))
                        continue;

                if (!cfg80211_copy_elem_with_frags(sub, subie, subie_len,
                                                   &pos, new_ie, new_ie_len))
                        return 0;
        }

        /* total number of bytes produced */
        return pos - new_ie;
}
EXPORT_SYMBOL_IF_CFG80211_KUNIT(cfg80211_gen_new_ie);

/*
 * Check whether BSS @a matches the given @bssid and @ssid/@ssid_len.
 *
 * A NULL @bssid matches any BSSID and a NULL @ssid matches any SSID.
 * When an SSID is requested, @a must carry an SSID element of exactly
 * @ssid_len bytes with identical contents.
 */
static bool is_bss(struct cfg80211_bss *a, const u8 *bssid,
                   const u8 *ssid, size_t ssid_len)
{
        const struct cfg80211_bss_ies *ies;
        const struct element *found;

        if (bssid && !ether_addr_equal(a->bssid, bssid))
                return false;

        /* no SSID requested, the BSSID check alone decides */
        if (!ssid)
                return true;

        ies = rcu_access_pointer(a->ies);
        if (!ies)
                return false;

        found = cfg80211_find_elem(WLAN_EID_SSID, ies->data, ies->len);

        return found && found->datalen == ssid_len &&
               !memcmp(found->data, ssid, ssid_len);
}

/*
 * Link @nontrans_bss onto the nontrans_list of @trans_bss.
 *
 * Returns 0 if the entry was added or an entry with the same BSSID and
 * SSID is already on that list, -EINVAL if @nontrans_bss has no SSID
 * element or is already linked onto some other list.
 */
static int
cfg80211_add_nontrans_list(struct cfg80211_bss *trans_bss,
                           struct cfg80211_bss *nontrans_bss)
{
        const struct element *ssid_elem;
        struct cfg80211_bss *iter = NULL;
        bool already_listed = false;

        rcu_read_lock();

        ssid_elem = ieee80211_bss_get_elem(nontrans_bss, WLAN_EID_SSID);
        if (!ssid_elem) {
                rcu_read_unlock();
                return -EINVAL;
        }

        /* look for nontrans_bss among the entries already listed */
        list_for_each_entry(iter, &trans_bss->nontrans_list, nontrans_list) {
                if (!is_bss(iter, nontrans_bss->bssid, ssid_elem->data,
                            ssid_elem->datalen))
                        continue;

                already_listed = true;
                break;
        }

        rcu_read_unlock();

        if (already_listed)
                return 0;

        /*
         * This is a bit weird - it's not on the list, but already on another
         * one! The only way that could happen is if there's some BSSID/SSID
         * shared by multiple APs in their multi-BSSID profiles, potentially
         * with hidden SSID mixed in ... ignore it.
         */
        if (!list_empty(&nontrans_bss->nontrans_list))
                return -EINVAL;

        /* not linked anywhere yet, append it */
        list_add_tail(&nontrans_bss->nontrans_list, &trans_bss->nontrans_list);

        return 0;
}

/*
 * Unlink every BSS entry that is not held and whose timestamp lies before
 * @expire_time. The BSS generation counter is bumped if at least one entry
 * was actually removed. Must be called with rdev->bss_lock held.
 */
static void __cfg80211_bss_expire(struct cfg80211_registered_device *rdev,
                                  unsigned long expire_time)
{
        struct cfg80211_internal_bss *cur, *next;
        bool removed_any = false;

        lockdep_assert_held(&rdev->bss_lock);

        list_for_each_entry_safe(cur, next, &rdev->bss_list, list) {
                /* keep entries that are held or not old enough */
                if (atomic_read(&cur->hold) ||
                    !time_after(expire_time, cur->ts))
                        continue;

                /* unlinking may be refused, see __cfg80211_unlink_bss() */
                if (__cfg80211_unlink_bss(rdev, cur))
                        removed_any = true;
        }

        if (removed_any)
                rdev->bss_generation++;
}

/*
 * Unlink the BSS entry with the oldest timestamp among those that may be
 * removed. Held entries, and beacon entries that still have probe response
 * entries linked to them, are not considered.
 *
 * Returns true if an entry was unlinked; warns and returns false if none
 * could be found or unlinked. Must be called with rdev->bss_lock held.
 */
static bool cfg80211_bss_expire_oldest(struct cfg80211_registered_device *rdev)
{
        struct cfg80211_internal_bss *cur, *victim = NULL;
        bool unlinked;

        lockdep_assert_held(&rdev->bss_lock);

        list_for_each_entry(cur, &rdev->bss_list, list) {
                bool beacon_with_presp = !list_empty(&cur->hidden_list) &&
                                         !cur->pub.hidden_beacon_bss;

                if (atomic_read(&cur->hold) || beacon_with_presp)
                        continue;

                /* take the first candidate, then anything not newer */
                if (!victim || !time_before(victim->ts, cur->ts))
                        victim = cur;
        }

        if (WARN_ON(!victim))
                return false;

        /*
         * rdev->bss_generation is not touched here: the callers increase
         * it themselves whenever something gets removed (and a new entry
         * added).
         */
        unlinked = __cfg80211_unlink_bss(rdev, victim);
        WARN_ON(!unlinked);

        return unlinked;
}

/*
 * Unpack the BSS parameters byte @data of a TBTT information entry into
 * the flag fields of @coloc_ap.
 *
 * Returns the value of the "co-located AP" bit field, which is not stored
 * in @coloc_ap.
 */
static u8 cfg80211_parse_bss_param(u8 data,
                                   struct cfg80211_colocated_ap *coloc_ap)
{
        u8 colocated = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_COLOC_AP);

        coloc_ap->colocated_ess =
                u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_COLOC_ESS);
        coloc_ap->unsolicited_probe =
                u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_PROBE_ACTIVE);
        coloc_ap->transmitted_bssid =
                u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID);
        coloc_ap->multi_bss =
                u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID);
        coloc_ap->same_ssid =
                u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_SAME_SSID);
        coloc_ap->oct_recommended =
                u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_OCT_RECOMMENDED);

        return colocated;
}

/*
 * Locate the SSID element in @ies and compute the short SSID from it.
 *
 * *@elem is set to the result of the element lookup in every case, also
 * on failure. *@s_ssid is only written on success.
 *
 * Returns 0 on success, -EINVAL if there is no SSID element or it is
 * longer than IEEE80211_MAX_SSID_LEN.
 */
static int cfg80211_calc_short_ssid(const struct cfg80211_bss_ies *ies,
                                    const struct element **elem, u32 *s_ssid)
{
        const struct element *ssid;

        ssid = cfg80211_find_elem(WLAN_EID_SSID, ies->data, ies->len);
        *elem = ssid;

        if (!ssid || ssid->datalen > IEEE80211_MAX_SSID_LEN)
                return -EINVAL;

        /* inverted little-endian CRC32 over the SSID bytes, seeded with ~0 */
        *s_ssid = ~crc32_le(~0, ssid->data, ssid->datalen);

        return 0;
}

VISIBLE_IF_CFG80211_KUNIT void
cfg80211_free_coloc_ap_list(struct list_head *coloc_ap_list)
{
        struct cfg80211_colocated_ap *ap, *tmp_ap;

        list_for_each_entry_safe(ap, tmp_ap, coloc_ap_list, list) {
                list_del(&ap->list);
                kfree(ap);
        }
}
EXPORT_SYMBOL_IF_CFG80211_KUNIT(cfg80211_free_coloc_ap_list);

/*
 * Fill @entry from one TBTT information field at @pos of @length bytes.
 *
 * @ssid_elem and @s_ssid_tmp describe the SSID of the reporting BSS and
 * are used for entries that have the "same SSID" bit set.
 *
 * The caller has already verified that @length covers bss_params.
 *
 * Returns 0 if the entry is usable, -EINVAL if it must be ignored
 * (disabled link, invalid BSSID or not a co-located AP).
 */
static int cfg80211_parse_ap_info(struct cfg80211_colocated_ap *entry,
                                  const u8 *pos, u8 length,
                                  const struct element *ssid_elem,
                                  u32 s_ssid_tmp)
{
        u8 bss_params;

        entry->psd_20 = IEEE80211_RNR_TBTT_PARAMS_PSD_RESERVED;

        if (length <= sizeof(struct ieee80211_tbtt_info_7_8_9)) {
                /* shorter layout: no short SSID field is read */
                struct ieee80211_tbtt_info_7_8_9 *short_info = (void *)pos;

                memcpy(entry->bssid, short_info->bssid, ETH_ALEN);
                bss_params = short_info->bss_params;

                if (length == offsetofend(struct ieee80211_tbtt_info_7_8_9,
                                          psd_20))
                        entry->psd_20 = short_info->psd_20;
        } else {
                /* longer layout: includes the short SSID */
                struct ieee80211_tbtt_info_ge_11 *long_info = (void *)pos;

                memcpy(entry->bssid, long_info->bssid, ETH_ALEN);
                entry->short_ssid = le32_to_cpu(long_info->short_ssid);
                entry->short_ssid_valid = true;
                bss_params = long_info->bss_params;

                /* Ignore disabled links */
                if (length >= offsetofend(struct ieee80211_tbtt_info_ge_11,
                                          mld_params) &&
                    le16_get_bits(long_info->mld_params.params,
                                  IEEE80211_RNR_MLD_PARAMS_DISABLED_LINK))
                        return -EINVAL;

                if (length >= offsetofend(struct ieee80211_tbtt_info_ge_11,
                                          psd_20))
                        entry->psd_20 = long_info->psd_20;
        }

        /* ignore entries with invalid BSSID */
        if (!is_valid_ether_addr(entry->bssid))
                return -EINVAL;

        /* skip non colocated APs */
        if (!cfg80211_parse_bss_param(bss_params, entry))
                return -EINVAL;

        /*
         * Without the same_ssid bit nothing more is filled in. That also
         * covers the case of no short SSID information at all: such an
         * entry is considered valid for now and would later be dropped in
         * case there are explicit SSIDs that need to be matched.
         */
        if (entry->same_ssid) {
                entry->short_ssid = s_ssid_tmp;
                entry->short_ssid_valid = true;

                /*
                 * This is safe because we validate datalen in
                 * cfg80211_parse_colocated_ap(), before calling this
                 * function.
                 */
                memcpy(&entry->ssid, &ssid_elem->data, ssid_elem->datalen);
                entry->ssid_len = ssid_elem->datalen;
        }

        return 0;
}

/*
 * Walk all reduced neighbor report elements in @elems/@elems_len and call
 * @iter once for every TBTT information entry found, passing @iter_data
 * through unchanged.
 *
 * Returns true if the walk completed or @iter asked to stop with
 * RNR_ITER_BREAK; false if @iter returned RNR_ITER_ERROR or an element is
 * malformed (entries exceed the element, or trailing bytes are left over).
 */
bool cfg80211_iter_rnr(const u8 *elems, size_t elems_len,
                       enum cfg80211_rnr_iter_ret
                       (*iter)(void *data, u8 type,
                               const struct ieee80211_neighbor_ap_info *info,
                               const u8 *tbtt_info, u8 tbtt_info_len),
                       void *iter_data)
{
        const struct element *rnr;

        for_each_element_id(rnr, WLAN_EID_REDUCED_NEIGHBOR_REPORT,
                            elems, elems_len) {
                const u8 *cur = rnr->data;
                const u8 *end = cur + rnr->datalen;

                /* RNR IE may contain more than one NEIGHBOR_AP_INFO */
                while (sizeof(struct ieee80211_neighbor_ap_info) <= end - cur) {
                        const struct ieee80211_neighbor_ap_info *info =
                                (void *)cur;
                        u8 tbtt_len = info->tbtt_info_len;
                        u8 tbtt_type =
                                u8_get_bits(info->tbtt_info_hdr,
                                            IEEE80211_AP_INFO_TBTT_HDR_TYPE);
                        /* the header field holds the entry count minus one */
                        u8 n_tbtt =
                                u8_get_bits(info->tbtt_info_hdr,
                                            IEEE80211_AP_INFO_TBTT_HDR_COUNT) +
                                1;
                        u8 idx;

                        cur += sizeof(*info);

                        /* all announced entries must fit into the element */
                        if (n_tbtt * tbtt_len > end - cur)
                                return false;

                        for (idx = 0; idx < n_tbtt; idx++) {
                                enum cfg80211_rnr_iter_ret verdict;

                                verdict = iter(iter_data, tbtt_type, info,
                                               cur, tbtt_len);
                                if (verdict == RNR_ITER_BREAK)
                                        return true;
                                if (verdict == RNR_ITER_ERROR)
                                        return false;

                                cur += tbtt_len;
                        }
                }

                /* leftover bytes too short for another header */
                if (cur != end)
                        return false;
        }

        return true;
}
EXPORT_SYMBOL_GPL(cfg80211_iter_rnr);

/* Iteration state handed to cfg80211_parse_colocated_ap_iter() */
struct colocated_ap_data {
        /* SSID element passed on to cfg80211_parse_ap_info() */
        const struct element *ssid_elem;
        /* accepted entries, appended in the order they are parsed */
        struct list_head ap_list;
        /* short SSID value passed on to cfg80211_parse_ap_info() */
        u32 s_ssid_tmp;
        /* number of entries added to ap_list */
        int n_coloc;
};

/*
 * RNR iterator callback: convert one TBTT information entry into a
 * cfg80211_colocated_ap and append it to the list carried in @_data.
 *
 * Entries are skipped (RNR_ITER_CONTINUE) when they are not of the TBTT
 * type, when the operating class does not map to a band, when the band is
 * not 6 GHz, when the entry length is not one of the accepted layouts, or
 * when cfg80211_parse_ap_info() rejects the entry. Only an allocation
 * failure ends the iteration with RNR_ITER_ERROR.
 */
static enum cfg80211_rnr_iter_ret
cfg80211_parse_colocated_ap_iter(void *_data, u8 type,
                                 const struct ieee80211_neighbor_ap_info *info,
                                 const u8 *tbtt_info, u8 tbtt_info_len)
{
        struct colocated_ap_data *ctx = _data;
        struct cfg80211_colocated_ap *ap;
        enum nl80211_band band;

        if (type != IEEE80211_TBTT_INFO_TYPE_TBTT)
                return RNR_ITER_CONTINUE;

        if (!ieee80211_operating_class_to_band(info->op_class, &band))
                return RNR_ITER_CONTINUE;

        if (band != NL80211_BAND_6GHZ)
                return RNR_ITER_CONTINUE;

        /* TBTT info must include bss param + BSSID + (short SSID or
         * same_ssid bit to be set). Any other layout is ignored and the
         * iteration moves on to the next AP info.
         */
        if (tbtt_info_len != offsetofend(struct ieee80211_tbtt_info_7_8_9,
                                         bss_params) &&
            tbtt_info_len != sizeof(struct ieee80211_tbtt_info_7_8_9) &&
            tbtt_info_len < offsetofend(struct ieee80211_tbtt_info_ge_11,
                                        bss_params))
                return RNR_ITER_CONTINUE;

        /* room for the SSID is reserved directly behind the entry */
        ap = kzalloc(sizeof(*ap) + IEEE80211_MAX_SSID_LEN, GFP_ATOMIC);
        if (!ap)
                return RNR_ITER_ERROR;

        ap->center_freq = ieee80211_channel_to_frequency(info->channel, band);

        /* a non-zero result means the entry is unusable; drop it */
        if (cfg80211_parse_ap_info(ap, tbtt_info, tbtt_info_len,
                                   ctx->ssid_elem, ctx->s_ssid_tmp)) {
                kfree(ap);
                return RNR_ITER_CONTINUE;
        }

        list_add_tail(&ap->list, &ctx->ap_list);
        ctx->n_coloc++;

        return RNR_ITER_CONTINUE;
}

VISIBLE_IF_CFG80211_KUNIT int
cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies,
                            struct list_head *list)
{
        struct colocated_ap_data data = {};
        int ret;

        INIT_LIST_HEAD(&data.ap_list);

        ret = cfg80211_calc_short_ssid(ies, &data.ssid_elem, &data.s_ssid_tmp);
        if (ret)
                return 0;

        if (!cfg80211_iter_rnr(ies->data, ies->len,
                               cfg80211_parse_colocated_ap_iter, &data)) {
                cfg80211_free_coloc_ap_list(&data.ap_list);
                return 0;
        }

        list_splice_tail(&data.ap_list, list);
        return data.n_coloc;
}
EXPORT_SYMBOL_IF_CFG80211_KUNIT(cfg80211_parse_colocated_ap);

/*
 * Make sure @chan is part of @request->channels, appending it when it is
 * not there yet. With @add_to_6ghz set, the index of the channel is also
 * stored in the 6 GHz parameter slot at position @request->n_6ghz_params.
 *
 * No bounds check is done here; the caller must have sized the arrays.
 */
static void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request,
                                       struct ieee80211_channel *chan,
                                       bool add_to_6ghz)
{
        struct cfg80211_scan_6ghz_params *params =
                &request->scan_6ghz_params[request->n_6ghz_params];
        u32 idx;

        /* idx ends up at the existing slot, or at n_channels if absent */
        for (idx = 0; idx < request->n_channels; idx++) {
                if (request->channels[idx] == chan)
                        break;
        }

        if (idx == request->n_channels) {
                request->channels[idx] = chan;
                request->n_channels++;
        }

        if (add_to_6ghz)
                params->channel_idx = idx;
}

/*
 * Check whether co-located AP @ap matches any SSID listed in @request.
 *
 * A zero-length (wildcard) request SSID matches unless the AP is part of a
 * multiple BSSID set without being the transmitted BSSID. Otherwise the
 * full SSID is compared when the AP has one of the same length, and the
 * short SSID is compared when the AP carries a valid one.
 */
static bool cfg80211_find_ssid_match(struct cfg80211_colocated_ap *ap,
                                     struct cfg80211_scan_request *request)
{
        int i;
        u32 s_ssid;

        for (i = 0; i < request->n_ssids; i++) {
                u32 req_len = request->ssids[i].ssid_len;

                /* wildcard ssid in the scan request */
                if (req_len == 0) {
                        if (!ap->multi_bss || ap->transmitted_bssid)
                                return true;
                        continue;
                }

                if (ap->ssid_len && ap->ssid_len == req_len) {
                        if (memcmp(request->ssids[i].ssid, ap->ssid,
                                   ap->ssid_len) == 0)
                                return true;
                        continue;
                }

                if (!ap->short_ssid_valid)
                        continue;

                s_ssid = ~crc32_le(~0, request->ssids[i].ssid, req_len);
                if (s_ssid == ap->short_ssid)
                        return true;
        }

        return false;
}

static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev)
{
        u8 i;
        struct cfg80211_colocated_ap *ap;
        int n_channels, count = 0, err;
        struct cfg80211_scan_request *request, *rdev_req = rdev->scan_req;
        LIST_HEAD(coloc_ap_list);
        bool need_scan_psc = true;
        const struct ieee80211_sband_iftype_data *iftd;
        size_t size, offs_ssids, offs_6ghz_params, offs_ies;

        rdev_req->scan_6ghz = true;

        if (!rdev->wiphy.bands[NL80211_BAND_6GHZ])
                return -EOPNOTSUPP;

        iftd = ieee80211_get_sband_iftype_data(rdev->wiphy.bands[NL80211_BAND_6GHZ],
                                               rdev_req->wdev->iftype);
        if (!iftd || !iftd->he_cap.has_he)
                return -EOPNOTSUPP;

        n_channels = rdev->wiphy.bands[NL80211_BAND_6GHZ]->n_channels;

        if (rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ) {
                struct cfg80211_internal_bss *intbss;

                spin_lock_bh(&rdev->bss_lock);
                list_for_each_entry(intbss, &rdev->bss_list, list) {
                        struct cfg80211_bss *res = &intbss->pub;
                        const struct cfg80211_bss_ies *ies;
                        const struct element *ssid_elem;
                        struct cfg80211_colocated_ap *entry;
                        u32 s_ssid_tmp;
                        int ret;

                        ies = rcu_access_pointer(res->ies);
                        count += cfg80211_parse_colocated_ap(ies,
                                                             &coloc_ap_list);

                        /* In case the scan request specified a specific BSSID
                         * and the BSS is found and operating on 6GHz band then
                         * add this AP to the collocated APs list.
                         * This is relevant for ML probe requests when the lower
                         * band APs have not been discovered.
                         */
                        if (is_broadcast_ether_addr(rdev_req->bssid) ||
                            !ether_addr_equal(rdev_req->bssid, res->bssid) ||
                            res->channel->band != NL80211_BAND_6GHZ)
                                continue;

                        ret = cfg80211_calc_short_ssid(ies, &ssid_elem,
                                                       &s_ssid_tmp);
                        if (ret)
                                continue;

                        entry = kzalloc(sizeof(*entry) + IEEE80211_MAX_SSID_LEN,
                                        GFP_ATOMIC);

                        if (!entry)
                                continue;

                        memcpy(entry->bssid, res->bssid, ETH_ALEN);
                        entry->short_ssid = s_ssid_tmp;
                        memcpy(entry->ssid, ssid_elem->data,
                               ssid_elem->datalen);
                        entry->ssid_len = ssid_elem->datalen;
                        entry->short_ssid_valid = true;
                        entry->center_freq = res->channel->center_freq;

                        list_add_tail(&entry->list, &coloc_ap_list);
                        count++;
                }
                spin_unlock_bh(&rdev->bss_lock);
        }

        size = struct_size(request, channels, n_channels);
        offs_ssids = size;
        size += sizeof(*request->ssids) * rdev_req->n_ssids;
        offs_6ghz_params = size;
        size += sizeof(*request->scan_6ghz_params) * count;
        offs_ies = size;
        size += rdev_req->ie_len;

        request = kzalloc(size, GFP_KERNEL);
        if (!request) {
                cfg80211_free_coloc_ap_list(&coloc_ap_list);
                return -ENOMEM;
        }

        *request = *rdev_req;
        request->n_channels = 0;
        request->n_6ghz_params = 0;
        if (rdev_req->n_ssids) {
                /*
                 * Add the ssids from the parent scan request to the new
                 * scan request, so the driver would be able to use them
                 * in its probe requests to discover hidden APs on PSC
                 * channels.
                 */
                request->ssids = (void *)request + offs_ssids;
                memcpy(request->ssids, rdev_req->ssids,
                       sizeof(*request->ssids) * request->n_ssids);
        }
        request->scan_6ghz_params = (void *)request + offs_6ghz_params;

        if (rdev_req->ie_len) {
                void *ie = (void *)request + offs_ies;

                memcpy(ie, rdev_req->ie, rdev_req->ie_len);
                request->ie = ie;
        }

        /*
         * PSC channels should not be scanned in case of direct scan with 1 SSID
         * and at least one of the reported co-located APs with same SSID
         * indicating that all APs in the same ESS are co-located
         */
        if (count && request->n_ssids == 1 && request->ssids[0].ssid_len) {
                list_for_each_entry(ap, &coloc_ap_list, list) {
                        if (ap->colocated_ess &&
                            cfg80211_find_ssid_match(ap, request)) {
                                need_scan_psc = false;
                                break;
                        }
                }
        }

        /*
         * add to the scan request the channels that need to be scanned
         * regardless of the collocated APs (PSC channels or all channels
         * in case that NL80211_SCAN_FLAG_COLOCATED_6GHZ is not set)
         */
        for (i = 0; i < rdev_req->n_channels; i++) {
                if (rdev_req->channels[i]->band == NL80211_BAND_6GHZ &&
                    ((need_scan_psc &&
                      cfg80211_channel_is_psc(rdev_req->channels[i])) ||
                     !(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ))) {
                        cfg80211_scan_req_add_chan(request,
                                                   rdev_req->channels[i],
                                                   false);
                }
        }

        if (!(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ))
                goto skip;

        list_for_each_entry(ap, &coloc_ap_list, list) {
                bool found = false;
                struct cfg80211_scan_6ghz_params *scan_6ghz_params =
                        &request->scan_6ghz_params[request->n_6ghz_params];
                struct ieee80211_channel *chan =
                        ieee80211_get_channel(&rdev->wiphy, ap->center_freq);

                if (!chan || chan->flags & IEEE80211_CHAN_DISABLED)
                        continue;

                for (i = 0; i < rdev_req->n_channels; i++) {
                        if (rdev_req->channels[i] == chan)
                                found = true;
                }

                if (!found)
                        continue;

                if (request->n_ssids > 0 &&
                    !cfg80211_find_ssid_match(ap, request))
                        continue;

                if (!is_broadcast_ether_addr(request->bssid) &&
                    !ether_addr_equal(request->bssid, ap->bssid))
                        continue;

                if (!request->n_ssids && ap->multi_bss && !ap->transmitted_bssid)
                        continue;

                cfg80211_scan_req_add_chan(request, chan, true);
                memcpy(scan_6ghz_params->bssid, ap->bssid, ETH_ALEN);
                scan_6ghz_params->short_ssid = ap->short_ssid;
                scan_6ghz_params->short_ssid_valid = ap->short_ssid_valid;
                scan_6ghz_params->unsolicited_probe = ap->unsolicited_probe;
                scan_6ghz_params->psd_20 = ap->psd_20;

                /*
                 * If a PSC channel is added to the scan and 'need_scan_psc' is
                 * set to false, then all the APs that the scan logic is
                 * interested with on the channel are collocated and thus there
                 * is no need to perform the initial PSC channel listen.
                 */
                if (cfg80211_channel_is_psc(chan) && !need_scan_psc)
                        scan_6ghz_params->psc_no_listen = true;

                request->n_6ghz_params++;
        }

skip:
        cfg80211_free_coloc_ap_list(&coloc_ap_list);

        if (request->n_channels) {
                struct cfg80211_scan_request *old = rdev->int_scan_req;

                rdev->int_scan_req = request;

                /*
                 * If this scan follows a previous scan, save the scan start
                 * info from the first part of the scan
                 */
                if (old)
                        rdev->int_scan_req->info = old->info;

                err = rdev_scan(rdev, request);
                if (err) {
                        rdev->int_scan_req = old;
                        kfree(request);
                } else {
                        kfree(old);
                }

                return err;
        }

        kfree(request);
        return -EINVAL;
}

int cfg80211_scan(struct cfg80211_registered_device *rdev)
{
        struct cfg80211_scan_request *request;
        struct cfg80211_scan_request *rdev_req = rdev->scan_req;
        u32 n_channels = 0, idx, i;

        if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ))
                return rdev_scan(rdev, rdev_req);

        for (i = 0; i < rdev_req->n_channels; i++) {
                if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ)
                        n_channels++;
        }

        if (!n_channels)
                return cfg80211_scan_6ghz(rdev);

        request = kzalloc(struct_size(request, channels, n_channels),
                          GFP_KERNEL);
        if (!request)
                return -ENOMEM;

        *request = *rdev_req;
        request->n_channels = n_channels;

        for (i = idx = 0; i < rdev_req->n_channels; i++) {
                if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ)
                        request->channels[idx++] = rdev_req->channels[i];
        }

        rdev_req->scan_6ghz = false;
        rdev->int_scan_req = request;
        return rdev_scan(rdev, request);
}

/*
 * Finish the scan tracked in rdev->scan_req. Must be called with the wiphy
 * mutex held.
 *
 * If a scan message from an earlier call is still pending in
 * rdev->scan_msg, it is sent and nothing else happens. Otherwise, for a
 * split scan whose first part finished without being aborted, the 6 GHz
 * part is started and completion is deferred. In all remaining cases the
 * SME is informed, stale BSS entries are flushed if requested, the scan
 * requests are freed and the netlink message is either sent
 * (@send_message) or parked in rdev->scan_msg.
 */
void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
                           bool send_message)
{
        struct cfg80211_scan_request *request, *rdev_req;
        struct wireless_dev *wdev;
        struct sk_buff *msg;
#ifdef CONFIG_CFG80211_WEXT
        union iwreq_data wrqu;
#endif

        lockdep_assert_held(&rdev->wiphy.mtx);

        if (rdev->scan_msg) {
                nl80211_send_scan_msg(rdev, rdev->scan_msg);
                rdev->scan_msg = NULL;
                return;
        }

        rdev_req = rdev->scan_req;
        if (!rdev_req)
                return;

        wdev = rdev_req->wdev;

        /* prefer the internal (split) request when there is one */
        request = rdev->int_scan_req;
        if (!request)
                request = rdev_req;

        if (wdev_running(wdev) &&
            (rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ) &&
            !rdev_req->scan_6ghz && !request->info.aborted) {
                /* 6 GHz part started: completion is reported later */
                if (cfg80211_scan_6ghz(rdev) == 0)
                        return;
        }

        /*
         * This must be before sending the other events!
         * Otherwise, wpa_supplicant gets completely confused with
         * wext events.
         */
        if (wdev->netdev)
                cfg80211_sme_scan_done(wdev->netdev);

        if ((request->flags & NL80211_SCAN_FLAG_FLUSH) &&
            !request->info.aborted) {
                /* flush entries from previous scans */
                spin_lock_bh(&rdev->bss_lock);
                __cfg80211_bss_expire(rdev, request->scan_start);
                spin_unlock_bh(&rdev->bss_lock);
        }

        msg = nl80211_build_scan_msg(rdev, wdev, request->info.aborted);

#ifdef CONFIG_CFG80211_WEXT
        if (wdev->netdev && !request->info.aborted) {
                memset(&wrqu, 0, sizeof(wrqu));

                wireless_send_event(wdev->netdev, SIOCGIWSCAN, &wrqu, NULL);
        }
#endif

        dev_put(wdev->netdev);

        kfree(rdev->int_scan_req);
        rdev->int_scan_req = NULL;

        kfree(rdev->scan_req);
        rdev->scan_req = NULL;

        if (send_message)
                nl80211_send_scan_msg(rdev, msg);
        else
                rdev->scan_msg = msg;
}

/*
 * Wrapper with a (wiphy, wiphy_work) signature around
 * ___cfg80211_scan_done(); the scan message is always sent
 * (send_message == true). @wk is not used.
 * NOTE(review): presumably the handler of rdev->scan_done_wk - confirm.
 */
void __cfg80211_scan_done(struct wiphy *wiphy, struct wiphy_work *wk)
{
        ___cfg80211_scan_done(wiphy_to_rdev(wiphy), true);
}

void cfg80211_scan_done(struct cfg80211_scan_request *request,
                        struct cfg80211_scan_info *info)
{
        struct cfg80211_scan_info old_info = request->info;

        trace_cfg80211_scan_done(request, info);
        WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req &&
                request != wiphy_to_rdev(request->wiphy)->int_scan_req);

        request->info = *info;

        /*
         * In case the scan is split, the scan_start_tsf and tsf_bssid should
         * be of the first part. In such a case old_info.scan_start_tsf should
         * be non zero.
         */
        if (request->scan_6ghz && old_info.scan_start_tsf) {
                request->info.scan_start_tsf = old_info.scan_start_tsf;
                memcpy(request->info.tsf_bssid, old_info.tsf_bssid,
                       sizeof(request->info.tsf_bssid));
        }

        request->notified = true;
        wiphy_work_queue(request->wiphy,
                         &wiphy_to_rdev(request->wiphy)->scan_done_wk);
}
EXPORT_SYMBOL(cfg80211_scan_done);

/*
 * Add @req to the device's list of scheduled scan requests.
 * The wiphy mutex must be held; the list is RCU-protected, so the entry is
 * inserted with list_add_rcu().
 */
void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev,
                                 struct cfg80211_sched_scan_request *req)
{
        lockdep_assert_held(&rdev->wiphy.mtx);

        list_add_rcu(&req->list, &rdev->sched_scan_req_list);
}

/*
 * Unlink @req from the scheduled scan request list and free it.
 * The wiphy mutex must be held. The memory is released through
 * kfree_rcu(), i.e. only after an RCU grace period.
 */
static void cfg80211_del_sched_scan_req(struct cfg80211_registered_device *rdev,
                                        struct cfg80211_sched_scan_request *req)
{
        lockdep_assert_held(&rdev->wiphy.mtx);

        list_del_rcu(&req->list);
        kfree_rcu(req, rcu_head);
}

/*
 * Look up the scheduled scan request with id @reqid.
 *
 * The list is walked with the RCU iterator; holding the wiphy mutex is
 * accepted as an alternative to being inside an RCU read-side section.
 * Returns the first matching request or NULL.
 */
static struct cfg80211_sched_scan_request *
cfg80211_find_sched_scan_req(struct cfg80211_registered_device *rdev, u64 reqid)
{
        struct cfg80211_sched_scan_request *req;
        struct cfg80211_sched_scan_request *match = NULL;

        list_for_each_entry_rcu(req, &rdev->sched_scan_req_list, list,
                                lockdep_is_held(&rdev->wiphy.mtx)) {
                if (req->reqid != reqid)
                        continue;

                match = req;
                break;
        }

        return match;
}

/*
 * Decide whether another scheduled scan request can be accepted.
 *
 * While a legacy scheduled scan is running, no further scheduled scan is
 * allowed, no matter whether the new request is legacy or multi-support.
 * While multi-support scheduled scans are running, a legacy request is
 * refused; a multi-support request is accepted as long as the limit in
 * struct wiphy::max_sched_scan_reqs has not been reached.
 *
 * Returns 0 when the request is possible, -EINPROGRESS when it conflicts
 * with a running scheduled scan, -ENOSPC when the limit is reached.
 */
int cfg80211_sched_scan_req_possible(struct cfg80211_registered_device *rdev,
                                     bool want_multi)
{
        struct cfg80211_sched_scan_request *pos;
        int n_active = 0;

        list_for_each_entry(pos, &rdev->sched_scan_req_list, list) {
                /*
                 * request id zero means legacy in progress; only the first
                 * list entry is tested for it
                 */
                if (n_active == 0 && pos->reqid == 0)
                        return -EINPROGRESS;
                n_active++;
        }

        if (n_active == 0)
                return 0;

        /* no legacy allowed when multi request(s) are active */
        if (!want_multi)
                return -EINPROGRESS;

        /* resource limit reached */
        if (n_active == rdev->wiphy.max_sched_scan_reqs)
                return -ENOSPC;

        return 0;
}

/*
 * Work handler: send NL80211_CMD_SCHED_SCAN_RESULTS for every scheduled
 * scan request that was flagged via report_results, clearing the flag.
 * For requests with NL80211_SCAN_FLAG_FLUSH, BSS entries older than the
 * request's scan_start are expired first and scan_start is reset to now.
 * Runs with the wiphy locked.
 */
void cfg80211_sched_scan_results_wk(struct work_struct *work)
{
        struct cfg80211_sched_scan_request *req, *next;
        struct cfg80211_registered_device *rdev =
                container_of(work, struct cfg80211_registered_device,
                             sched_scan_res_wk);

        wiphy_lock(&rdev->wiphy);

        list_for_each_entry_safe(req, next, &rdev->sched_scan_req_list, list) {
                if (!req->report_results)
                        continue;

                req->report_results = false;

                if (req->flags & NL80211_SCAN_FLAG_FLUSH) {
                        /* flush entries from previous scans */
                        spin_lock_bh(&rdev->bss_lock);
                        __cfg80211_bss_expire(rdev, req->scan_start);
                        spin_unlock_bh(&rdev->bss_lock);
                        req->scan_start = jiffies;
                }

                nl80211_send_sched_scan(req, NL80211_CMD_SCHED_SCAN_RESULTS);
        }

        wiphy_unlock(&rdev->wiphy);
}

/*
 * Note that results are available for the scheduled scan @reqid and queue
 * the work item that reports them. Unknown request ids are ignored.
 */
void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct cfg80211_sched_scan_request *req;

        trace_cfg80211_sched_scan_results(wiphy, reqid);

        rcu_read_lock();

        req = cfg80211_find_sched_scan_req(rdev, reqid);
        /* ignore if we're not scanning */
        if (!req)
                goto unlock;

        req->report_results = true;
        queue_work(cfg80211_wq, &rdev->sched_scan_res_wk);

unlock:
        rcu_read_unlock();
}
EXPORT_SYMBOL(cfg80211_sched_scan_results);

/*
 * Handle a scheduled scan with id @reqid having stopped; the caller must
 * already hold the wiphy mutex. The stop is treated as driver initiated.
 */
void cfg80211_sched_scan_stopped_locked(struct wiphy *wiphy, u64 reqid)
{
        lockdep_assert_held(&wiphy->mtx);

        trace_cfg80211_sched_scan_stopped(wiphy, reqid);

        /* last argument: driver_initiated */
        __cfg80211_stop_sched_scan(wiphy_to_rdev(wiphy), reqid, true);
}
EXPORT_SYMBOL(cfg80211_sched_scan_stopped_locked);

/*
 * Same as cfg80211_sched_scan_stopped_locked(), for callers that do not
 * hold the wiphy mutex: the wiphy is locked around the call.
 */
void cfg80211_sched_scan_stopped(struct wiphy *wiphy, u64 reqid)
{
        wiphy_lock(wiphy);
        cfg80211_sched_scan_stopped_locked(wiphy, reqid);
        wiphy_unlock(wiphy);
}
EXPORT_SYMBOL(cfg80211_sched_scan_stopped);

/*
 * Stop the scheduled scan @req and drop it from the request list.
 * The wiphy mutex must be held.
 *
 * Unless the stop was @driver_initiated, the driver is asked to stop the
 * scan first; if that fails, its error is returned and @req is left alone.
 * Otherwise NL80211_CMD_SCHED_SCAN_STOPPED is sent, the request is deleted
 * and 0 is returned.
 */
int cfg80211_stop_sched_scan_req(struct cfg80211_registered_device *rdev,
                                 struct cfg80211_sched_scan_request *req,
                                 bool driver_initiated)
{
        int err = 0;

        lockdep_assert_held(&rdev->wiphy.mtx);

        if (!driver_initiated)
                err = rdev_sched_scan_stop(rdev, req->dev, req->reqid);
        if (err)
                return err;

        nl80211_send_sched_scan(req, NL80211_CMD_SCHED_SCAN_STOPPED);

        cfg80211_del_sched_scan_req(rdev, req);

        return 0;
}

/*
 * Stop the scheduled scan identified by @reqid. The wiphy mutex must be
 * held.
 *
 * Returns -ENOENT when no such request exists, otherwise the result of
 * cfg80211_stop_sched_scan_req().
 */
int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev,
                               u64 reqid, bool driver_initiated)
{
        struct cfg80211_sched_scan_request *req;

        lockdep_assert_held(&rdev->wiphy.mtx);

        req = cfg80211_find_sched_scan_req(rdev, reqid);
        if (req)
                return cfg80211_stop_sched_scan_req(rdev, req,
                                                    driver_initiated);

        return -ENOENT;
}

void cfg80211_bss_age(struct cfg80211_registered_device *rdev,
                      unsigned long age_secs)
{
        struct cfg80211_internal_bss *bss;
        unsigned long age_jiffies = msecs_to_jiffies(age_secs * MSEC_PER_SEC);

        spin_lock_bh(&rdev->bss_lock);
        list_for_each_entry(bss, &rdev->bss_list, list)
                bss->ts -= age_jiffies;
        spin_unlock_bh(&rdev->bss_lock);
}

/*
 * Expire BSS entries using a cut-off of IEEE80211_SCAN_RESULT_EXPIRE
 * jiffies before now. This function does not take rdev->bss_lock itself.
 * NOTE(review): presumably callers must hold bss_lock, as the other users
 * of __cfg80211_bss_expire() here do - confirm.
 */
void cfg80211_bss_expire(struct cfg80211_registered_device *rdev)
{
        __cfg80211_bss_expire(rdev, jiffies - IEEE80211_SCAN_RESULT_EXPIRE);
}

/*
 * Expire the BSS entries of @wiphy with the current time as the cut-off,
 * under the BSS lock.
 */
void cfg80211_bss_flush(struct wiphy *wiphy)
{
        struct cfg80211_registered_device *rdev;

        rdev = wiphy_to_rdev(wiphy);

        spin_lock_bh(&rdev->bss_lock);
        __cfg80211_bss_expire(rdev, jiffies);
        spin_unlock_bh(&rdev->bss_lock);
}
EXPORT_SYMBOL(cfg80211_bss_flush);

/*
 * Find the first element with id @eid in the buffer @ies of @len bytes
 * whose data holds the @match_len bytes of @match at offset @match_offset.
 *
 * Elements too short to contain the match region are skipped. Returns the
 * matching element or NULL.
 */
const struct element *
cfg80211_find_elem_match(u8 eid, const u8 *ies, unsigned int len,
                         const u8 *match, unsigned int match_len,
                         unsigned int match_offset)
{
        const struct element *elem;

        for_each_element_id(elem, eid, ies, len) {
                if (elem->datalen < match_offset + match_len)
                        continue;

                if (memcmp(elem->data + match_offset, match, match_len) == 0)
                        return elem;
        }

        return NULL;
}
EXPORT_SYMBOL(cfg80211_find_elem_match);

/*
 * Find the first vendor specific element in @ies whose data starts with
 * the three OUI bytes of @oui and, when @oui_type is not negative, the
 * type byte @oui_type.
 *
 * Warns and returns NULL when @oui_type is larger than 0xff. Elements with
 * fewer than 4 data bytes are never returned.
 */
const struct element *cfg80211_find_vendor_elem(unsigned int oui, int oui_type,
                                                const u8 *ies,
                                                unsigned int len)
{
        const struct element *elem;
        u8 match[] = { oui >> 16, oui >> 8, oui, oui_type };
        int match_len = sizeof(match);

        if (WARN_ON(oui_type > 0xff))
                return NULL;

        /* a negative type means "any type": compare the OUI bytes only */
        if (oui_type < 0)
                match_len = 3;

        elem = cfg80211_find_elem_match(WLAN_EID_VENDOR_SPECIFIC, ies, len,
                                        match, match_len, 0);

        if (elem && elem->datalen >= 4)
                return elem;

        return NULL;
}
EXPORT_SYMBOL(cfg80211_find_vendor_elem);

/**
 * enum bss_compare_mode - BSS compare mode
 * @BSS_CMP_REGULAR: regular compare mode (for insertion and normal find);
 *      SSIDs are ordered by length first, then by contents
 * @BSS_CMP_HIDE_ZLEN: find hidden SSID with zero-length mode; the entry
 *      being looked at matches when its SSID element has length zero
 * @BSS_CMP_HIDE_NUL: find hidden SSID with NUL-ed out mode; the entry being
 *      looked at matches when its SSID has the same length and consists of
 *      zero bytes only
 */
enum bss_compare_mode {
        BSS_CMP_REGULAR,
        BSS_CMP_HIDE_ZLEN,
        BSS_CMP_HIDE_NUL,
};

/*
 * Three-way comparison of two BSS entries; 0 means "same BSS" under @mode.
 *
 * The keys are compared in this order:
 *  1. channel: when the channel pointers differ, the result is b's
 *     frequency minus a's (center_freq * 1000 + freq_offset);
 *  2. IE presence: an entry without IEs sorts first (@a checked first);
 *  3. mesh: when both entries carry a Mesh ID and a Mesh Config element,
 *     the result is decided by Mesh ID, then Mesh Config, and nothing else;
 *  4. BSSID (memcmp);
 *  5. SSID, according to @mode (see enum bss_compare_mode).
 */
static int cmp_bss(struct cfg80211_bss *a,
                   struct cfg80211_bss *b,
                   enum bss_compare_mode mode)
{
        const struct cfg80211_bss_ies *a_ies, *b_ies;
        const u8 *ie1 = NULL;
        const u8 *ie2 = NULL;
        int i, r;

        if (a->channel != b->channel)
                return (b->channel->center_freq * 1000 + b->channel->freq_offset) -
                       (a->channel->center_freq * 1000 + a->channel->freq_offset);

        a_ies = rcu_access_pointer(a->ies);
        if (!a_ies)
                return -1;
        b_ies = rcu_access_pointer(b->ies);
        if (!b_ies)
                return 1;

        /*
         * The Mesh ID is only looked up for entries whose capability
         * satisfies WLAN_CAPABILITY_IS_STA_BSS(); otherwise ie1/ie2 stay
         * NULL and the mesh comparison below is skipped.
         */
        if (WLAN_CAPABILITY_IS_STA_BSS(a->capability))
                ie1 = cfg80211_find_ie(WLAN_EID_MESH_ID,
                                       a_ies->data, a_ies->len);
        if (WLAN_CAPABILITY_IS_STA_BSS(b->capability))
                ie2 = cfg80211_find_ie(WLAN_EID_MESH_ID,
                                       b_ies->data, b_ies->len);
        if (ie1 && ie2) {
                int mesh_id_cmp;

                /* ie[1] is the element length, ie + 2 the element data */
                if (ie1[1] == ie2[1])
                        mesh_id_cmp = memcmp(ie1 + 2, ie2 + 2, ie1[1]);
                else
                        mesh_id_cmp = ie2[1] - ie1[1];

                /*
                 * The Mesh ID result is only used when both entries also
                 * have a Mesh Config element; otherwise fall through to the
                 * BSSID/SSID comparison.
                 */
                ie1 = cfg80211_find_ie(WLAN_EID_MESH_CONFIG,
                                       a_ies->data, a_ies->len);
                ie2 = cfg80211_find_ie(WLAN_EID_MESH_CONFIG,
                                       b_ies->data, b_ies->len);
                if (ie1 && ie2) {
                        if (mesh_id_cmp)
                                return mesh_id_cmp;
                        if (ie1[1] != ie2[1])
                                return ie2[1] - ie1[1];
                        return memcmp(ie1 + 2, ie2 + 2, ie1[1]);
                }
        }

        r = memcmp(a->bssid, b->bssid, sizeof(a->bssid));
        if (r)
                return r;

        ie1 = cfg80211_find_ie(WLAN_EID_SSID, a_ies->data, a_ies->len);
        ie2 = cfg80211_find_ie(WLAN_EID_SSID, b_ies->data, b_ies->len);

        if (!ie1 && !ie2)
                return 0;

        /*
         * Note that with "hide_ssid", the function returns a match if
         * the already-present BSS ("b") is a hidden SSID beacon for
         * the new BSS ("a").
         */

        /* sort missing IE before (left of) present IE */
        if (!ie1)
                return -1;
        if (!ie2)
                return 1;

        switch (mode) {
        case BSS_CMP_HIDE_ZLEN:
                /*
                 * In ZLEN mode we assume the BSS entry we're
                 * looking for has a zero-length SSID. So if
                 * the one we're looking at right now has that,
                 * return 0. Otherwise, return the difference
                 * in length, but since we're looking for the
                 * 0-length it's really equivalent to returning
                 * the length of the one we're looking at.
                 *
                 * No content comparison is needed as we assume
                 * the content length is zero.
                 */
                return ie2[1];
        case BSS_CMP_REGULAR:
        default:
                /* sort by length first, then by contents */
                if (ie1[1] != ie2[1])
                        return ie2[1] - ie1[1];
                return memcmp(ie1 + 2, ie2 + 2, ie1[1]);
        case BSS_CMP_HIDE_NUL:
                if (ie1[1] != ie2[1])
                        return ie2[1] - ie1[1];
                /* this is equivalent to memcmp(zeroes, ie2 + 2, len) */
                for (i = 0; i < ie2[1]; i++)
                        if (ie2[i + 2])
                                return -1;
                return 0;
        }
}

/*
 * Check whether the capability field @capability of a BSS on @band
 * describes a BSS of type @bss_type.
 *
 * IEEE80211_BSS_TYPE_ANY always matches. On the 60 GHz band the DMG type
 * bits are compared, on every other band the ESS/IBSS bits. A type that
 * has no encoding on the given band never matches.
 */
static bool cfg80211_bss_type_match(u16 capability,
                                    enum nl80211_band band,
                                    enum ieee80211_bss_type bss_type)
{
        u16 mask, val;

        if (bss_type == IEEE80211_BSS_TYPE_ANY)
                return true;

        if (band != NL80211_BAND_60GHZ) {
                mask = WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS;
                switch (bss_type) {
                case IEEE80211_BSS_TYPE_ESS:
                        val = WLAN_CAPABILITY_ESS;
                        break;
                case IEEE80211_BSS_TYPE_IBSS:
                        val = WLAN_CAPABILITY_IBSS;
                        break;
                case IEEE80211_BSS_TYPE_MBSS:
                        /* neither the ESS nor the IBSS bit is set */
                        val = 0;
                        break;
                default:
                        return false;
                }
        } else {
                mask = WLAN_CAPABILITY_DMG_TYPE_MASK;
                switch (bss_type) {
                case IEEE80211_BSS_TYPE_ESS:
                        val = WLAN_CAPABILITY_DMG_TYPE_AP;
                        break;
                case IEEE80211_BSS_TYPE_PBSS:
                        val = WLAN_CAPABILITY_DMG_TYPE_PBSS;
                        break;
                case IEEE80211_BSS_TYPE_IBSS:
                        val = WLAN_CAPABILITY_DMG_TYPE_IBSS;
                        break;
                default:
                        return false;
                }
        }

        return (capability & mask) == val;
}

/*
 * Find the first entry in the device's BSS list that satisfies every filter.
 *
 * An entry is skipped when its type does not match @bss_type, its privacy
 * capability bit contradicts @privacy, @channel is non-NULL and differs from
 * the entry's channel, its BSSID is not a valid ethernet address, it lacks
 * any of the @use_for bits, or it is older than IEEE80211_SCAN_RESULT_EXPIRE
 * and not held. The remaining candidates are tested with is_bss() against
 * @bssid/@ssid/@ssid_len.
 *
 * The list is walked under rdev->bss_lock. The returned bss is reference
 * counted and must be cleaned up appropriately; NULL is returned when no
 * entry matches.
 */
struct cfg80211_bss *__cfg80211_get_bss(struct wiphy *wiphy,
                                        struct ieee80211_channel *channel,
                                        const u8 *bssid,
                                        const u8 *ssid, size_t ssid_len,
                                        enum ieee80211_bss_type bss_type,
                                        enum ieee80211_privacy privacy,
                                        u32 use_for)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct cfg80211_internal_bss *cur;
        struct cfg80211_internal_bss *match = NULL;
        unsigned long now = jiffies;

        trace_cfg80211_get_bss(wiphy, channel, bssid, ssid, ssid_len, bss_type,
                               privacy);

        spin_lock_bh(&rdev->bss_lock);

        list_for_each_entry(cur, &rdev->bss_list, list) {
                int has_privacy;

                if (!cfg80211_bss_type_match(cur->pub.capability,
                                             cur->pub.channel->band, bss_type))
                        continue;

                has_privacy = cur->pub.capability & WLAN_CAPABILITY_PRIVACY;
                if (privacy == IEEE80211_PRIVACY_ON && !has_privacy)
                        continue;
                if (privacy == IEEE80211_PRIVACY_OFF && has_privacy)
                        continue;

                if (channel && cur->pub.channel != channel)
                        continue;

                if (!is_valid_ether_addr(cur->pub.bssid))
                        continue;

                if ((cur->pub.use_for & use_for) != use_for)
                        continue;

                /* expired entries are only usable while somebody holds them */
                if (time_after(now, cur->ts + IEEE80211_SCAN_RESULT_EXPIRE) &&
                    !atomic_read(&cur->hold))
                        continue;

                if (!is_bss(&cur->pub, bssid, ssid, ssid_len))
                        continue;

                match = cur;
                bss_ref_get(rdev, match);
                break;
        }

        spin_unlock_bh(&rdev->bss_lock);

        if (match) {
                trace_cfg80211_return_bss(&match->pub);
                return &match->pub;
        }

        return NULL;
}
EXPORT_SYMBOL(__cfg80211_get_bss);

/*
 * Insert @bss into the device's BSS rb-tree, ordered by
 * cmp_bss(..., BSS_CMP_REGULAR).
 *
 * If an entry comparing equal is already present, a warning is emitted and
 * @bss is left out of the tree.
 */
static void rb_insert_bss(struct cfg80211_registered_device *rdev,
                          struct cfg80211_internal_bss *bss)
{
        struct rb_node **link = &rdev->bss_tree.rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct cfg80211_internal_bss *other;
                int order;

                parent = *link;
                other = rb_entry(parent, struct cfg80211_internal_bss, rbn);
                order = cmp_bss(&bss->pub, &other->pub, BSS_CMP_REGULAR);

                if (WARN_ON(!order)) {
                        /* will sort of leak this BSS */
                        return;
                }

                link = (order < 0) ? &(*link)->rb_left : &(*link)->rb_right;
        }

        rb_link_node(&bss->rbn, parent, link);
        rb_insert_color(&bss->rbn, &rdev->bss_tree);
}

/*
 * Search the device's BSS rb-tree for an entry that compares equal to @res
 * under the given comparison @mode.
 *
 * Returns the matching entry, or NULL if the descent reaches a leaf without
 * finding one.
 */
static struct cfg80211_internal_bss *
rb_find_bss(struct cfg80211_registered_device *rdev,
            struct cfg80211_internal_bss *res,
            enum bss_compare_mode mode)
{
        struct rb_node *node = rdev->bss_tree.rb_node;

        while (node) {
                struct cfg80211_internal_bss *cand =
                        rb_entry(node, struct cfg80211_internal_bss, rbn);
                int order = cmp_bss(&res->pub, &cand->pub, mode);

                if (!order)
                        return cand;

                node = (order < 0) ? node->rb_left : node->rb_right;
        }

        return NULL;
}

/*
 * Group already-known probe-response entries under a newly created
 * hidden-SSID beacon entry.
 *
 * @new must carry beacon IEs. If those IEs contain no SSID element, or the
 * SSID holds any non-zero byte, there is nothing to combine. Otherwise every
 * entry on rdev->bss_list with the same BSSID and channel that has no beacon
 * IEs of its own, has an SSID element in its IEs and (when the hidden SSID
 * has a non-zero length) the same SSID length is:
 *   - linked into new->hidden_list,
 *   - pointed at @new through pub.hidden_beacon_bss,
 *   - accounted for by adding its refcount to @new's refcount,
 *   - given @new's beacon IEs.
 *
 * The walk is also used to cross-check rdev->bss_entries against the real
 * list length.
 *
 * Returns false only when @new has no beacon IEs (a warning is raised as
 * well); true in every other case.
 */
static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev,
                                   struct cfg80211_internal_bss *new)
{
        const struct cfg80211_bss_ies *ies;
        struct cfg80211_internal_bss *bss;
        const u8 *ie;
        int i, ssidlen;
        u8 fold = 0;
        u32 n_entries = 0;

        ies = rcu_access_pointer(new->pub.beacon_ies);
        if (WARN_ON(!ies))
                return false;

        ie = cfg80211_find_ie(WLAN_EID_SSID, ies->data, ies->len);
        if (!ie) {
                /* nothing to do */
                return true;
        }

        /* OR all SSID bytes together: the result is zero only for a hidden SSID */
        ssidlen = ie[1];
        for (i = 0; i < ssidlen; i++)
                fold |= ie[2 + i];

        if (fold) {
                /* not a hidden SSID */
                return true;
        }

        /* This is the bad part ... */

        list_for_each_entry(bss, &rdev->bss_list, list) {
                /*
                 * we're iterating all the entries anyway, so take the
                 * opportunity to validate the list length accounting
                 */
                n_entries++;

                if (!ether_addr_equal(bss->pub.bssid, new->pub.bssid))
                        continue;
                if (bss->pub.channel != new->pub.channel)
                        continue;
                /* only entries without their own beacon IEs are candidates */
                if (rcu_access_pointer(bss->pub.beacon_ies))
                        continue;
                ies = rcu_access_pointer(bss->pub.ies);
                if (!ies)
                        continue;
                ie = cfg80211_find_ie(WLAN_EID_SSID, ies->data, ies->len);
                if (!ie)
                        continue;
                /* a zero-length hidden SSID matches any SSID length */
                if (ssidlen && ie[1] != ssidlen)
                        continue;
                if (WARN_ON_ONCE(bss->pub.hidden_beacon_bss))
                        continue;
                /* unexpected: entry is still on some hidden list, unlink it first */
                if (WARN_ON_ONCE(!list_empty(&bss->hidden_list)))
                        list_del(&bss->hidden_list);
                /* combine them */
                list_add(&bss->hidden_list, &new->hidden_list);
                bss->pub.hidden_beacon_bss = &new->pub;
                new->refcount += bss->refcount;
                rcu_assign_pointer(bss->pub.beacon_ies,
                                   new->pub.beacon_ies);
        }

        WARN_ONCE(n_entries != rdev->bss_entries,
                  "rdev bss entries[%d]/list[len:%d] corruption\n",
                  rdev->bss_entries, n_entries);

        return true;
}

/*
 * Propagate new beacon IEs to every entry grouped under @known.
 *
 * Each entry on known->hidden_list gets its beacon_ies pointer replaced by
 * @new_ies. A warning is raised for any entry whose current beacon IEs are
 * not @old_ies; the pointer is replaced regardless.
 */
static void cfg80211_update_hidden_bsses(struct cfg80211_internal_bss *known,
                                         const struct cfg80211_bss_ies *new_ies,
                                         const struct cfg80211_bss_ies *old_ies)
{
        struct cfg80211_internal_bss *member;

        list_for_each_entry(member, &known->hidden_list, hidden_list) {
                const struct cfg80211_bss_ies *current_ies =
                        rcu_access_pointer(member->pub.beacon_ies);

                WARN_ON(current_ies != old_ies);

                rcu_assign_pointer(member->pub.beacon_ies, new_ies);
        }
}

/*
 * Flag a BSS whose probe responses keep repeating the same Extended Channel
 * Switch Announcement.
 *
 * @known is the entry whose proberesp_ies were just replaced; @old are the
 * probe response IEs it had before. pub.proberesp_ecsa_stuck is set to 1 when
 * all of the following hold:
 *   - the new and old IEs are at least USEC_PER_SEC apart by their tsf values,
 *   - both contain an Extended Channel Switch Announcement element,
 *   - the beacon IEs (if there are any) do not contain that element,
 *   - the two elements have the same length, are at least
 *     sizeof(struct ieee80211_ext_chansw_ie) long and are byte-identical,
 *   - the announcement's mode field is non-zero,
 *   - the announced channel number equals the channel number of the BSS's
 *     current center frequency.
 *
 * The RCU dereferences are annotated as protected by rdev->bss_lock.
 */
static void cfg80211_check_stuck_ecsa(struct cfg80211_registered_device *rdev,
                                      struct cfg80211_internal_bss *known,
                                      const struct cfg80211_bss_ies *old)
{
        const struct ieee80211_ext_chansw_ie *ecsa;
        const struct element *elem_new, *elem_old;
        const struct cfg80211_bss_ies *new, *bcn;

        /* already flagged, nothing more to learn */
        if (known->pub.proberesp_ecsa_stuck)
                return;

        new = rcu_dereference_protected(known->pub.proberesp_ies,
                                        lockdep_is_held(&rdev->bss_lock));
        if (WARN_ON(!new))
                return;

        /* the two probe responses are too close together to draw a conclusion */
        if (new->tsf - old->tsf < USEC_PER_SEC)
                return;

        elem_old = cfg80211_find_elem(WLAN_EID_EXT_CHANSWITCH_ANN,
                                      old->data, old->len);
        if (!elem_old)
                return;

        elem_new = cfg80211_find_elem(WLAN_EID_EXT_CHANSWITCH_ANN,
                                      new->data, new->len);
        if (!elem_new)
                return;

        /* if the beacon carries the announcement as well, do not flag it */
        bcn = rcu_dereference_protected(known->pub.beacon_ies,
                                        lockdep_is_held(&rdev->bss_lock));
        if (bcn &&
            cfg80211_find_elem(WLAN_EID_EXT_CHANSWITCH_ANN,
                               bcn->data, bcn->len))
                return;

        /* the announcement must be unchanged and long enough to parse */
        if (elem_new->datalen != elem_old->datalen)
                return;
        if (elem_new->datalen < sizeof(struct ieee80211_ext_chansw_ie))
                return;
        if (memcmp(elem_new->data, elem_old->data, elem_new->datalen))
                return;

        ecsa = (void *)elem_new->data;

        if (!ecsa->mode)
                return;

        /* only flag announcements that target the channel the BSS is already on */
        if (ecsa->new_ch_num !=
            ieee80211_frequency_to_channel(known->pub.channel->center_freq))
                return;

        known->pub.proberesp_ecsa_stuck = 1;
}

/*
 * Merge freshly received information from @new into the existing entry
 * @known. Must be called with rdev->bss_lock held.
 *
 * Probe response IEs from @new replace those of @known and also become its
 * pub.ies; the previous probe response IEs are checked for a stuck ECSA and
 * then freed after an RCU grace period. Beacon IEs from @new replace those of
 * @known, are propagated to all entries on known->hidden_list, and become
 * pub.ies only if pub.ies previously pointed at the old beacon IEs.
 *
 * The remaining per-frame fields (beacon interval, capability, timestamps,
 * chain information, parent BSSID, MBSSID indices, cannot_use_reasons) are
 * copied from @new. The signal is copied only if @signal_valid is set, and
 * use_for is narrowed to the bits present in both entries.
 *
 * Returns false, after freeing @new's beacon IEs, when @known is a probe
 * response member of a hidden-SSID group but @new carries beacon IEs;
 * returns true otherwise.
 */
static bool
cfg80211_update_known_bss(struct cfg80211_registered_device *rdev,
                          struct cfg80211_internal_bss *known,
                          struct cfg80211_internal_bss *new,
                          bool signal_valid)
{
        lockdep_assert_held(&rdev->bss_lock);

        /* Update IEs */
        if (rcu_access_pointer(new->pub.proberesp_ies)) {
                const struct cfg80211_bss_ies *old;

                old = rcu_access_pointer(known->pub.proberesp_ies);

                rcu_assign_pointer(known->pub.proberesp_ies,
                                   new->pub.proberesp_ies);
                /* Override possible earlier Beacon frame IEs */
                rcu_assign_pointer(known->pub.ies,
                                   new->pub.proberesp_ies);
                if (old) {
                        /* compare old and new before the old IEs are released */
                        cfg80211_check_stuck_ecsa(rdev, known, old);
                        kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head);
                }
        }

        if (rcu_access_pointer(new->pub.beacon_ies)) {
                const struct cfg80211_bss_ies *old;

                if (known->pub.hidden_beacon_bss &&
                    !list_empty(&known->hidden_list)) {
                        const struct cfg80211_bss_ies *f;

                        /* The known BSS struct is one of the probe
                         * response members of a group, but we're
                         * receiving a beacon (beacon_ies in the new
                         * bss is used). This can only mean that the
                         * AP changed its beacon from not having an
                         * SSID to showing it, which is confusing so
                         * drop this information.
                         */

                        f = rcu_access_pointer(new->pub.beacon_ies);
                        kfree_rcu((struct cfg80211_bss_ies *)f, rcu_head);
                        return false;
                }

                old = rcu_access_pointer(known->pub.beacon_ies);

                rcu_assign_pointer(known->pub.beacon_ies, new->pub.beacon_ies);

                /* Override IEs if they were from a beacon before */
                if (old == rcu_access_pointer(known->pub.ies))
                        rcu_assign_pointer(known->pub.ies, new->pub.beacon_ies);

                /* entries grouped under this beacon share its beacon IEs */
                cfg80211_update_hidden_bsses(known,
                                             rcu_access_pointer(new->pub.beacon_ies),
                                             old);

                if (old)
                        kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head);
        }

        known->pub.beacon_interval = new->pub.beacon_interval;

        /* don't update the signal if beacon was heard on
         * adjacent channel.
         */
        if (signal_valid)
                known->pub.signal = new->pub.signal;
        known->pub.capability = new->pub.capability;
        known->ts = new->ts;
        known->ts_boottime = new->ts_boottime;
        known->parent_tsf = new->parent_tsf;
        known->pub.chains = new->pub.chains;
        memcpy(known->pub.chain_signal, new->pub.chain_signal,
               IEEE80211_MAX_CHAINS);
        ether_addr_copy(known->parent_bssid, new->parent_bssid);
        known->pub.max_bssid_indicator = new->pub.max_bssid_indicator;
        known->pub.bssid_index = new->pub.bssid_index;
        /* keep only the uses permitted by both the old and the new report */
        known->pub.use_for &= new->pub.use_for;
        known->pub.cannot_use_reasons = new->pub.cannot_use_reasons;

        return true;
}

/*
 * Insert the BSS described by @tmp into the device's BSS database, or update
 * the entry that already exists for it.
 *
 * Both callers visible in this file invoke this with rdev->bss_lock held, and
 * cfg80211_update_known_bss() asserts it; the allocation here uses GFP_ATOMIC.
 *
 * @tmp: template entry; its channel and pub.ies must be set. Its IE pointers
 *       are taken over by the database entry on success.
 * @signal_valid: forwarded to cfg80211_update_known_bss() for existing entries
 * @ts: timestamp stored in the entry
 *
 * If an entry comparing equal under BSS_CMP_REGULAR exists it is updated in
 * place. Otherwise a copy of @tmp is allocated, grouped with a matching
 * hidden-SSID beacon entry (for probe responses) or with matching probe
 * response entries (for beacons), and added to the list and rb-tree. If the
 * entry limit is reached and no old entry can be expired, the new entry is
 * dropped.
 *
 * Returned bss is reference counted and must be cleaned up appropriately.
 * NULL is returned on any failure; on the free_ies path the beacon and probe
 * response IEs attached to @tmp are freed.
 */
static struct cfg80211_internal_bss *
__cfg80211_bss_update(struct cfg80211_registered_device *rdev,
                      struct cfg80211_internal_bss *tmp,
                      bool signal_valid, unsigned long ts)
{
        struct cfg80211_internal_bss *found = NULL;
        struct cfg80211_bss_ies *ies;

        if (WARN_ON(!tmp->pub.channel))
                goto free_ies;

        tmp->ts = ts;

        if (WARN_ON(!rcu_access_pointer(tmp->pub.ies)))
                goto free_ies;

        found = rb_find_bss(rdev, tmp, BSS_CMP_REGULAR);

        if (found) {
                if (!cfg80211_update_known_bss(rdev, found, tmp, signal_valid))
                        return NULL;
        } else {
                struct cfg80211_internal_bss *new;
                struct cfg80211_internal_bss *hidden;

                /*
                 * create a copy -- the "tmp" entry that is passed in is
                 * typically allocated on the caller's stack since a
                 * separate allocation is not needed in the more common
                 * case of an update
                 */
                new = kzalloc(sizeof(*new) + rdev->wiphy.bss_priv_size,
                              GFP_ATOMIC);
                if (!new)
                        goto free_ies;
                memcpy(new, tmp, sizeof(*new));
                new->refcount = 1;
                INIT_LIST_HEAD(&new->hidden_list);
                INIT_LIST_HEAD(&new->pub.nontrans_list);
                /* we'll set this later if it was non-NULL */
                new->pub.transmitted_bss = NULL;

                if (rcu_access_pointer(tmp->pub.proberesp_ies)) {
                        /*
                         * Probe response: look for a hidden-SSID beacon entry
                         * it belongs to, first with the zero-length and then
                         * with the all-NUL comparison mode.
                         */
                        hidden = rb_find_bss(rdev, tmp, BSS_CMP_HIDE_ZLEN);
                        if (!hidden)
                                hidden = rb_find_bss(rdev, tmp,
                                                     BSS_CMP_HIDE_NUL);
                        if (hidden) {
                                new->pub.hidden_beacon_bss = &hidden->pub;
                                list_add(&new->hidden_list,
                                         &hidden->hidden_list);
                                hidden->refcount++;

                                /* share the hidden entry's beacon IEs, dropping our own */
                                ies = (void *)rcu_access_pointer(new->pub.beacon_ies);
                                rcu_assign_pointer(new->pub.beacon_ies,
                                                   hidden->pub.beacon_ies);
                                if (ies)
                                        kfree_rcu(ies, rcu_head);
                        }
                } else {
                        /*
                         * Ok so we found a beacon, and don't have an entry. If
                         * it's a beacon with hidden SSID, we might be in for an
                         * expensive search for any probe responses that should
                         * be grouped with this beacon for updates ...
                         */
                        if (!cfg80211_combine_bsses(rdev, new)) {
                                bss_ref_put(rdev, new);
                                return NULL;
                        }
                }

                /* at the limit: make room by expiring the oldest entry, or give up */
                if (rdev->bss_entries >= bss_entries_limit &&
                    !cfg80211_bss_expire_oldest(rdev)) {
                        bss_ref_put(rdev, new);
                        return NULL;
                }

                /* This must be before the call to bss_ref_get */
                if (tmp->pub.transmitted_bss) {
                        new->pub.transmitted_bss = tmp->pub.transmitted_bss;
                        bss_ref_get(rdev, bss_from_pub(tmp->pub.transmitted_bss));
                }

                list_add_tail(&new->list, &rdev->bss_list);
                rdev->bss_entries++;
                rb_insert_bss(rdev, new);
                found = new;
        }

        rdev->bss_generation++;
        /* reference handed to the caller */
        bss_ref_get(rdev, found);

        return found;

free_ies:
        /* the template is not going into the database: release its IEs */
        ies = (void *)rcu_dereference(tmp->pub.beacon_ies);
        if (ies)
                kfree_rcu(ies, rcu_head);
        ies = (void *)rcu_dereference(tmp->pub.proberesp_ies);
        if (ies)
                kfree_rcu(ies, rcu_head);

        return NULL;
}

/*
 * Locked wrapper around __cfg80211_bss_update(): takes rdev->bss_lock for the
 * duration of the update and passes all arguments through unchanged.
 *
 * Returns whatever __cfg80211_bss_update() returned.
 */
struct cfg80211_internal_bss *
cfg80211_bss_update(struct cfg80211_registered_device *rdev,
                    struct cfg80211_internal_bss *tmp,
                    bool signal_valid, unsigned long ts)
{
        struct cfg80211_internal_bss *updated;

        spin_lock_bh(&rdev->bss_lock);
        updated = __cfg80211_bss_update(rdev, tmp, signal_valid, ts);
        spin_unlock_bh(&rdev->bss_lock);

        return updated;
}

/*
 * Extract the operating channel number advertised in a set of IEs.
 *
 * @ie: start of the information elements
 * @ielen: length of @ie in bytes
 * @band: band the frame was received on; selects which element is consulted
 *
 * 6 GHz: primary channel from the 6 GHz operation info of the HE Operation
 *        element.
 * S1G:   operating channel from the S1G Operation element.
 * other: DS Parameter Set channel if present with length 1, otherwise the
 *        primary channel from the HT Operation element.
 *
 * Returns the channel number, or -1 if the relevant element is missing or too
 * short.
 */
int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen,
                                    enum nl80211_band band)
{
        const struct element *elem;

        switch (band) {
        case NL80211_BAND_6GHZ: {
                const struct ieee80211_he_6ghz_oper *oper_6ghz;
                struct ieee80211_he_operation *he_oper;

                elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, ie,
                                              ielen);
                if (!elem || elem->datalen < sizeof(*he_oper) ||
                    elem->datalen < ieee80211_he_oper_size(&elem->data[1]))
                        return -1;

                /* data[0] is the extension ID, the HE operation follows it */
                he_oper = (void *)&elem->data[1];

                oper_6ghz = ieee80211_he_6ghz_oper(he_oper);
                if (!oper_6ghz)
                        return -1;

                return oper_6ghz->primary;
        }
        case NL80211_BAND_S1GHZ: {
                struct ieee80211_s1g_oper_ie *s1g_oper;

                elem = cfg80211_find_elem(WLAN_EID_S1G_OPERATION, ie, ielen);
                if (!elem || elem->datalen < sizeof(struct ieee80211_s1g_oper_ie))
                        return -1;

                s1g_oper = (void *)elem->data;
                return s1g_oper->oper_ch;
        }
        default: {
                struct ieee80211_ht_operation *ht_oper;

                elem = cfg80211_find_elem(WLAN_EID_DS_PARAMS, ie, ielen);
                if (elem && elem->datalen == 1)
                        return elem->data[0];

                elem = cfg80211_find_elem(WLAN_EID_HT_OPERATION, ie, ielen);
                if (!elem ||
                    elem->datalen < sizeof(struct ieee80211_ht_operation))
                        return -1;

                ht_oper = (void *)elem->data;
                return ht_oper->primary_chan;
        }
        }
}
EXPORT_SYMBOL(cfg80211_get_ies_channel_number);

/*
 * Work out which channel a BSS really operates on, using the frame payload
 * to correct the RX channel reported by the driver. This mainly matters on
 * 2.4 GHz, where frames from neighboring channels can be received and the
 * DSSS Parameter Set element names the transmitting channel, but it may also
 * be needed on other bands when the RX frequency differs from the BSS's
 * operating channel or the AP reports a different primary channel.
 *
 * Returns:
 *  - @channel if the payload carries no channel number, names the same
 *    frequency, or names a channel unknown to the wiphy on a band other than
 *    2.4 GHz and 6 GHz;
 *  - the wiphy's channel for the payload frequency if it exists and is not
 *    disabled;
 *  - NULL if that channel is disabled, or if it is unknown on 2.4 GHz or
 *    6 GHz.
 */
static struct ieee80211_channel *
cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen,
                         struct ieee80211_channel *channel)
{
        struct ieee80211_channel *payload_chan;
        int payload_num;
        u32 payload_khz;

        payload_num = cfg80211_get_ies_channel_number(ie, ielen,
                                                      channel->band);
        if (payload_num < 0)
                return channel; /* no channel information in frame payload */

        payload_khz = ieee80211_channel_to_freq_khz(payload_num, channel->band);

        /* payload agrees with the RX channel: nothing to correct */
        if (payload_khz == ieee80211_channel_to_khz(channel))
                return channel;

        payload_chan = ieee80211_get_channel_khz(wiphy, payload_khz);
        if (payload_chan) {
                /*
                 * Prefer the channel named by the payload over the RX
                 * channel reported by the driver, unless it is disabled.
                 */
                if (payload_chan->flags & IEEE80211_CHAN_DISABLED)
                        return NULL;
                return payload_chan;
        }

        switch (channel->band) {
        case NL80211_BAND_2GHZ:
        case NL80211_BAND_6GHZ:
                /*
                 * Better not allow unexpected channels when that could be
                 * going beyond the 1-11 range (e.g., discovering BSS on
                 * channel 12 when radio is configured for channel 11) or
                 * beyond the 6 GHz channel range.
                 */
                return NULL;
        default:
                /* no match for the payload channel number - ignore it */
                return channel;
        }
}

/*
 * Everything needed to report one BSS to cfg80211_inform_single_bss_data().
 */
struct cfg80211_inform_single_bss_data {
        /* driver-supplied RX information (signal, RX channel, timestamps, ...) */
        struct cfg80211_inform_bss *drv_data;
        /* frame type the IEs came from; decides beacon vs. probe response IEs */
        enum cfg80211_bss_frame_type ftype;
        /* channel of the BSS; if NULL it is derived from the IEs and RX channel */
        struct ieee80211_channel *channel;
        u8 bssid[ETH_ALEN];
        /* stored as the tsf of the IE buffer created for this report */
        u64 tsf;
        u16 capability;
        u16 beacon_interval;
        /* information elements and their length in bytes; copied on report */
        const u8 *ie;
        size_t ielen;

        /* how this BSS was learned */
        enum {
                BSS_SOURCE_DIRECT = 0,
                BSS_SOURCE_MBSSID,
                BSS_SOURCE_STA_PROFILE,
        } bss_source;
        /* Set if reporting bss_source != BSS_SOURCE_DIRECT */
        struct cfg80211_bss *source_bss;
        /* copied into the BSS entry only when bss_source != BSS_SOURCE_DIRECT */
        u8 max_bssid_indicator;
        u8 bssid_index;

        /*
         * Copied into the BSS entry; cleared / overridden when a 6 GHz BSS
         * fails the power type check.
         */
        u8 use_for;
        u64 cannot_use_reasons;
};

/*
 * Check whether the regulatory power type advertised by a 6 GHz AP may be
 * used given the channel @flags.
 *
 * The power type is read from the regulatory info bits of the 6 GHz operation
 * info inside the HE Operation element found in @ie/@ielen:
 *  - LPI and indoor LPI APs are always accepted;
 *  - SP and indoor SP APs are accepted unless
 *    IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT is set in @flags;
 *  - VLP APs are accepted unless IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT is set;
 *  - any other value is rejected.
 *
 * Returns false as well when the element is missing, too short, or has no
 * 6 GHz operation info.
 */
static bool cfg80211_6ghz_power_type_valid(const u8 *ie, size_t ielen,
                                           const u32 flags)
{
        const struct ieee80211_he_6ghz_oper *oper_6ghz;
        struct ieee80211_he_operation *he_oper;
        const struct element *elem;

        elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, ie, ielen);
        if (!elem)
                return false;

        /* one byte of extension ID precedes the HE operation structure */
        if (elem->datalen < sizeof(*he_oper) + 1)
                return false;
        if (elem->datalen < ieee80211_he_oper_size(elem->data + 1))
                return false;

        he_oper = (void *)&elem->data[1];
        oper_6ghz = ieee80211_he_6ghz_oper(he_oper);
        if (!oper_6ghz)
                return false;

        switch (u8_get_bits(oper_6ghz->control,
                            IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) {
        case IEEE80211_6GHZ_CTRL_REG_LPI_AP:
        case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP:
                return true;
        case IEEE80211_6GHZ_CTRL_REG_SP_AP:
        case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP:
                if (flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT)
                        return false;
                return true;
        case IEEE80211_6GHZ_CTRL_REG_VLP_AP:
                if (flags & IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT)
                        return false;
                return true;
        default:
                break;
        }

        return false;
}

/*
 * Report a single BSS to the BSS database.
 *
 * @wiphy: wiphy the BSS was seen on
 * @data: description of the BSS; use_for/cannot_use_reasons may be modified
 *        here when a 6 GHz BSS fails the power type check
 * @gfp: allocation flags for the IE copy (and the regulatory beacon hint)
 *
 * A temporary entry is filled in on the stack, the IEs are copied into a
 * freshly allocated buffer, and the result is merged into the database via
 * __cfg80211_bss_update() under rdev->bss_lock. For BSS_SOURCE_MBSSID the
 * resulting entry is additionally added to the nontransmitted list of
 * data->source_bss; if that fails and the entry could be unlinked, it is
 * dropped.
 *
 * Returned bss is reference counted and must be cleaned up appropriately.
 * NULL is returned on invalid arguments, when no usable channel could be
 * determined, on allocation failure, or when the database update fails.
 */
static struct cfg80211_bss *
cfg80211_inform_single_bss_data(struct wiphy *wiphy,
                                struct cfg80211_inform_single_bss_data *data,
                                gfp_t gfp)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct cfg80211_inform_bss *drv_data = data->drv_data;
        struct cfg80211_bss_ies *ies;
        struct ieee80211_channel *channel;
        struct cfg80211_internal_bss tmp = {}, *res;
        int bss_type;
        bool signal_valid;
        unsigned long ts;

        if (WARN_ON(!wiphy))
                return NULL;

        /* unspecified-unit signal values are only accepted in the range 0..100 */
        if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC &&
                    (drv_data->signal < 0 || drv_data->signal > 100)))
                return NULL;

        if (WARN_ON(data->bss_source != BSS_SOURCE_DIRECT && !data->source_bss))
                return NULL;

        /* an explicitly given channel wins; otherwise derive it from the IEs */
        channel = data->channel;
        if (!channel)
                channel = cfg80211_get_bss_channel(wiphy, data->ie, data->ielen,
                                                   drv_data->chan);
        if (!channel)
                return NULL;

        /* still record the BSS, but mark it as not usable for anything */
        if (channel->band == NL80211_BAND_6GHZ &&
            !cfg80211_6ghz_power_type_valid(data->ie, data->ielen,
                                            channel->flags)) {
                data->use_for = 0;
                data->cannot_use_reasons =
                        NL80211_BSS_CANNOT_USE_6GHZ_PWR_MISMATCH;
        }

        memcpy(tmp.pub.bssid, data->bssid, ETH_ALEN);
        tmp.pub.channel = channel;
        /* entries built from a STA profile get a signal of 0 */
        if (data->bss_source != BSS_SOURCE_STA_PROFILE)
                tmp.pub.signal = drv_data->signal;
        else
                tmp.pub.signal = 0;
        tmp.pub.beacon_interval = data->beacon_interval;
        tmp.pub.capability = data->capability;
        tmp.ts_boottime = drv_data->boottime_ns;
        tmp.parent_tsf = drv_data->parent_tsf;
        ether_addr_copy(tmp.parent_bssid, drv_data->parent_bssid);
        tmp.pub.chains = drv_data->chains;
        memcpy(tmp.pub.chain_signal, drv_data->chain_signal,
               IEEE80211_MAX_CHAINS);
        tmp.pub.use_for = data->use_for;
        tmp.pub.cannot_use_reasons = data->cannot_use_reasons;

        switch (data->bss_source) {
        case BSS_SOURCE_MBSSID:
                tmp.pub.transmitted_bss = data->source_bss;
                fallthrough;
        case BSS_SOURCE_STA_PROFILE:
                /* derived entries inherit the timestamp of their source BSS */
                ts = bss_from_pub(data->source_bss)->ts;
                tmp.pub.bssid_index = data->bssid_index;
                tmp.pub.max_bssid_indicator = data->max_bssid_indicator;
                break;
        case BSS_SOURCE_DIRECT:
                ts = jiffies;

                /* give the regulatory code a beacon hint for AP-type BSSes */
                if (channel->band == NL80211_BAND_60GHZ) {
                        bss_type = data->capability &
                                   WLAN_CAPABILITY_DMG_TYPE_MASK;
                        if (bss_type == WLAN_CAPABILITY_DMG_TYPE_AP ||
                            bss_type == WLAN_CAPABILITY_DMG_TYPE_PBSS)
                                regulatory_hint_found_beacon(wiphy, channel,
                                                             gfp);
                } else {
                        if (data->capability & WLAN_CAPABILITY_ESS)
                                regulatory_hint_found_beacon(wiphy, channel,
                                                             gfp);
                }
                break;
        }

        /*
         * If we do not know here whether the IEs are from a Beacon or Probe
         * Response frame, we need to pick one of the options and only use it
         * with the driver that does not provide the full Beacon/Probe Response
         * frame. Use Beacon frame pointer to avoid indicating that this should
         * override the IEs pointer should we have received an earlier
         * indication of Probe Response data.
         */
        ies = kzalloc(sizeof(*ies) + data->ielen, gfp);
        if (!ies)
                return NULL;
        ies->len = data->ielen;
        ies->tsf = data->tsf;
        ies->from_beacon = false;
        memcpy(ies->data, data->ie, data->ielen);

        switch (data->ftype) {
        case CFG80211_BSS_FTYPE_BEACON:
        case CFG80211_BSS_FTYPE_S1G_BEACON:
                ies->from_beacon = true;
                fallthrough;
        case CFG80211_BSS_FTYPE_UNKNOWN:
                rcu_assign_pointer(tmp.pub.beacon_ies, ies);
                break;
        case CFG80211_BSS_FTYPE_PRESP:
                rcu_assign_pointer(tmp.pub.proberesp_ies, ies);
                break;
        }
        rcu_assign_pointer(tmp.pub.ies, ies);

        /* the signal is only passed on as valid if RX and BSS channel agree */
        signal_valid = drv_data->chan == channel;
        spin_lock_bh(&rdev->bss_lock);
        res = __cfg80211_bss_update(rdev, &tmp, signal_valid, ts);
        if (!res)
                goto drop;

        rdev_inform_bss(rdev, &res->pub, ies, drv_data->drv_data);

        if (data->bss_source == BSS_SOURCE_MBSSID) {
                /* this is a nontransmitting bss, we need to add it to
                 * transmitting bss' list if it is not there
                 */
                if (cfg80211_add_nontrans_list(data->source_bss, &res->pub)) {
                        if (__cfg80211_unlink_bss(rdev, res)) {
                                rdev->bss_generation++;
                                res = NULL;
                        }
                }

                if (!res)
                        goto drop;
        }
        spin_unlock_bh(&rdev->bss_lock);

        trace_cfg80211_return_bss(&res->pub);
        /* __cfg80211_bss_update gives us a referenced result */
        return &res->pub;

drop:
        spin_unlock_bh(&rdev->bss_lock);
        return NULL;
}

/*
 * Find the Multiple BSSID element that continues the profile held in
 * @sub_elem, if the profile was split across elements.
 *
 * @ie/@ielen: the complete IE buffer that contains @mbssid_elem
 * @mbssid_elem: Multiple BSSID element that contains @sub_elem
 * @sub_elem: subelement carrying (part of) a nontransmitted BSSID profile
 *
 * A continuation exists only if @sub_elem is the last subelement of
 * @mbssid_elem, another Multiple BSSID element follows, and the first
 * subelement of that element is a well-formed profile subelement (ID 0) that
 * does not begin with a Nontransmitted BSSID Capability element, which would
 * mark the start of a new profile.
 *
 * Returns the following Multiple BSSID element, or NULL if the profile is
 * complete or any length check fails.
 */
static const struct element
*cfg80211_get_profile_continuation(const u8 *ie, size_t ielen,
                                   const struct element *mbssid_elem,
                                   const struct element *sub_elem)
{
        const u8 *cur_end = mbssid_elem->data + mbssid_elem->datalen;
        const u8 *sub_end = sub_elem->data + sub_elem->datalen;
        const struct element *cont;
        const struct element *cont_sub;

        cont = cfg80211_find_elem(WLAN_EID_MULTIPLE_BSSID, cur_end,
                                  ielen - (cur_end - ie));

        /* not the last subelement of the current element: profile is complete */
        if (sub_end < cur_end - 1)
                return NULL;

        /* no further Multiple BSSID element: profile is complete */
        if (!cont)
                return NULL;

        /* for any length error, just return NULL */
        if (cont->datalen < 4)
                return NULL;

        /* data[0] is skipped; the first subelement starts right after it */
        cont_sub = (void *)&cont->data[1];

        if (cont_sub->data + cont_sub->datalen > cont->data + cont->datalen)
                return NULL;

        if (cont_sub->id != 0 || cont_sub->datalen < 2)
                return NULL;

        /* a leading capability element means a new profile starts here */
        if (cont_sub->data[0] == WLAN_EID_NON_TX_BSSID_CAP)
                return NULL;

        return cont;
}

/*
 * Reassemble a nontransmitted BSSID profile that may be split across several
 * Multiple BSSID elements.
 *
 * @ie/@ielen: the complete IE buffer that contains @mbssid_elem
 * @mbssid_elem: Multiple BSSID element in which the profile starts
 * @sub_elem: subelement holding the first part of the profile
 * @merged_ie: output buffer for the reassembled profile
 * @max_copy_len: capacity of @merged_ie in bytes
 *
 * The payload of @sub_elem is copied first, followed by the payload of each
 * continuation subelement found by cfg80211_get_profile_continuation().
 * Copying stops at the first continuation that would not fit in
 * @max_copy_len.
 *
 * Returns the number of bytes written to @merged_ie, or 0 if even the first
 * part does not fit.
 */
size_t cfg80211_merge_profile(const u8 *ie, size_t ielen,
                              const struct element *mbssid_elem,
                              const struct element *sub_elem,
                              u8 *merged_ie, size_t max_copy_len)
{
        size_t copied_len = sub_elem->datalen;
        const struct element *next_mbssid;

        if (sub_elem->datalen > max_copy_len)
                return 0;

        memcpy(merged_ie, sub_elem->data, sub_elem->datalen);

        while ((next_mbssid = cfg80211_get_profile_continuation(ie, ielen,
                                                                mbssid_elem,
                                                                sub_elem))) {
                const struct element *next_sub = (void *)&next_mbssid->data[1];

                if (copied_len + next_sub->datalen > max_copy_len)
                        break;
                memcpy(merged_ie + copied_len, next_sub->data,
                       next_sub->datalen);
                copied_len += next_sub->datalen;

                /*
                 * Continue the search from the fragment just consumed.
                 * Without this the lookup keeps returning the same
                 * continuation, which is then appended over and over until
                 * the buffer is full, and later fragments are never reached.
                 */
                mbssid_elem = next_mbssid;
                sub_elem = next_sub;
        }

        return copied_len;
}
EXPORT_SYMBOL(cfg80211_merge_profile);

/*
 * Walk all Multiple BSSID elements of a received frame and report one BSS
 * entry per valid nontransmitted BSSID profile found in them.
 *
 * @wiphy:      wiphy the frame was received on
 * @tx_data:    data describing the transmitting BSS (frame elements etc.)
 * @source_bss: BSS entry of the transmitting BSS; nothing is done if NULL
 * @gfp:        allocation flags for the temporary buffers
 *
 * Nothing is reported when the frame has no Multiple BSSID element, the
 * wiphy does not set support_mbssid, or support_only_he_mbssid is set and
 * the frame lacks an HE capability element.
 */
static void
cfg80211_parse_mbssid_data(struct wiphy *wiphy,
                           struct cfg80211_inform_single_bss_data *tx_data,
                           struct cfg80211_bss *source_bss,
                           gfp_t gfp)
{
        struct cfg80211_inform_single_bss_data data = {
                .drv_data = tx_data->drv_data,
                .ftype = tx_data->ftype,
                .tsf = tx_data->tsf,
                .beacon_interval = tx_data->beacon_interval,
                .source_bss = source_bss,
                .bss_source = BSS_SOURCE_MBSSID,
                .use_for = tx_data->use_for,
                .cannot_use_reasons = tx_data->cannot_use_reasons,
        };
        const u8 *mbssid_index_ie;
        const struct element *elem, *sub;
        u8 *new_ie, *profile;
        u64 seen_indices = 0;
        struct cfg80211_bss *bss;

        if (!source_bss)
                return;
        if (!cfg80211_find_elem(WLAN_EID_MULTIPLE_BSSID,
                                tx_data->ie, tx_data->ielen))
                return;
        if (!wiphy->support_mbssid)
                return;
        if (wiphy->support_only_he_mbssid &&
            !cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY,
                                    tx_data->ie, tx_data->ielen))
                return;

        /* Scratch buffer for the elements generated per profile */
        new_ie = kmalloc(IEEE80211_MAX_DATA_LEN, gfp);
        if (!new_ie)
                return;

        /* A merged profile can never be longer than the frame's elements */
        profile = kmalloc(tx_data->ielen, gfp);
        if (!profile)
                goto out;

        for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID,
                            tx_data->ie, tx_data->ielen) {
                if (elem->datalen < 4)
                        continue;
                /* data[0] is used below as the max BSSID indicator */
                if (elem->data[0] < 1 || (int)elem->data[0] > 8)
                        continue;
                for_each_element(sub, elem->data + 1, elem->datalen - 1) {
                        /*
                         * Must be able to hold the full merged length: a
                         * profile merged from several subelements can be
                         * longer than 255 bytes, which an 8-bit counter
                         * would silently truncate.
                         */
                        size_t profile_len;

                        if (sub->id != 0 || sub->datalen < 4) {
                                /* not a valid BSS profile */
                                continue;
                        }

                        if (sub->data[0] != WLAN_EID_NON_TX_BSSID_CAP ||
                            sub->data[1] != 2) {
                                /* The first element within the Nontransmitted
                                 * BSSID Profile is not the Nontransmitted
                                 * BSSID Capability element.
                                 */
                                continue;
                        }

                        memset(profile, 0, tx_data->ielen);
                        profile_len = cfg80211_merge_profile(tx_data->ie,
                                                             tx_data->ielen,
                                                             elem,
                                                             sub,
                                                             profile,
                                                             tx_data->ielen);

                        /* found a Nontransmitted BSSID Profile */
                        mbssid_index_ie = cfg80211_find_ie
                                (WLAN_EID_MULTI_BSSID_IDX,
                                 profile, profile_len);
                        /*
                         * The index must be non-zero, fit into the
                         * seen_indices bitmap (the code caps it at 46) and
                         * be below 2^(max BSSID indicator).
                         */
                        if (!mbssid_index_ie || mbssid_index_ie[1] < 1 ||
                            mbssid_index_ie[2] == 0 ||
                            mbssid_index_ie[2] > 46 ||
                            mbssid_index_ie[2] >= (1 << elem->data[0])) {
                                /* No valid Multiple BSSID-Index element */
                                continue;
                        }

                        if (seen_indices & BIT_ULL(mbssid_index_ie[2]))
                                /* We don't support legacy split of a profile */
                                net_dbg_ratelimited("Partial info for BSSID index %d\n",
                                                    mbssid_index_ie[2]);

                        seen_indices |= BIT_ULL(mbssid_index_ie[2]);

                        data.bssid_index = mbssid_index_ie[2];
                        data.max_bssid_indicator = elem->data[0];

                        cfg80211_gen_new_bssid(tx_data->bssid,
                                               data.max_bssid_indicator,
                                               data.bssid_index,
                                               data.bssid);

                        memset(new_ie, 0, IEEE80211_MAX_DATA_LEN);
                        data.ie = new_ie;
                        data.ielen = cfg80211_gen_new_ie(tx_data->ie,
                                                         tx_data->ielen,
                                                         profile,
                                                         profile_len,
                                                         new_ie,
                                                         IEEE80211_MAX_DATA_LEN);
                        if (!data.ielen)
                                continue;

                        /*
                         * The profile starts with the capability element
                         * (ID, length 2), so its payload is at offset 2.
                         */
                        data.capability = get_unaligned_le16(profile + 2);
                        bss = cfg80211_inform_single_bss_data(wiphy, &data, gfp);
                        if (!bss)
                                break;
                        cfg80211_put_bss(wiphy, bss);
                }
        }

out:
        kfree(new_ie);
        kfree(profile);
}

/*
 * Reassemble the payload of @elem together with the fragment elements
 * (element ID @frag_id) that immediately follow it inside @ies.
 *
 * @elem:     first element of the (possibly) fragmented sequence
 * @ies:      start of the buffer containing @elem and its fragments
 * @ieslen:   length of @ies; bounds the walk over following fragments
 * @data:     destination buffer, or NULL to only compute the length
 * @data_len: capacity of @data
 * @frag_id:  element ID that marks a fragment
 *
 * For a WLAN_EID_EXTENSION element the first data byte is not copied.
 * memmove() is used for all copies and the pointer to the following
 * element is captured before each copy.
 *
 * Returns the number of payload bytes, -EINVAL if @elem is NULL, or
 * -ENOSPC if @data is too small.
 */
ssize_t cfg80211_defragment_element(const struct element *elem, const u8 *ies,
                                    size_t ieslen, u8 *data, size_t data_len,
                                    u8 frag_id)
{
        const struct element *next;
        ssize_t copied;
        u8 elem_datalen;

        if (!elem)
                return -EINVAL;

        /*
         * elem might be invalid after the memmove, so remember where the
         * next element starts and how long this one was before copying.
         */
        next = (void *)(elem->data + elem->datalen);
        elem_datalen = elem->datalen;

        if (elem->id == WLAN_EID_EXTENSION) {
                /* skip the first data byte of an extension element */
                copied = elem->datalen - 1;

                if (data) {
                        if (copied > data_len)
                                return -ENOSPC;

                        memmove(data, elem->data + 1, copied);
                }
        } else {
                copied = elem->datalen;

                if (data) {
                        if (copied > data_len)
                                return -ENOSPC;

                        memmove(data, elem->data, copied);
                }
        }

        /* Fragmented elements must have 255 bytes */
        if (elem_datalen < 255)
                return copied;

        /* Walk the following elements while they lie fully inside @ies */
        for (elem = next;
             elem->data < ies + ieslen &&
                elem->data + elem->datalen <= ies + ieslen;
             elem = next) {
                /* elem might be invalid after the memmove */
                next = (void *)(elem->data + elem->datalen);

                /* the first non-fragment element ends the sequence */
                if (elem->id != frag_id)
                        break;

                elem_datalen = elem->datalen;

                if (data) {
                        if (copied + elem_datalen > data_len)
                                return -ENOSPC;

                        memmove(data + copied, elem->data, elem_datalen);
                }

                copied += elem_datalen;

                /* Only the last fragment may be short */
                if (elem_datalen != 255)
                        break;
        }

        return copied;
}
EXPORT_SYMBOL(cfg80211_defragment_element);

/*
 * Defragmented Multi-Link element together with an index of its per-STA
 * profile subelements, as built by cfg80211_defrag_mle().
 */
struct cfg80211_mle {
        /* points at the defragmented element content held in data[] */
        struct ieee80211_multi_link_elem *mle;
        /* per-STA profile payloads inside data[]; unused slots are NULL */
        struct ieee80211_mle_per_sta_profile
                *sta_prof[IEEE80211_MLD_MAX_NUM_LINKS];
        /* length of the corresponding sta_prof[] entry */
        ssize_t sta_prof_len[IEEE80211_MLD_MAX_NUM_LINKS];

        /* backing storage for the defragmented element */
        u8 data[];
};

/*
 * Defragment a Multi-Link element and index its per-STA profiles.
 *
 * @mle:   the (extension) element carrying the Multi-Link element; must
 *         lie inside the buffer described by @ie/@ielen
 * @ie:    element buffer that contains @mle
 * @ielen: length of @ie
 * @gfp:   allocation flags
 *
 * The element and its following WLAN_EID_FRAGMENT elements are copied into
 * a newly allocated struct cfg80211_mle. Each per-STA profile subelement
 * is recorded in sta_prof[]/sta_prof_len[], and profiles that are
 * themselves fragmented are defragmented in place.
 *
 * Returns the new structure (to be released with kfree()) or NULL on any
 * parse or allocation failure.
 */
static struct cfg80211_mle *
cfg80211_defrag_mle(const struct element *mle, const u8 *ie, size_t ielen,
                    gfp_t gfp)
{
        const struct element *elem;
        struct cfg80211_mle *res;
        size_t buf_len;
        ssize_t mle_len;
        u8 common_size, idx;

        if (!mle || !ieee80211_mle_size_ok(mle->data + 1, mle->datalen - 1))
                return NULL;

        /*
         * Required length for first defragmentation.
         *
         * Only the bytes that actually follow the element in the buffer
         * may be scanned for fragments. The previous length expression
         * (ielen - sizeof(*mle) + mle->datalen) was not related to the
         * element's position and could exceed the buffer.
         */
        buf_len = mle->datalen - 1;
        for_each_element(elem, mle->data + mle->datalen,
                         ie + ielen - mle->data - mle->datalen) {
                if (elem->id != WLAN_EID_FRAGMENT)
                        break;

                buf_len += elem->datalen;
        }

        res = kzalloc(struct_size(res, data, buf_len), gfp);
        if (!res)
                return NULL;

        mle_len = cfg80211_defragment_element(mle, ie, ielen,
                                              res->data, buf_len,
                                              WLAN_EID_FRAGMENT);
        if (mle_len < 0)
                goto error;

        res->mle = (void *)res->data;

        /* Find the sub-element area in the buffer */
        common_size = ieee80211_mle_common_size((u8 *)res->mle);
        ie = res->data + common_size;
        ielen = mle_len - common_size;

        idx = 0;
        for_each_element_id(elem, IEEE80211_MLE_SUBELEM_PER_STA_PROFILE,
                            ie, ielen) {
                res->sta_prof[idx] = (void *)elem->data;
                res->sta_prof_len[idx] = elem->datalen;

                idx++;
                if (idx >= IEEE80211_MLD_MAX_NUM_LINKS)
                        break;
        }
        /* Reject the element unless the whole sub-element area parsed */
        if (!for_each_element_completed(elem, ie, ielen))
                goto error;

        /* Defragment sta_info in-place */
        for (idx = 0; idx < IEEE80211_MLD_MAX_NUM_LINKS && res->sta_prof[idx];
             idx++) {
                /* a profile shorter than 255 bytes cannot be fragmented */
                if (res->sta_prof_len[idx] < 255)
                        continue;

                /* step back over the two-byte subelement header */
                elem = (void *)res->sta_prof[idx] - 2;

                /*
                 * Fragments of this profile can only extend up to the
                 * next recorded profile, or to the end of the
                 * sub-element area for the last one.
                 */
                if (idx + 1 < ARRAY_SIZE(res->sta_prof) &&
                    res->sta_prof[idx + 1])
                        buf_len = (u8 *)res->sta_prof[idx + 1] -
                                  (u8 *)res->sta_prof[idx];
                else
                        buf_len = ielen + ie - (u8 *)elem;

                res->sta_prof_len[idx] =
                        cfg80211_defragment_element(elem,
                                                    (u8 *)elem, buf_len,
                                                    (u8 *)res->sta_prof[idx],
                                                    buf_len,
                                                    IEEE80211_MLE_SUBELEM_FRAGMENT);
                if (res->sta_prof_len[idx] < 0)
                        goto error;
        }

        return res;

error:
        kfree(res);
        return NULL;
}

/*
 * State shared between cfg80211_rnr_info_for_mld_ap() and its RNR iterator
 * callback: mld_id/link_id are the search keys, the remaining fields are
 * filled in by the callback when a matching TBTT entry is found.
 */
struct tbtt_info_iter_data {
        /* neighbor AP info of the matching entry, NULL if none matched */
        const struct ieee80211_neighbor_ap_info *ap_info;
        /* BSS parameters change count taken from the MLD parameters */
        u8 param_ch_count;
        /* NL80211_BSS_USE_FOR_* value chosen for the match, 0 if none */
        u32 use_for;
        /* MLD ID and link ID to look for */
        u8 mld_id, link_id;
        /* match is flagged multi-BSSID but not as the transmitted BSSID */
        bool non_tx;
};

/*
 * RNR iterator callback: look for the TBTT entry whose MLD parameters
 * match the MLD ID and link ID stored in the struct tbtt_info_iter_data
 * passed as @_data. On a match the result fields are filled in and the
 * iteration is stopped; otherwise it continues with the next entry.
 */
static enum cfg80211_rnr_iter_ret
cfg802121_mld_ap_rnr_iter(void *_data, u8 type,
                          const struct ieee80211_neighbor_ap_info *info,
                          const u8 *tbtt_info, u8 tbtt_info_len)
{
        struct tbtt_info_iter_data *iter = _data;
        const struct ieee80211_rnr_mld_params *mld;
        bool nontransmitted = false;

        if (type == IEEE80211_TBTT_INFO_TYPE_TBTT &&
            tbtt_info_len >= offsetofend(struct ieee80211_tbtt_info_ge_11,
                                         mld_params)) {
                /* Full TBTT entry: MLD parameters are embedded in it */
                const struct ieee80211_tbtt_info_ge_11 *full =
                        (void *)tbtt_info;

                /* multi-BSSID bit set while transmitted-BSSID bit clear */
                nontransmitted =
                        (full->bss_params &
                         (IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID |
                          IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID)) ==
                        IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID;
                mld = &full->mld_params;
        } else if (type == IEEE80211_TBTT_INFO_TYPE_MLD &&
                   tbtt_info_len >= sizeof(struct ieee80211_rnr_mld_params)) {
                /* MLD-type entry: consists of the MLD parameters only */
                mld = (void *)tbtt_info;
        } else {
                return RNR_ITER_CONTINUE;
        }

        /* Both the MLD ID and the link ID have to match */
        if (iter->mld_id != mld->mld_id ||
            iter->link_id != le16_get_bits(mld->params,
                                           IEEE80211_RNR_MLD_PARAMS_LINK_ID))
                return RNR_ITER_CONTINUE;

        iter->ap_info = info;
        iter->param_ch_count =
                le16_get_bits(mld->params,
                              IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT);
        iter->non_tx = nontransmitted;
        iter->use_for = type == IEEE80211_TBTT_INFO_TYPE_TBTT ?
                        NL80211_BSS_USE_FOR_ALL :
                        NL80211_BSS_USE_FOR_MLD_LINK;

        return RNR_ITER_BREAK;
}

/*
 * Search the RNR element(s) in @ie for the AP with the given @mld_id and
 * @link_id. The output parameters are always written; when nothing
 * matched they receive NULL/0/false and the return value is 0. Otherwise
 * the return value is the use_for value selected by the iterator.
 */
static u8
cfg80211_rnr_info_for_mld_ap(const u8 *ie, size_t ielen, u8 mld_id, u8 link_id,
                             const struct ieee80211_neighbor_ap_info **ap_info,
                             u8 *param_ch_count, bool *non_tx)
{
        struct tbtt_info_iter_data lookup = {};

        /* search keys; all result fields start out zeroed */
        lookup.mld_id = mld_id;
        lookup.link_id = link_id;

        cfg80211_iter_rnr(ie, ielen, cfg802121_mld_ap_rnr_iter, &lookup);

        *non_tx = lookup.non_tx;
        *param_ch_count = lookup.param_ch_count;
        *ap_info = lookup.ap_info;

        return lookup.use_for;
}

/*
 * Build a Reduced Neighbor Report element with a single TBTT entry that
 * describes the reporting AP (@source_bss), so ML lookups can find it.
 *
 * Returns a newly allocated element (release with kfree()), or NULL when
 * @same_mld is false, no operating class or short SSID could be
 * determined, or the allocation failed.
 */
static struct element *
cfg80211_gen_reporter_rnr(struct cfg80211_bss *source_bss, bool is_mbssid,
                          bool same_mld, u8 link_id, u8 bss_change_count,
                          gfp_t gfp)
{
        struct ieee80211_tbtt_info_ge_11 tbtt;
        struct ieee80211_neighbor_ap_info nap;
        const struct cfg80211_bss_ies *bss_ies;
        const struct element *found;
        struct element *rnr;
        size_t body_len;
        u32 sssid;

        /*
         * The RNR is only generated to permit ML lookups, which do not
         * need an entry for the corresponding transmitting BSS, so it is
         * skipped even though it would be easy to add.
         */
        if (!same_mld)
                return NULL;

        /* Everything that does not depend on the stored elements */
        nap.tbtt_info_len = offsetofend(typeof(tbtt), mld_params);
        nap.tbtt_info_hdr =
                        u8_encode_bits(IEEE80211_TBTT_INFO_TYPE_TBTT,
                                       IEEE80211_AP_INFO_TBTT_HDR_TYPE) |
                        u8_encode_bits(0, IEEE80211_AP_INFO_TBTT_HDR_COUNT);
        nap.channel = ieee80211_frequency_to_channel(source_bss->channel->center_freq);

        /* TBTT offset and PSD 20 are simply marked invalid/unknown */
        tbtt.tbtt_offset = 255;
        tbtt.psd_20 = IEEE80211_RNR_TBTT_PARAMS_PSD_RESERVED;
        memcpy(tbtt.bssid, source_bss->bssid, ETH_ALEN);

        tbtt.bss_params = IEEE80211_RNR_TBTT_PARAMS_SAME_SSID;
        if (is_mbssid)
                tbtt.bss_params |= IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID |
                                   IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID;

        tbtt.mld_params.mld_id = 0;
        tbtt.mld_params.params =
                le16_encode_bits(link_id, IEEE80211_RNR_MLD_PARAMS_LINK_ID) |
                le16_encode_bits(bss_change_count,
                                 IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT);

        /* tx_data->ies could be used if cfg80211_calc_short_ssid changed */
        rcu_read_lock();
        bss_ies = rcu_dereference(source_bss->ies);

        /* operating class */
        found = cfg80211_find_elem(WLAN_EID_SUPPORTED_REGULATORY_CLASSES,
                                   bss_ies->data, bss_ies->len);
        if (found && found->datalen >= 1) {
                nap.op_class = found->data[0];
        } else {
                struct cfg80211_chan_def chandef;

                /* The AP gave us nothing to work with, so derive a
                 * somewhat reasonable operating class from the channel;
                 * precision does not matter as nobody is expected to use
                 * the value.
                 */
                cfg80211_chandef_create(&chandef, source_bss->channel,
                                        NL80211_CHAN_NO_HT);

                if (!ieee80211_chandef_to_operating_class(&chandef,
                                                          &nap.op_class)) {
                        rcu_read_unlock();
                        return NULL;
                }
        }

        if (cfg80211_calc_short_ssid(bss_ies, &found, &sssid)) {
                rcu_read_unlock();
                return NULL;
        }

        rcu_read_unlock();

        tbtt.short_ssid = cpu_to_le32(sssid);

        /* Element body: neighbor AP info header followed by the TBTT info */
        body_len = sizeof(nap) + nap.tbtt_info_len;
        rnr = kzalloc(struct_size(rnr, data, body_len), gfp);
        if (!rnr)
                return NULL;

        rnr->id = WLAN_EID_REDUCED_NEIGHBOR_REPORT;
        rnr->datalen = body_len;
        memcpy(rnr->data, &nap, sizeof(nap));
        memcpy(rnr->data + sizeof(nap), &tbtt, nap.tbtt_info_len);

        return rnr;
}

/*
 * Report a BSS entry for every complete per-STA profile carried in one
 * Basic Multi-Link element of a received frame.
 *
 * @wiphy:      wiphy the frame was received on
 * @tx_data:    data describing the reporting BSS (frame elements etc.)
 * @source_bss: BSS entry of the reporting AP
 * @elem:       the Multi-Link (extension) element inside tx_data->ie
 * @gfp:        allocation flags
 *
 * For each usable profile the channel is looked up through the RNR of the
 * frame. New elements are generated from the frame's elements plus the
 * profile, and a Basic ML element (and, where applicable, an RNR entry for
 * the reporting AP) is appended before the BSS is reported.
 */
static void
cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy,
                                struct cfg80211_inform_single_bss_data *tx_data,
                                struct cfg80211_bss *source_bss,
                                const struct element *elem,
                                gfp_t gfp)
{
        struct cfg80211_inform_single_bss_data data = {
                .drv_data = tx_data->drv_data,
                .ftype = tx_data->ftype,
                .source_bss = source_bss,
                .bss_source = BSS_SOURCE_STA_PROFILE,
        };
        struct element *reporter_rnr = NULL;
        struct ieee80211_multi_link_elem *ml_elem;
        struct cfg80211_mle *mle;
        u16 control;
        u8 ml_common_len;
        u8 *new_ie = NULL;
        struct cfg80211_bss *bss;
        u8 mld_id, reporter_link_id, bss_change_count;
        u16 seen_links = 0;
        u8 i;

        if (!ieee80211_mle_type_ok(elem->data + 1,
                                   IEEE80211_ML_CONTROL_TYPE_BASIC,
                                   elem->datalen - 1))
                return;

        /* elem->data[0] is the extension ID, the ML element follows it */
        ml_elem = (void *)(elem->data + 1);
        control = le16_to_cpu(ml_elem->control);
        ml_common_len = ml_elem->variable[0];

        /* Must be present when transmitted by an AP (in a probe response) */
        if (!(control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) ||
            !(control & IEEE80211_MLC_BASIC_PRES_LINK_ID) ||
            !(control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP))
                return;

        reporter_link_id = ieee80211_mle_get_link_id(elem->data + 1);
        bss_change_count = ieee80211_mle_get_bss_param_ch_cnt(elem->data + 1);

        /*
         * The MLD ID of the reporting AP is always zero. It is set if the AP
         * is part of an MBSSID set and will be non-zero for ML Elements
         * relating to a nontransmitted BSS (matching the Multi-BSSID Index,
         * Draft P802.11be_D3.2, 35.3.4.2)
         */
        mld_id = ieee80211_mle_get_mld_id(elem->data + 1);

        /* Fully defrag the ML element for sta information/profile iteration */
        mle = cfg80211_defrag_mle(elem, tx_data->ie, tx_data->ielen, gfp);
        if (!mle)
                return;

        /* No point in doing anything if there is no per-STA profile */
        if (!mle->sta_prof[0])
                goto out;

        new_ie = kmalloc(IEEE80211_MAX_DATA_LEN, gfp);
        if (!new_ie)
                goto out;

        /* May be NULL; in that case no reporter RNR is appended below */
        reporter_rnr = cfg80211_gen_reporter_rnr(source_bss,
                                                 u16_get_bits(control,
                                                              IEEE80211_MLC_BASIC_PRES_MLD_ID),
                                                 mld_id == 0, reporter_link_id,
                                                 bss_change_count,
                                                 gfp);

        for (i = 0; i < ARRAY_SIZE(mle->sta_prof) && mle->sta_prof[i]; i++) {
                const struct ieee80211_neighbor_ap_info *ap_info;
                enum nl80211_band band;
                u32 freq;
                const u8 *profile;
                ssize_t profile_len;
                u8 param_ch_count;
                u8 link_id, use_for;
                bool non_tx;

                if (!ieee80211_mle_basic_sta_prof_size_ok((u8 *)mle->sta_prof[i],
                                                          mle->sta_prof_len[i]))
                        continue;

                control = le16_to_cpu(mle->sta_prof[i]->control);

                if (!(control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE))
                        continue;

                /* A link ID appearing twice ends the processing */
                link_id = u16_get_bits(control,
                                       IEEE80211_MLE_STA_CONTROL_LINK_ID);
                if (seen_links & BIT(link_id))
                        break;
                seen_links |= BIT(link_id);

                if (!(control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT) ||
                    !(control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT) ||
                    !(control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT))
                        continue;

                /* STA info: MAC address, beacon interval, TSF offset */
                memcpy(data.bssid, mle->sta_prof[i]->variable, ETH_ALEN);
                data.beacon_interval =
                        get_unaligned_le16(mle->sta_prof[i]->variable + 6);
                data.tsf = tx_data->tsf +
                           get_unaligned_le64(mle->sta_prof[i]->variable + 8);

                /* sta_info_len counts itself */
                profile = mle->sta_prof[i]->variable +
                          mle->sta_prof[i]->sta_info_len - 1;
                profile_len = (u8 *)mle->sta_prof[i] + mle->sta_prof_len[i] -
                              profile;

                if (profile_len < 2)
                        continue;

                /* The profile starts with the 16-bit capability field */
                data.capability = get_unaligned_le16(profile);
                profile += 2;
                profile_len -= 2;

                /* Find in RNR to look up channel information */
                use_for = cfg80211_rnr_info_for_mld_ap(tx_data->ie,
                                                       tx_data->ielen,
                                                       mld_id, link_id,
                                                       &ap_info,
                                                       &param_ch_count,
                                                       &non_tx);
                if (!use_for)
                        continue;

                /*
                 * As of 802.11be_D5.0, the specification does not give us any
                 * way of discovering both the MaxBSSID and the Multiple-BSSID
                 * Index. It does seem like the Multiple-BSSID Index element
                 * may be provided, but section 9.4.2.45 explicitly forbids
                 * including a Multiple-BSSID Element (in this case without any
                 * subelements).
                 * Without both pieces of information we cannot calculate the
                 * reference BSSID, so simply ignore the BSS.
                 */
                if (non_tx)
                        continue;

                /* We could sanity check the BSSID is included */

                if (!ieee80211_operating_class_to_band(ap_info->op_class,
                                                       &band))
                        continue;

                freq = ieee80211_channel_to_freq_khz(ap_info->channel, band);
                data.channel = ieee80211_get_channel_khz(wiphy, freq);

                /*
                 * 'data' is reused for every link, so clear the reason set
                 * for a previous link; otherwise a usable link reported
                 * after a restricted one would inherit its restriction.
                 */
                data.cannot_use_reasons = 0;
                if (use_for == NL80211_BSS_USE_FOR_MLD_LINK &&
                    !(wiphy->flags & WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY)) {
                        use_for = 0;
                        data.cannot_use_reasons =
                                NL80211_BSS_CANNOT_USE_NSTR_NONPRIMARY;
                }
                data.use_for = use_for;

                /* Generate new elements */
                memset(new_ie, 0, IEEE80211_MAX_DATA_LEN);
                data.ie = new_ie;
                data.ielen = cfg80211_gen_new_ie(tx_data->ie, tx_data->ielen,
                                                 profile, profile_len,
                                                 new_ie,
                                                 IEEE80211_MAX_DATA_LEN);
                if (!data.ielen)
                        continue;

                /* The generated elements do not contain:
                 *  - Basic ML element
                 *  - A TBTT entry in the RNR for the transmitting AP
                 *
                 * This information is needed both internally and in userspace
                 * as such, we should append it here.
                 */
                if (data.ielen + 3 + sizeof(*ml_elem) + ml_common_len >
                    IEEE80211_MAX_DATA_LEN)
                        continue;

                /* Copy the Basic Multi-Link element including the common
                 * information, and then fix up the link ID and BSS param
                 * change count.
                 * Note that the ML element length has been verified and we
                 * also checked that it contains the link ID.
                 */
                new_ie[data.ielen++] = WLAN_EID_EXTENSION;
                new_ie[data.ielen++] = 1 + sizeof(*ml_elem) + ml_common_len;
                new_ie[data.ielen++] = WLAN_EID_EXT_EHT_MULTI_LINK;
                memcpy(new_ie + data.ielen, ml_elem,
                       sizeof(*ml_elem) + ml_common_len);

                /* Offsets: past the control field, length byte and address */
                new_ie[data.ielen + sizeof(*ml_elem) + 1 + ETH_ALEN] = link_id;
                new_ie[data.ielen + sizeof(*ml_elem) + 1 + ETH_ALEN + 1] =
                        param_ch_count;

                data.ielen += sizeof(*ml_elem) + ml_common_len;

                if (reporter_rnr && (use_for & NL80211_BSS_USE_FOR_NORMAL)) {
                        if (data.ielen + sizeof(struct element) +
                            reporter_rnr->datalen > IEEE80211_MAX_DATA_LEN)
                                continue;

                        memcpy(new_ie + data.ielen, reporter_rnr,
                               sizeof(struct element) + reporter_rnr->datalen);
                        data.ielen += sizeof(struct element) +
                                      reporter_rnr->datalen;
                }

                bss = cfg80211_inform_single_bss_data(wiphy, &data, gfp);
                if (!bss)
                        break;
                cfg80211_put_bss(wiphy, bss);
        }

out:
        kfree(reporter_rnr);
        kfree(new_ie);
        kfree(mle);
}

/*
 * Process every Multi-Link element found in a probe response and report
 * the BSSes described by their per-STA profiles. Frames of any other type,
 * or calls without a source BSS, are ignored.
 */
static void cfg80211_parse_ml_sta_data(struct wiphy *wiphy,
                                       struct cfg80211_inform_single_bss_data *tx_data,
                                       struct cfg80211_bss *source_bss,
                                       gfp_t gfp)
{
        const struct element *ml;

        if (!source_bss || tx_data->ftype != CFG80211_BSS_FTYPE_PRESP)
                return;

        for_each_element_extid(ml, WLAN_EID_EXT_EHT_MULTI_LINK,
                               tx_data->ie, tx_data->ielen) {
                cfg80211_parse_ml_elem_sta_data(wiphy, tx_data, source_bss,
                                                ml, gfp);
        }
}

/*
 * Report a BSS described by already-parsed frame fields, then (except for
 * S1G beacons) also report the BSSes found in its Multiple BSSID and
 * Multi-Link elements.
 *
 * Returns the BSS entry for the frame's own BSS, or NULL if it could not
 * be created; in that case no further parsing is done.
 */
struct cfg80211_bss *
cfg80211_inform_bss_data(struct wiphy *wiphy,
                         struct cfg80211_inform_bss *data,
                         enum cfg80211_bss_frame_type ftype,
                         const u8 *bssid, u64 tsf, u16 capability,
                         u16 beacon_interval, const u8 *ie, size_t ielen,
                         gfp_t gfp)
{
        struct cfg80211_inform_single_bss_data single = {
                .drv_data = data,
                .ftype = ftype,
                .tsf = tsf,
                .capability = capability,
                .beacon_interval = beacon_interval,
                .ie = ie,
                .ielen = ielen,
                .use_for = NL80211_BSS_USE_FOR_ALL,
                .cannot_use_reasons = data->cannot_use_reasons,
        };
        struct cfg80211_bss *own_bss;

        /* The driver may restrict what the BSS can be used for */
        if (data->restrict_use)
                single.use_for = data->use_for;

        memcpy(single.bssid, bssid, ETH_ALEN);

        own_bss = cfg80211_inform_single_bss_data(wiphy, &single, gfp);
        if (!own_bss)
                return NULL;

        /* don't do any further MBSSID/ML handling for S1G */
        if (ftype != CFG80211_BSS_FTYPE_S1G_BEACON) {
                cfg80211_parse_mbssid_data(wiphy, &single, own_bss, gfp);
                cfg80211_parse_ml_sta_data(wiphy, &single, own_bss, gfp);
        }

        return own_bss;
}
EXPORT_SYMBOL(cfg80211_inform_bss_data);

/*
 * Report a BSS from a received beacon, probe response or S1G beacon.
 *
 * @wiphy: wiphy the frame was received on
 * @data:  driver supplied per-BSS data
 * @mgmt:  the frame
 * @len:   length of the frame in bytes
 * @gfp:   allocation flags
 *
 * The fixed fields are extracted according to the frame type and the
 * remaining bytes are handed to cfg80211_inform_bss_data() as the element
 * buffer.
 *
 * Returns the resulting BSS entry, or NULL when an argument is missing,
 * the frame is shorter than its header, an S1G beacon lacks a valid S1G
 * beacon compatibility element, or the BSS could not be created.
 */
struct cfg80211_bss *
cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
                               struct cfg80211_inform_bss *data,
                               struct ieee80211_mgmt *mgmt, size_t len,
                               gfp_t gfp)
{
        size_t min_hdr_len = offsetof(struct ieee80211_mgmt,
                                      u.probe_resp.variable);
        struct ieee80211_ext *ext = NULL;
        enum cfg80211_bss_frame_type ftype;
        u16 beacon_interval;
        const u8 *bssid;
        u16 capability;
        const u8 *ie;
        size_t ielen;
        u64 tsf;

        if (WARN_ON(!mgmt))
                return NULL;

        if (WARN_ON(!wiphy))
                return NULL;

        /* Beacons and probe responses share the same fixed layout */
        BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) !=
                     offsetof(struct ieee80211_mgmt, u.beacon.variable));

        trace_cfg80211_inform_bss_frame(wiphy, data, mgmt, len);

        if (ieee80211_is_s1g_beacon(mgmt->frame_control)) {
                ext = (void *) mgmt;
                /*
                 * The header length must end where the elements start,
                 * because ielen is derived from it below. Using the offset
                 * of u.s1g_beacon itself would count the fixed S1G beacon
                 * fields as element bytes and let parsing run past the
                 * end of the frame.
                 */
                if (ieee80211_is_s1g_short_beacon(mgmt->frame_control))
                        min_hdr_len = offsetof(struct ieee80211_ext,
                                               u.s1g_short_beacon.variable);
                else
                        min_hdr_len = offsetof(struct ieee80211_ext,
                                               u.s1g_beacon.variable);
        }

        if (WARN_ON(len < min_hdr_len))
                return NULL;

        ielen = len - min_hdr_len;
        ie = mgmt->u.probe_resp.variable;
        if (ext) {
                const struct ieee80211_s1g_bcn_compat_ie *compat;
                const struct element *elem;

                if (ieee80211_is_s1g_short_beacon(mgmt->frame_control))
                        ie = ext->u.s1g_short_beacon.variable;
                else
                        ie = ext->u.s1g_beacon.variable;

                /* Capability and beacon interval come from this element */
                elem = cfg80211_find_elem(WLAN_EID_S1G_BCN_COMPAT, ie, ielen);
                if (!elem)
                        return NULL;
                if (elem->datalen < sizeof(*compat))
                        return NULL;
                compat = (void *)elem->data;
                bssid = ext->u.s1g_beacon.sa;
                capability = le16_to_cpu(compat->compat_info);
                beacon_interval = le16_to_cpu(compat->beacon_int);
        } else {
                bssid = mgmt->bssid;
                beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int);
                capability = le16_to_cpu(mgmt->u.probe_resp.capab_info);
        }

        /* NOTE(review): read via the probe response layout for S1G too */
        tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp);

        if (ieee80211_is_probe_resp(mgmt->frame_control))
                ftype = CFG80211_BSS_FTYPE_PRESP;
        else if (ext)
                ftype = CFG80211_BSS_FTYPE_S1G_BEACON;
        else
                ftype = CFG80211_BSS_FTYPE_BEACON;

        return cfg80211_inform_bss_data(wiphy, data, ftype,
                                        bssid, tsf, capability,
                                        beacon_interval, ie, ielen,
                                        gfp);
}
EXPORT_SYMBOL(cfg80211_inform_bss_frame_data);

/**
 * cfg80211_ref_bss - take an additional reference on a BSS entry
 * @wiphy: wiphy that owns the BSS table
 * @pub: public part of the BSS entry; a NULL pointer is silently ignored
 *
 * The reference is taken while holding the device's bss_lock with
 * bottom halves disabled.
 */
void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *pub)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct cfg80211_internal_bss *entry;

        if (pub) {
                /* map the public handle back to its internal container */
                entry = bss_from_pub(pub);

                spin_lock_bh(&rdev->bss_lock);
                bss_ref_get(rdev, entry);
                spin_unlock_bh(&rdev->bss_lock);
        }
}
EXPORT_SYMBOL(cfg80211_ref_bss);

/**
 * cfg80211_put_bss - drop a reference on a BSS entry
 * @wiphy: wiphy that owns the BSS table
 * @pub: public part of the BSS entry; a NULL pointer is silently ignored
 *
 * The reference is released while holding the device's bss_lock with
 * bottom halves disabled.
 */
void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *pub)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct cfg80211_internal_bss *entry;

        if (pub) {
                /* map the public handle back to its internal container */
                entry = bss_from_pub(pub);

                spin_lock_bh(&rdev->bss_lock);
                bss_ref_put(rdev, entry);
                spin_unlock_bh(&rdev->bss_lock);
        }
}
EXPORT_SYMBOL(cfg80211_put_bss);

void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct cfg80211_internal_bss *bss, *tmp1;
        struct cfg80211_bss *nontrans_bss, *tmp;

        if (WARN_ON(!pub))
                return;

        bss = bss_from_pub(pub);

        spin_lock_bh(&rdev->bss_lock);
        if (list_empty(&bss->list))
                goto out;

        list_for_each_entry_safe(nontrans_bss, tmp,
                                 &pub->nontrans_list,
                                 nontrans_list) {
                tmp1 = bss_from_pub(nontrans_bss);
                if (__cfg80211_unlink_bss(rdev, tmp1))
                        rdev->bss_generation++;
        }

        if (__cfg80211_unlink_bss(rdev, bss))
                rdev->bss_generation++;
out:
        spin_unlock_bh(&rdev->bss_lock);
}
EXPORT_SYMBOL(cfg80211_unlink_bss);

/**
 * cfg80211_bss_iter - invoke a callback for BSS entries of a wiphy
 * @wiphy: wiphy whose BSS table is walked
 * @chandef: optional filter; when non-NULL only entries for which
 *           cfg80211_is_sub_chan(chandef, channel, false) is true are
 *           reported, when NULL every entry is reported
 * @iter: callback invoked for each selected entry
 * @iter_data: opaque pointer handed through to @iter
 *
 * The callback runs with the device's bss_lock held and bottom halves
 * disabled.
 */
void cfg80211_bss_iter(struct wiphy *wiphy,
                       struct cfg80211_chan_def *chandef,
                       void (*iter)(struct wiphy *wiphy,
                                    struct cfg80211_bss *bss,
                                    void *data),
                       void *iter_data)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct cfg80211_internal_bss *entry;

        spin_lock_bh(&rdev->bss_lock);

        list_for_each_entry(entry, &rdev->bss_list, list) {
                /* skip entries rejected by the channel filter, if any */
                if (chandef &&
                    !cfg80211_is_sub_chan(chandef, entry->pub.channel, false))
                        continue;

                iter(wiphy, &entry->pub, iter_data);
        }

        spin_unlock_bh(&rdev->bss_lock);
}
EXPORT_SYMBOL(cfg80211_bss_iter);

/**
 * cfg80211_update_assoc_bss_entry - record a new channel for the current BSS
 * @wdev: wireless device whose link is affected
 * @link_id: index into @wdev->links selecting the link
 * @chan: channel the BSS entry should now be filed under
 *
 * Takes the link's client.current_bss and, unless it is already on @chan,
 * rewrites its channel and re-inserts it (and every entry on its
 * nontrans_list) into the device's BSS tree. If the table already holds a
 * different entry that compares equal to the updated one, that entry is
 * merged via cfg80211_update_known_bss() and then unlinked together with
 * its own nontrans_list entries.
 *
 * The whole operation runs under rdev->bss_lock with bottom halves
 * disabled.
 *
 * NOTE(review): current_bss is dereferenced without a NULL check, so
 * callers presumably guarantee the link is associated - confirm.
 */
void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev,
                                     unsigned int link_id,
                                     struct ieee80211_channel *chan)
{
        struct wiphy *wiphy = wdev->wiphy;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct cfg80211_internal_bss *cbss = wdev->links[link_id].client.current_bss;
        struct cfg80211_internal_bss *new = NULL;
        struct cfg80211_internal_bss *bss;
        struct cfg80211_bss *nontrans_bss;
        struct cfg80211_bss *tmp;

        spin_lock_bh(&rdev->bss_lock);

        /*
         * Some APs use CSA also for bandwidth changes, i.e., without actually
         * changing the control channel, so no need to update in such a case.
         */
        if (cbss->pub.channel == chan)
                goto done;

        /* use transmitting bss */
        if (cbss->pub.transmitted_bss)
                cbss = bss_from_pub(cbss->pub.transmitted_bss);

        cbss->pub.channel = chan;

        /*
         * Look for a different table entry that now compares equal to the
         * updated cbss (same BSS already known on the new channel).
         */
        list_for_each_entry(bss, &rdev->bss_list, list) {
                if (!cfg80211_bss_type_match(bss->pub.capability,
                                             bss->pub.channel->band,
                                             wdev->conn_bss_type))
                        continue;

                if (bss == cbss)
                        continue;

                if (!cmp_bss(&bss->pub, &cbss->pub, BSS_CMP_REGULAR)) {
                        new = bss;
                        break;
                }
        }

        if (new) {
                /* to save time, update IEs for transmitting bss only */
                cfg80211_update_known_bss(rdev, cbss, new, false);
                /*
                 * NOTE(review): the IE pointers are cleared right after the
                 * merge, presumably because cbss now owns them - confirm
                 * against cfg80211_update_known_bss().
                 */
                new->pub.proberesp_ies = NULL;
                new->pub.beacon_ies = NULL;

                /* drop the duplicate's nontransmitted entries first */
                list_for_each_entry_safe(nontrans_bss, tmp,
                                         &new->pub.nontrans_list,
                                         nontrans_list) {
                        bss = bss_from_pub(nontrans_bss);
                        if (__cfg80211_unlink_bss(rdev, bss))
                                rdev->bss_generation++;
                }

                /* the duplicate is expected to be unheld and unlinkable */
                WARN_ON(atomic_read(&new->hold));
                if (!WARN_ON(!__cfg80211_unlink_bss(rdev, new)))
                        rdev->bss_generation++;
        }

        /* the channel changed, so re-insert cbss into the tree */
        rb_erase(&cbss->rbn, &rdev->bss_tree);
        rb_insert_bss(rdev, cbss);
        rdev->bss_generation++;

        /* apply the same channel change to each nontransmitted entry */
        list_for_each_entry_safe(nontrans_bss, tmp,
                                 &cbss->pub.nontrans_list,
                                 nontrans_list) {
                bss = bss_from_pub(nontrans_bss);
                bss->pub.channel = chan;
                rb_erase(&bss->rbn, &rdev->bss_tree);
                rb_insert_bss(rdev, bss);
                rdev->bss_generation++;
        }

done:
        spin_unlock_bh(&rdev->bss_lock);
}

#ifdef CONFIG_CFG80211_WEXT
/*
 * Resolve an interface index in @net to the cfg80211 registered device
 * behind it.
 *
 * Returns the registered device, or ERR_PTR(-ENODEV) when no such
 * interface exists or the interface has no ieee80211_ptr. The net_device
 * reference obtained for the lookup is dropped before returning. Asserts
 * that the RTNL is held.
 */
static struct cfg80211_registered_device *
cfg80211_get_dev_from_ifindex(struct net *net, int ifindex)
{
        struct cfg80211_registered_device *found = ERR_PTR(-ENODEV);
        struct net_device *netdev;

        ASSERT_RTNL();

        netdev = dev_get_by_index(net, ifindex);
        if (netdev) {
                if (netdev->ieee80211_ptr)
                        found = wiphy_to_rdev(netdev->ieee80211_ptr->wiphy);
                dev_put(netdev);
        }

        return found;
}

/**
 * cfg80211_wext_siwscan - wireless-extensions handler that starts a scan
 * @dev: network device the request was issued on
 * @info: wext request info (not used by this function)
 * @wrqu: request data; data.length and data.flags are consulted
 * @extra: request payload, interpreted as a struct iw_scan_req only when
 *         data.length equals sizeof(struct iw_scan_req)
 *
 * Builds a cfg80211_scan_request from the optional iw_scan_req (channel
 * list, ESSID, passive flag), falling back to all non-disabled channels of
 * the wiphy, and hands it to the driver via rdev_scan().
 *
 * Return: 0 when the scan was started, -ENETDOWN if the interface is not
 * running, -EBUSY if a scan request or scan message is already pending,
 * -EINVAL for too many requested channels, no usable channel or an
 * over-long ESSID, -ENOMEM on allocation failure, the error from
 * cfg80211_get_dev_from_ifindex(), or the error returned by rdev_scan().
 */
int cfg80211_wext_siwscan(struct net_device *dev,
                          struct iw_request_info *info,
                          union iwreq_data *wrqu, char *extra)
{
        struct cfg80211_registered_device *rdev;
        struct wiphy *wiphy;
        struct iw_scan_req *wreq = NULL;
        struct cfg80211_scan_request *creq;
        int i, err, n_channels = 0;
        enum nl80211_band band;

        if (!netif_running(dev))
                return -ENETDOWN;

        /* only trust the payload when it has exactly the expected size */
        if (wrqu->data.length == sizeof(struct iw_scan_req))
                wreq = (struct iw_scan_req *)extra;

        rdev = cfg80211_get_dev_from_ifindex(dev_net(dev), dev->ifindex);

        if (IS_ERR(rdev))
                return PTR_ERR(rdev);

        if (rdev->scan_req || rdev->scan_msg)
                return -EBUSY;

        wiphy = &rdev->wiphy;

        /* Determine number of channels, needed to allocate creq */
        if (wreq && wreq->num_channels) {
                /* Passed from userspace so should be checked */
                if (unlikely(wreq->num_channels > IW_MAX_FREQUENCIES))
                        return -EINVAL;
                n_channels = wreq->num_channels;
        } else {
                n_channels = ieee80211_get_num_supported_channels(wiphy);
        }

        /*
         * Single allocation: the request itself, n_channels channel
         * pointers, and room for one SSID placed after the channel array.
         */
        creq = kzalloc(sizeof(*creq) + sizeof(struct cfg80211_ssid) +
                       n_channels * sizeof(void *),
                       GFP_ATOMIC);
        if (!creq)
                return -ENOMEM;

        creq->wiphy = wiphy;
        creq->wdev = dev->ieee80211_ptr;
        /* SSIDs come after channels */
        creq->ssids = (void *)&creq->channels[n_channels];
        creq->n_channels = n_channels;
        creq->n_ssids = 1;
        creq->scan_start = jiffies;

        /* translate "Scan on frequencies" request */
        i = 0;
        for (band = 0; band < NUM_NL80211_BANDS; band++) {
                int j;

                if (!wiphy->bands[band])
                        continue;

                for (j = 0; j < wiphy->bands[band]->n_channels; j++) {
                        /* ignore disabled channels */
                        if (wiphy->bands[band]->channels[j].flags &
                                                IEEE80211_CHAN_DISABLED)
                                continue;

                        /* If we have a wireless request structure and the
                         * wireless request specifies frequencies, then search
                         * for the matching hardware channel.
                         */
                        if (wreq && wreq->num_channels) {
                                int k;
                                int wiphy_freq = wiphy->bands[band]->channels[j].center_freq;
                                for (k = 0; k < wreq->num_channels; k++) {
                                        struct iw_freq *freq =
                                                &wreq->channel_list[k];
                                        int wext_freq =
                                                cfg80211_wext_freq(freq);

                                        if (wext_freq == wiphy_freq)
                                                goto wext_freq_found;
                                }
                                /* not requested: skip this hardware channel */
                                goto wext_freq_not_found;
                        }

                        /*
                         * Reached by fall-through when no channel list was
                         * given, or by the goto above on a match.
                         */
                wext_freq_found:
                        creq->channels[i] = &wiphy->bands[band]->channels[j];
                        i++;
                wext_freq_not_found: ;
                }
        }
        /* No channels found? */
        if (!i) {
                err = -EINVAL;
                goto out;
        }

        /* Set real number of channels specified in creq->channels[] */
        creq->n_channels = i;

        /* translate "Scan for SSID" request */
        if (wreq) {
                if (wrqu->data.flags & IW_SCAN_THIS_ESSID) {
                        if (wreq->essid_len > IEEE80211_MAX_SSID_LEN) {
                                err = -EINVAL;
                                goto out;
                        }
                        memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len);
                        creq->ssids[0].ssid_len = wreq->essid_len;
                }
                /* a passive scan carries no SSIDs at all */
                if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE) {
                        creq->ssids = NULL;
                        creq->n_ssids = 0;
                }
        }

        /* enable every bitrate of each band present: mask of n_bitrates ones */
        for (i = 0; i < NUM_NL80211_BANDS; i++)
                if (wiphy->bands[i])
                        creq->rates[i] = (1 << wiphy->bands[i]->n_bitrates) - 1;

        eth_broadcast_addr(creq->bssid);

        wiphy_lock(&rdev->wiphy);

        rdev->scan_req = creq;
        err = rdev_scan(rdev, creq);
        if (err) {
                rdev->scan_req = NULL;
                /* creq will be freed below */
        } else {
                nl80211_send_scan_start(rdev, dev->ieee80211_ptr);
                /* creq now owned by driver */
                creq = NULL;
                /*
                 * NOTE(review): this reference on dev is presumably dropped
                 * when the scan completes - confirm in the scan-done path.
                 */
                dev_hold(dev);
        }
        wiphy_unlock(&rdev->wiphy);
 out:
        /* creq is NULL here when ownership passed to the driver */
        kfree(creq);
        return err;
}
EXPORT_WEXT_HANDLER(cfg80211_wext_siwscan);

/*
 * Append the raw information elements of @ies to the wext event stream as
 * one or more IWEVGENIE events.
 *
 * While more than IW_GENERIC_IE_MAX octets remain, a fragment is cut at an
 * element boundary (each element being 2 header octets plus the length
 * stored in its second octet) and emitted; whatever is left afterwards goes
 * out as one final event. A NULL @ies emits nothing.
 *
 * Returns the new stream position, or the error pointer handed back by
 * iwe_stream_add_point_check().
 */
static char *ieee80211_scan_add_ies(struct iw_request_info *info,
                                    const struct cfg80211_bss_ies *ies,
                                    char *current_ev, char *end_buf)
{
        const u8 *cursor, *limit, *frag_end;
        struct iw_event iwe;

        if (!ies)
                return current_ev;

        cursor = ies->data;
        limit = cursor + ies->len;

        for (;;) {
                if (limit - cursor > IW_GENERIC_IE_MAX) {
                        /* too long for one event: grow to the last boundary that fits */
                        frag_end = cursor + 2 + cursor[1];
                        while (frag_end + 2 + frag_end[1] - cursor <
                               IW_GENERIC_IE_MAX)
                                frag_end += 2 + frag_end[1];
                } else if (limit > cursor) {
                        /* the remainder fits into a single event */
                        frag_end = limit;
                } else {
                        /* nothing left to emit */
                        break;
                }

                memset(&iwe, 0, sizeof(iwe));
                iwe.cmd = IWEVGENIE;
                iwe.u.data.length = frag_end - cursor;
                current_ev = iwe_stream_add_point_check(info, current_ev,
                                                        end_buf, &iwe,
                                                        (void *)cursor);
                if (IS_ERR(current_ev))
                        break;

                cursor = frag_end;
        }

        return current_ev;
}

/*
 * Serialise one BSS entry into the wireless-extensions scan result stream.
 *
 * @wiphy:      wiphy the entry belongs to (its signal_type selects how the
 *              signal value is reported)
 * @info:       wext request info, passed through to the stream helpers
 * @bss:        BSS entry to report
 * @current_ev: current write position in the output buffer
 * @end_buf:    end of the output buffer
 *
 * Emits, in order: BSSID, channel number, frequency, signal quality (unless
 * the wiphy reports no signal type), encryption flag, one event per
 * SSID / mesh ID / mesh config field / supported rate found in the entry's
 * IEs, operating mode, TSF, beacon age, and finally the raw IEs.
 *
 * Returns the new write position, or an error pointer: either the one
 * returned by an iwe_stream_*_check() helper or ERR_PTR(-E2BIG) when a rate
 * value could not be appended. The IEs are accessed inside an RCU read-side
 * critical section.
 */
static char *
ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info,
              struct cfg80211_internal_bss *bss, char *current_ev,
              char *end_buf)
{
        const struct cfg80211_bss_ies *ies;
        struct iw_event iwe;
        const u8 *ie;
        u8 buf[50];
        u8 *cfg, *p, *tmp;
        int rem, i, sig;
        bool ismesh = false;

        /* BSSID */
        memset(&iwe, 0, sizeof(iwe));
        iwe.cmd = SIOCGIWAP;
        iwe.u.ap_addr.sa_family = ARPHRD_ETHER;
        memcpy(iwe.u.ap_addr.sa_data, bss->pub.bssid, ETH_ALEN);
        current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe,
                                                IW_EV_ADDR_LEN);
        if (IS_ERR(current_ev))
                return current_ev;

        /* first frequency event: channel number, exponent 0 */
        memset(&iwe, 0, sizeof(iwe));
        iwe.cmd = SIOCGIWFREQ;
        iwe.u.freq.m = ieee80211_frequency_to_channel(bss->pub.channel->center_freq);
        iwe.u.freq.e = 0;
        current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe,
                                                IW_EV_FREQ_LEN);
        if (IS_ERR(current_ev))
                return current_ev;

        /*
         * second frequency event: center_freq as mantissa with exponent 6
         * (presumably MHz scaled to Hz - confirm against iw_freq semantics)
         */
        memset(&iwe, 0, sizeof(iwe));
        iwe.cmd = SIOCGIWFREQ;
        iwe.u.freq.m = bss->pub.channel->center_freq;
        iwe.u.freq.e = 6;
        current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe,
                                                IW_EV_FREQ_LEN);
        if (IS_ERR(current_ev))
                return current_ev;

        /* signal quality, only when the wiphy reports some signal type */
        if (wiphy->signal_type != CFG80211_SIGNAL_TYPE_NONE) {
                memset(&iwe, 0, sizeof(iwe));
                iwe.cmd = IWEVQUAL;
                iwe.u.qual.updated = IW_QUAL_LEVEL_UPDATED |
                                     IW_QUAL_NOISE_INVALID |
                                     IW_QUAL_QUAL_UPDATED;
                switch (wiphy->signal_type) {
                case CFG80211_SIGNAL_TYPE_MBM:
                        /* level is signal / 100, flagged as dBm */
                        sig = bss->pub.signal / 100;
                        iwe.u.qual.level = sig;
                        iwe.u.qual.updated |= IW_QUAL_DBM;
                        /* clamp to [-110, -40] before deriving the quality */
                        if (sig < -110)                /* rather bad */
                                sig = -110;
                        else if (sig > -40)        /* perfect */
                                sig = -40;
                        /* will give a range of 0 .. 70 */
                        iwe.u.qual.qual = sig + 110;
                        break;
                case CFG80211_SIGNAL_TYPE_UNSPEC:
                        iwe.u.qual.level = bss->pub.signal;
                        /* will give range 0 .. 100 */
                        iwe.u.qual.qual = bss->pub.signal;
                        break;
                default:
                        /* not reached */
                        break;
                }
                current_ev = iwe_stream_add_event_check(info, current_ev,
                                                        end_buf, &iwe,
                                                        IW_EV_QUAL_LEN);
                if (IS_ERR(current_ev))
                        return current_ev;
        }

        /* encryption: only on/off is reported, with no key material */
        memset(&iwe, 0, sizeof(iwe));
        iwe.cmd = SIOCGIWENCODE;
        if (bss->pub.capability & WLAN_CAPABILITY_PRIVACY)
                iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY;
        else
                iwe.u.data.flags = IW_ENCODE_DISABLED;
        iwe.u.data.length = 0;
        current_ev = iwe_stream_add_point_check(info, current_ev, end_buf,
                                                &iwe, "");
        if (IS_ERR(current_ev))
                return current_ev;

        /* from here on every exit must go through the unlock label */
        rcu_read_lock();
        ies = rcu_dereference(bss->pub.ies);
        rem = ies->len;
        ie = ies->data;

        /* walk the IEs: ie[0] is the element ID, ie[1] the payload length */
        while (rem >= 2) {
                /* invalid data */
                if (ie[1] > rem - 2)
                        break;

                switch (ie[0]) {
                case WLAN_EID_SSID:
                        memset(&iwe, 0, sizeof(iwe));
                        iwe.cmd = SIOCGIWESSID;
                        iwe.u.data.length = ie[1];
                        iwe.u.data.flags = 1;
                        current_ev = iwe_stream_add_point_check(info,
                                                                current_ev,
                                                                end_buf, &iwe,
                                                                (u8 *)ie + 2);
                        if (IS_ERR(current_ev))
                                goto unlock;
                        break;
                case WLAN_EID_MESH_ID:
                        /* the mesh ID is reported through the same ESSID event */
                        memset(&iwe, 0, sizeof(iwe));
                        iwe.cmd = SIOCGIWESSID;
                        iwe.u.data.length = ie[1];
                        iwe.u.data.flags = 1;
                        current_ev = iwe_stream_add_point_check(info,
                                                                current_ev,
                                                                end_buf, &iwe,
                                                                (u8 *)ie + 2);
                        if (IS_ERR(current_ev))
                                goto unlock;
                        break;
                case WLAN_EID_MESH_CONFIG:
                        /* presence alone marks the BSS as mesh, even if malformed */
                        ismesh = true;
                        if (ie[1] != sizeof(struct ieee80211_meshconf_ie))
                                break;
                        cfg = (u8 *)ie + 2;
                        /*
                         * One IWEVCUSTOM text event per config octet; iwe is
                         * set up once and only its length changes per event.
                         */
                        memset(&iwe, 0, sizeof(iwe));
                        iwe.cmd = IWEVCUSTOM;
                        iwe.u.data.length = sprintf(buf,
                                                    "Mesh Network Path Selection Protocol ID: 0x%02X",
                                                    cfg[0]);
                        current_ev = iwe_stream_add_point_check(info,
                                                                current_ev,
                                                                end_buf,
                                                                &iwe, buf);
                        if (IS_ERR(current_ev))
                                goto unlock;
                        iwe.u.data.length = sprintf(buf,
                                                    "Path Selection Metric ID: 0x%02X",
                                                    cfg[1]);
                        current_ev = iwe_stream_add_point_check(info,
                                                                current_ev,
                                                                end_buf,
                                                                &iwe, buf);
                        if (IS_ERR(current_ev))
                                goto unlock;
                        iwe.u.data.length = sprintf(buf,
                                                    "Congestion Control Mode ID: 0x%02X",
                                                    cfg[2]);
                        current_ev = iwe_stream_add_point_check(info,
                                                                current_ev,
                                                                end_buf,
                                                                &iwe, buf);
                        if (IS_ERR(current_ev))
                                goto unlock;
                        iwe.u.data.length = sprintf(buf,
                                                    "Synchronization ID: 0x%02X",
                                                    cfg[3]);
                        current_ev = iwe_stream_add_point_check(info,
                                                                current_ev,
                                                                end_buf,
                                                                &iwe, buf);
                        if (IS_ERR(current_ev))
                                goto unlock;
                        iwe.u.data.length = sprintf(buf,
                                                    "Authentication ID: 0x%02X",
                                                    cfg[4]);
                        current_ev = iwe_stream_add_point_check(info,
                                                                current_ev,
                                                                end_buf,
                                                                &iwe, buf);
                        if (IS_ERR(current_ev))
                                goto unlock;
                        iwe.u.data.length = sprintf(buf,
                                                    "Formation Info: 0x%02X",
                                                    cfg[5]);
                        current_ev = iwe_stream_add_point_check(info,
                                                                current_ev,
                                                                end_buf,
                                                                &iwe, buf);
                        if (IS_ERR(current_ev))
                                goto unlock;
                        iwe.u.data.length = sprintf(buf,
                                                    "Capabilities: 0x%02X",
                                                    cfg[6]);
                        current_ev = iwe_stream_add_point_check(info,
                                                                current_ev,
                                                                end_buf,
                                                                &iwe, buf);
                        if (IS_ERR(current_ev))
                                goto unlock;
                        break;
                case WLAN_EID_SUPP_RATES:
                case WLAN_EID_EXT_SUPP_RATES:
                        /* display all supported rates in readable format */
                        p = current_ev + iwe_stream_lcp_len(info);

                        memset(&iwe, 0, sizeof(iwe));
                        iwe.cmd = SIOCGIWRATE;
                        /* Those two flags are ignored... */
                        iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0;

                        for (i = 0; i < ie[1]; i++) {
                                /* low 7 bits of each octet, times 500000 */
                                iwe.u.bitrate.value =
                                        ((ie[i + 2] & 0x7f) * 500000);
                                tmp = p;
                                p = iwe_stream_add_value(info, current_ev, p,
                                                         end_buf, &iwe,
                                                         IW_EV_PARAM_LEN);
                                /* an unchanged position is treated as "did not fit" */
                                if (p == tmp) {
                                        current_ev = ERR_PTR(-E2BIG);
                                        goto unlock;
                                }
                        }
                        current_ev = p;
                        break;
                }
                /* advance past this element (2 header octets + payload) */
                rem -= ie[1] + 2;
                ie += ie[1] + 2;
        }

        /* operating mode: mesh takes precedence, then ESS, otherwise IBSS */
        if (bss->pub.capability & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS) ||
            ismesh) {
                memset(&iwe, 0, sizeof(iwe));
                iwe.cmd = SIOCGIWMODE;
                if (ismesh)
                        iwe.u.mode = IW_MODE_MESH;
                else if (bss->pub.capability & WLAN_CAPABILITY_ESS)
                        iwe.u.mode = IW_MODE_MASTER;
                else
                        iwe.u.mode = IW_MODE_ADHOC;
                current_ev = iwe_stream_add_event_check(info, current_ev,
                                                        end_buf, &iwe,
                                                        IW_EV_UINT_LEN);
                if (IS_ERR(current_ev))
                        goto unlock;
        }

        /* TSF taken from the IE set, as a custom text event */
        memset(&iwe, 0, sizeof(iwe));
        iwe.cmd = IWEVCUSTOM;
        iwe.u.data.length = sprintf(buf, "tsf=%016llx",
                                    (unsigned long long)(ies->tsf));
        current_ev = iwe_stream_add_point_check(info, current_ev, end_buf,
                                                &iwe, buf);
        if (IS_ERR(current_ev))
                goto unlock;
        /* time elapsed since the entry's timestamp, as a custom text event */
        memset(&iwe, 0, sizeof(iwe));
        iwe.cmd = IWEVCUSTOM;
        iwe.u.data.length = sprintf(buf, " Last beacon: %ums ago",
                                    elapsed_jiffies_msecs(bss->ts));
        current_ev = iwe_stream_add_point_check(info, current_ev,
                                                end_buf, &iwe, buf);
        if (IS_ERR(current_ev))
                goto unlock;

        /* finally the raw IEs; its result (position or error) is returned as is */
        current_ev = ieee80211_scan_add_ies(info, ies, current_ev, end_buf);

 unlock:
        rcu_read_unlock();
        return current_ev;
}


/*
 * Write every BSS entry of @rdev into the wext result buffer.
 *
 * @rdev: registered device whose BSS list is dumped
 * @info: wext request info, passed through to ieee80211_bss()
 * @buf:  output buffer
 * @len:  size of @buf in bytes
 *
 * cfg80211_bss_expire() is called before walking the list, and the whole
 * walk runs under bss_lock with bottom halves disabled. The walk stops with
 * -E2BIG as soon as no more than IW_EV_ADDR_LEN bytes remain, or with the
 * error reported by ieee80211_bss().
 *
 * Returns the number of bytes written on success, a negative errno
 * otherwise.
 */
static int ieee80211_scan_results(struct cfg80211_registered_device *rdev,
                                  struct iw_request_info *info,
                                  char *buf, size_t len)
{
        char *end_buf = buf + len;
        char *pos = buf;
        struct cfg80211_internal_bss *entry;
        int ret = 0;

        spin_lock_bh(&rdev->bss_lock);
        cfg80211_bss_expire(rdev);

        list_for_each_entry(entry, &rdev->bss_list, list) {
                /* not even room for the leading address event */
                if (end_buf - pos <= IW_EV_ADDR_LEN) {
                        ret = -E2BIG;
                        break;
                }

                pos = ieee80211_bss(&rdev->wiphy, info, entry, pos, end_buf);
                if (IS_ERR(pos)) {
                        ret = PTR_ERR(pos);
                        break;
                }
        }

        spin_unlock_bh(&rdev->bss_lock);

        if (!ret)
                ret = pos - buf;

        return ret;
}


/**
 * cfg80211_wext_giwscan - wireless-extensions handler that returns scan results
 * @dev: network device the request was issued on
 * @info: wext request info, passed on to the result writer
 * @wrqu: request data; data.length gives the buffer size on entry and is
 *        set to the number of bytes written on success, or to 0 when
 *        writing the results failed
 * @extra: output buffer for the result events
 *
 * Return: 0 on success, -ENETDOWN if the interface is not running, -EAGAIN
 * while a scan request or scan message is still pending, the error from
 * cfg80211_get_dev_from_ifindex(), or the error from
 * ieee80211_scan_results().
 */
int cfg80211_wext_giwscan(struct net_device *dev,
                          struct iw_request_info *info,
                          union iwreq_data *wrqu, char *extra)
{
        struct iw_point *data = &wrqu->data;
        struct cfg80211_registered_device *rdev;
        int written;

        if (!netif_running(dev))
                return -ENETDOWN;

        rdev = cfg80211_get_dev_from_ifindex(dev_net(dev), dev->ifindex);
        if (IS_ERR(rdev))
                return PTR_ERR(rdev);

        /* results are not available while a scan is still in flight */
        if (rdev->scan_req || rdev->scan_msg)
                return -EAGAIN;

        written = ieee80211_scan_results(rdev, info, extra, data->length);
        if (written < 0) {
                data->length = 0;
                return written;
        }

        data->length = written;
        return 0;
}
EXPORT_WEXT_HANDLER(cfg80211_wext_giwscan);
#endif
















































































































































































































































































































































































































































































































































































































































































































































    1 












    1 







    1 
    1 














    1 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 














    1 
    1 


    1 









    1 





    1 

































    1 











    1 





























































































    1 












    1 




    1 






    1 



    1 









    1 


























































































































































































































































































































































































































































































































































































































    1 










    1 

    1 

    1 










    1 










    1 
    1 

    1 
    1 



    1 


































































































































































































































































































































































































































































































































































































































































































































    1 






































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/proc/base.c
 *
 *  Copyright (C) 1991, 1992 Linus Torvalds
 *
 *  proc base directory handling functions
 *
 *  1999, Al Viro. Rewritten. Now it covers the whole per-process part.
 *  Instead of using magical inumbers to determine the kind of object
 *  we allocate and fill in-core inodes upon lookup. They don't even
 *  go into icache. We cache the reference to task_struct upon lookup too.
 *  Eventually it should become a filesystem in its own. We don't use the
 *  rest of procfs anymore.
 *
 *
 *  Changelog:
 *  17-Jan-2005
 *  Allan Bezerra
 *  Bruna Moreira <bruna.moreira@indt.org.br>
 *  Edjard Mota <edjard.mota@indt.org.br>
 *  Ilias Biris <ilias.biris@indt.org.br>
 *  Mauricio Lin <mauricio.lin@indt.org.br>
 *
 *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
 *
 *  A new process specific entry (smaps) included in /proc. It shows the
 *  size of rss for each memory area. The maps entry lacks information
 *  about physical memory size (rss) for each mapped file, i.e.,
 *  rss information for executables and library files.
 *  This additional information is useful for any tools that need to know
 *  about physical memory consumption for a process specific library.
 *
 *  Changelog:
 *  21-Feb-2005
 *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
 *  Pud inclusion in the page table walking.
 *
 *  ChangeLog:
 *  10-Mar-2005
 *  10LE Instituto Nokia de Tecnologia - INdT:
 *  A better way to walk through the page table as suggested by Hugh Dickins.
 *
 *  Simo Piiroinen <simo.piiroinen@nokia.com>:
 *  Smaps information related to shared, private, clean and dirty pages.
 *
 *  Paul Mundt <paul.mundt@nokia.com>:
 *  Overall revision about smaps.
 */

#include <linux/uaccess.h>

#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/init.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/generic-radix-tree.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/namei.h>
#include <linux/mnt_namespace.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
#include <linux/resource.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/printk.h>
#include <linux/cache.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/audit.h>
#include <linux/poll.h>
#include <linux/nsproxy.h>
#include <linux/oom.h>
#include <linux/elf.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/debug.h>
#include <linux/sched/stat.h>
#include <linux/posix-timers.h>
#include <linux/time_namespace.h>
#include <linux/resctrl.h>
#include <linux/cn_proc.h>
#include <linux/ksm.h>
#include <uapi/linux/lsm.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"

#include "../../lib/kstrtox.h"

/* NOTE:
 *        Implementing inode permission operations in /proc is almost
 *        certainly an error.  Permission checks need to happen during
 *        each system call not at open time.  The reason is that most of
 *        what we wish to check for permissions in /proc varies at runtime.
 *
 *        The classic example of a problem is opening file descriptors
 *        in /proc for a task before it execs a suid executable.
 */

/*
 * Written during init only (__ro_after_init).
 * NOTE(review): presumably the cached link counts for the per-thread (tid)
 * and per-thread-group (tgid) directories, derived from pid_entry_nlink() --
 * the code that sets them is outside this part of the file; confirm there.
 */
static u8 nlink_tid __ro_after_init;
static u8 nlink_tgid __ro_after_init;

/*
 * Description of one named entry, as built by the NOD() family of
 * initializer macros (NOD, DIR, LNK, REG, ONE, ATTR).
 */
struct pid_entry {
        /* Entry name; NOD() stores its NAME argument here. */
        const char *name;
        /* Name length; NOD() sets it to sizeof(NAME) - 1. */
        unsigned int len;
        /* File type bits (S_IFDIR/S_IFLNK/S_IFREG via the wrappers) plus permissions. */
        umode_t mode;
        /* Inode operations; NULL in entries made by REG(), ONE() and ATTR(). */
        const struct inode_operations *iop;
        /* File operations; NULL in entries made by LNK(). */
        const struct file_operations *fop;
        /* Per-entry payload, e.g. .proc_get_link, .proc_show or .lsmid. */
        union proc_op op;
};

/*
 * Generic struct pid_entry initializer. NAME is evaluated by sizeof(), so
 * .len is only the string length when NAME is a string literal (or array).
 */
#define NOD(NAME, MODE, IOP, FOP, OP) {                        \
        .name = (NAME),                                        \
        .len  = sizeof(NAME) - 1,                        \
        .mode = MODE,                                        \
        .iop  = IOP,                                        \
        .fop  = FOP,                                        \
        .op   = OP,                                        \
}

/* Directory entry: S_IFDIR plus MODE, with both inode and file operations. */
#define DIR(NAME, MODE, iops, fops)        \
        NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
/*
 * Symlink entry: S_IFLNK|S_IRWXUGO, proc_pid_link_inode_operations, no file
 * operations; get_link is stored in .op.proc_get_link.
 */
#define LNK(NAME, get_link)                                        \
        NOD(NAME, (S_IFLNK|S_IRWXUGO),                                \
                &proc_pid_link_inode_operations, NULL,                \
                { .proc_get_link = get_link } )
/* Regular file entry with caller-supplied file operations. */
#define REG(NAME, MODE, fops)                                \
        NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
/*
 * Regular file entry served by proc_single_file_operations; show is stored
 * in .op.proc_show.
 */
#define ONE(NAME, MODE, show)                                \
        NOD(NAME, (S_IFREG|(MODE)),                        \
                NULL, &proc_single_file_operations,        \
                { .proc_show = show } )
/*
 * Regular file entry served by proc_pid_attr_operations; LSMID is stored in
 * .op.lsmid.
 */
#define ATTR(LSMID, NAME, MODE)                                \
        NOD(NAME, (S_IFREG|(MODE)),                        \
                NULL, &proc_pid_attr_operations,        \
                { .lsmid = LSMID })

/*
 * Compute a link count from a pid_entry table.
 *
 * @entries: table to scan
 * @n:       number of elements in @entries
 *
 * The result starts at 2 and is incremented once for every table element
 * whose mode marks it as a directory (S_ISDIR).
 */
static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
        unsigned int n)
{
        unsigned int nlink = 2;
        unsigned int idx = 0;

        while (idx < n) {
                if (S_ISDIR(entries[idx].mode))
                        nlink++;
                idx++;
        }

        return nlink;
}

/*
 * Fill @root with the root path of @task's fs context.
 *
 * task->fs is inspected and used with task_lock() held. Returns 0 after
 * get_fs_root() has filled @root, or -ENOENT when @task has no fs.
 */
static int get_task_root(struct task_struct *task, struct path *root)
{
        int err = 0;

        task_lock(task);
        if (!task->fs)
                err = -ENOENT;
        else
                get_fs_root(task->fs, root);
        task_unlock(task);

        return err;
}

/*
 * Resolve the working directory of the task behind @dentry into @path.
 *
 * Returns 0 once get_fs_pwd() has filled @path, or -ENOENT when the task
 * cannot be obtained or has no fs. The task reference taken by
 * get_proc_task() is dropped before returning.
 */
static int proc_cwd_link(struct dentry *dentry, struct path *path)
{
        struct task_struct *tsk = get_proc_task(d_inode(dentry));
        int err = -ENOENT;

        if (!tsk)
                return err;

        /* tsk->fs is only looked at with the task lock held. */
        task_lock(tsk);
        if (tsk->fs) {
                get_fs_pwd(tsk->fs, path);
                err = 0;
        }
        task_unlock(tsk);

        put_task_struct(tsk);
        return err;
}

/*
 * Resolve the root directory of the task behind @dentry into @path.
 *
 * Returns the result of get_task_root(), or -ENOENT when the task cannot be
 * obtained. The task reference taken by get_proc_task() is dropped before
 * returning.
 */
static int proc_root_link(struct dentry *dentry, struct path *path)
{
        struct task_struct *tsk = get_proc_task(d_inode(dentry));
        int err;

        if (!tsk)
                return -ENOENT;

        err = get_task_root(tsk, path);
        put_task_struct(tsk);
        return err;
}

/*
 * setproctitle() case: read up to one page of the string that starts at
 * @arg_start in @mm and copy the part beginning at offset @pos to @buf.
 *
 * @mm:        address space to read from
 * @buf:       user buffer receiving the bytes
 * @count:     maximum number of bytes to copy
 * @pos:       offset into the string to start from
 * @arg_start: user address of the start of the string
 *
 * Returns the number of bytes copied, 0 when @pos is at or beyond the
 * available data (or nothing could be read from @mm), -ENOMEM when the
 * bounce page cannot be allocated, or -EFAULT when no byte reached @buf.
 */
static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
                                size_t count, unsigned long pos,
                                unsigned long arg_start)
{
        char *kbuf;
        int nread, slen;
        int result = 0;

        if (pos >= PAGE_SIZE)
                return 0;

        kbuf = (char *)__get_free_page(GFP_KERNEL);
        if (!kbuf)
                return -ENOMEM;

        nread = access_remote_vm(mm, arg_start, kbuf, PAGE_SIZE, FOLL_ANON);
        if (nread <= 0)
                goto out;

        slen = strnlen(kbuf, nread);

        /* Include the NUL character if it was found */
        if (slen < nread)
                slen++;

        /* Nothing available at or past the requested offset */
        if (slen <= pos)
                goto out;

        slen -= pos;
        if (slen > count)
                slen = count;

        /* Subtract whatever copy_to_user() left uncopied */
        slen -= copy_to_user(buf, kbuf + pos, slen);
        result = slen ? slen : -EFAULT;
out:
        free_page((unsigned long)kbuf);
        return result;
}

/*
 * Copy part of the command line stored in @mm to user space.
 *
 * @mm:    address space holding the argument (and environment) area
 * @buf:   user buffer receiving the bytes
 * @count: maximum number of bytes to copy
 * @ppos:  offset to start reading from; only read here, never written
 *
 * Returns the number of bytes copied, 0 when there is nothing to read at
 * that offset, -ENOMEM when the bounce page cannot be allocated, or -EFAULT
 * when not a single byte could be copied to @buf. When the last argv byte is
 * non-zero the result of get_mm_proctitle() is returned instead.
 */
static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
                              size_t count, loff_t *ppos)
{
        unsigned long arg_start, arg_end, env_start, env_end;
        unsigned long pos, len;
        char *page, c;

        /* Check if process spawned far enough to have cmdline. */
        if (!mm->env_end)
                return 0;

        /* Read all four boundaries while holding arg_lock. */
        spin_lock(&mm->arg_lock);
        arg_start = mm->arg_start;
        arg_end = mm->arg_end;
        env_start = mm->env_start;
        env_end = mm->env_end;
        spin_unlock(&mm->arg_lock);

        if (arg_start >= arg_end)
                return 0;

        /*
         * We allow setproctitle() to overwrite the argument
         * strings, and overflow past the original end. But
         * only when it overflows into the environment area.
         */
        if (env_start != arg_end || env_end < env_start)
                env_start = env_end = arg_end;
        /* Readable span: argument area plus the adjoining environment, if any. */
        len = env_end - arg_start;

        /* We're not going to care if "*ppos" has high bits set */
        pos = *ppos;
        if (pos >= len)
                return 0;
        /* Clamp the request to what is left after pos. */
        if (count > len - pos)
                count = len - pos;
        if (!count)
                return 0;

        /*
         * Magical special case: if the argv[] end byte is not
         * zero, the user has overwritten it with setproctitle(3).
         *
         * Possible future enhancement: do this only once when
         * pos is 0, and set a flag in the 'struct file'.
         */
        if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c)
                return get_mm_proctitle(mm, buf, count, pos, arg_start);

        /*
         * For the non-setproctitle() case we limit things strictly
         * to the [arg_start, arg_end[ range.
         */
        /* From here on pos is an address in mm; pos < arg_start catches wrap-around. */
        pos += arg_start;
        if (pos < arg_start || pos >= arg_end)
                return 0;
        if (count > arg_end - pos)
                count = arg_end - pos;

        /* Bounce buffer: data goes remote mm -> page -> user buffer. */
        page = (char *)__get_free_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        /* len is reused as the running total of bytes delivered to buf. */
        len = 0;
        while (count) {
                int got;
                size_t size = min_t(size_t, PAGE_SIZE, count);

                got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
                if (got <= 0)
                        break;
                /* copy_to_user() returns the bytes NOT copied; got becomes bytes delivered. */
                got -= copy_to_user(buf, page, got);
                if (unlikely(!got)) {
                        /* Report -EFAULT only if nothing was delivered earlier. */
                        if (!len)
                                len = -EFAULT;
                        break;
                }
                pos += got;
                buf += got;
                len += got;
                count -= got;
        }

        free_page((unsigned long)page);
        /* len is unsigned long; a stored -EFAULT goes back out via the ssize_t return. */
        return len;
}

/*
 * Read the command line of @tsk through its mm.
 *
 * Returns whatever get_mm_cmdline() returns, or 0 when get_task_mm() yields
 * no mm for the task. The mm reference is dropped with mmput() before
 * returning.
 */
static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf,
                                size_t count, loff_t *pos)
{
        struct mm_struct *mm = get_task_mm(tsk);
        ssize_t nread = 0;

        if (mm) {
                nread = get_mm_cmdline(mm, buf, count, pos);
                mmput(mm);
        }

        return nread;
}

/*
 * .read handler for the cmdline file.
 *
 * Looks up the task behind @file, reads its command line at offset *@pos
 * and advances *@pos by the number of bytes returned. Returns -ESRCH when
 * the task cannot be obtained; otherwise the result of get_task_cmdline().
 */
static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
                                     size_t count, loff_t *pos)
{
        struct task_struct *task;
        ssize_t nread;

        BUG_ON(*pos < 0);

        task = get_proc_task(file_inode(file));
        if (!task)
                return -ESRCH;

        nread = get_task_cmdline(task, buf, count, pos);
        put_task_struct(task);

        /* Errors and end-of-data leave the file position untouched. */
        if (nread <= 0)
                return nread;

        *pos += nread;
        return nread;
}

/* File operations for the cmdline file; only .read and .llseek are set. */
static const struct file_operations proc_pid_cmdline_ops = {
        .llseek = generic_file_llseek,
        .read   = proc_pid_cmdline_read,
};

#ifdef CONFIG_KALLSYMS
/*
 * Provides a wchan file via kallsyms in a one-value-per-file format.
 *
 * Writes the symbol name for the address reported by get_wchan() for @task.
 * The single character '0' is written instead when the reader fails the
 * PTRACE_MODE_READ_FSCREDS check, when get_wchan() returns 0, or when
 * lookup_symbol_name() fails. Always returns 0.
 */
static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
                          struct pid *pid, struct task_struct *task)
{
        char symname[KSYM_NAME_LEN];
        unsigned long wchan;

        if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
                wchan = get_wchan(task);
                if (wchan && !lookup_symbol_name(wchan, symname)) {
                        seq_puts(m, symname);
                        return 0;
                }
        }

        /* No access, no wchan, or no symbol for it. */
        seq_putc(m, '0');
        return 0;
}
#endif /* CONFIG_KALLSYMS */

/*
 * Take @task's exec_update_lock for reading and check that the caller has
 * PTRACE_MODE_ATTACH_FSCREDS access to it.
 *
 * Returns 0 with the lock held (release it with unlock_trace()), the error
 * from down_read_killable() with the lock not held, or -EPERM with the lock
 * already released again.
 */
static int lock_trace(struct task_struct *task)
{
        int err;

        err = down_read_killable(&task->signal->exec_update_lock);
        if (err)
                return err;

        if (ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS))
                return 0;

        /* Access denied: do not leave the lock held on the failure path. */
        up_read(&task->signal->exec_update_lock);
        return -EPERM;
}

static void unlock_trace(struct task_struct *task)
{
        up_read(&task->signal->exec_update_lock);
}

#ifdef CONFIG_STACKTRACE

/* Upper bound on the number of stack entries captured per read. */
#define MAX_STACK_TRACE_DEPTH        64

/*
 * Print a kernel stack trace of @task, one "[<0>] <symbol>" line per entry.
 *
 * Returns -EACCES unless the opener of the file holds CAP_SYS_ADMIN in the
 * initial user namespace, -ENOMEM if the entry buffer cannot be allocated,
 * the error from lock_trace() if that fails, and 0 on success.
 */
static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
                          struct pid *pid, struct task_struct *task)
{
        unsigned int nr_entries, idx;
        unsigned long *entries;
        int err;

        /*
         * Running the unwinder racily against a live task and exposing its
         * output is risky: useful for debugging, but it may leak kernel
         * stack contents to an attacker.  Making it race-free would need
         * the remote task to be kept off the CPU, and the unwinder would
         * still be local attack surface.  Hence the interface is limited
         * to root.
         */
        if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
                return -EACCES;

        entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
                                GFP_KERNEL);
        if (!entries)
                return -ENOMEM;

        err = lock_trace(task);
        if (err)
                goto out_free;

        nr_entries = stack_trace_save_tsk(task, entries,
                                          MAX_STACK_TRACE_DEPTH, 0);
        for (idx = 0; idx < nr_entries; idx++)
                seq_printf(m, "[<0>] %pB\n", (void *)entries[idx]);

        unlock_trace(task);

out_free:
        kfree(entries);
        return err;
}
#endif

#ifdef CONFIG_SCHED_INFO
/*
 * Provides /proc/PID/schedstat.
 *
 * Emits se.sum_exec_runtime, sched_info.run_delay and sched_info.pcount
 * on one line, or the placeholder "0 0 0" when sched_info_on() says the
 * statistics are unavailable.  Always returns 0.
 */
static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
                              struct pid *pid, struct task_struct *task)
{
        if (unlikely(!sched_info_on())) {
                seq_puts(m, "0 0 0\n");
                return 0;
        }

        seq_printf(m, "%llu %llu %lu\n",
                   (unsigned long long)task->se.sum_exec_runtime,
                   (unsigned long long)task->sched_info.run_delay,
                   task->sched_info.pcount);
        return 0;
}
#endif

#ifdef CONFIG_LATENCYTOP
/*
 * seq_file show callback for the per-task latency statistics file.
 *
 * Prints a version banner followed by one line per used latency record:
 * "count time max" and then the symbolised backtrace entries up to the
 * first zero slot.  A record whose first backtrace slot is zero is
 * skipped.  Returns -ESRCH if the task cannot be found, 0 otherwise.
 */
static int lstats_show_proc(struct seq_file *m, void *v)
{
        struct inode *inode = m->private;
        struct task_struct *task = get_proc_task(inode);
        int rec;

        if (!task)
                return -ESRCH;

        seq_puts(m, "Latency Top version : v0.1\n");

        for (rec = 0; rec < LT_SAVECOUNT; rec++) {
                struct latency_record *lr = &task->latency_record[rec];
                int depth;

                /* Unused slot. */
                if (!lr->backtrace[0])
                        continue;

                seq_printf(m, "%i %li %li", lr->count, lr->time, lr->max);

                for (depth = 0;
                     depth < LT_BACKTRACEDEPTH && lr->backtrace[depth];
                     depth++)
                        seq_printf(m, " %ps", (void *)lr->backtrace[depth]);

                seq_putc(m, '\n');
        }

        put_task_struct(task);
        return 0;
}

/*
 * open() handler: sets up a single-shot seq_file that runs
 * lstats_show_proc() with @inode as its private data.
 */
static int lstats_open(struct inode *inode, struct file *file)
{
        return single_open(file, lstats_show_proc, inode);
}

/*
 * write() handler for the latency statistics file.
 *
 * The written bytes are never looked at: any write clears the task's
 * latency tracing data and reports all @count bytes as consumed.
 * Returns -ESRCH if the task cannot be found.
 */
static ssize_t lstats_write(struct file *file, const char __user *buf,
                            size_t count, loff_t *offs)
{
        struct task_struct *task;

        task = get_proc_task(file_inode(file));
        if (!task)
                return -ESRCH;

        clear_tsk_latency_tracing(task);
        put_task_struct(task);
        return count;
}

/*
 * File operations for the latency statistics file: seq_file based reads,
 * plus a write hook that clears the records.
 */
static const struct file_operations proc_lstats_operations = {
        .open                = lstats_open,
        .read                = seq_read,
        .write                = lstats_write,
        .llseek                = seq_lseek,
        .release        = single_release,
};

#endif

/*
 * Print the OOM score of @task as a single unsigned number.
 *
 * oom_badness() is evaluated against the sum of RAM and swap pages.  A
 * result of LONG_MIN is reported as 0; any other result is scaled by the
 * formula below.  Always returns 0.
 */
static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
                          struct pid *pid, struct task_struct *task)
{
        unsigned long totalpages = totalram_pages() + total_swap_pages;
        long badness = oom_badness(task, totalpages);
        unsigned long points;

        /*
         * OOM_SCORE_ADJ_MIN is special-cased; for everything else the
         * badness value is scaled into the [0, 2000] range that has been
         * exported for a long time, so userspace might depend on it.
         */
        if (badness == LONG_MIN)
                points = 0;
        else
                points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3;

        seq_printf(m, "%lu\n", points);
        return 0;
}

struct limit_names {
        const char *name;
        const char *unit;
};

/*
 * Label and unit for each resource limit, indexed by RLIMIT_* number.
 * Consumed by proc_pid_limits(); entries with a NULL unit are printed
 * without a unit column.
 */
static const struct limit_names lnames[RLIM_NLIMITS] = {
        [RLIMIT_CPU] = {"Max cpu time", "seconds"},
        [RLIMIT_FSIZE] = {"Max file size", "bytes"},
        [RLIMIT_DATA] = {"Max data size", "bytes"},
        [RLIMIT_STACK] = {"Max stack size", "bytes"},
        [RLIMIT_CORE] = {"Max core file size", "bytes"},
        [RLIMIT_RSS] = {"Max resident set", "bytes"},
        [RLIMIT_NPROC] = {"Max processes", "processes"},
        [RLIMIT_NOFILE] = {"Max open files", "files"},
        [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
        [RLIMIT_AS] = {"Max address space", "bytes"},
        [RLIMIT_LOCKS] = {"Max file locks", "locks"},
        [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
        [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
        [RLIMIT_NICE] = {"Max nice priority", NULL},
        [RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
        [RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
};

/* Display limits for a process */
static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
                           struct pid *pid, struct task_struct *task)
{
        unsigned int i;
        unsigned long flags;

        struct rlimit rlim[RLIM_NLIMITS];

        if (!lock_task_sighand(task, &flags))
                return 0;
        memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
        unlock_task_sighand(task, &flags);

        /*
         * print the file header
         */
        seq_puts(m, "Limit                     "
                "Soft Limit           "
                "Hard Limit           "
                "Units     \n");

        for (i = 0; i < RLIM_NLIMITS; i++) {
                if (rlim[i].rlim_cur == RLIM_INFINITY)
                        seq_printf(m, "%-25s %-20s ",
                                   lnames[i].name, "unlimited");
                else
                        seq_printf(m, "%-25s %-20lu ",
                                   lnames[i].name, rlim[i].rlim_cur);

                if (rlim[i].rlim_max == RLIM_INFINITY)
                        seq_printf(m, "%-20s ", "unlimited");
                else
                        seq_printf(m, "%-20lu ", rlim[i].rlim_max);

                if (lnames[i].unit)
                        seq_printf(m, "%-10s\n", lnames[i].unit);
                else
                        seq_putc(m, '\n');
        }

        return 0;
}

#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
/*
 * Report the system call @task is currently in.
 *
 * Output is one of:
 *   "running"               - task_current_syscall() returned non-zero
 *   "<nr> <sp> <ip>"        - the reported syscall number is negative
 *   "<nr> <6 args> <sp> <ip>" - otherwise
 *
 * Returns the error from lock_trace() if access is refused, else 0.
 */
static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
                            struct pid *pid, struct task_struct *task)
{
        struct syscall_info info;
        int err;

        err = lock_trace(task);
        if (err)
                return err;

        if (task_current_syscall(task, &info)) {
                seq_puts(m, "running\n");
        } else if (info.data.nr >= 0) {
                seq_printf(m,
                       "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n",
                       info.data.nr,
                       info.data.args[0], info.data.args[1],
                       info.data.args[2], info.data.args[3],
                       info.data.args[4], info.data.args[5],
                       info.sp, info.data.instruction_pointer);
        } else {
                seq_printf(m, "%d 0x%llx 0x%llx\n",
                           info.data.nr, info.sp, info.data.instruction_pointer);
        }

        unlock_trace(task);
        return 0;
}
#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */

/************************************************************************/
/*                       Here the fs part begins                        */
/************************************************************************/

/* permission checks */
/*
 * May the caller inspect the file descriptors of the task behind @inode?
 *
 * Access is granted only when the task still exists and
 * ptrace_may_access() allows PTRACE_MODE_READ_FSCREDS on it.
 */
static bool proc_fd_access_allowed(struct inode *inode)
{
        struct task_struct *task = get_proc_task(inode);
        bool allowed;

        if (!task)
                return false;

        allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
        put_task_struct(task);
        return allowed;
}

/*
 * setattr handler for proc inodes.
 *
 * Mode changes are rejected with -EPERM.  Any other change is validated
 * by setattr_prepare() and, if accepted, copied into the inode.  The
 * @idmap argument is not used; nop_mnt_idmap is passed to both helpers.
 * Returns 0 on success or the error from setattr_prepare().
 */
int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                 struct iattr *attr)
{
        struct inode *inode = d_inode(dentry);
        int error;

        if (attr->ia_valid & ATTR_MODE)
                return -EPERM;

        error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
        if (!error)
                setattr_copy(&nop_mnt_idmap, inode, attr);

        return error;
}

/*
 * May current process learn task's sched/cmdline info (for hide_pid_min=1)
 * or euid/egid (for hide_pid_min=2)?
 *
 * Returns true when access is allowed under the mount's hide_pid policy.
 */
static bool has_pid_permissions(struct proc_fs_info *fs_info,
                                 struct task_struct *task,
                                 enum proc_hidepid hide_pid_min)
{
        /*
         * With HIDEPID_NOT_PTRACEABLE the shortcuts below are skipped and
         * the decision rests on the ptrace check alone.
         */
        if (fs_info->hide_pid != HIDEPID_NOT_PTRACEABLE) {
                /* Policy is laxer than what this object requires. */
                if (fs_info->hide_pid < hide_pid_min)
                        return true;
                /* Members of the configured group are exempt. */
                if (in_group_p(fs_info->pid_gid))
                        return true;
        }

        /*
         * PTRACE_MODE_READ_FSCREDS indicates that the check is made on
         * behalf of a filesystem syscall.
         */
        return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
}


/*
 * permission handler for per-task proc inodes.
 *
 * Returns -ESRCH if the task is gone.  If has_pid_permissions() denies
 * access at the HIDEPID_NO_ACCESS level the result is -ENOENT for
 * HIDEPID_INVISIBLE mounts and -EPERM otherwise.  When access is allowed
 * the decision is delegated to generic_permission().
 */
static int proc_pid_permission(struct mnt_idmap *idmap,
                               struct inode *inode, int mask)
{
        struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
        struct task_struct *task = get_proc_task(inode);
        bool has_perms;

        if (!task)
                return -ESRCH;

        has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS);
        put_task_struct(task);

        if (has_perms)
                return generic_permission(&nop_mnt_idmap, inode, mask);

        /*
         * Keep getdents(), stat() and open() consistent with each other:
         * if a process may not stat() a file, the file should not be
         * visible in procfs at all.
         */
        if (fs_info->hide_pid == HIDEPID_INVISIBLE)
                return -ENOENT;

        return -EPERM;
}



/* Inode operations that provide only the proc_setattr() handler. */
static const struct inode_operations proc_def_inode_operations = {
        .setattr        = proc_setattr,
};

/*
 * seq_file show callback shared by the single-value per-task files.
 *
 * Takes a reference on the task identified by the inode's pid, invokes
 * the inode's proc_show operation with it, and drops the reference.
 * Returns -ESRCH if no task is attached to the pid any more, otherwise
 * the result of proc_show.
 */
static int proc_single_show(struct seq_file *m, void *v)
{
        struct inode *inode = m->private;
        struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
        struct pid *pid = proc_pid(inode);
        struct task_struct *task = get_pid_task(pid, PIDTYPE_PID);
        int ret;

        if (!task)
                return -ESRCH;

        ret = PROC_I(inode)->op.proc_show(m, ns, pid, task);
        put_task_struct(task);

        return ret;
}

/*
 * open() handler: sets up a single-shot seq_file that runs
 * proc_single_show() with @inode as its private data.
 */
static int proc_single_open(struct inode *inode, struct file *filp)
{
        return single_open(filp, proc_single_show, inode);
}

/* Read-only seq_file operations built around proc_single_show(). */
static const struct file_operations proc_single_file_operations = {
        .open                = proc_single_open,
        .read                = seq_read,
        .llseek                = seq_lseek,
        .release        = single_release,
};


/*
 * Look up the mm of the task behind @inode, subject to a ptrace access
 * check with @mode (PTRACE_MODE_FSCREDS is OR-ed in).
 *
 * Returns ERR_PTR(-ESRCH) if the task is gone, or whatever mm_access()
 * returned.  For a valid mm the reference obtained from mm_access() is
 * swapped for an mmgrab() reference, so the mm_struct stays allocated
 * without its memory being pinned.
 */
struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
{
        struct task_struct *task = get_proc_task(inode);
        struct mm_struct *mm;

        if (!task)
                return ERR_PTR(-ESRCH);

        mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
        put_task_struct(task);

        /* Errors and NULL are handed back to the caller unchanged. */
        if (IS_ERR_OR_NULL(mm))
                return mm;

        /* ensure this mm_struct can't be freed */
        mmgrab(mm);
        /* but do not pin its memory */
        mmput(mm);

        return mm;
}

/*
 * Common open helper for the mm-backed files.
 *
 * Stores the result of proc_mem_open() in file->private_data.  Note that
 * a NULL mm is stored too and counts as success; only an error pointer is
 * turned into a negative return value.
 */
static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
{
        struct mm_struct *mm;

        mm = proc_mem_open(inode, mode);
        if (IS_ERR(mm))
                return PTR_ERR(mm);

        file->private_data = mm;
        return 0;
}

/*
 * open() handler for the mem file: requires ptrace attach-level access.
 *
 * FMODE_UNSIGNED_OFFSET is set whether or not __mem_open() succeeded.
 */
static int mem_open(struct inode *inode, struct file *file)
{
        int ret;

        ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);

        /* OK to pass negative loff_t, we can catch out-of-range */
        file->f_mode |= FMODE_UNSIGNED_OFFSET;

        return ret;
}

/*
 * Common implementation behind mem_read() and mem_write().
 *
 * Moves up to @count bytes between the user buffer @buf and the address
 * space stored in file->private_data, starting at the address held in
 * *@ppos.  Data is bounced through one kernel page, at most PAGE_SIZE
 * bytes per iteration.  A non-zero @write copies from @buf into the
 * remote mm; zero copies the other way.
 *
 * Returns the number of bytes transferred; 0 if no mm is attached or
 * mmget_not_zero() fails; -ENOMEM if the bounce page cannot be allocated;
 * -EIO if the very first access_remote_vm() call transfers nothing;
 * -EFAULT if copying to or from @buf fails (this replaces any byte count
 * accumulated so far).  *@ppos is updated to the last address reached
 * whenever the loop was entered.
 */
static ssize_t mem_rw(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos, int write)
{
        struct mm_struct *mm = file->private_data;
        unsigned long addr = *ppos;
        ssize_t copied;
        char *page;
        unsigned int flags;

        if (!mm)
                return 0;

        page = (char *)__get_free_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        copied = 0;
        /* Take a user reference on the mm for the duration of the copy. */
        if (!mmget_not_zero(mm))
                goto free;

        flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);

        while (count > 0) {
                size_t this_len = min_t(size_t, count, PAGE_SIZE);

                /* Writes: stage the user data in the bounce page first. */
                if (write && copy_from_user(page, buf, this_len)) {
                        copied = -EFAULT;
                        break;
                }

                /* this_len becomes the number of bytes actually accessed. */
                this_len = access_remote_vm(mm, addr, page, this_len, flags);
                if (!this_len) {
                        /* Report -EIO only if nothing was transferred at all. */
                        if (!copied)
                                copied = -EIO;
                        break;
                }

                /* Reads: hand the fetched bytes to userspace. */
                if (!write && copy_to_user(buf, page, this_len)) {
                        copied = -EFAULT;
                        break;
                }

                buf += this_len;
                addr += this_len;
                copied += this_len;
                count -= this_len;
        }
        *ppos = addr;

        mmput(mm);
free:
        free_page((unsigned long) page);
        return copied;
}

/* read() handler for the mem file: mem_rw() in the read direction. */
static ssize_t mem_read(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos)
{
        return mem_rw(file, buf, count, ppos, 0);
}

/*
 * write() handler for the mem file: mem_rw() in the write direction.
 * The const qualifier is cast away only because mem_rw() shares one
 * buffer parameter for both directions; with write == 1 it only reads
 * from @buf.
 */
static ssize_t mem_write(struct file *file, const char __user *buf,
                         size_t count, loff_t *ppos)
{
        return mem_rw(file, (char __user*)buf, count, ppos, 1);
}

/*
 * llseek handler for the mem file.
 *
 * @orig 0 sets the position to @offset, @orig 1 adds @offset to the
 * current position (presumably SEEK_SET and SEEK_CUR); any other value
 * yields -EINVAL.  No range checking is applied to the new position, and
 * force_successful_syscall_return() is called before it is returned.
 */
loff_t mem_lseek(struct file *file, loff_t offset, int orig)
{
        if (orig == 0)
                file->f_pos = offset;
        else if (orig == 1)
                file->f_pos += offset;
        else
                return -EINVAL;

        force_successful_syscall_return();
        return file->f_pos;
}

/*
 * release() handler shared by the mm-backed files: drops the mm
 * reference kept in file->private_data, if there is one.  Always
 * returns 0.
 */
static int mem_release(struct inode *inode, struct file *file)
{
        struct mm_struct *mm = file->private_data;

        if (!mm)
                return 0;

        mmdrop(mm);
        return 0;
}

/* File operations for the mem file: seekable read/write access to an mm. */
static const struct file_operations proc_mem_operations = {
        .llseek                = mem_lseek,
        .read                = mem_read,
        .write                = mem_write,
        .open                = mem_open,
        .release        = mem_release,
};

/* open() handler for the environ file: requires ptrace read-level access. */
static int environ_open(struct inode *inode, struct file *file)
{
        return __mem_open(inode, file, PTRACE_MODE_READ);
}

/*
 * read() handler for the environ file.
 *
 * Copies bytes from the [env_start, env_end) range of the mm stored in
 * file->private_data into @buf, treating *@ppos as an offset from
 * env_start and bouncing through one kernel page.
 *
 * Returns the number of bytes copied; 0 if there is no mm, env_end is
 * still zero, the offset is at or past the end of the range, or
 * mmget_not_zero() fails; -ENOMEM if the bounce page cannot be allocated;
 * -EFAULT if copy_to_user() fails.  If access_remote_vm() returns a value
 * <= 0 the loop stops and that value becomes the return value, replacing
 * any running total.
 */
static ssize_t environ_read(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos)
{
        char *page;
        unsigned long src = *ppos;
        int ret = 0;
        struct mm_struct *mm = file->private_data;
        unsigned long env_start, env_end;

        /* Ensure the process spawned far enough to have an environment. */
        if (!mm || !mm->env_end)
                return 0;

        page = (char *)__get_free_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        ret = 0;
        if (!mmget_not_zero(mm))
                goto free;

        /* Snapshot both bounds together under arg_lock. */
        spin_lock(&mm->arg_lock);
        env_start = mm->env_start;
        env_end = mm->env_end;
        spin_unlock(&mm->arg_lock);

        while (count > 0) {
                size_t this_len, max_len;
                int retval;

                /* Offset is at or beyond the end of the environment area. */
                if (src >= (env_end - env_start))
                        break;

                /* Bytes left in the area, capped by one page and by @count. */
                this_len = env_end - (env_start + src);

                max_len = min_t(size_t, PAGE_SIZE, count);
                this_len = min(max_len, this_len);

                retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON);

                if (retval <= 0) {
                        ret = retval;
                        break;
                }

                if (copy_to_user(buf, page, retval)) {
                        ret = -EFAULT;
                        break;
                }

                ret += retval;
                src += retval;
                buf += retval;
                count -= retval;
        }
        *ppos = src;
        mmput(mm);

free:
        free_page((unsigned long) page);
        return ret;
}

/* File operations for the environ file: read-only, seekable. */
static const struct file_operations proc_environ_operations = {
        .open                = environ_open,
        .read                = environ_read,
        .llseek                = generic_file_llseek,
        .release        = mem_release,
};

/* open() handler for the auxv file: requires ptrace read-level access. */
static int auxv_open(struct inode *inode, struct file *file)
{
        return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
}

/*
 * read() handler for the auxv file.
 *
 * Exposes mm->saved_auxv up to and including the first entry whose type
 * word is 0 (AT_NULL).  Entries are two words each.  Returns 0 when no
 * mm is attached, otherwise the result of simple_read_from_buffer().
 *
 * NOTE(review): the scan has no explicit upper bound; it relies on
 * saved_auxv always containing a terminating entry - confirm against the
 * code that fills saved_auxv.
 */
static ssize_t auxv_read(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos)
{
        struct mm_struct *mm = file->private_data;
        unsigned int nwords;

        if (!mm)
                return 0;

        /* Start with one entry and grow until its type word is AT_NULL. */
        nwords = 2;
        while (mm->saved_auxv[nwords - 2] != 0)
                nwords += 2;

        return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv,
                                       nwords * sizeof(mm->saved_auxv[0]));
}

/* File operations for the auxv file: read-only, seekable. */
static const struct file_operations proc_auxv_operations = {
        .open                = auxv_open,
        .read                = auxv_read,
        .llseek                = generic_file_llseek,
        .release        = mem_release,
};

/*
 * read() handler for the legacy oom_adj file.
 *
 * Converts the task's oom_score_adj back to the oom_adj scale:
 * OOM_SCORE_ADJ_MAX maps to OOM_ADJUST_MAX, everything else is scaled
 * linearly and capped at OOM_ADJUST_MAX.  The value is returned as a
 * decimal string followed by a newline.  Returns -ESRCH if the task
 * cannot be found.
 */
static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
                            loff_t *ppos)
{
        struct task_struct *task = get_proc_task(file_inode(file));
        char buffer[PROC_NUMBUF];
        size_t len;
        int oom_adj;

        if (!task)
                return -ESRCH;

        if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MAX)
                oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
                          OOM_SCORE_ADJ_MAX;
        else
                oom_adj = OOM_ADJUST_MAX;
        put_task_struct(task);

        if (oom_adj > OOM_ADJUST_MAX)
                oom_adj = OOM_ADJUST_MAX;

        len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
        return simple_read_from_buffer(buf, count, ppos, buffer, len);
}

/*
 * Store a new oom_score_adj for the task behind @file.
 *
 * @oom_adj: value to write into signal->oom_score_adj (already on the
 *           oom_score_adj scale)
 * @legacy:  true when called from the oom_adj write path, false when
 *           called from the oom_score_adj write path
 *
 * Without CAP_SYS_RESOURCE the value may not go below the current
 * oom_score_adj (legacy) or below oom_score_adj_min (non-legacy).  On the
 * non-legacy path a caller with CAP_SYS_RESOURCE also updates
 * oom_score_adj_min.  If the mm is flagged MMF_MULTIPROCESS, the same
 * values are propagated to other processes sharing it.
 *
 * Runs under oom_adj_mutex.  Returns 0 on success, -ESRCH if the task
 * cannot be found, -EACCES if the capability check fails.
 */
static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
{
        struct mm_struct *mm = NULL;
        struct task_struct *task;
        int err = 0;

        task = get_proc_task(file_inode(file));
        if (!task)
                return -ESRCH;

        mutex_lock(&oom_adj_mutex);
        if (legacy) {
                if (oom_adj < task->signal->oom_score_adj &&
                                !capable(CAP_SYS_RESOURCE)) {
                        err = -EACCES;
                        goto err_unlock;
                }
                /*
                 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
                 * /proc/pid/oom_score_adj instead.
                 */
                pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
                          current->comm, task_pid_nr(current), task_pid_nr(task),
                          task_pid_nr(task));
        } else {
                if ((short)oom_adj < task->signal->oom_score_adj_min &&
                                !capable(CAP_SYS_RESOURCE)) {
                        err = -EACCES;
                        goto err_unlock;
                }
        }

        /*
         * Make sure we will check other processes sharing the mm if this is
         * not vfork which wants its own oom_score_adj.
         * pin the mm so it doesn't go away and get reused after task_unlock
         */
        if (!task->vfork_done) {
                struct task_struct *p = find_lock_task_mm(task);

                if (p) {
                        if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) {
                                mm = p->mm;
                                mmgrab(mm);
                        }
                        task_unlock(p);
                }
        }

        task->signal->oom_score_adj = oom_adj;
        if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
                task->signal->oom_score_adj_min = (short)oom_adj;
        trace_oom_score_adj_update(task);

        /* mm is non-NULL only when it is shared with other processes. */
        if (mm) {
                struct task_struct *p;

                rcu_read_lock();
                for_each_process(p) {
                        /* The target's own thread group was updated above. */
                        if (same_thread_group(task, p))
                                continue;

                        /* do not touch kernel threads or the global init */
                        if (p->flags & PF_KTHREAD || is_global_init(p))
                                continue;

                        task_lock(p);
                        if (!p->vfork_done && process_shares_mm(p, mm)) {
                                p->signal->oom_score_adj = oom_adj;
                                if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
                                        p->signal->oom_score_adj_min = (short)oom_adj;
                        }
                        task_unlock(p);
                }
                rcu_read_unlock();
                /* Drop the reference taken with mmgrab() above. */
                mmdrop(mm);
        }
err_unlock:
        mutex_unlock(&oom_adj_mutex);
        put_task_struct(task);
        return err;
}

/*
 * /proc/pid/oom_adj is kept only for backwards compatibility with older
 * kernels.  The effective policy is defined by oom_score_adj, which uses
 * a different scale: oom_adj grew exponentially whereas oom_score_adj
 * grows linearly.  Values written here are mapped linearly onto
 * oom_score_adj.  A process that is oom-disabled through oom_adj remains
 * oom-disabled with this implementation.
 *
 * oom_adj cannot be removed since existing userspace binaries use it.
 *
 * At most sizeof(buffer) - 1 bytes of input are parsed.  Returns the
 * (possibly truncated) byte count on success, -EFAULT if the user buffer
 * cannot be read, -EINVAL for values outside
 * [OOM_ADJUST_MIN, OOM_ADJUST_MAX] other than OOM_DISABLE, or the error
 * from kstrtoint() / __set_oom_adj().
 */
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
{
        char buffer[PROC_NUMBUF] = {};
        int oom_adj;
        int err;

        if (count > sizeof(buffer) - 1)
                count = sizeof(buffer) - 1;

        if (copy_from_user(buffer, buf, count))
                return -EFAULT;

        err = kstrtoint(strstrip(buffer), 0, &oom_adj);
        if (err)
                goto done;

        if (oom_adj != OOM_DISABLE &&
            (oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX)) {
                err = -EINVAL;
                goto done;
        }

        /*
         * Scale to the oom_score_adj range, making sure the maximum value
         * is always attainable.
         */
        if (oom_adj != OOM_ADJUST_MAX)
                oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
        else
                oom_adj = OOM_SCORE_ADJ_MAX;

        err = __set_oom_adj(file, oom_adj, true);
done:
        if (err < 0)
                return err;
        return count;
}

/* File operations for the legacy oom_adj file. */
static const struct file_operations proc_oom_adj_operations = {
        .read                = oom_adj_read,
        .write                = oom_adj_write,
        .llseek                = generic_file_llseek,
};

/*
 * read() handler for the oom_score_adj file: returns the task's
 * signal->oom_score_adj as a decimal string followed by a newline.
 * Returns -ESRCH if the task cannot be found.
 */
static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
                                        size_t count, loff_t *ppos)
{
        struct task_struct *task = get_proc_task(file_inode(file));
        char buffer[PROC_NUMBUF];
        short oom_score_adj;
        size_t len;

        if (!task)
                return -ESRCH;

        /* Sample the value, then let go of the task before formatting. */
        oom_score_adj = task->signal->oom_score_adj;
        put_task_struct(task);

        len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
        return simple_read_from_buffer(buf, count, ppos, buffer, len);
}

/*
 * write() handler for the oom_score_adj file.
 *
 * Parses an integer from at most sizeof(buffer) - 1 bytes of input and
 * applies it through __set_oom_adj().  Returns the (possibly truncated)
 * byte count on success, -EFAULT if the user buffer cannot be read,
 * -EINVAL for values outside [OOM_SCORE_ADJ_MIN, OOM_SCORE_ADJ_MAX], or
 * the error from kstrtoint() / __set_oom_adj().
 */
static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                                        size_t count, loff_t *ppos)
{
        char buffer[PROC_NUMBUF] = {};
        int oom_score_adj;
        int err;

        if (count > sizeof(buffer) - 1)
                count = sizeof(buffer) - 1;

        if (copy_from_user(buffer, buf, count))
                return -EFAULT;

        err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);
        if (err)
                goto done;

        if (oom_score_adj > OOM_SCORE_ADJ_MAX ||
            oom_score_adj < OOM_SCORE_ADJ_MIN) {
                err = -EINVAL;
                goto done;
        }

        err = __set_oom_adj(file, oom_score_adj, false);
done:
        if (err < 0)
                return err;
        return count;
}

/* File operations for the oom_score_adj file. */
static const struct file_operations proc_oom_score_adj_operations = {
        .read                = oom_score_adj_read,
        .write                = oom_score_adj_write,
        .llseek                = default_llseek,
};

#ifdef CONFIG_AUDIT
/* Room for ten decimal digits plus the terminating NUL. */
#define TMPBUFLEN 11
/*
 * read() handler for the loginuid file: prints the task's audit login
 * uid, translated into the user namespace of the file's opener, as an
 * unsigned decimal without a trailing newline.  Returns -ESRCH if the
 * task cannot be found.
 */
static ssize_t proc_loginuid_read(struct file *file, char __user *buf,
                                  size_t count, loff_t *ppos)
{
        struct task_struct *task = get_proc_task(file_inode(file));
        char tmpbuf[TMPBUFLEN];
        ssize_t length;

        if (!task)
                return -ESRCH;

        length = scnprintf(tmpbuf, sizeof(tmpbuf), "%u",
                           from_kuid(file->f_cred->user_ns,
                                     audit_get_loginuid(task)));
        put_task_struct(task);

        return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
}

/*
 * write() handler for the loginuid file.
 *
 * Only the task the file belongs to may write it, and never a kernel
 * thread.  The input is a base-10 uid; AUDIT_UID_UNSET explicitly clears
 * the login uid, any other value is mapped through the opener's user
 * namespace.
 *
 * Returns @count on success, -EPERM for kernel threads or foreign tasks,
 * -EINVAL for a non-zero file position or an unmappable uid, or the error
 * from kstrtou32_from_user() / audit_set_loginuid().
 */
static ssize_t proc_loginuid_write(struct file *file, const char __user *buf,
                                   size_t count, loff_t *ppos)
{
        struct inode *inode = file_inode(file);
        kuid_t kloginuid;
        uid_t loginuid;
        int rv;

        /* Don't let kthreads write their own loginuid */
        if (current->flags & PF_KTHREAD)
                return -EPERM;

        /* The writer must be the very task this file describes. */
        rcu_read_lock();
        rv = (current == pid_task(proc_pid(inode), PIDTYPE_PID)) ? 0 : -EPERM;
        rcu_read_unlock();
        if (rv)
                return rv;

        /* No partial writes. */
        if (*ppos != 0)
                return -EINVAL;

        rv = kstrtou32_from_user(buf, count, 10, &loginuid);
        if (rv < 0)
                return rv;

        /* is userspace trying to explicitly UNSET the loginuid? */
        if (loginuid != AUDIT_UID_UNSET) {
                kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
                if (!uid_valid(kloginuid))
                        return -EINVAL;
        } else {
                kloginuid = INVALID_UID;
        }

        rv = audit_set_loginuid(kloginuid);
        if (rv < 0)
                return rv;

        return count;
}

/* File operations for the loginuid file. */
static const struct file_operations proc_loginuid_operations = {
        .read                = proc_loginuid_read,
        .write                = proc_loginuid_write,
        .llseek                = generic_file_llseek,
};

/*
 * read() handler for the sessionid file: prints the task's audit session
 * id as an unsigned decimal without a trailing newline.  Returns -ESRCH
 * if the task cannot be found.
 */
static ssize_t proc_sessionid_read(struct file *file, char __user *buf,
                                  size_t count, loff_t *ppos)
{
        struct task_struct *task = get_proc_task(file_inode(file));
        char tmpbuf[TMPBUFLEN];
        ssize_t length;

        if (!task)
                return -ESRCH;

        length = scnprintf(tmpbuf, sizeof(tmpbuf), "%u",
                           audit_get_sessionid(task));
        put_task_struct(task);

        return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
}

/* File operations for the sessionid file: read-only, seekable. */
static const struct file_operations proc_sessionid_operations = {
        .read                = proc_sessionid_read,
        .llseek                = generic_file_llseek,
};
#endif

#ifdef CONFIG_FAULT_INJECTION
/*
 * read() handler for the fault-injection switch: returns the task's
 * make_it_fail value as a decimal string followed by a newline.
 * Returns -ESRCH if the task cannot be found.
 */
static ssize_t proc_fault_inject_read(struct file *file, char __user *buf,
                                      size_t count, loff_t *ppos)
{
        struct task_struct *task = get_proc_task(file_inode(file));
        char buffer[PROC_NUMBUF];
        int make_it_fail;
        size_t len;

        if (!task)
                return -ESRCH;

        /* Sample the flag, then let go of the task before formatting. */
        make_it_fail = task->make_it_fail;
        put_task_struct(task);

        len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);
        return simple_read_from_buffer(buf, count, ppos, buffer, len);
}

/*
 * write() handler for the fault-injection switch.
 *
 * Requires CAP_SYS_RESOURCE.  Accepts only 0 or 1, parsed from at most
 * sizeof(buffer) - 1 bytes of input, and stores it in the task's
 * make_it_fail field.
 *
 * Returns the (possibly truncated) byte count on success, -EPERM without
 * the capability, -EFAULT if the user buffer cannot be read, -EINVAL for
 * other values, -ESRCH if the task cannot be found, or a negative error
 * from kstrtoint().
 */
static ssize_t proc_fault_inject_write(struct file *file,
                        const char __user *buf, size_t count, loff_t *ppos)
{
        char buffer[PROC_NUMBUF] = {};
        struct task_struct *task;
        int make_it_fail;
        int rv;

        if (!capable(CAP_SYS_RESOURCE))
                return -EPERM;

        if (count > sizeof(buffer) - 1)
                count = sizeof(buffer) - 1;
        if (copy_from_user(buffer, buf, count))
                return -EFAULT;

        rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
        if (rv < 0)
                return rv;

        /* The switch is strictly boolean. */
        if (make_it_fail != 0 && make_it_fail != 1)
                return -EINVAL;

        task = get_proc_task(file_inode(file));
        if (!task)
                return -ESRCH;

        task->make_it_fail = make_it_fail;
        put_task_struct(task);

        return count;
}

/* File operations for the make_it_fail fault-injection switch. */
static const struct file_operations proc_fault_inject_operations = {
        .read                = proc_fault_inject_read,
        .write                = proc_fault_inject_write,
        .llseek                = generic_file_llseek,
};

/*
 * write() handler for the fail_nth file: parses an unsigned integer
 * (base auto-detected) from the user buffer and stores it in the task's
 * fail_nth field.  Returns @count on success, the error from
 * kstrtouint_from_user(), or -ESRCH if the task cannot be found.
 */
static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf,
                                   size_t count, loff_t *ppos)
{
        struct task_struct *task;
        unsigned int nth;
        int err;

        err = kstrtouint_from_user(buf, count, 0, &nth);
        if (err)
                return err;

        task = get_proc_task(file_inode(file));
        if (!task)
                return -ESRCH;

        task->fail_nth = nth;
        put_task_struct(task);

        return count;
}

/*
 * read() handler for the fail_nth file: returns the task's fail_nth
 * value as an unsigned decimal followed by a newline.  Returns -ESRCH
 * if the task cannot be found.
 */
static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
                                  size_t count, loff_t *ppos)
{
        struct task_struct *task = get_proc_task(file_inode(file));
        char numbuf[PROC_NUMBUF];
        ssize_t len;

        if (!task)
                return -ESRCH;

        len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth);
        put_task_struct(task);

        return simple_read_from_buffer(buf, count, ppos, numbuf, len);
}

/* File operations table for the fail_nth read/write handlers. */
static const struct file_operations proc_fail_nth_operations = {
        .write  = proc_fail_nth_write,
        .read   = proc_fail_nth_read,
};
#endif


#ifdef CONFIG_SCHED_DEBUG
/*
 * seq_file show callback: emit the scheduling-related per-task fields of
 * the task behind this proc inode via proc_sched_show_task().
 *
 * Returns 0 on success or -ESRCH when the task can no longer be obtained.
 */
static int sched_show(struct seq_file *m, void *v)
{
        struct inode *proc_inode = m->private;
        struct pid_namespace *pid_ns = proc_pid_ns(proc_inode->i_sb);
        struct task_struct *tsk = get_proc_task(proc_inode);

        if (!tsk)
                return -ESRCH;

        proc_sched_show_task(tsk, pid_ns, m);
        put_task_struct(tsk);

        return 0;
}

/*
 * Write handler for the sched file: any write calls proc_sched_set_task()
 * on the target task. The written bytes themselves are not inspected here.
 *
 * Returns @count on success or -ESRCH when the task cannot be obtained.
 */
static ssize_t
sched_write(struct file *file, const char __user *buf,
            size_t count, loff_t *offset)
{
        struct task_struct *tsk = get_proc_task(file_inode(file));

        if (!tsk)
                return -ESRCH;

        proc_sched_set_task(tsk);
        put_task_struct(tsk);

        return count;
}

/* Open handler: set up a single-record seq_file with the inode as private data. */
static int sched_open(struct inode *inode, struct file *filp)
{
        int rc = single_open(filp, sched_show, inode);

        return rc;
}

/* File operations table for the per-task sched file (seq_file based). */
static const struct file_operations proc_pid_sched_operations = {
        .open    = sched_open,
        .release = single_release,
        .read    = seq_read,
        .write   = sched_write,
        .llseek  = seq_lseek,
};

#endif

#ifdef CONFIG_SCHED_AUTOGROUP
/*
 * seq_file show callback: emit autogroup information for the task behind
 * this proc inode via proc_sched_autogroup_show_task().
 *
 * Returns 0 on success or -ESRCH when the task can no longer be obtained.
 */
static int sched_autogroup_show(struct seq_file *m, void *v)
{
        struct task_struct *tsk = get_proc_task(m->private);

        if (!tsk)
                return -ESRCH;

        proc_sched_autogroup_show_task(tsk, m);
        put_task_struct(tsk);

        return 0;
}

/*
 * Write handler: parse an integer nice value from user space and apply it
 * to the target task with proc_sched_autogroup_set_nice().
 *
 * At most PROC_NUMBUF - 1 bytes are consumed. Returns the (possibly
 * clamped) byte count on success, -EFAULT on a failed user copy, the
 * kstrtoint() error on a parse failure, -ESRCH when the task cannot be
 * obtained, or the error from proc_sched_autogroup_set_nice().
 */
static ssize_t
sched_autogroup_write(struct file *file, const char __user *buf,
            size_t count, loff_t *offset)
{
        char kbuf[PROC_NUMBUF] = {};
        struct task_struct *tsk;
        int nice_val;
        int rc;

        /* Clamp so the zero-initialized buffer always stays NUL-terminated. */
        if (count >= sizeof(kbuf))
                count = sizeof(kbuf) - 1;
        if (copy_from_user(kbuf, buf, count))
                return -EFAULT;

        rc = kstrtoint(strstrip(kbuf), 0, &nice_val);
        if (rc < 0)
                return rc;

        tsk = get_proc_task(file_inode(file));
        if (!tsk)
                return -ESRCH;

        rc = proc_sched_autogroup_set_nice(tsk, nice_val);
        put_task_struct(tsk);

        return rc ? rc : count;
}

/*
 * Open handler: create a single-record seq_file, then store the inode as
 * its private data so sched_autogroup_show() can read it back.
 */
static int sched_autogroup_open(struct inode *inode, struct file *filp)
{
        struct seq_file *seq;
        int rc = single_open(filp, sched_autogroup_show, NULL);

        if (rc)
                return rc;

        /* single_open() was given no private data; attach the inode now. */
        seq = filp->private_data;
        seq->private = inode;

        return 0;
}

/* File operations table for the per-task autogroup file (seq_file based). */
static const struct file_operations proc_pid_sched_autogroup_operations = {
        .open    = sched_autogroup_open,
        .release = single_release,
        .read    = seq_read,
        .write   = sched_autogroup_write,
        .llseek  = seq_lseek,
};

#endif /* CONFIG_SCHED_AUTOGROUP */

#ifdef CONFIG_TIME_NS
/*
 * seq_file show callback: emit the time-namespace offsets of the task
 * behind this file via proc_timens_show_offsets().
 *
 * Returns 0 on success or -ESRCH when the task can no longer be obtained.
 */
static int timens_offsets_show(struct seq_file *m, void *v)
{
        struct task_struct *tsk = get_proc_task(file_inode(m->file));

        if (!tsk)
                return -ESRCH;

        proc_timens_show_offsets(tsk, m);
        put_task_struct(tsk);

        return 0;
}

/*
 * Write handler for the time-namespace offsets file.
 *
 * Parses up to ARRAY_SIZE(offsets) lines of the form "<clock> <sec> <nsec>"
 * from a single write, which must start at file offset 0 and be shorter
 * than PAGE_SIZE, and hands the parsed offsets to proc_timens_set_offset()
 * for the task behind this inode.
 *
 * <clock> is either "monotonic"/"boottime" or the stringified value of
 * CLOCK_MONOTONIC/CLOCK_BOOTTIME.
 *
 * Returns the number of bytes consumed on success; -EINVAL for a bad file
 * offset/size or a malformed line; -ESRCH when the task cannot be obtained;
 * otherwise the error from memdup_user_nul() or proc_timens_set_offset().
 */
static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
                                    size_t count, loff_t *ppos)
{
        struct inode *inode = file_inode(file);
        struct proc_timens_offset offsets[2];
        char *kbuf = NULL, *pos, *next_line;
        struct task_struct *p;
        int ret, noffsets;

        /* Only allow < page size writes at the beginning of the file */
        if ((*ppos != 0) || (count >= PAGE_SIZE))
                return -EINVAL;

        /* Slurp in the user data */
        kbuf = memdup_user_nul(buf, count);
        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        /* Parse the user data */
        ret = -EINVAL;
        noffsets = 0;
        for (pos = kbuf; pos; pos = next_line) {
                struct proc_timens_offset *off = &offsets[noffsets];
                char clock[10];
                int err;

                /* Find the end of line and ensure we don't look past it */
                next_line = strchr(pos, '\n');
                if (next_line) {
                        *next_line = '\0';
                        next_line++;
                        /* A trailing newline ends the input: no further line */
                        if (*next_line == '\0')
                                next_line = NULL;
                }

                /* All three fields are required; nsec must be below one second */
                err = sscanf(pos, "%9s %lld %lu", clock,
                                &off->val.tv_sec, &off->val.tv_nsec);
                if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC)
                        goto out;

                /* Defensive: %9s should already leave clock NUL-terminated */
                clock[sizeof(clock) - 1] = 0;
                if (strcmp(clock, "monotonic") == 0 ||
                    strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0)
                        off->clockid = CLOCK_MONOTONIC;
                else if (strcmp(clock, "boottime") == 0 ||
                         strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0)
                        off->clockid = CLOCK_BOOTTIME;
                else
                        goto out;

                noffsets++;
                /*
                 * The array is full: stop parsing and, if more input
                 * follows, report only the bytes consumed so far.
                 */
                if (noffsets == ARRAY_SIZE(offsets)) {
                        if (next_line)
                                count = next_line - kbuf;
                        break;
                }
        }

        ret = -ESRCH;
        p = get_proc_task(inode);
        if (!p)
                goto out;
        ret = proc_timens_set_offset(file, p, offsets, noffsets);
        put_task_struct(p);
        if (ret)
                goto out;

        ret = count;
out:
        kfree(kbuf);
        return ret;
}

/* Open handler: set up a single-record seq_file with the inode as private data. */
static int timens_offsets_open(struct inode *inode, struct file *filp)
{
        int rc = single_open(filp, timens_offsets_show, inode);

        return rc;
}

/* File operations table for the time-namespace offsets file (seq_file based). */
static const struct file_operations proc_timens_offsets_operations = {
        .open    = timens_offsets_open,
        .release = single_release,
        .read    = seq_read,
        .write   = timens_offsets_write,
        .llseek  = seq_lseek,
};
#endif /* CONFIG_TIME_NS */

/*
 * Write handler: set the comm (name) of the target task.
 *
 * At most TASK_COMM_LEN - 1 bytes are copied from user space. The rename
 * is only performed when the target is in the caller's own thread group.
 *
 * Returns the full @count on success, -EFAULT on a failed user copy,
 * -ESRCH when the task cannot be obtained, or -EINVAL when the target is
 * in a different thread group.
 */
static ssize_t comm_write(struct file *file, const char __user *buf,
                                size_t count, loff_t *offset)
{
        char name[TASK_COMM_LEN] = {};
        const size_t limit = sizeof(name) - 1;
        size_t ncopy = count;
        struct task_struct *tsk;
        ssize_t ret;

        /* Leave the final byte untouched so the name stays NUL-terminated. */
        if (ncopy > limit)
                ncopy = limit;
        if (copy_from_user(name, buf, ncopy))
                return -EFAULT;

        tsk = get_proc_task(file_inode(file));
        if (!tsk)
                return -ESRCH;

        if (!same_thread_group(current, tsk)) {
                ret = -EINVAL;
        } else {
                set_task_comm(tsk, name);
                proc_comm_connector(tsk);
                ret = count;
        }

        put_task_struct(tsk);

        return ret;
}

/*
 * seq_file show callback: print the task's name followed by a newline.
 *
 * Returns 0 on success or -ESRCH when the task can no longer be obtained.
 */
static int comm_show(struct seq_file *m, void *v)
{
        struct task_struct *tsk = get_proc_task(m->private);

        if (!tsk)
                return -ESRCH;

        proc_task_name(m, tsk, false);
        seq_putc(m, '\n');
        put_task_struct(tsk);

        return 0;
}

/* Open handler: set up a single-record seq_file with the inode as private data. */
static int comm_open(struct inode *inode, struct file *filp)
{
        int rc = single_open(filp, comm_show, inode);

        return rc;
}

/* File operations table for the per-task comm file (seq_file based). */
static const struct file_operations proc_pid_set_comm_operations = {
        .open    = comm_open,
        .release = single_release,
        .read    = seq_read,
        .write   = comm_write,
        .llseek  = seq_lseek,
};

/*
 * Resolve the path of the task's executable file.
 *
 * On success *exe_path holds its own path reference (taken with
 * path_get()) and 0 is returned. Returns -ENOENT when either the task or
 * its exe file cannot be obtained.
 */
static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
{
        struct file *exe;
        struct task_struct *tsk = get_proc_task(d_inode(dentry));

        if (!tsk)
                return -ENOENT;

        exe = get_task_exe_file(tsk);
        put_task_struct(tsk);
        if (!exe)
                return -ENOENT;

        /* Take the caller's path reference before dropping the file. */
        *exe_path = exe->f_path;
        path_get(&exe->f_path);
        fput(exe);

        return 0;
}

static const char *proc_pid_get_link(struct dentry *dentry,
                                     struct inode *inode,
                                     struct delayed_call *done)
{
        struct path path;
        int error = -EACCES;

        if (!dentry)
                return ERR_PTR(-ECHILD);

        /* Are we allowed to snoop on the tasks file descriptors? */
        if (!proc_fd_access_allowed(inode))
                goto out;

        error = PROC_I(inode)->op.proc_get_link(dentry, &path);
        if (error)
                goto out;

        error = nd_jump_link(&path);
out:
        return ERR_PTR(error);
}

/*
 * Render @path with d_path() into a temporary PATH_MAX buffer and copy up
 * to @buflen bytes of the result to the user buffer.
 *
 * Returns the number of bytes copied, -ENOMEM when the temporary buffer
 * cannot be allocated, the d_path() error, or -EFAULT on a failed copy.
 */
static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen)
{
        char *scratch = kmalloc(PATH_MAX, GFP_KERNEL);
        char *name;
        int ret;

        if (!scratch)
                return -ENOMEM;

        name = d_path(path, scratch, PATH_MAX);
        if (IS_ERR(name)) {
                ret = PTR_ERR(name);
        } else {
                /* Length from the returned start up to the last byte of the buffer. */
                ret = scratch + PATH_MAX - 1 - name;
                if (ret > buflen)
                        ret = buflen;
                if (copy_to_user(buffer, name, ret))
                        ret = -EFAULT;
        }

        kfree(scratch);
        return ret;
}

/*
 * ->readlink() for per-task proc symlinks: resolve the target through the
 * inode's proc_get_link op and copy its textual path to user space.
 *
 * Returns the do_proc_readlink() result, -EACCES when
 * proc_fd_access_allowed() refuses, or the resolver's error.
 */
static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
{
        struct inode *inode = d_inode(dentry);
        struct path target;
        int rc;

        /* Are we allowed to snoop on the tasks file descriptors? */
        if (!proc_fd_access_allowed(inode))
                return -EACCES;

        rc = PROC_I(inode)->op.proc_get_link(dentry, &target);
        if (rc)
                return rc;

        rc = do_proc_readlink(&target, buffer, buflen);
        path_put(&target);

        return rc;
}

/* Inode operations for per-task proc symlinks. */
const struct inode_operations proc_pid_link_inode_operations = {
        .get_link = proc_pid_get_link,
        .readlink = proc_pid_readlink,
        .setattr  = proc_setattr,
};


/* building an inode */

/*
 * Compute who should own a proc file for @task, depending on the task's
 * dumpable state, and store the result in *ruid / *rgid.
 *
 * Kernel threads are always owned by global root. Otherwise the owner
 * defaults to the task's effective uid/gid; for any @mode other than a
 * world readable and executable directory, a task without an mm or one
 * that is not SUID_DUMP_USER is handed to a root id instead.
 */
void task_dump_owner(struct task_struct *task, umode_t mode,
                     kuid_t *ruid, kgid_t *rgid)
{
        const struct cred *cred;
        struct mm_struct *mm;
        kuid_t uid;
        kgid_t gid;

        if (unlikely(task->flags & PF_KTHREAD)) {
                *ruid = GLOBAL_ROOT_UID;
                *rgid = GLOBAL_ROOT_GID;
                return;
        }

        /* Default to the tasks effective ownership */
        rcu_read_lock();
        cred = __task_cred(task);
        uid = cred->euid;
        gid = cred->egid;
        rcu_read_unlock();

        /*
         * Before /proc/pid/status existed, the only way to read the
         * effective uid of a process was to stat /proc/pid. Reading
         * /proc/pid/status is slow enough that procps and other packages
         * kept stating /proc/pid. To keep the rules in /proc simple this
         * applies to all per process world readable and executable
         * directories: they keep the effective ids computed above.
         */
        if (mode == (S_IFDIR|S_IRUGO|S_IXUGO))
                goto done;

        task_lock(task);
        mm = task->mm;
        if (!mm) {
                uid = GLOBAL_ROOT_UID;
                gid = GLOBAL_ROOT_GID;
        } else if (get_dumpable(mm) != SUID_DUMP_USER) {
                /* Make non-dumpable tasks owned by some root */
                struct user_namespace *user_ns = mm->user_ns;

                uid = make_kuid(user_ns, 0);
                if (!uid_valid(uid))
                        uid = GLOBAL_ROOT_UID;

                gid = make_kgid(user_ns, 0);
                if (!gid_valid(gid))
                        gid = GLOBAL_ROOT_GID;
        }
        task_unlock(task);

done:
        *ruid = uid;
        *rgid = gid;
}

/*
 * Unlink a directory proc inode from its pid's sibling_inodes list, under
 * pid->lock. Non-directory inodes are left alone.
 */
void proc_pid_evict_inode(struct proc_inode *ei)
{
        struct pid *owner = ei->pid;

        if (!S_ISDIR(ei->vfs_inode.i_mode))
                return;

        spin_lock(&owner->lock);
        hlist_del_init_rcu(&ei->sibling_inodes);
        spin_unlock(&owner->lock);
}

/*
 * Allocate and initialize a proc inode tied to @task.
 *
 * Sets the mode, inode number, timestamps and default inode operations,
 * stores a struct pid reference for @task in the proc inode, and fills in
 * ownership and security state.
 *
 * Returns the new inode, or NULL when either the inode or the pid
 * reference cannot be obtained.
 */
struct inode *proc_pid_make_inode(struct super_block *sb,
                                  struct task_struct *task, umode_t mode)
{
        struct inode *inode = new_inode(sb);
        struct proc_inode *ei;
        struct pid *pid;

        if (!inode)
                return NULL;

        /* Common stuff */
        ei = PROC_I(inode);
        inode->i_mode = mode;
        inode->i_ino = get_next_ino();
        simple_inode_init_ts(inode);
        inode->i_op = &proc_def_inode_operations;

        /* Grab the reference to the task's pid. */
        pid = get_task_pid(task, PIDTYPE_PID);
        if (!pid) {
                iput(inode);
                return NULL;
        }

        /* Let the pid remember us for quick removal */
        ei->pid = pid;

        task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
        security_task_to_inode(task, inode);

        return inode;
}

/*
 * Create an inode and add it to @pid->inodes, so that the task will
 * invalidate the inode's dentry before being released.
 *
 * This helper is used for creating dir-type entries under '/proc' and
 * '/proc/<tgid>/task'. Other entries (e.g. fd, stat) under '/proc/<tgid>'
 * can be released by invalidating the '/proc/<tgid>' dentry.
 * In theory, dentries under '/proc/<tgid>/task' can also be released by
 * invalidating the '/proc/<tgid>' dentry; they are tracked here to handle
 * a single thread exiting: each thread should invalidate its own
 * '/proc/<tgid>/task/<pid>' dentry before being released.
 *
 * Returns the new inode, or NULL when proc_pid_make_inode() fails.
 */
static struct inode *proc_pid_make_base_inode(struct super_block *sb,
                                struct task_struct *task, umode_t mode)
{
        struct inode *inode = proc_pid_make_inode(sb, task, mode);

        if (inode) {
                struct proc_inode *ei = PROC_I(inode);
                struct pid *owner = ei->pid;

                /* Let proc_flush_pid find this directory inode */
                spin_lock(&owner->lock);
                hlist_add_head_rcu(&ei->sibling_inodes, &owner->inodes);
                spin_unlock(&owner->lock);
        }

        return inode;
}

/*
 * ->getattr() for per-task proc inodes.
 *
 * Fills @stat generically, defaults the owner to global root, and, while
 * the task still exists, replaces the owner with what task_dump_owner()
 * computes.
 *
 * Returns 0, or -ENOENT when has_pid_permissions() denies the caller at
 * the HIDEPID_INVISIBLE level.
 */
int pid_getattr(struct mnt_idmap *idmap, const struct path *path,
                struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
        struct task_struct *task;
        int ret = 0;

        generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
        stat->uid = GLOBAL_ROOT_UID;
        stat->gid = GLOBAL_ROOT_GID;

        rcu_read_lock();
        task = pid_task(proc_pid(inode), PIDTYPE_PID);
        if (task) {
                if (has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) {
                        task_dump_owner(task, inode->i_mode,
                                        &stat->uid, &stat->gid);
                } else {
                        /*
                         * This doesn't prevent learning whether PID exists,
                         * it only makes getattr() consistent with readdir().
                         */
                        ret = -ENOENT;
                }
        }
        rcu_read_unlock();

        return ret;
}

/* dentry stuff */

/*
 * Set <pid>/... inode ownership (can change due to setuid(), etc.)
 */
void pid_update_inode(struct task_struct *task, struct inode *inode)
{
        task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);

        inode->i_mode &= ~(S_ISUID | S_ISGID);
        security_task_to_inode(task, inode);
}

/*
 * ->d_revalidate() for per-task proc dentries.
 *
 * The inode's ownership is rewritten here because the owning task may
 * have performed a setuid(), etc.
 *
 * Returns 1 when the dentry has an inode and its task still exists,
 * 0 otherwise.
 */
static int pid_revalidate(struct dentry *dentry, unsigned int flags)
{
        struct task_struct *task;
        struct inode *inode;
        int valid = 0;

        rcu_read_lock();
        inode = d_inode_rcu(dentry);
        if (inode) {
                task = pid_task(proc_pid(inode), PIDTYPE_PID);
                if (task) {
                        pid_update_inode(task, inode);
                        valid = 1;
                }
        }
        rcu_read_unlock();

        return valid;
}

static inline bool proc_inode_is_dead(struct inode *inode)
{
        return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
}

/*
 * ->d_delete(): if the task this dentry represents is dead, report that
 * so the dentry is killed immediately instead of going on the LRU list.
 */
int pid_delete_dentry(const struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        return proc_inode_is_dead(inode);
}

/* Dentry operations for per-task proc entries. */
const struct dentry_operations pid_dentry_operations = {
        .d_delete     = pid_delete_dentry,
        .d_revalidate = pid_revalidate,
};

/* Lookups */

/*
 * Fill a directory entry.
 *
 * If possible create the dcache entry and derive our inode number and
 * file type from dcache entry.
 *
 * Since all of the proc inode numbers are dynamically generated, the inode
 * numbers do not exist until the inode is cached.  This means creating
 * the dcache entry in readdir is necessary to keep the inode numbers
 * reported by readdir in sync with the inode numbers reported
 * by stat.
 *
 * @instantiate is called with @task and @ptr to populate a dentry that is
 * newly allocated here.  Returns the result of dir_emit(); when no dentry
 * can be obtained the entry is still emitted, with inode number 1 and
 * type DT_UNKNOWN.
 */
bool proc_fill_cache(struct file *file, struct dir_context *ctx,
        const char *name, unsigned int len,
        instantiate_t instantiate, struct task_struct *task, const void *ptr)
{
        struct dentry *child, *dir = file->f_path.dentry;
        struct qstr qname = QSTR_INIT(name, len);
        struct inode *inode;
        /* Fallback values used when no dentry/inode can be obtained */
        unsigned type = DT_UNKNOWN;
        ino_t ino = 1;

        child = d_hash_and_lookup(dir, &qname);
        if (!child) {
                DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
                child = d_alloc_parallel(dir, &qname, &wq);
                if (IS_ERR(child))
                        goto end_instantiate;
                /* Only instantiate when this dentry is still in lookup */
                if (d_in_lookup(child)) {
                        struct dentry *res;
                        res = instantiate(child, task, ptr);
                        d_lookup_done(child);
                        /* instantiate() returned a different dentry or an error */
                        if (unlikely(res)) {
                                dput(child);
                                child = res;
                                if (IS_ERR(child))
                                        goto end_instantiate;
                        }
                }
        }
        inode = d_inode(child);
        ino = inode->i_ino;
        /* Shift the file-type bits of i_mode down to form the d_type value */
        type = inode->i_mode >> 12;
        dput(child);
end_instantiate:
        return dir_emit(ctx, name, len, ino, type);
}

/*
 * dname_to_vma_addr - map a dentry name of the form "<start>-<end>"
 * (hexadecimal, no prefix) to the two unsigned longs it encodes, which
 * represent vma start and end addresses.
 *
 * Numbers with a superfluous leading zero, values that overflow or do not
 * fit in an unsigned long, a missing '-' separator and trailing characters
 * are all rejected.
 *
 * NOTE(review): an empty digit run appears to be accepted as 0, assuming
 * _parse_integer() reports zero consumed characters for it - confirm.
 *
 * Returns 0 with *start / *end filled in, or -EINVAL.
 */
static int dname_to_vma_addr(struct dentry *dentry,
                             unsigned long *start, unsigned long *end)
{
        const char *cur = dentry->d_name.name;
        unsigned long long lo, hi;
        unsigned int used;

        /* A leading '0' is only allowed when the start is exactly "0". */
        if (cur[0] == '0' && cur[1] != '-')
                return -EINVAL;
        used = _parse_integer(cur, 16, &lo);
        if ((used & KSTRTOX_OVERFLOW) || lo != (unsigned long)lo)
                return -EINVAL;
        cur += used;

        if (*cur != '-')
                return -EINVAL;
        cur++;

        /* Same rule for the end: "0" alone is fine, "0..." is not. */
        if (cur[0] == '0' && cur[1] != '\0')
                return -EINVAL;
        used = _parse_integer(cur, 16, &hi);
        if ((used & KSTRTOX_OVERFLOW) || hi != (unsigned long)hi)
                return -EINVAL;
        cur += used;

        /* Nothing may follow the end address. */
        if (*cur != '\0')
                return -EINVAL;

        *start = lo;
        *end = hi;

        return 0;
}

/*
 * ->d_revalidate() for map_files entries.
 *
 * The dentry is valid only while the task's mm still contains a vma whose
 * range matches the dentry name exactly. When it does, the inode's
 * ownership and security state are refreshed.
 *
 * Returns 1 when valid, -ECHILD under LOOKUP_RCU, the
 * mmap_read_lock_killable() error when taking the lock fails, and 0
 * otherwise.
 */
static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
{
        unsigned long lo, hi;
        struct task_struct *task;
        struct mm_struct *mm;
        struct inode *inode;
        bool found = false;
        int ret = 0;

        if (flags & LOOKUP_RCU)
                return -ECHILD;

        inode = d_inode(dentry);
        task = get_proc_task(inode);
        if (!task)
                return 0;

        mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
        if (IS_ERR_OR_NULL(mm))
                goto put_task;

        if (dname_to_vma_addr(dentry, &lo, &hi) == 0) {
                ret = mmap_read_lock_killable(mm);
                if (ret == 0) {
                        found = find_exact_vma(mm, lo, hi) != NULL;
                        mmap_read_unlock(mm);
                }
        }

        mmput(mm);

        if (found) {
                task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);

                security_task_to_inode(task, inode);
                ret = 1;
        }

put_task:
        put_task_struct(task);

        return ret;
}

/* Dentry operations for map_files entries. */
static const struct dentry_operations tid_map_files_dentry_operations = {
        .d_delete     = pid_delete_dentry,
        .d_revalidate = map_files_d_revalidate,
};

/*
 * Resolve a map_files entry to the path of the file backing the vma whose
 * range matches the dentry name exactly.
 *
 * On success *path holds its own reference (taken with path_get()) and 0
 * is returned. Returns -ENOENT when the task, its mm, or a matching
 * file-backed vma is missing; otherwise the error from
 * dname_to_vma_addr() or mmap_read_lock_killable().
 */
static int map_files_get_link(struct dentry *dentry, struct path *path)
{
        unsigned long lo, hi;
        struct vm_area_struct *vma;
        struct task_struct *task;
        struct mm_struct *mm;
        int rc;

        task = get_proc_task(d_inode(dentry));
        if (!task)
                return -ENOENT;

        mm = get_task_mm(task);
        put_task_struct(task);
        if (!mm)
                return -ENOENT;

        rc = dname_to_vma_addr(dentry, &lo, &hi);
        if (!rc)
                rc = mmap_read_lock_killable(mm);
        if (!rc) {
                vma = find_exact_vma(mm, lo, hi);
                if (vma && vma->vm_file) {
                        *path = *file_user_path(vma->vm_file);
                        path_get(path);
                } else {
                        rc = -ENOENT;
                }
                mmap_read_unlock(mm);
        }

        mmput(mm);
        return rc;
}

/*
 * Snapshot of one file-backed vma, collected by proc_map_files_readdir()
 * while mmap_lock is held and consumed after the lock is dropped.
 */
struct map_files_info {
        unsigned long        start;        /* copied from vma->vm_start */
        unsigned long        end;        /* copied from vma->vm_end */
        fmode_t                mode;        /* copied from vma->vm_file->f_mode */
};

/*
 * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links,
 * due to concerns about how the symlinks may be used to bypass permissions
 * on ancestor directories in the path to the file in question.
 *
 * Returns ERR_PTR(-EPERM) when the capability check against init_user_ns
 * fails, otherwise whatever proc_pid_get_link() returns.
 */
static const char *
proc_map_files_get_link(struct dentry *dentry,
                        struct inode *inode,
                        struct delayed_call *done)
{
        if (checkpoint_restore_ns_capable(&init_user_ns))
                return proc_pid_get_link(dentry, inode, done);

        return ERR_PTR(-EPERM);
}

/*
 * Inode operations for map_files symlinks: identical to
 * proc_pid_link_inode_operations except for get_link().
 */
static const struct inode_operations proc_map_files_link_inode_operations = {
        .get_link = proc_map_files_get_link,
        .readlink = proc_pid_readlink,
        .setattr  = proc_setattr,
};

static struct dentry *
proc_map_files_instantiate(struct dentry *dentry,
                           struct task_struct *task, const void *ptr)
{
        fmode_t mode = (fmode_t)(unsigned long)ptr;
        struct proc_inode *ei;
        struct inode *inode;

        inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK |
                                    ((mode & FMODE_READ ) ? S_IRUSR : 0) |
                                    ((mode & FMODE_WRITE) ? S_IWUSR : 0));
        if (!inode)
                return ERR_PTR(-ENOENT);

        ei = PROC_I(inode);
        ei->op.proc_get_link = map_files_get_link;

        inode->i_op = &proc_map_files_link_inode_operations;
        inode->i_size = 64;

        d_set_d_op(dentry, &tid_map_files_dentry_operations);
        return d_splice_alias(inode, dentry);
}

/*
 * ->lookup() for the map_files directory.
 *
 * Instantiates a symlink when the dentry name parses as "<start>-<end>"
 * and the task's mm has a file-backed vma with exactly that range.
 *
 * Returns the instantiated dentry result, ERR_PTR(-EACCES) when ptrace
 * read access is denied, ERR_PTR(-EINTR) when taking mmap_lock fails,
 * and ERR_PTR(-ENOENT) in every other failure case.
 */
static struct dentry *proc_map_files_lookup(struct inode *dir,
                struct dentry *dentry, unsigned int flags)
{
        struct dentry *res = ERR_PTR(-ENOENT);
        unsigned long lo, hi;
        struct vm_area_struct *vma;
        struct task_struct *task;
        struct mm_struct *mm;

        task = get_proc_task(dir);
        if (!task)
                return res;

        if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
                res = ERR_PTR(-EACCES);
                goto put_task;
        }

        /* From here on, every plain "goto" reports -ENOENT. */
        if (dname_to_vma_addr(dentry, &lo, &hi))
                goto put_task;

        mm = get_task_mm(task);
        if (!mm)
                goto put_task;

        if (mmap_read_lock_killable(mm)) {
                res = ERR_PTR(-EINTR);
                goto put_mm;
        }

        vma = find_exact_vma(mm, lo, hi);
        if (vma && vma->vm_file)
                res = proc_map_files_instantiate(dentry, task,
                                (void *)(unsigned long)vma->vm_file->f_mode);

        mmap_read_unlock(mm);
put_mm:
        mmput(mm);
put_task:
        put_task_struct(task);

        return res;
}

/* Inode operations for the map_files directory. */
static const struct inode_operations proc_map_files_inode_operations = {
        .permission = proc_fd_permission,
        .lookup     = proc_map_files_lookup,
        .setattr    = proc_setattr,
};

/*
 * readdir for the map_files directory: emits one "<start>-<end>" entry
 * (hexadecimal) for each file-backed vma of the task, after the dot
 * entries.
 *
 * Returns 0 on success (also when the dot entries cannot be emitted or
 * the task has no mm), -ENOENT when the task cannot be obtained, -EACCES
 * when ptrace read access is denied, -ENOMEM when the snapshot array
 * cannot grow, or the error from mmap_read_lock_killable().
 */
static int
proc_map_files_readdir(struct file *file, struct dir_context *ctx)
{
        struct vm_area_struct *vma;
        struct task_struct *task;
        struct mm_struct *mm;
        unsigned long nr_files, pos, i;
        GENRADIX(struct map_files_info) fa;
        struct map_files_info *p;
        int ret;
        struct vma_iterator vmi;

        genradix_init(&fa);

        ret = -ENOENT;
        task = get_proc_task(file_inode(file));
        if (!task)
                goto out;

        ret = -EACCES;
        if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
                goto out_put_task;

        ret = 0;
        if (!dir_emit_dots(file, ctx))
                goto out_put_task;

        mm = get_task_mm(task);
        if (!mm)
                goto out_put_task;

        ret = mmap_read_lock_killable(mm);
        if (ret) {
                mmput(mm);
                goto out_put_task;
        }

        nr_files = 0;

        /*
         * We need two passes here:
         *
         *  1) Collect vmas of mapped files with mmap_lock taken
         *  2) Release mmap_lock and instantiate entries
         *
         * otherwise we get lockdep complained, since filldir()
         * routine might require mmap_lock taken in might_fault().
         */

        /* Presumably positions 0 and 1 belong to the dot entries emitted above */
        pos = 2;
        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (!vma->vm_file)
                        continue;
                /* Skip entries at positions before ctx->pos */
                if (++pos <= ctx->pos)
                        continue;

                p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
                if (!p) {
                        ret = -ENOMEM;
                        mmap_read_unlock(mm);
                        mmput(mm);
                        goto out_put_task;
                }

                p->start = vma->vm_start;
                p->end = vma->vm_end;
                p->mode = vma->vm_file->f_mode;
        }
        mmap_read_unlock(mm);
        mmput(mm);

        /* Second pass: emit the collected entries without mmap_lock held */
        for (i = 0; i < nr_files; i++) {
                char buf[4 * sizeof(long) + 2];        /* max: %lx-%lx\0 */
                unsigned int len;

                p = genradix_ptr(&fa, i);
                len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
                if (!proc_fill_cache(file, ctx,
                                      buf, len,
                                      proc_map_files_instantiate,
                                      task,
                                      (void *)(unsigned long)p->mode))
                        break;
                ctx->pos++;
        }

out_put_task:
        put_task_struct(task);
out:
        genradix_free(&fa);
        return ret;
}

/* File operations table for the map_files directory. */
static const struct file_operations proc_map_files_operations = {
        .iterate_shared = proc_map_files_readdir,
        .read           = generic_read_dir,
        .llseek         = generic_file_llseek,
};

#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
/* Per-open state for the timers seq_file, allocated in proc_timers_open(). */
struct timers_private {
        struct pid *pid;                /* set at open time from proc_pid(inode) */
        struct task_struct *task;        /* task reference held between start and stop */
        struct sighand_struct *sighand;        /* non-NULL while the sighand lock is held */
        struct pid_namespace *ns;        /* namespace used to print timer pids */
        unsigned long flags;                /* flags word for lock/unlock_task_sighand() */
};

/*
 * seq_file ->start(): take a task reference and the task's sighand lock,
 * then position on the task's posix_timers list.
 *
 * Returns the list position, or ERR_PTR(-ESRCH) when the task or its
 * sighand lock cannot be obtained. Whatever was acquired is recorded in
 * the private state, which timers_stop() releases.
 */
static void *timers_start(struct seq_file *m, loff_t *pos)
{
        struct timers_private *priv = m->private;
        struct task_struct *task = get_pid_task(priv->pid, PIDTYPE_PID);

        priv->task = task;
        if (!task)
                return ERR_PTR(-ESRCH);

        priv->sighand = lock_task_sighand(task, &priv->flags);
        if (!priv->sighand)
                return ERR_PTR(-ESRCH);

        return seq_list_start(&task->signal->posix_timers, *pos);
}

/* seq_file ->next(): advance along the task's posix_timers list. */
static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct timers_private *priv = m->private;
        struct list_head *timers = &priv->task->signal->posix_timers;

        return seq_list_next(v, timers, pos);
}

/*
 * seq_file ->stop(): release whatever timers_start() acquired - first the
 * sighand lock, then the task reference - and clear the recorded state.
 */
static void timers_stop(struct seq_file *m, void *v)
{
        struct timers_private *priv = m->private;
        struct task_struct *task = priv->task;

        if (priv->sighand) {
                unlock_task_sighand(task, &priv->flags);
                priv->sighand = NULL;
        }

        if (!task)
                return;

        put_task_struct(task);
        priv->task = NULL;
}

/*
 * seq_file ->show(): print one posix timer - its id, signal number and
 * sigval pointer, notification method and target pid, and clock id.
 */
static int show_timer(struct seq_file *m, void *v)
{
        /* Names indexed by the notify method with SIGEV_THREAD_ID masked off. */
        static const char * const notify_names[] = {
                [SIGEV_SIGNAL] = "signal",
                [SIGEV_NONE] = "none",
                [SIGEV_THREAD] = "thread",
        };
        struct timers_private *priv = m->private;
        struct k_itimer *timer =
                list_entry((struct list_head *)v, struct k_itimer, list);
        int notify = timer->it_sigev_notify;
        const char *target = (notify & SIGEV_THREAD_ID) ? "tid" : "pid";

        seq_printf(m, "ID: %d\n", timer->it_id);
        seq_printf(m, "signal: %d/%px\n",
                   timer->sigq->info.si_signo,
                   timer->sigq->info.si_value.sival_ptr);
        seq_printf(m, "notify: %s/%s.%d\n",
                   notify_names[notify & ~SIGEV_THREAD_ID],
                   target,
                   pid_nr_ns(timer->it_pid, priv->ns));
        seq_printf(m, "ClockID: %d\n", timer->it_clock);

        return 0;
}

/* seq_file iterator callbacks for the per-task timers file. */
static const struct seq_operations proc_timers_seq_ops = {
        .start = timers_start,
        .stop  = timers_stop,
        .next  = timers_next,
        .show  = show_timer,
};

/*
 * Open handler: allocate the seq_file together with its timers_private
 * state and record the inode's pid and pid namespace in it.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails.
 */
static int proc_timers_open(struct inode *inode, struct file *file)
{
        struct timers_private *priv =
                __seq_open_private(file, &proc_timers_seq_ops, sizeof(*priv));

        if (!priv)
                return -ENOMEM;

        priv->ns = proc_pid_ns(inode->i_sb);
        priv->pid = proc_pid(inode);

        return 0;
}

/* File operations for the "timers" entry; reading goes through seq_file. */
static const struct file_operations proc_timers_operations = {
        .open    = proc_timers_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};
#endif

/*
 * Write handler for timerslack_ns: parse a decimal u64 and store it as the
 * target task's timer slack.  Writing 0 restores the task's default slack.
 *
 * When the target is not the caller, the caller needs CAP_SYS_NICE in the
 * target's user namespace and must pass security_task_setscheduler().
 *
 * Returns @count on success, the parse error, -ESRCH if the task is gone,
 * -EPERM without the capability, or the security hook's error.
 */
static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
                                        size_t count, loff_t *offset)
{
        struct task_struct *task;
        ssize_t ret = count;
        u64 slack_ns;
        int err;

        err = kstrtoull_from_user(buf, count, 10, &slack_ns);
        if (err < 0)
                return err;

        task = get_proc_task(file_inode(file));
        if (!task)
                return -ESRCH;

        if (task != current) {
                bool allowed;

                /* __task_cred() must be dereferenced under RCU. */
                rcu_read_lock();
                allowed = ns_capable(__task_cred(task)->user_ns, CAP_SYS_NICE);
                rcu_read_unlock();
                if (!allowed) {
                        ret = -EPERM;
                        goto out_put;
                }

                err = security_task_setscheduler(task);
                if (err) {
                        ret = err;
                        goto out_put;
                }
        }

        task_lock(task);
        task->timer_slack_ns = slack_ns ? slack_ns : task->default_timer_slack_ns;
        task_unlock(task);

out_put:
        put_task_struct(task);

        return ret;
}

static int timerslack_ns_show(struct seq_file *m, void *v)
{
        struct inode *inode = m->private;
        struct task_struct *p;
        int err = 0;

        p = get_proc_task(inode);
        if (!p)
                return -ESRCH;

        if (p != current) {
                rcu_read_lock();
                if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
                        rcu_read_unlock();
                        err = -EPERM;
                        goto out;
                }
                rcu_read_unlock();

                err = security_task_getscheduler(p);
                if (err)
                        goto out;
        }

        task_lock(p);
        seq_printf(m, "%llu\n", p->timer_slack_ns);
        task_unlock(p);

out:
        put_task_struct(p);

        return err;
}

/*
 * Open handler for timerslack_ns: set up a single-record seq_file whose
 * private data is the inode, which timerslack_ns_show() reads back.
 */
static int timerslack_ns_open(struct inode *inode, struct file *filp)
{
        return single_open(filp, timerslack_ns_show, inode);
}

/* File operations for the "timerslack_ns" entry (readable and writable). */
static const struct file_operations proc_pid_set_timerslack_ns_operations = {
        .open    = timerslack_ns_open,
        .read    = seq_read,
        .write   = timerslack_ns_write,
        .llseek  = seq_lseek,
        .release = single_release,
};

static struct dentry *proc_pident_instantiate(struct dentry *dentry,
        struct task_struct *task, const void *ptr)
{
        const struct pid_entry *p = ptr;
        struct inode *inode;
        struct proc_inode *ei;

        inode = proc_pid_make_inode(dentry->d_sb, task, p->mode);
        if (!inode)
                return ERR_PTR(-ENOENT);

        ei = PROC_I(inode);
        if (S_ISDIR(inode->i_mode))
                set_nlink(inode, 2);        /* Use getattr to fix if necessary */
        if (p->iop)
                inode->i_op = p->iop;
        if (p->fop)
                inode->i_fop = p->fop;
        ei->op = p->op;
        pid_update_inode(task, inode);
        d_set_d_op(dentry, &pid_dentry_operations);
        return d_splice_alias(inode, dentry);
}

static struct dentry *proc_pident_lookup(struct inode *dir, 
                                         struct dentry *dentry,
                                         const struct pid_entry *p,
                                         const struct pid_entry *end)
{
        struct task_struct *task = get_proc_task(dir);
        struct dentry *res = ERR_PTR(-ENOENT);

        if (!task)
                goto out_no_task;

        /*
         * Yes, it does not scale. And it should not. Don't add
         * new entries into /proc/<tgid>/ without very good reasons.
         */
        for (; p < end; p++) {
                if (p->len != dentry->d_name.len)
                        continue;
                if (!memcmp(dentry->d_name.name, p->name, p->len)) {
                        res = proc_pident_instantiate(dentry, task, p);
                        break;
                }
        }
        put_task_struct(task);
out_no_task:
        return res;
}

/*
 * Emit the entries of a pid_entry table as directory entries.
 *
 * @ents/@nents describe the table.  Table entry i is reported at directory
 * position i + 2, after the dot entries.  Returns -ENOENT when the task is
 * gone and 0 otherwise, including when the caller's buffer fills up.
 */
static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
                const struct pid_entry *ents, unsigned int nents)
{
        struct task_struct *task = get_proc_task(file_inode(file));
        const struct pid_entry *const end = ents + nents;
        const struct pid_entry *ent;

        if (!task)
                return -ENOENT;

        if (dir_emit_dots(file, ctx) && ctx->pos < nents + 2) {
                for (ent = ents + (ctx->pos - 2); ent < end; ent++) {
                        if (!proc_fill_cache(file, ctx, ent->name, ent->len,
                                        proc_pident_instantiate, task, ent))
                                break;
                        ctx->pos++;
                }
        }

        put_task_struct(task);
        return 0;
}

#ifdef CONFIG_SECURITY
/*
 * Open handler for the attr files.  private_data is cleared first, and the
 * return value of __mem_open() is not checked: this open always reports
 * success.
 * NOTE(review): __mem_open() presumably stores the target's mm in
 * file->private_data when access is allowed (proc_pid_attr_write() compares
 * private_data with current->mm) -- confirm against its definition.
 */
static int proc_pid_attr_open(struct inode *inode, struct file *file)
{
        file->private_data = NULL;
        __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
        return 0;
}

/*
 * Read handler for the attr files: fetch the attribute named after this
 * dentry via security_getprocattr() and copy it to user space.
 *
 * Returns the number of bytes copied, -ESRCH if the task is gone, or the
 * (non-positive) result of security_getprocattr() unchanged.
 */
static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
                                  size_t count, loff_t *ppos)
{
        struct inode *inode = file_inode(file);
        struct task_struct *task = get_proc_task(inode);
        char *value = NULL;
        ssize_t length;

        if (!task)
                return -ESRCH;

        length = security_getprocattr(task, PROC_I(inode)->op.lsmid,
                                      file->f_path.dentry->d_name.name,
                                      &value);
        put_task_struct(task);

        if (length > 0)
                length = simple_read_from_buffer(buf, count, ppos, value, length);

        /* kfree(NULL) is fine when no value was produced. */
        kfree(value);
        return length;
}

/*
 * Write handler for the attr files: hand at most PAGE_SIZE bytes from user
 * space to security_setprocattr() for the attribute named after this dentry.
 *
 * Returns the result of security_setprocattr(), or:
 *   -EPERM   the writer's mm is not the one recorded at open time,
 *   -ESRCH   the target task no longer exists,
 *   -EACCES  the target is not the calling task,
 *   -EBUSY   the caller is running with overridden credentials,
 *   -EINVAL  the file position is not 0,
 * or the error from memdup_user() / mutex_lock_interruptible().
 */
static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
                                   size_t count, loff_t *ppos)
{
        struct inode *inode = file_inode(file);
        struct task_struct *task;
        void *page;
        int rv;

        /* A task may only write when it was the opener. */
        if (file->private_data != current->mm)
                return -EPERM;

        rcu_read_lock();
        task = pid_task(proc_pid(inode), PIDTYPE_PID);
        if (!task) {
                rv = -ESRCH;
        } else if (current != task) {
                /* A task may only write its own attributes. */
                rv = -EACCES;
        } else if (current_cred() != current_real_cred()) {
                /* Prevent changes to overridden credentials. */
                rv = -EBUSY;
        } else {
                rv = 0;
        }
        rcu_read_unlock();
        if (rv)
                return rv;

        if (count > PAGE_SIZE)
                count = PAGE_SIZE;

        /* No partial writes. */
        if (*ppos != 0)
                return -EINVAL;

        page = memdup_user(buf, count);
        if (IS_ERR(page)) {
                rv = PTR_ERR(page);
                return rv;
        }

        /* Guard against adverse ptrace interaction */
        rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
        if (rv >= 0) {
                rv = security_setprocattr(PROC_I(inode)->op.lsmid,
                                          file->f_path.dentry->d_name.name,
                                          page, count);
                mutex_unlock(&current->signal->cred_guard_mutex);
        }

        kfree(page);
        return rv;
}

/* File operations shared by the attr files. */
static const struct file_operations proc_pid_attr_operations = {
        .open    = proc_pid_attr_open,
        .read    = proc_pid_attr_read,
        .write   = proc_pid_attr_write,
        .llseek  = generic_file_llseek,
        .release = mem_release,
};

/*
 * LSM_DIR_OPS(LSM) - generate the directory plumbing for one LSM's attr
 * subdirectory.  For a given LSM name it defines:
 *   proc_<LSM>_attr_dir_iterate()   - readdir over <LSM>_attr_dir_stuff
 *   proc_<LSM>_attr_dir_ops         - file_operations using that iterator
 *   proc_<LSM>_attr_dir_lookup()    - lookup within <LSM>_attr_dir_stuff
 *   proc_<LSM>_attr_dir_inode_ops   - inode_operations using that lookup
 * <LSM>_attr_dir_stuff[] must be defined before the macro is used, and the
 * user supplies the trailing semicolon.
 */
#define LSM_DIR_OPS(LSM) \
static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
                             struct dir_context *ctx) \
{ \
        return proc_pident_readdir(filp, ctx, \
                                   LSM##_attr_dir_stuff, \
                                   ARRAY_SIZE(LSM##_attr_dir_stuff)); \
} \
\
static const struct file_operations proc_##LSM##_attr_dir_ops = { \
        .read                = generic_read_dir, \
        .iterate_shared        = proc_##LSM##_attr_dir_iterate, \
        .llseek                = default_llseek, \
}; \
\
static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
                                struct dentry *dentry, unsigned int flags) \
{ \
        return proc_pident_lookup(dir, dentry, \
                                  LSM##_attr_dir_stuff, \
                                  LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \
} \
\
static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
        .lookup                = proc_##LSM##_attr_dir_lookup, \
        .getattr        = pid_getattr, \
        .setattr        = proc_setattr, \
}

#ifdef CONFIG_SECURITY_SMACK
/*
 * Entries of the "smack" attr subdirectory, plus the generated directory
 * operations for it (see LSM_DIR_OPS).
 */
static const struct pid_entry smack_attr_dir_stuff[] = {
        ATTR(LSM_ID_SMACK, "current",        0666),
};
LSM_DIR_OPS(smack);
#endif

#ifdef CONFIG_SECURITY_APPARMOR
/*
 * Entries of the "apparmor" attr subdirectory, plus the generated directory
 * operations for it (see LSM_DIR_OPS).
 */
static const struct pid_entry apparmor_attr_dir_stuff[] = {
        ATTR(LSM_ID_APPARMOR, "current",        0666),
        ATTR(LSM_ID_APPARMOR, "prev",                0444),
        ATTR(LSM_ID_APPARMOR, "exec",                0666),
};
LSM_DIR_OPS(apparmor);
#endif

/*
 * Entries of the "attr" directory.  The plain files are registered with
 * LSM_ID_UNDEF; the per-LSM subdirectories are only present when the
 * corresponding LSM is configured.  Array order is the order in which
 * proc_attr_dir_readdir() reports the entries.
 * NOTE(review): ATTR() arguments look like (LSM id, file name, mode) --
 * confirm against the macro definition.
 */
static const struct pid_entry attr_dir_stuff[] = {
        ATTR(LSM_ID_UNDEF, "current",        0666),
        ATTR(LSM_ID_UNDEF, "prev",                0444),
        ATTR(LSM_ID_UNDEF, "exec",                0666),
        ATTR(LSM_ID_UNDEF, "fscreate",        0666),
        ATTR(LSM_ID_UNDEF, "keycreate",        0666),
        ATTR(LSM_ID_UNDEF, "sockcreate",        0666),
#ifdef CONFIG_SECURITY_SMACK
        DIR("smack",                        0555,
            proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
#endif
#ifdef CONFIG_SECURITY_APPARMOR
        DIR("apparmor",                        0555,
            proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops),
#endif
};

/* readdir for the "attr" directory: list everything in attr_dir_stuff. */
static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
{
        const unsigned int nents = ARRAY_SIZE(attr_dir_stuff);

        return proc_pident_readdir(file, ctx, attr_dir_stuff, nents);
}

/* Directory file operations for the "attr" directory. */
static const struct file_operations proc_attr_dir_operations = {
        .read           = generic_read_dir,
        .iterate_shared = proc_attr_dir_readdir,
        .llseek         = generic_file_llseek,
};

/* lookup for the "attr" directory: resolve @dentry against attr_dir_stuff. */
static struct dentry *proc_attr_dir_lookup(struct inode *dir,
                                struct dentry *dentry, unsigned int flags)
{
        const struct pid_entry *first = attr_dir_stuff;
        const struct pid_entry *end = first + ARRAY_SIZE(attr_dir_stuff);

        return proc_pident_lookup(dir, dentry, first, end);
}

/* Inode operations for the "attr" directory. */
static const struct inode_operations proc_attr_dir_inode_operations = {
        .lookup  = proc_attr_dir_lookup,
        .getattr = pid_getattr,
        .setattr = proc_setattr,
};

#endif

#ifdef CONFIG_ELF_CORE
/*
 * Read handler for coredump_filter: report the dump-filter bits of the
 * task's mm as "%08lx\n".
 *
 * Returns the number of bytes copied, 0 when the task has no mm, or
 * -ESRCH when the task is gone.
 */
static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
                                         size_t count, loff_t *ppos)
{
        struct task_struct *task = get_proc_task(file_inode(file));
        char text[PROC_NUMBUF];
        struct mm_struct *mm;
        int ret = 0;

        if (!task)
                return -ESRCH;

        mm = get_task_mm(task);
        if (mm) {
                unsigned long filter = (mm->flags & MMF_DUMP_FILTER_MASK) >>
                                       MMF_DUMP_FILTER_SHIFT;
                size_t len = snprintf(text, sizeof(text), "%08lx\n", filter);

                /* The value is already formatted; the mm can go. */
                mmput(mm);
                ret = simple_read_from_buffer(buf, count, ppos, text, len);
        }

        put_task_struct(task);

        return ret;
}

/*
 * Write handler for coredump_filter: parse an unsigned int (base
 * auto-detected) and copy its low MMF_DUMP_FILTER_BITS bits into the
 * dump-filter bits of the task's mm, setting or clearing each bit.
 *
 * Returns @count on success, the parse error, or -ESRCH when the task is
 * gone or has no mm.
 */
static ssize_t proc_coredump_filter_write(struct file *file,
                                          const char __user *buf,
                                          size_t count,
                                          loff_t *ppos)
{
        struct task_struct *task;
        struct mm_struct *mm;
        unsigned int val;
        int ret;
        int bit;

        ret = kstrtouint_from_user(buf, count, 0, &val);
        if (ret < 0)
                return ret;

        task = get_proc_task(file_inode(file));
        if (!task)
                return -ESRCH;

        mm = get_task_mm(task);
        if (!mm) {
                put_task_struct(task);
                return -ESRCH;
        }

        for (bit = 0; bit < MMF_DUMP_FILTER_BITS; bit++) {
                if (val & (1UL << bit))
                        set_bit(bit + MMF_DUMP_FILTER_SHIFT, &mm->flags);
                else
                        clear_bit(bit + MMF_DUMP_FILTER_SHIFT, &mm->flags);
        }

        mmput(mm);
        put_task_struct(task);
        return count;
}

/* File operations for the "coredump_filter" entry. */
static const struct file_operations proc_coredump_filter_operations = {
        .read   = proc_coredump_filter_read,
        .write  = proc_coredump_filter_write,
        .llseek = generic_file_llseek,
};
#endif

#ifdef CONFIG_TASK_IO_ACCOUNTING
/*
 * Emit the I/O accounting counters of @task into @m.
 *
 * @whole: when non-zero, report the signal_struct's accumulated counters
 *         plus those of every thread visited by __for_each_thread();
 *         otherwise report only @task's own counters.
 *
 * Returns 0 on success, the error from down_read_killable() if taking
 * exec_update_lock fails, or -EACCES when ptrace_may_access() refuses
 * PTRACE_MODE_READ_FSCREDS access.
 */
static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
{
        struct task_io_accounting acct;
        int result;

        /* The permission check below is made with exec_update_lock held. */
        result = down_read_killable(&task->signal->exec_update_lock);
        if (result)
                return result;

        if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
                result = -EACCES;
                goto out_unlock;
        }

        if (whole) {
                struct signal_struct *sig = task->signal;
                struct task_struct *t;
                unsigned int seq = 1;
                unsigned long flags;

                rcu_read_lock();
                do {
                        seq++; /* 2 on the 1st/lockless path, otherwise odd */
                        flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);

                        /*
                         * Restart the sum from sig->ioac on every pass so a
                         * retry does not double-count.
                         */
                        acct = sig->ioac;
                        __for_each_thread(sig, t)
                                task_io_accounting_add(&acct, &t->ioac);

                } while (need_seqretry(&sig->stats_lock, seq));
                done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
                rcu_read_unlock();
        } else {
                acct = task->ioac;
        }

        seq_printf(m,
                   "rchar: %llu\n"
                   "wchar: %llu\n"
                   "syscr: %llu\n"
                   "syscw: %llu\n"
                   "read_bytes: %llu\n"
                   "write_bytes: %llu\n"
                   "cancelled_write_bytes: %llu\n",
                   (unsigned long long)acct.rchar,
                   (unsigned long long)acct.wchar,
                   (unsigned long long)acct.syscr,
                   (unsigned long long)acct.syscw,
                   (unsigned long long)acct.read_bytes,
                   (unsigned long long)acct.write_bytes,
                   (unsigned long long)acct.cancelled_write_bytes);
        result = 0;

out_unlock:
        up_read(&task->signal->exec_update_lock);
        return result;
}

/* Per-thread variant: report only @task's own I/O counters (whole == 0). */
static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
                                  struct pid *pid, struct task_struct *task)
{
        return do_io_accounting(task, m, 0);
}

/* Thread-group variant: report counters summed over the group (whole == 1). */
static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
                                   struct pid *pid, struct task_struct *task)
{
        return do_io_accounting(task, m, 1);
}
#endif /* CONFIG_TASK_IO_ACCOUNTING */

#ifdef CONFIG_USER_NS
/*
 * Common open handler for the *_map files: take a reference on the task's
 * user namespace and stash it as the seq_file's private data.  The
 * reference is dropped again by proc_id_map_release().
 *
 * Returns 0 on success, -EINVAL when the task or its namespace cannot be
 * obtained, or the error from seq_open().
 */
static int proc_id_map_open(struct inode *inode, struct file *file,
        const struct seq_operations *seq_ops)
{
        struct task_struct *task = get_proc_task(inode);
        struct user_namespace *ns;
        struct seq_file *seq;
        int ret;

        if (!task)
                return -EINVAL;

        rcu_read_lock();
        ns = get_user_ns(task_cred_xxx(task, user_ns));
        rcu_read_unlock();
        put_task_struct(task);
        if (!ns)
                return -EINVAL;

        ret = seq_open(file, seq_ops);
        if (ret) {
                put_user_ns(ns);
                return ret;
        }

        seq = file->private_data;
        seq->private = ns;

        return 0;
}

/*
 * Release handler for the *_map files: drop the user namespace reference
 * taken in proc_id_map_open(), then tear down the seq_file.
 */
static int proc_id_map_release(struct inode *inode, struct file *file)
{
        struct seq_file *seq = file->private_data;

        put_user_ns(seq->private);
        return seq_release(inode, file);
}

/* Open handler for uid_map: proc_id_map_open() with the uid seq_operations. */
static int proc_uid_map_open(struct inode *inode, struct file *file)
{
        return proc_id_map_open(inode, file, &proc_uid_seq_operations);
}

/* Open handler for gid_map: proc_id_map_open() with the gid seq_operations. */
static int proc_gid_map_open(struct inode *inode, struct file *file)
{
        return proc_id_map_open(inode, file, &proc_gid_seq_operations);
}

/* Open handler for projid_map: proc_id_map_open() with the projid seq_operations. */
static int proc_projid_map_open(struct inode *inode, struct file *file)
{
        return proc_id_map_open(inode, file, &proc_projid_seq_operations);
}

/* File operations for the "uid_map" entry. */
static const struct file_operations proc_uid_map_operations = {
        .open    = proc_uid_map_open,
        .read    = seq_read,
        .write   = proc_uid_map_write,
        .llseek  = seq_lseek,
        .release = proc_id_map_release,
};

/* File operations for the "gid_map" entry. */
static const struct file_operations proc_gid_map_operations = {
        .open    = proc_gid_map_open,
        .read    = seq_read,
        .write   = proc_gid_map_write,
        .llseek  = seq_lseek,
        .release = proc_id_map_release,
};

/* File operations for the "projid_map" entry. */
static const struct file_operations proc_projid_map_operations = {
        .open    = proc_projid_map_open,
        .read    = seq_read,
        .write   = proc_projid_map_write,
        .llseek  = seq_lseek,
        .release = proc_id_map_release,
};

/*
 * Open handler for setgroups: take a reference on the task's user
 * namespace and pass it to the single-record seq_file.  Opening for write
 * additionally requires CAP_SYS_ADMIN in that namespace.
 *
 * Returns 0 on success, -ESRCH when the task or its namespace cannot be
 * obtained, -EACCES on the capability check, or the error from
 * single_open().
 */
static int proc_setgroups_open(struct inode *inode, struct file *file)
{
        struct task_struct *task = get_proc_task(inode);
        struct user_namespace *ns;
        int ret;

        if (!task)
                return -ESRCH;

        rcu_read_lock();
        ns = get_user_ns(task_cred_xxx(task, user_ns));
        rcu_read_unlock();
        put_task_struct(task);
        if (!ns)
                return -ESRCH;

        if ((file->f_mode & FMODE_WRITE) && !ns_capable(ns, CAP_SYS_ADMIN)) {
                ret = -EACCES;
                goto err_put_ns;
        }

        ret = single_open(file, &proc_setgroups_show, ns);
        if (ret)
                goto err_put_ns;

        return 0;

err_put_ns:
        put_user_ns(ns);
        return ret;
}

/*
 * Release handler for setgroups: tear down the seq_file, then drop the
 * user namespace reference taken at open time.  The namespace pointer is
 * read out of the seq_file before single_release() is called.
 */
static int proc_setgroups_release(struct inode *inode, struct file *file)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *user_ns = seq->private;
        int ret;

        ret = single_release(inode, file);
        put_user_ns(user_ns);
        return ret;
}

/* File operations for the "setgroups" entry. */
static const struct file_operations proc_setgroups_operations = {
        .open    = proc_setgroups_open,
        .read    = seq_read,
        .write   = proc_setgroups_write,
        .llseek  = seq_lseek,
        .release = proc_setgroups_release,
};
#endif /* CONFIG_USER_NS */

/*
 * Show handler for personality: print task->personality as "%08x\n" while
 * holding the trace lock.  Returns 0, or the error from lock_trace().
 */
static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *task)
{
        int err = lock_trace(task);

        if (err)
                return err;

        seq_printf(m, "%08x\n", task->personality);
        unlock_trace(task);
        return 0;
}

#ifdef CONFIG_LIVEPATCH
/* Show handler for patch_state: print task->patch_state as a decimal int. */
static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *task)
{
        seq_printf(m, "%d\n", task->patch_state);
        return 0;
}
#endif /* CONFIG_LIVEPATCH */

#ifdef CONFIG_KSM
/*
 * Show handler for ksm_merging_pages: print mm->ksm_merging_pages.
 * Nothing is printed when the task has no mm.  Always returns 0.
 */
static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *task)
{
        struct mm_struct *mm = get_task_mm(task);

        if (!mm)
                return 0;

        seq_printf(m, "%lu\n", mm->ksm_merging_pages);
        mmput(mm);
        return 0;
}
/*
 * Show handler for ksm_stat: print the per-mm KSM counters, one
 * "name value" pair per line.  Nothing is printed when the task has no mm.
 * Always returns 0.
 */
static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *task)
{
        struct mm_struct *mm = get_task_mm(task);

        if (!mm)
                return 0;

        seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
        seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm));
        seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
        seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
        mmput(mm);
        return 0;
}
#endif /* CONFIG_KSM */

#ifdef CONFIG_STACKLEAK_METRICS
/*
 * Show handler for stack_depth: report the previous and current stack
 * depth, each computed as THREAD_SIZE minus the offset of the recorded
 * lowest stack address within its THREAD_SIZE-sized area.
 */
static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *task)
{
        const unsigned long offset_mask = THREAD_SIZE - 1;
        unsigned long prev_offset = task->prev_lowest_stack & offset_mask;
        unsigned long cur_offset = task->lowest_stack & offset_mask;
        unsigned long prev_depth = THREAD_SIZE - prev_offset;
        unsigned long depth = THREAD_SIZE - cur_offset;

        seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n",
                   prev_depth, depth);
        return 0;
}
#endif /* CONFIG_STACKLEAK_METRICS */

/*
 * Thread groups
 */
static const struct file_operations proc_task_operations;
static const struct inode_operations proc_task_inode_operations;

/*
 * Entries of the per-thread-group /proc/<tgid>/ directory.  The table is
 * searched linearly by proc_tgid_base_lookup() and reported in array order
 * by proc_tgid_base_readdir(), so the order here is the order readdir
 * shows.  Entries guarded by #ifdef only exist when that option is built in.
 */
static const struct pid_entry tgid_base_stuff[] = {
        DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
        DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
        DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
        DIR("fdinfo",     S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
        DIR("ns",          S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
        DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
        REG("environ",    S_IRUSR, proc_environ_operations),
        REG("auxv",       S_IRUSR, proc_auxv_operations),
        ONE("status",     S_IRUGO, proc_pid_status),
        ONE("personality", S_IRUSR, proc_pid_personality),
        ONE("limits",          S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
        REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
#ifdef CONFIG_SCHED_AUTOGROUP
        REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
#ifdef CONFIG_TIME_NS
        REG("timens_offsets",  S_IRUGO|S_IWUSR, proc_timens_offsets_operations),
#endif
        REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        ONE("syscall",    S_IRUSR, proc_pid_syscall),
#endif
        REG("cmdline",    S_IRUGO, proc_pid_cmdline_ops),
        ONE("stat",       S_IRUGO, proc_tgid_stat),
        ONE("statm",      S_IRUGO, proc_pid_statm),
        REG("maps",       S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_NUMA
        REG("numa_maps",  S_IRUGO, proc_pid_numa_maps_operations),
#endif
        REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
        LNK("cwd",        proc_cwd_link),
        LNK("root",       proc_root_link),
        LNK("exe",        proc_exe_link),
        REG("mounts",     S_IRUGO, proc_mounts_operations),
        REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
        REG("mountstats", S_IRUSR, proc_mountstats_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
        REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
        REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
        REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
        REG("pagemap",    S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
        DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
        ONE("wchan",      S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
        ONE("stack",      S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHED_INFO
        ONE("schedstat",  S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
        REG("latency",  S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
        ONE("cpuset",     S_IRUGO, proc_cpuset_show),
#endif
#ifdef CONFIG_CGROUPS
        ONE("cgroup",  S_IRUGO, proc_cgroup_show),
#endif
#ifdef CONFIG_PROC_CPU_RESCTRL
        ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
#endif
        ONE("oom_score",  S_IRUGO, proc_oom_score),
        REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDIT
        REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
        REG("sessionid",  S_IRUGO, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
        REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
        REG("fail-nth", 0644, proc_fail_nth_operations),
#endif
#ifdef CONFIG_ELF_CORE
        REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
        ONE("io",        S_IRUSR, proc_tgid_io_accounting),
#endif
#ifdef CONFIG_USER_NS
        REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
        REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
        REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
        REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
        REG("timers",          S_IRUGO, proc_timers_operations),
#endif
        REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
#ifdef CONFIG_LIVEPATCH
        ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
#endif
#ifdef CONFIG_STACKLEAK_METRICS
        ONE("stack_depth", S_IRUGO, proc_stack_depth),
#endif
#ifdef CONFIG_PROC_PID_ARCH_STATUS
        ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
        ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
#ifdef CONFIG_KSM
        ONE("ksm_merging_pages",  S_IRUSR, proc_pid_ksm_merging_pages),
        ONE("ksm_stat",  S_IRUSR, proc_pid_ksm_stat),
#endif
};

/* readdir for /proc/<tgid>/: list everything in tgid_base_stuff. */
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
{
        const unsigned int nents = ARRAY_SIZE(tgid_base_stuff);

        return proc_pident_readdir(file, ctx, tgid_base_stuff, nents);
}

/*
 * Directory file operations for /proc/<tgid>/.  tgid_pidfd_to_pid() also
 * uses the address of this table to recognise such directories.
 */
static const struct file_operations proc_tgid_base_operations = {
        .read           = generic_read_dir,
        .iterate_shared = proc_tgid_base_readdir,
        .llseek         = generic_file_llseek,
};

/*
 * Return the struct pid behind @file when @file uses
 * proc_tgid_base_operations (i.e. is an open /proc/<tgid> directory),
 * and ERR_PTR(-EBADF) for any other file.
 */
struct pid *tgid_pidfd_to_pid(const struct file *file)
{
        if (file->f_op == &proc_tgid_base_operations)
                return proc_pid(file_inode(file));

        return ERR_PTR(-EBADF);
}

/* lookup for /proc/<tgid>/: resolve @dentry against tgid_base_stuff. */
static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        const struct pid_entry *first = tgid_base_stuff;
        const struct pid_entry *end = first + ARRAY_SIZE(tgid_base_stuff);

        return proc_pident_lookup(dir, dentry, first, end);
}

/* Inode operations for /proc/<tgid>/. */
static const struct inode_operations proc_tgid_base_inode_operations = {
        .lookup     = proc_tgid_base_lookup,
        .getattr    = pid_getattr,
        .setattr    = proc_setattr,
        .permission = proc_pid_permission,
};

/**
 * proc_flush_pid -  Remove dcache entries for @pid from the /proc dcache.
 * @pid: pid that should be flushed.
 *
 * This function walks a list of inodes (that belong to any proc
 * filesystem) that are attached to the pid and flushes them from
 * the dentry cache.
 *
 * It is safe and reasonable to cache /proc entries for a task until
 * that task exits.  After that they just clog up the dcache with
 * useless entries, possibly causing useful dcache entries to be
 * flushed instead.  This routine is provided to flush those useless
 * dcache entries when a process is reaped.
 *
 * NOTE: This routine is just an optimization, so it does not guarantee
 *       that no dcache entries will exist after a process is reaped;
 *       it just makes it very unlikely that any will persist.
 */

void proc_flush_pid(struct pid *pid)
{
        /* The inode list hangs off the pid and is protected by pid->lock. */
        proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock);
}

static struct dentry *proc_pid_instantiate(struct dentry * dentry,
                                   struct task_struct *task, const void *ptr)
{
        struct inode *inode;

        inode = proc_pid_make_base_inode(dentry->d_sb, task,
                                         S_IFDIR | S_IRUGO | S_IXUGO);
        if (!inode)
                return ERR_PTR(-ENOENT);

        inode->i_op = &proc_tgid_base_inode_operations;
        inode->i_fop = &proc_tgid_base_operations;
        inode->i_flags|=S_IMMUTABLE;

        set_nlink(inode, nlink_tgid);
        pid_update_inode(task, inode);

        d_set_d_op(dentry, &pid_dentry_operations);
        return d_splice_alias(inode, dentry);
}

/*
 * Look up a numeric entry in a proc mount's root: parse the dentry name as
 * a tgid, find the task in the mount's pid namespace and instantiate its
 * directory.
 *
 * Returns the instantiated dentry result, or ERR_PTR(-ENOENT) when the
 * name does not parse, no such task exists, or the mount hides the task
 * from the caller (hide_pid == HIDEPID_NOT_PTRACEABLE).
 */
struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
{
        struct dentry *result = ERR_PTR(-ENOENT);
        struct proc_fs_info *fs_info;
        struct task_struct *task;
        unsigned int tgid = name_to_int(&dentry->d_name);

        /* ~0U means the name is not handled as a pid. */
        if (tgid == ~0U)
                return result;

        fs_info = proc_sb_info(dentry->d_sb);

        /* Pin the task before leaving the RCU read section. */
        rcu_read_lock();
        task = find_task_by_pid_ns(tgid, fs_info->pid_ns);
        if (task)
                get_task_struct(task);
        rcu_read_unlock();
        if (!task)
                return result;

        /* Limit procfs to only ptraceable tasks */
        if (fs_info->hide_pid != HIDEPID_NOT_PTRACEABLE ||
            has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS))
                result = proc_pid_instantiate(dentry, task, NULL);

        put_task_struct(task);
        return result;
}

/*
 * Find the first task whose tgid is >= iter.tgid (see next_tgid()).
 */
/* Cursor passed to and returned from next_tgid(). */
struct tgid_iter {
        /* In: tgid to start searching from.  Out: tgid of the pid found. */
        unsigned int tgid;
        /* Task found, with a reference held; NULL when the search ended. */
        struct task_struct *task;
};
/*
 * Advance @iter to the next thread group at or after iter.tgid in @ns.
 *
 * Any task reference carried in @iter is dropped first.  On return,
 * iter.task is either a task with a new reference held (the caller must
 * put it, or pass the iterator back in) or NULL when find_ge_pid() found
 * nothing more.
 */
static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
{
        struct pid *pid;

        if (iter.task)
                put_task_struct(iter.task);
        rcu_read_lock();
retry:
        iter.task = NULL;
        pid = find_ge_pid(iter.tgid, ns);
        if (pid) {
                iter.tgid = pid_nr_ns(pid, ns);
                iter.task = pid_task(pid, PIDTYPE_TGID);
                if (!iter.task) {
                        /* This pid has no PIDTYPE_TGID task; try the next number. */
                        iter.tgid += 1;
                        goto retry;
                }
                /* Take the reference while still inside the RCU read section. */
                get_task_struct(iter.task);
        }
        rcu_read_unlock();
        return iter;
}

/*
 * Directory position (ctx->pos) that corresponds to tgid 0 in
 * proc_pid_readdir().  The two positions just below it are used there for
 * the "self" (TGID_OFFSET - 2) and "thread-self" (TGID_OFFSET - 1) entries.
 */
#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)

/*
 * Readdir for the /proc/ directory itself, after the non-process stuff
 * has been done.
 *
 * Emits "self" and "thread-self" and then one entry per task, walking
 * tasks with next_tgid().  Tasks for which
 * has_pid_permissions(..., HIDEPID_INVISIBLE) fails are skipped.
 *
 * ctx->pos tracks progress: TGID_OFFSET - 2 is "self", TGID_OFFSET - 1 is
 * "thread-self", and TGID_OFFSET + tgid is the slot of a given tgid.  Once
 * the walk is complete ctx->pos is parked at PID_MAX_LIMIT + TGID_OFFSET.
 *
 * Always returns 0.
 */
int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
        struct inode *root = file_inode(file);
        struct proc_fs_info *fs_info = proc_sb_info(root->i_sb);
        struct pid_namespace *ns = proc_pid_ns(root->i_sb);
        struct tgid_iter iter;
        loff_t pos = ctx->pos;

        if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
                return 0;

        if (pos == TGID_OFFSET - 2) {
                struct inode *self = d_inode(fs_info->proc_self);

                if (!dir_emit(ctx, "self", 4, self->i_ino, DT_LNK))
                        return 0;
                pos = pos + 1;
                ctx->pos = pos;
        }
        if (pos == TGID_OFFSET - 1) {
                struct inode *thread_self = d_inode(fs_info->proc_thread_self);

                if (!dir_emit(ctx, "thread-self", 11, thread_self->i_ino, DT_LNK))
                        return 0;
                pos = pos + 1;
                ctx->pos = pos;
        }

        iter.tgid = pos - TGID_OFFSET;
        iter.task = NULL;
        iter = next_tgid(ns, iter);
        while (iter.task) {
                char name[10 + 1];
                unsigned int len;

                cond_resched();
                if (has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE)) {
                        len = snprintf(name, sizeof(name), "%u", iter.tgid);
                        ctx->pos = iter.tgid + TGID_OFFSET;
                        if (!proc_fill_cache(file, ctx, name, len,
                                             proc_pid_instantiate, iter.task, NULL)) {
                                /* Bailing out mid-walk: drop the iterator's reference. */
                                put_task_struct(iter.task);
                                return 0;
                        }
                }

                /* Continue the search just past the task handled above. */
                iter.tgid += 1;
                iter = next_tgid(ns, iter);
        }

        ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
        return 0;
}

/*
 * Permission hook used exclusively for /proc/<pid>/task/<tid>/comm.
 *
 * glibc and bionic access this node for cross thread naming
 * (pthread_set/getname_np(!self)).  If PR_SET_DUMPABLE gets set to 0 the
 * node, among others, becomes uid=0 gid=0, which would lock out that
 * implementation.  To keep it working, a caller that belongs to the same
 * thread group as the target task skips the generic permission check for
 * everything except MAY_EXEC.
 *
 * Returns -ESRCH if the task behind @inode is gone, 0 for the same thread
 * group shortcut, otherwise whatever generic_permission() decides.
 */
static int proc_tid_comm_permission(struct mnt_idmap *idmap,
                                    struct inode *inode, int mask)
{
        struct task_struct *target;
        bool same_group;

        target = get_proc_task(inode);
        if (!target)
                return -ESRCH;
        same_group = same_thread_group(current, target);
        put_task_struct(target);

        /* Anything but a non-exec access from a sibling thread is checked normally. */
        if (unlikely(!same_group || (mask & MAY_EXEC)))
                return generic_permission(&nop_mnt_idmap, inode, mask);

        /*
         * This file (/proc/<pid>/task/<tid>/comm) can always be read or
         * written by the members of the corresponding thread group.
         */
        return 0;
}

/*
 * Inode operations for the "comm" entry of tid_base_stuff; the permission
 * hook is the thread-group aware proc_tid_comm_permission().
 */
static const struct inode_operations proc_tid_comm_inode_operations = {
                .setattr        = proc_setattr,
                .permission        = proc_tid_comm_permission,
};

/*
 * Tasks
 *
 * Table of the entries that make up a per-thread directory.  It is what
 * proc_tid_base_readdir() and proc_tid_base_lookup() hand to the generic
 * proc_pident_*() helpers, and what set_proc_pid_nlink() passes to
 * pid_entry_nlink() to compute nlink_tid.  Many entries are only present
 * when the corresponding CONFIG_* option is enabled.
 */
static const struct pid_entry tid_base_stuff[] = {
        DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
        DIR("fdinfo",    S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
        DIR("ns",         S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
        DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
        REG("environ",   S_IRUSR, proc_environ_operations),
        REG("auxv",      S_IRUSR, proc_auxv_operations),
        ONE("status",    S_IRUGO, proc_pid_status),
        ONE("personality", S_IRUSR, proc_pid_personality),
        ONE("limits",         S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
        REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
        /* "comm" uses its own inode operations for the thread-group permission rule. */
        NOD("comm",      S_IFREG|S_IRUGO|S_IWUSR,
                         &proc_tid_comm_inode_operations,
                         &proc_pid_set_comm_operations, {}),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        ONE("syscall",   S_IRUSR, proc_pid_syscall),
#endif
        REG("cmdline",   S_IRUGO, proc_pid_cmdline_ops),
        ONE("stat",      S_IRUGO, proc_tid_stat),
        ONE("statm",     S_IRUGO, proc_pid_statm),
        REG("maps",      S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_PROC_CHILDREN
        REG("children",  S_IRUGO, proc_tid_children_operations),
#endif
#ifdef CONFIG_NUMA
        REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
#endif
        REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
        LNK("cwd",       proc_cwd_link),
        LNK("root",      proc_root_link),
        LNK("exe",       proc_exe_link),
        REG("mounts",    S_IRUGO, proc_mounts_operations),
        REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
        REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
        REG("smaps",     S_IRUGO, proc_pid_smaps_operations),
        REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
        REG("pagemap",    S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
        DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
        ONE("wchan",     S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
        ONE("stack",      S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHED_INFO
        ONE("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
        REG("latency",  S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
        ONE("cpuset",    S_IRUGO, proc_cpuset_show),
#endif
#ifdef CONFIG_CGROUPS
        ONE("cgroup",  S_IRUGO, proc_cgroup_show),
#endif
#ifdef CONFIG_PROC_CPU_RESCTRL
        ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
#endif
        ONE("oom_score", S_IRUGO, proc_oom_score),
        REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDIT
        REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
        REG("sessionid",  S_IRUGO, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
        REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
        REG("fail-nth", 0644, proc_fail_nth_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
        ONE("io",        S_IRUSR, proc_tid_io_accounting),
#endif
#ifdef CONFIG_USER_NS
        REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
        REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
        REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
        REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#ifdef CONFIG_LIVEPATCH
        ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
#endif
#ifdef CONFIG_PROC_PID_ARCH_STATUS
        ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
        ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
#ifdef CONFIG_KSM
        ONE("ksm_merging_pages",  S_IRUSR, proc_pid_ksm_merging_pages),
        ONE("ksm_stat",  S_IRUSR, proc_pid_ksm_stat),
#endif
};

/*
 * Readdir for a per-thread directory: list the entries of tid_base_stuff
 * through the generic proc_pident_readdir() helper and return its result.
 */
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
{
        const struct pid_entry *entries = tid_base_stuff;

        return proc_pident_readdir(file, ctx, entries,
                                   ARRAY_SIZE(tid_base_stuff));
}

/*
 * Lookup inside a per-thread directory: search the whole of tid_base_stuff
 * through the generic proc_pident_lookup() helper and return its result.
 * @flags is not used.
 */
static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        const struct pid_entry *first = tid_base_stuff;
        const struct pid_entry *end = first + ARRAY_SIZE(tid_base_stuff);

        return proc_pident_lookup(dir, dentry, first, end);
}

/*
 * File operations installed by proc_task_instantiate() on a per-thread
 * directory; directory iteration goes through proc_tid_base_readdir().
 */
static const struct file_operations proc_tid_base_operations = {
        .read                = generic_read_dir,
        .iterate_shared        = proc_tid_base_readdir,
        .llseek                = generic_file_llseek,
};

/*
 * Inode operations installed by proc_task_instantiate() on a per-thread
 * directory; name lookup goes through proc_tid_base_lookup().
 */
static const struct inode_operations proc_tid_base_inode_operations = {
        .lookup                = proc_tid_base_lookup,
        .getattr        = pid_getattr,
        .setattr        = proc_setattr,
};

/*
 * Create the directory inode for @task and attach it to @dentry.
 *
 * The inode gets the per-thread directory operations, is flagged
 * S_IMMUTABLE and has its link count set to nlink_tid.  @ptr is not used.
 *
 * Returns ERR_PTR(-ENOENT) if no inode could be made, otherwise the
 * result of d_splice_alias().
 */
static struct dentry *proc_task_instantiate(struct dentry *dentry,
        struct task_struct *task, const void *ptr)
{
        struct inode *tid_dir = proc_pid_make_base_inode(dentry->d_sb, task,
                                                         S_IFDIR | S_IRUGO | S_IXUGO);

        if (!tid_dir)
                return ERR_PTR(-ENOENT);

        tid_dir->i_flags |= S_IMMUTABLE;
        tid_dir->i_fop = &proc_tid_base_operations;
        tid_dir->i_op = &proc_tid_base_inode_operations;

        set_nlink(tid_dir, nlink_tid);
        pid_update_inode(task, tid_dir);

        /* The dentry operations are set before the dentry is spliced in. */
        d_set_d_op(dentry, &pid_dentry_operations);
        return d_splice_alias(tid_dir, dentry);
}

/*
 * Lookup of a numeric name inside a /proc/TGID/task/ directory.
 *
 * The name is parsed with name_to_int() and resolved in the pid namespace
 * of the procfs instance.  The task found is only instantiated when it is
 * in the same thread group as the task that owns @dir.  @flags is not used.
 *
 * Returns the result of proc_task_instantiate() on success and
 * ERR_PTR(-ENOENT) in every other case.
 */
static struct dentry *proc_task_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        struct dentry *res = ERR_PTR(-ENOENT);
        struct task_struct *owner, *thread;
        struct proc_fs_info *fs_info;
        struct pid_namespace *ns;
        unsigned int nr;

        owner = get_proc_task(dir);
        if (!owner)
                return res;

        /* ~0U is what this function treats as "not a valid id". */
        nr = name_to_int(&dentry->d_name);
        if (nr == ~0U)
                goto put_owner;

        fs_info = proc_sb_info(dentry->d_sb);
        ns = fs_info->pid_ns;

        /* Pin the task before leaving the RCU read-side section. */
        rcu_read_lock();
        thread = find_task_by_pid_ns(nr, ns);
        if (thread)
                get_task_struct(thread);
        rcu_read_unlock();
        if (!thread)
                goto put_owner;

        if (same_thread_group(owner, thread))
                res = proc_task_instantiate(dentry, thread, NULL);

        put_task_struct(thread);
put_owner:
        put_task_struct(owner);
        return res;
}

/*
 * Find the first tid of a thread group to return to user space.
 *
 * Usually this is just the thread group leader, but if the user's
 * buffer was too small or there was a seek into the middle of the
 * directory we have more work to do.
 *
 * In the case of a short read (@tid and @f_pos both non-zero) we start
 * with find_task_by_pid_ns(@tid) and accept the result if it is in the
 * same thread group as the task behind @pid.
 *
 * In the case of a seek we start with the task behind @pid and walk
 * @f_pos threads forward.
 *
 * Returns the task with a reference taken, or NULL if @f_pos does not fit
 * in an unsigned long, @pid has no task, or @f_pos is at or beyond the
 * number of threads.
 */
static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
                                        struct pid_namespace *ns)
{
        struct task_struct *base, *cur, *found = NULL;
        unsigned long nr = f_pos;

        if (nr != f_pos)        /* 32bit overflow? */
                return NULL;

        rcu_read_lock();
        base = pid_task(pid, PIDTYPE_PID);
        if (!base)
                goto out;

        /* Attempt to start with the tid of a thread */
        if (tid && nr) {
                cur = find_task_by_pid_ns(tid, ns);
                if (cur && same_thread_group(cur, base)) {
                        found = cur;
                        goto out;
                }
        }

        /* If nr exceeds the number of threads there is nothing to do */
        if (nr >= get_nr_threads(base))
                goto out;

        /*
         * We haven't found our starting place yet: walk the thread list
         * and stop at the nr-th entry.
         */
        for_each_thread(base, cur) {
                if (!nr--) {
                        found = cur;
                        goto out;
                }
        }
out:
        /* The reference is taken while still inside the RCU section. */
        if (found)
                get_task_struct(found);
        rcu_read_unlock();
        return found;
}

/*
 * Step from @start to the next thread in the thread list.
 *
 * Returns the next thread with a reference taken, or NULL if @start is no
 * longer alive (pid_alive() fails) or __next_thread() yields nothing.
 *
 * The reference to the input task_struct is always released.
 */
static struct task_struct *next_tid(struct task_struct *start)
{
        struct task_struct *next;

        rcu_read_lock();
        next = pid_alive(start) ? __next_thread(start) : NULL;
        if (next)
                get_task_struct(next);
        rcu_read_unlock();

        put_task_struct(start);
        return next;
}

/*
 * Readdir for the /proc/TGID/task/ directories.
 *
 * Emits "." and "..", then one entry per thread, starting from the thread
 * selected by first_tid() and stepping with next_tid().  ctx->pos is
 * advanced by one for every thread visited.
 *
 * Returns -ENOENT if the directory inode is dead, 0 otherwise.
 */
static int proc_task_readdir(struct file *file, struct dir_context *ctx)
{
        struct inode *inode = file_inode(file);
        struct pid_namespace *ns;
        struct task_struct *thread;
        int tid;

        if (proc_inode_is_dead(inode))
                return -ENOENT;

        if (!dir_emit_dots(file, ctx))
                return 0;

        /*
         * f_version caches the tid that the last readdir call couldn't
         * return. lseek aka telldir automagically resets f_version to 0.
         */
        ns = proc_pid_ns(inode->i_sb);
        tid = (int)file->f_version;
        file->f_version = 0;

        thread = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
        while (thread) {
                char name[10 + 1];
                unsigned int len;

                tid = task_pid_nr_ns(thread, ns);
                /* A zero tid means the task has just exited: skip it. */
                if (tid) {
                        len = snprintf(name, sizeof(name), "%u", tid);
                        if (!proc_fill_cache(file, ctx, name, len,
                                        proc_task_instantiate, thread, NULL)) {
                                /*
                                 * Returning this tid failed; save it as the
                                 * first tid for the next readdir call.
                                 */
                                file->f_version = (u64)tid;
                                put_task_struct(thread);
                                break;
                        }
                }

                thread = next_tid(thread);
                ctx->pos++;
        }

        return 0;
}

/*
 * getattr for a task directory: fill @stat with the generic inode
 * attributes and, when the task behind the inode still exists, add its
 * current thread count to the reported link count.
 *
 * @idmap and @query_flags are not used.  Always returns 0.
 */
static int proc_task_getattr(struct mnt_idmap *idmap,
                             const struct path *path, struct kstat *stat,
                             u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct task_struct *task;

        task = get_proc_task(inode);
        generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
        if (!task)
                return 0;

        stat->nlink += get_nr_threads(task);
        put_task_struct(task);
        return 0;
}

/*
 * Inode operations built around proc_task_lookup() and
 * proc_task_getattr(), i.e. the handlers for a /proc/TGID/task/ directory.
 */
static const struct inode_operations proc_task_inode_operations = {
        .lookup                = proc_task_lookup,
        .getattr        = proc_task_getattr,
        .setattr        = proc_setattr,
        .permission        = proc_pid_permission,
};

/*
 * File operations whose directory iteration is proc_task_readdir(), the
 * readdir for the /proc/TGID/task/ directories.
 */
static const struct file_operations proc_task_operations = {
        .read                = generic_read_dir,
        .iterate_shared        = proc_task_readdir,
        .llseek                = generic_file_llseek,
};

/*
 * Init-time setup of the link counts applied to per-process (nlink_tgid)
 * and per-thread (nlink_tid) directory inodes.  Each is computed by
 * pid_entry_nlink() from the corresponding entry table.
 */
void __init set_proc_pid_nlink(void)
{
        const unsigned int nr_tgid_entries = ARRAY_SIZE(tgid_base_stuff);
        const unsigned int nr_tid_entries = ARRAY_SIZE(tid_base_stuff);

        nlink_tgid = pid_entry_nlink(tgid_base_stuff, nr_tgid_entries);
        nlink_tid = pid_entry_nlink(tid_base_stuff, nr_tid_entries);
}
























































































































































































































































































































































































































































































































































































































































































































































    1 













    1 



    1 






    1 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
/* Connection tracking via netlink socket. Allows for user space
 * protocol helpers and general trouble making from userspace.
 *
 * (C) 2001 by Jay Schulist <jschlst@samba.org>
 * (C) 2002-2006 by Harald Welte <laforge@gnumonks.org>
 * (C) 2003 by Patrick Mchardy <kaber@trash.net>
 * (C) 2005-2012 by Pablo Neira Ayuso <pablo@netfilter.org>
 *
 * Initial connection tracking via netlink development funded and
 * generally made possible by Network Robots, Inc. (www.networkrobots.com)
 *
 * Further development of this code funded by Astaro AG (http://www.astaro.com)
 *
 * This software may be used and distributed according to the terms
 * of the GNU General Public License, incorporated herein by reference.
 */

#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/rculist.h>
#include <linux/rculist_nulls.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/security.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/netlink.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/siphash.h>

#include <linux/netfilter.h>
#include <net/netlink.h>
#include <net/sock.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#if IS_ENABLED(CONFIG_NF_NAT)
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#endif

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

#include "nf_internals.h"

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("List and change connection tracking table");

/* Per-dump state for a list dump.
 * NOTE(review): the users of this struct are outside this chunk; the field
 * notes below are inferred from names and types only — confirm against the
 * dump callbacks before relying on them.
 */
struct ctnetlink_list_dump_ctx {
        /* presumably the conntrack at which the previous pass stopped */
        struct nf_conn *last;
        /* presumably the CPU index being walked */
        unsigned int cpu;
        /* presumably set once the dump has finished */
        bool done;
};

/* Emit the CTA_TUPLE_PROTO nest for @tuple: the protocol number first, then
 * whatever the l4 tracker appends through its ->tuple_to_nlattr callback.
 *
 * Returns -1 if the nest or CTA_PROTO_NUM cannot be added; otherwise the
 * callback's return value (0 when the tracker has no callback).
 */
static int ctnetlink_dump_tuples_proto(struct sk_buff *skb,
                                const struct nf_conntrack_tuple *tuple,
                                const struct nf_conntrack_l4proto *l4proto)
{
        struct nlattr *nest;
        int err = 0;

        nest = nla_nest_start(skb, CTA_TUPLE_PROTO);
        if (!nest)
                return -1;

        if (nla_put_u8(skb, CTA_PROTO_NUM, tuple->dst.protonum) != 0)
                return -1;

        if (likely(l4proto->tuple_to_nlattr))
                err = l4proto->tuple_to_nlattr(skb, tuple);

        /* the nest is closed even when the callback reported an error */
        nla_nest_end(skb, nest);
        return err;
}

/* Add the IPv4 source and destination addresses of @tuple to @skb.
 * Returns 0 on success or -EMSGSIZE if either attribute cannot be added.
 */
static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
                                const struct nf_conntrack_tuple *tuple)
{
        if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip))
                return -EMSGSIZE;

        if (nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
                return -EMSGSIZE;

        return 0;
}

/* Add the IPv6 source and destination addresses of @tuple to @skb.
 * Returns 0 on success or -EMSGSIZE if either attribute cannot be added.
 */
static int ipv6_tuple_to_nlattr(struct sk_buff *skb,
                                const struct nf_conntrack_tuple *tuple)
{
        if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6))
                return -EMSGSIZE;

        if (nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6))
                return -EMSGSIZE;

        return 0;
}

/* Emit the CTA_TUPLE_IP nest holding the layer-3 addresses of @tuple.
 * Only NFPROTO_IPV4 and NFPROTO_IPV6 produce attributes; any other l3num
 * yields an empty nest and a return value of 0.
 *
 * Returns -1 if the nest cannot be opened, otherwise the address helper's
 * return value.
 */
static int ctnetlink_dump_tuples_ip(struct sk_buff *skb,
                                    const struct nf_conntrack_tuple *tuple)
{
        struct nlattr *nest;
        int err = 0;

        nest = nla_nest_start(skb, CTA_TUPLE_IP);
        if (!nest)
                return -1;

        if (tuple->src.l3num == NFPROTO_IPV4)
                err = ipv4_tuple_to_nlattr(skb, tuple);
        else if (tuple->src.l3num == NFPROTO_IPV6)
                err = ipv6_tuple_to_nlattr(skb, tuple);

        nla_nest_end(skb, nest);
        return err;
}

/* Dump the layer-3 part and then the layer-4 part of @tuple into @skb.
 * Both steps run under rcu_read_lock(); the l4 part is skipped when the
 * layer-3 dump returned a negative value.
 *
 * Returns the result of the last step that ran.
 */
static int ctnetlink_dump_tuples(struct sk_buff *skb,
                                 const struct nf_conntrack_tuple *tuple)
{
        int err;

        rcu_read_lock();

        err = ctnetlink_dump_tuples_ip(skb, tuple);
        if (err < 0)
                goto unlock;

        err = ctnetlink_dump_tuples_proto(skb, tuple,
                                          nf_ct_l4proto_find(tuple->dst.protonum));
unlock:
        rcu_read_unlock();
        return err;
}

/* Add the zone id as a big-endian 16-bit attribute of type @attrtype.
 * Nothing is emitted for the default zone id or when the zone's direction
 * differs from @dir.
 *
 * Returns 0 on success (including the "nothing to emit" case), -1 if the
 * attribute cannot be added.
 */
static int ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype,
                                  const struct nf_conntrack_zone *zone, int dir)
{
        if (zone->id != NF_CT_DEFAULT_ZONE_ID && zone->dir == dir &&
            nla_put_be16(skb, attrtype, htons(zone->id)))
                return -1;

        return 0;
}

/* Add ct->status as a big-endian CTA_STATUS attribute.
 * Returns 0 on success, -1 if the attribute cannot be added.
 */
static int ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct)
{
        return nla_put_be32(skb, CTA_STATUS, htonl(ct->status)) ? -1 : 0;
}

/* Add the CTA_TIMEOUT attribute for @ct.
 *
 * Confirmed entries report nf_ct_expires(ct), unconfirmed ones the raw
 * ct->timeout; both are divided by HZ before being emitted. When
 * @skip_zero is set, a resulting value of 0 is not emitted at all.
 *
 * Returns 0 on success or when skipped, -1 if the attribute cannot be added.
 */
static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct,
                                  bool skip_zero)
{
        long secs;

        if (!nf_ct_is_confirmed(ct))
                secs = ct->timeout / HZ;
        else
                secs = nf_ct_expires(ct) / HZ;

        if (secs == 0 && skip_zero)
                return 0;

        return nla_put_be32(skb, CTA_TIMEOUT, htonl(secs)) ? -1 : 0;
}

/* Emit the CTA_PROTOINFO nest by delegating to the l4 tracker's ->to_nlattr
 * callback; @destroy is forwarded to the callback unchanged.
 *
 * Returns 0 when the tracker has no callback, -1 if the nest cannot be
 * opened, otherwise the callback's return value.
 */
static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct,
                                    bool destroy)
{
        const struct nf_conntrack_l4proto *proto;
        struct nlattr *nest;
        int err;

        proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
        if (!proto->to_nlattr)
                return 0;

        nest = nla_nest_start(skb, CTA_PROTOINFO);
        if (!nest)
                return -1;

        err = proto->to_nlattr(skb, nest, ct, destroy);
        nla_nest_end(skb, nest);

        return err;
}

/* Emit the CTA_HELP nest (helper name plus optional helper-specific data)
 * for @ct. The helper pointer is read under rcu_read_lock().
 *
 * Returns 0 when @ct has no helper extension or no helper assigned, and on
 * success; -1 if the nest or the name cannot be added. The return value of
 * the helper's own ->to_nlattr callback is not checked.
 */
static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
                                   const struct nf_conn *ct)
{
        const struct nf_conn_help *help = nfct_help(ct);
        struct nf_conntrack_helper *helper;
        struct nlattr *nest;
        int err = 0;

        if (!help)
                return 0;

        rcu_read_lock();

        helper = rcu_dereference(help->helper);
        if (!helper)
                goto unlock;

        /* from here on, any early exit is a failure */
        err = -1;

        nest = nla_nest_start(skb, CTA_HELP);
        if (!nest)
                goto unlock;

        if (nla_put_string(skb, CTA_HELP_NAME, helper->name))
                goto unlock;

        if (helper->to_nlattr)
                helper->to_nlattr(skb, ct);

        nla_nest_end(skb, nest);
        err = 0;
unlock:
        rcu_read_unlock();
        return err;
}

static int
dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct,
              enum ip_conntrack_dir dir, int type)
{
        enum ctattr_type attr = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG;
        struct nf_conn_counter *counter = acct->counter;
        struct nlattr *nest_count;
        u64 pkts, bytes;

        if (type == IPCTNL_MSG_CT_GET_CTRZERO) {
                pkts = atomic64_xchg(&counter[dir].packets, 0);
                bytes = atomic64_xchg(&counter[dir].bytes, 0);
        } else {
                pkts = atomic64_read(&counter[dir].packets);
                bytes = atomic64_read(&counter[dir].bytes);
        }

        nest_count = nla_nest_start(skb, attr);
        if (!nest_count)
                goto nla_put_failure;

        if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts),
                         CTA_COUNTERS_PAD) ||
            nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes),
                         CTA_COUNTERS_PAD))
                goto nla_put_failure;

        nla_nest_end(skb, nest_count);

        return 0;

nla_put_failure:
        return -1;
}

/* Dump the accounting counters of @ct for both directions, original first.
 * Returns 0 when @ct has no accounting extension or on success, -1 if
 * either direction fails; @type is passed through to dump_counters().
 */
static int
ctnetlink_dump_acct(struct sk_buff *skb, const struct nf_conn *ct, int type)
{
        struct nf_conn_acct *acct;

        acct = nf_conn_acct_find(ct);
        if (!acct)
                return 0;

        if (dump_counters(skb, acct, IP_CT_DIR_ORIGINAL, type) < 0 ||
            dump_counters(skb, acct, IP_CT_DIR_REPLY, type) < 0)
                return -1;

        return 0;
}

/* Emit the CTA_TIMESTAMP nest for @ct: the start stamp always, the stop
 * stamp only when it is non-zero.
 *
 * Returns 0 when @ct has no timestamp extension or on success, -1 if any
 * attribute cannot be added.
 */
static int
ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
{
        const struct nf_conn_tstamp *ts;
        struct nlattr *nest;

        ts = nf_conn_tstamp_find(ct);
        if (!ts)
                return 0;

        nest = nla_nest_start(skb, CTA_TIMESTAMP);
        if (!nest)
                return -1;

        if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(ts->start),
                         CTA_TIMESTAMP_PAD))
                return -1;

        if (ts->stop != 0 &&
            nla_put_be64(skb, CTA_TIMESTAMP_STOP, cpu_to_be64(ts->stop),
                         CTA_TIMESTAMP_PAD))
                return -1;

        nla_nest_end(skb, nest);
        return 0;
}

#ifdef CONFIG_NF_CONNTRACK_MARK
/* Add ct->mark as a big-endian CTA_MARK attribute. A zero mark is emitted
 * only when @dump is true. The mark is sampled once with READ_ONCE().
 *
 * Returns 0 on success or when nothing is emitted, -1 if the attribute
 * cannot be added.
 */
static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct,
                               bool dump)
{
        u32 mark = READ_ONCE(ct->mark);

        if (mark || dump) {
                if (nla_put_be32(skb, CTA_MARK, htonl(mark)))
                        return -1;
        }

        return 0;
}
#else
/* connmark support compiled out: dumping the mark is a successful no-op */
#define ctnetlink_dump_mark(a, b, c) (0)
#endif

#ifdef CONFIG_NF_CONNTRACK_SECMARK
/* Emit the CTA_SECCTX nest carrying the security context string obtained
 * from ct->secmark.
 *
 * Returns 0 when the secmark cannot be translated to a context or on
 * success, -1 if the nest or the name cannot be added. The context is
 * released on every path that obtained it.
 */
static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
{
        struct nlattr *nest;
        char *secctx;
        int err = -1;
        int len;

        if (security_secid_to_secctx(ct->secmark, &secctx, &len))
                return 0;

        nest = nla_nest_start(skb, CTA_SECCTX);
        if (nest && !nla_put_string(skb, CTA_SECCTX_NAME, secctx)) {
                nla_nest_end(skb, nest);
                err = 0;
        }

        security_release_secctx(secctx, len);
        return err;
}
#else
/* secmark support compiled out: dumping the context is a successful no-op */
#define ctnetlink_dump_secctx(a, b) (0)
#endif

#ifdef CONFIG_NF_CONNTRACK_LABELS
/* Space needed for a CTA_LABELS attribute, or 0 when @ct has no labels
 * extension.
 */
static inline int ctnetlink_label_size(const struct nf_conn *ct)
{
        struct nf_conn_labels *labels = nf_ct_labels_find(ct);

        return labels ? nla_total_size(sizeof(labels->bits)) : 0;
}

/* Add the whole label bitmap as CTA_LABELS, but only if at least one word
 * of it is non-zero.
 *
 * Returns 0 when there is no labels extension or all words are zero,
 * otherwise the return value of nla_put().
 */
static int
ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
{
        struct nf_conn_labels *labels = nf_ct_labels_find(ct);
        unsigned int word = 0;

        if (!labels)
                return 0;

        /* word 0 is always examined, the bound is tested after each step */
        for (;;) {
                if (labels->bits[word] != 0)
                        return nla_put(skb, CTA_LABELS, sizeof(labels->bits),
                                       labels->bits);
                if (++word >= ARRAY_SIZE(labels->bits))
                        break;
        }

        return 0;
}
#else
/* label support compiled out: nothing to dump, no space to reserve */
#define ctnetlink_dump_labels(a, b) (0)
#define ctnetlink_label_size(a)        (0)
#endif

#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)

/* Emit the CTA_TUPLE_MASTER nest with the original-direction tuple of the
 * master conntrack. Only done when IPS_EXPECTED is set in ct->status.
 *
 * Returns 0 when nothing is emitted or on success, -1 on failure.
 */
static int ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct)
{
        struct nlattr *nest;

        if (!(ct->status & IPS_EXPECTED))
                return 0;

        nest = nla_nest_start(skb, CTA_TUPLE_MASTER);
        if (!nest || ctnetlink_dump_tuples(skb, master_tuple(ct)) < 0)
                return -1;

        nla_nest_end(skb, nest);
        return 0;
}

/* Emit one sequence-adjustment record @seq inside a nest of attribute type
 * @type: correction position, offset before and offset after, each as a
 * big-endian 32-bit value.
 *
 * Returns 0 on success, -1 if any attribute cannot be added.
 */
static int
dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq, int type)
{
        struct nlattr *nest;

        nest = nla_nest_start(skb, type);
        if (!nest)
                return -1;

        if (nla_put_be32(skb, CTA_SEQADJ_CORRECTION_POS,
                         htonl(seq->correction_pos)))
                return -1;

        if (nla_put_be32(skb, CTA_SEQADJ_OFFSET_BEFORE,
                         htonl(seq->offset_before)))
                return -1;

        if (nla_put_be32(skb, CTA_SEQADJ_OFFSET_AFTER,
                         htonl(seq->offset_after)))
                return -1;

        nla_nest_end(skb, nest);
        return 0;
}

/* Dump the sequence adjustments of @ct for both directions, original first,
 * while holding ct->lock (bh-disabling spinlock).
 *
 * Returns 0 when IPS_SEQ_ADJUST is not set, when there is no seqadj
 * extension, or on success; -1 if either direction fails.
 */
static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, struct nf_conn *ct)
{
        struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
        int err = -1;

        if (!seqadj || !(ct->status & IPS_SEQ_ADJUST))
                return 0;

        spin_lock_bh(&ct->lock);

        if (dump_ct_seq_adj(skb, &seqadj->seq[IP_CT_DIR_ORIGINAL],
                            CTA_SEQ_ADJ_ORIG) != -1 &&
            dump_ct_seq_adj(skb, &seqadj->seq[IP_CT_DIR_REPLY],
                            CTA_SEQ_ADJ_REPLY) != -1)
                err = 0;

        spin_unlock_bh(&ct->lock);
        return err;
}

/* Emit the CTA_SYNPROXY nest with the isn, its and tsoff fields of the
 * synproxy extension, each as a big-endian 32-bit value.
 *
 * Returns 0 when @ct has no synproxy extension or on success, -1 if any
 * attribute cannot be added.
 */
static int ctnetlink_dump_ct_synproxy(struct sk_buff *skb, struct nf_conn *ct)
{
        struct nf_conn_synproxy *sp = nfct_synproxy(ct);
        struct nlattr *nest;

        if (!sp)
                return 0;

        nest = nla_nest_start(skb, CTA_SYNPROXY);
        if (!nest)
                return -1;

        if (nla_put_be32(skb, CTA_SYNPROXY_ISN, htonl(sp->isn)))
                return -1;

        if (nla_put_be32(skb, CTA_SYNPROXY_ITS, htonl(sp->its)))
                return -1;

        if (nla_put_be32(skb, CTA_SYNPROXY_TSOFF, htonl(sp->tsoff)))
                return -1;

        nla_nest_end(skb, nest);
        return 0;
}

/* Add the value of nf_ct_get_id(ct) as CTA_ID. The value is emitted as-is
 * (force-cast to __be32, no byte swap).
 *
 * Returns 0 on success, -1 if the attribute cannot be added.
 */
static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
{
        __be32 ct_id = (__force __be32)nf_ct_get_id(ct);

        return nla_put_be32(skb, CTA_ID, ct_id) ? -1 : 0;
}

static int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)
{
        if (nla_put_be32(skb, CTA_USE, htonl(refcount_read(&ct->ct_general.use))))
                goto nla_put_failure;
        return 0;

nla_put_failure:
        return -1;
}

/* Dump everything that lives in ct->ext: accounting, timestamps, helper,
 * labels, sequence adjustment and synproxy data, in that order.
 *
 * All of these access ct->ext, so the caller must either hold a reference
 * on ct or prevent its deletion by holding either the bucket spinlock or
 * the pcpu dying list lock.
 *
 * Returns 0 on success, -1 as soon as one of the dumps fails.
 */
static int ctnetlink_dump_extinfo(struct sk_buff *skb,
                                  struct nf_conn *ct, u32 type)
{
        if (ctnetlink_dump_acct(skb, ct, type) < 0)
                return -1;
        if (ctnetlink_dump_timestamp(skb, ct) < 0)
                return -1;
        if (ctnetlink_dump_helpinfo(skb, ct) < 0)
                return -1;
        if (ctnetlink_dump_labels(skb, ct) < 0)
                return -1;
        if (ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
                return -1;
        if (ctnetlink_dump_ct_synproxy(skb, ct) < 0)
                return -1;

        return 0;
}

/* Dump the core attributes of @ct: status, mark (always, even when zero),
 * security context, id, use count and master tuple. Timeout and protocol
 * info follow unless IPS_OFFLOAD_BIT is set in ct->status.
 *
 * Returns 0 on success, -1 as soon as one of the dumps fails.
 */
static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct)
{
        if (ctnetlink_dump_status(skb, ct) < 0)
                return -1;
        if (ctnetlink_dump_mark(skb, ct, true) < 0)
                return -1;
        if (ctnetlink_dump_secctx(skb, ct) < 0)
                return -1;
        if (ctnetlink_dump_id(skb, ct) < 0)
                return -1;
        if (ctnetlink_dump_use(skb, ct) < 0)
                return -1;
        if (ctnetlink_dump_master(skb, ct) < 0)
                return -1;

        /* offloaded entries carry neither timeout nor protocol state here */
        if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
                return 0;

        if (ctnetlink_dump_timeout(skb, ct, false) < 0)
                return -1;
        if (ctnetlink_dump_protoinfo(skb, ct, false) < 0)
                return -1;

        return 0;
}

/* Build one IPCTNL_MSG_CT_NEW message describing @ct in @skb.
 *
 * Layout, in order: original tuple (+ zone), reply tuple (+ zone), the
 * CTA_ZONE attribute, the core info, and - when @extinfo is set - the
 * extension info. NLM_F_MULTI is added to @flags when @portid is non-zero.
 *
 * Returns skb->len on success. On any failure the partially built message
 * is cancelled and -1 is returned.
 */
static int
ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
                    struct nf_conn *ct, bool extinfo, unsigned int flags)
{
        const struct nf_conntrack_zone *zone;
        struct nlattr *nest;
        struct nlmsghdr *nlh;
        unsigned int event;

        if (portid)
                flags |= NLM_F_MULTI;

        event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW);
        nlh = nfnl_msg_put(skb, portid, seq, event, flags, nf_ct_l3num(ct),
                           NFNETLINK_V0, 0);
        if (!nlh)
                goto cancel;

        zone = nf_ct_zone(ct);

        /* original direction */
        nest = nla_nest_start(skb, CTA_TUPLE_ORIG);
        if (!nest ||
            ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0 ||
            ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
                                   NF_CT_ZONE_DIR_ORIG) < 0)
                goto cancel;
        nla_nest_end(skb, nest);

        /* reply direction */
        nest = nla_nest_start(skb, CTA_TUPLE_REPLY);
        if (!nest ||
            ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0 ||
            ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
                                   NF_CT_ZONE_DIR_REPL) < 0)
                goto cancel;
        nla_nest_end(skb, nest);

        if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone,
                                   NF_CT_DEFAULT_ZONE_DIR) < 0)
                goto cancel;

        if (ctnetlink_dump_info(skb, ct) < 0)
                goto cancel;

        if (extinfo && ctnetlink_dump_extinfo(skb, ct, type) < 0)
                goto cancel;

        nlmsg_end(skb, nlh);
        return skb->len;

cancel:
        /* also reached with nlh == NULL when the header could not be put */
        nlmsg_cancel(skb, nlh);
        return -1;
}

/* Netlink policy for the attributes nested in CTA_TUPLE_IP: IPv4 addresses
 * are declared as NLA_U32, IPv6 addresses by length only (4 x __be32).
 * ctnetlink_proto_size() also feeds this table to nla_policy_len() to
 * estimate message size.
 */
static const struct nla_policy cta_ip_nla_policy[CTA_IP_MAX + 1] = {
        [CTA_IP_V4_SRC]        = { .type = NLA_U32 },
        [CTA_IP_V4_DST]        = { .type = NLA_U32 },
        [CTA_IP_V6_SRC]        = { .len = sizeof(__be32) * 4 },
        [CTA_IP_V6_DST]        = { .len = sizeof(__be32) * 4 },
};

#if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS)
/* Estimate the space needed for the address and protocol attributes of
 * @ct: the IP policy length and the l4 tuple size are each counted three
 * times (ORIG, REPLY, MASTER); the l4 tracker's nlattr_size is added once.
 */
static size_t ctnetlink_proto_size(const struct nf_conn *ct)
{
        const struct nf_conntrack_l4proto *proto;
        size_t l3_len, l4_tuple_len = 0;

        l3_len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1);
        l3_len *= 3u; /* ORIG, REPLY, MASTER */

        proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
        l3_len += proto->nlattr_size;

        if (proto->nlattr_tuple_size) {
                l4_tuple_len = proto->nlattr_tuple_size();
                l4_tuple_len *= 3u; /* ORIG, REPLY, MASTER */
        }

        return l3_len + l4_tuple_len;
}
#endif

/* Space needed for the accounting attributes of @ct, or 0 when the
 * accounting extension is absent. Per direction: one nest header plus two
 * 64-bit values (packets and bytes).
 */
static inline size_t ctnetlink_acct_size(const struct nf_conn *ct)
{
        if (nf_ct_ext_exist(ct, NF_CT_EXT_ACCT))
                return 2 * (nla_total_size(0) /* CTA_COUNTERS_ORIG|REPL */
                            + nla_total_size_64bit(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */
                            + nla_total_size_64bit(sizeof(uint64_t))); /* CTA_COUNTERS_BYTES */

        return 0;
}

/* Space needed for the CTA_SECCTX nest of @ct. Returns 0 when secmark
 * support is compiled out or the secmark cannot be translated to a context
 * length.
 */
static inline int ctnetlink_secctx_size(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_SECMARK
        int ctx_len;

        /* NULL buffer: only the length is requested */
        if (security_secid_to_secctx(ct->secmark, NULL, &ctx_len) != 0)
                return 0;

        return nla_total_size(0) /* CTA_SECCTX */
               + nla_total_size(sizeof(char) * ctx_len); /* CTA_SECCTX_NAME */
#else
        return 0;
#endif
}

/* Space needed for the CTA_TIMESTAMP nest of @ct (nest header plus two
 * 64-bit stamps). Returns 0 when timestamp support is compiled out or the
 * extension is absent.
 */
static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
        if (nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
                return nla_total_size(0) +
                       2 * nla_total_size_64bit(sizeof(uint64_t));
#endif
        return 0;
}

#ifdef CONFIG_NF_CONNTRACK_EVENTS
/* Upper-bound estimate of the netlink payload needed to describe @ct in an
 * event message; used to size the skb allocated for it.
 */
static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
{
        size_t size = NLMSG_ALIGN(sizeof(struct nfgenmsg));

        /* tuple scaffolding, three of each */
        size += 3 * nla_total_size(0); /* CTA_TUPLE_ORIG|REPL|MASTER */
        size += 3 * nla_total_size(0); /* CTA_TUPLE_IP */
        size += 3 * nla_total_size(0); /* CTA_TUPLE_PROTO */
        size += 3 * nla_total_size(sizeof(u_int8_t)); /* CTA_PROTO_NUM */

        size += nla_total_size(sizeof(u_int32_t)); /* CTA_ID */
        size += nla_total_size(sizeof(u_int32_t)); /* CTA_STATUS */
        size += ctnetlink_acct_size(ct);
        size += ctnetlink_timestamp_size(ct);
        size += nla_total_size(sizeof(u_int32_t)); /* CTA_TIMEOUT */
        size += nla_total_size(0); /* CTA_PROTOINFO */
        size += nla_total_size(0); /* CTA_HELP */
        size += nla_total_size(NF_CT_HELPER_NAME_LEN); /* CTA_HELP_NAME */
        size += ctnetlink_secctx_size(ct);
#if IS_ENABLED(CONFIG_NF_NAT)
        size += 2 * nla_total_size(0); /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
        size += 6 * nla_total_size(sizeof(u_int32_t)); /* CTA_NAT_SEQ_OFFSET */
#endif
#ifdef CONFIG_NF_CONNTRACK_MARK
        size += nla_total_size(sizeof(u_int32_t)); /* CTA_MARK */
#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
        size += nla_total_size(sizeof(u_int16_t)); /* CTA_ZONE|CTA_TUPLE_ZONE */
#endif
        size += ctnetlink_proto_size(ct);
        size += ctnetlink_label_size(ct);

        return size;
}

/* Conntrack event handler: build a netlink message describing item->ct for
 * the @events bitmask and hand it to nfnetlink_send().
 *
 * The event bits select message type, flags and multicast group:
 *   IPCT_DESTROY            -> IPCTNL_MSG_CT_DELETE, NFNLGRP_CONNTRACK_DESTROY
 *   IPCT_NEW | IPCT_RELATED -> IPCTNL_MSG_CT_NEW with NLM_F_CREATE|NLM_F_EXCL,
 *                              NFNLGRP_CONNTRACK_NEW
 *   any other bit           -> IPCTNL_MSG_CT_NEW, NFNLGRP_CONNTRACK_UPDATE
 *
 * Returns 0 when there is nothing to do (no events, or no report requested
 * and no listeners on the group) and when the message was handed off.
 * Returns -ENOBUFS when nfnetlink_send() fails with -ENOBUFS or -EAGAIN, or
 * when building the message failed and nfnetlink_set_err() returned a
 * positive value.
 */
static int
ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
{
        const struct nf_conntrack_zone *zone;
        struct net *net;
        struct nlmsghdr *nlh;
        struct nlattr *nest_parms;
        struct nf_conn *ct = item->ct;
        struct sk_buff *skb;
        unsigned int type;
        unsigned int flags = 0, group;
        int err;

        /* destroy takes precedence over new/related, which precede update */
        if (events & (1 << IPCT_DESTROY)) {
                type = IPCTNL_MSG_CT_DELETE;
                group = NFNLGRP_CONNTRACK_DESTROY;
        } else if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) {
                type = IPCTNL_MSG_CT_NEW;
                flags = NLM_F_CREATE|NLM_F_EXCL;
                group = NFNLGRP_CONNTRACK_NEW;
        } else if (events) {
                type = IPCTNL_MSG_CT_NEW;
                group = NFNLGRP_CONNTRACK_UPDATE;
        } else
                return 0;

        net = nf_ct_net(ct);
        /* skip the work unless a report was asked for or someone listens */
        if (!item->report && !nfnetlink_has_listeners(net, group))
                return 0;

        skb = nlmsg_new(ctnetlink_nlmsg_size(ct), GFP_ATOMIC);
        if (skb == NULL)
                goto errout;

        /* from here on, type holds the full nfnetlink message type; this is
         * also the value later passed to ctnetlink_dump_acct()
         */
        type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, type);
        nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, nf_ct_l3num(ct),
                           NFNETLINK_V0, 0);
        if (!nlh)
                goto nlmsg_failure;

        zone = nf_ct_zone(ct);

        /* original-direction tuple, with its zone id if applicable */
        nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG);
        if (!nest_parms)
                goto nla_put_failure;
        if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
                goto nla_put_failure;
        if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
                                   NF_CT_ZONE_DIR_ORIG) < 0)
                goto nla_put_failure;
        nla_nest_end(skb, nest_parms);

        /* reply-direction tuple, with its zone id if applicable */
        nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY);
        if (!nest_parms)
                goto nla_put_failure;
        if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
                goto nla_put_failure;
        if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
                                   NF_CT_ZONE_DIR_REPL) < 0)
                goto nla_put_failure;
        nla_nest_end(skb, nest_parms);

        if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone,
                                   NF_CT_DEFAULT_ZONE_DIR) < 0)
                goto nla_put_failure;

        if (ctnetlink_dump_id(skb, ct) < 0)
                goto nla_put_failure;

        if (ctnetlink_dump_status(skb, ct) < 0)
                goto nla_put_failure;

        if (events & (1 << IPCT_DESTROY)) {
                /* destroy: timeout (omitted when zero), counters,
                 * timestamps and final protocol state
                 */
                if (ctnetlink_dump_timeout(skb, ct, true) < 0)
                        goto nla_put_failure;

                if (ctnetlink_dump_acct(skb, ct, type) < 0 ||
                    ctnetlink_dump_timestamp(skb, ct) < 0 ||
                    ctnetlink_dump_protoinfo(skb, ct, true) < 0)
                        goto nla_put_failure;
        } else {
                /* new/update: the timeout is always sent; most other
                 * sections only when their event bit is set
                 */
                if (ctnetlink_dump_timeout(skb, ct, false) < 0)
                        goto nla_put_failure;

                if (events & (1 << IPCT_PROTOINFO) &&
                    ctnetlink_dump_protoinfo(skb, ct, false) < 0)
                        goto nla_put_failure;

                /* helper info is also sent whenever a helper extension exists */
                if ((events & (1 << IPCT_HELPER) || nfct_help(ct))
                    && ctnetlink_dump_helpinfo(skb, ct) < 0)
                        goto nla_put_failure;

#ifdef CONFIG_NF_CONNTRACK_SECMARK
                /* security context is also sent whenever secmark is non-zero */
                if ((events & (1 << IPCT_SECMARK) || ct->secmark)
                    && ctnetlink_dump_secctx(skb, ct) < 0)
                        goto nla_put_failure;
#endif
                if (events & (1 << IPCT_LABEL) &&
                     ctnetlink_dump_labels(skb, ct) < 0)
                        goto nla_put_failure;

                if (events & (1 << IPCT_RELATED) &&
                    ctnetlink_dump_master(skb, ct) < 0)
                        goto nla_put_failure;

                if (events & (1 << IPCT_SEQADJ) &&
                    ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
                        goto nla_put_failure;

                if (events & (1 << IPCT_SYNPROXY) &&
                    ctnetlink_dump_ct_synproxy(skb, ct) < 0)
                        goto nla_put_failure;
        }

#ifdef CONFIG_NF_CONNTRACK_MARK
        /* a zero mark is only sent when the IPCT_MARK event bit is set */
        if (ctnetlink_dump_mark(skb, ct, events & (1 << IPCT_MARK)))
                goto nla_put_failure;
#endif
        nlmsg_end(skb, nlh);
        err = nfnetlink_send(skb, net, item->portid, group, item->report,
                             GFP_ATOMIC);
        /* only these two send errors are reported to the caller */
        if (err == -ENOBUFS || err == -EAGAIN)
                return -ENOBUFS;

        return 0;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
nlmsg_failure:
        kfree_skb(skb);
errout:
        /* flag -ENOBUFS on the group; a positive return is passed on as
         * -ENOBUFS to the caller
         */
        if (nfnetlink_set_err(net, 0, group, -ENOBUFS) > 0)
                return -ENOBUFS;

        return 0;
}
#endif /* CONFIG_NF_CONNTRACK_EVENTS */

/*
 * Dump teardown: drop the reference on the conntrack parked in
 * cb->args[1] (if there is one) and free whatever cb->data points to.
 */
static int ctnetlink_done(struct netlink_callback *cb)
{
        struct nf_conn *pending = (struct nf_conn *)cb->args[1];

        if (pending)
                nf_ct_put(pending);

        kfree(cb->data);

        return 0;
}

/*
 * Value/mask pair used to filter on a 32-bit conntrack field: an entry
 * passes when (field & mask) == val.
 */
struct ctnetlink_filter_u32 {
        u32 val;        /* value the masked field must equal */
        u32 mask;       /* bits of the field that take part in the comparison */
};

/*
 * Parsed filter built by ctnetlink_alloc_filter() and evaluated by
 * ctnetlink_filter_match().
 */
struct ctnetlink_filter {
        u8 family;              /* L3 protocol to match; 0 matches every family */
        bool zone_filter;       /* true when CTA_ZONE was given and @zone must match */

        /* CTA_FILTER_F_* bits selecting which fields of @orig / @reply
         * are compared; 0 disables the comparison for that direction.
         */
        u_int32_t orig_flags;
        u_int32_t reply_flags;

        struct nf_conntrack_tuple orig;         /* reference tuple, original direction */
        struct nf_conntrack_tuple reply;        /* reference tuple, reply direction */
        struct nf_conntrack_zone zone;          /* zone to match when @zone_filter is set */

        struct ctnetlink_filter_u32 mark;       /* filter on ct->mark */
        struct ctnetlink_filter_u32 status;     /* filter on ct->status */
};

/* Validation policy for the attributes nested inside CTA_FILTER. */
static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = {
        [CTA_FILTER_ORIG_FLAGS]                = { .type = NLA_U32 },
        [CTA_FILTER_REPLY_FLAGS]        = { .type = NLA_U32 },
};

/*
 * Parse the nested CTA_FILTER attribute into filter->orig_flags and
 * filter->reply_flags.
 *
 * Returns 0 on success, the nla_parse_nested() error if the attribute is
 * malformed, or -EOPNOTSUPP when a flag word carries bits outside
 * CTA_FILTER_F_ALL.
 */
static int ctnetlink_parse_filter(const struct nlattr *attr,
                                  struct ctnetlink_filter *filter)
{
        struct nlattr *tb[CTA_FILTER_MAX + 1];
        const struct nlattr *nla;
        int err;

        err = nla_parse_nested(tb, CTA_FILTER_MAX, attr, cta_filter_nla_policy,
                               NULL);
        if (err)
                return err;

        nla = tb[CTA_FILTER_ORIG_FLAGS];
        if (nla) {
                filter->orig_flags = nla_get_u32(nla);
                if (filter->orig_flags & ~CTA_FILTER_F_ALL)
                        return -EOPNOTSUPP;
        }

        nla = tb[CTA_FILTER_REPLY_FLAGS];
        if (nla) {
                filter->reply_flags = nla_get_u32(nla);
                if (filter->reply_flags & ~CTA_FILTER_F_ALL)
                        return -EOPNOTSUPP;
        }

        return 0;
}

/*
 * Forward declarations: both parsers are defined further down in this
 * file but are already needed by ctnetlink_alloc_filter().
 */
static int ctnetlink_parse_zone(const struct nlattr *attr,
                                struct nf_conntrack_zone *zone);
static int ctnetlink_parse_tuple_filter(const struct nlattr * const cda[],
                                         struct nf_conntrack_tuple *tuple,
                                         u32 type, u_int8_t l3num,
                                         struct nf_conntrack_zone *zone,
                                         u_int32_t flags);

/*
 * Fill @mark from CTA_MARK / CTA_MARK_MASK.
 *
 * With CONFIG_NF_CONNTRACK_MARK: a missing mask defaults to all ones, and
 * a mask without a mark is rejected with -EINVAL. Without that option the
 * function does nothing and returns 0.
 */
static int ctnetlink_filter_parse_mark(struct ctnetlink_filter_u32 *mark,
                                       const struct nlattr * const cda[])
{
#ifdef CONFIG_NF_CONNTRACK_MARK
        if (!cda[CTA_MARK]) {
                /* a mask on its own cannot be interpreted */
                return cda[CTA_MARK_MASK] ? -EINVAL : 0;
        }

        mark->val = ntohl(nla_get_be32(cda[CTA_MARK]));
        mark->mask = 0xffffffff;
        if (cda[CTA_MARK_MASK])
                mark->mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
#endif
        return 0;
}

/*
 * Fill @status from CTA_STATUS / CTA_STATUS_MASK.
 *
 * A missing mask defaults to the status value itself. Returns -EINVAL
 * for a mask without a status or for a resulting mask of zero, 0
 * otherwise.
 */
static int ctnetlink_filter_parse_status(struct ctnetlink_filter_u32 *status,
                                         const struct nlattr * const cda[])
{
        /* CTA_STATUS is NLA_U32, if this fires UAPI needs to be extended */
        BUILD_BUG_ON(__IPS_MAX_BIT >= 32);

        if (!cda[CTA_STATUS])
                return cda[CTA_STATUS_MASK] ? -EINVAL : 0;

        status->val = ntohl(nla_get_be32(cda[CTA_STATUS]));
        status->mask = cda[CTA_STATUS_MASK] ?
                       ntohl(nla_get_be32(cda[CTA_STATUS_MASK])) :
                       status->val;

        /* A zero mask would match everything (val == 0) or nothing
         * (val != 0), so refuse it.
         */
        if (!status->mask)
                return -EINVAL;

        return 0;
}

/*
 * Allocate a filter and populate it from the request attributes @cda.
 *
 * Mark, status and zone are always parsed. The per-tuple comparisons are
 * only set up when CTA_FILTER is present, and each direction whose flag
 * word is non-zero then requires its CTA_TUPLE_* attribute.
 *
 * Returns the new filter (caller frees it with kfree()) or an ERR_PTR().
 */
static struct ctnetlink_filter *
ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
{
        struct ctnetlink_filter *f;
        int ret;

#ifndef CONFIG_NF_CONNTRACK_MARK
        if (cda[CTA_MARK] || cda[CTA_MARK_MASK])
                return ERR_PTR(-EOPNOTSUPP);
#endif

        f = kzalloc(sizeof(*f), GFP_KERNEL);
        if (!f)
                return ERR_PTR(-ENOMEM);

        f->family = family;

        ret = ctnetlink_filter_parse_mark(&f->mark, cda);
        if (ret)
                goto free_filter;

        ret = ctnetlink_filter_parse_status(&f->status, cda);
        if (ret)
                goto free_filter;

        if (cda[CTA_ZONE]) {
                ret = ctnetlink_parse_zone(cda[CTA_ZONE], &f->zone);
                if (ret < 0)
                        goto free_filter;
                f->zone_filter = true;
        }

        /* no CTA_FILTER: tuple matching stays disabled */
        if (!cda[CTA_FILTER])
                return f;

        ret = ctnetlink_parse_filter(cda[CTA_FILTER], f);
        if (ret < 0)
                goto free_filter;

        if (f->orig_flags) {
                ret = -EINVAL;
                if (!cda[CTA_TUPLE_ORIG])
                        goto free_filter;

                ret = ctnetlink_parse_tuple_filter(cda, &f->orig,
                                                   CTA_TUPLE_ORIG,
                                                   f->family, &f->zone,
                                                   f->orig_flags);
                if (ret < 0)
                        goto free_filter;
        }

        if (f->reply_flags) {
                ret = -EINVAL;
                if (!cda[CTA_TUPLE_REPLY])
                        goto free_filter;

                ret = ctnetlink_parse_tuple_filter(cda, &f->reply,
                                                   CTA_TUPLE_REPLY,
                                                   f->family, &f->zone,
                                                   f->reply_flags);
                if (ret < 0)
                        goto free_filter;
        }

        return f;

free_filter:
        kfree(f);

        return ERR_PTR(ret);
}

/*
 * True when the request restricts the result set: a non-zero family or
 * any of the CTA_MARK, CTA_FILTER, CTA_STATUS, CTA_ZONE attributes.
 */
static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda)
{
        if (family)
                return true;

        return cda[CTA_MARK] || cda[CTA_FILTER] ||
               cda[CTA_STATUS] || cda[CTA_ZONE];
}

static int ctnetlink_start(struct netlink_callback *cb)
{
        const struct nlattr * const *cda = cb->data;
        struct ctnetlink_filter *filter = NULL;
        struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
        u8 family = nfmsg->nfgen_family;

        if (ctnetlink_needs_filter(family, cda)) {
                filter = ctnetlink_alloc_filter(cda, family);
                if (IS_ERR(filter))
                        return PTR_ERR(filter);
        }

        cb->data = filter;
        return 0;
}

/*
 * Compare the fields of @ct_tuple selected by @flags against @filter_tuple.
 *
 * @filter_tuple: reference tuple built from the request's filter attributes
 * @ct_tuple:     tuple of the conntrack entry under test
 * @flags:        CTA_FILTER_FLAG() bits naming the fields to compare
 * @family:       NFPROTO_IPV4 or NFPROTO_IPV6; for any other value the
 *                address fields are not compared
 *
 * Returns 1 when every selected field is equal, 0 as soon as one differs.
 */
static int ctnetlink_filter_match_tuple(struct nf_conntrack_tuple *filter_tuple,
                                        struct nf_conntrack_tuple *ct_tuple,
                                        u_int32_t flags, int family)
{
        switch (family) {
        case NFPROTO_IPV4:
                if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) &&
                    filter_tuple->src.u3.ip != ct_tuple->src.u3.ip)
                        return  0;

                if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) &&
                    filter_tuple->dst.u3.ip != ct_tuple->dst.u3.ip)
                        return  0;
                break;
        case NFPROTO_IPV6:
                /* ipv6_addr_cmp() is non-zero when the addresses DIFFER,
                 * so a non-zero result is the mismatch case, mirroring
                 * the != tests of the IPv4 branch.
                 */
                if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) &&
                    ipv6_addr_cmp(&filter_tuple->src.u3.in6,
                                  &ct_tuple->src.u3.in6))
                        return 0;

                if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) &&
                    ipv6_addr_cmp(&filter_tuple->dst.u3.in6,
                                  &ct_tuple->dst.u3.in6))
                        return 0;
                break;
        }

        if ((flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) &&
            filter_tuple->dst.protonum != ct_tuple->dst.protonum)
                return 0;

        /* Per-protocol fields are chosen by the entry's own protocol. */
        switch (ct_tuple->dst.protonum) {
        case IPPROTO_TCP:
        case IPPROTO_UDP:
                /* UDP ports are compared through the tcp member as well */
                if ((flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) &&
                    filter_tuple->src.u.tcp.port != ct_tuple->src.u.tcp.port)
                        return 0;

                if ((flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) &&
                    filter_tuple->dst.u.tcp.port != ct_tuple->dst.u.tcp.port)
                        return 0;
                break;
        case IPPROTO_ICMP:
                if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) &&
                    filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type)
                        return 0;
                if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) &&
                    filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code)
                        return 0;
                if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) &&
                    filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id)
                        return 0;
                break;
        case IPPROTO_ICMPV6:
                if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) &&
                    filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type)
                        return 0;
                if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) &&
                    filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code)
                        return 0;
                if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) &&
                    filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id)
                        return 0;
                break;
        }

        return 1;
}

static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
{
        struct ctnetlink_filter *filter = data;
        struct nf_conntrack_tuple *tuple;
        u32 status;

        if (filter == NULL)
                goto out;

        /* Match entries of a given L3 protocol number.
         * If it is not specified, ie. l3proto == 0,
         * then match everything.
         */
        if (filter->family && nf_ct_l3num(ct) != filter->family)
                goto ignore_entry;

        if (filter->zone_filter &&
            !nf_ct_zone_equal_any(ct, &filter->zone))
                goto ignore_entry;

        if (filter->orig_flags) {
                tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL);
                if (!ctnetlink_filter_match_tuple(&filter->orig, tuple,
                                                  filter->orig_flags,
                                                  filter->family))
                        goto ignore_entry;
        }

        if (filter->reply_flags) {
                tuple = nf_ct_tuple(ct, IP_CT_DIR_REPLY);
                if (!ctnetlink_filter_match_tuple(&filter->reply, tuple,
                                                  filter->reply_flags,
                                                  filter->family))
                        goto ignore_entry;
        }

#ifdef CONFIG_NF_CONNTRACK_MARK
        if ((READ_ONCE(ct->mark) & filter->mark.mask) != filter->mark.val)
                goto ignore_entry;
#endif
        status = (u32)READ_ONCE(ct->status);
        if ((status & filter->status.mask) != filter->status.val)
                goto ignore_entry;

out:
        return 1;

ignore_entry:
        return 0;
}

/*
 * Netlink dump callback that walks the conntrack hash table.
 *
 * Resume state lives in @cb: cb->args[0] is the bucket index being walked
 * and cb->args[1] holds the entry (with a reference taken) whose fill
 * failed on the previous call. cb->data is handed to
 * ctnetlink_filter_match() to select entries; when it is non-NULL the
 * messages are flagged NLM_F_DUMP_FILTERED.
 *
 * Expired entries met during the walk are collected, at most
 * ARRAY_SIZE(nf_ct_evict) per pass, and killed only after the bucket lock
 * has been dropped.
 *
 * Returns skb->len.
 */
static int
ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
        unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
        struct net *net = sock_net(skb->sk);
        struct nf_conn *ct, *last;
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_node *n;
        struct nf_conn *nf_ct_evict[8];
        int res, i;
        spinlock_t *lockp;

        /* entry saved by the previous call; its reference is dropped at out: */
        last = (struct nf_conn *)cb->args[1];
        i = 0;

        local_bh_disable();
        for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
restart:
                /* flush the expired entries gathered on the previous pass */
                while (i) {
                        i--;
                        if (nf_ct_should_gc(nf_ct_evict[i]))
                                nf_ct_kill(nf_ct_evict[i]);
                        nf_ct_put(nf_ct_evict[i]);
                }

                lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
                nf_conntrack_lock(lockp);
                /* re-check the bound now that the lock is held
                 * (presumably against a concurrent table resize - confirm)
                 */
                if (cb->args[0] >= nf_conntrack_htable_size) {
                        spin_unlock(lockp);
                        goto out;
                }
                hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]],
                                           hnnode) {
                        ct = nf_ct_tuplehash_to_ctrack(h);
                        if (nf_ct_is_expired(ct)) {
                                /* need to defer nf_ct_kill() until lock is released */
                                if (i < ARRAY_SIZE(nf_ct_evict) &&
                                    refcount_inc_not_zero(&ct->ct_general.use))
                                        nf_ct_evict[i++] = ct;
                                continue;
                        }

                        if (!net_eq(net, nf_ct_net(ct)))
                                continue;

                        /* each entry is dumped once, via its original-direction hash */
                        if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
                                continue;

                        /* resuming: skip ahead until the saved entry is seen again */
                        if (cb->args[1]) {
                                if (ct != last)
                                        continue;
                                cb->args[1] = 0;
                        }
                        if (!ctnetlink_filter_match(ct, cb->data))
                                continue;

                        res =
                        ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
                                            cb->nlh->nlmsg_seq,
                                            NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
                                            ct, true, flags);
                        if (res < 0) {
                                /* fill failed: pin this entry and resume from it next call */
                                nf_conntrack_get(&ct->ct_general);
                                cb->args[1] = (unsigned long)ct;
                                spin_unlock(lockp);
                                goto out;
                        }
                }
                spin_unlock(lockp);
                /* saved entry was not found in this bucket: forget it and
                 * walk the same bucket again from its head
                 */
                if (cb->args[1]) {
                        cb->args[1] = 0;
                        goto restart;
                }
        }
out:
        local_bh_enable();
        if (last) {
                /* nf ct hash resize happened, now clear the leftover. */
                if ((struct nf_conn *)cb->args[1] == last)
                        cb->args[1] = 0;

                nf_ct_put(last);
        }

        /* release whatever expired entries are still pending */
        while (i) {
                i--;
                if (nf_ct_should_gc(nf_ct_evict[i]))
                        nf_ct_kill(nf_ct_evict[i]);
                nf_ct_put(nf_ct_evict[i]);
        }

        return skb->len;
}

/*
 * Copy the IPv4 source and/or destination address from @tb into @t, as
 * requested by @flags. Returns -EINVAL when a requested address attribute
 * is absent, 0 otherwise.
 */
static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
                                struct nf_conntrack_tuple *t,
                                u_int32_t flags)
{
        if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) {
                const struct nlattr *src = tb[CTA_IP_V4_SRC];

                if (!src)
                        return -EINVAL;
                t->src.u3.ip = nla_get_in_addr(src);
        }

        if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) {
                const struct nlattr *dst = tb[CTA_IP_V4_DST];

                if (!dst)
                        return -EINVAL;
                t->dst.u3.ip = nla_get_in_addr(dst);
        }

        return 0;
}

/*
 * Copy the IPv6 source and/or destination address from @tb into @t, as
 * requested by @flags. Returns -EINVAL when a requested address attribute
 * is absent, 0 otherwise.
 */
static int ipv6_nlattr_to_tuple(struct nlattr *tb[],
                                struct nf_conntrack_tuple *t,
                                u_int32_t flags)
{
        if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) {
                const struct nlattr *src = tb[CTA_IP_V6_SRC];

                if (!src)
                        return -EINVAL;
                t->src.u3.in6 = nla_get_in6_addr(src);
        }

        if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) {
                const struct nlattr *dst = tb[CTA_IP_V6_DST];

                if (!dst)
                        return -EINVAL;
                t->dst.u3.in6 = nla_get_in6_addr(dst);
        }

        return 0;
}

/*
 * Parse the nested CTA_TUPLE_IP attribute and store the addresses
 * selected by @flags in @tuple, dispatching on tuple->src.l3num.
 * For an l3num other than IPv4/IPv6 only the nested parse is performed.
 */
static int ctnetlink_parse_tuple_ip(struct nlattr *attr,
                                    struct nf_conntrack_tuple *tuple,
                                    u_int32_t flags)
{
        struct nlattr *tb[CTA_IP_MAX+1];
        int err;

        err = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr,
                                          cta_ip_nla_policy, NULL);
        if (err < 0)
                return err;

        if (tuple->src.l3num == NFPROTO_IPV4)
                return ipv4_nlattr_to_tuple(tb, tuple, flags);

        if (tuple->src.l3num == NFPROTO_IPV6)
                return ipv6_nlattr_to_tuple(tb, tuple, flags);

        return err;
}

/*
 * Generic policy for CTA_TUPLE_PROTO: only the protocol number is checked
 * here; ctnetlink_parse_tuple_proto() validates the remaining attributes
 * against the l4proto's own nla_policy.
 */
static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = {
        [CTA_PROTO_NUM]        = { .type = NLA_U8 },
};

/*
 * Parse the nested CTA_TUPLE_PROTO attribute into @tuple.
 *
 * Nothing beyond the nested parse happens unless @flags asks for
 * CTA_PROTO_NUM. Otherwise the protocol number is mandatory, and the
 * matching l4proto (looked up under rcu_read_lock()) validates and
 * converts the protocol-specific attributes if it provides a
 * nlattr_to_tuple hook.
 */
static int ctnetlink_parse_tuple_proto(struct nlattr *attr,
                                       struct nf_conntrack_tuple *tuple,
                                       u_int32_t flags)
{
        const struct nf_conntrack_l4proto *proto;
        struct nlattr *tb[CTA_PROTO_MAX+1];
        int err;

        err = nla_parse_nested_deprecated(tb, CTA_PROTO_MAX, attr,
                                          proto_nla_policy, NULL);
        if (err < 0)
                return err;

        if ((flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) == 0)
                return 0;

        if (tb[CTA_PROTO_NUM] == NULL)
                return -EINVAL;

        tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]);

        rcu_read_lock();

        proto = nf_ct_l4proto_find(tuple->dst.protonum);
        if (likely(proto->nlattr_to_tuple)) {
                err = nla_validate_nested_deprecated(attr, CTA_PROTO_MAX,
                                                     proto->nla_policy,
                                                     NULL);
                if (!err)
                        err = proto->nlattr_to_tuple(tb, tuple, flags);
        }

        rcu_read_unlock();

        return err;
}

/*
 * Reset @zone to the default id and direction, then apply the zone id
 * carried by @attr, if any. Without CONFIG_NF_CONNTRACK_ZONES a supplied
 * @attr is refused with -EOPNOTSUPP.
 */
static int
ctnetlink_parse_zone(const struct nlattr *attr,
                     struct nf_conntrack_zone *zone)
{
        nf_ct_zone_init(zone, NF_CT_DEFAULT_ZONE_ID,
                        NF_CT_DEFAULT_ZONE_DIR, 0);

        if (!attr)
                return 0;

#ifdef CONFIG_NF_CONNTRACK_ZONES
        zone->id = ntohs(nla_get_be16(attr));
        return 0;
#else
        return -EOPNOTSUPP;
#endif
}

/*
 * Apply a per-tuple zone (CTA_TUPLE_ZONE) to @zone and set the zone
 * direction from the tuple @type. Refused with -EINVAL when @zone already
 * holds a non-default id.
 */
static int
ctnetlink_parse_tuple_zone(struct nlattr *attr, enum ctattr_type type,
                           struct nf_conntrack_zone *zone)
{
        int err;

        if (zone->id != NF_CT_DEFAULT_ZONE_ID)
                return -EINVAL;

        err = ctnetlink_parse_zone(attr, zone);
        if (err < 0)
                return err;

        zone->dir = (type == CTA_TUPLE_REPLY) ? NF_CT_ZONE_DIR_REPL :
                                                NF_CT_ZONE_DIR_ORIG;

        return 0;
}

/* Validation policy for the attributes nested inside a CTA_TUPLE_* attribute. */
static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
        [CTA_TUPLE_IP]                = { .type = NLA_NESTED },
        [CTA_TUPLE_PROTO]        = { .type = NLA_NESTED },
        [CTA_TUPLE_ZONE]        = { .type = NLA_U16 },
};

/*
 * All protocol-specific filter flags (ports and ICMP/ICMPv6 type, code,
 * id). ctnetlink_parse_tuple_filter() rejects any of these when
 * CTA_PROTO_NUM is not requested as well.
 */
#define CTA_FILTER_F_ALL_CTA_PROTO \
  (CTA_FILTER_F_CTA_PROTO_SRC_PORT | \
   CTA_FILTER_F_CTA_PROTO_DST_PORT | \
   CTA_FILTER_F_CTA_PROTO_ICMP_TYPE | \
   CTA_FILTER_F_CTA_PROTO_ICMP_CODE | \
   CTA_FILTER_F_CTA_PROTO_ICMP_ID | \
   CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE | \
   CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE | \
   CTA_FILTER_F_CTA_PROTO_ICMPV6_ID)

/*
 * Build @tuple from the nested tuple attribute cda[@type], reading only
 * the parts selected by @flags.
 *
 * @tuple is zeroed first. @l3num must be NFPROTO_IPV4 or NFPROTO_IPV6
 * (-EOPNOTSUPP otherwise). A requested part whose nested attribute is
 * missing yields -EINVAL, as do protocol-specific flags without
 * CTA_PROTO_NUM. A CTA_TUPLE_ZONE attribute is applied to @zone only when
 * its flag is set, and then @zone must not be NULL.
 */
static int
ctnetlink_parse_tuple_filter(const struct nlattr * const cda[],
                              struct nf_conntrack_tuple *tuple, u32 type,
                              u_int8_t l3num, struct nf_conntrack_zone *zone,
                              u_int32_t flags)
{
        const u_int32_t addr_flags = CTA_FILTER_FLAG(CTA_IP_DST) |
                                     CTA_FILTER_FLAG(CTA_IP_SRC);
        struct nlattr *tb[CTA_TUPLE_MAX+1];
        int ret;

        memset(tuple, 0, sizeof(*tuple));

        ret = nla_parse_nested_deprecated(tb, CTA_TUPLE_MAX, cda[type],
                                          tuple_nla_policy, NULL);
        if (ret < 0)
                return ret;

        if (l3num != NFPROTO_IPV4 && l3num != NFPROTO_IPV6)
                return -EOPNOTSUPP;
        tuple->src.l3num = l3num;

        if (flags & addr_flags) {
                if (!tb[CTA_TUPLE_IP])
                        return -EINVAL;

                ret = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple, flags);
                if (ret < 0)
                        return ret;
        }

        if (flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) {
                if (!tb[CTA_TUPLE_PROTO])
                        return -EINVAL;

                ret = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple, flags);
                if (ret < 0)
                        return ret;
        } else if (flags & CTA_FILTER_FLAG(ALL_CTA_PROTO)) {
                /* protocol fields are meaningless without a protonum */
                return -EINVAL;
        }

        if ((flags & CTA_FILTER_FLAG(CTA_TUPLE_ZONE)) && tb[CTA_TUPLE_ZONE]) {
                if (!zone)
                        return -EINVAL;

                ret = ctnetlink_parse_tuple_zone(tb[CTA_TUPLE_ZONE],
                                                 type, zone);
                if (ret < 0)
                        return ret;
        }

        /* only reply tuples get DIR_REPLY; everything else is DIR_ORIGINAL */
        tuple->dst.dir = (type == CTA_TUPLE_REPLY) ? IP_CT_DIR_REPLY :
                                                     IP_CT_DIR_ORIGINAL;

        return 0;
}

/*
 * Parse a complete tuple: ctnetlink_parse_tuple_filter() with every
 * filter flag requested.
 */
static int
ctnetlink_parse_tuple(const struct nlattr * const cda[],
                      struct nf_conntrack_tuple *tuple, u32 type,
                      u_int8_t l3num, struct nf_conntrack_zone *zone)
{
        const u_int32_t every_field = CTA_FILTER_FLAG(ALL);

        return ctnetlink_parse_tuple_filter(cda, tuple, type, l3num, zone,
                                            every_field);
}

/*
 * Validation policy for the attributes nested inside CTA_HELP: the helper
 * name is a NUL-terminated string of at most NF_CT_HELPER_NAME_LEN - 1
 * characters.
 */
static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = {
        [CTA_HELP_NAME]                = { .type = NLA_NUL_STRING,
                                    .len = NF_CT_HELPER_NAME_LEN - 1 },
};

/*
 * Parse the nested CTA_HELP attribute.
 *
 * On success *@helper_name points at the name inside the attribute (no
 * copy is made) and *@helpinfo is set to the CTA_HELP_INFO attribute if
 * one is present; otherwise *@helpinfo is left as the caller set it.
 * Returns -EINVAL when the name is missing.
 */
static int ctnetlink_parse_help(const struct nlattr *attr, char **helper_name,
                                struct nlattr **helpinfo)
{
        struct nlattr *tb[CTA_HELP_MAX+1];
        int ret;

        ret = nla_parse_nested_deprecated(tb, CTA_HELP_MAX, attr,
                                          help_nla_policy, NULL);
        if (ret < 0)
                return ret;

        if (tb[CTA_HELP_NAME] == NULL)
                return -EINVAL;

        *helper_name = nla_data(tb[CTA_HELP_NAME]);

        if (tb[CTA_HELP_INFO] != NULL)
                *helpinfo = tb[CTA_HELP_INFO];

        return 0;
}

/*
 * Validation policy for the top-level CTA_* attributes of a conntrack
 * request. The label attributes are binary blobs of at most
 * NF_CT_LABELS_MAX_SIZE bytes.
 */
static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
        [CTA_TUPLE_ORIG]        = { .type = NLA_NESTED },
        [CTA_TUPLE_REPLY]        = { .type = NLA_NESTED },
        [CTA_STATUS]                 = { .type = NLA_U32 },
        [CTA_PROTOINFO]                = { .type = NLA_NESTED },
        [CTA_HELP]                = { .type = NLA_NESTED },
        [CTA_NAT_SRC]                = { .type = NLA_NESTED },
        [CTA_TIMEOUT]                 = { .type = NLA_U32 },
        [CTA_MARK]                = { .type = NLA_U32 },
        [CTA_ID]                = { .type = NLA_U32 },
        [CTA_NAT_DST]                = { .type = NLA_NESTED },
        [CTA_TUPLE_MASTER]        = { .type = NLA_NESTED },
        [CTA_NAT_SEQ_ADJ_ORIG]  = { .type = NLA_NESTED },
        [CTA_NAT_SEQ_ADJ_REPLY] = { .type = NLA_NESTED },
        [CTA_ZONE]                = { .type = NLA_U16 },
        [CTA_MARK_MASK]                = { .type = NLA_U32 },
        [CTA_LABELS]                = { .type = NLA_BINARY,
                                    .len = NF_CT_LABELS_MAX_SIZE },
        [CTA_LABELS_MASK]        = { .type = NLA_BINARY,
                                    .len = NF_CT_LABELS_MAX_SIZE },
        [CTA_FILTER]                = { .type = NLA_NESTED },
        [CTA_STATUS_MASK]        = { .type = NLA_U32 },
};

/*
 * Iterator callback used by ctnetlink_flush_conntrack(): reports the
 * result of ctnetlink_filter_match() for @ct with the filter in @data.
 */
static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
{
        const int selected = ctnetlink_filter_match(ct, data);

        return selected;
}

/*
 * Flush conntrack entries of @net, optionally restricted by a filter
 * built from @cda and @family. CTA_FILTER is not supported for flushing
 * (-EOPNOTSUPP). The temporary filter is freed before returning.
 */
static int ctnetlink_flush_conntrack(struct net *net,
                                     const struct nlattr * const cda[],
                                     u32 portid, int report, u8 family)
{
        struct nf_ct_iter_data iter = {
                .net                = net,
                .portid                = portid,
                .report                = report,
        };
        struct ctnetlink_filter *flt = NULL;

        if (ctnetlink_needs_filter(family, cda)) {
                if (cda[CTA_FILTER])
                        return -EOPNOTSUPP;

                flt = ctnetlink_alloc_filter(cda, family);
                if (IS_ERR(flt))
                        return PTR_ERR(flt);

                iter.data = flt;
        }

        nf_ct_iterate_cleanup_net(ctnetlink_flush_iterate, &iter);

        /* kfree(NULL) is fine when no filter was needed */
        kfree(flt);

        return 0;
}

static int ctnetlink_del_conntrack(struct sk_buff *skb,
                                   const struct nfnl_info *info,
                                   const struct nlattr * const cda[])
{
        u8 family = info->nfmsg->nfgen_family;
        struct nf_conntrack_tuple_hash *h;
        struct nf_conntrack_tuple tuple;
        struct nf_conntrack_zone zone;
        struct nf_conn *ct;
        int err;

        err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
        if (err < 0)
                return err;

        if (cda[CTA_TUPLE_ORIG])
                err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG,
                                            family, &zone);
        else if (cda[CTA_TUPLE_REPLY])
                err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY,
                                            family, &zone);
        else {
                u_int8_t u3 = info->nfmsg->version ? family : AF_UNSPEC;

                return ctnetlink_flush_conntrack(info->net, cda,
                                                 NETLINK_CB(skb).portid,
                                                 nlmsg_report(info->nlh), u3);
        }

        if (err < 0)
                return err;

        h = nf_conntrack_find_get(info->net, &zone, &tuple);
        if (!h)
                return -ENOENT;

        ct = nf_ct_tuplehash_to_ctrack(h);

        if (cda[CTA_ID]) {
                __be32 id = nla_get_be32(cda[CTA_ID]);

                if (id != (__force __be32)nf_ct_get_id(ct)) {
                        nf_ct_put(ct);
                        return -ENOENT;
                }
        }

        nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(info->nlh));
        nf_ct_put(ct);

        return 0;
}

static int ctnetlink_get_conntrack(struct sk_buff *skb,
                                   const struct nfnl_info *info,
                                   const struct nlattr * const cda[])
{
        u_int8_t u3 = info->nfmsg->nfgen_family;
        struct nf_conntrack_tuple_hash *h;
        struct nf_conntrack_tuple tuple;
        struct nf_conntrack_zone zone;
        struct sk_buff *skb2;
        struct nf_conn *ct;
        int err;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .start = ctnetlink_start,
                        .dump = ctnetlink_dump_table,
                        .done = ctnetlink_done,
                        .data = (void *)cda,
                };

                return netlink_dump_start(info->sk, skb, info->nlh, &c);
        }

        err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
        if (err < 0)
                return err;

        if (cda[CTA_TUPLE_ORIG])
                err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG,
                                            u3, &zone);
        else if (cda[CTA_TUPLE_REPLY])
                err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY,
                                            u3, &zone);
        else
                return -EINVAL;

        if (err < 0)
                return err;

        h = nf_conntrack_find_get(info->net, &zone, &tuple);
        if (!h)
                return -ENOENT;

        ct = nf_ct_tuplehash_to_ctrack(h);

        skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb2) {
                nf_ct_put(ct);
                return -ENOMEM;
        }

        err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid,
                                  info->nlh->nlmsg_seq,
                                  NFNL_MSG_TYPE(info->nlh->nlmsg_type), ct,
                                  true, 0);
        nf_ct_put(ct);
        if (err <= 0) {
                kfree_skb(skb2);
                return -ENOMEM;
        }

        return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}

/*
 * Teardown for the list dumps: drop the reference on the entry still
 * parked in the dump context, if there is one.
 */
static int ctnetlink_done_list(struct netlink_callback *cb)
{
        struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
        struct nf_conn *pending = ctx->last;

        if (pending)
                nf_ct_put(pending);

        return 0;
}

#ifdef CONFIG_NF_CONNTRACK_EVENTS
/*
 * Dump a single list entry into @skb.
 *
 * Entries of another L3 family than the one requested are skipped
 * (returns 0), as are entries preceding ctx->last while a resume point
 * is set. When filling fails, a reference on @ct is taken and it becomes
 * the new ctx->last; if that reference cannot be obtained the entry is
 * skipped instead. Otherwise returns the ctnetlink_fill_info() result.
 */
static int ctnetlink_dump_one_entry(struct sk_buff *skb,
                                    struct netlink_callback *cb,
                                    struct nf_conn *ct,
                                    bool dying)
{
        struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
        const struct nfgenmsg *hdr = nlmsg_data(cb->nlh);
        u8 wanted_l3 = hdr->nfgen_family;
        int ret;

        if (wanted_l3 && nf_ct_l3num(ct) != wanted_l3)
                return 0;

        if (ctx->last) {
                if (ctx->last != ct)
                        return 0;

                ctx->last = NULL;
        }

        /* Extension info cannot be dumped for the unconfirmed list:
         * unconfirmed conntracks may have ct->ext reallocated (and
         * thus freed).
         *
         * For the dying list ct->ext cannot be freed until after
         * pcpu->lock is dropped.
         */
        ret = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
                                  cb->nlh->nlmsg_seq,
                                  NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
                                  ct, dying, 0);
        if (ret >= 0)
                return ret;

        if (!refcount_inc_not_zero(&ct->ct_general.use))
                return 0;

        ctx->last = ct;

        return ret;
}
#endif

/*
 * Dump callback for the unconfirmed list: adds nothing to @skb and
 * always returns 0.
 */
static int
ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
{
        return 0;
}

/*
 * Netlink dump callback for the per-netns ecache dying list.
 *
 * ctx->last is the entry at which a previous call stopped (stored, with a
 * reference taken, by ctnetlink_dump_one_entry()). It is moved into the
 * local @last and its reference is dropped here once the walk has moved
 * past it or has ended. ctx->done marks a finished dump so that later
 * calls return 0 immediately.
 *
 * Without CONFIG_NF_CONNTRACK_EVENTS nothing is walked and the dump is
 * marked done right away.
 *
 * Returns skb->len, or 0 once the dump has already completed.
 */
static int
ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
        struct nf_conn *last = ctx->last;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
        const struct net *net = sock_net(skb->sk);
        struct nf_conntrack_net_ecache *ecache_net;
        struct hlist_nulls_node *n;
        struct nf_conntrack_tuple_hash *h;
#endif

        if (ctx->done)
                return 0;

        /* the resume point now lives in @last only */
        ctx->last = NULL;

#ifdef CONFIG_NF_CONNTRACK_EVENTS
        ecache_net = nf_conn_pernet_ecache(net);
        spin_lock_bh(&ecache_net->dying_lock);

        hlist_nulls_for_each_entry(h, n, &ecache_net->dying_list, hnnode) {
                struct nf_conn *ct;
                int res;

                ct = nf_ct_tuplehash_to_ctrack(h);
                /* resuming: skip ahead until the saved entry is found */
                if (last && last != ct)
                        continue;

                res = ctnetlink_dump_one_entry(skb, cb, ct, true);
                if (res < 0) {
                        /* fill failed; stop here and hand back what has
                         * been dumped so far
                         */
                        spin_unlock_bh(&ecache_net->dying_lock);
                        nf_ct_put(last);
                        return skb->len;
                }

                /* resume point consumed: release it and dump the rest */
                nf_ct_put(last);
                last = NULL;
        }

        spin_unlock_bh(&ecache_net->dying_lock);
#endif
        ctx->done = true;
        nf_ct_put(last);

        return skb->len;
}

static int ctnetlink_get_ct_dying(struct sk_buff *skb,
                                  const struct nfnl_info *info,
                                  const struct nlattr * const cda[])
{
        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .dump = ctnetlink_dump_dying,
                        .done = ctnetlink_done_list,
                };
                return netlink_dump_start(info->sk, skb, info->nlh, &c);
        }

        return -EOPNOTSUPP;
}

static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb,
                                        const struct nfnl_info *info,
                                        const struct nlattr * const cda[])
{
        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .dump = ctnetlink_dump_unconfirmed,
                        .done = ctnetlink_done_list,
                };
                return netlink_dump_start(info->sk, skb, info->nlh, &c);
        }

        return -EOPNOTSUPP;
}

#if IS_ENABLED(CONFIG_NF_NAT)
/* Hand one NAT attribute (@attr, for manipulation type @manip) to the
 * registered NAT hook's parse_nat_setup() callback.
 *
 * Must be called inside an RCU read-side section.  With CONFIG_MODULES the
 * function may temporarily leave that section and drop the ctnetlink nfnl
 * lock to load a module; both are held again on every return path.
 *
 * Returns the hook's result, -EOPNOTSUPP when no NAT hook is available or
 * a module load fails, and -EAGAIN after a module load that made the hook
 * (or a per-family backend) available.
 */
static int
ctnetlink_parse_nat_setup(struct nf_conn *ct,
                          enum nf_nat_manip_type manip,
                          const struct nlattr *attr)
        __must_hold(RCU)
{
        const struct nf_nat_hook *nat_hook;
        int err;

        nat_hook = rcu_dereference(nf_nat_hook);
        if (!nat_hook) {
#ifdef CONFIG_MODULES
                /* Leave the RCU section and release the nfnl lock around
                 * request_module() (presumably because it can block), then
                 * retake them in the opposite order.
                 */
                rcu_read_unlock();
                nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
                if (request_module("nf-nat") < 0) {
                        nfnl_lock(NFNL_SUBSYS_CTNETLINK);
                        rcu_read_lock();
                        return -EOPNOTSUPP;
                }
                nfnl_lock(NFNL_SUBSYS_CTNETLINK);
                rcu_read_lock();
                nat_hook = rcu_dereference(nf_nat_hook);
                /* Hook showed up after the load: report -EAGAIN instead
                 * of continuing (presumably so the request is retried).
                 */
                if (nat_hook)
                        return -EAGAIN;
#endif
                return -EOPNOTSUPP;
        }

        err = nat_hook->parse_nat_setup(ct, manip, attr);
        if (err == -EAGAIN) {
#ifdef CONFIG_MODULES
                /* Same unlock/load/relock dance for the per-l3-family NAT
                 * module; on success err stays -EAGAIN for the caller.
                 */
                rcu_read_unlock();
                nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
                if (request_module("nf-nat-%u", nf_ct_l3num(ct)) < 0) {
                        nfnl_lock(NFNL_SUBSYS_CTNETLINK);
                        rcu_read_lock();
                        return -EOPNOTSUPP;
                }
                nfnl_lock(NFNL_SUBSYS_CTNETLINK);
                rcu_read_lock();
#else
                /* no module loading possible: nothing to retry for */
                err = -EOPNOTSUPP;
#endif
        }
        return err;
}
#endif

/* Apply the CTA_STATUS attribute (32-bit, network byte order) to @ct via
 * nf_ct_change_status_common() and return its result.  The caller must
 * have checked that cda[CTA_STATUS] is present.
 */
static int
ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
{
        unsigned int new_status = ntohl(nla_get_be32(cda[CTA_STATUS]));

        return nf_ct_change_status_common(ct, new_status);
}

/* Configure NAT on @ct from CTA_NAT_DST and CTA_NAT_SRC.
 *
 * Returns 0 when neither attribute is present.  With CONFIG_NF_NAT the
 * destination attribute is parsed first, then the source attribute; the
 * first negative result is returned.  Without CONFIG_NF_NAT any NAT
 * attribute yields -EOPNOTSUPP.
 */
static int
ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[])
{
        const struct nlattr *dnat = cda[CTA_NAT_DST];
        const struct nlattr *snat = cda[CTA_NAT_SRC];
#if IS_ENABLED(CONFIG_NF_NAT)
        int err;
#endif

        if (!dnat && !snat)
                return 0;

#if IS_ENABLED(CONFIG_NF_NAT)
        err = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_DST, dnat);
        if (err < 0)
                return err;

        return ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC, snat);
#else
        return -EOPNOTSUPP;
#endif
}

/* Handle CTA_HELP for an already existing conntrack.
 *
 * - A conntrack with a master never gets its helper changed: the result
 *   is 0 only if the requested name equals the current helper's name,
 *   -EBUSY otherwise.
 * - An empty helper name detaches the current helper (after removing the
 *   conntrack's expectations) and returns 0.
 * - Otherwise the named helper is looked up: -EOPNOTSUPP if it is unknown
 *   or the conntrack has no helper extension, -EBUSY if a different
 *   helper is attached, and 0 (after optionally refreshing the helper's
 *   private data through from_nlattr) if it is the one already attached.
 *
 * A parse error from ctnetlink_parse_help() is returned unchanged.
 */
static int ctnetlink_change_helper(struct nf_conn *ct,
                                   const struct nlattr * const cda[])
{
        struct nf_conn_help *help = nfct_help(ct);
        struct nf_conntrack_helper *helper;
        struct nlattr *helpinfo = NULL;
        char *helpname = NULL;
        int err;

        err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);
        if (err < 0)
                return err;

        /* don't change helper of sibling connections */
        if (ct->master) {
                /* Asking for the helper that is already set is treated
                 * as a no-op rather than an error.
                 */
                if (!help)
                        return -EBUSY;

                rcu_read_lock();
                helper = rcu_dereference(help->helper);
                if (helper && strcmp(helper->name, helpname) == 0)
                        err = 0;
                else
                        err = -EBUSY;
                rcu_read_unlock();

                return err;
        }

        /* empty name: drop whatever helper is attached */
        if (*helpname == '\0') {
                if (help && help->helper) {
                        nf_ct_remove_expectations(ct);
                        RCU_INIT_POINTER(help->helper, NULL);
                }

                return 0;
        }

        rcu_read_lock();
        helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
                                            nf_ct_protonum(ct));
        if (!helper || !help) {
                /* unknown helper, or we cannot set a helper for an
                 * existing conntrack that has no helper extension
                 */
                err = -EOPNOTSUPP;
        } else if (rcu_access_pointer(help->helper) != helper) {
                err = -EBUSY;
        } else {
                /* update private helper data if allowed. */
                if (helper->from_nlattr)
                        helper->from_nlattr(helpinfo, ct);
                err = 0;
        }
        rcu_read_unlock();

        return err;
}

static int ctnetlink_change_timeout(struct nf_conn *ct,
                                    const struct nlattr * const cda[])
{
        return __nf_ct_change_timeout(ct, (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ);
}

#if defined(CONFIG_NF_CONNTRACK_MARK)
/* Update ct->mark from CTA_MARK, optionally restricted by CTA_MARK_MASK.
 *
 * The new mark is (old & ~mask) ^ CTA_MARK; without a mask attribute the
 * old value contributes nothing and the mark becomes CTA_MARK.  The store
 * is skipped when the value would not change.
 */
static void ctnetlink_change_mark(struct nf_conn *ct,
                                    const struct nlattr * const cda[])
{
        const struct nlattr *mask_attr = cda[CTA_MARK_MASK];
        u32 keep = mask_attr ? ~ntohl(nla_get_be32(mask_attr)) : 0;
        u32 val = ntohl(nla_get_be32(cda[CTA_MARK]));
        u32 updated = (READ_ONCE(ct->mark) & keep) ^ val;

        if (READ_ONCE(ct->mark) == updated)
                return;

        WRITE_ONCE(ct->mark, updated);
}
#endif

/* Policy for the attributes nested inside CTA_PROTOINFO: each of the
 * per-protocol entries (TCP, DCCP, SCTP) must itself be a nested attribute.
 * Used by ctnetlink_change_protoinfo().
 */
static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = {
        [CTA_PROTOINFO_TCP]        = { .type = NLA_NESTED },
        [CTA_PROTOINFO_DCCP]        = { .type = NLA_NESTED },
        [CTA_PROTOINFO_SCTP]        = { .type = NLA_NESTED },
};

/* Parse the nested CTA_PROTOINFO attribute and pass the result to the
 * from_nlattr callback of @ct's layer 4 protocol, if it has one.
 *
 * Returns a negative parse error, the callback's result, or the
 * (non-negative) parse result when the protocol has no callback.
 */
static int ctnetlink_change_protoinfo(struct nf_conn *ct,
                                      const struct nlattr * const cda[])
{
        struct nlattr *tb[CTA_PROTOINFO_MAX + 1];
        const struct nf_conntrack_l4proto *proto;
        int ret;

        ret = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_MAX,
                                          cda[CTA_PROTOINFO],
                                          protoinfo_policy, NULL);
        if (ret < 0)
                return ret;

        proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
        if (!proto->from_nlattr)
                return ret;

        return proto->from_nlattr(tb, ct);
}

/* Policy for the attributes nested inside a sequence adjustment attribute:
 * correction position and the before/after offsets are all 32-bit values.
 * Used by change_seq_adj().
 */
static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = {
        [CTA_SEQADJ_CORRECTION_POS]        = { .type = NLA_U32 },
        [CTA_SEQADJ_OFFSET_BEFORE]        = { .type = NLA_U32 },
        [CTA_SEQADJ_OFFSET_AFTER]        = { .type = NLA_U32 },
};

/* Fill one direction's sequence adjustment state from a nested attribute.
 *
 * @seq:  state to update
 * @attr: nested attribute carrying CTA_SEQADJ_CORRECTION_POS,
 *        CTA_SEQADJ_OFFSET_BEFORE and CTA_SEQADJ_OFFSET_AFTER
 *
 * All three nested attributes are mandatory.  They are checked before
 * anything is written, so a request that lacks one of them leaves *seq
 * untouched instead of partially updated (the same validate-then-assign
 * order ctnetlink_change_synproxy() uses).
 *
 * Returns 0 on success, a negative parse error, or -EINVAL when an
 * attribute is missing.
 */
static int change_seq_adj(struct nf_ct_seqadj *seq,
                          const struct nlattr * const attr)
{
        int err;
        struct nlattr *cda[CTA_SEQADJ_MAX+1];

        err = nla_parse_nested_deprecated(cda, CTA_SEQADJ_MAX, attr,
                                          seqadj_policy, NULL);
        if (err < 0)
                return err;

        /* validate everything up front: no half-applied updates */
        if (!cda[CTA_SEQADJ_CORRECTION_POS] ||
            !cda[CTA_SEQADJ_OFFSET_BEFORE] ||
            !cda[CTA_SEQADJ_OFFSET_AFTER])
                return -EINVAL;

        seq->correction_pos =
                ntohl(nla_get_be32(cda[CTA_SEQADJ_CORRECTION_POS]));
        seq->offset_before =
                ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_BEFORE]));
        seq->offset_after =
                ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_AFTER]));

        return 0;
}

/* Apply CTA_SEQ_ADJ_ORIG and/or CTA_SEQ_ADJ_REPLY to @ct's sequence
 * adjustment extension while holding ct->lock.
 *
 * Returns 0 when the conntrack has no seqadj extension or every present
 * attribute was applied; IPS_SEQ_ADJUST_BIT is set after each direction
 * that was applied.  The first negative result from change_seq_adj()
 * aborts processing and is returned.
 */
static int
ctnetlink_change_seq_adj(struct nf_conn *ct,
                         const struct nlattr * const cda[])
{
        const struct nlattr *orig = cda[CTA_SEQ_ADJ_ORIG];
        const struct nlattr *reply = cda[CTA_SEQ_ADJ_REPLY];
        struct nf_conn_seqadj *ext = nfct_seqadj(ct);
        int err;

        if (!ext)
                return 0;

        spin_lock_bh(&ct->lock);

        if (orig) {
                err = change_seq_adj(&ext->seq[IP_CT_DIR_ORIGINAL], orig);
                if (err < 0)
                        goto unlock;

                set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
        }

        if (reply) {
                err = change_seq_adj(&ext->seq[IP_CT_DIR_REPLY], reply);
                if (err < 0)
                        goto unlock;

                set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
        }

        err = 0;
unlock:
        spin_unlock_bh(&ct->lock);
        return err;
}

/* Policy for the attributes nested inside CTA_SYNPROXY: ISN, ITS and TSOFF
 * are all 32-bit values.  Used by ctnetlink_change_synproxy().
 */
static const struct nla_policy synproxy_policy[CTA_SYNPROXY_MAX + 1] = {
        [CTA_SYNPROXY_ISN]        = { .type = NLA_U32 },
        [CTA_SYNPROXY_ITS]        = { .type = NLA_U32 },
        [CTA_SYNPROXY_TSOFF]        = { .type = NLA_U32 },
};

/* Load @ct's synproxy extension from the nested CTA_SYNPROXY attribute.
 *
 * Returns 0 when the conntrack has no synproxy extension (the attribute is
 * then ignored) or when isn, its and tsoff were all stored; a negative
 * parse error; or -EINVAL if any of the three nested attributes is
 * missing, in which case nothing is written.
 */
static int ctnetlink_change_synproxy(struct nf_conn *ct,
                                     const struct nlattr * const cda[])
{
        struct nlattr *tb[CTA_SYNPROXY_MAX + 1];
        struct nf_conn_synproxy *ext = nfct_synproxy(ct);
        int ret;

        if (!ext)
                return 0;

        ret = nla_parse_nested_deprecated(tb, CTA_SYNPROXY_MAX,
                                          cda[CTA_SYNPROXY], synproxy_policy,
                                          NULL);
        if (ret < 0)
                return ret;

        if (!(tb[CTA_SYNPROXY_ISN] &&
              tb[CTA_SYNPROXY_ITS] &&
              tb[CTA_SYNPROXY_TSOFF]))
                return -EINVAL;

        ext->isn = ntohl(nla_get_be32(tb[CTA_SYNPROXY_ISN]));
        ext->its = ntohl(nla_get_be32(tb[CTA_SYNPROXY_ITS]));
        ext->tsoff = ntohl(nla_get_be32(tb[CTA_SYNPROXY_TSOFF]));

        return 0;
}

/* Replace @ct's connlabels with the payload of CTA_LABELS, optionally
 * filtered through CTA_LABELS_MASK.
 *
 * The label payload must be a whole number of 32-bit words; a mask, when
 * given, must be non-empty and exactly as long as the labels.  Violations
 * yield -EINVAL.  Otherwise the result of nf_connlabels_replace() is
 * returned.  Without CONFIG_NF_CONNTRACK_LABELS this is -EOPNOTSUPP.
 * The caller must have checked that cda[CTA_LABELS] is present.
 */
static int
ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[])
{
#ifdef CONFIG_NF_CONNTRACK_LABELS
        const struct nlattr *labels_attr = cda[CTA_LABELS];
        const struct nlattr *mask_attr = cda[CTA_LABELS_MASK];
        size_t nbytes = nla_len(labels_attr);
        const void *mask_data = NULL;

        if (nbytes % sizeof(u32)) /* must be multiple of u32 */
                return -EINVAL;

        if (mask_attr) {
                int mask_len = nla_len(mask_attr);

                if (mask_len == 0 || mask_len != nbytes)
                        return -EINVAL;
                mask_data = nla_data(mask_attr);
        }

        return nf_connlabels_replace(ct, nla_data(labels_attr), mask_data,
                                     nbytes / sizeof(u32));
#else
        return -EOPNOTSUPP;
#endif
}

/* Update an existing conntrack from the attributes present in @cda.
 *
 * NAT setup and master assignment are refused with -EOPNOTSUPP.  The
 * remaining attributes are applied in a fixed order - helper, timeout,
 * status, protoinfo, mark, sequence adjustment, synproxy, labels - and the
 * first negative result stops processing and is returned; changes made by
 * earlier steps are not rolled back.  Returns 0 when every present
 * attribute was applied.
 */
static int
ctnetlink_change_conntrack(struct nf_conn *ct,
                           const struct nlattr * const cda[])
{
        int err = 0;

        /* only allow NAT changes and master assignation for new conntracks */
        if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST] || cda[CTA_TUPLE_MASTER])
                return -EOPNOTSUPP;

        if (cda[CTA_HELP])
                err = ctnetlink_change_helper(ct, cda);
        if (err < 0)
                return err;

        if (cda[CTA_TIMEOUT])
                err = ctnetlink_change_timeout(ct, cda);
        if (err < 0)
                return err;

        if (cda[CTA_STATUS])
                err = ctnetlink_change_status(ct, cda);
        if (err < 0)
                return err;

        if (cda[CTA_PROTOINFO])
                err = ctnetlink_change_protoinfo(ct, cda);
        if (err < 0)
                return err;

#if defined(CONFIG_NF_CONNTRACK_MARK)
        /* the mark update cannot fail */
        if (cda[CTA_MARK])
                ctnetlink_change_mark(ct, cda);
#endif

        if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY])
                err = ctnetlink_change_seq_adj(ct, cda);
        if (err < 0)
                return err;

        if (cda[CTA_SYNPROXY])
                err = ctnetlink_change_synproxy(ct, cda);
        if (err < 0)
                return err;

        if (cda[CTA_LABELS])
                err = ctnetlink_attach_labels(ct, cda);
        if (err < 0)
                return err;

        return 0;
}

/* Allocate a conntrack for @otuple/@rtuple in @zone, configure it from the
 * attributes in @cda and insert it into the conntrack hash.
 *
 * CTA_TIMEOUT is mandatory: without it the new entry is freed and -EINVAL
 * is reported.  @u3 is the l3 family used when parsing CTA_TUPLE_MASTER.
 *
 * Returns the inserted conntrack, or an ERR_PTR() value on failure.  On
 * every failure after allocation the entry is released again with
 * nf_conntrack_free().
 */
static struct nf_conn *
ctnetlink_create_conntrack(struct net *net,
                           const struct nf_conntrack_zone *zone,
                           const struct nlattr * const cda[],
                           struct nf_conntrack_tuple *otuple,
                           struct nf_conntrack_tuple *rtuple,
                           u8 u3)
{
        struct nf_conn *ct;
        int err = -EINVAL;
        struct nf_conntrack_helper *helper;
        struct nf_conn_tstamp *tstamp;
        u64 timeout;

        ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC);
        /* whatever error the allocator reports, callers see -ENOMEM */
        if (IS_ERR(ct))
                return ERR_PTR(-ENOMEM);

        /* err is still -EINVAL at this point */
        if (!cda[CTA_TIMEOUT])
                goto err1;

        rcu_read_lock();
         if (cda[CTA_HELP]) {
                char *helpname = NULL;
                struct nlattr *helpinfo = NULL;

                err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo);
                 if (err < 0)
                        goto err2;

                helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
                                                    nf_ct_protonum(ct));
                if (helper == NULL) {
                        /* Helper not registered.  The RCU section is left
                         * here, so the exits below use err1 unless the
                         * lock has been retaken.
                         */
                        rcu_read_unlock();
#ifdef CONFIG_MODULES
                        if (request_module("nfct-helper-%s", helpname) < 0) {
                                err = -EOPNOTSUPP;
                                goto err1;
                        }

                        rcu_read_lock();
                        helper = __nf_conntrack_helper_find(helpname,
                                                            nf_ct_l3num(ct),
                                                            nf_ct_protonum(ct));
                        /* The module provided the helper: bail out with
                         * -EAGAIN instead of continuing (presumably so
                         * the request gets replayed - confirm).
                         */
                        if (helper) {
                                err = -EAGAIN;
                                goto err2;
                        }
                        rcu_read_unlock();
#endif
                        err = -EOPNOTSUPP;
                        goto err1;
                } else {
                        struct nf_conn_help *help;

                        help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
                        if (help == NULL) {
                                err = -ENOMEM;
                                goto err2;
                        }
                        /* set private helper data if allowed. */
                        if (helper->from_nlattr)
                                helper->from_nlattr(helpinfo, ct);

                        /* disable helper auto-assignment for this entry */
                        ct->status |= IPS_HELPER;
                        RCU_INIT_POINTER(help->helper, helper);
                }
        }

        err = ctnetlink_setup_nat(ct, cda);
        if (err < 0)
                goto err2;

        /* return values of the extension helpers are not checked here */
        nf_ct_acct_ext_add(ct, GFP_ATOMIC);
        nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
        nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC);
        nf_ct_labels_ext_add(ct);
        nfct_seqadj_ext_add(ct);
        nfct_synproxy_ext_add(ct);

        /* we must add conntrack extensions before confirmation. */
        ct->status |= IPS_CONFIRMED;

        /* seconds from the attribute, widened to 64 bit before scaling */
        timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
        __nf_ct_set_timeout(ct, timeout);

        if (cda[CTA_STATUS]) {
                err = ctnetlink_change_status(ct, cda);
                if (err < 0)
                        goto err2;
        }

        if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) {
                err = ctnetlink_change_seq_adj(ct, cda);
                if (err < 0)
                        goto err2;
        }

        /* start from zeroed protocol state before applying CTA_PROTOINFO */
        memset(&ct->proto, 0, sizeof(ct->proto));
        if (cda[CTA_PROTOINFO]) {
                err = ctnetlink_change_protoinfo(ct, cda);
                if (err < 0)
                        goto err2;
        }

        if (cda[CTA_SYNPROXY]) {
                err = ctnetlink_change_synproxy(ct, cda);
                if (err < 0)
                        goto err2;
        }

#if defined(CONFIG_NF_CONNTRACK_MARK)
        if (cda[CTA_MARK])
                ctnetlink_change_mark(ct, cda);
#endif

        /* setup master conntrack: this is a confirmed expectation */
        if (cda[CTA_TUPLE_MASTER]) {
                struct nf_conntrack_tuple master;
                struct nf_conntrack_tuple_hash *master_h;
                struct nf_conn *master_ct;

                err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER,
                                            u3, NULL);
                if (err < 0)
                        goto err2;

                /* the reference obtained by the lookup is kept in
                 * ct->master; err3 drops it if insertion fails
                 */
                master_h = nf_conntrack_find_get(net, zone, &master);
                if (master_h == NULL) {
                        err = -ENOENT;
                        goto err2;
                }
                master_ct = nf_ct_tuplehash_to_ctrack(master_h);
                __set_bit(IPS_EXPECTED_BIT, &ct->status);
                ct->master = master_ct;
        }
        tstamp = nf_conn_tstamp_find(ct);
        if (tstamp)
                tstamp->start = ktime_get_real_ns();

        err = nf_conntrack_hash_check_insert(ct);
        if (err < 0)
                goto err3;

        rcu_read_unlock();

        return ct;

        /* Unwind in reverse order: master reference, RCU section, entry. */
err3:
        if (ct->master)
                nf_ct_put(ct->master);
err2:
        rcu_read_unlock();
err1:
        nf_conntrack_free(ct);
        return ERR_PTR(err);
}

/* Handler for requests that create or update a conntrack.
 *
 * The zone and whichever of CTA_TUPLE_ORIG / CTA_TUPLE_REPLY are present
 * get parsed, then an existing entry is looked up (by the original tuple
 * if given, else by the reply tuple).
 *
 * - No entry found: -ENOENT, unless NLM_F_CREATE is set.  Creation needs
 *   both tuples with matching protocol numbers (-EINVAL otherwise); the
 *   new entry is built by ctnetlink_create_conntrack() and an event is
 *   reported for it.
 * - Entry found: -EEXIST if NLM_F_EXCL is set, otherwise the entry is
 *   updated through ctnetlink_change_conntrack() and, on success, an
 *   event is reported.
 */
static int ctnetlink_new_conntrack(struct sk_buff *skb,
                                   const struct nfnl_info *info,
                                   const struct nlattr * const cda[])
{
        /* otuple/rtuple are only filled in when their attribute is present */
        struct nf_conntrack_tuple otuple, rtuple;
        struct nf_conntrack_tuple_hash *h = NULL;
        u_int8_t u3 = info->nfmsg->nfgen_family;
        struct nf_conntrack_zone zone;
        struct nf_conn *ct;
        int err;

        err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
        if (err < 0)
                return err;

        if (cda[CTA_TUPLE_ORIG]) {
                err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG,
                                            u3, &zone);
                if (err < 0)
                        return err;
        }

        if (cda[CTA_TUPLE_REPLY]) {
                err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY,
                                            u3, &zone);
                if (err < 0)
                        return err;
        }

        /* the original tuple wins when both are supplied */
        if (cda[CTA_TUPLE_ORIG])
                h = nf_conntrack_find_get(info->net, &zone, &otuple);
        else if (cda[CTA_TUPLE_REPLY])
                h = nf_conntrack_find_get(info->net, &zone, &rtuple);

        if (h == NULL) {
                err = -ENOENT;
                if (info->nlh->nlmsg_flags & NLM_F_CREATE) {
                        enum ip_conntrack_events events;

                        /* both tuples are needed from here on, so the
                         * comparison below only reads initialised data
                         */
                        if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY])
                                return -EINVAL;
                        if (otuple.dst.protonum != rtuple.dst.protonum)
                                return -EINVAL;

                        ct = ctnetlink_create_conntrack(info->net, &zone, cda,
                                                        &otuple, &rtuple, u3);
                        if (IS_ERR(ct))
                                return PTR_ERR(ct);

                        err = 0;
                        /* an entry created with a master is reported as
                         * RELATED, any other as NEW
                         */
                        if (test_bit(IPS_EXPECTED_BIT, &ct->status))
                                events = 1 << IPCT_RELATED;
                        else
                                events = 1 << IPCT_NEW;

                        /* a label failure is not fatal: the entry stays,
                         * only the LABEL event bit is left out
                         */
                        if (cda[CTA_LABELS] &&
                            ctnetlink_attach_labels(ct, cda) == 0)
                                events |= (1 << IPCT_LABEL);

                        nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
                                                      (1 << IPCT_ASSURED) |
                                                      (1 << IPCT_HELPER) |
                                                      (1 << IPCT_PROTOINFO) |
                                                      (1 << IPCT_SEQADJ) |
                                                      (1 << IPCT_MARK) |
                                                      (1 << IPCT_SYNPROXY) |
                                                      events,
                                                      ct, NETLINK_CB(skb).portid,
                                                      nlmsg_report(info->nlh));
                        nf_ct_put(ct);
                }

                return err;
        }
        /* implicit 'else' */

        /* entry exists: -EEXIST unless the request allows updating it */
        err = -EEXIST;
        ct = nf_ct_tuplehash_to_ctrack(h);
        if (!(info->nlh->nlmsg_flags & NLM_F_EXCL)) {
                err = ctnetlink_change_conntrack(ct, cda);
                if (err == 0) {
                        nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
                                                      (1 << IPCT_ASSURED) |
                                                      (1 << IPCT_HELPER) |
                                                      (1 << IPCT_LABEL) |
                                                      (1 << IPCT_PROTOINFO) |
                                                      (1 << IPCT_SEQADJ) |
                                                      (1 << IPCT_MARK) |
                                                      (1 << IPCT_SYNPROXY),
                                                      ct, NETLINK_CB(skb).portid,
                                                      nlmsg_report(info->nlh));
                }
        }

        /* drop the reference obtained by nf_conntrack_find_get() */
        nf_ct_put(ct);
        return err;
}

/* Build one IPCTNL_MSG_CT_GET_STATS_CPU message for @cpu in @skb.
 *
 * The cpu number is passed, in network byte order, as the last argument
 * of nfnl_msg_put(); the counters of @st follow as 32-bit big-endian
 * attributes.  NLM_F_MULTI is set when @portid is non-zero.
 *
 * Returns skb->len on success.  On failure the message is cancelled and
 * -1 is returned.
 */
static int
ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
                                __u16 cpu, const struct ip_conntrack_stat *st)
{
        unsigned int msg_flags = portid ? NLM_F_MULTI : 0;
        unsigned int msg_type;
        struct nlmsghdr *nlh;

        msg_type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK,
                                 IPCTNL_MSG_CT_GET_STATS_CPU);
        nlh = nfnl_msg_put(skb, portid, seq, msg_type, msg_flags, AF_UNSPEC,
                           NFNETLINK_V0, htons(cpu));
        if (!nlh)
                goto cancel;

        if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)))
                goto cancel;
        if (nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)))
                goto cancel;
        if (nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)))
                goto cancel;
        if (nla_put_be32(skb, CTA_STATS_INSERT_FAILED,
                         htonl(st->insert_failed)))
                goto cancel;
        if (nla_put_be32(skb, CTA_STATS_DROP, htonl(st->drop)))
                goto cancel;
        if (nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)))
                goto cancel;
        if (nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)))
                goto cancel;
        if (nla_put_be32(skb, CTA_STATS_SEARCH_RESTART,
                         htonl(st->search_restart)))
                goto cancel;
        if (nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE,
                         htonl(st->clash_resolve)))
                goto cancel;
        if (nla_put_be32(skb, CTA_STATS_CHAIN_TOOLONG,
                         htonl(st->chaintoolong)))
                goto cancel;

        nlmsg_end(skb, nlh);
        return skb->len;

cancel:
        nlmsg_cancel(skb, nlh);
        return -1;
}

/* Dump callback emitting one statistics message per possible CPU.
 *
 * cb->args[0] holds the next cpu index to process.  The walk stops at the
 * first cpu whose message cannot be built; that index is stored so the
 * next call retries it.  Returns 0 once the stored index equals
 * nr_cpu_ids, skb->len otherwise.
 */
static int
ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        int cpu = cb->args[0];

        if (cb->args[0] == nr_cpu_ids)
                return 0;

        while (cpu < nr_cpu_ids) {
                if (cpu_possible(cpu)) {
                        const struct ip_conntrack_stat *st;

                        st = per_cpu_ptr(net->ct.stat, cpu);
                        if (ctnetlink_ct_stat_cpu_fill_info(skb,
                                                NETLINK_CB(cb->skb).portid,
                                                cb->nlh->nlmsg_seq,
                                                cpu, st) < 0)
                                break;
                }
                cpu++;
        }
        cb->args[0] = cpu;

        return skb->len;
}

static int ctnetlink_stat_ct_cpu(struct sk_buff *skb,
                                 const struct nfnl_info *info,
                                 const struct nlattr * const cda[])
{
        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .dump = ctnetlink_ct_stat_cpu_dump,
                };
                return netlink_dump_start(info->sk, skb, info->nlh, &c);
        }

        return 0;
}

/* Build an IPCTNL_MSG_CT_GET_STATS message in @skb carrying the current
 * number of conntracks in @net and the value of nf_conntrack_max, both as
 * 32-bit big-endian attributes.  NLM_F_MULTI is set when @portid is
 * non-zero.  @type is not used.
 *
 * Returns skb->len on success.  On failure the message is cancelled and
 * -1 is returned.
 */
static int
ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
                            struct net *net)
{
        unsigned int msg_flags = portid ? NLM_F_MULTI : 0;
        unsigned int msg_type, count;
        struct nlmsghdr *nlh;

        msg_type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK,
                                 IPCTNL_MSG_CT_GET_STATS);
        nlh = nfnl_msg_put(skb, portid, seq, msg_type, msg_flags, AF_UNSPEC,
                           NFNETLINK_V0, 0);
        if (!nlh)
                goto cancel;

        count = nf_conntrack_count(net);
        if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(count)) ||
            nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES,
                         htonl(nf_conntrack_max)))
                goto cancel;

        nlmsg_end(skb, nlh);
        return skb->len;

cancel:
        nlmsg_cancel(skb, nlh);
        return -1;
}

static int ctnetlink_stat_ct(struct sk_buff *skb, const struct nfnl_info *info,
                             const struct nlattr * const cda[])
{
        struct sk_buff *skb2;
        int err;

        skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (skb2 == NULL)
                return -ENOMEM;

        err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid,
                                          info->nlh->nlmsg_seq,
                                          NFNL_MSG_TYPE(info->nlh->nlmsg_type),
                                          sock_net(skb->sk));
        if (err <= 0) {
                kfree_skb(skb2);
                return -ENOMEM;
        }

        return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}

/* Attribute policy for the CTA_EXPECT_* attributes.
 *
 * CTA_EXPECT_HELP_NAME is a NUL-terminated string of at most
 * NF_CT_HELPER_NAME_LEN - 1 characters; CTA_EXPECT_FN is a NUL-terminated
 * string with no length limit given here.
 */
static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
        [CTA_EXPECT_MASTER]        = { .type = NLA_NESTED },
        [CTA_EXPECT_TUPLE]        = { .type = NLA_NESTED },
        [CTA_EXPECT_MASK]        = { .type = NLA_NESTED },
        [CTA_EXPECT_TIMEOUT]        = { .type = NLA_U32 },
        [CTA_EXPECT_ID]                = { .type = NLA_U32 },
        [CTA_EXPECT_HELP_NAME]        = { .type = NLA_NUL_STRING,
                                    .len = NF_CT_HELPER_NAME_LEN - 1 },
        [CTA_EXPECT_ZONE]        = { .type = NLA_U16 },
        [CTA_EXPECT_FLAGS]        = { .type = NLA_U32 },
        [CTA_EXPECT_CLASS]        = { .type = NLA_U32 },
        [CTA_EXPECT_NAT]        = { .type = NLA_NESTED },
        [CTA_EXPECT_FN]                = { .type = NLA_NUL_STRING },
};

/* Forward declaration, so that code placed ahead of the definition can
 * call ctnetlink_alloc_expect().
 */
static struct nf_conntrack_expect *
ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct,
                       struct nf_conntrack_helper *helper,
                       struct nf_conntrack_tuple *tuple,
                       struct nf_conntrack_tuple *mask);

#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
/* Size estimate, in bytes, for the attributes describing @ct in a glue
 * message.  It is the sum of fixed-size attribute headers/payloads plus
 * the per-conntrack sizes reported by the secctx, accounting, timestamp
 * and protocol helpers; the NAT, mark and zone parts are only counted
 * when the corresponding option is configured.
 */
static size_t
ctnetlink_glue_build_size(const struct nf_conn *ct)
{
        size_t len = 0;

        /* tuples */
        len += 3 * nla_total_size(0); /* CTA_TUPLE_ORIG|REPL|MASTER */
        len += 3 * nla_total_size(0); /* CTA_TUPLE_IP */
        len += 3 * nla_total_size(0); /* CTA_TUPLE_PROTO */
        len += 3 * nla_total_size(sizeof(u_int8_t)); /* CTA_PROTO_NUM */

        /* fixed-size scalars and nest headers */
        len += nla_total_size(sizeof(u_int32_t)); /* CTA_ID */
        len += nla_total_size(sizeof(u_int32_t)); /* CTA_STATUS */
        len += nla_total_size(sizeof(u_int32_t)); /* CTA_TIMEOUT */
        len += nla_total_size(0); /* CTA_PROTOINFO */
        len += nla_total_size(0); /* CTA_HELP */
        len += nla_total_size(NF_CT_HELPER_NAME_LEN); /* CTA_HELP_NAME */

        /* sizes that depend on the conntrack */
        len += ctnetlink_secctx_size(ct);
        len += ctnetlink_acct_size(ct);
        len += ctnetlink_timestamp_size(ct);
#if IS_ENABLED(CONFIG_NF_NAT)
        len += 2 * nla_total_size(0); /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
        len += 6 * nla_total_size(sizeof(u_int32_t)); /* CTA_NAT_SEQ_OFFSET */
#endif
#ifdef CONFIG_NF_CONNTRACK_MARK
        len += nla_total_size(sizeof(u_int32_t)); /* CTA_MARK */
#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
        len += nla_total_size(sizeof(u_int16_t)); /* CTA_ZONE|CTA_TUPLE_ZONE */
#endif
        len += ctnetlink_proto_size(ct);

        return len;
}

/* Append the attributes describing @ct to @skb: original and reply tuples
 * (each nested, with its zone id), the zone, id, status, timeout,
 * protoinfo, accounting, timestamp and helper info, followed by the
 * optional secctx, master, sequence adjustment, synproxy, mark and label
 * attributes.
 *
 * Returns 0 on success and -ENOSPC as soon as any step fails; nothing
 * already written to @skb is undone here.
 */
static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
{
        const struct nf_conntrack_zone *zone = nf_ct_zone(ct);
        struct nlattr *nest;

        nest = nla_nest_start(skb, CTA_TUPLE_ORIG);
        if (!nest ||
            ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0 ||
            ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
                                   NF_CT_ZONE_DIR_ORIG) < 0)
                return -ENOSPC;
        nla_nest_end(skb, nest);

        nest = nla_nest_start(skb, CTA_TUPLE_REPLY);
        if (!nest ||
            ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0 ||
            ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone,
                                   NF_CT_ZONE_DIR_REPL) < 0)
                return -ENOSPC;
        nla_nest_end(skb, nest);

        /* attributes that are emitted unconditionally */
        if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone,
                                   NF_CT_DEFAULT_ZONE_DIR) < 0 ||
            ctnetlink_dump_id(skb, ct) < 0 ||
            ctnetlink_dump_status(skb, ct) < 0 ||
            ctnetlink_dump_timeout(skb, ct, false) < 0 ||
            ctnetlink_dump_protoinfo(skb, ct, false) < 0 ||
            ctnetlink_dump_acct(skb, ct, IPCTNL_MSG_CT_GET) < 0 ||
            ctnetlink_dump_timestamp(skb, ct) < 0 ||
            ctnetlink_dump_helpinfo(skb, ct) < 0)
                return -ENOSPC;

#ifdef CONFIG_NF_CONNTRACK_SECMARK
        if (ct->secmark && ctnetlink_dump_secctx(skb, ct) < 0)
                return -ENOSPC;
#endif
        if (ct->master && ctnetlink_dump_master(skb, ct) < 0)
                return -ENOSPC;

        if ((ct->status & IPS_SEQ_ADJUST) &&
            ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
                return -ENOSPC;

        if (ctnetlink_dump_ct_synproxy(skb, ct) < 0)
                return -ENOSPC;

#ifdef CONFIG_NF_CONNTRACK_MARK
        if (ctnetlink_dump_mark(skb, ct, true) < 0)
                return -ENOSPC;
#endif
        if (ctnetlink_dump_labels(skb, ct) < 0)
                return -ENOSPC;

        return 0;
}

/*
 * Emit @ct into @skb as a nested attribute of type @ct_attr, followed by a
 * big-endian 32-bit attribute of type @ct_info_attr carrying @ctinfo.
 *
 * Returns 0 on success, -ENOSPC if the nest cannot be opened, the conntrack
 * payload cannot be built, or the ctinfo attribute cannot be added.
 */
static int
ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct,
                     enum ip_conntrack_info ctinfo,
                     u_int16_t ct_attr, u_int16_t ct_info_attr)
{
        struct nlattr *ct_nest = nla_nest_start(skb, ct_attr);

        if (!ct_nest || __ctnetlink_glue_build(skb, ct) < 0)
                return -ENOSPC;
        nla_nest_end(skb, ct_nest);

        return nla_put_be32(skb, ct_info_attr, htonl(ctinfo)) ? -ENOSPC : 0;
}

/*
 * Apply the status word carried in cda[CTA_STATUS] to @ct.
 *
 * Returns -EBUSY when the request would clear IPS_SEEN_REPLY or
 * IPS_ASSURED (those bits can only be set), 0 otherwise.
 */
static int
ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[])
{
        unsigned int wanted = ntohl(nla_get_be32(cda[CTA_STATUS]));
        unsigned long changed = ct->status ^ wanted;
        /* Bits that differ from the current status and are off in the request. */
        unsigned long cleared = changed & ~(unsigned long)wanted;

        /* SEEN_REPLY and ASSURED bits can only be set, never cleared */
        if (cleared & (IPS_SEEN_REPLY | IPS_ASSURED))
                return -EBUSY;

        /* Deliberately less strict than ctnetlink_change_status():
         * callers often flip IPS_EXPECTED bits when sending an NFQA_CT
         * attribute to the kernel, so unchangeable bits are ignored
         * rather than rejected.  User programs may also clear the bits
         * they are allowed to change.
         */
        __nf_ct_change_status(ct, wanted, ~wanted);
        return 0;
}

/*
 * Apply the optional CTA_TIMEOUT, CTA_STATUS, CTA_HELP, CTA_LABELS and
 * (with CONFIG_NF_CONNTRACK_MARK) CTA_MARK attributes from @cda to @ct,
 * in that order.
 *
 * Returns the first negative error reported by a step, or 0.
 */
static int
ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
{
        int err;

        err = cda[CTA_TIMEOUT] ? ctnetlink_change_timeout(ct, cda) : 0;
        if (err < 0)
                return err;

        err = cda[CTA_STATUS] ? ctnetlink_update_status(ct, cda) : 0;
        if (err < 0)
                return err;

        err = cda[CTA_HELP] ? ctnetlink_change_helper(ct, cda) : 0;
        if (err < 0)
                return err;

        err = cda[CTA_LABELS] ? ctnetlink_attach_labels(ct, cda) : 0;
        if (err < 0)
                return err;

#if defined(CONFIG_NF_CONNTRACK_MARK)
        /* The mark update reports no status, so there is nothing to check. */
        if (cda[CTA_MARK])
                ctnetlink_change_mark(ct, cda);
#endif
        return 0;
}

/*
 * Parse the nested conntrack attribute @attr against ct_nla_policy and
 * apply the result to @ct.
 *
 * Returns a negative parse error, or whatever ctnetlink_glue_parse_ct()
 * returns.
 */
static int
ctnetlink_glue_parse(const struct nlattr *attr, struct nf_conn *ct)
{
        struct nlattr *tb[CTA_MAX+1];
        int err;

        err = nla_parse_nested_deprecated(tb, CTA_MAX, attr, ct_nla_policy,
                                          NULL);
        if (err >= 0)
                err = ctnetlink_glue_parse_ct((const struct nlattr **)tb, ct);

        return err;
}

/*
 * Fill @tuple from CTA_EXPECT_TUPLE and @mask from CTA_EXPECT_MASK, both
 * parsed with the layer-3 protocol number of @ct.
 *
 * Returns the first negative parse error, otherwise the result of parsing
 * the mask.
 */
static int ctnetlink_glue_exp_parse(const struct nlattr * const *cda,
                                    const struct nf_conn *ct,
                                    struct nf_conntrack_tuple *tuple,
                                    struct nf_conntrack_tuple *mask)
{
        int ret = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE,
                                        nf_ct_l3num(ct), NULL);

        if (ret >= 0)
                ret = ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK,
                                            nf_ct_l3num(ct), NULL);
        return ret;
}

/*
 * Create an expectation for @ct from the nested attribute @attr and
 * register it through nf_ct_expect_related_report() with @portid/@report.
 *
 * Returns a negative error from attribute/tuple parsing, -EOPNOTSUPP when
 * CTA_EXPECT_HELP_NAME names a helper that cannot be found, the error
 * encoded by ctnetlink_alloc_expect(), or the registration result.
 */
static int
ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
                             u32 portid, u32 report)
{
        struct nf_conntrack_helper *helper = NULL;
        struct nlattr *tb[CTA_EXPECT_MAX+1];
        struct nf_conntrack_tuple tuple, mask;
        struct nf_conntrack_expect *exp;
        int ret;

        ret = nla_parse_nested_deprecated(tb, CTA_EXPECT_MAX, attr,
                                          exp_nla_policy, NULL);
        if (ret < 0)
                return ret;

        ret = ctnetlink_glue_exp_parse((const struct nlattr * const *)tb,
                                       ct, &tuple, &mask);
        if (ret < 0)
                return ret;

        /* A helper is only looked up when userspace names one. */
        if (tb[CTA_EXPECT_HELP_NAME]) {
                helper = __nf_conntrack_helper_find(
                                nla_data(tb[CTA_EXPECT_HELP_NAME]),
                                nf_ct_l3num(ct), nf_ct_protonum(ct));
                if (!helper)
                        return -EOPNOTSUPP;
        }

        exp = ctnetlink_alloc_expect((const struct nlattr * const *)tb, ct,
                                     helper, &tuple, &mask);
        if (IS_ERR(exp))
                return PTR_ERR(exp);

        ret = nf_ct_expect_related_report(exp, portid, report, 0);
        /* Drop the reference obtained from the allocation above. */
        nf_ct_expect_put(exp);

        return ret;
}

/*
 * Forward a TCP sequence adjustment of @diff to nf_ct_tcp_seqadj_set(),
 * but only for conntracks that have a bit of IPS_NAT_MASK set.
 */
static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct,
                                  enum ip_conntrack_info ctinfo, int diff)
{
        if (ct->status & IPS_NAT_MASK)
                nf_ct_tcp_seqadj_set(skb, ct, ctinfo, diff);
}

/*
 * Callback table exposing the ctnetlink_glue_* helpers defined above
 * through struct nfnl_ct_hook: size estimation, attribute build, attribute
 * parse, expectation attach and sequence adjustment.
 */
static const struct nfnl_ct_hook ctnetlink_glue_hook = {
        .build_size        = ctnetlink_glue_build_size,
        .build                = ctnetlink_glue_build,
        .parse                = ctnetlink_glue_parse,
        .attach_expect        = ctnetlink_glue_attach_expect,
        .seq_adjust        = ctnetlink_glue_seqadj,
};
#endif /* CONFIG_NETFILTER_NETLINK_GLUE_CT */

/***********************************************************************
 * EXPECT
 ***********************************************************************/

/*
 * Dump @tuple into @skb wrapped in a nested attribute of the given @type.
 *
 * Returns 0 on success, -1 if the nest cannot be opened or the tuple
 * cannot be dumped.
 */
static int ctnetlink_exp_dump_tuple(struct sk_buff *skb,
                                    const struct nf_conntrack_tuple *tuple,
                                    u32 type)
{
        struct nlattr *nest = nla_nest_start(skb, type);

        if (!nest || ctnetlink_dump_tuples(skb, tuple) < 0)
                return -1;

        nla_nest_end(skb, nest);
        return 0;
}

static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
                                   const struct nf_conntrack_tuple *tuple,
                                   const struct nf_conntrack_tuple_mask *mask)
{
        const struct nf_conntrack_l4proto *l4proto;
        struct nf_conntrack_tuple m;
        struct nlattr *nest_parms;
        int ret;

        memset(&m, 0xFF, sizeof(m));
        memcpy(&m.src.u3, &mask->src.u3, sizeof(m.src.u3));
        m.src.u.all = mask->src.u.all;
        m.src.l3num = tuple->src.l3num;
        m.dst.protonum = tuple->dst.protonum;

        nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK);
        if (!nest_parms)
                goto nla_put_failure;

        rcu_read_lock();
        ret = ctnetlink_dump_tuples_ip(skb, &m);
        if (ret >= 0) {
                l4proto = nf_ct_l4proto_find(tuple->dst.protonum);
                ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto);
        }
        rcu_read_unlock();

        if (unlikely(ret < 0))
                goto nla_put_failure;

        nla_nest_end(skb, nest_parms);

        return 0;

nla_put_failure:
        return -1;
}

#if IS_ENABLED(CONFIG_NF_NAT)
/*
 * Address with static storage and no initializer (hence all zero);
 * ctnetlink_exp_dump_expect() compares exp->saved_addr against it to
 * decide whether a CTA_EXPECT_NAT nest has to be emitted.
 */
static const union nf_inet_addr any_addr;
#endif

/*
 * Compute the 32-bit id reported for @exp.
 *
 * The id is a keyed siphash over the expectation pointer, its helper and
 * master pointers, and a siphash of the expectation tuple.  The key is
 * filled once with random bytes on first use.
 */
static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp)
{
        static siphash_aligned_key_t exp_id_seed;
        unsigned long self, helper, master, tuple_hash;

        net_get_random_once(&exp_id_seed, sizeof(exp_id_seed));

        self = (unsigned long)exp;
        helper = (unsigned long)exp->helper;
        master = (unsigned long)exp->master;
        tuple_hash = (unsigned long)siphash(&exp->tuple, sizeof(exp->tuple),
                                            &exp_id_seed);

        /* Use the siphash variant that matches the width of unsigned long. */
#ifdef CONFIG_64BIT
        return (__force __be32)siphash_4u64((u64)self, (u64)helper,
                                            (u64)master, (u64)tuple_hash,
                                            &exp_id_seed);
#else
        return (__force __be32)siphash_4u32((u32)self, (u32)helper,
                                            (u32)master, (u32)tuple_hash,
                                            &exp_id_seed);
#endif
}

/*
 * Dump every attribute describing expectation @exp into @skb: tuple, mask,
 * master tuple, the optional NAT nest (CONFIG_NF_NAT), timeout in seconds,
 * id, flags, class, and - when available - the master's helper name and
 * the name of the expect function.
 *
 * Returns 0 on success, -1 as soon as any attribute cannot be added.
 */
static int
ctnetlink_exp_dump_expect(struct sk_buff *skb,
                          const struct nf_conntrack_expect *exp)
{
        long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ;
        struct nf_conn *master = exp->master;
        struct nf_ct_helper_expectfn *expfn;
        struct nf_conn_help *help;
#if IS_ENABLED(CONFIG_NF_NAT)
        struct nf_conntrack_tuple nat_tuple = {};
        struct nlattr *nat_nest;
#endif

        /* An expiry that is already in the past is reported as 0. */
        if (timeout < 0)
                timeout = 0;

        if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0 ||
            ctnetlink_exp_dump_mask(skb, &exp->tuple, &exp->mask) < 0 ||
            ctnetlink_exp_dump_tuple(skb,
                                 &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                                 CTA_EXPECT_MASTER) < 0)
                return -1;

#if IS_ENABLED(CONFIG_NF_NAT)
        /* Only emit the NAT nest when an address or proto was saved. */
        if (!nf_inet_addr_cmp(&exp->saved_addr, &any_addr) ||
            exp->saved_proto.all) {
                nat_nest = nla_nest_start(skb, CTA_EXPECT_NAT);
                if (!nat_nest)
                        return -1;

                if (nla_put_be32(skb, CTA_EXPECT_NAT_DIR, htonl(exp->dir)))
                        return -1;

                nat_tuple.src.l3num = nf_ct_l3num(master);
                nat_tuple.src.u3 = exp->saved_addr;
                nat_tuple.dst.protonum = nf_ct_protonum(master);
                nat_tuple.src.u = exp->saved_proto;

                if (ctnetlink_exp_dump_tuple(skb, &nat_tuple,
                                                CTA_EXPECT_NAT_TUPLE) < 0)
                        return -1;
                nla_nest_end(skb, nat_nest);
        }
#endif
        if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) ||
            nla_put_be32(skb, CTA_EXPECT_ID, nf_expect_get_id(exp)) ||
            nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) ||
            nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class)))
                return -1;

        /* Helper name comes from the master conntrack, if it has one. */
        help = nfct_help(master);
        if (help) {
                struct nf_conntrack_helper *helper =
                        rcu_dereference(help->helper);

                if (helper &&
                    nla_put_string(skb, CTA_EXPECT_HELP_NAME, helper->name))
                        return -1;
        }

        expfn = nf_ct_helper_expectfn_find_by_symbol(exp->expectfn);
        if (expfn != NULL &&
            nla_put_string(skb, CTA_EXPECT_FN, expfn->name))
                return -1;

        return 0;
}

/*
 * Append one complete expectation message for @exp to @skb.
 *
 * NLM_F_MULTI is set whenever @portid is non-zero.  Returns skb->len on
 * success; on failure the partial message is cancelled and -1 is returned.
 */
static int
ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
                        int event, const struct nf_conntrack_expect *exp)
{
        unsigned int flags = portid ? NLM_F_MULTI : 0;
        struct nlmsghdr *nlh;

        event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, event);
        nlh = nfnl_msg_put(skb, portid, seq, event, flags,
                           exp->tuple.src.l3num, NFNETLINK_V0, 0);
        if (nlh && ctnetlink_exp_dump_expect(skb, exp) >= 0) {
                nlmsg_end(skb, nlh);
                return skb->len;
        }

        /* Reached with nlh == NULL too when the header could not be put. */
        nlmsg_cancel(skb, nlh);
        return -1;
}

#ifdef CONFIG_NF_CONNTRACK_EVENTS
/*
 * Expectation event notifier: build a netlink message for a DESTROY or NEW
 * expectation event and send it to the matching multicast group.
 *
 * Other events are ignored.  Nothing is built when no report was requested
 * and the group has no listeners.  Always returns 0; build failures are
 * signalled to listeners via nfnetlink_set_err(-ENOBUFS).
 */
static int
ctnetlink_expect_event(unsigned int events, const struct nf_exp_event *item)
{
        struct nf_conntrack_expect *exp = item->exp;
        struct net *net = nf_ct_exp_net(exp);
        unsigned int msg_type, mcgroup;
        struct sk_buff *skb;
        struct nlmsghdr *nlh;
        int nlflags = 0;

        /* DESTROY is tested first, then NEW. */
        if (events & (1 << IPEXP_DESTROY)) {
                msg_type = IPCTNL_MSG_EXP_DELETE;
                mcgroup = NFNLGRP_CONNTRACK_EXP_DESTROY;
        } else if (events & (1 << IPEXP_NEW)) {
                msg_type = IPCTNL_MSG_EXP_NEW;
                nlflags = NLM_F_CREATE|NLM_F_EXCL;
                mcgroup = NFNLGRP_CONNTRACK_EXP_NEW;
        } else {
                return 0;
        }

        if (!item->report && !nfnetlink_has_listeners(net, mcgroup))
                return 0;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
        if (!skb)
                goto report_err;

        msg_type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, msg_type);
        nlh = nfnl_msg_put(skb, item->portid, 0, msg_type, nlflags,
                           exp->tuple.src.l3num, NFNETLINK_V0, 0);
        if (!nlh)
                goto free_skb;

        if (ctnetlink_exp_dump_expect(skb, exp) < 0)
                goto cancel_msg;

        nlmsg_end(skb, nlh);
        nfnetlink_send(skb, net, item->portid, mcgroup, item->report,
                       GFP_ATOMIC);
        return 0;

cancel_msg:
        nlmsg_cancel(skb, nlh);
free_skb:
        kfree_skb(skb);
report_err:
        nfnetlink_set_err(net, 0, 0, -ENOBUFS);
        return 0;
}
#endif
/*
 * Dump completion callback: release the expectation reference that a dump
 * pass may have left in cb->args[1].  Always returns 0.
 */
static int ctnetlink_exp_done(struct netlink_callback *cb)
{
        struct nf_conntrack_expect *pending =
                (struct nf_conntrack_expect *)cb->args[1];

        if (pending)
                nf_ct_expect_put(pending);

        return 0;
}

/*
 * Netlink dump callback walking the global expectation hash table.
 *
 * Resume state kept in @cb between invocations:
 *   cb->args[0] - index of the hash bucket currently being walked
 *   cb->args[1] - expectation at which the previous pass stopped because it
 *                 could not be written to the skb; a reference on it is
 *                 held until the next pass (or ctnetlink_exp_done()).
 *
 * Only expectations matching the requested family (0 matches all) and
 * whose master belongs to the socket's netns are dumped.
 *
 * Returns skb->len.
 */
static int
ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        struct nf_conntrack_expect *exp, *last;
        struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
        u_int8_t l3proto = nfmsg->nfgen_family;

        rcu_read_lock();
        /* Entry to resume from, if a previous pass left one behind. */
        last = (struct nf_conntrack_expect *)cb->args[1];
        for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
restart:
                hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]],
                                         hnode) {
                        if (l3proto && exp->tuple.src.l3num != l3proto)
                                continue;

                        if (!net_eq(nf_ct_net(exp->master), net))
                                continue;

                        /* While resuming, skip entries until 'last' is seen,
                         * then clear the marker and dump from that entry on.
                         */
                        if (cb->args[1]) {
                                if (exp != last)
                                        continue;
                                cb->args[1] = 0;
                        }
                        if (ctnetlink_exp_fill_info(skb,
                                                    NETLINK_CB(cb->skb).portid,
                                                    cb->nlh->nlmsg_seq,
                                                    IPCTNL_MSG_EXP_NEW,
                                                    exp) < 0) {
                                /* Could not write this entry: remember it for
                                 * the next pass, unless its refcount already
                                 * dropped to zero, in which case move on.
                                 */
                                if (!refcount_inc_not_zero(&exp->use))
                                        continue;
                                cb->args[1] = (unsigned long)exp;
                                goto out;
                        }
                }
                /* 'last' was not found in this bucket: forget it and walk
                 * the same bucket again from its head.
                 */
                if (cb->args[1]) {
                        cb->args[1] = 0;
                        goto restart;
                }
        }
out:
        rcu_read_unlock();
        /* Drop the reference that the previous pass took on 'last'. */
        if (last)
                nf_ct_expect_put(last);

        return skb->len;
}

/*
 * Netlink dump callback walking the expectation list of one conntrack
 * (passed in cb->data).
 *
 * Resume state kept in @cb between invocations:
 *   cb->args[0] - set to 1 once the whole list has been dumped; later
 *                 invocations return 0 immediately
 *   cb->args[1] - expectation at which the previous pass stopped because it
 *                 could not be written to the skb; a reference on it is
 *                 held until the next pass (or ctnetlink_exp_done()).
 *
 * Returns 0 when already finished, otherwise skb->len.
 */
static int
ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct nf_conntrack_expect *exp, *last;
        struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
        struct nf_conn *ct = cb->data;
        /* NOTE(review): help is dereferenced below without a NULL check;
         * the visible caller only starts this dump when nfct_help(ct) is
         * non-NULL - confirm that still holds on later passes.
         */
        struct nf_conn_help *help = nfct_help(ct);
        u_int8_t l3proto = nfmsg->nfgen_family;

        if (cb->args[0])
                return 0;

        rcu_read_lock();
        /* Entry to resume from, if a previous pass left one behind. */
        last = (struct nf_conntrack_expect *)cb->args[1];
restart:
        hlist_for_each_entry_rcu(exp, &help->expectations, lnode) {
                if (l3proto && exp->tuple.src.l3num != l3proto)
                        continue;
                /* While resuming, skip entries until 'last' is seen, then
                 * clear the marker and dump from that entry on.
                 */
                if (cb->args[1]) {
                        if (exp != last)
                                continue;
                        cb->args[1] = 0;
                }
                if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid,
                                            cb->nlh->nlmsg_seq,
                                            IPCTNL_MSG_EXP_NEW,
                                            exp) < 0) {
                        /* Could not write this entry: remember it for the
                         * next pass, unless its refcount already dropped to
                         * zero, in which case move on.
                         */
                        if (!refcount_inc_not_zero(&exp->use))
                                continue;
                        cb->args[1] = (unsigned long)exp;
                        goto out;
                }
        }
        /* 'last' was not found in the list: forget it and walk again. */
        if (cb->args[1]) {
                cb->args[1] = 0;
                goto restart;
        }
        /* Whole list dumped; mark the dump as finished. */
        cb->args[0] = 1;
out:
        rcu_read_unlock();
        /* Drop the reference that the previous pass took on 'last'. */
        if (last)
                nf_ct_expect_put(last);

        return skb->len;
}

static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
                                 struct sk_buff *skb,
                                 const struct nlmsghdr *nlh,
                                 const struct nlattr * const cda[],
                                 struct netlink_ext_ack *extack)
{
        int err;
        struct nfgenmsg *nfmsg = nlmsg_data(nlh);
        u_int8_t u3 = nfmsg->nfgen_family;
        struct nf_conntrack_tuple tuple;
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
        struct nf_conntrack_zone zone;
        struct netlink_dump_control c = {
                .dump = ctnetlink_exp_ct_dump_table,
                .done = ctnetlink_exp_done,
        };

        err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER,
                                    u3, NULL);
        if (err < 0)
                return err;

        err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
        if (err < 0)
                return err;

        h = nf_conntrack_find_get(net, &zone, &tuple);
        if (!h)
                return -ENOENT;

        ct = nf_ct_tuplehash_to_ctrack(h);
        /* No expectation linked to this connection tracking. */
        if (!nfct_help(ct)) {
                nf_ct_put(ct);
                return 0;
        }

        c.data = ct;

        err = netlink_dump_start(ctnl, skb, nlh, &c);
        nf_ct_put(ct);

        return err;
}

static int ctnetlink_get_expect(struct sk_buff *skb,
                                const struct nfnl_info *info,
                                const struct nlattr * const cda[])
{
        u_int8_t u3 = info->nfmsg->nfgen_family;
        struct nf_conntrack_tuple tuple;
        struct nf_conntrack_expect *exp;
        struct nf_conntrack_zone zone;
        struct sk_buff *skb2;
        int err;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                if (cda[CTA_EXPECT_MASTER])
                        return ctnetlink_dump_exp_ct(info->net, info->sk, skb,
                                                     info->nlh, cda,
                                                     info->extack);
                else {
                        struct netlink_dump_control c = {
                                .dump = ctnetlink_exp_dump_table,
                                .done = ctnetlink_exp_done,
                        };
                        return netlink_dump_start(info->sk, skb, info->nlh, &c);
                }
        }

        err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
        if (err < 0)
                return err;

        if (cda[CTA_EXPECT_TUPLE])
                err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
                                            u3, NULL);
        else if (cda[CTA_EXPECT_MASTER])
                err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER,
                                            u3, NULL);
        else
                return -EINVAL;

        if (err < 0)
                return err;

        exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
        if (!exp)
                return -ENOENT;

        if (cda[CTA_EXPECT_ID]) {
                __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);

                if (id != nf_expect_get_id(exp)) {
                        nf_ct_expect_put(exp);
                        return -ENOENT;
                }
        }

        skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb2) {
                nf_ct_expect_put(exp);
                return -ENOMEM;
        }

        rcu_read_lock();
        err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid,
                                      info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
                                      exp);
        rcu_read_unlock();
        nf_ct_expect_put(exp);
        if (err <= 0) {
                kfree_skb(skb2);
                return -ENOMEM;
        }

        return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}

/*
 * Iterator predicate: true when the helper attached to the expectation's
 * master conntrack has the name passed in @data, false when there is no
 * helper or the name differs.
 */
static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data)
{
        const struct nf_conn_help *m_help = nfct_help(exp->master);
        struct nf_conntrack_helper *helper = rcu_dereference(m_help->helper);
        const char *wanted = data;

        return helper && strcmp(helper->name, wanted) == 0;
}

/*
 * Iterator predicate that accepts every expectation; @exp and @data are
 * ignored.  Used by ctnetlink_del_expect() to flush everything.
 */
static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data)
{
        return true;
}

static int ctnetlink_del_expect(struct sk_buff *skb,
                                const struct nfnl_info *info,
                                const struct nlattr * const cda[])
{
        u_int8_t u3 = info->nfmsg->nfgen_family;
        struct nf_conntrack_expect *exp;
        struct nf_conntrack_tuple tuple;
        struct nf_conntrack_zone zone;
        int err;

        if (cda[CTA_EXPECT_TUPLE]) {
                /* delete a single expect by tuple */
                err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
                if (err < 0)
                        return err;

                err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
                                            u3, NULL);
                if (err < 0)
                        return err;

                /* bump usage count to 2 */
                exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
                if (!exp)
                        return -ENOENT;

                if (cda[CTA_EXPECT_ID]) {
                        __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
                        if (ntohl(id) != (u32)(unsigned long)exp) {
                                nf_ct_expect_put(exp);
                                return -ENOENT;
                        }
                }

                /* after list removal, usage count == 1 */
                spin_lock_bh(&nf_conntrack_expect_lock);
                if (del_timer(&exp->timeout)) {
                        nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,
                                                   nlmsg_report(info->nlh));
                        nf_ct_expect_put(exp);
                }
                spin_unlock_bh(&nf_conntrack_expect_lock);
                /* have to put what we 'get' above.
                 * after this line usage count == 0 */
                nf_ct_expect_put(exp);
        } else if (cda[CTA_EXPECT_HELP_NAME]) {
                char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]);

                nf_ct_expect_iterate_net(info->net, expect_iter_name, name,
                                         NETLINK_CB(skb).portid,
                                         nlmsg_report(info->nlh));
        } else {
                /* This basically means we have to flush everything*/
                nf_ct_expect_iterate_net(info->net, expect_iter_all, NULL,
                                         NETLINK_CB(skb).portid,
                                         nlmsg_report(info->nlh));
        }

        return 0;
}
/*
 * Update an existing expectation from @cda.  Only CTA_EXPECT_TIMEOUT is
 * handled: the timer is re-armed to expire the given number of seconds
 * from now.
 *
 * Returns 0 on success (or when there is nothing to change) and -ETIME
 * when the pending timer could not be deleted.
 */
static int
ctnetlink_change_expect(struct nf_conntrack_expect *x,
                        const struct nlattr * const cda[])
{
        if (!cda[CTA_EXPECT_TIMEOUT])
                return 0;

        if (!del_timer(&x->timeout))
                return -ETIME;

        x->timeout.expires = jiffies +
                ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
        add_timer(&x->timeout);

        return 0;
}

#if IS_ENABLED(CONFIG_NF_NAT)
/*
 * Validation policy for the attributes nested inside CTA_EXPECT_NAT:
 * a 32-bit direction and a nested tuple.  Used by
 * ctnetlink_parse_expect_nat().
 */
static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = {
        [CTA_EXPECT_NAT_DIR]        = { .type = NLA_U32 },
        [CTA_EXPECT_NAT_TUPLE]        = { .type = NLA_NESTED },
};
#endif

/*
 * Parse a CTA_EXPECT_NAT nest into @exp: the source address and source
 * protocol part of the nested tuple are stored as saved_addr/saved_proto,
 * and the direction as dir.
 *
 * Returns 0 on success, a negative parse error, -EINVAL when the direction
 * or tuple attribute is missing, or -EOPNOTSUPP when CONFIG_NF_NAT is not
 * enabled.
 */
static int
ctnetlink_parse_expect_nat(const struct nlattr *attr,
                           struct nf_conntrack_expect *exp,
                           u_int8_t u3)
{
#if IS_ENABLED(CONFIG_NF_NAT)
        struct nf_conntrack_tuple nat_tuple = {};
        struct nlattr *nat[CTA_EXPECT_NAT_MAX+1];
        int ret;

        ret = nla_parse_nested_deprecated(nat, CTA_EXPECT_NAT_MAX, attr,
                                          exp_nat_nla_policy, NULL);
        if (ret < 0)
                return ret;

        /* Both the direction and the tuple are mandatory. */
        if (!(nat[CTA_EXPECT_NAT_DIR] && nat[CTA_EXPECT_NAT_TUPLE]))
                return -EINVAL;

        ret = ctnetlink_parse_tuple((const struct nlattr * const *)nat,
                                    &nat_tuple, CTA_EXPECT_NAT_TUPLE,
                                    u3, NULL);
        if (ret < 0)
                return ret;

        exp->saved_addr = nat_tuple.src.u3;
        exp->saved_proto = nat_tuple.src.u;
        exp->dir = ntohl(nla_get_be32(nat[CTA_EXPECT_NAT_DIR]));

        return 0;
#else
        return -EOPNOTSUPP;
#endif
}

/*
 * Allocate an expectation for master @ct and initialise it from @cda,
 * @helper, @tuple and @mask.
 *
 * Returns the new expectation, or an ERR_PTR():
 *   -EOPNOTSUPP  @ct has no helper extension
 *   -EINVAL      CTA_EXPECT_CLASS exceeds the helper's expect_class_max,
 *                or CTA_EXPECT_FN names an unknown expect function
 *   -ENOMEM      allocation failed
 *   or the error returned while parsing CTA_EXPECT_NAT.
 */
static struct nf_conntrack_expect *
ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct,
                       struct nf_conntrack_helper *helper,
                       struct nf_conntrack_tuple *tuple,
                       struct nf_conntrack_tuple *mask)
{
        struct nf_conntrack_expect *exp;
        u_int32_t class = 0;
        int err;

        if (!nfct_help(ct))
                return ERR_PTR(-EOPNOTSUPP);

        /* The class is only read (and range-checked) when a helper is given. */
        if (cda[CTA_EXPECT_CLASS] && helper) {
                class = ntohl(nla_get_be32(cda[CTA_EXPECT_CLASS]));
                if (class > helper->expect_class_max)
                        return ERR_PTR(-EINVAL);
        }

        exp = nf_ct_expect_alloc(ct);
        if (!exp)
                return ERR_PTR(-ENOMEM);

        /* NF_CT_EXPECT_USERSPACE is always stripped from requested flags. */
        exp->flags = 0;
        if (cda[CTA_EXPECT_FLAGS])
                exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])) &
                             ~NF_CT_EXPECT_USERSPACE;

        exp->expectfn = NULL;
        if (cda[CTA_EXPECT_FN]) {
                struct nf_ct_helper_expectfn *expfn;

                expfn = nf_ct_helper_expectfn_find_by_name(
                                nla_data(cda[CTA_EXPECT_FN]));
                if (!expfn) {
                        err = -EINVAL;
                        goto err_out;
                }
                exp->expectfn = expfn->expectfn;
        }

        exp->class = class;
        exp->master = ct;
        exp->helper = helper;
        exp->tuple = *tuple;
        /* Only the source address and source proto part of the mask are kept. */
        exp->mask.src.u3 = mask->src.u3;
        exp->mask.src.u.all = mask->src.u.all;

        if (cda[CTA_EXPECT_NAT]) {
                err = ctnetlink_parse_expect_nat(cda[CTA_EXPECT_NAT],
                                                 exp, nf_ct_l3num(ct));
                if (err < 0)
                        goto err_out;
        }

        return exp;

err_out:
        nf_ct_expect_put(exp);
        return ERR_PTR(err);
}

/*
 * Create and register a new expectation from a netlink request.
 *
 * Parses the expectation tuple, mask and master tuple from @cda, looks up
 * the master conntrack in @net/@zone, optionally resolves the helper named
 * by CTA_EXPECT_HELP_NAME, then allocates the expectation and registers it
 * via nf_ct_expect_related_report().
 *
 * Returns a negative parse error, -ENOENT when the master conntrack is not
 * found, -EOPNOTSUPP when the named helper is unavailable, -EAGAIN when
 * the helper only became available after a module load request, the error
 * from ctnetlink_alloc_expect(), or the registration result.
 */
static int
ctnetlink_create_expect(struct net *net,
                        const struct nf_conntrack_zone *zone,
                        const struct nlattr * const cda[],
                        u_int8_t u3, u32 portid, int report)
{
        struct nf_conntrack_tuple tuple, mask, master_tuple;
        struct nf_conntrack_tuple_hash *h = NULL;
        struct nf_conntrack_helper *helper = NULL;
        struct nf_conntrack_expect *exp;
        struct nf_conn *ct;
        int err;

        /* caller guarantees that those three CTA_EXPECT_* exist */
        err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
                                    u3, NULL);
        if (err < 0)
                return err;
        err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK,
                                    u3, NULL);
        if (err < 0)
                return err;
        err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER,
                                    u3, NULL);
        if (err < 0)
                return err;

        /* Look for master conntrack of this expectation */
        h = nf_conntrack_find_get(net, zone, &master_tuple);
        if (!h)
                return -ENOENT;
        ct = nf_ct_tuplehash_to_ctrack(h);

        /* From here on the RCU read lock is held; every exit goes through
         * err_rcu (lock still held) or err_ct (lock already released).
         */
        rcu_read_lock();
        if (cda[CTA_EXPECT_HELP_NAME]) {
                const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);

                helper = __nf_conntrack_helper_find(helpname, u3,
                                                    nf_ct_protonum(ct));
                if (helper == NULL) {
                        /* The RCU read lock is released before
                         * request_module() is called.
                         */
                        rcu_read_unlock();
#ifdef CONFIG_MODULES
                        if (request_module("nfct-helper-%s", helpname) < 0) {
                                err = -EOPNOTSUPP;
                                goto err_ct;
                        }
                        rcu_read_lock();
                        helper = __nf_conntrack_helper_find(helpname, u3,
                                                            nf_ct_protonum(ct));
                        /* The helper is available now, but the request is
                         * not continued: -EAGAIN is returned instead.
                         */
                        if (helper) {
                                err = -EAGAIN;
                                goto err_rcu;
                        }
                        rcu_read_unlock();
#endif
                        err = -EOPNOTSUPP;
                        goto err_ct;
                }
        }

        exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask);
        if (IS_ERR(exp)) {
                err = PTR_ERR(exp);
                goto err_rcu;
        }

        err = nf_ct_expect_related_report(exp, portid, report, 0);
        /* Drop the local reference taken by the allocation. */
        nf_ct_expect_put(exp);
        /* The success path falls through the labels below with err set to
         * the registration result.
         */
err_rcu:
        rcu_read_unlock();
err_ct:
        nf_ct_put(ct);
        return err;
}

/*
 * Handle a "new expectation" request.
 *
 * If an expectation for the tuple already exists it is updated, unless
 * NLM_F_EXCL is set, in which case -EEXIST is returned.  If none exists
 * it is created when NLM_F_CREATE is set, otherwise -ENOENT is returned.
 *
 * Returns -EINVAL when the tuple, mask or master attribute is missing, a
 * negative parse error, or the result of the change/create operation.
 */
static int ctnetlink_new_expect(struct sk_buff *skb,
                                const struct nfnl_info *info,
                                const struct nlattr * const cda[])
{
        u_int8_t family = info->nfmsg->nfgen_family;
        struct nf_conntrack_expect *exp;
        struct nf_conntrack_tuple tuple;
        struct nf_conntrack_zone zone;
        int err;

        if (!(cda[CTA_EXPECT_TUPLE] &&
              cda[CTA_EXPECT_MASK] &&
              cda[CTA_EXPECT_MASTER]))
                return -EINVAL;

        err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
        if (err < 0)
                return err;

        err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE,
                                    family, NULL);
        if (err < 0)
                return err;

        spin_lock_bh(&nf_conntrack_expect_lock);
        exp = __nf_ct_expect_find(info->net, &zone, &tuple);
        if (exp) {
                /* Existing entry: update it under the lock unless the
                 * caller insisted on exclusive creation.
                 */
                if (info->nlh->nlmsg_flags & NLM_F_EXCL)
                        err = -EEXIST;
                else
                        err = ctnetlink_change_expect(exp, cda);
                spin_unlock_bh(&nf_conntrack_expect_lock);
                return err;
        }
        spin_unlock_bh(&nf_conntrack_expect_lock);

        /* Creation happens after the lock has been dropped. */
        if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
                return -ENOENT;

        return ctnetlink_create_expect(info->net, &zone, cda, family,
                                       NETLINK_CB(skb).portid,
                                       nlmsg_report(info->nlh));
}

/* Append one IPCTNL_MSG_EXP_GET_STATS_CPU message for @cpu to @skb.
 *
 * The message carries the expect_new, expect_create and expect_delete
 * counters from @st as big-endian 32-bit attributes; the cpu number is
 * passed to nfnl_msg_put() as htons(cpu).  NLM_F_MULTI is set whenever
 * @portid is non-zero.
 *
 * Returns skb->len on success.  On any failure the partial message is
 * cancelled and -1 is returned.
 */
static int
ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu,
                             const struct ip_conntrack_stat *st)
{
        unsigned int flags = 0;
        unsigned int event;
        struct nlmsghdr *nlh;

        if (portid)
                flags = NLM_F_MULTI;

        event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK,
                              IPCTNL_MSG_EXP_GET_STATS_CPU);
        nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
                           NFNETLINK_V0, htons(cpu));
        if (!nlh)
                goto cancel;

        if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)))
                goto cancel;
        if (nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)))
                goto cancel;
        if (nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete)))
                goto cancel;

        nlmsg_end(skb, nlh);
        return skb->len;

cancel:
        /* Also reached with nlh == NULL when the header could not be added. */
        nlmsg_cancel(skb, nlh);
        return -1;
}

/* Netlink dump callback emitting per-cpu expectation statistics.
 *
 * cb->args[0] holds the next cpu to report, so the dump resumes where the
 * previous invocation stopped.  Once it equals nr_cpu_ids the dump is
 * complete and 0 is returned; otherwise the current skb->len is returned.
 */
static int
ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        int cpu;

        if (cb->args[0] == nr_cpu_ids)
                return 0;

        for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
                const struct ip_conntrack_stat *st;

                /* Ids that are not possible cpus are skipped. */
                if (cpu_possible(cpu)) {
                        st = per_cpu_ptr(net->ct.stat, cpu);
                        /* Stop on failure; this cpu is retried next call. */
                        if (ctnetlink_exp_stat_fill_info(skb,
                                                         NETLINK_CB(cb->skb).portid,
                                                         cb->nlh->nlmsg_seq,
                                                         cpu, st) < 0)
                                break;
                }
        }

        /* Remember where to resume. */
        cb->args[0] = cpu;

        return skb->len;
}

/* Netlink handler for IPCTNL_MSG_EXP_GET_STATS_CPU.
 *
 * Only dump requests (NLM_F_DUMP) do any work: they start a netlink dump
 * driven by ctnetlink_exp_stat_cpu_dump().  Any other request returns 0
 * without doing anything.
 */
static int ctnetlink_stat_exp_cpu(struct sk_buff *skb,
                                  const struct nfnl_info *info,
                                  const struct nlattr * const cda[])
{
        struct netlink_dump_control c = {
                .dump = ctnetlink_exp_stat_cpu_dump,
        };

        if (!(info->nlh->nlmsg_flags & NLM_F_DUMP))
                return 0;

        return netlink_dump_start(info->sk, skb, info->nlh, &c);
}

#ifdef CONFIG_NF_CONNTRACK_EVENTS
/* Event notifier registered per network namespace by ctnetlink_net_init():
 * one callback for conntrack events, one for expectation events.
 */
static struct nf_ct_event_notifier ctnl_notifier = {
        .exp_event = ctnetlink_expect_event,
        .ct_event = ctnetlink_conntrack_event,
};
#endif

/* Handlers for conntrack messages, indexed by IPCTNL_MSG_CT_* type.
 *
 * Every entry uses NFNL_CB_MUTEX.  Entries that parse attributes give the
 * attribute count (CTA_MAX) and the validation policy (ct_nla_policy); the
 * statistics and dying/unconfirmed list handlers specify neither.
 */
static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
        [IPCTNL_MSG_CT_NEW] = {
                .call = ctnetlink_new_conntrack,
                .type = NFNL_CB_MUTEX,
                .attr_count = CTA_MAX,
                .policy = ct_nla_policy,
        },
        [IPCTNL_MSG_CT_GET] = {
                .call = ctnetlink_get_conntrack,
                .type = NFNL_CB_MUTEX,
                .attr_count = CTA_MAX,
                .policy = ct_nla_policy,
        },
        [IPCTNL_MSG_CT_DELETE] = {
                .call = ctnetlink_del_conntrack,
                .type = NFNL_CB_MUTEX,
                .attr_count = CTA_MAX,
                .policy = ct_nla_policy,
        },
        /* Shares its handler with IPCTNL_MSG_CT_GET. */
        [IPCTNL_MSG_CT_GET_CTRZERO] = {
                .call = ctnetlink_get_conntrack,
                .type = NFNL_CB_MUTEX,
                .attr_count = CTA_MAX,
                .policy = ct_nla_policy,
        },
        [IPCTNL_MSG_CT_GET_STATS_CPU] = {
                .call = ctnetlink_stat_ct_cpu,
                .type = NFNL_CB_MUTEX,
        },
        [IPCTNL_MSG_CT_GET_STATS] = {
                .call = ctnetlink_stat_ct,
                .type = NFNL_CB_MUTEX,
        },
        [IPCTNL_MSG_CT_GET_DYING] = {
                .call = ctnetlink_get_ct_dying,
                .type = NFNL_CB_MUTEX,
        },
        [IPCTNL_MSG_CT_GET_UNCONFIRMED] = {
                .call = ctnetlink_get_ct_unconfirmed,
                .type = NFNL_CB_MUTEX,
        },
};

/* Handlers for expectation messages, indexed by IPCTNL_MSG_EXP_* type.
 *
 * Every entry uses NFNL_CB_MUTEX.  The get/new/delete handlers validate up
 * to CTA_EXPECT_MAX attributes against exp_nla_policy; the per-cpu
 * statistics handler specifies no attributes.
 */
static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
        [IPCTNL_MSG_EXP_GET] = {
                .call = ctnetlink_get_expect,
                .type = NFNL_CB_MUTEX,
                .attr_count = CTA_EXPECT_MAX,
                .policy = exp_nla_policy,
        },
        [IPCTNL_MSG_EXP_NEW] = {
                .call = ctnetlink_new_expect,
                .type = NFNL_CB_MUTEX,
                .attr_count = CTA_EXPECT_MAX,
                .policy = exp_nla_policy,
        },
        [IPCTNL_MSG_EXP_DELETE] = {
                .call = ctnetlink_del_expect,
                .type = NFNL_CB_MUTEX,
                .attr_count = CTA_EXPECT_MAX,
                .policy = exp_nla_policy,
        },
        [IPCTNL_MSG_EXP_GET_STATS_CPU] = {
                .call = ctnetlink_stat_exp_cpu,
                .type = NFNL_CB_MUTEX,
        },
};

/* nfnetlink subsystem descriptor for conntrack messages; handlers live in
 * ctnl_cb.
 */
static const struct nfnetlink_subsystem ctnl_subsys = {
        .name = "conntrack",
        .subsys_id = NFNL_SUBSYS_CTNETLINK,
        .cb = ctnl_cb,
        .cb_count = IPCTNL_MSG_MAX,
};

/* nfnetlink subsystem descriptor for expectation messages; handlers live in
 * ctnl_exp_cb.
 */
static const struct nfnetlink_subsystem ctnl_exp_subsys = {
        .name = "conntrack_expect",
        .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP,
        .cb = ctnl_exp_cb,
        .cb_count = IPCTNL_MSG_EXP_MAX,
};

MODULE_ALIAS("ip_conntrack_netlink");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK);
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP);

/* Per-namespace init hook (see ctnetlink_net_ops).
 *
 * With CONFIG_NF_CONNTRACK_EVENTS enabled, registers ctnl_notifier for @net;
 * otherwise does nothing.  Always returns 0.
 */
static int __net_init ctnetlink_net_init(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
        nf_conntrack_register_notifier(net, &ctnl_notifier);
#endif
        return 0;
}

/* Per-namespace pre-exit hook (see ctnetlink_net_ops).
 *
 * With CONFIG_NF_CONNTRACK_EVENTS enabled, unregisters the event notifier
 * that ctnetlink_net_init() registered for @net; otherwise does nothing.
 */
static void ctnetlink_net_pre_exit(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
        nf_conntrack_unregister_notifier(net);
#endif
}

/* Per-network-namespace hooks, registered in ctnetlink_init() and
 * unregistered in ctnetlink_exit().
 */
static struct pernet_operations ctnetlink_net_ops = {
        .init = ctnetlink_net_init,
        .pre_exit = ctnetlink_net_pre_exit,
};

/* Module init: register both nfnetlink subsystems and the pernet ops.
 *
 * Registration order is conntrack subsystem, expectation subsystem, pernet
 * operations.  If a step fails, the steps that already succeeded are undone
 * in reverse order and the failing step's error code is returned.  On
 * success, and with CONFIG_NETFILTER_NETLINK_GLUE_CT enabled, nfnl_ct_hook
 * is pointed at ctnetlink_glue_hook.
 */
static int __init ctnetlink_init(void)
{
        int ret;

        NL_ASSERT_DUMP_CTX_FITS(struct ctnetlink_list_dump_ctx);

        ret = nfnetlink_subsys_register(&ctnl_subsys);
        if (ret < 0) {
                pr_err("ctnetlink_init: cannot register with nfnetlink.\n");
                return ret;
        }

        ret = nfnetlink_subsys_register(&ctnl_exp_subsys);
        if (ret < 0) {
                pr_err("ctnetlink_init: cannot register exp with nfnetlink.\n");
                goto undo_ct_subsys;
        }

        ret = register_pernet_subsys(&ctnetlink_net_ops);
        if (ret < 0) {
                pr_err("ctnetlink_init: cannot register pernet operations\n");
                goto undo_exp_subsys;
        }

#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
        /* setup interaction between nf_queue and nf_conntrack_netlink. */
        RCU_INIT_POINTER(nfnl_ct_hook, &ctnetlink_glue_hook);
#endif
        return 0;

undo_exp_subsys:
        nfnetlink_subsys_unregister(&ctnl_exp_subsys);
undo_ct_subsys:
        nfnetlink_subsys_unregister(&ctnl_subsys);
        return ret;
}

/* Module exit: undo everything ctnetlink_init() set up.
 *
 * Unregisters the pernet operations and both nfnetlink subsystems, clears
 * nfnl_ct_hook when CONFIG_NETFILTER_NETLINK_GLUE_CT is enabled, and then
 * waits for an RCU grace period before returning.
 */
static void __exit ctnetlink_exit(void)
{
        unregister_pernet_subsys(&ctnetlink_net_ops);
        nfnetlink_subsys_unregister(&ctnl_exp_subsys);
        nfnetlink_subsys_unregister(&ctnl_subsys);
#ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT
        RCU_INIT_POINTER(nfnl_ct_hook, NULL);
#endif
        /* NOTE(review): presumably lets in-flight RCU readers of the hook
         * pointer finish before the module text goes away - confirm.
         */
        synchronize_rcu();
}

module_init(ctnetlink_init);
module_exit(ctnetlink_exit);









































































    1 
































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions implement the SCTP primitive functions from Section 10.
 *
 * Note that the descriptions from the specification are USER level
 * functions--this file is the functions which populate the struct proto
 * for SCTP which is the BOTTOM of the sockets interface.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Narasimha Budihal     <narasimha@refcode.org>
 *    Karl Knutson          <karl@athena.chicago.il.us>
 *    Ardelle Fan            <ardelle.fan@intel.com>
 *    Kevin Gao             <kevin.gao@intel.com>
 */

#include <linux/types.h>
#include <linux/list.h> /* For struct list_head */
#include <linux/socket.h>
#include <linux/ip.h>
#include <linux/time.h> /* For struct timeval */
#include <linux/gfp.h>
#include <net/sock.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

/* Generate the function sctp_primitive_<name>(net, asoc, arg).
 *
 * The generated function feeds the primitive SCTP_PRIMITIVE_<name> into the
 * state machine through sctp_do_sm() as an SCTP_EVENT_T_PRIMITIVE event and
 * returns whatever sctp_do_sm() returns.  When @asoc is NULL the state is
 * taken to be SCTP_STATE_CLOSED and a NULL endpoint is passed; otherwise
 * the association's own state and endpoint are used.
 */
#define DECLARE_PRIMITIVE(name) \
/* This is called in the code as sctp_primitive_ ## name.  */ \
int sctp_primitive_ ## name(struct net *net, struct sctp_association *asoc, \
                            void *arg) \
{ \
        union sctp_subtype subtype = \
                SCTP_ST_PRIMITIVE(SCTP_PRIMITIVE_ ## name); \
        enum sctp_state state = SCTP_STATE_CLOSED; \
        struct sctp_endpoint *ep = NULL; \
        \
        if (asoc) { \
                state = asoc->state; \
                ep = asoc->ep; \
        } \
        \
        return sctp_do_sm(net, SCTP_EVENT_T_PRIMITIVE, subtype, state, ep, \
                          asoc, arg, GFP_KERNEL); \
}

/* 10.1 ULP-to-SCTP
 * B) Associate
 *
 * Format: ASSOCIATE(local SCTP instance name, destination transport addr,
 *         outbound stream count)
 * -> association id [,destination transport addr list] [,outbound stream
 *    count]
 *
 * This primitive allows the upper layer to initiate an association to a
 * specific peer endpoint.
 *
 * This version assumes that asoc is fully populated with the initial
 * parameters.  We then return a traditional kernel indicator of
 * success or failure.
 */

/* Defines sctp_primitive_ASSOCIATE(net, asoc, arg), the name callers use. */

DECLARE_PRIMITIVE(ASSOCIATE)

/* 10.1 ULP-to-SCTP
 * C) Shutdown
 *
 * Format: SHUTDOWN(association id)
 * -> result
 *
 * Gracefully closes an association. Any locally queued user data
 * will be delivered to the peer. The association will be terminated only
 * after the peer acknowledges all the SCTP packets sent.  A success code
 * will be returned on successful termination of the association. If
 * attempting to terminate the association results in a failure, an error
 * code shall be returned.
 */

/* Defines sctp_primitive_SHUTDOWN(net, asoc, arg). */
DECLARE_PRIMITIVE(SHUTDOWN);

/* 10.1 ULP-to-SCTP
 * D) Abort
 *
 * Format: Abort(association id [, cause code])
 * -> result
 *
 * Ungracefully closes an association. Any locally queued user data
 * will be discarded and an ABORT chunk is sent to the peer. A success
 * code will be returned on successful abortion of the association. If
 * attempting to abort the association results in a failure, an error
 * code shall be returned.
 */

/* Defines sctp_primitive_ABORT(net, asoc, arg). */
DECLARE_PRIMITIVE(ABORT);

/* 10.1 ULP-to-SCTP
 * E) Send
 *
 * Format: SEND(association id, buffer address, byte count [,context]
 *         [,stream id] [,life time] [,destination transport address]
 *         [,unorder flag] [,no-bundle flag] [,payload protocol-id] )
 * -> result
 *
 * This is the main method to send user data via SCTP.
 *
 * Mandatory attributes:
 *
 *  o association id - local handle to the SCTP association
 *
 *  o buffer address - the location where the user message to be
 *    transmitted is stored;
 *
 *  o byte count - The size of the user data in number of bytes;
 *
 * Optional attributes:
 *
 *  o context - an optional 32 bit integer that will be carried in the
 *    sending failure notification to the ULP if the transportation of
 *    this User Message fails.
 *
 *  o stream id - to indicate which stream to send the data on. If not
 *    specified, stream 0 will be used.
 *
 *  o life time - specifies the life time of the user data. The user data
 *    will not be sent by SCTP after the life time expires. This
 *    parameter can be used to avoid efforts to transmit stale
 *    user messages. SCTP notifies the ULP if the data cannot be
 *    initiated to transport (i.e. sent to the destination via SCTP's
 *    send primitive) within the life time variable. However, the
 *    user data will be transmitted if SCTP has attempted to transmit a
 *    chunk before the life time expired.
 *
 *  o destination transport address - specified as one of the destination
 *    transport addresses of the peer endpoint to which this packet
 *    should be sent. Whenever possible, SCTP should use this destination
 *    transport address for sending the packets, instead of the current
 *    primary path.
 *
 *  o unorder flag - this flag, if present, indicates that the user
 *    would like the data delivered in an unordered fashion to the peer
 *    (i.e., the U flag is set to 1 on all DATA chunks carrying this
 *    message).
 *
 *  o no-bundle flag - instructs SCTP not to bundle this user data with
 *    other outbound DATA chunks. SCTP MAY still bundle even when
 *    this flag is present, when faced with network congestion.
 *
 *  o payload protocol-id - A 32 bit unsigned integer that is to be
 *    passed to the peer indicating the type of payload protocol data
 *    being transmitted. This value is passed as opaque data by SCTP.
 */

/* Defines sctp_primitive_SEND(net, asoc, arg). */
DECLARE_PRIMITIVE(SEND);

/* 10.1 ULP-to-SCTP
 * J) Request Heartbeat
 *
 * Format: REQUESTHEARTBEAT(association id, destination transport address)
 *
 * -> result
 *
 * Instructs the local endpoint to perform a HeartBeat on the specified
 * destination transport address of the given association. The returned
 * result should indicate whether the transmission of the HEARTBEAT
 * chunk to the destination address is successful.
 *
 * Mandatory attributes:
 *
 * o association id - local handle to the SCTP association
 *
 * o destination transport address - the transport address of the
 *   association on which a heartbeat should be issued.
 */

/* Defines sctp_primitive_REQUESTHEARTBEAT(net, asoc, arg). */
DECLARE_PRIMITIVE(REQUESTHEARTBEAT);

/* ADDIP
* 3.1.1 Address Configuration Change Chunk (ASCONF)
*
* This chunk is used to communicate to the remote endpoint one of the
* configuration change requests that MUST be acknowledged.  The
* information carried in the ASCONF Chunk uses the form of a
* Type-Length-Value (TLV), as described in "3.2.1 Optional/
* Variable-length Parameter Format" in RFC2960 [5], for all variable
* parameters.
*/

/* Defines sctp_primitive_ASCONF(net, asoc, arg). */
DECLARE_PRIMITIVE(ASCONF);

/* RE-CONFIG 5.1
 * Defines sctp_primitive_RECONF(net, asoc, arg).
 * NOTE(review): "5.1" presumably refers to a section of the stream
 * reconfiguration specification - confirm the document.
 */
DECLARE_PRIMITIVE(RECONF);




































































































    1 











    1 












    1 






















    1 










    2 








    1 










    2 
























































































































































    3 



    2 



    3 











































































    2 

































































































































































































    2 




    2 












    2 















    1 

























    1 







    1 













    3 

    2 

























    3 






    3 















    3 

    2 











    3 
























    2 









    3 









    2 












    3 


    3 

    3 









    1 






    1 








    3 















    2 



    2 






















































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
// SPDX-License-Identifier: GPL-2.0-only
/*
 * (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2011 Patrick McHardy <kaber@trash.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <net/xfrm.h>
#include <linux/siphash.h>
#include <linux/rtnetlink.h>

#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <uapi/linux/netfilter/nf_nat.h>

#include "nf_internals.h"

/* NOTE(review): presumably the upper bound on tuple-selection attempts;
 * the user of this constant is outside this excerpt - confirm.
 */
#define NF_NAT_MAX_ATTEMPTS        128
/* nf_nat_used_tuple_harder() only considers evicting a colliding entry once
 * the number of attempts left is at or below this threshold.
 */
#define NF_NAT_HARDER_THRESH        (NF_NAT_MAX_ATTEMPTS / 4)

/* NOTE(review): presumably protects the nf_nat_bysource buckets - confirm. */
static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];

static DEFINE_MUTEX(nf_nat_proto_mutex);
static unsigned int nat_net_id __read_mostly;

/* By-source hash table: bucket heads, bucket count (the scaling bound in
 * hash_by_src()) and the siphash key used by hash_by_src().
 */
static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
static siphash_aligned_key_t nf_nat_hash_rnd;

/* NOTE(review): by its name, private state of a NAT lookup hook; the users
 * of this struct are outside this excerpt - confirm.
 */
struct nf_nat_lookup_hook_priv {
        /* RCU-annotated pointer to a set of hook entries. */
        struct nf_hook_entries __rcu *entries;

        /* NOTE(review): presumably used to free this object after an RCU
         * grace period - confirm.
         */
        struct rcu_head rcu_head;
};

/* NAT hook bookkeeping; struct nat_net holds one of these per protocol
 * family slot.
 */
struct nf_nat_hooks_net {
        /* NOTE(review): presumably the hook ops registered for this
         * family - confirm.
         */
        struct nf_hook_ops *nat_hook_ops;
        /* NOTE(review): presumably a count of users of those hooks -
         * confirm.
         */
        unsigned int users;
};

/* NAT state with one nf_nat_hooks_net slot per NFPROTO_* value.
 * NOTE(review): presumably the per-namespace data looked up through
 * nat_net_id - confirm.
 */
struct nat_net {
        struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO];
};

#ifdef CONFIG_XFRM
/* Fill the IPv4 flow key in @fl from the conntrack tuple of @ct for @dir.
 *
 * If @statusbit is set in ct->status, the flow's destination address is
 * taken from the tuple, plus the destination port for TCP, UDP, UDP-Lite,
 * DCCP and SCTP.  @statusbit is then XORed with IPS_NAT_MASK and, if the
 * resulting bit is set in ct->status, the source address (and source port
 * for the same protocols) is taken from the tuple too.
 *
 * @skb is not used.
 */
static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
                                       const struct nf_conn *ct,
                                       enum ip_conntrack_dir dir,
                                       unsigned long statusbit,
                                       struct flowi *fl)
{
        const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
        struct flowi4 *fl4 = &fl->u.ip4;
        bool has_ports;

        /* Only these protocols have their port copied into the flow. */
        switch (t->dst.protonum) {
        case IPPROTO_TCP:
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
        case IPPROTO_DCCP:
        case IPPROTO_SCTP:
                has_ports = true;
                break;
        default:
                has_ports = false;
                break;
        }

        if (ct->status & statusbit) {
                fl4->daddr = t->dst.u3.ip;
                if (has_ports)
                        fl4->fl4_dport = t->dst.u.all;
        }

        statusbit ^= IPS_NAT_MASK;

        if (ct->status & statusbit) {
                fl4->saddr = t->src.u3.ip;
                if (has_ports)
                        fl4->fl4_sport = t->src.u.all;
        }
}

/* IPv6 counterpart of nf_nat_ipv4_decode_session(): fill the IPv6 flow key
 * in @fl from the conntrack tuple of @ct for @dir.
 *
 * If @statusbit is set in ct->status, the destination address (and the
 * destination port for TCP, UDP, UDP-Lite, DCCP and SCTP) is taken from the
 * tuple.  @statusbit is then XORed with IPS_NAT_MASK and the same is done
 * for the source side.  The body is empty when CONFIG_IPV6 is not enabled.
 *
 * @skb is not used.
 */
static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
                                       const struct nf_conn *ct,
                                       enum ip_conntrack_dir dir,
                                       unsigned long statusbit,
                                       struct flowi *fl)
{
#if IS_ENABLED(CONFIG_IPV6)
        const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
        struct flowi6 *fl6 = &fl->u.ip6;
        bool has_ports;

        /* Only these protocols have their port copied into the flow. */
        switch (t->dst.protonum) {
        case IPPROTO_TCP:
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
        case IPPROTO_DCCP:
        case IPPROTO_SCTP:
                has_ports = true;
                break;
        default:
                has_ports = false;
                break;
        }

        if (ct->status & statusbit) {
                fl6->daddr = t->dst.u3.in6;
                if (has_ports)
                        fl6->fl6_dport = t->dst.u.all;
        }

        statusbit ^= IPS_NAT_MASK;

        if (ct->status & statusbit) {
                fl6->saddr = t->src.u3.in6;
                if (has_ports)
                        fl6->fl6_sport = t->src.u.all;
        }
#endif
}

/* Adjust the flow key @fl using the conntrack entry attached to @skb.
 *
 * Does nothing when the skb has no conntrack.  The packet direction selects
 * the first status bit to test (IPS_DST_NAT for the original direction,
 * IPS_SRC_NAT otherwise) and the conntrack's L3 protocol selects the IPv4
 * or IPv6 decoder.  Other families are ignored.
 */
static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
{
        enum ip_conntrack_info ctinfo;
        enum ip_conntrack_dir dir;
        unsigned long statusbit;
        const struct nf_conn *ct;
        u8 family;

        ct = nf_ct_get(skb, &ctinfo);
        if (!ct)
                return;

        family = nf_ct_l3num(ct);
        dir = CTINFO2DIR(ctinfo);
        statusbit = (dir == IP_CT_DIR_ORIGINAL) ? IPS_DST_NAT : IPS_SRC_NAT;

        switch (family) {
        case NFPROTO_IPV4:
                nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl);
                break;
        case NFPROTO_IPV6:
                nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl);
                break;
        default:
                break;
        }
}
#endif /* CONFIG_XFRM */

/* We keep an extra hash for each conntrack, for fast searching. */
static unsigned int
hash_by_src(const struct net *net,
            const struct nf_conntrack_zone *zone,
            const struct nf_conntrack_tuple *tuple)
{
        unsigned int hash;
        struct {
                struct nf_conntrack_man src;
                u32 net_mix;
                u32 protonum;
                u32 zone;
        } __aligned(SIPHASH_ALIGNMENT) combined;

        get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));

        memset(&combined, 0, sizeof(combined));

        /* Original src, to ensure we map it consistently if poss. */
        combined.src = tuple->src;
        combined.net_mix = net_hash_mix(net);
        combined.protonum = tuple->dst.protonum;

        /* Zone ID can be used provided its valid for both directions */
        if (zone->dir == NF_CT_DEFAULT_ZONE_DIR)
                combined.zone = zone->id;

        hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd);

        return reciprocal_scale(hash, nf_nat_htable_size);
}

/* Is this tuple already taken? (not by us) */
static int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
                  const struct nf_conn *ignored_conntrack)
{
        /* Conntrack tracking doesn't keep track of outgoing tuples; only
         * incoming ones.  NAT means they don't have a fixed mapping,
         * so we invert the tuple and look for the incoming reply.
         *
         * We could keep a separate hash if this proves too slow.
         */
        struct nf_conntrack_tuple reply;

        nf_ct_invert_tuple(&reply, tuple);
        return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}

static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
{
        static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
                                                  IPS_DYING;
        static const unsigned long flags_needed = IPS_SRC_NAT;
        enum tcp_conntrack old_state;

        old_state = READ_ONCE(ct->proto.tcp.state);
        if (old_state < TCP_CONNTRACK_TIME_WAIT)
                return false;

        if (flags & flags_refuse)
                return false;

        return (flags & flags_needed) == flags_needed;
}

/* reverse direction will send packets to new source, so
 * make sure such packets are invalid.
 */
static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new)
{
        return (__s32)(new->proto.tcp.seen[0].td_end -
                       old->proto.tcp.seen[0].td_end) > 0;
}

/* Like nf_nat_used_tuple(), but may evict a colliding entry on the last
 * attempts.
 *
 * While more than NF_NAT_HARDER_THRESH attempts are left, or the tuple is
 * not TCP, or @ignored_conntrack is not in SYN_SENT state, this is a plain
 * lookup of the inverted tuple.  Otherwise the colliding conntrack is
 * looked up and killed if it qualifies (see the checks below).
 *
 * Returns non-zero if the tuple must be treated as taken, 0 if it is free.
 */
static int
nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple,
                         const struct nf_conn *ignored_conntrack,
                         unsigned int attempts_left)
{
        static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD;
        struct nf_conntrack_tuple_hash *thash;
        const struct nf_conntrack_zone *zone;
        struct nf_conntrack_tuple reply;
        unsigned long flags;
        struct nf_conn *ct;
        bool taken = true;
        struct net *net;

        nf_ct_invert_tuple(&reply, tuple);

        if (attempts_left > NF_NAT_HARDER_THRESH ||
            tuple->dst.protonum != IPPROTO_TCP ||
            ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT)
                return nf_conntrack_tuple_taken(&reply, ignored_conntrack);

        /* Last few attempts to find a free tcp port. Destructive
         * action: evict the colliding entry if it is in timewait state
         * and the tcp sequence number has advanced past the one used by
         * the old entry.
         */
        net = nf_ct_net(ignored_conntrack);
        zone = nf_ct_zone(ignored_conntrack);

        thash = nf_conntrack_find_get(net, zone, &reply);
        /* No colliding entry: the tuple is free. */
        if (!thash)
                return false;

        ct = nf_ct_tuplehash_to_ctrack(thash);

        /* A match on the entry's original-direction tuple is never evicted. */
        if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL)
                goto out;

        if (WARN_ON_ONCE(ct == ignored_conntrack))
                goto out;

        /* Snapshot the status once; all checks below use this value. */
        flags = READ_ONCE(ct->status);
        if (!nf_nat_may_kill(ct, flags))
                goto out;

        if (!nf_seq_has_advanced(ct, ignored_conntrack))
                goto out;

        /* Even if we can evict do not reuse if entry is offloaded. */
        if (nf_ct_kill(ct))
                taken = flags & flags_offload;
out:
        /* Release the entry obtained from nf_conntrack_find_get(). */
        nf_ct_put(ct);
        return taken;
}

static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
                                 const struct nf_nat_range2 *range)
{
        if (t->src.l3num == NFPROTO_IPV4)
                return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
                       ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);

        return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
               ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
}

/* Check whether the manipulable layer-4 part of @tuple lies within the
 * inclusive [@min, @max] interval.
 *
 * ICMP and ICMPv6 compare the source-side id regardless of @maniptype.
 * The protocols listed below compare the source or destination key
 * selected by @maniptype.  Any other protocol is reported as in range.
 */
static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
                             enum nf_nat_manip_type maniptype,
                             const union nf_conntrack_man_proto *min,
                             const union nf_conntrack_man_proto *max)
{
        unsigned int val, lo, hi;

        switch (tuple->dst.protonum) {
        case IPPROTO_ICMP:
        case IPPROTO_ICMPV6:
                val = ntohs(tuple->src.u.icmp.id);
                lo = ntohs(min->icmp.id);
                hi = ntohs(max->icmp.id);
                break;
        /* All of these share the generic 16-bit key. */
        case IPPROTO_GRE:
        case IPPROTO_TCP:
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
        case IPPROTO_DCCP:
        case IPPROTO_SCTP: {
                const __be16 port = maniptype == NF_NAT_MANIP_SRC ?
                                    tuple->src.u.all : tuple->dst.u.all;

                val = ntohs(port);
                lo = ntohs(min->all);
                hi = ntohs(max->all);
                break;
        }
        default:
                return true;
        }

        return lo <= val && val <= hi;
}

/* Would source-mapping to @tuple satisfy the constraints of @range?
 *
 * Returns 0 when the range maps IPs and the tuple's source address is
 * outside it, 1 when no protocol range was specified, and otherwise the
 * result of the source-side layer-4 range check.
 */
static int nf_in_range(const struct nf_conntrack_tuple *tuple,
                    const struct nf_nat_range2 *range)
{
        /* An address outside a mapped range forces a new source IP. */
        if ((range->flags & NF_NAT_RANGE_MAP_IPS) &&
            !nf_nat_inet_in_range(tuple, range))
                return 0;

        if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)
                return l4proto_in_range(tuple, NF_NAT_MANIP_SRC,
                                        &range->min_proto,
                                        &range->max_proto);

        return 1;
}

/* Nonzero when the original-direction tuple of @ct has the same
 * protocol number, source address and source layer-4 key as @tuple.
 */
static inline int
same_src(const struct nf_conn *ct,
         const struct nf_conntrack_tuple *tuple)
{
        const struct nf_conntrack_tuple *orig =
                &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;

        if (orig->dst.protonum != tuple->dst.protonum)
                return 0;

        if (!nf_inet_addr_cmp(&orig->src.u3, &tuple->src.u3))
                return 0;

        return orig->src.u.all == tuple->src.u.all;
}

/* Only called for SRC manip.
 *
 * Walk the by-source hash bucket for @tuple looking for a conntrack in
 * the same netns and zone that has the same source.  For each such entry
 * build @result from the inverse of its reply tuple, keeping @tuple's
 * destination, and accept it if it fits @range.
 *
 * Returns 1 with @result filled in on success, 0 otherwise.  @result may
 * have been overwritten even when 0 is returned.
 */
static int
find_appropriate_src(struct net *net,
                     const struct nf_conntrack_zone *zone,
                     const struct nf_conntrack_tuple *tuple,
                     struct nf_conntrack_tuple *result,
                     const struct nf_nat_range2 *range)
{
        unsigned int h = hash_by_src(net, zone, tuple);
        const struct nf_conn *ct;

        hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
                if (!same_src(ct, tuple) ||
                    !net_eq(net, nf_ct_net(ct)) ||
                    !nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
                        continue;

                /* Take the source part from the existing mapping. */
                nf_ct_invert_tuple(result,
                                   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
                result->dst = tuple->dst;

                if (nf_in_range(result, range))
                        return 1;
        }

        return 0;
}

/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 *
 * @zone:      conntrack zone; its id is mixed into the hash seed
 * @tuple:     tuple whose source or destination address is rewritten
 * @range:     NAT range supplying flags and the min/max addresses
 * @ct:        conntrack, used only to determine the layer-3 family
 * @maniptype: selects whether the source or the destination is rewritten
 *
 * Does nothing unless NF_NAT_RANGE_MAP_IPS is set in @range.
 */
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
                    struct nf_conntrack_tuple *tuple,
                    const struct nf_nat_range2 *range,
                    const struct nf_conn *ct,
                    enum nf_nat_manip_type maniptype)
{
        union nf_inet_addr *var_ipp;
        unsigned int i, max;
        /* Host order */
        u32 minip, maxip, j, dist;
        bool full_range;

        /* No IP mapping?  Do nothing. */
        if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
                return;

        if (maniptype == NF_NAT_MANIP_SRC)
                var_ipp = &tuple->src.u3;
        else
                var_ipp = &tuple->dst.u3;

        /* Fast path: only one choice. */
        if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
                *var_ipp = range->min_addr;
                return;
        }

        /* max is the index of the last 32-bit word of the address. */
        if (nf_ct_l3num(ct) == NFPROTO_IPV4)
                max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
        else
                max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;

        /* Hashing source and destination IPs gives a fairly even
         * spread in practice (if there are a small number of IPs
         * involved, there usually aren't that many connections
         * anyway).  The consistency means that servers see the same
         * client coming from the same IP (some Internet Banking sites
         * like this), even across reboots.
         *
         * With NF_NAT_RANGE_PERSISTENT the seed is 0, so only the source
         * address feeds the hash; otherwise the last destination word
         * XORed with the zone id is used as the seed.
         */
        j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
                   range->flags & NF_NAT_RANGE_PERSISTENT ?
                        0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);

        /* Pick the address one 32-bit word at a time, starting at word 0. */
        full_range = false;
        for (i = 0; i <= max; i++) {
                /* If first bytes of the address are at the maximum, use the
                 * distance. Otherwise use the full range.
                 */
                if (!full_range) {
                        minip = ntohl((__force __be32)range->min_addr.all[i]);
                        maxip = ntohl((__force __be32)range->max_addr.all[i]);
                        dist  = maxip - minip + 1;
                } else {
                        minip = 0;
                        dist  = ~0;
                }

                var_ipp->all[i] = (__force __u32)
                        htonl(minip + reciprocal_scale(j, dist));
                /* Once a word differs from the range maximum, the
                 * remaining words are no longer limited by min/max.
                 */
                if (var_ipp->all[i] != range->max_addr.all[i])
                        full_range = true;

                /* Non-persistent ranges fold each destination word into
                 * the hash used for the next word.
                 */
                if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
                        j ^= (__force u32)tuple->dst.u3.all[i];
        }
}

/* Alter the per-proto part of the tuple (depending on maniptype), to
 * give a unique tuple in the given range if possible.
 *
 * Per-protocol part of tuple is initialized to the incoming packet.
 *
 * @tuple:     tuple to modify in place
 * @range:     NAT range; flags select explicit bounds, offset mode and
 *             randomisation
 * @maniptype: selects the source or destination key (ICMP always uses
 *             the source-side id)
 * @ct:        conntrack being set up; passed on to the in-use check and
 *             consulted for a master in the GRE case
 *
 * If no unused value is found the key is left at the last candidate that
 * was tried.
 */
static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
                                        const struct nf_nat_range2 *range,
                                        enum nf_nat_manip_type maniptype,
                                        const struct nf_conn *ct)
{
        unsigned int range_size, min, max, i, attempts;
        __be16 *keyptr;
        u16 off;

        /* Select the key to rewrite; ICMP and GRE also set their own
         * search window and jump straight to the search.
         */
        switch (tuple->dst.protonum) {
        case IPPROTO_ICMP:
        case IPPROTO_ICMPV6:
                /* id is same for either direction... */
                keyptr = &tuple->src.u.icmp.id;
                if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
                        min = 0;
                        range_size = 65536;
                } else {
                        min = ntohs(range->min_proto.icmp.id);
                        range_size = ntohs(range->max_proto.icmp.id) -
                                     ntohs(range->min_proto.icmp.id) + 1;
                }
                goto find_free_id;
#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
        case IPPROTO_GRE:
                /* If there is no master conntrack we are not PPTP,
                   do not change tuples */
                if (!ct->master)
                        return;

                if (maniptype == NF_NAT_MANIP_SRC)
                        keyptr = &tuple->src.u.gre.key;
                else
                        keyptr = &tuple->dst.u.gre.key;

                if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
                        min = 1;
                        range_size = 65535;
                } else {
                        min = ntohs(range->min_proto.gre.key);
                        range_size = ntohs(range->max_proto.gre.key) - min + 1;
                }
                goto find_free_id;
#endif
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
        case IPPROTO_TCP:
        case IPPROTO_SCTP:
        case IPPROTO_DCCP:
                if (maniptype == NF_NAT_MANIP_SRC)
                        keyptr = &tuple->src.u.all;
                else
                        keyptr = &tuple->dst.u.all;

                break;
        default:
                /* Any other protocol: leave the tuple untouched. */
                return;
        }

        /* If no range specified... */
        if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
                /* If it's dst rewrite, can't change port */
                if (maniptype == NF_NAT_MANIP_DST)
                        return;

                /* Stay in the same port class as the current port:
                 * 1-511, 600-1023 or 1024-65535.
                 */
                if (ntohs(*keyptr) < 1024) {
                        /* Loose convention: >> 512 is credential passing */
                        if (ntohs(*keyptr) < 512) {
                                min = 1;
                                range_size = 511 - min + 1;
                        } else {
                                min = 600;
                                range_size = 1023 - min + 1;
                        }
                } else {
                        min = 1024;
                        range_size = 65535 - 1024 + 1;
                }
        } else {
                min = ntohs(range->min_proto.all);
                max = ntohs(range->max_proto.all);
                /* Tolerate bounds given in the wrong order. */
                if (unlikely(max < min))
                        swap(max, min);
                range_size = max - min + 1;
        }

find_free_id:
        /* Starting offset: relative to base_proto in offset mode, random
         * for RANDOM_ALL or any non-DST manip, otherwise 0.
         */
        if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
                off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
        else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) ||
                 maniptype != NF_NAT_MANIP_DST)
                off = get_random_u16();
        else
                off = 0;

        /* Never probe more than NF_NAT_MAX_ATTEMPTS candidates per round. */
        attempts = range_size;
        if (attempts > NF_NAT_MAX_ATTEMPTS)
                attempts = NF_NAT_MAX_ATTEMPTS;

        /* We are in softirq; doing a search of the entire range risks
         * soft lockup when all tuples are already used.
         *
         * If we can't find any free port from first offset, pick a new
         * one and try again, with ever smaller search window.
         */
another_round:
        for (i = 0; i < attempts; i++, off++) {
                *keyptr = htons(min + off % range_size);
                /* attempts - i is the number of tries left in this round,
                 * counting the current one.
                 */
                if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
                        return;
        }

        /* Stop once the whole range was covered or the window has become
         * small; otherwise halve the window and restart at a new random
         * offset.
         */
        if (attempts >= range_size || attempts < 16)
                return;
        attempts /= 2;
        off = get_random_u16();
        goto another_round;
}

/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __nf_conntrack_confirm and drop the packet.
 *
 * @tuple:      output; receives the chosen tuple
 * @orig_tuple: the tuple as it currently looks before this manipulation
 * @range:      NAT range constraining addresses and protocol keys
 * @ct:         conntrack being set up; ignored by the in-use checks
 * @maniptype:  source or destination manipulation
 */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
                 const struct nf_conntrack_tuple *orig_tuple,
                 const struct nf_nat_range2 *range,
                 struct nf_conn *ct,
                 enum nf_nat_manip_type maniptype)
{
        const struct nf_conntrack_zone *zone;
        struct net *net = nf_ct_net(ct);

        zone = nf_ct_zone(ct);

        /* 1) If this srcip/proto/src-proto-part is currently mapped,
         * and that same mapping gives a unique tuple within the given
         * range, use that.
         *
         * This is only required for source (ie. NAT/masq) mappings.
         * So far, we don't do local source mappings, so multiple
         * manips not an issue.
         */
        if (maniptype == NF_NAT_MANIP_SRC &&
            !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
                /* try the original tuple first */
                if (nf_in_range(orig_tuple, range)) {
                        if (!nf_nat_used_tuple(orig_tuple, ct)) {
                                *tuple = *orig_tuple;
                                return;
                        }
                } else if (find_appropriate_src(net, zone,
                                                orig_tuple, tuple, range)) {
                        pr_debug("get_unique_tuple: Found current src map\n");
                        if (!nf_nat_used_tuple(tuple, ct))
                                return;
                }
        }

        /* 2) Select the least-used IP/proto combination in the given range */
        *tuple = *orig_tuple;
        find_best_ips_proto(zone, tuple, range, ct, maniptype);

        /* 3) The per-protocol part of the manip is made to map into
         * the range to make a unique tuple.
         */

        /* Only bother mapping if it's not already in range and unique */
        if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
                if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
                        /* Keep the current key when not in offset mode,
                         * it is already inside the range, and either the
                         * range holds a single value or the tuple is
                         * unused.
                         */
                        if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
                            l4proto_in_range(tuple, maniptype,
                                             &range->min_proto,
                                             &range->max_proto) &&
                            (range->min_proto.all == range->max_proto.all ||
                             !nf_nat_used_tuple(tuple, ct)))
                                return;
                } else if (!nf_nat_used_tuple(tuple, ct)) {
                        return;
                }
        }

        /* Last chance: get protocol to try to obtain unique tuple. */
        nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);
}

/* Return the NAT extension of @ct.  If there is none and @ct is not yet
 * confirmed, try to add one with GFP_ATOMIC.  For a confirmed conntrack
 * without the extension the result is NULL.
 */
struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
{
        struct nf_conn_nat *nat = nfct_nat(ct);

        if (!nat && !nf_ct_is_confirmed(ct))
                nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);

        return nat;
}
EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);

/* Establish the NAT mapping of type @maniptype for the unconfirmed
 * conntrack @ct according to @range.
 *
 * Returns NF_ACCEPT on success, and also when @ct is already confirmed
 * (nothing is done in that case).  Returns NF_DROP when this manip type
 * was already initialised for @ct or when the sequence-adjustment
 * extension cannot be added.
 */
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
                  const struct nf_nat_range2 *range,
                  enum nf_nat_manip_type maniptype)
{
        struct net *net = nf_ct_net(ct);
        struct nf_conntrack_tuple curr_tuple, new_tuple;

        /* Can't setup nat info for confirmed ct. */
        if (nf_ct_is_confirmed(ct))
                return NF_ACCEPT;

        WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
                maniptype != NF_NAT_MANIP_DST);

        if (WARN_ON(nf_nat_initialized(ct, maniptype)))
                return NF_DROP;

        /* What we've got will look like inverse of reply. Normally
         * this is what is in the conntrack, except for prior
         * manipulations (future optimization: if num_manips == 0,
         * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
         */
        nf_ct_invert_tuple(&curr_tuple,
                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

        /* Only touch the conntrack if the chosen tuple actually differs. */
        if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
                struct nf_conntrack_tuple reply;

                /* Alter conntrack table so will recognize replies. */
                nf_ct_invert_tuple(&reply, &new_tuple);
                nf_conntrack_alter_reply(ct, &reply);

                /* Non-atomic: we own this at the moment. */
                if (maniptype == NF_NAT_MANIP_SRC)
                        ct->status |= IPS_SRC_NAT;
                else
                        ct->status |= IPS_DST_NAT;

                /* A conntrack with a helper needs the seqadj extension. */
                if (nfct_help(ct) && !nfct_seqadj(ct))
                        if (!nfct_seqadj_ext_add(ct))
                                return NF_DROP;
        }

        /* Source manips are linked into the by-source hash, keyed on the
         * original-direction tuple, under the matching bucket lock.
         */
        if (maniptype == NF_NAT_MANIP_SRC) {
                unsigned int srchash;
                spinlock_t *lock;

                srchash = hash_by_src(net, nf_ct_zone(ct),
                                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
                lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
                spin_lock_bh(lock);
                hlist_add_head_rcu(&ct->nat_bysource,
                                   &nf_nat_bysource[srchash]);
                spin_unlock_bh(lock);
        }

        /* It's done. */
        if (maniptype == NF_NAT_MANIP_DST)
                ct->status |= IPS_DST_NAT_DONE;
        else
                ct->status |= IPS_SRC_NAT_DONE;

        return NF_ACCEPT;
}
EXPORT_SYMBOL(nf_nat_setup_info);

/* Set up a "null" binding for @ct: a range whose min and max address are
 * both the address currently seen in the reply tuple, with no protocol
 * range given (NF_NAT_RANGE_PROTO_SPECIFIED is not set), so the
 * per-protocol part is left to the protocol code.
 *
 * The reply tuple is used in case it has already been mangled (e.g. for
 * a local packet).  Returns the verdict of nf_nat_setup_info().
 */
static unsigned int
__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
{
        struct nf_nat_range2 range = {
                .flags = NF_NAT_RANGE_MAP_IPS,
        };

        if (manip == NF_NAT_MANIP_SRC)
                range.min_addr = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3;
        else
                range.min_addr = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3;

        range.max_addr = range.min_addr;

        return nf_nat_setup_info(ct, &range, manip);
}

unsigned int
nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
        return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
}
EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);

/* Apply to @skb the manipulation that nf_nat_setup_info() recorded in
 * @ct for this hook.
 *
 * The manip type follows from @hooknum and the direction from @ctinfo.
 * Returns NF_ACCEPT when the relevant NAT status bit is not set,
 * otherwise the verdict of nf_nat_manip_pkt().
 */
unsigned int nf_nat_packet(struct nf_conn *ct,
                           enum ip_conntrack_info ctinfo,
                           unsigned int hooknum,
                           struct sk_buff *skb)
{
        enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        unsigned long statusbit;

        statusbit = mtype == NF_NAT_MANIP_SRC ? IPS_SRC_NAT : IPS_DST_NAT;

        /* For the reply direction the opposite NAT bit is the relevant one. */
        if (dir == IP_CT_DIR_REPLY)
                statusbit ^= IPS_NAT_MASK;

        /* Non-atomic: these bits don't change. */
        if (!(ct->status & statusbit))
                return NF_ACCEPT;

        return nf_nat_manip_pkt(skb, ct, mtype, dir);
}
EXPORT_SYMBOL_GPL(nf_nat_packet);

/* True when the packet is in POST_ROUTING and its output device is an L3
 * master device.  Always false without CONFIG_NET_L3_MASTER_DEV.
 */
static bool in_vrf_postrouting(const struct nf_hook_state *state)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        return state->hook == NF_INET_POST_ROUTING &&
               netif_is_l3_master(state->out);
#else
        return false;
#endif
}

/* Common NAT hook function.
 *
 * @priv:  struct nf_nat_lookup_hook_priv holding the registered lookup
 *         hook entries for this hook
 * @skb:   packet being processed
 * @state: hook state; supplies the hook number and output device
 *
 * For a new or related connection whose NAT of this manip type is not yet
 * initialised, the registered lookup hooks are run in order until one
 * sets up NAT or returns a verdict other than NF_ACCEPT (which is then
 * returned).  If none sets up NAT, a null binding is allocated.  Finally
 * the packet is mangled via nf_nat_packet().
 *
 * Packets with no conntrack, or seen in VRF postrouting, are accepted
 * untouched.  If nf_nat_oif_changed() reports a change, the conntrack is
 * killed and the packet dropped.
 */
unsigned int
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
               const struct nf_hook_state *state)
{
        struct nf_conn *ct;
        enum ip_conntrack_info ctinfo;
        struct nf_conn_nat *nat;
        /* maniptype == SRC for postrouting. */
        enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);

        ct = nf_ct_get(skb, &ctinfo);
        /* Can't track?  It's not due to stress, or conntrack would
         * have dropped it.  Hence it's the user's responsibility to
         * packet filter it out, or implement conntrack/NAT for that
         * protocol. 8) --RR
         */
        if (!ct || in_vrf_postrouting(state))
                return NF_ACCEPT;

        nat = nfct_nat(ct);

        switch (ctinfo) {
        case IP_CT_RELATED:
        case IP_CT_RELATED_REPLY:
                /* Only ICMPs can be IP_CT_IS_REPLY.  Fallthrough */
        case IP_CT_NEW:
                /* Seen it before?  This can happen for loopback, retrans,
                 * or local packets.
                 */
                if (!nf_nat_initialized(ct, maniptype)) {
                        struct nf_nat_lookup_hook_priv *lpriv = priv;
                        struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
                        unsigned int ret;
                        int i;

                        /* No lookup hooks registered: null binding. */
                        if (!e)
                                goto null_bind;

                        for (i = 0; i < e->num_hook_entries; i++) {
                                ret = e->hooks[i].hook(e->hooks[i].priv, skb,
                                                       state);
                                if (ret != NF_ACCEPT)
                                        return ret;
                                /* This hook set up NAT: skip the rest. */
                                if (nf_nat_initialized(ct, maniptype))
                                        goto do_nat;
                        }
null_bind:
                        ret = nf_nat_alloc_null_binding(ct, state->hook);
                        if (ret != NF_ACCEPT)
                                return ret;
                } else {
                        pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n",
                                 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
                                 ct, ct->status);
                        if (nf_nat_oif_changed(state->hook, ctinfo, nat,
                                               state->out))
                                goto oif_changed;
                }
                break;
        default:
                /* ESTABLISHED */
                WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
                        ctinfo != IP_CT_ESTABLISHED_REPLY);
                if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
                        goto oif_changed;
        }
do_nat:
        return nf_nat_packet(ct, ctinfo, state->hook, skb);

oif_changed:
        /* Kill the conntrack and drop the packet. */
        nf_ct_kill_acct(ct, ctinfo, skb);
        return NF_DROP;
}
EXPORT_SYMBOL_GPL(nf_nat_inet_fn);

/* Filter handed to nf_nat_proto_remove() through its opaque data
 * pointer.  A field set to zero is not checked, i.e. it matches any
 * protocol at that layer.
 */
struct nf_nat_proto_clean {
        u8        l3proto;        /* layer-3 protocol to match, 0 = any */
        u8        l4proto;        /* layer-4 protocol to match, 0 = any */
};

/* Conntrack iterator callback: select conntracks that carry NAT state.
 *
 * @data points to a struct nf_nat_proto_clean filter.  Returns 1 when @i
 * passes the filter and has any IPS_NAT_MASK bit set, 0 otherwise.
 */
static int nf_nat_proto_remove(struct nf_conn *i, void *data)
{
        const struct nf_nat_proto_clean *filter = data;

        if (filter->l3proto && filter->l3proto != nf_ct_l3num(i))
                return 0;

        if (filter->l4proto && filter->l4proto != nf_ct_protonum(i))
                return 0;

        return (i->status & IPS_NAT_MASK) != 0;
}

/* Unlink @ct from the by-source hash while holding the lock that covers
 * its bucket.  The bucket is derived from the original-direction tuple.
 */
static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
        const unsigned int h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct),
                                           &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        const unsigned int slot = h % CONNTRACK_LOCKS;

        spin_lock_bh(&nf_nat_locks[slot]);
        hlist_del_rcu(&ct->nat_bysource);
        spin_unlock_bh(&nf_nat_locks[slot]);
}

/* Conntrack iterator callback used when this module is being removed.
 *
 * Returns 1 for conntracks selected by nf_nat_proto_remove().  All
 * others are kept (return 0), but are unlinked from the by-source hash
 * if IPS_SRC_NAT_DONE was set.
 */
static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
{
        if (!nf_nat_proto_remove(ct, data)) {
                /* This module is being removed and the conntrack has a
                 * nat null binding.  Remove it from the bysource hash,
                 * as the table will be freed soon.
                 *
                 * Otherwise, when the conntrack is destroyed,
                 * nf_nat_cleanup_conntrack() would delete the entry from
                 * an already-freed table.
                 */
                if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status))
                        nf_nat_cleanup_conntrack(ct);

                /* Don't delete the conntrack.  Although that would make
                 * things a lot simpler, we'd end up flushing all
                 * conntracks on nat rmmod.
                 */
                return 0;
        }

        return 1;
}

#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

/* Netlink attribute policy for the nested protocol-NAT attributes: the
 * minimum and maximum port are both 16-bit values.
 */
static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
        [CTA_PROTONAT_PORT_MIN]        = { .type = NLA_U16 },
        [CTA_PROTONAT_PORT_MAX]        = { .type = NLA_U16 },
};

/* Fill the protocol part of @range from parsed CTA_PROTONAT_* attributes.
 *
 * A minimum port alone yields a single-port range; a maximum port, if
 * present, overrides the upper bound.  NF_NAT_RANGE_PROTO_SPECIFIED is
 * set when either attribute is present.  Always returns 0.
 */
static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
                                          struct nf_nat_range2 *range)
{
        const struct nlattr *lo = tb[CTA_PROTONAT_PORT_MIN];
        const struct nlattr *hi = tb[CTA_PROTONAT_PORT_MAX];

        if (lo) {
                range->min_proto.all = nla_get_be16(lo);
                range->max_proto.all = range->min_proto.all;
        }

        if (hi)
                range->max_proto.all = nla_get_be16(hi);

        if (lo || hi)
                range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;

        return 0;
}

/* Parse the nested protocol-NAT attribute @attr against
 * protonat_nla_policy and store the result in @range.  @ct is not used
 * here.  Returns a negative error from the parser, otherwise the result
 * of nf_nat_l4proto_nlattr_to_range().
 */
static int nfnetlink_parse_nat_proto(struct nlattr *attr,
                                     const struct nf_conn *ct,
                                     struct nf_nat_range2 *range)
{
        struct nlattr *tb[CTA_PROTONAT_MAX+1];
        int err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr,
                                              protonat_nla_policy, NULL);

        return err < 0 ? err : nf_nat_l4proto_nlattr_to_range(tb, range);
}

/* Netlink attribute policy for the top-level NAT attributes: IPv4 bounds
 * are 32-bit values, IPv6 bounds are limited to sizeof(struct in6_addr)
 * bytes, and the protocol part is a nested attribute.
 */
static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
        [CTA_NAT_V4_MINIP]        = { .type = NLA_U32 },
        [CTA_NAT_V4_MAXIP]        = { .type = NLA_U32 },
        [CTA_NAT_V6_MINIP]        = { .len = sizeof(struct in6_addr) },
        [CTA_NAT_V6_MAXIP]        = { .len = sizeof(struct in6_addr) },
        [CTA_NAT_PROTO]                = { .type = NLA_NESTED },
};

/* Fill the IPv4 address part of @range from parsed CTA_NAT_V4_*
 * attributes.  A present minimum sets NF_NAT_RANGE_MAP_IPS; a missing
 * maximum defaults to the minimum.  Always returns 0.
 */
static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
                                       struct nf_nat_range2 *range)
{
        const struct nlattr *lo = tb[CTA_NAT_V4_MINIP];
        const struct nlattr *hi = tb[CTA_NAT_V4_MAXIP];

        if (lo) {
                range->min_addr.ip = nla_get_be32(lo);
                range->flags |= NF_NAT_RANGE_MAP_IPS;
        }

        range->max_addr.ip = hi ? nla_get_be32(hi) : range->min_addr.ip;

        return 0;
}

/* Fill the IPv6 address part of @range from parsed CTA_NAT_V6_*
 * attributes.  A present minimum sets NF_NAT_RANGE_MAP_IPS; a missing
 * maximum defaults to the minimum.  Always returns 0.
 */
static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[],
                                       struct nf_nat_range2 *range)
{
        const struct nlattr *lo = tb[CTA_NAT_V6_MINIP];
        const struct nlattr *hi = tb[CTA_NAT_V6_MAXIP];

        if (lo) {
                nla_memcpy(&range->min_addr.ip6, lo,
                           sizeof(struct in6_addr));
                range->flags |= NF_NAT_RANGE_MAP_IPS;
        }

        if (!hi) {
                range->max_addr = range->min_addr;
                return 0;
        }

        nla_memcpy(&range->max_addr.ip6, hi, sizeof(struct in6_addr));
        return 0;
}

/* Translate the nested NAT attribute @nat into @range.
 *
 * @range is zeroed first.  The address part is parsed according to the
 * layer-3 family of @ct; the protocol part is parsed only if
 * CTA_NAT_PROTO is present.
 *
 * Returns 0 on success, -EPROTONOSUPPORT for a family other than IPv4 or
 * IPv6, or an error from the attribute parsers.
 */
static int
nfnetlink_parse_nat(const struct nlattr *nat,
                    const struct nf_conn *ct, struct nf_nat_range2 *range)
{
        struct nlattr *tb[CTA_NAT_MAX+1];
        unsigned int l3num;
        int err;

        memset(range, 0, sizeof(*range));

        err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat,
                                          nat_nla_policy, NULL);
        if (err < 0)
                return err;

        l3num = nf_ct_l3num(ct);
        if (l3num == NFPROTO_IPV4)
                err = nf_nat_ipv4_nlattr_to_range(tb, range);
        else if (l3num == NFPROTO_IPV6)
                err = nf_nat_ipv6_nlattr_to_range(tb, range);
        else
                return -EPROTONOSUPPORT;

        if (err)
                return err;

        if (tb[CTA_NAT_PROTO])
                return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);

        return 0;
}

/* This function is called under rcu_read_lock().
 *
 * Set up NAT of type @manip on @ct from the netlink attribute @attr, or
 * allocate a null binding when @attr is NULL.
 *
 * Returns 0 on success, -EEXIST if this manip type is already
 * initialised, a parse error from nfnetlink_parse_nat(), or -ENOMEM when
 * the setup verdict is NF_DROP.
 */
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
                          enum nf_nat_manip_type manip,
                          const struct nlattr *attr)
{
        struct nf_nat_range2 range;
        unsigned int verdict;
        int err;

        /* Should not happen, restricted to creating new conntracks
         * via ctnetlink.
         */
        if (WARN_ON_ONCE(nf_nat_initialized(ct, manip)))
                return -EEXIST;

        if (attr) {
                err = nfnetlink_parse_nat(attr, ct, &range);
                if (err < 0)
                        return err;

                verdict = nf_nat_setup_info(ct, &range, manip);
        } else {
                /* No NAT information has been passed: null binding. */
                verdict = __nf_nat_alloc_null_binding(ct, manip);
        }

        return verdict == NF_DROP ? -ENOMEM : 0;
}
#else
/* Stub used when CONFIG_NF_CT_NETLINK is not enabled: NAT setup through
 * netlink is unavailable, so every request fails with -EOPNOTSUPP.
 */
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
                          enum nf_nat_manip_type manip,
                          const struct nlattr *attr)
{
        return -EOPNOTSUPP;
}
#endif

/* Expectation callback descriptor that exposes nf_nat_follow_master()
 * under the name "nat-follow-master".
 */
static struct nf_ct_helper_expectfn follow_master_nat = {
        .name                = "nat-follow-master",
        .expectfn        = nf_nat_follow_master,
};

/* Register the NAT lookup hook @ops for protocol family @pf in @net.
 *
 * @net:          network namespace
 * @pf:           protocol family; indexes nat_net->nat_proto_net[]
 * @ops:          lookup hook to add; its hooknum selects the base hook
 * @orig_nat_ops: template array of base NAT hooks for this family
 * @ops_count:    number of entries in @orig_nat_ops
 *
 * The first registration for a family duplicates @orig_nat_ops, gives
 * each copy its own private area and registers the copies as netfilter
 * hooks.  Every registration then inserts @ops into the entry list of the
 * base hook with the matching hooknum and, on success, bumps the user
 * count.
 *
 * Returns 0 on success or a negative errno.
 */
int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
                       const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count)
{
        struct nat_net *nat_net = net_generic(net, nat_net_id);
        struct nf_nat_hooks_net *nat_proto_net;
        struct nf_nat_lookup_hook_priv *priv;
        unsigned int hooknum = ops->hooknum;
        struct nf_hook_ops *nat_ops;
        int i, ret;

        if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
                return -EINVAL;

        nat_proto_net = &nat_net->nat_proto_net[pf];

        /* Translate the hook number into an index into the ops array;
         * from here on hooknum holds that index.
         */
        for (i = 0; i < ops_count; i++) {
                if (orig_nat_ops[i].hooknum == hooknum) {
                        hooknum = i;
                        break;
                }
        }

        /* No base hook with that hook number. */
        if (WARN_ON_ONCE(i == ops_count))
                return -EINVAL;

        mutex_lock(&nf_nat_proto_mutex);
        if (!nat_proto_net->nat_hook_ops) {
                /* First user for this family: set up the base hooks. */
                WARN_ON(nat_proto_net->users != 0);

                nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL);
                if (!nat_ops) {
                        mutex_unlock(&nf_nat_proto_mutex);
                        return -ENOMEM;
                }

                for (i = 0; i < ops_count; i++) {
                        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
                        if (priv) {
                                nat_ops[i].priv = priv;
                                continue;
                        }
                        /* Allocation failed: free the private areas
                         * allocated so far, then the ops copy.
                         */
                        mutex_unlock(&nf_nat_proto_mutex);
                        while (i)
                                kfree(nat_ops[--i].priv);
                        kfree(nat_ops);
                        return -ENOMEM;
                }

                ret = nf_register_net_hooks(net, nat_ops, ops_count);
                if (ret < 0) {
                        mutex_unlock(&nf_nat_proto_mutex);
                        for (i = 0; i < ops_count; i++)
                                kfree(nat_ops[i].priv);
                        kfree(nat_ops);
                        return ret;
                }

                nat_proto_net->nat_hook_ops = nat_ops;
        }

        nat_ops = nat_proto_net->nat_hook_ops;
        priv = nat_ops[hooknum].priv;
        if (WARN_ON_ONCE(!priv)) {
                mutex_unlock(&nf_nat_proto_mutex);
                return -EOPNOTSUPP;
        }

        /* Attach the lookup hook to the selected base hook. */
        ret = nf_hook_entries_insert_raw(&priv->entries, ops);
        if (ret == 0)
                nat_proto_net->users++;

        mutex_unlock(&nf_nat_proto_mutex);
        return ret;
}

/*
 * Detach one NAT lookup entry that was attached for protocol family @pf.
 *
 * @ops identifies the entry to remove; its hooknum selects which of the
 * @ops_count per-netns base hooks the entry is deleted from.  Once the user
 * count of the family drops to zero the base hooks are unregistered, their
 * private data is handed to kfree_rcu() and the ops array is freed.
 *
 * All of this happens under nf_nat_proto_mutex.
 */
void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
                          unsigned int ops_count)
{
        struct nat_net *nat_net = net_generic(net, nat_net_id);
        struct nf_nat_lookup_hook_priv *hook_priv;
        struct nf_nat_hooks_net *proto_net;
        struct nf_hook_ops *base_ops;
        int wanted = ops->hooknum;
        int idx;

        if (pf >= ARRAY_SIZE(nat_net->nat_proto_net))
                return;

        proto_net = &nat_net->nat_proto_net[pf];

        mutex_lock(&nf_nat_proto_mutex);
        if (WARN_ON(proto_net->users == 0))
                goto out_unlock;

        proto_net->users--;

        /* Locate the base hook slot that serves the requested hook number. */
        base_ops = proto_net->nat_hook_ops;
        for (idx = 0; idx < ops_count; idx++) {
                if (base_ops[idx].hooknum == wanted)
                        break;
        }

        if (WARN_ON_ONCE(idx == ops_count))
                goto out_unlock;

        hook_priv = base_ops[idx].priv;
        nf_hook_entries_delete_raw(&hook_priv->entries, ops);

        if (proto_net->users != 0)
                goto out_unlock;

        /* Last user is gone: tear the base hooks down again. */
        nf_unregister_net_hooks(net, base_ops, ops_count);

        for (idx = 0; idx < ops_count; idx++) {
                hook_priv = base_ops[idx].priv;
                kfree_rcu(hook_priv, rcu_head);
        }

        proto_net->nat_hook_ops = NULL;
        kfree(base_ops);
out_unlock:
        mutex_unlock(&nf_nat_proto_mutex);
}

/*
 * Per-network-namespace registration record handed to
 * register_pernet_subsys() in nf_nat_init().  It describes a per-netns area
 * of sizeof(struct nat_net) bytes that is looked up through nat_net_id
 * (see the net_generic() call in nf_nat_unregister_fn()).
 */
static struct pernet_operations nat_net_ops = {
        .id = &nat_net_id,
        .size = sizeof(struct nat_net),
};

/*
 * Callback table that nf_nat_init() publishes through the nf_nat_hook RCU
 * pointer and that nf_nat_cleanup() withdraws again.  The decode_session
 * callback is only filled in when CONFIG_XFRM is enabled.
 */
static const struct nf_nat_hook nat_hook = {
        .parse_nat_setup        = nfnetlink_parse_nat_setup,
#ifdef CONFIG_XFRM
        .decode_session                = __nf_nat_decode_session,
#endif
        .manip_pkt                = nf_nat_manip_pkt,
        .remove_nat_bysrc        = nf_nat_cleanup_conntrack,
};

/*
 * Module init: allocate the by-source hash table, initialise the NAT locks,
 * register the per-netns state, publish the NAT callback table and finally
 * register the BPF glue.
 *
 * Returns the result of register_nf_nat_bpf() on success, or a negative
 * errno after undoing every step that had already been taken.
 */
static int __init nf_nat_init(void)
{
        int err, lock;

        /* Leave them the same for the moment. */
        nf_nat_htable_size = nf_conntrack_htable_size;
        if (nf_nat_htable_size < CONNTRACK_LOCKS)
                nf_nat_htable_size = CONNTRACK_LOCKS;

        nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
        if (!nf_nat_bysource)
                return -ENOMEM;

        for (lock = 0; lock < CONNTRACK_LOCKS; lock++)
                spin_lock_init(&nf_nat_locks[lock]);

        err = register_pernet_subsys(&nat_net_ops);
        if (err < 0)
                goto err_free_table;

        nf_ct_helper_expectfn_register(&follow_master_nat);

        WARN_ON(nf_nat_hook != NULL);
        RCU_INIT_POINTER(nf_nat_hook, &nat_hook);

        err = register_nf_nat_bpf();
        if (err < 0)
                goto err_unpublish;

        return err;

err_unpublish:
        /* Withdraw the callback table before the backing state goes away. */
        RCU_INIT_POINTER(nf_nat_hook, NULL);
        nf_ct_helper_expectfn_unregister(&follow_master_nat);
        synchronize_net();
        unregister_pernet_subsys(&nat_net_ops);
err_free_table:
        kvfree(nf_nat_bysource);
        return err;
}

/*
 * Module exit: undo what nf_nat_init() set up.
 */
static void __exit nf_nat_cleanup(void)
{
        struct nf_nat_proto_clean clean = {};

        /* Run nf_nat_proto_clean() over the conntrack entries first. */
        nf_ct_iterate_destroy(nf_nat_proto_clean, &clean);

        nf_ct_helper_expectfn_unregister(&follow_master_nat);
        /* Withdraw the callback table published by nf_nat_init(). */
        RCU_INIT_POINTER(nf_nat_hook, NULL);

        /*
         * NOTE(review): presumably this waits for readers that may still use
         * the old hook pointer before the hash table is freed -- confirm.
         */
        synchronize_net();
        kvfree(nf_nat_bysource);
        unregister_pernet_subsys(&nat_net_ops);
}

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Network address translation core");

module_init(nf_nat_init);
module_exit(nf_nat_cleanup);






































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the BSD Socket
 *                interface as the means of communication with the user level.
 *
 * Authors:        Lotsa people, from code originally in tcp
 */

#ifndef _INET6_HASHTABLES_H
#define _INET6_HASHTABLES_H


#if IS_ENABLED(CONFIG_IPV6)
#include <linux/in6.h>
#include <linux/ipv6.h>
#include <linux/types.h>
#include <linux/jhash.h>

#include <net/inet_sock.h>

#include <net/ipv6.h>
#include <net/netns/hash.h>

struct inet_hashinfo;

static inline unsigned int __inet6_ehashfn(const u32 lhash,
                                    const u16 lport,
                                    const u32 fhash,
                                    const __be16 fport,
                                    const u32 initval)
{
        const u32 ports = (((u32)lport) << 16) | (__force u32)fport;
        return jhash_3words(lhash, fhash, ports, initval);
}

/*
 * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * The sockhash lock must be held as a reader here.
 */
struct sock *__inet6_lookup_established(struct net *net,
                                        struct inet_hashinfo *hashinfo,
                                        const struct in6_addr *saddr,
                                        const __be16 sport,
                                        const struct in6_addr *daddr,
                                        const u16 hnum, const int dif,
                                        const int sdif);

/*
 * Function type of the IPv6 established-hash callbacks: it takes the netns,
 * the local address/port and the foreign address/port and returns a u32.
 * Pointers to functions of this type are accepted by
 * inet6_lookup_reuseport(), inet6_lookup_run_sk_lookup() and
 * inet6_steal_sock().
 */
typedef u32 (inet6_ehashfn_t)(const struct net *net,
                               const struct in6_addr *laddr, const u16 lport,
                               const struct in6_addr *faddr, const __be16 fport);

/* Declared with the typedef above, so the prototype cannot drift from it. */
inet6_ehashfn_t inet6_ehashfn;

INDIRECT_CALLABLE_DECLARE(inet6_ehashfn_t udp6_ehashfn);

struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk,
                                    struct sk_buff *skb, int doff,
                                    const struct in6_addr *saddr,
                                    __be16 sport,
                                    const struct in6_addr *daddr,
                                    unsigned short hnum,
                                    inet6_ehashfn_t *ehashfn);

struct sock *inet6_lookup_listener(struct net *net,
                                   struct inet_hashinfo *hashinfo,
                                   struct sk_buff *skb, int doff,
                                   const struct in6_addr *saddr,
                                   const __be16 sport,
                                   const struct in6_addr *daddr,
                                   const unsigned short hnum,
                                   const int dif, const int sdif);

struct sock *inet6_lookup_run_sk_lookup(struct net *net,
                                        int protocol,
                                        struct sk_buff *skb, int doff,
                                        const struct in6_addr *saddr,
                                        const __be16 sport,
                                        const struct in6_addr *daddr,
                                        const u16 hnum, const int dif,
                                        inet6_ehashfn_t *ehashfn);

/*
 * Look up a socket for the given 4-tuple: established sockets first, then
 * listeners.
 *
 * *@refcounted is set to true when the result comes from
 * __inet6_lookup_established() and to false when the listener lookup was
 * used (including when that lookup finds nothing).
 */
static inline struct sock *__inet6_lookup(struct net *net,
                                          struct inet_hashinfo *hashinfo,
                                          struct sk_buff *skb, int doff,
                                          const struct in6_addr *saddr,
                                          const __be16 sport,
                                          const struct in6_addr *daddr,
                                          const u16 hnum,
                                          const int dif, const int sdif,
                                          bool *refcounted)
{
        struct sock *established;

        established = __inet6_lookup_established(net, hashinfo, saddr, sport,
                                                 daddr, hnum, dif, sdif);
        if (established) {
                *refcounted = true;
                return established;
        }

        *refcounted = false;
        return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
                                     daddr, hnum, dif, sdif);
}

/*
 * Take over the socket that skb_steal_sock() finds on @skb, if any.
 *
 * The stolen socket is returned unchanged unless it was prefetched, is a
 * full socket, and is either a TCP socket in TCP_LISTEN state or a UDP
 * socket in TCP_CLOSE state.  For those two cases
 * inet6_lookup_reuseport() gets a chance to pick a different socket; when
 * it does, that socket is returned instead.
 *
 * Returns NULL when @skb carries no socket.
 */
static inline
struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff,
                              const struct in6_addr *saddr, const __be16 sport,
                              const struct in6_addr *daddr, const __be16 dport,
                              bool *refcounted, inet6_ehashfn_t *ehashfn)
{
        struct sock *stolen, *selected;
        bool prefetched;

        stolen = skb_steal_sock(skb, refcounted, &prefetched);
        if (!stolen)
                return NULL;

        if (!(prefetched && sk_fullsock(stolen)))
                return stolen;

        switch (stolen->sk_protocol) {
        case IPPROTO_TCP:
                if (stolen->sk_state != TCP_LISTEN)
                        return stolen;
                break;
        case IPPROTO_UDP:
                if (stolen->sk_state != TCP_CLOSE)
                        return stolen;
                break;
        default:
                return stolen;
        }

        selected = inet6_lookup_reuseport(net, stolen, skb, doff,
                                          saddr, sport, daddr, ntohs(dport),
                                          ehashfn);
        if (!selected)
                return stolen;

        /* We've chosen a new reuseport sock which is never refcounted. This
         * implies that the stolen socket also isn't refcounted.
         */
        WARN_ON_ONCE(*refcounted);

        return selected;
}

/*
 * Find the socket for an incoming IPv6 packet.
 *
 * A socket already attached to @skb (see inet6_steal_sock()) wins; an
 * error pointer from that step is turned into NULL.  Otherwise the regular
 * __inet6_lookup() is performed with the addresses taken from the IPv6
 * header and the destination port converted to host byte order.
 */
static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
                                              struct sk_buff *skb, int doff,
                                              const __be16 sport,
                                              const __be16 dport,
                                              int iif, int sdif,
                                              bool *refcounted)
{
        struct net *net = dev_net(skb_dst(skb)->dev);
        const struct ipv6hdr *ip6h = ipv6_hdr(skb);
        struct sock *sk;

        sk = inet6_steal_sock(net, skb, doff, &ip6h->saddr, sport,
                              &ip6h->daddr, dport, refcounted, inet6_ehashfn);
        if (IS_ERR(sk))
                return NULL;

        if (!sk)
                sk = __inet6_lookup(net, hashinfo, skb,
                                    doff, &ip6h->saddr, sport,
                                    &ip6h->daddr, ntohs(dport),
                                    iif, sdif, refcounted);

        return sk;
}

struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
                          struct sk_buff *skb, int doff,
                          const struct in6_addr *saddr, const __be16 sport,
                          const struct in6_addr *daddr, const __be16 dport,
                          const int dif);

int inet6_hash(struct sock *sk);

/*
 * Check whether @sk matches the given netns, address pair, port pair and
 * device binding.
 *
 * Note the crossover: the packet's @saddr is compared with the socket's
 * sk_v6_daddr and the packet's @daddr with sk_v6_rcv_saddr.
 */
static inline bool inet6_match(struct net *net, const struct sock *sk,
                               const struct in6_addr *saddr,
                               const struct in6_addr *daddr,
                               const __portpair ports,
                               const int dif, const int sdif)
{
        if (!net_eq(sock_net(sk), net))
                return false;

        if (sk->sk_family != AF_INET6)
                return false;

        if (sk->sk_portpair != ports)
                return false;

        if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr))
                return false;

        if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
                return false;

        /* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */
        return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif,
                                    sdif);
}
#endif /* IS_ENABLED(CONFIG_IPV6) */

#endif /* _INET6_HASHTABLES_H */















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMAN_H
#define _LINUX_MMAN_H

#include <linux/mm.h>
#include <linux/percpu_counter.h>

#include <linux/atomic.h>
#include <uapi/linux/mman.h>

/*
 * Arrange for legacy / undefined architecture specific flags to be
 * ignored by mmap handling code.
 */
#ifndef MAP_32BIT
#define MAP_32BIT 0
#endif
#ifndef MAP_ABOVE4G
#define MAP_ABOVE4G 0
#endif
#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB 0
#endif
#ifndef MAP_HUGE_1GB
#define MAP_HUGE_1GB 0
#endif
#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0
#endif
#ifndef MAP_SYNC
#define MAP_SYNC 0
#endif

/*
 * The historical set of flags that all mmap implementations implicitly
 * support when a ->mmap_validate() op is not provided in file_operations.
 *
 * MAP_EXECUTABLE and MAP_DENYWRITE are completely ignored throughout the
 * kernel.
 */
#define LEGACY_MAP_MASK (MAP_SHARED \
                | MAP_PRIVATE \
                | MAP_FIXED \
                | MAP_ANONYMOUS \
                | MAP_DENYWRITE \
                | MAP_EXECUTABLE \
                | MAP_UNINITIALIZED \
                | MAP_GROWSDOWN \
                | MAP_LOCKED \
                | MAP_NORESERVE \
                | MAP_POPULATE \
                | MAP_NONBLOCK \
                | MAP_STACK \
                | MAP_HUGETLB \
                | MAP_32BIT \
                | MAP_ABOVE4G \
                | MAP_HUGE_2MB \
                | MAP_HUGE_1GB)

extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes;
extern struct percpu_counter vm_committed_as;

#ifdef CONFIG_SMP
extern s32 vm_committed_as_batch;
extern void mm_compute_batch(int overcommit_policy);
#else
#define vm_committed_as_batch 0
static inline void mm_compute_batch(int overcommit_policy)
{
}
#endif

unsigned long vm_memory_committed(void);

static inline void vm_acct_memory(long pages)
{
        percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch);
}

static inline void vm_unacct_memory(long pages)
{
        vm_acct_memory(-pages);
}

/*
 * Allow architectures to handle additional protection and flag bits. The
 * overriding macros must be defined in the arch-specific asm/mman.h file.
 */

#ifndef arch_calc_vm_prot_bits
#define arch_calc_vm_prot_bits(prot, pkey) 0
#endif

#ifndef arch_calc_vm_flag_bits
#define arch_calc_vm_flag_bits(flags) 0
#endif

#ifndef arch_validate_prot
/*
 * This is called from mprotect().  PROT_GROWSDOWN and PROT_GROWSUP have
 * already been masked out.
 *
 * Returns true if the prot flags are valid
 */
static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
{
        return (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) == 0;
}
#define arch_validate_prot arch_validate_prot
#endif

#ifndef arch_validate_flags
/*
 * This is called from mmap() and mprotect() with the updated vma->vm_flags.
 *
 * Returns true if the VM_* flags are valid.
 */
static inline bool arch_validate_flags(unsigned long flags)
{
        return true;
}
#define arch_validate_flags arch_validate_flags
#endif

/*
 * Optimisation macro.  It is equivalent to:
 *      (x & bit1) ? bit2 : 0
 * but this version is faster.
 * ("bit1" and "bit2" must be single bits)
 */
#define _calc_vm_trans(x, bit1, bit2) \
  ((!(bit1) || !(bit2)) ? 0 : \
  ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \
   : ((x) & (bit1)) / ((bit1) / (bit2))))

/*
 * Combine the mmap "prot" argument into "vm_flags" used internally.
 */
static inline unsigned long
calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
{
        return _calc_vm_trans(prot, PROT_READ,  VM_READ ) |
               _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) |
               _calc_vm_trans(prot, PROT_EXEC,  VM_EXEC) |
               arch_calc_vm_prot_bits(prot, pkey);
}

/*
 * Combine the mmap "flags" argument into "vm_flags" used internally.
 */
static inline unsigned long
calc_vm_flag_bits(unsigned long flags)
{
        return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
               _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
               _calc_vm_trans(flags, MAP_SYNC,             VM_SYNC      ) |
               _calc_vm_trans(flags, MAP_STACK,             VM_NOHUGEPAGE) |
               arch_calc_vm_flag_bits(flags);
}

unsigned long vm_commit_limit(void);

#ifndef arch_memory_deny_write_exec_supported
static inline bool arch_memory_deny_write_exec_supported(void)
{
        return true;
}
#define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported
#endif

/*
 * Denies creating a writable executable mapping or gaining executable permissions.
 *
 * This denies the following:
 *
 *         a)        mmap(PROT_WRITE | PROT_EXEC)
 *
 *        b)        mmap(PROT_WRITE)
 *                mprotect(PROT_EXEC)
 *
 *        c)        mmap(PROT_WRITE)
 *                mprotect(PROT_READ)
 *                mprotect(PROT_EXEC)
 *
 * But allows the following:
 *
 *        d)        mmap(PROT_READ | PROT_EXEC)
 *                mmap(PROT_READ | PROT_EXEC | PROT_BTI)
 */
static inline bool map_deny_write_exec(struct vm_area_struct *vma,  unsigned long vm_flags)
{
        if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
                return false;

        if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
                return true;

        if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
                return true;

        return false;
}

#endif /* _LINUX_MMAN_H */


















































































    2 












































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIMEKEEPING_H
#define _LINUX_TIMEKEEPING_H

#include <linux/errno.h>
#include <linux/clocksource_ids.h>
#include <linux/ktime.h>

/* Included from linux/ktime.h */

void timekeeping_init(void);
extern int timekeeping_suspended;

/* Architecture timer tick functions: */
extern void legacy_timer_tick(unsigned long ticks);

/*
 * Get and set timeofday
 */
extern int do_settimeofday64(const struct timespec64 *ts);
extern int do_sys_settimeofday64(const struct timespec64 *tv,
                                 const struct timezone *tz);

/*
 * ktime_get() family - read the current time in a multitude of ways.
 *
 * The default time reference is CLOCK_MONOTONIC, starting at
 * boot time but not counting the time spent in suspend.
 * For other references, use the functions with "real", "clocktai",
 * "boottime" and "raw" suffixes.
 *
 * To get the time in a different format, use the ones with
 * "ns", "ts64" and "seconds" suffix.
 *
 * See Documentation/core-api/timekeeping.rst for more details.
 */


/*
 * timespec64 based interfaces
 */
extern void ktime_get_raw_ts64(struct timespec64 *ts);
extern void ktime_get_ts64(struct timespec64 *ts);
extern void ktime_get_real_ts64(struct timespec64 *tv);
extern void ktime_get_coarse_ts64(struct timespec64 *ts);
extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);

void getboottime64(struct timespec64 *ts);

/*
 * time64_t base interfaces
 */
extern time64_t ktime_get_seconds(void);
extern time64_t __ktime_get_real_seconds(void);
extern time64_t ktime_get_real_seconds(void);

/*
 * ktime_t based interfaces
 */

/*
 * Selects which clock offset ktime_get_with_offset(),
 * ktime_get_coarse_with_offset() and ktime_mono_to_any() apply.
 * NOTE(review): TK_OFFS_MAX looks like the number of valid offsets rather
 * than a selectable clock -- confirm against the users.
 */
enum tk_offsets {
        TK_OFFS_REAL,
        TK_OFFS_BOOT,
        TK_OFFS_TAI,
        TK_OFFS_MAX,
};

extern ktime_t ktime_get(void);
extern ktime_t ktime_get_with_offset(enum tk_offsets offs);
extern ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs);
extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs);
extern ktime_t ktime_get_raw(void);
extern u32 ktime_get_resolution_ns(void);

/**
 * ktime_get_real - get the real (wall-) time in ktime_t format
 *
 * Thin wrapper around ktime_get_with_offset() with TK_OFFS_REAL.
 *
 * Returns: real (wall) time in ktime_t format
 */
static inline ktime_t ktime_get_real(void)
{
        return ktime_get_with_offset(TK_OFFS_REAL);
}

/*
 * Coarse variant of ktime_get_real(): same TK_OFFS_REAL offset, but read
 * through ktime_get_coarse_with_offset().
 */
static inline ktime_t ktime_get_coarse_real(void)
{
        return ktime_get_coarse_with_offset(TK_OFFS_REAL);
}

/**
 * ktime_get_boottime - Get monotonic time since boot in ktime_t format
 *
 * This is similar to CLOCK_MONOTONIC/ktime_get, but also includes the
 * time spent in suspend.
 *
 * Returns: monotonic time since boot in ktime_t format
 */
static inline ktime_t ktime_get_boottime(void)
{
        return ktime_get_with_offset(TK_OFFS_BOOT);
}

/*
 * Coarse variant of ktime_get_boottime(): same TK_OFFS_BOOT offset, but
 * read through ktime_get_coarse_with_offset().
 */
static inline ktime_t ktime_get_coarse_boottime(void)
{
        return ktime_get_coarse_with_offset(TK_OFFS_BOOT);
}

/**
 * ktime_get_clocktai - Get the TAI time of day in ktime_t format
 *
 * Thin wrapper around ktime_get_with_offset() with TK_OFFS_TAI.
 *
 * Returns: the TAI time of day in ktime_t format
 */
static inline ktime_t ktime_get_clocktai(void)
{
        return ktime_get_with_offset(TK_OFFS_TAI);
}

/*
 * Coarse variant of ktime_get_clocktai(): same TK_OFFS_TAI offset, but
 * read through ktime_get_coarse_with_offset().
 */
static inline ktime_t ktime_get_coarse_clocktai(void)
{
        return ktime_get_coarse_with_offset(TK_OFFS_TAI);
}

static inline ktime_t ktime_get_coarse(void)
{
        struct timespec64 ts;

        ktime_get_coarse_ts64(&ts);
        return timespec64_to_ktime(ts);
}

static inline u64 ktime_get_coarse_ns(void)
{
        return ktime_to_ns(ktime_get_coarse());
}

static inline u64 ktime_get_coarse_real_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_real());
}

static inline u64 ktime_get_coarse_boottime_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_boottime());
}

static inline u64 ktime_get_coarse_clocktai_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_clocktai());
}

/**
 * ktime_mono_to_real - Convert monotonic time to clock realtime
 * @mono: monotonic time to convert
 *
 * Thin wrapper around ktime_mono_to_any() with TK_OFFS_REAL.
 *
 * Returns: time converted to realtime clock
 */
static inline ktime_t ktime_mono_to_real(ktime_t mono)
{
        return ktime_mono_to_any(mono, TK_OFFS_REAL);
}

/**
 * ktime_get_ns - Get the current time in nanoseconds
 *
 * Returns: current time converted to nanoseconds
 */
static inline u64 ktime_get_ns(void)
{
        return ktime_to_ns(ktime_get());
}

/**
 * ktime_get_real_ns - Get the current real/wall time in nanoseconds
 *
 * Returns: current real time converted to nanoseconds
 */
static inline u64 ktime_get_real_ns(void)
{
        return ktime_to_ns(ktime_get_real());
}

/**
 * ktime_get_boottime_ns - Get the monotonic time since boot in nanoseconds
 *
 * Returns: current boottime converted to nanoseconds
 */
static inline u64 ktime_get_boottime_ns(void)
{
        return ktime_to_ns(ktime_get_boottime());
}

/**
 * ktime_get_clocktai_ns - Get the current TAI time of day in nanoseconds
 *
 * Returns: current TAI time converted to nanoseconds
 */
static inline u64 ktime_get_clocktai_ns(void)
{
        return ktime_to_ns(ktime_get_clocktai());
}

/**
 * ktime_get_raw_ns - Get the raw monotonic time in nanoseconds
 *
 * Returns: current raw monotonic time converted to nanoseconds
 */
static inline u64 ktime_get_raw_ns(void)
{
        return ktime_to_ns(ktime_get_raw());
}

extern u64 ktime_get_mono_fast_ns(void);
extern u64 ktime_get_raw_fast_ns(void);
extern u64 ktime_get_boot_fast_ns(void);
extern u64 ktime_get_tai_fast_ns(void);
extern u64 ktime_get_real_fast_ns(void);

/*
 * timespec64/time64_t interfaces utilizing the ktime based ones
 * for API completeness, these could be implemented more efficiently
 * if needed.
 */
static inline void ktime_get_boottime_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_boottime());
}

static inline void ktime_get_coarse_boottime_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_coarse_boottime());
}

static inline time64_t ktime_get_boottime_seconds(void)
{
        return ktime_divns(ktime_get_coarse_boottime(), NSEC_PER_SEC);
}

static inline void ktime_get_clocktai_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_clocktai());
}

static inline void ktime_get_coarse_clocktai_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_coarse_clocktai());
}

static inline time64_t ktime_get_clocktai_seconds(void)
{
        return ktime_divns(ktime_get_coarse_clocktai(), NSEC_PER_SEC);
}

/*
 * RTC specific
 */
extern bool timekeeping_rtc_skipsuspend(void);
extern bool timekeeping_rtc_skipresume(void);

extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta);

/**
 * struct ktime_timestamps - Simultaneous mono/boot/real timestamps
 * @mono:        Monotonic timestamp
 * @boot:        Boottime timestamp
 * @real:        Realtime timestamp
 */
struct ktime_timestamps {
        u64                mono;
        u64                boot;
        u64                real;
};

/**
 * struct system_time_snapshot - simultaneous raw/real time capture with
 *                                 counter value
 * @cycles:        Clocksource counter value to produce the system times
 * @real:        Realtime system time
 * @raw:        Monotonic raw system time
 * @cs_id:        Clocksource ID
 * @clock_was_set_seq:        The sequence number of clock-was-set events
 * @cs_was_changed_seq:        The sequence number of clocksource change events
 */
struct system_time_snapshot {
        u64                        cycles;
        ktime_t                        real;
        ktime_t                        raw;
        enum clocksource_ids        cs_id;
        unsigned int                clock_was_set_seq;
        u8                        cs_was_changed_seq;
};

/**
 * struct system_device_crosststamp - system/device cross-timestamp
 *                                      (synchronized capture)
 * @device:                Device time
 * @sys_realtime:        Realtime simultaneous with device time
 * @sys_monoraw:        Monotonic raw simultaneous with device time
 */
struct system_device_crosststamp {
        ktime_t device;
        ktime_t sys_realtime;
        ktime_t sys_monoraw;
};

/**
 * struct system_counterval_t - system counter value with the ID of the
 *                                corresponding clocksource
 * @cycles:        System counter value
 * @cs_id:        Clocksource ID corresponding to system counter value. Used by
 *                timekeeping code to verify comparability of two cycle values.
 *                The default ID, CSID_GENERIC, does not identify a specific
 *                clocksource.
 */
struct system_counterval_t {
        u64                        cycles;
        enum clocksource_ids        cs_id;
};

/*
 * Get cross timestamp between system clock and device clock
 */
extern int get_device_system_crosststamp(
                        int (*get_time_fn)(ktime_t *device_time,
                                struct system_counterval_t *system_counterval,
                                void *ctx),
                        void *ctx,
                        struct system_time_snapshot *history,
                        struct system_device_crosststamp *xtstamp);

/*
 * Simultaneously snapshot realtime and monotonic raw clocks
 */
extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot);

/* NMI safe mono/boot/realtime timestamps */
extern void ktime_get_fast_timestamps(struct ktime_timestamps *snap);

/*
 * Persistent clock related interfaces
 */
extern int persistent_clock_is_local;

extern void read_persistent_clock64(struct timespec64 *ts);
void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock,
                                          struct timespec64 *boot_offset);
#ifdef CONFIG_GENERIC_CMOS_UPDATE
extern int update_persistent_clock64(struct timespec64 now);
#endif

#endif











































    4 












































































    4 








































    4 

    3 









    2 













    2 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
// SPDX-License-Identifier: GPL-2.0
#include <linux/export.h>
#include <linux/lockref.h>

#if USE_CMPXCHG_LOCKREF

/*
 * Lockless fast path shared by the lockref operations.  The macro expects a
 * variable named "lockref" in the calling function.
 *
 * It snapshots lockref->lock_count into "old" and, as long as the spinlock
 * inside that snapshot reads as unlocked, copies it to "new", runs CODE
 * (which may modify "new") and tries to install "new" with a 64-bit
 * compare-and-exchange.  On success SUCCESS is executed.  After 100
 * attempts, or as soon as the snapshot shows the lock held, the loop ends
 * and control falls through to the code following the macro.
 *
 * Note that the "cmpxchg()" reloads the "old" value for the
 * failure case.
 *
 * The BUILD_BUG_ON() enforces that struct lockref is exactly 8 bytes.
 */
#define CMPXCHG_LOOP(CODE, SUCCESS) do {                                        \
        int retry = 100;                                                        \
        struct lockref old;                                                        \
        BUILD_BUG_ON(sizeof(old) != 8);                                                \
        old.lock_count = READ_ONCE(lockref->lock_count);                        \
        while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) {          \
                struct lockref new = old;                                        \
                CODE                                                                \
                if (likely(try_cmpxchg64_relaxed(&lockref->lock_count,                \
                                                 &old.lock_count,                \
                                                 new.lock_count))) {                \
                        SUCCESS;                                                \
                }                                                                \
                if (!--retry)                                                        \
                        break;                                                        \
        }                                                                        \
} while (0)

#else

/* No cmpxchg fast path: callers always run their locked slow path. */
#define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)

#endif

/**
 * lockref_get - Increments reference count unconditionally
 * @lockref: pointer to lockref structure
 *
 * This operation is only valid if you already hold a reference
 * to the object, so you know the count cannot be zero.
 *
 * The lockless CMPXCHG_LOOP() path is tried first; if it does not
 * succeed (or is compiled out), the increment is done under
 * @lockref->lock instead.
 */
void lockref_get(struct lockref *lockref)
{
        /* Fast path: bump the count in the private copy and try to publish it. */
        CMPXCHG_LOOP(
                new.count++;
        ,
                return;
        );

        /* Slow path: take the spinlock and increment in place. */
        spin_lock(&lockref->lock);
        lockref->count++;
        spin_unlock(&lockref->lock);
}
EXPORT_SYMBOL(lockref_get);

/**
 * lockref_get_not_zero - Increments count unless the count is 0 or dead
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if count was zero
 */
int lockref_get_not_zero(struct lockref *lockref)
{
        int retval;

        CMPXCHG_LOOP(
                new.count++;
                if (old.count <= 0)
                        return 0;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        retval = 0;
        if (lockref->count > 0) {
                lockref->count++;
                retval = 1;
        }
        spin_unlock(&lockref->lock);
        return retval;
}
EXPORT_SYMBOL(lockref_get_not_zero);

/**
 * lockref_put_not_zero - Decrements count unless count <= 1 before decrement
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if count would become zero
 */
int lockref_put_not_zero(struct lockref *lockref)
{
        int retval;

        CMPXCHG_LOOP(
                new.count--;
                if (old.count <= 1)
                        return 0;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        retval = 0;
        if (lockref->count > 1) {
                lockref->count--;
                retval = 1;
        }
        spin_unlock(&lockref->lock);
        return retval;
}
EXPORT_SYMBOL(lockref_put_not_zero);

/**
 * lockref_put_return - Decrement reference count if possible
 * @lockref: pointer to lockref structure
 *
 * Decrement the reference count and return the new value.
 * If the lockref was dead or locked, return an error.
 *
 * Unlike the other helpers in this file there is no spinlock fallback:
 * when the lockless loop does not succeed (including builds where
 * CMPXCHG_LOOP() expands to nothing), -1 is returned and the count is
 * left unchanged.
 */
int lockref_put_return(struct lockref *lockref)
{
        CMPXCHG_LOOP(
                new.count--;
                /* A count of zero or below cannot be decremented. */
                if (old.count <= 0)
                        return -1;
        ,
                return new.count;
        );
        return -1;
}
EXPORT_SYMBOL(lockref_put_return);

/**
 * lockref_put_or_lock - decrements count unless count <= 1 before decrement
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken
 */
int lockref_put_or_lock(struct lockref *lockref)
{
        CMPXCHG_LOOP(
                new.count--;
                if (old.count <= 1)
                        break;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        if (lockref->count <= 1)
                return 0;
        lockref->count--;
        spin_unlock(&lockref->lock);
        return 1;
}
EXPORT_SYMBOL(lockref_put_or_lock);

/**
 * lockref_mark_dead - mark lockref dead
 * @lockref: pointer to lockref structure
 *
 * Must be called with @lockref->lock held (asserted below).  The count is
 * set to a negative value, which lockref_get_not_dead() and the other
 * helpers in this file that test the sign of the count treat as "dead".
 */
void lockref_mark_dead(struct lockref *lockref)
{
        assert_spin_locked(&lockref->lock);
        lockref->count = -128;
}
EXPORT_SYMBOL(lockref_mark_dead);

/**
 * lockref_get_not_dead - Increments count unless the ref is dead
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if lockref was dead
 */
int lockref_get_not_dead(struct lockref *lockref)
{
        int retval;

        CMPXCHG_LOOP(
                new.count++;
                if (old.count < 0)
                        return 0;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        retval = 0;
        if (lockref->count >= 0) {
                lockref->count++;
                retval = 1;
        }
        spin_unlock(&lockref->lock);
        return retval;
}
EXPORT_SYMBOL(lockref_get_not_dead);

































































































    1 

    1 











































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __BEN_VLAN_802_1Q_INC__
#define __BEN_VLAN_802_1Q_INC__

#include <linux/if_vlan.h>
#include <linux/u64_stats_sync.h>
#include <linux/list.h>

/* if this changes, algorithm will have to be reworked because this
 * depends on completely exhausting the VLAN identifier space.  Thus
 * it gives constant time look-up, but in many cases it wastes memory.
 */
#define VLAN_GROUP_ARRAY_SPLIT_PARTS  8
#define VLAN_GROUP_ARRAY_PART_LEN     (VLAN_N_VID/VLAN_GROUP_ARRAY_SPLIT_PARTS)

/*
 * Index of a VLAN protocol in per-protocol tables such as
 * vlan_group::vlan_devices_arrays.  vlan_proto_idx() maps an on-wire
 * protocol value to one of these.
 */
enum vlan_protos {
        VLAN_PROTO_8021Q        = 0,
        VLAN_PROTO_8021AD,
        VLAN_PROTO_NUM,         /* number of protocols; used as an array size */
};

/*
 * Table of VLAN devices, indexed by protocol and VLAN id.
 *
 * For each protocol the id space is cut into VLAN_GROUP_ARRAY_SPLIT_PARTS
 * parts.  Each slot points to an array of VLAN_GROUP_ARRAY_PART_LEN device
 * pointers, or is NULL (__vlan_group_get_device() checks for that).
 */
struct vlan_group {
        /* NOTE(review): presumably the number of VLAN devices in the group - confirm */
        unsigned int                nr_vlan_devs;
        struct hlist_node        hlist;        /* linked list */
        struct net_device **vlan_devices_arrays[VLAN_PROTO_NUM]
                                               [VLAN_GROUP_ARRAY_SPLIT_PARTS];
};

/*
 * VLAN state hung off a real device; vlan_find_dev() reaches it through
 * real_dev->vlan_info using an RCU/RTNL-protected dereference.
 */
struct vlan_info {
        struct net_device        *real_dev; /* The ethernet(like) device
                                            * the vlan is attached to.
                                            */
        struct vlan_group        grp;        /* device lookup table */
        /* NOTE(review): vid_list/nr_vids presumably track the VIDs in use - confirm */
        struct list_head        vid_list;
        unsigned int                nr_vids;
        struct rcu_head                rcu;
};

/*
 * Translate a network-byte-order VLAN protocol into its enum vlan_protos
 * index.  Anything other than 802.1Q or 802.1ad triggers a WARN and yields
 * -EINVAL.
 */
static inline int vlan_proto_idx(__be16 proto)
{
        if (proto == htons(ETH_P_8021Q))
                return VLAN_PROTO_8021Q;
        if (proto == htons(ETH_P_8021AD))
                return VLAN_PROTO_8021AD;

        WARN(1, "invalid VLAN protocol: 0x%04x\n", ntohs(proto));
        return -EINVAL;
}

/*
 * Look up the device stored for (@pidx, @vlan_id) in @vg.
 *
 * @pidx is used directly as an array index and is not range-checked here.
 * Returns NULL when the part covering @vlan_id has not been allocated.
 */
static inline struct net_device *__vlan_group_get_device(struct vlan_group *vg,
                                                         unsigned int pidx,
                                                         u16 vlan_id)
{
        const unsigned int part_no = vlan_id / VLAN_GROUP_ARRAY_PART_LEN;
        struct net_device **part = vg->vlan_devices_arrays[pidx][part_no];

        /* paired with smp_wmb() in vlan_group_prealloc_vid() */
        smp_rmb();

        if (!part)
                return NULL;

        return part[vlan_id % VLAN_GROUP_ARRAY_PART_LEN];
}

/*
 * Look up the device for (@vlan_proto, @vlan_id) in @vg.
 * Returns NULL for an unknown protocol or when no device is stored.
 */
static inline struct net_device *vlan_group_get_device(struct vlan_group *vg,
                                                       __be16 vlan_proto,
                                                       u16 vlan_id)
{
        const int proto_index = vlan_proto_idx(vlan_proto);

        return proto_index < 0 ? NULL :
               __vlan_group_get_device(vg, proto_index, vlan_id);
}

/*
 * Store @dev in the slot for (@vlan_proto, @vlan_id) of @vg.
 *
 * Does nothing when @vg is NULL or the protocol is unknown.  The part
 * array covering @vlan_id is dereferenced without a NULL check.
 * NOTE(review): presumably callers preallocate it first - confirm.
 */
static inline void vlan_group_set_device(struct vlan_group *vg,
                                         __be16 vlan_proto, u16 vlan_id,
                                         struct net_device *dev)
{
        /* Resolved before the @vg check so a bad protocol always warns. */
        const int proto_index = vlan_proto_idx(vlan_proto);
        struct net_device **part;

        if (proto_index < 0 || !vg)
                return;

        part = vg->vlan_devices_arrays[proto_index]
                                      [vlan_id / VLAN_GROUP_ARRAY_PART_LEN];
        part[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] = dev;
}

/*
 * Find the VLAN device for (@vlan_proto, @vlan_id) on top of @real_dev.
 * Returns NULL when @real_dev has no VLAN info or no matching device.
 *
 * Must be invoked with rcu_read_lock or with RTNL.
 */
static inline struct net_device *vlan_find_dev(struct net_device *real_dev,
                                               __be16 vlan_proto, u16 vlan_id)
{
        struct vlan_info *info = rcu_dereference_rtnl(real_dev->vlan_info);

        if (!info)
                return NULL;

        return vlan_group_get_device(&info->grp, vlan_proto, vlan_id);
}

/*
 * Derive a feature set from @real_dev's hw_enc_features.
 *
 * Only checksum, software-GSO and GSO-encapsulation bits are considered.
 * If at least one encapsulation-GSO bit and at least one checksum bit are
 * present, the checksum bits are replaced by NETIF_F_HW_CSUM and the result
 * is returned; otherwise 0 is returned.
 */
static inline netdev_features_t vlan_tnl_features(struct net_device *real_dev)
{
        const netdev_features_t wanted = NETIF_F_CSUM_MASK |
                                         NETIF_F_GSO_SOFTWARE |
                                         NETIF_F_GSO_ENCAP_ALL;
        netdev_features_t feats = real_dev->hw_enc_features & wanted;

        if (!(feats & NETIF_F_GSO_ENCAP_ALL) || !(feats & NETIF_F_CSUM_MASK))
                return 0;

        return (feats & ~NETIF_F_CSUM_MASK) | NETIF_F_HW_CSUM;
}

/*
 * Iterate over every non-NULL device stored in @grp, across all protocols.
 * @i is an integer lvalue used as the cursor; @dev receives each device.
 * Use as a loop header followed by a statement or block.
 *
 * Every use of @i is parenthesized so that an argument which is an
 * expression (for example a dereference) is compared and incremented as a
 * whole.
 */
#define vlan_group_for_each_dev(grp, i, dev) \
        for ((i) = 0; (i) < VLAN_PROTO_NUM * VLAN_N_VID; (i)++) \
                if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \
                                                            (i) % VLAN_N_VID)))

int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto);
void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto);

/* found in vlan_dev.c */
void vlan_dev_set_ingress_priority(const struct net_device *dev,
                                   u32 skb_prio, u16 vlan_prio);
int vlan_dev_set_egress_priority(const struct net_device *dev,
                                 u32 skb_prio, u16 vlan_prio);
void vlan_dev_free_egress_priority(const struct net_device *dev);
int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask);
void vlan_dev_get_realdev_name(const struct net_device *dev, char *result,
                               size_t size);

int vlan_check_real_dev(struct net_device *real_dev,
                        __be16 protocol, u16 vlan_id,
                        struct netlink_ext_ack *extack);
void vlan_setup(struct net_device *dev);
int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack);
void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
bool vlan_dev_inherit_address(struct net_device *dev,
                              struct net_device *real_dev);

/*
 * Map the 3-bit priority field of @vlan_tci to the value stored for it in
 * @dev's ingress priority map.
 */
static inline u32 vlan_get_ingress_priority(struct net_device *dev,
                                            u16 vlan_tci)
{
        const unsigned int prio = (vlan_tci >> VLAN_PRIO_SHIFT) & 0x7;

        return vlan_dev_priv(dev)->ingress_priority_map[prio];
}

/*
 * GVRP hooks.  With CONFIG_VLAN_8021Q_GVRP unset, the inline stubs below
 * do nothing, and those that return a value report success (0).
 */
#ifdef CONFIG_VLAN_8021Q_GVRP
int vlan_gvrp_request_join(const struct net_device *dev);
void vlan_gvrp_request_leave(const struct net_device *dev);
int vlan_gvrp_init_applicant(struct net_device *dev);
void vlan_gvrp_uninit_applicant(struct net_device *dev);
int vlan_gvrp_init(void);
void vlan_gvrp_uninit(void);
#else
static inline int vlan_gvrp_request_join(const struct net_device *dev) { return 0; }
static inline void vlan_gvrp_request_leave(const struct net_device *dev) {}
static inline int vlan_gvrp_init_applicant(struct net_device *dev) { return 0; }
static inline void vlan_gvrp_uninit_applicant(struct net_device *dev) {}
static inline int vlan_gvrp_init(void) { return 0; }
static inline void vlan_gvrp_uninit(void) {}
#endif

/*
 * MVRP hooks.  With CONFIG_VLAN_8021Q_MVRP unset, the inline stubs below
 * do nothing, and those that return a value report success (0).
 */
#ifdef CONFIG_VLAN_8021Q_MVRP
int vlan_mvrp_request_join(const struct net_device *dev);
void vlan_mvrp_request_leave(const struct net_device *dev);
int vlan_mvrp_init_applicant(struct net_device *dev);
void vlan_mvrp_uninit_applicant(struct net_device *dev);
int vlan_mvrp_init(void);
void vlan_mvrp_uninit(void);
#else
static inline int vlan_mvrp_request_join(const struct net_device *dev) { return 0; }
static inline void vlan_mvrp_request_leave(const struct net_device *dev) {}
static inline int vlan_mvrp_init_applicant(struct net_device *dev) { return 0; }
static inline void vlan_mvrp_uninit_applicant(struct net_device *dev) {}
static inline int vlan_mvrp_init(void) { return 0; }
static inline void vlan_mvrp_uninit(void) {}
#endif

extern const char vlan_fullname[];
extern const char vlan_version[];
int vlan_netlink_init(void);
void vlan_netlink_fini(void);

extern struct rtnl_link_ops vlan_link_ops;

extern unsigned int vlan_net_id;

struct proc_dir_entry;

/*
 * VLAN state bundle holding the procfs entries and the naming scheme.
 * NOTE(review): presumably one instance per network namespace, looked up
 * via vlan_net_id - confirm.
 */
struct vlan_net {
        /* /proc/net/vlan */
        struct proc_dir_entry *proc_vlan_dir;
        /* /proc/net/vlan/config */
        struct proc_dir_entry *proc_vlan_conf;
        /* Determines interface naming scheme. */
        unsigned short name_type;
};

#endif /* !(__BEN_VLAN_802_1Q_INC__) */





































































    2 








    2 
    2 































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


    1 











    1 













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
// SPDX-License-Identifier: GPL-2.0-or-later
/* Common capabilities, needed by capability.o.
 */

#include <linux/capability.h>
#include <linux/audit.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/lsm_hooks.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/ptrace.h>
#include <linux/xattr.h>
#include <linux/hugetlb.h>
#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/prctl.h>
#include <linux/securebits.h>
#include <linux/user_namespace.h>
#include <linux/binfmts.h>
#include <linux/personality.h>
#include <linux/mnt_idmapping.h>
#include <uapi/linux/lsm.h>

/*
 * A setuid-root binary run by a non-root user normally gets its
 * capabilities raised when SECURE_NOROOT is not in effect.  If the file
 * also has the effective (fE) flag set, the intent is that only the file
 * capabilities apply; the setuid-root bit is then there either to change
 * the uid or to give full privilege on a kernel without file-capability
 * support.  Capabilities are not raised in that case.
 *
 * Log a note about it for @fname, at most once per boot.
 */
static void warn_setuid_and_fcaps_mixed(const char *fname)
{
        static int warned;

        if (warned)
                return;

        printk(KERN_INFO "warning: `%s' has both setuid-root and"
                " effective capabilities. Therefore not raising all"
                " capabilities.\n", fname);
        warned = 1;
}

/**
 * cap_capable - Determine whether credentials carry an effective capability
 * @cred: The credentials to use
 * @targ_ns:  The user namespace in which we need the capability
 * @cap: The capability to check for
 * @opts: Bitmask of options defined in include/linux/security.h (not
 *        consulted by this function)
 *
 * Walk from @targ_ns up through its parents until the answer is known,
 * returning 0 if @cred has @cap there, -EPERM if it does not.
 *
 * NOTE WELL: cap_capable() cannot be used like the kernel's capable()
 * and has_capability() functions.  That is, it has the reverse semantics:
 * cap_capable() returns 0 when the capability is held, but the
 * kernel's capable() and has_capability() returns 1 for this case.
 */
int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
                int cap, unsigned int opts)
{
        struct user_namespace *cur;

        /*
         * Each step up is justified by: holding a capability in a parent
         * user namespace means holding it over all of its children.
         */
        for (cur = targ_ns; ; cur = cur->parent) {
                /* Reached the credentials' own namespace: check pE. */
                if (cur == cred->user_ns) {
                        if (cap_raised(cred->cap_effective, cap))
                                return 0;
                        return -EPERM;
                }

                /*
                 * Already at or above the level of the credentials'
                 * namespace without meeting it: nothing left to search.
                 */
                if (cur->level <= cred->user_ns->level)
                        return -EPERM;

                /*
                 * The owner of a user namespace, seen from that
                 * namespace's parent, has all caps in it.
                 */
                if (cur->parent == cred->user_ns &&
                    uid_eq(cur->owner, cred->euid))
                        return 0;
        }

        /* We never get here */
}

/**
 * cap_settime - Determine whether the current process may set the system clock
 * @ts: The time to set (not examined here)
 * @tz: The timezone to set (not examined here)
 *
 * The decision depends only on whether the caller has CAP_SYS_TIME.
 *
 * Return: 0 if permission is granted, -EPERM if denied.
 */
int cap_settime(const struct timespec64 *ts, const struct timezone *tz)
{
        return capable(CAP_SYS_TIME) ? 0 : -EPERM;
}

/**
 * cap_ptrace_access_check - Determine whether the current process may access
 *                           another
 * @child: The process to be accessed
 * @mode: The mode of attachment.
 *
 * Access is allowed when either of these holds:
 *  - the caller and @child share a user namespace and @child's permitted
 *    set is a subset of the caller's capabilities (the effective set when
 *    @mode has PTRACE_MODE_FSCREDS, the permitted set otherwise); or
 *  - the caller has CAP_SYS_PTRACE in @child's user namespace.
 *
 * Return: 0 if permission is granted, -EPERM if denied.
 */
int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
        const struct cred *tracer, *tracee;
        const kernel_cap_t *tracer_caps;
        int ret = -EPERM;

        rcu_read_lock();
        tracer = current_cred();
        tracee = __task_cred(child);

        tracer_caps = (mode & PTRACE_MODE_FSCREDS) ? &tracer->cap_effective
                                                   : &tracer->cap_permitted;

        /* ns_capable() is only consulted when the subset test fails. */
        if ((tracer->user_ns == tracee->user_ns &&
             cap_issubset(tracee->cap_permitted, *tracer_caps)) ||
            ns_capable(tracee->user_ns, CAP_SYS_PTRACE))
                ret = 0;

        rcu_read_unlock();
        return ret;
}

/**
 * cap_ptrace_traceme - Determine whether another process may trace the current
 * @parent: The task proposed to be the tracer
 *
 * Tracing is allowed when either of these holds:
 *  - @parent and the current task share a user namespace and the current
 *    task's permitted set is a subset of @parent's permitted set; or
 *  - @parent has CAP_SYS_PTRACE in the current task's user namespace.
 *
 * Return: 0 if permission is granted, -EPERM if denied.
 */
int cap_ptrace_traceme(struct task_struct *parent)
{
        const struct cred *tracer, *self;
        int ret = -EPERM;

        rcu_read_lock();
        tracer = __task_cred(parent);
        self = current_cred();

        /* has_ns_capability() is only consulted when the subset test fails. */
        if ((tracer->user_ns == self->user_ns &&
             cap_issubset(self->cap_permitted, tracer->cap_permitted)) ||
            has_ns_capability(parent, self->user_ns, CAP_SYS_PTRACE))
                ret = 0;

        rcu_read_unlock();
        return ret;
}

/**
 * cap_capget - Retrieve a task's capability sets
 * @target: The task from which to retrieve the capability sets
 * @effective: The place to record the effective set
 * @inheritable: The place to record the inheritable set
 * @permitted: The place to record the permitted set
 *
 * Copy the three sets out of @target's credentials while holding the RCU
 * read lock.
 *
 * Return: always 0.
 */
int cap_capget(const struct task_struct *target, kernel_cap_t *effective,
               kernel_cap_t *inheritable, kernel_cap_t *permitted)
{
        const struct cred *tcred;

        /* Derived from kernel/capability.c:sys_capget. */
        rcu_read_lock();

        tcred = __task_cred(target);
        *effective = tcred->cap_effective;
        *inheritable = tcred->cap_inheritable;
        *permitted = tcred->cap_permitted;

        rcu_read_unlock();

        return 0;
}

/*
 * Determine whether the inheritable capabilities are limited to the old
 * permitted set.  Returns 1 if they are limited, 0 if they are not.
 */
static inline int cap_inh_is_capped(void)
{
        /* they are so limited unless the current task has the CAP_SETPCAP
         * capability
         */
        if (cap_capable(current_cred(), current_cred()->user_ns,
                        CAP_SETPCAP, CAP_OPT_NONE) == 0)
                return 0;
        return 1;
}

/**
 * cap_capset - Validate and apply proposed changes to current's capabilities
 * @new: The proposed new credentials; alterations should be made here
 * @old: The current task's current credentials
 * @effective: A pointer to the proposed new effective capabilities set
 * @inheritable: A pointer to the proposed new inheritable capabilities set
 * @permitted: A pointer to the proposed new permitted capabilities set
 *
 * Check the proposed sets against @old and, if acceptable, store them in
 * @new.  Nothing is written to @new when a check fails with -EPERM.
 *
 * Return: 0 on success, -EPERM if a proposed set is not allowed, -EINVAL if
 * the resulting ambient set violates its invariant.
 */
int cap_capset(struct cred *new,
               const struct cred *old,
               const kernel_cap_t *effective,
               const kernel_cap_t *inheritable,
               const kernel_cap_t *permitted)
{
        kernel_cap_t ceiling;

        /* While capped, pI may only contain bits from old pI or old pP. */
        if (cap_inh_is_capped()) {
                ceiling = cap_combine(old->cap_inheritable,
                                      old->cap_permitted);
                if (!cap_issubset(*inheritable, ceiling))
                        return -EPERM;
        }

        /* no new pI capabilities outside bounding set */
        ceiling = cap_combine(old->cap_inheritable, old->cap_bset);
        if (!cap_issubset(*inheritable, ceiling))
                return -EPERM;

        /*
         * The new Permitted set may not exceed the old one, and the new
         * Effective set must be a subset of the new Permitted set.
         */
        if (!cap_issubset(*permitted, old->cap_permitted) ||
            !cap_issubset(*effective, *permitted))
                return -EPERM;

        new->cap_effective = *effective;
        new->cap_inheritable = *inheritable;
        new->cap_permitted = *permitted;

        /*
         * Ambient bits survive only while they remain both permitted and
         * inheritable.
         */
        new->cap_ambient = cap_intersect(new->cap_ambient,
                                         cap_intersect(*permitted,
                                                       *inheritable));

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EINVAL;

        return 0;
}

/**
 * cap_inode_need_killpriv - Determine if inode change affects privileges
 * @dentry: The inode/dentry being changed, with the change marked
 *          ATTR_KILL_PRIV
 *
 * Probe the size of the XATTR_NAME_CAPS extended attribute on @dentry
 * (no buffer is supplied, so nothing is copied out).
 *
 * Return: 1 if the probe reports a positive size, meaning inode_killpriv()
 * is required; 0 otherwise, including when the probe fails.
 */
int cap_inode_need_killpriv(struct dentry *dentry)
{
        struct inode *backing = d_backing_inode(dentry);

        return __vfs_getxattr(dentry, backing, XATTR_NAME_CAPS, NULL, 0) > 0;
}

/**
 * cap_inode_killpriv - Erase the security markings on an inode
 *
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        The inode/dentry to alter
 *
 * Remove the security.capability xattr, i.e. the privilege-enhancing
 * security marking, from the inode behind @dentry.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 if successful (a removal that fails with -EOPNOTSUPP is also
 * reported as success), -ve on any other error.
 */
int cap_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry)
{
        int rc = __vfs_removexattr(idmap, dentry, XATTR_NAME_CAPS);

        /* -EOPNOTSUPP is deliberately not treated as a failure */
        return rc == -EOPNOTSUPP ? 0 : rc;
}

/*
 * Walk from the current user namespace up through its ancestors and report
 * whether @rootvfsuid maps to uid 0 in any of them.  The walk ends at
 * init_user_ns, which is still checked itself.
 */
static bool rootid_owns_currentns(vfsuid_t rootvfsuid)
{
        struct user_namespace *cursor;
        kuid_t root_kuid;

        if (!vfsuid_valid(rootvfsuid))
                return false;

        root_kuid = vfsuid_into_kuid(rootvfsuid);
        cursor = current_user_ns();
        while (from_kuid(cursor, root_kuid) != 0) {
                /* reached the top of the hierarchy without a match */
                if (cursor == &init_user_ns)
                        return false;
                cursor = cursor->parent;
        }

        return true;
}

/* Strip the VFS_CAP_FLAGS_EFFECTIVE bit from a magic_etc word. */
static __u32 sansflags(__u32 m)
{
        const __u32 flag_bits = VFS_CAP_FLAGS_EFFECTIVE;

        return m & ~flag_bits;
}

/*
 * True when @size is exactly the v2 xattr size and the magic word, with the
 * flag bits removed, is VFS_CAP_REVISION_2.  @cap is only read once the
 * size has matched.
 */
static bool is_v2header(int size, const struct vfs_cap_data *cap)
{
        return size == XATTR_CAPS_SZ_2 &&
               sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
}

/*
 * True when @size is exactly the v3 xattr size and the magic word, with the
 * flag bits removed, is VFS_CAP_REVISION_3.  @cap is only read once the
 * size has matched.
 */
static bool is_v3header(int size, const struct vfs_cap_data *cap)
{
        return size == XATTR_CAPS_SZ_3 &&
               sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
}

/*
 * getsecurity: We are called for security.* before any attempt to read the
 * xattr from the inode itself.
 *
 * This gives us a chance to read the on-disk value and convert it.  If we
 * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
 *
 * Note we are not called by vfs_getxattr_alloc(), but that is only called
 * by the integrity subsystem, which really wants the unconverted values -
 * so that's good.
 *
 * @idmap:  idmap handed to vfs_getxattr_alloc() and used to shift the root
 *          kuid for idmapped mounts
 * @inode:  inode whose security.capability xattr is read
 * @name:   xattr name; anything other than "capability" yields -EOPNOTSUPP
 * @buffer: only written when @alloc is true; receives a kmalloc'ed buffer
 *          holding the v2 or v3 value (presumably freed by the caller -
 *          NOTE(review): confirm against the callers)
 * @alloc:  true if the caller wants the value, false if only the size
 *
 * Returns the size of the value as it should appear in the caller's user
 * namespace, or a negative errno: -EINVAL if no dentry alias exists or the
 * stored header is neither v2 nor v3, -EOVERFLOW if the stored root id does
 * not own the current namespace, -ENOMEM if a conversion buffer cannot be
 * allocated, or the error returned by vfs_getxattr_alloc().
 */
int cap_inode_getsecurity(struct mnt_idmap *idmap,
                          struct inode *inode, const char *name, void **buffer,
                          bool alloc)
{
        int size;
        kuid_t kroot;
        vfsuid_t vfsroot;
        u32 nsmagic, magic;
        uid_t root, mappedroot;
        char *tmpbuf = NULL;
        struct vfs_cap_data *cap;
        struct vfs_ns_cap_data *nscap = NULL;
        struct dentry *dentry;
        struct user_namespace *fs_ns;

        if (strcmp(name, "capability") != 0)
                return -EOPNOTSUPP;

        /* a dentry is needed only for the xattr read; drop it right after */
        dentry = d_find_any_alias(inode);
        if (!dentry)
                return -EINVAL;
        size = vfs_getxattr_alloc(idmap, dentry, XATTR_NAME_CAPS, &tmpbuf,
                                  sizeof(struct vfs_ns_cap_data), GFP_NOFS);
        dput(dentry);
        /* gcc11 complains if we don't check for !tmpbuf */
        if (size < 0 || !tmpbuf)
                goto out_free;

        fs_ns = inode->i_sb->s_user_ns;
        cap = (struct vfs_cap_data *) tmpbuf;
        /* nscap stays NULL for a v2 value; later code uses that to tell them apart */
        if (is_v2header(size, cap)) {
                root = 0;
        } else if (is_v3header(size, cap)) {
                nscap = (struct vfs_ns_cap_data *) tmpbuf;
                root = le32_to_cpu(nscap->rootid);
        } else {
                size = -EINVAL;
                goto out_free;
        }

        kroot = make_kuid(fs_ns, root);

        /* If this is an idmapped mount shift the kuid. */
        vfsroot = make_vfsuid(idmap, fs_ns, kroot);

        /* If the root kuid maps to a valid uid in current ns, then return
         * this as a nscap. */
        mappedroot = from_kuid(current_user_ns(), vfsuid_into_kuid(vfsroot));
        if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
                size = sizeof(struct vfs_ns_cap_data);
                if (alloc) {
                        if (!nscap) {
                                /* v2 -> v3 conversion */
                                nscap = kzalloc(size, GFP_ATOMIC);
                                if (!nscap) {
                                        size = -ENOMEM;
                                        goto out_free;
                                }
                                nsmagic = VFS_CAP_REVISION_3;
                                magic = le32_to_cpu(cap->magic_etc);
                                if (magic & VFS_CAP_FLAGS_EFFECTIVE)
                                        nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
                                memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
                                nscap->magic_etc = cpu_to_le32(nsmagic);
                                /* tmpbuf still holds the v2 value and is freed at out_free */
                        } else {
                                /* use allocated v3 buffer */
                                /* clearing tmpbuf keeps out_free from freeing what is returned */
                                tmpbuf = NULL;
                        }
                        /* report the root id as seen from the current namespace */
                        nscap->rootid = cpu_to_le32(mappedroot);
                        *buffer = nscap;
                }
                goto out_free;
        }

        if (!rootid_owns_currentns(vfsroot)) {
                size = -EOVERFLOW;
                goto out_free;
        }

        /* This comes from a parent namespace.  Return as a v2 capability */
        size = sizeof(struct vfs_cap_data);
        if (alloc) {
                if (nscap) {
                        /* v3 -> v2 conversion */
                        cap = kzalloc(size, GFP_ATOMIC);
                        if (!cap) {
                                size = -ENOMEM;
                                goto out_free;
                        }
                        magic = VFS_CAP_REVISION_2;
                        nsmagic = le32_to_cpu(nscap->magic_etc);
                        if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
                                magic |= VFS_CAP_FLAGS_EFFECTIVE;
                        memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
                        cap->magic_etc = cpu_to_le32(magic);
                        /* tmpbuf still holds the v3 value and is freed at out_free */
                } else {
                        /* use unconverted v2 */
                        /* clearing tmpbuf keeps out_free from freeing what is returned */
                        tmpbuf = NULL;
                }
                *buffer = cap;
        }
out_free:
        /* frees the raw xattr copy unless ownership moved to *buffer above */
        kfree(tmpbuf);
        return size;
}

/**
 * rootid_from_xattr - translate root uid of vfs caps
 *
 * @value:        vfs caps value to read the root uid from (not modified)
 * @size:        size of @value
 * @task_ns:        user namespace of the caller
 *
 * Only a value of size XATTR_CAPS_SZ_3 carries a root uid; for any other
 * size the root uid is taken to be 0.
 *
 * Return: the root uid interpreted in @task_ns, as a vfsuid.
 */
static vfsuid_t rootid_from_xattr(const void *value, size_t size,
                                  struct user_namespace *task_ns)
{
        uid_t rootid;

        if (size == XATTR_CAPS_SZ_3) {
                const struct vfs_ns_cap_data *nscap = value;

                rootid = le32_to_cpu(nscap->rootid);
        } else {
                rootid = 0;
        }

        return VFSUIDT_INIT(make_kuid(task_ns, rootid));
}

/*
 * Check that @cap, of length @size, carries a recognised v2 or v3 header.
 *
 * The length is compared as a size_t first: is_v2header() and is_v3header()
 * take an int, so without this guard an oversized length could be narrowed
 * into one of the two accepted sizes.  @cap is only read once the length is
 * known to be one of those sizes.
 */
static bool validheader(size_t size, const struct vfs_cap_data *cap)
{
        if (size != XATTR_CAPS_SZ_2 && size != XATTR_CAPS_SZ_3)
                return false;

        return is_v2header(size, cap) || is_v3header(size, cap);
}

/**
 * cap_convert_nscap - check vfs caps
 *
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        used to retrieve inode to check permissions on
 * @ivalue:        vfs caps value; on conversion it is redirected to a newly
 *                allocated v3 value
 * @size:        size of @ivalue
 *
 * User requested a write of security.capability.  If needed, update the
 * xattr to change from v2 to v3, or to fixup the v3 rootid.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Return: On success, return the new size; on error, return < 0
 * (-EINVAL for a missing/invalid value or an unmappable root id, -EPERM
 * without CAP_SETFCAP over the inode, -ENOMEM on allocation failure).
 */
int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry,
                      const void **ivalue, size_t size)
{
        const struct vfs_cap_data *cap = *ivalue;
        struct inode *inode = d_backing_inode(dentry);
        struct user_namespace *task_ns = current_user_ns();
        struct user_namespace *fs_ns = inode->i_sb->s_user_ns;
        struct vfs_ns_cap_data *nscap;
        vfsuid_t vfsrootid;
        kuid_t rootid;
        uid_t nsrootid;
        __u32 nsmagic;

        if (!cap || !validheader(size, cap))
                return -EINVAL;
        if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP))
                return -EPERM;

        /* user is privileged, just write the v2 */
        if (size == XATTR_CAPS_SZ_2 && idmap == &nop_mnt_idmap &&
            ns_capable(fs_ns, CAP_SETFCAP))
                return size;

        /* root id as named by the caller, interpreted in the caller's ns */
        vfsrootid = rootid_from_xattr(cap, size, task_ns);
        if (!vfsuid_valid(vfsrootid))
                return -EINVAL;

        /* undo any idmapped-mount shift to get the filesystem-side kuid */
        rootid = from_vfsuid(idmap, fs_ns, vfsrootid);
        if (!uid_valid(rootid))
                return -EINVAL;

        /* the value stored on disk is expressed in the filesystem's ns */
        nsrootid = from_kuid(fs_ns, rootid);
        if (nsrootid == (uid_t)-1)
                return -EINVAL;

        nscap = kmalloc(sizeof(*nscap), GFP_ATOMIC);
        if (!nscap)
                return -ENOMEM;

        /* always emit a v3 header, carrying over the effective flag */
        nsmagic = VFS_CAP_REVISION_3;
        if (le32_to_cpu(cap->magic_etc) & VFS_CAP_FLAGS_EFFECTIVE)
                nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;

        nscap->magic_etc = cpu_to_le32(nsmagic);
        memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
        nscap->rootid = cpu_to_le32(nsrootid);

        *ivalue = nscap;
        return sizeof(*nscap);
}

/*
 * Calculate the new process capability sets from the capability sets attached
 * to a file.
 *
 * Sets *effective when the file has the effective flag and *has_fcap when
 * the magic word has any revision bits.  Neither flag is ever cleared here.
 * Returns -EPERM only when the effective flag is in force and some
 * file-permitted capability could not be granted; 0 otherwise.
 */
static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
                                          struct linux_binprm *bprm,
                                          bool *effective,
                                          bool *has_fcap)
{
        struct cred *cred = bprm->cred;

        if (caps->magic_etc & VFS_CAP_REVISION_MASK)
                *has_fcap = true;

        if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
                *effective = true;

        /*
         * pP' = (X & fP) | (pI & fI)
         * The addition of pA' is handled later.
         */
        cred->cap_permitted.val =
                (cred->cap_inheritable.val & caps->inheritable.val) |
                (cred->cap_bset.val & caps->permitted.val);

        /*
         * For legacy apps, with no internal support for recognizing they
         * do not have enough capabilities, we return an error if they are
         * missing some "forced" (aka file-permitted) capabilities.
         */
        if (!*effective)
                return 0;

        /* insufficient to execute correctly if any fP bit was not granted */
        return (caps->permitted.val & ~cred->cap_permitted.val) ? -EPERM : 0;
}

/**
 * get_vfs_caps_from_disk - retrieve vfs caps from disk
 *
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        dentry from which @inode is retrieved
 * @cpu_caps:        vfs capabilities
 *
 * Extract the on-exec-apply capability sets for an executable file.
 * @cpu_caps is zeroed first, so it holds no stale data on any return path.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 on success; -ENODATA if there is no inode, no xattr, or the
 * stored root id is invalid or does not own the current user namespace;
 * -EINVAL if the xattr is malformed; any other error from __vfs_getxattr().
 */
int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
                           const struct dentry *dentry,
                           struct cpu_vfs_cap_data *cpu_caps)
{
        struct vfs_ns_cap_data data, *nscaps = &data;
        struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
        struct inode *inode = d_backing_inode(dentry);
        struct user_namespace *fs_ns;
        vfsuid_t rootvfsuid;
        kuid_t rootkuid;
        __u32 magic_etc, revision;
        __u32 rootid = 0;
        int size;

        memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));

        if (!inode)
                return -ENODATA;

        fs_ns = inode->i_sb->s_user_ns;
        size = __vfs_getxattr((struct dentry *)dentry, inode,
                              XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
        /* no data, that's ok */
        if (size == -ENODATA || size == -EOPNOTSUPP)
                return -ENODATA;

        if (size < 0)
                return size;

        /* need at least the magic word before looking at the revision */
        if (size < sizeof(magic_etc))
                return -EINVAL;

        magic_etc = le32_to_cpu(caps->magic_etc);
        cpu_caps->magic_etc = magic_etc;
        revision = magic_etc & VFS_CAP_REVISION_MASK;

        /* each revision has exactly one valid size; only v3 stores a rootid */
        if (revision == VFS_CAP_REVISION_1) {
                if (size != XATTR_CAPS_SZ_1)
                        return -EINVAL;
        } else if (revision == VFS_CAP_REVISION_2) {
                if (size != XATTR_CAPS_SZ_2)
                        return -EINVAL;
        } else if (revision == VFS_CAP_REVISION_3) {
                if (size != XATTR_CAPS_SZ_3)
                        return -EINVAL;
                rootid = le32_to_cpu(nscaps->rootid);
        } else {
                return -EINVAL;
        }

        rootkuid = make_kuid(fs_ns, rootid);
        rootvfsuid = make_vfsuid(idmap, fs_ns, rootkuid);
        if (!vfsuid_valid(rootvfsuid))
                return -ENODATA;

        /* Limit the caps to the mounter of the filesystem
         * or the more limited uid specified in the xattr.
         */
        if (!rootid_owns_currentns(rootvfsuid))
                return -ENODATA;

        cpu_caps->permitted.val = le32_to_cpu(caps->data[0].permitted);
        cpu_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable);

        /*
         * Rev1 had just a single 32-bit word, later expanded
         * to a second one for the high bits
         */
        if (revision != VFS_CAP_REVISION_1) {
                cpu_caps->permitted.val |=
                        (u64)le32_to_cpu(caps->data[1].permitted) << 32;
                cpu_caps->inheritable.val |=
                        (u64)le32_to_cpu(caps->data[1].inheritable) << 32;
        }

        cpu_caps->permitted.val &= CAP_VALID_MASK;
        cpu_caps->inheritable.val &= CAP_VALID_MASK;

        cpu_caps->rootid = vfsuid_into_kuid(rootvfsuid);

        return 0;
}

/*
 * Attempt to get the on-exec apply capability sets for an executable file from
 * its xattrs and, if present, apply them to the proposed credentials being
 * constructed by execve().
 *
 * The proposed permitted set is cleared up front and cleared again whenever
 * a non-zero result is returned.  A file without capability data is not an
 * error.
 */
static int get_file_caps(struct linux_binprm *bprm, const struct file *file,
                         bool *effective, bool *has_fcap)
{
        struct cpu_vfs_cap_data vcaps;
        int rc;

        cap_clear(bprm->cred->cap_permitted);

        if (!file_caps_enabled)
                return 0;

        if (!mnt_may_suid(file->f_path.mnt))
                return 0;

        /*
         * This check is redundant with mnt_may_suid() but is kept to make
         * explicit that capability bits are limited to s_user_ns and its
         * descendants.
         */
        if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
                return 0;

        rc = get_vfs_caps_from_disk(file_mnt_idmap(file),
                                    file->f_path.dentry, &vcaps);

        /* nothing stored on the file: leave the cleared permitted set alone */
        if (rc == -ENODATA)
                return 0;

        if (rc == -EINVAL)
                printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
                                bprm->filename);

        if (rc >= 0)
                rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap);

        if (rc)
                cap_clear(bprm->cred->cap_permitted);

        return rc;
}

/* Root keeps its special treatment unless SECURE_NOROOT is set. */
static inline bool root_privileged(void)
{
        return !issecure(SECURE_NOROOT);
}

/* Does @cred have @uid as its real uid? */
static inline bool __is_real(kuid_t uid, struct cred *cred)
{
        return uid_eq(cred->uid, uid);
}

/* Does @cred have @uid as its effective uid? */
static inline bool __is_eff(kuid_t uid, struct cred *cred)
{
        return uid_eq(cred->euid, uid);
}

/* @uid is the effective uid of @cred but not its real uid. */
static inline bool __is_suid(kuid_t uid, struct cred *cred)
{
        return __is_eff(uid, cred) && !__is_real(uid, cred);
}

/*
 * handle_privileged_root - Handle case of privileged root
 * @bprm: The execution parameters, including the proposed creds
 * @has_fcap: Are any file capabilities set?
 * @effective: Do we have effective root privilege?
 * @root_uid: This namespace' root UID WRT initial USER namespace
 *
 * Handle the case where root is privileged and hasn't been neutered by
 * SECURE_NOROOT.  If file capabilities are set, they won't be combined with
 * set UID root and nothing is changed.  If we are root, cap_permitted is
 * updated.  If we have become set UID root, the effective bit is set.
 */
static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap,
                                   bool *effective, kuid_t root_uid)
{
        const struct cred *old = current_cred();
        struct cred *new = bprm->cred;
        bool eff_root, real_root;

        if (!root_privileged())
                return;

        eff_root = __is_eff(root_uid, new);
        real_root = __is_real(root_uid, new);

        /*
         * If the legacy file capability is set, then don't set privs
         * for a setuid root binary run by a non-root user.  Do set it
         * for a root user just to cause least surprise to an admin.
         */
        if (has_fcap && eff_root && !real_root) {
                warn_setuid_and_fcaps_mixed(bprm->filename);
                return;
        }

        /*
         * To support inheritance of root-permissions and suid-root
         * executables under compatibility mode, we override the
         * capability sets for the file.
         */
        if (eff_root || real_root) {
                /* pP' = (cap_bset & ~0) | (pI & ~0) */
                new->cap_permitted = cap_combine(old->cap_bset,
                                                 old->cap_inheritable);
        }

        /*
         * If only the real uid is 0, we do not set the effective bit.
         */
        if (eff_root)
                *effective = true;
}

/*
 * Capability-set comparison helpers.  The field/target/source arguments are
 * pasted onto "cap_" to name a member of the given cred.
 *
 * __cap_gained(field, target, source): target's cap_<field> is not a subset
 *        of source's cap_<field>.
 * __cap_grew(target, source, cred): within one cred, cap_<target> is not a
 *        subset of cap_<source>.
 * __cap_full(field, cred): cred's cap_<field> contains all of CAP_FULL_SET.
 */
#define __cap_gained(field, target, source) \
        !cap_issubset(target->cap_##field, source->cap_##field)
#define __cap_grew(target, source, cred) \
        !cap_issubset(cred->cap_##target, cred->cap_##source)
#define __cap_full(field, cred) \
        cap_issubset(CAP_FULL_SET, cred->cap_##field)

/* The proposed effective uid differs from the old real uid. */
static inline bool __is_setuid(struct cred *new, const struct cred *old)
{
        return !uid_eq(new->euid, old->uid);
}

/* The proposed effective gid differs from the old real gid. */
static inline bool __is_setgid(struct cred *new, const struct cred *old)
{
        return !gid_eq(new->egid, old->gid);
}

/*
 * 1) Audit candidate if current->cap_effective is set
 *
 * We do not bother to audit if 3 things are true:
 *   1) cap_effective has all caps
 *   2) we became root *OR* were already root
 *   3) root is supposed to have all caps (SECURE_NOROOT is not set)
 * Since this is just a normal root execing a process.
 *
 * Number 1 above might fail if you don't have a full bset, but I think
 * that is interesting information to audit.
 *
 * A number of other conditions require logging:
 * 2) something prevented setuid root getting all caps
 * 3) non-setuid root gets fcaps
 * 4) non-setuid root gets ambient
 */
static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old,
                                     kuid_t root, bool has_fcap)
{
        bool full_pE = __cap_full(effective, new);
        bool root_ids = __is_eff(root, new) || __is_real(root, new);

        /* 1) pE exceeds pA, and this is not simply privileged root with everything */
        if (__cap_grew(effective, ambient, new) &&
            !(full_pE && root_ids && root_privileged()))
                return true;

        /* 2) setuid root that did not end up with a full effective set */
        if (root_privileged() && __is_suid(root, new) && !full_pE)
                return true;

        /* 3) and 4) no uid transition, yet permitted (via fcaps) or ambient grew */
        if (!__is_setuid(new, old) &&
            ((has_fcap && __cap_gained(permitted, new, old)) ||
             __cap_gained(ambient, new, old)))
                return true;

        return false;
}

/**
 * cap_bprm_creds_from_file - Set up the proposed credentials for execve().
 * @bprm: The execution parameters, including the proposed creds
 * @file: The file to pull the credentials from
 *
 * Set up the proposed credentials for a new execution context being
 * constructed by execve().  The proposed creds in @bprm->cred is altered,
 * which won't take effect immediately.
 *
 * Besides the creds, this may set PER_CLEAR_ON_SETID in @bprm->per_clear and
 * set @bprm->secureexec.
 *
 * Return: 0 if successful, -ve on error (-EPERM if the ambient invariant
 * does not hold for the old or new creds; otherwise whatever
 * get_file_caps() or audit_log_bprm_fcaps() returned).
 */
int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file)
{
        /* Process setpcap binaries and capabilities for uid 0 */
        const struct cred *old = current_cred();
        struct cred *new = bprm->cred;
        bool effective = false, has_fcap = false, is_setid;
        int ret;
        kuid_t root_uid;

        if (WARN_ON(!cap_ambient_invariant_ok(old)))
                return -EPERM;

        /* resets new->cap_permitted, then applies any file capabilities */
        ret = get_file_caps(bprm, file, &effective, &has_fcap);
        if (ret < 0)
                return ret;

        root_uid = make_kuid(new->user_ns, 0);

        /* may replace cap_permitted and set 'effective' for privileged root */
        handle_privileged_root(bprm, has_fcap, &effective, root_uid);

        /* if we have fs caps, clear dangerous personality flags */
        if (__cap_gained(permitted, new, old))
                bprm->per_clear |= PER_CLEAR_ON_SETID;

        /* Don't let someone trace a set[ug]id/setpcap binary with the revised
         * credentials unless they have the appropriate permit.
         *
         * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
         */
        is_setid = __is_setuid(new, old) || __is_setgid(new, old);

        if ((is_setid || __cap_gained(permitted, new, old)) &&
            ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
             !ptracer_capable(current, new->user_ns))) {
                /* downgrade; they get no more than they had, and maybe less */
                if (!ns_capable(new->user_ns, CAP_SETUID) ||
                    (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
                        new->euid = new->uid;
                        new->egid = new->gid;
                }
                new->cap_permitted = cap_intersect(new->cap_permitted,
                                                   old->cap_permitted);
        }

        /* saved and filesystem ids follow the (possibly downgraded) effective ids */
        new->suid = new->fsuid = new->euid;
        new->sgid = new->fsgid = new->egid;

        /* File caps or setid cancels ambient. */
        /* is_setid was computed before the downgrade above and is not recomputed */
        if (has_fcap || is_setid)
                cap_clear(new->cap_ambient);

        /*
         * Now that we've computed pA', update pP' to give:
         *   pP' = (X & fP) | (pI & fI) | pA'
         */
        new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);

        /*
         * Set pE' = (fE ? pP' : pA').  Because pA' is zero if fE is set,
         * this is the same as pE' = (fE ? pP' : 0) | pA'.
         */
        if (effective)
                new->cap_effective = new->cap_permitted;
        else
                new->cap_effective = new->cap_ambient;

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EPERM;

        if (nonroot_raised_pE(new, old, root_uid, has_fcap)) {
                ret = audit_log_bprm_fcaps(bprm, new, old);
                if (ret < 0)
                        return ret;
        }

        /* the SECURE_KEEP_CAPS securebit is always dropped from the new creds */
        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EPERM;

        /* Check for privilege-elevated exec. */
        if (is_setid ||
            (!__is_real(root_uid, new) &&
             (effective ||
              __cap_grew(permitted, ambient, new))))
                bprm->secureexec = 1;

        return 0;
}

/**
 * cap_inode_setxattr - Determine whether an xattr may be altered
 * @dentry: The inode/dentry being altered
 * @name: The name of the xattr to be changed
 * @value: The value that the xattr will be changed to
 * @size: The size of value
 * @flags: The replacement flag
 *
 * Determine whether an xattr may be altered or set on an inode, returning 0 if
 * permission is granted, -ve if denied.
 *
 * This is used to make sure security xattrs don't get updated or set by those
 * who aren't privileged to do so.  Only @dentry and @name are consulted.
 */
int cap_inode_setxattr(struct dentry *dentry, const char *name,
                       const void *value, size_t size, int flags)
{
        struct user_namespace *sb_userns = dentry->d_sb->s_user_ns;
        bool is_security_xattr = strncmp(name, XATTR_SECURITY_PREFIX,
                                         XATTR_SECURITY_PREFIX_LEN) == 0;

        /* Ignore non-security xattrs */
        if (!is_security_xattr)
                return 0;

        /*
         * For XATTR_NAME_CAPS the check will be done in
         * cap_convert_nscap(), called by setxattr()
         */
        if (strcmp(name, XATTR_NAME_CAPS) == 0)
                return 0;

        /* every other security.* xattr needs CAP_SYS_ADMIN over the sb's ns */
        return ns_capable(sb_userns, CAP_SYS_ADMIN) ? 0 : -EPERM;
}

/**
 * cap_inode_removexattr - Determine whether an xattr may be removed
 *
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        The inode/dentry being altered
 * @name:        The name of the xattr to be changed
 *
 * Determine whether an xattr may be removed from an inode, returning 0 if
 * permission is granted, -ve if denied.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * This is used to make sure security xattrs don't get removed by those who
 * aren't privileged to remove them.
 */
int cap_inode_removexattr(struct mnt_idmap *idmap,
                          struct dentry *dentry, const char *name)
{
        struct user_namespace *sb_userns = dentry->d_sb->s_user_ns;
        struct inode *inode;

        /* Ignore non-security xattrs */
        if (strncmp(name, XATTR_SECURITY_PREFIX,
                        XATTR_SECURITY_PREFIX_LEN) != 0)
                return 0;

        /* anything but security.capability needs CAP_SYS_ADMIN over the sb's ns */
        if (strcmp(name, XATTR_NAME_CAPS) != 0)
                return ns_capable(sb_userns, CAP_SYS_ADMIN) ? 0 : -EPERM;

        /* security.capability gets namespaced */
        inode = d_backing_inode(dentry);
        if (!inode)
                return -EINVAL;

        return capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP) ? 0 : -EPERM;
}

/*
 * cap_emulate_setxuid() fixes the effective / permitted capabilities of
 * a process after a call to setuid, setreuid, or setresuid.
 *
 *  1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
 *  {r,e,s}uid != 0, the permitted and effective capabilities are
 *  cleared.
 *
 *  2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
 *  capabilities of the process are cleared.
 *
 *  3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
 *  capabilities are set to the permitted capabilities.
 *
 *  fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
 *  never happen.
 *
 *  -astor
 *
 * cevans - New behaviour, Oct '99
 * A process may, via prctl(), elect to keep its capabilities when it
 * calls setuid() and switches away from uid==0. Both permitted and
 * effective sets will be retained.
 * Without this change, it was impossible for a daemon to drop only some
 * of its privilege. The call to setuid(!=0) would drop all privileges!
 * Keeping uid 0 is not an option because uid 0 owns too many vital
 * files..
 * Thanks to Olaf Kirch and Peter Benie for spotting this.
 */
static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
{
        kuid_t root_uid = make_kuid(old->user_ns, 0);

        if ((uid_eq(old->uid, root_uid) ||
             uid_eq(old->euid, root_uid) ||
             uid_eq(old->suid, root_uid)) &&
            (!uid_eq(new->uid, root_uid) &&
             !uid_eq(new->euid, root_uid) &&
             !uid_eq(new->suid, root_uid))) {
                if (!issecure(SECURE_KEEP_CAPS)) {
                        cap_clear(new->cap_permitted);
                        cap_clear(new->cap_effective);
                }

                /*
                 * Pre-ambient programs expect setresuid to nonroot followed
                 * by exec to drop capabilities.  We should make sure that
                 * this remains the case.
                 */
                cap_clear(new->cap_ambient);
        }
        if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
                cap_clear(new->cap_effective);
        if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid))
                new->cap_effective = new->cap_permitted;
}

/**
 * cap_task_fix_setuid - Fix up the results of setuid() call
 * @new: The proposed credentials
 * @old: The current task's current credentials
 * @flags: Indications of what has changed
 *
 * Fix up the results of setuid() call before the credential changes are
 * actually applied.
 *
 * Return: 0 to grant the changes, -ve to deny them.
 */
int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
{
        switch (flags) {
        case LSM_SETID_RE:
        case LSM_SETID_ID:
        case LSM_SETID_RES:
                /* juggle the capabilities to follow [RES]UID changes unless
                 * otherwise suppressed */
                if (!issecure(SECURE_NO_SETUID_FIXUP))
                        cap_emulate_setxuid(new, old);
                break;

        case LSM_SETID_FS:
                /* juggle the capabilities to follow FSUID changes, unless
                 * otherwise suppressed
                 *
                 * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
                 *          if not, we might be a bit too harsh here.
                 */
                if (!issecure(SECURE_NO_SETUID_FIXUP)) {
                        kuid_t root_uid = make_kuid(old->user_ns, 0);
                        if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid))
                                new->cap_effective =
                                        cap_drop_fs_set(new->cap_effective);

                        if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid))
                                new->cap_effective =
                                        cap_raise_fs_set(new->cap_effective,
                                                         new->cap_permitted);
                }
                break;

        default:
                return -EINVAL;
        }

        return 0;
}

/*
 * Rationale: code calling task_setscheduler, task_setioprio, and
 * task_setnice, assumes that
 *   . if capable(cap_sys_nice), then those actions should be allowed
 *   . if not capable(cap_sys_nice), but acting on your own processes,
 *           then those actions should be allowed
 * This is insufficient now since you can call code without suid, but
 * yet with increased caps.
 * So we check for increased caps on the target process.
 */
static int cap_safe_nice(struct task_struct *p)
{
        int rc = 0;

        /* the target's creds are only looked at inside the RCU read section */
        rcu_read_lock();
        if (!cap_issubset(__task_cred(p)->cap_permitted,
                          current_cred()->cap_permitted) &&
            !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
                /* target holds caps we lack and we have no CAP_SYS_NICE over it */
                rc = -EPERM;
        }
        rcu_read_unlock();

        return rc;
}

/**
 * cap_task_setscheduler - Determine if scheduler policy change is permitted
 * @p: The task to affect
 *
 * Determine if the requested scheduler policy change is permitted for the
 * specified task.  The decision is delegated entirely to cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setscheduler(struct task_struct *p)
{
        return cap_safe_nice(p);
}

/**
 * cap_task_setioprio - Determine if I/O priority change is permitted
 * @p: The task to affect
 * @ioprio: The I/O priority to set (not consulted by this check)
 *
 * Determine if the requested I/O priority change is permitted for the specified
 * task.  The decision is delegated to cap_safe_nice() and depends only on @p.
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setioprio(struct task_struct *p, int ioprio)
{
        return cap_safe_nice(p);
}

/**
 * cap_task_setnice - Determine if task priority change is permitted
 * @p: The task to affect
 * @nice: The nice value to set (not consulted by this check)
 *
 * Determine if the requested task priority change is permitted for the
 * specified task.  The decision is delegated to cap_safe_nice() and depends
 * only on @p.
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setnice(struct task_struct *p, int nice)
{
        return cap_safe_nice(p);
}

/*
 * Implement PR_CAPBSET_DROP.  Attempt to remove the specified capability from
 * the current task's bounding set.  Returns 0 on success, -ve on error.
 */
static int cap_prctl_drop(unsigned long cap)
{
        struct cred *new;

        if (!ns_capable(current_user_ns(), CAP_SETPCAP))
                return -EPERM;
        if (!cap_valid(cap))
                return -EINVAL;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;
        cap_lower(new->cap_bset, cap);
        return commit_creds(new);
}

/**
 * cap_task_prctl - Implement process control functions for this security module
 * @option: The process control function requested
 * @arg2: The argument data for this function
 * @arg3: The argument data for this function
 * @arg4: The argument data for this function
 * @arg5: The argument data for this function
 *
 * Allow process control functions (sys_prctl()) to alter capabilities; may
 * also deny access to other functions not otherwise implemented here.
 *
 * Options handled here: PR_CAPBSET_READ, PR_CAPBSET_DROP, PR_SET_SECUREBITS,
 * PR_GET_SECUREBITS, PR_GET_KEEPCAPS, PR_SET_KEEPCAPS and PR_CAP_AMBIENT.
 * Every option that modifies state does so on a copy obtained from
 * prepare_creds() and finishes with commit_creds().
 *
 * Return: 0 or +ve on success, -ENOSYS if this function is not implemented
 * here, other -ve on error (-EINVAL for a malformed argument, -EPERM when the
 * change is not allowed, -ENOMEM when new credentials cannot be prepared).
 * If -ENOSYS is returned, sys_prctl() and other LSM modules will consider
 * performing the function.
 */
int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                   unsigned long arg4, unsigned long arg5)
{
        const struct cred *old = current_cred();
        struct cred *new;

        switch (option) {
        case PR_CAPBSET_READ:
                /* Report (as 0 or 1) whether arg2 is in the bounding set. */
                if (!cap_valid(arg2))
                        return -EINVAL;
                return !!cap_raised(old->cap_bset, arg2);

        case PR_CAPBSET_DROP:
                return cap_prctl_drop(arg2);

        /*
         * The next four prctl's remain to assist with transitioning a
         * system from legacy UID=0 based privilege (when filesystem
         * capabilities are not in use) to a system using filesystem
         * capabilities only - as the POSIX.1e draft intended.
         *
         * Note:
         *
         *  PR_SET_SECUREBITS =
         *      issecure_mask(SECURE_KEEP_CAPS_LOCKED)
         *    | issecure_mask(SECURE_NOROOT)
         *    | issecure_mask(SECURE_NOROOT_LOCKED)
         *    | issecure_mask(SECURE_NO_SETUID_FIXUP)
         *    | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
         *
         * will ensure that the current process and all of its
         * children will be locked into a pure
         * capability-based-privilege environment.
         */
        case PR_SET_SECUREBITS:
                if ((((old->securebits & SECURE_ALL_LOCKS) >> 1)
                     & (old->securebits ^ arg2))                        /*[1]*/
                    || ((old->securebits & SECURE_ALL_LOCKS & ~arg2))        /*[2]*/
                    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))        /*[3]*/
                    || (cap_capable(current_cred(),
                                    current_cred()->user_ns,
                                    CAP_SETPCAP,
                                    CAP_OPT_NONE) != 0)                        /*[4]*/
                        /*
                         * [1] no changing of bits that are locked
                         * [2] no unlocking of locks
                         * [3] no setting of unsupported bits
                         * [4] doing anything requires privilege (go read about
                         *     the "sendmail capabilities bug")
                         */
                    )
                        /* cannot change a locked bit */
                        return -EPERM;

                /* All four checks passed: arg2 becomes the new securebits. */
                new = prepare_creds();
                if (!new)
                        return -ENOMEM;
                new->securebits = arg2;
                return commit_creds(new);

        case PR_GET_SECUREBITS:
                return old->securebits;

        case PR_GET_KEEPCAPS:
                return !!issecure(SECURE_KEEP_CAPS);

        case PR_SET_KEEPCAPS:
                if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */
                        return -EINVAL;
                /* The keep-caps bit cannot be changed once it is locked. */
                if (issecure(SECURE_KEEP_CAPS_LOCKED))
                        return -EPERM;

                new = prepare_creds();
                if (!new)
                        return -ENOMEM;
                if (arg2)
                        new->securebits |= issecure_mask(SECURE_KEEP_CAPS);
                else
                        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
                return commit_creds(new);

        case PR_CAP_AMBIENT:
                /* arg2 selects the ambient-set sub-operation. */
                if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
                        /* CLEAR_ALL takes no further arguments. */
                        if (arg3 | arg4 | arg5)
                                return -EINVAL;

                        new = prepare_creds();
                        if (!new)
                                return -ENOMEM;
                        cap_clear(new->cap_ambient);
                        return commit_creds(new);
                }

                /*
                 * Every remaining sub-operation takes a capability in arg3;
                 * arg4 and arg5 must be zero.
                 */
                if (((!cap_valid(arg3)) | arg4 | arg5))
                        return -EINVAL;

                if (arg2 == PR_CAP_AMBIENT_IS_SET) {
                        return !!cap_raised(current_cred()->cap_ambient, arg3);
                } else if (arg2 != PR_CAP_AMBIENT_RAISE &&
                           arg2 != PR_CAP_AMBIENT_LOWER) {
                        return -EINVAL;
                } else {
                        /*
                         * Raising is refused unless the capability is both
                         * permitted and inheritable and
                         * SECURE_NO_CAP_AMBIENT_RAISE is clear.  Lowering
                         * has no such precondition.
                         */
                        if (arg2 == PR_CAP_AMBIENT_RAISE &&
                            (!cap_raised(current_cred()->cap_permitted, arg3) ||
                             !cap_raised(current_cred()->cap_inheritable,
                                         arg3) ||
                             issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
                                return -EPERM;

                        new = prepare_creds();
                        if (!new)
                                return -ENOMEM;
                        if (arg2 == PR_CAP_AMBIENT_RAISE)
                                cap_raise(new->cap_ambient, arg3);
                        else
                                cap_lower(new->cap_ambient, arg3);
                        return commit_creds(new);
                }

        default:
                /* No functionality available - continue with default */
                return -ENOSYS;
        }
}

/**
 * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted
 * @mm: The VM space in which the new mapping is to be made
 * @pages: The size of the mapping
 *
 * Report whether the current task holds CAP_SYS_ADMIN with respect to the
 * initial user namespace.  The capability is queried with CAP_OPT_NOAUDIT,
 * and neither @mm nor @pages influences the answer.
 *
 * Return: 1 if permission is granted, 0 if not.
 */
int cap_vm_enough_memory(struct mm_struct *mm, long pages)
{
        /* cap_capable() returns 0 when the capability is held. */
        return cap_capable(current_cred(), &init_user_ns,
                           CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == 0;
}

/**
 * cap_mmap_addr - check if able to map given addr
 * @addr: address attempting to be mapped
 *
 * A mapping at or above dac_mmap_min_addr is always allowed.  Below that
 * limit the current task needs CAP_SYS_RAWIO with respect to the initial user
 * namespace; when the capability is held, PF_SUPERPRIV is set on the current
 * task.
 *
 * Return: 0 if this mapping should be allowed, otherwise the error returned
 * by the capability check.
 */
int cap_mmap_addr(unsigned long addr)
{
        int err;

        if (addr >= dac_mmap_min_addr)
                return 0;

        err = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO,
                          CAP_OPT_NONE);
        /* set PF_SUPERPRIV if it turns out we allow the low mmap */
        if (!err)
                current->flags |= PF_SUPERPRIV;

        return err;
}

/**
 * cap_mmap_file - capability hook for mapping a file
 * @file: unused
 * @reqprot: unused
 * @prot: unused
 * @flags: unused
 *
 * The capability module places no restriction here; none of the arguments
 * is examined.
 *
 * Return: Always 0.
 */
int cap_mmap_file(struct file *file, unsigned long reqprot,
                  unsigned long prot, unsigned long flags)
{
        return 0;
}

#ifdef CONFIG_SECURITY

/* Identity of this LSM; passed to security_add_hooks() by capability_init(). */
static const struct lsm_id capability_lsmid = {
        .name = "capability",
        .id = LSM_ID_CAPABILITY,
};

/*
 * Table pairing each LSM hook name with the cap_* function that implements
 * it for the capability module.  capability_init() registers the whole table.
 */
static struct security_hook_list capability_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(capable, cap_capable),
        LSM_HOOK_INIT(settime, cap_settime),
        LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
        LSM_HOOK_INIT(capget, cap_capget),
        LSM_HOOK_INIT(capset, cap_capset),
        LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
        LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
        LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
        LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
        LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
        LSM_HOOK_INIT(mmap_file, cap_mmap_file),
        LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
        LSM_HOOK_INIT(task_prctl, cap_task_prctl),
        LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
        LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
        LSM_HOOK_INIT(task_setnice, cap_task_setnice),
        LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
};

/*
 * Init callback named in DEFINE_LSM(capability): registers every entry of
 * capability_hooks under capability_lsmid.  Always returns 0.
 */
static int __init capability_init(void)
{
        security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
                           &capability_lsmid);
        return 0;
}

/*
 * LSM descriptor for the capability module, built only under CONFIG_SECURITY.
 * NOTE(review): LSM_ORDER_FIRST presumably places this module ahead of all
 * other LSMs - confirm against the LSM framework.
 */
DEFINE_LSM(capability) = {
        .name = "capability",
        .order = LSM_ORDER_FIRST,
        .init = capability_init,
};

#endif /* CONFIG_SECURITY */














    2 



    2 



    2 



















































    1 




    2 











































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
// SPDX-License-Identifier: GPL-2.0-only
/*
 * IPv6 library code, needed by static components when full IPv6 support is
 * not configured or static.
 */
#include <linux/export.h>
#include <net/ipv6.h>

/*
 * Tell whether @nexthdr names one of the well-known IPv6 extension headers
 * (hop-by-hop, routing, fragment, authentication, destination options) or the
 * "no next header" marker, as opposed to an upper-layer protocol.
 */

bool ipv6_ext_hdr(u8 nexthdr)
{
        switch (nexthdr) {
        case NEXTHDR_HOP:
        case NEXTHDR_ROUTING:
        case NEXTHDR_FRAGMENT:
        case NEXTHDR_AUTH:
        case NEXTHDR_NONE:
        case NEXTHDR_DEST:
                return true;
        default:
                return false;
        }
}
EXPORT_SYMBOL(ipv6_ext_hdr);

/*
 * Skip any extension headers. This is used by the ICMP module.
 *
 * Note that strictly speaking this conflicts with RFC 2460 4.0:
 * ...The contents and semantics of each extension header determine whether
 * or not to proceed to the next header.  Therefore, extension headers must
 * be processed strictly in the order they appear in the packet; a
 * receiver must not, for example, scan through a packet looking for a
 * particular kind of extension header and process that header prior to
 * processing all preceding ones.
 *
 * We do exactly this. This is a protocol bug. We can't decide after
 * seeing an unknown discard-with-error flavour TLV option if it's an
 * ICMP error message or not (errors should never be sent in reply to
 * ICMP error messages).
 *
 * But I see no other way to do this. This might need to be reexamined
 * when Linux implements ESP (and maybe AUTH) headers.
 * --AK
 *
 * This function parses the (probably truncated) extension header chain that
 * begins at offset "start" in "skb".  "*nexthdrp" initially holds the type
 * of the first header.
 *
 * It skips all well-known exthdrs, and returns the offset of the start
 * of the unparsable area, i.e. the first header with unknown type.
 * On that path *nexthdrp is updated with the type/protocol of this header.
 *
 * NOTES: - if the packet terminates with NEXTHDR_NONE it returns -1.
 *        - it may return an offset pointing beyond the end of the packet,
 *            if the last recognized header is truncated in the middle.
 *        - if the packet is truncated so that a header to be skipped cannot
 *            be read, it returns -1.
 *        - The first fragment header is skipped; not-first ones
 *            are considered as unparsable.
 *        - *frag_offp reports the offset field of the final fragment header
 *            so it is possible to tell whether this is a first fragment,
 *            later fragment, or not fragmented (it is 0 in that case).
 *        - ESP is unparsable for now and considered like a
 *            normal payload protocol.
 *        - Note also special handling of AUTH header. Thanks to IPsec wizards.
 *
 * --ANK (980726)
 */

int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp,
                     __be16 *frag_offp)
{
        u8 nexthdr = *nexthdrp;

        *frag_offp = 0;

        while (ipv6_ext_hdr(nexthdr)) {
                struct ipv6_opt_hdr _hdr, *hp;
                int hdrlen;

                if (nexthdr == NEXTHDR_NONE)
                        return -1;
                hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
                if (!hp)
                        return -1;
                if (nexthdr == NEXTHDR_FRAGMENT) {
                        __be16 _frag_off, *fp;
                        fp = skb_header_pointer(skb,
                                                start+offsetof(struct frag_hdr,
                                                               frag_off),
                                                sizeof(_frag_off),
                                                &_frag_off);
                        if (!fp)
                                return -1;

                        *frag_offp = *fp;
                        /*
                         * Non-zero offset bits: not the first fragment, so
                         * stop here with nexthdr still NEXTHDR_FRAGMENT.
                         */
                        if (ntohs(*frag_offp) & ~0x7)
                                break;
                        /* Length used to step over a fragment header. */
                        hdrlen = 8;
                } else if (nexthdr == NEXTHDR_AUTH)
                        hdrlen = ipv6_authlen(hp);
                else
                        hdrlen = ipv6_optlen(hp);

                nexthdr = hp->nexthdr;
                start += hdrlen;
        }

        *nexthdrp = nexthdr;
        return start;
}
EXPORT_SYMBOL(ipv6_skip_exthdr);

int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type)
{
        const unsigned char *nh = skb_network_header(skb);
        int packet_len = skb_tail_pointer(skb) - skb_network_header(skb);
        struct ipv6_opt_hdr *hdr;
        int len;

        if (offset + 2 > packet_len)
                goto bad;
        hdr = (struct ipv6_opt_hdr *)(nh + offset);
        len = ((hdr->hdrlen + 1) << 3);

        if (offset + len > packet_len)
                goto bad;

        offset += 2;
        len -= 2;

        while (len > 0) {
                int opttype = nh[offset];
                int optlen;

                if (opttype == type)
                        return offset;

                switch (opttype) {
                case IPV6_TLV_PAD1:
                        optlen = 1;
                        break;
                default:
                        if (len < 2)
                                goto bad;
                        optlen = nh[offset + 1] + 2;
                        if (optlen > len)
                                goto bad;
                        break;
                }
                offset += optlen;
                len -= optlen;
        }
        /* not_found */
 bad:
        return -1;
}
EXPORT_SYMBOL_GPL(ipv6_find_tlv);

/*
 * find the offset to specified header or the protocol number of last header
 * if target < 0. "last header" is transport protocol header, ESP, or
 * "No next header".
 *
 * Note that *offset is used as input/output parameter, and if it is not zero,
 * then it must be a valid offset to an inner IPv6 header. This can be used
 * to explore inner IPv6 header, eg. ICMPv6 error messages.
 *
 * If the target header is found, its offset is set in *offset and the
 * protocol number is returned. Otherwise a negative errno is returned:
 * -ENOENT if the chain ends without reaching the target, -EBADMSG if a
 * header cannot be read from the skb or the inner header at *offset is not
 * version 6.
 *
 * If the first fragment doesn't contain the final protocol header or
 * NEXTHDR_NONE it is considered invalid.
 *
 * Note that non-1st fragment is special case that "the protocol number
 * of last header" is "next header" field in Fragment header. In this case,
 * *offset is meaningless and fragment offset is stored in *fragoff if fragoff
 * isn't NULL.
 *
 * if flags is not NULL and it's a fragment, then the frag flag
 * IP6_FH_F_FRAG will be set. If it's an AH header, the
 * IP6_FH_F_AUTH flag is set and target < 0, then this function will
 * stop at the AH header. If IP6_FH_F_SKIP_RH flag was passed, then this
 * function will skip all those routing headers, where segments_left was 0.
 */
int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
                  int target, unsigned short *fragoff, int *flags)
{
        unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr);
        u8 nexthdr = ipv6_hdr(skb)->nexthdr;
        bool found;

        if (fragoff)
                *fragoff = 0;

        /* A non-zero *offset selects an inner IPv6 header to start from. */
        if (*offset) {
                struct ipv6hdr _ip6, *ip6;

                ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6);
                if (!ip6 || (ip6->version != 6))
                        return -EBADMSG;
                start = *offset + sizeof(struct ipv6hdr);
                nexthdr = ip6->nexthdr;
        }

        do {
                struct ipv6_opt_hdr _hdr, *hp;
                unsigned int hdrlen;
                found = (nexthdr == target);

                /* End of the extension header chain. */
                if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) {
                        if (target < 0 || found)
                                break;
                        return -ENOENT;
                }

                hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
                if (!hp)
                        return -EBADMSG;

                if (nexthdr == NEXTHDR_ROUTING) {
                        struct ipv6_rt_hdr _rh, *rh;

                        rh = skb_header_pointer(skb, start, sizeof(_rh),
                                                &_rh);
                        if (!rh)
                                return -EBADMSG;

                        /*
                         * With IP6_FH_F_SKIP_RH, a routing header whose
                         * segments_left is 0 never counts as a match.
                         */
                        if (flags && (*flags & IP6_FH_F_SKIP_RH) &&
                            rh->segments_left == 0)
                                found = false;
                }

                if (nexthdr == NEXTHDR_FRAGMENT) {
                        unsigned short _frag_off;
                        __be16 *fp;

                        if (flags)        /* Indicate that this is a fragment */
                                *flags |= IP6_FH_F_FRAG;
                        fp = skb_header_pointer(skb,
                                                start+offsetof(struct frag_hdr,
                                                               frag_off),
                                                sizeof(_frag_off),
                                                &_frag_off);
                        if (!fp)
                                return -EBADMSG;

                        /* Non-zero once the low three bits are masked off: not the first fragment. */
                        _frag_off = ntohs(*fp) & ~0x7;
                        if (_frag_off) {
                                /*
                                 * Looking for the last header: report the
                                 * fragment header's next-header value
                                 * directly, without updating *offset.
                                 */
                                if (target < 0 &&
                                    ((!ipv6_ext_hdr(hp->nexthdr)) ||
                                     hp->nexthdr == NEXTHDR_NONE)) {
                                        if (fragoff)
                                                *fragoff = _frag_off;
                                        return hp->nexthdr;
                                }
                                if (!found)
                                        return -ENOENT;
                                if (fragoff)
                                        *fragoff = _frag_off;
                                break;
                        }
                        hdrlen = 8;
                } else if (nexthdr == NEXTHDR_AUTH) {
                        if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0))
                                break;
                        hdrlen = ipv6_authlen(hp);
                } else
                        hdrlen = ipv6_optlen(hp);

                /* Advance only when the current header is not the target. */
                if (!found) {
                        nexthdr = hp->nexthdr;
                        start += hdrlen;
                }
        } while (!found);

        *offset = start;
        return nexthdr;
}
EXPORT_SYMBOL(ipv6_find_hdr);







































    1 



















    1 




    1 






















    1 



    1 


























































    1 













    1 




    1 









    1 






























































































    1 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright Red Hat Inc. 2017
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions manipulate sctp stream queue/scheduling.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
 */

#include <linux/list.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <net/sctp/stream_sched.h>

/* First Come First Serve (a.k.a. FIFO)
 * RFC DRAFT ndata Section 3.1
 */
/* FCFS has no per-stream parameter: @value is accepted and ignored. */
static int sctp_sched_fcfs_set(struct sctp_stream *stream, __u16 sid,
                               __u16 value, gfp_t gfp)
{
        return 0;
}

/* FCFS has no per-stream parameter: always reports 0 through @value. */
static int sctp_sched_fcfs_get(struct sctp_stream *stream, __u16 sid,
                               __u16 *value)
{
        *value = 0;
        return 0;
}

/* FCFS sets up no scheduler-wide state; always succeeds. */
static int sctp_sched_fcfs_init(struct sctp_stream *stream)
{
        return 0;
}

/* FCFS sets up no per-stream state; always succeeds. */
static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid,
                                    gfp_t gfp)
{
        return 0;
}

/* Nothing is set up per stream by FCFS, so there is nothing to release. */
static void sctp_sched_fcfs_free_sid(struct sctp_stream *stream, __u16 sid)
{
}

/*
 * Intentionally empty: FCFS keeps no queue of its own.  Its dequeue routine
 * reads chunks straight from q->out_chunk_list.
 */
static void sctp_sched_fcfs_enqueue(struct sctp_outq *q,
                                    struct sctp_datamsg *msg)
{
}

/*
 * Pick the next chunk to send in first-come-first-served order and unlink it
 * from the output queue.
 *
 * When a stream is recorded in out_curr, the next chunk is taken from that
 * stream's own queue; otherwise the head of the association-wide
 * out_chunk_list is used.
 *
 * Returns the dequeued chunk, or NULL when out_chunk_list is empty.
 */
static struct sctp_chunk *sctp_sched_fcfs_dequeue(struct sctp_outq *q)
{
        struct sctp_stream *stream = &q->asoc->stream;
        struct sctp_chunk *next;

        if (list_empty(&q->out_chunk_list))
                return NULL;

        if (stream->out_curr)
                next = list_entry(stream->out_curr->ext->outq.next,
                                  struct sctp_chunk, stream_list);
        else
                next = list_entry(q->out_chunk_list.next,
                                  struct sctp_chunk, list);

        sctp_sched_dequeue_common(q, next);

        return next;
}

/* FCFS has no bookkeeping to update once a chunk has been dequeued. */
static void sctp_sched_fcfs_dequeue_done(struct sctp_outq *q,
                                         struct sctp_chunk *chunk)
{
}

/* No-op: FCFS keeps no per-stream scheduling state to populate. */
static void sctp_sched_fcfs_sched_all(struct sctp_stream *stream)
{
}

/* No-op: FCFS keeps no per-stream scheduling state to tear down. */
static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream)
{
}

/*
 * Operations table of the FCFS scheduler.  Besides being registered under
 * SCTP_SS_FCFS, it is the fallback installed by sctp_sched_set_sched() when
 * switching to another scheduler fails.
 */
static struct sctp_sched_ops sctp_sched_fcfs = {
        .set = sctp_sched_fcfs_set,
        .get = sctp_sched_fcfs_get,
        .init = sctp_sched_fcfs_init,
        .init_sid = sctp_sched_fcfs_init_sid,
        .free_sid = sctp_sched_fcfs_free_sid,
        .enqueue = sctp_sched_fcfs_enqueue,
        .dequeue = sctp_sched_fcfs_dequeue,
        .dequeue_done = sctp_sched_fcfs_dequeue_done,
        .sched_all = sctp_sched_fcfs_sched_all,
        .unsched_all = sctp_sched_fcfs_unsched_all,
};

/* Register the FCFS operations table under SCTP_SS_FCFS. */
static void sctp_sched_ops_fcfs_init(void)
{
        sctp_sched_ops_register(SCTP_SS_FCFS, &sctp_sched_fcfs);
}

/* API to other parts of the stack */

/* Registered scheduler operation tables, indexed by enum sctp_sched_type. */
static struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1];

/*
 * Record @sched_ops as the implementation of scheduler type @sched.
 * @sched is used as an array index without a range check, so it must not
 * exceed SCTP_SS_MAX.
 */
void sctp_sched_ops_register(enum sctp_sched_type sched,
                             struct sctp_sched_ops *sched_ops)
{
        sctp_sched_ops[sched] = sched_ops;
}

/*
 * Run the init routine of each scheduler (fcfs, prio, rr, fc, wfq).  The
 * FCFS one, defined in this file, registers its operations table; the
 * others are defined elsewhere.
 */
void sctp_sched_ops_init(void)
{
        sctp_sched_ops_fcfs_init();
        sctp_sched_ops_prio_init();
        sctp_sched_ops_rr_init();
        sctp_sched_ops_fc_init();
        sctp_sched_ops_wfq_init();
}

/*
 * Tear down the state the currently selected scheduler holds for @stream:
 * unschedule everything, then, for each output stream that has an extension,
 * release its per-stream data and wipe the part of the extension that lies
 * after the outq member.
 */
static void sctp_sched_free_sched(struct sctp_stream *stream)
{
        struct sctp_sched_ops *ops = sctp_sched_ops_from_stream(stream);
        int sid;

        ops->unsched_all(stream);
        for (sid = 0; sid < stream->outcnt; sid++) {
                struct sctp_stream_out_ext *ext = SCTP_SO(stream, sid)->ext;

                if (ext) {
                        ops->free_sid(stream, sid);
                        /* Give the next scheduler a clean slate. */
                        memset_after(ext, 0, outq);
                }
        }
}

/*
 * Switch @asoc to the scheduler registered for type @sched.
 *
 * The previous scheduler's state is torn down, the new scheduler is
 * initialised for every output stream that has an extension, and every
 * message already on the output queue is handed to the new scheduler's
 * enqueue operation.
 *
 * Returns 0 on success (including when @sched is already active), -EINVAL if
 * @sched is above SCTP_SS_MAX, or the error reported by the new scheduler's
 * init_sid operation.  In the last case the association is left on FCFS.
 */
int sctp_sched_set_sched(struct sctp_association *asoc,
                         enum sctp_sched_type sched)
{
        struct sctp_sched_ops *old = asoc->outqueue.sched;
        struct sctp_datamsg *msg = NULL;
        struct sctp_sched_ops *n;
        struct sctp_chunk *ch;
        int i, ret = 0;

        if (sched > SCTP_SS_MAX)
                return -EINVAL;

        n = sctp_sched_ops[sched];
        /* Requested scheduler is already the active one: nothing to do. */
        if (old == n)
                return ret;

        /* Release the outgoing scheduler's state while it is still selected. */
        if (old)
                sctp_sched_free_sched(&asoc->stream);

        asoc->outqueue.sched = n;
        n->init(&asoc->stream);
        for (i = 0; i < asoc->stream.outcnt; i++) {
                /* Streams without an extension are skipped. */
                if (!SCTP_SO(&asoc->stream, i)->ext)
                        continue;

                ret = n->init_sid(&asoc->stream, i, GFP_ATOMIC);
                if (ret)
                        goto err;
        }

        /* We have to requeue all chunks already queued. */
        list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) {
                /* Enqueue once per run of consecutive chunks sharing a message. */
                if (ch->msg == msg)
                        continue;
                msg = ch->msg;
                n->enqueue(&asoc->outqueue, msg);
        }

        return ret;

err:
        /* The new scheduler is selected here, so this frees what it set up. */
        sctp_sched_free_sched(&asoc->stream);
        asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */

        return ret;
}

/*
 * Return the scheduler type currently used by @asoc, i.e. the index at which
 * its operations table is registered.  Returns 0 if the table is not found
 * among the registered ones.
 */
int sctp_sched_get_sched(struct sctp_association *asoc)
{
        int type;

        for (type = 0; type <= SCTP_SS_MAX; type++) {
                if (sctp_sched_ops[type] == asoc->outqueue.sched)
                        return type;
        }

        return 0;
}

/*
 * Set the scheduler parameter of output stream @sid to @value.
 *
 * The stream's extension is created first if it does not exist yet.
 *
 * Returns -EINVAL if @sid is not a valid output stream, the error from
 * sctp_stream_init_ext() if the extension cannot be set up, and otherwise
 * whatever the active scheduler's set operation returns.
 */
int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid,
                         __u16 value, gfp_t gfp)
{
        struct sctp_stream *stream = &asoc->stream;
        int err;

        if (sid >= stream->outcnt)
                return -EINVAL;

        if (!SCTP_SO(stream, sid)->ext) {
                err = sctp_stream_init_ext(stream, sid);
                if (err)
                        return err;
        }

        return asoc->outqueue.sched->set(stream, sid, value, gfp);
}

/*
 * Read the scheduler parameter of output stream @sid into @value.
 *
 * Returns -EINVAL if @sid is not a valid output stream.  If the stream has
 * no extension, 0 is returned and *@value is left untouched.  Otherwise the
 * result of the active scheduler's get operation is returned.
 */
int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid,
                         __u16 *value)
{
        struct sctp_stream *stream = &asoc->stream;

        if (sid >= stream->outcnt)
                return -EINVAL;

        if (SCTP_SO(stream, sid)->ext)
                return asoc->outqueue.sched->get(stream, sid, value);

        return 0;
}

void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch)
{
        if (!list_is_last(&ch->frag_list, &ch->msg->chunks) &&
            !q->asoc->peer.intl_capable) {
                struct sctp_stream_out *sout;
                __u16 sid;

                /* datamsg is not finish, so save it as current one,
                 * in case application switch scheduler or a higher
                 * priority stream comes in.
                 */
                sid = sctp_chunk_stream_no(ch);
                sout = SCTP_SO(&q->asoc->stream, sid);
                q->asoc->stream.out_curr = sout;
                return;
        }

        q->asoc->stream.out_curr = NULL;
        q->sched->dequeue_done(q, ch);
}

/* Auxiliary functions for the schedulers */
/*
 * Unlink @ch from both lists it is linked on (via its list and stream_list
 * members) and subtract its skb length from the queue's byte count.
 */
void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch)
{
        list_del_init(&ch->list);
        list_del_init(&ch->stream_list);
        q->out_qlen -= ch->skb->len;
}

/*
 * Prepare output stream @sid for scheduling: initialise the outq list head
 * in its extension, then let the active scheduler set up its per-stream
 * state.  The extension is dereferenced without a NULL check, so it must
 * already exist.
 *
 * Returns the result of the scheduler's init_sid operation.
 */
int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp)
{
        struct sctp_sched_ops *ops = sctp_sched_ops_from_stream(stream);

        INIT_LIST_HEAD(&SCTP_SO(stream, sid)->ext->outq);

        return ops->init_sid(stream, sid, gfp);
}

/*
 * Return the scheduler operations of the association that embeds @stream.
 * @stream must be the stream member of a struct sctp_association.
 */
struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream)
{
        return container_of(stream, struct sctp_association,
                            stream)->outqueue.sched;
}






















































    1 
    1 



    1 
    1 





























































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/btf_ids.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>

/*
 * bpf_arena is a sparsely populated shared memory region between bpf program and
 * user space process.
 *
 * For example on x86-64 the values could be:
 * user_vm_start 7f7d26200000     // picked by mmap()
 * kern_vm_start ffffc90001e69000 // picked by get_vm_area()
 * For user space all pointers within the arena are normal 8-byte addresses.
 * In this example 7f7d26200000 is the address of the first page (pgoff=0).
 * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr
 * (u32)7f7d26200000 -> 26200000
 * hence
 * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb
 * kernel memory region.
 *
 * BPF JITs generate the following code to access arena:
 *   mov eax, eax  // eax has lower 32-bit of user pointer
 *   mov word ptr [rax + r12 + off], bx
 * where r12 == kern_vm_start and off is s16.
 * Hence allocate 4Gb + GUARD_SZ/2 on each side.
 *
 * Initially kernel vm_area and user vma are not populated.
 * User space can fault-in any address which will insert the page
 * into kernel and user vma.
 * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc
 * which will insert it into kernel vm_area.
 * The later fault-in from user space will populate that page into user vma.
 */

/*
 * number of bytes addressable by LDX/STX insn with 16-bit 'off' field
 * ('*' binds tighter than '<<', so this is 1 << (size of 'off' in bits)).
 */
#define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8)
/*
 * Size of the kernel vm area reserved for one arena: 4Gb of addressable
 * range plus GUARD_SZ of guard space. bpf_arena_get_kern_vm_start() skips
 * the first GUARD_SZ / 2 bytes, which leaves GUARD_SZ / 2 on each side.
 */
#define KERN_VM_SZ (SZ_4G + GUARD_SZ)

/* State of one BPF_MAP_TYPE_ARENA map. */
struct bpf_arena {
        /* generic map header; container_of() on it recovers the arena */
        struct bpf_map map;
        /*
         * User space address range of the arena. Taken from map_extra at
         * creation time when that is non-zero, otherwise set from the vma
         * in arena_map_mmap().
         */
        u64 user_vm_start;
        u64 user_vm_end;
        /* kernel vm area of KERN_VM_SZ bytes obtained from get_vm_area() */
        struct vm_struct *kern_vm;
        /* maple tree indexed by page offset; MT_ENTRY marks allocated pages */
        struct maple_tree mt;
        /* list of struct vma_list entries, one per remembered user vma */
        struct list_head vma_list;
        /* taken by the mmap, vm_close, fault, alloc and free paths */
        struct mutex lock;
};

u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
{
        return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0;
}

/*
 * Return the user space start address recorded for @arena
 * (arena->user_vm_start), or 0 when @arena is NULL.
 */
u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
{
        if (!arena)
                return 0;

        return arena->user_vm_start;
}

/* Arena maps hold no elements: peek is not supported, always -EOPNOTSUPP. */
static long arena_map_peek_elem(struct bpf_map *map, void *value)
{
        return -EOPNOTSUPP;
}

/* Arena maps hold no elements: push is not supported, always -EOPNOTSUPP. */
static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags)
{
        return -EOPNOTSUPP;
}

/* Arena maps hold no elements: pop is not supported, always -EOPNOTSUPP. */
static long arena_map_pop_elem(struct bpf_map *map, void *value)
{
        return -EOPNOTSUPP;
}

/* Arena maps hold no elements: delete is not supported, always -EOPNOTSUPP. */
static long arena_map_delete_elem(struct bpf_map *map, void *value)
{
        return -EOPNOTSUPP;
}

/* Arena maps have no keys to iterate over: always -EOPNOTSUPP. */
static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
        return -EOPNOTSUPP;
}

static long compute_pgoff(struct bpf_arena *arena, long uaddr)
{
        return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
}

/*
 * Create an arena map from @attr.
 *
 * Validation (in this order):
 *   -EINVAL  key_size or value_size non-zero, max_entries zero,
 *            BPF_F_MMAPABLE missing, unknown flag set, or map_extra not
 *            page aligned
 *   -E2BIG   max_entries pages exceed 4Gb
 *   -ERANGE  [map_extra, map_extra + size) crosses a 4Gb boundary
 *   -ENOMEM  kernel vm area or arena allocation failed
 *
 * On success returns the embedded struct bpf_map.
 */
static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
{
        int numa_node = bpf_map_attr_numa_node(attr);
        struct vm_struct *kern_vm;
        struct bpf_arena *arena;
        u64 vm_range;

        /* no keys or values; max_entries is the arena size in pages */
        if (attr->key_size || attr->value_size || attr->max_entries == 0)
                return ERR_PTR(-EINVAL);

        /* BPF_F_MMAPABLE must be set */
        if (!(attr->map_flags & BPF_F_MMAPABLE))
                return ERR_PTR(-EINVAL);

        /* No unsupported flags present */
        if (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV))
                return ERR_PTR(-EINVAL);

        /* If non-zero the map_extra is an expected user VMA start address */
        if (attr->map_extra & ~PAGE_MASK)
                return ERR_PTR(-EINVAL);

        vm_range = (u64)attr->max_entries * PAGE_SIZE;
        if (vm_range > SZ_4G)
                return ERR_PTR(-E2BIG);

        /* user vma must not cross 32-bit boundary */
        if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32))
                return ERR_PTR(-ERANGE);

        kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP);
        if (!kern_vm)
                return ERR_PTR(-ENOMEM);

        arena = bpf_map_area_alloc(sizeof(*arena), numa_node);
        if (!arena) {
                /* undo the vm area reservation made above */
                free_vm_area(kern_vm);
                return ERR_PTR(-ENOMEM);
        }

        arena->kern_vm = kern_vm;
        arena->user_vm_start = attr->map_extra;
        /* the end is only known now if the user supplied the start */
        if (arena->user_vm_start)
                arena->user_vm_end = arena->user_vm_start + vm_range;

        INIT_LIST_HEAD(&arena->vma_list);
        bpf_map_init_from_attr(&arena->map, attr);
        mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE);
        mutex_init(&arena->lock);

        return &arena->map;
}

/*
 * apply_to_existing_page_range() callback used by arena_map_free():
 * free the page behind a populated pte. Always returns 0.
 */
static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
{
        pte_t entry = ptep_get(ptep);

        /*
         * The pte itself is left untouched:
         * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
         * 2. TLB flushing is batched or deferred. Even if we clear pte,
         * the TLB entries can stick around and continue to permit access to
         * the freed page. So it all relies on 1.
         */
        if (pte_present(entry)) /* sanity check */
                __free_page(pte_page(entry));

        return 0;
}

static void arena_map_free(struct bpf_map *map)
{
        struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

        /*
         * Check that user vma-s are not around when bpf map is freed.
         * mmap() holds vm_file which holds bpf_map refcnt.
         * munmap() must have happened on vma followed by arena_vm_close()
         * which would clear arena->vma_list.
         */
        if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
                return;

        /*
         * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
         * It unmaps everything from vmalloc area and clears pgtables.
         * Call apply_to_existing_page_range() first to find populated ptes and
         * free those pages.
         */
        apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
                                     KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
        free_vm_area(arena->kern_vm);
        mtree_destroy(&arena->mt);
        bpf_map_area_free(arena);
}

/*
 * Arena maps have no elements to look up: always returns ERR_PTR(-EINVAL)
 * (note: -EINVAL here, unlike the other element stubs which use -EOPNOTSUPP).
 */
static void *arena_map_lookup_elem(struct bpf_map *map, void *key)
{
        return ERR_PTR(-EINVAL);
}

/* Arena maps hold no elements: update is not supported, always -EOPNOTSUPP. */
static long arena_map_update_elem(struct bpf_map *map, void *key,
                                  void *value, u64 flags)
{
        return -EOPNOTSUPP;
}

/* No BTF constraints are imposed on arena maps: always accept (returns 0). */
static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
                               const struct btf_type *key_type, const struct btf_type *value_type)
{
        return 0;
}

/* Memory usage is not tracked for arena maps: always reports 0. */
static u64 arena_map_mem_usage(const struct bpf_map *map)
{
        return 0;
}

struct vma_list {
        struct vm_area_struct *vma;
        struct list_head head;
};

static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
{
        struct vma_list *vml;

        vml = kmalloc(sizeof(*vml), GFP_KERNEL);
        if (!vml)
                return -ENOMEM;
        vma->vm_private_data = vml;
        vml->vma = vma;
        list_add(&vml->head, &arena->vma_list);
        return 0;
}

static void arena_vm_close(struct vm_area_struct *vma)
{
        struct bpf_map *map = vma->vm_file->private_data;
        struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
        struct vma_list *vml;

        guard(mutex)(&arena->lock);
        vml = vma->vm_private_data;
        list_del(&vml->head);
        vma->vm_private_data = NULL;
        kfree(vml);
}

#define MT_ENTRY ((void *)&arena_map_ops) /* unused. has to be valid pointer */

static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
{
        struct bpf_map *map = vmf->vma->vm_file->private_data;
        struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
        struct page *page;
        long kbase, kaddr;
        int ret;

        kbase = bpf_arena_get_kern_vm_start(arena);
        kaddr = kbase + (u32)(vmf->address);

        guard(mutex)(&arena->lock);
        page = vmalloc_to_page((void *)kaddr);
        if (page)
                /* already have a page vmap-ed */
                goto out;

        if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
                /* User space requested to segfault when page is not allocated by bpf prog */
                return VM_FAULT_SIGSEGV;

        ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL);
        if (ret)
                return VM_FAULT_SIGSEGV;

        /* Account into memcg of the process that created bpf_arena */
        ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
        if (ret) {
                mtree_erase(&arena->mt, vmf->pgoff);
                return VM_FAULT_SIGSEGV;
        }

        ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
        if (ret) {
                mtree_erase(&arena->mt, vmf->pgoff);
                __free_page(page);
                return VM_FAULT_SIGSEGV;
        }
out:
        page_ref_add(page, 1);
        vmf->page = page;
        return 0;
}

static const struct vm_operations_struct arena_vm_ops = {
        .close                = arena_vm_close,
        .fault          = arena_vm_fault,
};

static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr,
                                             unsigned long len, unsigned long pgoff,
                                             unsigned long flags)
{
        struct bpf_map *map = filp->private_data;
        struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
        long ret;

        if (pgoff)
                return -EINVAL;
        if (len > SZ_4G)
                return -E2BIG;

        /* if user_vm_start was specified at arena creation time */
        if (arena->user_vm_start) {
                if (len > arena->user_vm_end - arena->user_vm_start)
                        return -E2BIG;
                if (len != arena->user_vm_end - arena->user_vm_start)
                        return -EINVAL;
                if (addr != arena->user_vm_start)
                        return -EINVAL;
        }

        ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags);
        if (IS_ERR_VALUE(ret))
                return ret;
        if ((ret >> 32) == ((ret + len - 1) >> 32))
                return ret;
        if (WARN_ON_ONCE(arena->user_vm_start))
                /* checks at map creation time should prevent this */
                return -EFAULT;
        return round_up(ret, SZ_4G);
}

/*
 * map_mmap callback: attach user vma @vma to the arena.
 *
 * Every mapping of one arena must cover the same user address range:
 *  - if map_extra was not specified at arena creation time, the 1st user
 *    process can do mmap(NULL, ...) to pick user_vm_start and the 2nd user
 *    process must pass the same addr to mmap(addr, MAP_FIXED..);
 *  - or the addr is specified in map_extra and the same addr is used later
 *    with mmap(addr, MAP_FIXED..).
 *
 * Returns 0 on success, -EBUSY when the vma range differs from the range
 * already recorded, -EFAULT when earlier checks were bypassed, or -ENOMEM.
 */
static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
        struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

        guard(mutex)(&arena->lock);

        /* start address is already fixed and this vma does not match it */
        if (arena->user_vm_start && vma->vm_start != arena->user_vm_start)
                return -EBUSY;

        /* all user processes must have the same size of mmap-ed region */
        if (arena->user_vm_end && vma->vm_end != arena->user_vm_end)
                return -EBUSY;

        /* Earlier checks should prevent this */
        if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > SZ_4G || vma->vm_pgoff))
                return -EFAULT;

        if (remember_vma(arena, vma))
                return -ENOMEM;

        arena->user_vm_start = vma->vm_start;
        arena->user_vm_end = vma->vm_end;

        /*
         * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and
         * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid
         * potential change of user_vm_start.
         */
        vm_flags_set(vma, VM_DONTEXPAND);
        vma->vm_ops = &arena_vm_ops;

        return 0;
}

/*
 * map_direct_value_addr callback: store the arena's user_vm_start in *@imm.
 * Returns -ERANGE when @off is larger than the size of the user range,
 * 0 otherwise. Note that @off is only range-checked, it is not added to
 * the stored address.
 */
static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
{
        struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
        u64 user_sz = arena->user_vm_end - arena->user_vm_start;

        if ((u64)off > user_sz)
                return -ERANGE;

        *imm = (unsigned long)arena->user_vm_start;
        return 0;
}

BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena)

/* Map operations for the arena map type. */
const struct bpf_map_ops arena_map_ops = {
        /* lifetime */
        .map_alloc = arena_map_alloc,
        .map_free = arena_map_free,
        .map_meta_equal = bpf_map_meta_equal,

        /* user space mapping and direct addressing */
        .map_mmap = arena_map_mmap,
        .map_get_unmapped_area = arena_get_unmapped_area,
        .map_direct_value_addr = arena_map_direct_value_addr,

        /* element style access is stubbed out: these callbacks only return errors */
        .map_lookup_elem = arena_map_lookup_elem,
        .map_update_elem = arena_map_update_elem,
        .map_delete_elem = arena_map_delete_elem,
        .map_get_next_key = arena_map_get_next_key,
        .map_push_elem = arena_map_push_elem,
        .map_peek_elem = arena_map_peek_elem,
        .map_pop_elem = arena_map_pop_elem,

        /* introspection */
        .map_check_btf = arena_map_check_btf,
        .map_mem_usage = arena_map_mem_usage,
        .map_btf_id = &bpf_arena_map_btf_ids[0],
};

/* Return @val with its lower 32 bits cleared (upper 32 bits unchanged). */
static u64 clear_lo32(u64 val)
{
        return val & 0xffffffff00000000ULL;
}

/*
 * Allocate pages and vmap them into kernel vmalloc area.
 * Later the pages will be mmaped into user space vma.
 *
 * @arena:    arena to allocate from
 * @uaddr:    0 to let the maple tree pick any free range, otherwise the
 *            page aligned user address at which the range must start
 * @page_cnt: number of pages to allocate
 * @node_id:  node id passed through to bpf_map_alloc_pages()
 *
 * Returns the user space address of the first allocated page (upper 32 bits
 * of user_vm_start combined with the 32-bit address of the range), or 0 on
 * any failure: too many pages, unaligned or out of range @uaddr, range
 * already taken, or an allocation/mapping error.
 */
static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id)
{
        /* user_vm_end/start are fixed before bpf prog runs */
        long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
        u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
        struct page **pages;
        long pgoff = 0;
        u32 uaddr32;
        int ret, i;

        if (page_cnt > page_cnt_max)
                return 0;

        if (uaddr) {
                if (uaddr & ~PAGE_MASK)
                        return 0;
                pgoff = compute_pgoff(arena, uaddr);
                if (pgoff > page_cnt_max - page_cnt)
                        /* requested address will be outside of user VMA */
                        return 0;
        }

        /* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */
        pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
        if (!pages)
                return 0;

        guard(mutex)(&arena->lock);

        /* reserve the page offsets: exact range if @uaddr given, else any free range */
        if (uaddr)
                ret = mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1,
                                         MT_ENTRY, GFP_KERNEL);
        else
                ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY,
                                        page_cnt, 0, page_cnt_max - 1, GFP_KERNEL);
        if (ret)
                goto out_free_pages;

        ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
                                  node_id, page_cnt, pages);
        if (ret)
                goto out;

        uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
        /* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
         * will not overflow 32-bit. Lower 32-bit need to represent
         * contiguous user address range.
         * Map these pages at kern_vm_start base.
         * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
         * lower 32-bit and it's ok.
         */
        ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
                                kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
        if (ret) {
                /* mapping failed: give the freshly allocated pages back */
                for (i = 0; i < page_cnt; i++)
                        __free_page(pages[i]);
                goto out;
        }
        /* only the temporary pointer array is released; the pages stay mapped */
        kvfree(pages);
        return clear_lo32(arena->user_vm_start) + uaddr32;
out:
        /* drop the maple tree reservation made above */
        mtree_erase(&arena->mt, pgoff);
out_free_pages:
        kvfree(pages);
        return 0;
}

/*
 * If page is present in vmalloc area, unmap it from vmalloc area,
 * unmap it from all user space vma-s,
 * and free it.
 */
static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
{
        struct vma_list *vml;

        list_for_each_entry(vml, &arena->vma_list, head)
                zap_page_range_single(vml->vma, uaddr,
                                      PAGE_SIZE * page_cnt, NULL);
}

/*
 * Free up to @page_cnt arena pages starting at user address @uaddr.
 *
 * Only the page aligned lower 32 bits of @uaddr are used. The range is
 * clamped to arena->user_vm_end and nothing is done when it starts at or
 * beyond that end. The whole (clamped) range is cleared in the maple tree;
 * every page that is still mapped in the kernel vm area is removed from the
 * user vma-s, unmapped from the kernel vm area and freed.
 */
static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
{
        u64 full_uaddr, uaddr_end;
        long kaddr, pgoff, i;
        struct page *page;

        /* only aligned lower 32-bit are relevant */
        uaddr = (u32)uaddr;
        uaddr &= PAGE_MASK;
        /* rebuild the full 64-bit user address from the arena's upper 32 bits */
        full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
        uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
        if (full_uaddr >= uaddr_end)
                return;

        /* number of pages left after clamping to the end of the user range */
        page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;

        guard(mutex)(&arena->lock);

        pgoff = compute_pgoff(arena, uaddr);
        /* clear range */
        mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL);

        if (page_cnt > 1)
                /* bulk zap if multiple pages being freed */
                zap_pages(arena, full_uaddr, page_cnt);

        kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
        for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) {
                page = vmalloc_to_page((void *)kaddr);
                if (!page)
                        /* nothing mapped at this kernel address, skip it */
                        continue;
                if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
                        /* Optimization for the common case of page_cnt==1:
                         * If page wasn't mapped into some user vma there
                         * is no need to call zap_pages which is slow. When
                         * page_cnt is big it's faster to do the batched zap.
                         */
                        zap_pages(arena, full_uaddr, 1);
                vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
                __free_page(page);
        }
}

__bpf_kfunc_start_defs();

/*
 * kfunc: allocate @page_cnt pages in the arena map @p__map.
 *
 * @addr__ign is forwarded as the requested user address (NULL lets
 * arena_alloc_pages() choose), @node_id is forwarded unchanged and @flags
 * must be 0.
 *
 * Returns the address produced by arena_alloc_pages(), or NULL when
 * @p__map is not an arena map, @flags is non-zero, @page_cnt is 0 or the
 * allocation fails.
 */
__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
                                        int node_id, u64 flags)
{
        struct bpf_map *map = p__map;

        if (map->map_type != BPF_MAP_TYPE_ARENA)
                return NULL;
        if (flags || !page_cnt)
                return NULL;

        return (void *)arena_alloc_pages(container_of(map, struct bpf_arena, map),
                                         (long)addr__ign, page_cnt, node_id);
}

/*
 * kfunc: free @page_cnt pages of the arena map @p__map starting at
 * @ptr__ign. Silently does nothing when @p__map is not an arena map,
 * @page_cnt is 0 or @ptr__ign is NULL.
 */
__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
{
        struct bpf_map *map = p__map;

        if (map->map_type != BPF_MAP_TYPE_ARENA)
                return;
        if (!page_cnt || !ptr__ign)
                return;

        arena_free_pages(container_of(map, struct bpf_arena, map),
                         (long)ptr__ign, page_cnt);
}
__bpf_kfunc_end_defs();

/*
 * BTF id set of the arena kfuncs. Both kfuncs are flagged
 * KF_TRUSTED_ARGS | KF_SLEEPABLE.
 */
BTF_KFUNCS_START(arena_kfuncs)
BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
BTF_KFUNCS_END(arena_kfuncs)

/* kfunc id set handed to register_btf_kfunc_id_set() by kfunc_init() */
static const struct btf_kfunc_id_set common_kfunc_set = {
        .owner = THIS_MODULE,
        .set   = &arena_kfuncs,
};

/*
 * Register the arena kfunc set under BPF_PROG_TYPE_UNSPEC at late_initcall
 * time. Returns the result of register_btf_kfunc_id_set().
 */
static int __init kfunc_init(void)
{
        int err;

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
        return err;
}
late_initcall(kfunc_init);








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

    1 
























































































































    1 





















































    1 




































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Interface handling
 *
 * Copyright 2002-2005, Instant802 Networks, Inc.
 * Copyright 2005-2006, Devicescape Software, Inc.
 * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
 * Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright (c) 2016        Intel Deutschland GmbH
 * Copyright (C) 2018-2024 Intel Corporation
 */
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/if_arp.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/kcov.h>
#include <net/mac80211.h>
#include <net/ieee80211_radiotap.h>
#include "ieee80211_i.h"
#include "sta_info.h"
#include "debugfs_netdev.h"
#include "mesh.h"
#include "led.h"
#include "driver-ops.h"
#include "wme.h"
#include "rate.h"

/**
 * DOC: Interface list locking
 *
 * The interface list in each struct ieee80211_local is protected
 * three-fold:
 *
 * (1) modifications may only be done under the RTNL *and* wiphy mutex
 *     *and* iflist_mtx
 * (2) modifications are done in an RCU manner so atomic readers
 *     can traverse the list in RCU-safe blocks.
 *
 * As a consequence, reads (traversals) of the list can be protected
 * by either the RTNL, the wiphy mutex, the iflist_mtx or RCU.
 */

static void ieee80211_iface_work(struct wiphy *wiphy, struct wiphy_work *work);

/*
 * Recompute the transmit power stored in the interface's bss_conf.
 *
 * The starting point is the maximum power of the chandef of the assigned
 * channel context; it is then clamped by the default link's
 * user_power_level and ap_power_level whenever those are set. When the
 * result differs from the stored bss_conf.txpower, the new value is
 * stored and ieee80211_hw_config() is invoked.
 *
 * Returns true if the stored value changed, false if it did not or if
 * no channel context is assigned.
 */
bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_chanctx_conf *conf;
        int txpower;

        rcu_read_lock();
        conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
        if (!conf) {
                rcu_read_unlock();
                return false;
        }

        /* the chanctx pointer is only needed for the chandef limit */
        txpower = ieee80211_chandef_max_power(&conf->def);
        rcu_read_unlock();

        if (sdata->deflink.user_power_level != IEEE80211_UNSET_POWER_LEVEL)
                txpower = min(txpower, sdata->deflink.user_power_level);

        if (sdata->deflink.ap_power_level != IEEE80211_UNSET_POWER_LEVEL)
                txpower = min(txpower, sdata->deflink.ap_power_level);

        /* nothing to do when the effective power is unchanged */
        if (txpower == sdata->vif.bss_conf.txpower)
                return false;

        sdata->vif.bss_conf.txpower = txpower;
        ieee80211_hw_config(sdata->local, 0);

        return true;
}

/*
 * Recalculate the TX power and send BSS_CHANGED_TXPOWER for the default
 * link when the power changed, or when @update_bss is set and the
 * interface is running.
 */
void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata,
                              bool update_bss)
{
        bool notify = __ieee80211_recalc_txpower(sdata);

        /* a forced update only applies to a running interface */
        if (!notify)
                notify = update_bss && ieee80211_sdata_running(sdata);

        if (!notify)
                return;

        ieee80211_link_info_change_notify(sdata, &sdata->deflink,
                                          BSS_CHANGED_TXPOWER);
}

/*
 * Clear IEEE80211_CONF_IDLE in the hw configuration flags.
 *
 * Returns IEEE80211_CONF_CHANGE_IDLE if the flag was set before,
 * 0 if it was already clear.
 */
static u32 __ieee80211_idle_off(struct ieee80211_local *local)
{
        u32 changed = 0;

        if (local->hw.conf.flags & IEEE80211_CONF_IDLE) {
                local->hw.conf.flags &= ~IEEE80211_CONF_IDLE;
                changed = IEEE80211_CONF_CHANGE_IDLE;
        }

        return changed;
}

/*
 * Set IEEE80211_CONF_IDLE in the hw configuration flags, flushing the
 * queues first.
 *
 * Returns IEEE80211_CONF_CHANGE_IDLE if the flag was newly set,
 * 0 if it was already set (in which case nothing is flushed).
 */
static u32 __ieee80211_idle_on(struct ieee80211_local *local)
{
        u32 changed = 0;

        if (!(local->hw.conf.flags & IEEE80211_CONF_IDLE)) {
                ieee80211_flush_queues(local, NULL, false);

                local->hw.conf.flags |= IEEE80211_CONF_IDLE;
                changed = IEEE80211_CONF_CHANGE_IDLE;
        }

        return changed;
}

/*
 * Re-evaluate whether the device should be idle and update the
 * throughput LED trigger flags accordingly.
 *
 * The device counts as in use when @force_active is set, a channel
 * context exists or a monitor interface is counted. It counts as busy
 * when remain-on-channel items are queued and the driver has no
 * remain_on_channel op, or when a software or on-channel scan is
 * running.
 *
 * Returns the IEEE80211_CONF_CHANGE_* flags the caller has to pass to
 * the hw config (0 if the idle state did not change). Caller must hold
 * the wiphy mutex.
 */
static u32 __ieee80211_recalc_idle(struct ieee80211_local *local,
                                   bool force_active)
{
        unsigned int trig_on = 0, trig_off = 0;
        bool in_use, roc_pending, sw_scan, busy;

        lockdep_assert_wiphy(local->hw.wiphy);

        in_use = force_active ||
                 !list_empty(&local->chanctx_list) ||
                 local->monitors;

        roc_pending = !local->ops->remain_on_channel &&
                      !list_empty(&local->roc_list);

        sw_scan = test_bit(SCAN_SW_SCANNING, &local->scanning) ||
                  test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning);

        busy = roc_pending || sw_scan;

        if (busy)
                trig_on |= IEEE80211_TPT_LEDTRIG_FL_WORK;
        else
                trig_off |= IEEE80211_TPT_LEDTRIG_FL_WORK;

        if (in_use)
                trig_on |= IEEE80211_TPT_LEDTRIG_FL_CONNECTED;
        else
                trig_off |= IEEE80211_TPT_LEDTRIG_FL_CONNECTED;

        ieee80211_mod_tpt_led_trig(local, trig_on, trig_off);

        return (busy || in_use) ? __ieee80211_idle_off(local)
                                : __ieee80211_idle_on(local);
}

/*
 * Recalculate the idle state with force_active set, so the device is
 * treated as in use regardless of channel contexts or monitors.
 *
 * Returns the change flags from __ieee80211_recalc_idle(); unlike
 * ieee80211_recalc_idle() this does not call ieee80211_hw_config(),
 * so applying the returned flags is left to the caller.
 */
u32 ieee80211_idle_off(struct ieee80211_local *local)
{
        return __ieee80211_recalc_idle(local, true);
}

/*
 * Recalculate the idle state and push the change to the hardware
 * configuration if the state actually changed.
 */
void ieee80211_recalc_idle(struct ieee80211_local *local)
{
        u32 changed = __ieee80211_recalc_idle(local, false);

        if (!changed)
                return;

        ieee80211_hw_config(local, changed);
}

/*
 * Validate a new MAC address against the wiphy's address mask.
 *
 * If the wiphy has no address mask (all zero) or @check_dup is false,
 * every address is accepted. Otherwise the bits of @addr that are not
 * covered by the mask must match the same bits of the address of every
 * other interface on this device; monitor interfaces without
 * MONITOR_FLAG_ACTIVE are not considered.
 *
 * Returns 0 if the address is acceptable, -EINVAL otherwise.
 * Caller must hold the wiphy mutex.
 */
static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr,
                                bool check_dup)
{
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_sub_if_data *other;
        u64 fixed_bits, wanted, cur;
        u8 *m;

        lockdep_assert_wiphy(local->hw.wiphy);

        if (is_zero_ether_addr(local->hw.wiphy->addr_mask))
                return 0;

        if (!check_dup)
                return 0;

        /* pack the 6 address bytes into the low 48 bits, m[0] first */
        m = local->hw.wiphy->addr_mask;
        fixed_bits = ~(((u64)m[0] << 40) | ((u64)m[1] << 32) |
                       ((u64)m[2] << 24) | ((u64)m[3] << 16) |
                       ((u64)m[4] << 8)  | ((u64)m[5] << 0));

        m = addr;
        wanted = ((u64)m[0] << 40) | ((u64)m[1] << 32) |
                 ((u64)m[2] << 24) | ((u64)m[3] << 16) |
                 ((u64)m[4] << 8)  | ((u64)m[5] << 0);
        wanted &= fixed_bits;

        list_for_each_entry(other, &local->interfaces, list) {
                if (other == sdata)
                        continue;

                if (other->vif.type == NL80211_IFTYPE_MONITOR &&
                    !(other->u.mntr.flags & MONITOR_FLAG_ACTIVE))
                        continue;

                m = other->vif.addr;
                cur = ((u64)m[0] << 40) | ((u64)m[1] << 32) |
                      ((u64)m[2] << 24) | ((u64)m[3] << 16) |
                      ((u64)m[4] << 8)  | ((u64)m[5] << 0);

                /* bits outside the mask must be shared by all interfaces */
                if ((cur & fixed_bits) != wanted)
                        return -EINVAL;
        }

        return 0;
}

/*
 * Check whether the MAC address of a running interface may be changed.
 *
 * Returns 0 if the change is allowed, -EBUSY if the carrier is up, a
 * remain-on-channel item of this interface has started or this
 * interface is the one scanning, and -EOPNOTSUPP for interface types
 * other than station and P2P client. Note that an unsupported type
 * reports -EOPNOTSUPP even if the interface is also the scanning one.
 *
 * Caller must hold the wiphy mutex.
 */
static int ieee80211_can_powered_addr_change(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_sub_if_data *scan_sdata;
        struct ieee80211_roc_work *roc;
        int ret = 0;

        lockdep_assert_wiphy(local->hw.wiphy);

        /* To be the most flexible here we want to only limit changing the
         * address if the specific interface is doing offchannel work or
         * scanning.
         */
        if (netif_carrier_ok(sdata->dev))
                return -EBUSY;

        /* A started ROC item on this iface blocks the change outright */
        list_for_each_entry(roc, &local->roc_list, list) {
                if (roc->sdata == sdata && roc->started)
                        return -EBUSY;
        }

        /* So does a scan, if this iface is the one scanning */
        if (local->scanning) {
                scan_sdata = rcu_dereference_protected(local->scan_sdata,
                                                       lockdep_is_held(&local->hw.wiphy->mtx));
                if (scan_sdata == sdata)
                        ret = -EBUSY;
        }

        /* More interface types could be allowed here but changing the
         * address while powered makes the most sense in client modes.
         */
        if (sdata->vif.type != NL80211_IFTYPE_STATION &&
            sdata->vif.type != NL80211_IFTYPE_P2P_CLIENT)
                ret = -EOPNOTSUPP;

        return ret;
}

/*
 * Change the MAC address of an interface; @addr points to a
 * struct sockaddr carrying the new address.
 *
 * For a running interface the change is only attempted if
 * ieee80211_can_powered_addr_change() permits it, and the interface is
 * removed from the driver before and added back after the change. The
 * new address is checked with ieee80211_verify_mac(); duplicate
 * checking is skipped for monitor interfaces without
 * MONITOR_FLAG_ACTIVE.
 *
 * Returns 0 on success or a negative error code.
 */
static int _ieee80211_change_mac(struct ieee80211_sub_if_data *sdata,
                                 void *addr)
{
        struct ieee80211_local *local = sdata->local;
        struct sockaddr *sa = addr;
        bool live = ieee80211_sdata_running(sdata);
        bool check_dup;
        int ret;

        if (live) {
                ret = ieee80211_can_powered_addr_change(sdata);
                if (ret)
                        return ret;
        }

        check_dup = sdata->vif.type != NL80211_IFTYPE_MONITOR ||
                    (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE);

        ret = ieee80211_verify_mac(sdata, sa->sa_data, check_dup);
        if (ret)
                return ret;

        if (live)
                drv_remove_interface(local, sdata);

        ret = eth_mac_addr(sdata->dev, sa);
        if (!ret) {
                /* mirror the accepted address into the vif and bss_conf */
                memcpy(sdata->vif.addr, sa->sa_data, ETH_ALEN);
                ether_addr_copy(sdata->vif.bss_conf.addr, sdata->vif.addr);
        }

        /* Regardless of eth_mac_addr() return we still want to add the
         * interface back. This should not fail...
         */
        if (live)
                WARN_ON(drv_add_interface(local, sdata));

        return ret;
}

/*
 * ndo_set_mac_address handler: change the interface address under the
 * wiphy lock. Returns 0 without doing anything if the wireless device
 * is no longer registered.
 */
static int ieee80211_change_mac(struct net_device *dev, void *addr)
{
        struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
        struct ieee80211_local *local = sdata->local;
        int ret = 0;

        /*
         * The unregistered case happens during unregistration if there's
         * a bond device active (maybe other cases?) and we must get
         * removed from it. But we really don't care anymore if it's not
         * registered now.
         */
        if (dev->ieee80211_ptr->registered) {
                wiphy_lock(local->hw.wiphy);
                ret = _ieee80211_change_mac(sdata, addr);
                wiphy_unlock(local->hw.wiphy);
        }

        return ret;
}

/*
 * Tell whether an interface of @type1 may use the same MAC address as
 * an existing interface of @type2.
 *
 * Sharing is allowed when either side is a monitor or P2P device
 * interface, for an AP on top of an AP_VLAN, and for an AP_VLAN on top
 * of an AP or another AP_VLAN. Returns non-zero if allowed, 0 if not.
 */
static inline int identical_mac_addr_allowed(int type1, int type2)
{
        if (type1 == NL80211_IFTYPE_MONITOR ||
            type2 == NL80211_IFTYPE_MONITOR)
                return 1;

        if (type1 == NL80211_IFTYPE_P2P_DEVICE ||
            type2 == NL80211_IFTYPE_P2P_DEVICE)
                return 1;

        switch (type1) {
        case NL80211_IFTYPE_AP:
                return type2 == NL80211_IFTYPE_AP_VLAN;
        case NL80211_IFTYPE_AP_VLAN:
                return type2 == NL80211_IFTYPE_AP ||
                       type2 == NL80211_IFTYPE_AP_VLAN;
        default:
                return 0;
        }
}

/*
 * Check whether @sdata may operate as @iftype next to the interfaces
 * that are already running on the same device.
 *
 * Walks all other running interfaces and rejects conflicting
 * combinations; for an AP_VLAN it also binds sdata->bss to the AP
 * interface sharing its MAC address. Finally defers to
 * ieee80211_check_combinations().
 *
 * Returns 0 if allowed or a negative error code. Caller must hold the
 * RTNL and the wiphy mutex.
 */
static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata,
                                            enum nl80211_iftype iftype)
{
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_sub_if_data *nsdata;

        ASSERT_RTNL();
        lockdep_assert_wiphy(local->hw.wiphy);

        /* we hold the RTNL here so can safely walk the list */
        list_for_each_entry(nsdata, &local->interfaces, list) {
                /* only other, running interfaces are of interest */
                if (nsdata == sdata || !ieee80211_sdata_running(nsdata))
                        continue;

                /*
                 * Only OCB and monitor mode may coexist
                 */
                if ((sdata->vif.type == NL80211_IFTYPE_OCB &&
                     nsdata->vif.type != NL80211_IFTYPE_MONITOR) ||
                    (sdata->vif.type != NL80211_IFTYPE_MONITOR &&
                     nsdata->vif.type == NL80211_IFTYPE_OCB))
                        return -EBUSY;

                /*
                 * Allow only a single IBSS interface to be up at any
                 * time. This is restricted because beacon distribution
                 * cannot work properly if both are in the same IBSS.
                 *
                 * To remove this restriction we'd have to disallow them
                 * from setting the same SSID on different IBSS interfaces
                 * belonging to the same hardware. Then, however, we're
                 * faced with having to adopt two different TSF timers...
                 */
                if (iftype == NL80211_IFTYPE_ADHOC &&
                    nsdata->vif.type == NL80211_IFTYPE_ADHOC)
                        return -EBUSY;

                /*
                 * will not add another interface while any channel
                 * switch is active.
                 */
                if (nsdata->vif.bss_conf.csa_active)
                        return -EBUSY;

                /*
                 * The remaining checks are only performed for interfaces
                 * with the same MAC address.
                 */
                if (!ether_addr_equal(sdata->vif.addr, nsdata->vif.addr))
                        continue;

                /*
                 * check whether it may have the same address
                 */
                if (!identical_mac_addr_allowed(iftype, nsdata->vif.type))
                        return -ENOTUNIQ;

                /* what is left only concerns an AP_VLAN on top of an AP */
                if (iftype != NL80211_IFTYPE_AP_VLAN ||
                    nsdata->vif.type != NL80211_IFTYPE_AP)
                        continue;

                /* No support for VLAN with MLO yet */
                if (sdata->wdev.use_4addr && nsdata->vif.valid_links)
                        return -EOPNOTSUPP;

                /*
                 * can only add VLANs to enabled APs
                 */
                sdata->bss = &nsdata->u.ap;
        }

        return ieee80211_check_combinations(sdata, NULL, 0, 0);
}

static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata,
                                  enum nl80211_iftype iftype)
{
        int n_queues = sdata->local->hw.queues;
        int i;

        if (iftype == NL80211_IFTYPE_NAN)
                return 0;

        if (iftype != NL80211_IFTYPE_P2P_DEVICE) {
                for (i = 0; i < IEEE80211_NUM_ACS; i++) {
                        if (WARN_ON_ONCE(sdata->vif.hw_queue[i] ==
                                         IEEE80211_INVAL_HW_QUEUE))
                                return -EINVAL;
                        if (WARN_ON_ONCE(sdata->vif.hw_queue[i] >=
                                         n_queues))
                                return -EINVAL;
                }
        }

        if ((iftype != NL80211_IFTYPE_AP &&
             iftype != NL80211_IFTYPE_P2P_GO &&
             iftype != NL80211_IFTYPE_MESH_POINT) ||
            !ieee80211_hw_check(&sdata->local->hw, QUEUE_CONTROL)) {
                sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE;
                return 0;
        }

        if (WARN_ON_ONCE(sdata->vif.cab_queue == IEEE80211_INVAL_HW_QUEUE))
                return -EINVAL;

        if (WARN_ON_ONCE(sdata->vif.cab_queue >= n_queues))
                return -EINVAL;

        return 0;
}

/*
 * ndo_open handler: bring the interface up.
 *
 * Returns -EADDRNOTAVAIL for an invalid device address; otherwise the
 * result of the concurrency check or, if that passes, of
 * ieee80211_do_open(), both performed under the wiphy lock.
 */
static int ieee80211_open(struct net_device *dev)
{
        struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
        int ret;

        /* fail early if user set an invalid address */
        if (!is_valid_ether_addr(dev->dev_addr))
                return -EADDRNOTAVAIL;

        wiphy_lock(sdata->local->hw.wiphy);

        ret = ieee80211_check_concurrent_iface(sdata, sdata->vif.type);
        if (!ret)
                ret = ieee80211_do_open(&sdata->wdev, true);

        wiphy_unlock(sdata->local->hw.wiphy);

        return ret;
}

/*
 * Take an interface down.
 *
 * Clears SDATA_STATE_RUNNING, cancels a scan owned by this interface,
 * purges its remain-on-channel items, runs the per-type stop handlers,
 * flushes its stations, cancels pending timers/work, frees keys and
 * drops frames still queued for this vif. Unless the device is
 * suspended, the driver and hardware state is then updated, and the
 * device is stopped once the open count reaches zero.
 *
 * @going_down: when true, local->open_count is decremented and the
 *        interface is removed from the driver (where the type requires
 *        it); when false both of these steps are skipped.
 *
 * Caller must hold the wiphy mutex.
 */
static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down)
{
        struct ieee80211_local *local = sdata->local;
        unsigned long flags;
        struct sk_buff *skb, *tmp;
        u32 hw_reconf_flags = 0;
        int i, flushed;
        struct ps_data *ps;
        struct cfg80211_chan_def chandef;
        bool cancel_scan;
        struct cfg80211_nan_func *func;

        lockdep_assert_wiphy(local->hw.wiphy);

        clear_bit(SDATA_STATE_RUNNING, &sdata->state);
        synchronize_rcu(); /* flush _ieee80211_wake_txqs() */

        /* only cancel the scan if this interface is the one scanning */
        cancel_scan = rcu_access_pointer(local->scan_sdata) == sdata;
        if (cancel_scan)
                ieee80211_scan_cancel(local);

        ieee80211_roc_purge(local, sdata);

        switch (sdata->vif.type) {
        case NL80211_IFTYPE_STATION:
                ieee80211_mgd_stop(sdata);
                break;
        case NL80211_IFTYPE_ADHOC:
                ieee80211_ibss_stop(sdata);
                break;
        case NL80211_IFTYPE_MONITOR:
                if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES)
                        break;
                list_del_rcu(&sdata->u.mntr.list);
                break;
        default:
                break;
        }

        /*
         * Remove all stations associated with this interface.
         *
         * This must be done before calling ops->remove_interface()
         * because otherwise we can later invoke ops->sta_notify()
         * whenever the STAs are removed, and that invalidates driver
         * assumptions about always getting a vif pointer that is valid
         * (because if we remove a STA after ops->remove_interface()
         * the driver will have removed the vif info already!)
         *
         * For AP_VLANs stations may exist since there's nothing else that
         * would have removed them, but in other modes there shouldn't
         * be any stations.
         */
        flushed = sta_info_flush(sdata, -1);
        WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP_VLAN && flushed > 0);

        /* don't count this interface for allmulti while it is down */
        if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
                atomic_dec(&local->iff_allmultis);

        /* drop the filter counters held for AP and IBSS interfaces */
        if (sdata->vif.type == NL80211_IFTYPE_AP) {
                local->fif_pspoll--;
                local->fif_probe_req--;
        } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
                local->fif_probe_req--;
        }

        /* remove this netdev's multicast addresses from the shared list */
        if (sdata->dev) {
                netif_addr_lock_bh(sdata->dev);
                spin_lock_bh(&local->filter_lock);
                __hw_addr_unsync(&local->mc_list, &sdata->dev->mc,
                                 sdata->dev->addr_len);
                spin_unlock_bh(&local->filter_lock);
                netif_addr_unlock_bh(sdata->dev);
        }

        del_timer_sync(&local->dynamic_ps_timer);
        wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work);

        WARN(ieee80211_vif_is_mld(&sdata->vif),
             "destroying interface with valid links 0x%04x\n",
             sdata->vif.valid_links);

        /* abort any channel switch state and release CSA-blocked queues */
        sdata->vif.bss_conf.csa_active = false;
        if (sdata->vif.type == NL80211_IFTYPE_STATION)
                sdata->deflink.u.mgd.csa.waiting_bcn = false;
        if (sdata->csa_blocked_queues) {
                ieee80211_wake_vif_queues(local, sdata,
                                          IEEE80211_QUEUE_STOP_REASON_CSA);
                sdata->csa_blocked_queues = false;
        }

        wiphy_work_cancel(local->hw.wiphy, &sdata->deflink.csa.finalize_work);
        wiphy_work_cancel(local->hw.wiphy,
                          &sdata->deflink.color_change_finalize_work);
        wiphy_delayed_work_cancel(local->hw.wiphy,
                                  &sdata->dfs_cac_timer_work);

        /* a CAC in progress is reported as aborted */
        if (sdata->wdev.cac_started) {
                /* copy the chandef before the channel is released */
                chandef = sdata->vif.bss_conf.chanreq.oper;
                WARN_ON(local->suspended);
                ieee80211_link_release_channel(&sdata->deflink);
                cfg80211_cac_event(sdata->dev, &chandef,
                                   NL80211_RADAR_CAC_ABORTED,
                                   GFP_KERNEL);
        }

        if (sdata->vif.type == NL80211_IFTYPE_AP) {
                WARN_ON(!list_empty(&sdata->u.ap.vlans));
        } else if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
                /* remove all packets in parent bc_buf pointing to this dev */
                ps = &sdata->bss->ps;

                spin_lock_irqsave(&ps->bc_buf.lock, flags);
                skb_queue_walk_safe(&ps->bc_buf, skb, tmp) {
                        if (skb->dev == sdata->dev) {
                                __skb_unlink(skb, &ps->bc_buf);
                                local->total_ps_buffered--;
                                ieee80211_free_txskb(&local->hw, skb);
                        }
                }
                spin_unlock_irqrestore(&ps->bc_buf.lock, flags);
        }

        /* the open count is only dropped when really going down */
        if (going_down)
                local->open_count--;

        switch (sdata->vif.type) {
        case NL80211_IFTYPE_AP_VLAN:
                list_del(&sdata->u.vlan.list);
                RCU_INIT_POINTER(sdata->vif.bss_conf.chanctx_conf, NULL);
                /* see comment in the default case below */
                ieee80211_free_keys(sdata, true);
                /* no need to tell driver */
                break;
        case NL80211_IFTYPE_MONITOR:
                if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) {
                        local->cooked_mntrs--;
                        break;
                }

                local->monitors--;
                if (local->monitors == 0) {
                        local->hw.conf.flags &= ~IEEE80211_CONF_MONITOR;
                        hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR;
                }

                ieee80211_adjust_monitor_flags(sdata, -1);
                break;
        case NL80211_IFTYPE_NAN:
                /* clean all the functions */
                spin_lock_bh(&sdata->u.nan.func_lock);

                idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, i) {
                        idr_remove(&sdata->u.nan.function_inst_ids, i);
                        cfg80211_free_nan_func(func);
                }
                idr_destroy(&sdata->u.nan.function_inst_ids);

                spin_unlock_bh(&sdata->u.nan.func_lock);
                break;
        case NL80211_IFTYPE_P2P_DEVICE:
                /* relies on synchronize_rcu() below */
                RCU_INIT_POINTER(local->p2p_sdata, NULL);
                fallthrough;
        default:
                wiphy_work_cancel(sdata->local->hw.wiphy, &sdata->work);
                /*
                 * When we get here, the interface is marked down.
                 * Free the remaining keys, if there are any
                 * (which can happen in AP mode if userspace sets
                 * keys before the interface is operating)
                 *
                 * Force the key freeing to always synchronize_net()
                 * to wait for the RX path in case it is using this
                 * interface enqueuing frames at this very time on
                 * another CPU.
                 */
                ieee80211_free_keys(sdata, true);
                skb_queue_purge(&sdata->skb_queue);
                skb_queue_purge(&sdata->status_queue);
        }

        /* drop frames of this vif still sitting on the pending queues */
        spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
        for (i = 0; i < IEEE80211_MAX_QUEUES; i++) {
                skb_queue_walk_safe(&local->pending[i], skb, tmp) {
                        struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
                        if (info->control.vif == &sdata->vif) {
                                __skb_unlink(skb, &local->pending[i]);
                                ieee80211_free_txskb(&local->hw, skb);
                        }
                }
        }
        spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);

        if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
                ieee80211_txq_remove_vlan(local, sdata);

        sdata->bss = NULL;

        if (local->open_count == 0)
                ieee80211_clear_tx_pending(local);

        sdata->vif.bss_conf.beacon_int = 0;

        /*
         * If the interface goes down while suspended, presumably because
         * the device was unplugged and that happens before our resume,
         * then the driver is already unconfigured and the remainder of
         * this function isn't needed.
         * XXX: what about WoWLAN? If the device has software state, e.g.
         *        memory allocated, it might expect teardown commands from
         *        mac80211 here?
         */
        if (local->suspended) {
                WARN_ON(local->wowlan);
                WARN_ON(rcu_access_pointer(local->monitor_sdata));
                return;
        }

        /*
         * Tell the driver: AP_VLANs are never removed here, monitor
         * interfaces only when MONITOR_FLAG_ACTIVE is set, everything
         * else whenever going_down is set.
         */
        switch (sdata->vif.type) {
        case NL80211_IFTYPE_AP_VLAN:
                break;
        case NL80211_IFTYPE_MONITOR:
                if (local->monitors == 0)
                        ieee80211_del_virtual_monitor(local);

                ieee80211_recalc_idle(local);
                ieee80211_recalc_offload(local);

                if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE))
                        break;

                fallthrough;
        default:
                if (going_down)
                        drv_remove_interface(local, sdata);
        }

        ieee80211_recalc_ps(local);

        if (cancel_scan)
                wiphy_delayed_work_flush(local->hw.wiphy, &local->scan_work);

        if (local->open_count == 0) {
                ieee80211_stop_device(local);

                /* no reconfiguring after stop! */
                return;
        }

        /* do after stop to avoid reconfiguring when we stop anyway */
        ieee80211_configure_filter(local);
        ieee80211_hw_config(local, hw_reconf_flags);

        /* only monitor interfaces are left open */
        if (local->monitors == local->open_count)
                ieee80211_add_virtual_monitor(local);
}

/*
 * Close the other running interfaces of the MBSSID set @sdata belongs to.
 *
 * Does nothing if @sdata has no mbssid_tx_vif. Otherwise the link to
 * the transmitting vif is cleared on @sdata, every other running
 * interface referencing the same transmitting vif is closed, and last
 * the transmitting interface itself is closed if it is running and is
 * not @sdata.
 */
static void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_vif *tx_vif = sdata->vif.mbssid_tx_vif;
        struct ieee80211_sub_if_data *tx_sdata, *member, *next;

        if (!tx_vif)
                return;

        tx_sdata = vif_to_sdata(tx_vif);
        sdata->vif.mbssid_tx_vif = NULL;

        /* first the non-transmitting members, skipping ourselves */
        list_for_each_entry_safe(member, next,
                                 &tx_sdata->local->interfaces, list) {
                if (member == sdata || member == tx_sdata)
                        continue;

                if (member->vif.mbssid_tx_vif != tx_vif)
                        continue;

                if (!ieee80211_sdata_running(member))
                        continue;

                member->vif.mbssid_tx_vif = NULL;
                dev_close(member->wdev.netdev);
        }

        /* then the transmitting interface, unless that is us */
        if (sdata == tx_sdata || !ieee80211_sdata_running(tx_sdata))
                return;

        tx_sdata->vif.mbssid_tx_vif = NULL;
        dev_close(tx_sdata->wdev.netdev);
}

static int ieee80211_stop(struct net_device *dev)
{
        struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);

        /* close dependent VLAN and MBSSID interfaces before locking wiphy */
        if (sdata->vif.type == NL80211_IFTYPE_AP) {
                struct ieee80211_sub_if_data *vlan, *tmpsdata;

                list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans,
                                         u.vlan.list)
                        dev_close(vlan->dev);

                ieee80211_stop_mbssid(sdata);
        }

        wiphy_lock(sdata->local->hw.wiphy);
        wiphy_work_cancel(sdata->local->hw.wiphy, &sdata->activate_links_work);

        ieee80211_do_stop(sdata, true);
        wiphy_unlock(sdata->local->hw.wiphy);

        return 0;
}

static void ieee80211_set_multicast_list(struct net_device *dev)
{
        struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
        struct ieee80211_local *local = sdata->local;
        int allmulti, sdata_allmulti;

        allmulti = !!(dev->flags & IFF_ALLMULTI);
        sdata_allmulti = !!(sdata->flags & IEEE80211_SDATA_ALLMULTI);

        if (allmulti != sdata_allmulti) {
                if (dev->flags & IFF_ALLMULTI)
                        atomic_inc(&local->iff_allmultis);
                else
                        atomic_dec(&local->iff_allmultis);
                sdata->flags ^= IEEE80211_SDATA_ALLMULTI;
        }

        spin_lock_bh(&local->filter_lock);
        __hw_addr_sync(&local->mc_list, &dev->mc, dev->addr_len);
        spin_unlock_bh(&local->filter_lock);
        wiphy_work_queue(local->hw.wiphy, &local->reconfig_filter);
}

/*
 * Called when the netdev is removed or, by the code below, before
 * the interface type changes.
 *
 * Releases the per-interface state: keys, the debugfs entries, the
 * fragment cache, mesh state (for mesh interfaces only), the links
 * and finally the default link.
 */
static void ieee80211_teardown_sdata(struct ieee80211_sub_if_data *sdata)
{
        /* free extra data */
        ieee80211_free_keys(sdata, false);

        ieee80211_debugfs_remove_netdev(sdata);

        ieee80211_destroy_frag_cache(&sdata->frags);

        if (ieee80211_vif_is_mesh(&sdata->vif))
                ieee80211_mesh_teardown_sdata(sdata);

        /* drop the links first, then stop the default link */
        ieee80211_vif_clear_links(sdata);
        ieee80211_link_stop(&sdata->deflink);
}

/*
 * ndo_uninit handler: tear down the sub-interface data that belongs
 * to @dev.
 */
static void ieee80211_uninit(struct net_device *dev)
{
        ieee80211_teardown_sdata(IEEE80211_DEV_TO_SUB_IF(dev));
}

/*
 * ndo_get_stats64 handler: fill @stats from the software statistics
 * kept in dev->tstats, using dev_fetch_sw_netstats().
 */
static void
ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
        dev_fetch_sw_netstats(stats, dev->tstats);
}

/*
 * ndo_setup_tc handler: pass the traffic-control setup request for @dev
 * on to the driver and return whatever drv_net_setup_tc() returns.
 */
static int ieee80211_netdev_setup_tc(struct net_device *dev,
                                     enum tc_setup_type type, void *type_data)
{
        struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);

        return drv_net_setup_tc(sdata->local, sdata, dev, type, type_data);
}

/*
 * netdev operations for data interfaces transmitting through
 * ieee80211_subif_start_xmit(). This is the table installed by
 * ieee80211_if_setup(), and the one ieee80211_set_vif_encap_ops()
 * selects when encapsulation offload is not enabled.
 */
static const struct net_device_ops ieee80211_dataif_ops = {
        .ndo_open                = ieee80211_open,
        .ndo_stop                = ieee80211_stop,
        .ndo_uninit                = ieee80211_uninit,
        .ndo_start_xmit                = ieee80211_subif_start_xmit,
        .ndo_set_rx_mode        = ieee80211_set_multicast_list,
        .ndo_set_mac_address         = ieee80211_change_mac,
        .ndo_get_stats64        = ieee80211_get_stats64,
        .ndo_setup_tc                = ieee80211_netdev_setup_tc,
};

/*
 * ndo_select_queue handler for monitor interfaces.
 *
 * Returns 0 when the hardware has fewer queues than access categories, or
 * when the frame cannot be parsed (radiotap header rejected, or the skb is
 * too short to hold the 802.11 header). Otherwise the queue is chosen from
 * the 802.11 header that follows the radiotap header.
 *
 * Note that the skb's tx info is zeroed here before the radiotap header
 * is parsed.
 */
static u16 ieee80211_monitor_select_queue(struct net_device *dev,
                                          struct sk_buff *skb,
                                          struct net_device *sb_dev)
{
        struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
        struct ieee80211_hdr *hdr;
        int rtap_len;

        if (local->hw.queues < IEEE80211_NUM_ACS)
                return 0;

        /* start from clean flags/info before the radiotap header is parsed */
        memset(info, 0, sizeof(*info));

        /* queue choice is irrelevant below: these frames get dropped anyway */
        if (!ieee80211_parse_tx_radiotap(skb, dev))
                return 0;

        rtap_len = ieee80211_get_radiotap_len(skb->data);
        hdr = (struct ieee80211_hdr *)(skb->data + rtap_len);

        /* frame_control (2 bytes) must be present before it is read */
        if (skb->len < rtap_len + 2)
                return 0;

        if (skb->len < rtap_len + ieee80211_hdrlen(hdr->frame_control))
                return 0;

        return ieee80211_select_queue_80211(sdata, skb, hdr);
}

/*
 * netdev operations for monitor interfaces: same open/stop/uninit
 * handlers as the data interfaces, but with the monitor transmit
 * path and the radiotap-aware queue selection.
 */
static const struct net_device_ops ieee80211_monitorif_ops = {
        .ndo_open                = ieee80211_open,
        .ndo_stop                = ieee80211_stop,
        .ndo_uninit                = ieee80211_uninit,
        .ndo_start_xmit                = ieee80211_monitor_start_xmit,
        .ndo_set_rx_mode        = ieee80211_set_multicast_list,
        .ndo_set_mac_address         = ieee80211_change_mac,
        .ndo_select_queue        = ieee80211_monitor_select_queue,
        .ndo_get_stats64        = ieee80211_get_stats64,
};

static int ieee80211_netdev_fill_forward_path(struct net_device_path_ctx *ctx,
                                              struct net_device_path *path)
{
        struct ieee80211_sub_if_data *sdata;
        struct ieee80211_local *local;
        struct sta_info *sta;
        int ret = -ENOENT;

        sdata = IEEE80211_DEV_TO_SUB_IF(ctx->dev);
        local = sdata->local;

        if (!local->ops->net_fill_forward_path)
                return -EOPNOTSUPP;

        rcu_read_lock();
        switch (sdata->vif.type) {
        case NL80211_IFTYPE_AP_VLAN:
                sta = rcu_dereference(sdata->u.vlan.sta);
                if (sta)
                        break;
                if (sdata->wdev.use_4addr)
                        goto out;
                if (is_multicast_ether_addr(ctx->daddr))
                        goto out;
                sta = sta_info_get_bss(sdata, ctx->daddr);
                break;
        case NL80211_IFTYPE_AP:
                if (is_multicast_ether_addr(ctx->daddr))
                        goto out;
                sta = sta_info_get(sdata, ctx->daddr);
                break;
        case NL80211_IFTYPE_STATION:
                if (sdata->wdev.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS) {
                        sta = sta_info_get(sdata, ctx->daddr);
                        if (sta && test_sta_flag(sta, WLAN_STA_TDLS_PEER)) {
                                if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH))
                                        goto out;

                                break;
                        }
                }

                sta = sta_info_get(sdata, sdata->deflink.u.mgd.bssid);
                break;
        default:
                goto out;
        }

        if (!sta)
                goto out;

        ret = drv_net_fill_forward_path(local, sdata, &sta->sta, ctx, path);
out:
        rcu_read_unlock();

        return ret;
}

/*
 * netdev operations selected by ieee80211_set_vif_encap_ops() when
 * encapsulation offload is enabled for the interface: transmit goes
 * through ieee80211_subif_start_xmit_8023() and forward-path lookup
 * is provided in addition to the regular data interface handlers.
 */
static const struct net_device_ops ieee80211_dataif_8023_ops = {
        .ndo_open                = ieee80211_open,
        .ndo_stop                = ieee80211_stop,
        .ndo_uninit                = ieee80211_uninit,
        .ndo_start_xmit                = ieee80211_subif_start_xmit_8023,
        .ndo_set_rx_mode        = ieee80211_set_multicast_list,
        .ndo_set_mac_address        = ieee80211_change_mac,
        .ndo_get_stats64        = ieee80211_get_stats64,
        .ndo_fill_forward_path        = ieee80211_netdev_fill_forward_path,
        .ndo_setup_tc                = ieee80211_netdev_setup_tc,
};

/*
 * Tell whether header (encap/decap) offload may be used on an interface
 * of type @iftype. Only AP and STATION qualify; P2P GO and client are
 * mapped to those types.
 */
static bool ieee80211_iftype_supports_hdr_offload(enum nl80211_iftype iftype)
{
        return iftype == NL80211_IFTYPE_AP ||
               iftype == NL80211_IFTYPE_STATION;
}

/*
 * Recompute the ENCAP/DECAP offload bits in sdata->vif.offload_flags from
 * the hardware capabilities, the interface type, the fragmentation
 * threshold and the presence of monitor interfaces.
 *
 * Returns true if the flags changed (in which case they are stored and
 * ieee80211_check_fast_rx_iface() is called), false if nothing changed.
 */
static bool ieee80211_set_sdata_offload_flags(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_local *local = sdata->local;
        u32 new_flags = sdata->vif.offload_flags;
        bool encap, decap;

        /* TX encapsulation offload */
        encap = ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD) &&
                ieee80211_iftype_supports_hdr_offload(sdata->vif.type);

        /* no offload if a frag threshold is set but hw can't fragment */
        if (encap &&
            !ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG) &&
            local->hw.wiphy->frag_threshold != (u32)-1)
                encap = false;

        /* any monitor interface disables TX encap offload */
        if (encap && local->monitors)
                encap = false;

        if (encap)
                new_flags |= IEEE80211_OFFLOAD_ENCAP_ENABLED;
        else
                new_flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED;

        /* RX decapsulation offload */
        decap = ieee80211_hw_check(&local->hw, SUPPORTS_RX_DECAP_OFFLOAD) &&
                ieee80211_iftype_supports_hdr_offload(sdata->vif.type);

        /* monitors are tolerated only with SUPPORTS_CONC_MON_RX_DECAP */
        if (decap && local->monitors &&
            !ieee80211_hw_check(&local->hw, SUPPORTS_CONC_MON_RX_DECAP))
                decap = false;

        if (decap)
                new_flags |= IEEE80211_OFFLOAD_DECAP_ENABLED;
        else
                new_flags &= ~IEEE80211_OFFLOAD_DECAP_ENABLED;

        if (new_flags == sdata->vif.offload_flags)
                return false;

        sdata->vif.offload_flags = new_flags;
        ieee80211_check_fast_rx_iface(sdata);

        return true;
}

/*
 * Pick the netdev_ops table for @sdata according to the encapsulation
 * offload state of its BSS interface (for an AP_VLAN that is the owning
 * AP, otherwise @sdata itself).
 *
 * Nothing is changed when an AP_VLAN has no BSS yet, when the hardware
 * lacks SUPPORTS_TX_ENCAP_OFFLOAD, or when the BSS interface type does not
 * support header offload.
 */
static void ieee80211_set_vif_encap_ops(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_sub_if_data *bss = sdata;
        bool use_8023;

        if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
                if (!sdata->bss)
                        return;

                bss = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap);
        }

        if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD))
                return;

        if (!ieee80211_iftype_supports_hdr_offload(bss->vif.type))
                return;

        use_8023 = bss->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_ENABLED;

        /* 4-address mode needs the 4ADDR offload flag on the BSS as well */
        if (sdata->wdev.use_4addr &&
            !(bss->vif.offload_flags & IEEE80211_OFFLOAD_ENCAP_4ADDR))
                use_8023 = false;

        if (use_8023)
                sdata->dev->netdev_ops = &ieee80211_dataif_8023_ops;
        else
                sdata->dev->netdev_ops = &ieee80211_dataif_ops;
}

/*
 * Re-evaluate the offload flags of @sdata. If they changed, notify the
 * driver and reselect the interface's netdev_ops. In all cases the
 * netdev_ops of every AP_VLAN whose BSS is this interface's AP data are
 * reselected too.
 */
static void ieee80211_recalc_sdata_offload(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_sub_if_data *vlan;

        if (ieee80211_set_sdata_offload_flags(sdata)) {
                drv_update_vif_offload(local, sdata);
                ieee80211_set_vif_encap_ops(sdata);
        }

        list_for_each_entry(vlan, &local->interfaces, list) {
                if (vlan->vif.type == NL80211_IFTYPE_AP_VLAN &&
                    vlan->bss == &sdata->u.ap)
                        ieee80211_set_vif_encap_ops(vlan);
        }
}

/*
 * Recalculate the offload state of every running interface on @local.
 * Does nothing when the hardware lacks SUPPORTS_TX_ENCAP_OFFLOAD;
 * otherwise the wiphy lock must be held.
 */
void ieee80211_recalc_offload(struct ieee80211_local *local)
{
        struct ieee80211_sub_if_data *iface;

        if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_ENCAP_OFFLOAD))
                return;

        lockdep_assert_wiphy(local->hw.wiphy);

        list_for_each_entry(iface, &local->interfaces, list) {
                if (ieee80211_sdata_running(iface))
                        ieee80211_recalc_sdata_offload(iface);
        }
}

/*
 * Add @offset to the local->fif_* counters that correspond to the monitor
 * flags set on @sdata. MONITOR_FLAG_CONTROL drives both fif_control and
 * fif_pspoll. Callers pass a signed @offset to account for the interface
 * or to undo that accounting.
 */
void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
                                    const int offset)
{
        struct ieee80211_local *local = sdata->local;
        u32 mntr_flags = sdata->u.mntr.flags;

        if (mntr_flags & MONITOR_FLAG_FCSFAIL)
                local->fif_fcsfail += offset;

        if (mntr_flags & MONITOR_FLAG_PLCPFAIL)
                local->fif_plcpfail += offset;

        if (mntr_flags & MONITOR_FLAG_CONTROL) {
                local->fif_control += offset;
                local->fif_pspoll += offset;
        }

        if (mntr_flags & MONITOR_FLAG_OTHER_BSS)
                local->fif_other_bss += offset;
}

/*
 * Give @sdata its default hardware queue mapping, one entry per access
 * category:
 *  - IEEE80211_INVAL_HW_QUEUE when the hardware sets QUEUE_CONTROL,
 *  - the access category index when the hardware has at least
 *    IEEE80211_NUM_ACS queues,
 *  - queue 0 otherwise.
 * The cab_queue is always set to IEEE80211_INVAL_HW_QUEUE.
 */
static void ieee80211_set_default_queues(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_local *local = sdata->local;
        int ac;

        for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
                if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) {
                        sdata->vif.hw_queue[ac] = IEEE80211_INVAL_HW_QUEUE;
                        continue;
                }

                sdata->vif.hw_queue[ac] =
                        local->hw.queues >= IEEE80211_NUM_ACS ? ac : 0;
        }

        sdata->vif.cab_queue = IEEE80211_INVAL_HW_QUEUE;
}

/*
 * Basic initialisation of @sdata: record the owning @local and set up
 * the embedded default link, backed by vif.bss_conf.
 */
static void ieee80211_sdata_init(struct ieee80211_local *local,
                                 struct ieee80211_sub_if_data *sdata)
{
        sdata->local = local;

        /*
         * Initialize the default link, so we can use link_id 0 for non-MLD,
         * and that continues to work for non-MLD-aware drivers that use just
         * vif.bss_conf instead of vif.link_conf.
         *
         * Note that we never change this, so if link ID 0 isn't used in an
         * MLD connection, we get a separate allocation for it.
         */
        ieee80211_link_init(sdata, -1, &sdata->deflink, &sdata->vif.bss_conf);
}

int ieee80211_add_virtual_monitor(struct ieee80211_local *local)
{
        struct ieee80211_sub_if_data *sdata;
        int ret;

        ASSERT_RTNL();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (local->monitor_sdata)
                return 0;

        sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, GFP_KERNEL);
        if (!sdata)
                return -ENOMEM;

        /* set up data */
        sdata->vif.type = NL80211_IFTYPE_MONITOR;
        snprintf(sdata->name, IFNAMSIZ, "%s-monitor",
                 wiphy_name(local->hw.wiphy));
        sdata->wdev.iftype = NL80211_IFTYPE_MONITOR;
        sdata->wdev.wiphy = local->hw.wiphy;

        ieee80211_sdata_init(local, sdata);

        ieee80211_set_default_queues(sdata);

        if (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) {
                ret = drv_add_interface(local, sdata);
                if (WARN_ON(ret)) {
                        /* ok .. stupid driver, it asked for this! */
                        kfree(sdata);
                        return ret;
                }
        }

        set_bit(SDATA_STATE_RUNNING, &sdata->state);

        ret = ieee80211_check_queues(sdata, NL80211_IFTYPE_MONITOR);
        if (ret) {
                kfree(sdata);
                return ret;
        }

        mutex_lock(&local->iflist_mtx);
        rcu_assign_pointer(local->monitor_sdata, sdata);
        mutex_unlock(&local->iflist_mtx);

        ret = ieee80211_link_use_channel(&sdata->deflink, &local->monitor_chanreq,
                                         IEEE80211_CHANCTX_EXCLUSIVE);
        if (ret) {
                mutex_lock(&local->iflist_mtx);
                RCU_INIT_POINTER(local->monitor_sdata, NULL);
                mutex_unlock(&local->iflist_mtx);
                synchronize_net();
                drv_remove_interface(local, sdata);
                kfree(sdata);
                return ret;
        }

        skb_queue_head_init(&sdata->skb_queue);
        skb_queue_head_init(&sdata->status_queue);
        wiphy_work_init(&sdata->work, ieee80211_iface_work);

        return 0;
}

/*
 * Remove the internal virtual monitor interface of @local, if there is
 * one.
 *
 * The pointer is unpublished under iflist_mtx, then after
 * synchronize_net() the channel is released, the interface is removed
 * from the driver (only when the hardware sets WANT_MONITOR_VIF) and the
 * sdata is freed.
 *
 * The caller must hold the RTNL and the wiphy lock.
 */
void ieee80211_del_virtual_monitor(struct ieee80211_local *local)
{
        struct ieee80211_sub_if_data *sdata;

        ASSERT_RTNL();
        lockdep_assert_wiphy(local->hw.wiphy);

        mutex_lock(&local->iflist_mtx);
        sdata = rcu_dereference_protected(local->monitor_sdata,
                                          lockdep_is_held(&local->iflist_mtx));
        if (sdata)
                RCU_INIT_POINTER(local->monitor_sdata, NULL);
        mutex_unlock(&local->iflist_mtx);

        /* nothing to tear down */
        if (!sdata)
                return;

        synchronize_net();

        ieee80211_link_release_channel(&sdata->deflink);

        if (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF))
                drv_remove_interface(local, sdata);

        kfree(sdata);
}

/*
 * Bring the interface behind @wdev up.
 *
 * @coming_up: when true, non-VLAN/non-monitor interfaces are added to the
 *        driver and have their queues checked, and local->open_count is
 *        incremented.
 *
 * The wiphy lock must be held.
 *
 * Returns 0 on success, otherwise a negative error: -ENOLINK for an
 * AP_VLAN without a BSS, -EADDRNOTAVAIL when no valid MAC address is
 * available, or the error from drv_start(), drv_add_interface(),
 * ieee80211_add_virtual_monitor() or ieee80211_check_queues().
 *
 * NOTE: Be very careful when changing this function, it must NOT return
 * an error on interface type changes that have been pre-checked, so most
 * checks should be in ieee80211_check_concurrent_iface.
 */
int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
{
        struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
        struct net_device *dev = wdev->netdev;
        struct ieee80211_local *local = sdata->local;
        u64 changed = 0;
        int res;
        u32 hw_reconf_flags = 0;

        lockdep_assert_wiphy(local->hw.wiphy);

        /* per-type preparation that does not involve the driver yet */
        switch (sdata->vif.type) {
        case NL80211_IFTYPE_AP_VLAN: {
                struct ieee80211_sub_if_data *master;

                if (!sdata->bss)
                        return -ENOLINK;

                list_add(&sdata->u.vlan.list, &sdata->bss->vlans);

                /* inherit control port, queue and channel settings from the AP */
                master = container_of(sdata->bss,
                                      struct ieee80211_sub_if_data, u.ap);
                sdata->control_port_protocol =
                        master->control_port_protocol;
                sdata->control_port_no_encrypt =
                        master->control_port_no_encrypt;
                sdata->control_port_over_nl80211 =
                        master->control_port_over_nl80211;
                sdata->control_port_no_preauth =
                        master->control_port_no_preauth;
                sdata->vif.cab_queue = master->vif.cab_queue;
                memcpy(sdata->vif.hw_queue, master->vif.hw_queue,
                       sizeof(sdata->vif.hw_queue));
                sdata->vif.bss_conf.chanreq = master->vif.bss_conf.chanreq;

                sdata->crypto_tx_tailroom_needed_cnt +=
                        master->crypto_tx_tailroom_needed_cnt;

                break;
                }
        case NL80211_IFTYPE_AP:
                sdata->bss = &sdata->u.ap;
                break;
        case NL80211_IFTYPE_MESH_POINT:
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_MONITOR:
        case NL80211_IFTYPE_ADHOC:
        case NL80211_IFTYPE_P2P_DEVICE:
        case NL80211_IFTYPE_OCB:
        case NL80211_IFTYPE_NAN:
                /* no special treatment */
                break;
        case NL80211_IFTYPE_UNSPECIFIED:
        case NUM_NL80211_IFTYPES:
        case NL80211_IFTYPE_P2P_CLIENT:
        case NL80211_IFTYPE_P2P_GO:
        case NL80211_IFTYPE_WDS:
                /* cannot happen */
                WARN_ON(1);
                break;
        }

        /* first interface to open: start the hardware */
        if (local->open_count == 0) {
                /* here we can consider everything in good order (again) */
                local->reconfig_failure = false;

                res = drv_start(local);
                if (res)
                        goto err_del_bss;
                ieee80211_led_radio(local, true);
                ieee80211_mod_tpt_led_trig(local,
                                           IEEE80211_TPT_LEDTRIG_FL_RADIO, 0);
        }

        /*
         * Copy the hopefully now-present MAC address to
         * this interface, if it has the special null one.
         */
        if (dev && is_zero_ether_addr(dev->dev_addr)) {
                eth_hw_addr_set(dev, local->hw.wiphy->perm_addr);
                memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN);

                if (!is_valid_ether_addr(dev->dev_addr)) {
                        res = -EADDRNOTAVAIL;
                        goto err_stop;
                }
        }

        /* per-type bring-up, including telling the driver where needed */
        switch (sdata->vif.type) {
        case NL80211_IFTYPE_AP_VLAN:
                /* no need to tell driver, but set carrier and chanctx */
                if (sdata->bss->active) {
                        ieee80211_link_vlan_copy_chanctx(&sdata->deflink);
                        netif_carrier_on(dev);
                        ieee80211_set_vif_encap_ops(sdata);
                } else {
                        netif_carrier_off(dev);
                }
                break;
        case NL80211_IFTYPE_MONITOR:
                /* cooked monitors are only counted, nothing else is set up */
                if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) {
                        local->cooked_mntrs++;
                        break;
                }

                /*
                 * Active monitors are added to the driver directly; for
                 * others the virtual monitor is created when this is the
                 * first monitor and no interface is open yet.
                 */
                if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) {
                        res = drv_add_interface(local, sdata);
                        if (res)
                                goto err_stop;
                } else if (local->monitors == 0 && local->open_count == 0) {
                        res = ieee80211_add_virtual_monitor(local);
                        if (res)
                                goto err_stop;
                }

                /* must be before the call to ieee80211_configure_filter */
                local->monitors++;
                if (local->monitors == 1) {
                        local->hw.conf.flags |= IEEE80211_CONF_MONITOR;
                        hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR;
                }

                ieee80211_adjust_monitor_flags(sdata, 1);
                ieee80211_configure_filter(local);
                ieee80211_recalc_offload(local);
                ieee80211_recalc_idle(local);

                netif_carrier_on(dev);
                break;
        default:
                if (coming_up) {
                        /* a real interface replaces the virtual monitor */
                        ieee80211_del_virtual_monitor(local);
                        ieee80211_set_sdata_offload_flags(sdata);

                        res = drv_add_interface(local, sdata);
                        if (res)
                                goto err_stop;

                        ieee80211_set_vif_encap_ops(sdata);
                        res = ieee80211_check_queues(sdata,
                                ieee80211_vif_type_p2p(&sdata->vif));
                        if (res)
                                goto err_del_interface;
                }

                /* account for the frame filters this interface type needs */
                if (sdata->vif.type == NL80211_IFTYPE_AP) {
                        local->fif_pspoll++;
                        local->fif_probe_req++;

                        ieee80211_configure_filter(local);
                } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
                        local->fif_probe_req++;
                }

                if (sdata->vif.probe_req_reg)
                        drv_config_iface_filter(local, sdata,
                                                FIF_PROBE_REQ,
                                                FIF_PROBE_REQ);

                /* ERP info is not reset for P2P_DEVICE and NAN interfaces */
                if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
                    sdata->vif.type != NL80211_IFTYPE_NAN)
                        changed |= ieee80211_reset_erp_info(sdata);
                ieee80211_link_info_change_notify(sdata, &sdata->deflink,
                                                  changed);

                switch (sdata->vif.type) {
                case NL80211_IFTYPE_STATION:
                case NL80211_IFTYPE_ADHOC:
                case NL80211_IFTYPE_AP:
                case NL80211_IFTYPE_MESH_POINT:
                case NL80211_IFTYPE_OCB:
                        netif_carrier_off(dev);
                        break;
                case NL80211_IFTYPE_P2P_DEVICE:
                case NL80211_IFTYPE_NAN:
                        break;
                default:
                        /* not reached */
                        WARN_ON(1);
                }

                /*
                 * Set default queue parameters so drivers don't
                 * need to initialise the hardware if the hardware
                 * doesn't start up with sane defaults.
                 * Enable QoS for anything but station interfaces.
                 */
                ieee80211_set_wmm_default(&sdata->deflink, true,
                        sdata->vif.type != NL80211_IFTYPE_STATION);
        }

        /* publish the interface on the per-type pointers/lists */
        switch (sdata->vif.type) {
        case NL80211_IFTYPE_P2P_DEVICE:
                rcu_assign_pointer(local->p2p_sdata, sdata);
                break;
        case NL80211_IFTYPE_MONITOR:
                if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES)
                        break;
                list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list);
                break;
        default:
                break;
        }

        /*
         * set_multicast_list will be invoked by the networking core
         * which will check whether any increments here were done in
         * error and sync them down to the hardware as filter flags.
         */
        if (sdata->flags & IEEE80211_SDATA_ALLMULTI)
                atomic_inc(&local->iff_allmultis);

        if (coming_up)
                local->open_count++;

        /* full hw config for the first interface, else only what changed */
        if (local->open_count == 1)
                ieee80211_hw_conf_init(local);
        else if (hw_reconf_flags)
                ieee80211_hw_config(local, hw_reconf_flags);

        ieee80211_recalc_ps(local);

        set_bit(SDATA_STATE_RUNNING, &sdata->state);

        return 0;
        /* error unwinding: each label undoes one more earlier step */
 err_del_interface:
        drv_remove_interface(local, sdata);
 err_stop:
        /* stop the hardware again only if no interface is open */
        if (!local->open_count)
                drv_stop(local);
 err_del_bss:
        sdata->bss = NULL;
        if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
                list_del(&sdata->u.vlan.list);
        /* might already be clear but that doesn't matter */
        clear_bit(SDATA_STATE_RUNNING, &sdata->state);
        return res;
}

/*
 * netdev priv_destructor (installed by ieee80211_if_setup()): release
 * the per-CPU statistics in dev->tstats.
 */
static void ieee80211_if_free(struct net_device *dev)
{
        free_percpu(dev->tstats);
}

/*
 * Netdev setup routine for mac80211 interfaces: start from the Ethernet
 * defaults, then clear IFF_TX_SKB_SHARING, set IFF_NO_QUEUE, install the
 * default data interface ops and arrange for ieee80211_if_free() to run
 * as the private destructor, with the netdev itself freed as well
 * (needs_free_netdev).
 */
static void ieee80211_if_setup(struct net_device *dev)
{
        /* Ethernet defaults first; everything below adjusts them */
        ether_setup(dev);

        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
        dev->priv_flags |= IFF_NO_QUEUE;

        dev->priv_destructor = ieee80211_if_free;
        dev->needs_free_netdev = true;

        dev->netdev_ops = &ieee80211_dataif_ops;
}

/*
 * Process one frame taken from the interface's skb_queue (called from
 * ieee80211_iface_work() for everything that is not a TDLS frame).
 *
 * The frame is dispatched on its frame_control / action category:
 * block-ack, VHT, S1G and protected EHT action frames, extension frames,
 * QoS data frames, and finally - for anything else - the per-interface-type
 * queued management frame handlers.
 *
 * The wiphy lock must be held. The skb is not freed here; the caller in
 * ieee80211_iface_work() frees it after this function returns.
 */
static void ieee80211_iface_process_skb(struct ieee80211_local *local,
                                        struct ieee80211_sub_if_data *sdata,
                                        struct sk_buff *skb)
{
        struct ieee80211_mgmt *mgmt = (void *)skb->data;

        lockdep_assert_wiphy(local->hw.wiphy);

        if (ieee80211_is_action(mgmt->frame_control) &&
            mgmt->u.action.category == WLAN_CATEGORY_BACK) {
                /* block-ack action frames, only from a known station */
                struct sta_info *sta;
                int len = skb->len;

                sta = sta_info_get_bss(sdata, mgmt->sa);
                if (sta) {
                        switch (mgmt->u.action.u.addba_req.action_code) {
                        case WLAN_ACTION_ADDBA_REQ:
                                ieee80211_process_addba_request(local, sta,
                                                                mgmt, len);
                                break;
                        case WLAN_ACTION_ADDBA_RESP:
                                ieee80211_process_addba_resp(local, sta,
                                                             mgmt, len);
                                break;
                        case WLAN_ACTION_DELBA:
                                ieee80211_process_delba(sdata, sta,
                                                        mgmt, len);
                                break;
                        default:
                                WARN_ON(1);
                                break;
                        }
                }
        } else if (ieee80211_is_action(mgmt->frame_control) &&
                   mgmt->u.action.category == WLAN_CATEGORY_VHT) {
                /* VHT action frames: operating mode and group ID management */
                switch (mgmt->u.action.u.vht_group_notif.action_code) {
                case WLAN_VHT_ACTION_OPMODE_NOTIF: {
                        struct ieee80211_rx_status *status;
                        enum nl80211_band band;
                        struct sta_info *sta;
                        u8 opmode;

                        status = IEEE80211_SKB_RXCB(skb);
                        band = status->band;
                        opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode;

                        sta = sta_info_get_bss(sdata, mgmt->sa);

                        if (sta)
                                ieee80211_vht_handle_opmode(sdata,
                                                            &sta->deflink,
                                                            opmode, band);

                        break;
                }
                case WLAN_VHT_ACTION_GROUPID_MGMT:
                        ieee80211_process_mu_groups(sdata, &sdata->deflink,
                                                    mgmt);
                        break;
                default:
                        WARN_ON(1);
                        break;
                }
        } else if (ieee80211_is_action(mgmt->frame_control) &&
                   mgmt->u.action.category == WLAN_CATEGORY_S1G) {
                /* S1G action frames: only TWT setup/teardown is handled */
                switch (mgmt->u.action.u.s1g.action_code) {
                case WLAN_S1G_TWT_TEARDOWN:
                case WLAN_S1G_TWT_SETUP:
                        ieee80211_s1g_rx_twt_action(sdata, skb);
                        break;
                default:
                        break;
                }
        } else if (ieee80211_is_action(mgmt->frame_control) &&
                   mgmt->u.action.category == WLAN_CATEGORY_PROTECTED_EHT) {
                /* negotiated TTLM request/response, station interfaces only */
                if (sdata->vif.type == NL80211_IFTYPE_STATION) {
                        switch (mgmt->u.action.u.ttlm_req.action_code) {
                        case WLAN_PROTECTED_EHT_ACTION_TTLM_REQ:
                                ieee80211_process_neg_ttlm_req(sdata, mgmt,
                                                               skb->len);
                                break;
                        case WLAN_PROTECTED_EHT_ACTION_TTLM_RES:
                                ieee80211_process_neg_ttlm_res(sdata, mgmt,
                                                               skb->len);
                                break;
                        default:
                                break;
                        }
                }
        } else if (ieee80211_is_ext(mgmt->frame_control)) {
                /* extension frames are only expected on station interfaces */
                if (sdata->vif.type == NL80211_IFTYPE_STATION)
                        ieee80211_sta_rx_queued_ext(sdata, skb);
                else
                        WARN_ON(1);
        } else if (ieee80211_is_data_qos(mgmt->frame_control)) {
                struct ieee80211_hdr *hdr = (void *)mgmt;
                struct sta_info *sta;

                /*
                 * So the frame isn't mgmt, but frame_control
                 * is at the right place anyway, of course, so
                 * the if statement is correct.
                 *
                 * Warn if we have other data frame types here,
                 * they must not get here.
                 */
                WARN_ON(hdr->frame_control &
                                cpu_to_le16(IEEE80211_STYPE_NULLFUNC));
                WARN_ON(!(hdr->seq_ctrl &
                                cpu_to_le16(IEEE80211_SCTL_FRAG)));
                /*
                 * This was a fragment of a frame, received while
                 * a block-ack session was active. That cannot be
                 * right, so terminate the session.
                 */
                sta = sta_info_get_bss(sdata, mgmt->sa);
                if (sta) {
                        u16 tid = ieee80211_get_tid(hdr);

                        __ieee80211_stop_rx_ba_session(
                                sta, tid, WLAN_BACK_RECIPIENT,
                                WLAN_REASON_QSTA_REQUIRE_SETUP,
                                true);
                }
        } else switch (sdata->vif.type) {
        /* everything else goes to the per-interface-type mgmt handler */
        case NL80211_IFTYPE_STATION:
                ieee80211_sta_rx_queued_mgmt(sdata, skb);
                break;
        case NL80211_IFTYPE_ADHOC:
                ieee80211_ibss_rx_queued_mgmt(sdata, skb);
                break;
        case NL80211_IFTYPE_MESH_POINT:
                if (!ieee80211_vif_is_mesh(&sdata->vif))
                        break;
                ieee80211_mesh_rx_queued_mgmt(sdata, skb);
                break;
        default:
                WARN(1, "frame for unexpected interface type");
                break;
        }
}

/*
 * Process one frame taken from the interface's status_queue. Only S1G
 * action frames carrying a TWT setup or teardown are acted upon;
 * everything else is ignored.
 */
static void ieee80211_iface_process_status(struct ieee80211_sub_if_data *sdata,
                                           struct sk_buff *skb)
{
        struct ieee80211_mgmt *mgmt = (void *)skb->data;

        if (!ieee80211_is_action(mgmt->frame_control))
                return;

        if (mgmt->u.action.category != WLAN_CATEGORY_S1G)
                return;

        switch (mgmt->u.action.u.s1g.action_code) {
        case WLAN_S1G_TWT_TEARDOWN:
        case WLAN_S1G_TWT_SETUP:
                ieee80211_s1g_status_twt_action(sdata, skb);
                break;
        default:
                break;
        }
}

/*
 * Per-interface wiphy work handler.
 *
 * Does nothing if the interface is not running, if a software scan is
 * in progress, or if ieee80211_can_run_worker() refuses. Otherwise it
 * drains the queued frames, then the status queue, and finally calls
 * the work function matching the interface type.
 */
static void ieee80211_iface_work(struct wiphy *wiphy, struct wiphy_work *work)
{
        struct ieee80211_sub_if_data *sdata =
                container_of(work, struct ieee80211_sub_if_data, work);
        struct ieee80211_local *local = sdata->local;
        struct sk_buff *skb;

        /* checks are evaluated in order; the first one that hits wins */
        if (!ieee80211_sdata_running(sdata) ||
            test_bit(SCAN_SW_SCANNING, &local->scanning) ||
            !ieee80211_can_run_worker(local))
                return;

        /* queued frames are handled first */
        for (;;) {
                skb = skb_dequeue(&sdata->skb_queue);
                if (!skb)
                        break;

                kcov_remote_start_common(skb_get_kcov_handle(skb));

                /* TDLS frames go to the channel-switch handler */
                if (skb->protocol != cpu_to_be16(ETH_P_TDLS))
                        ieee80211_iface_process_skb(local, sdata, skb);
                else
                        ieee80211_process_tdls_channel_switch(sdata, skb);

                kfree_skb(skb);
                kcov_remote_stop();
        }

        /* next, everything waiting on the status queue */
        for (;;) {
                skb = skb_dequeue(&sdata->status_queue);
                if (!skb)
                        break;

                kcov_remote_start_common(skb_get_kcov_handle(skb));

                ieee80211_iface_process_status(sdata, skb);
                kfree_skb(skb);

                kcov_remote_stop();
        }

        /* finally, the work that depends on the interface type */
        switch (sdata->vif.type) {
        case NL80211_IFTYPE_STATION:
                ieee80211_sta_work(sdata);
                break;
        case NL80211_IFTYPE_ADHOC:
                ieee80211_ibss_work(sdata);
                break;
        case NL80211_IFTYPE_MESH_POINT:
                if (ieee80211_vif_is_mesh(&sdata->vif))
                        ieee80211_mesh_work(sdata);
                break;
        case NL80211_IFTYPE_OCB:
                ieee80211_ocb_work(sdata);
                break;
        default:
                break;
        }
}

/*
 * Wiphy work handler that applies the interface's desired_active_links
 * via ieee80211_set_active_links() and then clears the request.
 *
 * Nothing is done (and the request is left in place) while
 * local->in_reconfig is set.
 */
static void ieee80211_activate_links_work(struct wiphy *wiphy,
                                          struct wiphy_work *work)
{
        struct ieee80211_local *local = wiphy_priv(wiphy);
        struct ieee80211_sub_if_data *sdata;

        if (local->in_reconfig)
                return;

        sdata = container_of(work, struct ieee80211_sub_if_data,
                             activate_links_work);

        ieee80211_set_active_links(&sdata->vif, sdata->desired_active_links);
        sdata->desired_active_links = 0;
}

/*
 * Helper function to initialise an interface to a specific type.
 *
 * Clears the type-dependent unions, resets the per-interface defaults
 * (control port settings, idle state, TX power, noack map), (re)initialises
 * the frame/status queues and the work items, and then performs the setup
 * that is specific to @type.
 *
 * The P2P types are mapped onto their base types: P2P_GO is stored as AP
 * and P2P_CLIENT as STATION in vif.type, with vif.p2p set to true.
 * wdev.iftype keeps the type that was passed in.
 *
 * @sdata: interface to initialise
 * @type: requested interface type
 */
static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
                                  enum nl80211_iftype type)
{
        /* all-ones (broadcast) address, used as BSSID for OCB */
        static const u8 bssid_wildcard[ETH_ALEN] = {0xff, 0xff, 0xff,
                                                    0xff, 0xff, 0xff};

        /* clear type-dependent unions */
        memset(&sdata->u, 0, sizeof(sdata->u));
        memset(&sdata->deflink.u, 0, sizeof(sdata->deflink.u));

        /* and set some type-dependent values */
        sdata->vif.type = type;
        sdata->vif.p2p = false;
        sdata->wdev.iftype = type;

        /* control port defaults: EAPOL (ETH_P_PAE), all options off */
        sdata->control_port_protocol = cpu_to_be16(ETH_P_PAE);
        sdata->control_port_no_encrypt = false;
        sdata->control_port_over_nl80211 = false;
        sdata->control_port_no_preauth = false;
        sdata->vif.cfg.idle = true;
        sdata->vif.bss_conf.txpower = INT_MIN; /* unset */

        sdata->noack_map = 0;

        /* only monitor/p2p-device differ */
        if (sdata->dev) {
                sdata->dev->netdev_ops = &ieee80211_dataif_ops;
                sdata->dev->type = ARPHRD_ETHER;
        }

        /* queues and work items consumed by ieee80211_iface_work() etc. */
        skb_queue_head_init(&sdata->skb_queue);
        skb_queue_head_init(&sdata->status_queue);
        wiphy_work_init(&sdata->work, ieee80211_iface_work);
        wiphy_work_init(&sdata->activate_links_work,
                        ieee80211_activate_links_work);
        wiphy_delayed_work_init(&sdata->dfs_cac_timer_work,
                                ieee80211_dfs_cac_timer_work);

        switch (type) {
        case NL80211_IFTYPE_P2P_GO:
                /* a P2P GO is an AP with the p2p flag set */
                type = NL80211_IFTYPE_AP;
                sdata->vif.type = type;
                sdata->vif.p2p = true;
                fallthrough;
        case NL80211_IFTYPE_AP:
                skb_queue_head_init(&sdata->u.ap.ps.bc_buf);
                INIT_LIST_HEAD(&sdata->u.ap.vlans);
                /* the BSSID is the interface's own address */
                sdata->vif.bss_conf.bssid = sdata->vif.addr;
                break;
        case NL80211_IFTYPE_P2P_CLIENT:
                /* a P2P client is a station with the p2p flag set */
                type = NL80211_IFTYPE_STATION;
                sdata->vif.type = type;
                sdata->vif.p2p = true;
                fallthrough;
        case NL80211_IFTYPE_STATION:
                sdata->vif.bss_conf.bssid = sdata->deflink.u.mgd.bssid;
                ieee80211_sta_setup_sdata(sdata);
                break;
        case NL80211_IFTYPE_OCB:
                sdata->vif.bss_conf.bssid = bssid_wildcard;
                ieee80211_ocb_setup_sdata(sdata);
                break;
        case NL80211_IFTYPE_ADHOC:
                sdata->vif.bss_conf.bssid = sdata->u.ibss.bssid;
                ieee80211_ibss_setup_sdata(sdata);
                break;
        case NL80211_IFTYPE_MESH_POINT:
                if (ieee80211_vif_is_mesh(&sdata->vif))
                        ieee80211_mesh_init_sdata(sdata);
                break;
        case NL80211_IFTYPE_MONITOR:
                /*
                 * Overrides the netdev defaults set above.
                 * NOTE(review): sdata->dev is dereferenced without the
                 * NULL check used earlier; presumably monitor interfaces
                 * always have a netdev - confirm against callers.
                 */
                sdata->dev->type = ARPHRD_IEEE80211_RADIOTAP;
                sdata->dev->netdev_ops = &ieee80211_monitorif_ops;
                sdata->u.mntr.flags = MONITOR_FLAG_CONTROL |
                                      MONITOR_FLAG_OTHER_BSS;
                break;
        case NL80211_IFTYPE_NAN:
                idr_init(&sdata->u.nan.function_inst_ids);
                spin_lock_init(&sdata->u.nan.func_lock);
                sdata->vif.bss_conf.bssid = sdata->vif.addr;
                break;
        case NL80211_IFTYPE_AP_VLAN:
        case NL80211_IFTYPE_P2P_DEVICE:
                sdata->vif.bss_conf.bssid = sdata->vif.addr;
                break;
        case NL80211_IFTYPE_UNSPECIFIED:
        case NL80211_IFTYPE_WDS:
        case NUM_NL80211_IFTYPES:
                /* these types are not expected here */
                WARN_ON(1);
                break;
        }

        /* need to do this after the switch so vif.type is correct */
        ieee80211_link_setup(&sdata->deflink);

        ieee80211_debugfs_recreate_netdev(sdata, false);
}

/*
 * Change the type of an interface by stopping it, asking the driver to
 * change the interface type, and opening it again.
 *
 * @sdata: interface to change
 * @type: requested new type (may be a P2P type)
 *
 * Must be called with the RTNL held.
 *
 * Returns -EBUSY if the driver has no change_interface op, if the vif is
 * an MLD, if the current type is not AP (without VLANs), STATION, ADHOC
 * or OCB, or if the requested type is not one of AP, STATION, ADHOC, OCB,
 * P2P_CLIENT or P2P_GO. Otherwise returns the error from
 * ieee80211_check_concurrent_iface() or the result of
 * drv_change_interface(). If the driver call fails, the interface is
 * set up and reopened with its previous type and the driver's error is
 * returned.
 */
static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata,
                                           enum nl80211_iftype type)
{
        struct ieee80211_local *local = sdata->local;
        int ret, err;
        /* base (non-P2P) type handed to the driver together with @p2p */
        enum nl80211_iftype internal_type = type;
        bool p2p = false;

        ASSERT_RTNL();

        if (!local->ops->change_interface)
                return -EBUSY;

        /* for now, don't support changing while links exist */
        if (ieee80211_vif_is_mld(&sdata->vif))
                return -EBUSY;

        /* which current types may be changed away from */
        switch (sdata->vif.type) {
        case NL80211_IFTYPE_AP:
                /* an AP that still has VLAN interfaces cannot be changed */
                if (!list_empty(&sdata->u.ap.vlans))
                        return -EBUSY;
                break;
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_ADHOC:
        case NL80211_IFTYPE_OCB:
                /*
                 * Could maybe also all others here?
                 * Just not sure how that interacts
                 * with the RX/config path e.g. for
                 * mesh.
                 */
                break;
        default:
                return -EBUSY;
        }

        /* which target types are accepted; P2P types map to a base type */
        switch (type) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_ADHOC:
        case NL80211_IFTYPE_OCB:
                /*
                 * Could probably support everything
                 * but here.
                 */
                break;
        case NL80211_IFTYPE_P2P_CLIENT:
                p2p = true;
                internal_type = NL80211_IFTYPE_STATION;
                break;
        case NL80211_IFTYPE_P2P_GO:
                p2p = true;
                internal_type = NL80211_IFTYPE_AP;
                break;
        default:
                return -EBUSY;
        }

        ret = ieee80211_check_concurrent_iface(sdata, internal_type);
        if (ret)
                return ret;

        /* queues stay stopped until the interface has been reopened below */
        ieee80211_stop_vif_queues(local, sdata,
                                  IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE);
        /* do_stop will synchronize_rcu() first thing */
        ieee80211_do_stop(sdata, false);

        ieee80211_teardown_sdata(sdata);

        ieee80211_set_sdata_offload_flags(sdata);
        ret = drv_change_interface(local, sdata, internal_type, p2p);
        /* on driver failure fall back to the type the vif currently has */
        if (ret)
                type = ieee80211_vif_type_p2p(&sdata->vif);

        /*
         * Ignore return value here, there's not much we can do since
         * the driver changed the interface type internally already.
         * The warnings will hopefully make driver authors fix it :-)
         */
        ieee80211_check_queues(sdata, type);

        ieee80211_setup_sdata(sdata, type);
        ieee80211_set_vif_encap_ops(sdata);

        /* a reopen failure is only warned about; ret is not affected */
        err = ieee80211_do_open(&sdata->wdev, false);
        WARN(err, "type change: do_open returned %d", err);

        ieee80211_wake_vif_queues(local, sdata,
                                  IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE);
        return ret;
}

/*
 * Change the type of an interface.
 *
 * @sdata: interface to change
 * @type: requested new type
 *
 * Must be called with the RTNL held. Returns 0 when the interface
 * already has the requested (P2P-aware) type or the change succeeded;
 * for a running interface a failure of
 * ieee80211_runtime_change_iftype() is passed back to the caller.
 */
int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
                             enum nl80211_iftype type)
{
        ASSERT_RTNL();

        /* nothing to do if the type does not actually change */
        if (ieee80211_vif_type_p2p(&sdata->vif) == type)
                return 0;

        if (!ieee80211_sdata_running(sdata)) {
                /* Interface is down: purge and reset type-dependent state. */
                ieee80211_teardown_sdata(sdata);
                ieee80211_setup_sdata(sdata, type);
        } else {
                int ret = ieee80211_runtime_change_iftype(sdata, type);

                if (ret)
                        return ret;
        }

        /* reset some values that shouldn't be kept across type changes */
        if (type == NL80211_IFTYPE_STATION)
                sdata->u.mgd.use_4addr = false;

        return 0;
}

/*
 * Choose a permanent MAC address for a new interface of the given type.
 *
 * @local: the device the interface is being created on
 * @perm_addr: output buffer of ETH_ALEN bytes receiving the address
 * @type: type of the interface being created
 *
 * The wiphy's perm_addr is always written first as the default. If the
 * wiphy has an address mask or more than one address, an address not used
 * by any existing interface is looked for: first in the wiphy's address
 * list, then by iterating over the bits covered by a contiguous address
 * mask. Monitor interfaces keep the default, AP_VLAN interfaces take the
 * address of the first AP interface found, and P2P client/GO interfaces
 * may reuse the address of a running P2P device interface when the
 * hardware sets P2P_DEV_ADDR_FOR_INTF.
 *
 * Requires the wiphy lock to be held (lockdep_assert_wiphy()).
 */
static void ieee80211_assign_perm_addr(struct ieee80211_local *local,
                                       u8 *perm_addr, enum nl80211_iftype type)
{
        struct ieee80211_sub_if_data *sdata;
        u64 mask, start, addr, val, inc;
        u8 *m;
        u8 tmp_addr[ETH_ALEN];
        int i;

        lockdep_assert_wiphy(local->hw.wiphy);

        /* default ... something at least */
        memcpy(perm_addr, local->hw.wiphy->perm_addr, ETH_ALEN);

        /* no mask and at most one address: there is nothing to choose from */
        if (is_zero_ether_addr(local->hw.wiphy->addr_mask) &&
            local->hw.wiphy->n_addresses <= 1)
                return;

        switch (type) {
        case NL80211_IFTYPE_MONITOR:
                /* doesn't matter */
                break;
        case NL80211_IFTYPE_AP_VLAN:
                /* match up with an AP interface */
                list_for_each_entry(sdata, &local->interfaces, list) {
                        if (sdata->vif.type != NL80211_IFTYPE_AP)
                                continue;
                        memcpy(perm_addr, sdata->vif.addr, ETH_ALEN);
                        break;
                }
                /* keep default if no AP interface present */
                break;
        case NL80211_IFTYPE_P2P_CLIENT:
        case NL80211_IFTYPE_P2P_GO:
                /* reuse the address of a running P2P device interface */
                if (ieee80211_hw_check(&local->hw, P2P_DEV_ADDR_FOR_INTF)) {
                        list_for_each_entry(sdata, &local->interfaces, list) {
                                if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE)
                                        continue;
                                if (!ieee80211_sdata_running(sdata))
                                        continue;
                                memcpy(perm_addr, sdata->vif.addr, ETH_ALEN);
                                return;
                        }
                }
                fallthrough;
        default:
                /* assign a new address if possible -- try n_addresses first */
                for (i = 0; i < local->hw.wiphy->n_addresses; i++) {
                        bool used = false;

                        list_for_each_entry(sdata, &local->interfaces, list) {
                                if (ether_addr_equal(local->hw.wiphy->addresses[i].addr,
                                                     sdata->vif.addr)) {
                                        used = true;
                                        break;
                                }
                        }

                        if (!used) {
                                memcpy(perm_addr,
                                       local->hw.wiphy->addresses[i].addr,
                                       ETH_ALEN);
                                break;
                        }
                }

                /*
                 * try mask if available
                 *
                 * Note that this is reached even when the loop above found
                 * a free address, so the mask search below may replace it.
                 */
                if (is_zero_ether_addr(local->hw.wiphy->addr_mask))
                        break;

                /* pack the 6 mask bytes into a u64, m[0] most significant */
                m = local->hw.wiphy->addr_mask;
                mask =        ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) |
                        ((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) |
                        ((u64)m[4] << 1*8) | ((u64)m[5] << 0*8);

                /*
                 * lowest set bit index + number of set bits equals the
                 * highest set bit position only if the set bits are adjacent
                 */
                if (__ffs64(mask) + hweight64(mask) != fls64(mask)) {
                        /* not a contiguous mask ... not handled now! */
                        pr_info("not contiguous\n");
                        break;
                }

                /*
                 * Pick address of existing interface in case user changed
                 * MAC address manually, default to perm_addr.
                 */
                m = local->hw.wiphy->perm_addr;
                list_for_each_entry(sdata, &local->interfaces, list) {
                        if (sdata->vif.type == NL80211_IFTYPE_MONITOR)
                                continue;
                        m = sdata->vif.addr;
                        break;
                }
                start = ((u64)m[0] << 5*8) | ((u64)m[1] << 4*8) |
                        ((u64)m[2] << 3*8) | ((u64)m[3] << 2*8) |
                        ((u64)m[4] << 1*8) | ((u64)m[5] << 0*8);

                /*
                 * Step through the masked bits in units of the mask's lowest
                 * bit; the bits outside the mask always come from start. The
                 * first candidate is start itself, and the loop ends once
                 * the candidate wraps around to start again.
                 */
                inc = 1ULL<<__ffs64(mask);
                val = (start & mask);
                addr = (start & ~mask) | (val & mask);
                do {
                        bool used = false;

                        /* unpack the candidate into byte form */
                        tmp_addr[5] = addr >> 0*8;
                        tmp_addr[4] = addr >> 1*8;
                        tmp_addr[3] = addr >> 2*8;
                        tmp_addr[2] = addr >> 3*8;
                        tmp_addr[1] = addr >> 4*8;
                        tmp_addr[0] = addr >> 5*8;

                        val += inc;

                        list_for_each_entry(sdata, &local->interfaces, list) {
                                if (ether_addr_equal(tmp_addr, sdata->vif.addr)) {
                                        used = true;
                                        break;
                                }
                        }

                        if (!used) {
                                memcpy(perm_addr, tmp_addr, ETH_ALEN);
                                break;
                        }
                        addr = (start & ~mask) | (val & mask);
                } while (addr != start);

                break;
        }
}

/*
 * Create a new virtual interface and add it to the interface list.
 *
 * @local: the device to create the interface on
 * @name: interface name (a name template for netdev-backed interfaces,
 *        resolved with dev_alloc_name())
 * @name_assign_type: passed through to alloc_netdev_mqs()
 * @new_wdev: if non-NULL, receives a pointer to the new wireless_dev
 * @type: type of the interface to create
 * @params: creation parameters; flags, macaddr and use_4addr are read
 *          for netdev-backed interfaces only
 *
 * P2P_DEVICE and NAN interfaces have no netdev: their sdata is allocated
 * with kzalloc() and sdata->dev stays NULL. All other types get a netdev
 * whose private area holds the sdata, the driver's vif data and, where
 * applicable, a txq_info.
 *
 * Must be called with the RTNL and the wiphy lock held.
 *
 * Returns 0 on success, -ENOMEM if an allocation fails, or the error
 * returned by dev_alloc_name() or cfg80211_register_netdevice().
 */
int ieee80211_if_add(struct ieee80211_local *local, const char *name,
                     unsigned char name_assign_type,
                     struct wireless_dev **new_wdev, enum nl80211_iftype type,
                     struct vif_params *params)
{
        struct net_device *ndev = NULL;
        struct ieee80211_sub_if_data *sdata = NULL;
        struct txq_info *txqi;
        int ret, i;

        ASSERT_RTNL();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (type == NL80211_IFTYPE_P2P_DEVICE || type == NL80211_IFTYPE_NAN) {
                /* netdev-less interface: standalone sdata + driver vif data */
                struct wireless_dev *wdev;

                sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size,
                                GFP_KERNEL);
                if (!sdata)
                        return -ENOMEM;
                wdev = &sdata->wdev;

                sdata->dev = NULL;
                strscpy(sdata->name, name, IFNAMSIZ);
                ieee80211_assign_perm_addr(local, wdev->address, type);
                memcpy(sdata->vif.addr, wdev->address, ETH_ALEN);
                ether_addr_copy(sdata->vif.bss_conf.addr, sdata->vif.addr);
        } else {
                /* pointer-aligned so the txq_info placed after it is aligned */
                int size = ALIGN(sizeof(*sdata) + local->hw.vif_data_size,
                                 sizeof(void *));
                int txq_size = 0;

                /*
                 * A TXQ is reserved for every type except AP_VLAN and
                 * monitor interfaces without MONITOR_FLAG_ACTIVE.
                 */
                if (type != NL80211_IFTYPE_AP_VLAN &&
                    (type != NL80211_IFTYPE_MONITOR ||
                     (params->flags & MONITOR_FLAG_ACTIVE)))
                        txq_size += sizeof(struct txq_info) +
                                    local->hw.txq_data_size;

                ndev = alloc_netdev_mqs(size + txq_size,
                                        name, name_assign_type,
                                        ieee80211_if_setup, 1, 1);
                if (!ndev)
                        return -ENOMEM;

                dev_net_set(ndev, wiphy_net(local->hw.wiphy));

                ndev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
                if (!ndev->tstats) {
                        free_netdev(ndev);
                        return -ENOMEM;
                }

                ndev->needed_headroom = local->tx_headroom +
                                        4*6 /* four MAC addresses */
                                        + 2 + 2 + 2 + 2 /* ctl, dur, seq, qos */
                                        + 6 /* mesh */
                                        + 8 /* rfc1042/bridge tunnel */
                                        - ETH_HLEN /* ethernet hard_header_len */
                                        + IEEE80211_ENCRYPT_HEADROOM;
                ndev->needed_tailroom = IEEE80211_ENCRYPT_TAILROOM;

                /* resolve the name template stored in ndev->name */
                ret = dev_alloc_name(ndev, ndev->name);
                if (ret < 0) {
                        ieee80211_if_free(ndev);
                        free_netdev(ndev);
                        return ret;
                }

                /* a valid address supplied by the caller takes precedence */
                ieee80211_assign_perm_addr(local, ndev->perm_addr, type);
                if (is_valid_ether_addr(params->macaddr))
                        eth_hw_addr_set(ndev, params->macaddr);
                else
                        eth_hw_addr_set(ndev, ndev->perm_addr);
                SET_NETDEV_DEV(ndev, wiphy_dev(local->hw.wiphy));

                /* don't use IEEE80211_DEV_TO_SUB_IF -- it checks too much */
                sdata = netdev_priv(ndev);
                ndev->ieee80211_ptr = &sdata->wdev;
                memcpy(sdata->vif.addr, ndev->dev_addr, ETH_ALEN);
                ether_addr_copy(sdata->vif.bss_conf.addr, sdata->vif.addr);
                memcpy(sdata->name, ndev->name, IFNAMSIZ);

                /* the txq_info sits right behind sdata + driver vif data */
                if (txq_size) {
                        txqi = netdev_priv(ndev) + size;
                        ieee80211_txq_init(sdata, NULL, txqi, 0);
                }

                sdata->dev = ndev;
        }

        /* initialise type-independent data */
        sdata->wdev.wiphy = local->hw.wiphy;

        ieee80211_sdata_init(local, sdata);

        ieee80211_init_frag_cache(&sdata->frags);

        INIT_LIST_HEAD(&sdata->key_list);

        wiphy_delayed_work_init(&sdata->dec_tailroom_needed_wk,
                                ieee80211_delayed_tailroom_dec);

        /* default rate masks per band: everything the band supports */
        for (i = 0; i < NUM_NL80211_BANDS; i++) {
                struct ieee80211_supported_band *sband;
                sband = local->hw.wiphy->bands[i];
                /* one bit per bitrate of the band, none if band is absent */
                sdata->rc_rateidx_mask[i] =
                        sband ? (1 << sband->n_bitrates) - 1 : 0;
                if (sband) {
                        __le16 cap;
                        u16 *vht_rate_mask;

                        memcpy(sdata->rc_rateidx_mcs_mask[i],
                               sband->ht_cap.mcs.rx_mask,
                               sizeof(sdata->rc_rateidx_mcs_mask[i]));

                        cap = sband->vht_cap.vht_mcs.rx_mcs_map;
                        vht_rate_mask = sdata->rc_rateidx_vht_mcs_mask[i];
                        ieee80211_get_vht_mask_from_cap(cap, vht_rate_mask);
                } else {
                        memset(sdata->rc_rateidx_mcs_mask[i], 0,
                               sizeof(sdata->rc_rateidx_mcs_mask[i]));
                        memset(sdata->rc_rateidx_vht_mcs_mask[i], 0,
                               sizeof(sdata->rc_rateidx_vht_mcs_mask[i]));
                }
        }

        ieee80211_set_default_queues(sdata);

        sdata->deflink.ap_power_level = IEEE80211_UNSET_POWER_LEVEL;
        sdata->deflink.user_power_level = local->user_power_level;

        /* setup type-dependent data */
        ieee80211_setup_sdata(sdata, type);

        /* the rest of the setup only applies to netdev-backed interfaces */
        if (ndev) {
                ndev->ieee80211_ptr->use_4addr = params->use_4addr;
                if (type == NL80211_IFTYPE_STATION)
                        sdata->u.mgd.use_4addr = params->use_4addr;

                ndev->features |= local->hw.netdev_features;
                ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
                ndev->hw_features |= ndev->features &
                                        MAC80211_SUPPORTED_FEATURES_TX;
                sdata->vif.netdev_features = local->hw.netdev_features;

                netdev_set_default_ethtool_ops(ndev, &ieee80211_ethtool_ops);

                /* MTU range is normally 256 - 2304, where the upper limit is
                 * the maximum MSDU size. Monitor interfaces send and receive
                 * MPDU and A-MSDU frames which may be much larger so we do
                 * not impose an upper limit in that case.
                 */
                ndev->min_mtu = 256;
                if (type == NL80211_IFTYPE_MONITOR)
                        ndev->max_mtu = 0;
                else
                        ndev->max_mtu = local->hw.max_mtu;

                ret = cfg80211_register_netdevice(ndev);
                if (ret) {
                        free_netdev(ndev);
                        return ret;
                }
        }

        /* publish the interface; the list is modified under iflist_mtx */
        mutex_lock(&local->iflist_mtx);
        list_add_tail_rcu(&sdata->list, &local->interfaces);
        mutex_unlock(&local->iflist_mtx);

        if (new_wdev)
                *new_wdev = &sdata->wdev;

        return 0;
}

void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata)
{
        ASSERT_RTNL();
        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        mutex_lock(&sdata->local->iflist_mtx);
        list_del_rcu(&sdata->list);
        mutex_unlock(&sdata->local->iflist_mtx);

        if (sdata->vif.txq)
                ieee80211_txq_purge(sdata->local, to_txq_info(sdata->vif.txq));

        synchronize_rcu();

        cfg80211_unregister_wdev(&sdata->wdev);

        if (!sdata->dev) {
                ieee80211_teardown_sdata(sdata);
                kfree(sdata);
        }
}

/*
 * Stop an interface via ieee80211_do_stop(sdata, true).
 *
 * Warns (once) and does nothing if SDATA_STATE_RUNNING is not set.
 */
void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata)
{
        if (!WARN_ON_ONCE(!test_bit(SDATA_STATE_RUNNING, &sdata->state)))
                ieee80211_do_stop(sdata, true);
}

/*
 * Remove every interface of the device.
 *
 * All interfaces are shut down first; the interface list is then moved
 * to a private list under iflist_mtx and each entry is unregistered.
 * Entries without a netdev are freed here.
 *
 * Must be called with the RTNL held; takes the wiphy lock itself.
 */
void ieee80211_remove_interfaces(struct ieee80211_local *local)
{
        struct wiphy *wiphy = local->hw.wiphy;
        struct ieee80211_sub_if_data *sdata, *next;
        LIST_HEAD(unreg_list);

        ASSERT_RTNL();

        /*
         * Stop all interfaces before destroying any of them, so that the
         * hardware is stopped too. The driver might otherwise still walk
         * the interface list during shutdown (from a worker, from RX
         * processing, ...) using atomic iteration, and that iteration
         * would crash if the list were modified underneath it.
         *
         * Once this returns, the hardware should be stopped and the driver
         * idle, which makes the RCU-unaware list manipulation below OK.
         */
        cfg80211_shutdown_all_interfaces(wiphy);

        wiphy_lock(wiphy);

        WARN(local->open_count, "%s: open count remains %d\n",
             wiphy_name(wiphy), local->open_count);

        /* detach the whole list in one go */
        mutex_lock(&local->iflist_mtx);
        list_splice_init(&local->interfaces, &unreg_list);
        mutex_unlock(&local->iflist_mtx);

        list_for_each_entry_safe(sdata, next, &unreg_list, list) {
                /* sampled before the wdev is unregistered */
                const bool has_netdev = sdata->dev != NULL;

                /*
                 * Drop the IP addresses by hand: the notifier skips its
                 * callbacks once wdev->registered is false, because it
                 * cannot take the wiphy_lock() again while we are already
                 * inside this locked section.
                 */
                sdata->vif.cfg.arp_addr_cnt = 0;
                if (sdata->vif.type == NL80211_IFTYPE_STATION &&
                    sdata->u.mgd.associated)
                        ieee80211_vif_cfg_change_notify(sdata,
                                                        BSS_CHANGED_ARP_FILTER);

                list_del(&sdata->list);
                cfg80211_unregister_wdev(&sdata->wdev);

                if (!has_netdev)
                        kfree(sdata);
        }

        wiphy_unlock(wiphy);
}

/*
 * Netdevice notifier callback.
 *
 * Reacts to NETDEV_CHANGENAME only, and only for netdevs whose wiphy
 * carries mac80211's privid: the new name is copied into sdata->name
 * and the debugfs entry is renamed. Returns NOTIFY_OK in that case and
 * NOTIFY_DONE for everything else.
 */
static int netdev_notify(struct notifier_block *nb,
                         unsigned long state, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct ieee80211_sub_if_data *sdata;

        /* evaluated left to right, so each pointer is checked before use */
        if (state != NETDEV_CHANGENAME ||
            !dev->ieee80211_ptr ||
            !dev->ieee80211_ptr->wiphy ||
            dev->ieee80211_ptr->wiphy->privid != mac80211_wiphy_privid)
                return NOTIFY_DONE;

        sdata = IEEE80211_DEV_TO_SUB_IF(dev);
        memcpy(sdata->name, dev->name, IFNAMSIZ);
        ieee80211_debugfs_rename_netdev(sdata);

        return NOTIFY_OK;
}

/* Netdevice notifier block; events are delivered to netdev_notify(). */
static struct notifier_block mac80211_netdev_notifier = {
        .notifier_call = netdev_notify,
};

/*
 * Register mac80211's netdevice notifier.
 *
 * Returns the result of register_netdevice_notifier().
 */
int ieee80211_iface_init(void)
{
        return register_netdevice_notifier(&mac80211_netdev_notifier);
}

/* Unregister the netdevice notifier registered by ieee80211_iface_init(). */
void ieee80211_iface_exit(void)
{
        unregister_netdevice_notifier(&mac80211_netdev_notifier);
}

/*
 * Atomically increment the num_mcast_sta counter of an AP or AP_VLAN
 * interface; interfaces of any other type are left alone.
 */
void ieee80211_vif_inc_num_mcast(struct ieee80211_sub_if_data *sdata)
{
        switch (sdata->vif.type) {
        case NL80211_IFTYPE_AP:
                atomic_inc(&sdata->u.ap.num_mcast_sta);
                break;
        case NL80211_IFTYPE_AP_VLAN:
                atomic_inc(&sdata->u.vlan.num_mcast_sta);
                break;
        default:
                break;
        }
}

/*
 * Atomically decrement the num_mcast_sta counter of an AP or AP_VLAN
 * interface; interfaces of any other type are left alone.
 */
void ieee80211_vif_dec_num_mcast(struct ieee80211_sub_if_data *sdata)
{
        switch (sdata->vif.type) {
        case NL80211_IFTYPE_AP:
                atomic_dec(&sdata->u.ap.num_mcast_sta);
                break;
        case NL80211_IFTYPE_AP_VLAN:
                atomic_dec(&sdata->u.vlan.num_mcast_sta);
                break;
        default:
                break;
        }
}










































































































































































































































































    5 





    5 





































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NODEMASK_H
#define __LINUX_NODEMASK_H

/*
 * Nodemasks provide a bitmap suitable for representing the
 * set of Nodes in a system, one bit position per Node number.
 *
 * See detailed comments in the file linux/bitmap.h describing the
 * data type on which these nodemasks are based.
 *
 * For details of nodemask_parse_user(), see bitmap_parse_user() in
 * lib/bitmap.c.  For details of nodelist_parse(), see bitmap_parselist(),
 * also in bitmap.c.  For details of node_remap(), see bitmap_bitremap in
 * lib/bitmap.c.  For details of nodes_remap(), see bitmap_remap in
 * lib/bitmap.c.  For details of nodes_onto(), see bitmap_onto in
 * lib/bitmap.c.  For details of nodes_fold(), see bitmap_fold in
 * lib/bitmap.c.
 *
 * The available nodemask operations are:
 *
 * void node_set(node, mask)                turn on bit 'node' in mask
 * void node_clear(node, mask)                turn off bit 'node' in mask
 * void nodes_setall(mask)                set all bits
 * void nodes_clear(mask)                clear all bits
 * int node_isset(node, mask)                true iff bit 'node' set in mask
 * int node_test_and_set(node, mask)        test and set bit 'node' in mask
 *
 * void nodes_and(dst, src1, src2)        dst = src1 & src2  [intersection]
 * void nodes_or(dst, src1, src2)        dst = src1 | src2  [union]
 * void nodes_xor(dst, src1, src2)        dst = src1 ^ src2
 * void nodes_andnot(dst, src1, src2)        dst = src1 & ~src2
 * void nodes_complement(dst, src)        dst = ~src
 *
 * int nodes_equal(mask1, mask2)        Does mask1 == mask2?
 * int nodes_intersects(mask1, mask2)        Do mask1 and mask2 intersect?
 * int nodes_subset(mask1, mask2)        Is mask1 a subset of mask2?
 * int nodes_empty(mask)                Is mask empty (no bits set)?
 * int nodes_full(mask)                        Is mask full (all bits set)?
 * int nodes_weight(mask)                Hamming weight - number of set bits
 *
 * void nodes_shift_right(dst, src, n)        Shift right
 * void nodes_shift_left(dst, src, n)        Shift left
 *
 * unsigned int first_node(mask)        Number lowest set bit, or MAX_NUMNODES
 * unsigned int next_node(node, mask)        Next node past 'node', or MAX_NUMNODES
 * unsigned int next_node_in(node, mask) Next node past 'node', or wrap to first,
 *                                        or MAX_NUMNODES
 * unsigned int first_unset_node(mask)        First node not set in mask, or
 *                                        MAX_NUMNODES
 *
 * nodemask_t nodemask_of_node(node)        Return nodemask with bit 'node' set
 * NODE_MASK_ALL                        Initializer - all bits set
 * NODE_MASK_NONE                        Initializer - no bits set
 * unsigned long *nodes_addr(mask)        Array of unsigned long's in mask
 *
 * int nodemask_parse_user(ubuf, ulen, mask)        Parse ascii string as nodemask
 * int nodelist_parse(buf, map)                Parse ascii string as nodelist
 * int node_remap(oldbit, old, new)        newbit = map(old, new)(oldbit)
 * void nodes_remap(dst, src, old, new)        *dst = map(old, new)(src)
 * void nodes_onto(dst, orig, relmap)        *dst = orig relative to relmap
 * void nodes_fold(dst, orig, sz)        dst bits = orig bits mod sz
 *
 * for_each_node_mask(node, mask)        for-loop node over mask
 *
 * int num_online_nodes()                Number of online Nodes
 * int num_possible_nodes()                Number of all possible Nodes
 *
 * int node_random(mask)                Random node with set bit in mask
 *
 * int node_online(node)                Is this node online?
 * int node_possible(node)                Is this node possible?
 *
 * node_set_online(node)                set bit 'node' in node_online_map
 * node_set_offline(node)                clear bit 'node' in node_online_map
 *
 * for_each_node(node)                        for-loop node over node_possible_map
 * for_each_online_node(node)                for-loop node over node_online_map
 *
 * Subtlety:
 * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway)
 *    to generate slightly worse code.  So use a simple one-line #define
 *    for node_isset(), instead of wrapping an inline inside a macro, the
 *    way we do the other calls.
 *
 * NODEMASK_SCRATCH
 * When doing above logical AND, OR, XOR, Remap operations the callers tend to
 * need temporary nodemask_t's on the stack. But if NODES_SHIFT is large,
 * nodemask_t's consume too much stack space.  NODEMASK_SCRATCH is a helper
 * for such situations. See below and CPUMASK_ALLOC also.
 */

#include <linux/threads.h>
#include <linux/bitmap.h>
#include <linux/minmax.h>
#include <linux/nodemask_types.h>
#include <linux/numa.h>
#include <linux/random.h>

/*
 * Not referenced as an object anywhere in this header; nodemask_of_node()
 * uses it only as a typeof() operand to name the nodemask_t type.
 */
extern nodemask_t _unused_nodemask_arg_;

/**
 * nodemask_pr_args - printf args to output a nodemask
 * @maskp: nodemask to be printed
 *
 * Expands to two comma-separated arguments (a bit count followed by a
 * pointer to the bitmap words) for use with '%*pb[l]'.  Note that @maskp
 * appears twice in the expansion.
 */
#define nodemask_pr_args(maskp)        __nodemask_pr_numnodes(maskp), \
                                __nodemask_pr_bits(maskp)

/* Bit count argument: MAX_NUMNODES, or 0 when @m is NULL. */
static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m)
{
        if (!m)
                return 0;

        return MAX_NUMNODES;
}

/* Bitmap argument: the words of @m, or NULL when @m is NULL. */
static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m)
{
        if (!m)
                return NULL;

        return m->bits;
}

/*
 * The inline keyword gives the compiler room to decide to inline, or
 * not inline a function as it sees best.  However, as these functions
 * are called in both __init and non-__init functions, if they are not
 * inlined we will end up with a section mismatch error (of the type of
 * freeable items not being freed).  So we must use __always_inline here
 * to fix the problem.  If other functions in the future also end up in
 * this situation they will also need to be annotated as __always_inline
 */
/* node_set(node, dst): turn on bit 'node' in dst. */
#define node_set(node, dst) __node_set((node), &(dst))
static __always_inline void __node_set(int node, volatile nodemask_t *dstp)
{
        volatile unsigned long *words = dstp->bits;

        set_bit(node, words);
}

#define node_clear(node, dst) __node_clear((node), &(dst))
static inline void __node_clear(int node, volatile nodemask_t *dstp)
{
        clear_bit(node, dstp->bits);
}

/* nodes_setall(dst): set all MAX_NUMNODES bits of dst. */
#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits)
{
        unsigned long *words = dstp->bits;

        bitmap_fill(words, nbits);
}

/* nodes_clear(dst): clear all MAX_NUMNODES bits of dst. */
#define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES)
static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits)
{
        unsigned long *words = dstp->bits;

        bitmap_zero(words, nbits);
}

/*
 * node_isset(node, nodemask): true iff bit 'node' is set in nodemask.
 * No static inline type checking - see Subtlety (1) above.
 */
#define node_isset(node, nodemask) test_bit((node), (nodemask).bits)

/*
 * node_test_and_set(node, nodemask): set bit 'node' in nodemask and return
 * the result of test_and_set_bit() converted to bool.
 */
#define node_test_and_set(node, nodemask) \
                        __node_test_and_set((node), &(nodemask))
static inline bool __node_test_and_set(int node, nodemask_t *addr)
{
        unsigned long *words = addr->bits;

        return test_and_set_bit(node, words);
}

/* nodes_and(dst, src1, src2): dst = src1 & src2 [intersection]. */
#define nodes_and(dst, src1, src2) \
                        __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        const unsigned long *lhs = src1p->bits;
        const unsigned long *rhs = src2p->bits;

        bitmap_and(dstp->bits, lhs, rhs, nbits);
}

/* nodes_or(dst, src1, src2): dst = src1 | src2 [union]. */
#define nodes_or(dst, src1, src2) \
                        __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        const unsigned long *lhs = src1p->bits;
        const unsigned long *rhs = src2p->bits;

        bitmap_or(dstp->bits, lhs, rhs, nbits);
}

/* nodes_xor(dst, src1, src2): dst = src1 ^ src2. */
#define nodes_xor(dst, src1, src2) \
                        __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        const unsigned long *lhs = src1p->bits;
        const unsigned long *rhs = src2p->bits;

        bitmap_xor(dstp->bits, lhs, rhs, nbits);
}

/* nodes_andnot(dst, src1, src2): dst = src1 & ~src2. */
#define nodes_andnot(dst, src1, src2) \
                        __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        const unsigned long *keep = src1p->bits;
        const unsigned long *drop = src2p->bits;

        bitmap_andnot(dstp->bits, keep, drop, nbits);
}

/* nodes_complement(dst, src): dst = ~src. */
#define nodes_complement(dst, src) \
                        __nodes_complement(&(dst), &(src), MAX_NUMNODES)
static inline void __nodes_complement(nodemask_t *dstp,
                                        const nodemask_t *srcp, unsigned int nbits)
{
        const unsigned long *in = srcp->bits;

        bitmap_complement(dstp->bits, in, nbits);
}

/* nodes_equal(src1, src2): does src1 == src2? */
#define nodes_equal(src1, src2) \
                        __nodes_equal(&(src1), &(src2), MAX_NUMNODES)
static inline bool __nodes_equal(const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        const unsigned long *lhs = src1p->bits;
        const unsigned long *rhs = src2p->bits;

        return bitmap_equal(lhs, rhs, nbits);
}

/* nodes_intersects(src1, src2): do src1 and src2 intersect? */
#define nodes_intersects(src1, src2) \
                        __nodes_intersects(&(src1), &(src2), MAX_NUMNODES)
static inline bool __nodes_intersects(const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        const unsigned long *lhs = src1p->bits;
        const unsigned long *rhs = src2p->bits;

        return bitmap_intersects(lhs, rhs, nbits);
}

/* nodes_subset(src1, src2): is src1 a subset of src2? */
#define nodes_subset(src1, src2) \
                        __nodes_subset(&(src1), &(src2), MAX_NUMNODES)
static inline bool __nodes_subset(const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        const unsigned long *inner = src1p->bits;
        const unsigned long *outer = src2p->bits;

        return bitmap_subset(inner, outer, nbits);
}

/* nodes_empty(src): is src empty (no bits set)? */
#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
static inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits)
{
        const unsigned long *words = srcp->bits;

        return bitmap_empty(words, nbits);
}

/* nodes_full(nodemask): is nodemask full (all bits set)? */
#define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES)
static inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits)
{
        const unsigned long *words = srcp->bits;

        return bitmap_full(words, nbits);
}

/* nodes_weight(nodemask): Hamming weight - number of set bits. */
#define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES)
static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits)
{
        const unsigned long *words = srcp->bits;

        return bitmap_weight(words, nbits);
}

/* nodes_shift_right(dst, src, n): dst = src shifted right by n bits. */
#define nodes_shift_right(dst, src, n) \
                        __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES)
static inline void __nodes_shift_right(nodemask_t *dstp,
                                        const nodemask_t *srcp, int n, int nbits)
{
        const unsigned long *in = srcp->bits;

        bitmap_shift_right(dstp->bits, in, n, nbits);
}

/* nodes_shift_left(dst, src, n): dst = src shifted left by n bits. */
#define nodes_shift_left(dst, src, n) \
                        __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES)
static inline void __nodes_shift_left(nodemask_t *dstp,
                                        const nodemask_t *srcp, int n, int nbits)
{
        const unsigned long *in = srcp->bits;

        bitmap_shift_left(dstp->bits, in, n, nbits);
}

/* FIXME: better would be to fix all architectures to never return
          > MAX_NUMNODES, then the silly min_ts could be dropped. */

#define first_node(src) __first_node(&(src))
static inline unsigned int __first_node(const nodemask_t *srcp)
{
        return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
}

#define next_node(n, src) __next_node((n), &(src))
static inline unsigned int __next_node(int n, const nodemask_t *srcp)
{
        return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}

/*
 * next_node_in(n, src): like next_node(), but when the search after 'n'
 * comes back as MAX_NUMNODES it restarts from the first node in src.  The
 * result is therefore MAX_NUMNODES only when __first_node() also reports
 * MAX_NUMNODES.
 */
#define next_node_in(n, src) __next_node_in((n), &(src))
static inline unsigned int __next_node_in(int node, const nodemask_t *srcp)
{
        unsigned int nid = __next_node(node, srcp);

        return nid == MAX_NUMNODES ? __first_node(srcp) : nid;
}

/* Make *mask contain exactly one set bit: 'node'. */
static inline void init_nodemask_of_node(nodemask_t *mask, int node)
{
        /* Same calls nodes_clear(*mask) / node_set(node, *mask) expand to. */
        __nodes_clear(mask, MAX_NUMNODES);
        __node_set(node, mask);
}

/*
 * nodemask_of_node(node): evaluates to a nodemask_t with only bit 'node' set.
 *
 * When the mask is exactly one unsigned long wide the single word is written
 * directly; otherwise init_nodemask_of_node() builds it.  Only one of the two
 * branches runs, so 'node' is evaluated once.
 *
 * The temporary uses a long, prefixed name so that a caller's 'node'
 * expression mentioning a variable called 'm' is not captured by the
 * statement expression's local.
 */
#define nodemask_of_node(node)                                          \
({                                                                      \
        typeof(_unused_nodemask_arg_) __nm_of_node;                     \
        if (sizeof(__nm_of_node) == sizeof(unsigned long)) {            \
                __nm_of_node.bits[0] = 1UL << (node);                   \
        } else {                                                        \
                init_nodemask_of_node(&__nm_of_node, (node));           \
        }                                                               \
        __nm_of_node;                                                   \
})

#define first_unset_node(mask) __first_unset_node(&(mask))
static inline unsigned int __first_unset_node(const nodemask_t *maskp)
{
        return min_t(unsigned int, MAX_NUMNODES,
                        find_first_zero_bit(maskp->bits, MAX_NUMNODES));
}

/* Value stored in the final bitmap word by the NODE_MASK_ALL initializer. */
#define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES)

/*
 * NODE_MASK_ALL: compound-literal initializer with all bits set.
 * The single-word variant has no "[0 ... N-2]" range, since that range would
 * be empty when the mask fits in one unsigned long.
 */
#if MAX_NUMNODES <= BITS_PER_LONG

#define NODE_MASK_ALL                                                        \
((nodemask_t) { {                                                        \
        [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD                \
} })

#else

#define NODE_MASK_ALL                                                        \
((nodemask_t) { {                                                        \
        [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL,                        \
        [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD                \
} })

#endif

/* NODE_MASK_NONE: compound-literal initializer with every word zero. */
#define NODE_MASK_NONE                                                        \
((nodemask_t) { {                                                        \
        [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] =  0UL                        \
} })

/* nodes_addr(src): the array of unsigned longs backing the mask. */
#define nodes_addr(src) ((src).bits)

/*
 * nodemask_parse_user(ubuf, ulen, dst): parse a user-space ascii string as a
 * nodemask; returns whatever bitmap_parse_user() returns.
 */
#define nodemask_parse_user(ubuf, ulen, dst) \
                __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES)
static inline int __nodemask_parse_user(const char __user *buf, int len,
                                        nodemask_t *dstp, int nbits)
{
        unsigned long *out = dstp->bits;

        return bitmap_parse_user(buf, len, out, nbits);
}

/*
 * nodelist_parse(buf, dst): parse an ascii string as a nodelist; returns
 * whatever bitmap_parselist() returns.
 */
#define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES)
static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits)
{
        unsigned long *out = dstp->bits;

        return bitmap_parselist(buf, out, nbits);
}

/* node_remap(oldbit, old, new): newbit = map(old, new)(oldbit). */
#define node_remap(oldbit, old, new) \
                __node_remap((oldbit), &(old), &(new), MAX_NUMNODES)
static inline int __node_remap(int oldbit,
                const nodemask_t *oldp, const nodemask_t *newp, int nbits)
{
        const unsigned long *from = oldp->bits;
        const unsigned long *to = newp->bits;

        return bitmap_bitremap(oldbit, from, to, nbits);
}

/* nodes_remap(dst, src, old, new): dst = map(old, new)(src). */
#define nodes_remap(dst, src, old, new) \
                __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES)
static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
                const nodemask_t *oldp, const nodemask_t *newp, int nbits)
{
        const unsigned long *from = oldp->bits;
        const unsigned long *to = newp->bits;

        bitmap_remap(dstp->bits, srcp->bits, from, to, nbits);
}

/* nodes_onto(dst, orig, relmap): dst = orig relative to relmap. */
#define nodes_onto(dst, orig, relmap) \
                __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES)
static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp,
                const nodemask_t *relmapp, int nbits)
{
        const unsigned long *in = origp->bits;
        const unsigned long *rel = relmapp->bits;

        bitmap_onto(dstp->bits, in, rel, nbits);
}

/*
 * nodes_fold(dst, orig, sz): dst bits = orig bits mod sz.
 *
 * 'sz' is parenthesised when forwarded, as is done for the scalar arguments
 * of the other wrapper macros in this header.
 */
#define nodes_fold(dst, orig, sz) \
                __nodes_fold(&(dst), &(orig), (sz), MAX_NUMNODES)
static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp,
                int sz, int nbits)
{
        bitmap_fold(dstp->bits, origp->bits, sz, nbits);
}

/*
 * for_each_node_mask(node, mask): for-loop 'node' over the bits set in mask.
 * The loop ends when first_node()/next_node() report MAX_NUMNODES.
 */
#if MAX_NUMNODES > 1
#define for_each_node_mask(node, mask)                                    \
        for ((node) = first_node(mask);                                    \
             (node) < MAX_NUMNODES;                                    \
             (node) = next_node((node), (mask)))
#else /* MAX_NUMNODES == 1 */
/* Single-node build: the body runs at most once (node 0), and only if mask is non-empty. */
#define for_each_node_mask(node, mask)                                  \
        for ((node) = 0; (node) < 1 && !nodes_empty(mask); (node)++)
#endif /* MAX_NUMNODES */

/*
 * Bitmasks that are kept for all the nodes.
 * Each enumerator indexes one nodemask in the node_states[] array.
 */
enum node_states {
        N_POSSIBLE,                /* The node could become online at some point */
        N_ONLINE,                /* The node is online */
        N_NORMAL_MEMORY,        /* The node has regular memory */
#ifdef CONFIG_HIGHMEM
        N_HIGH_MEMORY,                /* The node has regular or high memory */
#else
        /* Without CONFIG_HIGHMEM this is an alias, not a separate state. */
        N_HIGH_MEMORY = N_NORMAL_MEMORY,
#endif
        N_MEMORY,                /* The node has memory(regular, high, movable) */
        N_CPU,                /* The node has one or more cpus */
        N_GENERIC_INITIATOR,        /* The node has one or more Generic Initiators */
        NR_NODE_STATES                /* Number of states; sizes node_states[] */
};

/*
 * The following particular system nodemasks and operations
 * on them manage all possible and online nodes.
 */

/* One nodemask per enum node_states value; defined outside this header. */
extern nodemask_t node_states[NR_NODE_STATES];

#if MAX_NUMNODES > 1
/* Non-zero iff bit 'node' is set in the mask kept for 'state'. */
static inline int node_state(int node, enum node_states state)
{
        /* Same expression node_isset(node, node_states[state]) expands to. */
        return test_bit(node, node_states[state].bits);
}

static inline void node_set_state(int node, enum node_states state)
{
        __node_set(node, &node_states[state]);
}

static inline void node_clear_state(int node, enum node_states state)
{
        __node_clear(node, &node_states[state]);
}

static inline int num_node_state(enum node_states state)
{
        return nodes_weight(node_states[state]);
}

/* Iterate __node over every bit set in node_states[__state]. */
#define for_each_node_state(__node, __state) \
        for_each_node_mask((__node), node_states[__state])

/* Lowest-numbered node set in the N_ONLINE / N_MEMORY masks. */
#define first_online_node        first_node(node_states[N_ONLINE])
#define first_memory_node        first_node(node_states[N_MEMORY])

/* Next node after 'nid' set in the N_ONLINE mask, as reported by __next_node(). */
static inline unsigned int next_online_node(int nid)
{
        return __next_node(nid, &node_states[N_ONLINE]);
}

/* Next node after 'nid' set in the N_MEMORY mask, as reported by __next_node(). */
static inline unsigned int next_memory_node(int nid)
{
        return __next_node(nid, &node_states[N_MEMORY]);
}

/*
 * Defined outside this header.  nr_online_nodes is rewritten by
 * node_set_online() and node_set_offline() below.
 */
extern unsigned int nr_node_ids;
extern unsigned int nr_online_nodes;

static inline void node_set_online(int nid)
{
        node_set_state(nid, N_ONLINE);
        nr_online_nodes = num_node_state(N_ONLINE);
}

static inline void node_set_offline(int nid)
{
        node_clear_state(nid, N_ONLINE);
        nr_online_nodes = num_node_state(N_ONLINE);
}

#else

/* Single-node build: only node 0 is reported, whatever 'state' is. */
static inline int node_state(int node, enum node_states state)
{
        return !node;
}

/* Single-node build: no-op; both arguments are ignored. */
static inline void node_set_state(int node, enum node_states state)
{
}

/* Single-node build: no-op; both arguments are ignored. */
static inline void node_clear_state(int node, enum node_states state)
{
}

/* Single-node build: always reports one node, whatever 'state' is. */
static inline int num_node_state(enum node_states state)
{
        return 1;
}

/* Single-node build: the body runs exactly once with node == 0; __state is unused. */
#define for_each_node_state(node, __state) \
        for ( (node) = 0; (node) == 0; (node) = 1)

/* Single-node build: constant stand-ins for the NUMA helpers and counters. */
#define first_online_node        0
#define first_memory_node        0
#define next_online_node(nid)        (MAX_NUMNODES)
#define next_memory_node(nid)        (MAX_NUMNODES)
#define nr_node_ids                1U
#define nr_online_nodes                1U

/* These forward to the node_set_state()/node_clear_state() stubs above. */
#define node_set_online(node)           node_set_state((node), N_ONLINE)
#define node_set_offline(node)           node_clear_state((node), N_ONLINE)

#endif

/*
 * Pick a random node among the bits set in *maskp.
 *
 * With CONFIG_NUMA and MAX_NUMNODES > 1: returns NUMA_NO_NODE for an empty
 * mask, the only set node when exactly one bit is set, and otherwise the
 * node found by find_nth_bit() for an index drawn with
 * get_random_u32_below(weight).  In every other configuration it returns 0.
 */
static inline int node_random(const nodemask_t *maskp)
{
#if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1)
        int w = nodes_weight(*maskp);

        if (w == 0)
                return NUMA_NO_NODE;

        /* A single candidate needs no random draw. */
        if (w == 1)
                return first_node(*maskp);

        return find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_u32_below(w));
#else
        return 0;
#endif
}

/* Names for the N_ONLINE and N_POSSIBLE entries of node_states[]. */
#define node_online_map         node_states[N_ONLINE]
#define node_possible_map         node_states[N_POSSIBLE]

/* Counts and per-node tests, expressed through num_node_state()/node_state(). */
#define num_online_nodes()        num_node_state(N_ONLINE)
#define num_possible_nodes()        num_node_state(N_POSSIBLE)
#define node_online(node)        node_state((node), N_ONLINE)
#define node_possible(node)        node_state((node), N_POSSIBLE)

/* for-loop 'node' over the N_POSSIBLE / N_ONLINE states. */
#define for_each_node(node)           for_each_node_state(node, N_POSSIBLE)
#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)

/*
 * For nodemask scratch area.
 * NODEMASK_ALLOC(type, name, gfp_flags) declares a pointer 'name' to an
 * object of the given type.  When NODES_SHIFT > 8 the object comes from
 * kmalloc(sizeof(*name), gfp_flags) and NODEMASK_FREE() calls kfree();
 * otherwise the object is a local variable '_name', gfp_flags is unused and
 * NODEMASK_FREE() does nothing.
 * NOTE(review): in the kmalloc case callers presumably need to check 'name'
 * for NULL - confirm against kmalloc()'s contract.
 */
#if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */
#define NODEMASK_ALLOC(type, name, gfp_flags)        \
                        type *name = kmalloc(sizeof(*name), gfp_flags)
#define NODEMASK_FREE(m)                        kfree(m)
#else
#define NODEMASK_ALLOC(type, name, gfp_flags)        type _##name, *name = &_##name
#define NODEMASK_FREE(m)                        do {} while (0)
#endif

/*
 * Example structure for using NODEMASK_ALLOC, used in mempolicy.
 * Bundles two scratch masks so a single NODEMASK_SCRATCH() provides both.
 */
struct nodemask_scratch {
        nodemask_t        mask1;
        nodemask_t        mask2;
};

/*
 * NODEMASK_SCRATCH(x): NODEMASK_ALLOC() of a struct nodemask_scratch named x,
 * passing GFP_KERNEL | __GFP_NORETRY as the gfp flags.
 * NODEMASK_SCRATCH_FREE(x): the matching NODEMASK_FREE().
 */
#define NODEMASK_SCRATCH(x)                                                \
                        NODEMASK_ALLOC(struct nodemask_scratch, x,        \
                                        GFP_KERNEL | __GFP_NORETRY)
#define NODEMASK_SCRATCH_FREE(x)        NODEMASK_FREE(x)


#endif /* __LINUX_NODEMASK_H */












































































































































































































































































































































































































































    3 









    1 





    1 


    1 

    1 










    2 















    2 





















    1 






















    1 




















































































































































































    1 















    1 






































































































































































































































































































































































    1 

    1 













































































































































































































































































































































































































































































































































































    1 





    1 
    1 

    1 



    1 









    1 


    1 


    1 

    1 





















































































    1 
    1 








    1 












    1 









































    1 

























    1 














    1 




    1 





    1 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Generic PPP layer for Linux.
 *
 * Copyright 1999-2002 Paul Mackerras.
 *
 * The generic PPP layer handles the PPP network interfaces, the
 * /dev/ppp device, packet and VJ compression, and multilink.
 * It talks to PPP `channels' via the interface defined in
 * include/linux/ppp_channel.h.  Channels provide the basic means for
 * sending and receiving PPP frames on some kind of communications
 * channel.
 *
 * Part of the code in this driver was inspired by the old async-only
 * PPP driver, written by Michael Callahan and Al Longyear, and
 * subsequently hacked by Paul Mackerras.
 *
 * ==FILEVERSION 20041108==
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/kmod.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/idr.h>
#include <linux/netdevice.h>
#include <linux/poll.h>
#include <linux/ppp_defs.h>
#include <linux/filter.h>
#include <linux/ppp-ioctl.h>
#include <linux/ppp_channel.h>
#include <linux/ppp-comp.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/spinlock.h>
#include <linux/rwsem.h>
#include <linux/stddef.h>
#include <linux/device.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <asm/unaligned.h>
#include <net/slhc_vj.h>
#include <linux/atomic.h>
#include <linux/refcount.h>

#include <linux/nsproxy.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

/* Version string for this driver (where it is reported is outside this section). */
#define PPP_VERSION        "2.4.2"

/*
 * Network protocols we support.
 *
 * These values are array indices: they size and index ppp->npmode[] and
 * the npindex_to_proto[] / npindex_to_ethertype[] tables, so they must
 * stay dense (0 .. NUM_NP-1) and in the same order as those tables.
 */
#define NP_IP        0                /* Internet Protocol V4 */
#define NP_IPV6        1                /* Internet Protocol V6 */
#define NP_IPX        2                /* IPX protocol */
#define NP_AT        3                /* Appletalk protocol */
#define NP_MPLS_UC 4                /* MPLS unicast */
#define NP_MPLS_MC 5                /* MPLS multicast */
#define NUM_NP        6                /* Number of NPs. */

#define MPHDRLEN        6        /* multilink protocol header length */
#define MPHDRLEN_SSN        4        /* ditto with short sequence numbers */

/* Length in bytes of the protocol number that starts every PPP frame. */
#define PPP_PROTO_LEN        2

/*
 * An instance of /dev/ppp can be associated with either a ppp
 * interface unit or a ppp channel.  In both cases, file->private_data
 * points to one of these.
 */
struct ppp_file {
        /*
         * Says which object embeds this ppp_file: a struct ppp (INTERFACE)
         * or a struct channel (CHANNEL).  Code checks it before converting
         * with PF_TO_PPP() / PF_TO_CHANNEL().
         */
        enum {
                INTERFACE=1, CHANNEL
        }                kind;
        struct sk_buff_head xq;                /* pppd transmit queue */
        struct sk_buff_head rq;                /* receive queue for pppd */
        wait_queue_head_t rwait;        /* for poll on reading /dev/ppp */
        refcount_t        refcnt;                /* # refs (incl /dev/ppp attached) */
        int                hdrlen;                /* space to leave for headers */
        int                index;                /* interface unit / channel number */
        int                dead;                /* unit/channel has been shut down */
};

/*
 * Recover the object that embeds a struct ppp_file as its `file' member.
 * PF_TO_PPP() is only meaningful when pf->kind == INTERFACE and
 * PF_TO_CHANNEL() only when pf->kind == CHANNEL.
 */
#define PF_TO_X(pf, X)                container_of(pf, X, file)

#define PF_TO_PPP(pf)                PF_TO_X(pf, struct ppp)
#define PF_TO_CHANNEL(pf)        PF_TO_X(pf, struct channel)

/*
 * Data structure to hold the primary network stats for which
 * we want to use 64-bit storage.  Other network stats
 * are stored in dev->stats of the ppp structure.
 */
struct ppp_link_stats {
        u64 rx_packets;
        u64 tx_packets;
        u64 rx_bytes;
        u64 tx_bytes;
};

/*
 * Data structure describing one ppp unit.
 * A ppp unit corresponds to a ppp network interface device
 * and represents a multilink bundle.
 * It can have 0 or more ppp channels connected to it.
 *
 * `file' must remain a member named `file': PF_TO_PPP() uses
 * container_of() on it to get back to the struct ppp.
 *
 * NOTE(review): the trailing hex numbers in several field comments
 * (0, 48, 4c, ...) look like historical byte offsets; they are not
 * maintained and should not be relied on -- confirm and consider dropping.
 */
struct ppp {
        struct ppp_file        file;                /* stuff for read/write/poll 0 */
        struct file        *owner;                /* file that owns this unit 48 */
        struct list_head channels;        /* list of attached channels 4c */
        int                n_channels;        /* how many channels are attached 54 */
        spinlock_t        rlock;                /* lock for receive side 58 */
        spinlock_t        wlock;                /* lock for transmit side 5c */
        int __percpu        *xmit_recursion; /* xmit recursion detect */
        int                mru;                /* max receive unit 60 */
        unsigned int        flags;                /* control bits 64 */
        unsigned int        xstate;                /* transmit state bits 68 */
        unsigned int        rstate;                /* receive state bits 6c */
        int                debug;                /* debug flags 70 */
        struct slcompress *vj;                /* state for VJ header compression */
        enum NPmode        npmode[NUM_NP];        /* what to do with each net proto 78 */
        struct sk_buff        *xmit_pending;        /* a packet ready to go out 88 */
        struct compressor *xcomp;        /* transmit packet compressor 8c */
        void                *xc_state;        /* its internal state 90 */
        struct compressor *rcomp;        /* receive decompressor 94 */
        void                *rc_state;        /* its internal state 98 */
        unsigned long        last_xmit;        /* jiffies when last pkt sent 9c */
        unsigned long        last_recv;        /* jiffies when last pkt rcvd a0 */
        struct net_device *dev;                /* network interface device a4 */
        int                closing;        /* is device closing down? a8 */
#ifdef CONFIG_PPP_MULTILINK
        int                nxchan;                /* next channel to send something on */
        u32                nxseq;                /* next sequence number to send */
        int                mrru;                /* MP: max reconst. receive unit */
        u32                nextseq;        /* MP: seq no of next packet */
        u32                minseq;                /* MP: min of most recent seqnos */
        struct sk_buff_head mrq;        /* MP: receive reconstruction queue */
#endif /* CONFIG_PPP_MULTILINK */
#ifdef CONFIG_PPP_FILTER
        struct bpf_prog *pass_filter;        /* filter for packets to pass */
        struct bpf_prog *active_filter; /* filter for pkts to reset idle */
#endif /* CONFIG_PPP_FILTER */
        struct net        *ppp_net;        /* the net we belong to */
        struct ppp_link_stats stats64;        /* 64 bit network stats */
};

/*
 * Bits in flags: SC_NO_TCP_CCID, SC_CCP_OPEN, SC_CCP_UP, SC_LOOP_TRAFFIC,
 * SC_MULTILINK, SC_MP_SHORTSEQ, SC_MP_XSHORTSEQ, SC_COMP_TCP, SC_REJ_COMP_TCP,
 * SC_MUST_COMP
 * Bits in rstate: SC_DECOMP_RUN, SC_DC_ERROR, SC_DC_FERROR.
 * Bits in xstate: SC_COMP_RUN
 *
 * SC_FLAG_BITS is the OR of exactly the bits listed above for `flags';
 * the rstate and xstate bits are deliberately not part of it.
 */
#define SC_FLAG_BITS        (SC_NO_TCP_CCID|SC_CCP_OPEN|SC_CCP_UP|SC_LOOP_TRAFFIC \
                         |SC_MULTILINK|SC_MP_SHORTSEQ|SC_MP_XSHORTSEQ \
                         |SC_COMP_TCP|SC_REJ_COMP_TCP|SC_MUST_COMP)

/*
 * Private data structure for each channel.
 * This includes the data structure used for multilink.
 *
 * `file' must remain a member named `file': PF_TO_CHANNEL() uses
 * container_of() on it to get back to the struct channel.
 */
struct channel {
        struct ppp_file        file;                /* stuff for read/write/poll */
        struct list_head list;                /* link in all/new_channels list */
        struct ppp_channel *chan;        /* public channel data structure */
        struct rw_semaphore chan_sem;        /* protects `chan' during chan ioctl */
        spinlock_t        downl;                /* protects `chan', file.xq dequeue */
        struct ppp        *ppp;                /* ppp unit we're connected to */
        struct net        *chan_net;        /* the net channel belongs to */
        /* NOTE(review): presumably tracks the reference held on chan_net -- confirm */
        netns_tracker        ns_tracker;
        struct list_head clist;                /* link in list of channels per unit */
        rwlock_t        upl;                /* protects `ppp' and 'bridge' */
        struct channel __rcu *bridge;        /* "bridged" ppp channel */
#ifdef CONFIG_PPP_MULTILINK
        u8                avail;                /* flag used in multilink stuff */
        u8                had_frag;        /* >= 1 fragments have been sent */
        u32                lastseq;        /* MP: last sequence # received */
        int                speed;                /* speed of the corresponding ppp channel*/
#endif /* CONFIG_PPP_MULTILINK */
};

/*
 * Parameters describing a ppp unit to be set up.
 * NOTE(review): the code that fills in and consumes this structure is
 * outside this section; the field notes below are inferred from names and
 * types only and must be confirmed against it.
 */
struct ppp_config {
        struct file *file;        /* presumably the /dev/ppp file that will own the unit */
        s32 unit;                /* presumably the requested unit number */
        bool ifname_is_set;        /* presumably: an interface name was supplied */
};

/*
 * SMP locking issues:
 * Both the ppp.rlock and ppp.wlock locks protect the ppp.channels
 * list and the ppp.n_channels field, you need to take both locks
 * before you modify them.
 * The lock ordering is: channel.upl -> ppp.wlock -> ppp.rlock ->
 * channel.downl.
 */

/* NOTE(review): what ppp_mutex serializes is not visible in this section. */
static DEFINE_MUTEX(ppp_mutex);
/*
 * Driver-wide counters, both starting at zero.
 * Presumably the numbers of existing units and channels -- confirm against
 * the code that increments/decrements them.
 */
static atomic_t ppp_unit_count = ATOMIC_INIT(0);
static atomic_t channel_count = ATOMIC_INIT(0);

/*
 * Per-net private data for this module.
 * ppp_net_id is the key passed to net_generic() to look up the
 * struct ppp_net of a given network namespace (see ppp_pernet()).
 */
static unsigned int ppp_net_id __read_mostly;
struct ppp_net {
        /* units to ppp mapping */
        struct idr units_idr;

        /*
         * all_ppp_mutex protects the units_idr mapping.
         * It also ensures that finding a ppp unit in the units_idr
         * map and updating its file.refcnt field is atomic.
         */
        struct mutex all_ppp_mutex;

        /* channels */
        struct list_head all_channels;
        struct list_head new_channels;
        int last_channel_index;

        /*
         * all_channels_lock protects all_channels and
         * last_channel_index.  It also ensures that finding
         * a channel and updating its file.refcnt field is atomic.
         */
        spinlock_t all_channels_lock;
};

/*
 * Get the PPP protocol number from a skb: the first two bytes of the
 * data, read as a big-endian 16-bit value with no alignment requirement.
 */
#define PPP_PROTO(skb)        get_unaligned_be16((skb)->data)

/* We limit the length of ppp->file.rq to this (arbitrary) value */
#define PPP_MAX_RQLEN        32

/*
 * Maximum number of multilink fragments queued up.
 * This has to be large enough to cope with the maximum latency of
 * the slowest channel relative to the others.  Strictly it should
 * depend on the number of channels and their characteristics.
 */
#define PPP_MP_MAX_QLEN        128

/* Multilink header bits. */
#define B        0x80                /* this fragment begins a packet */
#define E        0x40                /* this fragment ends a packet */

/*
 * Compare multilink sequence numbers (assumed to be 32 bits wide).
 * The comparison is done on the signed 32-bit difference, so it stays
 * correct when the sequence numbers wrap around.
 */
#define seq_before(a, b)        ((s32)((a) - (b)) < 0)
#define seq_after(a, b)                ((s32)((a) - (b)) > 0)

/*
 * Prototypes.
 * Forward declarations of file-local (static) helpers so that they can
 * be called before their definitions appear.
 */
static int ppp_unattached_ioctl(struct net *net, struct ppp_file *pf,
                        struct file *file, unsigned int cmd, unsigned long arg);
static void ppp_xmit_process(struct ppp *ppp, struct sk_buff *skb);
static void ppp_send_frame(struct ppp *ppp, struct sk_buff *skb);
static void ppp_push(struct ppp *ppp);
static void ppp_channel_push(struct channel *pch);
static void ppp_receive_frame(struct ppp *ppp, struct sk_buff *skb,
                              struct channel *pch);
static void ppp_receive_error(struct ppp *ppp);
static void ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb);
static struct sk_buff *ppp_decompress_frame(struct ppp *ppp,
                                            struct sk_buff *skb);
/* Multilink-only helpers. */
#ifdef CONFIG_PPP_MULTILINK
static void ppp_receive_mp_frame(struct ppp *ppp, struct sk_buff *skb,
                                struct channel *pch);
static void ppp_mp_insert(struct ppp *ppp, struct sk_buff *skb);
static struct sk_buff *ppp_mp_reconstruct(struct ppp *ppp);
static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb);
#endif /* CONFIG_PPP_MULTILINK */
static int ppp_set_compress(struct ppp *ppp, struct ppp_option_data *data);
static void ppp_ccp_peek(struct ppp *ppp, struct sk_buff *skb, int inbound);
static void ppp_ccp_closed(struct ppp *ppp);
static struct compressor *find_compressor(int type);
static void ppp_get_stats(struct ppp *ppp, struct ppp_stats *st);
static int ppp_create_interface(struct net *net, struct file *file, int *unit);
static void init_ppp_file(struct ppp_file *pf, int kind);
static void ppp_destroy_interface(struct ppp *ppp);
static struct ppp *ppp_find_unit(struct ppp_net *pn, int unit);
static struct channel *ppp_find_channel(struct ppp_net *pn, int unit);
static int ppp_connect_channel(struct channel *pch, int unit);
static int ppp_disconnect_channel(struct channel *pch);
static void ppp_destroy_channel(struct channel *pch);
static int unit_get(struct idr *p, void *ptr, int min);
static int unit_set(struct idr *p, void *ptr, int n);
static void unit_put(struct idr *p, int n);
static void *unit_find(struct idr *p, int n);
static void ppp_setup(struct net_device *dev);

/* Tentative declaration; the initialized definition is elsewhere in this file. */
static const struct net_device_ops ppp_netdev_ops;

/* Device class named "ppp". */
static const struct class ppp_class = {
        .name = "ppp",
};

/* Look up this module's per-namespace state (struct ppp_net) for @net. */
static inline struct ppp_net *ppp_pernet(struct net *net)
{
        struct ppp_net *pn = net_generic(net, ppp_net_id);

        return pn;
}

/*
 * Map a PPP protocol number onto its NP (network protocol) index.
 * Returns one of the NP_* indices, or -EINVAL when the protocol is not
 * one of the supported network protocols.
 */
static inline int proto_to_npindex(int proto)
{
        int npi;

        switch (proto) {
        case PPP_IP:
                npi = NP_IP;
                break;
        case PPP_IPV6:
                npi = NP_IPV6;
                break;
        case PPP_IPX:
                npi = NP_IPX;
                break;
        case PPP_AT:
                npi = NP_AT;
                break;
        case PPP_MPLS_UC:
                npi = NP_MPLS_UC;
                break;
        case PPP_MPLS_MC:
                npi = NP_MPLS_MC;
                break;
        default:
                npi = -EINVAL;
                break;
        }

        return npi;
}

/* Translates an NP index into a PPP protocol number */
static const int npindex_to_proto[NUM_NP] = {
        PPP_IP,
        PPP_IPV6,
        PPP_IPX,
        PPP_AT,
        PPP_MPLS_UC,
        PPP_MPLS_MC,
};

/*
 * Map an ethertype onto its NP index.
 * Both AppleTalk ethertypes (ETH_P_PPPTALK and ETH_P_ATALK) map to NP_AT.
 * Returns -1 when the ethertype has no corresponding NP index.
 */
static inline int ethertype_to_npindex(int ethertype)
{
        int npi;

        switch (ethertype) {
        case ETH_P_IP:
                npi = NP_IP;
                break;
        case ETH_P_IPV6:
                npi = NP_IPV6;
                break;
        case ETH_P_IPX:
                npi = NP_IPX;
                break;
        case ETH_P_PPPTALK:
        case ETH_P_ATALK:
                npi = NP_AT;
                break;
        case ETH_P_MPLS_UC:
                npi = NP_MPLS_UC;
                break;
        case ETH_P_MPLS_MC:
                npi = NP_MPLS_MC;
                break;
        default:
                npi = -1;
                break;
        }

        return npi;
}

/* Translates an NP index into an ethertype */
static const int npindex_to_ethertype[NUM_NP] = {
        ETH_P_IP,
        ETH_P_IPV6,
        ETH_P_IPX,
        ETH_P_PPPTALK,
        ETH_P_MPLS_UC,
        ETH_P_MPLS_MC,
};

/*
 * Locking shorthand.
 *
 * All variants use the _bh spinlock primitives.  ppp_lock() takes the
 * transmit lock (wlock) first and then the receive lock (rlock);
 * ppp_unlock() releases them in the opposite order.
 */
#define ppp_xmit_lock(ppp)        spin_lock_bh(&(ppp)->wlock)
#define ppp_xmit_unlock(ppp)        spin_unlock_bh(&(ppp)->wlock)
#define ppp_recv_lock(ppp)        spin_lock_bh(&(ppp)->rlock)
#define ppp_recv_unlock(ppp)        spin_unlock_bh(&(ppp)->rlock)
#define ppp_lock(ppp)                do { ppp_xmit_lock(ppp); \
                                     ppp_recv_lock(ppp); } while (0)
#define ppp_unlock(ppp)                do { ppp_recv_unlock(ppp); \
                                     ppp_xmit_unlock(ppp); } while (0)

/*
 * /dev/ppp device routines.
 * The /dev/ppp device is used by pppd to control the ppp unit.
 * It supports the read, write, ioctl and poll functions.
 * Open instances of /dev/ppp can be in one of three states:
 * unattached, attached to a ppp unit, or attached to a ppp channel.
 */
static int ppp_open(struct inode *inode, struct file *file)
{
        /*
         * This could (should?) be enforced by the permissions on /dev/ppp.
         */
        if (!ns_capable(file->f_cred->user_ns, CAP_NET_ADMIN))
                return -EPERM;
        return 0;
}

/*
 * Release handler for /dev/ppp.
 * Detaches the file from its unit or channel (if any).  When the file is
 * the owner of an interface unit, the network device is unregistered
 * under the RTNL lock.  The file's reference is then dropped, and the
 * unit or channel is destroyed if that was the last one.  Always returns 0.
 */
static int ppp_release(struct inode *unused, struct file *file)
{
        struct ppp_file *pf = file->private_data;

        /* Never attached to anything: nothing to tear down. */
        if (!pf)
                return 0;

        file->private_data = NULL;

        if (pf->kind == INTERFACE) {
                struct ppp *ppp = PF_TO_PPP(pf);

                rtnl_lock();
                if (ppp->owner == file)
                        unregister_netdevice(ppp->dev);
                rtnl_unlock();
        }

        /* Other references remain: leave the object alive. */
        if (!refcount_dec_and_test(&pf->refcnt))
                return 0;

        if (pf->kind == INTERFACE)
                ppp_destroy_interface(PF_TO_PPP(pf));
        else if (pf->kind == CHANNEL)
                ppp_destroy_channel(PF_TO_CHANNEL(pf));

        return 0;
}

/*
 * Read handler for /dev/ppp: hand one queued packet to the caller.
 *
 * Sleeps interruptibly until a packet is available on pf->rq, unless one
 * of the exit conditions below applies.  Returns:
 *   the packet length  on success (the whole packet is copied to buf);
 *   -ENXIO             if the file is not attached to a unit or channel;
 *   0 (EOF)            if the unit/channel is dead, or it is an interface
 *                      with no channels that is not looping traffic;
 *   -EAGAIN            if nothing is queued and the file is O_NONBLOCK;
 *   -ERESTARTSYS       if a signal is pending;
 *   -EOVERFLOW         if the packet is larger than count;
 *   -EFAULT            if copying to the user buffer fails.
 * In the -EOVERFLOW and -EFAULT cases the dequeued packet is freed, i.e.
 * it is lost to the reader.
 */
static ssize_t ppp_read(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos)
{
        struct ppp_file *pf = file->private_data;
        DECLARE_WAITQUEUE(wait, current);
        ssize_t ret;
        struct sk_buff *skb = NULL;
        struct iovec iov;
        struct iov_iter to;

        ret = count;

        if (!pf)
                return -ENXIO;
        add_wait_queue(&pf->rwait, &wait);
        for (;;) {
                /*
                 * Mark ourselves sleeping before checking the queue, so a
                 * wakeup arriving between the check and schedule() is not
                 * lost.
                 */
                set_current_state(TASK_INTERRUPTIBLE);
                skb = skb_dequeue(&pf->rq);
                if (skb)
                        break;
                /* ret is set just ahead of each exit test that uses it */
                ret = 0;
                if (pf->dead)
                        break;
                if (pf->kind == INTERFACE) {
                        /*
                         * Return 0 (EOF) on an interface that has no
                         * channels connected, unless it is looping
                         * network traffic (demand mode).
                         */
                        struct ppp *ppp = PF_TO_PPP(pf);

                        ppp_recv_lock(ppp);
                        if (ppp->n_channels == 0 &&
                            (ppp->flags & SC_LOOP_TRAFFIC) == 0) {
                                ppp_recv_unlock(ppp);
                                break;
                        }
                        ppp_recv_unlock(ppp);
                }
                ret = -EAGAIN;
                if (file->f_flags & O_NONBLOCK)
                        break;
                ret = -ERESTARTSYS;
                if (signal_pending(current))
                        break;
                schedule();
        }
        set_current_state(TASK_RUNNING);
        remove_wait_queue(&pf->rwait, &wait);

        /* No packet: ret already holds the reason the loop was left */
        if (!skb)
                goto out;

        /* Packets are never split across reads: the buffer must fit it all */
        ret = -EOVERFLOW;
        if (skb->len > count)
                goto outf;
        ret = -EFAULT;
        iov.iov_base = buf;
        iov.iov_len = count;
        iov_iter_init(&to, ITER_DEST, &iov, 1, count);
        if (skb_copy_datagram_iter(skb, 0, &to, skb->len))
                goto outf;
        ret = skb->len;

 outf:
        /* The skb is consumed on success and on error alike */
        kfree_skb(skb);
 out:
        return ret;
}

/*
 * Write handler for /dev/ppp: queue one PPP frame for transmission.
 *
 * The user buffer must hold one frame starting with the 2-byte PPP
 * protocol number.  The frame is copied into a new skb with pf->hdrlen
 * bytes of headroom reserved, then handed to the unit's transmit path
 * (INTERFACE) or appended to the channel's transmit queue and pushed
 * (CHANNEL).
 *
 * Returns count on success, or:
 *   -ENXIO   the file is not attached to a unit or channel;
 *   -EINVAL  the frame is shorter than the protocol field, or claims to
 *            be LCP but is too short to contain an LCP header;
 *   -ENOMEM  no skb could be allocated;
 *   -EFAULT  the user buffer could not be read.
 */
static ssize_t ppp_write(struct file *file, const char __user *buf,
                         size_t count, loff_t *ppos)
{
        /* LCP header: 1-byte code, 1-byte identifier, 2-byte length */
        const size_t lcp_hdrlen = 4;
        struct ppp_file *pf = file->private_data;
        struct sk_buff *skb;
        ssize_t ret;

        if (!pf)
                return -ENXIO;
        /* All PPP packets should start with the 2-byte protocol */
        if (count < PPP_PROTO_LEN)
                return -EINVAL;
        ret = -ENOMEM;
        skb = alloc_skb(count + pf->hdrlen, GFP_KERNEL);
        if (!skb)
                goto out;
        skb_reserve(skb, pf->hdrlen);
        ret = -EFAULT;
        if (copy_from_user(skb_put(skb, count), buf, count)) {
                kfree_skb(skb);
                goto out;
        }

        /*
         * Code further down the transmit path may inspect the LCP header
         * of frames tagged PPP_LCP.  Reject a frame that claims to be LCP
         * but is too short to carry that header, so that nobody reads
         * past the data actually copied from user space.
         */
        ret = -EINVAL;
        if (PPP_PROTO(skb) == PPP_LCP &&
            count < PPP_PROTO_LEN + lcp_hdrlen) {
                kfree_skb(skb);
                goto out;
        }

        switch (pf->kind) {
        case INTERFACE:
                ppp_xmit_process(PF_TO_PPP(pf), skb);
                break;
        case CHANNEL:
                skb_queue_tail(&pf->xq, skb);
                ppp_channel_push(PF_TO_CHANNEL(pf));
                break;
        }

        ret = count;

 out:
        return ret;
}

/*
 * Poll handler for /dev/ppp (no kernel lock needed).
 * The file is always reported writable.  It is reported readable when a
 * packet is queued, or when a read would return EOF because this is a
 * live interface with no channels that is not looping traffic (see
 * ppp_read).  A dead unit/channel reports EPOLLHUP.  An unattached file
 * reports nothing.
 */
static __poll_t ppp_poll(struct file *file, poll_table *wait)
{
        struct ppp_file *pf = file->private_data;
        __poll_t events = EPOLLOUT | EPOLLWRNORM;

        if (!pf)
                return 0;

        poll_wait(file, &pf->rwait, wait);

        if (skb_peek(&pf->rq))
                events |= EPOLLIN | EPOLLRDNORM;

        if (pf->dead) {
                events |= EPOLLHUP;
                return events;
        }

        if (pf->kind == INTERFACE) {
                struct ppp *ppp = PF_TO_PPP(pf);
                bool would_eof;

                ppp_recv_lock(ppp);
                would_eof = ppp->n_channels == 0 &&
                            (ppp->flags & SC_LOOP_TRAFFIC) == 0;
                ppp_recv_unlock(ppp);

                if (would_eof)
                        events |= EPOLLIN | EPOLLRDNORM;
        }

        return events;
}

#ifdef CONFIG_PPP_FILTER
static struct bpf_prog *get_filter(struct sock_fprog *uprog)
{
        struct sock_fprog_kern fprog;
        struct bpf_prog *res = NULL;
        int err;

        if (!uprog->len)
                return NULL;

        /* uprog->len is unsigned short, so no overflow here */
        fprog.len = uprog->len;
        fprog.filter = memdup_array_user(uprog->filter,
                                         uprog->len, sizeof(struct sock_filter));
        if (IS_ERR(fprog.filter))
                return ERR_CAST(fprog.filter);

        err = bpf_prog_create(&res, &fprog);
        kfree(fprog.filter);

        return err ? ERR_PTR(err) : res;
}

/*
 * Fetch a struct sock_fprog from user space and turn it into a BPF
 * program.  Returns ERR_PTR(-EFAULT) if the descriptor cannot be read,
 * otherwise whatever get_filter() returns.
 */
static struct bpf_prog *ppp_get_filter(struct sock_fprog __user *p)
{
        struct sock_fprog descr;

        if (copy_from_user(&descr, p, sizeof(descr)))
                return ERR_PTR(-EFAULT);

        return get_filter(&descr);
}

#ifdef CONFIG_COMPAT
/*
 * Compat layout of struct sock_fprog: the filter pointer is a
 * compat_caddr_t, converted with compat_ptr() before use.
 */
struct sock_fprog32 {
        unsigned short len;
        compat_caddr_t filter;
};

/*
 * ioctl numbers for the filter-setting commands as encoded with the
 * compat structure size.
 */
#define PPPIOCSPASS32                _IOW('t', 71, struct sock_fprog32)
#define PPPIOCSACTIVE32                _IOW('t', 70, struct sock_fprog32)

/*
 * Compat variant of ppp_get_filter(): read a struct sock_fprog32 from
 * user space, widen it to a native struct sock_fprog and build the BPF
 * program from that.  Returns ERR_PTR(-EFAULT) if the descriptor cannot
 * be read, otherwise whatever get_filter() returns.
 */
static struct bpf_prog *compat_ppp_get_filter(struct sock_fprog32 __user *p)
{
        struct sock_fprog32 compat_descr;
        struct sock_fprog descr;

        if (copy_from_user(&compat_descr, p, sizeof(compat_descr)))
                return ERR_PTR(-EFAULT);

        descr.len = compat_descr.len;
        descr.filter = compat_ptr(compat_descr.filter);

        return get_filter(&descr);
}
#endif
#endif

/* Bridge one PPP channel to another.
 * When two channels are bridged, ppp_input on one channel is redirected to
 * the other's ops->start_xmit handler.
 * In order to safely bridge channels we must reject channels which are already
 * part of a bridge instance, or which form part of an existing unit.
 * Once successfully bridged, each channel holds a reference on the other
 * to prevent it being freed while the bridge is extant.
 *
 * The two channels are updated one after the other, each under its own upl
 * write lock; the locks are never held at the same time.
 *
 * Returns 0 on success, or -EALREADY if either channel is already attached
 * to a unit or to another bridge.
 */
static int ppp_bridge_channels(struct channel *pch, struct channel *pchb)
{
        /* Step 1: point pch at pchb, taking a reference on pchb for it. */
        write_lock_bh(&pch->upl);
        if (pch->ppp ||
            rcu_dereference_protected(pch->bridge, lockdep_is_held(&pch->upl))) {
                write_unlock_bh(&pch->upl);
                return -EALREADY;
        }
        refcount_inc(&pchb->file.refcnt);
        rcu_assign_pointer(pch->bridge, pchb);
        write_unlock_bh(&pch->upl);

        /* Step 2: point pchb back at pch; undo step 1 if pchb is busy. */
        write_lock_bh(&pchb->upl);
        if (pchb->ppp ||
            rcu_dereference_protected(pchb->bridge, lockdep_is_held(&pchb->upl))) {
                write_unlock_bh(&pchb->upl);
                goto err_unset;
        }
        refcount_inc(&pch->file.refcnt);
        rcu_assign_pointer(pchb->bridge, pch);
        write_unlock_bh(&pchb->upl);

        return 0;

err_unset:
        write_lock_bh(&pch->upl);
        /* Re-read pch->bridge with upl held in case it was modified concurrently */
        pchb = rcu_dereference_protected(pch->bridge, lockdep_is_held(&pch->upl));
        RCU_INIT_POINTER(pch->bridge, NULL);
        write_unlock_bh(&pch->upl);
        /* Wait for RCU readers of pch->bridge before dropping the reference. */
        synchronize_rcu();

        /* pchb is NULL here if the pointer was already cleared concurrently;
         * in that case there is no reference left for us to drop.
         */
        if (pchb)
                if (refcount_dec_and_test(&pchb->file.refcnt))
                        ppp_destroy_channel(pchb);

        return -EALREADY;
}

/* Tear down the bridge that pch is part of.
 *
 * Clears pch->bridge and, if the peer still points back at pch, the peer's
 * bridge pointer as well, then drops the references that went with the
 * cleared pointers.
 *
 * Returns 0 on success or -EINVAL if pch is not bridged.
 */
static int ppp_unbridge_channels(struct channel *pch)
{
        struct channel *pchb, *pchbb;

        write_lock_bh(&pch->upl);
        pchb = rcu_dereference_protected(pch->bridge, lockdep_is_held(&pch->upl));
        if (!pchb) {
                write_unlock_bh(&pch->upl);
                return -EINVAL;
        }
        RCU_INIT_POINTER(pch->bridge, NULL);
        write_unlock_bh(&pch->upl);

        /* Only modify pchb if pchb->bridge points back to pch.
         * If not, it implies that there has been a race unbridging (and possibly
         * even rebridging) pchb.  We should leave pchb alone to avoid either a
         * refcount underflow, or breaking another established bridge instance.
         */
        write_lock_bh(&pchb->upl);
        pchbb = rcu_dereference_protected(pchb->bridge, lockdep_is_held(&pchb->upl));
        if (pchbb == pch)
                RCU_INIT_POINTER(pchb->bridge, NULL);
        write_unlock_bh(&pchb->upl);

        /* Wait for RCU readers of the cleared pointers before dropping refs. */
        synchronize_rcu();

        /* Drop the reference on pch only if we cleared the peer's pointer. */
        if (pchbb == pch)
                if (refcount_dec_and_test(&pch->file.refcnt))
                        ppp_destroy_channel(pch);

        /* Drop the reference on pchb that went with pch->bridge. */
        if (refcount_dec_and_test(&pchb->file.refcnt))
                ppp_destroy_channel(pchb);

        return 0;
}

/*
 * ioctl handler installed as ppp_device_fops.unlocked_ioctl.  The whole
 * handler runs with ppp_mutex held.
 *
 * Dispatch depends on what the file is attached to:
 *  - nothing (file->private_data is NULL): ppp_unattached_ioctl();
 *  - a channel: connect/disconnect/bridge commands are handled here and
 *    every other command is forwarded to the channel's own ioctl op;
 *  - an interface (unit): the PPPIOC* commands in the second switch.
 *
 * err starts out as -EFAULT so that a failed user copy can simply break out
 * of a switch.  Unknown commands yield -ENOTTY.  For the commands handled
 * here the result is 0 or a negative errno; forwarded commands return
 * whatever the callee returns.
 */
static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct ppp_file *pf;
        struct ppp *ppp;
        int err = -EFAULT, val, val2, i;
        struct ppp_idle32 idle32;
        struct ppp_idle64 idle64;
        struct npioctl npi;
        int unit, cflags;
        struct slcompress *vj;
        void __user *argp = (void __user *)arg;
        int __user *p = argp;

        mutex_lock(&ppp_mutex);

        pf = file->private_data;
        if (!pf) {
                /* Not attached to a unit or channel yet. */
                err = ppp_unattached_ioctl(current->nsproxy->net_ns,
                                           pf, file, cmd, arg);
                goto out;
        }

        if (cmd == PPPIOCDETACH) {
                /*
                 * PPPIOCDETACH is no longer supported as it was heavily broken,
                 * and is only known to have been used by pppd older than
                 * ppp-2.4.2 (released November 2003).
                 */
                pr_warn_once("%s (%d) used obsolete PPPIOCDETACH ioctl\n",
                             current->comm, current->pid);
                err = -EINVAL;
                goto out;
        }

        if (pf->kind == CHANNEL) {
                struct channel *pch, *pchb;
                struct ppp_channel *chan;
                struct ppp_net *pn;

                pch = PF_TO_CHANNEL(pf);

                switch (cmd) {
                case PPPIOCCONNECT:
                        /* Argument: unit number to connect this channel to. */
                        if (get_user(unit, p))
                                break;
                        err = ppp_connect_channel(pch, unit);
                        break;

                case PPPIOCDISCONN:
                        err = ppp_disconnect_channel(pch);
                        break;

                case PPPIOCBRIDGECHAN:
                        /* Argument: index of the channel to bridge with,
                         * looked up in the caller's network namespace.
                         */
                        if (get_user(unit, p))
                                break;
                        err = -ENXIO;
                        pn = ppp_pernet(current->nsproxy->net_ns);
                        spin_lock_bh(&pn->all_channels_lock);
                        pchb = ppp_find_channel(pn, unit);
                        /* Hold a reference to prevent pchb being freed while
                         * we establish the bridge.
                         */
                        if (pchb)
                                refcount_inc(&pchb->file.refcnt);
                        spin_unlock_bh(&pn->all_channels_lock);
                        if (!pchb)
                                break;
                        err = ppp_bridge_channels(pch, pchb);
                        /* Drop earlier refcount now bridge establishment is complete */
                        if (refcount_dec_and_test(&pchb->file.refcnt))
                                ppp_destroy_channel(pchb);
                        break;

                case PPPIOCUNBRIDGECHAN:
                        err = ppp_unbridge_channels(pch);
                        break;

                default:
                        /* Forward to the channel's ioctl op, if it has one;
                         * pch->chan is read under chan_sem and may be NULL.
                         */
                        down_read(&pch->chan_sem);
                        chan = pch->chan;
                        err = -ENOTTY;
                        if (chan && chan->ops->ioctl)
                                err = chan->ops->ioctl(chan, cmd, arg);
                        up_read(&pch->chan_sem);
                }
                goto out;
        }

        if (pf->kind != INTERFACE) {
                /* can't happen */
                pr_err("PPP: not interface or channel??\n");
                err = -EINVAL;
                goto out;
        }

        ppp = PF_TO_PPP(pf);
        switch (cmd) {
        case PPPIOCSMRU:
                if (get_user(val, p))
                        break;
                ppp->mru = val;
                err = 0;
                break;

        case PPPIOCSFLAGS:
                if (get_user(val, p))
                        break;
                ppp_lock(ppp);
                /* cflags: flags that are set now but absent from the new value */
                cflags = ppp->flags & ~val;
#ifdef CONFIG_PPP_MULTILINK
                /* Multilink is being switched on: restart the sequence. */
                if (!(ppp->flags & SC_MULTILINK) && (val & SC_MULTILINK))
                        ppp->nextseq = 0;
#endif
                ppp->flags = val & SC_FLAG_BITS;
                ppp_unlock(ppp);
                /* SC_CCP_OPEN was cleared by this call; done outside ppp_lock. */
                if (cflags & SC_CCP_OPEN)
                        ppp_ccp_closed(ppp);
                err = 0;
                break;

        case PPPIOCGFLAGS:
                /* Reported value is flags combined with xstate and rstate. */
                val = ppp->flags | ppp->xstate | ppp->rstate;
                if (put_user(val, p))
                        break;
                err = 0;
                break;

        case PPPIOCSCOMPRESS:
        {
                struct ppp_option_data data;
                if (copy_from_user(&data, argp, sizeof(data)))
                        err = -EFAULT;
                else
                        err = ppp_set_compress(ppp, &data);
                break;
        }
        case PPPIOCGUNIT:
                if (put_user(ppp->file.index, p))
                        break;
                err = 0;
                break;

        case PPPIOCSDEBUG:
                if (get_user(val, p))
                        break;
                ppp->debug = val;
                err = 0;
                break;

        case PPPIOCGDEBUG:
                if (put_user(ppp->debug, p))
                        break;
                err = 0;
                break;

        case PPPIOCGIDLE32:
                /* Idle times are reported in seconds (jiffies delta / HZ). */
                idle32.xmit_idle = (jiffies - ppp->last_xmit) / HZ;
                idle32.recv_idle = (jiffies - ppp->last_recv) / HZ;
                if (copy_to_user(argp, &idle32, sizeof(idle32)))
                        break;
                err = 0;
                break;

        case PPPIOCGIDLE64:
                /* Same as PPPIOCGIDLE32, using the 64-bit result structure. */
                idle64.xmit_idle = (jiffies - ppp->last_xmit) / HZ;
                idle64.recv_idle = (jiffies - ppp->last_recv) / HZ;
                if (copy_to_user(argp, &idle64, sizeof(idle64)))
                        break;
                err = 0;
                break;

        case PPPIOCSMAXCID:
                if (get_user(val, p))
                        break;
                /* If the upper 16 bits of the argument are non-zero they
                 * supply val2 and val keeps only the lower 16 bits;
                 * otherwise val2 defaults to 15.
                 */
                val2 = 15;
                if ((val >> 16) != 0) {
                        val2 = val >> 16;
                        val &= 0xffff;
                }
                vj = slhc_init(val2+1, val+1);
                if (IS_ERR(vj)) {
                        err = PTR_ERR(vj);
                        break;
                }
                /* Swap in the new state under ppp_lock, freeing the old one. */
                ppp_lock(ppp);
                if (ppp->vj)
                        slhc_free(ppp->vj);
                ppp->vj = vj;
                ppp_unlock(ppp);
                err = 0;
                break;

        case PPPIOCGNPMODE:
        case PPPIOCSNPMODE:
                if (copy_from_user(&npi, argp, sizeof(npi)))
                        break;
                /* Map the protocol number to an npmode[] index (negative on error). */
                err = proto_to_npindex(npi.protocol);
                if (err < 0)
                        break;
                i = err;
                if (cmd == PPPIOCGNPMODE) {
                        err = -EFAULT;
                        npi.mode = ppp->npmode[i];
                        if (copy_to_user(argp, &npi, sizeof(npi)))
                                break;
                } else {
                        ppp->npmode[i] = npi.mode;
                        /* we may be able to transmit more packets now (??) */
                        netif_wake_queue(ppp->dev);
                }
                err = 0;
                break;

#ifdef CONFIG_PPP_FILTER
        case PPPIOCSPASS:
        case PPPIOCSACTIVE:
        {
                /* filter may be NULL (empty program), which removes the filter. */
                struct bpf_prog *filter = ppp_get_filter(argp);
                struct bpf_prog **which;

                if (IS_ERR(filter)) {
                        err = PTR_ERR(filter);
                        break;
                }
                if (cmd == PPPIOCSPASS)
                        which = &ppp->pass_filter;
                else
                        which = &ppp->active_filter;
                /* Replace the selected filter under ppp_lock. */
                ppp_lock(ppp);
                if (*which)
                        bpf_prog_destroy(*which);
                *which = filter;
                ppp_unlock(ppp);
                err = 0;
                break;
        }
#endif /* CONFIG_PPP_FILTER */

#ifdef CONFIG_PPP_MULTILINK
        case PPPIOCSMRRU:
                if (get_user(val, p))
                        break;
                ppp_recv_lock(ppp);
                ppp->mrru = val;
                ppp_recv_unlock(ppp);
                err = 0;
                break;
#endif /* CONFIG_PPP_MULTILINK */

        default:
                err = -ENOTTY;
        }

out:
        mutex_unlock(&ppp_mutex);

        return err;
}

#ifdef CONFIG_COMPAT
/* 32-bit userspace layout of the PPPIOCSCOMPRESS argument.  It is converted
 * field by field into struct ppp_option_data in ppp_compat_ioctl().
 */
struct ppp_option_data32 {
        compat_uptr_t                ptr;        /* 32-bit user pointer, widened with compat_ptr() */
        u32                        length;
        compat_int_t                transmit;
};
/* ioctl number encoded with the size of the 32-bit structure. */
#define PPPIOCSCOMPRESS32        _IOW('t', 77, struct ppp_option_data32)

/*
 * compat_ioctl handler for 32-bit callers.
 *
 * Only the commands whose argument layout differs between 32-bit and native
 * userspace are handled here, and only for files attached to an interface:
 * the two filter commands (when CONFIG_PPP_FILTER is set) and
 * PPPIOCSCOMPRESS32.  Anything else leaves err at -ENOIOCTLCMD and is
 * passed on to ppp_ioctl() with the argument converted by compat_ptr().
 */
static long ppp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct ppp_file *pf;
        int err = -ENOIOCTLCMD;
        void __user *argp = (void __user *)arg;

        mutex_lock(&ppp_mutex);

        pf = file->private_data;
        if (pf && pf->kind == INTERFACE) {
                struct ppp *ppp = PF_TO_PPP(pf);
                switch (cmd) {
#ifdef CONFIG_PPP_FILTER
                case PPPIOCSPASS32:
                case PPPIOCSACTIVE32:
                {
                        /* filter may be NULL (empty program), which removes the filter. */
                        struct bpf_prog *filter = compat_ppp_get_filter(argp);
                        struct bpf_prog **which;

                        if (IS_ERR(filter)) {
                                err = PTR_ERR(filter);
                                break;
                        }
                        if (cmd == PPPIOCSPASS32)
                                which = &ppp->pass_filter;
                        else
                                which = &ppp->active_filter;
                        /* Replace the selected filter under ppp_lock. */
                        ppp_lock(ppp);
                        if (*which)
                                bpf_prog_destroy(*which);
                        *which = filter;
                        ppp_unlock(ppp);
                        err = 0;
                        break;
                }
#endif /* CONFIG_PPP_FILTER */
                case PPPIOCSCOMPRESS32:
                {
                        struct ppp_option_data32 data32;
                        if (copy_from_user(&data32, argp, sizeof(data32))) {
                                err = -EFAULT;
                        } else {
                                /* Convert to the native layout. */
                                struct ppp_option_data data = {
                                        .ptr = compat_ptr(data32.ptr),
                                        .length = data32.length,
                                        .transmit = data32.transmit
                                };
                                err = ppp_set_compress(ppp, &data);
                        }
                        break;
                }
                }
        }
        /* ppp_mutex is released before the fall-back below because
         * ppp_ioctl() takes it itself.
         */
        mutex_unlock(&ppp_mutex);

        /* all other commands have compatible arguments */
        if (err == -ENOIOCTLCMD)
                err = ppp_ioctl(file, cmd, (unsigned long)compat_ptr(arg));

        return err;
}
#endif

static int ppp_unattached_ioctl(struct net *net, struct ppp_file *pf,
                        struct file *file, unsigned int cmd, unsigned long arg)
{
        int unit, err = -EFAULT;
        struct ppp *ppp;
        struct channel *chan;
        struct ppp_net *pn;
        int __user *p = (int __user *)arg;

        switch (cmd) {
        case PPPIOCNEWUNIT:
                /* Create a new ppp unit */
                if (get_user(unit, p))
                        break;
                err = ppp_create_interface(net, file, &unit);
                if (err < 0)
                        break;

                err = -EFAULT;
                if (put_user(unit, p))
                        break;
                err = 0;
                break;

        case PPPIOCATTACH:
                /* Attach to an existing ppp unit */
                if (get_user(unit, p))
                        break;
                err = -ENXIO;
                pn = ppp_pernet(net);
                mutex_lock(&pn->all_ppp_mutex);
                ppp = ppp_find_unit(pn, unit);
                if (ppp) {
                        refcount_inc(&ppp->file.refcnt);
                        file->private_data = &ppp->file;
                        err = 0;
                }
                mutex_unlock(&pn->all_ppp_mutex);
                break;

        case PPPIOCATTCHAN:
                if (get_user(unit, p))
                        break;
                err = -ENXIO;
                pn = ppp_pernet(net);
                spin_lock_bh(&pn->all_channels_lock);
                chan = ppp_find_channel(pn, unit);
                if (chan) {
                        refcount_inc(&chan->file.refcnt);
                        file->private_data = &chan->file;
                        err = 0;
                }
                spin_unlock_bh(&pn->all_channels_lock);
                break;

        default:
                err = -ENOTTY;
        }

        return err;
}

/* File operations for the "ppp" character device registered in ppp_init().
 * ppp_nl_newlink() also compares file->f_op against this table to check
 * that a file descriptor passed in from userspace refers to this device.
 */
static const struct file_operations ppp_device_fops = {
        .owner                = THIS_MODULE,
        .read                = ppp_read,
        .write                = ppp_write,
        .poll                = ppp_poll,
        .unlocked_ioctl        = ppp_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl        = ppp_compat_ioctl,
#endif
        .open                = ppp_open,
        .release        = ppp_release,
        .llseek                = noop_llseek,
};

/* Per-namespace init: set up the empty unit idr, the channel lists and the
 * locks that protect them.  Always returns 0.
 */
static __net_init int ppp_init_net(struct net *net)
{
        struct ppp_net *pn = net_generic(net, ppp_net_id);

        /* Unit bookkeeping */
        mutex_init(&pn->all_ppp_mutex);
        idr_init(&pn->units_idr);

        /* Channel bookkeeping */
        spin_lock_init(&pn->all_channels_lock);
        INIT_LIST_HEAD(&pn->new_channels);
        INIT_LIST_HEAD(&pn->all_channels);

        return 0;
}

static __net_exit void ppp_exit_net(struct net *net)
{
        struct ppp_net *pn = net_generic(net, ppp_net_id);
        struct net_device *dev;
        struct net_device *aux;
        struct ppp *ppp;
        LIST_HEAD(list);
        int id;

        rtnl_lock();
        for_each_netdev_safe(net, dev, aux) {
                if (dev->netdev_ops == &ppp_netdev_ops)
                        unregister_netdevice_queue(dev, &list);
        }

        idr_for_each_entry(&pn->units_idr, ppp, id)
                /* Skip devices already unregistered by previous loop */
                if (!net_eq(dev_net(ppp->dev), net))
                        unregister_netdevice_queue(ppp->dev, &list);

        unregister_netdevice_many(&list);
        rtnl_unlock();

        mutex_destroy(&pn->all_ppp_mutex);
        idr_destroy(&pn->units_idr);
        WARN_ON_ONCE(!list_empty(&pn->all_channels));
        WARN_ON_ONCE(!list_empty(&pn->new_channels));
}

/* Per-network-namespace hooks, registered in ppp_init().  .id and .size
 * describe the struct ppp_net that net_generic(net, ppp_net_id) returns.
 */
static struct pernet_operations ppp_net_ops = {
        .init = ppp_init_net,
        .exit = ppp_exit_net,
        .id   = &ppp_net_id,
        .size = sizeof(struct ppp_net),
};

/* Reserve a unit number for @ppp in its namespace and register its netdevice.
 *
 * @unit:          requested unit number, or negative to let the idr choose.
 * @ifname_is_set: when false, the device is named "ppp<unit>".
 *
 * The unit number is reserved under all_ppp_mutex; register_netdevice() is
 * called after the mutex has been dropped, and the reservation is undone if
 * registration fails.
 *
 * Returns 0 on success or a negative errno.
 */
static int ppp_unit_register(struct ppp *ppp, int unit, bool ifname_is_set)
{
        struct ppp_net *pn = ppp_pernet(ppp->ppp_net);
        int ret;

        mutex_lock(&pn->all_ppp_mutex);

        if (unit < 0) {
                ret = unit_get(&pn->units_idr, ppp, 0);
                if (ret < 0)
                        goto err;
                if (!ifname_is_set) {
                        /* The generated name "ppp<ret>" may already belong to
                         * another device; release the unit and retry from
                         * ret + 1 until the name is free.
                         */
                        while (1) {
                                snprintf(ppp->dev->name, IFNAMSIZ, "ppp%i", ret);
                                if (!netdev_name_in_use(ppp->ppp_net, ppp->dev->name))
                                        break;
                                unit_put(&pn->units_idr, ret);
                                ret = unit_get(&pn->units_idr, ppp, ret + 1);
                                if (ret < 0)
                                        goto err;
                        }
                }
        } else {
                /* Caller asked for a specific unit number. Fail with -EEXIST
                 * if unavailable. For backward compatibility, return -EEXIST
                 * too if idr allocation fails; this makes pppd retry without
                 * requesting a specific unit number.
                 */
                if (unit_find(&pn->units_idr, unit)) {
                        ret = -EEXIST;
                        goto err;
                }
                ret = unit_set(&pn->units_idr, ppp, unit);
                if (ret < 0) {
                        /* Rewrite error for backward compatibility */
                        ret = -EEXIST;
                        goto err;
                }
        }
        ppp->file.index = ret;

        if (!ifname_is_set)
                snprintf(ppp->dev->name, IFNAMSIZ, "ppp%i", ppp->file.index);

        mutex_unlock(&pn->all_ppp_mutex);

        ret = register_netdevice(ppp->dev);
        if (ret < 0)
                goto err_unit;

        atomic_inc(&ppp_unit_count);

        return 0;

err_unit:
        /* Registration failed: give the reserved unit number back. */
        mutex_lock(&pn->all_ppp_mutex);
        unit_put(&pn->units_idr, ppp->file.index);
err:
        mutex_unlock(&pn->all_ppp_mutex);

        return ret;
}

/* Initialise the struct ppp embedded in @dev and register it as a unit.
 *
 * @src_net: namespace recorded as the unit's ppp_net.
 * @dev:     netdevice whose private area holds the struct ppp.
 * @conf:    owning file, requested unit number and whether a device name
 *           was supplied.
 *
 * On success the owning file's private_data points at the new unit.
 * Returns 0, -ENOMEM if the per-cpu recursion counter cannot be allocated,
 * or the error from ppp_unit_register().
 */
static int ppp_dev_configure(struct net *src_net, struct net_device *dev,
                             const struct ppp_config *conf)
{
        struct ppp *ppp = netdev_priv(dev);
        int np, cpu, rc;

        ppp->dev = dev;
        ppp->ppp_net = src_net;
        ppp->mru = PPP_MRU;
        ppp->owner = conf->file;

        init_ppp_file(&ppp->file, INTERFACE);
        ppp->file.hdrlen = PPP_HDRLEN - 2; /* don't count proto bytes */

        /* Every network protocol starts out in pass mode. */
        for (np = 0; np < NUM_NP; np++)
                ppp->npmode[np] = NPMODE_PASS;

        INIT_LIST_HEAD(&ppp->channels);
        spin_lock_init(&ppp->rlock);
        spin_lock_init(&ppp->wlock);

        ppp->xmit_recursion = alloc_percpu(int);
        if (!ppp->xmit_recursion)
                return -ENOMEM;

        for_each_possible_cpu(cpu)
                *per_cpu_ptr(ppp->xmit_recursion, cpu) = 0;

#ifdef CONFIG_PPP_MULTILINK
        ppp->minseq = -1;
        skb_queue_head_init(&ppp->mrq);
#endif /* CONFIG_PPP_MULTILINK */
#ifdef CONFIG_PPP_FILTER
        ppp->pass_filter = NULL;
        ppp->active_filter = NULL;
#endif /* CONFIG_PPP_FILTER */

        rc = ppp_unit_register(ppp, conf->unit, conf->ifname_is_set);
        if (rc < 0) {
                free_percpu(ppp->xmit_recursion);
                return rc;
        }

        conf->file->private_data = &ppp->file;

        return 0;
}

/* Netlink attribute policy for "ppp" links: IFLA_PPP_DEV_FD is a signed
 * 32-bit value, read as a file descriptor in ppp_nl_newlink().
 */
static const struct nla_policy ppp_nl_policy[IFLA_PPP_MAX + 1] = {
        [IFLA_PPP_DEV_FD]        = { .type = NLA_S32 },
};

/* rtnl_link_ops.validate hook: IFLA_PPP_DEV_FD is mandatory and must not be
 * negative.  Returns 0 if acceptable, -EINVAL if the attribute is missing,
 * -EBADF if the descriptor is negative.
 */
static int ppp_nl_validate(struct nlattr *tb[], struct nlattr *data[],
                           struct netlink_ext_ack *extack)
{
        if (!data || !data[IFLA_PPP_DEV_FD])
                return -EINVAL;

        return nla_get_s32(data[IFLA_PPP_DEV_FD]) < 0 ? -EBADF : 0;
}

/* rtnl_link_ops.newlink hook: create a PPP unit on @dev, owned by the file
 * whose descriptor is passed in IFLA_PPP_DEV_FD.
 *
 * The file must use ppp_device_fops and must not be attached to anything
 * yet (private_data == NULL); otherwise -EBADF is returned.  Returns -EBUSY
 * if ppp_mutex cannot be taken without blocking, else the result of
 * ppp_dev_configure().
 */
static int ppp_nl_newlink(struct net *src_net, struct net_device *dev,
                          struct nlattr *tb[], struct nlattr *data[],
                          struct netlink_ext_ack *extack)
{
        struct ppp_config conf = {
                .unit = -1,
                .ifname_is_set = true,
        };
        struct file *file;
        int err;

        file = fget(nla_get_s32(data[IFLA_PPP_DEV_FD]));
        if (!file)
                return -EBADF;

        /* rtnl_lock is already held here, but ppp_create_interface() locks
         * ppp_mutex before holding rtnl_lock. Using mutex_trylock() avoids
         * possible deadlock due to lock order inversion, at the cost of
         * pushing the problem back to userspace.
         */
        if (!mutex_trylock(&ppp_mutex)) {
                err = -EBUSY;
                goto out;
        }

        if (file->f_op != &ppp_device_fops || file->private_data) {
                err = -EBADF;
                goto out_unlock;
        }

        conf.file = file;

        /* Don't use device name generated by the rtnetlink layer when ifname
         * isn't specified. Let ppp_dev_configure() set the device name using
         * the PPP unit identifier as suffix (i.e. ppp<unit_id>). This allows
         * userspace to infer the device name using the PPPIOCGUNIT ioctl.
         */
        if (!tb[IFLA_IFNAME] || !nla_len(tb[IFLA_IFNAME]) || !*(char *)nla_data(tb[IFLA_IFNAME]))
                conf.ifname_is_set = false;

        err = ppp_dev_configure(src_net, dev, &conf);

out_unlock:
        mutex_unlock(&ppp_mutex);
out:
        /* Drop the reference taken by fget() on every path. */
        fput(file);

        return err;
}

/* rtnl_link_ops.dellink hook: queue @dev on @head for unregistration. */
static void ppp_nl_dellink(struct net_device *dev, struct list_head *head)
{
        unregister_netdevice_queue(dev, head);
}

/* rtnl_link_ops.get_size hook: reports zero bytes of link-specific
 * attributes, consistent with ppp_nl_fill_info() emitting none.
 */
static size_t ppp_nl_get_size(const struct net_device *dev)
{
        return 0;
}

/* rtnl_link_ops.fill_info hook: adds no attributes to @skb and reports
 * success.
 */
static int ppp_nl_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
        return 0;
}

/* rtnl_link_ops.get_link_net hook: return the namespace recorded in
 * ppp->ppp_net, which ppp_dev_configure() sets from its src_net argument.
 */
static struct net *ppp_nl_get_link_net(const struct net_device *dev)
{
        struct ppp *ppp = netdev_priv(dev);

        return READ_ONCE(ppp->ppp_net);
}

/* rtnetlink operations for links of kind "ppp", registered in ppp_init().
 * priv_size makes the netdevice private area a struct ppp, which is what
 * netdev_priv() is cast to throughout this file.
 */
static struct rtnl_link_ops ppp_link_ops __read_mostly = {
        .kind                = "ppp",
        .maxtype        = IFLA_PPP_MAX,
        .policy                = ppp_nl_policy,
        .priv_size        = sizeof(struct ppp),
        .setup                = ppp_setup,
        .validate        = ppp_nl_validate,
        .newlink        = ppp_nl_newlink,
        .dellink        = ppp_nl_dellink,
        .get_size        = ppp_nl_get_size,
        .fill_info        = ppp_nl_fill_info,
        .get_link_net        = ppp_nl_get_link_net,
};

/* Character device major number used for the "ppp" device. */
#define PPP_MAJOR        108

/* Called at boot time if ppp is compiled into the kernel,
   or at module load time (from init_module) if compiled as a module.

   Registers, in order: the per-namespace operations, the character device,
   the device class and the rtnetlink link type.  A failure at any step
   unwinds the earlier registrations in reverse order and returns the error.
   Returns 0 on success. */
static int __init ppp_init(void)
{
        int err;

        pr_info("PPP generic driver version " PPP_VERSION "\n");

        err = register_pernet_device(&ppp_net_ops);
        if (err) {
                pr_err("failed to register PPP pernet device (%d)\n", err);
                goto out;
        }

        err = register_chrdev(PPP_MAJOR, "ppp", &ppp_device_fops);
        if (err) {
                pr_err("failed to register PPP device (%d)\n", err);
                goto out_net;
        }

        err = class_register(&ppp_class);
        if (err)
                goto out_chrdev;

        err = rtnl_link_register(&ppp_link_ops);
        if (err) {
                pr_err("failed to register rtnetlink PPP handler\n");
                goto out_class;
        }

        /* not a big deal if we fail here :-) */
        device_create(&ppp_class, NULL, MKDEV(PPP_MAJOR, 0), NULL, "ppp");

        return 0;

out_class:
        class_unregister(&ppp_class);
out_chrdev:
        unregister_chrdev(PPP_MAJOR, "ppp");
out_net:
        unregister_pernet_device(&ppp_net_ops);
out:
        return err;
}

/*
 * Network interface unit routines.
 */
/* ndo_start_xmit handler: prepend the 2-byte PPP protocol number to @skb and
 * hand it to the unit's transmit path.
 *
 * Packets whose ethertype has no PPP mapping, whose protocol is in QUEUE,
 * DROP or ERROR mode, or whose header cannot be made writable are freed and
 * counted in tx_dropped.  Always returns NETDEV_TX_OK.
 */
static netdev_tx_t
ppp_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct ppp *ppp = netdev_priv(dev);
        unsigned char *hdr;
        int idx;

        idx = ethertype_to_npindex(ntohs(skb->protocol));
        if (idx < 0)
                goto drop;

        /* Drop, accept or reject the packet */
        switch (ppp->npmode[idx]) {
        case NPMODE_QUEUE:
                /* it would be nice to have a way to tell the network
                   system to queue this one up for later. */
        case NPMODE_DROP:
        case NPMODE_ERROR:
                goto drop;
        case NPMODE_PASS:
                break;
        }

        /* Put the 2-byte PPP protocol number on the front,
           making sure there is room for the address and control fields. */
        if (skb_cow_head(skb, PPP_HDRLEN))
                goto drop;

        hdr = skb_push(skb, 2);
        put_unaligned_be16(npindex_to_proto[idx], hdr);

        skb_scrub_packet(skb, !net_eq(ppp->ppp_net, dev_net(dev)));
        ppp_xmit_process(ppp, skb);

        return NETDEV_TX_OK;

 drop:
        kfree_skb(skb);
        ++dev->stats.tx_dropped;
        return NETDEV_TX_OK;
}

/* ndo_siocdevprivate handler: copy unit statistics, compressor statistics or
 * the driver version string to the user buffer at @addr.
 *
 * Returns 0 on success, -EFAULT if the copy to userspace fails, or -EINVAL
 * for any other command.
 */
static int
ppp_net_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
                       void __user *addr, int cmd)
{
        struct ppp *ppp = netdev_priv(dev);
        struct ppp_comp_stats cstats;
        struct ppp_stats stats;
        char *version;

        switch (cmd) {
        case SIOCGPPPSTATS:
                ppp_get_stats(ppp, &stats);
                if (copy_to_user(addr, &stats, sizeof(stats)))
                        return -EFAULT;
                return 0;

        case SIOCGPPPCSTATS:
                /* Halves without an active (de)compressor stay zeroed. */
                memset(&cstats, 0, sizeof(cstats));
                if (ppp->xc_state)
                        ppp->xcomp->comp_stat(ppp->xc_state, &cstats.c);
                if (ppp->rc_state)
                        ppp->rcomp->decomp_stat(ppp->rc_state, &cstats.d);
                if (copy_to_user(addr, &cstats, sizeof(cstats)))
                        return -EFAULT;
                return 0;

        case SIOCGPPPVER:
                /* The terminating NUL is copied as well. */
                version = PPP_VERSION;
                if (copy_to_user(addr, version, strlen(version) + 1))
                        return -EFAULT;
                return 0;

        default:
                return -EINVAL;
        }
}

/* ndo_get_stats64 handler.  Packet and byte counters come from ppp->stats64,
 * read under the receive and transmit locks respectively; error and drop
 * counters are taken from dev->stats.
 */
static void
ppp_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats64)
{
        struct ppp *ppp = netdev_priv(dev);

        /* Receive side */
        ppp_recv_lock(ppp);
        stats64->rx_packets = ppp->stats64.rx_packets;
        stats64->rx_bytes = ppp->stats64.rx_bytes;
        ppp_recv_unlock(ppp);

        /* Transmit side */
        ppp_xmit_lock(ppp);
        stats64->tx_packets = ppp->stats64.tx_packets;
        stats64->tx_bytes = ppp->stats64.tx_bytes;
        ppp_xmit_unlock(ppp);

        /* Counters kept in the netdevice itself */
        stats64->rx_errors = dev->stats.rx_errors;
        stats64->tx_errors = dev->stats.tx_errors;
        stats64->rx_dropped = dev->stats.rx_dropped;
        stats64->tx_dropped = dev->stats.tx_dropped;
        stats64->rx_length_errors = dev->stats.rx_length_errors;
}

/* ndo_init handler: set up lockdep classes for the device and pin the unit.
 * Always returns 0.
 */
static int ppp_dev_init(struct net_device *dev)
{
        struct ppp *ppp = netdev_priv(dev);

        netdev_lockdep_set_classes(dev);

        /* Let the netdevice take a reference on the ppp file. This ensures
         * that ppp_destroy_interface() won't run before the device gets
         * unregistered.
         */
        refcount_inc(&ppp->file.refcnt);

        return 0;
}

/* ndo_uninit handler: mark the unit as closing, release its unit number and
 * wake up anyone sleeping on the unit's file.
 */
static void ppp_dev_uninit(struct net_device *dev)
{
        struct ppp *ppp = netdev_priv(dev);
        struct ppp_net *pn = ppp_pernet(ppp->ppp_net);

        /* Once closing is set, __ppp_xmit_process() frees packets instead of
         * transmitting them.
         */
        ppp_lock(ppp);
        ppp->closing = 1;
        ppp_unlock(ppp);

        /* Return the unit number to the namespace's idr. */
        mutex_lock(&pn->all_ppp_mutex);
        unit_put(&pn->units_idr, ppp->file.index);
        mutex_unlock(&pn->all_ppp_mutex);

        ppp->owner = NULL;

        /* Mark the file dead and wake sleepers on its rwait queue. */
        ppp->file.dead = 1;
        wake_up_interruptible(&ppp->file.rwait);
}

/* priv_destructor hook: drop the reference taken in ppp_dev_init() and
 * destroy the unit if that was the last one.
 */
static void ppp_dev_priv_destructor(struct net_device *dev)
{
        struct ppp *ppp = netdev_priv(dev);

        if (!refcount_dec_and_test(&ppp->file.refcnt))
                return;

        ppp_destroy_interface(ppp);
}

/* ndo_fill_forward_path handler: delegate forward-path resolution to the
 * first channel attached to this unit.
 *
 * Returns -EOPNOTSUPP when multilink is enabled or the channel driver has no
 * fill_forward_path op, -ENODEV when the unit has no usable channel, and
 * otherwise whatever the channel's fill_forward_path op returns.
 */
static int ppp_fill_forward_path(struct net_device_path_ctx *ctx,
                                 struct net_device_path *path)
{
        struct ppp *ppp = netdev_priv(ctx->dev);
        struct ppp_channel *chan;
        struct channel *pch;

        if (ppp->flags & SC_MULTILINK)
                return -EOPNOTSUPP;

        /* No ppp lock is taken in this function, so fetch the first entry
         * with a single read of the list head rather than list_empty()
         * followed by list_first_entry(): the list could change between two
         * separate reads and yield a bogus entry computed from the head.
         */
        pch = list_first_entry_or_null(&ppp->channels, struct channel, clist);
        if (!pch)
                return -ENODEV;

        /* pch->chan can be NULL (ppp_ioctl() checks for that as well), so
         * read it once and bail out instead of dereferencing it blindly.
         * NOTE(review): this narrows the race but does not close it; full
         * protection against concurrent channel teardown depends on how
         * the writers publish and retire entries - confirm against them.
         */
        chan = READ_ONCE(pch->chan);
        if (!chan)
                return -ENODEV;

        if (!chan->ops->fill_forward_path)
                return -EOPNOTSUPP;

        return chan->ops->fill_forward_path(ctx, path, chan);
}

/* netdevice operations for PPP units, installed by ppp_setup().
 * ppp_exit_net() also compares dev->netdev_ops against this table to
 * recognise PPP devices.
 */
static const struct net_device_ops ppp_netdev_ops = {
        .ndo_init         = ppp_dev_init,
        .ndo_uninit      = ppp_dev_uninit,
        .ndo_start_xmit  = ppp_start_xmit,
        .ndo_siocdevprivate = ppp_net_siocdevprivate,
        .ndo_get_stats64 = ppp_get_stats64,
        .ndo_fill_forward_path = ppp_fill_forward_path,
};

/* Device type attached to PPP netdevices via SET_NETDEV_DEVTYPE() in
 * ppp_setup().
 */
static const struct device_type ppp_type = {
        .name = "ppp",
};

/* rtnl_link_ops.setup hook: give a freshly allocated netdevice the fixed
 * properties of a PPP unit (ops, type, header length, default MTU, flags).
 */
static void ppp_setup(struct net_device *dev)
{
        /* Callbacks and device type */
        dev->netdev_ops = &ppp_netdev_ops;
        dev->priv_destructor = ppp_dev_priv_destructor;
        SET_NETDEV_DEVTYPE(dev, &ppp_type);

        /* Link-layer parameters */
        dev->type = ARPHRD_PPP;
        dev->hard_header_len = PPP_HDRLEN;
        dev->addr_len = 0;
        dev->mtu = PPP_MRU;
        dev->tx_queue_len = 3;

        /* Flags and features */
        dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
        dev->features |= NETIF_F_LLTX;
        netif_keep_dst(dev);
}

/*
 * Transmit-side routines.
 */

/* Called to do any work queued up on the transmit side that can now be done.
 *
 * Runs under the unit's transmit lock.  @skb may be NULL, in which case only
 * previously queued work is processed.  If the unit is closing, @skb is
 * freed and nothing is sent.
 */
static void __ppp_xmit_process(struct ppp *ppp, struct sk_buff *skb)
{
        ppp_xmit_lock(ppp);

        if (ppp->closing) {
                kfree_skb(skb);
                goto unlock;
        }

        ppp_push(ppp);

        if (skb)
                skb_queue_tail(&ppp->file.xq, skb);

        /* Send queued frames for as long as nothing is left pending. */
        while (!ppp->xmit_pending &&
               (skb = skb_dequeue(&ppp->file.xq)))
                ppp_send_frame(ppp, skb);

        /* If there's no work left to do, tell the core net
           code that we can accept some more. */
        if (ppp->xmit_pending || skb_peek(&ppp->file.xq))
                netif_stop_queue(ppp->dev);
        else
                netif_wake_queue(ppp->dev);

unlock:
        ppp_xmit_unlock(ppp);
}

/*
 * Transmit entry point guarded by the per-CPU xmit_recursion counter.
 * If this CPU is already inside the transmit path of this unit, @skb is
 * dropped and a rate-limited error is logged instead of recursing.
 */
static void ppp_xmit_process(struct ppp *ppp, struct sk_buff *skb)
{
        local_bh_disable();

        if (unlikely(*this_cpu_ptr(ppp->xmit_recursion))) {
                /* re-entered on the same CPU: drop the frame */
                local_bh_enable();

                kfree_skb(skb);

                if (net_ratelimit())
                        netdev_err(ppp->dev, "recursion detected\n");
                return;
        }

        (*this_cpu_ptr(ppp->xmit_recursion))++;
        __ppp_xmit_process(ppp, skb);
        (*this_cpu_ptr(ppp->xmit_recursion))--;

        local_bh_enable();
}

/*
 * Run @skb through the transmit compressor.
 *
 * Returns the buffer that should be sent: a newly allocated compressed
 * frame, or @skb itself when the compressor reported that it did not
 * compress (len == 0).  Returns NULL when the frame must be dropped.
 *
 * Ownership: @skb is consumed on every path except when it is handed back
 * as the return value.  In particular it is freed whenever NULL is
 * returned, because the caller overwrites its only pointer to the frame
 * with the return value.
 */
static inline struct sk_buff *
pad_compress_skb(struct ppp *ppp, struct sk_buff *skb)
{
        struct sk_buff *new_skb;
        int len;
        int new_skb_size = ppp->dev->mtu +
                ppp->xcomp->comp_extra + ppp->dev->hard_header_len;
        int compressor_skb_size = ppp->dev->mtu +
                ppp->xcomp->comp_extra + PPP_HDRLEN;

        new_skb = alloc_skb(new_skb_size, GFP_ATOMIC);
        if (!new_skb) {
                if (net_ratelimit())
                        netdev_err(ppp->dev, "PPP: no memory (comp pkt)\n");
                /*
                 * Free the original frame here: returning NULL makes the
                 * caller lose its reference, so it would leak otherwise.
                 */
                kfree_skb(skb);
                return NULL;
        }
        /* leave room for any link header beyond the PPP header itself */
        if (ppp->dev->hard_header_len > PPP_HDRLEN)
                skb_reserve(new_skb,
                            ppp->dev->hard_header_len - PPP_HDRLEN);

        /* compressor still expects A/C bytes in hdr */
        len = ppp->xcomp->compress(ppp->xc_state, skb->data - 2,
                                   new_skb->data, skb->len + 2,
                                   compressor_skb_size);
        if (len > 0 && (ppp->flags & SC_CCP_UP)) {
                /* compressed: send the new buffer instead of the original */
                consume_skb(skb);
                skb = new_skb;
                skb_put(skb, len);
                skb_pull(skb, 2);        /* pull off A/C bytes */
        } else if (len == 0) {
                /* didn't compress, or CCP not up yet */
                consume_skb(new_skb);
                new_skb = skb;
        } else {
                /*
                 * (len < 0)
                 * MPPE requires that we do not send unencrypted
                 * frames.  The compressor will return -1 if we
                 * should drop the frame.  We cannot simply test
                 * the compress_proto because MPPE and MPPC share
                 * the same number.
                 *
                 * NOTE(review): len > 0 with SC_CCP_UP clear also ends
                 * up here and is dropped - confirm that is intended.
                 */
                if (net_ratelimit())
                        netdev_err(ppp->dev, "ppp: compressor dropped pkt\n");
                kfree_skb(skb);
                consume_skb(new_skb);
                new_skb = NULL;
        }
        return new_skb;
}

/*
 * Compress and send a frame.
 * The caller should have locked the xmit path,
 * and xmit_pending should be 0.
 */
static void
ppp_send_frame(struct ppp *ppp, struct sk_buff *skb)
{
        int proto = PPP_PROTO(skb);
        struct sk_buff *new_skb;
        int len;
        unsigned char *cp;

        skb->dev = ppp->dev;

        if (proto < 0x8000) {
#ifdef CONFIG_PPP_FILTER
                /* check if we should pass this packet */
                /* the filter instructions are constructed assuming
                   a four-byte PPP header on each packet */
                *(u8 *)skb_push(skb, 2) = 1;
                if (ppp->pass_filter &&
                    bpf_prog_run(ppp->pass_filter, skb) == 0) {
                        if (ppp->debug & 1)
                                netdev_printk(KERN_DEBUG, ppp->dev,
                                              "PPP: outbound frame "
                                              "not passed\n");
                        kfree_skb(skb);
                        return;
                }
                /* if this packet passes the active filter, record the time */
                if (!(ppp->active_filter &&
                      bpf_prog_run(ppp->active_filter, skb) == 0))
                        ppp->last_xmit = jiffies;
                skb_pull(skb, 2);
#else
                /* for data packets, record the time */
                ppp->last_xmit = jiffies;
#endif /* CONFIG_PPP_FILTER */
        }

        ++ppp->stats64.tx_packets;
        ppp->stats64.tx_bytes += skb->len - PPP_PROTO_LEN;

        switch (proto) {
        case PPP_IP:
                if (!ppp->vj || (ppp->flags & SC_COMP_TCP) == 0)
                        break;
                /* try to do VJ TCP header compression */
                new_skb = alloc_skb(skb->len + ppp->dev->hard_header_len - 2,
                                    GFP_ATOMIC);
                if (!new_skb) {
                        netdev_err(ppp->dev, "PPP: no memory (VJ comp pkt)\n");
                        goto drop;
                }
                skb_reserve(new_skb, ppp->dev->hard_header_len - 2);
                cp = skb->data + 2;
                len = slhc_compress(ppp->vj, cp, skb->len - 2,
                                    new_skb->data + 2, &cp,
                                    !(ppp->flags & SC_NO_TCP_CCID));
                if (cp == skb->data + 2) {
                        /* didn't compress */
                        consume_skb(new_skb);
                } else {
                        if (cp[0] & SL_TYPE_COMPRESSED_TCP) {
                                proto = PPP_VJC_COMP;
                                cp[0] &= ~SL_TYPE_COMPRESSED_TCP;
                        } else {
                                proto = PPP_VJC_UNCOMP;
                                cp[0] = skb->data[2];
                        }
                        consume_skb(skb);
                        skb = new_skb;
                        cp = skb_put(skb, len + 2);
                        cp[0] = 0;
                        cp[1] = proto;
                }
                break;

        case PPP_CCP:
                /* peek at outbound CCP frames */
                ppp_ccp_peek(ppp, skb, 0);
                break;
        }

        /* try to do packet compression */
        if ((ppp->xstate & SC_COMP_RUN) && ppp->xc_state &&
            proto != PPP_LCP && proto != PPP_CCP) {
                if (!(ppp->flags & SC_CCP_UP) && (ppp->flags & SC_MUST_COMP)) {
                        if (net_ratelimit())
                                netdev_err(ppp->dev,
                                           "ppp: compression required but "
                                           "down - pkt dropped.\n");
                        goto drop;
                }
                skb = pad_compress_skb(ppp, skb);
                if (!skb)
                        goto drop;
        }

        /*
         * If we are waiting for traffic (demand dialling),
         * queue it up for pppd to receive.
         */
        if (ppp->flags & SC_LOOP_TRAFFIC) {
                if (ppp->file.rq.qlen > PPP_MAX_RQLEN)
                        goto drop;
                skb_queue_tail(&ppp->file.rq, skb);
                wake_up_interruptible(&ppp->file.rwait);
                return;
        }

        ppp->xmit_pending = skb;
        ppp_push(ppp);
        return;

 drop:
        kfree_skb(skb);
        ++ppp->dev->stats.tx_errors;
}

/*
 * Try to send the frame in xmit_pending.
 * The caller should have the xmit path locked.
 *
 * On return xmit_pending is either cleared (frame handed to a channel or
 * dropped) or left in place so that a later call can retry.
 */
static void
ppp_push(struct ppp *ppp)
{
        struct sk_buff *skb = ppp->xmit_pending;
        struct channel *pch;

        if (!skb)
                return;

        /* nowhere to send the packet, just drop it */
        if (list_empty(&ppp->channels))
                goto discard;

        if (!(ppp->flags & SC_MULTILINK)) {
                /* not doing multilink: send it down the first channel */
                pch = list_first_entry(&ppp->channels, struct channel, clist);

                spin_lock(&pch->downl);
                if (!pch->chan) {
                        /* channel got unregistered */
                        kfree_skb(skb);
                        ppp->xmit_pending = NULL;
                } else if (pch->chan->ops->start_xmit(pch->chan, skb)) {
                        /* the channel accepted the frame */
                        ppp->xmit_pending = NULL;
                }
                spin_unlock(&pch->downl);
                return;
        }

#ifdef CONFIG_PPP_MULTILINK
        /* Multilink: fragment the packet over as many links
           as can take the packet at the moment. */
        if (!ppp_mp_explode(ppp, skb))
                return;
#endif /* CONFIG_PPP_MULTILINK */

discard:
        ppp->xmit_pending = NULL;
        kfree_skb(skb);
}

#ifdef CONFIG_PPP_MULTILINK
/*
 * Module parameter: when true, ppp_mp_explode() skips a leading zero byte
 * of the protocol field before fragmenting the frame.
 */
static bool mp_protocol_compress __read_mostly = true;
module_param(mp_protocol_compress, bool, 0644);
MODULE_PARM_DESC(mp_protocol_compress, "compress protocol id in multilink fragments");

/*
 * Divide a packet to be transmitted into fragments and
 * send them out the individual links.
 *
 * Returns 0 if the packet cannot be taken right now (the caller leaves it
 * in xmit_pending), 1 once the packet has been fragmented or abandoned
 * (the caller then frees @skb; the fragments are separate copies).
 *
 * pch->avail is used as a per-call state: 0 = channel unusable,
 * 1 = usable, 2 = usable and counted as free for this packet.
 */
static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
{
        int len, totlen;
        int i, bits, hdrlen, mtu;
        int flen;
        int navail, nfree, nzero;
        int nbigger;
        int totspeed;
        int totfree;
        unsigned char *p, *q;
        struct list_head *list;
        struct channel *pch;
        struct sk_buff *frag;
        struct ppp_channel *chan;

        totspeed = 0; /* total bitrate of the free channels in the bundle */
        nfree = 0; /* # channels which have no packet already queued */
        navail = 0; /* total # of usable channels (not deregistered) */
        nzero = 0; /* number of free channels with zero speed associated */
        totfree = 0; /*total # of channels available and
                                  *having no queued packets before
                                  *starting the fragmentation*/

        hdrlen = (ppp->flags & SC_MP_XSHORTSEQ)? MPHDRLEN_SSN: MPHDRLEN;
        i = 0;
        /* first pass: classify every channel and gather the totals */
        list_for_each_entry(pch, &ppp->channels, clist) {
                if (pch->chan) {
                        pch->avail = 1;
                        navail++;
                        pch->speed = pch->chan->speed;
                } else {
                        pch->avail = 0;
                }
                if (pch->avail) {
                        if (skb_queue_empty(&pch->file.xq) ||
                                !pch->had_frag) {
                                        if (pch->speed == 0)
                                                nzero++;
                                        else
                                                totspeed += pch->speed;

                                        pch->avail = 2;
                                        ++nfree;
                                        ++totfree;
                                }
                        /* remember the earliest channel that had no fragment */
                        if (!pch->had_frag && i < ppp->nxchan)
                                ppp->nxchan = i;
                }
                ++i;
        }
        /*
         * Don't start sending this packet unless at least half of
         * the channels are free.  This gives much better TCP
         * performance if we have a lot of channels.
         */
        if (nfree == 0 || nfree < navail / 2)
                return 0; /* can't take now, leave it in xmit_pending */

        /* Do protocol field compression */
        p = skb->data;
        len = skb->len;
        if (*p == 0 && mp_protocol_compress) {
                ++p;
                --len;
        }

        totlen = len;
        /* remainder bytes to spread over the channels (nfree != 0 here) */
        nbigger = len % nfree;

        /* skip to the channel after the one we last used
           and start at that one */
        list = &ppp->channels;
        for (i = 0; i < ppp->nxchan; ++i) {
                list = list->next;
                if (list == &ppp->channels) {
                        i = 0;
                        break;
                }
        }

        /* create a fragment for each channel */
        bits = B; /* only the first fragment carries the B flag */
        while (len > 0) {
                list = list->next;
                if (list == &ppp->channels) {
                        /* wrapped past the list head: restart the index */
                        i = 0;
                        continue;
                }
                pch = list_entry(list, struct channel, clist);
                ++i;
                if (!pch->avail)
                        continue;

                /*
                 * Skip this channel if it has a fragment pending already and
                 * we haven't given a fragment to all of the free channels.
                 */
                if (pch->avail == 1) {
                        if (nfree > 0)
                                continue;
                } else {
                        pch->avail = 1;
                }

                /* check the channel's mtu and whether it is still attached. */
                spin_lock(&pch->downl);
                if (pch->chan == NULL) {
                        /* can't use this channel, it's being deregistered */
                        if (pch->speed == 0)
                                nzero--;
                        else
                                totspeed -= pch->speed;

                        spin_unlock(&pch->downl);
                        pch->avail = 0;
                        totlen = len;
                        totfree--;
                        nfree--;
                        if (--navail == 0)
                                break;
                        continue;
                }

                /*
                 * If the channel speed is not set, divide
                 * the packet evenly among the free channels;
                 * otherwise divide it according to the speed
                 * of the channel we are going to transmit on.
                 */
                flen = len;
                if (nfree > 0) {
                        if (pch->speed == 0) {
                                flen = len/nfree;
                                if (nbigger > 0) {
                                        flen++;
                                        nbigger--;
                                }
                        } else {
                                flen = (((totfree - nzero)*(totlen + hdrlen*totfree)) /
                                        ((totspeed*totfree)/pch->speed)) - hdrlen;
                                if (nbigger > 0) {
                                        flen += ((totfree - nzero)*pch->speed)/totspeed;
                                        nbigger -= ((totfree - nzero)*pch->speed)/
                                                        totspeed;
                                }
                        }
                        nfree--;
                }

                /*
                 * Check whether we are on the last channel or
                 * we exceeded the length of the data to
                 * fragment.
                 */
                if ((nfree <= 0) || (flen > len))
                        flen = len;
                /*
                 * It is not worth transmitting on slow channels:
                 * in that case the flen resulting from the
                 * above formula will be equal to or less than zero.
                 * Skip the channel in this case.
                 */
                if (flen <= 0) {
                        pch->avail = 2;
                        spin_unlock(&pch->downl);
                        continue;
                }

                /*
                 * hdrlen includes the 2-byte PPP protocol field, but the
                 * MTU counts only the payload excluding the protocol field.
                 * (RFC1661 Section 2)
                 */
                mtu = pch->chan->mtu - (hdrlen - 2);
                if (mtu < 4)
                        mtu = 4;
                if (flen > mtu)
                        flen = mtu;
                /* this fragment takes all remaining data: mark it as the end */
                if (flen == len)
                        bits |= E;
                frag = alloc_skb(flen + hdrlen + (flen == 0), GFP_ATOMIC);
                if (!frag)
                        goto noskb;
                q = skb_put(frag, flen + hdrlen);

                /* make the MP header */
                put_unaligned_be16(PPP_MP, q);
                if (ppp->flags & SC_MP_XSHORTSEQ) {
                        /* short format: flags share a byte with seq bits 8-11 */
                        q[2] = bits + ((ppp->nxseq >> 8) & 0xf);
                        q[3] = ppp->nxseq;
                } else {
                        /* long format: flag byte followed by 24 bits of seq */
                        q[2] = bits;
                        q[3] = ppp->nxseq >> 16;
                        q[4] = ppp->nxseq >> 8;
                        q[5] = ppp->nxseq;
                }

                memcpy(q + hdrlen, p, flen);

                /* try to send it down the channel */
                chan = pch->chan;
                /* queue behind earlier fragments, or if the channel is busy */
                if (!skb_queue_empty(&pch->file.xq) ||
                        !chan->ops->start_xmit(chan, frag))
                        skb_queue_tail(&pch->file.xq, frag);
                pch->had_frag = 1;
                p += flen;
                len -= flen;
                ++ppp->nxseq;
                bits = 0;
                spin_unlock(&pch->downl);
        }
        /* remember where to resume for the next packet */
        ppp->nxchan = i;

        return 1;

 noskb:
        /* still holding pch->downl from the loop body */
        spin_unlock(&pch->downl);
        if (ppp->debug & 1)
                netdev_err(ppp->dev, "PPP: no memory (fragment)\n");
        ++ppp->dev->stats.tx_errors;
        ++ppp->nxseq;
        return 1;        /* abandon the frame */
}
#endif /* CONFIG_PPP_MULTILINK */

/*
 * Try to send data out on a channel: drain the channel's own transmit
 * queue first and, once that is empty, give the attached unit (if any)
 * a chance to send.
 */
static void __ppp_channel_push(struct channel *pch)
{
        struct sk_buff *skb;
        struct ppp *ppp;

        spin_lock(&pch->downl);
        if (!pch->chan) {
                /* channel got deregistered */
                skb_queue_purge(&pch->file.xq);
        } else {
                while (!skb_queue_empty(&pch->file.xq)) {
                        skb = skb_dequeue(&pch->file.xq);
                        if (pch->chan->ops->start_xmit(pch->chan, skb))
                                continue;

                        /* put the packet back and try again later */
                        skb_queue_head(&pch->file.xq, skb);
                        break;
                }
        }
        spin_unlock(&pch->downl);

        /* see if there is anything from the attached unit to be sent */
        if (!skb_queue_empty(&pch->file.xq))
                return;

        ppp = pch->ppp;
        if (ppp)
                __ppp_xmit_process(ppp, NULL);
}

/*
 * Locked wrapper around __ppp_channel_push().  When the channel is attached
 * to a unit, that unit's per-CPU xmit_recursion counter is raised for the
 * duration of the push.
 */
static void ppp_channel_push(struct channel *pch)
{
        read_lock_bh(&pch->upl);
        if (!pch->ppp) {
                /* not attached to a unit: nothing to account for */
                __ppp_channel_push(pch);
        } else {
                (*this_cpu_ptr(pch->ppp->xmit_recursion))++;
                __ppp_channel_push(pch);
                (*this_cpu_ptr(pch->ppp->xmit_recursion))--;
        }
        read_unlock_bh(&pch->upl);
}

/*
 * Receive-side routines.
 */

/*
 * Per-skb data overlaid on the skb control buffer (skb->cb), accessed
 * through PPP_MP_CB().
 */
struct ppp_mp_skb_parm {
        u32                sequence;        /* presumably the MP fragment sequence number - TODO confirm */
        u8                BEbits;                /* presumably the B/E flag bits of the fragment - TODO confirm */
};
/* View an skb's control buffer as a struct ppp_mp_skb_parm. */
#define PPP_MP_CB(skb)        ((struct ppp_mp_skb_parm *)((skb)->cb))

/*
 * Hand a received frame to the unit under the receive lock; the frame is
 * freed instead if the unit is closing.
 */
static inline void
ppp_do_recv(struct ppp *ppp, struct sk_buff *skb, struct channel *pch)
{
        ppp_recv_lock(ppp);
        if (ppp->closing)
                kfree_skb(skb);
        else
                ppp_receive_frame(ppp, skb, pch);
        ppp_recv_unlock(ppp);
}

/**
 * __ppp_decompress_proto - Decompress protocol field, slim version.
 * @skb: Socket buffer where protocol field should be decompressed. It must have
 *         at least 1 byte of head room and 1 byte of linear data. First byte of
 *         data must be a protocol field byte.
 *
 * If the first data byte has its low bit set, the Protocol field was sent in
 * its one-byte compressed form (e.g. Protocol-Field-Compression was
 * negotiated); a zero byte is then pushed in front of it to restore the
 * two-byte form. This function performs no skb length checks.
 */
static void __ppp_decompress_proto(struct sk_buff *skb)
{
        /* low bit clear: not the compressed form, nothing to do */
        if (!(skb->data[0] & 0x01))
                return;

        *(u8 *)skb_push(skb, 1) = 0x00;
}

/**
 * ppp_decompress_proto - Check skb data room and decompress protocol field.
 * @skb: Socket buffer where protocol field should be decompressed. First byte
 *         of data must be a protocol field byte.
 *
 * Checked variant of __ppp_decompress_proto(): verifies that the skb holds
 * enough linear data for the Protocol field both before and after
 * decompression.
 *
 * Return: true - decompressed successfully, false - not enough room in skb.
 */
static bool ppp_decompress_proto(struct sk_buff *skb)
{
        /* At least one byte should be present (if protocol is compressed) */
        if (pskb_may_pull(skb, 1)) {
                __ppp_decompress_proto(skb);

                /* Protocol field should occupy 2 bytes when not compressed */
                return pskb_may_pull(skb, 2);
        }

        return false;
}

/* Attempt to handle a frame via. a bridged channel, if one exists.
 * If the channel is bridged, the frame is consumed by the bridge.
 * If not, the caller must handle the frame by normal recv mechanisms.
 * Returns true if the frame is consumed, false otherwise.
 */
static bool ppp_channel_bridge_input(struct channel *pch, struct sk_buff *skb)
{
        struct channel *pchb;

        rcu_read_lock();
        pchb = rcu_dereference(pch->bridge);
        if (!pchb)
                goto out_rcu;

        spin_lock(&pchb->downl);
        if (!pchb->chan) {
                /* channel got unregistered */
                kfree_skb(skb);
                goto outl;
        }

        skb_scrub_packet(skb, !net_eq(pch->chan_net, pchb->chan_net));
        if (!pchb->chan->ops->start_xmit(pchb->chan, skb))
                kfree_skb(skb);

outl:
        spin_unlock(&pchb->downl);
out_rcu:
        rcu_read_unlock();

        /* If pchb is set then we've consumed the packet */
        return !!pchb;
}

/*
 * Receive a frame from a channel.  The frame is, in order of preference:
 * forwarded over a bridge, dropped if its protocol field is malformed,
 * queued on the channel's own receive queue (no unit attached, protocol
 * >= 0xc000 or PPP_CCPFRAG), or passed up to the attached unit.
 * @skb is always consumed.
 */
void
ppp_input(struct ppp_channel *chan, struct sk_buff *skb)
{
        struct channel *pch = chan->ppp;
        int proto;

        if (!pch) {
                kfree_skb(skb);
                return;
        }

        /* If the channel is bridged, transmit via bridge */
        if (ppp_channel_bridge_input(pch, skb))
                return;

        read_lock_bh(&pch->upl);

        if (!ppp_decompress_proto(skb)) {
                /* too short to hold a protocol field */
                kfree_skb(skb);
                if (pch->ppp) {
                        ++pch->ppp->dev->stats.rx_length_errors;
                        ppp_receive_error(pch->ppp);
                }
                goto done;
        }

        proto = PPP_PROTO(skb);
        if (pch->ppp && proto < 0xc000 && proto != PPP_CCPFRAG) {
                ppp_do_recv(pch->ppp, skb, pch);
                goto done;
        }

        /* put it on the channel queue */
        skb_queue_tail(&pch->file.rq, skb);

        /* drop old frames if queue too long */
        while (pch->file.rq.qlen > PPP_MAX_RQLEN) {
                skb = skb_dequeue(&pch->file.rq);
                if (!skb)
                        break;
                kfree_skb(skb);
        }
        wake_up_interruptible(&pch->file.rwait);

done:
        read_unlock_bh(&pch->upl);
}

/*
 * Put a 0-length skb in the receive queue as an error indication.
 * @code is stored in the first byte of the skb control buffer.  Nothing
 * happens when the channel is not attached to a unit or the allocation
 * fails.
 */
void
ppp_input_error(struct ppp_channel *chan, int code)
{
        struct channel *pch = chan->ppp;
        struct sk_buff *skb;

        if (!pch)
                return;

        read_lock_bh(&pch->upl);
        if (!pch->ppp)
                goto unlock;

        skb = alloc_skb(0, GFP_ATOMIC);
        if (!skb)
                goto unlock;

        skb->len = 0;                /* probably unnecessary */
        skb->cb[0] = code;
        ppp_do_recv(pch->ppp, skb, pch);

unlock:
        read_unlock_bh(&pch->upl);
}

/*
 * We come in here to process a received frame.
 * The receive side of the ppp unit is locked.
 *
 * An empty skb is an error indication; anything else is dispatched to the
 * multilink or the plain receive path according to its protocol field.
 */
static void
ppp_receive_frame(struct ppp *ppp, struct sk_buff *skb, struct channel *pch)
{
        /* note: a 0-length skb is used as an error indication */
        if (skb->len == 0) {
                kfree_skb(skb);
                ppp_receive_error(ppp);
                return;
        }

        skb_checksum_complete_unset(skb);
#ifdef CONFIG_PPP_MULTILINK
        /* XXX do channel-level decompression here */
        if (PPP_PROTO(skb) == PPP_MP) {
                ppp_receive_mp_frame(ppp, skb, pch);
                return;
        }
#endif /* CONFIG_PPP_MULTILINK */
        ppp_receive_nonmp_frame(ppp, skb);
}

/*
 * Account a receive error on the unit's net device and, when a VJ
 * compressor is attached, notify it through slhc_toss().
 */
static void
ppp_receive_error(struct ppp *ppp)
{
        ppp->dev->stats.rx_errors++;

        if (!ppp->vj)
                return;

        slhc_toss(ppp->vj);
}

/*
 * Process a received non-multilink frame: decompress it if required, undo
 * VJ header compression, then either queue it for pppd (control or unknown
 * protocols) or pass it to the network stack.
 *
 * @skb is always consumed.  Failures go through the err label, which frees
 * the frame and records a receive error.
 */
static void
ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb)
{
        struct sk_buff *ns;
        int proto, len, npi;

        /*
         * Decompress the frame, if compressed.
         * Note that some decompressors need to see uncompressed frames
         * that come in as well as compressed frames.
         */
        if (ppp->rc_state && (ppp->rstate & SC_DECOMP_RUN) &&
            (ppp->rstate & (SC_DC_FERROR | SC_DC_ERROR)) == 0)
                skb = ppp_decompress_frame(ppp, skb);

        if (ppp->flags & SC_MUST_COMP && ppp->rstate & SC_DC_FERROR)
                goto err;

        /* At this point the "Protocol" field MUST be decompressed, either in
         * ppp_input(), ppp_decompress_frame() or in ppp_receive_mp_frame().
         */
        proto = PPP_PROTO(skb);
        switch (proto) {
        case PPP_VJC_COMP:
                /* decompress VJ compressed packets */
                if (!ppp->vj || (ppp->flags & SC_REJ_COMP_TCP))
                        goto err;

                if (skb_tailroom(skb) < 124 || skb_cloned(skb)) {
                        /* copy to a new sk_buff with more tailroom */
                        ns = dev_alloc_skb(skb->len + 128);
                        if (!ns) {
                                netdev_err(ppp->dev, "PPP: no memory "
                                           "(VJ decomp)\n");
                                goto err;
                        }
                        skb_reserve(ns, 2);
                        skb_copy_bits(skb, 0, skb_put(ns, skb->len), skb->len);
                        consume_skb(skb);
                        skb = ns;
                }
                else
                        skb->ip_summed = CHECKSUM_NONE;

                len = slhc_uncompress(ppp->vj, skb->data + 2, skb->len - 2);
                if (len <= 0) {
                        netdev_printk(KERN_DEBUG, ppp->dev,
                                      "PPP: VJ decompression error\n");
                        goto err;
                }
                /* account for the 2-byte protocol field again */
                len += 2;
                if (len > skb->len)
                        skb_put(skb, len - skb->len);
                else if (len < skb->len)
                        skb_trim(skb, len);
                proto = PPP_IP;
                break;

        case PPP_VJC_UNCOMP:
                if (!ppp->vj || (ppp->flags & SC_REJ_COMP_TCP))
                        goto err;

                /* Until we fix the decompressor need to make sure
                 * data portion is linear.
                 */
                if (!pskb_may_pull(skb, skb->len))
                        goto err;

                if (slhc_remember(ppp->vj, skb->data + 2, skb->len - 2) <= 0) {
                        netdev_err(ppp->dev, "PPP: VJ uncompressed error\n");
                        goto err;
                }
                proto = PPP_IP;
                break;

        case PPP_CCP:
                ppp_ccp_peek(ppp, skb, 1);
                break;
        }

        ++ppp->stats64.rx_packets;
        ppp->stats64.rx_bytes += skb->len - 2;

        npi = proto_to_npindex(proto);
        if (npi < 0) {
                /* control or unknown frame - pass it to pppd */
                skb_queue_tail(&ppp->file.rq, skb);
                /* limit queue length by dropping old frames */
                while (ppp->file.rq.qlen > PPP_MAX_RQLEN &&
                       (skb = skb_dequeue(&ppp->file.rq)))
                        kfree_skb(skb);
                /* wake up any process polling or blocking on read */
                wake_up_interruptible(&ppp->file.rwait);

        } else {
                /* network protocol frame - give it to the kernel */

#ifdef CONFIG_PPP_FILTER
                /* check if the packet passes the pass and active filters */
                /* the filter instructions are constructed assuming
                   a four-byte PPP header on each packet */
                if (ppp->pass_filter || ppp->active_filter) {
                        u8 *tag;

                        if (skb_unclone(skb, GFP_ATOMIC))
                                goto err;

                        /*
                         * Prepend the 2-byte pseudo header (first byte 0
                         * marks an inbound frame).  Both bytes are written
                         * so the filter program can never read
                         * uninitialised memory.
                         */
                        tag = skb_push(skb, 2);
                        tag[0] = 0;
                        tag[1] = 0;
                        if (ppp->pass_filter &&
                            bpf_prog_run(ppp->pass_filter, skb) == 0) {
                                if (ppp->debug & 1)
                                        netdev_printk(KERN_DEBUG, ppp->dev,
                                                      "PPP: inbound frame "
                                                      "not passed\n");
                                kfree_skb(skb);
                                return;
                        }
                        if (!(ppp->active_filter &&
                              bpf_prog_run(ppp->active_filter, skb) == 0))
                                ppp->last_recv = jiffies;
                        __skb_pull(skb, 2);
                } else
#endif /* CONFIG_PPP_FILTER */
                        ppp->last_recv = jiffies;

                if ((ppp->dev->flags & IFF_UP) == 0 ||
                    ppp->npmode[npi] != NPMODE_PASS) {
                        kfree_skb(skb);
                } else {
                        /* chop off protocol */
                        skb_pull_rcsum(skb, 2);
                        skb->dev = ppp->dev;
                        skb->protocol = htons(npindex_to_ethertype[npi]);
                        skb_reset_mac_header(skb);
                        skb_scrub_packet(skb, !net_eq(ppp->ppp_net,
                                                      dev_net(ppp->dev)));
                        netif_rx(skb);
                }
        }
        return;

 err:
        kfree_skb(skb);
        ppp_receive_error(ppp);
}

/*
 * Run a received frame through the receive-side (de)compressor.
 *
 * A PPP_COMP frame is expanded into a freshly allocated skb, which is
 * returned in place of the (consumed) input skb.  Any other frame is only
 * shown to the decompressor's optional incomp() hook and is returned
 * unchanged.
 *
 * On failure SC_DC_ERROR is set in ppp->rstate, ppp_receive_error() is
 * called and the original skb is handed back to the caller.
 */
static struct sk_buff *
ppp_decompress_frame(struct ppp *ppp, struct sk_buff *skb)
{
        int proto = PPP_PROTO(skb);
        struct sk_buff *out;
        int obuff_size;
        int dlen;

        /* Until all the decompressors are fixed, we need to make sure
         * the data portion is linear.
         */
        if (!pskb_may_pull(skb, skb->len))
                goto err;

        if (proto != PPP_COMP) {
                /* Uncompressed frame - pass to decompressor so it
                   can update its dictionary if necessary. */
                if (ppp->rcomp->incomp)
                        ppp->rcomp->incomp(ppp->rc_state, skb->data - 2,
                                           skb->len + 2);
                return skb;
        }

        /* MPPE gets one extra byte of output room */
        obuff_size = ppp->mru + PPP_HDRLEN;
        if (ppp->rcomp->compress_proto == CI_MPPE)
                obuff_size++;

        out = dev_alloc_skb(obuff_size);
        if (!out) {
                netdev_err(ppp->dev, "ppp_decompress_frame: no memory\n");
                goto err;
        }

        /* the decompressor still expects the A/C bytes in the hdr */
        dlen = ppp->rcomp->decompress(ppp->rc_state, skb->data - 2,
                                      skb->len + 2, out->data, obuff_size);
        if (dlen < 0) {
                /* Pass the compressed frame to pppd as an
                   error indication. */
                if (dlen == DECOMP_FATALERROR)
                        ppp->rstate |= SC_DC_FERROR;
                kfree_skb(out);
                goto err;
        }

        consume_skb(skb);
        skb_put(out, dlen);
        skb_pull(out, 2);        /* pull off the A/C bytes */

        /* Don't call __ppp_decompress_proto() here, but instead rely on
         * corresponding algo (mppe/bsd/deflate) to decompress it.
         */
        return out;

 err:
        ppp->rstate |= SC_DC_ERROR;
        ppp_receive_error(ppp);
        return skb;
}

#ifdef CONFIG_PPP_MULTILINK
/*
 * Receive a multilink frame.
 * We put it on the reconstruction queue and then pull off
 * as many completed frames as we can.
 *
 * @ppp: unit the fragment belongs to
 * @skb: the received fragment; it is either queued on ppp->mrq or freed here
 * @pch: channel the fragment arrived on; its lastseq is updated
 */
static void
ppp_receive_mp_frame(struct ppp *ppp, struct sk_buff *skb, struct channel *pch)
{
        u32 mask, seq;
        struct channel *ch;
        int mphdrlen = (ppp->flags & SC_MP_SHORTSEQ)? MPHDRLEN_SSN: MPHDRLEN;

        /* need the full MP header plus one more byte, and a configured mrru */
        if (!pskb_may_pull(skb, mphdrlen + 1) || ppp->mrru == 0)
                goto err;                /* no good, throw it away */

        /* Decode sequence number and begin/end bits */
        if (ppp->flags & SC_MP_SHORTSEQ) {
                /* short format: 12-bit sequence number */
                seq = ((skb->data[2] & 0x0f) << 8) | skb->data[3];
                mask = 0xfff;
        } else {
                /* long format: 24-bit sequence number */
                seq = (skb->data[3] << 16) | (skb->data[4] << 8)| skb->data[5];
                mask = 0xffffff;
        }
        /* keep the whole flags byte; the B and E bits are tested later */
        PPP_MP_CB(skb)->BEbits = skb->data[2];
        skb_pull(skb, mphdrlen);        /* pull off PPP and MP headers */

        /*
         * Do protocol ID decompression on the first fragment of each packet.
         * We have to do that here, because ppp_receive_nonmp_frame() expects
         * decompressed protocol field.
         */
        if (PPP_MP_CB(skb)->BEbits & B)
                __ppp_decompress_proto(skb);

        /*
         * Expand sequence number to 32 bits, making it as close
         * as possible to ppp->minseq.
         */
        seq |= ppp->minseq & ~mask;
        if ((int)(ppp->minseq - seq) > (int)(mask >> 1))
                seq += mask + 1;
        else if ((int)(seq - ppp->minseq) > (int)(mask >> 1))
                seq -= mask + 1;        /* should never happen */
        PPP_MP_CB(skb)->sequence = seq;
        pch->lastseq = seq;

        /*
         * If this packet comes before the next one we were expecting,
         * drop it.
         */
        if (seq_before(seq, ppp->nextseq)) {
                kfree_skb(skb);
                ++ppp->dev->stats.rx_dropped;
                ppp_receive_error(ppp);
                return;
        }

        /*
         * Reevaluate minseq, the minimum over all channels of the
         * last sequence number received on each channel.  Because of
         * the increasing sequence number rule, we know that any fragment
         * before `minseq' which hasn't arrived is never going to arrive.
         * The list of channels can't change because we have the receive
         * side of the ppp unit locked.
         */
        list_for_each_entry(ch, &ppp->channels, clist) {
                if (seq_before(ch->lastseq, seq))
                        seq = ch->lastseq;
        }
        /* minseq only ever moves forward */
        if (seq_before(ppp->minseq, seq))
                ppp->minseq = seq;

        /* Put the fragment on the reconstruction queue */
        ppp_mp_insert(ppp, skb);

        /* If the queue is getting long, don't wait any longer for packets
           before the start of the queue. */
        if (skb_queue_len(&ppp->mrq) >= PPP_MP_MAX_QLEN) {
                struct sk_buff *mskb = skb_peek(&ppp->mrq);
                if (seq_before(ppp->minseq, PPP_MP_CB(mskb)->sequence))
                        ppp->minseq = PPP_MP_CB(mskb)->sequence;
        }

        /* Pull completed packets off the queue and receive them. */
        while ((skb = ppp_mp_reconstruct(ppp))) {
                /* a reassembled packet must hold at least 2 bytes */
                if (pskb_may_pull(skb, 2))
                        ppp_receive_nonmp_frame(ppp, skb);
                else {
                        ++ppp->dev->stats.rx_length_errors;
                        kfree_skb(skb);
                        ppp_receive_error(ppp);
                }
        }

        return;

 err:
        kfree_skb(skb);
        ppp_receive_error(ppp);
}

/*
 * Insert a fragment on the MP reconstruction queue, keeping the queue
 * ordered by increasing sequence number.
 */
static void
ppp_mp_insert(struct ppp *ppp, struct sk_buff *skb)
{
        struct sk_buff_head *mrq = &ppp->mrq;
        struct sk_buff *pos;

        /* N.B. we don't need to lock the list lock because we have the
           ppp unit receive-side lock. */
        skb_queue_walk(mrq, pos) {
                /* stop at the first queued fragment that sorts after us */
                if (seq_before(PPP_MP_CB(skb)->sequence,
                               PPP_MP_CB(pos)->sequence))
                        break;
        }

        /*
         * When the walk runs to completion, skb_queue_walk() leaves pos at
         * the list head, so inserting before it appends at the tail.
         */
        __skb_queue_before(mrq, pos, skb);
}

/*
 * Reconstruct a packet from the MP fragment queue.
 * We go through increasing sequence numbers until we find a
 * complete packet, or we get to the sequence number for a fragment
 * which hasn't arrived but might still do so.
 *
 * Returns the reassembled packet (the first fragment, with any further
 * fragments chained on its frag_list), or NULL if no complete packet is
 * available.  ppp->nextseq is advanced past whatever is returned or
 * rejected as too long.
 */
static struct sk_buff *
ppp_mp_reconstruct(struct ppp *ppp)
{
        u32 seq = ppp->nextseq;
        u32 minseq = ppp->minseq;
        struct sk_buff_head *list = &ppp->mrq;
        struct sk_buff *p, *tmp;
        struct sk_buff *head, *tail;
        struct sk_buff *skb = NULL;
        int lost = 0, len = 0;

        if (ppp->mrru == 0)        /* do nothing until mrru is set */
                return NULL;
        head = __skb_peek(list);
        tail = NULL;
        skb_queue_walk_safe(list, p, tmp) {
        again:
                if (seq_before(PPP_MP_CB(p)->sequence, seq)) {
                        /* this can't happen, anyway ignore the skb */
                        netdev_err(ppp->dev, "ppp_mp_reconstruct bad "
                                   "seq %u < %u\n",
                                   PPP_MP_CB(p)->sequence, seq);
                        __skb_unlink(p, list);
                        kfree_skb(p);
                        continue;
                }
                if (PPP_MP_CB(p)->sequence != seq) {
                        u32 oldseq;
                        /* Fragment `seq' is missing.  If it is after
                           minseq, it might arrive later, so stop here. */
                        if (seq_after(seq, minseq))
                                break;
                        /* Fragment `seq' is lost, keep going. */
                        lost = 1;
                        oldseq = seq;
                        /* skip ahead, but no further than minseq + 1 */
                        seq = seq_before(minseq, PPP_MP_CB(p)->sequence)?
                                minseq + 1: PPP_MP_CB(p)->sequence;

                        if (ppp->debug & 1)
                                netdev_printk(KERN_DEBUG, ppp->dev,
                                              "lost frag %u..%u\n",
                                              oldseq, seq-1);

                        /* re-examine the same fragment against the new seq */
                        goto again;
                }

                /*
                 * At this point we know that all the fragments from
                 * ppp->nextseq to seq are either present or lost.
                 * Also, there are no complete packets in the queue
                 * that have no missing fragments and end before this
                 * fragment.
                 */

                /* B bit set indicates this fragment starts a packet */
                if (PPP_MP_CB(p)->BEbits & B) {
                        head = p;
                        lost = 0;
                        len = 0;
                }

                /* running length of the packet that starts at head */
                len += p->len;

                /* Got a complete packet yet? */
                if (lost == 0 && (PPP_MP_CB(p)->BEbits & E) &&
                    (PPP_MP_CB(head)->BEbits & B)) {
                        if (len > ppp->mrru + 2) {
                                ++ppp->dev->stats.rx_length_errors;
                                netdev_printk(KERN_DEBUG, ppp->dev,
                                              "PPP: reconstructed packet"
                                              " is too long (%d)\n", len);
                        } else {
                                tail = p;
                                break;
                        }
                        /* only reached for an over-long packet: skip past it */
                        ppp->nextseq = seq + 1;
                }

                /*
                 * If this is the ending fragment of a packet,
                 * and we haven't found a complete valid packet yet,
                 * we can discard up to and including this fragment.
                 */
                if (PPP_MP_CB(p)->BEbits & E) {
                        struct sk_buff *tmp2;

                        skb_queue_reverse_walk_from_safe(list, p, tmp2) {
                                if (ppp->debug & 1)
                                        netdev_printk(KERN_DEBUG, ppp->dev,
                                                      "discarding frag %u\n",
                                                      PPP_MP_CB(p)->sequence);
                                __skb_unlink(p, list);
                                kfree_skb(p);
                        }
                        head = skb_peek(list);
                        if (!head)
                                break;
                }
                ++seq;
        }

        /* If we have a complete packet, chain its fragments into one skb. */
        if (tail != NULL) {
                /* If we have discarded any fragments,
                   signal a receive error. */
                if (PPP_MP_CB(head)->sequence != ppp->nextseq) {
                        /* drop everything still queued ahead of head */
                        skb_queue_walk_safe(list, p, tmp) {
                                if (p == head)
                                        break;
                                if (ppp->debug & 1)
                                        netdev_printk(KERN_DEBUG, ppp->dev,
                                                      "discarding frag %u\n",
                                                      PPP_MP_CB(p)->sequence);
                                __skb_unlink(p, list);
                                kfree_skb(p);
                        }

                        if (ppp->debug & 1)
                                netdev_printk(KERN_DEBUG, ppp->dev,
                                              "  missed pkts %u..%u\n",
                                              ppp->nextseq,
                                              PPP_MP_CB(head)->sequence-1);
                        ++ppp->dev->stats.rx_dropped;
                        ppp_receive_error(ppp);
                }

                skb = head;
                if (head != tail) {
                        /*
                         * Multi-fragment packet: unlink each following
                         * fragment and append it to head's frag_list,
                         * accounting its size on the head skb.
                         */
                        struct sk_buff **fragpp = &skb_shinfo(skb)->frag_list;
                        p = skb_queue_next(list, head);
                        __skb_unlink(skb, list);
                        skb_queue_walk_from_safe(list, p, tmp) {
                                __skb_unlink(p, list);
                                *fragpp = p;
                                p->next = NULL;
                                fragpp = &p->next;

                                skb->len += p->len;
                                skb->data_len += p->len;
                                skb->truesize += p->truesize;

                                if (p == tail)
                                        break;
                        }
                } else {
                        /* single-fragment packet */
                        __skb_unlink(skb, list);
                }

                ppp->nextseq = PPP_MP_CB(tail)->sequence + 1;
        }

        return skb;
}
#endif /* CONFIG_PPP_MULTILINK */

/*
 * Channel interface.
 */

/*
 * Create a new, unattached ppp channel in the network namespace of the
 * calling task.  Returns whatever ppp_register_net_channel() returns.
 */
int ppp_register_channel(struct ppp_channel *chan)
{
        struct net *net = current->nsproxy->net_ns;

        return ppp_register_net_channel(net, chan);
}

/*
 * Create a new, unattached ppp channel for the specified net.
 *
 * Allocates the internal channel state, links it with @chan and puts it
 * on the per-net new_channels list under a freshly assigned index.
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
int ppp_register_net_channel(struct net *net, struct ppp_channel *chan)
{
        struct ppp_net *pn;
        struct channel *pch = kzalloc(sizeof(*pch), GFP_KERNEL);

        if (!pch)
                return -ENOMEM;

        pn = ppp_pernet(net);

        /* not attached to any unit yet */
        pch->ppp = NULL;
        pch->chan = chan;
        pch->chan_net = get_net_track(net, &pch->ns_tracker, GFP_KERNEL);
        chan->ppp = pch;

        init_ppp_file(&pch->file, CHANNEL);
        pch->file.hdrlen = chan->hdrlen;
#ifdef CONFIG_PPP_MULTILINK
        pch->lastseq = -1;
#endif /* CONFIG_PPP_MULTILINK */

        init_rwsem(&pch->chan_sem);
        spin_lock_init(&pch->downl);
        rwlock_init(&pch->upl);

        /* hand out the index and publish the channel under the list lock */
        spin_lock_bh(&pn->all_channels_lock);
        pch->file.index = ++pn->last_channel_index;
        list_add(&pch->list, &pn->new_channels);
        atomic_inc(&channel_count);
        spin_unlock_bh(&pn->all_channels_lock);

        return 0;
}

/*
 * Return the index of a channel, or -1 if @chan has no internal
 * channel state attached.
 */
int ppp_channel_index(struct ppp_channel *chan)
{
        struct channel *pch = chan->ppp;

        return pch ? pch->file.index : -1;
}

/*
 * Return the PPP unit number to which a channel is connected,
 * or -1 if it is not connected to any unit.
 */
int ppp_unit_number(struct ppp_channel *chan)
{
        struct channel *pch = chan->ppp;
        int unit;

        if (!pch)
                return -1;

        /* pch->ppp is read under the upl read lock */
        read_lock_bh(&pch->upl);
        unit = pch->ppp ? pch->ppp->file.index : -1;
        read_unlock_bh(&pch->upl);

        return unit;
}

/*
 * Return the PPP device interface name of a channel, or NULL if the
 * channel is not connected to a unit that has a net device.
 */
char *ppp_dev_name(struct ppp_channel *chan)
{
        struct channel *pch = chan->ppp;
        char *name;

        if (!pch)
                return NULL;

        read_lock_bh(&pch->upl);
        name = (pch->ppp && pch->ppp->dev) ? pch->ppp->dev->name : NULL;
        read_unlock_bh(&pch->upl);

        return name;
}


/*
 * Disconnect a channel from the generic layer.
 * This must be called in process context.
 *
 * Detaches @chan from its internal channel state, disconnects that state
 * from its ppp unit (if any), removes it from the per-net channel list,
 * marks it dead and drops a reference, destroying it if that was the
 * last one.
 */
void
ppp_unregister_channel(struct ppp_channel *chan)
{
        struct channel *pch = chan->ppp;
        struct ppp_net *pn;

        if (!pch)
                return;                /* should never happen */

        chan->ppp = NULL;

        /*
         * This ensures that we have returned from any calls into
         * the channel's start_xmit or ioctl routine before we proceed.
         */
        down_write(&pch->chan_sem);
        spin_lock_bh(&pch->downl);
        pch->chan = NULL;
        spin_unlock_bh(&pch->downl);
        up_write(&pch->chan_sem);
        ppp_disconnect_channel(pch);

        /* take the channel off whichever per-net list it is on */
        pn = ppp_pernet(pch->chan_net);
        spin_lock_bh(&pn->all_channels_lock);
        list_del(&pch->list);
        spin_unlock_bh(&pn->all_channels_lock);

        ppp_unbridge_channels(pch);

        /* mark the file dead and wake anyone sleeping on its rwait queue */
        pch->file.dead = 1;
        wake_up_interruptible(&pch->file.rwait);

        /* drop our reference; free the channel if it was the last one */
        if (refcount_dec_and_test(&pch->file.refcnt))
                ppp_destroy_channel(pch);
}

/*
 * Callback from a channel when it can accept more to transmit.
 * This should be called at BH/softirq level, not interrupt level.
 * Does nothing if @chan has no internal channel state attached.
 */
void
ppp_output_wakeup(struct ppp_channel *chan)
{
        struct channel *pch = chan->ppp;

        if (pch)
                ppp_channel_push(pch);
}

/*
 * Compression control.
 */

/*
 * Process the PPPIOCSCOMPRESS ioctl.
 *
 * Copies the CCP option from user space, looks up (loading a module if
 * needed) the matching compressor and installs freshly allocated state
 * for either the transmit or the receive direction, freeing whatever
 * state was installed before.
 *
 * Returns 0 on success, -EFAULT if the option is too long or cannot be
 * copied, -EINVAL if it is malformed or no compressor is found, and
 * -ENOBUFS if the compressor cannot allocate its state.
 */
static int
ppp_set_compress(struct ppp *ppp, struct ppp_option_data *data)
{
        unsigned char ccp_option[CCP_MAX_OPTION_LENGTH];
        struct compressor *cp, *ocomp;
        void *state, *ostate;

        if (data->length > CCP_MAX_OPTION_LENGTH)
                return -EFAULT;
        if (copy_from_user(ccp_option, data->ptr, data->length))
                return -EFAULT;

        /* ccp_option[1] is the option's own length field */
        if (data->length < 2 || ccp_option[1] < 2 || ccp_option[1] > data->length)
                return -EINVAL;

        cp = try_then_request_module(
                find_compressor(ccp_option[0]),
                "ppp-compress-%d", ccp_option[0]);
        if (!cp)
                return -EINVAL;

        if (data->transmit) {
                state = cp->comp_alloc(ccp_option, data->length);
                if (!state) {
                        /* drop the module reference find_compressor() took */
                        module_put(cp->owner);
                        return -ENOBUFS;
                }

                /* swap in the new compressor with the xmit path locked */
                ppp_xmit_lock(ppp);
                ppp->xstate &= ~SC_COMP_RUN;
                ocomp = ppp->xcomp;
                ostate = ppp->xc_state;
                ppp->xcomp = cp;
                ppp->xc_state = state;
                ppp_xmit_unlock(ppp);

                if (ostate) {
                        ocomp->comp_free(ostate);
                        module_put(ocomp->owner);
                }
                return 0;
        }

        state = cp->decomp_alloc(ccp_option, data->length);
        if (!state) {
                module_put(cp->owner);
                return -ENOBUFS;
        }

        /* swap in the new decompressor with the recv path locked */
        ppp_recv_lock(ppp);
        ppp->rstate &= ~SC_DECOMP_RUN;
        ocomp = ppp->rcomp;
        ostate = ppp->rc_state;
        ppp->rcomp = cp;
        ppp->rc_state = state;
        ppp_recv_unlock(ppp);

        if (ostate) {
                ocomp->decomp_free(ostate);
                module_put(ocomp->owner);
        }
        return 0;
}

/*
 * Look at a CCP packet and update our state accordingly.
 * We assume the caller has the xmit or recv path locked.
 */
static void
ppp_ccp_peek(struct ppp *ppp, struct sk_buff *skb, int inbound)
{
        unsigned char *dp;
        int len;

        if (!pskb_may_pull(skb, CCP_HDRLEN + 2))
                return;        /* no header */
        dp = skb->data + 2;

        switch (CCP_CODE(dp)) {
        case CCP_CONFREQ:

                /* A ConfReq starts negotiation of compression
                 * in one direction of transmission,
                 * and hence brings it down...but which way?
                 *
                 * Remember:
                 * A ConfReq indicates what the sender would like to receive
                 */
                if(inbound)
                        /* He is proposing what I should send */
                        ppp->xstate &= ~SC_COMP_RUN;
                else
                        /* I am proposing to what he should send */
                        ppp->rstate &= ~SC_DECOMP_RUN;

                break;

        case CCP_TERMREQ:
        case CCP_TERMACK:
                /*
                 * CCP is going down, both directions of transmission
                 */
                ppp->rstate &= ~SC_DECOMP_RUN;
                ppp->xstate &= ~SC_COMP_RUN;
                break;

        case CCP_CONFACK:
                if ((ppp->flags & (SC_CCP_OPEN | SC_CCP_UP)) != SC_CCP_OPEN)
                        break;
                len = CCP_LENGTH(dp);
                if (!pskb_may_pull(skb, len + 2))
                        return;                /* too short */
                dp += CCP_HDRLEN;
                len -= CCP_HDRLEN;
                if (len < CCP_OPT_MINLEN || len < CCP_OPT_LENGTH(dp))
                        break;
                if (inbound) {
                        /* we will start receiving compressed packets */
                        if (!ppp->rc_state)
                                break;
                        if (ppp->rcomp->decomp_init(ppp->rc_state, dp, len,
                                        ppp->file.index, 0, ppp->mru, ppp->debug)) {
                                ppp->rstate |= SC_DECOMP_RUN;
                                ppp->rstate &= ~(SC_DC_ERROR | SC_DC_FERROR);
                        }
                } else {
                        /* we will soon start sending compressed packets */
                        if (!ppp->xc_state)
                                break;
                        if (ppp->xcomp->comp_init(ppp->xc_state, dp, len,
                                        ppp->file.index, 0, ppp->debug))
                                ppp->xstate |= SC_COMP_RUN;
                }
                break;

        case CCP_RESETACK:
                /* reset the [de]compressor */
                if ((ppp->flags & SC_CCP_UP) == 0)
                        break;
                if (inbound) {
                        if (ppp->rc_state && (ppp->rstate & SC_DECOMP_RUN)) {
                                ppp->rcomp->decomp_reset(ppp->rc_state);
                                ppp->rstate &= ~SC_DC_ERROR;
                        }
                } else {
                        if (ppp->xc_state && (ppp->xstate & SC_COMP_RUN))
                                ppp->xcomp->comp_reset(ppp->xc_state);
                }
                break;
        }
}

/*
 * Free up compression resources.
 *
 * Both directions' state pointers are detached from the unit with the
 * unit locked; the actual freeing and module_put() happen after unlock.
 */
static void
ppp_ccp_closed(struct ppp *ppp)
{
        struct compressor *tx_comp, *rx_comp;
        void *tx_state, *rx_state;

        ppp_lock(ppp);
        ppp->flags &= ~(SC_CCP_OPEN | SC_CCP_UP);

        /* transmit side */
        ppp->xstate = 0;
        tx_comp = ppp->xcomp;
        tx_state = ppp->xc_state;
        ppp->xc_state = NULL;

        /* receive side */
        ppp->rstate = 0;
        rx_comp = ppp->rcomp;
        rx_state = ppp->rc_state;
        ppp->rc_state = NULL;
        ppp_unlock(ppp);

        if (tx_state) {
                tx_comp->comp_free(tx_state);
                module_put(tx_comp->owner);
        }
        if (rx_state) {
                rx_comp->decomp_free(rx_state);
                module_put(rx_comp->owner);
        }
}

/*
 * List of compressors.
 * compressor_list holds one compressor_entry per registered compressor;
 * the functions below take compressor_list_lock around every lookup,
 * insertion and removal.
 */
static LIST_HEAD(compressor_list);
static DEFINE_SPINLOCK(compressor_list_lock);

/* One node of compressor_list. */
struct compressor_entry {
        struct list_head list;          /* linkage on compressor_list */
        struct compressor *comp;        /* the registered compressor */
};

/*
 * Return the compressor_list entry whose compressor handles @proto,
 * or NULL if there is none.  Does no locking of its own.
 */
static struct compressor_entry *
find_comp_entry(int proto)
{
        struct compressor_entry *entry;

        list_for_each_entry(entry, &compressor_list, list) {
                if (entry->comp->compress_proto != proto)
                        continue;
                return entry;
        }

        return NULL;
}

/*
 * Register a compressor.
 * Returns 0 on success, -EEXIST if a compressor for the same protocol
 * is already registered, or -ENOMEM if the list entry cannot be allocated.
 */
int
ppp_register_compressor(struct compressor *cp)
{
        struct compressor_entry *ce;
        int ret = 0;

        spin_lock(&compressor_list_lock);

        if (find_comp_entry(cp->compress_proto)) {
                ret = -EEXIST;
                goto unlock;
        }

        /* allocated with GFP_ATOMIC while the spinlock is held */
        ce = kmalloc(sizeof(*ce), GFP_ATOMIC);
        if (!ce) {
                ret = -ENOMEM;
                goto unlock;
        }

        ce->comp = cp;
        list_add(&ce->list, &compressor_list);

 unlock:
        spin_unlock(&compressor_list_lock);
        return ret;
}

/*
 * Unregister a compressor.
 * The entry is removed only if the one registered for cp's protocol
 * is cp itself; otherwise nothing happens.
 */
void
ppp_unregister_compressor(struct compressor *cp)
{
        struct compressor_entry *ce;

        spin_lock(&compressor_list_lock);

        ce = find_comp_entry(cp->compress_proto);
        if (!ce || ce->comp != cp)
                goto unlock;

        list_del(&ce->list);
        kfree(ce);

 unlock:
        spin_unlock(&compressor_list_lock);
}

/*
 * Find a compressor.
 * Returns the compressor registered for @type with a reference taken on
 * its owning module, or NULL if none is registered or the module
 * reference cannot be obtained.
 */
static struct compressor *
find_compressor(int type)
{
        struct compressor *found = NULL;
        struct compressor_entry *ce;

        spin_lock(&compressor_list_lock);
        ce = find_comp_entry(type);
        if (ce && try_module_get(ce->comp->owner))
                found = ce->comp;
        spin_unlock(&compressor_list_lock);

        return found;
}

/*
 * Miscellaneous stuff.
 */

/*
 * Fill @st with the unit's packet/byte/error counters and, when a VJ
 * compressor is attached, its statistics.  @st is zeroed first, so the
 * VJ part stays all-zero when ppp->vj is NULL.
 */
static void
ppp_get_stats(struct ppp *ppp, struct ppp_stats *st)
{
        struct slcompress *vj = ppp->vj;

        memset(st, 0, sizeof(*st));

        /* receive side */
        st->p.ppp_ipackets = ppp->stats64.rx_packets;
        st->p.ppp_ierrors = ppp->dev->stats.rx_errors;
        st->p.ppp_ibytes = ppp->stats64.rx_bytes;

        /* transmit side */
        st->p.ppp_opackets = ppp->stats64.tx_packets;
        st->p.ppp_oerrors = ppp->dev->stats.tx_errors;
        st->p.ppp_obytes = ppp->stats64.tx_bytes;

        if (vj) {
                st->vj.vjs_packets = vj->sls_o_compressed +
                                     vj->sls_o_uncompressed;
                st->vj.vjs_compressed = vj->sls_o_compressed;
                st->vj.vjs_searches = vj->sls_o_searches;
                st->vj.vjs_misses = vj->sls_o_misses;
                st->vj.vjs_errorin = vj->sls_i_error;
                st->vj.vjs_tossed = vj->sls_i_tossed;
                st->vj.vjs_uncompressedin = vj->sls_i_uncompressed;
                st->vj.vjs_compressedin = vj->sls_i_compressed;
        }
}

/*
 * Stuff for handling the lists of ppp units and channels
 * and for initialization.
 */

/*
 * Create a new ppp interface unit.  Fails if it can't allocate memory
 * or if there is already a unit with the requested number.
 * unit == -1 means allocate a new number.
 *
 * On success the unit number actually assigned is written back through
 * @unit and 0 is returned; otherwise a negative errno is returned.
 */
static int ppp_create_interface(struct net *net, struct file *file, int *unit)
{
        struct ppp_config conf = {
                .file = file,
                .unit = *unit,
                .ifname_is_set = false,
        };
        struct net_device *dev;
        struct ppp *ppp;
        int err;

        dev = alloc_netdev(sizeof(struct ppp), "", NET_NAME_ENUM, ppp_setup);
        if (!dev)
                return -ENOMEM;

        dev_net_set(dev, net);
        dev->rtnl_link_ops = &ppp_link_ops;

        /* the device is configured with the rtnl lock held */
        rtnl_lock();
        err = ppp_dev_configure(net, dev, &conf);
        if (err < 0) {
                rtnl_unlock();
                free_netdev(dev);
                return err;
        }

        ppp = netdev_priv(dev);
        *unit = ppp->file.index;
        rtnl_unlock();

        return 0;
}

/*
 * Initialize a ppp_file structure: record its kind, set up empty
 * transmit and receive queues and the read wait queue, and start the
 * reference count at 1.
 */
static void
init_ppp_file(struct ppp_file *pf, int kind)
{
        pf->kind = kind;
        refcount_set(&pf->refcnt, 1);

        skb_queue_head_init(&pf->xq);
        skb_queue_head_init(&pf->rq);
        init_waitqueue_head(&pf->rwait);
}

/*
 * Free the memory used by a ppp unit.  This is only called once
 * there are no channels connected to the unit and no file structs
 * that reference the unit.
 */
static void ppp_destroy_interface(struct ppp *ppp)
{
        struct net_device *dev = ppp->dev;

        atomic_dec(&ppp_unit_count);

        if (!ppp->file.dead || ppp->n_channels) {
                /* "can't happen" - complain and leave everything in place */
                netdev_err(dev,
                           "ppp: destroying ppp struct %p but dead=%d n_channels=%d !\n",
                           ppp, ppp->file.dead, ppp->n_channels);
                return;
        }

        /* release compressor and VJ state */
        ppp_ccp_closed(ppp);
        if (ppp->vj) {
                slhc_free(ppp->vj);
                ppp->vj = NULL;
        }

        /* drop anything still queued */
        skb_queue_purge(&ppp->file.xq);
        skb_queue_purge(&ppp->file.rq);
#ifdef CONFIG_PPP_MULTILINK
        skb_queue_purge(&ppp->mrq);
#endif /* CONFIG_PPP_MULTILINK */

#ifdef CONFIG_PPP_FILTER
        if (ppp->pass_filter) {
                bpf_prog_destroy(ppp->pass_filter);
                ppp->pass_filter = NULL;
        }
        if (ppp->active_filter) {
                bpf_prog_destroy(ppp->active_filter);
                ppp->active_filter = NULL;
        }
#endif /* CONFIG_PPP_FILTER */

        kfree_skb(ppp->xmit_pending);
        free_percpu(ppp->xmit_recursion);

        free_netdev(dev);
}

/*
 * Locate an existing ppp unit.
 * The caller should have locked the all_ppp_mutex.
 * Returns whatever unit_find() yields for @unit in the per-net idr.
 */
static struct ppp *
ppp_find_unit(struct ppp_net *pn, int unit)
{
        struct ppp *found = unit_find(&pn->units_idr, unit);

        return found;
}

/*
 * Locate an existing ppp channel.
 * The caller should have locked the all_channels_lock.
 * First we look in the new_channels list, then in the
 * all_channels list.  If found in the new_channels list,
 * we move it to the all_channels list.  This is for speed
 * when we have a lot of channels in use.
 * Returns NULL if no channel has index @unit.
 */
static struct channel *
ppp_find_channel(struct ppp_net *pn, int unit)
{
        struct channel *cur;

        list_for_each_entry(cur, &pn->new_channels, list) {
                if (cur->file.index != unit)
                        continue;
                /* found among the new channels: move it over */
                list_move(&cur->list, &pn->all_channels);
                return cur;
        }

        list_for_each_entry(cur, &pn->all_channels, list) {
                if (cur->file.index == unit)
                        return cur;
        }

        return NULL;
}

/*
 * Connect a PPP channel to a PPP interface unit.
 *
 * @pch:  channel to attach
 * @unit: number of the unit to attach it to, looked up in the channel's
 *        network namespace
 *
 * Returns 0 on success, -ENXIO if no such unit exists, -EINVAL if the
 * channel is already connected to a unit or is bridged, and -ENOTCONN
 * if the channel has been unregistered (pch->chan is NULL).
 */
static int
ppp_connect_channel(struct channel *pch, int unit)
{
        struct ppp *ppp;
        struct ppp_net *pn;
        int ret = -ENXIO;
        int hdrlen;

        pn = ppp_pernet(pch->chan_net);

        mutex_lock(&pn->all_ppp_mutex);
        ppp = ppp_find_unit(pn, unit);
        if (!ppp)
                goto out;
        write_lock_bh(&pch->upl);
        ret = -EINVAL;
        /* refuse a channel that already has a unit or a bridge peer */
        if (pch->ppp ||
            rcu_dereference_protected(pch->bridge, lockdep_is_held(&pch->upl)))
                goto outl;

        ppp_lock(ppp);
        spin_lock_bh(&pch->downl);
        if (!pch->chan) {
                /* Don't connect unregistered channels */
                spin_unlock_bh(&pch->downl);
                ppp_unlock(ppp);
                ret = -ENOTCONN;
                goto outl;
        }
        spin_unlock_bh(&pch->downl);
        /* grow the unit's header room to fit this channel if necessary */
        if (pch->file.hdrlen > ppp->file.hdrlen)
                ppp->file.hdrlen = pch->file.hdrlen;
        hdrlen = pch->file.hdrlen + 2;        /* for protocol bytes */
        if (hdrlen > ppp->dev->hard_header_len)
                ppp->dev->hard_header_len = hdrlen;
        list_add_tail(&pch->clist, &ppp->channels);
        ++ppp->n_channels;
        pch->ppp = ppp;
        /* the channel now holds a reference on the unit */
        refcount_inc(&ppp->file.refcnt);
        ppp_unlock(ppp);
        ret = 0;

 outl:
        write_unlock_bh(&pch->upl);
 out:
        mutex_unlock(&pn->all_ppp_mutex);
        return ret;
}

/*
 * Disconnect a channel from its ppp unit.
 * Returns 0 if the channel was connected, -EINVAL if it was not.
 */
static int
ppp_disconnect_channel(struct channel *pch)
{
        struct ppp *ppp;

        /* detach the unit pointer under the upl write lock */
        write_lock_bh(&pch->upl);
        ppp = pch->ppp;
        pch->ppp = NULL;
        write_unlock_bh(&pch->upl);

        if (!ppp)
                return -EINVAL;

        /* remove it from the ppp unit's list */
        ppp_lock(ppp);
        list_del(&pch->clist);
        if (--ppp->n_channels == 0)
                wake_up_interruptible(&ppp->file.rwait);
        ppp_unlock(ppp);

        /* drop a unit reference; destroy the unit if it was the last */
        if (refcount_dec_and_test(&ppp->file.refcnt))
                ppp_destroy_interface(ppp);

        return 0;
}

/*
 * Free up the resources used by a ppp channel.
 * The net namespace reference and the channel count are released
 * unconditionally; the queues and the structure itself are freed only
 * if the channel has been marked dead.
 */
static void ppp_destroy_channel(struct channel *pch)
{
        put_net_track(pch->chan_net, &pch->ns_tracker);
        pch->chan_net = NULL;

        atomic_dec(&channel_count);

        if (pch->file.dead) {
                skb_queue_purge(&pch->file.xq);
                skb_queue_purge(&pch->file.rq);
                kfree(pch);
                return;
        }

        /* "can't happen" */
        pr_err("ppp: destroying undead channel %p !\n", pch);
}

/*
 * Module exit handler.
 *
 * Logs an error if any ppp unit or channel is still counted, then undoes
 * the module's registrations: the rtnl link ops, the "ppp" character
 * device, the device node and class, and the per-net operations.
 */
static void __exit ppp_cleanup(void)
{
        /* should never happen */
        if (atomic_read(&ppp_unit_count) || atomic_read(&channel_count))
                pr_err("PPP: removing module but units remain!\n");
        rtnl_link_unregister(&ppp_link_ops);
        unregister_chrdev(PPP_MAJOR, "ppp");
        device_destroy(&ppp_class, MKDEV(PPP_MAJOR, 0));
        class_unregister(&ppp_class);
        unregister_pernet_device(&ppp_net_ops);
}

/*
 * Units handling. Caller must protect concurrent access
 * by holding all_ppp_mutex
 */

/*
 * Bind @ptr to exactly unit number @n in the idr @p.
 *
 * The allocation range is restricted to [n, n + 1). Returns the allocated
 * number on success; an -ENOSPC result from idr_alloc() is reported as
 * -EINVAL, any other negative error is passed through unchanged.
 */
static int unit_set(struct idr *p, void *ptr, int n)
{
        int ret = idr_alloc(p, ptr, n, n + 1, GFP_KERNEL);

        return ret == -ENOSPC ? -EINVAL : ret;
}

/*
 * Get a new free unit number, not lower than @min, and associate @ptr
 * with it. Returns the result of idr_alloc(): the allocated number, or a
 * negative error code.
 * NOTE(review): the end argument of 0 presumably means "no upper limit" -
 * confirm against the idr_alloc() documentation.
 */
static int unit_get(struct idr *p, void *ptr, int min)
{
        return idr_alloc(p, ptr, min, 0, GFP_KERNEL);
}

/*
 * Put unit number @n back into the pool by removing its entry from the
 * idr @p. The value returned by idr_remove() is ignored.
 */
static void unit_put(struct idr *p, int n)
{
        idr_remove(p, n);
}

/*
 * Get the pointer associated with unit number @n in the idr @p.
 * Returns whatever idr_find() returns for that number.
 */
static void *unit_find(struct idr *p, int n)
{
        return idr_find(p, n);
}

/* Module/initialization stuff */

/* Module entry and exit handlers */
module_init(ppp_init);
module_exit(ppp_cleanup);

/* Symbols exported by this module */
EXPORT_SYMBOL(ppp_register_net_channel);
EXPORT_SYMBOL(ppp_register_channel);
EXPORT_SYMBOL(ppp_unregister_channel);
EXPORT_SYMBOL(ppp_channel_index);
EXPORT_SYMBOL(ppp_unit_number);
EXPORT_SYMBOL(ppp_dev_name);
EXPORT_SYMBOL(ppp_input);
EXPORT_SYMBOL(ppp_input_error);
EXPORT_SYMBOL(ppp_output_wakeup);
EXPORT_SYMBOL(ppp_register_compressor);
EXPORT_SYMBOL(ppp_unregister_compressor);
/* Module metadata and aliases */
MODULE_DESCRIPTION("Generic PPP layer driver");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CHARDEV(PPP_MAJOR, 0);
MODULE_ALIAS_RTNL_LINK("ppp");
MODULE_ALIAS("devname:ppp");




































































































































    1 







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






    1 




    1 













































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
// SPDX-License-Identifier: GPL-2.0+
/*
 * 2002-10-15  Posix Clocks & timers
 *                           by George Anzinger george@mvista.com
 *                             Copyright (C) 2002 2003 by MontaVista Software.
 *
 * 2004-06-01  Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug.
 *                             Copyright (C) 2004 Boris Hu
 *
 * These are all the functions necessary to implement POSIX clocks & timers
 */
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/mutex.h>
#include <linux/sched/task.h>

#include <linux/uaccess.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/hash.h>
#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <linux/export.h>
#include <linux/hashtable.h>
#include <linux/compat.h>
#include <linux/nospec.h>
#include <linux/time_namespace.h>

#include "timekeeping.h"
#include "posix-timers.h"

/* Slab cache for struct k_itimer objects; created in init_posix_timers() */
static struct kmem_cache *posix_timers_cache;

/*
 * Timers are managed in a hash table for lockless lookup. The hash key is
 * constructed from current::signal and the timer ID and the timer is
 * matched against current::signal and the timer ID when walking the hash
 * bucket list.
 *
 * This allows checkpoint/restore to reconstruct the exact timer IDs for
 * a process.
 */
static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
/* Held while timers are inserted into or removed from the hash table */
static DEFINE_SPINLOCK(hash_lock);

/*
 * Forward declarations: clockid_to_kclock() is called before its
 * definition (e.g. by do_timer_create()).
 */
static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
static const struct k_clock clock_realtime, clock_monotonic;

/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
                        ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
#endif

static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);

/*
 * lock_timer() - look up and lock the timer with ID @tid
 *
 * Evaluates to the pointer returned by __lock_timer(): the timer with its
 * it_lock held and the interrupt state saved in @flags, or NULL if no
 * valid timer was found. The call is wrapped in __cond_lock()
 * (NOTE(review): presumably a static-checker annotation for the
 * conditionally acquired lock - confirm).
 */
#define lock_timer(tid, flags)                                                   \
({        struct k_itimer *__timr;                                           \
        __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags));  \
        __timr;                                                                   \
})

/*
 * Compute the hash table bucket index for timer ID @nr of signal struct
 * @sig. The key mixes the pointer hash of @sig with the timer ID.
 */
static int hash(struct signal_struct *sig, unsigned int nr)
{
        unsigned int key = hash32_ptr(sig) ^ nr;

        return hash_32(key, HASH_BITS(posix_timers_hashtable));
}

/*
 * Walk the hash bucket @head and return the timer which belongs to @sig
 * and carries the ID @id, or NULL when the bucket holds no such timer.
 */
static struct k_itimer *__posix_timers_find(struct hlist_head *head,
                                            struct signal_struct *sig,
                                            timer_t id)
{
        struct k_itimer *tmr;

        hlist_for_each_entry_rcu(tmr, head, t_hash, lockdep_is_held(&hash_lock)) {
                /* it_signal can be written concurrently, hence READ_ONCE() */
                if (READ_ONCE(tmr->it_signal) != sig)
                        continue;
                if (tmr->it_id == id)
                        return tmr;
        }

        return NULL;
}

/*
 * Look up the timer with ID @id which belongs to the signal struct of the
 * current task. Returns NULL when there is no match.
 */
static struct k_itimer *posix_timer_by_id(timer_t id)
{
        struct signal_struct *sig = current->signal;
        int bucket = hash(sig, id);

        return __posix_timers_find(&posix_timers_hashtable[bucket], sig, id);
}

/*
 * Pick a timer ID which is unused within the current task's signal struct
 * and insert @timer into the matching hash bucket.
 *
 * Returns the chosen ID (>= 0) on success or -EAGAIN when every candidate
 * ID was already taken.
 */
static int posix_timer_add(struct k_itimer *timer)
{
        struct signal_struct *sig = current->signal;
        struct k_itimer *clash;
        struct hlist_head *bucket;
        unsigned int cnt, id;

        /*
         * FIXME: Replace this by a per signal struct xarray once there is
         * a plan to handle the resulting CRIU regression gracefully.
         */
        for (cnt = 0; cnt <= INT_MAX; cnt++) {
                spin_lock(&hash_lock);

                /* Take the next candidate and advance, staying in positive int space */
                id = sig->next_posix_timer_id;
                sig->next_posix_timer_id = (id + 1) & INT_MAX;

                bucket = &posix_timers_hashtable[hash(sig, id)];
                clash = __posix_timers_find(bucket, sig, id);
                if (!clash)
                        hlist_add_head_rcu(&timer->t_hash, bucket);

                spin_unlock(&hash_lock);

                if (!clash)
                        return id;
        }

        /* POSIX return code when no timer ID could be allocated */
        return -EAGAIN;
}

/*
 * Release timr::it_lock and restore the interrupt state saved in @flags.
 * Counterpart of lock_timer() / spin_lock_irqsave() on the same lock.
 */
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
{
        spin_unlock_irqrestore(&timr->it_lock, flags);
}

/*
 * Store the time obtained from ktime_get_real_ts64() in @tp.
 * @which_clock is not used. Always returns 0.
 */
static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
{
        ktime_get_real_ts64(tp);
        return 0;
}

/* Return the value of ktime_get_real(). @which_clock is not used. */
static ktime_t posix_get_realtime_ktime(clockid_t which_clock)
{
        return ktime_get_real();
}

/*
 * Set the time to @tp by handing it to do_sys_settimeofday64() with a
 * NULL second argument. Returns that function's result. @which_clock is
 * not used.
 */
static int posix_clock_realtime_set(const clockid_t which_clock,
                                    const struct timespec64 *tp)
{
        return do_sys_settimeofday64(tp, NULL);
}

/*
 * Forward the adjustment request @t to do_adjtimex() and return its
 * result. @which_clock is not used.
 */
static int posix_clock_realtime_adj(const clockid_t which_clock,
                                    struct __kernel_timex *t)
{
        return do_adjtimex(t);
}

/*
 * Store the time obtained from ktime_get_ts64() in @tp and pass it through
 * timens_add_monotonic() (NOTE(review): presumably applies the time
 * namespace offset - confirm). @which_clock is not used. Always returns 0.
 */
static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp)
{
        ktime_get_ts64(tp);
        timens_add_monotonic(tp);
        return 0;
}

/*
 * Return the value of ktime_get(). Unlike the timespec variant no
 * timens_add_monotonic() adjustment is made here. @which_clock is not used.
 */
static ktime_t posix_get_monotonic_ktime(clockid_t which_clock)
{
        return ktime_get();
}

/*
 * Store the time obtained from ktime_get_raw_ts64() in @tp and pass it
 * through timens_add_monotonic(). @which_clock is not used. Always
 * returns 0.
 */
static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
{
        ktime_get_raw_ts64(tp);
        timens_add_monotonic(tp);
        return 0;
}

/*
 * Store the time obtained from ktime_get_coarse_real_ts64() in @tp.
 * @which_clock is not used. Always returns 0.
 */
static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp)
{
        ktime_get_coarse_real_ts64(tp);
        return 0;
}

/*
 * Store the time obtained from ktime_get_coarse_ts64() in @tp and pass it
 * through timens_add_monotonic(). @which_clock is not used. Always
 * returns 0.
 */
static int posix_get_monotonic_coarse(clockid_t which_clock,
                                                struct timespec64 *tp)
{
        ktime_get_coarse_ts64(tp);
        timens_add_monotonic(tp);
        return 0;
}

/*
 * Report KTIME_LOW_RES, converted to a timespec64, as the resolution in
 * @tp. @which_clock is not used. Always returns 0.
 */
static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp)
{
        *tp = ktime_to_timespec64(KTIME_LOW_RES);
        return 0;
}

/*
 * Store the time obtained from ktime_get_boottime_ts64() in @tp and pass
 * it through timens_add_boottime(). @which_clock is not used. Always
 * returns 0.
 */
static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp)
{
        ktime_get_boottime_ts64(tp);
        timens_add_boottime(tp);
        return 0;
}

/* Return the value of ktime_get_boottime(). @which_clock is not used. */
static ktime_t posix_get_boottime_ktime(const clockid_t which_clock)
{
        return ktime_get_boottime();
}

/*
 * Store the time obtained from ktime_get_clocktai_ts64() in @tp.
 * @which_clock is not used. Always returns 0.
 */
static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp)
{
        ktime_get_clocktai_ts64(tp);
        return 0;
}

/* Return the value of ktime_get_clocktai(). @which_clock is not used. */
static ktime_t posix_get_tai_ktime(clockid_t which_clock)
{
        return ktime_get_clocktai();
}

/*
 * Report the resolution in @tp as zero seconds plus hrtimer_resolution
 * nanoseconds. @which_clock is not used. Always returns 0.
 */
static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
{
        tp->tv_sec = 0;
        tp->tv_nsec = hrtimer_resolution;
        return 0;
}

/*
 * Initcall which creates the slab cache used for struct k_itimer
 * allocations. The result of kmem_cache_create() is not checked here;
 * SLAB_PANIC is requested (NOTE(review): presumably so that a failure
 * panics instead of returning NULL - confirm). Always returns 0.
 */
static __init int init_posix_timers(void)
{
        posix_timers_cache = kmem_cache_create("posix_timers_cache",
                                        sizeof(struct k_itimer), 0,
                                        SLAB_PANIC | SLAB_ACCOUNT, NULL);
        return 0;
}
__initcall(init_posix_timers);

/*
 * The siginfo si_overrun field and the return value of timer_getoverrun(2)
 * are of type int. Clamp the overrun value to INT_MAX
 */
static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval)
{
        s64 sum = timr->it_overrun_last + (s64)baseval;

        return sum > (s64)INT_MAX ? INT_MAX : (int)sum;
}

/*
 * Rearm the hrtimer behind @timr: forward its expiry by the timer
 * interval relative to the current time of the hrtimer's clock base, add
 * the value returned by hrtimer_forward() to the overrun count, and
 * restart the hrtimer.
 */
static void common_hrtimer_rearm(struct k_itimer *timr)
{
        struct hrtimer *hrt = &timr->it.real.timer;

        timr->it_overrun += hrtimer_forward(hrt, hrt->base->get_time(),
                                            timr->it_interval);
        hrtimer_restart(hrt);
}

/*
 * Called from the signal delivery code when info->si_sys_private is not
 * zero, i.e. when the timer has to be rearmed. Restarts the timer and
 * updates info::si_overrun.
 *
 * Nothing is done when the timer cannot be found, has no interval, or
 * when its requeue sequence does not match info->si_sys_private.
 */
void posixtimer_rearm(struct kernel_siginfo *info)
{
        unsigned long flags;
        struct k_itimer *timr;

        timr = lock_timer(info->si_tid, &flags);
        if (!timr)
                return;

        if (!timr->it_interval || timr->it_requeue_pending != info->si_sys_private)
                goto out_unlock;

        timr->kclock->timer_rearm(timr);

        timr->it_active = 1;
        timr->it_overrun_last = timr->it_overrun;
        timr->it_overrun = -1LL;
        timr->it_requeue_pending++;

        info->si_overrun = timer_overrun_to_int(timr, info->si_overrun);

out_unlock:
        unlock_timer(timr, flags);
}

/*
 * Queue the timer's preallocated signal.
 *
 * @si_private is stored in the queued siginfo's si_sys_private field. The
 * signal is directed at a single thread when SIGEV_THREAD_ID is set in the
 * timer's notification mode, otherwise at the thread group.
 *
 * Returns non-zero when send_sigqueue() returned a positive value, zero
 * otherwise.
 */
int posix_timer_event(struct k_itimer *timr, int si_private)
{
        enum pid_type type;

        /*
         * FIXME: if ->sigq is queued we can race with
         * dequeue_signal()->posixtimer_rearm().
         *
         * When dequeue_signal() observes the "right" value of
         * si_sys_private it calls posixtimer_rearm(). We re-queue ->sigq
         * and drop ->it_lock(). posixtimer_rearm() then locks the timer
         * and re-schedules it while ->sigq is pending. Not really bad,
         * but not what we want.
         */
        timr->sigq->info.si_sys_private = si_private;

        if (timr->it_sigev_notify & SIGEV_THREAD_ID)
                type = PIDTYPE_PID;
        else
                type = PIDTYPE_TGID;

        /* If we failed to send the signal the timer stops. */
        return send_sigqueue(timr->sigq, timr->it_pid, type) > 0;
}

/*
 * This function gets called when a POSIX.1b interval timer expires from
 * the HRTIMER interrupt (soft interrupt on RT kernels).
 *
 * Handles CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI
 * based timers.
 */
static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
{
        enum hrtimer_restart ret = HRTIMER_NORESTART;
        struct k_itimer *timr;
        unsigned long flags;
        int si_private = 0;

        timr = container_of(timer, struct k_itimer, it.real.timer);
        spin_lock_irqsave(&timr->it_lock, flags);

        timr->it_active = 0;
        if (timr->it_interval != 0)
                si_private = ++timr->it_requeue_pending;

        if (posix_timer_event(timr, si_private)) {
                /*
                 * The signal was not queued due to SIG_IGN. As a
                 * consequence the timer is not going to be rearmed from
                 * the signal delivery path. But as a real signal handler
                 * can be installed later the timer must be rearmed here.
                 */
                if (timr->it_interval != 0) {
                        ktime_t now = hrtimer_cb_get_time(timer);

                        /*
                         * FIXME: What we really want, is to stop this
                         * timer completely and restart it in case the
                         * SIG_IGN is removed. This is a non trivial
                         * change to the signal handling code.
                         *
                         * For now let timers with an interval less than a
                         * jiffie expire every jiffie and recheck for a
                         * valid signal handler.
                         *
                         * This avoids interrupt starvation in case of a
                         * very small interval, which would expire the
                         * timer immediately again.
                         *
                         * Moving now ahead of time by one jiffie tricks
                         * hrtimer_forward() to expire the timer later,
                         * while it still maintains the overrun accuracy
                         * for the price of a slight inconsistency in the
                         * timer_gettime() case. This is at least better
                         * than a timer storm.
                         *
                         * Only required when high resolution timers are
                         * enabled as the periodic tick based timers are
                         * automatically aligned to the next tick.
                         */
                        if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS)) {
                                ktime_t kj = TICK_NSEC;

                                if (timr->it_interval < kj)
                                        now = ktime_add(now, kj);
                        }

                        timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval);
                        ret = HRTIMER_RESTART;
                        ++timr->it_requeue_pending;
                        timr->it_active = 1;
                }
        }

        unlock_timer(timr, flags);
        return ret;
}

/*
 * Validate @event and return the pid the timer signal should go to.
 *
 *  - SIGEV_NONE: the thread group pid of current, no further checks.
 *  - SIGEV_SIGNAL, SIGEV_THREAD: the thread group pid of current,
 *    provided sigev_signo is within 1..SIGRTMAX.
 *  - SIGEV_SIGNAL | SIGEV_THREAD_ID: the pid looked up from
 *    sigev_notify_thread_id, provided it maps to a task in the thread
 *    group of current and sigev_signo is within 1..SIGRTMAX.
 *
 * Returns NULL for any other notification mode or when a check fails.
 */
static struct pid *good_sigevent(sigevent_t * event)
{
        struct pid *pid = task_tgid(current);
        struct task_struct *rtn;
        int notify = event->sigev_notify;

        if (notify == SIGEV_NONE)
                return pid;

        if (notify == (SIGEV_SIGNAL | SIGEV_THREAD_ID)) {
                /* The target thread must belong to the caller's thread group */
                pid = find_vpid(event->sigev_notify_thread_id);
                rtn = pid_task(pid, PIDTYPE_PID);
                if (!rtn || !same_thread_group(rtn, current))
                        return NULL;
        } else if (notify != SIGEV_SIGNAL && notify != SIGEV_THREAD) {
                return NULL;
        }

        if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
                return NULL;

        return pid;
}

/*
 * Allocate a zeroed k_itimer together with its signal queue entry and
 * clear the siginfo embedded in that entry.
 *
 * Returns the new timer, or NULL when either allocation fails.
 */
static struct k_itimer * alloc_posix_timer(void)
{
        struct k_itimer *tmr;

        tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
        if (!tmr)
                return NULL;

        tmr->sigq = sigqueue_alloc();
        if (unlikely(!tmr->sigq)) {
                /* No sigqueue: give the timer back to the cache */
                kmem_cache_free(posix_timers_cache, tmr);
                return NULL;
        }

        clear_siginfo(&tmr->sigq->info);
        return tmr;
}

/*
 * RCU callback queued by posix_timer_free(): return the k_itimer which
 * embeds @head to the timer slab cache.
 */
static void k_itimer_rcu_free(struct rcu_head *head)
{
        kmem_cache_free(posix_timers_cache,
                        container_of(head, struct k_itimer, rcu));
}

/*
 * Release a timer: drop the pid reference, free the signal queue entry
 * and hand the timer memory to call_rcu() so that k_itimer_rcu_free()
 * frees it from the RCU callback rather than immediately.
 */
static void posix_timer_free(struct k_itimer *tmr)
{
        put_pid(tmr->it_pid);
        sigqueue_free(tmr->sigq);
        call_rcu(&tmr->rcu, k_itimer_rcu_free);
}

/*
 * Remove @tmr from the timer hash table under hash_lock and then release
 * it via posix_timer_free().
 */
static void posix_timer_unhash_and_free(struct k_itimer *tmr)
{
        spin_lock(&hash_lock);
        hlist_del_rcu(&tmr->t_hash);
        spin_unlock(&hash_lock);
        posix_timer_free(tmr);
}

/*
 * Timer creation step for hrtimer based timers: initialize the embedded
 * hrtimer for the timer's clock id. Always returns 0.
 * NOTE(review): the mode argument is passed as a literal 0 - confirm
 * which hrtimer mode that corresponds to.
 */
static int common_timer_create(struct k_itimer *new_timer)
{
        hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
        return 0;
}

/*
 * Create a POSIX.1b interval timer.
 *
 * @which_clock:      clock the timer is based on
 * @event:            kernel copy of the sigevent, or NULL for the default
 *                    (SIGALRM to the thread group, si_value = timer ID)
 * @created_timer_id: user space location which receives the new timer ID
 *
 * Returns 0 on success, otherwise a negative error code:
 *   -EINVAL     unknown clock, or @event failed validation
 *   -EOPNOTSUPP the clock has no timer_create callback
 *   -EAGAIN     the timer could not be allocated or no ID was available
 *   -EFAULT     the timer ID could not be copied to user space
 * or the error returned by the clock's timer_create callback.
 */
static int do_timer_create(clockid_t which_clock, struct sigevent *event,
                           timer_t __user *created_timer_id)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);
        struct k_itimer *new_timer;
        int error, new_timer_id;

        if (!kc)
                return -EINVAL;
        if (!kc->timer_create)
                return -EOPNOTSUPP;

        new_timer = alloc_posix_timer();
        if (unlikely(!new_timer))
                return -EAGAIN;

        spin_lock_init(&new_timer->it_lock);

        /*
         * Add the timer to the hash table. The timer is not yet valid
         * because new_timer::it_signal is still NULL. The timer id is also
         * not yet visible to user space.
         */
        new_timer_id = posix_timer_add(new_timer);
        if (new_timer_id < 0) {
                /* Never hashed, so a plain free without unhashing suffices */
                posix_timer_free(new_timer);
                return new_timer_id;
        }

        new_timer->it_id = (timer_t) new_timer_id;
        new_timer->it_clock = which_clock;
        new_timer->kclock = kc;
        new_timer->it_overrun = -1LL;

        if (event) {
                /* Validate the sigevent and take a reference on the target pid */
                rcu_read_lock();
                new_timer->it_pid = get_pid(good_sigevent(event));
                rcu_read_unlock();
                if (!new_timer->it_pid) {
                        error = -EINVAL;
                        goto out;
                }
                new_timer->it_sigev_notify     = event->sigev_notify;
                new_timer->sigq->info.si_signo = event->sigev_signo;
                new_timer->sigq->info.si_value = event->sigev_value;
        } else {
                /* No sigevent supplied: SIGALRM to the caller's thread group */
                new_timer->it_sigev_notify     = SIGEV_SIGNAL;
                new_timer->sigq->info.si_signo = SIGALRM;
                memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t));
                new_timer->sigq->info.si_value.sival_int = new_timer->it_id;
                new_timer->it_pid = get_pid(task_tgid(current));
        }

        new_timer->sigq->info.si_tid   = new_timer->it_id;
        new_timer->sigq->info.si_code  = SI_TIMER;

        if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) {
                error = -EFAULT;
                goto out;
        }
        /*
         * After successful copy out, the timer ID is visible to user space
         * now but not yet valid because new_timer::it_signal is still NULL.
         *
         * Complete the initialization with the clock specific create
         * callback.
         */
        error = kc->timer_create(new_timer);
        if (error)
                goto out;

        spin_lock_irq(&current->sighand->siglock);
        /* This makes the timer valid in the hash table */
        WRITE_ONCE(new_timer->it_signal, current->signal);
        list_add(&new_timer->list, &current->signal->posix_timers);
        spin_unlock_irq(&current->sighand->siglock);
        /*
         * After unlocking sighand::siglock @new_timer is subject to
         * concurrent removal and cannot be touched anymore
         */
        return 0;
out:
        /* Error after hashing: remove the timer from the hash and free it */
        posix_timer_unhash_and_free(new_timer);
        return error;
}

/*
 * timer_create(2) entry point. When user space supplies a sigevent it is
 * copied into the kernel first; -EFAULT is returned if that copy fails.
 * Everything else is handled by do_timer_create().
 */
SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
                struct sigevent __user *, timer_event_spec,
                timer_t __user *, created_timer_id)
{
        sigevent_t event;

        if (!timer_event_spec)
                return do_timer_create(which_clock, NULL, created_timer_id);

        if (copy_from_user(&event, timer_event_spec, sizeof (event)))
                return -EFAULT;

        return do_timer_create(which_clock, &event, created_timer_id);
}

#ifdef CONFIG_COMPAT
/*
 * Compat timer_create(2) entry point. A user supplied compat sigevent is
 * converted with get_compat_sigevent(); -EFAULT is returned if that
 * fails. Everything else is handled by do_timer_create().
 */
COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
                       struct compat_sigevent __user *, timer_event_spec,
                       timer_t __user *, created_timer_id)
{
        sigevent_t event;

        if (!timer_event_spec)
                return do_timer_create(which_clock, NULL, created_timer_id);

        if (get_compat_sigevent(&event, timer_event_spec))
                return -EFAULT;

        return do_timer_create(which_clock, &event, created_timer_id);
}
#endif

/*
 * Look up the timer with ID @timer_id belonging to the current task's
 * signal struct and lock it.
 *
 * On success the timer is returned with timr::it_lock held and the
 * interrupt state saved in @flags; release it with unlock_timer().
 * Returns NULL, with no lock held, when @timer_id is out of range, no
 * matching timer exists, or the timer is no longer valid.
 */
static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
        struct k_itimer *timr;

        /*
         * timer_t could be any type >= int and we want to make sure any
         * @timer_id outside positive int range fails lookup.
         */
        if ((unsigned long long)timer_id > INT_MAX)
                return NULL;

        /*
         * The hash lookup and the timers are RCU protected.
         *
         * Timers are added to the hash in invalid state where
         * timr::it_signal == NULL. timr::it_signal is only set after the
         * rest of the initialization succeeded.
         *
         * Timer destruction happens in steps:
         *  1) Set timr::it_signal to NULL with timr::it_lock held
         *  2) Release timr::it_lock
         *  3) Remove from the hash under hash_lock
         *  4) Call RCU for removal after the grace period
         *
         * Holding rcu_read_lock() across the lookup ensures that
         * the timer cannot be freed.
         *
         * The lookup validates locklessly that timr::it_signal ==
         * current::signal and timr::it_id == @timer_id. timr::it_id
         * can't change, but timr::it_signal becomes NULL during
         * destruction.
         */
        rcu_read_lock();
        timr = posix_timer_by_id(timer_id);
        if (timr) {
                spin_lock_irqsave(&timr->it_lock, *flags);
                /*
                 * Validate under timr::it_lock that timr::it_signal is
                 * still valid. Pairs with #1 above.
                 */
                if (timr->it_signal == current->signal) {
                        rcu_read_unlock();
                        return timr;
                }
                /* Timer changed state since the lockless lookup: back out */
                spin_unlock_irqrestore(&timr->it_lock, *flags);
        }
        rcu_read_unlock();

        return NULL;
}

/*
 * Return the time remaining until the hrtimer behind @timr expires,
 * relative to @now, as computed by
 * __hrtimer_expires_remaining_adjusted().
 */
static ktime_t common_hrtimer_remaining(struct k_itimer *timr, ktime_t now)
{
        return __hrtimer_expires_remaining_adjusted(&timr->it.real.timer, now);
}

/*
 * Forward the expiry of the hrtimer behind @timr by the timer interval
 * relative to @now and return the value reported by hrtimer_forward().
 */
static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now)
{
        return hrtimer_forward(&timr->it.real.timer, now, timr->it_interval);
}

/*
 * Get the time remaining on a POSIX.1b interval timer.
 *
 * Two issues to handle here:
 *
 *  1) The timer has a requeue pending. The return value must appear as
 *     if the timer has been requeued right now.
 *
 *  2) The timer is a SIGEV_NONE timer. These timers are never enqueued
 *     into the hrtimer queue and therefore never expired. Emulate expiry
 *     here taking #1 into account.
 */
void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
{
        const bool sigev_none = timr->it_sigev_notify == SIGEV_NONE;
        const ktime_t interval = timr->it_interval;
        const struct k_clock *kc = timr->kclock;
        ktime_t now, left;

        if (interval) {
                /* Interval timer: report the interval unconditionally */
                cur_setting->it_interval = ktime_to_timespec64(interval);
        } else if (!timr->it_active && !sigev_none) {
                /*
                 * Inactive oneshot timer with a real signal delivery mode:
                 * nothing to report. SIGEV_NONE oneshot timers are never
                 * queued, so it_active is always false for them; they fall
                 * through to the remaining time evaluation below.
                 */
                return;
        }

        now = kc->clock_get_ktime(timr->it_clock);

        /*
         * Interval timers which either have a requeue pending or are
         * SIGEV_NONE timers get their expiry moved forward by intervals,
         * so that the expiry is > @now. The forward count is accumulated
         * in the overrun counter.
         */
        if (interval && (sigev_none || (timr->it_requeue_pending & REQUEUE_PENDING)))
                timr->it_overrun += kc->timer_forward(timr, now);

        /*
         * @left is evaluated against the very same @now which was handed
         * to timer_forward(). Interval timers therefore cannot end up
         * with a remaining time <= 0 here, because timer_forward() moved
         * their expiry past @now.
         */
        left = kc->timer_remaining(timr, now);
        if (left > 0) {
                cur_setting->it_value = ktime_to_timespec64(left);
                return;
        }

        /*
         * Expired. A single shot SIGEV_NONE timer must report 0, which is
         * what the caller's zeroed @cur_setting already contains. Timers
         * with a real signal delivery mode must report a remaining time
         * greater than 0 because the signal has not been delivered yet.
         */
        if (!sigev_none)
                cur_setting->it_value.tv_nsec = 1;
}

/*
 * Common backend for the timer_gettime() syscall variants: look up and
 * lock the timer, let the clock specific callback fill in @setting and
 * drop the lock again.
 *
 * Returns 0 on success, -EINVAL when the timer lookup fails or the
 * timer's clock has no timer_get() callback.
 */
static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
{
        struct k_itimer *timr;
        const struct k_clock *kc;
        unsigned long flags;
        int ret;

        timr = lock_timer(timer_id, &flags);
        if (!timr)
                return -EINVAL;

        /* Callbacks only fill in what they need, so start from all zeros */
        memset(setting, 0, sizeof(*setting));

        kc = timr->kclock;
        if (!WARN_ON_ONCE(!kc || !kc->timer_get)) {
                kc->timer_get(timr, setting);
                ret = 0;
        } else {
                ret = -EINVAL;
        }

        unlock_timer(timr, flags);
        return ret;
}

/*
 * Get the time remaining on a POSIX.1b interval timer and copy it to
 * user space in the native __kernel_itimerspec layout.
 */
SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
                struct __kernel_itimerspec __user *, setting)
{
        struct itimerspec64 cur_setting;
        int ret;

        ret = do_timer_gettime(timer_id, &cur_setting);
        if (ret)
                return ret;

        /* Only a failing copy to user space can turn success into an error */
        return put_itimerspec64(&cur_setting, setting) ? -EFAULT : 0;
}

#ifdef CONFIG_COMPAT_32BIT_TIME

/* Same as timer_gettime(), but the result uses the old_itimerspec32 layout */
SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id,
                struct old_itimerspec32 __user *, setting)
{
        struct itimerspec64 cur_setting;
        int ret;

        ret = do_timer_gettime(timer_id, &cur_setting);
        if (ret)
                return ret;

        return put_old_itimerspec32(&cur_setting, setting) ? -EFAULT : 0;
}

#endif

/**
 * sys_timer_getoverrun - Get the number of overruns of a POSIX.1b interval timer
 * @timer_id:        The timer ID which identifies the timer
 *
 * The "overrun count" of a timer is one plus the number of expiration
 * intervals which have elapsed between the first expiry, which queues the
 * signal and the actual signal delivery. On signal delivery the "overrun
 * count" is calculated and cached, so it can be returned directly here.
 *
 * As this is relative to the last queued signal the returned overrun count
 * is meaningless outside of the signal delivery path and even there it
 * does not accurately reflect the current state when user space evaluates
 * it.
 *
 * Returns:
 *        -EINVAL                @timer_id is invalid
 *        1..INT_MAX        The number of overruns related to the last delivered signal
 */
SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
{
        struct k_itimer *timr;
        unsigned long flags;
        int ret = -EINVAL;

        timr = lock_timer(timer_id, &flags);
        if (timr) {
                /* Read the overrun value while the timer is locked */
                ret = timer_overrun_to_int(timr, 0);
                unlock_timer(timr, flags);
        }

        return ret;
}

/*
 * (Re)initialize the hrtimer embedded in @timr and program its expiry.
 *
 * @timr:        The timer to arm
 * @expires:     Expiry time; relative to now unless @absolute is set
 * @absolute:    @expires is an absolute time
 * @sigev_none:  The timer is a SIGEV_NONE timer; the expiry is only
 *               recorded and the hrtimer is not started
 */
static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
                               bool absolute, bool sigev_none)
{
        struct hrtimer *hrt = &timr->it.real.timer;
        enum hrtimer_mode mode = HRTIMER_MODE_REL;

        if (absolute)
                mode = HRTIMER_MODE_ABS;

        /*
         * Posix magic: Relative CLOCK_REALTIME timers are not affected by
         * clock modifications, so they become CLOCK_MONOTONIC based under
         * the hood. See hrtimer_init(). timr->kclock is updated to match,
         * so the generic functions which go through
         * timr->kclock->clock_get_*() keep working.
         *
         * it_clock is deliberately left alone: the next timer_set() might
         * use ABSTIME and has to switch back.
         */
        if (timr->it_clock == CLOCK_REALTIME) {
                if (absolute)
                        timr->kclock = &clock_realtime;
                else
                        timr->kclock = &clock_monotonic;
        }

        hrtimer_init(hrt, timr->it_clock, mode);
        hrt->function = posix_timer_fn;

        /* Convert a relative expiry into an absolute one on the timer's base */
        if (!absolute)
                expires = ktime_add_safe(expires, hrt->base->get_time());
        hrtimer_set_expires(hrt, expires);

        if (sigev_none)
                return;

        hrtimer_start_expires(hrt, HRTIMER_MODE_ABS);
}

static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
{
        struct hrtimer *hrt = &timr->it.real.timer;

        /* Callers in this file map a negative result to TIMER_RETRY */
        return hrtimer_try_to_cancel(hrt);
}

static void common_timer_wait_running(struct k_itimer *timer)
{
        struct hrtimer *hrt = &timer->it.real.timer;

        /* Delegate the wait to the hrtimer core for the embedded hrtimer */
        hrtimer_cancel_wait_running(hrt);
}

/*
 * On PREEMPT_RT this prevents priority inversion and a potential livelock
 * against the ksoftirqd thread in case that ksoftirqd gets preempted while
 * executing a hrtimer callback.
 *
 * See the comments in hrtimer_cancel_wait_running(). For PREEMPT_RT=n this
 * just results in a cpu_relax().
 *
 * For POSIX CPU timers with CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n this is
 * just a cpu_relax(). With CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y this
 * prevents spinning on an eventually scheduled out task and a livelock
 * when the task which tries to delete or disarm the timer has preempted
 * the task which runs the expiry in task work context.
 */
static struct k_itimer *timer_wait_running(struct k_itimer *timer,
                                           unsigned long *flags)
{
        /*
         * Snapshot the clock and the timer ID while the caller still holds
         * the timer lock. @timer itself must not be dereferenced after the
         * wait below, so the relock has to go through the ID.
         */
        const struct k_clock *kc = READ_ONCE(timer->kclock);
        timer_t timer_id = READ_ONCE(timer->it_id);

        /* Prevent kfree(timer) after dropping the lock */
        rcu_read_lock();
        unlock_timer(timer, *flags);

        /*
         * kc->timer_wait_running() might drop RCU lock. So @timer
         * cannot be touched anymore after the function returns!
         */
        if (!WARN_ON_ONCE(!kc->timer_wait_running))
                kc->timer_wait_running(timer);

        rcu_read_unlock();
        /*
         * Relock the timer by ID. It might no longer be hashed, in which
         * case the result of the lookup is handed back to the caller as is.
         */
        return lock_timer(timer_id, flags);
}

/* Set a POSIX.1b interval timer. */
int common_timer_set(struct k_itimer *timr, int flags,
                     struct itimerspec64 *new_setting,
                     struct itimerspec64 *old_setting)
{
        const struct k_clock *kc = timr->kclock;
        bool sigev_none;
        ktime_t expires;

        if (old_setting)
                common_timer_get(timr, old_setting);

        /* Prevent rearming by clearing the interval */
        timr->it_interval = 0;
        /*
         * Careful here. On SMP systems the timer expiry function could be
         * active and spinning on timr->it_lock.
         */
        if (kc->timer_try_to_cancel(timr) < 0)
                return TIMER_RETRY;

        timr->it_active = 0;
        timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
                ~REQUEUE_PENDING;
        timr->it_overrun_last = 0;

        /* Switch off the timer when it_value is zero */
        if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
                return 0;

        timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
        expires = timespec64_to_ktime(new_setting->it_value);
        if (flags & TIMER_ABSTIME)
                expires = timens_ktime_to_host(timr->it_clock, expires);
        sigev_none = timr->it_sigev_notify == SIGEV_NONE;

        kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
        timr->it_active = !sigev_none;
        return 0;
}

/*
 * Common backend for the timer_settime() syscall variants.
 *
 * @timer_id:    ID of the timer to program
 * @tmr_flags:   Flags handed through to the clock's timer_set() callback
 * @new_spec64:  New setting; both members must pass timespec64_valid()
 * @old_spec64:  Optional storage for the previous setting, zeroed up front
 *
 * Returns -EINVAL for an invalid @new_spec64, a failed timer lookup or a
 * clock without timer_set() callback; otherwise the result of timer_set().
 */
static int do_timer_settime(timer_t timer_id, int tmr_flags,
                            struct itimerspec64 *new_spec64,
                            struct itimerspec64 *old_spec64)
{
        const struct k_clock *kc;
        struct k_itimer *timr;
        unsigned long flags;
        int error = 0;

        if (!timespec64_valid(&new_spec64->it_interval) ||
            !timespec64_valid(&new_spec64->it_value))
                return -EINVAL;

        if (old_spec64)
                memset(old_spec64, 0, sizeof(*old_spec64));

        timr = lock_timer(timer_id, &flags);
retry:
        /* Reached from the initial lookup and from the relock after a retry */
        if (!timr)
                return -EINVAL;

        kc = timr->kclock;
        if (WARN_ON_ONCE(!kc || !kc->timer_set))
                error = -EINVAL;
        else
                error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64);

        if (error == TIMER_RETRY) {
                // We already got the old time...
                old_spec64 = NULL;
                /* Unlocks and relocks the timer if it still exists */
                timr = timer_wait_running(timr, &flags);
                goto retry;
        }
        unlock_timer(timr, flags);

        return error;
}

/*
 * Set a POSIX.1b interval timer. The previous setting is copied to
 * @old_setting only when that pointer is not NULL and the set succeeded.
 */
SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
                const struct __kernel_itimerspec __user *, new_setting,
                struct __kernel_itimerspec __user *, old_setting)
{
        struct itimerspec64 new_spec, old_spec;
        struct itimerspec64 *rtn = NULL;
        int error;

        if (!new_setting)
                return -EINVAL;

        if (get_itimerspec64(&new_spec, new_setting))
                return -EFAULT;

        if (old_setting)
                rtn = &old_spec;

        error = do_timer_settime(timer_id, flags, &new_spec, rtn);
        if (error || !old_setting)
                return error;

        return put_itimerspec64(&old_spec, old_setting) ? -EFAULT : 0;
}

#ifdef CONFIG_COMPAT_32BIT_TIME
/* Same as timer_settime(), but with old_itimerspec32 user space layouts */
SYSCALL_DEFINE4(timer_settime32, timer_t, timer_id, int, flags,
                struct old_itimerspec32 __user *, new,
                struct old_itimerspec32 __user *, old)
{
        struct itimerspec64 new_spec, old_spec;
        struct itimerspec64 *rtn = NULL;
        int error;

        if (old)
                rtn = &old_spec;

        if (!new)
                return -EINVAL;
        if (get_old_itimerspec32(&new_spec, new))
                return -EFAULT;

        error = do_timer_settime(timer_id, flags, &new_spec, rtn);
        if (error || !old)
                return error;

        return put_old_itimerspec32(&old_spec, old) ? -EFAULT : 0;
}
#endif

/*
 * Disarm @timer for deletion.
 *
 * Returns 0 on success or TIMER_RETRY when the clock's
 * timer_try_to_cancel() callback returns a negative value.
 */
int common_timer_del(struct k_itimer *timer)
{
        const struct k_clock *kc = timer->kclock;
        int cancelled;

        /* Clear the interval before attempting to cancel */
        timer->it_interval = 0;

        cancelled = kc->timer_try_to_cancel(timer);
        if (cancelled < 0)
                return TIMER_RETRY;

        timer->it_active = 0;
        return 0;
}

/*
 * Invoke the timer_del() callback of the timer's clock. A missing clock
 * or callback triggers a one-time warning and yields -EINVAL.
 */
static inline int timer_delete_hook(struct k_itimer *timer)
{
        const struct k_clock *kc = timer->kclock;

        if (!WARN_ON_ONCE(!kc || !kc->timer_del))
                return kc->timer_del(timer);

        return -EINVAL;
}

/*
 * Delete a POSIX.1b interval timer.
 *
 * Returns 0 on success and -EINVAL when the timer lookup fails, either
 * initially or after waiting for a concurrently running expiry.
 */
SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
{
        struct k_itimer *timer;
        unsigned long flags;

        timer = lock_timer(timer_id, &flags);

retry_delete:
        /* Reached from the initial lookup and from the relock after a retry */
        if (!timer)
                return -EINVAL;

        if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) {
                /* Unlocks and relocks the timer if it still exists */
                timer = timer_wait_running(timer, &flags);
                goto retry_delete;
        }

        /* The list removal is done with current's siglock held */
        spin_lock(&current->sighand->siglock);
        list_del(&timer->list);
        spin_unlock(&current->sighand->siglock);
        /*
         * A concurrent lookup could check timer::it_signal lockless. It
         * will reevaluate with timer::it_lock held and observe the NULL.
         */
        WRITE_ONCE(timer->it_signal, NULL);

        unlock_timer(timer, flags);
        posix_timer_unhash_and_free(timer);
        return 0;
}

/*
 * Delete a timer if it is armed, remove it from the hash and schedule it
 * for RCU freeing.
 */
static void itimer_delete(struct k_itimer *timer)
{
        unsigned long flags;

        /*
         * irqsave is required to make timer_wait_running() work.
         */
        spin_lock_irqsave(&timer->it_lock, flags);

retry_delete:
        /*
         * Even if the timer is no longer accessible from other tasks
         * it still might be armed and queued in the underlying timer
         * mechanism. Worse, that timer mechanism might run the expiry
         * function concurrently.
         */
        if (timer_delete_hook(timer) == TIMER_RETRY) {
                /*
                 * Timer is expired concurrently, prevent livelocks
                 * and pointless spinning on RT.
                 *
                 * timer_wait_running() drops timer::it_lock, which opens
                 * the possibility for another task to delete the timer.
                 *
                 * That's not possible here because this is invoked from
                 * do_exit() only for the last thread of the thread group.
                 * So no other task can access and delete that timer.
                 */
                if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer))
                        return;

                goto retry_delete;
        }
        /* No siglock here, unlike the list removal in sys_timer_delete() */
        list_del(&timer->list);

        /*
         * Setting timer::it_signal to NULL is technically not required
         * here as nothing can access the timer anymore legitimately via
         * the hash table. Set it to NULL nevertheless so that all deletion
         * paths are consistent.
         */
        WRITE_ONCE(timer->it_signal, NULL);

        spin_unlock_irqrestore(&timer->it_lock, flags);
        posix_timer_unhash_and_free(timer);
}

/*
 * Invoked from do_exit() when the last thread of a thread group exits.
 * At that point no other task can access the timers of the dying
 * task anymore.
 */
void exit_itimers(struct task_struct *tsk)
{
        struct list_head timers;

        /* Nothing to do when the task never created a timer */
        if (list_empty(&tsk->signal->posix_timers))
                return;

        /*
         * Move the whole list to a local head under siglock, which
         * protects against a concurrent read via /proc/$PID/timers.
         */
        spin_lock_irq(&tsk->sighand->siglock);
        list_replace_init(&tsk->signal->posix_timers, &timers);
        spin_unlock_irq(&tsk->sighand->siglock);

        /*
         * The timers are no longer accessible via tsk::signal. Each
         * itimer_delete() call unlinks the entry it is handed, so always
         * picking the first entry drains the list.
         */
        while (!list_empty(&timers))
                itimer_delete(list_first_entry(&timers, struct k_itimer, list));
}

/*
 * Set the time of @which_clock. Clocks which are unknown or have no
 * clock_set() callback are rejected with -EINVAL.
 */
SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
                const struct __kernel_timespec __user *, tp)
{
        const struct k_clock *kc;
        struct timespec64 new_tp;

        kc = clockid_to_kclock(which_clock);
        if (!(kc && kc->clock_set))
                return -EINVAL;

        if (get_timespec64(&new_tp, tp))
                return -EFAULT;

        /*
         * No permission check here: that has to be done inside the clock
         * specific setter callback.
         */
        return kc->clock_set(which_clock, &new_tp);
}

/* Read @which_clock and copy the result to user space */
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
                struct __kernel_timespec __user *, tp)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);
        struct timespec64 kernel_tp;
        int error;

        if (!kc)
                return -EINVAL;

        error = kc->clock_get_timespec(which_clock, &kernel_tp);
        if (error)
                return error;

        return put_timespec64(&kernel_tp, tp) ? -EFAULT : 0;
}

/*
 * Dispatch an adjtime request to the clock_adj() callback of @which_clock.
 *
 * Returns -EINVAL for an unknown clock, -EOPNOTSUPP for a clock without
 * clock_adj() callback, otherwise the callback's return value.
 */
int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * ktx)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);

        if (kc && kc->clock_adj)
                return kc->clock_adj(which_clock, ktx);

        return kc ? -EOPNOTSUPP : -EINVAL;
}

/*
 * Copy the timex structure in, run the adjustment and copy the structure
 * back out unless the adjustment returned a negative value.
 */
SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
                struct __kernel_timex __user *, utx)
{
        struct __kernel_timex ktx;
        int err;

        if (copy_from_user(&ktx, utx, sizeof(ktx)))
                return -EFAULT;

        err = do_clock_adjtime(which_clock, &ktx);
        if (err < 0)
                return err;

        /* Non-negative results are passed through when the copy succeeds */
        return copy_to_user(utx, &ktx, sizeof(ktx)) ? -EFAULT : err;
}

/**
 * sys_clock_getres - Get the resolution of a clock
 * @which_clock:        The clock to get the resolution for
 * @tp:                        Pointer to a user space timespec64 for storage
 *
 * POSIX defines:
 *
 * "The clock_getres() function shall return the resolution of any
 * clock. Clock resolutions are implementation-defined and cannot be set by
 * a process. If the argument res is not NULL, the resolution of the
 * specified clock shall be stored in the location pointed to by res. If
 * res is NULL, the clock resolution is not returned. If the time argument
 * of clock_settime() is not a multiple of res, then the value is truncated
 * to a multiple of res."
 *
 * Due to the various hardware constraints the real resolution can vary
 * wildly and even change during runtime when the underlying devices are
 * replaced. The kernel also can use hardware devices with different
 * resolutions for reading the time and for arming timers.
 *
 * The kernel therefore deviates from the POSIX spec in various aspects:
 *
 * 1) The resolution returned to user space
 *
 *    For CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, CLOCK_TAI,
 *    CLOCK_REALTIME_ALARM, CLOCK_BOOTTIME_ALARM and CLOCK_MONOTONIC_RAW
 *    the kernel differentiates only two cases:
 *
 *    I)  Low resolution mode:
 *
 *          When high resolution timers are disabled at compile or runtime
 *          the resolution returned is nanoseconds per tick, which represents
 *          the precision at which timers expire.
 *
 *    II) High resolution mode:
 *
 *          When high resolution timers are enabled the resolution returned
 *          is always one nanosecond independent of the actual resolution of
 *          the underlying hardware devices.
 *
 *          For CLOCK_*_ALARM the actual resolution depends on system
 *          state. When system is running the resolution is the same as the
 *          resolution of the other clocks. During suspend the actual
 *          resolution is the resolution of the underlying RTC device which
 *          might be way less precise than the clockevent device used during
 *          running state.
 *
 *   For CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE the resolution
 *   returned is always nanoseconds per tick.
 *
 *   For CLOCK_PROCESS_CPUTIME and CLOCK_THREAD_CPUTIME the resolution
 *   returned is always one nanosecond under the assumption that the
 *   underlying scheduler clock has a better resolution than nanoseconds
 *   per tick.
 *
 *   For dynamic POSIX clocks (PTP devices) the resolution returned is
 *   always one nanosecond.
 *
 * 2) Effect on sys_clock_settime()
 *
 *    The kernel does not truncate the time which is handed in to
 *    sys_clock_settime(). The kernel internal timekeeping is always using
 *    nanoseconds precision independent of the clocksource device which is
 *    used to read the time from. The resolution of that device only
 *    affects the precision of the time returned by sys_clock_gettime().
 *
 * Returns:
 *        0                Success. @tp contains the resolution
 *        -EINVAL                @which_clock is not a valid clock ID
 *        -EFAULT                Copying the resolution to @tp faulted
 *        -ENODEV                Dynamic POSIX clock is not backed by a device
 *        -EOPNOTSUPP        Dynamic POSIX clock does not support getres()
 */
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
                struct __kernel_timespec __user *, tp)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);
        struct timespec64 rtn_tp;
        int error;

        if (!kc)
                return -EINVAL;

        error = kc->clock_getres(which_clock, &rtn_tp);

        /* A NULL @tp means the caller does not want the resolution stored */
        if (error || !tp)
                return error;

        return put_timespec64(&rtn_tp, tp) ? -EFAULT : 0;
}

#ifdef CONFIG_COMPAT_32BIT_TIME

/* Same as clock_settime(), but @tp uses the old_timespec32 layout */
SYSCALL_DEFINE2(clock_settime32, clockid_t, which_clock,
                struct old_timespec32 __user *, tp)
{
        const struct k_clock *kc;
        struct timespec64 ts;

        kc = clockid_to_kclock(which_clock);
        if (!(kc && kc->clock_set))
                return -EINVAL;

        if (get_old_timespec32(&ts, tp))
                return -EFAULT;

        return kc->clock_set(which_clock, &ts);
}

/* Same as clock_gettime(), but @tp uses the old_timespec32 layout */
SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
                struct old_timespec32 __user *, tp)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);
        struct timespec64 ts;
        int err;

        if (!kc)
                return -EINVAL;

        err = kc->clock_get_timespec(which_clock, &ts);
        if (err)
                return err;

        return put_old_timespec32(&ts, tp) ? -EFAULT : 0;
}

/* Same as clock_adjtime(), but @utp uses the old_timex32 layout */
SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock,
                struct old_timex32 __user *, utp)
{
        struct __kernel_timex ktx;
        int err;

        /* A failing conversion from user space reports its own error code */
        err = get_old_timex32(&ktx, utp);
        if (err)
                return err;

        err = do_clock_adjtime(which_clock, &ktx);
        if (err < 0)
                return err;

        return put_old_timex32(utp, &ktx) ? -EFAULT : err;
}

/* Same as clock_getres(), but @tp uses the old_timespec32 layout */
SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
                struct old_timespec32 __user *, tp)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);
        struct timespec64 ts;
        int err;

        if (!kc)
                return -EINVAL;

        err = kc->clock_getres(which_clock, &ts);

        /* A NULL @tp means the caller does not want the resolution stored */
        if (err || !tp)
                return err;

        return put_old_timespec32(&ts, tp) ? -EFAULT : 0;
}

#endif

/*
 * sys_clock_nanosleep() for CLOCK_REALTIME and CLOCK_TAI
 */
static int common_nsleep(const clockid_t which_clock, int flags,
                         const struct timespec64 *rqtp)
{
        ktime_t texp = timespec64_to_ktime(*rqtp);

        return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
                                 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
                                 which_clock);
}

/*
 * sys_clock_nanosleep() for CLOCK_MONOTONIC and CLOCK_BOOTTIME
 *
 * Absolute nanosleeps for these clocks are time-namespace adjusted.
 */
static int common_nsleep_timens(const clockid_t which_clock, int flags,
                                const struct timespec64 *rqtp)
{
        ktime_t texp = timespec64_to_ktime(*rqtp);

        if (flags & TIMER_ABSTIME)
                texp = timens_ktime_to_host(which_clock, texp);

        return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
                                 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
                                 which_clock);
}

/*
 * Sleep on @which_clock via the clock's nsleep() callback.
 *
 * Returns -EINVAL for an unknown clock or an invalid @rqtp, -EOPNOTSUPP
 * for a clock without nsleep() callback, -EFAULT when @rqtp cannot be
 * read, otherwise the callback's return value.
 */
SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
                const struct __kernel_timespec __user *, rqtp,
                struct __kernel_timespec __user *, rmtp)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);
        struct restart_block *restart = &current->restart_block;
        struct timespec64 t;

        if (!kc)
                return -EINVAL;
        if (!kc->nsleep)
                return -EOPNOTSUPP;

        if (get_timespec64(&t, rqtp))
                return -EFAULT;
        if (!timespec64_valid(&t))
                return -EINVAL;

        /* No remaining time is recorded for absolute sleeps */
        if (flags & TIMER_ABSTIME)
                rmtp = NULL;

        restart->fn = do_no_restart_syscall;
        restart->nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
        restart->nanosleep.rmtp = rmtp;

        return kc->nsleep(which_clock, flags, &t);
}

#ifdef CONFIG_COMPAT_32BIT_TIME

/*
 * Same as clock_nanosleep(), but with old_timespec32 user space layouts.
 * The remaining time pointer is recorded as TT_COMPAT in the restart block.
 */
SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
                struct old_timespec32 __user *, rqtp,
                struct old_timespec32 __user *, rmtp)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);
        struct restart_block *restart = &current->restart_block;
        struct timespec64 t;

        if (!kc)
                return -EINVAL;
        if (!kc->nsleep)
                return -EOPNOTSUPP;

        if (get_old_timespec32(&t, rqtp))
                return -EFAULT;
        if (!timespec64_valid(&t))
                return -EINVAL;

        /* No remaining time is recorded for absolute sleeps */
        if (flags & TIMER_ABSTIME)
                rmtp = NULL;

        restart->fn = do_no_restart_syscall;
        restart->nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
        restart->nanosleep.compat_rmtp = rmtp;

        return kc->nsleep(which_clock, flags, &t);
}

#endif

static const struct k_clock clock_realtime = {
        .clock_getres                = posix_get_hrtimer_res,
        .clock_get_timespec        = posix_get_realtime_timespec,
        .clock_get_ktime        = posix_get_realtime_ktime,
        .clock_set                = posix_clock_realtime_set,
        .clock_adj                = posix_clock_realtime_adj,
        .nsleep                        = common_nsleep,
        .timer_create                = common_timer_create,
        .timer_set                = common_timer_set,
        .timer_get                = common_timer_get,
        .timer_del                = common_timer_del,
        .timer_rearm                = common_hrtimer_rearm,
        .timer_forward                = common_hrtimer_forward,
        .timer_remaining        = common_hrtimer_remaining,
        .timer_try_to_cancel        = common_hrtimer_try_to_cancel,
        .timer_wait_running        = common_timer_wait_running,
        .timer_arm                = common_hrtimer_arm,
};

/* CLOCK_MONOTONIC: no set/adjust callbacks, time namespace aware nsleep */
static const struct k_clock clock_monotonic = {
        /* Clock access */
        .clock_getres           = posix_get_hrtimer_res,
        .clock_get_ktime        = posix_get_monotonic_ktime,
        .clock_get_timespec     = posix_get_monotonic_timespec,
        /* Sleeping */
        .nsleep                 = common_nsleep_timens,
        /* Timer handling */
        .timer_create           = common_timer_create,
        .timer_set              = common_timer_set,
        .timer_get              = common_timer_get,
        .timer_del              = common_timer_del,
        .timer_arm              = common_hrtimer_arm,
        .timer_rearm            = common_hrtimer_rearm,
        .timer_forward          = common_hrtimer_forward,
        .timer_remaining        = common_hrtimer_remaining,
        .timer_try_to_cancel    = common_hrtimer_try_to_cancel,
        .timer_wait_running     = common_timer_wait_running,
};

/* CLOCK_MONOTONIC_RAW: read-only clock, no nsleep or timer callbacks */
static const struct k_clock clock_monotonic_raw = {
        .clock_get_timespec     = posix_get_monotonic_raw,
        .clock_getres           = posix_get_hrtimer_res,
};

/* CLOCK_REALTIME_COARSE: read-only clock, no nsleep or timer callbacks */
static const struct k_clock clock_realtime_coarse = {
        .clock_get_timespec     = posix_get_realtime_coarse,
        .clock_getres           = posix_get_coarse_res,
};

/* CLOCK_MONOTONIC_COARSE: read-only clock, no nsleep or timer callbacks */
static const struct k_clock clock_monotonic_coarse = {
        .clock_get_timespec     = posix_get_monotonic_coarse,
        .clock_getres           = posix_get_coarse_res,
};

/* CLOCK_TAI: no set/adjust callbacks, uses the plain common_nsleep() */
static const struct k_clock clock_tai = {
        /* Clock access */
        .clock_getres           = posix_get_hrtimer_res,
        .clock_get_timespec     = posix_get_tai_timespec,
        .clock_get_ktime        = posix_get_tai_ktime,
        /* Sleeping */
        .nsleep                 = common_nsleep,
        /* Timer handling */
        .timer_create           = common_timer_create,
        .timer_set              = common_timer_set,
        .timer_get              = common_timer_get,
        .timer_del              = common_timer_del,
        .timer_arm              = common_hrtimer_arm,
        .timer_rearm            = common_hrtimer_rearm,
        .timer_forward          = common_hrtimer_forward,
        .timer_remaining        = common_hrtimer_remaining,
        .timer_try_to_cancel    = common_hrtimer_try_to_cancel,
        .timer_wait_running     = common_timer_wait_running,
};

/* CLOCK_BOOTTIME: no set/adjust callbacks, time namespace aware nsleep */
static const struct k_clock clock_boottime = {
        /* Clock access */
        .clock_getres           = posix_get_hrtimer_res,
        .clock_get_timespec     = posix_get_boottime_timespec,
        .clock_get_ktime        = posix_get_boottime_ktime,
        /* Sleeping */
        .nsleep                 = common_nsleep_timens,
        /* Timer handling */
        .timer_create           = common_timer_create,
        .timer_set              = common_timer_set,
        .timer_get              = common_timer_get,
        .timer_del              = common_timer_del,
        .timer_arm              = common_hrtimer_arm,
        .timer_rearm            = common_hrtimer_rearm,
        .timer_forward          = common_hrtimer_forward,
        .timer_remaining        = common_hrtimer_remaining,
        .timer_try_to_cancel    = common_hrtimer_try_to_cancel,
        .timer_wait_running     = common_timer_wait_running,
};

/*
 * Lookup table from static (non-negative) clock IDs to their k_clock
 * implementation, indexed by the clock ID itself. Indices which are not
 * listed here are NULL due to the designated initializers.
 */
static const struct k_clock * const posix_clocks[] = {
        [CLOCK_REALTIME]                = &clock_realtime,
        [CLOCK_MONOTONIC]                = &clock_monotonic,
        [CLOCK_PROCESS_CPUTIME_ID]        = &clock_process,
        [CLOCK_THREAD_CPUTIME_ID]        = &clock_thread,
        [CLOCK_MONOTONIC_RAW]                = &clock_monotonic_raw,
        [CLOCK_REALTIME_COARSE]                = &clock_realtime_coarse,
        [CLOCK_MONOTONIC_COARSE]        = &clock_monotonic_coarse,
        [CLOCK_BOOTTIME]                = &clock_boottime,
        /* Both alarm clock IDs share a single k_clock */
        [CLOCK_REALTIME_ALARM]                = &alarm_clock,
        [CLOCK_BOOTTIME_ALARM]                = &alarm_clock,
        [CLOCK_TAI]                        = &clock_tai,
};

/*
 * Translate a clock id into the k_clock operations that serve it.
 *
 * Negative ids never index the table: they resolve to the dynamic posix
 * clock when the CLOCKFD bits match, otherwise to the posix CPU clock.
 * Non-negative ids index posix_clocks[]; an id past the end of the table
 * yields NULL, and so does an in-range id whose slot is unpopulated.
 */
static const struct k_clock *clockid_to_kclock(const clockid_t id)
{
        /* non-const copy of @id, used only as the table index */
        clockid_t slot = id;

        if (id < 0) {
                if ((id & CLOCKFD_MASK) == CLOCKFD)
                        return &clock_posix_dynamic;
                return &clock_posix_cpu;
        }

        if (id >= ARRAY_SIZE(posix_clocks))
                return NULL;

        /* sanitize the already bounds-checked index before the lookup */
        slot = array_index_nospec(slot, ARRAY_SIZE(posix_clocks));
        return posix_clocks[slot];
}






































































































































































































    2 



    2 
    2 

    2 















    1 



    1 
    1 





























































































    1 









    1 
    1 
































































































































    1 



















    1 
    1 















    1 





    7 




    9 





    2 



    2 




    7 







    9 













    7 


















    1 


















    1 
    1 
















































































































































































































































































































































































































    1 
    1 










































































    6 


    5 



    5 





















    2 




    2 
    2 

    2 

    2 



    4 



    3 


    3 


























    3 





    3 




    3 
    4 

    3 






    1 
    1 
    1 
    1 
    1 








    1 
    1 
    1 
    1 
    1 







    1 
    1 
    1 
    1 











    2 
    2 
    2 
    2 
    2 








    9 
    8 
   10 
   10 
    9 








    6 





    5 





    2 
    3 
    3 
    4 







    1 




    1 






    2 
    2 
    3 
    3 








    1 






    2 





    2 















    1 


    1 














































































    2 
















    2 



    1 







































































































































































































































































































































































































































































































































































































































































































    3 



    3 


































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
// SPDX-License-Identifier: GPL-2.0-only
/*
 * AppArmor security module
 *
 * This file contains AppArmor LSM hooks.
 *
 * Copyright (C) 1998-2008 Novell/SUSE
 * Copyright 2009-2010 Canonical Ltd.
 */

#include <linux/lsm_hooks.h>
#include <linux/moduleparam.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/ptrace.h>
#include <linux/ctype.h>
#include <linux/sysctl.h>
#include <linux/audit.h>
#include <linux/user_namespace.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/zstd.h>
#include <net/sock.h>
#include <uapi/linux/mount.h>
#include <uapi/linux/lsm.h>

#include "include/apparmor.h"
#include "include/apparmorfs.h"
#include "include/audit.h"
#include "include/capability.h"
#include "include/cred.h"
#include "include/file.h"
#include "include/ipc.h"
#include "include/net.h"
#include "include/path.h"
#include "include/label.h"
#include "include/policy.h"
#include "include/policy_ns.h"
#include "include/procattr.h"
#include "include/mount.h"
#include "include/secid.h"

/*
 * Flag indicating whether initialization completed.
 * Non-static, so it is visible to other translation units.
 */
int apparmor_initialized;

/*
 * A buffer seen in one of two overlapping ways: as a list node (@list) or
 * as raw character storage (@buffer). Being a union, the two views share
 * the same memory, so only one is meaningful at any time.
 * NOTE(review): presumably @list is used while the buffer sits on a free
 * list and @buffer while it is handed out - confirm against the allocator.
 */
union aa_buffer {
        struct list_head list;
        DECLARE_FLEX_ARRAY(char, buffer);
};

/*
 * Per-CPU buffer cache state (instantiated as the per-CPU variable
 * aa_local_buffers).
 * NOTE(review): field meanings below are inferred from the names - confirm
 * against the buffer get/put code.
 */
struct aa_local_cache {
        unsigned int hold;      /* presumably a hold-off counter - TODO confirm */
        unsigned int count;     /* presumably the number of entries on @head */
        struct list_head head;  /* list head for this cache's entries */
};

/* Initial value of reserve_count */
#define RESERVE_COUNT 2
static int reserve_count = RESERVE_COUNT;
/* zero-initialized counter; NOTE(review): presumably tracks buffers on the global list */
static int buffer_count;

/* Global buffer list, its spinlock, and the per-CPU local caches */
static LIST_HEAD(aa_global_buffers);
static DEFINE_SPINLOCK(aa_buffers_lock);
static DEFINE_PER_CPU(struct aa_local_cache, aa_local_buffers);

/*
 * LSM hook functions
 */

/*
 * drop the label reference held through @cred, then clear the cred's label
 */
static void apparmor_cred_free(struct cred *cred)
{
        struct aa_label *held = cred_label(cred);

        aa_put_label(held);
        set_cred_label(cred, NULL);
}

/*
 * allocate the apparmor part of blank credentials
 *
 * Nothing is actually allocated: the label of @cred is simply set to NULL.
 * @gfp is unused. Always returns 0.
 */
static int apparmor_cred_alloc_blank(struct cred *cred, gfp_t gfp)
{
        set_cred_label(cred, NULL);
        return 0;
}

/*
 * give @new the newest version of the label carried by @old, ready for
 * modification by a prepare_cred block
 *
 * @gfp is unused. Always returns 0.
 */
static int apparmor_cred_prepare(struct cred *new, const struct cred *old,
                                 gfp_t gfp)
{
        struct aa_label *newest = aa_get_newest_label(cred_label(old));

        set_cred_label(new, newest);

        return 0;
}

/*
 * transfer the apparmor data to a blank set of creds: @new receives the
 * newest version of the label carried by @old
 */
static void apparmor_cred_transfer(struct cred *new, const struct cred *old)
{
        struct aa_label *newest = aa_get_newest_label(cred_label(old));

        set_cred_label(new, newest);
}

/* release the apparmor task context attached to @task */
static void apparmor_task_free(struct task_struct *task)
{
        struct aa_task_ctx *ctx = task_ctx(task);

        aa_free_task_ctx(ctx);
}

/*
 * initialise the task context of @task as a duplicate of the current
 * task's context
 *
 * @clone_flags is unused. Always returns 0.
 */
static int apparmor_task_alloc(struct task_struct *task,
                               unsigned long clone_flags)
{
        struct aa_task_ctx *child_ctx = task_ctx(task);
        struct aa_task_ctx *parent_ctx = task_ctx(current);

        aa_dup_task_ctx(child_ctx, parent_ctx);
        return 0;
}

static int apparmor_ptrace_access_check(struct task_struct *child,
                                        unsigned int mode)
{
        struct aa_label *tracer, *tracee;
        const struct cred *cred;
        int error;

        cred = get_task_cred(child);
        tracee = cred_label(cred);        /* ref count on cred */
        tracer = __begin_current_label_crit_section();
        error = aa_may_ptrace(current_cred(), tracer, cred, tracee,
                        (mode & PTRACE_MODE_READ) ? AA_PTRACE_READ
                                                  : AA_PTRACE_TRACE);
        __end_current_label_crit_section(tracer);
        put_cred(cred);

        return error;
}

/*
 * check whether @parent (the tracer) may trace the current task
 *
 * Always requests AA_PTRACE_TRACE. Returns the result of aa_may_ptrace().
 */
static int apparmor_ptrace_traceme(struct task_struct *parent)
{
        struct aa_label *tracee = __begin_current_label_crit_section();
        const struct cred *parent_cred = get_task_cred(parent);
        /* the label is kept alive by the reference held on parent_cred */
        struct aa_label *tracer = cred_label(parent_cred);
        int rc;

        rc = aa_may_ptrace(parent_cred, tracer, current_cred(), tracee,
                           AA_PTRACE_TRACE);

        put_cred(parent_cred);
        __end_current_label_crit_section(tracee);

        return rc;
}

/*
 * Derived from security/commoncap.c:cap_capget
 *
 * Narrows *@effective and *@permitted to the capabilities allowed by every
 * confining profile in @target's label that is not in complain mode.
 * @inheritable is not touched. Always returns 0.
 */
static int apparmor_capget(const struct task_struct *target, kernel_cap_t *effective,
                           kernel_cap_t *inheritable, kernel_cap_t *permitted)
{
        struct aa_profile *profile;
        struct aa_label *label;
        struct label_it it;

        rcu_read_lock();
        label = aa_get_newest_cred_label(__task_cred(target));

        /*
         * cap_capget is stacked ahead of this and will
         * initialize effective and permitted.
         */
        if (unconfined(label))
                goto out;

        label_for_each_confined(it, label, profile) {
                struct aa_ruleset *rules;

                /* complain-mode profiles do not restrict the result */
                if (COMPLAIN_MODE(profile))
                        continue;

                rules = list_first_entry(&profile->rules, typeof(*rules),
                                         list);
                *effective = cap_intersect(*effective, rules->caps.allow);
                *permitted = cap_intersect(*permitted, rules->caps.allow);
        }

out:
        rcu_read_unlock();
        aa_put_label(label);

        return 0;
}

/*
 * check capability @cap against the newest label of @cred
 *
 * An unconfined label is always allowed (returns 0); otherwise the result
 * of aa_capable() is returned. @ns is unused.
 */
static int apparmor_capable(const struct cred *cred, struct user_namespace *ns,
                            int cap, unsigned int opts)
{
        struct aa_label *label = aa_get_newest_cred_label(cred);
        int rc = 0;

        if (unconfined(label))
                goto out;

        rc = aa_capable(cred, label, cap, opts);
out:
        aa_put_label(label);
        return rc;
}

/**
 * common_perm - shared path permission check against the current label
 * @op: operation being checked
 * @path: path to check permission of  (NOT NULL)
 * @mask: requested permissions mask
 * @cond: conditional info for the permission request  (NOT NULL)
 *
 * An unconfined current label is allowed without consulting policy.
 *
 * Returns: %0 else error code if error or permission denied
 */
static int common_perm(const char *op, const struct path *path, u32 mask,
                       struct path_cond *cond)
{
        struct aa_label *subj = __begin_current_label_crit_section();
        int rc = 0;

        if (!unconfined(subj))
                rc = aa_path_perm(op, current_cred(), subj, path, 0, mask,
                                  cond);

        __end_current_label_crit_section(subj);
        return rc;
}

/**
 * common_perm_cond - permission check with the condition taken from the inode
 * @op: operation being checked
 * @path: location to check (NOT NULL)
 * @mask: requested permissions mask
 *
 * The condition is built from the backing inode of @path: its owner, mapped
 * through the idmap of the path's mount, and its mode. Paths on filesystems
 * that are not mediated are allowed without a check.
 *
 * Returns: %0 else error code if error or permission denied
 */
static int common_perm_cond(const char *op, const struct path *path, u32 mask)
{
        struct inode *inode = d_backing_inode(path->dentry);
        vfsuid_t owner = i_uid_into_vfsuid(mnt_idmap(path->mnt), inode);
        struct path_cond cond = {
                .uid = vfsuid_into_kuid(owner),
                .mode = inode->i_mode,
        };

        if (path_mediated_fs(path->dentry))
                return common_perm(op, path, mask, &cond);

        return 0;
}

/**
 * common_perm_dir_dentry - permission check for a (directory, dentry) pair
 * @op: operation being checked
 * @dir: directory of the dentry  (NOT NULL)
 * @dentry: dentry to check  (NOT NULL)
 * @mask: requested permissions mask
 * @cond: conditional info for the permission request  (NOT NULL)
 *
 * Builds a path from the mount of @dir and @dentry and checks that path.
 *
 * Returns: %0 else error code if error or permission denied
 */
static int common_perm_dir_dentry(const char *op, const struct path *dir,
                                  struct dentry *dentry, u32 mask,
                                  struct path_cond *cond)
{
        struct path target;

        target.mnt = dir->mnt;
        target.dentry = dentry;

        return common_perm(op, &target, mask, cond);
}

/**
 * common_perm_rm - permission check shared by the removal operations
 * @op: operation being checked
 * @dir: directory that the dentry is in  (NOT NULL)
 * @dentry: dentry being rm'd  (NOT NULL)
 * @mask: requested permission mask
 *
 * A dentry without a backing inode, or one on a filesystem that is not
 * mediated, is allowed without a check. Otherwise the condition carries
 * the inode's owner (mapped through the idmap of @dir's mount) and mode.
 *
 * Returns: %0 else error code if error or permission denied
 */
static int common_perm_rm(const char *op, const struct path *dir,
                          struct dentry *dentry, u32 mask)
{
        struct inode *victim = d_backing_inode(dentry);
        struct path_cond cond = { };
        vfsuid_t owner;

        if (!victim)
                return 0;
        if (!path_mediated_fs(dentry))
                return 0;

        owner = i_uid_into_vfsuid(mnt_idmap(dir->mnt), victim);
        cond.mode = victim->i_mode;
        cond.uid = vfsuid_into_kuid(owner);

        return common_perm_dir_dentry(op, dir, dentry, mask, &cond);
}

/**
 * common_perm_create - permission check shared by the creating operations
 * @op: operation being checked
 * @dir: directory that dentry will be created in  (NOT NULL)
 * @dentry: dentry to create   (NOT NULL)
 * @mask: request permission mask
 * @mode: created file mode
 *
 * The condition uses the current fsuid as owner together with @mode.
 * Directories on filesystems that are not mediated are allowed without a
 * check.
 *
 * Returns: %0 else error code if error or permission denied
 */
static int common_perm_create(const char *op, const struct path *dir,
                              struct dentry *dentry, u32 mask, umode_t mode)
{
        struct path_cond cond = {
                .uid = current_fsuid(),
                .mode = mode,
        };

        if (path_mediated_fs(dir->dentry))
                return common_perm_dir_dentry(op, dir, dentry, mask, &cond);

        return 0;
}

static int apparmor_path_unlink(const struct path *dir, struct dentry *dentry)
{
        return common_perm_rm(OP_UNLINK, dir, dentry, AA_MAY_DELETE);
}

/*
 * mkdir: request AA_MAY_CREATE for @dentry within @dir
 *
 * @mode is not consulted; the created mode handed on is always S_IFDIR.
 */
static int apparmor_path_mkdir(const struct path *dir, struct dentry *dentry,
                               umode_t mode)
{
        const umode_t created = S_IFDIR;

        return common_perm_create(OP_MKDIR, dir, dentry, AA_MAY_CREATE,
                                  created);
}

static int apparmor_path_rmdir(const struct path *dir, struct dentry *dentry)
{
        return common_perm_rm(OP_RMDIR, dir, dentry, AA_MAY_DELETE);
}

/*
 * mknod: request AA_MAY_CREATE for @dentry within @dir, using @mode as the
 * created mode
 *
 * @dev is unused.
 */
static int apparmor_path_mknod(const struct path *dir, struct dentry *dentry,
                               umode_t mode, unsigned int dev)
{
        const u32 request = AA_MAY_CREATE;

        return common_perm_create(OP_MKNOD, dir, dentry, request, mode);
}

static int apparmor_path_truncate(const struct path *path)
{
        return common_perm_cond(OP_TRUNC, path, MAY_WRITE | AA_MAY_SETATTR);
}

/* truncate of an open file: same check as a path truncate on the file's path */
static int apparmor_file_truncate(struct file *file)
{
        const struct path *path = &file->f_path;

        return apparmor_path_truncate(path);
}

static int apparmor_path_symlink(const struct path *dir, struct dentry *dentry,
                                 const char *old_name)
{
        return common_perm_create(OP_SYMLINK, dir, dentry, AA_MAY_CREATE,
                                  S_IFLNK);
}

/*
 * hard link: check creating @new_dentry in @new_dir as a link to @old_dentry
 *
 * Allowed without a check when @old_dentry is on a filesystem that is not
 * mediated, or when the current label is unconfined. Otherwise returns the
 * result of aa_path_link().
 */
static int apparmor_path_link(struct dentry *old_dentry, const struct path *new_dir,
                              struct dentry *new_dentry)
{
        struct aa_label *subj;
        int rc = 0;

        if (!path_mediated_fs(old_dentry))
                return 0;

        subj = begin_current_label_crit_section();
        if (unconfined(subj))
                goto out;

        rc = aa_path_link(current_cred(), subj, old_dentry, new_dir,
                          new_dentry);
out:
        end_current_label_crit_section(subj);
        return rc;
}

/**
 * apparmor_path_rename - check permission to rename or exchange paths
 * @old_dir: directory containing @old_dentry
 * @old_dentry: dentry being renamed
 * @new_dir: directory containing @new_dentry
 * @new_dentry: dentry at the destination
 * @flags: rename flags; only RENAME_EXCHANGE is examined here
 *
 * The check is skipped (allowed) when @old_dentry is on a filesystem that
 * is not mediated, when an exchange targets an unmediated @new_dentry, or
 * when the current label is unconfined.
 *
 * For RENAME_EXCHANGE the reverse direction is checked first: @new_dentry
 * is treated as a rename source and the old path as its destination, with
 * a condition describing @new_dentry's inode. The forward direction is
 * then checked with a condition describing @old_dentry's inode. The first
 * failing check ends the sequence.
 *
 * Returns: %0 else error code if error or permission denied
 */
static int apparmor_path_rename(const struct path *old_dir, struct dentry *old_dentry,
                                const struct path *new_dir, struct dentry *new_dentry,
                                const unsigned int flags)
{
        struct aa_label *label;
        int error = 0;

        if (!path_mediated_fs(old_dentry))
                return 0;
        if ((flags & RENAME_EXCHANGE) && !path_mediated_fs(new_dentry))
                return 0;

        label = begin_current_label_crit_section();
        if (!unconfined(label)) {
                struct mnt_idmap *idmap = mnt_idmap(old_dir->mnt);
                vfsuid_t vfsuid;
                struct path old_path = { .mnt = old_dir->mnt,
                                         .dentry = old_dentry };
                struct path new_path = { .mnt = new_dir->mnt,
                                         .dentry = new_dentry };
                /* condition describing the object that is being moved */
                struct path_cond cond = {
                        .mode = d_backing_inode(old_dentry)->i_mode
                };
                vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry));
                cond.uid = vfsuid_into_kuid(vfsuid);

                if (flags & RENAME_EXCHANGE) {
                        /*
                         * condition describing the object that moves the
                         * other way; both owner and mode must come from
                         * new_dentry's inode
                         */
                        struct path_cond cond_exchange = {
                                .mode = d_backing_inode(new_dentry)->i_mode,
                        };
                        vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(new_dentry));
                        cond_exchange.uid = vfsuid_into_kuid(vfsuid);

                        error = aa_path_perm(OP_RENAME_SRC, current_cred(),
                                             label, &new_path, 0,
                                             MAY_READ | AA_MAY_GETATTR | MAY_WRITE |
                                             AA_MAY_SETATTR | AA_MAY_DELETE,
                                             &cond_exchange);
                        if (!error)
                                error = aa_path_perm(OP_RENAME_DEST, current_cred(),
                                                     label, &old_path,
                                                     0, MAY_WRITE | AA_MAY_SETATTR |
                                                     AA_MAY_CREATE, &cond_exchange);
                }

                /* forward direction: old path is the source, new the dest */
                if (!error)
                        error = aa_path_perm(OP_RENAME_SRC, current_cred(),
                                             label, &old_path, 0,
                                             MAY_READ | AA_MAY_GETATTR | MAY_WRITE |
                                             AA_MAY_SETATTR | AA_MAY_DELETE,
                                             &cond);
                if (!error)
                        error = aa_path_perm(OP_RENAME_DEST, current_cred(),
                                             label, &new_path,
                                             0, MAY_WRITE | AA_MAY_SETATTR |
                                             AA_MAY_CREATE, &cond);

        }
        end_current_label_crit_section(label);

        return error;
}

/* chmod: request AA_MAY_CHMOD on @path; the new @mode is not consulted */
static int apparmor_path_chmod(const struct path *path, umode_t mode)
{
        const u32 request = AA_MAY_CHMOD;

        return common_perm_cond(OP_CHMOD, path, request);
}

/* chown: request AA_MAY_CHOWN on @path; @uid and @gid are not consulted */
static int apparmor_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
{
        const u32 request = AA_MAY_CHOWN;

        return common_perm_cond(OP_CHOWN, path, request);
}

static int apparmor_inode_getattr(const struct path *path)
{
        return common_perm_cond(OP_GETATTR, path, AA_MAY_GETATTR);
}

static int apparmor_file_open(struct file *file)
{
        struct aa_file_ctx *fctx = file_ctx(file);
        struct aa_label *label;
        int error = 0;

        if (!path_mediated_fs(file->f_path.dentry))
                return 0;

        /* If in exec, permission is handled by bprm hooks.
         * Cache permissions granted by the previous exec check, with
         * implicit read and executable mmap which are required to
         * actually execute the image.
         *
         * Illogically, FMODE_EXEC is in f_flags, not f_mode.
         */
        if (file->f_flags & __FMODE_EXEC) {
                fctx->allow = MAY_EXEC | MAY_READ | AA_EXEC_MMAP;
                return 0;
        }

        label = aa_get_newest_cred_label(file->f_cred);
        if (!unconfined(label)) {
                struct mnt_idmap *idmap = file_mnt_idmap(file);
                struct inode *inode = file_inode(file);
                vfsuid_t vfsuid;
                struct path_cond cond = {
                        .mode = inode->i_mode,
                };
                vfsuid = i_uid_into_vfsuid(idmap, inode);
                cond.uid = vfsuid_into_kuid(vfsuid);

                error = aa_path_perm(OP_OPEN, file->f_cred,
                                     label, &file->f_path, 0,
                                     aa_map_file_to_perms(file), &cond);
                /* todo cache full allowed permissions set and state */
                fctx->allow = aa_map_file_to_perms(file);
        }
        aa_put_label(label);

        return error;
}

/*
 * set up the apparmor context of a new @file: initialise its lock and
 * store a counted reference to the current label
 *
 * Always returns 0.
 */
static int apparmor_file_alloc_security(struct file *file)
{
        struct aa_file_ctx *fctx = file_ctx(file);
        struct aa_label *cur;

        cur = begin_current_label_crit_section();

        spin_lock_init(&fctx->lock);
        /* take a reference of our own for the pointer stored in the ctx */
        rcu_assign_pointer(fctx->label, aa_get_label(cur));

        end_current_label_crit_section(cur);

        return 0;
}

/* drop the label reference held by the apparmor context of @file, if any */
static void apparmor_file_free_security(struct file *file)
{
        struct aa_file_ctx *fctx = file_ctx(file);

        if (!fctx)
                return;

        aa_put_label(rcu_access_pointer(fctx->label));
}

/*
 * shared permission check on an open file against the current label
 *
 * @op: operation being checked
 * @file: file to check
 * @mask: requested permissions mask
 * @in_atomic: handed through unchanged to aa_file_perm()
 *
 * Returns -EACCES for a file whose dentry is aa_null's dentry, otherwise
 * the result of aa_file_perm().
 */
static int common_file_perm(const char *op, struct file *file, u32 mask,
                            bool in_atomic)
{
        struct aa_label *subj;
        int rc;

        /* don't reaudit files closed during inheritance */
        if (file->f_path.dentry == aa_null.dentry)
                return -EACCES;

        subj = __begin_current_label_crit_section();
        rc = aa_file_perm(op, current_cred(), subj, file, mask, in_atomic);
        __end_current_label_crit_section(subj);

        return rc;
}

static int apparmor_file_receive(struct file *file)
{
        return common_file_perm(OP_FRECEIVE, file, aa_map_file_to_perms(file),
                                false);
}

static int apparmor_file_permission(struct file *file, int mask)
{
        return common_file_perm(OP_FPERM, file, mask, false);
}

/*
 * file locking: always request AA_MAY_LOCK, and MAY_WRITE as well when
 * @cmd is F_WRLCK
 */
static int apparmor_file_lock(struct file *file, unsigned int cmd)
{
        u32 request;

        if (cmd == F_WRLCK)
                request = AA_MAY_LOCK | MAY_WRITE;
        else
                request = AA_MAY_LOCK;

        return common_file_perm(OP_FLOCK, file, request, false);
}

/*
 * shared check for mapping @file with protection @prot
 *
 * Translates @prot into a permission mask: PROT_READ gives MAY_READ,
 * PROT_WRITE on a non-private mapping gives MAY_WRITE, and PROT_EXEC gives
 * AA_EXEC_MMAP. Mappings with no file, or a file without an apparmor
 * context, are allowed without a check. @in_atomic is handed on to
 * common_file_perm().
 */
static int common_mmap(const char *op, struct file *file, unsigned long prot,
                       unsigned long flags, bool in_atomic)
{
        /*
         * Private mappings don't require write perms since they don't
         * write back to the files
         */
        const bool shared_write = (prot & PROT_WRITE) && !(flags & MAP_PRIVATE);
        int mask = 0;

        if (!file)
                return 0;
        if (!file_ctx(file))
                return 0;

        if (prot & PROT_READ)
                mask |= MAY_READ;
        if (shared_write)
                mask |= MAY_WRITE;
        if (prot & PROT_EXEC)
                mask |= AA_EXEC_MMAP;

        return common_file_perm(op, file, mask, in_atomic);
}

static int apparmor_mmap_file(struct file *file, unsigned long reqprot,
                              unsigned long prot, unsigned long flags)
{
        return common_mmap(OP_FMMAP, file, prot, flags, GFP_ATOMIC);
}

/**
 * apparmor_file_mprotect - file_mprotect hook
 * @vma: mapping whose protection is being changed
 * @reqprot: protection requested by the caller (not used here)
 * @prot: protection bits used to build the permission mask
 *
 * A mapping without VM_SHARED is treated as MAP_PRIVATE.
 *
 * Returns: result of common_mmap() for OP_FMPROT
 */
static int apparmor_file_mprotect(struct vm_area_struct *vma,
                                  unsigned long reqprot, unsigned long prot)
{
        unsigned long map_flags = 0;

        if (!(vma->vm_flags & VM_SHARED))
                map_flags = MAP_PRIVATE;

        return common_mmap(OP_FMPROT, vma->vm_file, prot, map_flags, false);
}

#ifdef CONFIG_IO_URING
/*
 * audit_uring_mask - name the io_uring permission found in @mask
 * @mask: permission bits to describe
 *
 * AA_MAY_CREATE_SQPOLL takes precedence over AA_MAY_OVERRIDE_CRED.
 *
 * Returns: "sqpoll", "override_creds", or "" when neither bit is set
 */
static const char *audit_uring_mask(u32 mask)
{
        const char *name = "";

        if (mask & AA_MAY_CREATE_SQPOLL)
                name = "sqpoll";
        else if (mask & AA_MAY_OVERRIDE_CRED)
                name = "override_creds";

        return name;
}

static void audit_uring_cb(struct audit_buffer *ab, void *va)
{
        struct apparmor_audit_data *ad = aad_of_va(va);

        if (ad->request & AA_URING_PERM_MASK) {
                audit_log_format(ab, " requested=\"%s\"",
                                 audit_uring_mask(ad->request));
                if (ad->denied & AA_URING_PERM_MASK) {
                        audit_log_format(ab, " denied=\"%s\"",
                                         audit_uring_mask(ad->denied));
                }
        }
        if (ad->uring.target) {
                audit_log_format(ab, " tcontext=");
                aa_label_xaudit(ab, labels_ns(ad->subj_label),
                                ad->uring.target,
                                FLAGS_NONE, GFP_ATOMIC);
        }
}

/*
 * profile_uring - check an io_uring request against a single profile
 * @profile: profile to check (NOT NULL)
 * @request: io_uring permission being requested
 * @new: target label to match against, or NULL to use the state's perms
 * @cap: not referenced by this function
 * @ad: audit data passed on to aa_check_perms()
 *
 * Returns: 0 when the profile does not mediate AA_CLASS_IO_URING, otherwise
 *          the result of aa_check_perms()
 */
static int profile_uring(struct aa_profile *profile, u32 request,
                         struct aa_label *new, int cap,
                         struct apparmor_audit_data *ad)
{
        struct aa_perms perms = { };
        struct aa_ruleset *rules;
        unsigned int state;

        AA_BUG(!profile);

        rules = list_first_entry(&profile->rules, typeof(*rules), list);
        state = RULE_MEDIATES(rules, AA_CLASS_IO_URING);
        if (!state)
                return 0;

        if (new)
                aa_label_match(profile, rules, new, state, false, request,
                               &perms);
        else
                perms = *aa_lookup_perms(rules->policy, state);

        aa_apply_modes_to_perms(profile, &perms);

        return aa_check_perms(profile, &perms, request, ad, audit_uring_cb);
}

/**
 * apparmor_uring_override_creds - check the requested cred override
 * @new: the target creds
 *
 * Check to see if the current task is allowed to override its credentials
 * to service an io_uring operation.
 *
 * Returns: result of running profile_uring() over the profiles of the
 *          current label
 */
static int apparmor_uring_override_creds(const struct cred *new)
{
        DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING,
                          OP_URING_OVERRIDE);
        struct aa_profile *profile;
        struct aa_label *cur;
        int rc;

        /* the label of the target creds is reported by audit_uring_cb() */
        ad.uring.target = cred_label(new);

        cur = __begin_current_label_crit_section();
        rc = fn_for_each(cur, profile,
                        profile_uring(profile, AA_MAY_OVERRIDE_CRED,
                                      cred_label(new), CAP_SYS_ADMIN, &ad));
        __end_current_label_crit_section(cur);

        return rc;
}

/**
 * apparmor_uring_sqpoll - check if an io_uring polling thread can be created
 *
 * Check to see if the current task is allowed to create a new io_uring
 * kernel polling thread.
 *
 * Returns: result of running profile_uring() over the profiles of the
 *          current label
 */
static int apparmor_uring_sqpoll(void)
{
        DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING,
                          OP_URING_SQPOLL);
        struct aa_profile *profile;
        struct aa_label *cur;
        int rc;

        cur = __begin_current_label_crit_section();
        /* no target label: profile_uring() uses the state's own perms */
        rc = fn_for_each(cur, profile,
                        profile_uring(profile, AA_MAY_CREATE_SQPOLL,
                                      NULL, CAP_SYS_ADMIN, &ad));
        __end_current_label_crit_section(cur);

        return rc;
}
#endif /* CONFIG_IO_URING */

/**
 * apparmor_sb_mount - sb_mount hook
 * @dev_name: device name of the mount source
 * @path: mount point
 * @type: filesystem type
 * @flags: mount flags
 * @data: filesystem specific mount data
 *
 * Dispatches to the matching aa_* mount helper based on @flags; the checks
 * are made in the order remount, bind, propagation change, move, new mount.
 * Nothing is checked when the current label is unconfined.
 *
 * Returns: 0 when unconfined, otherwise the result of the selected helper
 */
static int apparmor_sb_mount(const char *dev_name, const struct path *path,
                             const char *type, unsigned long flags, void *data)
{
        const unsigned long propagation = MS_SHARED | MS_PRIVATE | MS_SLAVE |
                                          MS_UNBINDABLE;
        struct aa_label *cur;
        int rc = 0;

        /* Discard magic */
        if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
                flags &= ~MS_MGC_MSK;

        flags &= ~AA_MS_IGNORE_MASK;

        cur = __begin_current_label_crit_section();
        if (unconfined(cur))
                goto out;

        if (flags & MS_REMOUNT)
                rc = aa_remount(current_cred(), cur, path, flags, data);
        else if (flags & MS_BIND)
                rc = aa_bind_mount(current_cred(), cur, path, dev_name,
                                   flags);
        else if (flags & propagation)
                rc = aa_mount_change_type(current_cred(), cur, path, flags);
        else if (flags & MS_MOVE)
                rc = aa_move_mount_old(current_cred(), cur, path, dev_name);
        else
                rc = aa_new_mount(current_cred(), cur, dev_name, path, type,
                                  flags, data);
out:
        __end_current_label_crit_section(cur);

        return rc;
}

/**
 * apparmor_move_mount - move_mount hook
 * @from_path: path the mount is moved from
 * @to_path: path the mount is moved to
 *
 * Returns: 0 when the current label is unconfined, otherwise the result of
 *          aa_move_mount()
 */
static int apparmor_move_mount(const struct path *from_path,
                               const struct path *to_path)
{
        struct aa_label *cur = __begin_current_label_crit_section();
        int rc = 0;

        if (!unconfined(cur))
                rc = aa_move_mount(current_cred(), cur, from_path, to_path);

        __end_current_label_crit_section(cur);

        return rc;
}

/**
 * apparmor_sb_umount - sb_umount hook
 * @mnt: mount being unmounted
 * @flags: umount flags
 *
 * Returns: 0 when the current label is unconfined, otherwise the result of
 *          aa_umount()
 */
static int apparmor_sb_umount(struct vfsmount *mnt, int flags)
{
        struct aa_label *cur = __begin_current_label_crit_section();
        int rc = 0;

        if (!unconfined(cur))
                rc = aa_umount(current_cred(), cur, mnt, flags);

        __end_current_label_crit_section(cur);

        return rc;
}

/**
 * apparmor_sb_pivotroot - sb_pivotroot hook
 * @old_path: path for the old root
 * @new_path: path for the new root
 *
 * Takes a reference on the current label for the duration of the check.
 *
 * Returns: 0 when the current label is unconfined, otherwise the result of
 *          aa_pivotroot()
 */
static int apparmor_sb_pivotroot(const struct path *old_path,
                                 const struct path *new_path)
{
        struct aa_label *cur = aa_get_current_label();
        int rc = 0;

        if (!unconfined(cur))
                rc = aa_pivotroot(current_cred(), cur, old_path, new_path);

        aa_put_label(cur);

        return rc;
}

/**
 * apparmor_getselfattr - getselfattr hook
 * @attr: LSM_ATTR_CURRENT, LSM_ATTR_PREV or LSM_ATTR_EXEC
 * @lx: user space destination handed to lsm_fill_user_ctx()
 * @size: size argument handed to lsm_fill_user_ctx()
 * @flags: not referenced by this function
 *
 * Returns: 1 on success; -EOPNOTSUPP for an unknown @attr; -ENOENT when the
 *          requested label is not set; otherwise the negative error from
 *          aa_getprocattr() or lsm_fill_user_ctx()
 */
static int apparmor_getselfattr(unsigned int attr, struct lsm_ctx __user *lx,
                                u32 *size, u32 flags)
{
        struct aa_task_ctx *tctx = task_ctx(current);
        struct aa_label *label = NULL;
        char *text = NULL;
        int rc = -ENOENT;

        if (attr == LSM_ATTR_CURRENT) {
                label = aa_get_newest_label(cred_label(current_cred()));
        } else if (attr == LSM_ATTR_PREV) {
                if (tctx->previous)
                        label = aa_get_newest_label(tctx->previous);
        } else if (attr == LSM_ATTR_EXEC) {
                if (tctx->onexec)
                        label = aa_get_newest_label(tctx->onexec);
        } else {
                rc = -EOPNOTSUPP;
        }

        if (label) {
                rc = aa_getprocattr(label, &text, false);
                /* a positive result is the length passed on as the value size */
                if (rc > 0)
                        rc = lsm_fill_user_ctx(lx, size, text, rc,
                                               LSM_ID_APPARMOR, 0);
                kfree(text);
        }

        aa_put_label(label);

        return rc < 0 ? rc : 1;
}

/**
 * apparmor_getprocattr - getprocattr hook
 * @task: task whose creds supply the "current" label
 * @name: attribute name: "current", "prev" or "exec"
 * @value: output handed to aa_getprocattr()
 *
 * NOTE(review): the "prev" and "exec" labels are read from the task context
 * of current, not of @task - confirm this is intended.
 *
 * Returns: result of aa_getprocattr() when a label was found, otherwise
 *          -ENOENT for "current" without a label, or -EINVAL
 */
static int apparmor_getprocattr(struct task_struct *task, const char *name,
                                char **value)
{
        /* reference dropped by put_cred() before returning */
        const struct cred *tcred = get_task_cred(task);
        struct aa_task_ctx *tctx = task_ctx(current);
        struct aa_label *label = NULL;
        int rc = -ENOENT;

        if (!strcmp(name, "current"))
                label = aa_get_newest_label(cred_label(tcred));
        else if (!strcmp(name, "prev") && tctx->previous)
                label = aa_get_newest_label(tctx->previous);
        else if (!strcmp(name, "exec") && tctx->onexec)
                label = aa_get_newest_label(tctx->onexec);
        else
                rc = -EINVAL;

        if (label)
                rc = aa_getprocattr(label, value, true);

        aa_put_label(label);
        put_cred(tcred);

        return rc;
}

/*
 * do_setattr - parse and execute a "<command> <args>" attribute write
 * @attr: LSM_ATTR_CURRENT or LSM_ATTR_EXEC; anything else is audited and
 *        rejected
 * @value: buffer holding the command string (need not be NUL terminated)
 * @size: number of bytes in @value
 *
 * Returns: @size on success, -EINVAL for an empty/malformed/unknown request,
 *          -ENOMEM if the terminated copy cannot be allocated, otherwise the
 *          error from the change hat / change profile helper
 */
static int do_setattr(u64 attr, void *value, size_t size)
{
        char *command, *largs = NULL, *args = value;
        size_t arg_size;
        int error;
        DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_NONE,
                          OP_SETPROCATTR);

        if (size == 0)
                return -EINVAL;

        /* AppArmor requires that the buffer must be null terminated atm */
        if (args[size - 1] != '\0') {
                /* null terminate */
                /* largs remembers the copy so it can be freed at out: */
                largs = args = kmalloc(size + 1, GFP_KERNEL);
                if (!args)
                        return -ENOMEM;
                memcpy(args, value, size);
                args[size] = '\0';
        }

        error = -EINVAL;
        args = strim(args);
        /* split off the first space separated word as the command */
        command = strsep(&args, " ");
        /* no separator found: there are no arguments */
        if (!args)
                goto out;
        args = skip_spaces(args);
        /* only whitespace followed the command */
        if (!*args)
                goto out;

        /* bytes from the start of the arguments to the end of the input */
        arg_size = size - (args - (largs ? largs : (char *) value));
        if (attr == LSM_ATTR_CURRENT) {
                if (strcmp(command, "changehat") == 0) {
                        error = aa_setprocattr_changehat(args, arg_size,
                                                         AA_CHANGE_NOFLAGS);
                } else if (strcmp(command, "permhat") == 0) {
                        error = aa_setprocattr_changehat(args, arg_size,
                                                         AA_CHANGE_TEST);
                } else if (strcmp(command, "changeprofile") == 0) {
                        error = aa_change_profile(args, AA_CHANGE_NOFLAGS);
                } else if (strcmp(command, "permprofile") == 0) {
                        error = aa_change_profile(args, AA_CHANGE_TEST);
                } else if (strcmp(command, "stack") == 0) {
                        error = aa_change_profile(args, AA_CHANGE_STACK);
                } else
                        goto fail;
        } else if (attr == LSM_ATTR_EXEC) {
                if (strcmp(command, "exec") == 0)
                        error = aa_change_profile(args, AA_CHANGE_ONEXEC);
                else if (strcmp(command, "stack") == 0)
                        error = aa_change_profile(args, (AA_CHANGE_ONEXEC |
                                                         AA_CHANGE_STACK));
                else
                        goto fail;
        } else
                /* only support the "current" and "exec" process attributes */
                goto fail;

        /* success is reported as the number of bytes consumed */
        if (!error)
                error = size;
out:
        /* largs is NULL unless a terminated copy was made above */
        kfree(largs);
        return error;

fail:
        /* unknown command or attribute: audit the denial, then clean up */
        ad.subj_label = begin_current_label_crit_section();
        if (attr == LSM_ATTR_CURRENT)
                ad.info = "current";
        else if (attr == LSM_ATTR_EXEC)
                ad.info = "exec";
        else
                ad.info = "invalid";
        ad.error = error = -EINVAL;
        aa_audit_msg(AUDIT_APPARMOR_DENIED, &ad, NULL);
        end_current_label_crit_section(ad.subj_label);
        goto out;
}

/**
 * apparmor_setselfattr - setselfattr hook
 * @attr: attribute to set; only LSM_ATTR_CURRENT and LSM_ATTR_EXEC are
 *        accepted
 * @ctx: LSM context carrying the command string and its length
 * @size: not referenced by this function
 * @flags: not referenced by this function
 *
 * Returns: 0 on success, -EOPNOTSUPP for other attributes, otherwise the
 *          non-positive result of do_setattr()
 */
static int apparmor_setselfattr(unsigned int attr, struct lsm_ctx *ctx,
                                u32 size, u32 flags)
{
        int rc;

        switch (attr) {
        case LSM_ATTR_CURRENT:
        case LSM_ATTR_EXEC:
                break;
        default:
                return -EOPNOTSUPP;
        }

        rc = do_setattr(attr, ctx->ctx, ctx->ctx_len);

        /* do_setattr() reports success as a positive byte count */
        return rc > 0 ? 0 : rc;
}

/**
 * apparmor_setprocattr - setprocattr hook
 * @name: attribute name, translated with lsm_name_to_attr()
 * @value: buffer holding the command string
 * @size: number of bytes in @value
 *
 * Returns: -EINVAL when @name maps to no attribute, otherwise the result of
 *          do_setattr()
 */
static int apparmor_setprocattr(const char *name, void *value,
                                size_t size)
{
        int attr = lsm_name_to_attr(name);

        if (!attr)
                return -EINVAL;

        return do_setattr(attr, value, size);
}

/**
 * apparmor_bprm_committing_creds - do task cleanup on committing new creds
 * @bprm: binprm for the exec  (NOT NULL)
 *
 * Does nothing when the new label shares its proxy with the current label
 * or is unconfined.
 */
static void apparmor_bprm_committing_creds(const struct linux_binprm *bprm)
{
        struct aa_label *old_label = aa_current_raw_label();
        struct aa_label *new_label = cred_label(bprm->cred);

        /* not changing profile */
        if (new_label->proxy == old_label->proxy)
                return;
        /* transitioning to unconfined */
        if (unconfined(new_label))
                return;

        aa_inherit_files(bprm->cred, current->files);

        current->pdeath_signal = 0;

        /* reset soft limits and set hard limits for the new label */
        __aa_transition_rlimits(old_label, new_label);
}

/**
 * apparmor_bprm_committed_creds() - do cleanup after new creds committed
 * @bprm: binprm for the exec  (NOT NULL)
 *
 * Clears the temporary/transitional state held in the current task's
 * context.
 */
static void apparmor_bprm_committed_creds(const struct linux_binprm *bprm)
{
        struct aa_task_ctx *tctx = task_ctx(current);

        aa_clear_task_ctx_trans(tctx);
}

/**
 * apparmor_current_getsecid_subj - current_getsecid_subj hook
 * @secid: output - secid of the current label
 */
static void apparmor_current_getsecid_subj(u32 *secid)
{
        struct aa_label *cur;

        cur = __begin_current_label_crit_section();
        *secid = cur->secid;
        __end_current_label_crit_section(cur);
}

/**
 * apparmor_task_getsecid_obj - task_getsecid_obj hook
 * @p: task to query
 * @secid: output - secid of the label returned by aa_get_task_label()
 */
static void apparmor_task_getsecid_obj(struct task_struct *p, u32 *secid)
{
        struct aa_label *tl;

        tl = aa_get_task_label(p);
        *secid = tl->secid;
        aa_put_label(tl);
}

/**
 * apparmor_task_setrlimit - task_setrlimit hook
 * @task: task whose limit is being changed
 * @resource: resource index
 * @new_rlim: proposed new limit
 *
 * Returns: 0 when the current label is unconfined, otherwise the result of
 *          aa_task_setrlimit()
 */
static int apparmor_task_setrlimit(struct task_struct *task,
                unsigned int resource, struct rlimit *new_rlim)
{
        struct aa_label *cur;
        int rc = 0;

        cur = __begin_current_label_crit_section();
        if (!unconfined(cur))
                rc = aa_task_setrlimit(current_cred(), cur, task, resource,
                                       new_rlim);
        __end_current_label_crit_section(cur);

        return rc;
}

/**
 * apparmor_task_kill - task_kill hook
 * @target: task receiving the signal
 * @info: signal information (not referenced by this function)
 * @sig: signal number
 * @cred: creds of the sender, or NULL to use the current task
 *
 * Returns: result of aa_may_signal()
 */
static int apparmor_task_kill(struct task_struct *target, struct kernel_siginfo *info,
                              int sig, const struct cred *cred)
{
        const struct cred *target_cred = get_task_cred(target);
        struct aa_label *target_label = aa_get_newest_cred_label(target_cred);
        struct aa_label *sender;
        int rc;

        if (!cred) {
                sender = __begin_current_label_crit_section();
                rc = aa_may_signal(current_cred(), sender, target_cred,
                                   target_label, sig);
                __end_current_label_crit_section(sender);
        } else {
                /*
                 * Dealing with USB IO specific behavior
                 */
                sender = aa_get_newest_cred_label(cred);
                rc = aa_may_signal(cred, sender, target_cred, target_label,
                                   sig);
                aa_put_label(sender);
        }

        aa_put_label(target_label);
        put_cred(target_cred);

        return rc;
}

/**
 * apparmor_userns_create - userns_create hook
 * @cred: creds for the new namespace (not referenced by this function)
 *
 * Returns: 0 when the current label is unconfined, otherwise the result of
 *          running aa_profile_ns_perm() over the profiles of the current
 *          label
 */
static int apparmor_userns_create(const struct cred *cred)
{
        DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_TASK, AA_CLASS_NS,
                          OP_USERNS_CREATE);
        struct aa_profile *profile;
        struct aa_label *cur;
        int rc = 0;

        ad.subj_cred = current_cred();

        cur = begin_current_label_crit_section();
        if (!unconfined(cur))
                rc = fn_for_each(cur, profile,
                                 aa_profile_ns_perm(profile, &ad,
                                                    AA_USERNS_CREATE));
        end_current_label_crit_section(cur);

        return rc;
}

/**
 * apparmor_sk_alloc_security - sk_alloc_security hook
 * @sk: sock getting a security context
 * @family: address family (not referenced by this function)
 * @flags: allocation flags for the context
 *
 * Returns: 0 on success, -ENOMEM when the zeroed context cannot be allocated
 */
static int apparmor_sk_alloc_security(struct sock *sk, int family, gfp_t flags)
{
        struct aa_sk_ctx *sctx = kzalloc(sizeof(*sctx), flags);

        if (sctx) {
                sk->sk_security = sctx;
                return 0;
        }

        return -ENOMEM;
}

/**
 * apparmor_sk_free_security - sk_free_security hook
 * @sk: sock whose security context is being released
 *
 * Detaches the context from @sk, puts both labels it holds and frees it.
 */
static void apparmor_sk_free_security(struct sock *sk)
{
        /* fetch the context before the pointer is cleared below */
        struct aa_sk_ctx *sctx = aa_sock(sk);

        sk->sk_security = NULL;

        aa_put_label(sctx->label);
        aa_put_label(sctx->peer);

        kfree(sctx);
}

/**
 * apparmor_sk_clone_security - clone the sk_security field
 * @sk: sock to have security cloned
 * @newsk: sock getting clone
 *
 * Any label or peer already held by @newsk is put before it is replaced
 * with a new reference to the one from @sk.
 */
static void apparmor_sk_clone_security(const struct sock *sk,
                                       struct sock *newsk)
{
        struct aa_sk_ctx *src = aa_sock(sk);
        struct aa_sk_ctx *dst = aa_sock(newsk);

        /* socket label */
        if (dst->label)
                aa_put_label(dst->label);
        dst->label = aa_get_label(src->label);

        /* peer label */
        if (dst->peer)
                aa_put_label(dst->peer);
        dst->peer = aa_get_label(src->peer);
}

/**
 * apparmor_socket_create - socket_create hook
 * @family: requested address family
 * @type: requested socket type
 * @protocol: requested protocol
 * @kern: non-zero for kernel sockets, which are not checked
 *
 * Returns: 0 for kernel sockets or an unconfined current label, otherwise
 *          the result of the check selected by af_select()
 */
static int apparmor_socket_create(int family, int type, int protocol, int kern)
{
        struct aa_label *cur;
        int rc = 0;

        AA_BUG(in_interrupt());

        cur = begin_current_label_crit_section();
        if (!kern && !unconfined(cur))
                rc = af_select(family,
                               create_perm(cur, family, type, protocol),
                               aa_af_perm(current_cred(), cur,
                                          OP_CREATE, AA_MAY_CREATE,
                                          family, type, protocol));
        end_current_label_crit_section(cur);

        return rc;
}

/**
 * apparmor_socket_post_create - setup the per-socket security struct
 * @sock: socket that is being setup
 * @family: family of socket being created
 * @type: type of the socket
 * @protocol: protocol of the socket
 * @kern: socket is a special kernel socket
 *
 * Note:
 * -   kernel sockets labeled kernel_t used to use unconfined
 * -   socket may not have sk here if created with sock_create_lite or
 *     sock_alloc. These should be accept cases which will be handled in
 *     sock_graft.
 *
 * Returns: 0 always
 */
static int apparmor_socket_post_create(struct socket *sock, int family,
                                       int type, int protocol, int kern)
{
        struct aa_label *owner;

        if (!kern)
                owner = aa_get_current_label();
        else
                owner = aa_get_label(kernel_t);

        if (sock->sk) {
                struct aa_sk_ctx *sctx = aa_sock(sock->sk);

                /* replace whatever label the context held before */
                aa_put_label(sctx->label);
                sctx->label = aa_get_label(owner);
        }

        /* drop the reference taken above; the context holds its own */
        aa_put_label(owner);

        return 0;
}

static int apparmor_socket_bind(struct socket *sock,
                                struct sockaddr *address, int addrlen)
{
        AA_BUG(!sock);
        AA_BUG(!sock->sk);
        AA_BUG(!address);
        AA_BUG(in_interrupt());

        return af_select(sock->sk->sk_family,
                         bind_perm(sock, address, addrlen),
                         aa_sk_perm(OP_BIND, AA_MAY_BIND, sock->sk));
}

static int apparmor_socket_connect(struct socket *sock,
                                   struct sockaddr *address, int addrlen)
{
        AA_BUG(!sock);
        AA_BUG(!sock->sk);
        AA_BUG(!address);
        AA_BUG(in_interrupt());

        return af_select(sock->sk->sk_family,
                         connect_perm(sock, address, addrlen),
                         aa_sk_perm(OP_CONNECT, AA_MAY_CONNECT, sock->sk));
}

static int apparmor_socket_listen(struct socket *sock, int backlog)
{
        AA_BUG(!sock);
        AA_BUG(!sock->sk);
        AA_BUG(in_interrupt());

        return af_select(sock->sk->sk_family,
                         listen_perm(sock, backlog),
                         aa_sk_perm(OP_LISTEN, AA_MAY_LISTEN, sock->sk));
}

/*
 * Note: while @newsock is created and has some information, the accept
 *       has not been done.
 */
static int apparmor_socket_accept(struct socket *sock, struct socket *newsock)
{
        AA_BUG(!sock);
        AA_BUG(!sock->sk);
        AA_BUG(!newsock);
        AA_BUG(in_interrupt());

        return af_select(sock->sk->sk_family,
                         accept_perm(sock, newsock),
                         aa_sk_perm(OP_ACCEPT, AA_MAY_ACCEPT, sock->sk));
}

/*
 * aa_sock_msg_perm - shared check for sending/receiving a message
 * @op: operation name
 * @request: permission being requested
 * @sock: socket the message goes through (NOT NULL, with a sk)
 * @msg: message header (NOT NULL)
 * @size: message size
 *
 * Returns: result of the check selected by af_select() for the socket family
 */
static int aa_sock_msg_perm(const char *op, u32 request, struct socket *sock,
                            struct msghdr *msg, int size)
{
        int error;

        AA_BUG(!sock);
        AA_BUG(!sock->sk);
        AA_BUG(!msg);
        AA_BUG(in_interrupt());

        error = af_select(sock->sk->sk_family,
                          msg_perm(op, request, sock, msg, size),
                          aa_sk_perm(op, request, sock->sk));

        return error;
}

static int apparmor_socket_sendmsg(struct socket *sock,
                                   struct msghdr *msg, int size)
{
        return aa_sock_msg_perm(OP_SENDMSG, AA_MAY_SEND, sock, msg, size);
}

static int apparmor_socket_recvmsg(struct socket *sock,
                                   struct msghdr *msg, int size, int flags)
{
        return aa_sock_msg_perm(OP_RECVMSG, AA_MAY_RECEIVE, sock, msg, size);
}

/*
 * aa_sock_perm - shared check for revalidation, get/set attr and shutdown
 * @op: operation name
 * @request: permission being requested
 * @sock: socket being checked (NOT NULL, with a sk)
 *
 * Returns: result of the check selected by af_select() for the socket family
 */
static int aa_sock_perm(const char *op, u32 request, struct socket *sock)
{
        int error;

        AA_BUG(!sock);
        AA_BUG(!sock->sk);
        AA_BUG(in_interrupt());

        error = af_select(sock->sk->sk_family,
                          sock_perm(op, request, sock),
                          aa_sk_perm(op, request, sock->sk));

        return error;
}

static int apparmor_socket_getsockname(struct socket *sock)
{
        return aa_sock_perm(OP_GETSOCKNAME, AA_MAY_GETATTR, sock);
}

static int apparmor_socket_getpeername(struct socket *sock)
{
        return aa_sock_perm(OP_GETPEERNAME, AA_MAY_GETATTR, sock);
}

/*
 * aa_sock_opt_perm - shared check for getting/setting socket options
 * @op: operation name
 * @request: permission being requested
 * @sock: socket being checked (NOT NULL, with a sk)
 * @level: option level
 * @optname: option name
 *
 * Returns: result of the check selected by af_select() for the socket family
 */
static int aa_sock_opt_perm(const char *op, u32 request, struct socket *sock,
                            int level, int optname)
{
        int error;

        AA_BUG(!sock);
        AA_BUG(!sock->sk);
        AA_BUG(in_interrupt());

        error = af_select(sock->sk->sk_family,
                          opt_perm(op, request, sock, level, optname),
                          aa_sk_perm(op, request, sock->sk));

        return error;
}

static int apparmor_socket_getsockopt(struct socket *sock, int level,
                                      int optname)
{
        return aa_sock_opt_perm(OP_GETSOCKOPT, AA_MAY_GETOPT, sock,
                                level, optname);
}

static int apparmor_socket_setsockopt(struct socket *sock, int level,
                                      int optname)
{
        return aa_sock_opt_perm(OP_SETSOCKOPT, AA_MAY_SETOPT, sock,
                                level, optname);
}

static int apparmor_socket_shutdown(struct socket *sock, int how)
{
        return aa_sock_perm(OP_SHUTDOWN, AA_MAY_SHUTDOWN, sock);
}

#ifdef CONFIG_NETWORK_SECMARK
/**
 * apparmor_socket_sock_rcv_skb - check perms before associating skb to sk
 * @sk: sk to associate @skb with
 * @skb: skb to check for perms
 *
 * Note: can not sleep may be called with locks held
 *
 * dont want protocol specific in __skb_recv_datagram()
 * to deny an incoming connection  socket_sock_rcv_skb()
 *
 * Returns: 0 when @skb carries no secmark, otherwise the result of
 *          apparmor_secmark_check() against the socket's label
 */
static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        struct aa_sk_ctx *sctx = aa_sock(sk);

        if (skb->secmark)
                return apparmor_secmark_check(sctx->label, OP_RECVMSG,
                                              AA_MAY_RECEIVE, skb->secmark,
                                              sk);

        return 0;
}
#endif


/*
 * sk_peer_label - look up the peer label stored in a sock's context
 * @sk: sock to query
 *
 * Returns: the stored peer label pointer as is (no get is done here), or
 *          ERR_PTR(-ENOPROTOOPT) when no peer label is set
 */
static struct aa_label *sk_peer_label(struct sock *sk)
{
        struct aa_sk_ctx *sctx = aa_sock(sk);

        if (!sctx->peer)
                return ERR_PTR(-ENOPROTOOPT);

        return sctx->peer;
}

/**
 * apparmor_socket_getpeersec_stream - get security context of peer
 * @sock: socket that we are trying to get the peer context of
 * @optval: output - buffer to copy peer name to
 * @optlen: output - size of copied name in @optval
 * @len: size of @optval buffer
 *
 * Returns: 0 on success, -errno of failure: the sk_peer_label() error when
 *          there is no peer, -ENOMEM when the name cannot be built, -ERANGE
 *          when it does not fit in @len (the needed size is still written
 *          to @optlen), -EFAULT when copying out fails
 *
 * Note: for tcp only valid if using ipsec or cipso on lan
 */
static int apparmor_socket_getpeersec_stream(struct socket *sock,
                                             sockptr_t optval, sockptr_t optlen,
                                             unsigned int len)
{
        struct aa_label *cur, *peer;
        char *text = NULL;
        int rc = 0;
        int slen;

        cur = begin_current_label_crit_section();
        peer = sk_peer_label(sock->sk);
        if (IS_ERR(peer)) {
                rc = PTR_ERR(peer);
                goto out;
        }

        /* don't include terminating \0 in slen, it breaks some apps */
        slen = aa_label_asxprint(&text, labels_ns(cur), peer,
                                 FLAG_SHOW_MODE | FLAG_VIEW_SUBNS |
                                 FLAG_HIDDEN_UNCONFINED, GFP_KERNEL);
        if (slen < 0) {
                rc = -ENOMEM;
                goto out;
        }

        /* the name is only copied out when it fits the caller's buffer */
        if (slen > len)
                rc = -ERANGE;
        else if (copy_to_sockptr(optval, text, slen))
                rc = -EFAULT;

        /* the length is reported for both the fitting and -ERANGE cases */
        if (copy_to_sockptr(optlen, &slen, sizeof(slen)))
                rc = -EFAULT;
out:
        end_current_label_crit_section(cur);
        kfree(text);
        return rc;
}

/**
 * apparmor_socket_getpeersec_dgram - get security label of packet
 * @sock: the peer socket
 * @skb: packet data
 * @secid: pointer to where to put the secid of the packet
 *
 * Not implemented: none of the arguments are used and @secid is never
 * written.
 *
 * Returns: -ENOPROTOOPT always
 */
static int apparmor_socket_getpeersec_dgram(struct socket *sock,
                                            struct sk_buff *skb, u32 *secid)
{
        /* TODO: requires secid support */
        return -ENOPROTOOPT;
}

/**
 * apparmor_sock_graft - Initialize newly created socket
 * @sk: child sock
 * @parent: parent socket
 *
 * A sock that already has a label is left untouched; otherwise it is
 * labeled with the current task's label.
 *
 * Note: could set off of SOCK_CTX(parent) but need to track inode and we can
 *       just set sk security information off of current creating process label
 *       Labeling of sk for accept case - probably should be sock based
 *       instead of task, because of the case where an implicitly labeled
 *       socket is shared by different tasks.
 */
static void apparmor_sock_graft(struct sock *sk, struct socket *parent)
{
        struct aa_sk_ctx *sctx = aa_sock(sk);

        if (sctx->label)
                return;

        sctx->label = aa_get_current_label();
}

#ifdef CONFIG_NETWORK_SECMARK
/**
 * apparmor_inet_conn_request - inet_conn_request hook
 * @sk: sock receiving the connection request
 * @skb: packet carrying the request
 * @req: request sock (not referenced by this function)
 *
 * Returns: 0 when @skb carries no secmark, otherwise the result of
 *          apparmor_secmark_check() against the socket's label
 */
static int apparmor_inet_conn_request(const struct sock *sk, struct sk_buff *skb,
                                      struct request_sock *req)
{
        struct aa_sk_ctx *sctx = aa_sock(sk);

        if (skb->secmark)
                return apparmor_secmark_check(sctx->label, OP_CONNECT,
                                              AA_MAY_CONNECT, skb->secmark,
                                              sk);

        return 0;
}
#endif

/*
 * Sizes of the per-object security blobs AppArmor declares for creds,
 * files and tasks.
 *
 * The cred blob is a pointer to, not an instance of, an aa_label.
 */
struct lsm_blob_sizes apparmor_blob_sizes __ro_after_init = {
        .lbs_cred = sizeof(struct aa_label *),
        .lbs_file = sizeof(struct aa_file_ctx),
        .lbs_task = sizeof(struct aa_task_ctx),
};

/* Identity of this LSM: its name string and its LSM_ID_* number. */
static const struct lsm_id apparmor_lsmid = {
        .name = "apparmor",
        .id = LSM_ID_APPARMOR,
};

/*
 * Table binding each LSM hook name to its AppArmor implementation.
 *
 * Entries guarded by CONFIG_NETWORK_SECMARK, CONFIG_AUDIT and
 * CONFIG_IO_URING are only present when the matching option is enabled.
 */
static struct security_hook_list apparmor_hooks[] __ro_after_init = {
        /* ptrace and capabilities */
        LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme),
        LSM_HOOK_INIT(capget, apparmor_capget),
        LSM_HOOK_INIT(capable, apparmor_capable),

        /* mount operations */
        LSM_HOOK_INIT(move_mount, apparmor_move_mount),
        LSM_HOOK_INIT(sb_mount, apparmor_sb_mount),
        LSM_HOOK_INIT(sb_umount, apparmor_sb_umount),
        LSM_HOOK_INIT(sb_pivotroot, apparmor_sb_pivotroot),

        /* path based operations */
        LSM_HOOK_INIT(path_link, apparmor_path_link),
        LSM_HOOK_INIT(path_unlink, apparmor_path_unlink),
        LSM_HOOK_INIT(path_symlink, apparmor_path_symlink),
        LSM_HOOK_INIT(path_mkdir, apparmor_path_mkdir),
        LSM_HOOK_INIT(path_rmdir, apparmor_path_rmdir),
        LSM_HOOK_INIT(path_mknod, apparmor_path_mknod),
        LSM_HOOK_INIT(path_rename, apparmor_path_rename),
        LSM_HOOK_INIT(path_chmod, apparmor_path_chmod),
        LSM_HOOK_INIT(path_chown, apparmor_path_chown),
        LSM_HOOK_INIT(path_truncate, apparmor_path_truncate),
        LSM_HOOK_INIT(inode_getattr, apparmor_inode_getattr),

        /* operations on open files */
        LSM_HOOK_INIT(file_open, apparmor_file_open),
        LSM_HOOK_INIT(file_receive, apparmor_file_receive),
        LSM_HOOK_INIT(file_permission, apparmor_file_permission),
        LSM_HOOK_INIT(file_alloc_security, apparmor_file_alloc_security),
        LSM_HOOK_INIT(file_free_security, apparmor_file_free_security),
        LSM_HOOK_INIT(mmap_file, apparmor_mmap_file),
        LSM_HOOK_INIT(file_mprotect, apparmor_file_mprotect),
        LSM_HOOK_INIT(file_lock, apparmor_file_lock),
        LSM_HOOK_INIT(file_truncate, apparmor_file_truncate),

        /* process attribute get/set */
        LSM_HOOK_INIT(getselfattr, apparmor_getselfattr),
        LSM_HOOK_INIT(setselfattr, apparmor_setselfattr),
        LSM_HOOK_INIT(getprocattr, apparmor_getprocattr),
        LSM_HOOK_INIT(setprocattr, apparmor_setprocattr),

        /* sock security context lifetime */
        LSM_HOOK_INIT(sk_alloc_security, apparmor_sk_alloc_security),
        LSM_HOOK_INIT(sk_free_security, apparmor_sk_free_security),
        LSM_HOOK_INIT(sk_clone_security, apparmor_sk_clone_security),

        /* socket operations */
        LSM_HOOK_INIT(socket_create, apparmor_socket_create),
        LSM_HOOK_INIT(socket_post_create, apparmor_socket_post_create),
        LSM_HOOK_INIT(socket_bind, apparmor_socket_bind),
        LSM_HOOK_INIT(socket_connect, apparmor_socket_connect),
        LSM_HOOK_INIT(socket_listen, apparmor_socket_listen),
        LSM_HOOK_INIT(socket_accept, apparmor_socket_accept),
        LSM_HOOK_INIT(socket_sendmsg, apparmor_socket_sendmsg),
        LSM_HOOK_INIT(socket_recvmsg, apparmor_socket_recvmsg),
        LSM_HOOK_INIT(socket_getsockname, apparmor_socket_getsockname),
        LSM_HOOK_INIT(socket_getpeername, apparmor_socket_getpeername),
        LSM_HOOK_INIT(socket_getsockopt, apparmor_socket_getsockopt),
        LSM_HOOK_INIT(socket_setsockopt, apparmor_socket_setsockopt),
        LSM_HOOK_INIT(socket_shutdown, apparmor_socket_shutdown),
#ifdef CONFIG_NETWORK_SECMARK
        LSM_HOOK_INIT(socket_sock_rcv_skb, apparmor_socket_sock_rcv_skb),
#endif
        LSM_HOOK_INIT(socket_getpeersec_stream,
                      apparmor_socket_getpeersec_stream),
        LSM_HOOK_INIT(socket_getpeersec_dgram,
                      apparmor_socket_getpeersec_dgram),
        LSM_HOOK_INIT(sock_graft, apparmor_sock_graft),
#ifdef CONFIG_NETWORK_SECMARK
        LSM_HOOK_INIT(inet_conn_request, apparmor_inet_conn_request),
#endif

        /* cred lifetime */
        LSM_HOOK_INIT(cred_alloc_blank, apparmor_cred_alloc_blank),
        LSM_HOOK_INIT(cred_free, apparmor_cred_free),
        LSM_HOOK_INIT(cred_prepare, apparmor_cred_prepare),
        LSM_HOOK_INIT(cred_transfer, apparmor_cred_transfer),

        /* exec */
        LSM_HOOK_INIT(bprm_creds_for_exec, apparmor_bprm_creds_for_exec),
        LSM_HOOK_INIT(bprm_committing_creds, apparmor_bprm_committing_creds),
        LSM_HOOK_INIT(bprm_committed_creds, apparmor_bprm_committed_creds),

        /* tasks */
        LSM_HOOK_INIT(task_free, apparmor_task_free),
        LSM_HOOK_INIT(task_alloc, apparmor_task_alloc),
        LSM_HOOK_INIT(current_getsecid_subj, apparmor_current_getsecid_subj),
        LSM_HOOK_INIT(task_getsecid_obj, apparmor_task_getsecid_obj),
        LSM_HOOK_INIT(task_setrlimit, apparmor_task_setrlimit),
        LSM_HOOK_INIT(task_kill, apparmor_task_kill),
        LSM_HOOK_INIT(userns_create, apparmor_userns_create),

#ifdef CONFIG_AUDIT
        /* audit rule handling */
        LSM_HOOK_INIT(audit_rule_init, aa_audit_rule_init),
        LSM_HOOK_INIT(audit_rule_known, aa_audit_rule_known),
        LSM_HOOK_INIT(audit_rule_match, aa_audit_rule_match),
        LSM_HOOK_INIT(audit_rule_free, aa_audit_rule_free),
#endif

        /* secid <-> secctx conversion */
        LSM_HOOK_INIT(secid_to_secctx, apparmor_secid_to_secctx),
        LSM_HOOK_INIT(secctx_to_secid, apparmor_secctx_to_secid),
        LSM_HOOK_INIT(release_secctx, apparmor_release_secctx),

#ifdef CONFIG_IO_URING
        /* io_uring */
        LSM_HOOK_INIT(uring_override_creds, apparmor_uring_override_creds),
        LSM_HOOK_INIT(uring_sqpoll, apparmor_uring_sqpoll),
#endif
};

/*
 * AppArmor sysfs module parameters
 */

/*
 * "aabool" module parameter type: the set/get handlers are
 * param_set_aabool()/param_get_aabool(), defined further down in this file.
 * The type-check macro is aliased to param_check_bool and the ops carry
 * KERNEL_PARAM_OPS_FL_NOARG.
 */
static int param_set_aabool(const char *val, const struct kernel_param *kp);
static int param_get_aabool(char *buffer, const struct kernel_param *kp);
#define param_check_aabool param_check_bool
static const struct kernel_param_ops param_ops_aabool = {
        .flags = KERNEL_PARAM_OPS_FL_NOARG,
        .set = param_set_aabool,
        .get = param_get_aabool
};

/*
 * "aauint" module parameter type: the set/get handlers are
 * param_set_aauint()/param_get_aauint(), defined further down in this file.
 * The type-check macro is aliased to param_check_uint.
 */
static int param_set_aauint(const char *val, const struct kernel_param *kp);
static int param_get_aauint(char *buffer, const struct kernel_param *kp);
#define param_check_aauint param_check_uint
static const struct kernel_param_ops param_ops_aauint = {
        .set = param_set_aauint,
        .get = param_get_aauint
};

/*
 * "aacompressionlevel" module parameter type: the set/get handlers are
 * param_set_aacompressionlevel()/param_get_aacompressionlevel(), defined
 * further down in this file.  The type-check macro is aliased to
 * param_check_int.
 */
static int param_set_aacompressionlevel(const char *val,
                                        const struct kernel_param *kp);
static int param_get_aacompressionlevel(char *buffer,
                                        const struct kernel_param *kp);
#define param_check_aacompressionlevel param_check_int
static const struct kernel_param_ops param_ops_aacompressionlevel = {
        .set = param_set_aacompressionlevel,
        .get = param_get_aacompressionlevel
};

/*
 * "aalockpolicy" module parameter type: the set/get handlers are
 * param_set_aalockpolicy()/param_get_aalockpolicy(), defined further down
 * in this file.  The type-check macro is aliased to param_check_bool and
 * the ops carry KERNEL_PARAM_OPS_FL_NOARG.
 */
static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp);
static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp);
#define param_check_aalockpolicy param_check_bool
static const struct kernel_param_ops param_ops_aalockpolicy = {
        .flags = KERNEL_PARAM_OPS_FL_NOARG,
        .set = param_set_aalockpolicy,
        .get = param_get_aalockpolicy
};

/*
 * Forward declarations of the handlers handed to module_param_call() for
 * the "audit" and "mode" parameters below; the definitions follow later in
 * this file.
 */
static int param_set_audit(const char *val, const struct kernel_param *kp);
static int param_get_audit(char *buffer, const struct kernel_param *kp);

static int param_set_mode(const char *val, const struct kernel_param *kp);
static int param_get_mode(char *buffer, const struct kernel_param *kp);

/* Flag values, also controllable via /sys/module/apparmor/parameters.
 * We define special parameter types (aabool, aauint, ...) because we want
 * to do additional mediation in the set/get handlers.
 */

/* AppArmor global enforcement switch - complain, enforce, kill.
 * Defaults to APPARMOR_ENFORCE; accessed through param_set_mode() and
 * param_get_mode().
 */
enum profile_mode aa_g_profile_mode = APPARMOR_ENFORCE;
module_param_call(mode, param_set_mode, param_get_mode,
                  &aa_g_profile_mode, S_IRUSR | S_IWUSR);

/* Whether policy verification hashing is enabled.  The parameter is only
 * exposed when CONFIG_SECURITY_APPARMOR_HASH is set.
 */
bool aa_g_hash_policy = IS_ENABLED(CONFIG_SECURITY_APPARMOR_HASH_DEFAULT);
#ifdef CONFIG_SECURITY_APPARMOR_HASH
module_param_named(hash_policy, aa_g_hash_policy, aabool, S_IRUSR | S_IWUSR);
#endif

/* Whether policy exactly as loaded is retained for debug and checkpointing.
 * The parameter is only exposed when
 * CONFIG_SECURITY_APPARMOR_EXPORT_BINARY is set.
 */
bool aa_g_export_binary = IS_ENABLED(CONFIG_SECURITY_APPARMOR_EXPORT_BINARY);
#ifdef CONFIG_SECURITY_APPARMOR_EXPORT_BINARY
module_param_named(export_binary, aa_g_export_binary, aabool, 0600);
#endif

/* Policy loaddata compression level (parameter file mode is 0400) */
int aa_g_rawdata_compression_level = AA_DEFAULT_CLEVEL;
module_param_named(rawdata_compression_level, aa_g_rawdata_compression_level,
                   aacompressionlevel, 0400);

/* Debug mode */
bool aa_g_debug = IS_ENABLED(CONFIG_SECURITY_APPARMOR_DEBUG_MESSAGES);
module_param_named(debug, aa_g_debug, aabool, S_IRUSR | S_IWUSR);

/* Audit mode; accessed through param_set_audit() and param_get_audit() */
enum audit_mode aa_g_audit;
module_param_call(audit, param_set_audit, param_get_audit,
                  &aa_g_audit, S_IRUSR | S_IWUSR);

/* Determines if the audit header is included in audited messages.  This
 * provides more context if the audit daemon is not running.
 */
bool aa_g_audit_header = true;
module_param_named(audit_header, aa_g_audit_header, aabool,
                   S_IRUSR | S_IWUSR);

/* Lock out loading/removal of policy.
 * TODO: add in at-boot loading of policy, which is the only way to
 *       load policy, if lock_policy is set
 */
bool aa_g_lock_policy;
module_param_named(lock_policy, aa_g_lock_policy, aalockpolicy,
                   S_IRUSR | S_IWUSR);

/* Syscall logging mode */
bool aa_g_logsyscall;
module_param_named(logsyscall, aa_g_logsyscall, aabool, S_IRUSR | S_IWUSR);

/* Maximum pathname length before accesses will start getting rejected.
 * Defaults to 2 * PATH_MAX; the parameter file is read-only (S_IRUSR).
 */
unsigned int aa_g_path_max = 2 * PATH_MAX;
module_param_named(path_max, aa_g_path_max, aauint, S_IRUSR);

/* Determines how paranoid loading of policy is and how much verification
 * is done on the loaded policy.
 * DEPRECATED: read only, as strict checking of load is always done now
 * that non-root users (user namespaces) can load policy.
 */
bool aa_g_paranoid_load = IS_ENABLED(CONFIG_SECURITY_APPARMOR_PARANOID_LOAD);
module_param_named(paranoid_load, aa_g_paranoid_load, aabool, S_IRUGO);

/*
 * "aaintbool" module parameter type: an int variable that is presented as
 * a bool by param_get_aaintbool()/param_set_aaintbool(), defined further
 * down in this file.  The type-check macro is aliased to param_check_int.
 */
static int param_get_aaintbool(char *buffer, const struct kernel_param *kp);
static int param_set_aaintbool(const char *val, const struct kernel_param *kp);
#define param_check_aaintbool param_check_int
static const struct kernel_param_ops param_ops_aaintbool = {
        .set = param_set_aaintbool,
        .get = param_get_aaintbool
};
/* Boot time disable flag; defaults to enabled (1), parameter file mode 0444 */
static int apparmor_enabled __ro_after_init = 1;
module_param_named(enabled, apparmor_enabled, aaintbool, 0444);

/*
 * Handler for the "apparmor=" boot option.  The argument is parsed as an
 * unsigned long (base auto-detected); any non-zero value enables AppArmor
 * and zero disables it.  If parsing fails the current setting is kept.
 * Always returns 1.
 */
static int __init apparmor_enabled_setup(char *str)
{
        unsigned long parsed;

        if (kstrtoul(str, 0, &parsed) == 0)
                apparmor_enabled = !!parsed;

        return 1;
}

__setup("apparmor=", apparmor_enabled_setup);

/*
 * Setter for the lock_policy flag, which turns off the ability to load
 * policy.  Fails with -EINVAL while AppArmor is disabled and with -EPERM
 * when AppArmor is initialized and aa_current_policy_admin_capable()
 * rejects the caller; otherwise the value is handled by param_set_bool().
 */
static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp)
{
        if (!apparmor_enabled)
                return -EINVAL;

        if (!apparmor_initialized || aa_current_policy_admin_capable(NULL))
                return param_set_bool(val, kp);

        return -EPERM;
}

static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp)
{
        if (!apparmor_enabled)
                return -EINVAL;
        if (apparmor_initialized && !aa_current_policy_view_capable(NULL))
                return -EPERM;
        return param_get_bool(buffer, kp);
}

static int param_set_aabool(const char *val, const struct kernel_param *kp)
{
        if (!apparmor_enabled)
                return -EINVAL;
        if (apparmor_initialized && !aa_current_policy_admin_capable(NULL))
                return -EPERM;
        return param_set_bool(val, kp);
}

/*
 * Getter for "aabool" parameters.  Fails with -EINVAL while AppArmor is
 * disabled and with -EPERM when AppArmor is initialized and
 * aa_current_policy_view_capable() rejects the caller; otherwise defers to
 * param_get_bool().
 */
static int param_get_aabool(char *buffer, const struct kernel_param *kp)
{
        if (!apparmor_enabled)
                return -EINVAL;

        if (!apparmor_initialized || aa_current_policy_view_capable(NULL))
                return param_get_bool(buffer, kp);

        return -EPERM;
}

/*
 * Setter for "aauint" parameters (used for path_max).
 *
 * Fails with -EINVAL while AppArmor is disabled and with -EPERM once
 * AppArmor has been initialized.  Otherwise the value is parsed by
 * param_set_uint(), after which aa_g_path_max is raised to at least
 * sizeof(union aa_buffer) and the resulting size is logged.  The raise
 * and the log happen whether or not parsing succeeded.
 *
 * Returns the result of param_set_uint().
 */
static int param_set_aauint(const char *val, const struct kernel_param *kp)
{
        int error;

        if (!apparmor_enabled)
                return -EINVAL;
        /* file is ro but enforce 2nd line check */
        if (apparmor_initialized)
                return -EPERM;

        error = param_set_uint(val, kp);
        aa_g_path_max = max_t(uint32_t, aa_g_path_max, sizeof(union aa_buffer));
        /* aa_g_path_max is unsigned int, so it must be printed with %u */
        pr_info("AppArmor: buffer size set to %u bytes\n", aa_g_path_max);

        return error;
}

/*
 * Getter for "aauint" parameters.  Fails with -EINVAL while AppArmor is
 * disabled and with -EPERM when AppArmor is initialized and
 * aa_current_policy_view_capable() rejects the caller; otherwise defers to
 * param_get_uint().
 */
static int param_get_aauint(char *buffer, const struct kernel_param *kp)
{
        if (!apparmor_enabled)
                return -EINVAL;

        if (!apparmor_initialized || aa_current_policy_view_capable(NULL))
                return param_get_uint(buffer, kp);

        return -EPERM;
}

/* Can only be set before AppArmor is initialized (i.e. on boot cmdline). */
static int param_set_aaintbool(const char *val, const struct kernel_param *kp)
{
        struct kernel_param kp_local;
        bool value;
        int error;

        if (apparmor_initialized)
                return -EPERM;

        /* Create local copy, with arg pointing to bool type. */
        value = !!*((int *)kp->arg);
        memcpy(&kp_local, kp, sizeof(kp_local));
        kp_local.arg = &value;

        error = param_set_bool(val, &kp_local);
        if (!error)
                *((int *)kp->arg) = *((bool *)kp_local.arg);
        return error;
}

/*
 * To avoid changing /sys/module/apparmor/parameters/enabled from Y/N to
 * 1/0, this converts the "int that is actually bool" back to bool for
 * display in the /sys filesystem, while keeping it "int" for the LSM
 * infrastructure.
 */
static int param_get_aaintbool(char *buffer, const struct kernel_param *kp)
{
        struct kernel_param kp_local;
        bool value;

        /* Create local copy, with arg pointing to bool type. */
        value = !!*((int *)kp->arg);
        memcpy(&kp_local, kp, sizeof(kp_local));
        kp_local.arg = &value;

        return param_get_bool(buffer, &kp_local);
}

/*
 * Setter for the rawdata compression level.  Fails with -EINVAL while
 * AppArmor is disabled and with -EPERM once AppArmor has been initialized.
 * After param_set_int() has run, the level is clamped to
 * [AA_MIN_CLEVEL, AA_MAX_CLEVEL] and logged, whether or not parsing
 * succeeded.  Returns the result of param_set_int().
 */
static int param_set_aacompressionlevel(const char *val,
                                        const struct kernel_param *kp)
{
        int rc;

        if (!apparmor_enabled)
                return -EINVAL;
        if (apparmor_initialized)
                return -EPERM;

        rc = param_set_int(val, kp);

        aa_g_rawdata_compression_level =
                clamp(aa_g_rawdata_compression_level,
                      AA_MIN_CLEVEL, AA_MAX_CLEVEL);
        pr_info("AppArmor: policy rawdata compression level set to %d\n",
                aa_g_rawdata_compression_level);

        return rc;
}

static int param_get_aacompressionlevel(char *buffer,
                                        const struct kernel_param *kp)
{
        if (!apparmor_enabled)
                return -EINVAL;
        if (apparmor_initialized && !aa_current_policy_view_capable(NULL))
                return -EPERM;
        return param_get_int(buffer, kp);
}

/*
 * Getter for the "audit" parameter: writes the audit_mode_names[] entry
 * for the current aa_g_audit value into @buffer.  Fails with -EINVAL while
 * AppArmor is disabled and with -EPERM when AppArmor is initialized and
 * aa_current_policy_view_capable() rejects the caller.
 */
static int param_get_audit(char *buffer, const struct kernel_param *kp)
{
        if (!apparmor_enabled)
                return -EINVAL;

        if (!apparmor_initialized || aa_current_policy_view_capable(NULL))
                return sprintf(buffer, "%s", audit_mode_names[aa_g_audit]);

        return -EPERM;
}

/*
 * Setter for the "audit" parameter: @val is looked up in
 * audit_mode_names[] and the matching index is stored in aa_g_audit.
 *
 * Returns 0 on success, -EINVAL if AppArmor is disabled, @val is NULL or
 * @val matches no mode name, and -EPERM when AppArmor is initialized and
 * aa_current_policy_admin_capable() rejects the caller.
 */
static int param_set_audit(const char *val, const struct kernel_param *kp)
{
        int idx;

        if (!apparmor_enabled || !val)
                return -EINVAL;
        if (apparmor_initialized && !aa_current_policy_admin_capable(NULL))
                return -EPERM;

        idx = match_string(audit_mode_names, AUDIT_MAX_INDEX, val);
        if (idx < 0)
                return -EINVAL;

        aa_g_audit = idx;
        return 0;
}

static int param_get_mode(char *buffer, const struct kernel_param *kp)
{
        if (!apparmor_enabled)
                return -EINVAL;
        if (apparmor_initialized && !aa_current_policy_view_capable(NULL))
                return -EPERM;

        return sprintf(buffer, "%s", aa_profile_mode_names[aa_g_profile_mode]);
}

/*
 * Setter for the "mode" parameter: @val is looked up in
 * aa_profile_mode_names[] and the matching index is stored in
 * aa_g_profile_mode.
 *
 * Returns 0 on success, -EINVAL if AppArmor is disabled, @val is NULL or
 * @val matches no mode name, and -EPERM when AppArmor is initialized and
 * aa_current_policy_admin_capable() rejects the caller.
 */
static int param_set_mode(const char *val, const struct kernel_param *kp)
{
        int idx;

        if (!apparmor_enabled || !val)
                return -EINVAL;
        if (apparmor_initialized && !aa_current_policy_admin_capable(NULL))
                return -EPERM;

        idx = match_string(aa_profile_mode_names,
                           APPARMOR_MODE_NAMES_MAX_INDEX, val);
        if (idx < 0)
                return -EINVAL;

        aa_g_profile_mode = idx;
        return 0;
}

/*
 * aa_get_buffer - obtain a work buffer of aa_g_path_max bytes
 * @in_atomic: true if the caller cannot sleep
 *
 * Buffers are looked for in three places, in order:
 *  1. this CPU's cache list (aa_local_buffers),
 *  2. the global list (aa_global_buffers) under aa_buffers_lock,
 *  3. a fresh kmalloc().
 *
 * The global list is only used while buffer_count exceeds reserve_count,
 * except for atomic callers, which may take any buffer that is on it.
 *
 * Returns a pointer to the buffer storage, or NULL if no buffer could be
 * found or allocated.  Buffers are handed back with aa_put_buffer().
 */
char *aa_get_buffer(bool in_atomic)
{
        union aa_buffer *aa_buf;
        struct aa_local_cache *cache;
        bool try_again = true;
        gfp_t flags = (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);

        /* use per cpu cached buffers first */
        cache = get_cpu_ptr(&aa_local_buffers);
        if (!list_empty(&cache->head)) {
                aa_buf = list_first_entry(&cache->head, union aa_buffer, list);
                list_del(&aa_buf->list);
                /*
                 * NOTE(review): hold is decremented unconditionally here; if
                 * it can be 0 at this point and the field is unsigned it
                 * would wrap - confirm against struct aa_local_cache.
                 */
                cache->hold--;
                cache->count--;
                put_cpu_ptr(&aa_local_buffers);
                return &aa_buf->buffer[0];
        }
        put_cpu_ptr(&aa_local_buffers);

        if (!spin_trylock(&aa_buffers_lock)) {
                /*
                 * The global lock is contended: bump this CPU's hold count
                 * before blocking on the lock.  aa_put_buffer() keeps
                 * returned buffers on the per-cpu list while hold is
                 * non-zero.
                 */
                cache = get_cpu_ptr(&aa_local_buffers);
                cache->hold += 1;
                put_cpu_ptr(&aa_local_buffers);
                spin_lock(&aa_buffers_lock);
        } else {
                /* lock acquired without contention; cache is not modified */
                cache = get_cpu_ptr(&aa_local_buffers);
                put_cpu_ptr(&aa_local_buffers);
        }
retry:
        /* aa_buffers_lock is held here */
        if (buffer_count > reserve_count ||
            (in_atomic && !list_empty(&aa_global_buffers))) {
                aa_buf = list_first_entry(&aa_global_buffers, union aa_buffer,
                                          list);
                list_del(&aa_buf->list);
                buffer_count--;
                spin_unlock(&aa_buffers_lock);
                return aa_buf->buffer;
        }
        if (in_atomic) {
                /*
                 * out of reserve buffers and in atomic context so increase
                 * how many buffers to keep in reserve
                 */
                reserve_count++;
                flags = GFP_ATOMIC;
        }
        spin_unlock(&aa_buffers_lock);

        if (!in_atomic)
                might_sleep();
        aa_buf = kmalloc(aa_g_path_max, flags);
        if (!aa_buf) {
                /* allocation failed: look at the global list one more time */
                if (try_again) {
                        try_again = false;
                        spin_lock(&aa_buffers_lock);
                        goto retry;
                }
                pr_warn_once("AppArmor: Failed to allocate a memory buffer.\n");
                return NULL;
        }
        return aa_buf->buffer;
}

/*
 * aa_put_buffer - hand back a buffer obtained from aa_get_buffer()
 * @buf: buffer storage pointer (NULL is accepted and ignored)
 *
 * When this CPU's hold count is zero the buffer is put on the global list
 * if aa_buffers_lock can be taken without waiting.  If the lock is
 * contended, or the hold count is already non-zero, the buffer is put on
 * this CPU's cache list instead.
 */
void aa_put_buffer(char *buf)
{
        union aa_buffer *aa_buf;
        struct aa_local_cache *cache;

        if (!buf)
                return;
        /* recover the enclosing union from the storage pointer */
        aa_buf = container_of(buf, union aa_buffer, buffer[0]);

        cache = get_cpu_ptr(&aa_local_buffers);
        if (!cache->hold) {
                put_cpu_ptr(&aa_local_buffers);

                if (spin_trylock(&aa_buffers_lock)) {
                        /* put back on global list */
                        list_add(&aa_buf->list, &aa_global_buffers);
                        buffer_count++;
                        spin_unlock(&aa_buffers_lock);
                        /* cache is fetched and released without being used */
                        cache = get_cpu_ptr(&aa_local_buffers);
                        put_cpu_ptr(&aa_local_buffers);
                        return;
                }
                /* contention on global list, fallback to percpu */
                cache = get_cpu_ptr(&aa_local_buffers);
                cache->hold += 1;
        }

        /* cache in percpu list; the cpu pointer taken above is still held */
        list_add(&aa_buf->list, &cache->head);
        cache->count++;
        put_cpu_ptr(&aa_local_buffers);
}

/*
 * AppArmor init functions
 */

/**
 * set_init_ctx - set a task context and profile on the first task.
 *
 * Stores the result of aa_get_label() on the root namespace's unconfined
 * label into the real credentials of the current task.
 *
 * TODO: allow setting an alternate profile than unconfined
 *
 * Returns: always 0
 */
static int __init set_init_ctx(void)
{
        /* real_cred is cast with __force so the label can be set on it */
        struct cred *init_cred = (__force struct cred *)current->real_cred;

        set_cred_label(init_cred, aa_get_label(ns_unconfined(root_ns)));
        return 0;
}

/*
 * Free every buffer currently on the global list.  aa_buffers_lock is
 * dropped around each kfree() and re-taken before the list is examined
 * again.  Per-cpu cache lists are not touched.
 */
static void destroy_buffers(void)
{
        union aa_buffer *victim;

        spin_lock(&aa_buffers_lock);
        for (;;) {
                if (list_empty(&aa_global_buffers))
                        break;

                victim = list_first_entry(&aa_global_buffers,
                                          union aa_buffer, list);
                list_del(&victim->list);

                spin_unlock(&aa_buffers_lock);
                kfree(victim);
                spin_lock(&aa_buffers_lock);
        }
        spin_unlock(&aa_buffers_lock);
}

/*
 * Initialise the per-cpu buffer caches and preallocate the initial pool of
 * work buffers.
 *
 * Returns 0 on success, or -ENOMEM if a buffer cannot be allocated, in
 * which case the buffers already on the global list are freed again.
 */
static int __init alloc_buffers(void)
{
        union aa_buffer *aa_buf;
        int cpu, want, made;

        /*
         * Each possible cpu gets an empty cache of allocated buffers, used
         * to help reduce lock contention.
         */
        for_each_possible_cpu(cpu) {
                struct aa_local_cache *cache = &per_cpu(aa_local_buffers, cpu);

                cache->hold = 0;
                cache->count = 0;
                INIT_LIST_HEAD(&cache->head);
        }

        /*
         * A function may require two buffers at once. Usually the buffers
         * are used for a short period of time and are shared. On a UP
         * kernel two buffers should be enough; with more CPUs it is possible
         * that more buffers will be used simultaneously. The preallocated
         * pool may grow. This preallocation also has the side effect that
         * AppArmor will be disabled early at boot if aa_g_path_max is
         * extremely high.
         */
        want = (num_online_cpus() > 1 ? 4 : 2) + RESERVE_COUNT;

        for (made = 0; made < want; made++) {
                aa_buf = kmalloc(aa_g_path_max,
                                 GFP_KERNEL | __GFP_RETRY_MAYFAIL |
                                 __GFP_NOWARN);
                if (!aa_buf) {
                        destroy_buffers();
                        return -ENOMEM;
                }
                aa_put_buffer(aa_buf->buffer);
        }

        return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * proc handler for the AppArmor sysctls: a wrapper around proc_dointvec()
 * that first requires aa_current_policy_admin_capable() (-EPERM otherwise)
 * and then that AppArmor is enabled (-EINVAL otherwise).
 */
static int apparmor_dointvec(struct ctl_table *table, int write,
                             void *buffer, size_t *lenp, loff_t *ppos)
{
        int rc;

        if (!aa_current_policy_admin_capable(NULL))
                rc = -EPERM;
        else if (!apparmor_enabled)
                rc = -EINVAL;
        else
                rc = proc_dointvec(table, write, buffer, lenp, ppos);

        return rc;
}

/*
 * sysctl entries registered under "kernel" by apparmor_init_sysctl().
 * Every entry is an int with mode 0600 and is handled by
 * apparmor_dointvec().  The user namespace policy knob only exists when
 * CONFIG_USER_NS is set.
 */
static struct ctl_table apparmor_sysctl_table[] = {
#ifdef CONFIG_USER_NS
        {
                .procname       = "unprivileged_userns_apparmor_policy",
                .data           = &unprivileged_userns_apparmor_policy,
                .maxlen         = sizeof(int),
                .mode           = 0600,
                .proc_handler   = apparmor_dointvec,
        },
#endif /* CONFIG_USER_NS */
        {
                .procname       = "apparmor_display_secid_mode",
                .data           = &apparmor_display_secid_mode,
                .maxlen         = sizeof(int),
                .mode           = 0600,
                .proc_handler   = apparmor_dointvec,
        },
        {
                .procname       = "apparmor_restrict_unprivileged_unconfined",
                .data           = &aa_unprivileged_unconfined_restricted,
                .maxlen         = sizeof(int),
                .mode           = 0600,
                .proc_handler   = apparmor_dointvec,
        },
};

/*
 * Register apparmor_sysctl_table under "kernel".
 * Returns 0 on success, or -ENOMEM if register_sysctl() yields NULL.
 */
static int __init apparmor_init_sysctl(void)
{
        if (!register_sysctl("kernel", apparmor_sysctl_table))
                return -ENOMEM;

        return 0;
}
#else
/* Stub used when CONFIG_SYSCTL is not set: nothing to register, returns 0. */
static inline int apparmor_init_sysctl(void)
{
        return 0;
}
#endif /* CONFIG_SYSCTL */

#if defined(CONFIG_NETFILTER) && defined(CONFIG_NETWORK_SECMARK)
/*
 * Netfilter POST_ROUTING hook (see apparmor_nf_ops).
 *
 * Packets without a secmark, or for which skb_to_full_sk() finds no
 * socket, are accepted.  Otherwise the socket's label is checked with
 * apparmor_secmark_check() for OP_SENDMSG/AA_MAY_SEND against the
 * packet's secmark: a zero result accepts the packet, anything else
 * drops it with -ECONNREFUSED.
 */
static unsigned int apparmor_ip_postroute(void *priv,
                                          struct sk_buff *skb,
                                          const struct nf_hook_state *state)
{
        struct aa_sk_ctx *ctx;
        struct sock *sk;
        int denied;

        if (!skb->secmark)
                return NF_ACCEPT;

        sk = skb_to_full_sk(skb);
        if (!sk)
                return NF_ACCEPT;

        ctx = aa_sock(sk);
        denied = apparmor_secmark_check(ctx->label, OP_SENDMSG, AA_MAY_SEND,
                                        skb->secmark, sk);
        if (denied)
                return NF_DROP_ERR(-ECONNREFUSED);

        return NF_ACCEPT;
}

/*
 * Netfilter hooks registered per network namespace by
 * apparmor_nf_register(): apparmor_ip_postroute() at NF_INET_POST_ROUTING
 * for IPv4 and, when CONFIG_IPV6 is enabled, for IPv6 as well.
 */
static const struct nf_hook_ops apparmor_nf_ops[] = {
        {
                .hook =         apparmor_ip_postroute,
                .pf =           NFPROTO_IPV4,
                .hooknum =      NF_INET_POST_ROUTING,
                .priority =     NF_IP_PRI_SELINUX_FIRST,
        },
#if IS_ENABLED(CONFIG_IPV6)
        {
                .hook =         apparmor_ip_postroute,
                .pf =           NFPROTO_IPV6,
                .hooknum =      NF_INET_POST_ROUTING,
                .priority =     NF_IP6_PRI_SELINUX_FIRST,
        },
#endif
};

/*
 * pernet .init callback: register every hook in apparmor_nf_ops for @net.
 * Returns the result of nf_register_net_hooks().
 */
static int __net_init apparmor_nf_register(struct net *net)
{
        const unsigned int nr_hooks = ARRAY_SIZE(apparmor_nf_ops);

        return nf_register_net_hooks(net, apparmor_nf_ops, nr_hooks);
}

/* pernet .exit callback: unregister every hook in apparmor_nf_ops for @net. */
static void __net_exit apparmor_nf_unregister(struct net *net)
{
        const unsigned int nr_hooks = ARRAY_SIZE(apparmor_nf_ops);

        nf_unregister_net_hooks(net, apparmor_nf_ops, nr_hooks);
}

/*
 * Per network namespace operations, registered by apparmor_nf_ip_init():
 * the netfilter hooks are added on .init and removed on .exit.
 */
static struct pernet_operations apparmor_net_ops = {
        .init = apparmor_nf_register,
        .exit = apparmor_nf_unregister,
};

/*
 * Initcall that registers apparmor_net_ops as a pernet subsystem.  Nothing
 * is registered while AppArmor is disabled.  A registration failure is
 * fatal (panic).  Always returns 0.
 */
static int __init apparmor_nf_ip_init(void)
{
        int rc;

        if (apparmor_enabled) {
                rc = register_pernet_subsys(&apparmor_net_ops);
                if (rc)
                        panic("Apparmor: register_pernet_subsys: error %d\n",
                              rc);
        }

        return 0;
}
__initcall(apparmor_nf_ip_init);
#endif

/*
 * Packed DFA images pulled in from the .in files at build time, and the
 * objects unpacked from them by aa_setup_dfa_engine().
 */
static char nulldfa_src[] = {
        #include "nulldfa.in"
};
/* unpacked from nulldfa_src; also referenced by nullpdb->dfa */
static struct aa_dfa *nulldfa;

static char stacksplitdfa_src[] = {
        #include "stacksplitdfa.in"
};
/* unpacked from stacksplitdfa_src */
struct aa_dfa *stacksplitdfa;
/* policydb whose dfa is nulldfa; set up by aa_setup_dfa_engine() */
struct aa_policydb *nullpdb;

/*
 * aa_setup_dfa_engine - create the built-in null policydb and DFAs
 *
 * Allocates nullpdb, unpacks nulldfa from nulldfa_src and attaches a
 * reference to it to nullpdb together with a two-entry perms table, then
 * unpacks stacksplitdfa from stacksplitdfa_src.
 *
 * Returns 0 on success.  On failure everything set up so far is released,
 * nullpdb/nulldfa/stacksplitdfa are all left NULL, and a negative errno is
 * returned (-ENOMEM for allocation failures, otherwise the error encoded
 * in the pointer returned by aa_dfa_unpack()).
 */
static int __init aa_setup_dfa_engine(void)
{
        int error = -ENOMEM;

        nullpdb = aa_alloc_pdb(GFP_KERNEL);
        if (!nullpdb)
                return -ENOMEM;

        nulldfa = aa_dfa_unpack(nulldfa_src, sizeof(nulldfa_src),
                            TO_ACCEPT1_FLAG(YYTD_DATA32) |
                            TO_ACCEPT2_FLAG(YYTD_DATA32));
        if (IS_ERR(nulldfa)) {
                error = PTR_ERR(nulldfa);
                /*
                 * nulldfa holds an ERR_PTR value, not an object: clear it
                 * so the fail path does not hand it to aa_put_dfa().
                 */
                nulldfa = NULL;
                goto fail;
        }
        nullpdb->dfa = aa_get_dfa(nulldfa);
        nullpdb->perms = kcalloc(2, sizeof(struct aa_perms), GFP_KERNEL);
        if (!nullpdb->perms)
                goto fail;
        nullpdb->size = 2;

        stacksplitdfa = aa_dfa_unpack(stacksplitdfa_src,
                                      sizeof(stacksplitdfa_src),
                                      TO_ACCEPT1_FLAG(YYTD_DATA32) |
                                      TO_ACCEPT2_FLAG(YYTD_DATA32));
        if (IS_ERR(stacksplitdfa)) {
                error = PTR_ERR(stacksplitdfa);
                goto fail;
        }

        return 0;

fail:
        aa_put_pdb(nullpdb);
        aa_put_dfa(nulldfa);
        nullpdb = NULL;
        nulldfa = NULL;
        /* may hold an ERR_PTR value at this point; never passed to a put */
        stacksplitdfa = NULL;

        return error;
}

/*
 * Undo aa_setup_dfa_engine(): drop the references held through
 * stacksplitdfa, nulldfa and nullpdb, then clear all three globals.
 */
static void __init aa_teardown_dfa_engine(void)
{
        /* release in the same order as before: both dfas, then the pdb */
        aa_put_dfa(stacksplitdfa);
        aa_put_dfa(nulldfa);
        aa_put_pdb(nullpdb);

        stacksplitdfa = NULL;
        nulldfa = NULL;
        nullpdb = NULL;
}

/*
 * apparmor_init - LSM init callback (see DEFINE_LSM(apparmor))
 *
 * Sets up, in order: the dfa engine, the root namespace, the sysctls, the
 * work buffers and the label on the initial task; then registers the LSM
 * hooks and marks AppArmor as initialized.
 *
 * Returns 0 on success.  On failure the error of the failing step is
 * returned, the aafs and dfa engine are torn down and apparmor_enabled is
 * cleared.
 */
static int __init apparmor_init(void)
{
        int rc;

        rc = aa_setup_dfa_engine();
        if (rc) {
                AA_ERROR("Unable to setup dfa engine\n");
                goto alloc_out;
        }

        rc = aa_alloc_root_ns();
        if (rc) {
                AA_ERROR("Unable to allocate default profile namespace\n");
                goto alloc_out;
        }

        rc = apparmor_init_sysctl();
        if (rc) {
                AA_ERROR("Unable to register sysctls\n");
                goto alloc_out;
        }

        rc = alloc_buffers();
        if (rc) {
                AA_ERROR("Unable to allocate work buffers\n");
                goto alloc_out;
        }

        rc = set_init_ctx();
        if (rc) {
                AA_ERROR("Failed to set context on init task\n");
                aa_free_root_ns();
                goto buffers_out;
        }

        security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks),
                           &apparmor_lsmid);

        /* Report that AppArmor successfully initialized */
        apparmor_initialized = 1;
        switch (aa_g_profile_mode) {
        case APPARMOR_COMPLAIN:
                aa_info_message("AppArmor initialized: complain mode enabled");
                break;
        case APPARMOR_KILL:
                aa_info_message("AppArmor initialized: kill mode enabled");
                break;
        default:
                aa_info_message("AppArmor initialized");
                break;
        }

        /* every step above succeeded, so rc is 0 here */
        return 0;

buffers_out:
        destroy_buffers();
alloc_out:
        aa_destroy_aafs();
        aa_teardown_dfa_engine();

        apparmor_enabled = 0;
        return rc;
}

/*
 * LSM registration record for AppArmor: names the module, points at the
 * apparmor_enabled flag and the blob size table, and names apparmor_init()
 * as the init callback.
 */
DEFINE_LSM(apparmor) = {
        .name = "apparmor",
        .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
        .enabled = &apparmor_enabled,
        .blobs = &apparmor_blob_sizes,
        .init = apparmor_init,
};





































































































































































































































    1 
    1 


















    1 









    1 








































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BACKING_DEV_DEFS_H
#define __LINUX_BACKING_DEV_DEFS_H

#include <linux/list.h>
#include <linux/radix-tree.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/percpu_counter.h>
#include <linux/percpu-refcount.h>
#include <linux/flex_proportions.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/kref.h>
#include <linux/refcount.h>

struct page;
struct device;
struct dentry;

/*
 * Bits in bdi_writeback.state.  That field is documented as "always use
 * atomic bitops", so each enumerator here is a bit number, not a mask.
 */
enum wb_state {
        WB_registered,                /* bdi_register() was done */
        WB_writeback_running,        /* Writeback is in progress */
        WB_has_dirty_io,        /* Dirty inodes on ->b_{dirty|io|more_io} */
        WB_start_all,                /* nr_pages == 0 (all) work pending */
};

/*
 * Per-wb statistics counters.  NR_WB_STAT_ITEMS is the number of items and
 * sizes the bdi_writeback.stat[] array.
 */
enum wb_stat_item {
        WB_RECLAIMABLE,
        WB_WRITEBACK,
        WB_DIRTIED,
        WB_WRITTEN,
        NR_WB_STAT_ITEMS
};

/* Evaluates to 8 * (1 + ilog2(nr_cpu_ids)): grows with log2 of nr_cpu_ids. */
#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))

/*
 * why some writeback work was initiated
 *
 * WB_REASON_MAX is the number of reasons, not a reason itself.
 */
enum wb_reason {
        WB_REASON_BACKGROUND,
        WB_REASON_VMSCAN,
        WB_REASON_SYNC,
        WB_REASON_PERIODIC,
        WB_REASON_LAPTOP_TIMER,
        WB_REASON_FS_FREE_SPACE,
        /*
         * There is no bdi forker thread any more and the work is done by
         * an emergency worker.  However, this reason is visible to
         * userland through tracepoints (TPs) and we'll be exposing exactly
         * the same information, so the name is kept even though it no
         * longer matches.
         */
        WB_REASON_FORKER_THREAD,
        WB_REASON_FOREIGN_FLUSH,

        WB_REASON_MAX,
};

/*
 * Completion tracker for writeback work items (see the description above
 * WB_COMPLETION_INIT): a counter plus a pointer to the wait queue head
 * used by waiters.
 */
struct wb_completion {
        atomic_t                cnt;        /* initialised to 1 by __WB_COMPLETION_INIT */
        wait_queue_head_t        *waitq;
};

/*
 * Compound-literal initializer for a wb_completion: the count starts at 1
 * and waitq points at the caller-supplied wait queue head.
 */
#define __WB_COMPLETION_INIT(_waitq)        \
        (struct wb_completion){ .cnt = ATOMIC_INIT(1), .waitq = (_waitq) }

/*
 * If one wants to wait for one or more wb_writeback_works, each work's
 * ->done should be set to a wb_completion defined using the following
 * macro.  Once all work items are issued with wb_queue_work(), the caller
 * can wait for the completion of all using wb_wait_for_completion().  Work
 * items which are waited upon aren't freed automatically on completion.
 *
 * WB_COMPLETION_INIT(bdi) uses the bdi's wb_waitq as the wait queue;
 * DEFINE_WB_COMPLETION(cmpl, bdi) declares a variable initialised that way.
 */
#define WB_COMPLETION_INIT(bdi)                __WB_COMPLETION_INIT(&(bdi)->wb_waitq)

#define DEFINE_WB_COMPLETION(cmpl, bdi)        \
        struct wb_completion cmpl = WB_COMPLETION_INIT(bdi)

/*
 * Each wb (bdi_writeback) can perform writeback operations, is measured
 * and throttled, independently.  Without cgroup writeback, each bdi
 * (bdi_writeback) is served by its embedded bdi->wb.
 *
 * On the default hierarchy, blkcg implicitly enables memcg.  This allows
 * using memcg's page ownership for attributing writeback IOs, and every
 * memcg - blkcg combination can be served by its own wb by assigning a
 * dedicated wb to each memcg, which enables isolation across different
 * cgroups and propagation of IO back pressure down from the IO layer up to
 * the tasks which are generating the dirty pages to be written back.
 *
 * A cgroup wb is indexed on its bdi by the ID of the associated memcg,
 * refcounted with the number of inodes attached to it, and pins the memcg
 * and the corresponding blkcg.  As the corresponding blkcg for a memcg may
 * change as blkcg is disabled and enabled higher up in the hierarchy, a wb
 * is tested for blkcg after lookup and removed from index on mismatch so
 * that a new wb for the combination can be created.
 *
 * Each bdi_writeback that is not embedded into the backing_dev_info must hold
 * a reference to the parent backing_dev_info.  See cgwb_create() for details.
 */
struct bdi_writeback {
        struct backing_dev_info *bdi;        /* our parent bdi */

        unsigned long state;                /* Always use atomic bitops on this */
        unsigned long last_old_flush;        /* last old data flush */

        struct list_head b_dirty;        /* dirty inodes */
        struct list_head b_io;                /* parked for writeback */
        struct list_head b_more_io;        /* parked for more writeback */
        struct list_head b_dirty_time;        /* time stamps are dirty */
        spinlock_t list_lock;                /* protects the b_* lists */

        atomic_t writeback_inodes;        /* number of inodes under writeback */
        struct percpu_counter stat[NR_WB_STAT_ITEMS]; /* one counter per enum wb_stat_item */

        unsigned long bw_time_stamp;        /* last time write bw is updated */
        /* NOTE(review): presumably pages dirtied at bw_time_stamp, by analogy with written_stamp - confirm */
        unsigned long dirtied_stamp;
        unsigned long written_stamp;        /* pages written at bw_time_stamp */
        unsigned long write_bandwidth;        /* the estimated write bandwidth */
        unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */

        /*
         * The base dirty throttle rate, re-calculated on every 200ms.
         * All the bdi tasks' dirty rate will be curbed under it.
         * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
         * in small steps and is much more smooth/stable than the latter.
         */
        unsigned long dirty_ratelimit;
        unsigned long balanced_dirty_ratelimit;

        struct fprop_local_percpu completions;
        int dirty_exceeded;
        /* NOTE(review): presumably the reason recorded for a pending WB_start_all work - confirm */
        enum wb_reason start_all_reason;

        spinlock_t work_lock;                /* protects work_list & dwork scheduling */
        struct list_head work_list;
        struct delayed_work dwork;        /* work item used for writeback */
        struct delayed_work bw_dwork;        /* work item used for bandwidth estimate */

        struct list_head bdi_node;        /* anchored at bdi->wb_list */

#ifdef CONFIG_CGROUP_WRITEBACK
        struct percpu_ref refcnt;        /* used only for !root wb's */
        struct fprop_local_percpu memcg_completions;
        struct cgroup_subsys_state *memcg_css; /* the associated memcg */
        struct cgroup_subsys_state *blkcg_css; /* and blkcg */
        struct list_head memcg_node;        /* anchored at memcg->cgwb_list */
        struct list_head blkcg_node;        /* anchored at blkcg->cgwb_list */
        struct list_head b_attached;        /* attached inodes, protected by list_lock */
        struct list_head offline_node;        /* anchored at offline_cgwbs */

        /* the two members share storage */
        union {
                struct work_struct release_work;
                struct rcu_head rcu;
        };
#endif
};

/*
 * Per backing device state.  Embeds the root bdi_writeback (@wb) and
 * anchors the list of every wb that belongs to the device (@wb_list).
 */
struct backing_dev_info {
        u64 id;                        /* key of rb_node below */
        struct rb_node rb_node; /* keyed by ->id */
        struct list_head bdi_list;
        unsigned long ra_pages;        /* max readahead in PAGE_SIZE units */
        unsigned long io_pages;        /* max allowed IO size */

        struct kref refcnt;        /* Reference counter for the structure */
        unsigned int capabilities; /* Device capabilities */
        /*
         * NOTE(review): units and meaning of the ratio/fraction fields are
         * not visible from this header -- confirm before relying on them.
         */
        unsigned int min_ratio;
        unsigned int max_ratio, max_prop_frac;

        /*
         * Sum of avg_write_bw of wbs with dirty inodes.  > 0 if there are
         * any dirty wbs, which is depended upon by bdi_has_dirty().
         */
        atomic_long_t tot_write_bandwidth;
        /*
         * Jiffies when last process was dirty throttled on this bdi. Used by
         * blk-wbt.
         */
        unsigned long last_bdp_sleep;

        /*
         * The wb_tryget()/wb_get()/wb_put_many() helpers recognise the root
         * wb by comparing a wb pointer against the address of this member.
         */
        struct bdi_writeback wb;  /* the root writeback info for this bdi */
        struct list_head wb_list; /* list of all wbs */
#ifdef CONFIG_CGROUP_WRITEBACK
        struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
        struct mutex cgwb_release_mutex;  /* protect shutdown of wb structs */
        struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
#endif
        wait_queue_head_t wb_waitq;

        struct device *dev;
        char dev_name[64];        /* fixed-size name buffer */
        struct device *owner;

        struct timer_list laptop_mode_wb_timer;

#ifdef CONFIG_DEBUG_FS
        struct dentry *debug_dir;        /* only present with CONFIG_DEBUG_FS */
#endif
};

/*
 * State handed from a lock operation to the matching unlock.
 * NOTE(review): presumably @locked records whether the lock was actually
 * taken and @flags holds the saved IRQ flags -- confirm against the users
 * of this cookie.
 */
struct wb_lock_cookie {
        bool locked;
        unsigned long flags;
};

#ifdef CONFIG_CGROUP_WRITEBACK

/**
 * wb_tryget - try to increment a wb's refcount
 * @wb: bdi_writeback to get
 */
static inline bool wb_tryget(struct bdi_writeback *wb)
{
        if (wb != &wb->bdi->wb)
                return percpu_ref_tryget(&wb->refcnt);
        return true;
}

/**
 * wb_get - increment a wb's refcount
 * @wb: bdi_writeback to get
 */
static inline void wb_get(struct bdi_writeback *wb)
{
        if (wb != &wb->bdi->wb)
                percpu_ref_get(&wb->refcnt);
}

/**
 * wb_put_many - drop several references on a wb at once
 * @wb: bdi_writeback to put
 * @nr: number of references to put
 *
 * Warns once and does nothing if @wb has no parent bdi.  Otherwise a no-op
 * for the root wb embedded in its parent bdi; any other wb has @nr
 * references dropped from @wb->refcnt with percpu_ref_put_many().
 */
static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
{
        /*
         * A driver bug might cause a file to be removed before bdi was
         * initialized.
         */
        if (WARN_ON_ONCE(!wb->bdi))
                return;

        /* root wb: nothing to count */
        if (wb == &wb->bdi->wb)
                return;

        percpu_ref_put_many(&wb->refcnt, nr);
}

/**
 * wb_put - drop a single reference on a wb
 * @wb: bdi_writeback to put
 *
 * Shorthand for wb_put_many() with a count of one.
 */
static inline void wb_put(struct bdi_writeback *wb)
{
        /* a plain put is a batched put of exactly one reference */
        wb_put_many(wb, 1UL);
}

/**
 * wb_dying - is a wb dying?
 * @wb: bdi_writeback of interest
 *
 * Reports percpu_ref_is_dying() for @wb->refcnt.  Unlike the get/put
 * helpers, no special case is made for the root wb here.
 *
 * Return: whether @wb is unlinked and being drained.
 */
static inline bool wb_dying(struct bdi_writeback *wb)
{
        struct percpu_ref *ref = &wb->refcnt;

        return percpu_ref_is_dying(ref);
}

#else        /* CONFIG_CGROUP_WRITEBACK */

/* !CONFIG_CGROUP_WRITEBACK stub: no refcounting is done, always succeeds. */
static inline bool wb_tryget(struct bdi_writeback *wb)
{
        return true;
}

/* !CONFIG_CGROUP_WRITEBACK stub: intentionally does nothing. */
static inline void wb_get(struct bdi_writeback *wb)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: intentionally does nothing. */
static inline void wb_put(struct bdi_writeback *wb)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: intentionally does nothing; @nr is ignored. */
static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: a wb is never reported as dying. */
static inline bool wb_dying(struct bdi_writeback *wb)
{
        return false;
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

#endif        /* __LINUX_BACKING_DEV_DEFS_H */

















































































































































































































    7 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 




















    3 


    4 

    3 
    3 






























































































    7 







    9 





















































































































    5 


    2 


    5 









































































































































































    7 










    7 
































    8 






























    4 







    4 



















    7 






    2 
    5 






























































































































































































    1 





    1 













    1 






    1 
    1 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support six policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * weighted interleave
 *                Allocate memory interleaved over a set of nodes based on
 *                a set of weights (per-node), with normal fallback if it
 *                fails.  Otherwise operates the same as interleave.
 *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
 *                on node 0 for every 1 page allocated on node 1.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmem/tmpfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always graceful with that.
*/

/* Prefix the format string of this file's pr_*() messages with the module name. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>

#include "internal.h"

/*
 * Internal flags: three consecutive bits starting at MPOL_MF_INTERNAL.
 */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)        /* Skip checks for contiguous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)        /* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)        /* Write-lock walked vmas */

/*
 * Slab caches private to this file.
 * NOTE(review): presumably policy_cache backs struct mempolicy and sn_cache
 * backs the shared-policy tree nodes -- confirm at their kmem_cache_create()
 * call sites.
 */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/*
 * Highest zone. A specific allocation for a zone below that is not
 * policied.
 */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 *
 * The reference count starts at 1 so that the policy is never freed.
 */
static struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .mode = MPOL_LOCAL,
};

/*
 * One policy per possible node id.  get_task_policy() returns the entry for
 * the current node when the task has no policy of its own, but only once
 * the entry's mode is non-zero (the table is not initialised early in boot).
 */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/*
 * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
 * system-default value should be used. A NULL iw_table also denotes that
 * system-default values should be used. Until the system-default table
 * is implemented, the system-default is always 1.
 *
 * iw_table is RCU protected: readers such as get_il_weight() access it
 * under rcu_read_lock() via rcu_dereference().
 */
static u8 __rcu *iw_table;
/* NOTE(review): presumably serialises updaters of iw_table -- confirm. */
static DEFINE_MUTEX(iw_table_lock);

static u8 get_il_weight(int node)
{
        u8 *table;
        u8 weight;

        rcu_read_lock();
        table = rcu_dereference(iw_table);
        /* if no iw_table, use system default */
        weight = table ? table[node] : 1;
        /* if value in iw_table is 0, use system default */
        weight = weight ? weight : 1;
        rcu_read_unlock();
        return weight;
}

/**
 * numa_nearest_node - Find nearest node by state
 * @node: Node id to start the search
 * @state: State to filter the search
 *
 * Look up the closest node by distance if @node is not in @state.  On equal
 * distance the node visited first by for_each_node_state() wins.
 *
 * Return: -EINVAL if @state is not a valid node state; @node itself if it is
 * NUMA_NO_NODE, is already in @state, or no node in @state was found;
 * otherwise the closest node by distance that is in @state.
 */
int numa_nearest_node(int node, unsigned int state)
{
        int nearest, nearest_dist, cand;

        if (state >= NR_NODE_STATES)
                return -EINVAL;

        if (node == NUMA_NO_NODE)
                return node;
        if (node_state(node, state))
                return node;

        nearest = node;
        nearest_dist = INT_MAX;
        for_each_node_state(cand, state) {
                int cand_dist = node_distance(node, cand);

                /* only a strictly shorter distance replaces the best so far */
                if (cand_dist >= nearest_dist)
                        continue;

                nearest_dist = cand_dist;
                nearest = cand;
        }

        return nearest;
}
EXPORT_SYMBOL_GPL(numa_nearest_node);

/*
 * get_task_policy - pick the memory policy that applies to task @p
 *
 * Order of preference:
 *   1. the task's own policy, @p->mempolicy, when set;
 *   2. the preferred_node_policy[] entry of the node numa_node_id() reports,
 *      provided that entry's mode is non-zero;
 *   3. the system-wide default_policy.
 *
 * This function itself takes no reference on the policy it returns.
 */
struct mempolicy *get_task_policy(struct task_struct *p)
{
        struct mempolicy *task_pol = p->mempolicy;
        struct mempolicy *node_pol;
        int nid;

        if (task_pol)
                return task_pol;

        nid = numa_node_id();
        if (nid == NUMA_NO_NODE)
                return &default_policy;

        /* preferred_node_policy is not initialised early in boot */
        node_pol = &preferred_node_policy[nid];
        if (!node_pol->mode)
                return &default_policy;

        return node_pol;
}

/*
 * Table of per-policy callbacks with MPOL_MAX entries; only declared at this
 * point.
 * NOTE(review): presumably indexed by policy mode and initialised further
 * down the file -- confirm.
 *
 * @create: set up @pol from the requested @nodes, returning an int status
 *          (mpol_new_nodemask() and mpol_new_preferred() have this
 *          signature).
 * @rebind: adjust @pol for a new set of @nodes.
 */
static const struct mempolicy_operations {
        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

/*
 * Non-zero when @pol carries any of the MPOL_MODE_FLAGS bits.  For such
 * policies mpol_set_nodemask() records the caller's nodemask in
 * pol->w.user_nodemask instead of the current cpuset mems_allowed.
 */
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
        const int mode_flags = pol->flags & MPOL_MODE_FLAGS;

        return mode_flags;
}

/*
 * Store in @ret the result of interpreting @orig relative to @rel: @orig
 * is first folded down to the number of nodes set in @rel, and the folded
 * mask is then mapped onto @rel.
 */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
                                   const nodemask_t *rel)
{
        nodemask_t folded;

        nodes_fold(folded, *orig, nodes_weight(*rel));
        nodes_onto(*ret, folded, *rel);
}

/*
 * ->create handler for the modes that use the whole nodemask: copy @nodes
 * into @pol.  Returns 0 on success, or -EINVAL if @nodes is empty.
 */
static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
        if (!nodes_empty(*nodes)) {
                pol->nodes = *nodes;
                return 0;
        }

        return -EINVAL;
}

/*
 * ->create handler for MPOL_PREFERRED: make the first node of @nodes the
 * only node set in @pol.  Returns 0 on success, or -EINVAL if @nodes is
 * empty.
 */
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
        if (!nodes_empty(*nodes)) {
                nodes_clear(pol->nodes);
                node_set(first_node(*nodes), pol->nodes);
                return 0;
        }

        return -EINVAL;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.
 *
 * @nsc provides two scratch nodemasks: mask1 receives the currently
 * allowed nodes that have memory, mask2 the nodemask finally handed to the
 * mode's ->create handler.  Returns 0 for a NULL or MPOL_LOCAL policy,
 * otherwise whatever ->create returns.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_lock for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
        nodemask_t *allowed, *effective;

        /*
         * Default (pol==NULL) resp. local memory policies are not a
         * subject of any remapping. They also do not need any special
         * constructor.
         */
        if (!pol || pol->mode == MPOL_LOCAL)
                return 0;

        allowed = &nsc->mask1;
        effective = &nsc->mask2;

        /* Restrict to nodes that are allowed by the cpuset and have memory */
        nodes_and(*allowed, cpuset_current_mems_allowed, node_states[N_MEMORY]);

        VM_BUG_ON(!nodes);

        if (pol->flags & MPOL_F_RELATIVE_NODES)
                mpol_relative_nodemask(effective, nodes, allowed);
        else
                nodes_and(*effective, *nodes, *allowed);

        /* Remember what later rebinds have to be computed against */
        if (mpol_store_user_nodemask(pol))
                pol->w.user_nodemask = *nodes;
        else
                pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

        return mpol_ops[pol->mode].create(pol, effective);
}

/*
 * This function just creates a new policy, does some check and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 *
 * Returns NULL for MPOL_DEFAULT (which must come with no nodes), an
 * ERR_PTR() on invalid mode/flags/nodemask combinations or allocation
 * failure, and otherwise a policy holding one reference.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
                                  nodemask_t *nodes)
{
        struct mempolicy *policy;
        bool no_nodes, node_flags;

        if (mode == MPOL_DEFAULT) {
                if (nodes && !nodes_empty(*nodes))
                        return ERR_PTR(-EINVAL);
                return NULL;
        }
        VM_BUG_ON(!nodes);

        no_nodes = nodes_empty(*nodes);
        node_flags = flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);

        /*
         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
         * All other modes require a valid pointer to a non-empty nodemask.
         */
        switch (mode) {
        case MPOL_PREFERRED:
                if (no_nodes) {
                        if (node_flags)
                                return ERR_PTR(-EINVAL);
                        /* preferred with no nodes means local allocation */
                        mode = MPOL_LOCAL;
                }
                break;
        case MPOL_LOCAL:
                if (!no_nodes || node_flags)
                        return ERR_PTR(-EINVAL);
                break;
        default:
                if (no_nodes)
                        return ERR_PTR(-EINVAL);
                break;
        }

        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!policy)
                return ERR_PTR(-ENOMEM);

        atomic_set(&policy->refcnt, 1);
        policy->mode = mode;
        policy->flags = flags;
        policy->home_node = NUMA_NO_NODE;

        return policy;
}

/*
 * Slow path of a mpol destructor: drop one reference on @pol and return
 * it to policy_cache once the last reference is gone.
 */
void __mpol_put(struct mempolicy *pol)
{
        if (atomic_dec_and_test(&pol->refcnt))
                kmem_cache_free(policy_cache, pol);
}

/*
 * ->rebind handler for MPOL_DEFAULT and MPOL_LOCAL (see the mpol_ops
 * table): intentionally does nothing.
 */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

/*
 * ->rebind handler for the nodemask-based modes: recompute pol->nodes for
 * the new allowed set @nodes.
 *
 * With MPOL_F_STATIC_NODES the user's nodemask is intersected with @nodes;
 * with MPOL_F_RELATIVE_NODES it is re-interpreted relative to @nodes;
 * otherwise the current nodes are remapped from the old cpuset mask onto
 * @nodes and the new mask is recorded.  If the result is empty, @nodes
 * itself is used.
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
        nodemask_t rebound;

        if (pol->flags & MPOL_F_STATIC_NODES) {
                nodes_and(rebound, pol->w.user_nodemask, *nodes);
        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
                mpol_relative_nodemask(&rebound, &pol->w.user_nodemask, nodes);
        } else {
                nodes_remap(rebound, pol->nodes, pol->w.cpuset_mems_allowed,
                            *nodes);
                pol->w.cpuset_mems_allowed = *nodes;
        }

        /* never leave the policy with an empty nodemask */
        pol->nodes = nodes_empty(rebound) ? *nodes : rebound;
}

/*
 * ->rebind handler for MPOL_PREFERRED and MPOL_PREFERRED_MANY (see the
 * mpol_ops table): only records the new mask in pol->w.cpuset_mems_allowed;
 * pol->nodes is left untouched.
 */
static void mpol_rebind_preferred(struct mempolicy *pol,
                                                const nodemask_t *nodes)
{
        pol->w.cpuset_mems_allowed = *nodes;
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
        if (!pol || pol->mode == MPOL_LOCAL)
                return;
        if (!mpol_store_user_nodemask(pol) &&
            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
                return;

        mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * @tsk: task whose mempolicy (possibly NULL) is rebound
 * @new: nodemask to rebind the policy to
 *
 * Called with task's alloc_lock held.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
        mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind the policy of every vma in @mm to the nodemask @new.
 *
 * Each vma is write-locked before its policy is rebound.
 *
 * Call holding a reference to mm.  Takes mm->mmap_lock (for write) during
 * the call.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
        struct vm_area_struct *cur;
        VMA_ITERATOR(iter, mm, 0);

        mmap_write_lock(mm);
        for_each_vma(iter, cur) {
                vma_start_write(cur);
                mpol_rebind_policy(cur->vm_policy, new);
        }
        mmap_write_unlock(mm);
}

/*
 * Policy operations for each mode, indexed by MPOL_* mode.
 *
 * MPOL_DEFAULT and MPOL_LOCAL have no ->create handler: mpol_set_nodemask()
 * returns before calling ->create for a NULL or MPOL_LOCAL policy.  Every
 * mode has a ->rebind handler.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
        [MPOL_DEFAULT] = {
                .rebind = mpol_rebind_default,
        },
        [MPOL_INTERLEAVE] = {
                .create = mpol_new_nodemask,
                .rebind = mpol_rebind_nodemask,
        },
        [MPOL_PREFERRED] = {
                .create = mpol_new_preferred,
                .rebind = mpol_rebind_preferred,
        },
        [MPOL_BIND] = {
                .create = mpol_new_nodemask,
                .rebind = mpol_rebind_nodemask,
        },
        [MPOL_LOCAL] = {
                .rebind = mpol_rebind_default,
        },
        /* keeps the full nodemask on create, but rebinds like MPOL_PREFERRED */
        [MPOL_PREFERRED_MANY] = {
                .create = mpol_new_nodemask,
                .rebind = mpol_rebind_preferred,
        },
        [MPOL_WEIGHTED_INTERLEAVE] = {
                .create = mpol_new_nodemask,
                .rebind = mpol_rebind_nodemask,
        },
};

/*
 * Forward declarations: migrate_folio_add() is called by the page-walk
 * callbacks below before its definition appears.
 */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
                                unsigned long flags);
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
                                pgoff_t ilx, int *nid);

/*
 * True when @flags request strict checking but no page moving, i.e.
 * MPOL_MF_STRICT is set and neither MPOL_MF_MOVE nor MPOL_MF_MOVE_ALL is.
 *
 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
 * if any misplaced page is found.
 */
static bool strictly_unmovable(unsigned long flags)
{
        const unsigned long move_flags = MPOL_MF_MOVE | MPOL_MF_MOVE_ALL;

        return (flags & MPOL_MF_STRICT) && !(flags & move_flags);
}

struct migration_mpol {                /* for alloc_migration_target_by_mpol() */
        struct mempolicy *pol;         /* policy passed to the allocation callback */
        pgoff_t ilx;                   /* NOTE(review): presumably the interleave index - confirm */
};

/*
 * State shared by the queue_pages_range() page-walk callbacks; passed to
 * them through walk->private.
 */
struct queue_pages {
        struct list_head *pagelist;        /* isolated folios are queued here */
        unsigned long flags;                /* MPOL_MF_* flags of the request */
        nodemask_t *nmask;                /* nodes checked by queue_folio_required() */
        unsigned long start;                /* start of the requested range */
        unsigned long end;                /* end of the requested range */
        struct vm_area_struct *first;        /* first vma seen by the walk, NULL until then */
        struct folio *large;                /* note last large folio encountered */
        long nr_failed;                        /* could not be isolated at this time */
};

/*
 * Decide whether @folio is of interest to the current walk.
 *
 * Without MPOL_MF_INVERT in qp->flags the folio is required when its node
 * is set in qp->nmask; with MPOL_MF_INVERT it is required when its node is
 * NOT set in qp->nmask.
 */
static inline bool queue_folio_required(struct folio *folio,
                                        struct queue_pages *qp)
{
        const bool want_in_mask = !(qp->flags & MPOL_MF_INVERT);

        return node_isset(folio_nid(folio), *qp->nmask) == want_in_mask;
}

/*
 * Handle a huge PMD on behalf of queue_folios_pte_range(), which calls
 * this with the PMD lock held.
 *
 * A PMD migration entry counts as one failure.  The huge zero folio is
 * skipped (the walk is told to continue), as is a folio that
 * queue_folio_required() rejects.  Any other folio is queued for
 * migration; if moving was not requested, the vma is not migratable, or
 * the folio could not be added to the list, one failure is counted.
 */
static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
        struct queue_pages *qp = walk->private;
        struct folio *folio;
        bool queued;

        if (unlikely(is_pmd_migration_entry(*pmd))) {
                qp->nr_failed++;
                return;
        }

        folio = pmd_folio(*pmd);
        if (is_huge_zero_folio(folio)) {
                walk->action = ACTION_CONTINUE;
                return;
        }

        if (!queue_folio_required(folio, qp))
                return;

        queued = (qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
                 vma_migratable(walk->vma) &&
                 migrate_folio_add(folio, qp->pagelist, qp->flags);
        if (!queued)
                qp->nr_failed++;
}

/*
 * Scan through folios, checking if they satisfy the required conditions,
 * moving them from LRU to local pagelist for migration if they do (or not).
 *
 * This is the ->pmd_entry callback of the queue_pages walk ops; the
 * struct queue_pages state is reached through walk->private.
 *
 * queue_folios_pte_range() has two possible return values:
 * 0 - continue walking to scan for more, even if an existing folio on the
 *     wrong node could not be isolated and queued for migration.
 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
 *        and an existing folio was on a node that does not follow the policy.
 */
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
                        unsigned long end, struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct folio *folio;
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;
        pte_t *pte, *mapped_pte;
        pte_t ptent;
        spinlock_t *ptl;

        /* A huge PMD is handled as one unit, under the lock returned here. */
        ptl = pmd_trans_huge_lock(pmd, vma);
        if (ptl) {
                queue_folios_pmd(pmd, walk);
                spin_unlock(ptl);
                goto out;
        }

        /* mapped_pte keeps the original pointer for the final unmap/unlock. */
        mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
        if (!pte) {
                /* Could not map the PTE table: hand ACTION_AGAIN back to the walker. */
                walk->action = ACTION_AGAIN;
                return 0;
        }
        for (; addr != end; pte++, addr += PAGE_SIZE) {
                ptent = ptep_get(pte);
                if (pte_none(ptent))
                        continue;
                if (!pte_present(ptent)) {
                        /* Of the non-present entries, only migration entries count as failures. */
                        if (is_migration_entry(pte_to_swp_entry(ptent)))
                                qp->nr_failed++;
                        continue;
                }
                folio = vm_normal_folio(vma, addr, ptent);
                if (!folio || folio_is_zone_device(folio))
                        continue;
                /*
                 * vm_normal_folio() filters out zero pages, but there might
                 * still be reserved folios to skip, perhaps in a VDSO.
                 */
                if (folio_test_reserved(folio))
                        continue;
                if (!queue_folio_required(folio, qp))
                        continue;
                if (folio_test_large(folio)) {
                        /*
                         * A large folio can only be isolated from LRU once,
                         * but may be mapped by many PTEs (and Copy-On-Write may
                         * intersperse PTEs of other, order 0, folios).  This is
                         * a common case, so don't mistake it for failure (but
                         * there can be other cases of multi-mapped pages which
                         * this quick check does not help to filter out - and a
                         * search of the pagelist might grow to be prohibitive).
                         *
                         * migrate_pages(&pagelist) returns nr_failed folios, so
                         * check "large" now so that queue_pages_range() returns
                         * a comparable nr_failed folios.  This does imply that
                         * if folio could not be isolated for some racy reason
                         * at its first PTE, later PTEs will not give it another
                         * chance of isolation; but keeps the accounting simple.
                         */
                        if (folio == qp->large)
                                continue;
                        qp->large = folio;
                }
                /*
                 * Count a failure when moving was not requested, the vma is
                 * not migratable, or the folio could not be queued.
                 */
                if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
                    !vma_migratable(vma) ||
                    !migrate_folio_add(folio, qp->pagelist, flags)) {
                        qp->nr_failed++;
                        /* Strict without moves: stop scanning, -EIO is returned below. */
                        if (strictly_unmovable(flags))
                                break;
                }
        }
        pte_unmap_unlock(mapped_pte, ptl);
        cond_resched();
out:
        if (qp->nr_failed && strictly_unmovable(flags))
                return -EIO;
        return 0;
}

/*
 * ->hugetlb_entry callback of the queue_pages walk ops: examine one hugetlb
 * entry and, when required and permitted, isolate its folio onto
 * qp->pagelist.
 *
 * Returns -EIO when a failure has been counted and the flags are
 * strictly_unmovable(), otherwise 0.  Without CONFIG_HUGETLB_PAGE this is a
 * no-op returning 0.
 */
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
                               unsigned long addr, unsigned long end,
                               struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;
        struct folio *folio;
        spinlock_t *ptl;
        pte_t entry;

        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
        entry = huge_ptep_get(pte);
        if (!pte_present(entry)) {
                /* Of the non-present entries, only migration entries count as failures. */
                if (unlikely(is_hugetlb_entry_migration(entry)))
                        qp->nr_failed++;
                goto unlock;
        }
        folio = pfn_folio(pte_pfn(entry));
        if (!queue_folio_required(folio, qp))
                goto unlock;
        /* A required folio that may not be moved is counted as a failure. */
        if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
            !vma_migratable(walk->vma)) {
                qp->nr_failed++;
                goto unlock;
        }
        /*
         * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
         * Choosing not to migrate a shared folio is not counted as a failure.
         *
         * See folio_likely_mapped_shared() on possible imprecision when we
         * cannot easily detect if a folio is shared.
         */
        if ((flags & MPOL_MF_MOVE_ALL) ||
            (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
                if (!isolate_hugetlb(folio, qp->pagelist))
                        qp->nr_failed++;
unlock:
        spin_unlock(ptl);
        if (qp->nr_failed && strictly_unmovable(flags))
                return -EIO;
#endif
        return 0;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * Mark the range [@addr, @end) of @vma inaccessible for NUMA hinting.
 * The protection is later cleared by a NUMA hinting fault. Depending on
 * these faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 *
 * Returns what change_protection() returned; a positive value is also
 * added to the NUMA_PTE_UPDATES event counter.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
                        unsigned long addr, unsigned long end)
{
        struct mmu_gather tlb;
        long nr_changed;

        tlb_gather_mmu(&tlb, vma->vm_mm);
        nr_changed = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
        if (nr_changed > 0)
                count_vm_numa_events(NUMA_PTE_UPDATES, nr_changed);
        tlb_finish_mmu(&tlb);

        return nr_changed;
}
#endif /* CONFIG_NUMA_BALANCING */

static int queue_pages_test_walk(unsigned long start, unsigned long end,
                                struct mm_walk *walk)
{
        struct vm_area_struct *next, *vma = walk->vma;
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;

        /* range check first */
        VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

        if (!qp->first) {
                qp->first = vma;
                if (!(flags & MPOL_MF_DISCONTIG_OK) &&
                        (qp->start < vma->vm_start))
                        /* hole at head side of range */
                        return -EFAULT;
        }
        next = find_vma(vma->vm_mm, vma->vm_end);
        if (!(flags & MPOL_MF_DISCONTIG_OK) &&
                ((vma->vm_end < qp->end) &&
                (!next || vma->vm_end < next->vm_start)))
                /* hole at middle or tail of range */
                return -EFAULT;

        /*
         * Need check MPOL_MF_STRICT to return -EIO if possible
         * regardless of vma_migratable
         */
        if (!vma_migratable(vma) &&
            !(flags & MPOL_MF_STRICT))
                return 1;

        /*
         * Check page nodes, and queue pages to move, in the current vma.
         * But if no moving, and no strict checking, the scan can be skipped.
         */
        if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                return 0;
        return 1;
}

/*
 * Page-walk callbacks used by queue_pages_range() when MPOL_MF_WRLOCK is
 * not set; the walk uses PGWALK_RDLOCK.
 */
static const struct mm_walk_ops queue_pages_walk_ops = {
        .hugetlb_entry                = queue_folios_hugetlb,
        .pmd_entry                = queue_folios_pte_range,
        .test_walk                = queue_pages_test_walk,
        .walk_lock                = PGWALK_RDLOCK,
};

/*
 * Same callbacks as queue_pages_walk_ops, but with PGWALK_WRLOCK; chosen by
 * queue_pages_range() when MPOL_MF_WRLOCK is set.
 */
static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
        .hugetlb_entry                = queue_folios_hugetlb,
        .pmd_entry                = queue_folios_pte_range,
        .test_walk                = queue_pages_test_walk,
        .walk_lock                = PGWALK_WRLOCK,
};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are not on the required set of @nodes,
 * and migration is allowed, they are isolated and queued to @pagelist.
 * The walk takes the vma write lock variant of the walk ops when
 * MPOL_MF_WRLOCK is set in @flags.
 *
 * queue_pages_range() may return:
 * 0 - all pages already on the right node, or successfully queued for moving
 *     (or neither strict checking nor moving requested: only range checking).
 * >0 - this number of misplaced folios could not be queued for moving
 *      (a hugetlbfs page or a transparent huge page being counted as 1).
 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
 */
static long
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                nodemask_t *nodes, unsigned long flags,
                struct list_head *pagelist)
{
        const struct mm_walk_ops *ops;
        struct queue_pages qp = {
                .pagelist = pagelist,
                .flags = flags,
                .nmask = nodes,
                .start = start,
                .end = end,
                .first = NULL,
        };
        int err;

        if (flags & MPOL_MF_WRLOCK)
                ops = &queue_pages_lock_vma_walk_ops;
        else
                ops = &queue_pages_walk_ops;

        err = walk_page_range(mm, start, end, ops, &qp);

        /* whole range in hole */
        if (!qp.first)
                return -EFAULT;
        if (err)
                return err;

        return qp.nr_failed;
}

/*
 * Apply policy to a single VMA: install a private copy of @pol as
 * vma->vm_policy and drop the reference on the policy it replaces.
 *
 * If the vma has a ->set_policy operation it is called with the copy
 * first; on failure the copy is released and the vma is left unchanged.
 * Returns 0 on success or a negative error code.
 *
 * This must be called with the mmap_lock held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
                                struct mempolicy *pol)
{
        struct mempolicy *replacement, *previous;
        int err;

        vma_assert_write_locked(vma);

        replacement = mpol_dup(pol);
        if (IS_ERR(replacement))
                return PTR_ERR(replacement);

        if (vma->vm_ops && vma->vm_ops->set_policy) {
                err = vma->vm_ops->set_policy(vma, replacement);
                if (err) {
                        mpol_put(replacement);
                        return err;
                }
        }

        previous = vma->vm_policy;
        vma->vm_policy = replacement; /* protected by mmap_lock */
        mpol_put(previous);

        return 0;
}

/*
 * Split or merge the VMA (if required) and apply the new policy.
 *
 * The policy is applied to the part of @vma that overlaps [@start, @end).
 * *@prev is updated to the vma the caller should treat as the predecessor
 * of the next one.  Returns 0 on success (including when @vma already has
 * an equal policy) or a negative error code.
 */
static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
                struct vm_area_struct **prev, unsigned long start,
                unsigned long end, struct mempolicy *new_pol)
{
        unsigned long vmstart = vma->vm_start;
        unsigned long vmend = min(end, vma->vm_end);

        /* The range begins inside this vma: it is its own predecessor. */
        if (start > vma->vm_start) {
                *prev = vma;
                vmstart = start;
        }

        if (mpol_equal(vma->vm_policy, new_pol)) {
                *prev = vma;
                return 0;
        }

        vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
        if (IS_ERR(vma))
                return PTR_ERR(vma);

        *prev = vma;

        return vma_replace_policy(vma, new_pol);
}

/*
 * Set the process memory policy.
 *
 * Builds a new policy from @mode, @flags and @nodes and installs it as
 * current->mempolicy under the task lock, dropping the reference on the
 * policy it replaces.  For the interleave modes the task's interleave
 * state (il_prev, il_weight) is reset as well.
 *
 * Returns 0 on success or a negative error code; on failure the task's
 * policy is left unchanged.
 */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
                             nodemask_t *nodes)
{
        struct mempolicy *new, *stale;
        NODEMASK_SCRATCH(scratch);
        int ret;

        if (!scratch)
                return -ENOMEM;

        new = mpol_new(mode, flags, nodes);
        if (IS_ERR(new)) {
                ret = PTR_ERR(new);
                goto out;
        }

        task_lock(current);
        ret = mpol_set_nodemask(new, nodes, scratch);
        if (ret) {
                /* Setup failed: the new policy is the one to drop. */
                stale = new;
        } else {
                stale = current->mempolicy;
                current->mempolicy = new;
                if (new && (new->mode == MPOL_INTERLEAVE ||
                            new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
                        current->il_prev = MAX_NUMNODES-1;
                        current->il_weight = 0;
                }
        }
        task_unlock(current);
        mpol_put(stale);
out:
        NODEMASK_SCRATCH_FREE(scratch);
        return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * @nodes is left empty for default_policy and for MPOL_LOCAL; for the
 * other known modes it receives a copy of pol->nodes.  An unknown mode is
 * a BUG().
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
{
        nodes_clear(*nodes);
        if (pol == &default_policy)
                return;

        switch (pol->mode) {
        case MPOL_LOCAL:
                /* return empty node mask for local allocation */
                return;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
        case MPOL_PREFERRED:
        case MPOL_PREFERRED_MANY:
        case MPOL_WEIGHTED_INTERLEAVE:
                *nodes = pol->nodes;
                return;
        default:
                BUG();
        }
}

/*
 * Return the node id of the page backing user address @addr.
 *
 * The page containing @addr is pinned with get_user_pages_fast() just long
 * enough to read its node.  If no page was pinned, the (zero or negative)
 * result of get_user_pages_fast() is returned instead.  @mm is not used.
 */
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
        struct page *page = NULL;
        int pinned, nid;

        pinned = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &page);
        if (pinned <= 0)
                return pinned;

        nid = page_to_nid(page);
        put_page(page);

        return nid;
}

/*
 * Retrieve NUMA policy
 *
 * @policy: receives the policy mode (with the MPOL_MODE_FLAGS bits of the
 *          policy OR-ed in) or, with MPOL_F_NODE, a node id
 * @nmask:  if non-NULL, receives the policy's nodemask
 * @addr:   address to query; must be 0 unless MPOL_F_ADDR is set
 * @flags:  any of MPOL_F_NODE, MPOL_F_ADDR, MPOL_F_MEMS_ALLOWED
 *
 * Returns 0 on success, -EINVAL for an invalid flag/address combination,
 * -EFAULT if MPOL_F_ADDR is set and no vma contains @addr, or the negative
 * error returned by lookup_node().
 */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
                             unsigned long addr, unsigned long flags)
{
        int err;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

        if (flags &
                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
                return -EINVAL;

        /* MPOL_F_MEMS_ALLOWED is exclusive: just report the allowed nodes. */
        if (flags & MPOL_F_MEMS_ALLOWED) {
                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
                        return -EINVAL;
                *policy = 0;        /* just so it's initialized */
                task_lock(current);
                /* NOTE(review): nmask is dereferenced without a NULL check on this path. */
                *nmask  = cpuset_current_mems_allowed;
                task_unlock(current);
                return 0;
        }

        if (flags & MPOL_F_ADDR) {
                pgoff_t ilx;                /* ignored here */
                /*
                 * Do NOT fall back to task policy if the
                 * vma/shared policy at addr is NULL.  We
                 * want to return MPOL_DEFAULT in this case.
                 */
                mmap_read_lock(mm);
                vma = vma_lookup(mm, addr);
                if (!vma) {
                        mmap_read_unlock(mm);
                        return -EFAULT;
                }
                pol = __get_vma_policy(vma, addr, &ilx);
        } else if (addr)
                return -EINVAL;

        if (!pol)
                pol = &default_policy;        /* indicates default behavior */

        if (flags & MPOL_F_NODE) {
                if (flags & MPOL_F_ADDR) {
                        /*
                         * Take a refcount on the mpol, because we are about to
                         * drop the mmap_lock, after which only "pol" remains
                         * valid, "vma" is stale.
                         */
                        pol_refcount = pol;
                        vma = NULL;
                        mpol_get(pol);
                        mmap_read_unlock(mm);
                        err = lookup_node(mm, addr);
                        if (err < 0)
                                goto out;
                        *policy = err;
                } else if (pol == current->mempolicy &&
                                pol->mode == MPOL_INTERLEAVE) {
                        /* Report the next node after il_prev in the policy's nodes. */
                        *policy = next_node_in(current->il_prev, pol->nodes);
                } else if (pol == current->mempolicy &&
                                pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
                        /* With a non-zero il_weight report il_prev itself, else the next node. */
                        if (current->il_weight)
                                *policy = current->il_prev;
                        else
                                *policy = next_node_in(current->il_prev,
                                                       pol->nodes);
                } else {
                        err = -EINVAL;
                        goto out;
                }
        } else {
                *policy = pol == &default_policy ? MPOL_DEFAULT :
                                                pol->mode;
                /*
                 * Internal mempolicy flags must be masked off before exposing
                 * the policy to userspace.
                 */
                *policy |= (pol->flags & MPOL_MODE_FLAGS);
        }

        err = 0;
        if (nmask) {
                if (mpol_store_user_nodemask(pol)) {
                        *nmask = pol->w.user_nodemask;
                } else {
                        task_lock(current);
                        get_policy_nodemask(pol, nmask);
                        task_unlock(current);
                }
        }

 out:
        mpol_cond_put(pol);
        /* vma is still non-NULL only if the mmap_lock taken above has not been dropped yet. */
        if (vma)
                mmap_read_unlock(mm);
        /* Drop the extra reference taken before lookup_node(). */
        if (pol_refcount)
                mpol_put(pol_refcount);
        return err;
}

#ifdef CONFIG_MIGRATION
/*
 * Try to isolate @folio from the LRU and queue it on @foliolist.
 *
 * Returns false only when isolation was attempted and failed.  Returns
 * true when the folio was queued, and also when it was deliberately left
 * alone because it looks shared and MPOL_MF_MOVE_ALL is not set.
 */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
                                unsigned long flags)
{
        /*
         * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
         * Choosing not to migrate a shared folio is not counted as a failure.
         *
         * See folio_likely_mapped_shared() on possible imprecision when we
         * cannot easily detect if a folio is shared.
         */
        if (!(flags & MPOL_MF_MOVE_ALL) && folio_likely_mapped_shared(folio))
                return true;

        /*
         * Non-movable folio may reach here.  And, there may be
         * temporary off LRU folios or non-LRU movable folios.
         * Treat them as unmovable folios since they can't be
         * isolated, so they can't be moved at the moment.
         */
        if (!folio_isolate_lru(folio))
                return false;

        list_add_tail(&folio->lru, foliolist);
        node_stat_mod_folio(folio,
                NR_ISOLATED_ANON + folio_is_file_lru(folio),
                folio_nr_pages(folio));

        return true;
}

/*
 * Migrate pages from one node to a target node.
 *
 * @mm:     address space whose pages are migrated
 * @source: node to migrate pages away from
 * @dest:   node to migrate pages to
 * @flags:  MPOL_MF_* flags; must include MPOL_MF_MOVE or MPOL_MF_MOVE_ALL
 *
 * Returns error or the number of pages not migrated.  An mm without any
 * VMA has nothing to migrate and yields 0.
 */
static long migrate_to_node(struct mm_struct *mm, int source, int dest,
                            int flags)
{
        nodemask_t nmask;
        struct vm_area_struct *vma;
        LIST_HEAD(pagelist);
        long nr_failed;
        long err = 0;
        struct migration_target_control mtc = {
                .nid = dest,
                .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
                .reason = MR_SYSCALL,
        };

        nodes_clear(nmask);
        node_set(source, nmask);

        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));

        mmap_read_lock(mm);
        vma = find_vma(mm, 0);
        /*
         * find_vma() returns NULL when the mm has no VMA at all; there is
         * then nothing to migrate, and vma->vm_start must not be touched.
         */
        if (unlikely(!vma)) {
                mmap_read_unlock(mm);
                return 0;
        }

        /*
         * This does not migrate the range, but isolates all pages that
         * need migration.  Between passing in the full user address
         * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
         * but passes back the count of pages which could not be isolated.
         */
        nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
                                      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
        mmap_read_unlock(mm);

        if (!list_empty(&pagelist)) {
                err = migrate_pages(&pagelist, alloc_migration_target, NULL,
                        (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
                /* Anything left on the list was not migrated: put it back. */
                if (err)
                        putback_movable_pages(&pagelist);
        }

        /* Add the isolation failures unless migrate_pages() reported an error. */
        if (err >= 0)
                err += nr_failed;
        return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags)
{
        long nr_failed = 0;
        long err = 0;
        nodemask_t tmp;

        lru_cache_disable();

        /*
         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
         * bit in 'tmp', and return that <source, dest> pair for migration.
         * The pair of nodemasks 'to' and 'from' define the map.
         *
         * If no pair of bits is found that way, fallback to picking some
         * pair of 'source' and 'dest' bits that are not the same.  If the
         * 'source' and 'dest' bits are the same, this represents a node
         * that will be migrating to itself, so no pages need move.
         *
         * If no bits are left in 'tmp', or if all remaining bits left
         * in 'tmp' correspond to the same bit in 'to', return false
         * (nothing left to migrate).
         *
         * This lets us pick a pair of nodes to migrate between, such that
         * if possible the dest node is not already occupied by some other
         * source node, minimizing the risk of overloading the memory on a
         * node that would happen if we migrated incoming memory to a node
         * before migrating outgoing memory source that same node.
         *
         * A single scan of tmp is sufficient.  As we go, we remember the
         * most recent <s, d> pair that moved (s != d).  If we find a pair
         * that not only moved, but what's better, moved to an empty slot
         * (d is not set in tmp), then we break out then, with that pair.
         * Otherwise when we finish scanning from_tmp, we at least have the
         * most recent <s, d> pair that moved.  If we get all the way through
         * the scan of tmp without finding any node that moved, much less
         * moved to an empty node, then there is nothing left worth migrating.
         */

        tmp = *from;
        while (!nodes_empty(tmp)) {
                int s, d;
                int source = NUMA_NO_NODE;
                int dest = 0;

                for_each_node_mask(s, tmp) {

                        /*
                         * do_migrate_pages() tries to maintain the relative
                         * node relationship of the pages established between
                         * threads and memory areas.
                         *
                         * However if the number of source nodes is not equal to
                         * the number of destination nodes we can not preserve
                         * this node relative relationship.  In that case, skip
                         * copying memory from a node that is in the destination
                         * mask.
                         *
                         * Example: [2,3,4] -> [3,4,5] moves everything.
                         *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
                         */

                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
                                                (node_isset(s, *to)))
                                continue;

                        d = node_remap(s, *from, *to);
                        if (s == d)
                                continue;

                        source = s;        /* Node moved. Memorize */
                        dest = d;

                        /* dest not in remaining from nodes? */
                        if (!node_isset(dest, tmp))
                                break;
                }
                if (source == NUMA_NO_NODE)
                        break;

                node_clear(source, tmp);
                err = migrate_to_node(mm, source, dest, flags);
                if (err > 0)
                        nr_failed += err;
                if (err < 0)
                        break;
        }

        lru_cache_enable();
        if (err < 0)
                return err;
        return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
}

/*
 * Allocate a new folio for page migration, according to NUMA mempolicy.
 *
 * @src:     folio being migrated; its order and index drive the allocation
 * @private: a struct migration_mpol * (policy plus interleave base) passed
 *           as an unsigned long
 *
 * Returns whatever the underlying hugetlb or mempolicy allocator returns.
 */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
                                                    unsigned long private)
{
        struct migration_mpol *mmpol = (struct migration_mpol *)private;
        struct mempolicy *pol = mmpol->pol;
        unsigned int order = folio_order(src);
        /* Interleave index: the caller's base plus the order-scaled index */
        pgoff_t ilx = mmpol->ilx + (src->index >> order);
        int nid = numa_node_id();
        gfp_t gfp;

        if (folio_test_hugetlb(src)) {
                struct hstate *h = folio_hstate(src);
                nodemask_t *nodemask;

                gfp = htlb_alloc_mask(h);
                /* May override nid, depending on the policy mode */
                nodemask = policy_nodemask(gfp, pol, ilx, &nid);
                return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
                                htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
        }

        gfp = folio_test_large(src) ?
                GFP_TRANSHUGE :
                (GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP);

        return page_rmappable_folio(alloc_pages_mpol(gfp, order, pol, ilx, nid));
}
#else

/*
 * Stub for the #else branch of the enclosing conditional (the #if itself
 * is outside this view; presumably page migration support - confirm).
 * Ignores all arguments and always returns false.
 */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
                                unsigned long flags)
{
        return false;
}

/*
 * Stub for the #else branch of the enclosing conditional: no pages are
 * moved and every call fails with -ENOSYS.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags)
{
        return -ENOSYS;
}

/*
 * Stub for the #else branch of the enclosing conditional: never allocates
 * a target folio, always returns NULL.
 */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
                                                    unsigned long private)
{
        return NULL;
}
#endif

/*
 * Apply the policy described by @mode, @mode_flags and @nmask to the range
 * [@start, @start + PAGE_ALIGN(@len)) of the current task's mm, and try to
 * migrate whatever folios queue_pages_range() put on the local list.
 *
 * Returns 0 on success (an empty range is a success), or a negative errno:
 *   -EINVAL  unknown bits in @flags, unaligned @start, or a wrapping range
 *   -EPERM   MPOL_MF_MOVE_ALL requested without CAP_SYS_NICE
 *   -ENOMEM  the nodemask scratch space could not be allocated
 *   -EIO     MPOL_MF_STRICT is set and nr_failed ended up non-zero
 * plus any error passed up from mpol_new(), mpol_set_nodemask(),
 * queue_pages_range() or mbind_range().
 */
static long do_mbind(unsigned long start, unsigned long len,
                     unsigned short mode, unsigned short mode_flags,
                     nodemask_t *nmask, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
        struct vma_iterator vmi;
        struct migration_mpol mmpol;
        struct mempolicy *new;
        unsigned long end;
        long err;
        long nr_failed;
        LIST_HEAD(pagelist);

        if (flags & ~(unsigned long)MPOL_MF_VALID)
                return -EINVAL;
        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
                return -EPERM;

        if (start & ~PAGE_MASK)
                return -EINVAL;

        /* MPOL_MF_STRICT is dropped when the default policy is requested */
        if (mode == MPOL_DEFAULT)
                flags &= ~MPOL_MF_STRICT;

        len = PAGE_ALIGN(len);
        end = start + len;

        /* end < start means start + len wrapped around */
        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;

        new = mpol_new(mode, mode_flags, nmask);
        if (IS_ERR(new))
                return PTR_ERR(new);

        /*
         * If we are using the default policy then operation
         * on discontinuous address spaces is okay after all
         */
        if (!new)
                flags |= MPOL_MF_DISCONTIG_OK;

        /* Paired with the lru_cache_enable() at mpol_out */
        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                lru_cache_disable();
        {
                NODEMASK_SCRATCH(scratch);
                if (scratch) {
                        mmap_write_lock(mm);
                        err = mpol_set_nodemask(new, nmask, scratch);
                        /*
                         * On success the mmap write lock stays held until
                         * the mmap_write_unlock() further down.
                         */
                        if (err)
                                mmap_write_unlock(mm);
                } else
                        err = -ENOMEM;
                NODEMASK_SCRATCH_FREE(scratch);
        }
        if (err)
                goto mpol_out;

        /*
         * Lock the VMAs before scanning for pages to migrate,
         * to ensure we don't miss a concurrently inserted page.
         */
        nr_failed = queue_pages_range(mm, start, end, nmask,
                        flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);

        if (nr_failed < 0) {
                /* A negative count is an errno; the policy is not applied */
                err = nr_failed;
                nr_failed = 0;
        } else {
                vma_iter_init(&vmi, mm, start);
                prev = vma_prev(&vmi);
                for_each_vma_range(vmi, vma, end) {
                        err = mbind_range(&vmi, vma, &prev, start, end, new);
                        if (err)
                                break;
                }
        }

        if (!err && !list_empty(&pagelist)) {
                /* Convert MPOL_DEFAULT's NULL to task or default policy */
                if (!new) {
                        new = get_task_policy(current);
                        mpol_get(new);
                }
                mmpol.pol = new;
                mmpol.ilx = 0;

                /*
                 * In the interleaved case, attempt to allocate on exactly the
                 * targeted nodes, for the first VMA to be migrated; for later
                 * VMAs, the nodes will still be interleaved from the targeted
                 * nodemask, but one by one may be selected differently.
                 */
                if (new->mode == MPOL_INTERLEAVE ||
                    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
                        struct folio *folio;
                        unsigned int order;
                        unsigned long addr = -EFAULT;

                        /* Stop at the first queued folio that is not KSM */
                        list_for_each_entry(folio, &pagelist, lru) {
                                if (!folio_test_ksm(folio))
                                        break;
                        }
                        /* Only look up an address if such a folio exists */
                        if (!list_entry_is_head(folio, &pagelist, lru)) {
                                vma_iter_init(&vmi, mm, start);
                                for_each_vma_range(vmi, vma, end) {
                                        addr = page_address_in_vma(
                                                folio_page(folio, 0), vma);
                                        if (addr != -EFAULT)
                                                break;
                                }
                        }
                        if (addr != -EFAULT) {
                                order = folio_order(folio);
                                /* We already know the pol, but not the ilx */
                                mpol_cond_put(get_vma_policy(vma, addr, order,
                                                             &mmpol.ilx));
                                /* Set base from which to increment by index */
                                mmpol.ilx -= folio->index >> order;
                        }
                }
        }

        mmap_write_unlock(mm);

        if (!err && !list_empty(&pagelist)) {
                /*
                 * nr_failed is only tested for zero/non-zero below, so
                 * OR-ing the migrate_pages() result in is sufficient.
                 */
                nr_failed |= migrate_pages(&pagelist,
                                alloc_migration_target_by_mpol, NULL,
                                (unsigned long)&mmpol, MIGRATE_SYNC,
                                MR_MEMPOLICY_MBIND, NULL);
        }

        if (nr_failed && (flags & MPOL_MF_STRICT))
                err = -EIO;
        /* Hand back whatever is still isolated on the list */
        if (!list_empty(&pagelist))
                putback_movable_pages(&pagelist);
mpol_out:
        mpol_put(new);
        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                lru_cache_enable();
        return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 *
 * Fetch @maxnode bits from the user bitmap @nmask into @mask, using the
 * compat layout when running a compat syscall.  Bits beyond @maxnode in
 * the last copied word are cleared.
 *
 * Returns 0 on success, -EFAULT if the user copy fails.
 */
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
                      unsigned long maxnode)
{
        const unsigned long nlongs = BITS_TO_LONGS(maxnode);
        const unsigned long tail_bits = maxnode % BITS_PER_LONG;
        int ret;

        if (!in_compat_syscall())
                ret = copy_from_user(mask, nmask,
                                     nlongs * sizeof(unsigned long));
        else
                ret = compat_get_bitmap(mask,
                                        (const compat_ulong_t __user *)nmask,
                                        maxnode);

        if (ret)
                return -EFAULT;

        /* Mask off anything past maxnode in a partially used last word */
        if (tail_bits)
                mask[nlongs - 1] &= (1UL << tail_bits) - 1;

        return 0;
}

/*
 * Copy a node mask from user space.
 *
 * @maxnode is decremented before use.  A NULL @nmask, or a decremented
 * @maxnode of 0, yields an empty mask and success.
 *
 * Returns 0 on success, -EINVAL if @maxnode is too large or a bit beyond
 * MAX_NUMNODES is set, -EFAULT if the user memory cannot be read.
 */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
                     unsigned long maxnode)
{
        maxnode--;
        nodes_clear(*nodes);

        if (!nmask || maxnode == 0)
                return 0;
        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
                return -EINVAL;

        /*
         * The caller may describe more nodes than are supported.  Walk the
         * unsupported tail backwards, one word at a time, and insist that
         * every bit in it is zero.
         */
        while (maxnode > MAX_NUMNODES) {
                unsigned long chunk = min_t(unsigned long, maxnode, BITS_PER_LONG);
                unsigned long word;

                if (get_bitmap(&word, &nmask[(maxnode - 1) / BITS_PER_LONG], chunk))
                        return -EFAULT;

                if (maxnode - chunk < MAX_NUMNODES) {
                        /* Word straddles the limit: ignore the supported bits */
                        maxnode = MAX_NUMNODES;
                        word &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
                } else {
                        maxnode -= chunk;
                }

                if (word)
                        return -EINVAL;
        }

        return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}

/*
 * Copy a kernel node mask to user space.
 *
 * The user buffer size is derived from @maxnode.  When it is larger than
 * the kernel's mask for nr_node_ids, the excess user bytes are zeroed and
 * only the kernel-sized part is copied.
 *
 * Returns 0 on success, -EINVAL if the user buffer would exceed PAGE_SIZE,
 * -EFAULT on a failed user access; in the compat case the result of
 * compat_put_bitmap() is returned as is.
 */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
                              nodemask_t *nodes)
{
        const bool compat = in_compat_syscall();
        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
        unsigned int nbytes;

        if (compat)
                nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
        else
                nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);

        if (copy > nbytes) {
                if (copy > PAGE_SIZE)
                        return -EINVAL;
                /* Zero the part of the user buffer the kernel mask cannot fill */
                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
                        return -EFAULT;
                copy = nbytes;
                maxnode = nr_node_ids;
        }

        if (!compat) {
                if (copy_to_user(mask, nodes_addr(*nodes), copy))
                        return -EFAULT;
                return 0;
        }

        return compat_put_bitmap((compat_ulong_t __user *)mask,
                                 nodes_addr(*nodes), maxnode);
}

/*
 * Basic parameter sanity check used by both mbind() and set_mempolicy().
 *
 * Splits the combined value in *@mode: the MPOL_MODE_FLAGS bits go to
 * *@flags and are stripped from *@mode.  Both outputs are written before
 * any validation takes place.
 *
 * Returns 0 if the combination is acceptable, -EINVAL otherwise.
 */
static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
        *flags = *mode & MPOL_MODE_FLAGS;
        *mode &= ~MPOL_MODE_FLAGS;

        if ((unsigned int)(*mode) >= MPOL_MAX)
                return -EINVAL;

        /* Static and relative node interpretation exclude each other */
        if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
                return -EINVAL;

        if (!(*flags & MPOL_F_NUMA_BALANCING))
                return 0;

        /* NUMA balancing is accepted for these two modes only */
        if (*mode != MPOL_BIND && *mode != MPOL_PREFERRED_MANY)
                return -EINVAL;

        *flags |= (MPOL_F_MOF | MPOL_F_MORON);
        return 0;
}

/*
 * Common body of the mbind() system call: untag @start, split @mode into
 * mode and mode flags, fetch the user nodemask, then call do_mbind().
 *
 * Returns the first error met while validating, else do_mbind()'s result.
 */
static long kernel_mbind(unsigned long start, unsigned long len,
                         unsigned long mode, const unsigned long __user *nmask,
                         unsigned long maxnode, unsigned int flags)
{
        int lmode = mode;
        unsigned short mode_flags;
        nodemask_t nodes;
        int err;

        start = untagged_addr(start);

        err = sanitize_mpol_flags(&lmode, &mode_flags);
        if (!err)
                err = get_nodes(&nodes, nmask, maxnode);
        if (err)
                return err;

        return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}

/*
 * set_mempolicy_home_node() system call: set @home_node as the home node
 * of the policies of all VMAs in [@start, @start + PAGE_ALIGN(@len)).
 *
 * VMAs without a policy are skipped.  A VMA whose policy is neither
 * MPOL_BIND nor MPOL_PREFERRED_MANY stops the walk with -EOPNOTSUPP, and
 * VMAs updated before that point keep their new home node.
 *
 * Returns 0 on success or for an empty range, -EINVAL for bad arguments,
 * -ENOENT when no VMA in the range carried a policy (err is never
 * overwritten in that case), or an error from mpol_dup()/mbind_range().
 *
 * NOTE(review): vmi is initialised from @start before untagged_addr() is
 * applied to it - confirm this is intended for tagged addresses.
 */
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
                unsigned long, home_node, unsigned long, flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
        struct mempolicy *new, *old;
        unsigned long end;
        int err = -ENOENT;
        VMA_ITERATOR(vmi, mm, start);

        start = untagged_addr(start);
        if (start & ~PAGE_MASK)
                return -EINVAL;
        /*
         * flags is used for future extension if any.
         */
        if (flags != 0)
                return -EINVAL;

        /*
         * Check home_node is online to avoid accessing uninitialized
         * NODE_DATA.
         */
        if (home_node >= MAX_NUMNODES || !node_online(home_node))
                return -EINVAL;

        len = PAGE_ALIGN(len);
        end = start + len;

        /* end < start means start + len wrapped around */
        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;
        mmap_write_lock(mm);
        prev = vma_prev(&vmi);
        for_each_vma_range(vmi, vma, end) {
                /*
                 * If any vma in the range got policy other than MPOL_BIND
                 * or MPOL_PREFERRED_MANY we return error. We don't reset
                 * the home node for vmas we already updated before.
                 */
                old = vma_policy(vma);
                if (!old) {
                        /* Nothing to update; keep prev in step with the walk */
                        prev = vma;
                        continue;
                }
                if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
                        err = -EOPNOTSUPP;
                        break;
                }
                /* Work on a private copy; the old policy is left untouched */
                new = mpol_dup(old);
                if (IS_ERR(new)) {
                        err = PTR_ERR(new);
                        break;
                }

                vma_start_write(vma);
                new->home_node = home_node;
                err = mbind_range(&vmi, vma, &prev, start, end, new);
                /* Drop the reference obtained from mpol_dup() */
                mpol_put(new);
                if (err)
                        break;
        }
        mmap_write_unlock(mm);
        return err;
}

/*
 * mbind() system call entry point: passes all six arguments unchanged to
 * kernel_mbind() and returns its result.
 */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
                unsigned long, mode, const unsigned long __user *, nmask,
                unsigned long, maxnode, unsigned int, flags)
{
        return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}

/*
 * Set the process memory policy.
 *
 * Splits @mode into mode and mode flags, fetches the user nodemask and
 * hands both to do_set_mempolicy().  Returns the first validation error,
 * else do_set_mempolicy()'s result.
 */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
                                 unsigned long maxnode)
{
        int lmode = mode;
        unsigned short mode_flags;
        nodemask_t nodes;
        int err;

        err = sanitize_mpol_flags(&lmode, &mode_flags);
        if (!err)
                err = get_nodes(&nodes, nmask, maxnode);
        if (err)
                return err;

        return do_set_mempolicy(lmode, mode_flags, &nodes);
}

/*
 * set_mempolicy() system call entry point: passes its arguments unchanged
 * to kernel_set_mempolicy() and returns its result.
 */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
                unsigned long, maxnode)
{
        return kernel_set_mempolicy(mode, nmask, maxnode);
}

/*
 * Common body of the migrate_pages() system call: move the pages of the
 * task identified by @pid (the current task when @pid is 0) from the nodes
 * in @old_nodes to the nodes in @new_nodes.
 *
 * Returns the result of do_migrate_pages(), or a negative errno:
 *   -ENOMEM  no nodemask scratch space
 *   -ESRCH   no task with that pid
 *   -EPERM   ptrace access check failed, or the target nodes are outside
 *            the task's allowed mems and the caller lacks CAP_SYS_NICE
 *   -EINVAL  no target node remains after masking with the caller's
 *            allowed mems, or the task has no mm
 * plus any error from get_nodes() or security_task_movememory().
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
                                const unsigned long __user *old_nodes,
                                const unsigned long __user *new_nodes)
{
        struct mm_struct *mm = NULL;
        struct task_struct *task;
        nodemask_t task_nodes;
        int err;
        nodemask_t *old;
        nodemask_t *new;
        NODEMASK_SCRATCH(scratch);

        if (!scratch)
                return -ENOMEM;

        /* Both user masks live in the scratch area */
        old = &scratch->mask1;
        new = &scratch->mask2;

        err = get_nodes(old, old_nodes, maxnode);
        if (err)
                goto out;

        err = get_nodes(new, new_nodes, maxnode);
        if (err)
                goto out;

        /* Find the mm_struct */
        rcu_read_lock();
        task = pid ? find_task_by_vpid(pid) : current;
        if (!task) {
                rcu_read_unlock();
                err = -ESRCH;
                goto out;
        }
        /* Take a task reference; every path below drops it again */
        get_task_struct(task);

        /* Default result for the nodes_empty() bail-out below */
        err = -EINVAL;

        /*
         * Check if this process has the right to modify the specified process.
         * Use the regular "ptrace_may_access()" checks.
         */
        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
                rcu_read_unlock();
                err = -EPERM;
                goto out_put;
        }
        rcu_read_unlock();

        task_nodes = cpuset_mems_allowed(task);
        /* Is the user allowed to access the target nodes? */
        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
                err = -EPERM;
                goto out_put;
        }

        /* Restrict the targets to what the calling task may use */
        task_nodes = cpuset_mems_allowed(current);
        nodes_and(*new, *new, task_nodes);
        if (nodes_empty(*new))
                goto out_put;

        err = security_task_movememory(task);
        if (err)
                goto out_put;

        /* From here on only the mm is needed, so release the task */
        mm = get_task_mm(task);
        put_task_struct(task);

        if (!mm) {
                err = -EINVAL;
                goto out;
        }

        err = do_migrate_pages(mm, old, new,
                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

        mmput(mm);
out:
        NODEMASK_SCRATCH_FREE(scratch);

        return err;

out_put:
        /* Error path taken while the task reference is still held */
        put_task_struct(task);
        goto out;
}

/*
 * migrate_pages() system call entry point: passes its arguments unchanged
 * to kernel_migrate_pages() and returns its result.
 */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
                const unsigned long __user *, old_nodes,
                const unsigned long __user *, new_nodes)
{
        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}

/*
 * Retrieve NUMA policy.
 *
 * Looks the policy up with do_get_mempolicy() and copies the mode to
 * @policy and the node mask to @nmask; either pointer may be NULL.
 *
 * Returns 0 on success, -EINVAL if @nmask is given but @maxnode is
 * smaller than nr_node_ids, -EFAULT if @policy cannot be written, or an
 * error from do_get_mempolicy()/copy_nodes_to_user().
 */
static int kernel_get_mempolicy(int __user *policy,
                                unsigned long __user *nmask,
                                unsigned long maxnode,
                                unsigned long addr,
                                unsigned long flags)
{
        nodemask_t nodes;
        int pval;
        int err;

        /* A supplied mask buffer must cover every possible node id */
        if (nmask && maxnode < nr_node_ids)
                return -EINVAL;

        addr = untagged_addr(addr);

        err = do_get_mempolicy(&pval, &nodes, addr, flags);
        if (err)
                return err;

        if (policy && put_user(pval, policy))
                return -EFAULT;

        if (!nmask)
                return 0;

        return copy_nodes_to_user(nmask, maxnode, &nodes);
}

/*
 * get_mempolicy() system call entry point: passes its arguments unchanged
 * to kernel_get_mempolicy() and returns its result.
 */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
                unsigned long __user *, nmask, unsigned long, maxnode,
                unsigned long, addr, unsigned long, flags)
{
        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}

/*
 * Tell whether the pages mapped by @vma are candidates for migration.
 * Returns false as soon as one of the checks below rules it out.
 */
bool vma_migratable(struct vm_area_struct *vma)
{
        if (vma->vm_flags & (VM_IO | VM_PFNMAP))
                return false;

        /*
         * DAX device mappings require predictable access latency, so avoid
         * incurring periodic faults.
         */
        if (vma_is_dax(vma))
                return false;

        if (is_vm_hugetlb_page(vma) &&
            !hugepage_migration_supported(hstate_vma(vma)))
                return false;

        /* Without a backing file the zone check below does not apply */
        if (!vma->vm_file)
                return true;

        /*
         * Migration allocates pages in the highest zone. If we cannot
         * do so then migration (at least from node to node) is not
         * possible.
         */
        return gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) >= policy_zone;
}

/*
 * Return the policy attached to @vma for @addr: the result of the VMA's
 * get_policy() operation when it provides one, else vma->vm_policy.
 * *@ilx is reset to 0 first; get_policy() may then update it.
 */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
                                   unsigned long addr, pgoff_t *ilx)
{
        *ilx = 0;

        if (vma->vm_ops && vma->vm_ops->get_policy)
                return vma->vm_ops->get_policy(vma, addr, ilx);

        return vma->vm_policy;
}

/*
 * get_vma_policy(@vma, @addr, @order, @ilx)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 * @order: 0, or appropriate huge_page_order for interleaving
 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
 *       MPOL_WEIGHTED_INTERLEAVE
 *
 * Returns the effective policy for a VMA at the given address.  When the
 * VMA has none, the policy from get_task_policy(current) is used instead.
 *
 * Shared policies [those marked as MPOL_F_SHARED] carry an extra reference
 * count, taken by the get_policy() vm_op as appropriate, which protects
 * them against being freed by another task.  Dropping that extra reference
 * is the caller's responsibility.
 */
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                                 unsigned long addr, int order, pgoff_t *ilx)
{
        struct mempolicy *pol = __get_vma_policy(vma, addr, ilx);

        if (!pol)
                pol = get_task_policy(current);

        switch (pol->mode) {
        case MPOL_INTERLEAVE:
        case MPOL_WEIGHTED_INTERLEAVE:
                /* Index in units of 2^order pages: file offset + VMA offset */
                *ilx += vma->vm_pgoff >> order;
                *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
                break;
        default:
                break;
        }

        return pol;
}

/*
 * Report whether the policy in effect for @vma has MPOL_F_MOF set.
 *
 * With a get_policy() vm_op the policy is looked up at vma->vm_start and
 * released again through mpol_cond_put().  Otherwise vma->vm_policy is
 * used, falling back to get_task_policy(current).
 */
bool vma_policy_mof(struct vm_area_struct *vma)
{
        struct mempolicy *pol;
        pgoff_t ilx;                /* filled in by get_policy(), not used */
        bool mof;

        if (!vma->vm_ops || !vma->vm_ops->get_policy) {
                pol = vma->vm_policy;
                if (!pol)
                        pol = get_task_policy(current);

                return pol->flags & MPOL_F_MOF;
        }

        pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
        mof = pol && (pol->flags & MPOL_F_MOF);
        mpol_cond_put(pol);

        return mof;
}

bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
        enum zone_type dynamic_policy_zone = policy_zone;

        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

        /*
         * if policy->nodes has movable memory only,
         * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
         *
         * policy->nodes is intersect with node_states[N_MEMORY].
         * so if the following test fails, it implies
         * policy->nodes has movable memory only.
         */
        if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
                dynamic_policy_zone = ZONE_MOVABLE;

        return zone >= dynamic_policy_zone;
}

/*
 * Pick the node for the current task's next weighted-interleave
 * allocation.  The task stays on current->il_prev while it has weight left
 * there; otherwise it advances to the next node in policy->nodes and
 * reloads the weight.  Each call consumes one unit of weight.
 *
 * Returns the chosen node, or MAX_NUMNODES if no next node was found.
 */
static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
        unsigned int nid;
        unsigned int seq;

        for (;;) {
                /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
                seq = read_mems_allowed_begin();
                nid = current->il_prev;

                /* Weight left on a node that is still in the policy: stay */
                if (current->il_weight && node_isset(nid, policy->nodes))
                        break;

                nid = next_node_in(nid, policy->nodes);
                if (read_mems_allowed_retry(seq))
                        continue;
                if (nid == MAX_NUMNODES)
                        return nid;

                current->il_prev = nid;
                current->il_weight = get_il_weight(nid);
                break;
        }

        current->il_weight--;
        return nid;
}

/*
 * Do dynamic interleaving for a process: return the node following
 * current->il_prev in policy->nodes and, when that is a valid node id,
 * remember it as the new il_prev.
 */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
        unsigned int next;
        unsigned int seq;

        /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
        for (;;) {
                seq = read_mems_allowed_begin();
                next = next_node_in(current->il_prev, policy->nodes);
                if (!read_mems_allowed_retry(seq))
                        break;
        }

        if (next >= MAX_NUMNODES)
                return next;

        current->il_prev = next;
        return next;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 *
 * Outside task context, or when the task has no policy, this is the node
 * reported by numa_mem_id().
 */
unsigned int mempolicy_slab_node(void)
{
        struct mempolicy *policy;
        struct zonelist *zonelist;
        struct zoneref *z;
        int node = numa_mem_id();

        if (!in_task())
                return node;

        policy = current->mempolicy;
        if (!policy)
                return node;

        switch (policy->mode) {
        case MPOL_LOCAL:
                return node;

        case MPOL_PREFERRED:
                return first_node(policy->nodes);

        case MPOL_INTERLEAVE:
                return interleave_nodes(policy);

        case MPOL_WEIGHTED_INTERLEAVE:
                return weighted_interleave_nodes(policy);

        case MPOL_BIND:
        case MPOL_PREFERRED_MANY:
                /* Handled after the switch */
                break;

        default:
                BUG();
        }

        /*
         * Follow bind policy behavior and start allocation at the
         * first node.
         */
        zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
        z = first_zones_zonelist(zonelist, gfp_zone(GFP_KERNEL),
                                 &policy->nodes);
        if (z->zone)
                return zone_to_nid(z->zone);

        return node;
}

/*
 * Take a local snapshot of @pol->nodes in @mask and return the number of
 * nodes set in that snapshot.
 */
static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
                                              nodemask_t *mask)
{
        /*
         * barrier stabilizes the nodemask locally so that it can be iterated
         * over safely without concern for changes. Allocators validate node
         * selection does not violate mems_allowed, so this is safe.
         */
        barrier();
        memcpy(mask, &pol->nodes, sizeof(nodemask_t));
        barrier();
        /* Count the copy, not pol->nodes, so the result matches *mask */
        return nodes_weight(*mask);
}

/*
 * Do static weighted interleaving for interleave index @ilx: reduce @ilx
 * modulo the summed weight of the nodes in pol->nodes, then walk the nodes
 * in order until that offset falls inside a node's weight.
 *
 * A node counts with weight 1 when there is no weight table or its table
 * entry is 0.  Returns numa_node_id() if the nodemask snapshot is empty.
 */
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
        nodemask_t nodemask;
        unsigned int target, nr_nodes;
        unsigned int weight_total = 0;
        u8 *table;
        u8 weight;
        int nid;

        nr_nodes = read_once_policy_nodemask(pol, &nodemask);
        if (!nr_nodes)
                return numa_node_id();

        rcu_read_lock();
        table = rcu_dereference(iw_table);

        /* First pass: sum the effective weights */
        for_each_node_mask(nid, nodemask) {
                /* detect system default usage */
                weight = table ? table[nid] : 1;
                if (!weight)
                        weight = 1;
                weight_total += weight;
        }

        /* Second pass: find the node the offset lands on */
        target = ilx % weight_total;
        for (nid = first_node(nodemask); target;
             nid = next_node_in(nid, nodemask)) {
                /* detect system default usage */
                weight = table ? table[nid] : 1;
                if (!weight)
                        weight = 1;
                if (target < weight)
                        break;
                target -= weight;
        }
        rcu_read_unlock();

        return nid;
}

/*
 * Do static interleaving for interleave index @ilx.  Returns the ilx'th
 * node in pol->nodes (counting from ilx=0), wrapping around when ilx
 * exceeds the number of nodes present.  Returns numa_node_id() if the
 * nodemask snapshot is empty.
 */
static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
        nodemask_t nodemask;
        unsigned int nnodes, steps;
        int nid;

        nnodes = read_once_policy_nodemask(pol, &nodemask);
        if (!nnodes)
                return numa_node_id();

        /* Step forward from the first node, once per remaining index */
        steps = ilx % nnodes;
        nid = first_node(nodemask);
        while (steps--)
                nid = next_node(nid, nodemask);

        return nid;
}

/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation, together with preferred node id (or the input node id).
 *
 * The result is NULL when the policy imposes no nodemask filter.  *@nid is
 * only overwritten where the policy mode calls for it.
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
                                   pgoff_t ilx, int *nid)
{
        /* No usable index: fall back to the task's running interleave state */
        const bool dynamic = (ilx == NO_INTERLEAVE_INDEX);
        nodemask_t *allowed = NULL;

        switch (pol->mode) {
        case MPOL_PREFERRED:
                /* Override input node id */
                *nid = first_node(pol->nodes);
                break;

        case MPOL_PREFERRED_MANY:
                allowed = &pol->nodes;
                if (pol->home_node != NUMA_NO_NODE)
                        *nid = pol->home_node;
                break;

        case MPOL_BIND:
                /* Restrict to nodemask (but not on lower zones) */
                if (apply_policy_zone(pol, gfp_zone(gfp)) &&
                    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
                        allowed = &pol->nodes;
                if (pol->home_node != NUMA_NO_NODE)
                        *nid = pol->home_node;
                /*
                 * __GFP_THISNODE shouldn't even be used with the bind policy
                 * because we might easily break the expectation to stay on the
                 * requested node and not break the policy.
                 */
                WARN_ON_ONCE(gfp & __GFP_THISNODE);
                break;

        case MPOL_INTERLEAVE:
                /* Override input node id */
                if (dynamic)
                        *nid = interleave_nodes(pol);
                else
                        *nid = interleave_nid(pol, ilx);
                break;

        case MPOL_WEIGHTED_INTERLEAVE:
                /* Override input node id */
                if (dynamic)
                        *nid = weighted_interleave_nodes(pol);
                else
                        *nid = weighted_interleave_nid(pol, ilx);
                break;
        }

        return allowed;
}

#ifdef CONFIG_HUGETLBFS
/*
 * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: out: the mempolicy found for @vma/@addr, for the caller's
 *        conditional unref after allocation
 * @nodemask: out: nodemask to filter the allocation with, as chosen by
 *            policy_nodemask() ('bind' and 'prefer-many' policies), or NULL
 *
 * Returns the node id a huge page allocation should start from: the
 * local node unless the policy overrides it.
 */
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
                struct mempolicy **mpol, nodemask_t **nodemask)
{
        int nid = numa_node_id();
        struct mempolicy *policy;
        pgoff_t ilx;

        policy = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
        *mpol = policy;
        *nodemask = policy_nodemask(gfp_flags, policy, ilx, &nid);

        return nid;
}

/*
 * init_nodemask_of_mempolicy
 *
 * Fill @mask from the current task's mempolicy.  Returns 'false' without
 * touching @mask when @mask is NULL or the task has no mempolicy (i.e. the
 * "default" policy).  Otherwise returns 'true' after storing either the
 * policy's nodemask ('preferred', 'preferred-many', 'bind', 'interleave',
 * 'weighted-interleave') or the single local node ('local') in @mask.
 *
 * The mempolicy is not reference counted [mpol_get/put] here: the current
 * task is examining its own mempolicy, and a task's mempolicy is only ever
 * changed by the task itself.
 *
 * N.B., @mask is storage supplied by the caller, who remains responsible
 * for it.
 */
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
        struct mempolicy *pol;

        if (!mask || !current->mempolicy)
                return false;

        task_lock(current);
        pol = current->mempolicy;
        switch (pol->mode) {
        case MPOL_LOCAL:
                init_nodemask_of_node(mask, numa_node_id());
                break;

        case MPOL_PREFERRED:
        case MPOL_PREFERRED_MANY:
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
        case MPOL_WEIGHTED_INTERLEAVE:
                *mask = pol->nodes;
                break;

        default:
                BUG();
        }
        task_unlock(current);

        return true;
}
#endif

/*
 * mempolicy_in_oom_domain
 *
 * Only a "bind" policy narrows the answer: for such a task the result is
 * whether its policy nodemask intersects @mask.  Every other case (NULL
 * @mask, no mempolicy, or any non-bind policy such as "interleave", whose
 * tasks may have memory on any node in the system) yields true.
 *
 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
 */
bool mempolicy_in_oom_domain(struct task_struct *tsk,
                                        const nodemask_t *mask)
{
        struct mempolicy *pol;
        bool in_domain;

        if (!mask)
                return true;

        task_lock(tsk);
        pol = tsk->mempolicy;
        if (pol && pol->mode == MPOL_BIND)
                in_domain = nodes_intersects(pol->nodes, *mask);
        else
                in_domain = true;
        task_unlock(tsk);

        return in_domain;
}

/*
 * Two-pass allocation for MPOL_PREFERRED_MANY.
 *
 * Pass one is restricted to @nodemask and is allowed to fail cheaply: it
 * runs with __GFP_NOWARN added and with __GFP_DIRECT_RECLAIM and
 * __GFP_NOFAIL removed.  If it yields nothing, pass two retries with the
 * caller's original @gfp and no nodemask restriction.
 */
static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
                                                int nid, nodemask_t *nodemask)
{
        gfp_t first_pass_gfp;
        struct page *page;

        first_pass_gfp = gfp | __GFP_NOWARN;
        first_pass_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);

        page = __alloc_pages_noprof(first_pass_gfp, order, nid, nodemask);
        if (page)
                return page;

        return __alloc_pages_noprof(gfp, order, nid, NULL);
}

/**
 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
 * @gfp: GFP flags.
 * @order: Order of the page allocation.
 * @pol: Pointer to the NUMA mempolicy.
 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
 * @nid: Preferred node (usually numa_node_id() but @pol may override it).
 *
 * Return: The page on success or NULL if allocation fails.
 */
struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
                struct mempolicy *pol, pgoff_t ilx, int nid)
{
        nodemask_t *nodemask = policy_nodemask(gfp, pol, ilx, &nid);
        struct page *page;

        if (pol->mode == MPOL_PREFERRED_MANY)
                return alloc_pages_preferred_many(gfp, order, nid, nodemask);

        /*
         * PMD-order ("hugepage") request that did not come from
         * alloc_pages(), under a non-interleave policy that allows the
         * current (or otherwise explicitly preferred) node: first try that
         * node only, without falling back to other nodes, as the cost of
         * remote accesses would likely offset THP benefits.  Interleave
         * policies, and policies whose nodemask excludes @nid, skip this
         * and allocate the standard way below.
         */
        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
            order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX &&
            pol->mode != MPOL_INTERLEAVE &&
            pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
            (!nodemask || node_isset(nid, *nodemask))) {
                /* Local node only; don't reclaim unnecessarily, just compact */
                page = __alloc_pages_node_noprof(nid,
                        gfp | __GFP_THISNODE | __GFP_NORETRY, order);
                if (page || !(gfp & __GFP_DIRECT_RECLAIM))
                        return page;
                /*
                 * Direct reclaim is permitted (hugepage allocations
                 * configured to always synchronously compact, or a vma
                 * madvised to prefer hugepage backing): retry below,
                 * allowing remote memory with both reclaim and compaction.
                 */
        }

        page = __alloc_pages_noprof(gfp, order, nid, nodemask);
        if (!page)
                return NULL;

        if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
                /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
                if (static_branch_likely(&vm_numa_stat_key) &&
                    page_to_nid(page) == nid) {
                        preempt_disable();
                        __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
                        preempt_enable();
                }
        }

        return page;
}

/**
 * vma_alloc_folio - Allocate a folio for a VMA.
 * @gfp: GFP flags.
 * @order: Order of the folio.
 * @vma: Pointer to VMA.
 * @addr: Virtual address of the allocation.  Must be inside @vma.
 * @hugepage: Unused (was: For hugepages try only preferred node if possible).
 *
 * Looks up the NUMA policy that applies to @addr in @vma and allocates a
 * compound page of the given @order under it.  The caller must hold the
 * mmap_lock of the mm_struct of the VMA to prevent it from going away.
 * Should be used for all allocations for folios that will be mapped into
 * user space, excepting hugetlbfs, and excepting where direct use of
 * alloc_pages_mpol() is more appropriate.
 *
 * Return: The folio on success or NULL if allocation fails.
 */
struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
                unsigned long addr, bool hugepage)
{
        struct mempolicy *policy;
        struct page *head;
        pgoff_t ilx;

        policy = get_vma_policy(vma, addr, order, &ilx);
        head = alloc_pages_mpol_noprof(gfp | __GFP_COMP, order,
                                       policy, ilx, numa_node_id());
        /* Release the policy reference now that the allocation is done */
        mpol_cond_put(policy);

        return page_rmappable_folio(head);
}
EXPORT_SYMBOL(vma_alloc_folio_noprof);

/**
 * alloc_pages - Allocate pages.
 * @gfp: GFP flags.
 * @order: Power of two of number of pages to allocate.
 *
 * Allocate 1 << @order contiguous pages.  The physical address of the
 * first page is naturally aligned (eg an order-3 allocation will be aligned
 * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
 * process is honoured when in process context and __GFP_THISNODE is not
 * set; otherwise the system default policy is used.
 *
 * Context: Can be called from any context, providing the appropriate GFP
 * flags are used.
 * Return: The page on success or NULL if allocation fails.
 */
struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
{
        struct mempolicy *pol;

        /*
         * Neither current->mempolicy nor the system default_policy
         * needs reference counting here.
         */
        if (in_interrupt() || (gfp & __GFP_THISNODE))
                pol = &default_policy;
        else
                pol = get_task_policy(current);

        return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX,
                                       numa_node_id());
}
EXPORT_SYMBOL(alloc_pages_noprof);

/*
 * Allocate a compound page of the given @order via alloc_pages_noprof()
 * (with __GFP_COMP added to @gfp) and hand it to page_rmappable_folio().
 */
struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
        struct page *head;

        head = alloc_pages_noprof(gfp | __GFP_COMP, order);
        return page_rmappable_folio(head);
}
EXPORT_SYMBOL(folio_alloc_noprof);

/*
 * Bulk allocation for MPOL_INTERLEAVE: split @nr_pages as evenly as
 * possible over the nodes of @pol, one bulk request per node.  Each node
 * is asked for nr_pages / nodes pages, and the first (nr_pages % nodes)
 * requests ask for one extra.  The node for each request comes from
 * interleave_nodes().  Returns the total number of pages the bulk
 * allocator reported, which may be short of @nr_pages.
 */
static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
                struct mempolicy *pol, unsigned long nr_pages,
                struct page **page_array)
{
        int nodes = nodes_weight(pol->nodes);
        unsigned long per_node = nr_pages / nodes;
        int extra = nr_pages - nodes * per_node;
        unsigned long total = 0;
        int i;

        for (i = 0; i < nodes; i++) {
                unsigned long want = per_node;
                unsigned long got;

                /* Hand out the remainder one page at a time */
                if (extra) {
                        want++;
                        extra--;
                }

                got = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol),
                                              NULL, want, NULL, page_array);
                page_array += got;
                total += got;
        }

        return total;
}

/*
 * Bulk allocation for MPOL_WEIGHTED_INTERLEAVE.
 *
 * Fills @page_array with up to @nr_pages pages, spreading the requests over
 * the nodes of @pol in proportion to their weights.  The work is done in
 * two phases:
 *
 *  1. If the task still has weight left on its most recent node
 *     (current->il_prev / current->il_weight) and that node is in the
 *     policy nodemask, allocate from it first.  If that covers the whole
 *     request, the leftover weight is written back and we return.
 *  2. Otherwise split the remaining pages into whole rounds over all nodes
 *     plus a partial round (delta), and issue one __alloc_pages_bulk() call
 *     per node, starting with the node after il_prev.
 *
 * On the normal exit path current->il_prev and current->il_weight are set
 * to the node and remaining weight at which the partial round ended.
 *
 * Returns the sum of the page counts reported by __alloc_pages_bulk(),
 * which may be less than @nr_pages.
 */
static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
                struct mempolicy *pol, unsigned long nr_pages,
                struct page **page_array)
{
        struct task_struct *me = current;
        unsigned int cpuset_mems_cookie;
        unsigned long total_allocated = 0;
        unsigned long nr_allocated = 0;
        unsigned long rounds;
        unsigned long node_pages, delta;
        u8 *table, *weights, weight;
        unsigned int weight_total = 0;
        unsigned long rem_pages = nr_pages;
        nodemask_t nodes;
        int nnodes, node;
        /* both resume_* initializers are overwritten before they are read */
        int resume_node = MAX_NUMNODES - 1;
        u8 resume_weight = 0;
        int prev_node;
        int i;

        if (!nr_pages)
                return 0;

        /* read the nodes onto the stack, retry if done during rebind */
        do {
                cpuset_mems_cookie = read_mems_allowed_begin();
                nnodes = read_once_policy_nodemask(pol, &nodes);
        } while (read_mems_allowed_retry(cpuset_mems_cookie));

        /* if the nodemask has become invalid, we cannot do anything */
        if (!nnodes)
                return 0;

        /* Continue allocating from most recent node and adjust the nr_pages */
        node = me->il_prev;
        weight = me->il_weight;
        if (weight && node_isset(node, nodes)) {
                node_pages = min(rem_pages, weight);
                nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
                                                  NULL, page_array);
                page_array += nr_allocated;
                total_allocated += nr_allocated;
                /* if that's all the pages, no need to interleave */
                if (rem_pages <= weight) {
                        me->il_weight -= rem_pages;
                        return total_allocated;
                }
                /*
                 * Otherwise we adjust remaining pages, continue from there.
                 * Note the full weight is subtracted, whatever count the
                 * bulk allocator actually reported.
                 */
                rem_pages -= weight;
        }
        /* clear active weight in case of an allocation failure */
        me->il_weight = 0;
        prev_node = node;

        /* create a local copy of node weights to operate on outside rcu */
        weights = kzalloc(nr_node_ids, GFP_KERNEL);
        if (!weights)
                return total_allocated;

        /* no table installed: weights stays all-zero and defaults below */
        rcu_read_lock();
        table = rcu_dereference(iw_table);
        if (table)
                memcpy(weights, table, nr_node_ids);
        rcu_read_unlock();

        /* calculate total, detect system default usage (zero weight => 1) */
        for_each_node_mask(node, nodes) {
                if (!weights[node])
                        weights[node] = 1;
                weight_total += weights[node];
        }

        /*
         * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
         * Track which node weighted interleave should resume from.
         *
         * if (rounds > 0) and (delta == 0), resume_node will always be
         * the node following prev_node and its weight.
         */
        rounds = rem_pages / weight_total;
        delta = rem_pages % weight_total;
        resume_node = next_node_in(prev_node, nodes);
        resume_weight = weights[resume_node];
        for (i = 0; i < nnodes; i++) {
                node = next_node_in(prev_node, nodes);
                weight = weights[node];
                /* every node gets its weight once per whole round */
                node_pages = weight * rounds;
                /* If a delta exists, add this node's portion of the delta */
                if (delta > weight) {
                        node_pages += weight;
                        delta -= weight;
                } else if (delta) {
                        /* when delta is depleted, resume from that node */
                        node_pages += delta;
                        resume_node = node;
                        resume_weight = weight - delta;
                        delta = 0;
                }
                /* node_pages can be 0 if an allocation fails and rounds == 0 */
                if (!node_pages)
                        break;
                nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
                                                  NULL, page_array);
                page_array += nr_allocated;
                total_allocated += nr_allocated;
                if (total_allocated == nr_pages)
                        break;
                prev_node = node;
        }
        /* publish where the next weighted-interleave allocation resumes */
        me->il_prev = resume_node;
        me->il_weight = resume_weight;
        kfree(weights);
        return total_allocated;
}

/*
 * Bulk allocation for MPOL_PREFERRED_MANY, in two passes.
 *
 * Pass one targets @nid restricted to the policy nodemask, with
 * __GFP_NOWARN added and __GFP_DIRECT_RECLAIM / __GFP_NOFAIL removed.
 * If it comes up short, pass two requests the missing pages from
 * numa_node_id() with the caller's @gfp and no nodemask.  Returns the
 * combined count reported by both passes.
 */
static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
                struct mempolicy *pol, unsigned long nr_pages,
                struct page **page_array)
{
        gfp_t first_pass_gfp;
        unsigned long got;

        first_pass_gfp = gfp | __GFP_NOWARN;
        first_pass_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);

        got = alloc_pages_bulk_noprof(first_pass_gfp, nid, &pol->nodes,
                                      nr_pages, NULL, page_array);
        if (got >= nr_pages)
                return got;

        /* Top up the shortfall, appending after what pass one filled */
        got += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
                                       nr_pages - got, NULL,
                                       page_array + got);
        return got;
}

/*
 * Bulk page allocation that honours the task mempolicy.  Some callers,
 * such as vmalloc, need both bulk allocation and mempolicy at once; this
 * is especially helpful for speeding up interleaved allocations.
 *
 * The task policy is used unless we are in interrupt context or
 * __GFP_THISNODE is set, in which case the system default policy applies.
 * Interleave, weighted-interleave and preferred-many policies have their
 * own bulk helpers; every other mode goes through policy_nodemask() and a
 * single bulk request.
 */
unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
                unsigned long nr_pages, struct page **page_array)
{
        struct mempolicy *pol;
        nodemask_t *nodemask;
        int nid;

        if (in_interrupt() || (gfp & __GFP_THISNODE))
                pol = &default_policy;
        else
                pol = get_task_policy(current);

        switch (pol->mode) {
        case MPOL_INTERLEAVE:
                return alloc_pages_bulk_array_interleave(gfp, pol,
                                                         nr_pages, page_array);
        case MPOL_WEIGHTED_INTERLEAVE:
                return alloc_pages_bulk_array_weighted_interleave(
                                  gfp, pol, nr_pages, page_array);
        case MPOL_PREFERRED_MANY:
                return alloc_pages_bulk_array_preferred_many(gfp,
                                numa_node_id(), pol, nr_pages, page_array);
        default:
                break;
        }

        nid = numa_node_id();
        nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
        return alloc_pages_bulk_noprof(gfp, nid, nodemask,
                                       nr_pages, NULL, page_array);
}

/*
 * Give @dst its own duplicate of @src's vm_policy.  Returns 0 on success;
 * if mpol_dup() returns an error pointer, that error code is returned and
 * @dst->vm_policy is left untouched.
 */
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
        struct mempolicy *copy;

        copy = mpol_dup(src->vm_policy);
        if (!IS_ERR(copy)) {
                dst->vm_policy = copy;
                return 0;
        }

        return PTR_ERR(copy);
}

/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
 */

/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

        if (!new)
                return ERR_PTR(-ENOMEM);

        /* task's mempolicy is protected by alloc_lock */
        if (old == current->mempolicy) {
                task_lock(current);
                *new = *old;
                task_unlock(current);
        } else
                *new = *old;

        if (current_cpuset_is_being_rebound()) {
                nodemask_t mems = cpuset_mems_allowed(current);
                mpol_rebind_policy(new, &mems);
        }
        atomic_set(&new->refcnt, 1);
        return new;
}

/*
 * Slow path of a mempolicy comparison.
 *
 * Two policies are equal when both are non-NULL, their mode, flags and
 * home_node match, their user nodemasks match (only checked when
 * mpol_store_user_nodemask() says @a keeps one), and, for every mode
 * except MPOL_LOCAL, their nodemasks match.
 */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b)
                return false;

        if (a->mode != b->mode ||
            a->flags != b->flags ||
            a->home_node != b->home_node)
                return false;

        if (mpol_store_user_nodemask(a) &&
            !nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
                return false;

        switch (a->mode) {
        case MPOL_LOCAL:
                /* No nodemask to compare for the local policy */
                return true;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
        case MPOL_PREFERRED:
        case MPOL_PREFERRED_MANY:
        case MPOL_WEIGHTED_INTERLEAVE:
                return !!nodes_equal(a->nodes, b->nodes);
        default:
                BUG();
                return false;
        }
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */

/*
 * Find the lowest-positioned node in the shared policy tree whose range
 * intersects [start, end), or NULL when none does.  Caller holds sp->lock
 * for reading or for writing.
 */
static struct sp_node *sp_lookup(struct shared_policy *sp,
                                        pgoff_t start, pgoff_t end)
{
        struct rb_node *cur = sp->root.rb_node;
        struct rb_node *prev;

        /* Descend until some node overlapping the range is found */
        while (cur) {
                struct sp_node *ent = rb_entry(cur, struct sp_node, nd);

                if (start >= ent->end)
                        cur = cur->rb_right;
                else if (end <= ent->start)
                        cur = cur->rb_left;
                else
                        break;
        }
        if (!cur)
                return NULL;

        /* Step back through predecessors while they still reach past start */
        for (prev = rb_prev(cur); prev; prev = rb_prev(cur)) {
                struct sp_node *before = rb_entry(prev, struct sp_node, nd);

                if (before->end <= start)
                        break;
                cur = prev;
        }

        return rb_entry(cur, struct sp_node, nd);
}

/*
 * Link @new into the shared policy rb-tree.  Caller holds sp->lock for
 * writing.  BUG()s if, during the descent, @new neither starts before nor
 * ends after a node it is compared with.
 */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **link = &sp->root.rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct sp_node *cur;

                parent = *link;
                cur = rb_entry(parent, struct sp_node, nd);

                if (new->start < cur->start)
                        link = &parent->rb_left;
                else if (new->end > cur->end)
                        link = &parent->rb_right;
                else
                        BUG();
        }

        rb_link_node(&new->nd, parent, link);
        rb_insert_color(&new->nd, &sp->root);
}

/*
 * Find the shared policy covering page index @idx.  Returns the policy
 * after mpol_get() has been called on it, or NULL when the tree is empty
 * or no range covers @idx.
 */
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
                                                pgoff_t idx)
{
        struct mempolicy *found = NULL;
        struct sp_node *node;

        /* Unlocked check: an empty tree means there is nothing to find */
        if (!sp->root.rb_node)
                return NULL;

        read_lock(&sp->lock);
        node = sp_lookup(sp, idx, idx + 1);
        if (node) {
                found = node->policy;
                mpol_get(found);
        }
        read_unlock(&sp->lock);

        return found;
}

/* Drop the node's policy reference, then return the node to sn_cache. */
static void sp_free(struct sp_node *n)
{
        struct mempolicy *pol = n->policy;

        mpol_put(pol);
        kmem_cache_free(sn_cache, n);
}

/**
 * mpol_misplaced - check whether current folio node is valid in policy
 *
 * @folio: folio to be checked
 * @vmf: structure describing the fault; supplies the vma (vmf->vma) and the
 *       page table lock (vmf->ptl), which must be held
 * @addr: virtual address in vmf->vma for shared policy lookup and interleave
 *        policy
 *
 * Lookup current policy node id for vma,addr and "compare to" folio's
 * node id.  Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 *
 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
 * policy, or a suitable node ID to allocate a replacement folio from.
 */
int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
                   unsigned long addr)
{
        struct mempolicy *pol;
        pgoff_t ilx;
        struct zoneref *z;
        int curnid = folio_nid(folio);
        struct vm_area_struct *vma = vmf->vma;
        int thiscpu = raw_smp_processor_id();
        int thisnid = numa_node_id();
        int polnid = NUMA_NO_NODE;
        int ret = NUMA_NO_NODE;

        /*
         * Make sure ptl is held so that we don't preempt and we
         * have a stable smp processor id
         */
        lockdep_assert_held(vmf->ptl);
        pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
        /* policy without MPOL_F_MOF: report the folio as not misplaced */
        if (!(pol->flags & MPOL_F_MOF))
                goto out;

        /* work out polnid, the node the policy would place this folio on */
        switch (pol->mode) {
        case MPOL_INTERLEAVE:
                polnid = interleave_nid(pol, ilx);
                break;

        case MPOL_WEIGHTED_INTERLEAVE:
                polnid = weighted_interleave_nid(pol, ilx);
                break;

        case MPOL_PREFERRED:
                /* already on a node in the policy nodemask: nothing to do */
                if (node_isset(curnid, pol->nodes))
                        goto out;
                polnid = first_node(pol->nodes);
                break;

        case MPOL_LOCAL:
                polnid = numa_node_id();
                break;

        case MPOL_BIND:
        case MPOL_PREFERRED_MANY:
                /*
                 * Even though MPOL_PREFERRED_MANY can allocate pages outside
                 * policy nodemask we don't allow numa migration to nodes
                 * outside policy nodemask for now. This is done so that if we
                 * want demotion to slow memory to happen, before allocating
                 * from some DRAM node say 'x', we will end up using a
                 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
                 * we should not promote to node 'x' from slow memory node.
                 */
                if (pol->flags & MPOL_F_MORON) {
                        /*
                         * Optimize placement among multiple nodes
                         * via NUMA balancing.  Only proceed (to the
                         * MPOL_F_MORON handling after the switch) when the
                         * local node is in the policy nodemask.
                         */
                        if (node_isset(thisnid, pol->nodes))
                                break;
                        goto out;
                }

                /*
                 * use current page if in policy nodemask,
                 * else select nearest allowed node, if any.
                 * If no allowed nodes, use current [!misplaced].
                 */
                if (node_isset(curnid, pol->nodes))
                        goto out;
                z = first_zones_zonelist(
                                node_zonelist(thisnid, GFP_HIGHUSER),
                                gfp_zone(GFP_HIGHUSER),
                                &pol->nodes);
                polnid = zone_to_nid(z->zone);
                break;

        default:
                BUG();
        }

        /*
         * Migrate the folio towards the node whose CPU is referencing it.
         * This replaces whatever polnid the switch above selected.
         */
        if (pol->flags & MPOL_F_MORON) {
                polnid = thisnid;

                if (!should_numa_migrate_memory(current, folio, curnid,
                                                thiscpu))
                        goto out;
        }

        /* only report a target node when it differs from the current one */
        if (curnid != polnid)
                ret = polnid;
out:
        mpol_cond_put(pol);

        return ret;
}

/*
 * Detach @task's mempolicy and drop the (possibly final) reference to it.
 * The pointer is cleared under task_lock() first and the reference is
 * dropped only afterwards, so that any allocation done as part of its
 * kmem_cache_free(), such as by KASAN, doesn't reference a freed policy.
 */
void mpol_put_task_policy(struct task_struct *task)
{
        struct mempolicy *old;

        task_lock(task);
        old = task->mempolicy;
        task->mempolicy = NULL;
        task_unlock(task);

        mpol_put(old);
}

/* Unlink @n from the shared policy rb-tree and free it via sp_free(). */
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        struct rb_root *tree = &sp->root;

        rb_erase(&n->nd, tree);
        sp_free(n);
}

/*
 * Fill in a shared policy node: it covers [start, end) and carries @pol.
 * The rb-tree linkage of @node is not touched here.
 */
static void sp_node_init(struct sp_node *node, unsigned long start,
                        unsigned long end, struct mempolicy *pol)
{
        node->policy = pol;
        node->start = start;
        node->end = end;
}

/*
 * Allocate a shared policy node covering [start, end) that carries its own
 * duplicate of @pol, flagged MPOL_F_SHARED.  Returns NULL if either the
 * node allocation or the policy duplication fails.
 */
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
                                struct mempolicy *pol)
{
        struct sp_node *node;
        struct mempolicy *copy;

        node = kmem_cache_alloc(sn_cache, GFP_KERNEL);
        if (!node)
                return NULL;

        copy = mpol_dup(pol);
        if (!IS_ERR(copy)) {
                copy->flags |= MPOL_F_SHARED;
                sp_node_init(node, start, end, copy);
                return node;
        }

        /* Duplication failed: give the node back */
        kmem_cache_free(sn_cache, node);
        return NULL;
}

/*
 * Replace a policy range.
 *
 * Removes or trims every existing node that overlaps [start, end) and then
 * inserts @new (if non-NULL) into the tree.  An old node that extends past
 * both ends of the range has to be split in two, which needs one extra
 * sp_node and one extra mempolicy.  Those are allocated with the lock
 * dropped (see alloc_new) and the whole scan is then restarted.
 *
 * Returns 0 on success or -ENOMEM if the extra objects cannot be allocated.
 */
static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
                                 pgoff_t end, struct sp_node *new)
{
        struct sp_node *n;
        struct sp_node *n_new = NULL;
        struct mempolicy *mpol_new = NULL;
        int ret = 0;

restart:
        write_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                /* fetch the successor first: n may be deleted below */
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        /* old node starts inside the range: drop or trim its head */
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                /* need preallocated objects to split the node */
                                if (!n_new)
                                        goto alloc_new;

                                /* tail piece [end, n->end) gets a copy of the policy */
                                *mpol_new = *n->policy;
                                atomic_set(&mpol_new->refcnt, 1);
                                sp_node_init(n_new, end, n->end, mpol_new);
                                n->end = start;
                                sp_insert(sp, n_new);
                                /* now owned by the tree; keep err_out from freeing them */
                                n_new = NULL;
                                mpol_new = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        write_unlock(&sp->lock);
        ret = 0;

err_out:
        /* release any preallocated objects that ended up unused */
        if (mpol_new)
                mpol_put(mpol_new);
        if (n_new)
                kmem_cache_free(sn_cache, n_new);

        return ret;

alloc_new:
        /*
         * Drop the lock before the GFP_KERNEL allocations (presumably because
         * they may sleep), then rescan from scratch.
         */
        write_unlock(&sp->lock);
        ret = -ENOMEM;
        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
        if (!n_new)
                goto err_out;
        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!mpol_new)
                goto err_out;
        atomic_set(&mpol_new->refcnt, 1);
        goto restart;
}

/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol:  struct mempolicy to install
 *
 * Always resets @sp to an empty tree and initializes its lock.
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() calls and we can use GFP_KERNEL.
 *
 * Failures along the way are not reported: the tree is simply left empty,
 * which means default policy.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
        int ret;

        sp->root = RB_ROOT;                /* empty tree == default mempolicy */
        rwlock_init(&sp->lock);

        if (mpol) {
                struct sp_node *sn;
                struct mempolicy *npol;
                NODEMASK_SCRATCH(scratch);

                if (!scratch)
                        goto put_mpol;

                /* contextualize the tmpfs mount point mempolicy to this file */
                npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
                if (IS_ERR(npol))
                        goto free_scratch; /* no valid nodemask intersection */

                task_lock(current);
                ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
                task_unlock(current);
                if (ret)
                        goto put_npol;

                /* alloc node covering entire file; adds ref to file's npol */
                sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
                if (sn)
                        sp_insert(sp, sn);
                /* labels below unwind in reverse order of acquisition */
put_npol:
                mpol_put(npol);        /* drop initial ref on file's npol */
free_scratch:
                NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
                mpol_put(mpol);        /* drop our incoming ref on sb mpol */
        }
}

/*
 * Apply @pol to the page-offset range of @vma within the shared policy @sp.
 * With a NULL @pol no new node is allocated and the range is only replaced.
 * Returns -ENOMEM if the node cannot be allocated, otherwise whatever
 * shared_policy_replace() returns.
 */
int mpol_set_shared_policy(struct shared_policy *sp,
                        struct vm_area_struct *vma, struct mempolicy *pol)
{
        unsigned long first = vma->vm_pgoff;
        unsigned long last = first + vma_pages(vma);
        struct sp_node *node = NULL;
        int err;

        if (pol) {
                node = sp_alloc(first, last, pol);
                if (!node)
                        return -ENOMEM;
        }

        err = shared_policy_replace(sp, first, last, node);
        if (!err)
                return 0;

        /* replacement failed: release the node allocated above, if any */
        if (node)
                sp_free(node);
        return err;
}

/* Delete every node of an inode's shared policy tree (used on inode delete). */
void mpol_free_shared_policy(struct shared_policy *sp)
{
        struct rb_node *cur, *following;

        /* empty tree: nothing to tear down, skip taking the lock */
        if (!sp->root.rb_node)
                return;

        write_lock(&sp->lock);
        for (cur = rb_first(&sp->root); cur; cur = following) {
                struct sp_node *victim = rb_entry(cur, struct sp_node, nd);

                /* look up the successor before the node is deleted */
                following = rb_next(&victim->nd);
                sp_delete(sp, victim);
        }
        write_unlock(&sp->lock);
}

#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;

/*
 * Choose the boot-time state of automatic NUMA balancing.
 * An explicit numa_balancing= override always wins; without one, the
 * Kconfig default is applied (and logged) only when more than one node is
 * online.
 */
static void __init check_numabalancing_enable(void)
{
        const bool by_default =
                IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED);

        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
        if (numabalancing_override) {
                set_numabalancing_state(numabalancing_override == 1);
                return;
        }

        if (num_online_nodes() > 1) {
                pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
                        by_default ? "Enabling" : "Disabling");
                set_numabalancing_state(by_default);
        }
}

/*
 * Parse the "numa_balancing=" boot parameter.
 * "enable" records an override of 1 and "disable" an override of -1.
 * Returns 1 when the value was recognised; otherwise warns and returns 0,
 * leaving the override untouched.
 */
static int __init setup_numabalancing(char *str)
{
        int choice = 0;

        if (str) {
                if (strcmp(str, "enable") == 0)
                        choice = 1;
                else if (strcmp(str, "disable") == 0)
                        choice = -1;
        }

        if (!choice) {
                pr_warn("Unable to parse numa_balancing=\n");
                return 0;
        }

        numabalancing_override = choice;
        return 1;
}
__setup("numa_balancing=", setup_numabalancing);
#else
/* Built without CONFIG_NUMA_BALANCING: there is no state to set, so no-op. */
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * Boot-time mempolicy setup: create the two slab caches, fill in the
 * per-node preferred policies, install an interleave policy for system
 * init and finally settle the NUMA balancing state.
 */
void __init numa_policy_init(void)
{
        nodemask_t interleave_nodes;
        unsigned long biggest_pages = 0;
        int biggest_nid = 0;
        int nid;

        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL);

        for_each_node(nid) {
                preferred_node_policy[nid] = (struct mempolicy) {
                        .refcnt = ATOMIC_INIT(1),
                        .mode = MPOL_PREFERRED,
                        .flags = MPOL_F_MOF | MPOL_F_MORON,
                        .nodes = nodemask_of_node(nid),
                };
        }

        /*
         * Set interleaving policy for system init. Interleaving is only
         * enabled across suitably sized nodes (default is >= 16MB), or
         * fall back to the largest node if they're all smaller.
         */
        nodes_clear(interleave_nodes);
        for_each_node_state(nid, N_MEMORY) {
                const unsigned long node_pages = node_present_pages(nid);

                /* Remember the largest node seen so far */
                if (node_pages > biggest_pages) {
                        biggest_pages = node_pages;
                        biggest_nid = nid;
                }

                /* Too small to take part in interleaving */
                if ((node_pages << PAGE_SHIFT) < (16 << 20))
                        continue;

                node_set(nid, interleave_nodes);
        }

        /* All too small, use the largest */
        if (unlikely(nodes_empty(interleave_nodes)))
                node_set(biggest_nid, interleave_nodes);

        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
                pr_err("%s: interleaving failed\n", __func__);

        check_numabalancing_enable();
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
        /* MPOL_DEFAULT with no flags and no nodemask; the result is ignored */
        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Parse and format mempolicy from/to strings
 */
/*
 * Printable name of each policy mode, indexed by the MPOL_* mode value.
 * mpol_parse_str() matches user input against these strings and
 * mpol_to_str() prints them, so each name must stay unique.
 */
static const char * const policy_modes[] =
{
        [MPOL_DEFAULT]    = "default",
        [MPOL_PREFERRED]  = "prefer",
        [MPOL_BIND]       = "bind",
        [MPOL_INTERLEAVE] = "interleave",
        [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
        [MPOL_LOCAL]      = "local",
        [MPOL_PREFERRED_MANY]  = "prefer (many)",
};

#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *        <mode>[=<flags>][:<nodelist>]
 *
 * @str is split in place while parsing (the '=' and ':' separators are
 * overwritten with NUL) and is restored before returning, on both success
 * and failure.
 *
 * On success *@mpol is set to the new policy; for the "default" mode no
 * policy is allocated and *@mpol is set to NULL.  On failure *@mpol is left
 * untouched.
 *
 * Return: %0 on success, else %1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
        struct mempolicy *new = NULL;
        unsigned short mode_flags;
        nodemask_t nodes;
        char *nodelist = strchr(str, ':');
        char *flags = strchr(str, '=');
        int err = 1, mode;

        if (flags)
                *flags++ = '\0';        /* terminate mode string */

        if (nodelist) {
                /* NUL-terminate mode or flags string */
                *nodelist++ = '\0';
                if (nodelist_parse(nodelist, nodes))
                        goto out;
                /* every requested node must currently have memory */
                if (!nodes_subset(nodes, node_states[N_MEMORY]))
                        goto out;
        } else
                nodes_clear(nodes);

        /* str now holds only the mode name; look it up in policy_modes[] */
        mode = match_string(policy_modes, MPOL_MAX, str);
        if (mode < 0)
                goto out;

        switch (mode) {
        case MPOL_PREFERRED:
                /*
                 * Insist on a nodelist of one node only, although later
                 * we use first_node(nodes) to grab a single node, so here
                 * nodelist (or nodes) cannot be empty.
                 */
                if (nodelist) {
                        char *rest = nodelist;
                        /* a single node id is digits only: no ',' or '-' */
                        while (isdigit(*rest))
                                rest++;
                        if (*rest)
                                goto out;
                        if (nodes_empty(nodes))
                                goto out;
                }
                break;
        case MPOL_INTERLEAVE:
        case MPOL_WEIGHTED_INTERLEAVE:
                /*
                 * Default to online nodes with memory if no nodelist
                 */
                if (!nodelist)
                        nodes = node_states[N_MEMORY];
                break;
        case MPOL_LOCAL:
                /*
                 * Don't allow a nodelist;  mpol_new() checks flags
                 */
                if (nodelist)
                        goto out;
                break;
        case MPOL_DEFAULT:
                /*
                 * Insist on an empty nodelist.  Success here leaves new ==
                 * NULL, so the caller gets a NULL policy for "default".
                 */
                if (!nodelist)
                        err = 0;
                goto out;
        case MPOL_PREFERRED_MANY:
        case MPOL_BIND:
                /*
                 * Insist on a nodelist
                 */
                if (!nodelist)
                        goto out;
        }

        mode_flags = 0;
        if (flags) {
                /*
                 * Currently, we only support two mutually exclusive
                 * mode flags.
                 */
                if (!strcmp(flags, "static"))
                        mode_flags |= MPOL_F_STATIC_NODES;
                else if (!strcmp(flags, "relative"))
                        mode_flags |= MPOL_F_RELATIVE_NODES;
                else
                        goto out;
        }

        new = mpol_new(mode, mode_flags, &nodes);
        if (IS_ERR(new))
                goto out;

        /*
         * Save nodes for mpol_to_str() to show the tmpfs mount options
         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
         */
        if (mode != MPOL_PREFERRED) {
                new->nodes = nodes;
        } else if (nodelist) {
                nodes_clear(new->nodes);
                node_set(first_node(nodes), new->nodes);
        } else {
                /* "prefer" without a nodelist is recorded as MPOL_LOCAL */
                new->mode = MPOL_LOCAL;
        }

        /*
         * Save nodes for contextualization: this will be used to "clone"
         * the mempolicy in a specific context [cpuset] at a later time.
         */
        new->w.user_nodemask = nodes;

        err = 0;

out:
        /* Restore string for error message */
        if (nodelist)
                *--nodelist = ':';
        if (flags)
                *--flags = '=';
        if (!err)
                *mpol = new;
        return err;
}
#endif /* CONFIG_TMPFS */

/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 *
 * Convert @pol into a string of the form <mode>[=<flags>][:<nodelist>].
 * A NULL @pol, the default policy and policies flagged MPOL_F_MORON are all
 * printed as "default"; an unrecognised mode warns once and prints "unknown".
 *
 * If @buffer is too short, truncate the string.  Every piece is appended
 * with scnprintf(), which returns the number of characters actually stored,
 * so the write position never advances past the end of @buffer.
 *
 * Recommend a @maxlen of at least 42: the longest mode, "weighted
 * interleave", plus the longest flag, "=relative", already need 28
 * characters before the ':' and any node ids.
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
        char *p = buffer;
        nodemask_t nodes = NODE_MASK_NONE;
        unsigned short mode = MPOL_DEFAULT;
        unsigned short flags = 0;

        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
                mode = pol->mode;
                flags = pol->flags;
        }

        switch (mode) {
        case MPOL_DEFAULT:
        case MPOL_LOCAL:
                break;
        case MPOL_PREFERRED:
        case MPOL_PREFERRED_MANY:
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
        case MPOL_WEIGHTED_INTERLEAVE:
                /* mode != MPOL_DEFAULT implies pol passed the check above */
                nodes = pol->nodes;
                break;
        default:
                WARN_ON_ONCE(1);
                scnprintf(p, maxlen, "unknown");
                return;
        }

        p += scnprintf(p, maxlen, "%s", policy_modes[mode]);

        if (flags & MPOL_MODE_FLAGS) {
                p += scnprintf(p, buffer + maxlen - p, "=");

                /*
                 * Currently, the only defined flags are mutually exclusive
                 */
                if (flags & MPOL_F_STATIC_NODES)
                        p += scnprintf(p, buffer + maxlen - p, "static");
                else if (flags & MPOL_F_RELATIVE_NODES)
                        p += scnprintf(p, buffer + maxlen - p, "relative");
        }

        if (!nodes_empty(nodes))
                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
                               nodemask_pr_args(&nodes));
}

#ifdef CONFIG_SYSFS
/*
 * One sysfs attribute of the weighted_interleave group.  The embedded
 * kobj_attribute is what sysfs sees; node_show()/node_store() use
 * container_of() on it to get back to this structure and its node id.
 */
struct iw_node_attr {
        /* attribute registered with sysfs (name, mode, show/store) */
        struct kobj_attribute kobj_attr;
        /* node id whose weight this attribute shows and stores */
        int nid;
};

/* sysfs show handler: emit the interleave weight of this attribute's node. */
static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
                         char *buf)
{
        struct iw_node_attr *na =
                container_of(attr, struct iw_node_attr, kobj_attr);
        u8 weight = get_il_weight(na->nid);

        return sysfs_emit(buf, "%d\n", weight);
}

/*
 * sysfs store handler: set the interleave weight of this attribute's node.
 *
 * Empty input stores a weight of 0; anything else must parse as a u8 or
 * -EINVAL is returned.  The weight table is updated copy-on-write: a new
 * table is built and published under iw_table_lock, and the previous one is
 * freed only after an RCU grace period.  Returns @count on success or
 * -ENOMEM if the new table cannot be allocated.
 */
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
                          const char *buf, size_t count)
{
        struct iw_node_attr *na =
                container_of(attr, struct iw_node_attr, kobj_attr);
        u8 *next_table;
        u8 *prev_table;
        u8 weight = 0;

        /* only non-empty input is handed to the integer parser */
        if (count != 0 && !sysfs_streq(buf, "") && kstrtou8(buf, 0, &weight))
                return -EINVAL;

        next_table = kzalloc(nr_node_ids, GFP_KERNEL);
        if (!next_table)
                return -ENOMEM;

        mutex_lock(&iw_table_lock);
        prev_table = rcu_dereference_protected(iw_table,
                                        lockdep_is_held(&iw_table_lock));
        /* start from the current weights, if a table exists yet */
        if (prev_table)
                memcpy(next_table, prev_table, nr_node_ids);
        next_table[na->nid] = weight;
        rcu_assign_pointer(iw_table, next_table);
        mutex_unlock(&iw_table_lock);

        /* wait for readers of the previous table before freeing it */
        synchronize_rcu();
        kfree(prev_table);
        return count;
}

static struct iw_node_attr **node_attrs;

/*
 * Remove one per-node attribute file from @parent and free the attribute
 * together with its name.  A NULL @node_attr is ignored.
 */
static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
                                  struct kobject *parent)
{
        if (node_attr) {
                sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
                kfree(node_attr->kobj_attr.attr.name);
                kfree(node_attr);
        }
}

/*
 * kobj_type release callback for the "weighted_interleave" kobject.
 *
 * Called once the last reference to @wi_kobj has been dropped.  Releases
 * every per-node attribute recorded in node_attrs[] and then frees the
 * kobject, which add_weighted_interleave_group() allocated with kzalloc().
 *
 * The memory has to be freed directly: the reference count is already zero
 * when a release callback runs, so calling kobject_put() here would only
 * underflow the count and the kobject would never be freed.
 */
static void sysfs_wi_release(struct kobject *wi_kobj)
{
        int i;

        for (i = 0; i < nr_node_ids; i++)
                sysfs_wi_node_release(node_attrs[i], wi_kobj);
        kfree(wi_kobj);
}

/*
 * kobj_type of the "weighted_interleave" group: attributes are dispatched
 * through the generic kobj_sysfs_ops, teardown goes to sysfs_wi_release().
 */
static const struct kobj_type wi_ktype = {
        .sysfs_ops = &kobj_sysfs_ops,
        .release = sysfs_wi_release,
};

static int add_weight_node(int nid, struct kobject *wi_kobj)
{
        struct iw_node_attr *node_attr;
        char *name;

        node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
        if (!node_attr)
                return -ENOMEM;

        name = kasprintf(GFP_KERNEL, "node%d", nid);
        if (!name) {
                kfree(node_attr);
                return -ENOMEM;
        }

        sysfs_attr_init(&node_attr->kobj_attr.attr);
        node_attr->kobj_attr.attr.name = name;
        node_attr->kobj_attr.attr.mode = 0644;
        node_attr->kobj_attr.show = node_show;
        node_attr->kobj_attr.store = node_store;
        node_attr->nid = nid;

        if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
                kfree(node_attr->kobj_attr.attr.name);
                kfree(node_attr);
                pr_err("failed to add attribute to weighted_interleave\n");
                return -ENOMEM;
        }

        node_attrs[nid] = node_attr;
        return 0;
}

/*
 * Create the "weighted_interleave" kobject under @root_kobj and add one
 * attribute per possible node.
 *
 * Returns -ENOMEM or the kobject_init_and_add() error if the group itself
 * cannot be set up.  A failure while adding a node attribute is logged and
 * the group's reference is dropped, but 0 is still returned to the caller.
 */
static int add_weighted_interleave_group(struct kobject *root_kobj)
{
        struct kobject *wi_kobj = kzalloc(sizeof(*wi_kobj), GFP_KERNEL);
        int nid, err;

        if (!wi_kobj)
                return -ENOMEM;

        err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
                                   "weighted_interleave");
        if (err) {
                kfree(wi_kobj);
                return err;
        }

        for_each_node_state(nid, N_POSSIBLE) {
                if (!add_weight_node(nid, wi_kobj))
                        continue;

                /* stop at the first node that fails and drop the group */
                pr_err("failed to add sysfs [node%d]\n", nid);
                kobject_put(wi_kobj);
                break;
        }
        return 0;
}

/*
 * kobj_type release callback for the "mempolicy" kobject.
 * Unpublishes iw_table under iw_table_lock, waits for an RCU grace period,
 * then frees the old table, the node_attrs array and the kobject itself.
 */
static void mempolicy_kobj_release(struct kobject *kobj)
{
        u8 *old;

        mutex_lock(&iw_table_lock);
        old = rcu_dereference_protected(iw_table,
                                        lockdep_is_held(&iw_table_lock));
        rcu_assign_pointer(iw_table, NULL);
        mutex_unlock(&iw_table_lock);
        /* the old table may only be freed after the grace period */
        synchronize_rcu();
        kfree(old);
        kfree(node_attrs);
        kfree(kobj);
}

/* kobj_type of the "mempolicy" kobject; it only provides a release hook. */
static const struct kobj_type mempolicy_ktype = {
        .release = mempolicy_kobj_release
};

/*
 * Create the "mempolicy" kobject under mm_kobj together with its
 * weighted_interleave group.  Runs as a late initcall.
 *
 * Returns 0 on success or a negative errno.  Ownership rule for the error
 * paths: until kobject_init_and_add() has been called the two allocations
 * are freed by hand; from that call on (even when it fails) the kobject must
 * be dropped with kobject_put(), and mempolicy_kobj_release() then frees
 * both node_attrs and the kobject.
 */
static int __init mempolicy_sysfs_init(void)
{
        int err;
        static struct kobject *mempolicy_kobj;

        mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
        if (!mempolicy_kobj) {
                err = -ENOMEM;
                goto err_out;
        }

        node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
                             GFP_KERNEL);
        if (!node_attrs) {
                err = -ENOMEM;
                goto mempol_out;
        }

        err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
                                   "mempolicy");
        if (err) {
                /*
                 * A failed kobject_init_and_add() still leaves an
                 * initialised kobject behind; a plain kfree() would leak
                 * what it holds (e.g. its name).  The release callback
                 * frees node_attrs and the kobject.
                 */
                kobject_put(mempolicy_kobj);
                goto err_out;
        }

        err = add_weighted_interleave_group(mempolicy_kobj);
        if (err) {
                pr_err("mempolicy sysfs structure failed to initialize\n");
                kobject_put(mempolicy_kobj);
                return err;
        }

        return 0;

mempol_out:
        kfree(mempolicy_kobj);
err_out:
        pr_err("failed to add mempolicy kobject to the system\n");
        return err;
}

late_initcall(mempolicy_sysfs_init);
#endif /* CONFIG_SYSFS */






























































































































































































































































































































































































































































    1 






































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
// SPDX-License-Identifier: GPL-2.0+
/*
 * ext4_jbd2.h
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
 *
 * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
 *
 * Ext4-specific journaling extensions.
 */

#ifndef _EXT4_JBD2_H
#define _EXT4_JBD2_H

#include <linux/fs.h>
#include <linux/jbd2.h>
#include "ext4.h"

/* The s_journal field of the ext4 superblock info that @inode belongs to. */
#define EXT4_JOURNAL(inode)        (EXT4_SB((inode)->i_sb)->s_journal)

/* Define the number of blocks we need to account to a transaction to
 * modify one block of data.
 *
 * We may have to touch one inode, one bitmap buffer, up to three
 * indirection blocks, the group and superblock summaries, and the data
 * block to complete the transaction (1 + 1 + 3 + 2 + 1 = 8 blocks).
 *
 * For extents-enabled fs we may have to allocate and modify up to
 * 5 levels of tree, data block (for each of these we need bitmap + group
 * summaries), root which is stored in the inode, sb
 * (6 * 3 + 1 + 1 = 20 blocks).
 */

#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)                                \
        (ext4_has_feature_extents(sb) ? 20U : 8U)

/* Extended attribute operations touch at most two data buffers,
 * two bitmap buffers, and two group summaries, in addition to the inode
 * and the superblock, which are already accounted for. */

#define EXT4_XATTR_TRANS_BLOCKS                6U

/* Define the minimum size for a transaction which modifies data.  This
 * needs to take into account the fact that we may end up modifying two
 * quota files too (one for the group, one for the user quota).  The
 * superblock only gets updated once, of course, so don't bother
 * counting that again for the quota updates. */

#define EXT4_DATA_TRANS_BLOCKS(sb)        (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
                                         EXT4_XATTR_TRANS_BLOCKS - 2 + \
                                         EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))

/*
 * Define the number of metadata blocks we need to account to modify data.
 *
 * This includes the super block, inode block, quota blocks and xattr blocks.
 */
#define EXT4_META_TRANS_BLOCKS(sb)        (EXT4_XATTR_TRANS_BLOCKS + \
                                        EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))

/* Define an arbitrary limit for the amount of data we will anticipate
 * writing to any given transaction.  For unbounded transactions such as
 * write(2) and truncate(2) we can write more than this, but we always
 * start off at the maximum transaction size and grow the transaction
 * optimistically as we go. */

#define EXT4_MAX_TRANS_DATA                64U

/* We break up a large truncate or write transaction once the handle's
 * buffer credits get this low: we then need either to extend the
 * transaction or to start a new one.  Reserve enough space here for
 * inode, bitmap, superblock, group and indirection updates for at least
 * one block, plus two quota updates.  Quota allocations are not
 * needed. */

#define EXT4_RESERVE_TRANS_BLOCKS        12U

/*
 * Number of credits needed if we need to insert an entry into a
 * directory.  For each new index block, we need 4 blocks (old index
 * block, new index block, bitmap block, bg summary).  For normal
 * htree directories there are 2 levels; if the largedir feature is
 * enabled it's 3 levels (3 * 4 = 12 blocks).
 */
#define EXT4_INDEX_EXTRA_TRANS_BLOCKS        12U

/*
 * Per-quota-type credit estimates.  Each macro evaluates to 0 when the
 * filesystem is not quota capable, and they are all the constant 0 when the
 * kernel is built without CONFIG_QUOTA.
 */
#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
 * allocated so we need to update only data block */
#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((ext4_quota_capable(sb)) ? 1 : 0)
/* Amount of blocks needed for quota insert/delete - we do some block writes
 * but inode, sb and group updates are done only once */
#define EXT4_QUOTA_INIT_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\
                (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
                 +3+DQUOT_INIT_REWRITE) : 0)

#define EXT4_QUOTA_DEL_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\
                (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
                 +3+DQUOT_DEL_REWRITE) : 0)
#else
#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
#endif
/* The same estimates scaled by EXT4_MAXQUOTAS, the number of quota types */
#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))

/*
 * Ext4 handle operation types -- for logging purposes.
 *
 * These are the values passed as the "type" argument of the
 * ext4_journal_start*() helpers.  EXT4_HT_MAX is the number of defined
 * types, not a type itself.
 */
#define EXT4_HT_MISC             0
#define EXT4_HT_INODE            1
#define EXT4_HT_WRITE_PAGE       2
#define EXT4_HT_MAP_BLOCKS       3
#define EXT4_HT_DIR              4
#define EXT4_HT_TRUNCATE         5
#define EXT4_HT_QUOTA            6
#define EXT4_HT_RESIZE           7
#define EXT4_HT_MIGRATE          8
#define EXT4_HT_MOVE_EXTENTS     9
#define EXT4_HT_XATTR           10
#define EXT4_HT_EXT_CONVERT     11
#define EXT4_HT_MAX             12

/**
 *   struct ext4_journal_cb_entry - Base structure for callback information.
 *
 *   This struct is a 'seed' structure for use with your own callback
 *   structs. If you are using callbacks you must allocate one of these
 *   or another struct of your own definition which has this struct
 *   as its first element and pass it to ext4_journal_callback_add().
 */
struct ext4_journal_cb_entry {
        /* list information for other callbacks attached to the same handle */
        struct list_head jce_list;

        /*  Function to call with this callback structure */
        void (*jce_func)(struct super_block *sb,
                         struct ext4_journal_cb_entry *jce, int error);

        /* user data goes here */
};

/**
 * ext4_journal_callback_add: add a function to call after transaction commit
 * @handle: active journal transaction handle to register callback on
 * @func: callback function to call after the transaction has committed:
 *        @sb: superblock of current filesystem for transaction
 *        @jce: returned journal callback data
 *        @rc: journal state at commit (0 = transaction committed properly)
 * @jce: journal callback data (internal and function private data struct)
 *
 * The registered function will be called in the context of the journal thread
 * after the transaction for which the handle was created has completed.
 *
 * No locks are held when the callback function is called, so it is safe to
 * call blocking functions from within the callback, but the callback should
 * not block or run for too long, or the filesystem will be blocked waiting for
 * the next transaction to commit. No journaling functions can be used, or
 * there is a risk of deadlock.
 *
 * There is no guaranteed calling order of multiple registered callbacks on
 * the same transaction.
 */
static inline void _ext4_journal_callback_add(handle_t *handle,
                        struct ext4_journal_cb_entry *jce)
{
        /*
         * Add the jce to transaction's private list.  No locking is done
         * here; ext4_journal_callback_add() makes this call with s_md_lock
         * held.
         */
        list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
}

static inline void ext4_journal_callback_add(handle_t *handle,
                        void (*func)(struct super_block *sb,
                                     struct ext4_journal_cb_entry *jce,
                                     int rc),
                        struct ext4_journal_cb_entry *jce)
{
        struct super_block *sb = handle->h_transaction->t_journal->j_private;
        struct ext4_sb_info *sb_info = EXT4_SB(sb);

        /* Record the callback, then queue the entry under s_md_lock */
        jce->jce_func = func;

        spin_lock(&sb_info->s_md_lock);
        _ext4_journal_callback_add(handle, jce);
        spin_unlock(&sb_info->s_md_lock);
}


/**
 * ext4_journal_callback_try_del: delete a registered callback
 * @handle: active journal transaction handle on which callback was registered
 * @jce: registered journal callback entry to unregister
 *
 * Unlinks @jce under s_md_lock and leaves its list head reinitialised.
 *
 * Return true if @jce was still linked on a list and has been removed,
 * false if its list head was already empty.
 */
static inline bool ext4_journal_callback_try_del(handle_t *handle,
                                             struct ext4_journal_cb_entry *jce)
{
        struct super_block *sb = handle->h_transaction->t_journal->j_private;
        struct ext4_sb_info *sb_info = EXT4_SB(sb);
        bool was_linked;

        spin_lock(&sb_info->s_md_lock);
        was_linked = !list_empty(&jce->jce_list);
        list_del_init(&jce->jce_list);
        spin_unlock(&sb_info->s_md_lock);

        return was_linked;
}

int
ext4_mark_iloc_dirty(handle_t *handle,
                     struct inode *inode,
                     struct ext4_iloc *iloc);

/*
 * On success, we end up with an outstanding reference count against
 * iloc->bh.  This _must_ be cleaned up later.
 */

int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
                        struct ext4_iloc *iloc);

/* Passes the calling function name and line on to __ext4_mark_inode_dirty() */
#define ext4_mark_inode_dirty(__h, __i)                                        \
                __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__)
int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
                                const char *func, unsigned int line);

int ext4_expand_extra_isize(struct inode *inode,
                            unsigned int new_extra_isize,
                            struct ext4_iloc *iloc);
/*
 * Wrapper functions with which ext4 calls into JBD.
 */
int __ext4_journal_get_write_access(const char *where, unsigned int line,
                                    handle_t *handle, struct super_block *sb,
                                    struct buffer_head *bh,
                                    enum ext4_journal_trigger_type trigger_type);

int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
                  int is_metadata, struct inode *inode,
                  struct buffer_head *bh, ext4_fsblk_t blocknr);

int __ext4_journal_get_create_access(const char *where, unsigned int line,
                                handle_t *handle, struct super_block *sb,
                                struct buffer_head *bh,
                                enum ext4_journal_trigger_type trigger_type);

int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
                                 handle_t *handle, struct inode *inode,
                                 struct buffer_head *bh);

/*
 * Convenience macros for the __ext4_*() wrappers declared above: each one
 * supplies the caller's __func__ and __LINE__ as the "where"/"line"
 * arguments and forwards the remaining arguments unchanged.
 */
#define ext4_journal_get_write_access(handle, sb, bh, trigger_type) \
        __ext4_journal_get_write_access(__func__, __LINE__, (handle), (sb), \
                                        (bh), (trigger_type))
#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
        __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
                      (bh), (block_nr))
#define ext4_journal_get_create_access(handle, sb, bh, trigger_type) \
        __ext4_journal_get_create_access(__func__, __LINE__, (handle), (sb), \
                                         (bh), (trigger_type))
#define ext4_handle_dirty_metadata(handle, inode, bh) \
        __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
                                     (bh))

handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb,
                                  unsigned int line, int type, int blocks,
                                  int rsv_blocks, int revoke_creds);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);

#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)

/*
 * Tell whether @handle refers to a real journal handle.
 *
 * A handle whose pointer value is below EXT4_NOJOURNAL_MAX_REF_COUNT is
 * reported as not valid (0); anything else is reported as valid (1).
 *
 * Note:  Do not use this for NULL handles.  This is only to determine if
 * a properly allocated handle is using a journal or not.
 */
static inline int ext4_handle_valid(handle_t *handle)
{
        return (unsigned long)handle >= EXT4_NOJOURNAL_MAX_REF_COUNT;
}

/*
 * Set the h_sync flag on @handle.  Handles that ext4_handle_valid() rejects
 * are left untouched.
 */
static inline void ext4_handle_sync(handle_t *handle)
{
        if (!ext4_handle_valid(handle))
                return;

        handle->h_sync = 1;
}

/*
 * Return is_handle_aborted(@handle) for a journalled handle, and 0 for a
 * handle that ext4_handle_valid() rejects.
 */
static inline int ext4_handle_is_aborted(handle_t *handle)
{
        return ext4_handle_valid(handle) ? is_handle_aborted(handle) : 0;
}

/*
 * Revoke credits to reserve when freeing @blocks metadata blocks on @sb.
 *
 * Freeing a single metadata block may end up freeing a whole cluster, so
 * the count is scaled by the filesystem's cluster ratio.
 */
static inline int ext4_free_metadata_revoke_credits(struct super_block *sb,
                                                    int blocks)
{
        return blocks * EXT4_SB(sb)->s_cluster_ratio;
}

/*
 * Default revoke-credit budget for a transaction on @sb: the amount
 * ext4_free_metadata_revoke_credits() computes for 8 metadata blocks.
 */
static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
{
        const int default_blocks = 8;

        return ext4_free_metadata_revoke_credits(sb, default_blocks);
}

/*
 * Start a handle on superblock @sb without an inode (NULL is passed as the
 * inode).  No reserved blocks are requested and the revoke budget is
 * ext4_trans_default_revoke_credits(sb).  Note that @sb is expanded twice.
 */
#define ext4_journal_start_sb(sb, type, nblocks)                        \
        __ext4_journal_start_sb(NULL, (sb), __LINE__, (type), (nblocks), 0,\
                                ext4_trans_default_revoke_credits(sb))

/*
 * Start a handle for @inode with @nblocks credits, no reserved blocks and
 * the default revoke budget of the inode's superblock.  @inode is expanded
 * twice.
 */
#define ext4_journal_start(inode, type, nblocks)                        \
        __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0,        \
                             ext4_trans_default_revoke_credits((inode)->i_sb))

/*
 * Same as ext4_journal_start(), but additionally passes @rsv_blocks as the
 * reserved-block count.
 */
#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\
        __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\
                             ext4_trans_default_revoke_credits((inode)->i_sb))

/*
 * Same as ext4_journal_start(), but the caller chooses the revoke budget
 * (@revoke_creds) instead of using the default.
 */
#define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \
        __ext4_journal_start((inode), __LINE__, (type), (blocks), 0,        \
                             (revoke_creds))

/*
 * Inode-based front end for __ext4_journal_start_sb(): takes the superblock
 * from @inode->i_sb and forwards every other argument unchanged.  @inode is
 * dereferenced, so it must not be NULL.
 */
static inline handle_t *__ext4_journal_start(struct inode *inode,
                                             unsigned int line, int type,
                                             int blocks, int rsv_blocks,
                                             int revoke_creds)
{
        struct super_block *sb = inode->i_sb;

        return __ext4_journal_start_sb(inode, sb, line, type, blocks,
                                       rsv_blocks, revoke_creds);
}

/*
 * Stop @handle via __ext4_journal_stop(), recording the caller's __func__
 * and __LINE__.
 */
#define ext4_journal_stop(handle) \
        __ext4_journal_stop(__func__, __LINE__, (handle))

/*
 * Call-site wrapper for __ext4_journal_start_reserved() that passes the
 * caller's __LINE__ as @line.
 */
#define ext4_journal_start_reserved(handle, type) \
        __ext4_journal_start_reserved((handle), __LINE__, (type))

/*
 * Worker behind ext4_journal_start_reserved(); only the declaration is
 * visible here.  Takes a handle and returns a handle pointer.
 */
handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
                                        int type);

/* Thin ext4 alias for journal_current_handle(); returns its result as is. */
static inline handle_t *ext4_journal_current_handle(void)
{
        handle_t *current_handle = journal_current_handle();

        return current_handle;
}

/*
 * Extend @handle through jbd2_journal_extend() with @nblocks and @revoke.
 * For a handle that ext4_handle_valid() rejects nothing is done and 0 is
 * returned.
 */
static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2_journal_extend(handle, nblocks, revoke);
}

/*
 * Restart @handle through jbd2__journal_restart() with @nblocks, @revoke and
 * a GFP_NOFS allocation mask.  For a handle that ext4_handle_valid() rejects
 * nothing is done and 0 is returned.
 */
static inline int ext4_journal_restart(handle_t *handle, int nblocks,
                                       int revoke)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS);
}

/*
 * First step of ext4_journal_ensure_credits_fn().  As used by that macro, a
 * result <= 0 is returned to the caller directly, while a positive result
 * makes the macro go on to restart the transaction.  Only the declaration
 * is visible here.
 */
int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
                                  int extend_cred, int revoke_cred);


/*
 * Ensure @handle has at least @check_cred credits available. If not,
 * transaction will be extended or restarted to contain at least @extend_cred
 * credits. Before restarting transaction @fn is executed to allow for cleanup
 * before the transaction is restarted.
 *
 * @fn is an expression, evaluated only when __ext4_journal_ensure_credits()
 * returns a positive value. If @fn evaluates to a negative value, that value
 * is returned and the transaction is not restarted.
 *
 * The return value is < 0 in case of error, 0 in case the handle has enough
 * credits or transaction extension succeeded, 1 in case transaction had to be
 * restarted.
 *
 * This is a GNU statement expression using a local label. @handle,
 * @extend_cred and @revoke_cred are each expanded twice (once for the check
 * and once for the restart), so arguments with side effects should be
 * avoided. The macro declares a local named "err"; an argument expression
 * that refers to a caller variable of that name would see the macro's copy.
 */
#define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred,        \
                                       revoke_cred, fn) \
({                                                                        \
        __label__ __ensure_end;                                                \
        int err = __ext4_journal_ensure_credits((handle), (check_cred),        \
                                        (extend_cred), (revoke_cred));        \
                                                                        \
        if (err <= 0)                                                        \
                goto __ensure_end;                                        \
        err = (fn);                                                        \
        if (err < 0)                                                        \
                goto __ensure_end;                                        \
        err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \
        if (err == 0)                                                        \
                err = 1;                                                \
__ensure_end:                                                                \
        err;                                                                \
})

/*
 * Make sure @handle has at least @credits credits available, restarting the
 * transaction if that is what it takes.
 *
 * @credits is used both as the amount to check for and as the amount to
 * obtain when extending or restarting; @revoke_creds is forwarded as the
 * revoke-record budget.  Nothing is run before a restart (the cleanup
 * expression is the constant 0).
 *
 * Requesting room for ext4_trans_default_revoke_credits(sb) revoke records
 * is the common choice for @revoke_creds, since freeing one or two blocks is
 * a very common pattern and asking for this is very cheap.
 *
 * Returns < 0 on error, 0 if the handle already had enough credits or was
 * extended, and 1 if the transaction was restarted.
 */
static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
                                              int revoke_creds)
{
        int ret;

        ret = ext4_journal_ensure_credits_fn(handle, credits, credits,
                                             revoke_creds, 0);
        return ret;
}

/*
 * Return jbd2_journal_blocks_per_page(@inode) when EXT4_JOURNAL(@inode) is
 * non-NULL, and 0 when it is NULL.
 */
static inline int ext4_journal_blocks_per_page(struct inode *inode)
{
        if (!EXT4_JOURNAL(inode))
                return 0;

        return jbd2_journal_blocks_per_page(inode);
}

/*
 * Force a commit on @journal via jbd2_journal_force_commit().  A NULL
 * @journal is accepted and yields 0 without doing anything.
 */
static inline int ext4_journal_force_commit(journal_t *journal)
{
        return journal ? jbd2_journal_force_commit(journal) : 0;
}

/*
 * Pass the byte range [@start_byte, @start_byte + @length) of @inode to
 * jbd2_journal_inode_ranged_write() using the inode's jinode.  Returns that
 * call's result, or 0 without doing anything when ext4_handle_valid()
 * rejects @handle.
 */
static inline int ext4_jbd2_inode_add_write(handle_t *handle,
                struct inode *inode, loff_t start_byte, loff_t length)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2_journal_inode_ranged_write(handle, EXT4_I(inode)->jinode,
                                               start_byte, length);
}

/*
 * Pass the byte range [@start_byte, @start_byte + @length) of @inode to
 * jbd2_journal_inode_ranged_wait() using the inode's jinode.  Returns that
 * call's result, or 0 without doing anything when ext4_handle_valid()
 * rejects @handle.
 */
static inline int ext4_jbd2_inode_add_wait(handle_t *handle,
                struct inode *inode, loff_t start_byte, loff_t length)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2_journal_inode_ranged_wait(handle, EXT4_I(inode)->jinode,
                                              start_byte, length);
}

/*
 * Record the transaction id of @handle's transaction in @inode's ext4 info:
 * always in i_sync_tid, and also in i_datasync_tid when @datasync is
 * nonzero.  Nothing is recorded when ext4_handle_valid() rejects @handle or
 * the handle has been aborted.
 */
static inline void ext4_update_inode_fsync_trans(handle_t *handle,
                                                 struct inode *inode,
                                                 int datasync)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!ext4_handle_valid(handle) || is_handle_aborted(handle))
                return;

        ei->i_sync_tid = handle->h_transaction->t_tid;
        if (!datasync)
                return;
        ei->i_datasync_tid = handle->h_transaction->t_tid;
}

/* super.c */
/* Declared here, implemented in super.c; takes the superblock to act on. */
int ext4_force_commit(struct super_block *sb);

/*
 * Ext4 inode journal modes
 *
 * Each mode is a distinct bit.  The ext4_should_*_data() helpers test the
 * value returned by ext4_inode_journal_mode() against one of these bits.
 */
#define EXT4_INODE_JOURNAL_DATA_MODE        0x01 /* journal data mode */
#define EXT4_INODE_ORDERED_DATA_MODE        0x02 /* ordered data mode */
#define EXT4_INODE_WRITEBACK_DATA_MODE        0x04 /* writeback data mode */

/* Returns the journal mode of @inode expressed with the bits above. */
int ext4_inode_journal_mode(struct inode *inode);

/*
 * Nonzero when ext4_inode_journal_mode() reports the journal-data bit for
 * @inode.  The result is the masked bit itself, not a normalized 0/1.
 */
static inline int ext4_should_journal_data(struct inode *inode)
{
        int mode = ext4_inode_journal_mode(inode);

        return mode & EXT4_INODE_JOURNAL_DATA_MODE;
}

/*
 * Nonzero when ext4_inode_journal_mode() reports the ordered-data bit for
 * @inode.  The result is the masked bit itself, not a normalized 0/1.
 */
static inline int ext4_should_order_data(struct inode *inode)
{
        int mode = ext4_inode_journal_mode(inode);

        return mode & EXT4_INODE_ORDERED_DATA_MODE;
}

/*
 * Nonzero when ext4_inode_journal_mode() reports the writeback-data bit for
 * @inode.  The result is the masked bit itself, not a normalized 0/1.
 */
static inline int ext4_should_writeback_data(struct inode *inode)
{
        int mode = ext4_inode_journal_mode(inode);

        return mode & EXT4_INODE_WRITEBACK_DATA_MODE;
}

/*
 * Revoke credits to reserve when freeing @blocks data blocks of @inode.
 *
 * Returns 0 when the filesystem's DATA_FLAGS mount option equals
 * EXT4_MOUNT_JOURNAL_DATA, and also when ext4_should_journal_data() is
 * false for @inode.  Otherwise returns @blocks plus an allowance for
 * partial clusters at both ends of the extent.
 */
static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
{
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
            !ext4_should_journal_data(inode))
                return 0;

        /*
         * The data blocks of one extent are contiguous, so only the partial
         * clusters at the two extent boundaries need extra accounting.
         */
        return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1);
}

/*
 * Decide whether @inode may use the dioread_nolock code paths, which make
 * it safe to skip taking i_rwsem for direct I/O reads.
 *
 * This only works for extent-based files, and it does not work when data
 * journaling is enabled: the dioread_nolock code uses b_private to pass
 * information back to the I/O completion handler, which conflicts with
 * jbd's own use of b_private.
 *
 * Returns 1 when every condition below holds, 0 otherwise.
 */
static inline int ext4_should_dioread_nolock(struct inode *inode)
{
        return test_opt(inode->i_sb, DIOREAD_NOLOCK) &&
               S_ISREG(inode->i_mode) &&
               ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
               !ext4_should_journal_data(inode) &&
               /* temporary fix to prevent generic/422 test failures */
               test_opt(inode->i_sb, DELALLOC);
}

#endif        /* _EXT4_JBD2_H */































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 



    2 
    2 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 









    2 







    2 




    2 





    2 






    2 

    2 



    1 


    2 

























































































    1 










    2 











    2 











    2 




    2 












    2 






    1 


    2 
    2 







    1 
    2 

    2 

    1 























































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
// SPDX-License-Identifier: GPL-2.0-only
/*
 *        linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <linux/migrate.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for try_to_free_buffers */

#include <asm/mman.h>

#include "swap.h"

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_rwsem                (truncate_pagecache)
 *    ->private_lock                (__free_pte->block_dirty_folio)
 *      ->swap_lock                (exclusive_swap_page, others)
 *        ->i_pages lock
 *
 *  ->i_rwsem
 *    ->invalidate_lock                (acquired by fs in truncate path)
 *      ->i_mmap_rwsem                (truncate->unmap_mapping_range)
 *
 *  ->mmap_lock
 *    ->i_mmap_rwsem
 *      ->page_table_lock or pte_lock        (various, mainly in memory.c)
 *        ->i_pages lock        (arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_lock
 *    ->invalidate_lock                (filemap_fault)
 *      ->lock_page                (filemap_fault, access_process_vm)
 *
 *  ->i_rwsem                        (generic_perform_write)
 *    ->mmap_lock                (fault_in_readable->do_page_fault)
 *
 *  bdi->wb.list_lock
 *    sb_lock                        (fs/fs-writeback.c)
 *    ->i_pages lock                (__sync_single_inode)
 *
 *  ->i_mmap_rwsem
 *    ->anon_vma.lock                (vma_merge)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock        (anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock                (try_to_unmap_one)
 *    ->private_lock                (try_to_unmap_one)
 *    ->i_pages lock                (try_to_unmap_one)
 *    ->lruvec->lru_lock        (follow_page->mark_page_accessed)
 *    ->lruvec->lru_lock        (check_pte_range->isolate_lru_page)
 *    ->private_lock                (folio_remove_rmap_pte->set_page_dirty)
 *    ->i_pages lock                (folio_remove_rmap_pte->set_page_dirty)
 *    bdi.wb->list_lock                (folio_remove_rmap_pte->set_page_dirty)
 *    ->inode->i_lock                (folio_remove_rmap_pte->set_page_dirty)
 *    ->memcg->move_lock        (folio_remove_rmap_pte->folio_memcg_lock)
 *    bdi.wb->list_lock                (zap_pte_range->set_page_dirty)
 *    ->inode->i_lock                (zap_pte_range->set_page_dirty)
 *    ->private_lock                (zap_pte_range->block_dirty_folio)
 */

/*
 * Prepare @xas for modifying @mapping's page cache.  Unless the mapping is
 * a DAX or shmem mapping, workingset_update_node is installed as the xarray
 * update callback and shadow_nodes as the LRU list used by the operation.
 */
static void mapping_set_update(struct xa_state *xas,
                struct address_space *mapping)
{
        if (!dax_mapping(mapping) && !shmem_mapping(mapping)) {
                xas_set_update(xas, workingset_update_node);
                xas_set_lru(xas, &shadow_nodes);
        }
}

/*
 * Replace @folio's entry in @mapping->i_pages with @shadow and detach the
 * folio from the mapping.  The folio must be locked (asserted with
 * VM_BUG_ON_FOLIO).  mapping->nrpages is reduced by the folio's page count.
 */
static void page_cache_delete(struct address_space *mapping,
                                   struct folio *folio, void *shadow)
{
        XA_STATE(xas, &mapping->i_pages, folio->index);
        const long nr_pages = folio_nr_pages(folio);

        mapping_set_update(&xas, mapping);
        /* Operate on the folio's whole order-sized range of indices. */
        xas_set_order(&xas, folio->index, folio_order(folio));

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        xas_store(&xas, shadow);
        xas_init_marks(&xas);

        /* folio->index is deliberately left set: truncation lookup relies upon it */
        folio->mapping = NULL;
        mapping->nrpages -= nr_pages;
}

/*
 * Undo the statistics accounting for @folio as it leaves @mapping's page
 * cache.
 *
 * A folio that is still mapped is reported as a bug first: VM_BUG_ON_FOLIO
 * when CONFIG_DEBUG_VM is enabled, otherwise an alert, page dump, stack dump
 * and taint.  Then, except for hugetlb folios, NR_FILE_PAGES and whichever
 * of the shmem/THP counters apply are decremented by the folio's page
 * count, and dirty accounting is corrected if the folio is still dirty on a
 * mapping that can write back.
 */
static void filemap_unaccount_folio(struct address_space *mapping,
                struct folio *folio)
{
        long nr;

        VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
        /* Without CONFIG_DEBUG_VM the check above is not fatal: report here. */
        if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
                pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
                         current->comm, folio_pfn(folio));
                dump_page(&folio->page, "still mapped when deleted");
                dump_stack();
                add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

                /* Recovery is only attempted for small folios of an exiting mapping. */
                if (mapping_exiting(mapping) && !folio_test_large(folio)) {
                        int mapcount = folio_mapcount(folio);

                        if (folio_ref_count(folio) >= mapcount + 2) {
                                /*
                                 * All vmas have already been torn down, so it's
                                 * a good bet that actually the page is unmapped
                                 * and we'd rather not leak it: if we're wrong,
                                 * another bad page check should catch it later.
                                 */
                                page_mapcount_reset(&folio->page);
                                folio_ref_sub(folio, mapcount);
                        }
                }
        }

        /* hugetlb folios do not participate in page cache accounting. */
        if (folio_test_hugetlb(folio))
                return;

        nr = folio_nr_pages(folio);

        /* Every counter below is adjusted by the folio's full page count. */
        __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
        if (folio_test_swapbacked(folio)) {
                __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
                if (folio_test_pmd_mappable(folio))
                        __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
        } else if (folio_test_pmd_mappable(folio)) {
                __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
                filemap_nr_thps_dec(mapping);
        }

        /*
         * At this point folio must be either written or cleaned by
         * truncate.  Dirty folio here signals a bug and loss of
         * unwritten data - on ordinary filesystems.
         *
         * But it's harmless on in-memory filesystems like tmpfs; and can
         * occur when a driver which did get_user_pages() sets page dirty
         * before putting it, while the inode is being finally evicted.
         *
         * Below fixes dirty accounting after removing the folio entirely
         * but leaves the dirty flag set: it has no effect for truncated
         * folio and anyway will be cleared before returning folio to
         * buddy allocator.
         */
        if (WARN_ON_ONCE(folio_test_dirty(folio) &&
                         mapping_can_writeback(mapping)))
                folio_account_cleaned(folio, inode_to_wb(mapping->host));
}

/*
 * Delete a folio from the page cache.  Caller has to make sure the folio
 * is locked and that nobody else uses it - or that usage is safe.  The
 * caller must hold the i_pages lock.
 *
 * The folio is traced, unaccounted and then replaced in the mapping by
 * @shadow (filemap_remove_folio() passes NULL).  No folio reference is
 * dropped here; filemap_remove_folio() calls filemap_free_folio() for that
 * after releasing its locks.
 */
void __filemap_remove_folio(struct folio *folio, void *shadow)
{
        /* Sample the mapping now: page_cache_delete() clears folio->mapping. */
        struct address_space *mapping = folio->mapping;

        trace_mm_filemap_delete_from_page_cache(folio);
        filemap_unaccount_folio(mapping, folio);
        page_cache_delete(mapping, folio, shadow);
}

/*
 * Release the page cache's references to @folio after it has been removed
 * from @mapping.  The mapping's ->free_folio() callback, if any, is invoked
 * first.  A large folio then has one reference per page dropped, any other
 * folio a single reference.
 */
void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
        void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
        int refs;

        if (free_folio)
                free_folio(folio);

        refs = folio_test_large(folio) ? folio_nr_pages(folio) : 1;
        folio_put_refs(folio, refs);
}

/**
 * filemap_remove_folio - Remove folio from page cache.
 * @folio: The folio.
 *
 * This must be called only on folios that are locked and have been
 * verified to be in the page cache.  It will never put the folio into
 * the free list because the caller has a reference on the page.
 */
void filemap_remove_folio(struct folio *folio)
{
        struct address_space *mapping = folio->mapping;

        BUG_ON(!folio_test_locked(folio));
        spin_lock(&mapping->host->i_lock);
        xa_lock_irq(&mapping->i_pages);
        __filemap_remove_folio(folio, NULL);
        xa_unlock_irq(&mapping->i_pages);
        if (mapping_shrinkable(mapping))
                inode_add_lru(mapping->host);
        spin_unlock(&mapping->host->i_lock);

        filemap_free_folio(mapping, folio);
}

/*
 * page_cache_delete_batch - delete several folios from page cache
 * @mapping: the mapping to which folios belong
 * @fbatch: batch of folios to delete
 *
 * The function walks over mapping->i_pages and removes folios passed in
 * @fbatch from the mapping. The function expects @fbatch to be sorted
 * by page index and is optimised for it to be dense.
 * It tolerates holes in @fbatch (mapping entries at those indices are not
 * modified).
 *
 * @fbatch must not be empty: the walk starts at the index of its first
 * folio.  mapping->nrpages is reduced by the total page count of the
 * folios removed.
 *
 * The function expects the i_pages lock to be held.
 */
static void page_cache_delete_batch(struct address_space *mapping,
                             struct folio_batch *fbatch)
{
        XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
        long total_pages = 0;
        int i = 0;      /* index of the next batch folio still to be removed */
        struct folio *folio;

        mapping_set_update(&xas, mapping);
        xas_for_each(&xas, folio, ULONG_MAX) {
                /* Every folio in the batch has been dealt with. */
                if (i >= folio_batch_count(fbatch))
                        break;

                /* A swap/dax/shadow entry got inserted? Skip it. */
                if (xa_is_value(folio))
                        continue;
                /*
                 * A page got inserted in our range? Skip it. We have our
                 * pages locked so they are protected from being removed.
                 * If we see a page whose index is higher than ours, it
                 * means our page has been removed, which shouldn't be
                 * possible because we're holding the PageLock.
                 */
                if (folio != fbatch->folios[i]) {
                        VM_BUG_ON_FOLIO(folio->index >
                                        fbatch->folios[i]->index, folio);
                        continue;
                }

                WARN_ON_ONCE(!folio_test_locked(folio));

                folio->mapping = NULL;
                /* Leave folio->index set: truncation lookup relies on it */

                i++;
                xas_store(&xas, NULL);
                total_pages += folio_nr_pages(folio);
        }
        mapping->nrpages -= total_pages;
}

void delete_from_page_cache_batch(struct address_space *mapping,
                                  struct folio_batch *fbatch)
{
        int i;

        if (!folio_batch_count(fbatch))
                return;

        spin_lock(&mapping->host->i_lock);
        xa_lock_irq(&mapping->i_pages);
        for (i = 0; i < folio_batch_count(fbatch); i++) {
                struct folio *folio = fbatch->folios[i];

                trace_mm_filemap_delete_from_page_cache(folio);
                filemap_unaccount_folio(mapping, folio);
        }
        page_cache_delete_batch(mapping, fbatch);
        xa_unlock_irq(&mapping->i_pages);
        if (mapping_shrinkable(mapping))
                inode_add_lru(mapping->host);
        spin_unlock(&mapping->host->i_lock);

        for (i = 0; i < folio_batch_count(fbatch); i++)
                filemap_free_folio(mapping, fbatch->folios[i]);
}

/*
 * Report and clear the write-error flags recorded on @mapping.
 *
 * Returns -EIO if AS_EIO was set, otherwise -ENOSPC if AS_ENOSPC was set,
 * otherwise 0.  Each flag found set is cleared.
 */
int filemap_check_errors(struct address_space *mapping)
{
        int err = 0;

        /*
         * Each flag is first peeked at with test_bit(), so that
         * test_and_clear_bit() is only issued when the flag looks set.
         */
        if (test_bit(AS_ENOSPC, &mapping->flags)) {
                if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
                        err = -ENOSPC;
        }

        /* AS_EIO is handled last so that -EIO overrides -ENOSPC. */
        if (test_bit(AS_EIO, &mapping->flags)) {
                if (test_and_clear_bit(AS_EIO, &mapping->flags))
                        err = -EIO;
        }

        return err;
}
EXPORT_SYMBOL(filemap_check_errors);

/*
 * Report the write-error flags recorded on @mapping without clearing them.
 * -EIO takes precedence over -ENOSPC; 0 is returned when neither flag is
 * set.
 */
static int filemap_check_and_keep_errors(struct address_space *mapping)
{
        int err = 0;

        if (test_bit(AS_EIO, &mapping->flags))
                err = -EIO;
        else if (test_bit(AS_ENOSPC, &mapping->flags))
                err = -ENOSPC;

        return err;
}

/**
 * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
 * @mapping:        address space structure to write
 * @wbc:        the writeback_control controlling the writeout
 *
 * Call writepages on the mapping using the provided wbc to control the
 * writeout.  Nothing is done, and %0 is returned, when the mapping cannot
 * write back or has no entry tagged dirty.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_fdatawrite_wbc(struct address_space *mapping,
                           struct writeback_control *wbc)
{
        int ret = 0;

        if (mapping_can_writeback(mapping) &&
            mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
                /* The wbc is attached to the inode only around the writeout. */
                wbc_attach_fdatawrite_inode(wbc, mapping->host);
                ret = do_writepages(mapping, wbc);
                wbc_detach_inode(wbc);
        }

        return ret;
}
EXPORT_SYMBOL(filemap_fdatawrite_wbc);

/**
 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping:        address space structure to write
 * @start:        offset in bytes where the range starts
 * @end:        offset in bytes where the range ends (inclusive)
 * @sync_mode:        writeback sync mode (callers in this file pass
 *                WB_SYNC_ALL or WB_SYNC_NONE)
 *
 * Start writeback against all of a mapping's dirty pages that lie
 * within the byte offsets <start, end> inclusive.
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
                                loff_t end, int sync_mode)
{
        /* No page budget: nr_to_write is LONG_MAX, only the range limits work. */
        struct writeback_control wbc = {
                .range_start = start,
                .range_end = end,
                .nr_to_write = LONG_MAX,
                .sync_mode = sync_mode,
        };

        return filemap_fdatawrite_wbc(mapping, &wbc);
}

/*
 * Start writeback over the whole mapping (byte range 0 to LLONG_MAX) with
 * the given @sync_mode.  Returns whatever __filemap_fdatawrite_range()
 * returns.
 */
static inline int __filemap_fdatawrite(struct address_space *mapping,
        int sync_mode)
{
        return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

/**
 * filemap_fdatawrite - start data-integrity writeback on a whole mapping
 * @mapping:        address space structure to write
 *
 * Starts writeback with WB_SYNC_ALL over the mapping's entire byte range
 * (0 to LLONG_MAX).
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_fdatawrite(struct address_space *mapping)
{
        return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);

/**
 * filemap_fdatawrite_range - start data-integrity writeback on a byte range
 * @mapping:        address space structure to write
 * @start:        offset in bytes where the range starts
 * @end:        offset in bytes where the range ends (inclusive)
 *
 * Same as __filemap_fdatawrite_range() with the sync mode fixed to
 * WB_SYNC_ALL.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
                                loff_t end)
{
        return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping:        target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_flush(struct address_space *mapping)
{
        /* Whole-mapping writeout using WB_SYNC_NONE rather than WB_SYNC_ALL. */
        return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);

/**
 * filemap_range_has_page - check if a page exists in range.
 * @mapping:           address space within which to check
 * @start_byte:        offset in bytes where the range starts
 * @end_byte:          offset in bytes where the range ends (inclusive)
 *
 * Find at least one page in the range supplied, usually used to check if
 * direct writing in this range will trigger a writeback.
 *
 * Return: %true if at least one page exists in the specified range,
 * %false otherwise (including when @end_byte < @start_byte).
 */
bool filemap_range_has_page(struct address_space *mapping,
                           loff_t start_byte, loff_t end_byte)
{
        XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
        const pgoff_t last = end_byte >> PAGE_SHIFT;
        struct folio *folio;

        if (end_byte < start_byte)
                return false;

        rcu_read_lock();
        /*
         * Keep searching while the lookup has to be retried or the entry
         * found is a value entry: shadow entries don't count.
         *
         * The folio is not pinned; the RCU lock is about to be released
         * anyway and it is enough to know a page was here recently.
         */
        do {
                folio = xas_find(&xas, last);
        } while (xas_retry(&xas, folio) || xa_is_value(folio));
        rcu_read_unlock();

        return folio != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);

/*
 * Wait for writeback on every folio tagged PAGECACHE_TAG_WRITEBACK in the
 * pages covering byte offsets @start_byte to @end_byte, clearing each such
 * folio's error flag afterwards.  The mapping's own error state is not
 * examined here; callers check it separately.
 */
static void __filemap_fdatawait_range(struct address_space *mapping,
                                     loff_t start_byte, loff_t end_byte)
{
        const pgoff_t end = end_byte >> PAGE_SHIFT;
        pgoff_t index = start_byte >> PAGE_SHIFT;
        struct folio_batch fbatch;

        folio_batch_init(&fbatch);

        while (index <= end) {
                unsigned int nr, i;

                /* index is passed by reference so the lookup can move it on. */
                nr = filemap_get_folios_tag(mapping, &index, end,
                                PAGECACHE_TAG_WRITEBACK, &fbatch);
                if (!nr)
                        break;

                for (i = 0; i < nr; i++) {
                        folio_wait_writeback(fbatch.folios[i]);
                        folio_clear_error(fbatch.folios[i]);
                }
                folio_batch_release(&fbatch);
                cond_resched();
        }
}

/**
 * filemap_fdatawait_range - wait for writeback to complete
 * @mapping:                address space structure to wait for
 * @start_byte:                offset in bytes where the range starts
 * @end_byte:                offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space
 * in the given range and wait for all of them.  Check error status of
 * the address space and return it.
 *
 * Since the error status of the address space is cleared by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
                            loff_t end_byte)
{
        __filemap_fdatawait_range(mapping, start_byte, end_byte);
        /* filemap_check_errors() clears AS_EIO/AS_ENOSPC as it reports them. */
        return filemap_check_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range);

/**
 * filemap_fdatawait_range_keep_errors - wait for writeback to complete
 * @mapping:                address space structure to wait for
 * @start_byte:                offset in bytes where the range starts
 * @end_byte:                offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space in the
 * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
 * this function does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
                loff_t start_byte, loff_t end_byte)
{
        __filemap_fdatawait_range(mapping, start_byte, end_byte);
        /* Only tests AS_EIO/AS_ENOSPC; the flags stay set for a later reader. */
        return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);

/**
 * file_fdatawait_range - wait for writeback to complete
 * @file:                file pointing to address space structure to wait for
 * @start_byte:                offset in bytes where the range starts
 * @end_byte:                offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the address space that file
 * refers to, in the given range and wait for all of them.  Check error
 * status of the address space vs. the file->f_wb_err cursor and return it.
 *
 * Since the error status of the file is advanced by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space vs. the file->f_wb_err cursor.
 */
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
        struct address_space *mapping = file->f_mapping;

        __filemap_fdatawait_range(mapping, start_byte, end_byte);
        /*
         * Reports relative to file->f_wb_err; the callee also clears the
         * mapping's AS_EIO/AS_ENOSPC flags.
         */
        return file_check_and_advance_wb_err(file);
}
EXPORT_SYMBOL(file_fdatawait_range);

/**
 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.  Unlike filemap_fdatawait(), this function
 * does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
        /* Whole mapping: byte range 0 to LLONG_MAX. */
        __filemap_fdatawait_range(mapping, 0, LLONG_MAX);
        return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);

/*
 * Returns true if writeback might be needed or already in progress.
 *
 * This is only a coarse test: it is true whenever the mapping holds any
 * pages at all (nrpages is non-zero); no dirty or writeback state is
 * examined.
 */
static bool mapping_needs_writeback(struct address_space *mapping)
{
        return mapping->nrpages;
}

/**
 * filemap_range_has_writeback - check if a folio in range may need writeback
 * @mapping:           address space within which to check
 * @start_byte:        offset in bytes where the range starts
 * @end_byte:          offset in bytes where the range ends (inclusive)
 *
 * Walk the page cache entries covering the range under the RCU read lock
 * and stop at the first folio that is dirty, locked or under writeback.
 * Value entries are skipped.  No reference is taken on the folio.
 *
 * Return: %true if such a folio was found, %false otherwise (including when
 * @end_byte < @start_byte).
 */
bool filemap_range_has_writeback(struct address_space *mapping,
                                 loff_t start_byte, loff_t end_byte)
{
        XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
        pgoff_t max = end_byte >> PAGE_SHIFT;
        struct folio *folio;

        if (end_byte < start_byte)
                return false;

        rcu_read_lock();
        xas_for_each(&xas, folio, max) {
                if (xas_retry(&xas, folio))
                        continue;
                if (xa_is_value(folio))
                        continue;
                if (folio_test_dirty(folio) || folio_test_locked(folio) ||
                                folio_test_writeback(folio))
                        break;
        }
        rcu_read_unlock();
        /* Non-NULL only if the loop was left through the break above. */
        return folio != NULL;
}
EXPORT_SYMBOL_GPL(filemap_range_has_writeback);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping:        the address_space for the pages
 * @lstart:        offset in bytes where the range starts
 * @lend:        offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written).
 * To cover everything up to the very end-of-file pass LLONG_MAX, as the
 * whole-file helpers in this file do.  A range with @lend < @lstart is
 * treated as empty: nothing is written and %0 is returned.
 *
 * Return: error status of the address space.
 */
int filemap_write_and_wait_range(struct address_space *mapping,
                                 loff_t lstart, loff_t lend)
{
        int write_err = 0;
        int mapping_err;

        if (lend < lstart)
                return 0;

        if (mapping_needs_writeback(mapping)) {
                write_err = __filemap_fdatawrite_range(mapping, lstart, lend,
                                                       WB_SYNC_ALL);
                /*
                 * Even if the above returned error, the pages may be
                 * written partially (e.g. -ENOSPC), so we wait for it.
                 * But the -EIO is special case, it may indicate the worst
                 * thing (e.g. bug) happened, so we avoid waiting for it.
                 */
                if (write_err != -EIO)
                        __filemap_fdatawait_range(mapping, lstart, lend);
        }

        /* Always collect (and clear) the mapping's error flags. */
        mapping_err = filemap_check_errors(mapping);

        /* An error from starting the writeout wins over the flag status. */
        return write_err ? write_err : mapping_err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);

/*
 * Record writeback error @err in @mapping->wb_err via errseq_set() and emit
 * the filemap_set_wb_err tracepoint with the value errseq_set() returned.
 */
void __filemap_set_wb_err(struct address_space *mapping, int err)
{
        errseq_t eseq = errseq_set(&mapping->wb_err, err);

        trace_filemap_set_wb_err(mapping, eseq);
}
EXPORT_SYMBOL(__filemap_set_wb_err);

/**
 * file_check_and_advance_wb_err - report wb error (if any) that was not yet
 *                                    reported through this file and advance
 *                                    the file's f_wb_err to the current one
 * @file: struct file on which the error is being reported
 *
 * When userland calls fsync (or something like nfsd does the equivalent), we
 * want to report any writeback errors that occurred since the last fsync (or
 * since the file was opened if there haven't been any).
 *
 * Grab the wb_err from the mapping. If it matches what we have in the file,
 * then just quickly return 0. The file is all caught up.
 *
 * If it doesn't match, then take the mapping value, set the "seen" flag in
 * it and try to swap it into place. If it works, or another task beat us
 * to it with the new value, then update the f_wb_err and return the error
 * portion. The error at this point must be reported via proper channels
 * (a'la fsync, or NFS COMMIT operation, etc.).
 *
 * While we handle mapping->wb_err with atomic operations, the f_wb_err
 * value is protected by the f_lock since we must ensure that it reflects
 * the latest value swapped in for this file descriptor.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_check_and_advance_wb_err(struct file *file)
{
        int err = 0;
        errseq_t old = READ_ONCE(file->f_wb_err);
        struct address_space *mapping = file->f_mapping;

        /* Locklessly handle the common case where nothing has changed */
        if (errseq_check(&mapping->wb_err, old)) {
                /* Something changed, must use slow path */
                spin_lock(&file->f_lock);
                /* Re-read under f_lock; this is the value handed to the tracepoint. */
                old = file->f_wb_err;
                err = errseq_check_and_advance(&mapping->wb_err,
                                                &file->f_wb_err);
                trace_file_check_and_advance_wb_err(file, old);
                spin_unlock(&file->f_lock);
        }

        /*
         * We're mostly using this function as a drop in replacement for
         * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
         * that the legacy code would have had on these flags.
         */
        clear_bit(AS_EIO, &mapping->flags);
        clear_bit(AS_ENOSPC, &mapping->flags);
        return err;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);

/**
 * file_write_and_wait_range - write out & wait on a file range
 * @file:        file pointing to address_space with pages
 * @lstart:        offset in bytes where the range starts
 * @lend:        offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written).
 * To cover everything up to the very end-of-file pass LLONG_MAX, as the
 * whole-file helpers in this file do.  A range with @lend < @lstart is
 * treated as empty: nothing is written, the f_wb_err cursor is left alone
 * and %0 is returned.
 *
 * After writing out and waiting on the data, we check and advance the
 * f_wb_err cursor to the latest value, and return any errors detected there.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
        struct address_space *mapping = file->f_mapping;
        int write_err = 0;
        int wb_err;

        if (lend < lstart)
                return 0;

        if (mapping_needs_writeback(mapping)) {
                write_err = __filemap_fdatawrite_range(mapping, lstart, lend,
                                                       WB_SYNC_ALL);
                /*
                 * As in filemap_write_and_wait_range(): still wait after a
                 * partial-write error such as -ENOSPC, but not after -EIO.
                 */
                if (write_err != -EIO)
                        __filemap_fdatawait_range(mapping, lstart, lend);
        }

        /* The cursor is advanced even when starting the writeout failed. */
        wb_err = file_check_and_advance_wb_err(file);

        /* An error from starting the writeout wins over the cursor's error. */
        return write_err ? write_err : wb_err;
}
EXPORT_SYMBOL(file_write_and_wait_range);

/**
 * replace_page_cache_folio - replace a pagecache folio with a new one
 * @old:        folio to be replaced
 * @new:        folio to replace with
 *
 * This function replaces a folio in the pagecache with a new one.  On
 * success it acquires the pagecache reference for the new folio and
 * drops it for the old folio.  Both the old and new folios must be
 * locked.  This function does not add the new folio to the LRU, the
 * caller must do that.
 *
 * @new must not already belong to a mapping; it takes over @old's mapping
 * and index.
 *
 * The remove + add is atomic.  This function cannot fail.
 */
void replace_page_cache_folio(struct folio *old, struct folio *new)
{
        struct address_space *mapping = old->mapping;
        /* Fetched up front: old->mapping is cleared further down. */
        void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
        pgoff_t offset = old->index;
        XA_STATE(xas, &mapping->i_pages, offset);

        VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
        VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
        VM_BUG_ON_FOLIO(new->mapping, new);

        /* The pagecache reference for @new. */
        folio_get(new);
        new->mapping = mapping;
        new->index = offset;

        mem_cgroup_replace_folio(old, new);

        /* The entry swap and the statistics updates share one i_pages lock hold. */
        xas_lock_irq(&xas);
        xas_store(&xas, new);

        old->mapping = NULL;
        /* hugetlb pages do not participate in page cache accounting. */
        if (!folio_test_hugetlb(old))
                __lruvec_stat_sub_folio(old, NR_FILE_PAGES);
        if (!folio_test_hugetlb(new))
                __lruvec_stat_add_folio(new, NR_FILE_PAGES);
        if (folio_test_swapbacked(old))
                __lruvec_stat_sub_folio(old, NR_SHMEM);
        if (folio_test_swapbacked(new))
                __lruvec_stat_add_folio(new, NR_SHMEM);
        xas_unlock_irq(&xas);
        /* Callback and final put for @old happen with the lock released. */
        if (free_folio)
                free_folio(old);
        folio_put(old);
}
EXPORT_SYMBOL_GPL(replace_page_cache_folio);

/*
 * __filemap_add_folio - insert a locked folio into a mapping's page cache
 * @mapping: address_space whose i_pages XArray receives the folio
 * @folio: folio to insert; must be locked and must not be swap-backed
 * @index: page offset to insert at; must be aligned to the folio's size
 * @gfp: allocation flags, masked down to GFP_RECLAIM_MASK before use
 * @shadowp: if non-NULL, receives the value (shadow) entry found at @index
 *
 * On success the folio holds folio_nr_pages() extra references, has its
 * ->mapping and ->index set, and mapping->nrpages has been increased.
 * No memcg charging or LRU insertion is done here.
 *
 * Return: 0 on success; -EEXIST if a non-value entry already occupies the
 * range; otherwise the error recorded in the XArray state.
 */
noinline int __filemap_add_folio(struct address_space *mapping,
                struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
        XA_STATE(xas, &mapping->i_pages, index);
        /* Entry and order that the split memory below was allocated for. */
        void *alloced_shadow = NULL;
        int alloced_order = 0;
        bool huge;
        long nr;

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
        mapping_set_update(&xas, mapping);

        VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
        xas_set_order(&xas, index, folio_order(folio));
        huge = folio_test_hugetlb(folio);
        nr = folio_nr_pages(folio);

        gfp &= GFP_RECLAIM_MASK;
        /* Set the folio up as if inserted; the error path undoes this. */
        folio_ref_add(folio, nr);
        folio->mapping = mapping;
        folio->index = xas.xa_index;

        /*
         * Each pass takes the lock, inspects what is already stored in the
         * range, and either stores the folio or drops the lock to allocate
         * memory before trying again.
         */
        for (;;) {
                int order = -1, split_order = 0;
                void *entry, *old = NULL;

                xas_lock_irq(&xas);
                xas_for_each_conflict(&xas, entry) {
                        old = entry;
                        /* Only value entries may be replaced. */
                        if (!xa_is_value(entry)) {
                                xas_set_err(&xas, -EEXIST);
                                goto unlock;
                        }
                        /*
                         * If a larger entry exists,
                         * it will be the first and only entry iterated.
                         */
                        if (order == -1)
                                order = xas_get_order(&xas);
                }

                /* entry may have changed before we re-acquire the lock */
                if (alloced_order && (old != alloced_shadow || order != alloced_order)) {
                        xas_destroy(&xas);
                        alloced_order = 0;
                }

                if (old) {
                        if (order > 0 && order > folio_order(folio)) {
                                /* How to handle large swap entries? */
                                BUG_ON(shmem_mapping(mapping));
                                /*
                                 * No split memory yet: remember the order
                                 * and allocate once the lock is dropped.
                                 */
                                if (!alloced_order) {
                                        split_order = order;
                                        goto unlock;
                                }
                                xas_split(&xas, old, order);
                                xas_reset(&xas);
                        }
                        if (shadowp)
                                *shadowp = old;
                }

                xas_store(&xas, folio);
                if (xas_error(&xas))
                        goto unlock;

                mapping->nrpages += nr;

                /* hugetlb pages do not participate in page cache accounting */
                if (!huge) {
                        __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
                        if (folio_test_pmd_mappable(folio))
                                __lruvec_stat_mod_folio(folio,
                                                NR_FILE_THPS, nr);
                }

unlock:
                xas_unlock_irq(&xas);

                /* split needed, alloc here and retry. */
                if (split_order) {
                        xas_split_alloc(&xas, old, split_order, gfp);
                        if (xas_error(&xas))
                                goto error;
                        alloced_shadow = old;
                        alloced_order = split_order;
                        xas_reset(&xas);
                        continue;
                }

                /*
                 * NOTE(review): xas_nomem() presumably allocates the memory
                 * a failed store was missing and returns true to request
                 * another pass - confirm against the XArray documentation.
                 */
                if (!xas_nomem(&xas, gfp))
                        break;
        }

        if (xas_error(&xas))
                goto error;

        trace_mm_filemap_add_to_page_cache(folio);
        return 0;
error:
        /* Undo the speculative setup done before the loop. */
        folio->mapping = NULL;
        /* Leave page->index set: truncation relies upon it */
        folio_put_refs(folio, nr);
        return xas_error(&xas);
}
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);

/*
 * filemap_add_folio - charge a folio, add it to the page cache and the LRU
 * @mapping: address_space to insert the folio into
 * @folio: folio to add; it is marked locked by this function
 * @index: page offset at which to insert
 * @gfp: allocation flags, also used for the memcg charge
 *
 * On failure the memcg charge and the lock bit set here are both undone.
 *
 * Return: 0 on success, otherwise the error from mem_cgroup_charge() or
 * __filemap_add_folio().
 */
int filemap_add_folio(struct address_space *mapping, struct folio *folio,
                                pgoff_t index, gfp_t gfp)
{
        void *shadow = NULL;
        int err;

        err = mem_cgroup_charge(folio, NULL, gfp);
        if (err)
                return err;

        __folio_set_locked(folio);
        err = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
        if (unlikely(err)) {
                /* Roll back the charge and the lock bit taken above. */
                mem_cgroup_uncharge(folio);
                __folio_clear_locked(folio);
                return err;
        }

        /*
         * A shadow entry means the folio was evicted from the cache only
         * recently, so it should be activated like any other repeatedly
         * accessed folio.  Folios that are about to be rewritten are the
         * exception: evicting other data from the working set, only to
         * cache data that will get overwritten with something else, is a
         * waste of memory.
         */
        WARN_ON_ONCE(folio_test_active(folio));
        if (shadow && !(gfp & __GFP_WRITE))
                workingset_refault(folio, shadow);
        folio_add_lru(folio);

        return 0;
}
EXPORT_SYMBOL_GPL(filemap_add_folio);

#ifdef CONFIG_NUMA
/*
 * filemap_alloc_folio_noprof - allocate a folio for the page cache
 * @gfp: allocation flags
 * @order: order of the folio to allocate
 *
 * Without cpuset page spreading this is a plain folio_alloc_noprof().
 * With spreading, the node comes from cpuset_mem_spread_node(), and a
 * failed allocation is retried for as long as read_mems_allowed_retry()
 * asks for it.
 *
 * Return: the new folio, or NULL if the allocation failed.
 */
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
{
        unsigned int cookie;
        struct folio *folio;
        int nid;

        if (!cpuset_do_page_mem_spread())
                return folio_alloc_noprof(gfp, order);

        do {
                cookie = read_mems_allowed_begin();
                nid = cpuset_mem_spread_node();
                folio = __folio_alloc_node_noprof(gfp, order, nid);
        } while (!folio && read_mems_allowed_retry(cookie));

        return folio;
}
EXPORT_SYMBOL(filemap_alloc_folio_noprof);
#endif

/*
 * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
 *
 * @mapping1: the first mapping to lock
 * @mapping2: the second mapping to lock
 *
 * Take invalidate_lock exclusively on every passed mapping that is not
 * NULL.  The locks are taken in ascending address order, whichever way
 * round the arguments were given, and a mapping passed twice is locked
 * only once.
 */
void filemap_invalidate_lock_two(struct address_space *mapping1,
                                 struct address_space *mapping2)
{
        struct address_space *first = mapping1;
        struct address_space *second = mapping2;

        /* Order by address: @first is the lower one (or NULL). */
        if (first > second)
                swap(first, second);

        if (first)
                down_write(&first->invalidate_lock);
        if (second && second != first)
                down_write_nested(&second->invalidate_lock, 1);
}
EXPORT_SYMBOL(filemap_invalidate_lock_two);

/*
 * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
 *
 * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
 *
 * @mapping1: the first mapping to unlock
 * @mapping2: the second mapping to unlock
 */
void filemap_invalidate_unlock_two(struct address_space *mapping1,
                                   struct address_space *mapping2)
{
        if (mapping1)
                up_write(&mapping1->invalidate_lock);
        if (mapping2 && mapping1 != mapping2)
                up_write(&mapping2->invalidate_lock);
}
EXPORT_SYMBOL(filemap_invalidate_unlock_two);

/*
 * Waiting for a page to become available needs a waitqueue, but giving
 * every page its own would cost too much space.  Instead, pages are
 * hashed onto a small fixed table of waitqueues.  Every waiter that
 * hashes to a bucket sits on the same queue; all of them are woken when
 * any of those pages becomes available, and each woken context checks
 * that it was really its own page.  The price is a "thundering herd"
 * during the rare hash collisions.
 */
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;

/* Map a folio to its bucket by hashing the folio pointer. */
static wait_queue_head_t *folio_waitqueue(struct folio *folio)
{
        unsigned int bucket = hash_ptr(folio, PAGE_WAIT_TABLE_BITS);

        return folio_wait_table + bucket;
}

/*
 * Boot-time setup: initialise every bucket of the folio wait table, then
 * hand over to page_writeback_init().
 */
void __init pagecache_init(void)
{
        int i = 0;

        while (i < PAGE_WAIT_TABLE_SIZE) {
                init_waitqueue_head(&folio_wait_table[i]);
                i++;
        }

        page_writeback_init();
}

/*
 * The page wait code treats the "wait->flags" somewhat unusually, because
 * we have multiple different kinds of waits, not just the usual "exclusive"
 * one.
 *
 * We have:
 *
 *  (a) no special bits set:
 *
 *        We're just waiting for the bit to be released, and when a waker
 *        calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
 *        and remove it from the wait queue.
 *
 *        Simple and straightforward.
 *
 *  (b) WQ_FLAG_EXCLUSIVE:
 *
 *        The waiter is waiting to get the lock, and only one waiter should
 *        be woken up to avoid any thundering herd behavior. We'll set the
 *        WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
 *        If the bit is found set again by the time we look, the waiter is
 *        left queued and the walk stops instead.
 *
 *        This is the traditional exclusive wait.
 *
 *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
 *
 *        The waiter is waiting to get the bit, and additionally wants the
 *        lock to be transferred to it for fair lock behavior. If the lock
 *        cannot be taken, we stop walking the wait queue without waking
 *        the waiter.
 *
 *        This is the "fair lock handoff" case, and in addition to setting
 *        WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
 *        that it now has the lock.
 *
 * Arguments: @wait is the queue entry embedded in a struct wait_page_queue,
 * @mode is the task state mask handed to wake_up_state(), @sync is unused
 * here, and @arg is the waker's struct wait_page_key.
 *
 * Return: 0 if the entry does not match the key (per wake_page_match()) or
 * a non-exclusive waiter was woken, 1 if an exclusive waiter was woken, and
 * -1 if an exclusive waiter was found but the bit is set, in which case the
 * entry stays queued and is not woken.
 */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
        unsigned int flags;
        struct wait_page_key *key = arg;
        struct wait_page_queue *wait_page
                = container_of(wait, struct wait_page_queue, wait);

        /* Other folios and bits share this hashed queue; skip them. */
        if (!wake_page_match(wait_page, key))
                return 0;

        /*
         * If it's a lock handoff wait, we get the bit for it, and
         * stop walking (and do not wake it up) if we can't.
         */
        flags = wait->flags;
        if (flags & WQ_FLAG_EXCLUSIVE) {
                /* Somebody holds the bit again: waking would be pointless. */
                if (test_bit(key->bit_nr, &key->folio->flags))
                        return -1;
                if (flags & WQ_FLAG_CUSTOM) {
                        if (test_and_set_bit(key->bit_nr, &key->folio->flags))
                                return -1;
                        /* The bit now belongs to the waiter. */
                        flags |= WQ_FLAG_DONE;
                }
        }

        /*
         * We are holding the wait-queue lock, but the waiter that
         * is waiting for this will be checking the flags without
         * any locking.
         *
         * So update the flags atomically, and wake up the waiter
         * afterwards to avoid any races. This store-release pairs
         * with the load-acquire in folio_wait_bit_common().
         */
        smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
        wake_up_state(wait->private, mode);

        /*
         * Ok, we have successfully done what we're waiting for,
         * and we can unconditionally remove the wait entry.
         *
         * Note that this pairs with the "finish_wait()" in the
         * waiter, and has to be the absolute last thing we do.
         * After this list_del_init(&wait->entry) the wait entry
         * might be de-allocated and the process might even have
         * exited.
         */
        list_del_init_careful(&wait->entry);
        /* Uses the local copy of the flags; *wait may be gone already. */
        return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}

static void folio_wake_bit(struct folio *folio, int bit_nr)
{
        wait_queue_head_t *q = folio_waitqueue(folio);
        struct wait_page_key key;
        unsigned long flags;

        key.folio = folio;
        key.bit_nr = bit_nr;
        key.page_match = 0;

        spin_lock_irqsave(&q->lock, flags);
        __wake_up_locked_key(q, TASK_NORMAL, &key);

        /*
         * It's possible to miss clearing waiters here, when we woke our page
         * waiters, but the hashed waitqueue has waiters for other pages on it.
         * That's okay, it's a rare case. The next waker will clear it.
         *
         * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
         * other), the flag may be cleared in the course of freeing the page;
         * but that is not required for correctness.
         */
        if (!waitqueue_active(q) || !key.page_match)
                folio_clear_waiters(folio);

        spin_unlock_irqrestore(&q->lock, flags);
}

/*
 * A choice of three behaviors for folio_wait_bit_common().  The behavior
 * decides whether the waiter queues as exclusive, whether it has to take
 * the bit itself after being woken, and whether the caller's folio
 * reference is dropped before sleeping.
 */
enum behavior {
        EXCLUSIVE,        /* Hold ref to page and take the bit when woken, like
                         * __folio_lock() waiting on then setting PG_locked.
                         * Queued with WQ_FLAG_EXCLUSIVE.
                         */
        SHARED,                /* Hold ref to page and check the bit when woken, like
                         * folio_wait_writeback() waiting on PG_writeback.
                         */
        DROP,                /* Drop ref to page before wait, no check when woken,
                         * like folio_put_wait_locked() on PG_locked.
                         * The folio must not be touched after queueing.
                         */
};

/*
 * Try to settle the wait without sleeping.  An exclusive waiter tries to
 * take the folio flag; any other waiter only checks that it is clear.
 * On success the wait entry is marked both woken and done.
 *
 * Returns true on success, false if the flag is set and the caller has
 * to wait.
 */
static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
                                        struct wait_queue_entry *wait)
{
        bool busy;

        if (wait->flags & WQ_FLAG_EXCLUSIVE)
                busy = test_and_set_bit(bit_nr, &folio->flags);
        else
                busy = test_bit(bit_nr, &folio->flags);

        if (busy)
                return false;

        wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
        return true;
}

/*
 * How many times do we accept lock stealing from under a waiter?
 *
 * folio_wait_bit_common() copies this value at the start of a wait and,
 * for exclusive waiters, counts it down on every (re)queue attempt.  Once
 * the count goes negative the waiter adds WQ_FLAG_CUSTOM to its flags,
 * which requests the fair lock handoff from the waker.
 */
int sysctl_page_lock_unfairness = 5;

/*
 * folio_wait_bit_common - sleep until a folio flag bit becomes available
 * @folio: folio whose flag is waited on
 * @bit_nr: number of the bit in the folio's flags
 * @state: task state to sleep in; also passed to signal_pending_state()
 * @behavior: EXCLUSIVE, SHARED or DROP, see enum behavior
 *
 * Return: for EXCLUSIVE, 0 once this task owns the bit (WQ_FLAG_DONE is
 * set) and -EINTR otherwise.  For SHARED and DROP, 0 if a wakeup was
 * received or the bit was seen clear (WQ_FLAG_WOKEN is set) and -EINTR
 * otherwise.
 */
static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
                int state, enum behavior behavior)
{
        wait_queue_head_t *q = folio_waitqueue(folio);
        /* Remaining number of times the bit may be stolen from under us. */
        int unfairness = sysctl_page_lock_unfairness;
        struct wait_page_queue wait_page;
        wait_queue_entry_t *wait = &wait_page.wait;
        bool thrashing = false;
        unsigned long pflags;
        bool in_thrashing;

        /*
         * Waiting for the lock of a folio that is in the workingset but
         * not uptodate is accounted as thrashing (delayacct and PSI).
         */
        if (bit_nr == PG_locked &&
            !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
                delayacct_thrashing_start(&in_thrashing);
                psi_memstall_enter(&pflags);
                thrashing = true;
        }

        init_wait(wait);
        wait->func = wake_page_function;
        wait_page.folio = folio;
        wait_page.bit_nr = bit_nr;

repeat:
        wait->flags = 0;
        if (behavior == EXCLUSIVE) {
                wait->flags = WQ_FLAG_EXCLUSIVE;
                /* Out of patience: ask the waker to hand the bit over. */
                if (--unfairness < 0)
                        wait->flags |= WQ_FLAG_CUSTOM;
        }

        /*
         * Do one last check whether we can get the
         * page bit synchronously.
         *
         * Do the folio_set_waiters() marking before that
         * to let any waker we _just_ missed know they
         * need to wake us up (otherwise they'll never
         * even go to the slow case that looks at the
         * page queue), and add ourselves to the wait
         * queue if we need to sleep.
         *
         * This part needs to be done under the queue
         * lock to avoid races.
         */
        spin_lock_irq(&q->lock);
        folio_set_waiters(folio);
        if (!folio_trylock_flag(folio, bit_nr, wait))
                __add_wait_queue_entry_tail(q, wait);
        spin_unlock_irq(&q->lock);

        /*
         * From now on, all the logic will be based on
         * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
         * see whether the page bit testing has already
         * been done by the wake function.
         *
         * We can drop our reference to the folio.
         */
        if (behavior == DROP)
                folio_put(folio);

        /*
         * Note that until the "finish_wait()", or until
         * we see the WQ_FLAG_WOKEN flag, we need to
         * be very careful with the 'wait->flags', because
         * we may race with a waker that sets them.
         */
        for (;;) {
                unsigned int flags;

                set_current_state(state);

                /* Loop until we've been woken or interrupted */
                flags = smp_load_acquire(&wait->flags);
                if (!(flags & WQ_FLAG_WOKEN)) {
                        if (signal_pending_state(state, current))
                                break;

                        io_schedule();
                        continue;
                }

                /* If we were non-exclusive, we're done */
                if (behavior != EXCLUSIVE)
                        break;

                /* If the waker got the lock for us, we're done */
                if (flags & WQ_FLAG_DONE)
                        break;

                /*
                 * Otherwise, if we're getting the lock, we need to
                 * try to get it ourselves.
                 *
                 * And if that fails, we'll have to retry this all.
                 */
                if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
                        goto repeat;

                wait->flags |= WQ_FLAG_DONE;
                break;
        }

        /*
         * If a signal happened, this 'finish_wait()' may remove the last
         * waiter from the wait-queues, but the folio waiters bit will remain
         * set. That's ok. The next wakeup will take care of it, and trying
         * to do it here would be difficult and prone to races.
         */
        finish_wait(q, wait);

        if (thrashing) {
                delayacct_thrashing_end(&in_thrashing);
                psi_memstall_leave(&pflags);
        }

        /*
         * NOTE! The wait->flags weren't stable until we've done the
         * 'finish_wait()', and we could have exited the loop above due
         * to a signal, and had a wakeup event happen after the signal
         * test but before the 'finish_wait()'.
         *
         * So only after the finish_wait() can we reliably determine
         * if we got woken up or not, so we can now figure out the final
         * return value based on that state without races.
         *
         * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
         * waiter, but an exclusive one requires WQ_FLAG_DONE.
         */
        if (behavior == EXCLUSIVE)
                return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;

        return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}

#ifdef CONFIG_MIGRATION
/**
 * migration_entry_wait_on_locked - Wait for a migration entry to be removed
 * @entry: migration swap entry.
 * @ptl: already locked ptl. This function will drop the lock.
 *
 * Wait for a migration entry referencing the given page to be removed. This is
 * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
 * this can be called without taking a reference on the page. Instead this
 * should be called while holding the ptl for the migration entry referencing
 * the page.
 *
 * The wait is non-exclusive: the caller only waits for PG_locked to be
 * released and does not take the folio lock itself.  The folio is not
 * dereferenced again once the ptl has been dropped.
 *
 * Returns after unlocking the ptl.
 *
 * This follows the same logic as folio_wait_bit_common() so see the comments
 * there.
 */
void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
        __releases(ptl)
{
        struct wait_page_queue wait_page;
        wait_queue_entry_t *wait = &wait_page.wait;
        bool thrashing = false;
        unsigned long pflags;
        bool in_thrashing;
        wait_queue_head_t *q;
        struct folio *folio = pfn_swap_entry_folio(entry);

        q = folio_waitqueue(folio);
        /* Same thrashing accounting as folio_wait_bit_common(). */
        if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
                delayacct_thrashing_start(&in_thrashing);
                psi_memstall_enter(&pflags);
                thrashing = true;
        }

        init_wait(wait);
        wait->func = wake_page_function;
        wait_page.folio = folio;
        wait_page.bit_nr = PG_locked;
        /* No WQ_FLAG_EXCLUSIVE: we only wait for the bit to clear. */
        wait->flags = 0;

        /* Queue ourselves unless PG_locked is already clear. */
        spin_lock_irq(&q->lock);
        folio_set_waiters(folio);
        if (!folio_trylock_flag(folio, PG_locked, wait))
                __add_wait_queue_entry_tail(q, wait);
        spin_unlock_irq(&q->lock);

        /*
         * If a migration entry exists for the page the migration path must hold
         * a valid reference to the page, and it must take the ptl to remove the
         * migration entry. So the page is valid until the ptl is dropped.
         */
        spin_unlock(ptl);

        for (;;) {
                unsigned int flags;

                set_current_state(TASK_UNINTERRUPTIBLE);

                /* Loop until we've been woken or interrupted */
                flags = smp_load_acquire(&wait->flags);
                if (!(flags & WQ_FLAG_WOKEN)) {
                        /*
                         * NOTE(review): with TASK_UNINTERRUPTIBLE this check
                         * presumably never fires and is only kept to mirror
                         * folio_wait_bit_common() - confirm.
                         */
                        if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
                                break;

                        io_schedule();
                        continue;
                }
                break;
        }

        finish_wait(q, wait);

        if (thrashing) {
                delayacct_thrashing_end(&in_thrashing);
                psi_memstall_leave(&pflags);
        }
}
#endif

/*
 * folio_wait_bit - wait for a folio flag bit to be released
 * @folio: folio to wait on; the caller keeps its reference
 * @bit_nr: number of the bit in the folio's flags
 *
 * Sleeps in TASK_UNINTERRUPTIBLE as a SHARED (non-exclusive) waiter and
 * returns after a single wakeup, or at once if the bit is already clear.
 * The bit is not taken, so callers that need it to be clear on return
 * re-test it in a loop, as folio_wait_private_2() does.
 */
void folio_wait_bit(struct folio *folio, int bit_nr)
{
        /* The result is ignored: it only reports an interrupted wait. */
        folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit);

/*
 * folio_wait_bit_killable - wait for a folio flag bit, sleeping killably
 * @folio: folio to wait on; the caller keeps its reference
 * @bit_nr: number of the bit in the folio's flags
 *
 * Same as folio_wait_bit(), but sleeps in TASK_KILLABLE.
 *
 * Return: 0 if woken (or the bit was already clear), -EINTR if the wait
 * ended without a wakeup.
 */
int folio_wait_bit_killable(struct folio *folio, int bit_nr)
{
        return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit_killable);

/**
 * folio_put_wait_locked - Drop a reference and wait for the folio to be unlocked
 * @folio: The folio to wait for.
 * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
 *
 * The caller should hold a reference on @folio.  They expect the page to
 * become unlocked relatively soon, but do not wish to hold up migration
 * (for example) by holding the reference while waiting for the folio to
 * come unlocked.  After this function returns, the caller should not
 * dereference @folio.
 *
 * The reference is dropped after the waiter has been queued and before it
 * sleeps.  The folio lock is not taken on behalf of the caller.
 *
 * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
 */
static int folio_put_wait_locked(struct folio *folio, int state)
{
        /* DROP makes folio_wait_bit_common() do the folio_put() for us. */
        return folio_wait_bit_common(folio, PG_locked, state, DROP);
}

/**
 * folio_add_wait_queue - Queue a caller-supplied waiter on a folio
 * @folio: Folio whose wait queue is used
 * @waiter: Wait queue entry to append
 *
 * Append @waiter to the tail of the wait queue that @folio hashes to, and
 * set the folio's waiters flag.  Both steps happen under the queue lock.
 */
void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
{
        wait_queue_head_t *head = folio_waitqueue(folio);
        unsigned long irqflags;

        spin_lock_irqsave(&head->lock, irqflags);
        __add_wait_queue_entry_tail(head, waiter);
        folio_set_waiters(folio);
        spin_unlock_irqrestore(&head->lock, irqflags);
}
EXPORT_SYMBOL_GPL(folio_add_wait_queue);

/**
 * folio_unlock - Unlock a locked folio.
 * @folio: The folio.
 *
 * Releases the folio lock.  Threads sleeping on the lock are woken if the
 * folio had waiters at the time the lock bit was flipped.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_unlock(struct folio *folio)
{
        /* Bit 7 allows x86 to check the byte's sign bit */
        BUILD_BUG_ON(PG_waiters != 7);
        BUILD_BUG_ON(PG_locked > 7);
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        /* Common case: nobody is waiting, so flipping the bit is all. */
        if (!folio_xor_flags_has_waiters(folio, 1 << PG_locked))
                return;

        folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_unlock);

/**
 * folio_end_read - End read on a folio.
 * @folio: The folio.
 * @success: True if all reads completed successfully.
 *
 * Filesystems should call this once every read against a folio has
 * completed, to tell the pagecache that no more reads are outstanding.
 * The folio is unlocked and any thread sleeping on the lock is woken.
 * If @success is true the folio is marked uptodate in the same step.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_end_read(struct folio *folio, bool success)
{
        /* Bits to flip in one go: always the lock, maybe uptodate. */
        unsigned long toggle = 1 << PG_locked;

        /* Must be in bottom byte for x86 to work */
        BUILD_BUG_ON(PG_uptodate > 7);
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);

        if (likely(success))
                toggle |= 1 << PG_uptodate;

        if (!folio_xor_flags_has_waiters(folio, toggle))
                return;

        folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_end_read);

/**
 * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
 * @folio: The folio.
 *
 * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
 * it.  The folio reference held for PG_private_2 being set is released.
 *
 * This is, for example, used when a netfs folio is being written to a local
 * disk cache, thereby allowing writes to the cache for the same folio to be
 * serialised.
 *
 * The folio must have PG_private_2 set when this is called.
 */
void folio_end_private_2(struct folio *folio)
{
        VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
        /* Clear the bit first so that woken waiters find it clear. */
        clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
        folio_wake_bit(folio, PG_private_2);
        /* Dropped last: the wakeup above still uses the folio. */
        folio_put(folio);
}
EXPORT_SYMBOL(folio_end_private_2);

/**
 * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
 *
 * Sleep uninterruptibly until PG_private_2 is seen clear on @folio.  The
 * bit is re-tested after every wakeup.
 */
void folio_wait_private_2(struct folio *folio)
{
        for (;;) {
                if (!folio_test_private_2(folio))
                        break;
                folio_wait_bit(folio, PG_private_2);
        }
}
EXPORT_SYMBOL(folio_wait_private_2);

/**
 * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
 *
 * Sleep until PG_private_2 is seen clear on @folio, or until a fatal
 * signal is received by the calling task.  The bit is re-tested after
 * every wakeup.
 *
 * Return:
 * - 0 if successful.
 * - -EINTR if a fatal signal was encountered.
 */
int folio_wait_private_2_killable(struct folio *folio)
{
        int ret = 0;

        while (folio_test_private_2(folio)) {
                ret = folio_wait_bit_killable(folio, PG_private_2);
                /* Give up as soon as a wait is interrupted. */
                if (ret < 0)
                        return ret;
        }

        return ret;
}
EXPORT_SYMBOL(folio_wait_private_2_killable);

/**
 * folio_end_writeback - End writeback against a folio.
 * @folio: The folio.
 *
 * The folio must actually be under writeback.
 *
 * Context: May be called from process or interrupt context.
 */
void folio_end_writeback(struct folio *folio)
{
        VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);

        /*
         * folio_test_clear_reclaim() could be used here but it is an
         * atomic operation and overkill in this particular case. Failing
         * to shuffle a folio marked for immediate reclaim is too mild
         * a gain to justify taking an atomic operation penalty at the
         * end of every folio writeback.
         */
        if (folio_test_reclaim(folio)) {
                folio_clear_reclaim(folio);
                folio_rotate_reclaimable(folio);
        }

        /*
         * Writeback does not hold a folio reference of its own, relying
         * on truncation to wait for the clearing of PG_writeback.
         * But here we must make sure that the folio is not freed and
         * reused before the folio_wake_bit().
         */
        folio_get(folio);
        if (__folio_end_writeback(folio))
                folio_wake_bit(folio, PG_writeback);
        acct_reclaim_writeback(folio);
        folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);

/**
 * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
 * @folio: The folio to lock
 *
 * Waits on PG_locked in TASK_UNINTERRUPTIBLE as an EXCLUSIVE waiter, so the
 * bit is taken on behalf of the caller before this returns.
 */
void __folio_lock(struct folio *folio)
{
        /* The return value is not checked here. */
        folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
                                EXCLUSIVE);
}
EXPORT_SYMBOL(__folio_lock);

/*
 * __folio_lock_killable - lock a folio, sleeping in TASK_KILLABLE
 * @folio: The folio to lock
 *
 * Waits on PG_locked as an EXCLUSIVE waiter, like __folio_lock(), but
 * sleeps in TASK_KILLABLE.
 *
 * Return: 0 with the folio locked, or -EINTR if the wait ended without
 * the lock being obtained.
 */
int __folio_lock_killable(struct folio *folio)
{
        return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
                                        EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__folio_lock_killable);

/*
 * Try to lock @folio without sleeping; if that fails, leave @wait queued on
 * the folio's waitqueue so the caller gets its callback later.
 *
 * Returns 0 with the folio locked (and @wait no longer queued), or
 * -EIOCBQUEUED with @wait left on the waitqueue.
 */
static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
{
        struct wait_queue_head *head = folio_waitqueue(folio);
        int status = -EIOCBQUEUED;

        wait->folio = folio;
        wait->bit_nr = PG_locked;

        spin_lock_irq(&head->lock);
        __add_wait_queue_entry_tail(head, &wait->wait);
        folio_set_waiters(folio);
        if (folio_trylock(folio)) {
                /*
                 * We got the lock while still holding the waitqueue lock,
                 * so our entry is necessarily still queued and its callback
                 * cannot have fired.  Dequeue it and report success.
                 */
                __remove_wait_queue(head, &wait->wait);
                status = 0;
        }
        spin_unlock_irq(&head->lock);

        return status;
}

/*
 * Return values:
 * 0 - folio is locked.
 * non-zero - folio is not locked.
 *     mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
 *     vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
 *     FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
 *
 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
 * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed.
 */
vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
{
        const unsigned int fault_flags = vmf->flags;
        const bool killable = fault_flags & FAULT_FLAG_KILLABLE;

        if (!fault_flag_allow_retry_first(fault_flags)) {
                /*
                 * No retry allowed on this attempt: block until the folio
                 * is locked.  Only a killable wait that fails (non-zero
                 * return) makes us give up and drop the fault lock.
                 */
                if (!killable) {
                        __folio_lock(folio);
                        return 0;
                }
                if (!__folio_lock_killable(folio))
                        return 0;

                release_fault_lock(vmf);
                return VM_FAULT_RETRY;
        }

        /*
         * CAUTION! With FAULT_FLAG_RETRY_NOWAIT the mmap_lock/per-VMA lock
         * is still held even though VM_FAULT_RETRY is returned.
         */
        if (fault_flags & FAULT_FLAG_RETRY_NOWAIT)
                return VM_FAULT_RETRY;

        /* Drop the fault lock, wait for the folio to unlock, then retry. */
        release_fault_lock(vmf);
        if (killable)
                folio_wait_locked_killable(folio);
        else
                folio_wait_locked(folio);

        return VM_FAULT_RETRY;
}

/**
 * page_cache_next_miss() - Find the next gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
 * gap with the lowest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 5, then subsequently a gap is
 * created at index 10, page_cache_next_miss covering both indices may
 * return 10 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'return - index >= max_scan' will be true).
 * In the rare case of index wrap-around, 0 will be returned.
 */
pgoff_t page_cache_next_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan)
{
        XA_STATE(xas, &mapping->i_pages, index);
        unsigned long remaining;

        /* Step forward at most @max_scan times. */
        for (remaining = max_scan; remaining; remaining--) {
                void *slot = xas_next(&xas);

                /*
                 * Stop on a gap (empty slot or value entry), or when the
                 * index has wrapped around to 0.
                 */
                if (!slot || xa_is_value(slot) || xas.xa_index == 0)
                        break;
        }

        return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_next_miss);

/**
 * page_cache_prev_miss() - Find the previous gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [max(index - max_scan + 1, 0), index] for the
 * gap with the highest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 10, then subsequently a gap is
 * created at index 5, page_cache_prev_miss() covering both indices may
 * return 5 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'index - return >= max_scan' will be true).
 * In the rare case of wrap-around, ULONG_MAX will be returned.
 */
pgoff_t page_cache_prev_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan)
{
        XA_STATE(xas, &mapping->i_pages, index);
        unsigned long remaining;

        /* Step backward at most @max_scan times. */
        for (remaining = max_scan; remaining; remaining--) {
                void *slot = xas_prev(&xas);

                /*
                 * Stop on a gap (empty slot or value entry), or when the
                 * index has wrapped around to ULONG_MAX.
                 */
                if (!slot || xa_is_value(slot) || xas.xa_index == ULONG_MAX)
                        break;
        }

        return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_prev_miss);

/*
 * Lockless page cache protocol:
 * On the lookup side:
 * 1. Load the folio from i_pages
 * 2. Increment the refcount if it's not zero
 * 3. If the folio is not found by xas_reload(), put the refcount and retry
 *
 * On the removal side:
 * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
 * B. Remove the page from i_pages
 * C. Return the page to the page allocator
 *
 * This means that any page may have its reference count temporarily
 * increased by a speculative page cache (or GUP-fast) lookup as it can
 * be allocated by another user before the RCU grace period expires.
 * Because the refcount temporarily acquired here may end up being the
 * last refcount on the page, any page allocation must be freeable by
 * folio_put().
 */

/*
 * filemap_get_entry - Get a page cache entry.
 * @mapping: the address_space to search
 * @index: The page cache index.
 *
 * Looks up the page cache entry at @mapping & @index.  If it is a folio,
 * it is returned with an increased refcount.  If it is a shadow entry
 * of a previously evicted folio, or a swap entry from shmem/tmpfs,
 * it is returned without further action.
 *
 * Return: The folio, swap or shadow entry, %NULL if nothing is found.
 */
void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
{
        XA_STATE(xas, &mapping->i_pages, index);
        struct folio *folio;

        rcu_read_lock();
        for (;;) {
                /* Every attempt starts the walk again from the top. */
                xas_reset(&xas);
                folio = xas_load(&xas);
                if (xas_retry(&xas, folio))
                        continue;

                /*
                 * Nothing there, or a value entry (shadow of an evicted
                 * page, or shmem/tmpfs swap entry): hand it back as-is
                 * without touching any refcount.
                 */
                if (!folio || xa_is_value(folio))
                        break;

                /* Speculative reference; failure means start over. */
                if (!folio_try_get_rcu(folio))
                        continue;

                /* The slot must still hold the folio we pinned. */
                if (likely(folio == xas_reload(&xas)))
                        break;

                folio_put(folio);
        }
        rcu_read_unlock();

        return folio;
}

/**
 * __filemap_get_folio - Find and get a reference to a folio.
 * @mapping: The address_space to search.
 * @index: The page index.
 * @fgp_flags: %FGP flags modify how the folio is returned.
 * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
 *
 * Looks up the page cache entry at @mapping & @index.
 *
 * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
 * if the %GFP flags specified for %FGP_CREAT are atomic.
 *
 * If this function returns a folio, it is returned with an increased refcount.
 *
 * Return: The found folio or an ERR_PTR() otherwise.
 */
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
                fgf_t fgp_flags, gfp_t gfp)
{
        struct folio *folio;

repeat:
        /*
         * Look the index up.  A value entry (shadow/swap) is treated the
         * same as an empty slot from here on.
         */
        folio = filemap_get_entry(mapping, index);
        if (xa_is_value(folio))
                folio = NULL;
        if (!folio)
                goto no_page;

        if (fgp_flags & FGP_LOCK) {
                if (fgp_flags & FGP_NOWAIT) {
                        /* Non-blocking: give up with -EAGAIN if contended. */
                        if (!folio_trylock(folio)) {
                                folio_put(folio);
                                return ERR_PTR(-EAGAIN);
                        }
                } else {
                        folio_lock(folio);
                }

                /* Has the page been truncated? */
                if (unlikely(folio->mapping != mapping)) {
                        folio_unlock(folio);
                        folio_put(folio);
                        goto repeat;
                }
                VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
        }

        if (fgp_flags & FGP_ACCESSED)
                folio_mark_accessed(folio);
        else if (fgp_flags & FGP_WRITE) {
                /* Clear idle flag for buffer write */
                if (folio_test_idle(folio))
                        folio_clear_idle(folio);
        }

        if (fgp_flags & FGP_STABLE)
                folio_wait_stable(folio);
        /*
         * The found-folio path falls through this label too; the !folio
         * test below keeps it out of the creation branch.
         */
no_page:
        if (!folio && (fgp_flags & FGP_CREAT)) {
                unsigned order = FGF_GET_ORDER(fgp_flags);
                int err;

                /* Derive the allocation flags from the FGP_* request bits. */
                if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
                        gfp |= __GFP_WRITE;
                if (fgp_flags & FGP_NOFS)
                        gfp &= ~__GFP_FS;
                if (fgp_flags & FGP_NOWAIT) {
                        gfp &= ~GFP_KERNEL;
                        gfp |= GFP_NOWAIT | __GFP_NOWARN;
                }
                /*
                 * Creating a folio with neither FGP_LOCK nor FGP_FOR_MMAP
                 * is unexpected: warn once and behave as if FGP_LOCK had
                 * been passed.
                 */
                if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
                        fgp_flags |= FGP_LOCK;

                /* Clamp the requested order to what this mapping allows. */
                if (!mapping_large_folio_support(mapping))
                        order = 0;
                if (order > MAX_PAGECACHE_ORDER)
                        order = MAX_PAGECACHE_ORDER;
                /* If we're not aligned, allocate a smaller folio */
                if (index & ((1UL << order) - 1))
                        order = __ffs(index);

                /*
                 * Try the chosen order first and fall back to smaller
                 * orders.  Note that "continue" in a do-while jumps to the
                 * "order-- > 0" test, so a failed allocation moves on to the
                 * next smaller order with err still -ENOMEM.
                 */
                do {
                        gfp_t alloc_gfp = gfp;

                        err = -ENOMEM;
                        /* Large allocations are opportunistic: fail fast and quietly. */
                        if (order > 0)
                                alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
                        folio = filemap_alloc_folio(alloc_gfp, order);
                        if (!folio)
                                continue;

                        /* Init accessed so avoid atomic mark_page_accessed later */
                        if (fgp_flags & FGP_ACCESSED)
                                __folio_set_referenced(folio);

                        /* Insertion uses the caller-derived gfp, not alloc_gfp. */
                        err = filemap_add_folio(mapping, folio, index, gfp);
                        if (!err)
                                break;
                        folio_put(folio);
                        folio = NULL;
                } while (order-- > 0);

                /* -EEXIST: a folio appeared at this index; look it up again. */
                if (err == -EEXIST)
                        goto repeat;
                if (err)
                        return ERR_PTR(err);
                /*
                 * filemap_add_folio locks the page, and for mmap
                 * we expect an unlocked page.
                 */
                if (folio && (fgp_flags & FGP_FOR_MMAP))
                        folio_unlock(folio);
        }

        /* Nothing found and nothing created. */
        if (!folio)
                return ERR_PTR(-ENOENT);
        return folio;
}
EXPORT_SYMBOL(__filemap_get_folio);

/*
 * Advance @xas to the next entry at or below @max (restricted to entries
 * carrying @mark unless @mark is XA_PRESENT) and return it.  Folios are
 * returned with a reference held; NULL and value entries are returned
 * untouched.
 */
static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
                xa_mark_t mark)
{
        struct folio *folio;

        for (;;) {
                if (mark == XA_PRESENT)
                        folio = xas_find(xas, max);
                else
                        folio = xas_find_marked(xas, max, mark);

                if (xas_retry(xas, folio))
                        continue;

                /*
                 * End of range, or a value entry (shadow of a recently
                 * evicted page, shmem/tmpfs swap entry, or DAX entry):
                 * return it without attempting to raise a page count.
                 */
                if (!folio || xa_is_value(folio))
                        return folio;

                if (folio_try_get_rcu(folio)) {
                        /* Pinned; make sure the slot still points at it. */
                        if (likely(folio == xas_reload(xas)))
                                return folio;
                        folio_put(folio);
                }

                /* Lost a race with removal: restart the walk. */
                xas_reset(xas);
        }
}

/**
 * find_get_entries - gang pagecache lookup
 * @mapping:        The address_space to search
 * @start:        The starting page cache index
 * @end:        The final page index (inclusive).
 * @fbatch:        Where the resulting entries are placed.
 * @indices:        The cache indices corresponding to the entries in @fbatch
 *
 * find_get_entries() will search for and return a batch of entries in
 * the mapping.  The entries are placed in @fbatch.  find_get_entries()
 * takes a reference on any actual folios it returns.
 *
 * The entries have ascending indexes.  The indices may not be consecutive
 * due to not-present entries or large folios.
 *
 * Any shadow entries of evicted folios, or swap entries from
 * shmem/tmpfs, are included in the returned array.
 *
 * Return: The number of entries which were found.
 */
unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
        XA_STATE(xas, &mapping->i_pages, *start);
        struct folio *folio;
        unsigned int found;

        rcu_read_lock();
        for (;;) {
                folio = find_get_entry(&xas, end, XA_PRESENT);
                if (folio == NULL)
                        break;

                /* Record the index before the add bumps fbatch->nr. */
                indices[fbatch->nr] = xas.xa_index;
                if (!folio_batch_add(fbatch, folio))
                        break;
        }
        rcu_read_unlock();

        found = folio_batch_count(fbatch);
        if (found) {
                /*
                 * Move *start past the last entry returned: one index for
                 * a value entry, the folio's page count for a real folio.
                 */
                int last = found - 1;
                unsigned long span = 1;

                folio = fbatch->folios[last];
                if (!xa_is_value(folio))
                        span = folio_nr_pages(folio);
                *start = indices[last] + span;
        }

        return found;
}

/**
 * find_lock_entries - Find a batch of pagecache entries.
 * @mapping:        The address_space to search.
 * @start:        The starting page cache index.
 * @end:        The final page index (inclusive).
 * @fbatch:        Where the resulting entries are placed.
 * @indices:        The cache indices of the entries in @fbatch.
 *
 * find_lock_entries() will return a batch of entries from @mapping.
 * Swap, shadow and DAX entries are included.  Folios are returned
 * locked and with an incremented refcount.  Folios which are locked
 * by somebody else or under writeback are skipped.  Folios which are
 * partially outside the range are not returned.
 *
 * The entries have ascending indexes.  The indices may not be consecutive
 * due to not-present entries, large folios, folios which could not be
 * locked or folios under writeback.
 *
 * Return: The number of entries which were found.
 */
unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
        XA_STATE(xas, &mapping->i_pages, *start);
        struct folio *folio;
        unsigned int found;

        rcu_read_lock();
        while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
                if (!xa_is_value(folio)) {
                        /*
                         * Skip folios that stick out of [*start, end] on
                         * either side, and folios somebody else has locked.
                         * Only the reference needs dropping in these cases.
                         */
                        if (folio->index < *start ||
                            folio_next_index(folio) - 1 > end ||
                            !folio_trylock(folio)) {
                                folio_put(folio);
                                continue;
                        }

                        /*
                         * We hold the lock now.  Skip folios that left this
                         * mapping or are under writeback, releasing both
                         * the lock and the reference.
                         */
                        if (folio->mapping != mapping ||
                            folio_test_writeback(folio)) {
                                folio_unlock(folio);
                                folio_put(folio);
                                continue;
                        }

                        VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
                                        folio);
                }

                /* Record the index before the add bumps fbatch->nr. */
                indices[fbatch->nr] = xas.xa_index;
                if (!folio_batch_add(fbatch, folio))
                        break;
        }
        rcu_read_unlock();

        found = folio_batch_count(fbatch);
        if (found) {
                /*
                 * Move *start past the last entry returned: one index for
                 * a value entry, the folio's page count for a real folio.
                 */
                int last = found - 1;
                unsigned long span = 1;

                folio = fbatch->folios[last];
                if (!xa_is_value(folio))
                        span = folio_nr_pages(folio);
                *start = indices[last] + span;
        }

        return found;
}

/**
 * filemap_get_folios - Get a batch of folios
 * @mapping:        The address_space to search
 * @start:        The starting page index
 * @end:        The final page index (inclusive)
 * @fbatch:        The batch to fill.
 *
 * Search for and return a batch of folios in the mapping starting at
 * index @start and up to index @end (inclusive).  The folios are returned
 * in @fbatch with an elevated reference count.
 *
 * Return: The number of folios which were found.
 * We also update @start to index the next folio for the traversal.
 */
unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch)
{
        /*
         * Delegate to the tagged lookup with XA_PRESENT as the tag, so the
         * search is not restricted to a particular mark.  The update of
         * @start and the return value are those of filemap_get_folios_tag().
         */
        return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch);
}
EXPORT_SYMBOL(filemap_get_folios);

/**
 * filemap_get_folios_contig - Get a batch of contiguous folios
 * @mapping:        The address_space to search
 * @start:        The starting page index
 * @end:        The final page index (inclusive)
 * @fbatch:        The batch to fill
 *
 * filemap_get_folios_contig() works exactly like filemap_get_folios(),
 * except the returned folios are guaranteed to be contiguous. This may
 * not return all contiguous folios if the batch gets filled up.
 *
 * Return: The number of folios found.
 * Also update @start to be positioned for traversal of the next folio.
 */

unsigned filemap_get_folios_contig(struct address_space *mapping,
                pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)
{
        XA_STATE(xas, &mapping->i_pages, *start);
        unsigned long nr;
        struct folio *folio;

        rcu_read_lock();

        /*
         * Walk slot by slot from *start.  The walk ends at the first empty
         * slot (folio == NULL), which is what makes the batch contiguous.
         */
        for (folio = xas_load(&xas); folio && xas.xa_index <= end;
                        folio = xas_next(&xas)) {
                if (xas_retry(&xas, folio))
                        continue;
                /*
                 * If the entry has been swapped out, we can stop looking.
                 * No current caller is looking for DAX entries.
                 */
                if (xa_is_value(folio))
                        goto update_start;

                /*
                 * xas_next() can hand back a sibling entry when it lands in
                 * the middle of a large folio.  That is not a folio pointer
                 * and must never reach folio_try_get_rcu(); stop here, as
                 * filemap_get_read_batch() does.
                 */
                if (xa_is_sibling(folio))
                        goto update_start;

                if (!folio_try_get_rcu(folio))
                        goto retry;

                /* Make sure the slot still holds the folio we just pinned. */
                if (unlikely(folio != xas_reload(&xas)))
                        goto put_folio;

                if (!folio_batch_add(fbatch, folio)) {
                        /* Batch is full: resume after this folio next time. */
                        nr = folio_nr_pages(folio);
                        *start = folio->index + nr;
                        goto out;
                }
                /*
                 * Jump to the last index covered by this folio so that the
                 * next xas_next() lands on the slot after it, rather than on
                 * one of the folio's own remaining slots (which would add
                 * the same large folio again or expose a sibling entry).
                 */
                xas_advance(&xas, folio_next_index(folio) - 1);
                continue;
put_folio:
                folio_put(folio);

retry:
                /* Restart the walk; the current index is looked up again. */
                xas_reset(&xas);
        }

update_start:
        nr = folio_batch_count(fbatch);

        if (nr) {
                /* Resume right after the last folio placed in the batch. */
                folio = fbatch->folios[nr - 1];
                *start = folio_next_index(folio);
        }
out:
        rcu_read_unlock();
        return folio_batch_count(fbatch);
}
EXPORT_SYMBOL(filemap_get_folios_contig);

/**
 * filemap_get_folios_tag - Get a batch of folios matching @tag
 * @mapping:    The address_space to search
 * @start:      The starting page index
 * @end:        The final page index (inclusive)
 * @tag:        The tag index
 * @fbatch:     The batch to fill
 *
 * The first folio may start before @start; if it does, it will contain
 * @start.  The final folio may extend beyond @end; if it does, it will
 * contain @end.  The folios have ascending indices.  There may be gaps
 * between the folios if there are indices which have no folio in the
 * page cache.  If folios are added to or removed from the page cache
 * while this is running, they may or may not be found by this call.
 * Only returns folios that are tagged with @tag.
 *
 * Return: The number of folios found.
 * Also update @start to index the next folio for traversal.
 */
unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
                        pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)
{
        XA_STATE(xas, &mapping->i_pages, *start);
        struct folio *folio;

        rcu_read_lock();
        for (;;) {
                folio = find_get_entry(&xas, end, tag);
                if (folio == NULL)
                        break;

                /*
                 * Shadow entries should never be tagged, but this walk is
                 * lockless, so page reclaim may have evicted a page we saw
                 * tagged.  Skip such entries.
                 */
                if (xa_is_value(folio))
                        continue;

                if (!folio_batch_add(fbatch, folio)) {
                        /* Batch is full: resume right after this folio. */
                        *start = folio->index + folio_nr_pages(folio);
                        goto out;
                }
        }

        /*
         * The range was exhausted: there is no further page up to @end.
         * Avoid wrapping @start around, since that confuses some callers.
         * This ends the iteration early when a page exists at index -1,
         * but that case is already broken anyway.
         */
        *start = (end == (pgoff_t)-1) ? (pgoff_t)-1 : end + 1;
out:
        rcu_read_unlock();

        return folio_batch_count(fbatch);
}
EXPORT_SYMBOL(filemap_get_folios_tag);

/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:
 *
 *      ---R__________________________________________B__________
 *         ^ reading here                             ^ bad block(assume 4k)
 *
 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 * => failing the whole request => read(R) => read(R+1) =>
 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 *
 * It is going insane. Fix it by quickly scaling down the readahead size.
 */
static void shrink_readahead_size_eio(struct file_ra_state *ra)
{
        /* Cut the readahead window to a quarter of its current size. */
        ra->ra_pages = ra->ra_pages / 4;
}

/*
 * filemap_get_read_batch - Get a batch of folios for read
 *
 * Get a batch of folios which represent a contiguous range of bytes in
 * the file.  No exceptional entries will be returned.  If @index is in
 * the middle of a folio, the entire folio will be returned.  The last
 * folio in the batch may have the readahead flag set or the uptodate flag
 * clear so that the caller can take the appropriate action.
 */
static void filemap_get_read_batch(struct address_space *mapping,
                pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
{
        XA_STATE(xas, &mapping->i_pages, index);
        struct folio *folio;

        rcu_read_lock();
        /* The walk ends at the first empty slot (folio == NULL). */
        for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
                if (xas_retry(&xas, folio))
                        continue;
                /* Past the requested range, or a value entry: stop. */
                if (xas.xa_index > max || xa_is_value(folio))
                        break;
                /* A sibling entry is not a folio pointer: stop. */
                if (xa_is_sibling(folio))
                        break;
                /* Take a speculative reference; on failure redo this index. */
                if (!folio_try_get_rcu(folio))
                        goto retry;

                /* The slot must still hold the folio we just pinned. */
                if (unlikely(folio != xas_reload(&xas)))
                        goto put_folio;

                /* Stop once the batch is full. */
                if (!folio_batch_add(fbatch, folio))
                        break;
                /*
                 * A folio that is not uptodate, or that carries the
                 * readahead flag, is still added but ends the batch, so it
                 * is always the last one and the caller can act on it.
                 */
                if (!folio_test_uptodate(folio))
                        break;
                if (folio_test_readahead(folio))
                        break;
                /*
                 * Move to the last index covered by this folio so the next
                 * xas_next() lands on the slot following it.
                 */
                xas_advance(&xas, folio_next_index(folio) - 1);
                continue;
put_folio:
                folio_put(folio);
retry:
                xas_reset(&xas);
        }
        rcu_read_unlock();
}

/*
 * Run @filler on @folio and wait (killably) for the folio to be unlocked.
 *
 * Returns 0 if the folio ended up uptodate, the error from @filler or from
 * the killable wait, or -EIO if the read finished without making the folio
 * uptodate.  In the -EIO case the readahead window of @file (when @file is
 * non-NULL) is shrunk.
 */
static int filemap_read_folio(struct file *file, filler_t filler,
                struct folio *folio)
{
        const bool refault = folio_test_workingset(folio);
        unsigned long pflags;
        int err;

        /*
         * An earlier I/O error may have been transient (e.g. a multipath
         * failure).  PG_error is set again if this read fails too.
         */
        folio_clear_error(folio);

        /*
         * Issue the read; it unlocks the page when done.  Reads of
         * workingset folios are bracketed by psi_memstall_enter()/leave().
         */
        if (unlikely(refault))
                psi_memstall_enter(&pflags);
        err = filler(file, folio);
        if (unlikely(refault))
                psi_memstall_leave(&pflags);

        if (!err)
                err = folio_wait_locked_killable(folio);
        if (err)
                return err;

        if (!folio_test_uptodate(folio)) {
                if (file)
                        shrink_readahead_size_eio(&file->f_ra);
                return -EIO;
        }

        return 0;
}

/*
 * Report whether the bytes [@pos, @pos + @count) that fall within @folio can
 * be read without further I/O: either the whole folio is uptodate, or
 * (when @need_uptodate is false) the mapping's ->is_partially_uptodate()
 * says the relevant part is.
 */
static bool filemap_range_uptodate(struct address_space *mapping,
                loff_t pos, size_t count, struct folio *folio,
                bool need_uptodate)
{
        loff_t folio_start;

        if (folio_test_uptodate(folio))
                return true;

        /*
         * Partial uptodate-ness is only usable when the caller tolerates it
         * (pipes can't handle partially uptodate pages) and the mapping
         * provides the operation.
         */
        if (need_uptodate || !mapping->a_ops->is_partially_uptodate)
                return false;
        if (mapping->host->i_blkbits >= folio_shift(folio))
                return false;

        /* Convert the file range into a range relative to the folio. */
        folio_start = folio_pos(folio);
        if (folio_start > pos) {
                count -= folio_start - pos;
                pos = 0;
        } else {
                pos -= folio_start;
        }

        return mapping->a_ops->is_partially_uptodate(folio, pos, count);
}

/*
 * Make @folio usable for the read described by @iocb / @count, reading it
 * in if necessary and permitted by the iocb flags.
 *
 * Returns 0 when the requested range is uptodate or the read succeeded,
 * -EAGAIN when progress would require blocking or I/O that the iocb flags
 * forbid, AOP_TRUNCATED_PAGE when the caller should look the folio up
 * again, or another error from __folio_lock_async() / filemap_read_folio().
 *
 * On AOP_TRUNCATED_PAGE the folio reference has been given up on the
 * caller's behalf; on other returns the reference is left with the caller.
 */
static int filemap_update_page(struct kiocb *iocb,
                struct address_space *mapping, size_t count,
                struct folio *folio, bool need_uptodate)
{
        int error;

        /* Take the invalidate lock shared; never block for IOCB_NOWAIT. */
        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (!filemap_invalidate_trylock_shared(mapping))
                        return -EAGAIN;
        } else {
                filemap_invalidate_lock_shared(mapping);
        }

        if (!folio_trylock(folio)) {
                error = -EAGAIN;
                if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
                        goto unlock_mapping;
                if (!(iocb->ki_flags & IOCB_WAITQ)) {
                        filemap_invalidate_unlock_shared(mapping);
                        /*
                         * This is where we usually end up waiting for a
                         * previously submitted readahead to finish.
                         */
                        /*
                         * NOTE(review): no folio_put() on this path;
                         * presumably folio_put_wait_locked() drops the
                         * reference itself -- confirm in its definition.
                         */
                        folio_put_wait_locked(folio, TASK_KILLABLE);
                        return AOP_TRUNCATED_PAGE;
                }
                /* IOCB_WAITQ: queue on the folio instead of sleeping. */
                error = __folio_lock_async(folio, iocb->ki_waitq);
                if (error)
                        goto unlock_mapping;
        }

        /* The folio is locked from here on. */
        error = AOP_TRUNCATED_PAGE;
        if (!folio->mapping)
                goto unlock;

        error = 0;
        if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,
                                   need_uptodate))
                goto unlock;

        /* A read is needed; refuse if the caller cannot do or wait for I/O. */
        error = -EAGAIN;
        if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
                goto unlock;

        /*
         * filemap_read_folio() hands the locked folio to ->read_folio, and
         * the read unlocks it, so skip the folio_unlock() below.
         */
        error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
                        folio);
        goto unlock_mapping;
unlock:
        folio_unlock(folio);
unlock_mapping:
        filemap_invalidate_unlock_shared(mapping);
        if (error == AOP_TRUNCATED_PAGE)
                folio_put(folio);
        return error;
}

/*
 * Allocate an order-0 folio, insert it into @mapping at @index, read it in
 * and, on success, append it to @fbatch.
 *
 * Returns 0 on success, -ENOMEM if allocation fails, AOP_TRUNCATED_PAGE if
 * a folio already exists at @index (-EEXIST from filemap_add_folio()), or
 * the error from filemap_add_folio() / filemap_read_folio().  On any
 * failure after allocation the new folio's reference is dropped.
 */
static int filemap_create_folio(struct file *file,
                struct address_space *mapping, pgoff_t index,
                struct folio_batch *fbatch)
{
        struct folio *new_folio;
        int err;

        new_folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);
        if (!new_folio)
                return -ENOMEM;

        /*
         * Protect against truncate / hole punch.  Holding invalidate_lock
         * guarantees that no new pagecache folio is instantiated and made
         * uptodate between truncate evicting the page cache and the blocks
         * actually being freed.  The lock could in principle be released
         * once the folio is inserted, since the locked folio alone would
         * synchronize with hole punching.  However other paths, such as
         * filemap_update_page() filling in partially uptodate pages, or
         * ->readahead(), must hold invalidate_lock while mapping blocks for
         * IO, so it is held here as well to keep the locking rules simple.
         */
        filemap_invalidate_lock_shared(mapping);

        err = filemap_add_folio(mapping, new_folio, index,
                        mapping_gfp_constraint(mapping, GFP_KERNEL));
        if (err == -EEXIST)
                err = AOP_TRUNCATED_PAGE;
        if (!err)
                err = filemap_read_folio(file, mapping->a_ops->read_folio,
                                new_folio);

        filemap_invalidate_unlock_shared(mapping);

        if (err) {
                folio_put(new_folio);
                return err;
        }

        folio_batch_add(fbatch, new_folio);
        return 0;
}

/*
 * Start asynchronous readahead from @folio, covering the pages from
 * @folio->index up to @last_index.
 *
 * Returns 0 after issuing the readahead, or -EAGAIN without doing anything
 * when the iocb carries IOCB_NOIO.
 */
static int filemap_readahead(struct kiocb *iocb, struct file *file,
                struct address_space *mapping, struct folio *folio,
                pgoff_t last_index)
{
        DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);

        if (!(iocb->ki_flags & IOCB_NOIO)) {
                page_cache_async_ra(&ractl, folio,
                                    last_index - folio->index);
                return 0;
        }

        return -EAGAIN;
}

/*
 * Fill @fbatch with folios for a read of @count bytes at @iocb->ki_pos,
 * starting readahead or creating and reading a folio when the page cache
 * has nothing at the starting index.
 *
 * Returns 0 with at least one folio in @fbatch, or a negative error with
 * the batch empty.  When only the last folio of a multi-folio batch cannot
 * be used, it is removed from the batch and 0 is returned for the rest.
 */
static int filemap_get_pages(struct kiocb *iocb, size_t count,
                struct folio_batch *fbatch, bool need_uptodate)
{
        struct file *filp = iocb->ki_filp;
        struct address_space *mapping = filp->f_mapping;
        struct file_ra_state *ra = &filp->f_ra;
        pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
        pgoff_t last_index;
        struct folio *folio;
        int err = 0;

        /* "last_index" is the index of the page beyond the end of the read */
        last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
retry:
        if (fatal_signal_pending(current))
                return -EINTR;

        /* First attempt: take whatever is already cached. */
        filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
        if (!folio_batch_count(fbatch)) {
                /* Nothing cached: kick off synchronous readahead and retry. */
                if (iocb->ki_flags & IOCB_NOIO)
                        return -EAGAIN;
                page_cache_sync_readahead(mapping, ra, filp, index,
                                last_index - index);
                filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
        }
        if (!folio_batch_count(fbatch)) {
                /* Still nothing: create and read a single folio ourselves. */
                if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
                        return -EAGAIN;
                err = filemap_create_folio(filp, mapping,
                                iocb->ki_pos >> PAGE_SHIFT, fbatch);
                if (err == AOP_TRUNCATED_PAGE)
                        goto retry;
                return err;
        }

        /*
         * Only the last folio of the batch can have the readahead flag set
         * or be not uptodate (see filemap_get_read_batch()).
         */
        folio = fbatch->folios[folio_batch_count(fbatch) - 1];
        if (folio_test_readahead(folio)) {
                err = filemap_readahead(iocb, filp, mapping, folio, last_index);
                if (err)
                        goto err;
        }
        if (!folio_test_uptodate(folio)) {
                /*
                 * With IOCB_WAITQ and earlier folios already in the batch,
                 * switch the iocb to IOCB_NOWAIT before updating this one.
                 */
                if ((iocb->ki_flags & IOCB_WAITQ) &&
                    folio_batch_count(fbatch) > 1)
                        iocb->ki_flags |= IOCB_NOWAIT;
                err = filemap_update_page(iocb, mapping, count, folio,
                                          need_uptodate);
                if (err)
                        goto err;
        }

        return 0;
err:
        /*
         * For AOP_TRUNCATED_PAGE (a positive value) filemap_update_page()
         * has already given up the folio reference, so only negative errors
         * need a folio_put() here.
         */
        if (err < 0)
                folio_put(folio);
        /* Drop the last folio from the batch; succeed if others remain. */
        if (likely(--fbatch->nr))
                return 0;
        if (err == AOP_TRUNCATED_PAGE)
                goto retry;
        return err;
}

/*
 * Tell whether two byte positions fall into the same naturally aligned
 * window whose size is that of @folio.  Both positions are reduced to a
 * window number by shifting them right by folio_shift(@folio).
 */
static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
{
        unsigned int window_shift = folio_shift(folio);
        loff_t first_window = pos1 >> window_shift;
        loff_t second_window = pos2 >> window_shift;

        return first_window == second_window;
}

/**
 * filemap_read - Read data from the page cache.
 * @iocb: The iocb to read.
 * @iter: Destination for the data.
 * @already_read: Number of bytes already read by the caller.
 *
 * Copies data from the page cache.  If the data is not currently present,
 * uses the readahead and read_folio address_space operations to fetch it.
 *
 * The request is clamped so that it never extends past the superblock's
 * s_maxbytes limit; a read that starts at or beyond that limit, or that
 * has an empty @iter, returns 0 immediately.
 *
 * On return @iocb->ki_pos has been advanced by the number of bytes copied
 * and the file's readahead state records the final position.
 *
 * Return: Total number of bytes copied, including those already read by
 * the caller.  If an error happens before any bytes are copied, returns
 * a negative error number.
 */
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
                ssize_t already_read)
{
        struct file *filp = iocb->ki_filp;
        struct file_ra_state *ra = &filp->f_ra;
        struct address_space *mapping = filp->f_mapping;
        struct inode *inode = mapping->host;
        struct folio_batch fbatch;
        int i, error = 0;
        bool writably_mapped;
        loff_t isize, end_offset;
        loff_t last_pos = ra->prev_pos;

        if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
                return 0;
        if (unlikely(!iov_iter_count(iter)))
                return 0;

        /*
         * Clamp the request so that ki_pos + count cannot exceed
         * s_maxbytes.  Limiting only the length to s_maxbytes is not
         * enough: a read starting close to the limit could still run past
         * it and overflow the position arithmetic below.  ki_pos is known
         * to be below s_maxbytes here, so the difference is positive.
         */
        iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
        folio_batch_init(&fbatch);

        do {
                cond_resched();

                /*
                 * If we've already successfully copied some data, then we
                 * can no longer safely return -EIOCBQUEUED. Hence mark
                 * an async read NOWAIT at that point.
                 */
                if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
                        iocb->ki_flags |= IOCB_NOWAIT;

                if (unlikely(iocb->ki_pos >= i_size_read(inode)))
                        break;

                error = filemap_get_pages(iocb, iter->count, &fbatch, false);
                if (error < 0)
                        break;

                /*
                 * i_size must be checked after we know the pages are Uptodate.
                 *
                 * Sampling i_size at this point lets us compute the correct
                 * "end_offset", which means the zero-filled part of the
                 * last page is not copied back to userspace (unless another
                 * truncate extends the file - this is desired though).
                 */
                isize = i_size_read(inode);
                if (unlikely(iocb->ki_pos >= isize))
                        goto put_folios;
                end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

                /*
                 * Once we start copying data, we don't want to be touching any
                 * cachelines that might be contended:
                 */
                writably_mapped = mapping_writably_mapped(mapping);

                /*
                 * When a read accesses the same folio several times, only
                 * mark it as accessed the first time.
                 */
                if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
                                    fbatch.folios[0]))
                        folio_mark_accessed(fbatch.folios[0]);

                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];
                        size_t fsize = folio_size(folio);
                        size_t offset = iocb->ki_pos & (fsize - 1);
                        size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
                                             fsize - offset);
                        size_t copied;

                        /* Folios past the end of the request are not copied. */
                        if (end_offset < folio_pos(folio))
                                break;
                        if (i > 0)
                                folio_mark_accessed(folio);
                        /*
                         * If users can be writing to this folio using arbitrary
                         * virtual addresses, take care of potential aliasing
                         * before reading the folio on the kernel side.
                         */
                        if (writably_mapped)
                                flush_dcache_folio(folio);

                        copied = copy_folio_to_iter(folio, offset, bytes, iter);

                        already_read += copied;
                        iocb->ki_pos += copied;
                        last_pos = iocb->ki_pos;

                        /* A short copy means the destination faulted. */
                        if (copied < bytes) {
                                error = -EFAULT;
                                break;
                        }
                }
put_folios:
                /* Drop the references the batch holds and empty it. */
                for (i = 0; i < folio_batch_count(&fbatch); i++)
                        folio_put(fbatch.folios[i]);
                folio_batch_init(&fbatch);
        } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);

        file_accessed(filp);
        ra->prev_pos = last_pos;
        return already_read ? already_read : error;
}
EXPORT_SYMBOL_GPL(filemap_read);

/*
 * Deal with pending writeback over the @count bytes starting at
 * @iocb->ki_pos.  A blocking request goes through
 * filemap_write_and_wait_range() and returns its result.  An IOCB_NOWAIT
 * request never waits: it returns -EAGAIN if
 * filemap_range_needs_writeback() reports work in the range, else 0.
 */
int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        loff_t first = iocb->ki_pos;
        loff_t last = first + count - 1;

        if (!(iocb->ki_flags & IOCB_NOWAIT))
                return filemap_write_and_wait_range(mapping, first, last);

        if (filemap_range_needs_writeback(mapping, first, last))
                return -EAGAIN;

        return 0;
}
EXPORT_SYMBOL_GPL(kiocb_write_and_wait);

/*
 * Prepare the page cache for a write covering @count bytes at
 * @iocb->ki_pos by invalidating the cached pages in that range.
 *
 * For IOCB_NOWAIT the call fails with -EAGAIN as soon as any page is
 * present in the range; otherwise the range is first written back and
 * waited upon, and a failure of that step is returned to the caller.
 * On the remaining paths the result of invalidate_inode_pages2_range()
 * is returned.
 */
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        loff_t first = iocb->ki_pos;
        loff_t last = first + count - 1;

        if (!(iocb->ki_flags & IOCB_NOWAIT)) {
                int err = filemap_write_and_wait_range(mapping, first, last);

                if (err)
                        return err;
        } else if (filemap_range_has_page(mapping, first, last)) {
                /* Any page in the range means we might have to block. */
                return -EAGAIN;
        }

        /*
         * Buffered reads issued after the write must fetch the new data
         * from disk, so clean cached pages over the target region are
         * invalidated.  Doing it ahead of the write lets the caller hand
         * back -EIOCBQUEUED from ->direct_IO() without it being clobbered.
         */
        return invalidate_inode_pages2_range(mapping, first >> PAGE_SHIFT,
                                             last >> PAGE_SHIFT);
}
EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);

/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb:        kernel I/O control block
 * @iter:        destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
 *
 * For IOCB_DIRECT requests the range is first handed to
 * kiocb_write_and_wait(), then read through the mapping's ->direct_IO()
 * operation; whatever is left over after a short direct read (and is
 * still inside i_size, on a non-DAX inode) is read through the page cache
 * via filemap_read().  All other requests go straight to filemap_read().
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
 * be returned when no data can be read without waiting for I/O requests
 * to complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
 * requests shall be made for the read or for readahead.  When no data
 * can be read, -EAGAIN shall be returned.  When readahead would be
 * triggered, a partial, possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        size_t count = iov_iter_count(iter);
        ssize_t retval = 0;

        if (!count)
                return 0; /* skip atime */

        if (iocb->ki_flags & IOCB_DIRECT) {
                struct file *file = iocb->ki_filp;
                struct address_space *mapping = file->f_mapping;
                struct inode *inode = mapping->host;

                /*
                 * Handle pending writeback over the range before reading
                 * around the page cache; a negative result (including
                 * -EAGAIN for IOCB_NOWAIT) ends the read here.
                 */
                retval = kiocb_write_and_wait(iocb, count);
                if (retval < 0)
                        return retval;
                file_accessed(file);

                retval = mapping->a_ops->direct_IO(iocb, iter);
                /*
                 * On success account for the bytes transferred: advance
                 * the file position and reduce what is still outstanding.
                 */
                if (retval >= 0) {
                        iocb->ki_pos += retval;
                        count -= retval;
                }
                /*
                 * Unless the request was queued, revert the iterator by
                 * the difference between the bytes still outstanding and
                 * what the iterator currently reports as remaining.
                 */
                if (retval != -EIOCBQUEUED)
                        iov_iter_revert(iter, count - iov_iter_count(iter));

                /*
                 * Btrfs can have a short DIO read if we encounter
                 * compressed extents, so if there was an error, or if
                 * we've already read everything we wanted to, or if
                 * there was a short read because we hit EOF, go ahead
                 * and return.  Otherwise fallthrough to buffered io for
                 * the rest of the read.  Buffered reads will not work for
                 * DAX files, so don't bother trying.
                 */
                if (retval < 0 || !count || IS_DAX(inode))
                        return retval;
                if (iocb->ki_pos >= i_size_read(inode))
                        return retval;
        }

        /*
         * retval is 0 on the purely buffered path, or the number of bytes
         * the direct read already delivered; filemap_read() includes it
         * in the total it returns.
         */
        return filemap_read(iocb, iter, retval);
}
EXPORT_SYMBOL(generic_file_read_iter);

/*
 * Attach the pages of @folio that cover the data at file position @fpos to
 * @pipe, one pipe buffer per page.
 *
 * At most @size bytes are attached, and never more than what remains of
 * the folio after @fpos.  The loop stops early once the pipe is full.
 * Each buffer that is filled in takes its own reference on the folio.
 *
 * Returns the number of bytes that were attached to the pipe.
 */
size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
                              struct folio *folio, loff_t fpos, size_t size)
{
        size_t in_folio = offset_in_folio(folio, fpos);
        struct page *cur = folio_page(folio, in_folio / PAGE_SIZE);
        size_t in_page = in_folio % PAGE_SIZE;
        size_t done = 0;

        size = min(size, folio_size(folio) - in_folio);

        while (done < size) {
                struct pipe_buffer *slot;
                size_t chunk;

                if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                        break;

                slot = pipe_head_buf(pipe);
                chunk = min_t(size_t, PAGE_SIZE - in_page, size - done);

                *slot = (struct pipe_buffer) {
                        .ops    = &page_cache_pipe_buf_ops,
                        .page   = cur,
                        .offset = in_page,
                        .len    = chunk,
                };
                folio_get(folio);
                pipe->head++;

                /* Only the first page can start at a non-zero offset. */
                cur++;
                in_page = 0;
                done += chunk;
        }

        return done;
}

/**
 * filemap_splice_read -  Splice data from a file's pagecache into a pipe
 * @in: The file to read from
 * @ppos: Pointer to the file position to read from
 * @pipe: The pipe to splice into
 * @len: The amount to splice
 * @flags: The SPLICE_F_* flags
 *
 * This function gets folios from a file's pagecache and splices them into the
 * pipe.  Readahead will be called as necessary to fill more folios.  This may
 * be used for blockdevs also.
 *
 * Return: On success, the number of bytes read will be returned and *@ppos
 * will be updated if appropriate; 0 will be returned if there is no more data
 * to be read; -EAGAIN will be returned if the pipe had no space, and some
 * other negative error code will be returned on error.  A short read may occur
 * if the pipe has insufficient space, we reach the end of the data or we hit a
 * hole.
 */
ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
                            struct pipe_inode_info *pipe,
                            size_t len, unsigned int flags)
{
        struct folio_batch fbatch;
        struct kiocb iocb;
        size_t total_spliced = 0, used, npages;
        loff_t isize, end_offset;
        bool writably_mapped;
        int i, error = 0;

        /* Nothing can be read at or beyond the filesystem's size limit. */
        if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))
                return 0;

        init_sync_kiocb(&iocb, in);
        iocb.ki_pos = *ppos;

        /* Work out how much data we can actually add into the pipe */
        used = pipe_occupancy(pipe->head, pipe->tail);
        npages = max_t(ssize_t, pipe->max_usage - used, 0);
        len = min_t(size_t, len, npages * PAGE_SIZE);

        folio_batch_init(&fbatch);

        do {
                cond_resched();

                if (*ppos >= i_size_read(in->f_mapping->host))
                        break;

                /* Fetch the next batch of folios starting at *ppos. */
                iocb.ki_pos = *ppos;
                error = filemap_get_pages(&iocb, len, &fbatch, true);
                if (error < 0)
                        break;

                /*
                 * i_size must be checked after we know the pages are Uptodate.
                 *
                 * Sampling i_size at this point lets us compute the correct
                 * "end_offset", which means the zero-filled part of the
                 * last page is not spliced into the pipe (unless another
                 * truncate extends the file - this is desired though).
                 */
                isize = i_size_read(in->f_mapping->host);
                if (unlikely(*ppos >= isize))
                        break;
                end_offset = min_t(loff_t, isize, *ppos + len);

                /*
                 * Once we start copying data, we don't want to be touching any
                 * cachelines that might be contended:
                 */
                writably_mapped = mapping_writably_mapped(in->f_mapping);

                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];
                        size_t n;

                        /* This folio starts at or past the end of the request. */
                        if (folio_pos(folio) >= end_offset)
                                goto out;
                        folio_mark_accessed(folio);

                        /*
                         * If users can be writing to this folio using arbitrary
                         * virtual addresses, take care of potential aliasing
                         * before reading the folio on the kernel side.
                         */
                        if (writably_mapped)
                                flush_dcache_folio(folio);

                        /* Never splice beyond the sampled i_size. */
                        n = min_t(loff_t, len, isize - *ppos);
                        n = splice_folio_into_pipe(pipe, folio, *ppos, n);
                        if (!n)
                                goto out;
                        len -= n;
                        total_spliced += n;
                        *ppos += n;
                        in->f_ra.prev_pos = *ppos;
                        if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                                goto out;
                }

                folio_batch_release(&fbatch);
        } while (len);

out:
        /* Release whatever the batch still holds on every exit path. */
        folio_batch_release(&fbatch);
        file_accessed(in);

        return total_spliced ? total_spliced : error;
}
EXPORT_SYMBOL(filemap_splice_read);

/*
 * Refine a SEEK_DATA / SEEK_HOLE search within one page cache entry.
 *
 * @xas:       iteration state of the caller's walk; paused here before the
 *             RCU read lock is dropped.
 * @mapping:   address space the entry is expected to belong to.
 * @folio:     the entry found (a folio or an xarray value entry).
 * @start:     position the search has reached.
 * @end:       position just past this entry.
 * @seek_data: true when looking for data, false when looking for a hole.
 *
 * A value entry or an uptodate folio is treated as all data: @start is
 * returned when seeking data, @end when seeking a hole.  A folio that is
 * not uptodate, in a mapping without ->is_partially_uptodate, is treated
 * the opposite way.  Otherwise the folio is locked and inspected one
 * filesystem block (i_blocksize()) at a time.
 *
 * Called with the RCU read lock held; the lock is dropped and re-taken
 * around the folio lock on the block-by-block path.
 *
 * Returns the position at which the search should stop or continue.
 */
static inline loff_t folio_seek_hole_data(struct xa_state *xas,
                struct address_space *mapping, struct folio *folio,
                loff_t start, loff_t end, bool seek_data)
{
        const struct address_space_operations *ops = mapping->a_ops;
        size_t offset, bsz = i_blocksize(mapping->host);

        if (xa_is_value(folio) || folio_test_uptodate(folio))
                return seek_data ? start : end;
        if (!ops->is_partially_uptodate)
                return seek_data ? end : start;

        /* The walk is paused before leaving the RCU read-side section. */
        xas_pause(xas);
        rcu_read_unlock();
        folio_lock(folio);
        /* The folio no longer belongs to this mapping: report @start as is. */
        if (unlikely(folio->mapping != mapping))
                goto unlock;

        /* Begin at the block containing @start, rounded down to a block. */
        offset = offset_in_folio(folio, start) & ~(bsz - 1);

        do {
                /* Stop at the first block whose state matches the request. */
                if (ops->is_partially_uptodate(folio, offset, bsz) ==
                                                        seek_data)
                        break;
                /* Advance @start to the beginning of the next block. */
                start = (start + bsz) & ~(bsz - 1);
                offset += bsz;
        } while (offset < folio_size(folio));
unlock:
        folio_unlock(folio);
        rcu_read_lock();
        return start;
}

/*
 * Number of bytes spanned by the entry the seek walk is looking at.  A
 * real folio reports folio_size(); an xarray value entry is sized from
 * the order of the slot at the walk's current index.
 */
static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
{
        if (!xa_is_value(folio))
                return folio_size(folio);

        return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
}

/**
 * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
 * @mapping: Address space to search.
 * @start: First byte to consider.
 * @end: Limit of search (exclusive).
 * @whence: Either SEEK_HOLE or SEEK_DATA.
 *
 * If the page cache knows which blocks contain holes and which blocks
 * contain data, your filesystem can use this function to implement
 * SEEK_HOLE and SEEK_DATA.  This is useful for filesystems which are
 * entirely memory-based such as tmpfs, and filesystems which support
 * unwritten extents.
 *
 * Return: The requested offset on success, or -ENXIO if @whence specifies
 * SEEK_DATA and there is no data after @start.  There is an implicit hole
 * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
 * and @end contain data.  -ENXIO is also returned when @end <= @start.
 */
loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
                loff_t end, int whence)
{
        XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
        /* Index of the last page that lies inside the search range. */
        pgoff_t max = (end - 1) >> PAGE_SHIFT;
        bool seek_data = (whence == SEEK_DATA);
        struct folio *folio;

        if (end <= start)
                return -ENXIO;

        rcu_read_lock();
        while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {
                loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
                size_t seek_size;

                /*
                 * Nothing is cached between @start and this entry.  When
                 * seeking a hole that gap is the answer; when seeking data
                 * skip ahead to where this entry begins.
                 */
                if (start < pos) {
                        if (!seek_data)
                                goto unlock;
                        start = pos;
                }

                /* Move pos up to the next seek_size boundary after it. */
                seek_size = seek_folio_size(&xas, folio);
                pos = round_up((u64)pos + 1, seek_size);
                start = folio_seek_hole_data(&xas, mapping, folio, start, pos,
                                seek_data);
                /* The search ended inside this entry. */
                if (start < pos)
                        goto unlock;
                if (start >= end)
                        break;
                /* Re-aim the walk just past a multi-page entry. */
                if (seek_size > PAGE_SIZE)
                        xas_set(&xas, pos >> PAGE_SHIFT);
                if (!xa_is_value(folio))
                        folio_put(folio);
        }
        /* No data was found by the walk; for SEEK_HOLE @start stands. */
        if (seek_data)
                start = -ENXIO;
unlock:
        rcu_read_unlock();
        /* Leaving via "goto unlock" or "break" skips the in-loop put. */
        if (folio && !xa_is_value(folio))
                folio_put(folio);
        /* Never report a position beyond the caller's limit. */
        if (start > end)
                return end;
        return start;
}

#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS  (100)
/*
 * lock_folio_maybe_drop_mmap - lock a folio for a fault, possibly
 * dropping the mmap_lock while waiting
 * @vmf - the vm_fault for this fault.
 * @folio - the folio to lock.
 * @fpin - the pointer to the file we may pin (or is already pinned).
 *
 * Like lock_folio_or_retry, this may drop the mmap_lock.  Unlike it, a
 * return value of 1 means the folio is locked and 0 means it could not
 * be locked.  When the mmap_lock had to be dropped, *fpin points to the
 * pinned file, which the caller must fput() later.
 */
static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
                                     struct file **fpin)
{
        /* Fast path: the lock was free. */
        if (folio_trylock(folio))
                return 1;

        /*
         * NOTE! Bailing out here makes the caller return VM_FAULT_RETRY
         * with the fault lock still held, which is exactly what
         * FAULT_FLAG_RETRY_NOWAIT asks for.
         */
        if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
                return 0;

        *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);

        if (!(vmf->flags & FAULT_FLAG_KILLABLE)) {
                __folio_lock(folio);
                return 1;
        }

        if (!__folio_lock_killable(folio))
                return 1;

        /*
         * The killable wait was interrupted.  Without a pinned file we
         * did not have the flags needed to drop the fault lock earlier,
         * yet fault handlers only look for fatal signals when
         * VM_FAULT_RETRY is returned, so the fault lock has to be
         * released here before reporting failure.
         */
        if (!*fpin)
                release_fault_lock(vmf);
        return 0;
}

/*
 * Synchronous readahead happens when we don't even find a page in the page
 * cache at all.  We don't want to perform IO under the mmap sem, so if we have
 * to drop the mmap sem we return the file that was pinned in order for us to do
 * that.  If we didn't pin a file then we return NULL.  The file that is
 * returned needs to be fput()'ed when we're done with it.
 *
 * Depending on the VMA flags this issues a PMD-sized readahead
 * (VM_HUGEPAGE), nothing at all (VM_RAND_READ or readahead disabled), a
 * sequential readahead (VM_SEQ_READ), or a read-around window centred on
 * the faulting offset.  Read-around is skipped once the per-file miss
 * counter exceeds MMAP_LOTSAMISS.
 */
static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
{
        struct file *file = vmf->vma->vm_file;
        struct file_ra_state *ra = &file->f_ra;
        struct address_space *mapping = file->f_mapping;
        DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
        struct file *fpin = NULL;
        unsigned long vm_flags = vmf->vma->vm_flags;
        unsigned int mmap_miss;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /* Use the readahead code, even if readahead is disabled */
        if (vm_flags & VM_HUGEPAGE) {
                fpin = maybe_unlock_mmap_for_io(vmf, fpin);
                /* Align the starting index down to a PMD-sized boundary. */
                ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
                ra->size = HPAGE_PMD_NR;
                /*
                 * Fetch two PMD folios, so we get the chance to actually
                 * readahead, unless we've been told not to.
                 */
                if (!(vm_flags & VM_RAND_READ))
                        ra->size *= 2;
                ra->async_size = HPAGE_PMD_NR;
                page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
                return fpin;
        }
#endif

        /* If we don't want any read-ahead, don't bother */
        if (vm_flags & VM_RAND_READ)
                return fpin;
        if (!ra->ra_pages)
                return fpin;

        if (vm_flags & VM_SEQ_READ) {
                fpin = maybe_unlock_mmap_for_io(vmf, fpin);
                page_cache_sync_ra(&ractl, ra->ra_pages);
                return fpin;
        }

        /*
         * Avoid banging the cache line if not needed: the miss counter is
         * only written while it is below MMAP_LOTSAMISS * 10.
         */
        mmap_miss = READ_ONCE(ra->mmap_miss);
        if (mmap_miss < MMAP_LOTSAMISS * 10)
                WRITE_ONCE(ra->mmap_miss, ++mmap_miss);

        /*
         * Do we miss much more than hit in this file? If so,
         * stop bothering with read-ahead. It will only hurt.
         */
        if (mmap_miss > MMAP_LOTSAMISS)
                return fpin;

        /*
         * mmap read-around: a window of ra_pages pages starting half a
         * window before the faulting offset (clamped at 0), with the last
         * quarter marked for asynchronous readahead.
         */
        fpin = maybe_unlock_mmap_for_io(vmf, fpin);
        ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
        ra->size = ra->ra_pages;
        ra->async_size = ra->ra_pages / 4;
        ractl._index = ra->start;
        page_cache_ra_order(&ractl, ra, 0);
        return fpin;
}

/*
 * Asynchronous readahead: the faulting folio was found in the page cache.
 * If it carries the readahead marker, try to extend the readahead window.
 * Finding the folio also counts as a hit, so the per-file miss counter is
 * decremented when it is non-zero.
 *
 * Returns the file that was pinned if the mmap_lock had to be dropped in
 * order to do IO, or NULL otherwise.
 */
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
                                            struct folio *folio)
{
        struct file *file = vmf->vma->vm_file;
        struct file_ra_state *ra = &file->f_ra;
        DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
        struct file *pinned;
        unsigned int misses;

        /* Readahead is unwanted or disabled: leave everything untouched. */
        if ((vmf->vma->vm_flags & VM_RAND_READ) || ra->ra_pages == 0)
                return NULL;

        misses = READ_ONCE(ra->mmap_miss);
        if (misses != 0)
                WRITE_ONCE(ra->mmap_miss, misses - 1);

        if (!folio_test_readahead(folio))
                return NULL;

        pinned = maybe_unlock_mmap_for_io(vmf, NULL);
        page_cache_async_ra(&ractl, folio, ra->ra_pages);
        return pinned;
}

/*
 * Before treating a page cache miss as a major fault, double-check that
 * the faulting PTE is really still empty.
 *
 * Returns 0 when the fault should proceed, or VM_FAULT_NOPAGE when the
 * PTE is populated (or the page table could not be mapped), in which case
 * there is nothing left to do.
 */
static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
{
        vm_fault_t result = VM_FAULT_NOPAGE;
        pte_t *pte;

        /*
         * A pagecache folio may have been COW'ed, leaving an mlocked anon
         * folio mapped, while the original (not mlocked) pagecache folio
         * has since been evicted.  A read+clear/modify/write PTE update,
         * as performed by do_numa_page()/change_pte_range(), clears the
         * PTE for a moment under the PT lock, so a lockless observer can
         * see it as "none".
         *
         * Without a recheck under the PT lock that would turn into an
         * unexpected major fault inside an mlock'ed region.  The recheck
         * is limited to this scenario so other faults are not slowed
         * down, and a lockless look is taken first so the PT lock is
         * acquired as rarely as possible.
         */
        if (!(vmf->vma->vm_flags & VM_LOCKED) ||
            !(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
                return 0;

        pte = pte_offset_map(vmf->pmd, vmf->address);
        if (unlikely(!pte))
                return VM_FAULT_NOPAGE;

        if (likely(pte_none(ptep_get_lockless(pte)))) {
                /* Looks empty without the lock; confirm while holding it. */
                spin_lock(vmf->ptl);
                if (likely(pte_none(ptep_get(pte))))
                        result = 0;
                spin_unlock(vmf->ptl);
        }

        pte_unmap(pte);
        return result;
}

/**
 * filemap_fault - read in file data for page fault handling
 * @vmf:        struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * vma->vm_mm->mmap_lock must be held on entry.
 *
 * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
 * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().
 *
 * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
 * has not been released.
 *
 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
 *
 * On success the folio is returned locked: vmf->page is set and
 * VM_FAULT_LOCKED is included in the return value.
 *
 * Return: bitwise-OR of %VM_FAULT_ codes.
 */
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
        int error;
        struct file *file = vmf->vma->vm_file;
        struct file *fpin = NULL;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        pgoff_t max_idx, index = vmf->pgoff;
        struct folio *folio;
        vm_fault_t ret = 0;
        /* Tracks whether we currently hold the shared invalidate lock. */
        bool mapping_locked = false;

        /* Faulting at or beyond the last page of the file is a SIGBUS. */
        max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
        if (unlikely(index >= max_idx))
                return VM_FAULT_SIGBUS;

        /*
         * Do we have something in the page cache already?
         */
        folio = filemap_get_folio(mapping, index);
        if (likely(!IS_ERR(folio))) {
                /*
                 * We found the page, so try async readahead before waiting for
                 * the lock.
                 */
                if (!(vmf->flags & FAULT_FLAG_TRIED))
                        fpin = do_async_mmap_readahead(vmf, folio);
                if (unlikely(!folio_test_uptodate(folio))) {
                        filemap_invalidate_lock_shared(mapping);
                        mapping_locked = true;
                }
        } else {
                /* Nothing cached: make sure the PTE is really still empty. */
                ret = filemap_fault_recheck_pte_none(vmf);
                if (unlikely(ret))
                        return ret;

                /* No page in the page cache at all */
                count_vm_event(PGMAJFAULT);
                count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
                ret = VM_FAULT_MAJOR;
                fpin = do_sync_mmap_readahead(vmf);
retry_find:
                /*
                 * See comment in filemap_create_folio() why we need
                 * invalidate_lock
                 */
                if (!mapping_locked) {
                        filemap_invalidate_lock_shared(mapping);
                        mapping_locked = true;
                }
                folio = __filemap_get_folio(mapping, index,
                                          FGP_CREAT|FGP_FOR_MMAP,
                                          vmf->gfp_mask);
                if (IS_ERR(folio)) {
                        /* If the mmap_lock was dropped we must retry instead. */
                        if (fpin)
                                goto out_retry;
                        filemap_invalidate_unlock_shared(mapping);
                        return VM_FAULT_OOM;
                }
        }

        if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))
                goto out_retry;

        /* Did it get truncated? */
        if (unlikely(folio->mapping != mapping)) {
                folio_unlock(folio);
                folio_put(folio);
                goto retry_find;
        }
        VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);

        /*
         * We have a locked folio in the page cache, now we need to check
         * that it's up-to-date. If not, it is going to be due to an error,
         * or because readahead was otherwise unable to retrieve it.
         */
        if (unlikely(!folio_test_uptodate(folio))) {
                /*
                 * If the invalidate lock is not held, the folio was in cache
                 * and uptodate and now it is not. Strange but possible since we
                 * didn't hold the page lock all the time. Let's drop
                 * everything, get the invalidate lock and try again.
                 */
                if (!mapping_locked) {
                        folio_unlock(folio);
                        folio_put(folio);
                        goto retry_find;
                }

                /*
                 * OK, the folio is really not uptodate. This can be because the
                 * VMA has the VM_RAND_READ flag set, or because an error
                 * arose. Let's read it in directly.
                 */
                goto page_not_uptodate;
        }

        /*
         * We've made it this far and we had to drop our mmap_lock, now is the
         * time to return to the upper layer and have it re-find the vma and
         * redo the fault.
         */
        if (fpin) {
                folio_unlock(folio);
                goto out_retry;
        }
        if (mapping_locked)
                filemap_invalidate_unlock_shared(mapping);

        /*
         * Found the page and have a reference on it.
         * We must recheck i_size under page lock.
         */
        max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
        if (unlikely(index >= max_idx)) {
                folio_unlock(folio);
                folio_put(folio);
                return VM_FAULT_SIGBUS;
        }

        /* Hand the still-locked page back to the fault handler. */
        vmf->page = folio_file_page(folio, index);
        return ret | VM_FAULT_LOCKED;

page_not_uptodate:
        /*
         * Umm, take care of errors if the page isn't up-to-date.
         * Try to re-read it _once_. We do this synchronously,
         * because there really aren't any performance issues here
         * and we need to check for errors.
         */
        fpin = maybe_unlock_mmap_for_io(vmf, fpin);
        error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
        if (fpin)
                goto out_retry;
        folio_put(folio);

        /* A successful read or a truncated page: look the folio up again. */
        if (!error || error == AOP_TRUNCATED_PAGE)
                goto retry_find;
        filemap_invalidate_unlock_shared(mapping);

        return VM_FAULT_SIGBUS;

out_retry:
        /*
         * We dropped the mmap_lock, we need to return to the fault handler to
         * re-find the vma and come back and find our hopefully still populated
         * page.
         */
        if (!IS_ERR(folio))
                folio_put(folio);
        if (mapping_locked)
                filemap_invalidate_unlock_shared(mapping);
        if (fpin)
                fput(fpin);
        return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);

/*
 * Try to settle the mapping of @folio at PMD level before the caller falls
 * back to mapping individual PTEs.
 *
 * Returns true when the caller has nothing more to do with @folio:
 *  - a huge PMD is already present: the folio is unlocked and its reference
 *    is dropped here;
 *  - do_set_pmd() returned 0: the folio is unlocked here (see the comment at
 *    that site regarding the reference).
 * Returns false when the caller should continue at PTE level; @folio is then
 * left locked and referenced.
 */
static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
                pgoff_t start)
{
        struct mm_struct *mm = vmf->vma->vm_mm;

        /* Huge page is mapped? No need to proceed. */
        if (pmd_trans_huge(*vmf->pmd)) {
                folio_unlock(folio);
                folio_put(folio);
                return true;
        }

        /* Empty PMD and a PMD-mappable folio: attempt a PMD-level mapping. */
        if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
                struct page *page = folio_file_page(folio, start);
                vm_fault_t ret = do_set_pmd(vmf, page);
                if (!ret) {
                        /* The page is mapped successfully, reference consumed. */
                        folio_unlock(folio);
                        return true;
                }
        }

        /*
         * The PMD is re-read here because do_set_pmd() may have been
         * attempted above.  If it is still empty and a page table was
         * preallocated, hand vmf->prealloc_pte to pmd_install().
         */
        if (pmd_none(*vmf->pmd) && vmf->prealloc_pte)
                pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);

        return false;
}

/*
 * Advance @xas, bounded by @end_pgoff, to the next folio that passes every
 * check below, using only non-blocking operations (folio_trylock()).
 *
 * Entries are skipped when they are xarray retry or value entries, when the
 * folio is locked, when a reference cannot be taken, when the slot changed
 * under us, when the folio is not uptodate or has the readahead flag set,
 * when it no longer belongs to @mapping, or when the current index lies at
 * or beyond the page-rounded i_size.
 *
 * Returns the folio locked and with a reference held, or NULL once
 * xas_next_entry() yields no further entry.
 */
static struct folio *next_uptodate_folio(struct xa_state *xas,
                struct address_space *mapping, pgoff_t end_pgoff)
{
        struct folio *folio = xas_next_entry(xas, end_pgoff);
        unsigned long max_idx;

        /*
         * Note: "continue" inside a do/while jumps to the loop condition,
         * which is what fetches the next entry.  The "continue" paths are
         * the ones taken before any reference has been acquired.
         */
        do {
                if (!folio)
                        return NULL;
                if (xas_retry(xas, folio))
                        continue;
                if (xa_is_value(folio))
                        continue;
                if (folio_test_locked(folio))
                        continue;
                if (!folio_try_get_rcu(folio))
                        continue;
                /* Has the page moved or been split? */
                if (unlikely(folio != xas_reload(xas)))
                        goto skip;
                if (!folio_test_uptodate(folio) || folio_test_readahead(folio))
                        goto skip;
                if (!folio_trylock(folio))
                        goto skip;
                /* Re-validate now that the folio lock is held. */
                if (folio->mapping != mapping)
                        goto unlock;
                if (!folio_test_uptodate(folio))
                        goto unlock;
                /* Do not hand out anything past the end of the file. */
                max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
                if (xas->xa_index >= max_idx)
                        goto unlock;
                return folio;
unlock:
                folio_unlock(folio);
skip:
                /* Drop the reference taken by folio_try_get_rcu(). */
                folio_put(folio);
        } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);

        return NULL;
}

/*
 * Map page range [start_page, start_page + nr_pages) of folio.
 * start_page is gotten from start by folio_page(folio, start)
 *
 * @addr is the user address corresponding to start_page and vmf->pte must
 * point at the PTE slot for @addr.  Pages are mapped in runs: consecutive
 * mappable pages are counted in "count" and installed with one
 * set_pte_range() call when the run ends.  A run ends at a hardware-poisoned
 * page, at a PTE slot that is not pte_none(), or at the end of the range.
 *
 * For every page installed, *rss is increased by one and one folio
 * reference is added.  *mmap_miss is incremented for each non-poisoned page
 * visited when the folio does not have the workingset flag set.
 *
 * vmf->pte is advanced while iterating and restored before returning.
 *
 * Returns VM_FAULT_NOPAGE if one of the installed runs covers vmf->address,
 * otherwise 0.
 */
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
                        struct folio *folio, unsigned long start,
                        unsigned long addr, unsigned int nr_pages,
                        unsigned long *rss, unsigned int *mmap_miss)
{
        vm_fault_t ret = 0;
        struct page *page = folio_page(folio, start);
        /* Length of the current run; page/vmf->pte/addr mark its start. */
        unsigned int count = 0;
        pte_t *old_ptep = vmf->pte;

        do {
                if (PageHWPoison(page + count))
                        goto skip;

                /*
                 * If there are too many folios that are recently evicted
                 * in a file, they will probably continue to be evicted.
                 * In such situation, read-ahead is only a waste of IO.
                 * Don't decrease mmap_miss in this scenario to make sure
                 * we can stop read-ahead.
                 */
                if (!folio_test_workingset(folio))
                        (*mmap_miss)++;

                /*
                 * NOTE: If there're PTE markers, we'll leave them to be
                 * handled in the specific fault path, and it'll prohibit the
                 * fault-around logic.
                 */
                if (!pte_none(ptep_get(&vmf->pte[count])))
                        goto skip;

                /* Page is mappable: extend the current run. */
                count++;
                continue;
skip:
                /* Flush the run accumulated before the unmappable page. */
                if (count) {
                        set_pte_range(vmf, folio, page, count, addr);
                        *rss += count;
                        folio_ref_add(folio, count);
                        if (in_range(vmf->address, addr, count * PAGE_SIZE))
                                ret = VM_FAULT_NOPAGE;
                }

                /*
                 * Step all cursors over the flushed run plus the skipped
                 * page (hence the extra increment), then start a new run.
                 */
                count++;
                page += count;
                vmf->pte += count;
                addr += count * PAGE_SIZE;
                count = 0;
        } while (--nr_pages > 0);

        /* Flush the trailing run, if the range ended on mappable pages. */
        if (count) {
                set_pte_range(vmf, folio, page, count, addr);
                *rss += count;
                folio_ref_add(folio, count);
                if (in_range(vmf->address, addr, count * PAGE_SIZE))
                        ret = VM_FAULT_NOPAGE;
        }

        vmf->pte = old_ptep;

        return ret;
}

/*
 * Map the single page of an order-0 @folio at @addr through vmf->pte.
 *
 * Nothing is mapped when the page is hardware-poisoned or when the PTE slot
 * is not pte_none().  On success *rss is bumped by one and one folio
 * reference is taken.  *mmap_miss is incremented for a non-poisoned page
 * whose folio does not carry the workingset flag.
 *
 * Returns VM_FAULT_NOPAGE when the page was mapped at vmf->address,
 * otherwise 0.
 */
static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
                struct folio *folio, unsigned long addr,
                unsigned long *rss, unsigned int *mmap_miss)
{
        struct page *pg = &folio->page;
        vm_fault_t status;

        /* A hardware-poisoned page is never mapped. */
        if (PageHWPoison(pg))
                return 0;

        /* See comment of filemap_map_folio_range() */
        if (!folio_test_workingset(folio))
                *mmap_miss += 1;

        /*
         * A slot that is not empty (this includes PTE markers) is left to
         * the specific fault path, which also prohibits fault-around here.
         */
        if (!pte_none(ptep_get(vmf->pte)))
                return 0;

        status = (vmf->address == addr) ? VM_FAULT_NOPAGE : 0;

        set_pte_range(vmf, folio, pg, 1, addr);
        *rss += 1;
        folio_ref_inc(folio);

        return status;
}

/*
 * Map folios that are already in the page cache of vmf->vma->vm_file for
 * the index range bounded by @start_pgoff and @end_pgoff, without blocking.
 *
 * Candidate folios come from next_uptodate_folio().  The first one is
 * offered to filemap_map_pmd(); if that does not settle the fault, the PTE
 * table is mapped and locked and each folio is installed through
 * filemap_map_order0_folio() or filemap_map_folio_range().
 *
 * On the way out, the number of pages counted in mmap_miss is subtracted
 * from file->f_ra.mmap_miss, clamping at zero.
 *
 * Returns VM_FAULT_NOPAGE when filemap_map_pmd() returned true or when one
 * of the helpers mapped vmf->address, otherwise 0.
 */
vm_fault_t filemap_map_pages(struct vm_fault *vmf,
                             pgoff_t start_pgoff, pgoff_t end_pgoff)
{
        struct vm_area_struct *vma = vmf->vma;
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        pgoff_t last_pgoff = start_pgoff;
        unsigned long addr;
        XA_STATE(xas, &mapping->i_pages, start_pgoff);
        struct folio *folio;
        vm_fault_t ret = 0;
        unsigned long rss = 0;
        unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type;

        rcu_read_lock();
        folio = next_uptodate_folio(&xas, mapping, end_pgoff);
        if (!folio)
                goto out;

        /* filemap_map_pmd() releases the folio itself when it returns true. */
        if (filemap_map_pmd(vmf, folio, start_pgoff)) {
                ret = VM_FAULT_NOPAGE;
                goto out;
        }

        /* User address that corresponds to start_pgoff within the VMA. */
        addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
        if (!vmf->pte) {
                folio_unlock(folio);
                folio_put(folio);
                goto out;
        }

        /* The counter type is taken from the first folio only. */
        folio_type = mm_counter_file(folio);
        do {
                unsigned long end;

                /*
                 * Move addr and vmf->pte forward by the distance between
                 * the previous and the current xarray index.
                 */
                addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
                vmf->pte += xas.xa_index - last_pgoff;
                last_pgoff = xas.xa_index;
                /* Clip the folio's last index to the requested range. */
                end = folio_next_index(folio) - 1;
                nr_pages = min(end, end_pgoff) - xas.xa_index + 1;

                if (!folio_test_large(folio))
                        ret |= filemap_map_order0_folio(vmf,
                                        folio, addr, &rss, &mmap_miss);
                else
                        ret |= filemap_map_folio_range(vmf, folio,
                                        xas.xa_index - folio->index, addr,
                                        nr_pages, &rss, &mmap_miss);

                /* Release the lock and reference from next_uptodate_folio(). */
                folio_unlock(folio);
                folio_put(folio);
        } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
        /* Account all pages mapped above in one go. */
        add_mm_counter(vma->vm_mm, folio_type, rss);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
        rcu_read_unlock();

        /* Subtract mmap_miss from the file's counter without underflowing. */
        mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);
        if (mmap_miss >= mmap_miss_saved)
                WRITE_ONCE(file->f_ra.mmap_miss, 0);
        else
                WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);

        return ret;
}
EXPORT_SYMBOL(filemap_map_pages);

vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        struct folio *folio = page_folio(vmf->page);
        vm_fault_t ret = VM_FAULT_LOCKED;

        sb_start_pagefault(mapping->host->i_sb);
        file_update_time(vmf->vma->vm_file);
        folio_lock(folio);
        if (folio->mapping != mapping) {
                folio_unlock(folio);
                ret = VM_FAULT_NOPAGE;
                goto out;
        }
        /*
         * We mark the folio dirty already here so that when freeze is in
         * progress, we are guaranteed that writeback during freezing will
         * see the dirty folio and writeprotect it again.
         */
        folio_mark_dirty(folio);
        folio_wait_stable(folio);
out:
        sb_end_pagefault(mapping->host->i_sb);
        return ret;
}

/*
 * vm_operations table built from the filemap_* handlers defined in this
 * file; generic_file_mmap() installs it as vma->vm_ops.
 */
const struct vm_operations_struct generic_file_vm_ops = {
        .fault                = filemap_fault,
        .map_pages        = filemap_map_pages,
        .page_mkwrite        = filemap_page_mkwrite,
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct address_space *mapping = file->f_mapping;

        if (!mapping->a_ops->read_folio)
                return -ENOEXEC;
        file_accessed(file);
        vma->vm_ops = &generic_file_vm_ops;
        return 0;
}

/*
 * mmap variant for filesystems which do not implement ->writepage.
 *
 * Rejects a VMA for which vma_is_shared_maywrite() is true with -EINVAL;
 * everything else is handed to generic_file_mmap().
 */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
        return vma_is_shared_maywrite(vma) ? -EINVAL
                                           : generic_file_mmap(file, vma);
}
#else
/* !CONFIG_MMU stub: ignores @vmf and always returns VM_FAULT_SIGBUS. */
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
        return VM_FAULT_SIGBUS;
}
/* !CONFIG_MMU stub: ignores its arguments and always returns -ENOSYS. */
int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        return -ENOSYS;
}
/* !CONFIG_MMU stub: ignores its arguments and always returns -ENOSYS. */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
        return -ENOSYS;
}
#endif /* CONFIG_MMU */

EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);

/*
 * Look up the folio at @index in @mapping, creating and reading it if
 * necessary, and return it once it is uptodate.
 *
 * @filler is the read routine; when NULL, mapping->a_ops->read_folio is
 * used.  @file is passed through to filemap_read_folio().  @gfp is used for
 * allocating a new folio and for adding it to the mapping.
 *
 * The lookup restarts from "repeat" when another folio was inserted
 * concurrently (-EEXIST), when the folio had to be waited for, when it was
 * truncated from the mapping, or when the read returned
 * AOP_TRUNCATED_PAGE.
 *
 * Returns the folio (after folio_mark_accessed()) on success, or an
 * ERR_PTR(): -ENOMEM if allocation fails, otherwise the error from
 * filemap_add_folio() or filemap_read_folio().
 */
static struct folio *do_read_cache_folio(struct address_space *mapping,
                pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
{
        struct folio *folio;
        int err;

        if (!filler)
                filler = mapping->a_ops->read_folio;
repeat:
        folio = filemap_get_folio(mapping, index);
        if (IS_ERR(folio)) {
                /* Nothing cached at @index: allocate an order-0 folio. */
                folio = filemap_alloc_folio(gfp, 0);
                if (!folio)
                        return ERR_PTR(-ENOMEM);
                err = filemap_add_folio(mapping, folio, index, gfp);
                if (unlikely(err)) {
                        folio_put(folio);
                        /* Somebody else inserted a folio first: look again. */
                        if (err == -EEXIST)
                                goto repeat;
                        /* Presumably ENOMEM for xarray node */
                        return ERR_PTR(err);
                }

                goto filler;
        }
        /* Fast path: already uptodate, no locking needed. */
        if (folio_test_uptodate(folio))
                goto out;

        if (!folio_trylock(folio)) {
                /* Wait for the current lock holder, then start over. */
                folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
                goto repeat;
        }

        /* Folio was truncated from mapping */
        if (!folio->mapping) {
                folio_unlock(folio);
                folio_put(folio);
                goto repeat;
        }

        /* Someone else locked and filled the page in a very small window */
        if (folio_test_uptodate(folio)) {
                folio_unlock(folio);
                goto out;
        }

filler:
        err = filemap_read_folio(file, filler, folio);
        if (err) {
                folio_put(folio);
                if (err == AOP_TRUNCATED_PAGE)
                        goto repeat;
                return ERR_PTR(err);
        }

out:
        folio_mark_accessed(folio);
        return folio;
}

/**
 * read_cache_folio - Read into page cache, fill it if needed.
 * @mapping: The address_space to read from.
 * @index: The index to read.
 * @filler: Function to perform the read, or NULL to use aops->read_folio().
 * @file: Passed to filler function, may be NULL if not required.
 *
 * Reads a single page into the page cache, allocating with the mapping's
 * own gfp mask.  On success the returned folio contains @index, although
 * @index need not be the folio's first page.
 *
 * An error reported by the filler function is passed back to the caller.
 *
 * Context: May sleep.  Expects mapping->invalidate_lock to be held.
 * Return: An uptodate folio on success, ERR_PTR() on failure.
 */
struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
                filler_t filler, struct file *file)
{
        gfp_t gfp = mapping_gfp_mask(mapping);

        return do_read_cache_folio(mapping, index, filler, file, gfp);
}
EXPORT_SYMBOL(read_cache_folio);

/**
 * mapping_read_folio_gfp - Read into page cache, using specified allocation flags.
 * @mapping:        The address_space for the folio.
 * @index:        The index that the allocated folio will contain.
 * @gfp:        The page allocator flags to use if allocating.
 *
 * Equivalent to "read_cache_folio(mapping, index, NULL, NULL)", except that
 * any new memory allocation uses @gfp instead of the mapping's gfp mask.
 *
 * EIO is the most likely error, but ENOMEM and EINTR are possible as well.
 * Any other error returned by ->read_folio is passed back to the caller.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: Uptodate folio on success, ERR_PTR() on failure.
 */
struct folio *mapping_read_folio_gfp(struct address_space *mapping,
                pgoff_t index, gfp_t gfp)
{
        struct folio *folio;

        /* NULL filler selects ->read_folio; no struct file is supplied. */
        folio = do_read_cache_folio(mapping, index, NULL, NULL, gfp);

        return folio;
}
EXPORT_SYMBOL(mapping_read_folio_gfp);

/*
 * Page-returning wrapper around do_read_cache_folio().
 *
 * On success returns the page of the folio that corresponds to @index.  On
 * failure the ERR_PTR() produced by do_read_cache_folio() is returned,
 * re-typed as a struct page pointer via &folio->page.
 */
static struct page *do_read_cache_page(struct address_space *mapping,
                pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
{
        struct folio *folio = do_read_cache_folio(mapping, index, filler,
                                                  file, gfp);

        return IS_ERR(folio) ? &folio->page : folio_file_page(folio, index);
}

/*
 * Page-based counterpart of read_cache_folio(): reads the page at @index
 * of @mapping using @filler (NULL selects ->read_folio, as handled in
 * do_read_cache_folio()) and the mapping's own gfp mask.
 *
 * Returns the page on success or an ERR_PTR() on failure.
 */
struct page *read_cache_page(struct address_space *mapping,
                        pgoff_t index, filler_t *filler, struct file *file)
{
        gfp_t gfp = mapping_gfp_mask(mapping);

        return do_read_cache_page(mapping, index, filler, file, gfp);
}
EXPORT_SYMBOL(read_cache_page);

/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping:        the page's address_space
 * @index:        the page index
 * @gfp:        the page allocator flags to use if allocating
 *
 * Behaves like "read_mapping_page(mapping, index, NULL)", except that any
 * new page allocation is performed with @gfp.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
                                pgoff_t index,
                                gfp_t gfp)
{
        struct page *page;

        /* NULL filler selects ->read_folio; no struct file is supplied. */
        page = do_read_cache_page(mapping, index, NULL, NULL, gfp);

        return page;
}
EXPORT_SYMBOL(read_cache_page_gfp);

/*
 * Report a page cache invalidation failure that happened during a direct
 * I/O write.
 *
 * -EIO is always recorded in the mapping's wb_err.  The log messages are
 * rate limited through the static ratelimit state below.
 */
static void dio_warn_stale_pagecache(struct file *filp)
{
        static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
        char pathname[128];
        char *path;

        errseq_set(&filp->f_mapping->wb_err, -EIO);

        /* Error state is recorded above; only the logging is throttled. */
        if (!__ratelimit(&_rs))
                return;

        path = file_path(filp, pathname, sizeof(pathname));
        if (IS_ERR(path))
                path = "(unknown)";

        pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
        pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
                current->comm);
}

/*
 * After a direct write of @count bytes starting at iocb->ki_pos, invalidate
 * the page cache pages that cover the written byte range.
 *
 * Nothing is done when the mapping holds no pages.  If
 * invalidate_inode_pages2_range() returns non-zero,
 * dio_warn_stale_pagecache() is called for the file.
 */
void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        pgoff_t first_idx, last_idx;

        if (!mapping->nrpages)
                return;

        /* Page indices of the first and last byte written. */
        first_idx = iocb->ki_pos >> PAGE_SHIFT;
        last_idx = (iocb->ki_pos + count - 1) >> PAGE_SHIFT;

        if (invalidate_inode_pages2_range(mapping, first_idx, last_idx))
                dio_warn_stale_pagecache(iocb->ki_filp);
}

/*
 * Write the data described by @from through the mapping's ->direct_IO
 * method.
 *
 * The affected page cache range is invalidated first via
 * kiocb_invalidate_pages().  When that returns -EBUSY, 0 is returned so that
 * the caller falls back to a buffered write; any other error is returned
 * as is.
 *
 * When ->direct_IO reports a positive byte count, the written range is
 * invalidated again, iocb->ki_pos is advanced, and i_size is extended if
 * the write went past it and the inode is not a block device.
 *
 * Unless the result is -EIOCBQUEUED, @from is rewound so that its remaining
 * count equals the number of bytes that were not written.
 *
 * Returns the value from ->direct_IO, or 0 / a negative error as described
 * above.
 */
ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        size_t write_len = iov_iter_count(from);
        ssize_t written;

        /*
         * If a page can not be invalidated, return 0 to fall back
         * to buffered write.
         */
        written = kiocb_invalidate_pages(iocb, write_len);
        if (written) {
                if (written == -EBUSY)
                        return 0;
                return written;
        }

        written = mapping->a_ops->direct_IO(iocb, from);

        /*
         * Finally, try again to invalidate clean pages which might have been
         * cached by non-direct readahead, or faulted in by get_user_pages()
         * if the source of the write was an mmap'ed region of the file
         * we're writing.  Either one is a pretty crazy thing to do,
         * so we don't support it 100%.  If this invalidation
         * fails, tough, the write still worked...
         *
         * Most of the time we do not need this since dio_complete() will do
         * the invalidation for us. However there are some file systems that
         * do not end up with dio_complete() being called, so let's not break
         * them by removing it completely.
         *
         * Noticeable example is a blkdev_direct_IO().
         *
         * Skip invalidation for async writes or if mapping has no pages.
         */
        if (written > 0) {
                struct inode *inode = mapping->host;
                loff_t pos = iocb->ki_pos;

                kiocb_invalidate_post_direct_write(iocb, written);
                pos += written;
                /* write_len now holds the number of bytes still unwritten. */
                write_len -= written;
                if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
                        i_size_write(inode, pos);
                        mark_inode_dirty(inode);
                }
                iocb->ki_pos = pos;
        }
        /*
         * Rewind the iterator until exactly write_len bytes remain.  This
         * is skipped when the request was queued.
         */
        if (written != -EIOCBQUEUED)
                iov_iter_revert(from, write_len - iov_iter_count(from));
        return written;
}
EXPORT_SYMBOL(generic_file_direct_write);

/*
 * Buffered write: copy the data described by @i into the page cache of
 * iocb->ki_filp, at most one page at a time, bracketing each copy with the
 * mapping's ->write_begin and ->write_end methods.
 *
 * The loop stops when @i is exhausted, when no source bytes could be faulted
 * in (-EFAULT), when a fatal signal is pending (-EINTR), or when
 * ->write_begin / ->write_end return a negative status.
 *
 * Returns the number of bytes written and advances iocb->ki_pos by that
 * amount.  If nothing was written, the last status is returned and
 * iocb->ki_pos is left untouched.
 */
ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
{
        struct file *file = iocb->ki_filp;
        loff_t pos = iocb->ki_pos;
        struct address_space *mapping = file->f_mapping;
        const struct address_space_operations *a_ops = mapping->a_ops;
        long status = 0;
        ssize_t written = 0;

        do {
                struct page *page;
                unsigned long offset;        /* Offset into pagecache page */
                unsigned long bytes;        /* Bytes to write to page */
                size_t copied;                /* Bytes copied from user */
                void *fsdata = NULL;

                /* Never cross a page boundary within a single iteration. */
                offset = (pos & (PAGE_SIZE - 1));
                bytes = min_t(unsigned long, PAGE_SIZE - offset,
                                                iov_iter_count(i));

again:
                /*
                 * Bring in the user page that we will copy from _first_.
                 * Otherwise there's a nasty deadlock on copying from the
                 * same page as we're writing to, without it being marked
                 * up-to-date.
                 */
                if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
                        status = -EFAULT;
                        break;
                }

                if (fatal_signal_pending(current)) {
                        status = -EINTR;
                        break;
                }

                status = a_ops->write_begin(file, mapping, pos, bytes,
                                                &page, &fsdata);
                if (unlikely(status < 0))
                        break;

                if (mapping_writably_mapped(mapping))
                        flush_dcache_page(page);

                copied = copy_page_from_iter_atomic(page, offset, bytes, i);
                flush_dcache_page(page);

                status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                                page, fsdata);
                /*
                 * ->write_end accepted fewer bytes than were copied: give
                 * the difference back to the iterator (a negative status
                 * counts as zero bytes accepted).
                 */
                if (unlikely(status != copied)) {
                        iov_iter_revert(i, copied - max(status, 0L));
                        if (unlikely(status < 0))
                                break;
                }
                cond_resched();

                if (unlikely(status == 0)) {
                        /*
                         * A short copy made ->write_end() reject the
                         * thing entirely.  Might be memory poisoning
                         * halfway through, might be a race with munmap,
                         * might be severe memory pressure.
                         */
                        /* Retry with the length that did copy, if any. */
                        if (copied)
                                bytes = copied;
                        goto again;
                }
                pos += status;
                written += status;

                balance_dirty_pages_ratelimited(mapping);
        } while (iov_iter_count(i));

        if (!written)
                return status;
        iocb->ki_pos += written;
        return written;
}
EXPORT_SYMBOL(generic_perform_write);

/**
 * __generic_file_write_iter - write data to a file
 * @iocb:        IO state structure (file, offset, etc.)
 * @from:        iov_iter with data to write
 *
 * Does the actual work of writing data to a file: performs the basic
 * checks, strips SUID from the file, updates the modification times and
 * then dispatches to either the direct IO path or the standard buffered
 * write path.
 *
 * The caller must hold i_rwsem, unless the target is a block device or a
 * similar object which does not need locking at all.
 *
 * Syncing the data for an O_SYNC write is *not* done here; that is left to
 * the caller, mainly so that syncing does not happen under i_rwsem.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        ssize_t ret;

        ret = file_remove_privs(file);
        if (ret)
                return ret;

        ret = file_update_time(file);
        if (ret)
                return ret;

        /* Plain buffered write. */
        if (!(iocb->ki_flags & IOCB_DIRECT))
                return generic_perform_write(iocb, from);

        ret = generic_file_direct_write(iocb, from);

        /*
         * A direct write that stopped short of completing falls back to a
         * buffered write; some filesystems do this for writes to holes, for
         * example.  DAX files are excluded because a buffered write will not
         * succeed there (and even if it did, DAX does not handle dirty
         * page-cache pages correctly).
         */
        if (ret < 0)
                return ret;
        if (!iov_iter_count(from))
                return ret;
        if (IS_DAX(inode))
                return ret;

        return direct_write_fallback(iocb, from, ret,
                        generic_perform_write(iocb, from));
}
EXPORT_SYMBOL(__generic_file_write_iter);

/**
 * generic_file_write_iter - write data to a file
 * @iocb:        IO state structure
 * @from:        iov_iter with data to write
 *
 * Wrapper around __generic_file_write_iter() intended for use by most
 * filesystems.  It acquires i_rwsem around the write and takes care of
 * syncing the file for an O_SYNC write once the lock has been dropped.
 * Return:
 * * negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * number of bytes written, even for truncated writes
 */
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        ssize_t ret;

        inode_lock(inode);
        ret = generic_write_checks(iocb, from);
        if (ret > 0)
                ret = __generic_file_write_iter(iocb, from);
        inode_unlock(inode);

        /* Nothing written (or an error): there is nothing to sync. */
        if (ret <= 0)
                return ret;

        return generic_write_sync(iocb, ret);
}
EXPORT_SYMBOL(generic_file_write_iter);

/**
 * filemap_release_folio() - Release fs-specific metadata on a folio.
 * @folio: The folio which the kernel is trying to free.
 * @gfp: Memory allocation flags (and I/O mode).
 *
 * The address_space is trying to release any data attached to a folio
 * (presumably at folio->private).
 *
 * This is also called when the private_2 flag is set on a page, which
 * indicates that the folio has other metadata associated with it.
 *
 * @gfp tells the callee whether I/O may be performed to release this page
 * (__GFP_IO) and whether the call may block (__GFP_RECLAIM & __GFP_FS).
 *
 * The folio must be locked by the caller; this is enforced with BUG_ON().
 *
 * Return: %true if the release was successful, otherwise %false.
 */
bool filemap_release_folio(struct folio *folio, gfp_t gfp)
{
        struct address_space * const mapping = folio->mapping;

        BUG_ON(!folio_test_locked(folio));

        /* Nothing attached: trivially successful. */
        if (!folio_needs_release(folio))
                return true;

        /* A folio under writeback is not released. */
        if (folio_test_writeback(folio))
                return false;

        /* No mapping, or no ->release_folio method: use the buffer path. */
        if (!mapping || !mapping->a_ops->release_folio)
                return try_to_free_buffers(folio);

        return mapping->a_ops->release_folio(folio, gfp);
}
EXPORT_SYMBOL(filemap_release_folio);

/**
 * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache
 * @inode: The inode to flush
 * @flush: Set to write back rather than simply invalidate.
 * @start: First byte in range.
 * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start
 *       onwards.
 *
 * Invalidate all the folios on an inode that contribute to the specified
 * range, possibly writing them back first.  Whilst the operation is
 * undertaken, the invalidate lock is held to prevent new folios from being
 * installed.
 *
 * Return: 0 if @inode has no mapping, otherwise the result of
 * filemap_check_errors() on the inode's mapping.
 */
int filemap_invalidate_inode(struct inode *inode, bool flush,
                             loff_t start, loff_t end)
{
        struct address_space *mapping = inode->i_mapping;
        pgoff_t first = start >> PAGE_SHIFT;
        pgoff_t last = end >> PAGE_SHIFT;
        pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1;

        /*
         * Without a mapping there is nothing cached and no error state to
         * report.  Return directly instead of going through "out", which
         * would hand the NULL pointer to filemap_check_errors().
         */
        if (!mapping)
                return 0;

        if (!mapping->nrpages || end < start)
                goto out;

        /* Prevent new folios from being added to the inode. */
        filemap_invalidate_lock(mapping);

        /* Re-check under the lock: the cache may have been emptied. */
        if (!mapping->nrpages)
                goto unlock;

        unmap_mapping_pages(mapping, first, nr, false);

        /* Write back the data if we're asked to. */
        if (flush) {
                struct writeback_control wbc = {
                        .sync_mode        = WB_SYNC_ALL,
                        .nr_to_write        = LONG_MAX,
                        .range_start        = start,
                        .range_end        = end,
                };

                filemap_fdatawrite_wbc(mapping, &wbc);
        }

        /* Wait for writeback to complete on all folios and discard. */
        truncate_inode_pages_range(mapping, start, end);

unlock:
        filemap_invalidate_unlock(mapping);
out:
        return filemap_check_errors(mapping);
}
EXPORT_SYMBOL_GPL(filemap_invalidate_inode);

#ifdef CONFIG_CACHESTAT_SYSCALL
/**
 * filemap_cachestat() - compute the page cache statistics of a mapping
 * @mapping:        The mapping to compute the statistics for.
 * @first_index:        The starting page cache index.
 * @last_index:        The final page index (inclusive).
 * @cs:        the cachestat struct to write the result to.
 *
 * This will query the page cache statistics of a mapping in the
 * page range of [first_index, last_index] (inclusive). The statistics
 * queried include: number of dirty pages, number of pages marked for
 * writeback, and the number of (recently) evicted pages.
 *
 * The counters in @cs are only ever added to; this function does not reset
 * them.  The walk runs under rcu_read_lock() and yields through
 * cond_resched_rcu() when a reschedule is needed.
 */
static void filemap_cachestat(struct address_space *mapping,
                pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
{
        XA_STATE(xas, &mapping->i_pages, first_index);
        struct folio *folio;

        rcu_read_lock();
        xas_for_each(&xas, folio, last_index) {
                int order;
                unsigned long nr_pages;
                pgoff_t folio_first_index, folio_last_index;

                /*
                 * Don't deref the folio. It is not pinned, and might
                 * get freed (and reused) underneath us.
                 *
                 * We *could* pin it, but that would be expensive for
                 * what should be a fast and lightweight syscall.
                 *
                 * Instead, derive all information of interest from
                 * the rcu-protected xarray.
                 */

                if (xas_retry(&xas, folio))
                        continue;

                /*
                 * Size and position of the entry come from the xarray order:
                 * it spans (1 << order) indices starting at the index
                 * rounded down to that size.
                 */
                order = xa_get_order(xas.xa, xas.xa_index);
                nr_pages = 1 << order;
                folio_first_index = round_down(xas.xa_index, 1 << order);
                folio_last_index = folio_first_index + nr_pages - 1;

                /* Folios might straddle the range boundaries, only count covered pages */
                if (folio_first_index < first_index)
                        nr_pages -= first_index - folio_first_index;

                if (folio_last_index > last_index)
                        nr_pages -= folio_last_index - last_index;

                if (xa_is_value(folio)) {
                        /* page is evicted */
                        void *shadow = (void *)folio;
                        bool workingset; /* not used */

                        cs->nr_evicted += nr_pages;

#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
                        if (shmem_mapping(mapping)) {
                                /* shmem file - in swap cache */
                                swp_entry_t swp = radix_to_swp_entry(folio);

                                /* swapin error results in poisoned entry */
                                if (non_swap_entry(swp))
                                        goto resched;

                                /*
                                 * Getting a swap entry from the shmem
                                 * inode means we beat
                                 * shmem_unuse(). rcu_read_lock()
                                 * ensures swapoff waits for us before
                                 * freeing the swapper space. However,
                                 * we can race with swapping and
                                 * invalidation, so there might not be
                                 * a shadow in the swapcache (yet).
                                 */
                                shadow = get_shadow_from_swap_cache(swp);
                                if (!shadow)
                                        goto resched;
                        }
#endif
                        if (workingset_test_recent(shadow, true, &workingset))
                                cs->nr_recently_evicted += nr_pages;

                        goto resched;
                }

                /* page is in cache */
                cs->nr_cache += nr_pages;

                /* Dirty/writeback state is read from the xarray marks. */
                if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
                        cs->nr_dirty += nr_pages;

                if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
                        cs->nr_writeback += nr_pages;

resched:
                /* xas_pause() is called before giving up the CPU. */
                if (need_resched()) {
                        xas_pause(&xas);
                        cond_resched_rcu();
                }
        }
        rcu_read_unlock();
}

/*
 * The cachestat(2) system call.
 *
 * cachestat() returns the page cache statistics of a file in the
 * bytes range specified by `off` and `len`: number of cached pages,
 * number of dirty pages, number of pages marked for writeback,
 * number of evicted pages, and number of recently evicted pages.
 *
 * An evicted page is a page that was previously in the page cache
 * but has been evicted since. A page is recently evicted if its last
 * eviction was recent enough that its reentry to the cache would
 * indicate that it is actively being used by the system, and that
 * there is memory pressure on the system.
 *
 * `off` and `len` must be non-negative integers. If `len` > 0,
 * the queried range is [`off`, `off` + `len`), i.e. the last byte
 * queried is `off` + `len` - 1. If `len` == 0, we will query in the
 * range from `off` to the end of the file.
 *
 * The `flags` argument is unused for now, but is included for future
 * extensibility. Users should pass 0 (i.e. no flags specified).
 *
 * Currently, hugetlbfs is not supported.
 *
 * Because the status of a page can change after cachestat() checks it
 * but before it returns to the application, the returned values may
 * contain stale information.
 *
 * return values:
 *  zero        - success
 *  -EFAULT     - cstat or cstat_range points to an illegal address
 *  -EINVAL     - invalid flags
 *  -EBADF      - invalid file descriptor
 *  -EOPNOTSUPP - file descriptor is of a hugetlbfs file
 */
SYSCALL_DEFINE4(cachestat, unsigned int, fd,
                struct cachestat_range __user *, cstat_range,
                struct cachestat __user *, cstat, unsigned int, flags)
{
        struct cachestat_range csr;
        struct cachestat cs;
        pgoff_t first_index, last_index;
        struct fd f;
        long ret;

        f = fdget(fd);
        if (!f.file)
                return -EBADF;

        /* From here on, every early exit has to pair fdget() with fdput(). */
        ret = -EFAULT;
        if (copy_from_user(&csr, cstat_range, sizeof(csr)))
                goto out_put;

        /* hugetlbfs is not supported */
        ret = -EOPNOTSUPP;
        if (is_file_hugepages(f.file))
                goto out_put;

        /* Only flags == 0 is accepted. */
        ret = -EINVAL;
        if (flags != 0)
                goto out_put;

        /*
         * Convert the byte range into an inclusive range of page indices.
         * A zero length means "up to the highest possible index".
         */
        first_index = csr.off >> PAGE_SHIFT;
        if (csr.len == 0)
                last_index = ULONG_MAX;
        else
                last_index = (csr.off + csr.len - 1) >> PAGE_SHIFT;

        memset(&cs, 0, sizeof(cs));
        filemap_cachestat(f.file->f_mapping, first_index, last_index, &cs);
        fdput(f);

        /* The file is no longer needed once the counters are collected. */
        if (copy_to_user(cstat, &cs, sizeof(cs)))
                return -EFAULT;

        return 0;

out_put:
        fdput(f);
        return ret;
}
#endif /* CONFIG_CACHESTAT_SYSCALL */













































































    3 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/pagevec.h
 *
 * In many places it is efficient to batch an operation up against multiple
 * folios.  A folio_batch is a container which is used for that.
 */

#ifndef _LINUX_PAGEVEC_H
#define _LINUX_PAGEVEC_H

#include <linux/types.h>

/* 31 pointers + header align the folio_batch structure to a power of two */
#define PAGEVEC_SIZE        31

struct folio;

/**
 * struct folio_batch - A collection of folios.
 * @nr: Number of entries currently stored in @folios.
 * @i: Index of the next entry folio_batch_next() will hand out.
 * @percpu_pvec_drained: Cleared by folio_batch_init(); not otherwise
 *      touched by the helpers in this header.
 * @folios: Storage for up to PAGEVEC_SIZE folio pointers.
 *
 * The folio_batch is used to amortise the cost of retrieving and
 * operating on a set of folios.  The order of folios in the batch may be
 * significant (eg delete_from_page_cache_batch()).  Some users of the
 * folio_batch store "exceptional" entries in it which can be removed
 * by calling folio_batch_remove_exceptionals().
 */
struct folio_batch {
        unsigned char nr;
        unsigned char i;
        bool percpu_pvec_drained;
        struct folio *folios[PAGEVEC_SIZE];
};

/**
 * folio_batch_init() - Prepare a folio batch for first use.
 * @fbatch: The folio batch.
 *
 * After this call the batch contains zero folios, the queue position used
 * by folio_batch_next() is at the start, and the percpu_pvec_drained flag
 * is cleared.
 */
static inline void folio_batch_init(struct folio_batch *fbatch)
{
        fbatch->percpu_pvec_drained = false;
        fbatch->i = 0;
        fbatch->nr = 0;
}

/**
 * folio_batch_reinit() - Empty a folio batch for reuse.
 * @fbatch: The folio batch.
 *
 * Resets the entry count and the queue position to zero.  Unlike
 * folio_batch_init(), the percpu_pvec_drained flag is left as it is.
 */
static inline void folio_batch_reinit(struct folio_batch *fbatch)
{
        fbatch->i = 0;
        fbatch->nr = 0;
}

static inline unsigned int folio_batch_count(struct folio_batch *fbatch)
{
        return fbatch->nr;
}

static inline unsigned int folio_batch_space(struct folio_batch *fbatch)
{
        return PAGEVEC_SIZE - fbatch->nr;
}

/**
 * folio_batch_add() - Append a folio to a batch.
 * @fbatch: The folio batch.
 * @folio: The folio to add.
 *
 * The folio is stored in the first unused slot, i.e. at the end of the
 * batch.  The batch must have previously been initialised using
 * folio_batch_init().  No bounds check is done here.
 *
 * Return: The number of slots still available.
 */
static inline unsigned folio_batch_add(struct folio_batch *fbatch,
                struct folio *folio)
{
        unsigned char slot = fbatch->nr;

        fbatch->folios[slot] = folio;
        fbatch->nr = slot + 1;

        return folio_batch_space(fbatch);
}

/**
 * folio_batch_next - Take the next unprocessed folio from a batch.
 * @fbatch: The folio batch being processed.
 *
 * Lets a batch be consumed as a queue: each call hands out the entry at the
 * current position and advances that position by one.
 *
 * Return: The next folio in the queue, or NULL if the queue is empty.
 */
static inline struct folio *folio_batch_next(struct folio_batch *fbatch)
{
        struct folio *next = NULL;

        /* Position has not caught up with the fill level: one more to give. */
        if (fbatch->i != fbatch->nr)
                next = fbatch->folios[fbatch->i++];

        return next;
}

void __folio_batch_release(struct folio_batch *pvec);

/**
 * folio_batch_release() - Release the contents of a batch, if any.
 * @fbatch: The folio batch.
 *
 * Calls __folio_batch_release() only when the batch is not empty, so an
 * empty batch costs no out-of-line call.
 */
static inline void folio_batch_release(struct folio_batch *fbatch)
{
        if (!folio_batch_count(fbatch))
                return;

        __folio_batch_release(fbatch);
}

void folio_batch_remove_exceptionals(struct folio_batch *fbatch);
#endif /* _LINUX_PAGEVEC_H */













































    2 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  include/linux/eventpoll.h ( Efficient event polling implementation )
 *  Copyright (C) 2001,...,2006         Davide Libenzi
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 */
#ifndef _LINUX_EVENTPOLL_H
#define _LINUX_EVENTPOLL_H

#include <uapi/linux/eventpoll.h>
#include <uapi/linux/kcmp.h>


/* Forward declarations to avoid compiler errors */
struct file;


#ifdef CONFIG_EPOLL

#ifdef CONFIG_KCMP
struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff);
#endif

/* Used to release the epoll bits inside the "struct file" */
void eventpoll_release_file(struct file *file);

/*
 * Called from fs/file_table.c:__fput() to unlink a file from the eventpoll
 * interface.  This is what cleans up files that are closed without first
 * being removed from the eventpoll interface.
 */
static inline void eventpoll_release(struct file *file)
{
        /*
         * Cheap unlocked test so the common case never has to take and drop
         * the semaphore.  Because it runs outside the lock it may give a
         * false negative, which is tolerated; it still avoids the lock in
         * 99.99% of cases.  False positives cannot happen: the file is on
         * its way to being removed and nobody but eventpoll still holds a
         * reference to it.
         *
         * A non-NULL f_ep means the file is being closed while still linked
         * to an epoll descriptor, so it has to be unlinked from its
         * containers.
         */
        if (unlikely(file->f_ep))
                eventpoll_release_file(file);
}

int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
                 bool nonblock);

/*
 * Tells whether an epoll_ctl(2) operation needs an event copied from
 * userspace: every operation except EPOLL_CTL_DEL does.
 */
static inline int ep_op_has_event(int op)
{
        if (op == EPOLL_CTL_DEL)
                return 0;

        return 1;
}

#else

/* Stub used when CONFIG_EPOLL is not set; does nothing. */
static inline void eventpoll_release(struct file *file) {}

#endif

#if defined(CONFIG_ARM) && defined(CONFIG_OABI_COMPAT)
/* ARM OABI has an incompatible struct layout and needs a special handler */
extern struct epoll_event __user *
epoll_put_uevent(__poll_t revents, __u64 data,
                 struct epoll_event __user *uevent);
#else
/*
 * Store one event (@revents, @data) into the user-space slot @uevent.
 *
 * Returns a pointer to the slot following @uevent, or NULL if either
 * __put_user() call reports a failure.  The data word is not written when
 * storing the events word has already failed.
 */
static inline struct epoll_event __user *
epoll_put_uevent(__poll_t revents, __u64 data,
                 struct epoll_event __user *uevent)
{
        if (__put_user(revents, &uevent->events))
                return NULL;

        if (__put_user(data, &uevent->data))
                return NULL;

        return uevent + 1;
}
#endif

#endif /* #ifndef _LINUX_EVENTPOLL_H */



















































































































































































































































































    1 
    1 




































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2013 Politecnico di Torino, Italy
 *                    TORSEC group -- https://security.polito.it
 *
 * Author: Roberto Sassu <roberto.sassu@polito.it>
 *
 * File: ima_template.c
 *      Helpers to manage template descriptors.
 */

#include <linux/rculist.h>
#include "ima.h"
#include "ima_template_lib.h"

/*
 * Indices of the per-entry header fields parsed when restoring a
 * measurement list: pcr, digest, template name and template data.
 * HDR__LAST is the number of header fields.
 */
enum header_fields { HDR_PCR, HDR_DIGEST, HDR_TEMPLATE_NAME,
                     HDR_TEMPLATE_DATA, HDR__LAST };

/*
 * Built-in template descriptors: a name plus a '|'-separated list of field
 * identifiers.  ima_init_template_list() links all of them into
 * defined_templates.  The last entry has an empty name and format; it is
 * the slot ima_template_fmt_setup() fills in with a format string taken
 * from the "ima_template_fmt=" boot parameter.
 */
static struct ima_template_desc builtin_templates[] = {
        {.name = IMA_TEMPLATE_IMA_NAME, .fmt = IMA_TEMPLATE_IMA_FMT},
        {.name = "ima-ng", .fmt = "d-ng|n-ng"},
        {.name = "ima-sig", .fmt = "d-ng|n-ng|sig"},
        {.name = "ima-ngv2", .fmt = "d-ngv2|n-ng"},
        {.name = "ima-sigv2", .fmt = "d-ngv2|n-ng|sig"},
        {.name = "ima-buf", .fmt = "d-ng|n-ng|buf"},
        {.name = "ima-modsig", .fmt = "d-ng|n-ng|sig|d-modsig|modsig"},
        {.name = "evm-sig",
         .fmt = "d-ng|n-ng|evmsig|xattrnames|xattrlengths|xattrvalues|iuid|igid|imode"},
        {.name = "", .fmt = ""},        /* placeholder for a custom format */
};

/* All known template descriptors; walked under RCU by lookup_template_desc(). */
static LIST_HEAD(defined_templates);
/* Held while adding entries to defined_templates. */
static DEFINE_SPINLOCK(template_list);
/* Set once a boot parameter has selected a template; later ones are ignored. */
static int template_setup_done;

/*
 * Table of every template field identifier that may appear in a format
 * string, with the callbacks registered for it (.field_init and
 * .field_show).  lookup_template_field() searches this table by field_id.
 */
static const struct ima_template_field supported_fields[] = {
        {.field_id = "d", .field_init = ima_eventdigest_init,
         .field_show = ima_show_template_digest},
        {.field_id = "n", .field_init = ima_eventname_init,
         .field_show = ima_show_template_string},
        {.field_id = "d-ng", .field_init = ima_eventdigest_ng_init,
         .field_show = ima_show_template_digest_ng},
        {.field_id = "d-ngv2", .field_init = ima_eventdigest_ngv2_init,
         .field_show = ima_show_template_digest_ngv2},
        {.field_id = "n-ng", .field_init = ima_eventname_ng_init,
         .field_show = ima_show_template_string},
        {.field_id = "sig", .field_init = ima_eventsig_init,
         .field_show = ima_show_template_sig},
        {.field_id = "buf", .field_init = ima_eventbuf_init,
         .field_show = ima_show_template_buf},
        {.field_id = "d-modsig", .field_init = ima_eventdigest_modsig_init,
         .field_show = ima_show_template_digest_ng},
        {.field_id = "modsig", .field_init = ima_eventmodsig_init,
         .field_show = ima_show_template_sig},
        {.field_id = "evmsig", .field_init = ima_eventevmsig_init,
         .field_show = ima_show_template_sig},
        {.field_id = "iuid", .field_init = ima_eventinodeuid_init,
         .field_show = ima_show_template_uint},
        {.field_id = "igid", .field_init = ima_eventinodegid_init,
         .field_show = ima_show_template_uint},
        {.field_id = "imode", .field_init = ima_eventinodemode_init,
         .field_show = ima_show_template_uint},
        {.field_id = "xattrnames",
         .field_init = ima_eventinodexattrnames_init,
         .field_show = ima_show_template_string},
        {.field_id = "xattrlengths",
         .field_init = ima_eventinodexattrlengths_init,
         .field_show = ima_show_template_sig},
        {.field_id = "xattrvalues",
         .field_init = ima_eventinodexattrvalues_init,
         .field_show = ima_show_template_sig},
};

/*
 * Used when restoring measurements carried over from a kexec. 'd' and 'n' don't
 * need to be accounted for since they shouldn't be defined in the same template
 * description as 'd-ng' and 'n-ng' respectively.
 */
#define MAX_TEMPLATE_NAME_LEN \
        sizeof("d-ng|n-ng|evmsig|xattrnames|xattrlengths|xattrvalues|iuid|igid|imode")

static struct ima_template_desc *ima_template;
static struct ima_template_desc *ima_buf_template;

/**
 * ima_template_has_modsig - Check whether template has modsig-related fields.
 * @ima_template: IMA template to check.
 *
 * Scans the template's fields for one whose identifier is "modsig" or
 * "d-modsig", i.e. a field referencing a file's appended signature.
 *
 * Return: true if such a field is present, false otherwise.
 */
bool ima_template_has_modsig(const struct ima_template_desc *ima_template)
{
        int idx = 0;

        while (idx < ima_template->num_fields) {
                const char *id = ima_template->fields[idx]->field_id;

                if (strcmp(id, "modsig") == 0 || strcmp(id, "d-modsig") == 0)
                        return true;
                idx++;
        }

        return false;
}

/*
 * Handler for the "ima_template=" boot parameter.
 *
 * Selects the template whose name (or format string) equals @str.  The
 * request is ignored when a template was already chosen on the command
 * line, when no such template exists, or when the "ima" template is asked
 * for while the hash algorithm is neither SHA1 nor MD5.  Always returns 1.
 */
static int __init ima_template_setup(char *str)
{
        struct ima_template_desc *desc;

        if (template_setup_done)
                return 1;

        if (!ima_template)
                ima_init_template_list();

        /*
         * Verify that a template with the supplied name exists.
         * If not, use CONFIG_IMA_DEFAULT_TEMPLATE.
         */
        desc = lookup_template_desc(str);
        if (!desc) {
                pr_err("template %s not found, using %s\n",
                       str, CONFIG_IMA_DEFAULT_TEMPLATE);
                return 1;
        }

        /*
         * Verify whether the current hash algorithm is supported
         * by the 'ima' template.
         */
        if (strlen(str) == 3 && !strcmp(str, IMA_TEMPLATE_IMA_NAME)) {
                bool algo_ok = ima_hash_algo == HASH_ALGO_SHA1 ||
                               ima_hash_algo == HASH_ALGO_MD5;

                if (!algo_ok) {
                        pr_err("template does not support hash alg\n");
                        return 1;
                }
        }

        template_setup_done = 1;
        ima_template = desc;
        return 1;
}
__setup("ima_template=", ima_template_setup);

/*
 * Handler for the "ima_template_fmt=" boot parameter.
 *
 * If no template has been chosen on the command line yet and @str is a
 * valid format string, stores it in the placeholder (last) entry of
 * builtin_templates and makes that entry the current template.  Always
 * returns 1.
 */
static int __init ima_template_fmt_setup(char *str)
{
        struct ima_template_desc *custom =
                &builtin_templates[ARRAY_SIZE(builtin_templates) - 1];

        if (template_setup_done)
                return 1;

        /* Validation only: no field array is requested. */
        if (template_desc_init_fields(str, NULL, NULL) < 0) {
                pr_err("format string '%s' not valid, using template %s\n",
                       str, CONFIG_IMA_DEFAULT_TEMPLATE);
                return 1;
        }

        custom->fmt = str;
        ima_template = custom;
        template_setup_done = 1;

        return 1;
}
__setup("ima_template_fmt=", ima_template_fmt_setup);

/*
 * Find the first descriptor on defined_templates whose name or format
 * string equals @name.  The list is walked under rcu_read_lock().
 *
 * Returns the matching descriptor, or NULL if there is none.
 */
struct ima_template_desc *lookup_template_desc(const char *name)
{
        struct ima_template_desc *cur;
        struct ima_template_desc *match = NULL;

        rcu_read_lock();
        list_for_each_entry_rcu(cur, &defined_templates, list) {
                if (strcmp(cur->name, name) != 0 &&
                    strcmp(cur->fmt, name) != 0)
                        continue;

                match = cur;
                break;
        }
        rcu_read_unlock();

        return match;
}

static const struct ima_template_field *
lookup_template_field(const char *field_id)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(supported_fields); i++)
                if (strncmp(supported_fields[i].field_id, field_id,
                            IMA_TEMPLATE_FIELD_ID_MAX_LEN) == 0)
                        return &supported_fields[i];
        return NULL;
}

/*
 * Count the fields in a '|'-separated template format string.
 *
 * The result is the number of '|' separators plus one, so an empty string
 * counts as one field.
 */
static int template_fmt_size(const char *template_fmt)
{
        const char *p;
        int num_fields = 1;

        for (p = template_fmt; *p != '\0'; p++) {
                if (*p == '|')
                        num_fields++;
        }

        return num_fields;
}

/*
 * Parse a '|'-separated template format string into an array of field
 * descriptors.
 *
 * @template_fmt: format string, e.g. "d-ng|n-ng".
 * @fields:       where to store the newly allocated array of field
 *                pointers; may be NULL.
 * @num_fields:   where to store the number of fields; may be NULL.
 *
 * If @num_fields already holds a positive count the descriptor is taken to
 * be initialized and nothing is done.  When @fields or @num_fields is NULL
 * the format string is only validated and nothing is allocated.
 *
 * Returns 0 on success, -EINVAL for too many fields or a field identifier
 * that is empty or too long, -ENOENT for an unknown field identifier, and
 * -ENOMEM if the array cannot be allocated.
 */
int template_desc_init_fields(const char *template_fmt,
                              const struct ima_template_field ***fields,
                              int *num_fields)
{
        const struct ima_template_field *resolved[IMA_TEMPLATE_NUM_FIELDS_MAX];
        const char *cursor = template_fmt;
        int count;
        int idx;

        if (num_fields && *num_fields > 0) /* already initialized? */
                return 0;

        count = template_fmt_size(template_fmt);
        if (count > IMA_TEMPLATE_NUM_FIELDS_MAX) {
                pr_err("format string '%s' contains too many fields\n",
                       template_fmt);
                return -EINVAL;
        }

        for (idx = 0; idx < count; idx++) {
                char id[IMA_TEMPLATE_FIELD_ID_MAX_LEN + 1];
                int id_len = strchrnul(cursor, '|') - cursor;

                if (id_len == 0 || id_len > IMA_TEMPLATE_FIELD_ID_MAX_LEN) {
                        pr_err("Invalid field with length %d\n", id_len);
                        return -EINVAL;
                }

                /* Work on a NUL-terminated copy of this one identifier. */
                memcpy(id, cursor, id_len);
                id[id_len] = '\0';

                resolved[idx] = lookup_template_field(id);
                if (!resolved[idx]) {
                        pr_err("field '%s' not found\n", id);
                        return -ENOENT;
                }

                /* Step over the identifier and the separator after it. */
                cursor += id_len + 1;
        }

        /* Validation-only call: nothing to hand back. */
        if (!fields || !num_fields)
                return 0;

        *fields = kmalloc_array(idx, sizeof(**fields), GFP_KERNEL);
        if (*fields == NULL)
                return -ENOMEM;

        memcpy(*fields, resolved, idx * sizeof(**fields));
        *num_fields = idx;

        return 0;
}

void ima_init_template_list(void)
{
        int i;

        if (!list_empty(&defined_templates))
                return;

        spin_lock(&template_list);
        for (i = 0; i < ARRAY_SIZE(builtin_templates); i++) {
                list_add_tail_rcu(&builtin_templates[i].list,
                                  &defined_templates);
        }
        spin_unlock(&template_list);
}

/*
 * Return the template currently in use.
 *
 * If none has been selected yet, make sure the template list is populated
 * and fall back to the descriptor found for CONFIG_IMA_DEFAULT_TEMPLATE,
 * which is then remembered for later calls.
 */
struct ima_template_desc *ima_template_desc_current(void)
{
        if (ima_template)
                return ima_template;

        ima_init_template_list();
        ima_template = lookup_template_desc(CONFIG_IMA_DEFAULT_TEMPLATE);

        return ima_template;
}

/*
 * Return the "ima-buf" template descriptor.
 *
 * The first call makes sure the template list is populated and looks the
 * descriptor up; the result is remembered for later calls.  May return
 * NULL if the lookup finds nothing.
 */
struct ima_template_desc *ima_template_desc_buf(void)
{
        if (ima_buf_template)
                return ima_buf_template;

        ima_init_template_list();
        ima_buf_template = lookup_template_desc("ima-buf");

        return ima_buf_template;
}

/*
 * Initialize the field arrays of the current template and of the
 * "ima-buf" template.
 *
 * Returns the result of the last template_desc_init_fields() call made
 * (negative on failure), or -EINVAL if the "ima-buf" template cannot be
 * found.
 */
int __init ima_init_template(void)
{
        struct ima_template_desc *desc;
        int rc;

        /* The template selected for this boot comes first. */
        desc = ima_template_desc_current();
        rc = template_desc_init_fields(desc->fmt, &desc->fields,
                                       &desc->num_fields);
        if (rc < 0) {
                pr_err("template %s init failed, result: %d\n",
                       (strlen(desc->name) ?
                       desc->name : desc->fmt), rc);
                return rc;
        }

        /* Then the "ima-buf" template. */
        desc = ima_template_desc_buf();
        if (!desc) {
                pr_err("Failed to get ima-buf template\n");
                return -EINVAL;
        }

        rc = template_desc_init_fields(desc->fmt, &desc->fields,
                                       &desc->num_fields);
        if (rc >= 0)
                return rc;

        pr_err("template %s init failed, result: %d\n",
               (strlen(desc->name) ?
               desc->name : desc->fmt), rc);

        return rc;
}

static struct ima_template_desc *restore_template_fmt(char *template_name)
{
        struct ima_template_desc *template_desc = NULL;
        int ret;

        ret = template_desc_init_fields(template_name, NULL, NULL);
        if (ret < 0) {
                pr_err("attempting to initialize the template \"%s\" failed\n",
                        template_name);
                goto out;
        }

        template_desc = kzalloc(sizeof(*template_desc), GFP_KERNEL);
        if (!template_desc)
                goto out;

        template_desc->name = "";
        template_desc->fmt = kstrdup(template_name, GFP_KERNEL);
        if (!template_desc->fmt) {
                kfree(template_desc);
                template_desc = NULL;
                goto out;
        }

        spin_lock(&template_list);
        list_add_tail_rcu(&template_desc->list, &defined_templates);
        spin_unlock(&template_list);
out:
        return template_desc;
}

/*
 * Build a template entry from serialized template data.
 *
 * @template_desc:      descriptor giving the number of fields to expect.
 * @template_data:      start of the serialized field data.
 * @template_data_size: size of that data in bytes.
 * @entry:              receives the newly allocated entry on success.
 *
 * The buffer is parsed with ima_parse_buf(), then each field's data is
 * copied into its own zero-initialized allocation (one byte longer than
 * the field), so the entry does not keep pointers into @template_data.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the negative
 * error returned by ima_parse_buf().  On every failure *@entry is set to
 * NULL so the caller is never left with a dangling pointer.
 */
static int ima_restore_template_data(struct ima_template_desc *template_desc,
                                     void *template_data,
                                     int template_data_size,
                                     struct ima_template_entry **entry)
{
        struct tpm_digest *digests;
        int ret = 0;
        int i;

        *entry = kzalloc(struct_size(*entry, template_data,
                                     template_desc->num_fields), GFP_NOFS);
        if (!*entry)
                return -ENOMEM;

        digests = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots,
                          sizeof(*digests), GFP_NOFS);
        if (!digests) {
                ret = -ENOMEM;
                goto out_free_entry;
        }

        (*entry)->digests = digests;

        ret = ima_parse_buf(template_data, template_data + template_data_size,
                            NULL, template_desc->num_fields,
                            (*entry)->template_data, NULL, NULL,
                            ENFORCE_FIELDS | ENFORCE_BUFEND, "template data");
        if (ret < 0)
                goto out_free_digests;

        (*entry)->template_desc = template_desc;
        for (i = 0; i < template_desc->num_fields; i++) {
                struct ima_field_data *field_data = &(*entry)->template_data[i];
                /* Still points into the caller's buffer at this point. */
                u8 *data = field_data->data;

                field_data->data = kzalloc(field_data->len + 1, GFP_KERNEL);
                if (!field_data->data) {
                        ret = -ENOMEM;
                        break;
                }
                memcpy(field_data->data, data, field_data->len);
                (*entry)->template_data_len += sizeof(field_data->len);
                (*entry)->template_data_len += field_data->len;
        }

        if (ret < 0) {
                /*
                 * Fields from index i onwards were never replaced by a
                 * private copy and still point into @template_data.
                 * NOTE(review): ima_free_template_entry() is defined
                 * outside this file and is assumed to kfree() every
                 * field's data pointer; clear the uncopied ones so that
                 * only memory allocated above can be freed.
                 */
                for (; i < template_desc->num_fields; i++)
                        (*entry)->template_data[i].data = NULL;

                ima_free_template_entry(*entry);
                *entry = NULL;
        }

        return ret;

out_free_digests:
        kfree((*entry)->digests);
out_free_entry:
        kfree(*entry);
        *entry = NULL;
        return ret;
}

/**
 * ima_restore_measurement_list - restore a serialized binary measurement list
 * @size: number of bytes available at @buf
 * @buf: serialized list, starting with a struct ima_kexec_hdr
 *
 * Restore the serialized binary measurement list without extending PCRs.
 *
 * The header is validated first, then the entries that follow it are walked.
 * For each entry the template descriptor is looked up by name (or created via
 * restore_template_fmt() when unknown), the template data is unpacked into a
 * new entry by ima_restore_template_data(), and the entry is handed to
 * ima_restore_measurement_entry().
 *
 * When ima_canonical_fmt is set the header fields are converted from
 * little-endian in place, so the header inside @buf is modified.
 *
 * Return: 0 when @buf is NULL or too small to hold a header, or when the walk
 * finished without error; a negative errno on the first failure.
 */
int ima_restore_measurement_list(loff_t size, void *buf)
{
        char template_name[MAX_TEMPLATE_NAME_LEN];
        unsigned char zero[TPM_DIGEST_SIZE] = { 0 };

        struct ima_kexec_hdr *khdr = buf;
        /* PCR and digest have fixed lengths; the other fields carry their own. */
        struct ima_field_data hdr[HDR__LAST] = {
                [HDR_PCR] = {.len = sizeof(u32)},
                [HDR_DIGEST] = {.len = TPM_DIGEST_SIZE},
        };

        void *bufp;
        void *bufendp;
        struct ima_template_entry *entry;
        struct ima_template_desc *template_desc;
        DECLARE_BITMAP(hdr_mask, HDR__LAST);
        unsigned long count = 0;
        int ret = 0;

        if (!buf || size < sizeof(*khdr))
                return 0;

        if (ima_canonical_fmt) {
                khdr->version = le16_to_cpu((__force __le16)khdr->version);
                khdr->count = le64_to_cpu((__force __le64)khdr->count);
                khdr->buffer_size = le64_to_cpu((__force __le64)khdr->buffer_size);
        }

        if (khdr->version != 1) {
                pr_err("attempting to restore a incompatible measurement list\n");
                return -EINVAL;
        }

        if (khdr->count > ULONG_MAX - 1) {
                pr_err("attempting to restore too many measurements\n");
                return -EINVAL;
        }

        /*
         * buffer_size comes from the buffer itself; never let it move the
         * parse limit past the memory the caller actually handed over.
         */
        if (khdr->buffer_size > (unsigned long long)size) {
                pr_err("attempting to restore a measurement list that exceeds the buffer\n");
                return -EINVAL;
        }

        /* Fields flagged in hdr_mask are the fixed-length ones set up above. */
        bitmap_zero(hdr_mask, HDR__LAST);
        bitmap_set(hdr_mask, HDR_PCR, 1);
        bitmap_set(hdr_mask, HDR_DIGEST, 1);

        /*
         * ima kexec buffer prefix: version, buffer size, count
         * v1 format: pcr, digest, template-name-len, template-name,
         *              template-data-size, template-data
         */
        bufp = buf + sizeof(*khdr);
        bufendp = buf + khdr->buffer_size;
        while ((bufp < bufendp) && (count++ < khdr->count)) {
                int enforce_mask = ENFORCE_FIELDS;

                /* The last entry must end exactly at the end of the buffer. */
                enforce_mask |= (count == khdr->count) ? ENFORCE_BUFEND : 0;
                ret = ima_parse_buf(bufp, bufendp, &bufp, HDR__LAST, hdr, NULL,
                                    hdr_mask, enforce_mask, "entry header");
                if (ret < 0)
                        break;

                if (hdr[HDR_TEMPLATE_NAME].len >= MAX_TEMPLATE_NAME_LEN) {
                        pr_err("attempting to restore a template name that is too long\n");
                        ret = -EINVAL;
                        break;
                }

                /* template name is not null terminated */
                memcpy(template_name, hdr[HDR_TEMPLATE_NAME].data,
                       hdr[HDR_TEMPLATE_NAME].len);
                template_name[hdr[HDR_TEMPLATE_NAME].len] = 0;

                if (strcmp(template_name, "ima") == 0) {
                        pr_err("attempting to restore an unsupported template \"%s\" failed\n",
                               template_name);
                        ret = -EINVAL;
                        break;
                }

                template_desc = lookup_template_desc(template_name);
                if (!template_desc) {
                        template_desc = restore_template_fmt(template_name);
                        if (!template_desc) {
                                /*
                                 * ret is still 0 from the successful header
                                 * parse; report the failure instead of
                                 * returning success.
                                 */
                                ret = -EINVAL;
                                break;
                        }
                }

                /*
                 * Only the running system's template format is initialized
                 * on boot.  As needed, initialize the other template formats.
                 */
                ret = template_desc_init_fields(template_desc->fmt,
                                                &(template_desc->fields),
                                                &(template_desc->num_fields));
                if (ret < 0) {
                        pr_err("attempting to restore the template fmt \"%s\" failed\n",
                               template_desc->fmt);
                        ret = -EINVAL;
                        break;
                }

                ret = ima_restore_template_data(template_desc,
                                                hdr[HDR_TEMPLATE_DATA].data,
                                                hdr[HDR_TEMPLATE_DATA].len,
                                                &entry);
                if (ret < 0)
                        break;

                /* The hash is only computed when the stored digest is non-zero. */
                if (memcmp(hdr[HDR_DIGEST].data, zero, sizeof(zero))) {
                        ret = ima_calc_field_array_hash(
                                                &entry->template_data[0],
                                                entry);
                        if (ret < 0) {
                                /*
                                 * NOTE(review): entry is not released here,
                                 * nor when the restore call below fails; it
                                 * looks leaked on both paths - confirm
                                 * ownership and free it if so.
                                 */
                                pr_err("cannot calculate template digest\n");
                                ret = -EINVAL;
                                break;
                        }
                }

                /* The PCR is stored little-endian in canonical format. */
                entry->pcr = !ima_canonical_fmt ? *(u32 *)(hdr[HDR_PCR].data) :
                             le32_to_cpu(*(__le32 *)(hdr[HDR_PCR].data));
                ret = ima_restore_measurement_entry(entry);
                if (ret < 0)
                        break;

        }
        return ret;
}









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 








    1 

    1 


    1 









    1 





    1 







    1 










    1 





















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
11112
11113
11114
11115
11116
11117
11118
11119
11120
11121
11122
11123
11124
11125
11126
11127
11128
11129
11130
11131
11132
11133
11134
11135
11136
11137
11138
11139
11140
11141
11142
11143
11144
11145
11146
11147
11148
11149
11150
11151
11152
11153
11154
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165
11166
11167
11168
11169
11170
11171
11172
11173
11174
11175
11176
11177
11178
11179
11180
11181
11182
11183
11184
11185
11186
11187
11188
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
11202
11203
11204
11205
11206
11207
11208
11209
11210
11211
11212
11213
11214
11215
11216
11217
11218
11219
11220
11221
11222
11223
11224
11225
11226
11227
11228
11229
11230
11231
11232
11233
11234
11235
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245
11246
11247
11248
11249
11250
11251
11252
11253
11254
11255
11256
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343
11344
11345
11346
11347
11348
11349
11350
11351
11352
11353
11354
11355
11356
11357
11358
11359
11360
11361
11362
11363
11364
11365
11366
11367
11368
11369
11370
11371
11372
11373
11374
11375
11376
11377
11378
11379
11380
11381
11382
11383
11384
11385
11386
11387
11388
11389
11390
11391
11392
11393
11394
11395
11396
11397
11398
11399
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409
11410
11411
11412
11413
11414
11415
11416
11417
11418
11419
11420
11421
11422
11423
11424
11425
11426
11427
11428
11429
11430
11431
11432
11433
11434
11435
11436
11437
11438
11439
11440
11441
11442
11443
11444
11445
11446
11447
11448
11449
11450
11451
11452
11453
11454
11455
11456
11457
11458
11459
11460
11461
11462
11463
11464
11465
11466
11467
11468
11469
11470
11471
11472
11473
11474
11475
11476
11477
11478
11479
11480
11481
11482
11483
11484
11485
11486
11487
11488
11489
11490
11491
11492
11493
11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510
11511
11512
11513
11514
11515
11516
11517
11518
11519
11520
11521
11522
11523
11524
11525
11526
11527
11528
11529
11530
11531
11532
11533
11534
11535
11536
11537
11538
11539
11540
11541
11542
11543
11544
11545
11546
11547
11548
11549
11550
11551
11552
11553
11554
11555
11556
11557
11558
11559
11560
11561
11562
11563
11564
11565
11566
11567
11568
11569
11570
11571
11572
11573
11574
11575
11576
11577
11578
11579
11580
11581
11582
11583
11584
11585
11586
11587
11588
11589
11590
11591
11592
11593
11594
11595
11596
11597
11598
11599
11600
11601
11602
11603
11604
11605
11606
11607
11608
11609
11610
11611
11612
11613
11614
11615
11616
11617
11618
11619
11620
11621
11622
11623
11624
11625
11626
11627
11628
11629
11630
11631
11632
11633
11634
11635
11636
11637
11638
11639
11640
11641
11642
11643
11644
11645
11646
11647
11648
11649
11650
11651
11652
11653
11654
11655
11656
11657
11658
11659
11660
11661
11662
11663
11664
11665
11666
11667
11668
11669
11670
11671
11672
11673
11674
11675
11676
11677
11678
11679
11680
11681
11682
11683
11684
11685
11686
11687
11688
11689
11690
11691
11692
11693
11694
11695
11696
11697
11698
11699
11700
11701
11702
11703
11704
11705
11706
11707
11708
11709
11710
11711
11712
11713
11714
11715
11716
11717
11718
11719
11720
11721
11722
11723
11724
11725
11726
11727
11728
11729
11730
11731
11732
11733
11734
11735
11736
11737
11738
11739
11740
11741
11742
11743
11744
11745
11746
11747
11748
11749
11750
11751
11752
11753
11754
11755
11756
11757
11758
11759
11760
11761
11762
11763
11764
11765
11766
11767
11768
11769
11770
11771
11772
11773
11774
11775
11776
11777
11778
11779
11780
11781
11782
11783
11784
11785
11786
11787
11788
11789
11790
11791
11792
11793
11794
11795
11796
11797
11798
11799
11800
11801
11802
11803
11804
11805
11806
11807
11808
11809
11810
11811
11812
11813
11814
11815
11816
11817
11818
11819
11820
11821
11822
11823
11824
11825
11826
11827
11828
11829
11830
11831
11832
11833
11834
11835
11836
11837
11838
11839
11840
11841
11842
11843
11844
11845
11846
11847
11848
11849
11850
11851
11852
11853
11854
11855
11856
11857
11858
11859
11860
11861
11862
11863
11864
11865
11866
11867
11868
11869
11870
11871
11872
11873
11874
11875
11876
11877
11878
11879
11880
11881
11882
11883
11884
11885
11886
11887
11888
11889
11890
11891
11892
11893
11894
11895
11896
11897
11898
11899
11900
11901
11902
11903
11904
11905
11906
11907
11908
11909
11910
11911
11912
11913
11914
11915
11916
11917
11918
11919
11920
11921
11922
11923
11924
11925
11926
11927
11928
11929
11930
11931
11932
11933
11934
11935
11936
11937
11938
11939
11940
11941
11942
11943
11944
11945
11946
11947
11948
11949
11950
11951
11952
11953
11954
11955
11956
11957
11958
11959
11960
11961
11962
11963
11964
11965
11966
11967
11968
11969
11970
11971
11972
11973
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985
11986
11987
11988
11989
11990
11991
11992
11993
11994
11995
11996
11997
11998
11999
12000
12001
12002
12003
12004
12005
12006
12007
12008
12009
12010
12011
12012
12013
12014
12015
12016
12017
12018
12019
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031
12032
12033
12034
12035
12036
12037
12038
12039
12040
12041
12042
12043
12044
12045
12046
12047
12048
12049
12050
12051
12052
12053
12054
12055
12056
12057
12058
12059
12060
12061
12062
12063
12064
12065
12066
12067
12068
12069
12070
12071
12072
12073
12074
12075
12076
12077
12078
12079
12080
12081
12082
12083
12084
12085
12086
12087
12088
12089
12090
12091
12092
12093
12094
12095
12096
12097
12098
12099
12100
12101
12102
12103
12104
12105
12106
12107
12108
12109
12110
12111
12112
12113
12114
12115
12116
12117
12118
12119
12120
12121
12122
12123
12124
12125
12126
12127
12128
12129
12130
12131
12132
12133
12134
12135
12136
12137
12138
12139
12140
12141
12142
12143
12144
12145
12146
12147
12148
12149
12150
12151
12152
12153
12154
12155
12156
12157
12158
12159
12160
12161
12162
12163
12164
12165
12166
12167
12168
12169
12170
12171
12172
12173
12174
12175
12176
12177
12178
12179
12180
12181
12182
12183
12184
12185
12186
12187
12188
12189
12190
12191
12192
12193
12194
12195
12196
12197
12198
12199
12200
12201
12202
12203
12204
12205
12206
12207
12208
12209
12210
12211
12212
12213
12214
12215
12216
12217
12218
12219
12220
12221
12222
12223
12224
12225
12226
12227
12228
12229
12230
12231
12232
12233
12234
12235
12236
12237
12238
12239
12240
12241
12242
12243
12244
12245
12246
12247
12248
12249
12250
12251
12252
12253
12254
12255
12256
12257
12258
12259
12260
12261
12262
12263
12264
12265
12266
12267
12268
12269
12270
12271
12272
12273
12274
12275
12276
12277
12278
12279
12280
12281
12282
12283
12284
12285
12286
12287
12288
12289
12290
12291
12292
12293
12294
12295
12296
12297
12298
12299
12300
12301
12302
12303
12304
12305
12306
12307
12308
12309
12310
12311
12312
12313
12314
12315
12316
12317
12318
12319
12320
12321
12322
12323
12324
12325
12326
12327
12328
12329
12330
12331
12332
12333
12334
12335
12336
12337
12338
12339
12340
12341
12342
12343
12344
12345
12346
12347
12348
12349
12350
12351
12352
12353
12354
12355
12356
12357
12358
12359
12360
12361
12362
12363
12364
12365
12366
12367
12368
12369
12370
12371
12372
12373
12374
12375
12376
12377
12378
12379
12380
12381
12382
12383
12384
12385
12386
12387
12388
12389
12390
12391
12392
12393
12394
12395
12396
12397
12398
12399
12400
12401
12402
12403
12404
12405
12406
12407
12408
12409
12410
12411
12412
12413
12414
12415
12416
12417
12418
12419
12420
12421
12422
12423
12424
12425
12426
12427
12428
12429
12430
12431
12432
12433
12434
12435
12436
12437
12438
12439
12440
12441
12442
12443
12444
12445
12446
12447
12448
12449
12450
12451
12452
12453
12454
12455
12456
12457
12458
12459
12460
12461
12462
12463
12464
12465
12466
12467
12468
12469
12470
12471
12472
12473
12474
12475
12476
12477
12478
12479
12480
12481
12482
12483
12484
12485
12486
12487
12488
12489
12490
12491
12492
12493
12494
12495
12496
12497
12498
12499
12500
12501
12502
12503
12504
12505
12506
12507
12508
12509
12510
12511
12512
12513
12514
12515
12516
12517
12518
12519
12520
12521
12522
12523
12524
12525
12526
12527
12528
12529
12530
12531
12532
12533
12534
12535
12536
12537
12538
12539
12540
12541
12542
12543
12544
12545
12546
12547
12548
12549
12550
12551
12552
12553
12554
12555
12556
12557
12558
12559
12560
12561
12562
12563
12564
12565
12566
12567
12568
12569
12570
12571
12572
12573
12574
12575
12576
12577
12578
12579
12580
12581
12582
12583
12584
12585
12586
12587
12588
12589
12590
12591
12592
12593
12594
12595
12596
12597
12598
12599
12600
12601
12602
12603
12604
12605
12606
12607
12608
12609
12610
12611
12612
12613
12614
12615
12616
12617
12618
12619
12620
12621
12622
12623
12624
12625
12626
12627
12628
12629
12630
12631
12632
12633
12634
12635
12636
12637
12638
12639
12640
12641
12642
12643
12644
12645
12646
12647
12648
12649
12650
12651
12652
12653
12654
12655
12656
12657
12658
12659
12660
12661
12662
12663
12664
12665
12666
12667
12668
12669
12670
12671
12672
12673
12674
12675
12676
12677
12678
12679
12680
12681
12682
12683
12684
12685
12686
12687
12688
12689
12690
12691
12692
12693
12694
12695
12696
12697
12698
12699
12700
12701
12702
12703
12704
12705
12706
12707
12708
12709
12710
12711
12712
12713
12714
12715
12716
12717
12718
12719
12720
12721
12722
12723
12724
12725
12726
12727
12728
12729
12730
12731
12732
12733
12734
12735
12736
12737
12738
12739
12740
12741
12742
12743
12744
12745
12746
12747
12748
12749
12750
12751
12752
12753
12754
12755
12756
12757
12758
12759
12760
12761
12762
12763
12764
12765
12766
12767
12768
12769
12770
12771
12772
12773
12774
12775
12776
12777
12778
12779
12780
12781
12782
12783
12784
12785
12786
12787
12788
12789
12790
12791
12792
12793
12794
12795
12796
12797
12798
12799
12800
12801
12802
12803
12804
12805
12806
12807
12808
12809
12810
12811
12812
12813
12814
12815
12816
12817
12818
12819
12820
12821
12822
12823
12824
12825
12826
12827
12828
12829
12830
12831
12832
12833
12834
12835
12836
12837
12838
12839
12840
12841
12842
12843
12844
12845
12846
12847
12848
12849
12850
12851
12852
12853
12854
12855
12856
12857
12858
12859
12860
12861
12862
12863
12864
12865
12866
12867
12868
12869
12870
12871
12872
12873
12874
12875
12876
12877
12878
12879
12880
12881
12882
12883
12884
12885
12886
12887
12888
12889
12890
12891
12892
12893
12894
12895
12896
12897
12898
12899
12900
12901
12902
12903
12904
12905
12906
12907
12908
12909
12910
12911
12912
12913
12914
12915
12916
12917
12918
12919
12920
12921
12922
12923
12924
12925
12926
12927
12928
12929
12930
12931
12932
12933
12934
12935
12936
12937
12938
12939
12940
12941
12942
12943
12944
12945
12946
12947
12948
12949
12950
12951
12952
12953
12954
12955
12956
12957
12958
12959
12960
12961
12962
12963
12964
12965
12966
12967
12968
12969
12970
12971
12972
12973
12974
12975
12976
12977
12978
12979
12980
12981
12982
12983
12984
12985
12986
12987
12988
12989
12990
12991
12992
12993
12994
12995
12996
12997
12998
12999
13000
13001
13002
13003
13004
13005
13006
13007
13008
13009
13010
13011
13012
13013
13014
13015
13016
13017
13018
13019
13020
13021
13022
13023
13024
13025
13026
13027
13028
13029
13030
13031
13032
13033
13034
13035
13036
13037
13038
13039
13040
13041
13042
13043
13044
13045
13046
13047
13048
13049
13050
13051
13052
13053
13054
13055
13056
13057
13058
13059
13060
13061
13062
13063
13064
13065
13066
13067
13068
13069
13070
13071
13072
13073
13074
13075
13076
13077
13078
13079
13080
13081
13082
13083
13084
13085
13086
13087
13088
13089
13090
13091
13092
13093
13094
13095
13096
13097
13098
13099
13100
13101
13102
13103
13104
13105
13106
13107
13108
13109
13110
13111
13112
13113
13114
13115
13116
13117
13118
13119
13120
13121
13122
13123
13124
13125
13126
13127
13128
13129
13130
13131
13132
13133
13134
13135
13136
13137
13138
13139
13140
13141
13142
13143
13144
13145
13146
13147
13148
13149
13150
13151
13152
13153
13154
13155
13156
13157
13158
13159
13160
13161
13162
13163
13164
13165
13166
13167
13168
13169
13170
13171
13172
13173
13174
13175
13176
13177
13178
13179
13180
13181
13182
13183
13184
13185
13186
13187
13188
13189
13190
13191
13192
13193
13194
13195
13196
13197
13198
13199
13200
13201
13202
13203
13204
13205
13206
13207
13208
13209
13210
13211
13212
13213
13214
13215
13216
13217
13218
13219
13220
13221
13222
13223
13224
13225
13226
13227
13228
13229
13230
13231
13232
13233
13234
13235
13236
13237
13238
13239
13240
13241
13242
13243
13244
13245
13246
13247
13248
13249
13250
13251
13252
13253
13254
13255
13256
13257
13258
13259
13260
13261
13262
13263
13264
13265
13266
13267
13268
13269
13270
13271
13272
13273
13274
13275
13276
13277
13278
13279
13280
13281
13282
13283
13284
13285
13286
13287
13288
13289
13290
13291
13292
13293
13294
13295
13296
13297
13298
13299
13300
13301
13302
13303
13304
13305
13306
13307
13308
13309
13310
13311
13312
13313
13314
13315
13316
13317
13318
13319
13320
13321
13322
13323
13324
13325
13326
13327
13328
13329
13330
13331
13332
13333
13334
13335
13336
13337
13338
13339
13340
13341
13342
13343
13344
13345
13346
13347
13348
13349
13350
13351
13352
13353
13354
13355
13356
13357
13358
13359
13360
13361
13362
13363
13364
13365
13366
13367
13368
13369
13370
13371
13372
13373
13374
13375
13376
13377
13378
13379
13380
13381
13382
13383
13384
13385
13386
13387
13388
13389
13390
13391
13392
13393
13394
13395
13396
13397
13398
13399
13400
13401
13402
13403
13404
13405
13406
13407
13408
13409
13410
13411
13412
13413
13414
13415
13416
13417
13418
13419
13420
13421
13422
13423
13424
13425
13426
13427
13428
13429
13430
13431
13432
13433
13434
13435
13436
13437
13438
13439
13440
13441
13442
13443
13444
13445
13446
13447
13448
13449
13450
13451
13452
13453
13454
13455
13456
13457
13458
13459
13460
13461
13462
13463
13464
13465
13466
13467
13468
13469
13470
13471
13472
13473
13474
13475
13476
13477
13478
13479
13480
13481
13482
13483
13484
13485
13486
13487
13488
13489
13490
13491
13492
13493
13494
13495
13496
13497
13498
13499
13500
13501
13502
13503
13504
13505
13506
13507
13508
13509
13510
13511
13512
13513
13514
13515
13516
13517
13518
13519
13520
13521
13522
13523
13524
13525
13526
13527
13528
13529
13530
13531
13532
13533
13534
13535
13536
13537
13538
13539
13540
13541
13542
13543
13544
13545
13546
13547
13548
13549
13550
13551
13552
13553
13554
13555
13556
13557
13558
13559
13560
13561
13562
13563
13564
13565
13566
13567
13568
13569
13570
13571
13572
13573
13574
13575
13576
13577
13578
13579
13580
13581
13582
13583
13584
13585
13586
13587
13588
13589
13590
13591
13592
13593
13594
13595
13596
13597
13598
13599
13600
13601
13602
13603
13604
13605
13606
13607
13608
13609
13610
13611
13612
13613
13614
13615
13616
13617
13618
13619
13620
13621
13622
13623
13624
13625
13626
13627
13628
13629
13630
13631
13632
13633
13634
13635
13636
13637
13638
13639
13640
13641
13642
13643
13644
13645
13646
13647
13648
13649
13650
13651
13652
13653
13654
13655
13656
13657
13658
13659
13660
13661
13662
13663
13664
13665
13666
13667
13668
13669
13670
13671
13672
13673
13674
13675
13676
13677
13678
13679
13680
13681
13682
13683
13684
13685
13686
13687
13688
13689
13690
13691
13692
13693
13694
13695
13696
13697
13698
13699
13700
13701
13702
13703
13704
13705
13706
13707
13708
13709
13710
13711
13712
13713
13714
13715
13716
13717
13718
13719
13720
13721
13722
13723
13724
13725
13726
13727
13728
13729
13730
13731
13732
13733
13734
13735
13736
13737
13738
13739
13740
13741
13742
13743
13744
13745
13746
13747
13748
13749
13750
13751
13752
13753
13754
13755
13756
13757
13758
13759
13760
13761
13762
13763
13764
13765
13766
13767
13768
13769
13770
13771
13772
13773
13774
13775
13776
13777
13778
13779
13780
13781
13782
13783
13784
13785
13786
13787
13788
13789
13790
13791
13792
13793
13794
13795
13796
13797
13798
13799
13800
13801
13802
13803
13804
13805
13806
13807
13808
13809
13810
13811
13812
13813
13814
13815
13816
13817
13818
13819
13820
13821
13822
13823
13824
13825
13826
13827
13828
13829
13830
13831
13832
13833
13834
13835
13836
13837
13838
13839
13840
13841
13842
13843
13844
13845
13846
13847
13848
13849
13850
13851
13852
13853
13854
13855
13856
13857
13858
13859
13860
13861
13862
13863
13864
13865
13866
13867
13868
13869
13870
13871
13872
13873
13874
13875
13876
13877
13878
13879
13880
13881
13882
13883
13884
13885
13886
13887
13888
13889
13890
13891
13892
13893
13894
13895
13896
13897
13898
13899
13900
13901
13902
13903
13904
13905
13906
13907
13908
13909
13910
13911
13912
13913
13914
13915
13916
13917
13918
13919
13920
13921
13922
13923
13924
13925
13926
13927
13928
13929
13930
13931
13932
13933
13934
13935
13936
13937
13938
13939
13940
13941
13942
13943
13944
13945
13946
13947
13948
13949
13950
13951
13952
13953
13954
13955
13956
13957
13958
13959
13960
13961
13962
13963
13964
13965
13966
13967
13968
13969
13970
13971
13972
13973
13974
13975
13976
13977
13978
13979
13980
13981
13982
13983
13984
13985
13986
13987
13988
13989
13990
13991
13992
13993
13994
13995
13996
13997
13998
13999
14000
14001
14002
14003
14004
14005
14006
14007
14008
14009
14010
14011
14012
14013
14014
14015
14016
14017
14018
14019
14020
14021
14022
14023
14024
14025
14026
14027
14028
14029
14030
14031
14032
14033
14034
14035
14036
14037
14038
14039
14040
14041
14042
14043
14044
14045
14046
14047
14048
14049
14050
14051
14052
14053
14054
14055
14056
14057
14058
14059
14060
14061
14062
14063
14064
14065
14066
14067
14068
14069
14070
14071
14072
14073
14074
14075
14076
14077
14078
14079
14080
14081
14082
14083
14084
14085
14086
14087
14088
14089
14090
14091
14092
14093
14094
14095
14096
14097
14098
14099
14100
14101
14102
14103
14104
14105
14106
14107
14108
14109
14110
14111
14112
14113
14114
14115
14116
14117
14118
14119
14120
14121
14122
14123
14124
14125
14126
14127
14128
14129
14130
14131
14132
14133
14134
14135
14136
14137
14138
14139
14140
14141
14142
14143
14144
14145
14146
14147
14148
14149
14150
14151
14152
14153
14154
14155
14156
14157
14158
14159
14160
14161
14162
14163
14164
14165
14166
14167
14168
14169
14170
14171
14172
14173
14174
14175
14176
14177
14178
14179
14180
14181
14182
14183
14184
14185
14186
14187
14188
14189
14190
14191
14192
14193
14194
14195
14196
14197
14198
14199
14200
14201
14202
14203
14204
14205
14206
14207
14208
14209
14210
14211
14212
14213
14214
14215
14216
14217
14218
14219
14220
14221
14222
14223
14224
14225
14226
14227
14228
14229
14230
14231
14232
14233
14234
14235
14236
14237
14238
14239
14240
14241
14242
14243
14244
14245
14246
14247
14248
14249
14250
14251
14252
14253
14254
14255
14256
14257
14258
14259
14260
14261
14262
14263
14264
14265
14266
14267
14268
14269
14270
14271
14272
14273
14274
14275
14276
14277
14278
14279
14280
14281
14282
14283
14284
14285
14286
14287
14288
14289
14290
14291
14292
14293
14294
14295
14296
14297
14298
14299
14300
14301
14302
14303
14304
14305
14306
14307
14308
14309
14310
14311
14312
14313
14314
14315
14316
14317
14318
14319
14320
14321
14322
14323
14324
14325
14326
14327
14328
14329
14330
14331
14332
14333
14334
14335
14336
14337
14338
14339
14340
14341
14342
14343
14344
14345
14346
14347
14348
14349
14350
14351
14352
14353
14354
14355
14356
14357
14358
14359
14360
14361
14362
14363
14364
14365
14366
14367
14368
14369
14370
14371
14372
14373
14374
14375
14376
14377
14378
14379
14380
14381
14382
14383
14384
14385
14386
14387
14388
14389
14390
14391
14392
14393
14394
14395
14396
14397
14398
14399
14400
14401
14402
14403
14404
14405
14406
14407
14408
14409
14410
14411
14412
14413
14414
14415
14416
14417
14418
14419
14420
14421
14422
14423
14424
14425
14426
14427
14428
14429
14430
14431
14432
14433
14434
14435
14436
14437
14438
14439
14440
14441
14442
14443
14444
14445
14446
14447
14448
14449
14450
14451
14452
14453
14454
14455
14456
14457
14458
14459
14460
14461
14462
14463
14464
14465
14466
14467
14468
14469
14470
14471
14472
14473
14474
14475
14476
14477
14478
14479
14480
14481
14482
14483
14484
14485
14486
14487
14488
14489
14490
14491
14492
14493
14494
14495
14496
14497
14498
14499
14500
14501
14502
14503
14504
14505
14506
14507
14508
14509
14510
14511
14512
14513
14514
14515
14516
14517
14518
14519
14520
14521
14522
14523
14524
14525
14526
14527
14528
14529
14530
14531
14532
14533
14534
14535
14536
14537
14538
14539
14540
14541
14542
14543
14544
14545
14546
14547
14548
14549
14550
14551
14552
14553
14554
14555
14556
14557
14558
14559
14560
14561
14562
14563
14564
14565
14566
14567
14568
14569
14570
14571
14572
14573
14574
14575
14576
14577
14578
14579
14580
14581
14582
14583
14584
14585
14586
14587
14588
14589
14590
14591
14592
14593
14594
14595
14596
14597
14598
14599
14600
14601
14602
14603
14604
14605
14606
14607
14608
14609
14610
14611
14612
14613
14614
14615
14616
14617
14618
14619
14620
14621
14622
14623
14624
14625
14626
14627
14628
14629
14630
14631
14632
14633
14634
14635
14636
14637
14638
14639
14640
14641
14642
14643
14644
14645
14646
14647
14648
14649
14650
14651
14652
14653
14654
14655
14656
14657
14658
14659
14660
14661
14662
14663
14664
14665
14666
14667
14668
14669
14670
14671
14672
14673
14674
14675
14676
14677
14678
14679
14680
14681
14682
14683
14684
14685
14686
14687
14688
14689
14690
14691
14692
14693
14694
14695
14696
14697
14698
14699
14700
14701
14702
14703
14704
14705
14706
14707
14708
14709
14710
14711
14712
14713
14714
14715
14716
14717
14718
14719
14720
14721
14722
14723
14724
14725
14726
14727
14728
14729
14730
14731
14732
14733
14734
14735
14736
14737
14738
14739
14740
14741
14742
14743
14744
14745
14746
14747
14748
14749
14750
14751
14752
14753
14754
14755
14756
14757
14758
14759
14760
14761
14762
14763
14764
14765
14766
14767
14768
14769
14770
14771
14772
14773
14774
14775
14776
14777
14778
14779
14780
14781
14782
14783
14784
14785
14786
14787
14788
14789
14790
14791
14792
14793
14794
14795
14796
14797
14798
14799
14800
14801
14802
14803
14804
14805
14806
14807
14808
14809
14810
14811
14812
14813
14814
14815
14816
14817
14818
14819
14820
14821
14822
14823
14824
14825
14826
14827
14828
14829
14830
14831
14832
14833
14834
14835
14836
14837
14838
14839
14840
14841
14842
14843
14844
14845
14846
14847
14848
14849
14850
14851
14852
14853
14854
14855
14856
14857
14858
14859
14860
14861
14862
14863
14864
14865
14866
14867
14868
14869
14870
14871
14872
14873
14874
14875
14876
14877
14878
14879
14880
14881
14882
14883
14884
14885
14886
14887
14888
14889
14890
14891
14892
14893
14894
14895
14896
14897
14898
14899
14900
14901
14902
14903
14904
14905
14906
14907
14908
14909
14910
14911
14912
14913
14914
14915
14916
14917
14918
14919
14920
14921
14922
14923
14924
14925
14926
14927
14928
14929
14930
14931
14932
14933
14934
14935
14936
14937
14938
14939
14940
14941
14942
14943
14944
14945
14946
14947
14948
14949
14950
14951
14952
14953
14954
14955
14956
14957
14958
14959
14960
14961
14962
14963
14964
14965
14966
14967
14968
14969
14970
14971
14972
14973
14974
14975
14976
14977
14978
14979
14980
14981
14982
14983
14984
14985
14986
14987
14988
14989
14990
14991
14992
14993
14994
14995
14996
14997
14998
14999
15000
15001
15002
15003
15004
15005
15006
15007
15008
15009
15010
15011
15012
15013
15014
15015
15016
15017
15018
15019
15020
15021
15022
15023
15024
15025
15026
15027
15028
15029
15030
15031
15032
15033
15034
15035
15036
15037
15038
15039
15040
15041
15042
15043
15044
15045
15046
15047
15048
15049
15050
15051
15052
15053
15054
15055
15056
15057
15058
15059
15060
15061
15062
15063
15064
15065
15066
15067
15068
15069
15070
15071
15072
15073
15074
15075
15076
15077
15078
15079
15080
15081
15082
15083
15084
15085
15086
15087
15088
15089
15090
15091
15092
15093
15094
15095
15096
15097
15098
15099
15100
15101
15102
15103
15104
15105
15106
15107
15108
15109
15110
15111
15112
15113
15114
15115
15116
15117
15118
15119
15120
15121
15122
15123
15124
15125
15126
15127
15128
15129
15130
15131
15132
15133
15134
15135
15136
15137
15138
15139
15140
15141
15142
15143
15144
15145
15146
15147
15148
15149
15150
15151
15152
15153
15154
15155
15156
15157
15158
15159
15160
15161
15162
15163
15164
15165
15166
15167
15168
15169
15170
15171
15172
15173
15174
15175
15176
15177
15178
15179
15180
15181
15182
15183
15184
15185
15186
15187
15188
15189
15190
15191
15192
15193
15194
15195
15196
15197
15198
15199
15200
15201
15202
15203
15204
15205
15206
15207
15208
15209
15210
15211
15212
15213
15214
15215
15216
15217
15218
15219
15220
15221
15222
15223
15224
15225
15226
15227
15228
15229
15230
15231
15232
15233
15234
15235
15236
15237
15238
15239
15240
15241
15242
15243
15244
15245
15246
15247
15248
15249
15250
15251
15252
15253
15254
15255
15256
15257
15258
15259
15260
15261
15262
15263
15264
15265
15266
15267
15268
15269
15270
15271
15272
15273
15274
15275
15276
15277
15278
15279
15280
15281
15282
15283
15284
15285
15286
15287
15288
15289
15290
15291
15292
15293
15294
15295
15296
15297
15298
15299
15300
15301
15302
15303
15304
15305
15306
15307
15308
15309
15310
15311
15312
15313
15314
15315
15316
15317
15318
15319
15320
15321
15322
15323
15324
15325
15326
15327
15328
15329
15330
15331
15332
15333
15334
15335
15336
15337
15338
15339
15340
15341
15342
15343
15344
15345
15346
15347
15348
15349
15350
15351
15352
15353
15354
15355
15356
15357
15358
15359
15360
15361
15362
15363
15364
15365
15366
15367
15368
15369
15370
15371
15372
15373
15374
15375
15376
15377
15378
15379
15380
15381
15382
15383
15384
15385
15386
15387
15388
15389
15390
15391
15392
15393
15394
15395
15396
15397
15398
15399
15400
15401
15402
15403
15404
15405
15406
15407
15408
15409
15410
15411
15412
15413
15414
15415
15416
15417
15418
15419
15420
15421
15422
15423
15424
15425
15426
15427
15428
15429
15430
15431
15432
15433
15434
15435
15436
15437
15438
15439
15440
15441
15442
15443
15444
15445
15446
15447
15448
15449
15450
15451
15452
15453
15454
15455
15456
15457
15458
15459
15460
15461
15462
15463
15464
15465
15466
15467
15468
15469
15470
15471
15472
15473
15474
15475
15476
15477
15478
15479
15480
15481
15482
15483
15484
15485
15486
15487
15488
15489
15490
15491
15492
15493
15494
15495
15496
15497
15498
15499
15500
15501
15502
15503
15504
15505
15506
15507
15508
15509
15510
15511
15512
15513
15514
15515
15516
15517
15518
15519
15520
15521
15522
15523
15524
15525
15526
15527
15528
15529
15530
15531
15532
15533
15534
15535
15536
15537
15538
15539
15540
15541
15542
15543
15544
15545
15546
15547
15548
15549
15550
15551
15552
15553
15554
15555
15556
15557
15558
15559
15560
15561
15562
15563
15564
15565
15566
15567
15568
15569
15570
15571
15572
15573
15574
15575
15576
15577
15578
15579
15580
15581
15582
15583
15584
15585
15586
15587
15588
15589
15590
15591
15592
15593
15594
15595
15596
15597
15598
15599
15600
15601
15602
15603
15604
15605
15606
15607
15608
15609
15610
15611
15612
15613
15614
15615
15616
15617
15618
15619
15620
15621
15622
15623
15624
15625
15626
15627
15628
15629
15630
15631
15632
15633
15634
15635
15636
15637
15638
15639
15640
15641
15642
15643
15644
15645
15646
15647
15648
15649
15650
15651
15652
15653
15654
15655
15656
15657
15658
15659
15660
15661
15662
15663
15664
15665
15666
15667
15668
15669
15670
15671
15672
15673
15674
15675
15676
15677
15678
15679
15680
15681
15682
15683
15684
15685
15686
15687
15688
15689
15690
15691
15692
15693
15694
15695
15696
15697
15698
15699
15700
15701
15702
15703
15704
15705
15706
15707
15708
15709
15710
15711
15712
15713
15714
15715
15716
15717
15718
15719
15720
15721
15722
15723
15724
15725
15726
15727
15728
15729
15730
15731
15732
15733
15734
15735
15736
15737
15738
15739
15740
15741
15742
15743
15744
15745
15746
15747
15748
15749
15750
15751
15752
15753
15754
15755
15756
15757
15758
15759
15760
15761
15762
15763
15764
15765
15766
15767
15768
15769
15770
15771
15772
15773
15774
15775
15776
15777
15778
15779
15780
15781
15782
15783
15784
15785
15786
15787
15788
15789
15790
15791
15792
15793
15794
15795
15796
15797
15798
15799
15800
15801
15802
15803
15804
15805
15806
15807
15808
15809
15810
15811
15812
15813
15814
15815
15816
15817
15818
15819
15820
15821
15822
15823
15824
15825
15826
15827
15828
15829
15830
15831
15832
15833
15834
15835
15836
15837
15838
15839
15840
15841
15842
15843
15844
15845
15846
15847
15848
15849
15850
15851
15852
15853
15854
15855
15856
15857
15858
15859
15860
15861
15862
15863
15864
15865
15866
15867
15868
15869
15870
15871
15872
15873
15874
15875
15876
15877
15878
15879
15880
15881
15882
15883
15884
15885
15886
15887
15888
15889
15890
15891
15892
15893
15894
15895
15896
15897
15898
15899
15900
15901
15902
15903
15904
15905
15906
15907
15908
15909
15910
15911
15912
15913
15914
15915
15916
15917
15918
15919
15920
15921
15922
15923
15924
15925
15926
15927
15928
15929
15930
15931
15932
15933
15934
15935
15936
15937
15938
15939
15940
15941
15942
15943
15944
15945
15946
15947
15948
15949
15950
15951
15952
15953
15954
15955
15956
15957
15958
15959
15960
15961
15962
15963
15964
15965
15966
15967
15968
15969
15970
15971
15972
15973
15974
15975
15976
15977
15978
15979
15980
15981
15982
15983
15984
15985
15986
15987
15988
15989
15990
15991
15992
15993
15994
15995
15996
15997
15998
15999
16000
16001
16002
16003
16004
16005
16006
16007
16008
16009
16010
16011
16012
16013
16014
16015
16016
16017
16018
16019
16020
16021
16022
16023
16024
16025
16026
16027
16028
16029
16030
16031
16032
16033
16034
16035
16036
16037
16038
16039
16040
16041
16042
16043
16044
16045
16046
16047
16048
16049
16050
16051
16052
16053
16054
16055
16056
16057
16058
16059
16060
16061
16062
16063
16064
16065
16066
16067
16068
16069
16070
16071
16072
16073
16074
16075
16076
16077
16078
16079
16080
16081
16082
16083
16084
16085
16086
16087
16088
16089
16090
16091
16092
16093
16094
16095
16096
16097
16098
16099
16100
16101
16102
16103
16104
16105
16106
16107
16108
16109
16110
16111
16112
16113
16114
16115
16116
16117
16118
16119
16120
16121
16122
16123
16124
16125
16126
16127
16128
16129
16130
16131
16132
16133
16134
16135
16136
16137
16138
16139
16140
16141
16142
16143
16144
16145
16146
16147
16148
16149
16150
16151
16152
16153
16154
16155
16156
16157
16158
16159
16160
16161
16162
16163
16164
16165
16166
16167
16168
16169
16170
16171
16172
16173
16174
16175
16176
16177
16178
16179
16180
16181
16182
16183
16184
16185
16186
16187
16188
16189
16190
16191
16192
16193
16194
16195
16196
16197
16198
16199
16200
16201
16202
16203
16204
16205
16206
16207
16208
16209
16210
16211
16212
16213
16214
16215
16216
16217
16218
16219
16220
16221
16222
16223
16224
16225
16226
16227
16228
16229
16230
16231
16232
16233
16234
16235
16236
16237
16238
16239
16240
16241
16242
16243
16244
16245
16246
16247
16248
16249
16250
16251
16252
16253
16254
16255
16256
16257
16258
16259
16260
16261
16262
16263
16264
16265
16266
16267
16268
16269
16270
16271
16272
16273
16274
16275
16276
16277
16278
16279
16280
16281
16282
16283
16284
16285
16286
16287
16288
16289
16290
16291
16292
16293
16294
16295
16296
16297
16298
16299
16300
16301
16302
16303
16304
16305
16306
16307
16308
16309
16310
16311
16312
16313
16314
16315
16316
16317
16318
16319
16320
16321
16322
16323
16324
16325
16326
16327
16328
16329
16330
16331
16332
16333
16334
16335
16336
16337
16338
16339
16340
16341
16342
16343
16344
16345
16346
16347
16348
16349
16350
16351
16352
16353
16354
16355
16356
16357
16358
16359
16360
16361
16362
16363
16364
16365
16366
16367
16368
16369
16370
16371
16372
16373
16374
16375
16376
16377
16378
16379
16380
16381
16382
16383
16384
16385
16386
16387
16388
16389
16390
16391
16392
16393
16394
16395
16396
16397
16398
16399
16400
16401
16402
16403
16404
16405
16406
16407
16408
16409
16410
16411
16412
16413
16414
16415
16416
16417
16418
16419
16420
16421
16422
16423
16424
16425
16426
16427
16428
16429
16430
16431
16432
16433
16434
16435
16436
16437
16438
16439
16440
16441
16442
16443
16444
16445
16446
16447
16448
16449
16450
16451
16452
16453
16454
16455
16456
16457
16458
16459
16460
16461
16462
16463
16464
16465
16466
16467
16468
16469
16470
16471
16472
16473
16474
16475
16476
16477
16478
16479
16480
16481
16482
16483
16484
16485
16486
16487
16488
16489
16490
16491
16492
16493
16494
16495
16496
16497
16498
16499
16500
16501
16502
16503
16504
16505
16506
16507
16508
16509
16510
16511
16512
16513
16514
16515
16516
16517
16518
16519
16520
16521
16522
16523
16524
16525
16526
16527
16528
16529
16530
16531
16532
16533
16534
16535
16536
16537
16538
16539
16540
16541
16542
16543
16544
16545
16546
16547
16548
16549
16550
16551
16552
16553
16554
16555
16556
16557
16558
16559
16560
16561
16562
16563
16564
16565
16566
16567
16568
16569
16570
16571
16572
16573
16574
16575
16576
16577
16578
16579
16580
16581
16582
16583
16584
16585
16586
16587
16588
16589
16590
16591
16592
16593
16594
16595
16596
16597
16598
16599
16600
16601
16602
16603
16604
16605
16606
16607
16608
16609
16610
16611
16612
16613
16614
16615
16616
16617
16618
16619
16620
16621
16622
16623
16624
16625
16626
16627
16628
16629
16630
16631
16632
16633
16634
16635
16636
16637
16638
16639
16640
16641
16642
16643
16644
16645
16646
16647
16648
16649
16650
16651
16652
16653
16654
16655
16656
16657
16658
16659
16660
16661
16662
16663
16664
16665
16666
16667
16668
16669
16670
16671
16672
16673
16674
16675
16676
16677
16678
16679
16680
16681
16682
16683
16684
16685
16686
16687
16688
16689
16690
16691
16692
16693
16694
16695
16696
16697
16698
16699
16700
16701
16702
16703
16704
16705
16706
16707
16708
16709
16710
16711
16712
16713
16714
16715
16716
16717
16718
16719
16720
16721
16722
16723
16724
16725
16726
16727
16728
16729
16730
16731
16732
16733
16734
16735
16736
16737
16738
16739
16740
16741
16742
16743
16744
16745
16746
16747
16748
16749
16750
16751
16752
16753
16754
16755
16756
16757
16758
16759
16760
16761
16762
16763
16764
16765
16766
16767
16768
16769
16770
16771
16772
16773
16774
16775
16776
16777
16778
16779
16780
16781
16782
16783
16784
16785
16786
16787
16788
16789
16790
16791
16792
16793
16794
16795
16796
16797
16798
16799
16800
16801
16802
16803
16804
16805
16806
16807
16808
16809
16810
16811
16812
16813
16814
16815
16816
16817
16818
16819
16820
16821
16822
16823
16824
16825
16826
16827
16828
16829
16830
16831
16832
16833
16834
16835
16836
16837
16838
16839
16840
16841
16842
16843
16844
16845
16846
16847
16848
16849
16850
16851
16852
16853
16854
16855
16856
16857
16858
16859
16860
16861
16862
16863
16864
16865
16866
16867
16868
16869
16870
16871
16872
16873
16874
16875
16876
16877
16878
16879
16880
16881
16882
16883
16884
16885
16886
16887
16888
16889
16890
16891
16892
16893
16894
16895
16896
16897
16898
16899
16900
16901
16902
16903
16904
16905
16906
16907
16908
16909
16910
16911
16912
16913
16914
16915
16916
16917
16918
16919
16920
16921
16922
16923
16924
16925
16926
16927
16928
16929
16930
16931
16932
16933
16934
16935
16936
16937
16938
16939
16940
16941
16942
16943
16944
16945
16946
16947
16948
16949
16950
16951
16952
16953
16954
16955
16956
16957
16958
16959
16960
16961
16962
16963
16964
16965
16966
16967
16968
16969
16970
16971
16972
16973
16974
16975
16976
16977
16978
16979
16980
16981
16982
16983
16984
16985
16986
16987
16988
16989
16990
16991
16992
16993
16994
16995
16996
16997
16998
16999
17000
17001
17002
17003
17004
17005
17006
17007
17008
17009
17010
17011
17012
17013
17014
17015
17016
17017
17018
17019
17020
17021
17022
17023
17024
17025
17026
17027
17028
17029
17030
17031
17032
17033
17034
17035
17036
17037
17038
17039
17040
17041
17042
17043
17044
17045
17046
17047
17048
17049
17050
17051
17052
17053
17054
17055
17056
17057
17058
17059
17060
17061
17062
17063
17064
17065
17066
17067
17068
17069
17070
17071
17072
17073
17074
17075
17076
17077
17078
17079
17080
17081
17082
17083
17084
17085
17086
17087
17088
17089
17090
17091
17092
17093
17094
17095
17096
17097
17098
17099
17100
17101
17102
17103
17104
17105
17106
17107
17108
17109
17110
17111
17112
17113
17114
17115
17116
17117
17118
17119
17120
17121
17122
17123
17124
17125
17126
17127
17128
17129
17130
17131
17132
17133
17134
17135
17136
17137
17138
17139
17140
17141
17142
17143
17144
17145
17146
17147
17148
17149
17150
17151
17152
17153
17154
17155
17156
17157
17158
17159
17160
17161
17162
17163
17164
17165
17166
17167
17168
17169
17170
17171
17172
17173
17174
17175
17176
17177
17178
17179
17180
17181
17182
17183
17184
17185
17186
17187
17188
17189
17190
17191
17192
17193
17194
17195
17196
17197
17198
17199
17200
17201
17202
17203
17204
17205
17206
17207
17208
17209
17210
17211
17212
17213
17214
17215
17216
17217
17218
17219
17220
17221
17222
17223
17224
17225
17226
17227
17228
17229
17230
17231
17232
17233
17234
17235
17236
17237
17238
17239
17240
17241
17242
17243
17244
17245
17246
17247
17248
17249
17250
17251
17252
17253
17254
17255
17256
17257
17258
17259
17260
17261
17262
17263
17264
17265
17266
17267
17268
17269
17270
17271
17272
17273
17274
17275
17276
17277
17278
17279
17280
17281
17282
17283
17284
17285
17286
17287
17288
17289
17290
17291
17292
17293
17294
17295
17296
17297
17298
17299
17300
17301
17302
17303
17304
17305
17306
17307
17308
17309
17310
17311
17312
17313
17314
17315
17316
17317
17318
17319
17320
17321
17322
17323
17324
17325
17326
17327
17328
17329
17330
17331
17332
17333
17334
17335
17336
17337
17338
17339
17340
17341
17342
17343
17344
17345
17346
17347
17348
17349
17350
17351
17352
17353
17354
17355
17356
17357
17358
17359
17360
17361
17362
17363
17364
17365
17366
17367
17368
17369
17370
17371
17372
17373
17374
17375
17376
17377
17378
17379
17380
17381
17382
17383
17384
17385
17386
17387
17388
17389
17390
17391
17392
17393
17394
17395
17396
17397
17398
17399
17400
17401
17402
17403
17404
17405
17406
17407
17408
17409
17410
17411
17412
17413
17414
17415
17416
17417
17418
17419
17420
17421
17422
17423
17424
17425
17426
17427
17428
17429
17430
17431
17432
17433
17434
17435
17436
17437
17438
17439
17440
17441
17442
17443
17444
17445
17446
17447
17448
17449
17450
17451
17452
17453
17454
17455
17456
17457
17458
17459
17460
17461
17462
17463
17464
17465
17466
17467
17468
17469
17470
17471
17472
17473
17474
17475
17476
17477
17478
17479
17480
17481
17482
17483
17484
17485
17486
17487
17488
17489
17490
17491
17492
17493
17494
17495
17496
17497
17498
17499
17500
17501
17502
17503
17504
17505
17506
17507
17508
17509
17510
17511
17512
17513
17514
17515
17516
17517
17518
17519
17520
17521
17522
17523
17524
17525
17526
17527
17528
17529
17530
17531
17532
17533
17534
17535
17536
17537
17538
17539
17540
17541
17542
17543
17544
17545
17546
17547
17548
17549
17550
17551
17552
17553
17554
17555
17556
17557
17558
17559
17560
17561
17562
17563
17564
17565
17566
17567
17568
17569
17570
17571
17572
17573
17574
17575
17576
17577
17578
17579
17580
17581
17582
17583
17584
17585
17586
17587
17588
17589
17590
17591
17592
17593
17594
17595
17596
17597
17598
17599
17600
17601
17602
17603
17604
17605
17606
17607
17608
17609
17610
17611
17612
17613
17614
17615
17616
17617
17618
17619
17620
17621
17622
17623
17624
17625
17626
17627
17628
17629
17630
17631
17632
17633
17634
17635
17636
17637
17638
17639
17640
17641
17642
17643
17644
17645
17646
17647
17648
17649
17650
17651
17652
17653
17654
17655
17656
17657
17658
17659
17660
17661
17662
17663
17664
17665
17666
17667
17668
17669
17670
17671
17672
17673
17674
17675
17676
17677
17678
17679
17680
17681
17682
17683
17684
17685
17686
17687
17688
17689
17690
17691
17692
17693
17694
17695
17696
17697
17698
17699
17700
17701
17702
17703
17704
17705
17706
17707
17708
17709
17710
17711
17712
17713
17714
17715
17716
17717
17718
17719
17720
17721
17722
17723
17724
17725
17726
17727
17728
17729
17730
17731
17732
17733
17734
17735
17736
17737
17738
17739
17740
17741
17742
17743
17744
17745
17746
17747
17748
17749
17750
17751
17752
17753
17754
17755
17756
17757
17758
17759
17760
17761
17762
17763
17764
17765
17766
17767
17768
17769
17770
17771
17772
17773
17774
17775
17776
17777
17778
17779
17780
17781
17782
17783
17784
17785
17786
17787
17788
17789
17790
17791
17792
17793
17794
17795
17796
17797
17798
17799
17800
17801
17802
17803
17804
17805
17806
17807
17808
17809
17810
17811
17812
17813
17814
17815
17816
17817
17818
17819
17820
17821
17822
17823
17824
17825
17826
17827
17828
17829
17830
17831
17832
17833
17834
17835
17836
17837
17838
17839
17840
17841
17842
17843
17844
17845
17846
17847
17848
17849
17850
17851
17852
17853
17854
17855
17856
17857
17858
17859
17860
17861
17862
17863
17864
17865
17866
17867
17868
17869
17870
17871
17872
17873
17874
17875
17876
17877
17878
17879
17880
17881
17882
17883
17884
17885
17886
17887
17888
17889
17890
17891
17892
17893
17894
17895
17896
17897
17898
17899
17900
17901
17902
17903
17904
17905
17906
17907
17908
17909
17910
17911
17912
17913
17914
17915
17916
17917
17918
17919
17920
17921
17922
17923
17924
17925
17926
17927
17928
17929
17930
17931
17932
17933
17934
17935
17936
17937
17938
17939
17940
17941
17942
17943
17944
17945
17946
17947
17948
17949
17950
17951
17952
17953
17954
17955
17956
17957
17958
17959
17960
17961
17962
17963
17964
17965
17966
17967
17968
17969
17970
17971
17972
17973
17974
17975
17976
17977
17978
17979
17980
17981
17982
17983
17984
17985
17986
17987
17988
17989
17990
17991
17992
17993
17994
17995
17996
17997
17998
17999
18000
18001
18002
18003
18004
18005
18006
18007
18008
18009
18010
18011
18012
18013
18014
18015
18016
18017
18018
18019
18020
18021
18022
18023
18024
18025
18026
18027
18028
18029
18030
18031
18032
18033
18034
18035
18036
18037
18038
18039
18040
18041
18042
18043
18044
18045
18046
18047
18048
18049
18050
18051
18052
18053
18054
18055
18056
18057
18058
18059
18060
18061
18062
18063
18064
18065
18066
18067
18068
18069
18070
18071
18072
18073
18074
18075
18076
18077
18078
18079
18080
18081
18082
18083
18084
18085
18086
18087
18088
18089
18090
18091
18092
18093
18094
18095
18096
18097
18098
18099
18100
18101
18102
18103
18104
18105
18106
18107
18108
18109
18110
18111
18112
18113
18114
18115
18116
18117
18118
18119
18120
18121
18122
18123
18124
18125
18126
18127
18128
18129
18130
18131
18132
18133
18134
18135
18136
18137
18138
18139
18140
18141
18142
18143
18144
18145
18146
18147
18148
18149
18150
18151
18152
18153
18154
18155
18156
18157
18158
18159
18160
18161
18162
18163
18164
18165
18166
18167
18168
18169
18170
18171
18172
18173
18174
18175
18176
18177
18178
18179
18180
18181
18182
18183
18184
18185
18186
18187
18188
18189
18190
18191
18192
18193
18194
18195
18196
18197
18198
18199
18200
18201
18202
18203
18204
18205
18206
18207
18208
18209
18210
18211
18212
18213
18214
18215
18216
18217
18218
18219
18220
18221
18222
18223
18224
18225
18226
18227
18228
18229
18230
18231
18232
18233
18234
18235
18236
18237
18238
18239
18240
18241
18242
18243
18244
18245
18246
18247
18248
18249
18250
18251
18252
18253
18254
18255
18256
18257
18258
18259
18260
18261
18262
18263
18264
18265
18266
18267
18268
18269
18270
18271
18272
18273
18274
18275
18276
18277
18278
18279
18280
18281
18282
18283
18284
18285
18286
18287
18288
18289
18290
18291
18292
18293
18294
18295
18296
18297
18298
18299
18300
18301
18302
18303
18304
18305
18306
18307
18308
18309
18310
18311
18312
18313
18314
18315
18316
18317
18318
18319
18320
18321
18322
18323
18324
18325
18326
18327
18328
18329
18330
18331
18332
18333
18334
18335
18336
18337
18338
18339
18340
18341
18342
18343
18344
18345
18346
18347
18348
18349
18350
18351
18352
18353
18354
18355
18356
18357
18358
18359
18360
18361
18362
18363
18364
18365
18366
18367
18368
18369
18370
18371
18372
18373
18374
18375
18376
18377
18378
18379
18380
18381
18382
18383
18384
18385
18386
18387
18388
18389
18390
18391
18392
18393
18394
18395
18396
18397
18398
18399
18400
18401
18402
18403
18404
18405
18406
18407
18408
18409
18410
18411
18412
18413
18414
18415
18416
18417
18418
18419
18420
18421
18422
18423
18424
18425
18426
18427
18428
18429
18430
18431
18432
18433
18434
18435
18436
18437
18438
18439
18440
18441
18442
18443
18444
18445
18446
18447
18448
18449
18450
18451
18452
18453
18454
18455
18456
18457
18458
18459
18460
18461
18462
18463
18464
18465
18466
18467
18468
18469
18470
18471
18472
18473
18474
18475
18476
18477
18478
18479
18480
18481
18482
18483
18484
18485
18486
18487
18488
18489
18490
18491
18492
18493
18494
18495
18496
18497
18498
18499
18500
18501
18502
18503
18504
18505
18506
18507
18508
18509
18510
18511
18512
18513
18514
18515
18516
18517
18518
18519
18520
18521
18522
18523
18524
18525
18526
18527
18528
18529
18530
18531
18532
18533
18534
18535
18536
18537
18538
18539
18540
18541
18542
18543
18544
18545
18546
18547
18548
18549
18550
18551
18552
18553
18554
18555
18556
18557
18558
18559
18560
18561
18562
18563
18564
18565
18566
18567
18568
18569
18570
18571
18572
18573
18574
18575
18576
18577
18578
18579
18580
18581
18582
18583
18584
18585
18586
18587
18588
18589
18590
18591
18592
18593
18594
18595
18596
18597
18598
18599
18600
18601
18602
18603
18604
18605
18606
18607
18608
18609
18610
18611
18612
18613
18614
18615
18616
18617
18618
18619
18620
18621
18622
18623
18624
18625
18626
18627
18628
18629
18630
18631
18632
18633
18634
18635
18636
18637
18638
18639
18640
18641
18642
18643
18644
18645
18646
18647
18648
18649
18650
18651
18652
18653
18654
18655
18656
18657
18658
18659
18660
18661
18662
18663
18664
18665
18666
18667
18668
18669
18670
18671
18672
18673
18674
18675
18676
18677
18678
18679
18680
18681
18682
18683
18684
18685
18686
18687
18688
18689
18690
18691
18692
18693
18694
18695
18696
18697
18698
18699
18700
18701
18702
18703
18704
18705
18706
18707
18708
18709
18710
18711
18712
18713
18714
18715
18716
18717
18718
18719
18720
18721
18722
18723
18724
18725
18726
18727
18728
18729
18730
18731
18732
18733
18734
18735
18736
18737
18738
18739
18740
18741
18742
18743
18744
18745
18746
18747
18748
18749
18750
18751
18752
18753
18754
18755
18756
18757
18758
18759
18760
18761
18762
18763
18764
18765
18766
18767
18768
18769
18770
18771
18772
18773
18774
18775
18776
18777
18778
18779
18780
18781
18782
18783
18784
18785
18786
18787
18788
18789
18790
18791
18792
18793
18794
18795
18796
18797
18798
18799
18800
18801
18802
18803
18804
18805
18806
18807
18808
18809
18810
18811
18812
18813
18814
18815
18816
18817
18818
18819
18820
18821
18822
18823
18824
18825
18826
18827
18828
18829
18830
18831
18832
18833
18834
18835
18836
18837
18838
18839
18840
18841
18842
18843
18844
18845
18846
18847
18848
18849
18850
18851
18852
18853
18854
18855
18856
18857
18858
18859
18860
18861
18862
18863
18864
18865
18866
18867
18868
18869
18870
18871
18872
18873
18874
18875
18876
18877
18878
18879
18880
18881
18882
18883
18884
18885
18886
18887
18888
18889
18890
18891
18892
18893
18894
18895
18896
18897
18898
18899
18900
18901
18902
18903
18904
18905
18906
18907
18908
18909
18910
18911
18912
18913
18914
18915
18916
18917
18918
18919
18920
18921
18922
18923
18924
18925
18926
18927
18928
18929
18930
18931
18932
18933
18934
18935
18936
18937
18938
18939
18940
18941
18942
18943
18944
18945
18946
18947
18948
18949
18950
18951
18952
18953
18954
18955
18956
18957
18958
18959
18960
18961
18962
18963
18964
18965
18966
18967
18968
18969
18970
18971
18972
18973
18974
18975
18976
18977
18978
18979
18980
18981
18982
18983
18984
18985
18986
18987
18988
18989
18990
18991
18992
18993
18994
18995
18996
18997
18998
18999
19000
19001
19002
19003
19004
19005
19006
19007
19008
19009
19010
19011
19012
19013
19014
19015
19016
19017
19018
19019
19020
19021
19022
19023
19024
19025
19026
19027
19028
19029
19030
19031
19032
19033
19034
19035
19036
19037
19038
19039
19040
19041
19042
19043
19044
19045
19046
19047
19048
19049
19050
19051
19052
19053
19054
19055
19056
19057
19058
19059
19060
19061
19062
19063
19064
19065
19066
19067
19068
19069
19070
19071
19072
19073
19074
19075
19076
19077
19078
19079
19080
19081
19082
19083
19084
19085
19086
19087
19088
19089
19090
19091
19092
19093
19094
19095
19096
19097
19098
19099
19100
19101
19102
19103
19104
19105
19106
19107
19108
19109
19110
19111
19112
19113
19114
19115
19116
19117
19118
19119
19120
19121
19122
19123
19124
19125
19126
19127
19128
19129
19130
19131
19132
19133
19134
19135
19136
19137
19138
19139
19140
19141
19142
19143
19144
19145
19146
19147
19148
19149
19150
19151
19152
19153
19154
19155
19156
19157
19158
19159
19160
19161
19162
19163
19164
19165
19166
19167
19168
19169
19170
19171
19172
19173
19174
19175
19176
19177
19178
19179
19180
19181
19182
19183
19184
19185
19186
19187
19188
19189
19190
19191
19192
19193
19194
19195
19196
19197
19198
19199
19200
19201
19202
19203
19204
19205
19206
19207
19208
19209
19210
19211
19212
19213
19214
19215
19216
19217
19218
19219
19220
19221
19222
19223
19224
19225
19226
19227
19228
19229
19230
19231
19232
19233
19234
19235
19236
19237
19238
19239
19240
19241
19242
19243
19244
19245
19246
19247
19248
19249
19250
19251
19252
19253
19254
19255
19256
19257
19258
19259
19260
19261
19262
19263
19264
19265
19266
19267
19268
19269
19270
19271
19272
19273
19274
19275
19276
19277
19278
19279
19280
19281
19282
19283
19284
19285
19286
19287
19288
19289
19290
19291
19292
19293
19294
19295
19296
19297
19298
19299
19300
19301
19302
19303
19304
19305
19306
19307
19308
19309
19310
19311
19312
19313
19314
19315
19316
19317
19318
19319
19320
19321
19322
19323
19324
19325
19326
19327
19328
19329
19330
19331
19332
19333
19334
19335
19336
19337
19338
19339
19340
19341
19342
19343
19344
19345
19346
19347
19348
19349
19350
19351
19352
19353
19354
19355
19356
19357
19358
19359
19360
19361
19362
19363
19364
19365
19366
19367
19368
19369
19370
19371
19372
19373
19374
19375
19376
19377
19378
19379
19380
19381
19382
19383
19384
19385
19386
19387
19388
19389
19390
19391
19392
19393
19394
19395
19396
19397
19398
19399
19400
19401
19402
19403
19404
19405
19406
19407
19408
19409
19410
19411
19412
19413
19414
19415
19416
19417
19418
19419
19420
19421
19422
19423
19424
19425
19426
19427
19428
19429
19430
19431
19432
19433
19434
19435
19436
19437
19438
19439
19440
19441
19442
19443
19444
19445
19446
19447
19448
19449
19450
19451
19452
19453
19454
19455
19456
19457
19458
19459
19460
19461
19462
19463
19464
19465
19466
19467
19468
19469
19470
19471
19472
19473
19474
19475
19476
19477
19478
19479
19480
19481
19482
19483
19484
19485
19486
19487
19488
19489
19490
19491
19492
19493
19494
19495
19496
19497
19498
19499
19500
19501
19502
19503
19504
19505
19506
19507
19508
19509
19510
19511
19512
19513
19514
19515
19516
19517
19518
19519
19520
19521
19522
19523
19524
19525
19526
19527
19528
19529
19530
19531
19532
19533
19534
19535
19536
19537
19538
19539
19540
19541
19542
19543
19544
19545
19546
19547
19548
19549
19550
19551
19552
19553
19554
19555
19556
19557
19558
19559
19560
19561
19562
19563
19564
19565
19566
19567
19568
19569
19570
19571
19572
19573
19574
19575
19576
19577
19578
19579
19580
19581
19582
19583
19584
19585
19586
19587
19588
19589
19590
19591
19592
19593
19594
19595
19596
19597
19598
19599
19600
19601
19602
19603
19604
19605
19606
19607
19608
19609
19610
19611
19612
19613
19614
19615
19616
19617
19618
19619
19620
19621
19622
19623
19624
19625
19626
19627
19628
19629
19630
19631
19632
19633
19634
19635
19636
19637
19638
19639
19640
19641
19642
19643
19644
19645
19646
19647
19648
19649
19650
19651
19652
19653
19654
19655
19656
19657
19658
19659
19660
19661
19662
19663
19664
19665
19666
19667
19668
19669
19670
19671
19672
19673
19674
19675
19676
19677
19678
19679
19680
19681
19682
19683
19684
19685
19686
19687
19688
19689
19690
19691
19692
19693
19694
19695
19696
19697
19698
19699
19700
19701
19702
19703
19704
19705
19706
19707
19708
19709
19710
19711
19712
19713
19714
19715
19716
19717
19718
19719
19720
19721
19722
19723
19724
19725
19726
19727
19728
19729
19730
19731
19732
19733
19734
19735
19736
19737
19738
19739
19740
19741
19742
19743
19744
19745
19746
19747
19748
19749
19750
19751
19752
19753
19754
19755
19756
19757
19758
19759
19760
19761
19762
19763
19764
19765
19766
19767
19768
19769
19770
19771
19772
19773
19774
19775
19776
19777
19778
19779
19780
19781
19782
19783
19784
19785
19786
19787
19788
19789
19790
19791
19792
19793
19794
19795
19796
19797
19798
19799
19800
19801
19802
19803
19804
19805
19806
19807
19808
19809
19810
19811
19812
19813
19814
19815
19816
19817
19818
19819
19820
19821
19822
19823
19824
19825
19826
19827
19828
19829
19830
19831
19832
19833
19834
19835
19836
19837
19838
19839
19840
19841
19842
19843
19844
19845
19846
19847
19848
19849
19850
19851
19852
19853
19854
19855
19856
19857
19858
19859
19860
19861
19862
19863
19864
19865
19866
19867
19868
19869
19870
19871
19872
19873
19874
19875
19876
19877
19878
19879
19880
19881
19882
19883
19884
19885
19886
19887
19888
19889
19890
19891
19892
19893
19894
19895
19896
19897
19898
19899
19900
19901
19902
19903
19904
19905
19906
19907
19908
19909
19910
19911
19912
19913
19914
19915
19916
19917
19918
19919
19920
19921
19922
19923
19924
19925
19926
19927
19928
19929
19930
19931
19932
19933
19934
19935
19936
19937
19938
19939
19940
19941
19942
19943
19944
19945
19946
19947
19948
19949
19950
19951
19952
19953
19954
19955
19956
19957
19958
19959
19960
19961
19962
19963
19964
19965
19966
19967
19968
19969
19970
19971
19972
19973
19974
19975
19976
19977
19978
19979
19980
19981
19982
19983
19984
19985
19986
19987
19988
19989
19990
19991
19992
19993
19994
19995
19996
19997
19998
19999
20000
20001
20002
20003
20004
20005
20006
20007
20008
20009
20010
20011
20012
20013
20014
20015
20016
20017
20018
20019
20020
20021
20022
20023
20024
20025
20026
20027
20028
20029
20030
20031
20032
20033
20034
20035
20036
20037
20038
20039
20040
20041
20042
20043
20044
20045
20046
20047
20048
20049
20050
20051
20052
20053
20054
20055
20056
20057
20058
20059
20060
20061
20062
20063
20064
20065
20066
20067
20068
20069
20070
20071
20072
20073
20074
20075
20076
20077
20078
20079
20080
20081
20082
20083
20084
20085
20086
20087
20088
20089
20090
20091
20092
20093
20094
20095
20096
20097
20098
20099
20100
20101
20102
20103
20104
20105
20106
20107
20108
20109
20110
20111
20112
20113
20114
20115
20116
20117
20118
20119
20120
20121
20122
20123
20124
20125
20126
20127
20128
20129
20130
20131
20132
20133
20134
20135
20136
20137
20138
20139
20140
20141
20142
20143
20144
20145
20146
20147
20148
20149
20150
20151
20152
20153
20154
20155
20156
20157
20158
20159
20160
20161
20162
20163
20164
20165
20166
20167
20168
20169
20170
20171
20172
20173
20174
20175
20176
20177
20178
20179
20180
20181
20182
20183
20184
20185
20186
20187
20188
20189
20190
20191
20192
20193
20194
20195
20196
20197
20198
20199
20200
20201
20202
20203
20204
20205
20206
20207
20208
20209
20210
20211
20212
20213
20214
20215
20216
20217
20218
20219
20220
20221
20222
20223
20224
20225
20226
20227
20228
20229
20230
20231
20232
20233
20234
20235
20236
20237
20238
20239
20240
20241
20242
20243
20244
20245
20246
20247
20248
20249
20250
20251
20252
20253
20254
20255
20256
20257
20258
20259
20260
20261
20262
20263
20264
20265
20266
20267
20268
20269
20270
20271
// SPDX-License-Identifier: GPL-2.0-only
/*
 * This is the new netlink-based wireless configuration interface.
 *
 * Copyright 2006-2010        Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright 2015-2017        Intel Deutschland GmbH
 * Copyright (C) 2018-2024 Intel Corporation
 */

#include <linux/if.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/if_ether.h>
#include <linux/ieee80211.h>
#include <linux/nl80211.h>
#include <linux/rtnetlink.h>
#include <linux/netlink.h>
#include <linux/nospec.h>
#include <linux/etherdevice.h>
#include <linux/if_vlan.h>
#include <net/net_namespace.h>
#include <net/genetlink.h>
#include <net/cfg80211.h>
#include <net/sock.h>
#include <net/inet_connection_sock.h>
#include "core.h"
#include "nl80211.h"
#include "reg.h"
#include "rdev-ops.h"

/*
 * Forward declaration only; the function is static, so its definition
 * lives elsewhere in this file. It is declared here so that code placed
 * before the definition can call it.
 */
static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
                                   struct genl_info *info,
                                   struct cfg80211_crypto_settings *settings,
                                   int cipher_limit);

/*
 * the netlink family
 *
 * Declared without an initializer at this point; presumably the full
 * definition follows later in this file (not visible here) - TODO confirm.
 */
static struct genl_family nl80211_fam;

/*
 * multicast groups
 *
 * These enumerators are used as the designated indices of the
 * nl80211_mcgrps[] table, so the two must be kept in step.
 */
enum nl80211_multicast_groups {
        NL80211_MCGRP_CONFIG,
        NL80211_MCGRP_SCAN,
        NL80211_MCGRP_REGULATORY,
        NL80211_MCGRP_MLME,
        NL80211_MCGRP_VENDOR,
        NL80211_MCGRP_NAN,
        NL80211_MCGRP_TESTMODE /* keep last - ifdef! */
};

/*
 * Names of the multicast groups, indexed by enum nl80211_multicast_groups.
 *
 * The test mode entry is only compiled in with CONFIG_NL80211_TESTMODE;
 * because the array has no explicit size, its length follows the highest
 * index that is actually initialized.
 */
static const struct genl_multicast_group nl80211_mcgrps[] = {
        [NL80211_MCGRP_CONFIG] = { .name = NL80211_MULTICAST_GROUP_CONFIG },
        [NL80211_MCGRP_SCAN] = { .name = NL80211_MULTICAST_GROUP_SCAN },
        [NL80211_MCGRP_REGULATORY] = { .name = NL80211_MULTICAST_GROUP_REG },
        [NL80211_MCGRP_MLME] = { .name = NL80211_MULTICAST_GROUP_MLME },
        [NL80211_MCGRP_VENDOR] = { .name = NL80211_MULTICAST_GROUP_VENDOR },
        [NL80211_MCGRP_NAN] = { .name = NL80211_MULTICAST_GROUP_NAN },
#ifdef CONFIG_NL80211_TESTMODE
        [NL80211_MCGRP_TESTMODE] = { .name = NL80211_MULTICAST_GROUP_TESTMODE }
#endif
};

/*
 * Look up the wireless_dev named by NL80211_ATTR_IFINDEX and/or
 * NL80211_ATTR_WDEV in @attrs.
 *
 * With a non-NULL @rdev only that device's interface list is searched and
 * its wiphy mutex must be held. Otherwise the RTNL must be held and every
 * registered device in @netns is searched; if a wdev ID was supplied, only
 * the device whose wiphy index equals the upper 32 bits of that ID is
 * considered.
 *
 * Never returns NULL: the result is either a valid pointer,
 * ERR_PTR(-EINVAL) when neither attribute is present, or
 * ERR_PTR(-ENODEV) when nothing matched.
 */
static struct wireless_dev *
__cfg80211_wdev_from_attrs(struct cfg80211_registered_device *rdev,
                           struct net *netns, struct nlattr **attrs)
{
        const bool have_ifidx = attrs[NL80211_ATTR_IFINDEX];
        const bool have_wdev_id = attrs[NL80211_ATTR_WDEV];
        struct wireless_dev *found = NULL;
        struct wireless_dev *wdev;
        int wiphy_idx = -1;
        int ifidx = -1;
        u64 wdev_id = 0;

        if (!(have_ifidx || have_wdev_id))
                return ERR_PTR(-EINVAL);

        if (have_wdev_id) {
                wdev_id = nla_get_u64(attrs[NL80211_ATTR_WDEV]);
                /* upper half of the ID is the wiphy index */
                wiphy_idx = wdev_id >> 32;
        }
        if (have_ifidx)
                ifidx = nla_get_u32(attrs[NL80211_ATTR_IFINDEX]);

        if (rdev) {
                /* caller already picked the device: search only its list */
                lockdep_assert_held(&rdev->wiphy.mtx);

                list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
                        bool by_ifidx = have_ifidx && wdev->netdev &&
                                        wdev->netdev->ifindex == ifidx;
                        bool by_id = have_wdev_id &&
                                     wdev->identifier == (u32)wdev_id;

                        if (by_ifidx || by_id) {
                                found = wdev;
                                break;
                        }
                }

                if (!found)
                        return ERR_PTR(-ENODEV);
                return found;
        }

        ASSERT_RTNL();

        for_each_rdev(rdev) {
                /* skip devices in other namespaces or with another index */
                if (wiphy_net(&rdev->wiphy) != netns)
                        continue;
                if (have_wdev_id && rdev->wiphy_idx != wiphy_idx)
                        continue;

                list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
                        bool by_ifidx = have_ifidx && wdev->netdev &&
                                        wdev->netdev->ifindex == ifidx;
                        bool by_id = have_wdev_id &&
                                     wdev->identifier == (u32)wdev_id;

                        if (by_ifidx || by_id) {
                                found = wdev;
                                break;
                        }
                }

                if (found)
                        break;
        }

        if (!found)
                return ERR_PTR(-ENODEV);
        return found;
}

/*
 * Resolve the registered device referenced by @attrs. The RTNL must be held.
 *
 * NL80211_ATTR_WIPHY, NL80211_ATTR_WDEV and NL80211_ATTR_IFINDEX are each
 * consulted when present; all that resolve must agree on the same device.
 *
 * Returns the device, or
 *   ERR_PTR(-EINVAL) if none of the attributes is present, if the
 *                    attributes disagree, or if the interface index names
 *                    a non-wireless netdev;
 *   ERR_PTR(-ENODEV) if no device was found or it is not in @netns.
 */
static struct cfg80211_registered_device *
__cfg80211_rdev_from_attrs(struct net *netns, struct nlattr **attrs)
{
        struct nlattr *wiphy_attr = attrs[NL80211_ATTR_WIPHY];
        struct nlattr *wdev_attr = attrs[NL80211_ATTR_WDEV];
        struct nlattr *ifindex_attr = attrs[NL80211_ATTR_IFINDEX];
        struct cfg80211_registered_device *rdev = NULL;
        struct cfg80211_registered_device *other;

        ASSERT_RTNL();

        if (!(wiphy_attr || ifindex_attr || wdev_attr))
                return ERR_PTR(-EINVAL);

        if (wiphy_attr)
                rdev = cfg80211_rdev_by_wiphy_idx(nla_get_u32(wiphy_attr));

        if (wdev_attr) {
                u64 wdev_id = nla_get_u64(wdev_attr);

                /* upper 32 bits of the wdev ID select the wiphy */
                other = cfg80211_rdev_by_wiphy_idx(wdev_id >> 32);
                if (other) {
                        struct wireless_dev *wdev;
                        bool found = false;

                        /* the wdev itself must exist on that wiphy */
                        list_for_each_entry(wdev, &other->wiphy.wdev_list,
                                            list) {
                                if (wdev->identifier == (u32)wdev_id) {
                                        found = true;
                                        break;
                                }
                        }

                        if (!found)
                                other = NULL;

                        if (rdev && other != rdev)
                                return ERR_PTR(-EINVAL);
                        rdev = other;
                }
        }

        if (ifindex_attr) {
                int ifindex = nla_get_u32(ifindex_attr);
                struct net_device *netdev;

                /* an unknown ifindex is silently ignored here */
                netdev = __dev_get_by_index(netns, ifindex);
                if (netdev) {
                        other = NULL;
                        if (netdev->ieee80211_ptr)
                                other = wiphy_to_rdev(
                                        netdev->ieee80211_ptr->wiphy);

                        /* not wireless device -- return error */
                        if (!other)
                                return ERR_PTR(-EINVAL);

                        /* mismatch -- return error */
                        if (rdev && other != rdev)
                                return ERR_PTR(-EINVAL);

                        rdev = other;
                }
        }

        if (!rdev || netns != wiphy_net(&rdev->wiphy))
                return ERR_PTR(-ENODEV);

        return rdev;
}

/*
 * Find the registered device that the attributes of the generic netlink
 * request @info refer to, by handing them to __cfg80211_rdev_from_attrs().
 *
 * The return value may be an ERR_PTR()-encoded error, so callers must
 * test it with IS_ERR() before using it.
 */
static struct cfg80211_registered_device *
cfg80211_get_dev_from_info(struct net *netns, struct genl_info *info)
{
        struct nlattr **attrs = info->attrs;

        return __cfg80211_rdev_from_attrs(netns, attrs);
}

/*
 * Validation callback for a beacon head attribute.
 *
 * Checks that the payload is long enough to hold the fixed part of either
 * an S1G beacon or a regular beacon (chosen from the frame control field),
 * that the header length implied by the frame control matches that frame
 * layout, and that the bytes following the fixed part parse as a complete
 * sequence of elements.
 *
 * Returns 0 if the head is well formed; otherwise records an extack
 * message pointing at @attr and returns -EINVAL.
 */
static int validate_beacon_head(const struct nlattr *attr,
                                struct netlink_ext_ack *extack)
{
        const struct ieee80211_mgmt *mgmt = nla_data(attr);
        const u8 *ies = nla_data(attr);
        unsigned int ies_len = nla_len(attr);
        unsigned int fixedlen, hdrlen;
        const struct element *elem;

        /* frame_control has to be present before it can be inspected */
        if (ies_len < offsetofend(typeof(*mgmt), frame_control))
                goto malformed;

        if (ieee80211_is_s1g_beacon(mgmt->frame_control)) {
                hdrlen = offsetof(struct ieee80211_ext, u.s1g_beacon);
                fixedlen = offsetof(struct ieee80211_ext,
                                    u.s1g_beacon.variable);
        } else {
                hdrlen = offsetof(struct ieee80211_mgmt, u.beacon);
                fixedlen = offsetof(struct ieee80211_mgmt,
                                    u.beacon.variable);
        }

        if (ies_len < fixedlen ||
            ieee80211_hdrlen(mgmt->frame_control) != hdrlen)
                goto malformed;

        /* step over the fixed part; what remains must be elements */
        ies += fixedlen;
        ies_len -= fixedlen;

        for_each_element(elem, ies, ies_len) {
                /* only walking; completeness is checked afterwards */
        }

        if (!for_each_element_completed(elem, ies, ies_len))
                goto malformed;

        return 0;

malformed:
        NL_SET_ERR_MSG_ATTR(extack, attr, "malformed beacon head");
        return -EINVAL;
}

/*
 * Validation callback for attributes that carry information elements.
 *
 * Walks the payload element by element and accepts it only if the walk
 * consumed the buffer completely. On failure an extack message pointing
 * at @attr is recorded and -EINVAL is returned; on success returns 0.
 */
static int validate_ie_attr(const struct nlattr *attr,
                            struct netlink_ext_ack *extack)
{
        const u8 *ies = nla_data(attr);
        unsigned int ies_len = nla_len(attr);
        const struct element *elem;

        for_each_element(elem, ies, ies_len) {
                /* only walking; completeness is checked afterwards */
        }

        if (!for_each_element_completed(elem, ies, ies_len)) {
                NL_SET_ERR_MSG_ATTR(extack, attr,
                                    "malformed information elements");
                return -EINVAL;
        }

        return 0;
}

/*
 * Validation callback for an HE capabilities attribute.
 *
 * Accepts the attribute only if ieee80211_he_capa_size_ok() approves the
 * payload and its length. On failure an extack message pointing at @attr
 * is recorded, matching what validate_beacon_head() and validate_ie_attr()
 * do, and -EINVAL is returned; on success returns 0.
 */
static int validate_he_capa(const struct nlattr *attr,
                            struct netlink_ext_ack *extack)
{
        if (!ieee80211_he_capa_size_ok(nla_data(attr), nla_len(attr))) {
                /* tell userspace which attribute was rejected and why */
                NL_SET_ERR_MSG_ATTR(extack, attr,
                                    "malformed HE capabilities");
                return -EINVAL;
        }

        return 0;
}

/*
 * policy for the attributes
 *
 * Declared ahead of its initializer because nl80211_pmsr_peer_attr_policy
 * refers to it (as a nested policy) before the full table is defined.
 */
static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR];

/*
 * Policy for the FTM responder attributes: an enable flag plus two binary
 * blobs (LCI and civic location), each limited to U8_MAX bytes.
 */
static const struct nla_policy
nl80211_ftm_responder_policy[NL80211_FTM_RESP_ATTR_MAX + 1] = {
        [NL80211_FTM_RESP_ATTR_ENABLED] = { .type = NLA_FLAG, },
        [NL80211_FTM_RESP_ATTR_LCI] = { .type = NLA_BINARY,
                                        .len = U8_MAX },
        [NL80211_FTM_RESP_ATTR_CIVICLOC] = { .type = NLA_BINARY,
                                             .len = U8_MAX },
};

/*
 * Policy for the attributes of an FTM peer measurement request.
 *
 * The burst exponent and burst duration are u8 values capped at 15, and
 * the number of FTMs per burst is a u8 capped at 31; the remaining
 * entries are plain flags or unconstrained integers.
 */
static const struct nla_policy
nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = {
        [NL80211_PMSR_FTM_REQ_ATTR_ASAP] = { .type = NLA_FLAG },
        [NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE] = { .type = NLA_U32 },
        [NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP] =
                NLA_POLICY_MAX(NLA_U8, 15),
        [NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD] = { .type = NLA_U16 },
        [NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION] =
                NLA_POLICY_MAX(NLA_U8, 15),
        [NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST] =
                NLA_POLICY_MAX(NLA_U8, 31),
        [NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES] = { .type = NLA_U8 },
        [NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI] = { .type = NLA_FLAG },
        [NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC] = { .type = NLA_FLAG },
        [NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED] = { .type = NLA_FLAG },
        [NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED] = { .type = NLA_FLAG },
        [NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK] = { .type = NLA_FLAG },
        [NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR] = { .type = NLA_U8 },
};

/*
 * Policy for the per-type data of a peer measurement request; FTM is the
 * only type listed and is validated with the FTM request policy.
 */
static const struct nla_policy
nl80211_pmsr_req_data_policy[NL80211_PMSR_TYPE_MAX + 1] = {
        [NL80211_PMSR_TYPE_FTM] =
                NLA_POLICY_NESTED(nl80211_pmsr_ftm_req_attr_policy),
};

/*
 * Policy for a peer measurement request: nested per-type data and a flag
 * asking for the AP TSF.
 */
static const struct nla_policy
nl80211_pmsr_req_attr_policy[NL80211_PMSR_REQ_ATTR_MAX + 1] = {
        [NL80211_PMSR_REQ_ATTR_DATA] =
                NLA_POLICY_NESTED(nl80211_pmsr_req_data_policy),
        [NL80211_PMSR_REQ_ATTR_GET_AP_TSF] = { .type = NLA_FLAG },
};

/*
 * Policy for one peer of a peer measurement request.
 *
 * The channel is nested and validated against the top-level nl80211_policy
 * (hence its earlier forward declaration). The RESP attribute is rejected
 * when it appears in input.
 */
static const struct nla_policy
nl80211_pmsr_peer_attr_policy[NL80211_PMSR_PEER_ATTR_MAX + 1] = {
        [NL80211_PMSR_PEER_ATTR_ADDR] = NLA_POLICY_ETH_ADDR,
        [NL80211_PMSR_PEER_ATTR_CHAN] = NLA_POLICY_NESTED(nl80211_policy),
        [NL80211_PMSR_PEER_ATTR_REQ] =
                NLA_POLICY_NESTED(nl80211_pmsr_req_attr_policy),
        [NL80211_PMSR_PEER_ATTR_RESP] = { .type = NLA_REJECT },
};

/*
 * Top-level policy for peer measurement attributes.
 *
 * Only the array of peers is accepted as input; the other four attributes
 * are marked NLA_REJECT.
 */
static const struct nla_policy
nl80211_pmsr_attr_policy[NL80211_PMSR_ATTR_MAX + 1] = {
        [NL80211_PMSR_ATTR_MAX_PEERS] = { .type = NLA_REJECT },
        [NL80211_PMSR_ATTR_REPORT_AP_TSF] = { .type = NLA_REJECT },
        [NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR] = { .type = NLA_REJECT },
        [NL80211_PMSR_ATTR_TYPE_CAPA] = { .type = NLA_REJECT },
        [NL80211_PMSR_ATTR_PEERS] =
                NLA_POLICY_NESTED_ARRAY(nl80211_pmsr_peer_attr_policy),
};

/*
 * Policy for the HE OBSS PD attributes.
 *
 * The three offset attributes are u8 values restricted to 1..20, and both
 * bitmaps must be exactly 8 bytes long.
 */
static const struct nla_policy
he_obss_pd_policy[NL80211_HE_OBSS_PD_ATTR_MAX + 1] = {
        [NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET] =
                NLA_POLICY_RANGE(NLA_U8, 1, 20),
        [NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET] =
                NLA_POLICY_RANGE(NLA_U8, 1, 20),
        [NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET] =
                NLA_POLICY_RANGE(NLA_U8, 1, 20),
        [NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP] =
                NLA_POLICY_EXACT_LEN(8),
        [NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP] =
                NLA_POLICY_EXACT_LEN(8),
        [NL80211_HE_OBSS_PD_ATTR_SR_CTRL] = { .type = NLA_U8 },
};

/*
 * Policy for the HE BSS color attributes: the color is a u8 restricted to
 * 1..63, the other two attributes are flags.
 */
static const struct nla_policy
he_bss_color_policy[NL80211_HE_BSS_COLOR_ATTR_MAX + 1] = {
        [NL80211_HE_BSS_COLOR_ATTR_COLOR] = NLA_POLICY_RANGE(NLA_U8, 1, 63),
        [NL80211_HE_BSS_COLOR_ATTR_DISABLED] = { .type = NLA_FLAG },
        [NL80211_HE_BSS_COLOR_ATTR_PARTIAL] = { .type = NLA_FLAG },
};

/*
 * Policy for the TX rate attributes.
 *
 * Legacy and HT rates are binary blobs bounded by the respective
 * NL80211_MAX_SUPP_* constants; the VHT and HE entries must match the size
 * of their structures exactly; the HE GI and HE LTF values are range
 * checked against the first and last enumerators named below.
 */
static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = {
        [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY,
                                    .len = NL80211_MAX_SUPP_RATES },
        [NL80211_TXRATE_HT] = { .type = NLA_BINARY,
                                .len = NL80211_MAX_SUPP_HT_RATES },
        [NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)),
        [NL80211_TXRATE_GI] = { .type = NLA_U8 },
        [NL80211_TXRATE_HE] = NLA_POLICY_EXACT_LEN(sizeof(struct nl80211_txrate_he)),
        [NL80211_TXRATE_HE_GI] =  NLA_POLICY_RANGE(NLA_U8,
                                                   NL80211_RATE_INFO_HE_GI_0_8,
                                                   NL80211_RATE_INFO_HE_GI_3_2),
        [NL80211_TXRATE_HE_LTF] = NLA_POLICY_RANGE(NLA_U8,
                                                   NL80211_RATE_INFO_HE_1XLTF,
                                                   NL80211_RATE_INFO_HE_4XLTF),
};

/*
 * Policy for the per-TID configuration attributes.
 *
 * The TID attribute is a u16 restricted to 1..0xff, the retry counts must
 * be at least 1, the control attributes are capped at
 * NL80211_TID_CONFIG_DISABLE, and the TX rate is a nested attribute
 * validated with nl80211_txattr_policy.
 */
static const struct nla_policy
nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = {
        [NL80211_TID_CONFIG_ATTR_VIF_SUPP] = { .type = NLA_U64 },
        [NL80211_TID_CONFIG_ATTR_PEER_SUPP] = { .type = NLA_U64 },
        [NL80211_TID_CONFIG_ATTR_OVERRIDE] = { .type = NLA_FLAG },
        [NL80211_TID_CONFIG_ATTR_TIDS] = NLA_POLICY_RANGE(NLA_U16, 1, 0xff),
        [NL80211_TID_CONFIG_ATTR_NOACK] =
                        NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE),
        [NL80211_TID_CONFIG_ATTR_RETRY_SHORT] = NLA_POLICY_MIN(NLA_U8, 1),
        [NL80211_TID_CONFIG_ATTR_RETRY_LONG] = NLA_POLICY_MIN(NLA_U8, 1),
        [NL80211_TID_CONFIG_ATTR_AMPDU_CTRL] =
                        NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE),
        [NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL] =
                        NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE),
        [NL80211_TID_CONFIG_ATTR_AMSDU_CTRL] =
                        NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE),
        [NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE] =
                        NLA_POLICY_MAX(NLA_U8, NL80211_TX_RATE_FIXED),
        [NL80211_TID_CONFIG_ATTR_TX_RATE] =
                        NLA_POLICY_NESTED(nl80211_txattr_policy),
};

/*
 * Policy for the FILS discovery attributes: both interval values are u32
 * capped at 10000, and the template is a binary blob whose length must lie
 * between NL80211_FILS_DISCOVERY_TMPL_MIN_LEN and IEEE80211_MAX_DATA_LEN.
 */
static const struct nla_policy
nl80211_fils_discovery_policy[NL80211_FILS_DISCOVERY_ATTR_MAX + 1] = {
        [NL80211_FILS_DISCOVERY_ATTR_INT_MIN] = NLA_POLICY_MAX(NLA_U32, 10000),
        [NL80211_FILS_DISCOVERY_ATTR_INT_MAX] = NLA_POLICY_MAX(NLA_U32, 10000),
        [NL80211_FILS_DISCOVERY_ATTR_TMPL] =
                        NLA_POLICY_RANGE(NLA_BINARY,
                                         NL80211_FILS_DISCOVERY_TMPL_MIN_LEN,
                                         IEEE80211_MAX_DATA_LEN),
};

/*
 * Policy for the unsolicited broadcast probe response attributes: the
 * interval is a u32 capped at 20 and the template is a binary blob of at
 * most IEEE80211_MAX_DATA_LEN bytes.
 */
static const struct nla_policy
nl80211_unsol_bcast_probe_resp_policy[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX + 1] = {
        [NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT] = NLA_POLICY_MAX(NLA_U32, 20),
        [NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL] = { .type = NLA_BINARY,
                                                       .len = IEEE80211_MAX_DATA_LEN }
};

/*
 * Policy for one SAR specification entry: a signed 32-bit power value and
 * an unsigned 32-bit range index. Used as the element policy of the
 * NL80211_SAR_ATTR_SPECS array in sar_policy.
 */
static const struct nla_policy
sar_specs_policy[NL80211_SAR_ATTR_SPECS_MAX + 1] = {
        [NL80211_SAR_ATTR_SPECS_POWER] = { .type = NLA_S32 },
        [NL80211_SAR_ATTR_SPECS_RANGE_INDEX] = {.type = NLA_U32 },
};

/*
 * Policy for the SAR attributes: a u32 type and a nested array of
 * specification entries validated with sar_specs_policy.
 *
 * NUM_NL80211_SAR_TYPE is the number of SAR types defined in the nl80211
 * uapi enum, not a valid type itself, so the largest value accepted for
 * the type is one below it.
 */
static const struct nla_policy
sar_policy[NL80211_SAR_ATTR_MAX + 1] = {
        [NL80211_SAR_ATTR_TYPE] =
                NLA_POLICY_MAX(NLA_U32, NUM_NL80211_SAR_TYPE - 1),
        [NL80211_SAR_ATTR_SPECS] = NLA_POLICY_NESTED_ARRAY(sar_specs_policy),
};

/*
 * Policy for the MBSSID configuration attributes: the maximum number of
 * interfaces must be at least 2 and the maximum EMA profile periodicity at
 * least 1; the index, transmitting ifindex and EMA flag are unconstrained
 * beyond their types.
 */
static const struct nla_policy
nl80211_mbssid_config_policy[NL80211_MBSSID_CONFIG_ATTR_MAX + 1] = {
        [NL80211_MBSSID_CONFIG_ATTR_MAX_INTERFACES] = NLA_POLICY_MIN(NLA_U8, 2),
        [NL80211_MBSSID_CONFIG_ATTR_MAX_EMA_PROFILE_PERIODICITY] =
                                                NLA_POLICY_MIN(NLA_U8, 1),
        [NL80211_MBSSID_CONFIG_ATTR_INDEX] = { .type = NLA_U8 },
        [NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX] = { .type = NLA_U32 },
        [NL80211_MBSSID_CONFIG_ATTR_EMA] = { .type = NLA_FLAG },
};

/* Policy for the station WME attributes; both are plain u8 values. */
static const struct nla_policy
nl80211_sta_wme_policy[NL80211_STA_WME_MAX + 1] = {
        [NL80211_STA_WME_UAPSD_QUEUES] = { .type = NLA_U8 },
        [NL80211_STA_WME_MAX_SP] = { .type = NLA_U8 },
};

/*
 * Allowed value range 0..0xffff, i.e. values that fit in 16 bits.
 * NOTE(review): presumably referenced by the puncturing bitmap entry of
 * nl80211_policy, which is outside the visible part of the file - confirm.
 */
static const struct netlink_range_validation nl80211_punct_bitmap_range = {
        .min = 0,
        .max = 0xffff,
};

/*
 * Top-level nl80211 attribute validation policy, indexed by NL80211_ATTR_*
 * and sized to NUM_NL80211_ATTR entries.  It is handed to the netlink
 * parser (see nl80211_prepare_wdev_dump()) and is also reused as the nested
 * policy for NL80211_ATTR_COLOR_CHANGE_ELEMS and NL80211_ATTR_MLO_LINKS.
 *
 * Bounds given to NLA_POLICY_MIN/MAX/RANGE are inclusive, so limits derived
 * from a "number of ..." constant must subtract one.
 */
static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
        [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD },
        [NL80211_ATTR_WIPHY] = { .type = NLA_U32 },
        [NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING,
                                      .len = 20-1 },
        [NL80211_ATTR_WIPHY_TXQ_PARAMS] = { .type = NLA_NESTED },

        [NL80211_ATTR_WIPHY_FREQ] = { .type = NLA_U32 },
        [NL80211_ATTR_WIPHY_CHANNEL_TYPE] = { .type = NLA_U32 },
        [NL80211_ATTR_WIPHY_EDMG_CHANNELS] = NLA_POLICY_RANGE(NLA_U8,
                                                NL80211_EDMG_CHANNELS_MIN,
                                                NL80211_EDMG_CHANNELS_MAX),
        [NL80211_ATTR_WIPHY_EDMG_BW_CONFIG] = NLA_POLICY_RANGE(NLA_U8,
                                                NL80211_EDMG_BW_CONFIG_MIN,
                                                NL80211_EDMG_BW_CONFIG_MAX),

        [NL80211_ATTR_CHANNEL_WIDTH] = { .type = NLA_U32 },
        [NL80211_ATTR_CENTER_FREQ1] = { .type = NLA_U32 },
        [NL80211_ATTR_CENTER_FREQ1_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999),
        [NL80211_ATTR_CENTER_FREQ2] = { .type = NLA_U32 },

        [NL80211_ATTR_WIPHY_RETRY_SHORT] = NLA_POLICY_MIN(NLA_U8, 1),
        [NL80211_ATTR_WIPHY_RETRY_LONG] = NLA_POLICY_MIN(NLA_U8, 1),
        [NL80211_ATTR_WIPHY_FRAG_THRESHOLD] = { .type = NLA_U32 },
        [NL80211_ATTR_WIPHY_RTS_THRESHOLD] = { .type = NLA_U32 },
        [NL80211_ATTR_WIPHY_COVERAGE_CLASS] = { .type = NLA_U8 },
        [NL80211_ATTR_WIPHY_DYN_ACK] = { .type = NLA_FLAG },

        [NL80211_ATTR_IFTYPE] = NLA_POLICY_MAX(NLA_U32, NL80211_IFTYPE_MAX),
        [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 },
        [NL80211_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 },

        [NL80211_ATTR_MAC] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN),
        [NL80211_ATTR_PREV_BSSID] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN),

        [NL80211_ATTR_KEY] = { .type = NLA_NESTED, },
        [NL80211_ATTR_KEY_DATA] = { .type = NLA_BINARY,
                                    .len = WLAN_MAX_KEY_LEN },
        [NL80211_ATTR_KEY_IDX] = NLA_POLICY_MAX(NLA_U8, 7),
        [NL80211_ATTR_KEY_CIPHER] = { .type = NLA_U32 },
        [NL80211_ATTR_KEY_DEFAULT] = { .type = NLA_FLAG },
        [NL80211_ATTR_KEY_SEQ] = { .type = NLA_BINARY, .len = 16 },
        /*
         * Same inclusive bound as NL80211_KEY_TYPE in nl80211_key_policy;
         * the NUM_* sentinel itself is not a key type.
         */
        [NL80211_ATTR_KEY_TYPE] =
                NLA_POLICY_MAX(NLA_U32, NUM_NL80211_KEYTYPES - 1),

        [NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 },
        [NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 },
        [NL80211_ATTR_BEACON_HEAD] =
                NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_beacon_head,
                                       IEEE80211_MAX_DATA_LEN),
        [NL80211_ATTR_BEACON_TAIL] =
                NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
                                       IEEE80211_MAX_DATA_LEN),
        [NL80211_ATTR_STA_AID] =
                NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID),
        [NL80211_ATTR_STA_FLAGS] = { .type = NLA_NESTED },
        [NL80211_ATTR_STA_LISTEN_INTERVAL] = { .type = NLA_U16 },
        [NL80211_ATTR_STA_SUPPORTED_RATES] = { .type = NLA_BINARY,
                                               .len = NL80211_MAX_SUPP_RATES },
        [NL80211_ATTR_STA_PLINK_ACTION] =
                NLA_POLICY_MAX(NLA_U8, NUM_NL80211_PLINK_ACTIONS - 1),
        [NL80211_ATTR_STA_TX_POWER_SETTING] =
                NLA_POLICY_RANGE(NLA_U8,
                                 NL80211_TX_POWER_AUTOMATIC,
                                 NL80211_TX_POWER_FIXED),
        [NL80211_ATTR_STA_TX_POWER] = { .type = NLA_S16 },
        [NL80211_ATTR_STA_VLAN] = { .type = NLA_U32 },
        [NL80211_ATTR_MNTR_FLAGS] = { /* NLA_NESTED can't be empty */ },
        [NL80211_ATTR_MESH_ID] = { .type = NLA_BINARY,
                                   .len = IEEE80211_MAX_MESH_ID_LEN },
        [NL80211_ATTR_MPATH_NEXT_HOP] = NLA_POLICY_ETH_ADDR_COMPAT,

        /* allow 3 for NUL-termination, we used to declare this NLA_STRING */
        [NL80211_ATTR_REG_ALPHA2] = NLA_POLICY_RANGE(NLA_BINARY, 2, 3),
        [NL80211_ATTR_REG_RULES] = { .type = NLA_NESTED },

        [NL80211_ATTR_BSS_CTS_PROT] = { .type = NLA_U8 },
        [NL80211_ATTR_BSS_SHORT_PREAMBLE] = { .type = NLA_U8 },
        [NL80211_ATTR_BSS_SHORT_SLOT_TIME] = { .type = NLA_U8 },
        [NL80211_ATTR_BSS_BASIC_RATES] = { .type = NLA_BINARY,
                                           .len = NL80211_MAX_SUPP_RATES },
        [NL80211_ATTR_BSS_HT_OPMODE] = { .type = NLA_U16 },

        [NL80211_ATTR_MESH_CONFIG] = { .type = NLA_NESTED },
        [NL80211_ATTR_SUPPORT_MESH_AUTH] = { .type = NLA_FLAG },

        [NL80211_ATTR_HT_CAPABILITY] = NLA_POLICY_EXACT_LEN_WARN(NL80211_HT_CAPABILITY_LEN),

        [NL80211_ATTR_MGMT_SUBTYPE] = { .type = NLA_U8 },
        [NL80211_ATTR_IE] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
                                                   validate_ie_attr,
                                                   IEEE80211_MAX_DATA_LEN),
        [NL80211_ATTR_SCAN_FREQUENCIES] = { .type = NLA_NESTED },
        [NL80211_ATTR_SCAN_SSIDS] = { .type = NLA_NESTED },

        [NL80211_ATTR_SSID] = { .type = NLA_BINARY,
                                .len = IEEE80211_MAX_SSID_LEN },
        [NL80211_ATTR_AUTH_TYPE] = { .type = NLA_U32 },
        [NL80211_ATTR_REASON_CODE] = { .type = NLA_U16 },
        [NL80211_ATTR_FREQ_FIXED] = { .type = NLA_FLAG },
        [NL80211_ATTR_TIMED_OUT] = { .type = NLA_FLAG },
        [NL80211_ATTR_USE_MFP] = NLA_POLICY_RANGE(NLA_U32,
                                                  NL80211_MFP_NO,
                                                  NL80211_MFP_OPTIONAL),
        [NL80211_ATTR_STA_FLAGS2] =
                NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_sta_flag_update)),
        [NL80211_ATTR_CONTROL_PORT] = { .type = NLA_FLAG },
        [NL80211_ATTR_CONTROL_PORT_ETHERTYPE] = { .type = NLA_U16 },
        [NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG },
        [NL80211_ATTR_CONTROL_PORT_OVER_NL80211] = { .type = NLA_FLAG },
        [NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG },
        [NL80211_ATTR_STATUS_CODE] = { .type = NLA_U16 },
        [NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 },
        [NL80211_ATTR_WPA_VERSIONS] =
                NLA_POLICY_RANGE(NLA_U32, 0,
                                 NL80211_WPA_VERSION_1 |
                                 NL80211_WPA_VERSION_2 |
                                 NL80211_WPA_VERSION_3),
        [NL80211_ATTR_PID] = { .type = NLA_U32 },
        [NL80211_ATTR_4ADDR] = { .type = NLA_U8 },
        [NL80211_ATTR_PMKID] = NLA_POLICY_EXACT_LEN_WARN(WLAN_PMKID_LEN),
        [NL80211_ATTR_DURATION] = { .type = NLA_U32 },
        [NL80211_ATTR_COOKIE] = { .type = NLA_U64 },
        [NL80211_ATTR_TX_RATES] = { .type = NLA_NESTED },
        [NL80211_ATTR_FRAME] = { .type = NLA_BINARY,
                                 .len = IEEE80211_MAX_DATA_LEN },
        [NL80211_ATTR_FRAME_MATCH] = { .type = NLA_BINARY, },
        [NL80211_ATTR_PS_STATE] = NLA_POLICY_RANGE(NLA_U32,
                                                   NL80211_PS_DISABLED,
                                                   NL80211_PS_ENABLED),
        [NL80211_ATTR_CQM] = { .type = NLA_NESTED, },
        [NL80211_ATTR_LOCAL_STATE_CHANGE] = { .type = NLA_FLAG },
        [NL80211_ATTR_AP_ISOLATE] = { .type = NLA_U8 },
        [NL80211_ATTR_WIPHY_TX_POWER_SETTING] = { .type = NLA_U32 },
        [NL80211_ATTR_WIPHY_TX_POWER_LEVEL] = { .type = NLA_U32 },
        [NL80211_ATTR_FRAME_TYPE] = { .type = NLA_U16 },
        [NL80211_ATTR_WIPHY_ANTENNA_TX] = { .type = NLA_U32 },
        [NL80211_ATTR_WIPHY_ANTENNA_RX] = { .type = NLA_U32 },
        [NL80211_ATTR_MCAST_RATE] = { .type = NLA_U32 },
        [NL80211_ATTR_OFFCHANNEL_TX_OK] = { .type = NLA_FLAG },
        [NL80211_ATTR_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED },
        [NL80211_ATTR_WOWLAN_TRIGGERS] = { .type = NLA_NESTED },
        [NL80211_ATTR_STA_PLINK_STATE] =
                NLA_POLICY_MAX(NLA_U8, NUM_NL80211_PLINK_STATES - 1),
        [NL80211_ATTR_MEASUREMENT_DURATION] = { .type = NLA_U16 },
        [NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY] = { .type = NLA_FLAG },
        [NL80211_ATTR_MESH_PEER_AID] =
                NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID),
        [NL80211_ATTR_SCHED_SCAN_INTERVAL] = { .type = NLA_U32 },
        [NL80211_ATTR_REKEY_DATA] = { .type = NLA_NESTED },
        [NL80211_ATTR_SCAN_SUPP_RATES] = { .type = NLA_NESTED },
        [NL80211_ATTR_HIDDEN_SSID] =
                NLA_POLICY_RANGE(NLA_U32,
                                 NL80211_HIDDEN_SSID_NOT_IN_USE,
                                 NL80211_HIDDEN_SSID_ZERO_CONTENTS),
        [NL80211_ATTR_IE_PROBE_RESP] =
                NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
                                       IEEE80211_MAX_DATA_LEN),
        [NL80211_ATTR_IE_ASSOC_RESP] =
                NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
                                       IEEE80211_MAX_DATA_LEN),
        [NL80211_ATTR_ROAM_SUPPORT] = { .type = NLA_FLAG },
        [NL80211_ATTR_STA_WME] = NLA_POLICY_NESTED(nl80211_sta_wme_policy),
        [NL80211_ATTR_SCHED_SCAN_MATCH] = { .type = NLA_NESTED },
        [NL80211_ATTR_TX_NO_CCK_RATE] = { .type = NLA_FLAG },
        [NL80211_ATTR_TDLS_ACTION] = { .type = NLA_U8 },
        [NL80211_ATTR_TDLS_DIALOG_TOKEN] = { .type = NLA_U8 },
        [NL80211_ATTR_TDLS_OPERATION] = { .type = NLA_U8 },
        [NL80211_ATTR_TDLS_SUPPORT] = { .type = NLA_FLAG },
        [NL80211_ATTR_TDLS_EXTERNAL_SETUP] = { .type = NLA_FLAG },
        [NL80211_ATTR_TDLS_INITIATOR] = { .type = NLA_FLAG },
        [NL80211_ATTR_DONT_WAIT_FOR_ACK] = { .type = NLA_FLAG },
        [NL80211_ATTR_PROBE_RESP] = { .type = NLA_BINARY,
                                      .len = IEEE80211_MAX_DATA_LEN },
        [NL80211_ATTR_DFS_REGION] = { .type = NLA_U8 },
        [NL80211_ATTR_DISABLE_HT] = { .type = NLA_FLAG },
        [NL80211_ATTR_HT_CAPABILITY_MASK] = {
                .len = NL80211_HT_CAPABILITY_LEN
        },
        [NL80211_ATTR_NOACK_MAP] = { .type = NLA_U16 },
        [NL80211_ATTR_INACTIVITY_TIMEOUT] = { .type = NLA_U16 },
        [NL80211_ATTR_BG_SCAN_PERIOD] = { .type = NLA_U16 },
        [NL80211_ATTR_WDEV] = { .type = NLA_U64 },
        [NL80211_ATTR_USER_REG_HINT_TYPE] = { .type = NLA_U32 },

        /* need to include at least Auth Transaction and Status Code */
        [NL80211_ATTR_AUTH_DATA] = NLA_POLICY_MIN_LEN(4),

        [NL80211_ATTR_VHT_CAPABILITY] = NLA_POLICY_EXACT_LEN_WARN(NL80211_VHT_CAPABILITY_LEN),
        [NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 },
        [NL80211_ATTR_P2P_CTWINDOW] = NLA_POLICY_MAX(NLA_U8, 127),
        [NL80211_ATTR_P2P_OPPPS] = NLA_POLICY_MAX(NLA_U8, 1),
        [NL80211_ATTR_LOCAL_MESH_POWER_MODE] =
                NLA_POLICY_RANGE(NLA_U32,
                                 NL80211_MESH_POWER_UNKNOWN + 1,
                                 NL80211_MESH_POWER_MAX),
        [NL80211_ATTR_ACL_POLICY] = { .type = NLA_U32 },
        [NL80211_ATTR_MAC_ADDRS] = { .type = NLA_NESTED },
        [NL80211_ATTR_STA_CAPABILITY] = { .type = NLA_U16 },
        [NL80211_ATTR_STA_EXT_CAPABILITY] = { .type = NLA_BINARY, },
        [NL80211_ATTR_SPLIT_WIPHY_DUMP] = { .type = NLA_FLAG, },
        [NL80211_ATTR_DISABLE_VHT] = { .type = NLA_FLAG },
        [NL80211_ATTR_VHT_CAPABILITY_MASK] = {
                .len = NL80211_VHT_CAPABILITY_LEN,
        },
        [NL80211_ATTR_MDID] = { .type = NLA_U16 },
        [NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY,
                                  .len = IEEE80211_MAX_DATA_LEN },
        [NL80211_ATTR_CRIT_PROT_ID] = { .type = NLA_U16 },
        [NL80211_ATTR_MAX_CRIT_PROT_DURATION] =
                NLA_POLICY_MAX(NLA_U16, NL80211_CRIT_PROTO_MAX_DURATION),
        [NL80211_ATTR_PEER_AID] =
                NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID),
        [NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 },
        [NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG },
        [NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED },
        [NL80211_ATTR_CNTDWN_OFFS_BEACON] = { .type = NLA_BINARY },
        [NL80211_ATTR_CNTDWN_OFFS_PRESP] = { .type = NLA_BINARY },
        [NL80211_ATTR_STA_SUPPORTED_CHANNELS] = NLA_POLICY_MIN_LEN(2),
        /*
         * The value of the Length field of the Supported Operating
         * Classes element is between 2 and 253.
         */
        [NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES] =
                NLA_POLICY_RANGE(NLA_BINARY, 2, 253),
        [NL80211_ATTR_HANDLE_DFS] = { .type = NLA_FLAG },
        [NL80211_ATTR_OPMODE_NOTIF] = { .type = NLA_U8 },
        [NL80211_ATTR_VENDOR_ID] = { .type = NLA_U32 },
        [NL80211_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 },
        [NL80211_ATTR_VENDOR_DATA] = { .type = NLA_BINARY },
        [NL80211_ATTR_QOS_MAP] = NLA_POLICY_RANGE(NLA_BINARY,
                                                  IEEE80211_QOS_MAP_LEN_MIN,
                                                  IEEE80211_QOS_MAP_LEN_MAX),
        [NL80211_ATTR_MAC_HINT] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN),
        [NL80211_ATTR_WIPHY_FREQ_HINT] = { .type = NLA_U32 },
        [NL80211_ATTR_TDLS_PEER_CAPABILITY] = { .type = NLA_U32 },
        [NL80211_ATTR_SOCKET_OWNER] = { .type = NLA_FLAG },
        [NL80211_ATTR_CSA_C_OFFSETS_TX] = { .type = NLA_BINARY },
        [NL80211_ATTR_USE_RRM] = { .type = NLA_FLAG },
        [NL80211_ATTR_TSID] = NLA_POLICY_MAX(NLA_U8, IEEE80211_NUM_TIDS - 1),
        [NL80211_ATTR_USER_PRIO] =
                NLA_POLICY_MAX(NLA_U8, IEEE80211_NUM_UPS - 1),
        [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 },
        [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 },
        [NL80211_ATTR_OPER_CLASS] = { .type = NLA_U8 },
        [NL80211_ATTR_MAC_MASK] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN),
        [NL80211_ATTR_WIPHY_SELF_MANAGED_REG] = { .type = NLA_FLAG },
        [NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 },
        [NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 },
        [NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG },
        [NL80211_ATTR_PBSS] = { .type = NLA_FLAG },
        [NL80211_ATTR_BSS_SELECT] = { .type = NLA_NESTED },
        [NL80211_ATTR_STA_SUPPORT_P2P_PS] =
                NLA_POLICY_MAX(NLA_U8, NUM_NL80211_P2P_PS_STATUS - 1),
        [NL80211_ATTR_MU_MIMO_GROUP_DATA] = {
                .len = VHT_MUMIMO_GROUPS_DATA_LEN
        },
        [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN),
        [NL80211_ATTR_NAN_MASTER_PREF] = NLA_POLICY_MIN(NLA_U8, 1),
        [NL80211_ATTR_BANDS] = { .type = NLA_U32 },
        [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED },
        [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY,
                                    .len = FILS_MAX_KEK_LEN },
        [NL80211_ATTR_FILS_NONCES] = NLA_POLICY_EXACT_LEN_WARN(2 * FILS_NONCE_LEN),
        [NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, },
        [NL80211_ATTR_BSSID] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN),
        [NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] = { .type = NLA_S8 },
        [NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST] = {
                .len = sizeof(struct nl80211_bss_select_rssi_adjust)
        },
        [NL80211_ATTR_TIMEOUT_REASON] = { .type = NLA_U32 },
        [NL80211_ATTR_FILS_ERP_USERNAME] = { .type = NLA_BINARY,
                                             .len = FILS_ERP_MAX_USERNAME_LEN },
        [NL80211_ATTR_FILS_ERP_REALM] = { .type = NLA_BINARY,
                                          .len = FILS_ERP_MAX_REALM_LEN },
        [NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] = { .type = NLA_U16 },
        [NL80211_ATTR_FILS_ERP_RRK] = { .type = NLA_BINARY,
                                        .len = FILS_ERP_MAX_RRK_LEN },
        [NL80211_ATTR_FILS_CACHE_ID] = NLA_POLICY_EXACT_LEN_WARN(2),
        [NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN },
        [NL80211_ATTR_PMKR0_NAME] = NLA_POLICY_EXACT_LEN(WLAN_PMK_NAME_LEN),
        [NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG },
        [NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG },

        [NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 },
        [NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 },
        [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 },
        [NL80211_ATTR_HE_CAPABILITY] =
                NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_he_capa,
                                       NL80211_HE_MAX_CAPABILITY_LEN),
        [NL80211_ATTR_FTM_RESPONDER] =
                NLA_POLICY_NESTED(nl80211_ftm_responder_policy),
        [NL80211_ATTR_TIMEOUT] = NLA_POLICY_MIN(NLA_U32, 1),
        [NL80211_ATTR_PEER_MEASUREMENTS] =
                NLA_POLICY_NESTED(nl80211_pmsr_attr_policy),
        [NL80211_ATTR_AIRTIME_WEIGHT] = NLA_POLICY_MIN(NLA_U16, 1),
        [NL80211_ATTR_SAE_PASSWORD] = { .type = NLA_BINARY,
                                        .len = SAE_PASSWORD_MAX_LEN },
        [NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG },
        [NL80211_ATTR_HE_OBSS_PD] = NLA_POLICY_NESTED(he_obss_pd_policy),
        [NL80211_ATTR_VLAN_ID] = NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2),
        [NL80211_ATTR_HE_BSS_COLOR] = NLA_POLICY_NESTED(he_bss_color_policy),
        [NL80211_ATTR_TID_CONFIG] =
                NLA_POLICY_NESTED_ARRAY(nl80211_tid_config_attr_policy),
        [NL80211_ATTR_CONTROL_PORT_NO_PREAUTH] = { .type = NLA_FLAG },
        [NL80211_ATTR_PMK_LIFETIME] = NLA_POLICY_MIN(NLA_U32, 1),
        [NL80211_ATTR_PMK_REAUTH_THRESHOLD] = NLA_POLICY_RANGE(NLA_U8, 1, 100),
        [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG },
        [NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999),
        [NL80211_ATTR_SCAN_FREQ_KHZ] = { .type = NLA_NESTED },
        [NL80211_ATTR_HE_6GHZ_CAPABILITY] =
                NLA_POLICY_EXACT_LEN(sizeof(struct ieee80211_he_6ghz_capa)),
        [NL80211_ATTR_FILS_DISCOVERY] =
                NLA_POLICY_NESTED(nl80211_fils_discovery_policy),
        [NL80211_ATTR_UNSOL_BCAST_PROBE_RESP] =
                NLA_POLICY_NESTED(nl80211_unsol_bcast_probe_resp_policy),
        [NL80211_ATTR_S1G_CAPABILITY] =
                NLA_POLICY_EXACT_LEN(IEEE80211_S1G_CAPABILITY_LEN),
        [NL80211_ATTR_S1G_CAPABILITY_MASK] =
                NLA_POLICY_EXACT_LEN(IEEE80211_S1G_CAPABILITY_LEN),
        [NL80211_ATTR_SAE_PWE] =
                NLA_POLICY_RANGE(NLA_U8, NL80211_SAE_PWE_HUNT_AND_PECK,
                                 NL80211_SAE_PWE_BOTH),
        [NL80211_ATTR_RECONNECT_REQUESTED] = { .type = NLA_REJECT },
        [NL80211_ATTR_SAR_SPEC] = NLA_POLICY_NESTED(sar_policy),
        [NL80211_ATTR_DISABLE_HE] = { .type = NLA_FLAG },
        [NL80211_ATTR_OBSS_COLOR_BITMAP] = { .type = NLA_U64 },
        [NL80211_ATTR_COLOR_CHANGE_COUNT] = { .type = NLA_U8 },
        [NL80211_ATTR_COLOR_CHANGE_COLOR] = { .type = NLA_U8 },
        [NL80211_ATTR_COLOR_CHANGE_ELEMS] = NLA_POLICY_NESTED(nl80211_policy),
        [NL80211_ATTR_MBSSID_CONFIG] =
                        NLA_POLICY_NESTED(nl80211_mbssid_config_policy),
        [NL80211_ATTR_MBSSID_ELEMS] = { .type = NLA_NESTED },
        [NL80211_ATTR_RADAR_BACKGROUND] = { .type = NLA_FLAG },
        [NL80211_ATTR_AP_SETTINGS_FLAGS] = { .type = NLA_U32 },
        [NL80211_ATTR_EHT_CAPABILITY] =
                NLA_POLICY_RANGE(NLA_BINARY,
                                 NL80211_EHT_MIN_CAPABILITY_LEN,
                                 NL80211_EHT_MAX_CAPABILITY_LEN),
        [NL80211_ATTR_DISABLE_EHT] = { .type = NLA_FLAG },
        [NL80211_ATTR_MLO_LINKS] =
                NLA_POLICY_NESTED_ARRAY(nl80211_policy),
        /*
         * The range is inclusive and IEEE80211_MLD_MAX_NUM_LINKS is a
         * count of links, so the highest acceptable link ID is one less;
         * allowing the count itself would be an off-by-one.
         */
        [NL80211_ATTR_MLO_LINK_ID] =
                NLA_POLICY_RANGE(NLA_U8, 0, IEEE80211_MLD_MAX_NUM_LINKS - 1),
        [NL80211_ATTR_MLD_ADDR] = NLA_POLICY_EXACT_LEN(ETH_ALEN),
        [NL80211_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG },
        [NL80211_ATTR_MAX_NUM_AKM_SUITES] = { .type = NLA_REJECT },
        [NL80211_ATTR_PUNCT_BITMAP] =
                NLA_POLICY_FULL_RANGE(NLA_U32, &nl80211_punct_bitmap_range),

        [NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS] = { .type = NLA_U16 },
        [NL80211_ATTR_HW_TIMESTAMP_ENABLED] = { .type = NLA_FLAG },
        [NL80211_ATTR_EMA_RNR_ELEMS] = { .type = NLA_NESTED },
        [NL80211_ATTR_MLO_LINK_DISABLED] = { .type = NLA_FLAG },
        [NL80211_ATTR_BSS_DUMP_INCLUDE_USE_DATA] = { .type = NLA_FLAG },
        [NL80211_ATTR_MLO_TTLM_DLINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8),
        [NL80211_ATTR_MLO_TTLM_ULINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8),
        [NL80211_ATTR_ASSOC_SPP_AMSDU] = { .type = NLA_FLAG },
};

/*
 * Validation policy for the NL80211_KEY_* attributes, indexed by
 * attribute number up to NL80211_KEY_MAX.
 */
static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = {
        [NL80211_KEY_DATA] = {
                .type = NLA_BINARY,
                .len = WLAN_MAX_KEY_LEN,
        },
        [NL80211_KEY_IDX]           = { .type = NLA_U8 },
        [NL80211_KEY_CIPHER]        = { .type = NLA_U32 },
        [NL80211_KEY_SEQ] = {
                .type = NLA_BINARY,
                .len = 16,
        },
        [NL80211_KEY_DEFAULT]       = { .type = NLA_FLAG },
        [NL80211_KEY_DEFAULT_MGMT]  = { .type = NLA_FLAG },
        /* inclusive upper bound: last enumerator before the NUM_* value */
        [NL80211_KEY_TYPE] =
                NLA_POLICY_MAX(NLA_U32, NUM_NL80211_KEYTYPES - 1),
        [NL80211_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED },
        [NL80211_KEY_MODE] =
                NLA_POLICY_RANGE(NLA_U8, 0, NL80211_KEY_SET_TX),
};

/*
 * Validation policy for the key default flags; both known entries are
 * presence-only flag attributes.
 */
static const struct nla_policy
nl80211_key_default_policy[NUM_NL80211_KEY_DEFAULT_TYPES] = {
        [NL80211_KEY_DEFAULT_TYPE_UNICAST]   = { .type = NLA_FLAG },
        [NL80211_KEY_DEFAULT_TYPE_MULTICAST] = { .type = NLA_FLAG },
};

#ifdef CONFIG_PM
/*
 * Validation policy for the WoWLAN trigger attributes
 * (NL80211_WOWLAN_TRIG_*).  Triggers are either plain flags or nested
 * containers whose contents are not described here.
 */
static const struct nla_policy
nl80211_wowlan_policy[NUM_NL80211_WOWLAN_TRIG] = {
        /* flag-only triggers */
        [NL80211_WOWLAN_TRIG_ANY]               = { .type = NLA_FLAG },
        [NL80211_WOWLAN_TRIG_DISCONNECT]        = { .type = NLA_FLAG },
        [NL80211_WOWLAN_TRIG_MAGIC_PKT]         = { .type = NLA_FLAG },
        [NL80211_WOWLAN_TRIG_GTK_REKEY_FAILURE] = { .type = NLA_FLAG },
        [NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST] = { .type = NLA_FLAG },
        [NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE]    = { .type = NLA_FLAG },
        [NL80211_WOWLAN_TRIG_RFKILL_RELEASE]    = { .type = NLA_FLAG },

        /* nested triggers */
        [NL80211_WOWLAN_TRIG_PKT_PATTERN]       = { .type = NLA_NESTED },
        [NL80211_WOWLAN_TRIG_TCP_CONNECTION]    = { .type = NLA_NESTED },
        [NL80211_WOWLAN_TRIG_NET_DETECT]        = { .type = NLA_NESTED },
};

/*
 * Validation policy for the NL80211_WOWLAN_TCP_* attributes.
 */
static const struct nla_policy
nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = {
        [NL80211_WOWLAN_TCP_SRC_IPV4] = { .type = NLA_U32 },
        [NL80211_WOWLAN_TCP_DST_IPV4] = { .type = NLA_U32 },
        [NL80211_WOWLAN_TCP_DST_MAC] =
                NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN),
        [NL80211_WOWLAN_TCP_SRC_PORT] = { .type = NLA_U16 },
        [NL80211_WOWLAN_TCP_DST_PORT] = { .type = NLA_U16 },
        /* payload must carry at least one byte */
        [NL80211_WOWLAN_TCP_DATA_PAYLOAD] = NLA_POLICY_MIN_LEN(1),
        /* untyped entries: only a length (the struct size) is declared */
        [NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ] = {
                .len = sizeof(struct nl80211_wowlan_tcp_data_seq),
        },
        [NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN] = {
                .len = sizeof(struct nl80211_wowlan_tcp_data_token),
        },
        [NL80211_WOWLAN_TCP_DATA_INTERVAL] = { .type = NLA_U32 },
        /* wake payload and mask must each carry at least one byte */
        [NL80211_WOWLAN_TCP_WAKE_PAYLOAD] = NLA_POLICY_MIN_LEN(1),
        [NL80211_WOWLAN_TCP_WAKE_MASK] = NLA_POLICY_MIN_LEN(1),
};
#endif /* CONFIG_PM */

/*
 * Validation policy for coalesce rule attributes
 * (NL80211_ATTR_COALESCE_RULE_*).
 */
static const struct nla_policy
nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = {
        [NL80211_ATTR_COALESCE_RULE_DELAY] = { .type = NLA_U32 },
        /* u32 limited to the MATCH..NO_MATCH condition values, inclusive */
        [NL80211_ATTR_COALESCE_RULE_CONDITION] =
                NLA_POLICY_RANGE(NLA_U32,
                                 NL80211_COALESCE_CONDITION_MATCH,
                                 NL80211_COALESCE_CONDITION_NO_MATCH),
        [NL80211_ATTR_COALESCE_RULE_PKT_PATTERN] = { .type = NLA_NESTED },
};

/*
 * Validation policy for GTK rekey offload attributes
 * (NL80211_REKEY_DATA_*).
 */
static const struct nla_policy
nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = {
        /* binary, limited by NL80211_KEK_EXT_LEN */
        [NL80211_REKEY_DATA_KEK] = { .type = NLA_BINARY,
                                     .len = NL80211_KEK_EXT_LEN },
        /* binary, limited by NL80211_KCK_EXT_LEN_32 */
        [NL80211_REKEY_DATA_KCK] = { .type = NLA_BINARY,
                                     .len = NL80211_KCK_EXT_LEN_32 },
        /* replay counter must be exactly NL80211_REPLAY_CTR_LEN bytes */
        [NL80211_REKEY_DATA_REPLAY_CTR] =
                NLA_POLICY_EXACT_LEN(NL80211_REPLAY_CTR_LEN),
        [NL80211_REKEY_DATA_AKM] = { .type = NLA_U32 },
};

/*
 * Validation policy for scheduled-scan match attributes
 * (NL80211_SCHED_SCAN_MATCH_ATTR_*).
 */
static const struct nla_policy
nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = {
        [NL80211_SCHED_SCAN_MATCH_ATTR_SSID] = {
                .type = NLA_BINARY,
                .len = IEEE80211_MAX_SSID_LEN,
        },
        [NL80211_SCHED_SCAN_MATCH_ATTR_BSSID] =
                NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN),
        [NL80211_SCHED_SCAN_MATCH_ATTR_RSSI] = { .type = NLA_U32 },
};

/*
 * Validation policy for scheduled-scan plan attributes
 * (NL80211_SCHED_SCAN_PLAN_*); both are u32 values.
 */
static const struct nla_policy
nl80211_plan_policy[NL80211_SCHED_SCAN_PLAN_MAX + 1] = {
        [NL80211_SCHED_SCAN_PLAN_INTERVAL]   = { .type = NLA_U32 },
        [NL80211_SCHED_SCAN_PLAN_ITERATIONS] = { .type = NLA_U32 },
};

/*
 * Validation policy for BSS selection attributes
 * (NL80211_BSS_SELECT_ATTR_*).
 */
static const struct nla_policy
nl80211_bss_select_policy[NL80211_BSS_SELECT_ATTR_MAX + 1] = {
        [NL80211_BSS_SELECT_ATTR_RSSI]      = { .type = NLA_FLAG },
        [NL80211_BSS_SELECT_ATTR_BAND_PREF] = { .type = NLA_U32 },
        /* untyped entry: only a length (the struct size) is declared */
        [NL80211_BSS_SELECT_ATTR_RSSI_ADJUST] = {
                .len = sizeof(struct nl80211_bss_select_rssi_adjust),
        },
};

/*
 * Validation policy for NAN function attributes (NL80211_NAN_FUNC_*).
 */
static const struct nla_policy
nl80211_nan_func_policy[NL80211_NAN_FUNC_ATTR_MAX + 1] = {
        /* u8, at most NL80211_NAN_FUNC_MAX_TYPE */
        [NL80211_NAN_FUNC_TYPE] =
                NLA_POLICY_MAX(NLA_U8, NL80211_NAN_FUNC_MAX_TYPE),
        /* untyped entry: only a length is declared */
        [NL80211_NAN_FUNC_SERVICE_ID] = {
                .len = NL80211_NAN_FUNC_SERVICE_ID_LEN,
        },
        [NL80211_NAN_FUNC_PUBLISH_TYPE]     = { .type = NLA_U8 },
        [NL80211_NAN_FUNC_PUBLISH_BCAST]    = { .type = NLA_FLAG },
        [NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE] = { .type = NLA_FLAG },
        [NL80211_NAN_FUNC_FOLLOW_UP_ID]     = { .type = NLA_U8 },
        [NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID] = { .type = NLA_U8 },
        [NL80211_NAN_FUNC_FOLLOW_UP_DEST] =
                NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN),
        [NL80211_NAN_FUNC_CLOSE_RANGE]      = { .type = NLA_FLAG },
        [NL80211_NAN_FUNC_TTL]              = { .type = NLA_U32 },
        [NL80211_NAN_FUNC_SERVICE_INFO] = {
                .type = NLA_BINARY,
                .len = NL80211_NAN_FUNC_SERVICE_SPEC_INFO_MAX_LEN,
        },
        [NL80211_NAN_FUNC_SRF]              = { .type = NLA_NESTED },
        [NL80211_NAN_FUNC_RX_MATCH_FILTER]  = { .type = NLA_NESTED },
        [NL80211_NAN_FUNC_TX_MATCH_FILTER]  = { .type = NLA_NESTED },
        [NL80211_NAN_FUNC_INSTANCE_ID]      = { .type = NLA_U8 },
        [NL80211_NAN_FUNC_TERM_REASON]      = { .type = NLA_U8 },
};

/*
 * Validation policy for Service Response Filter attributes
 * (NL80211_NAN_SRF_*).
 */
static const struct nla_policy
nl80211_nan_srf_policy[NL80211_NAN_SRF_ATTR_MAX + 1] = {
        [NL80211_NAN_SRF_INCLUDE]   = { .type = NLA_FLAG },
        [NL80211_NAN_SRF_BF] = {
                .type = NLA_BINARY,
                .len = NL80211_NAN_FUNC_SRF_MAX_LEN,
        },
        [NL80211_NAN_SRF_BF_IDX]    = { .type = NLA_U8 },
        [NL80211_NAN_SRF_MAC_ADDRS] = { .type = NLA_NESTED },
};

/*
 * Validation policy for packet pattern attributes (NL80211_PKTPAT_*):
 * mask and pattern are binary blobs with no length limit declared here,
 * the offset is a u32.
 */
static const struct nla_policy
nl80211_packet_pattern_policy[MAX_NL80211_PKTPAT + 1] = {
        [NL80211_PKTPAT_MASK]    = { .type = NLA_BINARY },
        [NL80211_PKTPAT_PATTERN] = { .type = NLA_BINARY },
        [NL80211_PKTPAT_OFFSET]  = { .type = NLA_U32 },
};

static int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
                                     struct cfg80211_registered_device **rdev,
                                     struct wireless_dev **wdev,
                                     struct nlattr **attrbuf)
{
        int err;

        if (!cb->args[0]) {
                struct nlattr **attrbuf_free = NULL;

                if (!attrbuf) {
                        attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf),
                                          GFP_KERNEL);
                        if (!attrbuf)
                                return -ENOMEM;
                        attrbuf_free = attrbuf;
                }

                err = nlmsg_parse_deprecated(cb->nlh,
                                             GENL_HDRLEN + nl80211_fam.hdrsize,
                                             attrbuf, nl80211_fam.maxattr,
                                             nl80211_policy, NULL);
                if (err) {
                        kfree(attrbuf_free);
                        return err;
                }

                rtnl_lock();
                *wdev = __cfg80211_wdev_from_attrs(NULL, sock_net(cb->skb->sk),
                                                   attrbuf);
                kfree(attrbuf_free);
                if (IS_ERR(*wdev)) {
                        rtnl_unlock();
                        return PTR_ERR(*wdev);
                }
                *rdev = wiphy_to_rdev((*wdev)->wiphy);
                mutex_lock(&(*rdev)->wiphy.mtx);
                rtnl_unlock();
                /* 0 is the first index - add 1 to parse only once */
                cb->args[0] = (*rdev)->wiphy_idx + 1;
                cb->args[1] = (*wdev)->identifier;
        } else {
                /* subtract the 1 again here */
                struct wiphy *wiphy;
                struct wireless_dev *tmp;

                rtnl_lock();
                wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1);
                if (!wiphy) {
                        rtnl_unlock();
                        return -ENODEV;
                }
                *rdev = wiphy_to_rdev(wiphy);
                *wdev = NULL;

                list_for_each_entry(tmp, &(*rdev)->wiphy.wdev_list, list) {
                        if (tmp->identifier == cb->args[1]) {
                                *wdev = tmp;
                                break;
                        }
                }

                if (!*wdev) {
                        rtnl_unlock();
                        return -ENODEV;
                }
                mutex_lock(&(*rdev)->wiphy.mtx);
                rtnl_unlock();
        }

        return 0;
}

/*
 * Message building helper: start an nl80211 message in @skb by adding the
 * generic netlink header for the nl80211 family.
 *
 * @skb: buffer the message is built in
 * @portid: netlink port ID passed through to genlmsg_put()
 * @seq: sequence number passed through to genlmsg_put()
 * @flags: netlink message flags
 * @cmd: nl80211 command number
 *
 * Returns whatever genlmsg_put() returns.
 */
void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
                     int flags, u8 cmd)
{
        void *hdr;

        /* nl80211 has no private header, so the generic one is all we add */
        hdr = genlmsg_put(skb, portid, seq, &nl80211_fam, flags, cmd);

        return hdr;
}

/*
 * Append the client WMM parameters of @rule to @msg.
 *
 * An NL80211_FREQUENCY_ATTR_WMM nest is opened and, for each of the
 * IEEE80211_NUM_ACS entries of rule->wmm_rule.client[], a nest numbered by
 * the entry's index is written containing CW min, CW max, AIFSN and TXOP
 * (taken from the entry's cot field).
 *
 * Returns 0 on success or -ENOBUFS as soon as any attribute cannot be
 * added; nests opened so far are not closed or cancelled here.
 */
static int nl80211_msg_put_wmm_rules(struct sk_buff *msg,
                                     const struct ieee80211_reg_rule *rule)
{
        struct nlattr *wmm_nest;
        struct nlattr *ac_nest;
        int ac;

        wmm_nest = nla_nest_start_noflag(msg, NL80211_FREQUENCY_ATTR_WMM);
        if (!wmm_nest)
                return -ENOBUFS;

        for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
                /* one nest per entry, numbered by its index */
                ac_nest = nla_nest_start_noflag(msg, ac);
                if (!ac_nest)
                        return -ENOBUFS;

                if (nla_put_u16(msg, NL80211_WMMR_CW_MIN,
                                rule->wmm_rule.client[ac].cw_min))
                        return -ENOBUFS;

                if (nla_put_u16(msg, NL80211_WMMR_CW_MAX,
                                rule->wmm_rule.client[ac].cw_max))
                        return -ENOBUFS;

                if (nla_put_u8(msg, NL80211_WMMR_AIFSN,
                               rule->wmm_rule.client[ac].aifsn))
                        return -ENOBUFS;

                if (nla_put_u16(msg, NL80211_WMMR_TXOP,
                                rule->wmm_rule.client[ac].cot))
                        return -ENOBUFS;

                nla_nest_end(msg, ac_nest);
        }

        nla_nest_end(msg, wmm_nest);

        return 0;
}

/*
 * Describe @chan in @msg as a set of NL80211_FREQUENCY_ATTR_* attributes.
 *
 * With @large unset only the basic attributes are written, and channels
 * flagged NO_10MHZ/NO_20MHZ or having a non-zero frequency offset are
 * skipped: nothing is added and 0 is returned.  With @large set the DFS
 * details, the extended flag attributes and, if the regulatory rule
 * looked up for the channel has them, the WMM rules are written too.
 *
 * Returns 0 on success or -ENOBUFS when an attribute could not be added.
 */
static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
                                   struct ieee80211_channel *chan,
                                   bool large)
{
        if (!large) {
                /* Such channels have to be left out of the list
                 * altogether so that old user-space tools keep working
                 */
                if (chan->flags &
                    (IEEE80211_CHAN_NO_10MHZ | IEEE80211_CHAN_NO_20MHZ))
                        return 0;
                if (chan->freq_offset)
                        return 0;
        }

        if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_FREQ,
                        chan->center_freq) ||
            nla_put_u32(msg, NL80211_FREQUENCY_ATTR_OFFSET,
                        chan->freq_offset))
                return -ENOBUFS;

        if ((chan->flags & IEEE80211_CHAN_PSD) &&
            nla_put_s8(msg, NL80211_FREQUENCY_ATTR_PSD, chan->psd))
                return -ENOBUFS;

        if ((chan->flags & IEEE80211_CHAN_DISABLED) &&
            nla_put_flag(msg, NL80211_FREQUENCY_ATTR_DISABLED))
                return -ENOBUFS;

        /* NO_IR is reported through both the NO_IR and NO_IBSS attributes */
        if ((chan->flags & IEEE80211_CHAN_NO_IR) &&
            (nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_IR) ||
             nla_put_flag(msg, __NL80211_FREQUENCY_ATTR_NO_IBSS)))
                return -ENOBUFS;

        if (chan->flags & IEEE80211_CHAN_RADAR) {
                if (nla_put_flag(msg, NL80211_FREQUENCY_ATTR_RADAR))
                        return -ENOBUFS;

                /* DFS state details are only part of the large format */
                if (large) {
                        u32 dfs_ms =
                                elapsed_jiffies_msecs(chan->dfs_state_entered);

                        if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_DFS_STATE,
                                        chan->dfs_state) ||
                            nla_put_u32(msg, NL80211_FREQUENCY_ATTR_DFS_TIME,
                                        dfs_ms) ||
                            nla_put_u32(msg,
                                        NL80211_FREQUENCY_ATTR_DFS_CAC_TIME,
                                        chan->dfs_cac_ms))
                                return -ENOBUFS;
                }
        }

        /* Extended flags: one flag attribute per channel flag that is set */
        if (large &&
            (((chan->flags & IEEE80211_CHAN_NO_HT40MINUS) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HT40_MINUS)) ||
             ((chan->flags & IEEE80211_CHAN_NO_HT40PLUS) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HT40_PLUS)) ||
             ((chan->flags & IEEE80211_CHAN_NO_80MHZ) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_80MHZ)) ||
             ((chan->flags & IEEE80211_CHAN_NO_160MHZ) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_160MHZ)) ||
             ((chan->flags & IEEE80211_CHAN_INDOOR_ONLY) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_INDOOR_ONLY)) ||
             ((chan->flags & IEEE80211_CHAN_IR_CONCURRENT) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_IR_CONCURRENT)) ||
             ((chan->flags & IEEE80211_CHAN_NO_20MHZ) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_20MHZ)) ||
             ((chan->flags & IEEE80211_CHAN_NO_10MHZ) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_10MHZ)) ||
             ((chan->flags & IEEE80211_CHAN_NO_HE) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HE)) ||
             ((chan->flags & IEEE80211_CHAN_1MHZ) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_1MHZ)) ||
             ((chan->flags & IEEE80211_CHAN_2MHZ) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_2MHZ)) ||
             ((chan->flags & IEEE80211_CHAN_4MHZ) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_4MHZ)) ||
             ((chan->flags & IEEE80211_CHAN_8MHZ) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_8MHZ)) ||
             ((chan->flags & IEEE80211_CHAN_16MHZ) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_16MHZ)) ||
             ((chan->flags & IEEE80211_CHAN_NO_320MHZ) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_320MHZ)) ||
             ((chan->flags & IEEE80211_CHAN_NO_EHT) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_EHT)) ||
             ((chan->flags & IEEE80211_CHAN_DFS_CONCURRENT) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_DFS_CONCURRENT)) ||
             ((chan->flags & IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_6GHZ_VLP_CLIENT)) ||
             ((chan->flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT) &&
              nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_6GHZ_AFC_CLIENT))))
                return -ENOBUFS;

        if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER,
                        DBM_TO_MBM(chan->max_power)))
                return -ENOBUFS;

        if (large) {
                const struct ieee80211_reg_rule *rule;

                rule = freq_reg_info(wiphy, MHZ_TO_KHZ(chan->center_freq));

                /* a missing or failed rule lookup just means no WMM data */
                if (!IS_ERR_OR_NULL(rule) && rule->has_wmm &&
                    nl80211_msg_put_wmm_rules(msg, rule))
                        return -ENOBUFS;
        }

        return 0;
}

/*
 * Put the TXQ statistics in @txqstats into @msg, nested inside an
 * attribute of type @attrtype.  A value is only written when its bit is
 * set in txqstats->filled.
 *
 * Returns true on success, false if the nest could not be started or a
 * value could not be added.
 */
static bool nl80211_put_txq_stats(struct sk_buff *msg,
                                  struct cfg80211_txq_stats *txqstats,
                                  int attrtype)
{
        struct nlattr *nest;

/* Emit one u32 statistic if it is marked as filled; bails out on failure */
#define TXQ_STAT_PUT_U32(attr, memb)                                         \
        do {                                                                 \
                if ((txqstats->filled & BIT(NL80211_TXQ_STATS_ ## attr)) &&  \
                    nla_put_u32(msg, NL80211_TXQ_STATS_ ## attr,             \
                                txqstats->memb))                             \
                        return false;                                        \
        } while (0)

        nest = nla_nest_start_noflag(msg, attrtype);
        if (!nest)
                return false;

        TXQ_STAT_PUT_U32(BACKLOG_BYTES, backlog_bytes);
        TXQ_STAT_PUT_U32(BACKLOG_PACKETS, backlog_packets);
        TXQ_STAT_PUT_U32(FLOWS, flows);
        TXQ_STAT_PUT_U32(DROPS, drops);
        TXQ_STAT_PUT_U32(ECN_MARKS, ecn_marks);
        TXQ_STAT_PUT_U32(OVERLIMIT, overlimit);
        TXQ_STAT_PUT_U32(OVERMEMORY, overmemory);
        TXQ_STAT_PUT_U32(COLLISIONS, collisions);
        TXQ_STAT_PUT_U32(TX_BYTES, tx_bytes);
        TXQ_STAT_PUT_U32(TX_PACKETS, tx_packets);
        TXQ_STAT_PUT_U32(MAX_FLOWS, max_flows);

#undef TXQ_STAT_PUT_U32

        nla_nest_end(msg, nest);
        return true;
}

/* netlink command implementations */

/**
 * nl80211_link_id - fetch the MLO link ID from parsed attributes
 * @attrs: parsed attribute table to read %NL80211_ATTR_MLO_LINK_ID from
 *
 * Returns: the value of the link ID attribute, or 0 when the attribute
 * is absent
 *
 * The value is not checked against the links that were actually added,
 * so use this only from ops with %NL80211_FLAG_MLO_VALID_LINK_ID or
 * where the caller performs additional validation itself.
 */
static unsigned int nl80211_link_id(struct nlattr **attrs)
{
        struct nlattr *attr = attrs[NL80211_ATTR_MLO_LINK_ID];

        return attr ? nla_get_u8(attr) : 0;
}

/*
 * Like nl80211_link_id(), but an absent NL80211_ATTR_MLO_LINK_ID is
 * reported as -1 instead of 0, so callers can tell "not given" apart
 * from link 0.
 */
static int nl80211_link_id_or_invalid(struct nlattr **attrs)
{
        struct nlattr *attr = attrs[NL80211_ATTR_MLO_LINK_ID];

        return attr ? nla_get_u8(attr) : -1;
}

/* Key settings gathered from the netlink attributes of a request */
struct key_parse {
        /* key data, sequence, cipher and mode taken from the attributes */
        struct key_params p;
        /* key index; the parsers' callers preset -1 for "not given" */
        int idx;
        /* key type attribute value; nl80211_parse_key() presets -1 */
        int type;
        /* default key attribute was present */
        bool def;
        /* default management key attribute was present */
        bool defmgmt;
        /* default beacon key attribute was present */
        bool defbeacon;
        /* key is a default for unicast */
        bool def_uni;
        /* key is a default for multicast */
        bool def_multi;
};

static int nl80211_parse_key_new(struct genl_info *info, struct nlattr *key,
                                 struct key_parse *k)
{
        struct nlattr *tb[NL80211_KEY_MAX + 1];
        int err = nla_parse_nested_deprecated(tb, NL80211_KEY_MAX, key,
                                              nl80211_key_policy,
                                              info->extack);
        if (err)
                return err;

        k->def = !!tb[NL80211_KEY_DEFAULT];
        k->defmgmt = !!tb[NL80211_KEY_DEFAULT_MGMT];
        k->defbeacon = !!tb[NL80211_KEY_DEFAULT_BEACON];

        if (k->def) {
                k->def_uni = true;
                k->def_multi = true;
        }
        if (k->defmgmt || k->defbeacon)
                k->def_multi = true;

        if (tb[NL80211_KEY_IDX])
                k->idx = nla_get_u8(tb[NL80211_KEY_IDX]);

        if (tb[NL80211_KEY_DATA]) {
                k->p.key = nla_data(tb[NL80211_KEY_DATA]);
                k->p.key_len = nla_len(tb[NL80211_KEY_DATA]);
        }

        if (tb[NL80211_KEY_SEQ]) {
                k->p.seq = nla_data(tb[NL80211_KEY_SEQ]);
                k->p.seq_len = nla_len(tb[NL80211_KEY_SEQ]);
        }

        if (tb[NL80211_KEY_CIPHER])
                k->p.cipher = nla_get_u32(tb[NL80211_KEY_CIPHER]);

        if (tb[NL80211_KEY_TYPE])
                k->type = nla_get_u32(tb[NL80211_KEY_TYPE]);

        if (tb[NL80211_KEY_DEFAULT_TYPES]) {
                struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES];

                err = nla_parse_nested_deprecated(kdt,
                                                  NUM_NL80211_KEY_DEFAULT_TYPES - 1,
                                                  tb[NL80211_KEY_DEFAULT_TYPES],
                                                  nl80211_key_default_policy,
                                                  info->extack);
                if (err)
                        return err;

                k->def_uni = kdt[NL80211_KEY_DEFAULT_TYPE_UNICAST];
                k->def_multi = kdt[NL80211_KEY_DEFAULT_TYPE_MULTICAST];
        }

        if (tb[NL80211_KEY_MODE])
                k->p.mode = nla_get_u8(tb[NL80211_KEY_MODE]);

        return 0;
}

static int nl80211_parse_key_old(struct genl_info *info, struct key_parse *k)
{
        if (info->attrs[NL80211_ATTR_KEY_DATA]) {
                k->p.key = nla_data(info->attrs[NL80211_ATTR_KEY_DATA]);
                k->p.key_len = nla_len(info->attrs[NL80211_ATTR_KEY_DATA]);
        }

        if (info->attrs[NL80211_ATTR_KEY_SEQ]) {
                k->p.seq = nla_data(info->attrs[NL80211_ATTR_KEY_SEQ]);
                k->p.seq_len = nla_len(info->attrs[NL80211_ATTR_KEY_SEQ]);
        }

        if (info->attrs[NL80211_ATTR_KEY_IDX])
                k->idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]);

        if (info->attrs[NL80211_ATTR_KEY_CIPHER])
                k->p.cipher = nla_get_u32(info->attrs[NL80211_ATTR_KEY_CIPHER]);

        k->def = !!info->attrs[NL80211_ATTR_KEY_DEFAULT];
        k->defmgmt = !!info->attrs[NL80211_ATTR_KEY_DEFAULT_MGMT];

        if (k->def) {
                k->def_uni = true;
                k->def_multi = true;
        }
        if (k->defmgmt)
                k->def_multi = true;

        if (info->attrs[NL80211_ATTR_KEY_TYPE])
                k->type = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);

        if (info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES]) {
                struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES];
                int err = nla_parse_nested_deprecated(kdt,
                                                      NUM_NL80211_KEY_DEFAULT_TYPES - 1,
                                                      info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES],
                                                      nl80211_key_default_policy,
                                                      info->extack);
                if (err)
                        return err;

                k->def_uni = kdt[NL80211_KEY_DEFAULT_TYPE_UNICAST];
                k->def_multi = kdt[NL80211_KEY_DEFAULT_TYPE_MULTICAST];
        }

        return 0;
}

/*
 * Parse the key described by @info into @k and sanity check the result.
 *
 * @k is cleared first, with idx and type preset to -1.  The nested
 * NL80211_ATTR_KEY form is used when that attribute is present, the
 * top-level attributes otherwise.  Afterwards the default flags and,
 * if an index was given, the index range allowed for the kind of key
 * are verified.
 *
 * Returns 0 on success, a parse error, or -EINVAL (with an extack
 * message) when a check fails.
 */
static int nl80211_parse_key(struct genl_info *info, struct key_parse *k)
{
        int err, ndefaults;

        memset(k, 0, sizeof(*k));
        k->idx = -1;
        k->type = -1;

        err = info->attrs[NL80211_ATTR_KEY] ?
                nl80211_parse_key_new(info, info->attrs[NL80211_ATTR_KEY], k) :
                nl80211_parse_key_old(info, k);
        if (err)
                return err;

        ndefaults = k->def + k->defmgmt + k->defbeacon;
        if (ndefaults > 1) {
                GENL_SET_ERR_MSG(info,
                                 "key with multiple default flags is invalid");
                return -EINVAL;
        }

        if ((k->defmgmt || k->defbeacon) &&
            (k->def_uni || !k->def_multi)) {
                GENL_SET_ERR_MSG(info,
                                 "defmgmt/defbeacon key must be mcast");
                return -EINVAL;
        }

        /* without an index there is nothing left to validate */
        if (k->idx == -1)
                return 0;

        if (k->defmgmt) {
                if (k->idx < 4 || k->idx > 5) {
                        GENL_SET_ERR_MSG(info,
                                         "defmgmt key idx not 4 or 5");
                        return -EINVAL;
                }
                return 0;
        }

        if (k->defbeacon) {
                if (k->idx < 6 || k->idx > 7) {
                        GENL_SET_ERR_MSG(info,
                                         "defbeacon key idx not 6 or 7");
                        return -EINVAL;
                }
                return 0;
        }

        if (k->def) {
                if (k->idx < 0 || k->idx > 3) {
                        GENL_SET_ERR_MSG(info, "def key idx not 0-3");
                        return -EINVAL;
                }
                return 0;
        }

        if (k->idx < 0 || k->idx > 7) {
                GENL_SET_ERR_MSG(info, "key idx not 0-7");
                return -EINVAL;
        }

        return 0;
}

/*
 * Parse the keys nested in NL80211_ATTR_KEYS into a newly allocated
 * struct cfg80211_cached_keys.
 *
 * Every nested key must carry key data, use an index in 0-3, pass
 * cfg80211_validate_key_settings() and be a WEP40/WEP104 key.  Exactly
 * one of them must be flagged as default for both unicast and multicast;
 * its index is stored in result->def.  If @no_ht is non-NULL it is set
 * to true once a key has been accepted.
 *
 * Returns NULL when the attribute contains no nested key, an ERR_PTR()
 * on failure, or the allocated keys on success.
 * NOTE(review): the returned allocation presumably has to be freed by
 * the caller - confirm against the call sites.
 * NOTE(review): NL80211_ATTR_KEYS is dereferenced without a NULL check
 * here; presumably callers only call this when the attribute exists.
 */
static struct cfg80211_cached_keys *
nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
                       struct genl_info *info, bool *no_ht)
{
        struct nlattr *keys = info->attrs[NL80211_ATTR_KEYS];
        struct key_parse parse;
        struct nlattr *key;
        struct cfg80211_cached_keys *result;
        int rem, err, def = 0;
        bool have_key = false;

        /* only checks whether there is at least one nested entry */
        nla_for_each_nested(key, keys, rem) {
                have_key = true;
                break;
        }

        if (!have_key)
                return NULL;

        result = kzalloc(sizeof(*result), GFP_KERNEL);
        if (!result)
                return ERR_PTR(-ENOMEM);

        result->def = -1;

        nla_for_each_nested(key, keys, rem) {
                memset(&parse, 0, sizeof(parse));
                parse.idx = -1;

                err = nl80211_parse_key_new(info, key, &parse);
                if (err)
                        goto error;
                /* the checks below that jump out without setting err
                 * all fail with -EINVAL
                 */
                err = -EINVAL;
                if (!parse.p.key)
                        goto error;
                if (parse.idx < 0 || parse.idx > 3) {
                        GENL_SET_ERR_MSG(info, "key index out of range [0-3]");
                        goto error;
                }
                if (parse.def) {
                        if (def) {
                                GENL_SET_ERR_MSG(info,
                                                 "only one key can be default");
                                goto error;
                        }
                        def = 1;
                        result->def = parse.idx;
                        if (!parse.def_uni || !parse.def_multi)
                                goto error;
                } else if (parse.defmgmt)
                        goto error;
                err = cfg80211_validate_key_settings(rdev, &parse.p,
                                                     parse.idx, false, NULL);
                if (err)
                        goto error;
                if (parse.p.cipher != WLAN_CIPHER_SUITE_WEP40 &&
                    parse.p.cipher != WLAN_CIPHER_SUITE_WEP104) {
                        GENL_SET_ERR_MSG(info, "connect key must be WEP");
                        err = -EINVAL;
                        goto error;
                }
                result->params[parse.idx].cipher = parse.p.cipher;
                result->params[parse.idx].key_len = parse.p.key_len;
                result->params[parse.idx].key = result->data[parse.idx];
                memcpy(result->data[parse.idx], parse.p.key, parse.p.key_len);

                /* must be WEP key if we got here */
                if (no_ht)
                        *no_ht = true;
        }

        if (result->def < 0) {
                err = -EINVAL;
                GENL_SET_ERR_MSG(info, "need a default/TX key");
                goto error;
        }

        return result;
 error:
        /* keys accepted in earlier iterations were already copied into
         * result->data, so wipe the buffer before releasing it
         */
        kfree_sensitive(result);
        return ERR_PTR(err);
}

/*
 * Check whether key operations are currently permitted on @wdev, based
 * on its interface type.  The wiphy lock must be held (asserted).
 *
 * Returns 0 if allowed; -ENOLINK for an ad-hoc interface without a
 * current BSS or a station/P2P client that is not connected; -EINVAL
 * for NAN without the SECURE_NAN extended feature and for the
 * remaining interface types listed below.
 */
static int nl80211_key_allowed(struct wireless_dev *wdev)
{
        lockdep_assert_wiphy(wdev->wiphy);

        switch (wdev->iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_AP_VLAN:
        case NL80211_IFTYPE_P2P_GO:
        case NL80211_IFTYPE_MESH_POINT:
                return 0;
        case NL80211_IFTYPE_ADHOC:
                return wdev->u.ibss.current_bss ? 0 : -ENOLINK;
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_P2P_CLIENT:
                return wdev->connected ? 0 : -ENOLINK;
        case NL80211_IFTYPE_NAN:
                if (!wiphy_ext_feature_isset(wdev->wiphy,
                                             NL80211_EXT_FEATURE_SECURE_NAN))
                        return -EINVAL;
                return 0;
        case NL80211_IFTYPE_UNSPECIFIED:
        case NL80211_IFTYPE_OCB:
        case NL80211_IFTYPE_MONITOR:
        case NL80211_IFTYPE_P2P_DEVICE:
        case NL80211_IFTYPE_WDS:
        case NUM_NL80211_IFTYPES:
                return -EINVAL;
        }

        /* interface type values not covered by the switch are accepted */
        return 0;
}

/*
 * Look up the channel for @freq on @wiphy and return it unless it is
 * flagged as disabled.  Returns NULL when there is no such channel or
 * it is disabled.
 * NOTE(review): @freq is handed to ieee80211_get_channel_khz(), so it
 * is presumably expressed in kHz - confirm against the callers.
 */
static struct ieee80211_channel *nl80211_get_valid_chan(struct wiphy *wiphy,
                                                        u32 freq)
{
        struct ieee80211_channel *chan = ieee80211_get_channel_khz(wiphy, freq);

        if (chan && !(chan->flags & IEEE80211_CHAN_DISABLED))
                return chan;

        return NULL;
}

/*
 * Put the bitmap @ifmodes into @msg as a nested attribute of type @attr
 * that holds one flag attribute per set bit; the bit position is used
 * as the type of the flag attribute.
 *
 * Returns 0 on success or -ENOBUFS if something could not be added.
 */
static int nl80211_put_iftypes(struct sk_buff *msg, u32 attr, u16 ifmodes)
{
        struct nlattr *nest;
        int bit;

        nest = nla_nest_start_noflag(msg, attr);
        if (!nest)
                return -ENOBUFS;

        /* walk the bitmap from bit 0 up, stopping once no bits remain */
        for (bit = 0; ifmodes; bit++, ifmodes >>= 1) {
                if (!(ifmodes & 1))
                        continue;
                if (nla_put_flag(msg, bit))
                        return -ENOBUFS;
        }

        nla_nest_end(msg, nest);
        return 0;
}

/*
 * Put the interface combinations of @wiphy into @msg, nested inside
 * NL80211_ATTR_INTERFACE_COMBINATIONS.  Combinations and the limits
 * within each of them are numbered starting at 1.  The radar detection
 * widths/regions are only written when @large is set.
 *
 * Returns 0 on success or -ENOBUFS if something could not be added.
 */
static int nl80211_put_iface_combinations(struct wiphy *wiphy,
                                          struct sk_buff *msg,
                                          bool large)
{
        struct nlattr *combs_nest;
        int ci;

        combs_nest = nla_nest_start_noflag(msg,
                                           NL80211_ATTR_INTERFACE_COMBINATIONS);
        if (!combs_nest)
                return -ENOBUFS;

        for (ci = 0; ci < wiphy->n_iface_combinations; ci++) {
                const struct ieee80211_iface_combination *c =
                        &wiphy->iface_combinations[ci];
                struct nlattr *comb_nest, *limits_nest;
                int li;

                comb_nest = nla_nest_start_noflag(msg, ci + 1);
                if (!comb_nest)
                        return -ENOBUFS;

                limits_nest = nla_nest_start_noflag(msg,
                                                    NL80211_IFACE_COMB_LIMITS);
                if (!limits_nest)
                        return -ENOBUFS;

                for (li = 0; li < c->n_limits; li++) {
                        struct nlattr *limit_nest;

                        limit_nest = nla_nest_start_noflag(msg, li + 1);
                        if (!limit_nest)
                                return -ENOBUFS;

                        if (nla_put_u32(msg, NL80211_IFACE_LIMIT_MAX,
                                        c->limits[li].max) ||
                            nl80211_put_iftypes(msg, NL80211_IFACE_LIMIT_TYPES,
                                                c->limits[li].types))
                                return -ENOBUFS;

                        nla_nest_end(msg, limit_nest);
                }

                nla_nest_end(msg, limits_nest);

                if (c->beacon_int_infra_match &&
                    nla_put_flag(msg, NL80211_IFACE_COMB_STA_AP_BI_MATCH))
                        return -ENOBUFS;

                if (nla_put_u32(msg, NL80211_IFACE_COMB_NUM_CHANNELS,
                                c->num_different_channels))
                        return -ENOBUFS;
                if (nla_put_u32(msg, NL80211_IFACE_COMB_MAXNUM,
                                c->max_interfaces))
                        return -ENOBUFS;

                if (large) {
                        if (nla_put_u32(msg,
                                        NL80211_IFACE_COMB_RADAR_DETECT_WIDTHS,
                                        c->radar_detect_widths))
                                return -ENOBUFS;
                        if (nla_put_u32(msg,
                                        NL80211_IFACE_COMB_RADAR_DETECT_REGIONS,
                                        c->radar_detect_regions))
                                return -ENOBUFS;
                }

                /* a zero minimum GCD is simply not reported */
                if (c->beacon_int_min_gcd &&
                    nla_put_u32(msg, NL80211_IFACE_COMB_BI_MIN_GCD,
                                c->beacon_int_min_gcd))
                        return -ENOBUFS;

                nla_nest_end(msg, comb_nest);
        }

        nla_nest_end(msg, combs_nest);
        return 0;
}

#ifdef CONFIG_PM
/*
 * Put the WoWLAN TCP connection capabilities of @rdev into @msg, nested
 * inside NL80211_WOWLAN_TRIG_TCP_CONNECTION.
 *
 * Must only be called when rdev->wiphy.wowlan is set; it is dereferenced
 * unconditionally.  Nothing is written when no TCP support is described.
 *
 * Returns 0 on success (or when there is nothing to write) and -ENOBUFS
 * if an attribute could not be added.
 */
static int nl80211_send_wowlan_tcp_caps(struct cfg80211_registered_device *rdev,
                                        struct sk_buff *msg)
{
        const struct wiphy_wowlan_tcp_support *tcp = rdev->wiphy.wowlan->tcp;
        struct nlattr *nl_tcp;

        if (!tcp)
                return 0;

        nl_tcp = nla_nest_start_noflag(msg,
                                       NL80211_WOWLAN_TRIG_TCP_CONNECTION);
        if (!nl_tcp)
                return -ENOBUFS;

        /* written once; a second identical copy only wasted message space */
        if (nla_put_u32(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD,
                        tcp->data_payload_max))
                return -ENOBUFS;

        if (tcp->seq && nla_put_flag(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ))
                return -ENOBUFS;

        if (tcp->tok && nla_put(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN,
                                sizeof(*tcp->tok), tcp->tok))
                return -ENOBUFS;

        if (nla_put_u32(msg, NL80211_WOWLAN_TCP_DATA_INTERVAL,
                        tcp->data_interval_max))
                return -ENOBUFS;

        if (nla_put_u32(msg, NL80211_WOWLAN_TCP_WAKE_PAYLOAD,
                        tcp->wake_payload_max))
                return -ENOBUFS;

        nla_nest_end(msg, nl_tcp);
        return 0;
}

/*
 * Put the WoWLAN triggers supported by @rdev into @msg, nested inside
 * NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED.  Nothing is written when the
 * wiphy has no WoWLAN support description.  The TCP connection
 * capabilities are only included when @large is set.
 *
 * Returns 0 on success (or when there is nothing to write) and -ENOBUFS
 * if an attribute could not be added.
 */
static int nl80211_send_wowlan(struct sk_buff *msg,
                               struct cfg80211_registered_device *rdev,
                               bool large)
{
        const struct wiphy_wowlan_support *wowlan = rdev->wiphy.wowlan;
        struct nlattr *nest;

        if (!wowlan)
                return 0;

        nest = nla_nest_start_noflag(msg,
                                     NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED);
        if (!nest)
                return -ENOBUFS;

        /* one flag attribute per supported trigger */
        if ((wowlan->flags & WIPHY_WOWLAN_ANY) &&
            nla_put_flag(msg, NL80211_WOWLAN_TRIG_ANY))
                return -ENOBUFS;
        if ((wowlan->flags & WIPHY_WOWLAN_DISCONNECT) &&
            nla_put_flag(msg, NL80211_WOWLAN_TRIG_DISCONNECT))
                return -ENOBUFS;
        if ((wowlan->flags & WIPHY_WOWLAN_MAGIC_PKT) &&
            nla_put_flag(msg, NL80211_WOWLAN_TRIG_MAGIC_PKT))
                return -ENOBUFS;
        if ((wowlan->flags & WIPHY_WOWLAN_SUPPORTS_GTK_REKEY) &&
            nla_put_flag(msg, NL80211_WOWLAN_TRIG_GTK_REKEY_SUPPORTED))
                return -ENOBUFS;
        if ((wowlan->flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE) &&
            nla_put_flag(msg, NL80211_WOWLAN_TRIG_GTK_REKEY_FAILURE))
                return -ENOBUFS;
        if ((wowlan->flags & WIPHY_WOWLAN_EAP_IDENTITY_REQ) &&
            nla_put_flag(msg, NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST))
                return -ENOBUFS;
        if ((wowlan->flags & WIPHY_WOWLAN_4WAY_HANDSHAKE) &&
            nla_put_flag(msg, NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE))
                return -ENOBUFS;
        if ((wowlan->flags & WIPHY_WOWLAN_RFKILL_RELEASE) &&
            nla_put_flag(msg, NL80211_WOWLAN_TRIG_RFKILL_RELEASE))
                return -ENOBUFS;

        if (wowlan->n_patterns) {
                struct nl80211_pattern_support pat = {
                        .max_patterns = wowlan->n_patterns,
                        .min_pattern_len = wowlan->pattern_min_len,
                        .max_pattern_len = wowlan->pattern_max_len,
                        .max_pkt_offset = wowlan->max_pkt_offset,
                };

                if (nla_put(msg, NL80211_WOWLAN_TRIG_PKT_PATTERN,
                            sizeof(pat), &pat))
                        return -ENOBUFS;
        }

        if ((wowlan->flags & WIPHY_WOWLAN_NET_DETECT) &&
            nla_put_u32(msg, NL80211_WOWLAN_TRIG_NET_DETECT,
                        wowlan->max_nd_match_sets))
                return -ENOBUFS;

        if (large && nl80211_send_wowlan_tcp_caps(rdev, msg))
                return -ENOBUFS;

        nla_nest_end(msg, nest);
        return 0;
}
#endif

static int nl80211_send_coalesce(struct sk_buff *msg,
                                 struct cfg80211_registered_device *rdev)
{
        struct nl80211_coalesce_rule_support rule;

        if (!rdev->wiphy.coalesce)
                return 0;

        rule.max_rules = rdev->wiphy.coalesce->n_rules;
        rule.max_delay = rdev->wiphy.coalesce->max_delay;
        rule.pat.max_patterns = rdev->wiphy.coalesce->n_patterns;
        rule.pat.min_pattern_len = rdev->wiphy.coalesce->pattern_min_len;
        rule.pat.max_pattern_len = rdev->wiphy.coalesce->pattern_max_len;
        rule.pat.max_pkt_offset = rdev->wiphy.coalesce->max_pkt_offset;

        if (nla_put(msg, NL80211_ATTR_COALESCE_RULE, sizeof(rule), &rule))
                return -ENOBUFS;

        return 0;
}

/*
 * Put one interface-type specific capability entry (@iftdata) of @sband
 * into @msg: the interface types it applies to, then the HE
 * capabilities if present, the EHT capabilities if both EHT and HE are
 * present, the HE 6 GHz capabilities for the 6 GHz band, and any vendor
 * elements.
 *
 * Returns 0 on success or -ENOBUFS if an attribute could not be added.
 */
static int
nl80211_send_iftype_data(struct sk_buff *msg,
                         const struct ieee80211_supported_band *sband,
                         const struct ieee80211_sband_iftype_data *iftdata)
{
        const struct ieee80211_sta_he_cap *he = &iftdata->he_cap;
        const struct ieee80211_sta_eht_cap *eht = &iftdata->eht_cap;

        if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES,
                                iftdata->types_mask))
                return -ENOBUFS;

        if (he->has_he) {
                if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC,
                            sizeof(he->he_cap_elem.mac_cap_info),
                            he->he_cap_elem.mac_cap_info))
                        return -ENOBUFS;
                if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY,
                            sizeof(he->he_cap_elem.phy_cap_info),
                            he->he_cap_elem.phy_cap_info))
                        return -ENOBUFS;
                if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET,
                            sizeof(he->he_mcs_nss_supp),
                            &he->he_mcs_nss_supp))
                        return -ENOBUFS;
                if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE,
                            sizeof(he->ppe_thres), he->ppe_thres))
                        return -ENOBUFS;
        }

        if (eht->has_eht && he->has_he) {
                u8 mcs_nss_len, ppe_len;
                u16 ppe_hdr;
                bool is_ap;

                /* AP and P2P GO both count as "AP" for the MCS/NSS size */
                is_ap = !!(iftdata->types_mask &
                           (BIT(NL80211_IFTYPE_AP) |
                            BIT(NL80211_IFTYPE_P2P_GO)));

                /* MCS/NSS set and PPE thresholds are variable length, so
                 * their sizes are computed from the capability fields
                 */
                mcs_nss_len = ieee80211_eht_mcs_nss_size(&he->he_cap_elem,
                                                         &eht->eht_cap_elem,
                                                         is_ap);

                ppe_hdr = get_unaligned_le16(&eht->eht_ppe_thres[0]);
                ppe_len = ieee80211_eht_ppe_size(ppe_hdr,
                                                 eht->eht_cap_elem.phy_cap_info);

                if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MAC,
                            sizeof(eht->eht_cap_elem.mac_cap_info),
                            eht->eht_cap_elem.mac_cap_info))
                        return -ENOBUFS;
                if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PHY,
                            sizeof(eht->eht_cap_elem.phy_cap_info),
                            eht->eht_cap_elem.phy_cap_info))
                        return -ENOBUFS;
                if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MCS_SET,
                            mcs_nss_len, &eht->eht_mcs_nss_supp))
                        return -ENOBUFS;
                if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PPE,
                            ppe_len, eht->eht_ppe_thres))
                        return -ENOBUFS;
        }

        if (sband->band == NL80211_BAND_6GHZ &&
            nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA,
                    sizeof(iftdata->he_6ghz_capa),
                    &iftdata->he_6ghz_capa))
                return -ENOBUFS;

        /* vendor elements are optional: need both a buffer and a length */
        if (iftdata->vendor_elems.data && iftdata->vendor_elems.len &&
            nla_put(msg, NL80211_BAND_IFTYPE_ATTR_VENDOR_ELEMS,
                    iftdata->vendor_elems.len, iftdata->vendor_elems.data))
                return -ENOBUFS;

        return 0;
}

/*
 * Append the band-level capability attributes of @sband to @msg.
 *
 * Emitted in this order: HT info (when supported), VHT info (when
 * supported), per-interface-type data and EDMG info (both only when
 * @large is set), the nested list of bitrates, and finally the S1G
 * capabilities (S1GHz band only).
 *
 * Returns 0 on success, -ENOBUFS when an attribute does not fit, or the
 * error reported by nl80211_send_iftype_data().  Nests that are still
 * open at the point of failure are not cancelled here.
 */
static int nl80211_send_band_rateinfo(struct sk_buff *msg,
                                      struct ieee80211_supported_band *sband,
                                      bool large)
{
        struct nlattr *rates_nest;
        int idx;

        /* HT capabilities */
        if (sband->ht_cap.ht_supported) {
                if (nla_put(msg, NL80211_BAND_ATTR_HT_MCS_SET,
                            sizeof(sband->ht_cap.mcs), &sband->ht_cap.mcs))
                        return -ENOBUFS;
                if (nla_put_u16(msg, NL80211_BAND_ATTR_HT_CAPA,
                                sband->ht_cap.cap))
                        return -ENOBUFS;
                if (nla_put_u8(msg, NL80211_BAND_ATTR_HT_AMPDU_FACTOR,
                               sband->ht_cap.ampdu_factor))
                        return -ENOBUFS;
                if (nla_put_u8(msg, NL80211_BAND_ATTR_HT_AMPDU_DENSITY,
                               sband->ht_cap.ampdu_density))
                        return -ENOBUFS;
        }

        /* VHT capabilities */
        if (sband->vht_cap.vht_supported) {
                if (nla_put(msg, NL80211_BAND_ATTR_VHT_MCS_SET,
                            sizeof(sband->vht_cap.vht_mcs),
                            &sband->vht_cap.vht_mcs))
                        return -ENOBUFS;
                if (nla_put_u32(msg, NL80211_BAND_ATTR_VHT_CAPA,
                                sband->vht_cap.cap))
                        return -ENOBUFS;
        }

        /* per-interface-type data is only sent in the large variant */
        if (large && sband->n_iftype_data) {
                const struct ieee80211_sband_iftype_data *iftd;
                struct nlattr *all_iftypes;

                all_iftypes = nla_nest_start_noflag(msg,
                                                    NL80211_BAND_ATTR_IFTYPE_DATA);
                if (!all_iftypes)
                        return -ENOBUFS;

                for_each_sband_iftype_data(sband, idx, iftd) {
                        struct nlattr *one_iftype;
                        int err;

                        /* nested entries are numbered starting at 1 */
                        one_iftype = nla_nest_start_noflag(msg, idx + 1);
                        if (!one_iftype)
                                return -ENOBUFS;

                        err = nl80211_send_iftype_data(msg, sband, iftd);
                        if (err)
                                return err;

                        nla_nest_end(msg, one_iftype);
                }

                nla_nest_end(msg, all_iftypes);
        }

        /* EDMG info, also restricted to the large variant */
        if (large && sband->edmg_cap.channels) {
                if (nla_put_u8(msg, NL80211_BAND_ATTR_EDMG_CHANNELS,
                               sband->edmg_cap.channels))
                        return -ENOBUFS;
                if (nla_put_u8(msg, NL80211_BAND_ATTR_EDMG_BW_CONFIG,
                               sband->edmg_cap.bw_config))
                        return -ENOBUFS;
        }

        /* bitrates: one nest per rate, numbered by its array index */
        rates_nest = nla_nest_start_noflag(msg, NL80211_BAND_ATTR_RATES);
        if (!rates_nest)
                return -ENOBUFS;

        for (idx = 0; idx < sband->n_bitrates; idx++) {
                const struct ieee80211_rate *rate = &sband->bitrates[idx];
                struct nlattr *rate_nest = nla_nest_start_noflag(msg, idx);

                if (!rate_nest)
                        return -ENOBUFS;

                if (nla_put_u32(msg, NL80211_BITRATE_ATTR_RATE, rate->bitrate))
                        return -ENOBUFS;

                if ((rate->flags & IEEE80211_RATE_SHORT_PREAMBLE) &&
                    nla_put_flag(msg, NL80211_BITRATE_ATTR_2GHZ_SHORTPREAMBLE))
                        return -ENOBUFS;

                nla_nest_end(msg, rate_nest);
        }

        nla_nest_end(msg, rates_nest);

        /* S1G capabilities */
        if (sband->band == NL80211_BAND_S1GHZ && sband->s1g_cap.s1g) {
                if (nla_put(msg, NL80211_BAND_ATTR_S1G_CAPA,
                            sizeof(sband->s1g_cap.cap), sband->s1g_cap.cap))
                        return -ENOBUFS;
                if (nla_put(msg, NL80211_BAND_ATTR_S1G_MCS_NSS_SET,
                            sizeof(sband->s1g_cap.nss_mcs),
                            sband->s1g_cap.nss_mcs))
                        return -ENOBUFS;
        }

        return 0;
}

static int
nl80211_send_mgmt_stypes(struct sk_buff *msg,
                         const struct ieee80211_txrx_stypes *mgmt_stypes)
{
        u16 stypes;
        struct nlattr *nl_ftypes, *nl_ifs;
        enum nl80211_iftype ift;
        int i;

        if (!mgmt_stypes)
                return 0;

        nl_ifs = nla_nest_start_noflag(msg, NL80211_ATTR_TX_FRAME_TYPES);
        if (!nl_ifs)
                return -ENOBUFS;

        for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) {
                nl_ftypes = nla_nest_start_noflag(msg, ift);
                if (!nl_ftypes)
                        return -ENOBUFS;
                i = 0;
                stypes = mgmt_stypes[ift].tx;
                while (stypes) {
                        if ((stypes & 1) &&
                            nla_put_u16(msg, NL80211_ATTR_FRAME_TYPE,
                                        (i << 4) | IEEE80211_FTYPE_MGMT))
                                return -ENOBUFS;
                        stypes >>= 1;
                        i++;
                }
                nla_nest_end(msg, nl_ftypes);
        }

        nla_nest_end(msg, nl_ifs);

        nl_ifs = nla_nest_start_noflag(msg, NL80211_ATTR_RX_FRAME_TYPES);
        if (!nl_ifs)
                return -ENOBUFS;

        for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) {
                nl_ftypes = nla_nest_start_noflag(msg, ift);
                if (!nl_ftypes)
                        return -ENOBUFS;
                i = 0;
                stypes = mgmt_stypes[ift].rx;
                while (stypes) {
                        if ((stypes & 1) &&
                            nla_put_u16(msg, NL80211_ATTR_FRAME_TYPE,
                                        (i << 4) | IEEE80211_FTYPE_MGMT))
                                return -ENOBUFS;
                        stypes >>= 1;
                        i++;
                }
                nla_nest_end(msg, nl_ftypes);
        }
        nla_nest_end(msg, nl_ifs);

        return 0;
}

/*
 * CMD(op, n) - advertise NL80211_CMD_<n> if the driver implements rdev->ops-><op>.
 *
 * Not self-contained: the expansion uses the following names from the
 * scope it is expanded in:
 *   rdev - device whose ops table is tested
 *   msg  - message the attribute is appended to
 *   i    - running attribute index, incremented before each put
 *   nla_put_failure - label jumped to when the put fails
 */
#define CMD(op, n)                                                        \
         do {                                                                \
                if (rdev->ops->op) {                                        \
                        i++;                                                \
                        if (nla_put_u32(msg, i, NL80211_CMD_ ## n))         \
                                goto nla_put_failure;                        \
                }                                                        \
        } while (0)

/*
 * Append the fixed list of supported commands to @msg.
 *
 * Each command is a u32 attribute whose attribute type is a running index
 * starting at 1, so the order of the statements below determines the
 * numbering.  Most entries depend on the driver providing the matching
 * op (see CMD()); some also depend on a wiphy flag, and a few depend on
 * a wiphy flag or a combination of ops only.
 *
 * Returns the number of commands added (the last index used), so the
 * caller can continue numbering from there, or -ENOBUFS when the message
 * is full.
 */
static int nl80211_add_commands_unsplit(struct cfg80211_registered_device *rdev,
                                        struct sk_buff *msg)
{
        int i = 0;

        /*
         * Do *NOT* add anything to this function: new things must be
         * advertised only to new versions of userspace that can deal with
         * the split (and they can't possibly care about new features...).
         */
        CMD(add_virtual_intf, NEW_INTERFACE);
        CMD(change_virtual_intf, SET_INTERFACE);
        CMD(add_key, NEW_KEY);
        CMD(start_ap, START_AP);
        CMD(add_station, NEW_STATION);
        CMD(add_mpath, NEW_MPATH);
        CMD(update_mesh_config, SET_MESH_CONFIG);
        CMD(change_bss, SET_BSS);
        CMD(auth, AUTHENTICATE);
        CMD(assoc, ASSOCIATE);
        CMD(deauth, DEAUTHENTICATE);
        CMD(disassoc, DISASSOCIATE);
        CMD(join_ibss, JOIN_IBSS);
        CMD(join_mesh, JOIN_MESH);
        CMD(set_pmksa, SET_PMKSA);
        CMD(del_pmksa, DEL_PMKSA);
        CMD(flush_pmksa, FLUSH_PMKSA);
        if (rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL)
                CMD(remain_on_channel, REMAIN_ON_CHANNEL);
        CMD(set_bitrate_mask, SET_TX_BITRATE_MASK);
        CMD(mgmt_tx, FRAME);
        CMD(mgmt_tx_cancel_wait, FRAME_WAIT_CANCEL);
        /* depends on a wiphy flag only, no op is checked */
        if (rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK) {
                i++;
                if (nla_put_u32(msg, i, NL80211_CMD_SET_WIPHY_NETNS))
                        goto nla_put_failure;
        }
        /* advertised if any one of these three ops is implemented */
        if (rdev->ops->set_monitor_channel || rdev->ops->start_ap ||
            rdev->ops->join_mesh) {
                i++;
                if (nla_put_u32(msg, i, NL80211_CMD_SET_CHANNEL))
                        goto nla_put_failure;
        }
        if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) {
                CMD(tdls_mgmt, TDLS_MGMT);
                CMD(tdls_oper, TDLS_OPER);
        }
        if (rdev->wiphy.max_sched_scan_reqs)
                CMD(sched_scan_start, START_SCHED_SCAN);
        CMD(probe_client, PROBE_CLIENT);
        CMD(set_noack_map, SET_NOACK_MAP);
        /* depends on a wiphy flag only, no op is checked */
        if (rdev->wiphy.flags & WIPHY_FLAG_REPORTS_OBSS) {
                i++;
                if (nla_put_u32(msg, i, NL80211_CMD_REGISTER_BEACONS))
                        goto nla_put_failure;
        }
        CMD(start_p2p_device, START_P2P_DEVICE);
        CMD(set_mcast_rate, SET_MCAST_RATE);
#ifdef CONFIG_NL80211_TESTMODE
        CMD(testmode_cmd, TESTMODE);
#endif

        /* CONNECT is advertised if either the connect or the auth op exists */
        if (rdev->ops->connect || rdev->ops->auth) {
                i++;
                if (nla_put_u32(msg, i, NL80211_CMD_CONNECT))
                        goto nla_put_failure;
        }

        /* DISCONNECT likewise: either the disconnect or the deauth op */
        if (rdev->ops->disconnect || rdev->ops->deauth) {
                i++;
                if (nla_put_u32(msg, i, NL80211_CMD_DISCONNECT))
                        goto nla_put_failure;
        }

        return i;
 nla_put_failure:
        return -ENOBUFS;
}

/*
 * Add the FTM capability nest (NL80211_PMSR_TYPE_FTM) described by
 * @cap->ftm to @msg.
 *
 * Nothing is added when FTM is not marked as supported.  Returns 0 on
 * success or -ENOBUFS when the message is full; the nest is not
 * cancelled on failure.
 */
static int
nl80211_send_pmsr_ftm_capa(const struct cfg80211_pmsr_capabilities *cap,
                           struct sk_buff *msg)
{
        struct nlattr *ftm_nest;

        if (!cap->ftm.supported)
                return 0;

        ftm_nest = nla_nest_start_noflag(msg, NL80211_PMSR_TYPE_FTM);
        if (!ftm_nest)
                return -ENOBUFS;

        /* boolean capabilities are reported as flag attributes */
        if ((cap->ftm.asap &&
             nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_ASAP)) ||
            (cap->ftm.non_asap &&
             nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_NON_ASAP)) ||
            (cap->ftm.request_lci &&
             nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_REQ_LCI)) ||
            (cap->ftm.request_civicloc &&
             nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_REQ_CIVICLOC)))
                return -ENOBUFS;

        /* preamble and bandwidth values are always reported */
        if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_PREAMBLES,
                        cap->ftm.preambles) ||
            nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_BANDWIDTHS,
                        cap->ftm.bandwidths))
                return -ENOBUFS;

        /*
         * Optional limits: the burst exponent is skipped when negative,
         * the per-burst FTM count is skipped when zero.
         */
        if ((cap->ftm.max_bursts_exponent >= 0 &&
             nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_BURSTS_EXPONENT,
                         cap->ftm.max_bursts_exponent)) ||
            (cap->ftm.max_ftms_per_burst &&
             nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST,
                         cap->ftm.max_ftms_per_burst)))
                return -ENOBUFS;

        if ((cap->ftm.trigger_based &&
             nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_TRIGGER_BASED)) ||
            (cap->ftm.non_trigger_based &&
             nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED)))
                return -ENOBUFS;

        nla_nest_end(msg, ftm_nest);

        return 0;
}

/*
 * Add the peer measurement capabilities of @rdev to @msg as an
 * NL80211_ATTR_PEER_MEASUREMENTS nest.
 *
 * A device without pmsr capabilities adds nothing.  Returns 0 on success
 * or -ENOBUFS when the message is full.
 */
static int nl80211_send_pmsr_capa(struct cfg80211_registered_device *rdev,
                                  struct sk_buff *msg)
{
        const struct cfg80211_pmsr_capabilities *cap = rdev->wiphy.pmsr_capa;
        struct nlattr *pmsr_nest, *type_capa_nest;

        if (!cap)
                return 0;

        /*
         * No unwinding is done on the error paths below: the caller
         * will genlmsg_cancel() the message if we fail.
         */

        pmsr_nest = nla_nest_start_noflag(msg, NL80211_ATTR_PEER_MEASUREMENTS);
        if (!pmsr_nest)
                return -ENOBUFS;

        if (nla_put_u32(msg, NL80211_PMSR_ATTR_MAX_PEERS, cap->max_peers) ||
            (cap->report_ap_tsf &&
             nla_put_flag(msg, NL80211_PMSR_ATTR_REPORT_AP_TSF)) ||
            (cap->randomize_mac_addr &&
             nla_put_flag(msg, NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR)))
                return -ENOBUFS;

        /* per-measurement-type capabilities; FTM is the only type added */
        type_capa_nest = nla_nest_start_noflag(msg,
                                               NL80211_PMSR_ATTR_TYPE_CAPA);
        if (!type_capa_nest)
                return -ENOBUFS;

        if (nl80211_send_pmsr_ftm_capa(cap, msg))
                return -ENOBUFS;

        nla_nest_end(msg, type_capa_nest);
        nla_nest_end(msg, pmsr_nest);

        return 0;
}

/*
 * Add the per-interface-type AKM suite lists of @rdev to @msg as an
 * NL80211_ATTR_IFTYPE_AKM_SUITES nest.
 *
 * Each table entry becomes one sub-nest (numbered from 1) carrying the
 * interface types it applies to and its AKM suites as an array of u32.
 * Nothing is added when the table is absent or empty.
 *
 * Returns 0 on success or -ENOBUFS when the message is full.
 */
static int
nl80211_put_iftype_akm_suites(struct cfg80211_registered_device *rdev,
                              struct sk_buff *msg)
{
        struct nlattr *list_nest;
        int idx;

        if (!rdev->wiphy.num_iftype_akm_suites ||
            !rdev->wiphy.iftype_akm_suites)
                return 0;

        list_nest = nla_nest_start(msg, NL80211_ATTR_IFTYPE_AKM_SUITES);
        if (!list_nest)
                return -ENOBUFS;

        for (idx = 0; idx < rdev->wiphy.num_iftype_akm_suites; idx++) {
                const struct wiphy_iftype_akm_suites *entry =
                        &rdev->wiphy.iftype_akm_suites[idx];
                struct nlattr *entry_nest = nla_nest_start(msg, idx + 1);

                if (!entry_nest)
                        return -ENOBUFS;

                if (nl80211_put_iftypes(msg, NL80211_IFTYPE_AKM_ATTR_IFTYPES,
                                        entry->iftypes_mask))
                        return -ENOBUFS;

                if (nla_put(msg, NL80211_IFTYPE_AKM_ATTR_SUITES,
                            sizeof(u32) * entry->n_akm_suites,
                            entry->akm_suites))
                        return -ENOBUFS;

                nla_nest_end(msg, entry_nest);
        }

        nla_nest_end(msg, list_nest);

        return 0;
}

static int
nl80211_put_tid_config_support(struct cfg80211_registered_device *rdev,
                               struct sk_buff *msg)
{
        struct nlattr *supp;

        if (!rdev->wiphy.tid_config_support.vif &&
            !rdev->wiphy.tid_config_support.peer)
                return 0;

        supp = nla_nest_start(msg, NL80211_ATTR_TID_CONFIG);
        if (!supp)
                return -ENOSPC;

        if (rdev->wiphy.tid_config_support.vif &&
            nla_put_u64_64bit(msg, NL80211_TID_CONFIG_ATTR_VIF_SUPP,
                              rdev->wiphy.tid_config_support.vif,
                              NL80211_TID_CONFIG_ATTR_PAD))
                goto fail;

        if (rdev->wiphy.tid_config_support.peer &&
            nla_put_u64_64bit(msg, NL80211_TID_CONFIG_ATTR_PEER_SUPP,
                              rdev->wiphy.tid_config_support.peer,
                              NL80211_TID_CONFIG_ATTR_PAD))
                goto fail;

        /* for now we just use the same value ... makes more sense */
        if (nla_put_u8(msg, NL80211_TID_CONFIG_ATTR_RETRY_SHORT,
                       rdev->wiphy.tid_config_support.max_retry))
                goto fail;
        if (nla_put_u8(msg, NL80211_TID_CONFIG_ATTR_RETRY_LONG,
                       rdev->wiphy.tid_config_support.max_retry))
                goto fail;

        nla_nest_end(msg, supp);

        return 0;
fail:
        nla_nest_cancel(msg, supp);
        return -ENOBUFS;
}

static int
nl80211_put_sar_specs(struct cfg80211_registered_device *rdev,
                      struct sk_buff *msg)
{
        struct nlattr *sar_capa, *specs, *sub_freq_range;
        u8 num_freq_ranges;
        int i;

        if (!rdev->wiphy.sar_capa)
                return 0;

        num_freq_ranges = rdev->wiphy.sar_capa->num_freq_ranges;

        sar_capa = nla_nest_start(msg, NL80211_ATTR_SAR_SPEC);
        if (!sar_capa)
                return -ENOSPC;

        if (nla_put_u32(msg, NL80211_SAR_ATTR_TYPE, rdev->wiphy.sar_capa->type))
                goto fail;

        specs = nla_nest_start(msg, NL80211_SAR_ATTR_SPECS);
        if (!specs)
                goto fail;

        /* report supported freq_ranges */
        for (i = 0; i < num_freq_ranges; i++) {
                sub_freq_range = nla_nest_start(msg, i + 1);
                if (!sub_freq_range)
                        goto fail;

                if (nla_put_u32(msg, NL80211_SAR_ATTR_SPECS_START_FREQ,
                                rdev->wiphy.sar_capa->freq_ranges[i].start_freq))
                        goto fail;

                if (nla_put_u32(msg, NL80211_SAR_ATTR_SPECS_END_FREQ,
                                rdev->wiphy.sar_capa->freq_ranges[i].end_freq))
                        goto fail;

                nla_nest_end(msg, sub_freq_range);
        }

        nla_nest_end(msg, specs);
        nla_nest_end(msg, sar_capa);

        return 0;
fail:
        nla_nest_cancel(msg, sar_capa);
        return -ENOBUFS;
}

/*
 * Add the multiple-BSSID limits of @wiphy to @msg as an
 * NL80211_ATTR_MBSSID_CONFIG nest.
 *
 * Nothing is added when mbssid_max_interfaces is zero.  The EMA profile
 * periodicity is only included when it is non-zero.  Returns 0 on
 * success or -ENOBUFS on failure; a started nest is cancelled first.
 */
static int nl80211_put_mbssid_support(struct wiphy *wiphy, struct sk_buff *msg)
{
        struct nlattr *mbssid_nest;

        if (!wiphy->mbssid_max_interfaces)
                return 0;

        mbssid_nest = nla_nest_start(msg, NL80211_ATTR_MBSSID_CONFIG);
        if (!mbssid_nest)
                return -ENOBUFS;

        if (nla_put_u8(msg, NL80211_MBSSID_CONFIG_ATTR_MAX_INTERFACES,
                       wiphy->mbssid_max_interfaces) ||
            (wiphy->ema_max_profile_periodicity &&
             nla_put_u8(msg,
                        NL80211_MBSSID_CONFIG_ATTR_MAX_EMA_PROFILE_PERIODICITY,
                        wiphy->ema_max_profile_periodicity))) {
                nla_nest_cancel(msg, mbssid_nest);
                return -ENOBUFS;
        }

        nla_nest_end(msg, mbssid_nest);

        return 0;
}

/*
 * Progress of a wiphy dump, carried between calls to nl80211_send_wiphy()
 * so that a split dump can resume where the previous message stopped.
 */
struct nl80211_dump_wiphy_state {
        /* NOTE(review): presumably restricts the dump to one wiphy - confirm */
        s64 filter_wiphy;
        /* NOTE(review): not used by nl80211_send_wiphy() - confirm with dump code */
        long start;
        /* section of the wiphy information to send next */
        long split_start;
        /* band to resume the band list from */
        long band_start;
        /*
         * 0: the band's rate info is still to be sent; otherwise the
         * index of the next channel to send, plus one
         */
        long chan_start;
        /* NOTE(review): not used by nl80211_send_wiphy() as visible here */
        long capa_start;
        /* true: send one section per message instead of everything at once */
        bool split;
};

static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
                              enum nl80211_commands cmd,
                              struct sk_buff *msg, u32 portid, u32 seq,
                              int flags, struct nl80211_dump_wiphy_state *state)
{
        void *hdr;
        struct nlattr *nl_bands, *nl_band;
        struct nlattr *nl_freqs, *nl_freq;
        struct nlattr *nl_cmds;
        enum nl80211_band band;
        struct ieee80211_channel *chan;
        int i;
        const struct ieee80211_txrx_stypes *mgmt_stypes =
                                rdev->wiphy.mgmt_stypes;
        u32 features;

        hdr = nl80211hdr_put(msg, portid, seq, flags, cmd);
        if (!hdr)
                return -ENOBUFS;

        if (WARN_ON(!state))
                return -EINVAL;

        if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
            nla_put_string(msg, NL80211_ATTR_WIPHY_NAME,
                           wiphy_name(&rdev->wiphy)) ||
            nla_put_u32(msg, NL80211_ATTR_GENERATION,
                        cfg80211_rdev_list_generation))
                goto nla_put_failure;

        if (cmd != NL80211_CMD_NEW_WIPHY)
                goto finish;

        switch (state->split_start) {
        case 0:
                if (nla_put_u8(msg, NL80211_ATTR_WIPHY_RETRY_SHORT,
                               rdev->wiphy.retry_short) ||
                    nla_put_u8(msg, NL80211_ATTR_WIPHY_RETRY_LONG,
                               rdev->wiphy.retry_long) ||
                    nla_put_u32(msg, NL80211_ATTR_WIPHY_FRAG_THRESHOLD,
                                rdev->wiphy.frag_threshold) ||
                    nla_put_u32(msg, NL80211_ATTR_WIPHY_RTS_THRESHOLD,
                                rdev->wiphy.rts_threshold) ||
                    nla_put_u8(msg, NL80211_ATTR_WIPHY_COVERAGE_CLASS,
                               rdev->wiphy.coverage_class) ||
                    nla_put_u8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS,
                               rdev->wiphy.max_scan_ssids) ||
                    nla_put_u8(msg, NL80211_ATTR_MAX_NUM_SCHED_SCAN_SSIDS,
                               rdev->wiphy.max_sched_scan_ssids) ||
                    nla_put_u16(msg, NL80211_ATTR_MAX_SCAN_IE_LEN,
                                rdev->wiphy.max_scan_ie_len) ||
                    nla_put_u16(msg, NL80211_ATTR_MAX_SCHED_SCAN_IE_LEN,
                                rdev->wiphy.max_sched_scan_ie_len) ||
                    nla_put_u8(msg, NL80211_ATTR_MAX_MATCH_SETS,
                               rdev->wiphy.max_match_sets))
                        goto nla_put_failure;

                if ((rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN) &&
                    nla_put_flag(msg, NL80211_ATTR_SUPPORT_IBSS_RSN))
                        goto nla_put_failure;
                if ((rdev->wiphy.flags & WIPHY_FLAG_MESH_AUTH) &&
                    nla_put_flag(msg, NL80211_ATTR_SUPPORT_MESH_AUTH))
                        goto nla_put_failure;
                if ((rdev->wiphy.flags & WIPHY_FLAG_AP_UAPSD) &&
                    nla_put_flag(msg, NL80211_ATTR_SUPPORT_AP_UAPSD))
                        goto nla_put_failure;
                if ((rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_FW_ROAM) &&
                    nla_put_flag(msg, NL80211_ATTR_ROAM_SUPPORT))
                        goto nla_put_failure;
                if ((rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS) &&
                    nla_put_flag(msg, NL80211_ATTR_TDLS_SUPPORT))
                        goto nla_put_failure;
                if ((rdev->wiphy.flags & WIPHY_FLAG_TDLS_EXTERNAL_SETUP) &&
                    nla_put_flag(msg, NL80211_ATTR_TDLS_EXTERNAL_SETUP))
                        goto nla_put_failure;
                state->split_start++;
                if (state->split)
                        break;
                fallthrough;
        case 1:
                if (nla_put(msg, NL80211_ATTR_CIPHER_SUITES,
                            sizeof(u32) * rdev->wiphy.n_cipher_suites,
                            rdev->wiphy.cipher_suites))
                        goto nla_put_failure;

                if (nla_put_u8(msg, NL80211_ATTR_MAX_NUM_PMKIDS,
                               rdev->wiphy.max_num_pmkids))
                        goto nla_put_failure;

                if ((rdev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) &&
                    nla_put_flag(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE))
                        goto nla_put_failure;

                if (nla_put_u32(msg, NL80211_ATTR_WIPHY_ANTENNA_AVAIL_TX,
                                rdev->wiphy.available_antennas_tx) ||
                    nla_put_u32(msg, NL80211_ATTR_WIPHY_ANTENNA_AVAIL_RX,
                                rdev->wiphy.available_antennas_rx))
                        goto nla_put_failure;

                if ((rdev->wiphy.flags & WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD) &&
                    nla_put_u32(msg, NL80211_ATTR_PROBE_RESP_OFFLOAD,
                                rdev->wiphy.probe_resp_offload))
                        goto nla_put_failure;

                if ((rdev->wiphy.available_antennas_tx ||
                     rdev->wiphy.available_antennas_rx) &&
                    rdev->ops->get_antenna) {
                        u32 tx_ant = 0, rx_ant = 0;
                        int res;

                        res = rdev_get_antenna(rdev, &tx_ant, &rx_ant);
                        if (!res) {
                                if (nla_put_u32(msg,
                                                NL80211_ATTR_WIPHY_ANTENNA_TX,
                                                tx_ant) ||
                                    nla_put_u32(msg,
                                                NL80211_ATTR_WIPHY_ANTENNA_RX,
                                                rx_ant))
                                        goto nla_put_failure;
                        }
                }

                state->split_start++;
                if (state->split)
                        break;
                fallthrough;
        case 2:
                if (nl80211_put_iftypes(msg, NL80211_ATTR_SUPPORTED_IFTYPES,
                                        rdev->wiphy.interface_modes))
                                goto nla_put_failure;
                state->split_start++;
                if (state->split)
                        break;
                fallthrough;
        case 3:
                nl_bands = nla_nest_start_noflag(msg,
                                                 NL80211_ATTR_WIPHY_BANDS);
                if (!nl_bands)
                        goto nla_put_failure;

                for (band = state->band_start;
                     band < (state->split ?
                                NUM_NL80211_BANDS :
                                NL80211_BAND_60GHZ + 1);
                     band++) {
                        struct ieee80211_supported_band *sband;

                        /* omit higher bands for ancient software */
                        if (band > NL80211_BAND_5GHZ && !state->split)
                                break;

                        sband = rdev->wiphy.bands[band];

                        if (!sband)
                                continue;

                        nl_band = nla_nest_start_noflag(msg, band);
                        if (!nl_band)
                                goto nla_put_failure;

                        switch (state->chan_start) {
                        case 0:
                                if (nl80211_send_band_rateinfo(msg, sband,
                                                               state->split))
                                        goto nla_put_failure;
                                state->chan_start++;
                                if (state->split)
                                        break;
                                fallthrough;
                        default:
                                /* add frequencies */
                                nl_freqs = nla_nest_start_noflag(msg,
                                                                 NL80211_BAND_ATTR_FREQS);
                                if (!nl_freqs)
                                        goto nla_put_failure;

                                for (i = state->chan_start - 1;
                                     i < sband->n_channels;
                                     i++) {
                                        nl_freq = nla_nest_start_noflag(msg,
                                                                        i);
                                        if (!nl_freq)
                                                goto nla_put_failure;

                                        chan = &sband->channels[i];

                                        if (nl80211_msg_put_channel(
                                                        msg, &rdev->wiphy, chan,
                                                        state->split))
                                                goto nla_put_failure;

                                        nla_nest_end(msg, nl_freq);
                                        if (state->split)
                                                break;
                                }
                                if (i < sband->n_channels)
                                        state->chan_start = i + 2;
                                else
                                        state->chan_start = 0;
                                nla_nest_end(msg, nl_freqs);
                        }

                        nla_nest_end(msg, nl_band);

                        if (state->split) {
                                /* start again here */
                                if (state->chan_start)
                                        band--;
                                break;
                        }
                }
                nla_nest_end(msg, nl_bands);

                if (band < NUM_NL80211_BANDS)
                        state->band_start = band + 1;
                else
                        state->band_start = 0;

                /* if bands & channels are done, continue outside */
                if (state->band_start == 0 && state->chan_start == 0)
                        state->split_start++;
                if (state->split)
                        break;
                fallthrough;
        case 4:
                nl_cmds = nla_nest_start_noflag(msg,
                                                NL80211_ATTR_SUPPORTED_COMMANDS);
                if (!nl_cmds)
                        goto nla_put_failure;

                i = nl80211_add_commands_unsplit(rdev, msg);
                if (i < 0)
                        goto nla_put_failure;
                if (state->split) {
                        CMD(crit_proto_start, CRIT_PROTOCOL_START);
                        CMD(crit_proto_stop, CRIT_PROTOCOL_STOP);
                        if (rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH)
                                CMD(channel_switch, CHANNEL_SWITCH);
                        CMD(set_qos_map, SET_QOS_MAP);
                        if (rdev->wiphy.features &
                                        NL80211_FEATURE_SUPPORTS_WMM_ADMISSION)
                                CMD(add_tx_ts, ADD_TX_TS);
                        CMD(set_multicast_to_unicast, SET_MULTICAST_TO_UNICAST);
                        CMD(update_connect_params, UPDATE_CONNECT_PARAMS);
                        CMD(update_ft_ies, UPDATE_FT_IES);
                        if (rdev->wiphy.sar_capa)
                                CMD(set_sar_specs, SET_SAR_SPECS);
                }
#undef CMD

                nla_nest_end(msg, nl_cmds);
                state->split_start++;
                if (state->split)
                        break;
                fallthrough;
        case 5:
                if (rdev->ops->remain_on_channel &&
                    (rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL) &&
                    nla_put_u32(msg,
                                NL80211_ATTR_MAX_REMAIN_ON_CHANNEL_DURATION,
                                rdev->wiphy.max_remain_on_channel_duration))
                        goto nla_put_failure;

                if ((rdev->wiphy.flags & WIPHY_FLAG_OFFCHAN_TX) &&
                    nla_put_flag(msg, NL80211_ATTR_OFFCHANNEL_TX_OK))
                        goto nla_put_failure;

                state->split_start++;
                if (state->split)
                        break;
                fallthrough;
        case 6:
#ifdef CONFIG_PM
                if (nl80211_send_wowlan(msg, rdev, state->split))
                        goto nla_put_failure;
                state->split_start++;
                if (state->split)
                        break;
#else
                state->split_start++;
#endif
                fallthrough;
        case 7:
                if (nl80211_put_iftypes(msg, NL80211_ATTR_SOFTWARE_IFTYPES,
                                        rdev->wiphy.software_iftypes))
                        goto nla_put_failure;

                if (nl80211_put_iface_combinations(&rdev->wiphy, msg,
                                                   state->split))
                        goto nla_put_failure;

                state->split_start++;
                if (state->split)
                        break;
                fallthrough;
        case 8:
                if ((rdev->wiphy.flags & WIPHY_FLAG_HAVE_AP_SME) &&
                    nla_put_u32(msg, NL80211_ATTR_DEVICE_AP_SME,
                                rdev->wiphy.ap_sme_capa))
                        goto nla_put_failure;

                features = rdev->wiphy.features;
                /*
                 * We can only add the per-channel limit information if the
                 * dump is split, otherwise it makes it too big. Therefore
                 * only advertise it in that case.
                 */
                if (state->split)
                        features |= NL80211_FEATURE_ADVERTISE_CHAN_LIMITS;
                if (nla_put_u32(msg, NL80211_ATTR_FEATURE_FLAGS, features))
                        goto nla_put_failure;

                if (rdev->wiphy.ht_capa_mod_mask &&
                    nla_put(msg, NL80211_ATTR_HT_CAPABILITY_MASK,
                            sizeof(*rdev->wiphy.ht_capa_mod_mask),
                            rdev->wiphy.ht_capa_mod_mask))
                        goto nla_put_failure;

                if (rdev->wiphy.flags & WIPHY_FLAG_HAVE_AP_SME &&
                    rdev->wiphy.max_acl_mac_addrs &&
                    nla_put_u32(msg, NL80211_ATTR_MAC_ACL_MAX,
                                rdev->wiphy.max_acl_mac_addrs))
                        goto nla_put_failure;

                /*
                 * Any information below this point is only available to
                 * applications that can deal with it being split. This
                 * helps ensure that newly added capabilities don't break
                 * older tools by overrunning their buffers.
                 *
                 * We still increment split_start so that in the split
                 * case we'll continue with more data in the next round,
                 * but break unconditionally so unsplit data stops here.
                 */
                if (state->split)
                        state->split_start++;
                else
                        state->split_start = 0;
                break;
        case 9:
                if (nl80211_send_mgmt_stypes(msg, mgmt_stypes))
                        goto nla_put_failure;

                if (nla_put_u32(msg, NL80211_ATTR_MAX_NUM_SCHED_SCAN_PLANS,
                                rdev->wiphy.max_sched_scan_plans) ||
                    nla_put_u32(msg, NL80211_ATTR_MAX_SCAN_PLAN_INTERVAL,
                                rdev->wiphy.max_sched_scan_plan_interval) ||
                    nla_put_u32(msg, NL80211_ATTR_MAX_SCAN_PLAN_ITERATIONS,
                                rdev->wiphy.max_sched_scan_plan_iterations))
                        goto nla_put_failure;

                if (rdev->wiphy.extended_capabilities &&
                    (nla_put(msg, NL80211_ATTR_EXT_CAPA,
                             rdev->wiphy.extended_capabilities_len,
                             rdev->wiphy.extended_capabilities) ||
                     nla_put(msg, NL80211_ATTR_EXT_CAPA_MASK,
                             rdev->wiphy.extended_capabilities_len,
                             rdev->wiphy.extended_capabilities_mask)))
                        goto nla_put_failure;

                if (rdev->wiphy.vht_capa_mod_mask &&
                    nla_put(msg, NL80211_ATTR_VHT_CAPABILITY_MASK,
                            sizeof(*rdev->wiphy.vht_capa_mod_mask),
                            rdev->wiphy.vht_capa_mod_mask))
                        goto nla_put_failure;

                if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN,
                            rdev->wiphy.perm_addr))
                        goto nla_put_failure;

                if (!is_zero_ether_addr(rdev->wiphy.addr_mask) &&
                    nla_put(msg, NL80211_ATTR_MAC_MASK, ETH_ALEN,
                            rdev->wiphy.addr_mask))
                        goto nla_put_failure;

                if (rdev->wiphy.n_addresses > 1) {
                        void *attr;

                        attr = nla_nest_start(msg, NL80211_ATTR_MAC_ADDRS);
                        if (!attr)
                                goto nla_put_failure;

                        for (i = 0; i < rdev->wiphy.n_addresses; i++)
                                if (nla_put(msg, i + 1, ETH_ALEN,
                                            rdev->wiphy.addresses[i].addr))
                                        goto nla_put_failure;

                        nla_nest_end(msg, attr);
                }

                state->split_start++;
                break;
        case 10:
                if (nl80211_send_coalesce(msg, rdev))
                        goto nla_put_failure;

                if ((rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_5_10_MHZ) &&
                    (nla_put_flag(msg, NL80211_ATTR_SUPPORT_5_MHZ) ||
                     nla_put_flag(msg, NL80211_ATTR_SUPPORT_10_MHZ)))
                        goto nla_put_failure;

                if (rdev->wiphy.max_ap_assoc_sta &&
                    nla_put_u32(msg, NL80211_ATTR_MAX_AP_ASSOC_STA,
                                rdev->wiphy.max_ap_assoc_sta))
                        goto nla_put_failure;

                state->split_start++;
                break;
        case 11:
                if (rdev->wiphy.n_vendor_commands) {
                        const struct nl80211_vendor_cmd_info *info;
                        struct nlattr *nested;

                        nested = nla_nest_start_noflag(msg,
                                                       NL80211_ATTR_VENDOR_DATA);
                        if (!nested)
                                goto nla_put_failure;

                        for (i = 0; i < rdev->wiphy.n_vendor_commands; i++) {
                                info = &rdev->wiphy.vendor_commands[i].info;
                                if (nla_put(msg, i + 1, sizeof(*info), info))
                                        goto nla_put_failure;
                        }
                        nla_nest_end(msg, nested);
                }

                if (rdev->wiphy.n_vendor_events) {
                        const struct nl80211_vendor_cmd_info *info;
                        struct nlattr *nested;

                        nested = nla_nest_start_noflag(msg,
                                                       NL80211_ATTR_VENDOR_EVENTS);
                        if (!nested)
                                goto nla_put_failure;

                        for (i = 0; i < rdev->wiphy.n_vendor_events; i++) {
                                info = &rdev->wiphy.vendor_events[i];
                                if (nla_put(msg, i + 1, sizeof(*info), info))
                                        goto nla_put_failure;
                        }
                        nla_nest_end(msg, nested);
                }
                state->split_start++;
                break;
        case 12:
                if (rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH &&
                    nla_put_u8(msg, NL80211_ATTR_MAX_CSA_COUNTERS,
                               rdev->wiphy.max_num_csa_counters))
                        goto nla_put_failure;

                if (rdev->wiphy.regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED &&
                    nla_put_flag(msg, NL80211_ATTR_WIPHY_SELF_MANAGED_REG))
                        goto nla_put_failure;

                if (rdev->wiphy.max_sched_scan_reqs &&
                    nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_MAX_REQS,
                                rdev->wiphy.max_sched_scan_reqs))
                        goto nla_put_failure;

                if (nla_put(msg, NL80211_ATTR_EXT_FEATURES,
                            sizeof(rdev->wiphy.ext_features),
                            rdev->wiphy.ext_features))
                        goto nla_put_failure;

                if (rdev->wiphy.bss_select_support) {
                        struct nlattr *nested;
                        u32 bss_select_support = rdev->wiphy.bss_select_support;

                        nested = nla_nest_start_noflag(msg,
                                                       NL80211_ATTR_BSS_SELECT);
                        if (!nested)
                                goto nla_put_failure;

                        i = 0;
                        while (bss_select_support) {
                                if ((bss_select_support & 1) &&
                                    nla_put_flag(msg, i))
                                        goto nla_put_failure;
                                i++;
                                bss_select_support >>= 1;
                        }
                        nla_nest_end(msg, nested);
                }

                state->split_start++;
                break;
        case 13:
                if (rdev->wiphy.num_iftype_ext_capab &&
                    rdev->wiphy.iftype_ext_capab) {
                        struct nlattr *nested_ext_capab, *nested;

                        nested = nla_nest_start_noflag(msg,
                                                       NL80211_ATTR_IFTYPE_EXT_CAPA);
                        if (!nested)
                                goto nla_put_failure;

                        for (i = state->capa_start;
                             i < rdev->wiphy.num_iftype_ext_capab; i++) {
                                const struct wiphy_iftype_ext_capab *capab;

                                capab = &rdev->wiphy.iftype_ext_capab[i];

                                nested_ext_capab = nla_nest_start_noflag(msg,
                                                                         i);
                                if (!nested_ext_capab ||
                                    nla_put_u32(msg, NL80211_ATTR_IFTYPE,
                                                capab->iftype) ||
                                    nla_put(msg, NL80211_ATTR_EXT_CAPA,
                                            capab->extended_capabilities_len,
                                            capab->extended_capabilities) ||
                                    nla_put(msg, NL80211_ATTR_EXT_CAPA_MASK,
                                            capab->extended_capabilities_len,
                                            capab->extended_capabilities_mask))
                                        goto nla_put_failure;

                                if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_MLO &&
                                    (nla_put_u16(msg,
                                                 NL80211_ATTR_EML_CAPABILITY,
                                                 capab->eml_capabilities) ||
                                     nla_put_u16(msg,
                                                 NL80211_ATTR_MLD_CAPA_AND_OPS,
                                                 capab->mld_capa_and_ops)))
                                        goto nla_put_failure;

                                nla_nest_end(msg, nested_ext_capab);
                                if (state->split)
                                        break;
                        }
                        nla_nest_end(msg, nested);
                        if (i < rdev->wiphy.num_iftype_ext_capab) {
                                state->capa_start = i + 1;
                                break;
                        }
                }

                if (nla_put_u32(msg, NL80211_ATTR_BANDS,
                                rdev->wiphy.nan_supported_bands))
                        goto nla_put_failure;

                if (wiphy_ext_feature_isset(&rdev->wiphy,
                                            NL80211_EXT_FEATURE_TXQS)) {
                        struct cfg80211_txq_stats txqstats = {};
                        int res;

                        res = rdev_get_txq_stats(rdev, NULL, &txqstats);
                        if (!res &&
                            !nl80211_put_txq_stats(msg, &txqstats,
                                                   NL80211_ATTR_TXQ_STATS))
                                goto nla_put_failure;

                        if (nla_put_u32(msg, NL80211_ATTR_TXQ_LIMIT,
                                        rdev->wiphy.txq_limit))
                                goto nla_put_failure;
                        if (nla_put_u32(msg, NL80211_ATTR_TXQ_MEMORY_LIMIT,
                                        rdev->wiphy.txq_memory_limit))
                                goto nla_put_failure;
                        if (nla_put_u32(msg, NL80211_ATTR_TXQ_QUANTUM,
                                        rdev->wiphy.txq_quantum))
                                goto nla_put_failure;
                }

                state->split_start++;
                break;
        case 14:
                if (nl80211_send_pmsr_capa(rdev, msg))
                        goto nla_put_failure;

                state->split_start++;
                break;
        case 15:
                if (rdev->wiphy.akm_suites &&
                    nla_put(msg, NL80211_ATTR_AKM_SUITES,
                            sizeof(u32) * rdev->wiphy.n_akm_suites,
                            rdev->wiphy.akm_suites))
                        goto nla_put_failure;

                if (nl80211_put_iftype_akm_suites(rdev, msg))
                        goto nla_put_failure;

                if (nl80211_put_tid_config_support(rdev, msg))
                        goto nla_put_failure;
                state->split_start++;
                break;
        case 16:
                if (nl80211_put_sar_specs(rdev, msg))
                        goto nla_put_failure;

                if (nl80211_put_mbssid_support(&rdev->wiphy, msg))
                        goto nla_put_failure;

                if (nla_put_u16(msg, NL80211_ATTR_MAX_NUM_AKM_SUITES,
                                rdev->wiphy.max_num_akm_suites))
                        goto nla_put_failure;

                if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_MLO)
                        nla_put_flag(msg, NL80211_ATTR_MLO_SUPPORT);

                if (rdev->wiphy.hw_timestamp_max_peers &&
                    nla_put_u16(msg, NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS,
                                rdev->wiphy.hw_timestamp_max_peers))
                        goto nla_put_failure;

                /* done */
                state->split_start = 0;
                break;
        }
 finish:
        genlmsg_end(msg, hdr);
        return 0;

 nla_put_failure:
        genlmsg_cancel(msg, hdr);
        return -EMSGSIZE;
}

/*
 * Parse the attributes of a wiphy dump request into @state.
 *
 * Records in state->split whether NL80211_ATTR_SPLIT_WIPHY_DUMP was
 * supplied.  If NL80211_ATTR_WIPHY, NL80211_ATTR_WDEV or
 * NL80211_ATTR_IFINDEX is present, state->filter_wiphy is set from it
 * (they are evaluated in that order, so a later one overrides an earlier
 * one); otherwise state->filter_wiphy is left as the caller set it.
 *
 * Returns 0 on success -- including when the request cannot be parsed,
 * which is deliberately not treated as an error -- -ENOMEM if the
 * temporary attribute table cannot be allocated, or -ENODEV if
 * NL80211_ATTR_IFINDEX does not resolve to a netdev in the socket's
 * network namespace.
 */
static int nl80211_dump_wiphy_parse(struct sk_buff *skb,
                                    struct netlink_callback *cb,
                                    struct nl80211_dump_wiphy_state *state)
{
        struct nlattr **tb;
        int ret = 0;

        tb = kcalloc(NUM_NL80211_ATTR, sizeof(*tb), GFP_KERNEL);
        if (!tb)
                return -ENOMEM;

        /* ignore parse errors for backward compatibility */
        if (nlmsg_parse_deprecated(cb->nlh,
                                   GENL_HDRLEN + nl80211_fam.hdrsize,
                                   tb, nl80211_fam.maxattr,
                                   nl80211_policy, NULL))
                goto out;

        state->split = tb[NL80211_ATTR_SPLIT_WIPHY_DUMP];

        if (tb[NL80211_ATTR_WIPHY])
                state->filter_wiphy = nla_get_u32(tb[NL80211_ATTR_WIPHY]);

        /* the wiphy index is taken from the upper 32 bits of the wdev id */
        if (tb[NL80211_ATTR_WDEV])
                state->filter_wiphy = nla_get_u64(tb[NL80211_ATTR_WDEV]) >> 32;

        if (tb[NL80211_ATTR_IFINDEX]) {
                int ifidx = nla_get_u32(tb[NL80211_ATTR_IFINDEX]);
                struct net_device *netdev;

                netdev = __dev_get_by_index(sock_net(skb->sk), ifidx);
                if (!netdev) {
                        ret = -ENODEV;
                        goto out;
                }

                /* a netdev without ieee80211_ptr leaves the filter alone */
                if (netdev->ieee80211_ptr)
                        state->filter_wiphy =
                                wiphy_to_rdev(netdev->ieee80211_ptr->wiphy)->wiphy_idx;
        }

out:
        kfree(tb);
        return ret;
}

/*
 * Dump callback emitting wiphy information into @skb.
 *
 * The per-dump state lives in cb->args[0]; it is allocated and filled from
 * the request on the first invocation.  Each invocation handles at most one
 * registered device: the first one in the socket's network namespace that
 * lies past state->start and matches state->filter_wiphy (-1 matches all).
 * nl80211_send_wiphy() is called repeatedly for that device for as long as
 * it succeeds and leaves state->split_start non-zero.
 *
 * Returns skb->len normally, 1 when the dump buffer was found too small and
 * cb->min_dump_alloc has been raised for a retry, or a negative errno if the
 * state could not be set up.
 */
static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct nl80211_dump_wiphy_state *state = (void *)cb->args[0];
        struct cfg80211_registered_device *rdev;
        int idx = 0;
        int ret;

        rtnl_lock();

        /* first invocation: create the state and parse the request */
        if (!state) {
                state = kzalloc(sizeof(*state), GFP_KERNEL);
                if (!state) {
                        ret = -ENOMEM;
                        goto out_unlock;
                }

                state->filter_wiphy = -1;
                ret = nl80211_dump_wiphy_parse(skb, cb, state);
                if (ret) {
                        kfree(state);
                        goto out_unlock;
                }
                cb->args[0] = (long)state;
        }

        for_each_rdev(rdev) {
                if (!net_eq(wiphy_net(&rdev->wiphy), sock_net(skb->sk)))
                        continue;
                if (++idx <= state->start)
                        continue;
                if (state->filter_wiphy != -1 &&
                    state->filter_wiphy != rdev->wiphy_idx)
                        continue;

                wiphy_lock(&rdev->wiphy);
                /* attempt to fit multiple wiphy data chunks into the skb */
                for (;;) {
                        ret = nl80211_send_wiphy(rdev, NL80211_CMD_NEW_WIPHY,
                                                 skb,
                                                 NETLINK_CB(cb->skb).portid,
                                                 cb->nlh->nlmsg_seq,
                                                 NLM_F_MULTI, state);
                        if (ret >= 0) {
                                /* more chunks pending for this wiphy? */
                                if (state->split_start > 0)
                                        continue;
                                break;
                        }

                        /*
                         * If sending the wiphy data didn't fit (ENOBUFS
                         * or EMSGSIZE returned), this SKB is still
                         * empty (so it's not too big because another
                         * wiphy dataset is already in the skb) and
                         * we've not tried to adjust the dump allocation
                         * yet ... then adjust the alloc size to be
                         * bigger, and return 1 but with the empty skb.
                         * This results in an empty message being RX'ed
                         * in userspace, but that is ignored.
                         *
                         * We can then retry with the larger buffer.
                         */
                        if ((ret == -ENOBUFS || ret == -EMSGSIZE) &&
                            !skb->len && !state->split &&
                            cb->min_dump_alloc < 4096) {
                                cb->min_dump_alloc = 4096;
                                state->split_start = 0;
                                wiphy_unlock(&rdev->wiphy);
                                ret = 1;
                                goto out_unlock;
                        }

                        /* revisit this device on the next invocation */
                        idx--;
                        break;
                }
                wiphy_unlock(&rdev->wiphy);
                break;
        }
        rtnl_unlock();

        state->start = idx;

        return skb->len;

out_unlock:
        rtnl_unlock();
        return ret;
}

/*
 * Dump completion callback: free the dump state that nl80211_dump_wiphy()
 * stored in cb->args[0].  Always returns 0.
 */
static int nl80211_dump_wiphy_done(struct netlink_callback *cb)
{
        void *state = (void *)cb->args[0];

        kfree(state);

        return 0;
}

/*
 * Handle a non-dump wiphy query: build one NL80211_CMD_NEW_WIPHY message
 * for the device in info->user_ptr[0] using a zeroed (non-split) dump state
 * and send it back to the requester.
 *
 * Returns the result of genlmsg_reply(), -ENOMEM if the message cannot be
 * allocated, or -ENOBUFS if filling it fails for any reason.
 */
static int nl80211_get_wiphy(struct sk_buff *skb, struct genl_info *info)
{
        struct nl80211_dump_wiphy_state state = {};
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct sk_buff *msg = nlmsg_new(4096, GFP_KERNEL);
        int err;

        if (!msg)
                return -ENOMEM;

        err = nl80211_send_wiphy(rdev, NL80211_CMD_NEW_WIPHY, msg,
                                 info->snd_portid, info->snd_seq, 0,
                                 &state);
        if (err < 0) {
                nlmsg_free(msg);
                return -ENOBUFS;
        }

        return genlmsg_reply(msg, info);
}

/*
 * Netlink policy giving the expected payload type of each
 * NL80211_TXQ_ATTR_* attribute, grouped here by payload width.
 *
 * NOTE(review): parse_txq_params() reads NL80211_TXQ_ATTR_AC, which has no
 * entry under that name; presumably it aliases NL80211_TXQ_ATTR_QUEUE --
 * confirm against the uapi header.
 */
static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = {
        /* 8-bit attributes */
        [NL80211_TXQ_ATTR_QUEUE] = { .type = NLA_U8 },
        [NL80211_TXQ_ATTR_AIFS]  = { .type = NLA_U8 },

        /* 16-bit attributes */
        [NL80211_TXQ_ATTR_TXOP]  = { .type = NLA_U16 },
        [NL80211_TXQ_ATTR_CWMIN] = { .type = NLA_U16 },
        [NL80211_TXQ_ATTR_CWMAX] = { .type = NLA_U16 },
};

/*
 * Fill @txq_params from an already parsed NL80211_TXQ_ATTR_* table.
 *
 * All of AC, TXOP, CWMIN, CWMAX and AIFS must be present, and the AC value
 * must be below NL80211_NUM_ACS; otherwise -EINVAL is returned.  Note that
 * txop/cwmin/cwmax/aifs are written before the AC range check, so they may
 * already have been updated when -EINVAL is returned for a bad AC.
 *
 * Returns 0 on success.
 */
static int parse_txq_params(struct nlattr *tb[],
                            struct ieee80211_txq_params *txq_params)
{
        u8 ac;

        /* every attribute is mandatory */
        if (!(tb[NL80211_TXQ_ATTR_AC] && tb[NL80211_TXQ_ATTR_TXOP] &&
              tb[NL80211_TXQ_ATTR_CWMIN] && tb[NL80211_TXQ_ATTR_CWMAX] &&
              tb[NL80211_TXQ_ATTR_AIFS]))
                return -EINVAL;

        txq_params->txop = nla_get_u16(tb[NL80211_TXQ_ATTR_TXOP]);
        txq_params->cwmin = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMIN]);
        txq_params->cwmax = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMAX]);
        txq_params->aifs = nla_get_u8(tb[NL80211_TXQ_ATTR_AIFS]);

        ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]);
        if (ac >= NL80211_NUM_ACS)
                return -EINVAL;

        /* clamp under speculation before the value is stored */
        txq_params->ac = array_index_nospec(ac, NL80211_NUM_ACS);

        return 0;
}

static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev)
{
        /*
         * You can only set the channel explicitly for some interfaces,
         * most have their channel managed via their respective
         * "establish a connection" command (connect, join, ...)
         *
         * For AP/GO and mesh mode, the channel can be set with the
         * channel userspace API, but is only stored and passed to the
         * low-level driver when the AP starts or the mesh is joined.
         * This is for backward compatibility, userspace can also give
         * the channel in the start-ap or join-mesh commands instead.
         *
         * Monitors are special as they are normally slaved to
         * whatever else is going on, so they have their own special
         * operation to set the monitor channel if possible.
         */
        return !wdev ||
                wdev->iftype == NL80211_IFTYPE_AP ||
                wdev->iftype == NL80211_IFTYPE_MESH_POINT ||
                wdev->iftype == NL80211_IFTYPE_MONITOR ||
                wdev->iftype == NL80211_IFTYPE_P2P_GO;
}

/*
 * Build a channel definition from the attributes of a netlink request.
 *
 * @rdev:    device whose wiphy is used for channel lookup and checks
 * @info:    request; info->attrs supplies the attributes and info->extack
 *           receives an error message on failure
 * @monitor: forwarded unchanged to _cfg80211_chandef_usable()
 * @chandef: output, zeroed first and then filled in
 *
 * NL80211_ATTR_WIPHY_FREQ is mandatory.  The result starts out as a
 * 20 MHz no-HT definition on the control channel and is then refined from
 * either NL80211_ATTR_WIPHY_CHANNEL_TYPE or, if that is absent,
 * NL80211_ATTR_CHANNEL_WIDTH (with optional center frequencies), followed
 * by the optional EDMG and puncturing attributes.
 *
 * Returns 0 on success or -EINVAL if an attribute is missing or
 * inconsistent, or if the resulting definition is invalid, unusable or not
 * supported by the device.
 */
static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
                                  struct genl_info *info, bool monitor,
                                  struct cfg80211_chan_def *chandef)
{
        struct nlattr **attrs = info->attrs;
        struct netlink_ext_ack *extack = info->extack;
        u32 control_freq;

        if (!attrs[NL80211_ATTR_WIPHY_FREQ]) {
                NL_SET_ERR_MSG_ATTR(extack, attrs[NL80211_ATTR_WIPHY_FREQ],
                                    "Frequency is missing");
                return -EINVAL;
        }

        /* control frequency in kHz: MHz attribute plus the optional offset */
        control_freq = MHZ_TO_KHZ(nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ]));
        if (attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET])
                control_freq +=
                        nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]);

        /* default: 20 MHz no-HT centered on the control channel */
        memset(chandef, 0, sizeof(*chandef));
        chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
        chandef->center_freq1 = KHZ_TO_MHZ(control_freq);
        chandef->freq1_offset = control_freq % 1000;
        chandef->center_freq2 = 0;

        chandef->chan = ieee80211_get_channel_khz(&rdev->wiphy, control_freq);
        if (!chandef->chan) {
                NL_SET_ERR_MSG_ATTR(extack, attrs[NL80211_ATTR_WIPHY_FREQ],
                                    "Unknown channel");
                return -EINVAL;
        }

        if (attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) {
                enum nl80211_channel_type chantype =
                        nla_get_u32(attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]);

                switch (chantype) {
                case NL80211_CHAN_NO_HT:
                case NL80211_CHAN_HT20:
                case NL80211_CHAN_HT40PLUS:
                case NL80211_CHAN_HT40MINUS:
                        break;
                default:
                        NL_SET_ERR_MSG_ATTR(extack,
                                            attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE],
                                            "invalid channel type");
                        return -EINVAL;
                }

                cfg80211_chandef_create(chandef, chandef->chan, chantype);

                /* a supplied center frequency 1 must match the derived one */
                if (attrs[NL80211_ATTR_CENTER_FREQ1] &&
                    nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]) !=
                    chandef->center_freq1) {
                        NL_SET_ERR_MSG_ATTR(extack,
                                            attrs[NL80211_ATTR_CENTER_FREQ1],
                                            "bad center frequency 1");
                        return -EINVAL;
                }

                /* center_freq2 must be zero */
                if (attrs[NL80211_ATTR_CENTER_FREQ2] &&
                    nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2])) {
                        NL_SET_ERR_MSG_ATTR(extack,
                                            attrs[NL80211_ATTR_CENTER_FREQ2],
                                            "center frequency 2 can't be used");
                        return -EINVAL;
                }
        } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) {
                chandef->width = nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]);

                /* on S1G the requested width has to match the channel's */
                if (chandef->chan->band == NL80211_BAND_S1GHZ &&
                    chandef->width !=
                    ieee80211_s1g_channel_width(chandef->chan)) {
                        NL_SET_ERR_MSG_ATTR(extack,
                                            attrs[NL80211_ATTR_CHANNEL_WIDTH],
                                            "bad channel width");
                        return -EINVAL;
                }

                if (attrs[NL80211_ATTR_CENTER_FREQ1]) {
                        chandef->center_freq1 =
                                nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]);
                        /* an explicit center frequency resets the offset */
                        chandef->freq1_offset =
                                attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET] ?
                                nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET]) :
                                0;
                }

                if (attrs[NL80211_ATTR_CENTER_FREQ2])
                        chandef->center_freq2 =
                                nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]);
        }

        /* EDMG: bw_config is only looked at when channels are given */
        if (attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]) {
                chandef->edmg.channels =
                        nla_get_u8(attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]);

                if (attrs[NL80211_ATTR_WIPHY_EDMG_BW_CONFIG])
                        chandef->edmg.bw_config =
                                nla_get_u8(attrs[NL80211_ATTR_WIPHY_EDMG_BW_CONFIG]);
        } else {
                chandef->edmg.channels = 0;
                chandef->edmg.bw_config = 0;
        }

        if (attrs[NL80211_ATTR_PUNCT_BITMAP]) {
                chandef->punctured =
                        nla_get_u32(attrs[NL80211_ATTR_PUNCT_BITMAP]);

                /* a non-zero bitmap needs driver support */
                if (chandef->punctured &&
                    !wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_PUNCT)) {
                        NL_SET_ERR_MSG(extack,
                                       "driver doesn't support puncturing");
                        return -EINVAL;
                }
        }

        /* final checks on the assembled definition */
        if (!cfg80211_chandef_valid(chandef)) {
                NL_SET_ERR_MSG(extack, "invalid channel definition");
                return -EINVAL;
        }

        if (!_cfg80211_chandef_usable(&rdev->wiphy, chandef,
                                      IEEE80211_CHAN_DISABLED,
                                      monitor)) {
                NL_SET_ERR_MSG(extack, "(extension) channel is disabled");
                return -EINVAL;
        }

        if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_5_10_MHZ) &&
            (chandef->width == NL80211_CHAN_WIDTH_5 ||
             chandef->width == NL80211_CHAN_WIDTH_10)) {
                NL_SET_ERR_MSG(extack, "5/10 MHz not supported");
                return -EINVAL;
        }

        return 0;
}

/*
 * Non-static entry point for channel definition parsing: calls
 * _nl80211_parse_chandef() with its monitor argument set to false and
 * returns its result (0 or a negative errno).
 */
int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
                          struct genl_info *info,
                          struct cfg80211_chan_def *chandef)
{
        int err;

        err = _nl80211_parse_chandef(rdev, info, false, chandef);

        return err;
}

/*
 * Set the channel described by the request attributes.
 *
 * @rdev:     registered device the request is for
 * @dev:      netdev the request refers to, or NULL; without a netdev the
 *            request is treated like one for a monitor interface
 * @info:     request carrying the channel attributes
 * @_link_id: link to operate on, or negative if none was given (then link 0
 *            is used, which is rejected if the wdev has valid_links set)
 *
 * Mesh and monitor requests are handed to their cfg80211 helpers.  For
 * AP/P2P-GO the parsed definition is either stored as preset_chandef or,
 * when the link's ap.beacon_interval is non-zero (presumably meaning the
 * AP is already running -- confirm), applied through
 * rdev_set_ap_chanwidth(), which only permits a width change on the
 * current channel.
 *
 * Returns 0 on success or a negative errno.
 */
static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev,
                                 struct genl_info *info,
                                 int _link_id)
{
        struct wireless_dev *wdev = dev ? dev->ieee80211_ptr : NULL;
        enum nl80211_iftype iftype;
        struct cfg80211_chan_def chandef;
        int link_id = _link_id;
        int result;

        if (!nl80211_can_set_dev_channel(wdev))
                return -EOPNOTSUPP;

        iftype = wdev ? wdev->iftype : NL80211_IFTYPE_MONITOR;

        if (link_id < 0) {
                if (wdev && wdev->valid_links)
                        return -EINVAL;
                link_id = 0;
        }

        result = _nl80211_parse_chandef(rdev, info,
                                        iftype == NL80211_IFTYPE_MONITOR,
                                        &chandef);
        if (result)
                return result;

        switch (iftype) {
        case NL80211_IFTYPE_MESH_POINT:
                return cfg80211_set_mesh_channel(rdev, wdev, &chandef);
        case NL80211_IFTYPE_MONITOR:
                return cfg80211_set_monitor_channel(rdev, &chandef);
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_P2P_GO:
                break;
        default:
                return -EINVAL;
        }

        /* AP / P2P-GO only from here on */
        if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &chandef, iftype))
                return -EINVAL;

        /* no beacon interval set: just remember the channel for later */
        if (!wdev->links[link_id].ap.beacon_interval) {
                wdev->u.ap.preset_chandef = chandef;
                return 0;
        }

        if (!dev || !rdev->ops->set_ap_chanwidth ||
            !(rdev->wiphy.features &
              NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE))
                return -EBUSY;

        /* Only allow dynamic channel width changes */
        if (chandef.chan != wdev->links[link_id].ap.chandef.chan)
                return -EBUSY;

        /* only allow this for regular channel widths: current width ... */
        switch (wdev->links[link_id].ap.chandef.width) {
        case NL80211_CHAN_WIDTH_20_NOHT:
        case NL80211_CHAN_WIDTH_20:
        case NL80211_CHAN_WIDTH_40:
        case NL80211_CHAN_WIDTH_80:
        case NL80211_CHAN_WIDTH_80P80:
        case NL80211_CHAN_WIDTH_160:
        case NL80211_CHAN_WIDTH_320:
                break;
        default:
                return -EINVAL;
        }

        /* ... and requested width */
        switch (chandef.width) {
        case NL80211_CHAN_WIDTH_20_NOHT:
        case NL80211_CHAN_WIDTH_20:
        case NL80211_CHAN_WIDTH_40:
        case NL80211_CHAN_WIDTH_80:
        case NL80211_CHAN_WIDTH_80P80:
        case NL80211_CHAN_WIDTH_160:
        case NL80211_CHAN_WIDTH_320:
                break;
        default:
                return -EINVAL;
        }

        result = rdev_set_ap_chanwidth(rdev, dev, link_id, &chandef);
        if (result)
                return result;

        /* driver accepted it: record the new definition on the link */
        wdev->links[link_id].ap.chandef = chandef;

        return 0;
}

/*
 * Netlink handler for a channel change request.
 *
 * Picks up the registered device and netdev stored in info->user_ptr[],
 * obtains the link ID via nl80211_link_id_or_invalid() and forwards
 * everything to __nl80211_set_channel(), whose result is returned as is.
 */
static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info)
{
        struct net_device *netdev = info->user_ptr[1];
        struct cfg80211_registered_device *rdev = info->user_ptr[0];

        return __nl80211_set_channel(rdev, netdev, info,
                                     nl80211_link_id_or_invalid(info->attrs));
}

/*
 * Netlink handler that applies wiphy-level settings.
 *
 * The function resolves the target device itself (optionally through a
 * netdev given by NL80211_ATTR_IFINDEX), takes the wiphy lock, and then
 * processes, in order: rename, TXQ parameters, channel/frequency, TX power,
 * antenna masks, and finally the retry/threshold/coverage/TXQ-limit values
 * that are pushed through rdev_set_wiphy_params().
 *
 * Locking: RTNL is taken on entry and released right after the optional
 * rename; the wiphy lock is held from device resolution until "out".
 *
 * Returns 0 on success or a negative errno. Processing stops at the first
 * failing step; earlier steps that already succeeded are not undone, except
 * that the wiphy parameter fields are restored if rdev_set_wiphy_params()
 * fails.
 */
static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = NULL;
        struct net_device *netdev = NULL;
        struct wireless_dev *wdev;
        int result = 0, rem_txq_params = 0;
        struct nlattr *nl_txq_params;
        u32 changed;
        u8 retry_short = 0, retry_long = 0;
        u32 frag_threshold = 0, rts_threshold = 0;
        u8 coverage_class = 0;
        u32 txq_limit = 0, txq_memory_limit = 0, txq_quantum = 0;

        rtnl_lock();
        /*
         * Try to find the wiphy and netdev. Normally this
         * function shouldn't need the netdev, but this is
         * done for backward compatibility -- previously
         * setting the channel was done per wiphy, but now
         * it is per netdev. Previous userland like hostapd
         * also passed a netdev to set_wiphy, so that it is
         * possible to let that go to the right netdev!
         */

        if (info->attrs[NL80211_ATTR_IFINDEX]) {
                int ifindex = nla_get_u32(info->attrs[NL80211_ATTR_IFINDEX]);

                netdev = __dev_get_by_index(genl_info_net(info), ifindex);
                /* only accept netdevs that carry a wireless_dev */
                if (netdev && netdev->ieee80211_ptr)
                        rdev = wiphy_to_rdev(netdev->ieee80211_ptr->wiphy);
                else
                        netdev = NULL;
        }

        /* no usable netdev: fall back to looking up the rdev from attributes */
        if (!netdev) {
                rdev = __cfg80211_rdev_from_attrs(genl_info_net(info),
                                                  info->attrs);
                if (IS_ERR(rdev)) {
                        rtnl_unlock();
                        return PTR_ERR(rdev);
                }
                wdev = NULL;
                netdev = NULL;
                result = 0;
        } else
                wdev = netdev->ieee80211_ptr;

        wiphy_lock(&rdev->wiphy);

        /*
         * end workaround code, by now the rdev is available
         * and locked, and wdev may or may not be NULL.
         */

        /* the rename is the only step performed while RTNL is still held */
        if (info->attrs[NL80211_ATTR_WIPHY_NAME])
                result = cfg80211_dev_rename(
                        rdev, nla_data(info->attrs[NL80211_ATTR_WIPHY_NAME]));
        rtnl_unlock();

        if (result)
                goto out;

        /*
         * TXQ parameters: require driver support, a netdev of type AP or
         * P2P GO, and that the netdev is running.
         */
        if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) {
                struct ieee80211_txq_params txq_params;
                struct nlattr *tb[NL80211_TXQ_ATTR_MAX + 1];

                if (!rdev->ops->set_txq_params) {
                        result = -EOPNOTSUPP;
                        goto out;
                }

                if (!netdev) {
                        result = -EINVAL;
                        goto out;
                }

                if (netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
                    netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) {
                        result = -EINVAL;
                        goto out;
                }

                if (!netif_running(netdev)) {
                        result = -ENETDOWN;
                        goto out;
                }

                /* each nested entry is parsed and applied on its own */
                nla_for_each_nested(nl_txq_params,
                                    info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS],
                                    rem_txq_params) {
                        result = nla_parse_nested_deprecated(tb,
                                                             NL80211_TXQ_ATTR_MAX,
                                                             nl_txq_params,
                                                             txq_params_policy,
                                                             info->extack);
                        if (result)
                                goto out;
                        result = parse_txq_params(tb, &txq_params);
                        if (result)
                                goto out;

                        txq_params.link_id =
                                nl80211_link_id_or_invalid(info->attrs);

                        /*
                         * NOTE(review): when valid_links is 0, the first
                         * condition is already true for any link_id >= 0,
                         * so the -EINVAL branch below looks unreachable --
                         * confirm the intended ordering/errno.
                         */
                        if (txq_params.link_id >= 0 &&
                            !(netdev->ieee80211_ptr->valid_links &
                              BIT(txq_params.link_id)))
                                result = -ENOLINK;
                        else if (txq_params.link_id >= 0 &&
                                 !netdev->ieee80211_ptr->valid_links)
                                result = -EINVAL;
                        else
                                result = rdev_set_txq_params(rdev, netdev,
                                                             &txq_params);
                        if (result)
                                goto out;
                }
        }

        /*
         * Channel setting: with a wdev, the netdev is only passed down when
         * nl80211_can_set_dev_channel() allows it; otherwise NULL is passed.
         */
        if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) {
                int link_id = nl80211_link_id_or_invalid(info->attrs);

                if (wdev) {
                        result = __nl80211_set_channel(
                                rdev,
                                nl80211_can_set_dev_channel(wdev) ? netdev : NULL,
                                info, link_id);
                } else {
                        result = __nl80211_set_channel(rdev, netdev, info, link_id);
                }

                if (result)
                        goto out;
        }

        if (info->attrs[NL80211_ATTR_WIPHY_TX_POWER_SETTING]) {
                struct wireless_dev *txp_wdev = wdev;
                enum nl80211_tx_power_setting type;
                int idx, mbm = 0;

                /* without per-VIF TX power support the wdev is not passed on */
                if (!(rdev->wiphy.features & NL80211_FEATURE_VIF_TXPOWER))
                        txp_wdev = NULL;

                if (!rdev->ops->set_tx_power) {
                        result = -EOPNOTSUPP;
                        goto out;
                }

                idx = NL80211_ATTR_WIPHY_TX_POWER_SETTING;
                type = nla_get_u32(info->attrs[idx]);

                /* a level is mandatory for everything but automatic mode */
                if (!info->attrs[NL80211_ATTR_WIPHY_TX_POWER_LEVEL] &&
                    (type != NL80211_TX_POWER_AUTOMATIC)) {
                        result = -EINVAL;
                        goto out;
                }

                if (type != NL80211_TX_POWER_AUTOMATIC) {
                        idx = NL80211_ATTR_WIPHY_TX_POWER_LEVEL;
                        mbm = nla_get_u32(info->attrs[idx]);
                }

                result = rdev_set_tx_power(rdev, txp_wdev, type, mbm);
                if (result)
                        goto out;
        }

        /* antenna masks are only handled when both TX and RX are supplied */
        if (info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX] &&
            info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]) {
                u32 tx_ant, rx_ant;

                if ((!rdev->wiphy.available_antennas_tx &&
                     !rdev->wiphy.available_antennas_rx) ||
                    !rdev->ops->set_antenna) {
                        result = -EOPNOTSUPP;
                        goto out;
                }

                tx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX]);
                rx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]);

                /* reject antenna configurations which don't match the
                 * available antenna masks, except for the "all" mask */
                if ((~tx_ant && (tx_ant & ~rdev->wiphy.available_antennas_tx)) ||
                    (~rx_ant && (rx_ant & ~rdev->wiphy.available_antennas_rx))) {
                        result = -EINVAL;
                        goto out;
                }

                /* clamp to what is available (matters for the "all" mask) */
                tx_ant = tx_ant & rdev->wiphy.available_antennas_tx;
                rx_ant = rx_ant & rdev->wiphy.available_antennas_rx;

                result = rdev_set_antenna(rdev, tx_ant, rx_ant);
                if (result)
                        goto out;
        }

        /*
         * From here on, requested values are only collected; "changed"
         * accumulates WIPHY_PARAM_* bits and everything is applied in one
         * go further below.
         */
        changed = 0;

        if (info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]) {
                retry_short = nla_get_u8(
                        info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]);

                changed |= WIPHY_PARAM_RETRY_SHORT;
        }

        if (info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]) {
                retry_long = nla_get_u8(
                        info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]);

                changed |= WIPHY_PARAM_RETRY_LONG;
        }

        if (info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]) {
                frag_threshold = nla_get_u32(
                        info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]);
                if (frag_threshold < 256) {
                        result = -EINVAL;
                        goto out;
                }

                /* (u32)-1 is passed through unmodified */
                if (frag_threshold != (u32) -1) {
                        /*
                         * Fragments (apart from the last one) are required to
                         * have even length. Make the fragmentation code
                         * simpler by stripping LSB should someone try to use
                         * odd threshold value.
                         */
                        frag_threshold &= ~0x1;
                }
                changed |= WIPHY_PARAM_FRAG_THRESHOLD;
        }

        if (info->attrs[NL80211_ATTR_WIPHY_RTS_THRESHOLD]) {
                rts_threshold = nla_get_u32(
                        info->attrs[NL80211_ATTR_WIPHY_RTS_THRESHOLD]);
                changed |= WIPHY_PARAM_RTS_THRESHOLD;
        }

        /* coverage class and dynamic ACK are mutually exclusive */
        if (info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]) {
                if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) {
                        result = -EINVAL;
                        goto out;
                }

                coverage_class = nla_get_u8(
                        info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]);
                changed |= WIPHY_PARAM_COVERAGE_CLASS;
        }

        if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) {
                if (!(rdev->wiphy.features & NL80211_FEATURE_ACKTO_ESTIMATION)) {
                        result = -EOPNOTSUPP;
                        goto out;
                }

                changed |= WIPHY_PARAM_DYN_ACK;
        }

        /* the three TXQ_* settings all require NL80211_EXT_FEATURE_TXQS */
        if (info->attrs[NL80211_ATTR_TXQ_LIMIT]) {
                if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_TXQS)) {
                        result = -EOPNOTSUPP;
                        goto out;
                }
                txq_limit = nla_get_u32(
                        info->attrs[NL80211_ATTR_TXQ_LIMIT]);
                changed |= WIPHY_PARAM_TXQ_LIMIT;
        }

        if (info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]) {
                if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_TXQS)) {
                        result = -EOPNOTSUPP;
                        goto out;
                }
                txq_memory_limit = nla_get_u32(
                        info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]);
                changed |= WIPHY_PARAM_TXQ_MEMORY_LIMIT;
        }

        if (info->attrs[NL80211_ATTR_TXQ_QUANTUM]) {
                if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_TXQS)) {
                        result = -EOPNOTSUPP;
                        goto out;
                }
                txq_quantum = nla_get_u32(
                        info->attrs[NL80211_ATTR_TXQ_QUANTUM]);
                changed |= WIPHY_PARAM_TXQ_QUANTUM;
        }

        if (changed) {
                u8 old_retry_short, old_retry_long;
                u32 old_frag_threshold, old_rts_threshold;
                u8 old_coverage_class;
                u32 old_txq_limit, old_txq_memory_limit, old_txq_quantum;

                if (!rdev->ops->set_wiphy_params) {
                        result = -EOPNOTSUPP;
                        goto out;
                }

                /* snapshot current values so a failed update can be undone */
                old_retry_short = rdev->wiphy.retry_short;
                old_retry_long = rdev->wiphy.retry_long;
                old_frag_threshold = rdev->wiphy.frag_threshold;
                old_rts_threshold = rdev->wiphy.rts_threshold;
                old_coverage_class = rdev->wiphy.coverage_class;
                old_txq_limit = rdev->wiphy.txq_limit;
                old_txq_memory_limit = rdev->wiphy.txq_memory_limit;
                old_txq_quantum = rdev->wiphy.txq_quantum;

                /* write the new values into the wiphy before calling down */
                if (changed & WIPHY_PARAM_RETRY_SHORT)
                        rdev->wiphy.retry_short = retry_short;
                if (changed & WIPHY_PARAM_RETRY_LONG)
                        rdev->wiphy.retry_long = retry_long;
                if (changed & WIPHY_PARAM_FRAG_THRESHOLD)
                        rdev->wiphy.frag_threshold = frag_threshold;
                if (changed & WIPHY_PARAM_RTS_THRESHOLD)
                        rdev->wiphy.rts_threshold = rts_threshold;
                if (changed & WIPHY_PARAM_COVERAGE_CLASS)
                        rdev->wiphy.coverage_class = coverage_class;
                if (changed & WIPHY_PARAM_TXQ_LIMIT)
                        rdev->wiphy.txq_limit = txq_limit;
                if (changed & WIPHY_PARAM_TXQ_MEMORY_LIMIT)
                        rdev->wiphy.txq_memory_limit = txq_memory_limit;
                if (changed & WIPHY_PARAM_TXQ_QUANTUM)
                        rdev->wiphy.txq_quantum = txq_quantum;

                result = rdev_set_wiphy_params(rdev, changed);
                if (result) {
                        /* roll every field back to its snapshot */
                        rdev->wiphy.retry_short = old_retry_short;
                        rdev->wiphy.retry_long = old_retry_long;
                        rdev->wiphy.frag_threshold = old_frag_threshold;
                        rdev->wiphy.rts_threshold = old_rts_threshold;
                        rdev->wiphy.coverage_class = old_coverage_class;
                        rdev->wiphy.txq_limit = old_txq_limit;
                        rdev->wiphy.txq_memory_limit = old_txq_memory_limit;
                        rdev->wiphy.txq_quantum = old_txq_quantum;
                        goto out;
                }
        }

        result = 0;

out:
        wiphy_unlock(&rdev->wiphy);
        return result;
}

/*
 * Append the attributes describing @chandef to the netlink message @msg.
 *
 * Attributes are emitted in this order: frequency, frequency offset, the
 * channel type (only for the 20 MHz no-HT, 20 MHz and 40 MHz widths),
 * channel width, center frequency 1, and - only when non-zero - center
 * frequency 2 and the puncturing bitmap.
 *
 * Returns 0 on success, -EINVAL (with a warning) if the chandef fails
 * cfg80211_chandef_valid(), or -ENOBUFS as soon as one attribute cannot
 * be added.
 */
int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef)
{
        if (WARN_ON(!cfg80211_chandef_valid(chandef)))
                return -EINVAL;

        if (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ,
                        chandef->chan->center_freq) ||
            nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET,
                        chandef->chan->freq_offset))
                return -ENOBUFS;

        /* the channel type attribute is only sent for these widths */
        if (chandef->width == NL80211_CHAN_WIDTH_20_NOHT ||
            chandef->width == NL80211_CHAN_WIDTH_20 ||
            chandef->width == NL80211_CHAN_WIDTH_40) {
                if (nla_put_u32(msg, NL80211_ATTR_WIPHY_CHANNEL_TYPE,
                                cfg80211_get_chandef_type(chandef)))
                        return -ENOBUFS;
        }

        if (nla_put_u32(msg, NL80211_ATTR_CHANNEL_WIDTH, chandef->width) ||
            nla_put_u32(msg, NL80211_ATTR_CENTER_FREQ1, chandef->center_freq1))
                return -ENOBUFS;

        /* optional attributes: skipped entirely when the value is zero */
        if (chandef->center_freq2 &&
            nla_put_u32(msg, NL80211_ATTR_CENTER_FREQ2, chandef->center_freq2))
                return -ENOBUFS;

        if (chandef->punctured &&
            nla_put_u32(msg, NL80211_ATTR_PUNCT_BITMAP, chandef->punctured))
                return -ENOBUFS;

        return 0;
}
EXPORT_SYMBOL(nl80211_send_chandef);

/*
 * Fill @msg with a complete interface description for @wdev.
 *
 * @cmd is expected to be one of NL80211_CMD_NEW_INTERFACE,
 * NL80211_CMD_DEL_INTERFACE or NL80211_CMD_SET_INTERFACE (anything else
 * triggers a warning but is still used for the header). The caller must
 * hold the wiphy lock (asserted via lockdep).
 *
 * Returns 0 on success, -1 if the message header could not be added, and
 * -EMSGSIZE (after cancelling the partially built message) if any
 * attribute could not be added.
 */
static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags,
                              struct cfg80211_registered_device *rdev,
                              struct wireless_dev *wdev,
                              enum nl80211_commands cmd)
{
        struct net_device *dev = wdev->netdev;
        void *hdr;

        lockdep_assert_wiphy(&rdev->wiphy);

        WARN_ON(cmd != NL80211_CMD_NEW_INTERFACE &&
                cmd != NL80211_CMD_DEL_INTERFACE &&
                cmd != NL80211_CMD_SET_INTERFACE);

        hdr = nl80211hdr_put(msg, portid, seq, flags, cmd);
        if (!hdr)
                return -1;

        /* ifindex/ifname are only available when the wdev has a netdev */
        if (dev &&
            (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
             nla_put_string(msg, NL80211_ATTR_IFNAME, dev->name)))
                goto nla_put_failure;

        if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
            nla_put_u32(msg, NL80211_ATTR_IFTYPE, wdev->iftype) ||
            nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
                              NL80211_ATTR_PAD) ||
            nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, wdev_address(wdev)) ||
            nla_put_u32(msg, NL80211_ATTR_GENERATION,
                        rdev->devlist_generation ^
                        (cfg80211_rdev_list_generation << 2)) ||
            nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr))
                goto nla_put_failure;

        /*
         * Top-level channel info is only reported when the wdev has no
         * valid links; with links, the channel is reported per link below.
         * A failing rdev_get_channel() just means no channel is reported.
         */
        if (rdev->ops->get_channel && !wdev->valid_links) {
                struct cfg80211_chan_def chandef = {};
                int ret;

                ret = rdev_get_channel(rdev, wdev, 0, &chandef);
                if (ret == 0 && nl80211_send_chandef(msg, &chandef))
                        goto nla_put_failure;
        }

        /* TX power is likewise optional: skipped if the driver call fails */
        if (rdev->ops->get_tx_power) {
                int dbm, ret;

                ret = rdev_get_tx_power(rdev, wdev, &dbm);
                if (ret == 0 &&
                    nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL,
                                DBM_TO_MBM(dbm)))
                        goto nla_put_failure;
        }

        /* the SSID lives in a different union member per interface type */
        switch (wdev->iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_P2P_GO:
                if (wdev->u.ap.ssid_len &&
                    nla_put(msg, NL80211_ATTR_SSID, wdev->u.ap.ssid_len,
                            wdev->u.ap.ssid))
                        goto nla_put_failure;
                break;
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_P2P_CLIENT:
                if (wdev->u.client.ssid_len &&
                    nla_put(msg, NL80211_ATTR_SSID, wdev->u.client.ssid_len,
                            wdev->u.client.ssid))
                        goto nla_put_failure;
                break;
        case NL80211_IFTYPE_ADHOC:
                if (wdev->u.ibss.ssid_len &&
                    nla_put(msg, NL80211_ATTR_SSID, wdev->u.ibss.ssid_len,
                            wdev->u.ibss.ssid))
                        goto nla_put_failure;
                break;
        default:
                /* nothing */
                break;
        }

        if (rdev->ops->get_txq_stats) {
                struct cfg80211_txq_stats txqstats = {};
                int ret = rdev_get_txq_stats(rdev, wdev, &txqstats);

                /*
                 * note the negation: a zero/false return from
                 * nl80211_put_txq_stats() is what is treated as failure
                 */
                if (ret == 0 &&
                    !nl80211_put_txq_stats(msg, &txqstats,
                                           NL80211_ATTR_TXQ_STATS))
                        goto nla_put_failure;
        }

        /* per-link information, nested under NL80211_ATTR_MLO_LINKS */
        if (wdev->valid_links) {
                unsigned int link_id;
                struct nlattr *links = nla_nest_start(msg,
                                                      NL80211_ATTR_MLO_LINKS);

                if (!links)
                        goto nla_put_failure;

                for_each_valid_link(wdev, link_id) {
                        /* each link's nest uses link_id + 1 as attribute type */
                        struct nlattr *link = nla_nest_start(msg, link_id + 1);
                        struct cfg80211_chan_def chandef = {};
                        int ret;

                        if (!link)
                                goto nla_put_failure;

                        if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id))
                                goto nla_put_failure;
                        if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN,
                                    wdev->links[link_id].addr))
                                goto nla_put_failure;

                        ret = rdev_get_channel(rdev, wdev, link_id, &chandef);
                        if (ret == 0 && nl80211_send_chandef(msg, &chandef))
                                goto nla_put_failure;

                        nla_nest_end(msg, link);
                }

                nla_nest_end(msg, links);
        }

        genlmsg_end(msg, hdr);
        return 0;

 nla_put_failure:
        genlmsg_cancel(msg, hdr);
        return -EMSGSIZE;
}

/*
 * Netlink dump callback that emits one NL80211_CMD_NEW_INTERFACE message
 * per wireless device, across all registered devices in the requesting
 * socket's network namespace.
 *
 * Resume state is kept in cb->args[]:
 *   args[0] - index of the wiphy to continue with,
 *   args[1] - index of the interface within that wiphy to continue with,
 *   args[2] - filter state: 0 means the request has not been parsed yet,
 *             a positive value is (filter wiphy index + 1), and -1 means
 *             parsing is done and no filter applies.
 *
 * Runs under RTNL; each wiphy is additionally locked while its interfaces
 * are walked. Returns skb->len, or the error from
 * nl80211_dump_wiphy_parse() if parsing the request fails.
 */
static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *cb)
{
        int wp_idx = 0;
        int if_idx = 0;
        int wp_start = cb->args[0];
        int if_start = cb->args[1];
        int filter_wiphy = -1;
        struct cfg80211_registered_device *rdev;
        struct wireless_dev *wdev;
        int ret;

        rtnl_lock();
        /* first invocation of this dump: parse the request for a filter */
        if (!cb->args[2]) {
                struct nl80211_dump_wiphy_state state = {
                        .filter_wiphy = -1,
                };

                ret = nl80211_dump_wiphy_parse(skb, cb, &state);
                if (ret)
                        goto out_unlock;

                filter_wiphy = state.filter_wiphy;

                /*
                 * if filtering, set cb->args[2] to +1 since 0 is the default
                 * value needed to determine that parsing is necessary.
                 */
                if (filter_wiphy >= 0)
                        cb->args[2] = filter_wiphy + 1;
                else
                        cb->args[2] = -1;
        } else if (cb->args[2] > 0) {
                /* continuation: recover the filter stored on the first pass */
                filter_wiphy = cb->args[2] - 1;
        }

        for_each_rdev(rdev) {
                if (!net_eq(wiphy_net(&rdev->wiphy), sock_net(skb->sk)))
                        continue;
                /* skip wiphys that were fully handled by earlier passes */
                if (wp_idx < wp_start) {
                        wp_idx++;
                        continue;
                }

                if (filter_wiphy >= 0 && filter_wiphy != rdev->wiphy_idx)
                        continue;

                if_idx = 0;

                wiphy_lock(&rdev->wiphy);
                list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
                        /* skip interfaces already emitted for this wiphy */
                        if (if_idx < if_start) {
                                if_idx++;
                                continue;
                        }
                        /*
                         * on failure, stop here and record the current
                         * position so the next pass resumes at this wdev
                         */
                        if (nl80211_send_iface(skb, NETLINK_CB(cb->skb).portid,
                                               cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                               rdev, wdev,
                                               NL80211_CMD_NEW_INTERFACE) < 0) {
                                wiphy_unlock(&rdev->wiphy);
                                goto out;
                        }
                        if_idx++;
                }
                wiphy_unlock(&rdev->wiphy);

                /* the interface offset only applies to the resumed wiphy */
                if_start = 0;
                wp_idx++;
        }
 out:
        cb->args[0] = wp_idx;
        cb->args[1] = if_idx;

        ret = skb->len;
 out_unlock:
        rtnl_unlock();

        return ret;
}

/*
 * Netlink handler answering a single-interface query.
 *
 * Allocates a reply, lets nl80211_send_iface() describe the wdev taken
 * from info->user_ptr[1], and sends the reply to the requester.
 *
 * Returns the result of genlmsg_reply() on success, -ENOMEM if the reply
 * cannot be allocated, or -ENOBUFS if the interface description could not
 * be built.
 */
static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info)
{
        struct wireless_dev *wdev = info->user_ptr[1];
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct sk_buff *reply;
        int err;

        reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!reply)
                return -ENOMEM;

        err = nl80211_send_iface(reply, info->snd_portid, info->snd_seq, 0,
                                 rdev, wdev, NL80211_CMD_NEW_INTERFACE);
        if (err >= 0)
                return genlmsg_reply(reply, info);

        /* building the message failed: drop it and report -ENOBUFS */
        nlmsg_free(reply);
        return -ENOBUFS;
}

/*
 * Netlink attribute policy for the nested monitor flags: every flag listed
 * here is declared as NLA_FLAG. Used by parse_monitor_flags().
 */
static const struct nla_policy mntr_flags_policy[NL80211_MNTR_FLAG_MAX + 1] = {
        [NL80211_MNTR_FLAG_FCSFAIL] = { .type = NLA_FLAG },
        [NL80211_MNTR_FLAG_PLCPFAIL] = { .type = NLA_FLAG },
        [NL80211_MNTR_FLAG_CONTROL] = { .type = NLA_FLAG },
        [NL80211_MNTR_FLAG_OTHER_BSS] = { .type = NLA_FLAG },
        [NL80211_MNTR_FLAG_COOK_FRAMES] = { .type = NLA_FLAG },
        [NL80211_MNTR_FLAG_ACTIVE] = { .type = NLA_FLAG },
};

/*
 * Translate a nested monitor-flags attribute into a bitmask.
 *
 * *mntrflags is always reset to 0 first. On success it holds bit N for
 * every present flag attribute N (1..NL80211_MNTR_FLAG_MAX) plus
 * MONITOR_FLAG_CHANGED.
 *
 * Returns 0 on success, or -EINVAL if @nla is NULL or fails to parse
 * against mntr_flags_policy.
 */
static int parse_monitor_flags(struct nlattr *nla, u32 *mntrflags)
{
        struct nlattr *tb[NL80211_MNTR_FLAG_MAX + 1];
        int idx;

        *mntrflags = 0;

        if (!nla)
                return -EINVAL;

        if (nla_parse_nested_deprecated(tb, NL80211_MNTR_FLAG_MAX, nla,
                                        mntr_flags_policy, NULL) != 0)
                return -EINVAL;

        for (idx = 1; idx <= NL80211_MNTR_FLAG_MAX; idx++) {
                if (!tb[idx])
                        continue;
                *mntrflags |= (1<<idx);
        }

        /* mark that flags were explicitly supplied */
        *mntrflags |= MONITOR_FLAG_CHANGED;

        return 0;
}

/*
 * Common precondition for the MU-MIMO sniffer attributes: the interface
 * type must be monitor (-EINVAL otherwise) and the wiphy must advertise
 * NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER (-EOPNOTSUPP otherwise).
 */
static int nl80211_mumimo_sniffer_check(struct cfg80211_registered_device *rdev,
                                        enum nl80211_iftype type)
{
        u32 cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER;

        if (type != NL80211_IFTYPE_MONITOR)
                return -EINVAL;

        if (!wiphy_ext_feature_isset(&rdev->wiphy, cap_flag))
                return -EOPNOTSUPP;

        return 0;
}

/*
 * Parse the monitor-related attributes of a request into @params.
 *
 * Handles NL80211_ATTR_MNTR_FLAGS, NL80211_ATTR_MU_MIMO_GROUP_DATA and
 * NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR; each of them is only accepted for
 * monitor interfaces. The MU-MIMO pointers stored in @params reference the
 * attribute payload directly (no copy is made).
 *
 * Returns 1 if at least one option was supplied, 0 if none was, or a
 * negative errno on invalid/unsupported input.
 */
static int nl80211_parse_mon_options(struct cfg80211_registered_device *rdev,
                                     enum nl80211_iftype type,
                                     struct genl_info *info,
                                     struct vif_params *params)
{
        int changed = 0;
        int err;

        if (info->attrs[NL80211_ATTR_MNTR_FLAGS]) {
                if (type != NL80211_IFTYPE_MONITOR)
                        return -EINVAL;

                err = parse_monitor_flags(info->attrs[NL80211_ATTR_MNTR_FLAGS],
                                          &params->flags);
                if (err)
                        return err;

                changed = 1;
        }

        /* active monitor mode needs explicit driver support */
        if ((params->flags & MONITOR_FLAG_ACTIVE) &&
            !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR))
                return -EOPNOTSUPP;

        if (info->attrs[NL80211_ATTR_MU_MIMO_GROUP_DATA]) {
                const u8 *groups;

                err = nl80211_mumimo_sniffer_check(rdev, type);
                if (err)
                        return err;

                groups = nla_data(info->attrs[NL80211_ATTR_MU_MIMO_GROUP_DATA]);

                /* bits 0 and 63 are reserved and must be zero */
                if (groups[0] & BIT(0))
                        return -EINVAL;
                if (groups[VHT_MUMIMO_GROUPS_DATA_LEN - 1] & BIT(7))
                        return -EINVAL;

                params->vht_mumimo_groups = groups;
                changed = 1;
        }

        if (info->attrs[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR]) {
                err = nl80211_mumimo_sniffer_check(rdev, type);
                if (err)
                        return err;

                params->vht_mumimo_follow_addr =
                        nla_data(info->attrs[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR]);
                changed = 1;
        }

        return changed;
}

/*
 * Check whether the requested 4-address setting is acceptable.
 *
 * Turning 4-address mode off is refused with -EBUSY while @netdev is a
 * bridge port, and allowed otherwise. Turning it on is only allowed for
 * AP_VLAN interfaces when the wiphy has WIPHY_FLAG_4ADDR_AP, or for
 * station interfaces when it has WIPHY_FLAG_4ADDR_STATION; every other
 * combination yields -EOPNOTSUPP.
 *
 * @netdev may be NULL.
 */
static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev,
                               struct net_device *netdev, u8 use_4addr,
                               enum nl80211_iftype iftype)
{
        if (!use_4addr)
                return (netdev && netif_is_bridge_port(netdev)) ? -EBUSY : 0;

        if (iftype == NL80211_IFTYPE_AP_VLAN &&
            (rdev->wiphy.flags & WIPHY_FLAG_4ADDR_AP))
                return 0;

        if (iftype == NL80211_IFTYPE_STATION &&
            (rdev->wiphy.flags & WIPHY_FLAG_4ADDR_STATION))
                return 0;

        return -EOPNOTSUPP;
}

static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct vif_params params;
        int err;
        enum nl80211_iftype otype, ntype;
        struct net_device *dev = info->user_ptr[1];
        bool change = false;

        memset(&params, 0, sizeof(params));

        otype = ntype = dev->ieee80211_ptr->iftype;

        if (info->attrs[NL80211_ATTR_IFTYPE]) {
                ntype = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]);
                if (otype != ntype)
                        change = true;
        }

        if (info->attrs[NL80211_ATTR_MESH_ID]) {
                struct wireless_dev *wdev = dev->ieee80211_ptr;

                if (ntype != NL80211_IFTYPE_MESH_POINT)
                        return -EINVAL;
                if (otype != NL80211_IFTYPE_MESH_POINT)
                        return -EINVAL;
                if (netif_running(dev))
                        return -EBUSY;

                wdev->u.mesh.id_up_len =
                        nla_len(info->attrs[NL80211_ATTR_MESH_ID]);
                memcpy(wdev->u.mesh.id,
                       nla_data(info->attrs[NL80211_ATTR_MESH_ID]),
                       wdev->u.mesh.id_up_len);
        }

        if (info->attrs[NL80211_ATTR_4ADDR]) {
                params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]);
                change = true;
                err = nl80211_valid_4addr(rdev, dev, params.use_4addr, ntype);
                if (err)
                        return err;
        } else {
                params.use_4addr = -1;
        }

        err = nl80211_parse_mon_options(rdev, ntype, info, &params);
        if (err < 0)
                return err;
        if (err > 0)
                change = true;

        if (change)
                err = cfg80211_change_iface(rdev, dev, ntype, &params);
        else
                err = 0;

        if (!err && params.use_4addr != -1)
                dev->ieee80211_ptr->use_4addr = params.use_4addr;

        if (change && !err) {
                struct wireless_dev *wdev = dev->ieee80211_ptr;

                nl80211_notify_iface(rdev, wdev, NL80211_CMD_SET_INTERFACE);
        }

        return err;
}

static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct vif_params params;
        struct wireless_dev *wdev;
        struct sk_buff *msg;
        int err;
        enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED;

        memset(&params, 0, sizeof(params));

        if (!info->attrs[NL80211_ATTR_IFNAME])
                return -EINVAL;

        if (info->attrs[NL80211_ATTR_IFTYPE])
                type = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]);

        if (!rdev->ops->add_virtual_intf)
                return -EOPNOTSUPP;

        if ((type == NL80211_IFTYPE_P2P_DEVICE || type == NL80211_IFTYPE_NAN ||
             rdev->wiphy.features & NL80211_FEATURE_MAC_ON_CREATE) &&
            info->attrs[NL80211_ATTR_MAC]) {
                nla_memcpy(params.macaddr, info->attrs[NL80211_ATTR_MAC],
                           ETH_ALEN);
                if (!is_valid_ether_addr(params.macaddr))
                        return -EADDRNOTAVAIL;
        }

        if (info->attrs[NL80211_ATTR_4ADDR]) {
                params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]);
                err = nl80211_valid_4addr(rdev, NULL, params.use_4addr, type);
                if (err)
                        return err;
        }

        if (!cfg80211_iftype_allowed(&rdev->wiphy, type, params.use_4addr, 0))
                return -EOPNOTSUPP;

        err = nl80211_parse_mon_options(rdev, type, info, &params);
        if (err < 0)
                return err;

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg)
                return -ENOMEM;

        wdev = rdev_add_virtual_intf(rdev,
                                nla_data(info->attrs[NL80211_ATTR_IFNAME]),
                                NET_NAME_USER, type, &params);
        if (WARN_ON(!wdev)) {
                nlmsg_free(msg);
                return -EPROTO;
        } else if (IS_ERR(wdev)) {
                nlmsg_free(msg);
                return PTR_ERR(wdev);
        }

        if (info->attrs[NL80211_ATTR_SOCKET_OWNER])
                wdev->owner_nlportid = info->snd_portid;

        switch (type) {
        case NL80211_IFTYPE_MESH_POINT:
                if (!info->attrs[NL80211_ATTR_MESH_ID])
                        break;
                wdev->u.mesh.id_up_len =
                        nla_len(info->attrs[NL80211_ATTR_MESH_ID]);
                memcpy(wdev->u.mesh.id,
                       nla_data(info->attrs[NL80211_ATTR_MESH_ID]),
                       wdev->u.mesh.id_up_len);
                break;
        case NL80211_IFTYPE_NAN:
        case NL80211_IFTYPE_P2P_DEVICE:
                /*
                 * P2P Device and NAN do not have a netdev, so don't go
                 * through the netdev notifier and must be added here
                 */
                cfg80211_init_wdev(wdev);
                cfg80211_register_wdev(rdev, wdev);
                break;
        default:
                break;
        }

        if (nl80211_send_iface(msg, info->snd_portid, info->snd_seq, 0,
                               rdev, wdev, NL80211_CMD_NEW_INTERFACE) < 0) {
                nlmsg_free(msg);
                return -ENOBUFS;
        }

        return genlmsg_reply(msg, info);
}

/*
 * Locked wrapper around _nl80211_new_interface(): flushes pending interface
 * removals, then performs the creation with the wiphy lock held.
 *
 * Returns whatever _nl80211_new_interface() returns.
 */
static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        /*
         * Finish any removal that is still pending first, so that it
         * cannot make the creation of the new interface fail.
         */
        cfg80211_destroy_ifaces(rdev);

        wiphy_lock(wiphy);
        err = _nl80211_new_interface(skb, info);
        wiphy_unlock(wiphy);

        return err;
}

/*
 * Remove the virtual interface referenced by info->user_ptr[1].
 *
 * Returns -EOPNOTSUPP when the driver has no del_virtual_intf op, otherwise
 * the result of cfg80211_remove_virtual_intf().
 */
static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wireless_dev *wdev = info->user_ptr[1];
        struct net_device *netdev;

        if (!rdev->ops->del_virtual_intf)
                return -EOPNOTSUPP;

        /*
         * Dropping the wiphy mutex here is safe because RTNL is held:
         * without RTNL the opencount cannot reach 0, so the rdev cannot
         * be deleted underneath us.
         *
         * The mutex has to be released around dev_close() because that
         * invokes the netdev notifiers, which acquire the mutex themselves
         * and cannot tell whether they were reached from here or from
         * somewhere else (e.g. "ip link set ... down").
         */
        mutex_unlock(&rdev->wiphy.mtx);

        netdev = wdev->netdev;
        if (netdev) {
                dev_close(netdev);
        } else {
                /*
                 * A wireless device without a netdev is freed by the
                 * removal below. Clear user_ptr[1] so nl80211_post_doit
                 * does not dereference the freed wdev when checking
                 * whether a dev_put() is needed. With a netdev the
                 * pointer must stay, since that dev_put() is what lets
                 * the netdev really be freed.
                 */
                info->user_ptr[1] = NULL;
        }

        mutex_lock(&rdev->wiphy.mtx);

        return cfg80211_remove_virtual_intf(rdev, wdev);
}

/*
 * Pass the u16 carried in NL80211_ATTR_NOACK_MAP on to the driver.
 *
 * Returns -EINVAL when the attribute is missing, -EOPNOTSUPP when the driver
 * has no set_noack_map op, otherwise the result of rdev_set_noack_map().
 */
static int nl80211_set_noack_map(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct nlattr *map_attr = info->attrs[NL80211_ATTR_NOACK_MAP];

        if (!map_attr)
                return -EINVAL;

        if (!rdev->ops->set_noack_map)
                return -EOPNOTSUPP;

        return rdev_set_noack_map(rdev, dev, nla_get_u16(map_attr));
}

/*
 * Check that @link_id is consistent with the kind of key being handled.
 *
 * @info:     request, used only to attach an extended-ack error message
 * @wdev:     wireless device whose valid_links bitmap is consulted
 * @link_id:  link ID from the request; -1 is treated as "none supplied"
 * @pairwise: true for a pairwise key, false for a group key
 *
 * Rules enforced:
 *  - pairwise keys must not carry a link ID;
 *  - group keys on a device with valid links must carry a link ID whose bit
 *    is set in wdev->valid_links;
 *  - group keys on a device without valid links must not carry a link ID.
 *
 * Returns 0 when the combination is acceptable, -EINVAL otherwise.
 */
static int nl80211_validate_key_link_id(struct genl_info *info,
                                        struct wireless_dev *wdev,
                                        int link_id, bool pairwise)
{
        if (pairwise) {
                if (link_id != -1) {
                        GENL_SET_ERR_MSG(info,
                                         "link ID not allowed for pairwise key");
                        return -EINVAL;
                }

                return 0;
        }

        if (wdev->valid_links) {
                if (link_id == -1) {
                        GENL_SET_ERR_MSG(info,
                                         "link ID is required for MLO group key");
                        return -EINVAL;
                }
                if (!(wdev->valid_links & BIT(link_id))) {
                        GENL_SET_ERR_MSG(info, "invalid link ID for MLO group key");
                        return -EINVAL;
                }
        } else if (link_id != -1) {
                GENL_SET_ERR_MSG(info, "link ID not allowed for non-MLO group key");
                return -EINVAL;
        }

        return 0;
}

/*
 * State shared between nl80211_get_key() and get_key_callback() while the
 * reply to a key query is being assembled.
 */
struct get_key_cookie {
        /* netlink message the callback appends the key attributes to */
        struct sk_buff *msg;
        /* set to 1 by the callback when an attribute could not be added */
        int error;
        /* key index the callback emits as NL80211_KEY_IDX */
        int idx;
};

/*
 * Callback handed to rdev_get_key(): serialises @params into the reply
 * message held in the cookie @c (a struct get_key_cookie).
 *
 * The key data, sequence counter and cipher are written twice: once as
 * top-level NL80211_ATTR_KEY_* attributes and once more inside a nested
 * NL80211_ATTR_KEY container, which additionally carries the key index.
 * Each item is only emitted when the corresponding @params field is set.
 *
 * On any failure cookie->error is set to 1; nothing is returned.
 */
static void get_key_callback(void *c, struct key_params *params)
{
        struct get_key_cookie *cookie = c;
        struct sk_buff *msg = cookie->msg;
        struct nlattr *nest;

        /* top-level attributes */
        if (params->key &&
            nla_put(msg, NL80211_ATTR_KEY_DATA, params->key_len, params->key))
                goto nla_put_failure;

        if (params->seq &&
            nla_put(msg, NL80211_ATTR_KEY_SEQ, params->seq_len, params->seq))
                goto nla_put_failure;

        if (params->cipher &&
            nla_put_u32(msg, NL80211_ATTR_KEY_CIPHER, params->cipher))
                goto nla_put_failure;

        /* the same information again, nested inside NL80211_ATTR_KEY */
        nest = nla_nest_start_noflag(msg, NL80211_ATTR_KEY);
        if (!nest)
                goto nla_put_failure;

        if (params->key &&
            nla_put(msg, NL80211_KEY_DATA, params->key_len, params->key))
                goto nla_put_failure;

        if (params->seq &&
            nla_put(msg, NL80211_KEY_SEQ, params->seq_len, params->seq))
                goto nla_put_failure;

        if (params->cipher &&
            nla_put_u32(msg, NL80211_KEY_CIPHER, params->cipher))
                goto nla_put_failure;

        if (nla_put_u8(msg, NL80211_KEY_IDX, cookie->idx))
                goto nla_put_failure;

        nla_nest_end(msg, nest);

        return;
 nla_put_failure:
        cookie->error = 1;
}

static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        int err;
        struct net_device *dev = info->user_ptr[1];
        u8 key_idx = 0;
        const u8 *mac_addr = NULL;
        bool pairwise;
        struct get_key_cookie cookie = {
                .error = 0,
        };
        void *hdr;
        struct sk_buff *msg;
        bool bigtk_support = false;
        int link_id = nl80211_link_id_or_invalid(info->attrs);
        struct wireless_dev *wdev = dev->ieee80211_ptr;

        if (wiphy_ext_feature_isset(&rdev->wiphy,
                                    NL80211_EXT_FEATURE_BEACON_PROTECTION))
                bigtk_support = true;

        if ((wdev->iftype == NL80211_IFTYPE_STATION ||
             wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) &&
            wiphy_ext_feature_isset(&rdev->wiphy,
                                    NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT))
                bigtk_support = true;

        if (info->attrs[NL80211_ATTR_KEY_IDX]) {
                key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]);

                if (key_idx >= 6 && key_idx <= 7 && !bigtk_support) {
                        GENL_SET_ERR_MSG(info, "BIGTK not supported");
                        return -EINVAL;
                }
        }

        if (info->attrs[NL80211_ATTR_MAC])
                mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);

        pairwise = !!mac_addr;
        if (info->attrs[NL80211_ATTR_KEY_TYPE]) {
                u32 kt = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);

                if (kt != NL80211_KEYTYPE_GROUP &&
                    kt != NL80211_KEYTYPE_PAIRWISE)
                        return -EINVAL;
                pairwise = kt == NL80211_KEYTYPE_PAIRWISE;
        }

        if (!rdev->ops->get_key)
                return -EOPNOTSUPP;

        if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
                return -ENOENT;

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg)
                return -ENOMEM;

        hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
                             NL80211_CMD_NEW_KEY);
        if (!hdr)
                goto nla_put_failure;

        cookie.msg = msg;
        cookie.idx = key_idx;

        if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
            nla_put_u8(msg, NL80211_ATTR_KEY_IDX, key_idx))
                goto nla_put_failure;
        if (mac_addr &&
            nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr))
                goto nla_put_failure;

        err = nl80211_validate_key_link_id(info, wdev, link_id, pairwise);
        if (err)
                goto free_msg;

        err = rdev_get_key(rdev, dev, link_id, key_idx, pairwise, mac_addr,
                           &cookie, get_key_callback);

        if (err)
                goto free_msg;

        if (cookie.error)
                goto nla_put_failure;

        genlmsg_end(msg, hdr);
        return genlmsg_reply(msg, info);

 nla_put_failure:
        err = -ENOBUFS;
 free_msg:
        nlmsg_free(msg);
        return err;
}

/*
 * Change the role of an already installed key.
 *
 * Exactly one of four actions is carried out, tested in this order:
 *  1. key.def       - make it the default key
 *  2. key.defmgmt   - make it the default management key
 *  3. key.defbeacon - make it the default beacon key
 *  4. key.p.mode == NL80211_KEY_SET_TX on a wiphy with
 *     NL80211_EXT_FEATURE_EXT_KEY_ID - Extended Key ID "set TX" action,
 *     forwarded through rdev_add_key()
 *
 * Returns 0 or the driver's result on success paths, -EINVAL for malformed
 * requests, -EOPNOTSUPP when the required driver op is missing, or the error
 * from nl80211_parse_key() / nl80211_key_allowed() /
 * nl80211_validate_key_link_id().
 */
static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        int link_id = nl80211_link_id_or_invalid(info->attrs);
        struct key_parse key;
        u8 *mac_addr = NULL;
        int err;

        err = nl80211_parse_key(info, &key);
        if (err)
                return err;

        if (key.idx < 0)
                return -EINVAL;

        /* Only support setting default key and
         * Extended Key ID action NL80211_KEY_SET_TX.
         */
        if (!key.def && !key.defmgmt && !key.defbeacon &&
            key.p.mode != NL80211_KEY_SET_TX)
                return -EINVAL;

        if (key.def) {
                if (!rdev->ops->set_default_key)
                        return -EOPNOTSUPP;

                err = nl80211_key_allowed(wdev);
                if (!err)
                        err = nl80211_validate_key_link_id(info, wdev,
                                                           link_id, false);
                if (!err)
                        err = rdev_set_default_key(rdev, dev, link_id,
                                                   key.idx, key.def_uni,
                                                   key.def_multi);
                if (err)
                        return err;

#ifdef CONFIG_CFG80211_WEXT
                wdev->wext.default_key = key.idx;
#endif
                return 0;
        }

        if (key.defmgmt) {
                if (key.def_uni || !key.def_multi)
                        return -EINVAL;

                if (!rdev->ops->set_default_mgmt_key)
                        return -EOPNOTSUPP;

                err = nl80211_key_allowed(wdev);
                if (!err)
                        err = nl80211_validate_key_link_id(info, wdev,
                                                           link_id, false);
                if (!err)
                        err = rdev_set_default_mgmt_key(rdev, dev, link_id,
                                                        key.idx);
                if (err)
                        return err;

#ifdef CONFIG_CFG80211_WEXT
                wdev->wext.default_mgmt_key = key.idx;
#endif
                return 0;
        }

        if (key.defbeacon) {
                if (key.def_uni || !key.def_multi)
                        return -EINVAL;

                if (!rdev->ops->set_default_beacon_key)
                        return -EOPNOTSUPP;

                err = nl80211_key_allowed(wdev);
                if (err)
                        return err;

                err = nl80211_validate_key_link_id(info, wdev, link_id, false);
                if (err)
                        return err;

                return rdev_set_default_beacon_key(rdev, dev, link_id, key.idx);
        }

        /* what remains is the Extended Key ID "set TX" action */
        if (key.p.mode != NL80211_KEY_SET_TX ||
            !wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_EXT_KEY_ID))
                return -EINVAL;

        if (info->attrs[NL80211_ATTR_MAC])
                mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);

        /* a peer address is mandatory and only key indices 0 and 1 are valid */
        if (!mac_addr || key.idx < 0 || key.idx > 1)
                return -EINVAL;

        err = nl80211_validate_key_link_id(info, wdev, link_id, true);
        if (err)
                return err;

        return rdev_add_key(rdev, dev, link_id, key.idx,
                            NL80211_KEYTYPE_PAIRWISE,
                            mac_addr, &key.p);
}

static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        int err;
        struct net_device *dev = info->user_ptr[1];
        struct key_parse key;
        const u8 *mac_addr = NULL;
        int link_id = nl80211_link_id_or_invalid(info->attrs);
        struct wireless_dev *wdev = dev->ieee80211_ptr;

        err = nl80211_parse_key(info, &key);
        if (err)
                return err;

        if (!key.p.key) {
                GENL_SET_ERR_MSG(info, "no key");
                return -EINVAL;
        }

        if (info->attrs[NL80211_ATTR_MAC])
                mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);

        if (key.type == -1) {
                if (mac_addr)
                        key.type = NL80211_KEYTYPE_PAIRWISE;
                else
                        key.type = NL80211_KEYTYPE_GROUP;
        }

        /* for now */
        if (key.type != NL80211_KEYTYPE_PAIRWISE &&
            key.type != NL80211_KEYTYPE_GROUP) {
                GENL_SET_ERR_MSG(info, "key type not pairwise or group");
                return -EINVAL;
        }

        if (key.type == NL80211_KEYTYPE_GROUP &&
            info->attrs[NL80211_ATTR_VLAN_ID])
                key.p.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]);

        if (!rdev->ops->add_key)
                return -EOPNOTSUPP;

        if (cfg80211_validate_key_settings(rdev, &key.p, key.idx,
                                           key.type == NL80211_KEYTYPE_PAIRWISE,
                                           mac_addr)) {
                GENL_SET_ERR_MSG(info, "key setting validation failed");
                return -EINVAL;
        }

        err = nl80211_key_allowed(wdev);
        if (err)
                GENL_SET_ERR_MSG(info, "key not allowed");

        if (!err)
                err = nl80211_validate_key_link_id(info, wdev, link_id,
                                key.type == NL80211_KEYTYPE_PAIRWISE);

        if (!err) {
                err = rdev_add_key(rdev, dev, link_id, key.idx,
                                   key.type == NL80211_KEYTYPE_PAIRWISE,
                                    mac_addr, &key.p);
                if (err)
                        GENL_SET_ERR_MSG(info, "key addition failed");
        }

        return err;
}

static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        int err;
        struct net_device *dev = info->user_ptr[1];
        u8 *mac_addr = NULL;
        struct key_parse key;
        int link_id = nl80211_link_id_or_invalid(info->attrs);
        struct wireless_dev *wdev = dev->ieee80211_ptr;

        err = nl80211_parse_key(info, &key);
        if (err)
                return err;

        if (info->attrs[NL80211_ATTR_MAC])
                mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);

        if (key.type == -1) {
                if (mac_addr)
                        key.type = NL80211_KEYTYPE_PAIRWISE;
                else
                        key.type = NL80211_KEYTYPE_GROUP;
        }

        /* for now */
        if (key.type != NL80211_KEYTYPE_PAIRWISE &&
            key.type != NL80211_KEYTYPE_GROUP)
                return -EINVAL;

        if (!cfg80211_valid_key_idx(rdev, key.idx,
                                    key.type == NL80211_KEYTYPE_PAIRWISE))
                return -EINVAL;

        if (!rdev->ops->del_key)
                return -EOPNOTSUPP;

        err = nl80211_key_allowed(wdev);

        if (key.type == NL80211_KEYTYPE_GROUP && mac_addr &&
            !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
                err = -ENOENT;

        if (!err)
                err = nl80211_validate_key_link_id(info, wdev, link_id,
                                key.type == NL80211_KEYTYPE_PAIRWISE);

        if (!err)
                err = rdev_del_key(rdev, dev, link_id, key.idx,
                                   key.type == NL80211_KEYTYPE_PAIRWISE,
                                   mac_addr);

#ifdef CONFIG_CFG80211_WEXT
        if (!err) {
                if (key.idx == wdev->wext.default_key)
                        wdev->wext.default_key = -1;
                else if (key.idx == wdev->wext.default_mgmt_key)
                        wdev->wext.default_mgmt_key = -1;
        }
#endif

        return err;
}

/*
 * Count the attributes nested in @nl_attr, insisting that every one of them
 * has a payload of exactly ETH_ALEN bytes.
 *
 * Returns the number of nested attributes, or -EINVAL as soon as one with a
 * different length is found.
 */
static int validate_acl_mac_addrs(struct nlattr *nl_attr)
{
        struct nlattr *entry;
        int count = 0;
        int rem;

        nla_for_each_nested(entry, nl_attr, rem) {
                if (nla_len(entry) != ETH_ALEN)
                        return -EINVAL;

                count++;
        }

        return count;
}

/*
 * This function parses ACL information and allocates memory for ACL data.
 * On successful return, the calling function is responsible to free the
 * ACL buffer returned by this function.
 */
static struct cfg80211_acl_data *parse_acl_data(struct wiphy *wiphy,
                                                struct genl_info *info)
{
        enum nl80211_acl_policy acl_policy;
        struct nlattr *attr;
        struct cfg80211_acl_data *acl;
        int i = 0, n_entries, tmp;

        if (!wiphy->max_acl_mac_addrs)
                return ERR_PTR(-EOPNOTSUPP);

        if (!info->attrs[NL80211_ATTR_ACL_POLICY])
                return ERR_PTR(-EINVAL);

        acl_policy = nla_get_u32(info->attrs[NL80211_ATTR_ACL_POLICY]);
        if (acl_policy != NL80211_ACL_POLICY_ACCEPT_UNLESS_LISTED &&
            acl_policy != NL80211_ACL_POLICY_DENY_UNLESS_LISTED)
                return ERR_PTR(-EINVAL);

        if (!info->attrs[NL80211_ATTR_MAC_ADDRS])
                return ERR_PTR(-EINVAL);

        n_entries = validate_acl_mac_addrs(info->attrs[NL80211_ATTR_MAC_ADDRS]);
        if (n_entries < 0)
                return ERR_PTR(n_entries);

        if (n_entries > wiphy->max_acl_mac_addrs)
                return ERR_PTR(-EOPNOTSUPP);

        acl = kzalloc(struct_size(acl, mac_addrs, n_entries), GFP_KERNEL);
        if (!acl)
                return ERR_PTR(-ENOMEM);
        acl->n_acl_entries = n_entries;

        nla_for_each_nested(attr, info->attrs[NL80211_ATTR_MAC_ADDRS], tmp) {
                memcpy(acl->mac_addrs[i].addr, nla_data(attr), ETH_ALEN);
                i++;
        }
        acl->acl_policy = acl_policy;

        return acl;
}

static int nl80211_set_mac_acl(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct cfg80211_acl_data *acl;
        int err;

        if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
                return -EOPNOTSUPP;

        if (!dev->ieee80211_ptr->links[0].ap.beacon_interval)
                return -EINVAL;

        acl = parse_acl_data(&rdev->wiphy, info);
        if (IS_ERR(acl))
                return PTR_ERR(acl);

        err = rdev_set_mac_acl(rdev, dev, acl);

        kfree(acl);

        return err;
}

/*
 * Translate a list of rate bytes into a bitmask over @sband->bitrates.
 *
 * @sband:     band whose bitrates[] table is searched
 * @rates:     array of rate bytes; only the low 7 bits of each are used
 * @rates_len: number of entries in @rates
 *
 * Bit N of the result is set when some entry of @rates matches
 * sband->bitrates[N].
 *
 * Returns the mask, or 0 when any entry matches no bitrate of the band.
 * An empty @rates also yields 0, so callers that need to tell the two
 * cases apart have to look at the length themselves.
 */
static u32 rateset_to_mask(struct ieee80211_supported_band *sband,
                           u8 *rates, u8 rates_len)
{
        u8 i;
        u32 mask = 0;

        for (i = 0; i < rates_len; i++) {
                /*
                 * NOTE(review): the "* 5" presumably converts from 500 kb/s
                 * units to the units of ieee80211_rate.bitrate - confirm.
                 */
                int rate = (rates[i] & 0x7f) * 5;
                int ridx;

                for (ridx = 0; ridx < sband->n_bitrates; ridx++) {
                        if (rate == sband->bitrates[ridx].bitrate) {
                                /*
                                 * BIT() shifts an unsigned constant, so
                                 * index 31 does not shift into the sign
                                 * bit of an int as "1 << ridx" would.
                                 */
                                mask |= BIT(ridx);
                                break;
                        }
                }
                if (ridx == sband->n_bitrates)
                        return 0; /* rate not found */
        }

        return mask;
}

/*
 * Convert a list of HT MCS indices into a bitmap.
 *
 * @sband:     band whose ht_cap.mcs.rx_mask defines the available MCS
 * @rates:     array of MCS indices
 * @rates_len: number of entries in @rates
 * @mcs:       output bitmap of IEEE80211_HT_MCS_MASK_LEN bytes; cleared
 *             first, then bit (index % 8) of byte (index / 8) is set for
 *             every requested index
 *
 * Returns true when every index fits the bitmap and is present in the
 * band's rx_mask; false otherwise, in which case @mcs may be partially
 * filled.
 */
static bool ht_rateset_to_mask(struct ieee80211_supported_band *sband,
                               u8 *rates, u8 rates_len,
                               u8 mcs[IEEE80211_HT_MCS_MASK_LEN])
{
        u8 n;

        memset(mcs, 0, IEEE80211_HT_MCS_MASK_LEN);

        for (n = 0; n < rates_len; n++) {
                int byte = rates[n] / 8;
                int bit = BIT(rates[n] % 8);

                /* the index has to fall inside the bitmap */
                if (byte < 0 || byte >= IEEE80211_HT_MCS_MASK_LEN)
                        return false;

                /* clamp the index under speculation before using it */
                byte = array_index_nospec(byte, IEEE80211_HT_MCS_MASK_LEN);

                /* the band has to offer this MCS */
                if (!(sband->ht_cap.mcs.rx_mask[byte] & bit))
                        return false;

                mcs[byte] |= bit;
        }

        return true;
}

/*
 * Map one IEEE80211_VHT_MCS_* support value to a bitmask of MCS indices:
 * SUPPORT_0_7 -> 0x00FF, SUPPORT_0_8 -> 0x01FF, SUPPORT_0_9 -> 0x03FF.
 * NOT_SUPPORTED and any unknown value give 0.
 */
static u16 vht_mcs_map_to_mcs_mask(u8 vht_mcs_map)
{
        switch (vht_mcs_map) {
        case IEEE80211_VHT_MCS_SUPPORT_0_7:
                return 0x00FF;
        case IEEE80211_VHT_MCS_SUPPORT_0_8:
                return 0x01FF;
        case IEEE80211_VHT_MCS_SUPPORT_0_9:
                return 0x03FF;
        case IEEE80211_VHT_MCS_NOT_SUPPORTED:
        default:
                return 0;
        }
}

/*
 * Expand a packed VHT MCS map into one MCS bitmask per entry of
 * @vht_mcs_mask. The map is consumed two bits at a time starting from the
 * least significant bits; entry N is derived from bits 2N and 2N+1 via
 * vht_mcs_map_to_mcs_mask().
 */
static void vht_build_mcs_mask(u16 vht_mcs_map,
                               u16 vht_mcs_mask[NL80211_VHT_NSS_MAX])
{
        u8 idx;

        for (idx = 0; idx < NL80211_VHT_NSS_MAX; idx++) {
                u8 field = (vht_mcs_map >> (2 * idx)) & 0x03;

                vht_mcs_mask[idx] = vht_mcs_map_to_mcs_mask(field);
        }
}

/*
 * Validate the requested VHT MCS masks in @txrate against the band's TX MCS
 * map and copy them to @mcs.
 *
 * Returns false when the band does not support VHT (@mcs is left untouched)
 * or when some entry requests an MCS outside what the band's TX map allows
 * (@mcs is then cleared and only partially filled). Returns true when all
 * NL80211_VHT_NSS_MAX entries were accepted.
 */
static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband,
                             struct nl80211_txrate_vht *txrate,
                             u16 mcs[NL80211_VHT_NSS_MAX])
{
        u16 allowed[NL80211_VHT_NSS_MAX] = {};
        u16 tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map);
        u8 idx;

        if (!sband->vht_cap.vht_supported)
                return false;

        memset(mcs, 0, sizeof(u16) * NL80211_VHT_NSS_MAX);

        /* per-entry masks of what the band is able to transmit */
        vht_build_mcs_mask(tx_mcs_map, allowed);

        for (idx = 0; idx < NL80211_VHT_NSS_MAX; idx++) {
                u16 wanted = txrate->mcs[idx];

                /* the request has to be a subset of the allowed MCS */
                if ((allowed[idx] & wanted) != wanted)
                        return false;

                mcs[idx] = wanted;
        }

        return true;
}

/*
 * Map one IEEE80211_HE_MCS_* support value to a bitmask of MCS indices:
 * SUPPORT_0_7 -> 0x00FF, SUPPORT_0_9 -> 0x03FF, SUPPORT_0_11 -> 0x0FFF.
 * NOT_SUPPORTED and any unknown value give 0.
 */
static u16 he_mcs_map_to_mcs_mask(u8 he_mcs_map)
{
        u16 mcs_mask = 0;

        switch (he_mcs_map) {
        case IEEE80211_HE_MCS_SUPPORT_0_7:
                mcs_mask = 0x00FF;
                break;
        case IEEE80211_HE_MCS_SUPPORT_0_9:
                mcs_mask = 0x03FF;
                break;
        case IEEE80211_HE_MCS_SUPPORT_0_11:
                mcs_mask = 0x0FFF;
                break;
        case IEEE80211_HE_MCS_NOT_SUPPORTED:
        default:
                break;
        }

        return mcs_mask;
}

/*
 * Expand a packed HE MCS map into one MCS bitmask per entry of
 * @he_mcs_mask. The map is consumed two bits at a time starting from the
 * least significant bits; entry N is derived from bits 2N and 2N+1 via
 * he_mcs_map_to_mcs_mask().
 */
static void he_build_mcs_mask(u16 he_mcs_map,
                              u16 he_mcs_mask[NL80211_HE_NSS_MAX])
{
        u8 idx;

        for (idx = 0; idx < NL80211_HE_NSS_MAX; idx++) {
                u8 field = (he_mcs_map >> (2 * idx)) & 0x03;

                he_mcs_mask[idx] = he_mcs_map_to_mcs_mask(field);
        }
}

static u16 he_get_txmcsmap(struct genl_info *info, unsigned int link_id,
                           const struct ieee80211_sta_he_cap *he_cap)
{
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_chan_def *chandef;
        __le16 tx_mcs;

        chandef = wdev_chandef(wdev, link_id);
        if (!chandef) {
                /*
                 * This is probably broken, but we never maintained
                 * a chandef in these cases, so it always was.
                 */
                return le16_to_cpu(he_cap->he_mcs_nss_supp.tx_mcs_80);
        }

        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_80P80:
                tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80p80;
                break;
        case NL80211_CHAN_WIDTH_160:
                tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_160;
                break;
        default:
                tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80;
                break;
        }

        return le16_to_cpu(tx_mcs);
}

/*
 * Validate the requested HE MCS masks in @txrate against the HE TX MCS map
 * applicable to @wdev / @link_id and copy them to @mcs.
 *
 * Returns false when the band has no HE capability for the interface type
 * (@mcs is left untouched) or when some entry requests an MCS outside what
 * the TX map allows (@mcs is then cleared and only partially filled).
 * Returns true when all NL80211_HE_NSS_MAX entries were accepted.
 */
static bool he_set_mcs_mask(struct genl_info *info,
                            struct wireless_dev *wdev,
                            struct ieee80211_supported_band *sband,
                            struct nl80211_txrate_he *txrate,
                            u16 mcs[NL80211_HE_NSS_MAX],
                            unsigned int link_id)
{
        const struct ieee80211_sta_he_cap *he_cap =
                ieee80211_get_he_iftype_cap(sband, wdev->iftype);
        u16 allowed[NL80211_HE_NSS_MAX] = {};
        u8 idx;

        if (!he_cap)
                return false;

        memset(mcs, 0, sizeof(u16) * NL80211_HE_NSS_MAX);

        /* per-entry masks derived from the HE capabilities */
        he_build_mcs_mask(he_get_txmcsmap(info, link_id, he_cap), allowed);

        for (idx = 0; idx < NL80211_HE_NSS_MAX; idx++) {
                u16 wanted = txrate->mcs[idx];

                /* the request has to be a subset of the allowed MCS */
                if ((allowed[idx] & wanted) != wanted)
                        return false;

                mcs[idx] = wanted;
        }

        return true;
}

static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
                                         struct nlattr *attrs[],
                                         enum nl80211_attrs attr,
                                         struct cfg80211_bitrate_mask *mask,
                                         struct net_device *dev,
                                         bool default_all_enabled,
                                         unsigned int link_id)
{
        struct nlattr *tb[NL80211_TXRATE_MAX + 1];
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        int rem, i;
        struct nlattr *tx_rates;
        struct ieee80211_supported_band *sband;
        u16 vht_tx_mcs_map, he_tx_mcs_map;

        memset(mask, 0, sizeof(*mask));
        /* Default to all rates enabled */
        for (i = 0; i < NUM_NL80211_BANDS; i++) {
                const struct ieee80211_sta_he_cap *he_cap;

                if (!default_all_enabled)
                        break;

                sband = rdev->wiphy.bands[i];

                if (!sband)
                        continue;

                mask->control[i].legacy = (1 << sband->n_bitrates) - 1;
                memcpy(mask->control[i].ht_mcs,
                       sband->ht_cap.mcs.rx_mask,
                       sizeof(mask->control[i].ht_mcs));

                if (sband->vht_cap.vht_supported) {
                        vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map);
                        vht_build_mcs_mask(vht_tx_mcs_map, mask->control[i].vht_mcs);
                }

                he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype);
                if (!he_cap)
                        continue;

                he_tx_mcs_map = he_get_txmcsmap(info, link_id, he_cap);
                he_build_mcs_mask(he_tx_mcs_map, mask->control[i].he_mcs);

                mask->control[i].he_gi = 0xFF;
                mask->control[i].he_ltf = 0xFF;
        }

        /* if no rates are given set it back to the defaults */
        if (!attrs[attr])
                goto out;

        /* The nested attribute uses enum nl80211_band as the index. This maps
         * directly to the enum nl80211_band values used in cfg80211.
         */
        BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8);
        nla_for_each_nested(tx_rates, attrs[attr], rem) {
                enum nl80211_band band = nla_type(tx_rates);
                int err;

                if (band < 0 || band >= NUM_NL80211_BANDS)
                        return -EINVAL;
                sband = rdev->wiphy.bands[band];
                if (sband == NULL)
                        return -EINVAL;
                err = nla_parse_nested_deprecated(tb, NL80211_TXRATE_MAX,
                                                  tx_rates,
                                                  nl80211_txattr_policy,
                                                  info->extack);
                if (err)
                        return err;
                if (tb[NL80211_TXRATE_LEGACY]) {
                        mask->control[band].legacy = rateset_to_mask(
                                sband,
                                nla_data(tb[NL80211_TXRATE_LEGACY]),
                                nla_len(tb[NL80211_TXRATE_LEGACY]));
                        if ((mask->control[band].legacy == 0) &&
                            nla_len(tb[NL80211_TXRATE_LEGACY]))
                                return -EINVAL;
                }
                if (tb[NL80211_TXRATE_HT]) {
                        if (!ht_rateset_to_mask(
                                        sband,
                                        nla_data(tb[NL80211_TXRATE_HT]),
                                        nla_len(tb[NL80211_TXRATE_HT]),
                                        mask->control[band].ht_mcs))
                                return -EINVAL;
                }

                if (tb[NL80211_TXRATE_VHT]) {
                        if (!vht_set_mcs_mask(
                                        sband,
                                        nla_data(tb[NL80211_TXRATE_VHT]),
                                        mask->control[band].vht_mcs))
                                return -EINVAL;
                }

                if (tb[NL80211_TXRATE_GI]) {
                        mask->control[band].gi =
                                nla_get_u8(tb[NL80211_TXRATE_GI]);
                        if (mask->control[band].gi > NL80211_TXRATE_FORCE_LGI)
                                return -EINVAL;
                }
                if (tb[NL80211_TXRATE_HE] &&
                    !he_set_mcs_mask(info, wdev, sband,
                                     nla_data(tb[NL80211_TXRATE_HE]),
                                     mask->control[band].he_mcs,
                                     link_id))
                        return -EINVAL;

                if (tb[NL80211_TXRATE_HE_GI])
                        mask->control[band].he_gi =
                                nla_get_u8(tb[NL80211_TXRATE_HE_GI]);
                if (tb[NL80211_TXRATE_HE_LTF])
                        mask->control[band].he_ltf =
                                nla_get_u8(tb[NL80211_TXRATE_HE_LTF]);

                if (mask->control[band].legacy == 0) {
                        /* don't allow empty legacy rates if HT, VHT or HE
                         * are not even supported.
                         */
                        if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported ||
                              rdev->wiphy.bands[band]->vht_cap.vht_supported ||
                              ieee80211_get_he_iftype_cap(sband, wdev->iftype)))
                                return -EINVAL;

                        for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++)
                                if (mask->control[band].ht_mcs[i])
                                        goto out;

                        for (i = 0; i < NL80211_VHT_NSS_MAX; i++)
                                if (mask->control[band].vht_mcs[i])
                                        goto out;

                        for (i = 0; i < NL80211_HE_NSS_MAX; i++)
                                if (mask->control[band].he_mcs[i])
                                        goto out;

                        /* legacy and mcs rates may not be both empty */
                        return -EINVAL;
                }
        }

out:
        return 0;
}

/*
 * Validate a beacon TX rate mask for @band.
 *
 * The mask must select exactly one rate: either a single legacy rate bit or
 * a single MCS bit in exactly one of the HT, VHT or HE masks.  The selected
 * kind of rate must also be advertised by the device through the matching
 * NL80211_EXT_FEATURE_BEACON_RATE_* extended feature.
 *
 * Returns 0 if the mask is acceptable, -EINVAL otherwise.
 */
static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev,
                                   enum nl80211_band band,
                                   struct cfg80211_bitrate_mask *beacon_rate)
{
        u32 count_ht, count_vht, count_he, i;
        u32 rate = beacon_rate->control[band].legacy;

        /* Allow only one rate */
        if (hweight32(rate) > 1)
                return -EINVAL;

        count_ht = 0;
        for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) {
                if (hweight8(beacon_rate->control[band].ht_mcs[i]) > 1) {
                        return -EINVAL;
                } else if (beacon_rate->control[band].ht_mcs[i]) {
                        count_ht++;
                        if (count_ht > 1)
                                return -EINVAL;
                }
                /* a legacy rate and an HT MCS are mutually exclusive */
                if (count_ht && rate)
                        return -EINVAL;
        }

        count_vht = 0;
        for (i = 0; i < NL80211_VHT_NSS_MAX; i++) {
                if (hweight16(beacon_rate->control[band].vht_mcs[i]) > 1) {
                        return -EINVAL;
                } else if (beacon_rate->control[band].vht_mcs[i]) {
                        count_vht++;
                        if (count_vht > 1)
                                return -EINVAL;
                }
                /* a legacy rate and a VHT MCS are mutually exclusive */
                if (count_vht && rate)
                        return -EINVAL;
        }

        count_he = 0;
        for (i = 0; i < NL80211_HE_NSS_MAX; i++) {
                if (hweight16(beacon_rate->control[band].he_mcs[i]) > 1) {
                        return -EINVAL;
                } else if (beacon_rate->control[band].he_mcs[i]) {
                        count_he++;
                        if (count_he > 1)
                                return -EINVAL;
                }
                /* a legacy rate and an HE MCS are mutually exclusive */
                if (count_he && rate)
                        return -EINVAL;
        }

        /*
         * Each counter is 0 or 1 at this point (larger values were rejected
         * in the loops above), so the sum is the number of MCS kinds in use.
         * Reject any combination of two or more kinds, not only the case
         * where all three are set, and reject a completely empty mask.
         */
        if (count_ht + count_vht + count_he > 1 ||
            (!rate && !count_ht && !count_vht && !count_he))
                return -EINVAL;

        if (rate &&
            !wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_BEACON_RATE_LEGACY))
                return -EINVAL;
        if (count_ht &&
            !wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_BEACON_RATE_HT))
                return -EINVAL;
        if (count_vht &&
            !wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_BEACON_RATE_VHT))
                return -EINVAL;
        if (count_he &&
            !wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_BEACON_RATE_HE))
                return -EINVAL;

        return 0;
}

/*
 * Parse a nested MBSSID configuration attribute into @config.
 *
 * @num_elems is the number of MBSSID elements supplied with the request; it
 * is checked against the EMA profile periodicity limit and must be non-zero
 * when the interface has index 0.
 *
 * Returns 0 on success, -EOPNOTSUPP if the wiphy lacks MBSSID (or EMA)
 * support, -EINVAL for malformed or inconsistent attributes.
 *
 * When the transmitting interface is a different netdev, the reference
 * obtained through dev_get_by_index() is still held on successful return.
 * NOTE(review): it is presumably released by the caller - confirm.
 */
static int nl80211_parse_mbssid_config(struct wiphy *wiphy,
                                       struct net_device *dev,
                                       struct nlattr *attrs,
                                       struct cfg80211_mbssid_config *config,
                                       u8 num_elems)
{
        struct nlattr *tb[NL80211_MBSSID_CONFIG_ATTR_MAX + 1];
        struct net_device *tx_netdev;
        u32 tx_ifindex;

        if (!wiphy->mbssid_max_interfaces)
                return -EOPNOTSUPP;

        if (nla_parse_nested(tb, NL80211_MBSSID_CONFIG_ATTR_MAX, attrs, NULL,
                             NULL))
                return -EINVAL;
        if (!tb[NL80211_MBSSID_CONFIG_ATTR_INDEX])
                return -EINVAL;

        config->ema = nla_get_flag(tb[NL80211_MBSSID_CONFIG_ATTR_EMA]);
        if (config->ema && !wiphy->ema_max_profile_periodicity)
                return -EOPNOTSUPP;
        if (config->ema && num_elems > wiphy->ema_max_profile_periodicity)
                return -EINVAL;

        config->index = nla_get_u8(tb[NL80211_MBSSID_CONFIG_ATTR_INDEX]);
        if (config->index >= wiphy->mbssid_max_interfaces)
                return -EINVAL;
        if (!config->index && !num_elems)
                return -EINVAL;

        /* Without an explicit TX ifindex only index 0 is acceptable. */
        if (!tb[NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX]) {
                if (config->index)
                        return -EINVAL;

                config->tx_wdev = dev->ieee80211_ptr;
                return 0;
        }

        tx_ifindex = nla_get_u32(tb[NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX]);

        /* Index 0 must name this device; any other index must not. */
        if (!config->index != (tx_ifindex == dev->ifindex))
                return -EINVAL;

        if (tx_ifindex == dev->ifindex) {
                config->tx_wdev = dev->ieee80211_ptr;
                return 0;
        }

        tx_netdev = dev_get_by_index(wiphy_net(wiphy), tx_ifindex);
        if (!tx_netdev || !tx_netdev->ieee80211_ptr ||
            tx_netdev->ieee80211_ptr->wiphy != wiphy ||
            tx_netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP) {
                dev_put(tx_netdev);
                return -EINVAL;
        }

        config->tx_wdev = tx_netdev->ieee80211_ptr;
        return 0;
}

/*
 * Build a cfg80211_mbssid_elems array describing every attribute nested in
 * @attrs.
 *
 * The entries point directly at the netlink attribute payloads; nothing is
 * copied.  At most 255 elements are accepted because the counter is a u8.
 *
 * Returns a kzalloc()ed structure on success, or ERR_PTR(-EINVAL) /
 * ERR_PTR(-ENOMEM) on failure.
 */
static struct cfg80211_mbssid_elems *
nl80211_parse_mbssid_elems(struct wiphy *wiphy, struct nlattr *attrs)
{
        struct cfg80211_mbssid_elems *elems;
        struct nlattr *nl_elems;
        u8 count = 0, idx = 0;
        int rem_elems;

        if (!wiphy->mbssid_max_interfaces)
                return ERR_PTR(-EINVAL);

        /* First pass: count the elements, refusing to overflow the u8. */
        nla_for_each_nested(nl_elems, attrs, rem_elems) {
                if (count == 255)
                        return ERR_PTR(-EINVAL);
                count++;
        }

        elems = kzalloc(struct_size(elems, elem, count), GFP_KERNEL);
        if (!elems)
                return ERR_PTR(-ENOMEM);
        elems->cnt = count;

        /* Second pass: record payload pointer and length of each element. */
        nla_for_each_nested(nl_elems, attrs, rem_elems) {
                elems->elem[idx].len = nla_len(nl_elems);
                elems->elem[idx].data = nla_data(nl_elems);
                idx++;
        }

        return elems;
}

/*
 * Build a cfg80211_rnr_elems array describing every attribute nested in
 * @attrs.
 *
 * Each nested attribute is checked with validate_ie_attr() first.  The
 * entries point directly at the netlink attribute payloads; nothing is
 * copied.
 *
 * The element counter and the fill index are both u8, so the number of
 * elements is capped at 255.  Without the cap, a request carrying 256 or
 * more nested attributes would wrap the counter, under-allocate the array
 * and let the fill loop write past its end.
 *
 * Returns a kzalloc()ed structure on success, or an ERR_PTR() carrying the
 * validation error, -EINVAL (too many elements) or -ENOMEM.
 */
static struct cfg80211_rnr_elems *
nl80211_parse_rnr_elems(struct wiphy *wiphy, struct nlattr *attrs,
                        struct netlink_ext_ack *extack)
{
        struct nlattr *nl_elems;
        struct cfg80211_rnr_elems *elems;
        int rem_elems;
        u8 i = 0, num_elems = 0;

        nla_for_each_nested(nl_elems, attrs, rem_elems) {
                int ret;

                /* keep num_elems (u8) from wrapping around */
                if (num_elems >= 255)
                        return ERR_PTR(-EINVAL);

                ret = validate_ie_attr(nl_elems, extack);
                if (ret)
                        return ERR_PTR(ret);

                num_elems++;
        }

        elems = kzalloc(struct_size(elems, elem, num_elems), GFP_KERNEL);
        if (!elems)
                return ERR_PTR(-ENOMEM);
        elems->cnt = num_elems;

        nla_for_each_nested(nl_elems, attrs, rem_elems) {
                elems->elem[i].data = nla_data(nl_elems);
                elems->elem[i].len = nla_len(nl_elems);
                i++;
        }
        return elems;
}

/*
 * Parse a nested HE BSS color attribute into @he_bss_color.
 *
 * The COLOR attribute is mandatory.  The color is reported as enabled unless
 * the DISABLED flag is present; the PARTIAL flag is copied as-is.
 *
 * Returns 0 on success, the nla_parse_nested() error, or -EINVAL when the
 * color attribute is missing.
 */
static int nl80211_parse_he_bss_color(struct nlattr *attrs,
                                      struct cfg80211_he_bss_color *he_bss_color)
{
        struct nlattr *tb[NL80211_HE_BSS_COLOR_ATTR_MAX + 1];
        int err = nla_parse_nested(tb, NL80211_HE_BSS_COLOR_ATTR_MAX, attrs,
                                   he_bss_color_policy, NULL);

        if (err)
                return err;

        if (!tb[NL80211_HE_BSS_COLOR_ATTR_COLOR])
                return -EINVAL;

        he_bss_color->partial =
                nla_get_flag(tb[NL80211_HE_BSS_COLOR_ATTR_PARTIAL]);
        he_bss_color->enabled =
                !nla_get_flag(tb[NL80211_HE_BSS_COLOR_ATTR_DISABLED]);
        he_bss_color->color =
                nla_get_u8(tb[NL80211_HE_BSS_COLOR_ATTR_COLOR]);

        return 0;
}

static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev,
                                struct nlattr *attrs[],
                                struct cfg80211_beacon_data *bcn,
                                struct netlink_ext_ack *extack)
{
        bool haveinfo = false;
        int err;

        memset(bcn, 0, sizeof(*bcn));

        bcn->link_id = nl80211_link_id(attrs);

        if (attrs[NL80211_ATTR_BEACON_HEAD]) {
                bcn->head = nla_data(attrs[NL80211_ATTR_BEACON_HEAD]);
                bcn->head_len = nla_len(attrs[NL80211_ATTR_BEACON_HEAD]);
                if (!bcn->head_len)
                        return -EINVAL;
                haveinfo = true;
        }

        if (attrs[NL80211_ATTR_BEACON_TAIL]) {
                bcn->tail = nla_data(attrs[NL80211_ATTR_BEACON_TAIL]);
                bcn->tail_len = nla_len(attrs[NL80211_ATTR_BEACON_TAIL]);
                haveinfo = true;
        }

        if (!haveinfo)
                return -EINVAL;

        if (attrs[NL80211_ATTR_IE]) {
                bcn->beacon_ies = nla_data(attrs[NL80211_ATTR_IE]);
                bcn->beacon_ies_len = nla_len(attrs[NL80211_ATTR_IE]);
        }

        if (attrs[NL80211_ATTR_IE_PROBE_RESP]) {
                bcn->proberesp_ies =
                        nla_data(attrs[NL80211_ATTR_IE_PROBE_RESP]);
                bcn->proberesp_ies_len =
                        nla_len(attrs[NL80211_ATTR_IE_PROBE_RESP]);
        }

        if (attrs[NL80211_ATTR_IE_ASSOC_RESP]) {
                bcn->assocresp_ies =
                        nla_data(attrs[NL80211_ATTR_IE_ASSOC_RESP]);
                bcn->assocresp_ies_len =
                        nla_len(attrs[NL80211_ATTR_IE_ASSOC_RESP]);
        }

        if (attrs[NL80211_ATTR_PROBE_RESP]) {
                bcn->probe_resp = nla_data(attrs[NL80211_ATTR_PROBE_RESP]);
                bcn->probe_resp_len = nla_len(attrs[NL80211_ATTR_PROBE_RESP]);
        }

        if (attrs[NL80211_ATTR_FTM_RESPONDER]) {
                struct nlattr *tb[NL80211_FTM_RESP_ATTR_MAX + 1];

                err = nla_parse_nested_deprecated(tb,
                                                  NL80211_FTM_RESP_ATTR_MAX,
                                                  attrs[NL80211_ATTR_FTM_RESPONDER],
                                                  NULL, NULL);
                if (err)
                        return err;

                if (tb[NL80211_FTM_RESP_ATTR_ENABLED] &&
                    wiphy_ext_feature_isset(&rdev->wiphy,
                                            NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER))
                        bcn->ftm_responder = 1;
                else
                        return -EOPNOTSUPP;

                if (tb[NL80211_FTM_RESP_ATTR_LCI]) {
                        bcn->lci = nla_data(tb[NL80211_FTM_RESP_ATTR_LCI]);
                        bcn->lci_len = nla_len(tb[NL80211_FTM_RESP_ATTR_LCI]);
                }

                if (tb[NL80211_FTM_RESP_ATTR_CIVICLOC]) {
                        bcn->civicloc = nla_data(tb[NL80211_FTM_RESP_ATTR_CIVICLOC]);
                        bcn->civicloc_len = nla_len(tb[NL80211_FTM_RESP_ATTR_CIVICLOC]);
                }
        } else {
                bcn->ftm_responder = -1;
        }

        if (attrs[NL80211_ATTR_HE_BSS_COLOR]) {
                err = nl80211_parse_he_bss_color(attrs[NL80211_ATTR_HE_BSS_COLOR],
                                                 &bcn->he_bss_color);
                if (err)
                        return err;
                bcn->he_bss_color_valid = true;
        }

        if (attrs[NL80211_ATTR_MBSSID_ELEMS]) {
                struct cfg80211_mbssid_elems *mbssid =
                        nl80211_parse_mbssid_elems(&rdev->wiphy,
                                                   attrs[NL80211_ATTR_MBSSID_ELEMS]);

                if (IS_ERR(mbssid))
                        return PTR_ERR(mbssid);

                bcn->mbssid_ies = mbssid;

                if (bcn->mbssid_ies && attrs[NL80211_ATTR_EMA_RNR_ELEMS]) {
                        struct cfg80211_rnr_elems *rnr =
                                nl80211_parse_rnr_elems(&rdev->wiphy,
                                                        attrs[NL80211_ATTR_EMA_RNR_ELEMS],
                                                        extack);

                        if (IS_ERR(rnr))
                                return PTR_ERR(rnr);

                        if (rnr && rnr->cnt < bcn->mbssid_ies->cnt)
                                return -EINVAL;

                        bcn->rnr_ies = rnr;
                }
        }

        return 0;
}

/*
 * Parse a nested HE OBSS PD attribute into @he_obss_pd.
 *
 * SR_CTRL is mandatory; the offsets and bitmaps are optional and are only
 * written when the corresponding attribute is present.  The bitmap copies
 * are sized by the destination arrays.
 * NOTE(review): the attribute lengths are presumably enforced by
 * he_obss_pd_policy - confirm.
 *
 * Returns 0 on success (and marks the configuration enabled), the
 * nla_parse_nested() error, or -EINVAL when SR_CTRL is missing or
 * min_offset exceeds max_offset.
 */
static int nl80211_parse_he_obss_pd(struct nlattr *attrs,
                                    struct ieee80211_he_obss_pd *he_obss_pd)
{
        struct nlattr *tb[NL80211_HE_OBSS_PD_ATTR_MAX + 1];
        struct nlattr *attr;
        int err;

        err = nla_parse_nested(tb, NL80211_HE_OBSS_PD_ATTR_MAX, attrs,
                               he_obss_pd_policy, NULL);
        if (err)
                return err;

        attr = tb[NL80211_HE_OBSS_PD_ATTR_SR_CTRL];
        if (!attr)
                return -EINVAL;
        he_obss_pd->sr_ctrl = nla_get_u8(attr);

        attr = tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET];
        if (attr)
                he_obss_pd->min_offset = nla_get_u8(attr);

        attr = tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET];
        if (attr)
                he_obss_pd->max_offset = nla_get_u8(attr);

        attr = tb[NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET];
        if (attr)
                he_obss_pd->non_srg_max_offset = nla_get_u8(attr);

        if (he_obss_pd->min_offset > he_obss_pd->max_offset)
                return -EINVAL;

        attr = tb[NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP];
        if (attr)
                memcpy(he_obss_pd->bss_color_bitmap, nla_data(attr),
                       sizeof(he_obss_pd->bss_color_bitmap));

        attr = tb[NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP];
        if (attr)
                memcpy(he_obss_pd->partial_bssid_bitmap, nla_data(attr),
                       sizeof(he_obss_pd->partial_bssid_bitmap));

        he_obss_pd->enable = true;

        return 0;
}

/*
 * Parse a nested FILS discovery attribute into @fd.
 *
 * The three nested attributes (INT_MIN, INT_MAX, TMPL) must be supplied
 * either all together or not at all.  In both accepted cases fd->update is
 * set; with no attributes the remaining fields of @fd are left untouched.
 * The template pointer references the netlink payload, nothing is copied.
 *
 * Returns 0 on success, -EINVAL if the device lacks the FILS discovery
 * feature or only some of the attributes are present, or the
 * nla_parse_nested() error.
 */
static int nl80211_parse_fils_discovery(struct cfg80211_registered_device *rdev,
                                        struct nlattr *attrs,
                                        struct cfg80211_fils_discovery *fd)
{
        struct nlattr *tb[NL80211_FILS_DISCOVERY_ATTR_MAX + 1];
        struct nlattr *int_min, *int_max, *tmpl;
        int ret;

        if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_FILS_DISCOVERY))
                return -EINVAL;

        ret = nla_parse_nested(tb, NL80211_FILS_DISCOVERY_ATTR_MAX, attrs,
                               NULL, NULL);
        if (ret)
                return ret;

        int_min = tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN];
        int_max = tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX];
        tmpl = tb[NL80211_FILS_DISCOVERY_ATTR_TMPL];

        if (int_min && int_max && tmpl) {
                fd->min_interval = nla_get_u32(int_min);
                fd->max_interval = nla_get_u32(int_max);
                fd->tmpl = nla_data(tmpl);
                fd->tmpl_len = nla_len(tmpl);
        } else if (int_min || int_max || tmpl) {
                /* a partial set of attributes is not acceptable */
                return -EINVAL;
        }

        fd->update = true;
        return 0;
}

/*
 * Parse a nested unsolicited broadcast probe response attribute into @presp.
 *
 * The INT and TMPL attributes must be supplied either both or not at all.
 * In both accepted cases presp->update is set; with no attributes the
 * remaining fields of @presp are left untouched.  The template pointer
 * references the netlink payload, nothing is copied.
 *
 * Returns 0 on success, -EINVAL if the device lacks the feature or only one
 * of the attributes is present, or the nla_parse_nested() error.
 */
static int
nl80211_parse_unsol_bcast_probe_resp(struct cfg80211_registered_device *rdev,
                                     struct nlattr *attrs,
                                     struct cfg80211_unsol_bcast_probe_resp *presp)
{
        struct nlattr *tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX + 1];
        struct nlattr *interval, *tmpl;
        int ret;

        if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP))
                return -EINVAL;

        ret = nla_parse_nested(tb, NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX,
                               attrs, NULL, NULL);
        if (ret)
                return ret;

        interval = tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT];
        tmpl = tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL];

        if (interval && tmpl) {
                presp->interval = nla_get_u32(interval);
                presp->tmpl_len = nla_len(tmpl);
                presp->tmpl = nla_data(tmpl);
        } else if (interval || tmpl) {
                /* only one of the two attributes was given */
                return -EINVAL;
        }

        presp->update = true;
        return 0;
}

static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params,
                                            const struct element *rates)
{
        int i;

        if (!rates)
                return;

        for (i = 0; i < rates->datalen; i++) {
                if (rates->data[i] == BSS_MEMBERSHIP_SELECTOR_HT_PHY)
                        params->ht_required = true;
                if (rates->data[i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY)
                        params->vht_required = true;
                if (rates->data[i] == BSS_MEMBERSHIP_SELECTOR_HE_PHY)
                        params->he_required = true;
                if (rates->data[i] == BSS_MEMBERSHIP_SELECTOR_SAE_H2E)
                        params->sae_h2e_required = true;
        }
}

/*
 * The nl80211 API did not originally carry attributes describing the
 * HT/VHT (and later HE/EHT) requirements and capabilities of an AP, so they
 * are extracted from the elements in the beacon tail for the benefit of
 * drivers that rebuild IEs in the firmware.
 *
 * The pointers stored in @params reference the beacon tail buffer directly.
 * HT/VHT/HE elements that are too short are silently ignored; an EHT
 * capability or operation element that is empty or fails its size check
 * makes the function return -EINVAL.  Returns 0 otherwise.
 */
static int nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
{
        const u8 *tail = params->beacon.tail;
        size_t tail_len = params->beacon.tail_len;
        const struct element *elem;

        /* membership selectors may live in either rates element */
        elem = cfg80211_find_elem(WLAN_EID_SUPP_RATES, tail, tail_len);
        nl80211_check_ap_rate_selectors(params, elem);

        elem = cfg80211_find_elem(WLAN_EID_EXT_SUPP_RATES, tail, tail_len);
        nl80211_check_ap_rate_selectors(params, elem);

        elem = cfg80211_find_elem(WLAN_EID_HT_CAPABILITY, tail, tail_len);
        if (elem && elem->datalen >= sizeof(*params->ht_cap))
                params->ht_cap = (void *)elem->data;

        elem = cfg80211_find_elem(WLAN_EID_VHT_CAPABILITY, tail, tail_len);
        if (elem && elem->datalen >= sizeof(*params->vht_cap))
                params->vht_cap = (void *)elem->data;

        /*
         * For the extension elements below the first data byte is skipped
         * (presumably the extension element ID), hence the "+ 1" / "- 1".
         */
        elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY, tail,
                                      tail_len);
        if (elem && elem->datalen >= sizeof(*params->he_cap) + 1)
                params->he_cap = (void *)(elem->data + 1);

        elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, tail,
                                      tail_len);
        if (elem && elem->datalen >= sizeof(*params->he_oper) + 1)
                params->he_oper = (void *)(elem->data + 1);

        elem = cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_CAPABILITY, tail,
                                      tail_len);
        if (elem) {
                if (elem->datalen == 0)
                        return -EINVAL;

                params->eht_cap = (void *)(elem->data + 1);
                /* the size check uses the HE capabilities found above */
                if (!ieee80211_eht_capa_size_ok((const u8 *)params->he_cap,
                                                (const u8 *)params->eht_cap,
                                                elem->datalen - 1, true))
                        return -EINVAL;
        }

        elem = cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_OPERATION, tail,
                                      tail_len);
        if (elem) {
                if (elem->datalen == 0)
                        return -EINVAL;

                params->eht_oper = (void *)(elem->data + 1);
                if (!ieee80211_eht_oper_size_ok((const u8 *)params->eht_oper,
                                                elem->datalen - 1))
                        return -EINVAL;
        }

        return 0;
}

/*
 * Look through the wiphy's wireless devices for the first AP or P2P-GO
 * interface that has a preset channel definition and copy that definition
 * into params->chandef.
 *
 * Returns true if a channel definition was found and copied, false otherwise.
 */
static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev,
                                   struct cfg80211_ap_settings *params)
{
        struct wireless_dev *wdev;

        list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
                bool ap_like = wdev->iftype == NL80211_IFTYPE_AP ||
                               wdev->iftype == NL80211_IFTYPE_P2P_GO;

                /* u.ap is only looked at for AP-like interfaces */
                if (ap_like && wdev->u.ap.preset_chandef.chan) {
                        params->chandef = wdev->u.ap.preset_chandef;
                        return true;
                }
        }

        return false;
}

static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
                                    enum nl80211_auth_type auth_type,
                                    enum nl80211_commands cmd)
{
        if (auth_type > NL80211_AUTHTYPE_MAX)
                return false;

        switch (cmd) {
        case NL80211_CMD_AUTHENTICATE:
                if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) &&
                    auth_type == NL80211_AUTHTYPE_SAE)
                        return false;
                if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_FILS_STA) &&
                    (auth_type == NL80211_AUTHTYPE_FILS_SK ||
                     auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
                     auth_type == NL80211_AUTHTYPE_FILS_PK))
                        return false;
                return true;
        case NL80211_CMD_CONNECT:
                if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) &&
                    !wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_SAE_OFFLOAD) &&
                    auth_type == NL80211_AUTHTYPE_SAE)
                        return false;

                /* FILS with SK PFS or PK not supported yet */
                if (auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
                    auth_type == NL80211_AUTHTYPE_FILS_PK)
                        return false;
                if (!wiphy_ext_feature_isset(
                            &rdev->wiphy,
                            NL80211_EXT_FEATURE_FILS_SK_OFFLOAD) &&
                    auth_type == NL80211_AUTHTYPE_FILS_SK)
                        return false;
                return true;
        case NL80211_CMD_START_AP:
                if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_SAE_OFFLOAD_AP) &&
                    auth_type == NL80211_AUTHTYPE_SAE)
                        return false;
                /* FILS not supported yet */
                if (auth_type == NL80211_AUTHTYPE_FILS_SK ||
                    auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
                    auth_type == NL80211_AUTHTYPE_FILS_PK)
                        return false;
                return true;
        default:
                return false;
        }
}

/*
 * Multicast an NL80211_CMD_START_AP message for @wdev to the MLME group.
 *
 * The message carries the wiphy index, ifindex and wdev id, plus the SSID
 * when one is recorded and the link ID when the wdev has valid links.
 * Failures (allocation or attribute space) are silent: the message is simply
 * dropped.
 */
static void nl80211_send_ap_started(struct wireless_dev *wdev,
                                    unsigned int link_id)
{
        struct wiphy *wiphy = wdev->wiphy;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        void *hdr;

        if (!msg)
                return;

        hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_START_AP);
        if (!hdr)
                goto free_msg;

        if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_msg;
        if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex))
                goto free_msg;
        if (nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
                              NL80211_ATTR_PAD))
                goto free_msg;

        /* optional attributes */
        if (wdev->u.ap.ssid_len &&
            nla_put(msg, NL80211_ATTR_SSID, wdev->u.ap.ssid_len,
                    wdev->u.ap.ssid))
                goto free_msg;
        if (wdev->valid_links &&
            nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id))
                goto free_msg;

        genlmsg_end(msg, hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(wiphy), msg, 0,
                                NL80211_MCGRP_MLME, GFP_KERNEL);
        return;

free_msg:
        nlmsg_free(msg);
}

static int nl80211_validate_ap_phy_operation(struct cfg80211_ap_settings *params)
{
        struct ieee80211_channel *channel = params->chandef.chan;

        if ((params->he_cap ||  params->he_oper) &&
            (channel->flags & IEEE80211_CHAN_NO_HE))
                return -EOPNOTSUPP;

        if ((params->eht_cap || params->eht_oper) &&
            (channel->flags & IEEE80211_CHAN_NO_EHT))
                return -EOPNOTSUPP;

        return 0;
}

static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        unsigned int link_id = nl80211_link_id(info->attrs);
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_ap_settings *params;
        int err;

        if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
                return -EOPNOTSUPP;

        if (!rdev->ops->start_ap)
                return -EOPNOTSUPP;

        if (wdev->cac_started)
                return -EBUSY;

        if (wdev->links[link_id].ap.beacon_interval)
                return -EALREADY;

        /* these are required for START_AP */
        if (!info->attrs[NL80211_ATTR_BEACON_INTERVAL] ||
            !info->attrs[NL80211_ATTR_DTIM_PERIOD] ||
            !info->attrs[NL80211_ATTR_BEACON_HEAD])
                return -EINVAL;

        params = kzalloc(sizeof(*params), GFP_KERNEL);
        if (!params)
                return -ENOMEM;

        err = nl80211_parse_beacon(rdev, info->attrs, &params->beacon,
                                   info->extack);
        if (err)
                goto out;

        params->beacon_interval =
                nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]);
        params->dtim_period =
                nla_get_u32(info->attrs[NL80211_ATTR_DTIM_PERIOD]);

        err = cfg80211_validate_beacon_int(rdev, dev->ieee80211_ptr->iftype,
                                           params->beacon_interval);
        if (err)
                goto out;

        /*
         * In theory, some of these attributes should be required here
         * but since they were not used when the command was originally
         * added, keep them optional for old user space programs to let
         * them continue to work with drivers that do not need the
         * additional information -- drivers must check!
         */
        if (info->attrs[NL80211_ATTR_SSID]) {
                params->ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
                params->ssid_len =
                        nla_len(info->attrs[NL80211_ATTR_SSID]);
                if (params->ssid_len == 0) {
                        err = -EINVAL;
                        goto out;
                }

                if (wdev->u.ap.ssid_len &&
                    (wdev->u.ap.ssid_len != params->ssid_len ||
                     memcmp(wdev->u.ap.ssid, params->ssid, params->ssid_len))) {
                        /* require identical SSID for MLO */
                        err = -EINVAL;
                        goto out;
                }
        } else if (wdev->valid_links) {
                /* require SSID for MLO */
                err = -EINVAL;
                goto out;
        }

        if (info->attrs[NL80211_ATTR_HIDDEN_SSID])
                params->hidden_ssid = nla_get_u32(
                        info->attrs[NL80211_ATTR_HIDDEN_SSID]);

        params->privacy = !!info->attrs[NL80211_ATTR_PRIVACY];

        if (info->attrs[NL80211_ATTR_AUTH_TYPE]) {
                params->auth_type = nla_get_u32(
                        info->attrs[NL80211_ATTR_AUTH_TYPE]);
                if (!nl80211_valid_auth_type(rdev, params->auth_type,
                                             NL80211_CMD_START_AP)) {
                        err = -EINVAL;
                        goto out;
                }
        } else
                params->auth_type = NL80211_AUTHTYPE_AUTOMATIC;

        err = nl80211_crypto_settings(rdev, info, &params->crypto,
                                      NL80211_MAX_NR_CIPHER_SUITES);
        if (err)
                goto out;

        if (info->attrs[NL80211_ATTR_INACTIVITY_TIMEOUT]) {
                if (!(rdev->wiphy.features & NL80211_FEATURE_INACTIVITY_TIMER)) {
                        err = -EOPNOTSUPP;
                        goto out;
                }
                params->inactivity_timeout = nla_get_u16(
                        info->attrs[NL80211_ATTR_INACTIVITY_TIMEOUT]);
        }

        if (info->attrs[NL80211_ATTR_P2P_CTWINDOW]) {
                if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) {
                        err = -EINVAL;
                        goto out;
                }
                params->p2p_ctwindow =
                        nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]);
                if (params->p2p_ctwindow != 0 &&
                    !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)) {
                        err = -EINVAL;
                        goto out;
                }
        }

        if (info->attrs[NL80211_ATTR_P2P_OPPPS]) {
                u8 tmp;

                if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) {
                        err = -EINVAL;
                        goto out;
                }
                tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]);
                params->p2p_opp_ps = tmp;
                if (params->p2p_opp_ps != 0 &&
                    !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)) {
                        err = -EINVAL;
                        goto out;
                }
        }

        if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) {
                err = nl80211_parse_chandef(rdev, info, &params->chandef);
                if (err)
                        goto out;
        } else if (wdev->valid_links) {
                /* with MLD need to specify the channel configuration */
                err = -EINVAL;
                goto out;
        } else if (wdev->u.ap.preset_chandef.chan) {
                params->chandef = wdev->u.ap.preset_chandef;
        } else if (!nl80211_get_ap_channel(rdev, params)) {
                err = -EINVAL;
                goto out;
        }

        if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &params->chandef,
                                           wdev->iftype)) {
                err = -EINVAL;
                goto out;
        }

        if (info->attrs[NL80211_ATTR_TX_RATES]) {
                err = nl80211_parse_tx_bitrate_mask(info, info->attrs,
                                                    NL80211_ATTR_TX_RATES,
                                                    &params->beacon_rate,
                                                    dev, false, link_id);
                if (err)
                        goto out;

                err = validate_beacon_tx_rate(rdev, params->chandef.chan->band,
                                              &params->beacon_rate);
                if (err)
                        goto out;
        }

        if (info->attrs[NL80211_ATTR_SMPS_MODE]) {
                params->smps_mode =
                        nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]);
                switch (params->smps_mode) {
                case NL80211_SMPS_OFF:
                        break;
                case NL80211_SMPS_STATIC:
                        if (!(rdev->wiphy.features &
                              NL80211_FEATURE_STATIC_SMPS)) {
                                err = -EINVAL;
                                goto out;
                        }
                        break;
                case NL80211_SMPS_DYNAMIC:
                        if (!(rdev->wiphy.features &
                              NL80211_FEATURE_DYNAMIC_SMPS)) {
                                err = -EINVAL;
                                goto out;
                        }
                        break;
                default:
                        err = -EINVAL;
                        goto out;
                }
        } else {
                params->smps_mode = NL80211_SMPS_OFF;
        }

        params->pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
        if (params->pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) {
                err = -EOPNOTSUPP;
                goto out;
        }

        if (info->attrs[NL80211_ATTR_ACL_POLICY]) {
                params->acl = parse_acl_data(&rdev->wiphy, info);
                if (IS_ERR(params->acl)) {
                        err = PTR_ERR(params->acl);
                        params->acl = NULL;
                        goto out;
                }
        }

        params->twt_responder =
                    nla_get_flag(info->attrs[NL80211_ATTR_TWT_RESPONDER]);

        if (info->attrs[NL80211_ATTR_HE_OBSS_PD]) {
                err = nl80211_parse_he_obss_pd(
                                        info->attrs[NL80211_ATTR_HE_OBSS_PD],
                                        &params->he_obss_pd);
                if (err)
                        goto out;
        }

        if (info->attrs[NL80211_ATTR_FILS_DISCOVERY]) {
                err = nl80211_parse_fils_discovery(rdev,
                                                   info->attrs[NL80211_ATTR_FILS_DISCOVERY],
                                                   &params->fils_discovery);
                if (err)
                        goto out;
        }

        if (info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) {
                err = nl80211_parse_unsol_bcast_probe_resp(
                        rdev, info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP],
                        &params->unsol_bcast_probe_resp);
                if (err)
                        goto out;
        }

        if (info->attrs[NL80211_ATTR_MBSSID_CONFIG]) {
                err = nl80211_parse_mbssid_config(&rdev->wiphy, dev,
                                                  info->attrs[NL80211_ATTR_MBSSID_CONFIG],
                                                  &params->mbssid_config,
                                                  params->beacon.mbssid_ies ?
                                                        params->beacon.mbssid_ies->cnt :
                                                        0);
                if (err)
                        goto out;
        }

        if (!params->mbssid_config.ema && params->beacon.rnr_ies) {
                err = -EINVAL;
                goto out;
        }

        err = nl80211_calculate_ap_params(params);
        if (err)
                goto out;

        err = nl80211_validate_ap_phy_operation(params);
        if (err)
                goto out;

        if (info->attrs[NL80211_ATTR_AP_SETTINGS_FLAGS])
                params->flags = nla_get_u32(
                        info->attrs[NL80211_ATTR_AP_SETTINGS_FLAGS]);
        else if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT])
                params->flags |= NL80211_AP_SETTINGS_EXTERNAL_AUTH_SUPPORT;

        if (wdev->conn_owner_nlportid &&
            info->attrs[NL80211_ATTR_SOCKET_OWNER] &&
            wdev->conn_owner_nlportid != info->snd_portid) {
                err = -EINVAL;
                goto out;
        }

        /* FIXME: validate MLO/link-id against driver capabilities */

        err = rdev_start_ap(rdev, dev, params);
        if (!err) {
                wdev->links[link_id].ap.beacon_interval = params->beacon_interval;
                wdev->links[link_id].ap.chandef = params->chandef;
                wdev->u.ap.ssid_len = params->ssid_len;
                memcpy(wdev->u.ap.ssid, params->ssid,
                       params->ssid_len);

                if (info->attrs[NL80211_ATTR_SOCKET_OWNER])
                        wdev->conn_owner_nlportid = info->snd_portid;

                nl80211_send_ap_started(wdev, link_id);
        }
out:
        kfree(params->acl);
        kfree(params->beacon.mbssid_ies);
        if (params->mbssid_config.tx_wdev &&
            params->mbssid_config.tx_wdev->netdev &&
            params->mbssid_config.tx_wdev->netdev != dev)
                dev_put(params->mbssid_config.tx_wdev->netdev);
        kfree(params->beacon.rnr_ies);
        kfree(params);

        return err;
}

/*
 * Netlink handler: update the beacon data of an AP / P2P-GO link.
 *
 * The request is refused with -EOPNOTSUPP when the interface is neither an
 * AP nor a P2P GO or when the driver has no change_beacon op, and with
 * -EINVAL when the addressed link has no beacon interval recorded.
 *
 * The new beacon plus the optional FILS discovery and unsolicited broadcast
 * probe response settings are parsed into a temporary cfg80211_ap_update,
 * which is handed to the driver and then freed on every path.
 *
 * Returns 0 on success or a negative errno.
 */
static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        unsigned int link_id = nl80211_link_id(info->attrs);
        struct nlattr **tb = info->attrs;
        struct cfg80211_ap_update *update;
        int ret;

        switch (wdev->iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_P2P_GO:
                break;
        default:
                return -EOPNOTSUPP;
        }

        if (!rdev->ops->change_beacon)
                return -EOPNOTSUPP;

        /* a zero beacon interval means nothing to update on this link */
        if (!wdev->links[link_id].ap.beacon_interval)
                return -EINVAL;

        update = kzalloc(sizeof(*update), GFP_KERNEL);
        if (!update)
                return -ENOMEM;

        ret = nl80211_parse_beacon(rdev, tb, &update->beacon, info->extack);
        if (ret)
                goto free_update;

        if (tb[NL80211_ATTR_FILS_DISCOVERY]) {
                ret = nl80211_parse_fils_discovery(rdev,
                                                   tb[NL80211_ATTR_FILS_DISCOVERY],
                                                   &update->fils_discovery);
                if (ret)
                        goto free_update;
        }

        if (tb[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) {
                ret = nl80211_parse_unsol_bcast_probe_resp(rdev,
                                                           tb[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP],
                                                           &update->unsol_bcast_probe_resp);
                if (ret)
                        goto free_update;
        }

        ret = rdev_change_beacon(rdev, dev, update);

free_update:
        /* kfree(NULL) is a no-op, so this is safe after a failed parse */
        kfree(update->beacon.mbssid_ies);
        kfree(update->beacon.rnr_ies);
        kfree(update);
        return ret;
}

/*
 * Netlink handler: stop the AP on the link selected by the request
 * attributes. All work is delegated to cfg80211_stop_ap(), whose return
 * value is passed straight back to the caller.
 */
static int nl80211_stop_ap(struct sk_buff *skb, struct genl_info *info)
{
        struct net_device *netdev = info->user_ptr[1];

        return cfg80211_stop_ap(info->user_ptr[0], netdev,
                                nl80211_link_id(info->attrs), false);
}

/*
 * Netlink policy for the nested legacy station-flags attribute
 * (NL80211_ATTR_STA_FLAGS) as parsed by parse_station_flags(). Every
 * listed flag is a presence-only NLA_FLAG; entries that are not listed
 * are left zero-initialized.
 */
static const struct nla_policy sta_flags_policy[NL80211_STA_FLAG_MAX + 1] = {
        [NL80211_STA_FLAG_AUTHORIZED] = { .type = NLA_FLAG },
        [NL80211_STA_FLAG_SHORT_PREAMBLE] = { .type = NLA_FLAG },
        [NL80211_STA_FLAG_WME] = { .type = NLA_FLAG },
        [NL80211_STA_FLAG_MFP] = { .type = NLA_FLAG },
        [NL80211_STA_FLAG_AUTHENTICATED] = { .type = NLA_FLAG },
        [NL80211_STA_FLAG_TDLS_PEER] = { .type = NLA_FLAG },
};

/*
 * Fill params->sta_flags_mask and params->sta_flags_set from a request.
 *
 * NL80211_ATTR_STA_FLAGS2 (a struct nl80211_sta_flag_update) wins when
 * present, so userspace may send both attributes for the benefit of older
 * kernels. Otherwise the legacy nested NL80211_ATTR_STA_FLAGS is parsed
 * and the mask is derived from the interface type.
 *
 * Returns 0 on success (also when neither attribute is present) and
 * -EINVAL for an invalid flag, a nested-parse failure, an unsupported
 * interface type, or a legacy flag beyond NL80211_STA_FLAG_MAX_OLD_API.
 */
static int parse_station_flags(struct genl_info *info,
                               enum nl80211_iftype iftype,
                               struct station_parameters *params)
{
        struct nlattr *flags2 = info->attrs[NL80211_ATTR_STA_FLAGS2];
        struct nlattr *legacy = info->attrs[NL80211_ATTR_STA_FLAGS];
        struct nlattr *tb[NL80211_STA_FLAG_MAX + 1];
        int idx;

        /* new-style attribute: explicit mask/set pair */
        if (flags2) {
                struct nl80211_sta_flag_update *upd = nla_data(flags2);

                params->sta_flags_mask = upd->mask;
                params->sta_flags_set = upd->set;
                /* bits outside the mask are never reported as set */
                params->sta_flags_set &= params->sta_flags_mask;

                if ((params->sta_flags_mask | params->sta_flags_set) &
                    BIT(__NL80211_STA_FLAG_INVALID))
                        return -EINVAL;

                return 0;
        }

        /* old-style attribute is optional as well */
        if (!legacy)
                return 0;

        if (nla_parse_nested_deprecated(tb, NL80211_STA_FLAG_MAX, legacy,
                                        sta_flags_policy, info->extack))
                return -EINVAL;

        /*
         * Backward compatibility path for old userspace: the mask only
         * covers the flags permitted for this interface type, so that
         * anything else is silently ignored. This should not normally
         * be reached with current userspace.
         */
        switch (iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_AP_VLAN:
        case NL80211_IFTYPE_P2P_GO:
                params->sta_flags_mask = BIT(NL80211_STA_FLAG_AUTHORIZED) |
                                         BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) |
                                         BIT(NL80211_STA_FLAG_WME) |
                                         BIT(NL80211_STA_FLAG_MFP);
                break;
        case NL80211_IFTYPE_P2P_CLIENT:
        case NL80211_IFTYPE_STATION:
                params->sta_flags_mask = BIT(NL80211_STA_FLAG_AUTHORIZED) |
                                         BIT(NL80211_STA_FLAG_TDLS_PEER);
                break;
        case NL80211_IFTYPE_MESH_POINT:
                params->sta_flags_mask = BIT(NL80211_STA_FLAG_AUTHENTICATED) |
                                         BIT(NL80211_STA_FLAG_MFP) |
                                         BIT(NL80211_STA_FLAG_AUTHORIZED);
                break;
        default:
                return -EINVAL;
        }

        for (idx = 1; idx <= NL80211_STA_FLAG_MAX; idx++) {
                if (!tb[idx])
                        continue;

                params->sta_flags_set |= BIT(idx);

                /* no longer support new API additions in old API */
                if (idx > NL80211_STA_FLAG_MAX_OLD_API)
                        return -EINVAL;
        }

        return 0;
}

bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr)
{
        struct nlattr *rate;
        u32 bitrate;
        u16 bitrate_compat;
        enum nl80211_rate_info rate_flg;

        rate = nla_nest_start_noflag(msg, attr);
        if (!rate)
                return false;

        /* cfg80211_calculate_bitrate will return 0 for mcs >= 32 */
        bitrate = cfg80211_calculate_bitrate(info);
        /* report 16-bit bitrate only if we can */
        bitrate_compat = bitrate < (1UL << 16) ? bitrate : 0;
        if (bitrate > 0 &&
            nla_put_u32(msg, NL80211_RATE_INFO_BITRATE32, bitrate))
                return false;
        if (bitrate_compat > 0 &&
            nla_put_u16(msg, NL80211_RATE_INFO_BITRATE, bitrate_compat))
                return false;

        switch (info->bw) {
        case RATE_INFO_BW_1:
                rate_flg = NL80211_RATE_INFO_1_MHZ_WIDTH;
                break;
        case RATE_INFO_BW_2:
                rate_flg = NL80211_RATE_INFO_2_MHZ_WIDTH;
                break;
        case RATE_INFO_BW_4:
                rate_flg = NL80211_RATE_INFO_4_MHZ_WIDTH;
                break;
        case RATE_INFO_BW_5:
                rate_flg = NL80211_RATE_INFO_5_MHZ_WIDTH;
                break;
        case RATE_INFO_BW_8:
                rate_flg = NL80211_RATE_INFO_8_MHZ_WIDTH;
                break;
        case RATE_INFO_BW_10:
                rate_flg = NL80211_RATE_INFO_10_MHZ_WIDTH;
                break;
        case RATE_INFO_BW_16:
                rate_flg = NL80211_RATE_INFO_16_MHZ_WIDTH;
                break;
        default:
                WARN_ON(1);
                fallthrough;
        case RATE_INFO_BW_20:
                rate_flg = 0;
                break;
        case RATE_INFO_BW_40:
                rate_flg = NL80211_RATE_INFO_40_MHZ_WIDTH;
                break;
        case RATE_INFO_BW_80:
                rate_flg = NL80211_RATE_INFO_80_MHZ_WIDTH;
                break;
        case RATE_INFO_BW_160:
                rate_flg = NL80211_RATE_INFO_160_MHZ_WIDTH;
                break;
        case RATE_INFO_BW_HE_RU:
                rate_flg = 0;
                WARN_ON(!(info->flags & RATE_INFO_FLAGS_HE_MCS));
                break;
        case RATE_INFO_BW_320:
                rate_flg = NL80211_RATE_INFO_320_MHZ_WIDTH;
                break;
        case RATE_INFO_BW_EHT_RU:
                rate_flg = 0;
                WARN_ON(!(info->flags & RATE_INFO_FLAGS_EHT_MCS));
                break;
        }

        if (rate_flg && nla_put_flag(msg, rate_flg))
                return false;

        if (info->flags & RATE_INFO_FLAGS_MCS) {
                if (nla_put_u8(msg, NL80211_RATE_INFO_MCS, info->mcs))
                        return false;
                if (info->flags & RATE_INFO_FLAGS_SHORT_GI &&
                    nla_put_flag(msg, NL80211_RATE_INFO_SHORT_GI))
                        return false;
        } else if (info->flags & RATE_INFO_FLAGS_VHT_MCS) {
                if (nla_put_u8(msg, NL80211_RATE_INFO_VHT_MCS, info->mcs))
                        return false;
                if (nla_put_u8(msg, NL80211_RATE_INFO_VHT_NSS, info->nss))
                        return false;
                if (info->flags & RATE_INFO_FLAGS_SHORT_GI &&
                    nla_put_flag(msg, NL80211_RATE_INFO_SHORT_GI))
                        return false;
        } else if (info->flags & RATE_INFO_FLAGS_HE_MCS) {
                if (nla_put_u8(msg, NL80211_RATE_INFO_HE_MCS, info->mcs))
                        return false;
                if (nla_put_u8(msg, NL80211_RATE_INFO_HE_NSS, info->nss))
                        return false;
                if (nla_put_u8(msg, NL80211_RATE_INFO_HE_GI, info->he_gi))
                        return false;
                if (nla_put_u8(msg, NL80211_RATE_INFO_HE_DCM, info->he_dcm))
                        return false;
                if (info->bw == RATE_INFO_BW_HE_RU &&
                    nla_put_u8(msg, NL80211_RATE_INFO_HE_RU_ALLOC,
                               info->he_ru_alloc))
                        return false;
        } else if (info->flags & RATE_INFO_FLAGS_S1G_MCS) {
                if (nla_put_u8(msg, NL80211_RATE_INFO_S1G_MCS, info->mcs))
                        return false;
                if (nla_put_u8(msg, NL80211_RATE_INFO_S1G_NSS, info->nss))
                        return false;
                if (info->flags & RATE_INFO_FLAGS_SHORT_GI &&
                    nla_put_flag(msg, NL80211_RATE_INFO_SHORT_GI))
                        return false;
        } else if (info->flags & RATE_INFO_FLAGS_EHT_MCS) {
                if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_MCS, info->mcs))
                        return false;
                if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_NSS, info->nss))
                        return false;
                if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_GI, info->eht_gi))
                        return false;
                if (info->bw == RATE_INFO_BW_EHT_RU &&
                    nla_put_u8(msg, NL80211_RATE_INFO_EHT_RU_ALLOC,
                               info->eht_ru_alloc))
                        return false;
        }

        nla_nest_end(msg, rate);
        return true;
}

/*
 * Append per-chain signal values to @msg as a nest of type @id.
 *
 * For every bit set in @mask (bit n = chain n, up to
 * IEEE80211_MAX_CHAINS) one u8 attribute is added whose type is the chain
 * index and whose payload is @signal[n]. An empty mask adds nothing and
 * counts as success.
 *
 * Returns true on success, false if the nest cannot be started or an
 * attribute does not fit; the nest is left open in that case.
 */
static bool nl80211_put_signal(struct sk_buff *msg, u8 mask, s8 *signal,
                               int id)
{
        struct nlattr *nest;
        int chain;

        if (!mask)
                return true;

        nest = nla_nest_start_noflag(msg, id);
        if (!nest)
                return false;

        for (chain = 0; chain < IEEE80211_MAX_CHAINS; chain++) {
                if ((mask & BIT(chain)) &&
                    nla_put_u8(msg, chain, signal[chain]))
                        return false;
        }

        nla_nest_end(msg, nest);

        return true;
}

/*
 * Build one station message in @msg: a generic netlink header for @cmd
 * followed by the interface index, the station MAC address, the
 * generation counter and a nested NL80211_ATTR_STA_INFO block holding
 * whichever statistics are marked in sinfo->filled. Association
 * request/response IEs and MLO link information are appended after the
 * nest when present.
 *
 * The contents of @sinfo are released with
 * cfg80211_sinfo_release_content() on every exit path, so the caller
 * must not release them again after calling this function.
 *
 * Returns 0 on success, -1 if the message header could not be added, or
 * -EMSGSIZE (after cancelling the message) if any attribute did not fit.
 */
static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
                                u32 seq, int flags,
                                struct cfg80211_registered_device *rdev,
                                struct net_device *dev,
                                const u8 *mac_addr, struct station_info *sinfo)
{
        void *hdr;
        struct nlattr *sinfoattr, *bss_param;

        hdr = nl80211hdr_put(msg, portid, seq, flags, cmd);
        if (!hdr) {
                cfg80211_sinfo_release_content(sinfo);
                return -1;
        }

        if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
            nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr) ||
            nla_put_u32(msg, NL80211_ATTR_GENERATION, sinfo->generation))
                goto nla_put_failure;

        sinfoattr = nla_nest_start_noflag(msg, NL80211_ATTR_STA_INFO);
        if (!sinfoattr)
                goto nla_put_failure;

/*
 * PUT_SINFO(attr, memb, type): add sinfo->memb as NL80211_STA_INFO_<attr>
 * using nla_put_<type>(), but only when the matching bit is set in
 * sinfo->filled. The BUILD_BUG_ON rejects types as wide as u64.
 *
 * PUT_SINFO_U64(attr, memb): same gating, for 64-bit values, emitted via
 * nla_put_u64_64bit() with NL80211_STA_INFO_PAD as the padding attribute.
 *
 * Both jump to nla_put_failure when the attribute cannot be added.
 */
#define PUT_SINFO(attr, memb, type) do {                                \
        BUILD_BUG_ON(sizeof(type) == sizeof(u64));                        \
        if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_ ## attr) &&        \
            nla_put_ ## type(msg, NL80211_STA_INFO_ ## attr,                \
                             sinfo->memb))                                \
                goto nla_put_failure;                                        \
        } while (0)
#define PUT_SINFO_U64(attr, memb) do {                                        \
        if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_ ## attr) &&        \
            nla_put_u64_64bit(msg, NL80211_STA_INFO_ ## attr,                \
                              sinfo->memb, NL80211_STA_INFO_PAD))        \
                goto nla_put_failure;                                        \
        } while (0)

        PUT_SINFO(CONNECTED_TIME, connected_time, u32);
        PUT_SINFO(INACTIVE_TIME, inactive_time, u32);
        PUT_SINFO_U64(ASSOC_AT_BOOTTIME, assoc_at);

        /*
         * The 32-bit byte counters are emitted when either the 32-bit or
         * the 64-bit "filled" bit is set; the value is truncated to u32.
         */
        if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES) |
                             BIT_ULL(NL80211_STA_INFO_RX_BYTES64)) &&
            nla_put_u32(msg, NL80211_STA_INFO_RX_BYTES,
                        (u32)sinfo->rx_bytes))
                goto nla_put_failure;

        if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES) |
                             BIT_ULL(NL80211_STA_INFO_TX_BYTES64)) &&
            nla_put_u32(msg, NL80211_STA_INFO_TX_BYTES,
                        (u32)sinfo->tx_bytes))
                goto nla_put_failure;

        PUT_SINFO_U64(RX_BYTES64, rx_bytes);
        PUT_SINFO_U64(TX_BYTES64, tx_bytes);
        PUT_SINFO(LLID, llid, u16);
        PUT_SINFO(PLID, plid, u16);
        PUT_SINFO(PLINK_STATE, plink_state, u8);
        PUT_SINFO_U64(RX_DURATION, rx_duration);
        PUT_SINFO_U64(TX_DURATION, tx_duration);

        /* airtime weight is only reported if the wiphy advertises support */
        if (wiphy_ext_feature_isset(&rdev->wiphy,
                                    NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
                PUT_SINFO(AIRTIME_WEIGHT, airtime_weight, u16);

        /* signal/signal_avg are only reported for the MBM signal type */
        switch (rdev->wiphy.signal_type) {
        case CFG80211_SIGNAL_TYPE_MBM:
                PUT_SINFO(SIGNAL, signal, u8);
                PUT_SINFO(SIGNAL_AVG, signal_avg, u8);
                break;
        default:
                break;
        }
        if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL)) {
                if (!nl80211_put_signal(msg, sinfo->chains,
                                        sinfo->chain_signal,
                                        NL80211_STA_INFO_CHAIN_SIGNAL))
                        goto nla_put_failure;
        }
        if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) {
                if (!nl80211_put_signal(msg, sinfo->chains,
                                        sinfo->chain_signal_avg,
                                        NL80211_STA_INFO_CHAIN_SIGNAL_AVG))
                        goto nla_put_failure;
        }
        if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) {
                if (!nl80211_put_sta_rate(msg, &sinfo->txrate,
                                          NL80211_STA_INFO_TX_BITRATE))
                        goto nla_put_failure;
        }
        if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) {
                if (!nl80211_put_sta_rate(msg, &sinfo->rxrate,
                                          NL80211_STA_INFO_RX_BITRATE))
                        goto nla_put_failure;
        }

        PUT_SINFO(RX_PACKETS, rx_packets, u32);
        PUT_SINFO(TX_PACKETS, tx_packets, u32);
        PUT_SINFO(TX_RETRIES, tx_retries, u32);
        PUT_SINFO(TX_FAILED, tx_failed, u32);
        PUT_SINFO(EXPECTED_THROUGHPUT, expected_throughput, u32);
        PUT_SINFO(AIRTIME_LINK_METRIC, airtime_link_metric, u32);
        PUT_SINFO(BEACON_LOSS, beacon_loss_count, u32);
        PUT_SINFO(LOCAL_PM, local_pm, u32);
        PUT_SINFO(PEER_PM, peer_pm, u32);
        PUT_SINFO(NONPEER_PM, nonpeer_pm, u32);
        PUT_SINFO(CONNECTED_TO_GATE, connected_to_gate, u8);
        PUT_SINFO(CONNECTED_TO_AS, connected_to_as, u8);

        /* BSS parameters: optional flags plus DTIM period and beacon interval */
        if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) {
                bss_param = nla_nest_start_noflag(msg,
                                                  NL80211_STA_INFO_BSS_PARAM);
                if (!bss_param)
                        goto nla_put_failure;

                if (((sinfo->bss_param.flags & BSS_PARAM_FLAGS_CTS_PROT) &&
                     nla_put_flag(msg, NL80211_STA_BSS_PARAM_CTS_PROT)) ||
                    ((sinfo->bss_param.flags & BSS_PARAM_FLAGS_SHORT_PREAMBLE) &&
                     nla_put_flag(msg, NL80211_STA_BSS_PARAM_SHORT_PREAMBLE)) ||
                    ((sinfo->bss_param.flags & BSS_PARAM_FLAGS_SHORT_SLOT_TIME) &&
                     nla_put_flag(msg, NL80211_STA_BSS_PARAM_SHORT_SLOT_TIME)) ||
                    nla_put_u8(msg, NL80211_STA_BSS_PARAM_DTIM_PERIOD,
                               sinfo->bss_param.dtim_period) ||
                    nla_put_u16(msg, NL80211_STA_BSS_PARAM_BEACON_INTERVAL,
                                sinfo->bss_param.beacon_interval))
                        goto nla_put_failure;

                nla_nest_end(msg, bss_param);
        }
        /* station flags are copied out as a raw nl80211_sta_flag_update */
        if ((sinfo->filled & BIT_ULL(NL80211_STA_INFO_STA_FLAGS)) &&
            nla_put(msg, NL80211_STA_INFO_STA_FLAGS,
                    sizeof(struct nl80211_sta_flag_update),
                    &sinfo->sta_flags))
                goto nla_put_failure;

        PUT_SINFO_U64(T_OFFSET, t_offset);
        PUT_SINFO_U64(RX_DROP_MISC, rx_dropped_misc);
        PUT_SINFO_U64(BEACON_RX, rx_beacon);
        PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8);
        PUT_SINFO(RX_MPDUS, rx_mpdu_count, u32);
        PUT_SINFO(FCS_ERROR_COUNT, fcs_err_count, u32);
        if (wiphy_ext_feature_isset(&rdev->wiphy,
                                    NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT)) {
                PUT_SINFO(ACK_SIGNAL, ack_signal, u8);
                PUT_SINFO(ACK_SIGNAL_AVG, avg_ack_signal, s8);
        }

#undef PUT_SINFO
#undef PUT_SINFO_U64

        /* optional per-TID statistics, one nest per TID that has data */
        if (sinfo->pertid) {
                struct nlattr *tidsattr;
                int tid;

                tidsattr = nla_nest_start_noflag(msg,
                                                 NL80211_STA_INFO_TID_STATS);
                if (!tidsattr)
                        goto nla_put_failure;

                /* the array holds IEEE80211_NUM_TIDS + 1 entries */
                for (tid = 0; tid < IEEE80211_NUM_TIDS + 1; tid++) {
                        struct cfg80211_tid_stats *tidstats;
                        struct nlattr *tidattr;

                        tidstats = &sinfo->pertid[tid];

                        if (!tidstats->filled)
                                continue;

                        /* nested attribute type is the TID index plus one */
                        tidattr = nla_nest_start_noflag(msg, tid + 1);
                        if (!tidattr)
                                goto nla_put_failure;

/*
 * PUT_TIDVAL_U64(attr, memb): add tidstats->memb as a 64-bit
 * NL80211_TID_STATS_<attr> attribute when its bit is set in
 * tidstats->filled; jumps to nla_put_failure if it does not fit.
 */
#define PUT_TIDVAL_U64(attr, memb) do {                                        \
        if (tidstats->filled & BIT(NL80211_TID_STATS_ ## attr) &&        \
            nla_put_u64_64bit(msg, NL80211_TID_STATS_ ## attr,                \
                              tidstats->memb, NL80211_TID_STATS_PAD))        \
                goto nla_put_failure;                                        \
        } while (0)

                        PUT_TIDVAL_U64(RX_MSDU, rx_msdu);
                        PUT_TIDVAL_U64(TX_MSDU, tx_msdu);
                        PUT_TIDVAL_U64(TX_MSDU_RETRIES, tx_msdu_retries);
                        PUT_TIDVAL_U64(TX_MSDU_FAILED, tx_msdu_failed);

#undef PUT_TIDVAL_U64
                        if ((tidstats->filled &
                             BIT(NL80211_TID_STATS_TXQ_STATS)) &&
                            !nl80211_put_txq_stats(msg, &tidstats->txq_stats,
                                                   NL80211_TID_STATS_TXQ_STATS))
                                goto nla_put_failure;

                        nla_nest_end(msg, tidattr);
                }

                nla_nest_end(msg, tidsattr);
        }

        nla_nest_end(msg, sinfoattr);

        /* the remaining attributes live outside the STA_INFO nest */
        if (sinfo->assoc_req_ies_len &&
            nla_put(msg, NL80211_ATTR_IE, sinfo->assoc_req_ies_len,
                    sinfo->assoc_req_ies))
                goto nla_put_failure;

        if (sinfo->assoc_resp_ies_len &&
            nla_put(msg, NL80211_ATTR_RESP_IE, sinfo->assoc_resp_ies_len,
                    sinfo->assoc_resp_ies))
                goto nla_put_failure;

        if (sinfo->mlo_params_valid) {
                if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID,
                               sinfo->assoc_link_id))
                        goto nla_put_failure;

                /* the MLD address is omitted when it is all zeroes */
                if (!is_zero_ether_addr(sinfo->mld_addr) &&
                    nla_put(msg, NL80211_ATTR_MLD_ADDR, ETH_ALEN,
                            sinfo->mld_addr))
                        goto nla_put_failure;
        }

        cfg80211_sinfo_release_content(sinfo);
        genlmsg_end(msg, hdr);
        return 0;

 nla_put_failure:
        cfg80211_sinfo_release_content(sinfo);
        genlmsg_cancel(msg, hdr);
        return -EMSGSIZE;
}

/*
 * Netlink dump callback: emit one NL80211_CMD_NEW_STATION message per
 * station, resuming from the index saved in cb->args[2].
 *
 * Stations are fetched from the driver one index at a time until it
 * reports -ENOENT or the next message no longer fits into @skb. In both
 * cases the next index is stored back into cb->args[2] and skb->len is
 * returned. Any other failure returns a negative errno and leaves
 * cb->args[2] untouched.
 */
static int nl80211_dump_station(struct sk_buff *skb,
                                struct netlink_callback *cb)
{
        struct cfg80211_registered_device *rdev;
        struct wireless_dev *wdev;
        struct station_info sinfo;
        u8 addr[ETH_ALEN];
        int idx = cb->args[2];
        int ret;

        ret = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, NULL);
        if (ret)
                return ret;
        /* nl80211_prepare_wdev_dump acquired it in the successful case */
        __acquire(&rdev->wiphy.mtx);

        if (!wdev->netdev) {
                ret = -EINVAL;
                goto unlock;
        }

        if (!rdev->ops->dump_station) {
                ret = -EOPNOTSUPP;
                goto unlock;
        }

        for (;;) {
                memset(&sinfo, 0, sizeof(sinfo));
                ret = rdev_dump_station(rdev, wdev->netdev, idx, addr,
                                        &sinfo);
                /* -ENOENT marks the end of the station list */
                if (ret == -ENOENT)
                        break;
                if (ret)
                        goto unlock;

                /* stop here and retry this index on the next dump call */
                if (nl80211_send_station(skb, NL80211_CMD_NEW_STATION,
                                         NETLINK_CB(cb->skb).portid,
                                         cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                         rdev, wdev->netdev, addr,
                                         &sinfo) < 0)
                        break;

                idx++;
        }

        cb->args[2] = idx;
        ret = skb->len;
 unlock:
        wiphy_unlock(&rdev->wiphy);

        return ret;
}

/*
 * Netlink handler: look up one station by MAC address and reply with an
 * NL80211_CMD_NEW_STATION message describing it.
 *
 * Returns -EINVAL without NL80211_ATTR_MAC, -EOPNOTSUPP when the driver
 * has no get_station op, the driver's error if the lookup fails, -ENOMEM
 * if the reply cannot be allocated, -ENOBUFS if it cannot be filled, and
 * otherwise the result of genlmsg_reply().
 */
static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct nlattr *mac_attr = info->attrs[NL80211_ATTR_MAC];
        struct station_info sinfo;
        struct sk_buff *reply;
        u8 *addr;
        int ret;

        if (!mac_attr)
                return -EINVAL;

        if (!rdev->ops->get_station)
                return -EOPNOTSUPP;

        addr = nla_data(mac_attr);
        memset(&sinfo, 0, sizeof(sinfo));

        ret = rdev_get_station(rdev, dev, addr, &sinfo);
        if (ret)
                return ret;

        reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!reply) {
                /* sinfo was filled by the driver and is released here */
                cfg80211_sinfo_release_content(&sinfo);
                return -ENOMEM;
        }

        /* nl80211_send_station() releases the sinfo content on all paths */
        if (nl80211_send_station(reply, NL80211_CMD_NEW_STATION,
                                 info->snd_portid, info->snd_seq, 0,
                                 rdev, dev, addr, &sinfo) < 0) {
                nlmsg_free(reply);
                return -ENOBUFS;
        }

        return genlmsg_reply(reply, info);
}

/*
 * Validate a requested station modification against what is allowed for
 * the given station type.
 *
 * @wiphy: the wiphy; only its feature flags are consulted here
 * @params: the requested changes; may be modified (see below)
 * @statype: the kind of station being changed
 *
 * Returns 0 when the change is acceptable, -EINVAL when it asks for
 * something that cannot be changed for this station type, or -EOPNOTSUPP
 * for an AP MLME client change that does not touch the AUTHORIZED flag.
 *
 * Side effects on @params: the TDLS_PEER bit is cleared from
 * sta_flags_mask (for TDLS and for non-mesh, non-TDLS types), and
 * link_sta_params.opmode_notif_used is cleared for every type other than
 * AP_CLIENT_UNASSOC and TDLS_PEER_SETUP.
 */
int cfg80211_check_station_change(struct wiphy *wiphy,
                                  struct station_parameters *params,
                                  enum cfg80211_station_type statype)
{
        /*
         * listen_interval and support_p2p_ps use -1 as "not provided";
         * anything else is only acceptable for a not yet associated
         * AP client.
         */
        if (params->listen_interval != -1 &&
            statype != CFG80211_STA_AP_CLIENT_UNASSOC)
                return -EINVAL;

        if (params->support_p2p_ps != -1 &&
            statype != CFG80211_STA_AP_CLIENT_UNASSOC)
                return -EINVAL;

        /* an AID is accepted only with the TDLS flag set or before assoc */
        if (params->aid &&
            !(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) &&
            statype != CFG80211_STA_AP_CLIENT_UNASSOC)
                return -EINVAL;

        /* When you run into this, adjust the code below for the new flag */
        BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 8);

        /* first pass: mesh/TDLS specific flag handling */
        switch (statype) {
        case CFG80211_STA_MESH_PEER_KERNEL:
        case CFG80211_STA_MESH_PEER_USER:
                /*
                 * No ignoring the TDLS flag here -- the userspace mesh
                 * code doesn't have the bug of including TDLS in the
                 * mask everywhere.
                 */
                if (params->sta_flags_mask &
                                ~(BIT(NL80211_STA_FLAG_AUTHENTICATED) |
                                  BIT(NL80211_STA_FLAG_MFP) |
                                  BIT(NL80211_STA_FLAG_AUTHORIZED)))
                        return -EINVAL;
                break;
        case CFG80211_STA_TDLS_PEER_SETUP:
        case CFG80211_STA_TDLS_PEER_ACTIVE:
                if (!(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)))
                        return -EINVAL;
                /* ignore since it can't change */
                params->sta_flags_mask &= ~BIT(NL80211_STA_FLAG_TDLS_PEER);
                break;
        default:
                /* disallow mesh-specific things */
                if (params->plink_action != NL80211_PLINK_ACTION_NO_ACTION)
                        return -EINVAL;
                if (params->local_pm)
                        return -EINVAL;
                if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE)
                        return -EINVAL;
        }

        if (statype != CFG80211_STA_TDLS_PEER_SETUP &&
            statype != CFG80211_STA_TDLS_PEER_ACTIVE) {
                /* TDLS can't be set, ... */
                if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))
                        return -EINVAL;
                /*
                 * ... but don't bother the driver with it. This works around
                 * a hostapd/wpa_supplicant issue -- it always includes the
                 * TDLS_PEER flag in the mask even for AP mode.
                 */
                params->sta_flags_mask &= ~BIT(NL80211_STA_FLAG_TDLS_PEER);
        }

        if (statype != CFG80211_STA_TDLS_PEER_SETUP &&
            statype != CFG80211_STA_AP_CLIENT_UNASSOC) {
                /* reject other things that can't change */
                if (params->sta_modify_mask & STATION_PARAM_APPLY_UAPSD)
                        return -EINVAL;
                if (params->sta_modify_mask & STATION_PARAM_APPLY_CAPABILITY)
                        return -EINVAL;
                if (params->link_sta_params.supported_rates)
                        return -EINVAL;
                if (params->ext_capab || params->link_sta_params.ht_capa ||
                    params->link_sta_params.vht_capa ||
                    params->link_sta_params.he_capa ||
                    params->link_sta_params.eht_capa)
                        return -EINVAL;
                if (params->sta_flags_mask & BIT(NL80211_STA_FLAG_SPP_AMSDU))
                        return -EINVAL;
        }

        /* a VLAN may only be given for AP clients */
        if (statype != CFG80211_STA_AP_CLIENT &&
            statype != CFG80211_STA_AP_CLIENT_UNASSOC) {
                if (params->vlan)
                        return -EINVAL;
        }

        /* second pass: per-type restrictions on which flags may change */
        switch (statype) {
        case CFG80211_STA_AP_MLME_CLIENT:
                /* Use this only for authorizing/unauthorizing a station */
                if (!(params->sta_flags_mask & BIT(NL80211_STA_FLAG_AUTHORIZED)))
                        return -EOPNOTSUPP;
                break;
        case CFG80211_STA_AP_CLIENT:
        case CFG80211_STA_AP_CLIENT_UNASSOC:
                /* accept only the listed bits */
                if (params->sta_flags_mask &
                                ~(BIT(NL80211_STA_FLAG_AUTHORIZED) |
                                  BIT(NL80211_STA_FLAG_AUTHENTICATED) |
                                  BIT(NL80211_STA_FLAG_ASSOCIATED) |
                                  BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) |
                                  BIT(NL80211_STA_FLAG_WME) |
                                  BIT(NL80211_STA_FLAG_MFP) |
                                  BIT(NL80211_STA_FLAG_SPP_AMSDU)))
                        return -EINVAL;

                /* but authenticated/associated only if driver handles it */
                if (!(wiphy->features & NL80211_FEATURE_FULL_AP_CLIENT_STATE) &&
                    params->sta_flags_mask &
                                (BIT(NL80211_STA_FLAG_AUTHENTICATED) |
                                 BIT(NL80211_STA_FLAG_ASSOCIATED)))
                        return -EINVAL;
                break;
        case CFG80211_STA_IBSS:
        case CFG80211_STA_AP_STA:
                /* reject any changes other than AUTHORIZED */
                if (params->sta_flags_mask & ~BIT(NL80211_STA_FLAG_AUTHORIZED))
                        return -EINVAL;
                break;
        case CFG80211_STA_TDLS_PEER_SETUP:
                /* reject any changes other than AUTHORIZED or WME */
                if (params->sta_flags_mask & ~(BIT(NL80211_STA_FLAG_AUTHORIZED) |
                                               BIT(NL80211_STA_FLAG_WME)))
                        return -EINVAL;
                /* force (at least) rates when authorizing */
                if (params->sta_flags_set & BIT(NL80211_STA_FLAG_AUTHORIZED) &&
                    !params->link_sta_params.supported_rates)
                        return -EINVAL;
                break;
        case CFG80211_STA_TDLS_PEER_ACTIVE:
                /* reject any changes */
                return -EINVAL;
        case CFG80211_STA_MESH_PEER_KERNEL:
                /* plink state can't be applied for this type */
                if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE)
                        return -EINVAL;
                break;
        case CFG80211_STA_MESH_PEER_USER:
                /* the only plink action accepted here is BLOCK */
                if (params->plink_action != NL80211_PLINK_ACTION_NO_ACTION &&
                    params->plink_action != NL80211_PLINK_ACTION_BLOCK)
                        return -EINVAL;
                break;
        }

        /*
         * Older kernel versions ignored this attribute entirely, so don't
         * reject attempts to update it but mark it as unused instead so the
         * driver won't look at the data.
         */
        if (statype != CFG80211_STA_AP_CLIENT_UNASSOC &&
            statype != CFG80211_STA_TDLS_PEER_SETUP)
                params->link_sta_params.opmode_notif_used = false;

        return 0;
}
EXPORT_SYMBOL(cfg80211_check_station_change);

/*
 * Get vlan interface making sure it is running and on the right wiphy.
 */
static struct net_device *get_vlan(struct genl_info *info,
                                   struct cfg80211_registered_device *rdev)
{
        struct nlattr *vlanattr = info->attrs[NL80211_ATTR_STA_VLAN];
        struct net_device *v;
        int ret;

        if (!vlanattr)
                return NULL;

        v = dev_get_by_index(genl_info_net(info), nla_get_u32(vlanattr));
        if (!v)
                return ERR_PTR(-ENODEV);

        if (!v->ieee80211_ptr || v->ieee80211_ptr->wiphy != &rdev->wiphy) {
                ret = -EINVAL;
                goto error;
        }

        if (v->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
            v->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
            v->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) {
                ret = -EINVAL;
                goto error;
        }

        if (!netif_running(v)) {
                ret = -ENETDOWN;
                goto error;
        }

        return v;
 error:
        dev_put(v);
        return ERR_PTR(ret);
}

/*
 * Parse the nested NL80211_ATTR_STA_WME attribute into @params.
 *
 * Without the attribute nothing is touched and 0 is returned. Otherwise
 * the uAPSD queues and max SP values are taken from the nested attributes
 * that are present, each is checked against its QoS-info mask, and
 * STATION_PARAM_APPLY_UAPSD is set in sta_modify_mask.
 *
 * Returns 0 on success, the nested-parse error, or -EINVAL when a value
 * has bits outside its mask.
 */
static int nl80211_parse_sta_wme(struct genl_info *info,
                                 struct station_parameters *params)
{
        struct nlattr *wme_attr = info->attrs[NL80211_ATTR_STA_WME];
        struct nlattr *tb[NL80211_STA_WME_MAX + 1];
        int ret;

        /* nothing to parse without the WME container attribute */
        if (!wme_attr)
                return 0;

        ret = nla_parse_nested_deprecated(tb, NL80211_STA_WME_MAX, wme_attr,
                                          nl80211_sta_wme_policy,
                                          info->extack);
        if (ret)
                return ret;

        /* the mask checks run even if the nested attribute was not given */
        if (tb[NL80211_STA_WME_UAPSD_QUEUES])
                params->uapsd_queues =
                        nla_get_u8(tb[NL80211_STA_WME_UAPSD_QUEUES]);
        if (params->uapsd_queues & ~IEEE80211_WMM_IE_STA_QOSINFO_AC_MASK)
                return -EINVAL;

        if (tb[NL80211_STA_WME_MAX_SP])
                params->max_sp = nla_get_u8(tb[NL80211_STA_WME_MAX_SP]);
        if (params->max_sp & ~IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK)
                return -EINVAL;

        params->sta_modify_mask |= STATION_PARAM_APPLY_UAPSD;
        return 0;
}

/*
 * Point @params at the supported-channels and supported-operating-classes
 * payloads, if those attributes were supplied. The data is referenced in
 * place inside the netlink message, not copied.
 *
 * Returns -EINVAL when the supported-channels payload has an odd length,
 * 0 otherwise.
 */
static int nl80211_parse_sta_channel_info(struct genl_info *info,
                                      struct station_parameters *params)
{
        struct nlattr *chans =
                info->attrs[NL80211_ATTR_STA_SUPPORTED_CHANNELS];
        struct nlattr *classes =
                info->attrs[NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES];

        if (chans) {
                params->supported_channels = nla_data(chans);
                params->supported_channels_len = nla_len(chans);

                /*
                 * The payload is a list of (first channel, number of
                 * channels) tuples; the policy checks that at least one
                 * is there, here we make sure no tuple is cut in half.
                 */
                if (params->supported_channels_len % 2)
                        return -EINVAL;
        }

        if (classes) {
                params->supported_oper_classes = nla_data(classes);
                params->supported_oper_classes_len = nla_len(classes);
        }

        return 0;
}

static int nl80211_set_station_tdls(struct genl_info *info,
                                    struct station_parameters *params)
{
        int err;
        /* Dummy STA entry gets updated once the peer capabilities are known */
        if (info->attrs[NL80211_ATTR_PEER_AID])
                params->aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]);
        if (info->attrs[NL80211_ATTR_HT_CAPABILITY])
                params->link_sta_params.ht_capa =
                        nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]);
        if (info->attrs[NL80211_ATTR_VHT_CAPABILITY])
                params->link_sta_params.vht_capa =
                        nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
        if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) {
                params->link_sta_params.he_capa =
                        nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
                params->link_sta_params.he_capa_len =
                        nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);

                if (info->attrs[NL80211_ATTR_EHT_CAPABILITY]) {
                        params->link_sta_params.eht_capa =
                                nla_data(info->attrs[NL80211_ATTR_EHT_CAPABILITY]);
                        params->link_sta_params.eht_capa_len =
                                nla_len(info->attrs[NL80211_ATTR_EHT_CAPABILITY]);

                        if (!ieee80211_eht_capa_size_ok((const u8 *)params->link_sta_params.he_capa,
                                                        (const u8 *)params->link_sta_params.eht_capa,
                                                        params->link_sta_params.eht_capa_len,
                                                        false))
                                return -EINVAL;
                }
        }

        err = nl80211_parse_sta_channel_info(info, params);
        if (err)
                return err;

        return nl80211_parse_sta_wme(info, params);
}

/*
 * Parse the per-station TX power setting, if one was supplied.
 *
 * @info: the request; user_ptr[0] is the registered device
 * @txpwr: filled with the power type and, for the LIMITED type, the power
 * @txpwr_set: set to true when a setting was parsed, false when
 *      NL80211_ATTR_STA_TX_POWER_SETTING is absent; left untouched on error
 *
 * Returns 0 on success, -EOPNOTSUPP when the driver lacks set_tx_power or
 * the STA_TX_PWR extended feature, or -EINVAL when the LIMITED type comes
 * without NL80211_ATTR_STA_TX_POWER.
 */
static int nl80211_parse_sta_txpower_setting(struct genl_info *info,
                                             struct sta_txpwr *txpwr,
                                             bool *txpwr_set)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct nlattr *setting =
                info->attrs[NL80211_ATTR_STA_TX_POWER_SETTING];
        struct nlattr *power = info->attrs[NL80211_ATTR_STA_TX_POWER];

        if (!setting) {
                *txpwr_set = false;
                return 0;
        }

        if (!rdev->ops->set_tx_power ||
            !wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_STA_TX_PWR))
                return -EOPNOTSUPP;

        txpwr->type = nla_get_u8(setting);

        /* only the LIMITED type carries (and requires) a power value */
        if (txpwr->type == NL80211_TX_POWER_LIMITED) {
                if (!power)
                        return -EINVAL;
                txpwr->power = nla_get_s16(power);
        }

        *txpwr_set = true;
        return 0;
}

/*
 * Netlink handler that modifies an existing station.
 *
 * Gathers every station attribute present in the request into a
 * struct station_parameters and hands it to the driver through
 * rdev_change_station(). Only basic attribute checks happen here; what
 * may be changed for which kind of station is left to
 * cfg80211_check_station_change(), called by the driver.
 *
 * Returns 0 on success or a negative errno: -EOPNOTSUPP when the driver
 * has no change_station op, a required feature is missing or the
 * interface type is not handled; -EINVAL for missing or invalid
 * attributes; or the error from a parsing helper, get_vlan() or the
 * driver.
 */
static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct station_parameters params;
        u8 *mac_addr;
        int err;

        memset(&params, 0, sizeof(params));

        if (!rdev->ops->change_station)
                return -EOPNOTSUPP;

        /*
         * AID and listen_interval properties can be set only for unassociated
         * station. Include these parameters here and will check them in
         * cfg80211_check_station_change().
         */
        if (info->attrs[NL80211_ATTR_STA_AID])
                params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]);

        if (info->attrs[NL80211_ATTR_VLAN_ID])
                params.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]);

        /* -1 marks "not provided" for listen_interval and support_p2p_ps */
        if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL])
                params.listen_interval =
                     nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
        else
                params.listen_interval = -1;

        if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS])
                params.support_p2p_ps =
                        nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]);
        else
                params.support_p2p_ps = -1;

        if (!info->attrs[NL80211_ATTR_MAC])
                return -EINVAL;

        params.link_sta_params.link_id =
                nl80211_link_id_or_invalid(info->attrs);

        if (info->attrs[NL80211_ATTR_MLD_ADDR]) {
                /* If MLD_ADDR attribute is set then this is an MLD station
                 * and the MLD_ADDR attribute holds the MLD address and the
                 * MAC attribute holds for the LINK address.
                 * In that case, the link_id is also expected to be valid.
                 */
                if (params.link_sta_params.link_id < 0)
                        return -EINVAL;

                mac_addr = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]);
                params.link_sta_params.mld_mac = mac_addr;
                params.link_sta_params.link_mac =
                        nla_data(info->attrs[NL80211_ATTR_MAC]);
                if (!is_valid_ether_addr(params.link_sta_params.link_mac))
                        return -EINVAL;
        } else {
                /*
                 * NOTE(review): a link ID supplied without
                 * NL80211_ATTR_MLD_ADDR is passed through unchecked here --
                 * confirm that this is intended.
                 */
                mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
        }


        if (info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]) {
                params.link_sta_params.supported_rates =
                        nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
                params.link_sta_params.supported_rates_len =
                        nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
        }

        if (info->attrs[NL80211_ATTR_STA_CAPABILITY]) {
                params.capability =
                        nla_get_u16(info->attrs[NL80211_ATTR_STA_CAPABILITY]);
                params.sta_modify_mask |= STATION_PARAM_APPLY_CAPABILITY;
        }

        if (info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]) {
                params.ext_capab =
                        nla_data(info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]);
                params.ext_capab_len =
                        nla_len(info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]);
        }

        if (parse_station_flags(info, dev->ieee80211_ptr->iftype, &params))
                return -EINVAL;

        if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION])
                params.plink_action =
                        nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]);

        /* the mesh peer AID is only looked at together with a plink state */
        if (info->attrs[NL80211_ATTR_STA_PLINK_STATE]) {
                params.plink_state =
                        nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_STATE]);
                if (info->attrs[NL80211_ATTR_MESH_PEER_AID])
                        params.peer_aid = nla_get_u16(
                                info->attrs[NL80211_ATTR_MESH_PEER_AID]);
                params.sta_modify_mask |= STATION_PARAM_APPLY_PLINK_STATE;
        }

        if (info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE])
                params.local_pm = nla_get_u32(
                        info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE]);

        if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
                params.link_sta_params.opmode_notif_used = true;
                params.link_sta_params.opmode_notif =
                        nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]);
        }

        if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
                params.link_sta_params.he_6ghz_capa =
                        nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);

        if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT])
                params.airtime_weight =
                        nla_get_u16(info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]);

        /* a non-zero airtime weight needs the airtime fairness feature */
        if (params.airtime_weight &&
            !wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
                return -EOPNOTSUPP;

        err = nl80211_parse_sta_txpower_setting(info,
                                                &params.link_sta_params.txpwr,
                                                &params.link_sta_params.txpwr_set);
        if (err)
                return err;

        /* Include parameters for TDLS peer (will check later) */
        err = nl80211_set_station_tdls(info, &params);
        if (err)
                return err;

        /*
         * From here on params.vlan may hold a device reference, so every
         * exit has to go through out_put_vlan.
         */
        params.vlan = get_vlan(info, rdev);
        if (IS_ERR(params.vlan))
                return PTR_ERR(params.vlan);

        switch (dev->ieee80211_ptr->iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_AP_VLAN:
        case NL80211_IFTYPE_P2P_GO:
        case NL80211_IFTYPE_P2P_CLIENT:
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_ADHOC:
        case NL80211_IFTYPE_MESH_POINT:
                break;
        default:
                err = -EOPNOTSUPP;
                goto out_put_vlan;
        }

        /* driver will call cfg80211_check_station_change() */
        err = rdev_change_station(rdev, dev, mac_addr, &params);

 out_put_vlan:
        /* params.vlan is NULL here when no VLAN attribute was given */
        dev_put(params.vlan);

        return err;
}

/*
 * Netlink handler that adds a new station.
 *
 * The MAC address, listen interval, supported rates and an AID (either
 * NL80211_ATTR_STA_AID or NL80211_ATTR_PEER_AID) are mandatory. All other
 * station attributes are optional and collected into a
 * struct station_parameters, which is then checked against the interface
 * type before it is handed to the driver through rdev_add_station().
 *
 * Returns 0 on success or a negative errno: -EOPNOTSUPP when the driver
 * has no add_station op, a required feature is missing or the interface
 * type is not handled; -EINVAL for missing, invalid or conflicting
 * attributes; -ENOLINK when the given link ID is not among the
 * interface's valid links; or the error from a parsing helper,
 * get_vlan() or the driver.
 */
static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        int err;
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct station_parameters params;
        u8 *mac_addr = NULL;
        u32 auth_assoc = BIT(NL80211_STA_FLAG_AUTHENTICATED) |
                         BIT(NL80211_STA_FLAG_ASSOCIATED);

        memset(&params, 0, sizeof(params));

        if (!rdev->ops->add_station)
                return -EOPNOTSUPP;

        /* mandatory attributes */
        if (!info->attrs[NL80211_ATTR_MAC])
                return -EINVAL;

        if (!info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL])
                return -EINVAL;

        if (!info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES])
                return -EINVAL;

        if (!info->attrs[NL80211_ATTR_STA_AID] &&
            !info->attrs[NL80211_ATTR_PEER_AID])
                return -EINVAL;

        /* the link ID is validated against wdev->valid_links further down */
        params.link_sta_params.link_id =
                nl80211_link_id_or_invalid(info->attrs);

        /*
         * With NL80211_ATTR_MLD_ADDR the station is addressed by its MLD
         * address and NL80211_ATTR_MAC carries the link address.
         */
        if (info->attrs[NL80211_ATTR_MLD_ADDR]) {
                mac_addr = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]);
                params.link_sta_params.mld_mac = mac_addr;
                params.link_sta_params.link_mac =
                        nla_data(info->attrs[NL80211_ATTR_MAC]);
                if (!is_valid_ether_addr(params.link_sta_params.link_mac))
                        return -EINVAL;
        } else {
                mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
        }

        params.link_sta_params.supported_rates =
                nla_data(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
        params.link_sta_params.supported_rates_len =
                nla_len(info->attrs[NL80211_ATTR_STA_SUPPORTED_RATES]);
        params.listen_interval =
                nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);

        if (info->attrs[NL80211_ATTR_VLAN_ID])
                params.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]);

        if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) {
                params.support_p2p_ps =
                        nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]);
        } else {
                /*
                 * if not specified, assume it's supported for P2P GO interface,
                 * and is NOT supported for AP interface
                 */
                params.support_p2p_ps =
                        dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_GO;
        }

        /* the peer AID takes precedence when both AID attributes are given */
        if (info->attrs[NL80211_ATTR_PEER_AID])
                params.aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]);
        else
                params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]);

        if (info->attrs[NL80211_ATTR_STA_CAPABILITY]) {
                params.capability =
                        nla_get_u16(info->attrs[NL80211_ATTR_STA_CAPABILITY]);
                params.sta_modify_mask |= STATION_PARAM_APPLY_CAPABILITY;
        }

        if (info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]) {
                params.ext_capab =
                        nla_data(info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]);
                params.ext_capab_len =
                        nla_len(info->attrs[NL80211_ATTR_STA_EXT_CAPABILITY]);
        }

        if (info->attrs[NL80211_ATTR_HT_CAPABILITY])
                params.link_sta_params.ht_capa =
                        nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]);

        if (info->attrs[NL80211_ATTR_VHT_CAPABILITY])
                params.link_sta_params.vht_capa =
                        nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);

        /* EHT capabilities are only considered together with HE ones */
        if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) {
                params.link_sta_params.he_capa =
                        nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
                params.link_sta_params.he_capa_len =
                        nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);

                if (info->attrs[NL80211_ATTR_EHT_CAPABILITY]) {
                        params.link_sta_params.eht_capa =
                                nla_data(info->attrs[NL80211_ATTR_EHT_CAPABILITY]);
                        params.link_sta_params.eht_capa_len =
                                nla_len(info->attrs[NL80211_ATTR_EHT_CAPABILITY]);

                        if (!ieee80211_eht_capa_size_ok((const u8 *)params.link_sta_params.he_capa,
                                                        (const u8 *)params.link_sta_params.eht_capa,
                                                        params.link_sta_params.eht_capa_len,
                                                        false))
                                return -EINVAL;
                }
        }

        if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
                params.link_sta_params.he_6ghz_capa =
                        nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);

        if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
                params.link_sta_params.opmode_notif_used = true;
                params.link_sta_params.opmode_notif =
                        nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]);
        }

        if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION])
                params.plink_action =
                        nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]);

        if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT])
                params.airtime_weight =
                        nla_get_u16(info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]);

        /* a non-zero airtime weight needs the airtime fairness feature */
        if (params.airtime_weight &&
            !wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
                return -EOPNOTSUPP;

        err = nl80211_parse_sta_txpower_setting(info,
                                                &params.link_sta_params.txpwr,
                                                &params.link_sta_params.txpwr_set);
        if (err)
                return err;

        err = nl80211_parse_sta_channel_info(info, &params);
        if (err)
                return err;

        err = nl80211_parse_sta_wme(info, &params);
        if (err)
                return err;

        if (parse_station_flags(info, dev->ieee80211_ptr->iftype, &params))
                return -EINVAL;

        /* HT/VHT requires QoS, but if we don't have that just ignore HT/VHT
         * as userspace might just pass through the capabilities from the IEs
         * directly, rather than enforcing this restriction and returning an
         * error in this case.
         */
        if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) {
                params.link_sta_params.ht_capa = NULL;
                params.link_sta_params.vht_capa = NULL;

                /* HE and EHT require WME */
                if (params.link_sta_params.he_capa_len ||
                    params.link_sta_params.he_6ghz_capa ||
                    params.link_sta_params.eht_capa_len)
                        return -EINVAL;
        }

        /* Ensure that HT/VHT capabilities are not set for 6 GHz HE STA */
        if (params.link_sta_params.he_6ghz_capa &&
            (params.link_sta_params.ht_capa || params.link_sta_params.vht_capa))
                return -EINVAL;

        /* When you run into this, adjust the code below for the new flag */
        BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 8);

        /*
         * Per interface type checks. Only the AP-like case obtains
         * params.vlan, and it does so as its very last step, which is why
         * all the plain returns inside this switch need no dev_put().
         */
        switch (dev->ieee80211_ptr->iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_AP_VLAN:
        case NL80211_IFTYPE_P2P_GO:
                /* ignore WME attributes if iface/sta is not capable */
                if (!(rdev->wiphy.flags & WIPHY_FLAG_AP_UAPSD) ||
                    !(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME)))
                        params.sta_modify_mask &= ~STATION_PARAM_APPLY_UAPSD;

                /* TDLS peers cannot be added */
                if ((params.sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) ||
                    info->attrs[NL80211_ATTR_PEER_AID])
                        return -EINVAL;
                /* but don't bother the driver with it */
                params.sta_flags_mask &= ~BIT(NL80211_STA_FLAG_TDLS_PEER);

                /* allow authenticated/associated only if driver handles it */
                if (!(rdev->wiphy.features &
                                NL80211_FEATURE_FULL_AP_CLIENT_STATE) &&
                    params.sta_flags_mask & auth_assoc)
                        return -EINVAL;

                /* the SPP A-MSDU flag needs the matching extended feature */
                if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT) &&
                    params.sta_flags_mask & BIT(NL80211_STA_FLAG_SPP_AMSDU))
                        return -EINVAL;

                /* Older userspace, or userspace wanting to be compatible with
                 * !NL80211_FEATURE_FULL_AP_CLIENT_STATE, will not set the auth
                 * and assoc flags in the mask, but assumes the station will be
                 * added as associated anyway since this was the required driver
                 * behaviour before NL80211_FEATURE_FULL_AP_CLIENT_STATE was
                 * introduced.
                 * In order to not bother drivers with this quirk in the API
                 * set the flags in both the mask and set for new stations in
                 * this case.
                 */
                if (!(params.sta_flags_mask & auth_assoc)) {
                        params.sta_flags_mask |= auth_assoc;
                        params.sta_flags_set |= auth_assoc;
                }

                /* must be last in here for error handling */
                params.vlan = get_vlan(info, rdev);
                if (IS_ERR(params.vlan))
                        return PTR_ERR(params.vlan);
                break;
        case NL80211_IFTYPE_MESH_POINT:
                /* ignore uAPSD data */
                params.sta_modify_mask &= ~STATION_PARAM_APPLY_UAPSD;

                /* associated is disallowed */
                if (params.sta_flags_mask & BIT(NL80211_STA_FLAG_ASSOCIATED))
                        return -EINVAL;
                /* TDLS peers cannot be added */
                if ((params.sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) ||
                    info->attrs[NL80211_ATTR_PEER_AID])
                        return -EINVAL;
                break;
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_P2P_CLIENT:
                /* ignore uAPSD data */
                params.sta_modify_mask &= ~STATION_PARAM_APPLY_UAPSD;

                /* these are disallowed */
                if (params.sta_flags_mask &
                                (BIT(NL80211_STA_FLAG_ASSOCIATED) |
                                 BIT(NL80211_STA_FLAG_AUTHENTICATED)))
                        return -EINVAL;
                /* Only TDLS peers can be added */
                if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)))
                        return -EINVAL;
                /* Can only add if TDLS ... */
                if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS))
                        return -EOPNOTSUPP;
                /* ... with external setup is supported */
                if (!(rdev->wiphy.flags & WIPHY_FLAG_TDLS_EXTERNAL_SETUP))
                        return -EOPNOTSUPP;
                /*
                 * Older wpa_supplicant versions always mark the TDLS peer
                 * as authorized, but it shouldn't yet be.
                 */
                params.sta_flags_mask &= ~BIT(NL80211_STA_FLAG_AUTHORIZED);
                break;
        default:
                return -EOPNOTSUPP;
        }

        /* be aware of params.vlan when changing code here */

        /*
         * An interface with valid links requires a link ID that is one of
         * them; an interface without any must not be given a link ID.
         */
        if (wdev->valid_links) {
                if (params.link_sta_params.link_id < 0) {
                        err = -EINVAL;
                        goto out;
                }
                if (!(wdev->valid_links & BIT(params.link_sta_params.link_id))) {
                        err = -ENOLINK;
                        goto out;
                }
        } else {
                if (params.link_sta_params.link_id >= 0) {
                        err = -EINVAL;
                        goto out;
                }
        }
        err = rdev_add_station(rdev, dev, mac_addr, &params);
out:
        /* params.vlan is NULL here unless the AP-like case obtained one */
        dev_put(params.vlan);
        return err;
}

static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct station_del_parameters params;
        int link_id = nl80211_link_id_or_invalid(info->attrs);

        memset(&params, 0, sizeof(params));

        if (info->attrs[NL80211_ATTR_MAC])
                params.mac = nla_data(info->attrs[NL80211_ATTR_MAC]);

        switch (wdev->iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_AP_VLAN:
        case NL80211_IFTYPE_MESH_POINT:
        case NL80211_IFTYPE_P2P_GO:
                /* always accept these */
                break;
        case NL80211_IFTYPE_ADHOC:
                /* conditionally accept */
                if (wiphy_ext_feature_isset(&rdev->wiphy,
                                            NL80211_EXT_FEATURE_DEL_IBSS_STA))
                        break;
                return -EINVAL;
        default:
                return -EINVAL;
        }

        if (!rdev->ops->del_station)
                return -EOPNOTSUPP;

        if (info->attrs[NL80211_ATTR_MGMT_SUBTYPE]) {
                params.subtype =
                        nla_get_u8(info->attrs[NL80211_ATTR_MGMT_SUBTYPE]);
                if (params.subtype != IEEE80211_STYPE_DISASSOC >> 4 &&
                    params.subtype != IEEE80211_STYPE_DEAUTH >> 4)
                        return -EINVAL;
        } else {
                /* Default to Deauthentication frame */
                params.subtype = IEEE80211_STYPE_DEAUTH >> 4;
        }

        if (info->attrs[NL80211_ATTR_REASON_CODE]) {
                params.reason_code =
                        nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]);
                if (params.reason_code == 0)
                        return -EINVAL; /* 0 is reserved */
        } else {
                /* Default to reason code 2 */
                params.reason_code = WLAN_REASON_PREV_AUTH_NOT_VALID;
        }

        /* Link ID not expected in case of non-ML operation */
        if (!wdev->valid_links && link_id != -1)
                return -EINVAL;

        /* If given, a valid link ID should be passed during MLO */
        if (wdev->valid_links && link_id >= 0 &&
            !(wdev->valid_links & BIT(link_id)))
                return -EINVAL;

        params.link_id = link_id;

        return rdev_del_station(rdev, dev, &params);
}

/*
 * Append one NL80211_CMD_NEW_MPATH message describing a mesh path to @msg.
 *
 * The interface index, destination, next hop and generation are always
 * emitted; the members of @pinfo are emitted inside a nested
 * NL80211_ATTR_MPATH_INFO attribute, each only when its MPATH_INFO_* bit is
 * set in pinfo->filled.
 *
 * Returns 0 on success, -1 if the message header could not be added, or
 * -EMSGSIZE (after cancelling the partial message) if an attribute did not
 * fit.
 */
static int nl80211_send_mpath(struct sk_buff *msg, u32 portid, u32 seq,
                                int flags, struct net_device *dev,
                                u8 *dst, u8 *next_hop,
                                struct mpath_info *pinfo)
{
        struct nlattr *nest;
        void *hdr;

        hdr = nl80211hdr_put(msg, portid, seq, flags, NL80211_CMD_NEW_MPATH);
        if (!hdr)
                return -1;

        /* Top-level attributes identifying the path */
        if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex))
                goto nla_put_failure;
        if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, dst))
                goto nla_put_failure;
        if (nla_put(msg, NL80211_ATTR_MPATH_NEXT_HOP, ETH_ALEN, next_hop))
                goto nla_put_failure;
        if (nla_put_u32(msg, NL80211_ATTR_GENERATION, pinfo->generation))
                goto nla_put_failure;

        nest = nla_nest_start_noflag(msg, NL80211_ATTR_MPATH_INFO);
        if (!nest)
                goto nla_put_failure;

        /* Optional per-path details, gated by the "filled" bitmap */
        if ((pinfo->filled & MPATH_INFO_FRAME_QLEN) &&
            nla_put_u32(msg, NL80211_MPATH_INFO_FRAME_QLEN,
                        pinfo->frame_qlen))
                goto nla_put_failure;
        if ((pinfo->filled & MPATH_INFO_SN) &&
            nla_put_u32(msg, NL80211_MPATH_INFO_SN, pinfo->sn))
                goto nla_put_failure;
        if ((pinfo->filled & MPATH_INFO_METRIC) &&
            nla_put_u32(msg, NL80211_MPATH_INFO_METRIC, pinfo->metric))
                goto nla_put_failure;
        if ((pinfo->filled & MPATH_INFO_EXPTIME) &&
            nla_put_u32(msg, NL80211_MPATH_INFO_EXPTIME, pinfo->exptime))
                goto nla_put_failure;
        if ((pinfo->filled & MPATH_INFO_FLAGS) &&
            nla_put_u8(msg, NL80211_MPATH_INFO_FLAGS, pinfo->flags))
                goto nla_put_failure;
        if ((pinfo->filled & MPATH_INFO_DISCOVERY_TIMEOUT) &&
            nla_put_u32(msg, NL80211_MPATH_INFO_DISCOVERY_TIMEOUT,
                        pinfo->discovery_timeout))
                goto nla_put_failure;
        if ((pinfo->filled & MPATH_INFO_DISCOVERY_RETRIES) &&
            nla_put_u8(msg, NL80211_MPATH_INFO_DISCOVERY_RETRIES,
                       pinfo->discovery_retries))
                goto nla_put_failure;
        if ((pinfo->filled & MPATH_INFO_HOP_COUNT) &&
            nla_put_u8(msg, NL80211_MPATH_INFO_HOP_COUNT,
                       pinfo->hop_count))
                goto nla_put_failure;
        if ((pinfo->filled & MPATH_INFO_PATH_CHANGE) &&
            nla_put_u32(msg, NL80211_MPATH_INFO_PATH_CHANGE,
                        pinfo->path_change_count))
                goto nla_put_failure;

        nla_nest_end(msg, nest);
        genlmsg_end(msg, hdr);
        return 0;

 nla_put_failure:
        genlmsg_cancel(msg, hdr);
        return -EMSGSIZE;
}

/*
 * Netlink dump callback that emits one message per mesh path.
 *
 * The index of the next path to report is kept in cb->args[2] between
 * invocations.  Paths are fetched from the driver one index at a time until
 * it answers -ENOENT or the skb cannot take another message.
 *
 * Returns skb->len when the walk stopped normally, or a negative errno from
 * nl80211_prepare_wdev_dump()/rdev_dump_mpath(), or -EOPNOTSUPP when the
 * driver lacks dump_mpath or the interface is not a mesh point.
 */
static int nl80211_dump_mpath(struct sk_buff *skb,
                              struct netlink_callback *cb)
{
        struct cfg80211_registered_device *rdev;
        struct wireless_dev *wdev;
        struct mpath_info pinfo;
        u8 next_hop[ETH_ALEN];
        u8 dst[ETH_ALEN];
        int path_idx = cb->args[2];
        int err;

        err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, NULL);
        if (err)
                return err;
        /* nl80211_prepare_wdev_dump acquired it in the successful case */
        __acquire(&rdev->wiphy.mtx);

        if (!rdev->ops->dump_mpath ||
            wdev->iftype != NL80211_IFTYPE_MESH_POINT) {
                err = -EOPNOTSUPP;
                goto out_err;
        }

        /* path_idx only advances after an entry was fully written */
        for (;; path_idx++) {
                err = rdev_dump_mpath(rdev, wdev->netdev, path_idx, dst,
                                      next_hop, &pinfo);
                if (err == -ENOENT)
                        break;
                if (err)
                        goto out_err;

                if (nl80211_send_mpath(skb, NETLINK_CB(cb->skb).portid,
                                       cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                       wdev->netdev, dst, next_hop,
                                       &pinfo) < 0)
                        break;
        }

        /* remember where to resume on the next callback invocation */
        cb->args[2] = path_idx;
        err = skb->len;
 out_err:
        wiphy_unlock(&rdev->wiphy);
        return err;
}

/*
 * Netlink handler that looks up a single mesh path by destination address
 * (NL80211_ATTR_MAC) and replies with an NL80211_CMD_NEW_MPATH message.
 *
 * Returns the result of genlmsg_reply() on success; -EINVAL if the MAC
 * attribute is missing; -EOPNOTSUPP if the driver lacks get_mpath or the
 * interface is not a mesh point; the driver's error from rdev_get_mpath();
 * -ENOMEM if the reply cannot be allocated; -ENOBUFS if it cannot be filled.
 */
static int nl80211_get_mpath(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct mpath_info pinfo;
        struct sk_buff *msg;
        u8 next_hop[ETH_ALEN];
        u8 *dst;
        int err;

        memset(&pinfo, 0, sizeof(pinfo));

        if (!info->attrs[NL80211_ATTR_MAC])
                return -EINVAL;
        dst = nla_data(info->attrs[NL80211_ATTR_MAC]);

        if (!rdev->ops->get_mpath ||
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
                return -EOPNOTSUPP;

        err = rdev_get_mpath(rdev, dev, dst, next_hop, &pinfo);
        if (err)
                return err;

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg)
                return -ENOMEM;

        if (nl80211_send_mpath(msg, info->snd_portid, info->snd_seq, 0,
                               dev, dst, next_hop, &pinfo) >= 0)
                return genlmsg_reply(msg, info);

        /* could not build the reply: drop it */
        nlmsg_free(msg);
        return -ENOBUFS;
}

/*
 * Netlink handler that changes the next hop of an existing mesh path.
 *
 * Both NL80211_ATTR_MAC (destination) and NL80211_ATTR_MPATH_NEXT_HOP are
 * mandatory.  Returns -EINVAL if either is missing, -EOPNOTSUPP if the
 * driver lacks change_mpath or the interface is not a mesh point, otherwise
 * the result of rdev_change_mpath().
 */
static int nl80211_set_mpath(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        u8 *dst, *next_hop;

        if (!info->attrs[NL80211_ATTR_MAC] ||
            !info->attrs[NL80211_ATTR_MPATH_NEXT_HOP])
                return -EINVAL;

        if (!rdev->ops->change_mpath ||
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
                return -EOPNOTSUPP;

        dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
        next_hop = nla_data(info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]);

        return rdev_change_mpath(rdev, dev, dst, next_hop);
}

/*
 * Netlink handler that adds a mesh path.
 *
 * Both NL80211_ATTR_MAC (destination) and NL80211_ATTR_MPATH_NEXT_HOP are
 * mandatory.  Returns -EINVAL if either is missing, -EOPNOTSUPP if the
 * driver lacks add_mpath or the interface is not a mesh point, otherwise
 * the result of rdev_add_mpath().
 */
static int nl80211_new_mpath(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        u8 *dst, *next_hop;

        if (!info->attrs[NL80211_ATTR_MAC] ||
            !info->attrs[NL80211_ATTR_MPATH_NEXT_HOP])
                return -EINVAL;

        if (!rdev->ops->add_mpath ||
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
                return -EOPNOTSUPP;

        dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
        next_hop = nla_data(info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]);

        return rdev_add_mpath(rdev, dev, dst, next_hop);
}

/*
 * Netlink handler that deletes a mesh path.
 *
 * NL80211_ATTR_MAC is optional here: when it is absent a NULL destination
 * is handed to the driver.  Returns -EOPNOTSUPP if the driver lacks
 * del_mpath or the interface is not a mesh point, otherwise the result of
 * rdev_del_mpath().
 */
static int nl80211_del_mpath(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        u8 *dst;

        if (!rdev->ops->del_mpath ||
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
                return -EOPNOTSUPP;

        if (info->attrs[NL80211_ATTR_MAC])
                dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
        else
                dst = NULL;

        return rdev_del_mpath(rdev, dev, dst);
}

/*
 * Netlink handler that looks up a single mesh proxy path (MPP) entry by
 * destination address (NL80211_ATTR_MAC) and replies using the same message
 * layout as a mesh path, with the proxy address in the next-hop attribute.
 *
 * Returns the result of genlmsg_reply() on success; -EINVAL if the MAC
 * attribute is missing; -EOPNOTSUPP if the driver lacks get_mpp or the
 * interface is not a mesh point; the driver's error from rdev_get_mpp();
 * -ENOMEM if the reply cannot be allocated; -ENOBUFS if it cannot be filled.
 */
static int nl80211_get_mpp(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct mpath_info pinfo;
        struct sk_buff *msg;
        u8 mpp[ETH_ALEN];
        u8 *dst;
        int err;

        memset(&pinfo, 0, sizeof(pinfo));

        if (!info->attrs[NL80211_ATTR_MAC])
                return -EINVAL;
        dst = nla_data(info->attrs[NL80211_ATTR_MAC]);

        if (!rdev->ops->get_mpp ||
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
                return -EOPNOTSUPP;

        err = rdev_get_mpp(rdev, dev, dst, mpp, &pinfo);
        if (err)
                return err;

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg)
                return -ENOMEM;

        if (nl80211_send_mpath(msg, info->snd_portid, info->snd_seq, 0,
                               dev, dst, mpp, &pinfo) >= 0)
                return genlmsg_reply(msg, info);

        /* could not build the reply: drop it */
        nlmsg_free(msg);
        return -ENOBUFS;
}

/*
 * Netlink dump callback that emits one message per mesh proxy path entry.
 *
 * The index of the next entry to report is kept in cb->args[2] between
 * invocations.  Entries are fetched from the driver one index at a time
 * until it answers -ENOENT or the skb cannot take another message.
 *
 * Returns skb->len when the walk stopped normally, or a negative errno from
 * nl80211_prepare_wdev_dump()/rdev_dump_mpp(), or -EOPNOTSUPP when the
 * driver lacks dump_mpp or the interface is not a mesh point.
 */
static int nl80211_dump_mpp(struct sk_buff *skb,
                            struct netlink_callback *cb)
{
        struct cfg80211_registered_device *rdev;
        struct wireless_dev *wdev;
        struct mpath_info pinfo;
        u8 mpp[ETH_ALEN];
        u8 dst[ETH_ALEN];
        int path_idx = cb->args[2];
        int err;

        err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, NULL);
        if (err)
                return err;
        /* nl80211_prepare_wdev_dump acquired it in the successful case */
        __acquire(&rdev->wiphy.mtx);

        if (!rdev->ops->dump_mpp ||
            wdev->iftype != NL80211_IFTYPE_MESH_POINT) {
                err = -EOPNOTSUPP;
                goto out_err;
        }

        /* path_idx only advances after an entry was fully written */
        for (;; path_idx++) {
                err = rdev_dump_mpp(rdev, wdev->netdev, path_idx, dst,
                                    mpp, &pinfo);
                if (err == -ENOENT)
                        break;
                if (err)
                        goto out_err;

                if (nl80211_send_mpath(skb, NETLINK_CB(cb->skb).portid,
                                       cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                       wdev->netdev, dst, mpp,
                                       &pinfo) < 0)
                        break;
        }

        /* remember where to resume on the next callback invocation */
        cb->args[2] = path_idx;
        err = skb->len;
 out_err:
        wiphy_unlock(&rdev->wiphy);
        return err;
}

/*
 * Netlink handler that changes BSS parameters on an AP or P2P GO interface.
 *
 * Every tunable starts out as -1 ("leave unchanged") and is overwritten
 * only when the matching attribute is present.  The P2P CT window and
 * opportunistic power-save attributes are accepted only on a P2P GO
 * interface, and non-zero values additionally require the respective wiphy
 * feature flag.
 *
 * Returns -EINVAL for a rejected P2P attribute, -EOPNOTSUPP if the driver
 * lacks change_bss or the interface is neither AP nor P2P GO, otherwise the
 * result of rdev_change_bss().
 */
static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct bss_parameters params;

        memset(&params, 0, sizeof(params));
        params.link_id = nl80211_link_id_or_invalid(info->attrs);

        /* default to not changing parameters */
        params.use_cts_prot = -1;
        params.use_short_preamble = -1;
        params.use_short_slot_time = -1;
        params.ap_isolate = -1;
        params.ht_opmode = -1;
        params.p2p_ctwindow = -1;
        params.p2p_opp_ps = -1;

        if (info->attrs[NL80211_ATTR_BSS_CTS_PROT])
                params.use_cts_prot =
                        nla_get_u8(info->attrs[NL80211_ATTR_BSS_CTS_PROT]);

        if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE])
                params.use_short_preamble =
                        nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]);

        if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME])
                params.use_short_slot_time =
                        nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]);

        if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) {
                params.basic_rates =
                        nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
                params.basic_rates_len =
                        nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
        }

        if (info->attrs[NL80211_ATTR_AP_ISOLATE])
                params.ap_isolate =
                        !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]);

        if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE])
                params.ht_opmode =
                        nla_get_u16(info->attrs[NL80211_ATTR_BSS_HT_OPMODE]);

        if (info->attrs[NL80211_ATTR_P2P_CTWINDOW]) {
                if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
                        return -EINVAL;

                params.p2p_ctwindow =
                        nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]);
                /* a non-zero CT window needs explicit driver support */
                if (params.p2p_ctwindow != 0 &&
                    !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN))
                        return -EINVAL;
        }

        if (info->attrs[NL80211_ATTR_P2P_OPPPS]) {
                if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
                        return -EINVAL;

                params.p2p_opp_ps =
                        nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]);
                /* enabling opportunistic PS needs explicit driver support */
                if (params.p2p_opp_ps &&
                    !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS))
                        return -EINVAL;
        }

        if (!rdev->ops->change_bss)
                return -EOPNOTSUPP;

        switch (dev->ieee80211_ptr->iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_P2P_GO:
                break;
        default:
                return -EOPNOTSUPP;
        }

        return rdev_change_bss(rdev, dev, &params);
}

/*
 * Netlink handler for a user-space regulatory hint.
 *
 * The hint type comes from NL80211_ATTR_USER_REG_HINT_TYPE and defaults to
 * NL80211_USER_REG_HINT_USER.  USER and CELL_BASE hints require an alpha2
 * attribute and are forwarded to regulatory_hint_user(); INDOOR hints are
 * forwarded to regulatory_hint_indoor().
 *
 * Returns -EINPROGRESS while no regulatory domain is set yet, -EINVAL for a
 * missing alpha2 or an unknown hint type, the result of
 * regulatory_hint_user(), or 0 for an indoor hint.
 */
static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
{
        enum nl80211_user_reg_hint_type user_reg_hint_type;
        u32 owner_nlportid;
        bool is_indoor;
        char *data;

        /*
         * You should only get this when cfg80211 hasn't yet initialized
         * completely when built-in to the kernel right between the time
         * window between nl80211_init() and regulatory_init(), if that is
         * even possible.
         */
        if (unlikely(!rcu_access_pointer(cfg80211_regdomain)))
                return -EINPROGRESS;

        user_reg_hint_type = NL80211_USER_REG_HINT_USER;
        if (info->attrs[NL80211_ATTR_USER_REG_HINT_TYPE])
                user_reg_hint_type =
                  nla_get_u32(info->attrs[NL80211_ATTR_USER_REG_HINT_TYPE]);

        switch (user_reg_hint_type) {
        case NL80211_USER_REG_HINT_USER:
        case NL80211_USER_REG_HINT_CELL_BASE:
                if (!info->attrs[NL80211_ATTR_REG_ALPHA2])
                        return -EINVAL;

                data = nla_data(info->attrs[NL80211_ATTR_REG_ALPHA2]);
                return regulatory_hint_user(data, user_reg_hint_type);
        case NL80211_USER_REG_HINT_INDOOR:
                /*
                 * Without a socket owner the hint is unconditionally
                 * "indoor" and not tied to any port; with one, the indoor
                 * state follows the presence of NL80211_ATTR_REG_INDOOR.
                 */
                owner_nlportid = 0;
                is_indoor = true;
                if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
                        owner_nlportid = info->snd_portid;
                        is_indoor = !!info->attrs[NL80211_ATTR_REG_INDOOR];
                }

                regulatory_hint_indoor(is_indoor, owner_nlportid);
                return 0;
        default:
                return -EINVAL;
        }
}

/*
 * Netlink handler that forwards directly to reg_reload_regdb() and returns
 * its result; neither @skb nor @info is inspected.
 */
static int nl80211_reload_regdb(struct sk_buff *skb, struct genl_info *info)
{
        return reg_reload_regdb();
}

/*
 * Netlink handler that reports the current mesh configuration.
 *
 * When the interface has no mesh ID set (wdev->u.mesh.id_len == 0) the
 * built-in default_mesh_config is reported; otherwise the values are read
 * from the driver.  All values, including the interface index, are emitted
 * inside a nested NL80211_ATTR_MESH_CONFIG attribute.
 *
 * Returns the result of genlmsg_reply() on success; -EOPNOTSUPP if the
 * interface is not a mesh point or the driver lacks get_mesh_config; the
 * driver's error from rdev_get_mesh_config(); -ENOMEM if the reply cannot
 * be allocated; -ENOBUFS if it cannot be filled.
 */
static int nl80211_get_mesh_config(struct sk_buff *skb,
                                   struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct mesh_config cur_params;
        struct nlattr *pinfoattr;
        struct sk_buff *msg;
        void *hdr;
        int err;

        if (wdev->iftype != NL80211_IFTYPE_MESH_POINT ||
            !rdev->ops->get_mesh_config)
                return -EOPNOTSUPP;

        if (wdev->u.mesh.id_len) {
                err = rdev_get_mesh_config(rdev, dev, &cur_params);
                if (err)
                        return err;
        } else {
                /* If not connected, get default parameters */
                memcpy(&cur_params, &default_mesh_config, sizeof(cur_params));
        }

        /* Draw up a netlink message to send back */
        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg)
                return -ENOMEM;

        hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
                             NL80211_CMD_GET_MESH_CONFIG);
        if (!hdr)
                goto nla_put_failure;

        pinfoattr = nla_nest_start_noflag(msg, NL80211_ATTR_MESH_CONFIG);
        if (!pinfoattr)
                goto nla_put_failure;

        /* Interface index and peer-link management parameters */
        if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
            nla_put_u16(msg, NL80211_MESHCONF_RETRY_TIMEOUT,
                        cur_params.dot11MeshRetryTimeout) ||
            nla_put_u16(msg, NL80211_MESHCONF_CONFIRM_TIMEOUT,
                        cur_params.dot11MeshConfirmTimeout) ||
            nla_put_u16(msg, NL80211_MESHCONF_HOLDING_TIMEOUT,
                        cur_params.dot11MeshHoldingTimeout) ||
            nla_put_u16(msg, NL80211_MESHCONF_MAX_PEER_LINKS,
                        cur_params.dot11MeshMaxPeerLinks) ||
            nla_put_u8(msg, NL80211_MESHCONF_MAX_RETRIES,
                       cur_params.dot11MeshMaxRetries) ||
            nla_put_u8(msg, NL80211_MESHCONF_TTL,
                       cur_params.dot11MeshTTL) ||
            nla_put_u8(msg, NL80211_MESHCONF_ELEMENT_TTL,
                       cur_params.element_ttl) ||
            nla_put_u8(msg, NL80211_MESHCONF_AUTO_OPEN_PLINKS,
                       cur_params.auto_open_plinks) ||
            nla_put_u32(msg, NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR,
                        cur_params.dot11MeshNbrOffsetMaxNeighbor))
                goto nla_put_failure;

        /* Path selection (HWMP) and forwarding parameters */
        if (nla_put_u8(msg, NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES,
                       cur_params.dot11MeshHWMPmaxPREQretries) ||
            nla_put_u32(msg, NL80211_MESHCONF_PATH_REFRESH_TIME,
                        cur_params.path_refresh_time) ||
            nla_put_u16(msg, NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT,
                        cur_params.min_discovery_timeout) ||
            nla_put_u32(msg, NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT,
                        cur_params.dot11MeshHWMPactivePathTimeout) ||
            nla_put_u16(msg, NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL,
                        cur_params.dot11MeshHWMPpreqMinInterval) ||
            nla_put_u16(msg, NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL,
                        cur_params.dot11MeshHWMPperrMinInterval) ||
            nla_put_u16(msg, NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
                        cur_params.dot11MeshHWMPnetDiameterTraversalTime) ||
            nla_put_u8(msg, NL80211_MESHCONF_HWMP_ROOTMODE,
                       cur_params.dot11MeshHWMPRootMode) ||
            nla_put_u16(msg, NL80211_MESHCONF_HWMP_RANN_INTERVAL,
                        cur_params.dot11MeshHWMPRannInterval) ||
            nla_put_u8(msg, NL80211_MESHCONF_GATE_ANNOUNCEMENTS,
                       cur_params.dot11MeshGateAnnouncementProtocol) ||
            nla_put_u8(msg, NL80211_MESHCONF_FORWARDING,
                       cur_params.dot11MeshForwarding))
                goto nla_put_failure;

        /* Remaining parameters, in the same order as before */
        if (nla_put_s32(msg, NL80211_MESHCONF_RSSI_THRESHOLD,
                        cur_params.rssi_threshold) ||
            nla_put_u32(msg, NL80211_MESHCONF_HT_OPMODE,
                        cur_params.ht_opmode) ||
            nla_put_u32(msg, NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT,
                        cur_params.dot11MeshHWMPactivePathToRootTimeout) ||
            nla_put_u16(msg, NL80211_MESHCONF_HWMP_ROOT_INTERVAL,
                        cur_params.dot11MeshHWMProotInterval) ||
            nla_put_u16(msg, NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL,
                        cur_params.dot11MeshHWMPconfirmationInterval) ||
            nla_put_u32(msg, NL80211_MESHCONF_POWER_MODE,
                        cur_params.power_mode) ||
            nla_put_u16(msg, NL80211_MESHCONF_AWAKE_WINDOW,
                        cur_params.dot11MeshAwakeWindowDuration) ||
            nla_put_u32(msg, NL80211_MESHCONF_PLINK_TIMEOUT,
                        cur_params.plink_timeout) ||
            nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_GATE,
                       cur_params.dot11MeshConnectedToMeshGate) ||
            nla_put_u8(msg, NL80211_MESHCONF_NOLEARN,
                       cur_params.dot11MeshNolearn) ||
            nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_AS,
                       cur_params.dot11MeshConnectedToAuthServer))
                goto nla_put_failure;

        nla_nest_end(msg, pinfoattr);
        genlmsg_end(msg, hdr);
        return genlmsg_reply(msg, info);

 nla_put_failure:
        nlmsg_free(msg);
        return -ENOBUFS;
}

/*
 * Validation policy for the attributes nested inside
 * NL80211_ATTR_MESH_CONFIG, indexed by NL80211_MESHCONF_* attribute number.
 * It is the policy handed to nla_parse_nested_deprecated() by
 * nl80211_parse_mesh_config().
 *
 * Entries using NLA_POLICY_RANGE/MIN/MAX carry their accepted value range
 * here; entries that only name a type get no range check from this table
 * (HT_OPMODE, for instance, is bit-checked by the parser itself).
 */
static const struct nla_policy
nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = {
        [NL80211_MESHCONF_RETRY_TIMEOUT] =
                NLA_POLICY_RANGE(NLA_U16, 1, 255),
        [NL80211_MESHCONF_CONFIRM_TIMEOUT] =
                NLA_POLICY_RANGE(NLA_U16, 1, 255),
        [NL80211_MESHCONF_HOLDING_TIMEOUT] =
                NLA_POLICY_RANGE(NLA_U16, 1, 255),
        [NL80211_MESHCONF_MAX_PEER_LINKS] =
                NLA_POLICY_RANGE(NLA_U16, 0, 255),
        [NL80211_MESHCONF_MAX_RETRIES] = NLA_POLICY_MAX(NLA_U8, 16),
        [NL80211_MESHCONF_TTL] = NLA_POLICY_MIN(NLA_U8, 1),
        [NL80211_MESHCONF_ELEMENT_TTL] = NLA_POLICY_MIN(NLA_U8, 1),
        [NL80211_MESHCONF_AUTO_OPEN_PLINKS] = NLA_POLICY_MAX(NLA_U8, 1),
        [NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR] =
                NLA_POLICY_RANGE(NLA_U32, 1, 255),
        [NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES] = { .type = NLA_U8 },
        [NL80211_MESHCONF_PATH_REFRESH_TIME] = { .type = NLA_U32 },
        [NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT] = NLA_POLICY_MIN(NLA_U16, 1),
        [NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT] = { .type = NLA_U32 },
        [NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL] =
                NLA_POLICY_MIN(NLA_U16, 1),
        [NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL] =
                NLA_POLICY_MIN(NLA_U16, 1),
        [NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME] =
                NLA_POLICY_MIN(NLA_U16, 1),
        [NL80211_MESHCONF_HWMP_ROOTMODE] = NLA_POLICY_MAX(NLA_U8, 4),
        [NL80211_MESHCONF_HWMP_RANN_INTERVAL] =
                NLA_POLICY_MIN(NLA_U16, 1),
        [NL80211_MESHCONF_GATE_ANNOUNCEMENTS] = NLA_POLICY_MAX(NLA_U8, 1),
        [NL80211_MESHCONF_FORWARDING] = NLA_POLICY_MAX(NLA_U8, 1),
        [NL80211_MESHCONF_RSSI_THRESHOLD] =
                NLA_POLICY_RANGE(NLA_S32, -255, 0),
        [NL80211_MESHCONF_HT_OPMODE] = { .type = NLA_U16 },
        [NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT] = { .type = NLA_U32 },
        [NL80211_MESHCONF_HWMP_ROOT_INTERVAL] =
                NLA_POLICY_MIN(NLA_U16, 1),
        [NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL] =
                NLA_POLICY_MIN(NLA_U16, 1),
        [NL80211_MESHCONF_POWER_MODE] =
                NLA_POLICY_RANGE(NLA_U32,
                                 NL80211_MESH_POWER_ACTIVE,
                                 NL80211_MESH_POWER_MAX),
        [NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 },
        [NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 },
        [NL80211_MESHCONF_CONNECTED_TO_GATE] = NLA_POLICY_RANGE(NLA_U8, 0, 1),
        [NL80211_MESHCONF_NOLEARN] = NLA_POLICY_RANGE(NLA_U8, 0, 1),
        [NL80211_MESHCONF_CONNECTED_TO_AS] = NLA_POLICY_RANGE(NLA_U8, 0, 1),
};

/*
 * Validation policy for the attributes nested inside
 * NL80211_ATTR_MESH_SETUP, indexed by NL80211_MESH_SETUP_* attribute
 * number.  It is the policy handed to nla_parse_nested_deprecated() by
 * nl80211_parse_mesh_setup().
 *
 * The vendor sync/path-selection/metric switches and the auth protocol are
 * u8 values, the USERSPACE_* entries are flags, and the IE blob is binary
 * data of at most IEEE80211_MAX_DATA_LEN bytes checked by validate_ie_attr.
 */
static const struct nla_policy
        nl80211_mesh_setup_params_policy[NL80211_MESH_SETUP_ATTR_MAX+1] = {
        [NL80211_MESH_SETUP_ENABLE_VENDOR_SYNC] = { .type = NLA_U8 },
        [NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL] = { .type = NLA_U8 },
        [NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC] = { .type = NLA_U8 },
        [NL80211_MESH_SETUP_USERSPACE_AUTH] = { .type = NLA_FLAG },
        [NL80211_MESH_SETUP_AUTH_PROTOCOL] = { .type = NLA_U8 },
        [NL80211_MESH_SETUP_USERSPACE_MPM] = { .type = NLA_FLAG },
        [NL80211_MESH_SETUP_IE] =
                NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
                                       IEEE80211_MAX_DATA_LEN),
        [NL80211_MESH_SETUP_USERSPACE_AMPE] = { .type = NLA_FLAG },
};

/*
 * Parse the nested NL80211_ATTR_MESH_CONFIG attribute into @cfg.
 *
 * @info:     request whose attrs carry NL80211_ATTR_MESH_CONFIG
 * @cfg:      mesh configuration to update; only members whose attribute is
 *            present are written
 * @mask_out: optional; receives a bitmap of the members that were set.
 *            Attribute number N is recorded as bit (N - 1).
 *
 * Most range validation is done by nl80211_meshconf_params_policy.  Three
 * u32 timeouts that the policy only type-checks (path refresh time, HWMP
 * active path timeout, HWMP path-to-root timeout) are additionally limited
 * to 1..65535 here, and the HT operation mode is checked for reserved bits.
 *
 * Returns 0 on success, or -EINVAL if the nested attribute is missing,
 * fails policy validation, or carries an out-of-range value.
 */
static int nl80211_parse_mesh_config(struct genl_info *info,
                                     struct mesh_config *cfg,
                                     u32 *mask_out)
{
        struct nlattr *tb[NL80211_MESHCONF_ATTR_MAX + 1];
        u32 mask = 0;
        u16 ht_opmode;

#define FILL_IN_MESH_PARAM_IF_SET(tb, cfg, param, mask, attr, fn)        \
do {                                                                        \
        if (tb[attr]) {                                                        \
                cfg->param = fn(tb[attr]);                                \
                mask |= BIT((attr) - 1);                                \
        }                                                                \
} while (0)

        if (!info->attrs[NL80211_ATTR_MESH_CONFIG])
                return -EINVAL;
        if (nla_parse_nested_deprecated(tb, NL80211_MESHCONF_ATTR_MAX, info->attrs[NL80211_ATTR_MESH_CONFIG], nl80211_meshconf_params_policy, info->extack))
                return -EINVAL;

        /* This makes sure that there aren't more than 32 mesh config
         * parameters (otherwise our bitfield scheme would not work.) */
        BUILD_BUG_ON(NL80211_MESHCONF_ATTR_MAX > 32);

        /* Fill in the params struct */
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshRetryTimeout, mask,
                                  NL80211_MESHCONF_RETRY_TIMEOUT, nla_get_u16);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConfirmTimeout, mask,
                                  NL80211_MESHCONF_CONFIRM_TIMEOUT,
                                  nla_get_u16);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHoldingTimeout, mask,
                                  NL80211_MESHCONF_HOLDING_TIMEOUT,
                                  nla_get_u16);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxPeerLinks, mask,
                                  NL80211_MESHCONF_MAX_PEER_LINKS,
                                  nla_get_u16);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxRetries, mask,
                                  NL80211_MESHCONF_MAX_RETRIES, nla_get_u8);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshTTL, mask,
                                  NL80211_MESHCONF_TTL, nla_get_u8);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, element_ttl, mask,
                                  NL80211_MESHCONF_ELEMENT_TTL, nla_get_u8);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, auto_open_plinks, mask,
                                  NL80211_MESHCONF_AUTO_OPEN_PLINKS,
                                  nla_get_u8);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNbrOffsetMaxNeighbor,
                                  mask,
                                  NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR,
                                  nla_get_u32);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPmaxPREQretries, mask,
                                  NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES,
                                  nla_get_u8);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, path_refresh_time, mask,
                                  NL80211_MESHCONF_PATH_REFRESH_TIME,
                                  nla_get_u32);
        /*
         * The fill macro records attribute N as bit (N - 1), so the
         * "was it just set?" test must look at that same bit.
         */
        if (mask & BIT(NL80211_MESHCONF_PATH_REFRESH_TIME - 1) &&
            (cfg->path_refresh_time < 1 || cfg->path_refresh_time > 65535))
                return -EINVAL;
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, min_discovery_timeout, mask,
                                  NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT,
                                  nla_get_u16);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathTimeout,
                                  mask,
                                  NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT,
                                  nla_get_u32);
        if (mask & BIT(NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT - 1) &&
            (cfg->dot11MeshHWMPactivePathTimeout < 1 ||
             cfg->dot11MeshHWMPactivePathTimeout > 65535))
                return -EINVAL;
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPpreqMinInterval, mask,
                                  NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL,
                                  nla_get_u16);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPperrMinInterval, mask,
                                  NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL,
                                  nla_get_u16);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
                                  dot11MeshHWMPnetDiameterTraversalTime, mask,
                                  NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
                                  nla_get_u16);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRootMode, mask,
                                  NL80211_MESHCONF_HWMP_ROOTMODE, nla_get_u8);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRannInterval, mask,
                                  NL80211_MESHCONF_HWMP_RANN_INTERVAL,
                                  nla_get_u16);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshGateAnnouncementProtocol,
                                  mask, NL80211_MESHCONF_GATE_ANNOUNCEMENTS,
                                  nla_get_u8);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshForwarding, mask,
                                  NL80211_MESHCONF_FORWARDING, nla_get_u8);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, rssi_threshold, mask,
                                  NL80211_MESHCONF_RSSI_THRESHOLD,
                                  nla_get_s32);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConnectedToMeshGate, mask,
                                  NL80211_MESHCONF_CONNECTED_TO_GATE,
                                  nla_get_u8);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConnectedToAuthServer, mask,
                                  NL80211_MESHCONF_CONNECTED_TO_AS,
                                  nla_get_u8);
        /*
         * Check HT operation mode based on
         * IEEE 802.11-2016 9.4.2.57 HT Operation element.
         */
        if (tb[NL80211_MESHCONF_HT_OPMODE]) {
                ht_opmode = nla_get_u16(tb[NL80211_MESHCONF_HT_OPMODE]);

                if (ht_opmode & ~(IEEE80211_HT_OP_MODE_PROTECTION |
                                  IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT |
                                  IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT))
                        return -EINVAL;

                /* NON_HT_STA bit is reserved, but some programs set it */
                ht_opmode &= ~IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT;

                cfg->ht_opmode = ht_opmode;
                mask |= BIT(NL80211_MESHCONF_HT_OPMODE - 1);
        }
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
                                  dot11MeshHWMPactivePathToRootTimeout, mask,
                                  NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT,
                                  nla_get_u32);
        if (mask & BIT(NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT - 1) &&
            (cfg->dot11MeshHWMPactivePathToRootTimeout < 1 ||
             cfg->dot11MeshHWMPactivePathToRootTimeout > 65535))
                return -EINVAL;
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMProotInterval, mask,
                                  NL80211_MESHCONF_HWMP_ROOT_INTERVAL,
                                  nla_get_u16);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPconfirmationInterval,
                                  mask,
                                  NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL,
                                  nla_get_u16);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, power_mode, mask,
                                  NL80211_MESHCONF_POWER_MODE, nla_get_u32);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration, mask,
                                  NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, mask,
                                  NL80211_MESHCONF_PLINK_TIMEOUT, nla_get_u32);
        FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNolearn, mask,
                                  NL80211_MESHCONF_NOLEARN, nla_get_u8);
        if (mask_out)
                *mask_out = mask;

        return 0;

#undef FILL_IN_MESH_PARAM_IF_SET
}

/*
 * Fill @setup from the nested NL80211_ATTR_MESH_SETUP attribute of @info.
 *
 * The vendor sync/path-selection/metric selectors, the IE blob and the auth
 * protocol are only written when their attribute is present. The user_mpm,
 * is_authenticated and is_secure flags are always assigned, from the presence
 * of the corresponding flag attributes.
 *
 * Returns 0 on success and -EINVAL when the nested attribute is missing or
 * does not parse, when userspace MPM is requested on a wiphy that does not
 * advertise NL80211_FEATURE_USERSPACE_MPM, or when an auth protocol is given
 * without userspace MPM being in effect.
 */
static int nl80211_parse_mesh_setup(struct genl_info *info,
                                     struct mesh_setup *setup)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct nlattr *tb[NL80211_MESH_SETUP_ATTR_MAX + 1];
        struct nlattr *nested = info->attrs[NL80211_ATTR_MESH_SETUP];
        struct nlattr *ie;
        int err;

        if (!nested)
                return -EINVAL;

        err = nla_parse_nested_deprecated(tb, NL80211_MESH_SETUP_ATTR_MAX,
                                          nested,
                                          nl80211_mesh_setup_params_policy,
                                          info->extack);
        /* any parse failure is reported to the caller as -EINVAL */
        if (err)
                return -EINVAL;

        /* each selector is a u8 boolean: non-zero picks the vendor variant */
        if (tb[NL80211_MESH_SETUP_ENABLE_VENDOR_SYNC]) {
                if (nla_get_u8(tb[NL80211_MESH_SETUP_ENABLE_VENDOR_SYNC]))
                        setup->sync_method = IEEE80211_SYNC_METHOD_VENDOR;
                else
                        setup->sync_method =
                                IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET;
        }

        if (tb[NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL]) {
                if (nla_get_u8(tb[NL80211_MESH_SETUP_ENABLE_VENDOR_PATH_SEL]))
                        setup->path_sel_proto = IEEE80211_PATH_PROTOCOL_VENDOR;
                else
                        setup->path_sel_proto = IEEE80211_PATH_PROTOCOL_HWMP;
        }

        if (tb[NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC]) {
                if (nla_get_u8(tb[NL80211_MESH_SETUP_ENABLE_VENDOR_METRIC]))
                        setup->path_metric = IEEE80211_PATH_METRIC_VENDOR;
                else
                        setup->path_metric = IEEE80211_PATH_METRIC_AIRTIME;
        }

        /* the IE is not copied: setup->ie points into the attribute payload */
        ie = tb[NL80211_MESH_SETUP_IE];
        if (ie) {
                setup->ie = nla_data(ie);
                setup->ie_len = nla_len(ie);
        }

        if (tb[NL80211_MESH_SETUP_USERSPACE_MPM] &&
            !(rdev->wiphy.features & NL80211_FEATURE_USERSPACE_MPM))
                return -EINVAL;

        setup->user_mpm = nla_get_flag(tb[NL80211_MESH_SETUP_USERSPACE_MPM]);
        setup->is_authenticated =
                nla_get_flag(tb[NL80211_MESH_SETUP_USERSPACE_AUTH]);
        setup->is_secure = nla_get_flag(tb[NL80211_MESH_SETUP_USERSPACE_AMPE]);

        /* requesting userspace AMPE forces userspace MPM on */
        if (setup->is_secure)
                setup->user_mpm = true;

        if (tb[NL80211_MESH_SETUP_AUTH_PROTOCOL]) {
                /* an auth protocol is only accepted together with user MPM */
                if (!setup->user_mpm)
                        return -EINVAL;
                setup->auth_id =
                        nla_get_u8(tb[NL80211_MESH_SETUP_AUTH_PROTOCOL]);
        }

        return 0;
}

/*
 * Netlink handler that applies a partial mesh configuration update.
 *
 * Returns -EOPNOTSUPP when the interface is not a mesh point or the driver
 * has no update_mesh_config op, the parser's error when the attributes are
 * invalid, -ENOLINK when no mesh ID is set on the interface (id_len == 0),
 * and otherwise the result of rdev_update_mesh_config().
 */
static int nl80211_update_mesh_config(struct sk_buff *skb,
                                      struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct mesh_config cfg = {};
        u32 mask;
        int err;

        if (wdev->iftype != NL80211_IFTYPE_MESH_POINT ||
            !rdev->ops->update_mesh_config)
                return -EOPNOTSUPP;

        /* mask receives one bit per parameter present in the request */
        err = nl80211_parse_mesh_config(info, &cfg, &mask);
        if (err)
                return err;

        if (!wdev->u.mesh.id_len)
                return -ENOLINK;

        return rdev_update_mesh_config(rdev, dev, mask, &cfg);
}

static int nl80211_put_regdom(const struct ieee80211_regdomain *regdom,
                              struct sk_buff *msg)
{
        struct nlattr *nl_reg_rules;
        unsigned int i;

        if (nla_put_string(msg, NL80211_ATTR_REG_ALPHA2, regdom->alpha2) ||
            (regdom->dfs_region &&
             nla_put_u8(msg, NL80211_ATTR_DFS_REGION, regdom->dfs_region)))
                goto nla_put_failure;

        nl_reg_rules = nla_nest_start_noflag(msg, NL80211_ATTR_REG_RULES);
        if (!nl_reg_rules)
                goto nla_put_failure;

        for (i = 0; i < regdom->n_reg_rules; i++) {
                struct nlattr *nl_reg_rule;
                const struct ieee80211_reg_rule *reg_rule;
                const struct ieee80211_freq_range *freq_range;
                const struct ieee80211_power_rule *power_rule;
                unsigned int max_bandwidth_khz;

                reg_rule = &regdom->reg_rules[i];
                freq_range = &reg_rule->freq_range;
                power_rule = &reg_rule->power_rule;

                nl_reg_rule = nla_nest_start_noflag(msg, i);
                if (!nl_reg_rule)
                        goto nla_put_failure;

                max_bandwidth_khz = freq_range->max_bandwidth_khz;
                if (!max_bandwidth_khz)
                        max_bandwidth_khz = reg_get_max_bandwidth(regdom,
                                                                  reg_rule);

                if (nla_put_u32(msg, NL80211_ATTR_REG_RULE_FLAGS,
                                reg_rule->flags) ||
                    nla_put_u32(msg, NL80211_ATTR_FREQ_RANGE_START,
                                freq_range->start_freq_khz) ||
                    nla_put_u32(msg, NL80211_ATTR_FREQ_RANGE_END,
                                freq_range->end_freq_khz) ||
                    nla_put_u32(msg, NL80211_ATTR_FREQ_RANGE_MAX_BW,
                                max_bandwidth_khz) ||
                    nla_put_u32(msg, NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN,
                                power_rule->max_antenna_gain) ||
                    nla_put_u32(msg, NL80211_ATTR_POWER_RULE_MAX_EIRP,
                                power_rule->max_eirp) ||
                    nla_put_u32(msg, NL80211_ATTR_DFS_CAC_TIME,
                                reg_rule->dfs_cac_ms))
                        goto nla_put_failure;

                if ((reg_rule->flags & NL80211_RRF_PSD) &&
                    nla_put_s8(msg, NL80211_ATTR_POWER_RULE_PSD,
                               reg_rule->psd))
                        goto nla_put_failure;

                nla_nest_end(msg, nl_reg_rule);
        }

        nla_nest_end(msg, nl_reg_rules);
        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

/*
 * Netlink "do" handler for NL80211_CMD_GET_REG: reply with one regulatory
 * domain.
 *
 * When NL80211_ATTR_WIPHY is given, the wiphy's own regdom is reported if it
 * has one (together with the wiphy index); otherwise, and when no wiphy is
 * given, the global cfg80211_regdomain is reported.
 *
 * The RTNL is held from just after the header is written until the message
 * is finished, and the RCU read lock is held while the regdom pointer is in
 * use. The error labels unwind those in reverse order of acquisition.
 *
 * Returns the result of genlmsg_reply() on success, -ENOBUFS when the reply
 * skb cannot be allocated, the lookup error when the wiphy cannot be found,
 * -EINVAL when a self-managed wiphy has no private regdom, and -EMSGSIZE
 * (the initial value of err) for any failure to add data to the message.
 */
static int nl80211_get_reg_do(struct sk_buff *skb, struct genl_info *info)
{
        const struct ieee80211_regdomain *regdom = NULL;
        struct cfg80211_registered_device *rdev;
        struct wiphy *wiphy = NULL;
        struct sk_buff *msg;
        int err = -EMSGSIZE;
        void *hdr;

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg)
                return -ENOBUFS;

        hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
                             NL80211_CMD_GET_REG);
        if (!hdr)
                goto put_failure;

        rtnl_lock();

        if (info->attrs[NL80211_ATTR_WIPHY]) {
                bool self_managed;

                rdev = cfg80211_get_dev_from_info(genl_info_net(info), info);
                if (IS_ERR(rdev)) {
                        err = PTR_ERR(rdev);
                        /* RCU is not held yet, so skip the _rcu label */
                        goto nla_put_failure;
                }

                wiphy = &rdev->wiphy;
                self_managed = wiphy->regulatory_flags &
                               REGULATORY_WIPHY_SELF_MANAGED;

                rcu_read_lock();

                regdom = get_wiphy_regdom(wiphy);

                /* a self-managed-reg device must have a private regdom */
                if (WARN_ON(!regdom && self_managed)) {
                        err = -EINVAL;
                        goto nla_put_failure_rcu;
                }

                /* the wiphy index is only reported with a private regdom */
                if (regdom &&
                    nla_put_u32(msg, NL80211_ATTR_WIPHY, get_wiphy_idx(wiphy)))
                        goto nla_put_failure_rcu;
        } else {
                /* both branches leave here with the RCU read lock held */
                rcu_read_lock();
        }

        /* the cell-base hint is only reported for requests without a wiphy */
        if (!wiphy && reg_last_request_cell_base() &&
            nla_put_u32(msg, NL80211_ATTR_USER_REG_HINT_TYPE,
                        NL80211_USER_REG_HINT_CELL_BASE))
                goto nla_put_failure_rcu;

        /* no private regdom (or no wiphy given): use the global one */
        if (!regdom)
                regdom = rcu_dereference(cfg80211_regdomain);

        if (nl80211_put_regdom(regdom, msg))
                goto nla_put_failure_rcu;

        rcu_read_unlock();

        genlmsg_end(msg, hdr);
        rtnl_unlock();
        return genlmsg_reply(msg, info);

nla_put_failure_rcu:
        rcu_read_unlock();
nla_put_failure:
        rtnl_unlock();
put_failure:
        nlmsg_free(msg);
        return err;
}

/*
 * Append one NL80211_CMD_GET_REG message describing @regdom to the dump
 * skb @msg.
 *
 * With @wiphy == NULL the regdom is treated as the global one and the
 * cell-base user hint is added when reg_last_request_cell_base() says so.
 * With a @wiphy, its index is added, plus the self-managed flag when the
 * wiphy has REGULATORY_WIPHY_SELF_MANAGED set.
 *
 * Returns 0 on success, -1 when the message header cannot be added, and
 * -EMSGSIZE (after cancelling the partial message) when an attribute does
 * not fit.
 */
static int nl80211_send_regdom(struct sk_buff *msg, struct netlink_callback *cb,
                               u32 seq, int flags, struct wiphy *wiphy,
                               const struct ieee80211_regdomain *regdom)
{
        void *hdr;

        hdr = nl80211hdr_put(msg, NETLINK_CB(cb->skb).portid, seq, flags,
                             NL80211_CMD_GET_REG);
        if (!hdr)
                return -1;

        genl_dump_check_consistent(cb, hdr);

        if (nl80211_put_regdom(regdom, msg))
                goto cancel;

        if (wiphy) {
                if (nla_put_u32(msg, NL80211_ATTR_WIPHY, get_wiphy_idx(wiphy)))
                        goto cancel;

                if ((wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) &&
                    nla_put_flag(msg, NL80211_ATTR_WIPHY_SELF_MANAGED_REG))
                        goto cancel;
        } else if (reg_last_request_cell_base() &&
                   nla_put_u32(msg, NL80211_ATTR_USER_REG_HINT_TYPE,
                               NL80211_USER_REG_HINT_CELL_BASE)) {
                goto cancel;
        }

        genlmsg_end(msg, hdr);
        return 0;

cancel:
        genlmsg_cancel(msg, hdr);
        return -EMSGSIZE;
}

/*
 * Netlink "dump" handler for NL80211_CMD_GET_REG: emit the global regdom
 * followed by one message per wiphy that has its own regdom.
 *
 * cb->args[2] carries the resume position between invocations: 0 means
 * nothing has been sent yet, and any other value is the reg_idx reached by
 * the previous call. The whole walk runs under the RCU read lock.
 *
 * Returns skb->len once the walk has ended or stopped early on a per-wiphy
 * failure, or the negative error from nl80211_send_regdom() when the global
 * regdom could not be added.
 */
static int nl80211_get_reg_dump(struct sk_buff *skb,
                                struct netlink_callback *cb)
{
        const struct ieee80211_regdomain *regdom = NULL;
        struct cfg80211_registered_device *rdev;
        int err, reg_idx, start = cb->args[2];

        rcu_read_lock();

        /* the global regdom is only sent on the first invocation */
        if (cfg80211_regdomain && start == 0) {
                err = nl80211_send_regdom(skb, cb, cb->nlh->nlmsg_seq,
                                          NLM_F_MULTI, NULL,
                                          rcu_dereference(cfg80211_regdomain));
                if (err < 0)
                        goto out_err;
        }

        /* the global regdom is idx 0 */
        reg_idx = 1;
        list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) {
                regdom = get_wiphy_regdom(&rdev->wiphy);
                /* wiphys without a private regdom do not consume an index */
                if (!regdom)
                        continue;

                /* skip entries already delivered by an earlier invocation */
                if (++reg_idx <= start)
                        continue;

                err = nl80211_send_regdom(skb, cb, cb->nlh->nlmsg_seq,
                                          NLM_F_MULTI, &rdev->wiphy, regdom);
                if (err < 0) {
                        /* undo the increment so this entry is retried */
                        reg_idx--;
                        break;
                }
        }

        cb->args[2] = reg_idx;
        err = skb->len;
out_err:
        rcu_read_unlock();
        return err;
}

#ifdef CONFIG_CFG80211_CRDA_SUPPORT
/*
 * Attribute policy for a single nested regulatory rule, as parsed by
 * nl80211_set_reg(). Every attribute listed here is a u32.
 */
static const struct nla_policy reg_rule_policy[NL80211_REG_RULE_ATTR_MAX + 1] = {
        [NL80211_ATTR_REG_RULE_FLAGS]                = { .type = NLA_U32 },
        [NL80211_ATTR_FREQ_RANGE_START]                = { .type = NLA_U32 },
        [NL80211_ATTR_FREQ_RANGE_END]                = { .type = NLA_U32 },
        [NL80211_ATTR_FREQ_RANGE_MAX_BW]        = { .type = NLA_U32 },
        [NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]        = { .type = NLA_U32 },
        [NL80211_ATTR_POWER_RULE_MAX_EIRP]        = { .type = NLA_U32 },
        [NL80211_ATTR_DFS_CAC_TIME]                = { .type = NLA_U32 },
};

/*
 * Copy one parsed regulatory rule from the attribute table @tb into
 * @reg_rule.
 *
 * The rule flags, frequency range start/end, maximum bandwidth and maximum
 * EIRP are mandatory. The antenna gain and DFS CAC time are optional and the
 * corresponding fields are left untouched when the attribute is absent.
 *
 * Returns 0 on success or -EINVAL when a mandatory attribute is missing, in
 * which case @reg_rule is not modified.
 */
static int parse_reg_rule(struct nlattr *tb[],
        struct ieee80211_reg_rule *reg_rule)
{
        struct ieee80211_freq_range *range = &reg_rule->freq_range;
        struct ieee80211_power_rule *power = &reg_rule->power_rule;

        if (!tb[NL80211_ATTR_REG_RULE_FLAGS] ||
            !tb[NL80211_ATTR_FREQ_RANGE_START] ||
            !tb[NL80211_ATTR_FREQ_RANGE_END] ||
            !tb[NL80211_ATTR_FREQ_RANGE_MAX_BW] ||
            !tb[NL80211_ATTR_POWER_RULE_MAX_EIRP])
                return -EINVAL;

        reg_rule->flags = nla_get_u32(tb[NL80211_ATTR_REG_RULE_FLAGS]);

        range->start_freq_khz = nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_START]);
        range->end_freq_khz = nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_END]);
        range->max_bandwidth_khz =
                nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_MAX_BW]);

        power->max_eirp = nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_EIRP]);

        if (tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]) {
                power->max_antenna_gain =
                        nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]);
        }

        if (tb[NL80211_ATTR_DFS_CAC_TIME]) {
                reg_rule->dfs_cac_ms =
                        nla_get_u32(tb[NL80211_ATTR_DFS_CAC_TIME]);
        }

        return 0;
}

/*
 * Netlink handler that installs a complete regulatory domain supplied by
 * userspace (source REGD_SOURCE_CRDA).
 *
 * The rules are walked twice: once, before taking the RTNL, to count them
 * and reject more than NL80211_MAX_SUPP_REG_RULES, and once under the RTNL
 * to parse them into a freshly allocated regdomain.
 *
 * Returns -EINVAL when alpha2 or the rule list is missing, there are too
 * many rules, reg_is_valid_request() rejects alpha2, or a rule is missing a
 * mandatory attribute; -ENOMEM when the regdomain cannot be allocated; the
 * parser's error when a rule nest does not parse; otherwise the result of
 * set_regdom().
 */
static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr *tb[NL80211_REG_RULE_ATTR_MAX + 1];
        struct nlattr *nl_reg_rule;
        char *alpha2;
        int rem_reg_rules, r;
        u32 num_rules = 0, rule_idx = 0;
        enum nl80211_dfs_regions dfs_region = NL80211_DFS_UNSET;
        struct ieee80211_regdomain *rd;

        if (!info->attrs[NL80211_ATTR_REG_ALPHA2])
                return -EINVAL;

        if (!info->attrs[NL80211_ATTR_REG_RULES])
                return -EINVAL;

        alpha2 = nla_data(info->attrs[NL80211_ATTR_REG_ALPHA2]);

        if (info->attrs[NL80211_ATTR_DFS_REGION])
                dfs_region = nla_get_u8(info->attrs[NL80211_ATTR_DFS_REGION]);

        /* first pass: count the rules so the allocation can be sized */
        nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES],
                            rem_reg_rules) {
                num_rules++;
                if (num_rules > NL80211_MAX_SUPP_REG_RULES)
                        return -EINVAL;
        }

        rtnl_lock();
        if (!reg_is_valid_request(alpha2)) {
                r = -EINVAL;
                goto out;
        }

        rd = kzalloc(struct_size(rd, reg_rules, num_rules), GFP_KERNEL);
        if (!rd) {
                r = -ENOMEM;
                goto out;
        }

        rd->n_reg_rules = num_rules;
        rd->alpha2[0] = alpha2[0];
        rd->alpha2[1] = alpha2[1];

        /*
         * Disable DFS master mode if the DFS region was
         * not supported or known on this kernel.
         */
        if (reg_supported_dfs_region(dfs_region))
                rd->dfs_region = dfs_region;

        /* second pass: parse each rule into its slot in rd->reg_rules */
        nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES],
                            rem_reg_rules) {
                r = nla_parse_nested_deprecated(tb, NL80211_REG_RULE_ATTR_MAX,
                                                nl_reg_rule, reg_rule_policy,
                                                info->extack);
                if (r)
                        goto bad_reg;
                r = parse_reg_rule(tb, &rd->reg_rules[rule_idx]);
                if (r)
                        goto bad_reg;

                rule_idx++;

                if (rule_idx > NL80211_MAX_SUPP_REG_RULES) {
                        r = -EINVAL;
                        goto bad_reg;
                }
        }

        r = set_regdom(rd, REGD_SOURCE_CRDA);
        /* set_regdom takes ownership of rd */
        rd = NULL;
        /* falling through is safe: kfree(NULL) below does nothing */
 bad_reg:
        kfree(rd);
 out:
        rtnl_unlock();
        return r;
}
#endif /* CONFIG_CFG80211_CRDA_SUPPORT */

/*
 * Validate a nested list of scan frequencies.
 *
 * Returns the number of entries in @freqs, or 0 when the list is unusable:
 * it is empty, an entry's payload is not exactly sizeof(u32) bytes, or the
 * same value appears more than once.
 */
static int validate_scan_freqs(struct nlattr *freqs)
{
        struct nlattr *cur, *other;
        int rem_cur, rem_other;
        int count = 0;

        /* check every payload size first so nla_get_u32() below is safe */
        nla_for_each_nested(cur, freqs, rem_cur) {
                if (nla_len(cur) != sizeof(u32))
                        return 0;
        }

        nla_for_each_nested(cur, freqs, rem_cur) {
                count++;

                /*
                 * Some hardware has a limited channel list for
                 * scanning, and it is pretty much nonsensical
                 * to scan for a channel twice, so disallow that
                 * and don't require drivers to check that the
                 * channel list they get isn't longer than what
                 * they can scan, as long as they can scan all
                 * the channels they registered at once.
                 */
                nla_for_each_nested(other, freqs, rem_other) {
                        if (other == cur)
                                continue;
                        if (nla_get_u32(cur) == nla_get_u32(other))
                                return 0;
                }
        }

        return count;
}

static bool is_band_valid(struct wiphy *wiphy, enum nl80211_band b)
{
        return b < NUM_NL80211_BANDS && wiphy->bands[b];
}

/*
 * Parse a BSS selection request from @nla into @bss_select.
 *
 * Exactly one behaviour attribute (RSSI, band preference or RSSI adjust) is
 * accepted; its parameters are copied and any band it names must be valid
 * for @wiphy. The chosen behaviour must also be advertised in
 * wiphy->bss_select_support.
 *
 * Returns 0 on success, the parser's error when the nest does not parse, and
 * -EINVAL for every other rejection. @bss_select may already have been
 * partially written when an error is returned.
 */
static int parse_bss_select(struct nlattr *nla, struct wiphy *wiphy,
                            struct cfg80211_bss_selection *bss_select)
{
        struct nlattr *attr[NL80211_BSS_SELECT_ATTR_MAX + 1];
        struct nlattr *nest;
        int n_present = 0;
        int idx;
        int err;

        /*
         * only process one nested attribute
         *
         * NOTE(review): the bound handed to nla_ok() is taken from nest
         * itself rather than from the enclosing @nla - confirm that this is
         * the intended check.
         */
        nest = nla_data(nla);
        if (!nla_ok(nest, nla_len(nest)))
                return -EINVAL;

        err = nla_parse_nested_deprecated(attr, NL80211_BSS_SELECT_ATTR_MAX,
                                          nest, nl80211_bss_select_policy,
                                          NULL);
        if (err)
                return err;

        /* only one attribute may be given */
        for (idx = 0; idx <= NL80211_BSS_SELECT_ATTR_MAX; idx++) {
                if (!attr[idx])
                        continue;
                if (++n_present > 1)
                        return -EINVAL;
        }

        bss_select->behaviour = __NL80211_BSS_SELECT_ATTR_INVALID;

        if (attr[NL80211_BSS_SELECT_ATTR_RSSI])
                bss_select->behaviour = NL80211_BSS_SELECT_ATTR_RSSI;

        if (attr[NL80211_BSS_SELECT_ATTR_BAND_PREF]) {
                struct nlattr *pref = attr[NL80211_BSS_SELECT_ATTR_BAND_PREF];

                bss_select->behaviour = NL80211_BSS_SELECT_ATTR_BAND_PREF;
                bss_select->param.band_pref = nla_get_u32(pref);
                if (!is_band_valid(wiphy, bss_select->param.band_pref))
                        return -EINVAL;
        }

        if (attr[NL80211_BSS_SELECT_ATTR_RSSI_ADJUST]) {
                struct nl80211_bss_select_rssi_adjust *adj;

                adj = nla_data(attr[NL80211_BSS_SELECT_ATTR_RSSI_ADJUST]);
                bss_select->behaviour = NL80211_BSS_SELECT_ATTR_RSSI_ADJUST;
                bss_select->param.adjust.band = adj->band;
                bss_select->param.adjust.delta = adj->delta;
                if (!is_band_valid(wiphy, bss_select->param.adjust.band))
                        return -EINVAL;
        }

        /* user-space did not provide behaviour attribute */
        if (bss_select->behaviour == __NL80211_BSS_SELECT_ATTR_INVALID)
                return -EINVAL;

        /* the driver must advertise support for the chosen behaviour */
        if (!(wiphy->bss_select_support & BIT(bss_select->behaviour)))
                return -EINVAL;

        return 0;
}

/*
 * Extract the random-MAC template (address and mask) from @attrs.
 *
 * With neither NL80211_ATTR_MAC nor NL80211_ATTR_MAC_MASK present, a default
 * is written: address 02:00:00:00:00:00 with mask 03:00:00:00:00:00. With
 * both present they are copied out, and the address is reduced to the bits
 * covered by the mask.
 *
 * Returns 0 on success, or -EINVAL when only one of the two attributes is
 * given, when the mask does not have the multicast bit set, or when the
 * address has it set. On the latter two failures the output buffers have
 * already been overwritten with the supplied values.
 */
int nl80211_parse_random_mac(struct nlattr **attrs,
                             u8 *mac_addr, u8 *mac_addr_mask)
{
        struct nlattr *addr_attr = attrs[NL80211_ATTR_MAC];
        struct nlattr *mask_attr = attrs[NL80211_ATTR_MAC_MASK];
        int idx;

        if (!addr_attr && !mask_attr) {
                eth_zero_addr(mac_addr);
                eth_zero_addr(mac_addr_mask);
                mac_addr[0] = 0x2;
                mac_addr_mask[0] = 0x3;

                return 0;
        }

        /* need both or none */
        if (!addr_attr || !mask_attr)
                return -EINVAL;

        memcpy(mac_addr, nla_data(addr_attr), ETH_ALEN);
        memcpy(mac_addr_mask, nla_data(mask_attr), ETH_ALEN);

        /* don't allow or configure an mcast address */
        if (!is_multicast_ether_addr(mac_addr_mask))
                return -EINVAL;
        if (is_multicast_ether_addr(mac_addr))
                return -EINVAL;

        /*
         * allow users to pass a MAC address that has bits set outside
         * of the mask, but don't bother drivers with having to deal
         * with such bits
         */
        for (idx = 0; idx < ETH_ALEN; idx++)
                mac_addr[idx] &= mac_addr_mask[idx];

        return 0;
}

static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev,
                                              struct ieee80211_channel *chan)
{
        unsigned int link_id;
        bool all_ok = true;

        lockdep_assert_wiphy(wdev->wiphy);

        if (!cfg80211_beaconing_iface_active(wdev))
                return true;

        /*
         * FIXME: check if we have a free HW resource/link for chan
         *
         * This, as well as the FIXME below, requires knowing the link
         * capabilities of the hardware.
         */

        /* we cannot leave radar channels */
        for_each_valid_link(wdev, link_id) {
                struct cfg80211_chan_def *chandef;

                chandef = wdev_chandef(wdev, link_id);
                if (!chandef || !chandef->chan)
                        continue;

                /*
                 * FIXME: don't require all_ok, but rather check only the
                 *          correct HW resource/link onto which 'chan' falls,
                 *          as only that link leaves the channel for doing
                 *          the off-channel operation.
                 */

                if (chandef->chan->flags & IEEE80211_CHAN_RADAR)
                        all_ok = false;
        }

        if (all_ok)
                return true;

        return regulatory_pre_cac_allowed(wdev->wiphy);
}

/*
 * Check one scan flag against the extended feature it depends on.
 *
 * Returns true when @flag is not set in @flags, or when it is set and
 * @wiphy has extended feature @feat; false otherwise.
 */
static bool nl80211_check_scan_feat(struct wiphy *wiphy, u32 flags, u32 flag,
                                    enum nl80211_ext_feature_index feat)
{
        bool requested = flags & flag;

        return !requested || wiphy_ext_feature_isset(wiphy, feat);
}

/*
 * Read NL80211_ATTR_SCAN_FLAGS into the scan request and verify that the
 * device supports every requested flag.
 *
 * @request is a struct cfg80211_sched_scan_request when @is_sched_scan is
 * true and a struct cfg80211_scan_request otherwise. Its flags field is
 * written before any validation, so it holds the raw user value even when
 * an error is returned.
 *
 * Returns 0 when the attribute is absent or all flags are supported,
 * -EOPNOTSUPP when a flag lacks the matching feature or MAC randomisation
 * is requested while unsupported or while @wdev is connected, and otherwise
 * the result of nl80211_parse_random_mac().
 */
static int
nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev,
                         void *request, struct nlattr **attrs,
                         bool is_sched_scan)
{
        enum nl80211_feature_flags randomness_flag;
        u8 *mac_addr, *mac_addr_mask;
        u32 requested;
        bool supported;
        u32 *flags;

        if (!attrs[NL80211_ATTR_SCAN_FLAGS])
                return 0;

        if (!is_sched_scan) {
                struct cfg80211_scan_request *req = request;

                randomness_flag = NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR;
                flags = &req->flags;
                mac_addr = req->mac_addr;
                mac_addr_mask = req->mac_addr_mask;
        } else {
                struct cfg80211_sched_scan_request *req = request;

                /* without a wdev the net-detect randomisation feature applies */
                if (wdev)
                        randomness_flag =
                                NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR;
                else
                        randomness_flag = NL80211_FEATURE_ND_RANDOM_MAC_ADDR;
                flags = &req->flags;
                mac_addr = req->mac_addr;
                mac_addr_mask = req->mac_addr_mask;
        }

        requested = nla_get_u32(attrs[NL80211_ATTR_SCAN_FLAGS]);
        *flags = requested;

        /* low priority is gated by a legacy feature bit, not an ext feature */
        if ((requested & NL80211_SCAN_FLAG_LOW_PRIORITY) &&
            !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN))
                return -EOPNOTSUPP;

        supported =
                nl80211_check_scan_feat(wiphy, requested,
                                        NL80211_SCAN_FLAG_LOW_SPAN,
                                        NL80211_EXT_FEATURE_LOW_SPAN_SCAN) &&
                nl80211_check_scan_feat(wiphy, requested,
                                        NL80211_SCAN_FLAG_LOW_POWER,
                                        NL80211_EXT_FEATURE_LOW_POWER_SCAN) &&
                nl80211_check_scan_feat(wiphy, requested,
                                        NL80211_SCAN_FLAG_HIGH_ACCURACY,
                                        NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN) &&
                nl80211_check_scan_feat(wiphy, requested,
                                        NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME,
                                        NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME) &&
                nl80211_check_scan_feat(wiphy, requested,
                                        NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP,
                                        NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP) &&
                nl80211_check_scan_feat(wiphy, requested,
                                        NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION,
                                        NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) &&
                nl80211_check_scan_feat(wiphy, requested,
                                        NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE,
                                        NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE) &&
                nl80211_check_scan_feat(wiphy, requested,
                                        NL80211_SCAN_FLAG_RANDOM_SN,
                                        NL80211_EXT_FEATURE_SCAN_RANDOM_SN) &&
                nl80211_check_scan_feat(wiphy, requested,
                                        NL80211_SCAN_FLAG_MIN_PREQ_CONTENT,
                                        NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT);
        if (!supported)
                return -EOPNOTSUPP;

        if (!(requested & NL80211_SCAN_FLAG_RANDOM_ADDR))
                return 0;

        if (!(wiphy->features & randomness_flag) ||
            (wdev && wdev->connected))
                return -EOPNOTSUPP;

        return nl80211_parse_random_mac(attrs, mac_addr, mac_addr_mask);
}

/*
 * Netlink handler that builds a scan request from the message attributes
 * and starts it through cfg80211_scan().
 *
 * The request, its channel pointer array, the SSID array and the extra IEs
 * are carved out of a single allocation; ssids_offset and ie_offset record
 * where the latter two start inside it.
 *
 * Returns 0 once the scan has been started, in which case the request stays
 * referenced from rdev->scan_req, a scan-start notification has been sent
 * and wdev->netdev has been held with dev_hold(). On failure the request is
 * freed and rdev->scan_req is cleared. Error codes produced here:
 *   -EOPNOTSUPP  NAN interface, no scan op, or kHz frequencies/scan flags
 *                that the device does not support
 *   -EBUSY       a scan is already pending (scan_req or scan_msg set), or a
 *                selected channel cannot be visited right now
 *   -EINVAL      invalid/duplicate frequencies, unknown channel, too many
 *                SSIDs, over-long SSID or IE, bad rate band, or no usable
 *                channel left
 *   -ENOMEM      allocation failure
 * Errors from ieee80211_get_ratemask(), nl80211_check_scan_flags() and
 * cfg80211_scan() are passed through.
 */
static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wireless_dev *wdev = info->user_ptr[1];
        struct cfg80211_scan_request *request;
        struct nlattr *scan_freqs = NULL;
        bool scan_freqs_khz = false;
        struct nlattr *attr;
        struct wiphy *wiphy;
        int err, tmp, n_ssids = 0, n_channels, i;
        size_t ie_len, size;
        size_t ssids_offset, ie_offset;

        wiphy = &rdev->wiphy;

        if (wdev->iftype == NL80211_IFTYPE_NAN)
                return -EOPNOTSUPP;

        if (!rdev->ops->scan)
                return -EOPNOTSUPP;

        if (rdev->scan_req || rdev->scan_msg)
                return -EBUSY;

        /* the kHz frequency list takes precedence over the MHz one */
        if (info->attrs[NL80211_ATTR_SCAN_FREQ_KHZ]) {
                if (!wiphy_ext_feature_isset(wiphy,
                                             NL80211_EXT_FEATURE_SCAN_FREQ_KHZ))
                        return -EOPNOTSUPP;
                scan_freqs = info->attrs[NL80211_ATTR_SCAN_FREQ_KHZ];
                scan_freqs_khz = true;
        } else if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES])
                scan_freqs = info->attrs[NL80211_ATTR_SCAN_FREQUENCIES];

        /*
         * n_channels is an upper bound used to size the allocation; the
         * real count is stored in the request once disabled channels have
         * been filtered out below.
         */
        if (scan_freqs) {
                n_channels = validate_scan_freqs(scan_freqs);
                if (!n_channels)
                        return -EINVAL;
        } else {
                n_channels = ieee80211_get_num_supported_channels(wiphy);
        }

        if (info->attrs[NL80211_ATTR_SCAN_SSIDS])
                nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp)
                        n_ssids++;

        if (n_ssids > wiphy->max_scan_ssids)
                return -EINVAL;

        if (info->attrs[NL80211_ATTR_IE])
                ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
        else
                ie_len = 0;

        if (ie_len > wiphy->max_scan_ie_len)
                return -EINVAL;

        /* layout: request + channels[], then ssids[], then the IE bytes */
        size = struct_size(request, channels, n_channels);
        ssids_offset = size;
        size = size_add(size, array_size(sizeof(*request->ssids), n_ssids));
        ie_offset = size;
        size = size_add(size, ie_len);
        request = kzalloc(size, GFP_KERNEL);
        if (!request)
                return -ENOMEM;
        request->n_channels = n_channels;

        if (n_ssids)
                request->ssids = (void *)request + ssids_offset;
        request->n_ssids = n_ssids;
        if (ie_len)
                request->ie = (void *)request + ie_offset;

        i = 0;
        if (scan_freqs) {
                /* user specified, bail out if channel not found */
                nla_for_each_nested(attr, scan_freqs, tmp) {
                        struct ieee80211_channel *chan;
                        int freq = nla_get_u32(attr);

                        if (!scan_freqs_khz)
                                freq = MHZ_TO_KHZ(freq);

                        chan = ieee80211_get_channel_khz(wiphy, freq);
                        if (!chan) {
                                err = -EINVAL;
                                goto out_free;
                        }

                        /* ignore disabled channels */
                        if (chan->flags & IEEE80211_CHAN_DISABLED)
                                continue;

                        request->channels[i] = chan;
                        i++;
                }
        } else {
                enum nl80211_band band;

                /* all channels */
                for (band = 0; band < NUM_NL80211_BANDS; band++) {
                        int j;

                        if (!wiphy->bands[band])
                                continue;
                        for (j = 0; j < wiphy->bands[band]->n_channels; j++) {
                                struct ieee80211_channel *chan;

                                chan = &wiphy->bands[band]->channels[j];

                                if (chan->flags & IEEE80211_CHAN_DISABLED)
                                        continue;

                                request->channels[i] = chan;
                                i++;
                        }
                }
        }

        /* every candidate channel was disabled: nothing to scan */
        if (!i) {
                err = -EINVAL;
                goto out_free;
        }

        request->n_channels = i;

        for (i = 0; i < request->n_channels; i++) {
                struct ieee80211_channel *chan = request->channels[i];

                /* if we can go off-channel to the target channel we're good */
                if (cfg80211_off_channel_oper_allowed(wdev, chan))
                        continue;

                if (!cfg80211_wdev_on_sub_chan(wdev, chan, true)) {
                        err = -EBUSY;
                        goto out_free;
                }
        }

        /* copy the SSIDs, rejecting any that exceed the maximum length */
        i = 0;
        if (n_ssids) {
                nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) {
                        if (nla_len(attr) > IEEE80211_MAX_SSID_LEN) {
                                err = -EINVAL;
                                goto out_free;
                        }
                        request->ssids[i].ssid_len = nla_len(attr);
                        memcpy(request->ssids[i].ssid, nla_data(attr), nla_len(attr));
                        i++;
                }
        }

        if (info->attrs[NL80211_ATTR_IE]) {
                request->ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
                memcpy((void *)request->ie,
                       nla_data(info->attrs[NL80211_ATTR_IE]),
                       request->ie_len);
        }

        /* default: one bit set for each of the band's n_bitrates rates */
        for (i = 0; i < NUM_NL80211_BANDS; i++)
                if (wiphy->bands[i])
                        request->rates[i] =
                                (1 << wiphy->bands[i]->n_bitrates) - 1;

        /* per-band overrides; the nested attribute type is the band index */
        if (info->attrs[NL80211_ATTR_SCAN_SUPP_RATES]) {
                nla_for_each_nested(attr,
                                    info->attrs[NL80211_ATTR_SCAN_SUPP_RATES],
                                    tmp) {
                        enum nl80211_band band = nla_type(attr);

                        if (band < 0 || band >= NUM_NL80211_BANDS) {
                                err = -EINVAL;
                                goto out_free;
                        }

                        if (!wiphy->bands[band])
                                continue;

                        err = ieee80211_get_ratemask(wiphy->bands[band],
                                                     nla_data(attr),
                                                     nla_len(attr),
                                                     &request->rates[band]);
                        if (err)
                                goto out_free;
                }
        }

        /* the mandatory flag is only read when a duration is supplied */
        if (info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]) {
                request->duration =
                        nla_get_u16(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]);
                request->duration_mandatory =
                        nla_get_flag(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY]);
        }

        err = nl80211_check_scan_flags(wiphy, wdev, request, info->attrs,
                                       false);
        if (err)
                goto out_free;

        request->no_cck =
                nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]);

        /* Initial implementation used NL80211_ATTR_MAC to set the specific
         * BSSID to scan for. This was problematic because that same attribute
         * was already used for another purpose (local random MAC address). The
         * NL80211_ATTR_BSSID attribute was added to fix this. For backwards
         * compatibility with older userspace components, also use the
         * NL80211_ATTR_MAC value here if it can be determined to be used for
         * the specific BSSID use case instead of the random MAC address
         * (NL80211_ATTR_SCAN_FLAGS is used to enable random MAC address use).
         */
        if (info->attrs[NL80211_ATTR_BSSID])
                memcpy(request->bssid,
                       nla_data(info->attrs[NL80211_ATTR_BSSID]), ETH_ALEN);
        else if (!(request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) &&
                 info->attrs[NL80211_ATTR_MAC])
                memcpy(request->bssid, nla_data(info->attrs[NL80211_ATTR_MAC]),
                       ETH_ALEN);
        else
                eth_broadcast_addr(request->bssid);

        request->tsf_report_link_id = nl80211_link_id_or_invalid(info->attrs);
        request->wdev = wdev;
        request->wiphy = &rdev->wiphy;
        request->scan_start = jiffies;

        /* publish the request before starting; out_free clears it again */
        rdev->scan_req = request;
        err = cfg80211_scan(rdev);

        if (err)
                goto out_free;

        nl80211_send_scan_start(rdev, wdev);
        dev_hold(wdev->netdev);

        return 0;

 out_free:
        rdev->scan_req = NULL;
        kfree(request);

        return err;
}

/*
 * genl handler asking the driver to abort the scan pending on this device.
 *
 * Returns -EOPNOTSUPP if the driver has no abort_scan op, 0 (without calling
 * the driver) if rdev->scan_msg is already set, -ENOENT if there is no
 * rdev->scan_req, and 0 after rdev_abort_scan() has been called otherwise.
 */
static int nl80211_abort_scan(struct sk_buff *skb, struct genl_info *info)
{
        struct wireless_dev *wdev = info->user_ptr[1];
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        int ret = 0;

        if (!rdev->ops->abort_scan) {
                ret = -EOPNOTSUPP;
        } else if (!rdev->scan_msg) {
                /* a non-NULL scan_msg short-circuits with success above */
                if (rdev->scan_req)
                        rdev_abort_scan(rdev, wdev);
                else
                        ret = -ENOENT;
        }

        return ret;
}

/*
 * Fill request->scan_plans[] from the netlink attributes.
 *
 * With %NL80211_ATTR_SCHED_SCAN_PLANS present, every nested plan is parsed
 * into the next slot: an interval is mandatory and must be within
 * 1..wiphy->max_sched_scan_plan_interval; an iteration count, when given,
 * must be within 1..wiphy->max_sched_scan_plan_iterations. Every plan but
 * the last needs an iteration count, and the last one must not have one.
 *
 * Without that attribute, %NL80211_ATTR_SCHED_SCAN_INTERVAL is read instead
 * and turned into a single plan (slot 0) whose iteration count is left
 * untouched.
 *
 * Returns 0 on success or a negative errno.
 */
static int
nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
                               struct cfg80211_sched_scan_request *request,
                               struct nlattr **attrs)
{
        struct nlattr *nested;
        u32 msecs, secs;
        int rem, ret;
        int idx = 0;

        if (attrs[NL80211_ATTR_SCHED_SCAN_PLANS]) {
                nla_for_each_nested(nested,
                                    attrs[NL80211_ATTR_SCHED_SCAN_PLANS],
                                    rem) {
                        struct nlattr *tb[NL80211_SCHED_SCAN_PLAN_MAX + 1];
                        u32 val;

                        if (WARN_ON(idx >= n_plans))
                                return -EINVAL;

                        ret = nla_parse_nested_deprecated(tb,
                                                          NL80211_SCHED_SCAN_PLAN_MAX,
                                                          nested,
                                                          nl80211_plan_policy,
                                                          NULL);
                        if (ret)
                                return ret;

                        if (!tb[NL80211_SCHED_SCAN_PLAN_INTERVAL])
                                return -EINVAL;

                        val = nla_get_u32(tb[NL80211_SCHED_SCAN_PLAN_INTERVAL]);
                        request->scan_plans[idx].interval = val;
                        if (val == 0 ||
                            val > wiphy->max_sched_scan_plan_interval)
                                return -EINVAL;

                        if (!tb[NL80211_SCHED_SCAN_PLAN_ITERATIONS]) {
                                /*
                                 * Only the final plan may leave the number
                                 * of iterations unspecified
                                 */
                                if (idx < n_plans - 1)
                                        return -EINVAL;
                        } else {
                                val = nla_get_u32(tb[NL80211_SCHED_SCAN_PLAN_ITERATIONS]);
                                request->scan_plans[idx].iterations = val;
                                if (val == 0 ||
                                    val > wiphy->max_sched_scan_plan_iterations)
                                        return -EINVAL;
                        }

                        idx++;
                }

                /*
                 * The final plan runs forever, so it must not carry an
                 * iteration count
                 */
                if (request->scan_plans[n_plans - 1].iterations)
                        return -EINVAL;

                return 0;
        }

        /*
         * No scan plans given: %NL80211_ATTR_SCHED_SCAN_INTERVAL describes
         * the one and only plan, which has no iteration limit.
         */
        msecs = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]);
        if (!msecs)
                return -EINVAL;

        /* the plan interval is the attribute value divided by MSEC_PER_SEC */
        secs = DIV_ROUND_UP(msecs, MSEC_PER_SEC);
        request->scan_plans[0].interval = secs;
        if (!secs)
                return -EINVAL;

        /* clamp to what the device supports instead of failing */
        if (secs > wiphy->max_sched_scan_plan_interval)
                request->scan_plans[0].interval =
                        wiphy->max_sched_scan_plan_interval;

        return 0;
}

static struct cfg80211_sched_scan_request *
nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
                         struct nlattr **attrs, int max_match_sets)
{
        struct cfg80211_sched_scan_request *request;
        struct nlattr *attr;
        int err, tmp, n_ssids = 0, n_match_sets = 0, n_channels, i, n_plans = 0;
        enum nl80211_band band;
        size_t ie_len, size;
        struct nlattr *tb[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1];
        s32 default_match_rssi = NL80211_SCAN_RSSI_THOLD_OFF;

        if (attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
                n_channels = validate_scan_freqs(
                                attrs[NL80211_ATTR_SCAN_FREQUENCIES]);
                if (!n_channels)
                        return ERR_PTR(-EINVAL);
        } else {
                n_channels = ieee80211_get_num_supported_channels(wiphy);
        }

        if (attrs[NL80211_ATTR_SCAN_SSIDS])
                nla_for_each_nested(attr, attrs[NL80211_ATTR_SCAN_SSIDS],
                                    tmp)
                        n_ssids++;

        if (n_ssids > wiphy->max_sched_scan_ssids)
                return ERR_PTR(-EINVAL);

        /*
         * First, count the number of 'real' matchsets. Due to an issue with
         * the old implementation, matchsets containing only the RSSI attribute
         * (NL80211_SCHED_SCAN_MATCH_ATTR_RSSI) are considered as the 'default'
         * RSSI for all matchsets, rather than their own matchset for reporting
         * all APs with a strong RSSI. This is needed to be compatible with
         * older userspace that treated a matchset with only the RSSI as the
         * global RSSI for all other matchsets - if there are other matchsets.
         */
        if (attrs[NL80211_ATTR_SCHED_SCAN_MATCH]) {
                nla_for_each_nested(attr,
                                    attrs[NL80211_ATTR_SCHED_SCAN_MATCH],
                                    tmp) {
                        struct nlattr *rssi;

                        err = nla_parse_nested_deprecated(tb,
                                                          NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
                                                          attr,
                                                          nl80211_match_policy,
                                                          NULL);
                        if (err)
                                return ERR_PTR(err);

                        /* SSID and BSSID are mutually exclusive */
                        if (tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID] &&
                            tb[NL80211_SCHED_SCAN_MATCH_ATTR_BSSID])
                                return ERR_PTR(-EINVAL);

                        /* add other standalone attributes here */
                        if (tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID] ||
                            tb[NL80211_SCHED_SCAN_MATCH_ATTR_BSSID]) {
                                n_match_sets++;
                                continue;
                        }
                        rssi = tb[NL80211_SCHED_SCAN_MATCH_ATTR_RSSI];
                        if (rssi)
                                default_match_rssi = nla_get_s32(rssi);
                }
        }

        /* However, if there's no other matchset, add the RSSI one */
        if (!n_match_sets && default_match_rssi != NL80211_SCAN_RSSI_THOLD_OFF)
                n_match_sets = 1;

        if (n_match_sets > max_match_sets)
                return ERR_PTR(-EINVAL);

        if (attrs[NL80211_ATTR_IE])
                ie_len = nla_len(attrs[NL80211_ATTR_IE]);
        else
                ie_len = 0;

        if (ie_len > wiphy->max_sched_scan_ie_len)
                return ERR_PTR(-EINVAL);

        if (attrs[NL80211_ATTR_SCHED_SCAN_PLANS]) {
                /*
                 * NL80211_ATTR_SCHED_SCAN_INTERVAL must not be specified since
                 * each scan plan already specifies its own interval
                 */
                if (attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL])
                        return ERR_PTR(-EINVAL);

                nla_for_each_nested(attr,
                                    attrs[NL80211_ATTR_SCHED_SCAN_PLANS], tmp)
                        n_plans++;
        } else {
                /*
                 * The scan interval attribute is kept for backward
                 * compatibility. If no scan plans are specified and sched scan
                 * interval is specified, one scan plan will be set with this
                 * scan interval and infinite number of iterations.
                 */
                if (!attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL])
                        return ERR_PTR(-EINVAL);

                n_plans = 1;
        }

        if (!n_plans || n_plans > wiphy->max_sched_scan_plans)
                return ERR_PTR(-EINVAL);

        if (!wiphy_ext_feature_isset(
                    wiphy, NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI) &&
            (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] ||
             attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]))
                return ERR_PTR(-EINVAL);

        size = struct_size(request, channels, n_channels);
        size = size_add(size, array_size(sizeof(*request->ssids), n_ssids));
        size = size_add(size, array_size(sizeof(*request->match_sets),
                                         n_match_sets));
        size = size_add(size, array_size(sizeof(*request->scan_plans),
                                         n_plans));
        size = size_add(size, ie_len);
        request = kzalloc(size, GFP_KERNEL);
        if (!request)
                return ERR_PTR(-ENOMEM);

        if (n_ssids)
                request->ssids = (void *)&request->channels[n_channels];
        request->n_ssids = n_ssids;
        if (ie_len) {
                if (n_ssids)
                        request->ie = (void *)(request->ssids + n_ssids);
                else
                        request->ie = (void *)(request->channels + n_channels);
        }

        if (n_match_sets) {
                if (request->ie)
                        request->match_sets = (void *)(request->ie + ie_len);
                else if (n_ssids)
                        request->match_sets =
                                (void *)(request->ssids + n_ssids);
                else
                        request->match_sets =
                                (void *)(request->channels + n_channels);
        }
        request->n_match_sets = n_match_sets;

        if (n_match_sets)
                request->scan_plans = (void *)(request->match_sets +
                                               n_match_sets);
        else if (request->ie)
                request->scan_plans = (void *)(request->ie + ie_len);
        else if (n_ssids)
                request->scan_plans = (void *)(request->ssids + n_ssids);
        else
                request->scan_plans = (void *)(request->channels + n_channels);

        request->n_scan_plans = n_plans;

        i = 0;
        if (attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
                /* user specified, bail out if channel not found */
                nla_for_each_nested(attr,
                                    attrs[NL80211_ATTR_SCAN_FREQUENCIES],
                                    tmp) {
                        struct ieee80211_channel *chan;

                        chan = ieee80211_get_channel(wiphy, nla_get_u32(attr));

                        if (!chan) {
                                err = -EINVAL;
                                goto out_free;
                        }

                        /* ignore disabled channels */
                        if (chan->flags & IEEE80211_CHAN_DISABLED)
                                continue;

                        request->channels[i] = chan;
                        i++;
                }
        } else {
                /* all channels */
                for (band = 0; band < NUM_NL80211_BANDS; band++) {
                        int j;

                        if (!wiphy->bands[band])
                                continue;
                        for (j = 0; j < wiphy->bands[band]->n_channels; j++) {
                                struct ieee80211_channel *chan;

                                chan = &wiphy->bands[band]->channels[j];

                                if (chan->flags & IEEE80211_CHAN_DISABLED)
                                        continue;

                                request->channels[i] = chan;
                                i++;
                        }
                }
        }

        if (!i) {
                err = -EINVAL;
                goto out_free;
        }

        request->n_channels = i;

        i = 0;
        if (n_ssids) {
                nla_for_each_nested(attr, attrs[NL80211_ATTR_SCAN_SSIDS],
                                    tmp) {
                        if (nla_len(attr) > IEEE80211_MAX_SSID_LEN) {
                                err = -EINVAL;
                                goto out_free;
                        }
                        request->ssids[i].ssid_len = nla_len(attr);
                        memcpy(request->ssids[i].ssid, nla_data(attr),
                               nla_len(attr));
                        i++;
                }
        }

        i = 0;
        if (attrs[NL80211_ATTR_SCHED_SCAN_MATCH]) {
                nla_for_each_nested(attr,
                                    attrs[NL80211_ATTR_SCHED_SCAN_MATCH],
                                    tmp) {
                        struct nlattr *ssid, *bssid, *rssi;

                        err = nla_parse_nested_deprecated(tb,
                                                          NL80211_SCHED_SCAN_MATCH_ATTR_MAX,
                                                          attr,
                                                          nl80211_match_policy,
                                                          NULL);
                        if (err)
                                goto out_free;
                        ssid = tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID];
                        bssid = tb[NL80211_SCHED_SCAN_MATCH_ATTR_BSSID];

                        if (!ssid && !bssid) {
                                i++;
                                continue;
                        }

                        if (WARN_ON(i >= n_match_sets)) {
                                /* this indicates a programming error,
                                 * the loop above should have verified
                                 * things properly
                                 */
                                err = -EINVAL;
                                goto out_free;
                        }

                        if (ssid) {
                                memcpy(request->match_sets[i].ssid.ssid,
                                       nla_data(ssid), nla_len(ssid));
                                request->match_sets[i].ssid.ssid_len =
                                        nla_len(ssid);
                        }
                        if (bssid)
                                memcpy(request->match_sets[i].bssid,
                                       nla_data(bssid), ETH_ALEN);

                        /* special attribute - old implementation w/a */
                        request->match_sets[i].rssi_thold = default_match_rssi;
                        rssi = tb[NL80211_SCHED_SCAN_MATCH_ATTR_RSSI];
                        if (rssi)
                                request->match_sets[i].rssi_thold =
                                        nla_get_s32(rssi);
                        i++;
                }

                /* there was no other matchset, so the RSSI one is alone */
                if (i == 0 && n_match_sets)
                        request->match_sets[0].rssi_thold = default_match_rssi;

                request->min_rssi_thold = INT_MAX;
                for (i = 0; i < n_match_sets; i++)
                        request->min_rssi_thold =
                                min(request->match_sets[i].rssi_thold,
                                    request->min_rssi_thold);
        } else {
                request->min_rssi_thold = NL80211_SCAN_RSSI_THOLD_OFF;
        }

        if (ie_len) {
                request->ie_len = ie_len;
                memcpy((void *)request->ie,
                       nla_data(attrs[NL80211_ATTR_IE]),
                       request->ie_len);
        }

        err = nl80211_check_scan_flags(wiphy, wdev, request, attrs, true);
        if (err)
                goto out_free;

        if (attrs[NL80211_ATTR_SCHED_SCAN_DELAY])
                request->delay =
                        nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_DELAY]);

        if (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]) {
                request->relative_rssi = nla_get_s8(
                        attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]);
                request->relative_rssi_set = true;
        }

        if (request->relative_rssi_set &&
            attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]) {
                struct nl80211_bss_select_rssi_adjust *rssi_adjust;

                rssi_adjust = nla_data(
                        attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]);
                request->rssi_adjust.band = rssi_adjust->band;
                request->rssi_adjust.delta = rssi_adjust->delta;
                if (!is_band_valid(wiphy, request->rssi_adjust.band)) {
                        err = -EINVAL;
                        goto out_free;
                }
        }

        err = nl80211_parse_sched_scan_plans(wiphy, n_plans, request, attrs);
        if (err)
                goto out_free;

        request->scan_start = jiffies;

        return request;

out_free:
        kfree(request);
        return ERR_PTR(err);
}

/*
 * genl handler starting a scheduled scan.
 *
 * Fails with -EOPNOTSUPP when the device advertises no scheduled scan
 * requests or the driver lacks sched_scan_start. Otherwise the request is
 * parsed from the attributes, handed to the driver and, once the driver has
 * accepted it, added to the device's request list and announced with
 * NL80211_CMD_START_SCHED_SCAN.
 *
 * Returns 0 on success or a negative errno; the parsed request is freed
 * here if the driver rejects it.
 */
static int nl80211_start_sched_scan(struct sk_buff *skb,
                                    struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *netdev = info->user_ptr[1];
        struct cfg80211_sched_scan_request *req;
        bool multi;
        int ret;

        if (!rdev->ops->sched_scan_start || !rdev->wiphy.max_sched_scan_reqs)
                return -EOPNOTSUPP;

        multi = info->attrs[NL80211_ATTR_SCHED_SCAN_MULTI];
        ret = cfg80211_sched_scan_req_possible(rdev, multi);
        if (ret)
                return ret;

        req = nl80211_parse_sched_scan(&rdev->wiphy, netdev->ieee80211_ptr,
                                       info->attrs,
                                       rdev->wiphy.max_match_sets);
        ret = PTR_ERR_OR_ZERO(req);
        if (ret)
                return ret;

        /*
         * Only multi-capable requests on multi-capable devices get a
         * cookie; reqid stays zero for everything else.
         */
        if (multi && rdev->wiphy.max_sched_scan_reqs > 1)
                req->reqid = cfg80211_assign_cookie(rdev);

        ret = rdev_sched_scan_start(rdev, netdev, req);
        if (ret) {
                kfree(req);
                return ret;
        }

        req->dev = netdev;
        req->wiphy = &rdev->wiphy;
        if (info->attrs[NL80211_ATTR_SOCKET_OWNER])
                req->owner_nlportid = info->snd_portid;

        cfg80211_add_sched_scan_req(rdev, req);
        nl80211_send_sched_scan(req, NL80211_CMD_START_SCHED_SCAN);

        return 0;
}

/*
 * genl handler stopping a scheduled scan.
 *
 * When %NL80211_ATTR_COOKIE is given, the request with that cookie is
 * stopped. Otherwise only the first request on the device's list is
 * considered, and only if it has a zero reqid and is either unowned or owned
 * by the sending port; anything else yields -ENOENT.
 *
 * Returns -EOPNOTSUPP when the device advertises no scheduled scan requests
 * or the driver lacks sched_scan_stop.
 */
static int nl80211_stop_sched_scan(struct sk_buff *skb,
                                   struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct cfg80211_sched_scan_request *first;

        if (!rdev->ops->sched_scan_stop || !rdev->wiphy.max_sched_scan_reqs)
                return -EOPNOTSUPP;

        if (info->attrs[NL80211_ATTR_COOKIE])
                return __cfg80211_stop_sched_scan(rdev,
                                nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]),
                                false);

        first = list_first_or_null_rcu(&rdev->sched_scan_req_list,
                                       struct cfg80211_sched_scan_request,
                                       list);
        if (!first)
                return -ENOENT;

        /* a request carrying a reqid can only be stopped via its cookie */
        if (first->reqid)
                return -ENOENT;

        /* an owned request may only be stopped by its owner */
        if (first->owner_nlportid &&
            first->owner_nlportid != info->snd_portid)
                return -ENOENT;

        return cfg80211_stop_sched_scan_req(rdev, first, false);
}

/*
 * genl handler starting radar detection (CAC) on the channel definition
 * given in the request.
 *
 * Only AP, P2P-GO, mesh point and ad-hoc interfaces are accepted. With the
 * wiphy locked, the DFS region must be set and the parsed chandef must both
 * require DFS and be DFS-usable. A request flagged with
 * %NL80211_ATTR_RADAR_BACKGROUND is passed to the background radar
 * detection code; otherwise the driver is asked to start radar detection
 * and, on success, the CAC state is recorded in the wdev.
 *
 * Returns 0 on success or a negative errno.
 */
static int nl80211_start_radar_detection(struct sk_buff *skb,
                                         struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct wiphy *wiphy = wdev->wiphy;
        struct cfg80211_chan_def chandef;
        unsigned int cac_ms;
        int ret;

        flush_delayed_work(&rdev->dfs_update_channels_wk);

        /* caution - see cfg80211_beaconing_iface_active() below */
        if (wdev->iftype != NL80211_IFTYPE_AP &&
            wdev->iftype != NL80211_IFTYPE_P2P_GO &&
            wdev->iftype != NL80211_IFTYPE_MESH_POINT &&
            wdev->iftype != NL80211_IFTYPE_ADHOC)
                return -EINVAL;

        wiphy_lock(wiphy);

        if (reg_get_dfs_region(wiphy) == NL80211_DFS_UNSET) {
                ret = -EINVAL;
                goto out_unlock;
        }

        ret = nl80211_parse_chandef(rdev, info, &chandef);
        if (ret)
                goto out_unlock;

        ret = cfg80211_chandef_dfs_required(wiphy, &chandef, wdev->iftype);
        if (ret < 0)
                goto out_unlock;

        /* the channel must need DFS and still be usable for it */
        if (ret == 0 || !cfg80211_chandef_dfs_usable(wiphy, &chandef)) {
                ret = -EINVAL;
                goto out_unlock;
        }

        if (nla_get_flag(info->attrs[NL80211_ATTR_RADAR_BACKGROUND])) {
                ret = cfg80211_start_background_radar_detection(rdev, wdev,
                                                                &chandef);
                goto out_unlock;
        }

        if (cfg80211_beaconing_iface_active(wdev) || wdev->cac_started) {
                ret = -EBUSY;
                goto out_unlock;
        }

        /*
         * CAC start is offloaded to HW and can't be started manually;
         * likewise there is nothing to do without a driver op.
         */
        if (wiphy_ext_feature_isset(wiphy, NL80211_EXT_FEATURE_DFS_OFFLOAD) ||
            !rdev->ops->start_radar_detection) {
                ret = -EOPNOTSUPP;
                goto out_unlock;
        }

        cac_ms = cfg80211_chandef_dfs_cac_time(&rdev->wiphy, &chandef);
        if (WARN_ON(!cac_ms))
                cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;

        ret = rdev_start_radar_detection(rdev, dev, &chandef, cac_ms);
        if (ret)
                goto out_unlock;

        /* driver accepted: remember the running CAC */
        wdev->links[0].ap.chandef = chandef;
        wdev->cac_started = true;
        wdev->cac_start_time = jiffies;
        wdev->cac_time_ms = cac_ms;

out_unlock:
        wiphy_unlock(wiphy);

        return ret;
}

/*
 * genl handler for a radar indication coming from userspace.
 *
 * The DFS region must be set and the chandef from the request must require
 * DFS for this interface type. Unless the channel is already marked
 * NL80211_DFS_UNAVAILABLE, the chandef is set to that state, a DFS channel
 * update is scheduled, the chandef is stored in rdev->radar_chandef and the
 * propagation work is queued.
 *
 * Returns 0 on success (including the already-unavailable case) or a
 * negative errno with an extack message.
 */
static int nl80211_notify_radar_detection(struct sk_buff *skb,
                                          struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct wiphy *wiphy = wdev->wiphy;
        struct cfg80211_chan_def chandef;
        int ret;

        if (reg_get_dfs_region(wiphy) == NL80211_DFS_UNSET) {
                GENL_SET_ERR_MSG(info,
                                 "DFS Region is not set. Unexpected Radar indication");
                return -EINVAL;
        }

        ret = nl80211_parse_chandef(rdev, info, &chandef);
        if (ret) {
                GENL_SET_ERR_MSG(info, "Unable to extract chandef info");
                return ret;
        }

        ret = cfg80211_chandef_dfs_required(wiphy, &chandef, wdev->iftype);
        if (ret <= 0) {
                if (ret < 0) {
                        GENL_SET_ERR_MSG(info, "chandef is invalid");
                        return ret;
                }

                GENL_SET_ERR_MSG(info,
                                 "Unexpected Radar indication for chandef/iftype");
                return -EINVAL;
        }

        /* Radar already recorded on this channel: report success and
         * leave everything as is.
         */
        if (chandef.chan->dfs_state != NL80211_DFS_UNAVAILABLE) {
                cfg80211_set_dfs_state(wiphy, &chandef,
                                       NL80211_DFS_UNAVAILABLE);

                cfg80211_sched_dfs_chan_update(rdev);

                rdev->radar_chandef = chandef;

                /* Propagate this notification to other radios as well */
                queue_work(cfg80211_wq, &rdev->propagate_radar_detect_wk);
        }

        return 0;
}

/*
 * Extract an array of u16 counter offsets from @attr and sanity check it
 * against the frame template @data of length @datalen.
 *
 * *n_offsets is always written (0 when @attr is absent); *offsets is pointed
 * at the attribute payload once the length checks have passed. Every offset
 * must lie inside @data and, unless @first_count is -1, the byte found there
 * must equal @first_count.
 *
 * Returns 0 on success (or when @attr is NULL), -EINVAL otherwise.
 */
static int nl80211_parse_counter_offsets(struct cfg80211_registered_device *rdev,
                                         const u8 *data, size_t datalen,
                                         int first_count, struct nlattr *attr,
                                         const u16 **offsets, unsigned int *n_offsets)
{
        const u16 *offs;
        unsigned int count, idx;
        int len;

        *n_offsets = 0;

        if (!attr)
                return 0;

        /* payload must be a non-empty whole number of u16 entries */
        len = nla_len(attr);
        if (len == 0 || len % sizeof(u16) != 0)
                return -EINVAL;

        count = len / sizeof(u16);
        *n_offsets = count;

        /* a zero max_num_csa_counters means no limit is enforced here */
        if (rdev->wiphy.max_num_csa_counters &&
            count > rdev->wiphy.max_num_csa_counters)
                return -EINVAL;

        offs = nla_data(attr);
        *offsets = offs;

        /* sanity checks - counters should fit and be the same */
        for (idx = 0; idx < count; idx++) {
                u16 off = offs[idx];

                if (off >= datalen)
                        return -EINVAL;

                if (first_count != -1 && data[off] != first_count)
                        return -EINVAL;
        }

        return 0;
}

static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        unsigned int link_id = nl80211_link_id(info->attrs);
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_csa_settings params;
        struct nlattr **csa_attrs = NULL;
        int err;
        bool need_new_beacon = false;
        bool need_handle_dfs_flag = true;
        u32 cs_count;

        if (!rdev->ops->channel_switch ||
            !(rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH))
                return -EOPNOTSUPP;

        switch (dev->ieee80211_ptr->iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_P2P_GO:
                need_new_beacon = true;
                /* For all modes except AP the handle_dfs flag needs to be
                 * supplied to tell the kernel that userspace will handle radar
                 * events when they happen. Otherwise a switch to a channel
                 * requiring DFS will be rejected.
                 */
                need_handle_dfs_flag = false;

                /* useless if AP is not running */
                if (!wdev->links[link_id].ap.beacon_interval)
                        return -ENOTCONN;
                break;
        case NL80211_IFTYPE_ADHOC:
                if (!wdev->u.ibss.ssid_len)
                        return -ENOTCONN;
                break;
        case NL80211_IFTYPE_MESH_POINT:
                if (!wdev->u.mesh.id_len)
                        return -ENOTCONN;
                break;
        default:
                return -EOPNOTSUPP;
        }

        memset(&params, 0, sizeof(params));
        params.beacon_csa.ftm_responder = -1;

        if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] ||
            !info->attrs[NL80211_ATTR_CH_SWITCH_COUNT])
                return -EINVAL;

        /* only important for AP, IBSS and mesh create IEs internally */
        if (need_new_beacon && !info->attrs[NL80211_ATTR_CSA_IES])
                return -EINVAL;

        /* Even though the attribute is u32, the specification says
         * u8, so let's make sure we don't overflow.
         */
        cs_count = nla_get_u32(info->attrs[NL80211_ATTR_CH_SWITCH_COUNT]);
        if (cs_count > 255)
                return -EINVAL;

        params.count = cs_count;

        if (!need_new_beacon)
                goto skip_beacons;

        err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon_after,
                                   info->extack);
        if (err)
                goto free;

        csa_attrs = kcalloc(NL80211_ATTR_MAX + 1, sizeof(*csa_attrs),
                            GFP_KERNEL);
        if (!csa_attrs) {
                err = -ENOMEM;
                goto free;
        }

        err = nla_parse_nested_deprecated(csa_attrs, NL80211_ATTR_MAX,
                                          info->attrs[NL80211_ATTR_CSA_IES],
                                          nl80211_policy, info->extack);
        if (err)
                goto free;

        err = nl80211_parse_beacon(rdev, csa_attrs, &params.beacon_csa,
                                   info->extack);
        if (err)
                goto free;

        if (!csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]) {
                err = -EINVAL;
                goto free;
        }

        err = nl80211_parse_counter_offsets(rdev, params.beacon_csa.tail,
                                            params.beacon_csa.tail_len,
                                            params.count,
                                            csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON],
                                            &params.counter_offsets_beacon,
                                            &params.n_counter_offsets_beacon);
        if (err)
                goto free;

        err = nl80211_parse_counter_offsets(rdev, params.beacon_csa.probe_resp,
                                            params.beacon_csa.probe_resp_len,
                                            params.count,
                                            csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP],
                                            &params.counter_offsets_presp,
                                            &params.n_counter_offsets_presp);
        if (err)
                goto free;

skip_beacons:
        err = nl80211_parse_chandef(rdev, info, &params.chandef);
        if (err)
                goto free;

        if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &params.chandef,
                                           wdev->iftype)) {
                err = -EINVAL;
                goto free;
        }

        err = cfg80211_chandef_dfs_required(wdev->wiphy,
                                            &params.chandef,
                                            wdev->iftype);
        if (err < 0)
                goto free;

        if (err > 0) {
                params.radar_required = true;
                if (need_handle_dfs_flag &&
                    !nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS])) {
                        err = -EINVAL;
                        goto free;
                }
        }

        if (info->attrs[NL80211_ATTR_CH_SWITCH_BLOCK_TX])
                params.block_tx = true;

        params.link_id = link_id;
        err = rdev_channel_switch(rdev, dev, &params);

free:
        kfree(params.beacon_after.mbssid_ies);
        kfree(params.beacon_csa.mbssid_ies);
        kfree(params.beacon_after.rnr_ies);
        kfree(params.beacon_csa.rnr_ies);
        kfree(csa_attrs);
        return err;
}

/*
 * Append one NL80211_CMD_NEW_SCAN_RESULTS message describing @intbss to @msg.
 *
 * The message carries the BSS generation counter of @rdev, the identifiers of
 * @wdev and a nested NL80211_ATTR_BSS attribute with the per-BSS data.
 * The caller must hold the wiphy lock of @wdev (see lockdep_assert_wiphy()).
 *
 * Returns 0 on success, -1 if nl80211hdr_put() fails, and -EMSGSIZE (after
 * cancelling the partially built message) if adding any attribute fails.
 */
static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
                            u32 seq, int flags,
                            struct cfg80211_registered_device *rdev,
                            struct wireless_dev *wdev,
                            struct cfg80211_internal_bss *intbss)
{
        struct cfg80211_bss *res = &intbss->pub;
        const struct cfg80211_bss_ies *ies;
        unsigned int link_id;
        void *hdr;
        struct nlattr *bss;

        lockdep_assert_wiphy(wdev->wiphy);

        hdr = nl80211hdr_put(msg, NETLINK_CB(cb->skb).portid, seq, flags,
                             NL80211_CMD_NEW_SCAN_RESULTS);
        if (!hdr)
                return -1;

        genl_dump_check_consistent(cb, hdr);

        /* top-level attributes: generation, then the interface identifiers */
        if (nla_put_u32(msg, NL80211_ATTR_GENERATION, rdev->bss_generation))
                goto nla_put_failure;
        /* the ifindex is only reported when the wdev has a netdev */
        if (wdev->netdev &&
            nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex))
                goto nla_put_failure;
        if (nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
                              NL80211_ATTR_PAD))
                goto nla_put_failure;

        /* everything below is nested inside NL80211_ATTR_BSS */
        bss = nla_nest_start_noflag(msg, NL80211_ATTR_BSS);
        if (!bss)
                goto nla_put_failure;
        /* an all-zero BSSID is not reported */
        if ((!is_zero_ether_addr(res->bssid) &&
             nla_put(msg, NL80211_BSS_BSSID, ETH_ALEN, res->bssid)))
                goto nla_put_failure;

        /*
         * The IE pointers are read under RCU; any failure inside this
         * section must leave through fail_unlock_rcu so the read lock is
         * dropped before the message is cancelled.
         */
        rcu_read_lock();
        /* indicate whether we have probe response data or not */
        if (rcu_access_pointer(res->proberesp_ies) &&
            nla_put_flag(msg, NL80211_BSS_PRESP_DATA))
                goto fail_unlock_rcu;

        /* this pointer prefers to be pointed to probe response data
         * but is always valid
         */
        ies = rcu_dereference(res->ies);
        if (ies) {
                if (nla_put_u64_64bit(msg, NL80211_BSS_TSF, ies->tsf,
                                      NL80211_BSS_PAD))
                        goto fail_unlock_rcu;
                if (ies->len && nla_put(msg, NL80211_BSS_INFORMATION_ELEMENTS,
                                        ies->len, ies->data))
                        goto fail_unlock_rcu;
        }

        /* and this pointer is always (unless driver didn't know) beacon data */
        ies = rcu_dereference(res->beacon_ies);
        if (ies && ies->from_beacon) {
                if (nla_put_u64_64bit(msg, NL80211_BSS_BEACON_TSF, ies->tsf,
                                      NL80211_BSS_PAD))
                        goto fail_unlock_rcu;
                if (ies->len && nla_put(msg, NL80211_BSS_BEACON_IES,
                                        ies->len, ies->data))
                        goto fail_unlock_rcu;
        }
        rcu_read_unlock();

        /* a zero beacon interval is not reported */
        if (res->beacon_interval &&
            nla_put_u16(msg, NL80211_BSS_BEACON_INTERVAL, res->beacon_interval))
                goto nla_put_failure;
        /* SEEN_MS_AGO is the jiffies delta since intbss->ts, in milliseconds */
        if (nla_put_u16(msg, NL80211_BSS_CAPABILITY, res->capability) ||
            nla_put_u32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq) ||
            nla_put_u32(msg, NL80211_BSS_FREQUENCY_OFFSET,
                        res->channel->freq_offset) ||
            nla_put_u32(msg, NL80211_BSS_SEEN_MS_AGO,
                        jiffies_to_msecs(jiffies - intbss->ts)))
                goto nla_put_failure;

        /* parent TSF and parent BSSID are only reported together */
        if (intbss->parent_tsf &&
            (nla_put_u64_64bit(msg, NL80211_BSS_PARENT_TSF,
                               intbss->parent_tsf, NL80211_BSS_PAD) ||
             nla_put(msg, NL80211_BSS_PARENT_BSSID, ETH_ALEN,
                     intbss->parent_bssid)))
                goto nla_put_failure;

        if (intbss->ts_boottime &&
            nla_put_u64_64bit(msg, NL80211_BSS_LAST_SEEN_BOOTTIME,
                              intbss->ts_boottime, NL80211_BSS_PAD))
                goto nla_put_failure;

        /* note: nl80211_put_signal() is treated as failed when it returns 0 */
        if (!nl80211_put_signal(msg, intbss->pub.chains,
                                intbss->pub.chain_signal,
                                NL80211_BSS_CHAIN_SIGNAL))
                goto nla_put_failure;

        /* the signal attribute (and its width) depends on the wiphy's type */
        switch (rdev->wiphy.signal_type) {
        case CFG80211_SIGNAL_TYPE_MBM:
                if (nla_put_u32(msg, NL80211_BSS_SIGNAL_MBM, res->signal))
                        goto nla_put_failure;
                break;
        case CFG80211_SIGNAL_TYPE_UNSPEC:
                if (nla_put_u8(msg, NL80211_BSS_SIGNAL_UNSPEC, res->signal))
                        goto nla_put_failure;
                break;
        default:
                break;
        }

        /* report a status if this entry is the interface's current BSS */
        switch (wdev->iftype) {
        case NL80211_IFTYPE_P2P_CLIENT:
        case NL80211_IFTYPE_STATION:
                /*
                 * For each link whose current_bss is this entry, mark it as
                 * associated; when the wdev has valid_links set, also add
                 * the link ID and connected_addr as NL80211_BSS_MLD_ADDR.
                 */
                for_each_valid_link(wdev, link_id) {
                        if (intbss == wdev->links[link_id].client.current_bss &&
                            (nla_put_u32(msg, NL80211_BSS_STATUS,
                                         NL80211_BSS_STATUS_ASSOCIATED) ||
                             (wdev->valid_links &&
                              (nla_put_u8(msg, NL80211_BSS_MLO_LINK_ID,
                                          link_id) ||
                               nla_put(msg, NL80211_BSS_MLD_ADDR, ETH_ALEN,
                                       wdev->u.client.connected_addr)))))
                                goto nla_put_failure;
                }
                break;
        case NL80211_IFTYPE_ADHOC:
                if (intbss == wdev->u.ibss.current_bss &&
                    nla_put_u32(msg, NL80211_BSS_STATUS,
                                NL80211_BSS_STATUS_IBSS_JOINED))
                        goto nla_put_failure;
                break;
        default:
                break;
        }

        /* use_for is always reported; cannot_use_reasons only when non-zero */
        if (nla_put_u32(msg, NL80211_BSS_USE_FOR, res->use_for))
                goto nla_put_failure;

        if (res->cannot_use_reasons &&
            nla_put_u64_64bit(msg, NL80211_BSS_CANNOT_USE_REASONS,
                              res->cannot_use_reasons,
                              NL80211_BSS_PAD))
                goto nla_put_failure;

        nla_nest_end(msg, bss);

        genlmsg_end(msg, hdr);
        return 0;

        /* failure inside the RCU section: unlock, then fall through */
 fail_unlock_rcu:
        rcu_read_unlock();
 nla_put_failure:
        genlmsg_cancel(msg, hdr);
        return -EMSGSIZE;
}

/*
 * Netlink dump callback that walks rdev->bss_list and adds one message per
 * BSS entry to @skb via nl80211_send_bss().
 *
 * cb->args[2] holds the number of list entries already handled by earlier
 * invocations, so a dump that did not fit into one skb resumes where it
 * stopped.  Returns skb->len on success, -ENOMEM if the attribute buffer
 * cannot be allocated, or the error from nl80211_prepare_wdev_dump().
 */
static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct cfg80211_registered_device *rdev;
        struct cfg80211_internal_bss *entry;
        struct wireless_dev *wdev;
        struct nlattr **tb;
        int resume_at = cb->args[2];
        int pos = 0;
        bool include_use_data;
        int ret;

        tb = kcalloc(NUM_NL80211_ATTR, sizeof(*tb), GFP_KERNEL);
        if (!tb)
                return -ENOMEM;

        ret = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, tb);
        if (ret) {
                kfree(tb);
                return ret;
        }
        /* nl80211_prepare_wdev_dump acquired it in the successful case */
        __acquire(&rdev->wiphy.mtx);

        /* only the presence of the flag attribute matters */
        include_use_data = !!tb[NL80211_ATTR_BSS_DUMP_INCLUDE_USE_DATA];
        kfree(tb);

        spin_lock_bh(&rdev->bss_lock);

        /*
         * dump_scan will be called multiple times to break up the scan results
         * into multiple messages.  It is unlikely that any more bss-es will be
         * expired after the first call, so only call this on the first
         * dump_scan invocation.
         */
        if (resume_at == 0)
                cfg80211_bss_expire(rdev);

        cb->seq = rdev->bss_generation;

        list_for_each_entry(entry, &rdev->bss_list, list) {
                pos++;

                /* already sent by an earlier invocation */
                if (pos <= resume_at)
                        continue;

                /* entries not usable for normal purposes are opt-in */
                if (!include_use_data &&
                    !(entry->pub.use_for & NL80211_BSS_USE_FOR_NORMAL))
                        continue;

                if (nl80211_send_bss(skb, cb, cb->nlh->nlmsg_seq,
                                     NLM_F_MULTI, rdev, wdev, entry) >= 0)
                        continue;

                /* this entry was not sent; retry it on the next call */
                pos--;
                break;
        }

        spin_unlock_bh(&rdev->bss_lock);

        cb->args[2] = pos;
        wiphy_unlock(&rdev->wiphy);

        return skb->len;
}

/*
 * Add one NL80211_CMD_NEW_SURVEY_RESULTS message for @survey to @msg.
 *
 * A survey entry without a channel is skipped (returning 0 without touching
 * @msg) unless @allow_radio_stats is set.  Optional values are only emitted
 * when the matching SURVEY_INFO_* bit is set in survey->filled.
 *
 * Returns 0 on success or skip, -ENOMEM if nl80211hdr_put() fails, and
 * -EMSGSIZE (after cancelling the message) if adding any attribute fails.
 */
static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq,
                               int flags, struct net_device *dev,
                               bool allow_radio_stats,
                               struct survey_info *survey)
{
        struct nlattr *nest;
        void *hdr;

        /* skip radio stats if userspace didn't request them */
        if (!allow_radio_stats && !survey->channel)
                return 0;

        hdr = nl80211hdr_put(msg, portid, seq, flags,
                             NL80211_CMD_NEW_SURVEY_RESULTS);
        if (!hdr)
                return -ENOMEM;

        if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex))
                goto nla_put_failure;

        nest = nla_nest_start_noflag(msg, NL80211_ATTR_SURVEY_INFO);
        if (!nest)
                goto nla_put_failure;

        /* channel frequency, plus the offset when it is non-zero */
        if (survey->channel) {
                if (nla_put_u32(msg, NL80211_SURVEY_INFO_FREQUENCY,
                                survey->channel->center_freq))
                        goto nla_put_failure;
                if (survey->channel->freq_offset &&
                    nla_put_u32(msg, NL80211_SURVEY_INFO_FREQUENCY_OFFSET,
                                survey->channel->freq_offset))
                        goto nla_put_failure;
        }

        /* short-circuit evaluation keeps the attributes in this order */
        if (((survey->filled & SURVEY_INFO_NOISE_DBM) &&
             nla_put_u8(msg, NL80211_SURVEY_INFO_NOISE, survey->noise)) ||
            ((survey->filled & SURVEY_INFO_IN_USE) &&
             nla_put_flag(msg, NL80211_SURVEY_INFO_IN_USE)))
                goto nla_put_failure;

        /* the time counters, each padded with NL80211_SURVEY_INFO_PAD */
        if (((survey->filled & SURVEY_INFO_TIME) &&
             nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME,
                               survey->time, NL80211_SURVEY_INFO_PAD)) ||
            ((survey->filled & SURVEY_INFO_TIME_BUSY) &&
             nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_BUSY,
                               survey->time_busy, NL80211_SURVEY_INFO_PAD)) ||
            ((survey->filled & SURVEY_INFO_TIME_EXT_BUSY) &&
             nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_EXT_BUSY,
                               survey->time_ext_busy,
                               NL80211_SURVEY_INFO_PAD)) ||
            ((survey->filled & SURVEY_INFO_TIME_RX) &&
             nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_RX,
                               survey->time_rx, NL80211_SURVEY_INFO_PAD)) ||
            ((survey->filled & SURVEY_INFO_TIME_TX) &&
             nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_TX,
                               survey->time_tx, NL80211_SURVEY_INFO_PAD)) ||
            ((survey->filled & SURVEY_INFO_TIME_SCAN) &&
             nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_SCAN,
                               survey->time_scan, NL80211_SURVEY_INFO_PAD)) ||
            ((survey->filled & SURVEY_INFO_TIME_BSS_RX) &&
             nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_BSS_RX,
                               survey->time_bss_rx, NL80211_SURVEY_INFO_PAD)))
                goto nla_put_failure;

        nla_nest_end(msg, nest);

        genlmsg_end(msg, hdr);
        return 0;

 nla_put_failure:
        genlmsg_cancel(msg, hdr);
        return -EMSGSIZE;
}

static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct nlattr **attrbuf;
        struct survey_info survey;
        struct cfg80211_registered_device *rdev;
        struct wireless_dev *wdev;
        int survey_idx = cb->args[2];
        int res;
        bool radio_stats;

        attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf), GFP_KERNEL);
        if (!attrbuf)
                return -ENOMEM;

        res = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, attrbuf);
        if (res) {
                kfree(attrbuf);
                return res;
        }
        /* nl80211_prepare_wdev_dump acquired it in the successful case */
        __acquire(&rdev->wiphy.mtx);

        /* prepare_wdev_dump parsed the attributes */
        radio_stats = attrbuf[NL80211_ATTR_SURVEY_RADIO_STATS];

        if (!wdev->netdev) {
                res = -EINVAL;
                goto out_err;
        }

        if (!rdev->ops->dump_survey) {
                res = -EOPNOTSUPP;
                goto out_err;
        }

        while (1) {
                res = rdev_dump_survey(rdev, wdev->netdev, survey_idx, &survey);
                if (res == -ENOENT)
                        break;
                if (res)
                        goto out_err;

                /* don't send disabled channels, but do send non-channel data */
                if (survey.channel &&
                    survey.channel->flags & IEEE80211_CHAN_DISABLED) {
                        survey_idx++;
                        continue;
                }

                if (nl80211_send_survey(skb,
                                NETLINK_CB(cb->skb).portid,
                                cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                wdev->netdev, radio_stats, &survey) < 0)
                        goto out;
                survey_idx++;
        }

 out:
        cb->args[2] = survey_idx;
        res = skb->len;
 out_err:
        kfree(attrbuf);
        wiphy_unlock(&rdev->wiphy);
        return res;
}

/*
 * Handler that validates an authentication request from userspace and hands
 * it to cfg80211_mlme_auth().
 *
 * The MAC, auth type, SSID and frequency attributes are mandatory.  An
 * optional key must be a WEP40/WEP104 key with index 0..3 whose cipher is
 * listed in the wiphy's cipher suites.  SAE and the FILS auth types require
 * NL80211_ATTR_AUTH_DATA, all other types must not carry it.
 *
 * Returns 0 on success (or when only a local state change was requested),
 * -EINVAL for malformed requests, -EOPNOTSUPP if the driver has no auth op
 * or the interface is neither station nor P2P client, -ENOENT if the BSS is
 * not found, or the error from nl80211_parse_key()/cfg80211_mlme_auth().
 */
static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct cfg80211_auth_request req = {};
        struct ieee80211_channel *chan;
        enum nl80211_auth_type auth_type;
        const u8 *bssid, *ssid;
        struct key_parse key;
        bool needs_auth_data;
        int err, ssid_len;
        u32 freq;

        /* mandatory attributes */
        if (!info->attrs[NL80211_ATTR_MAC] ||
            !info->attrs[NL80211_ATTR_AUTH_TYPE] ||
            !info->attrs[NL80211_ATTR_SSID] ||
            !info->attrs[NL80211_ATTR_WIPHY_FREQ])
                return -EINVAL;

        err = nl80211_parse_key(info, &key);
        if (err)
                return err;

        if (key.idx < 0) {
                /* no key given */
                key.p.key_len = 0;
                key.p.key = NULL;
        } else {
                bool wep40, wep104, supported = false;
                int i;

                if (key.type != -1 && key.type != NL80211_KEYTYPE_GROUP)
                        return -EINVAL;
                if (!key.p.key || !key.p.key_len)
                        return -EINVAL;

                /* cipher and key length must match as a pair */
                wep40 = key.p.cipher == WLAN_CIPHER_SUITE_WEP40 &&
                        key.p.key_len == WLAN_KEY_LEN_WEP40;
                wep104 = key.p.cipher == WLAN_CIPHER_SUITE_WEP104 &&
                         key.p.key_len == WLAN_KEY_LEN_WEP104;
                if (!wep40 && !wep104)
                        return -EINVAL;
                if (key.idx > 3)
                        return -EINVAL;

                /* the cipher must be one the wiphy advertises */
                for (i = 0; i < rdev->wiphy.n_cipher_suites; i++) {
                        if (rdev->wiphy.cipher_suites[i] == key.p.cipher) {
                                supported = true;
                                break;
                        }
                }
                if (!supported)
                        return -EINVAL;
        }

        if (!rdev->ops->auth)
                return -EOPNOTSUPP;

        switch (dev->ieee80211_ptr->iftype) {
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_P2P_CLIENT:
                break;
        default:
                return -EOPNOTSUPP;
        }

        bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);

        /* frequency in kHz: the MHz attribute plus the optional offset */
        freq = MHZ_TO_KHZ(nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]));
        if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET])
                freq +=
                    nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]);

        chan = nl80211_get_valid_chan(&rdev->wiphy, freq);
        if (!chan)
                return -EINVAL;

        ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
        ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);

        if (info->attrs[NL80211_ATTR_IE]) {
                req.ie = nla_data(info->attrs[NL80211_ATTR_IE]);
                req.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
        }

        auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]);
        if (!nl80211_valid_auth_type(rdev, auth_type, NL80211_CMD_AUTHENTICATE))
                return -EINVAL;

        switch (auth_type) {
        case NL80211_AUTHTYPE_SAE:
        case NL80211_AUTHTYPE_FILS_SK:
        case NL80211_AUTHTYPE_FILS_SK_PFS:
        case NL80211_AUTHTYPE_FILS_PK:
                needs_auth_data = true;
                break;
        default:
                needs_auth_data = false;
                break;
        }

        /* auth data must be present exactly when the auth type needs it */
        if (info->attrs[NL80211_ATTR_AUTH_DATA]) {
                if (!needs_auth_data)
                        return -EINVAL;
                req.auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]);
                req.auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]);
        } else if (needs_auth_data) {
                return -EINVAL;
        }

        /*
         * Since we no longer track auth state, ignore
         * requests to only change local state.
         */
        if (info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE])
                return 0;

        req.auth_type = auth_type;
        req.key = key.p.key;
        req.key_len = key.p.key_len;
        req.key_idx = key.idx;
        req.link_id = nl80211_link_id_or_invalid(info->attrs);

        /* a link ID needs MLO support and a valid AP MLD address */
        if (req.link_id >= 0) {
                if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_MLO))
                        return -EINVAL;
                if (!info->attrs[NL80211_ATTR_MLD_ADDR])
                        return -EINVAL;
                req.ap_mld_addr = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]);
                if (!is_valid_ether_addr(req.ap_mld_addr))
                        return -EINVAL;
        }

        req.bss = cfg80211_get_bss(&rdev->wiphy, chan, bssid, ssid, ssid_len,
                                   IEEE80211_BSS_TYPE_ESS,
                                   IEEE80211_PRIVACY_ANY);
        if (!req.bss)
                return -ENOENT;

        err = cfg80211_mlme_auth(rdev, dev, &req);

        /* drop the reference obtained by cfg80211_get_bss() above */
        cfg80211_put_bss(&rdev->wiphy, req.bss);

        return err;
}

static int validate_pae_over_nl80211(struct cfg80211_registered_device *rdev,
                                     struct genl_info *info)
{
        if (!info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
                GENL_SET_ERR_MSG(info, "SOCKET_OWNER not set");
                return -EINVAL;
        }

        if (!rdev->ops->tx_control_port ||
            !wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211))
                return -EOPNOTSUPP;

        return 0;
}

static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
                                   struct genl_info *info,
                                   struct cfg80211_crypto_settings *settings,
                                   int cipher_limit)
{
        memset(settings, 0, sizeof(*settings));

        settings->control_port = info->attrs[NL80211_ATTR_CONTROL_PORT];

        if (info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) {
                u16 proto;

                proto = nla_get_u16(
                        info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]);
                settings->control_port_ethertype = cpu_to_be16(proto);
                if (!(rdev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) &&
                    proto != ETH_P_PAE)
                        return -EINVAL;
                if (info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT])
                        settings->control_port_no_encrypt = true;
        } else
                settings->control_port_ethertype = cpu_to_be16(ETH_P_PAE);

        if (info->attrs[NL80211_ATTR_CONTROL_PORT_OVER_NL80211]) {
                int r = validate_pae_over_nl80211(rdev, info);

                if (r < 0)
                        return r;

                settings->control_port_over_nl80211 = true;

                if (info->attrs[NL80211_ATTR_CONTROL_PORT_NO_PREAUTH])
                        settings->control_port_no_preauth = true;
        }

        if (info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]) {
                void *data;
                int len, i;

                data = nla_data(info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]);
                len = nla_len(info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]);
                settings->n_ciphers_pairwise = len / sizeof(u32);

                if (len % sizeof(u32))
                        return -EINVAL;

                if (settings->n_ciphers_pairwise > cipher_limit)
                        return -EINVAL;

                memcpy(settings->ciphers_pairwise, data, len);

                for (i = 0; i < settings->n_ciphers_pairwise; i++)
                        if (!cfg80211_supported_cipher_suite(
                                        &rdev->wiphy,
                                        settings->ciphers_pairwise[i]))
                                return -EINVAL;
        }

        if (info->attrs[NL80211_ATTR_CIPHER_SUITE_GROUP]) {
                settings->cipher_group =
                        nla_get_u32(info->attrs[NL80211_ATTR_CIPHER_SUITE_GROUP]);
                if (!cfg80211_supported_cipher_suite(&rdev->wiphy,
                                                     settings->cipher_group))
                        return -EINVAL;
        }

        if (info->attrs[NL80211_ATTR_WPA_VERSIONS])
                settings->wpa_versions =
                        nla_get_u32(info->attrs[NL80211_ATTR_WPA_VERSIONS]);

        if (info->attrs[NL80211_ATTR_AKM_SUITES]) {
                void *data;
                int len;

                data = nla_data(info->attrs[NL80211_ATTR_AKM_SUITES]);
                len = nla_len(info->attrs[NL80211_ATTR_AKM_SUITES]);
                settings->n_akm_suites = len / sizeof(u32);

                if (len % sizeof(u32))
                        return -EINVAL;

                if (settings->n_akm_suites > rdev->wiphy.max_num_akm_suites)
                        return -EINVAL;

                memcpy(settings->akm_suites, data, len);
        }

        if (info->attrs[NL80211_ATTR_PMK]) {
                if (nla_len(info->attrs[NL80211_ATTR_PMK]) != WLAN_PMK_LEN)
                        return -EINVAL;
                if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK) &&
                    !wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK))
                        return -EINVAL;
                settings->psk = nla_data(info->attrs[NL80211_ATTR_PMK]);
        }

        if (info->attrs[NL80211_ATTR_SAE_PASSWORD]) {
                if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_SAE_OFFLOAD) &&
                    !wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_SAE_OFFLOAD_AP))
                        return -EINVAL;
                settings->sae_pwd =
                        nla_data(info->attrs[NL80211_ATTR_SAE_PASSWORD]);
                settings->sae_pwd_len =
                        nla_len(info->attrs[NL80211_ATTR_SAE_PASSWORD]);
        }

        if (info->attrs[NL80211_ATTR_SAE_PWE])
                settings->sae_pwe =
                        nla_get_u8(info->attrs[NL80211_ATTR_SAE_PWE]);
        else
                settings->sae_pwe = NL80211_SAE_PWE_UNSPECIFIED;

        return 0;
}

/*
 * Look up the BSS named by the MAC and frequency attributes in @attrs.
 *
 * The lookup is restricted by a use_for mask: NL80211_BSS_USE_FOR_MLD_LINK is
 * requested when @assoc_link_id is non-negative, and
 * NL80211_BSS_USE_FOR_NORMAL is added when @link_id equals @assoc_link_id.
 *
 * Returns the BSS from __cfg80211_get_bss(), ERR_PTR(-EINVAL) if a required
 * attribute is missing or the channel is not valid, or ERR_PTR(-ENOENT) if
 * no matching BSS is found.  Never returns NULL.
 */
static struct cfg80211_bss *nl80211_assoc_bss(struct cfg80211_registered_device *rdev,
                                              const u8 *ssid, int ssid_len,
                                              struct nlattr **attrs,
                                              int assoc_link_id, int link_id)
{
        struct ieee80211_channel *channel;
        struct cfg80211_bss *found;
        const u8 *addr;
        u32 freq_khz;
        u32 use_for;

        if (!attrs[NL80211_ATTR_MAC])
                return ERR_PTR(-EINVAL);
        if (!attrs[NL80211_ATTR_WIPHY_FREQ])
                return ERR_PTR(-EINVAL);

        addr = nla_data(attrs[NL80211_ATTR_MAC]);

        /* frequency in kHz: the MHz attribute plus the optional offset */
        freq_khz = MHZ_TO_KHZ(nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ]));
        if (attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET])
                freq_khz += nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]);

        channel = nl80211_get_valid_chan(&rdev->wiphy, freq_khz);
        if (!channel)
                return ERR_PTR(-EINVAL);

        use_for = assoc_link_id >= 0 ? NL80211_BSS_USE_FOR_MLD_LINK : 0;
        if (link_id == assoc_link_id)
                use_for |= NL80211_BSS_USE_FOR_NORMAL;

        found = __cfg80211_get_bss(&rdev->wiphy, channel, addr,
                                   ssid, ssid_len,
                                   IEEE80211_BSS_TYPE_ESS,
                                   IEEE80211_PRIVACY_ANY,
                                   use_for);
        if (found)
                return found;

        return ERR_PTR(-ENOENT);
}

/*
 * Netlink handler that builds a struct cfg80211_assoc_request from the
 * attributes in @info and hands it to cfg80211_mlme_assoc().
 *
 * Two request shapes are accepted:
 *  - with NL80211_ATTR_MLO_LINKS: a link ID and NL80211_ATTR_MLD_ADDR are
 *    required, NL80211_ATTR_MAC / NL80211_ATTR_WIPHY_FREQ must be absent at
 *    the top level, and every nested link is resolved to a BSS through
 *    nl80211_assoc_bss();
 *  - without it: no link ID may be present and a single BSS is resolved
 *    from the top-level attributes.
 *
 * All BSS pointers obtained here are handed to cfg80211_put_bss() before
 * returning once the "free" label is reached, whether the association call
 * succeeded or not.
 *
 * Returns 0 on success or a negative errno.
 */
static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct cfg80211_assoc_request req = {};
        struct nlattr **attrs = NULL;
        const u8 *ap_addr, *ssid;
        unsigned int link_id;
        int err, ssid_len;

        /* a connection owned by a different netlink port may not be changed */
        if (dev->ieee80211_ptr->conn_owner_nlportid &&
            dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
                return -EPERM;

        if (!info->attrs[NL80211_ATTR_SSID])
                return -EINVAL;

        if (!rdev->ops->assoc)
                return -EOPNOTSUPP;

        /* only station and P2P client interfaces can associate */
        if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
                return -EOPNOTSUPP;

        ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
        ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);

        if (info->attrs[NL80211_ATTR_IE]) {
                req.ie = nla_data(info->attrs[NL80211_ATTR_IE]);
                req.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);

                /* a non-inheritance element in the top-level IEs is rejected */
                if (cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE,
                                           req.ie, req.ie_len)) {
                        NL_SET_ERR_MSG_ATTR(info->extack,
                                            info->attrs[NL80211_ATTR_IE],
                                            "non-inheritance makes no sense");
                        return -EINVAL;
                }
        }

        /* only NL80211_MFP_REQUIRED and NL80211_MFP_NO are accepted here */
        if (info->attrs[NL80211_ATTR_USE_MFP]) {
                enum nl80211_mfp mfp =
                        nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]);
                if (mfp == NL80211_MFP_REQUIRED)
                        req.use_mfp = true;
                else if (mfp != NL80211_MFP_NO)
                        return -EINVAL;
        }

        if (info->attrs[NL80211_ATTR_PREV_BSSID])
                req.prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]);

        if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_HT]))
                req.flags |= ASSOC_REQ_DISABLE_HT;

        if (info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK])
                memcpy(&req.ht_capa_mask,
                       nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]),
                       sizeof(req.ht_capa_mask));

        /* HT capabilities are only accepted together with their mask */
        if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) {
                if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK])
                        return -EINVAL;
                memcpy(&req.ht_capa,
                       nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]),
                       sizeof(req.ht_capa));
        }

        if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_VHT]))
                req.flags |= ASSOC_REQ_DISABLE_VHT;

        if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_HE]))
                req.flags |= ASSOC_REQ_DISABLE_HE;

        if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_EHT]))
                req.flags |= ASSOC_REQ_DISABLE_EHT;

        if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK])
                memcpy(&req.vht_capa_mask,
                       nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]),
                       sizeof(req.vht_capa_mask));

        /* VHT capabilities are only accepted together with their mask */
        if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) {
                if (!info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK])
                        return -EINVAL;
                memcpy(&req.vht_capa,
                       nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]),
                       sizeof(req.vht_capa));
        }

        /*
         * RRM is refused unless the wiphy advertises either both the
         * DS_PARAM_SET_IE_IN_PROBES and QUIET features, or the RRM
         * extended feature.
         */
        if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) {
                if (!((rdev->wiphy.features &
                        NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) &&
                       (rdev->wiphy.features & NL80211_FEATURE_QUIET)) &&
                    !wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_RRM))
                        return -EINVAL;
                req.flags |= ASSOC_REQ_USE_RRM;
        }

        /* a FILS KEK is only accepted together with the FILS nonces */
        if (info->attrs[NL80211_ATTR_FILS_KEK]) {
                req.fils_kek = nla_data(info->attrs[NL80211_ATTR_FILS_KEK]);
                req.fils_kek_len = nla_len(info->attrs[NL80211_ATTR_FILS_KEK]);
                if (!info->attrs[NL80211_ATTR_FILS_NONCES])
                        return -EINVAL;
                req.fils_nonces =
                        nla_data(info->attrs[NL80211_ATTR_FILS_NONCES]);
        }

        /* S1G capability and its mask must be given as a pair */
        if (info->attrs[NL80211_ATTR_S1G_CAPABILITY_MASK]) {
                if (!info->attrs[NL80211_ATTR_S1G_CAPABILITY])
                        return -EINVAL;
                memcpy(&req.s1g_capa_mask,
                       nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY_MASK]),
                       sizeof(req.s1g_capa_mask));
        }

        if (info->attrs[NL80211_ATTR_S1G_CAPABILITY]) {
                if (!info->attrs[NL80211_ATTR_S1G_CAPABILITY_MASK])
                        return -EINVAL;
                memcpy(&req.s1g_capa,
                       nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY]),
                       sizeof(req.s1g_capa));
        }

        if (nla_get_flag(info->attrs[NL80211_ATTR_ASSOC_SPP_AMSDU])) {
                if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT)) {
                        GENL_SET_ERR_MSG(info, "SPP A-MSDUs not supported");
                        return -EINVAL;
                }
                req.flags |= ASSOC_REQ_SPP_AMSDU;
        }

        /* negative when the request carries no link ID (see the checks below) */
        req.link_id = nl80211_link_id_or_invalid(info->attrs);

        if (info->attrs[NL80211_ATTR_MLO_LINKS]) {
                unsigned int attrsize = NUM_NL80211_ATTR * sizeof(*attrs);
                struct nlattr *link;
                int rem = 0;

                if (req.link_id < 0)
                        return -EINVAL;

                if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_MLO))
                        return -EINVAL;

                /*
                 * In the multi-link form the AP is identified by the MLD
                 * address; a top-level MAC or frequency is not allowed.
                 */
                if (info->attrs[NL80211_ATTR_MAC] ||
                    info->attrs[NL80211_ATTR_WIPHY_FREQ] ||
                    !info->attrs[NL80211_ATTR_MLD_ADDR])
                        return -EINVAL;

                req.ap_mld_addr = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]);
                ap_addr = req.ap_mld_addr;

                /* scratch attribute table, reused for every nested link */
                attrs = kzalloc(attrsize, GFP_KERNEL);
                if (!attrs)
                        return -ENOMEM;

                nla_for_each_nested(link,
                                    info->attrs[NL80211_ATTR_MLO_LINKS],
                                    rem) {
                        memset(attrs, 0, attrsize);

                        /*
                         * The return value is not checked; a link for which
                         * no link ID ends up in the table is rejected below.
                         */
                        nla_parse_nested(attrs, NL80211_ATTR_MAX,
                                         link, NULL, NULL);

                        if (!attrs[NL80211_ATTR_MLO_LINK_ID]) {
                                err = -EINVAL;
                                NL_SET_BAD_ATTR(info->extack, link);
                                goto free;
                        }

                        /*
                         * NOTE(review): link_id indexes req.links[] without a
                         * range check in this function; presumably the
                         * netlink policy bounds NL80211_ATTR_MLO_LINK_ID
                         * before the handler runs - confirm.
                         */
                        link_id = nla_get_u8(attrs[NL80211_ATTR_MLO_LINK_ID]);
                        /* cannot use the same link ID again */
                        if (req.links[link_id].bss) {
                                err = -EINVAL;
                                NL_SET_BAD_ATTR(info->extack, link);
                                goto free;
                        }
                        req.links[link_id].bss =
                                nl80211_assoc_bss(rdev, ssid, ssid_len, attrs,
                                                  req.link_id, link_id);
                        if (IS_ERR(req.links[link_id].bss)) {
                                err = PTR_ERR(req.links[link_id].bss);
                                /* keep the error pointer out of the cleanup loop */
                                req.links[link_id].bss = NULL;
                                NL_SET_ERR_MSG_ATTR(info->extack,
                                                    link, "Error fetching BSS for link");
                                goto free;
                        }

                        if (attrs[NL80211_ATTR_IE]) {
                                req.links[link_id].elems =
                                        nla_data(attrs[NL80211_ATTR_IE]);
                                req.links[link_id].elems_len =
                                        nla_len(attrs[NL80211_ATTR_IE]);

                                if (cfg80211_find_elem(WLAN_EID_FRAGMENT,
                                                       req.links[link_id].elems,
                                                       req.links[link_id].elems_len)) {
                                        NL_SET_ERR_MSG_ATTR(info->extack,
                                                            attrs[NL80211_ATTR_IE],
                                                            "cannot deal with fragmentation");
                                        err = -EINVAL;
                                        goto free;
                                }

                                if (cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE,
                                                           req.links[link_id].elems,
                                                           req.links[link_id].elems_len)) {
                                        NL_SET_ERR_MSG_ATTR(info->extack,
                                                            attrs[NL80211_ATTR_IE],
                                                            "cannot deal with non-inheritance");
                                        err = -EINVAL;
                                        goto free;
                                }
                        }

                        req.links[link_id].disabled =
                                nla_get_flag(attrs[NL80211_ATTR_MLO_LINK_DISABLED]);
                }

                /* the link used for the association must be among the links */
                if (!req.links[req.link_id].bss) {
                        err = -EINVAL;
                        goto free;
                }

                if (req.links[req.link_id].elems_len) {
                        GENL_SET_ERR_MSG(info,
                                         "cannot have per-link elems on assoc link");
                        err = -EINVAL;
                        goto free;
                }

                if (req.links[req.link_id].disabled) {
                        GENL_SET_ERR_MSG(info,
                                         "cannot have assoc link disabled");
                        err = -EINVAL;
                        goto free;
                }

                /* done with the scratch table; NULL makes the later kfree() harmless */
                kfree(attrs);
                attrs = NULL;
        } else {
                /* a link ID without a link list is invalid */
                if (req.link_id >= 0)
                        return -EINVAL;

                req.bss = nl80211_assoc_bss(rdev, ssid, ssid_len, info->attrs,
                                            -1, -1);
                /* nothing has been acquired yet, so a plain return is enough */
                if (IS_ERR(req.bss))
                        return PTR_ERR(req.bss);
                ap_addr = req.bss->bssid;
        }

        err = nl80211_crypto_settings(rdev, info, &req.crypto, 1);
        if (!err) {
                struct nlattr *link;
                int rem = 0;

                err = cfg80211_mlme_assoc(rdev, dev, &req,
                                          info->extack);

                /* on success, remember the owning socket and the AP address */
                if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
                        dev->ieee80211_ptr->conn_owner_nlportid =
                                info->snd_portid;
                        memcpy(dev->ieee80211_ptr->disconnect_bssid,
                               ap_addr, ETH_ALEN);
                }

                /* Report error from first problematic link */
                if (info->attrs[NL80211_ATTR_MLO_LINKS]) {
                        nla_for_each_nested(link,
                                            info->attrs[NL80211_ATTR_MLO_LINKS],
                                            rem) {
                                struct nlattr *link_id_attr =
                                        nla_find_nested(link, NL80211_ATTR_MLO_LINK_ID);

                                if (!link_id_attr)
                                        continue;

                                link_id = nla_get_u8(link_id_attr);

                                /* the assoc link is not examined for a per-link error */
                                if (link_id == req.link_id)
                                        continue;

                                /* skip links without an error; positive values are unexpected */
                                if (!req.links[link_id].error ||
                                    WARN_ON(req.links[link_id].error > 0))
                                        continue;

                                /* a per-link error is expected to come with an overall failure */
                                WARN_ON(err >= 0);

                                NL_SET_BAD_ATTR(info->extack, link);
                                err = req.links[link_id].error;
                                break;
                        }
                }
        }

free:
        /*
         * Put every BSS taken above.  Entries that were never filled are
         * NULL because req is zero-initialised, so this loop relies on
         * cfg80211_put_bss() accepting a NULL bss.
         */
        for (link_id = 0; link_id < ARRAY_SIZE(req.links); link_id++)
                cfg80211_put_bss(&rdev->wiphy, req.links[link_id].bss);
        cfg80211_put_bss(&rdev->wiphy, req.bss);
        kfree(attrs);

        return err;
}

static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        const u8 *ie = NULL, *bssid;
        int ie_len = 0;
        u16 reason_code;
        bool local_state_change;

        if (dev->ieee80211_ptr->conn_owner_nlportid &&
            dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
                return -EPERM;

        if (!info->attrs[NL80211_ATTR_MAC])
                return -EINVAL;

        if (!info->attrs[NL80211_ATTR_REASON_CODE])
                return -EINVAL;

        if (!rdev->ops->deauth)
                return -EOPNOTSUPP;

        if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
                return -EOPNOTSUPP;

        bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);

        reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]);
        if (reason_code == 0) {
                /* Reason Code 0 is reserved */
                return -EINVAL;
        }

        if (info->attrs[NL80211_ATTR_IE]) {
                ie = nla_data(info->attrs[NL80211_ATTR_IE]);
                ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
        }

        local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];

        return cfg80211_mlme_deauth(rdev, dev, bssid, ie, ie_len, reason_code,
                                    local_state_change);
}

static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        const u8 *ie = NULL, *bssid;
        int ie_len = 0;
        u16 reason_code;
        bool local_state_change;

        if (dev->ieee80211_ptr->conn_owner_nlportid &&
            dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
                return -EPERM;

        if (!info->attrs[NL80211_ATTR_MAC])
                return -EINVAL;

        if (!info->attrs[NL80211_ATTR_REASON_CODE])
                return -EINVAL;

        if (!rdev->ops->disassoc)
                return -EOPNOTSUPP;

        if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
                return -EOPNOTSUPP;

        bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);

        reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]);
        if (reason_code == 0) {
                /* Reason Code 0 is reserved */
                return -EINVAL;
        }

        if (info->attrs[NL80211_ATTR_IE]) {
                ie = nla_data(info->attrs[NL80211_ATTR_IE]);
                ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
        }

        local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];

        return cfg80211_mlme_disassoc(rdev, dev, bssid, ie, ie_len, reason_code,
                                      local_state_change);
}

static bool
nl80211_parse_mcast_rate(struct cfg80211_registered_device *rdev,
                         int mcast_rate[NUM_NL80211_BANDS],
                         int rateval)
{
        struct wiphy *wiphy = &rdev->wiphy;
        bool found = false;
        int band, i;

        for (band = 0; band < NUM_NL80211_BANDS; band++) {
                struct ieee80211_supported_band *sband;

                sband = wiphy->bands[band];
                if (!sband)
                        continue;

                for (i = 0; i < sband->n_bitrates; i++) {
                        if (sband->bitrates[i].bitrate == rateval) {
                                mcast_rate[band] = i + 1;
                                found = true;
                                break;
                        }
                }
        }

        return found;
}

static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct cfg80211_ibss_params ibss;
        struct wiphy *wiphy;
        struct cfg80211_cached_keys *connkeys = NULL;
        int err;

        memset(&ibss, 0, sizeof(ibss));

        if (!info->attrs[NL80211_ATTR_SSID] ||
            !nla_len(info->attrs[NL80211_ATTR_SSID]))
                return -EINVAL;

        ibss.beacon_interval = 100;

        if (info->attrs[NL80211_ATTR_BEACON_INTERVAL])
                ibss.beacon_interval =
                        nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]);

        err = cfg80211_validate_beacon_int(rdev, NL80211_IFTYPE_ADHOC,
                                           ibss.beacon_interval);
        if (err)
                return err;

        if (!rdev->ops->join_ibss)
                return -EOPNOTSUPP;

        if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC)
                return -EOPNOTSUPP;

        wiphy = &rdev->wiphy;

        if (info->attrs[NL80211_ATTR_MAC]) {
                ibss.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);

                if (!is_valid_ether_addr(ibss.bssid))
                        return -EINVAL;
        }
        ibss.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
        ibss.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);

        if (info->attrs[NL80211_ATTR_IE]) {
                ibss.ie = nla_data(info->attrs[NL80211_ATTR_IE]);
                ibss.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
        }

        err = nl80211_parse_chandef(rdev, info, &ibss.chandef);
        if (err)
                return err;

        if (!cfg80211_reg_can_beacon(&rdev->wiphy, &ibss.chandef,
                                     NL80211_IFTYPE_ADHOC))
                return -EINVAL;

        switch (ibss.chandef.width) {
        case NL80211_CHAN_WIDTH_5:
        case NL80211_CHAN_WIDTH_10:
        case NL80211_CHAN_WIDTH_20_NOHT:
                break;
        case NL80211_CHAN_WIDTH_20:
        case NL80211_CHAN_WIDTH_40:
                if (!(rdev->wiphy.features & NL80211_FEATURE_HT_IBSS))
                        return -EINVAL;
                break;
        case NL80211_CHAN_WIDTH_80:
        case NL80211_CHAN_WIDTH_80P80:
        case NL80211_CHAN_WIDTH_160:
                if (!(rdev->wiphy.features & NL80211_FEATURE_HT_IBSS))
                        return -EINVAL;
                if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_VHT_IBSS))
                        return -EINVAL;
                break;
        case NL80211_CHAN_WIDTH_320:
                return -EINVAL;
        default:
                return -EINVAL;
        }

        ibss.channel_fixed = !!info->attrs[NL80211_ATTR_FREQ_FIXED];
        ibss.privacy = !!info->attrs[NL80211_ATTR_PRIVACY];

        if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) {
                u8 *rates =
                        nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
                int n_rates =
                        nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
                struct ieee80211_supported_band *sband =
                        wiphy->bands[ibss.chandef.chan->band];

                err = ieee80211_get_ratemask(sband, rates, n_rates,
                                             &ibss.basic_rates);
                if (err)
                        return err;
        }

        if (info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK])
                memcpy(&ibss.ht_capa_mask,
                       nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]),
                       sizeof(ibss.ht_capa_mask));

        if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) {
                if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK])
                        return -EINVAL;
                memcpy(&ibss.ht_capa,
                       nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]),
                       sizeof(ibss.ht_capa));
        }

        if (info->attrs[NL80211_ATTR_MCAST_RATE] &&
            !nl80211_parse_mcast_rate(rdev, ibss.mcast_rate,
                        nla_get_u32(info->attrs[NL80211_ATTR_MCAST_RATE])))
                return -EINVAL;

        if (ibss.privacy && info->attrs[NL80211_ATTR_KEYS]) {
                bool no_ht = false;

                connkeys = nl80211_parse_connkeys(rdev, info, &no_ht);
                if (IS_ERR(connkeys))
                        return PTR_ERR(connkeys);

                if ((ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT) &&
                    no_ht) {
                        kfree_sensitive(connkeys);
                        return -EINVAL;
                }
        }

        ibss.control_port =
                nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT]);

        if (info->attrs[NL80211_ATTR_CONTROL_PORT_OVER_NL80211]) {
                int r = validate_pae_over_nl80211(rdev, info);

                if (r < 0) {
                        kfree_sensitive(connkeys);
                        return r;
                }

                ibss.control_port_over_nl80211 = true;
        }

        ibss.userspace_handles_dfs =
                nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]);

        err = __cfg80211_join_ibss(rdev, dev, &ibss, connkeys);
        if (err)
                kfree_sensitive(connkeys);
        else if (info->attrs[NL80211_ATTR_SOCKET_OWNER])
                dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid;

        return err;
}

/*
 * Netlink handler for leaving an IBSS.
 *
 * Returns -EOPNOTSUPP when the driver has no leave_ibss op or the
 * interface is not of type NL80211_IFTYPE_ADHOC; otherwise the result of
 * cfg80211_leave_ibss().
 */
static int nl80211_leave_ibss(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];

        if (!rdev->ops->leave_ibss ||
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC)
                return -EOPNOTSUPP;

        return cfg80211_leave_ibss(rdev, dev, false);
}

/*
 * Netlink handler that sets the multicast rate.
 *
 * Accepted only on ADHOC, MESH_POINT and OCB interfaces and only when the
 * driver implements set_mcast_rate (-EOPNOTSUPP otherwise).  The rate from
 * NL80211_ATTR_MCAST_RATE is converted into per-band indices by
 * nl80211_parse_mcast_rate(); a missing attribute or a rate found in no
 * band yields -EINVAL.  Otherwise returns what rdev_set_mcast_rate()
 * returns.
 */
static int nl80211_set_mcast_rate(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct nlattr *rate_attr = info->attrs[NL80211_ATTR_MCAST_RATE];
        int per_band[NUM_NL80211_BANDS] = { 0 };

        switch (dev->ieee80211_ptr->iftype) {
        case NL80211_IFTYPE_ADHOC:
        case NL80211_IFTYPE_MESH_POINT:
        case NL80211_IFTYPE_OCB:
                break;
        default:
                return -EOPNOTSUPP;
        }

        if (!rdev->ops->set_mcast_rate)
                return -EOPNOTSUPP;

        if (!rate_attr)
                return -EINVAL;

        if (!nl80211_parse_mcast_rate(rdev, per_band, nla_get_u32(rate_attr)))
                return -EINVAL;

        return rdev_set_mcast_rate(rdev, dev, per_band);
}

/*
 * Allocate a netlink message for a vendor/testmode reply or event and
 * pre-fill its common part.
 *
 * The message gets an nl80211 header for @cmd, NL80211_ATTR_WIPHY, the
 * vendor ID and subcommand when @info is given, NL80211_ATTR_WDEV (plus
 * NL80211_ATTR_IFINDEX if the wdev has a netdev) when @wdev is given, and
 * finally an open nest of type @attr for the caller's payload.
 *
 * The rdev, header and nest pointers are stashed in skb->cb slots 0..2.
 *
 * Returns the skb, or NULL if the allocation or any put failed (the skb is
 * freed in that case).
 */
static struct sk_buff *
__cfg80211_alloc_vendor_skb(struct cfg80211_registered_device *rdev,
                            struct wireless_dev *wdev, int approxlen,
                            u32 portid, u32 seq, enum nl80211_commands cmd,
                            enum nl80211_attrs attr,
                            const struct nl80211_vendor_cmd_info *info,
                            gfp_t gfp)
{
        struct sk_buff *msg;
        struct nlattr *nest;
        void **stash;
        void *hdr;

        msg = nlmsg_new(approxlen + 100, gfp);
        if (!msg)
                return NULL;

        hdr = nl80211hdr_put(msg, portid, seq, 0, cmd);
        if (!hdr)
                goto drop;

        if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto drop;

        if (info &&
            (nla_put_u32(msg, NL80211_ATTR_VENDOR_ID, info->vendor_id) ||
             nla_put_u32(msg, NL80211_ATTR_VENDOR_SUBCMD, info->subcmd)))
                goto drop;

        if (wdev) {
                if (nla_put_u64_64bit(msg, NL80211_ATTR_WDEV,
                                      wdev_id(wdev), NL80211_ATTR_PAD))
                        goto drop;

                if (wdev->netdev &&
                    nla_put_u32(msg, NL80211_ATTR_IFINDEX,
                                wdev->netdev->ifindex))
                        goto drop;
        }

        nest = nla_nest_start_noflag(msg, attr);
        if (!nest)
                goto drop;

        /* remember what is needed later to close and send the message */
        stash = (void **)msg->cb;
        stash[0] = rdev;
        stash[1] = hdr;
        stash[2] = nest;

        return msg;

 drop:
        kfree_skb(msg);
        return NULL;
}

/*
 * Allocate an event skb for a testmode or vendor event.
 *
 * @cmd must be NL80211_CMD_TESTMODE (with @vendor_event_idx == -1) or
 * NL80211_CMD_VENDOR (with @vendor_event_idx inside
 * [0, wiphy->n_vendor_events)); anything else triggers a WARN_ON and
 * returns NULL.  For vendor events the matching wiphy->vendor_events[]
 * entry supplies the vendor ID and subcommand.
 *
 * Returns the skb from __cfg80211_alloc_vendor_skb() (sequence number 0),
 * or NULL.
 */
struct sk_buff *__cfg80211_alloc_event_skb(struct wiphy *wiphy,
                                           struct wireless_dev *wdev,
                                           enum nl80211_commands cmd,
                                           enum nl80211_attrs attr,
                                           unsigned int portid,
                                           int vendor_event_idx,
                                           int approxlen, gfp_t gfp)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        const struct nl80211_vendor_cmd_info *vendor_info = NULL;

        if (cmd == NL80211_CMD_TESTMODE) {
                /* testmode events carry no vendor info */
                if (WARN_ON(vendor_event_idx != -1))
                        return NULL;
        } else if (cmd == NL80211_CMD_VENDOR) {
                if (WARN_ON(vendor_event_idx < 0 ||
                            vendor_event_idx >= wiphy->n_vendor_events))
                        return NULL;
                vendor_info = &wiphy->vendor_events[vendor_event_idx];
        } else {
                WARN_ON(1);
                return NULL;
        }

        return __cfg80211_alloc_vendor_skb(rdev, wdev, approxlen, portid, 0,
                                           cmd, attr, vendor_info, gfp);
}
EXPORT_SYMBOL(__cfg80211_alloc_event_skb);

/*
 * Finish and send an event skb.
 *
 * Reads the rdev, header and nest pointers from skb->cb slots 0..2, wipes
 * skb->cb, closes the nest and the generic netlink message, and then
 * either unicasts to the port ID found in the netlink header (when it is
 * non-zero) or multicasts to the vendor group (nest type
 * NL80211_ATTR_VENDOR_DATA) or the testmode group (any other nest type).
 */
void __cfg80211_send_event_skb(struct sk_buff *skb, gfp_t gfp)
{
        void **stash = (void **)skb->cb;
        struct cfg80211_registered_device *rdev = stash[0];
        void *hdr = stash[1];
        struct nlattr *nest = stash[2];
        struct nlmsghdr *nlh = nlmsg_hdr(skb);

        /* clear CB data for netlink core to own from now on */
        memset(skb->cb, 0, sizeof(skb->cb));

        nla_nest_end(skb, nest);
        genlmsg_end(skb, hdr);

        if (nlh->nlmsg_pid) {
                genlmsg_unicast(wiphy_net(&rdev->wiphy), skb,
                                nlh->nlmsg_pid);
                return;
        }

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
                                skb, 0,
                                nest->nla_type == NL80211_ATTR_VENDOR_DATA ?
                                        NL80211_MCGRP_VENDOR :
                                        NL80211_MCGRP_TESTMODE,
                                gfp);
}
EXPORT_SYMBOL(__cfg80211_send_event_skb);

#ifdef CONFIG_NL80211_TESTMODE
/*
 * Netlink handler that passes NL80211_ATTR_TESTDATA to the driver's
 * testmode_cmd op.  Asserts (lockdep) that the wiphy mutex is held.
 *
 * A wdev is looked up from the request attributes; a lookup failing with
 * -EINVAL is tolerated and the driver is then called with a NULL wdev,
 * while any other lookup error is returned.  A wdev belonging to another
 * wiphy is rejected with -EINVAL.
 *
 * rdev->cur_cmd_info points at @info for the duration of the driver call.
 *
 * Returns -EOPNOTSUPP without a testmode_cmd op, -EINVAL without test
 * data, otherwise the result of rdev_testmode_cmd().
 */
static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct nlattr *testdata = info->attrs[NL80211_ATTR_TESTDATA];
        struct wireless_dev *wdev;
        int ret;

        lockdep_assert_held(&rdev->wiphy.mtx);

        wdev = __cfg80211_wdev_from_attrs(rdev, genl_info_net(info),
                                          info->attrs);

        if (!rdev->ops->testmode_cmd)
                return -EOPNOTSUPP;

        if (!IS_ERR(wdev)) {
                if (wdev->wiphy != &rdev->wiphy)
                        return -EINVAL;
        } else if (PTR_ERR(wdev) == -EINVAL) {
                wdev = NULL;
        } else {
                return PTR_ERR(wdev);
        }

        if (!testdata)
                return -EINVAL;

        rdev->cur_cmd_info = info;
        ret = rdev_testmode_cmd(rdev, wdev, nla_data(testdata),
                                nla_len(testdata));
        rdev->cur_cmd_info = NULL;

        return ret;
}

/*
 * Netlink dump callback for testmode data.
 *
 * State kept in @cb between invocations:
 *   cb->args[0]  wiphy index + 1 (0 means "first call, nothing parsed yet")
 *   cb->args[1]  pointer to the NL80211_ATTR_TESTDATA attribute of the
 *                request, stored as a long, or 0 if there was none
 *
 * On the first call the request in cb->nlh is parsed to find the wiphy; on
 * later calls the wiphy is looked up again from the saved index.  The
 * driver's testmode_dump op is then called repeatedly, each call filling
 * one NL80211_CMD_TESTMODE message in @skb, until the skb is full or the
 * driver reports -ENOBUFS / -ENOENT.
 *
 * Runs under rtnl_lock().  Returns skb->len when the loop ended normally,
 * or a negative errno.
 */
static int nl80211_testmode_dump(struct sk_buff *skb,
                                 struct netlink_callback *cb)
{
        struct cfg80211_registered_device *rdev;
        struct nlattr **attrbuf = NULL;
        int err;
        long phy_idx;
        void *data = NULL;
        int data_len = 0;

        rtnl_lock();

        if (cb->args[0]) {
                /*
                 * 0 is a valid index, but not valid for args[0],
                 * so we need to offset by 1.
                 */
                phy_idx = cb->args[0] - 1;

                rdev = cfg80211_rdev_by_wiphy_idx(phy_idx);
                if (!rdev) {
                        err = -ENOENT;
                        goto out_err;
                }
        } else {
                /* first invocation: parse the original request */
                attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf),
                                  GFP_KERNEL);
                if (!attrbuf) {
                        err = -ENOMEM;
                        goto out_err;
                }

                err = nlmsg_parse_deprecated(cb->nlh,
                                             GENL_HDRLEN + nl80211_fam.hdrsize,
                                             attrbuf, nl80211_fam.maxattr,
                                             nl80211_policy, NULL);
                if (err)
                        goto out_err;

                rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk), attrbuf);
                if (IS_ERR(rdev)) {
                        err = PTR_ERR(rdev);
                        goto out_err;
                }
                phy_idx = rdev->wiphy_idx;

                /*
                 * Only the attribute pointer is saved, not a copy; attrbuf
                 * itself is freed at out_err.  NOTE(review): this presumes
                 * the attribute memory (parsed out of cb->nlh) stays valid
                 * for the whole dump - confirm.
                 */
                if (attrbuf[NL80211_ATTR_TESTDATA])
                        cb->args[1] = (long)attrbuf[NL80211_ATTR_TESTDATA];
        }

        if (cb->args[1]) {
                data = nla_data((void *)cb->args[1]);
                data_len = nla_len((void *)cb->args[1]);
        }

        if (!rdev->ops->testmode_dump) {
                err = -EOPNOTSUPP;
                goto out_err;
        }

        /* emit one message per driver call until something stops the loop */
        while (1) {
                void *hdr = nl80211hdr_put(skb, NETLINK_CB(cb->skb).portid,
                                           cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                           NL80211_CMD_TESTMODE);
                struct nlattr *tmdata;

                /* no header could be added to skb: stop and return what we have */
                if (!hdr)
                        break;

                if (nla_put_u32(skb, NL80211_ATTR_WIPHY, phy_idx)) {
                        genlmsg_cancel(skb, hdr);
                        break;
                }

                tmdata = nla_nest_start_noflag(skb, NL80211_ATTR_TESTDATA);
                if (!tmdata) {
                        genlmsg_cancel(skb, hdr);
                        break;
                }
                err = rdev_testmode_dump(rdev, skb, cb, data, data_len);
                nla_nest_end(skb, tmdata);

                /*
                 * -ENOBUFS / -ENOENT from the driver end the loop without
                 * failing the dump: the partial message is cancelled and
                 * the messages completed so far are returned.  Any other
                 * error aborts the dump.
                 */
                if (err == -ENOBUFS || err == -ENOENT) {
                        genlmsg_cancel(skb, hdr);
                        break;
                } else if (err) {
                        genlmsg_cancel(skb, hdr);
                        goto out_err;
                }

                genlmsg_end(skb, hdr);
        }

        err = skb->len;
        /* see above */
        cb->args[0] = phy_idx + 1;
 out_err:
        /* attrbuf is NULL on continuation calls; kfree(NULL) is a no-op */
        kfree(attrbuf);
        rtnl_unlock();
        return err;
}
#endif

static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct cfg80211_connect_params connect;
        struct wiphy *wiphy;
        struct cfg80211_cached_keys *connkeys = NULL;
        u32 freq = 0;
        int err;

        memset(&connect, 0, sizeof(connect));

        if (!info->attrs[NL80211_ATTR_SSID] ||
            !nla_len(info->attrs[NL80211_ATTR_SSID]))
                return -EINVAL;

        if (info->attrs[NL80211_ATTR_AUTH_TYPE]) {
                connect.auth_type =
                        nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]);
                if (!nl80211_valid_auth_type(rdev, connect.auth_type,
                                             NL80211_CMD_CONNECT))
                        return -EINVAL;
        } else
                connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC;

        connect.privacy = info->attrs[NL80211_ATTR_PRIVACY];

        if (info->attrs[NL80211_ATTR_WANT_1X_4WAY_HS] &&
            !wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X))
                return -EINVAL;
        connect.want_1x = info->attrs[NL80211_ATTR_WANT_1X_4WAY_HS];

        err = nl80211_crypto_settings(rdev, info, &connect.crypto,
                                      NL80211_MAX_NR_CIPHER_SUITES);
        if (err)
                return err;

        if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
                return -EOPNOTSUPP;

        wiphy = &rdev->wiphy;

        connect.bg_scan_period = -1;
        if (info->attrs[NL80211_ATTR_BG_SCAN_PERIOD] &&
                (wiphy->flags & WIPHY_FLAG_SUPPORTS_FW_ROAM)) {
                connect.bg_scan_period =
                        nla_get_u16(info->attrs[NL80211_ATTR_BG_SCAN_PERIOD]);
        }

        if (info->attrs[NL80211_ATTR_MAC])
                connect.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
        else if (info->attrs[NL80211_ATTR_MAC_HINT])
                connect.bssid_hint =
                        nla_data(info->attrs[NL80211_ATTR_MAC_HINT]);
        connect.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
        connect.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);

        if (info->attrs[NL80211_ATTR_IE]) {
                connect.ie = nla_data(info->attrs[NL80211_ATTR_IE]);
                connect.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
        }

        if (info->attrs[NL80211_ATTR_USE_MFP]) {
                connect.mfp = nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]);
                if (connect.mfp == NL80211_MFP_OPTIONAL &&
                    !wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_MFP_OPTIONAL))
                        return -EOPNOTSUPP;
        } else {
                connect.mfp = NL80211_MFP_NO;
        }

        if (info->attrs[NL80211_ATTR_PREV_BSSID])
                connect.prev_bssid =
                        nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]);

        if (info->attrs[NL80211_ATTR_WIPHY_FREQ])
                freq = MHZ_TO_KHZ(nla_get_u32(
                                        info->attrs[NL80211_ATTR_WIPHY_FREQ]));
        if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET])
                freq +=
                    nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]);

        if (freq) {
                connect.channel = nl80211_get_valid_chan(wiphy, freq);
                if (!connect.channel)
                        return -EINVAL;
        } else if (info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]) {
                freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]);
                freq = MHZ_TO_KHZ(freq);
                connect.channel_hint = nl80211_get_valid_chan(wiphy, freq);
                if (!connect.channel_hint)
                        return -EINVAL;
        }

        if (info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]) {
                connect.edmg.channels =
                      nla_get_u8(info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]);

                if (info->attrs[NL80211_ATTR_WIPHY_EDMG_BW_CONFIG])
                        connect.edmg.bw_config =
                                nla_get_u8(info->attrs[NL80211_ATTR_WIPHY_EDMG_BW_CONFIG]);
        }

        if (connect.privacy && info->attrs[NL80211_ATTR_KEYS]) {
                connkeys = nl80211_parse_connkeys(rdev, info, NULL);
                if (IS_ERR(connkeys))
                        return PTR_ERR(connkeys);
        }

        if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_HT]))
                connect.flags |= ASSOC_REQ_DISABLE_HT;

        if (info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK])
                memcpy(&connect.ht_capa_mask,
                       nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]),
                       sizeof(connect.ht_capa_mask));

        if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) {
                if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]) {
                        kfree_sensitive(connkeys);
                        return -EINVAL;
                }
                memcpy(&connect.ht_capa,
                       nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]),
                       sizeof(connect.ht_capa));
        }

        if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_VHT]))
                connect.flags |= ASSOC_REQ_DISABLE_VHT;

        if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_HE]))
                connect.flags |= ASSOC_REQ_DISABLE_HE;

        if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_EHT]))
                connect.flags |= ASSOC_REQ_DISABLE_EHT;

        if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK])
                memcpy(&connect.vht_capa_mask,
                       nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]),
                       sizeof(connect.vht_capa_mask));

        if (info->attrs[NL80211_ATTR_VHT_CAPABILITY]) {
                if (!info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) {
                        kfree_sensitive(connkeys);
                        return -EINVAL;
                }
                memcpy(&connect.vht_capa,
                       nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]),
                       sizeof(connect.vht_capa));
        }

        if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) {
                if (!((rdev->wiphy.features &
                        NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) &&
                       (rdev->wiphy.features & NL80211_FEATURE_QUIET)) &&
                    !wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_RRM)) {
                        kfree_sensitive(connkeys);
                        return -EINVAL;
                }
                connect.flags |= ASSOC_REQ_USE_RRM;
        }

        connect.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
        if (connect.pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) {
                kfree_sensitive(connkeys);
                return -EOPNOTSUPP;
        }

        if (info->attrs[NL80211_ATTR_BSS_SELECT]) {
                /* bss selection makes no sense if bssid is set */
                if (connect.bssid) {
                        kfree_sensitive(connkeys);
                        return -EINVAL;
                }

                err = parse_bss_select(info->attrs[NL80211_ATTR_BSS_SELECT],
                                       wiphy, &connect.bss_select);
                if (err) {
                        kfree_sensitive(connkeys);
                        return err;
                }
        }

        if (wiphy_ext_feature_isset(&rdev->wiphy,
                                    NL80211_EXT_FEATURE_FILS_SK_OFFLOAD) &&
            info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] &&
            info->attrs[NL80211_ATTR_FILS_ERP_REALM] &&
            info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] &&
            info->attrs[NL80211_ATTR_FILS_ERP_RRK]) {
                connect.fils_erp_username =
                        nla_data(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]);
                connect.fils_erp_username_len =
                        nla_len(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]);
                connect.fils_erp_realm =
                        nla_data(info->attrs[NL80211_ATTR_FILS_ERP_REALM]);
                connect.fils_erp_realm_len =
                        nla_len(info->attrs[NL80211_ATTR_FILS_ERP_REALM]);
                connect.fils_erp_next_seq_num =
                        nla_get_u16(
                           info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM]);
                connect.fils_erp_rrk =
                        nla_data(info->attrs[NL80211_ATTR_FILS_ERP_RRK]);
                connect.fils_erp_rrk_len =
                        nla_len(info->attrs[NL80211_ATTR_FILS_ERP_RRK]);
        } else if (info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] ||
                   info->attrs[NL80211_ATTR_FILS_ERP_REALM] ||
                   info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] ||
                   info->attrs[NL80211_ATTR_FILS_ERP_RRK]) {
                kfree_sensitive(connkeys);
                return -EINVAL;
        }

        if (nla_get_flag(info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT])) {
                if (!info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
                        kfree_sensitive(connkeys);
                        GENL_SET_ERR_MSG(info,
                                         "external auth requires connection ownership");
                        return -EINVAL;
                }
                connect.flags |= CONNECT_REQ_EXTERNAL_AUTH_SUPPORT;
        }

        if (nla_get_flag(info->attrs[NL80211_ATTR_MLO_SUPPORT]))
                connect.flags |= CONNECT_REQ_MLO_SUPPORT;

        err = cfg80211_connect(rdev, dev, &connect, connkeys,
                               connect.prev_bssid);
        if (err)
                kfree_sensitive(connkeys);

        if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
                dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid;
                if (connect.bssid)
                        memcpy(dev->ieee80211_ptr->disconnect_bssid,
                               connect.bssid, ETH_ALEN);
                else
                        eth_zero_addr(dev->ieee80211_ptr->disconnect_bssid);
        }

        return err;
}

static int nl80211_update_connect_params(struct sk_buff *skb,
                                         struct genl_info *info)
{
        struct cfg80211_connect_params connect = {};
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        bool fils_sk_offload;
        u32 auth_type;
        u32 changed = 0;

        if (!rdev->ops->update_connect_params)
                return -EOPNOTSUPP;

        if (info->attrs[NL80211_ATTR_IE]) {
                connect.ie = nla_data(info->attrs[NL80211_ATTR_IE]);
                connect.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
                changed |= UPDATE_ASSOC_IES;
        }

        fils_sk_offload = wiphy_ext_feature_isset(&rdev->wiphy,
                                                  NL80211_EXT_FEATURE_FILS_SK_OFFLOAD);

        /*
         * when driver supports fils-sk offload all attributes must be
         * provided. So the else covers "fils-sk-not-all" and
         * "no-fils-sk-any".
         */
        if (fils_sk_offload &&
            info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] &&
            info->attrs[NL80211_ATTR_FILS_ERP_REALM] &&
            info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] &&
            info->attrs[NL80211_ATTR_FILS_ERP_RRK]) {
                connect.fils_erp_username =
                        nla_data(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]);
                connect.fils_erp_username_len =
                        nla_len(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]);
                connect.fils_erp_realm =
                        nla_data(info->attrs[NL80211_ATTR_FILS_ERP_REALM]);
                connect.fils_erp_realm_len =
                        nla_len(info->attrs[NL80211_ATTR_FILS_ERP_REALM]);
                connect.fils_erp_next_seq_num =
                        nla_get_u16(
                           info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM]);
                connect.fils_erp_rrk =
                        nla_data(info->attrs[NL80211_ATTR_FILS_ERP_RRK]);
                connect.fils_erp_rrk_len =
                        nla_len(info->attrs[NL80211_ATTR_FILS_ERP_RRK]);
                changed |= UPDATE_FILS_ERP_INFO;
        } else if (info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] ||
                   info->attrs[NL80211_ATTR_FILS_ERP_REALM] ||
                   info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] ||
                   info->attrs[NL80211_ATTR_FILS_ERP_RRK]) {
                return -EINVAL;
        }

        if (info->attrs[NL80211_ATTR_AUTH_TYPE]) {
                auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]);
                if (!nl80211_valid_auth_type(rdev, auth_type,
                                             NL80211_CMD_CONNECT))
                        return -EINVAL;

                if (auth_type == NL80211_AUTHTYPE_FILS_SK &&
                    fils_sk_offload && !(changed & UPDATE_FILS_ERP_INFO))
                        return -EINVAL;

                connect.auth_type = auth_type;
                changed |= UPDATE_AUTH_TYPE;
        }

        if (!wdev->connected)
                return -ENOLINK;

        return rdev_update_connect_params(rdev, dev, &connect, changed);
}

static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        u16 reason;

        if (dev->ieee80211_ptr->conn_owner_nlportid &&
            dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
                return -EPERM;

        if (!info->attrs[NL80211_ATTR_REASON_CODE])
                reason = WLAN_REASON_DEAUTH_LEAVING;
        else
                reason = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]);

        if (reason == 0)
                return -EINVAL;

        if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
                return -EOPNOTSUPP;

        return cfg80211_disconnect(rdev, dev, reason, true);
}

/*
 * Netlink handler that moves a wiphy to another network namespace.
 *
 * The target namespace is looked up from NL80211_ATTR_PID if present,
 * else from NL80211_ATTR_NETNS_FD; with neither the request fails with
 * -EINVAL.  The namespace reference obtained by the lookup is dropped
 * before returning.
 */
static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct nlattr *pid_attr = info->attrs[NL80211_ATTR_PID];
        struct nlattr *fd_attr = info->attrs[NL80211_ATTR_NETNS_FD];
        struct net *net;
        int err = 0;

        if (pid_attr)
                net = get_net_ns_by_pid(nla_get_u32(pid_attr));
        else if (fd_attr)
                net = get_net_ns_by_fd(nla_get_u32(fd_attr));
        else
                return -EINVAL;

        if (IS_ERR(net))
                return PTR_ERR(net);

        /* nothing to switch if the wiphy already lives in that namespace */
        if (!net_eq(wiphy_net(&rdev->wiphy), net))
                err = cfg80211_switch_netns(rdev, net);

        put_net(net);
        return err;
}

static int nl80211_set_pmksa(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct cfg80211_pmksa pmksa;
        bool ap_pmksa_caching_support = false;

        memset(&pmksa, 0, sizeof(struct cfg80211_pmksa));

        ap_pmksa_caching_support = wiphy_ext_feature_isset(&rdev->wiphy,
                NL80211_EXT_FEATURE_AP_PMKSA_CACHING);

        if (!info->attrs[NL80211_ATTR_PMKID])
                return -EINVAL;

        pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]);

        if (info->attrs[NL80211_ATTR_MAC]) {
                pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
        } else if (info->attrs[NL80211_ATTR_SSID] &&
                   info->attrs[NL80211_ATTR_FILS_CACHE_ID] &&
                   info->attrs[NL80211_ATTR_PMK]) {
                pmksa.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
                pmksa.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
                pmksa.cache_id = nla_data(info->attrs[NL80211_ATTR_FILS_CACHE_ID]);
        } else {
                return -EINVAL;
        }

        if (info->attrs[NL80211_ATTR_PMK]) {
                pmksa.pmk = nla_data(info->attrs[NL80211_ATTR_PMK]);
                pmksa.pmk_len = nla_len(info->attrs[NL80211_ATTR_PMK]);
        }

        if (info->attrs[NL80211_ATTR_PMK_LIFETIME])
                pmksa.pmk_lifetime =
                        nla_get_u32(info->attrs[NL80211_ATTR_PMK_LIFETIME]);

        if (info->attrs[NL80211_ATTR_PMK_REAUTH_THRESHOLD])
                pmksa.pmk_reauth_threshold =
                        nla_get_u8(info->attrs[NL80211_ATTR_PMK_REAUTH_THRESHOLD]);

        if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT &&
            !((dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP ||
               dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_GO) &&
               ap_pmksa_caching_support))
                return -EOPNOTSUPP;

        if (!rdev->ops->set_pmksa)
                return -EOPNOTSUPP;

        return rdev_set_pmksa(rdev, dev, &pmksa);
}

static int nl80211_del_pmksa(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct cfg80211_pmksa pmksa;
        bool sae_offload_support = false;
        bool owe_offload_support = false;
        bool ap_pmksa_caching_support = false;

        memset(&pmksa, 0, sizeof(struct cfg80211_pmksa));

        sae_offload_support = wiphy_ext_feature_isset(&rdev->wiphy,
                NL80211_EXT_FEATURE_SAE_OFFLOAD);
        owe_offload_support = wiphy_ext_feature_isset(&rdev->wiphy,
                NL80211_EXT_FEATURE_OWE_OFFLOAD);
        ap_pmksa_caching_support = wiphy_ext_feature_isset(&rdev->wiphy,
                NL80211_EXT_FEATURE_AP_PMKSA_CACHING);

        if (info->attrs[NL80211_ATTR_PMKID])
                pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]);

        if (info->attrs[NL80211_ATTR_MAC]) {
                pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
        } else if (info->attrs[NL80211_ATTR_SSID]) {
                /* SSID based pmksa flush suppported only for FILS,
                 * OWE/SAE OFFLOAD cases
                 */
                if (info->attrs[NL80211_ATTR_FILS_CACHE_ID] &&
                    info->attrs[NL80211_ATTR_PMK]) {
                        pmksa.cache_id = nla_data(info->attrs[NL80211_ATTR_FILS_CACHE_ID]);
                } else if (!sae_offload_support && !owe_offload_support) {
                        return -EINVAL;
                }
                pmksa.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
                pmksa.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
        } else {
                return -EINVAL;
        }

        if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT &&
            !((dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP ||
               dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_GO) &&
               ap_pmksa_caching_support))
                return -EOPNOTSUPP;

        if (!rdev->ops->del_pmksa)
                return -EOPNOTSUPP;

        return rdev_del_pmksa(rdev, dev, &pmksa);
}

/*
 * Netlink handler that flushes the PMKSA entries of an interface.
 * Accepted only on station and P2P client interfaces whose driver
 * implements the flush_pmksa op; -EOPNOTSUPP otherwise.
 */
static int nl80211_flush_pmksa(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];

        switch (dev->ieee80211_ptr->iftype) {
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_P2P_CLIENT:
                break;
        default:
                return -EOPNOTSUPP;
        }

        if (!rdev->ops->flush_pmksa)
                return -EOPNOTSUPP;

        return rdev_flush_pmksa(rdev, dev);
}

/*
 * Netlink handler that asks the driver to send a TDLS management frame.
 *
 * Requires TDLS support on the wiphy and the tdls_mgmt driver op.  The
 * action code, status code, dialog token, IEs and peer address are
 * mandatory; the peer capability defaults to 0 and the initiator flag to
 * false when their attributes are absent.
 */
static int nl80211_tdls_mgmt(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct nlattr **attrs = info->attrs;
        struct nlattr *ie = attrs[NL80211_ATTR_IE];
        u32 peer_capability = 0;
        u8 action_code, dialog_token;
        u16 status_code;
        bool initiator;
        int link_id;
        u8 *peer;

        if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS))
                return -EOPNOTSUPP;
        if (!rdev->ops->tdls_mgmt)
                return -EOPNOTSUPP;

        if (!attrs[NL80211_ATTR_TDLS_ACTION] ||
            !attrs[NL80211_ATTR_STATUS_CODE] ||
            !attrs[NL80211_ATTR_TDLS_DIALOG_TOKEN] ||
            !ie ||
            !attrs[NL80211_ATTR_MAC])
                return -EINVAL;

        peer = nla_data(attrs[NL80211_ATTR_MAC]);
        action_code = nla_get_u8(attrs[NL80211_ATTR_TDLS_ACTION]);
        status_code = nla_get_u16(attrs[NL80211_ATTR_STATUS_CODE]);
        dialog_token = nla_get_u8(attrs[NL80211_ATTR_TDLS_DIALOG_TOKEN]);
        initiator = nla_get_flag(attrs[NL80211_ATTR_TDLS_INITIATOR]);

        if (attrs[NL80211_ATTR_TDLS_PEER_CAPABILITY])
                peer_capability =
                        nla_get_u32(attrs[NL80211_ATTR_TDLS_PEER_CAPABILITY]);

        link_id = nl80211_link_id_or_invalid(attrs);

        return rdev_tdls_mgmt(rdev, dev, peer, link_id, action_code,
                              dialog_token, status_code, peer_capability,
                              initiator, nla_data(ie), nla_len(ie));
}

/*
 * Netlink handler that performs a TDLS operation on a peer.
 * Requires TDLS support on the wiphy, the tdls_oper driver op, and both
 * the operation and peer address attributes.
 */
static int nl80211_tdls_oper(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct nlattr *oper_attr = info->attrs[NL80211_ATTR_TDLS_OPERATION];
        struct nlattr *peer_attr = info->attrs[NL80211_ATTR_MAC];
        enum nl80211_tdls_operation operation;
        u8 *peer;

        if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_TDLS))
                return -EOPNOTSUPP;
        if (!rdev->ops->tdls_oper)
                return -EOPNOTSUPP;

        if (!oper_attr || !peer_attr)
                return -EINVAL;

        operation = nla_get_u8(oper_attr);
        peer = nla_data(peer_attr);

        return rdev_tdls_oper(rdev, dev, peer, operation);
}

/*
 * Netlink handler that asks the driver to stay on a channel for a given
 * duration.
 *
 * Requires the frequency and duration attributes, the remain_on_channel
 * driver op and WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL.  The duration must lie
 * between NL80211_MIN_REMAIN_ON_CHANNEL_TIME and the wiphy's
 * max_remain_on_channel_duration.  On success a
 * NL80211_CMD_REMAIN_ON_CHANNEL reply carrying the cookie written by the
 * driver is sent back to the requester.
 *
 * Returns the genlmsg_reply() result on success or a negative errno.
 */
static int nl80211_remain_on_channel(struct sk_buff *skb,
                                     struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        unsigned int link_id = nl80211_link_id(info->attrs);
        struct wireless_dev *wdev = info->user_ptr[1];
        struct cfg80211_chan_def chandef;
        struct sk_buff *msg;
        void *hdr;
        u64 cookie;
        u32 duration;
        int err;

        if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] ||
            !info->attrs[NL80211_ATTR_DURATION])
                return -EINVAL;

        duration = nla_get_u32(info->attrs[NL80211_ATTR_DURATION]);

        if (!rdev->ops->remain_on_channel ||
            !(rdev->wiphy.flags & WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL))
                return -EOPNOTSUPP;

        /*
         * We should be on that channel for at least a minimum amount of
         * time (10ms) but no longer than the driver supports.
         */
        if (duration < NL80211_MIN_REMAIN_ON_CHANNEL_TIME ||
            duration > rdev->wiphy.max_remain_on_channel_duration)
                return -EINVAL;

        err = nl80211_parse_chandef(rdev, info, &chandef);
        if (err)
                return err;

        /*
         * When off-channel operation is not allowed, the request is only
         * accepted if the requested chandef is compatible with the
         * operating one.
         */
        if (!cfg80211_off_channel_oper_allowed(wdev, chandef.chan)) {
                const struct cfg80211_chan_def *oper_chandef, *compat_chandef;

                oper_chandef = wdev_chandef(wdev, link_id);

                /*
                 * Cannot happen since we must beacon to get here.  One
                 * warning is sufficient; a second unconditional WARN
                 * would only duplicate the report.
                 */
                if (WARN_ON(!oper_chandef))
                        return -EBUSY;

                /* note: returns first one if identical chandefs */
                compat_chandef = cfg80211_chandef_compatible(&chandef,
                                                             oper_chandef);

                if (compat_chandef != &chandef)
                        return -EBUSY;
        }

        /* build the reply header before touching the driver */
        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg)
                return -ENOMEM;

        hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
                             NL80211_CMD_REMAIN_ON_CHANNEL);
        if (!hdr) {
                err = -ENOBUFS;
                goto free_msg;
        }

        err = rdev_remain_on_channel(rdev, wdev, chandef.chan,
                                     duration, &cookie);

        if (err)
                goto free_msg;

        if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie,
                              NL80211_ATTR_PAD))
                goto nla_put_failure;

        genlmsg_end(msg, hdr);

        return genlmsg_reply(msg, info);

 nla_put_failure:
        err = -ENOBUFS;
 free_msg:
        nlmsg_free(msg);
        return err;
}

/*
 * Netlink handler that cancels a pending remain-on-channel request,
 * identified by the mandatory NL80211_ATTR_COOKIE attribute.
 */
static int nl80211_cancel_remain_on_channel(struct sk_buff *skb,
                                            struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wireless_dev *wdev = info->user_ptr[1];
        struct nlattr *cookie_attr = info->attrs[NL80211_ATTR_COOKIE];

        if (!cookie_attr)
                return -EINVAL;

        if (!rdev->ops->cancel_remain_on_channel)
                return -EOPNOTSUPP;

        return rdev_cancel_remain_on_channel(rdev, wdev,
                                             nla_get_u64(cookie_attr));
}

/*
 * Netlink handler that sets the TX bitrate mask of a link.
 * The mask is parsed from NL80211_ATTR_TX_RATES and handed to the
 * driver's set_bitrate_mask op with a NULL peer argument.
 */
static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
                                       struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        unsigned int link_id = nl80211_link_id(info->attrs);
        struct cfg80211_bitrate_mask mask;
        int ret;

        if (!rdev->ops->set_bitrate_mask)
                return -EOPNOTSUPP;

        ret = nl80211_parse_tx_bitrate_mask(info, info->attrs,
                                            NL80211_ATTR_TX_RATES, &mask,
                                            dev, true, link_id);
        if (ret)
                return ret;

        return rdev_set_bitrate_mask(rdev, dev, link_id, NULL, &mask);
}

/*
 * Netlink handler that registers the sending socket for reception of
 * management frames matching a given prefix.
 *
 * NL80211_ATTR_FRAME_MATCH is mandatory.  The frame type defaults to
 * management/action when NL80211_ATTR_FRAME_TYPE is absent.  NAN
 * interfaces are accepted only with NL80211_EXT_FEATURE_SECURE_NAN, and
 * multicast registrations only with
 * NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS.
 */
static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wireless_dev *wdev = info->user_ptr[1];
        struct nlattr *match = info->attrs[NL80211_ATTR_FRAME_MATCH];
        struct nlattr *type_attr = info->attrs[NL80211_ATTR_FRAME_TYPE];
        struct nlattr *mcast = info->attrs[NL80211_ATTR_RECEIVE_MULTICAST];
        u16 frame_type = IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION;

        if (!match)
                return -EINVAL;

        if (type_attr)
                frame_type = nla_get_u16(type_attr);

        switch (wdev->iftype) {
        case NL80211_IFTYPE_NAN:
                if (!wiphy_ext_feature_isset(wdev->wiphy,
                                             NL80211_EXT_FEATURE_SECURE_NAN))
                        return -EOPNOTSUPP;
                break;
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_ADHOC:
        case NL80211_IFTYPE_P2P_CLIENT:
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_AP_VLAN:
        case NL80211_IFTYPE_MESH_POINT:
        case NL80211_IFTYPE_P2P_GO:
        case NL80211_IFTYPE_P2P_DEVICE:
                break;
        default:
                return -EOPNOTSUPP;
        }

        /* not much point in registering if we can't reply */
        if (!rdev->ops->mgmt_tx)
                return -EOPNOTSUPP;

        if (mcast &&
            !wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS)) {
                GENL_SET_ERR_MSG(info,
                                 "multicast RX registrations are not supported");
                return -EOPNOTSUPP;
        }

        return cfg80211_mlme_register_mgmt(wdev, info->snd_portid, frame_type,
                                           nla_data(match), nla_len(match),
                                           mcast, info->extack);
}

/*
 * Netlink handler that transmits a management frame supplied by
 * userspace in NL80211_ATTR_FRAME.
 *
 * Validates the interface type, the optional wait duration, off-channel
 * permission, channel and link ID, then calls cfg80211_mlme_mgmt_tx().
 * Unless NL80211_ATTR_DONT_WAIT_FOR_ACK is set, a NL80211_CMD_FRAME
 * reply carrying the cookie is sent back to the requester.
 *
 * Returns 0 (or the genlmsg_reply() result) on success, otherwise a
 * negative errno.
 */
static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wireless_dev *wdev = info->user_ptr[1];
        struct cfg80211_chan_def chandef;
        int err;
        void *hdr = NULL;
        u64 cookie;
        struct sk_buff *msg = NULL;
        struct cfg80211_mgmt_tx_params params = {
                .dont_wait_for_ack =
                        info->attrs[NL80211_ATTR_DONT_WAIT_FOR_ACK],
        };

        if (!info->attrs[NL80211_ATTR_FRAME])
                return -EINVAL;

        if (!rdev->ops->mgmt_tx)
                return -EOPNOTSUPP;

        switch (wdev->iftype) {
        case NL80211_IFTYPE_P2P_DEVICE:
                /* a P2P device must always be told which frequency to use */
                if (!info->attrs[NL80211_ATTR_WIPHY_FREQ])
                        return -EINVAL;
                break;
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_ADHOC:
        case NL80211_IFTYPE_P2P_CLIENT:
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_AP_VLAN:
        case NL80211_IFTYPE_MESH_POINT:
        case NL80211_IFTYPE_P2P_GO:
                break;
        case NL80211_IFTYPE_NAN:
                /* NAN is allowed only with the secure-NAN extended feature */
                if (!wiphy_ext_feature_isset(wdev->wiphy,
                                             NL80211_EXT_FEATURE_SECURE_NAN))
                        return -EOPNOTSUPP;
                break;
        default:
                return -EOPNOTSUPP;
        }

        if (info->attrs[NL80211_ATTR_DURATION]) {
                /* a wait duration is only accepted with off-channel TX support */
                if (!(rdev->wiphy.flags & WIPHY_FLAG_OFFCHAN_TX))
                        return -EINVAL;
                params.wait = nla_get_u32(info->attrs[NL80211_ATTR_DURATION]);

                /*
                 * We should wait on the channel for at least a minimum amount
                 * of time (10ms) but no longer than the driver supports.
                 */
                if (params.wait < NL80211_MIN_REMAIN_ON_CHANNEL_TIME ||
                    params.wait > rdev->wiphy.max_remain_on_channel_duration)
                        return -EINVAL;
        }

        params.offchan = info->attrs[NL80211_ATTR_OFFCHANNEL_TX_OK];

        if (params.offchan && !(rdev->wiphy.flags & WIPHY_FLAG_OFFCHAN_TX))
                return -EINVAL;

        params.no_cck = nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]);

        /* get the channel if any has been specified, otherwise pass NULL to
         * the driver. The latter will use the current one
         */
        chandef.chan = NULL;
        if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) {
                err = nl80211_parse_chandef(rdev, info, &chandef);
                if (err)
                        return err;
        }

        /* off-channel transmission needs an explicit channel */
        if (!chandef.chan && params.offchan)
                return -EINVAL;

        if (params.offchan &&
            !cfg80211_off_channel_oper_allowed(wdev, chandef.chan))
                return -EBUSY;

        params.link_id = nl80211_link_id_or_invalid(info->attrs);
        /*
         * This now races due to the unlock, but we cannot check
         * the valid links for the _station_ anyway, so that's up
         * to the driver.
         */
        if (params.link_id >= 0 &&
            !(wdev->valid_links & BIT(params.link_id)))
                return -EINVAL;

        /* the frame is transmitted straight from the attribute payload */
        params.buf = nla_data(info->attrs[NL80211_ATTR_FRAME]);
        params.len = nla_len(info->attrs[NL80211_ATTR_FRAME]);

        err = nl80211_parse_counter_offsets(rdev, NULL, params.len, -1,
                                            info->attrs[NL80211_ATTR_CSA_C_OFFSETS_TX],
                                            &params.csa_offsets,
                                            &params.n_csa_offsets);
        if (err)
                return err;

        /* the reply is allocated only when the caller wants the cookie */
        if (!params.dont_wait_for_ack) {
                msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
                if (!msg)
                        return -ENOMEM;

                hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
                                     NL80211_CMD_FRAME);
                if (!hdr) {
                        err = -ENOBUFS;
                        goto free_msg;
                }
        }

        params.chan = chandef.chan;
        err = cfg80211_mlme_mgmt_tx(rdev, wdev, &params, &cookie);
        if (err)
                goto free_msg;

        if (msg) {
                if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie,
                                      NL80211_ATTR_PAD))
                        goto nla_put_failure;

                genlmsg_end(msg, hdr);
                return genlmsg_reply(msg, info);
        }

        return 0;

 nla_put_failure:
        err = -ENOBUFS;
 free_msg:
        /* msg is still NULL here when no reply was requested */
        nlmsg_free(msg);
        return err;
}

static int nl80211_tx_mgmt_cancel_wait(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wireless_dev *wdev = info->user_ptr[1];
        u64 cookie;

        if (!info->attrs[NL80211_ATTR_COOKIE])
                return -EINVAL;

        if (!rdev->ops->mgmt_tx_cancel_wait)
                return -EOPNOTSUPP;

        switch (wdev->iftype) {
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_ADHOC:
        case NL80211_IFTYPE_P2P_CLIENT:
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_AP_VLAN:
        case NL80211_IFTYPE_P2P_GO:
        case NL80211_IFTYPE_P2P_DEVICE:
                break;
        case NL80211_IFTYPE_NAN:
                if (!wiphy_ext_feature_isset(wdev->wiphy,
                                             NL80211_EXT_FEATURE_SECURE_NAN))
                        return -EOPNOTSUPP;
                break;
        default:
                return -EOPNOTSUPP;
        }

        cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]);

        return rdev_mgmt_tx_cancel_wait(rdev, wdev, cookie);
}

/*
 * Netlink handler: switch power save on or off according to
 * NL80211_ATTR_PS_STATE.
 *
 * Returns -EINVAL when the attribute is missing, -EOPNOTSUPP when the driver
 * has no set_power_mgmt operation, 0 when the requested state already matches
 * wdev->ps, otherwise the result of rdev_set_power_mgmt(). wdev->ps is only
 * updated when the driver call succeeds.
 */
static int nl80211_set_power_save(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev;
        u8 requested;
        bool enable;
        int ret;

        if (!info->attrs[NL80211_ATTR_PS_STATE])
                return -EINVAL;

        requested = nla_get_u32(info->attrs[NL80211_ATTR_PS_STATE]);
        wdev = dev->ieee80211_ptr;

        if (!rdev->ops->set_power_mgmt)
                return -EOPNOTSUPP;

        /* anything other than NL80211_PS_ENABLED is treated as "disable" */
        enable = requested == NL80211_PS_ENABLED;

        if (enable == wdev->ps)
                return 0;

        ret = rdev_set_power_mgmt(rdev, dev, enable, wdev->ps_timeout);
        if (ret)
                return ret;

        wdev->ps = enable;
        return 0;
}

/*
 * Netlink handler: reply with the current power save state (wdev->ps) as
 * NL80211_ATTR_PS_STATE.
 *
 * Returns -EOPNOTSUPP when the driver has no set_power_mgmt operation,
 * -ENOMEM when the reply cannot be allocated, -ENOBUFS when the reply cannot
 * be filled in, otherwise the result of genlmsg_reply().
 */
static int nl80211_get_power_save(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct sk_buff *reply;
        void *hdr;

        if (!rdev->ops->set_power_mgmt)
                return -EOPNOTSUPP;

        reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!reply)
                return -ENOMEM;

        hdr = nl80211hdr_put(reply, info->snd_portid, info->snd_seq, 0,
                             NL80211_CMD_GET_POWER_SAVE);
        if (!hdr)
                goto out_free;

        if (nla_put_u32(reply, NL80211_ATTR_PS_STATE,
                        wdev->ps ? NL80211_PS_ENABLED : NL80211_PS_DISABLED))
                goto out_free;

        genlmsg_end(reply, hdr);
        return genlmsg_reply(reply, info);

 out_free:
        /* both failure paths above report a full buffer */
        nlmsg_free(reply);
        return -ENOBUFS;
}

/*
 * Attribute policy for the attributes nested inside NL80211_ATTR_CQM; it is
 * handed to nla_parse_nested_deprecated() by nl80211_set_cqm().
 */
static const struct nla_policy
nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] = {
        /*
         * Binary blob: nl80211_set_cqm() reads it as an array of s32 and
         * rejects lengths that are not a multiple of 4.
         */
        [NL80211_ATTR_CQM_RSSI_THOLD] = { .type = NLA_BINARY },
        [NL80211_ATTR_CQM_RSSI_HYST] = { .type = NLA_U32 },
        [NL80211_ATTR_CQM_RSSI_THRESHOLD_EVENT] = { .type = NLA_U32 },
        [NL80211_ATTR_CQM_TXE_RATE] = { .type = NLA_U32 },
        [NL80211_ATTR_CQM_TXE_PKTS] = { .type = NLA_U32 },
        [NL80211_ATTR_CQM_TXE_INTVL] = { .type = NLA_U32 },
        [NL80211_ATTR_CQM_RSSI_LEVEL] = { .type = NLA_S32 },
};

/*
 * Validate the TX-error CQM parameters and pass them to the driver.
 *
 * Returns -EINVAL when @rate exceeds 100 or @intvl exceeds
 * NL80211_CQM_TXE_MAX_INTVL, -EOPNOTSUPP when the driver has no
 * set_cqm_txe_config operation or the interface is neither a station nor a
 * P2P client, otherwise the result of rdev_set_cqm_txe_config().
 */
static int nl80211_set_cqm_txe(struct genl_info *info,
                               u32 rate, u32 pkts, u32 intvl)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;

        if (rate > 100)
                return -EINVAL;
        if (intvl > NL80211_CQM_TXE_MAX_INTVL)
                return -EINVAL;

        if (!rdev->ops->set_cqm_txe_config)
                return -EOPNOTSUPP;

        switch (wdev->iftype) {
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_P2P_CLIENT:
                return rdev_set_cqm_txe_config(rdev, dev, rate, pkts, intvl);
        default:
                return -EOPNOTSUPP;
        }
}

/*
 * Derive a [low, high] RSSI range from @cqm_config and program it through
 * rdev_set_cqm_rssi_range_config().
 *
 * The range is built around cqm_config->last_rssi_event_value: the loop
 * below stops at the first threshold that is greater than that value, so
 * "low" comes from the threshold just before it and "high" from that
 * threshold itself, each adjusted by the hysteresis.
 * NOTE(review): this relies on rssi_thresholds being sorted in ascending
 * order; nl80211_set_cqm_rssi() enforces that for its input - confirm for
 * any other producer of the config.
 *
 * Returns the error from rdev_get_station() if that call fails, otherwise
 * the result of rdev_set_cqm_rssi_range_config().
 */
static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev,
                                    struct net_device *dev,
                                    struct cfg80211_cqm_config *cqm_config)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        s32 last, low, high;
        u32 hyst;
        int i, n, low_index;
        int err;

        /*
         * Obtain current RSSI value if possible, if not and no RSSI threshold
         * event has been received yet, we should receive an event after a
         * connection is established and enough beacons received to calculate
         * the average.
         */
        if (!cqm_config->last_rssi_event_value &&
            wdev->links[0].client.current_bss &&
            rdev->ops->get_station) {
                struct station_info sinfo = {};
                u8 *mac_addr;

                mac_addr = wdev->links[0].client.current_bss->pub.bssid;

                err = rdev_get_station(rdev, dev, mac_addr, &sinfo);
                if (err)
                        return err;

                cfg80211_sinfo_release_content(&sinfo);
                /* only take the beacon average if the driver filled it in */
                if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG))
                        cqm_config->last_rssi_event_value =
                                (s8) sinfo.rx_beacon_signal_avg;
        }

        last = cqm_config->last_rssi_event_value;
        hyst = cqm_config->rssi_hyst;
        n = cqm_config->n_rssi_thresholds;

        /* find the first threshold above "last"; i == n if there is none */
        for (i = 0; i < n; i++) {
                i = array_index_nospec(i, n);
                if (last < cqm_config->rssi_thresholds[i])
                        break;
        }

        /* lower bound: threshold just below, or unbounded if none exists */
        low_index = i - 1;
        if (low_index >= 0) {
                low_index = array_index_nospec(low_index, n);
                low = cqm_config->rssi_thresholds[low_index] - hyst;
        } else {
                low = S32_MIN;
        }
        /* upper bound: threshold found above, or unbounded if none exists */
        if (i < n) {
                i = array_index_nospec(i, n);
                high = cqm_config->rssi_thresholds[i] + hyst - 1;
        } else {
                high = S32_MAX;
        }

        return rdev_set_cqm_rssi_range_config(rdev, dev, low, high);
}

/*
 * Install, replace or remove the RSSI connection-quality-monitor
 * configuration of an interface.
 *
 * @info:         request; user_ptr[0] is the registered device and
 *                user_ptr[1] the net_device
 * @thresholds:   array of @n_thresholds RSSI thresholds
 * @n_thresholds: number of entries; 0, or a single entry equal to 0,
 *                disables monitoring
 * @hysteresis:   hysteresis stored in the new configuration
 *
 * Returns 0 on success (including "already disabled"), -EINVAL for a
 * positive or non-increasing threshold list, -EOPNOTSUPP for an unsupported
 * interface type or missing driver operation, -ENOMEM on allocation failure,
 * or the error returned by the driver call.
 */
static int nl80211_set_cqm_rssi(struct genl_info *info,
                                const s32 *thresholds, int n_thresholds,
                                u32 hysteresis)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct cfg80211_cqm_config *cqm_config = NULL, *old;
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        s32 prev = S32_MIN;
        int i, err;

        /*
         * Reject positive values and any value that is not strictly greater
         * than its predecessor. Zero is accepted here because a single 0
         * means "disable" (see below).
         */
        for (i = 0; i < n_thresholds; i++) {
                if (thresholds[i] > 0 || thresholds[i] <= prev)
                        return -EINVAL;

                prev = thresholds[i];
        }

        if (wdev->iftype != NL80211_IFTYPE_STATION &&
            wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)
                return -EOPNOTSUPP;

        if (n_thresholds == 1 && thresholds[0] == 0) /* Disabling */
                n_thresholds = 0;

        old = wiphy_dereference(wdev->wiphy, wdev->cqm_config);

        /* if already disabled just succeed */
        if (!n_thresholds && !old)
                return 0;

        /* more than one threshold needs the list feature and the range op */
        if (n_thresholds > 1) {
                if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                             NL80211_EXT_FEATURE_CQM_RSSI_LIST) ||
                    !rdev->ops->set_cqm_rssi_range_config)
                        return -EOPNOTSUPP;
        } else {
                if (!rdev->ops->set_cqm_rssi_config)
                        return -EOPNOTSUPP;
        }

        if (n_thresholds) {
                cqm_config = kzalloc(struct_size(cqm_config, rssi_thresholds,
                                                 n_thresholds),
                                     GFP_KERNEL);
                if (!cqm_config)
                        return -ENOMEM;

                cqm_config->rssi_hyst = hysteresis;
                cqm_config->n_rssi_thresholds = n_thresholds;
                memcpy(cqm_config->rssi_thresholds, thresholds,
                       flex_array_size(cqm_config, rssi_thresholds,
                                       n_thresholds));
                cqm_config->use_range_api = n_thresholds > 1 ||
                                            !rdev->ops->set_cqm_rssi_config;

                /* publish the new config before calling into the driver */
                rcu_assign_pointer(wdev->cqm_config, cqm_config);

                if (cqm_config->use_range_api)
                        err = cfg80211_cqm_rssi_update(rdev, dev, cqm_config);
                else
                        err = rdev_set_cqm_rssi_config(rdev, dev,
                                                       thresholds[0],
                                                       hysteresis);
        } else {
                /*
                 * Disable path: "old" is non-NULL here because the
                 * "!n_thresholds && !old" case already returned above.
                 */
                RCU_INIT_POINTER(wdev->cqm_config, NULL);
                /* if enabled as range also disable via range */
                if (old->use_range_api)
                        err = rdev_set_cqm_rssi_range_config(rdev, dev, 0, 0);
                else
                        err = rdev_set_cqm_rssi_config(rdev, dev, 0, 0);
        }

        if (err) {
                /* driver refused: restore the previous config, drop the new */
                rcu_assign_pointer(wdev->cqm_config, old);
                kfree_rcu(cqm_config, rcu_head);
        } else {
                /* success: the previous config is no longer referenced */
                kfree_rcu(old, rcu_head);
        }

        return err;
}

/*
 * Netlink handler: parse the NL80211_ATTR_CQM nest and dispatch to either
 * the RSSI or the TX-error configuration helper.
 *
 * RSSI takes precedence: it is chosen when both the threshold and hysteresis
 * attributes are present. The TX-error path needs rate, packet count and
 * interval together. Any other combination yields -EINVAL.
 */
static int nl80211_set_cqm(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr *tb[NL80211_ATTR_CQM_MAX + 1];
        struct nlattr *nest = info->attrs[NL80211_ATTR_CQM];
        struct nlattr *thold, *hyst;
        int ret;

        if (!nest)
                return -EINVAL;

        ret = nla_parse_nested_deprecated(tb, NL80211_ATTR_CQM_MAX, nest,
                                          nl80211_attr_cqm_policy,
                                          info->extack);
        if (ret)
                return ret;

        thold = tb[NL80211_ATTR_CQM_RSSI_THOLD];
        hyst = tb[NL80211_ATTR_CQM_RSSI_HYST];
        if (thold && hyst) {
                int n_bytes = nla_len(thold);

                /* payload is an array of 32-bit thresholds */
                if (n_bytes % 4)
                        return -EINVAL;

                return nl80211_set_cqm_rssi(info, nla_data(thold),
                                            n_bytes / 4, nla_get_u32(hyst));
        }

        if (tb[NL80211_ATTR_CQM_TXE_RATE] &&
            tb[NL80211_ATTR_CQM_TXE_PKTS] &&
            tb[NL80211_ATTR_CQM_TXE_INTVL])
                return nl80211_set_cqm_txe(info,
                                nla_get_u32(tb[NL80211_ATTR_CQM_TXE_RATE]),
                                nla_get_u32(tb[NL80211_ATTR_CQM_TXE_PKTS]),
                                nla_get_u32(tb[NL80211_ATTR_CQM_TXE_INTVL]));

        return -EINVAL;
}

static int nl80211_join_ocb(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct ocb_setup setup = {};
        int err;

        err = nl80211_parse_chandef(rdev, info, &setup.chandef);
        if (err)
                return err;

        return cfg80211_join_ocb(rdev, dev, &setup);
}

/* Netlink handler: thin wrapper that forwards to cfg80211_leave_ocb(). */
static int nl80211_leave_ocb(struct sk_buff *skb, struct genl_info *info)
{
        struct net_device *ndev = info->user_ptr[1];
        struct cfg80211_registered_device *reg_dev = info->user_ptr[0];

        return cfg80211_leave_ocb(reg_dev, ndev);
}

/*
 * Netlink handler: build a mesh configuration and setup from the defaults,
 * override them with whatever attributes the request carries, and call
 * __cfg80211_join_mesh().
 *
 * NL80211_ATTR_MESH_ID is mandatory and must be non-empty; every other
 * attribute handled here is optional. Validation happens in the order the
 * attributes are examined below, so the first failing check determines the
 * returned error. On success, if NL80211_ATTR_SOCKET_OWNER is present, the
 * sender's port id is recorded in conn_owner_nlportid.
 */
static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct mesh_config cfg;
        struct mesh_setup setup;
        int err;

        /* start with default */
        memcpy(&cfg, &default_mesh_config, sizeof(cfg));
        memcpy(&setup, &default_mesh_setup, sizeof(setup));

        if (info->attrs[NL80211_ATTR_MESH_CONFIG]) {
                /* and parse parameters if given */
                err = nl80211_parse_mesh_config(info, &cfg, NULL);
                if (err)
                        return err;
        }

        /* the mesh ID must be present and non-empty */
        if (!info->attrs[NL80211_ATTR_MESH_ID] ||
            !nla_len(info->attrs[NL80211_ATTR_MESH_ID]))
                return -EINVAL;

        /* mesh_id points into the request; it is not copied here */
        setup.mesh_id = nla_data(info->attrs[NL80211_ATTR_MESH_ID]);
        setup.mesh_id_len = nla_len(info->attrs[NL80211_ATTR_MESH_ID]);

        /* a false return from nl80211_parse_mcast_rate() is a bad rate */
        if (info->attrs[NL80211_ATTR_MCAST_RATE] &&
            !nl80211_parse_mcast_rate(rdev, setup.mcast_rate,
                            nla_get_u32(info->attrs[NL80211_ATTR_MCAST_RATE])))
                        return -EINVAL;

        if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) {
                setup.beacon_interval =
                        nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]);

                err = cfg80211_validate_beacon_int(rdev,
                                                   NL80211_IFTYPE_MESH_POINT,
                                                   setup.beacon_interval);
                if (err)
                        return err;
        }

        if (info->attrs[NL80211_ATTR_DTIM_PERIOD]) {
                setup.dtim_period =
                        nla_get_u32(info->attrs[NL80211_ATTR_DTIM_PERIOD]);
                /* accepted DTIM period range is 1..100 inclusive */
                if (setup.dtim_period < 1 || setup.dtim_period > 100)
                        return -EINVAL;
        }

        if (info->attrs[NL80211_ATTR_MESH_SETUP]) {
                /* parse additional setup parameters if given */
                err = nl80211_parse_mesh_setup(info, &setup);
                if (err)
                        return err;
        }

        /* with user_mpm set, automatic peer-link opening is switched off */
        if (setup.user_mpm)
                cfg.auto_open_plinks = false;

        if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) {
                err = nl80211_parse_chandef(rdev, info, &setup.chandef);
                if (err)
                        return err;
        } else {
                /* __cfg80211_join_mesh() will sort it out */
                setup.chandef.chan = NULL;
        }

        if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) {
                u8 *rates = nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
                int n_rates =
                        nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
                struct ieee80211_supported_band *sband;

                /* the band lookup below needs an explicit channel */
                if (!setup.chandef.chan)
                        return -EINVAL;

                sband = rdev->wiphy.bands[setup.chandef.chan->band];

                err = ieee80211_get_ratemask(sband, rates, n_rates,
                                             &setup.basic_rates);
                if (err)
                        return err;
        }

        if (info->attrs[NL80211_ATTR_TX_RATES]) {
                /*
                 * The mask is parsed before the channel check, so a parse
                 * error takes precedence over the missing-channel -EINVAL.
                 */
                err = nl80211_parse_tx_bitrate_mask(info, info->attrs,
                                                    NL80211_ATTR_TX_RATES,
                                                    &setup.beacon_rate,
                                                    dev, false, 0);
                if (err)
                        return err;

                if (!setup.chandef.chan)
                        return -EINVAL;

                err = validate_beacon_tx_rate(rdev, setup.chandef.chan->band,
                                              &setup.beacon_rate);
                if (err)
                        return err;
        }

        setup.userspace_handles_dfs =
                nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]);

        if (info->attrs[NL80211_ATTR_CONTROL_PORT_OVER_NL80211]) {
                int r = validate_pae_over_nl80211(rdev, info);

                if (r < 0)
                        return r;

                setup.control_port_over_nl80211 = true;
        }

        err = __cfg80211_join_mesh(rdev, dev, &setup, &cfg);
        /* remember the requesting socket only after a successful join */
        if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER])
                dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid;

        return err;
}

/* Netlink handler: thin wrapper that forwards to cfg80211_leave_mesh(). */
static int nl80211_leave_mesh(struct sk_buff *skb, struct genl_info *info)
{
        struct net_device *ndev = info->user_ptr[1];
        struct cfg80211_registered_device *reg_dev = info->user_ptr[0];

        return cfg80211_leave_mesh(reg_dev, ndev);
}

#ifdef CONFIG_PM
/*
 * Append the configured WoWLAN packet patterns to @msg as a
 * NL80211_WOWLAN_TRIG_PKT_PATTERN nest with one sub-nest per pattern
 * (numbered from 1), each carrying mask, pattern and packet offset.
 *
 * Dereferences rdev->wiphy.wowlan_config unconditionally, so the caller must
 * make sure it is set. Returns 0 on success or when there are no patterns,
 * -ENOBUFS when the message runs out of room.
 */
static int nl80211_send_wowlan_patterns(struct sk_buff *msg,
                                        struct cfg80211_registered_device *rdev)
{
        struct cfg80211_wowlan *cfg = rdev->wiphy.wowlan_config;
        struct nlattr *list, *entry;
        int idx, len;

        if (!cfg->n_patterns)
                return 0;

        list = nla_nest_start_noflag(msg, NL80211_WOWLAN_TRIG_PKT_PATTERN);
        if (!list)
                return -ENOBUFS;

        for (idx = 0; idx < cfg->n_patterns; idx++) {
                entry = nla_nest_start_noflag(msg, idx + 1);
                if (!entry)
                        return -ENOBUFS;

                len = cfg->patterns[idx].pattern_len;

                /* the mask holds one bit per pattern byte */
                if (nla_put(msg, NL80211_PKTPAT_MASK, DIV_ROUND_UP(len, 8),
                            cfg->patterns[idx].mask))
                        return -ENOBUFS;
                if (nla_put(msg, NL80211_PKTPAT_PATTERN, len,
                            cfg->patterns[idx].pattern))
                        return -ENOBUFS;
                if (nla_put_u32(msg, NL80211_PKTPAT_OFFSET,
                                cfg->patterns[idx].pkt_offset))
                        return -ENOBUFS;

                nla_nest_end(msg, entry);
        }

        nla_nest_end(msg, list);
        return 0;
}

/*
 * Append the WoWLAN TCP connection configuration to @msg as a
 * NL80211_WOWLAN_TRIG_TCP_CONNECTION nest.
 *
 * The payload sequence and token descriptors are only emitted when their
 * len field is non-zero. Returns 0 on success or when @tcp is NULL,
 * -ENOBUFS when the message runs out of room.
 */
static int nl80211_send_wowlan_tcp(struct sk_buff *msg,
                                   struct cfg80211_wowlan_tcp *tcp)
{
        struct nlattr *nest;

        if (!tcp)
                return 0;

        nest = nla_nest_start_noflag(msg, NL80211_WOWLAN_TRIG_TCP_CONNECTION);
        if (!nest)
                return -ENOBUFS;

        /* connection endpoints */
        if (nla_put_in_addr(msg, NL80211_WOWLAN_TCP_SRC_IPV4, tcp->src) ||
            nla_put_in_addr(msg, NL80211_WOWLAN_TCP_DST_IPV4, tcp->dst) ||
            nla_put(msg, NL80211_WOWLAN_TCP_DST_MAC, ETH_ALEN, tcp->dst_mac) ||
            nla_put_u16(msg, NL80211_WOWLAN_TCP_SRC_PORT, tcp->src_port) ||
            nla_put_u16(msg, NL80211_WOWLAN_TCP_DST_PORT, tcp->dst_port))
                return -ENOBUFS;

        /* data payload and its interval */
        if (nla_put(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD,
                    tcp->payload_len, tcp->payload))
                return -ENOBUFS;
        if (nla_put_u32(msg, NL80211_WOWLAN_TCP_DATA_INTERVAL,
                        tcp->data_interval))
                return -ENOBUFS;

        /* wake payload with its bit mask (one bit per payload byte) */
        if (nla_put(msg, NL80211_WOWLAN_TCP_WAKE_PAYLOAD,
                    tcp->wake_len, tcp->wake_data))
                return -ENOBUFS;
        if (nla_put(msg, NL80211_WOWLAN_TCP_WAKE_MASK,
                    DIV_ROUND_UP(tcp->wake_len, 8), tcp->wake_mask))
                return -ENOBUFS;

        if (tcp->payload_seq.len) {
                if (nla_put(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ,
                            sizeof(tcp->payload_seq), &tcp->payload_seq))
                        return -ENOBUFS;
        }

        if (tcp->payload_tok.len) {
                /* descriptor header plus the trailing token stream */
                if (nla_put(msg, NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN,
                            sizeof(tcp->payload_tok) + tcp->tokens_size,
                            &tcp->payload_tok))
                        return -ENOBUFS;
        }

        nla_nest_end(msg, nest);
        return 0;
}

/*
 * Append the WoWLAN net-detect configuration described by @req to @msg as a
 * NL80211_WOWLAN_TRIG_NET_DETECT nest containing, in order: the scan
 * interval (single-plan case only), the delay, the optional relative-RSSI
 * settings, the frequency list, the optional match sets and the scan plans.
 *
 * Returns 0 on success or when @req is NULL, -ENOBUFS when the message runs
 * out of room. Open nests are not cancelled on failure; the caller in this
 * file, nl80211_get_wowlan(), frees the whole message in that case.
 */
static int nl80211_send_wowlan_nd(struct sk_buff *msg,
                                  struct cfg80211_sched_scan_request *req)
{
        struct nlattr *nd, *freqs, *matches, *match, *scan_plans, *scan_plan;
        int i;

        if (!req)
                return 0;

        nd = nla_nest_start_noflag(msg, NL80211_WOWLAN_TRIG_NET_DETECT);
        if (!nd)
                return -ENOBUFS;

        /*
         * The interval attribute is only emitted for a single scan plan.
         * NOTE(review): the * 1000 presumably converts seconds to
         * milliseconds - confirm against the attribute documentation.
         */
        if (req->n_scan_plans == 1 &&
            nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_INTERVAL,
                        req->scan_plans[0].interval * 1000))
                return -ENOBUFS;

        if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_DELAY, req->delay))
                return -ENOBUFS;

        if (req->relative_rssi_set) {
                struct nl80211_bss_select_rssi_adjust rssi_adjust;

                if (nla_put_s8(msg, NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI,
                               req->relative_rssi))
                        return -ENOBUFS;

                /* copy field by field into the netlink-visible structure */
                rssi_adjust.band = req->rssi_adjust.band;
                rssi_adjust.delta = req->rssi_adjust.delta;
                if (nla_put(msg, NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST,
                            sizeof(rssi_adjust), &rssi_adjust))
                        return -ENOBUFS;
        }

        freqs = nla_nest_start_noflag(msg, NL80211_ATTR_SCAN_FREQUENCIES);
        if (!freqs)
                return -ENOBUFS;

        /* frequency entries use the loop index (starting at 0) as type */
        for (i = 0; i < req->n_channels; i++) {
                if (nla_put_u32(msg, i, req->channels[i]->center_freq))
                        return -ENOBUFS;
        }

        nla_nest_end(msg, freqs);

        if (req->n_match_sets) {
                matches = nla_nest_start_noflag(msg,
                                                NL80211_ATTR_SCHED_SCAN_MATCH);
                if (!matches)
                        return -ENOBUFS;

                /* match nests are also numbered from 0; only the SSID is sent */
                for (i = 0; i < req->n_match_sets; i++) {
                        match = nla_nest_start_noflag(msg, i);
                        if (!match)
                                return -ENOBUFS;

                        if (nla_put(msg, NL80211_SCHED_SCAN_MATCH_ATTR_SSID,
                                    req->match_sets[i].ssid.ssid_len,
                                    req->match_sets[i].ssid.ssid))
                                return -ENOBUFS;
                        nla_nest_end(msg, match);
                }
                nla_nest_end(msg, matches);
        }

        scan_plans = nla_nest_start_noflag(msg, NL80211_ATTR_SCHED_SCAN_PLANS);
        if (!scan_plans)
                return -ENOBUFS;

        /* scan plan nests are numbered from 1, unlike the lists above */
        for (i = 0; i < req->n_scan_plans; i++) {
                scan_plan = nla_nest_start_noflag(msg, i + 1);
                if (!scan_plan)
                        return -ENOBUFS;

                /* the iterations attribute is omitted when it is zero */
                if (nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_INTERVAL,
                                req->scan_plans[i].interval) ||
                    (req->scan_plans[i].iterations &&
                     nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_ITERATIONS,
                                 req->scan_plans[i].iterations)))
                        return -ENOBUFS;
                nla_nest_end(msg, scan_plan);
        }
        nla_nest_end(msg, scan_plans);

        nla_nest_end(msg, nd);

        return 0;
}

static int nl80211_get_wowlan(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct sk_buff *msg;
        void *hdr;
        u32 size = NLMSG_DEFAULT_SIZE;

        if (!rdev->wiphy.wowlan)
                return -EOPNOTSUPP;

        if (rdev->wiphy.wowlan_config && rdev->wiphy.wowlan_config->tcp) {
                /* adjust size to have room for all the data */
                size += rdev->wiphy.wowlan_config->tcp->tokens_size +
                        rdev->wiphy.wowlan_config->tcp->payload_len +
                        rdev->wiphy.wowlan_config->tcp->wake_len +
                        rdev->wiphy.wowlan_config->tcp->wake_len / 8;
        }

        msg = nlmsg_new(size, GFP_KERNEL);
        if (!msg)
                return -ENOMEM;

        hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
                             NL80211_CMD_GET_WOWLAN);
        if (!hdr)
                goto nla_put_failure;

        if (rdev->wiphy.wowlan_config) {
                struct nlattr *nl_wowlan;

                nl_wowlan = nla_nest_start_noflag(msg,
                                                  NL80211_ATTR_WOWLAN_TRIGGERS);
                if (!nl_wowlan)
                        goto nla_put_failure;

                if ((rdev->wiphy.wowlan_config->any &&
                     nla_put_flag(msg, NL80211_WOWLAN_TRIG_ANY)) ||
                    (rdev->wiphy.wowlan_config->disconnect &&
                     nla_put_flag(msg, NL80211_WOWLAN_TRIG_DISCONNECT)) ||
                    (rdev->wiphy.wowlan_config->magic_pkt &&
                     nla_put_flag(msg, NL80211_WOWLAN_TRIG_MAGIC_PKT)) ||
                    (rdev->wiphy.wowlan_config->gtk_rekey_failure &&
                     nla_put_flag(msg, NL80211_WOWLAN_TRIG_GTK_REKEY_FAILURE)) ||
                    (rdev->wiphy.wowlan_config->eap_identity_req &&
                     nla_put_flag(msg, NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST)) ||
                    (rdev->wiphy.wowlan_config->four_way_handshake &&
                     nla_put_flag(msg, NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE)) ||
                    (rdev->wiphy.wowlan_config->rfkill_release &&
                     nla_put_flag(msg, NL80211_WOWLAN_TRIG_RFKILL_RELEASE)))
                        goto nla_put_failure;

                if (nl80211_send_wowlan_patterns(msg, rdev))
                        goto nla_put_failure;

                if (nl80211_send_wowlan_tcp(msg,
                                            rdev->wiphy.wowlan_config->tcp))
                        goto nla_put_failure;

                if (nl80211_send_wowlan_nd(
                            msg,
                            rdev->wiphy.wowlan_config->nd_config))
                        goto nla_put_failure;

                nla_nest_end(msg, nl_wowlan);
        }

        genlmsg_end(msg, hdr);
        return genlmsg_reply(msg, info);

nla_put_failure:
        nlmsg_free(msg);
        return -ENOBUFS;
}

static int nl80211_parse_wowlan_tcp(struct cfg80211_registered_device *rdev,
                                    struct nlattr *attr,
                                    struct cfg80211_wowlan *trig)
{
        struct nlattr *tb[NUM_NL80211_WOWLAN_TCP];
        struct cfg80211_wowlan_tcp *cfg;
        struct nl80211_wowlan_tcp_data_token *tok = NULL;
        struct nl80211_wowlan_tcp_data_seq *seq = NULL;
        u32 size;
        u32 data_size, wake_size, tokens_size = 0, wake_mask_size;
        int err, port;

        if (!rdev->wiphy.wowlan->tcp)
                return -EINVAL;

        err = nla_parse_nested_deprecated(tb, MAX_NL80211_WOWLAN_TCP, attr,
                                          nl80211_wowlan_tcp_policy, NULL);
        if (err)
                return err;

        if (!tb[NL80211_WOWLAN_TCP_SRC_IPV4] ||
            !tb[NL80211_WOWLAN_TCP_DST_IPV4] ||
            !tb[NL80211_WOWLAN_TCP_DST_MAC] ||
            !tb[NL80211_WOWLAN_TCP_DST_PORT] ||
            !tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD] ||
            !tb[NL80211_WOWLAN_TCP_DATA_INTERVAL] ||
            !tb[NL80211_WOWLAN_TCP_WAKE_PAYLOAD] ||
            !tb[NL80211_WOWLAN_TCP_WAKE_MASK])
                return -EINVAL;

        data_size = nla_len(tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD]);
        if (data_size > rdev->wiphy.wowlan->tcp->data_payload_max)
                return -EINVAL;

        if (nla_get_u32(tb[NL80211_WOWLAN_TCP_DATA_INTERVAL]) >
                        rdev->wiphy.wowlan->tcp->data_interval_max ||
            nla_get_u32(tb[NL80211_WOWLAN_TCP_DATA_INTERVAL]) == 0)
                return -EINVAL;

        wake_size = nla_len(tb[NL80211_WOWLAN_TCP_WAKE_PAYLOAD]);
        if (wake_size > rdev->wiphy.wowlan->tcp->wake_payload_max)
                return -EINVAL;

        wake_mask_size = nla_len(tb[NL80211_WOWLAN_TCP_WAKE_MASK]);
        if (wake_mask_size != DIV_ROUND_UP(wake_size, 8))
                return -EINVAL;

        if (tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN]) {
                u32 tokln = nla_len(tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN]);

                tok = nla_data(tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD_TOKEN]);
                tokens_size = tokln - sizeof(*tok);

                if (!tok->len || tokens_size % tok->len)
                        return -EINVAL;
                if (!rdev->wiphy.wowlan->tcp->tok)
                        return -EINVAL;
                if (tok->len > rdev->wiphy.wowlan->tcp->tok->max_len)
                        return -EINVAL;
                if (tok->len < rdev->wiphy.wowlan->tcp->tok->min_len)
                        return -EINVAL;
                if (tokens_size > rdev->wiphy.wowlan->tcp->tok->bufsize)
                        return -EINVAL;
                if (tok->offset + tok->len > data_size)
                        return -EINVAL;
        }

        if (tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ]) {
                seq = nla_data(tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD_SEQ]);
                if (!rdev->wiphy.wowlan->tcp->seq)
                        return -EINVAL;
                if (seq->len == 0 || seq->len > 4)
                        return -EINVAL;
                if (seq->len + seq->offset > data_size)
                        return -EINVAL;
        }

        size = sizeof(*cfg);
        size += data_size;
        size += wake_size + wake_mask_size;
        size += tokens_size;

        cfg = kzalloc(size, GFP_KERNEL);
        if (!cfg)
                return -ENOMEM;
        cfg->src = nla_get_in_addr(tb[NL80211_WOWLAN_TCP_SRC_IPV4]);
        cfg->dst = nla_get_in_addr(tb[NL80211_WOWLAN_TCP_DST_IPV4]);
        memcpy(cfg->dst_mac, nla_data(tb[NL80211_WOWLAN_TCP_DST_MAC]),
               ETH_ALEN);
        if (tb[NL80211_WOWLAN_TCP_SRC_PORT])
                port = nla_get_u16(tb[NL80211_WOWLAN_TCP_SRC_PORT]);
        else
                port = 0;
#ifdef CONFIG_INET
        /* allocate a socket and port for it and use it */
        err = __sock_create(wiphy_net(&rdev->wiphy), PF_INET, SOCK_STREAM,
                            IPPROTO_TCP, &cfg->sock, 1);
        if (err) {
                kfree(cfg);
                return err;
        }
        if (inet_csk_get_port(cfg->sock->sk, port)) {
                sock_release(cfg->sock);
                kfree(cfg);
                return -EADDRINUSE;
        }
        cfg->src_port = inet_sk(cfg->sock->sk)->inet_num;
#else
        if (!port) {
                kfree(cfg);
                return -EINVAL;
        }
        cfg->src_port = port;
#endif

        cfg->dst_port = nla_get_u16(tb[NL80211_WOWLAN_TCP_DST_PORT]);
        cfg->payload_len = data_size;
        cfg->payload = (u8 *)cfg + sizeof(*cfg) + tokens_size;
        memcpy((void *)cfg->payload,
               nla_data(tb[NL80211_WOWLAN_TCP_DATA_PAYLOAD]),
               data_size);
        if (seq)
                cfg->payload_seq = *seq;
        cfg->data_interval = nla_get_u32(tb[NL80211_WOWLAN_TCP_DATA_INTERVAL]);
        cfg->wake_len = wake_size;
        cfg->wake_data = (u8 *)cfg + sizeof(*cfg) + tokens_size + data_size;
        memcpy((void *)cfg->wake_data,
               nla_data(tb[NL80211_WOWLAN_TCP_WAKE_PAYLOAD]),
               wake_size);
        cfg->wake_mask = (u8 *)cfg + sizeof(*cfg) + tokens_size +
                         data_size + wake_size;
        memcpy((void *)cfg->wake_mask,
               nla_data(tb[NL80211_WOWLAN_TCP_WAKE_MASK]),
               wake_mask_size);
        if (tok) {
                cfg->tokens_size = tokens_size;
                cfg->payload_tok = *tok;
                memcpy(cfg->payload_tok.token_stream, tok->token_stream,
                       tokens_size);
        }

        trig->tcp = cfg;

        return 0;
}

/*
 * Parse the nested net-detect WoWLAN trigger into trig->nd_config.
 *
 * A temporary attribute table sized for NUM_NL80211_ATTR entries is allocated
 * for the duration of the call and always freed before returning.
 *
 * Returns 0 on success, -ENOMEM if the table cannot be allocated,
 * -EOPNOTSUPP if the device does not advertise WIPHY_WOWLAN_NET_DETECT, or
 * the error reported by attribute parsing / nl80211_parse_sched_scan().
 * When nl80211_parse_sched_scan() fails, trig->nd_config is reset to NULL so
 * the caller never sees an error pointer.
 */
static int nl80211_parse_wowlan_nd(struct cfg80211_registered_device *rdev,
                                   const struct wiphy_wowlan_support *wowlan,
                                   struct nlattr *attr,
                                   struct cfg80211_wowlan *trig)
{
        struct nlattr **attrs;
        int ret = -EOPNOTSUPP;

        attrs = kcalloc(NUM_NL80211_ATTR, sizeof(*attrs), GFP_KERNEL);
        if (!attrs)
                return -ENOMEM;

        if (wowlan->flags & WIPHY_WOWLAN_NET_DETECT) {
                ret = nla_parse_nested_deprecated(attrs, NL80211_ATTR_MAX,
                                                  attr, nl80211_policy, NULL);
                if (!ret) {
                        trig->nd_config =
                                nl80211_parse_sched_scan(&rdev->wiphy, NULL,
                                                         attrs,
                                                         wowlan->max_nd_match_sets);
                        ret = PTR_ERR_OR_ZERO(trig->nd_config);
                        if (ret)
                                trig->nd_config = NULL;
                }
        }

        kfree(attrs);
        return ret;
}

static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct nlattr *tb[NUM_NL80211_WOWLAN_TRIG];
        struct cfg80211_wowlan new_triggers = {};
        struct cfg80211_wowlan *ntrig;
        const struct wiphy_wowlan_support *wowlan = rdev->wiphy.wowlan;
        int err, i;
        bool prev_enabled = rdev->wiphy.wowlan_config;
        bool regular = false;

        if (!wowlan)
                return -EOPNOTSUPP;

        if (!info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS]) {
                cfg80211_rdev_free_wowlan(rdev);
                rdev->wiphy.wowlan_config = NULL;
                goto set_wakeup;
        }

        err = nla_parse_nested_deprecated(tb, MAX_NL80211_WOWLAN_TRIG,
                                          info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS],
                                          nl80211_wowlan_policy, info->extack);
        if (err)
                return err;

        if (tb[NL80211_WOWLAN_TRIG_ANY]) {
                if (!(wowlan->flags & WIPHY_WOWLAN_ANY))
                        return -EINVAL;
                new_triggers.any = true;
        }

        if (tb[NL80211_WOWLAN_TRIG_DISCONNECT]) {
                if (!(wowlan->flags & WIPHY_WOWLAN_DISCONNECT))
                        return -EINVAL;
                new_triggers.disconnect = true;
                regular = true;
        }

        if (tb[NL80211_WOWLAN_TRIG_MAGIC_PKT]) {
                if (!(wowlan->flags & WIPHY_WOWLAN_MAGIC_PKT))
                        return -EINVAL;
                new_triggers.magic_pkt = true;
                regular = true;
        }

        if (tb[NL80211_WOWLAN_TRIG_GTK_REKEY_SUPPORTED])
                return -EINVAL;

        if (tb[NL80211_WOWLAN_TRIG_GTK_REKEY_FAILURE]) {
                if (!(wowlan->flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE))
                        return -EINVAL;
                new_triggers.gtk_rekey_failure = true;
                regular = true;
        }

        if (tb[NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST]) {
                if (!(wowlan->flags & WIPHY_WOWLAN_EAP_IDENTITY_REQ))
                        return -EINVAL;
                new_triggers.eap_identity_req = true;
                regular = true;
        }

        if (tb[NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE]) {
                if (!(wowlan->flags & WIPHY_WOWLAN_4WAY_HANDSHAKE))
                        return -EINVAL;
                new_triggers.four_way_handshake = true;
                regular = true;
        }

        if (tb[NL80211_WOWLAN_TRIG_RFKILL_RELEASE]) {
                if (!(wowlan->flags & WIPHY_WOWLAN_RFKILL_RELEASE))
                        return -EINVAL;
                new_triggers.rfkill_release = true;
                regular = true;
        }

        if (tb[NL80211_WOWLAN_TRIG_PKT_PATTERN]) {
                struct nlattr *pat;
                int n_patterns = 0;
                int rem, pat_len, mask_len, pkt_offset;
                struct nlattr *pat_tb[NUM_NL80211_PKTPAT];

                regular = true;

                nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN],
                                    rem)
                        n_patterns++;
                if (n_patterns > wowlan->n_patterns)
                        return -EINVAL;

                new_triggers.patterns = kcalloc(n_patterns,
                                                sizeof(new_triggers.patterns[0]),
                                                GFP_KERNEL);
                if (!new_triggers.patterns)
                        return -ENOMEM;

                new_triggers.n_patterns = n_patterns;
                i = 0;

                nla_for_each_nested(pat, tb[NL80211_WOWLAN_TRIG_PKT_PATTERN],
                                    rem) {
                        u8 *mask_pat;

                        err = nla_parse_nested_deprecated(pat_tb,
                                                          MAX_NL80211_PKTPAT,
                                                          pat,
                                                          nl80211_packet_pattern_policy,
                                                          info->extack);
                        if (err)
                                goto error;

                        err = -EINVAL;
                        if (!pat_tb[NL80211_PKTPAT_MASK] ||
                            !pat_tb[NL80211_PKTPAT_PATTERN])
                                goto error;
                        pat_len = nla_len(pat_tb[NL80211_PKTPAT_PATTERN]);
                        mask_len = DIV_ROUND_UP(pat_len, 8);
                        if (nla_len(pat_tb[NL80211_PKTPAT_MASK]) != mask_len)
                                goto error;
                        if (pat_len > wowlan->pattern_max_len ||
                            pat_len < wowlan->pattern_min_len)
                                goto error;

                        if (!pat_tb[NL80211_PKTPAT_OFFSET])
                                pkt_offset = 0;
                        else
                                pkt_offset = nla_get_u32(
                                        pat_tb[NL80211_PKTPAT_OFFSET]);
                        if (pkt_offset > wowlan->max_pkt_offset)
                                goto error;
                        new_triggers.patterns[i].pkt_offset = pkt_offset;

                        mask_pat = kmalloc(mask_len + pat_len, GFP_KERNEL);
                        if (!mask_pat) {
                                err = -ENOMEM;
                                goto error;
                        }
                        new_triggers.patterns[i].mask = mask_pat;
                        memcpy(mask_pat, nla_data(pat_tb[NL80211_PKTPAT_MASK]),
                               mask_len);
                        mask_pat += mask_len;
                        new_triggers.patterns[i].pattern = mask_pat;
                        new_triggers.patterns[i].pattern_len = pat_len;
                        memcpy(mask_pat,
                               nla_data(pat_tb[NL80211_PKTPAT_PATTERN]),
                               pat_len);
                        i++;
                }
        }

        if (tb[NL80211_WOWLAN_TRIG_TCP_CONNECTION]) {
                regular = true;
                err = nl80211_parse_wowlan_tcp(
                        rdev, tb[NL80211_WOWLAN_TRIG_TCP_CONNECTION],
                        &new_triggers);
                if (err)
                        goto error;
        }

        if (tb[NL80211_WOWLAN_TRIG_NET_DETECT]) {
                regular = true;
                err = nl80211_parse_wowlan_nd(
                        rdev, wowlan, tb[NL80211_WOWLAN_TRIG_NET_DETECT],
                        &new_triggers);
                if (err)
                        goto error;
        }

        /* The 'any' trigger means the device continues operating more or less
         * as in its normal operation mode and wakes up the host on most of the
         * normal interrupts (like packet RX, ...)
         * It therefore makes little sense to combine with the more constrained
         * wakeup trigger modes.
         */
        if (new_triggers.any && regular) {
                err = -EINVAL;
                goto error;
        }

        ntrig = kmemdup(&new_triggers, sizeof(new_triggers), GFP_KERNEL);
        if (!ntrig) {
                err = -ENOMEM;
                goto error;
        }
        cfg80211_rdev_free_wowlan(rdev);
        rdev->wiphy.wowlan_config = ntrig;

 set_wakeup:
        if (rdev->ops->set_wakeup &&
            prev_enabled != !!rdev->wiphy.wowlan_config)
                rdev_set_wakeup(rdev, rdev->wiphy.wowlan_config);

        return 0;
 error:
        for (i = 0; i < new_triggers.n_patterns; i++)
                kfree(new_triggers.patterns[i].mask);
        kfree(new_triggers.patterns);
        if (new_triggers.tcp && new_triggers.tcp->sock)
                sock_release(new_triggers.tcp->sock);
        kfree(new_triggers.tcp);
        kfree(new_triggers.nd_config);
        return err;
}
#endif

/*
 * Append the device's current coalesce rules to @msg.
 *
 * The rules are emitted as a nested NL80211_ATTR_COALESCE_RULE attribute; each
 * rule and each pattern inside a rule is nested under its 1-based index.
 * The caller must ensure rdev->coalesce is non-NULL.
 *
 * Returns 0 on success (including when there are no rules, in which case
 * nothing is written) or -ENOBUFS as soon as any nest/put helper fails.  Open
 * nests are not closed on failure.
 */
static int nl80211_send_coalesce_rules(struct sk_buff *msg,
                                       struct cfg80211_registered_device *rdev)
{
        struct nlattr *rules_nest, *rule_nest, *pats_nest, *pat_nest;
        struct cfg80211_coalesce_rules *rule;
        int r, p, len;

        if (!rdev->coalesce->n_rules)
                return 0;

        rules_nest = nla_nest_start_noflag(msg, NL80211_ATTR_COALESCE_RULE);
        if (!rules_nest)
                return -ENOBUFS;

        for (r = 0; r < rdev->coalesce->n_rules; r++) {
                rule_nest = nla_nest_start_noflag(msg, r + 1);
                if (!rule_nest)
                        return -ENOBUFS;

                rule = &rdev->coalesce->rules[r];

                /* per-rule scalars: delay first, then condition */
                if (nla_put_u32(msg, NL80211_ATTR_COALESCE_RULE_DELAY,
                                rule->delay) ||
                    nla_put_u32(msg, NL80211_ATTR_COALESCE_RULE_CONDITION,
                                rule->condition))
                        return -ENOBUFS;

                pats_nest = nla_nest_start_noflag(msg,
                                                  NL80211_ATTR_COALESCE_RULE_PKT_PATTERN);
                if (!pats_nest)
                        return -ENOBUFS;

                for (p = 0; p < rule->n_patterns; p++) {
                        pat_nest = nla_nest_start_noflag(msg, p + 1);
                        if (!pat_nest)
                                return -ENOBUFS;

                        len = rule->patterns[p].pattern_len;

                        /* mask holds one bit per pattern byte */
                        if (nla_put(msg, NL80211_PKTPAT_MASK,
                                    DIV_ROUND_UP(len, 8),
                                    rule->patterns[p].mask))
                                return -ENOBUFS;
                        if (nla_put(msg, NL80211_PKTPAT_PATTERN, len,
                                    rule->patterns[p].pattern))
                                return -ENOBUFS;
                        if (nla_put_u32(msg, NL80211_PKTPAT_OFFSET,
                                        rule->patterns[p].pkt_offset))
                                return -ENOBUFS;

                        nla_nest_end(msg, pat_nest);
                }

                nla_nest_end(msg, pats_nest);
                nla_nest_end(msg, rule_nest);
        }

        nla_nest_end(msg, rules_nest);
        return 0;
}

/*
 * Netlink handler for NL80211_CMD_GET_COALESCE.
 *
 * Builds a reply carrying the currently stored coalesce rules (if any) and
 * sends it back to the requester.
 *
 * Returns -EOPNOTSUPP when the wiphy has no coalesce support, -ENOMEM when
 * the reply cannot be allocated, -ENOBUFS when the reply cannot be filled,
 * otherwise the result of genlmsg_reply().
 */
static int nl80211_get_coalesce(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct sk_buff *reply;
        void *reply_hdr;

        if (!rdev->wiphy.coalesce)
                return -EOPNOTSUPP;

        reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!reply)
                return -ENOMEM;

        reply_hdr = nl80211hdr_put(reply, info->snd_portid, info->snd_seq, 0,
                                   NL80211_CMD_GET_COALESCE);
        if (!reply_hdr)
                goto no_room;

        /* rules are optional: an empty reply means nothing is configured */
        if (rdev->coalesce) {
                if (nl80211_send_coalesce_rules(reply, rdev))
                        goto no_room;
        }

        genlmsg_end(reply, reply_hdr);
        return genlmsg_reply(reply, info);

no_room:
        nlmsg_free(reply);
        return -ENOBUFS;
}

/*
 * Free a coalesce configuration together with everything it owns.
 *
 * @coalesce: configuration to release; NULL is accepted and ignored.
 *
 * For every rule the per-pattern mask buffers are freed, then the rule's
 * pattern array, and finally the configuration itself.  Only the mask pointer
 * of each pattern is passed to kfree(); the parsers in this file place the
 * pattern bytes in the same allocation, directly after the mask.
 */
void cfg80211_free_coalesce(struct cfg80211_coalesce *coalesce)
{
        int i, j;
        struct cfg80211_coalesce_rules *rule;

        if (!coalesce)
                return;

        for (i = 0; i < coalesce->n_rules; i++) {
                /*
                 * rules[] lives inside the coalesce allocation, so the
                 * element address can never be NULL and needs no check.
                 */
                rule = &coalesce->rules[i];
                for (j = 0; j < rule->n_patterns; j++)
                        kfree(rule->patterns[j].mask);
                kfree(rule->patterns);
        }
        kfree(coalesce);
}

/*
 * Parse one nested coalesce rule attribute into @new_rule.
 *
 * @rdev:     device whose wiphy.coalesce limits are used for validation
 * @rule:     nested attribute describing a single rule
 * @new_rule: destination; the optional delay/condition fields are only
 *            written when the corresponding attribute is present
 *
 * Returns 0 on success or a negative errno.  On failure, memory already
 * attached to @new_rule (the pattern array and any mask buffers) is NOT freed
 * here; the caller releases it (nl80211_set_coalesce() does so through
 * cfg80211_free_coalesce()).  The pattern array is zero-allocated and
 * n_patterns is set before the copy loop so that such cleanup is safe for
 * partially filled rules.
 */
static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev,
                                       struct nlattr *rule,
                                       struct cfg80211_coalesce_rules *new_rule)
{
        int err, i;
        const struct wiphy_coalesce_support *coalesce = rdev->wiphy.coalesce;
        struct nlattr *tb[NUM_NL80211_ATTR_COALESCE_RULE], *pat;
        int rem, pat_len, mask_len, pkt_offset, n_patterns = 0;
        struct nlattr *pat_tb[NUM_NL80211_PKTPAT];

        err = nla_parse_nested_deprecated(tb, NL80211_ATTR_COALESCE_RULE_MAX,
                                          rule, nl80211_coalesce_policy, NULL);
        if (err)
                return err;

        if (tb[NL80211_ATTR_COALESCE_RULE_DELAY])
                new_rule->delay =
                        nla_get_u32(tb[NL80211_ATTR_COALESCE_RULE_DELAY]);
        if (new_rule->delay > coalesce->max_delay)
                return -EINVAL;

        if (tb[NL80211_ATTR_COALESCE_RULE_CONDITION])
                new_rule->condition =
                        nla_get_u32(tb[NL80211_ATTR_COALESCE_RULE_CONDITION]);

        if (!tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN])
                return -EINVAL;

        /* first pass: count the patterns to size the array */
        nla_for_each_nested(pat, tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN],
                            rem)
                n_patterns++;
        if (n_patterns > coalesce->n_patterns)
                return -EINVAL;

        new_rule->patterns = kcalloc(n_patterns, sizeof(new_rule->patterns[0]),
                                     GFP_KERNEL);
        if (!new_rule->patterns)
                return -ENOMEM;

        new_rule->n_patterns = n_patterns;
        i = 0;

        /* second pass: validate and copy each pattern */
        nla_for_each_nested(pat, tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN],
                            rem) {
                u8 *mask_pat;

                err = nla_parse_nested_deprecated(pat_tb, MAX_NL80211_PKTPAT,
                                                  pat,
                                                  nl80211_packet_pattern_policy,
                                                  NULL);
                if (err)
                        return err;

                if (!pat_tb[NL80211_PKTPAT_MASK] ||
                    !pat_tb[NL80211_PKTPAT_PATTERN])
                        return -EINVAL;
                pat_len = nla_len(pat_tb[NL80211_PKTPAT_PATTERN]);
                /* the mask carries one bit per pattern byte */
                mask_len = DIV_ROUND_UP(pat_len, 8);
                if (nla_len(pat_tb[NL80211_PKTPAT_MASK]) != mask_len)
                        return -EINVAL;
                if (pat_len > coalesce->pattern_max_len ||
                    pat_len < coalesce->pattern_min_len)
                        return -EINVAL;

                if (!pat_tb[NL80211_PKTPAT_OFFSET])
                        pkt_offset = 0;
                else
                        pkt_offset = nla_get_u32(pat_tb[NL80211_PKTPAT_OFFSET]);
                /*
                 * The attribute is a u32 stored in an int: values above
                 * INT_MAX turn negative and would otherwise slip past the
                 * upper-bound comparison.
                 */
                if (pkt_offset < 0 || pkt_offset > coalesce->max_pkt_offset)
                        return -EINVAL;
                new_rule->patterns[i].pkt_offset = pkt_offset;

                /* mask and pattern share one allocation: mask first */
                mask_pat = kmalloc(mask_len + pat_len, GFP_KERNEL);
                if (!mask_pat)
                        return -ENOMEM;

                new_rule->patterns[i].mask = mask_pat;
                memcpy(mask_pat, nla_data(pat_tb[NL80211_PKTPAT_MASK]),
                       mask_len);

                mask_pat += mask_len;
                new_rule->patterns[i].pattern = mask_pat;
                new_rule->patterns[i].pattern_len = pat_len;
                memcpy(mask_pat, nla_data(pat_tb[NL80211_PKTPAT_PATTERN]),
                       pat_len);
                i++;
        }

        return 0;
}

static int nl80211_set_coalesce(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        const struct wiphy_coalesce_support *coalesce = rdev->wiphy.coalesce;
        struct cfg80211_coalesce *new_coalesce;
        int err, rem_rule, n_rules = 0, i;
        struct nlattr *rule;

        if (!rdev->wiphy.coalesce || !rdev->ops->set_coalesce)
                return -EOPNOTSUPP;

        if (!info->attrs[NL80211_ATTR_COALESCE_RULE]) {
                cfg80211_free_coalesce(rdev->coalesce);
                rdev->coalesce = NULL;
                rdev_set_coalesce(rdev, NULL);
                return 0;
        }

        nla_for_each_nested(rule, info->attrs[NL80211_ATTR_COALESCE_RULE],
                            rem_rule)
                n_rules++;
        if (n_rules > coalesce->n_rules)
                return -EINVAL;

        new_coalesce = kzalloc(struct_size(new_coalesce, rules, n_rules),
                               GFP_KERNEL);
        if (!new_coalesce)
                return -ENOMEM;

        new_coalesce->n_rules = n_rules;
        i = 0;

        nla_for_each_nested(rule, info->attrs[NL80211_ATTR_COALESCE_RULE],
                            rem_rule) {
                err = nl80211_parse_coalesce_rule(rdev, rule,
                                                  &new_coalesce->rules[i]);
                if (err)
                        goto error;

                i++;
        }

        err = rdev_set_coalesce(rdev, new_coalesce);
        if (err)
                goto error;

        cfg80211_free_coalesce(rdev->coalesce);
        rdev->coalesce = new_coalesce;

        return 0;
error:
        cfg80211_free_coalesce(new_coalesce);

        return err;
}

static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct nlattr *tb[NUM_NL80211_REKEY_DATA];
        struct cfg80211_gtk_rekey_data rekey_data = {};
        int err;

        if (!info->attrs[NL80211_ATTR_REKEY_DATA])
                return -EINVAL;

        err = nla_parse_nested_deprecated(tb, MAX_NL80211_REKEY_DATA,
                                          info->attrs[NL80211_ATTR_REKEY_DATA],
                                          nl80211_rekey_policy, info->extack);
        if (err)
                return err;

        if (!tb[NL80211_REKEY_DATA_REPLAY_CTR] || !tb[NL80211_REKEY_DATA_KEK] ||
            !tb[NL80211_REKEY_DATA_KCK])
                return -EINVAL;
        if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN &&
            !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK &&
              nla_len(tb[NL80211_REKEY_DATA_KEK]) == NL80211_KEK_EXT_LEN))
                return -ERANGE;
        if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN &&
            !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK &&
              nla_len(tb[NL80211_REKEY_DATA_KCK]) == NL80211_KCK_EXT_LEN) &&
             !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KCK_32 &&
               nla_len(tb[NL80211_REKEY_DATA_KCK]) == NL80211_KCK_EXT_LEN_32))
                return -ERANGE;

        rekey_data.kek = nla_data(tb[NL80211_REKEY_DATA_KEK]);
        rekey_data.kck = nla_data(tb[NL80211_REKEY_DATA_KCK]);
        rekey_data.replay_ctr = nla_data(tb[NL80211_REKEY_DATA_REPLAY_CTR]);
        rekey_data.kek_len = nla_len(tb[NL80211_REKEY_DATA_KEK]);
        rekey_data.kck_len = nla_len(tb[NL80211_REKEY_DATA_KCK]);
        if (tb[NL80211_REKEY_DATA_AKM])
                rekey_data.akm = nla_get_u32(tb[NL80211_REKEY_DATA_AKM]);

        if (!wdev->connected)
                return -ENOTCONN;

        if (!rdev->ops->set_rekey_data)
                return -EOPNOTSUPP;

        return rdev_set_rekey_data(rdev, dev, &rekey_data);
}

/*
 * Record the sender's netlink port ID in wdev->ap_unexpected_nlportid.
 *
 * Only allowed on AP and P2P-GO interfaces (-EINVAL otherwise) and only while
 * no port ID is recorded yet (-EBUSY otherwise).  Returns 0 on success.
 */
static int nl80211_register_unexpected_frame(struct sk_buff *skb,
                                             struct genl_info *info)
{
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;

        switch (wdev->iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_P2P_GO:
                break;
        default:
                return -EINVAL;
        }

        /* a non-zero port ID means a listener is already recorded */
        if (wdev->ap_unexpected_nlportid)
                return -EBUSY;

        wdev->ap_unexpected_nlportid = info->snd_portid;
        return 0;
}

/*
 * Netlink handler for NL80211_CMD_PROBE_CLIENT.
 *
 * Asks the driver to probe the address given in NL80211_ATTR_MAC and replies
 * with the cookie the driver returned.  Only valid on AP and P2P-GO
 * interfaces.
 *
 * Returns -EOPNOTSUPP for other interface types or a driver without the op,
 * -EINVAL without a MAC attribute, -ENOMEM/-ENOBUFS on reply construction
 * failures, the driver's error if probing fails, otherwise the result of
 * genlmsg_reply().
 */
static int nl80211_probe_client(struct sk_buff *skb,
                                struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct sk_buff *reply;
        void *reply_hdr;
        u64 cookie;
        int ret;

        if (wdev->iftype != NL80211_IFTYPE_AP &&
            wdev->iftype != NL80211_IFTYPE_P2P_GO)
                return -EOPNOTSUPP;

        if (!info->attrs[NL80211_ATTR_MAC])
                return -EINVAL;

        if (!rdev->ops->probe_client)
                return -EOPNOTSUPP;

        /* build the reply skeleton before touching the driver */
        reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!reply)
                return -ENOMEM;

        reply_hdr = nl80211hdr_put(reply, info->snd_portid, info->snd_seq, 0,
                                   NL80211_CMD_PROBE_CLIENT);
        if (!reply_hdr) {
                ret = -ENOBUFS;
                goto drop_reply;
        }

        ret = rdev_probe_client(rdev, dev,
                                nla_data(info->attrs[NL80211_ATTR_MAC]),
                                &cookie);
        if (ret)
                goto drop_reply;

        if (nla_put_u64_64bit(reply, NL80211_ATTR_COOKIE, cookie,
                              NL80211_ATTR_PAD)) {
                ret = -ENOBUFS;
                goto drop_reply;
        }

        genlmsg_end(reply, reply_hdr);
        return genlmsg_reply(reply, info);

drop_reply:
        nlmsg_free(reply);
        return ret;
}

/*
 * Add the sender's netlink port ID to the device's beacon registration list.
 *
 * The new list entry is allocated before taking the spinlock so that no
 * GFP_KERNEL allocation happens while it is held.
 *
 * Returns -EOPNOTSUPP when the wiphy lacks WIPHY_FLAG_REPORTS_OBSS, -ENOMEM
 * on allocation failure, -EALREADY when the port ID is already on the list,
 * 0 on success.
 */
static int nl80211_register_beacons(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct cfg80211_beacon_registration *cur, *entry;
        int ret = 0;

        if (!(rdev->wiphy.flags & WIPHY_FLAG_REPORTS_OBSS))
                return -EOPNOTSUPP;

        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        spin_lock_bh(&rdev->beacon_registrations_lock);

        /* refuse a second registration from the same port */
        list_for_each_entry(cur, &rdev->beacon_registrations, list) {
                if (cur->nlportid == info->snd_portid) {
                        ret = -EALREADY;
                        break;
                }
        }

        if (!ret) {
                entry->nlportid = info->snd_portid;
                list_add(&entry->list, &rdev->beacon_registrations);
                /* the list owns the entry now; nothing left to free */
                entry = NULL;
        }

        spin_unlock_bh(&rdev->beacon_registrations_lock);

        kfree(entry);
        return ret;
}

/*
 * Netlink handler that starts a P2P device interface.
 *
 * Starting an interface that is already running is a successful no-op.
 * On a successful driver start the wdev is marked running and the device's
 * open count is incremented.
 *
 * Returns -EOPNOTSUPP when the driver lacks the op or the wdev is not a P2P
 * device, -ERFKILL when rfkill blocks the radio, the driver's error if the
 * start fails, 0 otherwise.
 */
static int nl80211_start_p2p_device(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wireless_dev *wdev = info->user_ptr[1];
        int ret;

        if (!rdev->ops->start_p2p_device ||
            wdev->iftype != NL80211_IFTYPE_P2P_DEVICE)
                return -EOPNOTSUPP;

        if (wdev_running(wdev))
                return 0;

        if (rfkill_blocked(rdev->wiphy.rfkill))
                return -ERFKILL;

        ret = rdev_start_p2p_device(rdev, wdev);
        if (!ret) {
                wdev->is_running = true;
                rdev->opencount++;
        }

        return ret;
}

/*
 * Netlink handler that stops a P2P device interface.
 *
 * Returns -EOPNOTSUPP when the wdev is not a P2P device or the driver lacks
 * the stop op; otherwise delegates to cfg80211_stop_p2p_device() and
 * returns 0.
 */
static int nl80211_stop_p2p_device(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wireless_dev *wdev = info->user_ptr[1];

        if (wdev->iftype != NL80211_IFTYPE_P2P_DEVICE ||
            !rdev->ops->stop_p2p_device)
                return -EOPNOTSUPP;

        cfg80211_stop_p2p_device(rdev, wdev);
        return 0;
}

static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wireless_dev *wdev = info->user_ptr[1];
        struct cfg80211_nan_conf conf = {};
        int err;

        if (wdev->iftype != NL80211_IFTYPE_NAN)
                return -EOPNOTSUPP;

        if (wdev_running(wdev))
                return -EEXIST;

        if (rfkill_blocked(rdev->wiphy.rfkill))
                return -ERFKILL;

        if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF])
                return -EINVAL;

        conf.master_pref =
                nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);

        if (info->attrs[NL80211_ATTR_BANDS]) {
                u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);

                if (bands & ~(u32)wdev->wiphy->nan_supported_bands)
                        return -EOPNOTSUPP;

                if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
                        return -EINVAL;

                conf.bands = bands;
        }

        err = rdev_start_nan(rdev, wdev, &conf);
        if (err)
                return err;

        wdev->is_running = true;
        rdev->opencount++;

        return 0;
}

/*
 * Netlink handler that stops a NAN interface.
 *
 * Returns -EOPNOTSUPP when the wdev is not a NAN interface; otherwise
 * delegates to cfg80211_stop_nan() and returns 0.
 */
static int nl80211_stop_nan(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wireless_dev *wdev = info->user_ptr[1];

        if (wdev->iftype == NL80211_IFTYPE_NAN) {
                cfg80211_stop_nan(rdev, wdev);
                return 0;
        }

        return -EOPNOTSUPP;
}

/*
 * Count the entries nested in a NAN filter attribute.
 *
 * Returns the number of nested entries, or -EINVAL when the summed payload
 * length of all entries is U8_MAX or more.
 */
static int validate_nan_filter(struct nlattr *filter_attr)
{
        struct nlattr *entry;
        int total_len = 0, count = 0, rem;

        nla_for_each_nested(entry, filter_attr, rem) {
                count++;
                total_len += nla_len(entry);
        }

        return total_len >= U8_MAX ? -EINVAL : count;
}

/*
 * Copy a nested NAN filter attribute into @func.
 *
 * @attr_filter: nested attribute holding the filter entries
 * @func:        NAN function that receives the filter array
 * @tx:          true to fill tx_filters/num_tx_filters, false for the rx pair
 *
 * Each entry's payload is duplicated into its own buffer.  Returns 0 on
 * success, the negative value from validate_nan_filter() if validation
 * fails, or -ENOMEM on allocation failure, in which case everything
 * allocated here is freed and @func is left untouched.
 */
static int handle_nan_filter(struct nlattr *attr_filter,
                             struct cfg80211_nan_func *func,
                             bool tx)
{
        struct cfg80211_nan_func_filter *filters;
        struct nlattr *entry;
        int count, rem, idx = 0;

        count = validate_nan_filter(attr_filter);
        if (count < 0)
                return count;

        /* one array type serves both directions, so the sizes must agree */
        BUILD_BUG_ON(sizeof(*func->rx_filters) != sizeof(*func->tx_filters));

        filters = kcalloc(count, sizeof(*func->rx_filters), GFP_KERNEL);
        if (!filters)
                return -ENOMEM;

        nla_for_each_nested(entry, attr_filter, rem) {
                filters[idx].filter = nla_memdup(entry, GFP_KERNEL);
                if (!filters[idx].filter)
                        goto free_filters;

                filters[idx].len = nla_len(entry);
                idx++;
        }

        if (tx) {
                func->num_tx_filters = count;
                func->tx_filters = filters;
        } else {
                func->num_rx_filters = count;
                func->rx_filters = filters;
        }

        return 0;

free_filters:
        /*
         * Entries [0, idx) hold duplicated buffers; the rest of the
         * zero-allocated array is still NULL and needs no freeing.
         */
        while (idx-- > 0)
                kfree(filters[idx].filter);
        kfree(filters);
        return -ENOMEM;
}

/*
 * Netlink handler: build a cfg80211_nan_func from the nested
 * NL80211_ATTR_NAN_FUNC attribute, hand it to the driver through
 * rdev_add_nan_func() and reply with the assigned cookie and instance ID.
 *
 * Returns -EOPNOTSUPP for non-NAN interfaces, -ENOTCONN when the wdev is not
 * running, -EINVAL for missing or inconsistent attributes, -ENOMEM on
 * allocation failure, -ENOBUFS when the reply cannot be filled, otherwise
 * the result of genlmsg_reply().
 *
 * Every failure that reaches the "out" label with err < 0 releases @func via
 * cfg80211_free_nan_func() and frees the (possibly NULL) reply message.
 */
static int nl80211_nan_add_func(struct sk_buff *skb,
                                struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wireless_dev *wdev = info->user_ptr[1];
        struct nlattr *tb[NUM_NL80211_NAN_FUNC_ATTR], *func_attr;
        struct cfg80211_nan_func *func;
        struct sk_buff *msg = NULL;
        void *hdr = NULL;
        int err = 0;

        if (wdev->iftype != NL80211_IFTYPE_NAN)
                return -EOPNOTSUPP;

        if (!wdev_running(wdev))
                return -ENOTCONN;

        if (!info->attrs[NL80211_ATTR_NAN_FUNC])
                return -EINVAL;

        err = nla_parse_nested_deprecated(tb, NL80211_NAN_FUNC_ATTR_MAX,
                                          info->attrs[NL80211_ATTR_NAN_FUNC],
                                          nl80211_nan_func_policy,
                                          info->extack);
        if (err)
                return err;

        /* from here on, failures must go through "out" so func is released */
        func = kzalloc(sizeof(*func), GFP_KERNEL);
        if (!func)
                return -ENOMEM;

        func->cookie = cfg80211_assign_cookie(rdev);

        /* function type and service ID are mandatory */
        if (!tb[NL80211_NAN_FUNC_TYPE]) {
                err = -EINVAL;
                goto out;
        }


        func->type = nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]);

        if (!tb[NL80211_NAN_FUNC_SERVICE_ID]) {
                err = -EINVAL;
                goto out;
        }

        /*
         * NOTE(review): copies sizeof(func->service_id) bytes without a local
         * length check - presumably nl80211_nan_func_policy enforces the
         * attribute length; confirm against the policy table.
         */
        memcpy(func->service_id, nla_data(tb[NL80211_NAN_FUNC_SERVICE_ID]),
               sizeof(func->service_id));

        func->close_range =
                nla_get_flag(tb[NL80211_NAN_FUNC_CLOSE_RANGE]);

        /* optional service specific info is duplicated into func */
        if (tb[NL80211_NAN_FUNC_SERVICE_INFO]) {
                func->serv_spec_info_len =
                        nla_len(tb[NL80211_NAN_FUNC_SERVICE_INFO]);
                func->serv_spec_info =
                        kmemdup(nla_data(tb[NL80211_NAN_FUNC_SERVICE_INFO]),
                                func->serv_spec_info_len,
                                GFP_KERNEL);
                if (!func->serv_spec_info) {
                        err = -ENOMEM;
                        goto out;
                }
        }

        if (tb[NL80211_NAN_FUNC_TTL])
                func->ttl = nla_get_u32(tb[NL80211_NAN_FUNC_TTL]);

        /* per-type attributes; unknown types are rejected */
        switch (func->type) {
        case NL80211_NAN_FUNC_PUBLISH:
                if (!tb[NL80211_NAN_FUNC_PUBLISH_TYPE]) {
                        err = -EINVAL;
                        goto out;
                }

                func->publish_type =
                        nla_get_u8(tb[NL80211_NAN_FUNC_PUBLISH_TYPE]);
                func->publish_bcast =
                        nla_get_flag(tb[NL80211_NAN_FUNC_PUBLISH_BCAST]);

                /* the broadcast flag is only accepted with solicited publish */
                if ((!(func->publish_type & NL80211_NAN_SOLICITED_PUBLISH)) &&
                        func->publish_bcast) {
                        err = -EINVAL;
                        goto out;
                }
                break;
        case NL80211_NAN_FUNC_SUBSCRIBE:
                func->subscribe_active =
                        nla_get_flag(tb[NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE]);
                break;
        case NL80211_NAN_FUNC_FOLLOW_UP:
                /* follow-up needs its ID, the requestor ID and a destination */
                if (!tb[NL80211_NAN_FUNC_FOLLOW_UP_ID] ||
                    !tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID] ||
                    !tb[NL80211_NAN_FUNC_FOLLOW_UP_DEST]) {
                        err = -EINVAL;
                        goto out;
                }

                func->followup_id =
                        nla_get_u8(tb[NL80211_NAN_FUNC_FOLLOW_UP_ID]);
                func->followup_reqid =
                        nla_get_u8(tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID]);
                memcpy(func->followup_dest.addr,
                       nla_data(tb[NL80211_NAN_FUNC_FOLLOW_UP_DEST]),
                       sizeof(func->followup_dest.addr));
                /* a non-zero TTL is rejected for follow-up functions */
                if (func->ttl) {
                        err = -EINVAL;
                        goto out;
                }
                break;
        default:
                err = -EINVAL;
                goto out;
        }

        /*
         * Optional SRF: either a bloom filter (BF together with BF_IDX, and
         * no MAC list) or a list of MAC addresses.
         */
        if (tb[NL80211_NAN_FUNC_SRF]) {
                struct nlattr *srf_tb[NUM_NL80211_NAN_SRF_ATTR];

                err = nla_parse_nested_deprecated(srf_tb,
                                                  NL80211_NAN_SRF_ATTR_MAX,
                                                  tb[NL80211_NAN_FUNC_SRF],
                                                  nl80211_nan_srf_policy,
                                                  info->extack);
                if (err)
                        goto out;

                func->srf_include =
                        nla_get_flag(srf_tb[NL80211_NAN_SRF_INCLUDE]);

                if (srf_tb[NL80211_NAN_SRF_BF]) {
                        if (srf_tb[NL80211_NAN_SRF_MAC_ADDRS] ||
                            !srf_tb[NL80211_NAN_SRF_BF_IDX]) {
                                err = -EINVAL;
                                goto out;
                        }

                        func->srf_bf_len =
                                nla_len(srf_tb[NL80211_NAN_SRF_BF]);
                        func->srf_bf =
                                kmemdup(nla_data(srf_tb[NL80211_NAN_SRF_BF]),
                                        func->srf_bf_len, GFP_KERNEL);
                        if (!func->srf_bf) {
                                err = -ENOMEM;
                                goto out;
                        }

                        func->srf_bf_idx =
                                nla_get_u8(srf_tb[NL80211_NAN_SRF_BF_IDX]);
                } else {
                        struct nlattr *attr, *mac_attr =
                                srf_tb[NL80211_NAN_SRF_MAC_ADDRS];
                        int n_entries, rem, i = 0;

                        if (!mac_attr) {
                                err = -EINVAL;
                                goto out;
                        }

                        /* an empty or invalid MAC list is rejected */
                        n_entries = validate_acl_mac_addrs(mac_attr);
                        if (n_entries <= 0) {
                                err = -EINVAL;
                                goto out;
                        }

                        func->srf_num_macs = n_entries;
                        func->srf_macs =
                                kcalloc(n_entries, sizeof(*func->srf_macs),
                                        GFP_KERNEL);
                        if (!func->srf_macs) {
                                err = -ENOMEM;
                                goto out;
                        }

                        /*
                         * NOTE(review): assumes validate_acl_mac_addrs() has
                         * checked each entry's length - confirm.
                         */
                        nla_for_each_nested(attr, mac_attr, rem)
                                memcpy(func->srf_macs[i++].addr, nla_data(attr),
                                       sizeof(*func->srf_macs));
                }
        }

        if (tb[NL80211_NAN_FUNC_TX_MATCH_FILTER]) {
                err = handle_nan_filter(tb[NL80211_NAN_FUNC_TX_MATCH_FILTER],
                                        func, true);
                if (err)
                        goto out;
        }

        if (tb[NL80211_NAN_FUNC_RX_MATCH_FILTER]) {
                err = handle_nan_filter(tb[NL80211_NAN_FUNC_RX_MATCH_FILTER],
                                        func, false);
                if (err)
                        goto out;
        }

        /* allocate the reply before calling into the driver */
        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg) {
                err = -ENOMEM;
                goto out;
        }

        hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
                             NL80211_CMD_ADD_NAN_FUNCTION);
        /* This can't really happen - we just allocated 4KB */
        if (WARN_ON(!hdr)) {
                err = -ENOMEM;
                goto out;
        }

        err = rdev_add_nan_func(rdev, wdev, func);
out:
        /* common failure exit: msg may still be NULL here */
        if (err < 0) {
                cfg80211_free_nan_func(func);
                nlmsg_free(msg);
                return err;
        }

        /* propagate the instance id and cookie to userspace */
        if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, func->cookie,
                              NL80211_ATTR_PAD))
                goto nla_put_failure;

        func_attr = nla_nest_start_noflag(msg, NL80211_ATTR_NAN_FUNC);
        if (!func_attr)
                goto nla_put_failure;

        if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID,
                       func->instance_id))
                goto nla_put_failure;

        nla_nest_end(msg, func_attr);

        genlmsg_end(msg, hdr);
        return genlmsg_reply(msg, info);

nla_put_failure:
        /*
         * Only the reply is dropped; func is not freed on this path, which
         * is reached only after rdev_add_nan_func() returned without error.
         */
        nlmsg_free(msg);
        return -ENOBUFS;
}

/*
 * Netlink handler: ask the driver to remove the NAN function identified by
 * the NL80211_ATTR_COOKIE attribute. Requires a running NAN interface.
 */
static int nl80211_nan_del_func(struct sk_buff *skb,
                               struct genl_info *info)
{
        struct wireless_dev *wdev = info->user_ptr[1];
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct nlattr *cookie_attr;

        if (wdev->iftype != NL80211_IFTYPE_NAN)
                return -EOPNOTSUPP;

        if (!wdev_running(wdev))
                return -ENOTCONN;

        cookie_attr = info->attrs[NL80211_ATTR_COOKIE];
        if (!cookie_attr)
                return -EINVAL;

        rdev_del_nan_func(rdev, wdev, nla_get_u64(cookie_attr));

        return 0;
}

/*
 * Netlink handler: update the NAN master preference and/or the operating
 * bands of a running NAN interface.
 *
 * Returns -EOPNOTSUPP for non-NAN interfaces or unsupported bands,
 * -ENOTCONN when the wdev is not running, -EINVAL for a bad master
 * preference, a band set lacking 2.4 GHz, or a request that changes nothing;
 * otherwise the result of rdev_nan_change_conf().
 */
static int nl80211_nan_change_config(struct sk_buff *skb,
                                     struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wireless_dev *wdev = info->user_ptr[1];
        struct nlattr *pref_attr = info->attrs[NL80211_ATTR_NAN_MASTER_PREF];
        struct nlattr *bands_attr = info->attrs[NL80211_ATTR_BANDS];
        struct cfg80211_nan_conf conf = {};
        u32 changed = 0;

        if (wdev->iftype != NL80211_IFTYPE_NAN)
                return -EOPNOTSUPP;

        if (!wdev_running(wdev))
                return -ENOTCONN;

        if (pref_attr) {
                conf.master_pref = nla_get_u8(pref_attr);
                /* 0, 1 and 255 are not accepted as master preference */
                if (conf.master_pref < 2 || conf.master_pref == 255)
                        return -EINVAL;

                changed |= CFG80211_NAN_CONF_CHANGED_PREF;
        }

        if (bands_attr) {
                u32 bands = nla_get_u32(bands_attr);

                if (bands & ~(u32)wdev->wiphy->nan_supported_bands)
                        return -EOPNOTSUPP;

                /* a non-empty band set must include 2.4 GHz */
                if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
                        return -EINVAL;

                conf.bands = bands;
                changed |= CFG80211_NAN_CONF_CHANGED_BANDS;
        }

        if (changed == 0)
                return -EINVAL;

        return rdev_nan_change_conf(rdev, wdev, &conf, changed);
}

/*
 * Build an NL80211_CMD_NAN_MATCH event from @match and deliver it: unicast
 * to wdev->owner_nlportid when that is set, otherwise multicast on the NAN
 * group. If the message cannot be allocated or filled, the event is dropped.
 */
void cfg80211_nan_match(struct wireless_dev *wdev,
                        struct cfg80211_nan_match_params *match, gfp_t gfp)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct nlattr *match_nest, *local_nest, *peer_nest;
        struct sk_buff *msg;
        void *hdr;

        if (WARN_ON(!match->inst_id || !match->peer_inst_id || !match->addr))
                return;

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        if (!msg)
                return;

        hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NAN_MATCH);
        if (!hdr)
                goto nla_put_failure;

        /* identify the device the match was reported on */
        if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto nla_put_failure;
        if (wdev->netdev &&
            nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex))
                goto nla_put_failure;
        if (nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
                              NL80211_ATTR_PAD))
                goto nla_put_failure;

        if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, match->cookie,
                              NL80211_ATTR_PAD))
                goto nla_put_failure;
        if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, match->addr))
                goto nla_put_failure;

        match_nest = nla_nest_start_noflag(msg, NL80211_ATTR_NAN_MATCH);
        if (!match_nest)
                goto nla_put_failure;

        /* local function: instance ID only */
        local_nest = nla_nest_start_noflag(msg, NL80211_NAN_MATCH_FUNC_LOCAL);
        if (!local_nest)
                goto nla_put_failure;
        if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, match->inst_id))
                goto nla_put_failure;
        nla_nest_end(msg, local_nest);

        /* peer function: type, instance ID and optional service info */
        peer_nest = nla_nest_start_noflag(msg, NL80211_NAN_MATCH_FUNC_PEER);
        if (!peer_nest)
                goto nla_put_failure;
        if (nla_put_u8(msg, NL80211_NAN_FUNC_TYPE, match->type))
                goto nla_put_failure;
        if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, match->peer_inst_id))
                goto nla_put_failure;
        if (match->info && match->info_len &&
            nla_put(msg, NL80211_NAN_FUNC_SERVICE_INFO, match->info_len,
                    match->info))
                goto nla_put_failure;
        nla_nest_end(msg, peer_nest);

        nla_nest_end(msg, match_nest);
        genlmsg_end(msg, hdr);

        if (wdev->owner_nlportid)
                genlmsg_unicast(wiphy_net(&rdev->wiphy), msg,
                                wdev->owner_nlportid);
        else
                genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
                                        msg, 0, NL80211_MCGRP_NAN, gfp);

        return;

nla_put_failure:
        nlmsg_free(msg);
}
EXPORT_SYMBOL(cfg80211_nan_match);

/*
 * Send an NL80211_CMD_DEL_NAN_FUNCTION event carrying the function's cookie,
 * instance ID and termination reason: unicast to wdev->owner_nlportid when
 * that is set, otherwise multicast on the NAN group. If the message cannot
 * be allocated or filled, the event is dropped.
 */
void cfg80211_nan_func_terminated(struct wireless_dev *wdev,
                                  u8 inst_id,
                                  enum nl80211_nan_func_term_reason reason,
                                  u64 cookie, gfp_t gfp)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct nlattr *func_nest;
        struct sk_buff *msg;
        void *hdr;

        if (WARN_ON(!inst_id))
                return;

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        if (!msg)
                return;

        hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_DEL_NAN_FUNCTION);
        if (!hdr)
                goto nla_put_failure;

        /* identify the device the function belonged to */
        if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto nla_put_failure;
        if (wdev->netdev &&
            nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex))
                goto nla_put_failure;
        if (nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
                              NL80211_ATTR_PAD))
                goto nla_put_failure;

        if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie,
                              NL80211_ATTR_PAD))
                goto nla_put_failure;

        func_nest = nla_nest_start_noflag(msg, NL80211_ATTR_NAN_FUNC);
        if (!func_nest)
                goto nla_put_failure;
        if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, inst_id))
                goto nla_put_failure;
        if (nla_put_u8(msg, NL80211_NAN_FUNC_TERM_REASON, reason))
                goto nla_put_failure;
        nla_nest_end(msg, func_nest);

        genlmsg_end(msg, hdr);

        if (wdev->owner_nlportid)
                genlmsg_unicast(wiphy_net(&rdev->wiphy), msg,
                                wdev->owner_nlportid);
        else
                genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
                                        msg, 0, NL80211_MCGRP_NAN, gfp);

        return;

nla_put_failure:
        nlmsg_free(msg);
}
EXPORT_SYMBOL(cfg80211_nan_func_terminated);

/*
 * Netlink handler: reply with NL80211_ATTR_PROTOCOL_FEATURES set to
 * NL80211_PROTOCOL_FEATURE_SPLIT_WIPHY_DUMP.
 *
 * Returns -ENOMEM if the reply cannot be allocated, -ENOBUFS if it cannot
 * be filled, otherwise the result of genlmsg_reply().
 */
static int nl80211_get_protocol_features(struct sk_buff *skb,
                                         struct genl_info *info)
{
        struct sk_buff *reply;
        void *hdr;

        reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!reply)
                return -ENOMEM;

        hdr = nl80211hdr_put(reply, info->snd_portid, info->snd_seq, 0,
                             NL80211_CMD_GET_PROTOCOL_FEATURES);
        if (hdr &&
            !nla_put_u32(reply, NL80211_ATTR_PROTOCOL_FEATURES,
                         NL80211_PROTOCOL_FEATURE_SPLIT_WIPHY_DUMP)) {
                genlmsg_end(reply, hdr);
                return genlmsg_reply(reply, info);
        }

        /* header or attribute did not fit */
        kfree_skb(reply);
        return -ENOBUFS;
}

/*
 * Netlink handler: pass the mobility domain ID and the IE blob from the
 * request to the driver's update_ft_ies operation.
 *
 * Returns -EOPNOTSUPP when the driver lacks the operation and -EINVAL when
 * either NL80211_ATTR_MDID or NL80211_ATTR_IE is missing.
 */
static int nl80211_update_ft_ies(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct nlattr *mdid_attr = info->attrs[NL80211_ATTR_MDID];
        struct nlattr *ie_attr = info->attrs[NL80211_ATTR_IE];
        struct cfg80211_update_ft_ies_params ft_params;

        if (!rdev->ops->update_ft_ies)
                return -EOPNOTSUPP;

        if (!mdid_attr || !ie_attr)
                return -EINVAL;

        memset(&ft_params, 0, sizeof(ft_params));
        ft_params.md = nla_get_u16(mdid_attr);
        /* the IE buffer is referenced in place, not copied */
        ft_params.ie = nla_data(ie_attr);
        ft_params.ie_len = nla_len(ie_attr);

        return rdev_update_ft_ies(rdev, dev, &ft_params);
}

/*
 * Netlink handler: start a critical protocol session on the device.
 *
 * Only one session may be active per rdev: a non-zero
 * rdev->crit_proto_nlportid yields -EBUSY. On success the sender's port ID
 * is recorded there.
 *
 * Returns -EOPNOTSUPP when the driver has no crit_proto_start operation,
 * -EINVAL for a missing crit_proto_stop operation, an out-of-range protocol
 * ID or a missing duration; otherwise the result of rdev_crit_proto_start().
 */
static int nl80211_crit_protocol_start(struct sk_buff *skb,
                                       struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wireless_dev *wdev = info->user_ptr[1];
        struct nlattr *proto_attr, *duration_attr;
        enum nl80211_crit_proto_id proto = NL80211_CRIT_PROTO_UNSPEC;
        u16 duration;
        int ret;

        if (!rdev->ops->crit_proto_start)
                return -EOPNOTSUPP;

        if (WARN_ON(!rdev->ops->crit_proto_stop))
                return -EINVAL;

        if (rdev->crit_proto_nlportid)
                return -EBUSY;

        /* the protocol ID is optional */
        proto_attr = info->attrs[NL80211_ATTR_CRIT_PROT_ID];
        if (proto_attr)
                proto = nla_get_u16(proto_attr);

        if (proto >= NUM_NL80211_CRIT_PROTO)
                return -EINVAL;

        /* the maximum duration is mandatory */
        duration_attr = info->attrs[NL80211_ATTR_MAX_CRIT_PROT_DURATION];
        if (!duration_attr)
                return -EINVAL;

        duration = nla_get_u16(duration_attr);

        ret = rdev_crit_proto_start(rdev, wdev, proto, duration);
        if (ret)
                return ret;

        rdev->crit_proto_nlportid = info->snd_portid;
        return 0;
}

/*
 * Netlink handler: stop the active critical protocol session, if any.
 * Succeeds without calling the driver when no session is recorded.
 */
static int nl80211_crit_protocol_stop(struct sk_buff *skb,
                                      struct genl_info *info)
{
        struct wireless_dev *wdev = info->user_ptr[1];
        struct cfg80211_registered_device *rdev = info->user_ptr[0];

        if (!rdev->ops->crit_proto_stop)
                return -EOPNOTSUPP;

        if (!rdev->crit_proto_nlportid)
                return 0;

        /* clear the owner first, then tell the driver */
        rdev->crit_proto_nlportid = 0;
        rdev_crit_proto_stop(rdev, wdev);

        return 0;
}

static int nl80211_vendor_check_policy(const struct wiphy_vendor_command *vcmd,
                                       struct nlattr *attr,
                                       struct netlink_ext_ack *extack)
{
        if (vcmd->policy == VENDOR_CMD_RAW_DATA) {
                if (attr->nla_type & NLA_F_NESTED) {
                        NL_SET_ERR_MSG_ATTR(extack, attr,
                                            "unexpected nested data");
                        return -EINVAL;
                }

                return 0;
        }

        if (!(attr->nla_type & NLA_F_NESTED)) {
                NL_SET_ERR_MSG_ATTR(extack, attr, "expected nested data");
                return -EINVAL;
        }

        return nla_validate_nested(attr, vcmd->maxattr, vcmd->policy, extack);
}

/*
 * Netlink handler: dispatch a vendor command to the first entry of the
 * wiphy's vendor_commands table that matches the requested vendor ID and
 * subcommand.
 *
 * The wdev is optional unless the command's flags require a wdev or netdev;
 * commands that require neither are always invoked with a NULL wdev.
 *
 * Returns -EOPNOTSUPP when the wiphy has no vendor commands, no entry
 * matches, or the entry has no doit handler; -EINVAL for missing attributes,
 * a wdev on another wiphy, or an unmet wdev/netdev requirement; -ENETDOWN
 * when a running wdev is required; a policy error; otherwise the doit
 * result.
 */
static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wireless_dev *wdev =
                __cfg80211_wdev_from_attrs(rdev, genl_info_net(info),
                                           info->attrs);
        const struct wiphy_vendor_command *vcmd = NULL;
        struct nlattr *data_attr;
        void *data = NULL;
        int len = 0;
        int i, err;
        u32 vid, subcmd;

        if (!rdev->wiphy.vendor_commands)
                return -EOPNOTSUPP;

        if (!IS_ERR(wdev)) {
                if (wdev->wiphy != &rdev->wiphy)
                        return -EINVAL;
        } else {
                /* -EINVAL from the lookup is tolerated: continue without wdev */
                err = PTR_ERR(wdev);
                if (err != -EINVAL)
                        return err;
                wdev = NULL;
        }

        if (!info->attrs[NL80211_ATTR_VENDOR_ID] ||
            !info->attrs[NL80211_ATTR_VENDOR_SUBCMD])
                return -EINVAL;

        vid = nla_get_u32(info->attrs[NL80211_ATTR_VENDOR_ID]);
        subcmd = nla_get_u32(info->attrs[NL80211_ATTR_VENDOR_SUBCMD]);

        /* locate the first table entry for this vendor ID / subcommand */
        for (i = 0; i < rdev->wiphy.n_vendor_commands; i++) {
                const struct wiphy_vendor_command *cand =
                        &rdev->wiphy.vendor_commands[i];

                if (cand->info.vendor_id == vid &&
                    cand->info.subcmd == subcmd) {
                        vcmd = cand;
                        break;
                }
        }

        if (!vcmd)
                return -EOPNOTSUPP;

        if (!(vcmd->flags & (WIPHY_VENDOR_CMD_NEED_WDEV |
                             WIPHY_VENDOR_CMD_NEED_NETDEV))) {
                wdev = NULL;
        } else {
                if (!wdev)
                        return -EINVAL;
                if ((vcmd->flags & WIPHY_VENDOR_CMD_NEED_NETDEV) &&
                    !wdev->netdev)
                        return -EINVAL;
                if ((vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) &&
                    !wdev_running(wdev))
                        return -ENETDOWN;
        }

        if (!vcmd->doit)
                return -EOPNOTSUPP;

        data_attr = info->attrs[NL80211_ATTR_VENDOR_DATA];
        if (data_attr) {
                data = nla_data(data_attr);
                len = nla_len(data_attr);

                err = nl80211_vendor_check_policy(vcmd, data_attr,
                                                  info->extack);
                if (err)
                        return err;
        }

        /* cur_cmd_info is set only for the duration of the doit call */
        rdev->cur_cmd_info = info;
        err = vcmd->doit(&rdev->wiphy, wdev, data, len);
        rdev->cur_cmd_info = NULL;

        return err;
}

/*
 * Resolve the rdev/wdev a vendor dump operates on and, on the first call,
 * parse the request and cache the result in cb->args:
 *
 *   args[0] - wiphy index + 1 (non-zero marks "already parsed")
 *   args[1] - wdev identifier + 1, or 0 when there is no wdev
 *   args[2] - index into the wiphy's vendor_commands table
 *   args[3] - pointer to the vendor data payload (not a copy)
 *   args[4] - length of the vendor data payload
 *
 * On later calls only args[0] and args[1] are used to look the devices up
 * again.
 *
 * Returns 0 with *rdev and *wdev filled in (*wdev may be NULL), or a
 * negative error: -ENODEV when the cached wiphy is gone, -ENOMEM, -EINVAL
 * for missing vendor ID/subcommand, -EOPNOTSUPP when no matching command
 * with a dumpit handler exists, or a parse/policy error.
 */
static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
                                       struct netlink_callback *cb,
                                       struct cfg80211_registered_device **rdev,
                                       struct wireless_dev **wdev)
{
        struct nlattr **attrbuf;
        u32 vid, subcmd;
        unsigned int i;
        int vcmd_idx = -1;
        int err;
        void *data = NULL;
        unsigned int data_len = 0;

        /* continuation call: request was parsed before, just re-resolve */
        if (cb->args[0]) {
                /* subtract the 1 again here */
                struct wiphy *wiphy = wiphy_idx_to_wiphy(cb->args[0] - 1);
                struct wireless_dev *tmp;

                if (!wiphy)
                        return -ENODEV;
                *rdev = wiphy_to_rdev(wiphy);
                *wdev = NULL;

                /* *wdev stays NULL if the cached identifier is not found */
                if (cb->args[1]) {
                        list_for_each_entry(tmp, &wiphy->wdev_list, list) {
                                if (tmp->identifier == cb->args[1] - 1) {
                                        *wdev = tmp;
                                        break;
                                }
                        }
                }

                /* keep rtnl locked in successful case */
                return 0;
        }

        /* first call: parse the request attributes */
        attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf), GFP_KERNEL);
        if (!attrbuf)
                return -ENOMEM;

        err = nlmsg_parse_deprecated(cb->nlh,
                                     GENL_HDRLEN + nl80211_fam.hdrsize,
                                     attrbuf, nl80211_fam.maxattr,
                                     nl80211_policy, NULL);
        if (err)
                goto out;

        if (!attrbuf[NL80211_ATTR_VENDOR_ID] ||
            !attrbuf[NL80211_ATTR_VENDOR_SUBCMD]) {
                err = -EINVAL;
                goto out;
        }

        /* a failed wdev lookup is not fatal here; the rdev lookup is */
        *wdev = __cfg80211_wdev_from_attrs(NULL, sock_net(skb->sk), attrbuf);
        if (IS_ERR(*wdev))
                *wdev = NULL;

        *rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk), attrbuf);
        if (IS_ERR(*rdev)) {
                err = PTR_ERR(*rdev);
                goto out;
        }

        vid = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_ID]);
        subcmd = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_SUBCMD]);

        /* find the first matching command; it must provide dumpit */
        for (i = 0; i < (*rdev)->wiphy.n_vendor_commands; i++) {
                const struct wiphy_vendor_command *vcmd;

                vcmd = &(*rdev)->wiphy.vendor_commands[i];

                if (vcmd->info.vendor_id != vid || vcmd->info.subcmd != subcmd)
                        continue;

                if (!vcmd->dumpit) {
                        err = -EOPNOTSUPP;
                        goto out;
                }

                vcmd_idx = i;
                break;
        }

        if (vcmd_idx < 0) {
                err = -EOPNOTSUPP;
                goto out;
        }

        if (attrbuf[NL80211_ATTR_VENDOR_DATA]) {
                data = nla_data(attrbuf[NL80211_ATTR_VENDOR_DATA]);
                data_len = nla_len(attrbuf[NL80211_ATTR_VENDOR_DATA]);

                err = nl80211_vendor_check_policy(
                                &(*rdev)->wiphy.vendor_commands[vcmd_idx],
                                attrbuf[NL80211_ATTR_VENDOR_DATA],
                                cb->extack);
                if (err)
                        goto out;
        }

        /* 0 is the first index - add 1 to parse only once */
        cb->args[0] = (*rdev)->wiphy_idx + 1;
        /* add 1 to know if it was NULL */
        cb->args[1] = *wdev ? (*wdev)->identifier + 1 : 0;
        cb->args[2] = vcmd_idx;
        cb->args[3] = (unsigned long)data;
        cb->args[4] = data_len;

        /* keep rtnl locked in successful case */
        err = 0;
out:
        /* attrbuf is only a scratch parse table; free it on every path */
        kfree(attrbuf);
        return err;
}

/*
 * Netlink dump handler for vendor commands. Runs under rtnl_lock().
 *
 * After resolving the target via nl80211_prepare_vendor_dump(), repeatedly
 * opens an NL80211_CMD_VENDOR message in @skb, lets the command's dumpit
 * handler fill the nested NL80211_ATTR_VENDOR_DATA, and finalizes the
 * message for as long as dumpit returns a positive value.
 *
 * Returns skb->len when the loop stops because the skb is full or dumpit
 * returned -ENOBUFS/-ENOENT, any other zero or negative dumpit result as
 * is, or a negative error from preparation / flag checks.
 */
static int nl80211_vendor_cmd_dump(struct sk_buff *skb,
                                   struct netlink_callback *cb)
{
        struct cfg80211_registered_device *rdev;
        struct wireless_dev *wdev;
        unsigned int vcmd_idx;
        const struct wiphy_vendor_command *vcmd;
        void *data;
        int data_len;
        int err;
        struct nlattr *vendor_data;

        rtnl_lock();
        err = nl80211_prepare_vendor_dump(skb, cb, &rdev, &wdev);
        if (err)
                goto out;

        /* state cached in cb->args by nl80211_prepare_vendor_dump() */
        vcmd_idx = cb->args[2];
        data = (void *)cb->args[3];
        data_len = cb->args[4];
        vcmd = &rdev->wiphy.vendor_commands[vcmd_idx];

        /* enforce the command's wdev / netdev / running requirements */
        if (vcmd->flags & (WIPHY_VENDOR_CMD_NEED_WDEV |
                           WIPHY_VENDOR_CMD_NEED_NETDEV)) {
                if (!wdev) {
                        err = -EINVAL;
                        goto out;
                }
                if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_NETDEV &&
                    !wdev->netdev) {
                        err = -EINVAL;
                        goto out;
                }

                if (vcmd->flags & WIPHY_VENDOR_CMD_NEED_RUNNING) {
                        if (!wdev_running(wdev)) {
                                err = -ENETDOWN;
                                goto out;
                        }
                }
        }

        /* one message per iteration, until the skb is full or dumpit stops */
        while (1) {
                void *hdr = nl80211hdr_put(skb, NETLINK_CB(cb->skb).portid,
                                           cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                           NL80211_CMD_VENDOR);
                if (!hdr)
                        break;

                if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
                    (wdev && nla_put_u64_64bit(skb, NL80211_ATTR_WDEV,
                                               wdev_id(wdev),
                                               NL80211_ATTR_PAD))) {
                        genlmsg_cancel(skb, hdr);
                        break;
                }

                vendor_data = nla_nest_start_noflag(skb,
                                                    NL80211_ATTR_VENDOR_DATA);
                if (!vendor_data) {
                        genlmsg_cancel(skb, hdr);
                        break;
                }

                /* &cb->args[5] is handed to the handler as its own storage */
                err = vcmd->dumpit(&rdev->wiphy, wdev, skb, data, data_len,
                                   (unsigned long *)&cb->args[5]);
                nla_nest_end(skb, vendor_data);

                /*
                 * -ENOBUFS/-ENOENT: drop this message and return what was
                 * dumped so far. Any other result <= 0: drop this message
                 * and return that result. Positive: keep it and go on.
                 */
                if (err == -ENOBUFS || err == -ENOENT) {
                        genlmsg_cancel(skb, hdr);
                        break;
                } else if (err <= 0) {
                        genlmsg_cancel(skb, hdr);
                        goto out;
                }

                genlmsg_end(skb, hdr);
        }

        err = skb->len;
 out:
        rtnl_unlock();
        return err;
}

/*
 * Allocate a reply skb addressed to the sender of the command currently
 * being processed (rdev->cur_cmd_info). Warns and returns NULL when no
 * command is in progress.
 */
struct sk_buff *__cfg80211_alloc_reply_skb(struct wiphy *wiphy,
                                           enum nl80211_commands cmd,
                                           enum nl80211_attrs attr,
                                           int approxlen)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);

        if (!WARN_ON(!rdev->cur_cmd_info))
                return __cfg80211_alloc_vendor_skb(rdev, NULL, approxlen,
                                                   rdev->cur_cmd_info->snd_portid,
                                                   rdev->cur_cmd_info->snd_seq,
                                                   cmd, attr, NULL,
                                                   GFP_KERNEL);

        return NULL;
}
EXPORT_SYMBOL(__cfg80211_alloc_reply_skb);

/*
 * Finalize and send a vendor command reply.
 *
 * The rdev, message header and open data nest are read from the first three
 * pointer slots of skb->cb, which is then wiped. If no command is in
 * progress the skb is freed and -EINVAL returned; otherwise the result of
 * genlmsg_reply() is returned.
 */
int cfg80211_vendor_cmd_reply(struct sk_buff *skb)
{
        void **slots = (void **)skb->cb;
        struct cfg80211_registered_device *rdev = slots[0];
        void *hdr = slots[1];
        struct nlattr *data = slots[2];

        /* clear CB data for netlink core to own from now on */
        memset(skb->cb, 0, sizeof(skb->cb));

        if (!WARN_ON(!rdev->cur_cmd_info)) {
                nla_nest_end(skb, data);
                genlmsg_end(skb, hdr);
                return genlmsg_reply(skb, rdev->cur_cmd_info);
        }

        kfree_skb(skb);
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(cfg80211_vendor_cmd_reply);

/*
 * Return the netlink port ID of the sender of the command currently being
 * processed, or 0 (with a warning) when no command is in progress.
 */
unsigned int cfg80211_vendor_cmd_get_sender(struct wiphy *wiphy)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);

        if (!WARN_ON(!rdev->cur_cmd_info))
                return rdev->cur_cmd_info->snd_portid;

        return 0;
}
EXPORT_SYMBOL_GPL(cfg80211_vendor_cmd_get_sender);

/*
 * Netlink handler: install or clear the QoS map.
 *
 * When NL80211_ATTR_QOS_MAP is present its payload is decoded as an optional
 * run of DSCP exceptions followed by IEEE80211_QOS_MAP_LEN_MIN bytes of UP
 * ranges; when it is absent the driver is called with a NULL map.
 *
 * Returns -EOPNOTSUPP when the driver lacks set_qos_map, -EINVAL for an odd
 * payload length or an exception with UP > 7, -ENOMEM on allocation failure,
 * an error from nl80211_key_allowed(), or the rdev_set_qos_map() result.
 */
static int nl80211_set_qos_map(struct sk_buff *skb,
                               struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct nlattr *map_attr = info->attrs[NL80211_ATTR_QOS_MAP];
        struct cfg80211_qos_map *qos_map = NULL;
        int ret;

        if (!rdev->ops->set_qos_map)
                return -EOPNOTSUPP;

        if (map_attr) {
                u8 *pos = nla_data(map_attr);
                u8 len = nla_len(map_attr);
                u8 num_des, des_len, des;

                /*
                 * NOTE(review): only the parity of the length is checked
                 * here; presumably the attribute policy bounds it to the
                 * valid QoS map range - confirm against nl80211_policy.
                 */
                if (len & 1)
                        return -EINVAL;

                qos_map = kzalloc(sizeof(*qos_map), GFP_KERNEL);
                if (!qos_map)
                        return -ENOMEM;

                /* whatever precedes the fixed UP block is DSCP exceptions */
                num_des = (len - IEEE80211_QOS_MAP_LEN_MIN) >> 1;
                if (num_des) {
                        des_len = num_des *
                                sizeof(struct cfg80211_dscp_exception);
                        memcpy(qos_map->dscp_exception, pos, des_len);
                        qos_map->num_des = num_des;

                        for (des = 0; des < num_des; des++) {
                                if (qos_map->dscp_exception[des].up > 7) {
                                        ret = -EINVAL;
                                        goto free_map;
                                }
                        }

                        pos += des_len;
                }

                memcpy(qos_map->up, pos, IEEE80211_QOS_MAP_LEN_MIN);
        }

        ret = nl80211_key_allowed(dev->ieee80211_ptr);
        if (ret == 0)
                ret = rdev_set_qos_map(rdev, dev, qos_map);

free_map:
        kfree(qos_map);
        return ret;
}

/*
 * Ask the driver to add a TX traffic stream towards a peer.
 *
 * Requires NL80211_FEATURE_SUPPORTS_WMM_ADMISSION on the wiphy and the TSID,
 * MAC and USER_PRIO attributes.  NL80211_ATTR_ADMITTED_TIME is optional, but
 * when supplied it must be non-zero.  Only connected station / P2P-client
 * interfaces are accepted.
 *
 * Returns -EOPNOTSUPP, -EINVAL or -ENOTCONN for the checks above, otherwise
 * the result of rdev_add_tx_ts().
 */
static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct nlattr **attrs = info->attrs;
        u16 admitted_time = 0;
        const u8 *peer;
        u8 tsid, up;

        if (!(rdev->wiphy.features & NL80211_FEATURE_SUPPORTS_WMM_ADMISSION))
                return -EOPNOTSUPP;

        if (!attrs[NL80211_ATTR_TSID] || !attrs[NL80211_ATTR_MAC] ||
            !attrs[NL80211_ATTR_USER_PRIO])
                return -EINVAL;

        tsid = nla_get_u8(attrs[NL80211_ATTR_TSID]);
        up = nla_get_u8(attrs[NL80211_ATTR_USER_PRIO]);

        /*
         * WMM uses TIDs 0-7 even for TSPEC.
         *
         * TODO: handle 802.11 TSPEC/admission control; that needs more
         * attributes (e.g. BA session requirement), and the WMM admission
         * feature test above must then be changed to allow both.
         */
        if (tsid >= IEEE80211_FIRST_TSPEC_TSID)
                return -EINVAL;

        peer = nla_data(attrs[NL80211_ATTR_MAC]);

        if (attrs[NL80211_ATTR_ADMITTED_TIME]) {
                admitted_time = nla_get_u16(attrs[NL80211_ATTR_ADMITTED_TIME]);
                if (admitted_time == 0)
                        return -EINVAL;
        }

        if (wdev->iftype != NL80211_IFTYPE_STATION &&
            wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)
                return -EOPNOTSUPP;

        if (!wdev->connected)
                return -ENOTCONN;

        return rdev_add_tx_ts(rdev, dev, tsid, peer, up, admitted_time);
}

/*
 * Ask the driver to delete a TX traffic stream.
 *
 * Both NL80211_ATTR_TSID and NL80211_ATTR_MAC are mandatory (-EINVAL
 * otherwise); the result of rdev_del_tx_ts() is returned.
 */
static int nl80211_del_tx_ts(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct nlattr *tsid_attr = info->attrs[NL80211_ATTR_TSID];
        struct nlattr *mac_attr = info->attrs[NL80211_ATTR_MAC];

        if (!tsid_attr || !mac_attr)
                return -EINVAL;

        return rdev_del_tx_ts(rdev, dev, nla_get_u8(tsid_attr),
                              nla_data(mac_attr));
}

static int nl80211_tdls_channel_switch(struct sk_buff *skb,
                                       struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_chan_def chandef = {};
        const u8 *addr;
        u8 oper_class;
        int err;

        if (!rdev->ops->tdls_channel_switch ||
            !(rdev->wiphy.features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH))
                return -EOPNOTSUPP;

        switch (dev->ieee80211_ptr->iftype) {
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_P2P_CLIENT:
                break;
        default:
                return -EOPNOTSUPP;
        }

        if (!info->attrs[NL80211_ATTR_MAC] ||
            !info->attrs[NL80211_ATTR_OPER_CLASS])
                return -EINVAL;

        err = nl80211_parse_chandef(rdev, info, &chandef);
        if (err)
                return err;

        /*
         * Don't allow wide channels on the 2.4Ghz band, as per IEEE802.11-2012
         * section 10.22.6.2.1. Disallow 5/10Mhz channels as well for now, the
         * specification is not defined for them.
         */
        if (chandef.chan->band == NL80211_BAND_2GHZ &&
            chandef.width != NL80211_CHAN_WIDTH_20_NOHT &&
            chandef.width != NL80211_CHAN_WIDTH_20)
                return -EINVAL;

        /* we will be active on the TDLS link */
        if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &chandef,
                                           wdev->iftype))
                return -EINVAL;

        /* don't allow switching to DFS channels */
        if (cfg80211_chandef_dfs_required(wdev->wiphy, &chandef, wdev->iftype))
                return -EINVAL;

        addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
        oper_class = nla_get_u8(info->attrs[NL80211_ATTR_OPER_CLASS]);

        return rdev_tdls_channel_switch(rdev, dev, addr, oper_class, &chandef);
}

static int nl80211_tdls_cancel_channel_switch(struct sk_buff *skb,
                                              struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        const u8 *addr;

        if (!rdev->ops->tdls_channel_switch ||
            !rdev->ops->tdls_cancel_channel_switch ||
            !(rdev->wiphy.features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH))
                return -EOPNOTSUPP;

        switch (dev->ieee80211_ptr->iftype) {
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_P2P_CLIENT:
                break;
        default:
                return -EOPNOTSUPP;
        }

        if (!info->attrs[NL80211_ATTR_MAC])
                return -EINVAL;

        addr = nla_data(info->attrs[NL80211_ATTR_MAC]);

        rdev_tdls_cancel_channel_switch(rdev, dev, addr);

        return 0;
}

/*
 * Turn multicast-to-unicast conversion on or off for an AP / P2P-GO
 * interface.  The setting is "enabled" exactly when the flag attribute
 * NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED is present in the request.
 *
 * Returns -EOPNOTSUPP if the driver lacks the op or the interface type is
 * not AP/P2P-GO, otherwise the result of rdev_set_multicast_to_unicast().
 */
static int nl80211_set_multicast_to_unicast(struct sk_buff *skb,
                                            struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        bool enabled;

        if (!rdev->ops->set_multicast_to_unicast)
                return -EOPNOTSUPP;

        switch (wdev->iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_P2P_GO:
                break;
        default:
                return -EOPNOTSUPP;
        }

        enabled = nla_get_flag(
                info->attrs[NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED]);

        return rdev_set_multicast_to_unicast(rdev, dev, enabled);
}

/*
 * Pass a PMK for the currently connected peer to the driver.
 *
 * Accepted only on station / P2P-client interfaces of a wiphy with
 * NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X (-EOPNOTSUPP otherwise).  The MAC
 * and PMK attributes are mandatory, the interface must be connected
 * (-ENOTCONN), the MAC must equal the connected address, and the PMK length
 * must be WLAN_PMK_LEN or WLAN_PMK_LEN_SUITE_B_192 (-EINVAL).
 * NL80211_ATTR_PMKR0_NAME is optional.
 *
 * Returns the result of rdev_set_pmk() once all checks pass.
 */
static int nl80211_set_pmk(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct nlattr **attrs = info->attrs;
        struct cfg80211_pmk_conf pmk_conf = {};

        switch (wdev->iftype) {
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_P2P_CLIENT:
                break;
        default:
                return -EOPNOTSUPP;
        }

        if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X))
                return -EOPNOTSUPP;

        if (!attrs[NL80211_ATTR_MAC] || !attrs[NL80211_ATTR_PMK])
                return -EINVAL;

        if (!wdev->connected)
                return -ENOTCONN;

        /* The authenticator address must be the peer we are connected to. */
        pmk_conf.aa = nla_data(attrs[NL80211_ATTR_MAC]);
        if (memcmp(pmk_conf.aa, wdev->u.client.connected_addr, ETH_ALEN) != 0)
                return -EINVAL;

        pmk_conf.pmk = nla_data(attrs[NL80211_ATTR_PMK]);
        pmk_conf.pmk_len = nla_len(attrs[NL80211_ATTR_PMK]);
        if (!(pmk_conf.pmk_len == WLAN_PMK_LEN ||
              pmk_conf.pmk_len == WLAN_PMK_LEN_SUITE_B_192))
                return -EINVAL;

        if (attrs[NL80211_ATTR_PMKR0_NAME])
                pmk_conf.pmk_r0_name = nla_data(attrs[NL80211_ATTR_PMKR0_NAME]);

        return rdev_set_pmk(rdev, dev, &pmk_conf);
}

/*
 * Ask the driver to delete the PMK associated with the address in
 * NL80211_ATTR_MAC.
 *
 * Accepted only on station / P2P-client interfaces of a wiphy with
 * NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X (-EOPNOTSUPP otherwise); a
 * missing MAC attribute yields -EINVAL.  Returns the result of
 * rdev_del_pmk().
 */
static int nl80211_del_pmk(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct nlattr *mac_attr = info->attrs[NL80211_ATTR_MAC];

        switch (wdev->iftype) {
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_P2P_CLIENT:
                break;
        default:
                return -EOPNOTSUPP;
        }

        if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X))
                return -EOPNOTSUPP;

        if (!mac_attr)
                return -EINVAL;

        return rdev_del_pmk(rdev, dev, nla_data(mac_attr));
}

static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct cfg80211_external_auth_params params;

        if (!rdev->ops->external_auth)
                return -EOPNOTSUPP;

        if (!info->attrs[NL80211_ATTR_SSID] &&
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
            dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
                return -EINVAL;

        if (!info->attrs[NL80211_ATTR_BSSID])
                return -EINVAL;

        if (!info->attrs[NL80211_ATTR_STATUS_CODE])
                return -EINVAL;

        memset(&params, 0, sizeof(params));

        if (info->attrs[NL80211_ATTR_SSID]) {
                params.ssid.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
                if (params.ssid.ssid_len == 0)
                        return -EINVAL;
                memcpy(params.ssid.ssid,
                       nla_data(info->attrs[NL80211_ATTR_SSID]),
                       params.ssid.ssid_len);
        }

        memcpy(params.bssid, nla_data(info->attrs[NL80211_ATTR_BSSID]),
               ETH_ALEN);

        params.status = nla_get_u16(info->attrs[NL80211_ATTR_STATUS_CODE]);

        if (info->attrs[NL80211_ATTR_PMKID])
                params.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]);

        return rdev_external_auth(rdev, dev, &params);
}

static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info)
{
        bool dont_wait_for_ack = info->attrs[NL80211_ATTR_DONT_WAIT_FOR_ACK];
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        const u8 *buf;
        size_t len;
        u8 *dest;
        u16 proto;
        bool noencrypt;
        u64 cookie = 0;
        int link_id;
        int err;

        if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211))
                return -EOPNOTSUPP;

        if (!rdev->ops->tx_control_port)
                return -EOPNOTSUPP;

        if (!info->attrs[NL80211_ATTR_FRAME] ||
            !info->attrs[NL80211_ATTR_MAC] ||
            !info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) {
                GENL_SET_ERR_MSG(info, "Frame, MAC or ethertype missing");
                return -EINVAL;
        }

        switch (wdev->iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_P2P_GO:
        case NL80211_IFTYPE_MESH_POINT:
                break;
        case NL80211_IFTYPE_ADHOC:
                if (wdev->u.ibss.current_bss)
                        break;
                return -ENOTCONN;
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_P2P_CLIENT:
                if (wdev->connected)
                        break;
                return -ENOTCONN;
        default:
                return -EOPNOTSUPP;
        }

        buf = nla_data(info->attrs[NL80211_ATTR_FRAME]);
        len = nla_len(info->attrs[NL80211_ATTR_FRAME]);
        dest = nla_data(info->attrs[NL80211_ATTR_MAC]);
        proto = nla_get_u16(info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]);
        noencrypt =
                nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]);

        link_id = nl80211_link_id_or_invalid(info->attrs);

        err = rdev_tx_control_port(rdev, dev, buf, len,
                                   dest, cpu_to_be16(proto), noencrypt, link_id,
                                   dont_wait_for_ack ? NULL : &cookie);
        if (!err && !dont_wait_for_ack)
                nl_set_extack_cookie_u64(info->extack, cookie);
        return err;
}

static int nl80211_get_ftm_responder_stats(struct sk_buff *skb,
                                           struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_ftm_responder_stats ftm_stats = {};
        unsigned int link_id = nl80211_link_id(info->attrs);
        struct sk_buff *msg;
        void *hdr;
        struct nlattr *ftm_stats_attr;
        int err;

        if (wdev->iftype != NL80211_IFTYPE_AP ||
            !wdev->links[link_id].ap.beacon_interval)
                return -EOPNOTSUPP;

        err = rdev_get_ftm_responder_stats(rdev, dev, &ftm_stats);
        if (err)
                return err;

        if (!ftm_stats.filled)
                return -ENODATA;

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg)
                return -ENOMEM;

        hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
                             NL80211_CMD_GET_FTM_RESPONDER_STATS);
        if (!hdr)
                goto nla_put_failure;

        if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex))
                goto nla_put_failure;

        ftm_stats_attr = nla_nest_start_noflag(msg,
                                               NL80211_ATTR_FTM_RESPONDER_STATS);
        if (!ftm_stats_attr)
                goto nla_put_failure;

#define SET_FTM(field, name, type)                                         \
        do { if ((ftm_stats.filled & BIT(NL80211_FTM_STATS_ ## name)) && \
            nla_put_ ## type(msg, NL80211_FTM_STATS_ ## name,                 \
                             ftm_stats.field))                                 \
                goto nla_put_failure; } while (0)
#define SET_FTM_U64(field, name)                                         \
        do { if ((ftm_stats.filled & BIT(NL80211_FTM_STATS_ ## name)) && \
            nla_put_u64_64bit(msg, NL80211_FTM_STATS_ ## name,                 \
                              ftm_stats.field, NL80211_FTM_STATS_PAD))         \
                goto nla_put_failure; } while (0)

        SET_FTM(success_num, SUCCESS_NUM, u32);
        SET_FTM(partial_num, PARTIAL_NUM, u32);
        SET_FTM(failed_num, FAILED_NUM, u32);
        SET_FTM(asap_num, ASAP_NUM, u32);
        SET_FTM(non_asap_num, NON_ASAP_NUM, u32);
        SET_FTM_U64(total_duration_ms, TOTAL_DURATION_MSEC);
        SET_FTM(unknown_triggers_num, UNKNOWN_TRIGGERS_NUM, u32);
        SET_FTM(reschedule_requests_num, RESCHEDULE_REQUESTS_NUM, u32);
        SET_FTM(out_of_window_triggers_num, OUT_OF_WINDOW_TRIGGERS_NUM, u32);
#undef SET_FTM

        nla_nest_end(msg, ftm_stats_attr);

        genlmsg_end(msg, hdr);
        return genlmsg_reply(msg, info);

nla_put_failure:
        nlmsg_free(msg);
        return -ENOBUFS;
}

/*
 * Pass OWE information for a peer to the driver.
 *
 * NL80211_ATTR_STATUS_CODE and NL80211_ATTR_MAC are mandatory (-EINVAL
 * otherwise); NL80211_ATTR_IE is optional and, when present, is referenced
 * in place rather than copied.
 *
 * Returns -EOPNOTSUPP if the driver has no update_owe_info op, otherwise the
 * result of rdev_update_owe_info().
 */
static int nl80211_update_owe_info(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct nlattr **attrs = info->attrs;
        struct nlattr *ie_attr = attrs[NL80211_ATTR_IE];
        struct cfg80211_update_owe_info owe_info;

        if (!rdev->ops->update_owe_info)
                return -EOPNOTSUPP;

        if (!attrs[NL80211_ATTR_STATUS_CODE] || !attrs[NL80211_ATTR_MAC])
                return -EINVAL;

        memset(&owe_info, 0, sizeof(owe_info));
        owe_info.status = nla_get_u16(attrs[NL80211_ATTR_STATUS_CODE]);
        nla_memcpy(owe_info.peer, attrs[NL80211_ATTR_MAC], ETH_ALEN);

        if (ie_attr) {
                owe_info.ie = nla_data(ie_attr);
                owe_info.ie_len = nla_len(ie_attr);
        }

        return rdev_update_owe_info(rdev, dev, &owe_info);
}

/*
 * Send a probe frame over a mesh link to the peer in NL80211_ATTR_MAC.
 *
 * The driver must implement both probe_mesh_link and get_station, and the
 * interface must be a mesh point (-EOPNOTSUPP otherwise).  The frame in
 * NL80211_ATTR_FRAME must hold at least an Ethernet header whose destination
 * equals the given MAC and is not multicast, and whose source equals the
 * netdev address (-EINVAL otherwise).  The peer must be known to the driver:
 * a failing rdev_get_station() aborts with its error.
 *
 * Returns the result of rdev_probe_mesh_link().
 */
static int nl80211_probe_mesh_link(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct nlattr **attrs = info->attrs;
        struct station_info sinfo = {};
        const u8 *frame;
        size_t frame_len;
        u8 *dest;
        int err;

        if (!rdev->ops->probe_mesh_link || !rdev->ops->get_station)
                return -EOPNOTSUPP;

        if (!attrs[NL80211_ATTR_MAC] || !attrs[NL80211_ATTR_FRAME]) {
                GENL_SET_ERR_MSG(info, "Frame or MAC missing");
                return -EINVAL;
        }

        if (wdev->iftype != NL80211_IFTYPE_MESH_POINT)
                return -EOPNOTSUPP;

        dest = nla_data(attrs[NL80211_ATTR_MAC]);
        frame = nla_data(attrs[NL80211_ATTR_FRAME]);
        frame_len = nla_len(attrs[NL80211_ATTR_FRAME]);

        if (frame_len < sizeof(struct ethhdr))
                return -EINVAL;

        /* Destination in the frame must be the requested unicast peer. */
        if (!ether_addr_equal(frame, dest))
                return -EINVAL;
        if (is_multicast_ether_addr(frame))
                return -EINVAL;

        /* Source in the frame must be this interface. */
        if (!ether_addr_equal(frame + ETH_ALEN, dev->dev_addr))
                return -EINVAL;

        err = rdev_get_station(rdev, dev, dest, &sinfo);
        if (err)
                return err;

        /* Only the lookup result matters; drop whatever was filled in. */
        cfg80211_sinfo_release_content(&sinfo);

        return rdev_probe_mesh_link(rdev, dev, dest, frame, frame_len);
}

static int parse_tid_conf(struct cfg80211_registered_device *rdev,
                          struct nlattr *attrs[], struct net_device *dev,
                          struct cfg80211_tid_cfg *tid_conf,
                          struct genl_info *info, const u8 *peer,
                          unsigned int link_id)
{
        struct netlink_ext_ack *extack = info->extack;
        u64 mask;
        int err;

        if (!attrs[NL80211_TID_CONFIG_ATTR_TIDS])
                return -EINVAL;

        tid_conf->config_override =
                        nla_get_flag(attrs[NL80211_TID_CONFIG_ATTR_OVERRIDE]);
        tid_conf->tids = nla_get_u16(attrs[NL80211_TID_CONFIG_ATTR_TIDS]);

        if (tid_conf->config_override) {
                if (rdev->ops->reset_tid_config) {
                        err = rdev_reset_tid_config(rdev, dev, peer,
                                                    tid_conf->tids);
                        if (err)
                                return err;
                } else {
                        return -EINVAL;
                }
        }

        if (attrs[NL80211_TID_CONFIG_ATTR_NOACK]) {
                tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_NOACK);
                tid_conf->noack =
                        nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_NOACK]);
        }

        if (attrs[NL80211_TID_CONFIG_ATTR_RETRY_SHORT]) {
                tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_RETRY_SHORT);
                tid_conf->retry_short =
                        nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_RETRY_SHORT]);

                if (tid_conf->retry_short > rdev->wiphy.max_data_retry_count)
                        return -EINVAL;
        }

        if (attrs[NL80211_TID_CONFIG_ATTR_RETRY_LONG]) {
                tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_RETRY_LONG);
                tid_conf->retry_long =
                        nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_RETRY_LONG]);

                if (tid_conf->retry_long > rdev->wiphy.max_data_retry_count)
                        return -EINVAL;
        }

        if (attrs[NL80211_TID_CONFIG_ATTR_AMPDU_CTRL]) {
                tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_AMPDU_CTRL);
                tid_conf->ampdu =
                        nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_AMPDU_CTRL]);
        }

        if (attrs[NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL]) {
                tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL);
                tid_conf->rtscts =
                        nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL]);
        }

        if (attrs[NL80211_TID_CONFIG_ATTR_AMSDU_CTRL]) {
                tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_AMSDU_CTRL);
                tid_conf->amsdu =
                        nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_AMSDU_CTRL]);
        }

        if (attrs[NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE]) {
                u32 idx = NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE, attr;

                tid_conf->txrate_type = nla_get_u8(attrs[idx]);

                if (tid_conf->txrate_type != NL80211_TX_RATE_AUTOMATIC) {
                        attr = NL80211_TID_CONFIG_ATTR_TX_RATE;
                        err = nl80211_parse_tx_bitrate_mask(info, attrs, attr,
                                                    &tid_conf->txrate_mask, dev,
                                                    true, link_id);
                        if (err)
                                return err;

                        tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_TX_RATE);
                }
                tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE);
        }

        if (peer)
                mask = rdev->wiphy.tid_config_support.peer;
        else
                mask = rdev->wiphy.tid_config_support.vif;

        if (tid_conf->mask & ~mask) {
                NL_SET_ERR_MSG(extack, "unsupported TID configuration");
                return -EOPNOTSUPP;
        }

        return 0;
}

static int nl80211_set_tid_config(struct sk_buff *skb,
                                  struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct nlattr *attrs[NL80211_TID_CONFIG_ATTR_MAX + 1];
        unsigned int link_id = nl80211_link_id(info->attrs);
        struct net_device *dev = info->user_ptr[1];
        struct cfg80211_tid_config *tid_config;
        struct nlattr *tid;
        int conf_idx = 0, rem_conf;
        int ret = -EINVAL;
        u32 num_conf = 0;

        if (!info->attrs[NL80211_ATTR_TID_CONFIG])
                return -EINVAL;

        if (!rdev->ops->set_tid_config)
                return -EOPNOTSUPP;

        nla_for_each_nested(tid, info->attrs[NL80211_ATTR_TID_CONFIG],
                            rem_conf)
                num_conf++;

        tid_config = kzalloc(struct_size(tid_config, tid_conf, num_conf),
                             GFP_KERNEL);
        if (!tid_config)
                return -ENOMEM;

        tid_config->n_tid_conf = num_conf;

        if (info->attrs[NL80211_ATTR_MAC])
                tid_config->peer = nla_data(info->attrs[NL80211_ATTR_MAC]);

        nla_for_each_nested(tid, info->attrs[NL80211_ATTR_TID_CONFIG],
                            rem_conf) {
                ret = nla_parse_nested(attrs, NL80211_TID_CONFIG_ATTR_MAX,
                                       tid, NULL, NULL);

                if (ret)
                        goto bad_tid_conf;

                ret = parse_tid_conf(rdev, attrs, dev,
                                     &tid_config->tid_conf[conf_idx],
                                     info, tid_config->peer, link_id);
                if (ret)
                        goto bad_tid_conf;

                conf_idx++;
        }

        ret = rdev_set_tid_config(rdev, dev, tid_config);

bad_tid_conf:
        kfree(tid_config);
        return ret;
}

static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct cfg80211_color_change_settings params = {};
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct nlattr **tb;
        u16 offset;
        int err;

        if (!rdev->ops->color_change)
                return -EOPNOTSUPP;

        if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_BSS_COLOR))
                return -EOPNOTSUPP;

        if (wdev->iftype != NL80211_IFTYPE_AP)
                return -EOPNOTSUPP;

        if (!info->attrs[NL80211_ATTR_COLOR_CHANGE_COUNT] ||
            !info->attrs[NL80211_ATTR_COLOR_CHANGE_COLOR] ||
            !info->attrs[NL80211_ATTR_COLOR_CHANGE_ELEMS])
                return -EINVAL;

        params.count = nla_get_u8(info->attrs[NL80211_ATTR_COLOR_CHANGE_COUNT]);
        params.color = nla_get_u8(info->attrs[NL80211_ATTR_COLOR_CHANGE_COLOR]);

        err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon_next,
                                   info->extack);
        if (err)
                return err;

        tb = kcalloc(NL80211_ATTR_MAX + 1, sizeof(*tb), GFP_KERNEL);
        if (!tb)
                return -ENOMEM;

        err = nla_parse_nested(tb, NL80211_ATTR_MAX,
                               info->attrs[NL80211_ATTR_COLOR_CHANGE_ELEMS],
                               nl80211_policy, info->extack);
        if (err)
                goto out;

        err = nl80211_parse_beacon(rdev, tb, &params.beacon_color_change,
                                   info->extack);
        if (err)
                goto out;

        if (!tb[NL80211_ATTR_CNTDWN_OFFS_BEACON]) {
                err = -EINVAL;
                goto out;
        }

        if (nla_len(tb[NL80211_ATTR_CNTDWN_OFFS_BEACON]) != sizeof(u16)) {
                err = -EINVAL;
                goto out;
        }

        offset = nla_get_u16(tb[NL80211_ATTR_CNTDWN_OFFS_BEACON]);
        if (offset >= params.beacon_color_change.tail_len) {
                err = -EINVAL;
                goto out;
        }

        if (params.beacon_color_change.tail[offset] != params.count) {
                err = -EINVAL;
                goto out;
        }

        params.counter_offset_beacon = offset;

        if (tb[NL80211_ATTR_CNTDWN_OFFS_PRESP]) {
                if (nla_len(tb[NL80211_ATTR_CNTDWN_OFFS_PRESP]) !=
                    sizeof(u16)) {
                        err = -EINVAL;
                        goto out;
                }

                offset = nla_get_u16(tb[NL80211_ATTR_CNTDWN_OFFS_PRESP]);
                if (offset >= params.beacon_color_change.probe_resp_len) {
                        err = -EINVAL;
                        goto out;
                }

                if (params.beacon_color_change.probe_resp[offset] !=
                    params.count) {
                        err = -EINVAL;
                        goto out;
                }

                params.counter_offset_presp = offset;
        }

        params.link_id = nl80211_link_id(info->attrs);
        err = rdev_color_change(rdev, dev, &params);

out:
        kfree(params.beacon_next.mbssid_ies);
        kfree(params.beacon_color_change.mbssid_ies);
        kfree(params.beacon_next.rnr_ies);
        kfree(params.beacon_color_change.rnr_ies);
        kfree(tb);
        return err;
}

/*
 * Pass FILS AAD material for a peer to the driver.
 *
 * NL80211_ATTR_MAC, NL80211_ATTR_FILS_KEK and NL80211_ATTR_FILS_NONCES are
 * all mandatory (-EINVAL otherwise).  The nonces attribute is split in
 * place: snonce is its start, anonce begins FILS_NONCE_LEN bytes later.
 *
 * Returns the result of rdev_set_fils_aad().
 */
static int nl80211_set_fils_aad(struct sk_buff *skb,
                                struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct nlattr **attrs = info->attrs;
        struct cfg80211_fils_aad fils_aad = {};
        u8 *nonces;

        if (!attrs[NL80211_ATTR_MAC] ||
            !attrs[NL80211_ATTR_FILS_KEK] ||
            !attrs[NL80211_ATTR_FILS_NONCES])
                return -EINVAL;

        nonces = nla_data(attrs[NL80211_ATTR_FILS_NONCES]);

        fils_aad.macaddr = nla_data(attrs[NL80211_ATTR_MAC]);
        fils_aad.kek = nla_data(attrs[NL80211_ATTR_FILS_KEK]);
        fils_aad.kek_len = nla_len(attrs[NL80211_ATTR_FILS_KEK]);
        fils_aad.snonce = nonces;
        fils_aad.anonce = nonces + FILS_NONCE_LEN;

        return rdev_set_fils_aad(rdev, dev, &fils_aad);
}

/*
 * Add an MLO link to an AP interface.
 *
 * The wiphy must have WIPHY_FLAG_SUPPORTS_MLO, the interface must be an AP
 * and NL80211_ATTR_MAC must carry a valid Ethernet address (-EINVAL
 * otherwise).  The link is marked valid and its address stored before the
 * driver is called; if rdev_add_intf_link() fails both are rolled back.
 *
 * Returns the result of rdev_add_intf_link().
 */
static int nl80211_add_link(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        unsigned int link_id = nl80211_link_id(info->attrs);
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct nlattr *mac_attr = info->attrs[NL80211_ATTR_MAC];
        int ret;

        if (!(wdev->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO))
                return -EINVAL;

        if (wdev->iftype != NL80211_IFTYPE_AP)
                return -EINVAL;

        if (!mac_attr || !is_valid_ether_addr(nla_data(mac_attr)))
                return -EINVAL;

        wdev->valid_links |= BIT(link_id);
        ether_addr_copy(wdev->links[link_id].addr, nla_data(mac_attr));

        ret = rdev_add_intf_link(rdev, wdev, link_id);
        if (!ret)
                return 0;

        /* The driver refused the link: undo the bookkeeping done above. */
        wdev->valid_links &= ~BIT(link_id);
        eth_zero_addr(wdev->links[link_id].addr);

        return ret;
}

/*
 * Remove an MLO link from an AP interface.
 *
 * Fails with -EINVAL when no NL80211_ATTR_MLO_LINK_ID was supplied (there is
 * no link to remove) or when the interface is not an AP; otherwise calls
 * cfg80211_remove_link() and returns 0.
 */
static int nl80211_remove_link(struct sk_buff *skb, struct genl_info *info)
{
        unsigned int link_id = nl80211_link_id(info->attrs);
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;

        if (!info->attrs[NL80211_ATTR_MLO_LINK_ID] ||
            wdev->iftype != NL80211_IFTYPE_AP)
                return -EINVAL;

        cfg80211_remove_link(wdev, link_id);

        return 0;
}

/*
 * Common implementation for adding (@add == true) or modifying
 * (@add == false) a link station.
 *
 * The matching driver op (add_link_station / mod_link_station) must exist
 * (-EOPNOTSUPP otherwise).  NL80211_ATTR_MLD_ADDR and
 * NL80211_ATTR_MLO_LINK_ID are always mandatory; NL80211_ATTR_MAC and
 * NL80211_ATTR_STA_SUPPORTED_RATES are mandatory only when adding.  A
 * supplied link MAC must be a valid Ethernet address.  Capability attributes
 * are optional and are referenced in place; EHT capabilities are only looked
 * at when HE capabilities are present, and their size is validated.
 *
 * Returns -EINVAL for the attribute checks, an error from the TX-power
 * parser, or the result of the driver call.
 */
static int
nl80211_add_mod_link_station(struct sk_buff *skb, struct genl_info *info,
                             bool add)
{
        struct link_station_parameters params = {};
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct nlattr **attrs = info->attrs;
        int err;

        if (add ? !rdev->ops->add_link_station :
                  !rdev->ops->mod_link_station)
                return -EOPNOTSUPP;

        if (add && !attrs[NL80211_ATTR_MAC])
                return -EINVAL;

        if (!attrs[NL80211_ATTR_MLD_ADDR])
                return -EINVAL;

        if (add && !attrs[NL80211_ATTR_STA_SUPPORTED_RATES])
                return -EINVAL;

        params.mld_mac = nla_data(attrs[NL80211_ATTR_MLD_ADDR]);

        if (attrs[NL80211_ATTR_MAC]) {
                params.link_mac = nla_data(attrs[NL80211_ATTR_MAC]);
                if (!is_valid_ether_addr(params.link_mac))
                        return -EINVAL;
        }

        if (!attrs[NL80211_ATTR_MLO_LINK_ID])
                return -EINVAL;

        params.link_id = nla_get_u8(attrs[NL80211_ATTR_MLO_LINK_ID]);

        if (attrs[NL80211_ATTR_STA_SUPPORTED_RATES]) {
                struct nlattr *rates = attrs[NL80211_ATTR_STA_SUPPORTED_RATES];

                params.supported_rates = nla_data(rates);
                params.supported_rates_len = nla_len(rates);
        }

        if (attrs[NL80211_ATTR_HT_CAPABILITY])
                params.ht_capa = nla_data(attrs[NL80211_ATTR_HT_CAPABILITY]);

        if (attrs[NL80211_ATTR_VHT_CAPABILITY])
                params.vht_capa = nla_data(attrs[NL80211_ATTR_VHT_CAPABILITY]);

        if (attrs[NL80211_ATTR_HE_CAPABILITY]) {
                struct nlattr *he = attrs[NL80211_ATTR_HE_CAPABILITY];
                struct nlattr *eht = attrs[NL80211_ATTR_EHT_CAPABILITY];

                params.he_capa = nla_data(he);
                params.he_capa_len = nla_len(he);

                /* EHT is only considered together with HE. */
                if (eht) {
                        params.eht_capa = nla_data(eht);
                        params.eht_capa_len = nla_len(eht);

                        if (!ieee80211_eht_capa_size_ok((const u8 *)params.he_capa,
                                                        (const u8 *)params.eht_capa,
                                                        params.eht_capa_len,
                                                        false))
                                return -EINVAL;
                }
        }

        if (attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
                params.he_6ghz_capa =
                        nla_data(attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);

        if (attrs[NL80211_ATTR_OPMODE_NOTIF]) {
                params.opmode_notif_used = true;
                params.opmode_notif =
                        nla_get_u8(attrs[NL80211_ATTR_OPMODE_NOTIF]);
        }

        err = nl80211_parse_sta_txpower_setting(info, &params.txpwr,
                                                &params.txpwr_set);
        if (err)
                return err;

        return add ? rdev_add_link_station(rdev, dev, &params) :
                     rdev_mod_link_station(rdev, dev, &params);
}

static int
nl80211_add_link_station(struct sk_buff *skb, struct genl_info *info)
{
        return nl80211_add_mod_link_station(skb, info, true);
}

static int
nl80211_modify_link_station(struct sk_buff *skb, struct genl_info *info)
{
        return nl80211_add_mod_link_station(skb, info, false);
}

/*
 * Remove a link station.
 *
 * Requires both NL80211_ATTR_MLD_ADDR and NL80211_ATTR_MLO_LINK_ID in the
 * request. The rdev and netdev are taken from info->user_ptr[0] and
 * info->user_ptr[1].
 *
 * Returns -EOPNOTSUPP when the driver has no del_link_station op, -EINVAL
 * when either attribute is missing, otherwise the result of
 * rdev_del_link_station().
 */
static int
nl80211_remove_link_station(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct nlattr *mld_addr = info->attrs[NL80211_ATTR_MLD_ADDR];
        struct nlattr *link_id = info->attrs[NL80211_ATTR_MLO_LINK_ID];
        struct link_station_del_parameters params = {};

        if (!rdev->ops->del_link_station)
                return -EOPNOTSUPP;

        /* both the MLD address and the link ID are mandatory */
        if (!(mld_addr && link_id))
                return -EINVAL;

        params.link_id = nla_get_u8(link_id);
        params.mld_mac = nla_data(mld_addr);

        return rdev_del_link_station(rdev, dev, &params);
}

/*
 * Enable or disable hardware timestamping, optionally for one peer.
 *
 * NL80211_ATTR_MAC selects the peer; NL80211_ATTR_HW_TIMESTAMP_ENABLED is a
 * flag attribute whose presence sets hwts.enable.
 *
 * Returns -EOPNOTSUPP when wiphy.hw_timestamp_max_peers is zero, or when no
 * MAC address is given and hw_timestamp_max_peers is not
 * CFG80211_HW_TIMESTAMP_ALL_PEERS; otherwise the result of
 * rdev_set_hw_timestamp().
 */
static int nl80211_set_hw_timestamp(struct sk_buff *skb,
                                    struct genl_info *info)
{
        struct cfg80211_set_hw_timestamp hwts = {};
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct nlattr *peer = info->attrs[NL80211_ATTR_MAC];

        if (!rdev->wiphy.hw_timestamp_max_peers)
                return -EOPNOTSUPP;

        if (peer)
                hwts.macaddr = nla_data(peer);
        else if (rdev->wiphy.hw_timestamp_max_peers !=
                 CFG80211_HW_TIMESTAMP_ALL_PEERS)
                /* no peer given, and "all peers" mode is not advertised */
                return -EOPNOTSUPP;

        hwts.enable =
                nla_get_flag(info->attrs[NL80211_ATTR_HW_TIMESTAMP_ENABLED]);

        return rdev_set_hw_timestamp(rdev, dev, &hwts);
}

/*
 * Set the TTLM downlink/uplink mappings for a connected interface.
 *
 * Only NL80211_IFTYPE_STATION and NL80211_IFTYPE_P2P_CLIENT interfaces are
 * accepted. Both NL80211_ATTR_MLO_TTLM_DLINK and NL80211_ATTR_MLO_TTLM_ULINK
 * must be present; their payloads are copied into params.dlink/params.ulink,
 * bounded by the size of the destination arrays.
 *
 * Returns -EOPNOTSUPP for other interface types, -ENOLINK when the wdev is
 * not connected, -EINVAL when an attribute is missing, otherwise the result
 * of rdev_set_ttlm().
 */
static int
nl80211_set_ttlm(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct net_device *dev = info->user_ptr[1];
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct nlattr *dlink = info->attrs[NL80211_ATTR_MLO_TTLM_DLINK];
        struct nlattr *ulink = info->attrs[NL80211_ATTR_MLO_TTLM_ULINK];
        struct cfg80211_ttlm_params params = {};

        switch (wdev->iftype) {
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_P2P_CLIENT:
                break;
        default:
                return -EOPNOTSUPP;
        }

        if (!wdev->connected)
                return -ENOLINK;

        if (!(dlink && ulink))
                return -EINVAL;

        nla_memcpy(params.dlink, dlink, sizeof(params.dlink));
        nla_memcpy(params.ulink, ulink, sizeof(params.ulink));

        return rdev_set_ttlm(rdev, dev, &params);
}

/*
 * Internal per-command flags, consumed by nl80211_pre_doit() and
 * nl80211_post_doit():
 *
 * NEED_WIPHY         - pre_doit looks up the rdev with
 *                      cfg80211_get_dev_from_info() and stores it in
 *                      info->user_ptr[0]; checked before NEED_NETDEV/NEED_WDEV.
 * NEED_NETDEV        - pre_doit looks up the wdev, requires it to have a
 *                      netdev (else -EINVAL) and stores the netdev in
 *                      info->user_ptr[1] and the rdev in info->user_ptr[0].
 * NEED_RTNL          - the RTNL taken in pre_doit stays held and is released
 *                      in post_doit.
 * CHECK_NETDEV_UP    - pre_doit fails with -ENETDOWN if !wdev_running().
 * NEED_WDEV          - like NEED_NETDEV, but info->user_ptr[1] holds the wdev
 *                      and a netdev is not required.
 * CLEAR_SKB          - post_doit zeroes the netlink message payload.
 * NO_WIPHY_MTX       - pre_doit/post_doit do not take/release the wiphy lock.
 * MLO_VALID_LINK_ID  - pre_doit requires a valid NL80211_ATTR_MLO_LINK_ID when
 *                      the wdev has valid_links, and rejects the attribute
 *                      otherwise.
 * MLO_UNSUPPORTED    - pre_doit rejects the request if a link ID attribute is
 *                      present or the wdev has valid_links.
 */
#define NL80211_FLAG_NEED_WIPHY                0x01
#define NL80211_FLAG_NEED_NETDEV        0x02
#define NL80211_FLAG_NEED_RTNL                0x04
#define NL80211_FLAG_CHECK_NETDEV_UP        0x08
#define NL80211_FLAG_NEED_NETDEV_UP        (NL80211_FLAG_NEED_NETDEV |\
                                         NL80211_FLAG_CHECK_NETDEV_UP)
#define NL80211_FLAG_NEED_WDEV                0x10
/* If a netdev is associated, it must be UP, P2P must be started */
#define NL80211_FLAG_NEED_WDEV_UP        (NL80211_FLAG_NEED_WDEV |\
                                         NL80211_FLAG_CHECK_NETDEV_UP)
#define NL80211_FLAG_CLEAR_SKB                0x20
#define NL80211_FLAG_NO_WIPHY_MTX        0x40
#define NL80211_FLAG_MLO_VALID_LINK_ID        0x80
#define NL80211_FLAG_MLO_UNSUPPORTED        0x100

/*
 * X-macro list of every NL80211_FLAG_* combination a command may use.
 *
 * Each SELECTOR(__sel, name, value) entry pairs a selector name with one
 * flag combination. The list is expanded several times in this file with
 * different definitions of SELECTOR(): to generate the selector enum, to
 * build the selector -> flags lookup table, and (via IFLAGS()) to map a
 * flag combination back to its selector.
 *
 * NONE must stay first so that it becomes enumerator 0. Entry order defines
 * the enum values, so entries must not be reordered independently of the
 * expansions that use them.
 */
#define INTERNAL_FLAG_SELECTORS(__sel)                        \
        SELECTOR(__sel, NONE, 0) /* must be first */        \
        SELECTOR(__sel, WIPHY,                                \
                 NL80211_FLAG_NEED_WIPHY)                \
        SELECTOR(__sel, WDEV,                                \
                 NL80211_FLAG_NEED_WDEV)                \
        SELECTOR(__sel, NETDEV,                                \
                 NL80211_FLAG_NEED_NETDEV)                \
        SELECTOR(__sel, NETDEV_LINK,                        \
                 NL80211_FLAG_NEED_NETDEV |                \
                 NL80211_FLAG_MLO_VALID_LINK_ID)        \
        SELECTOR(__sel, NETDEV_NO_MLO,                        \
                 NL80211_FLAG_NEED_NETDEV |                \
                 NL80211_FLAG_MLO_UNSUPPORTED)        \
        SELECTOR(__sel, WIPHY_RTNL,                        \
                 NL80211_FLAG_NEED_WIPHY |                \
                 NL80211_FLAG_NEED_RTNL)                \
        SELECTOR(__sel, WIPHY_RTNL_NOMTX,                \
                 NL80211_FLAG_NEED_WIPHY |                \
                 NL80211_FLAG_NEED_RTNL |                \
                 NL80211_FLAG_NO_WIPHY_MTX)                \
        SELECTOR(__sel, WDEV_RTNL,                        \
                 NL80211_FLAG_NEED_WDEV |                \
                 NL80211_FLAG_NEED_RTNL)                \
        SELECTOR(__sel, NETDEV_RTNL,                        \
                 NL80211_FLAG_NEED_NETDEV |                \
                 NL80211_FLAG_NEED_RTNL)                \
        SELECTOR(__sel, NETDEV_UP,                        \
                 NL80211_FLAG_NEED_NETDEV_UP)                \
        SELECTOR(__sel, NETDEV_UP_LINK,                        \
                 NL80211_FLAG_NEED_NETDEV_UP |                \
                 NL80211_FLAG_MLO_VALID_LINK_ID)        \
        SELECTOR(__sel, NETDEV_UP_NO_MLO,                \
                 NL80211_FLAG_NEED_NETDEV_UP |                \
                 NL80211_FLAG_MLO_UNSUPPORTED)                \
        SELECTOR(__sel, NETDEV_UP_NO_MLO_CLEAR,                \
                 NL80211_FLAG_NEED_NETDEV_UP |                \
                 NL80211_FLAG_CLEAR_SKB |                \
                 NL80211_FLAG_MLO_UNSUPPORTED)                \
        SELECTOR(__sel, NETDEV_UP_NOTMX,                \
                 NL80211_FLAG_NEED_NETDEV_UP |                \
                 NL80211_FLAG_NO_WIPHY_MTX)                \
        SELECTOR(__sel, NETDEV_UP_NOTMX_NOMLO,                \
                 NL80211_FLAG_NEED_NETDEV_UP |                \
                 NL80211_FLAG_NO_WIPHY_MTX |                \
                 NL80211_FLAG_MLO_UNSUPPORTED)                \
        SELECTOR(__sel, NETDEV_UP_CLEAR,                \
                 NL80211_FLAG_NEED_NETDEV_UP |                \
                 NL80211_FLAG_CLEAR_SKB)                \
        SELECTOR(__sel, WDEV_UP,                        \
                 NL80211_FLAG_NEED_WDEV_UP)                \
        SELECTOR(__sel, WDEV_UP_LINK,                        \
                 NL80211_FLAG_NEED_WDEV_UP |                \
                 NL80211_FLAG_MLO_VALID_LINK_ID)        \
        SELECTOR(__sel, WDEV_UP_RTNL,                        \
                 NL80211_FLAG_NEED_WDEV_UP |                \
                 NL80211_FLAG_NEED_RTNL)                \
        SELECTOR(__sel, WIPHY_CLEAR,                        \
                 NL80211_FLAG_NEED_WIPHY |                \
                 NL80211_FLAG_CLEAR_SKB)

/*
 * Selector indices, one NL80211_IFL_SEL_<name> enumerator per entry of
 * INTERNAL_FLAG_SELECTORS(), in list order (so NL80211_IFL_SEL_NONE is 0).
 * The flag value of each entry is ignored by this expansion.
 */
enum nl80211_internal_flags_selector {
#define SELECTOR(_, name, value)        NL80211_IFL_SEL_##name,
        INTERNAL_FLAG_SELECTORS(_)
#undef SELECTOR
};

/*
 * Lookup table from a selector index (enum nl80211_internal_flags_selector)
 * to the NL80211_FLAG_* bitmask it stands for, generated from
 * INTERNAL_FLAG_SELECTORS().
 *
 * The table is only ever read (by nl80211_pre_doit()/nl80211_post_doit()),
 * so it is const-qualified to keep it from being modified at run time.
 */
static const u32 nl80211_internal_flags[] = {
#define SELECTOR(_, name, value)        [NL80211_IFL_SEL_##name] = value,
        INTERNAL_FLAG_SELECTORS(_)
#undef SELECTOR
};

/*
 * Common preparation hook run before a command's handler.
 *
 * ops->internal_flags is a selector index; it is translated into the
 * NL80211_FLAG_* bitmask via nl80211_internal_flags[]. Depending on that
 * bitmask this function:
 *  - resolves the wiphy (rdev) or the wdev/netdev the request refers to and
 *    stores them in info->user_ptr[0] (rdev) and info->user_ptr[1]
 *    (netdev or wdev),
 *  - optionally verifies the interface is running,
 *  - validates the use of NL80211_ATTR_MLO_LINK_ID,
 *  - takes the wiphy lock unless NL80211_FLAG_NO_WIPHY_MTX is set,
 *  - drops the RTNL again unless NL80211_FLAG_NEED_RTNL is set.
 *
 * Returns 0 on success or a negative errno. On failure the RTNL has been
 * released and dev_put() has been called on the looked-up netdev.
 */
static int nl80211_pre_doit(const struct genl_split_ops *ops,
                            struct sk_buff *skb,
                            struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = NULL;
        struct wireless_dev *wdev = NULL;
        struct net_device *dev = NULL;
        u32 internal_flags;
        int err;

        /* the selector must index into the lookup table */
        if (WARN_ON(ops->internal_flags >= ARRAY_SIZE(nl80211_internal_flags)))
                return -EINVAL;

        internal_flags = nl80211_internal_flags[ops->internal_flags];

        /* the lookups below are done under the RTNL */
        rtnl_lock();
        if (internal_flags & NL80211_FLAG_NEED_WIPHY) {
                rdev = cfg80211_get_dev_from_info(genl_info_net(info), info);
                if (IS_ERR(rdev)) {
                        err = PTR_ERR(rdev);
                        goto out_unlock;
                }
                info->user_ptr[0] = rdev;
        } else if (internal_flags & NL80211_FLAG_NEED_NETDEV ||
                   internal_flags & NL80211_FLAG_NEED_WDEV) {
                wdev = __cfg80211_wdev_from_attrs(NULL, genl_info_net(info),
                                                  info->attrs);
                if (IS_ERR(wdev)) {
                        err = PTR_ERR(wdev);
                        goto out_unlock;
                }

                /*
                 * Take the netdev reference before any further checks; every
                 * later failure goes through out_unlock, which drops it.
                 * NOTE(review): dev may be NULL here (see the !dev check
                 * below), so this relies on dev_hold()/dev_put() tolerating
                 * NULL - confirm.
                 */
                dev = wdev->netdev;
                dev_hold(dev);
                rdev = wiphy_to_rdev(wdev->wiphy);

                if (internal_flags & NL80211_FLAG_NEED_NETDEV) {
                        if (!dev) {
                                err = -EINVAL;
                                goto out_unlock;
                        }

                        info->user_ptr[1] = dev;
                } else {
                        info->user_ptr[1] = wdev;
                }

                if (internal_flags & NL80211_FLAG_CHECK_NETDEV_UP &&
                    !wdev_running(wdev)) {
                        err = -ENETDOWN;
                        goto out_unlock;
                }

                info->user_ptr[0] = rdev;
        }

        if (internal_flags & NL80211_FLAG_MLO_VALID_LINK_ID) {
                struct nlattr *link_id = info->attrs[NL80211_ATTR_MLO_LINK_ID];

                /* link ID validation needs a wdev (none with NEED_WIPHY) */
                if (!wdev) {
                        err = -EINVAL;
                        goto out_unlock;
                }

                /* MLO -> require valid link ID */
                if (wdev->valid_links &&
                    (!link_id ||
                     !(wdev->valid_links & BIT(nla_get_u8(link_id))))) {
                        err = -EINVAL;
                        goto out_unlock;
                }

                /* non-MLO -> no link ID attribute accepted */
                if (!wdev->valid_links && link_id) {
                        err = -EINVAL;
                        goto out_unlock;
                }
        }

        /* command does not support MLO: reject link IDs and MLO wdevs */
        if (internal_flags & NL80211_FLAG_MLO_UNSUPPORTED) {
                if (info->attrs[NL80211_ATTR_MLO_LINK_ID] ||
                    (wdev && wdev->valid_links)) {
                        err = -EINVAL;
                        goto out_unlock;
                }
        }

        if (rdev && !(internal_flags & NL80211_FLAG_NO_WIPHY_MTX)) {
                wiphy_lock(&rdev->wiphy);
                /* we keep the mutex locked until post_doit */
                __release(&rdev->wiphy.mtx);
        }
        /* with NEED_RTNL the RTNL stays held; post_doit releases it */
        if (!(internal_flags & NL80211_FLAG_NEED_RTNL))
                rtnl_unlock();

        return 0;
out_unlock:
        rtnl_unlock();
        dev_put(dev);
        return err;
}

static void nl80211_post_doit(const struct genl_split_ops *ops,
                              struct sk_buff *skb,
                              struct genl_info *info)
{
        u32 internal_flags = nl80211_internal_flags[ops->internal_flags];

        if (info->user_ptr[1]) {
                if (internal_flags & NL80211_FLAG_NEED_WDEV) {
                        struct wireless_dev *wdev = info->user_ptr[1];

                        dev_put(wdev->netdev);
                } else {
                        dev_put(info->user_ptr[1]);
                }
        }

        if (info->user_ptr[0] &&
            !(internal_flags & NL80211_FLAG_NO_WIPHY_MTX)) {
                struct cfg80211_registered_device *rdev = info->user_ptr[0];

                /* we kept the mutex locked since pre_doit */
                __acquire(&rdev->wiphy.mtx);
                wiphy_unlock(&rdev->wiphy);
        }

        if (internal_flags & NL80211_FLAG_NEED_RTNL)
                rtnl_unlock();

        /* If needed, clear the netlink message payload from the SKB
         * as it might contain key data that shouldn't stick around on
         * the heap after the SKB is freed. The netlink message header
         * is still needed for further processing, so leave it intact.
         */
        if (internal_flags & NL80211_FLAG_CLEAR_SKB) {
                struct nlmsghdr *nlh = nlmsg_hdr(skb);

                memset(nlmsg_data(nlh), 0, nlmsg_len(nlh));
        }
}

/*
 * Fill sar_specs->sub_specs[index] from one parsed SAR sub-spec.
 *
 * @rdev:      device whose wiphy.sar_capa bounds the frequency range index
 * @sar_specs: specification being built; entries [0, index) are already set
 * @spec:      parsed NL80211_SAR_ATTR_SPECS_* attribute table
 * @index:     slot in sar_specs->sub_specs to fill
 *
 * Returns 0 on success, or -EINVAL when an argument is NULL, a required
 * attribute is missing, the range index is not below
 * sar_capa->num_freq_ranges, or the range index was already used by an
 * earlier entry.
 */
static int nl80211_set_sar_sub_specs(struct cfg80211_registered_device *rdev,
                                     struct cfg80211_sar_specs *sar_specs,
                                     struct nlattr *spec[], int index)
{
        struct nlattr *power_attr, *range_attr;
        u32 freq_idx, n;

        if (!sar_specs || !spec)
                return -EINVAL;

        power_attr = spec[NL80211_SAR_ATTR_SPECS_POWER];
        range_attr = spec[NL80211_SAR_ATTR_SPECS_RANGE_INDEX];
        if (!power_attr || !range_attr)
                return -EINVAL;

        freq_idx = nla_get_u32(range_attr);

        /* the index must refer to one of the advertised frequency ranges */
        if (freq_idx >= rdev->wiphy.sar_capa->num_freq_ranges)
                return -EINVAL;

        /* each frequency range may be specified only once */
        for (n = 0; n < index; n++)
                if (sar_specs->sub_specs[n].freq_range_index == freq_idx)
                        return -EINVAL;

        sar_specs->sub_specs[index].freq_range_index = freq_idx;
        sar_specs->sub_specs[index].power = nla_get_s32(power_attr);

        return 0;
}

/*
 * Parse NL80211_ATTR_SAR_SPEC from the request, build a
 * struct cfg80211_sar_specs from it and pass it to the driver through
 * rdev_set_sar_specs().
 *
 * The nested attribute must carry NL80211_SAR_ATTR_TYPE (which has to match
 * wiphy.sar_capa->type) and NL80211_SAR_ATTR_SPECS, a list of at most
 * sar_capa->num_freq_ranges sub-specs.
 *
 * Returns 0 on success, -EOPNOTSUPP when the wiphy has no SAR capability or
 * the driver lacks set_sar_specs, -ENOMEM on allocation failure, -EINVAL
 * (or the nla_parse_nested() error) for malformed input, otherwise the
 * driver's return value. The temporary specification is always freed.
 */
static int nl80211_set_sar_specs(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct nlattr *spec[NL80211_SAR_ATTR_SPECS_MAX + 1];
        struct nlattr *tb[NL80211_SAR_ATTR_MAX + 1];
        struct cfg80211_sar_specs *sar_spec;
        enum nl80211_sar_type type;
        struct nlattr *spec_list;
        u32 specs;
        int rem, err;

        if (!rdev->wiphy.sar_capa || !rdev->ops->set_sar_specs)
                return -EOPNOTSUPP;

        if (!info->attrs[NL80211_ATTR_SAR_SPEC])
                return -EINVAL;

        /*
         * tb[] lives on the stack and must not be trusted if parsing
         * fails, so propagate the error instead of ignoring it.
         */
        err = nla_parse_nested(tb, NL80211_SAR_ATTR_MAX,
                               info->attrs[NL80211_ATTR_SAR_SPEC],
                               NULL, NULL);
        if (err)
                return err;

        if (!tb[NL80211_SAR_ATTR_TYPE] || !tb[NL80211_SAR_ATTR_SPECS])
                return -EINVAL;

        type = nla_get_u32(tb[NL80211_SAR_ATTR_TYPE]);
        if (type != rdev->wiphy.sar_capa->type)
                return -EINVAL;

        /* first pass: count the sub-specs to size the allocation */
        specs = 0;
        nla_for_each_nested(spec_list, tb[NL80211_SAR_ATTR_SPECS], rem)
                specs++;

        if (specs > rdev->wiphy.sar_capa->num_freq_ranges)
                return -EINVAL;

        sar_spec = kzalloc(struct_size(sar_spec, sub_specs, specs), GFP_KERNEL);
        if (!sar_spec)
                return -ENOMEM;

        sar_spec->type = type;

        /* second pass: parse and validate each sub-spec */
        specs = 0;
        nla_for_each_nested(spec_list, tb[NL80211_SAR_ATTR_SPECS], rem) {
                /*
                 * As above: on failure spec[] may be uninitialized or still
                 * hold the previous iteration's pointers, so bail out.
                 */
                err = nla_parse_nested(spec, NL80211_SAR_ATTR_SPECS_MAX,
                                       spec_list, NULL, NULL);
                if (err)
                        goto error;

                switch (type) {
                case NL80211_SAR_TYPE_POWER:
                        if (nl80211_set_sar_sub_specs(rdev, sar_spec,
                                                      spec, specs)) {
                                err = -EINVAL;
                                goto error;
                        }
                        break;
                default:
                        err = -EINVAL;
                        goto error;
                }
                specs++;
        }

        sar_spec->num_sub_specs = specs;

        /* expose the request to the driver for the duration of the call */
        rdev->cur_cmd_info = info;
        err = rdev_set_sar_specs(rdev, sar_spec);
        rdev->cur_cmd_info = NULL;
error:
        kfree(sar_spec);
        return err;
}

/*
 * IFLAGS(flags) maps an NL80211_FLAG_* combination to its selector index.
 *
 * With this definition of SELECTOR(), INTERNAL_FLAG_SELECTORS(__val)
 * expands to a chain of conditionals of the form
 *   (__val == value0) ? NL80211_IFL_SEL_name0 :
 *   (__val == value1) ? NL80211_IFL_SEL_name1 : ...
 * whose final "else" arm is a call to __missing_selector().
 *
 * __missing_selector() is only declared here; presumably it is never
 * defined, so that using a flag combination that is not listed in
 * INTERNAL_FLAG_SELECTORS() breaks the build - TODO confirm.
 *
 * SELECTOR is intentionally left defined for the op tables below.
 */
#define SELECTOR(__sel, name, value) \
        ((__sel) == (value)) ? NL80211_IFL_SEL_##name :
int __missing_selector(void);
#define IFLAGS(__val) INTERNAL_FLAG_SELECTORS(__val) __missing_selector()

/*
 * Full-size genl_ops entries. NL80211_CMD_GET_WIPHY is the only entry here
 * and the only one in view that sets a .done callback.
 */
static const struct genl_ops nl80211_ops[] = {
        {
                /* no .flags set: can be retrieved by unprivileged users */
                .cmd = NL80211_CMD_GET_WIPHY,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY),
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_get_wiphy,
                .dumpit = nl80211_dump_wiphy,
                .done = nl80211_dump_wiphy_done,
        },
};

static const struct genl_small_ops nl80211_small_ops[] = {
        {
                .cmd = NL80211_CMD_SET_WIPHY,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_wiphy,
                .flags = GENL_UNS_ADMIN_PERM,
        },
        {
                .cmd = NL80211_CMD_GET_INTERFACE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_get_interface,
                .dumpit = nl80211_dump_interface,
                /* can be retrieved by unprivileged users */
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV),
        },
        {
                .cmd = NL80211_CMD_SET_INTERFACE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_interface,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV |
                                         NL80211_FLAG_NEED_RTNL),
        },
        {
                .cmd = NL80211_CMD_NEW_INTERFACE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_new_interface,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags =
                        IFLAGS(NL80211_FLAG_NEED_WIPHY |
                               NL80211_FLAG_NEED_RTNL |
                               /* we take the wiphy mutex later ourselves */
                               NL80211_FLAG_NO_WIPHY_MTX),
        },
        {
                .cmd = NL80211_CMD_DEL_INTERFACE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_del_interface,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV |
                                         NL80211_FLAG_NEED_RTNL),
        },
        {
                .cmd = NL80211_CMD_GET_KEY,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_get_key,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_SET_KEY,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_key,
                .flags = GENL_UNS_ADMIN_PERM,
                /* cannot use NL80211_FLAG_MLO_VALID_LINK_ID, depends on key */
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_CLEAR_SKB),
        },
        {
                .cmd = NL80211_CMD_NEW_KEY,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_new_key,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_CLEAR_SKB),
        },
        {
                .cmd = NL80211_CMD_DEL_KEY,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_del_key,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_SET_BEACON,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .flags = GENL_UNS_ADMIN_PERM,
                .doit = nl80211_set_beacon,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_START_AP,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .flags = GENL_UNS_ADMIN_PERM,
                .doit = nl80211_start_ap,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_STOP_AP,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .flags = GENL_UNS_ADMIN_PERM,
                .doit = nl80211_stop_ap,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_GET_STATION,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_get_station,
                .dumpit = nl80211_dump_station,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV),
        },
        {
                .cmd = NL80211_CMD_SET_STATION,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_station,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_NEW_STATION,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_new_station,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_DEL_STATION,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_del_station,
                .flags = GENL_UNS_ADMIN_PERM,
                /* cannot use NL80211_FLAG_MLO_VALID_LINK_ID, depends on
                 * whether MAC address is passed or not. If MAC address is
                 * passed, then even during MLO, link ID is not required.
                 */
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_GET_MPATH,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_get_mpath,
                .dumpit = nl80211_dump_mpath,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_GET_MPP,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_get_mpp,
                .dumpit = nl80211_dump_mpp,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_SET_MPATH,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_mpath,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_NEW_MPATH,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_new_mpath,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_DEL_MPATH,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_del_mpath,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_SET_BSS,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_bss,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_GET_REG,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_get_reg_do,
                .dumpit = nl80211_get_reg_dump,
                /* can be retrieved by unprivileged users */
        },
#ifdef CONFIG_CFG80211_CRDA_SUPPORT
        {
                .cmd = NL80211_CMD_SET_REG,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_reg,
                .flags = GENL_ADMIN_PERM,
        },
#endif
        {
                .cmd = NL80211_CMD_REQ_SET_REG,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_req_set_reg,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = NL80211_CMD_RELOAD_REGDB,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_reload_regdb,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = NL80211_CMD_GET_MESH_CONFIG,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_get_mesh_config,
                /* can be retrieved by unprivileged users */
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_SET_MESH_CONFIG,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_update_mesh_config,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_TRIGGER_SCAN,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_trigger_scan,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP),
        },
        {
                .cmd = NL80211_CMD_ABORT_SCAN,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_abort_scan,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP),
        },
        {
                .cmd = NL80211_CMD_GET_SCAN,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .dumpit = nl80211_dump_scan,
        },
        {
                .cmd = NL80211_CMD_START_SCHED_SCAN,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_start_sched_scan,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_STOP_SCHED_SCAN,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_stop_sched_scan,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_AUTHENTICATE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_authenticate,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_CLEAR_SKB),
        },
        {
                .cmd = NL80211_CMD_ASSOCIATE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_associate,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_CLEAR_SKB),
        },
        {
                .cmd = NL80211_CMD_DEAUTHENTICATE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_deauthenticate,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_DISASSOCIATE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_disassociate,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_JOIN_IBSS,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_join_ibss,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_LEAVE_IBSS,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_leave_ibss,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
#ifdef CONFIG_NL80211_TESTMODE
        {
                .cmd = NL80211_CMD_TESTMODE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_testmode_do,
                .dumpit = nl80211_testmode_dump,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY),
        },
#endif
        {
                .cmd = NL80211_CMD_CONNECT,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_connect,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_CLEAR_SKB),
        },
        {
                .cmd = NL80211_CMD_UPDATE_CONNECT_PARAMS,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_update_connect_params,
                .flags = GENL_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_CLEAR_SKB),
        },
        {
                .cmd = NL80211_CMD_DISCONNECT,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_disconnect,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_SET_WIPHY_NETNS,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_wiphy_netns,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY |
                                         NL80211_FLAG_NEED_RTNL |
                                         NL80211_FLAG_NO_WIPHY_MTX),
        },
        {
                .cmd = NL80211_CMD_GET_SURVEY,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .dumpit = nl80211_dump_survey,
        },
        {
                .cmd = NL80211_CMD_SET_PMKSA,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_pmksa,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_CLEAR_SKB),
        },
        {
                .cmd = NL80211_CMD_DEL_PMKSA,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_del_pmksa,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_FLUSH_PMKSA,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_flush_pmksa,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_REMAIN_ON_CHANNEL,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_remain_on_channel,
                .flags = GENL_UNS_ADMIN_PERM,
                /* FIXME: requiring a link ID here is probably not good */
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP |
                                         NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_cancel_remain_on_channel,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP),
        },
        {
                .cmd = NL80211_CMD_SET_TX_BITRATE_MASK,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_tx_bitrate_mask,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV |
                                         NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_REGISTER_FRAME,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_register_mgmt,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV),
        },
        {
                .cmd = NL80211_CMD_FRAME,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_tx_mgmt,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP),
        },
        {
                .cmd = NL80211_CMD_FRAME_WAIT_CANCEL,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_tx_mgmt_cancel_wait,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP),
        },
        {
                .cmd = NL80211_CMD_SET_POWER_SAVE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_power_save,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV),
        },
        {
                .cmd = NL80211_CMD_GET_POWER_SAVE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_get_power_save,
                /* can be retrieved by unprivileged users */
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV),
        },
        {
                .cmd = NL80211_CMD_SET_CQM,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_cqm,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV),
        },
        {
                .cmd = NL80211_CMD_SET_CHANNEL,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_channel,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV |
                                         NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_JOIN_MESH,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_join_mesh,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_LEAVE_MESH,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_leave_mesh,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_JOIN_OCB,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_join_ocb,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_LEAVE_OCB,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_leave_ocb,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
#ifdef CONFIG_PM
        {
                .cmd = NL80211_CMD_GET_WOWLAN,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_get_wowlan,
                /* can be retrieved by unprivileged users */
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY),
        },
        {
                .cmd = NL80211_CMD_SET_WOWLAN,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_wowlan,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY),
        },
#endif
        {
                .cmd = NL80211_CMD_SET_REKEY_OFFLOAD,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_rekey_data,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_CLEAR_SKB),
        },
        {
                .cmd = NL80211_CMD_TDLS_MGMT,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_tdls_mgmt,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_TDLS_OPER,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_tdls_oper,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_UNEXPECTED_FRAME,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_register_unexpected_frame,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV),
        },
        {
                .cmd = NL80211_CMD_PROBE_CLIENT,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_probe_client,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_REGISTER_BEACONS,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_register_beacons,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY),
        },
        {
                .cmd = NL80211_CMD_SET_NOACK_MAP,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_noack_map,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV),
        },
        {
                .cmd = NL80211_CMD_START_P2P_DEVICE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_start_p2p_device,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV |
                                         NL80211_FLAG_NEED_RTNL),
        },
        {
                .cmd = NL80211_CMD_STOP_P2P_DEVICE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_stop_p2p_device,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP |
                                         NL80211_FLAG_NEED_RTNL),
        },
        {
                .cmd = NL80211_CMD_START_NAN,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_start_nan,
                .flags = GENL_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV |
                                         NL80211_FLAG_NEED_RTNL),
        },
        {
                .cmd = NL80211_CMD_STOP_NAN,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_stop_nan,
                .flags = GENL_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP |
                                         NL80211_FLAG_NEED_RTNL),
        },
        {
                .cmd = NL80211_CMD_ADD_NAN_FUNCTION,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_nan_add_func,
                .flags = GENL_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP),
        },
        {
                .cmd = NL80211_CMD_DEL_NAN_FUNCTION,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_nan_del_func,
                .flags = GENL_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP),
        },
        {
                .cmd = NL80211_CMD_CHANGE_NAN_CONFIG,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_nan_change_config,
                .flags = GENL_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP),
        },
        {
                .cmd = NL80211_CMD_SET_MCAST_RATE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_mcast_rate,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV),
        },
        {
                .cmd = NL80211_CMD_SET_MAC_ACL,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_mac_acl,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV |
                                         NL80211_FLAG_MLO_UNSUPPORTED),
        },
        {
                .cmd = NL80211_CMD_RADAR_DETECT,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_start_radar_detection,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_NO_WIPHY_MTX |
                                         NL80211_FLAG_MLO_UNSUPPORTED),
        },
        {
                .cmd = NL80211_CMD_GET_PROTOCOL_FEATURES,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_get_protocol_features,
        },
        {
                .cmd = NL80211_CMD_UPDATE_FT_IES,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_update_ft_ies,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_CRIT_PROTOCOL_START,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_crit_protocol_start,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP),
        },
        {
                .cmd = NL80211_CMD_CRIT_PROTOCOL_STOP,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_crit_protocol_stop,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP),
        },
        {
                .cmd = NL80211_CMD_GET_COALESCE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_get_coalesce,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY),
        },
        {
                .cmd = NL80211_CMD_SET_COALESCE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_coalesce,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY),
        },
        {
                .cmd = NL80211_CMD_CHANNEL_SWITCH,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_channel_switch,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_VENDOR,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_vendor_cmd,
                .dumpit = nl80211_vendor_cmd_dump,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY |
                                         NL80211_FLAG_CLEAR_SKB),
        },
        {
                .cmd = NL80211_CMD_SET_QOS_MAP,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_qos_map,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_ADD_TX_TS,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_add_tx_ts,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_MLO_UNSUPPORTED),
        },
        {
                .cmd = NL80211_CMD_DEL_TX_TS,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_del_tx_ts,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_TDLS_CHANNEL_SWITCH,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_tdls_channel_switch,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_TDLS_CANCEL_CHANNEL_SWITCH,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_tdls_cancel_channel_switch,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_SET_MULTICAST_TO_UNICAST,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_multicast_to_unicast,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV),
        },
        {
                .cmd = NL80211_CMD_SET_PMK,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_pmk,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_CLEAR_SKB),
        },
        {
                .cmd = NL80211_CMD_DEL_PMK,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_del_pmk,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_EXTERNAL_AUTH,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_external_auth,
                .flags = GENL_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_CONTROL_PORT_FRAME,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_tx_control_port,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_GET_FTM_RESPONDER_STATS,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_get_ftm_responder_stats,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV |
                                         NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_PEER_MEASUREMENT_START,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_pmsr_start,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WDEV_UP),
        },
        {
                .cmd = NL80211_CMD_NOTIFY_RADAR,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_notify_radar_detection,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_UPDATE_OWE_INFO,
                .doit = nl80211_update_owe_info,
                .flags = GENL_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_PROBE_MESH_LINK,
                .doit = nl80211_probe_mesh_link,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_SET_TID_CONFIG,
                .doit = nl80211_set_tid_config,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV |
                                         NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_SET_SAR_SPECS,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_sar_specs,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_WIPHY |
                                         NL80211_FLAG_NEED_RTNL),
        },
        {
                .cmd = NL80211_CMD_COLOR_CHANGE_REQUEST,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_color_change,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_SET_FILS_AAD,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_fils_aad,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_ADD_LINK,
                .doit = nl80211_add_link,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_REMOVE_LINK,
                .doit = nl80211_remove_link,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_ADD_LINK_STA,
                .doit = nl80211_add_link_station,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_MODIFY_LINK_STA,
                .doit = nl80211_modify_link_station,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_REMOVE_LINK_STA,
                .doit = nl80211_remove_link_station,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
                                         NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_SET_HW_TIMESTAMP,
                .doit = nl80211_set_hw_timestamp,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
        {
                .cmd = NL80211_CMD_SET_TID_TO_LINK_MAPPING,
                .doit = nl80211_set_ttlm,
                .flags = GENL_UNS_ADMIN_PERM,
                .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
        },
};

/*
 * Generic netlink family descriptor for nl80211.  It ties together the
 * attribute policy, the regular and small operation tables, the multicast
 * groups and the pre/post doit hooks defined elsewhere in this file.
 */
static struct genl_family nl80211_fam __ro_after_init = {
        /* family identification */
        .name = NL80211_GENL_NAME,        /* have users key off the name instead */
        .version = 1,                        /* no particular meaning now */
        .hdrsize = 0,                        /* no private header */
        .module = THIS_MODULE,

        /* family-wide behaviour switches */
        .netnsok = true,
        .parallel_ops = true,

        /* attribute parsing */
        .maxattr = NL80211_ATTR_MAX,
        .policy = nl80211_policy,

        /* hooks wrapped around the doit handlers */
        .pre_doit = nl80211_pre_doit,
        .post_doit = nl80211_post_doit,

        /* operation tables */
        .ops = nl80211_ops,
        .n_ops = ARRAY_SIZE(nl80211_ops),
        .small_ops = nl80211_small_ops,
        .n_small_ops = ARRAY_SIZE(nl80211_small_ops),
        .resv_start_op = NL80211_CMD_REMOVE_LINK_STA + 1,

        /* multicast groups */
        .mcgrps = nl80211_mcgrps,
        .n_mcgrps = ARRAY_SIZE(nl80211_mcgrps),
};

/* notification functions */

/*
 * Build a wiphy message for @rdev carrying command @cmd and multicast it
 * on the NL80211_MCGRP_CONFIG group in the wiphy's network namespace.
 *
 * @cmd is expected to be NL80211_CMD_NEW_WIPHY or NL80211_CMD_DEL_WIPHY;
 * anything else triggers a warning but is still sent.  Allocation or
 * message-fill failures are silent: nothing is sent and nothing is
 * reported to the caller.
 */
void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev,
                          enum nl80211_commands cmd)
{
        struct nl80211_dump_wiphy_state state = {};
        struct sk_buff *skb;

        WARN_ON(cmd != NL80211_CMD_NEW_WIPHY &&
                cmd != NL80211_CMD_DEL_WIPHY);

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb)
                return;

        if (nl80211_send_wiphy(rdev, cmd, skb, 0, 0, 0, &state) >= 0) {
                /* the multicast result is deliberately not checked */
                genlmsg_multicast_netns(&nl80211_fam,
                                        wiphy_net(&rdev->wiphy), skb, 0,
                                        NL80211_MCGRP_CONFIG, GFP_KERNEL);
                return;
        }

        /* filling the message failed: drop it without notifying anyone */
        nlmsg_free(skb);
}

/*
 * Build an interface message describing @wdev (owned by @rdev) with
 * command @cmd and multicast it on the NL80211_MCGRP_CONFIG group in the
 * wiphy's network namespace.
 *
 * Allocation or message-fill failures are silent: nothing is sent and
 * nothing is reported to the caller.
 */
void nl80211_notify_iface(struct cfg80211_registered_device *rdev,
                                struct wireless_dev *wdev,
                                enum nl80211_commands cmd)
{
        struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);

        if (!skb)
                return;

        if (nl80211_send_iface(skb, 0, 0, 0, rdev, wdev, cmd) >= 0) {
                /* the multicast result is deliberately not checked */
                genlmsg_multicast_netns(&nl80211_fam,
                                        wiphy_net(&rdev->wiphy), skb, 0,
                                        NL80211_MCGRP_CONFIG, GFP_KERNEL);
                return;
        }

        /* filling the message failed: drop it without notifying anyone */
        nlmsg_free(skb);
}

static int nl80211_add_scan_req(struct sk_buff *msg,
                                struct cfg80211_registered_device *rdev)
{
        struct cfg80211_scan_request *req = rdev->scan_req;
        struct nlattr *nest;
        int i;
        struct cfg80211_scan_info *info;

        if (WARN_ON(!req))
                return 0;

        nest = nla_nest_start_noflag(msg, NL80211_ATTR_SCAN_SSIDS);
        if (!nest)
                goto nla_put_failure;
        for (i = 0; i < req->n_ssids; i++) {
                if (nla_put(msg, i, req->ssids[i].ssid_len, req->ssids[i].ssid))
                        goto nla_put_failure;
        }
        nla_nest_end(msg, nest);

        if (req->flags & NL80211_SCAN_FLAG_FREQ_KHZ) {
                nest = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQ_KHZ);
                if (!nest)
                        goto nla_put_failure;
                for (i = 0; i < req->n_channels; i++) {
                        if (nla_put_u32(msg, i,
                                   ieee80211_channel_to_khz(req->channels[i])))
                                goto nla_put_failure;
                }
                nla_nest_end(msg, nest);
        } else {
                nest = nla_nest_start_noflag(msg,
                                             NL80211_ATTR_SCAN_FREQUENCIES);
                if (!nest)
                        goto nla_put_failure;
                for (i = 0; i < req->n_channels; i++) {
                        if (nla_put_u32(msg, i, req->channels[i]->center_freq))
                                goto nla_put_failure;
                }
                nla_nest_end(msg, nest);
        }

        if (req->ie &&
            nla_put(msg, NL80211_ATTR_IE, req->ie_len, req->ie))
                goto nla_put_failure;

        if (req->flags &&
            nla_put_u32(msg, NL80211_ATTR_SCAN_FLAGS, req->flags))
                goto nla_put_failure;

        info = rdev->int_scan_req ? &rdev->int_scan_req->info :
                &rdev->scan_req->info;
        if (info->scan_start_tsf &&
            (nla_put_u64_64bit(msg, NL80211_ATTR_SCAN_START_TIME_TSF,
                               info->scan_start_tsf, NL80211_BSS_PAD) ||
             nla_put(msg, NL80211_ATTR_SCAN_START_TIME_TSF_BSSID, ETH_ALEN,
                     info->tsf_bssid)))
                goto nla_put_failure;

        return 0;
 nla_put_failure:
        return -ENOBUFS;
}

/*
 * Fill @msg with a scan event: an nl80211 header for @cmd followed by the
 * wiphy index, the interface index (only when @wdev has a netdev), the
 * wdev identifier and, on a best-effort basis, the scan request details.
 *
 * Returns 0 on success, -1 if the header could not be added, or -EMSGSIZE
 * (after cancelling the message) if one of the identifying attributes did
 * not fit.
 */
static int nl80211_prep_scan_msg(struct sk_buff *msg,
                                 struct cfg80211_registered_device *rdev,
                                 struct wireless_dev *wdev,
                                 u32 portid, u32 seq, int flags,
                                 u32 cmd)
{
        void *hdr = nl80211hdr_put(msg, portid, seq, flags, cmd);

        if (!hdr)
                return -1;

        if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto cancel;

        if (wdev->netdev &&
            nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex))
                goto cancel;

        if (nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
                              NL80211_ATTR_PAD))
                goto cancel;

        /* ignore errors and send incomplete event anyway */
        nl80211_add_scan_req(msg, rdev);

        genlmsg_end(msg, hdr);
        return 0;

 cancel:
        genlmsg_cancel(msg, hdr);
        return -EMSGSIZE;
}

/*
 * Fill @msg with a scheduled-scan event for @req: an nl80211 header for
 * @cmd (port id, sequence and flags all zero) followed by the wiphy index,
 * the interface index of req->dev and req->reqid as the cookie.
 *
 * Returns 0 on success, -1 if the header could not be added, or -EMSGSIZE
 * (after cancelling the message) if an attribute did not fit.
 */
static int
nl80211_prep_sched_scan_msg(struct sk_buff *msg,
                            struct cfg80211_sched_scan_request *req, u32 cmd)
{
        void *hdr = nl80211hdr_put(msg, 0, 0, 0, cmd);

        if (!hdr)
                return -1;

        if (nla_put_u32(msg, NL80211_ATTR_WIPHY,
                        wiphy_to_rdev(req->wiphy)->wiphy_idx))
                goto cancel;

        if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, req->dev->ifindex))
                goto cancel;

        if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, req->reqid,
                              NL80211_ATTR_PAD))
                goto cancel;

        genlmsg_end(msg, hdr);
        return 0;

 cancel:
        genlmsg_cancel(msg, hdr);
        return -EMSGSIZE;
}

/*
 * Multicast an NL80211_CMD_TRIGGER_SCAN event for @wdev on the
 * NL80211_MCGRP_SCAN group in the wiphy's network namespace.
 *
 * Allocation or message-preparation failures are silent: nothing is sent.
 */
void nl80211_send_scan_start(struct cfg80211_registered_device *rdev,
                             struct wireless_dev *wdev)
{
        struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);

        if (!skb)
                return;

        if (nl80211_prep_scan_msg(skb, rdev, wdev, 0, 0, 0,
                                  NL80211_CMD_TRIGGER_SCAN) >= 0) {
                /* the multicast result is deliberately not checked */
                genlmsg_multicast_netns(&nl80211_fam,
                                        wiphy_net(&rdev->wiphy), skb, 0,
                                        NL80211_MCGRP_SCAN, GFP_KERNEL);
                return;
        }

        /* preparing the event failed: drop the message */
        nlmsg_free(skb);
}

/*
 * Allocate and fill a scan-finished event for @wdev without sending it.
 * The command is NL80211_CMD_SCAN_ABORTED when @aborted is true and
 * NL80211_CMD_NEW_SCAN_RESULTS otherwise.
 *
 * Returns the prepared message, or NULL if it could not be allocated or
 * filled (in which case the message has already been freed).
 */
struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev,
                                       struct wireless_dev *wdev, bool aborted)
{
        struct sk_buff *skb;
        u32 cmd;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb)
                return NULL;

        if (aborted)
                cmd = NL80211_CMD_SCAN_ABORTED;
        else
                cmd = NL80211_CMD_NEW_SCAN_RESULTS;

        if (nl80211_prep_scan_msg(skb, rdev, wdev, 0, 0, 0, cmd) >= 0)
                return skb;

        nlmsg_free(skb);
        return NULL;
}

/*
 * Multicast a message previously created by nl80211_build_scan_msg() on
 * the NL80211_MCGRP_SCAN group in the wiphy's network namespace.  A NULL
 * @msg is accepted and ignored.
 */
void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev,
                           struct sk_buff *msg)
{
        if (msg)
                genlmsg_multicast_netns(&nl80211_fam,
                                        wiphy_net(&rdev->wiphy), msg, 0,
                                        NL80211_MCGRP_SCAN, GFP_KERNEL);
}

/*
 * Send a scheduled-scan notification with command @cmd for @req to the
 * NL80211_MCGRP_SCAN group. Allocation or fill failures drop the event.
 */
void nl80211_send_sched_scan(struct cfg80211_sched_scan_request *req, u32 cmd)
{
        struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);

        if (!skb)
                return;

        if (nl80211_prep_sched_scan_msg(skb, req, cmd) < 0)
                goto free_skb;

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(req->wiphy), skb, 0,
                                NL80211_MCGRP_SCAN, GFP_KERNEL);
        return;

free_skb:
        nlmsg_free(skb);
}

static bool nl80211_reg_change_event_fill(struct sk_buff *msg,
                                          struct regulatory_request *request)
{
        /* Userspace can always count this one always being set */
        if (nla_put_u8(msg, NL80211_ATTR_REG_INITIATOR, request->initiator))
                goto nla_put_failure;

        if (request->alpha2[0] == '0' && request->alpha2[1] == '0') {
                if (nla_put_u8(msg, NL80211_ATTR_REG_TYPE,
                               NL80211_REGDOM_TYPE_WORLD))
                        goto nla_put_failure;
        } else if (request->alpha2[0] == '9' && request->alpha2[1] == '9') {
                if (nla_put_u8(msg, NL80211_ATTR_REG_TYPE,
                               NL80211_REGDOM_TYPE_CUSTOM_WORLD))
                        goto nla_put_failure;
        } else if ((request->alpha2[0] == '9' && request->alpha2[1] == '8') ||
                   request->intersect) {
                if (nla_put_u8(msg, NL80211_ATTR_REG_TYPE,
                               NL80211_REGDOM_TYPE_INTERSECTION))
                        goto nla_put_failure;
        } else {
                if (nla_put_u8(msg, NL80211_ATTR_REG_TYPE,
                               NL80211_REGDOM_TYPE_COUNTRY) ||
                    nla_put_string(msg, NL80211_ATTR_REG_ALPHA2,
                                   request->alpha2))
                        goto nla_put_failure;
        }

        if (request->wiphy_idx != WIPHY_IDX_INVALID) {
                struct wiphy *wiphy = wiphy_idx_to_wiphy(request->wiphy_idx);

                if (wiphy &&
                    nla_put_u32(msg, NL80211_ATTR_WIPHY, request->wiphy_idx))
                        goto nla_put_failure;

                if (wiphy &&
                    wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED &&
                    nla_put_flag(msg, NL80211_ATTR_WIPHY_SELF_MANAGED_REG))
                        goto nla_put_failure;
        }

        return true;

nla_put_failure:
        return false;
}

/*
 * Send a regulatory change event with command @cmd_id for @request.
 *
 * This can happen on global regulatory changes or device specific settings
 * based on custom regulatory domains. The event is multicast to the
 * NL80211_MCGRP_REGULATORY group in all network namespaces; if the message
 * cannot be built it is freed and nothing is sent.
 */
void nl80211_common_reg_change_event(enum nl80211_commands cmd_id,
                                     struct regulatory_request *request)
{
        struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        void *hdr;

        if (!skb)
                return;

        hdr = nl80211hdr_put(skb, 0, 0, 0, cmd_id);
        if (!hdr || !nl80211_reg_change_event_fill(skb, request)) {
                nlmsg_free(skb);
                return;
        }

        genlmsg_end(skb, hdr);

        rcu_read_lock();
        genlmsg_multicast_allns(&nl80211_fam, skb, 0,
                                NL80211_MCGRP_REGULATORY, GFP_ATOMIC);
        rcu_read_unlock();
}

/*
 * Parameters for one MLME event, consumed by nl80211_send_mlme_event().
 */
struct nl80211_mlme_event {
        /* nl80211 command placed in the message header */
        enum nl80211_commands cmd;
        /* frame data sent as NL80211_ATTR_FRAME */
        const u8 *buf;
        /* length of @buf in bytes */
        size_t buf_len;
        /*
         * U-APSD queues value; only reported (nested in
         * NL80211_ATTR_STA_WME) when >= 0, so senders use -1 for "none"
         */
        int uapsd_queues;
        /* optional request IEs, sent as NL80211_ATTR_REQ_IE when non-NULL */
        const u8 *req_ies;
        /* length of @req_ies in bytes */
        size_t req_ies_len;
        /* when true, NL80211_ATTR_RECONNECT_REQUESTED is added as a flag */
        bool reconnect;
};

/*
 * Build and multicast the MLME event described by @event.
 *
 * The message always carries the wiphy index, the ifindex of @netdev and
 * the frame (NL80211_ATTR_FRAME). Optional parts are the request IEs (when
 * event->req_ies is set), the reconnect-requested flag, and a nested
 * NL80211_ATTR_STA_WME holding the U-APSD queues (when
 * event->uapsd_queues is non-negative). The event goes to the
 * NL80211_MCGRP_MLME group; on any failure the message is freed and
 * nothing is sent.
 */
static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev,
                                    struct net_device *netdev,
                                    const struct nl80211_mlme_event *event,
                                    gfp_t gfp)
{
        struct nlattr *wmm;
        struct sk_buff *skb;
        void *hdr;

        /*
         * Variable-length payloads plus a fixed 100-byte allowance
         * (presumably for the header and the small attributes).
         */
        skb = nlmsg_new(100 + event->buf_len + event->req_ies_len, gfp);
        if (!skb)
                return;

        hdr = nl80211hdr_put(skb, 0, 0, 0, event->cmd);
        if (!hdr)
                goto free_skb;

        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_skb;
        if (nla_put_u32(skb, NL80211_ATTR_IFINDEX, netdev->ifindex))
                goto free_skb;
        if (nla_put(skb, NL80211_ATTR_FRAME, event->buf_len, event->buf))
                goto free_skb;
        if (event->req_ies &&
            nla_put(skb, NL80211_ATTR_REQ_IE, event->req_ies_len,
                    event->req_ies))
                goto free_skb;

        if (event->reconnect &&
            nla_put_flag(skb, NL80211_ATTR_RECONNECT_REQUESTED))
                goto free_skb;

        if (event->uapsd_queues >= 0) {
                wmm = nla_nest_start_noflag(skb, NL80211_ATTR_STA_WME);
                if (!wmm ||
                    nla_put_u8(skb, NL80211_STA_WME_UAPSD_QUEUES,
                               event->uapsd_queues))
                        goto free_skb;

                nla_nest_end(skb, wmm);
        }

        genlmsg_end(skb, hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

free_skb:
        nlmsg_free(skb);
}

/*
 * Report a received authentication frame (@buf, @len) to userspace as an
 * NL80211_CMD_AUTHENTICATE event, without U-APSD information.
 */
void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev,
                          struct net_device *netdev, const u8 *buf,
                          size_t len, gfp_t gfp)
{
        struct nl80211_mlme_event ev = {
                .uapsd_queues = -1,
                .buf_len = len,
                .buf = buf,
                .cmd = NL80211_CMD_AUTHENTICATE,
        };

        nl80211_send_mlme_event(rdev, netdev, &ev, gfp);
}

void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev,
                           struct net_device *netdev,
                           const struct cfg80211_rx_assoc_resp_data *data)
{
        struct nl80211_mlme_event event = {
                .cmd = NL80211_CMD_ASSOCIATE,
                .buf = data->buf,
                .buf_len = data->len,
                .uapsd_queues = data->uapsd_queues,
                .req_ies = data->req_ies,
                .req_ies_len = data->req_ies_len,
        };

        nl80211_send_mlme_event(rdev, netdev, &event, GFP_KERNEL);
}

/*
 * Report a deauthentication frame (@buf, @len) to userspace as an
 * NL80211_CMD_DEAUTHENTICATE event. @reconnect is forwarded so the
 * reconnect-requested flag can be added to the event.
 */
void nl80211_send_deauth(struct cfg80211_registered_device *rdev,
                         struct net_device *netdev, const u8 *buf,
                         size_t len, bool reconnect, gfp_t gfp)
{
        struct nl80211_mlme_event ev = {
                .uapsd_queues = -1,
                .reconnect = reconnect,
                .buf_len = len,
                .buf = buf,
                .cmd = NL80211_CMD_DEAUTHENTICATE,
        };

        nl80211_send_mlme_event(rdev, netdev, &ev, gfp);
}

/*
 * Report a disassociation frame (@buf, @len) to userspace as an
 * NL80211_CMD_DISASSOCIATE event. @reconnect is forwarded so the
 * reconnect-requested flag can be added to the event.
 */
void nl80211_send_disassoc(struct cfg80211_registered_device *rdev,
                           struct net_device *netdev, const u8 *buf,
                           size_t len, bool reconnect, gfp_t gfp)
{
        struct nl80211_mlme_event ev = {
                .uapsd_queues = -1,
                .reconnect = reconnect,
                .buf_len = len,
                .buf = buf,
                .cmd = NL80211_CMD_DISASSOCIATE,
        };

        nl80211_send_mlme_event(rdev, netdev, &ev, gfp);
}

/*
 * Report an unprotected deauth, disassoc or beacon frame to userspace.
 *
 * The frame control field selects the NL80211_CMD_UNPROT_* command; any
 * other frame type is ignored. Beacon reports are skipped while fewer
 * than 10000 ms have elapsed since the previously reported one on this
 * wdev. Frames shorter than two bytes trigger a warning and are dropped.
 */
void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf,
                                  size_t len)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        const struct ieee80211_mgmt *frame = (void *)buf;
        struct nl80211_mlme_event ev = {
                .uapsd_queues = -1,
                .buf_len = len,
                .buf = buf,
        };

        /* frame_control is read below, so at least two bytes are needed */
        if (WARN_ON(len < 2))
                return;

        if (ieee80211_is_deauth(frame->frame_control)) {
                ev.cmd = NL80211_CMD_UNPROT_DEAUTHENTICATE;
        } else if (ieee80211_is_disassoc(frame->frame_control)) {
                ev.cmd = NL80211_CMD_UNPROT_DISASSOCIATE;
        } else if (ieee80211_is_beacon(frame->frame_control)) {
                /* Rate-limit beacon reports */
                if (wdev->unprot_beacon_reported &&
                    elapsed_jiffies_msecs(wdev->unprot_beacon_reported) < 10000)
                        return;

                ev.cmd = NL80211_CMD_UNPROT_BEACON;
                wdev->unprot_beacon_reported = jiffies;
        } else {
                return;
        }

        trace_cfg80211_rx_unprot_mlme_mgmt(dev, buf, len);
        nl80211_send_mlme_event(rdev, dev, &ev, GFP_ATOMIC);
}
EXPORT_SYMBOL(cfg80211_rx_unprot_mlme_mgmt);

/*
 * Send an MLME timeout event with command @cmd for peer @addr.
 *
 * The message contains the wiphy index, the ifindex of @netdev, the
 * NL80211_ATTR_TIMED_OUT flag and the MAC address, and is multicast to
 * the NL80211_MCGRP_MLME group. Failures drop the event.
 */
static void nl80211_send_mlme_timeout(struct cfg80211_registered_device *rdev,
                                      struct net_device *netdev, int cmd,
                                      const u8 *addr, gfp_t gfp)
{
        struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        void *hdr;

        if (!skb)
                return;

        hdr = nl80211hdr_put(skb, 0, 0, 0, cmd);
        if (!hdr)
                goto free_skb;

        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_skb;
        if (nla_put_u32(skb, NL80211_ATTR_IFINDEX, netdev->ifindex))
                goto free_skb;
        if (nla_put_flag(skb, NL80211_ATTR_TIMED_OUT))
                goto free_skb;
        if (nla_put(skb, NL80211_ATTR_MAC, ETH_ALEN, addr))
                goto free_skb;

        genlmsg_end(skb, hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

free_skb:
        nlmsg_free(skb);
}

/*
 * Report an authentication timeout for @addr: an NL80211_CMD_AUTHENTICATE
 * event that carries the timed-out flag.
 */
void nl80211_send_auth_timeout(struct cfg80211_registered_device *rdev,
                               struct net_device *netdev, const u8 *addr,
                               gfp_t gfp)
{
        const int cmd = NL80211_CMD_AUTHENTICATE;

        nl80211_send_mlme_timeout(rdev, netdev, cmd, addr, gfp);
}

/*
 * Report an association timeout for @addr: an NL80211_CMD_ASSOCIATE
 * event that carries the timed-out flag.
 */
void nl80211_send_assoc_timeout(struct cfg80211_registered_device *rdev,
                                struct net_device *netdev, const u8 *addr,
                                gfp_t gfp)
{
        const int cmd = NL80211_CMD_ASSOCIATE;

        nl80211_send_mlme_timeout(rdev, netdev, cmd, addr, gfp);
}

/*
 * Send an NL80211_CMD_CONNECT event describing the connection result @cr.
 *
 * The event is multicast to the NL80211_MCGRP_MLME group in the wiphy's
 * network namespace. A negative cr->status is reported as
 * WLAN_STATUS_UNSPECIFIED_FAILURE together with the timed-out flag and
 * the timeout reason. FILS KEK/PMK/PMKID are only included when the
 * status is WLAN_STATUS_SUCCESS. When cr->valid_links is set, per-link
 * information is added as a nested NL80211_ATTR_MLO_LINKS attribute.
 * On allocation or attribute failure the message is freed and nothing
 * is sent.
 */
void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
                                 struct net_device *netdev,
                                 struct cfg80211_connect_resp_params *cr,
                                 gfp_t gfp)
{
        struct sk_buff *msg;
        void *hdr;
        unsigned int link;
        size_t link_info_size = 0;
        /* With valid links the AP MLD address is reported, else link 0's BSSID */
        const u8 *connected_addr = cr->valid_links ?
                                   cr->ap_mld_addr : cr->links[0].bssid;

        /* Size the extra room needed for the per-link nested attributes */
        if (cr->valid_links) {
                for_each_valid_link(cr, link) {
                        /* Nested attribute header */
                        link_info_size += NLA_HDRLEN;
                        /* Link ID */
                        link_info_size += nla_total_size(sizeof(u8));
                        /* Link address, if known */
                        link_info_size += cr->links[link].addr ?
                                          nla_total_size(ETH_ALEN) : 0;
                        /* BSSID, from either the bssid pointer or the BSS */
                        link_info_size += (cr->links[link].bssid ||
                                           cr->links[link].bss) ?
                                          nla_total_size(ETH_ALEN) : 0;
                        /* Per-link status code */
                        link_info_size += nla_total_size(sizeof(u16));
                }
        }

        msg = nlmsg_new(100 + cr->req_ie_len + cr->resp_ie_len +
                        cr->fils.kek_len + cr->fils.pmk_len +
                        (cr->fils.pmkid ? WLAN_PMKID_LEN : 0) + link_info_size,
                        gfp);
        if (!msg)
                return;

        hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_CONNECT);
        if (!hdr) {
                nlmsg_free(msg);
                return;
        }

        /*
         * The || chain stops at the first attribute that fails to fit.
         * Optional attributes are guarded by their pointer or flag.
         */
        if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
            nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
            (connected_addr &&
             nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, connected_addr)) ||
            nla_put_u16(msg, NL80211_ATTR_STATUS_CODE,
                        cr->status < 0 ? WLAN_STATUS_UNSPECIFIED_FAILURE :
                        cr->status) ||
            (cr->status < 0 &&
             (nla_put_flag(msg, NL80211_ATTR_TIMED_OUT) ||
              nla_put_u32(msg, NL80211_ATTR_TIMEOUT_REASON,
                          cr->timeout_reason))) ||
            (cr->req_ie &&
             nla_put(msg, NL80211_ATTR_REQ_IE, cr->req_ie_len, cr->req_ie)) ||
            (cr->resp_ie &&
             nla_put(msg, NL80211_ATTR_RESP_IE, cr->resp_ie_len,
                     cr->resp_ie)) ||
            (cr->fils.update_erp_next_seq_num &&
             nla_put_u16(msg, NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM,
                         cr->fils.erp_next_seq_num)) ||
            (cr->status == WLAN_STATUS_SUCCESS &&
             ((cr->fils.kek &&
               nla_put(msg, NL80211_ATTR_FILS_KEK, cr->fils.kek_len,
                       cr->fils.kek)) ||
              (cr->fils.pmk &&
               nla_put(msg, NL80211_ATTR_PMK, cr->fils.pmk_len, cr->fils.pmk)) ||
              (cr->fils.pmkid &&
               nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, cr->fils.pmkid)))))
                goto nla_put_failure;

        if (cr->valid_links) {
                /* Nested entries are numbered starting from 1 */
                int i = 1;
                struct nlattr *nested;

                nested = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS);
                if (!nested)
                        goto nla_put_failure;

                for_each_valid_link(cr, link) {
                        struct nlattr *nested_mlo_links;
                        /* Prefer the BSSID of the BSS entry when one is set */
                        const u8 *bssid = cr->links[link].bss ?
                                          cr->links[link].bss->bssid :
                                          cr->links[link].bssid;

                        nested_mlo_links = nla_nest_start(msg, i);
                        if (!nested_mlo_links)
                                goto nla_put_failure;

                        if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link) ||
                            (bssid &&
                             nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, bssid)) ||
                            (cr->links[link].addr &&
                             nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN,
                                     cr->links[link].addr)) ||
                            nla_put_u16(msg, NL80211_ATTR_STATUS_CODE,
                                        cr->links[link].status))
                                goto nla_put_failure;

                        nla_nest_end(msg, nested_mlo_links);
                        i++;
                }
                nla_nest_end(msg, nested);
        }

        genlmsg_end(msg, hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

 nla_put_failure:
        /* Freeing the whole message also discards any open nests */
        nlmsg_free(msg);
}

/*
 * Send an NL80211_CMD_ROAM event describing the roam result @info.
 *
 * The event is multicast to the NL80211_MCGRP_MLME group in the wiphy's
 * network namespace. It carries the connected address, the optional
 * request/response IEs and FILS data, and, when info->valid_links is set,
 * per-link information nested in NL80211_ATTR_MLO_LINKS. On allocation
 * or attribute failure the message is freed and nothing is sent.
 */
void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
                         struct net_device *netdev,
                         struct cfg80211_roam_info *info, gfp_t gfp)
{
        struct sk_buff *msg;
        void *hdr;
        size_t link_info_size = 0;
        unsigned int link;
        /*
         * Address reported as NL80211_ATTR_MAC: the AP MLD address if set,
         * otherwise link 0's BSSID (taken from its BSS entry when present).
         */
        const u8 *connected_addr = info->ap_mld_addr ?
                                   info->ap_mld_addr :
                                   (info->links[0].bss ?
                                    info->links[0].bss->bssid :
                                    info->links[0].bssid);

        /* Size the extra room needed for the per-link nested attributes */
        if (info->valid_links) {
                for_each_valid_link(info, link) {
                        /* Nested attribute header */
                        link_info_size += NLA_HDRLEN;
                        /* Link ID */
                        link_info_size += nla_total_size(sizeof(u8));
                        /* Link address, if known */
                        link_info_size += info->links[link].addr ?
                                          nla_total_size(ETH_ALEN) : 0;
                        /* BSSID, from either the bssid pointer or the BSS */
                        link_info_size += (info->links[link].bssid ||
                                           info->links[link].bss) ?
                                          nla_total_size(ETH_ALEN) : 0;
                }
        }

        msg = nlmsg_new(100 + info->req_ie_len + info->resp_ie_len +
                        info->fils.kek_len + info->fils.pmk_len +
                        (info->fils.pmkid ? WLAN_PMKID_LEN : 0) +
                        link_info_size, gfp);
        if (!msg)
                return;

        hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_ROAM);
        if (!hdr) {
                nlmsg_free(msg);
                return;
        }

        /*
         * The || chain stops at the first attribute that fails to fit.
         * NOTE(review): connected_addr is passed to nla_put() without a NULL
         * check; presumably callers guarantee a BSS or BSSID for link 0 -
         * confirm against the callers.
         */
        if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
            nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
            nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, connected_addr) ||
            (info->req_ie &&
             nla_put(msg, NL80211_ATTR_REQ_IE, info->req_ie_len,
                     info->req_ie)) ||
            (info->resp_ie &&
             nla_put(msg, NL80211_ATTR_RESP_IE, info->resp_ie_len,
                     info->resp_ie)) ||
            (info->fils.update_erp_next_seq_num &&
             nla_put_u16(msg, NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM,
                         info->fils.erp_next_seq_num)) ||
            (info->fils.kek &&
             nla_put(msg, NL80211_ATTR_FILS_KEK, info->fils.kek_len,
                     info->fils.kek)) ||
            (info->fils.pmk &&
             nla_put(msg, NL80211_ATTR_PMK, info->fils.pmk_len, info->fils.pmk)) ||
            (info->fils.pmkid &&
             nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, info->fils.pmkid)))
                goto nla_put_failure;

        if (info->valid_links) {
                /* Nested entries are numbered starting from 1 */
                int i = 1;
                struct nlattr *nested;

                nested = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS);
                if (!nested)
                        goto nla_put_failure;

                for_each_valid_link(info, link) {
                        struct nlattr *nested_mlo_links;
                        /* Prefer the BSSID of the BSS entry when one is set */
                        const u8 *bssid = info->links[link].bss ?
                                          info->links[link].bss->bssid :
                                          info->links[link].bssid;

                        nested_mlo_links = nla_nest_start(msg, i);
                        if (!nested_mlo_links)
                                goto nla_put_failure;

                        if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link) ||
                            (bssid &&
                             nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, bssid)) ||
                            (info->links[link].addr &&
                             nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN,
                                     info->links[link].addr)))
                                goto nla_put_failure;

                        nla_nest_end(msg, nested_mlo_links);
                        i++;
                }
                nla_nest_end(msg, nested);
        }

        genlmsg_end(msg, hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

 nla_put_failure:
        /* Freeing the whole message also discards any open nests */
        nlmsg_free(msg);
}

/*
 * Send an NL80211_CMD_PORT_AUTHORIZED event for @peer_addr.
 *
 * The optional bitmap is added as NL80211_ATTR_TD_BITMAP only when
 * @td_bitmap is non-NULL and @td_bitmap_len is non-zero. The event is
 * multicast to the NL80211_MCGRP_MLME group; failures drop it.
 */
void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev,
                                  struct net_device *netdev, const u8 *peer_addr,
                                  const u8 *td_bitmap, u8 td_bitmap_len)
{
        struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        void *hdr;

        if (!skb)
                return;

        hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_PORT_AUTHORIZED);
        if (!hdr)
                goto free_skb;

        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_skb;
        if (nla_put_u32(skb, NL80211_ATTR_IFINDEX, netdev->ifindex))
                goto free_skb;
        if (nla_put(skb, NL80211_ATTR_MAC, ETH_ALEN, peer_addr))
                goto free_skb;

        if (td_bitmap_len > 0 && td_bitmap &&
            nla_put(skb, NL80211_ATTR_TD_BITMAP, td_bitmap_len, td_bitmap))
                goto free_skb;

        genlmsg_end(skb, hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
                                NL80211_MCGRP_MLME, GFP_KERNEL);
        return;

free_skb:
        nlmsg_free(skb);
}

/*
 * Send an NL80211_CMD_DISCONNECT event for @netdev.
 *
 * The reason code is included only when @reason is non-zero, the
 * disconnected-by-AP flag only when @from_ap is true, and the IEs only
 * when @ie is non-NULL. The event is multicast to the NL80211_MCGRP_MLME
 * group; failures drop it.
 */
void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
                               struct net_device *netdev, u16 reason,
                               const u8 *ie, size_t ie_len, bool from_ap)
{
        struct sk_buff *skb = nlmsg_new(100 + ie_len, GFP_KERNEL);
        void *hdr;

        if (!skb)
                return;

        hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_DISCONNECT);
        if (!hdr)
                goto free_skb;

        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_skb;
        if (nla_put_u32(skb, NL80211_ATTR_IFINDEX, netdev->ifindex))
                goto free_skb;
        if (reason && nla_put_u16(skb, NL80211_ATTR_REASON_CODE, reason))
                goto free_skb;
        if (from_ap && nla_put_flag(skb, NL80211_ATTR_DISCONNECTED_BY_AP))
                goto free_skb;
        if (ie && nla_put(skb, NL80211_ATTR_IE, ie_len, ie))
                goto free_skb;

        genlmsg_end(skb, hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
                                NL80211_MCGRP_MLME, GFP_KERNEL);
        return;

free_skb:
        nlmsg_free(skb);
}

/*
 * Remove the links in @link_mask from @dev and notify userspace.
 *
 * Only valid for station and P2P client interfaces. @link_mask must be a
 * non-empty proper subset of the currently valid links; otherwise a
 * warning is emitted and nothing changes. The link BSSes are released and
 * the valid-links mask updated before the NL80211_CMD_LINKS_REMOVED event
 * is built, so a failure to build the event does not undo the removal.
 */
void cfg80211_links_removed(struct net_device *dev, u16 link_mask)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct nlattr *links_attr;
        struct sk_buff *skb;
        void *hdr;

        lockdep_assert_wiphy(wdev->wiphy);
        trace_cfg80211_links_removed(dev, link_mask);

        if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
                    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
                return;

        if (WARN_ON(!wdev->valid_links || !link_mask ||
                    (wdev->valid_links & link_mask) != link_mask ||
                    wdev->valid_links == link_mask))
                return;

        cfg80211_wdev_release_link_bsses(wdev, link_mask);
        wdev->valid_links &= ~link_mask;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb)
                return;

        hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_LINKS_REMOVED);
        if (!hdr)
                goto free_skb;

        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_skb;
        if (nla_put_u32(skb, NL80211_ATTR_IFINDEX, dev->ifindex))
                goto free_skb;

        links_attr = nla_nest_start(skb, NL80211_ATTR_MLO_LINKS);
        if (!links_attr)
                goto free_skb;

        /* One nested entry per removed link, typed as link id + 1 */
        while (link_mask) {
                int id = __ffs(link_mask);
                struct nlattr *entry = nla_nest_start(skb, id + 1);

                if (!entry ||
                    nla_put_u8(skb, NL80211_ATTR_MLO_LINK_ID, id))
                        goto free_skb;

                nla_nest_end(skb, entry);
                link_mask &= ~(1 << id);
        }

        nla_nest_end(skb, links_attr);

        genlmsg_end(skb, hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
                                NL80211_MCGRP_MLME, GFP_KERNEL);
        return;

free_skb:
        nlmsg_free(skb);
}
EXPORT_SYMBOL(cfg80211_links_removed);

/*
 * Send an NL80211_CMD_JOIN_IBSS event carrying @bssid as NL80211_ATTR_MAC
 * to the NL80211_MCGRP_MLME group. Failures drop the event.
 */
void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev,
                             struct net_device *netdev, const u8 *bssid,
                             gfp_t gfp)
{
        struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        void *hdr;

        if (!skb)
                return;

        hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_JOIN_IBSS);
        if (!hdr)
                goto free_skb;

        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_skb;
        if (nla_put_u32(skb, NL80211_ATTR_IFINDEX, netdev->ifindex))
                goto free_skb;
        if (nla_put(skb, NL80211_ATTR_MAC, ETH_ALEN, bssid))
                goto free_skb;

        genlmsg_end(skb, hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

free_skb:
        nlmsg_free(skb);
}

/*
 * Notify userspace about a new mesh peer candidate @addr.
 *
 * Only valid on mesh point interfaces; other interface types trigger a
 * warning and return. The NL80211_CMD_NEW_PEER_CANDIDATE event includes
 * the IEs when both @ie and @ie_len are non-zero, and the signal strength
 * when @sig_dbm is non-zero. It is multicast to the NL80211_MCGRP_MLME
 * group; failures drop the event.
 */
void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr,
                                        const u8 *ie, u8 ie_len,
                                        int sig_dbm, gfp_t gfp)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct sk_buff *skb;
        void *hdr;

        if (WARN_ON(wdev->iftype != NL80211_IFTYPE_MESH_POINT))
                return;

        trace_cfg80211_notify_new_peer_candidate(dev, addr);

        skb = nlmsg_new(100 + ie_len, gfp);
        if (!skb)
                return;

        hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_NEW_PEER_CANDIDATE);
        if (!hdr)
                goto free_skb;

        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_skb;
        if (nla_put_u32(skb, NL80211_ATTR_IFINDEX, dev->ifindex))
                goto free_skb;
        if (nla_put(skb, NL80211_ATTR_MAC, ETH_ALEN, addr))
                goto free_skb;
        if (ie_len && ie && nla_put(skb, NL80211_ATTR_IE, ie_len, ie))
                goto free_skb;
        if (sig_dbm &&
            nla_put_u32(skb, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm))
                goto free_skb;

        genlmsg_end(skb, hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

free_skb:
        nlmsg_free(skb);
}
EXPORT_SYMBOL(cfg80211_notify_new_peer_candidate);

/*
 * Send an NL80211_CMD_MICHAEL_MIC_FAILURE event.
 *
 * The key type is always included. The MAC address is added when @addr is
 * non-NULL, the key index when @key_id is not -1, and a 6-byte key
 * sequence when @tsc is non-NULL. The event is multicast to the
 * NL80211_MCGRP_MLME group; failures drop it.
 */
void nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev,
                                 struct net_device *netdev, const u8 *addr,
                                 enum nl80211_key_type key_type, int key_id,
                                 const u8 *tsc, gfp_t gfp)
{
        struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        void *hdr;

        if (!skb)
                return;

        hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_MICHAEL_MIC_FAILURE);
        if (!hdr)
                goto free_skb;

        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_skb;
        if (nla_put_u32(skb, NL80211_ATTR_IFINDEX, netdev->ifindex))
                goto free_skb;
        if (addr && nla_put(skb, NL80211_ATTR_MAC, ETH_ALEN, addr))
                goto free_skb;
        if (nla_put_u32(skb, NL80211_ATTR_KEY_TYPE, key_type))
                goto free_skb;
        if (key_id != -1 && nla_put_u8(skb, NL80211_ATTR_KEY_IDX, key_id))
                goto free_skb;
        if (tsc && nla_put(skb, NL80211_ATTR_KEY_SEQ, 6, tsc))
                goto free_skb;

        genlmsg_end(skb, hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

free_skb:
        nlmsg_free(skb);
}

/*
 * Send an NL80211_CMD_REG_BEACON_HINT event for @wiphy.
 *
 * The channel state before and after the hint is nested under
 * NL80211_ATTR_FREQ_BEFORE and NL80211_ATTR_FREQ_AFTER respectively. The
 * event is multicast to the NL80211_MCGRP_REGULATORY group in all network
 * namespaces; failures drop it.
 */
void nl80211_send_beacon_hint_event(struct wiphy *wiphy,
                                    struct ieee80211_channel *channel_before,
                                    struct ieee80211_channel *channel_after)
{
        struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
        struct nlattr *nest;
        void *hdr;

        if (!skb)
                return;

        hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_REG_BEACON_HINT);
        if (!hdr)
                goto free_skb;

        /*
         * Since we are applying the beacon hint to a wiphy we know its
         * wiphy_idx is valid
         */
        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, get_wiphy_idx(wiphy)))
                goto free_skb;

        /* Channel as it was before the hint */
        nest = nla_nest_start_noflag(skb, NL80211_ATTR_FREQ_BEFORE);
        if (!nest ||
            nl80211_msg_put_channel(skb, wiphy, channel_before, false))
                goto free_skb;
        nla_nest_end(skb, nest);

        /* Channel as it is after the hint */
        nest = nla_nest_start_noflag(skb, NL80211_ATTR_FREQ_AFTER);
        if (!nest ||
            nl80211_msg_put_channel(skb, wiphy, channel_after, false))
                goto free_skb;
        nla_nest_end(skb, nest);

        genlmsg_end(skb, hdr);

        rcu_read_lock();
        genlmsg_multicast_allns(&nl80211_fam, skb, 0,
                                NL80211_MCGRP_REGULATORY, GFP_ATOMIC);
        rcu_read_unlock();
        return;

free_skb:
        nlmsg_free(skb);
}

/*
 * Send a remain-on-channel related event with command @cmd for @wdev.
 *
 * The message carries the wiphy index, the ifindex (only when the wdev
 * has a netdev), the wdev id, the channel's center frequency, a channel
 * type of NL80211_CHAN_NO_HT and @cookie. @duration is added only when
 * @cmd is NL80211_CMD_REMAIN_ON_CHANNEL. The event is multicast to the
 * NL80211_MCGRP_MLME group; failures drop it.
 */
static void nl80211_send_remain_on_chan_event(
        int cmd, struct cfg80211_registered_device *rdev,
        struct wireless_dev *wdev, u64 cookie,
        struct ieee80211_channel *chan,
        unsigned int duration, gfp_t gfp)
{
        struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        void *hdr;

        if (!skb)
                return;

        hdr = nl80211hdr_put(skb, 0, 0, 0, cmd);
        if (!hdr)
                goto free_skb;

        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_skb;
        if (wdev->netdev &&
            nla_put_u32(skb, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex))
                goto free_skb;
        if (nla_put_u64_64bit(skb, NL80211_ATTR_WDEV, wdev_id(wdev),
                              NL80211_ATTR_PAD))
                goto free_skb;
        if (nla_put_u32(skb, NL80211_ATTR_WIPHY_FREQ, chan->center_freq))
                goto free_skb;
        if (nla_put_u32(skb, NL80211_ATTR_WIPHY_CHANNEL_TYPE,
                        NL80211_CHAN_NO_HT))
                goto free_skb;
        if (nla_put_u64_64bit(skb, NL80211_ATTR_COOKIE, cookie,
                              NL80211_ATTR_PAD))
                goto free_skb;

        if (cmd == NL80211_CMD_REMAIN_ON_CHANNEL &&
            nla_put_u32(skb, NL80211_ATTR_DURATION, duration))
                goto free_skb;

        genlmsg_end(skb, hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

free_skb:
        nlmsg_free(skb);
}

/*
 * Report NL80211_CMD_ASSOC_COMEBACK for @netdev.
 *
 * The event contains the wiphy index, the ifindex, @ap_addr (ETH_ALEN bytes)
 * as NL80211_ATTR_MAC and @timeout as NL80211_ATTR_TIMEOUT.  It is multicast
 * to NL80211_MCGRP_MLME in the wiphy's network namespace.  All allocations
 * use GFP_KERNEL.  On any failure the message is freed and nothing is sent.
 */
void cfg80211_assoc_comeback(struct net_device *netdev,
                             const u8 *ap_addr, u32 timeout)
{
        struct wireless_dev *wdev = netdev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct sk_buff *evt;
        void *genl_hdr;

        trace_cfg80211_assoc_comeback(wdev, ap_addr, timeout);

        evt = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!evt)
                return;

        genl_hdr = nl80211hdr_put(evt, 0, 0, 0, NL80211_CMD_ASSOC_COMEBACK);
        if (!genl_hdr)
                goto free_evt;

        if (nla_put_u32(evt, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_evt;
        if (nla_put_u32(evt, NL80211_ATTR_IFINDEX, netdev->ifindex))
                goto free_evt;
        if (nla_put(evt, NL80211_ATTR_MAC, ETH_ALEN, ap_addr))
                goto free_evt;
        if (nla_put_u32(evt, NL80211_ATTR_TIMEOUT, timeout))
                goto free_evt;

        genlmsg_end(evt, genl_hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), evt, 0,
                                NL80211_MCGRP_MLME, GFP_KERNEL);
        return;

free_evt:
        nlmsg_free(evt);
}
EXPORT_SYMBOL(cfg80211_assoc_comeback);

/*
 * Trace and report that @wdev is now on @chan for @duration.
 *
 * Sends NL80211_CMD_REMAIN_ON_CHANNEL through
 * nl80211_send_remain_on_chan_event(), passing @cookie, @chan, @duration
 * and @gfp through unchanged.
 */
void cfg80211_ready_on_channel(struct wireless_dev *wdev, u64 cookie,
                               struct ieee80211_channel *chan,
                               unsigned int duration, gfp_t gfp)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);

        trace_cfg80211_ready_on_channel(wdev, cookie, chan, duration);

        nl80211_send_remain_on_chan_event(NL80211_CMD_REMAIN_ON_CHANNEL, rdev,
                                          wdev, cookie, chan, duration, gfp);
}
EXPORT_SYMBOL(cfg80211_ready_on_channel);

/*
 * Trace and report that the remain-on-channel period identified by @cookie
 * on @chan is over.
 *
 * Sends NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL through
 * nl80211_send_remain_on_chan_event() with a duration of 0.
 */
void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie,
                                        struct ieee80211_channel *chan,
                                        gfp_t gfp)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);

        trace_cfg80211_ready_on_channel_expired(wdev, cookie, chan);

        nl80211_send_remain_on_chan_event(NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL,
                                          rdev, wdev, cookie, chan,
                                          0, gfp);
}
EXPORT_SYMBOL(cfg80211_remain_on_channel_expired);

/*
 * Trace and report that the frame-wait period identified by @cookie on
 * @chan has ended.
 *
 * Sends NL80211_CMD_FRAME_WAIT_CANCEL through
 * nl80211_send_remain_on_chan_event() with a duration of 0.
 */
void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie,
                              struct ieee80211_channel *chan,
                              gfp_t gfp)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);

        trace_cfg80211_tx_mgmt_expired(wdev, cookie, chan);

        nl80211_send_remain_on_chan_event(NL80211_CMD_FRAME_WAIT_CANCEL,
                                          rdev, wdev, cookie, chan,
                                          0, gfp);
}
EXPORT_SYMBOL(cfg80211_tx_mgmt_expired);

/*
 * Trace and announce a new station @mac_addr on @dev.
 *
 * Builds an NL80211_CMD_NEW_STATION message from @sinfo with
 * nl80211_send_station() and multicasts it to NL80211_MCGRP_MLME in the
 * wiphy's network namespace.  If allocation or message construction fails,
 * nothing is sent.
 *
 * NOTE(review): unlike cfg80211_del_sta_sinfo(), the allocation-failure
 * path here does not call cfg80211_sinfo_release_content() - confirm
 * whether that asymmetry is intended.
 */
void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,
                      struct station_info *sinfo, gfp_t gfp)
{
        struct cfg80211_registered_device *rdev =
                wiphy_to_rdev(dev->ieee80211_ptr->wiphy);
        struct sk_buff *evt;

        trace_cfg80211_new_sta(dev, mac_addr, sinfo);

        evt = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        if (!evt)
                return;

        if (nl80211_send_station(evt, NL80211_CMD_NEW_STATION, 0, 0, 0,
                                 rdev, dev, mac_addr, sinfo) >= 0) {
                genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
                                        evt, 0, NL80211_MCGRP_MLME, gfp);
                return;
        }

        /* The station message could not be built; drop it. */
        nlmsg_free(evt);
}
EXPORT_SYMBOL(cfg80211_new_sta);

/*
 * Trace and announce removal of station @mac_addr from @dev.
 *
 * @sinfo may be NULL, in which case a zero-initialised station_info is
 * used instead.  Builds an NL80211_CMD_DEL_STATION message with
 * nl80211_send_station() and multicasts it to NL80211_MCGRP_MLME in the
 * wiphy's network namespace.  When the message cannot be allocated, the
 * content of @sinfo is released via cfg80211_sinfo_release_content()
 * before returning.
 */
void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr,
                            struct station_info *sinfo, gfp_t gfp)
{
        struct cfg80211_registered_device *rdev =
                wiphy_to_rdev(dev->ieee80211_ptr->wiphy);
        struct station_info empty_sinfo = {};
        struct sk_buff *evt;

        /* Fall back to an all-zero sinfo when the caller supplied none. */
        if (sinfo == NULL)
                sinfo = &empty_sinfo;

        trace_cfg80211_del_sta(dev, mac_addr);

        evt = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        if (!evt) {
                cfg80211_sinfo_release_content(sinfo);
                return;
        }

        if (nl80211_send_station(evt, NL80211_CMD_DEL_STATION, 0, 0, 0,
                                 rdev, dev, mac_addr, sinfo) >= 0) {
                genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
                                        evt, 0, NL80211_MCGRP_MLME, gfp);
                return;
        }

        /* The station message could not be built; drop it. */
        nlmsg_free(evt);
}
EXPORT_SYMBOL(cfg80211_del_sta_sinfo);

/*
 * Report NL80211_CMD_CONN_FAILED for @dev.
 *
 * The event carries the ifindex, @mac_addr (ETH_ALEN bytes) as
 * NL80211_ATTR_MAC and @reason as NL80211_ATTR_CONN_FAILED_REASON, and is
 * multicast to NL80211_MCGRP_MLME in the wiphy's network namespace.  The
 * message is allocated with NLMSG_GOODSIZE.  On any failure the message is
 * freed and nothing is sent.
 */
void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
                          enum nl80211_connect_failed_reason reason,
                          gfp_t gfp)
{
        struct cfg80211_registered_device *rdev =
                wiphy_to_rdev(dev->ieee80211_ptr->wiphy);
        struct sk_buff *evt;
        void *genl_hdr;

        evt = nlmsg_new(NLMSG_GOODSIZE, gfp);
        if (!evt)
                return;

        genl_hdr = nl80211hdr_put(evt, 0, 0, 0, NL80211_CMD_CONN_FAILED);
        if (!genl_hdr)
                goto free_evt;

        if (nla_put_u32(evt, NL80211_ATTR_IFINDEX, dev->ifindex))
                goto free_evt;
        if (nla_put(evt, NL80211_ATTR_MAC, ETH_ALEN, mac_addr))
                goto free_evt;
        if (nla_put_u32(evt, NL80211_ATTR_CONN_FAILED_REASON, reason))
                goto free_evt;

        genlmsg_end(evt, genl_hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), evt, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

free_evt:
        nlmsg_free(evt);
}
EXPORT_SYMBOL(cfg80211_conn_failed);

/*
 * Unicast an "unexpected frame" event (@cmd) about @addr to the netlink
 * port stored in wdev->ap_unexpected_nlportid.
 *
 * Returns false when no such port is registered (the id is 0).  Returns
 * true in every other case, including when the message could not be
 * allocated, built or delivered.
 */
static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd,
                                       const u8 *addr, gfp_t gfp)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        u32 listener = READ_ONCE(wdev->ap_unexpected_nlportid);
        struct sk_buff *evt;
        void *genl_hdr;

        if (listener == 0)
                return false;

        evt = nlmsg_new(100, gfp);
        if (!evt)
                return true;

        genl_hdr = nl80211hdr_put(evt, 0, 0, 0, cmd);
        if (!genl_hdr)
                goto free_evt;

        if (nla_put_u32(evt, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_evt;
        if (nla_put_u32(evt, NL80211_ATTR_IFINDEX, dev->ifindex))
                goto free_evt;
        if (nla_put(evt, NL80211_ATTR_MAC, ETH_ALEN, addr))
                goto free_evt;

        genlmsg_end(evt, genl_hdr);
        /* The unicast result is intentionally not reflected in the return. */
        genlmsg_unicast(wiphy_net(&rdev->wiphy), evt, listener);
        return true;

free_evt:
        nlmsg_free(evt);
        return true;
}

/*
 * Trace and report a spurious frame from @addr received on @dev.
 *
 * Only valid for AP and P2P-GO interfaces; any other interface type
 * triggers a warning and returns false.  Otherwise the result of
 * __nl80211_unexpected_frame() with NL80211_CMD_UNEXPECTED_FRAME is traced
 * and returned.
 */
bool cfg80211_rx_spurious_frame(struct net_device *dev,
                                const u8 *addr, gfp_t gfp)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        bool handled;

        trace_cfg80211_rx_spurious_frame(dev, addr);

        switch (wdev->iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_P2P_GO:
                break;
        default:
                WARN_ON(1);
                trace_cfg80211_return_bool(false);
                return false;
        }

        handled = __nl80211_unexpected_frame(dev, NL80211_CMD_UNEXPECTED_FRAME,
                                             addr, gfp);
        trace_cfg80211_return_bool(handled);
        return handled;
}
EXPORT_SYMBOL(cfg80211_rx_spurious_frame);

/*
 * Trace and report an unexpected 4-address frame from @addr on @dev.
 *
 * Only valid for AP, P2P-GO and AP-VLAN interfaces; any other interface
 * type triggers a warning and returns false.  Otherwise the result of
 * __nl80211_unexpected_frame() with NL80211_CMD_UNEXPECTED_4ADDR_FRAME is
 * traced and returned.
 */
bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev,
                                        const u8 *addr, gfp_t gfp)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        bool handled;

        trace_cfg80211_rx_unexpected_4addr_frame(dev, addr);

        switch (wdev->iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_P2P_GO:
        case NL80211_IFTYPE_AP_VLAN:
                break;
        default:
                WARN_ON(1);
                trace_cfg80211_return_bool(false);
                return false;
        }

        handled = __nl80211_unexpected_frame(dev,
                                             NL80211_CMD_UNEXPECTED_4ADDR_FRAME,
                                             addr, gfp);
        trace_cfg80211_return_bool(handled);
        return handled;
}
EXPORT_SYMBOL(cfg80211_rx_unexpected_4addr_frame);

/*
 * Unicast a received frame described by @info to netlink port @nlportid
 * as NL80211_CMD_FRAME.
 *
 * Always included: wiphy index, wdev id, frequency (MHz part plus the
 * remainder of info->freq modulo 1000 as the offset) and the frame bytes.
 * Included conditionally: ifindex (if @wdev has a netdev), link id (if
 * info->have_link_id), and signal, flags, RX and TX hardware timestamps
 * (each only when non-zero).
 *
 * Returns -ENOMEM if the message or its header cannot be set up, -ENOBUFS
 * if an attribute does not fit, otherwise the genlmsg_unicast() result.
 */
int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
                      struct wireless_dev *wdev, u32 nlportid,
                      struct cfg80211_rx_info *info, gfp_t gfp)
{
        struct net_device *ndev = wdev->netdev;
        struct sk_buff *evt;
        void *genl_hdr;

        /* 100 bytes of headroom for the fixed attributes plus the frame. */
        evt = nlmsg_new(100 + info->len, gfp);
        if (!evt)
                return -ENOMEM;

        genl_hdr = nl80211hdr_put(evt, 0, 0, 0, NL80211_CMD_FRAME);
        if (!genl_hdr) {
                nlmsg_free(evt);
                return -ENOMEM;
        }

        if (nla_put_u32(evt, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto no_space;

        if (ndev && nla_put_u32(evt, NL80211_ATTR_IFINDEX, ndev->ifindex))
                goto no_space;

        if (nla_put_u64_64bit(evt, NL80211_ATTR_WDEV, wdev_id(wdev),
                              NL80211_ATTR_PAD))
                goto no_space;

        if (info->have_link_id &&
            nla_put_u8(evt, NL80211_ATTR_MLO_LINK_ID, info->link_id))
                goto no_space;

        if (nla_put_u32(evt, NL80211_ATTR_WIPHY_FREQ, KHZ_TO_MHZ(info->freq)))
                goto no_space;

        if (nla_put_u32(evt, NL80211_ATTR_WIPHY_FREQ_OFFSET,
                        info->freq % 1000))
                goto no_space;

        if (info->sig_dbm &&
            nla_put_u32(evt, NL80211_ATTR_RX_SIGNAL_DBM, info->sig_dbm))
                goto no_space;

        if (nla_put(evt, NL80211_ATTR_FRAME, info->len, info->buf))
                goto no_space;

        if (info->flags &&
            nla_put_u32(evt, NL80211_ATTR_RXMGMT_FLAGS, info->flags))
                goto no_space;

        if (info->rx_tstamp &&
            nla_put_u64_64bit(evt, NL80211_ATTR_RX_HW_TIMESTAMP,
                              info->rx_tstamp, NL80211_ATTR_PAD))
                goto no_space;

        if (info->ack_tstamp &&
            nla_put_u64_64bit(evt, NL80211_ATTR_TX_HW_TIMESTAMP,
                              info->ack_tstamp, NL80211_ATTR_PAD))
                goto no_space;

        genlmsg_end(evt, genl_hdr);

        return genlmsg_unicast(wiphy_net(&rdev->wiphy), evt, nlportid);

no_space:
        nlmsg_free(evt);
        return -ENOBUFS;
}

/*
 * Trace and multicast a frame TX status event (@command) for @wdev.
 *
 * The tracepoint is chosen by @command: the mgmt variant for
 * NL80211_CMD_FRAME_TX_STATUS, the control-port variant for anything else.
 *
 * Always included: wiphy index, wdev id, the frame bytes and the cookie.
 * Included conditionally: ifindex (if @wdev has a netdev), the ACK flag
 * (if status->ack), and the TX / RX hardware timestamps taken from
 * status->tx_tstamp / status->ack_tstamp (each only when non-zero).
 *
 * The event goes to NL80211_MCGRP_MLME in the wiphy's network namespace.
 * On any failure the message is freed and nothing is sent.
 */
static void nl80211_frame_tx_status(struct wireless_dev *wdev,
                                    struct cfg80211_tx_status *status,
                                    gfp_t gfp, enum nl80211_commands command)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct net_device *ndev = wdev->netdev;
        struct sk_buff *evt;
        void *genl_hdr;

        if (command != NL80211_CMD_FRAME_TX_STATUS)
                trace_cfg80211_control_port_tx_status(wdev, status->cookie,
                                                      status->ack);
        else
                trace_cfg80211_mgmt_tx_status(wdev, status->cookie,
                                              status->ack);

        /* 100 bytes of headroom for the fixed attributes plus the frame. */
        evt = nlmsg_new(100 + status->len, gfp);
        if (!evt)
                return;

        genl_hdr = nl80211hdr_put(evt, 0, 0, 0, command);
        if (!genl_hdr)
                goto free_evt;

        if (nla_put_u32(evt, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_evt;

        if (ndev && nla_put_u32(evt, NL80211_ATTR_IFINDEX, ndev->ifindex))
                goto free_evt;

        if (nla_put_u64_64bit(evt, NL80211_ATTR_WDEV, wdev_id(wdev),
                              NL80211_ATTR_PAD))
                goto free_evt;

        if (nla_put(evt, NL80211_ATTR_FRAME, status->len, status->buf))
                goto free_evt;

        if (nla_put_u64_64bit(evt, NL80211_ATTR_COOKIE, status->cookie,
                              NL80211_ATTR_PAD))
                goto free_evt;

        if (status->ack && nla_put_flag(evt, NL80211_ATTR_ACK))
                goto free_evt;

        if (status->tx_tstamp &&
            nla_put_u64_64bit(evt, NL80211_ATTR_TX_HW_TIMESTAMP,
                              status->tx_tstamp, NL80211_ATTR_PAD))
                goto free_evt;

        if (status->ack_tstamp &&
            nla_put_u64_64bit(evt, NL80211_ATTR_RX_HW_TIMESTAMP,
                              status->ack_tstamp, NL80211_ATTR_PAD))
                goto free_evt;

        genlmsg_end(evt, genl_hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), evt, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

free_evt:
        nlmsg_free(evt);
}

/*
 * Report the TX status of a control port frame.
 *
 * Packs @cookie, @buf, @len and @ack into a cfg80211_tx_status (all other
 * fields zero) and sends it through nl80211_frame_tx_status() as
 * NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS.
 */
void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie,
                                     const u8 *buf, size_t len, bool ack,
                                     gfp_t gfp)
{
        struct cfg80211_tx_status tx_status = {
                .ack = ack,
                .len = len,
                .buf = buf,
                .cookie = cookie,
        };

        nl80211_frame_tx_status(wdev, &tx_status, gfp,
                                NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS);
}
EXPORT_SYMBOL(cfg80211_control_port_tx_status);

/*
 * Report the TX status of a management frame: forwards @status unchanged
 * to nl80211_frame_tx_status() as NL80211_CMD_FRAME_TX_STATUS.
 */
void cfg80211_mgmt_tx_status_ext(struct wireless_dev *wdev,
                                 struct cfg80211_tx_status *status, gfp_t gfp)
{
        nl80211_frame_tx_status(wdev, status, gfp,
                                NL80211_CMD_FRAME_TX_STATUS);
}
EXPORT_SYMBOL(cfg80211_mgmt_tx_status_ext);

/*
 * Unicast a received control port frame (@skb) to the netlink port stored
 * in wdev->conn_owner_nlportid as NL80211_CMD_CONTROL_PORT_FRAME.
 *
 * The event carries the wiphy index, ifindex, wdev id, the Ethernet source
 * address of @skb, the ethertype from skb->protocol, the link id (only if
 * @link_id >= 0), the no-encrypt flag (only if @unencrypted) and a copy of
 * the whole @skb as NL80211_ATTR_FRAME.
 *
 * Returns -ENOENT when no owner port is set, -ENOMEM when the message
 * cannot be allocated, -ENOBUFS when it cannot be filled, otherwise the
 * genlmsg_unicast() result.
 */
static int __nl80211_rx_control_port(struct net_device *dev,
                                     struct sk_buff *skb,
                                     bool unencrypted,
                                     int link_id,
                                     gfp_t gfp)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        const u8 *src_addr = eth_hdr(skb)->h_source;
        u16 ethertype = be16_to_cpu(skb->protocol);
        u32 owner = READ_ONCE(wdev->conn_owner_nlportid);
        struct nlattr *frame_attr;
        struct sk_buff *evt;
        void *genl_hdr;

        if (owner == 0)
                return -ENOENT;

        /* 100 bytes of headroom for the fixed attributes plus the frame. */
        evt = nlmsg_new(100 + skb->len, gfp);
        if (!evt)
                return -ENOMEM;

        genl_hdr = nl80211hdr_put(evt, 0, 0, 0, NL80211_CMD_CONTROL_PORT_FRAME);
        if (!genl_hdr)
                goto no_space;

        if (nla_put_u32(evt, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto no_space;
        if (nla_put_u32(evt, NL80211_ATTR_IFINDEX, dev->ifindex))
                goto no_space;
        if (nla_put_u64_64bit(evt, NL80211_ATTR_WDEV, wdev_id(wdev),
                              NL80211_ATTR_PAD))
                goto no_space;
        if (nla_put(evt, NL80211_ATTR_MAC, ETH_ALEN, src_addr))
                goto no_space;
        if (nla_put_u16(evt, NL80211_ATTR_CONTROL_PORT_ETHERTYPE, ethertype))
                goto no_space;
        if (link_id >= 0 &&
            nla_put_u8(evt, NL80211_ATTR_MLO_LINK_ID, link_id))
                goto no_space;
        if (unencrypted &&
            nla_put_flag(evt, NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT))
                goto no_space;

        /* Reserve the attribute first, then copy the frame into it. */
        frame_attr = nla_reserve(evt, NL80211_ATTR_FRAME, skb->len);
        if (!frame_attr)
                goto no_space;

        skb_copy_bits(skb, 0, nla_data(frame_attr), skb->len);
        genlmsg_end(evt, genl_hdr);

        return genlmsg_unicast(wiphy_net(&rdev->wiphy), evt, owner);

no_space:
        nlmsg_free(evt);
        return -ENOBUFS;
}

/*
 * Trace and forward a received control port frame to its netlink owner.
 *
 * Calls __nl80211_rx_control_port() with GFP_ATOMIC.  Returns true when
 * that call returned 0 and false otherwise; the same boolean is traced.
 */
bool cfg80211_rx_control_port(struct net_device *dev, struct sk_buff *skb,
                              bool unencrypted, int link_id)
{
        bool delivered;

        trace_cfg80211_rx_control_port(dev, skb, unencrypted, link_id);

        delivered = __nl80211_rx_control_port(dev, skb, unencrypted, link_id,
                                              GFP_ATOMIC) == 0;

        trace_cfg80211_return_bool(delivered);
        return delivered;
}
EXPORT_SYMBOL(cfg80211_rx_control_port);

/*
 * Start an NL80211_CMD_NOTIFY_CQM message for @dev.
 *
 * Adds the wiphy index, the ifindex and, if @mac is non-NULL, @mac
 * (ETH_ALEN bytes) as NL80211_ATTR_MAC, then opens the NL80211_ATTR_CQM
 * nest and leaves it open for the caller to fill.
 *
 * State needed to finish the message is stashed in the skb control buffer,
 * viewed as an array of pointers:
 *   [0] the genl header, [1] the open CQM nest, [2] the rdev.
 * cfg80211_send_cqm() reads these slots back.
 *
 * Returns the message, or NULL (with the message freed) on any failure.
 */
static struct sk_buff *cfg80211_prepare_cqm(struct net_device *dev,
                                            const char *mac, gfp_t gfp)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct sk_buff *evt = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        void **slots;

        if (!evt)
                return NULL;

        slots = (void **)evt->cb;

        slots[0] = nl80211hdr_put(evt, 0, 0, 0, NL80211_CMD_NOTIFY_CQM);
        if (!slots[0])
                goto free_evt;

        if (nla_put_u32(evt, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_evt;
        if (nla_put_u32(evt, NL80211_ATTR_IFINDEX, dev->ifindex))
                goto free_evt;

        /* The peer address is optional. */
        if (mac && nla_put(evt, NL80211_ATTR_MAC, ETH_ALEN, mac))
                goto free_evt;

        slots[1] = nla_nest_start_noflag(evt, NL80211_ATTR_CQM);
        if (!slots[1])
                goto free_evt;

        slots[2] = rdev;

        return evt;

free_evt:
        nlmsg_free(evt);
        return NULL;
}

/*
 * Finish and multicast a CQM message started by cfg80211_prepare_cqm().
 *
 * Reads the genl header, the open CQM nest and the rdev back from the skb
 * control buffer (slots 0, 1 and 2), closes the nest and the message,
 * zeroes the control buffer and multicasts the message to
 * NL80211_MCGRP_MLME in the wiphy's network namespace.
 */
static void cfg80211_send_cqm(struct sk_buff *msg, gfp_t gfp)
{
        void **slots = (void **)msg->cb;
        struct cfg80211_registered_device *rdev = slots[2];

        nla_nest_end(msg, slots[1]);
        genlmsg_end(msg, slots[0]);

        /* Wipe the stashed pointers before the skb is handed on. */
        memset(msg->cb, 0, sizeof(msg->cb));

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
                                msg, 0, NL80211_MCGRP_MLME, gfp);
}

/*
 * Trace and record an RSSI threshold event for @dev, deferring the actual
 * notification to wdev->cqm_rssi_work.
 *
 * @rssi_event must be the LOW or HIGH threshold event; anything else
 * triggers a warning and is ignored.  If a CQM config is installed on the
 * wdev, @rssi_level and @rssi_event are stored in it under
 * rcu_read_lock() and the work item is queued on the wiphy.  Without a
 * CQM config nothing happens.
 *
 * @gfp is not used by this function.
 */
void cfg80211_cqm_rssi_notify(struct net_device *dev,
                              enum nl80211_cqm_rssi_threshold_event rssi_event,
                              s32 rssi_level, gfp_t gfp)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_cqm_config *cfg;

        trace_cfg80211_cqm_rssi_notify(dev, rssi_event, rssi_level);

        if (WARN_ON(!(rssi_event == NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW ||
                      rssi_event == NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH)))
                return;

        rcu_read_lock();

        cfg = rcu_dereference(wdev->cqm_config);
        if (!cfg)
                goto unlock;

        cfg->last_rssi_event_value = rssi_level;
        cfg->last_rssi_event_type = rssi_event;
        wiphy_work_queue(wdev->wiphy, &wdev->cqm_rssi_work);

unlock:
        rcu_read_unlock();
}
EXPORT_SYMBOL(cfg80211_cqm_rssi_notify);

void cfg80211_cqm_rssi_notify_work(struct wiphy *wiphy, struct wiphy_work *work)
{
        struct wireless_dev *wdev = container_of(work, struct wireless_dev,
                                                 cqm_rssi_work);
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        enum nl80211_cqm_rssi_threshold_event rssi_event;
        struct cfg80211_cqm_config *cqm_config;
        struct sk_buff *msg;
        s32 rssi_level;

        cqm_config = wiphy_dereference(wdev->wiphy, wdev->cqm_config);
        if (!cqm_config)
                return;

        if (cqm_config->use_range_api)
                cfg80211_cqm_rssi_update(rdev, wdev->netdev, cqm_config);

        rssi_level = cqm_config->last_rssi_event_value;
        rssi_event = cqm_config->last_rssi_event_type;

        msg = cfg80211_prepare_cqm(wdev->netdev, NULL, GFP_KERNEL);
        if (!msg)
                return;

        if (nla_put_u32(msg, NL80211_ATTR_CQM_RSSI_THRESHOLD_EVENT,
                        rssi_event))
                goto nla_put_failure;

        if (rssi_level && nla_put_s32(msg, NL80211_ATTR_CQM_RSSI_LEVEL,
                                      rssi_level))
                goto nla_put_failure;

        cfg80211_send_cqm(msg, GFP_KERNEL);

        return;

 nla_put_failure:
        nlmsg_free(msg);
}

/*
 * Send a CQM notification with TX error statistics for @peer on @dev.
 *
 * The CQM nest carries @num_packets, @rate and @intvl as
 * NL80211_ATTR_CQM_TXE_PKTS, NL80211_ATTR_CQM_TXE_RATE and
 * NL80211_ATTR_CQM_TXE_INTVL.  On any failure the message is freed and
 * nothing is sent.
 */
void cfg80211_cqm_txe_notify(struct net_device *dev,
                             const u8 *peer, u32 num_packets,
                             u32 rate, u32 intvl, gfp_t gfp)
{
        struct sk_buff *evt = cfg80211_prepare_cqm(dev, peer, gfp);

        if (!evt)
                return;

        if (nla_put_u32(evt, NL80211_ATTR_CQM_TXE_PKTS, num_packets) ||
            nla_put_u32(evt, NL80211_ATTR_CQM_TXE_RATE, rate) ||
            nla_put_u32(evt, NL80211_ATTR_CQM_TXE_INTVL, intvl)) {
                nlmsg_free(evt);
                return;
        }

        cfg80211_send_cqm(evt, gfp);
}
EXPORT_SYMBOL(cfg80211_cqm_txe_notify);

/*
 * Trace and send a CQM packet-loss notification for @peer on @dev.
 *
 * The CQM nest carries @num_packets as NL80211_ATTR_CQM_PKT_LOSS_EVENT.
 * On any failure the message is freed and nothing is sent.
 */
void cfg80211_cqm_pktloss_notify(struct net_device *dev,
                                 const u8 *peer, u32 num_packets, gfp_t gfp)
{
        struct sk_buff *evt;

        trace_cfg80211_cqm_pktloss_notify(dev, peer, num_packets);

        evt = cfg80211_prepare_cqm(dev, peer, gfp);
        if (!evt)
                return;

        if (nla_put_u32(evt, NL80211_ATTR_CQM_PKT_LOSS_EVENT, num_packets)) {
                nlmsg_free(evt);
                return;
        }

        cfg80211_send_cqm(evt, gfp);
}
EXPORT_SYMBOL(cfg80211_cqm_pktloss_notify);

/*
 * Send a CQM beacon-loss notification for @dev.
 *
 * The message has no peer address; the CQM nest carries only the
 * NL80211_ATTR_CQM_BEACON_LOSS_EVENT flag.  On any failure the message is
 * freed and nothing is sent.
 */
void cfg80211_cqm_beacon_loss_notify(struct net_device *dev, gfp_t gfp)
{
        struct sk_buff *evt = cfg80211_prepare_cqm(dev, NULL, gfp);

        if (!evt)
                return;

        if (nla_put_flag(evt, NL80211_ATTR_CQM_BEACON_LOSS_EVENT)) {
                nlmsg_free(evt);
                return;
        }

        cfg80211_send_cqm(evt, gfp);
}
EXPORT_SYMBOL(cfg80211_cqm_beacon_loss_notify);

/*
 * Multicast an NL80211_CMD_SET_REKEY_OFFLOAD event for @netdev.
 *
 * The event carries the wiphy index, the ifindex, @bssid (ETH_ALEN bytes)
 * as NL80211_ATTR_MAC, and an NL80211_ATTR_REKEY_DATA nest holding
 * @replay_ctr (NL80211_REPLAY_CTR_LEN bytes).  It goes to
 * NL80211_MCGRP_MLME in the wiphy's network namespace.  On any failure the
 * message is freed and nothing is sent.
 */
static void nl80211_gtk_rekey_notify(struct cfg80211_registered_device *rdev,
                                     struct net_device *netdev, const u8 *bssid,
                                     const u8 *replay_ctr, gfp_t gfp)
{
        struct sk_buff *evt = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        struct nlattr *rekey_nest;
        void *genl_hdr;

        if (!evt)
                return;

        genl_hdr = nl80211hdr_put(evt, 0, 0, 0, NL80211_CMD_SET_REKEY_OFFLOAD);
        if (!genl_hdr)
                goto free_evt;

        if (nla_put_u32(evt, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_evt;
        if (nla_put_u32(evt, NL80211_ATTR_IFINDEX, netdev->ifindex))
                goto free_evt;
        if (nla_put(evt, NL80211_ATTR_MAC, ETH_ALEN, bssid))
                goto free_evt;

        rekey_nest = nla_nest_start_noflag(evt, NL80211_ATTR_REKEY_DATA);
        if (!rekey_nest)
                goto free_evt;

        if (nla_put(evt, NL80211_REKEY_DATA_REPLAY_CTR,
                    NL80211_REPLAY_CTR_LEN, replay_ctr))
                goto free_evt;

        nla_nest_end(evt, rekey_nest);
        genlmsg_end(evt, genl_hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), evt, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

free_evt:
        nlmsg_free(evt);
}

/*
 * Trace a GTK rekey on @dev and send the event through
 * nl80211_gtk_rekey_notify(), passing @bssid, @replay_ctr and @gfp through
 * unchanged.
 */
void cfg80211_gtk_rekey_notify(struct net_device *dev, const u8 *bssid,
                               const u8 *replay_ctr, gfp_t gfp)
{
        struct cfg80211_registered_device *rdev =
                wiphy_to_rdev(dev->ieee80211_ptr->wiphy);

        trace_cfg80211_gtk_rekey_notify(dev, bssid);

        nl80211_gtk_rekey_notify(rdev, dev, bssid, replay_ctr, gfp);
}
EXPORT_SYMBOL(cfg80211_gtk_rekey_notify);

/*
 * Multicast an NL80211_CMD_PMKSA_CANDIDATE event for @netdev.
 *
 * The event carries the wiphy index, the ifindex and an
 * NL80211_ATTR_PMKSA_CANDIDATE nest holding @index, @bssid (ETH_ALEN bytes)
 * and, only if @preauth is set, the NL80211_PMKSA_CANDIDATE_PREAUTH flag.
 * It goes to NL80211_MCGRP_MLME in the wiphy's network namespace.  On any
 * failure the message is freed and nothing is sent.
 */
static void
nl80211_pmksa_candidate_notify(struct cfg80211_registered_device *rdev,
                               struct net_device *netdev, int index,
                               const u8 *bssid, bool preauth, gfp_t gfp)
{
        struct sk_buff *evt = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        struct nlattr *cand_nest;
        void *genl_hdr;

        if (!evt)
                return;

        genl_hdr = nl80211hdr_put(evt, 0, 0, 0, NL80211_CMD_PMKSA_CANDIDATE);
        if (!genl_hdr)
                goto free_evt;

        if (nla_put_u32(evt, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_evt;
        if (nla_put_u32(evt, NL80211_ATTR_IFINDEX, netdev->ifindex))
                goto free_evt;

        cand_nest = nla_nest_start_noflag(evt, NL80211_ATTR_PMKSA_CANDIDATE);
        if (!cand_nest)
                goto free_evt;

        if (nla_put_u32(evt, NL80211_PMKSA_CANDIDATE_INDEX, index))
                goto free_evt;
        if (nla_put(evt, NL80211_PMKSA_CANDIDATE_BSSID, ETH_ALEN, bssid))
                goto free_evt;
        if (preauth && nla_put_flag(evt, NL80211_PMKSA_CANDIDATE_PREAUTH))
                goto free_evt;

        nla_nest_end(evt, cand_nest);
        genlmsg_end(evt, genl_hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), evt, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

free_evt:
        nlmsg_free(evt);
}

/*
 * Trace a PMKSA candidate on @dev and send the event through
 * nl80211_pmksa_candidate_notify(), passing @index, @bssid, @preauth and
 * @gfp through unchanged.
 */
void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index,
                                     const u8 *bssid, bool preauth, gfp_t gfp)
{
        struct cfg80211_registered_device *rdev =
                wiphy_to_rdev(dev->ieee80211_ptr->wiphy);

        trace_cfg80211_pmksa_candidate_notify(dev, index, bssid, preauth);

        nl80211_pmksa_candidate_notify(rdev, dev, index, bssid, preauth, gfp);
}
EXPORT_SYMBOL(cfg80211_pmksa_candidate_notify);

/*
 * Multicast a channel switch event (@notif) for @netdev.
 *
 * The event carries the ifindex, @link_id (only when the wdev has
 * valid_links set) and @chandef as written by nl80211_send_chandef().
 * For NL80211_CMD_CH_SWITCH_STARTED_NOTIFY it additionally carries @count
 * and, if @quiet is set, the NL80211_ATTR_CH_SWITCH_BLOCK_TX flag.  It goes
 * to NL80211_MCGRP_MLME in the wiphy's network namespace.  On any failure
 * the message is freed and nothing is sent.
 */
static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev,
                                     struct net_device *netdev,
                                     unsigned int link_id,
                                     struct cfg80211_chan_def *chandef,
                                     gfp_t gfp,
                                     enum nl80211_commands notif,
                                     u8 count, bool quiet)
{
        struct wireless_dev *wdev = netdev->ieee80211_ptr;
        struct sk_buff *evt;
        void *genl_hdr;

        evt = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        if (!evt)
                return;

        genl_hdr = nl80211hdr_put(evt, 0, 0, 0, notif);
        if (!genl_hdr)
                goto free_evt;

        if (nla_put_u32(evt, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
            (wdev->valid_links &&
             nla_put_u8(evt, NL80211_ATTR_MLO_LINK_ID, link_id)) ||
            nl80211_send_chandef(evt, chandef))
                goto free_evt;

        /* Countdown and TX blocking apply to the "started" event only. */
        if (notif == NL80211_CMD_CH_SWITCH_STARTED_NOTIFY &&
            (nla_put_u32(evt, NL80211_ATTR_CH_SWITCH_COUNT, count) ||
             (quiet && nla_put_flag(evt, NL80211_ATTR_CH_SWITCH_BLOCK_TX))))
                goto free_evt;

        genlmsg_end(evt, genl_hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), evt, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

free_evt:
        nlmsg_free(evt);
}

/*
 * Record a completed channel switch on @dev / @link_id and notify
 * userspace.
 *
 * Asserts that the wiphy lock is held and that @link_id is valid for the
 * wdev.  The new @chandef is then stored according to the interface type:
 *   - station / P2P client: the associated BSS entry is updated with
 *     chandef->chan (a warning fires instead if there is no current BSS);
 *   - mesh point: both the mesh chandef and preset_chandef;
 *   - AP / P2P GO: the per-link AP chandef;
 *   - ad-hoc: the IBSS chandef;
 *   - any other type: a warning fires, nothing is stored.
 * Afterwards cfg80211_schedule_channels_check() and
 * cfg80211_sched_dfs_chan_update() are called and
 * NL80211_CMD_CH_SWITCH_NOTIFY is sent with GFP_KERNEL.
 */
void cfg80211_ch_switch_notify(struct net_device *dev,
                               struct cfg80211_chan_def *chandef,
                               unsigned int link_id)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);

        lockdep_assert_wiphy(wdev->wiphy);
        WARN_INVALID_LINK_ID(wdev, link_id);

        trace_cfg80211_ch_switch_notify(dev, chandef, link_id);

        switch (wdev->iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_P2P_GO:
                wdev->links[link_id].ap.chandef = *chandef;
                break;
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_P2P_CLIENT:
                if (WARN_ON(!wdev->links[link_id].client.current_bss))
                        break;
                cfg80211_update_assoc_bss_entry(wdev, link_id, chandef->chan);
                break;
        case NL80211_IFTYPE_ADHOC:
                wdev->u.ibss.chandef = *chandef;
                break;
        case NL80211_IFTYPE_MESH_POINT:
                wdev->u.mesh.chandef = *chandef;
                wdev->u.mesh.preset_chandef = *chandef;
                break;
        default:
                WARN_ON(1);
                break;
        }

        cfg80211_schedule_channels_check(wdev);
        cfg80211_sched_dfs_chan_update(rdev);

        nl80211_ch_switch_notify(rdev, dev, link_id, chandef, GFP_KERNEL,
                                 NL80211_CMD_CH_SWITCH_NOTIFY, 0, false);
}
EXPORT_SYMBOL(cfg80211_ch_switch_notify);

/*
 * Notify userspace that a channel switch to @chandef has started on
 * @dev / @link_id.
 *
 * Asserts that the wiphy lock is held and that @link_id is valid for the
 * wdev, traces the call and sends NL80211_CMD_CH_SWITCH_STARTED_NOTIFY
 * with @count and @quiet using GFP_KERNEL.  No wdev state is modified
 * here.
 */
void cfg80211_ch_switch_started_notify(struct net_device *dev,
                                       struct cfg80211_chan_def *chandef,
                                       unsigned int link_id, u8 count,
                                       bool quiet)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);

        lockdep_assert_wiphy(wdev->wiphy);
        WARN_INVALID_LINK_ID(wdev, link_id);

        trace_cfg80211_ch_switch_started_notify(dev, chandef, link_id);

        nl80211_ch_switch_notify(rdev, dev, link_id, chandef, GFP_KERNEL,
                                 NL80211_CMD_CH_SWITCH_STARTED_NOTIFY, count,
                                 quiet);
}
EXPORT_SYMBOL(cfg80211_ch_switch_started_notify);

/*
 * Multicast a BSS color event (@cmd) for @dev.
 *
 * Asserts that the wiphy lock is held.  The event carries the ifindex and
 * @link_id (only when the wdev has valid_links set).  @count is added only
 * for NL80211_CMD_COLOR_CHANGE_STARTED and @color_bitmap only for
 * NL80211_CMD_OBSS_COLOR_COLLISION.  Allocations use GFP_KERNEL.
 *
 * Returns -ENOMEM if the message cannot be allocated, -EINVAL if the
 * header or an attribute cannot be added, otherwise the
 * genlmsg_multicast_netns() result.
 */
int cfg80211_bss_color_notify(struct net_device *dev,
                              enum nl80211_commands cmd, u8 count,
                              u64 color_bitmap, u8 link_id)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct sk_buff *evt;
        void *genl_hdr;

        lockdep_assert_wiphy(wdev->wiphy);

        trace_cfg80211_bss_color_notify(dev, cmd, count, color_bitmap);

        evt = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!evt)
                return -ENOMEM;

        genl_hdr = nl80211hdr_put(evt, 0, 0, 0, cmd);
        if (!genl_hdr)
                goto free_evt;

        if (nla_put_u32(evt, NL80211_ATTR_IFINDEX, dev->ifindex) ||
            (wdev->valid_links &&
             nla_put_u8(evt, NL80211_ATTR_MLO_LINK_ID, link_id)))
                goto free_evt;

        /* Command-specific payload. */
        if ((cmd == NL80211_CMD_COLOR_CHANGE_STARTED &&
             nla_put_u32(evt, NL80211_ATTR_COLOR_CHANGE_COUNT, count)) ||
            (cmd == NL80211_CMD_OBSS_COLOR_COLLISION &&
             nla_put_u64_64bit(evt, NL80211_ATTR_OBSS_COLOR_BITMAP,
                               color_bitmap, NL80211_ATTR_PAD)))
                goto free_evt;

        genlmsg_end(evt, genl_hdr);

        return genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy),
                                       evt, 0, NL80211_MCGRP_MLME, GFP_KERNEL);

free_evt:
        nlmsg_free(evt);
        return -EINVAL;
}
EXPORT_SYMBOL(cfg80211_bss_color_notify);

/*
 * Multicast an NL80211_CMD_RADAR_DETECT event.
 *
 * The event carries the wiphy index, @event as NL80211_ATTR_RADAR_EVENT
 * and @chandef as written by nl80211_send_chandef().  @netdev may be NULL;
 * when it is set, its ifindex and wdev id are included as well.  The event
 * goes to NL80211_MCGRP_MLME in the wiphy's network namespace.  On any
 * failure the message is freed and nothing is sent.
 */
void
nl80211_radar_notify(struct cfg80211_registered_device *rdev,
                     const struct cfg80211_chan_def *chandef,
                     enum nl80211_radar_event event,
                     struct net_device *netdev, gfp_t gfp)
{
        struct sk_buff *evt = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        void *genl_hdr;

        if (!evt)
                return;

        genl_hdr = nl80211hdr_put(evt, 0, 0, 0, NL80211_CMD_RADAR_DETECT);
        if (!genl_hdr)
                goto free_evt;

        if (nla_put_u32(evt, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto free_evt;

        /* NOP and radar events don't need a netdev parameter */
        if (netdev) {
                struct wireless_dev *wdev = netdev->ieee80211_ptr;

                if (nla_put_u32(evt, NL80211_ATTR_IFINDEX, netdev->ifindex))
                        goto free_evt;
                if (nla_put_u64_64bit(evt, NL80211_ATTR_WDEV, wdev_id(wdev),
                                      NL80211_ATTR_PAD))
                        goto free_evt;
        }

        if (nla_put_u32(evt, NL80211_ATTR_RADAR_EVENT, event) ||
            nl80211_send_chandef(evt, chandef))
                goto free_evt;

        genlmsg_end(evt, genl_hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), evt, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

free_evt:
        nlmsg_free(evt);
}

/*
 * Multicast an NL80211_CMD_STA_OPMODE_CHANGED event for the station @mac on
 * @dev to the MLME group.
 *
 * Only the fields flagged in sta_opmode->changed (SMPS mode, channel width,
 * number of spatial streams) are included.  A NULL @mac triggers a warning
 * and no event is sent; allocation or attribute failures drop the event.
 */
void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac,
                                       struct sta_opmode_info *sta_opmode,
                                       gfp_t gfp)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct sk_buff *skb;
        void *genl_hdr;

        if (WARN_ON(!mac))
                return;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        if (!skb)
                return;

        genl_hdr = nl80211hdr_put(skb, 0, 0, 0,
                                  NL80211_CMD_STA_OPMODE_CHANGED);
        if (!genl_hdr)
                goto drop;

        /* Identify the wiphy, interface and station first. */
        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
            nla_put_u32(skb, NL80211_ATTR_IFINDEX, dev->ifindex) ||
            nla_put(skb, NL80211_ATTR_MAC, ETH_ALEN, mac))
                goto drop;

        /* Then report only what actually changed. */
        if ((sta_opmode->changed & STA_OPMODE_SMPS_MODE_CHANGED) &&
            nla_put_u8(skb, NL80211_ATTR_SMPS_MODE, sta_opmode->smps_mode))
                goto drop;

        if ((sta_opmode->changed & STA_OPMODE_MAX_BW_CHANGED) &&
            nla_put_u32(skb, NL80211_ATTR_CHANNEL_WIDTH, sta_opmode->bw))
                goto drop;

        if ((sta_opmode->changed & STA_OPMODE_N_SS_CHANGED) &&
            nla_put_u8(skb, NL80211_ATTR_NSS, sta_opmode->rx_nss))
                goto drop;

        genlmsg_end(skb, genl_hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

drop:
        nlmsg_free(skb);
}
EXPORT_SYMBOL(cfg80211_sta_opmode_change_notify);

/*
 * Report the outcome of a client probe as an NL80211_CMD_PROBE_CLIENT event
 * multicast to the MLME group.
 *
 * The event carries the peer address and @cookie; the ACK flag is added only
 * when @acked is true and the ACK signal only when @is_valid_ack_signal is
 * true.  Allocation or attribute failures drop the event.
 */
void cfg80211_probe_status(struct net_device *dev, const u8 *addr,
                           u64 cookie, bool acked, s32 ack_signal,
                           bool is_valid_ack_signal, gfp_t gfp)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct sk_buff *skb;
        void *genl_hdr;

        trace_cfg80211_probe_status(dev, addr, cookie, acked);

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        if (!skb)
                return;

        genl_hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_PROBE_CLIENT);
        if (!genl_hdr)
                goto drop;

        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
            nla_put_u32(skb, NL80211_ATTR_IFINDEX, dev->ifindex))
                goto drop;

        if (nla_put(skb, NL80211_ATTR_MAC, ETH_ALEN, addr) ||
            nla_put_u64_64bit(skb, NL80211_ATTR_COOKIE, cookie,
                              NL80211_ATTR_PAD))
                goto drop;

        if (acked && nla_put_flag(skb, NL80211_ATTR_ACK))
                goto drop;

        if (is_valid_ack_signal &&
            nla_put_s32(skb, NL80211_ATTR_ACK_SIGNAL, ack_signal))
                goto drop;

        genlmsg_end(skb, genl_hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

drop:
        nlmsg_free(skb);
}
EXPORT_SYMBOL(cfg80211_probe_status);

/*
 * Forward a received beacon @frame of @len bytes to every socket on the
 * rdev's beacon registration list as a unicast NL80211_CMD_FRAME message.
 *
 * @freq is split into a MHz value and a kHz remainder; frequency and signal
 * attributes are omitted when the respective argument is zero.  The
 * registration list is walked under beacon_registrations_lock, hence the
 * GFP_ATOMIC allocations.  The walk stops at the first allocation or
 * attribute failure; registrations already served keep their message.
 */
void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame,
                                     size_t len, int freq, int sig_dbm)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct cfg80211_beacon_registration *listener;
        struct sk_buff *skb;
        void *genl_hdr;

        trace_cfg80211_report_obss_beacon(wiphy, frame, len, freq, sig_dbm);

        spin_lock_bh(&rdev->beacon_registrations_lock);
        list_for_each_entry(listener, &rdev->beacon_registrations, list) {
                skb = nlmsg_new(len + 100, GFP_ATOMIC);
                if (!skb)
                        break;

                genl_hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_FRAME);
                if (!genl_hdr)
                        goto discard;

                if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                        goto discard;

                if (freq &&
                    (nla_put_u32(skb, NL80211_ATTR_WIPHY_FREQ,
                                 KHZ_TO_MHZ(freq)) ||
                     nla_put_u32(skb, NL80211_ATTR_WIPHY_FREQ_OFFSET,
                                 freq % 1000)))
                        goto discard;

                if (sig_dbm &&
                    nla_put_u32(skb, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm))
                        goto discard;

                if (nla_put(skb, NL80211_ATTR_FRAME, len, frame))
                        goto discard;

                genlmsg_end(skb, genl_hdr);

                genlmsg_unicast(wiphy_net(&rdev->wiphy), skb,
                                listener->nlportid);
        }
        spin_unlock_bh(&rdev->beacon_registrations_lock);
        return;

discard:
        spin_unlock_bh(&rdev->beacon_registrations_lock);
        nlmsg_free(skb);
}
EXPORT_SYMBOL(cfg80211_report_obss_beacon_khz);

#ifdef CONFIG_PM
/*
 * Append one net-detect match as a nested attribute numbered @idx: its SSID
 * and, when the match lists channels, a nested frequency list.
 *
 * Returns 0 on success.  On failure the partially written nest for this
 * match is cancelled and -EMSGSIZE is returned.
 */
static int cfg80211_net_detect_put_match(struct sk_buff *msg, int idx,
                                         struct cfg80211_wowlan_nd_match *match)
{
        struct nlattr *entry, *freqs;
        int ch;

        entry = nla_nest_start_noflag(msg, idx);
        if (!entry)
                return -EMSGSIZE;

        /* The SSID attribute is optional in nl80211, but for
         * simplicity reasons it's always present in the
         * cfg80211 structure.  If a driver can't pass the
         * SSID, that needs to be changed.  A zero length SSID
         * is still a valid SSID (wildcard), so it cannot be
         * used for this purpose.
         */
        if (nla_put(msg, NL80211_ATTR_SSID, match->ssid.ssid_len,
                    match->ssid.ssid))
                goto cancel_entry;

        if (match->n_channels) {
                freqs = nla_nest_start_noflag(msg,
                                              NL80211_ATTR_SCAN_FREQUENCIES);
                if (!freqs)
                        goto cancel_entry;

                for (ch = 0; ch < match->n_channels; ch++) {
                        if (nla_put_u32(msg, ch, match->channels[ch])) {
                                nla_nest_cancel(msg, freqs);
                                goto cancel_entry;
                        }
                }

                nla_nest_end(msg, freqs);
        }

        nla_nest_end(msg, entry);
        return 0;

cancel_entry:
        nla_nest_cancel(msg, entry);
        return -EMSGSIZE;
}

/*
 * Add the NL80211_WOWLAN_TRIG_NET_DETECT_RESULTS nest to @msg, holding one
 * entry per match in wakeup->net_detect.
 *
 * Returns -EMSGSIZE only if the outer nest cannot be started.  If a match
 * does not fit, the matches written so far are kept, the nest is closed and
 * 0 is returned, so the result list may be truncated.
 */
static int cfg80211_net_detect_results(struct sk_buff *msg,
                                       struct cfg80211_wowlan_wakeup *wakeup)
{
        struct cfg80211_wowlan_nd_info *nd = wakeup->net_detect;
        struct nlattr *results;
        int idx;

        results = nla_nest_start_noflag(msg,
                                        NL80211_WOWLAN_TRIG_NET_DETECT_RESULTS);
        if (!results)
                return -EMSGSIZE;

        for (idx = 0; idx < nd->n_matches; idx++) {
                /* Stop at the first match that does not fit. */
                if (cfg80211_net_detect_put_match(msg, idx, nd->matches[idx]))
                        break;
        }

        nla_nest_end(msg, results);
        return 0;
}

/*
 * Report a WoWLAN wakeup to userspace as an NL80211_CMD_SET_WOWLAN event
 * multicast to the MLME group.
 *
 * @wakeup may be NULL, in which case only the wiphy/wdev (and interface
 * index, if the wdev has a netdev) are sent.  Otherwise every trigger set in
 * @wakeup is added inside an NL80211_ATTR_WOWLAN_TRIGGERS nest, followed by
 * the wakeup packet and net-detect results when present.  Any allocation or
 * attribute failure drops the whole event.
 */
void cfg80211_report_wowlan_wakeup(struct wireless_dev *wdev,
                                   struct cfg80211_wowlan_wakeup *wakeup,
                                   gfp_t gfp)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct nlattr *triggers;
        struct sk_buff *skb;
        void *genl_hdr;
        int alloc_len = 200;

        trace_cfg80211_report_wowlan_wakeup(wdev->wiphy, wdev, wakeup);

        /* Account for the wakeup packet bytes that will be copied in. */
        if (wakeup)
                alloc_len += wakeup->packet_present_len;

        skb = nlmsg_new(alloc_len, gfp);
        if (!skb)
                return;

        genl_hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_SET_WOWLAN);
        if (!genl_hdr)
                goto discard;

        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
            nla_put_u64_64bit(skb, NL80211_ATTR_WDEV, wdev_id(wdev),
                              NL80211_ATTR_PAD) ||
            (wdev->netdev &&
             nla_put_u32(skb, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex)))
                goto discard;

        /* Nothing more to describe without wakeup information. */
        if (!wakeup)
                goto send;

        triggers = nla_nest_start_noflag(skb, NL80211_ATTR_WOWLAN_TRIGGERS);
        if (!triggers)
                goto discard;

        /* Simple triggers, emitted in a fixed order. */
        if ((wakeup->disconnect &&
             nla_put_flag(skb, NL80211_WOWLAN_TRIG_DISCONNECT)) ||
            (wakeup->magic_pkt &&
             nla_put_flag(skb, NL80211_WOWLAN_TRIG_MAGIC_PKT)) ||
            (wakeup->gtk_rekey_failure &&
             nla_put_flag(skb, NL80211_WOWLAN_TRIG_GTK_REKEY_FAILURE)) ||
            (wakeup->eap_identity_req &&
             nla_put_flag(skb, NL80211_WOWLAN_TRIG_EAP_IDENT_REQUEST)) ||
            (wakeup->four_way_handshake &&
             nla_put_flag(skb, NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE)) ||
            (wakeup->rfkill_release &&
             nla_put_flag(skb, NL80211_WOWLAN_TRIG_RFKILL_RELEASE)) ||
            (wakeup->pattern_idx >= 0 &&
             nla_put_u32(skb, NL80211_WOWLAN_TRIG_PKT_PATTERN,
                         wakeup->pattern_idx)) ||
            (wakeup->tcp_match &&
             nla_put_flag(skb, NL80211_WOWLAN_TRIG_WAKEUP_TCP_MATCH)) ||
            (wakeup->tcp_connlost &&
             nla_put_flag(skb, NL80211_WOWLAN_TRIG_WAKEUP_TCP_CONNLOST)) ||
            (wakeup->tcp_nomoretokens &&
             nla_put_flag(skb,
                          NL80211_WOWLAN_TRIG_WAKEUP_TCP_NOMORETOKENS)) ||
            (wakeup->unprot_deauth_disassoc &&
             nla_put_flag(skb,
                          NL80211_WOWLAN_TRIG_UNPROTECTED_DEAUTH_DISASSOC)))
                goto discard;

        if (wakeup->packet) {
                u32 data_attr, len_attr;

                /* The attribute pair depends on the packet's framing. */
                if (wakeup->packet_80211) {
                        data_attr = NL80211_WOWLAN_TRIG_WAKEUP_PKT_80211;
                        len_attr = NL80211_WOWLAN_TRIG_WAKEUP_PKT_80211_LEN;
                } else {
                        data_attr = NL80211_WOWLAN_TRIG_WAKEUP_PKT_8023;
                        len_attr = NL80211_WOWLAN_TRIG_WAKEUP_PKT_8023_LEN;
                }

                if ((wakeup->packet_len &&
                     nla_put_u32(skb, len_attr, wakeup->packet_len)) ||
                    nla_put(skb, data_attr, wakeup->packet_present_len,
                            wakeup->packet))
                        goto discard;
        }

        if (wakeup->net_detect &&
            cfg80211_net_detect_results(skb, wakeup))
                goto discard;

        nla_nest_end(skb, triggers);

send:
        genlmsg_end(skb, genl_hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

discard:
        nlmsg_free(skb);
}
EXPORT_SYMBOL(cfg80211_report_wowlan_wakeup);
#endif

/*
 * Ask userspace to perform the TDLS operation @oper towards @peer by
 * multicasting an NL80211_CMD_TDLS_OPER event to the MLME group.
 *
 * The reason code attribute is only included when @reason_code is non-zero.
 * Allocation or attribute failures drop the event.
 */
void cfg80211_tdls_oper_request(struct net_device *dev, const u8 *peer,
                                enum nl80211_tdls_operation oper,
                                u16 reason_code, gfp_t gfp)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct sk_buff *skb;
        void *genl_hdr;

        trace_cfg80211_tdls_oper_request(wdev->wiphy, dev, peer, oper,
                                         reason_code);

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        if (!skb)
                return;

        genl_hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_TDLS_OPER);
        if (!genl_hdr)
                goto drop;

        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
            nla_put_u32(skb, NL80211_ATTR_IFINDEX, dev->ifindex))
                goto drop;

        if (nla_put_u8(skb, NL80211_ATTR_TDLS_OPERATION, oper) ||
            nla_put(skb, NL80211_ATTR_MAC, ETH_ALEN, peer))
                goto drop;

        if (reason_code > 0 &&
            nla_put_u16(skb, NL80211_ATTR_REASON_CODE, reason_code))
                goto drop;

        genlmsg_end(skb, genl_hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

drop:
        nlmsg_free(skb);
}
EXPORT_SYMBOL(cfg80211_tdls_oper_request);

/*
 * Netlink notifier callback: clean up per-socket state when a generic
 * netlink socket is released.
 *
 * Events other than NETLINK_URELEASE on NETLINK_GENERIC are ignored
 * (NOTIFY_DONE).  For a matching release, every registered device is walked
 * under rcu_read_lock() and everything tied to the released port id is
 * torn down or handed to deferred work; the regulatory core is notified
 * last.  Returns NOTIFY_OK in that case.
 */
static int nl80211_netlink_notify(struct notifier_block * nb,
                                  unsigned long state,
                                  void *_notify)
{
        struct netlink_notify *notify = _notify;
        struct cfg80211_registered_device *rdev;
        struct wireless_dev *wdev;
        struct cfg80211_beacon_registration *reg, *tmp;

        if (state != NETLINK_URELEASE || notify->protocol != NETLINK_GENERIC)
                return NOTIFY_DONE;

        rcu_read_lock();

        list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) {
                struct cfg80211_sched_scan_request *sched_scan_req;

                /*
                 * Flag scheduled scans owned by the released socket and
                 * queue the work item that stops them.
                 */
                list_for_each_entry_rcu(sched_scan_req,
                                        &rdev->sched_scan_req_list,
                                        list) {
                        if (sched_scan_req->owner_nlportid == notify->portid) {
                                sched_scan_req->nl_owner_dead = true;
                                wiphy_work_queue(&rdev->wiphy,
                                                 &rdev->sched_scan_stop_wk);
                        }
                }

                list_for_each_entry_rcu(wdev, &rdev->wiphy.wdev_list, list) {
                        cfg80211_mlme_unregister_socket(wdev, notify->portid);

                        /*
                         * An interface owned by the socket is marked dead
                         * and left to destroy_work; otherwise, if the socket
                         * only owned the connection, the disconnect work is
                         * scheduled instead.
                         */
                        if (wdev->owner_nlportid == notify->portid) {
                                wdev->nl_owner_dead = true;
                                schedule_work(&rdev->destroy_work);
                        } else if (wdev->conn_owner_nlportid == notify->portid) {
                                schedule_work(&wdev->disconnect_wk);
                        }

                        cfg80211_release_pmsr(wdev, notify->portid);
                }

                /*
                 * Drop the socket's beacon registration; the search stops
                 * at the first entry with a matching port id.
                 */
                spin_lock_bh(&rdev->beacon_registrations_lock);
                list_for_each_entry_safe(reg, tmp, &rdev->beacon_registrations,
                                         list) {
                        if (reg->nlportid == notify->portid) {
                                list_del(&reg->list);
                                kfree(reg);
                                break;
                        }
                }
                spin_unlock_bh(&rdev->beacon_registrations_lock);
        }

        rcu_read_unlock();

        /*
         * It is possible that the user space process that is controlling the
         * indoor setting disappeared, so notify the regulatory core.
         */
        regulatory_netlink_notify(notify->portid);
        return NOTIFY_OK;
}

/*
 * Notifier block that routes netlink events to nl80211_netlink_notify();
 * registered in nl80211_init() and unregistered in nl80211_exit().
 */
static struct notifier_block nl80211_netlink_notifier = {
        .notifier_call = nl80211_netlink_notify,
};

/*
 * Deliver fast-transition information for @netdev to userspace as an
 * NL80211_CMD_FT_EVENT multicast on the MLME group.
 *
 * Nothing is sent when ft_event->target_ap is NULL.  The IE and RIC IE
 * attributes are only added when the corresponding pointer is set.  The
 * message is allocated with GFP_KERNEL; allocation or attribute failures
 * drop the event.
 */
void cfg80211_ft_event(struct net_device *netdev,
                       struct cfg80211_ft_event_params *ft_event)
{
        struct wiphy *wiphy = netdev->ieee80211_ptr->wiphy;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct sk_buff *skb;
        void *genl_hdr;

        trace_cfg80211_ft_event(wiphy, netdev, ft_event);

        if (!ft_event->target_ap)
                return;

        /* Fixed attributes plus room for both IE blobs. */
        skb = nlmsg_new(100 + ft_event->ies_len + ft_event->ric_ies_len,
                        GFP_KERNEL);
        if (!skb)
                return;

        genl_hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_FT_EVENT);
        if (!genl_hdr)
                goto drop;

        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
            nla_put_u32(skb, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
            nla_put(skb, NL80211_ATTR_MAC, ETH_ALEN, ft_event->target_ap))
                goto drop;

        if ((ft_event->ies &&
             nla_put(skb, NL80211_ATTR_IE, ft_event->ies_len,
                     ft_event->ies)) ||
            (ft_event->ric_ies &&
             nla_put(skb, NL80211_ATTR_IE_RIC, ft_event->ric_ies_len,
                     ft_event->ric_ies)))
                goto drop;

        genlmsg_end(skb, genl_hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), skb, 0,
                                NL80211_MCGRP_MLME, GFP_KERNEL);
        return;

drop:
        nlmsg_free(skb);
}
EXPORT_SYMBOL(cfg80211_ft_event);

/*
 * Tell the socket recorded in rdev->crit_proto_nlportid that the critical
 * protocol session has stopped, using a unicast
 * NL80211_CMD_CRIT_PROTOCOL_STOP message.
 *
 * Does nothing when no port id is recorded.  The recorded port id is
 * cleared before the message is built, so it stays cleared even if building
 * or allocating the message fails.
 */
void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        u32 nlportid = rdev->crit_proto_nlportid;
        struct sk_buff *skb;
        void *genl_hdr;

        if (!nlportid)
                return;

        rdev->crit_proto_nlportid = 0;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        if (!skb)
                return;

        genl_hdr = nl80211hdr_put(skb, 0, 0, 0,
                                  NL80211_CMD_CRIT_PROTOCOL_STOP);
        if (!genl_hdr)
                goto drop;

        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
                goto drop;

        if (nla_put_u64_64bit(skb, NL80211_ATTR_WDEV, wdev_id(wdev),
                              NL80211_ATTR_PAD))
                goto drop;

        genlmsg_end(skb, genl_hdr);

        genlmsg_unicast(wiphy_net(&rdev->wiphy), skb, nlportid);
        return;

drop:
        nlmsg_free(skb);
}
EXPORT_SYMBOL(cfg80211_crit_proto_stopped);

/*
 * Multicast an NL80211_CMD_STOP_AP event for @wdev to the MLME group.
 *
 * @link_id is only included when the wdev has valid_links set.  The message
 * is allocated with GFP_KERNEL; allocation or attribute failures drop the
 * event.
 */
void nl80211_send_ap_stopped(struct wireless_dev *wdev, unsigned int link_id)
{
        struct wiphy *wiphy = wdev->wiphy;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        void *genl_hdr;

        if (!skb)
                return;

        genl_hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_STOP_AP);
        if (!genl_hdr)
                goto drop;

        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
            nla_put_u32(skb, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex) ||
            nla_put_u64_64bit(skb, NL80211_ATTR_WDEV, wdev_id(wdev),
                              NL80211_ATTR_PAD))
                goto drop;

        if (wdev->valid_links &&
            nla_put_u8(skb, NL80211_ATTR_MLO_LINK_ID, link_id))
                goto drop;

        genlmsg_end(skb, genl_hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(wiphy), skb, 0,
                                NL80211_MCGRP_MLME, GFP_KERNEL);
        return;

drop:
        nlmsg_free(skb);
}

/*
 * Ask the connection owner socket to perform external authentication by
 * sending it a unicast NL80211_CMD_EXTERNAL_AUTH message built from @params.
 *
 * Returns 0 once the message has been handed to genlmsg_unicast(), -EINVAL
 * if the wdev has no connection owner port id, -ENOMEM if the message cannot
 * be allocated and -ENOBUFS if the header or an attribute does not fit.
 */
int cfg80211_external_auth_request(struct net_device *dev,
                                   struct cfg80211_external_auth_params *params,
                                   gfp_t gfp)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct sk_buff *skb;
        void *genl_hdr;
        int akm_err;

        if (!wdev->conn_owner_nlportid)
                return -EINVAL;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        if (!skb)
                return -ENOMEM;

        genl_hdr = nl80211hdr_put(skb, 0, 0, 0, NL80211_CMD_EXTERNAL_AUTH);
        if (!genl_hdr)
                goto drop;

        /* Some historical mistakes in drivers <-> userspace interface (notably
         * between drivers and wpa_supplicant) led to a big-endian conversion
         * being needed on NL80211_ATTR_AKM_SUITES _only_ when its value is
         * WLAN_AKM_SUITE_SAE. This is now fixed on userspace side, but for the
         * benefit of older wpa_supplicant versions, send this particular value
         * in big-endian. Note that newer wpa_supplicant will also detect this
         * particular value in big endian still, so it all continues to work.
         */
        if (params->key_mgmt_suite == WLAN_AKM_SUITE_SAE)
                akm_err = nla_put_be32(skb, NL80211_ATTR_AKM_SUITES,
                                       cpu_to_be32(WLAN_AKM_SUITE_SAE));
        else
                akm_err = nla_put_u32(skb, NL80211_ATTR_AKM_SUITES,
                                      params->key_mgmt_suite);
        if (akm_err)
                goto drop;

        if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
            nla_put_u32(skb, NL80211_ATTR_IFINDEX, dev->ifindex) ||
            nla_put_u32(skb, NL80211_ATTR_EXTERNAL_AUTH_ACTION,
                        params->action))
                goto drop;

        if (nla_put(skb, NL80211_ATTR_BSSID, ETH_ALEN, params->bssid) ||
            nla_put(skb, NL80211_ATTR_SSID, params->ssid.ssid_len,
                    params->ssid.ssid))
                goto drop;

        /* The MLD address is only reported when it is non-zero. */
        if (!is_zero_ether_addr(params->mld_addr) &&
            nla_put(skb, NL80211_ATTR_MLD_ADDR, ETH_ALEN, params->mld_addr))
                goto drop;

        genlmsg_end(skb, genl_hdr);
        genlmsg_unicast(wiphy_net(&rdev->wiphy), skb,
                        wdev->conn_owner_nlportid);
        return 0;

drop:
        nlmsg_free(skb);
        return -ENOBUFS;
}
EXPORT_SYMBOL(cfg80211_external_auth_request);

/*
 * Forward OWE peer information for @netdev to userspace as an
 * NL80211_CMD_UPDATE_OWE_INFO event multicast to the MLME group.
 *
 * The event carries the peer address and its IEs.  An @owe_info without IEs
 * (ie_len == 0) is never sent.  When assoc_link_id is not -1 the link id is
 * added, plus the peer MLD address if that address is non-zero.
 *
 * Allocation or attribute failures drop the event.  The message is simply
 * freed on error, like the other notifiers in this file: trimming it with
 * genlmsg_cancel() right before nlmsg_free() would serve no purpose.
 */
void cfg80211_update_owe_info_event(struct net_device *netdev,
                                    struct cfg80211_update_owe_info *owe_info,
                                    gfp_t gfp)
{
        struct wiphy *wiphy = netdev->ieee80211_ptr->wiphy;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct sk_buff *msg;
        void *hdr;

        trace_cfg80211_update_owe_info_event(wiphy, netdev, owe_info);

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        if (!msg)
                return;

        hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_UPDATE_OWE_INFO);
        if (!hdr)
                goto nla_put_failure;

        if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
            nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) ||
            nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, owe_info->peer))
                goto nla_put_failure;

        /* The IEs are mandatory: an empty IE buffer aborts the event. */
        if (!owe_info->ie_len ||
            nla_put(msg, NL80211_ATTR_IE, owe_info->ie_len, owe_info->ie))
                goto nla_put_failure;

        if (owe_info->assoc_link_id != -1) {
                if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID,
                               owe_info->assoc_link_id))
                        goto nla_put_failure;

                if (!is_zero_ether_addr(owe_info->peer_mld_addr) &&
                    nla_put(msg, NL80211_ATTR_MLD_ADDR, ETH_ALEN,
                            owe_info->peer_mld_addr))
                        goto nla_put_failure;
        }

        genlmsg_end(msg, hdr);

        genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
                                NL80211_MCGRP_MLME, gfp);
        return;

nla_put_failure:
        nlmsg_free(msg);
}
EXPORT_SYMBOL(cfg80211_update_owe_info_event);

/*
 * Trigger reg_check_channels() for a station interface whose wiphy supports
 * a relaxation that makes re-checking worthwhile: either the DFS_CONCURRENT
 * extended feature, or (when CONFIG_CFG80211_REG_RELAX_NO_IR is enabled) the
 * REGULATORY_ENABLE_RELAX_NO_IR regulatory flag.
 */
void cfg80211_schedule_channels_check(struct wireless_dev *wdev)
{
        struct wiphy *wiphy = wdev->wiphy;

        /* Only station interfaces are of interest here. */
        if (wdev->iftype != NL80211_IFTYPE_STATION)
                return;

        /* Schedule channels check if NO_IR or DFS relaxations are supported */
        if (wiphy_ext_feature_isset(wiphy,
                                    NL80211_EXT_FEATURE_DFS_CONCURRENT) ||
            (IS_ENABLED(CONFIG_CFG80211_REG_RELAX_NO_IR) &&
             (wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR)))
                reg_check_channels();
}
EXPORT_SYMBOL(cfg80211_schedule_channels_check);

/* initialisation/exit functions */

/*
 * Register the nl80211 generic netlink family and the netlink notifier.
 *
 * Returns 0 on success or the error from whichever registration failed.  If
 * the notifier cannot be registered, the family is unregistered again so
 * nothing stays half set up.
 */
int __init nl80211_init(void)
{
        int ret = genl_register_family(&nl80211_fam);

        if (ret)
                return ret;

        ret = netlink_register_notifier(&nl80211_netlink_notifier);
        if (ret)
                genl_unregister_family(&nl80211_fam);

        return ret;
}

/*
 * Undo nl80211_init(): unregister the netlink notifier first, then the
 * generic netlink family (the reverse of the registration order).
 */
void nl80211_exit(void)
{
        netlink_unregister_notifier(&nl80211_netlink_notifier);
        genl_unregister_family(&nl80211_fam);
}





















    3 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  include/linux/signalfd.h
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 */
#ifndef _LINUX_SIGNALFD_H
#define _LINUX_SIGNALFD_H

#include <uapi/linux/signalfd.h>
#include <linux/sched/signal.h>

#ifdef CONFIG_SIGNALFD

/*
 * Wake up anyone waiting on the signalfd wait queue of @tsk's sighand.
 * The wake-up is skipped when the queue has no waiters, which is the
 * expected case.  @sig is not used by this implementation.
 */
static inline void signalfd_notify(struct task_struct *tsk, int sig)
{
        if (!unlikely(waitqueue_active(&tsk->sighand->signalfd_wqh)))
                return;

        wake_up(&tsk->sighand->signalfd_wqh);
}

extern void signalfd_cleanup(struct sighand_struct *sighand);

#else /* CONFIG_SIGNALFD */

/* No-op stub: with CONFIG_SIGNALFD disabled there is nothing to wake up. */
static inline void signalfd_notify(struct task_struct *tsk, int sig) { }

/* No-op stub used when CONFIG_SIGNALFD is disabled. */
static inline void signalfd_cleanup(struct sighand_struct *sighand) { }

#endif /* CONFIG_SIGNALFD */

#endif /* _LINUX_SIGNALFD_H */

















































































































































































































































































































































































































































































































































































































































    4 

    2 















    3 
















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  NET  is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the Ethernet handlers.
 *
 * Version:        @(#)eth.h        1.0.4        05/13/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 *                Relocated to include/linux where it belongs by Alan Cox
 *                                                        <gw4pts@gw4pts.ampr.org>
 */
#ifndef _LINUX_ETHERDEVICE_H
#define _LINUX_ETHERDEVICE_H

#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/random.h>
#include <linux/crc32.h>
#include <asm/unaligned.h>
#include <asm/bitsperlong.h>

#ifdef __KERNEL__
struct device;
struct fwnode_handle;

int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
int platform_get_ethdev_address(struct device *dev, struct net_device *netdev);
unsigned char *arch_get_platform_mac_address(void);
int nvmem_get_mac_address(struct device *dev, void *addrbuf);
int device_get_mac_address(struct device *dev, char *addr);
int device_get_ethdev_address(struct device *dev, struct net_device *netdev);
int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr);

u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len);
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
extern const struct header_ops eth_header_ops;

int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
               const void *daddr, const void *saddr, unsigned len);
int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr);
int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh,
                     __be16 type);
void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev,
                             const unsigned char *haddr);
__be16 eth_header_parse_protocol(const struct sk_buff *skb);
int eth_prepare_mac_addr_change(struct net_device *dev, void *p);
void eth_commit_mac_addr_change(struct net_device *dev, void *p);
int eth_mac_addr(struct net_device *dev, void *p);
int eth_validate_addr(struct net_device *dev);

/*
 * Ethernet net_device allocators.  Only the prototypes and convenience
 * macros live here; the allocation itself is implemented out of line.
 */
struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
                                            unsigned int rxqs);
/* Shorthand that expands to alloc_etherdev_mq(sizeof_priv, 1). */
#define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1)
/* Passes @count as both the txqs and the rxqs argument. */
#define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count)

/* Same queue arguments as alloc_etherdev_mqs(), plus a struct device. */
struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
                                           unsigned int txqs,
                                           unsigned int rxqs);
/* Shorthand that requests one TX queue and one RX queue. */
#define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1)

struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb);
int eth_gro_complete(struct sk_buff *skb, int nhoff);

/* Reserved Ethernet Addresses per IEEE 802.1Q */
/* First address of the reserved block: 01:80:c2:00:00:00. */
static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) =
{ 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
/* eth_stp_addr is another name for the same array. */
#define eth_stp_addr eth_reserved_addr_base

/* Prefix compared by ether_addr_is_ipv4_mcast(): 01:00:5e:00:00:00. */
static const u8 eth_ipv4_mcast_addr_base[ETH_ALEN] __aligned(2) =
{ 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 };

/* Prefix compared by ether_addr_is_ipv6_mcast(): 33:33:00:00:00:00. */
static const u8 eth_ipv6_mcast_addr_base[ETH_ALEN] __aligned(2) =
{ 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 };

/**
 * is_link_local_ether_addr - Determine if given Ethernet address is link-local
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return true if address is link local reserved addr (01:80:c2:00:00:0X) per
 * IEEE 802.1Q 8.6.3 Frame filtering.
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_link_local_ether_addr(const u8 *addr)
{
        __be16 *a = (__be16 *)addr;
        static const __be16 *b = (const __be16 *)eth_reserved_addr_base;
        static const __be16 m = cpu_to_be16(0xfff0);

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        return (((*(const u32 *)addr) ^ (*(const u32 *)b)) |
                (__force int)((a[2] ^ b[2]) & m)) == 0;
#else
        return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0;
#endif
}

/**
 * is_zero_ether_addr - Determine if the given Ethernet address is all zeros.
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * ORs the six octets together (as wider words) and returns true only when
 * no bit is set, i.e. the address is 00:00:00:00:00:00.
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_zero_ether_addr(const u8 *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        const u32 head = *(const u32 *)addr;
        const u16 tail = *(const u16 *)(addr + 4);

        return !(head | tail);
#else
        const u16 *words = (const u16 *)addr;

        return !(words[0] | words[1] | words[2]);
#endif
}

/**
 * is_multicast_ether_addr - Determine if the Ethernet address is a multicast.
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Tests the least significant bit of the first octet.  The octet is read
 * as part of a 16-bit (or, with efficient unaligned access, 32-bit) load.
 *
 * Return true if the address is a multicast address.
 * By definition the broadcast address is also a multicast address.
 */
static inline bool is_multicast_ether_addr(const u8 *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        const u32 lead = *(const u32 *)addr;
#else
        const u16 lead = *(const u16 *)addr;
#endif
#ifdef __BIG_ENDIAN
        /* first octet is the most significant byte of the loaded word */
        return (lead >> ((sizeof(lead) * 8) - 8)) & 0x01;
#else
        /* first octet is the least significant byte of the loaded word */
        return lead & 0x01;
#endif
}

/**
 * is_multicast_ether_addr_64bits - Multicast test on a padded address
 * @addr: Pointer to the Ethernet address; the fast path loads eight bytes
 *        starting here, so two bytes after the address must be readable
 *
 * On 64-bit builds with efficient unaligned access the multicast bit is
 * taken from a single 64-bit load; otherwise this defers to
 * is_multicast_ether_addr().
 */
static inline bool is_multicast_ether_addr_64bits(const u8 *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const u64 wide = *(const u64 *)addr;

#ifdef __BIG_ENDIAN
        return (wide >> 56) & 0x01;
#else
        return wide & 0x01;
#endif
#else
        return is_multicast_ether_addr(addr);
#endif
}

/**
 * is_local_ether_addr - Test the locally-assigned bit of an Ethernet address (IEEE 802).
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return true if bit 0x02 of the first octet is set, i.e. the address is
 * a locally assigned one.
 */
static inline bool is_local_ether_addr(const u8 *addr)
{
        return (addr[0] & 0x02) != 0;
}

/**
 * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * ANDs the three 16-bit halves of the address; the result is 0xffff only
 * when every bit of the address is set (FF:FF:FF:FF:FF:FF).
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_broadcast_ether_addr(const u8 *addr)
{
        const u16 *words = (const u16 *)addr;

        return (words[0] & words[1] & words[2]) == 0xffff;
}

/**
 * is_unicast_ether_addr - Determine if the Ethernet address is unicast
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return true if the address is not a multicast address, as decided by
 * is_multicast_ether_addr().
 */
static inline bool is_unicast_ether_addr(const u8 *addr)
{
        if (is_multicast_ether_addr(addr))
                return false;

        return true;
}

/**
 * is_valid_ether_addr - Determine if the given Ethernet address is valid
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * A valid Ethernet address (MAC) is neither a multicast address nor
 * 00:00:00:00:00:00.  FF:FF:FF:FF:FF:FF is rejected as well because the
 * multicast test already covers it.
 *
 * Return true if the address is valid.
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_valid_ether_addr(const u8 *addr)
{
        /* broadcast has the multicast bit set, so no separate check */
        if (is_multicast_ether_addr(addr))
                return false;

        return !is_zero_ether_addr(addr);
}

/**
 * eth_proto_is_802_3 - Determine if a given Ethertype/length is a protocol
 * @proto: Ethertype/length value to be tested
 *
 * Check that the value from the Ethertype/length field is a valid Ethertype.
 *
 * Return true if the value is an 802.3 supported Ethertype.
 */
static inline bool eth_proto_is_802_3(__be16 proto)
{
        const u16 floor = (__force u16)htons(ETH_P_802_3_MIN);

#ifndef __BIG_ENDIAN
        /* little-endian CPU: drop the bits representing the LSB */
        proto &= htons(0xFF00);
#endif
        /* LSB can be ignored, so compare the raw 16-bit values */
        return (__force u16)proto >= floor;
}

/**
 * eth_random_addr - Fill in a random, locally assigned unicast address
 * @addr: Pointer to a six-byte array that receives the Ethernet address
 *
 * Draws ETH_ALEN random bytes and then fixes up the first octet so that
 * the multicast bit is clear and the local assignment bit is set.
 */
static inline void eth_random_addr(u8 *addr)
{
        get_random_bytes(addr, ETH_ALEN);
        /* 0xfe clears the multicast bit, 0x02 sets local assignment (IEEE802) */
        addr[0] = (addr[0] & 0xfe) | 0x02;
}

/**
 * eth_broadcast_addr - Write the broadcast address into a buffer
 * @addr: Pointer to a six-byte array that receives the Ethernet address
 *
 * Sets all ETH_ALEN octets of @addr to 0xff.
 */
static inline void eth_broadcast_addr(u8 *addr)
{
        memset(addr, 0xff, ETH_ALEN);
}

/**
 * eth_zero_addr - Write the all-zero address into a buffer
 * @addr: Pointer to a six-byte array that receives the Ethernet address
 *
 * Clears all ETH_ALEN octets of @addr.
 */
static inline void eth_zero_addr(u8 *addr)
{
        memset(addr, 0x00, ETH_ALEN);
}

/**
 * eth_hw_addr_random - Give a net device a random Ethernet address and
 * flag it as such
 * @dev: pointer to net_device structure
 *
 * Builds a random address with eth_random_addr(), installs it with
 * __dev_addr_set() and records NET_ADDR_RANDOM in addr_assign_type so the
 * state can be read by sysfs and be used by userspace.
 */
static inline void eth_hw_addr_random(struct net_device *dev)
{
        u8 mac[ETH_ALEN];

        eth_random_addr(mac);
        __dev_addr_set(dev, mac, ETH_ALEN);
        dev->addr_assign_type = NET_ADDR_RANDOM;
}

/**
 * eth_hw_addr_crc - CRC of the address held in a netdev_hw_addr
 * @ha: pointer to hardware address
 *
 * Return ether_crc() computed over the ETH_ALEN bytes of @ha->addr, as a
 * basis for filter hashes.
 */
static inline u32 eth_hw_addr_crc(struct netdev_hw_addr *ha)
{
        return ether_crc(ETH_ALEN, ha->addr);
}

/**
 * ether_addr_copy - Copy an Ethernet address
 * @dst: Pointer to a six-byte array Ethernet address destination
 * @src: Pointer to a six-byte array Ethernet address source
 *
 * Copies six octets from @src to @dst using word-sized loads and stores.
 *
 * Please note: dst & src must both be aligned to u16.
 */
static inline void ether_addr_copy(u8 *dst, const u8 *src)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        /* octets 0-3 as one 32-bit move, then octets 4-5 */
        *(u32 *)dst = *(const u32 *)src;
        *(u16 *)(dst + 4) = *(const u16 *)(src + 4);
#else
        const u16 *from = (const u16 *)src;
        u16 *to = (u16 *)dst;

        to[0] = from[0];
        to[1] = from[1];
        to[2] = from[2];
#endif
}

/**
 * eth_hw_addr_set - Install an Ethernet address on a net_device
 * @dev: pointer to net_device structure
 * @addr: address to assign
 *
 * Hands the ETH_ALEN bytes at @addr to __dev_addr_set().  This helper
 * does not touch addr_assign_type.
 */
static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr)
{
        __dev_addr_set(dev, addr, ETH_ALEN);
}

/**
 * eth_hw_addr_inherit - Take over the dev_addr of another net_device
 * @dst: pointer to net_device to copy dev_addr to
 * @src: pointer to net_device to copy dev_addr from
 *
 * @dst receives both the address attribute (addr_assign_type) and the
 * Ethernet address itself from @src.
 */
static inline void eth_hw_addr_inherit(struct net_device *dst,
                                       struct net_device *src)
{
        /* attribute first, then the address via eth_hw_addr_set() */
        dst->addr_assign_type = src->addr_assign_type;
        eth_hw_addr_set(dst, src->dev_addr);
}

/**
 * ether_addr_equal - Compare two Ethernet addresses
 * @addr1: Pointer to a six-byte array containing the Ethernet address
 * @addr2: Pointer other six-byte array containing the Ethernet address
 *
 * XORs the two addresses word by word and ORs the results; the addresses
 * are equal exactly when no bit survives.  Returns true if equal.
 *
 * Please note: addr1 & addr2 must both be aligned to u16.
 */
static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        u32 diff = ((*(const u32 *)addr1) ^ (*(const u32 *)addr2)) |
                   ((*(const u16 *)(addr1 + 4)) ^ (*(const u16 *)(addr2 + 4)));

        return !diff;
#else
        const u16 *lhs = (const u16 *)addr1;
        const u16 *rhs = (const u16 *)addr2;

        return !((lhs[0] ^ rhs[0]) | (lhs[1] ^ rhs[1]) | (lhs[2] ^ rhs[2]));
#endif
}

/**
 * ether_addr_equal_64bits - Compare two Ethernet addresses
 * @addr1: Pointer to an array of 8 bytes
 * @addr2: Pointer to an other array of 8 bytes
 *
 * Compare two Ethernet addresses, returns true if equal, false otherwise.
 *
 * The function doesn't need any conditional branches and possibly uses
 * word memory accesses on CPU allowing cheap unaligned memory reads.
 * arrays = { byte1, byte2, byte3, byte4, byte5, byte6, pad1, pad2 }
 * The two pad bytes are loaded but shifted out before the comparison.
 *
 * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits.
 */

static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        u64 diff = (*(const u64 *)addr1) ^ (*(const u64 *)addr2);

#ifdef __BIG_ENDIAN
        /* pad bytes occupy the low 16 bits */
        return !(diff >> 16);
#else
        /* pad bytes occupy the high 16 bits */
        return !(diff << 16);
#endif
#else
        return ether_addr_equal(addr1, addr2);
#endif
}

/**
 * ether_addr_equal_unaligned - Compare two not u16 aligned Ethernet addresses
 * @addr1: Pointer to a six-byte array containing the Ethernet address
 * @addr2: Pointer other six-byte array containing the Ethernet address
 *
 * Returns true if the two addresses are equal.  Uses ether_addr_equal()
 * when unaligned access is efficient and a byte-wise memcmp() otherwise.
 *
 * Please note: Use only when any Ethernet address may not be u16 aligned.
 */
static inline bool ether_addr_equal_unaligned(const u8 *addr1, const u8 *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        return ether_addr_equal(addr1, addr2);
#else
        return !memcmp(addr1, addr2, ETH_ALEN);
#endif
}

/**
 * ether_addr_equal_masked - Compare two Ethernet addresses with a mask
 * @addr1: Pointer to a six-byte array containing the 1st Ethernet address
 * @addr2: Pointer to a six-byte array containing the 2nd Ethernet address
 * @mask: Pointer to a six-byte array containing the Ethernet address bitmask
 *
 * Returns true if the two addresses agree in every bit position that is
 * set in @mask; bits cleared in @mask are ignored.  Using a mask with all
 * bits set is a slower ether_addr_equal.
 */
static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2,
                                           const u8 *mask)
{
        int idx = 0;

        while (idx < ETH_ALEN) {
                const u8 differing = addr1[idx] ^ addr2[idx];

                /* stop at the first octet that differs under the mask */
                if (differing & mask[idx])
                        return false;
                idx++;
        }

        return true;
}

/*
 * Return true when @addr matches eth_ipv4_mcast_addr_base in the first
 * three octets and in the top bit of the fourth octet.
 */
static inline bool ether_addr_is_ipv4_mcast(const u8 *addr)
{
        const u8 prefix_mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 };

        return ether_addr_equal_masked(addr, eth_ipv4_mcast_addr_base,
                                       prefix_mask);
}

/*
 * Return true when the first two octets of @addr match those of
 * eth_ipv6_mcast_addr_base; the remaining four octets are ignored.
 */
static inline bool ether_addr_is_ipv6_mcast(const u8 *addr)
{
        const u8 prefix_mask[ETH_ALEN] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 };

        return ether_addr_equal_masked(addr, eth_ipv6_mcast_addr_base,
                                       prefix_mask);
}

/*
 * Return true when @addr passes either the IPv4 or the IPv6 multicast
 * prefix test; the IPv4 test is tried first.
 */
static inline bool ether_addr_is_ip_mcast(const u8 *addr)
{
        if (ether_addr_is_ipv4_mcast(addr))
                return true;

        return ether_addr_is_ipv6_mcast(addr);
}

/**
 * ether_addr_to_u64 - Convert an Ethernet address into a u64 value.
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * The first octet ends up in the most significant of the six used bytes,
 * the last octet in the least significant one.
 *
 * Return a u64 value of the address
 */
static inline u64 ether_addr_to_u64(const u8 *addr)
{
        const u8 *end = addr + ETH_ALEN;
        u64 value = 0;

        while (addr != end)
                value = value << 8 | *addr++;

        return value;
}

/**
 * u64_to_ether_addr - Convert a u64 to an Ethernet address.
 * @u: u64 to convert to an Ethernet MAC address
 * @addr: Pointer to a six-byte array to contain the Ethernet address
 *
 * Stores the low 48 bits of @u, least significant byte in the last octet.
 * Any higher bits of @u are dropped.
 */
static inline void u64_to_ether_addr(u64 u, u8 *addr)
{
        u8 *octet = addr + ETH_ALEN;

        /* fill from the last octet backwards, one byte of @u at a time */
        while (octet != addr) {
                *--octet = u & 0xff;
                u >>= 8;
        }
}

/**
 * eth_addr_dec - Decrement the given MAC address
 *
 * @addr: Pointer to a six-byte array containing Ethernet address to decrement
 *
 * The address is treated as one 48-bit number (via ether_addr_to_u64()),
 * reduced by one and written back in place.
 */
static inline void eth_addr_dec(u8 *addr)
{
        u64_to_ether_addr(ether_addr_to_u64(addr) - 1, addr);
}

/**
 * eth_addr_inc() - Increment the given MAC address.
 * @addr: Pointer to a six-byte array containing Ethernet address to increment.
 *
 * The address is treated as one 48-bit number (via ether_addr_to_u64()),
 * raised by one and written back in place.
 */
static inline void eth_addr_inc(u8 *addr)
{
        u64_to_ether_addr(ether_addr_to_u64(addr) + 1, addr);
}

/**
 * eth_addr_add() - Add (or subtract) an offset to/from the given MAC address.
 *
 * @addr: Pointer to a six-byte array containing Ethernet address to adjust.
 * @offset: Offset to add; a negative value subtracts.
 *
 * The address is converted with ether_addr_to_u64(), @offset is added to
 * it, and the result is written back in place.
 */
static inline void eth_addr_add(u8 *addr, long offset)
{
        u64_to_ether_addr(ether_addr_to_u64(addr) + offset, addr);
}

/**
 * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
 * @dev: Pointer to a device structure
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Walks the device's address list under rcu_read_lock() and compares each
 * entry with @addr.  Return true if the address is one of the device
 * addresses.
 *
 * Note that this function calls ether_addr_equal_64bits() so take care of
 * the right padding.
 */
static inline bool is_etherdev_addr(const struct net_device *dev,
                                    const u8 addr[6 + 2])
{
        struct netdev_hw_addr *ha;
        bool found = false;

        rcu_read_lock();
        for_each_dev_addr(dev, ha) {
                if (ether_addr_equal_64bits(addr, ha->addr)) {
                        found = true;
                        break;
                }
        }
        rcu_read_unlock();

        return found;
}
#endif        /* __KERNEL__ */

/**
 * compare_ether_header - Compare two Ethernet headers
 * @a: Pointer to Ethernet header
 * @b: Pointer to Ethernet header
 *
 * Compare two Ethernet headers, returns 0 if equal.
 * This assumes that the network header (i.e., IP header) is 4-byte
 * aligned OR the platform can handle unaligned access.  This is the
 * case for all packets coming into netif_receive_skb or similar
 * entry points.
 */

static inline unsigned long compare_ether_header(const void *a, const void *b)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        unsigned long fold;

        /*
         * We want to compare 14 bytes:
         *  [a0 ... a13] ^ [b0 ... b13]
         * Use two long XOR, ORed together, with an overlap of two bytes.
         *  [a0  a1  a2  a3  a4  a5  a6  a7 ] ^ [b0  b1  b2  b3  b4  b5  b6  b7 ] |
         *  [a6  a7  a8  a9  a10 a11 a12 a13] ^ [b6  b7  b8  b9  b10 b11 b12 b13]
         * This means the [a6 a7] ^ [b6 b7] part is done two times.
        */
        fold = *(unsigned long *)a ^ *(unsigned long *)b;
        fold |= *(unsigned long *)(a + 6) ^ *(unsigned long *)(b + 6);
        return fold;
#else
        u32 *a32 = (u32 *)((u8 *)a + 2);
        u32 *b32 = (u32 *)((u8 *)b + 2);

        return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) |
               (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
#endif
}

/**
 * eth_hw_addr_gen - Generate and assign Ethernet address to a port
 * @dev: pointer to port's net_device structure
 * @base_addr: base Ethernet address
 * @id: offset to add to the base address
 *
 * Computes @base_addr + @id as a 48-bit number and assigns the result to
 * @dev with eth_hw_addr_set().  Commonly used by switch drivers which need
 * to compute addresses for all their ports. addr_assign_type is not changed.
 */
static inline void eth_hw_addr_gen(struct net_device *dev, const u8 *base_addr,
                                   unsigned int id)
{
        u8 port_addr[ETH_ALEN];

        u64_to_ether_addr(ether_addr_to_u64(base_addr) + id, port_addr);
        eth_hw_addr_set(dev, port_addr);
}

/**
 * eth_skb_pkt_type - Assign packet type if destination address does not match
 * @skb: Assigned a packet type if address does not match @dev address
 * @dev: Network device used to compare packet address against
 *
 * When the destination MAC address equals the device address, @skb is
 * left untouched.  Otherwise pkt_type becomes PACKET_BROADCAST (destination
 * equals @dev->broadcast), PACKET_MULTICAST (any other multicast
 * destination) or PACKET_OTHERHOST.
 */
static inline void eth_skb_pkt_type(struct sk_buff *skb,
                                    const struct net_device *dev)
{
        const struct ethhdr *eth = eth_hdr(skb);

        /* common case: the frame is addressed to this device */
        if (likely(ether_addr_equal_64bits(eth->h_dest, dev->dev_addr)))
                return;

        if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest)))
                skb->pkt_type = ether_addr_equal_64bits(eth->h_dest,
                                                        dev->broadcast) ?
                                PACKET_BROADCAST : PACKET_MULTICAST;
        else
                skb->pkt_type = PACKET_OTHERHOST;
}

/*
 * Return the Ethernet header found at skb->data and pull ETH_HLEN bytes
 * off the front of @skb with skb_pull_inline().  The returned pointer is
 * the value skb->data had before the pull.
 */
static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb)
{
        struct ethhdr *mac_hdr;

        mac_hdr = (struct ethhdr *)skb->data;
        skb_pull_inline(skb, ETH_HLEN);

        return mac_hdr;
}

/**
 * eth_skb_pad - Pad buffer to minimum number of octets for Ethernet frame
 * @skb: Buffer to pad
 *
 * An Ethernet frame should have a minimum size of 60 bytes.  This function
 * takes short frames and pads them with zeros up to the 60 byte limit.
 *
 * Return: whatever skb_put_padto(skb, ETH_ZLEN) returns.
 */
static inline int eth_skb_pad(struct sk_buff *skb)
{
        return skb_put_padto(skb, ETH_ZLEN);
}

#endif        /* _LINUX_ETHERDEVICE_H */


















    1 





    1 

















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/export.h>
#include <linux/limits.h>
#include <linux/math.h>
#include <linux/minmax.h>
#include <linux/types.h>

#include <linux/reciprocal_div.h>

/*
 * For a description of the algorithm please have a look at
 * include/linux/reciprocal_div.h
 */

/*
 * Precompute the multiplier and the two shift counts that stand in for a
 * division by @d.  The algorithm itself is described in
 * include/linux/reciprocal_div.h.
 */
struct reciprocal_value reciprocal_value(u32 d)
{
        struct reciprocal_value R;
        int log2_ceil = fls(d - 1);
        /* (2^l - d) * 2^32, then divided by d and bumped by one below */
        u64 magic = ((1ULL << log2_ceil) - d) << 32;

        do_div(magic, d);
        magic++;

        R.m = (u32)magic;
        R.sh1 = min(log2_ceil, 1);
        R.sh2 = max(log2_ceil - 1, 0);

        return R;
}
EXPORT_SYMBOL(reciprocal_value);

/*
 * Variant of reciprocal_value() that takes a precision @prec and reports
 * the multiplier, the post-shift, ceil(log2(d)) and whether the
 * multiplier needs more than 32 bits.  See include/linux/reciprocal_div.h.
 */
struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec)
{
        struct reciprocal_value_adv R;
        u64 mlow, mhigh;
        u32 log2_ceil, shift;

        /* ceil(log2(d)) */
        log2_ceil = fls(d - 1);
        /* NOTE: mlow/mhigh could overflow u64 when log2_ceil == 32. Callers
         * have to deal with that case before calling "reciprocal_value_adv",
         * please see the comment at include/linux/reciprocal_div.h.
         */
        WARN(log2_ceil == 32,
             "ceil(log2(0x%08x)) == 32, %s doesn't support such divisor",
             d, __func__);

        mlow = 1ULL << (32 + log2_ceil);
        do_div(mlow, d);
        mhigh = (1ULL << (32 + log2_ceil)) + (1ULL << (32 + log2_ceil - prec));
        do_div(mhigh, d);

        /* halve both bounds while the halves still differ */
        shift = log2_ceil;
        while (shift > 0) {
                const u64 lo = mlow >> 1;
                const u64 hi = mhigh >> 1;

                if (lo >= hi)
                        break;

                mlow = lo;
                mhigh = hi;
                shift--;
        }

        R.m = (u32)mhigh;
        R.sh = shift;
        R.exp = log2_ceil;
        R.is_wide_m = mhigh > U32_MAX;

        return R;
}
EXPORT_SYMBOL(reciprocal_value_adv);













    2 












































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM fib6

#if !defined(_TRACE_FIB6_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FIB6_H

#include <linux/in6.h>
#include <net/flow.h>
#include <net/ip6_fib.h>
#include <linux/tracepoint.h>

/*
 * fib6_table_lookup - trace the result of an IPv6 FIB table lookup.
 *
 * Records the table id, the flow key fields read from @flp (oif, iif,
 * traffic class, scope, flags, addresses, protocol and, for TCP/UDP,
 * ports), plus the error derived from the result's route type, the
 * nexthop device name and the gateway taken from @res.
 */
TRACE_EVENT(fib6_table_lookup,

        TP_PROTO(const struct net *net, const struct fib6_result *res,
                 struct fib6_table *table, const struct flowi6 *flp),

        TP_ARGS(net, res, table, flp),

        TP_STRUCT__entry(
                __field(        u32,        tb_id                )
                __field(        int,        err                )
                __field(        int,        oif                )
                __field(        int,        iif                )
                __field(        __u8,        tos                )
                __field(        __u8,        scope                )
                __field(        __u8,        flags                )
                __array(        __u8,        src,        16        )
                __array(        __u8,        dst,        16        )
                __field(        u16,        sport                )
                __field(        u16,        dport                )
                __field(        u8,        proto                )
                __field(        u8,        rt_type                )
                __array(                char,        name,        IFNAMSIZ )
                __array(                __u8,        gw,        16         )
        ),

        TP_fast_assign(
                struct in6_addr *in6;

                __entry->tb_id = table->tb6_id;
                __entry->err = ip6_rt_type_to_error(res->fib6_type);
                __entry->oif = flp->flowi6_oif;
                __entry->iif = flp->flowi6_iif;
                __entry->tos = ip6_tclass(flp->flowlabel);
                __entry->scope = flp->flowi6_scope;
                __entry->flags = flp->flowi6_flags;

                in6 = (struct in6_addr *)__entry->src;
                *in6 = flp->saddr;

                in6 = (struct in6_addr *)__entry->dst;
                *in6 = flp->daddr;

                /* ports are only recorded for TCP and UDP flows */
                __entry->proto = flp->flowi6_proto;
                if (__entry->proto == IPPROTO_TCP ||
                    __entry->proto == IPPROTO_UDP) {
                        __entry->sport = ntohs(flp->fl6_sport);
                        __entry->dport = ntohs(flp->fl6_dport);
                } else {
                        __entry->sport = 0;
                        __entry->dport = 0;
                }

                /* "-" stands in when there is no nexthop device */
                if (res->nh && res->nh->fib_nh_dev) {
                        strscpy(__entry->name, res->nh->fib_nh_dev->name, IFNAMSIZ);
                } else {
                        strcpy(__entry->name, "-");
                }

                /*
                 * Always write gw: the entry is not pre-zeroed here, so a
                 * path that skipped the store would make TP_printk format
                 * whatever bytes were already in the slot.  The nexthop
                 * gateway is used only for a real (non-null) entry that
                 * has a nexthop; every other case records the any-address.
                 */
                in6 = (struct in6_addr *)__entry->gw;
                if (res->nh && res->f6i != net->ipv6.fib6_null_entry)
                        *in6 = res->nh->fib_nh_gw6;
                else
                        *in6 = in6addr_any;
        ),

        TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u tos %d scope %d flags %x ==> dev %s gw %pI6c err %d",
                  __entry->tb_id, __entry->oif, __entry->iif, __entry->proto,
                  __entry->src, __entry->sport, __entry->dst, __entry->dport,
                  __entry->tos, __entry->scope, __entry->flags,
                  __entry->name, __entry->gw, __entry->err)
);

#endif /* _TRACE_FIB6_H */

/* This part must be outside protection */
#include <trace/define_trace.h>






















































































































































































































































































































































































































    1 







    1 


































    1 














    1 

































































































    1 







    1 

    1 


















































































    1 








    1 













































    1 



    1 





























    1 



















































































    1 

    1 









    1 













    1 
    1 

    1 



















    1 



    1 





    1 




































































































































































































































    1 

    1 










    1 



































































































































    1 











    1 












    1 




























































    1 



























































































    1 





















    1 


























































































































































    1 




















    1 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <crypto/sha2.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"

#include <trace/events/mptcp.h>

/* Return true when @flags, once masked with MPTCP_CAP_FLAG_MASK, equals
 * MPTCP_CAP_HMAC_SHA256, i.e. no other bit covered by the mask is set.
 */
static bool mptcp_cap_flag_sha256(u8 flags)
{
        unsigned int masked = flags & MPTCP_CAP_FLAG_MASK;

        if (masked != MPTCP_CAP_HMAC_SHA256)
                return false;

        return true;
}

static void mptcp_parse_option(const struct sk_buff *skb,
                               const unsigned char *ptr, int opsize,
                               struct mptcp_options_received *mp_opt)
{
        u8 subtype = *ptr >> 4;
        int expected_opsize;
        u16 subopt;
        u8 version;
        u8 flags;
        u8 i;

        switch (subtype) {
        case MPTCPOPT_MP_CAPABLE:
                /* strict size checking */
                if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
                        if (skb->len > tcp_hdr(skb)->doff << 2)
                                expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA;
                        else
                                expected_opsize = TCPOLEN_MPTCP_MPC_ACK;
                        subopt = OPTION_MPTCP_MPC_ACK;
                } else {
                        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) {
                                expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK;
                                subopt = OPTION_MPTCP_MPC_SYNACK;
                        } else {
                                expected_opsize = TCPOLEN_MPTCP_MPC_SYN;
                                subopt = OPTION_MPTCP_MPC_SYN;
                        }
                }

                /* Cfr RFC 8684 Section 3.3.0:
                 * If a checksum is present but its use had
                 * not been negotiated in the MP_CAPABLE handshake, the receiver MUST
                 * close the subflow with a RST, as it is not behaving as negotiated.
                 * If a checksum is not present when its use has been negotiated, the
                 * receiver MUST close the subflow with a RST, as it is considered
                 * broken
                 * We parse even option with mismatching csum presence, so that
                 * later in subflow_data_ready we can trigger the reset.
                 */
                if (opsize != expected_opsize &&
                    (expected_opsize != TCPOLEN_MPTCP_MPC_ACK_DATA ||
                     opsize != TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM))
                        break;

                /* try to be gentle vs future versions on the initial syn */
                version = *ptr++ & MPTCP_VERSION_MASK;
                if (opsize != TCPOLEN_MPTCP_MPC_SYN) {
                        if (version != MPTCP_SUPPORTED_VERSION)
                                break;
                } else if (version < MPTCP_SUPPORTED_VERSION) {
                        break;
                }

                flags = *ptr++;
                if (!mptcp_cap_flag_sha256(flags) ||
                    (flags & MPTCP_CAP_EXTENSIBILITY))
                        break;

                /* RFC 6824, Section 3.1:
                 * "For the Checksum Required bit (labeled "A"), if either
                 * host requires the use of checksums, checksums MUST be used.
                 * In other words, the only way for checksums not to be used
                 * is if both hosts in their SYNs set A=0."
                 */
                if (flags & MPTCP_CAP_CHECKSUM_REQD)
                        mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD;

                mp_opt->deny_join_id0 = !!(flags & MPTCP_CAP_DENY_JOIN_ID0);

                mp_opt->suboptions |= subopt;
                if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) {
                        mp_opt->sndr_key = get_unaligned_be64(ptr);
                        ptr += 8;
                }
                if (opsize >= TCPOLEN_MPTCP_MPC_ACK) {
                        mp_opt->rcvr_key = get_unaligned_be64(ptr);
                        ptr += 8;
                }
                if (opsize >= TCPOLEN_MPTCP_MPC_ACK_DATA) {
                        /* Section 3.1.:
                         * "the data parameters in a MP_CAPABLE are semantically
                         * equivalent to those in a DSS option and can be used
                         * interchangeably."
                         */
                        mp_opt->suboptions |= OPTION_MPTCP_DSS;
                        mp_opt->use_map = 1;
                        mp_opt->mpc_map = 1;
                        mp_opt->use_ack = 0;
                        mp_opt->data_len = get_unaligned_be16(ptr);
                        ptr += 2;
                }
                if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) {
                        mp_opt->csum = get_unaligned((__force __sum16 *)ptr);
                        mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD;
                        ptr += 2;
                }
                pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u",
                         version, flags, opsize, mp_opt->sndr_key,
                         mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum);
                break;

        case MPTCPOPT_MP_JOIN:
                if (opsize == TCPOLEN_MPTCP_MPJ_SYN) {
                        mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYN;
                        mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
                        mp_opt->join_id = *ptr++;
                        mp_opt->token = get_unaligned_be32(ptr);
                        ptr += 4;
                        mp_opt->nonce = get_unaligned_be32(ptr);
                        ptr += 4;
                        pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u",
                                 mp_opt->backup, mp_opt->join_id,
                                 mp_opt->token, mp_opt->nonce);
                } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) {
                        mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYNACK;
                        mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
                        mp_opt->join_id = *ptr++;
                        mp_opt->thmac = get_unaligned_be64(ptr);
                        ptr += 8;
                        mp_opt->nonce = get_unaligned_be32(ptr);
                        ptr += 4;
                        pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u",
                                 mp_opt->backup, mp_opt->join_id,
                                 mp_opt->thmac, mp_opt->nonce);
                } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) {
                        mp_opt->suboptions |= OPTION_MPTCP_MPJ_ACK;
                        ptr += 2;
                        memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN);
                        pr_debug("MP_JOIN hmac");
                }
                break;

        case MPTCPOPT_DSS:
                pr_debug("DSS");
                ptr++;

                /* we must clear 'mpc_map' be able to detect MP_CAPABLE
                 * map vs DSS map in mptcp_incoming_options(), and reconstruct
                 * map info accordingly
                 */
                mp_opt->mpc_map = 0;
                flags = (*ptr++) & MPTCP_DSS_FLAG_MASK;
                mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0;
                mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0;
                mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0;
                mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0;
                mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK);

                pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d",
                         mp_opt->data_fin, mp_opt->dsn64,
                         mp_opt->use_map, mp_opt->ack64,
                         mp_opt->use_ack);

                expected_opsize = TCPOLEN_MPTCP_DSS_BASE;

                if (mp_opt->use_ack) {
                        if (mp_opt->ack64)
                                expected_opsize += TCPOLEN_MPTCP_DSS_ACK64;
                        else
                                expected_opsize += TCPOLEN_MPTCP_DSS_ACK32;
                }

                if (mp_opt->use_map) {
                        if (mp_opt->dsn64)
                                expected_opsize += TCPOLEN_MPTCP_DSS_MAP64;
                        else
                                expected_opsize += TCPOLEN_MPTCP_DSS_MAP32;
                }

                /* Always parse any csum presence combination, we will enforce
                 * RFC 8684 Section 3.3.0 checks later in subflow_data_ready
                 */
                if (opsize != expected_opsize &&
                    opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM)
                        break;

                mp_opt->suboptions |= OPTION_MPTCP_DSS;
                if (mp_opt->use_ack) {
                        if (mp_opt->ack64) {
                                mp_opt->data_ack = get_unaligned_be64(ptr);
                                ptr += 8;
                        } else {
                                mp_opt->data_ack = get_unaligned_be32(ptr);
                                ptr += 4;
                        }

                        pr_debug("data_ack=%llu", mp_opt->data_ack);
                }

                if (mp_opt->use_map) {
                        if (mp_opt->dsn64) {
                                mp_opt->data_seq = get_unaligned_be64(ptr);
                                ptr += 8;
                        } else {
                                mp_opt->data_seq = get_unaligned_be32(ptr);
                                ptr += 4;
                        }

                        mp_opt->subflow_seq = get_unaligned_be32(ptr);
                        ptr += 4;

                        mp_opt->data_len = get_unaligned_be16(ptr);
                        ptr += 2;

                        if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) {
                                mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD;
                                mp_opt->csum = get_unaligned((__force __sum16 *)ptr);
                                ptr += 2;
                        }

                        pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u",
                                 mp_opt->data_seq, mp_opt->subflow_seq,
                                 mp_opt->data_len, !!(mp_opt->suboptions & OPTION_MPTCP_CSUMREQD),
                                 mp_opt->csum);
                }

                break;

        case MPTCPOPT_ADD_ADDR:
                mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO;
                if (!mp_opt->echo) {
                        if (opsize == TCPOLEN_MPTCP_ADD_ADDR ||
                            opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT)
                                mp_opt->addr.family = AF_INET;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
                        else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 ||
                                 opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT)
                                mp_opt->addr.family = AF_INET6;
#endif
                        else
                                break;
                } else {
                        if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE ||
                            opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT)
                                mp_opt->addr.family = AF_INET;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
                        else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE ||
                                 opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT)
                                mp_opt->addr.family = AF_INET6;
#endif
                        else
                                break;
                }

                mp_opt->suboptions |= OPTION_MPTCP_ADD_ADDR;
                mp_opt->addr.id = *ptr++;
                mp_opt->addr.port = 0;
                mp_opt->ahmac = 0;
                if (mp_opt->addr.family == AF_INET) {
                        memcpy((u8 *)&mp_opt->addr.addr.s_addr, (u8 *)ptr, 4);
                        ptr += 4;
                        if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT ||
                            opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) {
                                mp_opt->addr.port = htons(get_unaligned_be16(ptr));
                                ptr += 2;
                        }
                }
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
                else {
                        memcpy(mp_opt->addr.addr6.s6_addr, (u8 *)ptr, 16);
                        ptr += 16;
                        if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT ||
                            opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) {
                                mp_opt->addr.port = htons(get_unaligned_be16(ptr));
                                ptr += 2;
                        }
                }
#endif
                if (!mp_opt->echo) {
                        mp_opt->ahmac = get_unaligned_be64(ptr);
                        ptr += 8;
                }
                pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d",
                         (mp_opt->addr.family == AF_INET6) ? "6" : "",
                         mp_opt->addr.id, mp_opt->ahmac, mp_opt->echo, ntohs(mp_opt->addr.port));
                break;

        case MPTCPOPT_RM_ADDR:
                if (opsize < TCPOLEN_MPTCP_RM_ADDR_BASE + 1 ||
                    opsize > TCPOLEN_MPTCP_RM_ADDR_BASE + MPTCP_RM_IDS_MAX)
                        break;

                ptr++;

                mp_opt->suboptions |= OPTION_MPTCP_RM_ADDR;
                mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE;
                for (i = 0; i < mp_opt->rm_list.nr; i++)
                        mp_opt->rm_list.ids[i] = *ptr++;
                pr_debug("RM_ADDR: rm_list_nr=%d", mp_opt->rm_list.nr);
                break;

        case MPTCPOPT_MP_PRIO:
                if (opsize != TCPOLEN_MPTCP_PRIO)
                        break;

                mp_opt->suboptions |= OPTION_MPTCP_PRIO;
                mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP;
                pr_debug("MP_PRIO: prio=%d", mp_opt->backup);
                break;

        case MPTCPOPT_MP_FASTCLOSE:
                if (opsize != TCPOLEN_MPTCP_FASTCLOSE)
                        break;

                ptr += 2;
                mp_opt->rcvr_key = get_unaligned_be64(ptr);
                ptr += 8;
                mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE;
                pr_debug("MP_FASTCLOSE: recv_key=%llu", mp_opt->rcvr_key);
                break;

        case MPTCPOPT_RST:
                if (opsize != TCPOLEN_MPTCP_RST)
                        break;

                if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST))
                        break;

                mp_opt->suboptions |= OPTION_MPTCP_RST;
                flags = *ptr++;
                mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT;
                mp_opt->reset_reason = *ptr;
                pr_debug("MP_RST: transient=%u reason=%u",
                         mp_opt->reset_transient, mp_opt->reset_reason);
                break;

        case MPTCPOPT_MP_FAIL:
                if (opsize != TCPOLEN_MPTCP_FAIL)
                        break;

                ptr += 2;
                mp_opt->suboptions |= OPTION_MPTCP_FAIL;
                mp_opt->fail_seq = get_unaligned_be64(ptr);
                pr_debug("MP_FAIL: data_seq=%llu", mp_opt->fail_seq);
                break;

        default:
                break;
        }
}

/* Scan the TCP option space of @skb and hand every MPTCP option found to
 * mptcp_parse_option().
 *
 * @skb:    packet whose TCP header options are scanned
 * @mp_opt: parse result; ->suboptions is reset to 0 before scanning, the
 *          other fields are written only by the options actually found
 *
 * Scanning stops at the end of the option space, at an end-of-list option
 * or at the first option whose length byte is below 2 or exceeds the
 * remaining space.
 */
void mptcp_get_options(const struct sk_buff *skb,
                       struct mptcp_options_received *mp_opt)
{
        const struct tcphdr *th = tcp_hdr(skb);
        const unsigned char *ptr;
        int length;

        /* initialize option status */
        mp_opt->suboptions = 0;

        length = (th->doff * 4) - sizeof(struct tcphdr);
        ptr = (const unsigned char *)(th + 1);

        while (length > 0) {
                int opcode = *ptr++;
                int opsize;

                switch (opcode) {
                case TCPOPT_EOL:
                        return;
                case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
                        length--;
                        continue;
                default:
                        if (length < 2)
                                return;
                        opsize = *ptr++;
                        if (opsize < 2) /* "silly options" */
                                return;
                        if (opsize > length)
                                return;        /* don't parse partial options */
                        /* The parser reads the subtype from the byte that
                         * follows kind and length: an option made only of
                         * those two bytes has no subtype, and parsing it
                         * would read past the option itself.
                         */
                        if (opcode == TCPOPT_MPTCP && opsize > 2)
                                mptcp_parse_option(skb, ptr, opsize, mp_opt);
                        ptr += opsize - 2;
                        length -= opsize;
                }
        }
}

/* Select the MPTCP option to be carried by an outgoing SYN.
 *
 * @sk:   subflow socket sending the SYN
 * @skb:  the SYN packet; its end_seq is recorded in subflow->snd_isn
 * @size: set to the option length when an option is selected
 * @opts: filled with the MP_CAPABLE or MP_JOIN SYN parameters
 *
 * Return: true when an MP_CAPABLE (subflow->request_mptcp) or an MP_JOIN
 * (subflow->request_join) option has been set up in @opts, false otherwise;
 * @size and @opts are left untouched in the latter case.
 */
bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
                       unsigned int *size, struct mptcp_out_options *opts)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

        /* we will use snd_isn to detect first pkt [re]transmission
         * in mptcp_established_options_mp()
         */
        subflow->snd_isn = TCP_SKB_CB(skb)->end_seq;
        if (subflow->request_mptcp) {
                opts->suboptions = OPTION_MPTCP_MPC_SYN;
                opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk));
                opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk));
                *size = TCPOLEN_MPTCP_MPC_SYN;
                return true;
        } else if (subflow->request_join) {
                pr_debug("remote_token=%u, nonce=%u\n", subflow->remote_token,
                         subflow->local_nonce);
                opts->suboptions = OPTION_MPTCP_MPJ_SYN;
                opts->join_id = subflow->local_id;
                opts->token = subflow->remote_token;
                opts->nonce = subflow->local_nonce;
                opts->backup = subflow->request_bkup;
                *size = TCPOLEN_MPTCP_MPJ_SYN;
                return true;
        }
        return false;
}

/* Stop the delayed-ACK timer of @sk and reset the connection-socket ack
 * state tied to it: the timeout and ato values are zeroed and the
 * ICSK_ACK_SCHED / ICSK_ACK_TIMER bits are dropped from the pending mask.
 */
static void clear_3rdack_retransmission(struct sock *sk)
{
        const int stale_bits = ICSK_ACK_SCHED | ICSK_ACK_TIMER;
        struct inet_connection_sock *conn = inet_csk(sk);

        sk_stop_timer(sk, &conn->icsk_delack_timer);

        conn->icsk_ack.timeout = 0;
        conn->icsk_ack.ato = 0;
        conn->icsk_ack.pending &= ~stale_bits;
}

/* Set up the MP_CAPABLE or MP_JOIN option carried by the third ACK of the
 * handshake.
 *
 * @sk:                  subflow socket
 * @skb:                 outgoing packet, NULL when the caller only needs a
 *                       size estimate
 * @snd_data_fin_enable: a DATA_FIN has to be sent; it takes precedence
 * @size:                set to the option length when an option is selected
 * @opts:                filled with the option parameters
 *
 * Return: true when @opts and @size have been filled, false when no MPC/MPJ
 * option applies to this packet.
 */
static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
                                         bool snd_data_fin_enable,
                                         unsigned int *size,
                                         struct mptcp_out_options *opts)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);
        struct mptcp_ext *mpext;
        unsigned int data_len;
        u8 len;

        /* When skb is not available, we better over-estimate the emitted
         * options len. A full DSS option (28 bytes) is longer than
         * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so
         * tell the caller to defer the estimate to
         * mptcp_established_options_dss(), which will reserve enough space.
         */
        if (!skb)
                return false;

        /* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */
        if (subflow->fully_established || snd_data_fin_enable ||
            subflow->snd_isn != TCP_SKB_CB(skb)->seq ||
            sk->sk_state != TCP_ESTABLISHED)
                return false;

        if (subflow->mp_capable) {
                /* a missing extension is handled as "no data mapped" */
                mpext = mptcp_get_ext(skb);
                data_len = mpext ? mpext->data_len : 0;

                /* we will check ops->data_len in mptcp_write_options() to
                 * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and
                 * TCPOLEN_MPTCP_MPC_ACK
                 */
                opts->data_len = data_len;
                opts->suboptions = OPTION_MPTCP_MPC_ACK;
                opts->sndr_key = subflow->local_key;
                opts->rcvr_key = subflow->remote_key;
                opts->csum_reqd = READ_ONCE(msk->csum_enabled);
                opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk));

                /* Section 3.1.
                 * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK
                 * packets that start the first subflow of an MPTCP connection,
                 * as well as the first packet that carries data
                 */
                if (data_len > 0) {
                        /* data_len > 0 implies mpext is not NULL */
                        len = TCPOLEN_MPTCP_MPC_ACK_DATA;
                        if (opts->csum_reqd) {
                                /* we need to propagate more info to csum the pseudo hdr */
                                opts->data_seq = mpext->data_seq;
                                opts->subflow_seq = mpext->subflow_seq;
                                opts->csum = mpext->csum;
                                len += TCPOLEN_MPTCP_DSS_CHECKSUM;
                        }
                        *size = ALIGN(len, 4);
                } else {
                        *size = TCPOLEN_MPTCP_MPC_ACK;
                }

                pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d\n",
                         subflow, subflow->local_key, subflow->remote_key,
                         data_len);

                return true;
        } else if (subflow->mp_join) {
                opts->suboptions = OPTION_MPTCP_MPJ_ACK;
                memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
                *size = TCPOLEN_MPTCP_MPJ_ACK;
                pr_debug("subflow=%p\n", subflow);

                /* we can use the full delegate action helper only from BH context
                 * If we are in process context - sk is flushing the backlog at
                 * socket lock release time - just set the appropriate flag, will
                 * be handled by the release callback
                 */
                if (sock_owned_by_user(sk))
                        set_bit(MPTCP_DELEGATE_ACK, &subflow->delegated_status);
                else
                        mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_ACK);
                return true;
        }
        return false;
}

static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
                                 struct sk_buff *skb, struct mptcp_ext *ext)
{
        /* The write_seq value has already been incremented, so the actual
         * sequence number for the DATA_FIN is one less.
         */
        u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq) - 1;

        if (!ext->use_map || !skb->len) {
                /* RFC6824 requires a DSS mapping with specific values
                 * if DATA_FIN is set but no data payload is mapped
                 */
                ext->data_fin = 1;
                ext->use_map = 1;
                ext->dsn64 = 1;
                ext->data_seq = data_fin_tx_seq;
                ext->subflow_seq = 0;
                ext->data_len = 1;
        } else if (ext->data_seq + ext->data_len == data_fin_tx_seq) {
                /* If there's an existing DSS mapping and it is the
                 * final mapping, DATA_FIN consumes 1 additional byte of
                 * mapping space.
                 */
                ext->data_fin = 1;
                ext->data_len++;
        }
}

/* Set up the DSS option (data mapping and/or DATA_ACK) for an outgoing
 * packet.
 *
 * @sk:                  subflow socket
 * @skb:                 outgoing packet, NULL when only a size estimate is
 *                       needed (a full mapping is then accounted for)
 * @snd_data_fin_enable: a DATA_FIN must be added to the mapping
 * @size:                always set: option length rounded up to 4 bytes
 * @opts:                csum_reqd, ext_copy and suboptions are updated
 *
 * Return: true if a mapping and/or a DATA_ACK has been set up.
 */
static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
                                          bool snd_data_fin_enable,
                                          unsigned int *size,
                                          struct mptcp_out_options *opts)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);
        struct mptcp_ext *mpext = NULL;
        unsigned int opt_len = 0;
        bool map_emitted = false;
        unsigned int ack_len;
        u64 cur_ack;

        opts->csum_reqd = READ_ONCE(msk->csum_enabled);
        if (skb)
                mpext = mptcp_get_ext(skb);

        /* mapping part: size estimate, mapped data or DATA_FIN */
        if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) {
                opt_len = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;

                if (mpext) {
                        if (opts->csum_reqd)
                                opt_len += TCPOLEN_MPTCP_DSS_CHECKSUM;

                        opts->ext_copy = *mpext;
                }

                if (skb && snd_data_fin_enable)
                        mptcp_write_data_fin(subflow, skb, &opts->ext_copy);
                opts->suboptions = OPTION_MPTCP_DSS;
                map_emitted = true;
        }

        /* passive sockets msk will set the 'can_ack' after accept(), even
         * if the first subflow may have the already the remote key handy
         */
        opts->ext_copy.use_ack = 0;
        if (!READ_ONCE(msk->can_ack)) {
                *size = ALIGN(opt_len, 4);
                return map_emitted;
        }

        /* DATA_ACK part */
        cur_ack = READ_ONCE(msk->ack_seq);
        if (READ_ONCE(msk->use_64bit_ack)) {
                opts->ext_copy.data_ack = cur_ack;
                opts->ext_copy.ack64 = 1;
                ack_len = TCPOLEN_MPTCP_DSS_ACK64;
        } else {
                opts->ext_copy.data_ack32 = (uint32_t)cur_ack;
                opts->ext_copy.ack64 = 0;
                ack_len = TCPOLEN_MPTCP_DSS_ACK32;
        }
        opts->ext_copy.use_ack = 1;
        opts->suboptions = OPTION_MPTCP_DSS;
        WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));

        /* Add kind/length/subtype/flag overhead if mapping is not populated */
        if (!opt_len)
                opt_len = TCPOLEN_MPTCP_DSS_BASE;
        opt_len += ack_len;

        *size = ALIGN(opt_len, 4);
        return true;
}

/* Compute the truncated HMAC carried by an ADD_ADDR option.
 *
 * @key1: first key handed to mptcp_crypto_hmac_sha()
 * @key2: second key handed to mptcp_crypto_hmac_sha()
 * @addr: address being announced
 *
 * The message is the address id, the raw IPv4 (4 bytes) or IPv6 (16 bytes,
 * only with CONFIG_MPTCP_IPV6) address when the family matches, then the
 * port with its most significant byte first: at most 1 + 16 + 2 = 19 bytes.
 *
 * Return: the last 8 bytes of the digest, read as a big-endian value.
 */
static u64 add_addr_generate_hmac(u64 key1, u64 key2,
                                  struct mptcp_addr_info *addr)
{
        u8 digest[SHA256_DIGEST_SIZE];
        u16 host_port = ntohs(addr->port);
        u8 msg[19];
        u8 *cursor = msg;

        *cursor++ = addr->id;

        if (addr->family == AF_INET) {
                memcpy(cursor, &addr->addr.s_addr, 4);
                cursor += 4;
        }
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        else if (addr->family == AF_INET6) {
                memcpy(cursor, &addr->addr6.s6_addr, 16);
                cursor += 16;
        }
#endif

        *cursor++ = host_port >> 8;
        *cursor++ = host_port & 0xFF;

        /* only the bytes written so far are part of the message */
        mptcp_crypto_hmac_sha(key1, key2, msg, cursor - msg, digest);

        return get_unaligned_be64(&digest[SHA256_DIGEST_SIZE - sizeof(u64)]);
}

/* Set up an ADD_ADDR option when the path manager has an address to signal.
 *
 * @sk:        subflow socket
 * @skb:       outgoing packet, forwarded to mptcp_pm_add_addr_signal()
 * @size:      in: space already taken by the previously selected
 *             suboptions; out (on success): size delta to account for
 * @remaining: option space still available
 * @opts:      addr, ahmac and suboptions are updated
 *
 * Return: true if the ADD_ADDR option has been set up.
 */
static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb,
                                               unsigned int *size,
                                               unsigned int remaining,
                                               struct mptcp_out_options *opts)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);
        bool drop_other_suboptions = false;
        unsigned int opt_size = *size;
        bool echo;
        int len;

        /* add addr will strip the existing options, be sure to avoid breaking
         * MPC/MPJ handshakes
         */
        if (!mptcp_pm_should_add_signal(msk) ||
            (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) ||
            !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr,
                    &echo, &drop_other_suboptions))
                return false;

        /* the space of the dropped suboptions becomes available again */
        if (drop_other_suboptions)
                remaining += opt_size;
        len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port);
        if (remaining < len)
                return false;

        *size = len;
        if (drop_other_suboptions) {
                pr_debug("drop other suboptions\n");
                opts->suboptions = 0;

                /* note that e.g. DSS could have written into the memory
                 * aliased by ahmac, we must reset the field here
                 * to avoid appending the hmac even for ADD_ADDR echo
                 * options
                 */
                opts->ahmac = 0;
                *size -= opt_size;
        }
        opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
        if (!echo) {
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX);
                opts->ahmac = add_addr_generate_hmac(READ_ONCE(msk->local_key),
                                                     READ_ONCE(msk->remote_key),
                                                     &opts->addr);
        } else {
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX);
        }
        pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d\n",
                 opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port));

        return true;
}

/* Set up a RM_ADDR option when the path manager has address ids to remove.
 *
 * @sk:        subflow socket
 * @size:      set to the option length on success
 * @remaining: option space still available
 * @opts:      rm_list and suboptions are updated on success
 *
 * Return: true if the RM_ADDR option has been set up.
 */
static bool mptcp_established_options_rm_addr(struct sock *sk,
                                              unsigned int *size,
                                              unsigned int remaining,
                                              struct mptcp_out_options *opts)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);
        struct mptcp_rm_list rm_list;
        int i, len;

        /* rm_list is presumably filled by mptcp_pm_rm_addr_signal() when it
         * succeeds - it is consumed only on that path
         */
        if (!mptcp_pm_should_rm_signal(msk) ||
            !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list)))
                return false;

        len = mptcp_rm_addr_len(&rm_list);
        if (len < 0)
                return false;
        if (remaining < len)
                return false;

        *size = len;
        opts->suboptions |= OPTION_MPTCP_RM_ADDR;
        opts->rm_list = rm_list;

        for (i = 0; i < opts->rm_list.nr; i++)
                pr_debug("rm_list_ids[%d]=%d\n", i, opts->rm_list.ids[i]);
        MPTCP_ADD_STATS(sock_net(sk), MPTCP_MIB_RMADDRTX, opts->rm_list.nr);
        return true;
}

/* Set up a MP_PRIO option when the subflow has one pending.
 *
 * @sk:        subflow socket
 * @size:      set to TCPOLEN_MPTCP_PRIO_ALIGN on success
 * @remaining: option space still available
 * @opts:      backup and suboptions are updated on success
 *
 * Return: true if the MP_PRIO option has been set up.
 */
static bool mptcp_established_options_mp_prio(struct sock *sk,
                                              unsigned int *size,
                                              unsigned int remaining,
                                              struct mptcp_out_options *opts)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

        /* can't send MP_PRIO with MPC, as they share the same option space:
         * 'backup'. Also it makes no sense at all
         */
        if (!subflow->send_mp_prio || (opts->suboptions & OPTIONS_MPTCP_MPC))
                return false;

        /* account for the trailing 'nop' option */
        if (remaining < TCPOLEN_MPTCP_PRIO_ALIGN)
                return false;

        *size = TCPOLEN_MPTCP_PRIO_ALIGN;
        opts->suboptions |= OPTION_MPTCP_PRIO;
        opts->backup = subflow->request_bkup;

        pr_debug("prio=%d\n", opts->backup);

        return true;
}

/* Set up a MP_RST option from the reset information stored in the subflow.
 *
 * @sk:        subflow socket
 * @skb:       outgoing packet (not inspected here)
 * @size:      set to TCPOLEN_MPTCP_RST on success
 * @remaining: option space still available
 * @opts:      reset_transient, reset_reason and suboptions are updated
 *
 * Return: true if the option fits in @remaining and has been set up.
 */
static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb,
                                                   unsigned int *size,
                                                   unsigned int remaining,
                                                   struct mptcp_out_options *opts)
{
        const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        bool fits = remaining >= TCPOLEN_MPTCP_RST;

        if (fits) {
                *size = TCPOLEN_MPTCP_RST;
                opts->suboptions |= OPTION_MPTCP_RST;
                opts->reset_transient = subflow->reset_transient;
                opts->reset_reason = subflow->reset_reason;
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX);
        }

        return fits;
}

/* Try to add an MP_FASTCLOSE option to an outgoing packet.
 *
 * The option is emitted only when the subflow has send_fastclose set and
 * @remaining can hold it. On success @size is set to the option length,
 * opts->rcvr_key is loaded with the msk remote key and the MPFASTCLOSETX
 * MIB counter is incremented.
 *
 * Returns true if the option was added, false otherwise.
 */
static bool mptcp_established_options_fastclose(struct sock *sk,
                                                unsigned int *size,
                                                unsigned int remaining,
                                                struct mptcp_out_options *opts)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);

        if (likely(!subflow->send_fastclose))
                return false;

        if (remaining < TCPOLEN_MPTCP_FASTCLOSE)
                return false;

        *size = TCPOLEN_MPTCP_FASTCLOSE;
        opts->suboptions |= OPTION_MPTCP_FASTCLOSE;
        opts->rcvr_key = READ_ONCE(msk->remote_key);

        /* terminate the message, consistently with the other debug
         * messages in this file, so it is emitted as a complete line
         */
        pr_debug("FASTCLOSE key=%llu\n", opts->rcvr_key);
        MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX);
        return true;
}

/* Try to add an MP_FAIL option to an outgoing packet.
 *
 * The option is emitted only when the subflow has send_mp_fail set and
 * @remaining can hold it. On success @size is set to the option length,
 * opts->fail_seq is loaded from the subflow map_seq and the MPFAILTX MIB
 * counter is incremented.
 *
 * Returns true if the option was added, false otherwise.
 */
static bool mptcp_established_options_mp_fail(struct sock *sk,
                                              unsigned int *size,
                                              unsigned int remaining,
                                              struct mptcp_out_options *opts)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

        if (likely(!subflow->send_mp_fail))
                return false;

        if (remaining < TCPOLEN_MPTCP_FAIL)
                return false;

        *size = TCPOLEN_MPTCP_FAIL;
        opts->suboptions |= OPTION_MPTCP_FAIL;
        opts->fail_seq = subflow->map_seq;

        /* terminate the message, consistently with the other debug
         * messages in this file, so it is emitted as a complete line
         */
        pr_debug("MP_FAIL fail_seq=%llu\n", opts->fail_seq);
        MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX);

        return true;
}

/* Select the MPTCP options to attach to a packet sent on an established
 * subflow.
 *
 * @sk: the subflow socket
 * @skb: the packet being sent; NULL-checked before its TCP flags are read
 * @size: incremented by the total length of the options selected here
 * @remaining: TCP option space still available to MPTCP
 * @opts: reset and then filled with the selected suboptions and their data
 *
 * Returns false when the msk has fallen back and the packet does not carry
 * an infinite mapping, or when the MP_CAPABLE/DSS option unexpectedly
 * exceeds @remaining. Always returns true for RST packets. Otherwise
 * returns true only if at least one option has been selected.
 */
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
                               unsigned int *size, unsigned int remaining,
                               struct mptcp_out_options *opts)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);
        unsigned int opt_size = 0;
        bool snd_data_fin;
        bool ret = false;

        opts->suboptions = 0;

        if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb)))
                return false;

        if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) {
                /* at most one of MP_FASTCLOSE and MP_FAIL is added */
                if (mptcp_established_options_fastclose(sk, &opt_size, remaining, opts) ||
                    mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) {
                        *size += opt_size;
                        remaining -= opt_size;
                }
                /* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */
                if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) {
                        *size += opt_size;
                        remaining -= opt_size;
                }
                return true;
        }

        snd_data_fin = mptcp_data_fin_enabled(msk);
        if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, opts))
                ret = true;
        else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, opts)) {
                unsigned int mp_fail_size;

                ret = true;
                /* MP_FAIL may follow the DSS; no other option is added then */
                if (mptcp_established_options_mp_fail(sk, &mp_fail_size,
                                                      remaining - opt_size, opts)) {
                        *size += opt_size + mp_fail_size;
                        /* both the DSS and the MP_FAIL space are consumed */
                        remaining -= opt_size + mp_fail_size;
                        return true;
                }
        }

        /* we reserved enough space for the above options, and exceeding the
         * TCP option space would be fatal
         */
        if (WARN_ON_ONCE(opt_size > remaining))
                return false;

        *size += opt_size;
        remaining -= opt_size;

        /* ADD_ADDR and RM_ADDR are alternatives: at most one is added */
        if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) {
                *size += opt_size;
                remaining -= opt_size;
                ret = true;
        } else if (mptcp_established_options_rm_addr(sk, &opt_size, remaining, opts)) {
                *size += opt_size;
                remaining -= opt_size;
                ret = true;
        }

        if (mptcp_established_options_mp_prio(sk, &opt_size, remaining, opts)) {
                *size += opt_size;
                remaining -= opt_size;
                ret = true;
        }

        return ret;
}

/* Fill the MPTCP option for a SYN-ACK sent in reply to @req.
 *
 * For an MP_CAPABLE request the local key, the checksum-required flag and
 * the allow_join_id0 flag are copied into @opts; for an MP_JOIN request the
 * backup flag, local address id, truncated HMAC and local nonce are copied.
 * In both cases @size is set (not incremented) to the option length.
 *
 * Returns true if an option was prepared, false if the request is neither
 * MP_CAPABLE nor MP_JOIN.
 */
bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
                          struct mptcp_out_options *opts)
{
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

        if (subflow_req->mp_capable) {
                opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
                opts->sndr_key = subflow_req->local_key;
                opts->csum_reqd = subflow_req->csum_reqd;
                opts->allow_join_id0 = subflow_req->allow_join_id0;
                *size = TCPOLEN_MPTCP_MPC_SYNACK;
                /* debug messages are newline-terminated, consistently with
                 * the other debug messages in this file
                 */
                pr_debug("subflow_req=%p, local_key=%llu\n",
                         subflow_req, subflow_req->local_key);
                return true;
        } else if (subflow_req->mp_join) {
                opts->suboptions = OPTION_MPTCP_MPJ_SYNACK;
                opts->backup = subflow_req->backup;
                opts->join_id = subflow_req->local_id;
                opts->thmac = subflow_req->thmac;
                opts->nonce = subflow_req->local_nonce;
                pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u\n",
                         subflow_req, opts->backup, opts->join_id,
                         opts->thmac, opts->nonce);
                *size = TCPOLEN_MPTCP_MPJ_SYNACK;
                return true;
        }
        return false;
}

/* Check whether the incoming packet moves the subflow to the fully
 * established status and, if needed, notify the path manager.
 *
 * Returns true when the caller can go on processing the received options.
 * Returns false after falling back to TCP or after resetting the subflow;
 * for an out-of-sequence packet on a not yet fully established, non
 * MP_JOIN subflow the returned value is the subflow mp_capable flag.
 */
static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
                                    struct mptcp_subflow_context *subflow,
                                    struct sk_buff *skb,
                                    struct mptcp_options_received *mp_opt)
{
        /* here we can process OoO, in-window pkts, only in-sequence 4th ack
         * will make the subflow fully established
         */
        if (likely(subflow->fully_established)) {
                /* on passive sockets, check for 3rd ack retransmission
                 * note that msk is always set by subflow_syn_recv_sock()
                 * for mp_join subflows
                 */
                if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 &&
                    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq &&
                    subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) &&
                    !subflow->request_join)
                        tcp_send_ack(ssk);
                goto check_notify;
        }

        /* we must process OoO packets before the first subflow is fully
         * established. OoO packets are instead a protocol violation
         * for MP_JOIN subflows as the peer must not send any data
         * before receiving the fourth ack - cfr. RFC 8684 section 3.2.
         */
        if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) {
                if (subflow->mp_join)
                        goto reset;
                if (subflow->is_mptfo && mp_opt->suboptions & OPTION_MPTCP_MPC_ACK)
                        goto set_fully_established;
                return subflow->mp_capable;
        }

        if (subflow->remote_key_valid &&
            (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) ||
             ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo))) {
                /* subflows are fully established as soon as we get any
                 * additional ack, including ADD_ADDR.
                 */
                goto set_fully_established;
        }

        /* If the first established packet does not contain MP_CAPABLE + data
         * then fallback to TCP. Fallback scenarios requires a reset for
         * MP_JOIN subflows.
         */
        if (!(mp_opt->suboptions & OPTIONS_MPTCP_MPC)) {
                if (subflow->mp_join)
                        goto reset;
                subflow->mp_capable = 0;
                pr_fallback(msk);
                mptcp_do_fallback(ssk);
                return false;
        }

        /* record that the peer denies joins towards its initial address id */
        if (mp_opt->deny_join_id0)
                WRITE_ONCE(msk->pm.remote_deny_join_id0, true);

        if (unlikely(!READ_ONCE(msk->pm.server_side)))
                pr_warn_once("bogus mpc option on established client sk");

set_fully_established:
        /* the status update is performed under the msk data lock */
        mptcp_data_lock((struct sock *)msk);
        __mptcp_subflow_fully_established(msk, subflow, mp_opt);
        mptcp_data_unlock((struct sock *)msk);

check_notify:
        /* if the subflow is not already linked into the conn_list, we can't
         * notify the PM: this subflow is still on the listener queue
         * and the PM possibly acquiring the subflow lock could race with
         * the listener close
         */
        if (likely(subflow->pm_notified) || list_empty(&subflow->node))
                return true;

        /* notify the PM only once per subflow */
        subflow->pm_notified = 1;
        if (subflow->mp_join) {
                clear_3rdack_retransmission(ssk);
                mptcp_pm_subflow_established(msk);
        } else {
                mptcp_pm_fully_established(msk, ssk);
        }
        return true;

reset:
        mptcp_subflow_reset(ssk);
        return false;
}

/* Expand the lower 32 bits of @cur_seq to a 64-bit sequence number, taking
 * the upper 32 bits from @old_seq and moving one 2^32 period forward or
 * backward when the 32-bit values wrapped with respect to each other.
 */
u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq)
{
        const u32 old_lo = (u32)old_seq;
        const u32 cur_lo = (u32)cur_seq;
        u64 expanded = (old_seq & GENMASK_ULL(63, 32)) + cur_lo;

        /* the new value wrapped forward past the old one */
        if (unlikely(cur_lo < old_lo && before(old_lo, cur_lo)))
                expanded += 1LL << 32;
        /* reverse wrap could happen, too */
        else if (unlikely(cur_lo > old_lo && after(old_lo, cur_lo)))
                expanded -= 1LL << 32;

        return expanded;
}

/* Move the msk-level snd_una to @new_snd_una, accounting the difference
 * from the previous value into bytes_acked.
 */
static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una)
{
        u64 delta = new_snd_una - msk->snd_una;

        msk->bytes_acked += delta;
        WRITE_ONCE(msk->snd_una, new_snd_una);
}

/* Process the DATA_ACK carried by @mp_opt, received on subflow @ssk.
 *
 * Under the msk data lock: expand the received ack against the current
 * snd_una, ignore it if it is beyond snd_nxt, move wnd_end forward when the
 * new window edge is past the current one, possibly push pending data and
 * advance snd_una. The outcome is then reported via the
 * ack_update_msk tracepoint.
 */
static void ack_update_msk(struct mptcp_sock *msk,
                           struct sock *ssk,
                           struct mptcp_options_received *mp_opt)
{
        u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt);
        struct sock *sk = (struct sock *)msk;
        u64 old_snd_una;

        mptcp_data_lock(sk);

        /* avoid ack expansion on update conflict, to reduce the risk of
         * wrongly expanding to a future ack sequence number, which is way
         * more dangerous than missing an ack
         */
        old_snd_una = msk->snd_una;
        new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64);

        /* ACK for data not even sent yet? Ignore.*/
        if (unlikely(after64(new_snd_una, snd_nxt)))
                new_snd_una = old_snd_una;

        /* right edge of the send window as seen through this subflow */
        new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;

        /* the msk-level window edge only moves forward */
        if (after64(new_wnd_end, msk->wnd_end))
                WRITE_ONCE(msk->wnd_end, new_wnd_end);

        /* this assumes mptcp_incoming_options() is invoked after tcp_ack() */
        if (after64(msk->wnd_end, snd_nxt))
                __mptcp_check_push(sk, ssk);

        /* snd_una is updated only when the ack makes forward progress */
        if (after64(new_snd_una, old_snd_una)) {
                __mptcp_snd_una_update(msk, new_snd_una);
                __mptcp_data_acked(sk);
        }
        msk->last_ack_recv = tcp_jiffies32;
        mptcp_data_unlock(sk);

        trace_ack_update_msk(mp_opt->data_ack,
                             old_snd_una, new_snd_una,
                             new_wnd_end, READ_ONCE(msk->wnd_end));
}

/* Record the sequence number of a received DATA_FIN, unless one has been
 * recorded already. @data_fin_seq is expanded against the current ack_seq,
 * with @use_64bit passed through to the expansion helper.
 *
 * Returns true if the DATA_FIN information was stored, false if it was
 * skipped.
 */
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit)
{
        u64 fin_seq;

        /* Skip if DATA_FIN was already received.
         * If updating simultaneously with the recvmsg loop, values
         * should match. If they mismatch, the peer is misbehaving and
         * we will prefer the most recent information.
         */
        if (READ_ONCE(msk->rcv_data_fin))
                return false;

        fin_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit);
        WRITE_ONCE(msk->rcv_data_fin_seq, fin_seq);
        WRITE_ONCE(msk->rcv_data_fin, 1);

        return true;
}

/* Validate the HMAC carried by a received ADD_ADDR option.
 *
 * An echoed ADD_ADDR is accepted without any HMAC verification. Otherwise
 * the HMAC is recomputed from the remote key, the local key and the
 * announced address, and compared with the received one.
 *
 * Returns true if the option is acceptable.
 */
static bool add_addr_hmac_valid(struct mptcp_sock *msk,
                                struct mptcp_options_received *mp_opt)
{
        u64 expected;

        if (mp_opt->echo)
                return true;

        expected = add_addr_generate_hmac(READ_ONCE(msk->remote_key),
                                          READ_ONCE(msk->local_key),
                                          &mp_opt->addr);

        pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
                 msk, expected, mp_opt->ahmac);

        return expected == mp_opt->ahmac;
}

/* Process the MPTCP options carried by @skb, received on subflow @sk.
 *
 * On a fallen-back msk only the send-side cleanup is triggered. Otherwise
 * the options are parsed, the fully-established status is checked, the
 * non-DSS suboptions are dispatched, the DATA_ACK is processed and, for
 * packets carrying data, an MPTCP skb extension is attached and filled
 * with the received mapping.
 *
 * Return false if a subflow has been reset, else return true
 */
bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);
        struct mptcp_options_received mp_opt;
        struct mptcp_ext *mpext;

        if (__mptcp_check_fallback(msk)) {
                /* Keep it simple and unconditionally trigger send data cleanup and
                 * pending queue spooling. We will need to acquire the data lock
                 * for more accurate checks, and once the lock is acquired, such
                 * helpers are cheap.
                 */
                mptcp_data_lock(subflow->conn);
                if (sk_stream_memory_free(sk))
                        __mptcp_check_push(subflow->conn, sk);

                /* on fallback we just need to ignore the msk-level snd_una, as
                 * this is really plain TCP
                 */
                __mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt));

                __mptcp_data_acked(subflow->conn);
                mptcp_data_unlock(subflow->conn);
                return true;
        }

        mptcp_get_options(skb, &mp_opt);

        /* The subflow can be in close state only if check_fully_established()
         * just sent a reset. If so, tell the caller to ignore the current packet.
         */
        if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
                return sk->sk_state != TCP_CLOSE;

        /* slow path: anything other than a lone DSS has been received */
        if (unlikely(mp_opt.suboptions != OPTION_MPTCP_DSS)) {
                /* MP_FASTCLOSE is honoured only if it carries our local key */
                if ((mp_opt.suboptions & OPTION_MPTCP_FASTCLOSE) &&
                    READ_ONCE(msk->local_key) == mp_opt.rcvr_key) {
                        WRITE_ONCE(msk->rcv_fastclose, true);
                        mptcp_schedule_work((struct sock *)msk);
                        MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSERX);
                }

                if ((mp_opt.suboptions & OPTION_MPTCP_ADD_ADDR) &&
                    add_addr_hmac_valid(msk, &mp_opt)) {
                        if (!mp_opt.echo) {
                                mptcp_pm_add_addr_received(sk, &mp_opt.addr);
                                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR);
                        } else {
                                mptcp_pm_add_addr_echoed(msk, &mp_opt.addr);
                                mptcp_pm_del_add_timer(msk, &mp_opt.addr, true);
                                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD);
                        }

                        if (mp_opt.addr.port)
                                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD);
                }

                if (mp_opt.suboptions & OPTION_MPTCP_RM_ADDR)
                        mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list);

                if (mp_opt.suboptions & OPTION_MPTCP_PRIO) {
                        mptcp_pm_mp_prio_received(sk, mp_opt.backup);
                        MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX);
                }

                if (mp_opt.suboptions & OPTION_MPTCP_FAIL) {
                        mptcp_pm_mp_fail_received(sk, mp_opt.fail_seq);
                        MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILRX);
                }

                /* store the received reset info in the subflow context */
                if (mp_opt.suboptions & OPTION_MPTCP_RST) {
                        subflow->reset_seen = 1;
                        subflow->reset_reason = mp_opt.reset_reason;
                        subflow->reset_transient = mp_opt.reset_transient;
                        MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTRX);
                }

                /* without a DSS there is no ack or mapping to process */
                if (!(mp_opt.suboptions & OPTION_MPTCP_DSS))
                        return true;
        }

        /* we can't wait for recvmsg() to update the ack_seq, otherwise
         * monodirectional flows will get stuck
         */
        if (mp_opt.use_ack)
                ack_update_msk(msk, sk, &mp_opt);

        /* Zero-data-length packets are dropped by the caller and not
         * propagated to the MPTCP layer, so the skb extension does not
         * need to be allocated or populated. DATA_FIN information, if
         * present, needs to be updated here before the skb is freed.
         */
        if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
                if (mp_opt.data_fin && mp_opt.data_len == 1 &&
                    mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64))
                        mptcp_schedule_work((struct sock *)msk);

                return true;
        }

        /* on allocation failure the packet goes on without an extension */
        mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
        if (!mpext)
                return true;

        memset(mpext, 0, sizeof(*mpext));

        if (likely(mp_opt.use_map)) {
                if (mp_opt.mpc_map) {
                        /* this is an MP_CAPABLE carrying MPTCP data
                         * we know this maps the first chunk of data
                         */
                        mptcp_crypto_key_sha(subflow->remote_key, NULL,
                                             &mpext->data_seq);
                        mpext->data_seq++;
                        mpext->subflow_seq = 1;
                        mpext->dsn64 = 1;
                        mpext->mpc_map = 1;
                        mpext->data_fin = 0;
                } else {
                        mpext->data_seq = mp_opt.data_seq;
                        mpext->subflow_seq = mp_opt.subflow_seq;
                        mpext->dsn64 = mp_opt.dsn64;
                        mpext->data_fin = mp_opt.data_fin;
                }
                mpext->data_len = mp_opt.data_len;
                mpext->use_map = 1;
                mpext->csum_reqd = !!(mp_opt.suboptions & OPTION_MPTCP_CSUMREQD);

                if (mpext->csum_reqd)
                        mpext->csum = mp_opt.csum;
        }

        return true;
}

/* Keep the receive window announced on this subflow coherent with the
 * msk-level rcv_wnd_sent value.
 *
 * When the right edge computed from this subflow (msk ack_seq plus
 * tp->rcv_wnd) is past rcv_wnd_sent, the latter is advanced with a cmpxchg
 * loop. When instead rcv_wnd_sent is already past that edge, tp->rcv_wnd
 * and the window field of the TCP header @th are raised to match it.
 */
static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
{
        const struct sock *ssk = (const struct sock *)tp;
        struct mptcp_subflow_context *subflow;
        u64 ack_seq, rcv_wnd_old, rcv_wnd_new;
        struct mptcp_sock *msk;
        u32 new_win;
        u64 win;

        subflow = mptcp_subflow_ctx(ssk);
        msk = mptcp_sk(subflow->conn);

        ack_seq = READ_ONCE(msk->ack_seq);
        rcv_wnd_new = ack_seq + tp->rcv_wnd;

        rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent);
        if (after64(rcv_wnd_new, rcv_wnd_old)) {
                u64 rcv_wnd;

                /* retry until our value is stored or a concurrent update
                 * stored a value past ours
                 */
                for (;;) {
                        rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new);

                        /* the exchange succeeded */
                        if (rcv_wnd == rcv_wnd_old)
                                break;

                        rcv_wnd_old = rcv_wnd;
                        /* the concurrently stored edge is past ours: use it */
                        if (before64(rcv_wnd_new, rcv_wnd_old)) {
                                MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE);
                                goto raise_win;
                        }
                        MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT);
                }
                return;
        }

        if (rcv_wnd_new != rcv_wnd_old) {
raise_win:
                /* window size implied by the msk-level right edge */
                win = rcv_wnd_old - ack_seq;
                tp->rcv_wnd = min_t(u64, win, U32_MAX);
                new_win = tp->rcv_wnd;

                /* Make sure we do not exceed the maximum possible
                 * scaled window.
                 */
                if (unlikely(th->syn))
                        new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale;
                if (!tp->rx_opt.rcv_wscale &&
                    READ_ONCE(sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows))
                        new_win = min(new_win, MAX_TCP_WINDOW);
                else
                        new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));

                /* RFC1323 scaling applied */
                new_win >>= tp->rx_opt.rcv_wscale;
                th->window = htons(new_win);
                MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED);
        }
}

/* Compute the folded DSS checksum: the pseudo-header built from @data_seq,
 * @subflow_seq and @data_len (with a zero checksum field) is summed on top
 * of the partial checksum @sum.
 */
__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
{
        struct csum_pseudo_header pseudo;

        /* cfr RFC 8684 3.3.1.:
         * the data sequence number used in the pseudo-header is
         * always the 64-bit value, irrespective of what length is used in the
         * DSS option itself.
         */
        pseudo.csum = 0;
        pseudo.data_len = htons(data_len);
        pseudo.subflow_seq = htonl(subflow_seq);
        pseudo.data_seq = cpu_to_be64(data_seq);

        return csum_fold(csum_partial(&pseudo, sizeof(pseudo), sum));
}

static __sum16 mptcp_make_csum(const struct mptcp_ext *mpext)
{
        return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len,
                                 ~csum_unfold(mpext->csum));
}

/* Store @len as a big-endian 16-bit value at @data, immediately followed
 * by @csum; neither store requires @data to be aligned.
 */
static void put_len_csum(u16 len, __sum16 csum, void *data)
{
        u8 *base = data;

        put_unaligned_be16(len, base);
        /* the checksum sits right after the 16-bit length */
        put_unaligned(csum, (__sum16 *)(base + sizeof(__be16)));
}

void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp,
                         struct mptcp_out_options *opts)
{
        const struct sock *ssk = (const struct sock *)tp;
        struct mptcp_subflow_context *subflow;

        /* Which options can be used together?
         *
         * X: mutually exclusive
         * O: often used together
         * C: can be used together in some cases
         * P: could be used together but we prefer not to (optimisations)
         *
         *  Opt: | MPC  | MPJ  | DSS  | ADD  |  RM  | PRIO | FAIL |  FC  |
         * ------|------|------|------|------|------|------|------|------|
         *  MPC  |------|------|------|------|------|------|------|------|
         *  MPJ  |  X   |------|------|------|------|------|------|------|
         *  DSS  |  X   |  X   |------|------|------|------|------|------|
         *  ADD  |  X   |  X   |  P   |------|------|------|------|------|
         *  RM   |  C   |  C   |  C   |  P   |------|------|------|------|
         *  PRIO |  X   |  C   |  C   |  C   |  C   |------|------|------|
         *  FAIL |  X   |  X   |  C   |  X   |  X   |  X   |------|------|
         *  FC   |  X   |  X   |  X   |  X   |  X   |  X   |  X   |------|
         *  RST  |  X   |  X   |  X   |  X   |  X   |  X   |  O   |  O   |
         * ------|------|------|------|------|------|------|------|------|
         *
         * The same applies in mptcp_established_options() function.
         */
        if (likely(OPTION_MPTCP_DSS & opts->suboptions)) {
                struct mptcp_ext *mpext = &opts->ext_copy;
                u8 len = TCPOLEN_MPTCP_DSS_BASE;
                u8 flags = 0;

                if (mpext->use_ack) {
                        flags = MPTCP_DSS_HAS_ACK;
                        if (mpext->ack64) {
                                len += TCPOLEN_MPTCP_DSS_ACK64;
                                flags |= MPTCP_DSS_ACK64;
                        } else {
                                len += TCPOLEN_MPTCP_DSS_ACK32;
                        }
                }

                if (mpext->use_map) {
                        len += TCPOLEN_MPTCP_DSS_MAP64;

                        /* Use only 64-bit mapping flags for now, add
                         * support for optional 32-bit mappings later.
                         */
                        flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
                        if (mpext->data_fin)
                                flags |= MPTCP_DSS_DATA_FIN;

                        if (opts->csum_reqd)
                                len += TCPOLEN_MPTCP_DSS_CHECKSUM;
                }

                *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);

                if (mpext->use_ack) {
                        if (mpext->ack64) {
                                put_unaligned_be64(mpext->data_ack, ptr);
                                ptr += 2;
                        } else {
                                put_unaligned_be32(mpext->data_ack32, ptr);
                                ptr += 1;
                        }
                }

                if (mpext->use_map) {
                        put_unaligned_be64(mpext->data_seq, ptr);
                        ptr += 2;
                        put_unaligned_be32(mpext->subflow_seq, ptr);
                        ptr += 1;
                        if (opts->csum_reqd) {
                                /* data_len == 0 is reserved for the infinite mapping,
                                 * the checksum will also be set to 0.
                                 */
                                put_len_csum(mpext->data_len,
                                             (mpext->data_len ? mptcp_make_csum(mpext) : 0),
                                             ptr);
                        } else {
                                put_unaligned_be32(mpext->data_len << 16 |
                                                   TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
                        }
                        ptr += 1;
                }

                /* We might need to add MP_FAIL options in rare cases */
                if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions))
                        goto mp_fail;
        } else if (OPTIONS_MPTCP_MPC & opts->suboptions) {
                u8 len, flag = MPTCP_CAP_HMAC_SHA256;

                if (OPTION_MPTCP_MPC_SYN & opts->suboptions) {
                        len = TCPOLEN_MPTCP_MPC_SYN;
                } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) {
                        len = TCPOLEN_MPTCP_MPC_SYNACK;
                } else if (opts->data_len) {
                        len = TCPOLEN_MPTCP_MPC_ACK_DATA;
                        if (opts->csum_reqd)
                                len += TCPOLEN_MPTCP_DSS_CHECKSUM;
                } else {
                        len = TCPOLEN_MPTCP_MPC_ACK;
                }

                if (opts->csum_reqd)
                        flag |= MPTCP_CAP_CHECKSUM_REQD;

                if (!opts->allow_join_id0)
                        flag |= MPTCP_CAP_DENY_JOIN_ID0;

                *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len,
                                      MPTCP_SUPPORTED_VERSION,
                                      flag);

                if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
                    opts->suboptions))
                        goto mp_capable_done;

                put_unaligned_be64(opts->sndr_key, ptr);
                ptr += 2;
                if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions))
                        goto mp_capable_done;

                put_unaligned_be64(opts->rcvr_key, ptr);
                ptr += 2;
                if (!opts->data_len)
                        goto mp_capable_done;

                if (opts->csum_reqd) {
                        put_len_csum(opts->data_len,
                                     __mptcp_make_csum(opts->data_seq,
                                                       opts->subflow_seq,
                                                       opts->data_len,
                                                       ~csum_unfold(opts->csum)),
                                     ptr);
                } else {
                        put_unaligned_be32(opts->data_len << 16 |
                                           TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
                }
                ptr += 1;

                /* MPC is additionally mutually exclusive with MP_PRIO */
                goto mp_capable_done;
        } else if (OPTIONS_MPTCP_MPJ & opts->suboptions) {
                if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
                        *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
                                              TCPOLEN_MPTCP_MPJ_SYN,
                                              opts->backup, opts->join_id);
                        put_unaligned_be32(opts->token, ptr);
                        ptr += 1;
                        put_unaligned_be32(opts->nonce, ptr);
                        ptr += 1;
                } else if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
                        *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
                                              TCPOLEN_MPTCP_MPJ_SYNACK,
                                              opts->backup, opts->join_id);
                        put_unaligned_be64(opts->thmac, ptr);
                        ptr += 2;
                        put_unaligned_be32(opts->nonce, ptr);
                        ptr += 1;
                } else {
                        *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
                                              TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
                        memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
                        ptr += 5;
                }
        } else if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
                u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE;
                u8 echo = MPTCP_ADDR_ECHO;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
                if (opts->addr.family == AF_INET6)
                        len = TCPOLEN_MPTCP_ADD_ADDR6_BASE;
#endif

                if (opts->addr.port)
                        len += TCPOLEN_MPTCP_PORT_LEN;

                if (opts->ahmac) {
                        len += sizeof(opts->ahmac);
                        echo = 0;
                }

                *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
                                      len, echo, opts->addr.id);
                if (opts->addr.family == AF_INET) {
                        memcpy((u8 *)ptr, (u8 *)&opts->addr.addr.s_addr, 4);
                        ptr += 1;
                }
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
                else if (opts->addr.family == AF_INET6) {
                        memcpy((u8 *)ptr, opts->addr.addr6.s6_addr, 16);
                        ptr += 4;
                }
#endif

                if (!opts->addr.port) {
                        if (opts->ahmac) {
                                put_unaligned_be64(opts->ahmac, ptr);
                                ptr += 2;
                        }
                } else {
                        u16 port = ntohs(opts->addr.port);

                        if (opts->ahmac) {
                                u8 *bptr = (u8 *)ptr;

                                put_unaligned_be16(port, bptr);
                                bptr += 2;
                                put_unaligned_be64(opts->ahmac, bptr);
                                bptr += 8;
                                put_unaligned_be16(TCPOPT_NOP << 8 |
                                                   TCPOPT_NOP, bptr);

                                ptr += 3;
                        } else {
                                put_unaligned_be32(port << 16 |
                                                   TCPOPT_NOP << 8 |
                                                   TCPOPT_NOP, ptr);
                                ptr += 1;
                        }
                }
        } else if (unlikely(OPTION_MPTCP_FASTCLOSE & opts->suboptions)) {
                /* FASTCLOSE is mutually exclusive with others except RST */
                *ptr++ = mptcp_option(MPTCPOPT_MP_FASTCLOSE,
                                      TCPOLEN_MPTCP_FASTCLOSE,
                                      0, 0);
                put_unaligned_be64(opts->rcvr_key, ptr);
                ptr += 2;

                if (OPTION_MPTCP_RST & opts->suboptions)
                        goto mp_rst;
                return;
        } else if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) {
mp_fail:
                /* MP_FAIL is mutually exclusive with others except RST */
                subflow = mptcp_subflow_ctx(ssk);
                subflow->send_mp_fail = 0;

                *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL,
                                      TCPOLEN_MPTCP_FAIL,
                                      0, 0);
                put_unaligned_be64(opts->fail_seq, ptr);
                ptr += 2;

                if (OPTION_MPTCP_RST & opts->suboptions)
                        goto mp_rst;
                return;
        } else if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) {
mp_rst:
                *ptr++ = mptcp_option(MPTCPOPT_RST,
                                      TCPOLEN_MPTCP_RST,
                                      opts->reset_transient,
                                      opts->reset_reason);
                return;
        }

        if (OPTION_MPTCP_PRIO & opts->suboptions) {
                subflow = mptcp_subflow_ctx(ssk);
                subflow->send_mp_prio = 0;

                *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO,
                                      TCPOLEN_MPTCP_PRIO,
                                      opts->backup, TCPOPT_NOP);

                MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPPRIOTX);
        }

mp_capable_done:
        if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
                u8 i = 1;

                *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
                                      TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr,
                                      0, opts->rm_list.ids[0]);

                while (i < opts->rm_list.nr) {
                        u8 id1, id2, id3, id4;

                        id1 = opts->rm_list.ids[i];
                        id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP;
                        id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP;
                        id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP;
                        put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr);
                        ptr += 1;
                        i += 4;
                }
        }

        if (tp)
                mptcp_set_rwin(tp, th);
}

/**
 * mptcp_get_reset_option - Build the MPTCP reset option word for @skb.
 *
 * @skb: Buffer whose MPTCP extension is looked up via mptcp_get_ext().
 *
 * Returns the value produced by mptcp_option() for MPTCPOPT_RST, filled in
 * from the extension's reset_transient and reset_reason fields, or htonl(0)
 * if @skb carries no MPTCP extension.
 */
__be32 mptcp_get_reset_option(const struct sk_buff *skb)
{
        const struct mptcp_ext *mpext = mptcp_get_ext(skb);

        /* Without an extension there is no reset information to encode. */
        if (!mpext)
                return htonl(0u);

        return mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST,
                            mpext->reset_transient, mpext->reset_reason);
}
EXPORT_SYMBOL_GPL(mptcp_get_reset_option);




































































































































































    3 






    3 

    3 

    3 







    3 

























































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/domain.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"

#include <linux/binfmts.h>
#include <linux/slab.h>
#include <linux/rculist.h>

/* Variable definitions. */

/* The initial domain; defined here with external linkage. */
struct tomoyo_domain_info tomoyo_kernel_domain;

/**
 * tomoyo_update_policy - Add, revive or delete an exception policy entry.
 *
 * @new_entry:       Pointer to "struct tomoyo_acl_head" describing the entry.
 * @size:            Size of @new_entry in bytes.
 * @param:           Pointer to "struct tomoyo_acl_param".
 * @check_duplicate: Callback function to find duplicated entry.
 *
 * Walks @param->list while holding tomoyo_policy_lock. The first entry that
 * @check_duplicate reports as equal to @new_entry gets its is_deleted flag
 * set to @param->is_delete. If nothing matches and this is not a delete
 * request, @new_entry is passed to tomoyo_commit_ok() and the returned entry
 * is appended to the list.
 *
 * Returns 0 on success, -ENOENT if a delete request matched nothing, -ENOMEM
 * if waiting for the lock was interrupted or tomoyo_commit_ok() failed.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size,
                         struct tomoyo_acl_param *param,
                         bool (*check_duplicate)(const struct tomoyo_acl_head
                                                 *,
                                                 const struct tomoyo_acl_head
                                                 *))
{
        const bool is_delete = param->is_delete;
        struct list_head * const head = param->list;
        struct tomoyo_acl_head *cur;
        int error = is_delete ? -ENOENT : -ENOMEM;

        if (mutex_lock_interruptible(&tomoyo_policy_lock))
                return -ENOMEM;

        list_for_each_entry_rcu(cur, head, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                /* Skip entries marked TOMOYO_GC_IN_PROGRESS. */
                if (cur->is_deleted != TOMOYO_GC_IN_PROGRESS &&
                    check_duplicate(cur, new_entry)) {
                        cur->is_deleted = is_delete;
                        error = 0;
                        break;
                }
        }

        /* A match was updated, or a delete request found nothing. */
        if (!error || is_delete)
                goto unlock;

        /* No match for an add request: commit and publish a new entry. */
        cur = tomoyo_commit_ok(new_entry, size);
        if (cur) {
                list_add_tail_rcu(&cur->list, head);
                error = 0;
        }
unlock:
        mutex_unlock(&tomoyo_policy_lock);
        return error;
}

/**
 * tomoyo_same_acl_head - Check for duplicated "struct tomoyo_acl_info" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_acl_head(const struct tomoyo_acl_info *a,
                                        const struct tomoyo_acl_info *b)
{
        return a->type == b->type && a->cond == b->cond;
}

/**
 * tomoyo_update_domain - Update an entry for domain policy.
 *
 * @new_entry:       Pointer to "struct tomoyo_acl_info".
 * @size:            Size of @new_entry in bytes.
 * @param:           Pointer to "struct tomoyo_acl_param".
 * @check_duplicate: Callback function to find duplicated entry.
 * @merge_duplicate: Callback function to merge duplicated entry. May be NULL,
 *                   in which case a matching entry's is_deleted flag is set
 *                   directly from @param->is_delete.
 *
 * If @param->data is not empty, tomoyo_get_condition() is called on @param
 * and its result is stored in @new_entry->cond for the duration of this call.
 *
 * Returns 0 on success, negative value otherwise: -EINVAL if no condition
 * could be obtained; otherwise -ENOENT (delete request) or -ENOMEM (add
 * request) when the request is rejected, the lock wait is interrupted,
 * nothing matched a delete request, or tomoyo_commit_ok() fails.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size,
                         struct tomoyo_acl_param *param,
                         bool (*check_duplicate)(const struct tomoyo_acl_info
                                                 *,
                                                 const struct tomoyo_acl_info
                                                 *),
                         bool (*merge_duplicate)(struct tomoyo_acl_info *,
                                                 struct tomoyo_acl_info *,
                                                 const bool))
{
        const bool is_delete = param->is_delete;
        /* Pessimistic default; cleared only once an entry is updated/added. */
        int error = is_delete ? -ENOENT : -ENOMEM;
        struct tomoyo_acl_info *entry;
        struct list_head * const list = param->list;

        if (param->data[0]) {
                new_entry->cond = tomoyo_get_condition(param);
                /* Nothing to release yet, so return directly. */
                if (!new_entry->cond)
                        return -EINVAL;
                /*
                 * Domain transition preference is allowed for only
                 * "file execute" entries.
                 */
                if (new_entry->cond->transit &&
                    !(new_entry->type == TOMOYO_TYPE_PATH_ACL &&
                      container_of(new_entry, struct tomoyo_path_acl, head)
                      ->perm == 1 << TOMOYO_TYPE_EXECUTE))
                        goto out;
        }
        if (mutex_lock_interruptible(&tomoyo_policy_lock))
                goto out;
        list_for_each_entry_rcu(entry, list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                /* Skip entries marked TOMOYO_GC_IN_PROGRESS. */
                if (entry->is_deleted == TOMOYO_GC_IN_PROGRESS)
                        continue;
                /* Common header must match before the type specific check. */
                if (!tomoyo_same_acl_head(entry, new_entry) ||
                    !check_duplicate(entry, new_entry))
                        continue;
                /* The merge callback's return value becomes is_deleted. */
                if (merge_duplicate)
                        entry->is_deleted = merge_duplicate(entry, new_entry,
                                                            is_delete);
                else
                        entry->is_deleted = is_delete;
                error = 0;
                break;
        }
        /* No existing entry matched an add request: create a new one. */
        if (error && !is_delete) {
                entry = tomoyo_commit_ok(new_entry, size);
                if (entry) {
                        list_add_tail_rcu(&entry->list, list);
                        error = 0;
                }
        }
        mutex_unlock(&tomoyo_policy_lock);
out:
        /*
         * Reached on every path except the -EINVAL return, including when
         * @param->data was empty and ->cond was not set here.
         * NOTE(review): presumably tomoyo_put_condition() accepts NULL and
         * the caller zero-initializes @new_entry->cond - confirm.
         */
        tomoyo_put_condition(new_entry->cond);
        return error;
}

/**
 * tomoyo_check_acl - Do permission check.
 *
 * @r:           Pointer to "struct tomoyo_request_info".
 * @check_entry: Callback function to check type specific parameters.
 *
 * Returns nothing. The result is reported through @r: on a match
 * @r->matched_acl points to the matching entry and @r->granted is true;
 * otherwise @r->granted is false.
 *
 * The domain's own ACL list is scanned first, followed by the list of each
 * ACL group whose bit is set in the domain's group bitmap.
 *
 * Caller holds tomoyo_read_lock().
 */
void tomoyo_check_acl(struct tomoyo_request_info *r,
                      bool (*check_entry)(struct tomoyo_request_info *,
                                          const struct tomoyo_acl_info *))
{
        const struct tomoyo_domain_info *domain = r->domain;
        struct tomoyo_acl_info *ptr;
        const struct list_head *list = &domain->acl_info_list;
        /* Index of the next ACL group to consider; survives "goto retry". */
        u16 i = 0;

retry:
        list_for_each_entry_rcu(ptr, list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                if (ptr->is_deleted || ptr->type != r->param_type)
                        continue;
                if (!check_entry(r, ptr))
                        continue;
                if (!tomoyo_condition(r, ptr->cond))
                        continue;
                r->matched_acl = ptr;
                r->granted = true;
                return;
        }
        for (; i < TOMOYO_MAX_ACL_GROUPS; i++) {
                if (!test_bit(i, domain->group))
                        continue;
                /*
                 * The goto bypasses the loop's own increment, so advance i
                 * here to resume with the following group after the rescan.
                 */
                list = &domain->ns->acl_group[i++];
                goto retry;
        }
        r->granted = false;
}

/* The list for "struct tomoyo_domain_info". */
LIST_HEAD(tomoyo_domain_list);

/**
 * tomoyo_last_word - Get last component of a domainname.
 *
 * @name: Domainname to check.
 *
 * Returns a pointer into @name just past its last space character, or @name
 * itself if it contains no space.
 */
static const char *tomoyo_last_word(const char *name)
{
        const char *last_space = strrchr(name, ' ');

        return last_space ? last_space + 1 : name;
}

/**
 * tomoyo_same_transition_control - Check for duplicated "struct tomoyo_transition_control" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_head".
 * @b: Pointer to "struct tomoyo_acl_head".
 *
 * Returns true if @a == @b, false otherwise.
 */
static bool tomoyo_same_transition_control(const struct tomoyo_acl_head *a,
                                           const struct tomoyo_acl_head *b)
{
        const struct tomoyo_transition_control *p1 = container_of(a,
                                                                  typeof(*p1),
                                                                  head);
        const struct tomoyo_transition_control *p2 = container_of(b,
                                                                  typeof(*p2),
                                                                  head);

        return p1->type == p2->type && p1->is_last_name == p2->is_last_name
                && p1->domainname == p2->domainname
                && p1->program == p2->program;
}

/**
 * tomoyo_write_transition_control - Write "struct tomoyo_transition_control" list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 * @type:  Type of this entry.
 *
 * @param->data is split in place at the first " from " into a program part
 * and a domainname part. Without " from ", the whole string is the
 * domainname for the keep/no_keep types and the program for all other types.
 * The word "any" in either position leaves that field unset.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_write_transition_control(struct tomoyo_acl_param *param,
                                    const u8 type)
{
        struct tomoyo_transition_control e = { .type = type };
        int error = param->is_delete ? -ENOENT : -ENOMEM;
        char *prog = param->data;
        char *dom = strstr(prog, " from ");

        if (dom) {
                /* Terminate the program part and step over " from ". */
                *dom = '\0';
                dom += 6;
        } else if (type == TOMOYO_TRANSITION_CONTROL_NO_KEEP ||
                   type == TOMOYO_TRANSITION_CONTROL_KEEP) {
                dom = prog;
                prog = NULL;
        }

        if (prog && strcmp(prog, "any") != 0) {
                /* No name reference is held yet, so return directly. */
                if (!tomoyo_correct_path(prog))
                        return -EINVAL;
                e.program = tomoyo_get_name(prog);
                if (!e.program)
                        goto out;
        }

        if (dom && strcmp(dom, "any") != 0) {
                if (!tomoyo_correct_domain(dom)) {
                        /* Not a full domainname; accept a bare path. */
                        if (!tomoyo_correct_path(dom))
                                goto out;
                        e.is_last_name = true;
                }
                e.domainname = tomoyo_get_name(dom);
                if (!e.domainname)
                        goto out;
        }

        param->list = &param->ns->policy_list[TOMOYO_ID_TRANSITION_CONTROL];
        error = tomoyo_update_policy(&e.head, sizeof(e), param,
                                     tomoyo_same_transition_control);
out:
        tomoyo_put_name(e.domainname);
        tomoyo_put_name(e.program);
        return error;
}

/**
 * tomoyo_scan_transition - Try to find specific domain transition type.
 *
 * @list:       Pointer to "struct list_head".
 * @domainname: The name of current domain.
 * @program:    The name of requested program.
 * @last_name:  The last component of @domainname.
 * @type:       One of values in "enum tomoyo_transition_type".
 *
 * An entry matches when it is not deleted, has the requested @type, its
 * domainname (if any) matches and its program (if any) matches @program.
 *
 * Returns true if found one, false otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static inline bool tomoyo_scan_transition
(const struct list_head *list, const struct tomoyo_path_info *domainname,
 const struct tomoyo_path_info *program, const char *last_name,
 const enum tomoyo_transition_type type)
{
        const struct tomoyo_transition_control *ctl;

        list_for_each_entry_rcu(ctl, list, head.list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                bool domain_matches;

                if (ctl->head.is_deleted || ctl->type != type)
                        continue;

                if (!ctl->domainname)
                        /* No domainname recorded: applies to every domain. */
                        domain_matches = true;
                else if (ctl->is_last_name)
                        /*
                         * Use direct strcmp() since this is
                         * unlikely used.
                         */
                        domain_matches = !strcmp(ctl->domainname->name,
                                                 last_name);
                else
                        domain_matches = ctl->domainname == domainname;
                if (!domain_matches)
                        continue;

                if (!ctl->program || !tomoyo_pathcmp(ctl->program, program))
                        return true;
        }
        return false;
}

/**
 * tomoyo_transition_type - Get domain transition type.
 *
 * @ns:         Pointer to "struct tomoyo_policy_namespace".
 * @domainname: The name of current domain.
 * @program:    The name of requested program.
 *
 * Returns TOMOYO_TRANSITION_CONTROL_TRANSIT if executing @program causes
 * domain transition across namespaces, TOMOYO_TRANSITION_CONTROL_INITIALIZE if
 * executing @program reinitializes domain transition within that namespace,
 * TOMOYO_TRANSITION_CONTROL_KEEP if executing @program stays at @domainname ,
 * others otherwise. If no entry of any type matches, the value reached when
 * the loop ends (at least TOMOYO_MAX_TRANSITION_TYPE) is returned.
 *
 * Caller holds tomoyo_read_lock().
 */
static enum tomoyo_transition_type tomoyo_transition_type
(const struct tomoyo_policy_namespace *ns,
 const struct tomoyo_path_info *domainname,
 const struct tomoyo_path_info *program)
{
        const char *last_name = tomoyo_last_word(domainname->name);
        /* Types are tried in enum order, starting from the first one. */
        enum tomoyo_transition_type type = TOMOYO_TRANSITION_CONTROL_NO_RESET;

        while (type < TOMOYO_MAX_TRANSITION_TYPE) {
                const struct list_head * const list =
                        &ns->policy_list[TOMOYO_ID_TRANSITION_CONTROL];

                /* No entry of this type matched: try the next type. */
                if (!tomoyo_scan_transition(list, domainname, program,
                                            last_name, type)) {
                        type++;
                        continue;
                }
                /* Any match other than the two "no_" types below is final. */
                if (type != TOMOYO_TRANSITION_CONTROL_NO_RESET &&
                    type != TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE)
                        break;
                /*
                 * Do not check for reset_domain if no_reset_domain matched.
                 * Do not check for initialize_domain if no_initialize_domain
                 * matched.
                 */
                /*
                 * NOTE(review): stepping by two assumes each "no_" type is
                 * immediately followed by its counterpart in
                 * "enum tomoyo_transition_type" - confirm against the enum.
                 */
                type++;
                type++;
        }
        return type;
}

/**
 * tomoyo_same_aggregator - Check for duplicated "struct tomoyo_aggregator" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_head".
 * @b: Pointer to "struct tomoyo_acl_head".
 *
 * Returns true if @a == @b, false otherwise.
 */
static bool tomoyo_same_aggregator(const struct tomoyo_acl_head *a,
                                   const struct tomoyo_acl_head *b)
{
        const struct tomoyo_aggregator *p1 = container_of(a, typeof(*p1),
                                                          head);
        const struct tomoyo_aggregator *p2 = container_of(b, typeof(*p2),
                                                          head);

        return p1->original_name == p2->original_name &&
                p1->aggregated_name == p2->aggregated_name;
}

/**
 * tomoyo_write_aggregator - Write "struct tomoyo_aggregator" list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Reads two tokens from @param: the original name, then the aggregated name.
 * The aggregated name must not be patterned.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_write_aggregator(struct tomoyo_acl_param *param)
{
        struct tomoyo_aggregator e = { };
        int error = param->is_delete ? -ENOENT : -ENOMEM;
        /* Token order matters: original name first, aggregated name second. */
        const char *from = tomoyo_read_token(param);
        const char *to = tomoyo_read_token(param);

        if (!tomoyo_correct_word(from) || !tomoyo_correct_path(to))
                return -EINVAL;

        e.original_name = tomoyo_get_name(from);
        e.aggregated_name = tomoyo_get_name(to);
        if (!e.original_name || !e.aggregated_name)
                goto out;
        /* No patterns allowed. */
        if (e.aggregated_name->is_patterned)
                goto out;

        param->list = &param->ns->policy_list[TOMOYO_ID_AGGREGATOR];
        error = tomoyo_update_policy(&e.head, sizeof(e), param,
                                     tomoyo_same_aggregator);
out:
        tomoyo_put_name(e.original_name);
        tomoyo_put_name(e.aggregated_name);
        return error;
}

/**
 * tomoyo_find_namespace - Find specified namespace.
 *
 * @name: Name of namespace to find.
 * @len:  Length of @name.
 *
 * A namespace matches when the first @len bytes of @name equal the start of
 * its name and @name[@len] is either the terminating NUL or a space.
 *
 * Returns pointer to "struct tomoyo_policy_namespace" if found,
 * NULL otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static struct tomoyo_policy_namespace *tomoyo_find_namespace
(const char *name, const unsigned int len)
{
        struct tomoyo_policy_namespace *ns;

        list_for_each_entry(ns, &tomoyo_namespace_list, namespace_list) {
                if (strncmp(name, ns->name, len) != 0)
                        continue;
                /* The namespace part must end exactly at @len. */
                if (name[len] == '\0' || name[len] == ' ')
                        return ns;
        }
        return NULL;
}

/**
 * tomoyo_assign_namespace - Create a new namespace.
 *
 * @domainname: Name of namespace to create.
 *
 * The namespace name is the part of @domainname before its first space (or
 * all of it if there is none). An existing namespace with that name is
 * returned as is; otherwise a new one is allocated and initialized.
 *
 * Returns pointer to "struct tomoyo_policy_namespace" on success,
 * NULL otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
struct tomoyo_policy_namespace *tomoyo_assign_namespace(const char *domainname)
{
        struct tomoyo_policy_namespace *ptr;
        struct tomoyo_policy_namespace *entry;
        const char *cp = domainname;
        unsigned int len = 0;

        /* Count the bytes before the first space or the terminating NUL. */
        while (*cp && *cp++ != ' ')
                len++;
        ptr = tomoyo_find_namespace(domainname, len);
        if (ptr)
                return ptr;
        if (len >= TOMOYO_EXEC_TMPSIZE - 10 || !tomoyo_domain_def(domainname))
                return NULL;
        /*
         * Room for the name is allocated right behind the structure.
         * The result is not checked here.
         * NOTE(review): presumably tomoyo_memory_ok() rejects a NULL
         * pointer - confirm.
         */
        entry = kzalloc(sizeof(*entry) + len + 1, GFP_NOFS | __GFP_NOWARN);
        /* ptr is still NULL here, so an interrupted wait returns NULL. */
        if (mutex_lock_interruptible(&tomoyo_policy_lock))
                goto out;
        /* Look again now that the lock is held. */
        ptr = tomoyo_find_namespace(domainname, len);
        if (!ptr && tomoyo_memory_ok(entry)) {
                char *name = (char *) (entry + 1);

                ptr = entry;
                memmove(name, domainname, len);
                name[len] = '\0';
                entry->name = name;
                tomoyo_init_policy_namespace(entry);
                /* The entry is in use now; keep kfree() below from freeing it. */
                entry = NULL;
        }
        mutex_unlock(&tomoyo_policy_lock);
out:
        kfree(entry);
        return ptr;
}

/**
 * tomoyo_namespace_jump - Check for namespace jump.
 *
 * @domainname: Name of domain.
 *
 * Returns true if namespace differs, false otherwise.
 */
static bool tomoyo_namespace_jump(const char *domainname)
{
        const char *namespace = tomoyo_current_namespace()->name;
        const int len = strlen(namespace);

        return strncmp(domainname, namespace, len) ||
                (domainname[len] && domainname[len] != ' ');
}

/**
 * tomoyo_assign_domain - Create a domain or a namespace.
 *
 * @domainname: The name of domain.
 * @transit:    True if transit to domain found or created.
 *
 * Looks up @domainname with tomoyo_find_domain() and returns the existing
 * domain if there is one. Otherwise a new domain is created, in the namespace
 * returned by tomoyo_assign_namespace(), and appended to tomoyo_domain_list.
 *
 * Returns pointer to "struct tomoyo_domain_info" on success, NULL otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname,
                                                const bool transit)
{
        /* Template for the new domain, handed to tomoyo_commit_ok() below. */
        struct tomoyo_domain_info e = { };
        struct tomoyo_domain_info *entry = tomoyo_find_domain(domainname);
        /* Set only when this call added the domain to the list. */
        bool created = false;

        if (entry) {
                if (transit) {
                        /*
                         * Since namespace is created at runtime, profiles may
                         * not be created by the moment the process transits to
                         * that domain. Do not perform domain transition if
                         * profile for that domain is not yet created.
                         */
                        if (tomoyo_policy_loaded &&
                            !entry->ns->profile_ptr[entry->profile])
                                return NULL;
                }
                return entry;
        }
        /* Requested domain does not exist. */
        /* Don't create requested domain if domainname is invalid. */
        if (strlen(domainname) >= TOMOYO_EXEC_TMPSIZE - 10 ||
            !tomoyo_correct_domain(domainname))
                return NULL;
        /*
         * Since definition of profiles and acl_groups may differ across
         * namespaces, do not inherit "use_profile" and "use_group" settings
         * by automatically creating requested domain upon domain transition.
         */
        if (transit && tomoyo_namespace_jump(domainname))
                return NULL;
        e.ns = tomoyo_assign_namespace(domainname);
        if (!e.ns)
                return NULL;
        /*
         * "use_profile" and "use_group" settings for automatically created
         * domains are inherited from current domain. These are 0 for manually
         * created domains.
         */
        if (transit) {
                const struct tomoyo_domain_info *domain = tomoyo_domain();

                e.profile = domain->profile;
                memcpy(e.group, domain->group, sizeof(e.group));
        }
        e.domainname = tomoyo_get_name(domainname);
        if (!e.domainname)
                return NULL;
        /* entry is still NULL here, so an interrupted wait returns NULL. */
        if (mutex_lock_interruptible(&tomoyo_policy_lock))
                goto out;
        /* Look again now that the lock is held. */
        entry = tomoyo_find_domain(domainname);
        if (!entry) {
                entry = tomoyo_commit_ok(&e, sizeof(e));
                if (entry) {
                        INIT_LIST_HEAD(&entry->acl_info_list);
                        list_add_tail_rcu(&entry->list, &tomoyo_domain_list);
                        created = true;
                }
        }
        mutex_unlock(&tomoyo_policy_lock);
out:
        /* Drop the name obtained for the template on every path. */
        tomoyo_put_name(e.domainname);
        if (entry && transit) {
                /* Log the inherited settings of a domain created on transit. */
                if (created) {
                        struct tomoyo_request_info r;
                        int i;

                        tomoyo_init_request_info(&r, entry,
                                                 TOMOYO_MAC_FILE_EXECUTE);
                        r.granted = false;
                        tomoyo_write_log(&r, "use_profile %u\n",
                                         entry->profile);
                        for (i = 0; i < TOMOYO_MAX_ACL_GROUPS; i++)
                                if (test_bit(i, entry->group))
                                        tomoyo_write_log(&r, "use_group %u\n",
                                                         i);
                        tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES);
                }
        }
        return entry;
}

/**
 * tomoyo_environ - Check permission for environment variable names.
 *
 * @ee: Pointer to "struct tomoyo_execve".
 *
 * Walks the argument/environment area of @ee->bprm page by page, skips the
 * argv strings, and passes an encoded copy of each environment entry to
 * tomoyo_env_perm().
 *
 * Returns 0 on success, negative value otherwise. Any failure is turned
 * into 0 unless the resulting mode is TOMOYO_CONFIG_ENFORCING.
 */
static int tomoyo_environ(struct tomoyo_execve *ee)
{
        struct tomoyo_request_info *r = &ee->r;
        struct linux_binprm *bprm = ee->bprm;
        /* env_page.data is allocated by tomoyo_dump_page(). */
        struct tomoyo_page_dump env_page = { };
        unsigned long pos = bprm->p;
        int offset = pos % PAGE_SIZE;
        int args_left = bprm->argc;
        int envs_left = bprm->envc;
        char *buf; /* Size is TOMOYO_EXEC_TMPSIZE bytes */
        int len = 0;
        int error = -ENOMEM;

        r->type = TOMOYO_MAC_ENVIRON;
        r->profile = r->domain->profile;
        r->mode = tomoyo_get_mode(r->domain->ns, r->profile,
                                  TOMOYO_MAC_ENVIRON);
        /* Nothing to do if checking is off or there is no environment. */
        if (!r->mode || !envs_left)
                return 0;
        buf = kzalloc(TOMOYO_EXEC_TMPSIZE, GFP_NOFS);
        if (!buf)
                goto out;
        /* -ENOMEM doubles as the "not finished yet" marker. */
        while (error == -ENOMEM) {
                if (!tomoyo_dump_page(bprm, pos, &env_page))
                        goto out;
                pos += PAGE_SIZE - offset;
                /* Step over argv[]: one NUL byte ends one argument. */
                for (; args_left && offset < PAGE_SIZE; offset++)
                        if (!env_page.data[offset])
                                args_left--;
                /* Only scan envp[] once every argument has been skipped. */
                if (!args_left) {
                        while (offset < PAGE_SIZE) {
                                const unsigned char c =
                                        env_page.data[offset++];

                                if (!c || len >= TOMOYO_EXEC_TMPSIZE - 10) {
                                        /* End of entry, or no room left. */
                                        buf[len] = '\0';
                                } else if (c == '=') {
                                        /* Cut the string after the name. */
                                        buf[len++] = '\0';
                                } else if (c == '\\') {
                                        buf[len++] = '\\';
                                        buf[len++] = '\\';
                                } else if (c > ' ' && c < 127) {
                                        buf[len++] = c;
                                } else {
                                        /* Emit the byte as \ooo (octal). */
                                        buf[len++] = '\\';
                                        buf[len++] = (c >> 6) + '0';
                                        buf[len++] = ((c >> 3) & 7) + '0';
                                        buf[len++] = (c & 7) + '0';
                                }
                                if (c)
                                        continue;
                                /* A complete entry has been collected. */
                                if (tomoyo_env_perm(r, buf)) {
                                        error = -EPERM;
                                        break;
                                }
                                envs_left--;
                                if (!envs_left) {
                                        error = 0;
                                        break;
                                }
                                len = 0;
                        }
                }
                /* The next page is consumed from its first byte. */
                offset = 0;
        }
out:
        if (r->mode != TOMOYO_CONFIG_ENFORCING)
                error = 0;
        kfree(env_page.data);
        kfree(buf);
        return error;
}

/**
 * tomoyo_find_next_domain - Find a domain.
 *
 * @bprm: Pointer to "struct linux_binprm".
 *
 * Decides which domain the current task belongs to after executing the
 * program described by @bprm, stores that domain in the current task's
 * "struct tomoyo_task" (keeping the previous one in old_domain_info) and,
 * if everything succeeded so far, checks the environment variables via
 * tomoyo_environ().
 *
 * Note that the task's domain_info is updated and the chosen domain's
 * user count is incremented on every path that reaches the "out" label,
 * including failure paths (where the chosen domain is the old domain
 * unless a new one was already found).
 *
 * Returns 0 on success, negative value otherwise: -ENOMEM if a buffer
 * cannot be allocated or a required domain transition fails, -ENOENT if
 * the program's pathname cannot be obtained, or the negative value
 * returned by tomoyo_execute_permission() or tomoyo_environ().
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_find_next_domain(struct linux_binprm *bprm)
{
        struct tomoyo_domain_info *old_domain = tomoyo_domain();
        /* Stays NULL until a destination domain has been decided. */
        struct tomoyo_domain_info *domain = NULL;
        const char *original_name = bprm->filename;
        int retval = -ENOMEM;
        bool reject_on_transition_failure = false;
        const struct tomoyo_path_info *candidate;
        struct tomoyo_path_info exename;
        struct tomoyo_execve *ee = kzalloc(sizeof(*ee), GFP_NOFS);

        if (!ee)
                return -ENOMEM;
        /* Scratch buffer used below to build the destination domainname. */
        ee->tmp = kzalloc(TOMOYO_EXEC_TMPSIZE, GFP_NOFS);
        if (!ee->tmp) {
                kfree(ee);
                return -ENOMEM;
        }
        /* ee->dump->data is allocated by tomoyo_dump_page(). */
        tomoyo_init_request_info(&ee->r, NULL, TOMOYO_MAC_FILE_EXECUTE);
        ee->r.ee = ee;
        ee->bprm = bprm;
        ee->r.obj = &ee->obj;
        ee->obj.path1 = bprm->file->f_path;
        /* Get symlink's pathname of program. */
        retval = -ENOENT;
        exename.name = tomoyo_realpath_nofollow(original_name);
        if (!exename.name)
                goto out;
        tomoyo_fill_path_info(&exename);
retry:
        /* Check 'aggregator' directive. */
        {
                struct tomoyo_aggregator *ptr;
                struct list_head *list =
                        &old_domain->ns->policy_list[TOMOYO_ID_AGGREGATOR];

                /*
                 * Default to the program's own pathname; the first live
                 * aggregator entry whose pattern matches replaces it.
                 */
                candidate = &exename;
                list_for_each_entry_rcu(ptr, list, head.list,
                                        srcu_read_lock_held(&tomoyo_ss)) {
                        if (ptr->head.is_deleted ||
                            !tomoyo_path_matches_pattern(&exename,
                                                         ptr->original_name))
                                continue;
                        candidate = ptr->aggregated_name;
                        break;
                }
        }

        /* Check execute permission. */
        retval = tomoyo_execute_permission(&ee->r, candidate);
        /* Redo the aggregator lookup and the check when asked to retry. */
        if (retval == TOMOYO_RETRY_REQUEST)
                goto retry;
        if (retval < 0)
                goto out;
        /*
         * To be able to specify domainnames with wildcards, use the
         * pathname specified in the policy (which may contain
         * wildcard) rather than the pathname passed to execve()
         * (which never contains wildcard).
         */
        if (ee->r.param.path.matched_path)
                candidate = ee->r.param.path.matched_path;

        /*
         * Check for domain transition preference if "file execute" matched.
         * If preference is given, make execve() fail if domain transition
         * has failed, for domain transition preference should be used with
         * destination domain defined.
         */
        if (ee->transition) {
                const char *domainname = ee->transition->name;

                reject_on_transition_failure = true;
                /* Keywords jump straight into the matching switch case. */
                if (!strcmp(domainname, "keep"))
                        goto force_keep_domain;
                if (!strcmp(domainname, "child"))
                        goto force_child_domain;
                if (!strcmp(domainname, "reset"))
                        goto force_reset_domain;
                if (!strcmp(domainname, "initialize"))
                        goto force_initialize_domain;
                if (!strcmp(domainname, "parent")) {
                        char *cp;

                        /* Current domainname minus its last component. */
                        strscpy(ee->tmp, old_domain->domainname->name, TOMOYO_EXEC_TMPSIZE);
                        cp = strrchr(ee->tmp, ' ');
                        if (cp)
                                *cp = '\0';
                } else if (*domainname == '<')
                        /* A name starting with '<' is used as given. */
                        strscpy(ee->tmp, domainname, TOMOYO_EXEC_TMPSIZE);
                else
                        /* Otherwise append the name to the current domain. */
                        snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s",
                                 old_domain->domainname->name, domainname);
                goto force_jump_domain;
        }
        /*
         * No domain transition preference specified.
         * Calculate domain to transit to.
         */
        switch (tomoyo_transition_type(old_domain->ns, old_domain->domainname,
                                       candidate)) {
        case TOMOYO_TRANSITION_CONTROL_RESET:
force_reset_domain:
                /* Transit to the root of specified namespace. */
                snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "<%s>",
                         candidate->name);
                /*
                 * Make execve() fail if domain transition across namespaces
                 * has failed.
                 */
                reject_on_transition_failure = true;
                break;
        case TOMOYO_TRANSITION_CONTROL_INITIALIZE:
force_initialize_domain:
                /* Transit to the child of current namespace's root. */
                snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s",
                         old_domain->ns->name, candidate->name);
                break;
        case TOMOYO_TRANSITION_CONTROL_KEEP:
force_keep_domain:
                /* Keep current domain. */
                domain = old_domain;
                break;
        default:
                if (old_domain == &tomoyo_kernel_domain &&
                    !tomoyo_policy_loaded) {
                        /*
                         * Needn't to transit from kernel domain before
                         * starting /sbin/init. But transit from kernel domain
                         * if executing initializers because they might start
                         * before /sbin/init.
                         */
                        domain = old_domain;
                        break;
                }
force_child_domain:
                /* Normal domain transition. */
                snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s",
                         old_domain->domainname->name, candidate->name);
                break;
        }
force_jump_domain:
        /* Look up (or create) the domain named in ee->tmp if none chosen. */
        if (!domain)
                domain = tomoyo_assign_domain(ee->tmp, true);
        if (domain)
                retval = 0;
        else if (reject_on_transition_failure) {
                pr_warn("ERROR: Domain '%s' not ready.\n", ee->tmp);
                retval = -ENOMEM;
        } else if (ee->r.mode == TOMOYO_CONFIG_ENFORCING)
                retval = -ENOMEM;
        else {
                /*
                 * Not enforcing: let execve() continue in the old domain,
                 * and report the failure only once per domain by means of
                 * the TOMOYO_DIF_TRANSITION_FAILED flag.
                 */
                retval = 0;
                if (!old_domain->flags[TOMOYO_DIF_TRANSITION_FAILED]) {
                        old_domain->flags[TOMOYO_DIF_TRANSITION_FAILED] = true;
                        ee->r.granted = false;
                        tomoyo_write_log(&ee->r, "%s", tomoyo_dif
                                         [TOMOYO_DIF_TRANSITION_FAILED]);
                        pr_warn("ERROR: Domain '%s' not defined.\n", ee->tmp);
                }
        }
 out:
        /* Fall back to the current domain when no destination was found. */
        if (!domain)
                domain = old_domain;
        /* Update reference count on "struct tomoyo_domain_info". */
        {
                struct tomoyo_task *s = tomoyo_task(current);

                /* Remember the previous domain before switching. */
                s->old_domain_info = s->domain_info;
                s->domain_info = domain;
                atomic_inc(&domain->users);
        }
        kfree(exename.name);
        /* Environment variables are checked against the chosen domain. */
        if (!retval) {
                ee->r.domain = domain;
                retval = tomoyo_environ(ee);
        }
        kfree(ee->tmp);
        kfree(ee->dump.data);
        kfree(ee);
        return retval;
}

/**
 * tomoyo_dump_page - Dump a page to buffer.
 *
 * @bprm: Pointer to "struct linux_binprm".
 * @pos:  Location to dump.
 * @dump: Pointer to "struct tomoyo_page_dump".
 *
 * Copies the part of the page containing @pos, from @pos's offset within
 * that page up to the end of the page, into @dump->data. The copy is
 * skipped when @dump->page already refers to the same page.
 *
 * Returns true on success, false otherwise.
 */
bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
                      struct tomoyo_page_dump *dump)
{
        struct page *pg;
#ifdef CONFIG_MMU
        int nr_pinned;
#endif

        /*
         * The buffer is allocated on first use only; freeing it is left
         * to the caller (tomoyo_find_next_domain() does so for its dump).
         */
        if (!dump->data) {
                dump->data = kzalloc(PAGE_SIZE, GFP_NOFS);
                if (!dump->data)
                        return false;
        }
        /* Same with get_arg_page(bprm, pos, 0) in fs/exec.c */
#ifdef CONFIG_MMU
        /*
         * This is called at execve() time in order to dig around
         * in the argv/environment of the new process
         * (represented by bprm).
         */
        mmap_read_lock(bprm->mm);
        nr_pinned = get_user_pages_remote(bprm->mm, pos, 1,
                                          FOLL_FORCE, &pg, NULL);
        mmap_read_unlock(bprm->mm);
        if (nr_pinned <= 0)
                return false;
#else
        pg = bprm->page[pos / PAGE_SIZE];
#endif
        if (dump->page != pg) {
                const unsigned int skip = pos % PAGE_SIZE;
                /*
                 * kmap()/kunmap() might be the better fit here, but
                 * remove_arg_zero() uses kmap_atomic()/kunmap_atomic(),
                 * so the same pair is used.
                 */
                char *mapped = kmap_atomic(pg);

                dump->page = pg;
                memcpy(dump->data + skip, mapped + skip, PAGE_SIZE - skip);
                kunmap_atomic(mapped);
        }
        /* Same with put_arg_page(page) in fs/exec.c */
#ifdef CONFIG_MMU
        put_page(pg);
#endif
        return true;
}




















































































































































































































































































































































































































































    5 





    8 











































    1 




    1 



















































































    7 











    7 




















    1 








    7 





























    6 








    6 



    6 































    4 




























































































































































































































































































































































    1 















    1 




    1 

















    8 

    1 







    1 



    8 




   10 



   11 


   11 







    7 
    1 


   11 


    9 

















    9 




   10 









    3 







    3 






    8 
   11 
    4 
    5 



   11 

   10 







   10 












    3 
   10 








    6 







    5 
    1 

    7 















    7 
    8 
    5 




















































































































































































    3 
    6 



    5 




























































































    3 




    5 
    3 
    4 











































    4 









    5 





















































































































































































































































































































































    1 
    1 

































































































































    4 





    3 


    4 

    3 










    4 










    1 




    1 









































    1 





















    1 




















































































































































































































































































































































































































































    4 
    4 



















    4 




    5 














    2 
    3 































































































































































































































































































































    3 












    2 


















































































































































































































































































































































































































































































































































































































































































































































































































































































































    5 




    4 




































    1 



    1 























    1 



    1 
































































    2 

















































































    1 
    1 


    1 

    1 











    1 




    1 












    1 



    1 






    1 








    1 
    1 





















    1 







    1 


    1 
    1 










    1 






    1 



















    1 






    1 



    1 
    1 



























    1 







    1 
    1 


    1 











































































































































































































































    1 



    1 
    1 























































































































    1 




    1 













    1 




















    1 





    1 






    1 









































    1 















    1 






    1 























































    1 


    1 
    1 






    1 






































    1 
    1 































    1 
    1 


    1 
    1 










    1 












    1 












    1 







    1 




























    1 

































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 




    2 













    2 





    2 











    2 
    1 









    1 






















    1 



    1 


    1 

    1 




    1 






    1 
    1 





















































    1 




    1 

    1 




    1 









    1 









    1 


    1 







    1 

    1 




    1 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

















    1 





    1 


    1 



























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *        Authors:        Alan Cox <alan@lxorguk.ukuu.org.uk>
 *                        Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *        Fixes:
 *                Alan Cox        :        Fixed the worst of the load
 *                                        balancer bugs.
 *                Dave Platt        :        Interrupt stacking fix.
 *        Richard Kooijman        :        Timestamp fixes.
 *                Alan Cox        :        Changed buffer format.
 *                Alan Cox        :        destructor hook for AF_UNIX etc.
 *                Linus Torvalds        :        Better skb_clone.
 *                Alan Cox        :        Added skb_copy.
 *                Alan Cox        :        Added all the changed routines Linus
 *                                        only put in the headers
 *                Ray VanTassle        :        Fixed --skb->lock in free
 *                Alan Cox        :        skb_copy copy arp field
 *                Andi Kleen        :        slabified it.
 *                Robert Olsson        :        Removed skb_head_pool
 *
 *        NOTE:
 *                The __skb_ routines should be called with interrupts
 *        disabled, or you better be *real* sure that the operation is atomic
 *        with respect to whatever list is being frobbed (e.g. via lock_sock()
 *        or via disabling bottom half handlers, etc).
 */

/*
 *        The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/skbuff_ref.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/bitfield.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>
#include <linux/iov_iter.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/gso.h>
#include <net/hotdata.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool/helpers.h>
#include <net/dropreason.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/textsearch.h>

#include "dev.h"
#include "sock_destructor.h"

#ifdef CONFIG_SKB_EXTENSIONS
/* kmem_cache handle that only exists when CONFIG_SKB_EXTENSIONS is set;
 * presumably backs skb extension allocations.
 * NOTE(review): creation and users are outside this part of the file - confirm.
 */
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif

#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)

/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
 * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
 * size, and we can differentiate heads from skb_small_head_cache
 * vs system slabs by looking at their size (skb_end_offset()).
 */
#define SKB_SMALL_HEAD_CACHE_SIZE                                        \
        (is_power_of_2(SKB_SMALL_HEAD_SIZE) ?                        \
                (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) :        \
                SKB_SMALL_HEAD_SIZE)

#define SKB_SMALL_HEAD_HEADROOM                                                \
        SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)

/* kcm_write_msgs() relies on casting paged frags to bio_vec to use
 * iov_iter_bvec(). These static asserts ensure the cast is valid as long as
 * the netmem is a page.
 *
 * Each pair below checks that one field has the same offset and the same
 * size in both structures.
 */

/* bio_vec::bv_page <-> skb_frag_t::netmem */
static_assert(offsetof(struct bio_vec, bv_page) ==
              offsetof(skb_frag_t, netmem));
static_assert(sizeof_field(struct bio_vec, bv_page) ==
              sizeof_field(skb_frag_t, netmem));

/* bio_vec::bv_len <-> skb_frag_t::len */
static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len));
static_assert(sizeof_field(struct bio_vec, bv_len) ==
              sizeof_field(skb_frag_t, len));

/* bio_vec::bv_offset <-> skb_frag_t::offset */
static_assert(offsetof(struct bio_vec, bv_offset) ==
              offsetof(skb_frag_t, offset));
static_assert(sizeof_field(struct bio_vec, bv_offset) ==
              sizeof_field(skb_frag_t, offset));

/* FN() turns a drop reason name into a designated initializer that stores
 * the stringified name at index SKB_DROP_REASON_<name>.
 */
#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
/* Name table for the core drop reasons. SKB_CONSUMED is listed explicitly;
 * the remaining entries are generated by DEFINE_DROP_REASON(), which
 * presumably applies FN() once per core reason - confirm against its
 * definition.
 */
static const char * const drop_reasons[] = {
        [SKB_CONSUMED] = "CONSUMED",
        DEFINE_DROP_REASON(FN, FN)
};

/* Drop reason list of the core subsystem: wraps the drop_reasons[] name
 * table together with its element count.
 */
static const struct drop_reason_list drop_reasons_core = {
        .reasons = drop_reasons,
        .n_reasons = ARRAY_SIZE(drop_reasons),
};

/* One drop reason list pointer per subsystem (SKB_DROP_REASON_SUBSYS_NUM
 * slots). Only the core slot is populated statically; the other slots
 * are set and cleared at runtime by drop_reasons_register_subsys() and
 * drop_reasons_unregister_subsys(). The pointers carry the __rcu
 * annotation.
 */
const struct drop_reason_list __rcu *
drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
        [SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
};
EXPORT_SYMBOL(drop_reasons_by_subsys);

/**
 * drop_reasons_register_subsys - install the drop reason list of a subsystem
 * @subsys: slot to fill; the core subsystem is not accepted
 * @list: drop reasons belonging to @subsys; has to point to a statically
 *        initialized list
 *
 * An out-of-range @subsys (the core slot or below, or past the end of the
 * table) only triggers a warning and leaves the table untouched.
 */
void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
                                  const struct drop_reason_list *list)
{
        if (!WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
                  subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
                  "invalid subsystem %d\n", subsys)) {
                /* @list is statically allocated, so the INIT variant is OK */
                RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
        }
}
EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);

/**
 * drop_reasons_unregister_subsys - remove the drop reason list of a subsystem
 * @subsys: slot to clear; the core subsystem is not accepted
 *
 * An out-of-range @subsys only triggers a warning. Otherwise the slot is
 * reset to NULL and synchronize_rcu() is called before returning, to ensure
 * no users remain when this function returns.
 */
void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
{
        if (!WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
                  subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
                  "invalid subsystem %d\n", subsys)) {
                RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);

                /* wait for a grace period after clearing the slot */
                synchronize_rcu();
        }
}
EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);

/**
 *        skb_panic - report a broken skb and stop (out-of-line helper)
 *        @skb:        buffer that failed the check
 *        @sz:        size that was requested
 *        @addr:        address printed as the "text" field
 *        @msg:        tag for the message: skb_over_panic or skb_under_panic
 *
 *        Out-of-line support for skb_put() and skb_push(), reached through
 *        the wrappers skb_over_panic() and skb_under_panic(). Dumps the
 *        buffer geometry at emergency level and then hits BUG().
 *        Kept out of line to prevent kernel bloat.
 *        __builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
                      const char msg[])
{
        const char *ifname = "<NULL>";
        unsigned long tail_val = (unsigned long)skb->tail;
        unsigned long end_val = (unsigned long)skb->end;

        /* the skb may not be attached to any device */
        if (skb->dev)
                ifname = skb->dev->name;

        pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
                 msg, addr, skb->len, sz, skb->head, skb->data,
                 tail_val, end_val, ifname);
        BUG();
}

/* Wrapper around skb_panic() that passes its own name (__func__) as the
 * message tag, so the report reads "skb_over_panic: ...".
 * NOTE(review): presumably the overrun path of skb_put() - callers are
 * outside this part of the file.
 */
static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
        skb_panic(skb, sz, addr, __func__);
}

/* Wrapper around skb_panic() that passes its own name (__func__) as the
 * message tag, so the report reads "skb_under_panic: ...".
 * NOTE(review): presumably the underrun path of skb_push() - callers are
 * outside this part of the file.
 */
static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
        skb_panic(skb, sz, addr, __func__);
}

/* Number of slots in napi_alloc_cache::skb_cache[] */
#define NAPI_SKB_CACHE_SIZE        64
/* NOTE(review): presumably the batch size used when refilling the cache -
 * users are outside this part of the file.
 */
#define NAPI_SKB_CACHE_BULK        16
/* Half of NAPI_SKB_CACHE_SIZE */
#define NAPI_SKB_CACHE_HALF        (NAPI_SKB_CACHE_SIZE / 2)

#if PAGE_SIZE == SZ_4K

/* PAGE_SIZE == SZ_4K: the 1K fragment allocator is compiled in */
#define NAPI_HAS_SMALL_PAGE_FRAG        1
/* Reads the pfmemalloc flag; @nc is a struct page_frag_1k object, not a pointer */
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)        ((nc).pfmemalloc)

/* specialized page frag allocator using a single order 0 page
 * and slicing it into 1K sized fragments. Constrained to systems
 * with a very limited amount of 1K fragments fitting a single
 * page - to avoid excessive truesize underestimation
 */

struct page_frag_1k {
        void *va;
        u16 offset;
        bool pfmemalloc;
};

/* Hand out the next 1K slice of the cached page, working downwards from the
 * end of the page. When the current page has no slice left (or no page has
 * been cached yet) a fresh order-0 page is allocated with @gfp first.
 * Returns the fragment address, or NULL if the page allocation fails.
 */
static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
{
        int next = nc->offset - SZ_1K;

        if (unlikely(next < 0)) {
                struct page *fresh = alloc_pages_node(NUMA_NO_NODE, gfp, 0);

                if (!fresh)
                        return NULL;

                nc->va = page_address(fresh);
                nc->pfmemalloc = page_is_pfmemalloc(fresh);

                /* Start with the last 1K slice of the page and take one
                 * extra page reference for every slice still to be handed
                 * out after this one.
                 */
                next = PAGE_SIZE - SZ_1K;
                page_ref_add(fresh, next / SZ_1K);
        }

        nc->offset = next;
        return nc->va + next;
}
#else

/* the small page is actually unused in this build; add dummy helpers
 * to please the compiler and avoid later preprocessor's conditionals
 */
/* Build-time switch: the 1K fragment allocator is not available. */
#define NAPI_HAS_SMALL_PAGE_FRAG        0
/* No small page exists in this build, so it can never be pfmemalloc. */
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)        false

/* Empty placeholder so struct napi_alloc_cache keeps the same member name. */
struct page_frag_1k {
};

/* Stub: always fails. Callers test NAPI_HAS_SMALL_PAGE_FRAG before using it. */
static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
{
        return NULL;
}

#endif

/* Per-CPU allocation state; the users in this file take bh_lock around
 * every access to the other members.
 */
struct napi_alloc_cache {
        /* taken with local_lock_nested_bh() around accesses to the fields below */
        local_lock_t bh_lock;
        /* generic page fragment cache for skb head buffers */
        struct page_frag_cache page;
        /* 1K fragment allocator state (empty when NAPI_HAS_SMALL_PAGE_FRAG is 0) */
        struct page_frag_1k page_small;
        /* number of valid entries in skb_cache[] */
        unsigned int skb_count;
        /* stack of pre-allocated sk_buff heads; the top is skb_cache[skb_count - 1] */
        void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

/* Per-CPU fragment cache used on the in_hardirq()/irqs_disabled() paths. */
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
/* Per-CPU NAPI cache; only the embedded local lock needs a static initialiser. */
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
        .bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

/* Double check that napi_get_frags() allocates skbs with
 * skb->head being backed by slab, not a page fragment.
 * This is to make sure bug fixed in 3226b158e67c
 * ("net: avoid 32 x truesize under-estimation for tiny skbs")
 * does not accidentally come back.
 */
void napi_get_frags_check(struct napi_struct *napi)
{
        struct sk_buff *skb;

        local_bh_disable();
        skb = napi_get_frags(napi);
        WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
        napi_free_frags(napi);
        local_bh_enable();
}

/* Allocate a page fragment of at least @fragsz bytes (rounded up with
 * SKB_DATA_ALIGN()) from this CPU's NAPI page fragment cache, using
 * GFP_ATOMIC and the caller supplied @align_mask.
 * The cache is protected by napi_alloc_cache.bh_lock for the duration of
 * the allocation. Returns the fragment, or whatever failure value
 * __page_frag_alloc_align() reports.
 */
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
        struct napi_alloc_cache *cache = this_cpu_ptr(&napi_alloc_cache);
        void *frag;

        local_lock_nested_bh(&napi_alloc_cache.bh_lock);
        frag = __page_frag_alloc_align(&cache->page, SKB_DATA_ALIGN(fragsz),
                                       GFP_ATOMIC, align_mask);
        local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

        return frag;
}
EXPORT_SYMBOL(__napi_alloc_frag_align);

/* Context-aware page fragment allocation.
 * Outside hard IRQ context and with interrupts enabled, bottom halves are
 * disabled and the request is forwarded to __napi_alloc_frag_align().
 * Otherwise the fragment is carved from the per-CPU netdev_alloc_cache
 * with GFP_ATOMIC, after rounding @fragsz up with SKB_DATA_ALIGN().
 */
void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
        struct page_frag_cache *irq_cache;
        void *frag;

        if (!in_hardirq() && !irqs_disabled()) {
                local_bh_disable();
                frag = __napi_alloc_frag_align(fragsz, align_mask);
                local_bh_enable();
                return frag;
        }

        irq_cache = this_cpu_ptr(&netdev_alloc_cache);
        return __page_frag_alloc_align(irq_cache, SKB_DATA_ALIGN(fragsz),
                                       GFP_ATOMIC, align_mask);
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);

/* Pop one sk_buff head from this CPU's NAPI skb cache. An empty cache is
 * first refilled with up to NAPI_SKB_CACHE_BULK objects from
 * net_hotdata.skbuff_cache using GFP_ATOMIC.
 * Returns NULL when the cache is empty and the refill yields no object.
 */
static struct sk_buff *napi_skb_cache_get(void)
{
        struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
        struct sk_buff *skb;

        local_lock_nested_bh(&napi_alloc_cache.bh_lock);
        if (unlikely(!nc->skb_count)) {
                /* the return value is stored as the new fill level */
                nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
                                                      GFP_ATOMIC,
                                                      NAPI_SKB_CACHE_BULK,
                                                      nc->skb_cache);
                if (unlikely(!nc->skb_count)) {
                        local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
                        return NULL;
                }
        }

        /* take the topmost entry of the stack */
        skb = nc->skb_cache[--nc->skb_count];
        local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
        /* NOTE(review): presumably pairs with KASAN poisoning done when
         * objects are put back into the cache (not visible here) - confirm.
         */
        kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache));

        return skb;
}

/* Common tail of the build_skb() family: install @data as the head buffer
 * of @skb and initialise the skb_shared_info returned by skb_shinfo().
 * @size is the full buffer size; the aligned size of struct
 * skb_shared_info is subtracted from it before it is used as end offset.
 */
static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
                                         unsigned int size)
{
        struct skb_shared_info *shinfo;

        /* from here on @size is the usable data area only */
        size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

        /* Assumes caller memset cleared SKB */
        skb->truesize = SKB_TRUESIZE(size);
        refcount_set(&skb->users, 1);
        skb->head = data;
        skb->data = data;
        skb_reset_tail_pointer(skb);
        skb_set_end_offset(skb, size);
        /* both header offsets start out as the all-ones value */
        skb->mac_header = (typeof(skb->mac_header))~0U;
        skb->transport_header = (typeof(skb->transport_header))~0U;
        skb->alloc_cpu = raw_smp_processor_id();
        /* make sure we initialize shinfo sequentially */
        shinfo = skb_shinfo(skb);
        /* zero every field that precedes dataref, then set dataref itself */
        memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
        atomic_set(&shinfo->dataref, 1);

        skb_set_kcov_handle(skb, kcov_common_handle());
}

/* Determine the real size of the slab object behind @data and store it in
 * *@size. Returns the pointer handed back by krealloc(), which is expected
 * to be @data itself (a one-time warning fires otherwise).
 * @skb is not used by this helper.
 */
static inline void *__slab_build_skb(struct sk_buff *skb, void *data,
                                     unsigned int *size)
{
        void *resized;

        /* Must find the allocation size (and grow it to match). */
        *size = ksize(data);
        /* krealloc() will immediately return "data" when
         * "ksize(data)" is requested: it is the existing upper
         * bounds. As a result, GFP_ATOMIC will be ignored. Note
         * that this "new" pointer needs to be passed back to the
         * caller for use so the __alloc_size hinting will be
         * tracked correctly.
         */
        resized = krealloc(data, *size, GFP_ATOMIC);
        WARN_ON_ONCE(resized != data);
        return resized;
}

/* build_skb() flavour for buffers that come from the slab allocator.
 * The buffer size is looked up from the slab object itself, so no size
 * argument is needed. Use sparingly: slab buffers cannot be combined
 * efficiently by GRO!
 * Returns the new skb, or NULL if the sk_buff head cannot be allocated.
 */
struct sk_buff *slab_build_skb(void *data)
{
        unsigned int buf_size;
        struct sk_buff *head;

        head = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
        if (likely(head)) {
                /* clear everything in front of the tail member */
                memset(head, 0, offsetof(struct sk_buff, tail));
                data = __slab_build_skb(head, data, &buf_size);
                __finalize_skb_around(head, data, buf_size);
        }

        return head;
}
EXPORT_SYMBOL(slab_build_skb);

/* Wrap the caller provided, already memset-cleared @skb around @data.
 * A @frag_size of 0 is deprecated: it triggers a one-time warning and the
 * size is then taken from the slab object via __slab_build_skb(). Callers
 * with slab buffers should use slab_build_skb() instead.
 */
static void __build_skb_around(struct sk_buff *skb, void *data,
                               unsigned int frag_size)
{
        unsigned int buf_len = frag_size;
        void *head = data;

        if (WARN_ONCE(buf_len == 0, "Use slab_build_skb() instead"))
                head = __slab_build_skb(skb, head, &buf_len);

        __finalize_skb_around(skb, head, buf_len);
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data (must not be 0)
 *
 * Allocate a new &sk_buff head with GFP_ATOMIC and attach @data to it.
 * The caller provides the space holding both the packet head and the
 * skb_shared_info. @data must have been allocated from the page
 * allocator or vmalloc(). (Passing a @frag_size of 0 to indicate a
 * kmalloc() allocation is deprecated; use slab_build_skb() for that.)
 *
 * Returns the new skb buffer, or %NULL on failure, in which case @data
 * is not freed.
 *
 * Notes :
 *  Before IO, driver allocates only data buffer where NIC put incoming frame
 *  Driver should add room at head (NET_SKB_PAD) and
 *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 *  before giving packet to stack.
 *  RX rings only contains data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *head;

        head = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
        if (likely(head)) {
                /* clear everything in front of the tail member */
                memset(head, 0, offsetof(struct sk_buff, tail));
                __build_skb_around(head, data, frag_size);
        }

        return head;
}

/* build_skb() wraps __build_skb() and additionally sets skb->head_frag
 * and propagates the pfmemalloc state of the head page of @data.
 * Both extra steps are skipped when @frag_size is 0 or the allocation
 * failed.
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb = __build_skb(data, frag_size);

        if (unlikely(!skb || !frag_size))
                return skb;

        skb->head_frag = 1;
        skb_propagate_pfmemalloc(virt_to_head_page(data), skb);

        return skb;
}
EXPORT_SYMBOL(build_skb);

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Returns @skb, or %NULL when @skb is %NULL. For a non-zero @frag_size
 * the skb is marked head_frag and inherits the pfmemalloc state of the
 * head page of @data.
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
                                 void *data, unsigned int frag_size)
{
        if (unlikely(!skb))
                return NULL;

        __build_skb_around(skb, data, frag_size);
        if (!frag_size)
                return skb;

        skb->head_frag = 1;
        skb_propagate_pfmemalloc(virt_to_head_page(data), skb);

        return skb;
}
EXPORT_SYMBOL(build_skb_around);

/**
 * __napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Same job as __build_skb(), but the skbuff_head is taken from the NAPI
 * percpu cache (napi_skb_cache_get()) instead of being allocated in place.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *head = napi_skb_cache_get();

        if (likely(head)) {
                /* clear everything in front of the tail member */
                memset(head, 0, offsetof(struct sk_buff, tail));
                __build_skb_around(head, data, frag_size);
        }

        return head;
}

/**
 * napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __napi_build_skb() that takes care of skb->head_frag
 * and skb->pfmemalloc when the data is a page or page fragment.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb = __napi_build_skb(data, frag_size);

        if (likely(skb) && frag_size) {
                skb->head_frag = 1;
                skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
        }

        return skb;
}
EXPORT_SYMBOL(napi_build_skb);

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 *
 * @size is in/out: on entry the requested size, on return the size of the
 * object that was (or would have been) allocated. It is updated even when
 * the allocation fails.
 * @pfmemalloc may be NULL; otherwise it is set to true only when the
 * second, reserve-enabled allocation attempt was made.
 * Returns the allocated object or NULL.
 */
static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
                             bool *pfmemalloc)
{
        bool ret_pfmemalloc = false;
        size_t obj_size;
        void *obj;

        obj_size = SKB_HEAD_ALIGN(*size);
        /* Small requests with no KMALLOC_NOT_NORMAL_BITS flag set are served
         * from the dedicated small-head cache.
         */
        if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
            !(flags & KMALLOC_NOT_NORMAL_BITS)) {
                /* first attempt: no reserves, no allocation-failure warning */
                obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
                                flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
                                node);
                *size = SKB_SMALL_HEAD_CACHE_SIZE;
                if (obj || !(gfp_pfmemalloc_allowed(flags)))
                        goto out;
                /* Try again but now we are using pfmemalloc reserves */
                ret_pfmemalloc = true;
                obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
                goto out;
        }

        obj_size = kmalloc_size_roundup(obj_size);
        /* The following cast might truncate high-order bits of obj_size, this
         * is harmless because kmalloc(obj_size >= 2^32) will fail anyway.
         */
        *size = (unsigned int)obj_size;

        /*
         * Try a regular allocation, when that fails and we're not entitled
         * to the reserves, fail.
         */
        obj = kmalloc_node_track_caller(obj_size,
                                        flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
                                        node);
        if (obj || !(gfp_pfmemalloc_allowed(flags)))
                goto out;

        /* Try again but now we are using pfmemalloc reserves */
        ret_pfmemalloc = true;
        obj = kmalloc_node_track_caller(obj_size, flags, node);

out:
        /* reporting is optional: callers may pass a NULL @pfmemalloc */
        if (pfmemalloc)
                *pfmemalloc = ret_pfmemalloc;

        return obj;
}

/*         Allocate a new skbuff. We do this ourselves so we can fill in a few
 *        'private' fields and also do memory statistics to find all the
 *        [BEEP] leaks.
 *
 */

/**
 *        __alloc_skb        -        allocate a network buffer
 *        @size: size to allocate
 *        @gfp_mask: allocation mask
 *        @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *                instead of head cache and allocate a cloned (child) skb.
 *                If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *                allocations in case the data is required for writeback.
 *                If SKB_ALLOC_NAPI is set (and SKB_ALLOC_FCLONE is not),
 *                the head is taken from the per-CPU NAPI skb cache when
 *                @node is NUMA_NO_NODE or the local memory node.
 *        @node: numa node to allocate memory on
 *
 *        Allocate a new &sk_buff. The returned buffer has no headroom and a
 *        tail room of at least size bytes. The object has a reference count
 *        of one. The return is the buffer. On a failure the return is %NULL.
 *
 *        Buffers may only be allocated from interrupts using a @gfp_mask of
 *        %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
                            int flags, int node)
{
        struct kmem_cache *cache;
        struct sk_buff *skb;
        bool pfmemalloc;
        u8 *data;

        cache = (flags & SKB_ALLOC_FCLONE)
                ? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache;

        if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
                gfp_mask |= __GFP_MEMALLOC;

        /* Get the HEAD */
        if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
            likely(node == NUMA_NO_NODE || node == numa_mem_id()))
                skb = napi_skb_cache_get();
        else
                skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
        if (unlikely(!skb))
                return NULL;
        prefetchw(skb);

        /* We do our best to align skb_shared_info on a separate cache
         * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
         * aligned memory blocks, unless SLUB/SLAB debug is enabled.
         * Both skb->head and skb_shared_info are cache line aligned.
         */
        data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
        if (unlikely(!data))
                goto nodata;
        /* kmalloc_size_roundup() might give us more room than requested.
         * Put skb_shared_info exactly at the end of allocated zone,
         * to allow max possible filling before reallocation.
         */
        prefetchw(data + SKB_WITH_OVERHEAD(size));

        /*
         * Only clear those fields we need to clear, not those that we will
         * actually initialise below. Hence, don't put any more fields after
         * the tail pointer in struct sk_buff!
         */
        memset(skb, 0, offsetof(struct sk_buff, tail));
        __build_skb_around(skb, data, size);
        skb->pfmemalloc = pfmemalloc;

        if (flags & SKB_ALLOC_FCLONE) {
                struct sk_buff_fclones *fclones;

                /* the head was allocated as the skb1 member of a fclones pair */
                fclones = container_of(skb, struct sk_buff_fclones, skb1);

                skb->fclone = SKB_FCLONE_ORIG;
                refcount_set(&fclones->fclone_ref, 1);
        }

        return skb;

nodata:
        /* A head obtained from napi_skb_cache_get() implies SKB_ALLOC_FCLONE
         * was not set, so @cache is net_hotdata.skbuff_cache here as well.
         */
        kmem_cache_free(cache, skb);
        return NULL;
}
EXPORT_SYMBOL(__alloc_skb);

/**
 *        __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *        @dev: network device to receive on
 *        @len: length to allocate
 *        @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 *        Allocate a new &sk_buff and assign it a usage count of one. The
 *        buffer has NET_SKB_PAD headroom built in. Users should allocate
 *        the headroom they think they need without accounting for the
 *        built in space. The built in space is used for optimisations.
 *
 *        %NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
                                   gfp_t gfp_mask)
{
        struct page_frag_cache *nc;
        struct sk_buff *skb;
        bool pfmemalloc;
        void *data;

        len += NET_SKB_PAD;

        /* If requested length is either too small or too big,
         * we use kmalloc() for skb->head allocation.
         * The same path is taken when @gfp_mask carries
         * __GFP_DIRECT_RECLAIM or GFP_DMA.
         */
        if (len <= SKB_WITH_OVERHEAD(1024) ||
            len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
            (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
                skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
                if (!skb)
                        goto skb_fail;
                goto skb_success;
        }

        len = SKB_HEAD_ALIGN(len);

        if (sk_memalloc_socks())
                gfp_mask |= __GFP_MEMALLOC;

        /* Pick the per-CPU fragment cache that matches the calling context. */
        if (in_hardirq() || irqs_disabled()) {
                nc = this_cpu_ptr(&netdev_alloc_cache);
                data = page_frag_alloc(nc, len, gfp_mask);
                pfmemalloc = nc->pfmemalloc;
        } else {
                local_bh_disable();
                local_lock_nested_bh(&napi_alloc_cache.bh_lock);

                nc = this_cpu_ptr(&napi_alloc_cache.page);
                data = page_frag_alloc(nc, len, gfp_mask);
                /* sample the flag while the cache is still locked */
                pfmemalloc = nc->pfmemalloc;

                local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
                local_bh_enable();
        }

        if (unlikely(!data))
                return NULL;

        skb = __build_skb(data, len);
        if (unlikely(!skb)) {
                /* give the fragment back, nothing references it yet */
                skb_free_frag(data);
                return NULL;
        }

        if (pfmemalloc)
                skb->pfmemalloc = 1;
        skb->head_frag = 1;

skb_success:
        skb_reserve(skb, NET_SKB_PAD);
        skb->dev = dev;

skb_fail:
        /* skb is NULL when reached through the skb_fail jump */
        return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);

/**
 *        napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 *        @napi: napi instance this buffer was allocated for
 *        @len: length to allocate
 *
 *        Allocate a new sk_buff for use in NAPI receive.  This buffer will
 *        attempt to allocate the head from a special reserved region used
 *        only for NAPI Rx allocation.  By doing this we can save several
 *        CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 *        The returned skb has NET_SKB_PAD + NET_IP_ALIGN bytes of headroom
 *        reserved and skb->dev set to @napi->dev.
 *
 *        %NULL is returned if there is no free memory.
 */
struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
{
        gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;
        struct napi_alloc_cache *nc;
        struct sk_buff *skb;
        bool pfmemalloc;
        void *data;

        DEBUG_NET_WARN_ON_ONCE(!in_softirq());
        len += NET_SKB_PAD + NET_IP_ALIGN;

        /* If requested length is either too small or too big,
         * we use kmalloc() for skb->head allocation.
         * When the small frag allocator is available, prefer it over kmalloc
         * for small fragments
         */
        if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
            len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
            (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
                skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
                                  NUMA_NO_NODE);
                if (!skb)
                        goto skb_fail;
                goto skb_success;
        }

        if (sk_memalloc_socks())
                gfp_mask |= __GFP_MEMALLOC;

        local_lock_nested_bh(&napi_alloc_cache.bh_lock);
        nc = this_cpu_ptr(&napi_alloc_cache);
        if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
                /* we are artificially inflating the allocation size, but
                 * that is not as bad as it may look like, as:
                 * - 'len' less than GRO_MAX_HEAD makes little sense
                 * - On most systems, larger 'len' values lead to fragment
                 *   size above 512 bytes
                 * - kmalloc would use the kmalloc-1k slab for such values
                 * - Builds with smaller GRO_MAX_HEAD will very likely do
                 *   little networking, as that implies no WiFi and no
                 *   tunnels support, and 32 bits arches.
                 */
                len = SZ_1K;

                data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
                pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
        } else {
                len = SKB_HEAD_ALIGN(len);

                data = page_frag_alloc(&nc->page, len, gfp_mask);
                pfmemalloc = nc->page.pfmemalloc;
        }
        local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

        if (unlikely(!data))
                return NULL;

        /* @len now holds the full fragment size handed to the skb builder */
        skb = __napi_build_skb(data, len);
        if (unlikely(!skb)) {
                /* give the fragment back, nothing references it yet */
                skb_free_frag(data);
                return NULL;
        }

        if (pfmemalloc)
                skb->pfmemalloc = 1;
        skb->head_frag = 1;

skb_success:
        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
        skb->dev = napi->dev;

skb_fail:
        /* skb is NULL when reached through the skb_fail jump */
        return skb;
}
EXPORT_SYMBOL(napi_alloc_skb);

/* Install @netmem as fragment @i of @skb, covering @size bytes starting at
 * offset @off, and update the skb accounting: len and data_len grow by
 * @size, truesize grows by @truesize.
 * Debug builds warn once if @size exceeds @truesize.
 */
void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
                            int off, int size, unsigned int truesize)
{
        DEBUG_NET_WARN_ON_ONCE(size > truesize);

        skb_fill_netmem_desc(skb, i, netmem, off, size);

        /* memory footprint, then paged and total payload length */
        skb->truesize += truesize;
        skb->data_len += size;
        skb->len += size;
}
EXPORT_SYMBOL(skb_add_rx_frag_netmem);

/* Grow the existing fragment @i of @skb by @size bytes and update the skb
 * accounting: len and data_len grow by @size, truesize by @truesize.
 * Debug builds warn once if @size exceeds @truesize.
 */
void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
                          unsigned int truesize)
{
        DEBUG_NET_WARN_ON_ONCE(size > truesize);

        skb_frag_size_add(&skb_shinfo(skb)->frags[i], size);

        /* memory footprint, then paged and total payload length */
        skb->truesize += truesize;
        skb->data_len += size;
        skb->len += size;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);

/* Pass the skb list at *@listp to kfree_skb_list(), then reset the
 * caller's list pointer to NULL.
 */
static void skb_drop_list(struct sk_buff **listp)
{
        kfree_skb_list(*listp);
        *listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
        skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
        struct sk_buff *list;

        skb_walk_frags(skb, list)
                skb_get(list);
}

/* True when @page carries the page_pool signature in pp_magic; the two
 * lowest bits of pp_magic are ignored for the comparison.
 */
static bool is_pp_page(struct page *page)
{
        unsigned long magic = page->pp_magic & ~0x3UL;

        return magic == PP_SIGNATURE;
}

/**
 * skb_pp_cow_data - copy an skb into page_pool backed memory
 * @pool: page_pool the new head buffer and fragments are allocated from
 * @pskb: in: skb to copy; out: replaced by the copy on success
 * @headroom: headroom to reserve in the new head buffer
 *
 * On success the original skb is consumed, *@pskb points to the copy and
 * 0 is returned. On failure *@pskb is left untouched and a negative errno
 * is returned: -EOPNOTSUPP if the skb has a frag_list or CONFIG_PAGE_POOL
 * is disabled, -ENOMEM if the skb is too long or an allocation fails, or
 * the error reported by skb_copy_bits().
 */
int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
                    unsigned int headroom)
{
#if IS_ENABLED(CONFIG_PAGE_POOL)
        u32 size, truesize, len, max_head_size, off;
        struct sk_buff *skb = *pskb, *nskb;
        int err, i, head_off;
        void *data;

        /* XDP does not support fraglist; such skbs are rejected here
         * rather than linearized.
         */
        if (skb_has_frag_list(skb))
                return -EOPNOTSUPP;

        /* The copy can hold at most one head buffer plus MAX_SKB_FRAGS
         * fragments of up to PAGE_SIZE bytes each.
         */
        max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom);
        if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
                return -ENOMEM;

        size = min_t(u32, skb->len, max_head_size);
        truesize = SKB_HEAD_ALIGN(size) + headroom;
        data = page_pool_dev_alloc_va(pool, &truesize);
        if (!data)
                return -ENOMEM;

        nskb = napi_build_skb(data, truesize);
        if (!nskb) {
                page_pool_free_va(pool, data, true);
                return -ENOMEM;
        }

        skb_reserve(nskb, headroom);
        skb_copy_header(nskb, skb);
        skb_mark_for_recycle(nskb);

        /* copy the first @size bytes into the new linear area */
        err = skb_copy_bits(skb, 0, nskb->data, size);
        if (err) {
                consume_skb(nskb);
                return err;
        }
        skb_put(nskb, size);

        /* shift the copied header offsets by the headroom difference */
        head_off = skb_headroom(nskb) - skb_headroom(skb);
        skb_headers_offset_update(nskb, head_off);

        /* copy the remaining bytes into page_pool fragments */
        off = size;
        len = skb->len - off;
        for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
                struct page *page;
                u32 page_off;

                size = min_t(u32, len, PAGE_SIZE);
                truesize = size;

                page = page_pool_dev_alloc(pool, &page_off, &truesize);
                if (!page) {
                        consume_skb(nskb);
                        return -ENOMEM;
                }

                /* attach the fragment first so that freeing nskb on a copy
                 * error also releases this page
                 */
                skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
                err = skb_copy_bits(skb, off, page_address(page) + page_off,
                                    size);
                if (err) {
                        consume_skb(nskb);
                        return err;
                }

                len -= size;
                off += size;
        }

        consume_skb(skb);
        *pskb = nskb;

        return 0;
#else
        return -EOPNOTSUPP;
#endif
}
EXPORT_SYMBOL(skb_pp_cow_data);

/* XDP flavour of skb_pp_cow_data(): only programs that declare fragment
 * support (prog->aux->xdp_has_frags) are accepted, anything else yields
 * -EINVAL. The copy is made with XDP_PACKET_HEADROOM bytes of headroom.
 */
int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
                         struct bpf_prog *prog)
{
        if (prog->aux->xdp_has_frags)
                return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM);

        return -EINVAL;
}
EXPORT_SYMBOL(skb_cow_data_for_xdp);

#if IS_ENABLED(CONFIG_PAGE_POOL)
/* Return @page to its page_pool if it belongs to one.
 * Returns true when the page was handed to page_pool_put_full_page(),
 * false when the (head) page does not carry the page_pool signature.
 */
bool napi_pp_put_page(struct page *page)
{
        struct page *head = compound_head(page);

        /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
         * in order to preserve any existing bits, such as bit 0 for the
         * head page of compound page and bit 1 for pfmemalloc page, so
         * mask those bits for freeing side when doing below checking,
         * and page_is_pfmemalloc() is checked in __page_pool_put_page()
         * to avoid recycling the pfmemalloc page.
         */
        if (likely(is_pp_page(head))) {
                page_pool_put_full_page(head->pp, head, false);
                return true;
        }

        return false;
}
EXPORT_SYMBOL(napi_pp_put_page);
#endif

/* Try to give the page behind @data back to its page_pool. Only attempted
 * when CONFIG_PAGE_POOL is enabled and @skb has pp_recycle set; returns
 * true when napi_pp_put_page() took the page, false otherwise.
 */
static bool skb_pp_recycle(struct sk_buff *skb, void *data)
{
        if (IS_ENABLED(CONFIG_PAGE_POOL) && skb->pp_recycle)
                return napi_pp_put_page(virt_to_page(data));

        return false;
}

/**
 * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
 * @skb:        page pool aware skb
 *
 * Take one additional reference on every page fragment of @skb. Pages
 * owned by a page pool get a fragment reference (pp_ref_count); because a
 * page pool aware skb may also carry ordinary page fragments, those fall
 * back to a regular page reference. Only valid for skbs with
 * skb->pp_recycle set.
 *
 * Return: 0 on success, -EINVAL if @skb is not page pool aware.
 */
static int skb_pp_frag_ref(struct sk_buff *skb)
{
        struct skb_shared_info *si;
        int idx;

        if (!skb->pp_recycle)
                return -EINVAL;

        si = skb_shinfo(skb);
        for (idx = 0; idx < si->nr_frags; idx++) {
                struct page *hp = compound_head(skb_frag_page(&si->frags[idx]));

                if (unlikely(!is_pp_page(hp))) {
                        page_ref_inc(hp);
                        continue;
                }

                page_pool_ref_page(hp);
        }

        return 0;
}

/* Free a slab-backed head buffer. An @end_offset equal to
 * SKB_SMALL_HEAD_HEADROOM selects the small-head cache; every other
 * value goes to kfree().
 */
static void skb_kfree_head(void *head, unsigned int end_offset)
{
        if (end_offset != SKB_SMALL_HEAD_HEADROOM) {
                kfree(head);
                return;
        }

        kmem_cache_free(net_hotdata.skb_small_head_cache, head);
}

/* Release the head buffer of @skb. Slab-backed heads go through
 * skb_kfree_head(); page fragment heads are first offered to the page
 * pool and freed with skb_free_frag() if that does not take them.
 */
static void skb_free_head(struct sk_buff *skb)
{
        unsigned char *buf = skb->head;

        if (!skb->head_frag) {
                skb_kfree_head(buf, skb_end_offset(skb));
                return;
        }

        if (!skb_pp_recycle(skb, buf))
                skb_free_frag(buf);
}

/* Drop this skb's reference on its data area. When skb_data_unref()
 * reports that the data may be released, the page fragments, the
 * frag_list and the head buffer are freed; @reason is passed on for the
 * frag_list skbs. In every case skb->pp_recycle is cleared at the end.
 */
static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
{
        struct skb_shared_info *shinfo = skb_shinfo(skb);
        int i;

        if (!skb_data_unref(skb, shinfo))
                goto exit;

        if (skb_zcopy(skb)) {
                /* read the flag before skb_zcopy_clear() runs */
                bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;

                skb_zcopy_clear(skb, true);
                /* with SKBFL_MANAGED_FRAG_REFS the per-fragment unref below
                 * is skipped
                 */
                if (skip_unref)
                        goto free_head;
        }

        for (i = 0; i < shinfo->nr_frags; i++)
                __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);

free_head:
        if (shinfo->frag_list)
                kfree_skb_list_reason(shinfo->frag_list, reason);

        skb_free_head(skb);
exit:
        /* When we clone an SKB we copy the recycling bit. The pp_recycle
         * bit is only set on the head though, so in order to avoid races
         * while trying to recycle fragments on __skb_frag_unref() we need
         * to make one SKB responsible for triggering the recycle path.
         * So disable the recycling bit if an SKB is cloned and we have
         * additional references to the fragmented part of the SKB.
         * Eventually the last SKB will have the recycling bit set and its
         * dataref set to 0, which will trigger the recycling
         */
        skb->pp_recycle = 0;
}

/*
 *        Free the memory of an sk_buff head without cleaning its state.
 *
 *        Plain skbs go straight back to the skbuff cache. The two halves
 *        of a fast-clone pair share one allocation, which is returned to
 *        the fclone cache only once the shared reference count allows it.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
        struct sk_buff_fclones *pair;

        if (skb->fclone == SKB_FCLONE_UNAVAILABLE) {
                kmem_cache_free(net_hotdata.skbuff_cache, skb);
                return;
        }

        if (skb->fclone == SKB_FCLONE_ORIG) {
                pair = container_of(skb, struct sk_buff_fclones, skb1);

                /* We usually free the clone (TX completion) before original skb
                 * This test would have no chance to be true for the clone,
                 * while here, branch prediction will be good.
                 * A count of exactly one allows freeing without the
                 * decrement.
                 */
                if (refcount_read(&pair->fclone_ref) != 1 &&
                    !refcount_dec_and_test(&pair->fclone_ref))
                        return;
        } else {
                /* SKB_FCLONE_CLONE */
                pair = container_of(skb, struct sk_buff_fclones, skb2);
                if (!refcount_dec_and_test(&pair->fclone_ref))
                        return;
        }

        kmem_cache_free(net_hotdata.skbuff_fclone_cache, pair);
}

/* Release the state attached to the sk_buff head: the dst entry, the
 * destructor callback (if one is set), the conntrack reference when
 * CONFIG_NF_CONNTRACK is enabled, and the skb extensions.
 */
void skb_release_head_state(struct sk_buff *skb)
{
        skb_dst_drop(skb);
        if (skb->destructor) {
                /* debug builds warn once if this runs in hard IRQ context */
                DEBUG_NET_WARN_ON_ONCE(in_hardirq());
                skb->destructor(skb);
        }
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        nf_conntrack_put(skb_nfct(skb));
#endif
        skb_ext_put(skb);
}

/* Release everything hanging off @skb (head state and, if a head buffer is
 * attached, the data) while keeping the sk_buff shell itself.
 */
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
        skb_release_head_state(skb);

        if (unlikely(!skb->head))
                return;

        skb_release_data(skb, reason);
}

/**
 *        __kfree_skb - private function
 *        @skb: buffer
 *
 *        Free an sk_buff. Release anything attached to the buffer.
 *        Clean the state. This is an internal helper function. Users should
 *        always call kfree_skb
 *
 *        The attached state and data are released with drop reason
 *        SKB_DROP_REASON_NOT_SPECIFIED before the head memory is freed.
 *        The reference count of @skb is not examined here.
 */

void __kfree_skb(struct sk_buff *skb)
{
        skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
        kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

/* Drop one reference on @skb and fire the matching tracepoint.
 * Returns false when skb_unref() reports that the skb must not be freed
 * yet (no tracepoint fires in that case), true when the caller has to
 * free it. SKB_CONSUMED is traced as consume_skb, every other @reason as
 * kfree_skb together with @sk.
 */
static __always_inline
bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
                          enum skb_drop_reason reason)
{
        if (unlikely(!skb_unref(skb)))
                return false;

        /* debug builds warn once on SKB_NOT_DROPPED_YET and on a subsystem
         * field that is out of range
         */
        DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
                               u32_get_bits(reason,
                                            SKB_DROP_REASON_SUBSYS_MASK) >=
                                SKB_DROP_REASON_SUBSYS_NUM);

        if (reason == SKB_CONSUMED)
                trace_consume_skb(skb, __builtin_return_address(0));
        else
                trace_kfree_skb(skb, __builtin_return_address(0), reason, sk);
        return true;
}

/**
 *        sk_skb_reason_drop - free an sk_buff with special reason
 *        @sk: the socket to receive @skb, or NULL if not applicable
 *        @skb: buffer to free
 *        @reason: reason why this skb is dropped
 *
 *        Release one reference to the buffer; once the usage count reaches
 *        zero the buffer is freed. The receiving socket and the drop reason
 *        are handed to the 'kfree_skb' tracepoint.
 */
void __fix_address
sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
{
        /* Other users remain (or there is nothing to free): done. */
        if (!__sk_skb_reason_drop(sk, skb, reason))
                return;

        __kfree_skb(skb);
}
EXPORT_SYMBOL(sk_skb_reason_drop);

/* Number of sk_buff shells collected before they are bulk-freed. */
#define KFREE_SKB_BULK_SIZE        16

/* Batch of sk_buff shells pending a kmem_cache_free_bulk() call. */
struct skb_free_array {
        /* number of valid entries in skb_array */
        unsigned int skb_count;
        /* shells queued for the bulk free */
        void *skb_array[KFREE_SKB_BULK_SIZE];
};

/* Release @skb's state and queue its shell in @sa for a bulk free.
 *
 * The batch is flushed to skbuff_cache as soon as it holds
 * KFREE_SKB_BULK_SIZE entries. skbs that are part of an fclone pair are
 * not batched; they are freed right away through __kfree_skb().
 */
static void kfree_skb_add_bulk(struct sk_buff *skb,
                               struct skb_free_array *sa,
                               enum skb_drop_reason reason)
{
        /* if SKB is a clone, don't handle this case */
        if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) {
                __kfree_skb(skb);
                return;
        }

        skb_release_all(skb, reason);

        sa->skb_array[sa->skb_count] = skb;
        sa->skb_count++;

        /* Still room in the batch: nothing more to do for now. */
        if (likely(sa->skb_count != KFREE_SKB_BULK_SIZE))
                return;

        kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE,
                             sa->skb_array);
        sa->skb_count = 0;
}

/* Drop a reference on every skb of the ->next linked list starting at
 * @segs, tracing each drop with @reason. Shells that became free are
 * batched and released with kmem_cache_free_bulk().
 */
void __fix_address
kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
{
        struct sk_buff *skb, *next;
        struct skb_free_array sa;

        sa.skb_count = 0;

        for (skb = segs; skb; skb = next) {
                /* Fetch the successor first: skb may be gone after this. */
                next = skb->next;

                if (!__sk_skb_reason_drop(NULL, skb, reason))
                        continue;

                skb_poison_list(skb);
                kfree_skb_add_bulk(skb, &sa, reason);
        }

        /* Flush whatever is left in the last, partially filled batch. */
        if (sa.skb_count)
                kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array);
}
EXPORT_SYMBOL(kfree_skb_list_reason);

/* Dump skb information and contents.
 *
 * Must only be called from net_ratelimit()-ed paths.
 *
 * Dumps whole packets if full_pkt, only headers otherwise.
 *
 * @level:    printk level prefix prepended to every line of the dump
 * @skb:      buffer to dump
 * @full_pkt: when true dump skb->len bytes plus headroom, tailroom and the
 *            frag list; otherwise at most MAX_HEADER + 128 bytes of payload
 */
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
{
        struct skb_shared_info *sh = skb_shinfo(skb);
        struct net_device *dev = skb->dev;
        struct sock *sk = skb->sk;
        struct sk_buff *list_skb;
        bool has_mac, has_trans;
        int headroom, tailroom;
        int i, len, seg_len;

        /* len is the number of payload bytes still to be hex-dumped */
        if (full_pkt)
                len = skb->len;
        else
                len = min_t(int, skb->len, MAX_HEADER + 128);

        headroom = skb_headroom(skb);
        tailroom = skb_tailroom(skb);

        has_mac = skb_mac_header_was_set(skb);
        has_trans = skb_transport_header_was_set(skb);

        /* Offsets that depend on an unset header are printed as -1 */
        printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
               "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n"
               "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
               "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
               "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n"
               "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n"
               "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n",
               level, skb->len, headroom, skb_headlen(skb), tailroom,
               has_mac ? skb->mac_header : -1,
               has_mac ? skb_mac_header_len(skb) : -1,
               skb->mac_len,
               skb->network_header,
               has_trans ? skb_network_header_len(skb) : -1,
               has_trans ? skb->transport_header : -1,
               sh->tx_flags, sh->nr_frags,
               sh->gso_size, sh->gso_type, sh->gso_segs,
               skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed,
               skb->csum_complete_sw, skb->csum_valid, skb->csum_level,
               skb->hash, skb->sw_hash, skb->l4_hash,
               ntohs(skb->protocol), skb->pkt_type, skb->skb_iif,
               skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all,
               skb->encapsulation, skb->inner_protocol, skb->inner_mac_header,
               skb->inner_network_header, skb->inner_transport_header);

        if (dev)
                printk("%sdev name=%s feat=%pNF\n",
                       level, dev->name, &dev->features);
        if (sk)
                printk("%ssk family=%hu type=%u proto=%u\n",
                       level, sk->sk_family, sk->sk_type, sk->sk_protocol);

        if (full_pkt && headroom)
                print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
                               16, 1, skb->head, headroom, false);

        /* Linear part, capped by the remaining dump budget */
        seg_len = min_t(int, skb_headlen(skb), len);
        if (seg_len)
                print_hex_dump(level, "skb linear:   ", DUMP_PREFIX_OFFSET,
                               16, 1, skb->data, seg_len, false);
        len -= seg_len;

        if (full_pkt && tailroom)
                print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
                               16, 1, skb_tail_pointer(skb), tailroom, false);

        /* Page frags: map each page, dump it, stop once the budget is spent */
        for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                u32 p_off, p_len, copied;
                struct page *p;
                u8 *vaddr;

                skb_frag_foreach_page(frag, skb_frag_off(frag),
                                      skb_frag_size(frag), p, p_off, p_len,
                                      copied) {
                        seg_len = min_t(int, p_len, len);
                        vaddr = kmap_atomic(p);
                        print_hex_dump(level, "skb frag:     ",
                                       DUMP_PREFIX_OFFSET,
                                       16, 1, vaddr + p_off, seg_len, false);
                        kunmap_atomic(vaddr);
                        len -= seg_len;
                        if (!len)
                                break;
                }
        }

        if (full_pkt && skb_has_frag_list(skb)) {
                /* Use the caller's level here too, like every other line,
                 * so the header is not logged at the default loglevel.
                 */
                printk("%sskb fraglist:\n", level);
                skb_walk_frags(skb, list_skb)
                        skb_dump(level, list_skb, true);
        }
}
EXPORT_SYMBOL(skb_dump);

/**
 *        skb_tx_error - report an sk_buff xmit error
 *        @skb: buffer that triggered an error, may be NULL
 *
 *        Signal a transmit error for @skb in case a device callback is
 *        tracking it. A NULL @skb is ignored.
 *        skb must be freed afterwards.
 */
void skb_tx_error(struct sk_buff *skb)
{
        if (!skb)
                return;

        skb_zcopy_downgrade_managed(skb);
        skb_zcopy_clear(skb, true);
}
EXPORT_SYMBOL(skb_tx_error);

#ifdef CONFIG_TRACEPOINTS
/**
 *        consume_skb - free an skbuff
 *        @skb: buffer to free
 *
 *        Release one reference to the buffer and free it once the usage
 *        count has hit zero. Works like kfree_skb, except that kfree_skb
 *        assumes the frame is being dropped after a failure and records
 *        that, while this function fires the consume_skb tracepoint.
 */
void consume_skb(struct sk_buff *skb)
{
        if (skb_unref(skb)) {
                trace_consume_skb(skb, __builtin_return_address(0));
                __kfree_skb(skb);
        }
}
EXPORT_SYMBOL(consume_skb);
#endif

/**
 *        __consume_stateless_skb - free an skbuff, assuming it is stateless
 *        @skb: buffer to free
 *
 *        Variant of consume_skb() for callers that hold the last reference
 *        to @skb and have already dropped all of its head state: no
 *        reference count is checked and no head state is released, only
 *        the data and the sk_buff memory.
 */
void __consume_stateless_skb(struct sk_buff *skb)
{
        trace_consume_skb(skb, __builtin_return_address(0));

        /* Data first, then the shell. */
        skb_release_data(skb, SKB_CONSUMED);
        kfree_skbmem(skb);
}

/* Park an sk_buff shell in this CPU's napi_alloc_cache.
 *
 * When the cache reaches NAPI_SKB_CACHE_SIZE entries, the upper half is
 * unpoisoned and returned to skbuff_cache in one bulk free.
 */
static void napi_skb_cache_put(struct sk_buff *skb)
{
        struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
        u32 slot;

        /* KASAN refused the object: do not cache it. */
        if (!kasan_mempool_poison_object(skb))
                return;

        local_lock_nested_bh(&napi_alloc_cache.bh_lock);

        nc->skb_cache[nc->skb_count] = skb;
        nc->skb_count++;

        if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
                /* Objects were poisoned when cached; undo that before
                 * giving them back to the slab.
                 */
                slot = NAPI_SKB_CACHE_HALF;
                while (slot < NAPI_SKB_CACHE_SIZE) {
                        kasan_mempool_unpoison_object(nc->skb_cache[slot],
                                                kmem_cache_size(net_hotdata.skbuff_cache));
                        slot++;
                }

                kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_HALF,
                                     &nc->skb_cache[NAPI_SKB_CACHE_HALF]);
                nc->skb_count = NAPI_SKB_CACHE_HALF;
        }

        local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
}

/* Release everything attached to @skb (recording @reason) and recycle the
 * shell through the NAPI skb cache instead of freeing it directly.
 */
void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
{
        skb_release_all(skb, reason);

        napi_skb_cache_put(skb);
}

/* Recycle the shell of an skb whose head was stolen. Only skbs flagged
 * slow_gro need their conntrack, dst, extension and owner state dropped
 * before the shell goes to the NAPI skb cache.
 */
void napi_skb_free_stolen_head(struct sk_buff *skb)
{
        if (likely(!skb->slow_gro)) {
                napi_skb_cache_put(skb);
                return;
        }

        nf_reset_ct(skb);
        skb_dst_drop(skb);
        skb_ext_put(skb);
        skb_orphan(skb);
        skb->slow_gro = 0;

        napi_skb_cache_put(skb);
}

/* Consume @skb from NAPI context, recycling the shell through the NAPI
 * skb cache when possible.
 *
 * @budget: zero means the caller is not in NAPI context (e.g. netpoll);
 *          the skb is then handed to dev_consume_skb_any() instead.
 */
void napi_consume_skb(struct sk_buff *skb, int budget)
{
        /* Zero budget indicate non-NAPI context called us, like netpoll */
        if (unlikely(!budget)) {
                dev_consume_skb_any(skb);
                return;
        }

        DEBUG_NET_WARN_ON_ONCE(!in_softirq());

        if (!skb_unref(skb))
                return;

        /* From here on the SKB is ready to be freed */
        trace_consume_skb(skb, __builtin_return_address(0));

        if (skb->fclone == SKB_FCLONE_UNAVAILABLE) {
                skb_release_all(skb, SKB_CONSUMED);
                napi_skb_cache_put(skb);
        } else {
                /* if SKB is a clone, don't handle this case */
                __kfree_skb(skb);
        }
}
EXPORT_SYMBOL(napi_consume_skb);

/* Make sure a field is contained by headers group.
 *
 * Build-time check that @field sits at the same offset whether it is
 * addressed directly or through the headers group. The expansion is a
 * single BUILD_BUG_ON() invocation with no trailing semicolon and no
 * trailing line continuation, so callers terminate it themselves:
 * CHECK_SKB_FIELD(field);
 */
#define CHECK_SKB_FIELD(field) \
        BUILD_BUG_ON(offsetof(struct sk_buff, field) !=                \
                     offsetof(struct sk_buff, headers.field))

/* Copy the per-packet metadata of @old into @new.
 *
 * Copies tstamp, dev and the control buffer, then the state handled by
 * skb_dst_copy(), __skb_ext_copy() and __nf_copy(), then queue_mapping,
 * and finally the whole headers group with one memcpy(). old->sk is
 * deliberately not copied.
 *
 * The CHECK_SKB_FIELD() lines generate no copies: they are build-time
 * checks that each listed field lies inside the headers group and is
 * therefore covered by that memcpy().
 */
static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
        new->tstamp                = old->tstamp;
        /* We do not copy old->sk */
        new->dev                = old->dev;
        memcpy(new->cb, old->cb, sizeof(old->cb));
        skb_dst_copy(new, old);
        __skb_ext_copy(new, old);
        __nf_copy(new, old, false);

        /* Note : this field could be in the headers group.
         * It is not yet because we do not want to have a 16 bit hole
         */
        new->queue_mapping = old->queue_mapping;

        /* Bulk copy of every field that belongs to the headers group */
        memcpy(&new->headers, &old->headers, sizeof(new->headers));
        CHECK_SKB_FIELD(protocol);
        CHECK_SKB_FIELD(csum);
        CHECK_SKB_FIELD(hash);
        CHECK_SKB_FIELD(priority);
        CHECK_SKB_FIELD(skb_iif);
        CHECK_SKB_FIELD(vlan_proto);
        CHECK_SKB_FIELD(vlan_tci);
        CHECK_SKB_FIELD(transport_header);
        CHECK_SKB_FIELD(network_header);
        CHECK_SKB_FIELD(mac_header);
        CHECK_SKB_FIELD(inner_protocol);
        CHECK_SKB_FIELD(inner_transport_header);
        CHECK_SKB_FIELD(inner_network_header);
        CHECK_SKB_FIELD(inner_mac_header);
        CHECK_SKB_FIELD(mark);
        /* The remaining fields only exist with the matching config option */
#ifdef CONFIG_NETWORK_SECMARK
        CHECK_SKB_FIELD(secmark);
#endif
#ifdef CONFIG_NET_RX_BUSY_POLL
        CHECK_SKB_FIELD(napi_id);
#endif
        CHECK_SKB_FIELD(alloc_cpu);
#ifdef CONFIG_XPS
        CHECK_SKB_FIELD(sender_cpu);
#endif
#ifdef CONFIG_NET_SCHED
        CHECK_SKB_FIELD(tc_index);
#endif

}

/*
 * You should not add any new code to this function.  Add it to
 * __copy_skb_header above instead.
 */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
#define C(x) n->x = skb->x

        n->next = n->prev = NULL;
        n->sk = NULL;
        __copy_skb_header(n, skb);

        C(len);
        C(data_len);
        C(mac_len);
        n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
        n->cloned = 1;
        n->nohdr = 0;
        n->peeked = 0;
        C(pfmemalloc);
        C(pp_recycle);
        n->destructor = NULL;
        C(tail);
        C(end);
        C(head);
        C(head_frag);
        C(data);
        C(truesize);
        refcount_set(&n->users, 1);

        atomic_inc(&(skb_shinfo(skb)->dataref));
        skb->cloned = 1;

        return n;
#undef C
}

/**
 * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
 * @first: first sk_buff of the msg
 *
 * Allocates an skb with GFP_ATOMIC and no linear room, hangs @first off
 * its frag_list and takes len, truesize and the header metadata from
 * @first. The new skb has no destructor.
 *
 * Return: the wrapping skb, or NULL if the allocation failed.
 */
struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
{
        struct sk_buff *wrapper = alloc_skb(0, GFP_ATOMIC);

        if (!wrapper)
                return NULL;

        /* All payload lives in the frag list, none in the linear area */
        wrapper->len = first->len;
        wrapper->data_len = first->len;
        wrapper->truesize = first->truesize;

        skb_shinfo(wrapper)->frag_list = first;

        __copy_skb_header(wrapper, first);
        wrapper->destructor = NULL;

        return wrapper;
}
EXPORT_SYMBOL_GPL(alloc_skb_for_msg);

/**
 *        skb_morph        -        morph one skb into another
 *        @dst: the skb to receive the contents
 *        @src: the skb to supply the contents
 *
 *        Same as skb_clone, except that the caller supplies the target
 *        skb. Whatever @dst currently holds is released first, then @dst
 *        is made a clone of @src.
 *
 *        The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
        /* Empty the target before reusing its shell */
        skb_release_all(dst, SKB_CONSUMED);

        return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);

/* Charge the pages needed for @size bytes to the user's locked_vm count.
 *
 * Nothing is charged when the caller has CAP_IPC_LOCK, when @size is zero
 * or when RLIMIT_MEMLOCK is infinite. The charge is recorded in @mmp so
 * that mm_unaccount_pinned_pages() can undo it; the first charge also
 * takes a reference on the user.
 *
 * Return: 0 on success, -ENOBUFS if the charge would exceed the limit.
 */
int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
{
        unsigned long limit_pg, charge_pg, total_pg, cur_pg, rlim;
        struct user_struct *user;

        if (capable(CAP_IPC_LOCK) || !size)
                return 0;

        rlim = rlimit(RLIMIT_MEMLOCK);
        if (rlim == RLIM_INFINITY)
                return 0;

        charge_pg = (size >> PAGE_SHIFT) + 2;        /* worst case */
        limit_pg = rlim >> PAGE_SHIFT;

        /* Keep charging the user recorded by an earlier call, if any */
        user = mmp->user;
        if (!user)
                user = current_user();

        cur_pg = atomic_long_read(&user->locked_vm);
        for (;;) {
                total_pg = cur_pg + charge_pg;
                if (total_pg > limit_pg)
                        return -ENOBUFS;
                /* On failure cur_pg is refreshed and the check is redone */
                if (atomic_long_try_cmpxchg(&user->locked_vm, &cur_pg, total_pg))
                        break;
        }

        if (mmp->user) {
                mmp->num_pg += charge_pg;
        } else {
                mmp->user = get_uid(user);
                mmp->num_pg = charge_pg;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(mm_account_pinned_pages);

/* Undo the charge recorded in @mmp: subtract its pages from the user's
 * locked_vm count and drop the user reference. Does nothing when no user
 * was recorded.
 */
void mm_unaccount_pinned_pages(struct mmpin *mmp)
{
        if (!mmp->user)
                return;

        atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
        free_uid(mmp->user);
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);

/* Allocate a fresh zerocopy uarg for @size bytes on @sk.
 *
 * The uarg is stored in the control buffer of a zero-length skb obtained
 * from sock_omalloc(). Pinned pages are accounted, the uarg takes the next
 * sk_zckey value as its id, starts with one reference and holds @sk.
 *
 * Return: the embedded ubuf_info, or NULL on allocation or accounting
 * failure.
 */
static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
{
        struct ubuf_info_msgzc *zc;
        struct sk_buff *notif;

        WARN_ON_ONCE(!in_task());

        notif = sock_omalloc(sk, 0, GFP_KERNEL);
        if (!notif)
                return NULL;

        /* The uarg must fit in the skb control buffer it lives in */
        BUILD_BUG_ON(sizeof(*zc) > sizeof(notif->cb));
        zc = (void *)notif->cb;
        zc->mmp.user = NULL;

        if (mm_account_pinned_pages(&zc->mmp, size)) {
                kfree_skb(notif);
                return NULL;
        }

        zc->ubuf.ops = &msg_zerocopy_ubuf_ops;
        zc->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;

        zc->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
        zc->len = 1;
        zc->bytelen = size;
        zc->zerocopy = 1;

        refcount_set(&zc->ubuf.refcnt, 1);
        sock_hold(sk);

        return &zc->ubuf;
}

/* Map a zerocopy uarg back to the skb whose control buffer holds it. */
static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
{
        void *raw = uarg;

        return container_of(raw, struct sk_buff, cb);
}

/* Get a zerocopy uarg covering @size more bytes on @sk.
 *
 * With no @uarg a new one is allocated. Otherwise @uarg is extended in
 * place when it is a MSG_ZEROCOPY uarg, the socket is owned by the caller,
 * the length and byte limits are not exceeded and its id range is directly
 * followed by the next sk_zckey value. If it cannot be extended, a new
 * uarg is allocated instead, except that datagram sockets hitting a limit
 * get NULL.
 *
 * Return: the uarg to use, or NULL on failure.
 */
struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
                                       struct ubuf_info *uarg)
{
        const u32 byte_limit = 1 << 19;                /* limit to a few TSO */
        struct ubuf_info_msgzc *uarg_zc;
        u32 bytelen, next;

        if (!uarg)
                return msg_zerocopy_alloc(sk, size);

        /* there might be non MSG_ZEROCOPY users */
        if (uarg->ops != &msg_zerocopy_ubuf_ops)
                return NULL;

        /* realloc only when socket is locked (TCP, UDP cork),
         * so uarg->len and sk_zckey access is serialized
         */
        if (!sock_owned_by_user(sk)) {
                WARN_ON_ONCE(1);
                return NULL;
        }

        uarg_zc = uarg_to_msgzc(uarg);
        bytelen = uarg_zc->bytelen + size;
        if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) {
                if (sk->sk_type != SOCK_STREAM)
                        return NULL;

                /* TCP can create new skb to attach new uarg */
                return msg_zerocopy_alloc(sk, size);
        }

        /* Not contiguous with the socket's next key: start a new uarg */
        next = (u32)atomic_read(&sk->sk_zckey);
        if ((u32)(uarg_zc->id + uarg_zc->len) != next)
                return msg_zerocopy_alloc(sk, size);

        if (mm_account_pinned_pages(&uarg_zc->mmp, size))
                return NULL;

        uarg_zc->len++;
        uarg_zc->bytelen = bytelen;
        atomic_set(&sk->sk_zckey, ++next);

        /* no extra ref when appending to datagram (MSG_MORE) */
        if (sk->sk_type == SOCK_STREAM)
                net_zcopy_get(uarg);

        return uarg;
}
EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);

/* Try to fold the id range starting at @lo, @len ids long, into the
 * notification already stored in @skb.
 *
 * The merge is done only if the combined range still fits in 32 bits and
 * @lo directly follows the stored upper bound.
 *
 * Return: true if the stored range was extended, false otherwise.
 */
static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
{
        struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
        u32 first = serr->ee.ee_info;
        u32 last = serr->ee.ee_data;
        u64 merged_len = last - first + 1ULL + len;

        if (merged_len >= (1ULL << 32) || lo != last + 1)
                return false;

        serr->ee.ee_data += len;
        return true;
}

/* Final completion of a zerocopy uarg.
 *
 * Undoes the pinned-page accounting and, unless the uarg was aborted
 * (len == 0) or the socket is dead, reports the id range [lo, hi] on the
 * socket error queue: either merged into the notification at the queue
 * tail or queued as a new one using the uarg's own skb. Finally the skb
 * reference (if the skb was not queued) and the socket reference are
 * dropped.
 */
static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg)
{
        struct sk_buff *tail, *skb = skb_from_uarg(uarg);
        struct sock_exterr_skb *serr;
        struct sock *sk = skb->sk;
        struct sk_buff_head *q;
        unsigned long flags;
        bool is_zerocopy;
        u32 lo, hi;
        u16 len;

        mm_unaccount_pinned_pages(&uarg->mmp);

        /* if !len, there was only 1 call, and it was aborted
         * so do not queue a completion notification
         */
        if (!uarg->len || sock_flag(sk, SOCK_DEAD))
                goto release;

        /* NOTE(review): uarg lives in skb->cb (see skb_from_uarg()). If
         * SKB_EXT_ERR() maps onto the same control buffer, the memset below
         * overwrites uarg, which would be why everything still needed from
         * it is copied to locals first - confirm before reordering.
         */
        len = uarg->len;
        lo = uarg->id;
        hi = uarg->id + len - 1;
        is_zerocopy = uarg->zerocopy;

        serr = SKB_EXT_ERR(skb);
        memset(serr, 0, sizeof(*serr));
        serr->ee.ee_errno = 0;
        serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
        serr->ee.ee_data = hi;
        serr->ee.ee_info = lo;
        if (!is_zerocopy)
                serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;

        q = &sk->sk_error_queue;
        spin_lock_irqsave(&q->lock, flags);
        tail = skb_peek_tail(q);
        /* Queue our own skb unless the tail notification absorbed the range */
        if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
            !skb_zerocopy_notify_extend(tail, lo, len)) {
                __skb_queue_tail(q, skb);
                /* ownership moved to the queue; nothing left to free here */
                skb = NULL;
        }
        spin_unlock_irqrestore(&q->lock, flags);

        sk_error_report(sk);

release:
        /* skb is NULL here when it was queued above; this relies on
         * consume_skb() accepting NULL - TODO confirm.
         */
        consume_skb(skb);
        sock_put(sk);
}

/* ubuf_info completion hook for zerocopy uargs.
 *
 * Clears the uarg's zerocopy flag when @success is false, drops one
 * reference and runs the final callback once the last reference is gone.
 * @skb is not used.
 */
static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg,
                                  bool success)
{
        struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);

        uarg_zc->zerocopy &= success;

        if (!refcount_dec_and_test(&uarg->refcnt))
                return;

        __msg_zerocopy_callback(uarg_zc);
}

/* Roll back one zerocopy call on @uarg: the socket's sk_zckey and the
 * uarg's len are both decremented. When @have_uref is set, the caller's
 * reference is released through msg_zerocopy_complete() as well.
 */
void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
        struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);
        struct sock *sk = skb_from_uarg(uarg_zc)->sk;

        atomic_dec(&sk->sk_zckey);
        uarg_zc->len--;

        if (!have_uref)
                return;

        msg_zerocopy_complete(NULL, uarg, true);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);

/* ubuf_info operations used by MSG_ZEROCOPY uargs. Only .complete is set;
 * the address of this table is also what identifies a uarg as MSG_ZEROCOPY
 * in msg_zerocopy_realloc().
 */
const struct ubuf_info_ops msg_zerocopy_ubuf_ops = {
        .complete = msg_zerocopy_complete,
};
EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);

/* Append up to @len bytes from @msg to @skb as zerocopy frags and bind
 * @skb to @uarg.
 *
 * If the uarg's ops provide link_skb, binding is delegated to it;
 * otherwise the skb must not already point to a different uarg, and
 * @uarg is attached after the data was added.
 *
 * Return: number of bytes added to @skb, -EEXIST if @skb already has
 * another uarg, the link_skb error, or the error from
 * __zerocopy_sg_from_iter() (in which case @skb and the iterator are
 * reset to their previous state).
 */
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
                             struct msghdr *msg, int len,
                             struct ubuf_info *uarg)
{
        struct ubuf_info *prev_uarg = skb_zcopy(skb);
        int orig_len = skb->len;
        int err;

        if (!uarg->ops->link_skb) {
                /* An skb can only point to one uarg. This edge case happens
                 * when TCP appends to an skb, but zerocopy_realloc triggered
                 * a new alloc.
                 */
                if (prev_uarg && uarg != prev_uarg)
                        return -EEXIST;
        } else {
                err = uarg->ops->link_skb(skb, uarg);
                if (err)
                        return err;
        }

        err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
        if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
                struct sock *owner = skb->sk;

                /* Streams do not free skb on error. Reset to prev state. */
                iov_iter_revert(&msg->msg_iter, skb->len - orig_len);
                skb->sk = sk;
                ___pskb_trim(skb, orig_len);
                skb->sk = owner;
                return err;
        }

        if (!uarg->ops->link_skb)
                skb_zcopy_set(skb, uarg, NULL);

        return skb->len - orig_len;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);

void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
{
        int i;

        skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                skb_frag_ref(skb, i);
}
EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);

/* Make @nskb share the zerocopy uarg of @orig, if @orig has one.
 *
 * If @nskb already carries a different uarg, its user frags are first
 * copied to kernel memory.
 *
 * Return: 0 on success (including when @orig is not zerocopy), -ENOMEM if
 * @nskb is zerocopy but @gfp_mask is zero, -EIO if copying the frags
 * failed.
 */
static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
                              gfp_t gfp_mask)
{
        if (!skb_zcopy(orig))
                return 0;

        if (skb_zcopy(nskb)) {
                /* !gfp_mask callers are verified to !skb_zcopy(nskb) */
                if (!gfp_mask) {
                        WARN_ON_ONCE(1);
                        return -ENOMEM;
                }

                /* Already sharing the same uarg: nothing to do */
                if (skb_uarg(nskb) == skb_uarg(orig))
                        return 0;

                if (skb_copy_ubufs(nskb, GFP_ATOMIC))
                        return -EIO;
        }

        skb_zcopy_set(nskb, skb_uarg(orig), NULL);
        return 0;
}

/**
 *        skb_copy_ubufs        -        copy userspace skb frags buffers to kernel
 *        @skb: the skb to modify
 *        @gfp_mask: allocation priority
 *
 *        This must be called on skb with SKBFL_ZEROCOPY_ENABLE.
 *        It will copy all frags into kernel and drop the reference
 *        to userspace pages.
 *
 *        If this function is called from an interrupt gfp_mask() must be
 *        %GFP_ATOMIC.
 *
 *        Returns 0 on success, -EINVAL if @skb is shared or cannot be
 *        uncloned, or -ENOMEM on failure to allocate kernel memory to
 *        copy to. On -ENOMEM the frags of @skb are left untouched.
 */
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
        int num_frags = skb_shinfo(skb)->nr_frags;
        struct page *page, *head = NULL;
        int i, order, psize, new_frags;
        u32 d_off;

        if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
                return -EINVAL;

        /* No frags: nothing to copy, only the zerocopy state to clear */
        if (!num_frags)
                goto release;

        /* We might have to allocate high order pages, so compute what minimum
         * page order is needed.
         */
        order = 0;
        while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb))
                order++;
        psize = (PAGE_SIZE << order);

        /* Allocate all destination pages up front, chained through
         * page_private() with head pointing at the most recent one, so a
         * failure can be unwound before the skb is modified.
         */
        new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order);
        for (i = 0; i < new_frags; i++) {
                page = alloc_pages(gfp_mask | __GFP_COMP, order);
                if (!page) {
                        /* Release every page allocated so far */
                        while (head) {
                                struct page *next = (struct page *)page_private(head);
                                put_page(head);
                                head = next;
                        }
                        return -ENOMEM;
                }
                set_page_private(page, (unsigned long)head);
                head = page;
        }

        /* Copy the frag contents back to back into the chained pages.
         * page/d_off track the current destination page and the offset
         * inside it.
         */
        page = head;
        d_off = 0;
        for (i = 0; i < num_frags; i++) {
                skb_frag_t *f = &skb_shinfo(skb)->frags[i];
                u32 p_off, p_len, copied;
                struct page *p;
                u8 *vaddr;

                skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
                                      p, p_off, p_len, copied) {
                        u32 copy, done = 0;
                        vaddr = kmap_atomic(p);

                        while (done < p_len) {
                                /* Destination page full: move to the next one */
                                if (d_off == psize) {
                                        d_off = 0;
                                        page = (struct page *)page_private(page);
                                }
                                copy = min_t(u32, psize - d_off, p_len - done);
                                memcpy(page_address(page) + d_off,
                                       vaddr + p_off + done, copy);
                                done += copy;
                                d_off += copy;
                        }
                        kunmap_atomic(vaddr);
                }
        }

        /* skb frags release userspace buffers */
        for (i = 0; i < num_frags; i++)
                skb_frag_unref(skb, i);

        /* skb frags point to kernel buffers */
        for (i = 0; i < new_frags - 1; i++) {
                __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize);
                head = (struct page *)page_private(head);
        }
        /* The last page is only filled up to d_off */
        __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0,
                               d_off);
        skb_shinfo(skb)->nr_frags = new_frags;

release:
        skb_zcopy_clear(skb, false);
        return 0;
}
EXPORT_SYMBOL_GPL(skb_copy_ubufs);

/**
 *        skb_clone        -        duplicate an sk_buff
 *        @skb: buffer to clone
 *        @gfp_mask: allocation priority
 *
 *        Duplicate an &sk_buff. The new one is not owned by a socket. Both
 *        copies share the same packet data but not structure. The new
 *        buffer has a reference count of 1. %NULL is returned if orphaning
 *        the frags or the allocation fails, otherwise the new buffer is
 *        returned.
 *
 *        If this function is called from an interrupt gfp_mask() must be
 *        %GFP_ATOMIC.
 */
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
        /* Only meaningful when skb->fclone == SKB_FCLONE_ORIG */
        struct sk_buff_fclones *pair = container_of(skb,
                                                    struct sk_buff_fclones,
                                                    skb1);
        struct sk_buff *clone;

        if (skb_orphan_frags(skb, gfp_mask))
                return NULL;

        if (skb->fclone != SKB_FCLONE_ORIG ||
            refcount_read(&pair->fclone_ref) != 1) {
                /* No free companion slot: take a shell from the slab */
                if (skb_pfmemalloc(skb))
                        gfp_mask |= __GFP_MEMALLOC;

                clone = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask);
                if (!clone)
                        return NULL;

                clone->fclone = SKB_FCLONE_UNAVAILABLE;
        } else {
                /* Use the companion skb embedded in the fclone pair */
                clone = &pair->skb2;
                refcount_set(&pair->fclone_ref, 2);
                clone->fclone = SKB_FCLONE_CLONE;
        }

        return __skb_clone(clone, skb);
}
EXPORT_SYMBOL(skb_clone);

/* Shift the header offsets stored in @skb by @off bytes.
 *
 * csum_start is adjusted only for CHECKSUM_PARTIAL, mac_header only when
 * it was set; all other header offsets are always adjusted.
 */
void skb_headers_offset_update(struct sk_buff *skb, int off)
{
        /* {transport,network,mac}_header and tail are relative to skb->head */
        skb->transport_header += off;
        skb->network_header += off;
        if (skb_mac_header_was_set(skb))
                skb->mac_header += off;

        skb->inner_transport_header += off;
        skb->inner_network_header += off;
        skb->inner_mac_header += off;

        /* Only adjust this if it actually is csum_start rather than csum */
        if (skb->ip_summed == CHECKSUM_PARTIAL)
                skb->csum_start += off;
}
EXPORT_SYMBOL(skb_headers_offset_update);

void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
{
        __copy_skb_header(new, old);

        skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
        skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
        skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
EXPORT_SYMBOL(skb_copy_header);

/* Allocation flag for a copy of @skb: SKB_ALLOC_RX if @skb is a
 * pfmemalloc skb, 0 otherwise.
 */
static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
{
        return skb_pfmemalloc(skb) ? SKB_ALLOC_RX : 0;
}

/**
 *        skb_copy        -        create private copy of an sk_buff
 *        @skb: buffer to copy
 *        @gfp_mask: allocation priority
 *
 *        Copy both the &sk_buff and its data, for callers that want to
 *        modify the data and therefore need a private copy of it.
 *        Returns %NULL on failure or the pointer to the buffer
 *        on success. The returned buffer has a reference count of 1.
 *
 *        As a by-product a non-linear &sk_buff is converted to a linear
 *        one, so the returned &sk_buff is completely private and the
 *        caller may modify all of its data. For that reason this function
 *        is not recommended when only the header is going to be modified.
 *        Use pskb_copy() instead.
 */
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
        struct sk_buff *copy;
        unsigned int alloc_len;
        int hroom;

        if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
                return NULL;

        hroom = skb_headroom(skb);
        alloc_len = skb_end_offset(skb) + skb->data_len;

        copy = __alloc_skb(alloc_len, gfp_mask,
                           skb_alloc_rx_flag(skb), NUMA_NO_NODE);
        if (!copy)
                return NULL;

        /* Same headroom as the original, then room for all the data */
        skb_reserve(copy, hroom);
        skb_put(copy, skb->len);

        /* Copy the headroom and the data in one go */
        BUG_ON(skb_copy_bits(skb, -hroom, copy->head, hroom + skb->len));

        skb_copy_header(copy, skb);
        return copy;
}
EXPORT_SYMBOL(skb_copy);

/**
 *        __pskb_copy_fclone        -  create copy of an sk_buff with private head.
 *        @skb: buffer to copy
 *        @headroom: headroom of new skb
 *        @gfp_mask: allocation priority
 *        @fclone: if true allocate the copy of the skb from the fclone
 *        cache instead of the head cache; it is recommended to set this
 *        to true for the cases where the copy will likely be cloned
 *
 *        Make a copy of both an &sk_buff and part of its data, located
 *        in header. Fragmented data remain shared. This is used when
 *        the caller wishes to modify only header of &sk_buff and needs
 *        private copy of the header to alter. Returns %NULL on failure
 *        or the pointer to the buffer on success.
 *        The returned buffer has a reference count of 1.
 */

struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
                                   gfp_t gfp_mask, bool fclone)
{
        struct sk_buff *copy;
        unsigned int alloc_len;
        int alloc_flags;
        int idx;

        /* Only the linear part is duplicated, preceded by @headroom. */
        alloc_len = skb_headlen(skb) + headroom;

        alloc_flags = skb_alloc_rx_flag(skb);
        if (fclone)
                alloc_flags |= SKB_ALLOC_FCLONE;

        copy = __alloc_skb(alloc_len, gfp_mask, alloc_flags, NUMA_NO_NODE);
        if (!copy)
                return NULL;

        /* Position data, size the linear area, then copy the linear bytes. */
        skb_reserve(copy, headroom);
        skb_put(copy, skb_headlen(skb));
        skb_copy_from_linear_data(skb, copy->data, copy->len);

        /* Account for the non-linear data that the copy will share. */
        copy->truesize += skb->data_len;
        copy->data_len = skb->data_len;
        copy->len = skb->len;

        if (skb_shinfo(skb)->nr_frags) {
                if (skb_orphan_frags(skb, gfp_mask) ||
                    skb_zerocopy_clone(copy, skb, gfp_mask)) {
                        kfree_skb(copy);
                        return NULL;
                }

                /* Share every page frag, taking one reference per frag. */
                idx = 0;
                while (idx < skb_shinfo(skb)->nr_frags) {
                        skb_shinfo(copy)->frags[idx] =
                                skb_shinfo(skb)->frags[idx];
                        skb_frag_ref(skb, idx);
                        idx++;
                }
                skb_shinfo(copy)->nr_frags = idx;
        }

        /* The frag list, if any, is shared as well. */
        if (skb_has_frag_list(skb)) {
                skb_shinfo(copy)->frag_list = skb_shinfo(skb)->frag_list;
                skb_clone_fraglist(copy);
        }

        skb_copy_header(copy, skb);
        return copy;
}
EXPORT_SYMBOL(__pskb_copy_fclone);

/**
 *        pskb_expand_head - reallocate header of &sk_buff
 *        @skb: buffer to reallocate
 *        @nhead: room to add at head
 *        @ntail: room to add at tail
 *        @gfp_mask: allocation priority
 *
 *        Expands (or creates identical copy, if @nhead and @ntail are zero)
 *        header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
 *        reference count of 1. Returns zero in the case of success or error,
 *        if expansion failed. In the last case, &sk_buff is not changed.
 *
 *        All the pointers pointing into skb header may change and must be
 *        reloaded after call to this function.
 */

int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
                     gfp_t gfp_mask)
{
        /* osize: end offset of the current head; size: requested end offset
         * of the new head (old size plus the extra head and tail room).
         */
        unsigned int osize = skb_end_offset(skb);
        unsigned int size = osize + nhead + ntail;
        long off;
        u8 *data;
        int i;

        /* A negative @nhead and a shared skb are both treated as fatal. */
        BUG_ON(nhead < 0);

        BUG_ON(skb_shared(skb));

        skb_zcopy_downgrade_managed(skb);

        if (skb_pfmemalloc(skb))
                gfp_mask |= __GFP_MEMALLOC;

        /* size is passed by pointer and re-derived right after, so the
         * allocator presumably may adjust it - TODO confirm against
         * kmalloc_reserve().
         */
        data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
        if (!data)
                goto nodata;
        size = SKB_WITH_OVERHEAD(size);

        /* Copy only real data... and, alas, header. This should be
         * optimized for the cases when header is void.
         */
        memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);

        /* Copy the shared info (only up to the last used frag slot) to its
         * new location at data + size.
         */
        memcpy((struct skb_shared_info *)(data + size),
               skb_shinfo(skb),
               offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));

        /*
         * if shinfo is shared we must drop the old head gracefully, but if it
         * is not we can just drop the old head and let the existing refcount
         * be since all we did is relocate the values
         */
        if (skb_cloned(skb)) {
                if (skb_orphan_frags(skb, gfp_mask))
                        goto nofrags;
                if (skb_zcopy(skb))
                        refcount_inc(&skb_uarg(skb)->refcnt);
                /* Take the references the new copy of shinfo now needs. */
                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                        skb_frag_ref(skb, i);

                if (skb_has_frag_list(skb))
                        skb_clone_fraglist(skb);

                skb_release_data(skb, SKB_CONSUMED);
        } else {
                skb_free_head(skb);
        }
        /* Distance from the old head to where the old contents now live;
         * only the old pointer value is used here, not the memory.
         */
        off = (data + nhead) - skb->head;

        skb->head     = data;
        skb->head_frag = 0;
        skb->data    += off;

        skb_set_end_offset(skb, size);
        /* With NET_SKBUFF_DATA_USES_OFFSET, skb->tail is shifted by nhead
         * rather than by the pointer delta (presumably because tail is then
         * an offset from head - confirm against the sk_buff definition).
         */
#ifdef NET_SKBUFF_DATA_USES_OFFSET
        off           = nhead;
#endif
        skb->tail              += off;
        skb_headers_offset_update(skb, nhead);
        /* The new head is private to this skb: reset the clone state and
         * set the data reference count to 1.
         */
        skb->cloned   = 0;
        skb->hdr_len  = 0;
        skb->nohdr    = 0;
        atomic_set(&skb_shinfo(skb)->dataref, 1);

        skb_metadata_clear(skb);

        /* It is not generally safe to change skb->truesize.
         * For the moment, we really care of rx path, or
         * when skb is orphaned (not attached to a socket).
         */
        if (!skb->sk || skb->destructor == sock_edemux)
                skb->truesize += size - osize;

        return 0;

        /* Error paths: the skb still uses its old head; only the freshly
         * allocated buffer (if any) is released.
         */
nofrags:
        skb_kfree_head(data, size);
nodata:
        return -ENOMEM;
}
EXPORT_SYMBOL(pskb_expand_head);

/* Make private copy of skb with writable head and some headroom */

struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
{
        int delta = headroom - skb_headroom(skb);
        struct sk_buff *copy;

        /* Enough headroom already: a copy with a private head suffices. */
        if (delta <= 0)
                return pskb_copy(skb, GFP_ATOMIC);

        /* Otherwise clone and grow the clone's head by the aligned deficit. */
        copy = skb_clone(skb, GFP_ATOMIC);
        if (!copy)
                return NULL;

        if (pskb_expand_head(copy, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
                kfree_skb(copy);
                return NULL;
        }

        return copy;
}
EXPORT_SYMBOL(skb_realloc_headroom);

/* Note: We plan to rework this in linux-6.4 */
int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
{
        unsigned int old_end = skb_end_offset(skb);
        unsigned int old_truesize = skb->truesize;
        struct skb_shared_info *info;
        unsigned int new_end;
        int err;

        err = pskb_expand_head(skb, 0, 0, pri);
        if (err)
                return err;

        /* Restore the truesize recorded before the head was reallocated. */
        skb->truesize = old_truesize;

        new_end = skb_end_offset(skb);
        if (likely(new_end == old_end))
                return 0;

        /* skb->end must not be changed when either the old or the new value
         * is SKB_SMALL_HEAD_HEADROOM, as that might break skb_kfree_head().
         * This path is not expected to be taken; trace it if it ever is.
         */
        if (old_end == SKB_SMALL_HEAD_HEADROOM ||
            new_end == SKB_SMALL_HEAD_HEADROOM) {
                pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n",
                            old_end, new_end);
                WARN_ON_ONCE(1);
                return 0;
        }

        /* Put skb->end back to its old value: relocate the shared info to
         * the matching position first, then update the end offset.
         */
        info = skb_shinfo(skb);
        memmove(skb->head + old_end, info,
                offsetof(struct skb_shared_info, frags[info->nr_frags]));
        skb_set_end_offset(skb, old_end);

        return 0;
}

/**
 *        skb_expand_head - reallocate header of &sk_buff
 *        @skb: buffer to reallocate
 *        @headroom: needed headroom
 *
 *        Unlike skb_realloc_headroom, this one does not allocate a new skb
 *        if possible; copies skb->sk to new skb as needed
 *        and frees original skb in case of failures.
 *
 *        It expects an increase in headroom and generates a warning otherwise.
 */

struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
{
        struct sock *owner = skb->sk;
        int old_end = skb_end_offset(skb);
        int grow = headroom - skb_headroom(skb);

        /* Callers must ask for more headroom than is already there. */
        if (WARN_ONCE(grow <= 0,
                      "%s is expecting an increase in the headroom", __func__))
                return skb;

        grow = SKB_DATA_ALIGN(grow);

        /* pskb_expand_head() might crash if the skb is shared, so work on a
         * clone in that case (and also when the skb is not a wmem skb).
         */
        if (skb_shared(skb) || !is_skb_wmem(skb)) {
                struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);

                if (unlikely(!clone))
                        goto fail;

                /* Carry the socket over to the clone before dropping the
                 * original.
                 */
                if (owner)
                        skb_set_owner_w(clone, owner);
                consume_skb(skb);
                skb = clone;
        }

        if (pskb_expand_head(skb, grow, 0, GFP_ATOMIC))
                goto fail;

        if (!owner || !is_skb_wmem(skb))
                return skb;

        /* Charge the growth of the head to the socket's write memory. */
        grow = skb_end_offset(skb) - old_end;
        refcount_add(grow, &owner->sk_wmem_alloc);
        skb->truesize += grow;
        return skb;

fail:
        kfree_skb(skb);
        return NULL;
}
EXPORT_SYMBOL(skb_expand_head);

/**
 *        skb_copy_expand        -        copy and expand sk_buff
 *        @skb: buffer to copy
 *        @newheadroom: new free bytes at head
 *        @newtailroom: new free bytes at tail
 *        @gfp_mask: allocation priority
 *
 *        Make a copy of both an &sk_buff and its data and while doing so
 *        allocate additional space.
 *
 *        This is used when the caller wishes to modify the data and needs a
 *        private copy of the data to alter as well as more space for new fields.
 *        Returns %NULL on failure or the pointer to the buffer
 *        on success. The returned buffer has a reference count of 1.
 *
 *        You must pass %GFP_ATOMIC as the allocation priority if this function
 *        is called from an interrupt.
 */
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
                                int newheadroom, int newtailroom,
                                gfp_t gfp_mask)
{
        struct sk_buff *copy;
        int old_hroom;
        int keep_len;
        int dst_off;

        /* Refuse (warning once) any skb whose gso_type carries
         * SKB_GSO_FRAGLIST.
         */
        if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
                return NULL;

        old_hroom = skb_headroom(skb);

        /* Allocate room for the new headroom, all the data and the new
         * tailroom.
         */
        copy = __alloc_skb(newheadroom + skb->len + newtailroom,
                           gfp_mask, skb_alloc_rx_flag(skb),
                           NUMA_NO_NODE);
        if (!copy)
                return NULL;

        skb_reserve(copy, newheadroom);

        /* Claim skb->len bytes of data in the new buffer. */
        skb_put(copy, skb->len);

        /* Decide how much of the old headroom is carried over and where it
         * lands: at most newheadroom bytes, placed directly in front of the
         * data in the new buffer.
         */
        if (newheadroom <= old_hroom) {
                keep_len = newheadroom;
                dst_off = 0;
        } else {
                keep_len = old_hroom;
                dst_off = newheadroom - old_hroom;
        }

        /* Copy the retained headroom plus the data; failure is fatal. */
        BUG_ON(skb_copy_bits(skb, -keep_len, copy->head + dst_off,
                             skb->len + keep_len));

        skb_copy_header(copy, skb);

        /* Shift the header offsets by the change in headroom. */
        skb_headers_offset_update(copy, newheadroom - old_hroom);

        return copy;
}
EXPORT_SYMBOL(skb_copy_expand);

/**
 *        __skb_pad                -        zero pad the tail of an skb
 *        @skb: buffer to pad
 *        @pad: space to pad
 *        @free_on_error: free buffer on error
 *
 *        Ensure that a buffer is followed by a padding area that is zero
 *        filled. Used by network drivers which may DMA or transfer data
 *        beyond the buffer end onto the wire.
 *
 *        May return error in out of memory cases. The skb is freed on error
 *        if @free_on_error is true.
 */

int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
{
        int grow;
        int ret = 0;

        /* Fast path: private buffer with enough tailroom already.
         * (A non-linear skb always reports zero tailroom.)
         */
        if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
                memset(skb->data + skb->len, 0, pad);
                return 0;
        }

        /* Extra tail space needed to hold the paged data plus the padding. */
        grow = skb->data_len + pad - (skb->end - skb->tail);
        if (likely(skb_cloned(skb) || grow > 0))
                ret = pskb_expand_head(skb, 0, grow, GFP_ATOMIC);

        /* FIXME: The use of this function with non-linear skb's really needs
         * to be audited.
         */
        if (likely(!ret))
                ret = skb_linearize(skb);

        if (unlikely(ret)) {
                if (free_on_error)
                        kfree_skb(skb);
                return ret;
        }

        memset(skb->data + skb->len, 0, pad);
        return 0;
}
EXPORT_SYMBOL(__skb_pad);

/**
 *        pskb_put - add data to the tail of a potentially fragmented buffer
 *        @skb: start of the buffer to use
 *        @tail: tail fragment of the buffer to use
 *        @len: amount of data to add
 *
 *        This function extends the used data area of the potentially
 *        fragmented buffer. @tail must be the last fragment of @skb -- or
 *        @skb itself. If this would exceed the total buffer size the kernel
 *        will panic. A pointer to the first byte of the extra data is
 *        returned.
 */

void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
{
        /* The head buffer is its own tail: skb_put() does all accounting. */
        if (tail == skb)
                return skb_put(tail, len);

        /* Separate tail buffer: the head's totals grow as well. */
        skb->data_len += len;
        skb->len += len;

        return skb_put(tail, len);
}
EXPORT_SYMBOL_GPL(pskb_put);

/**
 *        skb_put - add data to a buffer
 *        @skb: buffer to use
 *        @len: amount of data to add
 *
 *        This function extends the used data area of the buffer. If this would
 *        exceed the total buffer size the kernel will panic. A pointer to the
 *        first byte of the extra data is returned.
 */
void *skb_put(struct sk_buff *skb, unsigned int len)
{
        /* The caller gets the tail position from before the extension. */
        void *old_tail = skb_tail_pointer(skb);

        SKB_LINEAR_ASSERT(skb);

        skb->tail += len;
        skb->len += len;

        /* Running past the end of the buffer is reported via
         * skb_over_panic(), together with our caller's address.
         */
        if (unlikely(skb->tail > skb->end))
                skb_over_panic(skb, len, __builtin_return_address(0));

        return old_tail;
}
EXPORT_SYMBOL(skb_put);

/**
 *        skb_push - add data to the start of a buffer
 *        @skb: buffer to use
 *        @len: amount of data to add
 *
 *        This function extends the used data area of the buffer at the buffer
 *        start. If this would exceed the total buffer headroom the kernel will
 *        panic. A pointer to the first byte of the extra data is returned.
 */
void *skb_push(struct sk_buff *skb, unsigned int len)
{
        /* Grow the data area towards the head by @len bytes. */
        skb->data -= len;
        skb->len += len;

        /* Moving data in front of the head is reported via
         * skb_under_panic(), together with our caller's address.
         */
        if (unlikely(skb->data < skb->head))
                skb_under_panic(skb, len, __builtin_return_address(0));

        return skb->data;
}
EXPORT_SYMBOL(skb_push);

/**
 *        skb_pull - remove data from the start of a buffer
 *        @skb: buffer to use
 *        @len: amount of data to remove
 *
 *        This function removes data from the start of a buffer, returning
 *        the memory to the headroom. A pointer to the next data in the buffer
 *        is returned. Once the data has been pulled future pushes will overwrite
 *        the old data.
 */
void *skb_pull(struct sk_buff *skb, unsigned int len)
{
        /* Out-of-line wrapper: all the work, including computing the
         * return value, is done by skb_pull_inline().
         */
        return skb_pull_inline(skb, len);
}
EXPORT_SYMBOL(skb_pull);

/**
 *        skb_pull_data - remove data from the start of a buffer returning its
 *        original position.
 *        @skb: buffer to use
 *        @len: amount of data to remove
 *
 *        This function removes data from the start of a buffer, returning
 *        the memory to the headroom. A pointer to the original data in the buffer
 *        is returned after checking if there is enough data to pull. Once the
 *        data has been pulled future pushes will overwrite the old data.
 */
void *skb_pull_data(struct sk_buff *skb, size_t len)
{
        void *start;

        /* Not enough data to pull: leave the skb untouched. */
        if (len > skb->len)
                return NULL;

        /* Remember where the data began before it is pulled. */
        start = skb->data;
        skb_pull(skb, len);

        return start;
}
EXPORT_SYMBOL(skb_pull_data);

/**
 *        skb_trim - remove end from a buffer
 *        @skb: buffer to alter
 *        @len: new length
 *
 *        Cut the length of a buffer down by removing data from the tail. If
 *        the buffer is already under the length specified it is not modified.
 *        The skb must be linear.
 */
void skb_trim(struct sk_buff *skb, unsigned int len)
{
        /* Already at or below the requested length: nothing to cut. */
        if (len >= skb->len)
                return;

        __skb_trim(skb, len);
}
EXPORT_SYMBOL(skb_trim);

/* Trims skb to length len. It can change skb pointers.
 */

int ___pskb_trim(struct sk_buff *skb, unsigned int len)
{
        struct sk_buff **fragp;
        struct sk_buff *frag;
        /* Running start offset of the segment being examined; begins just
         * past the linear part.
         */
        int offset = skb_headlen(skb);
        int nfrags = skb_shinfo(skb)->nr_frags;
        int i;
        int err;

        /* A cloned skb first gets a private head (nhead == ntail == 0),
         * since the shared info is modified below.
         */
        if (skb_cloned(skb) &&
            unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
                return err;

        i = 0;
        /* The new length fits in the linear part: jump into the loop body
         * with i == 0 so that all page frags and the frag list are dropped.
         */
        if (offset >= len)
                goto drop_pages;

        for (; i < nfrags; i++) {
                int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);

                /* This frag ends before the cut point: keep it whole. */
                if (end < len) {
                        offset = end;
                        continue;
                }

                /* The cut falls in (or at the end of) this frag: shrink it
                 * and keep it; i now indexes the first frag to drop.
                 */
                skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);

drop_pages:
                skb_shinfo(skb)->nr_frags = i;

                /* Release every page frag past the cut. */
                for (; i < nfrags; i++)
                        skb_frag_unref(skb, i);

                if (skb_has_frag_list(skb))
                        skb_drop_fraglist(skb);
                goto done;
        }

        /* All page frags end before the cut point: continue the search in
         * the frag list.
         */
        for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
             fragp = &frag->next) {
                int end = offset + frag->len;

                /* A shared list member is replaced in the list by a clone,
                 * and the reference to the original is dropped.
                 */
                if (skb_shared(frag)) {
                        struct sk_buff *nfrag;

                        nfrag = skb_clone(frag, GFP_ATOMIC);
                        if (unlikely(!nfrag))
                                return -ENOMEM;

                        nfrag->next = frag->next;
                        consume_skb(frag);
                        frag = nfrag;
                        *fragp = frag;
                }

                if (end < len) {
                        offset = end;
                        continue;
                }

                /* The cut falls inside this member: trim it to the part
                 * that is kept.
                 */
                if (end > len &&
                    unlikely((err = pskb_trim(frag, len - offset))))
                        return err;

                /* Everything after this member is dropped. */
                if (frag->next)
                        skb_drop_list(&frag->next);
                break;
        }

done:
        if (len > skb_headlen(skb)) {
                /* Some non-linear data remains: shrink data_len by the
                 * number of bytes removed.
                 */
                skb->data_len -= skb->len - len;
                skb->len       = len;
        } else {
                /* Only linear data remains: the tail moves to len. */
                skb->len       = len;
                skb->data_len  = 0;
                skb_set_tail_pointer(skb, len);
        }

        /* Condense only when the skb has no socket or its destructor is
         * sock_edemux.
         */
        if (!skb->sk || skb->destructor == sock_edemux)
                skb_condense(skb);
        return 0;
}
EXPORT_SYMBOL(___pskb_trim);

/* Note : use pskb_trim_rcsum() instead of calling this directly
 */
int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE) {
                /* Remove the checksum of the bytes being cut off (those
                 * from @len to the current end) from skb->csum.
                 */
                int cut = skb->len - len;

                skb->csum = csum_block_sub(skb->csum,
                                           skb_checksum(skb, len, cut, 0),
                                           len);
                return __pskb_trim(skb, len);
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                unsigned int linear = skb_headlen(skb);
                int hdlen = len;
                int csum_pos = skb_checksum_start_offset(skb) +
                               skb->csum_offset;

                /* Linear length after the trim: the smaller of @len and
                 * the current head length.
                 */
                if (len > linear)
                        hdlen = linear;

                /* The checksum field must lie within that linear part. */
                if (csum_pos + sizeof(__sum16) > hdlen)
                        return -EINVAL;
        }

        return __pskb_trim(skb, len);
}
EXPORT_SYMBOL(pskb_trim_rcsum_slow);

/**
 *        __pskb_pull_tail - advance tail of skb header
 *        @skb: buffer to reallocate
 *        @delta: number of bytes to advance tail
 *
 *        The function makes sense only on a fragmented &sk_buff,
 *        it expands header moving its tail forward and copying necessary
 *        data from fragmented part.
 *
 *        &sk_buff MUST have reference count of 1.
 *
 *        Returns %NULL (and &sk_buff does not change) if pull failed
 *        or value of new tail of skb in the case of success.
 *
 *        All the pointers pointing into skb header may change and must be
 *        reloaded after call to this function.
 */

/* Moves tail of skb head forward, copying data from fragmented part,
 * when it is necessary.
 * 1. It may fail due to malloc failure.
 * 2. It may change skb pointers.
 *
 * It is pretty complicated. Luckily, it is called only in exceptional cases.
 */
void *__pskb_pull_tail(struct sk_buff *skb, int delta)
{
        /* If skb has not enough free space at tail, get new one
         * plus 128 bytes for future expansions. If we have enough
         * room at tail, reallocate without expansion only if skb is cloned.
         *
         * eat starts out as the number of bytes by which tail + delta
         * would run past the end of the head; it is reused below as a
         * "bytes still to consume" counter.
         */
        int i, k, eat = (skb->tail + delta) - skb->end;

        if (eat > 0 || skb_cloned(skb)) {
                if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
                                     GFP_ATOMIC))
                        return NULL;
        }

        /* Copy the first delta bytes that follow the linear part to the
         * current tail of the head; failure here is treated as a fatal bug.
         */
        BUG_ON(skb_copy_bits(skb, skb_headlen(skb),
                             skb_tail_pointer(skb), delta));

        /* Optimization: no fragments, no reasons to preestimate
         * size of pulled pages. Superb.
         */
        if (!skb_has_frag_list(skb))
                goto pull_pages;

        /* Estimate size of pulled pages. If the page frags alone cover
         * delta, the frag list does not need to be touched.
         */
        eat = delta;
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

                if (size >= eat)
                        goto pull_pages;
                eat -= size;
        }

        /* If we need to update the frag list, we are in trouble.
         * Certainly, it is possible to add an offset to skb data,
         * but taking into account that pulling is expected to
         * be very rare operation, it is worth to fight against
         * further bloating skb head and crucify ourselves here instead.
         * Pure masochism, indeed. 8)8)
         */
        if (eat) {
                struct sk_buff *list = skb_shinfo(skb)->frag_list;
                struct sk_buff *clone = NULL;
                /* insp: first list member that stays in the list after the
                 * pull.
                 */
                struct sk_buff *insp = NULL;

                do {
                        if (list->len <= eat) {
                                /* Eaten as whole. */
                                eat -= list->len;
                                list = list->next;
                                insp = list;
                        } else {
                                /* Eaten partially. */
                                if (skb_is_gso(skb) && !list->head_frag &&
                                    skb_headlen(list))
                                        skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;

                                if (skb_shared(list)) {
                                        /* Sucks! We need to fork list. :-( */
                                        clone = skb_clone(list, GFP_ATOMIC);
                                        if (!clone)
                                                return NULL;
                                        /* The clone replaces this member,
                                         * so the surviving part of the old
                                         * list starts at the next one.
                                         */
                                        insp = list->next;
                                        list = clone;
                                } else {
                                        /* This may be pulled without
                                         * problems. */
                                        insp = list;
                                }
                                if (!pskb_pull(list, eat)) {
                                        kfree_skb(clone);
                                        return NULL;
                                }
                                break;
                        }
                } while (eat);

                /* Free pulled out fragments. */
                while ((list = skb_shinfo(skb)->frag_list) != insp) {
                        skb_shinfo(skb)->frag_list = list->next;
                        consume_skb(list);
                }
                /* And insert new clone at head. */
                if (clone) {
                        clone->next = list;
                        skb_shinfo(skb)->frag_list = clone;
                }
        }
        /* Success! Now we may commit changes to skb data. */

pull_pages:
        /* Compact the page frag array: frags that are entirely consumed are
         * released, the rest are moved down to index k.
         */
        eat = delta;
        k = 0;
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

                if (size <= eat) {
                        skb_frag_unref(skb, i);
                        eat -= size;
                } else {
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[k];

                        *frag = skb_shinfo(skb)->frags[i];
                        if (eat) {
                                /* Partially consumed frag: skip the bytes
                                 * that were copied into the head.
                                 */
                                skb_frag_off_add(frag, eat);
                                skb_frag_size_sub(frag, eat);
                                /* The cut is inside frag 0: no frag was
                                 * dropped or moved, so nr_frags is already
                                 * correct.
                                 */
                                if (!i)
                                        goto end;
                                eat = 0;
                        }
                        k++;
                }
        }
        skb_shinfo(skb)->nr_frags = k;

end:
        /* delta bytes moved from the non-linear part into the head. */
        skb->tail     += delta;
        skb->data_len -= delta;

        if (!skb->data_len)
                skb_zcopy_clear(skb, false);

        return skb_tail_pointer(skb);
}
EXPORT_SYMBOL(__pskb_pull_tail);

/**
 *        skb_copy_bits - copy bits from skb to kernel buffer
 *        @skb: source skb
 *        @offset: offset in source
 *        @to: destination buffer
 *        @len: number of bytes to copy
 *
 *        Copy the specified number of bytes from the source skb to the
 *        destination buffer.
 *
 *        CAUTION ! :
 *                If its prototype is ever changed,
 *                check arch/{*}/net/{*}.S files,
 *                since it is called from BPF assembly code.
 */
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
{
        /* start: offset within the skb data at which the segment currently
         * being examined begins; initially the end of the linear part.
         */
        int start = skb_headlen(skb);
        struct sk_buff *frag_iter;
        int i, copy;

        /* The requested range must not extend past skb->len. */
        if (offset > (int)skb->len - len)
                goto fault;

        /* Copy header. copy is the number of linear bytes at or after
         * @offset, capped to @len.
         */
        if ((copy = start - offset) > 0) {
                if (copy > len)
                        copy = len;
                skb_copy_from_linear_data_offset(skb, offset, to, copy);
                if ((len -= copy) == 0)
                        return 0;
                offset += copy;
                to     += copy;
        }

        /* Then the page frags, each covering [start, end). */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;
                skb_frag_t *f = &skb_shinfo(skb)->frags[i];

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(f);
                if ((copy = end - offset) > 0) {
                        u32 p_off, p_len, copied;
                        struct page *p;
                        u8 *vaddr;

                        if (copy > len)
                                copy = len;

                        /* Copy the wanted range one page at a time, mapping
                         * each page only for the duration of its memcpy.
                         */
                        skb_frag_foreach_page(f,
                                              skb_frag_off(f) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                memcpy(to + copied, vaddr + p_off, p_len);
                                kunmap_atomic(vaddr);
                        }

                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                        to     += copy;
                }
                start = end;
        }

        /* Finally the frag list: recurse into each member, with the offset
         * made relative to that member.
         */
        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
                        if (skb_copy_bits(frag_iter, offset - start, to, copy))
                                goto fault;
                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                        to     += copy;
                }
                start = end;
        }

        /* Every segment was visited: succeed only if nothing is left. */
        if (!len)
                return 0;

fault:
        return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_bits);

/*
 * Callback from splice_to_pipe(), if we need to release some pages
 * at the end of the spd in case we error'ed out in filling the pipe.
 */
static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
{
        /* Drop one reference on the page stored in slot @i of @spd. */
        put_page(spd->pages[i]);
}

static struct page *linear_to_page(struct page *page, unsigned int *len,
                                   unsigned int *offset,
                                   struct sock *sk)
{
        struct page_frag *frag = sk_page_frag(sk);
        const void *src;
        void *dst;

        if (!sk_page_frag_refill(sk, frag))
                return NULL;

        /* Never copy more than what is left in the socket's page frag. */
        *len = min_t(unsigned int, *len, frag->size - frag->offset);

        /* Copy *len bytes from @page at *offset into the page frag. */
        dst = page_address(frag->page) + frag->offset;
        src = page_address(page) + *offset;
        memcpy(dst, src, *len);

        /* Report where the copy landed and consume that part of the frag. */
        *offset = frag->offset;
        frag->offset += *len;

        return frag->page;
}

static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
                             struct page *page,
                             unsigned int offset)
{
        /* Index of the most recently filled slot; it is only used once
         * nr_pages has been checked to be non-zero.
         */
        int last = spd->nr_pages - 1;

        /* Coalescing is possible when the last slot refers to the same
         * page and ends exactly where the new range begins.
         */
        return spd->nr_pages &&
               spd->pages[last] == page &&
               (spd->partial[last].offset + spd->partial[last].len == offset);
}

/*
 * Fill page/offset/length into spd, if it can hold more pages.
 */
static bool spd_fill_page(struct splice_pipe_desc *spd,
                          struct pipe_inode_info *pipe, struct page *page,
                          unsigned int *len, unsigned int offset,
                          bool linear,
                          struct sock *sk)
{
        int slot;

        /* The descriptor is full: tell the caller to stop. */
        if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
                return true;

        /* Linear data is first copied to a page obtained through
         * linear_to_page(), which may also shrink *len and rewrite offset.
         */
        if (linear) {
                page = linear_to_page(page, len, &offset, sk);
                if (!page)
                        return true;
        }

        if (!spd_can_coalesce(spd, page, offset)) {
                /* Use a new slot, holding a page reference for it. */
                slot = spd->nr_pages;
                get_page(page);
                spd->pages[slot] = page;
                spd->partial[slot].len = *len;
                spd->partial[slot].offset = offset;
                spd->nr_pages++;
        } else {
                /* Contiguous with the previous slot: just extend it. */
                spd->partial[spd->nr_pages - 1].len += *len;
        }

        return false;
}

static bool __splice_segment(struct page *page, unsigned int poff,
                             unsigned int plen, unsigned int *off,
                             unsigned int *len,
                             struct splice_pipe_desc *spd, bool linear,
                             struct sock *sk,
                             struct pipe_inode_info *pipe)
{
        unsigned int chunk;

        /* Nothing left to splice. */
        if (*len == 0)
                return true;

        /* The whole segment lies before the requested start: consume it
         * from *off and move on to the next segment.
         */
        if (plen <= *off) {
                *off -= plen;
                return false;
        }

        /* Skip the leading *off bytes of this segment. */
        poff += *off;
        plen -= *off;
        *off = 0;

        for (;;) {
                chunk = min(*len, plen);

                /* spd_fill_page() may reduce chunk to what it accepted. */
                if (spd_fill_page(spd, pipe, page, &chunk, poff,
                                  linear, sk))
                        return true;

                poff += chunk;
                plen -= chunk;
                *len -= chunk;

                if (*len == 0 || plen == 0)
                        break;
        }

        return false;
}

/*
 * Describe the skb's data in spd: first the linear area, then the page
 * fragments, then (recursively) every skb on the frag list. Returns true
 * as soon as one of the segments reports that no more data should be
 * added (output full or requested length reached), false otherwise.
 */
static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
                              unsigned int *offset, unsigned int *len,
                              struct splice_pipe_desc *spd, struct sock *sk)
{
        struct sk_buff *frag_skb;
        int i;

        /*
         * Linear part. When skb->head_frag is set the linear area lives in a
         * page fragment; if that head is not shared with any clone we own it
         * and the copy can be avoided. skb_head_is_locked() decides which of
         * the two cases applies.
         */
        if (__splice_segment(virt_to_page(skb->data),
                             (unsigned long) skb->data & (PAGE_SIZE - 1),
                             skb_headlen(skb),
                             offset, len, spd,
                             skb_head_is_locked(skb),
                             sk, pipe))
                return true;

        /* Page fragments. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

                if (__splice_segment(skb_frag_page(frag),
                                     skb_frag_off(frag), skb_frag_size(frag),
                                     offset, len, spd, false, sk, pipe))
                        return true;
        }

        /* Frag list. */
        skb_walk_frags(skb, frag_skb) {
                if (*offset < frag_skb->len) {
                        /* A true return means the output has no room left,
                         * so there is no point in visiting the remaining
                         * frag_list members.
                         */
                        if (__skb_splice_bits(frag_skb, pipe, offset, len,
                                              spd, sk))
                                return true;
                } else {
                        /* Entirely before the start offset: skip it. */
                        *offset -= frag_skb->len;
                }
        }

        return false;
}

/*
 * Splice up to @tlen bytes of @skb, starting at @offset, into @pipe. The
 * linear area, the page fragments and the frag list are all covered by
 * __skb_splice_bits(). Returns 0 when no page was collected, otherwise
 * whatever splice_to_pipe() returns. @flags is not used here.
 */
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
                    struct pipe_inode_info *pipe, unsigned int tlen,
                    unsigned int flags)
{
        struct page *page_vec[MAX_SKB_FRAGS];
        struct partial_page part_vec[MAX_SKB_FRAGS];
        struct splice_pipe_desc spd = {
                .pages = page_vec,
                .partial = part_vec,
                .nr_pages_max = MAX_SKB_FRAGS,
                .ops = &nosteal_pipe_buf_ops,
                .spd_release = sock_spd_release,
        };

        /* The boolean result is irrelevant here; only spd.nr_pages counts. */
        __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);

        if (!spd.nr_pages)
                return 0;

        return splice_to_pipe(pipe, &spd);
}
EXPORT_SYMBOL_GPL(skb_splice_bits);

/*
 * Send @msg on @sk through the socket's sendmsg_locked operation, falling
 * back to sock_no_sendmsg_locked() when the protocol does not provide one.
 * Returns -EINVAL if @sk has no struct socket attached.
 */
static int sendmsg_locked(struct sock *sk, struct msghdr *msg)
{
        struct socket *sock = sk->sk_socket;
        size_t remaining;

        if (!sock)
                return -EINVAL;

        remaining = msg_data_left(msg);

        if (sock->ops->sendmsg_locked)
                return sock->ops->sendmsg_locked(sk, msg, remaining);

        return sock_no_sendmsg_locked(sk, msg, remaining);
}

/*
 * Send @msg on @sk via sock_sendmsg(). Returns -EINVAL if @sk has no
 * struct socket attached.
 */
static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg)
{
        struct socket *sock = sk->sk_socket;

        return sock ? sock_sendmsg(sock, msg) : -EINVAL;
}

/* Signature shared by sendmsg_locked() and sendmsg_unlocked(). */
typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);
/*
 * Send @len bytes of @skb, starting @offset bytes into it, on @sk using
 * @sendmsg. The linear area is sent first, then the page fragments, then
 * the skbs hanging off the frag list. All sends use MSG_DONTWAIT.
 *
 * Returns the number of bytes sent. If the very first send fails (or
 * returns 0) its return value is passed back unchanged; if a later send
 * fails, the number of bytes sent so far is returned instead.
 */
static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
                           int len, sendmsg_func sendmsg)
{
        /* Requested length, kept to compute how much was actually sent. */
        unsigned int orig_len = len;
        /* Top-level skb, used to tell it apart from frag-list members. */
        struct sk_buff *head = skb;
        unsigned short fragidx;
        int slen, ret;

        /* Entered once for the head skb and once per skb on its frag list. */
do_frag_list:

        /* Deal with head data */
        while (offset < skb_headlen(skb) && len) {
                struct kvec kv;
                struct msghdr msg;

                /* Send at most what is left of the linear area. */
                slen = min_t(int, len, skb_headlen(skb) - offset);
                kv.iov_base = skb->data + offset;
                kv.iov_len = slen;
                memset(&msg, 0, sizeof(msg));
                msg.msg_flags = MSG_DONTWAIT;

                iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen);
                ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
                                      sendmsg_unlocked, sk, &msg);
                if (ret <= 0)
                        goto error;

                /* A short send is retried by the loop from the new offset. */
                offset += ret;
                len -= ret;
        }

        /* All the data was skb head? */
        if (!len)
                goto out;

        /* Make offset relative to start of frags */
        offset -= skb_headlen(skb);

        /* Find where we are in frag list */
        for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
                skb_frag_t *frag  = &skb_shinfo(skb)->frags[fragidx];

                if (offset < skb_frag_size(frag))
                        break;

                offset -= skb_frag_size(frag);
        }

        /* Send the remaining page fragments, starting inside frag fragidx. */
        for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
                skb_frag_t *frag  = &skb_shinfo(skb)->frags[fragidx];

                slen = min_t(size_t, len, skb_frag_size(frag) - offset);

                /* Keep sending until this fragment's share has gone out. */
                while (slen) {
                        struct bio_vec bvec;
                        struct msghdr msg = {
                                .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT,
                        };

                        /* Hand the fragment's page to the socket as a bvec. */
                        bvec_set_page(&bvec, skb_frag_page(frag), slen,
                                      skb_frag_off(frag) + offset);
                        iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1,
                                      slen);

                        ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
                                              sendmsg_unlocked, sk, &msg);
                        if (ret <= 0)
                                goto error;

                        len -= ret;
                        offset += ret;
                        slen -= ret;
                }

                /* Subsequent fragments are sent from their beginning. */
                offset = 0;
        }

        if (len) {
                /* Process any frag lists */

                if (skb == head) {
                        /* Descend from the head skb into its frag list. */
                        if (skb_has_frag_list(skb)) {
                                skb = skb_shinfo(skb)->frag_list;
                                goto do_frag_list;
                        }
                } else if (skb->next) {
                        /* Already on the frag list: move to the next member. */
                        skb = skb->next;
                        goto do_frag_list;
                }
        }

out:
        return orig_len - len;

error:
        /* Report the error only if nothing at all was sent. */
        return orig_len == len ? ret : orig_len - len;
}

/*
 * Send skb data on a socket. Socket must be locked.
 *
 * Sends @len bytes of @skb starting at @offset through __skb_send_sock()
 * with sendmsg_locked() as the send routine, and returns its result.
 */
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
                         int len)
{
        return __skb_send_sock(sk, skb, offset, len, sendmsg_locked);
}
EXPORT_SYMBOL_GPL(skb_send_sock_locked);

/*
 * Send skb data on a socket. Socket must be unlocked.
 *
 * Sends @len bytes of @skb starting at @offset through __skb_send_sock()
 * with sendmsg_unlocked() as the send routine, and returns its result.
 */
int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
{
        return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked);
}

/**
 *        skb_store_bits - store bits from kernel buffer to skb
 *        @skb: destination buffer
 *        @offset: offset in destination
 *        @from: source buffer
 *        @len: number of bytes to copy
 *
 *        Copy the specified number of bytes from the source buffer to the
 *        destination skb.  This function handles all the messy bits of
 *        traversing fragment lists and such.
 *
 *        Return: 0 on success, -EFAULT if the range [@offset, @offset + @len)
 *        does not fit inside the skb or a frag-list member could not be
 *        written.
 */

int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
{
        /* start/end delimit the region being examined, as skb offsets. */
        int start = skb_headlen(skb);
        struct sk_buff *frag_iter;
        int i, copy;

        /* Reject ranges that run past the end of the skb. */
        if (offset > (int)skb->len - len)
                goto fault;

        /* Part (or all) of the range falls into the linear area. */
        if ((copy = start - offset) > 0) {
                if (copy > len)
                        copy = len;
                skb_copy_to_linear_data_offset(skb, offset, from, copy);
                if ((len -= copy) == 0)
                        return 0;
                offset += copy;
                from += copy;
        }

        /* Page fragments: frag i covers skb offsets [start, end). */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(frag);
                if ((copy = end - offset) > 0) {
                        u32 p_off, p_len, copied;
                        struct page *p;
                        u8 *vaddr;

                        if (copy > len)
                                copy = len;

                        /* Map each page of the fragment in turn and copy
                         * the slice of the source that belongs to it.
                         */
                        skb_frag_foreach_page(frag,
                                              skb_frag_off(frag) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                memcpy(vaddr + p_off, from + copied, p_len);
                                kunmap_atomic(vaddr);
                        }

                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                        from += copy;
                }
                start = end;
        }

        /* Frag list: recurse into each member with a member-relative offset. */
        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
                        if (skb_store_bits(frag_iter, offset - start,
                                           from, copy))
                                goto fault;
                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                        from += copy;
                }
                start = end;
        }
        if (!len)
                return 0;

        /* Bytes remain but there is nowhere left to put them. */
fault:
        return -EFAULT;
}
EXPORT_SYMBOL(skb_store_bits);

/*
 * Checksum skb data.
 *
 * Computes a checksum over @len bytes of @skb starting at @offset, seeded
 * with @csum. @ops->update checksums one contiguous buffer and
 * @ops->combine merges a partial result into the running value. The linear
 * area, the page fragments and the frag list are walked in that order.
 * Hits BUG_ON() if @len bytes are not available from @offset.
 */
__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
                      __wsum csum, const struct skb_checksum_ops *ops)
{
        /* start/end delimit the region being examined, as skb offsets. */
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;
        /* Bytes already checksummed; passed to ops->combine as the offset. */
        int pos = 0;

        /* Checksum header. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                csum = INDIRECT_CALL_1(ops->update, csum_partial_ext,
                                       skb->data + offset, copy, csum);
                if ((len -= copy) == 0)
                        return csum;
                offset += copy;
                pos        = copy;
        }

        /* Page fragments: frag i covers skb offsets [start, end). */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;
                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(frag);
                if ((copy = end - offset) > 0) {
                        u32 p_off, p_len, copied;
                        struct page *p;
                        __wsum csum2;
                        u8 *vaddr;

                        if (copy > len)
                                copy = len;

                        /* Checksum each mapped page separately (seed 0) and
                         * merge the result at position pos.
                         */
                        skb_frag_foreach_page(frag,
                                              skb_frag_off(frag) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                csum2 = INDIRECT_CALL_1(ops->update,
                                                        csum_partial_ext,
                                                        vaddr + p_off, p_len, 0);
                                kunmap_atomic(vaddr);
                                csum = INDIRECT_CALL_1(ops->combine,
                                                       csum_block_add_ext, csum,
                                                       csum2, pos, p_len);
                                pos += p_len;
                        }

                        if (!(len -= copy))
                                return csum;
                        offset += copy;
                }
                start = end;
        }

        /* Frag list: recurse with a member-relative offset and seed 0. */
        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        __wsum csum2;
                        if (copy > len)
                                copy = len;
                        csum2 = __skb_checksum(frag_iter, offset - start,
                                               copy, 0, ops);
                        csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext,
                                               csum, csum2, pos, copy);
                        if ((len -= copy) == 0)
                                return csum;
                        offset += copy;
                        pos    += copy;
                }
                start = end;
        }
        /* Still bytes to checksum but no data left: caller passed bad range. */
        BUG_ON(len);

        return csum;
}
EXPORT_SYMBOL(__skb_checksum);

/*
 * Checksum @len bytes of @skb starting at @offset, seeded with @csum.
 * This is __skb_checksum() with csum_partial_ext() as the update callback
 * and csum_block_add_ext() as the combine callback.
 */
__wsum skb_checksum(const struct sk_buff *skb, int offset,
                    int len, __wsum csum)
{
        const struct skb_checksum_ops ops = {
                .update  = csum_partial_ext,
                .combine = csum_block_add_ext,
        };

        return __skb_checksum(skb, offset, len, csum, &ops);
}
EXPORT_SYMBOL(skb_checksum);

/*
 * Copy and checksum in one pass.
 *
 * Copies @len bytes of @skb starting at @offset into @to and returns the
 * checksum of the copied bytes (starting from 0). The linear area, the
 * page fragments and the frag list are walked in that order. Hits BUG_ON()
 * if @len bytes are not available from @offset.
 */

__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
                                    u8 *to, int len)
{
        /* start/end delimit the region being examined, as skb offsets. */
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;
        /* Bytes already handled; used as the offset in csum_block_add(). */
        int pos = 0;
        __wsum csum = 0;

        /* Copy header. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                csum = csum_partial_copy_nocheck(skb->data + offset, to,
                                                 copy);
                if ((len -= copy) == 0)
                        return csum;
                offset += copy;
                to     += copy;
                pos        = copy;
        }

        /* Page fragments: frag i covers skb offsets [start, end). */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
                if ((copy = end - offset) > 0) {
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                        u32 p_off, p_len, copied;
                        struct page *p;
                        __wsum csum2;
                        u8 *vaddr;

                        if (copy > len)
                                copy = len;

                        /* Copy and checksum each mapped page separately,
                         * then merge the partial sum at position pos.
                         */
                        skb_frag_foreach_page(frag,
                                              skb_frag_off(frag) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                csum2 = csum_partial_copy_nocheck(vaddr + p_off,
                                                                  to + copied,
                                                                  p_len);
                                kunmap_atomic(vaddr);
                                csum = csum_block_add(csum, csum2, pos);
                                pos += p_len;
                        }

                        if (!(len -= copy))
                                return csum;
                        offset += copy;
                        to     += copy;
                }
                start = end;
        }

        /* Frag list: recurse into each member with a member-relative offset. */
        skb_walk_frags(skb, frag_iter) {
                __wsum csum2;
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
                        csum2 = skb_copy_and_csum_bits(frag_iter,
                                                       offset - start,
                                                       to, copy);
                        csum = csum_block_add(csum, csum2, pos);
                        if ((len -= copy) == 0)
                                return csum;
                        offset += copy;
                        to     += copy;
                        pos    += copy;
                }
                start = end;
        }
        /* Still bytes to copy but no data left: caller passed a bad range. */
        BUG_ON(len);
        return csum;
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);

__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
        __sum16 sum;

        sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
        /* See comments in __skb_checksum_complete(). */
        if (likely(!sum)) {
                if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
                    !skb->csum_complete_sw)
                        netdev_rx_csum_fault(skb->dev, skb);
        }
        if (!skb_shared(skb))
                skb->csum_valid = !sum;
        return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete_head);

/* On entry skb->csum is expected to hold the pseudo header's checksum
 * (already converted from the hardware checksum, for example by
 * __skb_checksum_validate_complete()), and for CHECKSUM_COMPLETE the
 * original skb->csum must already have failed validation.
 *
 * Returns zero if the checksum recomputed in software is valid and
 * non-zero if it is still invalid. Unless the skb is shared, the full
 * packet checksum is written back to skb->csum.
 */
__sum16 __skb_checksum_complete(struct sk_buff *skb)
{
        __wsum full = skb_checksum(skb, 0, skb->len, 0);
        __sum16 folded = csum_fold(csum_add(skb->csum, full));

        /* The test looks inverted on purpose: the hardware checksum was
         * already known to be bad when we got here. If the software result
         * is good after all, the original skb->csum and skb_checksum()
         * disagree, so either the hardware checksum was wrong or skb->csum
         * was corrupted while skb->data was being moved around.
         */
        if (likely(!folded) &&
            unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
            !skb->csum_complete_sw)
                netdev_rx_csum_fault(skb->dev, skb);

        /* A shared skb must not be modified. */
        if (skb_shared(skb))
                return folded;

        /* Save full packet checksum */
        skb->csum = full;
        skb->ip_summed = CHECKSUM_COMPLETE;
        skb->csum_complete_sw = 1;
        skb->csum_valid = !folded;

        return folded;
}
EXPORT_SYMBOL(__skb_checksum_complete);

/*
 * Placeholder "update" callback for crc32c: computes nothing, emits a
 * rate-limited warning and returns 0. All arguments are ignored.
 */
static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
{
        net_warn_ratelimited(
                "%s: attempt to compute crc32c without libcrc32c.ko\n",
                __func__);
        return 0;
}

/*
 * Placeholder "combine" callback for crc32c: combines nothing, emits a
 * rate-limited warning and returns 0. All arguments are ignored.
 */
static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
                                       int offset, int len)
{
        net_warn_ratelimited(
                "%s: attempt to compute crc32c without libcrc32c.ko\n",
                __func__);
        return 0;
}

/* Checksum ops whose callbacks only warn and return 0. */
static const struct skb_checksum_ops default_crc32c_ops = {
        .update  = warn_crc32c_csum_update,
        .combine = warn_crc32c_csum_combine,
};

/*
 * Exported pointer to the crc32c checksum ops, initially the warning-only
 * defaults above.
 * NOTE(review): presumably repointed at a real implementation once
 * libcrc32c is available - confirm against the code that assigns it.
 */
const struct skb_checksum_ops *crc32c_csum_stub __read_mostly =
        &default_crc32c_ops;
EXPORT_SYMBOL(crc32c_csum_stub);

 /**
 *        skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
 *        @from: source buffer
 *
 *        Calculates the amount of linear headroom needed in the 'to' skb passed
 *        into skb_zerocopy().
 */
unsigned int
skb_zerocopy_headlen(const struct sk_buff *from)
{
        unsigned int hlen = 0;

        if (!from->head_frag ||
            skb_headlen(from) < L1_CACHE_BYTES ||
            skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) {
                hlen = skb_headlen(from);
                if (!hlen)
                        hlen = from->len;
        }

        if (skb_has_frag_list(from))
                hlen = from->len;

        return hlen;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);

/**
 *        skb_zerocopy - Zero copy skb to skb
 *        @to: destination buffer
 *        @from: source buffer
 *        @len: number of bytes to copy from source buffer
 *        @hlen: size of linear headroom in destination buffer
 *
 *        Copies up to `len` bytes from `from` to `to` by creating references
 *        to the frags in the source buffer.
 *
 *        The `hlen` as calculated by skb_zerocopy_headlen() specifies the
 *        headroom in the `to` buffer.
 *
 *        Return value:
 *        0: everything is OK
 *        -ENOMEM: couldn't orphan frags of @from due to lack of memory
 *        -EFAULT: skb_copy_bits() found some problem with skb geometry
 */
int
skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
{
        /* i walks the source frags, j is the next free frag slot in @to. */
        int i, j = 0;
        int plen = 0; /* length of skb->head fragment */
        int ret;
        struct page *page;
        unsigned int offset;

        /* Without a page-backed head the caller must supply a header length. */
        BUG_ON(!from->head_frag && !hlen);

        /* dont bother with small payloads */
        if (len <= skb_tailroom(to))
                return skb_copy_bits(from, 0, skb_put(to, len), len);

        if (hlen) {
                /* Copy the first hlen bytes into the linear area of @to. */
                ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
                if (unlikely(ret))
                        return ret;
                len -= hlen;
        } else {
                /* No copy: reference the page holding @from's linear data
                 * as fragment 0 of @to.
                 */
                plen = min_t(int, skb_headlen(from), len);
                if (plen) {
                        page = virt_to_head_page(from->head);
                        offset = from->data - (unsigned char *)page_address(page);
                        __skb_fill_netmem_desc(to, 0, page_to_netmem(page),
                                               offset, plen);
                        get_page(page);
                        j = 1;
                        len -= plen;
                }
        }

        /* Account for everything that is attached to @to as fragments. */
        skb_len_add(to, len + plen);

        if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
                skb_tx_error(from);
                return -ENOMEM;
        }
        skb_zerocopy_clone(to, from, GFP_ATOMIC);

        /* Share the source fragments until len bytes are covered; the last
         * one used is trimmed to the bytes still needed.
         */
        for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
                int size;

                if (!len)
                        break;
                skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
                size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]),
                                        len);
                skb_frag_size_set(&skb_shinfo(to)->frags[j], size);
                len -= size;
                /* @to now holds its own reference on the fragment. */
                skb_frag_ref(to, j);
                j++;
        }
        skb_shinfo(to)->nr_frags = j;

        return 0;
}
EXPORT_SYMBOL_GPL(skb_zerocopy);

/*
 * Copy the whole of @skb into the flat buffer @to. Bytes before the
 * checksum start are copied verbatim; the rest is copied and checksummed.
 * For CHECKSUM_PARTIAL skbs the folded checksum is then written into the
 * copy at checksum start + skb->csum_offset.
 */
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
{
        const bool partial = skb->ip_summed == CHECKSUM_PARTIAL;
        __wsum csum = 0;
        long csstart;

        /* Without a pending checksum everything linear is a plain copy. */
        if (partial)
                csstart = skb_checksum_start_offset(skb);
        else
                csstart = skb_headlen(skb);

        /* The checksum start must lie within the linear area. */
        BUG_ON(csstart > skb_headlen(skb));

        skb_copy_from_linear_data(skb, to, csstart);

        if (csstart != skb->len)
                csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
                                              skb->len - csstart);

        if (partial) {
                long csstuff = csstart + skb->csum_offset;
                __sum16 *slot = (__sum16 *)(to + csstuff);

                *slot = csum_fold(csum);
        }
}
EXPORT_SYMBOL(skb_copy_and_csum_dev);

/**
 *        skb_dequeue - remove from the head of the queue
 *        @list: list to dequeue from
 *
 *        Detach the first buffer of @list while holding the list lock with
 *        interrupts saved, so the call can be mixed freely with the other
 *        locking list functions.
 *
 *        Return: the former head of the list, or %NULL if the list is empty.
 */
struct sk_buff *skb_dequeue(struct sk_buff_head *list)
{
        struct sk_buff *skb;
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);
        skb = __skb_dequeue(list);
        spin_unlock_irqrestore(&list->lock, irqflags);

        return skb;
}
EXPORT_SYMBOL(skb_dequeue);

/**
 *        skb_dequeue_tail - remove from the tail of the queue
 *        @list: list to dequeue from
 *
 *        Detach the last buffer of @list while holding the list lock with
 *        interrupts saved, so the call can be mixed freely with the other
 *        locking list functions.
 *
 *        Return: the former tail of the list, or %NULL if the list is empty.
 */
struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
{
        struct sk_buff *skb;
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);
        skb = __skb_dequeue_tail(list);
        spin_unlock_irqrestore(&list->lock, irqflags);

        return skb;
}
EXPORT_SYMBOL(skb_dequeue_tail);

/**
 *        skb_queue_purge_reason - empty a list
 *        @list: list to empty
 *        @reason: drop reason
 *
 *        Delete all buffers on an &sk_buff list. Each buffer is removed from
 *        the list and one reference dropped. This function takes the list
 *        lock and is atomic with respect to other list locking functions.
 */
void skb_queue_purge_reason(struct sk_buff_head *list,
                            enum skb_drop_reason reason)
{
        struct sk_buff_head tmp;
        unsigned long flags;

        if (skb_queue_empty_lockless(list))
                return;

        __skb_queue_head_init(&tmp);

        spin_lock_irqsave(&list->lock, flags);
        skb_queue_splice_init(list, &tmp);
        spin_unlock_irqrestore(&list->lock, flags);

        __skb_queue_purge_reason(&tmp, reason);
}
EXPORT_SYMBOL(skb_queue_purge_reason);

/**
 *        skb_rbtree_purge - empty a skb rbtree
 *        @root: root of the rbtree to empty
 *
 *        Delete all buffers on an &sk_buff rbtree. Each buffer is erased
 *        from the tree and freed with kfree_skb(). This function does not
 *        take any lock. Synchronization should be handled by the caller
 *        (e.g., TCP out-of-order queue is protected by the socket lock).
 *
 *        Return: the sum of truesizes of all purged skbs.
 */
unsigned int skb_rbtree_purge(struct rb_root *root)
{
        struct rb_node *node, *next;
        unsigned int freed = 0;

        for (node = rb_first(root); node; node = next) {
                struct sk_buff *skb = rb_entry(node, struct sk_buff, rbnode);

                /* Fetch the successor before the node leaves the tree. */
                next = rb_next(node);

                rb_erase(&skb->rbnode, root);
                freed += skb->truesize;
                kfree_skb(skb);
        }

        return freed;
}

void skb_errqueue_purge(struct sk_buff_head *list)
{
        struct sk_buff *skb, *next;
        struct sk_buff_head kill;
        unsigned long flags;

        __skb_queue_head_init(&kill);

        spin_lock_irqsave(&list->lock, flags);
        skb_queue_walk_safe(list, skb, next) {
                if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY ||
                    SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING)
                        continue;
                __skb_unlink(skb, list);
                __skb_queue_tail(&kill, skb);
        }
        spin_unlock_irqrestore(&list->lock, flags);
        __skb_queue_purge(&kill);
}
EXPORT_SYMBOL(skb_errqueue_purge);

/**
 *        skb_queue_head - queue a buffer at the list head
 *        @list: list to use
 *        @newsk: buffer to queue
 *
 *        Insert @newsk at the front of @list. The list lock is taken with
 *        interrupts saved, so this can be used safely together with the
 *        other locking &sk_buff functions.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
{
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);
        __skb_queue_head(list, newsk);
        spin_unlock_irqrestore(&list->lock, irqflags);
}
EXPORT_SYMBOL(skb_queue_head);

/**
 *        skb_queue_tail - queue a buffer at the list tail
 *        @list: list to use
 *        @newsk: buffer to queue
 *
 *        Insert @newsk at the end of @list. The list lock is taken with
 *        interrupts saved, so this can be used safely together with the
 *        other locking &sk_buff functions.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
{
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);
        __skb_queue_tail(list, newsk);
        spin_unlock_irqrestore(&list->lock, irqflags);
}
EXPORT_SYMBOL(skb_queue_tail);

/**
 *        skb_unlink        -        remove a buffer from a list
 *        @skb: buffer to remove
 *        @list: list to use
 *
 *        Take @skb off @list. The list lock is held with interrupts saved
 *        for the duration, making the removal atomic with respect to other
 *        list locked calls.
 *
 *        You must know what list the SKB is on.
 */
void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);
        __skb_unlink(skb, list);
        spin_unlock_irqrestore(&list->lock, irqflags);
}
EXPORT_SYMBOL(skb_unlink);

/**
 *        skb_append        -        append a buffer
 *        @old: buffer to insert after
 *        @newsk: buffer to insert
 *        @list: list to use
 *
 *        Insert @newsk directly after @old in @list. The list lock is held
 *        with interrupts saved for the duration, making the insertion atomic
 *        with respect to other list locked calls.
 *        A buffer cannot be placed on two lists at the same time.
 */
void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
{
        unsigned long irqflags;

        spin_lock_irqsave(&list->lock, irqflags);
        __skb_queue_after(list, old, newsk);
        spin_unlock_irqrestore(&list->lock, irqflags);
}
EXPORT_SYMBOL(skb_append);

static inline void skb_split_inside_header(struct sk_buff *skb,
                                           struct sk_buff* skb1,
                                           const u32 len, const int pos)
{
        int i;

        skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
                                         pos - len);
        /* And move data appendix as is. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];

        skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
        skb_shinfo(skb)->nr_frags  = 0;
        skb1->data_len                   = skb->data_len;
        skb1->len                   += skb1->data_len;
        skb->data_len                   = 0;
        skb->len                   = len;
        skb_set_tail_pointer(skb, len);
}

/*
 * Split helper for the case where the split point @len is at or beyond the
 * end of the linear area of @skb (@pos is the linear length, @len >= @pos).
 *
 * @skb1 receives no linear data.  Fragments that end at or before @len stay
 * in @skb, fragments that start at or after @len move to @skb1, and a
 * fragment straddling @len is shared by both with adjusted offset/size.
 */
static inline void skb_split_no_header(struct sk_buff *skb,
                                       struct sk_buff* skb1,
                                       const u32 len, int pos)
{
        const int total_frags = skb_shinfo(skb)->nr_frags;
        int src, dst = 0;

        /* skb's fragment count is rebuilt below as fragments are kept. */
        skb_shinfo(skb)->nr_frags = 0;

        skb1->data_len = skb->len - len;
        skb1->len = skb1->data_len;
        skb->len = len;
        skb->data_len = len - pos;

        for (src = 0; src < total_frags; src++) {
                const int frag_start = pos;
                const int size = skb_frag_size(&skb_shinfo(skb)->frags[src]);

                pos += size;

                if (frag_start + size <= len) {
                        /* Ends at or before the split point: stays in skb. */
                        skb_shinfo(skb)->nr_frags++;
                        continue;
                }

                skb_shinfo(skb1)->frags[dst] = skb_shinfo(skb)->frags[src];

                if (frag_start < len) {
                        /* The fragment straddles the split point.  Moving
                         * the whole fragment to the second part would be
                         * the alternative; here it is split exactly: both
                         * skbs reference the page, skb keeps the first
                         * (len - frag_start) bytes and skb1 the rest.
                         * This can only happen for the first fragment
                         * placed in skb1, hence frags[0].
                         */
                        const u32 keep = len - frag_start;

                        skb_frag_ref(skb, src);
                        skb_frag_off_add(&skb_shinfo(skb1)->frags[0], keep);
                        skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], keep);
                        skb_frag_size_set(&skb_shinfo(skb)->frags[src], keep);
                        skb_shinfo(skb)->nr_frags++;
                }
                dst++;
        }
        skb_shinfo(skb1)->nr_frags = dst;
}

/**
 * skb_split - Split fragmented skb to two parts at length len.
 * @skb: the buffer to split
 * @skb1: the buffer to receive the second part
 * @len: new length for skb
 *
 * After the call @skb holds the first @len bytes and @skb1 the remainder.
 * The shared-frag and pure-zerocopy flags of @skb are propagated to @skb1.
 */
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
        const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
        int headlen = skb_headlen(skb);

        skb_zcopy_downgrade_managed(skb);

        /* Carry the zerocopy related flags and state over to skb1. */
        skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
        skb_zerocopy_clone(skb1, skb, 0);

        if (len >= headlen) {
                /* Second chunk has no header, nothing to copy. */
                skb_split_no_header(skb, skb1, len, headlen);
                return;
        }

        /* Split line is inside header. */
        skb_split_inside_header(skb, skb1, len, headlen);
}
EXPORT_SYMBOL(skb_split);

/* Fragments must not be shifted into or out of a cloned skb, so unclone it
 * first (atomic allocation, truesize preserving variant).
 *
 * Callers must reload any skb_shinfo() derived pointer after this returns.
 * Returns the result of skb_unclone_keeptruesize().
 */
static int skb_prepare_for_shift(struct sk_buff *skb)
{
        int err = skb_unclone_keeptruesize(skb, GFP_ATOMIC);

        return err;
}

/**
 * skb_shift - Shifts paged data partially from skb to another
 * @tgt: buffer into which tail data gets added
 * @skb: buffer from which the paged data comes from
 * @shiftlen: shift up to this many bytes
 *
 * Attempts to shift up to shiftlen worth of bytes, which may be less than
 * the length of the skb, from skb to tgt. Returns number bytes shifted.
 * It's up to caller to free skb if everything was shifted.
 *
 * If @tgt runs out of frags, the whole operation is aborted.
 *
 * Skb cannot include anything else but paged data while tgt is allowed
 * to have non-paged data as well.
 *
 * The function either moves exactly @shiftlen bytes and returns @shiftlen,
 * or moves nothing and returns 0 (source has linear data, either skb is
 * zerocopy, uncloning failed, or @tgt has too few free frag slots).
 * On success both skbs end up with ip_summed set to CHECKSUM_PARTIAL.
 *
 * TODO: full sized shift could be optimized but that would need
 * specialized skb free'er to handle frags without up-to-date nr_frags.
 */
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
{
        int from, to, merge, todo;
        skb_frag_t *fragfrom, *fragto;

        BUG_ON(shiftlen > skb->len);

        /* Only purely paged source skbs can be shifted from. */
        if (skb_headlen(skb))
                return 0;
        if (skb_zcopy(tgt) || skb_zcopy(skb))
                return 0;

        DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle);
        DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb));

        /* todo: bytes still to move; from: next source frag index;
         * to: next free frag slot in tgt.
         */
        todo = shiftlen;
        from = 0;
        to = skb_shinfo(tgt)->nr_frags;
        fragfrom = &skb_shinfo(skb)->frags[from];

        /* Actual merge is delayed until the point when we know we can
         * commit all, so that we don't have to undo partial changes
         */
        if (!to ||
            !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
                              skb_frag_off(fragfrom))) {
                /* First source frag cannot be glued onto tgt's last frag. */
                merge = -1;
        } else {
                /* merge is the tgt frag that absorbs the first source frag */
                merge = to - 1;

                todo -= skb_frag_size(fragfrom);
                if (todo < 0) {
                        /* The shift ends inside the first source frag, so
                         * growing tgt's last frag and shrinking that source
                         * frag is all that is needed.
                         */
                        if (skb_prepare_for_shift(skb) ||
                            skb_prepare_for_shift(tgt))
                                return 0;

                        /* All previous frag pointers might be stale! */
                        fragfrom = &skb_shinfo(skb)->frags[from];
                        fragto = &skb_shinfo(tgt)->frags[merge];

                        skb_frag_size_add(fragto, shiftlen);
                        skb_frag_size_sub(fragfrom, shiftlen);
                        skb_frag_off_add(fragfrom, shiftlen);

                        goto onlymerged;
                }

                /* The whole first frag will be merged; continue with the next. */
                from++;
        }

        /* Skip full, not-fitting skb to avoid expensive operations */
        if ((shiftlen == skb->len) &&
            (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
                return 0;

        if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
                return 0;

        /* Stage frags into tgt slots at index >= its current nr_frags.
         * tgt->nr_frags is only updated once everything is known to fit.
         */
        while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
                /* Out of frag slots in tgt: abort the whole operation. */
                if (to == MAX_SKB_FRAGS)
                        return 0;

                fragfrom = &skb_shinfo(skb)->frags[from];
                fragto = &skb_shinfo(tgt)->frags[to];

                if (todo >= skb_frag_size(fragfrom)) {
                        /* Whole frag moves over. */
                        *fragto = *fragfrom;
                        todo -= skb_frag_size(fragfrom);
                        from++;
                        to++;

                } else {
                        /* Last, partial frag: both skbs reference the page;
                         * tgt gets the first todo bytes, skb keeps the rest.
                         */
                        __skb_frag_ref(fragfrom);
                        skb_frag_page_copy(fragto, fragfrom);
                        skb_frag_off_copy(fragto, fragfrom);
                        skb_frag_size_set(fragto, todo);

                        skb_frag_off_add(fragfrom, todo);
                        skb_frag_size_sub(fragfrom, todo);
                        todo = 0;

                        to++;
                        break;
                }
        }

        /* Ready to "commit" this state change to tgt */
        skb_shinfo(tgt)->nr_frags = to;

        if (merge >= 0) {
                /* Perform the delayed merge of the first source frag and
                 * drop the source's reference on it.
                 */
                fragfrom = &skb_shinfo(skb)->frags[0];
                fragto = &skb_shinfo(tgt)->frags[merge];

                skb_frag_size_add(fragto, skb_frag_size(fragfrom));
                __skb_frag_unref(fragfrom, skb->pp_recycle);
        }

        /* Reposition in the original skb */
        to = 0;
        while (from < skb_shinfo(skb)->nr_frags)
                skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
        skb_shinfo(skb)->nr_frags = to;

        /* Bytes left to move although the source has no frags left. */
        BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);

onlymerged:
        /* Most likely the tgt won't ever need its checksum anymore, skb on
         * the other hand might need it if it needs to be resent
         */
        tgt->ip_summed = CHECKSUM_PARTIAL;
        skb->ip_summed = CHECKSUM_PARTIAL;

        skb_len_add(skb, -shiftlen);
        skb_len_add(tgt, shiftlen);

        return shiftlen;
}

/**
 * skb_prepare_seq_read - Prepare a sequential read of skb data
 * @skb: the buffer to read
 * @from: lower offset of data to be read
 * @to: upper offset of data to be read
 * @st: state variable
 *
 * Resets @st so that a following skb_seq_read() starts at the beginning
 * of @skb with no fragment mapped. Must be called before invoking
 * skb_seq_read() for the first time.
 */
void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
                          unsigned int to, struct skb_seq_state *st)
{
        /* Window to read. */
        st->lower_offset = from;
        st->upper_offset = to;

        /* The walk starts on @skb itself. */
        st->root_skb = skb;
        st->cur_skb = skb;

        /* Nothing has been stepped over or mapped yet. */
        st->stepped_offset = 0;
        st->frag_idx = 0;
        st->frag_off = 0;
        st->frag_data = NULL;
}
EXPORT_SYMBOL(skb_prepare_seq_read);

/**
 * skb_seq_read - Sequentially read skb data
 * @consumed: number of bytes consumed by the caller so far
 * @data: destination pointer for data to be returned
 * @st: state variable
 *
 * Reads a block of skb data at @consumed relative to the
 * lower offset specified to skb_prepare_seq_read(). Assigns
 * the head of the data block to @data and returns the length
 * of the block or 0 if the end of the skb data or the upper
 * offset has been reached.
 *
 * The caller is not required to consume all of the data
 * returned, i.e. @consumed is typically set to the number
 * of bytes already consumed and the next call to
 * skb_seq_read() will return the remaining part of the block.
 *
 * Note 1: The size of each block of data returned can be arbitrary,
 *       this limitation is the cost for zerocopy sequential
 *       reads of potentially non linear data.
 *
 * Note 2: Fragment lists within fragments are not implemented
 *       at the moment, state->root_skb could be replaced with
 *       a stack for this purpose.
 *
 * Note 3: When a block inside a page fragment is returned, the page stays
 *       mapped (st->frag_data is non-NULL) until a later call moves past
 *       it, the function returns 0, or skb_abort_seq_read() is called.
 */
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
                          struct skb_seq_state *st)
{
        /* abs_offset: position to read, as an offset into the skb data */
        unsigned int block_limit, abs_offset = consumed + st->lower_offset;
        skb_frag_t *frag;

        /* Requested window exhausted: release any mapping and stop. */
        if (unlikely(abs_offset >= st->upper_offset)) {
                if (st->frag_data) {
                        kunmap_atomic(st->frag_data);
                        st->frag_data = NULL;
                }
                return 0;
        }

next_skb:
        /* End of the linear area of the current skb, as an absolute offset. */
        block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;

        /* Serve from the linear area, unless a fragment is currently mapped. */
        if (abs_offset < block_limit && !st->frag_data) {
                *data = st->cur_skb->data + (abs_offset - st->stepped_offset);
                return block_limit - abs_offset;
        }

        /* Moving on to the paged data: step past the linear area. */
        if (st->frag_idx == 0 && !st->frag_data)
                st->stepped_offset += skb_headlen(st->cur_skb);

        while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
                unsigned int pg_idx, pg_off, pg_sz;

                frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];

                pg_idx = 0;
                pg_off = skb_frag_off(frag);
                pg_sz = skb_frag_size(frag);

                /* When the fragment has to be walked page by page, narrow
                 * the block to the page that holds byte st->frag_off of the
                 * fragment: pg_idx selects the page, pg_off is the offset
                 * inside it and pg_sz is capped at the end of that page.
                 */
                if (skb_frag_must_loop(skb_frag_page(frag))) {
                        pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT;
                        pg_off = offset_in_page(pg_off + st->frag_off);
                        pg_sz = min_t(unsigned int, pg_sz - st->frag_off,
                                                    PAGE_SIZE - pg_off);
                }

                block_limit = pg_sz + st->stepped_offset;
                if (abs_offset < block_limit) {
                        /* Map the page on first use; later calls reuse it. */
                        if (!st->frag_data)
                                st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx);

                        *data = (u8 *)st->frag_data + pg_off +
                                (abs_offset - st->stepped_offset);

                        return block_limit - abs_offset;
                }

                /* This block is fully consumed: unmap it and advance. */
                if (st->frag_data) {
                        kunmap_atomic(st->frag_data);
                        st->frag_data = NULL;
                }

                st->stepped_offset += pg_sz;
                st->frag_off += pg_sz;
                if (st->frag_off == skb_frag_size(frag)) {
                        st->frag_off = 0;
                        st->frag_idx++;
                }
        }

        if (st->frag_data) {
                kunmap_atomic(st->frag_data);
                st->frag_data = NULL;
        }

        /* Fragments exhausted: descend into the root skb's frag_list first,
         * then follow ->next from one list member to the following one.
         */
        if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
                st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
                st->frag_idx = 0;
                goto next_skb;
        } else if (st->cur_skb->next) {
                st->cur_skb = st->cur_skb->next;
                st->frag_idx = 0;
                goto next_skb;
        }

        /* No more data. */
        return 0;
}
EXPORT_SYMBOL(skb_seq_read);

/**
 * skb_abort_seq_read - Abort a sequential read of skb data
 * @st: state variable
 *
 * Must be called if skb_seq_read() was not called until it
 * returned 0.
 */
void skb_abort_seq_read(struct skb_seq_state *st)
{
        if (st->frag_data)
                kunmap_atomic(st->frag_data);
}
EXPORT_SYMBOL(skb_abort_seq_read);

#define TS_SKB_CB(state)        ((struct skb_seq_state *) &((state)->cb))

/* get_next_block hook installed by skb_find_text(): hands out the next block
 * of skb data through the sequential reader state kept in @state->cb.
 */
static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
                                          struct ts_config *conf,
                                          struct ts_state *state)
{
        struct skb_seq_state *seq = TS_SKB_CB(state);

        return skb_seq_read(offset, text, seq);
}

/* finish hook installed by skb_find_text(): aborts the sequential read whose
 * state is kept in @state->cb.
 */
static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
{
        struct skb_seq_state *seq = TS_SKB_CB(state);

        skb_abort_seq_read(seq);
}

/**
 * skb_find_text - Find a text pattern in skb data
 * @skb: the buffer to look in
 * @from: search offset
 * @to: search limit
 * @config: textsearch configuration
 *
 * Searches the skb data for the pattern described by @config, installing
 * the skb block reader callbacks into @config on the way. Use
 * textsearch_next() to retrieve subsequent occurrences of the pattern.
 * Returns the offset to the first occurrence, or UINT_MAX if no match was
 * found or the match does not fit within @to - @from bytes.
 */
unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
                           unsigned int to, struct ts_config *config)
{
        unsigned int patlen = config->ops->get_pattern_len(config);
        struct ts_state state;
        unsigned int match;

        /* The reader state lives inside the textsearch control block. */
        BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb));

        config->get_next_block = skb_ts_get_next_block;
        config->finish = skb_ts_finish;

        skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));

        match = textsearch_find(config, &state);

        /* Reject a match whose end would lie beyond the search window. */
        if (match + patlen > to - from)
                return UINT_MAX;

        return match;
}
EXPORT_SYMBOL(skb_find_text);

/**
 * skb_append_pagefrags - append page data to the fragments of an skb
 * @skb: buffer to extend
 * @page: page holding the data
 * @offset: offset of the data within @page
 * @size: number of bytes to append
 * @max_frags: upper bound on the number of fragments @skb may use
 *
 * If the data can be coalesced with the last fragment, that fragment is
 * simply grown by @size. Otherwise a page reference is taken and a new
 * fragment is filled in, provided fewer than @max_frags are in use.
 *
 * Returns 0 on success or -EMSGSIZE when no fragment slot is available.
 */
int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
                         int offset, size_t size, size_t max_frags)
{
        int nr = skb_shinfo(skb)->nr_frags;

        if (skb_can_coalesce(skb, nr, page, offset)) {
                skb_frag_size_add(&skb_shinfo(skb)->frags[nr - 1], size);
                return 0;
        }

        if (nr >= max_frags)
                return -EMSGSIZE;

        skb_zcopy_downgrade_managed(skb);
        get_page(page);
        skb_fill_page_desc_noacc(skb, nr, page, offset, size);

        return 0;
}
EXPORT_SYMBOL_GPL(skb_append_pagefrags);

/**
 *        skb_pull_rcsum - pull skb and update receive checksum
 *        @skb: buffer to update
 *        @len: length of data pulled
 *
 *        Pulls @len bytes off the front of the packet and fixes up the
 *        CHECKSUM_COMPLETE checksum for the bytes removed.  Use it on
 *        receive path processing instead of skb_pull unless you know
 *        that the checksum difference is zero (e.g., a valid IP header)
 *        or you are setting ip_summed to CHECKSUM_NONE.
 *
 *        @len must not exceed skb->len (BUG otherwise).  Returns the new
 *        skb->data.
 */
void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
{
        unsigned char *pulled_from = skb->data;

        BUG_ON(len > skb->len);
        __skb_pull(skb, len);
        /* The checksum update needs the start of the bytes just removed. */
        skb_postpull_rcsum(skb, pulled_from, len);
        return skb->data;
}
EXPORT_SYMBOL_GPL(skb_pull_rcsum);

/* Build a fragment descriptor that covers the linear data of @frag_skb:
 * the head page of the skb head, the offset of skb->data within it and the
 * linear length.  No page reference is taken here.
 */
static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
{
        struct page *head_page = virt_to_head_page(frag_skb->head);
        unsigned char *page_base = page_address(head_page);
        skb_frag_t desc;

        skb_frag_fill_page_desc(&desc, head_page, frag_skb->data - page_base,
                                skb_headlen(frag_skb));
        return desc;
}

/**
 * skb_segment_list - turn an skb with a frag_list into a list of skbs
 * @skb: head buffer whose frag_list members become separate segments
 * @features: features for the output path, used for the linearize checks
 * @offset: header length argument; @offset + tunnel header length bytes of
 *          @skb's headers are copied in front of every list member
 *
 * Detaches the frag_list of @skb and chains its members behind @skb via
 * ->next, giving each one a copy of @skb's header state and header bytes.
 * The length and truesize of the members are subtracted from @skb.
 *
 * Returns @skb (with an additional reference taken) whose ->next is the
 * first former list member and whose ->prev is the last one, or
 * ERR_PTR(-ENOMEM) on any failure, in which case everything chained behind
 * @skb has been freed.
 */
struct sk_buff *skb_segment_list(struct sk_buff *skb,
                                 netdev_features_t features,
                                 unsigned int offset)
{
        struct sk_buff *list_skb = skb_shinfo(skb)->frag_list;
        unsigned int tnl_hlen = skb_tnl_header_len(skb);
        unsigned int delta_truesize = 0;
        unsigned int delta_len = 0;
        struct sk_buff *tail = NULL;
        struct sk_buff *nskb, *tmp;
        int len_diff, err;

        /* Extend the data area by (offset - network offset) bytes. */
        skb_push(skb, -skb_network_offset(skb) + offset);

        /* Ensure the head is writeable before touching the shared info */
        err = skb_unclone(skb, GFP_ATOMIC);
        if (err)
                goto err_linearize;

        /* The list members are re-chained through skb->next below. */
        skb_shinfo(skb)->frag_list = NULL;

        while (list_skb) {
                nskb = list_skb;
                list_skb = list_skb->next;

                err = 0;
                /* Accounted to the head so far; subtracted after the loop. */
                delta_truesize += nskb->truesize;
                if (skb_shared(nskb)) {
                        /* Replace a shared member by a clone of it, dropping
                         * our reference on the original, then unclone that.
                         */
                        tmp = skb_clone(nskb, GFP_ATOMIC);
                        if (tmp) {
                                consume_skb(nskb);
                                nskb = tmp;
                                err = skb_unclone(nskb, GFP_ATOMIC);
                        } else {
                                err = -ENOMEM;
                        }
                }

                /* Link nskb in before checking err, so that the error path
                 * frees it together with the rest of the chain.
                 */
                if (!tail)
                        skb->next = nskb;
                else
                        tail->next = nskb;

                if (unlikely(err)) {
                        /* Re-attach the unprocessed members so they are freed too. */
                        nskb->next = list_skb;
                        goto err_linearize;
                }

                tail = nskb;

                delta_len += nskb->len;

                skb_push(nskb, -skb_network_offset(nskb) + offset);

                /* Give nskb the head's header state.  len_diff is the
                 * network header length difference taken before the copy,
                 * applied to the transport header afterwards.
                 */
                skb_release_head_state(nskb);
                len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb);
                __copy_skb_header(nskb, skb);

                skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
                nskb->transport_header += len_diff;
                /* Copy the head's header bytes, starting tnl_hlen bytes
                 * before skb->data, to the same position relative to
                 * nskb->data.
                 */
                skb_copy_from_linear_data_offset(skb, -tnl_hlen,
                                                 nskb->data - tnl_hlen,
                                                 offset + tnl_hlen);

                if (skb_needs_linearize(nskb, features) &&
                    __skb_linearize(nskb))
                        goto err_linearize;
        }

        /* The head no longer carries the list members. */
        skb->truesize = skb->truesize - delta_truesize;
        skb->data_len = skb->data_len - delta_len;
        skb->len = skb->len - delta_len;

        skb_gso_reset(skb);

        /* Record the last segment in ->prev of the head. */
        skb->prev = tail;

        if (skb_needs_linearize(skb, features) &&
            __skb_linearize(skb))
                goto err_linearize;

        /* Take an extra reference on the head before returning it. */
        skb_get(skb);

        return skb;

err_linearize:
        /* Free whatever has been chained behind the head; the head itself
         * is left to the caller.  Every failure is reported as -ENOMEM.
         */
        kfree_skb_list(skb->next);
        skb->next = NULL;
        return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(skb_segment_list);

/**
 *        skb_segment - Perform protocol segmentation on skb.
 *        @head_skb: buffer to segment
 *        @features: features for the output path (see dev->features)
 *
 *        This function performs segmentation on the given skb.  It returns
 *        a pointer to the first in a list of new skbs for the segments.
 *        In case of error it returns ERR_PTR(err).
 *
 *        The segments are chained through ->next and the last one is also
 *        stored in ->prev of the first.  Each segment starts with a copy of
 *        the headers of @head_skb (everything from the MAC header up to
 *        head_skb->data at entry) followed by up to gso_size bytes of
 *        payload, or a multiple of gso_size when partial GSO is used.
 *        @head_skb itself is not freed; on error the segments built so far
 *        are freed.
 */
struct sk_buff *skb_segment(struct sk_buff *head_skb,
                            netdev_features_t features)
{
        struct sk_buff *segs = NULL;
        struct sk_buff *tail = NULL;
        struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
        unsigned int mss = skb_shinfo(head_skb)->gso_size;
        /* doffset: length of the headers in front of head_skb->data */
        unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
        /* offset: start of the current segment's payload within head_skb,
         * counted from the MAC header once the headers are pushed below
         */
        unsigned int offset = doffset;
        unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
        unsigned int partial_segs = 0;
        unsigned int headroom;
        unsigned int len = head_skb->len;
        /* frag_skb/frag/i/nfrags: cursor over the fragments of the skb
         * currently being consumed (head_skb or a frag_list member);
         * pos: offset of the data that cursor points at.
         */
        struct sk_buff *frag_skb;
        skb_frag_t *frag;
        __be16 proto;
        bool csum, sg;
        int err = -ENOMEM;
        int i = 0;
        int nfrags, pos;

        if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) &&
            mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) {
                struct sk_buff *check_skb;

                for (check_skb = list_skb; check_skb; check_skb = check_skb->next) {
                        if (skb_headlen(check_skb) && !check_skb->head_frag) {
                                /* gso_size is untrusted, and we have a frag_list with
                                 * a linear non head_frag item.
                                 *
                                 * If head_skb's headlen does not fit requested gso_size,
                                 * it means that the frag_list members do NOT terminate
                                 * on exact gso_size boundaries. Hence we cannot perform
                                 * skb_frag_t page sharing. Therefore we must fallback to
                                 * copying the frag_list skbs; we do so by disabling SG.
                                 */
                                features &= ~NETIF_F_SG;
                                break;
                        }
                }
        }

        /* Make the headers part of the data area again. */
        __skb_push(head_skb, doffset);
        proto = skb_network_protocol(head_skb, NULL);
        if (unlikely(!proto))
                return ERR_PTR(-EINVAL);

        /* sg: segments may reference pages instead of copying;
         * csum: the output path can checksum this protocol itself.
         */
        sg = !!(features & NETIF_F_SG);
        csum = !!can_checksum_protocol(features, proto);

        if (sg && csum && (mss != GSO_BY_FRAGS))  {
                if (!(features & NETIF_F_GSO_PARTIAL)) {
                        struct sk_buff *iter;
                        unsigned int frag_len;

                        if (!list_skb ||
                            !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
                                goto normal;

                        /* If we get here then all the required
                         * GSO features except frag_list are supported.
                         * Try to split the SKB to multiple GSO SKBs
                         * with no frag_list.
                         * Currently we can do that only when the buffers don't
                         * have a linear part and all the buffers except
                         * the last are of the same length.
                         */
                        frag_len = list_skb->len;
                        skb_walk_frags(head_skb, iter) {
                                if (frag_len != iter->len && iter->next)
                                        goto normal;
                                if (skb_headlen(iter) && !iter->head_frag)
                                        goto normal;

                                len -= iter->len;
                        }

                        /* len is now what head_skb holds besides its list. */
                        if (len != frag_len)
                                goto normal;
                }

                /* GSO partial only requires that we trim off any excess that
                 * doesn't fit into an MSS sized block, so take care of that
                 * now.
                 * Cap len to not accidentally hit GSO_BY_FRAGS.
                 */
                partial_segs = min(len, GSO_BY_FRAGS - 1) / mss;
                if (partial_segs > 1)
                        mss *= partial_segs;
                else
                        partial_segs = 0;
        }

normal:
        headroom = skb_headroom(head_skb);
        pos = skb_headlen(head_skb);

        if (skb_orphan_frags(head_skb, GFP_ATOMIC))
                return ERR_PTR(-ENOMEM);

        nfrags = skb_shinfo(head_skb)->nr_frags;
        frag = skb_shinfo(head_skb)->frags;
        frag_skb = head_skb;

        /* Each iteration builds one segment (nskb) carrying len payload bytes. */
        do {
                struct sk_buff *nskb;
                skb_frag_t *nskb_frag;
                int hsize;
                int size;

                if (unlikely(mss == GSO_BY_FRAGS)) {
                        /* One segment per frag_list member. */
                        len = list_skb->len;
                } else {
                        len = head_skb->len - offset;
                        if (len > mss)
                                len = mss;
                }

                /* Linear bytes of head_skb at or after offset (<= 0: none). */
                hsize = skb_headlen(head_skb) - offset;

                if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) &&
                    (skb_headlen(list_skb) == len || sg)) {
                        /* head_skb's linear data and the current fragments
                         * are used up and the next list member has linear
                         * data: build the segment from a clone of it.
                         */
                        BUG_ON(skb_headlen(list_skb) > len);

                        nskb = skb_clone(list_skb, GFP_ATOMIC);
                        if (unlikely(!nskb))
                                goto err;

                        i = 0;
                        nfrags = skb_shinfo(list_skb)->nr_frags;
                        frag = skb_shinfo(list_skb)->frags;
                        frag_skb = list_skb;
                        pos += skb_headlen(list_skb);

                        /* Skip the fragments that fit completely into this segment. */
                        while (pos < offset + len) {
                                BUG_ON(i >= nfrags);

                                size = skb_frag_size(frag);
                                if (pos + size > offset + len)
                                        break;

                                i++;
                                pos += size;
                                frag++;
                        }

                        list_skb = list_skb->next;

                        if (unlikely(pskb_trim(nskb, len))) {
                                kfree_skb(nskb);
                                goto err;
                        }

                        /* hsize temporarily holds the end offset before
                         * skb_cow_head() so that truesize can be adjusted by
                         * the difference afterwards.
                         */
                        hsize = skb_end_offset(nskb);
                        if (skb_cow_head(nskb, doffset + headroom)) {
                                kfree_skb(nskb);
                                goto err;
                        }

                        nskb->truesize += skb_end_offset(nskb) - hsize;
                        skb_release_head_state(nskb);
                        __skb_push(nskb, doffset);
                } else {
                        /* Allocate a fresh skb.  hsize is the amount of
                         * payload placed in its linear area: everything
                         * when SG is not available.
                         */
                        if (hsize < 0)
                                hsize = 0;
                        if (hsize > len || !sg)
                                hsize = len;

                        nskb = __alloc_skb(hsize + doffset + headroom,
                                           GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
                                           NUMA_NO_NODE);

                        if (unlikely(!nskb))
                                goto err;

                        skb_reserve(nskb, headroom);
                        __skb_put(nskb, doffset);
                }

                /* Append to the result list; err: frees it on failure. */
                if (segs)
                        tail->next = nskb;
                else
                        segs = nskb;
                tail = nskb;

                __copy_skb_header(nskb, head_skb);

                skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
                skb_reset_mac_len(nskb);

                /* Copy the headers, including tnl_hlen bytes in front of them. */
                skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
                                                 nskb->data - tnl_hlen,
                                                 doffset + tnl_hlen);

                /* The segment already carries all of its payload. */
                if (nskb->len == len + doffset)
                        goto perform_csum_check;

                if (!sg) {
                        /* No scatter-gather: copy the payload into the
                         * linear area, computing the checksum on the way if
                         * the output path cannot do it.
                         */
                        if (!csum) {
                                if (!nskb->remcsum_offload)
                                        nskb->ip_summed = CHECKSUM_NONE;
                                SKB_GSO_CB(nskb)->csum =
                                        skb_copy_and_csum_bits(head_skb, offset,
                                                               skb_put(nskb,
                                                                       len),
                                                               len);
                                SKB_GSO_CB(nskb)->csum_start =
                                        skb_headroom(nskb) + doffset;
                        } else {
                                if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len))
                                        goto err;
                        }
                        /* Segment done; continue evaluates the loop
                         * condition, which advances offset.
                         */
                        continue;
                }

                nskb_frag = skb_shinfo(nskb)->frags;

                /* Linear part of the payload, if any. */
                skb_copy_from_linear_data_offset(head_skb, offset,
                                                 skb_put(nskb, hsize), hsize);

                skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags &
                                           SKBFL_SHARED_FRAG;

                if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
                        goto err;

                /* Reference source fragments until the segment holds len bytes. */
                while (pos < offset + len) {
                        if (i >= nfrags) {
                                /* Current source is used up: move on to the
                                 * next frag_list member.
                                 */
                                if (skb_orphan_frags(list_skb, GFP_ATOMIC) ||
                                    skb_zerocopy_clone(nskb, list_skb,
                                                       GFP_ATOMIC))
                                        goto err;

                                i = 0;
                                nfrags = skb_shinfo(list_skb)->nr_frags;
                                frag = skb_shinfo(list_skb)->frags;
                                frag_skb = list_skb;
                                if (!skb_headlen(list_skb)) {
                                        BUG_ON(!nfrags);
                                } else {
                                        BUG_ON(!list_skb->head_frag);

                                        /* to make room for head_frag. */
                                        i--;
                                        frag--;
                                }

                                list_skb = list_skb->next;
                        }

                        if (unlikely(skb_shinfo(nskb)->nr_frags >=
                                     MAX_SKB_FRAGS)) {
                                net_warn_ratelimited(
                                        "skb_segment: too many frags: %u %u\n",
                                        pos, mss);
                                err = -EINVAL;
                                goto err;
                        }

                        /* i < 0 stands for the linear head of frag_skb,
                         * described as a page fragment.
                         */
                        *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
                        __skb_frag_ref(nskb_frag);
                        size = skb_frag_size(nskb_frag);

                        /* Source fragment starts before this segment: skip
                         * the part that belongs to the previous one.
                         */
                        if (pos < offset) {
                                skb_frag_off_add(nskb_frag, offset - pos);
                                skb_frag_size_sub(nskb_frag, offset - pos);
                        }

                        skb_shinfo(nskb)->nr_frags++;

                        if (pos + size <= offset + len) {
                                /* Fragment fully consumed, advance the cursor. */
                                i++;
                                frag++;
                                pos += size;
                        } else {
                                /* Fragment extends past the segment: trim
                                 * the excess and leave the cursor on it for
                                 * the next segment.
                                 */
                                skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
                                goto skip_fraglist;
                        }

                        nskb_frag++;
                }

skip_fraglist:
                /* Account for the paged part of the segment. */
                nskb->data_len = len - hsize;
                nskb->len += nskb->data_len;
                nskb->truesize += nskb->data_len;

perform_csum_check:
                if (!csum) {
                        /* Checksum has to be computed here over the payload. */
                        if (skb_has_shared_frag(nskb) &&
                            __skb_linearize(nskb))
                                goto err;

                        if (!nskb->remcsum_offload)
                                nskb->ip_summed = CHECKSUM_NONE;
                        SKB_GSO_CB(nskb)->csum =
                                skb_checksum(nskb, doffset,
                                             nskb->len - doffset, 0);
                        SKB_GSO_CB(nskb)->csum_start =
                                skb_headroom(nskb) + doffset;
                }
        } while ((offset += len) < head_skb->len);

        /* Some callers want to get the end of the list.
         * Put it in segs->prev to avoid walking the list.
         * (see validate_xmit_skb_list() for example)
         */
        segs->prev = tail;

        if (partial_segs) {
                struct sk_buff *iter;
                int type = skb_shinfo(head_skb)->gso_type;
                unsigned short gso_size = skb_shinfo(head_skb)->gso_size;

                /* Update type to add partial and then remove dodgy if set */
                type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
                type &= ~SKB_GSO_DODGY;

                /* Update GSO info and prepare to start updating headers on
                 * our way back down the stack of protocols.
                 */
                for (iter = segs; iter; iter = iter->next) {
                        skb_shinfo(iter)->gso_size = gso_size;
                        skb_shinfo(iter)->gso_segs = partial_segs;
                        skb_shinfo(iter)->gso_type = type;
                        SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
                }

                /* The last segment may be shorter: it is not a GSO packet
                 * at all when it fits in one gso_size, otherwise its
                 * segment count is recomputed from its actual length.
                 */
                if (tail->len - doffset <= gso_size)
                        skb_shinfo(tail)->gso_size = 0;
                else if (tail != segs)
                        skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
        }

        /* Following permits correct backpressure, for protocols
         * using skb_set_owner_w().
         * Idea is to transfer ownership from head_skb to last segment.
         */
        if (head_skb->destructor == sock_wfree) {
                swap(tail->truesize, head_skb->truesize);
                swap(tail->destructor, head_skb->destructor);
                swap(tail->sk, head_skb->sk);
        }
        return segs;

err:
        /* Drop every segment built so far; head_skb is left to the caller. */
        kfree_skb_list(segs);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(skb_segment);

#ifdef CONFIG_SKB_EXTENSIONS
/* Extension sizes are accounted in chunks of SKB_EXT_ALIGN_VALUE bytes. */
#define SKB_EXT_ALIGN_VALUE        8
/* sizeof(x) rounded up to SKB_EXT_ALIGN_VALUE and expressed as a chunk count. */
#define SKB_EXT_CHUNKSIZEOF(x)        (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)

/* Size of each extension in chunks, indexed by extension id.  An entry is
 * only present when the matching config option is enabled.
 */
static const u8 skb_ext_type_len[] = {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
        [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info),
#endif
#ifdef CONFIG_XFRM
        [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
        [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
#endif
#if IS_ENABLED(CONFIG_MPTCP)
        [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext),
#endif
#if IS_ENABLED(CONFIG_MCTP_FLOWS)
        [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow),
#endif
};

/* Chunk count of the struct skb_ext header plus every entry of
 * skb_ext_type_len[].
 */
static __always_inline unsigned int skb_ext_total_length(void)
{
        unsigned int chunks = SKB_EXT_CHUNKSIZEOF(struct skb_ext);
        int idx = 0;

        for (; idx < ARRAY_SIZE(skb_ext_type_len); idx++)
                chunks += skb_ext_type_len[idx];

        return chunks;
}

/* Create the slab cache for skb extensions.  Build-time checks bound the
 * number of extension ids and (unless KCOV instruments everything) the
 * total chunk count.
 */
static void skb_extensions_init(void)
{
        BUILD_BUG_ON(SKB_EXT_NUM >= 8);
#if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL)
        BUILD_BUG_ON(skb_ext_total_length() > 255);
#endif

        /* Object size is the total chunk count converted back to bytes. */
        skbuff_ext_cache =
                kmem_cache_create("skbuff_ext_cache",
                                  SKB_EXT_ALIGN_VALUE * skb_ext_total_length(),
                                  0,
                                  SLAB_HWCACHE_ALIGN | SLAB_PANIC,
                                  NULL);
}
#else
/* CONFIG_SKB_EXTENSIONS is not set: there is no extension cache to create. */
static void skb_extensions_init(void) {}
#endif

/* The sk_buff slab is critical for network performance, so it is never
 * merged/aliased with similarly sized objects.  This avoids fragmentation
 * that hurts performance of the kmem_cache_{alloc,free}_bulk APIs.
 */
#ifndef CONFIG_SLUB_TINY
#define FLAG_SKB_NO_MERGE        SLAB_NO_MERGE
#else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */
#define FLAG_SKB_NO_MERGE        0
#endif

/* Create the slab caches for sk_buff heads, fast-clone pairs and small skb
 * heads, then set up the skb extension cache.
 */
void __init skb_init(void)
{
        /* Only the cb[] area of an sk_buff is whitelisted for usercopy. */
        net_hotdata.skbuff_cache =
                kmem_cache_create_usercopy("skbuff_head_cache",
                                           sizeof(struct sk_buff),
                                           0,
                                           SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                           FLAG_SKB_NO_MERGE,
                                           offsetof(struct sk_buff, cb),
                                           sizeof_field(struct sk_buff, cb),
                                           NULL);

        net_hotdata.skbuff_fclone_cache =
                kmem_cache_create("skbuff_fclone_cache",
                                  sizeof(struct sk_buff_fclones),
                                  0,
                                  SLAB_HWCACHE_ALIGN | SLAB_PANIC,
                                  NULL);

        /* usercopy should only access the first SKB_SMALL_HEAD_HEADROOM
         * bytes: struct skb_shared_info sits at the end of skb->head and
         * must not be copied to/from user space.
         */
        net_hotdata.skb_small_head_cache =
                kmem_cache_create_usercopy("skbuff_small_head",
                                           SKB_SMALL_HEAD_CACHE_SIZE,
                                           0,
                                           SLAB_HWCACHE_ALIGN | SLAB_PANIC,
                                           0,
                                           SKB_SMALL_HEAD_HEADROOM,
                                           NULL);

        skb_extensions_init();
}

/* Map @len bytes of @skb, starting @offset bytes in, onto entries of @sg.
 * The linear head is mapped first, then each page frag, then every skb on
 * the frag list through a recursive call.  @recursion_level is the nesting
 * depth of those recursive calls.
 *
 * Returns the number of sg entries written, or -EMSGSIZE when the nesting
 * depth reaches 24 or another entry is needed after one that sg_is_last()
 * reports as the last.
 */
static int
__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
               unsigned int recursion_level)
{
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;
        int elt = 0;

        if (unlikely(recursion_level >= 24))
                return -EMSGSIZE;

        /* Part of the requested range lies in the linear head. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                sg_set_buf(sg, skb->data + offset, copy);
                elt++;
                if ((len -= copy) == 0)
                        return elt;
                offset += copy;
        }

        /* Page frags: [start, end) is the byte range of frag i in the skb. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
                if ((copy = end - offset) > 0) {
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                        /* The previous entry is flagged as last: no room. */
                        if (unlikely(elt && sg_is_last(&sg[elt - 1])))
                                return -EMSGSIZE;

                        if (copy > len)
                                copy = len;
                        sg_set_page(&sg[elt], skb_frag_page(frag), copy,
                                    skb_frag_off(frag) + offset - start);
                        elt++;
                        if (!(len -= copy))
                                return elt;
                        offset += copy;
                }
                start = end;
        }

        /* Frag-list skbs: each one is mapped by a nested call. */
        skb_walk_frags(skb, frag_iter) {
                int end, ret;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (unlikely(elt && sg_is_last(&sg[elt - 1])))
                                return -EMSGSIZE;

                        if (copy > len)
                                copy = len;
                        /* Offset is rebased to the start of frag_iter. */
                        ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
                                              copy, recursion_level + 1);
                        if (unlikely(ret < 0))
                                return ret;
                        elt += ret;
                        if ((len -= copy) == 0)
                                return elt;
                        offset += copy;
                }
                start = end;
        }
        /* Anything left over means the caller asked for more than the skb holds. */
        BUG_ON(len);
        return elt;
}

/**
 *        skb_to_sgvec - Fill a scatter-gather list from a socket buffer
 *        @skb: Socket buffer containing the buffers to be mapped
 *        @sg: The scatter-gather list to map into
 *        @offset: The offset into the buffer's contents to start mapping
 *        @len: Length of buffer space to be mapped
 *
 *        Fill the specified scatter-gather list with mappings/pointers into a
 *        region of the buffer space attached to a socket buffer. Returns either
 *        the number of scatterlist items used, or -EMSGSIZE if the contents
 *        could not fit.
 */
int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
{
        int used = __skb_to_sgvec(skb, sg, offset, len, 0);

        /* Terminate the list only when at least one entry was filled in;
         * errors and an empty mapping are passed through untouched.
         */
        if (used > 0)
                sg_mark_end(&sg[used - 1]);

        return used;
}
EXPORT_SYMBOL_GPL(skb_to_sgvec);

/* Unlike skb_to_sgvec(), skb_to_sgvec_nomark() only maps the skb into the
 * given sglist and does not mark the entry holding the last skb data as the
 * end.  The caller can therefore keep appending data to the sg list after
 * the first call without calling sg_unmark_end() to extend it.
 *
 * Scenario to use skb_to_sgvec_nomark:
 * 1. sg_init_table
 * 2. skb_to_sgvec_nomark(payload1)
 * 3. skb_to_sgvec_nomark(payload2)
 *
 * This is equivalent to:
 * 1. sg_init_table
 * 2. skb_to_sgvec(payload1)
 * 3. sg_unmark_end
 * 4. skb_to_sgvec(payload2)
 *
 * When mapping multiple payloads conditionally, skb_to_sgvec_nomark
 * is preferable.
 */
int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
                        int offset, int len)
{
        int used = __skb_to_sgvec(skb, sg, offset, len, 0);

        return used;
}
EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);



/**
 *        skb_cow_data - Check that a socket buffer's data buffers are writable
 *        @skb: The socket buffer to check.
 *        @tailbits: Amount of trailing space to be added
 *        @trailer: Returned pointer to the skb where the @tailbits space begins
 *
 *        Make sure that the data buffers attached to a socket buffer are
 *        writable. If they are not, private copies are made of the data buffers
 *        and the socket buffer is set to use these instead.
 *
 *        If @tailbits is given, make sure that there is space to write @tailbits
 *        bytes of data beyond current end of socket buffer.  @trailer will be
 *        set to point to the skb in which this space begins.
 *
 *        The number of scatterlist elements required to completely map the
 *        COW'd and extended socket buffer will be returned.
 */
int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
{
        int copyflag;
        int elt;
        struct sk_buff *skb1, **skb_p;

        /* If skb is cloned or its head is paged, reallocate
         * head pulling out all the pages (pages are considered not writable
         * at the moment even if they are anonymous).
         */
        if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
            !__pskb_pull_tail(skb, __skb_pagelen(skb)))
                return -ENOMEM;

        /* Easy case. Most of packets will go this way. */
        if (!skb_has_frag_list(skb)) {
                /* A little of trouble, not enough of space for trailer.
                 * This should not happen, when stack is tuned to generate
                 * good frames. OK, on miss we reallocate and reserve even more
                 * space, 128 bytes is fair. */

                if (skb_tailroom(skb) < tailbits &&
                    pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
                        return -ENOMEM;

                /* Single linear buffer: it is its own trailer, one element. */
                *trailer = skb;
                return 1;
        }

        /* Hard case: walk the frag list and replace every fragment that
         * cannot be written in place.
         */

        /* elt starts at 1 to account for the head skb itself. */
        elt = 1;
        skb_p = &skb_shinfo(skb)->frag_list;
        copyflag = 0;

        while ((skb1 = *skb_p) != NULL) {
                int ntail = 0;

                /* The fragment is partially pulled by someone,
                 * this can happen on input. Copy it and everything
                 * after it. */

                if (skb_shared(skb1))
                        copyflag = 1;

                /* If the skb is the last, worry about trailer. */

                if (skb1->next == NULL && tailbits) {
                        if (skb_shinfo(skb1)->nr_frags ||
                            skb_has_frag_list(skb1) ||
                            skb_tailroom(skb1) < tailbits)
                                ntail = tailbits + 128;
                }

                if (copyflag ||
                    skb_cloned(skb1) ||
                    ntail ||
                    skb_shinfo(skb1)->nr_frags ||
                    skb_has_frag_list(skb1)) {
                        struct sk_buff *skb2;

                        /* This fragment must be replaced by a private copy;
                         * ask for ntail bytes of extra tailroom when it is
                         * the last fragment and needs room for the trailer.
                         */
                        if (ntail == 0)
                                skb2 = skb_copy(skb1, GFP_ATOMIC);
                        else
                                skb2 = skb_copy_expand(skb1,
                                                       skb_headroom(skb1),
                                                       ntail,
                                                       GFP_ATOMIC);
                        if (unlikely(skb2 == NULL))
                                return -ENOMEM;

                        /* Keep the copy owned by the same socket, if any. */
                        if (skb1->sk)
                                skb_set_owner_w(skb2, skb1->sk);

                        /* Link the copy into the list in place of the
                         * original, then drop the original.
                         */

                        skb2->next = skb1->next;
                        *skb_p = skb2;
                        kfree_skb(skb1);
                        skb1 = skb2;
                }
                elt++;
                /* Updated every iteration, so it ends up on the last fragment. */
                *trailer = skb1;
                skb_p = &skb1->next;
        }

        return elt;
}
EXPORT_SYMBOL_GPL(skb_cow_data);

/* skb destructor: give the skb's truesize back to its socket's
 * sk_rmem_alloc counter.
 */
static void sock_rmem_free(struct sk_buff *skb)
{
        atomic_sub(skb->truesize, &skb->sk->sk_rmem_alloc);
}

/* Tag @skb as belonging to an error queue. */
static void skb_set_err_queue(struct sk_buff *skb)
{
        /* The marker must be distinguishable from a zeroed pkt_type. */
        BUILD_BUG_ON(PACKET_OUTGOING == 0);

        /* skbs received on local sockets never carry PACKET_OUTGOING, so
         * the value can safely be (mis)used as the error-queue marker.
         */
        skb->pkt_type = PACKET_OUTGOING;
}

/*
 * Queue @skb on the error queue of @sk, charging its truesize to
 * sk_rmem_alloc.  Returns 0 on success, or -ENOMEM when the charge would
 * reach sk_rcvbuf.
 *
 * Note: error packets are not mem charged (no sk_forward_alloc changes).
 */
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
{
        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned int)READ_ONCE(sk->sk_rcvbuf))
                return -ENOMEM;

        /* Detach from the previous owner, then hand the skb to @sk. */
        skb_orphan(skb);
        skb->destructor = sock_rmem_free;
        skb->sk = sk;
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
        skb_set_err_queue(skb);

        /* The dst must be refcounted before the rcu section is left. */
        skb_dst_force(skb);

        skb_queue_tail(&sk->sk_error_queue, skb);

        if (sock_flag(sk, SOCK_DEAD))
                return 0;

        sk_error_report(sk);
        return 0;
}
EXPORT_SYMBOL(sock_queue_err_skb);

static bool is_icmp_err_skb(const struct sk_buff *skb)
{
        return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
                       SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
}

struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
{
        struct sk_buff_head *q = &sk->sk_error_queue;
        struct sk_buff *skb, *skb_next = NULL;
        bool icmp_next = false;
        unsigned long flags;

        if (skb_queue_empty_lockless(q))
                return NULL;

        spin_lock_irqsave(&q->lock, flags);
        skb = __skb_dequeue(q);
        if (skb && (skb_next = skb_peek(q))) {
                icmp_next = is_icmp_err_skb(skb_next);
                if (icmp_next)
                        sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
        }
        spin_unlock_irqrestore(&q->lock, flags);

        if (is_icmp_err_skb(skb) && !icmp_next)
                sk->sk_err = 0;

        if (skb_next)
                sk_error_report(sk);

        return skb;
}
EXPORT_SYMBOL(sock_dequeue_err_skb);

/**
 * skb_clone_sk - create clone of skb, and take reference to socket
 * @skb: the skb to clone
 *
 * This function creates a clone of a buffer that holds a reference on
 * sk_refcnt.  Buffers created via this function are meant to be
 * returned using sock_queue_err_skb, or free via kfree_skb.
 *
 * When passing buffers allocated with this function to sock_queue_err_skb
 * it is necessary to wrap the call with sock_hold/sock_put in order to
 * prevent the socket from being released prior to being enqueued on
 * the sk_error_queue.
 */
struct sk_buff *skb_clone_sk(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct sk_buff *copy;

        if (!sk)
                return NULL;

        /* The socket is already on its way out: do not resurrect it. */
        if (!refcount_inc_not_zero(&sk->sk_refcnt))
                return NULL;

        copy = skb_clone(skb, GFP_ATOMIC);
        if (copy) {
                /* The clone now carries the reference taken above. */
                copy->sk = sk;
                copy->destructor = sock_efree;
        } else {
                sock_put(sk);
        }

        return copy;
}
EXPORT_SYMBOL(skb_clone_sk);

static void __skb_complete_tx_timestamp(struct sk_buff *skb,
                                        struct sock *sk,
                                        int tstype,
                                        bool opt_stats)
{
        struct sock_exterr_skb *serr;
        int err;

        BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));

        serr = SKB_EXT_ERR(skb);
        memset(serr, 0, sizeof(*serr));
        serr->ee.ee_errno = ENOMSG;
        serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
        serr->ee.ee_info = tstype;
        serr->opt_stats = opt_stats;
        serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
        if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
                serr->ee.ee_data = skb_shinfo(skb)->tskey;
                if (sk_is_tcp(sk))
                        serr->ee.ee_data -= atomic_read(&sk->sk_tskey);
        }

        err = sock_queue_err_skb(sk, skb);

        if (err)
                kfree_skb(skb);
}

/* Decide whether a TX timestamp may be delivered to @sk.  Always allowed
 * when sysctl_tstamp_allow_data is set or only the timestamp is requested;
 * otherwise the socket's file must hold CAP_NET_RAW in init_user_ns.
 */
static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
{
        bool allowed = false;

        if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly))
                return true;

        read_lock_bh(&sk->sk_callback_lock);
        if (sk->sk_socket && sk->sk_socket->file)
                allowed = file_ns_capable(sk->sk_socket->file, &init_user_ns,
                                          CAP_NET_RAW);
        read_unlock_bh(&sk->sk_callback_lock);

        return allowed;
}

void skb_complete_tx_timestamp(struct sk_buff *skb,
                               struct skb_shared_hwtstamps *hwtstamps)
{
        struct sock *sk = skb->sk;

        if (!skb_may_tx_timestamp(sk, false))
                goto err;

        /* Take a reference to prevent skb_orphan() from freeing the socket,
         * but only if the socket refcount is not zero.
         */
        if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
                *skb_hwtstamps(skb) = *hwtstamps;
                __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
                sock_put(sk);
                return;
        }

err:
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);

/* Generate a TX timestamp notification for @orig_skb and queue it on the
 * error queue of @sk.
 *
 * @orig_skb:  packet the timestamp refers to
 * @ack_skb:   passed through to tcp_get_timestamping_opt_stats()
 * @hwtstamps: hardware timestamps to report, or NULL to take a software
 *             timestamp
 * @sk:        socket to notify; nothing is done when NULL
 * @tstype:    timestamp type stored in the notification's ee_info
 *
 * With SOF_TIMESTAMPING_OPT_TSONLY the notification is a fresh skb carrying
 * only the timestamp (or TCP stats); otherwise it is a clone of @orig_skb.
 */
void __skb_tstamp_tx(struct sk_buff *orig_skb,
                     const struct sk_buff *ack_skb,
                     struct skb_shared_hwtstamps *hwtstamps,
                     struct sock *sk, int tstype)
{
        struct sk_buff *skb;
        bool tsonly, opt_stats = false;
        u32 tsflags;

        if (!sk)
                return;

        /* No software timestamp while a hardware one is still in progress,
         * unless the socket asked for both.
         */
        tsflags = READ_ONCE(sk->sk_tsflags);
        if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
            skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
                return;

        tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
        if (!skb_may_tx_timestamp(sk, tsonly))
                return;

        if (tsonly) {
#ifdef CONFIG_INET
                if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
                    sk_is_tcp(sk)) {
                        skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
                                                             ack_skb);
                        opt_stats = true;
                } else
#endif
                        skb = alloc_skb(0, GFP_ATOMIC);
        } else {
                skb = skb_clone(orig_skb, GFP_ATOMIC);
                /* Check the clone before handing it to any helper, so a
                 * failed allocation never reaches skb_orphan_frags_rx().
                 */
                if (!skb)
                        return;

                if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) {
                        kfree_skb(skb);
                        return;
                }
        }
        /* Covers allocation failure on the tsonly paths. */
        if (!skb)
                return;

        if (tsonly) {
                /* The fresh skb inherits the timestamp flags and key. */
                skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags &
                                             SKBTX_ANY_TSTAMP;
                skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
        }

        if (hwtstamps)
                *skb_hwtstamps(skb) = *hwtstamps;
        else
                __net_timestamp(skb);

        __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);

/* Report a SCM_TSTAMP_SND timestamp for @orig_skb to the socket that owns
 * it.  @hwtstamps may be NULL, in which case a software timestamp is taken.
 */
void skb_tstamp_tx(struct sk_buff *orig_skb,
                   struct skb_shared_hwtstamps *hwtstamps)
{
        /* Plain call: a void function must not return an expression. */
        __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk,
                        SCM_TSTAMP_SND);
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);

#ifdef CONFIG_WIRELESS
/* Record the wifi ack status in @skb and deliver it to the owning socket's
 * error queue as a SO_EE_ORIGIN_TXSTATUS notification.  The skb is freed
 * when it cannot be queued.
 */
void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
{
        struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
        struct sock *sk = skb->sk;
        bool queued = false;

        skb->wifi_acked_valid = 1;
        skb->wifi_acked = acked;

        memset(serr, 0, sizeof(*serr));
        serr->ee.ee_errno = ENOMSG;
        serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;

        /* Take a reference to prevent skb_orphan() from freeing the socket,
         * but only if the socket refcount is not zero.
         */
        if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
                queued = !sock_queue_err_skb(sk, skb);
                sock_put(sk);
        }

        if (!queued)
                kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
#endif /* CONFIG_WIRELESS */

/**
 * skb_partial_csum_set - set up and verify partial csum values for packet
 * @skb: the skb to set
 * @start: the number of bytes after skb->data to start checksumming.
 * @off: the offset from start to place the checksum.
 *
 * For untrusted partially-checksummed packets, we need to make sure the values
 * for skb->csum_start and skb->csum_offset are valid so we don't oops.
 *
 * This function checks and sets those values and skb->ip_summed: if this
 * returns false you should drop the packet.
 */
bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
{
        u32 csum_start = skb_headroom(skb) + (u32)start;
        u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);
        bool bogus;

        /* csum_start must fit the 16-bit field and the checksum itself must
         * lie inside the linear head.
         */
        bogus = csum_start >= U16_MAX || csum_end > skb_headlen(skb);
        if (unlikely(bogus)) {
                net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n",
                                     start, off, skb_headroom(skb), skb_headlen(skb));
                return false;
        }

        skb->csum_offset = off;
        skb->csum_start = csum_start;
        skb->transport_header = csum_start;
        skb->ip_summed = CHECKSUM_PARTIAL;
        return true;
}
EXPORT_SYMBOL_GPL(skb_partial_csum_set);

/* Make sure at least @len bytes sit in the linear head of @skb, pulling up
 * to @max bytes (capped at skb->len) when a pull is needed.
 * Returns 0 on success, -ENOMEM if the pull fails, or -EPROTO if the head
 * is still shorter than @len afterwards.
 */
static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
                               unsigned int max)
{
        unsigned int target;

        if (len <= skb_headlen(skb))
                return 0;

        /* If we need to pullup then pullup to the max, so we
         * won't need to do it again.
         */
        target = max > skb->len ? skb->len : max;

        if (!__pskb_pull_tail(skb, target - skb_headlen(skb)))
                return -ENOMEM;

        return skb_headlen(skb) < len ? -EPROTO : 0;
}

#define MAX_TCP_HDR_LEN (15 * 4)

/* Set up CHECKSUM_PARTIAL for the TCP or UDP header found @off bytes into
 * @skb.  Returns a pointer to the header's checksum field, or an ERR_PTR()
 * (-EPROTO for any other protocol or invalid offsets, or the error from
 * pulling the header into the linear head).
 */
static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
                                      typeof(IPPROTO_IP) proto,
                                      unsigned int off)
{
        int err;

        if (proto == IPPROTO_TCP) {
                err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
                                          off + MAX_TCP_HDR_LEN);
                if (err)
                        return ERR_PTR(err);

                if (!skb_partial_csum_set(skb, off,
                                          offsetof(struct tcphdr, check)))
                        return ERR_PTR(-EPROTO);

                return &tcp_hdr(skb)->check;
        }

        if (proto == IPPROTO_UDP) {
                err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
                                          off + sizeof(struct udphdr));
                if (err)
                        return ERR_PTR(err);

                if (!skb_partial_csum_set(skb, off,
                                          offsetof(struct udphdr, check)))
                        return ERR_PTR(-EPROTO);

                return &udp_hdr(skb)->check;
        }

        return ERR_PTR(-EPROTO);
}

/* This value should be large enough to cover a tagged ethernet header plus
 * maximally sized IP and TCP or UDP headers.
 */
#define MAX_IP_HDR_LEN 128

/* IPv4 flavour of skb_checksum_setup(): locate the transport header, set up
 * the partial checksum and, when @recalculate is set, store the
 * pseudo-header checksum.  Fragments are rejected with -EPROTO.
 */
static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
{
        unsigned int off;
        __sum16 *csum;
        int err;

        err = skb_maybe_pull_tail(skb, sizeof(struct iphdr), MAX_IP_HDR_LEN);
        if (err < 0)
                return err;

        if (ip_is_fragment(ip_hdr(skb)))
                return -EPROTO;

        off = ip_hdrlen(skb);

        csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
        if (IS_ERR(csum))
                return PTR_ERR(csum);

        if (recalculate)
                *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
                                           ip_hdr(skb)->daddr,
                                           skb->len - off,
                                           ip_hdr(skb)->protocol, 0);

        return 0;
}

/* This value should be large enough to cover a tagged ethernet header plus
 * an IPv6 header, all options, and a maximal TCP or UDP header.
 */
#define MAX_IPV6_HDR_LEN 256

/* Pointer of type @type to the header located @off bytes past the network
 * header of @skb.  The whole expansion is parenthesized so that a postfix
 * operator applied to the macro (->, [], ...) binds to the cast result.
 */
#define OPT_HDR(type, skb, off) \
        ((type *)(skb_network_header(skb) + (off)))

/* IPv6 flavour of skb_checksum_setup(): walk the extension header chain to
 * find the transport header, set up the partial checksum and, when
 * @recalculate is set, store the pseudo-header checksum.
 *
 * Returns 0 on success, -EPROTO for fragments or when the walk ends without
 * reaching a non-extension header, or the error from pulling headers into
 * the linear head / from skb_checksum_setup_ip().
 */
static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
{
        int err;
        u8 nexthdr;
        unsigned int off;
        unsigned int len;
        bool fragment;
        bool done;
        __sum16 *csum;

        fragment = false;
        done = false;

        off = sizeof(struct ipv6hdr);

        err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
        if (err < 0)
                goto out;

        nexthdr = ipv6_hdr(skb)->nexthdr;

        /* Walk bound: fixed header plus the payload length it announces. */
        len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
        while (off <= len && !done) {
                switch (nexthdr) {
                case IPPROTO_DSTOPTS:
                case IPPROTO_HOPOPTS:
                case IPPROTO_ROUTING: {
                        struct ipv6_opt_hdr *hp;

                        /* Make the option header readable before parsing it. */
                        err = skb_maybe_pull_tail(skb,
                                                  off +
                                                  sizeof(struct ipv6_opt_hdr),
                                                  MAX_IPV6_HDR_LEN);
                        if (err < 0)
                                goto out;

                        hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
                        nexthdr = hp->nexthdr;
                        off += ipv6_optlen(hp);
                        break;
                }
                case IPPROTO_AH: {
                        struct ip_auth_hdr *hp;

                        err = skb_maybe_pull_tail(skb,
                                                  off +
                                                  sizeof(struct ip_auth_hdr),
                                                  MAX_IPV6_HDR_LEN);
                        if (err < 0)
                                goto out;

                        hp = OPT_HDR(struct ip_auth_hdr, skb, off);
                        nexthdr = hp->nexthdr;
                        off += ipv6_authlen(hp);
                        break;
                }
                case IPPROTO_FRAGMENT: {
                        struct frag_hdr *hp;

                        err = skb_maybe_pull_tail(skb,
                                                  off +
                                                  sizeof(struct frag_hdr),
                                                  MAX_IPV6_HDR_LEN);
                        if (err < 0)
                                goto out;

                        hp = OPT_HDR(struct frag_hdr, skb, off);

                        /* A non-zero offset or the more-fragments bit marks
                         * a real fragment.
                         */
                        if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
                                fragment = true;

                        nexthdr = hp->nexthdr;
                        off += sizeof(struct frag_hdr);
                        break;
                }
                default:
                        /* Not an extension header handled above: stop here. */
                        done = true;
                        break;
                }
        }

        err = -EPROTO;

        /* Walk ran past the bound without finishing, or packet is a fragment. */
        if (!done || fragment)
                goto out;

        csum = skb_checksum_setup_ip(skb, nexthdr, off);
        if (IS_ERR(csum))
                return PTR_ERR(csum);

        if (recalculate)
                *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
                                         &ipv6_hdr(skb)->daddr,
                                         skb->len - off, nexthdr, 0);
        err = 0;

out:
        return err;
}

/**
 * skb_checksum_setup - set up partial checksum offset
 * @skb: the skb to set up
 * @recalculate: if true the pseudo-header checksum will be recalculated
 */
int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
{
        /* Dispatch on the L3 protocol; anything but IPv4/IPv6 is rejected. */
        switch (skb->protocol) {
        case htons(ETH_P_IP):
                return skb_checksum_setup_ipv4(skb, recalculate);

        case htons(ETH_P_IPV6):
                return skb_checksum_setup_ipv6(skb, recalculate);

        default:
                return -EPROTO;
        }
}
EXPORT_SYMBOL(skb_checksum_setup);

/**
 * skb_checksum_maybe_trim - maybe trims the given skb
 * @skb: the skb to check
 * @transport_len: the data length beyond the network header
 *
 * Checks whether the given skb has data beyond the given transport length.
 * If so, returns a cloned skb trimmed to this transport length.
 * Otherwise returns the provided skb. Returns NULL in error cases
 * (e.g. transport_len exceeds skb length or out-of-memory).
 *
 * Caller needs to set the skb transport header and free any returned skb if it
 * differs from the provided skb.
 */
static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
                                               unsigned int transport_len)
{
        unsigned int len = skb_transport_offset(skb) + transport_len;
        struct sk_buff *trimmed;

        /* Shorter than announced: error. */
        if (skb->len < len)
                return NULL;

        /* Exact fit: nothing to trim, hand back the original. */
        if (skb->len == len)
                return skb;

        trimmed = skb_clone(skb, GFP_ATOMIC);
        if (!trimmed)
                return NULL;

        if (pskb_trim_rcsum(trimmed, len)) {
                kfree_skb(trimmed);
                return NULL;
        }

        return trimmed;
}

/**
 * skb_checksum_trimmed - validate checksum of an skb
 * @skb: the skb to check
 * @transport_len: the data length beyond the network header
 * @skb_chkf: checksum function to use
 *
 * Applies the given checksum function skb_chkf to the provided skb.
 * Returns a checked and maybe trimmed skb. Returns NULL on error.
 *
 * If the skb has data beyond the given transport length, then a
 * trimmed & cloned skb is checked and returned.
 *
 * Caller needs to set the skb transport header and free any returned skb if it
 * differs from the provided skb.
 */
struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
                                     unsigned int transport_len,
                                     __sum16(*skb_chkf)(struct sk_buff *skb))
{
        unsigned int offset = skb_transport_offset(skb);
        struct sk_buff *skb_chk;
        __sum16 bad;

        skb_chk = skb_checksum_maybe_trim(skb, transport_len);
        if (!skb_chk)
                return NULL;

        if (pskb_may_pull(skb_chk, offset)) {
                /* Run the checker with data pointing at the transport
                 * header, then restore the original data pointer.
                 */
                skb_pull_rcsum(skb_chk, offset);
                bad = skb_chkf(skb_chk);
                skb_push_rcsum(skb_chk, offset);

                if (!bad)
                        return skb_chk;
        }

        /* Failure: release the trimmed clone, never the caller's skb. */
        if (skb_chk != skb)
                kfree_skb(skb_chk);

        return NULL;
}
EXPORT_SYMBOL(skb_checksum_trimmed);

/* Emit a rate-limited warning naming the device of @skb. */
void __skb_warn_lro_forwarding(const struct sk_buff *skb)
{
        const char *ifname = skb->dev->name;

        net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
                             ifname);
}
EXPORT_SYMBOL(__skb_warn_lro_forwarding);

/* Free @skb.  When @head_stolen is set, only the head state is released and
 * the sk_buff struct itself is returned to its slab cache; otherwise the
 * skb is freed through __kfree_skb().
 */
void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
{
        if (!head_stolen) {
                __kfree_skb(skb);
                return;
        }

        skb_release_head_state(skb);
        kmem_cache_free(net_hotdata.skbuff_cache, skb);
}
EXPORT_SYMBOL(kfree_skb_partial);

/**
 * skb_try_coalesce - try to merge skb to prior one
 * @to: prior buffer
 * @from: buffer to add
 * @fragstolen: pointer to boolean; cleared on entry and set to true only when
 *              the linear head of @from was attached to @to as a page fragment
 * @delta_truesize: how much more was allocated than was requested; written
 *                  only on success (0 when the data was copied into the
 *                  tailroom of @to, otherwise the amount added to
 *                  @to->truesize)
 *
 * Returns true when the payload of @from has been appended to @to.
 * Returns false, leaving both buffers as they were, when @to is cloned, the
 * two skbs disagree on pp_recycle, either one has a frag_list or is zerocopy,
 * the combined fragment count would not fit, or the head of @from is locked.
 */
bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
                      bool *fragstolen, int *delta_truesize)
{
        struct skb_shared_info *to_shinfo, *from_shinfo;
        int i, delta, len = from->len;

        *fragstolen = false;

        if (skb_cloned(to))
                return false;

        /* In general, avoid mixing page_pool and non-page_pool allocated
         * pages within the same SKB. In theory we could take full
         * references if @from is cloned and !@to->pp_recycle but its
         * tricky (due to potential race with the clone disappearing) and
         * rare, so not worth dealing with.
         */
        if (to->pp_recycle != from->pp_recycle)
                return false;

        /* Simple case: everything in @from fits into the tailroom of @to,
         * so the bytes are copied and no fragment is touched.
         */
        if (len <= skb_tailroom(to)) {
                if (len)
                        BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
                *delta_truesize = 0;
                return true;
        }

        to_shinfo = skb_shinfo(to);
        from_shinfo = skb_shinfo(from);
        if (to_shinfo->frag_list || from_shinfo->frag_list)
                return false;
        if (skb_zcopy(to) || skb_zcopy(from))
                return false;

        if (skb_headlen(from) != 0) {
                struct page *page;
                unsigned int offset;

                /* The head of @from consumes one extra fragment slot in @to,
                 * hence ">=" here versus ">" in the headless case below.
                 */
                if (to_shinfo->nr_frags +
                    from_shinfo->nr_frags >= MAX_SKB_FRAGS)
                        return false;

                if (skb_head_is_locked(from))
                        return false;

                delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));

                /* Describe the linear data of @from as a page fragment. */
                page = virt_to_head_page(from->head);
                offset = from->data - (unsigned char *)page_address(page);

                skb_fill_page_desc(to, to_shinfo->nr_frags,
                                   page, offset, skb_headlen(from));
                *fragstolen = true;
        } else {
                if (to_shinfo->nr_frags +
                    from_shinfo->nr_frags > MAX_SKB_FRAGS)
                        return false;

                delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
        }

        WARN_ON_ONCE(delta < len);

        /* Append the fragment descriptors of @from after those of @to. */
        memcpy(to_shinfo->frags + to_shinfo->nr_frags,
               from_shinfo->frags,
               from_shinfo->nr_frags * sizeof(skb_frag_t));
        to_shinfo->nr_frags += from_shinfo->nr_frags;

        if (!skb_cloned(from))
                from_shinfo->nr_frags = 0;

        /* if the skb is not cloned this does nothing
         * since we set nr_frags to 0.
         */
        if (skb_pp_frag_ref(from)) {
                for (i = 0; i < from_shinfo->nr_frags; i++)
                        __skb_frag_ref(&from_shinfo->frags[i]);
        }

        /* All of @from's payload now lives in the paged part of @to. */
        to->truesize += delta;
        to->len += len;
        to->data_len += len;

        *delta_truesize = delta;
        return true;
}
EXPORT_SYMBOL(skb_try_coalesce);

/**
 * skb_scrub_packet - scrub an skb
 *
 * @skb: buffer to clean
 * @xnet: packet is crossing netns
 *
 * Intended for use after a packet has been encapsulated into, or decapsulated
 * from, a tunnel: some per-packet information has to be cleared during these
 * operations.
 * It can also be used to clean an skb before injecting it into another
 * namespace (@xnet == true). In that case all information in the skb that
 * could impact namespace isolation is cleared as well.
 */
void skb_scrub_packet(struct sk_buff *skb, bool xnet)
{
        /* State that is reset unconditionally. */
        skb->pkt_type = PACKET_HOST;
        skb->skb_iif = 0;
        skb->ignore_df = 0;
        skb_dst_drop(skb);
        skb_ext_reset(skb);
        nf_reset_ct(skb);
        nf_reset_trace(skb);

#ifdef CONFIG_NET_SWITCHDEV
        skb->offload_fwd_mark = 0;
        skb->offload_l3_fwd_mark = 0;
#endif

        /* Additional state dropped only when crossing into another netns. */
        if (xnet) {
                ipvs_reset(skb);
                skb->mark = 0;
                skb_clear_tstamp(skb);
        }
}
EXPORT_SYMBOL_GPL(skb_scrub_packet);

/* Shift the leading part of the MAC header, and any metadata, of @skb forward
 * by VLAN_HLEN bytes and advance skb->mac_header accordingly.
 * Returns @skb, or NULL after freeing it when skb_cow() fails.
 */
static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
{
        unsigned char *mac;
        int mac_len, meta_len;

        if (skb_cow(skb, skb_headroom(skb)) < 0) {
                kfree_skb(skb);
                return NULL;
        }

        /* Look up the MAC header only after skb_cow() has run. */
        mac = skb_mac_header(skb);
        mac_len = skb->data - mac;
        if (likely(mac_len > VLAN_HLEN + ETH_TLEN))
                memmove(mac + VLAN_HLEN, mac, mac_len - VLAN_HLEN - ETH_TLEN);

        meta_len = skb_metadata_len(skb);
        if (meta_len) {
                void *meta = skb_metadata_end(skb) - meta_len;

                memmove(meta + VLAN_HLEN, meta, meta_len);
        }

        skb->mac_header += VLAN_HLEN;
        return skb;
}

/* Move the VLAN header found at skb->data into the hwaccel tag of @skb.
 * An skb that already carries a hwaccel tag is returned untouched.
 * Returns the resulting skb (skb_share_check() may substitute a different
 * one), or NULL after the skb has been freed on failure.
 */
struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
{
        struct vlan_hdr *hdr;
        u16 tci;

        /* vlan_tci is already set-up so leave this for another time */
        if (unlikely(skb_vlan_tag_present(skb)))
                return skb;

        skb = skb_share_check(skb, GFP_ATOMIC);
        if (unlikely(!skb))
                goto drop;

        /* vlan_set_encap_proto() may access the two bytes that follow the
         * vlan_hdr, so they have to be pullable too.
         */
        if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short))))
                goto drop;

        hdr = (struct vlan_hdr *)skb->data;
        tci = ntohs(hdr->h_vlan_TCI);
        __vlan_hwaccel_put_tag(skb, skb->protocol, tci);

        skb_pull_rcsum(skb, VLAN_HLEN);
        vlan_set_encap_proto(skb, hdr);

        skb = skb_reorder_vlan_header(skb);
        if (unlikely(!skb))
                goto drop;

        skb_reset_network_header(skb);
        if (!skb_transport_header_was_set(skb))
                skb_reset_transport_header(skb);
        skb_reset_mac_len(skb);

        return skb;

drop:
        kfree_skb(skb);
        return NULL;
}
EXPORT_SYMBOL(skb_vlan_untag);

/* Make sure the first @write_len bytes of @skb can be written to.
 * Returns 0 on success, -ENOMEM when the bytes cannot be pulled, or whatever
 * pskb_expand_head() returns when the head has to be reallocated.
 */
int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len)
{
        if (!pskb_may_pull(skb, write_len))
                return -ENOMEM;

        /* Only a cloned skb whose clone is not writable up to @write_len
         * needs its head expanded.
         */
        if (skb_cloned(skb) && !skb_clone_writable(skb, write_len))
                return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);

        return 0;
}
EXPORT_SYMBOL(skb_ensure_writable);

/* Make sure @skb is not cloned and has at least the headroom and tailroom
 * that @dev asks for, reallocating the head when necessary.
 * Returns 0 when nothing had to be done, otherwise the result of
 * pskb_expand_head().
 */
int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev)
{
        int head_deficit = dev->needed_headroom;
        int tail_deficit = dev->needed_tailroom;

        /* For tail taggers, we need to pad short frames ourselves, to ensure
         * that the tail tag does not fail at its role of being at the end of
         * the packet, once the conduit interface pads the frame. Account for
         * that pad length here, and pad later.
         */
        if (unlikely(tail_deficit && skb->len < ETH_ZLEN))
                tail_deficit += ETH_ZLEN - skb->len;

        /* skb_headroom() returns unsigned int, so clamp the differences as
         * signed values to get "how much is still missing, at least 0".
         */
        head_deficit = max_t(int, head_deficit - skb_headroom(skb), 0);
        tail_deficit = max_t(int, tail_deficit - skb_tailroom(skb), 0);

        if (unlikely(head_deficit || tail_deficit || skb_cloned(skb)))
                return pskb_expand_head(skb, head_deficit, tail_deficit,
                                        GFP_ATOMIC);

        /* No reallocation needed, yay! */
        return 0;
}
EXPORT_SYMBOL(skb_ensure_writable_head_tail);

/* Remove the in-payload VLAN header from @skb and update the checksum and
 * header offsets accordingly. @vlan_tci is handed to vlan_remove_tag().
 * Expects skb->data at the mac header and an skb that has no hwaccel tag
 * (non skb_vlan_tag_present) but carries a VLAN tag in its payload.
 * Returns 0 on success, -errno otherwise.
 */
int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
{
        int data_off = skb->data - skb_mac_header(skb);
        int err;

        if (WARN_ONCE(data_off,
                      "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
                      data_off))
                return -EINVAL;

        err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
        if (unlikely(err))
                return err;

        /* The tag sits right behind the two MAC addresses. */
        skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
        vlan_remove_tag(skb, vlan_tci);
        skb->mac_header += VLAN_HLEN;

        if (skb_network_offset(skb) < ETH_HLEN)
                skb_set_network_header(skb, ETH_HLEN);

        skb_reset_mac_len(skb);

        return 0;
}
EXPORT_SYMBOL(__skb_vlan_pop);

/* Pop one vlan tag, taken from the hwaccel tag if present and from the
 * payload otherwise. If the packet is still VLAN-tagged afterwards, that next
 * tag is moved from the payload into the hwaccel tag.
 * Expects skb->data at mac header.
 * Returns 0 on success (including "nothing to pop"), -errno otherwise.
 */
int skb_vlan_pop(struct sk_buff *skb)
{
        __be16 proto;
        u16 tci;
        int err;

        if (likely(skb_vlan_tag_present(skb))) {
                __vlan_hwaccel_clear_tag(skb);
        } else if (unlikely(!eth_type_vlan(skb->protocol))) {
                return 0;
        } else {
                err = __skb_vlan_pop(skb, &tci);
                if (err)
                        return err;
        }

        /* move next vlan tag to hw accel tag */
        if (likely(!eth_type_vlan(skb->protocol)))
                return 0;

        proto = skb->protocol;
        err = __skb_vlan_pop(skb, &tci);
        if (unlikely(err))
                return err;

        __vlan_hwaccel_put_tag(skb, proto, tci);
        return 0;
}
EXPORT_SYMBOL(skb_vlan_pop);

/* Push a vlan tag. The new tag always goes into the hwaccel slot; if that
 * slot is already occupied, the existing hwaccel tag is first inserted into
 * the payload.
 * Expects skb->data at mac header.
 * Returns 0 on success, -errno otherwise.
 */
int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
{
        int data_off;
        int err;

        if (!skb_vlan_tag_present(skb))
                goto set_hwaccel;

        data_off = skb->data - skb_mac_header(skb);
        if (WARN_ONCE(data_off,
                      "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
                      data_off))
                return -EINVAL;

        /* Spill the current hwaccel tag into the packet data. */
        err = __vlan_insert_tag(skb, skb->vlan_proto, skb_vlan_tag_get(skb));
        if (err)
                return err;

        skb->protocol = skb->vlan_proto;
        skb->mac_len += VLAN_HLEN;

        skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);

set_hwaccel:
        __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
        return 0;
}
EXPORT_SYMBOL(skb_vlan_push);

/**
 * skb_eth_pop() - Drop the Ethernet header at the head of a packet
 *
 * @skb: Socket buffer to modify
 *
 * Remove the Ethernet header from the front of @skb and reset the mac header
 * and mac length to the new start of data.
 *
 * Expects that skb->data points to the mac header and that no VLAN tags are
 * present.
 *
 * Returns 0 on success, -EPROTO when the header cannot be pulled, the packet
 * is VLAN tagged, or the network header starts less than ETH_HLEN bytes in.
 */
int skb_eth_pop(struct sk_buff *skb)
{
        if (!pskb_may_pull(skb, ETH_HLEN))
                return -EPROTO;

        if (skb_vlan_tagged(skb))
                return -EPROTO;

        if (skb_network_offset(skb) < ETH_HLEN)
                return -EPROTO;

        skb_pull_rcsum(skb, ETH_HLEN);
        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        return 0;
}
EXPORT_SYMBOL(skb_eth_pop);

/**
 * skb_eth_push() - Add a new Ethernet header at the head of a packet
 *
 * @skb: Socket buffer to modify
 * @dst: Destination MAC address of the new header
 * @src: Source MAC address of the new header
 *
 * Prepend a new Ethernet header to @skb, using skb->protocol as its
 * ethertype.
 *
 * Expects that skb->data points to the mac header, which must be empty.
 *
 * Returns 0 on success, -EPROTO when the network offset is non-zero or a
 * hwaccel VLAN tag is present, or the error from skb_cow_head().
 */
int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
                 const unsigned char *src)
{
        struct ethhdr *hdr;
        int err;

        if (skb_network_offset(skb))
                return -EPROTO;

        if (skb_vlan_tag_present(skb))
                return -EPROTO;

        err = skb_cow_head(skb, sizeof(*hdr));
        if (err < 0)
                return err;

        skb_push(skb, sizeof(*hdr));
        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        hdr = eth_hdr(skb);
        ether_addr_copy(hdr->h_dest, dst);
        ether_addr_copy(hdr->h_source, src);
        hdr->h_proto = skb->protocol;

        skb_postpush_rcsum(skb, hdr, sizeof(*hdr));

        return 0;
}
EXPORT_SYMBOL(skb_eth_push);

/* Set the ethertype of @hdr to @ethertype. For CHECKSUM_COMPLETE skbs,
 * skb->csum is adjusted by summing in the complement of the old value
 * together with the new one.
 */
static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
                             __be16 ethertype)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE) {
                __be16 delta[2] = { ~hdr->h_proto, ethertype };

                skb->csum = csum_partial((char *)delta, sizeof(delta),
                                         skb->csum);
        }

        hdr->h_proto = ethertype;
}

/**
 * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of
 *                   the packet
 *
 * @skb: buffer
 * @mpls_lse: MPLS label stack entry to push
 * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
 * @mac_len: length of the MAC header
 * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is
 *            ethernet
 *
 * Expects skb->data at mac header.
 *
 * Returns 0 on success, -EINVAL when @mpls_proto is not an MPLS ethertype or
 * the skb is marked as encapsulated, or the error from skb_cow_head().
 */
int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
                  int mac_len, bool ethernet)
{
        struct mpls_shim_hdr *shim;
        int err;

        if (unlikely(!eth_p_mpls(mpls_proto)))
                return -EINVAL;

        /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
        if (skb->encapsulation)
                return -EINVAL;

        err = skb_cow_head(skb, MPLS_HLEN);
        if (unlikely(err))
                return err;

        /* Record the current network header and protocol as the inner ones,
         * unless an inner protocol has been set already.
         */
        if (!skb->inner_protocol) {
                skb_set_inner_network_header(skb, skb_network_offset(skb));
                skb_set_inner_protocol(skb, skb->protocol);
        }

        /* Grow the packet at the front and move the MAC header into the new
         * space, which leaves an MPLS_HLEN sized gap right behind it.
         */
        skb_push(skb, MPLS_HLEN);
        memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), mac_len);
        skb_reset_mac_header(skb);
        skb_set_network_header(skb, mac_len);
        skb_reset_mac_len(skb);

        shim = mpls_hdr(skb);
        shim->label_stack_entry = mpls_lse;
        skb_postpush_rcsum(skb, shim, MPLS_HLEN);

        if (ethernet && mac_len >= ETH_HLEN)
                skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
        skb->protocol = mpls_proto;

        return 0;
}
EXPORT_SYMBOL_GPL(skb_mpls_push);

/**
 * skb_mpls_pop() - pop the outermost MPLS header
 *
 * @skb: buffer
 * @next_proto: ethertype of header after popped MPLS header
 * @mac_len: length of the MAC header
 * @ethernet: flag to indicate if the packet is ethernet
 *
 * Expects skb->data at mac header.
 *
 * Returns 0 on success or when skb->protocol is not an MPLS ethertype (the
 * skb is then left untouched), -errno otherwise.
 */
int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
                 bool ethernet)
{
        int err;

        if (unlikely(!eth_p_mpls(skb->protocol)))
                return 0;

        err = skb_ensure_writable(skb, mac_len + MPLS_HLEN);
        if (unlikely(err))
                return err;

        /* Take the label out of the checksum, then slide the MAC header
         * forward over it before dropping the leading MPLS_HLEN bytes.
         */
        skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
        memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), mac_len);

        __skb_pull(skb, MPLS_HLEN);
        skb_reset_mac_header(skb);
        skb_set_network_header(skb, mac_len);

        if (ethernet && mac_len >= ETH_HLEN) {
                /* use mpls_hdr() to get ethertype to account for VLANs. */
                void *after_mac = mpls_hdr(skb);
                struct ethhdr *eth = after_mac - ETH_HLEN;

                skb_mod_eth_type(skb, eth, next_proto);
        }
        skb->protocol = next_proto;

        return 0;
}
EXPORT_SYMBOL_GPL(skb_mpls_pop);

/**
 * skb_mpls_update_lse() - modify outermost MPLS header and update csum
 *
 * @skb: buffer
 * @mpls_lse: new MPLS label stack entry to update to
 *
 * Expects skb->data at mac header.
 *
 * Returns 0 on success, -EINVAL when skb->protocol is not an MPLS ethertype,
 * or the error from skb_ensure_writable().
 */
int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
{
        struct mpls_shim_hdr *shim;
        int err;

        if (unlikely(!eth_p_mpls(skb->protocol)))
                return -EINVAL;

        err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
        if (unlikely(err))
                return err;

        /* Fetch the header only now: skb_ensure_writable() may have
         * reallocated the head.
         */
        shim = mpls_hdr(skb);

        if (skb->ip_summed == CHECKSUM_COMPLETE) {
                __be32 delta[2] = { ~shim->label_stack_entry, mpls_lse };

                skb->csum = csum_partial((char *)delta, sizeof(delta),
                                         skb->csum);
        }

        shim->label_stack_entry = mpls_lse;

        return 0;
}
EXPORT_SYMBOL_GPL(skb_mpls_update_lse);

/**
 * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header
 *
 * @skb: buffer
 *
 * Expects skb->data at mac header.
 *
 * Returns 0 on success, -EINVAL when skb->protocol is not an MPLS ethertype
 * or the decremented TTL is zero, -ENOMEM when the header cannot be pulled,
 * or the error from skb_mpls_update_lse().
 */
int skb_mpls_dec_ttl(struct sk_buff *skb)
{
        u32 entry;
        u8 ttl;

        if (unlikely(!eth_p_mpls(skb->protocol)))
                return -EINVAL;

        if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
                return -ENOMEM;

        entry = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
        ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;

        ttl--;
        if (!ttl)
                return -EINVAL;

        /* Replace the TTL field with the decremented value. */
        entry &= ~MPLS_LS_TTL_MASK;
        entry |= ttl << MPLS_LS_TTL_SHIFT;

        return skb_mpls_update_lse(skb, cpu_to_be32(entry));
}
EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);

/**
 * alloc_skb_with_frags - allocate skb with page frags
 *
 * @header_len: size of linear part
 * @data_len: needed length in frags
 * @order: max page order desired.
 * @errcode: pointer to error code if any; set to -EMSGSIZE when @data_len
 *           exceeds MAX_SKB_FRAGS pages of the requested order, and to
 *           -ENOBUFS otherwise (it is not reset on success)
 * @gfp_mask: allocation mask
 *
 * This can be used to allocate a paged skb, given a maximal order for frags.
 *
 * Returns the new skb, or NULL on failure. The fragments are filled in and
 * skb->truesize is updated here; skb->len and skb->data_len are not touched
 * by this function.
 */
struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
                                     unsigned long data_len,
                                     int order,
                                     int *errcode,
                                     gfp_t gfp_mask)
{
        unsigned long chunk;
        struct sk_buff *skb;
        struct page *page;
        int nr_frags = 0;

        *errcode = -EMSGSIZE;
        if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order)))
                return NULL;

        *errcode = -ENOBUFS;
        skb = alloc_skb(header_len, gfp_mask);
        if (!skb)
                return NULL;

        while (data_len) {
                /* Give up if data remains once MAX_SKB_FRAGS - 1 fragments
                 * have been filled.
                 */
                if (nr_frags == MAX_SKB_FRAGS - 1)
                        goto failure;
                /* Lower the order until one allocation of that order is no
                 * larger than the page-aligned remaining length.
                 */
                while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order))
                        order--;

                if (order) {
                        /* High-order attempt without direct reclaim and
                         * without allocation-failure warnings; on failure
                         * retry this iteration with the next lower order.
                         */
                        page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
                                           __GFP_COMP |
                                           __GFP_NOWARN,
                                           order);
                        if (!page) {
                                order--;
                                continue;
                        }
                } else {
                        /* Order 0 uses the caller's mask unchanged; failing
                         * here fails the whole allocation.
                         */
                        page = alloc_page(gfp_mask);
                        if (!page)
                                goto failure;
                }
                chunk = min_t(unsigned long, data_len,
                              PAGE_SIZE << order);
                skb_fill_page_desc(skb, nr_frags, page, 0, chunk);
                nr_frags++;
                /* truesize accounts for the whole allocation, not just the
                 * bytes used in it.
                 */
                skb->truesize += (PAGE_SIZE << order);
                data_len -= chunk;
        }
        return skb;

failure:
        kfree_skb(skb);
        return NULL;
}
EXPORT_SYMBOL(alloc_skb_with_frags);

/* carve out the first off bytes from skb when off < headlen
 *
 * A new head is allocated; the linear data from @off onwards and the shared
 * info (including all frag descriptors) are copied into it, and the old head
 * is released. Returns 0 on success or -ENOMEM.
 *
 * NOTE(review): skb->len is already reduced by @off when the
 * skb_orphan_frags() failure path returns -ENOMEM.
 */
static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
                                    const int headlen, gfp_t gfp_mask)
{
        int i;
        unsigned int size = skb_end_offset(skb);
        int new_hlen = headlen - off;
        u8 *data;

        if (skb_pfmemalloc(skb))
                gfp_mask |= __GFP_MEMALLOC;

        data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
        if (!data)
                return -ENOMEM;
        /* From here on @size is the usable size, excluding shared info. */
        size = SKB_WITH_OVERHEAD(size);

        /* Copy real data, and all frags */
        skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
        skb->len -= off;

        /* The shared info is placed right behind the usable area. */
        memcpy((struct skb_shared_info *)(data + size),
               skb_shinfo(skb),
               offsetof(struct skb_shared_info,
                        frags[skb_shinfo(skb)->nr_frags]));
        if (skb_cloned(skb)) {
                /* drop the old head gracefully */
                if (skb_orphan_frags(skb, gfp_mask)) {
                        skb_kfree_head(data, size);
                        return -ENOMEM;
                }
                /* The old data stays in use by the clone, so take extra
                 * references for the copied frag descriptors and frag list
                 * before dropping ours on the old data.
                 */
                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                        skb_frag_ref(skb, i);
                if (skb_has_frag_list(skb))
                        skb_clone_fraglist(skb);
                skb_release_data(skb, SKB_CONSUMED);
        } else {
                /* we can reuse existing refcount- all we did was
                 * relocate values
                 */
                skb_free_head(skb);
        }

        /* Switch the skb over to the new head. */
        skb->head = data;
        skb->data = data;
        skb->head_frag = 0;
        skb_set_end_offset(skb, size);
        skb_set_tail_pointer(skb, skb_headlen(skb));
        skb_headers_offset_update(skb, 0);
        skb->cloned = 0;
        skb->hdr_len = 0;
        skb->nohdr = 0;
        atomic_set(&skb_shinfo(skb)->dataref, 1);

        return 0;
}

static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);

/* carve out the first eat bytes from skb's frag_list. May recurse into
 * pskb_carve()
 *
 * The list is taken from @shinfo->frag_list and @shinfo is updated in place.
 * Members that are consumed entirely are unlinked and released with
 * consume_skb(); a member that is consumed only partially is carved with
 * pskb_carve(), on a clone if the member is shared.
 *
 * Returns 0 on success, -EFAULT when the list holds fewer than @eat bytes,
 * -ENOMEM when cloning or carving fails.
 */
static int pskb_carve_frag_list(struct sk_buff *skb,
                                struct skb_shared_info *shinfo, int eat,
                                gfp_t gfp_mask)
{
        struct sk_buff *list = shinfo->frag_list;
        struct sk_buff *clone = NULL;
        /* First member of the original list that is kept. */
        struct sk_buff *insp = NULL;

        do {
                if (!list) {
                        pr_err("Not enough bytes to eat. Want %d\n", eat);
                        return -EFAULT;
                }
                if (list->len <= eat) {
                        /* Eaten as whole. */
                        eat -= list->len;
                        list = list->next;
                        insp = list;
                } else {
                        /* Eaten partially. */
                        if (skb_shared(list)) {
                                /* Carve a clone instead; the shared original
                                 * is dropped from the list below.
                                 */
                                clone = skb_clone(list, gfp_mask);
                                if (!clone)
                                        return -ENOMEM;
                                insp = list->next;
                                list = clone;
                        } else {
                                /* This may be pulled without problems. */
                                insp = list;
                        }
                        if (pskb_carve(list, eat, gfp_mask) < 0) {
                                kfree_skb(clone);
                                return -ENOMEM;
                        }
                        break;
                }
        } while (eat);

        /* Free pulled out fragments. */
        while ((list = shinfo->frag_list) != insp) {
                shinfo->frag_list = list->next;
                consume_skb(list);
        }
        /* And insert new clone at head. */
        if (clone) {
                clone->next = list;
                shinfo->frag_list = clone;
        }
        return 0;
}

/* Carve off the first @off bytes from skb. The split line (@off) is in the
 * non-linear part of skb; @pos is the length of the linear part.
 *
 * A new, empty head is allocated. Only the page frags at or beyond @off are
 * carried over (the one containing @off is trimmed), and if no frag
 * qualifies the split is looked for in the frag list.
 * Returns 0 on success or -ENOMEM.
 */
static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
                                       int pos, gfp_t gfp_mask)
{
        int i, k = 0;
        unsigned int size = skb_end_offset(skb);
        u8 *data;
        const int nfrags = skb_shinfo(skb)->nr_frags;
        struct skb_shared_info *shinfo;

        if (skb_pfmemalloc(skb))
                gfp_mask |= __GFP_MEMALLOC;

        data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
        if (!data)
                return -ENOMEM;
        size = SKB_WITH_OVERHEAD(size);

        /* Copy the shared info header only; the frags are rebuilt below. */
        memcpy((struct skb_shared_info *)(data + size),
               skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
        if (skb_orphan_frags(skb, gfp_mask)) {
                skb_kfree_head(data, size);
                return -ENOMEM;
        }
        shinfo = (struct skb_shared_info *)(data + size);
        for (i = 0; i < nfrags; i++) {
                int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);

                /* Keep every frag that reaches beyond the split line. */
                if (pos + fsize > off) {
                        shinfo->frags[k] = skb_shinfo(skb)->frags[i];

                        if (pos < off) {
                                /* Split frag.
                                 * We have two variants in this case:
                                 * 1. Move all the frag to the second
                                 *    part, if it is possible. F.e.
                                 *    this approach is mandatory for TUX,
                                 *    where splitting is expensive.
                                 * 2. Split it accurately. This is what
                                 *    we do here.
                                 *
                                 * Only the first kept frag can start before
                                 * the split line, so it is always frags[0].
                                 */
                                skb_frag_off_add(&shinfo->frags[0], off - pos);
                                skb_frag_size_sub(&shinfo->frags[0], off - pos);
                        }
                        skb_frag_ref(skb, i);
                        k++;
                }
                pos += fsize;
        }
        shinfo->nr_frags = k;
        if (skb_has_frag_list(skb))
                skb_clone_fraglist(skb);

        /* split line is in frag list */
        if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
                /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
                if (skb_has_frag_list(skb))
                        kfree_skb_list(skb_shinfo(skb)->frag_list);
                skb_kfree_head(data, size);
                return -ENOMEM;
        }
        skb_release_data(skb, SKB_CONSUMED);

        /* Switch over to the new head; the linear area is now empty, so all
         * remaining bytes are accounted as paged data.
         */
        skb->head = data;
        skb->head_frag = 0;
        skb->data = data;
        skb_set_end_offset(skb, size);
        skb_reset_tail_pointer(skb);
        skb_headers_offset_update(skb, 0);
        skb->cloned   = 0;
        skb->hdr_len  = 0;
        skb->nohdr    = 0;
        skb->len -= off;
        skb->data_len = skb->len;
        atomic_set(&skb_shinfo(skb)->dataref, 1);
        return 0;
}

/* Remove @len bytes from the beginning of the skb, dispatching on whether the
 * cut falls inside the linear head or in the non-linear part.
 */
static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
{
        const int headlen = skb_headlen(skb);

        if (len >= headlen)
                return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);

        return pskb_carve_inside_header(skb, len, headlen, gfp);
}

/* Extract @to_copy bytes starting at @off from @skb and return them in a new
 * skb. The work is done on a clone, so @skb itself is not carved or trimmed.
 * Returns NULL when cloning, carving or trimming fails.
 */
struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
                             int to_copy, gfp_t gfp)
{
        struct sk_buff *piece = skb_clone(skb, gfp);

        if (!piece)
                return NULL;

        /* Drop the leading @off bytes, then cut the tail down to size. */
        if (pskb_carve(piece, off, gfp) < 0)
                goto drop;

        if (pskb_trim(piece, to_copy))
                goto drop;

        return piece;

drop:
        kfree_skb(piece);
        return NULL;
}
EXPORT_SYMBOL(pskb_extract);

/**
 * skb_condense - try to get rid of fragments/frag_list if possible
 * @skb: buffer
 *
 * Can be used to save memory before skb is added to a busy queue.
 * If the packet has bytes outside the linear area and skb->head has enough
 * room after the tail for all of them, they are pulled into the head so
 * that the frags can be freed right now, and truesize is adjusted.
 * Nothing is done when the skb is cloned or the room is insufficient.
 * Notes:
 *        We do not reallocate skb->head thus can not fail.
 *        Caller must re-evaluate skb->truesize if needed.
 */
void skb_condense(struct sk_buff *skb)
{
        unsigned int paged = skb->data_len;

        if (paged) {
                if (paged > skb->end - skb->tail || skb_cloned(skb))
                        return;

                /* Nice, we can free page frag(s) right now */
                __pskb_pull_tail(skb, paged);
        }

        /* At this point, skb->truesize might be over estimated, because the
         * skb had a fragment and fragments do not tell their truesize.
         * When its content was pulled into skb->head the fragment was freed,
         * but __pskb_pull_tail() could not possibly adjust skb->truesize,
         * not knowing the frag truesize.
         */
        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
}
EXPORT_SYMBOL(skb_condense);

#ifdef CONFIG_SKB_EXTENSIONS
static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
{
        return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
}

/**
 * __skb_ext_alloc - allocate a new skb extensions storage
 *
 * @flags: See kmalloc().
 *
 * Returns the newly allocated pointer, with all extension offsets cleared and
 * a reference count of one, or NULL on allocation failure. The pointer can
 * later be attached to a skb via __skb_ext_set().
 * Note: caller must handle the skb_ext as an opaque data.
 */
struct skb_ext *__skb_ext_alloc(gfp_t flags)
{
        struct skb_ext *ext = kmem_cache_alloc(skbuff_ext_cache, flags);

        if (!ext)
                return NULL;

        memset(ext->offset, 0, sizeof(ext->offset));
        refcount_set(&ext->refcnt, 1);

        return ext;
}

/* Return an extension area that has a reference count of one.
 *
 * If @old is not shared (refcount of 1) it is returned as is. Otherwise a
 * new area is allocated with GFP_ATOMIC, the used part of @old is copied
 * into it, extra references are taken for the objects the copy points to,
 * and the caller's reference on @old is dropped.
 *
 * @old_active is the bitmask of extensions active in @old.
 *
 * Returns NULL on allocation failure; @old is left untouched in that case.
 */
static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
                                         unsigned int old_active)
{
        struct skb_ext *new;

        if (refcount_read(&old->refcnt) == 1)
                return old;

        new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
        if (!new)
                return NULL;

        /* chunks counts units of SKB_EXT_ALIGN_VALUE bytes. */
        memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
        refcount_set(&new->refcnt, 1);

#ifdef CONFIG_XFRM
        /* The copy shares the xfrm states with @old: hold each of them. */
        if (old_active & (1 << SKB_EXT_SEC_PATH)) {
                struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
                unsigned int i;

                for (i = 0; i < sp->len; i++)
                        xfrm_state_hold(sp->xvec[i]);
        }
#endif
#ifdef CONFIG_MCTP_FLOWS
        /* Likewise for the MCTP flow key, if one is set. */
        if (old_active & (1 << SKB_EXT_MCTP)) {
                struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP);

                if (flow->key)
                        refcount_inc(&flow->key->refs);
        }
#endif
        __skb_ext_put(old);
        return new;
}

/**
 * __skb_ext_set - attach the specified extension storage to this skb
 * @skb: buffer
 * @id: extension id
 * @ext: extension storage previously allocated via __skb_ext_alloc()
 *
 * Existing extensions, if any, are cleared; afterwards @id is the only
 * active extension of @skb.
 *
 * Returns the pointer to the extension.
 */
void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
                    struct skb_ext *ext)
{
        /* The extension is placed right behind the skb_ext header. */
        unsigned int hdr_chunks = SKB_EXT_CHUNKSIZEOF(*ext);

        skb_ext_put(skb);

        ext->chunks = hdr_chunks + skb_ext_type_len[id];
        ext->offset[id] = hdr_chunks;

        skb->extensions = ext;
        skb->active_extensions = 1 << id;

        return skb_ext_get_ptr(ext, id);
}

/**
 * skb_ext_add - allocate space for given extension, COW if needed
 * @skb: buffer
 * @id: extension to allocate space for
 *
 * Allocates enough space for the given extension.
 * If the extension is already present, a pointer to that extension
 * is returned.
 *
 * If the skb was cloned, COW applies and the returned memory can be
 * modified without changing the extension space of clones buffers.
 *
 * On success the extension is marked active and skb->slow_gro is set.
 * Allocations use GFP_ATOMIC.
 *
 * Returns pointer to the extension or NULL on allocation failure.
 */
void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
{
        struct skb_ext *new, *old = NULL;
        unsigned int newlen, newoff;

        if (skb->active_extensions) {
                old = skb->extensions;

                /* Get an area that is not shared with anyone else. */
                new = skb_ext_maybe_cow(old, skb->active_extensions);
                if (!new)
                        return NULL;

                /* Space for @id exists already: just (re)activate it. */
                if (__skb_ext_exist(new, id))
                        goto set_active;

                /* Otherwise append behind what is currently in use. */
                newoff = new->chunks;
        } else {
                /* Only the size of *new is taken here; the pointer is not
                 * dereferenced before it is assigned below.
                 */
                newoff = SKB_EXT_CHUNKSIZEOF(*new);

                new = __skb_ext_alloc(GFP_ATOMIC);
                if (!new)
                        return NULL;
        }

        newlen = newoff + skb_ext_type_len[id];
        new->chunks = newlen;
        new->offset[id] = newoff;
set_active:
        skb->slow_gro = 1;
        skb->extensions = new;
        skb->active_extensions |= 1 << id;
        return skb_ext_get_ptr(new, id);
}
EXPORT_SYMBOL(skb_ext_add);

#ifdef CONFIG_XFRM
static void skb_ext_put_sp(struct sec_path *sp)
{
        unsigned int idx = 0;

        /* Call xfrm_state_put() on each of the sp->len entries of xvec. */
        while (idx < sp->len) {
                xfrm_state_put(sp->xvec[idx]);
                idx++;
        }
}
#endif

#ifdef CONFIG_MCTP_FLOWS
static void skb_ext_put_mctp(struct mctp_flow *flow)
{
        /* A flow without a key has nothing to unreference. */
        if (!flow->key)
                return;

        mctp_key_unref(flow->key);
}
#endif

void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
{
        struct skb_ext *ext = skb->extensions;

        skb->active_extensions &= ~(1 << id);

        /* @id was the last active extension: detach the whole area. */
        if (!skb->active_extensions) {
                skb->extensions = NULL;
                __skb_ext_put(ext);
                return;
        }

#ifdef CONFIG_XFRM
        /* Other extensions stay active.  When the sec_path is removed and
         * we hold the only reference on the area, put its xfrm states now
         * and reset the path length.
         */
        if (id == SKB_EXT_SEC_PATH && refcount_read(&ext->refcnt) == 1) {
                struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);

                skb_ext_put_sp(sp);
                sp->len = 0;
        }
#endif
}
EXPORT_SYMBOL(__skb_ext_del);

void __skb_ext_put(struct skb_ext *ext)
{
        /* A refcount of one means this is the last clone and nothing can
         * increment it once the check has passed, so the atomic op is
         * skipped.  Otherwise drop our reference and only continue when
         * it turned out to be the final one.
         */
        if (refcount_read(&ext->refcnt) != 1 &&
            !refcount_dec_and_test(&ext->refcnt))
                return;

        /* Last user: run the per-extension put helpers, then free. */
#ifdef CONFIG_XFRM
        if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
                skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
#endif
#ifdef CONFIG_MCTP_FLOWS
        if (__skb_ext_exist(ext, SKB_EXT_MCTP))
                skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP));
#endif

        kmem_cache_free(skbuff_ext_cache, ext);
}
EXPORT_SYMBOL(__skb_ext_put);
#endif /* CONFIG_SKB_EXTENSIONS */

static void kfree_skb_napi_cache(struct sk_buff *skb)
{
        if (skb->fclone == SKB_FCLONE_UNAVAILABLE) {
                /* __napi_kfree_skb() is called with BHs disabled. */
                local_bh_disable();
                __napi_kfree_skb(skb, SKB_CONSUMED);
                local_bh_enable();
        } else {
                /* Any other fclone state is not handled here; use the
                 * regular free path.
                 */
                __kfree_skb(skb);
        }
}

/**
 * skb_attempt_defer_free - queue skb for remote freeing
 * @skb: buffer
 *
 * Put @skb in a per-cpu list, using the cpu which
 * allocated the skb/pages to reduce false sharing
 * and memory zone spinlock contention.
 */
void skb_attempt_defer_free(struct sk_buff *skb)
{
        int cpu = skb->alloc_cpu;
        struct softnet_data *sd;
        unsigned int defer_max;
        bool kick;

        /* Free immediately when the skb was allocated on the current cpu,
         * when alloc_cpu is out of range (warns once), or when that cpu
         * is not online.
         */
        if (cpu == raw_smp_processor_id() ||
            WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
            !cpu_online(cpu)) {
                /* Also reached via goto when the remote queue is full. */
nodefer:        kfree_skb_napi_cache(skb);
                return;
        }

        DEBUG_NET_WARN_ON_ONCE(skb_dst(skb));
        DEBUG_NET_WARN_ON_ONCE(skb->destructor);

        sd = &per_cpu(softnet_data, cpu);
        defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);
        /* Lockless check against the sysctl limit; the counter is only
         * written under defer_lock below.
         */
        if (READ_ONCE(sd->defer_count) >= defer_max)
                goto nodefer;

        spin_lock_bh(&sd->defer_lock);
        /* Send an IPI every time queue reaches half capacity. */
        kick = sd->defer_count == (defer_max >> 1);
        /* Paired with the READ_ONCE() few lines above */
        WRITE_ONCE(sd->defer_count, sd->defer_count + 1);

        /* Push the skb at the head of the remote cpu's defer list. */
        skb->next = sd->defer_list;
        /* Paired with READ_ONCE() in skb_defer_free_flush() */
        WRITE_ONCE(sd->defer_list, skb);
        spin_unlock_bh(&sd->defer_lock);

        /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
         * if we are unlucky enough (this seems very unlikely).
         */
        if (unlikely(kick))
                kick_defer_list_purge(sd, cpu);
}

static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
                                 size_t offset, size_t len)
{
        /* Map the page only for as long as the checksum is computed. */
        const char *vaddr = kmap_local_page(page);
        __wsum page_csum = csum_partial(vaddr + offset, len, 0);

        kunmap_local(vaddr);

        /* Fold the partial sum in at block offset skb->len. */
        skb->csum = csum_block_add(skb->csum, page_csum, skb->len);
}

/**
 * skb_splice_from_iter - Splice (or copy) pages to skbuff
 * @skb: The buffer to add pages to
 * @iter: Iterator representing the pages to be added
 * @maxsize: Maximum amount of data, in bytes, to be added
 * @gfp: Allocation flags
 *
 * This is a common helper function for supporting MSG_SPLICE_PAGES.  It
 * extracts pages from an iterator and adds them to the socket buffer if
 * possible, copying them to fragments if not possible (such as if they're slab
 * pages).
 *
 * Returns the amount of data spliced/copied or -EMSGSIZE if there's
 * insufficient space in the buffer to transfer anything.
 */
ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
                             ssize_t maxsize, gfp_t gfp)
{
        size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags);
        struct page *pages[8], **ppages = pages;
        ssize_t spliced = 0, ret = 0;
        unsigned int i;

        while (iter->count > 0) {
                ssize_t space, nr, len;
                size_t off;

                /* Reported only if nothing at all could be spliced. */
                ret = -EMSGSIZE;
                space = frag_limit - skb_shinfo(skb)->nr_frags;
                if (space < 0)
                        break;

                /* We might be able to coalesce without increasing nr_frags */
                nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages));

                /* Extract up to nr pages (at most maxsize bytes) into
                 * pages[]; off receives the offset into the first page.
                 */
                len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off);
                if (len <= 0) {
                        /* Zero extracted bytes is turned into -EIO. */
                        ret = len ?: -EIO;
                        break;
                }

                i = 0;
                do {
                        struct page *page = pages[i++];
                        /* Bytes of this page that belong to the extraction. */
                        size_t part = min_t(size_t, PAGE_SIZE - off, len);

                        ret = -EIO;
                        if (WARN_ON_ONCE(!sendpage_ok(page)))
                                goto out;

                        ret = skb_append_pagefrags(skb, page, off, part,
                                                   frag_limit);
                        if (ret < 0) {
                                /* Hand the not yet consumed bytes back. */
                                iov_iter_revert(iter, len);
                                goto out;
                        }

                        if (skb->ip_summed == CHECKSUM_NONE)
                                skb_splice_csum_page(skb, page, off, part);

                        /* Only the first page starts at a non-zero offset. */
                        off = 0;
                        spliced += part;
                        maxsize -= part;
                        len -= part;
                } while (len > 0);

                if (maxsize <= 0)
                        break;
        }

out:
        /* Account what was added; report an error only if nothing was. */
        skb_len_add(skb, spliced);
        return spliced ?: ret;
}
EXPORT_SYMBOL(skb_splice_from_iter);

static __always_inline
size_t memcpy_from_iter_csum(void *iter_from, size_t progress,
                             size_t len, void *to, void *priv2)
{
        __wsum *sum = priv2;
        __wsum chunk;

        /* Copy @len bytes to @to + @progress and checksum them on the way. */
        chunk = csum_partial_copy_nocheck(iter_from, to + progress, len);

        /* Merge the chunk at block offset @progress of the running sum. */
        *sum = csum_block_add(*sum, chunk, progress);

        /* Always 0 - presumably the number of bytes left uncopied. */
        return 0;
}

static __always_inline
size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress,
                                size_t len, void *to, void *priv2)
{
        __wsum *sum = priv2;
        __wsum chunk = csum_and_copy_from_user(iter_from, to + progress, len);

        /* Merge the chunk at block offset @progress of the running sum. */
        *sum = csum_block_add(*sum, chunk, progress);

        /* A zero checksum from the helper is treated as a failed copy and
         * all @len bytes are reported back; otherwise 0 is returned.
         */
        if (chunk)
                return 0;

        return len;
}

bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
                                  __wsum *csum, struct iov_iter *i)
{
        size_t done;

        /* The iterator must be a data source; warn once otherwise. */
        if (WARN_ON_ONCE(!i->data_source))
                return false;

        done = iterate_and_advance2(i, bytes, addr, csum,
                                    copy_from_user_iter_csum,
                                    memcpy_from_iter_csum);
        if (unlikely(done != bytes)) {
                /* All or nothing: undo the partial advance. */
                iov_iter_revert(i, done);
                return false;
        }

        return true;
}
EXPORT_SYMBOL(csum_and_copy_from_iter_full);























    2 



































    2 









































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM workqueue

#if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_WORKQUEUE_H

#include <linux/tracepoint.h>
#include <linux/workqueue.h>

struct pool_workqueue;

/**
 * workqueue_queue_work - called when a work gets queued
 * @req_cpu:        the requested cpu
 * @pwq:        pointer to struct pool_workqueue
 * @work:        pointer to struct work_struct
 *
 * This event occurs when a work is queued immediately or once a
 * delayed work is actually queued on a workqueue (ie: once the delay
 * has been reached).
 *
 * The record holds the work pointer, its callback (work->func), the
 * workqueue name (pwq->wq->name), the requested cpu and the cpu of the
 * pool behind @pwq (pwq->pool->cpu).
 */
TRACE_EVENT(workqueue_queue_work,

        TP_PROTO(int req_cpu, struct pool_workqueue *pwq,
                 struct work_struct *work),

        TP_ARGS(req_cpu, pwq, work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
                __string( workqueue,        pwq->wq->name)
                __field( int,        req_cpu        )
                __field( int,        cpu        )
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
                __assign_str(workqueue);
                __entry->req_cpu        = req_cpu;
                __entry->cpu                = pwq->pool->cpu;
        ),

        TP_printk("work struct=%p function=%ps workqueue=%s req_cpu=%d cpu=%d",
                  __entry->work, __entry->function, __get_str(workqueue),
                  __entry->req_cpu, __entry->cpu)
);

/**
 * workqueue_activate_work - called when a work gets activated
 * @work:        pointer to struct work_struct
 *
 * This event occurs when a queued work is put on the active queue,
 * which happens immediately after queueing unless @max_active limit
 * is reached.
 *
 * The record holds the work pointer and its callback (work->func).
 */
TRACE_EVENT(workqueue_activate_work,

        TP_PROTO(struct work_struct *work),

        TP_ARGS(work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
        ),

        TP_printk("work struct %p function=%ps ", __entry->work, __entry->function)
);

/**
 * workqueue_execute_start - called immediately before the workqueue callback
 * @work:        pointer to struct work_struct
 *
 * Allows to track workqueue execution.
 *
 * The record holds the work pointer and its callback (work->func).
 */
TRACE_EVENT(workqueue_execute_start,

        TP_PROTO(struct work_struct *work),

        TP_ARGS(work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
        ),

        TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

/**
 * workqueue_execute_end - called immediately after the workqueue callback
 * @work:        pointer to struct work_struct
 * @function:   pointer to worker function
 *
 * Allows to track workqueue execution.
 *
 * Unlike the other workqueue events, the callback is taken from the
 * @function argument and @work is only stored as a pointer, never
 * dereferenced.  NOTE(review): presumably because @work may no longer be
 * valid once its callback has run - confirm against the caller.
 */
TRACE_EVENT(workqueue_execute_end,

        TP_PROTO(struct work_struct *work, work_func_t function),

        TP_ARGS(work, function),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = function;
        ),

        TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

#endif /*  _TRACE_WORKQUEUE_H */

/* This part must be outside protection */
#include <trace/define_trace.h>






























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001 Intel Corp.
 *
 * This file is part of the SCTP kernel implementation
 *
 * These are the definitions needed for the tsnmap type.  The tsnmap is used
 * to track out of order TSNs received.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *   Jon Grimm             <jgrimm@us.ibm.com>
 *   La Monte H.P. Yarroll <piggy@acm.org>
 *   Karl Knutson          <karl@athena.chicago.il.us>
 *   Sridhar Samudrala     <sri@us.ibm.com>
 */
#include <net/sctp/constants.h>

#ifndef __sctp_tsnmap_h__
#define __sctp_tsnmap_h__

/* RFC 2960 12.2 Parameters necessary per association (i.e. the TCB)
 * Mapping  An array of bits or bytes indicating which out of
 * Array    order TSN's have been received (relative to the
 *          Last Rcvd TSN). If no gaps exist, i.e. no out of
 *          order packets have been received, this array
 *          will be set to all zero. This structure may be
 *          in the form of a circular buffer or bit array.
 */
struct sctp_tsnmap {
        /* Storage recording which TSNs have been received.
         * NOTE(review): the element type is unsigned long, which suggests
         * a bitmap; the older wording here spoke of a per-TSN chunk
         * counter and two ping-pong buffers - confirm against the users
         * of this field.
         */
        unsigned long *tsn_map;

        /* This is the TSN at tsn_map[0].  */
        __u32 base_tsn;

        /* Last Rcvd TSN: This is the last TSN received in sequence.
         * This value is set initially by taking the peer's Initial TSN,
         * received in the INIT or INIT ACK chunk, and subtracting one
         * from it.
         *
         * Throughout most of the specification this is called the
         * "Cumulative TSN ACK Point".  In this case, we
         * ignore the advice in 12.2 in favour of the term
         * used in the bulk of the text.
         */
        __u32 cumulative_tsn_ack_point;

        /* This is the highest TSN we've marked.  */
        __u32 max_tsn_seen;

        /* This is the minimum number of TSNs we can track.  This corresponds
         * to the size of tsn_map.
         * NOTE(review): an earlier remark referred to an "overflow_map"
         * that could track more than this; no such member exists in this
         * struct, so that remark looks stale.
         */
        __u16 len;

        /* Data chunks pending receipt. used by SCTP_STATUS sockopt */
        __u16 pending_data;

        /* Record duplicate TSNs here.  We clear this after
         * every SACK.  Store up to SCTP_MAX_DUP_TSNS worth of
         * information.  Entries are kept in network byte order (__be32).
         */
        __u16 num_dup_tsns;
        __be32 dup_tsns[SCTP_MAX_DUP_TSNS];
};

/* Iterator state for walking a tsnmap; holds a single start position.
 * NOTE(review): presumably a TSN at which the walk (re)starts - confirm
 * against the users of this type.
 */
struct sctp_tsnmap_iter {
        __u32 start;
};

/* Initialize a block of memory as a tsnmap.  */
struct sctp_tsnmap *sctp_tsnmap_init(struct sctp_tsnmap *, __u16 len,
                                     __u32 initial_tsn, gfp_t gfp);

void sctp_tsnmap_free(struct sctp_tsnmap *map);

/* Test the tracking state of this TSN.
 * Returns:
 *   0 if the TSN has not yet been seen
 *  >0 if the TSN has been seen (duplicate)
 *  <0 if the TSN is invalid (too large to track)
 */
int sctp_tsnmap_check(const struct sctp_tsnmap *, __u32 tsn);

/* Mark this TSN as seen.  */
int sctp_tsnmap_mark(struct sctp_tsnmap *, __u32 tsn,
                     struct sctp_transport *trans);

/* Mark this TSN and all lower as seen. */
void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn);

/* Retrieve the Cumulative TSN ACK Point.  */
static inline __u32 sctp_tsnmap_get_ctsn(const struct sctp_tsnmap *map)
{
        /* Plain read accessor for the Cumulative TSN ACK Point. */
        const __u32 ctsn = map->cumulative_tsn_ack_point;

        return ctsn;
}

/* Retrieve the highest TSN we've seen.  */
static inline __u32 sctp_tsnmap_get_max_tsn_seen(const struct sctp_tsnmap *map)
{
        /* Plain read accessor for the highest TSN marked so far. */
        const __u32 highest = map->max_tsn_seen;

        return highest;
}

/* How many duplicate TSNs are stored? */
static inline __u16 sctp_tsnmap_num_dups(struct sctp_tsnmap *map)
{
        /* Number of entries currently recorded in map->dup_tsns[]. */
        const __u16 count = map->num_dup_tsns;

        return count;
}

/* Return pointer to duplicate tsn array as needed by SACK. */
static inline __be32 *sctp_tsnmap_get_dups(struct sctp_tsnmap *map)
{
        __be32 *dups = map->dup_tsns;

        /* Fetching the array also resets the duplicate counter; the
         * entries themselves are left in place.
         */
        map->num_dup_tsns = 0;

        return dups;
}

/* How many gap ack blocks do we have recorded? */
__u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map,
                           struct sctp_gap_ack_block *gabs);

/* Refresh the count on pending data. */
__u16 sctp_tsnmap_pending(struct sctp_tsnmap *map);

/* Is there a gap in the TSN map?  */
static inline int sctp_tsnmap_has_gap(const struct sctp_tsnmap *map)
{
        /* No gap while the cumulative ack point equals the highest TSN
         * seen.
         */
        if (map->cumulative_tsn_ack_point == map->max_tsn_seen)
                return 0;

        return 1;
}

/* Mark a duplicate TSN.  Note:  limit the storage of duplicate TSN
 * information.
 */
static inline void sctp_tsnmap_mark_dup(struct sctp_tsnmap *map, __u32 tsn)
{
        /* Silently drop the report once the array is full. */
        if (map->num_dup_tsns >= SCTP_MAX_DUP_TSNS)
                return;

        /* Stored in network byte order. */
        map->dup_tsns[map->num_dup_tsns] = htonl(tsn);
        map->num_dup_tsns++;
}

/* Renege a TSN that was seen.  */
void sctp_tsnmap_renege(struct sctp_tsnmap *, __u32 tsn);

/* Is there a gap in the TSN map? */
int sctp_tsnmap_has_gap(const struct sctp_tsnmap *);

#endif /* __sctp_tsnmap_h__ */




























    1 























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM pagemap

#if !defined(_TRACE_PAGEMAP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PAGEMAP_H

#include <linux/tracepoint.h>
#include <linux/mm.h>

/*
 * Bit flags stored in the "flags" field of the mm_lru_insertion event.
 * They are produced by trace_pagemap_flags() and decoded into one letter
 * each by the event's TP_printk().
 */
#define        PAGEMAP_MAPPED                0x0001u
#define PAGEMAP_ANONYMOUS        0x0002u
#define PAGEMAP_FILE                0x0004u
#define PAGEMAP_SWAPCACHE        0x0008u
#define PAGEMAP_SWAPBACKED        0x0010u
#define PAGEMAP_MAPPEDDISK        0x0020u
#define PAGEMAP_BUFFERS                0x0040u

/*
 * Build the PAGEMAP_* bitmask for @folio: exactly one of PAGEMAP_ANONYMOUS
 * or PAGEMAP_FILE (by folio_test_anon()), plus one bit for every other
 * folio test that returns true.  Note that @folio is evaluated once per
 * test, so the argument must be free of side effects.
 */
#define trace_pagemap_flags(folio) ( \
        (folio_test_anon(folio)                ? PAGEMAP_ANONYMOUS  : PAGEMAP_FILE) | \
        (folio_mapped(folio)                ? PAGEMAP_MAPPED     : 0) | \
        (folio_test_swapcache(folio)        ? PAGEMAP_SWAPCACHE  : 0) | \
        (folio_test_swapbacked(folio)        ? PAGEMAP_SWAPBACKED : 0) | \
        (folio_test_mappedtodisk(folio)        ? PAGEMAP_MAPPEDDISK : 0) | \
        (folio_test_private(folio)        ? PAGEMAP_BUFFERS    : 0) \
        )

/*
 * mm_lru_insertion - trace event taking a single folio
 *
 * Records the folio pointer, its pfn (folio_pfn()), the LRU list index
 * returned by folio_lru_list() and the PAGEMAP_* bitmask computed by
 * trace_pagemap_flags().  The flags are printed as one character per bit,
 * with a blank for every cleared bit (and "f" when ANONYMOUS is clear).
 */
TRACE_EVENT(mm_lru_insertion,

        TP_PROTO(struct folio *folio),

        TP_ARGS(folio),

        TP_STRUCT__entry(
                __field(struct folio *,        folio        )
                __field(unsigned long,        pfn        )
                __field(enum lru_list,        lru        )
                __field(unsigned long,        flags        )
        ),

        TP_fast_assign(
                __entry->folio        = folio;
                __entry->pfn        = folio_pfn(folio);
                __entry->lru        = folio_lru_list(folio);
                __entry->flags        = trace_pagemap_flags(folio);
        ),

        /* Flag format is based on page-types.c formatting for pagemap */
        TP_printk("folio=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s",
                        __entry->folio,
                        __entry->pfn,
                        __entry->lru,
                        __entry->flags & PAGEMAP_MAPPED                ? "M" : " ",
                        __entry->flags & PAGEMAP_ANONYMOUS        ? "a" : "f",
                        __entry->flags & PAGEMAP_SWAPCACHE        ? "s" : " ",
                        __entry->flags & PAGEMAP_SWAPBACKED        ? "b" : " ",
                        __entry->flags & PAGEMAP_MAPPEDDISK        ? "d" : " ",
                        __entry->flags & PAGEMAP_BUFFERS        ? "B" : " ")
);

/*
 * mm_lru_activate - trace event taking a single folio
 *
 * Records only the folio pointer and its pfn (folio_pfn()).
 */
TRACE_EVENT(mm_lru_activate,

        TP_PROTO(struct folio *folio),

        TP_ARGS(folio),

        TP_STRUCT__entry(
                __field(struct folio *,        folio        )
                __field(unsigned long,        pfn        )
        ),

        TP_fast_assign(
                __entry->folio        = folio;
                __entry->pfn        = folio_pfn(folio);
        ),

        TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn)
);

#endif /* _TRACE_PAGEMAP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>




















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_64_H
#define _ASM_X86_PGTABLE_64_H

#include <linux/const.h>
#include <asm/pgtable_64_types.h>

#ifndef __ASSEMBLY__

/*
 * This file contains the functions and defines necessary to modify and use
 * the x86-64 page table tree.
 */
#include <asm/processor.h>
#include <linux/bitops.h>
#include <linux/threads.h>
#include <asm/fixmap.h>

extern p4d_t level4_kernel_pgt[512];
extern p4d_t level4_ident_pgt[512];
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM];
extern pgd_t init_top_pgt[];

#define swapper_pg_dir init_top_pgt

extern void paging_init(void);
static inline void sync_initial_page_table(void) { }

/*
 * Report a bad page table entry at the given level via pr_err(): source
 * file and line of the reporting site, the address of the entry and its
 * raw value.  @e must be an lvalue because its address is taken.
 */
#define pte_ERROR(e)                                        \
        pr_err("%s:%d: bad pte %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pte_val(e))
#define pmd_ERROR(e)                                        \
        pr_err("%s:%d: bad pmd %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pmd_val(e))
#define pud_ERROR(e)                                        \
        pr_err("%s:%d: bad pud %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pud_val(e))

/* p4d_ERROR() is only defined here for 5-level page table builds. */
#if CONFIG_PGTABLE_LEVELS >= 5
#define p4d_ERROR(e)                                        \
        pr_err("%s:%d: bad p4d %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), p4d_val(e))
#endif

#define pgd_ERROR(e)                                        \
        pr_err("%s:%d: bad pgd %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pgd_val(e))

struct mm_struct;

#define mm_p4d_folded mm_p4d_folded
static inline bool mm_p4d_folded(struct mm_struct *mm)
{
        return !pgtable_l5_enabled();
}

void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);

/* Store @pte into the entry at @ptep using WRITE_ONCE(). */
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
        WRITE_ONCE(*ptep, pte);
}

static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
                                    pte_t *ptep)
{
        /* @mm and @addr are unused here; the entry is simply zeroed. */
        const pte_t empty = native_make_pte(0);

        native_set_pte(ptep, empty);
}

/* In this file the "atomic" variant is just native_set_pte(). */
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
        native_set_pte(ptep, pte);
}

/* Store @pmd into the entry at @pmdp using WRITE_ONCE(). */
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
        WRITE_ONCE(*pmdp, pmd);
}

static inline void native_pmd_clear(pmd_t *pmd)
{
        /* Clearing is a regular set with an all-zero entry. */
        const pmd_t empty = native_make_pmd(0);

        native_set_pmd(pmd, empty);
}

static inline pte_t native_ptep_get_and_clear(pte_t *xp)
{
#ifdef CONFIG_SMP
        /* Swap zero into the entry and hand back what was there. */
        return native_make_pte(xchg(&xp->pte, 0));
#else
        /*
         * Same as native_local_ptep_get_and_clear, open-coded here
         * because of a cyclic dependency.
         */
        pte_t old = *xp;

        native_pte_clear(NULL, 0, xp);
        return old;
#endif
}

static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
{
#ifdef CONFIG_SMP
        /* Swap zero into the entry and hand back what was there. */
        return native_make_pmd(xchg(&xp->pmd, 0));
#else
        /*
         * Same as native_local_pmdp_get_and_clear, open-coded here
         * because of a cyclic dependency.
         */
        pmd_t old = *xp;

        native_pmd_clear(xp);
        return old;
#endif
}

/* Store @pud into the entry at @pudp using WRITE_ONCE(). */
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
        WRITE_ONCE(*pudp, pud);
}

static inline void native_pud_clear(pud_t *pud)
{
        /* Clearing is a regular set with an all-zero entry. */
        const pud_t empty = native_make_pud(0);

        native_set_pud(pud, empty);
}

static inline pud_t native_pudp_get_and_clear(pud_t *xp)
{
#ifdef CONFIG_SMP
        /* Swap zero into the entry and hand back what was there. */
        return native_make_pud(xchg(&xp->pud, 0));
#else
        /*
         * Same as native_local_pudp_get_and_clear, open-coded here
         * because of a cyclic dependency.
         */
        pud_t old = *xp;

        native_pud_clear(xp);
        return old;
#endif
}

static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
        pgd_t pgd;

        /*
         * Without 5-level paging and with page table isolation built in,
         * the entry is converted to a pgd, run through
         * pti_set_user_pgtbl() and the result is what gets written.
         */
        if (!pgtable_l5_enabled() &&
            IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) {
                pgd = native_make_pgd(native_p4d_val(p4d));
                pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd);
                WRITE_ONCE(*p4dp, native_make_p4d(native_pgd_val(pgd)));
                return;
        }

        /* Every other configuration stores the value unmodified. */
        WRITE_ONCE(*p4dp, p4d);
}

static inline void native_p4d_clear(p4d_t *p4d)
{
        /* Clearing is a regular set with an all-zero entry. */
        const p4d_t empty = native_make_p4d(0);

        native_set_p4d(p4d, empty);
}

static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
        /* The value actually stored is whatever pti_set_user_pgtbl()
         * returns for this slot.
         */
        const pgd_t val = pti_set_user_pgtbl(pgdp, pgd);

        WRITE_ONCE(*pgdp, val);
}

static inline void native_pgd_clear(pgd_t *pgd)
{
        /* Clearing is a regular set with an all-zero entry. */
        const pgd_t empty = native_make_pgd(0);

        native_set_pgd(pgd, empty);
}

/*
 * Conversion functions: convert a page and protection to a page entry,
 * and a page entry and page directory to the page they refer to.
 */

/* PGD - Level 4 access */

/* PUD - Level 3 access */

/* PMD - Level 2 access */

/* PTE - Level 1 access */

/*
 * Encode and de-code a swap entry
 *
 * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2| 1|0| <- bit number
 * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
 * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| E|F|SD|0| <- swp entry
 *
 * G (8) is aliased and used as a PROT_NONE indicator for
 * !present ptes.  We need to start storing swap entries above
 * there.  We also need to avoid using A and D because of an
 * erratum where they can be incorrectly set by hardware on
 * non-present PTEs.
 *
 * Bits 1-4 are not used in non-present format and are available for
 * special use described below:
 *
 * SD (1) in swp entry is used to store soft dirty bit, which helps us
 * remember soft dirty over page migration
 *
 * F (2) in swp entry is used to record when a pagetable is
 * writeprotected by userfaultfd WP support.
 *
 * E (3) in swp entry is used to remember PG_anon_exclusive.
 *
 * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
 * but also L and G.
 *
 * The offset is inverted by a binary not operation to make the high
 * physical bits set.
 */
/* Width of the swap type field, kept in the topmost bits of the entry. */
#define SWP_TYPE_BITS                5

/* The offset field starts at the first bit above _PAGE_BIT_PROTNONE. */
#define SWP_OFFSET_FIRST_BIT        (_PAGE_BIT_PROTNONE + 1)

/* We always extract/encode the offset by shifting it all the way up, and then down again */
#define SWP_OFFSET_SHIFT        (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS)

/* Build-time check that MAX_SWAPFILES_SHIFT fits into SWP_TYPE_BITS. */
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)

/* Extract the high bits for type */
#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))

/*
 * Shift up (to get rid of type), then down to get value.  The stored
 * value is inverted first, undoing the inversion done by __swp_entry().
 */
#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)

/*
 * Shift the offset up "too far" by TYPE bits, then down again
 * The offset is inverted by a binary not operation to make the high
 * physical bits set.
 */
#define __swp_entry(type, offset) ((swp_entry_t) { \
        (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
        | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })

/*
 * Conversions between pte/pmd and swap entries: the value obtained from
 * pte_val()/pmd_val() is wrapped as a swp_entry_t, and (x).val is wrapped
 * back with __pte()/__pmd().
 */
#define __pte_to_swp_entry(pte)                ((swp_entry_t) { pte_val((pte)) })
#define __pmd_to_swp_entry(pmd)                ((swp_entry_t) { pmd_val((pmd)) })
#define __swp_entry_to_pte(x)                (__pte((x).val))
#define __swp_entry_to_pmd(x)                (__pmd((x).val))

extern void cleanup_highmap(void);

#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
#define HAVE_ARCH_UNMAPPED_AREA_VMFLAGS

#define PAGE_AGP    PAGE_KERNEL_NOCACHE
#define HAVE_PAGE_AGP 1

/*
 * Helpers for fs/proc/kcore.c: kc_vaddr_to_offset() keeps only the bits
 * covered by __VIRTUAL_MASK, kc_offset_to_vaddr() sets all bits outside
 * of that mask again.
 */
#define        kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
#define        kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)

#define __HAVE_ARCH_PTE_SAME

#define vmemmap ((struct page *)VMEMMAP_START)

extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);

#define gup_fast_permitted gup_fast_permitted
static inline bool gup_fast_permitted(unsigned long start, unsigned long end)
{
        /* Refuse ranges whose end has any bit set at or above
         * __VIRTUAL_MASK_SHIFT; @start is not examined.
         */
        return !(end >> __VIRTUAL_MASK_SHIFT);
}

#include <asm/pgtable-invert.h>

#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */





































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_INTERNAL_H
#define BLK_INTERNAL_H

#include <linux/blk-crypto.h>
#include <linux/memblock.h>        /* for max_pfn/max_low_pfn */
#include <linux/sched/sysctl.h>
#include <linux/timekeeping.h>
#include <xen/xen.h>
#include "blk-crypto-internal.h"

struct elevator_type;

/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT                (5 * HZ)

extern struct dentry *blk_debugfs_root;

/*
 * State for one flush queue.  Instances are obtained from
 * blk_alloc_flush_queue() and released with blk_free_flush_queue().
 *
 * flush_pending_idx and flush_running_idx are single-bit fields and
 * flush_queue[] has two entries.
 * NOTE(review): presumably the two indices select an entry of flush_queue[]
 * and mq_flush_lock guards this structure - the code using these fields is
 * not in this header, confirm against the flush implementation.
 */
struct blk_flush_queue {
        spinlock_t                mq_flush_lock;
        unsigned int                flush_pending_idx:1;
        unsigned int                flush_running_idx:1;
        blk_status_t                 rq_status;
        unsigned long                flush_pending_since;
        struct list_head        flush_queue[2];
        unsigned long                flush_data_in_flight;
        struct request                *flush_rq;
};

bool is_flush_rq(struct request *req);

struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
                                              gfp_t flags);
void blk_free_flush_queue(struct blk_flush_queue *q);

void blk_freeze_queue(struct request_queue *q);
void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
void blk_queue_start_drain(struct request_queue *q);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
void submit_bio_noacct_nocheck(struct bio *bio);
void bio_await_chain(struct bio *bio);

/*
 * Try to take a live reference on @q's q_usage_counter.
 *
 * Returns true with the reference held.  Returns false, with no reference
 * held, when the counter could not be taken or when the queue is pm_only
 * and either @pm is false or the queue's runtime-PM status is
 * RPM_SUSPENDED.  The whole attempt runs inside an RCU read-side section.
 */
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
{
        bool entered = false;

        rcu_read_lock();
        if (percpu_ref_tryget_live_rcu(&q->q_usage_counter)) {
                /*
                 * The code that increments the pm_only counter must ensure
                 * that the counter is globally visible before the queue is
                 * unfrozen.
                 */
                if (blk_queue_pm_only(q) &&
                    (!pm || queue_rpm_status(q) == RPM_SUSPENDED))
                        blk_queue_exit(q);      /* back out of the queue */
                else
                        entered = true;
        }
        rcu_read_unlock();

        return entered;
}

/*
 * Enter the queue that @bio's block device belongs to.  The inline
 * blk_try_enter_queue() attempt (with pm == false) is made first; only if
 * it fails is __bio_queue_enter() called and its result returned.
 * Returns 0 when the inline attempt succeeds.
 */
static inline int bio_queue_enter(struct bio *bio)
{
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);

        return blk_try_enter_queue(q, false) ? 0 : __bio_queue_enter(q, bio);
}

/*
 * Block until @done completes.
 *
 * To keep the hang_check timer from firing during very long I/O, the wait is
 * cut into slices of sysctl_hung_task_timeout_secs * HZ / 2 and restarted
 * after every slice that times out.  If that slice length is zero, a single
 * wait without a timeout is used instead.
 */
static inline void blk_wait_io(struct completion *done)
{
        unsigned long slice = sysctl_hung_task_timeout_secs * HZ / 2;

        if (!slice) {
                wait_for_completion_io(done);
                return;
        }

        while (!wait_for_completion_io_timeout(done, slice))
                continue;
}

#define BIO_INLINE_VECS 4
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
                gfp_t gfp_mask);
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);

bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
                struct page *page, unsigned len, unsigned offset,
                bool *same_page);

/*
 * Decide whether @vec2 may be merged physically behind @vec1 on queue @q.
 *
 * All of the following must hold:
 *  - CONFIG_KMSAN is off (merging adjacent physical pages may not work
 *    correctly under KMSAN if their metadata pages aren't adjacent),
 *  - @vec2 starts at the physical address right after the end of @vec1,
 *  - in a Xen domain, xen_biovec_phys_mergeable() agrees,
 *  - the start of @vec1 and the last byte of @vec2 are equal once the bits
 *    of the queue's segment boundary mask are forced on.
 */
static inline bool biovec_phys_mergeable(struct request_queue *q,
                struct bio_vec *vec1, struct bio_vec *vec2)
{
        unsigned long boundary = queue_segment_boundary(q);
        phys_addr_t first = page_to_phys(vec1->bv_page) + vec1->bv_offset;
        phys_addr_t second = page_to_phys(vec2->bv_page) + vec2->bv_offset;

        if (IS_ENABLED(CONFIG_KMSAN))
                return false;

        if (first + vec1->bv_len != second)
                return false;

        if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page))
                return false;

        return (first | boundary) == ((second + vec2->bv_len - 1) | boundary);
}

/*
 * Unconditional gap test: true when @offset, or the end of @bprv
 * (bv_offset + bv_len), has any bit set in lim->virt_boundary_mask.
 */
static inline bool __bvec_gap_to_prev(const struct queue_limits *lim,
                struct bio_vec *bprv, unsigned int offset)
{
        if (offset & lim->virt_boundary_mask)
                return true;

        return ((bprv->bv_offset + bprv->bv_len) &
                lim->virt_boundary_mask) != 0;
}

/*
 * Would placing a bio_vec that starts at @offset after @bprv leave a gap in
 * the SG list?  Most drivers don't care about this, but some do; a queue
 * with no virt_boundary_mask never reports a gap.
 */
static inline bool bvec_gap_to_prev(const struct queue_limits *lim,
                struct bio_vec *bprv, unsigned int offset)
{
        return lim->virt_boundary_mask &&
                __bvec_gap_to_prev(lim, bprv, offset);
}

/*
 * A request can take part in merging unless it is a passthrough request,
 * its operation is FLUSH, WRITE_ZEROES or ZONE_APPEND, or it carries any of
 * the REQ_NOMERGE_FLAGS / RQF_NOMERGE_FLAGS bits.
 */
static inline bool rq_mergeable(struct request *rq)
{
        if (blk_rq_is_passthrough(rq))
                return false;

        switch (req_op(rq)) {
        case REQ_OP_FLUSH:
        case REQ_OP_WRITE_ZEROES:
        case REQ_OP_ZONE_APPEND:
                return false;
        default:
                break;
        }

        return !(rq->cmd_flags & REQ_NOMERGE_FLAGS) &&
                !(rq->rq_flags & RQF_NOMERGE_FLAGS);
}

/*
 * DISCARD merges are handled in one of two ways:
 *  1) If max_discard_segments > 1, the driver treats every bio as a range
 *     and sends the bios to the controller together.  The ranges don't need
 *     to be contiguous.
 *  2) Otherwise the request is merged like a normal read/write request and
 *     the ranges need to be contiguous.
 *
 * Returns true only for case 1.
 */
static inline bool blk_discard_mergable(struct request *req)
{
        return req_op(req) == REQ_OP_DISCARD &&
                queue_max_discard_segments(req->q) > 1;
}

/*
 * Segment limit that applies to @rq: the queue's discard segment limit for
 * REQ_OP_DISCARD, the queue's ordinary segment limit for everything else.
 */
static inline unsigned int blk_rq_get_max_segments(struct request *rq)
{
        struct request_queue *q = rq->q;

        return req_op(rq) == REQ_OP_DISCARD ?
                queue_max_discard_segments(q) : queue_max_segments(q);
}

/*
 * Sector limit of @q for operation @op:
 *  - DISCARD and SECURE_ERASE: max_discard_sectors, capped at
 *    UINT_MAX >> SECTOR_SHIFT,
 *  - WRITE_ZEROES: max_write_zeroes_sectors,
 *  - anything else: max_sectors.
 */
static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
                                                     enum req_op op)
{
        bool discard_like = op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE;

        if (unlikely(discard_like))
                return min(q->limits.max_discard_sectors,
                           UINT_MAX >> SECTOR_SHIFT);

        if (unlikely(op == REQ_OP_WRITE_ZEROES))
                return q->limits.max_write_zeroes_sectors;

        return q->limits.max_sectors;
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
void bio_integrity_free(struct bio *bio);
/*
 * Returns true for a bio without an integrity payload; otherwise the result
 * of __bio_integrity_endio() is passed through.
 */
static inline bool bio_integrity_endio(struct bio *bio)
{
        return !bio_integrity(bio) || __bio_integrity_endio(bio);
}

bool blk_integrity_merge_rq(struct request_queue *, struct request *,
                struct request *);
bool blk_integrity_merge_bio(struct request_queue *, struct request *,
                struct bio *);

/*
 * Gap check on the integrity vectors for appending @next behind @req: the
 * last integrity vector of req->bio is tested against the offset of the
 * first integrity vector of @next.  Both bios are dereferenced as having an
 * integrity payload.
 */
static inline bool integrity_req_gap_back_merge(struct request *req,
                struct bio *next)
{
        struct bio_integrity_payload *head = bio_integrity(req->bio);
        struct bio_integrity_payload *tail = bio_integrity(next);
        struct bio_vec *last = &head->bip_vec[head->bip_vcnt - 1];

        return bvec_gap_to_prev(&req->q->limits, last,
                                tail->bip_vec[0].bv_offset);
}

/*
 * Gap check on the integrity vectors for putting @bio in front of @req: the
 * last integrity vector of @bio is tested against the offset of the first
 * integrity vector of req->bio.  Both bios are dereferenced as having an
 * integrity payload.
 */
static inline bool integrity_req_gap_front_merge(struct request *req,
                struct bio *bio)
{
        struct bio_integrity_payload *head = bio_integrity(bio);
        struct bio_integrity_payload *tail = bio_integrity(req->bio);
        struct bio_vec *last = &head->bip_vec[head->bip_vcnt - 1];

        return bvec_gap_to_prev(&req->q->limits, last,
                                tail->bip_vec[0].bv_offset);
}

extern const struct attribute_group blk_integrity_attr_group;
#else /* CONFIG_BLK_DEV_INTEGRITY */
/*
 * Stubs used when CONFIG_BLK_DEV_INTEGRITY is not set: the merge checks
 * always allow the merge, the gap checks never report a gap,
 * bio_integrity_endio() always returns true and the remaining hooks do
 * nothing.
 */
static inline bool blk_integrity_merge_rq(struct request_queue *rq,
                struct request *r1, struct request *r2)
{
        return true;
}
static inline bool blk_integrity_merge_bio(struct request_queue *rq,
                struct request *r, struct bio *b)
{
        return true;
}
static inline bool integrity_req_gap_back_merge(struct request *req,
                struct bio *next)
{
        return false;
}
static inline bool integrity_req_gap_front_merge(struct request *req,
                struct bio *bio)
{
        return false;
}

static inline void blk_flush_integrity(void)
{
}
static inline bool bio_integrity_endio(struct bio *bio)
{
        return true;
}
static inline void bio_integrity_free(struct bio *bio)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */

unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);

/*
 * Result type of bio_attempt_back_merge().
 * NOTE(review): the meaning of each value is set by the merge code, which is
 * not in this header; the per-value notes are read off the names only.
 */
enum bio_merge_status {
        BIO_MERGE_OK,           /* presumably: the bio was merged - confirm */
        BIO_MERGE_NONE,         /* presumably: no merge was attempted - confirm */
        BIO_MERGE_FAILED,       /* presumably: a merge was tried and failed - confirm */
};

enum bio_merge_status bio_attempt_back_merge(struct request *req,
                struct bio *bio, unsigned int nr_segs);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs);
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
                        struct bio *bio, unsigned int nr_segs);

/*
 * Plug flush limits
 */
#define BLK_MAX_REQUEST_COUNT        32
#define BLK_PLUG_FLUSH_SIZE        (128 * 1024)

/*
 * Internal elevator interface
 */
#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED)

bool blk_insert_flush(struct request *rq);

int elevator_switch(struct request_queue *q, struct elevator_type *new_e);
void elevator_disable(struct request_queue *q);
void elevator_exit(struct request_queue *q);
int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q);

ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
                char *buf);
ssize_t part_stat_show(struct device *dev, struct device_attribute *attr,
                char *buf);
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
                char *buf);
ssize_t part_fail_show(struct device *dev, struct device_attribute *attr,
                char *buf);
ssize_t part_fail_store(struct device *dev, struct device_attribute *attr,
                const char *buf, size_t count);
ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
                                const char *, size_t);

static inline bool bio_may_exceed_limits(struct bio *bio,
                                         const struct queue_limits *lim)
{
        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
        case REQ_OP_WRITE_ZEROES:
                return true; /* non-trivial splitting decisions */
        default:
                break;
        }

        /*
         * All drivers must accept single-segments bios that are <= PAGE_SIZE.
         * This is a quick and dirty check that relies on the fact that
         * bi_io_vec[0] is always valid if a bio has data.  The check might
         * lead to occasional false negatives when bios are cloned, but compared
         * to the performance impact of cloned bios themselves the loop below
         * doesn't matter anyway.
         */
        return lim->chunk_sectors || bio->bi_vcnt != 1 ||
                bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
}

struct bio *__bio_split_to_limits(struct bio *bio,
                                  const struct queue_limits *lim,
                                  unsigned int *nr_segs);
int ll_back_merge_fn(struct request *req, struct bio *bio,
                unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
                                struct request *next);
unsigned int blk_recalc_rq_segments(struct request *rq);
bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio);

int blk_set_default_limits(struct queue_limits *lim);
int blk_dev_init(void);

/*
 * A request contributes to IO statistics only when it carries RQF_IO_STAT
 * and is not a passthrough request.
 */
static inline bool blk_do_io_stat(struct request *rq)
{
        if (!(rq->rq_flags & RQF_IO_STAT))
                return false;

        return !blk_rq_is_passthrough(rq);
}

void update_io_ticks(struct block_device *part, unsigned long now, bool end);
unsigned int part_in_flight(struct block_device *part);

/*
 * Mark @req with REQ_NOMERGE and, if @q still has it recorded as its
 * last_merge candidate, clear that record.
 */
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{
        req->cmd_flags |= REQ_NOMERGE;

        if (q->last_merge != req)
                return;

        q->last_merge = NULL;
}

/*
 * Internal io_context interface
 */
struct io_cq *ioc_find_get_icq(struct request_queue *q);
struct io_cq *ioc_lookup_icq(struct request_queue *q);
#ifdef CONFIG_BLK_ICQ
void ioc_clear_queue(struct request_queue *q);
#else
/* Without CONFIG_BLK_ICQ, ioc_clear_queue() is an empty stub. */
static inline void ioc_clear_queue(struct request_queue *q)
{
}
#endif /* CONFIG_BLK_ICQ */

struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);

/*
 * Can bios submitted to @q ever need bounce buffering?
 *
 * That requires all of:
 *  - CONFIG_BOUNCE is built in,
 *  - the queue asked for BLK_BOUNCE_HIGH,
 *  - page frames actually exist above the low-memory limit, i.e.
 *    max_low_pfn < max_pfn.  When the two are equal there is no page the
 *    device could fail to reach, so nothing can need bouncing.
 */
static inline bool blk_queue_may_bounce(struct request_queue *q)
{
        return IS_ENABLED(CONFIG_BOUNCE) &&
                q->limits.bounce == BLK_BOUNCE_HIGH &&
                max_low_pfn < max_pfn;
}

/*
 * Hand @bio to __blk_queue_bounce() and return its result, but only when
 * blk_queue_may_bounce() holds for @q and the bio carries data.  In every
 * other case @bio is returned untouched.
 */
static inline struct bio *blk_queue_bounce(struct bio *bio,
                struct request_queue *q)
{
        bool bounce = blk_queue_may_bounce(q) && bio_has_data(bio);

        return unlikely(bounce) ? __blk_queue_bounce(bio, q) : bio;
}

#ifdef CONFIG_BLK_DEV_ZONED
void disk_init_zone_resources(struct gendisk *disk);
void disk_free_zone_resources(struct gendisk *disk);
/* True when @bio carries the BIO_ZONE_WRITE_PLUGGING flag. */
static inline bool bio_zone_write_plugging(struct bio *bio)
{
        return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
}
/*
 * True for a bio whose operation is REQ_OP_ZONE_APPEND, and also for a bio
 * flagged BIO_EMULATES_ZONE_APPEND.
 */
static inline bool bio_is_zone_append(struct bio *bio)
{
        if (bio_op(bio) == REQ_OP_ZONE_APPEND)
                return true;

        return bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
}
void blk_zone_write_plug_bio_merged(struct bio *bio);
void blk_zone_write_plug_init_request(struct request *rq);
/*
 * Copy the request's sector into @bio's iterator for two kinds of bio:
 *
 *  - zone append requests, where the request sector indicates the location
 *    at which the BIO data was written and is returned to the BIO issuer
 *    through the BIO iter sector;
 *  - plugged zone writes (which include emulated zone append), where the
 *    original BIO sector is needed so that blk_zone_write_plug_bio_endio()
 *    can look up the zone write plug.
 *
 * Any other bio is left alone.
 */
static inline void blk_zone_update_request_bio(struct request *rq,
                                               struct bio *bio)
{
        bool wants_sector = req_op(rq) == REQ_OP_ZONE_APPEND ||
                            bio_zone_write_plugging(bio);

        if (wants_sector)
                bio->bi_iter.bi_sector = rq->__sector;
}
void blk_zone_write_plug_bio_endio(struct bio *bio);
/*
 * For write BIOs to zoned devices that went through zone write plugging,
 * signal the completion of the BIO so that the next write BIO can be
 * submitted by zone write plugging.  Other bios are ignored.
 */
static inline void blk_zone_bio_endio(struct bio *bio)
{
        if (!bio_zone_write_plugging(bio))
                return;

        blk_zone_write_plug_bio_endio(bio);
}

void blk_zone_write_plug_finish_request(struct request *rq);
/*
 * Call blk_zone_write_plug_finish_request() for requests that carry
 * RQF_ZONE_WRITE_PLUGGING; do nothing for all others.
 */
static inline void blk_zone_finish_request(struct request *rq)
{
        if (!(rq->rq_flags & RQF_ZONE_WRITE_PLUGGING))
                return;

        blk_zone_write_plug_finish_request(rq);
}
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
                unsigned long arg);
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
                unsigned int cmd, unsigned long arg);
#else /* CONFIG_BLK_DEV_ZONED */
/*
 * Stubs used when CONFIG_BLK_DEV_ZONED is not set: the resource and plugging
 * hooks do nothing, the two predicates always return false and both zone
 * ioctls fail with -ENOTTY.
 */
static inline void disk_init_zone_resources(struct gendisk *disk)
{
}
static inline void disk_free_zone_resources(struct gendisk *disk)
{
}
static inline bool bio_zone_write_plugging(struct bio *bio)
{
        return false;
}
static inline bool bio_is_zone_append(struct bio *bio)
{
        return false;
}
static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
{
}
static inline void blk_zone_write_plug_init_request(struct request *rq)
{
}
static inline void blk_zone_update_request_bio(struct request *rq,
                                               struct bio *bio)
{
}
static inline void blk_zone_bio_endio(struct bio *bio)
{
}
static inline void blk_zone_finish_request(struct request *rq)
{
}
static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
                unsigned int cmd, unsigned long arg)
{
        return -ENOTTY;
}
static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev,
                blk_mode_t mode, unsigned int cmd, unsigned long arg)
{
        return -ENOTTY;
}
#endif /* CONFIG_BLK_DEV_ZONED */

struct block_device *bdev_alloc(struct gendisk *disk, u8 partno);
void bdev_add(struct block_device *bdev, dev_t dev);
void bdev_unhash(struct block_device *bdev);
void bdev_drop(struct block_device *bdev);

int blk_alloc_ext_minor(void);
void blk_free_ext_minor(unsigned int minor);
#define ADDPART_FLAG_NONE        0
#define ADDPART_FLAG_RAID        1
#define ADDPART_FLAG_WHOLEDISK        2
int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
                sector_t length);
int bdev_del_partition(struct gendisk *disk, int partno);
int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
                sector_t length);
void drop_partition(struct block_device *part);

void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);

struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
                struct lock_class_key *lkclass);

int bio_add_hw_page(struct request_queue *q, struct bio *bio,
                struct page *page, unsigned int len, unsigned int offset,
                unsigned int max_sectors, bool *same_page);

/*
 * Clean up @page on behalf of @bio: if the bio is flagged BIO_PAGE_PINNED
 * the page is unpinned with unpin_user_page(); for any other bio nothing is
 * done to the page here.
 */
static inline void bio_release_page(struct bio *bio, struct page *page)
{
        if (!bio_flagged(bio, BIO_PAGE_PINNED))
                return;

        unpin_user_page(page);
}

struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id);

int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);

int disk_alloc_events(struct gendisk *disk);
void disk_add_events(struct gendisk *disk);
void disk_del_events(struct gendisk *disk);
void disk_release_events(struct gendisk *disk);
void disk_block_events(struct gendisk *disk);
void disk_unblock_events(struct gendisk *disk);
void disk_flush_events(struct gendisk *disk, unsigned int mask);
extern struct device_attribute dev_attr_events;
extern struct device_attribute dev_attr_events_async;
extern struct device_attribute dev_attr_events_poll_msecs;

extern struct attribute_group blk_trace_attr_group;

blk_mode_t file_to_blk_mode(struct file *file);
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
                loff_t lstart, loff_t lend);
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);

extern const struct address_space_operations def_blk_aops;

int disk_register_independent_access_ranges(struct gendisk *disk);
void disk_unregister_independent_access_ranges(struct gendisk *disk);

#ifdef CONFIG_FAIL_MAKE_REQUEST
bool should_fail_request(struct block_device *part, unsigned int bytes);
#else /* CONFIG_FAIL_MAKE_REQUEST */
/* Without CONFIG_FAIL_MAKE_REQUEST no request is ever selected to fail. */
static inline bool should_fail_request(struct block_device *part,
                                        unsigned int bytes)
{
        return false;
}
#endif /* CONFIG_FAIL_MAKE_REQUEST */

/*
 * Optimized request reference counting. Ideally we'd make timeouts be more
 * clever, as that's the only reason we need references at all... But until
 * this happens, this is faster than using refcount_t. Also see:
 *
 * abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count")
 */

/*
 * True when the reference count of @req is zero or is one of the 127 values
 * just below unsigned wrap-around (-127..-1 when read as an int): adding
 * 127 to such a value, as an unsigned int, gives a result of at most 127.
 *
 * @req is parenthesized so that any pointer expression, not only a plain
 * identifier, can be passed.
 */
#define req_ref_zero_or_close_to_overflow(req)        \
        ((unsigned int) atomic_read(&(req)->ref) + 127u <= 127u)

/* Thin wrapper: atomic_inc_not_zero() on req->ref, result passed through. */
static inline bool req_ref_inc_not_zero(struct request *req)
{
        return atomic_inc_not_zero(&req->ref);
}

/*
 * Thin wrapper: atomic_dec_and_test() on req->ref, result passed through.
 * Warns once beforehand if the count is already zero or close to overflow.
 */
static inline bool req_ref_put_and_test(struct request *req)
{
        WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
        return atomic_dec_and_test(&req->ref);
}

/* Set req->ref to @value with atomic_set(). */
static inline void req_ref_set(struct request *req, int value)
{
        atomic_set(&req->ref, value);
}

/* Read req->ref with atomic_read(). */
static inline int req_ref_read(struct request *req)
{
        return atomic_read(&req->ref);
}

/*
 * Current time in nanoseconds, cached in the task's plug where possible.
 *
 * With no plug on the current task, or when not running in task context,
 * this is a plain ktime_get_ns().  Otherwise the value stored in
 * plug->cur_ktime is returned; it is filled from ktime_get_ns() (and
 * PF_BLOCK_TS is set on the task) the first time it is found to be zero.
 */
static inline u64 blk_time_get_ns(void)
{
        struct blk_plug *plug = current->plug;

        if (plug && in_task()) {
                /*
                 * 0 could very well be a valid time, but rather than flag
                 * "this is a valid timestamp" separately, just accept that
                 * we'll do an extra ktime_get_ns() if we just happen to get
                 * 0 as the current time.
                 */
                if (plug->cur_ktime == 0) {
                        plug->cur_ktime = ktime_get_ns();
                        current->flags |= PF_BLOCK_TS;
                }
                return plug->cur_ktime;
        }

        return ktime_get_ns();
}

static inline ktime_t blk_time_get(void)
{
        return ns_to_ktime(blk_time_get_ns());
}

/*
 * Layout of the 64-bit bio issue value, from the most significant bit:
 *    1 bit:  reserved for other usage, see below
 *   12 bits: original size of bio
 *   51 bits: issue time of bio
 */
#define BIO_ISSUE_RES_BITS      1
#define BIO_ISSUE_SIZE_BITS     12
#define BIO_ISSUE_RES_SHIFT     (64 - BIO_ISSUE_RES_BITS)
#define BIO_ISSUE_SIZE_SHIFT    (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS)
/* Every bit from the start of the reserved field upwards. */
#define BIO_ISSUE_RES_MASK      (~0ULL << BIO_ISSUE_RES_SHIFT)
/* Every bit below the size field. */
#define BIO_ISSUE_TIME_MASK     (~0ULL >> (64 - BIO_ISSUE_SIZE_SHIFT))
/* Whatever is left between the time field and the reserved field. */
#define BIO_ISSUE_SIZE_MASK     \
        (~(BIO_ISSUE_TIME_MASK | BIO_ISSUE_RES_MASK))

/* Reserved bit for blk-throtl */
#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63)

/* Keep only the bits of @time that fall inside BIO_ISSUE_TIME_MASK. */
static inline u64 __bio_issue_time(u64 time)
{
        return time & BIO_ISSUE_TIME_MASK;
}

/* Extract the issue-time field from issue->value. */
static inline u64 bio_issue_time(struct bio_issue *issue)
{
        return __bio_issue_time(issue->value);
}

static inline sector_t bio_issue_size(struct bio_issue *issue)
{
        return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT);
}

/*
 * Rebuild issue->value from three parts:
 *  - the reserved bits already stored in issue->value, kept as they are,
 *  - the current blk_time_get_ns() reading, truncated to the time field,
 *  - @size, truncated to BIO_ISSUE_SIZE_BITS bits and placed in the size
 *    field.
 */
static inline void bio_issue_init(struct bio_issue *issue,
                                       sector_t size)
{
        u64 res_bits = issue->value & BIO_ISSUE_RES_MASK;
        u64 time_bits = blk_time_get_ns() & BIO_ISSUE_TIME_MASK;
        u64 size_bits;

        size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1;
        size_bits = (u64)size << BIO_ISSUE_SIZE_SHIFT;

        issue->value = res_bits | time_bits | size_bits;
}

void bdev_release(struct file *bdev_file);
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
              const struct blk_holder_ops *hops, struct file *bdev_file);
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder);

#endif /* BLK_INTERNAL_H */



































































































































































































































































    1 
























































































    1 




































    1 

    1 












    1 






    1 



























































































































































    1 
    1 











    1 


    1 






    1 








    1 




    1 





    1 

    1 




















    1 








    1 









    1 


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001-2003 Intel Corp.
 *
 * This file is part of the SCTP kernel implementation
 *
 * The base lksctp header.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Xingang Guo           <xingang.guo@intel.com>
 *    Jon Grimm             <jgrimm@us.ibm.com>
 *    Daisy Chang           <daisyc@us.ibm.com>
 *    Sridhar Samudrala     <sri@us.ibm.com>
 *    Ardelle Fan           <ardelle.fan@intel.com>
 *    Ryan Layer            <rmlayer@us.ibm.com>
 *    Kevin Gao             <kevin.gao@intel.com> 
 */

#ifndef __net_sctp_h__
#define __net_sctp_h__

/* Header Strategy.
 *    Start getting some control over the header file dependencies:
 *       includes
 *       constants
 *       structs
 *       prototypes
 *       macros, externs, and inlines
 *
 *   Move test_frame specific items out of the kernel headers
 *   and into the test frame headers.   This is not perfect in any sense
 *   and will continue to evolve.
 */

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/in.h>
#include <linux/tty.h>
#include <linux/proc_fs.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>
#include <linux/idr.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_route.h>
#endif

#include <linux/uaccess.h>
#include <asm/page.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/sctp/structs.h>
#include <net/sctp/constants.h>

#ifdef CONFIG_IP_SCTP_MODULE
#define SCTP_PROTOSW_FLAG 0
#else /* static! */
#define SCTP_PROTOSW_FLAG INET_PROTOSW_PERMANENT
#endif

/*
 * Function declarations.
 */

/*
 * sctp/protocol.c
 */
int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *addr,
                              enum sctp_scope, gfp_t gfp, int flags);
struct sctp_pf *sctp_get_pf_specific(sa_family_t family);
int sctp_register_pf(struct sctp_pf *, sa_family_t);
void sctp_addr_wq_mgmt(struct net *, struct sctp_sockaddr_entry *, int);
int sctp_udp_sock_start(struct net *net);
void sctp_udp_sock_stop(struct net *net);

/*
 * sctp/socket.c
 */
int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr,
                      int addr_len, int flags);
int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb);
int sctp_inet_listen(struct socket *sock, int backlog);
void sctp_write_space(struct sock *sk);
void sctp_data_ready(struct sock *sk);
__poll_t sctp_poll(struct file *file, struct socket *sock,
                poll_table *wait);
void sctp_sock_rfree(struct sk_buff *skb);
void sctp_copy_sock(struct sock *newsk, struct sock *sk,
                    struct sctp_association *asoc);
extern struct percpu_counter sctp_sockets_allocated;
int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *);
struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int *);

typedef int (*sctp_callback_t)(struct sctp_endpoint *, struct sctp_transport *, void *);
void sctp_transport_walk_start(struct rhashtable_iter *iter);
void sctp_transport_walk_stop(struct rhashtable_iter *iter);
struct sctp_transport *sctp_transport_get_next(struct net *net,
                        struct rhashtable_iter *iter);
struct sctp_transport *sctp_transport_get_idx(struct net *net,
                        struct rhashtable_iter *iter, int pos);
int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net,
                                  const union sctp_addr *laddr,
                                  const union sctp_addr *paddr, void *p, int dif);
int sctp_transport_traverse_process(sctp_callback_t cb, sctp_callback_t cb_done,
                                    struct net *net, int *pos, void *p);
int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p);
int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
                       struct sctp_info *info);

/*
 * sctp/primitive.c
 */
int sctp_primitive_ASSOCIATE(struct net *, struct sctp_association *, void *arg);
int sctp_primitive_SHUTDOWN(struct net *, struct sctp_association *, void *arg);
int sctp_primitive_ABORT(struct net *, struct sctp_association *, void *arg);
int sctp_primitive_SEND(struct net *, struct sctp_association *, void *arg);
int sctp_primitive_REQUESTHEARTBEAT(struct net *, struct sctp_association *, void *arg);
int sctp_primitive_ASCONF(struct net *, struct sctp_association *, void *arg);
int sctp_primitive_RECONF(struct net *net, struct sctp_association *asoc,
                          void *arg);

/*
 * sctp/input.c
 */
int sctp_rcv(struct sk_buff *skb);
int sctp_v4_err(struct sk_buff *skb, u32 info);
int sctp_hash_endpoint(struct sctp_endpoint *ep);
void sctp_unhash_endpoint(struct sctp_endpoint *);
struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *,
                             struct sctphdr *, struct sctp_association **,
                             struct sctp_transport **);
void sctp_err_finish(struct sock *, struct sctp_transport *);
int sctp_udp_v4_err(struct sock *sk, struct sk_buff *skb);
int sctp_udp_v6_err(struct sock *sk, struct sk_buff *skb);
void sctp_icmp_frag_needed(struct sock *, struct sctp_association *,
                           struct sctp_transport *t, __u32 pmtu);
void sctp_icmp_redirect(struct sock *, struct sctp_transport *,
                        struct sk_buff *);
void sctp_icmp_proto_unreachable(struct sock *sk,
                                 struct sctp_association *asoc,
                                 struct sctp_transport *t);
int sctp_transport_hashtable_init(void);
void sctp_transport_hashtable_destroy(void);
int sctp_hash_transport(struct sctp_transport *t);
void sctp_unhash_transport(struct sctp_transport *t);
struct sctp_transport *sctp_addrs_lookup_transport(
                                struct net *net,
                                const union sctp_addr *laddr,
                                const union sctp_addr *paddr,
                                int dif, int sdif);
struct sctp_transport *sctp_epaddr_lookup_transport(
                                const struct sctp_endpoint *ep,
                                const union sctp_addr *paddr);
bool sctp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif);

/*
 * sctp/proc.c
 */
int __net_init sctp_proc_init(struct net *net);

/*
 * sctp/offload.c
 */
int sctp_offload_init(void);

/*
 * sctp/stream_sched.c
 */
void sctp_sched_ops_init(void);

/*
 * sctp/stream.c
 */
int sctp_send_reset_streams(struct sctp_association *asoc,
                            struct sctp_reset_streams *params);
int sctp_send_reset_assoc(struct sctp_association *asoc);
int sctp_send_add_streams(struct sctp_association *asoc,
                          struct sctp_add_streams *params);

/*
 * Module global variables
 */

/*
 * sctp/protocol.c
 */
extern struct kmem_cache *sctp_chunk_cachep __read_mostly;
extern struct kmem_cache *sctp_bucket_cachep __read_mostly;
extern long sysctl_sctp_mem[3];
extern int sysctl_sctp_rmem[3];
extern int sysctl_sctp_wmem[3];

/*
 *  Section:  Macros, externs, and inlines
 */

/* SCTP SNMP MIB stats handlers.
 *
 * Each macro forwards to the generic SNMP helper of the matching name,
 * always targeting (net)->sctp.sctp_statistics.  @field is passed through
 * unchanged (presumably one of the SCTP_MIB_* indices).
 */
#define SCTP_INC_STATS(net, field)        SNMP_INC_STATS((net)->sctp.sctp_statistics, field)
#define __SCTP_INC_STATS(net, field)        __SNMP_INC_STATS((net)->sctp.sctp_statistics, field)
#define SCTP_DEC_STATS(net, field)        SNMP_DEC_STATS((net)->sctp.sctp_statistics, field)

/* sctp mib definitions
 *
 * Indices of the SCTP statistics counters.  SCTP_MIB_NUM is pinned to 0 and
 * every later entry takes the next value, so __SCTP_MIB_MAX (the last
 * enumerator) equals the number of entries before it.  Trailing comments
 * give the corresponding MIB object name where one was recorded.
 */
enum {
        SCTP_MIB_NUM = 0,
        SCTP_MIB_CURRESTAB,                        /* CurrEstab */
        SCTP_MIB_ACTIVEESTABS,                        /* ActiveEstabs */
        SCTP_MIB_PASSIVEESTABS,                        /* PassiveEstabs */
        SCTP_MIB_ABORTEDS,                        /* Aborteds */
        SCTP_MIB_SHUTDOWNS,                        /* Shutdowns */
        SCTP_MIB_OUTOFBLUES,                        /* OutOfBlues */
        SCTP_MIB_CHECKSUMERRORS,                /* ChecksumErrors */
        SCTP_MIB_OUTCTRLCHUNKS,                        /* OutCtrlChunks */
        SCTP_MIB_OUTORDERCHUNKS,                /* OutOrderChunks */
        SCTP_MIB_OUTUNORDERCHUNKS,                /* OutUnorderChunks */
        SCTP_MIB_INCTRLCHUNKS,                        /* InCtrlChunks */
        SCTP_MIB_INORDERCHUNKS,                        /* InOrderChunks */
        SCTP_MIB_INUNORDERCHUNKS,                /* InUnorderChunks */
        SCTP_MIB_FRAGUSRMSGS,                        /* FragUsrMsgs */
        SCTP_MIB_REASMUSRMSGS,                        /* ReasmUsrMsgs */
        SCTP_MIB_OUTSCTPPACKS,                        /* OutSCTPPacks */
        SCTP_MIB_INSCTPPACKS,                        /* InSCTPPacks */
        SCTP_MIB_T1_INIT_EXPIREDS,
        SCTP_MIB_T1_COOKIE_EXPIREDS,
        SCTP_MIB_T2_SHUTDOWN_EXPIREDS,
        SCTP_MIB_T3_RTX_EXPIREDS,
        SCTP_MIB_T4_RTO_EXPIREDS,
        SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS,
        SCTP_MIB_DELAY_SACK_EXPIREDS,
        SCTP_MIB_AUTOCLOSE_EXPIREDS,
        SCTP_MIB_T1_RETRANSMITS,
        SCTP_MIB_T3_RETRANSMITS,
        SCTP_MIB_PMTUD_RETRANSMITS,
        SCTP_MIB_FAST_RETRANSMITS,
        SCTP_MIB_IN_PKT_SOFTIRQ,
        SCTP_MIB_IN_PKT_BACKLOG,
        SCTP_MIB_IN_PKT_DISCARDS,
        SCTP_MIB_IN_DATA_CHUNK_DISCARDS,
        __SCTP_MIB_MAX
};

/* Counter storage: one unsigned long slot per enumerator that precedes
 * __SCTP_MIB_MAX.  SCTP_MIB_MAX is just a public alias for that sentinel.
 */
#define SCTP_MIB_MAX    __SCTP_MIB_MAX
struct sctp_mib {
        unsigned long   mibs[SCTP_MIB_MAX];
};

/* Track the largest RTO seen on an association and the address it was
 * seen on.
 *
 * If @trans->rto exceeds @asoc->stats.max_obs_rto, the new maximum is
 * stored and @asoc->stats.obs_rto_ipaddr is rewritten: first zeroed over
 * sizeof(struct sockaddr_storage), then filled with the leading
 * @trans->af_specific->sockaddr_len bytes of @trans->ipaddr.  Otherwise
 * nothing is modified.
 */
static inline void sctp_max_rto(struct sctp_association *asoc,
                                struct sctp_transport *trans)
{
        /* Not a new maximum: leave the recorded stats alone. */
        if (!(asoc->stats.max_obs_rto < (__u64)trans->rto))
                return;

        asoc->stats.max_obs_rto = trans->rto;

        /* Clear the whole slot so no stale bytes survive a shorter address. */
        memset(&asoc->stats.obs_rto_ipaddr, 0, sizeof(struct sockaddr_storage));
        memcpy(&asoc->stats.obs_rto_ipaddr, &trans->ipaddr,
               trans->af_specific->sockaddr_len);
}

/*
 * Macros for keeping a global reference of object allocations.
 *
 * With CONFIG_SCTP_DBG_OBJCNT set, each tracked object kind has a global
 * atomic_t named sctp_dbg_objcnt_<name>, and the INC/DEC macros adjust it
 * atomically.  Without it, the INC/DEC macros expand to nothing and
 * sctp_dbg_objcnt_init() is an empty inline.
 */
#ifdef CONFIG_SCTP_DBG_OBJCNT

extern atomic_t sctp_dbg_objcnt_sock;
extern atomic_t sctp_dbg_objcnt_ep;
extern atomic_t sctp_dbg_objcnt_assoc;
extern atomic_t sctp_dbg_objcnt_transport;
extern atomic_t sctp_dbg_objcnt_chunk;
extern atomic_t sctp_dbg_objcnt_bind_addr;
extern atomic_t sctp_dbg_objcnt_bind_bucket;
extern atomic_t sctp_dbg_objcnt_addr;
extern atomic_t sctp_dbg_objcnt_datamsg;
extern atomic_t sctp_dbg_objcnt_keys;

/* Macros to atomically increment/decrement objcnt counters.
 * SCTP_DBG_OBJCNT(name) expands to the counter's definition, initialised
 * to zero; note it carries no trailing semicolon.
 */
#define SCTP_DBG_OBJCNT_INC(name) \
atomic_inc(&sctp_dbg_objcnt_## name)
#define SCTP_DBG_OBJCNT_DEC(name) \
atomic_dec(&sctp_dbg_objcnt_## name)
#define SCTP_DBG_OBJCNT(name) \
atomic_t sctp_dbg_objcnt_## name = ATOMIC_INIT(0)

/* Macro to help create new entries in the global array of
 * objcnt counters.  Produces a designated initializer pairing the
 * stringified name (.label) with the counter's address (.counter).
 */
#define SCTP_DBG_OBJCNT_ENTRY(name) \
{.label= #name, .counter= &sctp_dbg_objcnt_## name}

void sctp_dbg_objcnt_init(struct net *);

#else

/* Object counting compiled out: these expand to nothing. */
#define SCTP_DBG_OBJCNT_INC(name)
#define SCTP_DBG_OBJCNT_DEC(name)

static inline void sctp_dbg_objcnt_init(struct net *net) { return; }

#endif /* CONFIG_SCTP_DBG_OBJCNT */

/* sysctl registration hooks.  With CONFIG_SYSCTL the real implementations
 * are declared here; without it, empty inline stand-ins are supplied so
 * callers need no conditionals.  The per-net register stub returns 0.
 */
#ifdef CONFIG_SYSCTL
void sctp_sysctl_register(void);
void sctp_sysctl_unregister(void);
int sctp_sysctl_net_register(struct net *net);
void sctp_sysctl_net_unregister(struct net *net);
#else /* !CONFIG_SYSCTL */
static inline void sctp_sysctl_register(void)
{
}

static inline void sctp_sysctl_unregister(void)
{
}

static inline int sctp_sysctl_net_register(struct net *net)
{
        return 0;
}

static inline void sctp_sysctl_net_unregister(struct net *net)
{
}
#endif /* CONFIG_SYSCTL */

/* Size of Supported Address Parameter for 'x' address types:
 * one struct sctp_paramhdr followed by 'x' 16-bit entries.
 */
#define SCTP_SAT_LEN(x) (sizeof(struct sctp_paramhdr) + (x) * sizeof(__u16))

/* IPv6 setup/teardown hooks.  When IPv6 is enabled the real functions are
 * declared; otherwise empty inline stand-ins are provided, and those that
 * return a status return 0.
 */
#if IS_ENABLED(CONFIG_IPV6)

void sctp_v6_pf_init(void);
void sctp_v6_pf_exit(void);
int sctp_v6_protosw_init(void);
void sctp_v6_protosw_exit(void);
int sctp_v6_add_protocol(void);
void sctp_v6_del_protocol(void);

#else /* !IS_ENABLED(CONFIG_IPV6) */

static inline void sctp_v6_pf_init(void)
{
}

static inline void sctp_v6_pf_exit(void)
{
}

static inline int sctp_v6_protosw_init(void)
{
        return 0;
}

static inline void sctp_v6_protosw_exit(void)
{
}

static inline int sctp_v6_add_protocol(void)
{
        return 0;
}

static inline void sctp_v6_del_protocol(void)
{
}

#endif /* IS_ENABLED(CONFIG_IPV6) */


/* Return the assoc_id of @asoc, or 0 when @asoc is NULL. */
static inline sctp_assoc_t sctp_assoc2id(const struct sctp_association *asoc)
{
        if (!asoc)
                return 0;

        return asoc->assoc_id;
}

/* Translate the in-kernel association state into the value reported
 * through the uapi.
 *
 * The uapi has always carried SCTP_EMPTY (=0) as a dummy first state,
 * while kernel space dropped it.  As a result SCTP_CLOSED and the states
 * after it are numbered from 1 for user space but from 0 in the kernel.
 * SCTP_EMPTY is exposed and cannot be removed without breaking
 * applications, so the value is shifted by one here.
 */
static inline enum sctp_sstat_state
sctp_assoc_to_state(const struct sctp_association *asoc)
{
        const int uapi_offset = 1;

        return asoc->state + uapi_offset;
}

/* Look up the association by its id.  */
struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id);

int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp);

/* A macro to walk a list of skbs.  Thin alias for skb_queue_walk_safe();
 * note the argument order differs: here it is (pos, head, tmp), while the
 * wrapped macro is invoked as (head, pos, tmp).
 */
#define sctp_skb_for_each(pos, head, tmp) \
        skb_queue_walk_safe(head, pos, tmp)

/**
 *        sctp_list_dequeue - detach and return the first entry of a list
 *        @list: list head to take the entry from
 *
 *        Returns %NULL when @list is empty.  Otherwise the first entry is
 *        unlinked with list_del_init(), so it is left as an empty,
 *        self-contained list node, and a pointer to it is returned.
 */

static inline struct list_head *sctp_list_dequeue(struct list_head *list)
{
        struct list_head *first;

        if (list_empty(list))
                return NULL;

        first = list->next;
        list_del_init(first);

        return first;
}

/* SCTP flavour of skb_set_owner_r().
 *
 * A private version is needed because receive buffer accounting for
 * bundled chunks is based on the ulpevent's rmem_len.
 *
 * The skb is first orphaned from any previous owner, then bound to @sk
 * with sctp_sock_rfree() as destructor.  rmem_len is added to
 * sk->sk_rmem_alloc and charged to the socket via sk_mem_charge(),
 * mirroring what skb_set_owner_r() does.
 */
static inline void sctp_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
        struct sctp_ulpevent *ev = sctp_skb2event(skb);

        /* Drop the old owner before installing the new one. */
        skb_orphan(skb);

        skb->sk = sk;
        skb->destructor = sctp_sock_rfree;

        atomic_add(ev->rmem_len, &sk->sk_rmem_alloc);
        sk_mem_charge(sk, ev->rmem_len);
}

/* Report whether @head holds exactly one entry; the result is whatever
 * list_is_singular() returns for @head.
 */
static inline int sctp_list_single_entry(struct list_head *head)
{
        int singular = list_is_singular(head);

        return singular;
}

/* A chunk counts as pending while its ->list node is linked somewhere,
 * i.e. while list_empty(&chunk->list) is false.
 */
static inline bool sctp_chunk_pending(const struct sctp_chunk *chunk)
{
        if (list_empty(&chunk->list))
                return false;

        return true;
}

/* Walk through a list of TLV parameters.  Don't trust the
 * individual parameter lengths and instead depend on
 * the chunk length to indicate when to stop.  Make sure
 * there is room for a param header too.
 *
 * @pos must offer a byte-pointer view (.v) and a parameter-header view
 * (.p) of the same cursor.  The walk starts right after the object @chunk
 * points to (chunk + 1) and is bounded by @end bytes counted from @chunk;
 * sctp_walk_params() takes @end from the chunk header's length field.
 *
 * The loop continues only while all of the following hold:
 *   - the header's length field lies completely inside the chunk,
 *   - the parameter, per its own length, ends inside the chunk,
 *   - that length is at least sizeof(struct sctp_paramhdr).
 * The cursor then advances by the length rounded with SCTP_PAD4().
 *
 * The macro arguments are evaluated several times and are not all
 * parenthesized, so pass plain lvalues or identifiers.
 */
#define sctp_walk_params(pos, chunk)\
_sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length))

#define _sctp_walk_params(pos, chunk, end)\
for (pos.v = (u8 *)(chunk + 1);\
     (pos.v + offsetof(struct sctp_paramhdr, length) + sizeof(pos.p->length) <=\
      (void *)chunk + end) &&\
     pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\
     ntohs(pos.p->length) >= sizeof(struct sctp_paramhdr);\
     pos.v += SCTP_PAD4(ntohs(pos.p->length)))

/* Walk the error causes that follow a chunk header.
 *
 * @err is positioned sizeof(struct sctp_chunkhdr) bytes past @chunk_hdr
 * and the walk is bounded by @end bytes counted from @chunk_hdr;
 * sctp_walk_errors() takes @end from the chunk header's length field.
 *
 * The loop continues only while the error header's length field lies
 * inside the chunk, the cause (per its own length) ends inside the chunk,
 * and that length is at least sizeof(struct sctp_errhdr).  The cursor
 * then advances by the length rounded with SCTP_PAD4().
 *
 * Arguments are evaluated several times; pass plain identifiers.
 */
#define sctp_walk_errors(err, chunk_hdr)\
_sctp_walk_errors((err), (chunk_hdr), ntohs((chunk_hdr)->length))

#define _sctp_walk_errors(err, chunk_hdr, end)\
for (err = (struct sctp_errhdr *)((void *)chunk_hdr + \
            sizeof(struct sctp_chunkhdr));\
     ((void *)err + offsetof(struct sctp_errhdr, length) + sizeof(err->length) <=\
      (void *)chunk_hdr + end) &&\
     (void *)err <= (void *)chunk_hdr + end - ntohs(err->length) &&\
     ntohs(err->length) >= sizeof(struct sctp_errhdr); \
     err = (struct sctp_errhdr *)((void *)err + SCTP_PAD4(ntohs(err->length))))

/* Walk the entries that follow the FWD-TSN header of @chunk.
 *
 * sctp_walk_fwdtsn() computes @end as the chunk length (from
 * chunk->chunk_hdr->length) minus sizeof(struct sctp_fwdtsn_chunk), i.e.
 * the number of bytes available after the fixed part.  @pos starts right
 * behind chunk->subh.fwdtsn_hdr and the loop runs while a whole
 * struct sctp_fwdtsn_skip still fits within those @end bytes; pos++ then
 * steps by the size of whatever type @pos points to (presumably
 * struct sctp_fwdtsn_skip -- confirm at the call sites).
 */
#define sctp_walk_fwdtsn(pos, chunk)\
_sctp_walk_fwdtsn((pos), (chunk), ntohs((chunk)->chunk_hdr->length) - sizeof(struct sctp_fwdtsn_chunk))

#define _sctp_walk_fwdtsn(pos, chunk, end)\
for (pos = (void *)(chunk->subh.fwdtsn_hdr + 1);\
     (void *)pos <= (void *)(chunk->subh.fwdtsn_hdr + 1) + end - sizeof(struct sctp_fwdtsn_skip);\
     pos++)

/* External references. */

extern struct proto sctp_prot;
extern struct proto sctpv6_prot;
void sctp_put_port(struct sock *sk);

extern struct idr sctp_assocs_id;
extern spinlock_t sctp_assocs_id_lock;

/* Static inline functions. */

/* Translate an IP version number into an address family constant:
 * 4 gives AF_INET, 6 gives AF_INET6, anything else gives 0.
 */
static inline int ipver2af(__u8 ipver)
{
        if (ipver == 4)
                return AF_INET;

        if (ipver == 6)
                return AF_INET6;

        return 0;
}

/* Translate an address parameter type into an address family constant:
 * SCTP_PARAM_IPV4_ADDRESS gives AF_INET, SCTP_PARAM_IPV6_ADDRESS gives
 * AF_INET6, any other type gives 0.
 */
static inline int param_type2af(__be16 type)
{
        if (type == SCTP_PARAM_IPV4_ADDRESS)
                return AF_INET;

        if (type == SCTP_PARAM_IPV6_ADDRESS)
                return AF_INET6;

        return 0;
}

/* Warning: The following hash functions assume a power of two 'size'. */
/* Bucket index for the SCTP port hash table: the namespace hash mix plus
 * the local port, masked with (sctp_port_hashsize - 1).  The mask only
 * selects a valid bucket when the table size is a power of two.
 */
static inline int sctp_phashfn(struct net *net, __u16 lport)
{
        return (lport + net_hash_mix(net)) & (sctp_port_hashsize - 1);
}

/* Bucket index for the endpoint hash table: the namespace hash mix plus
 * the local port, masked with (sctp_ep_hashsize - 1).  The mask only
 * selects a valid bucket when the table size is a power of two.
 */
static inline int sctp_ep_hashfn(struct net *net, __u16 lport)
{
        return (lport + net_hash_mix(net)) & (sctp_ep_hashsize - 1);
}

/* Iterate over the entries of hash chain @head with cursor @ep.  Alias for
 * hlist_for_each_entry() with the link member fixed to 'node', so the type
 * @ep points to must have an hlist member of that name.
 */
#define sctp_for_each_hentry(ep, head) \
        hlist_for_each_entry(ep, head, node)

/* Is a socket of this style? */
#define sctp_style(sk, style) __sctp_style((sk), (SCTP_SOCKET_##style))
static inline int __sctp_style(const struct sock *sk,
                               enum sctp_socket_type style)
{
        return sctp_sk(sk)->type == style;
}

/* Is the association in this state?
 *
 * sctp_state(asoc, FOO) pastes FOO onto SCTP_STATE_ and compares the
 * resulting constant with asoc->state.  Yields 1 on a match, 0 otherwise.
 */
#define sctp_state(asoc, state) __sctp_state((asoc), (SCTP_STATE_##state))
static inline int __sctp_state(const struct sctp_association *asoc,
                               enum sctp_state state)
{
        if (asoc->state != state)
                return 0;

        return 1;
}

/* Is the socket in this state?
 *
 * sctp_sstate(sk, FOO) pastes FOO onto SCTP_SS_ and compares the
 * resulting constant with sk->sk_state.  Yields 1 on a match, 0 otherwise.
 */
#define sctp_sstate(sk, state) __sctp_sstate((sk), (SCTP_SS_##state))
static inline int __sctp_sstate(const struct sock *sk,
                                enum sctp_sock_state state)
{
        if (sk->sk_state != state)
                return 0;

        return 1;
}

/* Map v4-mapped v6 address back to v4 address */
static inline void sctp_v6_map_v4(union sctp_addr *addr)
{
        addr->v4.sin_family = AF_INET;
        addr->v4.sin_port = addr->v6.sin6_port;
        addr->v4.sin_addr.s_addr = addr->v6.sin6_addr.s6_addr32[3];
}

/* Map v4 address to v4-mapped v6 address */
static inline void sctp_v4_map_v6(union sctp_addr *addr)
{
        __be16 port;

        port = addr->v4.sin_port;
        addr->v6.sin6_addr.s6_addr32[3] = addr->v4.sin_addr.s_addr;
        addr->v6.sin6_port = port;
        addr->v6.sin6_family = AF_INET6;
        addr->v6.sin6_flowinfo = 0;
        addr->v6.sin6_scope_id = 0;
        addr->v6.sin6_addr.s6_addr32[0] = 0;
        addr->v6.sin6_addr.s6_addr32[1] = 0;
        addr->v6.sin6_addr.s6_addr32[2] = htonl(0x0000ffff);
}

/* Validate the transport's cached route and return it.
 *
 * When a dst is cached and dst_check() against t->dst_cookie rejects it,
 * the cached entry is dropped with sctp_transport_dst_release().  The
 * function returns t->dst as it stands afterwards.
 */
static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport *t)
{
        struct dst_entry *cached = t->dst;

        if (cached && !dst_check(cached, t->dst_cookie))
                sctp_transport_dst_release(t);

        /* Re-read: the release above may have changed t->dst. */
        return t->dst;
}

/* Compute the maximum payload for a given MTU, or the total header
 * overhead when @mtu is zero.
 *
 * The overhead is the SCTP common header plus @extra, plus:
 *   - with @sp: the address family's network header length, and a UDP
 *     header when sp->udp_port is set and either no transport was given
 *     or the transport has an encap_port;
 *   - without @sp: an IPv6 header.
 *
 * A non-zero @mtu that does not exceed the overhead triggers a one-time
 * warning and is treated as equal to the overhead, giving a payload of 0.
 */
static inline __u32 __sctp_mtu_payload(const struct sctp_sock *sp,
                                       const struct sctp_transport *t,
                                       __u32 mtu, __u32 extra)
{
        __u32 overhead = sizeof(struct sctphdr) + extra;

        if (!sp) {
                overhead += sizeof(struct ipv6hdr);
        } else {
                overhead += sp->pf->af->net_header_len;
                if (sp->udp_port && (!t || t->encap_port))
                        overhead += sizeof(struct udphdr);
        }

        if (WARN_ON_ONCE(mtu && mtu <= overhead))
                mtu = overhead;

        if (!mtu)
                return overhead;

        return mtu - overhead;
}

/* Transport-less form of __sctp_mtu_payload(): passes NULL for the
 * transport and forwards @sp, @mtu and @extra unchanged.
 */
static inline __u32 sctp_mtu_payload(const struct sctp_sock *sp,
                                     __u32 mtu, __u32 extra)
{
        const struct sctp_transport *no_transport = NULL;

        return __sctp_mtu_payload(sp, no_transport, mtu, extra);
}

/* MTU of @dst as used by SCTP: raised to at least
 * SCTP_DEFAULT_MINSEGMENT, then passed through SCTP_TRUNC4().
 */
static inline __u32 sctp_dst_mtu(const struct dst_entry *dst)
{
        __u32 mtu = max_t(__u32, dst_mtu(dst), SCTP_DEFAULT_MINSEGMENT);

        return SCTP_TRUNC4(mtu);
}

/* Compare the transport's recorded path MTU with the one derived from
 * its current dst.
 *
 * Returns true when they already agree.  Otherwise t->pathmtu is updated
 * to the dst-derived value and false is returned.
 */
static inline bool sctp_transport_pmtu_check(struct sctp_transport *t)
{
        const __u32 pmtu = sctp_dst_mtu(t->dst);
        const bool unchanged = (t->pathmtu == pmtu);

        if (!unchanged)
                t->pathmtu = pmtu;

        return unchanged;
}

/* Payload available at the minimum segment size: sctp_mtu_payload() for
 * an MTU of SCTP_DEFAULT_MINSEGMENT with @datasize as the extra overhead.
 */
static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 datasize)
{
        const __u32 min_mtu = SCTP_DEFAULT_MINSEGMENT;

        return sctp_mtu_payload(sp, min_mtu, datasize);
}

static inline int sctp_transport_pl_hlen(struct sctp_transport *t)
{
        return __sctp_mtu_payload(sctp_sk(t->asoc->base.sk), t, 0, 0) -
               sizeof(struct sctphdr);
}

/* Bring the transport's PLPMTUD state in line with its configuration.
 *
 * Probing is wanted when a probe interval is set, SPP_PMTUD_ENABLE is in
 * param_flags, and the transport state is SCTP_ACTIVE or SCTP_UNKNOWN.
 *
 *  - Wanted and currently disabled: enter SCTP_PL_BASE with pmtu and
 *    probe_size at SCTP_BASE_PLPMTU and restart the probe timer.
 *  - Not wanted and currently enabled: stop the probe timer, dropping a
 *    transport reference if the timer was pending, and mark the state
 *    SCTP_PL_DISABLED.
 *  - Otherwise nothing changes.
 */
static inline void sctp_transport_pl_reset(struct sctp_transport *t)
{
        const bool want_probing =
                t->probe_interval && (t->param_flags & SPP_PMTUD_ENABLE) &&
                (t->state == SCTP_ACTIVE || t->state == SCTP_UNKNOWN);

        if (want_probing) {
                if (t->pl.state != SCTP_PL_DISABLED)
                        return;

                t->pl.state = SCTP_PL_BASE;
                t->pl.pmtu = SCTP_BASE_PLPMTU;
                t->pl.probe_size = SCTP_BASE_PLPMTU;
                sctp_transport_reset_probe_timer(t);
                return;
        }

        if (t->pl.state == SCTP_PL_DISABLED)
                return;

        if (del_timer(&t->probe_timer))
                sctp_transport_put(t);
        t->pl.state = SCTP_PL_DISABLED;
}

/* Restart PLPMTUD from the base state.  Does nothing while probing is
 * disabled; otherwise the state returns to SCTP_PL_BASE with pmtu and
 * probe_size at SCTP_BASE_PLPMTU, and the probe timer is restarted.
 */
static inline void sctp_transport_pl_update(struct sctp_transport *t)
{
        if (t->pl.state != SCTP_PL_DISABLED) {
                t->pl.state = SCTP_PL_BASE;
                t->pl.pmtu = SCTP_BASE_PLPMTU;
                t->pl.probe_size = SCTP_BASE_PLPMTU;
                sctp_transport_reset_probe_timer(t);
        }
}

/* True unless the transport's PLPMTUD state is SCTP_PL_DISABLED. */
static inline bool sctp_transport_pl_enabled(struct sctp_transport *t)
{
        if (t->pl.state == SCTP_PL_DISABLED)
                return false;

        return true;
}

/* True when @sk either carries the SOCK_DEAD flag or has a struct socket
 * attached (sk->sk_socket is non-NULL).
 */
static inline bool sctp_newsk_ready(const struct sock *sk)
{
        if (sock_flag(sk, SOCK_DEAD))
                return true;

        return sk->sk_socket != NULL;
}

/* Switch on the nodelay option of an SCTP socket.  The flag is written
 * with the socket lock held.
 */
static inline void sctp_sock_set_nodelay(struct sock *sk)
{
        lock_sock(sk);
        {
                sctp_sk(sk)->nodelay = true;
        }
        release_sock(sk);
}

#endif /* __net_sctp_h__ */



















    1 

























    1 










    1 












    1 
















   20 
   20 










   16 


    1 



    1 







    1 








    1 






    1 
































































































































































































































































    1 





    1 




    1 










































































































   12 










   13 



   13 



   12 

   13 





































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

#include <trace/events/cgroup.h>

/*
 * cgroup_rstat_lock: file-wide spinlock_t; its users are outside the part
 * of the file shown here (NOTE(review): presumably serialises flushers --
 * confirm against the flush paths).
 *
 * cgroup_rstat_cpu_lock: one raw spinlock per CPU.  It is taken by
 * cgroup_rstat_updated() and cgroup_rstat_updated_list() around every
 * change to that CPU's updated_children/updated_next links.
 */
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

/* Forward declaration; the definition comes later in the file. */
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

/* Return @cgrp's per-cpu rstat record for @cpu. */
static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
        struct cgroup_rstat_cpu *rstatc = per_cpu_ptr(cgrp->rstat_cpu, cpu);

        return rstatc;
}

/*
 * Helper functions for the rstat per-CPU lock (cgroup_rstat_cpu_lock).
 *
 * Routing every acquisition and release through these helpers makes it
 * easier to diagnose locking issues and contention in production
 * environments.  The @fast_path parameter determines which set of
 * tracepoints fires, so "flush" related operations can be traced without
 * also handling the high-frequency fast-path "update" events.
 *
 * _cgroup_rstat_cpu_lock() returns the saved interrupt flags, which must
 * be handed back to _cgroup_rstat_cpu_unlock().
 */
static __always_inline
unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
                                     struct cgroup *cgrp, const bool fast_path)
{
        unsigned long flags;
        bool contended = false;

        /*
         * cgroup_rstat_lock is a spinlock_t, which is a sleeping lock on
         * PREEMPT_RT; taking it with the _irq() suffix disables interrupts
         * only on a non-PREEMPT_RT kernel.  The raw_spinlock_t taken here
         * disables interrupts on both configurations, and the _irqsave()
         * variant makes sure they are always disabled and later restored.
         *
         * Try first without spinning so that contention can be reported.
         */
        if (!raw_spin_trylock_irqsave(cpu_lock, flags)) {
                contended = true;

                if (fast_path)
                        trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
                else
                        trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);

                raw_spin_lock_irqsave(cpu_lock, flags);
        }

        if (fast_path)
                trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
        else
                trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);

        return flags;
}

/*
 * Release a per-CPU rstat lock taken with _cgroup_rstat_cpu_lock().
 * The unlock tracepoint selected by @fast_path fires before the lock is
 * dropped; @flags is the value the lock helper returned.
 */
static __always_inline
void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
                              struct cgroup *cgrp, unsigned long flags,
                              const bool fast_path)
{
        if (!fast_path)
                trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
        else
                trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);

        raw_spin_unlock_irqrestore(cpu_lock, flags);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * To be called after @cgrp's rstat_cpu on @cpu was updated.  Links @cgrp,
 * and each ancestor that is not linked yet, onto the matching
 * rstat_cpu->updated_children list of its parent.  See the comment on top
 * of the cgroup_rstat_cpu definition for details.
 */
__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
        struct cgroup_rstat_cpu *rstatc, *prstatc;
        struct cgroup *parent;
        unsigned long flags;

        /*
         * Lockless peek: if @cgrp already looks linked, skip the lock.
         * The check may race and cause temporary inaccuracies, which is
         * acceptable.
         *
         * A parent's updated_children list is terminated with the parent
         * itself rather than NULL, so a non-NULL updated_next means the
         * cgroup is on a list.
         */
        if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
                return;

        flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);

        /* Climb towards the root, linking every level not linked yet. */
        for (;; cgrp = parent) {
                rstatc = cgroup_rstat_cpu(cgrp, cpu);

                /*
                 * Links are added and removed bottom-up, so a linked
                 * cgroup implies all of its ancestors are linked too.
                 */
                if (rstatc->updated_next)
                        break;

                parent = cgroup_parent(cgrp);
                if (!parent) {
                        /* Root: nothing to link to, just mark it busy. */
                        rstatc->updated_next = cgrp;
                        break;
                }

                /* Insert at the head of the parent's updated_children. */
                prstatc = cgroup_rstat_cpu(parent, cpu);
                rstatc->updated_next = prstatc->updated_children;
                prstatc->updated_children = cgrp;
        }

        _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
}

/**
 * cgroup_rstat_push_children - push children cgroups into the given list
 * @head: current head of the list (= subtree root)
 * @child: first child of the root
 * @cpu: target cpu
 * Return: A new singly linked list of cgroups to be flushed
 *
 * Iteratively traverse down the cgroup_rstat_cpu updated tree level by
 * level and push all the parents first before their next level children
 * into a singly linked list built from the tail backward like "pushing"
 * cgroups into a stack. The root is pushed by the caller.
 *
 * Every cgroup pushed here is also unlinked from the updated tree: its
 * updated_next is cleared and, if it had updated children, its
 * updated_children is reset to point back at itself.
 */
static struct cgroup *cgroup_rstat_push_children(struct cgroup *head,
                                                 struct cgroup *child, int cpu)
{
        struct cgroup *chead = child;        /* Head of child cgroup level */
        struct cgroup *ghead = NULL;        /* Head of grandchild cgroup level */
        struct cgroup *parent, *grandchild;
        struct cgroup_rstat_cpu *crstatc;

        /* The first level's work list holds just @child's sibling chain. */
        child->rstat_flush_next = NULL;

next_level:
        /* Each chead entry is the first cgroup of one sibling chain. */
        while (chead) {
                child = chead;
                chead = child->rstat_flush_next;
                parent = cgroup_parent(child);

                /* updated_next is parent cgroup terminated */
                while (child != parent) {
                        /* Push this sibling onto the front of the result. */
                        child->rstat_flush_next = head;
                        head = child;
                        crstatc = cgroup_rstat_cpu(child, cpu);
                        grandchild = crstatc->updated_children;
                        /* Self-pointing updated_children means "no children". */
                        if (grandchild != child) {
                                /* Push the grand child to the next level */
                                crstatc->updated_children = child;
                                grandchild->rstat_flush_next = ghead;
                                ghead = grandchild;
                        }
                        /* Step to the next sibling, unlinking this one. */
                        child = crstatc->updated_next;
                        crstatc->updated_next = NULL;
                }
        }

        /* Current level done; descend if any grandchildren were queued. */
        if (ghead) {
                chead = ghead;
                ghead = NULL;
                goto next_level;
        }
        return head;
}

/**
 * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed
 * @root: root of the cgroup subtree to traverse
 * @cpu: target cpu
 * Return: A singly linked list of cgroups to be flushed
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  During traversal,
 * each returned cgroup is unlinked from the updated tree.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, the child is before its parent in
 * the list.
 *
 * Note that updated_children is self terminated and points to a list of
 * child cgroups if not empty. Whereas updated_next is like a sibling link
 * within the children list and terminated by the parent cgroup. An exception
 * here is the cgroup root whose updated_next can be self terminated.
 *
 * The returned list is chained through rstat_flush_next.  NULL is returned
 * when @root is not on the updated tree for @cpu.  The whole operation runs
 * under @cpu's cgroup_rstat_cpu_lock.
 */
static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
{
        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
        struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
        struct cgroup *head = NULL, *parent, *child;
        unsigned long flags;

        flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);

        /* Return NULL if this subtree is not on-list */
        if (!rstatc->updated_next)
                goto unlock_ret;

        /*
         * Unlink @root from its parent. As the updated_children list is
         * singly linked, we have to walk it to find the removal point.
         */
        parent = cgroup_parent(root);
        if (parent) {
                struct cgroup_rstat_cpu *prstatc;
                struct cgroup **nextp;

                prstatc = cgroup_rstat_cpu(parent, cpu);
                nextp = &prstatc->updated_children;
                while (*nextp != root) {
                        struct cgroup_rstat_cpu *nrstatc;

                        nrstatc = cgroup_rstat_cpu(*nextp, cpu);
                        /* Hitting the terminator means @root was not found. */
                        WARN_ON_ONCE(*nextp == parent);
                        nextp = &nrstatc->updated_next;
                }
                /* Splice @root out by linking its predecessor to its successor. */
                *nextp = rstatc->updated_next;
        }

        /* @root is now off-list for @cpu. */
        rstatc->updated_next = NULL;

        /* Push @root to the list first before pushing the children */
        head = root;
        root->rstat_flush_next = NULL;
        child = rstatc->updated_children;
        /* Reset @root's children list to its empty (self-pointing) state. */
        rstatc->updated_children = root;
        if (child != root)
                head = cgroup_rstat_push_children(head, child, cpu);
unlock_ret:
        _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
        return head;
}

/*
 * A hook for bpf stat collectors to attach to and flush their stats.
 * Together with providing bpf kfuncs for cgroup_rstat_updated() and
 * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
 * collect cgroup stats can integrate with rstat for efficient flushing.
 *
 * A static noinline declaration here could cause the compiler to optimize away
 * the function. A global noinline declaration will keep the definition, but may
 * optimize away the callsite. Therefore, __weak is needed to ensure that the
 * call is still emitted, by telling the compiler that we don't know what the
 * function might eventually be.
 *
 * cgroup_rstat_flush_locked() calls the hook once per flushed cgroup, passing
 * that cgroup as @cgrp, cgroup_parent() of it as @parent and the cpu being
 * flushed as @cpu.  The body is intentionally empty.
 */

__bpf_hook_start();

__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
                                     struct cgroup *parent, int cpu)
{
}

__bpf_hook_end();

/*
 * Wrappers for taking and dropping cgroup_rstat_lock.
 *
 * Routing every acquisition and release through these helpers makes it
 * easier to diagnose locking issues and contention in production
 * environments.  The parameter @cpu_in_loop indicates whether the lock was
 * released and re-taken while collecting data from the CPUs: it is -1 when
 * obtaining the main lock, otherwise it is the number of the CPU processed
 * last.
 */
static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
        __acquires(&cgroup_rstat_lock)
{
        bool contended = false;

        /* Try the lock first; only a failed attempt counts as contention. */
        if (!spin_trylock_irq(&cgroup_rstat_lock)) {
                contended = true;
                trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
                spin_lock_irq(&cgroup_rstat_lock);
        }

        /* Emitted on every acquisition, carrying the contention result. */
        trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
}

/*
 * Counterpart of __cgroup_rstat_lock(): fires the unlock tracepoint while
 * cgroup_rstat_lock is still held, then releases it with spin_unlock_irq().
 * @cpu_in_loop has the same meaning as for __cgroup_rstat_lock().
 */
static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
        __releases(&cgroup_rstat_lock)
{
        /* the third argument (contended) is always reported as false here */
        trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
        spin_unlock_irq(&cgroup_rstat_lock);
}

/*
 * Flush @cgrp's subtree with cgroup_rstat_lock held; see cgroup_rstat_flush().
 *
 * For every possible CPU the list returned by cgroup_rstat_updated_list() is
 * walked and each cgroup on it is flushed: base stats first, then the bpf
 * hook, then every css on the cgroup's rstat_css_list.  The lock may be
 * dropped and re-taken between CPUs, but it is held again on return.
 */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
        __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
        int cpu;

        lockdep_assert_held(&cgroup_rstat_lock);

        for_each_possible_cpu(cpu) {
                struct cgroup *cur = cgroup_rstat_updated_list(cgrp, cpu);

                while (cur) {
                        struct cgroup_subsys_state *css;

                        cgroup_base_stat_flush(cur, cpu);
                        bpf_rstat_flush(cur, cgroup_parent(cur), cpu);

                        rcu_read_lock();
                        list_for_each_entry_rcu(css, &cur->rstat_css_list,
                                                rstat_css_node)
                                css->ss->css_rstat_flush(css, cpu);
                        rcu_read_unlock();

                        cur = cur->rstat_flush_next;
                }

                /* nobody is waiting for the CPU or the lock: keep going */
                if (!need_resched() && !spin_needbreak(&cgroup_rstat_lock))
                        continue;

                /* play nice and yield; pass the cpu just processed to the tracepoints */
                __cgroup_rstat_unlock(cgrp, cpu);
                if (!cond_resched())
                        cpu_relax();
                __cgroup_rstat_lock(cgrp, cpu);
        }
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards.  After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
        /* cgroup_rstat_flush_locked() may drop the lock and reschedule */
        might_sleep();

        /* -1: this is the main acquisition, not a re-take from the cpu loop */
        __cgroup_rstat_lock(cgrp, -1);
        cgroup_rstat_flush_locked(cgrp);
        __cgroup_rstat_unlock(cgrp, -1);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
 * paired with cgroup_rstat_flush_release().
 *
 * Returns with cgroup_rstat_lock held; unlike cgroup_rstat_flush() the lock
 * is not dropped after the flush.
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
        __acquires(&cgroup_rstat_lock)
{
        might_sleep();
        /* -1: this is the main acquisition, not a re-take from the cpu loop */
        __cgroup_rstat_lock(cgrp, -1);
        cgroup_rstat_flush_locked(cgrp);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 * @cgrp: cgroup used by tracepoint
 *
 * Drops cgroup_rstat_lock, which cgroup_rstat_flush_hold() left held.
 */
void cgroup_rstat_flush_release(struct cgroup *cgrp)
        __releases(&cgroup_rstat_lock)
{
        /* @cgrp is only forwarded to the unlock tracepoint */
        __cgroup_rstat_unlock(cgrp, -1);
}

/*
 * Set up the per-cpu rstat state of @cgrp.
 *
 * Allocates ->rstat_cpu unless it is already present, then initialises the
 * entry of every possible CPU.  Returns 0 on success or -ENOMEM when the
 * per-cpu allocation fails.
 */
int cgroup_rstat_init(struct cgroup *cgrp)
{
        struct cgroup_rstat_cpu *rstatc;
        int cpu;

        /* allocate only if missing: the root cgrp has rstat_cpu preallocated */
        if (!cgrp->rstat_cpu) {
                cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
                if (!cgrp->rstat_cpu)
                        return -ENOMEM;
        }

        for_each_possible_cpu(cpu) {
                rstatc = cgroup_rstat_cpu(cgrp, cpu);

                u64_stats_init(&rstatc->bsync);
                /* ->updated_children list is self terminated, i.e. empty here */
                rstatc->updated_children = cgrp;
        }

        return 0;
}

/*
 * Tear down the per-cpu rstat state of @cgrp.
 *
 * @cgrp is flushed first.  If any CPU's entry still looks linked afterwards,
 * a one-time warning is raised and ->rstat_cpu is left allocated; otherwise
 * it is freed and the pointer cleared.
 */
void cgroup_rstat_exit(struct cgroup *cgrp)
{
        int cpu;

        cgroup_rstat_flush(cgrp);

        /* sanity check: nothing may remain linked once the flush is done */
        for_each_possible_cpu(cpu) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

                /* the children list must be self terminated (empty) */
                if (WARN_ON_ONCE(rstatc->updated_children != cgrp))
                        return;
                /* and @cgrp itself must be off its parent's list */
                if (WARN_ON_ONCE(rstatc->updated_next))
                        return;
        }

        free_percpu(cgrp->rstat_cpu);
        cgrp->rstat_cpu = NULL;
}

/*
 * Boot-time setup: initialise the per-cpu cgroup_rstat_cpu_lock of every
 * possible CPU.
 */
void __init cgroup_rstat_boot(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime += src_bstat->cputime.utime;
        dst_bstat->cputime.stime += src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
        dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime -= src_bstat->cputime.utime;
        dst_bstat->cputime.stime -= src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
        dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
        struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_rstat_cpu *prstatc;
        struct cgroup_base_stat delta;
        unsigned seq;

        /* Root-level stats are sourced from system-wide CPU stats */
        if (!parent)
                return;

        /* fetch the current per-cpu values */
        do {
                seq = __u64_stats_fetch_begin(&rstatc->bsync);
                delta = rstatc->bstat;
        } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

        /* propagate per-cpu delta to cgroup and per-cpu global statistics */
        cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
        cgroup_base_stat_add(&cgrp->bstat, &delta);
        cgroup_base_stat_add(&rstatc->last_bstat, &delta);
        cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);

        /* propagate cgroup and per-cpu global delta to parent (unless that's root) */
        if (cgroup_parent(parent)) {
                delta = cgrp->bstat;
                cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
                cgroup_base_stat_add(&parent->bstat, &delta);
                cgroup_base_stat_add(&cgrp->last_bstat, &delta);

                delta = rstatc->subtree_bstat;
                prstatc = cgroup_rstat_cpu(parent, cpu);
                cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
                cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
                cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
        }
}

/*
 * Open an accounting section on @cgrp's per-cpu stats.
 *
 * Obtains the current CPU's cgroup_rstat_cpu with get_cpu_ptr(), begins a
 * u64_stats update on its bsync and stores the saved irq flags in @flags.
 * Each call must be matched by cgroup_base_stat_cputime_account_end().
 */
static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
        struct cgroup_rstat_cpu *rstatc = get_cpu_ptr(cgrp->rstat_cpu);

        *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);

        return rstatc;
}

/*
 * Close the accounting section opened by
 * cgroup_base_stat_cputime_account_begin().
 *
 * @rstatc and @flags are the values produced by the begin call.  The steps
 * undo the begin call in reverse order, with the rstat notification placed
 * before put_cpu_ptr() so that smp_processor_id() is evaluated first.
 */
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
                                                 struct cgroup_rstat_cpu *rstatc,
                                                 unsigned long flags)
{
        u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
        /* report @cgrp as updated on the current cpu; see cgroup_rstat_updated() */
        cgroup_rstat_updated(cgrp, smp_processor_id());
        put_cpu_ptr(rstatc);
}

/*
 * Add @delta_exec to the sum_exec_runtime counter of @cgrp's per-cpu base
 * stats for the current CPU.
 */
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
        struct cgroup_rstat_cpu *rstatc;
        unsigned long flags;

        rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
        /* the counter is modified only between the begin/end pair */
        rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
        cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

/*
 * Add @delta_exec to the per-cpu base-stat counter selected by @index.
 *
 * USER and NICE time go to utime; SYSTEM, IRQ and SOFTIRQ time go to stime;
 * with CONFIG_SCHED_CORE, FORCEIDLE time goes to forceidle_sum.  Any other
 * @index leaves the counters untouched, but the begin/end pair still runs.
 */
void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec)
{
        struct cgroup_rstat_cpu *rstatc;
        unsigned long flags;
        u64 *counter = NULL;

        rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);

        /* pick the counter to bump; NULL means @index is not accounted here */
        switch (index) {
        case CPUTIME_USER:
        case CPUTIME_NICE:
                counter = &rstatc->bstat.cputime.utime;
                break;
        case CPUTIME_SYSTEM:
        case CPUTIME_IRQ:
        case CPUTIME_SOFTIRQ:
                counter = &rstatc->bstat.cputime.stime;
                break;
#ifdef CONFIG_SCHED_CORE
        case CPUTIME_FORCEIDLE:
                counter = &rstatc->bstat.forceidle_sum;
                break;
#endif
        default:
                break;
        }

        if (counter)
                *counter += delta_exec;

        cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

/*
 * Compute the cputime of the root cgroup from the global per-cpu data,
 * grouping the fields the same way __cgroup_account_cputime_field() does
 * for each bit of cpu time attributed to a cgroup.
 *
 * @bstat is zeroed first and then filled with the sums over all possible
 * CPUs.  sum_exec_runtime is user + system time plus CPUTIME_STEAL.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
        struct task_cputime *cputime = &bstat->cputime;
        int i;

        memset(bstat, 0, sizeof(*bstat));
        for_each_possible_cpu(i) {
                struct kernel_cpustat kcpustat;
                u64 *cpustat = kcpustat.cpustat;
                u64 user, sys;

                kcpustat_cpu_fetch(&kcpustat, i);

                user = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE];
                sys = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] +
                      cpustat[CPUTIME_SOFTIRQ];

                cputime->utime += user;
                cputime->stime += sys;
                cputime->sum_exec_runtime += user + sys + cpustat[CPUTIME_STEAL];

#ifdef CONFIG_SCHED_CORE
                bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
        }
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        u64 usage, utime, stime;
        struct cgroup_base_stat bstat;
#ifdef CONFIG_SCHED_CORE
        u64 forceidle_time;
#endif

        if (cgroup_parent(cgrp)) {
                cgroup_rstat_flush_hold(cgrp);
                usage = cgrp->bstat.cputime.sum_exec_runtime;
                cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
                               &utime, &stime);
#ifdef CONFIG_SCHED_CORE
                forceidle_time = cgrp->bstat.forceidle_sum;
#endif
                cgroup_rstat_flush_release(cgrp);
        } else {
                root_cgroup_cputime(&bstat);
                usage = bstat.cputime.sum_exec_runtime;
                utime = bstat.cputime.utime;
                stime = bstat.cputime.stime;
#ifdef CONFIG_SCHED_CORE
                forceidle_time = bstat.forceidle_sum;
#endif
        }

        do_div(usage, NSEC_PER_USEC);
        do_div(utime, NSEC_PER_USEC);
        do_div(stime, NSEC_PER_USEC);
#ifdef CONFIG_SCHED_CORE
        do_div(forceidle_time, NSEC_PER_USEC);
#endif

        seq_printf(seq, "usage_usec %llu\n"
                   "user_usec %llu\n"
                   "system_usec %llu\n",
                   usage, utime, stime);

#ifdef CONFIG_SCHED_CORE
        seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}

/*
 * Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush().
 * cgroup_rstat_flush() is flagged KF_SLEEPABLE; it calls might_sleep().
 */
BTF_KFUNCS_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, cgroup_rstat_updated)
BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
BTF_KFUNCS_END(bpf_rstat_kfunc_ids)

/* The id set above, in the form register_btf_kfunc_id_set() takes. */
static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
        .owner          = THIS_MODULE,
        .set            = &bpf_rstat_kfunc_ids,
};

/*
 * Register the rstat kfunc set for BPF_PROG_TYPE_TRACING programs.  Runs as
 * a late initcall and returns the result of the registration.
 */
static int __init bpf_rstat_kfunc_init(void)
{
        return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
                                         &bpf_rstat_kfunc_set);
}
late_initcall(bpf_rstat_kfunc_init);




















































































































































































































































































































































































































































































































































































































































































































































    1 





















































































































































    1 

    1 














    1 




    1 





    1 



    1 





    1 








    1 



    1 














    1 














































































































































































































    1 




    1 












    1 

    1 

    1 


    1 

























    1 





    1 

    1 






























































































    1 
    1 






    1 







    1 







    1 
    1 



    1 





    1 







































































    1 




    1 
















    1 




































    1 









    1 
























































    1 
    1 











    1 











    1 

    1 



    1 

































































    1 






































    1 

































































































    1 

















    1 
    1 











    1 















    1 

    1 


    1 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions work with the state functions in sctp_sm_statefuns.c
 * to implement the state operations.  These functions implement the
 * steps which require modifying existing data structures.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson          <karl@athena.chicago.il.us>
 *    Jon Grimm             <jgrimm@austin.ibm.com>
 *    Hui Huang                    <hui.huang@nokia.com>
 *    Dajiang Zhang            <dajiang.zhang@nokia.com>
 *    Daisy Chang            <daisyc@us.ibm.com>
 *    Sridhar Samudrala            <sri@us.ibm.com>
 *    Ardelle Fan            <ardelle.fan@intel.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/ip.h>
#include <linux/gfp.h>
#include <net/sock.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <net/sctp/stream_sched.h>

/*
 * Forward declarations of two static functions of this file, so that code
 * placed before their definitions can call them.  Both take the same
 * arguments, except that sctp_side_effects() receives the association by
 * reference (struct sctp_association **).
 */
static int sctp_cmd_interpreter(enum sctp_event_type event_type,
                                union sctp_subtype subtype,
                                enum sctp_state state,
                                struct sctp_endpoint *ep,
                                struct sctp_association *asoc,
                                void *event_arg,
                                enum sctp_disposition status,
                                struct sctp_cmd_seq *commands,
                                gfp_t gfp);
static int sctp_side_effects(enum sctp_event_type event_type,
                             union sctp_subtype subtype,
                             enum sctp_state state,
                             struct sctp_endpoint *ep,
                             struct sctp_association **asoc,
                             void *event_arg,
                             enum sctp_disposition status,
                             struct sctp_cmd_seq *commands,
                             gfp_t gfp);

/********************************************************************
 * Helper functions
 ********************************************************************/

/* A helper function for delayed processing of INET ECN CE bit.
 *
 * Records @lowest_tsn in @asoc->last_ecne_tsn and sets @asoc->need_ecne.
 */
static void sctp_do_ecn_ce_work(struct sctp_association *asoc,
                                __u32 lowest_tsn)
{
        /* Save the TSN away for comparison when we receive CWR */

        asoc->last_ecne_tsn = lowest_tsn;
        /* cleared again by sctp_do_ecn_cwr_work() */
        asoc->need_ecne = 1;
}

/* Helper function for delayed processing of SCTP ECNE chunk.
 *
 * RFC 2960 Appendix A
 *
 * RFC 2481 details a specific bit for a sender to send in
 * the header of its next outbound TCP segment to indicate to
 * its peer that it has reduced its congestion window.  This
 * is termed the CWR bit.  For SCTP the same indication is made
 * by including the CWR chunk.  This chunk contains one data
 * element, i.e. the TSN number that was sent in the ECNE chunk.
 * This element represents the lowest TSN number in the datagram
 * that was originally marked with the CE bit.
 *
 * Returns the CWR chunk built by sctp_make_cwr(), passed through unchanged.
 */
static struct sctp_chunk *sctp_do_ecn_ecne_work(struct sctp_association *asoc,
                                                __u32 lowest_tsn,
                                                struct sctp_chunk *chunk)
{
        /* Our previously transmitted packet ran into some congestion,
         * so we should react by reducing cwnd and ssthresh and then
         * tell our peer that we've done so by sending a CWR.
         *
         * The cwnd variables are only lowered if the ECNE looks more
         * recent than the last response.
         */
        if (TSN_lt(asoc->last_cwr_tsn, lowest_tsn)) {
                /* Find which transport's congestion variables
                 * need to be adjusted.
                 */
                struct sctp_transport *transport =
                        sctp_assoc_lookup_tsn(asoc, lowest_tsn);

                /* Update the congestion variables. */
                if (transport)
                        sctp_transport_lower_cwnd(transport,
                                                  SCTP_LOWER_CWND_ECNE);
                asoc->last_cwr_tsn = lowest_tsn;
        }

        /* Always try to quiet the other end.  In case of lost CWR,
         * resend last_cwr_tsn.
         *
         * If we run out of memory, it will look like a lost CWR.  We'll
         * get back in sync eventually.
         */
        return sctp_make_cwr(asoc, asoc->last_cwr_tsn, chunk);
}

/* Helper function to do delayed processing of ECN CWR chunk.
 *
 * Clears @asoc->need_ecne.  @lowest_tsn is accepted but not used here.
 */
static void sctp_do_ecn_cwr_work(struct sctp_association *asoc,
                                 __u32 lowest_tsn)
{
        /* Turn off ECNE getting auto-prepended to every outgoing
         * packet
         */
        asoc->need_ecne = 0;
}

/* Generate SACK if necessary.  We call this at the end of a packet.  */
static int sctp_gen_sack(struct sctp_association *asoc, int force,
                         struct sctp_cmd_seq *commands)
{
        struct sctp_transport *trans = asoc->peer.last_data_from;
        __u32 ctsn, max_tsn_seen;
        struct sctp_chunk *sack;
        int error = 0;

        if (force ||
            (!trans && (asoc->param_flags & SPP_SACKDELAY_DISABLE)) ||
            (trans && (trans->param_flags & SPP_SACKDELAY_DISABLE)))
                asoc->peer.sack_needed = 1;

        ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
        max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map);

        /* From 12.2 Parameters necessary per association (i.e. the TCB):
         *
         * Ack State : This flag indicates if the next received packet
         *              : is to be responded to with a SACK. ...
         *             : When DATA chunks are out of order, SACK's
         *           : are not delayed (see Section 6).
         *
         * [This is actually not mentioned in Section 6, but we
         * implement it here anyway. --piggy]
         */
        if (max_tsn_seen != ctsn)
                asoc->peer.sack_needed = 1;

        /* From 6.2  Acknowledgement on Reception of DATA Chunks:
         *
         * Section 4.2 of [RFC2581] SHOULD be followed. Specifically,
         * an acknowledgement SHOULD be generated for at least every
         * second packet (not every second DATA chunk) received, and
         * SHOULD be generated within 200 ms of the arrival of any
         * unacknowledged DATA chunk. ...
         */
        if (!asoc->peer.sack_needed) {
                asoc->peer.sack_cnt++;

                /* Set the SACK delay timeout based on the
                 * SACK delay for the last transport
                 * data was received from, or the default
                 * for the association.
                 */
                if (trans) {
                        /* We will need a SACK for the next packet.  */
                        if (asoc->peer.sack_cnt >= trans->sackfreq - 1)
                                asoc->peer.sack_needed = 1;

                        asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
                                trans->sackdelay;
                } else {
                        /* We will need a SACK for the next packet.  */
                        if (asoc->peer.sack_cnt >= asoc->sackfreq - 1)
                                asoc->peer.sack_needed = 1;

                        asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
                                asoc->sackdelay;
                }

                /* Restart the SACK timer. */
                sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
                                SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
        } else {
                __u32 old_a_rwnd = asoc->a_rwnd;

                asoc->a_rwnd = asoc->rwnd;
                sack = sctp_make_sack(asoc);
                if (!sack) {
                        asoc->a_rwnd = old_a_rwnd;
                        goto nomem;
                }

                asoc->peer.sack_needed = 0;
                asoc->peer.sack_cnt = 0;

                sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(sack));

                /* Stop the SACK timer.  */
                sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                                SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
        }

        return error;
nomem:
        error = -ENOMEM;
        return error;
}

/* When the T3-RTX timer expires, it calls this function to create the
 * relevant state machine event.
 */
void sctp_generate_t3_rtx_event(struct timer_list *t)
{
        struct sctp_transport *transport =
                from_timer(transport, t, T3_rtx_timer);
        struct sctp_association *asoc = transport->asoc;
        struct sock *sk = asoc->base.sk;
        struct net *net = sock_net(sk);
        int error;

        /* Check whether a task is in the sock.  */

        bh_lock_sock(sk);
        if (sock_owned_by_user(sk)) {
                pr_debug("%s: sock is busy\n", __func__);

                /* Try again later.  */
                if (!mod_timer(&transport->T3_rtx_timer, jiffies + (HZ/20)))
                        sctp_transport_hold(transport);
                goto out_unlock;
        }

        /* Run through the state machine.  */
        error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
                           SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_T3_RTX),
                           asoc->state,
                           asoc->ep, asoc,
                           transport, GFP_ATOMIC);

        if (error)
                sk->sk_err = -error;

out_unlock:
        bh_unlock_sock(sk);
        sctp_transport_put(transport);
}

/* This is a sa interface for producing timeout events.  It works
 * for timeouts which use the association as their parameter.
 */
static void sctp_generate_timeout_event(struct sctp_association *asoc,
                                        enum sctp_event_timeout timeout_type)
{
        struct sock *sk = asoc->base.sk;
        struct net *net = sock_net(sk);
        int error = 0;

        bh_lock_sock(sk);
        if (sock_owned_by_user(sk)) {
                pr_debug("%s: sock is busy: timer %d\n", __func__,
                         timeout_type);

                /* Try again later.  */
                if (!mod_timer(&asoc->timers[timeout_type], jiffies + (HZ/20)))
                        sctp_association_hold(asoc);
                goto out_unlock;
        }

        /* Is this association really dead and just waiting around for
         * the timer to let go of the reference?
         */
        if (asoc->base.dead)
                goto out_unlock;

        /* Run through the state machine.  */
        error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
                           SCTP_ST_TIMEOUT(timeout_type),
                           asoc->state, asoc->ep, asoc,
                           (void *)timeout_type, GFP_ATOMIC);

        if (error)
                sk->sk_err = -error;

out_unlock:
        bh_unlock_sock(sk);
        sctp_association_put(asoc);
}

static void sctp_generate_t1_cookie_event(struct timer_list *t)
{
        struct sctp_association *asoc =
                from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T1_COOKIE]);

        sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_COOKIE);
}

static void sctp_generate_t1_init_event(struct timer_list *t)
{
        struct sctp_association *asoc =
                from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T1_INIT]);

        sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_INIT);
}

static void sctp_generate_t2_shutdown_event(struct timer_list *t)
{
        struct sctp_association *asoc =
                from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN]);

        sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T2_SHUTDOWN);
}

static void sctp_generate_t4_rto_event(struct timer_list *t)
{
        struct sctp_association *asoc =
                from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T4_RTO]);

        sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T4_RTO);
}

static void sctp_generate_t5_shutdown_guard_event(struct timer_list *t)
{
        struct sctp_association *asoc =
                from_timer(asoc, t,
                           timers[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]);

        sctp_generate_timeout_event(asoc,
                                    SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD);

} /* sctp_generate_t5_shutdown_guard_event() */

static void sctp_generate_autoclose_event(struct timer_list *t)
{
        struct sctp_association *asoc =
                from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]);

        sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_AUTOCLOSE);
}

/* Generate a heart beat event.  If the sock is busy, reschedule.   Make
 * sure that the transport is still valid.
 */
void sctp_generate_heartbeat_event(struct timer_list *t)
{
        struct sctp_transport *transport = from_timer(transport, t, hb_timer);
        struct sctp_association *asoc = transport->asoc;
        struct sock *sk = asoc->base.sk;
        struct net *net = sock_net(sk);
        u32 elapsed, timeout;
        int error = 0;

        bh_lock_sock(sk);
        if (sock_owned_by_user(sk)) {
                pr_debug("%s: sock is busy\n", __func__);

                /* Try again later.  */
                if (!mod_timer(&transport->hb_timer, jiffies + (HZ/20)))
                        sctp_transport_hold(transport);
                goto out_unlock;
        }

        /* Check if we should still send the heartbeat or reschedule */
        elapsed = jiffies - transport->last_time_sent;
        timeout = sctp_transport_timeout(transport);
        if (elapsed < timeout) {
                elapsed = timeout - elapsed;
                if (!mod_timer(&transport->hb_timer, jiffies + elapsed))
                        sctp_transport_hold(transport);
                goto out_unlock;
        }

        error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
                           SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_HEARTBEAT),
                           asoc->state, asoc->ep, asoc,
                           transport, GFP_ATOMIC);

        if (error)
                sk->sk_err = -error;

out_unlock:
        bh_unlock_sock(sk);
        sctp_transport_put(transport);
}

/* ICMP protocol-unreachable timer callback.  Raises the
 * ICMP_PROTO_UNREACH event on a live association, or retries shortly when
 * the socket is busy.  The return value of the state machine is not
 * examined here.  The transport reference is dropped on every path.
 */
void sctp_generate_proto_unreach_event(struct timer_list *t)
{
        struct sctp_transport *tp = from_timer(tp, t, proto_unreach_timer);
        struct sctp_association *asoc = tp->asoc;
        struct sock *sk = asoc->base.sk;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (sock_owned_by_user(sk)) {
                pr_debug("%s: sock is busy\n", __func__);

                /* Retry HZ/20 jiffies from now; take a reference only
                 * when the timer was not already pending.
                 */
                if (!mod_timer(&tp->proto_unreach_timer,
                               jiffies + (HZ/20)))
                        sctp_transport_hold(tp);
        } else if (!asoc->base.dead) {
                /* Nothing to do for an association that is merely
                 * waiting to be destroyed.
                 */
                sctp_do_sm(net, SCTP_EVENT_T_OTHER,
                           SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
                           asoc->state, asoc->ep, asoc, tp, GFP_ATOMIC);
        }

        bh_unlock_sock(sk);
        sctp_transport_put(tp);
}

/* Handle the timeout of the RE-CONFIG timer. */
void sctp_generate_reconf_event(struct timer_list *t)
{
        struct sctp_transport *transport =
                from_timer(transport, t, reconf_timer);
        struct sctp_association *asoc = transport->asoc;
        struct sock *sk = asoc->base.sk;
        struct net *net = sock_net(sk);
        int error = 0;

        bh_lock_sock(sk);
        if (sock_owned_by_user(sk)) {
                pr_debug("%s: sock is busy\n", __func__);

                /* Try again later.  */
                if (!mod_timer(&transport->reconf_timer, jiffies + (HZ / 20)))
                        sctp_transport_hold(transport);
                goto out_unlock;
        }

        /* This happens when the response arrives after the timer is triggered. */
        if (!asoc->strreset_chunk)
                goto out_unlock;

        error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
                           SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_RECONF),
                           asoc->state, asoc->ep, asoc,
                           transport, GFP_ATOMIC);

        if (error)
                sk->sk_err = -error;

out_unlock:
        bh_unlock_sock(sk);
        sctp_transport_put(transport);
}

/* Handle the timeout of the probe timer. */
void sctp_generate_probe_event(struct timer_list *t)
{
        struct sctp_transport *transport = from_timer(transport, t, probe_timer);
        struct sctp_association *asoc = transport->asoc;
        struct sock *sk = asoc->base.sk;
        struct net *net = sock_net(sk);
        int error = 0;

        bh_lock_sock(sk);
        if (sock_owned_by_user(sk)) {
                pr_debug("%s: sock is busy\n", __func__);

                /* Try again later.  */
                if (!mod_timer(&transport->probe_timer, jiffies + (HZ / 20)))
                        sctp_transport_hold(transport);
                goto out_unlock;
        }

        error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
                           SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_PROBE),
                           asoc->state, asoc->ep, asoc,
                           transport, GFP_ATOMIC);

        if (error)
                sk->sk_err = -error;

out_unlock:
        bh_unlock_sock(sk);
        sctp_transport_put(transport);
}

/* Inject a SACK Timeout event into the state machine.  */
static void sctp_generate_sack_event(struct timer_list *t)
{
        struct sctp_association *asoc =
                from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_SACK]);

        sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_SACK);
}

/* Timer callbacks indexed by timeout type.
 *
 * Every non-NULL entry is one of the association-timer wrappers defined
 * in this file, each of which forwards to sctp_generate_timeout_event().
 * The T3_RTX, HEARTBEAT and RECONF slots are NULL: their handlers
 * (sctp_generate_t3_rtx_event() and friends) recover a transport, not an
 * association, from the timer.
 * NOTE(review): presumably those handlers are attached to the
 * per-transport timers elsewhere - confirm against the transport setup
 * code.
 */
sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
        [SCTP_EVENT_TIMEOUT_NONE] =                NULL,
        [SCTP_EVENT_TIMEOUT_T1_COOKIE] =        sctp_generate_t1_cookie_event,
        [SCTP_EVENT_TIMEOUT_T1_INIT] =                sctp_generate_t1_init_event,
        [SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] =        sctp_generate_t2_shutdown_event,
        [SCTP_EVENT_TIMEOUT_T3_RTX] =                NULL,
        [SCTP_EVENT_TIMEOUT_T4_RTO] =                sctp_generate_t4_rto_event,
        [SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] =
                                        sctp_generate_t5_shutdown_guard_event,
        [SCTP_EVENT_TIMEOUT_HEARTBEAT] =        NULL,
        [SCTP_EVENT_TIMEOUT_RECONF] =                NULL,
        [SCTP_EVENT_TIMEOUT_SACK] =                sctp_generate_sack_event,
        [SCTP_EVENT_TIMEOUT_AUTOCLOSE] =        sctp_generate_autoclose_event,
};


/* RFC 2960 8.2 Path Failure Detection
 *
 * When its peer endpoint is multi-homed, an endpoint should keep a
 * error counter for each of the destination transport addresses of the
 * peer endpoint.
 *
 * Each time the T3-rtx timer expires on any address, or when a
 * HEARTBEAT sent to an idle address is not acknowledged within a RTO,
 * the error counter of that destination address will be incremented.
 * When the value in the error counter exceeds the protocol parameter
 * 'Path.Max.Retrans' of that destination address, the endpoint should
 * mark the destination transport address as inactive, and a
 * notification SHOULD be sent to the upper layer.
 *
 * Arguments as used by the code below:
 *   commands  - not referenced in this function body.
 *   asoc      - association whose overall_error_count and primary path
 *               may be updated.
 *   transport - transport whose error_count, state and rto may be updated.
 *   is_hb     - non-zero when the expired timer was a HEARTBEAT timer.
 */
static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands,
                                         struct sctp_association *asoc,
                                         struct sctp_transport *transport,
                                         int is_hb)
{
        /* The check for association's overall error counter exceeding the
         * threshold is done in the state function.
         */
        /* We are here due to a timer expiration.  If the timer was
         * not a HEARTBEAT, then normal error tracking is done.
         * If the timer was a heartbeat, we only increment error counts
         * when we already have an outstanding HEARTBEAT that has not
         * been acknowledged.
         * Additionally, some transport states inhibit error increments:
         * an INACTIVE transport never has its own counter bumped, and an
         * unacknowledged HEARTBEAT on an UNCONFIRMED transport does not
         * count against the association.
         */
        if (!is_hb) {
                asoc->overall_error_count++;
                if (transport->state != SCTP_INACTIVE)
                        transport->error_count++;
         } else if (transport->hb_sent) {
                if (transport->state != SCTP_UNCONFIRMED)
                        asoc->overall_error_count++;
                if (transport->state != SCTP_INACTIVE)
                        transport->error_count++;
        }

        /* If the transport error count is greater than the pf_retrans
         * threshold, and less than pathmaxrtx, and if the current state
         * is SCTP_ACTIVE, then mark this transport as Partially Failed,
         * see SCTP Quick Failover Draft, section 5.1
         */
        if (asoc->base.net->sctp.pf_enable &&
            transport->state == SCTP_ACTIVE &&
            transport->error_count < transport->pathmaxrxt &&
            transport->error_count > transport->pf_retrans) {

                sctp_assoc_control_transport(asoc, transport,
                                             SCTP_TRANSPORT_PF,
                                             0);

                /* Update the hb timer to resend a heartbeat every rto */
                sctp_transport_reset_hb_timer(transport);
        }

        /* Past pathmaxrxt on a transport that is not already INACTIVE:
         * report it as down with the FAILED_THRESHOLD reason.
         */
        if (transport->state != SCTP_INACTIVE &&
            (transport->error_count > transport->pathmaxrxt)) {
                pr_debug("%s: association:%p transport addr:%pISpc failed\n",
                         __func__, asoc, &transport->ipaddr.sa);

                sctp_assoc_control_transport(asoc, transport,
                                             SCTP_TRANSPORT_DOWN,
                                             SCTP_FAILED_THRESHOLD);
        }

        /* When the struck transport is the primary path but is no longer
         * the active path, and its error count exceeds ps_retrans, make
         * the active path the primary.
         */
        if (transport->error_count > transport->ps_retrans &&
            asoc->peer.primary_path == transport &&
            asoc->peer.active_path != transport)
                sctp_assoc_set_primary(asoc, asoc->peer.active_path);

        /* E2) For the destination address for which the timer
         * expires, set RTO <- RTO * 2 ("back off the timer").  The
         * maximum value discussed in rule C7 above (RTO.max) may be
         * used to provide an upper bound to this doubling operation.
         *
         * Special Case:  the first HB doesn't trigger exponential backoff.
         * The first unacknowledged HB triggers it.  We do this with a flag
         * that indicates that we have an outstanding HB.
         */
        if (!is_hb || transport->hb_sent) {
                transport->rto = min((transport->rto * 2), transport->asoc->rto_max);
                /* NOTE(review): sctp_max_rto() is not visible here;
                 * presumably it records the largest RTO observed - confirm.
                 */
                sctp_max_rto(asoc, transport);
        }
}

/* Side effects of a failed INIT: notify the ULP (when the notification
 * can be allocated), move the association to CLOSED, record the error on
 * the outqueue and queue deletion of the TCB.
 */
static void sctp_cmd_init_failed(struct sctp_cmd_seq *commands,
                                 struct sctp_association *asoc,
                                 unsigned int error)
{
        struct sctp_ulpevent *notice =
                sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_CANT_STR_ASSOC,
                                                (__u16)error, 0, 0, NULL,
                                                GFP_ATOMIC);

        /* The notification is best effort; carry on without it when
         * allocation failed.
         */
        if (notice)
                sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
                                SCTP_ULPEVENT(notice));

        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_CLOSED));

        /* SEND_FAILED is sent later, while the association is being
         * cleaned up; remember the error for that.
         */
        asoc->outqueue.error = error;
        sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
}

/* Worker routine to handle SCTP_CMD_ASSOC_FAILED.  */
static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands,
                                  struct sctp_association *asoc,
                                  enum sctp_event_type event_type,
                                  union sctp_subtype subtype,
                                  struct sctp_chunk *chunk,
                                  unsigned int error)
{
        struct sctp_ulpevent *event;
        struct sctp_chunk *abort;

        /* Cancel any partial delivery in progress. */
        asoc->stream.si->abort_pd(&asoc->ulpq, GFP_ATOMIC);

        if (event_type == SCTP_EVENT_T_CHUNK && subtype.chunk == SCTP_CID_ABORT)
                event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST,
                                                (__u16)error, 0, 0, chunk,
                                                GFP_ATOMIC);
        else
                event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST,
                                                (__u16)error, 0, 0, NULL,
                                                GFP_ATOMIC);
        if (event)
                sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
                                SCTP_ULPEVENT(event));

        if (asoc->overall_error_count >= asoc->max_retrans) {
                abort = sctp_make_violation_max_retrans(asoc, chunk);
                if (abort)
                        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                        SCTP_CHUNK(abort));
        }

        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_CLOSED));

        /* SEND_FAILED sent later when cleaning up the association. */
        asoc->outqueue.error = error;
        sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
}

/* Process an init chunk (a real INIT/INIT-ACK or an INIT embedded in a
 * cookie) as a side effect.  In practice this is only used for INIT-ACK
 * processing, since all other cases use "temporary" associations and do
 * their work directly in the state functions.
 *
 * Returns 0 on success, or -ENOMEM when sctp_process_init() reports
 * failure, in which case the caller stops processing.
 */
static int sctp_cmd_process_init(struct sctp_cmd_seq *commands,
                                 struct sctp_association *asoc,
                                 struct sctp_chunk *chunk,
                                 struct sctp_init_chunk *peer_init,
                                 gfp_t gfp)
{
        /* A zero result from sctp_process_init() is treated as an
         * allocation failure.
         */
        return sctp_process_init(asoc, chunk, sctp_source(chunk),
                                 peer_init, gfp) ? 0 : -ENOMEM;
}

/* Helper function to break out starting up of heartbeat timers.  */
static void sctp_cmd_hb_timers_start(struct sctp_cmd_seq *cmds,
                                     struct sctp_association *asoc)
{
        struct sctp_transport *t;

        /* Start a heartbeat timer for each transport on the association.
         * hold a reference on the transport to make sure none of
         * the needed data structures go away.
         */
        list_for_each_entry(t, &asoc->peer.transport_addr_list, transports)
                sctp_transport_reset_hb_timer(t);
}

/* Cancel the heartbeat timer of every peer transport of the association. */
static void sctp_cmd_hb_timers_stop(struct sctp_cmd_seq *cmds,
                                    struct sctp_association *asoc)
{
        struct sctp_transport *peer;

        list_for_each_entry(peer, &asoc->peer.transport_addr_list,
                            transports) {
                /* Drop the transport reference only when del_timer()
                 * reports it actually cancelled a pending timer.
                 */
                if (!del_timer(&peer->hb_timer))
                        continue;

                sctp_transport_put(peer);
        }
}

/* Cancel any pending T3-RTX timer on every peer transport of the
 * association.
 */
static void sctp_cmd_t3_rtx_timers_stop(struct sctp_cmd_seq *cmds,
                                        struct sctp_association *asoc)
{
        struct sctp_transport *peer;

        list_for_each_entry(peer, &asoc->peer.transport_addr_list,
                            transports) {
                /* Drop the transport reference only when del_timer()
                 * reports it actually cancelled a pending timer.
                 */
                if (!del_timer(&peer->T3_rtx_timer))
                        continue;

                sctp_transport_put(peer);
        }
}


/* Helper function to handle the reception of an HEARTBEAT ACK.
 *
 * Clears the error/heartbeat bookkeeping of transport @t, brings the
 * transport back up if it was INACTIVE, UNCONFIRMED or PF, feeds the
 * round-trip time carried in @chunk into the RTO estimator and re-arms
 * the heartbeat timer.  @cmds is not referenced in this function body.
 */
static void sctp_cmd_transport_on(struct sctp_cmd_seq *cmds,
                                  struct sctp_association *asoc,
                                  struct sctp_transport *t,
                                  struct sctp_chunk *chunk)
{
        struct sctp_sender_hb_info *hbinfo;
        int was_unconfirmed = 0;

        /* 8.3 Upon the receipt of the HEARTBEAT ACK, the sender of the
         * HEARTBEAT should clear the error counter of the destination
         * transport address to which the HEARTBEAT was sent.
         */
        t->error_count = 0;

        /*
         * Although RFC4960 specifies that the overall error count must
         * be cleared when a HEARTBEAT ACK is received, we make an
         * exception while in SHUTDOWN PENDING. If the peer keeps its
         * window shut forever, we may never be able to transmit our
         * outstanding data and rely on the retransmission limit be reached
         * to shutdown the association.
         */
        if (t->asoc->state < SCTP_STATE_SHUTDOWN_PENDING)
                t->asoc->overall_error_count = 0;

        /* Clear the hb_sent flag to signal that we had a good
         * acknowledgement.
         */
        t->hb_sent = 0;

        /* Mark the destination transport address as active if it is not so
         * marked.  Note that, despite its name, was_unconfirmed is set for
         * both the INACTIVE and the UNCONFIRMED case.
         */
        if ((t->state == SCTP_INACTIVE) || (t->state == SCTP_UNCONFIRMED)) {
                was_unconfirmed = 1;
                sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
                                             SCTP_HEARTBEAT_SUCCESS);
        }

        /* A partially failed transport is likewise brought back up. */
        if (t->state == SCTP_PF)
                sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
                                             SCTP_HEARTBEAT_SUCCESS);

        /* HB-ACK was received for the proper HB.  Consider this
         * forward progress.
         */
        if (t->dst)
                sctp_transport_dst_confirm(t);

        /* The receiver of the HEARTBEAT ACK should also perform an
         * RTT measurement for that destination transport address
         * using the time value carried in the HEARTBEAT ACK chunk.
         * If the transport's rto_pending variable has been cleared,
         * it was most likely due to a retransmit.  However, we want
         * to re-enable it to properly update the rto.
         */
        if (t->rto_pending == 0)
                t->rto_pending = 1;

        /* The chunk payload is read as the sender's heartbeat info; the
         * RTT sample is the time elapsed since its sent_at value.
         * NOTE(review): presumably sent_at holds jiffies taken when the
         * HEARTBEAT was sent - confirm against the sender side.
         */
        hbinfo = (struct sctp_sender_hb_info *)chunk->skb->data;
        sctp_transport_update_rto(t, (jiffies - hbinfo->sent_at));

        /* Update the heartbeat timer.  */
        sctp_transport_reset_hb_timer(t);

        /* When the only transport of the association has just come back,
         * retransmit on it immediately.
         */
        if (was_unconfirmed && asoc->peer.transport_count == 1)
                sctp_transport_immediate_rtx(t);
}


/* Helper function for the PROCESS_SACK command.
 *
 * Hands the SACK chunk to the outqueue.  When sctp_outq_sack() returns
 * non-zero (no more TSNs awaiting SACK), a NO_PENDING_TSN event is run
 * through the state machine and its result is returned; otherwise the
 * result is 0.
 */
static int sctp_cmd_process_sack(struct sctp_cmd_seq *cmds,
                                 struct sctp_association *asoc,
                                 struct sctp_chunk *chunk)
{
        /* TSNs are still outstanding: nothing further to do. */
        if (!sctp_outq_sack(&asoc->outqueue, chunk))
                return 0;

        return sctp_do_sm(asoc->base.net, SCTP_EVENT_T_OTHER,
                          SCTP_ST_OTHER(SCTP_EVENT_NO_PENDING_TSN),
                          asoc->state, asoc->ep, asoc, NULL,
                          GFP_ATOMIC);
}

/* Pick the transport for a SHUTDOWN chunk and derive the T2-SHUTDOWN
 * timeout from that transport's RTO.  A transport already attached to
 * the chunk is kept; otherwise one is chosen relative to the transport
 * the previous SHUTDOWN was sent to and stored in the chunk.
 */
static void sctp_cmd_setup_t2(struct sctp_cmd_seq *cmds,
                              struct sctp_association *asoc,
                              struct sctp_chunk *chunk)
{
        struct sctp_transport *dest = chunk->transport;

        if (!dest) {
                dest = sctp_assoc_choose_alter_transport(asoc,
                                              asoc->shutdown_last_sent_to);
                chunk->transport = dest;
        }

        asoc->shutdown_last_sent_to = dest;
        asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = dest->rto;
}

/* Helper function to change the state of an association.
 *
 * Stores @state in asoc->state and then performs the follow-up work
 * that depends on the new state: socket state updates for TCP-style
 * sockets, resetting the T1 timeouts, freeing the peer cookie, waking
 * waiters and uncorking the outqueue.  @cmds is not referenced in this
 * function body.
 */
static void sctp_cmd_new_state(struct sctp_cmd_seq *cmds,
                               struct sctp_association *asoc,
                               enum sctp_state state)
{
        struct sock *sk = asoc->base.sk;

        asoc->state = state;

        pr_debug("%s: asoc:%p[%s]\n", __func__, asoc, sctp_state_tbl[state]);

        if (sctp_style(sk, TCP)) {
                /* Change the sk->sk_state of a TCP-style socket that has
                 * successfully completed a connect() call.
                 */
                if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED))
                        inet_sk_set_state(sk, SCTP_SS_ESTABLISHED);

                /* Set the RCV_SHUTDOWN flag when a SHUTDOWN is received. */
                if (sctp_state(asoc, SHUTDOWN_RECEIVED) &&
                    sctp_sstate(sk, ESTABLISHED)) {
                        inet_sk_set_state(sk, SCTP_SS_CLOSING);
                        sk->sk_shutdown |= RCV_SHUTDOWN;
                }
        }

        if (sctp_state(asoc, COOKIE_WAIT)) {
                /* Reset init timeouts since they may have been
                 * increased due to timer expirations.
                 */
                asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
                                                asoc->rto_initial;
                asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
                                                asoc->rto_initial;
        }

        /* The peer's cookie is released once the association is
         * established; the pointer is cleared after it is freed.
         */
        if (sctp_state(asoc, ESTABLISHED)) {
                kfree(asoc->peer.cookie);
                asoc->peer.cookie = NULL;
        }

        if (sctp_state(asoc, ESTABLISHED) ||
            sctp_state(asoc, CLOSED) ||
            sctp_state(asoc, SHUTDOWN_RECEIVED)) {
                /* Wake up any processes waiting in the asoc's wait queue in
                 * sctp_wait_for_connect() or sctp_wait_for_sndbuf().
                 */
                if (waitqueue_active(&asoc->wait))
                        wake_up_interruptible(&asoc->wait);

                /* Wake up any processes waiting in the sk's sleep queue of
                 * a TCP-style or UDP-style peeled-off socket in
                 * sctp_wait_for_accept() or sctp_wait_for_packet().
                 * For a UDP-style socket, the waiters are woken up by the
                 * notifications.
                 */
                if (!sctp_style(sk, UDP))
                        sk->sk_state_change(sk);
        }

        /* Entering SHUTDOWN_PENDING with data still queued: uncork the
         * outqueue.
         */
        if (sctp_state(asoc, SHUTDOWN_PENDING) &&
            !sctp_outq_is_empty(&asoc->outqueue))
                sctp_outq_uncork(&asoc->outqueue, GFP_ATOMIC);
}

/* Helper function to delete an association.
 *
 * The association is kept only when it is a non-temporary association
 * on a TCP-style listening socket that has not been fully shut down, so
 * that accept() can pick it up later.  In every other case it is freed.
 */
static void sctp_cmd_delete_tcb(struct sctp_cmd_seq *cmds,
                                struct sctp_association *asoc)
{
        struct sock *sk = asoc->base.sk;

        if (!sctp_style(sk, TCP) || !sctp_sstate(sk, LISTENING) ||
            asoc->temp || sk->sk_shutdown == SHUTDOWN_MASK)
                sctp_association_free(asoc);
}

/*
 * ADDIP Section 4.1 ASCONF Chunk Procedures
 * A4) Start a T-4 RTO timer, using the RTO value of the selected
 * destination address (the active path is used rather than the primary
 * path, because the primary path may be inactive).
 *
 * The transport chosen here is also stored in the chunk.
 */
static void sctp_cmd_setup_t4(struct sctp_cmd_seq *cmds,
                              struct sctp_association *asoc,
                              struct sctp_chunk *chunk)
{
        struct sctp_transport *dest =
                sctp_assoc_choose_alter_transport(asoc, chunk->transport);

        chunk->transport = dest;
        asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = dest->rto;
}

/* Process an incoming Operation Error Chunk.
 *
 * For each error cause still present in the chunk a remote-error
 * notification is queued to the ULP.  Processing stops silently when a
 * notification cannot be allocated.
 * NOTE(review): the loop relies on chunk->skb->data advancing;
 * presumably sctp_ulpevent_make_remote_error() consumes the cause from
 * the skb - confirm.
 */
static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds,
                                   struct sctp_association *asoc,
                                   struct sctp_chunk *chunk)
{
        while (chunk->chunk_end > chunk->skb->data) {
                struct sctp_errhdr *cause_hdr =
                        (struct sctp_errhdr *)(chunk->skb->data);
                struct sctp_chunkhdr *unknown_hdr;
                struct sctp_ulpevent *notice;

                notice = sctp_ulpevent_make_remote_error(asoc, chunk, 0,
                                                         GFP_ATOMIC);
                if (!notice)
                        return;

                asoc->stream.si->enqueue_event(&asoc->ulpq, notice);

                /* Only "unrecognized chunk type" causes need more work. */
                if (cause_hdr->cause != SCTP_ERROR_UNKNOWN_CHUNK)
                        continue;

                /* The header of the unrecognized chunk follows the
                 * error cause header.
                 */
                unknown_hdr = (struct sctp_chunkhdr *)(cause_hdr + 1);
                if (unknown_hdr->type != SCTP_CID_ASCONF)
                        continue;

                /* ADDIP 4.1 A9) If the peer responds to an ASCONF with
                 * an ERROR chunk reporting that it did not recognize
                 * the ASCONF chunk type, the sender of the ASCONF MUST
                 * NOT send any further ASCONF chunks and MUST stop its
                 * T-4 timer.
                 */
                if (asoc->peer.asconf_capable == 0)
                        continue;

                asoc->peer.asconf_capable = 0;
                sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP,
                                SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
        }
}

/* Helper function to remove the association non-primary peer
 * transports.
 */
static void sctp_cmd_del_non_primary(struct sctp_association *asoc)
{
        struct sctp_transport *t;
        struct list_head *temp;
        struct list_head *pos;

        list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
                t = list_entry(pos, struct sctp_transport, transports);
                if (!sctp_cmp_addr_exact(&t->ipaddr,
                                         &asoc->peer.primary_addr)) {
                        sctp_assoc_rm_peer(asoc, t);
                }
        }
}

/* Helper function to set sk_err on a 1-1 style socket.
 *
 * UDP-style sockets are left untouched; for any other style @error is
 * stored in the sk_err field of the association's socket.
 */
static void sctp_cmd_set_sk_err(struct sctp_association *asoc, int error)
{
        struct sock *sk = asoc->base.sk;

        if (sctp_style(sk, UDP))
                return;

        sk->sk_err = error;
}

/* Helper function to generate an association change event.
 *
 * Builds the event for @state using the stream counts stored in the
 * association's cookie and queues it on the ULP queue.  Nothing is queued
 * when no event could be created.  @commands is not used here.
 */
static void sctp_cmd_assoc_change(struct sctp_cmd_seq *commands,
                                  struct sctp_association *asoc,
                                  u8 state)
{
        struct sctp_ulpevent *ev;

        ev = sctp_ulpevent_make_assoc_change(asoc, 0, state, 0,
                                            asoc->c.sinit_num_ostreams,
                                            asoc->c.sinit_max_instreams,
                                            NULL, GFP_ATOMIC);
        if (!ev)
                return;

        asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
}

/* Helper function to generate an authkey event carrying the
 * SCTP_AUTH_NO_AUTH indication and queue it on the ULP queue.
 * Nothing is queued when no event could be created.  @commands is not
 * used here.
 */
static void sctp_cmd_peer_no_auth(struct sctp_cmd_seq *commands,
                                  struct sctp_association *asoc)
{
        struct sctp_ulpevent *ev;

        ev = sctp_ulpevent_make_authkey(asoc, 0, SCTP_AUTH_NO_AUTH, GFP_ATOMIC);
        if (!ev)
                return;

        asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
}

/* Helper function to generate an adaptation indication event and queue
 * it on the ULP queue.  Nothing is queued when no event could be
 * created.  @commands is not used here.
 */
static void sctp_cmd_adaptation_ind(struct sctp_cmd_seq *commands,
                                    struct sctp_association *asoc)
{
        struct sctp_ulpevent *ev;

        ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC);
        if (!ev)
                return;

        asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
}


/* Bump the association's init error counter and, when needed, back off
 * the timeout stored for @timer.
 *
 * The back-off happens only once the transport last used for init
 * (asoc->init_last_sent_to) has an init_sent_count above init_cycle + 1.
 * In that case the timeout is doubled, capped at max_init_timeo, and the
 * cycle counter advances.  @name is used for the debug message only.
 */
static void sctp_cmd_t1_timer_update(struct sctp_association *asoc,
                                     enum sctp_event_timeout timer,
                                     char *name)
{
        struct sctp_transport *t = asoc->init_last_sent_to;

        asoc->init_err_counter++;

        /* Still within the current cycle: leave the timeout alone. */
        if (t->init_sent_count <= (asoc->init_cycle + 1))
                return;

        asoc->timeouts[timer] *= 2;
        if (asoc->timeouts[timer] > asoc->max_init_timeo)
                asoc->timeouts[timer] = asoc->max_init_timeo;

        asoc->init_cycle++;

        pr_debug("%s: T1[%s] timeout adjustment init_err_counter:%d"
                 " cycle:%d timeout:%ld\n", __func__, name,
                 asoc->init_err_counter, asoc->init_cycle,
                 asoc->timeouts[timer]);
}

/* Send the whole message, chunk by chunk, to the outqueue.
 * This way the whole message is queued up and bundling is
 * encouraged for small fragments.
 *
 * After all fragments have been appended, the message is handed to the
 * outqueue's scheduler through its enqueue hook.
 */
static void sctp_cmd_send_msg(struct sctp_association *asoc,
                              struct sctp_datamsg *msg, gfp_t gfp)
{
        struct sctp_outq *q = &asoc->outqueue;
        struct sctp_chunk *frag;

        list_for_each_entry(frag, &msg->chunks, frag_list) {
                sctp_outq_tail(q, frag, gfp);
        }

        q->sched->enqueue(q, msg);
}


/* These three macros allow us to pull the debugging code out of the
 * main flow of sctp_do_sm() to keep attention focused on the real
 * functionality there.
 *
 * They take no arguments and instead pick up local variables of the
 * function that expands them by name (ep, asoc, event_type, subtype,
 * state, state_fn, debug_fn, status, error), so those names must exist
 * at the point of expansion.
 */

/* Log the endpoint, event type name, subtype name (via debug_fn),
 * association, state name and the name of the selected state function.
 */
#define debug_pre_sfn() \
        pr_debug("%s[pre-fn]: ep:%p, %s, %s, asoc:%p[%s], %s\n", __func__, \
                 ep, sctp_evttype_tbl[event_type], (*debug_fn)(subtype),   \
                 asoc, sctp_state_tbl[state], state_fn->name)

/* Log the association and the name of the returned disposition. */
#define debug_post_sfn() \
        pr_debug("%s[post-fn]: asoc:%p, status:%s\n", __func__, asoc, \
                 sctp_status_tbl[status])

/* Log the error code and the association together with a state name.
 * asoc->state is only read when asoc is non-NULL and looking its id up
 * on the endpoint's socket yields a non-NULL result; otherwise the
 * CLOSED state name is printed.
 */
#define debug_post_sfx() \
        pr_debug("%s[post-sfx]: error:%d, asoc:%p[%s]\n", __func__, error, \
                 asoc, sctp_state_tbl[(asoc && sctp_id2assoc(ep->base.sk, \
                 sctp_assoc2id(asoc))) ? asoc->state : SCTP_STATE_CLOSED])

/*
 * This is the master state machine processing function.
 *
 * If you want to understand all of lksctp, this is a
 * good place to start.
 *
 * The function looks up the state-table entry for (@event_type, @state,
 * @subtype), runs its handler with @ep, @asoc and @event_arg so that it
 * can fill a fresh command sequence, and then passes that sequence and
 * the handler's disposition to sctp_side_effects().
 *
 * @gfp is forwarded to sctp_side_effects() unchanged.
 *
 * Returns the value produced by sctp_side_effects().
 */
int sctp_do_sm(struct net *net, enum sctp_event_type event_type,
               union sctp_subtype subtype, enum sctp_state state,
               struct sctp_endpoint *ep, struct sctp_association *asoc,
               void *event_arg, gfp_t gfp)
{
        typedef const char *(printfn_t)(union sctp_subtype);
        /* Subtype-name printers, indexed by event_type; slot 0 is empty. */
        static printfn_t *table[] = {
                NULL, sctp_cname, sctp_tname, sctp_oname, sctp_pname,
        };
        /* Only referenced from debug_pre_sfn(), hence marked unused. */
        printfn_t *debug_fn  __attribute__ ((unused)) = table[event_type];
        const struct sctp_sm_table_entry *state_fn;
        struct sctp_cmd_seq commands;
        enum sctp_disposition status;
        int error = 0;

        /* Look up the state function, run it, and then process the
         * side effects.  These three steps are the heart of lksctp.
         */
        state_fn = sctp_sm_lookup_event(net, event_type, state, subtype);

        sctp_init_cmd_seq(&commands);

        debug_pre_sfn();
        status = state_fn->fn(net, ep, asoc, subtype, event_arg, &commands);
        debug_post_sfn();

        /* asoc is passed by address: sctp_side_effects() resets it to
         * NULL for the DELETE_TCB and ABORT dispositions, which is what
         * debug_post_sfx() below then sees.
         */
        error = sctp_side_effects(event_type, subtype, state,
                                  ep, &asoc, event_arg, status,
                                  &commands, gfp);
        debug_post_sfx();

        return error;
}

/*****************************************************************
 * This is the master state function side effect processing function.
 *
 * The queued commands are run through sctp_cmd_interpreter() first; a
 * non-zero result from it is returned immediately.  Otherwise the
 * disposition in @status is examined: it may be logged, turned into an
 * error code, or cause the caller's association pointer (*@asoc) to be
 * cleared.
 *
 * Returns 0, the interpreter's error, -ENOMEM for the NOMEM
 * disposition, or a negative value for an unknown disposition.
 *****************************************************************/
static int sctp_side_effects(enum sctp_event_type event_type,
                             union sctp_subtype subtype,
                             enum sctp_state state,
                             struct sctp_endpoint *ep,
                             struct sctp_association **asoc,
                             void *event_arg,
                             enum sctp_disposition status,
                             struct sctp_cmd_seq *commands,
                             gfp_t gfp)
{
        int error;

        /* FIXME - Most of the dispositions left today would be categorized
         * as "exceptional" dispositions.  For those dispositions, it
         * may not be proper to run through any of the commands at all.
         * For example, the command interpreter might be run only with
         * disposition SCTP_DISPOSITION_CONSUME.
         */
        error = sctp_cmd_interpreter(event_type, subtype, state, ep, *asoc,
                                     event_arg, status, commands, gfp);
        if (error)
                return error;

        switch (status) {
        case SCTP_DISPOSITION_CONSUME:
                /*
                 * Little is left to do here: the real work has already
                 * been carried out as explicit commands above.
                 */
                break;

        case SCTP_DISPOSITION_DISCARD:
                pr_debug("%s: ignored sctp protocol event - state:%d, "
                         "event_type:%d, event_id:%d\n", __func__, state,
                         event_type, subtype.chunk);
                break;

        case SCTP_DISPOSITION_NOMEM:
                /* We ran out of memory, so this packet has to be
                 * discarded.
                 */
                /* BUG--we should now recover some memory, probably by
                 * reneging...
                 */
                error = -ENOMEM;
                break;

        case SCTP_DISPOSITION_DELETE_TCB:
        case SCTP_DISPOSITION_ABORT:
                /* This should now be a command. */
                *asoc = NULL;
                break;

        case SCTP_DISPOSITION_VIOLATION:
                net_err_ratelimited("protocol violation state %d chunkid %d\n",
                                    state, subtype.chunk);
                break;

        case SCTP_DISPOSITION_NOT_IMPL:
                pr_warn("unimplemented feature in state %d, event_type %d, event_id %d\n",
                        state, event_type, subtype.chunk);
                break;

        case SCTP_DISPOSITION_BUG:
                pr_err("bug in state %d, event_type %d, event_id %d\n",
                       state, event_type, subtype.chunk);
                BUG();
                break;

        default:
                pr_err("impossible disposition %d in state %d, event_type %d, event_id %d\n",
                       status, state, event_type, subtype.chunk);
                /* Report the raw disposition if it is negative once
                 * stored in an int, otherwise fall back to -EINVAL.
                 */
                error = status;
                if (error >= 0)
                        error = -EINVAL;
                WARN_ON_ONCE(1);
                break;
        }

        return error;
}

/********************************************************************
 * 2nd Level Abstractions
 ********************************************************************/

/* This is the side-effect interpreter.
 *
 * Drains the command sequence @commands with sctp_next_cmd() and carries out
 * each command's verb in order.  Several verbs append follow-up commands to
 * the same sequence with sctp_add_cmd_sf() (for example an SCTP_CMD_REPLY
 * once a chunk has been built); presumably those are returned by later
 * sctp_next_cmd() calls in this same loop.
 *
 * Parameters:
 *   event_type - kind of event being handled; @event_arg is treated as a
 *                chunk unless this is SCTP_EVENT_T_TIMEOUT.
 *   subtype    - only forwarded to sctp_cmd_assoc_failed().
 *   state      - not referenced in this function body.
 *   ep         - endpoint; supplies the socket (ep->base.sk) and is the
 *                endpoint new associations are added to.
 *   asoc       - association the commands act on.  SCTP_CMD_NEW_ASOC and
 *                SCTP_CMD_SET_ASOC replace it, SCTP_CMD_DELETE_TCB sets it
 *                to NULL.
 *   event_arg  - event payload (see event_type).
 *   status     - not referenced in this function body.
 *   commands   - the command sequence to execute.
 *   gfp        - allocation flags handed to the helpers that take them
 *                (note that several call sites pass GFP_ATOMIC instead).
 *
 * Returns 0, or the first nonzero error recorded by a command.  On error the
 * rest of the sequence is drained without being executed.
 */
static int sctp_cmd_interpreter(enum sctp_event_type event_type,
                                union sctp_subtype subtype,
                                enum sctp_state state,
                                struct sctp_endpoint *ep,
                                struct sctp_association *asoc,
                                void *event_arg,
                                enum sctp_disposition status,
                                struct sctp_cmd_seq *commands,
                                gfp_t gfp)
{
        struct sctp_sock *sp = sctp_sk(ep->base.sk);
        struct sctp_chunk *chunk = NULL, *new_obj;
        struct sctp_packet *packet;
        struct sctp_sackhdr sackh;
        struct timer_list *timer;
        struct sctp_transport *t;
        unsigned long timeout;
        struct sctp_cmd *cmd;
        /* Nonzero while this function itself has corked the out queue. */
        int local_cork = 0;
        int error = 0;
        int force;

        /* For every event type except a timeout, the argument is a chunk. */
        if (SCTP_EVENT_T_TIMEOUT != event_type)
                chunk = event_arg;

        /* Note:  This whole file is a huge candidate for rework.
         * For example, each command could either have its own handler, so
         * the loop would look like:
         *     while (cmds)
         *         cmd->handle(x, y, z)
         * --jgrimm
         */
        while (NULL != (cmd = sctp_next_cmd(commands))) {
                switch (cmd->verb) {
                case SCTP_CMD_NOP:
                        /* Do nothing. */
                        break;

                case SCTP_CMD_NEW_ASOC:
                        /* Register a new association.  Flush anything we
                         * corked on the old association before switching.
                         */
                        if (local_cork) {
                                sctp_outq_uncork(&asoc->outqueue, gfp);
                                local_cork = 0;
                        }

                        /* Register with the endpoint.  */
                        asoc = cmd->obj.asoc;
                        BUG_ON(asoc->peer.primary_path == NULL);
                        sctp_endpoint_add_asoc(ep, asoc);
                        break;

                case SCTP_CMD_PURGE_OUTQUEUE:
                       sctp_outq_teardown(&asoc->outqueue);
                       break;

                case SCTP_CMD_DELETE_TCB:
                        if (local_cork) {
                                sctp_outq_uncork(&asoc->outqueue, gfp);
                                local_cork = 0;
                        }
                        /* Delete the current association.  */
                        sctp_cmd_delete_tcb(commands, asoc);
                        asoc = NULL;
                        break;

                case SCTP_CMD_NEW_STATE:
                        /* Enter a new state.  */
                        sctp_cmd_new_state(commands, asoc, cmd->obj.state);
                        break;

                case SCTP_CMD_REPORT_TSN:
                        /* Record the arrival of a TSN.  */
                        error = sctp_tsnmap_mark(&asoc->peer.tsn_map,
                                                 cmd->obj.u32, NULL);
                        break;

                case SCTP_CMD_REPORT_FWDTSN:
                        asoc->stream.si->report_ftsn(&asoc->ulpq, cmd->obj.u32);
                        break;

                case SCTP_CMD_PROCESS_FWDTSN:
                        asoc->stream.si->handle_ftsn(&asoc->ulpq,
                                                     cmd->obj.chunk);
                        break;

                case SCTP_CMD_GEN_SACK:
                        /* Generate a Selective ACK.
                         * The argument tells us whether to just count
                         * the packet and MAYBE generate a SACK, or
                         * force a SACK out.
                         */
                        force = cmd->obj.i32;
                        error = sctp_gen_sack(asoc, force, commands);
                        break;

                case SCTP_CMD_PROCESS_SACK:
                        /* Process an inbound SACK.  */
                        error = sctp_cmd_process_sack(commands, asoc,
                                                      cmd->obj.chunk);
                        break;

                case SCTP_CMD_GEN_INIT_ACK:
                        /* Generate an INIT ACK chunk.  */
                        new_obj = sctp_make_init_ack(asoc, chunk, GFP_ATOMIC,
                                                     0);
                        if (!new_obj) {
                                error = -ENOMEM;
                                break;
                        }

                        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                        SCTP_CHUNK(new_obj));
                        break;

                case SCTP_CMD_PEER_INIT:
                        /* Process a unified INIT from the peer.
                         * Note: Only used during INIT-ACK processing.  If
                         * there is an error just return to the outer
                         * layer which will bail.
                         */
                        error = sctp_cmd_process_init(commands, asoc, chunk,
                                                      cmd->obj.init, gfp);
                        break;

                case SCTP_CMD_GEN_COOKIE_ECHO:
                        /* Generate a COOKIE ECHO chunk.  */
                        new_obj = sctp_make_cookie_echo(asoc, chunk);
                        if (!new_obj) {
                                /* The optional ERROR chunk passed with this
                                 * command will not be queued, so free it
                                 * here.
                                 */
                                if (cmd->obj.chunk)
                                        sctp_chunk_free(cmd->obj.chunk);
                                error = -ENOMEM;
                                break;
                        }
                        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                        SCTP_CHUNK(new_obj));

                        /* If there is an ERROR chunk to be sent along with
                         * the COOKIE_ECHO, send it, too.
                         */
                        if (cmd->obj.chunk)
                                sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                                SCTP_CHUNK(cmd->obj.chunk));

                        if (new_obj->transport) {
                                new_obj->transport->init_sent_count++;
                                asoc->init_last_sent_to = new_obj->transport;
                        }

                        /* FIXME - Eventually come up with a cleaner way to
                         * enabling COOKIE-ECHO + DATA bundling during
                         * multihoming stale cookie scenarios, the following
                         * command plays with asoc->peer.retran_path to
                         * avoid the problem of sending the COOKIE-ECHO and
                         * DATA in different paths, which could result
                         * in the association being ABORTed if the DATA chunk
                         * is processed first by the server.  Checking the
                         * init error counter simply causes this command
                         * to be executed only during failed attempts of
                         * association establishment.
                         */
                        if ((asoc->peer.retran_path !=
                             asoc->peer.primary_path) &&
                            (asoc->init_err_counter > 0)) {
                                sctp_add_cmd_sf(commands,
                                                SCTP_CMD_FORCE_PRIM_RETRAN,
                                                SCTP_NULL());
                        }

                        break;

                case SCTP_CMD_GEN_SHUTDOWN:
                        /* Generate SHUTDOWN when in SHUTDOWN_SENT state.
                         * Reset error counts.
                         */
                        asoc->overall_error_count = 0;

                        /* Generate a SHUTDOWN chunk.  */
                        new_obj = sctp_make_shutdown(asoc, chunk);
                        if (!new_obj) {
                                error = -ENOMEM;
                                break;
                        }
                        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                        SCTP_CHUNK(new_obj));
                        break;

                case SCTP_CMD_CHUNK_ULP:
                        /* Send a chunk to the sockets layer.  */
                        pr_debug("%s: sm_sideff: chunk_up:%p, ulpq:%p\n",
                                 __func__, cmd->obj.chunk, &asoc->ulpq);

                        asoc->stream.si->ulpevent_data(&asoc->ulpq,
                                                       cmd->obj.chunk,
                                                       GFP_ATOMIC);
                        break;

                case SCTP_CMD_EVENT_ULP:
                        /* Send a notification to the sockets layer.  */
                        pr_debug("%s: sm_sideff: event_up:%p, ulpq:%p\n",
                                 __func__, cmd->obj.ulpevent, &asoc->ulpq);

                        asoc->stream.si->enqueue_event(&asoc->ulpq,
                                                       cmd->obj.ulpevent);
                        break;

                case SCTP_CMD_REPLY:
                        /* If a caller has not already corked, do cork. */
                        if (!asoc->outqueue.cork) {
                                sctp_outq_cork(&asoc->outqueue);
                                local_cork = 1;
                        }
                        /* Send a chunk to our peer.  */
                        sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, gfp);
                        break;

                case SCTP_CMD_SEND_PKT:
                        /* Send a full packet to our peer.  */
                        packet = cmd->obj.packet;
                        sctp_packet_transmit(packet, gfp);
                        sctp_ootb_pkt_free(packet);
                        break;

                case SCTP_CMD_T1_RETRAN:
                        /* Mark a transport for retransmission.  */
                        sctp_retransmit(&asoc->outqueue, cmd->obj.transport,
                                        SCTP_RTXR_T1_RTX);
                        break;

                case SCTP_CMD_RETRAN:
                        /* Mark a transport for retransmission.  */
                        sctp_retransmit(&asoc->outqueue, cmd->obj.transport,
                                        SCTP_RTXR_T3_RTX);
                        break;

                case SCTP_CMD_ECN_CE:
                        /* Do delayed CE processing.   */
                        sctp_do_ecn_ce_work(asoc, cmd->obj.u32);
                        break;

                case SCTP_CMD_ECN_ECNE:
                        /* Do delayed ECNE processing. */
                        new_obj = sctp_do_ecn_ecne_work(asoc, cmd->obj.u32,
                                                        chunk);
                        if (new_obj)
                                sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                                SCTP_CHUNK(new_obj));
                        break;

                case SCTP_CMD_ECN_CWR:
                        /* Do delayed CWR processing.  */
                        sctp_do_ecn_cwr_work(asoc, cmd->obj.u32);
                        break;

                case SCTP_CMD_SETUP_T2:
                        sctp_cmd_setup_t2(commands, asoc, cmd->obj.chunk);
                        break;

                case SCTP_CMD_TIMER_START_ONCE:
                        timer = &asoc->timers[cmd->obj.to];

                        /* Leave an already-pending timer alone; otherwise
                         * start it exactly as SCTP_CMD_TIMER_START would.
                         */
                        if (timer_pending(timer))
                                break;
                        fallthrough;

                case SCTP_CMD_TIMER_START:
                        timer = &asoc->timers[cmd->obj.to];
                        timeout = asoc->timeouts[cmd->obj.to];
                        BUG_ON(!timeout);

                        /*
                         * SCTP has a hard time with timer starts.  Because we process
                         * timer starts as side effects, it can be hard to tell if we
                         * have already started a timer or not, which leads to BUG
                         * halts when we call add_timer. So here, instead of just
                         * starting a timer, if the timer is already started, we mod
                         * the timer with the shorter of the two expiration times.
                         * A reference on the association is taken only when the
                         * timer was not already pending.
                         */
                        if (!timer_pending(timer))
                                sctp_association_hold(asoc);
                        timer_reduce(timer, jiffies + timeout);
                        break;

                case SCTP_CMD_TIMER_RESTART:
                        timer = &asoc->timers[cmd->obj.to];
                        timeout = asoc->timeouts[cmd->obj.to];
                        /* Take a reference only when mod_timer() returns 0
                         * (per the mod_timer() contract, the timer was not
                         * pending before this call).
                         */
                        if (!mod_timer(timer, jiffies + timeout))
                                sctp_association_hold(asoc);
                        break;

                case SCTP_CMD_TIMER_STOP:
                        timer = &asoc->timers[cmd->obj.to];
                        /* Drop the association reference only when
                         * del_timer() reports a nonzero result.
                         */
                        if (del_timer(timer))
                                sctp_association_put(asoc);
                        break;

                case SCTP_CMD_INIT_CHOOSE_TRANSPORT:
                        /* Note: this overwrites the function-wide 'chunk'
                         * with the command's chunk for the rest of the loop.
                         */
                        chunk = cmd->obj.chunk;
                        t = sctp_assoc_choose_alter_transport(asoc,
                                                asoc->init_last_sent_to);
                        asoc->init_last_sent_to = t;
                        chunk->transport = t;
                        t->init_sent_count++;
                        /* Set the new transport as primary */
                        sctp_assoc_set_primary(asoc, t);
                        break;

                case SCTP_CMD_INIT_RESTART:
                        /* Do the needed accounting and updates
                         * associated with restarting an initialization
                         * timer. Only multiply the timeout by two if
                         * all transports have been tried at the current
                         * timeout.
                         */
                        sctp_cmd_t1_timer_update(asoc,
                                                SCTP_EVENT_TIMEOUT_T1_INIT,
                                                "INIT");

                        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
                                        SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
                        break;

                case SCTP_CMD_COOKIEECHO_RESTART:
                        /* Do the needed accounting and updates
                         * associated with restarting an initialization
                         * timer. Only multiply the timeout by two if
                         * all transports have been tried at the current
                         * timeout.
                         */
                        sctp_cmd_t1_timer_update(asoc,
                                                SCTP_EVENT_TIMEOUT_T1_COOKIE,
                                                "COOKIE");

                        /* If we've sent any data bundled with
                         * COOKIE-ECHO we need to resend.
                         */
                        list_for_each_entry(t, &asoc->peer.transport_addr_list,
                                        transports) {
                                sctp_retransmit_mark(&asoc->outqueue, t,
                                            SCTP_RTXR_T1_RTX);
                        }

                        sctp_add_cmd_sf(commands,
                                        SCTP_CMD_TIMER_RESTART,
                                        SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
                        break;

                case SCTP_CMD_INIT_FAILED:
                        sctp_cmd_init_failed(commands, asoc, cmd->obj.u16);
                        break;

                case SCTP_CMD_ASSOC_FAILED:
                        sctp_cmd_assoc_failed(commands, asoc, event_type,
                                              subtype, chunk, cmd->obj.u16);
                        break;

                case SCTP_CMD_INIT_COUNTER_INC:
                        asoc->init_err_counter++;
                        break;

                case SCTP_CMD_INIT_COUNTER_RESET:
                        /* Reset the association-wide init counters and the
                         * per-transport sent counts.
                         */
                        asoc->init_err_counter = 0;
                        asoc->init_cycle = 0;
                        list_for_each_entry(t, &asoc->peer.transport_addr_list,
                                            transports) {
                                t->init_sent_count = 0;
                        }
                        break;

                case SCTP_CMD_REPORT_DUP:
                        sctp_tsnmap_mark_dup(&asoc->peer.tsn_map,
                                             cmd->obj.u32);
                        break;

                case SCTP_CMD_REPORT_BAD_TAG:
                        pr_debug("%s: vtag mismatch!\n", __func__);
                        break;

                case SCTP_CMD_STRIKE:
                        /* Mark one strike against a transport.  */
                        sctp_do_8_2_transport_strike(commands, asoc,
                                                    cmd->obj.transport, 0);
                        break;

                case SCTP_CMD_TRANSPORT_IDLE:
                        t = cmd->obj.transport;
                        sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE);
                        break;

                case SCTP_CMD_TRANSPORT_HB_SENT:
                        t = cmd->obj.transport;
                        sctp_do_8_2_transport_strike(commands, asoc,
                                                     t, 1);
                        t->hb_sent = 1;
                        break;

                case SCTP_CMD_TRANSPORT_ON:
                        t = cmd->obj.transport;
                        sctp_cmd_transport_on(commands, asoc, t, chunk);
                        break;

                case SCTP_CMD_HB_TIMERS_START:
                        sctp_cmd_hb_timers_start(commands, asoc);
                        break;

                case SCTP_CMD_HB_TIMER_UPDATE:
                        t = cmd->obj.transport;
                        sctp_transport_reset_hb_timer(t);
                        break;

                case SCTP_CMD_HB_TIMERS_STOP:
                        sctp_cmd_hb_timers_stop(commands, asoc);
                        break;

                case SCTP_CMD_PROBE_TIMER_UPDATE:
                        t = cmd->obj.transport;
                        sctp_transport_reset_probe_timer(t);
                        break;

                case SCTP_CMD_REPORT_ERROR:
                        /* Surface the supplied error; the check after the
                         * switch then stops command processing.
                         */
                        error = cmd->obj.error;
                        break;

                case SCTP_CMD_PROCESS_CTSN:
                        /* Dummy up a SACK for processing.  The header lives
                         * in the local 'sackh' and is attached to the
                         * current chunk before PROCESS_SACK is queued.
                         */
                        sackh.cum_tsn_ack = cmd->obj.be32;
                        sackh.a_rwnd = htonl(asoc->peer.rwnd +
                                             asoc->outqueue.outstanding_bytes);
                        sackh.num_gap_ack_blocks = 0;
                        sackh.num_dup_tsns = 0;
                        chunk->subh.sack_hdr = &sackh;
                        sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK,
                                        SCTP_CHUNK(chunk));
                        break;

                case SCTP_CMD_DISCARD_PACKET:
                        /* We need to discard the whole packet.
                         * Uncork the queue since there might be
                         * responses pending
                         */
                        chunk->pdiscard = 1;
                        if (asoc) {
                                sctp_outq_uncork(&asoc->outqueue, gfp);
                                local_cork = 0;
                        }
                        break;

                case SCTP_CMD_RTO_PENDING:
                        t = cmd->obj.transport;
                        t->rto_pending = 1;
                        break;

                case SCTP_CMD_PART_DELIVER:
                        asoc->stream.si->start_pd(&asoc->ulpq, GFP_ATOMIC);
                        break;

                case SCTP_CMD_RENEGE:
                        asoc->stream.si->renege_events(&asoc->ulpq,
                                                       cmd->obj.chunk,
                                                       GFP_ATOMIC);
                        break;

                case SCTP_CMD_SETUP_T4:
                        sctp_cmd_setup_t4(commands, asoc, cmd->obj.chunk);
                        break;

                case SCTP_CMD_PROCESS_OPERR:
                        sctp_cmd_process_operr(commands, asoc, chunk);
                        break;
                case SCTP_CMD_CLEAR_INIT_TAG:
                        asoc->peer.i.init_tag = 0;
                        break;
                case SCTP_CMD_DEL_NON_PRIMARY:
                        sctp_cmd_del_non_primary(asoc);
                        break;
                case SCTP_CMD_T3_RTX_TIMERS_STOP:
                        sctp_cmd_t3_rtx_timers_stop(commands, asoc);
                        break;
                case SCTP_CMD_FORCE_PRIM_RETRAN:
                        /* Point retran_path at the primary path only for
                         * the duration of the uncork, then restore it.
                         */
                        t = asoc->peer.retran_path;
                        asoc->peer.retran_path = asoc->peer.primary_path;
                        sctp_outq_uncork(&asoc->outqueue, gfp);
                        local_cork = 0;
                        asoc->peer.retran_path = t;
                        break;
                case SCTP_CMD_SET_SK_ERR:
                        sctp_cmd_set_sk_err(asoc, cmd->obj.error);
                        break;
                case SCTP_CMD_ASSOC_CHANGE:
                        sctp_cmd_assoc_change(commands, asoc,
                                              cmd->obj.u8);
                        break;
                case SCTP_CMD_ADAPTATION_IND:
                        sctp_cmd_adaptation_ind(commands, asoc);
                        break;
                case SCTP_CMD_PEER_NO_AUTH:
                        sctp_cmd_peer_no_auth(commands, asoc);
                        break;

                case SCTP_CMD_ASSOC_SHKEY:
                        error = sctp_auth_asoc_init_active_key(asoc,
                                                GFP_ATOMIC);
                        break;
                case SCTP_CMD_UPDATE_INITTAG:
                        asoc->peer.i.init_tag = cmd->obj.u32;
                        break;
                case SCTP_CMD_SEND_MSG:
                        /* Cork first, unless the queue is already corked. */
                        if (!asoc->outqueue.cork) {
                                sctp_outq_cork(&asoc->outqueue);
                                local_cork = 1;
                        }
                        sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp);
                        break;
                case SCTP_CMD_PURGE_ASCONF_QUEUE:
                        sctp_asconf_queue_teardown(asoc);
                        break;

                case SCTP_CMD_SET_ASOC:
                        /* Flush our own cork on the old association (if
                         * any) before switching to the supplied one.
                         */
                        if (asoc && local_cork) {
                                sctp_outq_uncork(&asoc->outqueue, gfp);
                                local_cork = 0;
                        }
                        asoc = cmd->obj.asoc;
                        break;

                default:
                        pr_warn("Impossible command: %u\n",
                                cmd->verb);
                        break;
                }

                /* Stop at the first failure.  Drain the rest of the
                 * sequence without executing it, freeing the chunk carried
                 * by each remaining SCTP_CMD_REPLY (presumably so those
                 * chunks are not leaked).
                 */
                if (error) {
                        cmd = sctp_next_cmd(commands);
                        while (cmd) {
                                if (cmd->verb == SCTP_CMD_REPLY)
                                        sctp_chunk_free(cmd->obj.chunk);
                                cmd = sctp_next_cmd(commands);
                        }
                        break;
                }
        }

        /* If this is in response to a received chunk, wait until
         * we are done with the packet to open the queue so that we don't
         * send multiple packets in response to a single request.
         * Otherwise, only undo a cork that this function applied itself.
         */
        if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) {
                if (chunk->end_of_packet || chunk->singleton)
                        sctp_outq_uncork(&asoc->outqueue, gfp);
        } else if (local_cork)
                sctp_outq_uncork(&asoc->outqueue, gfp);

        /* Clear the socket's data_ready_signalled flag before returning. */
        if (sp->data_ready_signalled)
                sp->data_ready_signalled = 0;

        return error;
}



















































































































































    4 




    4 






































   21 





















   14 




    4 

















    4 




   19 
















   19 











    3 



   14 



























    6 




   13 

















   13 









    6 















    6 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Variant of atomic_t specialized for reference counts.
 *
 * The interface matches the atomic_t interface (to aid in porting) but only
 * provides the few functions one should use for reference counting.
 *
 * Saturation semantics
 * ====================
 *
 * refcount_t differs from atomic_t in that the counter saturates at
 * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the
 * counter and causing 'spurious' use-after-free issues. In order to avoid the
 * cost associated with introducing cmpxchg() loops into all of the saturating
 * operations, we temporarily allow the counter to take on an unchecked value
 * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow
 * or overflow has occurred. Although this is racy when multiple threads
 * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly
 * equidistant from 0 and INT_MAX we minimise the scope for error:
 *
 *                                    INT_MAX     REFCOUNT_SATURATED   UINT_MAX
 *   0                          (0x7fff_ffff)    (0xc000_0000)    (0xffff_ffff)
 *   +--------------------------------+----------------+----------------+
 *                                     <---------- bad value! ---------->
 *
 * (in a signed view of the world, the "bad value" range corresponds to
 * a negative counter value).
 *
 * As an example, consider a refcount_inc() operation that causes the counter
 * to overflow:
 *
 *        int old = atomic_fetch_add_relaxed(1, &r->refs);
 *        // old is INT_MAX, refcount now INT_MIN (0x8000_0000)
 *        if (old < 0 || old + 1 < 0)
 *                atomic_set(&r->refs, REFCOUNT_SATURATED);
 *
 * If another thread also performs a refcount_inc() operation between the two
 * atomic operations, then the count will continue to edge closer to 0. If it
 * reaches a value of 1 before /any/ of the threads reset it to the saturated
 * value, then a concurrent refcount_dec_and_test() may erroneously free the
 * underlying object.
 * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently
 * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK).
 * With the current PID limit, if no batched refcounting operations are used and
 * the attacker can't repeatedly trigger kernel oopses in the middle of refcount
 * operations, this makes it impossible for a saturated refcount to leave the
 * saturation range, even if it is possible for multiple uses of the same
 * refcount to nest in the context of a single task:
 *
 *     (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT =
 *     0x40000000 / 0x400000 = 0x100 = 256
 *
 * If hundreds of references are added/removed with a single refcounting
 * operation, it may potentially be possible to leave the saturation range; but
 * given the precise timing details involved with the round-robin scheduling of
 * each thread manipulating the refcount and the need to hit the race multiple
 * times in succession, there doesn't appear to be a practical avenue of attack
 * even if using refcount_add() operations with larger increments.
 *
 * Memory ordering
 * ===============
 *
 * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
 * and provide only what is strictly required for refcounts.
 *
 * The increments are fully relaxed; these will not provide ordering. The
 * rationale is that whatever is used to obtain the object we're increasing the
 * reference count on will provide the ordering. For locked data structures,
 * it's the lock acquire; for RCU/lockless data structures it's the dependent
 * load.
 *
 * Do note that inc_not_zero() provides a control dependency which will order
 * future stores against the inc, this ensures we'll never modify the object
 * if we did not in fact acquire a reference.
 *
 * The decrements will provide release order, such that all the prior loads and
 * stores will be issued before, it also provides a control dependency, which
 * will order us against the subsequent free().
 *
 * The control dependency is against the load of the cmpxchg (ll/sc) that
 * succeeded. This means the stores aren't fully ordered, but this is fine
 * because the 1->0 transition indicates no concurrency.
 *
 * Note that the allocator is responsible for ordering things between free()
 * and alloc().
 *
 * The decrements dec_and_test() and sub_and_test() also provide acquire
 * ordering on success.
 *
 */

#ifndef _LINUX_REFCOUNT_H
#define _LINUX_REFCOUNT_H

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/limits.h>
#include <linux/refcount_types.h>
#include <linux/spinlock_types.h>

struct mutex;

#define REFCOUNT_INIT(n)        { .refs = ATOMIC_INIT(n), }
#define REFCOUNT_MAX                INT_MAX
#define REFCOUNT_SATURATED        (INT_MIN / 2)

/*
 * Reason codes handed to refcount_warn_saturate() to say which operation
 * noticed a bad counter value.
 */
enum refcount_saturation_type {
        /* __refcount_add_not_zero(): old value or old + i was negative. */
        REFCOUNT_ADD_NOT_ZERO_OVF,
        /* __refcount_add(): old value or old + i was negative. */
        REFCOUNT_ADD_OVF,
        /* __refcount_add(): the old value was 0 when the add was done. */
        REFCOUNT_ADD_UAF,
        /* NOTE(review): reported outside this excerpt; presumably by the sub/dec paths - confirm. */
        REFCOUNT_SUB_UAF,
        /* NOTE(review): reported outside this excerpt; presumably by a plain decrement - confirm. */
        REFCOUNT_DEC_LEAK,
};

void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t);

/**
 * refcount_set - set a refcount's value
 * @r: the refcount
 * @n: value to which the refcount will be set
 *
 * Stores @n into the underlying atomic counter with atomic_set(). No zero or
 * saturation check is applied to @n here.
 */
static inline void refcount_set(refcount_t *r, int n)
{
        atomic_set(&r->refs, n);
}

/**
 * refcount_read - fetch the current value of a refcount
 * @r: the refcount to inspect
 *
 * Reads the underlying atomic counter with atomic_read() and hands the result
 * back as an unsigned quantity.
 *
 * Return: the value held by the refcount at the time of the read
 */
static inline unsigned int refcount_read(const refcount_t *r)
{
        unsigned int current_refs = atomic_read(&r->refs);

        return current_refs;
}

/*
 * Add @i to @r unless the counter is zero, optionally reporting the value
 * seen before the addition through @oldp (ignored when @oldp is NULL).
 *
 * The update is a relaxed compare-and-exchange retry loop: the snapshot is
 * passed by address to atomic_try_cmpxchg_relaxed(), and the zero check is
 * repeated before every attempt.  If the snapshot or the sum is negative the
 * event is reported to refcount_warn_saturate() as REFCOUNT_ADD_NOT_ZERO_OVF.
 *
 * Return: false if the counter was zero, true otherwise.
 */
static inline __must_check __signed_wrap
bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp)
{
        int seen = refcount_read(r);

        for (;;) {
                /* Nothing to add to once the count is observed as zero. */
                if (seen == 0)
                        break;

                /* Done as soon as the exchange goes through. */
                if (atomic_try_cmpxchg_relaxed(&r->refs, &seen, seen + i))
                        break;
        }

        if (oldp)
                *oldp = seen;

        if (unlikely(seen < 0 || seen + i < 0))
                refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF);

        return seen;
}

/**
 * refcount_add_not_zero - add a value to a refcount unless it is 0
 * @i: the value to add to the refcount
 * @r: the refcount
 *
 * Will saturate at REFCOUNT_SATURATED and WARN.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See the comment on top.
 *
 * Use of this function is not recommended for the normal reference counting
 * use case in which references are taken and released one at a time.  In these
 * cases, refcount_inc(), or one of its variants, should instead be used to
 * increment a reference count.
 *
 * This is __refcount_add_not_zero() with the old value discarded.
 *
 * Return: false if the passed refcount is 0, true otherwise
 */
static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r)
{
        return __refcount_add_not_zero(i, r, NULL);
}

/*
 * __refcount_add - unconditionally add @i to @r
 *
 * Performs a relaxed fetch-and-add.  When @oldp is non-NULL the value the
 * counter held before the add is stored through it.  The add is always
 * carried out; the checks afterwards only report problems through
 * refcount_warn_saturate():
 *   - old value 0: REFCOUNT_ADD_UAF
 *   - old value negative, or old + i negative: REFCOUNT_ADD_OVF
 */
static inline __signed_wrap
void __refcount_add(int i, refcount_t *r, int *oldp)
{
        int old = atomic_fetch_add_relaxed(i, &r->refs);

        if (oldp)
                *oldp = old;

        /* Adding to a counter that was 0 is reported as a possible use-after-free. */
        if (unlikely(!old))
                refcount_warn_saturate(r, REFCOUNT_ADD_UAF);
        /* old + i may wrap; the function is marked __signed_wrap for that. */
        else if (unlikely(old < 0 || old + i < 0))
                refcount_warn_saturate(r, REFCOUNT_ADD_OVF);
}

/**
 * refcount_add - add a value to a refcount
 * @i: the value to add to the refcount
 * @r: the refcount
 *
 * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See the comment on top.
 *
 * Use of this function is not recommended for the normal reference counting
 * use case in which references are taken and released one at a time.  In these
 * cases, refcount_inc(), or one of its variants, should instead be used to
 * increment a reference count.
 *
 * This is __refcount_add() with the old value discarded.
 */
static inline void refcount_add(int i, refcount_t *r)
{
        __refcount_add(i, r, NULL);
}

/*
 * __refcount_inc_not_zero - increment @r by one unless it is 0
 *
 * Forwards to __refcount_add_not_zero() with an increment of 1; @oldp is
 * passed through unchanged and may be NULL.
 */
static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp)
{
        return __refcount_add_not_zero(1, r, oldp);
}

/**
 * refcount_inc_not_zero - increment a refcount unless it is 0
 * @r: the refcount to increment
 *
 * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED
 * and WARN.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See the comment on top.
 *
 * This is __refcount_inc_not_zero() with the old value discarded.
 *
 * Return: true if the increment was successful, false otherwise
 */
static inline __must_check bool refcount_inc_not_zero(refcount_t *r)
{
        return __refcount_inc_not_zero(r, NULL);
}

/*
 * __refcount_inc - increment @r by one
 *
 * Forwards to __refcount_add() with an increment of 1; @oldp is passed
 * through unchanged and may be NULL.
 */
static inline void __refcount_inc(refcount_t *r, int *oldp)
{
        __refcount_add(1, r, oldp);
}

/**
 * refcount_inc - increment a refcount
 * @r: the refcount to increment
 *
 * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN.
 *
 * Provides no memory ordering, it is assumed the caller already has a
 * reference on the object.
 *
 * Will WARN if the refcount is 0, as this represents a possible use-after-free
 * condition.
 *
 * This is __refcount_inc() with the old value discarded.
 */
static inline void refcount_inc(refcount_t *r)
{
        __refcount_inc(r, NULL);
}

/*
 * __refcount_sub_and_test - subtract @i from @r and report a drop to zero
 *
 * Performs a release-ordered fetch-and-sub.  When @oldp is non-NULL the
 * value the counter held before the subtraction is stored through it.
 *
 * Returns true only when a live (positive) counter was taken exactly to 0;
 * in that case an acquire barrier is issued before returning so that the
 * caller's subsequent free() is ordered after the decrement.  Any
 * subtraction from a counter that was already 0 or negative, or one that
 * takes the counter below 0, is reported through refcount_warn_saturate()
 * and returns false.
 */
static inline __must_check __signed_wrap
bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp)
{
        int old = atomic_fetch_sub_release(i, &r->refs);

        if (oldp)
                *oldp = old;

        /*
         * Only a positive counter can legitimately drop to zero.  Without
         * the old > 0 test, a call with i == 0 on a counter that is already
         * 0 would satisfy old == i and be reported as the final put of an
         * object that has no references left.
         */
        if (old > 0 && old == i) {
                smp_acquire__after_ctrl_dep();
                return true;
        }

        /* old - i may wrap; the function is marked __signed_wrap for that. */
        if (unlikely(old <= 0 || old - i < 0))
                refcount_warn_saturate(r, REFCOUNT_SUB_UAF);

        return false;
}

/**
 * refcount_sub_and_test - subtract from a refcount and test if it is 0
 * @i: amount to subtract from the refcount
 * @r: the refcount
 *
 * Similar to atomic_dec_and_test(), but it will WARN, return false and
 * ultimately leak on underflow and will fail to decrement when saturated
 * at REFCOUNT_SATURATED.
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before, and provides an acquire ordering on success such that free()
 * must come after.
 *
 * Use of this function is not recommended for the normal reference counting
 * use case in which references are taken and released one at a time.  In these
 * cases, refcount_dec(), or one of its variants, should instead be used to
 * decrement a reference count.
 *
 * This is __refcount_sub_and_test() with the old value discarded.
 *
 * Return: true if the resulting refcount is 0, false otherwise
 */
static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r)
{
        return __refcount_sub_and_test(i, r, NULL);
}

/*
 * __refcount_dec_and_test - decrement @r by one and test for zero
 *
 * Forwards to __refcount_sub_and_test() with a decrement of 1; @oldp is
 * passed through unchanged and may be NULL.
 */
static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp)
{
        return __refcount_sub_and_test(1, r, oldp);
}

/**
 * refcount_dec_and_test - decrement a refcount and test if it is 0
 * @r: the refcount
 *
 * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
 * decrement when saturated at REFCOUNT_SATURATED.
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before, and provides an acquire ordering on success such that free()
 * must come after.
 *
 * This is __refcount_dec_and_test() with the old value discarded.
 *
 * Return: true if the resulting refcount is 0, false otherwise
 */
static inline __must_check bool refcount_dec_and_test(refcount_t *r)
{
        return __refcount_dec_and_test(r, NULL);
}

/*
 * __refcount_dec - decrement @r by one without testing for zero
 *
 * Performs a release-ordered fetch-and-sub of 1.  When @oldp is non-NULL the
 * value the counter held before the decrement is stored through it.  The
 * caller gets no indication that the counter reached 0, so an old value of
 * 1 or less is reported through refcount_warn_saturate() as
 * REFCOUNT_DEC_LEAK.
 */
static inline void __refcount_dec(refcount_t *r, int *oldp)
{
        int old = atomic_fetch_sub_release(1, &r->refs);

        if (oldp)
                *oldp = old;

        /* old == 1 means this decrement just took the counter to 0. */
        if (unlikely(old <= 1))
                refcount_warn_saturate(r, REFCOUNT_DEC_LEAK);
}

/**
 * refcount_dec - decrement a refcount
 * @r: the refcount
 *
 * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
 * when saturated at REFCOUNT_SATURATED.
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before.
 *
 * This is __refcount_dec() with the old value discarded; that helper also
 * WARNs when the decrement takes the counter from 1 to 0, since the caller
 * of this function cannot observe that transition.
 */
static inline void refcount_dec(refcount_t *r)
{
        __refcount_dec(r, NULL);
}

/*
 * Out-of-line helpers; only declared here, the definitions live outside this
 * header.  All results are __must_check.
 * NOTE(review): the exact semantics are not visible from this header; going
 * by the names and the __cond_acquires(lock) annotations, the *_lock variants
 * decrement and conditionally return with @lock held - confirm against the
 * implementation.
 */
extern __must_check bool refcount_dec_if_one(refcount_t *r);
extern __must_check bool refcount_dec_not_one(refcount_t *r);
extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) __cond_acquires(lock);
extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) __cond_acquires(lock);
extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r,
                                                       spinlock_t *lock,
                                                       unsigned long *flags) __cond_acquires(lock);
#endif /* _LINUX_REFCOUNT_H */

































    3 
















    3 






    3 





















    3 
    3 
























    2 




































































































































    3 


















    2 
    2 






    2 





    2 




    2 










    3 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/realpath.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"
#include <linux/magic.h>
#include <linux/proc_fs.h>

/**
 * tomoyo_encode2 - Encode binary string to ascii string.
 *
 * @str:     String in binary format.
 * @str_len: Size of @str in byte.
 *
 * Each input byte is emitted as:
 *   - "\\\\" (two backslashes) for a backslash,
 *   - itself when it lies strictly between ' ' and 127,
 *   - a backslash followed by three octal digits otherwise.
 *
 * Returns pointer to @str in ascii format on success, NULL otherwise
 * (@str is NULL or the allocation failed).
 *
 * This function uses kzalloc(), so caller must kfree() if this function
 * didn't return NULL.
 */
char *tomoyo_encode2(const char *str, int str_len)
{
        const unsigned char *src = (const unsigned char *)str;
        char *encoded;
        char *dst;
        int need = 1; /* room for the terminating '\0' */
        int idx;

        if (!str)
                return NULL;

        /* First pass: work out how many output bytes the encoding needs. */
        for (idx = 0; idx < str_len; idx++) {
                const unsigned char ch = src[idx];

                if (ch == '\\')
                        need += 2;
                else if (ch <= ' ' || ch >= 127)
                        need += 4;
                else
                        need += 1;
        }

        /* Reserve space for appending "/". */
        encoded = kzalloc(need + 10, GFP_NOFS);
        if (!encoded)
                return NULL;

        /* Second pass: emit the encoding; the zeroed buffer terminates it. */
        dst = encoded;
        for (idx = 0; idx < str_len; idx++) {
                const unsigned char ch = src[idx];

                if (ch == '\\') {
                        dst[0] = '\\';
                        dst[1] = '\\';
                        dst += 2;
                } else if (ch <= ' ' || ch >= 127) {
                        dst[0] = '\\';
                        dst[1] = (ch >> 6) + '0';
                        dst[2] = ((ch >> 3) & 7) + '0';
                        dst[3] = (ch & 7) + '0';
                        dst += 4;
                } else {
                        *dst++ = ch;
                }
        }
        return encoded;
}

/**
 * tomoyo_encode - Encode binary string to ascii string.
 *
 * @str: String in binary format.
 *
 * NUL-terminated front end for tomoyo_encode2(): the length passed on is
 * strlen(@str).
 *
 * Returns pointer to @str in ascii format on success, NULL otherwise.
 *
 * This function uses kzalloc(), so caller must kfree() if this function
 * didn't return NULL.
 */
char *tomoyo_encode(const char *str)
{
        if (!str)
                return NULL;
        return tomoyo_encode2(str, strlen(str));
}

/**
 * tomoyo_get_absolute_path - Get the path of a dentry but ignores chroot'ed root.
 *
 * @path:   Pointer to "struct path".
 * @buffer: Pointer to buffer to return value in.
 * @buflen: Sizeof @buffer.
 *
 * Returns whatever d_absolute_path() returned (a pointer into @buffer on
 * success, an error pointer otherwise), or ERR_PTR(-ENOMEM) when @buflen is
 * smaller than 256.
 *
 * If dentry is a directory, trailing '/' is appended.
 */
static char *tomoyo_get_absolute_path(const struct path *path, char * const buffer,
                                      const int buflen)
{
        struct inode *inode;
        char *start;

        if (buflen < 256)
                return ERR_PTR(-ENOMEM);

        /* go to whatever namespace root we are under */
        start = d_absolute_path(path, buffer, buflen - 1);

        /* Errors, non-absolute names and the bare "/" are returned as they are. */
        if (IS_ERR(start) || *start != '/' || !start[1])
                return start;

        inode = d_backing_inode(path->dentry);
        if (inode && S_ISDIR(inode->i_mode)) {
                /* The last byte of @buffer was withheld above; use it now. */
                buffer[buflen - 2] = '/';
                buffer[buflen - 1] = '\0';
        }
        return start;
}

/**
 * tomoyo_get_dentry_path - Get the path of a dentry.
 *
 * @dentry: Pointer to "struct dentry".
 * @buffer: Pointer to buffer to return value in.
 * @buflen: Sizeof @buffer.
 *
 * Returns whatever dentry_path_raw() returned (a pointer into @buffer on
 * success, an error pointer otherwise), or ERR_PTR(-ENOMEM) when @buflen is
 * smaller than 256.
 *
 * If dentry is a directory, trailing '/' is appended.
 */
static char *tomoyo_get_dentry_path(struct dentry *dentry, char * const buffer,
                                    const int buflen)
{
        struct inode *inode;
        char *start;

        if (buflen < 256)
                return ERR_PTR(-ENOMEM);

        start = dentry_path_raw(dentry, buffer, buflen - 1);

        /* Errors, non-absolute names and the bare "/" are returned as they are. */
        if (IS_ERR(start) || *start != '/' || !start[1])
                return start;

        inode = d_backing_inode(dentry);
        if (inode && S_ISDIR(inode->i_mode)) {
                /* The last byte of @buffer was withheld above; use it now. */
                buffer[buflen - 2] = '/';
                buffer[buflen - 1] = '\0';
        }
        return start;
}

/**
 * tomoyo_get_local_path - Get the path of a dentry.
 *
 * @dentry: Pointer to "struct dentry".
 * @buffer: Pointer to buffer to return value in.
 * @buflen: Sizeof @buffer.
 *
 * Builds the filesystem-local path with tomoyo_get_dentry_path() and then
 * prepends, in place, either "dev(major,minor):" or "<filesystem name>:".
 * On procfs a leading "/<pid>/" component that equals the caller's own tgid
 * (in the procfs instance's pid namespace) is first rewritten to "/self/".
 *
 * Returns a pointer into @buffer on success, an error pointer otherwise;
 * ERR_PTR(-ENOMEM) is returned when a prefix does not fit in front of the
 * path.
 */
static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer,
                                   const int buflen)
{
        struct super_block *sb = dentry->d_sb;
        char *pos = tomoyo_get_dentry_path(dentry, buffer, buflen);

        if (IS_ERR(pos))
                return pos;
        /* Convert from $PID to self if $PID is current thread. */
        if (sb->s_magic == PROC_SUPER_MAGIC && *pos == '/') {
                char *ep;
                const pid_t pid = (pid_t) simple_strtoul(pos + 1, &ep, 10);
                struct pid_namespace *proc_pidns = proc_pid_ns(sb);

                if (*ep == '/' && pid && pid ==
                    task_tgid_nr_ns(current, proc_pidns)) {
                        /*
                         * Overwrite the 5 bytes just before the '/' that
                         * follows the number, so the path now starts with
                         * "/self/".
                         */
                        pos = ep - 5;
                        if (pos < buffer)
                                goto out;
                        memmove(pos, "/self", 5);
                }
                /* procfs paths always get the filesystem-name prefix. */
                goto prepend_filesystem_name;
        }
        /* Use filesystem name for unnamed devices. */
        if (!MAJOR(sb->s_dev))
                goto prepend_filesystem_name;
        {
                struct inode *inode = d_backing_inode(sb->s_root);

                /*
                 * Use filesystem name if filesystem does not support rename()
                 * operation.
                 */
                if (!inode->i_op->rename)
                        goto prepend_filesystem_name;
        }
        /* Prepend device name. */
        {
                char name[64];
                int name_len;
                const dev_t dev = sb->s_dev;

                /* Terminate by hand; snprintf() is given one byte less. */
                name[sizeof(name) - 1] = '\0';
                snprintf(name, sizeof(name) - 1, "dev(%u,%u):", MAJOR(dev),
                         MINOR(dev));
                name_len = strlen(name);
                /* Step back by the prefix length and bail out if it does not fit. */
                pos -= name_len;
                if (pos < buffer)
                        goto out;
                memmove(pos, name, name_len);
                return pos;
        }
        /* Prepend filesystem name. */
prepend_filesystem_name:
        {
                const char *name = sb->s_type->name;
                const int name_len = strlen(name);

                /* One extra byte for the ':' separator. */
                pos -= name_len + 1;
                if (pos < buffer)
                        goto out;
                memmove(pos, name, name_len);
                pos[name_len] = ':';
        }
        return pos;
out:
        return ERR_PTR(-ENOMEM);
}

/**
 * tomoyo_realpath_from_path - Returns realpath(3) of the given pathname but ignores chroot'ed root.
 *
 * @path: Pointer to "struct path".
 *
 * Returns the realpath of the given @path on success, NULL otherwise.
 *
 * The name is built in a temporary buffer that starts at PAGE_SIZE bytes and
 * is doubled and reallocated every time the lookup returns an error pointer;
 * the loop ends when a name was obtained or the allocation fails.  The
 * result is a separate allocation made by tomoyo_encode().
 *
 * If dentry is a directory, trailing '/' is appended.
 * Characters out of 0x20 < c < 0x7F range are converted to
 * \ooo style octal string.
 * Character \ is converted to \\ string.
 *
 * These functions use kzalloc(), so the caller must call kfree()
 * if these functions didn't return NULL.
 */
char *tomoyo_realpath_from_path(const struct path *path)
{
        char *buf = NULL;
        char *name = NULL;
        /* Doubled at the top of the loop, so the first buffer is PAGE_SIZE. */
        unsigned int buf_len = PAGE_SIZE / 2;
        struct dentry *dentry = path->dentry;
        struct super_block *sb = dentry->d_sb;

        while (1) {
                char *pos;
                struct inode *inode;

                buf_len <<= 1;
                /* Drop the buffer from the previous, too small, attempt. */
                kfree(buf);
                buf = kmalloc(buf_len, GFP_NOFS);
                if (!buf)
                        break;
                /* To make sure that pos is '\0' terminated. */
                buf[buf_len - 1] = '\0';
                /* For "pipe:[\$]" and "socket:[\$]". */
                if (dentry->d_op && dentry->d_op->d_dname) {
                        pos = dentry->d_op->d_dname(dentry, buf, buf_len - 1);
                        goto encode;
                }
                inode = d_backing_inode(sb->s_root);
                /*
                 * Get local name for filesystems without rename() operation
                 */
                if ((!inode->i_op->rename &&
                     !(sb->s_type->fs_flags & FS_REQUIRES_DEV)))
                        pos = tomoyo_get_local_path(path->dentry, buf,
                                                    buf_len - 1);
                /* Get absolute name for the rest. */
                else {
                        pos = tomoyo_get_absolute_path(path, buf, buf_len - 1);
                        /*
                         * Fall back to local name if absolute name is not
                         * available.
                         */
                        if (pos == ERR_PTR(-EINVAL))
                                pos = tomoyo_get_local_path(path->dentry, buf,
                                                            buf_len - 1);
                }
encode:
                /* Any error pointer triggers a retry with a buffer twice as large. */
                if (IS_ERR(pos))
                        continue;
                name = tomoyo_encode(pos);
                break;
        }
        kfree(buf);
        /* NULL here means kmalloc() or tomoyo_encode() failed. */
        if (!name)
                tomoyo_warn_oom(__func__);
        return name;
}

/**
 * tomoyo_realpath_nofollow - Get realpath of a pathname.
 *
 * @pathname: The pathname to solve.
 *
 * Looks @pathname up with kern_path() (lookup flags 0), converts the result
 * with tomoyo_realpath_from_path() and drops the path reference again.
 *
 * Returns the realpath of @pathname on success, NULL otherwise.
 */
char *tomoyo_realpath_nofollow(const char *pathname)
{
        struct path path;
        char *resolved;

        if (!pathname || kern_path(pathname, 0, &path) != 0)
                return NULL;

        resolved = tomoyo_realpath_from_path(&path);
        path_put(&path);
        return resolved;
}




























































































































































































































































































































































































































































































































































































































































































































































































    1 





















    1 




    1 

























































    1 









































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 */
#ifndef _LINUX_BPF_VERIFIER_H
#define _LINUX_BPF_VERIFIER_H 1

#include <linux/bpf.h> /* for enum bpf_reg_type */
#include <linux/btf.h> /* for struct btf and btf_id() */
#include <linux/filter.h> /* for MAX_BPF_STACK */
#include <linux/tnum.h>

/* Maximum variable offset umax_value permitted when resolving memory accesses.
 * In practice this is far bigger than any realistic pointer offset; this limit
 * ensures that umax_value + (int)off + (int)size cannot overflow a u64.
 */
#define BPF_MAX_VAR_OFF        (1 << 29)
/* Maximum variable size permitted for ARG_CONST_SIZE[_OR_ZERO].  This ensures
 * that converting umax_value to int cannot overflow.
 */
#define BPF_MAX_VAR_SIZ        (1 << 29)
/* Size of tmp_str_buf in bpf_verifier.
 * We need at least 306 bytes to fit the full stack mask representation
 * (in the "-8,-16,...,-512" form).
 */
#define TMP_STR_BUF_LEN 320

/* Liveness marks, used for registers and spilled-regs (in stack slots).
 * Read marks propagate upwards until they find a write mark; they record that
 * "one of this state's descendants read this reg" (and therefore the reg is
 * relevant for states_equal() checks).
 * Write marks collect downwards and do not propagate; they record that "the
 * straight-line code that reached this state (from its parent) wrote this reg"
 * (and therefore that reads propagated from this state or its descendants
 * should not propagate to its parent).
 * A state with a write mark can receive read marks; it just won't propagate
 * them to its parent, since the write mark is a property, not of the state,
 * but of the link between it and its parent.  See mark_reg_read() and
 * mark_stack_slot_read() in kernel/bpf/verifier.c.
 *
 * The values are distinct bits, so several marks can be combined in one
 * variable.
 */
enum bpf_reg_liveness {
        REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
        REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */
        REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
        /* mask covering both read marks */
        REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
        REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */
        REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
};

/* For every reg representing a map value or allocated object pointer,
 * we consider the tuple of (ptr, id) for them to be unique in verifier
 * context and consider them to not alias each other for the purposes of
 * tracking lock state.
 */
struct bpf_active_lock {
        /* This can either be reg->map_ptr or reg->btf. If ptr is NULL,
         * there's no active lock held, and other fields have no
         * meaning. If non-NULL, it indicates that a lock is held and
         * id member has the reg->id of the register which can be >= 0.
         */
        void *ptr;
        /* This will be reg->id */
        u32 id;
};

/* Name prefix "bpf_iter_"; presumably the common prefix of the
 * struct bpf_iter_<type> iterator state types - confirm against the users.
 */
#define ITER_PREFIX "bpf_iter_"

/* State of an iterator kept in stack slots; stored in the 2-bit
 * bpf_reg_state.iter.state bitfield.
 * NOTE(review): ACTIVE/DRAINED presumably mean "may still yield elements" /
 * "exhausted" - confirm against kernel/bpf/verifier.c.
 */
enum bpf_iter_state {
        BPF_ITER_STATE_INVALID, /* for non-first slot */
        BPF_ITER_STATE_ACTIVE,
        BPF_ITER_STATE_DRAINED,
};

/* Verifier's knowledge about one register, or about the register-sized value
 * kept for a stack slot (see bpf_stack_state.spilled_ptr): its type,
 * type-specific data (the anonymous union), value bounds, and bookkeeping
 * for ids, references and liveness.
 */
struct bpf_reg_state {
        /* Ordering of fields matters.  See states_equal() */
        enum bpf_reg_type type;
        /* Fixed part of pointer offset, pointer types only */
        s32 off;
        /* Type-specific data; which member is valid depends on the register
         * type or stack slot kind, as noted on each member.
         */
        union {
                /* valid when type == PTR_TO_PACKET */
                int range;

                /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
                 *   PTR_TO_MAP_VALUE_OR_NULL
                 */
                struct {
                        struct bpf_map *map_ptr;
                        /* To distinguish map lookups from outer map
                         * the map_uid is non-zero for registers
                         * pointing to inner maps.
                         */
                        u32 map_uid;
                };

                /* for PTR_TO_BTF_ID */
                struct {
                        struct btf *btf;
                        u32 btf_id;
                };

                struct { /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */
                        u32 mem_size;
                        u32 dynptr_id; /* for dynptr slices */
                };

                /* For dynptr stack slots */
                struct {
                        enum bpf_dynptr_type type;
                        /* A dynptr is 16 bytes so it takes up 2 stack slots.
                         * We need to track which slot is the first slot
                         * to protect against cases where the user may try to
                         * pass in an address starting at the second slot of the
                         * dynptr.
                         */
                        bool first_slot;
                } dynptr;

                /* For bpf_iter stack slots */
                struct {
                        /* BTF container and BTF type ID describing
                         * struct bpf_iter_<type> of an iterator state
                         */
                        struct btf *btf;
                        u32 btf_id;
                        /* packing following two fields to fit iter state into 16 bytes */
                        enum bpf_iter_state state:2;
                        int depth:30;
                } iter;

                /* Max size from any of the above. */
                struct {
                        unsigned long raw1;
                        unsigned long raw2;
                } raw;

                u32 subprogno; /* for PTR_TO_FUNC */
        };
        /* For scalar types (SCALAR_VALUE), this represents our knowledge of
         * the actual value.
         * For pointer types, this represents the variable part of the offset
         * from the pointed-to object, and is shared with all bpf_reg_states
         * with the same id as us.
         */
        struct tnum var_off;
        /* Used to determine if any memory access using this register will
         * result in a bad access.
         * These refer to the same value as var_off, not necessarily the actual
         * contents of the register.
         */
        s64 smin_value; /* minimum possible (s64)value */
        s64 smax_value; /* maximum possible (s64)value */
        u64 umin_value; /* minimum possible (u64)value */
        u64 umax_value; /* maximum possible (u64)value */
        s32 s32_min_value; /* minimum possible (s32)value */
        s32 s32_max_value; /* maximum possible (s32)value */
        u32 u32_min_value; /* minimum possible (u32)value */
        u32 u32_max_value; /* maximum possible (u32)value */
        /* For PTR_TO_PACKET, used to find other pointers with the same variable
         * offset, so they can share range knowledge.
         * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we
         * came from, when one is tested for != NULL.
         * For PTR_TO_MEM_OR_NULL this is used to identify memory allocation
         * for the purpose of tracking that it's freed.
         * For PTR_TO_SOCKET this is used to share which pointers retain the
         * same reference to the socket, to determine proper reference freeing.
         * For stack slots that are dynptrs, this is used to track references to
         * the dynptr to determine proper reference freeing.
         * Similarly to dynptrs, we use ID to track "belonging" of a reference
         * to a specific instance of bpf_iter.
         */
        u32 id;
        /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
         * from a pointer-cast helper, bpf_sk_fullsock() and
         * bpf_tcp_sock().
         *
         * Consider the following where "sk" is a reference counted
         * pointer returned from "sk = bpf_sk_lookup_tcp();":
         *
         * 1: sk = bpf_sk_lookup_tcp();
         * 2: if (!sk) { return 0; }
         * 3: fullsock = bpf_sk_fullsock(sk);
         * 4: if (!fullsock) { bpf_sk_release(sk); return 0; }
         * 5: tp = bpf_tcp_sock(fullsock);
         * 6: if (!tp) { bpf_sk_release(sk); return 0; }
         * 7: bpf_sk_release(sk);
         * 8: snd_cwnd = tp->snd_cwnd;  // verifier will complain
         *
         * After bpf_sk_release(sk) at line 7, both "fullsock" ptr and
         * "tp" ptr should be invalidated also.  In order to do that,
         * the reg holding "fullsock" and "sk" need to remember
         * the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id
         * such that the verifier can reset all regs which have
         * ref_obj_id matching the sk_reg->id.
         *
         * sk_reg->ref_obj_id is set to sk_reg->id at line 1.
         * sk_reg->id will stay as NULL-marking purpose only.
         * After NULL-marking is done, sk_reg->id can be reset to 0.
         *
         * After "fullsock = bpf_sk_fullsock(sk);" at line 3,
         * fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id.
         *
         * After "tp = bpf_tcp_sock(fullsock);" at line 5,
         * tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id
         * which is the same as sk_reg->ref_obj_id.
         *
         * From the verifier perspective, if sk, fullsock and tp
         * are not NULL, they are the same ptr with different
         * reg->type.  In particular, bpf_sk_release(tp) is also
         * allowed and has the same effect as bpf_sk_release(sk).
         */
        u32 ref_obj_id;
        /* parentage chain for liveness checking */
        struct bpf_reg_state *parent;
        /* Inside the callee two registers can be both PTR_TO_STACK like
         * R1=fp-8 and R2=fp-8, but one of them points to this function stack
         * while another to the caller's stack. To differentiate them 'frameno'
         * is used which is an index in bpf_verifier_state->frame[] array
         * pointing to bpf_func_state.
         */
        u32 frameno;
        /* Tracks subreg definition. The stored value is the insn_idx of the
         * writing insn. This is safe because subreg_def is used before any insn
         * patching which only happens after main verification finished.
         */
        s32 subreg_def;
        /* liveness marks, see enum bpf_reg_liveness */
        enum bpf_reg_liveness live;
        /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */
        bool precise;
};

/* Classification of stack slot contents; these are the values stored in
 * bpf_stack_state->slot_type[].
 */
enum bpf_stack_slot_type {
        STACK_INVALID,    /* nothing was stored in this stack slot */
        STACK_SPILL,      /* register spilled into stack */
        STACK_MISC,          /* BPF program wrote some data into this slot */
        STACK_ZERO,          /* BPF program wrote constant zero */
        /* A dynptr is stored in this stack slot. The type of dynptr
         * is stored in bpf_stack_state->spilled_ptr.dynptr.type
         */
        STACK_DYNPTR,
        /* NOTE(review): presumably iterator state is stored in this stack
         * slot, by analogy with STACK_DYNPTR - confirm.
         */
        STACK_ITER,
};

#define BPF_REG_SIZE 8        /* size of eBPF register in bytes */

/* Bitmask with one bit set for each of BPF_REG_1 .. BPF_REG_5 */
#define BPF_REGMASK_ARGS ((1 << BPF_REG_1) | (1 << BPF_REG_2) | \
                          (1 << BPF_REG_3) | (1 << BPF_REG_4) | \
                          (1 << BPF_REG_5))

/* Size of struct bpf_dynptr_kern in bytes, and the same size expressed as a
 * number of BPF_REG_SIZE-byte units.
 */
#define BPF_DYNPTR_SIZE                sizeof(struct bpf_dynptr_kern)
#define BPF_DYNPTR_NR_SLOTS                (BPF_DYNPTR_SIZE / BPF_REG_SIZE)

/* State tracked for one element of bpf_func_state->stack[], i.e. for
 * BPF_REG_SIZE bytes of stack memory.
 */
struct bpf_stack_state {
        /* register state attached to this slot; bpf_get_spilled_reg() hands
         * out a pointer to it when the slot's last slot_type entry matches
         * the requested mask
         */
        struct bpf_reg_state spilled_ptr;
        /* enum bpf_stack_slot_type values, BPF_REG_SIZE entries per slot */
        u8 slot_type[BPF_REG_SIZE];
};

/* One tracked reference; bpf_func_state->refs points to these. */
struct bpf_reference_state {
        /* Track each reference created with a unique id, even if the same
         * instruction creates the reference multiple times (eg, via CALL).
         */
        int id;
        /* Instruction where the allocation of this reference occurred. This
         * is used purely to inform the user of a reference leak.
         */
        int insn_idx;
        /* There can be a case like:
         * main (frame 0)
         *  cb (frame 1)
         *   func (frame 2)
         *    cb (frame 3)
         * Hence for frame 3, if callback_ref just stored boolean, it would be
         * impossible to distinguish nested callback refs. Hence store the
         * frameno and compare that to callback_ref in check_reference_leak when
         * exiting a callback function.
         */
        int callback_ref;
};

/* Pair of signed 32-bit bounds; used as bpf_func_state->callback_ret_range.
 * NOTE(review): presumably minval/maxval are inclusive bounds on a return
 * value - confirm against the users of this struct.
 */
struct bpf_retval_range {
        s32 minval;
        s32 maxval;
};

/* state of the program:
 * type of all registers and stack info
 *
 * One instance exists per call frame; bpf_verifier_state->frame[] points to
 * them.
 */
struct bpf_func_state {
        struct bpf_reg_state regs[MAX_BPF_REG];
        /* index of call instruction that called into this func */
        int callsite;
        /* stack frame number of this function state from pov of
         * enclosing bpf_verifier_state.
         * 0 = main function, 1 = first callee.
         */
        u32 frameno;
        /* subprog number == index within subprog_info
         * zero == main subprog
         */
        u32 subprogno;
        /* Every bpf_timer_start will increment async_entry_cnt.
         * It's used to distinguish:
         * void foo(void) { for(;;); }
         * void foo(void) { bpf_timer_set_callback(,foo); }
         */
        u32 async_entry_cnt;
        /* NOTE(review): presumably the range the return value of a callback
         * running in this frame has to fall into - confirm.
         */
        struct bpf_retval_range callback_ret_range;
        /* NOTE(review): by their names these flag which kind of callback,
         * if any, this frame is executing - confirm.
         */
        bool in_callback_fn;
        bool in_async_callback_fn;
        bool in_exception_callback_fn;
        /* For callback calling functions that limit number of possible
         * callback executions (e.g. bpf_loop) keeps track of current
         * simulated iteration number.
         * Value in frame N refers to number of times callback with frame
         * N+1 was simulated, e.g. for the following call:
         *
         *   bpf_loop(..., fn, ...); | suppose current frame is N
         *                           | fn would be simulated in frame N+1
         *                           | number of simulations is tracked in frame N
         */
        u32 callback_depth;

        /* The following fields should be last. See copy_func_state() */
        /* NOTE(review): presumably the number of entries in refs[] - confirm */
        int acquired_refs;
        struct bpf_reference_state *refs;
        /* The state of the stack. Each element of the array describes BPF_REG_SIZE
         * (i.e. 8) bytes worth of stack memory.
         * stack[0] represents bytes [*(r10-8)..*(r10-1)]
         * stack[1] represents bytes [*(r10-16)..*(r10-9)]
         * ...
         * stack[allocated_stack/8 - 1] represents [*(r10-allocated_stack)..*(r10-allocated_stack+7)]
         */
        struct bpf_stack_state *stack;
        /* Size of the current stack, in bytes. The stack state is tracked below, in
         * `stack`. allocated_stack is always a multiple of BPF_REG_SIZE.
         */
        int allocated_stack;
};

/* Number of entries in bpf_verifier_state->frame[] */
#define MAX_CALL_FRAMES 8

/* instruction history flags, used in bpf_jmp_history_entry.flags field */
enum {
        /* instruction references stack slot through PTR_TO_STACK register;
         * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8)
         * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512,
         * 8 bytes per slot, so slot index (spi) is [0, 63])
         */
        INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */

        INSN_F_SPI_MASK = 0x3f, /* 6 bits */
        INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */

        INSN_F_STACK_ACCESS = BIT(9), /* we need 10 bits total */
};

/* The frame-number field must be wide enough for every frame index and the
 * spi field wide enough for every stack slot index.
 */
static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES);
static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8);

/* One element of the bpf_verifier_state->jmp_history array. */
struct bpf_jmp_history_entry {
        /* NOTE(review): presumably the index of the instruction this entry
         * describes, with prev_idx being the instruction executed before
         * it - confirm.
         */
        u32 idx;
        /* insn idx can't be bigger than 1 million */
        u32 prev_idx : 22;
        /* special flags, e.g., whether insn is doing register stack spill/load */
        u32 flags : 10;
};

/* Maximum number of register states that can exist at once */
#define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES)
/* A verifier state: the call stack of function states plus bookkeeping
 * describing where this state sits in the tree of explored states.
 */
struct bpf_verifier_state {
        /* call stack tracking */
        /* frame[curframe] is the frame currently in use (see cur_func()) */
        struct bpf_func_state *frame[MAX_CALL_FRAMES];
        struct bpf_verifier_state *parent;
        /*
         * 'branches' field is the number of branches left to explore:
         * 0 - all possible paths from this state reached bpf_exit or
         * were safely pruned
         * 1 - at least one path is being explored.
         * This state hasn't reached bpf_exit
         * 2 - at least two paths are being explored.
         * This state is an immediate parent of two children.
         * One is fallthrough branch with branches==1 and another
         * state is pushed into stack (to be explored later) also with
         * branches==1. The parent of this state has branches==1.
         * The verifier state tree connected via 'parent' pointer looks like:
         * 1
         * 1
         * 2 -> 1 (first 'if' pushed into stack)
         * 1
         * 2 -> 1 (second 'if' pushed into stack)
         * 1
         * 1
         * 1 bpf_exit.
         *
         * Once do_check() reaches bpf_exit, it calls update_branch_counts()
         * and the verifier state tree will look:
         * 1
         * 1
         * 2 -> 1 (first 'if' pushed into stack)
         * 1
         * 1 -> 1 (second 'if' pushed into stack)
         * 0
         * 0
         * 0 bpf_exit.
         * After pop_stack() the do_check() will resume at second 'if'.
         *
         * If is_state_visited() sees a state with branches > 0 it means
         * there is a loop. If such state is exactly equal to the current state
         * it's an infinite loop. Note states_equal() checks for states
         * equivalency, so two states being 'states_equal' does not mean
         * infinite loop. The exact comparison is provided by
         * states_maybe_looping() function. It's a stronger pre-check and
         * much faster than states_equal().
         *
         * This algorithm may not find all possible infinite loops or
         * loop iteration count may be too high.
         * In such cases BPF_COMPLEXITY_LIMIT_INSNS limit kicks in.
         */
        u32 branches;
        u32 insn_idx;
        /* index into frame[] of the current frame */
        u32 curframe;

        struct bpf_active_lock active_lock;
        bool speculative;
        bool active_rcu_lock;
        u32 active_preempt_lock;
        /* If this state was ever pointed-to by other state's loop_entry field
         * this flag would be set to true. Used to avoid freeing such states
         * while they are still in use.
         */
        bool used_as_loop_entry;
        bool in_sleepable;

        /* first and last insn idx of this verifier state */
        u32 first_insn_idx;
        u32 last_insn_idx;
        /* If this state is a part of states loop this field points to some
         * parent of this state such that:
         * - it is also a member of the same states loop;
         * - DFS states traversal starting from initial state visits loop_entry
         *   state before this state.
         * Used to compute topmost loop entry for state loops.
         * State loops might appear because of open coded iterators logic.
         * See get_loop_entry() for more information.
         */
        struct bpf_verifier_state *loop_entry;
        /* jmp history recorded from first to last.
         * backtracking is using it to go from last to first.
         * For most states jmp_history_cnt is [0-3].
         * For loops can go up to ~40.
         */
        struct bpf_jmp_history_entry *jmp_history;
        u32 jmp_history_cnt;
        u32 dfs_depth;
        u32 callback_unroll_depth;
        u32 may_goto_depth;
};

/* Evaluate to a pointer to the spilled register state of stack slot 'slot'
 * in 'frame', or to NULL when 'slot' lies beyond the allocated stack or the
 * slot's last slot_type entry is not selected by 'mask' (a bitmask built as
 * 1 << <enum bpf_stack_slot_type value>).
 *
 * 'slot' and 'frame' are parenthesized so that arbitrary expressions can be
 * passed safely; note that both are still evaluated more than once.
 */
#define bpf_get_spilled_reg(slot, frame, mask)                                \
        ((((slot) < (frame)->allocated_stack / BPF_REG_SIZE) &&                \
          ((1 << (frame)->stack[slot].slot_type[BPF_REG_SIZE - 1]) & (mask))) \
         ? &(frame)->stack[slot].spilled_ptr : NULL)

/* Iterate over the stack slots of 'frame': 'iter' runs over the slot indices
 * [0, allocated_stack / BPF_REG_SIZE) and, for each one, 'reg' is set to
 * either NULL or the slot's spilled register as selected by 'mask'.
 * 'iter' and 'reg' must be assignable; 'frame' is evaluated repeatedly.
 */
#define bpf_for_each_spilled_reg(iter, frame, reg, mask)                        \
        for (iter = 0, reg = bpf_get_spilled_reg(iter, frame, mask);                \
             iter < (frame)->allocated_stack / BPF_REG_SIZE;                \
             iter++, reg = bpf_get_spilled_reg(iter, frame, mask))

/* For every frame 0..curframe of __vst: assign the frame to __state, then
 * evaluate __expr once with __reg pointing at each of the frame's MAX_BPF_REG
 * registers and once for each spilled register selected by __mask (stack
 * slots for which bpf_get_spilled_reg() yields NULL are skipped).
 * The value of __expr is discarded; it is evaluated for its side effects.
 */
#define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __mask, __expr)   \
        ({                                                               \
                struct bpf_verifier_state *___vstate = __vst;            \
                int ___i, ___j;                                          \
                for (___i = 0; ___i <= ___vstate->curframe; ___i++) {    \
                        struct bpf_reg_state *___regs;                   \
                        __state = ___vstate->frame[___i];                \
                        ___regs = __state->regs;                         \
                        for (___j = 0; ___j < MAX_BPF_REG; ___j++) {     \
                                __reg = &___regs[___j];                  \
                                (void)(__expr);                          \
                        }                                                \
                        bpf_for_each_spilled_reg(___j, __state, __reg, __mask) { \
                                if (!__reg)                              \
                                        continue;                        \
                                (void)(__expr);                          \
                        }                                                \
                }                                                        \
        })

/* Invoke __expr over registers in __vst, setting __state and __reg.
 * Stack slots are visited only when they are of type STACK_SPILL.
 */
#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \
        bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, 1 << STACK_SPILL, __expr)

/* linked list of verifier states used to prune search */
struct bpf_verifier_state_list {
        struct bpf_verifier_state state;
        /* next list element */
        struct bpf_verifier_state_list *next;
        /* NOTE(review): presumably counters of failed/successful pruning
         * attempts against this state - confirm.
         */
        int miss_cnt, hit_cnt;
};

/* Per-instruction state used to decide about inlining a bpf_loop call; see
 * the loop_inline_state member of struct bpf_insn_aux_data.
 */
struct bpf_loop_inline_state {
        unsigned int initialized:1; /* set to true upon first entry */
        unsigned int fit_for_inline:1; /* true if callback function is the same
                                        * at each call and flags are always zero
                                        */
        u32 callback_subprogno; /* valid when fit_for_inline is true */
};

/* pointer and state for maps; stored per instruction as the map_ptr_state
 * member of struct bpf_insn_aux_data
 */
struct bpf_map_ptr_state {
        struct bpf_map *map_ptr;
        /* NOTE(review): meaning of these two flags is not visible in this
         * header - document after checking their users.
         */
        bool poison;
        bool unpriv;
};

/* Possible states for alu_state member (bit flags, stored in the u8
 * alu_state field of struct bpf_insn_aux_data).
 */
#define BPF_ALU_SANITIZE_SRC                (1U << 0)
#define BPF_ALU_SANITIZE_DST                (1U << 1)
#define BPF_ALU_NEG_VALUE                (1U << 2)
#define BPF_ALU_NON_POINTER                (1U << 3)
#define BPF_ALU_IMMEDIATE                (1U << 4)
/* either of the two sanitize flags */
#define BPF_ALU_SANITIZE                (BPF_ALU_SANITIZE_SRC | \
                                         BPF_ALU_SANITIZE_DST)

/* Auxiliary per-instruction data; bpf_verifier_env->insn_aux_data is an
 * array of these.
 */
struct bpf_insn_aux_data {
        /* the members of this union share storage */
        union {
                enum bpf_reg_type ptr_type;        /* pointer type for load/store insns */
                struct bpf_map_ptr_state map_ptr_state;
                s32 call_imm;                        /* saved imm field of call insn */
                u32 alu_limit;                        /* limit for add/sub register with pointer */
                struct {
                        u32 map_index;                /* index into used_maps[] */
                        u32 map_off;                /* offset from value base address */
                };
                struct {
                        enum bpf_reg_type reg_type;        /* type of pseudo_btf_id */
                        union {
                                struct {
                                        struct btf *btf;
                                        u32 btf_id;        /* btf_id for struct typed var */
                                };
                                u32 mem_size;        /* mem_size for non-struct typed var */
                        };
                } btf_var;
                /* if instruction is a call to bpf_loop this field tracks
                 * the state of the relevant registers to make decision about inlining
                 */
                struct bpf_loop_inline_state loop_inline_state;
        };
        union {
                /* remember the size of type passed to bpf_obj_new to rewrite R1 */
                u64 obj_new_size;
                /* remember the offset of node field within type to rewrite */
                u64 insert_off;
        };
        struct btf_struct_meta *kptr_struct_meta;
        u64 map_key_state; /* constant (32 bit) key tracking for maps */
        int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
        u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
        bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
        bool zext_dst; /* this insn zero extends dst reg */
        bool needs_zext; /* alu op needs to clear upper bits */
        bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
        bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
        bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */
        u8 alu_state; /* used in combination with alu_limit */

        /* below fields are initialized once */
        unsigned int orig_idx; /* original instruction index */
        bool jmp_point;
        bool prune_point;
        /* ensure we check state equivalence and save state checkpoint at
         * this instruction, regardless of any heuristics
         */
        bool force_checkpoint;
        /* true if instruction is a call to a helper function that
         * accepts callback function as a parameter.
         */
        bool calls_callback;
};

#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
#define MAX_USED_BTFS 64 /* max number of BTFs accessed by one BPF program */

/* size in bytes of the kbuf[] buffer embedded in struct bpf_verifier_log */
#define BPF_VERIFIER_TMP_LOG_SIZE        1024

/* Verifier log state: a user-supplied buffer plus position bookkeeping. */
struct bpf_verifier_log {
        /* Logical start and end positions of a "log window" of the verifier log.
         * start_pos == 0 means we haven't truncated anything.
         * Once truncation starts to happen, start_pos + len_total == end_pos,
         * except during log reset situations, in which (end_pos - start_pos)
         * might get smaller than len_total (see bpf_vlog_reset()).
         * Generally, (end_pos - start_pos) gives number of useful data in
         * user log buffer.
         */
        u64 start_pos;
        u64 end_pos;
        /* user-space log buffer */
        char __user *ubuf;
        /* log level; zero means logging is not needed (see
         * bpf_verifier_log_needed())
         */
        u32 level;
        /* NOTE(review): presumably the size of ubuf - confirm */
        u32 len_total;
        u32 len_max;
        /* kernel-side temporary buffer */
        char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
};

/* Log level bit flags and the masks composed from them */
#define BPF_LOG_LEVEL1        1
#define BPF_LOG_LEVEL2        2
#define BPF_LOG_STATS        4
#define BPF_LOG_FIXED        8
/* both verbosity-level bits */
#define BPF_LOG_LEVEL        (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2)
/* all of the flag bits defined above */
#define BPF_LOG_MASK        (BPF_LOG_LEVEL | BPF_LOG_STATS | BPF_LOG_FIXED)
#define BPF_LOG_KERNEL        (BPF_LOG_MASK + 1) /* kernel internal flag */
#define BPF_LOG_MIN_ALIGNMENT 8U
#define BPF_LOG_ALIGNMENT 40U

/* Report whether logging is requested: true only for a non-NULL @log whose
 * level is non-zero.
 */
static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
{
        if (!log)
                return false;

        return log->level != 0;
}

/* bpf_verifier_env->subprog_info[] is sized BPF_MAX_SUBPROGS + 2 */
#define BPF_MAX_SUBPROGS 256

/* Description of one subprogram argument; see bpf_subprog_info->args[]. */
struct bpf_subprog_arg_info {
        enum bpf_arg_type arg_type;
        /* mem_size and btf_id share storage.
         * NOTE(review): which one is valid presumably depends on arg_type -
         * confirm.
         */
        union {
                u32 mem_size;
                u32 btf_id;
        };
};

/* Per-subprogram information; indexed by subprog number in
 * bpf_verifier_env->subprog_info[].
 */
struct bpf_subprog_info {
        /* 'start' has to be the first field otherwise find_subprog() won't work */
        u32 start; /* insn idx of function entry point */
        u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
        u16 stack_depth; /* max. stack depth used by this function */
        u16 stack_extra;
        /* single-bit flags */
        bool has_tail_call: 1;
        bool tail_call_reachable: 1;
        bool has_ld_abs: 1;
        bool is_cb: 1;
        bool is_async_cb: 1;
        bool is_exception_cb: 1;
        bool args_cached: 1;

        /* NOTE(review): presumably the number of valid entries in args[] -
         * confirm.
         */
        u8 arg_cnt;
        struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
};

struct bpf_verifier_env;

/* Backtracking state, embedded in struct bpf_verifier_env as 'bt'. */
struct backtrack_state {
        struct bpf_verifier_env *env;
        /* NOTE(review): presumably the index of the frame currently being
         * processed - confirm.
         */
        u32 frame;
        /* one register mask and one stack slot mask per call frame */
        u32 reg_masks[MAX_CALL_FRAMES];
        u64 stack_masks[MAX_CALL_FRAMES];
};

/* One element of bpf_idmap->map[]: a pair of ids.
 * NOTE(review): presumably maps an id of an old state to an id of the
 * current state - confirm.
 */
struct bpf_id_pair {
        u32 old;
        u32 cur;
};

/* Table of id pairs with room for BPF_ID_MAP_SIZE entries; used as
 * bpf_verifier_env->idmap_scratch.
 */
struct bpf_idmap {
        /* NOTE(review): presumably a generator for temporary ids - confirm */
        u32 tmp_id_gen;
        struct bpf_id_pair map[BPF_ID_MAP_SIZE];
};

/* Array of ids with room for BPF_ID_MAP_SIZE entries; used as
 * bpf_verifier_env->idset_scratch.
 */
struct bpf_idset {
        /* NOTE(review): presumably the number of valid entries in ids[] -
         * confirm.
         */
        u32 count;
        u32 ids[BPF_ID_MAP_SIZE];
};

/* single container for all structs
 * one verifier_env per bpf_check() call
 */
struct bpf_verifier_env {
        u32 insn_idx;
        u32 prev_insn_idx;
        struct bpf_prog *prog;                /* eBPF program being verified */
        const struct bpf_verifier_ops *ops;
        struct module *attach_btf_mod;        /* The owner module of prog->aux->attach_btf */
        struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */
        int stack_size;                        /* number of states to be processed */
        bool strict_alignment;                /* perform strict pointer alignment checks */
        bool test_state_freq;                /* test verifier with different pruning frequency */
        bool test_reg_invariants;        /* fail verification on register invariants violations */
        struct bpf_verifier_state *cur_state; /* current verifier state */
        struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
        struct bpf_verifier_state_list *free_list;
        struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
        struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */
        u32 used_map_cnt;                /* number of used maps */
        u32 used_btf_cnt;                /* number of used BTF objects */
        u32 id_gen;                        /* used to generate unique reg IDs */
        u32 hidden_subprog_cnt;                /* number of hidden subprogs */
        int exception_callback_subprog;
        bool explore_alu_limits;
        bool allow_ptr_leaks;
        /* Allow access to uninitialized stack memory. Writes with fixed offset are
         * always allowed, so this refers to reads (with fixed or variable offset),
         * to writes with variable offset and to indirect (helper) accesses.
         */
        bool allow_uninit_stack;
        bool bpf_capable;
        bool bypass_spec_v1;
        bool bypass_spec_v4;
        bool seen_direct_write;
        bool seen_exception;
        struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
        const struct bpf_line_info *prev_linfo;
        struct bpf_verifier_log log;
        struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 2]; /* max + 2 for the fake and exception subprogs */
        /* the two scratch areas share storage, so only one of them can hold
         * live data at any time
         */
        union {
                struct bpf_idmap idmap_scratch;
                struct bpf_idset idset_scratch;
        };
        /* NOTE(review): presumably working state of the control flow graph
         * check - confirm.
         */
        struct {
                int *insn_state;
                int *insn_stack;
                int cur_stack;
        } cfg;
        struct backtrack_state bt;
        struct bpf_jmp_history_entry *cur_hist_ent;
        u32 pass_cnt; /* number of times do_check() was called */
        u32 subprog_cnt;
        /* number of instructions analyzed by the verifier */
        u32 prev_insn_processed, insn_processed;
        /* number of jmps, calls, exits analyzed so far */
        u32 prev_jmps_processed, jmps_processed;
        /* total verification time */
        u64 verification_time;
        /* maximum number of verifier states kept in 'branching' instructions */
        u32 max_states_per_insn;
        /* total number of allocated verifier states */
        u32 total_states;
        /* some states are freed during program analysis.
         * this is peak number of states. this number dominates kernel
         * memory consumption during verification
         */
        u32 peak_states;
        /* longest register parentage chain walked for liveness marking */
        u32 longest_mark_read_walk;
        bpfptr_t fd_array;

        /* bit mask to keep track of whether a register has been accessed
         * since the last time the function state was printed
         */
        u32 scratched_regs;
        /* Same as scratched_regs but for stack slots */
        u64 scratched_stack_slots;
        u64 prev_log_pos, prev_insn_print_pos;
        /* buffer used to temporary hold constants as scalar registers */
        struct bpf_reg_state fake_reg[2];
        /* buffer used to generate temporary string representations,
         * e.g., in reg_type_str() to generate reg_type string
         */
        char tmp_str_buf[TMP_STR_BUF_LEN];
};

static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog)
{
        return &env->prog->aux->func_info_aux[subprog];
}

static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog)
{
        return &env->subprog_info[subprog];
}

/* Verifier log interface. Only the declarations live in this header; all of
 * the writers below take a printf-style format string.
 */
__printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
                                      const char *fmt, va_list args);
__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
                                           const char *fmt, ...);
__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
                            const char *fmt, ...);
int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
                  char __user *log_buf, u32 log_size);
void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos);
int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual);

__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
                                  u32 insn_off,
                                  const char *prefix_fmt, ...);

static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
{
        struct bpf_verifier_state *cur = env->cur_state;

        return cur->frame[cur->curframe];
}

static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
{
        return cur_func(env)->regs;
}

/* Program offload hooks; declared here, defined outside this header. */
int bpf_prog_offload_verifier_prep(struct bpf_prog *prog);
int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
                                 int insn_idx, int prev_insn_idx);
int bpf_prog_offload_finalize(struct bpf_verifier_env *env);
void
bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off,
                              struct bpf_insn *insn);
void
bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt);

/* this lives here instead of in bpf.h because it needs to dereference tgt_prog
 *
 * Build a 64-bit key: the upper 32 bits hold an object id, the lower 32 bits
 * hold @btf_id. With a target program the object id is tgt_prog->aux->id;
 * otherwise it is btf_obj_id(@btf) and bit 31 of the key is set as well.
 */
static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog,
                                             struct btf *btf, u32 btf_id)
{
        u64 upper;

        if (!tgt_prog) {
                upper = (u64)btf_obj_id(btf) << 32;
                return upper | 0x80000000 | btf_id;
        }

        upper = (u64)tgt_prog->aux->id << 32;
        return upper | btf_id;
}

/* Split a key built by bpf_trampoline_compute_key(): the upper 32 bits go to
 * *@obj_id and the lower 31 bits to *@btf_id. Either output pointer may be
 * NULL, in which case that part is not stored.
 */
static inline void bpf_trampoline_unpack_key(u64 key, u32 *obj_id, u32 *btf_id)
{
        const u32 upper = key >> 32;
        const u32 lower = key & 0x7FFFFFFF;

        if (obj_id)
                *obj_id = upper;
        if (btf_id)
                *btf_id = lower;
}

/* Declared here, defined outside this header. */
int bpf_check_attach_target(struct bpf_verifier_log *log,
                            const struct bpf_prog *prog,
                            const struct bpf_prog *tgt_prog,
                            u32 btf_id,
                            struct bpf_attach_target_info *tgt_info);
void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab);

int mark_chain_precision(struct bpf_verifier_env *env, int regno);

/* mask covering the low BPF_BASE_TYPE_BITS bits of an extended type */
#define BPF_BASE_TYPE_MASK        GENMASK(BPF_BASE_TYPE_BITS - 1, 0)

/* Extract the base type from a bpf_{arg, return, reg}_type value by keeping
 * only the bits covered by BPF_BASE_TYPE_MASK.
 */
static inline u32 base_type(u32 type)
{
        u32 base = type;

        base &= BPF_BASE_TYPE_MASK;
        return base;
}

/* Extract the flags from an extended type, i.e. every bit that lies outside
 * BPF_BASE_TYPE_MASK. See bpf_type_flag in bpf.h.
 */
static inline u32 type_flag(u32 type)
{
        u32 flags = type;

        flags &= ~BPF_BASE_TYPE_MASK;
        return flags;
}

/* only use after check_attach_btf_id()
 *
 * Return the program type of @prog; for a BPF_PROG_TYPE_EXT program the type
 * of its destination program (prog->aux->dst_prog) is returned instead.
 */
static inline enum bpf_prog_type resolve_prog_type(const struct bpf_prog *prog)
{
        if (prog->type != BPF_PROG_TYPE_EXT)
                return prog->type;

        return prog->aux->dst_prog->type;
}

static inline bool bpf_prog_check_recur(const struct bpf_prog *prog)
{
        switch (resolve_prog_type(prog)) {
        case BPF_PROG_TYPE_TRACING:
                return prog->expected_attach_type != BPF_TRACE_ITER;
        case BPF_PROG_TYPE_STRUCT_OPS:
        case BPF_PROG_TYPE_LSM:
                return false;
        default:
                return true;
        }
}

/* type flags that are allowed by bpf_type_has_unsafe_modifiers() */
#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED | NON_OWN_REF)

/* True when @type carries at least one flag that is not part of
 * BPF_REG_TRUSTED_MODIFIERS.
 */
static inline bool bpf_type_has_unsafe_modifiers(u32 type)
{
        u32 other_flags = type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS;

        return other_flags != 0;
}

/* True when @type has base type PTR_TO_BTF_ID and the MEM_ALLOC flag set. */
static inline bool type_is_ptr_alloc_obj(u32 type)
{
        if (base_type(type) != PTR_TO_BTF_ID)
                return false;

        return (type_flag(type) & MEM_ALLOC) != 0;
}

/* True when @type satisfies type_is_ptr_alloc_obj() and additionally has the
 * NON_OWN_REF flag set.
 */
static inline bool type_is_non_owning_ref(u32 type)
{
        if (!type_is_ptr_alloc_obj(type))
                return false;

        return (type_flag(type) & NON_OWN_REF) != 0;
}

/* True when the base type of @type (type flags are ignored) is PTR_TO_PACKET
 * or PTR_TO_PACKET_META.
 */
static inline bool type_is_pkt_pointer(enum bpf_reg_type type)
{
        switch (base_type(type)) {
        case PTR_TO_PACKET:
        case PTR_TO_PACKET_META:
                return true;
        default:
                return false;
        }
}

/* True when @type is exactly one of the socket pointer types listed below.
 * The comparison is done on the full value: type flags are not masked off.
 */
static inline bool type_is_sk_pointer(enum bpf_reg_type type)
{
        switch (type) {
        case PTR_TO_SOCKET:
        case PTR_TO_SOCK_COMMON:
        case PTR_TO_TCP_SOCK:
        case PTR_TO_XDP_SOCK:
                return true;
        default:
                return false;
        }
}

/* Set bit @regno in env->scratched_regs. */
static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
{
        const u32 bit = 1U << regno;

        env->scratched_regs = env->scratched_regs | bit;
}

/* Set bit @spi in env->scratched_stack_slots. */
static inline void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi)
{
        const u64 bit = 1ULL << spi;

        env->scratched_stack_slots = env->scratched_stack_slots | bit;
}

/* Test whether bit @regno is set in env->scratched_regs. */
static inline bool reg_scratched(const struct bpf_verifier_env *env, u32 regno)
{
        u32 shifted = env->scratched_regs >> regno;

        return (shifted & 1) != 0;
}

/* Test whether a bit is set in env->scratched_stack_slots. Despite its name,
 * @regno is used as the bit index into the stack slot mask here.
 */
static inline bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno)
{
        u64 shifted = env->scratched_stack_slots >> regno;

        return (shifted & 1) != 0;
}

/* True when at least one register bit or stack slot bit is marked scratched. */
static inline bool verifier_state_scratched(const struct bpf_verifier_env *env)
{
        if (env->scratched_regs)
                return true;

        return env->scratched_stack_slots != 0;
}

/* Clear every bit in both scratch masks. */
static inline void mark_verifier_state_clean(struct bpf_verifier_env *env)
{
        env->scratched_stack_slots = 0ULL;
        env->scratched_regs = 0U;
}

/* Used for printing the entire verifier state: sets every bit in both
 * scratch masks.
 */
static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env)
{
        env->scratched_stack_slots = ~0ULL;
        env->scratched_regs = ~0U;
}

/* True when @off is a multiple of BPF_REG_SIZE. On big-endian builds @off is
 * first reduced by (@spill_size - @fill_size); on other builds the two size
 * arguments are not used.
 */
static inline bool bpf_stack_narrow_access_ok(int off, int fill_size, int spill_size)
{
        int adjusted = off;

#ifdef __BIG_ENDIAN
        adjusted -= spill_size - fill_size;
#endif

        return adjusted % BPF_REG_SIZE == 0;
}

/* Helpers returning string representations, and state printers; declared
 * here, defined outside this header.
 */
const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type);
const char *dynptr_type_str(enum bpf_dynptr_type type);
const char *iter_type_str(const struct btf *btf, u32 btf_id);
const char *iter_state_str(enum bpf_iter_state state);

void print_verifier_state(struct bpf_verifier_env *env,
                          const struct bpf_func_state *state, bool print_all);
void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state);

#endif /* _LINUX_BPF_VERIFIER_H */


























    3 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NF_CONNTRACK_SEQADJ_H
#define _NF_CONNTRACK_SEQADJ_H

#include <net/netfilter/nf_conntrack_extend.h>

/**
 * struct nf_ct_seqadj - sequence number adjustment information
 *
 * @correction_pos: position of the last TCP sequence number modification
 * @offset_before: sequence number offset before last modification
 * @offset_after: sequence number offset after last modification
 *
 * struct nf_conn_seqadj holds IP_CT_DIR_MAX of these in its seq[] array.
 */
struct nf_ct_seqadj {
        u32                correction_pos;
        s32                offset_before;
        s32                offset_after;
};

/* Conntrack extension data looked up / added under NF_CT_EXT_SEQADJ by
 * nfct_seqadj() and nfct_seqadj_ext_add().
 */
struct nf_conn_seqadj {
        /* IP_CT_DIR_MAX entries; NOTE(review): presumably indexed by
         * enum ip_conntrack_dir - confirm.
         */
        struct nf_ct_seqadj        seq[IP_CT_DIR_MAX];
};

/* Return whatever nf_ct_ext_find() yields for the NF_CT_EXT_SEQADJ extension
 * of @ct.
 */
static inline struct nf_conn_seqadj *nfct_seqadj(const struct nf_conn *ct)
{
        struct nf_conn_seqadj *seqadj = nf_ct_ext_find(ct, NF_CT_EXT_SEQADJ);

        return seqadj;
}

/* Request the NF_CT_EXT_SEQADJ extension for @ct via nf_ct_ext_add(), using
 * GFP_ATOMIC, and return the result of that call.
 */
static inline struct nf_conn_seqadj *nfct_seqadj_ext_add(struct nf_conn *ct)
{
        struct nf_conn_seqadj *seqadj;

        seqadj = nf_ct_ext_add(ct, NF_CT_EXT_SEQADJ, GFP_ATOMIC);
        return seqadj;
}

/* Sequence adjustment API; declared here, defined outside this header. */
int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                      s32 off);
int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                     __be32 seq, s32 off);
void nf_ct_tcp_seqadj_set(struct sk_buff *skb, struct nf_conn *ct,
                          enum ip_conntrack_info ctinfo, s32 off);

int nf_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
                     enum ip_conntrack_info ctinfo, unsigned int protoff);
/* Declared here, defined outside this header. The direction parameter is
 * named like every other parameter in this file's prototypes.
 */
s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir dir,
                     u32 seq);

#endif /* _NF_CONNTRACK_SEQADJ_H */


















































    1 














    1 






    1 
















































































    1 











    1 





















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
 * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
 * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2A: Instruction Set Reference, A-M
 *
 * Copyright (C) 2008 Intel Corporation
 * Authors: Austin Zhang <austin_zhang@linux.intel.com>
 *          Kent Liu <kent.liu@intel.com>
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>

#include <asm/cpufeatures.h>
#include <asm/cpu_device_id.h>
#include <asm/simd.h>

/* Values advertised as .cra_blocksize and .digestsize in the shash_alg below */
#define CHKSUM_BLOCK_SIZE        1
#define CHKSUM_DIGEST_SIZE        4

/* Bytes folded into the CRC by one word-sized CRC32_INST in crc32c_intel_le_hw() */
#define SCALE_F        sizeof(unsigned long)

/* Word-sized CRC32 instruction: crc32q on 64-bit builds, crc32l otherwise */
#ifdef CONFIG_X86_64
#define CRC32_INST "crc32q %1, %q0"
#else
#define CRC32_INST "crc32l %1, %0"
#endif

#ifdef CONFIG_X86_64
/*
 * Use the carryless multiply version of crc32c when the buffer
 * size is >= 512, to account for the FPU state save/restore overhead.
 */
#define CRC32C_PCL_BREAKEVEN        512

/* Carryless-multiply implementation; defined outside this file */
asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
                                unsigned int crc_init);
#endif /* CONFIG_X86_64 */

/*
 * Fold @length bytes starting at @data into @crc, one byte per crc32b
 * instruction, and return the updated CRC value.
 */
static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
{
        unsigned char const *end = data + length;

        for (; data != end; data++)
                asm("crc32b %1, %0"
                    : "+r" (crc) : "rm" (*data));

        return crc;
}

/*
 * Fold @len bytes at @p into @crc using the hardware CRC32 instruction.
 *
 * The buffer is consumed in SCALE_F-byte words with CRC32_INST; any
 * trailing bytes that do not fill a whole word are handed to
 * crc32c_intel_le_hw_byte().  Returns the updated CRC value.
 *
 * The word/tail counters are size_t so that a large @len is never
 * narrowed, and the input pointer keeps its const qualification.
 */
static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
{
        size_t nwords = len / SCALE_F;
        size_t ntail = len % SCALE_F;
        const unsigned long *word = (const unsigned long *)p;

        while (nwords--) {
                asm(CRC32_INST
                    : "+r" (crc) : "rm" (*word));
                word++;
        }

        /* Bytes left over after the last full word */
        if (ntail)
                crc = crc32c_intel_le_hw_byte(crc, (const unsigned char *)word,
                                              ntail);

        return crc;
}

/*
 * Setting the seed allows arbitrary accumulators and a flexible XOR policy.
 * If your algorithm starts with ~0, then XOR with ~0 before you set
 * the seed.
 *
 * @key must be exactly sizeof(u32) bytes and is read as a little-endian
 * 32-bit value; any other @keylen yields -EINVAL.
 */
static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key,
                        unsigned int keylen)
{
        u32 *seed;

        if (keylen != sizeof(u32))
                return -EINVAL;

        seed = crypto_shash_ctx(hash);
        *seed = le32_to_cpup((__le32 *)key);
        return 0;
}

/*
 * shash .init handler: start a new computation by copying the
 * transform's seed into the per-request CRC state.
 */
static int crc32c_intel_init(struct shash_desc *desc)
{
        const u32 *seed = crypto_shash_ctx(desc->tfm);
        u32 *state = shash_desc_ctx(desc);

        *state = *seed;
        return 0;
}

/*
 * shash .update handler: fold @len bytes of @data into the running
 * CRC kept in the descriptor context.
 */
static int crc32c_intel_update(struct shash_desc *desc, const u8 *data,
                               unsigned int len)
{
        u32 *state = shash_desc_ctx(desc);
        u32 crc = *state;

        crc = crc32c_intel_le_hw(crc, data, len);
        *state = crc;
        return 0;
}

/*
 * Fold @len bytes of @data into the CRC at *@crcp (which is left
 * untouched) and store the bit-inverted, little-endian result in @out.
 */
static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
                                u8 *out)
{
        u32 crc = crc32c_intel_le_hw(*crcp, data, len);
        __le32 *digest = (__le32 *)out;

        *digest = ~cpu_to_le32(crc);
        return 0;
}

/*
 * shash .finup handler: finish the computation held in the descriptor
 * context with a final chunk of @data and write the digest to @out.
 */
static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data,
                              unsigned int len, u8 *out)
{
        u32 *state = shash_desc_ctx(desc);

        return __crc32c_intel_finup(state, data, len, out);
}

/*
 * shash .final handler: emit the bit-inverted running CRC from the
 * descriptor context as a little-endian 32-bit digest in @out.
 */
static int crc32c_intel_final(struct shash_desc *desc, u8 *out)
{
        u32 *state = shash_desc_ctx(desc);
        __le32 *digest = (__le32 *)out;

        *digest = ~cpu_to_le32p(state);
        return 0;
}

/*
 * shash .digest handler: one-shot CRC of @data, seeded directly from
 * the transform context rather than from the descriptor state.
 */
static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data,
                               unsigned int len, u8 *out)
{
        u32 *seed = crypto_shash_ctx(desc->tfm);

        return __crc32c_intel_finup(seed, data, len, out);
}

/*
 * .cra_init handler: give a freshly created transform the default
 * all-ones seed.
 */
static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
{
        u32 *seed = crypto_tfm_ctx(tfm);

        *seed = ~0;
        return 0;
}

#ifdef CONFIG_X86_64
/*
 * shash .update handler used when PCLMULQDQ is available: fold @len
 * bytes of @data into the running CRC in the descriptor context.
 */
static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
                               unsigned int len)
{
        u32 *state = shash_desc_ctx(desc);

        /*
         * The PCL version only pays off once the data is large enough
         * to overcome the kernel fpu state save/restore overhead, and
         * it may only run when SIMD is usable; otherwise stay on the
         * plain CRC32-instruction path.
         */
        if (len < CRC32C_PCL_BREAKEVEN || !crypto_simd_usable()) {
                *state = crc32c_intel_le_hw(*state, data, len);
                return 0;
        }

        kernel_fpu_begin();
        *state = crc_pcl(data, len, *state);
        kernel_fpu_end();
        return 0;
}

/*
 * Fold @len bytes of @data into the CRC at *@crcp (left untouched) and
 * store the bit-inverted, little-endian result in @out.  crc_pcl() is
 * used for buffers of at least CRC32C_PCL_BREAKEVEN bytes when SIMD is
 * usable; everything else goes through crc32c_intel_le_hw().
 */
static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
                                u8 *out)
{
        __le32 *digest = (__le32 *)out;
        u32 crc;

        if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
                kernel_fpu_begin();
                crc = crc_pcl(data, len, *crcp);
                kernel_fpu_end();
        } else {
                crc = crc32c_intel_le_hw(*crcp, data, len);
        }

        *digest = ~cpu_to_le32(crc);
        return 0;
}

/*
 * shash .finup handler used when PCLMULQDQ is available: finish the
 * computation held in the descriptor context with a final @data chunk.
 */
static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data,
                              unsigned int len, u8 *out)
{
        u32 *state = shash_desc_ctx(desc);

        return __crc32c_pcl_intel_finup(state, data, len, out);
}

/*
 * shash .digest handler used when PCLMULQDQ is available: one-shot CRC
 * of @data, seeded directly from the transform context.
 */
static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data,
                               unsigned int len, u8 *out)
{
        u32 *seed = crypto_shash_ctx(desc->tfm);

        return __crc32c_pcl_intel_finup(seed, data, len, out);
}
#endif /* CONFIG_X86_64 */

/*
 * shash algorithm descriptor registered as "crc32c" / "crc32c-intel".
 *
 * Deliberately not const: crc32c_intel_mod_init() replaces .update,
 * .finup and .digest with the crc32c_pcl_* variants before registration
 * when the CPU has PCLMULQDQ (64-bit builds only).  Both the per-request
 * descriptor state and the per-transform context are a single u32.
 */
static struct shash_alg alg = {
        .setkey                        =        crc32c_intel_setkey,
        .init                        =        crc32c_intel_init,
        .update                        =        crc32c_intel_update,
        .final                        =        crc32c_intel_final,
        .finup                        =        crc32c_intel_finup,
        .digest                        =        crc32c_intel_digest,
        .descsize                =        sizeof(u32),
        .digestsize                =        CHKSUM_DIGEST_SIZE,
        .base                        =        {
                .cra_name                =        "crc32c",
                .cra_driver_name        =        "crc32c-intel",
                .cra_priority                =        200,
                .cra_flags                =        CRYPTO_ALG_OPTIONAL_KEY,
                .cra_blocksize                =        CHKSUM_BLOCK_SIZE,
                .cra_ctxsize                =        sizeof(u32),
                .cra_module                =        THIS_MODULE,
                .cra_init                =        crc32c_intel_cra_init,
        }
};

/*
 * CPU match table: the module only applies to CPUs advertising
 * X86_FEATURE_XMM4_2.  Checked in crc32c_intel_mod_init() and exported
 * through MODULE_DEVICE_TABLE.
 */
static const struct x86_cpu_id crc32c_cpu_id[] = {
        X86_MATCH_FEATURE(X86_FEATURE_XMM4_2, NULL),
        {}
};
MODULE_DEVICE_TABLE(x86cpu, crc32c_cpu_id);

/*
 * Module init: register the crc32c shash algorithm.
 *
 * Returns -ENODEV when the CPU does not match crc32c_cpu_id, otherwise
 * the result of crypto_register_shash().
 */
static int __init crc32c_intel_mod_init(void)
{
        if (!x86_match_cpu(crc32c_cpu_id))
                return -ENODEV;
#ifdef CONFIG_X86_64
        /* With PCLMULQDQ, switch the bulk handlers to the crc_pcl() variants */
        if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
                alg.update = crc32c_pcl_intel_update;
                alg.finup = crc32c_pcl_intel_finup;
                alg.digest = crc32c_pcl_intel_digest;
        }
#endif
        return crypto_register_shash(&alg);
}

/* Module exit: unregister the algorithm registered by crc32c_intel_mod_init() */
static void __exit crc32c_intel_mod_fini(void)
{
        crypto_unregister_shash(&alg);
}

module_init(crc32c_intel_mod_init);
module_exit(crc32c_intel_mod_fini);

MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>");
MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
MODULE_LICENSE("GPL");

MODULE_ALIAS_CRYPTO("crc32c");
MODULE_ALIAS_CRYPTO("crc32c-intel");



















































































    1 








    1 

























































































































































































































































































    1 





























    1 


    1 
    1 

























































































































































































































    1 




    1 





    1 


























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
/*
 * net/tipc/msg.c: TIPC message header routines
 *
 * Copyright (c) 2000-2006, 2014-2015, Ericsson AB
 * Copyright (c) 2005, 2010-2011, Wind River Systems
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <net/sock.h>
#include "core.h"
#include "msg.h"
#include "addr.h"
#include "name_table.h"
#include "crypto.h"

/* Align a length to a 4-byte boundary (used for offsets inside bundles) */
#define BUF_ALIGN(x) ALIGN(x, 4)
/* Upper bound on the data size considered by tipc_msg_reverse() */
#define MAX_FORWARD_SIZE 1024
/*
 * BUF_HEADROOM: bytes reserved in front of the message by tipc_buf_acquire().
 * BUF_OVERHEAD: bytes allocated on top of the message size; with
 * CONFIG_TIPC_CRYPTO this additionally covers TIPC_AES_GCM_TAG_SIZE.
 */
#ifdef CONFIG_TIPC_CRYPTO
#define BUF_HEADROOM ALIGN(((LL_MAX_HEADER + 48) + EHDR_MAX_SIZE), 16)
#define BUF_OVERHEAD (BUF_HEADROOM + TIPC_AES_GCM_TAG_SIZE)
#else
#define BUF_HEADROOM (LL_MAX_HEADER + 48)
#define BUF_OVERHEAD BUF_HEADROOM
#endif

/*
 * Packet size used by tipc_msg_build() as the fallback MTU when a
 * MAX_MSG_SIZE buffer cannot be allocated: PAGE_SIZE minus the aligned
 * buffer overhead and the aligned skb_shared_info.
 */
const int one_page_mtu = PAGE_SIZE - SKB_DATA_ALIGN(BUF_OVERHEAD) -
                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

/**
 * tipc_buf_acquire - creates a TIPC message buffer
 * @size: message size (including TIPC header)
 * @gfp: memory allocation flags
 *
 * Return: a new buffer with data pointers set to the specified size,
 * or NULL if the allocation failed.
 *
 * NOTE:
 * BUF_HEADROOM bytes are reserved in front of the message to allow
 * prepending of a data link header.
 * There may also be unrequested tailroom present at the buffer's end.
 */
struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp)
{
        struct sk_buff *buf = alloc_skb_fclone(BUF_OVERHEAD + size, gfp);

        if (!buf)
                return NULL;

        skb_reserve(buf, BUF_HEADROOM);
        skb_put(buf, size);
        buf->next = NULL;
        return buf;
}

/*
 * tipc_msg_init - initialise a TIPC message header in place
 * @own_node: node id stored as previous node (and as originating node
 *            for headers larger than SHORT_H_SIZE)
 * @m: header area to initialise; the first @hsize bytes are zeroed
 * @user: message user
 * @type: message type
 * @hsize: header size; also used as the initial message size
 * @dnode: destination node, only stored for headers larger than SHORT_H_SIZE
 */
void tipc_msg_init(u32 own_node, struct tipc_msg *m, u32 user, u32 type,
                   u32 hsize, u32 dnode)
{
        memset(m, 0, hsize);
        msg_set_version(m);
        msg_set_user(m, user);
        msg_set_hdr_sz(m, hsize);
        msg_set_size(m, hsize);
        msg_set_prevnode(m, own_node);
        msg_set_type(m, type);

        /* Short headers get no origin/destination node fields */
        if (hsize <= SHORT_H_SIZE)
                return;

        msg_set_orignode(m, own_node);
        msg_set_destnode(m, dnode);
}

/*
 * tipc_msg_create - allocate a buffer and initialise its message header
 *
 * The buffer is sized for @hdr_sz + @data_sz bytes and allocated with
 * GFP_ATOMIC.  The header is set up through tipc_msg_init() and then
 * given the total size, the origin/destination ports and @errcode.
 * The data area itself is not written here.
 *
 * Return: the new buffer, or NULL if allocation failed.
 */
struct sk_buff *tipc_msg_create(uint user, uint type,
                                uint hdr_sz, uint data_sz, u32 dnode,
                                u32 onode, u32 dport, u32 oport, int errcode)
{
        struct sk_buff *skb = tipc_buf_acquire(hdr_sz + data_sz, GFP_ATOMIC);
        struct tipc_msg *hdr;

        if (unlikely(!skb))
                return NULL;

        hdr = buf_msg(skb);
        tipc_msg_init(onode, hdr, user, type, hdr_sz, dnode);
        msg_set_size(hdr, hdr_sz + data_sz);
        msg_set_origport(hdr, oport);
        msg_set_destport(hdr, dport);
        msg_set_errcode(hdr, errcode);
        return skb;
}

/* tipc_buf_append(): Append a buffer to the fragment list of another buffer
 * @*headbuf: in:  NULL for first frag, otherwise value returned from prev call
 *            out: set when successful non-complete reassembly, otherwise NULL
 * @*buf:     in:  the buffer to append. Always defined
 *            out: head buf after successful complete reassembly, otherwise NULL
 * Returns 1 when reassembly complete, otherwise 0
 *
 * On any error both the fragment and the partially reassembled head are
 * freed and *buf / *headbuf are set to NULL.
 */
int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf)
{
        struct sk_buff *head = *headbuf;
        struct sk_buff *frag = *buf;
        struct sk_buff *tail = NULL;
        struct tipc_msg *msg;
        u32 fragid;
        int delta;
        bool headstolen;

        if (!frag)
                goto err;

        msg = buf_msg(frag);
        fragid = msg_type(msg);
        frag->next = NULL;
        skb_pull(frag, msg_hdr_sz(msg));

        if (fragid == FIRST_FRAGMENT) {
                /* A first fragment must not arrive mid-reassembly */
                if (unlikely(head))
                        goto err;
                if (skb_has_frag_list(frag) && __skb_linearize(frag))
                        goto err;
                /* From here on the fragment is reached through 'frag' only,
                 * so the error path must not see it via *buf.
                 */
                *buf = NULL;
                frag = skb_unshare(frag, GFP_ATOMIC);
                if (unlikely(!frag))
                        goto err;
                head = *headbuf = frag;
                TIPC_SKB_CB(head)->tail = NULL;
                return 0;
        }

        /* A non-first fragment needs a reassembly already in progress */
        if (!head)
                goto err;

        /* Either the input skb ownership is transferred to headskb
         * or the input skb is freed, clear the reference to avoid
         * bad access on error path.
         */
        *buf = NULL;
        if (skb_try_coalesce(head, frag, &headstolen, &delta)) {
                kfree_skb_partial(frag, headstolen);
        } else {
                /* Could not merge: chain the fragment onto head's frag_list */
                tail = TIPC_SKB_CB(head)->tail;
                if (!skb_has_frag_list(head))
                        skb_shinfo(head)->frag_list = frag;
                else
                        tail->next = frag;
                head->truesize += frag->truesize;
                head->data_len += frag->len;
                head->len += frag->len;
                TIPC_SKB_CB(head)->tail = frag;
        }

        if (fragid == LAST_FRAGMENT) {
                TIPC_SKB_CB(head)->validated = 0;
                if (unlikely(!tipc_msg_validate(&head))) {
                        /* tipc_msg_validate() may have freed the original
                         * head and replaced it with a copy before failing.
                         * Point *headbuf at the buffer that is still alive
                         * so the error path neither frees the old one twice
                         * nor leaks the copy.
                         */
                        *headbuf = head;
                        goto err;
                }
                *buf = head;
                TIPC_SKB_CB(head)->tail = NULL;
                *headbuf = NULL;
                return 1;
        }
        return 0;
err:
        kfree_skb(*buf);
        kfree_skb(*headbuf);
        *buf = *headbuf = NULL;
        return 0;
}

/**
 * tipc_msg_append(): Append data to tail of an existing buffer queue
 * @_hdr: header to be used
 * @m: the data to be appended
 * @dlen: size of data to be appended
 * @mss: max allowable size of buffer
 * @txq: queue to append to
 *
 * Data is first copied into the last buffer of @txq while it has room
 * below @mss; further buffers of size @mss are allocated and queued as
 * needed, each starting with a MIN_H_SIZE copy of @_hdr.
 *
 * Return: the number of 1k blocks appended or errno value
 * (-ENOMEM on allocation failure, -EFAULT if the copy from @m fails)
 */
int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen,
                    int mss, struct sk_buff_head *txq)
{
        struct sk_buff *skb;
        int accounted, total, curr;
        int mlen, cpy, rem = dlen;
        struct tipc_msg *hdr;

        /* Blocks already accounted for by the current tail buffer, if any */
        skb = skb_peek_tail(txq);
        accounted = skb ? msg_blocks(buf_msg(skb)) : 0;
        total = accounted;

        do {
                /* No tail buffer, or tail buffer full: start a new one */
                if (!skb || skb->len >= mss) {
                        skb = tipc_buf_acquire(mss, GFP_KERNEL);
                        if (unlikely(!skb))
                                return -ENOMEM;
                        skb_orphan(skb);
                        /* Shrink to header only; data is added by skb_put() below */
                        skb_trim(skb, MIN_H_SIZE);
                        hdr = buf_msg(skb);
                        skb_copy_to_linear_data(skb, _hdr, MIN_H_SIZE);
                        msg_set_hdr_sz(hdr, MIN_H_SIZE);
                        msg_set_size(hdr, MIN_H_SIZE);
                        __skb_queue_tail(txq, skb);
                        total += 1;
                }
                hdr = buf_msg(skb);
                curr = msg_blocks(hdr);
                mlen = msg_size(hdr);
                /* Copy as much as still fits below mss */
                cpy = min_t(size_t, rem, mss - mlen);
                if (cpy != copy_from_iter(skb->data + mlen, cpy, &m->msg_iter))
                        return -EFAULT;
                msg_set_size(hdr, mlen + cpy);
                skb_put(skb, cpy);
                rem -= cpy;
                /* Account only for the growth of this buffer */
                total += msg_blocks(hdr) - curr;
        } while (rem > 0);
        return total - accounted;
}

/* tipc_msg_validate - validate basic format of received message
 *
 * This routine ensures a TIPC message has an acceptable header, and at least
 * as much data as the header indicates it should.  The routine also ensures
 * that the entire message header is stored in the main fragment of the message
 * buffer, to simplify future access to message header fields.
 *
 * If the buffer's truesize is at least four times its rounded-up length, the
 * buffer is first replaced by a copy: the original is freed and *_skb is
 * updated to point at the copy.
 *
 * Note: Having extra info present in the message header or data areas is OK.
 * TIPC will ignore the excess, under the assumption that it is optional info
 * introduced by a later release of the protocol.
 */
bool tipc_msg_validate(struct sk_buff **_skb)
{
        struct sk_buff *skb = *_skb;
        struct tipc_msg *hdr;
        int msz, hsz;

        /* Ensure that flow control ratio condition is satisfied */
        if (unlikely(skb->truesize / buf_roundup_len(skb) >= 4)) {
                struct sk_buff *copy;

                copy = skb_copy_expand(skb, BUF_HEADROOM, 0, GFP_ATOMIC);
                if (!copy)
                        return false;
                kfree_skb(skb);
                skb = copy;
                *_skb = copy;
        }

        /* Nothing more to do for a buffer that was validated before */
        if (unlikely(TIPC_SKB_CB(skb)->validated))
                return true;

        /* The minimal header must be linear before its fields are read */
        if (unlikely(!pskb_may_pull(skb, MIN_H_SIZE)))
                return false;

        hsz = msg_hdr_sz(buf_msg(skb));
        if (unlikely(hsz < MIN_H_SIZE) || hsz > MAX_H_SIZE)
                return false;
        if (unlikely(!pskb_may_pull(skb, hsz)))
                return false;

        /* Re-read the header pointer after the second pull */
        hdr = buf_msg(skb);
        if (unlikely(msg_version(hdr) != TIPC_VERSION))
                return false;

        msz = msg_size(hdr);
        if (unlikely(msz < hsz) ||
            unlikely((msz - hsz) > TIPC_MAX_USER_MSG_SIZE) ||
            unlikely(skb->len < msz))
                return false;

        TIPC_SKB_CB(skb)->validated = 1;
        return true;
}

/**
 * tipc_msg_fragment - build a fragment skb list for TIPC message
 *
 * @skb: TIPC message skb
 * @hdr: internal msg header to be put on the top of the fragments
 * @pktmax: max size of a fragment incl. the header
 * @frags: returned fragment skb list
 *
 * Each fragment consists of an INT_H_SIZE copy of @hdr, updated with the
 * fragment number, the fragment count and the fragment size, followed by
 * up to (@pktmax - INT_H_SIZE) bytes of the original message.
 *
 * Return: 0 if the fragmentation is successful, otherwise: -EINVAL
 * (message already fits into one packet, or @pktmax equals INT_H_SIZE)
 * or -ENOMEM (in which case @frags is purged and re-initialised)
 */
int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr,
                      int pktmax, struct sk_buff_head *frags)
{
        int pktno, nof_fragms, dsz, dmax, eat;
        struct tipc_msg *_hdr;
        struct sk_buff *_skb;
        u8 *data;

        /* Non-linear buffer? */
        if (skb_linearize(skb))
                return -ENOMEM;

        data = (u8 *)skb->data;
        dsz = msg_size(buf_msg(skb));
        /* Payload capacity of one fragment */
        dmax = pktmax - INT_H_SIZE;
        if (dsz <= dmax || !dmax)
                return -EINVAL;

        /* NOTE(review): when dsz is an exact multiple of dmax the last
         * fragment gets eat == 0, i.e. it carries the header only --
         * confirm this is intended.
         */
        nof_fragms = dsz / dmax + 1;
        for (pktno = 1; pktno <= nof_fragms; pktno++) {
                /* All fragments but the last are filled completely */
                if (pktno < nof_fragms)
                        eat = dmax;
                else
                        eat = dsz % dmax;
                /* Allocate a new fragment */
                _skb = tipc_buf_acquire(INT_H_SIZE + eat, GFP_ATOMIC);
                if (!_skb)
                        goto error;
                skb_orphan(_skb);
                __skb_queue_tail(frags, _skb);
                /* Copy header & data to the fragment */
                skb_copy_to_linear_data(_skb, hdr, INT_H_SIZE);
                skb_copy_to_linear_data_offset(_skb, INT_H_SIZE, data, eat);
                data += eat;
                /* Update the fragment's header */
                _hdr = buf_msg(_skb);
                msg_set_fragm_no(_hdr, pktno);
                msg_set_nof_fragms(_hdr, nof_fragms);
                msg_set_size(_hdr, INT_H_SIZE + eat);
        }
        return 0;

error:
        /* Drop everything queued so far and leave @frags empty */
        __skb_queue_purge(frags);
        __skb_queue_head_init(frags);
        return -ENOMEM;
}

/**
 * tipc_msg_build - create buffer chain containing specified header and data
 * @mhdr: Message header, to be prepended to data
 * @m: User message
 * @offset: buffer offset for fragmented messages (FIXME)
 * @dsz: Total length of user data
 * @pktmax: Max packet size that can be used
 * @list: Buffer or chain of buffers to be returned to caller
 *
 * A message that fits into @pktmax is built as a single buffer.  Otherwise
 * it is split into MSG_FRAGMENTER packets: the first one carries the
 * fragment header, @mhdr and the first part of the data, the following
 * ones a fragment header plus data.
 *
 * Note that the recursive call we are making here is safe, since it can
 * logically go only one further level down.
 *
 * Return: message data size or errno: -ENOMEM, -EFAULT
 */
int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset,
                   int dsz, int pktmax, struct sk_buff_head *list)
{
        int mhsz = msg_hdr_sz(mhdr);
        struct tipc_msg pkthdr;
        int msz = mhsz + dsz;
        int pktrem = pktmax;
        struct sk_buff *skb;
        int drem = dsz;
        int pktno = 1;
        char *pktpos;
        int pktsz;
        int rc;

        msg_set_size(mhdr, msz);

        /* No fragmentation needed? */
        if (likely(msz <= pktmax)) {
                skb = tipc_buf_acquire(msz, GFP_KERNEL);

                /* Fall back to smaller MTU if node local message */
                if (unlikely(!skb)) {
                        if (pktmax != MAX_MSG_SIZE)
                                return -ENOMEM;
                        /* Build with one_page_mtu, then reassemble the result */
                        rc = tipc_msg_build(mhdr, m, offset, dsz,
                                            one_page_mtu, list);
                        if (rc != dsz)
                                return rc;
                        if (tipc_msg_assemble(list))
                                return dsz;
                        return -ENOMEM;
                }
                skb_orphan(skb);
                __skb_queue_tail(list, skb);
                skb_copy_to_linear_data(skb, mhdr, mhsz);
                pktpos = skb->data + mhsz;
                if (copy_from_iter_full(pktpos, dsz, &m->msg_iter))
                        return dsz;
                rc = -EFAULT;
                goto error;
        }

        /* Prepare reusable fragment header */
        tipc_msg_init(msg_prevnode(mhdr), &pkthdr, MSG_FRAGMENTER,
                      FIRST_FRAGMENT, INT_H_SIZE, msg_destnode(mhdr));
        msg_set_size(&pkthdr, pktmax);
        msg_set_fragm_no(&pkthdr, pktno);
        msg_set_importance(&pkthdr, msg_importance(mhdr));

        /* Prepare first fragment */
        skb = tipc_buf_acquire(pktmax, GFP_KERNEL);
        if (!skb)
                return -ENOMEM;
        skb_orphan(skb);
        __skb_queue_tail(list, skb);
        pktpos = skb->data;
        /* Layout of first fragment: fragment header, original header, data */
        skb_copy_to_linear_data(skb, &pkthdr, INT_H_SIZE);
        pktpos += INT_H_SIZE;
        pktrem -= INT_H_SIZE;
        skb_copy_to_linear_data_offset(skb, INT_H_SIZE, mhdr, mhsz);
        pktpos += mhsz;
        pktrem -= mhsz;

        do {
                /* Never copy more than the data that is left */
                if (drem < pktrem)
                        pktrem = drem;

                if (!copy_from_iter_full(pktpos, pktrem, &m->msg_iter)) {
                        rc = -EFAULT;
                        goto error;
                }
                drem -= pktrem;

                if (!drem)
                        break;

                /* Prepare new fragment: */
                if (drem < (pktmax - INT_H_SIZE))
                        pktsz = drem + INT_H_SIZE;
                else
                        pktsz = pktmax;
                skb = tipc_buf_acquire(pktsz, GFP_KERNEL);
                if (!skb) {
                        rc = -ENOMEM;
                        goto error;
                }
                skb_orphan(skb);
                __skb_queue_tail(list, skb);
                msg_set_type(&pkthdr, FRAGMENT);
                msg_set_size(&pkthdr, pktsz);
                msg_set_fragm_no(&pkthdr, ++pktno);
                skb_copy_to_linear_data(skb, &pkthdr, INT_H_SIZE);
                pktpos = skb->data + INT_H_SIZE;
                pktrem = pktsz - INT_H_SIZE;

        } while (1);
        /* The buffer filled last is re-marked as the final fragment */
        msg_set_type(buf_msg(skb), LAST_FRAGMENT);
        return dsz;
error:
        /* Drop everything queued so far and leave @list empty */
        __skb_queue_purge(list);
        __skb_queue_head_init(list);
        return rc;
}

/**
 * tipc_msg_bundle - Append contents of a buffer to tail of an existing one
 * @bskb: the bundle buffer to append to
 * @msg: message to be appended
 * @max: max allowable size for the bundle buffer
 *
 * The message is placed at the next 4-byte aligned offset inside the
 * bundle, after which the bundle's size and message count are updated.
 *
 * Return: "true" if bundling has been performed, otherwise "false"
 */
static bool tipc_msg_bundle(struct sk_buff *bskb, struct tipc_msg *msg,
                            u32 max)
{
        struct tipc_msg *bhdr = buf_msg(bskb);
        u32 msz = msg_size(msg);
        u32 bsz = msg_size(bhdr);
        u32 start = BUF_ALIGN(bsz);
        u32 pad = start - bsz;

        /* Needs room in the buffer itself ... */
        if (unlikely(skb_tailroom(bskb) < (pad + msz)))
                return false;
        /* ... and must stay within the size limit of the bundle */
        if (unlikely(max < (start + msz)))
                return false;

        skb_put(bskb, pad + msz);
        skb_copy_to_linear_data_offset(bskb, start, msg, msz);
        msg_set_size(bhdr, start + msz);
        msg_set_msgcnt(bhdr, msg_msgcnt(bhdr) + 1);
        return true;
}

/**
 * tipc_msg_try_bundle - Try to bundle a new message to the last one
 * @tskb: the last/target message to which the new one will be appended
 * @skb: the new message skb pointer
 * @mss: max message size (header inclusive)
 * @dnode: destination node for the message
 * @new_bundle: if this call made a new bundle or not; only written when
 *              bundling into @tskb is actually attempted
 *
 * Return: "true" if the new message skb is potential for bundling this time or
 * later, in the case a bundling has been done this time, the skb is consumed
 * (the skb pointer = NULL).
 * Otherwise, "false" if the skb cannot be bundled at all.
 */
bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss,
                         u32 dnode, bool *new_bundle)
{
        struct tipc_msg *msg, *inner, *outer;
        u32 tsz;

        /* First, check if the new buffer is suitable for bundling */
        msg = buf_msg(*skb);
        if (msg_user(msg) == MSG_FRAGMENTER)
                return false;
        if (msg_user(msg) == TUNNEL_PROTOCOL)
                return false;
        if (msg_user(msg) == BCAST_PROTOCOL)
                return false;
        /* Message plus a bundle header must fit strictly below mss */
        if (mss <= INT_H_SIZE + msg_size(msg))
                return false;

        /* Ok, but the last/target buffer can be empty? */
        if (unlikely(!tskb))
                return true;

        /* Is it a bundle already? Try to bundle the new message to it */
        if (msg_user(buf_msg(tskb)) == MSG_BUNDLER) {
                *new_bundle = false;
                goto bundle;
        }

        /* Make a new bundle of the two messages if possible */
        tsz = msg_size(buf_msg(tskb));
        if (unlikely(mss < BUF_ALIGN(INT_H_SIZE + tsz) + msg_size(msg)))
                return true;
        /* Make room for the bundle header in front and for up to mss in total */
        if (unlikely(pskb_expand_head(tskb, INT_H_SIZE, mss - tsz - INT_H_SIZE,
                                      GFP_ATOMIC)))
                return true;
        /* Wrap the target message: it becomes the first inner message */
        inner = buf_msg(tskb);
        skb_push(tskb, INT_H_SIZE);
        outer = buf_msg(tskb);
        tipc_msg_init(msg_prevnode(inner), outer, MSG_BUNDLER, 0, INT_H_SIZE,
                      dnode);
        msg_set_importance(outer, msg_importance(inner));
        msg_set_size(outer, INT_H_SIZE + tsz);
        msg_set_msgcnt(outer, 1);
        *new_bundle = true;

bundle:
        /* The new skb is consumed only if its content went into the bundle */
        if (likely(tipc_msg_bundle(tskb, msg, mss))) {
                consume_skb(*skb);
                *skb = NULL;
        }
        return true;
}

/**
 *  tipc_msg_extract(): extract bundled inner packet from buffer
 *  @skb: buffer to be extracted from.
 *  @iskb: extracted inner buffer, to be returned
 *  @pos: position in outer message of msg to be extracted; advanced to the
 *        (4-byte aligned) position of the next msg on success
 *
 *  The inner message is copied into a newly allocated buffer and validated.
 *  Whenever false is returned (no further message at @pos, or any failure)
 *  the outer buffer is freed and *@iskb is set to NULL.
 *
 *  Return: true when there is an extracted buffer, otherwise false
 */
bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos)
{
        struct tipc_msg *hdr, *ihdr;
        int imsz;

        *iskb = NULL;
        if (unlikely(skb_linearize(skb)))
                goto none;

        hdr = buf_msg(skb);
        /* No room left for even a minimal inner header at *pos? */
        if (unlikely(*pos > (msg_data_sz(hdr) - MIN_H_SIZE)))
                goto none;

        ihdr = (struct tipc_msg *)(msg_data(hdr) + *pos);
        imsz = msg_size(ihdr);

        /* Inner message must not extend beyond the outer data area */
        if ((*pos + imsz) > msg_data_sz(hdr))
                goto none;

        *iskb = tipc_buf_acquire(imsz, GFP_ATOMIC);
        if (!*iskb)
                goto none;

        skb_copy_to_linear_data(*iskb, ihdr, imsz);
        if (unlikely(!tipc_msg_validate(iskb)))
                goto none;

        *pos += BUF_ALIGN(imsz);
        return true;
none:
        kfree_skb(skb);
        kfree_skb(*iskb);
        *iskb = NULL;
        return false;
}

/**
 * tipc_msg_reverse(): build a return-to-sender copy of a message
 * @own_node: originating node id for reversed message
 * @skb:  buffer containing message to be reversed; will be consumed
 * @err:  error code to be set in message, if any
 *
 * A new buffer is acquired holding the original header followed by at most
 * MAX_FORWARD_SIZE bytes of the original data. In the new header the
 * origin and destination ports are swapped, the destination node becomes
 * the original previous node, and @own_node is set as previous and
 * originating node. The original buffer is always freed; *@skb points to
 * the new buffer on success and is NULL on failure.
 * Return: true if success, otherwise false
 */
bool tipc_msg_reverse(u32 own_node,  struct sk_buff **skb, int err)
{
        struct sk_buff *orig = *skb;
        struct tipc_msg *ohdr, *rhdr;
        int hsz, dsz;

        if (skb_linearize(orig))
                goto drop;

        ohdr = buf_msg(orig);
        dsz = min_t(uint, msg_data_sz(ohdr), MAX_FORWARD_SIZE);
        hsz = msg_hdr_sz(ohdr);

        /* Droppable messages and messages that already carry an error
         * code are not returned
         */
        if (msg_dest_droppable(ohdr) || msg_errcode(ohdr))
                goto drop;

        /* Never return SHORT header */
        if (hsz == SHORT_H_SIZE)
                hsz = BASIC_H_SIZE;

        /* Don't return data along with SYN+, - sender has a clone */
        if (msg_is_syn(ohdr) && err == TIPC_ERR_OVERLOAD)
                dsz = 0;

        /* Allocate new buffer to return */
        *skb = tipc_buf_acquire(hsz + dsz, GFP_ATOMIC);
        if (!*skb)
                goto drop;

        /* Header is copied at its original size, data lands after the
         * (possibly enlarged) new header
         */
        memcpy((*skb)->data, orig->data, msg_hdr_sz(ohdr));
        memcpy((*skb)->data + hsz, msg_data(ohdr), dsz);

        /* Build reverse header in new buffer */
        rhdr = buf_msg(*skb);
        msg_set_hdr_sz(rhdr, hsz);
        msg_set_errcode(rhdr, err);
        msg_set_non_seq(rhdr, 0);
        msg_set_origport(rhdr, msg_destport(ohdr));
        msg_set_destport(rhdr, msg_origport(ohdr));
        msg_set_destnode(rhdr, msg_prevnode(ohdr));
        msg_set_prevnode(rhdr, own_node);
        msg_set_orignode(rhdr, own_node);
        msg_set_size(rhdr, hsz + dsz);

        skb_orphan(orig);
        kfree_skb(orig);
        return true;

drop:
        kfree_skb(orig);
        *skb = NULL;
        return false;
}

/* tipc_msg_skb_clone() - clone every buffer of a chain onto another queue
 * @msg: queue holding the buffers to be cloned
 * @cpy: queue the clones are appended to
 *
 * If a clone cannot be allocated, @cpy is purged and an error is logged.
 * Return: true if all buffers were cloned, otherwise false
 */
bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy)
{
        struct sk_buff *cur, *clone;

        skb_queue_walk(msg, cur) {
                clone = skb_clone(cur, GFP_ATOMIC);
                if (clone) {
                        __skb_queue_tail(cpy, clone);
                        continue;
                }

                /* Out of memory: give up on the whole chain */
                __skb_queue_purge(cpy);
                pr_err_ratelimited("Failed to clone buffer chain\n");
                return false;
        }
        return true;
}

/**
 * tipc_msg_lookup_dest(): try to find new destination for named message
 * @net: pointer to associated network namespace
 * @skb: the buffer containing the message.
 * @err: error code to be used by caller if lookup fails
 * Does not consume buffer
 * Return: true if a destination is found, false otherwise
 */
bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
{
        struct tipc_msg *msg = buf_msg(skb);
        u32 scope = msg_lookup_scope(msg);
        u32 self = tipc_own_addr(net);
        u32 inst = msg_nameinst(msg);
        struct tipc_socket_addr sk;
        struct tipc_uaddr ua;

        if (!msg_isdata(msg))
                return false;
        if (!msg_named(msg))
                return false;
        if (msg_errcode(msg))
                return false;
        *err = TIPC_ERR_NO_NAME;
        if (skb_linearize(skb))
                return false;
        msg = buf_msg(skb);
        if (msg_reroute_cnt(msg))
                return false;
        tipc_uaddr(&ua, TIPC_SERVICE_RANGE, scope,
                   msg_nametype(msg), inst, inst);
        sk.node = tipc_scope2node(net, scope);
        if (!tipc_nametbl_lookup_anycast(net, &ua, &sk))
                return false;
        msg_incr_reroute_cnt(msg);
        if (sk.node != self)
                msg_set_prevnode(msg, self);
        msg_set_destnode(msg, sk.node);
        msg_set_destport(msg, sk.ref);
        *err = TIPC_OK;

        return true;
}

/* tipc_msg_assemble() - assemble chain of fragments into one message
 * @list: queue holding the fragments; on success it holds the single
 *        assembled buffer, on failure it is purged and re-initialized
 *
 * A queue of exactly one buffer is left as it is.
 * Return: true on success, otherwise false
 */
bool tipc_msg_assemble(struct sk_buff_head *list)
{
        struct sk_buff *head = NULL;
        struct sk_buff *skb;

        if (skb_queue_len(list) == 1)
                return true;

        for (skb = __skb_dequeue(list); skb; skb = __skb_dequeue(list)) {
                skb->next = NULL;
                if (tipc_buf_append(&head, &skb)) {
                        /* Message complete: hand it back on the queue */
                        __skb_queue_tail(list, skb);
                        return true;
                }
                /* No head buffer left after the append: give up */
                if (!head)
                        break;
        }

        __skb_queue_purge(list);
        __skb_queue_head_init(list);
        pr_warn("Failed do assemble buffer\n");
        return false;
}

/* tipc_msg_reassemble() - clone a buffer chain of fragments and
 *                         reassemble the clones into one message
 * @list: queue holding the original buffer(s); left untouched
 * @rcvq: queue the resulting single buffer is appended to
 *
 * A single-buffer chain is not cloned; it gets a private copy of its
 * headroom and message header via __pskb_copy() instead.
 * Return: true on success, otherwise false
 */
bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq)
{
        struct sk_buff *skb, *copy;
        struct sk_buff *clone = NULL;
        struct sk_buff *head = NULL;
        int hdr_len;

        if (skb_queue_len(list) != 1)
                goto multi;

        /* Copy header if single buffer */
        skb = skb_peek(list);
        hdr_len = skb_headroom(skb) + msg_hdr_sz(buf_msg(skb));
        copy = __pskb_copy(skb, hdr_len, GFP_ATOMIC);
        if (!copy)
                return false;
        __skb_queue_tail(rcvq, copy);
        return true;

multi:
        /* Clone all fragments and reassemble */
        skb_queue_walk(list, skb) {
                clone = skb_clone(skb, GFP_ATOMIC);
                if (!clone)
                        goto error;
                clone->next = NULL;
                if (tipc_buf_append(&head, &clone))
                        break;
                if (!head)
                        goto error;
        }
        /* NOTE(review): if the walk ends without tipc_buf_append()
         * reporting completion, clone is queued in whatever state the last
         * call left it - confirm callers always pass a complete chain.
         */
        __skb_queue_tail(rcvq, clone);
        return true;

error:
        pr_warn("Failed do clone local mcast rcv buffer\n");
        kfree_skb(head);
        return false;
}

/* tipc_msg_pskb_copy() - copy a buffer chain and re-address the copies
 * @dst: destination node to set in every copied message
 * @msg: queue holding the buffers to copy
 * @cpy: queue the copies are appended to
 *
 * If a copy cannot be allocated, @cpy is purged.
 * Return: true if all buffers were copied, otherwise false
 */
bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
                        struct sk_buff_head *cpy)
{
        struct sk_buff *cur, *copy;

        skb_queue_walk(msg, cur) {
                copy = pskb_copy(cur, GFP_ATOMIC);
                if (copy) {
                        msg_set_destnode(buf_msg(copy), dst);
                        __skb_queue_tail(cpy, copy);
                        continue;
                }

                __skb_queue_purge(cpy);
                return false;
        }
        return true;
}

/* __tipc_skb_queue_sorted(); insert pkt into list ordered by sequence number
 * @list: list to be inserted into
 * @seqno: sequence number of buffer to add
 * @skb: buffer to add
 *
 * Return: true if @skb was queued. Otherwise false, in which case @skb has
 * been freed; this happens when a buffer with the same sequence number is
 * found while walking the list.
 */
bool __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno,
                             struct sk_buff *skb)
{
        struct sk_buff *cur, *next;

        /* Empty list, or sorts before the current head */
        if (skb_queue_empty(list) || less(seqno, buf_seqno(skb_peek(list)))) {
                __skb_queue_head(list, skb);
                return true;
        }

        /* Sorts after the current tail */
        if (more(seqno, buf_seqno(skb_peek_tail(list)))) {
                __skb_queue_tail(list, skb);
                return true;
        }

        /* Somewhere in between: find the first entry not below seqno */
        skb_queue_walk_safe(list, cur, next) {
                if (more(seqno, buf_seqno(cur)))
                        continue;
                if (seqno != buf_seqno(cur)) {
                        __skb_queue_before(list, cur, skb);
                        return true;
                }
                /* Same sequence number already queued */
                break;
        }

        kfree_skb(skb);
        return false;
}

/* tipc_skb_reject() - reverse a message and queue it for transmission
 * @net: network namespace, used to look up the own node address
 * @err: error code to set in the reversed message
 * @skb: buffer to reject; handed over to tipc_msg_reverse()
 * @xmitq: queue the reversed message is appended to
 *
 * Nothing is queued if tipc_msg_reverse() fails.
 */
void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb,
                     struct sk_buff_head *xmitq)
{
        if (!tipc_msg_reverse(tipc_own_addr(net), &skb, err))
                return;

        __skb_queue_tail(xmitq, skb);
}





















    2 














































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/fs.h>

#define DEVCG_ACC_MKNOD 1
#define DEVCG_ACC_READ  2
#define DEVCG_ACC_WRITE 4
#define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE)

#define DEVCG_DEV_BLOCK 1
#define DEVCG_DEV_CHAR  2
#define DEVCG_DEV_ALL   4  /* this represents all devices */


#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF)
int devcgroup_check_permission(short type, u32 major, u32 minor,
                               short access);
/*
 * Check a MAY_READ/MAY_WRITE request on a device inode against the device
 * cgroup. Inodes without a device number, and inodes that are neither
 * block nor character devices, are always allowed (0 is returned).
 * Otherwise the result of devcgroup_check_permission() is returned.
 */
static inline int devcgroup_inode_permission(struct inode *inode, int mask)
{
        short type, access;

        if (likely(!inode->i_rdev))
                return 0;

        if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))
                return 0;

        type = S_ISBLK(inode->i_mode) ? DEVCG_DEV_BLOCK : DEVCG_DEV_CHAR;

        /* Translate the VFS permission mask into device cgroup access bits */
        access = 0;
        if (mask & MAY_READ)
                access |= DEVCG_ACC_READ;
        if (mask & MAY_WRITE)
                access |= DEVCG_ACC_WRITE;

        return devcgroup_check_permission(type, imajor(inode), iminor(inode),
                                          access);
}

/*
 * Check whether a device node of the given mode and device number may be
 * created. Modes that are neither block nor character devices, and the
 * character device WHITEOUT_DEV, are always allowed (0 is returned).
 * Otherwise the result of devcgroup_check_permission() for
 * DEVCG_ACC_MKNOD is returned.
 */
static inline int devcgroup_inode_mknod(int mode, dev_t dev)
{
        short type;

        if (S_ISBLK(mode)) {
                type = DEVCG_DEV_BLOCK;
        } else if (S_ISCHR(mode)) {
                if (dev == WHITEOUT_DEV)
                        return 0;
                type = DEVCG_DEV_CHAR;
        } else {
                return 0;
        }

        return devcgroup_check_permission(type, MAJOR(dev), MINOR(dev),
                                          DEVCG_ACC_MKNOD);
}

#else
/* Stub for builds without device cgroup support: always allows access. */
static inline int devcgroup_check_permission(short type, u32 major, u32 minor,
                                             short access)
{
        return 0;
}
/* Stub for builds without device cgroup support: always allows access. */
static inline int devcgroup_inode_permission(struct inode *inode, int mask)
{
        return 0;
}
/* Stub for builds without device cgroup support: always allows mknod. */
static inline int devcgroup_inode_mknod(int mode, dev_t dev)
{
        return 0;
}
#endif
































    1 


















    1 



    1 

















    1 





    1 















    1 
















































































    1 




    1 





























    1 



















    1 



    1 




    1 



























    1 















    1 

















































    1 










    1 



























    1 












































































    1 






    1 



















































































































































































    1 
    1 
    1 



    1 



































































































































































































































































































































































































































    1 







    1 




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
// SPDX-License-Identifier: GPL-2.0
/*
 * kobject.c - library routines for handling generic kernel objects
 *
 * Copyright (c) 2002-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (c) 2006-2007 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (c) 2006-2007 Novell Inc.
 *
 * Please see the file Documentation/core-api/kobject.rst for critical information
 * about using the kobject interface.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/random.h>

/**
 * kobject_namespace() - Return @kobj's namespace tag.
 * @kobj: kobject in question
 *
 * The tag is obtained from the ktype's namespace() callback, but only when
 * kobj_ns_ops() yields namespace operations of a type other than
 * %KOBJ_NS_TYPE_NONE for @kobj.  Returns %NULL otherwise.
 */
const void *kobject_namespace(const struct kobject *kobj)
{
        const struct kobj_ns_type_operations *ops = kobj_ns_ops(kobj);

        if (ops && ops->type != KOBJ_NS_TYPE_NONE)
                return kobj->ktype->namespace(kobj);

        return NULL;
}

/**
 * kobject_get_ownership() - Get sysfs ownership data for @kobj.
 * @kobj: kobject in question
 * @uid: kernel user ID for sysfs objects
 * @gid: kernel group ID for sysfs objects
 *
 * Fills in the uid/gid pair to use when creating the sysfs representation
 * of the given kobject. The pair defaults to global root and may be
 * overridden by the ktype's get_ownership() callback, if there is one.
 */
void kobject_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
        /* Default owner; the callback below may replace it */
        *uid = GLOBAL_ROOT_UID;
        *gid = GLOBAL_ROOT_GID;

        if (!kobj->ktype->get_ownership)
                return;

        kobj->ktype->get_ownership(kobj, uid, gid);
}

/* A valid type lies strictly between KOBJ_NS_TYPE_NONE and KOBJ_NS_TYPES. */
static bool kobj_ns_type_is_valid(enum kobj_ns_type type)
{
        return (type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES);
}

/*
 * Create the sysfs directory for @kobj together with the default attribute
 * groups of its ktype. Returns 0 on success or the error reported by sysfs;
 * if creating the groups fails, the directory is removed again.
 */
static int create_dir(struct kobject *kobj)
{
        const struct kobj_type *ktype = get_ktype(kobj);
        const struct kobj_ns_type_operations *child_ops;
        int ret;

        ret = sysfs_create_dir_ns(kobj, kobject_namespace(kobj));
        if (ret)
                return ret;

        if (ktype) {
                ret = sysfs_create_groups(kobj, ktype->default_groups);
                if (ret)
                        goto remove_dir;
        }

        /*
         * An ancestor going away may delete @kobj->sd.  Take an extra
         * reference so that it stays around until @kobj itself is gone.
         */
        sysfs_get(kobj->sd);

        /*
         * With ns_ops present, the children of @kobj have to be filtered
         * by their namespace tags, so turn on namespace support for
         * @kobj->sd.
         */
        child_ops = kobj_child_ns_ops(kobj);
        if (!child_ops)
                return 0;

        BUG_ON(!kobj_ns_type_is_valid(child_ops->type));
        BUG_ON(!kobj_ns_type_registered(child_ops->type));

        sysfs_enable_ns(kobj->sd);
        return 0;

remove_dir:
        sysfs_remove_dir(kobj);
        return ret;
}

/*
 * Compute the buffer size needed for the path of @kobj: one byte plus, for
 * @kobj and each of its ancestors, the name length and a leading '/'.
 * Returns 0 if any object on the way up has no name.
 */
static int get_kobj_path_length(const struct kobject *kobj)
{
        const struct kobject *node = kobj;
        int total = 1;

        /* Climb from @kobj until there is no parent left */
        for (;;) {
                if (!kobject_name(node))
                        return 0;
                total += strlen(kobject_name(node)) + 1;

                node = node->parent;
                if (!node)
                        return total;
        }
}

/*
 * Write the path of @kobj into @path, working backwards from the end of
 * the @length byte buffer: each level contributes '/' followed by its name.
 * The last byte of the buffer is not written. Returns 0 on success or
 * -EINVAL when the names no longer fit into @length.
 */
static int fill_kobj_path(const struct kobject *kobj, char *path, int length)
{
        const struct kobject *node = kobj;

        length--;
        while (node) {
                int name_len = strlen(kobject_name(node));

                /* Step back far enough to hold this name ... */
                length -= name_len;
                if (length <= 0)
                        return -EINVAL;
                memcpy(path + length, kobject_name(node), name_len);

                /* ... and one more byte for the '/' in front of it */
                length--;
                path[length] = '/';

                node = node->parent;
        }

        pr_debug("'%s' (%p): %s: path = '%s'\n", kobject_name(kobj),
                 kobj, __func__, path);

        return 0;
}

/**
 * kobject_get_path() - Allocate memory and fill in the path for @kobj.
 * @kobj:        kobject in question, with which to build the path
 * @gfp_mask:        the allocation type used to allocate the path
 *
 * The length is computed first, then the buffer is filled; if filling
 * fails the buffer is dropped and the whole procedure starts over.
 *
 * Return: The newly allocated memory, caller must free with kfree().
 * NULL if the path length comes out as 0 or the allocation fails.
 */
char *kobject_get_path(const struct kobject *kobj, gfp_t gfp_mask)
{
        char *buf;
        int size;

        for (;;) {
                size = get_kobj_path_length(kobj);
                if (size == 0)
                        return NULL;

                buf = kzalloc(size, gfp_mask);
                if (!buf)
                        return NULL;

                if (!fill_kobj_path(kobj, buf, size))
                        return buf;

                /* Did not fit into the computed size: try again */
                kfree(buf);
        }
}
EXPORT_SYMBOL_GPL(kobject_get_path);

/* add the kobject to its kset's list */
static void kobj_kset_join(struct kobject *kobj)
{
        if (!kobj->kset)
                return;

        kset_get(kobj->kset);
        spin_lock(&kobj->kset->list_lock);
        list_add_tail(&kobj->entry, &kobj->kset->list);
        spin_unlock(&kobj->kset->list_lock);
}

/* remove the kobject from its kset's list */
static void kobj_kset_leave(struct kobject *kobj)
{
        if (!kobj->kset)
                return;

        spin_lock(&kobj->kset->list_lock);
        list_del_init(&kobj->entry);
        spin_unlock(&kobj->kset->list_lock);
        kset_put(kobj->kset);
}

/*
 * Reset the reference count, list entry and state flags of @kobj and mark
 * it as initialized. A NULL @kobj is ignored.
 */
static void kobject_init_internal(struct kobject *kobj)
{
        if (kobj) {
                kref_init(&kobj->kref);
                INIT_LIST_HEAD(&kobj->entry);

                kobj->state_in_sysfs = 0;
                kobj->state_add_uevent_sent = 0;
                kobj->state_remove_uevent_sent = 0;
                kobj->state_initialized = 1;
        }
}


/*
 * kobject_add_internal() - link @kobj into the hierarchy and create its
 * sysfs directory.
 *
 * Takes a reference on the parent (falling back to the kset's own kobject
 * when no parent is set), joins the kset list and then calls create_dir().
 * If create_dir() fails, the kset membership and the parent reference are
 * dropped again, kobj->parent is cleared and the failure is logged.
 *
 * Return: 0 on success, -ENOENT for a NULL @kobj, -EINVAL for a missing or
 * empty name, otherwise the error returned by create_dir().
 */
static int kobject_add_internal(struct kobject *kobj)
{
        int error = 0;
        struct kobject *parent;

        if (!kobj)
                return -ENOENT;

        if (!kobj->name || !kobj->name[0]) {
                WARN(1,
                     "kobject: (%p): attempted to be registered with empty name!\n",
                     kobj);
                return -EINVAL;
        }

        /* Reference held on the parent from here on; dropped on failure */
        parent = kobject_get(kobj->parent);

        /* join kset if set, use it as parent if we do not already have one */
        if (kobj->kset) {
                if (!parent)
                        parent = kobject_get(&kobj->kset->kobj);
                kobj_kset_join(kobj);
                kobj->parent = parent;
        }

        pr_debug("'%s' (%p): %s: parent: '%s', set: '%s'\n",
                 kobject_name(kobj), kobj, __func__,
                 parent ? kobject_name(parent) : "<NULL>",
                 kobj->kset ? kobject_name(&kobj->kset->kobj) : "<NULL>");

        error = create_dir(kobj);
        if (error) {
                /* Undo the kset join and the parent reference taken above.
                 * NOTE(review): parent is still dereferenced by the pr_err()
                 * below after this put - presumably the caller holds its
                 * own reference on it; confirm.
                 */
                kobj_kset_leave(kobj);
                kobject_put(parent);
                kobj->parent = NULL;

                /* be noisy on error issues */
                if (error == -EEXIST)
                        pr_err("%s failed for %s with -EEXIST, don't try to register things with the same name in the same directory.\n",
                               __func__, kobject_name(kobj));
                else
                        pr_err("%s failed for %s (error: %d parent: %s)\n",
                               __func__, kobject_name(kobj), error,
                               parent ? kobject_name(parent) : "'none'");
        } else
                kobj->state_in_sysfs = 1;

        return error;
}

/**
 * kobject_set_name_vargs() - Set the name of a kobject.
 * @kobj: struct kobject to set the name of
 * @fmt: format string used to build the name
 * @vargs: vargs to format the string.
 *
 * Any '/' in the formatted name is replaced by '!'. The previous name is
 * released once the new one is ready.
 *
 * Return: 0 on success (also when @kobj already has a name and @fmt is
 * NULL, in which case nothing is changed), -ENOMEM on allocation failure.
 */
int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
                                  va_list vargs)
{
        const char *name;

        if (kobj->name && !fmt)
                return 0;

        name = kvasprintf_const(GFP_KERNEL, fmt, vargs);
        if (!name)
                return -ENOMEM;

        /*
         * Some callers pass names containing '/'.  Those get rewritten,
         * which needs a really allocated copy to work on, because
         * kvasprintf_const may have handed back a string living in
         * .rodata.
         */
        if (strchr(name, '/')) {
                char *writable = kstrdup(name, GFP_KERNEL);

                kfree_const(name);
                if (!writable)
                        return -ENOMEM;
                name = strreplace(writable, '/', '!');
        }

        kfree_const(kobj->name);
        kobj->name = name;

        return 0;
}

/**
 * kobject_set_name() - Set the name of a kobject.
 * @kobj: struct kobject to set the name of
 * @fmt: format string used to build the name
 *
 * Variadic front end of kobject_set_name_vargs().  For a kobject that
 * has already been added to the system, kobject_rename() must be used
 * to change the name instead.
 *
 * Return: the result of kobject_set_name_vargs().
 */
int kobject_set_name(struct kobject *kobj, const char *fmt, ...)
{
        va_list ap;
        int ret;

        va_start(ap, fmt);
        ret = kobject_set_name_vargs(kobj, fmt, ap);
        va_end(ap);

        return ret;
}
EXPORT_SYMBOL(kobject_set_name);

/**
 * kobject_init() - Initialize a kobject structure.
 * @kobj: pointer to the kobject to initialize
 * @ktype: pointer to the ktype for this kobject.
 *
 * This function will properly initialize a kobject such that it can then
 * be passed to the kobject_add() call.
 *
 * A NULL @kobj or @ktype is reported with an error message and a stack
 * dump, and nothing is initialized. Initializing an already initialized
 * kobject is reported the same way, but the initialization is carried out
 * nevertheless.
 *
 * After this function is called, the kobject MUST be cleaned up by a call
 * to kobject_put(), not by a call to kfree directly to ensure that all of
 * the memory is cleaned up properly.
 */
void kobject_init(struct kobject *kobj, const struct kobj_type *ktype)
{
        const char *err_str;

        if (!kobj) {
                err_str = "invalid kobject pointer!";
                goto error;
        }
        if (!ktype) {
                /* No trailing newline here: the pr_err() below adds it */
                err_str = "must have a ktype to be initialized properly!";
                goto error;
        }
        if (kobj->state_initialized) {
                /* do not error out as sometimes we can recover */
                pr_err("kobject (%p): tried to init an initialized object, something is seriously wrong.\n",
                       kobj);
                dump_stack_lvl(KERN_ERR);
        }

        kobject_init_internal(kobj);
        kobj->ktype = ktype;
        return;

error:
        pr_err("kobject (%p): %s\n", kobj, err_str);
        dump_stack_lvl(KERN_ERR);
}
EXPORT_SYMBOL(kobject_init);

/*
 * Set the name of @kobj from @fmt/@vargs, record @parent and add the
 * kobject via kobject_add_internal(). Returns 0 on success or the error
 * from naming or adding.
 */
static __printf(3, 0) int kobject_add_varg(struct kobject *kobj,
                                           struct kobject *parent,
                                           const char *fmt, va_list vargs)
{
        int ret = kobject_set_name_vargs(kobj, fmt, vargs);

        if (ret) {
                pr_err("can not set name properly!\n");
                return ret;
        }

        kobj->parent = parent;
        return kobject_add_internal(kobj);
}

/**
 * kobject_add() - The main kobject add function.
 * @kobj: the kobject to add
 * @parent: pointer to the parent of the kobject.
 * @fmt: format to name the kobject with.
 *
 * Names the kobject and adds it to the kobject hierarchy.
 *
 * A non-NULL @parent becomes the parent of @kobj.  With a NULL @parent,
 * the kobject of the kset assigned to @kobj is used as parent; without a
 * kset as well, the kobject ends up in the root of the sysfs tree.
 *
 * No "add" uevent is generated by this call.  The caller is expected to
 * set up all the sysfs files the object needs first and then to call
 * kobject_uevent() with the UEVENT_ADD parameter, so that userspace is
 * properly notified of this kobject's creation.
 *
 * Return: 0 on success or a negative error code; -EINVAL is returned for
 *         a NULL or uninitialized @kobj.
 *
 *         Whether this function fails or succeeds, kobject_put() must be
 *         called to clean up the memory associated with the object once
 *         the object is no longer used.  The kobject passed in must never
 *         be freed directly with kfree(), that can leak memory.
 */
int kobject_add(struct kobject *kobj, struct kobject *parent,
                const char *fmt, ...)
{
        va_list ap;
        int ret;

        if (!kobj)
                return -EINVAL;

        if (kobj->state_initialized) {
                va_start(ap, fmt);
                ret = kobject_add_varg(kobj, parent, fmt, ap);
                va_end(ap);

                return ret;
        }

        pr_err("kobject '%s' (%p): tried to add an uninitialized object, something is seriously wrong.\n",
               kobject_name(kobj), kobj);
        dump_stack_lvl(KERN_ERR);
        return -EINVAL;
}
EXPORT_SYMBOL(kobject_add);

/**
 * kobject_init_and_add() - Initialize a kobject structure and add it to
 *                          the kobject hierarchy.
 * @kobj: pointer to the kobject to initialize
 * @ktype: pointer to the ktype for this kobject.
 * @parent: pointer to the parent of this kobject.
 * @fmt: the name of the kobject.
 *
 * Calls kobject_init() and then names and adds the kobject the same way
 * kobject_add() does.
 *
 * The error handling and lifetime rules are those of kobject_add(): if an
 * error is returned, kobject_put() must be called to properly clean up
 * the memory associated with the object.
 */
int kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype,
                         struct kobject *parent, const char *fmt, ...)
{
        va_list ap;
        int ret;

        kobject_init(kobj, ktype);

        va_start(ap, fmt);
        ret = kobject_add_varg(kobj, parent, fmt, ap);
        va_end(ap);

        return ret;
}
EXPORT_SYMBOL_GPL(kobject_init_and_add);

/**
 * kobject_rename() - Change the name of an object.
 * @kobj: object in question.
 * @new_name: object's new name
 *
 * It is the responsibility of the caller to provide mutual
 * exclusion between two different calls of kobject_rename
 * on the same kobject and to ensure that new_name is valid and
 * won't conflict with other kobjects.
 */
int kobject_rename(struct kobject *kobj, const char *new_name)
{
        int error = 0;
        const char *devpath = NULL;
        const char *dup_name = NULL, *name;
        char *devpath_string = NULL;
        char *envp[2];

        /* Hold a reference on the object for the duration of the rename. */
        kobj = kobject_get(kobj);
        if (!kobj)
                return -EINVAL;
        if (!kobj->parent) {
                kobject_put(kobj);
                return -EINVAL;
        }

        /* Record the path before renaming; it is reported as DEVPATH_OLD. */
        devpath = kobject_get_path(kobj, GFP_KERNEL);
        if (!devpath) {
                error = -ENOMEM;
                goto out;
        }
        /*
         * kasprintf() sizes the buffer from the formatted string itself, so
         * there is no hand-computed length to keep in sync with the prefix.
         */
        devpath_string = kasprintf(GFP_KERNEL, "DEVPATH_OLD=%s", devpath);
        if (!devpath_string) {
                error = -ENOMEM;
                goto out;
        }
        envp[0] = devpath_string;
        envp[1] = NULL;

        name = dup_name = kstrdup_const(new_name, GFP_KERNEL);
        if (!name) {
                error = -ENOMEM;
                goto out;
        }

        error = sysfs_rename_dir_ns(kobj, new_name, kobject_namespace(kobj));
        if (error)
                goto out;

        /*
         * Install the new kobject name.  dup_name is repointed at the old
         * name so the common exit path frees that one rather than the copy
         * that was just installed.
         */
        dup_name = kobj->name;
        kobj->name = name;

        /*
         * This function is mostly/only used for network interfaces.
         * Some hotplug packages track interfaces by their name and
         * therefore want to know when the name is changed by the user.
         */
        kobject_uevent_env(kobj, KOBJ_MOVE, envp);

out:
        kfree_const(dup_name);
        kfree(devpath_string);
        kfree(devpath);
        kobject_put(kobj);

        return error;
}
EXPORT_SYMBOL_GPL(kobject_rename);

/**
 * kobject_move() - Move object to another parent.
 * @kobj: object in question.
 * @new_parent: object's new parent (can be NULL)
 */
int kobject_move(struct kobject *kobj, struct kobject *new_parent)
{
        int error;
        struct kobject *old_parent;
        const char *devpath = NULL;
        char *devpath_string = NULL;
        char *envp[2];

        /* Hold a reference on the object for the duration of the move. */
        kobj = kobject_get(kobj);
        if (!kobj)
                return -EINVAL;

        /* With no explicit parent, fall back to the kset's kobject, if any. */
        new_parent = kobject_get(new_parent);
        if (!new_parent) {
                if (kobj->kset)
                        new_parent = kobject_get(&kobj->kset->kobj);
        }

        /* old object path */
        devpath = kobject_get_path(kobj, GFP_KERNEL);
        if (!devpath) {
                error = -ENOMEM;
                goto out;
        }
        /*
         * kasprintf() sizes the buffer from the formatted string itself, so
         * there is no hand-computed length to keep in sync with the prefix.
         */
        devpath_string = kasprintf(GFP_KERNEL, "DEVPATH_OLD=%s", devpath);
        if (!devpath_string) {
                error = -ENOMEM;
                goto out;
        }
        envp[0] = devpath_string;
        envp[1] = NULL;
        error = sysfs_move_dir_ns(kobj, new_parent, kobject_namespace(kobj));
        if (error)
                goto out;

        /*
         * The reference taken on new_parent above is now owned by
         * kobj->parent; clear the local so the exit path does not drop it,
         * and release the reference that was held on the old parent.
         */
        old_parent = kobj->parent;
        kobj->parent = new_parent;
        new_parent = NULL;
        kobject_put(old_parent);
        kobject_uevent_env(kobj, KOBJ_MOVE, envp);
out:
        kobject_put(new_parent);
        kobject_put(kobj);
        kfree(devpath_string);
        kfree(devpath);
        return error;
}
EXPORT_SYMBOL_GPL(kobject_move);

static void __kobject_del(struct kobject *kobj)
{
        struct kernfs_node *sd;
        const struct kobj_type *ktype;

        sd = kobj->sd;
        ktype = get_ktype(kobj);

        if (ktype)
                sysfs_remove_groups(kobj, ktype->default_groups);

        /* send "remove" if the caller did not do it but sent "add" */
        if (kobj->state_add_uevent_sent && !kobj->state_remove_uevent_sent) {
                pr_debug("'%s' (%p): auto cleanup 'remove' event\n",
                         kobject_name(kobj), kobj);
                kobject_uevent(kobj, KOBJ_REMOVE);
        }

        sysfs_remove_dir(kobj);
        sysfs_put(sd);

        kobj->state_in_sysfs = 0;
        kobj_kset_leave(kobj);
        kobj->parent = NULL;
}

/**
 * kobject_del() - Unlink kobject from hierarchy.
 * @kobj: object.
 *
 * This is the function that should be called to delete an object
 * successfully added via kobject_add().
 */
void kobject_del(struct kobject *kobj)
{
        struct kobject *old_parent;

        if (kobj) {
                /* __kobject_del() clears kobj->parent, so save it first. */
                old_parent = kobj->parent;
                __kobject_del(kobj);
                kobject_put(old_parent);
        }
}
EXPORT_SYMBOL(kobject_del);

/**
 * kobject_get() - Increment refcount for object.
 * @kobj: object.
 */
struct kobject *kobject_get(struct kobject *kobj)
{
        /* A NULL object is passed straight back to the caller. */
        if (!kobj)
                return kobj;

        /* Warn, but still take the reference, on an uninitialized object. */
        WARN(!kobj->state_initialized, KERN_WARNING
                "kobject: '%s' (%p): is not initialized, yet kobject_get() is being called.\n",
             kobject_name(kobj), kobj);
        kref_get(&kobj->kref);

        return kobj;
}
EXPORT_SYMBOL(kobject_get);

/*
 * Take a reference on @kobj only if kref_get_unless_zero() succeeds.
 * Returns @kobj when the reference was taken, NULL otherwise (including
 * when @kobj is NULL).
 */
struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
{
        if (kobj && kref_get_unless_zero(&kobj->kref))
                return kobj;

        return NULL;
}
EXPORT_SYMBOL(kobject_get_unless_zero);

/*
 * kobject_cleanup - free kobject resources.
 * @kobj: object to cleanup
 */
static void kobject_cleanup(struct kobject *kobj)
{
        /*
         * Snapshot everything that is needed after ->release() runs: a
         * ktype's release() may free @kobj itself (dynamic_kobj_release()
         * and kset_release() both do).
         */
        const char *name = kobj->name;
        const struct kobj_type *ktype = get_ktype(kobj);
        struct kobject *parent = kobj->parent;

        pr_debug("'%s' (%p): %s, parent %p\n",
                 kobject_name(kobj), kobj, __func__, kobj->parent);

        if (ktype && !ktype->release)
                pr_debug("'%s' (%p): does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n",
                         kobject_name(kobj), kobj);

        if (!kobj->state_in_sysfs) {
                /* avoid dropping the parent reference unnecessarily */
                parent = NULL;
        } else {
                /* remove from sysfs if the caller did not do it */
                pr_debug("'%s' (%p): auto cleanup kobject_del\n",
                         kobject_name(kobj), kobj);
                __kobject_del(kobj);
        }

        if (ktype && ktype->release) {
                pr_debug("'%s' (%p): calling ktype release\n",
                         kobject_name(kobj), kobj);
                ktype->release(kobj);
        }

        /* free name if we allocated it */
        if (name) {
                pr_debug("'%s': free name\n", name);
                kfree_const(name);
        }

        /* kobject_put() ignores NULL, so this is a no-op when parent was cleared. */
        kobject_put(parent);
}

#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
/*
 * Delayed-work callback scheduled by kobject_release(): map the work item
 * back to its kobject and run the real cleanup.
 */
static void kobject_delayed_cleanup(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct kobject *kobj = container_of(dwork, struct kobject, release);

        kobject_cleanup(kobj);
}
#endif

/*
 * kref release callback passed to kref_put() by kobject_put().
 *
 * Without CONFIG_DEBUG_KOBJECT_RELEASE the kobject is cleaned up
 * immediately.  With it, cleanup is deferred through delayed work so that
 * callers which touch the object after their final put are more likely to
 * be caught.
 */
static void kobject_release(struct kref *kref)
{
        struct kobject *kobj = container_of(kref, struct kobject, kref);
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
        /* HZ plus a random multiple of HZ (presumably 0..3 extra). */
        unsigned long delay = HZ + HZ * get_random_u32_below(4);
        /* delay is unsigned long, so it is printed with %lu. */
        pr_info("'%s' (%p): %s, parent %p (delayed %lu)\n",
                kobject_name(kobj), kobj, __func__, kobj->parent, delay);
        INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup);

        schedule_delayed_work(&kobj->release, delay);
#else
        kobject_cleanup(kobj);
#endif
}

/**
 * kobject_put() - Decrement refcount for object.
 * @kobj: object.
 *
 * Decrement the refcount, and if 0, call kobject_cleanup().
 */
void kobject_put(struct kobject *kobj)
{
        /* Putting a NULL object is a no-op. */
        if (!kobj)
                return;

        /* Warn, but still drop the reference, on an uninitialized object. */
        WARN(!kobj->state_initialized, KERN_WARNING
                "kobject: '%s' (%p): is not initialized, yet kobject_put() is being called.\n",
             kobject_name(kobj), kobj);
        kref_put(&kobj->kref, kobject_release);
}
EXPORT_SYMBOL(kobject_put);

/*
 * release() method of dynamic_kobj_ktype: frees the kobject structure
 * itself with kfree().
 */
static void dynamic_kobj_release(struct kobject *kobj)
{
        pr_debug("(%p): %s\n", kobj, __func__);
        kfree(kobj);
}

/* ktype that kobject_create() assigns to the kobjects it allocates. */
static const struct kobj_type dynamic_kobj_ktype = {
        .sysfs_ops = &kobj_sysfs_ops,
        .release = dynamic_kobj_release,
};

/**
 * kobject_create() - Create a struct kobject dynamically.
 *
 * This function creates a kobject structure dynamically and sets it up
 * to be a "dynamic" kobject with a default release function set up.
 *
 * If the kobject was not able to be created, NULL will be returned.
 * The kobject structure returned from here must be cleaned up with a
 * call to kobject_put() and not kfree(), as kobject_init() has
 * already been called on this structure.
 */
static struct kobject *kobject_create(void)
{
        struct kobject *kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);

        /* On allocation failure kobj is NULL and is returned as such. */
        if (kobj)
                kobject_init(kobj, &dynamic_kobj_ktype);

        return kobj;
}

/**
 * kobject_create_and_add() - Create a struct kobject dynamically and
 *                            register it with sysfs.
 * @name: the name for the kobject
 * @parent: the parent kobject of this kobject, if any.
 *
 * This function creates a kobject structure dynamically and registers it
 * with sysfs.  When you are finished with this structure, call
 * kobject_put() and the structure will be dynamically freed when
 * it is no longer being used.
 *
 * If the kobject was not able to be created, NULL will be returned.
 */
struct kobject *kobject_create_and_add(const char *name, struct kobject *parent)
{
        struct kobject *kobj;
        int retval;

        kobj = kobject_create();
        if (!kobj)
                return NULL;

        retval = kobject_add(kobj, parent, "%s", name);
        if (retval) {
                pr_warn("%s: kobject_add error: %d\n", __func__, retval);
                kobject_put(kobj);
                kobj = NULL;
        }
        return kobj;
}
EXPORT_SYMBOL_GPL(kobject_create_and_add);

/**
 * kset_init() - Initialize a kset for use.
 * @k: kset
 */
void kset_init(struct kset *k)
{
        /* Set up the embedded kobject first ... */
        kobject_init_internal(&k->kobj);
        /* ... then the member list and the spinlock that kset_find_obj() takes around it. */
        INIT_LIST_HEAD(&k->list);
        spin_lock_init(&k->list_lock);
}

/* default kobject attribute operations */
static ssize_t kobj_attr_show(struct kobject *kobj, struct attribute *attr,
                              char *buf)
{
        struct kobj_attribute *kattr =
                container_of(attr, struct kobj_attribute, attr);

        /* An attribute without a show() method reports -EIO. */
        if (!kattr->show)
                return -EIO;

        return kattr->show(kobj, kattr, buf);
}

static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr,
                               const char *buf, size_t count)
{
        struct kobj_attribute *kattr =
                container_of(attr, struct kobj_attribute, attr);

        /* An attribute without a store() method reports -EIO. */
        if (!kattr->store)
                return -EIO;

        return kattr->store(kobj, kattr, buf, count);
}

/* sysfs_ops that dispatch to the show()/store() methods of a kobj_attribute. */
const struct sysfs_ops kobj_sysfs_ops = {
        .store = kobj_attr_store,
        .show = kobj_attr_show,
};
EXPORT_SYMBOL_GPL(kobj_sysfs_ops);

/**
 * kset_register() - Initialize and add a kset.
 * @k: kset.
 *
 * NOTE: On error, the kset.kobj.name allocated by kobject_set_name()
 * is freed and set to NULL; it cannot be used any more.
 */
int kset_register(struct kset *k)
{
        int err;

        if (!k)
                return -EINVAL;

        if (!k->kobj.ktype) {
                pr_err("must have a ktype to be initialized properly!\n");
                return -EINVAL;
        }

        kset_init(k);
        err = kobject_add_internal(&k->kobj);
        if (!err) {
                /* Registered: announce the new kset. */
                kobject_uevent(&k->kobj, KOBJ_ADD);
                return 0;
        }

        /*
         * Adding failed: release the name and set it to NULL to avoid
         * accessing a bad pointer in callers.
         */
        kfree_const(k->kobj.name);
        k->kobj.name = NULL;

        return err;
}
EXPORT_SYMBOL(kset_register);

/**
 * kset_unregister() - Remove a kset.
 * @k: kset.
 */
void kset_unregister(struct kset *k)
{
        if (k) {
                /* Unlink the embedded kobject, then drop a reference on it. */
                kobject_del(&k->kobj);
                kobject_put(&k->kobj);
        }
}
EXPORT_SYMBOL(kset_unregister);

/**
 * kset_find_obj() - Search for object in kset.
 * @kset: kset we're looking in.
 * @name: object's name.
 *
 * Lock kset via @kset->list_lock, and iterate over @kset->list,
 * looking for a matching kobject. If a matching object is found,
 * take a reference and return the object.
 */
struct kobject *kset_find_obj(struct kset *kset, const char *name)
{
        struct kobject *found = NULL;
        struct kobject *pos;

        spin_lock(&kset->list_lock);

        list_for_each_entry(pos, &kset->list, entry) {
                const char *pos_name = kobject_name(pos);

                /* Skip unnamed members and names that differ. */
                if (!pos_name || strcmp(pos_name, name))
                        continue;

                /*
                 * The first name match ends the search.  If no reference
                 * could be taken on it, the result stays NULL.
                 */
                found = kobject_get_unless_zero(pos);
                break;
        }

        spin_unlock(&kset->list_lock);

        return found;
}
EXPORT_SYMBOL_GPL(kset_find_obj);

/*
 * release() method of kset_ktype: frees the kset that embeds @kobj.
 */
static void kset_release(struct kobject *kobj)
{
        struct kset *owner = container_of(kobj, struct kset, kobj);

        pr_debug("'%s' (%p): %s\n",
                 kobject_name(kobj), kobj, __func__);
        kfree(owner);
}

/*
 * get_ownership() method of kset_ktype: takes the ownership from the
 * parent kobject.  @uid and @gid are left untouched when there is no parent.
 */
static void kset_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
        const struct kobject *parent = kobj->parent;

        if (!parent)
                return;

        kobject_get_ownership(parent, uid, gid);
}

/* ktype that kset_create() installs on the kobject embedded in a kset. */
static const struct kobj_type kset_ktype = {
        .release = kset_release,
        .get_ownership = kset_get_ownership,
        .sysfs_ops = &kobj_sysfs_ops,
};

/**
 * kset_create() - Create a struct kset dynamically.
 *
 * @name: the name for the kset
 * @uevent_ops: a struct kset_uevent_ops for the kset
 * @parent_kobj: the parent kobject of this kset, if any.
 *
 * This function creates a kset structure dynamically.  This structure can
 * then be registered with the system and show up in sysfs with a call to
 * kset_register().  When you are finished with this structure, if
 * kset_register() has been called, call kset_unregister() and the
 * structure will be dynamically freed when it is no longer being used.
 *
 * If the kset was not able to be created, NULL will be returned.
 */
static struct kset *kset_create(const char *name,
                                const struct kset_uevent_ops *uevent_ops,
                                struct kobject *parent_kobj)
{
        struct kset *kset = kzalloc(sizeof(*kset), GFP_KERNEL);

        if (!kset)
                return NULL;

        /* Nothing but the allocation exists yet, so a plain kfree() undoes it. */
        if (kobject_set_name(&kset->kobj, "%s", name)) {
                kfree(kset);
                return NULL;
        }

        /*
         * The kobject of this kset will have a type of kset_ktype and belong to
         * no kset itself.  That way we can properly free it when it is
         * finished being used.
         */
        kset->kobj.kset = NULL;
        kset->kobj.ktype = &kset_ktype;

        kset->kobj.parent = parent_kobj;
        kset->uevent_ops = uevent_ops;

        return kset;
}

/**
 * kset_create_and_add() - Create a struct kset dynamically and add it to sysfs.
 *
 * @name: the name for the kset
 * @uevent_ops: a struct kset_uevent_ops for the kset
 * @parent_kobj: the parent kobject of this kset, if any.
 *
 * This function creates a kset structure dynamically and registers it
 * with sysfs.  When you are finished with this structure, call
 * kset_unregister() and the structure will be dynamically freed when it
 * is no longer being used.
 *
 * If the kset was not able to be created, NULL will be returned.
 */
struct kset *kset_create_and_add(const char *name,
                                 const struct kset_uevent_ops *uevent_ops,
                                 struct kobject *parent_kobj)
{
        struct kset *kset = kset_create(name, uevent_ops, parent_kobj);

        if (!kset)
                return NULL;

        /* kset_register() frees the name itself on error; only the kset remains. */
        if (kset_register(kset)) {
                kfree(kset);
                return NULL;
        }

        return kset;
}
EXPORT_SYMBOL_GPL(kset_create_and_add);


/* Taken by the kobj_ns_*() helpers below around every access to kobj_ns_ops_tbl. */
static DEFINE_SPINLOCK(kobj_ns_type_lock);
/* Registered namespace operations, indexed by enum kobj_ns_type; NULL means none registered. */
static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES];

/*
 * Register @ops as the operations for namespace type @ops->type.
 *
 * Returns 0 on success, -EINVAL if the type is not valid, or -EBUSY if
 * operations are already registered for that type.
 */
int kobj_ns_type_register(const struct kobj_ns_type_operations *ops)
{
        enum kobj_ns_type type = ops->type;
        int error;

        spin_lock(&kobj_ns_type_lock);

        if (!kobj_ns_type_is_valid(type)) {
                error = -EINVAL;
        } else if (kobj_ns_ops_tbl[type]) {
                error = -EBUSY;
        } else {
                kobj_ns_ops_tbl[type] = ops;
                error = 0;
        }

        spin_unlock(&kobj_ns_type_lock);

        return error;
}

/*
 * Returns 1 if operations are registered for @type, 0 otherwise
 * (including when @type is not a valid namespace type).
 */
int kobj_ns_type_registered(enum kobj_ns_type type)
{
        int registered;

        spin_lock(&kobj_ns_type_lock);
        /* The validity check short-circuits before the table is indexed. */
        registered = kobj_ns_type_is_valid(type) &&
                     kobj_ns_ops_tbl[type] != NULL;
        spin_unlock(&kobj_ns_type_lock);

        return registered;
}

/*
 * Ask @parent's ktype which namespace operations apply to its children.
 * Returns NULL when @parent is NULL, has no ktype, or the ktype has no
 * child_ns_type() method.
 */
const struct kobj_ns_type_operations *kobj_child_ns_ops(const struct kobject *parent)
{
        if (!parent || !parent->ktype || !parent->ktype->child_ns_type)
                return NULL;

        return parent->ktype->child_ns_type(parent);
}

/* Namespace operations for @kobj, as decided by its parent. */
const struct kobj_ns_type_operations *kobj_ns_ops(const struct kobject *kobj)
{
        const struct kobject *parent = kobj->parent;

        return kobj_child_ns_ops(parent);
}

/*
 * Returns the registered current_may_mount() result for @type, or true
 * when @type is not valid or has no operations registered.
 */
bool kobj_ns_current_may_mount(enum kobj_ns_type type)
{
        bool may_mount;

        spin_lock(&kobj_ns_type_lock);
        if (!kobj_ns_type_is_valid(type) || !kobj_ns_ops_tbl[type])
                may_mount = true;
        else
                may_mount = kobj_ns_ops_tbl[type]->current_may_mount();
        spin_unlock(&kobj_ns_type_lock);

        return may_mount;
}

/*
 * Returns the registered grab_current_ns() result for @type, or NULL when
 * @type is not valid or has no operations registered.
 */
void *kobj_ns_grab_current(enum kobj_ns_type type)
{
        void *ns;

        spin_lock(&kobj_ns_type_lock);
        if (!kobj_ns_type_is_valid(type) || !kobj_ns_ops_tbl[type])
                ns = NULL;
        else
                ns = kobj_ns_ops_tbl[type]->grab_current_ns();
        spin_unlock(&kobj_ns_type_lock);

        return ns;
}
EXPORT_SYMBOL_GPL(kobj_ns_grab_current);

/*
 * Returns the registered netlink_ns() result for @sk and @type, or NULL
 * when @type is not valid or has no operations registered.
 */
const void *kobj_ns_netlink(enum kobj_ns_type type, struct sock *sk)
{
        const void *ns;

        spin_lock(&kobj_ns_type_lock);
        if (!kobj_ns_type_is_valid(type) || !kobj_ns_ops_tbl[type])
                ns = NULL;
        else
                ns = kobj_ns_ops_tbl[type]->netlink_ns(sk);
        spin_unlock(&kobj_ns_type_lock);

        return ns;
}

/*
 * Returns the registered initial_ns() result for @type, or NULL when
 * @type is not valid or has no operations registered.
 */
const void *kobj_ns_initial(enum kobj_ns_type type)
{
        const void *ns;

        spin_lock(&kobj_ns_type_lock);
        if (!kobj_ns_type_is_valid(type) || !kobj_ns_ops_tbl[type])
                ns = NULL;
        else
                ns = kobj_ns_ops_tbl[type]->initial_ns();
        spin_unlock(&kobj_ns_type_lock);

        return ns;
}

/*
 * Pass @ns to the registered drop_ns() method for @type.  Does nothing
 * when @type is not valid, has no operations registered, or the
 * operations have no drop_ns() method.
 */
void kobj_ns_drop(enum kobj_ns_type type, void *ns)
{
        spin_lock(&kobj_ns_type_lock);
        if (kobj_ns_type_is_valid(type)) {
                if (kobj_ns_ops_tbl[type] && kobj_ns_ops_tbl[type]->drop_ns)
                        kobj_ns_ops_tbl[type]->drop_ns(ns);
        }
        spin_unlock(&kobj_ns_type_lock);
}
EXPORT_SYMBOL_GPL(kobj_ns_drop);







































































































































































































































    1 















    1 










































    1 

    1 

    1 

    1 
















    1 

    1 

    1 

    1 















































































    1 

















    1 





    1 





















    1 







    1 

    1 





























































































































    1 




































































































































































    1 































    1 





















    1 



    1 













    1 






































































































































































































































































































































































































































    1 
















































































    1 
















    1 





    1 













    1 

    1 

    1 




    1 
    1 




    1 



















































































































































    1 




























































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 



    1 

































    1 


    1 



    1 



























































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





































    1 























    1 



    1 








    1 





















    1 






    1 


















    1 





    1 

















    1 































    1 











    1 
















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
// SPDX-License-Identifier: GPL-2.0-only
/*
 * BPF JIT compiler
 *
 * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
 * Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 */
#include <linux/netdevice.h>
#include <linux/filter.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <linux/memory.h>
#include <linux/sort.h>
#include <asm/extable.h>
#include <asm/ftrace.h>
#include <asm/set_memory.h>
#include <asm/nospec-branch.h>
#include <asm/text-patching.h>
#include <asm/unwind.h>
#include <asm/cfi.h>

/*
 * Mask handed to pop_callee_regs() when all four callee-saved registers
 * (rbx, r13, r14, r15) have to be restored regardless of what the program
 * itself uses, i.e. on the exception callback and exception boundary paths.
 */
static bool all_callee_regs_used[4] = {true, true, true, true};

/*
 * Store the low 'len' bytes of 'bytes' at 'ptr' and return the position
 * just past them.
 *
 * Only lengths 1 and 2 get a store of exactly that width; every other
 * length is written with a single 32-bit store, so a 3-byte emit touches
 * 4 bytes of the buffer even though the cursor only advances by 3.
 */
static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
{
        switch (len) {
        case 1:
                *ptr = bytes;
                break;
        case 2:
                *(u16 *)ptr = bytes;
                break;
        default:
                *(u32 *)ptr = bytes;
                /* Compiler barrier, kept directly behind the wide store. */
                barrier();
                break;
        }

        return ptr + len;
}

/*
 * EMIT*() append bytes at the local cursor 'prog' (a u8 * that must be in
 * scope wherever these macros are used) and advance it via emit_code().
 *
 * EMITn(b1..bn) packs its n byte arguments into one value with b1 in the
 * least significant byte, so b1 ends up first in memory.
 */
#define EMIT(bytes, len) \
        do { prog = emit_code(prog, bytes, len); } while (0)

#define EMIT1(b1)                EMIT(b1, 1)
#define EMIT2(b1, b2)                EMIT((b1) + ((b2) << 8), 2)
#define EMIT3(b1, b2, b3)        EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
#define EMIT4(b1, b2, b3, b4)   EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)

/* EMITn_off32(): n leading bytes followed by a 4-byte value 'off' */
#define EMIT1_off32(b1, off) \
        do { EMIT1(b1); EMIT(off, 4); } while (0)
#define EMIT2_off32(b1, b2, off) \
        do { EMIT2(b1, b2); EMIT(off, 4); } while (0)
#define EMIT3_off32(b1, b2, b3, off) \
        do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
#define EMIT4_off32(b1, b2, b3, b4, off) \
        do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)

/*
 * 4-byte ENDBR / poisoned-ENDBR emitters; both expand to nothing unless
 * CONFIG_X86_KERNEL_IBT is set.
 */
#ifdef CONFIG_X86_KERNEL_IBT
#define EMIT_ENDBR()                EMIT(gen_endbr(), 4)
#define EMIT_ENDBR_POISON()        EMIT(gen_endbr_poison(), 4)
#else
#define EMIT_ENDBR()
#define EMIT_ENDBR_POISON()
#endif

/* True when 'value' fits in a signed 8-bit immediate, i.e. [-128, 127]. */
static bool is_imm8(int value)
{
        return value >= -128 && value <= 127;
}

/* True when 'value' survives a round trip through a signed 32-bit integer. */
static bool is_simm32(s64 value)
{
        const s32 narrowed = (s32)value;

        return (s64)narrowed == value;
}

/* True when 'value' survives a round trip through an unsigned 32-bit integer. */
static bool is_uimm32(u64 value)
{
        const u32 narrowed = (u32)value;

        return (u64)narrowed == value;
}

/*
 * mov dst, src
 *
 * Emits a 64-bit register-to-register move (prefix 0x48, opcode 0x89,
 * register-direct ModRM). Nothing is emitted when both operands name the
 * same register.
 *
 * DST and SRC are BPF register numbers. They are expanded more than once,
 * so pass side-effect-free expressions only.
 */
#define EMIT_mov(DST, SRC)                                                                 \
        do {                                                                                 \
                if ((DST) != (SRC))                                                         \
                        EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
        } while (0)

/*
 * Translate a BPF size code into a byte count.
 *
 * BPF_W, BPF_H and BPF_B map to 4, 2 and 1. BPF_DW also maps to 4 because
 * the value in question is an imm32. Any other code yields 0.
 */
static int bpf_size_to_x86_bytes(int bpf_size)
{
        switch (bpf_size) {
        case BPF_W:
                return 4;
        case BPF_H:
                return 2;
        case BPF_B:
                return 1;
        case BPF_DW:
                return 4; /* imm32 */
        default:
                return 0;
        }
}

/*
 * List of x86 cond jumps opcodes (. + s8)
 * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
 *
 * The B/A ("below"/"above") forms compare as unsigned, the L/G
 * ("less"/"greater") forms compare as signed.
 */
#define X86_JB  0x72 /* below */
#define X86_JAE 0x73 /* above or equal */
#define X86_JE  0x74 /* equal */
#define X86_JNE 0x75 /* not equal */
#define X86_JBE 0x76 /* below or equal */
#define X86_JA  0x77 /* above */
#define X86_JL  0x7C /* less */
#define X86_JGE 0x7D /* greater or equal */
#define X86_JLE 0x7E /* less or equal */
#define X86_JG  0x7F /* greater */

/*
 * Pick a register outside of BPF range for JIT internal work.
 *
 * These pseudo register numbers continue after MAX_BPF_JIT_REG so they can
 * index reg2hex[] like real BPF registers; that table maps them to R11, R9
 * and R12 respectively.
 */
#define AUX_REG (MAX_BPF_JIT_REG + 1)
#define X86_REG_R9 (MAX_BPF_JIT_REG + 2)
#define X86_REG_R12 (MAX_BPF_JIT_REG + 3)

/*
 * The following table maps BPF registers to x86-64 registers.
 *
 * Each entry holds only the low three bits of the hardware register number,
 * which is why e.g. RAX and R8 both appear as 0. The registers in the
 * r8..r15 half are told apart by the extra prefix bit that is_ereg() and
 * the add_*mod() helpers supply.
 *
 * x86-64 register R12 is unused, since if used as base address
 * register in load/store instructions, it always needs an
 * extra byte of encoding and is callee saved.
 *
 * x86-64 register R9 is not used by BPF programs, but can be used by BPF
 * trampoline. x86-64 register R10 is used for blinding (if enabled).
 */
static const int reg2hex[] = {
        [BPF_REG_0] = 0,  /* RAX */
        [BPF_REG_1] = 7,  /* RDI */
        [BPF_REG_2] = 6,  /* RSI */
        [BPF_REG_3] = 2,  /* RDX */
        [BPF_REG_4] = 1,  /* RCX */
        [BPF_REG_5] = 0,  /* R8  */
        [BPF_REG_6] = 3,  /* RBX callee saved */
        [BPF_REG_7] = 5,  /* R13 callee saved */
        [BPF_REG_8] = 6,  /* R14 callee saved */
        [BPF_REG_9] = 7,  /* R15 callee saved */
        [BPF_REG_FP] = 5, /* RBP readonly */
        [BPF_REG_AX] = 2, /* R10 temp register */
        [AUX_REG] = 3,    /* R11 temp register */
        [X86_REG_R9] = 1, /* R9 register, 6th function argument */
        [X86_REG_R12] = 4, /* R12 callee saved */
};

/*
 * Byte offset, inside struct pt_regs, of the x86-64 register that backs each
 * of BPF_REG_0..BPF_REG_9. The register pairing mirrors reg2hex[]; only
 * those ten BPF registers have an entry.
 */
static const int reg2pt_regs[] = {
        [BPF_REG_0] = offsetof(struct pt_regs, ax),
        [BPF_REG_1] = offsetof(struct pt_regs, di),
        [BPF_REG_2] = offsetof(struct pt_regs, si),
        [BPF_REG_3] = offsetof(struct pt_regs, dx),
        [BPF_REG_4] = offsetof(struct pt_regs, cx),
        [BPF_REG_5] = offsetof(struct pt_regs, r8),
        [BPF_REG_6] = offsetof(struct pt_regs, bx),
        [BPF_REG_7] = offsetof(struct pt_regs, r13),
        [BPF_REG_8] = offsetof(struct pt_regs, r14),
        [BPF_REG_9] = offsetof(struct pt_regs, r15),
};

/*
 * is_ereg() - does BPF register 'reg' live in one of x86-64 r8..r15?
 *
 * Those registers need an extra byte of encoding, whereas
 * rax, rcx, ..., rbp have the simpler encoding.
 */
static bool is_ereg(u32 reg)
{
        const unsigned long extended_regs = BIT(BPF_REG_5) |
                                            BIT(AUX_REG) |
                                            BIT(BPF_REG_7) |
                                            BIT(BPF_REG_8) |
                                            BIT(BPF_REG_9) |
                                            BIT(X86_REG_R9) |
                                            BIT(X86_REG_R12) |
                                            BIT(BPF_REG_AX);

        return (1 << reg) & extended_regs;
}

/*
 * is_ereg_8l() - does an 8-bit access to BPF register 'reg' need the extra
 * encoding byte?
 *
 * That is the case for dil, sil, bpl, spl and r8b..r15b; al, cl, dl and bl
 * have the simpler encoding.
 */
static bool is_ereg_8l(u32 reg)
{
        const unsigned long low8_prefixed_regs = BIT(BPF_REG_1) |
                                                 BIT(BPF_REG_2) |
                                                 BIT(BPF_REG_FP);

        /* Everything in r8..r15 qualifies outright. */
        if (is_ereg(reg))
                return true;

        return (1 << reg) & low8_prefixed_regs;
}

/* True when 'reg' is BPF_REG_0, the BPF register that reg2hex[] maps to RAX. */
static bool is_axreg(u32 reg)
{
        const bool maps_to_rax = (reg == BPF_REG_0);

        return maps_to_rax;
}

/*
 * Set bit 0 of prefix 'byte' when 'reg' maps to one of x86-64 R8..R15;
 * otherwise hand 'byte' back untouched.
 */
static u8 add_1mod(u8 byte, u32 reg)
{
        return is_ereg(reg) ? (byte | 1) : byte;
}

/*
 * Two-register variant of add_1mod(): an R8..R15 register in 'r1' sets
 * bit 0 of prefix 'byte', one in 'r2' sets bit 2.
 */
static u8 add_2mod(u8 byte, u32 r1, u32 r2)
{
        u8 prefix = byte;

        prefix |= is_ereg(r1) ? 1 : 0;
        prefix |= is_ereg(r2) ? 4 : 0;

        return prefix;
}

/*
 * Three-register variant of add_1mod(): an R8..R15 register in 'r1' sets
 * bit 0 of prefix 'byte', one in 'index' sets bit 1 and one in 'r2' sets
 * bit 2.
 */
static u8 add_3mod(u8 byte, u32 r1, u32 r2, u32 index)
{
        u8 prefix = byte;

        prefix |= is_ereg(r1) ? 1 : 0;
        prefix |= is_ereg(index) ? 2 : 0;
        prefix |= is_ereg(r2) ? 4 : 0;

        return prefix;
}

/*
 * Fold the 3-bit x86-64 encoding of 'dst_reg' (looked up in reg2hex[])
 * into opcode 'byte' by adding it.
 */
static u8 add_1reg(u8 byte, u32 dst_reg)
{
        const int dst_bits = reg2hex[dst_reg];

        return byte + dst_bits;
}

/*
 * Fold the 3-bit x86-64 encodings of both registers into opcode 'byte':
 * 'dst_reg' goes into bits 0-2 and 'src_reg' into bits 3-5.
 */
static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
{
        const int dst_bits = reg2hex[dst_reg];
        const int src_bits = reg2hex[src_reg] << 3;

        return byte + dst_bits + src_bits;
}

/*
 * Some 1-byte opcodes for binary ALU operations, indexed by the BPF ALU
 * operation code.
 *
 * NOTE(review): the ADD/SUB/AND/OR/XOR values look like x86 "op r/m, r"
 * opcodes, while the LSH/RSH/ARSH values look like ModRM bytes selecting the
 * shift kind - confirm against the users of this table, which are outside
 * this part of the file.
 */
static u8 simple_alu_opcodes[] = {
        [BPF_ADD] = 0x01,
        [BPF_SUB] = 0x29,
        [BPF_AND] = 0x21,
        [BPF_OR] = 0x09,
        [BPF_XOR] = 0x31,
        [BPF_LSH] = 0xE0,
        [BPF_RSH] = 0xE8,
        [BPF_ARSH] = 0xF8,
};

/* Overwrite 'size' bytes starting at 'area' with INT3 (0xcc) instructions. */
static void jit_fill_hole(void *area, unsigned int size)
{
        const int int3_opcode = 0xcc;

        memset(area, int3_opcode, size);
}

/*
 * Overwrite 'len' bytes of text at 'dst' with 0xcc via text_poke_set().
 *
 * Returns 0 when text_poke_set() hands back a valid pointer and nonzero
 * when it hands back NULL or an error pointer.
 */
int bpf_arch_text_invalidate(void *dst, size_t len)
{
        void *poked = text_poke_set(dst, 0xcc, len);

        return IS_ERR_OR_NULL(poked);
}

/* Per-program bookkeeping that the JIT carries from one pass to the next. */
struct jit_context {
        int cleanup_addr; /* Epilogue code offset */

        /*
         * Program specific offsets of labels in the code; these rely on the
         * JIT doing at least 2 passes, recording the position on the first
         * pass, only to generate the correct offset on the second pass.
         *
         * Each one is the distance from the start of the emitted tail call
         * sequence to its "out:" label, as stored by
         * emit_bpf_tail_call_direct() / emit_bpf_tail_call_indirect().
         */
        int tail_call_direct_label;
        int tail_call_indirect_label;
};

/* Maximum number of bytes emitted while JITing one eBPF insn */
#define BPF_MAX_INSN_SIZE        128
/* NOTE(review): presumably extra slack on top of the above; users are not in this part of the file */
#define BPF_INSN_SAFETY                64

/*
 * Number of bytes emit_patch() needs to generate instructions:
 * one opcode byte followed by a 32-bit relative offset.
 */
#define X86_PATCH_SIZE                5
/*
 * Number of bytes that will be skipped on tailcall.
 *
 * In emit_prologue() the 11 corresponds to 5 (patchable nops) +
 * 2 (xor eax, eax or nop2) + 1 (push rbp) + 3 (mov rbp, rsp);
 * ENDBR_INSN_SIZE accounts for the ENDBR at the very start.
 */
#define X86_TAIL_CALL_OFFSET        (11 + ENDBR_INSN_SIZE)

/* Emit 'push r12' at *pprog and advance the cursor past it. */
static void push_r12(u8 **pprog)
{
        u8 *prog = *pprog;

        EMIT2(0x41, 0x54);   /* push r12 */
        *pprog = prog;
}

/*
 * Emit pushes for the callee-saved registers flagged in callee_regs_used[]:
 * index 0 = rbx, 1 = r13, 2 = r14, 3 = r15 (the registers that back
 * BPF_REG_6..BPF_REG_9 in reg2hex[]). Unflagged registers emit nothing.
 *
 * pop_callee_regs() must be given the same flags; it pops in the reverse
 * order.
 */
static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
{
        u8 *prog = *pprog;

        if (callee_regs_used[0])
                EMIT1(0x53);         /* push rbx */
        if (callee_regs_used[1])
                EMIT2(0x41, 0x55);   /* push r13 */
        if (callee_regs_used[2])
                EMIT2(0x41, 0x56);   /* push r14 */
        if (callee_regs_used[3])
                EMIT2(0x41, 0x57);   /* push r15 */
        *pprog = prog;
}

/* Emit 'pop r12' at *pprog and advance the cursor past it. */
static void pop_r12(u8 **pprog)
{
        u8 *prog = *pprog;

        EMIT2(0x41, 0x5C);   /* pop r12 */
        *pprog = prog;
}

/*
 * Emit pops for the callee-saved registers flagged in callee_regs_used[]
 * (index 0 = rbx, 1 = r13, 2 = r14, 3 = r15).
 *
 * The registers are popped in the opposite order to push_callee_regs(),
 * so the two must be called with the same flags to keep the stack balanced.
 */
static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
{
        u8 *prog = *pprog;

        if (callee_regs_used[3])
                EMIT2(0x41, 0x5F);   /* pop r15 */
        if (callee_regs_used[2])
                EMIT2(0x41, 0x5E);   /* pop r14 */
        if (callee_regs_used[1])
                EMIT2(0x41, 0x5D);   /* pop r13 */
        if (callee_regs_used[0])
                EMIT1(0x5B);         /* pop rbx */
        *pprog = prog;
}

/*
 * Emit 'len' bytes worth of NOPs at *pprog.
 *
 * The padding is built from the x86_nops[] table, using the longest
 * available NOP (at most ASM_NOP_MAX bytes) for each chunk until 'len'
 * bytes have been written. A 'len' of zero or less emits nothing.
 */
static void emit_nops(u8 **pprog, int len)
{
        u8 *prog = *pprog;
        int remaining, chunk, i;

        for (remaining = len; remaining > 0; remaining -= chunk) {
                chunk = remaining > ASM_NOP_MAX ? ASM_NOP_MAX : remaining;

                for (i = 0; i < chunk; i++)
                        EMIT1(x86_nops[chunk][i]);
        }

        *pprog = prog;
}

/*
 * Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT
 * in arch/x86/kernel/alternative.c
 */

/*
 * Emit the FineIBT preamble for a program whose type hash is 'hash':
 * subtract the hash from r10d and fall into ud2 unless the result is zero.
 *
 * The jz displacement of 7 skips ud2 (2 bytes), nop (1 byte) and the
 * 4-byte poisoned ENDBR, landing right behind this preamble.
 */
static void emit_fineibt(u8 **pprog, u32 hash)
{
        u8 *prog = *pprog;

        EMIT_ENDBR();
        EMIT3_off32(0x41, 0x81, 0xea, hash);                /* subl $hash, %r10d        */
        EMIT2(0x74, 0x07);                                /* jz.d8 +7                */
        EMIT2(0x0f, 0x0b);                                /* ud2                        */
        EMIT1(0x90);                                        /* nop                        */
        EMIT_ENDBR_POISON();

        *pprog = prog;
}

/*
 * Emit the kCFI preamble for a program whose type hash is 'hash': a
 * 'movl $hash, %eax' carrying the hash, then (with CONFIG_CALL_PADDING)
 * eleven single-byte nops, then ENDBR.
 */
static void emit_kcfi(u8 **pprog, u32 hash)
{
        u8 *prog = *pprog;

        EMIT1_off32(0xb8, hash);        /* movl $hash, %eax */
#ifdef CONFIG_CALL_PADDING
        {
                int pad;

                /* 5-byte mov + 11 nops = 16 bytes in front of the ENDBR */
                for (pad = 0; pad < 11; pad++)
                        EMIT1(0x90);
        }
#endif
        EMIT_ENDBR();

        *pprog = prog;
}

/*
 * Emit the CFI preamble selected by the global cfi_mode: the FineIBT or
 * kCFI sequence for 'hash', or a bare ENDBR for every other mode.
 */
static void emit_cfi(u8 **pprog, u32 hash)
{
        u8 *prog = *pprog;

        if (cfi_mode == CFI_FINEIBT) {
                emit_fineibt(&prog, hash);
        } else if (cfi_mode == CFI_KCFI) {
                emit_kcfi(&prog, hash);
        } else {
                EMIT_ENDBR();
        }

        *pprog = prog;
}

/*
 * Emit x86-64 prologue code for BPF program.
 * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
 * while jumping to another program
 *
 * @pprog:               in/out cursor into the image being generated
 * @stack_depth:         BPF stack size; rounded up to 8 and reserved on rsp
 * @ebpf_from_cbpf:      when set, the 2-byte tail_call_cnt init/nop2 slot
 *                       is left out
 * @tail_call_reachable: when set, rax is pushed at the end of the prologue
 * @is_subprog:          selects the subprog CFI hash and suppresses the
 *                       zeroing of rax
 * @is_exception_cb:     adopt the stack and frame pointer passed in rsi/rdx
 *                       instead of building a new frame
 */
static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
                          bool tail_call_reachable, bool is_subprog,
                          bool is_exception_cb)
{
        u8 *prog = *pprog;

        emit_cfi(&prog, is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash);
        /* BPF trampoline can be made to work without these nops,
         * but let's waste 5 bytes for now and optimize later
         */
        emit_nops(&prog, X86_PATCH_SIZE);
        if (!ebpf_from_cbpf) {
                if (tail_call_reachable && !is_subprog)
                        /* When it's the entry of the whole tailcall context,
                         * zeroing rax means initialising tail_call_cnt.
                         */
                        EMIT2(0x31, 0xC0); /* xor eax, eax */
                else
                        /* Keep the same instruction layout. */
                        EMIT2(0x66, 0x90); /* nop2 */
        }
        /* Exception callback receives FP as third parameter */
        if (is_exception_cb) {
                EMIT3(0x48, 0x89, 0xF4); /* mov rsp, rsi */
                EMIT3(0x48, 0x89, 0xD5); /* mov rbp, rdx */
                /* The main frame must have exception_boundary as true, so we
                 * first restore those callee-saved regs from stack, before
                 * reusing the stack frame.
                 */
                pop_callee_regs(&prog, all_callee_regs_used);
                pop_r12(&prog);
                /* Reset the stack frame. */
                EMIT3(0x48, 0x89, 0xEC); /* mov rsp, rbp */
        } else {
                EMIT1(0x55);             /* push rbp */
                EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
        }

        /* X86_TAIL_CALL_OFFSET is here */
        EMIT_ENDBR();

        /* sub rsp, rounded_stack_depth */
        if (stack_depth)
                EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8));
        /* rax carries tail_call_cnt; park it just below the BPF stack */
        if (tail_call_reachable)
                EMIT1(0x50);         /* push rax */
        *pprog = prog;
}

/*
 * Emit a 5-byte 'opcode rel32' instruction at *pprog that transfers
 * control from address 'ip' to 'func'.
 *
 * Returns 0 on success. If the distance does not fit a signed 32-bit
 * displacement, nothing is emitted, an error is logged and -ERANGE is
 * returned.
 */
static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode)
{
        /* The displacement is relative to the end of the instruction. */
        s64 rel = func - (ip + X86_PATCH_SIZE);
        u8 *prog;

        if (!is_simm32(rel)) {
                pr_err("Target call %p is out of range\n", func);
                return -ERANGE;
        }

        prog = *pprog;
        EMIT1_off32(opcode, rel);
        *pprog = prog;

        return 0;
}

/*
 * Emit 'call rel32' (opcode 0xE8) from address 'ip' to 'func'.
 * Returns whatever emit_patch() returns.
 */
static int emit_call(u8 **pprog, void *func, void *ip)
{
        const u8 call_rel32 = 0xE8;

        return emit_patch(pprog, func, ip, call_rel32);
}

/*
 * Like emit_call(), but first lets x86_call_depth_emit_accounting() emit
 * its code at *pprog. 'ip' is advanced by that function's return value
 * before the call is emitted.
 *
 * NOTE(review): that return value is presumably the number of bytes the
 * accounting code occupies - confirm against its definition.
 */
static int emit_rsb_call(u8 **pprog, void *func, void *ip)
{
        OPTIMIZER_HIDE_VAR(func);
        ip += x86_call_depth_emit_accounting(pprog, func, ip);
        return emit_patch(pprog, func, ip, 0xE8);
}

/*
 * Emit 'jmp rel32' (opcode 0xE9) from address 'ip' to 'func'.
 * Returns whatever emit_patch() returns.
 */
static int emit_jump(u8 **pprog, void *func, void *ip)
{
        const u8 jmp_rel32 = 0xE9;

        return emit_patch(pprog, func, ip, jmp_rel32);
}

/*
 * Replace the 5-byte instruction at 'ip'.
 *
 * 'old_addr' and 'new_addr' describe the call (BPF_MOD_CALL) or jump
 * (any other type) expected at 'ip' before and wanted after the patch.
 * NULL stands for the 5-byte nop.
 *
 * Returns:
 *   0        the site was rewritten
 *   1        the site already held the wanted instruction
 *   -EBUSY   the site did not hold the expected old instruction
 *   other    error from emit_call()/emit_jump() while building either
 *            instruction; nothing was patched
 */
static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
                                void *old_addr, void *new_addr)
{
        const bool want_call = (t == BPF_MOD_CALL);
        const u8 *nop5 = x86_nops[5];
        u8 expected[X86_PATCH_SIZE];
        u8 wanted[X86_PATCH_SIZE];
        u8 *prog;
        int ret;

        /* Instruction that must currently be at the site. */
        memcpy(expected, nop5, X86_PATCH_SIZE);
        if (old_addr) {
                prog = expected;
                if (want_call)
                        ret = emit_call(&prog, old_addr, ip);
                else
                        ret = emit_jump(&prog, old_addr, ip);
                if (ret)
                        return ret;
        }

        /* Instruction that should be there afterwards. */
        memcpy(wanted, nop5, X86_PATCH_SIZE);
        if (new_addr) {
                prog = wanted;
                if (want_call)
                        ret = emit_call(&prog, new_addr, ip);
                else
                        ret = emit_jump(&prog, new_addr, ip);
                if (ret)
                        return ret;
        }

        mutex_lock(&text_mutex);
        if (memcmp(ip, expected, X86_PATCH_SIZE)) {
                ret = -EBUSY;
        } else if (memcmp(ip, wanted, X86_PATCH_SIZE)) {
                text_poke_bp(ip, wanted, X86_PATCH_SIZE, NULL);
                ret = 0;
        } else {
                /* Nothing to do, the site is already up to date. */
                ret = 1;
        }
        mutex_unlock(&text_mutex);

        return ret;
}

/*
 * Validate 'ip' and patch the call/jump site there, see
 * __bpf_arch_text_poke() for the meaning of the remaining arguments and
 * of the return value.
 *
 * Returns -EINVAL when 'ip' is neither kernel text nor BPF text.
 */
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
                       void *old_addr, void *new_addr)
{
        const long addr = (long)ip;

        /* BPF poking in modules is not supported */
        if (!(is_kernel_text(addr) || is_bpf_text_address(addr)))
                return -EINVAL;

        /*
         * On IBT builds emit_prologue() places an ENDBR instruction in
         * front of the trampoline hook; step over it if present.
         */
        if (is_endbr(*(u32 *)ip))
                ip += ENDBR_INSN_SIZE;

        return __bpf_arch_text_poke(ip, t, old_addr, new_addr);
}

/* Emit the 3-byte lfence instruction (0F AE E8) at the local cursor 'prog' */
#define EMIT_LFENCE()        EMIT3(0x0F, 0xAE, 0xE8)

/*
 * Emit an indirect jump through x86 register number 'reg' (the caller in
 * this file passes 1 for rcx). 'ip' is the address the emitted code will
 * have.
 *
 * Depending on the enabled CPU features this is one of:
 *  - lfence followed by a plain 'jmp *reg',
 *  - a direct jump to the retpoline thunk for 'reg',
 *  - a plain 'jmp *reg', followed by int3 when retpoline or SLS
 *    mitigations are configured in.
 *
 * The return value of emit_jump() is not checked here.
 */
static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
{
        u8 *prog = *pprog;

        if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
                EMIT_LFENCE();
                EMIT2(0xFF, 0xE0 + reg);
        } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) {
                OPTIMIZER_HIDE_VAR(reg);
                if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
                        emit_jump(&prog, &__x86_indirect_jump_thunk_array[reg], ip);
                else
                        emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
        } else {
                EMIT2(0xFF, 0xE0 + reg);        /* jmp *%\reg */
                if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || IS_ENABLED(CONFIG_MITIGATION_SLS))
                        EMIT1(0xCC);                /* int3 */
        }

        *pprog = prog;
}

/*
 * Emit the function return at *pprog; 'ip' is the address the emitted code
 * will have.
 *
 * With X86_FEATURE_RETHUNK this is a direct jump to x86_return_thunk,
 * otherwise a plain 'ret', followed by int3 when SLS mitigation is
 * configured in. The return value of emit_jump() is not checked here.
 */
static void emit_return(u8 **pprog, u8 *ip)
{
        u8 *prog = *pprog;

        if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
                emit_jump(&prog, x86_return_thunk, ip);
        } else {
                EMIT1(0xC3);                /* ret */
                if (IS_ENABLED(CONFIG_MITIGATION_SLS))
                        EMIT1(0xCC);        /* int3 */
        }

        *pprog = prog;
}

/*
 * Generate the following code:
 *
 * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
 *   if (index >= array->map.max_entries)
 *     goto out;
 *   if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
 *     goto out;
 *   prog = array->ptrs[index];
 *   if (prog == NULL)
 *     goto out;
 *   goto *(prog->bpf_func + prologue_size);
 * out:
 *
 * @bpf_prog:         program being JITed (consulted for exception_boundary
 *                    and arena use)
 * @pprog:            in/out cursor into the image
 * @callee_regs_used: which callee-saved registers the prologue pushed
 * @stack_depth:      BPF stack size, as passed to emit_prologue()
 * @ip:               address the first emitted byte will have
 * @ctx:              supplies the offset of "out:" recorded by the previous
 *                    pass and receives the offset measured in this pass
 *
 * The forward jumps to "out:" are short (8-bit displacement) jumps whose
 * distance comes from the previous pass, so they are only correct once the
 * layout of this sequence has stopped changing between passes.
 */
static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
                                        u8 **pprog, bool *callee_regs_used,
                                        u32 stack_depth, u8 *ip,
                                        struct jit_context *ctx)
{
        /* tail_call_cnt slot, addressed relative to rbp */
        int tcc_off = -4 - round_up(stack_depth, 8);
        u8 *prog = *pprog, *start = *pprog;
        int offset;

        /*
         * rdi - pointer to ctx
         * rsi - pointer to bpf_array
         * rdx - index in bpf_array
         */

        /*
         * if (index >= array->map.max_entries)
         *        goto out;
         */
        EMIT2(0x89, 0xD2);                        /* mov edx, edx */
        EMIT3(0x39, 0x56,                         /* cmp dword ptr [rsi + 16], edx */
              offsetof(struct bpf_array, map.max_entries));

        /*
         * A short jump is 2 bytes long and its displacement counts from the
         * end of the jump, hence 'prog + 2'.
         */
        offset = ctx->tail_call_indirect_label - (prog + 2 - start);
        EMIT2(X86_JBE, offset);                   /* jbe out */

        /*
         * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
         *        goto out;
         */
        EMIT2_off32(0x8B, 0x85, tcc_off);         /* mov eax, dword ptr [rbp - tcc_off] */
        EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);     /* cmp eax, MAX_TAIL_CALL_CNT */

        offset = ctx->tail_call_indirect_label - (prog + 2 - start);
        EMIT2(X86_JAE, offset);                   /* jae out */
        EMIT3(0x83, 0xC0, 0x01);                  /* add eax, 1 */
        EMIT2_off32(0x89, 0x85, tcc_off);         /* mov dword ptr [rbp - tcc_off], eax */

        /* prog = array->ptrs[index]; */
        EMIT4_off32(0x48, 0x8B, 0x8C, 0xD6,       /* mov rcx, [rsi + rdx * 8 + offsetof(...)] */
                    offsetof(struct bpf_array, ptrs));

        /*
         * if (prog == NULL)
         *        goto out;
         */
        EMIT3(0x48, 0x85, 0xC9);                  /* test rcx,rcx */

        offset = ctx->tail_call_indirect_label - (prog + 2 - start);
        EMIT2(X86_JE, offset);                    /* je out */

        /*
         * Point of no return: unwind what the prologue pushed (callee-saved
         * registers, r12 where it is in use, then tail_call_cnt into rax)
         * and release the BPF stack before jumping away.
         */
        if (bpf_prog->aux->exception_boundary) {
                pop_callee_regs(&prog, all_callee_regs_used);
                pop_r12(&prog);
        } else {
                pop_callee_regs(&prog, callee_regs_used);
                if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena))
                        pop_r12(&prog);
        }

        EMIT1(0x58);                              /* pop rax */
        if (stack_depth)
                EMIT3_off32(0x48, 0x81, 0xC4,     /* add rsp, sd */
                            round_up(stack_depth, 8));

        /* goto *(prog->bpf_func + X86_TAIL_CALL_OFFSET); */
        EMIT4(0x48, 0x8B, 0x49,                   /* mov rcx, qword ptr [rcx + 32] */
              offsetof(struct bpf_prog, bpf_func));
        EMIT4(0x48, 0x83, 0xC1,                   /* add rcx, X86_TAIL_CALL_OFFSET */
              X86_TAIL_CALL_OFFSET);
        /*
         * Now we're ready to jump into next BPF program
         * rdi == ctx (1st arg)
         * rcx == prog->bpf_func + X86_TAIL_CALL_OFFSET
         */
        emit_indirect_jump(&prog, 1 /* rcx */, ip + (prog - start));

        /* out: */
        ctx->tail_call_indirect_label = prog - start;
        *pprog = prog;
}

/*
 * Emit a tail call whose target is patched in later through 'poke'.
 *
 * Emitted layout:
 *   tail_call_cnt check and increment (jae out)
 *   jmp out                  <- poke->tailcall_bypass
 *   pops / stack release, undoing the prologue
 *   5-byte nop               <- poke->tailcall_target
 * out:                       <- poke->bypass_addr
 *
 * The addresses stored in 'poke' are computed from the "out:" offset
 * recorded in 'ctx' by the previous pass, so they are only accurate once
 * the layout has stopped changing between passes. The fresh offset is
 * written back to 'ctx' at the end.
 *
 * @ip is the address the first emitted byte will have. The remaining
 * parameters have the same meaning as for emit_bpf_tail_call_indirect().
 */
static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
                                      struct bpf_jit_poke_descriptor *poke,
                                      u8 **pprog, u8 *ip,
                                      bool *callee_regs_used, u32 stack_depth,
                                      struct jit_context *ctx)
{
        /* tail_call_cnt slot, addressed relative to rbp */
        int tcc_off = -4 - round_up(stack_depth, 8);
        u8 *prog = *pprog, *start = *pprog;
        int offset;

        /*
         * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
         *        goto out;
         */
        EMIT2_off32(0x8B, 0x85, tcc_off);             /* mov eax, dword ptr [rbp - tcc_off] */
        EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);         /* cmp eax, MAX_TAIL_CALL_CNT */

        /* 2-byte short jump; displacement counts from the end of the jump */
        offset = ctx->tail_call_direct_label - (prog + 2 - start);
        EMIT2(X86_JAE, offset);                       /* jae out */
        EMIT3(0x83, 0xC0, 0x01);                      /* add eax, 1 */
        EMIT2_off32(0x89, 0x85, tcc_off);             /* mov dword ptr [rbp - tcc_off], eax */

        poke->tailcall_bypass = ip + (prog - start);
        poke->adj_off = X86_TAIL_CALL_OFFSET;
        poke->tailcall_target = ip + ctx->tail_call_direct_label - X86_PATCH_SIZE;
        poke->bypass_addr = (u8 *)poke->tailcall_target + X86_PATCH_SIZE;

        /* Until a target is patched in, jump straight over the sequence. */
        emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE,
                  poke->tailcall_bypass);

        if (bpf_prog->aux->exception_boundary) {
                pop_callee_regs(&prog, all_callee_regs_used);
                pop_r12(&prog);
        } else {
                pop_callee_regs(&prog, callee_regs_used);
                if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena))
                        pop_r12(&prog);
        }

        EMIT1(0x58);                                  /* pop rax */
        if (stack_depth)
                EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));

        /* Placeholder that gets patched into the jump to the target. */
        emit_nops(&prog, X86_PATCH_SIZE);

        /* out: */
        ctx->tail_call_direct_label = prog - start;

        *pprog = prog;
}

/*
 * Walk the poke table of 'prog' and finalize its direct tail call sites.
 *
 * Entries owned by another program (poke->aux set and different from
 * prog->aux) and entries whose reason is not BPF_POKE_REASON_TAIL_CALL are
 * skipped. For each remaining entry, under the map's poke_mutex:
 *  - if the map slot already holds a program, the nop at tailcall_target
 *    is patched into a jump to that program (bpf_func + adj_off) and the
 *    bypass jump at tailcall_bypass is patched into a nop;
 *  - the entry is then marked tailcall_target_stable.
 *
 * A negative return from __bpf_arch_text_poke() is treated as fatal.
 */
static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
{
        struct bpf_jit_poke_descriptor *poke;
        struct bpf_array *array;
        struct bpf_prog *target;
        int i, ret;

        for (i = 0; i < prog->aux->size_poke_tab; i++) {
                poke = &prog->aux->poke_tab[i];
                if (poke->aux && poke->aux != prog->aux)
                        continue;

                /* An entry reaching this point is expected not to be stable yet. */
                WARN_ON_ONCE(READ_ONCE(poke->tailcall_target_stable));

                if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
                        continue;

                array = container_of(poke->tail_call.map, struct bpf_array, map);
                mutex_lock(&array->aux->poke_mutex);
                target = array->ptrs[poke->tail_call.key];
                if (target) {
                        /* nop -> jmp target */
                        ret = __bpf_arch_text_poke(poke->tailcall_target,
                                                   BPF_MOD_JUMP, NULL,
                                                   (u8 *)target->bpf_func +
                                                   poke->adj_off);
                        BUG_ON(ret < 0);
                        /* jmp out -> nop, so execution reaches the jump above */
                        ret = __bpf_arch_text_poke(poke->tailcall_bypass,
                                                   BPF_MOD_JUMP,
                                                   (u8 *)poke->tailcall_target +
                                                   X86_PATCH_SIZE, NULL);
                        BUG_ON(ret < 0);
                }
                WRITE_ONCE(poke->tailcall_target_stable, true);
                mutex_unlock(&array->aux->poke_mutex);
        }
}

/*
 * Load the 32-bit immediate @imm32 into @dst_reg using the shortest of
 * three encodings:
 *
 *  - @sign_propagate set and imm32 negative: 'mov %rax, imm32', which
 *    sign-extends imm32 into the full register;
 *  - imm32 == 0: 'xor %eax, %eax', saving 3 bytes over a mov;
 *  - otherwise: 'mov %eax, imm32', which zero-extends imm32 and saves
 *    2 bytes over the sign-extending form.
 */
static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
                           u32 dst_reg, const u32 imm32)
{
        u8 *prog = *pprog;

        if (sign_propagate && (s32)imm32 < 0) {
                /* 'mov %rax, imm32' sign extends imm32 */
                EMIT3_off32(add_1mod(0x48, dst_reg), 0xC7,
                            add_1reg(0xC0, dst_reg), imm32);
        } else if (imm32 == 0) {
                /* xor %eax, %eax; prefix byte only for extended registers */
                if (is_ereg(dst_reg))
                        EMIT1(add_2mod(0x40, dst_reg, dst_reg));
                EMIT2(0x31, add_2reg(0xC0, dst_reg, dst_reg));
        } else {
                /* mov %eax, imm32 */
                if (is_ereg(dst_reg))
                        EMIT1(add_1mod(0x40, dst_reg));
                EMIT1_off32(add_1reg(0xB8, dst_reg), imm32);
        }

        *pprog = prog;
}

/*
 * Load the 64-bit immediate (@imm32_hi:@imm32_lo) into @dst_reg.
 *
 * Values that fit an unsigned or signed 32-bit immediate are delegated to
 * emit_mov_imm32(); sign propagation is requested only when the value does
 * not fit the unsigned form. Everything else uses a full 'movabsq'.
 */
static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
                           const u32 imm32_hi, const u32 imm32_lo)
{
        const u64 imm64 = ((u64)imm32_hi << 32) | (u32)imm32_lo;
        const bool fits_u32 = is_uimm32(imm64);
        u8 *prog = *pprog;

        if (fits_u32 || is_simm32(imm64)) {
                /*
                 * For a plain u32 the sign bit must not be propagated.
                 * LLVM tends to load such values as imm64, so the shorter
                 * 'mov %eax, imm32' saves a couple of bytes. Otherwise the
                 * sign-extending mov32 covers the signed 32-bit range.
                 */
                emit_mov_imm32(&prog, !fits_u32, dst_reg, imm32_lo);
        } else {
                /* movabsq rax, imm64 */
                EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
                EMIT(imm32_lo, 4);
                EMIT(imm32_hi, 4);
        }

        *pprog = prog;
}

/*
 * Emit a register-to-register move, dst_reg = src_reg, either as a 64-bit
 * move (@is64) or as a 32-bit move.
 */
static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
{
        u8 *prog = *pprog;

        if (!is64) {
                /* mov32 dst, src; prefix byte only for extended registers */
                if (is_ereg(dst_reg) || is_ereg(src_reg))
                        EMIT1(add_2mod(0x40, dst_reg, src_reg));
                EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg));
        } else {
                /* mov dst, src */
                EMIT_mov(dst_reg, src_reg);
        }

        *pprog = prog;
}

/*
 * Emit a sign-extending register move: the low @num_bits of @src_reg are
 * sign-extended into @dst_reg, as a 64-bit (@is64) or 32-bit destination.
 *
 * Supported widths are 8/16/32 for 64-bit and 8/16 for 32-bit; any other
 * @num_bits emits nothing.
 */
static void emit_movsx_reg(u8 **pprog, int num_bits, bool is64, u32 dst_reg,
                           u32 src_reg)
{
        u8 *prog = *pprog;
        const u8 modrm = add_2reg(0xC0, src_reg, dst_reg);

        if (is64) {
                /* movs[b,w,l]q dst, src */
                switch (num_bits) {
                case 8:
                        EMIT4(add_2mod(0x48, src_reg, dst_reg), 0x0f, 0xbe,
                              modrm);
                        break;
                case 16:
                        EMIT4(add_2mod(0x48, src_reg, dst_reg), 0x0f, 0xbf,
                              modrm);
                        break;
                case 32:
                        EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x63, modrm);
                        break;
                default:
                        break;
                }
        } else {
                /* movs[b,w]l dst, src */
                switch (num_bits) {
                case 8:
                        /* Prefix byte is emitted unconditionally here. */
                        EMIT4(add_2mod(0x40, src_reg, dst_reg), 0x0f, 0xbe,
                              modrm);
                        break;
                case 16:
                        if (is_ereg(dst_reg) || is_ereg(src_reg))
                                EMIT1(add_2mod(0x40, src_reg, dst_reg));
                        /*
                         * NOTE(review): 0x0f is the opcode escape byte, not
                         * a prefix; routing it through add_2mod() looks
                         * unintended. Presumably a no-op because the bits
                         * add_2mod() may set are already set in 0x0f -
                         * confirm against add_2mod().
                         */
                        EMIT3(add_2mod(0x0f, src_reg, dst_reg), 0xbf, modrm);
                        break;
                default:
                        break;
                }
        }

        *pprog = prog;
}

/*
 * Emit the addressing suffix (ModR/M byte followed by the displacement) for
 * an instruction that operates on *(ptr_reg + off) and val_reg.
 */
static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
{
        u8 *prog = *pprog;

        if (!is_imm8(off)) {
                /* 4-byte signed displacement */
                EMIT1_off32(add_2reg(0x80, ptr_reg, val_reg), off);
        } else {
                /*
                 * 1-byte signed displacement.
                 *
                 * With off == 0 the displacement byte could be dropped
                 * entirely, but x86 R13 always needs an offset and that
                 * special case is not worth the hassle.
                 */
                EMIT2(add_2reg(0x40, ptr_reg, val_reg), off);
        }

        *pprog = prog;
}

/*
 * SIB variant of emit_insn_suffix(): emit ModR/M, SIB byte and displacement
 * for an instruction that operates on *(ptr_reg + index_reg + off) and
 * val_reg. The displacement is 1 byte when @off fits imm8, 4 bytes otherwise.
 */
static void emit_insn_suffix_SIB(u8 **pprog, u32 ptr_reg, u32 val_reg, u32 index_reg, int off)
{
        u8 *prog = *pprog;
        const u8 sib = add_2reg(0, ptr_reg, index_reg); /* SIB */

        if (is_imm8(off))
                EMIT3(add_2reg(0x44, BPF_REG_0, val_reg), sib, off);
        else
                EMIT2_off32(add_2reg(0x84, BPF_REG_0, val_reg), sib, off);

        *pprog = prog;
}

/*
 * Emit a REX byte when one is needed to address @dst_reg / @src_reg:
 * always for 64-bit operations (@is64), and for 32-bit operations only
 * when at least one of the registers is an extended register.
 */
static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64)
{
        u8 *prog = *pprog;

        if (is64 || is_ereg(dst_reg) || is_ereg(src_reg))
                EMIT1(add_2mod(is64 ? 0x48 : 0x40, dst_reg, src_reg));

        *pprog = prog;
}

/*
 * Single-register counterpart of maybe_emit_mod(): emit a REX byte for
 * @reg when the operation is 64-bit (@is64) or @reg is an extended register.
 */
static void maybe_emit_1mod(u8 **pprog, u32 reg, bool is64)
{
        u8 *prog = *pprog;

        if (is64 || is_ereg(reg))
                EMIT1(add_1mod(is64 ? 0x48 : 0x40, reg));

        *pprog = prog;
}

/*
 * LDX: dst_reg = *(u8/u16/u32/u64 *)(src_reg + off)
 *
 * @size selects the access width (BPF_B, BPF_H, BPF_W or BPF_DW). An
 * unrecognised @size emits only the addressing suffix.
 */
static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
        u8 *prog = *pprog;
        const u8 rex_w = add_2mod(0x48, src_reg, dst_reg);

        switch (size) {
        case BPF_DW:
                /* mov rax, qword ptr [rax + off] */
                EMIT2(rex_w, 0x8B);
                break;
        case BPF_W:
                /* mov eax, dword ptr [rax + off] */
                if (!is_ereg(dst_reg) && !is_ereg(src_reg))
                        EMIT1(0x8B);
                else
                        EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
                break;
        case BPF_H:
                /* movzx rax, word ptr [rax + off] */
                EMIT3(rex_w, 0x0F, 0xB7);
                break;
        case BPF_B:
                /* movzx rax, byte ptr [rax + off] */
                EMIT3(rex_w, 0x0F, 0xB6);
                break;
        }

        emit_insn_suffix(&prog, src_reg, dst_reg, off);
        *pprog = prog;
}

/*
 * LDSX: dst_reg = *(s8/s16/s32 *)(src_reg + off), sign-extended
 *
 * @size selects the access width (BPF_B, BPF_H or BPF_W). An unrecognised
 * @size emits only the addressing suffix.
 */
static void emit_ldsx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
        u8 *prog = *pprog;
        const u8 rex_w = add_2mod(0x48, src_reg, dst_reg);

        switch (size) {
        case BPF_W:
                /* movsx rax, dword ptr [rax + off] */
                EMIT2(rex_w, 0x63);
                break;
        case BPF_H:
                /* movsx rax, word ptr [rax + off] */
                EMIT3(rex_w, 0x0F, 0xBF);
                break;
        case BPF_B:
                /* movsx rax, byte ptr [rax + off] */
                EMIT3(rex_w, 0x0F, 0xBE);
                break;
        }

        emit_insn_suffix(&prog, src_reg, dst_reg, off);
        *pprog = prog;
}

/*
 * LDX with index register: dst_reg = *(size *)(src_reg + index_reg + off)
 *
 * @size selects the access width (BPF_B, BPF_H, BPF_W or BPF_DW). An
 * unrecognised @size emits only the SIB addressing suffix.
 */
static void emit_ldx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off)
{
        u8 *prog = *pprog;
        /* Only the qword form uses the 0x48 prefix base. */
        const u8 rex = add_3mod(size == BPF_DW ? 0x48 : 0x40,
                                src_reg, dst_reg, index_reg);

        switch (size) {
        case BPF_B:
                /* movzx rax, byte ptr [rax + r12 + off] */
                EMIT3(rex, 0x0F, 0xB6);
                break;
        case BPF_H:
                /* movzx rax, word ptr [rax + r12 + off] */
                EMIT3(rex, 0x0F, 0xB7);
                break;
        case BPF_W:     /* mov eax, dword ptr [rax + r12 + off] */
        case BPF_DW:    /* mov rax, qword ptr [rax + r12 + off] */
                EMIT2(rex, 0x8B);
                break;
        }

        emit_insn_suffix_SIB(&prog, src_reg, dst_reg, index_reg, off);
        *pprog = prog;
}

/*
 * LDX relative to R12: dst_reg = *(size *)(src_reg + r12 + off).
 *
 * Thin wrapper around emit_ldx_index() with X86_REG_R12 as the index
 * register. do_jit() loads the arena kernel VM start into R12 when the
 * program has one.
 */
static void emit_ldx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
        emit_ldx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off);
}

/*
 * STX: *(u8/u16/u32/u64 *)(dst_reg + off) = src_reg
 *
 * @size selects the access width (BPF_B, BPF_H, BPF_W or BPF_DW). An
 * unrecognised @size emits only the addressing suffix.
 */
static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
        u8 *prog = *pprog;
        const bool ext = is_ereg(dst_reg) || is_ereg(src_reg);

        switch (size) {
        case BPF_B:
                /* mov byte ptr [rax + off], al */
                if (!is_ereg(dst_reg) && !is_ereg_8l(src_reg))
                        EMIT1(0x88);
                else
                        /* Extra byte for eregs or SIL,DIL,BPL in src_reg */
                        EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
                break;
        case BPF_H:
                /* 0x66 prefix selects the 16-bit form */
                if (ext)
                        EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89);
                else
                        EMIT2(0x66, 0x89);
                break;
        case BPF_W:
                if (ext)
                        EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89);
                else
                        EMIT1(0x89);
                break;
        case BPF_DW:
                EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89);
                break;
        }

        emit_insn_suffix(&prog, dst_reg, src_reg, off);
        *pprog = prog;
}

/*
 * STX with index register: *(size *)(dst_reg + index_reg + off) = src_reg
 *
 * @size selects the access width (BPF_B, BPF_H, BPF_W or BPF_DW). An
 * unrecognised @size emits only the SIB addressing suffix.
 */
static void emit_stx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off)
{
        u8 *prog = *pprog;
        /* Only the qword form uses the 0x48 prefix base. */
        const u8 rex = add_3mod(size == BPF_DW ? 0x48 : 0x40,
                                dst_reg, src_reg, index_reg);

        switch (size) {
        case BPF_B:
                /* mov byte ptr [rax + r12 + off], al */
                EMIT2(rex, 0x88);
                break;
        case BPF_H:
                /* mov word ptr [rax + r12 + off], ax */
                EMIT3(0x66, rex, 0x89);
                break;
        case BPF_W:     /* mov dword ptr [rax + r12 + off], eax */
        case BPF_DW:    /* mov qword ptr [rax + r12 + off], rax */
                EMIT2(rex, 0x89);
                break;
        }

        emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off);
        *pprog = prog;
}

/*
 * STX relative to R12: *(size *)(dst_reg + r12 + off) = src_reg.
 *
 * Thin wrapper around emit_stx_index() with X86_REG_R12 as the index
 * register. do_jit() loads the arena kernel VM start into R12 when the
 * program has one.
 */
static void emit_stx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
        emit_stx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off);
}

/*
 * ST with index register: *(size *)(dst_reg + index_reg + off) = imm
 *
 * @size selects the access width (BPF_B, BPF_H, BPF_W or BPF_DW). The
 * immediate is emitted last, using bpf_size_to_x86_bytes(size) bytes.
 */
static void emit_st_index(u8 **pprog, u32 size, u32 dst_reg, u32 index_reg, int off, int imm)
{
        u8 *prog = *pprog;
        /* Only the qword form uses the 0x48 prefix base. */
        const u8 rex = add_3mod(size == BPF_DW ? 0x48 : 0x40,
                                dst_reg, 0, index_reg);

        switch (size) {
        case BPF_B:
                /* mov byte ptr [rax + r12 + off], imm8 */
                EMIT2(rex, 0xC6);
                break;
        case BPF_H:
                /* mov word ptr [rax + r12 + off], imm16 */
                EMIT3(0x66, rex, 0xC7);
                break;
        case BPF_W:     /* mov dword ptr [rax + r12 + off], imm32 */
        case BPF_DW:    /* mov qword ptr [rax + r12 + off], imm32 */
                EMIT2(rex, 0xC7);
                break;
        }

        emit_insn_suffix_SIB(&prog, dst_reg, 0, index_reg, off);
        EMIT(imm, bpf_size_to_x86_bytes(size));
        *pprog = prog;
}

/*
 * ST relative to R12: *(size *)(dst_reg + r12 + off) = imm.
 *
 * Thin wrapper around emit_st_index() with X86_REG_R12 as the index
 * register. do_jit() loads the arena kernel VM start into R12 when the
 * program has one.
 */
static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm)
{
        emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm);
}

/*
 * Emit a lock-prefixed atomic operation on *(dst_reg + off) with src_reg.
 *
 * @atomic_op selects the operation, @bpf_size the width (BPF_DW requests
 * the 64-bit form). Returns 0 on success. On an unknown @atomic_op it logs
 * an error and returns -EFAULT without updating *@pprog.
 */
static int emit_atomic(u8 **pprog, u8 atomic_op,
                       u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size)
{
        u8 *prog = *pprog;
        const bool is64 = (bpf_size == BPF_DW);

        EMIT1(0xF0); /* lock prefix */
        maybe_emit_mod(&prog, dst_reg, src_reg, is64);

        /* emit opcode */
        switch (atomic_op) {
        case BPF_CMPXCHG:
                /* r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg); */
                EMIT2(0x0F, 0xB1);
                break;
        case BPF_XCHG:
                /* src_reg = atomic_xchg(dst_reg + off, src_reg); */
                EMIT1(0x87);
                break;
        case BPF_ADD | BPF_FETCH:
                /* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */
                EMIT2(0x0F, 0xC1);
                break;
        case BPF_ADD:
        case BPF_AND:
        case BPF_OR:
        case BPF_XOR:
                /* lock *(u32/u64*)(dst_reg + off) <op>= src_reg */
                EMIT1(simple_alu_opcodes[atomic_op]);
                break;
        default:
                pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op);
                return -EFAULT;
        }

        emit_insn_suffix(&prog, dst_reg, src_reg, off);

        *pprog = prog;
        return 0;
}

/*
 * Emit a lock-prefixed atomic operation on *(dst_reg + index_reg + off)
 * with src_reg.
 *
 * Only BPF_W and BPF_DW sizes are accepted. Returns 0 on success. On an
 * unsupported @size or unknown @atomic_op it logs an error and returns
 * -EFAULT without updating *@pprog.
 */
static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size,
                             u32 dst_reg, u32 src_reg, u32 index_reg, int off)
{
        u8 *prog = *pprog;

        EMIT1(0xF0); /* lock prefix */

        if (size == BPF_W) {
                EMIT1(add_3mod(0x40, dst_reg, src_reg, index_reg));
        } else if (size == BPF_DW) {
                EMIT1(add_3mod(0x48, dst_reg, src_reg, index_reg));
        } else {
                pr_err("bpf_jit: 1 and 2 byte atomics are not supported\n");
                return -EFAULT;
        }

        /* emit opcode */
        switch (atomic_op) {
        case BPF_CMPXCHG:
                /* r0 = atomic_cmpxchg(dst_reg + idx_reg + off, r0, src_reg); */
                EMIT2(0x0F, 0xB1);
                break;
        case BPF_XCHG:
                /* src_reg = atomic_xchg(dst_reg + idx_reg + off, src_reg); */
                EMIT1(0x87);
                break;
        case BPF_ADD | BPF_FETCH:
                /* src_reg = atomic_fetch_add(dst_reg + idx_reg + off, src_reg); */
                EMIT2(0x0F, 0xC1);
                break;
        case BPF_ADD:
        case BPF_AND:
        case BPF_OR:
        case BPF_XOR:
                /* lock *(u32/u64*)(dst_reg + idx_reg + off) <op>= src_reg */
                EMIT1(simple_alu_opcodes[atomic_op]);
                break;
        default:
                pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op);
                return -EFAULT;
        }

        emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off);

        *pprog = prog;
        return 0;
}

/* Register-offset value in the fixup word meaning "do not zero a register". */
#define DONT_CLEAR 1

/*
 * Exception-table handler for BPF JIT code.
 *
 * The fixup word packs two fields: bits 8 and up hold a byte offset into
 * struct pt_regs, and the low 8 bits hold the number of bytes to advance
 * the instruction pointer. Unless the offset equals DONT_CLEAR, the
 * unsigned long at that offset in @regs is zeroed. regs->ip is then
 * advanced to jump over the faulting instruction.
 *
 * Always returns true.
 */
bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
{
        const u32 reg_off = x->fixup >> 8;
        const u32 skip = x->fixup & 0xff;

        /* clear dest register */
        if (reg_off != DONT_CLEAR) {
                unsigned long *slot = (void *)regs + reg_off;

                *slot = 0;
        }

        /* jump over faulting load */
        regs->ip += skip;
        return true;
}

static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt,
                             bool *regs_used, bool *tail_call_seen)
{
        int i;

        for (i = 1; i <= insn_cnt; i++, insn++) {
                if (insn->code == (BPF_JMP | BPF_TAIL_CALL))
                        *tail_call_seen = true;
                if (insn->dst_reg == BPF_REG_6 || insn->src_reg == BPF_REG_6)
                        regs_used[0] = true;
                if (insn->dst_reg == BPF_REG_7 || insn->src_reg == BPF_REG_7)
                        regs_used[1] = true;
                if (insn->dst_reg == BPF_REG_8 || insn->src_reg == BPF_REG_8)
                        regs_used[2] = true;
                if (insn->dst_reg == BPF_REG_9 || insn->src_reg == BPF_REG_9)
                        regs_used[3] = true;
        }
}

/*
 * Emit the 3-byte VEX prefix (0xc4 followed by two payload bytes).
 *
 * r: same as rex.r, extra bit for ModRM reg field
 * x: same as rex.x, extra bit for SIB index field
 * b: same as rex.b, extra bit for ModRM r/m, or SIB base
 * m: opcode map select, encoding escape bytes e.g. 0x0f38
 * w: same as rex.w (32 bit or 64 bit) or opcode specific
 * src_reg2: additional source reg (encoded as BPF reg)
 * l: vector length (128 bit or 256 bit) or reserved
 * pp: opcode prefix (none, 0x66, 0xf2 or 0xf3)
 *
 * Layout of the two payload bytes (~ marks bit-inverted fields):
 *
 *    7                           0        7                           0
 *  +---+---+---+---+---+---+---+---+    +---+---+---+---+---+---+---+---+
 *  |~R |~X |~B |         m         |    | W |     ~vvvv     | L |   pp  |
 *  +---+---+---+---+---+---+---+---+    +---+---+---+---+---+---+---+---+
 */
static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
                      bool w, u8 src_reg2, bool l, u8 pp)
{
        u8 *prog = *pprog;
        u8 vvvv = reg2hex[src_reg2];
        u8 rxb_map, w_vvvv_l_pp;

        /* reg2hex gives only the lower 3 bit of vvvv */
        if (is_ereg(src_reg2))
                vvvv |= 1 << 3;

        /* 2nd byte: inverted R/X/B flags above the 5-bit map select */
        rxb_map = (m & 0x1f) | (!b << 5) | (!x << 6) | (!r << 7);

        /* 3rd byte: W, inverted vvvv, L and the 2-bit pp field */
        w_vvvv_l_pp = (pp & 3) | (l << 2) | ((~vvvv & 0xf) << 3) | (w << 7);

        /* 0xc4 is the first byte of the 3-byte VEX prefix */
        EMIT3(0xc4, rxb_map, w_vvvv_l_pp);
        *pprog = prog;
}

/*
 * Emit a BMI2 shift instruction.
 *
 * @dst_reg is encoded in both register fields of the ModR/M byte, and
 * @src_reg is passed to emit_3vex() as the additional source register.
 * @op is passed as the VEX opcode-prefix (pp) field, and @is64 as the W bit.
 */
static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
{
        u8 *prog = *pprog;
        /* An extended dst_reg sets both the R and the B extension bits. */
        const bool dst_ext = is_ereg(dst_reg);

        /* map select 2 is escape code 0f38 */
        emit_3vex(&prog, dst_ext, false, dst_ext, 2, is64, src_reg, false, op);
        EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));

        *pprog = prog;
}

/*
 * Difference between the size implied by addrs[] for instruction i
 * (addrs[i] - addrs[i - 1]) and the number of bytes emitted into temp so
 * far (prog - temp). Expands against the caller's locals addrs, i, prog
 * and temp, so it is only usable where those are in scope.
 * NOTE(review): addrs[] presumably holds offsets from a previous JIT pass -
 * confirm against the caller.
 */
#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))

/*
 * mov rax, qword ptr [rbp - rounded_stack_depth - 8]
 *
 * @stack is rounded up to a multiple of 8 before forming the displacement.
 * Uses EMIT3_off32, so it needs the caller's local 'prog' in scope.
 */
#define RESTORE_TAIL_CALL_CNT(stack)                                \
        EMIT3_off32(0x48, 0x8B, 0x85, -round_up(stack, 8) - 8)

static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
                  int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
        bool tail_call_reachable = bpf_prog->aux->tail_call_reachable;
        struct bpf_insn *insn = bpf_prog->insnsi;
        bool callee_regs_used[4] = {};
        int insn_cnt = bpf_prog->len;
        bool tail_call_seen = false;
        bool seen_exit = false;
        u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
        u64 arena_vm_start, user_vm_start;
        int i, excnt = 0;
        int ilen, proglen = 0;
        u8 *prog = temp;
        int err;

        arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
        user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);

        detect_reg_usage(insn, insn_cnt, callee_regs_used,
                         &tail_call_seen);

        /* tail call's presence in current prog implies it is reachable */
        tail_call_reachable |= tail_call_seen;

        emit_prologue(&prog, bpf_prog->aux->stack_depth,
                      bpf_prog_was_classic(bpf_prog), tail_call_reachable,
                      bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
        /* Exception callback will clobber callee regs for its own use, and
         * restore the original callee regs from main prog's stack frame.
         */
        if (bpf_prog->aux->exception_boundary) {
                /* We also need to save r12, which is not mapped to any BPF
                 * register, as we throw after entry into the kernel, which may
                 * overwrite r12.
                 */
                push_r12(&prog);
                push_callee_regs(&prog, all_callee_regs_used);
        } else {
                if (arena_vm_start)
                        push_r12(&prog);
                push_callee_regs(&prog, callee_regs_used);
        }
        if (arena_vm_start)
                emit_mov_imm64(&prog, X86_REG_R12,
                               arena_vm_start >> 32, (u32) arena_vm_start);

        ilen = prog - temp;
        if (rw_image)
                memcpy(rw_image + proglen, temp, ilen);
        proglen += ilen;
        addrs[0] = proglen;
        prog = temp;

        for (i = 1; i <= insn_cnt; i++, insn++) {
                const s32 imm32 = insn->imm;
                u32 dst_reg = insn->dst_reg;
                u32 src_reg = insn->src_reg;
                u8 b2 = 0, b3 = 0;
                u8 *start_of_ldx;
                s64 jmp_offset;
                s16 insn_off;
                u8 jmp_cond;
                u8 *func;
                int nops;

                switch (insn->code) {
                        /* ALU */
                case BPF_ALU | BPF_ADD | BPF_X:
                case BPF_ALU | BPF_SUB | BPF_X:
                case BPF_ALU | BPF_AND | BPF_X:
                case BPF_ALU | BPF_OR | BPF_X:
                case BPF_ALU | BPF_XOR | BPF_X:
                case BPF_ALU64 | BPF_ADD | BPF_X:
                case BPF_ALU64 | BPF_SUB | BPF_X:
                case BPF_ALU64 | BPF_AND | BPF_X:
                case BPF_ALU64 | BPF_OR | BPF_X:
                case BPF_ALU64 | BPF_XOR | BPF_X:
                        maybe_emit_mod(&prog, dst_reg, src_reg,
                                       BPF_CLASS(insn->code) == BPF_ALU64);
                        b2 = simple_alu_opcodes[BPF_OP(insn->code)];
                        EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg));
                        break;

                case BPF_ALU64 | BPF_MOV | BPF_X:
                        if (insn_is_cast_user(insn)) {
                                if (dst_reg != src_reg)
                                        /* 32-bit mov */
                                        emit_mov_reg(&prog, false, dst_reg, src_reg);
                                /* shl dst_reg, 32 */
                                maybe_emit_1mod(&prog, dst_reg, true);
                                EMIT3(0xC1, add_1reg(0xE0, dst_reg), 32);

                                /* or dst_reg, user_vm_start */
                                maybe_emit_1mod(&prog, dst_reg, true);
                                if (is_axreg(dst_reg))
                                        EMIT1_off32(0x0D,  user_vm_start >> 32);
                                else
                                        EMIT2_off32(0x81, add_1reg(0xC8, dst_reg),  user_vm_start >> 32);

                                /* rol dst_reg, 32 */
                                maybe_emit_1mod(&prog, dst_reg, true);
                                EMIT3(0xC1, add_1reg(0xC0, dst_reg), 32);

                                /* xor r11, r11 */
                                EMIT3(0x4D, 0x31, 0xDB);

                                /* test dst_reg32, dst_reg32; check if lower 32-bit are zero */
                                maybe_emit_mod(&prog, dst_reg, dst_reg, false);
                                EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg));

                                /* cmove r11, dst_reg; if so, set dst_reg to zero */
                                /* WARNING: Intel swapped src/dst register encoding in CMOVcc !!! */
                                maybe_emit_mod(&prog, AUX_REG, dst_reg, true);
                                EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg));
                                break;
                        } else if (insn_is_mov_percpu_addr(insn)) {
                                /* mov <dst>, <src> (if necessary) */
                                EMIT_mov(dst_reg, src_reg);
#ifdef CONFIG_SMP
                                /* add <dst>, gs:[<off>] */
                                EMIT2(0x65, add_1mod(0x48, dst_reg));
                                EMIT3(0x03, add_2reg(0x04, 0, dst_reg), 0x25);
                                EMIT((u32)(unsigned long)&this_cpu_off, 4);
#endif
                                break;
                        }
                        fallthrough;
                case BPF_ALU | BPF_MOV | BPF_X:
                        if (insn->off == 0)
                                emit_mov_reg(&prog,
                                             BPF_CLASS(insn->code) == BPF_ALU64,
                                             dst_reg, src_reg);
                        else
                                emit_movsx_reg(&prog, insn->off,
                                               BPF_CLASS(insn->code) == BPF_ALU64,
                                               dst_reg, src_reg);
                        break;

                        /* neg dst */
                case BPF_ALU | BPF_NEG:
                case BPF_ALU64 | BPF_NEG:
                        maybe_emit_1mod(&prog, dst_reg,
                                        BPF_CLASS(insn->code) == BPF_ALU64);
                        EMIT2(0xF7, add_1reg(0xD8, dst_reg));
                        break;

                case BPF_ALU | BPF_ADD | BPF_K:
                case BPF_ALU | BPF_SUB | BPF_K:
                case BPF_ALU | BPF_AND | BPF_K:
                case BPF_ALU | BPF_OR | BPF_K:
                case BPF_ALU | BPF_XOR | BPF_K:
                case BPF_ALU64 | BPF_ADD | BPF_K:
                case BPF_ALU64 | BPF_SUB | BPF_K:
                case BPF_ALU64 | BPF_AND | BPF_K:
                case BPF_ALU64 | BPF_OR | BPF_K:
                case BPF_ALU64 | BPF_XOR | BPF_K:
                        maybe_emit_1mod(&prog, dst_reg,
                                        BPF_CLASS(insn->code) == BPF_ALU64);

                        /*
                         * b3 holds 'normal' opcode, b2 short form only valid
                         * in case dst is eax/rax.
                         */
                        switch (BPF_OP(insn->code)) {
                        case BPF_ADD:
                                b3 = 0xC0;
                                b2 = 0x05;
                                break;
                        case BPF_SUB:
                                b3 = 0xE8;
                                b2 = 0x2D;
                                break;
                        case BPF_AND:
                                b3 = 0xE0;
                                b2 = 0x25;
                                break;
                        case BPF_OR:
                                b3 = 0xC8;
                                b2 = 0x0D;
                                break;
                        case BPF_XOR:
                                b3 = 0xF0;
                                b2 = 0x35;
                                break;
                        }

                        if (is_imm8(imm32))
                                EMIT3(0x83, add_1reg(b3, dst_reg), imm32);
                        else if (is_axreg(dst_reg))
                                EMIT1_off32(b2, imm32);
                        else
                                EMIT2_off32(0x81, add_1reg(b3, dst_reg), imm32);
                        break;

                case BPF_ALU64 | BPF_MOV | BPF_K:
                case BPF_ALU | BPF_MOV | BPF_K:
                        emit_mov_imm32(&prog, BPF_CLASS(insn->code) == BPF_ALU64,
                                       dst_reg, imm32);
                        break;

                case BPF_LD | BPF_IMM | BPF_DW:
                        emit_mov_imm64(&prog, dst_reg, insn[1].imm, insn[0].imm);
                        insn++;
                        i++;
                        break;

                        /* dst %= src, dst /= src, dst %= imm32, dst /= imm32 */
                case BPF_ALU | BPF_MOD | BPF_X:
                case BPF_ALU | BPF_DIV | BPF_X:
                case BPF_ALU | BPF_MOD | BPF_K:
                case BPF_ALU | BPF_DIV | BPF_K:
                case BPF_ALU64 | BPF_MOD | BPF_X:
                case BPF_ALU64 | BPF_DIV | BPF_X:
                case BPF_ALU64 | BPF_MOD | BPF_K:
                case BPF_ALU64 | BPF_DIV | BPF_K: {
                        bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;

                        if (dst_reg != BPF_REG_0)
                                EMIT1(0x50); /* push rax */
                        if (dst_reg != BPF_REG_3)
                                EMIT1(0x52); /* push rdx */

                        if (BPF_SRC(insn->code) == BPF_X) {
                                if (src_reg == BPF_REG_0 ||
                                    src_reg == BPF_REG_3) {
                                        /* mov r11, src_reg */
                                        EMIT_mov(AUX_REG, src_reg);
                                        src_reg = AUX_REG;
                                }
                        } else {
                                /* mov r11, imm32 */
                                EMIT3_off32(0x49, 0xC7, 0xC3, imm32);
                                src_reg = AUX_REG;
                        }

                        if (dst_reg != BPF_REG_0)
                                /* mov rax, dst_reg */
                                emit_mov_reg(&prog, is64, BPF_REG_0, dst_reg);

                        if (insn->off == 0) {
                                /*
                                 * xor edx, edx
                                 * equivalent to 'xor rdx, rdx', but one byte less
                                 */
                                EMIT2(0x31, 0xd2);

                                /* div src_reg */
                                maybe_emit_1mod(&prog, src_reg, is64);
                                EMIT2(0xF7, add_1reg(0xF0, src_reg));
                        } else {
                                if (BPF_CLASS(insn->code) == BPF_ALU)
                                        EMIT1(0x99); /* cdq */
                                else
                                        EMIT2(0x48, 0x99); /* cqo */

                                /* idiv src_reg */
                                maybe_emit_1mod(&prog, src_reg, is64);
                                EMIT2(0xF7, add_1reg(0xF8, src_reg));
                        }

                        if (BPF_OP(insn->code) == BPF_MOD &&
                            dst_reg != BPF_REG_3)
                                /* mov dst_reg, rdx */
                                emit_mov_reg(&prog, is64, dst_reg, BPF_REG_3);
                        else if (BPF_OP(insn->code) == BPF_DIV &&
                                 dst_reg != BPF_REG_0)
                                /* mov dst_reg, rax */
                                emit_mov_reg(&prog, is64, dst_reg, BPF_REG_0);

                        if (dst_reg != BPF_REG_3)
                                EMIT1(0x5A); /* pop rdx */
                        if (dst_reg != BPF_REG_0)
                                EMIT1(0x58); /* pop rax */
                        break;
                }

                case BPF_ALU | BPF_MUL | BPF_K:
                case BPF_ALU64 | BPF_MUL | BPF_K:
                        maybe_emit_mod(&prog, dst_reg, dst_reg,
                                       BPF_CLASS(insn->code) == BPF_ALU64);

                        if (is_imm8(imm32))
                                /* imul dst_reg, dst_reg, imm8 */
                                EMIT3(0x6B, add_2reg(0xC0, dst_reg, dst_reg),
                                      imm32);
                        else
                                /* imul dst_reg, dst_reg, imm32 */
                                EMIT2_off32(0x69,
                                            add_2reg(0xC0, dst_reg, dst_reg),
                                            imm32);
                        break;

                case BPF_ALU | BPF_MUL | BPF_X:
                case BPF_ALU64 | BPF_MUL | BPF_X:
                        maybe_emit_mod(&prog, src_reg, dst_reg,
                                       BPF_CLASS(insn->code) == BPF_ALU64);

                        /* imul dst_reg, src_reg */
                        EMIT3(0x0F, 0xAF, add_2reg(0xC0, src_reg, dst_reg));
                        break;

                        /* Shifts */
                case BPF_ALU | BPF_LSH | BPF_K:
                case BPF_ALU | BPF_RSH | BPF_K:
                case BPF_ALU | BPF_ARSH | BPF_K:
                case BPF_ALU64 | BPF_LSH | BPF_K:
                case BPF_ALU64 | BPF_RSH | BPF_K:
                case BPF_ALU64 | BPF_ARSH | BPF_K:
                        maybe_emit_1mod(&prog, dst_reg,
                                        BPF_CLASS(insn->code) == BPF_ALU64);

                        b3 = simple_alu_opcodes[BPF_OP(insn->code)];
                        if (imm32 == 1)
                                EMIT2(0xD1, add_1reg(b3, dst_reg));
                        else
                                EMIT3(0xC1, add_1reg(b3, dst_reg), imm32);
                        break;

                case BPF_ALU | BPF_LSH | BPF_X:
                case BPF_ALU | BPF_RSH | BPF_X:
                case BPF_ALU | BPF_ARSH | BPF_X:
                case BPF_ALU64 | BPF_LSH | BPF_X:
                case BPF_ALU64 | BPF_RSH | BPF_X:
                case BPF_ALU64 | BPF_ARSH | BPF_X:
                        /* BMI2 shifts aren't better when shift count is already in rcx */
                        if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
                                /* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
                                bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
                                u8 op;

                                switch (BPF_OP(insn->code)) {
                                case BPF_LSH:
                                        op = 1; /* prefix 0x66 */
                                        break;
                                case BPF_RSH:
                                        op = 3; /* prefix 0xf2 */
                                        break;
                                case BPF_ARSH:
                                        op = 2; /* prefix 0xf3 */
                                        break;
                                }

                                emit_shiftx(&prog, dst_reg, src_reg, w, op);

                                break;
                        }

                        if (src_reg != BPF_REG_4) { /* common case */
                                /* Check for bad case when dst_reg == rcx */
                                if (dst_reg == BPF_REG_4) {
                                        /* mov r11, dst_reg */
                                        EMIT_mov(AUX_REG, dst_reg);
                                        dst_reg = AUX_REG;
                                } else {
                                        EMIT1(0x51); /* push rcx */
                                }
                                /* mov rcx, src_reg */
                                EMIT_mov(BPF_REG_4, src_reg);
                        }

                        /* shl %rax, %cl | shr %rax, %cl | sar %rax, %cl */
                        maybe_emit_1mod(&prog, dst_reg,
                                        BPF_CLASS(insn->code) == BPF_ALU64);

                        b3 = simple_alu_opcodes[BPF_OP(insn->code)];
                        EMIT2(0xD3, add_1reg(b3, dst_reg));

                        if (src_reg != BPF_REG_4) {
                                if (insn->dst_reg == BPF_REG_4)
                                        /* mov dst_reg, r11 */
                                        EMIT_mov(insn->dst_reg, AUX_REG);
                                else
                                        EMIT1(0x59); /* pop rcx */
                        }

                        break;

                case BPF_ALU | BPF_END | BPF_FROM_BE:
                case BPF_ALU64 | BPF_END | BPF_FROM_LE:
                        switch (imm32) {
                        case 16:
                                /* Emit 'ror %ax, 8' to swap lower 2 bytes */
                                EMIT1(0x66);
                                if (is_ereg(dst_reg))
                                        EMIT1(0x41);
                                EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8);

                                /* Emit 'movzwl eax, ax' */
                                if (is_ereg(dst_reg))
                                        EMIT3(0x45, 0x0F, 0xB7);
                                else
                                        EMIT2(0x0F, 0xB7);
                                EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
                                break;
                        case 32:
                                /* Emit 'bswap eax' to swap lower 4 bytes */
                                if (is_ereg(dst_reg))
                                        EMIT2(0x41, 0x0F);
                                else
                                        EMIT1(0x0F);
                                EMIT1(add_1reg(0xC8, dst_reg));
                                break;
                        case 64:
                                /* Emit 'bswap rax' to swap 8 bytes */
                                EMIT3(add_1mod(0x48, dst_reg), 0x0F,
                                      add_1reg(0xC8, dst_reg));
                                break;
                        }
                        break;

                case BPF_ALU | BPF_END | BPF_FROM_LE:
                        switch (imm32) {
                        case 16:
                                /*
                                 * Emit 'movzwl eax, ax' to zero extend 16-bit
                                 * into 64 bit
                                 */
                                if (is_ereg(dst_reg))
                                        EMIT3(0x45, 0x0F, 0xB7);
                                else
                                        EMIT2(0x0F, 0xB7);
                                EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
                                break;
                        case 32:
                                /* Emit 'mov eax, eax' to clear upper 32-bits */
                                if (is_ereg(dst_reg))
                                        EMIT1(0x45);
                                EMIT2(0x89, add_2reg(0xC0, dst_reg, dst_reg));
                                break;
                        case 64:
                                /* nop */
                                break;
                        }
                        break;

                        /* speculation barrier */
                case BPF_ST | BPF_NOSPEC:
                        EMIT_LFENCE();
                        break;

                        /* ST: *(u8*)(dst_reg + off) = imm */
                case BPF_ST | BPF_MEM | BPF_B:
                        if (is_ereg(dst_reg))
                                EMIT2(0x41, 0xC6);
                        else
                                EMIT1(0xC6);
                        goto st;
                case BPF_ST | BPF_MEM | BPF_H:
                        if (is_ereg(dst_reg))
                                EMIT3(0x66, 0x41, 0xC7);
                        else
                                EMIT2(0x66, 0xC7);
                        goto st;
                case BPF_ST | BPF_MEM | BPF_W:
                        if (is_ereg(dst_reg))
                                EMIT2(0x41, 0xC7);
                        else
                                EMIT1(0xC7);
                        goto st;
                case BPF_ST | BPF_MEM | BPF_DW:
                        EMIT2(add_1mod(0x48, dst_reg), 0xC7);

st:                        if (is_imm8(insn->off))
                                EMIT2(add_1reg(0x40, dst_reg), insn->off);
                        else
                                EMIT1_off32(add_1reg(0x80, dst_reg), insn->off);

                        EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(insn->code)));
                        break;

                        /* STX: *(u8*)(dst_reg + off) = src_reg */
                case BPF_STX | BPF_MEM | BPF_B:
                case BPF_STX | BPF_MEM | BPF_H:
                case BPF_STX | BPF_MEM | BPF_W:
                case BPF_STX | BPF_MEM | BPF_DW:
                        emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
                        break;

                case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
                case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
                case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
                case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
                        start_of_ldx = prog;
                        emit_st_r12(&prog, BPF_SIZE(insn->code), dst_reg, insn->off, insn->imm);
                        goto populate_extable;

                        /* LDX: dst_reg = *(u8*)(src_reg + r12 + off) */
                case BPF_LDX | BPF_PROBE_MEM32 | BPF_B:
                case BPF_LDX | BPF_PROBE_MEM32 | BPF_H:
                case BPF_LDX | BPF_PROBE_MEM32 | BPF_W:
                case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW:
                case BPF_STX | BPF_PROBE_MEM32 | BPF_B:
                case BPF_STX | BPF_PROBE_MEM32 | BPF_H:
                case BPF_STX | BPF_PROBE_MEM32 | BPF_W:
                case BPF_STX | BPF_PROBE_MEM32 | BPF_DW:
                        start_of_ldx = prog;
                        if (BPF_CLASS(insn->code) == BPF_LDX)
                                emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
                        else
                                emit_stx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
populate_extable:
                        {
                                struct exception_table_entry *ex;
                                u8 *_insn = image + proglen + (start_of_ldx - temp);
                                s64 delta;

                                if (!bpf_prog->aux->extable)
                                        break;

                                if (excnt >= bpf_prog->aux->num_exentries) {
                                        pr_err("mem32 extable bug\n");
                                        return -EFAULT;
                                }
                                ex = &bpf_prog->aux->extable[excnt++];

                                delta = _insn - (u8 *)&ex->insn;
                                /* switch ex to rw buffer for writes */
                                ex = (void *)rw_image + ((void *)ex - (void *)image);

                                ex->insn = delta;

                                ex->data = EX_TYPE_BPF;

                                ex->fixup = (prog - start_of_ldx) |
                                        ((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8);
                        }
                        break;

                        /* LDX: dst_reg = *(u8*)(src_reg + off) */
                case BPF_LDX | BPF_MEM | BPF_B:
                case BPF_LDX | BPF_PROBE_MEM | BPF_B:
                case BPF_LDX | BPF_MEM | BPF_H:
                case BPF_LDX | BPF_PROBE_MEM | BPF_H:
                case BPF_LDX | BPF_MEM | BPF_W:
                case BPF_LDX | BPF_PROBE_MEM | BPF_W:
                case BPF_LDX | BPF_MEM | BPF_DW:
                case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
                        /* LDXS: dst_reg = *(s8*)(src_reg + off) */
                case BPF_LDX | BPF_MEMSX | BPF_B:
                case BPF_LDX | BPF_MEMSX | BPF_H:
                case BPF_LDX | BPF_MEMSX | BPF_W:
                case BPF_LDX | BPF_PROBE_MEMSX | BPF_B:
                case BPF_LDX | BPF_PROBE_MEMSX | BPF_H:
                case BPF_LDX | BPF_PROBE_MEMSX | BPF_W:
                        insn_off = insn->off;

                        if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
                            BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
                                /* Conservatively check that src_reg + insn->off is a kernel address:
                                 *   src_reg + insn->off > TASK_SIZE_MAX + PAGE_SIZE
                                 *   and
                                 *   src_reg + insn->off < VSYSCALL_ADDR
                                 */

                                u64 limit = TASK_SIZE_MAX + PAGE_SIZE - VSYSCALL_ADDR;
                                u8 *end_of_jmp;

                                /* movabsq r10, VSYSCALL_ADDR */
                                emit_mov_imm64(&prog, BPF_REG_AX, (long)VSYSCALL_ADDR >> 32,
                                               (u32)(long)VSYSCALL_ADDR);

                                /* mov r11, src_reg */
                                EMIT_mov(AUX_REG, src_reg);

                                if (insn->off) {
                                        /* add r11, insn->off */
                                        maybe_emit_1mod(&prog, AUX_REG, true);
                                        EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off);
                                }

                                /* sub r11, r10 */
                                maybe_emit_mod(&prog, AUX_REG, BPF_REG_AX, true);
                                EMIT2(0x29, add_2reg(0xC0, AUX_REG, BPF_REG_AX));

                                /* movabsq r10, limit */
                                emit_mov_imm64(&prog, BPF_REG_AX, (long)limit >> 32,
                                               (u32)(long)limit);

                                /* cmp r11, r10 */
                                maybe_emit_mod(&prog, AUX_REG, BPF_REG_AX, true);
                                EMIT2(0x39, add_2reg(0xC0, AUX_REG, BPF_REG_AX));

                                /* if unsigned '>', goto load */
                                EMIT2(X86_JA, 0);
                                end_of_jmp = prog;

                                /* xor dst_reg, dst_reg */
                                emit_mov_imm32(&prog, false, dst_reg, 0);
                                /* jmp byte_after_ldx */
                                EMIT2(0xEB, 0);

                                /* populate jmp_offset for JA above to jump to start_of_ldx */
                                start_of_ldx = prog;
                                end_of_jmp[-1] = start_of_ldx - end_of_jmp;
                        }
                        if (BPF_MODE(insn->code) == BPF_PROBE_MEMSX ||
                            BPF_MODE(insn->code) == BPF_MEMSX)
                                emit_ldsx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
                        else
                                emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
                        if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
                            BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
                                struct exception_table_entry *ex;
                                u8 *_insn = image + proglen + (start_of_ldx - temp);
                                s64 delta;

                                /* populate jmp_offset for JMP above */
                                start_of_ldx[-1] = prog - start_of_ldx;

                                if (!bpf_prog->aux->extable)
                                        break;

                                if (excnt >= bpf_prog->aux->num_exentries) {
                                        pr_err("ex gen bug\n");
                                        return -EFAULT;
                                }
                                ex = &bpf_prog->aux->extable[excnt++];

                                delta = _insn - (u8 *)&ex->insn;
                                if (!is_simm32(delta)) {
                                        pr_err("extable->insn doesn't fit into 32-bit\n");
                                        return -EFAULT;
                                }
                                /* switch ex to rw buffer for writes */
                                ex = (void *)rw_image + ((void *)ex - (void *)image);

                                ex->insn = delta;

                                ex->data = EX_TYPE_BPF;

                                if (dst_reg > BPF_REG_9) {
                                        pr_err("verifier error\n");
                                        return -EFAULT;
                                }
                                /*
                                 * Compute size of x86 insn and its target dest x86 register.
                                 * ex_handler_bpf() will use lower 8 bits to adjust
                                 * pt_regs->ip to jump over this x86 instruction
                                 * and upper bits to figure out which pt_regs to zero out.
                                 * End result: x86 insn "mov rbx, qword ptr [rax+0x14]"
                                 * of 4 bytes will be ignored and rbx will be zero inited.
                                 */
                                ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8);
                        }
                        break;

                case BPF_STX | BPF_ATOMIC | BPF_W:
                case BPF_STX | BPF_ATOMIC | BPF_DW:
                        if (insn->imm == (BPF_AND | BPF_FETCH) ||
                            insn->imm == (BPF_OR | BPF_FETCH) ||
                            insn->imm == (BPF_XOR | BPF_FETCH)) {
                                bool is64 = BPF_SIZE(insn->code) == BPF_DW;
                                u32 real_src_reg = src_reg;
                                u32 real_dst_reg = dst_reg;
                                u8 *branch_target;

                                /*
                                 * Can't be implemented with a single x86 insn.
                                 * Need to do a CMPXCHG loop.
                                 */

                                /* Will need RAX as a CMPXCHG operand so save R0 */
                                emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0);
                                if (src_reg == BPF_REG_0)
                                        real_src_reg = BPF_REG_AX;
                                if (dst_reg == BPF_REG_0)
                                        real_dst_reg = BPF_REG_AX;

                                branch_target = prog;
                                /* Load old value */
                                emit_ldx(&prog, BPF_SIZE(insn->code),
                                         BPF_REG_0, real_dst_reg, insn->off);
                                /*
                                 * Perform the (commutative) operation locally,
                                 * put the result in the AUX_REG.
                                 */
                                emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0);
                                maybe_emit_mod(&prog, AUX_REG, real_src_reg, is64);
                                EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)],
                                      add_2reg(0xC0, AUX_REG, real_src_reg));
                                /* Attempt to swap in new value */
                                err = emit_atomic(&prog, BPF_CMPXCHG,
                                                  real_dst_reg, AUX_REG,
                                                  insn->off,
                                                  BPF_SIZE(insn->code));
                                if (WARN_ON(err))
                                        return err;
                                /*
                                 * ZF tells us whether we won the race. If it's
                                 * cleared we need to try again.
                                 */
                                EMIT2(X86_JNE, -(prog - branch_target) - 2);
                                /* Return the pre-modification value */
                                emit_mov_reg(&prog, is64, real_src_reg, BPF_REG_0);
                                /* Restore R0 after clobbering RAX */
                                emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX);
                                break;
                        }

                        err = emit_atomic(&prog, insn->imm, dst_reg, src_reg,
                                          insn->off, BPF_SIZE(insn->code));
                        if (err)
                                return err;
                        break;

                case BPF_STX | BPF_PROBE_ATOMIC | BPF_W:
                case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW:
                        start_of_ldx = prog;
                        err = emit_atomic_index(&prog, insn->imm, BPF_SIZE(insn->code),
                                                dst_reg, src_reg, X86_REG_R12, insn->off);
                        if (err)
                                return err;
                        goto populate_extable;

                        /* call */
                case BPF_JMP | BPF_CALL: {
                        u8 *ip = image + addrs[i - 1];

                        func = (u8 *) __bpf_call_base + imm32;
                        if (tail_call_reachable) {
                                RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth);
                                ip += 7;
                        }
                        if (!imm32)
                                return -EINVAL;
                        ip += x86_call_depth_emit_accounting(&prog, func, ip);
                        if (emit_call(&prog, func, ip))
                                return -EINVAL;
                        break;
                }

                case BPF_JMP | BPF_TAIL_CALL:
                        if (imm32)
                                emit_bpf_tail_call_direct(bpf_prog,
                                                          &bpf_prog->aux->poke_tab[imm32 - 1],
                                                          &prog, image + addrs[i - 1],
                                                          callee_regs_used,
                                                          bpf_prog->aux->stack_depth,
                                                          ctx);
                        else
                                emit_bpf_tail_call_indirect(bpf_prog,
                                                            &prog,
                                                            callee_regs_used,
                                                            bpf_prog->aux->stack_depth,
                                                            image + addrs[i - 1],
                                                            ctx);
                        break;

                        /* cond jump */
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JNE | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JLT | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_X:
                case BPF_JMP | BPF_JLE | BPF_X:
                case BPF_JMP | BPF_JSGT | BPF_X:
                case BPF_JMP | BPF_JSLT | BPF_X:
                case BPF_JMP | BPF_JSGE | BPF_X:
                case BPF_JMP | BPF_JSLE | BPF_X:
                case BPF_JMP32 | BPF_JEQ | BPF_X:
                case BPF_JMP32 | BPF_JNE | BPF_X:
                case BPF_JMP32 | BPF_JGT | BPF_X:
                case BPF_JMP32 | BPF_JLT | BPF_X:
                case BPF_JMP32 | BPF_JGE | BPF_X:
                case BPF_JMP32 | BPF_JLE | BPF_X:
                case BPF_JMP32 | BPF_JSGT | BPF_X:
                case BPF_JMP32 | BPF_JSLT | BPF_X:
                case BPF_JMP32 | BPF_JSGE | BPF_X:
                case BPF_JMP32 | BPF_JSLE | BPF_X:
                        /* cmp dst_reg, src_reg */
                        maybe_emit_mod(&prog, dst_reg, src_reg,
                                       BPF_CLASS(insn->code) == BPF_JMP);
                        EMIT2(0x39, add_2reg(0xC0, dst_reg, src_reg));
                        goto emit_cond_jmp;

                case BPF_JMP | BPF_JSET | BPF_X:
                case BPF_JMP32 | BPF_JSET | BPF_X:
                        /* test dst_reg, src_reg */
                        maybe_emit_mod(&prog, dst_reg, src_reg,
                                       BPF_CLASS(insn->code) == BPF_JMP);
                        EMIT2(0x85, add_2reg(0xC0, dst_reg, src_reg));
                        goto emit_cond_jmp;

                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP32 | BPF_JSET | BPF_K:
                        /* test dst_reg, imm32 */
                        maybe_emit_1mod(&prog, dst_reg,
                                        BPF_CLASS(insn->code) == BPF_JMP);
                        EMIT2_off32(0xF7, add_1reg(0xC0, dst_reg), imm32);
                        goto emit_cond_jmp;

                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JNE | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JLT | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JLE | BPF_K:
                case BPF_JMP | BPF_JSGT | BPF_K:
                case BPF_JMP | BPF_JSLT | BPF_K:
                case BPF_JMP | BPF_JSGE | BPF_K:
                case BPF_JMP | BPF_JSLE | BPF_K:
                case BPF_JMP32 | BPF_JEQ | BPF_K:
                case BPF_JMP32 | BPF_JNE | BPF_K:
                case BPF_JMP32 | BPF_JGT | BPF_K:
                case BPF_JMP32 | BPF_JLT | BPF_K:
                case BPF_JMP32 | BPF_JGE | BPF_K:
                case BPF_JMP32 | BPF_JLE | BPF_K:
                case BPF_JMP32 | BPF_JSGT | BPF_K:
                case BPF_JMP32 | BPF_JSLT | BPF_K:
                case BPF_JMP32 | BPF_JSGE | BPF_K:
                case BPF_JMP32 | BPF_JSLE | BPF_K:
                        /* test dst_reg, dst_reg to save one extra byte */
                        if (imm32 == 0) {
                                maybe_emit_mod(&prog, dst_reg, dst_reg,
                                               BPF_CLASS(insn->code) == BPF_JMP);
                                EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg));
                                goto emit_cond_jmp;
                        }

                        /* cmp dst_reg, imm8/32 */
                        maybe_emit_1mod(&prog, dst_reg,
                                        BPF_CLASS(insn->code) == BPF_JMP);

                        if (is_imm8(imm32))
                                EMIT3(0x83, add_1reg(0xF8, dst_reg), imm32);
                        else
                                EMIT2_off32(0x81, add_1reg(0xF8, dst_reg), imm32);

emit_cond_jmp:                /* Convert BPF opcode to x86 */
                        switch (BPF_OP(insn->code)) {
                        case BPF_JEQ:
                                jmp_cond = X86_JE;
                                break;
                        case BPF_JSET:
                        case BPF_JNE:
                                jmp_cond = X86_JNE;
                                break;
                        case BPF_JGT:
                                /* GT is unsigned '>', JA in x86 */
                                jmp_cond = X86_JA;
                                break;
                        case BPF_JLT:
                                /* LT is unsigned '<', JB in x86 */
                                jmp_cond = X86_JB;
                                break;
                        case BPF_JGE:
                                /* GE is unsigned '>=', JAE in x86 */
                                jmp_cond = X86_JAE;
                                break;
                        case BPF_JLE:
                                /* LE is unsigned '<=', JBE in x86 */
                                jmp_cond = X86_JBE;
                                break;
                        case BPF_JSGT:
                                /* Signed '>', GT in x86 */
                                jmp_cond = X86_JG;
                                break;
                        case BPF_JSLT:
                                /* Signed '<', LT in x86 */
                                jmp_cond = X86_JL;
                                break;
                        case BPF_JSGE:
                                /* Signed '>=', GE in x86 */
                                jmp_cond = X86_JGE;
                                break;
                        case BPF_JSLE:
                                /* Signed '<=', LE in x86 */
                                jmp_cond = X86_JLE;
                                break;
                        default: /* to silence GCC warning */
                                return -EFAULT;
                        }
                        jmp_offset = addrs[i + insn->off] - addrs[i];
                        if (is_imm8(jmp_offset)) {
                                if (jmp_padding) {
                                        /* To keep the jmp_offset valid, the extra bytes are
                                         * padded before the jump insn, so we subtract the
                                         * 2 bytes of jmp_cond insn from INSN_SZ_DIFF.
                                         *
                                         * If the previous pass already emits an imm8
                                         * jmp_cond, then this BPF insn won't shrink, so
                                         * "nops" is 0.
                                         *
                                         * On the other hand, if the previous pass emits an
                                         * imm32 jmp_cond, the extra 4 bytes(*) is padded to
                                         * keep the image from shrinking further.
                                         *
                                         * (*) imm32 jmp_cond is 6 bytes, and imm8 jmp_cond
                                         *     is 2 bytes, so the size difference is 4 bytes.
                                         */
                                        nops = INSN_SZ_DIFF - 2;
                                        if (nops != 0 && nops != 4) {
                                                pr_err("unexpected jmp_cond padding: %d bytes\n",
                                                       nops);
                                                return -EFAULT;
                                        }
                                        emit_nops(&prog, nops);
                                }
                                EMIT2(jmp_cond, jmp_offset);
                        } else if (is_simm32(jmp_offset)) {
                                EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
                        } else {
                                pr_err("cond_jmp gen bug %llx\n", jmp_offset);
                                return -EFAULT;
                        }

                        break;

                case BPF_JMP | BPF_JA:
                case BPF_JMP32 | BPF_JA:
                        if (BPF_CLASS(insn->code) == BPF_JMP) {
                                if (insn->off == -1)
                                        /* -1 jmp instructions will always jump
                                         * backwards two bytes. Explicitly handling
                                         * this case avoids wasting too many passes
                                         * when there are long sequences of replaced
                                         * dead code.
                                         */
                                        jmp_offset = -2;
                                else
                                        jmp_offset = addrs[i + insn->off] - addrs[i];
                        } else {
                                if (insn->imm == -1)
                                        jmp_offset = -2;
                                else
                                        jmp_offset = addrs[i + insn->imm] - addrs[i];
                        }

                        if (!jmp_offset) {
                                /*
                                 * If jmp_padding is enabled, the extra nops will
                                 * be inserted. Otherwise, optimize out nop jumps.
                                 */
                                if (jmp_padding) {
                                        /* There are 3 possible conditions.
                                         * (1) This BPF_JA is already optimized out in
                                         *     the previous run, so there is no need
                                         *     to pad any extra byte (0 byte).
                                         * (2) The previous pass emits an imm8 jmp,
                                         *     so we pad 2 bytes to match the previous
                                         *     insn size.
                                         * (3) Similarly, the previous pass emits an
                                         *     imm32 jmp, and 5 bytes is padded.
                                         */
                                        nops = INSN_SZ_DIFF;
                                        if (nops != 0 && nops != 2 && nops != 5) {
                                                pr_err("unexpected nop jump padding: %d bytes\n",
                                                       nops);
                                                return -EFAULT;
                                        }
                                        emit_nops(&prog, nops);
                                }
                                break;
                        }
emit_jmp:
                        if (is_imm8(jmp_offset)) {
                                if (jmp_padding) {
                                        /* To avoid breaking jmp_offset, the extra bytes
                                         * are padded before the actual jmp insn, so
                                         * 2 bytes is subtracted from INSN_SZ_DIFF.
                                         *
                                         * If the previous pass already emits an imm8
                                         * jmp, there is nothing to pad (0 byte).
                                         *
                                         * If it emits an imm32 jmp (5 bytes) previously
                                         * and now an imm8 jmp (2 bytes), then we pad
                                         * (5 - 2 = 3) bytes to stop the image from
                                         * shrinking further.
                                         */
                                        nops = INSN_SZ_DIFF - 2;
                                        if (nops != 0 && nops != 3) {
                                                pr_err("unexpected jump padding: %d bytes\n",
                                                       nops);
                                                return -EFAULT;
                                        }
                                        emit_nops(&prog, INSN_SZ_DIFF - 2);
                                }
                                EMIT2(0xEB, jmp_offset);
                        } else if (is_simm32(jmp_offset)) {
                                EMIT1_off32(0xE9, jmp_offset);
                        } else {
                                pr_err("jmp gen bug %llx\n", jmp_offset);
                                return -EFAULT;
                        }
                        break;

                case BPF_JMP | BPF_EXIT:
                        if (seen_exit) {
                                jmp_offset = ctx->cleanup_addr - addrs[i];
                                goto emit_jmp;
                        }
                        seen_exit = true;
                        /* Update cleanup_addr */
                        ctx->cleanup_addr = proglen;
                        if (bpf_prog->aux->exception_boundary) {
                                pop_callee_regs(&prog, all_callee_regs_used);
                                pop_r12(&prog);
                        } else {
                                pop_callee_regs(&prog, callee_regs_used);
                                if (arena_vm_start)
                                        pop_r12(&prog);
                        }
                        EMIT1(0xC9);         /* leave */
                        emit_return(&prog, image + addrs[i - 1] + (prog - temp));
                        break;

                default:
                        /*
                         * By design x86-64 JIT should support all BPF instructions.
                         * This error will be seen if new instruction was added
                         * to the interpreter, but not to the JIT, or if there is
                         * junk in bpf_prog.
                         */
                        pr_err("bpf_jit: unknown opcode %02x\n", insn->code);
                        return -EINVAL;
                }

                ilen = prog - temp;
                if (ilen > BPF_MAX_INSN_SIZE) {
                        pr_err("bpf_jit: fatal insn size error\n");
                        return -EFAULT;
                }

                if (image) {
                        /*
                         * When populating the image, assert that:
                         *
                         *  i) We do not write beyond the allocated space, and
                         * ii) addrs[i] did not change from the prior run, in order
                         *     to validate assumptions made for computing branch
                         *     displacements.
                         */
                        if (unlikely(proglen + ilen > oldproglen ||
                                     proglen + ilen != addrs[i])) {
                                pr_err("bpf_jit: fatal error\n");
                                return -EFAULT;
                        }
                        memcpy(rw_image + proglen, temp, ilen);
                }
                proglen += ilen;
                addrs[i] = proglen;
                prog = temp;
        }

        if (image && excnt != bpf_prog->aux->num_exentries) {
                pr_err("extable is not populated\n");
                return -EFAULT;
        }
        return proglen;
}

static void clean_stack_garbage(const struct btf_func_model *m,
                                u8 **pprog, int nr_stack_slots,
                                int stack_size)
{
        u8 *prog;
        int hi_off;

        /* An argument passed on the stack is normally "push"ed by the
         * compiler and therefore owns a full 8-byte slot, so copying it
         * with a BPF_DW access brings over no stale bytes.
         *
         * Sometimes, however, the compiler reserves only 4 bytes for it.
         * For now that only happens when a single argument lives on the
         * stack and it is no wider than 4 bytes. The 8-byte copy then
         * looks like this:
         *
         *   origin stack:  stack_arg_1(4-byte) xxx(4-byte)
         *   our copy:      stack_arg_1(origin) xxx
         *
         * where xxx is garbage that has to be zeroed in our copy.
         */
        if (nr_stack_slots != 1)
                return;

        /* Only the size of the last argument is inspected. */
        if (m->arg_size[m->nr_args - 1] > 4)
                return;

        /* The upper 4 bytes sit 4 bytes above rbp - stack_size. */
        hi_off = 4 - stack_size;
        prog = *pprog;

        /* mov DWORD PTR [rbp + hi_off], 0 */
        if (is_imm8(hi_off))
                EMIT3(0xC7, 0x45, hi_off);
        else
                EMIT2_off32(0xC7, 0x85, hi_off);
        EMIT(0, 4);

        *pprog = prog;
}

/* Count how many of the (at most 6) argument registers the function
 * described by @m uses. An argument that needs more 8-byte registers
 * than are still free is not counted.
 */
static int get_nr_used_regs(const struct btf_func_model *m)
{
        const int nr_args = min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS);
        int used = 0;
        int idx;

        /* Stop as soon as all 6 registers are taken. */
        for (idx = 0; idx < nr_args && used < 6; idx++) {
                int slots = (m->arg_size[idx] + 7) / 8;

                if (used + slots > 6)
                        continue;

                used += slots;
        }

        return used;
}

/* Emit code that spills the arguments of the function described by @m
 * into the current stack frame.
 *
 * @m:               function model; nr_args and arg_size[] are read
 * @prog:            in/out code cursor, advanced by emit_ldx()/emit_stx()
 * @stack_size:      distance below rbp of the first slot to fill; each
 *                   stored 8-byte slot moves the offset 8 bytes closer
 *                   to rbp
 * @for_call_origin: when true, arguments that fit in registers are only
 *                   counted (nothing is emitted for them and @stack_size
 *                   is not advanced); only the on-stack arguments are
 *                   copied
 *
 * Once all arguments are processed, clean_stack_garbage() is called with
 * the number of stack slots copied and the offset of the first of them.
 */
static void save_args(const struct btf_func_model *m, u8 **prog,
                      int stack_size, bool for_call_origin)
{
        int arg_regs, first_off = 0, nr_regs = 0, nr_stack_slots = 0;
        int i, j;

        /* Store function arguments to stack.
         * For a function that accepts two pointers the sequence will be:
         * mov QWORD PTR [rbp-0x10],rdi
         * mov QWORD PTR [rbp-0x8],rsi
         */
        for (i = 0; i < min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS); i++) {
                /* number of 8-byte slots this argument occupies */
                arg_regs = (m->arg_size[i] + 7) / 8;

                /* According to the research of Yonghong, struct members
                 * should be all in register or all on the stack.
                 * Meanwhile, the compiler will pass the argument on regs
                 * if the remaining regs can hold the argument.
                 *
                 * Disorder of the args can happen. For example:
                 *
                 * struct foo_struct {
                 *     long a;
                 *     int b;
                 * };
                 * int foo(char, char, char, char, char, struct foo_struct,
                 *         char);
                 *
                 * the arg1-5,arg7 will be passed by regs, and arg6 will
                 * by stack.
                 */
                if (nr_regs + arg_regs > 6) {
                        /* copy function arguments from origin stack frame
                         * into current stack frame.
                         *
                         * The starting address of the arguments on-stack
                         * is:
                         *   rbp + 8(push rbp) +
                         *   8(return addr of origin call) +
                         *   8(return addr of the caller)
                         * which means: rbp + 24
                         */
                        for (j = 0; j < arg_regs; j++) {
                                /* BPF_REG_0 is used as the scratch register
                                 * for the load/store pair.
                                 */
                                emit_ldx(prog, BPF_DW, BPF_REG_0, BPF_REG_FP,
                                         nr_stack_slots * 8 + 0x18);
                                emit_stx(prog, BPF_DW, BPF_REG_FP, BPF_REG_0,
                                         -stack_size);

                                /* remember where the first stack slot was
                                 * stored; clean_stack_garbage() needs it
                                 */
                                if (!nr_stack_slots)
                                        first_off = stack_size;
                                stack_size -= 8;
                                nr_stack_slots++;
                        }
                } else {
                        /* Only copy the arguments on-stack to current
                         * 'stack_size' and ignore the regs, used to
                         * prepare the arguments on-stack for origin call.
                         */
                        if (for_call_origin) {
                                nr_regs += arg_regs;
                                continue;
                        }

                        /* copy the arguments from regs into stack; the
                         * sixth register slot (nr_regs == 5) is
                         * X86_REG_R9, the others are BPF_REG_1 + nr_regs
                         */
                        for (j = 0; j < arg_regs; j++) {
                                emit_stx(prog, BPF_DW, BPF_REG_FP,
                                         nr_regs == 5 ? X86_REG_R9 : BPF_REG_1 + nr_regs,
                                         -stack_size);
                                stack_size -= 8;
                                nr_regs++;
                        }
                }
        }

        clean_stack_garbage(m, prog, nr_stack_slots, first_off);
}

static void restore_regs(const struct btf_func_model *m, u8 **prog,
                         int stack_size)
{
        const int nr_args = min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS);
        int used = 0;
        int idx, slots, dst;

        /* Reload the register-passed arguments from the stack frame.
         * For a function that accepts two pointers this emits:
         *   mov rdi,QWORD PTR [rbp-0x10]
         *   mov rsi,QWORD PTR [rbp-0x8]
         *
         * The walk over the arguments mirrors save_args(): an argument
         * that does not fit in the remaining registers is not reloaded,
         * but its slots are still stepped over so later offsets line up.
         */
        for (idx = 0; idx < nr_args && used < 6; idx++) {
                slots = (m->arg_size[idx] + 7) / 8;

                if (used + slots > 6) {
                        stack_size -= 8 * slots;
                        continue;
                }

                while (slots--) {
                        /* the sixth register slot is X86_REG_R9 */
                        dst = (used == 5) ? X86_REG_R9 : BPF_REG_1 + used;
                        emit_ldx(prog, BPF_DW, dst, BPF_REG_FP, -stack_size);
                        stack_size -= 8;
                        used++;
                }
        }
}

/* Emit the call sequence for one BPF program attached to a trampoline:
 * store the link's cookie into the run context, call the "enter" hook,
 * conditionally call the program itself, then call the "exit" hook.
 *
 * @m:           function model (not referenced in this function)
 * @pprog:       in/out code cursor; only written back on success
 * @l:           trampoline link supplying the program and its cookie
 * @stack_size:  the program's ctx argument is the address rbp - stack_size
 * @run_ctx_off: struct bpf_tramp_run_ctx is located at rbp - run_ctx_off
 * @save_ret:    when true, store the program's return value at [rbp - 8]
 * @image:       base of the final image, used to compute call-site
 *               addresses
 * @rw_image:    base of the buffer the code is being written into
 *
 * Return: 0 on success, -EINVAL if emit_rsb_call() fails.
 */
static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
                           struct bpf_tramp_link *l, int stack_size,
                           int run_ctx_off, bool save_ret,
                           void *image, void *rw_image)
{
        u8 *prog = *pprog;
        u8 *jmp_insn;
        int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
        struct bpf_prog *p = l->link.prog;
        u64 cookie = l->cookie;

        /* mov rdi, cookie */
        emit_mov_imm64(&prog, BPF_REG_1, (long) cookie >> 32, (u32) (long) cookie);

        /* Prepare struct bpf_tramp_run_ctx.
         *
         * bpf_tramp_run_ctx is already preserved by
         * arch_prepare_bpf_trampoline().
         *
         * mov QWORD PTR [rbp - run_ctx_off + ctx_cookie_off], rdi
         */
        emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off);

        /* arg1: mov rdi, progs[i] */
        emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
        /* arg2: lea rsi, [rbp - run_ctx_off] */
        if (!is_imm8(-run_ctx_off))
                EMIT3_off32(0x48, 0x8D, 0xB5, -run_ctx_off);
        else
                EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);

        /* The address passed to emit_rsb_call() is the position in @image
         * that corresponds to the current write position in @rw_image.
         */
        if (emit_rsb_call(&prog, bpf_trampoline_enter(p), image + (prog - (u8 *)rw_image)))
                return -EINVAL;
        /* remember prog start time returned by __bpf_prog_enter */
        emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);

        /* if (__bpf_prog_enter*(prog) == 0)
         *        goto skip_exec_of_prog;
         */
        EMIT3(0x48, 0x85, 0xC0);  /* test rax,rax */
        /* emit 2 nops that will be replaced with JE insn */
        jmp_insn = prog;
        emit_nops(&prog, 2);

        /* arg1: lea rdi, [rbp - stack_size] */
        if (!is_imm8(-stack_size))
                EMIT3_off32(0x48, 0x8D, 0xBD, -stack_size);
        else
                EMIT4(0x48, 0x8D, 0x7D, -stack_size);
        /* arg2: progs[i]->insnsi for interpreter */
        if (!p->jited)
                emit_mov_imm64(&prog, BPF_REG_2,
                               (long) p->insnsi >> 32,
                               (u32) (long) p->insnsi);
        /* call JITed bpf program or interpreter */
        if (emit_rsb_call(&prog, p->bpf_func, image + (prog - (u8 *)rw_image)))
                return -EINVAL;

        /*
         * BPF_TRAMP_MODIFY_RETURN trampolines can modify the return
         * of the previous call which is then passed on the stack to
         * the next BPF program.
         *
         * BPF_TRAMP_FENTRY trampoline may need to return the return
         * value of BPF_PROG_TYPE_STRUCT_OPS prog.
         */
        if (save_ret)
                emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);

        /* replace 2 nops with JE insn, since jmp target is known.
         * The displacement is relative to the end of the 2-byte JE, so the
         * jump skips the program call and the optional return-value store
         * and lands on the exit-hook sequence below.
         * NOTE(review): the displacement is stored in a single byte with
         * no range check here.
         */
        jmp_insn[0] = X86_JE;
        jmp_insn[1] = prog - jmp_insn - 2;

        /* arg1: mov rdi, progs[i] */
        emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
        /* arg2: mov rsi, rbx <- start time in nsec */
        emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
        /* arg3: lea rdx, [rbp - run_ctx_off] */
        if (!is_imm8(-run_ctx_off))
                EMIT3_off32(0x48, 0x8D, 0x95, -run_ctx_off);
        else
                EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
        if (emit_rsb_call(&prog, bpf_trampoline_exit(p), image + (prog - (u8 *)rw_image)))
                return -EINVAL;

        *pprog = prog;
        return 0;
}

/* Pad the code stream with nops until the cursor in *pprog is aligned
 * to @align bytes.
 */
static void emit_align(u8 **pprog, u32 align)
{
        u8 *cur = *pprog;
        u8 *aligned = PTR_ALIGN(cur, align);

        /* Already on the requested boundary: nothing to emit. */
        if (aligned == cur)
                return;

        emit_nops(&cur, aligned - cur);
        *pprog = cur;
}

/* Emit a conditional jump with a 32-bit displacement to @func, for an
 * instruction that will be located at @ip. @jmp_cond is the jump
 * condition opcode; 0x10 is added to it to form the second opcode byte
 * after the 0x0F prefix.
 *
 * Return: 0 on success, -EINVAL if @func is not reachable with a signed
 * 32-bit displacement.
 */
static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
{
        /* 2 opcode bytes followed by a 4-byte displacement */
        const int insn_len = 2 + 4;
        s64 offset = func - (ip + insn_len);
        u8 *prog;

        if (is_simm32(offset)) {
                prog = *pprog;
                EMIT2_off32(0x0F, jmp_cond + 0x10, offset);
                *pprog = prog;
                return 0;
        }

        pr_err("Target %p is out of range\n", func);
        return -EINVAL;
}

/* Emit the invocation sequence for every link in @tl, in order.
 *
 * *pprog is only updated once all links were emitted successfully.
 *
 * Return: 0 on success, -EINVAL as soon as invoke_bpf_prog() fails.
 */
static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
                      struct bpf_tramp_links *tl, int stack_size,
                      int run_ctx_off, bool save_ret,
                      void *image, void *rw_image)
{
        u8 *cursor = *pprog;
        int idx = 0;

        while (idx < tl->nr_links) {
                struct bpf_tramp_link *link = tl->links[idx++];

                if (invoke_bpf_prog(m, &cursor, link, stack_size,
                                    run_ctx_off, save_ret, image, rw_image))
                        return -EINVAL;
        }

        *pprog = cursor;
        return 0;
}

/* Emit the invocation sequence for every fmod_ret link in @tl. After each
 * program a compare of the saved return value against zero is emitted,
 * followed by a 6-byte nop placeholder whose address is recorded in
 * @branches[] so that it can later be replaced with a conditional jump.
 *
 * Return: 0 on success, -EINVAL as soon as invoke_bpf_prog() fails.
 */
static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
                              struct bpf_tramp_links *tl, int stack_size,
                              int run_ctx_off, u8 **branches,
                              void *image, void *rw_image)
{
        /* placeholder size: 2 bytes for the jump + 4 bytes for an offset */
        const int jcc_len = 4 + 2;
        u8 *prog = *pprog;
        int idx = 0;

        /* Without this the first fmod_ret program would see a garbage
         * return value; store 0 at [rbp - 8] to avoid confusing it.
         */
        emit_mov_imm32(&prog, false, BPF_REG_0, 0);
        emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);

        while (idx < tl->nr_links) {
                int err = invoke_bpf_prog(m, &prog, tl->links[idx], stack_size,
                                          run_ctx_off, true, image, rw_image);

                if (err)
                        return -EINVAL;

                /* The program's return value now sits in [rbp - 8]. Emit:
                 * if (*(u64 *)(rbp - 8) != 0)
                 *        goto do_fexit;
                 *
                 * cmp QWORD PTR [rbp - 0x8], 0x0
                 */
                EMIT4(0x48, 0x83, 0x7d, 0xf8);
                EMIT1(0x00);

                /* Record where the branch goes and reserve room for it.
                 * The nops are replaced with a conditional jump once
                 * do_fexit (i.e. the start of the fexit invocation) is
                 * finalized.
                 */
                branches[idx] = prog;
                emit_nops(&prog, jcc_len);
                idx++;
        }

        *pprog = prog;
        return 0;
}

/* Example:
 * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
 * its 'struct btf_func_model' will be nr_args=2
 * The assembly code when eth_type_trans is executing after trampoline:
 *
 * push rbp
 * mov rbp, rsp
 * sub rsp, 16                     // space for skb and dev
 * push rbx                        // temp regs to pass start time
 * mov qword ptr [rbp - 16], rdi   // save skb pointer to stack
 * mov qword ptr [rbp - 8], rsi    // save dev pointer to stack
 * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
 * mov rbx, rax                    // remember start time if bpf stats are enabled
 * lea rdi, [rbp - 16]             // R1==ctx of bpf prog
 * call addr_of_jited_FENTRY_prog
 * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
 * mov rsi, rbx                    // prog start time
 * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
 * mov rdi, qword ptr [rbp - 16]   // restore skb pointer from stack
 * mov rsi, qword ptr [rbp - 8]    // restore dev pointer from stack
 * pop rbx
 * leave
 * ret
 *
 * eth_type_trans has 5 byte nop at the beginning. These 5 bytes will be
 * replaced with 'call generated_bpf_trampoline'. When it returns
 * eth_type_trans will continue executing with original skb and dev pointers.
 *
 * The assembly code when eth_type_trans is called from trampoline:
 *
 * push rbp
 * mov rbp, rsp
 * sub rsp, 24                     // space for skb, dev, return value
 * push rbx                        // temp regs to pass start time
 * mov qword ptr [rbp - 24], rdi   // save skb pointer to stack
 * mov qword ptr [rbp - 16], rsi   // save dev pointer to stack
 * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
 * mov rbx, rax                    // remember start time if bpf stats are enabled
 * lea rdi, [rbp - 24]             // R1==ctx of bpf prog
 * call addr_of_jited_FENTRY_prog  // bpf prog can access skb and dev
 * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
 * mov rsi, rbx                    // prog start time
 * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
 * mov rdi, qword ptr [rbp - 24]   // restore skb pointer from stack
 * mov rsi, qword ptr [rbp - 16]   // restore dev pointer from stack
 * call eth_type_trans+5           // execute body of eth_type_trans
 * mov qword ptr [rbp - 8], rax    // save return value
 * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
 * mov rbx, rax                    // remember start time if bpf stats are enabled
 * lea rdi, [rbp - 24]             // R1==ctx of bpf prog
 * call addr_of_jited_FEXIT_prog   // bpf prog can access skb, dev, return value
 * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
 * mov rsi, rbx                    // prog start time
 * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
 * mov rax, qword ptr [rbp - 8]    // restore eth_type_trans's return value
 * pop rbx
 * leave
 * add rsp, 8                      // skip eth_type_trans's frame
 * ret                             // return to its caller
 */
/*
 * Emit the machine code of a BPF trampoline into @rw_image.
 *
 * @im:           trampoline image descriptor; ip_after_call and ip_epilogue
 *                are filled in when BPF_TRAMP_F_CALL_ORIG is set, and its
 *                address is baked into the generated code as the argument
 *                of __bpf_tramp_enter/__bpf_tramp_exit.
 * @rw_image:     buffer the bytes are written to.
 * @rw_image_end: end of that buffer, used only for the final overflow check.
 * @image:        address the code is generated for; every address handed to
 *                the emit helpers is computed as
 *                image + (prog - rw_image).
 * @m:            function model of the traced function (argument count,
 *                sizes and flags).
 * @flags:        BPF_TRAMP_F_* flags selecting which parts are generated.
 * @tlinks:       array indexed by BPF_TRAMP_FENTRY / BPF_TRAMP_FEXIT /
 *                BPF_TRAMP_MODIFY_RETURN holding the programs to invoke.
 * @func_addr:    address of the traced function.
 *
 * Returns the number of bytes emitted plus BPF_INSN_SAFETY on success, or a
 * negative errno: -ENOTSUPP when the arguments need more than
 * MAX_BPF_FUNC_ARGS slots, -ENOMEM when the branch array cannot be
 * allocated, -EINVAL when one of the emit helpers fails and -EFAULT when the
 * generated code comes within BPF_INSN_SAFETY bytes of @rw_image_end.
 */
static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image,
                                         void *rw_image_end, void *image,
                                         const struct btf_func_model *m, u32 flags,
                                         struct bpf_tramp_links *tlinks,
                                         void *func_addr)
{
        int i, ret, nr_regs = m->nr_args, stack_size = 0;
        int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off;
        struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
        struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
        struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
        void *orig_call = func_addr;
        u8 **branches = NULL;
        u8 *prog;
        bool save_ret;

        /*
         * F_INDIRECT is only compatible with F_RET_FENTRY_RET, it is
         * explicitly incompatible with F_CALL_ORIG | F_SKIP_FRAME | F_IP_ARG
         * because @func_addr.
         */
        WARN_ON_ONCE((flags & BPF_TRAMP_F_INDIRECT) &&
                     (flags & ~(BPF_TRAMP_F_INDIRECT | BPF_TRAMP_F_RET_FENTRY_RET)));

        /* extra registers for struct arguments: a struct argument occupies
         * one 8-byte slot per started 8 bytes of its size, and one slot is
         * already accounted for by nr_args.
         */
        for (i = 0; i < m->nr_args; i++) {
                if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
                        nr_regs += (m->arg_size[i] + 7) / 8 - 1;
        }

        /* x86-64 supports up to MAX_BPF_FUNC_ARGS arguments. 1-6
         * are passed through regs, the remains are through stack.
         */
        if (nr_regs > MAX_BPF_FUNC_ARGS)
                return -ENOTSUPP;

        /* Generated trampoline stack layout:
         *
         * RBP + 8         [ return address  ]
         * RBP + 0         [ RBP             ]
         *
         * RBP - 8         [ return value    ]  BPF_TRAMP_F_CALL_ORIG or
         *                                      BPF_TRAMP_F_RET_FENTRY_RET flags
         *
         *                 [ reg_argN        ]  always
         *                 [ ...             ]
         * RBP - regs_off  [ reg_arg1        ]  program's ctx pointer
         *
         * RBP - nregs_off [ regs count             ]  always
         *
         * RBP - ip_off    [ traced function ]  BPF_TRAMP_F_IP_ARG flag
         *
         * RBP - rbx_off   [ rbx value       ]  always
         *
         * RBP - run_ctx_off [ bpf_tramp_run_ctx ]
         *
         *                     [ stack_argN ]  BPF_TRAMP_F_CALL_ORIG
         *                     [ ...        ]
         *                     [ stack_arg2 ]
         * RBP - arg_stack_off [ stack_arg1 ]
         * RSP                 [ tail_call_cnt ] BPF_TRAMP_F_TAIL_CALL_CTX
         */

        /* room for return value of orig_call or fentry prog */
        save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET);
        if (save_ret)
                stack_size += 8;

        stack_size += nr_regs * 8;
        regs_off = stack_size;

        /* regs count  */
        stack_size += 8;
        nregs_off = stack_size;

        if (flags & BPF_TRAMP_F_IP_ARG)
                stack_size += 8; /* room for IP address argument */

        /* Without BPF_TRAMP_F_IP_ARG no slot was reserved and ip_off equals
         * nregs_off; it is only dereferenced below under that same flag.
         */
        ip_off = stack_size;

        stack_size += 8;
        rbx_off = stack_size;

        /* size of the run context rounded up to a multiple of 8 */
        stack_size += (sizeof(struct bpf_tramp_run_ctx) + 7) & ~0x7;
        run_ctx_off = stack_size;

        if (nr_regs > 6 && (flags & BPF_TRAMP_F_CALL_ORIG)) {
                /* the space that used to pass arguments on-stack */
                stack_size += (nr_regs - get_nr_used_regs(m)) * 8;
                /* make sure the stack pointer is 16-byte aligned if we
                 * need pass arguments on stack, which means
                 *  [stack_size + 8(rbp) + 8(rip) + 8(origin rip)]
                 * should be 16-byte aligned. Following code depend on
                 * that stack_size is already 8-byte aligned.
                 *
                 * I.e. stack_size + 24 must be a multiple of 16, so pad by
                 * 8 exactly when stack_size itself is a multiple of 16.
                 */
                stack_size += (stack_size % 16) ? 0 : 8;
        }

        arg_stack_off = stack_size;

        if (flags & BPF_TRAMP_F_SKIP_FRAME) {
                /* skip patched call instruction and point orig_call to actual
                 * body of the kernel function.
                 */
                if (is_endbr(*(u32 *)orig_call))
                        orig_call += ENDBR_INSN_SIZE;
                orig_call += X86_PATCH_SIZE;
        }

        /* From here on "prog" is the write cursor inside rw_image. */
        prog = rw_image;

        if (flags & BPF_TRAMP_F_INDIRECT) {
                /*
                 * Indirect call for bpf_struct_ops
                 */
                emit_cfi(&prog, cfi_get_func_hash(func_addr));
        } else {
                /*
                 * Direct-call fentry stub, as such it needs accounting for the
                 * __fentry__ call.
                 */
                x86_call_depth_emit_accounting(&prog, NULL, image);
        }
        EMIT1(0x55);                 /* push rbp */
        EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
        if (!is_imm8(stack_size)) {
                /* sub rsp, stack_size (32-bit immediate form) */
                EMIT3_off32(0x48, 0x81, 0xEC, stack_size);
        } else {
                /* sub rsp, stack_size (8-bit immediate form) */
                EMIT4(0x48, 0x83, 0xEC, stack_size);
        }
        if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
                EMIT1(0x50);                /* push rax */
        /* mov QWORD PTR [rbp - rbx_off], rbx */
        emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_6, -rbx_off);

        /* Store number of argument registers of the traced function:
         *   mov rax, nr_regs
         *   mov QWORD PTR [rbp - nregs_off], rax
         */
        emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_regs);
        emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -nregs_off);

        if (flags & BPF_TRAMP_F_IP_ARG) {
                /* Store IP address of the traced function:
                 * movabsq rax, func_addr
                 * mov QWORD PTR [rbp - ip_off], rax
                 */
                emit_mov_imm64(&prog, BPF_REG_0, (long) func_addr >> 32, (u32) (long) func_addr);
                emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
        }

        save_args(m, &prog, regs_off, false);

        if (flags & BPF_TRAMP_F_CALL_ORIG) {
                /* arg1: mov rdi, im */
                emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
                if (emit_rsb_call(&prog, __bpf_tramp_enter,
                                  image + (prog - (u8 *)rw_image))) {
                        ret = -EINVAL;
                        goto cleanup;
                }
        }

        if (fentry->nr_links) {
                /* branches is still NULL at this point, so returning
                 * directly instead of via cleanup leaks nothing.
                 */
                if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off,
                               flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image))
                        return -EINVAL;
        }

        if (fmod_ret->nr_links) {
                /* one slot per fmod_ret program; filled in by
                 * invoke_bpf_mod_ret() and patched further below
                 */
                branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *),
                                   GFP_KERNEL);
                if (!branches)
                        return -ENOMEM;

                if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off,
                                       run_ctx_off, branches, image, rw_image)) {
                        ret = -EINVAL;
                        goto cleanup;
                }
        }

        if (flags & BPF_TRAMP_F_CALL_ORIG) {
                restore_regs(m, &prog, regs_off);
                save_args(m, &prog, arg_stack_off, true);

                if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
                        /* Before calling the original function, restore the
                         * tail_call_cnt from stack to rax.
                         */
                        RESTORE_TAIL_CALL_CNT(stack_size);
                }

                if (flags & BPF_TRAMP_F_ORIG_STACK) {
                        /* load the trampoline's own return address
                         * ([rbp + 8]) into rbx and call through it
                         */
                        emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8);
                        EMIT2(0xff, 0xd3); /* call *rbx */
                } else {
                        /* call original function */
                        if (emit_rsb_call(&prog, orig_call, image + (prog - (u8 *)rw_image))) {
                                ret = -EINVAL;
                                goto cleanup;
                        }
                }
                /* remember return value in a stack for bpf prog to access */
                emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
                im->ip_after_call = image + (prog - (u8 *)rw_image);
                /* X86_PATCH_SIZE bytes of nops right at ip_after_call */
                emit_nops(&prog, X86_PATCH_SIZE);
        }

        if (fmod_ret->nr_links) {
                /* From Intel 64 and IA-32 Architectures Optimization
                 * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler
                 * Coding Rule 11: All branch targets should be 16-byte
                 * aligned.
                 */
                emit_align(&prog, 16);
                /* Update the branches saved in invoke_bpf_mod_ret with the
                 * aligned address of do_fexit.
                 */
                for (i = 0; i < fmod_ret->nr_links; i++) {
                        emit_cond_near_jump(&branches[i], image + (prog - (u8 *)rw_image),
                                            image + (branches[i] - (u8 *)rw_image), X86_JNE);
                }
        }

        if (fexit->nr_links) {
                if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off,
                               false, image, rw_image)) {
                        ret = -EINVAL;
                        goto cleanup;
                }
        }

        if (flags & BPF_TRAMP_F_RESTORE_REGS)
                restore_regs(m, &prog, regs_off);

        /* This needs to be done regardless. If there were fmod_ret programs,
         * the return value is only updated on the stack and still needs to be
         * restored to R0.
         */
        if (flags & BPF_TRAMP_F_CALL_ORIG) {
                im->ip_epilogue = image + (prog - (u8 *)rw_image);
                /* arg1: mov rdi, im */
                emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
                if (emit_rsb_call(&prog, __bpf_tramp_exit, image + (prog - (u8 *)rw_image))) {
                        ret = -EINVAL;
                        goto cleanup;
                }
        } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
                /* Before running the original function, restore the
                 * tail_call_cnt from stack to rax.
                 */
                RESTORE_TAIL_CALL_CNT(stack_size);
        }

        /* restore return value of orig_call or fentry prog back into RAX */
        if (save_ret)
                emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);

        /* mov rbx, QWORD PTR [rbp - rbx_off]: undo the save done in the prologue */
        emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off);
        EMIT1(0xC9); /* leave */
        if (flags & BPF_TRAMP_F_SKIP_FRAME) {
                /* skip our return address and return to parent */
                EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
        }
        emit_return(&prog, image + (prog - (u8 *)rw_image));
        /* Make sure the trampoline generation logic doesn't overflow */
        if (WARN_ON_ONCE(prog > (u8 *)rw_image_end - BPF_INSN_SAFETY)) {
                ret = -EFAULT;
                goto cleanup;
        }
        /* success: report emitted length plus the safety margin */
        ret = prog - (u8 *)rw_image + BPF_INSN_SAFETY;

cleanup:
        /* kfree(NULL) is fine when no fmod_ret programs were attached */
        kfree(branches);
        return ret;
}

/*
 * Allocate @size bytes for a trampoline image from the BPF prog pack.
 * jit_fill_hole is handed to the allocator as its fill callback. The result
 * of bpf_prog_pack_alloc() is returned unchanged, so failure reporting is
 * whatever that allocator uses (presumably NULL - confirm against
 * bpf_prog_pack_alloc()).
 */
void *arch_alloc_bpf_trampoline(unsigned int size)
{
        return bpf_prog_pack_alloc(size, jit_fill_hole);
}

/*
 * Release a trampoline image by passing @image and @size straight to
 * bpf_prog_pack_free(), the counterpart of the bpf_prog_pack_alloc() call
 * made in arch_alloc_bpf_trampoline().
 */
void arch_free_bpf_trampoline(void *image, unsigned int size)
{
        bpf_prog_pack_free(image, size);
}

/*
 * Nothing to do here on x86-64: neither @image nor @size is touched and the
 * function always reports success (0).
 */
int arch_protect_bpf_trampoline(void *image, unsigned int size)
{
        return 0;
}

/*
 * Generate a trampoline for the range [@image, @image_end).
 *
 * The code is first produced in a scratch buffer of the same size and then
 * copied over @image with bpf_arch_text_copy().
 *
 * Returns what __arch_prepare_bpf_trampoline() returned, -ENOMEM when the
 * scratch buffer cannot be allocated, or the error encoded in the pointer
 * returned by bpf_arch_text_copy() when the copy fails.
 */
int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
                                const struct btf_func_model *m, u32 flags,
                                struct bpf_tramp_links *tlinks,
                                void *func_addr)
{
        u32 image_len = image_end - image;
        void *scratch, *copied;
        int ret;

        /* The scratch buffer doesn't need to be in module memory range,
         * so plain kvmalloc is good enough for it.
         */
        scratch = kvmalloc(image_len, GFP_KERNEL);
        if (!scratch)
                return -ENOMEM;

        ret = __arch_prepare_bpf_trampoline(im, scratch, scratch + image_len,
                                            image, m, flags, tlinks, func_addr);
        if (ret >= 0) {
                /* generation succeeded: publish the bytes into the image */
                copied = bpf_arch_text_copy(image, scratch, image_len);
                if (IS_ERR(copied))
                        ret = PTR_ERR(copied);
        }

        kvfree(scratch);
        return ret;
}

/*
 * Work out how large the trampoline for (@m, @flags, @tlinks, @func_addr)
 * would be by generating it once into a throw-away PAGE_SIZE buffer.
 *
 * Returns the value of __arch_prepare_bpf_trampoline() for that dry run, or
 * -ENOMEM if the temporary buffer cannot be allocated.
 */
int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
                             struct bpf_tramp_links *tlinks, void *func_addr)
{
        struct bpf_tramp_image im;
        void *scratch;
        int size;

        /* The temporary buffer does NOT fragment the direct map, since
         * set_memory_*() is never called on it.
         *
         * kvmalloc cannot be used for it, because the image has to be in
         * module memory range.
         */
        scratch = bpf_jit_alloc_exec(PAGE_SIZE);
        if (scratch) {
                /* the same buffer serves as both the rw and the final image */
                size = __arch_prepare_bpf_trampoline(&im, scratch,
                                                     scratch + PAGE_SIZE, scratch,
                                                     m, flags, tlinks, func_addr);
                bpf_jit_free_exec(scratch);
        } else {
                size = -ENOMEM;
        }

        return size;
}

/*
 * Recursively emit the compare-and-branch tree of a BPF dispatcher for the
 * index range [@a, @b] of @progs.
 *
 * @pprog: in/out write cursor; only updated when the call succeeds.
 * @a, @b: first and last index of the range handled by this call.
 * @progs: target addresses; arch_prepare_bpf_dispatcher() sorts them before
 *         the first call. Each one must fit a signed 32-bit immediate.
 * @image: address the code is generated for.
 * @buf:   start of the buffer being written; image + (prog - buf) is the
 *         final address of the byte at the cursor.
 *
 * For a single entry the emitted code is "cmp rdx, func; je func" followed
 * by an indirect jump through rdx. For a range it compares rdx against the
 * middle entry, emits the lower half, then back-patches a "jg" over it to
 * the 16-byte aligned start of the upper half.
 *
 * Returns 0 on success, -1 if an address does not fit a signed 32-bit
 * immediate, or the error reported by emit_cond_near_jump().
 */
static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf)
{
        u8 *jg_reloc, *prog = *pprog;
        int pivot, err, jg_bytes = 1;
        s64 jg_offset;

        if (a == b) {
                /* Leaf node of recursion, i.e. not a range of indices
                 * anymore.
                 */
                EMIT1(add_1mod(0x48, BPF_REG_3));        /* cmp rdx,func */
                if (!is_simm32(progs[a]))
                        return -1;
                EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3),
                            progs[a]);
                err = emit_cond_near_jump(&prog,        /* je func */
                                          (void *)progs[a], image + (prog - buf),
                                          X86_JE);
                if (err)
                        return err;

                /* no match: jump to whatever address rdx holds */
                emit_indirect_jump(&prog, 2 /* rdx */, image + (prog - buf));

                *pprog = prog;
                return 0;
        }

        /* Not a leaf node, so we pivot, and recursively descend into
         * the lower and upper ranges.
         */
        pivot = (b - a) / 2;
        EMIT1(add_1mod(0x48, BPF_REG_3));                /* cmp rdx,func */
        if (!is_simm32(progs[a + pivot]))
                return -1;
        EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3), progs[a + pivot]);

        /* The jump displacement is not known yet, so emit a zero
         * placeholder (1 or 4 bytes wide) and patch it further down.
         */
        if (pivot > 2) {                                /* jg upper_part */
                /* Require near jump. */
                jg_bytes = 4;
                EMIT2_off32(0x0F, X86_JG + 0x10, 0);
        } else {
                EMIT2(X86_JG, 0);
        }
        /* first byte after the jg; its displacement occupies the jg_bytes
         * bytes right before this address
         */
        jg_reloc = prog;

        err = emit_bpf_dispatcher(&prog, a, a + pivot,        /* emit lower_part */
                                  progs, image, buf);
        if (err)
                return err;

        /* From Intel 64 and IA-32 Architectures Optimization
         * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler
         * Coding Rule 11: All branch targets should be 16-byte
         * aligned.
         */
        emit_align(&prog, 16);
        /* back-patch the placeholder with the distance to the upper part */
        jg_offset = prog - jg_reloc;
        emit_code(jg_reloc - jg_bytes, jg_offset, jg_bytes);

        err = emit_bpf_dispatcher(&prog, a + pivot + 1,        /* emit upper_part */
                                  b, progs, image, buf);
        if (err)
                return err;

        *pprog = prog;
        return 0;
}

/*
 * Three-way comparison of two s64 values, in the shape sort() expects:
 * returns 1 if *a is larger, -1 if *a is smaller and 0 if both are equal.
 */
static int cmp_ips(const void *a, const void *b)
{
        const s64 lhs = *(const s64 *)a;
        const s64 rhs = *(const s64 *)b;

        /* each comparison yields 0 or 1, so the difference is -1, 0 or 1 */
        return (lhs > rhs) - (lhs < rhs);
}

/*
 * Generate the dispatcher code for @num_funcs target addresses.
 *
 * @funcs is sorted in place (ascending, via cmp_ips) and then turned into a
 * compare-and-branch tree written to @buf for execution at @image.
 * Returns whatever emit_bpf_dispatcher() returns.
 */
int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs)
{
        const int last = num_funcs - 1;
        u8 *cursor = buf;

        sort(funcs, num_funcs, sizeof(funcs[0]), cmp_ips, NULL);

        return emit_bpf_dispatcher(&cursor, 0, last, funcs, image, buf);
}

/*
 * JIT state that bpf_int_jit_compile() parks in prog->aux->jit_data when it
 * stops before the final pass, so that a later call on the same program can
 * pick up where the first one left off.
 */
struct x64_jit_data {
        /* writable counterpart of @header, from bpf_jit_binary_pack_alloc() */
        struct bpf_binary_header *rw_header;
        /* binary header returned by bpf_jit_binary_pack_alloc() */
        struct bpf_binary_header *header;
        /* per-instruction offsets (prog->len + 1 entries); non-NULL marks saved state */
        int *addrs;
        /* start of the JITed code inside @header's allocation */
        u8 *image;
        /* program length reported by the last do_jit() pass */
        int proglen;
        /* snapshot of the jit_context used for the earlier passes */
        struct jit_context ctx;
};

/* Upper bound on the number of do_jit() passes made before an image exists. */
#define MAX_PASSES 20
/* Pass index from which bpf_int_jit_compile() asks do_jit() to use padding. */
#define PADDING_PASSES (MAX_PASSES - 5)

/*
 * JIT-compile @prog.
 *
 * Returns the program to use afterwards: the JITed program (the clone
 * produced by bpf_jit_blind_constants() when it made one), or the original
 * @prog when JIT was not requested or any step failed, so that the caller
 * can fall back to the interpreter.
 *
 * For a program with prog->is_func set the work is split over two calls:
 * the first one stops after emitting the image and saves its state in
 * prog->aux->jit_data; the second call finds that state (jit_data->addrs
 * non-NULL), runs as "extra_pass" and finalizes the image.
 */
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
        struct bpf_binary_header *rw_header = NULL;
        struct bpf_binary_header *header = NULL;
        struct bpf_prog *tmp, *orig_prog = prog;
        struct x64_jit_data *jit_data;
        int proglen, oldproglen = 0;
        struct jit_context ctx = {};
        bool tmp_blinded = false;
        bool extra_pass = false;
        bool padding = false;
        u8 *rw_image = NULL;
        u8 *image = NULL;
        int *addrs;
        int pass;
        int i;

        if (!prog->jit_requested)
                return orig_prog;

        tmp = bpf_jit_blind_constants(prog);
        /*
         * If blinding was requested and we failed during blinding,
         * we must fall back to the interpreter.
         */
        if (IS_ERR(tmp))
                return orig_prog;
        /* a different pointer means we got a blinded clone to work on */
        if (tmp != prog) {
                tmp_blinded = true;
                prog = tmp;
        }

        jit_data = prog->aux->jit_data;
        if (!jit_data) {
                jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
                if (!jit_data) {
                        prog = orig_prog;
                        goto out;
                }
                prog->aux->jit_data = jit_data;
        }
        addrs = jit_data->addrs;
        if (addrs) {
                /* State from an earlier call exists: restore it and go
                 * straight to the extra pass over the existing image.
                 */
                ctx = jit_data->ctx;
                oldproglen = jit_data->proglen;
                image = jit_data->image;
                header = jit_data->header;
                rw_header = jit_data->rw_header;
                /* image sits at the same offset in the rw copy as in header */
                rw_image = (void *)rw_header + ((void *)image - (void *)header);
                extra_pass = true;
                padding = true;
                goto skip_init_addrs;
        }
        addrs = kvmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL);
        if (!addrs) {
                prog = orig_prog;
                goto out_addrs;
        }

        /*
         * Before first pass, make a rough estimation of addrs[]
         * each BPF instruction is translated to less than 64 bytes
         */
        for (proglen = 0, i = 0; i <= prog->len; i++) {
                proglen += 64;
                addrs[i] = proglen;
        }
        ctx.cleanup_addr = proglen;
skip_init_addrs:

        /*
         * JITed image shrinks with every pass and the loop iterates
         * until the image stops shrinking. Very large BPF programs
         * may converge on the last pass. In such case do one more
         * pass to emit the final image.
         */
        for (pass = 0; pass < MAX_PASSES || image; pass++) {
                if (!padding && pass >= PADDING_PASSES)
                        padding = true;
                proglen = do_jit(prog, addrs, image, rw_image, oldproglen, &ctx, padding);
                if (proglen <= 0) {
                        /* also reached via goto from the failure paths below */
out_image:
                        image = NULL;
                        if (header) {
                                bpf_arch_text_copy(&header->size, &rw_header->size,
                                                   sizeof(rw_header->size));
                                bpf_jit_binary_pack_free(header, rw_header);
                        }
                        /* Fall back to interpreter mode */
                        prog = orig_prog;
                        if (extra_pass) {
                                prog->bpf_func = NULL;
                                prog->jited = 0;
                                prog->jited_len = 0;
                        }
                        goto out_addrs;
                }
                if (image) {
                        /* This pass wrote into the image; its length must
                         * match the length the image was sized for.
                         */
                        if (proglen != oldproglen) {
                                pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
                                       proglen, oldproglen);
                                goto out_image;
                        }
                        break;
                }
                if (proglen == oldproglen) {
                        /*
                         * The number of entries in extable is the number of BPF_LDX
                         * insns that access kernel memory via "pointer to BTF type".
                         * The verifier changed their opcode from LDX|MEM|size
                         * to LDX|PROBE_MEM|size to make JITing easier.
                         */
                        u32 align = __alignof__(struct exception_table_entry);
                        u32 extable_size = prog->aux->num_exentries *
                                sizeof(struct exception_table_entry);

                        /* allocate module memory for x86 insns and extable */
                        header = bpf_jit_binary_pack_alloc(roundup(proglen, align) + extable_size,
                                                           &image, align, &rw_header, &rw_image,
                                                           jit_fill_hole);
                        if (!header) {
                                prog = orig_prog;
                                goto out_addrs;
                        }
                        /* the extable follows the code, at the next aligned offset */
                        prog->aux->extable = (void *) image + roundup(proglen, align);
                }
                oldproglen = proglen;
                cond_resched();
        }

        if (bpf_jit_enable > 1)
                bpf_jit_dump(prog->len, proglen, pass + 1, rw_image);

        if (image) {
                if (!prog->is_func || extra_pass) {
                        /*
                         * bpf_jit_binary_pack_finalize fails in two scenarios:
                         *   1) header is not pointing to proper module memory;
                         *   2) the arch doesn't support bpf_arch_text_copy().
                         *
                         * Both cases are serious bugs and justify WARN_ON.
                         */
                        if (WARN_ON(bpf_jit_binary_pack_finalize(prog, header, rw_header))) {
                                /* header has been freed */
                                header = NULL;
                                goto out_image;
                        }

                        bpf_tail_call_direct_fixup(prog);
                } else {
                        /* is_func and not yet the extra pass: keep the state
                         * for the next call instead of finalizing now
                         */
                        jit_data->addrs = addrs;
                        jit_data->ctx = ctx;
                        jit_data->proglen = proglen;
                        jit_data->image = image;
                        jit_data->header = header;
                        jit_data->rw_header = rw_header;
                }
                /*
                 * ctx.prog_offset is used when CFI preambles put code *before*
                 * the function. See emit_cfi(). For FineIBT specifically this code
                 * can also be executed and bpf_prog_kallsyms_add() will
                 * generate an additional symbol to cover this, hence also
                 * decrement proglen.
                 */
                prog->bpf_func = (void *)image + cfi_get_offset();
                prog->jited = 1;
                prog->jited_len = proglen - cfi_get_offset();
        } else {
                /* the pass budget ran out before an image was allocated */
                prog = orig_prog;
        }

        /* Free the bookkeeping unless it was just saved for a later call. */
        if (!image || !prog->is_func || extra_pass) {
                if (image)
                        bpf_prog_fill_jited_linfo(prog, addrs + 1);
out_addrs:
                kvfree(addrs);
                kfree(jit_data);
                prog->aux->jit_data = NULL;
        }
out:
        /* Drop whichever of the two programs is not being returned. */
        if (tmp_blinded)
                bpf_jit_prog_release_other(prog, prog == orig_prog ?
                                           tmp : orig_prog);
        return prog;
}

/*
 * Capability query, unconditionally true on x86-64. Going by the name it
 * tells the BPF core that this JIT can emit kfunc calls.
 */
bool bpf_jit_supports_kfunc_call(void)
{
        return true;
}

/*
 * Copy @len bytes from @src to @dst using text_poke_copy().
 *
 * Returns @dst on success and ERR_PTR(-EINVAL) if text_poke_copy()
 * returned NULL.
 */
void *bpf_arch_text_copy(void *dst, void *src, size_t len)
{
        void *copied = text_poke_copy(dst, src, len);

        return copied ? dst : ERR_PTR(-EINVAL);
}

/*
 * Indicate the JIT backend supports mixing bpf2bpf and tailcalls.
 * Unconditionally true on x86-64.
 */
bool bpf_jit_supports_subprog_tailcalls(void)
{
        return true;
}

/*
 * Capability query, unconditionally true on x86-64. Going by the name it
 * covers the JIT's support for per-CPU instructions.
 */
bool bpf_jit_supports_percpu_insn(void)
{
        return true;
}

/*
 * Free @prog together with its JITed image, if it has one.
 *
 * For a JITed program this finalizes and drops any JIT state still parked
 * in prog->aux->jit_data, rewinds prog->bpf_func by cfi_get_offset() and
 * frees the binary pack the image lives in. The program itself is released
 * with bpf_prog_unlock_free() in every case.
 */
void bpf_jit_free(struct bpf_prog *prog)
{
        struct x64_jit_data *leftover;
        struct bpf_binary_header *hdr;

        if (!prog->jited)
                goto unlock_free;

        /*
         * Leftover JIT state means the final pass (from jit_subprogs)
         * failed and the program may not have been finalized yet, so
         * finalize it now before it is freed.
         */
        leftover = prog->aux->jit_data;
        if (leftover) {
                bpf_jit_binary_pack_finalize(prog, leftover->header,
                                             leftover->rw_header);
                kvfree(leftover->addrs);
                kfree(leftover);
        }

        /* step bpf_func back by the CFI offset before looking up the header */
        prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
        hdr = bpf_jit_binary_pack_hdr(prog);
        bpf_jit_binary_pack_free(hdr, NULL);
        WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));

unlock_free:
        bpf_prog_unlock_free(prog);
}

/*
 * Report whether BPF exceptions can be supported by this JIT: true exactly
 * when the kernel is built with CONFIG_UNWINDER_ORC.
 */
bool bpf_jit_supports_exceptions(void)
{
        /* We unwind through both kernel frames (starting from within bpf_throw
         * call) and BPF frames. Therefore we require ORC unwinder to be enabled
         * to walk kernel frames and reach BPF frames in the stack trace.
         */
        return IS_ENABLED(CONFIG_UNWINDER_ORC);
}

/*
 * Walk the stack of the current task and hand every frame to @consume_fn.
 *
 * @consume_fn: called with @cookie and the frame's return address, stack
 *              pointer and frame pointer; returning false stops the walk.
 * @cookie:     opaque value passed through to @consume_fn.
 *
 * The walk also stops at the first frame without a return address. Without
 * CONFIG_UNWINDER_ORC no walk is possible and a warning is raised instead.
 */
void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{
#ifdef CONFIG_UNWINDER_ORC
        struct unwind_state state;
        unsigned long ret_addr;

        unwind_start(&state, current, NULL, NULL);
        while (!unwind_done(&state)) {
                ret_addr = unwind_get_return_address(&state);
                if (!ret_addr)
                        break;
                if (!consume_fn(cookie, (u64)ret_addr, (u64)state.sp, (u64)state.bp))
                        break;
                unwind_next_frame(&state);
        }
#else
        WARN(1, "verification of programs using bpf_throw should have failed\n");
#endif
}

/*
 * Re-patch the tail call described by @poke when its target program changes
 * from @old to @new. Either may be NULL, meaning "no program".
 *
 * All patching goes through __bpf_arch_text_poke(): on program loading or
 * teardown the program's kallsym entry might not be in place, and this
 * variant skips the kallsyms check. Any failure to patch is fatal (BUG_ON).
 */
void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
                               struct bpf_prog *new, struct bpf_prog *old)
{
        u8 *prev_target = NULL, *next_target = NULL, *prev_bypass = NULL;
        int err;

        if (old)
                prev_target = (u8 *)old->bpf_func + poke->adj_off;
        else
                prev_bypass = poke->bypass_addr;
        if (new)
                next_target = (u8 *)new->bpf_func + poke->adj_off;

        if (!new) {
                /* Target goes away: first route around the tail call ... */
                err = __bpf_arch_text_poke(poke->tailcall_bypass,
                                           BPF_MOD_JUMP,
                                           prev_bypass, poke->bypass_addr);
                BUG_ON(err < 0);
                /* ... then let other CPUs finish executing the program, so
                 * that they cannot be exposed to the invalid
                 * nop, stack unwind, nop state ...
                 */
                if (!err)
                        synchronize_rcu();
                /* ... and only then remove the jump to the old target. */
                err = __bpf_arch_text_poke(poke->tailcall_target,
                                           BPF_MOD_JUMP,
                                           prev_target, NULL);
                BUG_ON(err < 0);
                return;
        }

        /* A target is installed or replaced. */
        err = __bpf_arch_text_poke(poke->tailcall_target,
                                   BPF_MOD_JUMP,
                                   prev_target, next_target);
        BUG_ON(err < 0);
        if (old)
                return;

        /* There was no target before, so the bypass jump is removed too. */
        err = __bpf_arch_text_poke(poke->tailcall_bypass,
                                   BPF_MOD_JUMP,
                                   poke->bypass_addr, NULL);
        BUG_ON(err < 0);
}

/*
 * Capability query, unconditionally true on x86-64. Going by the name it
 * advertises JIT support for BPF arena memory.
 */
bool bpf_jit_supports_arena(void)
{
        return true;
}

/*
 * Tell whether this JIT can handle @insn.
 *
 * Everything is accepted outside of an arena (@in_arena false). Inside an
 * arena the only rejected instructions are 32-bit and 64-bit atomics whose
 * operation is a fetching AND, OR or XOR.
 */
bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
{
        bool atomic_w_or_dw, fetching_bitop;

        if (!in_arena)
                return true;

        atomic_w_or_dw = insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) ||
                         insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW);
        if (!atomic_w_or_dw)
                return true;

        fetching_bitop = insn->imm == (BPF_AND | BPF_FETCH) ||
                         insn->imm == (BPF_OR | BPF_FETCH) ||
                         insn->imm == (BPF_XOR | BPF_FETCH);

        return !fetching_bitop;
}

/*
 * Capability query, unconditionally true on x86-64. Going by the name it
 * advertises JIT support for exchanging pointers with an atomic xchg.
 */
bool bpf_jit_supports_ptr_xchg(void)
{
        return true;
}

/*
 * The x86-64 JIT emits its own code to filter user addresses, so no limit is
 * reported to the caller: this always returns 0.
 */
u64 bpf_arch_uaddress_limit(void)
{
        return 0;
}













































   10 


















    6 

















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * kref.h - library routines for handling generic reference counted objects
 *
 * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2004 IBM Corp.
 *
 * based on kobject.h which was:
 * Copyright (C) 2002-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (C) 2002-2003 Open Source Development Labs
 */

#ifndef _KREF_H_
#define _KREF_H_

#include <linux/spinlock.h>
#include <linux/refcount.h>

/*
 * Reference counter meant to be embedded in the object it protects; it is a
 * thin wrapper around a refcount_t.
 */
struct kref {
        refcount_t refcount;    /* the counter itself; use the kref_*() helpers */
};

/* Static initializer for a struct kref whose reference count starts at n. */
#define KREF_INIT(n)        { .refcount = REFCOUNT_INIT(n), }

/**
 * kref_init - initialize object.
 * @kref: object in question.
 *
 * Sets the reference count to 1, i.e. the caller holds the first reference.
 */
static inline void kref_init(struct kref *kref)
{
        refcount_set(&kref->refcount, 1);
}

/**
 * kref_read - read the current reference count of an object.
 * @kref: object.
 *
 * Return: the value reported by refcount_read() for the embedded counter.
 */
static inline unsigned int kref_read(const struct kref *kref)
{
        return refcount_read(&kref->refcount);
}

/**
 * kref_get - increment refcount for object.
 * @kref: object.
 *
 * Takes one additional reference through refcount_inc().
 */
static inline void kref_get(struct kref *kref)
{
        refcount_inc(&kref->refcount);
}

/**
 * kref_put - decrement refcount for object.
 * @kref: object.
 * @release: pointer to the function that will clean up the object when the
 *             last reference to the object is released.
 *             This pointer is required, and it is not acceptable to pass kfree
 *             in as this function.
 *
 * Drop one reference and, if that was the last one, call release().
 *
 * Return: 1 if the object was removed, otherwise 0. Beware: a return value
 * of 0 does not guarantee that the kref is still in memory afterwards. Only
 * use the return value to find out whether this call removed the object.
 */
static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
{
        /* somebody else still holds a reference: nothing to clean up */
        if (!refcount_dec_and_test(&kref->refcount))
                return 0;

        release(kref);
        return 1;
}

/**
 * kref_put_mutex - decrement refcount for object, coordinating with a mutex.
 * @kref: object.
 * @release: function called to clean up the object once the last reference
 *           is gone.
 * @lock: mutex handed to refcount_dec_and_mutex_lock().
 *
 * Like kref_put(), except that the decrement goes through
 * refcount_dec_and_mutex_lock(). NOTE(review): that helper presumably
 * returns true with @lock held, leaving the unlock to release() - confirm
 * against its documentation.
 *
 * Return: 1 if release() was called, otherwise 0.
 */
static inline int kref_put_mutex(struct kref *kref,
                                 void (*release)(struct kref *kref),
                                 struct mutex *lock)
{
        if (!refcount_dec_and_mutex_lock(&kref->refcount, lock))
                return 0;

        release(kref);
        return 1;
}

/**
 * kref_put_lock - decrement refcount for object, using a spinlock for the
 *                 final drop.
 * @kref: object.
 * @release: function called on @kref when the last reference is dropped.
 * @lock: spinlock passed to refcount_dec_and_lock().
 *
 * release() runs only when refcount_dec_and_lock() reports that the count
 * reached zero. This helper never unlocks @lock itself.
 * NOTE(review): presumably @lock is held when release() is called and
 * release() must drop it - confirm against refcount_dec_and_lock().
 *
 * Return: 1 if release() was called, 0 otherwise.
 */
static inline int kref_put_lock(struct kref *kref,
                                void (*release)(struct kref *kref),
                                spinlock_t *lock)
{
        if (!refcount_dec_and_lock(&kref->refcount, lock))
                return 0;

        release(kref);
        return 1;
}

/**
 * kref_get_unless_zero - Increment refcount for object unless it is zero.
 * @kref: object.
 *
 * Return non-zero if the increment succeeded. Otherwise return 0.
 *
 * This function is intended to simplify locking around refcounting for
 * objects that can be looked up from a lookup structure, and which are
 * removed from that lookup structure in the object destructor.
 * Without it, operations on such objects require at least a read lock
 * around lookup + kref_get, and a write lock around kref_put + remove from
 * lookup structure, and RCU implementations become extremely tricky.
 * With a lookup followed by a kref_get_unless_zero *with return value check*,
 * locking in the kref_put path can be deferred to the actual removal from
 * the lookup structure and RCU lookups become trivial.
 *
 * The function is marked __must_check: the result has to be tested.
 */
static inline int __must_check kref_get_unless_zero(struct kref *kref)
{
        return refcount_inc_not_zero(&kref->refcount);
}
#endif /* _KREF_H_ */


















































































































    5 






    3 









































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * xfrm_device.c - IPsec device offloading code.
 *
 * Copyright (c) 2015 secunet Security Networks AG
 *
 * Author:
 * Steffen Klassert <steffen.klassert@secunet.com>
 */

#include <linux/errno.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <net/dst.h>
#include <net/gso.h>
#include <net/xfrm.h>
#include <linux/notifier.h>

#ifdef CONFIG_XFRM_OFFLOAD
/*
 * Transport-mode preparation of @skb before handing it to the offload
 * xmit hook.
 *
 * @x:     state whose props.header_len is used for the adjustments.
 * @skb:   packet to adjust; must carry an xfrm_offload extension (the
 *         result of xfrm_offload() is dereferenced without a NULL check).
 * @hsize: unused here; kept so the prep helpers share one signature.
 */
static void __xfrm_transport_prep(struct xfrm_state *x, struct sk_buff *skb,
                                  unsigned int hsize)
{
        struct xfrm_offload *xo = xfrm_offload(skb);

        skb_reset_mac_len(skb);
        /* For GSO segments, move the transport header back by header_len. */
        if (xo->flags & XFRM_GSO_SEGMENT)
                skb->transport_header -= x->props.header_len;

        /* Pull data up to the transport header plus header_len bytes. */
        pskb_pull(skb, skb_transport_offset(skb) + x->props.header_len);
}

/*
 * Tunnel-mode preparation of @skb before handing it to the offload
 * xmit hook.
 *
 * @x:     state whose props.header_len is pulled from the packet.
 * @skb:   packet to adjust; must carry an xfrm_offload extension (the
 *         result of xfrm_offload() is dereferenced without a NULL check).
 * @hsize: size of the outer IP header (the caller passes
 *         sizeof(struct iphdr) or sizeof(struct ipv6hdr)).
 */
static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb,
                                    unsigned int hsize)

{
        struct xfrm_offload *xo = xfrm_offload(skb);

        /* For GSO segments, place the transport header right after the
         * outer IP header.
         */
        if (xo->flags & XFRM_GSO_SEGMENT)
                skb->transport_header = skb->network_header + hsize;

        skb_reset_mac_len(skb);
        /* Pull the MAC header plus header_len bytes. */
        pskb_pull(skb, skb->mac_len + x->props.header_len);
}

/*
 * BEET-mode preparation of @skb before handing it to the offload xmit hook.
 *
 * @x:     state supplying selector family, outer family and header_len.
 * @skb:   packet to adjust; must carry an xfrm_offload extension.
 * @hsize: size of the outer IP header.
 *
 * When the selector family is not AF_INET6, IPV4_BEET_PHMAXLEN is
 * subtracted from header_len before the pull. If the outer family is
 * AF_INET6 in that case, the IPv6/IPv4 header size difference is
 * subtracted as well.
 */
static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb,
                                  unsigned int hsize)
{
        struct xfrm_offload *offload = xfrm_offload(skb);
        int pseudo_len = 0;

        /* Work out how much of header_len must not be pulled. */
        if (x->sel.family != AF_INET6) {
                pseudo_len = IPV4_BEET_PHMAXLEN;
                if (x->outer_mode.family == AF_INET6)
                        pseudo_len += sizeof(struct ipv6hdr) - sizeof(struct iphdr);
        }

        /* For GSO segments, place the transport header after the outer
         * IP header.
         */
        if (offload->flags & XFRM_GSO_SEGMENT)
                skb->transport_header = skb->network_header + hsize;

        skb_reset_mac_len(skb);

        pskb_pull(skb, skb->mac_len + hsize + (x->props.header_len - pseudo_len));
}

/*
 * Adjust pointers into the packet when IPsec is done at layer2.
 *
 * The outer IP header size is derived from the outer family first; any
 * family other than AF_INET/AF_INET6 leaves the packet untouched. The
 * encapsulation mode then selects the matching prep helper. Route
 * optimization and in-trigger modes need no adjustment.
 */
static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb)
{
        unsigned int outer_hdr_len;

        switch (x->outer_mode.family) {
        case AF_INET:
                outer_hdr_len = sizeof(struct iphdr);
                break;
        case AF_INET6:
                outer_hdr_len = sizeof(struct ipv6hdr);
                break;
        default:
                return;
        }

        switch (x->outer_mode.encap) {
        case XFRM_MODE_TUNNEL:
                __xfrm_mode_tunnel_prep(x, skb, outer_hdr_len);
                break;
        case XFRM_MODE_TRANSPORT:
                __xfrm_transport_prep(x, skb, outer_hdr_len);
                break;
        case XFRM_MODE_BEET:
                __xfrm_mode_beet_prep(x, skb, outer_hdr_len);
                break;
        case XFRM_MODE_ROUTEOPTIMIZATION:
        case XFRM_MODE_IN_TRIGGER:
                break;
        }
}

/*
 * Report whether adding the skb's GSO segment count to the low 32 bits of
 * its offload sequence number wraps around.
 *
 * Returns true on wrap-around (the 32-bit sum is smaller than the starting
 * value), false otherwise.
 */
static inline bool xmit_xfrm_check_overflow(struct sk_buff *skb)
{
        struct xfrm_offload *offload = xfrm_offload(skb);
        const __u32 first_seq = offload->seq.low;
        const __u32 last_seq = first_seq + skb_shinfo(skb)->gso_segs;

        return unlikely(last_seq < first_seq);
}

/*
 * validate_xmit_xfrm - run the IPsec offload transmit step on an skb.
 * @skb:      packet to process; may become a list of GSO segments here.
 * @features: device features; ESP-specific adjustments are applied to a
 *            local copy.
 * @again:    set to true when @skb is returned untouched because the
 *            per-CPU xfrm backlog is not empty.
 *
 * Return: the skb (or segment list) to transmit, which is @skb unchanged
 * when no offload work applies; NULL when the packet was dropped or when
 * the type_offload xmit hook returned -EINPROGRESS for it.
 */
struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again)
{
        int err;
        unsigned long flags;
        struct xfrm_state *x;
        struct softnet_data *sd;
        struct sk_buff *skb2, *nskb, *pskb = NULL;
        netdev_features_t esp_features = features;
        struct xfrm_offload *xo = xfrm_offload(skb);
        struct net_device *dev = skb->dev;
        struct sec_path *sp;

        /* No offload extension, or XFRM_XMIT already set: nothing to do. */
        if (!xo || (xo->flags & XFRM_XMIT))
                return skb;

        /* Without NETIF_F_HW_ESP, drop SG and checksum features. */
        if (!(features & NETIF_F_HW_ESP))
                esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);

        /* Use the last state recorded in the secpath. */
        sp = skb_sec_path(skb);
        x = sp->xvec[sp->len - 1];
        if (xo->flags & XFRM_GRO || x->xso.dir == XFRM_DEV_OFFLOAD_IN)
                return skb;

        /* The packet was sent to HW IPsec packet offload engine,
         * but to wrong device. Drop the packet, so it won't skip
         * XFRM stack.
         */
        if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->xso.dev != dev) {
                kfree_skb(skb);
                dev_core_stats_tx_dropped_inc(dev);
                return NULL;
        }

        /* This skb was already validated on the upper/virtual dev */
        if ((x->xso.dev != dev) && (x->xso.real_dev == dev))
                return skb;

        /* Sample this CPU's xfrm backlog with interrupts disabled. */
        local_irq_save(flags);
        sd = this_cpu_ptr(&softnet_data);
        err = !skb_queue_empty(&sd->xfrm_backlog);
        local_irq_restore(flags);

        /* Backlog not empty: return the skb unprocessed and set *again. */
        if (err) {
                *again = true;
                return skb;
        }

        /* Segment in software when a GSO skb is on a device other than the
         * offload device, or when the sequence number would wrap.
         */
        if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) ||
                                unlikely(xmit_xfrm_check_overflow(skb)))) {
                struct sk_buff *segs;

                /* Packet got rerouted, fixup features and segment it. */
                esp_features = esp_features & ~(NETIF_F_HW_ESP | NETIF_F_GSO_ESP);

                segs = skb_gso_segment(skb, esp_features);
                if (IS_ERR(segs)) {
                        kfree_skb(skb);
                        dev_core_stats_tx_dropped_inc(dev);
                        return NULL;
                } else {
                        consume_skb(skb);
                        skb = segs;
                }
        }

        /* Single skb (not a list). */
        if (!skb->next) {
                esp_features |= skb->dev->gso_partial_features;
                xfrm_outer_mode_prep(x, skb);

                xo->flags |= XFRM_DEV_RESUME;

                err = x->type_offload->xmit(x, skb, esp_features);
                if (err) {
                        /* -EINPROGRESS: the skb is not freed here. */
                        if (err == -EINPROGRESS)
                                return NULL;

                        XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
                        kfree_skb(skb);
                        return NULL;
                }

                /* Push skb->data back to the MAC header. */
                skb_push(skb, skb->data - skb_mac_header(skb));

                return skb;
        }

        /* Segment list: process each segment in turn. pskb is the last
         * segment kept in the output list.
         */
        skb_list_walk_safe(skb, skb2, nskb) {
                esp_features |= skb->dev->gso_partial_features;
                skb_mark_not_on_list(skb2);

                xo = xfrm_offload(skb2);
                xo->flags |= XFRM_DEV_RESUME;

                xfrm_outer_mode_prep(x, skb2);

                err = x->type_offload->xmit(x, skb2, esp_features);
                if (!err) {
                        /* Keep the segment: relink it to its successor. */
                        skb2->next = nskb;
                } else if (err != -EINPROGRESS) {
                        /* Hard error: free this segment and all that follow.
                         * NOTE(review): segments kept before skb2 (from the
                         * list head up to pskb) are not freed on this path,
                         * and NULL is returned - confirm whether that leaks.
                         */
                        XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
                        skb2->next = nskb;
                        kfree_skb_list(skb2);
                        return NULL;
                } else {
                        /* -EINPROGRESS: unlink this segment from the list
                         * that will be returned.
                         */
                        if (skb == skb2)
                                skb = nskb;
                        else
                                pskb->next = nskb;

                        continue;
                }

                skb_push(skb2, skb2->data - skb_mac_header(skb2));
                pskb = skb2;
        }

        return skb;
}
EXPORT_SYMBOL_GPL(validate_xmit_xfrm);

/*
 * xfrm_dev_state_add - try to offload an xfrm state to a network device.
 * @net:    namespace used for the device lookup and route lookup.
 * @x:      state to offload; x->xso is filled in on success.
 * @xuo:    user request (ifindex and XFRM_OFFLOAD_* flags).
 * @extack: netlink extended ack for error messages.
 *
 * Return: 0 on success, and also 0 in crypto (non-packet) mode when no
 * suitable device is found, the device lacks xdo_dev_state_add, or the
 * driver returns -EOPNOTSUPP; in those cases no offload is configured.
 * -EINVAL on invalid requests, and on the same lookup/capability failures
 * in packet offload mode. Otherwise the driver's error code.
 */
int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
                       struct xfrm_user_offload *xuo,
                       struct netlink_ext_ack *extack)
{
        int err;
        struct dst_entry *dst;
        struct net_device *dev;
        struct xfrm_dev_offload *xso = &x->xso;
        xfrm_address_t *saddr;
        xfrm_address_t *daddr;
        bool is_packet_offload;

        if (!x->type_offload) {
                NL_SET_ERR_MSG(extack, "Type doesn't support offload");
                return -EINVAL;
        }

        /* Reject any flag other than IPV6, INBOUND and PACKET. */
        if (xuo->flags &
            ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_PACKET)) {
                NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request");
                return -EINVAL;
        }

        /* The INBOUND flag must agree with the SA direction. */
        if ((xuo->flags & XFRM_OFFLOAD_INBOUND && x->dir == XFRM_SA_DIR_OUT) ||
            (!(xuo->flags & XFRM_OFFLOAD_INBOUND) && x->dir == XFRM_SA_DIR_IN)) {
                NL_SET_ERR_MSG(extack, "Mismatched SA and offload direction");
                return -EINVAL;
        }

        is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET;

        /* We don't yet support UDP encapsulation and TFC padding. */
        if ((!is_packet_offload && x->encap) || x->tfcpad) {
                NL_SET_ERR_MSG(extack, "Encapsulation and TFC padding can't be offloaded");
                return -EINVAL;
        }

        /* Look up the device by ifindex. If that fails, fall back to the
         * device of the route between the state's addresses.
         */
        dev = dev_get_by_index(net, xuo->ifindex);
        if (!dev) {
                /* For inbound requests the addresses are swapped. */
                if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) {
                        saddr = &x->props.saddr;
                        daddr = &x->id.daddr;
                } else {
                        saddr = &x->id.daddr;
                        daddr = &x->props.saddr;
                }

                dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr,
                                        x->props.family,
                                        xfrm_smark_get(0, x));
                if (IS_ERR(dst))
                        return (is_packet_offload) ? -EINVAL : 0;

                dev = dst->dev;

                /* Take a device reference before dropping the dst. */
                dev_hold(dev);
                dst_release(dst);
        }

        /* From here on a reference on dev is held. */
        if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) {
                xso->dev = NULL;
                dev_put(dev);
                return (is_packet_offload) ? -EINVAL : 0;
        }

        /* Crypto offload of an ESN state needs xdo_dev_state_advance_esn. */
        if (!is_packet_offload && x->props.flags & XFRM_STATE_ESN &&
            !dev->xfrmdev_ops->xdo_dev_state_advance_esn) {
                NL_SET_ERR_MSG(extack, "Device doesn't support offload with ESN");
                xso->dev = NULL;
                dev_put(dev);
                return -EINVAL;
        }

        /* Record the device and register the held reference with the
         * tracker.
         */
        xso->dev = dev;
        netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC);
        xso->real_dev = dev;

        if (xuo->flags & XFRM_OFFLOAD_INBOUND)
                xso->dir = XFRM_DEV_OFFLOAD_IN;
        else
                xso->dir = XFRM_DEV_OFFLOAD_OUT;

        if (is_packet_offload)
                xso->type = XFRM_DEV_OFFLOAD_PACKET;
        else
                xso->type = XFRM_DEV_OFFLOAD_CRYPTO;

        err = dev->xfrmdev_ops->xdo_dev_state_add(x, extack);
        if (err) {
                /* Driver refused: undo the offload setup and drop the
                 * device reference.
                 */
                xso->dev = NULL;
                xso->dir = 0;
                xso->real_dev = NULL;
                netdev_put(dev, &xso->dev_tracker);
                xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;

                /* User explicitly requested packet offload mode and configured
                 * policy in addition to the XFRM state. So be civil to users,
                 * and return an error instead of taking fallback path.
                 *
                 * This WARN_ON() can be seen as documentation for driver
                 * authors: do not return -EOPNOTSUPP in packet offload mode.
                 */
                WARN_ON(err == -EOPNOTSUPP && is_packet_offload);
                if (err != -EOPNOTSUPP || is_packet_offload) {
                        NL_SET_ERR_MSG_WEAK(extack, "Device failed to offload this state");
                        return err;
                }
                /* -EOPNOTSUPP in crypto mode falls through and returns 0. */
        }

        return 0;
}
EXPORT_SYMBOL_GPL(xfrm_dev_state_add);

/*
 * xfrm_dev_policy_add - offload an xfrm policy to a network device.
 * @net:    namespace used for the device lookup.
 * @xp:     policy to offload; xp->xdo is filled in on success.
 * @xuo:    user request; flags must be exactly XFRM_OFFLOAD_PACKET.
 * @dir:    policy direction (XFRM_POLICY_IN/OUT/FWD).
 * @extack: netlink extended ack for error messages.
 *
 * Return: 0 on success, -EINVAL for bad flags, unknown device, missing
 * driver support or unknown direction, otherwise the driver's error code.
 */
int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
                        struct xfrm_user_offload *xuo, u8 dir,
                        struct netlink_ext_ack *extack)
{
        struct xfrm_dev_offload *xdo = &xp->xdo;
        struct net_device *dev;
        int err;

        if (!xuo->flags || xuo->flags & ~XFRM_OFFLOAD_PACKET) {
                /* We support only packet offload mode and it means
                 * that user must set XFRM_OFFLOAD_PACKET bit.
                 */
                NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request");
                return -EINVAL;
        }

        /* No extack message is set when the ifindex lookup fails. */
        dev = dev_get_by_index(net, xuo->ifindex);
        if (!dev)
                return -EINVAL;

        if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_policy_add) {
                xdo->dev = NULL;
                dev_put(dev);
                NL_SET_ERR_MSG(extack, "Policy offload is not supported");
                return -EINVAL;
        }

        /* Record the device and register the held reference with the
         * tracker.
         */
        xdo->dev = dev;
        netdev_tracker_alloc(dev, &xdo->dev_tracker, GFP_ATOMIC);
        xdo->real_dev = dev;
        xdo->type = XFRM_DEV_OFFLOAD_PACKET;
        /* Map the policy direction to the offload direction. */
        switch (dir) {
        case XFRM_POLICY_IN:
                xdo->dir = XFRM_DEV_OFFLOAD_IN;
                break;
        case XFRM_POLICY_OUT:
                xdo->dir = XFRM_DEV_OFFLOAD_OUT;
                break;
        case XFRM_POLICY_FWD:
                xdo->dir = XFRM_DEV_OFFLOAD_FWD;
                break;
        default:
                /* Only xdo->dev is cleared here; real_dev and type keep
                 * the values assigned above.
                 */
                xdo->dev = NULL;
                netdev_put(dev, &xdo->dev_tracker);
                NL_SET_ERR_MSG(extack, "Unrecognized offload direction");
                return -EINVAL;
        }

        err = dev->xfrmdev_ops->xdo_dev_policy_add(xp, extack);
        if (err) {
                /* Driver refused: reset all offload fields and drop the
                 * device reference.
                 */
                xdo->dev = NULL;
                xdo->real_dev = NULL;
                xdo->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
                xdo->dir = 0;
                netdev_put(dev, &xdo->dev_tracker);
                NL_SET_ERR_MSG_WEAK(extack, "Device failed to offload this policy");
                return err;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(xfrm_dev_policy_add);

/*
 * xfrm_dev_offload_ok - decide whether @skb may use device offload for @x.
 * @skb: packet about to be transformed; its dst is treated as an xfrm_dst.
 * @x:   state to check.
 *
 * Return: false when the state has no type_offload, when an unspecified
 * offload type is combined with encapsulation, when a non-packet-offload
 * state has an offload device that differs from the path device or has
 * another xfrm stacked below it, or when the packet does not fit the
 * state MTU and is not a GSO packet that validates against it. Otherwise
 * the driver's xdo_dev_offload_ok() verdict if the device provides one,
 * else true.
 */
bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
{
        struct dst_entry *dst = skb_dst(skb);
        struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
        struct net_device *dev = x->xso.dev;
        int mtu;

        if (!x->type_offload)
                return false;
        if (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED && x->encap)
                return false;

        /* Packet offload skips the path-device and stacking checks. */
        if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) {
                if (dev && dev != xfrm_dst_path(dst)->dev)
                        return false;
                if (xdst->child->xfrm)
                        return false;
        }

        /* The packet must fit the MTU, or be GSO with segments that do. */
        mtu = xfrm_state_mtu(x, xdst->child_mtu_cached);
        if (skb->len > mtu &&
            !(skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)))
                return false;

        /* No driver hook available: accept. */
        if (!dev || !dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_offload_ok)
                return true;

        return x->xso.dev->xfrmdev_ops->xdo_dev_offload_ok(skb, x);
}
EXPORT_SYMBOL_GPL(xfrm_dev_offload_ok);

/*
 * xfrm_dev_resume - transmit @skb directly on its device, or queue it.
 * @skb: packet to send on skb->dev.
 *
 * Picks a TX queue and, unless that queue is frozen or stopped, hands the
 * skb to dev_hard_start_xmit() under the HARD_TX lock. If the transmit did
 * not complete (ret starts as NETDEV_TX_BUSY), the skb is appended to this
 * CPU's xfrm_backlog and NET_TX_SOFTIRQ is raised.
 */
void xfrm_dev_resume(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        int ret = NETDEV_TX_BUSY;
        struct netdev_queue *txq;
        struct softnet_data *sd;
        unsigned long flags;

        rcu_read_lock();
        txq = netdev_core_pick_tx(dev, skb, NULL);

        HARD_TX_LOCK(dev, txq, smp_processor_id());
        if (!netif_xmit_frozen_or_stopped(txq))
                skb = dev_hard_start_xmit(skb, dev, txq, &ret);
        HARD_TX_UNLOCK(dev, txq);

        if (!dev_xmit_complete(ret)) {
                /* Queue on the per-CPU backlog with interrupts disabled. */
                local_irq_save(flags);
                sd = this_cpu_ptr(&softnet_data);
                skb_queue_tail(&sd->xfrm_backlog, skb);
                raise_softirq_irqoff(NET_TX_SOFTIRQ);
                local_irq_restore(flags);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(xfrm_dev_resume);

void xfrm_dev_backlog(struct softnet_data *sd)
{
        struct sk_buff_head *xfrm_backlog = &sd->xfrm_backlog;
        struct sk_buff_head list;
        struct sk_buff *skb;

        if (skb_queue_empty(xfrm_backlog))
                return;

        __skb_queue_head_init(&list);

        spin_lock(&xfrm_backlog->lock);
        skb_queue_splice_init(xfrm_backlog, &list);
        spin_unlock(&xfrm_backlog->lock);

        while (!skb_queue_empty(&list)) {
                skb = __skb_dequeue(&list);
                xfrm_dev_resume(skb);
        }

}
#endif

/*
 * Check that a device's ESP feature flags are consistent.
 *
 * With CONFIG_XFRM_OFFLOAD: NETIF_F_HW_ESP_TX_CSUM requires NETIF_F_HW_ESP,
 * and NETIF_F_HW_ESP requires xfrmdev_ops with both xdo_dev_state_add and
 * xdo_dev_state_delete. Without it: neither flag may be set.
 *
 * Returns NOTIFY_BAD on an inconsistency, NOTIFY_DONE otherwise.
 */
static int xfrm_api_check(struct net_device *dev)
{
#ifdef CONFIG_XFRM_OFFLOAD
        if (!(dev->features & NETIF_F_HW_ESP)) {
                /* ESP checksum offload makes no sense without ESP offload. */
                if (dev->features & NETIF_F_HW_ESP_TX_CSUM)
                        return NOTIFY_BAD;
                return NOTIFY_DONE;
        }

        if (!dev->xfrmdev_ops ||
            !dev->xfrmdev_ops->xdo_dev_state_add ||
            !dev->xfrmdev_ops->xdo_dev_state_delete)
                return NOTIFY_BAD;
#else
        if (dev->features & (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM))
                return NOTIFY_BAD;
#endif

        return NOTIFY_DONE;
}

/*
 * Flush offloaded states and policies of a device that advertises
 * NETIF_F_HW_ESP. Always returns NOTIFY_DONE.
 */
static int xfrm_dev_down(struct net_device *dev)
{
        struct net *net;

        if (!(dev->features & NETIF_F_HW_ESP))
                return NOTIFY_DONE;

        net = dev_net(dev);
        xfrm_dev_state_flush(net, dev, true);
        xfrm_dev_policy_flush(net, dev, true);

        return NOTIFY_DONE;
}

/*
 * Netdevice notifier callback.
 *
 * Registration and feature changes trigger a feature consistency check.
 * Down and unregister events flush the device's offloaded objects. All
 * other events return NOTIFY_DONE.
 */
static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct net_device *netdev = netdev_notifier_info_to_dev(ptr);

        switch (event) {
        case NETDEV_REGISTER:
        case NETDEV_FEAT_CHANGE:
                return xfrm_api_check(netdev);

        case NETDEV_DOWN:
        case NETDEV_UNREGISTER:
                return xfrm_dev_down(netdev);

        default:
                return NOTIFY_DONE;
        }
}

/* Notifier block that routes netdevice events to xfrm_dev_event(). */
static struct notifier_block xfrm_dev_notifier = {
        .notifier_call        = xfrm_dev_event,
};

/*
 * Register xfrm_dev_notifier as a netdevice notifier. The return value of
 * register_netdevice_notifier() is not checked.
 */
void __init xfrm_dev_init(void)
{
        register_netdevice_notifier(&xfrm_dev_notifier);
}





























































































































































































































































































































































































    1 




    1 









    1 




    1 





























    1 

    1 







































































































































































































































































































































































































































    1 








    1 
    1 



    1 
    1 



    1 












    1 
    1 











    1 
    1 







    1 










    1 











































































































































































































































    1 























    1 

















    1 
    1 
    1 


































    1 


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
/* SPDX-License-Identifier: GPL-2.0 */
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#ifndef __MPTCP_PROTOCOL_H
#define __MPTCP_PROTOCOL_H

#include <linux/random.h>
#include <net/tcp.h>
#include <net/inet_connection_sock.h>
#include <uapi/linux/mptcp.h>
#include <net/genetlink.h>
#include <net/rstreason.h>

/* Supported MPTCP protocol version number. */
#define MPTCP_SUPPORTED_VERSION        1

/* MPTCP option bits: one distinct bit per option kind, so several can be
 * OR-ed into a single mask (see OPTIONS_MPTCP_MPC / OPTIONS_MPTCP_MPJ).
 */
#define OPTION_MPTCP_MPC_SYN        BIT(0)
#define OPTION_MPTCP_MPC_SYNACK        BIT(1)
#define OPTION_MPTCP_MPC_ACK        BIT(2)
#define OPTION_MPTCP_MPJ_SYN        BIT(3)
#define OPTION_MPTCP_MPJ_SYNACK        BIT(4)
#define OPTION_MPTCP_MPJ_ACK        BIT(5)
#define OPTION_MPTCP_ADD_ADDR        BIT(6)
#define OPTION_MPTCP_RM_ADDR        BIT(7)
#define OPTION_MPTCP_FASTCLOSE        BIT(8)
#define OPTION_MPTCP_PRIO        BIT(9)
#define OPTION_MPTCP_RST        BIT(10)
#define OPTION_MPTCP_DSS        BIT(11)
#define OPTION_MPTCP_FAIL        BIT(12)

#define OPTION_MPTCP_CSUMREQD        BIT(13)

/* Masks covering all three MP_CAPABLE bits and all three MP_JOIN bits. */
#define OPTIONS_MPTCP_MPC        (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | \
                                 OPTION_MPTCP_MPC_ACK)
#define OPTIONS_MPTCP_MPJ        (OPTION_MPTCP_MPJ_SYN | OPTION_MPTCP_MPJ_SYNACK | \
                                 OPTION_MPTCP_MPJ_ACK)

/* MPTCP option subtypes: consecutive numeric identifiers, 0 through 8 */
#define MPTCPOPT_MP_CAPABLE        0
#define MPTCPOPT_MP_JOIN        1
#define MPTCPOPT_DSS                2
#define MPTCPOPT_ADD_ADDR        3
#define MPTCPOPT_RM_ADDR        4
#define MPTCPOPT_MP_PRIO        5
#define MPTCPOPT_MP_FAIL        6
#define MPTCPOPT_MP_FASTCLOSE        7
#define MPTCPOPT_RST                8

/* MPTCP suboption lengths (presumably in bytes - confirm against the
 * option parsing/writing code)
 */
#define TCPOLEN_MPTCP_MPC_SYN                4
#define TCPOLEN_MPTCP_MPC_SYNACK        12
#define TCPOLEN_MPTCP_MPC_ACK                20
#define TCPOLEN_MPTCP_MPC_ACK_DATA        22
#define TCPOLEN_MPTCP_MPJ_SYN                12
#define TCPOLEN_MPTCP_MPJ_SYNACK        16
#define TCPOLEN_MPTCP_MPJ_ACK                24
#define TCPOLEN_MPTCP_DSS_BASE                4
#define TCPOLEN_MPTCP_DSS_ACK32                4
#define TCPOLEN_MPTCP_DSS_ACK64                8
#define TCPOLEN_MPTCP_DSS_MAP32                10
#define TCPOLEN_MPTCP_DSS_MAP64                14
#define TCPOLEN_MPTCP_DSS_CHECKSUM        2
#define TCPOLEN_MPTCP_ADD_ADDR                16
#define TCPOLEN_MPTCP_ADD_ADDR_PORT        18
#define TCPOLEN_MPTCP_ADD_ADDR_BASE        8
#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT        10
#define TCPOLEN_MPTCP_ADD_ADDR6                28
#define TCPOLEN_MPTCP_ADD_ADDR6_PORT        30
#define TCPOLEN_MPTCP_ADD_ADDR6_BASE        20
#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT        22
#define TCPOLEN_MPTCP_PORT_LEN                2
#define TCPOLEN_MPTCP_PORT_ALIGN        2
#define TCPOLEN_MPTCP_RM_ADDR_BASE        3
#define TCPOLEN_MPTCP_PRIO                3
#define TCPOLEN_MPTCP_PRIO_ALIGN        4
#define TCPOLEN_MPTCP_FASTCLOSE                12
#define TCPOLEN_MPTCP_RST                4
#define TCPOLEN_MPTCP_FAIL                12

/* MPC ACK with data plus the DSS checksum: 22 + 2 = 24. */
#define TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM        (TCPOLEN_MPTCP_DSS_CHECKSUM + TCPOLEN_MPTCP_MPC_ACK_DATA)

/* MPTCP MP_JOIN flags */
#define MPTCPOPT_BACKUP                BIT(0)
#define MPTCPOPT_THMAC_LEN        8

/* MPTCP MP_CAPABLE flags.
 * MPTCP_VERSION_MASK selects the low four bits.
 * NOTE(review): MPTCP_CAP_FLAG_MASK (0x1F) selects bits 0-4, while
 * CHECKSUM_REQD (bit 7), EXTENSIBILITY (bit 6) and DENY_JOIN_ID0 (bit 5)
 * lie outside it - confirm how the option parser applies this mask.
 */
#define MPTCP_VERSION_MASK        (0x0F)
#define MPTCP_CAP_CHECKSUM_REQD        BIT(7)
#define MPTCP_CAP_EXTENSIBILITY        BIT(6)
#define MPTCP_CAP_DENY_JOIN_ID0        BIT(5)
#define MPTCP_CAP_HMAC_SHA256        BIT(0)
#define MPTCP_CAP_FLAG_MASK        (0x1F)

/* MPTCP DSS flags; MPTCP_DSS_FLAG_MASK (0x1F) is the OR of the five flag
 * bits below (bits 0-4).
 */
#define MPTCP_DSS_DATA_FIN        BIT(4)
#define MPTCP_DSS_DSN64                BIT(3)
#define MPTCP_DSS_HAS_MAP        BIT(2)
#define MPTCP_DSS_ACK64                BIT(1)
#define MPTCP_DSS_HAS_ACK        BIT(0)
#define MPTCP_DSS_FLAG_MASK        (0x1F)

/* MPTCP ADD_ADDR flags */
#define MPTCP_ADDR_ECHO                BIT(0)

/* MPTCP MP_PRIO flags */
#define MPTCP_PRIO_BKUP                BIT(0)

/* MPTCP TCPRST flags */
#define MPTCP_RST_TRANSIENT        BIT(0)

/* MPTCP socket atomic flags.
 * These are plain numbers, not BIT() masks.
 * NOTE(review): presumably bit numbers for mptcp_sock.flags, used with the
 * atomic bitops - confirm against the users.
 */
#define MPTCP_WORK_RTX                1
#define MPTCP_FALLBACK_DONE        2
#define MPTCP_WORK_CLOSE_SUBFLOW 3

/* MPTCP socket release cb flags.
 * NOTE(review): presumably bit numbers for mptcp_sock.cb_flags - confirm.
 */
#define MPTCP_PUSH_PENDING        1
#define MPTCP_CLEAN_UNA                2
#define MPTCP_ERROR_REPORT        3
#define MPTCP_RETRANSMIT        4
#define MPTCP_FLUSH_JOIN_LIST        5
#define MPTCP_SYNC_STATE        6
#define MPTCP_SYNC_SNDBUF        7

/* MPTCP per-skb private data; it lives at the start of the skb control
 * buffer and is accessed via MPTCP_SKB_CB().
 * NOTE(review): field meanings below are inferred from their names - confirm.
 */
struct mptcp_skb_cb {
        u64 map_seq;                /* presumably the MPTCP-level sequence of the first byte */
        u64 end_seq;                /* presumably the MPTCP-level sequence following the last byte */
        u32 offset;
        u8  has_rxtstamp:1;
};

/* Reinterpret the beginning of @__skb's cb[] array as a struct mptcp_skb_cb */
#define MPTCP_SKB_CB(__skb)        ((struct mptcp_skb_cb *)&((__skb)->cb[0]))

/* Wrap-safe comparison of 64-bit sequence numbers: @seq1 is "before" @seq2
 * when their unsigned difference, reinterpreted as signed, is negative.
 */
static inline bool before64(__u64 seq1, __u64 seq2)
{
        __s64 delta = (__s64)(seq1 - seq2);

        return delta < 0;
}

/* after64(a, b) is simply before64() with the operands swapped */
#define after64(seq2, seq1)        before64(seq1, seq2)

/* MPTCP option data extracted from an incoming packet; mptcp_get_options()
 * takes a pointer to this structure together with the skb.
 * NOTE(review): per-field meanings are inferred from names - confirm against
 * the option parser.
 */
struct mptcp_options_received {
        u64        sndr_key;
        u64        rcvr_key;
        u64        data_ack;
        u64        data_seq;
        u32        subflow_seq;
        u16        data_len;
        __sum16        csum;
        u16        suboptions;
        u32        token;
        u32        nonce;
        /* the bitfields below add up to exactly 16 bits, __unused included */
        u16        use_map:1,
                dsn64:1,
                data_fin:1,
                use_ack:1,
                ack64:1,
                mpc_map:1,
                reset_reason:4,
                reset_transient:1,
                echo:1,
                backup:1,
                deny_join_id0:1,
                __unused:2;
        u8        join_id;
        u64        thmac;
        u8        hmac[MPTCPOPT_HMAC_LEN];
        struct mptcp_addr_info addr;
        struct mptcp_rm_list rm_list;
        u64        ahmac;
        u64        fail_seq;
};

/* Build the first 32-bit word of an MPTCP TCP option, in network byte order.
 *
 * Layout, from the most significant byte: TCPOPT_MPTCP (shifted by 24),
 * @len (shifted by 16), @subopt (shifted by 12, not masked), the low nibble
 * of @nib (shifted by 8) and @field in the lowest byte.
 */
static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
{
        u32 word = TCPOPT_MPTCP << 24;

        word |= len << 16;
        word |= subopt << 12;
        word |= (nib & 0xF) << 8;
        word |= field;

        return htonl(word);
}

/* Path manager status values.
 * The enumerators are used as bit positions: MPTCP_PM_WORK_MASK is built by
 * shifting 1 by MPTCP_PM_ALREADY_ESTABLISHED, so the order of the entries
 * matters.
 * NOTE(review): presumably these index bits of mptcp_pm_data.status - confirm.
 */
enum mptcp_pm_status {
        MPTCP_PM_ADD_ADDR_RECEIVED,
        MPTCP_PM_ADD_ADDR_SEND_ACK,
        MPTCP_PM_RM_ADDR_RECEIVED,
        MPTCP_PM_ESTABLISHED,
        MPTCP_PM_SUBFLOW_ESTABLISHED,
        MPTCP_PM_ALREADY_ESTABLISHED,        /* persistent status, set after ESTABLISHED event */
        MPTCP_PM_MPC_ENDPOINT_ACCOUNTED /* persistent status, set after MPC local address is
                                         * accounted into id_avail_bitmap
                                         */
};

/* Available path manager types. */
enum mptcp_pm_type {
        MPTCP_PM_TYPE_KERNEL = 0,
        MPTCP_PM_TYPE_USERSPACE,

        /* bookkeeping entries: number of types and highest valid type value */
        __MPTCP_PM_TYPE_NR,
        __MPTCP_PM_TYPE_MAX = __MPTCP_PM_TYPE_NR - 1,
};

/* Status bits below MPTCP_PM_ALREADY_ESTABLISHED need pm worker actions.
 * MPTCP_PM_ALREADY_ESTABLISHED is the sixth enumerator (value 5), so the
 * mask evaluates to 0x1f, i.e. bits 0-4.
 */
#define MPTCP_PM_WORK_MASK ((1 << MPTCP_PM_ALREADY_ESTABLISHED) - 1)

/* Address signalling states.
 * NOTE(review): presumably bit numbers within mptcp_pm_data.addr_signal -
 * confirm against the path manager code.
 */
enum mptcp_addr_signal_status {
        MPTCP_ADD_ADDR_SIGNAL,
        MPTCP_ADD_ADDR_ECHO,
        MPTCP_RM_ADDR_SIGNAL,
};

/* max value of mptcp_addr_info.id */
#define MPTCP_PM_MAX_ADDR_ID                U8_MAX

/* Path manager (PM) state; struct mptcp_sock embeds one instance as its
 * 'pm' member.
 */
struct mptcp_pm_data {
        struct mptcp_addr_info local;
        struct mptcp_addr_info remote;
        struct list_head anno_list;
        struct list_head userspace_pm_local_addr_list;

        spinlock_t        lock;                /*protects the whole PM data */

        u8                addr_signal;
        bool                server_side;
        bool                work_pending;
        bool                accept_addr;
        bool                accept_subflow;
        bool                remote_deny_join_id0;
        u8                add_addr_signaled;
        u8                add_addr_accepted;
        u8                local_addr_used;
        u8                pm_type;
        u8                subflows;
        u8                status;
        /* one bit for each possible address id, 0..MPTCP_PM_MAX_ADDR_ID */
        DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
        struct mptcp_rm_list rm_list_tx;
        struct mptcp_rm_list rm_list_rx;
};

/* A path manager address entry: an address plus flags, an interface index
 * and a socket pointer, linkable into a list via 'list'.
 * NOTE(review): the role of 'lsk' (presumably a listening socket bound to
 * the address) is not visible here - confirm.
 */
struct mptcp_pm_addr_entry {
        struct list_head        list;
        struct mptcp_addr_info        addr;
        u8                        flags;
        int                        ifindex;
        struct socket                *lsk;
};

/* A fragment of MPTCP-level data to be transmitted. The helpers in this
 * file (mptcp_send_next(), mptcp_pending_tail(), mptcp_rtx_head()) walk
 * these entries on mptcp_sock.rtx_queue through the 'list' member.
 * NOTE(review): the exact meaning of offset/overhead/already_sent is
 * inferred from their names - confirm against the xmit path.
 */
struct mptcp_data_frag {
        struct list_head list;
        u64 data_seq;
        u16 data_len;
        u16 offset;
        u16 overhead;
        u16 already_sent;
        struct page *page;
};

/* MPTCP connection sock.
 * Because the inet_connection_sock is the first member, mptcp_sk() can
 * convert a struct sock pointer into the enclosing mptcp_sock and
 * msk_owned_by_me() can cast the other way.
 */
struct mptcp_sock {
        /* inet_connection_sock must be the first member */
        struct inet_connection_sock sk;
        u64                local_key;                /* protected by the first subflow socket lock
                                                 * lockless access read
                                                 */
        u64                remote_key;                /* same as above */
        u64                write_seq;
        u64                bytes_sent;
        u64                snd_nxt;
        u64                bytes_received;
        u64                ack_seq;
        atomic64_t        rcv_wnd_sent;
        u64                rcv_data_fin_seq;
        u64                bytes_retrans;
        u64                bytes_consumed;
        int                rmem_fwd_alloc;
        int                snd_burst;
        int                old_wspace;
        u64                recovery_snd_nxt;        /* in recovery mode accept up to this seq;
                                                 * recovery related fields are under data_lock
                                                 * protection
                                                 */
        u64                bytes_acked;
        u64                snd_una;
        u64                wnd_end;
        u32                last_data_sent;
        u32                last_data_recv;
        u32                last_ack_recv;
        unsigned long        timer_ival;
        u32                token;
        int                rmem_released;        /* subtracted from sk_rmem_alloc by __mptcp_rmem() */
        unsigned long        flags;
        unsigned long        cb_flags;
        bool                recovery;                /* closing subflow write queue reinjected */
        bool                can_ack;
        bool                fully_established;
        bool                rcv_data_fin;
        bool                snd_data_fin_enable;
        bool                rcv_fastclose;
        bool                use_64bit_ack; /* Set when we received a 64-bit DSN */
        bool                csum_enabled;
        bool                allow_infinite_fallback;
        u8                pending_state; /* A subflow asked to set this sk_state,
                                        * protected by the msk data lock
                                        */
        u8                mpc_endpoint_id;
        /* seven single-bit flags packed into one u8 */
        u8                recvmsg_inq:1,
                        cork:1,
                        nodelay:1,
                        fastopening:1,
                        in_accept_queue:1,
                        free_first:1,
                        rcvspace_init:1;
        u32                notsent_lowat;        /* 0 makes mptcp_notsent_lowat() use the netns sysctl */
        int                keepalive_cnt;
        int                keepalive_idle;
        int                keepalive_intvl;
        struct work_struct work;
        struct sk_buff  *ooo_last_skb;
        struct rb_root  out_of_order_queue;
        struct sk_buff_head receive_queue;
        struct list_head conn_list;        /* walked by mptcp_for_each_subflow() */
        struct list_head rtx_queue;        /* list of struct mptcp_data_frag */
        struct mptcp_data_frag *first_pending;
        struct list_head join_list;
        struct sock        *first; /* The mptcp ops can safely dereference, using suitable
                                 * ONCE annotation, the subflow outside the socket
                                 * lock as such sock is freed after close().
                                 */
        struct mptcp_pm_data        pm;
        struct mptcp_sched_ops        *sched;
        struct {
                u32        space;        /* bytes copied in last measurement window */
                u32        copied; /* bytes copied in this measurement window */
                u64        time;        /* start time of measurement window */
                u64        rtt_us; /* last maximum rtt of subflows */
        } rcvq_space;
        u8                scaling_ratio;        /* fed to __tcp_win_from_space()/__tcp_space_from_win() */

        u32                subflow_id;
        u32                setsockopt_seq;
        char                ca_name[TCP_CA_NAME_MAX];
};

/* The msk "data lock" is the spinlock embedded in the socket lock
 * (sk_lock.slock), taken and released with the _bh spinlock variants.
 */
#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
#define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock)

/* Iterate over the subflow contexts linked into @__msk->conn_list through
 * their 'node' member; the _safe variant takes an extra cursor (@__tmp) and
 * maps to list_for_each_entry_safe().
 */
#define mptcp_for_each_subflow(__msk, __subflow)                        \
        list_for_each_entry(__subflow, &((__msk)->conn_list), node)
#define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp)                        \
        list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node)

extern struct genl_family mptcp_genl_family;

/* Run sock_owned_by_me() on the struct sock underlying @msk. */
static inline void msk_owned_by_me(const struct mptcp_sock *msk)
{
        const struct sock *sk = (const struct sock *)msk;

        sock_owned_by_me(sk);
}

/* struct sock -> protocol socket conversions.
 * With CONFIG_DEBUG_NET both tcp_sk() and mptcp_sk() additionally WARN when
 * the socket's sk_protocol does not match the requested type; otherwise
 * mptcp_sk() is a plain container_of_const() and tcp_sk() is left untouched.
 */
#ifdef CONFIG_DEBUG_NET
/* MPTCP-specific: we might (indirectly) call this helper with the wrong sk */
#undef tcp_sk
#define tcp_sk(ptr) ({                                                                \
        typeof(ptr) _ptr = (ptr);                                                \
        WARN_ON(_ptr->sk_protocol != IPPROTO_TCP);                                \
        container_of_const(_ptr, struct tcp_sock, inet_conn.icsk_inet.sk);        \
})
#define mptcp_sk(ptr) ({                                                \
        typeof(ptr) _ptr = (ptr);                                        \
        WARN_ON(_ptr->sk_protocol != IPPROTO_MPTCP);                        \
        container_of_const(_ptr, struct mptcp_sock, sk.icsk_inet.sk);        \
})

#else /* !CONFIG_DEBUG_NET */
#define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk)
#endif

/* The msk socket doesn't use the backlog; also account for the bulk
 * freed memory: the result is sk_rmem_alloc minus the msk rmem_released.
 */
static inline int __mptcp_rmem(const struct sock *sk)
{
        int allocated = atomic_read(&sk->sk_rmem_alloc);
        int released = READ_ONCE(mptcp_sk(sk)->rmem_released);

        return allocated - released;
}

static inline int mptcp_win_from_space(const struct sock *sk, int space)
{
        return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space);
}

static inline int mptcp_space_from_win(const struct sock *sk, int win)
{
        return __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, win);
}

/* Window equivalent of the receive buffer space still unused: sk_rcvbuf
 * minus the memory reported by __mptcp_rmem(), converted with
 * mptcp_win_from_space().
 */
static inline int __mptcp_space(const struct sock *sk)
{
        int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
        int in_use = __mptcp_rmem(sk);

        return mptcp_win_from_space(sk, rcvbuf - in_use);
}

static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
{
        const struct mptcp_sock *msk = mptcp_sk(sk);

        return READ_ONCE(msk->first_pending);
}

/* Return the fragment following first_pending on the rtx_queue, or NULL when
 * first_pending is the last entry. first_pending is dereferenced without a
 * NULL check, so it must be set when this is called.
 */
static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct mptcp_data_frag *cur = msk->first_pending;

        if (list_is_last(&cur->list, &msk->rtx_queue))
                return NULL;

        return list_next_entry(cur, list);
}

/* Return the last fragment on the rtx_queue, or NULL when nothing is pending.
 * Having first_pending set while the queue is empty is unexpected: it
 * triggers a one-time warning and NULL is returned.
 */
static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk)
{
        const struct mptcp_sock *msk = mptcp_sk(sk);

        if (!msk->first_pending ||
            WARN_ON_ONCE(list_empty(&msk->rtx_queue)))
                return NULL;

        return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
}

/* Return the first fragment on the rtx_queue, or NULL when either
 * snd_una == snd_nxt or the queue is empty.
 */
static inline struct mptcp_data_frag *mptcp_rtx_head(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        if (msk->snd_una != msk->snd_nxt)
                return list_first_entry_or_null(&msk->rtx_queue,
                                                struct mptcp_data_frag, list);

        return NULL;
}

/* Pseudo header for MPTCP checksum computation; the sequence and length
 * fields are stored big-endian (__be64/__be32/__be16).
 * NOTE(review): presumably filled in by __mptcp_make_csum() - confirm.
 */
struct csum_pseudo_header {
        __be64 data_seq;
        __be32 subflow_seq;
        __be16 data_len;
        __sum16 csum;
};

/* MPTCP request socket: a tcp_request_sock extended with MPTCP handshake
 * state. The tcp_request_sock is kept as the first member;
 * mptcp_subflow_rsk() converts a request_sock pointer to this type with a
 * plain cast.
 */
struct mptcp_subflow_request_sock {
        struct        tcp_request_sock sk;
        /* five single-bit flags sharing one u16 */
        u16        mp_capable : 1,
                mp_join : 1,
                backup : 1,
                csum_reqd : 1,
                allow_join_id0 : 1;
        u8        local_id;
        u8        remote_id;
        u64        local_key;
        u64        idsn;
        u32        token;
        u32        ssn_offset;
        u64        thmac;
        u32        local_nonce;
        u32        remote_nonce;
        struct mptcp_sock        *msk;
        struct hlist_nulls_node token_node;        /* pprev cleared by mptcp_token_init_request() */
};

/* Convert a generic request_sock pointer to the MPTCP subflow request sock
 * by casting; constness is dropped by the cast.
 */
static inline struct mptcp_subflow_request_sock *
mptcp_subflow_rsk(const struct request_sock *rsk)
{
        struct mptcp_subflow_request_sock *subflow_req;

        subflow_req = (struct mptcp_subflow_request_sock *)rsk;
        return subflow_req;
}

/* Per-CPU container for delegated subflow actions: 'head' lists the subflow
 * contexts queued by mptcp_subflow_delegate() (linked via their
 * delegated_node), and 'napi' is the instance scheduled to process them.
 */
struct mptcp_delegated_action {
        struct napi_struct napi;
        struct list_head head;
};

DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);

/* Bit numbers for mptcp_subflow_context.delegated_status:
 * mptcp_subflow_delegate() sets BIT(MPTCP_DELEGATE_SCHEDULED) together with
 * the bit of the requested action.
 */
#define MPTCP_DELEGATE_SCHEDULED        0
#define MPTCP_DELEGATE_SEND                1
#define MPTCP_DELEGATE_ACK                2
#define MPTCP_DELEGATE_SNDBUF                3

/* every status bit except the SCHEDULED marker */
#define MPTCP_DELEGATE_ACTIONS_MASK        (~BIT(MPTCP_DELEGATE_SCHEDULED))
/* MPTCP subflow context.
 * mptcp_subflow_ctx() retrieves it from the TCP socket's icsk_ulp_data.
 * The members wrapped in struct_group(reset, ...) are zeroed as a unit by
 * mptcp_subflow_ctx_reset(); members outside the group are not touched by
 * that memset.
 */
struct mptcp_subflow_context {
        struct        list_head node;/* conn_list of subflows */

        struct_group(reset,

        unsigned long avg_pacing_rate; /* protected by msk socket lock */
        u64        local_key;
        u64        remote_key;
        u64        idsn;
        u64        map_seq;
        u32        snd_isn;
        u32        token;
        u32        rel_write_seq;
        u32        map_subflow_seq;
        u32        ssn_offset;
        u32        map_data_len;
        __wsum        map_data_csum;
        u32        map_csum_len;
        /* 22 single-bit flags plus 10 unused bits: 32 bits in total */
        u32        request_mptcp : 1,  /* send MP_CAPABLE */
                request_join : 1,   /* send MP_JOIN */
                request_bkup : 1,
                mp_capable : 1,            /* remote is MPTCP capable */
                mp_join : 1,            /* remote is JOINing */
                fully_established : 1,            /* path validated */
                pm_notified : 1,    /* PM hook called for established status */
                conn_finished : 1,
                map_valid : 1,
                map_csum_reqd : 1,
                map_data_fin : 1,
                mpc_map : 1,
                backup : 1,
                send_mp_prio : 1,
                send_mp_fail : 1,
                send_fastclose : 1,
                send_infinite_map : 1,
                remote_key_valid : 1,        /* received the peer key from */
                disposable : 1,            /* ctx can be free at ulp release time */
                stale : 1,            /* unable to snd/rcv data, do not use for xmit */
                valid_csum_seen : 1,        /* at least one csum validated */
                is_mptfo : 1,            /* subflow is doing TFO */
                __unused : 10;
        bool        data_avail;
        bool        scheduled;
        u32        remote_nonce;
        u64        thmac;
        u32        local_nonce;
        u32        remote_token;
        union {
                u8        hmac[MPTCPOPT_HMAC_LEN]; /* MPJ subflow only */
                u64        iasn;            /* initial ack sequence number, MPC subflows only */
        };
        s16        local_id;            /* if negative not initialized yet */
        u8        remote_id;
        u8        reset_seen:1;
        u8        reset_transient:1;
        u8        reset_reason:4;
        u8        stale_count;

        u32        subflow_id;

        long        delegated_status;        /* MPTCP_DELEGATE_* bits */
        unsigned long        fail_tout;

        );

        struct        list_head delegated_node;   /* link into delegated_action, protected by local BH */

        u32        setsockopt_seq;
        u32        stale_rcv_tstamp;
        int     cached_sndbuf;            /* sndbuf size when last synced with the msk sndbuf,
                                     * protected by the msk socket lock
                                     */

        struct        sock *tcp_sock;            /* tcp sk backpointer */
        struct        sock *conn;            /* parent mptcp_sock */
        /* ops/callbacks restored on the socket by mptcp_subflow_tcp_fallback() */
        const        struct inet_connection_sock_af_ops *icsk_af_ops;
        void        (*tcp_state_change)(struct sock *sk);
        void        (*tcp_error_report)(struct sock *sk);

        struct        rcu_head rcu;
};

static inline struct mptcp_subflow_context *
mptcp_subflow_ctx(const struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);

        /* Use RCU on icsk_ulp_data only for sock diag code */
        return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data;
}

/* Return the TCP socket backpointer of @subflow. */
static inline struct sock *
mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow)
{
        struct sock *ssk = subflow->tcp_sock;

        return ssk;
}

/* Bring @subflow back to its initial state: zero the whole 'reset' member
 * group, then restore the two non-zero defaults.
 */
static inline void
mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow)
{
        memset(&subflow->reset, 0, sizeof(subflow->reset));

        /* by default request MP_CAPABLE */
        subflow->request_mptcp = 1;

        /* a negative local_id means "not initialized yet" */
        WRITE_ONCE(subflow->local_id, -1);
}

/* Convert reset reasons in MPTCP to enum sk_rst_reason type.
 * Each known MPTCP_RST_* code maps to its SK_RST_REASON_MPTCP_RST_*
 * counterpart; anything else yields SK_RST_REASON_ERROR.
 */
static inline enum sk_rst_reason
sk_rst_convert_mptcp_reason(u32 reason)
{
        enum sk_rst_reason converted;

        switch (reason) {
        case MPTCP_RST_EUNSPEC:
                converted = SK_RST_REASON_MPTCP_RST_EUNSPEC;
                break;
        case MPTCP_RST_EMPTCP:
                converted = SK_RST_REASON_MPTCP_RST_EMPTCP;
                break;
        case MPTCP_RST_ERESOURCE:
                converted = SK_RST_REASON_MPTCP_RST_ERESOURCE;
                break;
        case MPTCP_RST_EPROHIBIT:
                converted = SK_RST_REASON_MPTCP_RST_EPROHIBIT;
                break;
        case MPTCP_RST_EWQ2BIG:
                converted = SK_RST_REASON_MPTCP_RST_EWQ2BIG;
                break;
        case MPTCP_RST_EBADPERF:
                converted = SK_RST_REASON_MPTCP_RST_EBADPERF;
                break;
        case MPTCP_RST_EMIDDLEBOX:
                converted = SK_RST_REASON_MPTCP_RST_EMIDDLEBOX;
                break;
        default:
                /* It should not happen, or else errors may occur
                 * in MPTCP layer
                 */
                converted = SK_RST_REASON_ERROR;
                break;
        }

        return converted;
}

/* Send an active reset on subflow socket @sk, using the reset reason stored
 * in its subflow context (converted to enum sk_rst_reason) and GFP_ATOMIC.
 */
static inline void
mptcp_send_active_reset_reason(struct sock *sk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

        tcp_send_active_reset(sk, GFP_ATOMIC,
                              sk_rst_convert_mptcp_reason(subflow->reset_reason));
}

static inline u64
mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
{
        return tcp_sk(mptcp_subflow_tcp_sock(subflow))->copied_seq -
                      subflow->ssn_offset -
                      subflow->map_subflow_seq;
}

/* Data sequence number matching the subflow's current read position:
 * the mapping start (map_seq) plus the offset inside the mapping.
 */
static inline u64
mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
{
        u64 offset = mptcp_subflow_get_map_offset(subflow);

        return subflow->map_seq + offset;
}

void mptcp_subflow_process_delegated(struct sock *ssk, long actions);

/* Request the delegated @action (one of the MPTCP_DELEGATE_* bit numbers)
 * for @subflow.
 *
 * The action bit and the SCHEDULED bit are set in delegated_status. If
 * SCHEDULED was not already set, the subflow is appended to this CPU's
 * delegated list, a reference to its TCP socket is taken, and the NAPI
 * instance is scheduled when the list was previously empty.
 */
static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, int action)
{
        long prev, wanted = BIT(MPTCP_DELEGATE_SCHEDULED) | BIT(action);
        struct mptcp_delegated_action *delegated;
        bool was_idle;

        /* the caller held the subflow bh socket lock */
        lockdep_assert_in_softirq();

        /* The implied barrier pairs with tcp_release_cb_override()
         * mptcp_napi_poll(), and ensures the below list check sees list
         * updates done prior to delegated status bits changes
         */
        prev = set_mask_bits(&subflow->delegated_status, 0, wanted);
        if (prev & BIT(MPTCP_DELEGATE_SCHEDULED))
                return;

        /* not scheduled, yet still linked: unexpected, bail out */
        if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node)))
                return;

        delegated = this_cpu_ptr(&mptcp_delegated_actions);
        was_idle = list_empty(&delegated->head);
        list_add_tail(&subflow->delegated_node, &delegated->head);
        sock_hold(mptcp_subflow_tcp_sock(subflow));
        if (was_idle)
                napi_schedule(&delegated->napi);
}

/* Pop the first subflow context from @delegated's list: the entry is
 * unlinked (and its node re-initialized) before being returned. Returns
 * NULL when the list is empty.
 */
static inline struct mptcp_subflow_context *
mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated)
{
        struct mptcp_subflow_context *next = NULL;

        if (!list_empty(&delegated->head)) {
                next = list_first_entry(&delegated->head,
                                        struct mptcp_subflow_context,
                                        delegated_node);
                list_del_init(&next->delegated_node);
        }

        return next;
}

int mptcp_is_enabled(const struct net *net);
unsigned int mptcp_get_add_addr_timeout(const struct net *net);
int mptcp_is_checksum_enabled(const struct net *net);
int mptcp_allow_join_id0(const struct net *net);
unsigned int mptcp_stale_loss_cnt(const struct net *net);
unsigned int mptcp_close_timeout(const struct sock *sk);
int mptcp_get_pm_type(const struct net *net);
const char *mptcp_get_scheduler(const struct net *net);
void mptcp_get_available_schedulers(char *buf, size_t maxlen);
void __mptcp_subflow_fully_established(struct mptcp_sock *msk,
                                       struct mptcp_subflow_context *subflow,
                                       const struct mptcp_options_received *mp_opt);
bool __mptcp_retransmit_pending_data(struct sock *sk);
void mptcp_check_and_set_pending(struct sock *sk);
void __mptcp_push_pending(struct sock *sk, unsigned int flags);
bool mptcp_subflow_data_available(struct sock *sk);
void __init mptcp_subflow_init(void);
void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how);
void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
                     struct mptcp_subflow_context *subflow);
void __mptcp_subflow_send_ack(struct sock *ssk);
void mptcp_subflow_reset(struct sock *ssk);
void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk);
void mptcp_sock_graft(struct sock *sk, struct socket *parent);
struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk);
bool __mptcp_close(struct sock *sk, long timeout);
void mptcp_cancel_work(struct sock *sk);
void __mptcp_unaccepted_force_close(struct sock *sk);
void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk);
void mptcp_set_state(struct sock *sk, int state);

bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
                           const struct mptcp_addr_info *b, bool use_port);
void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr);

/* called with sk socket lock held */
int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
                            const struct mptcp_addr_info *remote);
int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
                                struct socket **new_sock);
void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
                         struct sockaddr_storage *addr,
                         unsigned short family);
struct mptcp_sched_ops *mptcp_sched_find(const char *name);
int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
void mptcp_sched_init(void);
int mptcp_init_sched(struct mptcp_sock *msk,
                     struct mptcp_sched_ops *sched);
void mptcp_release_sched(struct mptcp_sock *msk);
void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
                                 bool scheduled);
struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk);
struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk);
int mptcp_sched_get_send(struct mptcp_sock *msk);
int mptcp_sched_get_retrans(struct mptcp_sock *msk);

/* Bytes received but not yet consumed: bytes_received - bytes_consumed,
 * both read with READ_ONCE().
 */
static inline u64 mptcp_data_avail(const struct mptcp_sock *msk)
{
        u64 received = READ_ONCE(msk->bytes_received);
        u64 consumed = READ_ONCE(msk->bytes_consumed);

        return received - consumed;
}

static inline bool mptcp_epollin_ready(const struct sock *sk)
{
        /* mptcp doesn't have to deal with small skbs in the receive queue,
         * at it can always coalesce them
         */
        return (mptcp_data_avail(mptcp_sk(sk)) >= sk->sk_rcvlowat) ||
               (mem_cgroup_sockets_enabled && sk->sk_memcg &&
                mem_cgroup_under_socket_pressure(sk->sk_memcg)) ||
               READ_ONCE(tcp_memory_pressure);
}

int mptcp_set_rcvlowat(struct sock *sk, int val);

/* True when @ssk is in the ESTABLISHED or CLOSE_WAIT state. */
static inline bool __tcp_can_send(const struct sock *ssk)
{
        /* only send if our side has not closed yet */
        const int open_states = TCPF_ESTABLISHED | TCPF_CLOSE_WAIT;

        return (1 << inet_sk_state_load(ssk)) & open_states;
}

/* A subflow is active when it is not a still-incomplete JOIN and its TCP
 * socket passes __tcp_can_send().
 */
static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
        /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
        bool join_pending = subflow->request_join && !subflow->fully_established;

        return !join_pending && __tcp_can_send(mptcp_subflow_tcp_sock(subflow));
}

void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow);

bool mptcp_subflow_active(struct mptcp_subflow_context *subflow);

void mptcp_subflow_drop_ctx(struct sock *ssk);

static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
                                              struct mptcp_subflow_context *ctx)
{
        sk->sk_data_ready = sock_def_readable;
        sk->sk_state_change = ctx->tcp_state_change;
        sk->sk_write_space = sk_stream_write_space;
        sk->sk_error_report = ctx->tcp_error_report;

        inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops;
}

void __init mptcp_proto_init(void);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
int __init mptcp_proto_v6_init(void);
#endif

struct sock *mptcp_sk_clone_init(const struct sock *sk,
                                 const struct mptcp_options_received *mp_opt,
                                 struct sock *ssk,
                                 struct request_sock *req);
void mptcp_get_options(const struct sk_buff *skb,
                       struct mptcp_options_received *mp_opt);

void mptcp_finish_connect(struct sock *sk);
void __mptcp_sync_state(struct sock *sk, int state);
void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout);

static inline void mptcp_stop_tout_timer(struct sock *sk)
{
        if (!inet_csk(sk)->icsk_mtup.probe_timestamp)
                return;

        sk_stop_timer(sk, &sk->sk_timer);
        inet_csk(sk)->icsk_mtup.probe_timestamp = 0;
}

/* Record the close timeout timestamp @tout in icsk_mtup.probe_timestamp. */
static inline void mptcp_set_close_tout(struct sock *sk, unsigned long tout)
{
        /* avoid 0 timestamp, as that means no close timeout */
        if (!tout)
                tout = 1;

        inet_csk(sk)->icsk_mtup.probe_timestamp = tout;
}

/* Stamp the close timeout with the current tcp_jiffies32 value, then
 * (re)arm the timeout timer with a zero fail timeout.
 */
static inline void mptcp_start_tout_timer(struct sock *sk)
{
        unsigned long now = tcp_jiffies32;

        mptcp_set_close_tout(sk, now);
        mptcp_reset_tout_timer(mptcp_sk(sk), 0);
}

static inline bool mptcp_is_fully_established(struct sock *sk)
{
        return inet_sk_state_load(sk) == TCP_ESTABLISHED &&
               READ_ONCE(mptcp_sk(sk)->fully_established);
}

void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk);
bool mptcp_schedule_work(struct sock *sk);
int mptcp_setsockopt(struct sock *sk, int level, int optname,
                     sockptr_t optval, unsigned int optlen);
int mptcp_getsockopt(struct sock *sk, int level, int optname,
                     char __user *optval, int __user *option);

u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq);
/* Return @cur_seq unchanged when it is already a 64-bit value (@use_64bit),
 * otherwise expand it with __mptcp_expand_seq() relative to @old_seq.
 */
static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit)
{
        return use_64bit ? cur_seq : __mptcp_expand_seq(old_seq, cur_seq);
}
void __mptcp_check_push(struct sock *sk, struct sock *ssk);
void __mptcp_data_acked(struct sock *sk);
void __mptcp_error_report(struct sock *sk);
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit);
static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
{
        return READ_ONCE(msk->snd_data_fin_enable) &&
               READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
}

/* Effective not-sent low watermark: the per-socket value when non-zero,
 * otherwise the netns tcp_notsent_lowat sysctl.
 */
static inline u32 mptcp_notsent_lowat(const struct sock *sk)
{
        u32 lowat = READ_ONCE(mptcp_sk(sk)->notsent_lowat);

        if (lowat)
                return lowat;

        return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_notsent_lowat);
}

/* True when the amount of not-yet-sent data (write_seq - snd_nxt, truncated
 * to 32 bits), left-shifted by @wake, is below the not-sent low watermark.
 */
static inline bool mptcp_stream_memory_free(const struct sock *sk, int wake)
{
        const struct mptcp_sock *msk = mptcp_sk(sk);
        u32 unsent, scaled;

        unsent = READ_ONCE(msk->write_seq) - READ_ONCE(msk->snd_nxt);
        scaled = unsent << wake;

        return scaled < mptcp_notsent_lowat(sk);
}

static inline bool __mptcp_stream_is_writeable(const struct sock *sk, int wake)
{
        return mptcp_stream_memory_free(sk, wake) &&
               __sk_stream_is_writeable(sk, wake);
}

/* Invoke sk_stream_write_space() when mptcp_stream_memory_free(sk, 1)
 * reports that the stream has free memory.
 */
static inline void mptcp_write_space(struct sock *sk)
{
        /* pairs with memory barrier in mptcp_poll */
        smp_mb();

        if (!mptcp_stream_memory_free(sk, 1))
                return;

        sk_stream_write_space(sk);
}

/* Recompute the msk send buffer as tcp_wmem[0] plus the sum of all the
 * subflows' sk_sndbuf, caching each subflow value in cached_sndbuf, then
 * invoke mptcp_write_space(). Nothing is done when the send buffer size is
 * locked (SOCK_SNDBUF_LOCK).
 */
static inline void __mptcp_sync_sndbuf(struct sock *sk)
{
        struct mptcp_subflow_context *subflow;
        int total;

        if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
                return;

        total = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]);
        mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
                int cur = READ_ONCE(mptcp_subflow_tcp_sock(subflow)->sk_sndbuf);

                subflow->cached_sndbuf = cur;
                total += cur;
        }

        /* the msk max wmem limit is <nr_subflows> * tcp wmem[2] */
        WRITE_ONCE(sk->sk_sndbuf, total);
        mptcp_write_space(sk);
}

/* The caller held both the msk socket and the subflow socket locks,
 * possibly under BH.
 *
 * Re-sync the msk send buffer only when the subflow sk_sndbuf differs from
 * the value cached at the last sync.
 */
static inline void __mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);

        if (READ_ONCE(ssk->sk_sndbuf) == subflow->cached_sndbuf)
                return;

        __mptcp_sync_sndbuf(sk);
}

/* The caller held only the subflow socket lock, either in process or
 * BH context. Additionally this can be called under the msk data lock,
 * so we can't acquire such lock here: let the delegated action acquire
 * the needed locks in suitable order.
 *
 * When the subflow sk_sndbuf differs from the cached value, the
 * MPTCP_DELEGATE_SNDBUF action is queued with BH disabled. @sk is not
 * used by this helper.
 */
static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);

        if (unlikely(READ_ONCE(ssk->sk_sndbuf) != subflow->cached_sndbuf)) {
                local_bh_disable();
                mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SNDBUF);
                local_bh_enable();
        }
}

void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags);

#define MPTCP_TOKEN_MAX_RETRIES        4

void __init mptcp_token_init(void);
static inline void mptcp_token_init_request(struct request_sock *req)
{
        mptcp_subflow_rsk(req)->token_node.pprev = NULL;
}

/* Token API (declarations only). Tokens are 32-bit values (u32) used to look
 * up a struct mptcp_sock; see mptcp_token_get_sock() and
 * mptcp_token_exists().
 */
int mptcp_token_new_request(struct request_sock *req);
void mptcp_token_destroy_request(struct request_sock *req);
int mptcp_token_new_connect(struct sock *ssk);
void mptcp_token_accept(struct mptcp_subflow_request_sock *r,
                        struct mptcp_sock *msk);
bool mptcp_token_exists(u32 token);
struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token);
/* @s_slot and @s_num are in/out iteration cursors (passed by pointer). */
struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot,
                                         long *s_num);
void mptcp_token_destroy(struct mptcp_sock *msk);

/* Crypto and checksum helpers (declarations only).
 * mptcp_crypto_key_sha() returns its results through @token and @idsn.
 */
void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn);

/* The result is written through @hmac; @msg/@len describe the input. */
void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac);
__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum);

/* Path manager (PM) API, declarations only.
 * NOTE(review): the mptcp_pm_nl_* and mptcp_userspace_pm_* variants are
 * presumably the in-kernel (netlink) and userspace PM back-ends behind the
 * generic mptcp_pm_* entry points - confirm in the PM sources.
 */
void __init mptcp_pm_init(void);
void mptcp_pm_data_init(struct mptcp_sock *msk);
void mptcp_pm_data_reset(struct mptcp_sock *msk);
int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
                        struct mptcp_addr_info *addr);
int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info,
                         bool require_family,
                         struct mptcp_pm_addr_entry *entry);
bool mptcp_pm_addr_families_match(const struct sock *sk,
                                  const struct mptcp_addr_info *loc,
                                  const struct mptcp_addr_info *rem);
void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk);
void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side);
void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk);
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk);
void mptcp_pm_connection_closed(struct mptcp_sock *msk);
void mptcp_pm_subflow_established(struct mptcp_sock *msk);
bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk);
void mptcp_pm_subflow_check_next(struct mptcp_sock *msk,
                                 const struct mptcp_subflow_context *subflow);
void mptcp_pm_add_addr_received(const struct sock *ssk,
                                const struct mptcp_addr_info *addr);
void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
                              const struct mptcp_addr_info *addr);
void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk);
void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk);
void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
                               const struct mptcp_rm_list *rm_list);
void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup);
void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq);
int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
                                 struct mptcp_addr_info *addr,
                                 struct mptcp_addr_info *rem,
                                 u8 bkup);
bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
                              const struct mptcp_addr_info *addr);
void mptcp_pm_free_anno_list(struct mptcp_sock *msk);
bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk);
struct mptcp_pm_add_entry *
mptcp_pm_del_add_timer(struct mptcp_sock *msk,
                       const struct mptcp_addr_info *addr, bool check_id);
struct mptcp_pm_add_entry *
mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk,
                                const struct mptcp_addr_info *addr);
/* The *_get_flags_and_ifindex_by_id() helpers return their results through
 * the @flags and @ifindex output pointers.
 */
int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
                                         unsigned int id,
                                         u8 *flags, int *ifindex);
int mptcp_pm_nl_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id,
                                            u8 *flags, int *ifindex);
int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
                                                   unsigned int id,
                                                   u8 *flags, int *ifindex);
int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info);
int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info);
int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info);
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
                           const struct mptcp_addr_info *addr,
                           bool echo);
int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list);
int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list);
void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list);

/* Declaration only. */
void mptcp_free_local_addr_list(struct mptcp_sock *msk);

/* Event reporting helpers (declarations only); mptcp_event() takes the
 * allocation mode to use as @gfp.
 */
void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk,
                 const struct sock *ssk, gfp_t gfp);
void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info);
void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id);
void mptcp_event_pm_listener(const struct sock *ssk,
                             enum mptcp_event_type event);
bool mptcp_userspace_pm_active(const struct mptcp_sock *msk);

/* Fastopen helpers and netlink address fill helper (declarations only). */
void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow,
                                     const struct mptcp_options_received *mp_opt);
void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow,
                                              struct request_sock *req);
int mptcp_nl_fill_addr(struct sk_buff *skb,
                       struct mptcp_pm_addr_entry *entry);

/* True when either the MPTCP_ADD_ADDR_SIGNAL or the MPTCP_ADD_ADDR_ECHO bit
 * is set in msk->pm.addr_signal (lockless read).
 */
static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk)
{
        return !!((BIT(MPTCP_ADD_ADDR_SIGNAL) | BIT(MPTCP_ADD_ADDR_ECHO)) &
                  READ_ONCE(msk->pm.addr_signal));
}

/* True when the MPTCP_ADD_ADDR_SIGNAL bit is set in msk->pm.addr_signal
 * (lockless read).
 */
static inline bool mptcp_pm_should_add_signal_addr(struct mptcp_sock *msk)
{
        return !!(BIT(MPTCP_ADD_ADDR_SIGNAL) & READ_ONCE(msk->pm.addr_signal));
}

/* True when the MPTCP_ADD_ADDR_ECHO bit is set in msk->pm.addr_signal
 * (lockless read).
 */
static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk)
{
        return !!(BIT(MPTCP_ADD_ADDR_ECHO) & READ_ONCE(msk->pm.addr_signal));
}

/* True when the MPTCP_RM_ADDR_SIGNAL bit is set in msk->pm.addr_signal
 * (lockless read).
 */
static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk)
{
        return !!(BIT(MPTCP_RM_ADDR_SIGNAL) & READ_ONCE(msk->pm.addr_signal));
}

/* True when msk->pm.pm_type (lockless read) is MPTCP_PM_TYPE_USERSPACE. */
static inline bool mptcp_pm_is_userspace(const struct mptcp_sock *msk)
{
        return !(READ_ONCE(msk->pm.pm_type) != MPTCP_PM_TYPE_USERSPACE);
}

/* True when msk->pm.pm_type (lockless read) is MPTCP_PM_TYPE_KERNEL. */
static inline bool mptcp_pm_is_kernel(const struct mptcp_sock *msk)
{
        return !(READ_ONCE(msk->pm.pm_type) != MPTCP_PM_TYPE_KERNEL);
}

/* Compute the length of an ADD_ADDR option.
 * @family: AF_INET6 selects the IPv6 base length, anything else the IPv4 one
 * @echo:   when false, MPTCPOPT_THMAC_LEN is added
 * @port:   when true, the port length plus its alignment is added
 *
 * The running length is kept in a u8, exactly as the option length field.
 */
static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port)
{
        u8 len = (family == AF_INET6) ? TCPOLEN_MPTCP_ADD_ADDR6_BASE :
                                        TCPOLEN_MPTCP_ADD_ADDR_BASE;

        if (!echo)
                len += MPTCPOPT_THMAC_LEN;

        if (port) {
                /* the alignment part accounts for 2 trailing 'nop' options */
                len += TCPOLEN_MPTCP_PORT_LEN + TCPOLEN_MPTCP_PORT_ALIGN;
        }

        return len;
}

/* Compute the length of an RM_ADDR option carrying @rm_list.
 * Returns -EINVAL when the list is empty or holds more than
 * MPTCP_RM_IDS_MAX ids; otherwise the base length plus the id bytes, where
 * all ids past the first are rounded up to a multiple of 4.
 */
static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list)
{
        if (rm_list->nr != 0 && rm_list->nr <= MPTCP_RM_IDS_MAX)
                return TCPOLEN_MPTCP_RM_ADDR_BASE +
                       roundup(rm_list->nr - 1, 4) + 1;

        return -EINVAL;
}

/* PM option-building and address lookup/dump entry points (declarations
 * only). mptcp_pm_add_addr_signal() reports through the @addr, @echo and
 * @drop_other_suboptions output pointers; mptcp_pm_rm_addr_signal() through
 * @rm_list.
 */
bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
                              unsigned int opt_size, unsigned int remaining,
                              struct mptcp_addr_info *addr, bool *echo,
                              bool *drop_other_suboptions);
bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
                             struct mptcp_rm_list *rm_list);
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb);
int mptcp_pm_nl_dump_addr(struct sk_buff *msg,
                          struct netlink_callback *cb);
int mptcp_userspace_pm_dump_addr(struct sk_buff *msg,
                                 struct netlink_callback *cb);
int mptcp_pm_get_addr(struct sk_buff *skb, struct genl_info *info);
int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info);
int mptcp_userspace_pm_get_addr(struct sk_buff *skb,
                                struct genl_info *info);

/* Return the subflow's local address id, read locklessly. A negative stored
 * value is reported as id 0.
 */
static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflow)
{
        int id = READ_ONCE(subflow->local_id);

        return id < 0 ? 0 : id;
}

/* Netlink PM entry points and per-msk limit getters (declarations only). */
void __init mptcp_pm_nl_init(void);
void mptcp_pm_nl_work(struct mptcp_sock *msk);
void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
                                     const struct mptcp_rm_list *rm_list);
unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk);
unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk);
/* Used by __mptcp_pm_close_subflow() as the subflow count threshold. */
unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk);
unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk);

/* called under PM lock
 *
 * Drop one subflow from the PM count; when the new count is below the value
 * returned by mptcp_pm_get_subflows_max(), re-enable accept_subflow.
 */
static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk)
{
        msk->pm.subflows--;
        if (msk->pm.subflows >= mptcp_pm_get_subflows_max(msk))
                return;

        WRITE_ONCE(msk->pm.accept_subflow, true);
}

/* Locked wrapper around __mptcp_pm_close_subflow(): msk->pm.lock is held,
 * with bottom halves disabled, for the duration of the update.
 */
static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk)
{
        spin_lock_bh(&msk->pm.lock);
        __mptcp_pm_close_subflow(msk);
        spin_unlock_bh(&msk->pm.lock);
}

/* Socket option sync between msk and subflow (declarations only).
 * NOTE(review): the _locked variant presumably expects the caller to already
 * hold the relevant socket lock - confirm in the sockopt code.
 */
void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk);
void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk);

/* Look up the SKB_EXT_MPTCP extension of @skb via skb_ext_find() and return
 * whatever it yields as a struct mptcp_ext pointer.
 */
static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb)
{
        void *ext = skb_ext_find(skb, SKB_EXT_MPTCP);

        return (struct mptcp_ext *)ext;
}

/* Declaration only.
 * NOTE(review): presumably fills diag-related callbacks into @ops - confirm
 * in the diag code.
 */
void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);

/* True when the MPTCP_FALLBACK_DONE bit is set in msk->flags. */
static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk)
{
        return !!test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
}

static inline bool mptcp_check_fallback(const struct sock *sk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);

        return __mptcp_check_fallback(msk);
}

/* Mark @msk as fallen back by setting MPTCP_FALLBACK_DONE in msk->flags.
 * When the bit is already set nothing changes and only a debug message is
 * emitted.
 */
static inline void __mptcp_do_fallback(struct mptcp_sock *msk)
{
        if (__mptcp_check_fallback(msk)) {
                /* keep the trailing '\n': without it the record is not
                 * terminated and can be merged with a later message
                 */
                pr_debug("TCP fallback already done (msk=%p)\n", msk);
                return;
        }
        set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
}

/* True when msk->first is set and that socket is in one of the
 * ESTABLISHED, SYN_SENT, SYN_RECV or LISTEN states.
 */
static inline bool __mptcp_has_initial_subflow(const struct mptcp_sock *msk)
{
        struct sock *ssk = READ_ONCE(msk->first);

        if (!ssk)
                return 0;

        return !!((1 << inet_sk_state_load(ssk)) &
                  (TCPF_ESTABLISHED | TCPF_SYN_SENT |
                   TCPF_SYN_RECV | TCPF_LISTEN));
}

static inline void mptcp_do_fallback(struct sock *ssk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        struct sock *sk = subflow->conn;
        struct mptcp_sock *msk;

        msk = mptcp_sk(sk);
        __mptcp_do_fallback(msk);
        if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) {
                gfp_t saved_allocation = ssk->sk_allocation;

                /* we are in a atomic (BH) scope, override ssk default for data
                 * fin allocation
                 */
                ssk->sk_allocation = GFP_ATOMIC;
                ssk->sk_shutdown |= SEND_SHUTDOWN;
                tcp_shutdown(ssk, SEND_SHUTDOWN);
                ssk->sk_allocation = saved_allocation;
        }
}

/* Debug-log a fallback to plain TCP for msk @a, prefixed with the name of
 * the calling function. The format ends with '\n' so the log record is
 * properly terminated.
 */
#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)\n", __func__, a)

/* True when @skb is non-NULL, carries an MPTCP extension, and that
 * extension has infinite_map set.
 */
static inline bool mptcp_check_infinite_map(struct sk_buff *skb)
{
        struct mptcp_ext *mpext;

        if (!skb)
                return false;

        mpext = mptcp_get_ext(skb);
        return mpext && mpext->infinite_map;
}

/* True when the subflow has request_mptcp or request_join set. */
static inline bool is_active_ssk(struct mptcp_subflow_context *subflow)
{
        return subflow->request_join || subflow->request_mptcp;
}

/* True when all of the following hold for subflow socket @sk:
 *  - its state is ESTABLISHED, FIN_WAIT1, FIN_WAIT2 or CLOSING;
 *  - it is an active subflow (see is_active_ssk());
 *  - its subflow context does not have conn_finished set.
 */
static inline bool subflow_simultaneous_connect(struct sock *sk)
{
        struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);

        if (!((1 << sk->sk_state) &
              (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING)))
                return 0;

        return is_active_ssk(ctx) && !ctx->conn_finished;
}

/* Join-cookie helpers: real declarations with CONFIG_SYN_COOKIES, otherwise
 * inline stubs so callers need no #ifdef of their own.
 */
#ifdef CONFIG_SYN_COOKIES
void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req,
                                       struct sk_buff *skb);
bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req,
                                        struct sk_buff *skb);
void __init mptcp_join_cookie_init(void);
#else
/* stub: does nothing */
static inline void
subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req,
                                  struct sk_buff *skb) {}
/* stub: always reports false */
static inline bool
mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req,
                                   struct sk_buff *skb)
{
        return false;
}

/* stub: does nothing */
static inline void mptcp_join_cookie_init(void) {}
#endif

#endif /* __MPTCP_PROTOCOL_H */






















































































    2 
























































































































































































































































































    2 








    2 


    2 














    2 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET  is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the IP router.
 *
 * Version:        @(#)route.h        1.0.4        05/27/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 * Fixes:
 *                Alan Cox        :        Reformatted. Added ip_rt_local()
 *                Alan Cox        :        Support for TCP parameters.
 *                Alexey Kuznetsov:        Major changes for new routing code.
 *                Mike McLagan    :        Routing by source
 *                Robert Olsson   :        Added rt_cache statistics
 */
#ifndef _ROUTE_H
#define _ROUTE_H

#include <net/dst.h>
#include <net/inetpeer.h>
#include <net/flow.h>
#include <net/inet_sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/ndisc.h>
#include <linux/in_route.h>
#include <linux/rtnetlink.h>
#include <linux/rcupdate.h>
#include <linux/route.h>
#include <linux/ip.h>
#include <linux/cache.h>
#include <linux/security.h>

/* Routing scope for lookups on behalf of @sk: RT_SCOPE_LINK when the socket
 * has SOCK_LOCALROUTE set, RT_SCOPE_UNIVERSE otherwise.
 */
static inline __u8 ip_sock_rt_scope(const struct sock *sk)
{
        return sock_flag(sk, SOCK_LOCALROUTE) ? RT_SCOPE_LINK :
                                                RT_SCOPE_UNIVERSE;
}

/* TOS value to use for route lookups on behalf of @sk: the socket's tos
 * (read locklessly) filtered through RT_TOS().
 */
static inline __u8 ip_sock_rt_tos(const struct sock *sk)
{
        __u8 tos = READ_ONCE(inet_sk(sk)->tos);

        return RT_TOS(tos);
}

struct ip_tunnel_info;
struct fib_nh;
struct fib_info;
struct uncached_list;
/* IPv4 routing cache entry, built around an embedded struct dst_entry. */
struct rtable {
        /* must stay the first member: ip_rt_put() asserts offset 0 */
        struct dst_entry        dst;

        int                        rt_genid;
        unsigned int                rt_flags;
        __u16                        rt_type;
        /* non-zero for input routes, see rt_is_input_route() */
        __u8                        rt_is_input;
        __u8                        rt_uses_gateway;

        /* when non-zero, inet_iif() reports this instead of skb->skb_iif */
        int                        rt_iif;

        /* AF_INET: rt_gw4 is used; AF_INET6: rt_gw6 is used
         * (see rt_nexthop() and ip_neigh_for_gw())
         */
        u8                        rt_gw_family;
        /* Info on neighbour */
        union {
                __be32                rt_gw4;
                struct in6_addr        rt_gw6;
        };

        /* Miscellaneous cached information */
        u32                        rt_mtu_locked:1,
                                rt_pmtu:31;
};

/* Map a pointer to the embedded dst member back to its struct rtable,
 * using container_of_const().
 */
#define dst_rtable(_ptr) container_of_const(_ptr, struct rtable, dst)

/**
 * skb_rtable - Returns the skb &rtable
 * @skb: buffer
 *
 * Converts the dst entry attached to @skb (as reported by skb_dst()) into
 * the &rtable embedding it.
 */
static inline struct rtable *skb_rtable(const struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        return dst_rtable(dst);
}

/* True when @rt has a non-zero rt_is_input, i.e. it is an input route. */
static inline bool rt_is_input_route(const struct rtable *rt)
{
        return !!rt->rt_is_input;
}

/* True when @rt has rt_is_input == 0, i.e. it is an output route. */
static inline bool rt_is_output_route(const struct rtable *rt)
{
        return !rt->rt_is_input;
}

/* IPv4 next hop for @rt: the IPv4 gateway when the route has one
 * (rt_gw_family == AF_INET), otherwise @daddr itself.
 */
static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr)
{
        return rt->rt_gw_family == AF_INET ? rt->rt_gw4 : daddr;
}

/* Route accounting counters.
 * NOTE(review): the o_ and i_ prefixes presumably mean output and input -
 * confirm against the code that updates them.
 */
struct ip_rt_acct {
        __u32         o_bytes;
        __u32         o_packets;
        __u32         i_bytes;
        __u32         i_packets;
};

/* Routing statistics counters.
 * NOTE(review): the in_ and out_ prefixes presumably separate input-path
 * from output-path events - confirm against the code that updates them.
 */
struct rt_cache_stat {
        unsigned int in_slow_tot;
        unsigned int in_slow_mc;
        unsigned int in_no_route;
        unsigned int in_brd;
        unsigned int in_martian_dst;
        unsigned int in_martian_src;
        unsigned int out_slow_tot;
        unsigned int out_slow_mc;
};

/* Per-CPU route accounting data (defined elsewhere). */
extern struct ip_rt_acct __percpu *ip_rt_acct;

struct in_device;

/* Core routing entry points (declarations only). */
int ip_rt_init(void);
void rt_cache_flush(struct net *net);
void rt_flush_dev(struct net_device *dev);
/* Output route lookup; wrapped by __ip_route_output_key() with skb == NULL. */
struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *flp,
                                        const struct sk_buff *skb);
/* NOTE(review): the _rcu suffix suggests the caller must be inside an RCU
 * read-side section - confirm at the definition.
 */
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *flp,
                                            struct fib_result *res,
                                            const struct sk_buff *skb);

/* Output route lookup for flow @flp in @net, with no skb supplied to
 * ip_route_output_key_hash().
 */
static inline struct rtable *__ip_route_output_key(struct net *net,
                                                   struct flowi4 *flp)
{
        struct rtable *rt = ip_route_output_key_hash(net, flp, NULL);

        return rt;
}

/* Flow-based output lookup (declaration only); the inline helpers in this
 * header pass either a socket or NULL as @sk.
 */
struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
                                    const struct sock *sk);
/* Declaration only. */
struct dst_entry *ipv4_blackhole_route(struct net *net,
                                       struct dst_entry *dst_orig);

/* Output route lookup for flow @flp in @net with no socket attached
 * (ip_route_output_flow() is called with a NULL sk).
 */
static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4 *flp)
{
        struct rtable *rt = ip_route_output_flow(net, flp, NULL);

        return rt;
}

/* Simplistic IPv4 route lookup function.
 * This is only suitable for some particular use cases: since the flowi4
 * structure is only partially set, it may bypass some fib-rules.
 *
 * Only the addresses, tos, oif and scope are filled in; every other flow
 * field is left zero-initialised.
 */
static inline struct rtable *ip_route_output(struct net *net, __be32 daddr,
                                             __be32 saddr, u8 tos, int oif,
                                             __u8 scope)
{
        struct flowi4 fl4 = {
                .daddr = daddr,
                .saddr = saddr,
                .flowi4_tos = tos,
                .flowi4_scope = scope,
                .flowi4_oif = oif,
        };
        struct rtable *rt;

        rt = ip_route_output_key(net, &fl4);
        return rt;
}

/* Initialise @fl4 from the given addresses, ports, protocol, tos and oif and
 * perform an output route lookup with it.
 *
 * @sk may be NULL. When present it supplies the mark, routing scope and flow
 * flags, and the flow is passed to security_sk_classify_flow(); when absent
 * the mark and flags are 0 and the scope is RT_SCOPE_UNIVERSE.
 */
static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi4 *fl4,
                                                   const struct sock *sk,
                                                   __be32 daddr, __be32 saddr,
                                                   __be16 dport, __be16 sport,
                                                   __u8 proto, __u8 tos, int oif)
{
        __u8 scope = RT_SCOPE_UNIVERSE;
        __u8 flow_flags = 0;
        __u32 mark = 0;

        if (sk) {
                mark = READ_ONCE(sk->sk_mark);
                scope = ip_sock_rt_scope(sk);
                flow_flags = inet_sk_flowi_flags(sk);
        }

        flowi4_init_output(fl4, oif, mark, tos, scope, proto, flow_flags,
                           daddr, saddr, dport, sport, sock_net_uid(net, sk));
        if (sk)
                security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));

        return ip_route_output_flow(net, fl4, sk);
}

/* Output route lookup for a GRE flow: @fl4 is fully zeroed, then filled with
 * the addresses, tos, oif, IPPROTO_GRE and @gre_key before the lookup.
 */
static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 *fl4,
                                                 __be32 daddr, __be32 saddr,
                                                 __be32 gre_key, __u8 tos, int oif)
{
        memset(fl4, 0, sizeof(*fl4));

        fl4->flowi4_proto = IPPROTO_GRE;
        fl4->fl4_gre_key = gre_key;
        fl4->flowi4_tos = tos;
        fl4->flowi4_oif = oif;
        fl4->saddr = saddr;
        fl4->daddr = daddr;

        return ip_route_output_key(net, fl4);
}
/* Input-path helpers (declarations only). ip_mc_validate_source() returns
 * an additional result through @itag.
 */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                          u8 tos, struct net_device *dev,
                          struct in_device *in_dev, u32 *itag);
/* Wrapped by ip_route_input(), which calls it under rcu_read_lock() and
 * then forces a reference on the resulting dst.
 */
int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
                         u8 tos, struct net_device *devin);
int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src,
                      u8 tos, struct net_device *devin,
                      const struct sk_buff *hint);

/* Input route lookup for @skb, done inside an RCU read-side section.
 *
 * On a successful ip_route_input_noref() the dst attached to @skb is forced
 * with skb_dst_force(); if the skb ends up without a dst, -EINVAL is
 * returned. Otherwise the result of ip_route_input_noref() is returned.
 */
static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
                                 u8 tos, struct net_device *devin)
{
        int err;

        rcu_read_lock();
        err = ip_route_input_noref(skb, dst, src, tos, devin);
        if (err)
                goto unlock;

        skb_dst_force(skb);
        if (!skb_dst(skb))
                err = -EINVAL;

unlock:
        rcu_read_unlock();

        return err;
}

/* PMTU update and redirect handling (declarations only). */
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif,
                      u8 protocol);
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu);
void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u8 protocol);
void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk);
void ip_rt_send_redirect(struct sk_buff *skb);

/* Address type classification (declarations only). */
unsigned int inet_addr_type(struct net *net, __be32 addr);
unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id);
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
                                __be32 addr);
unsigned int inet_addr_type_dev_table(struct net *net,
                                      const struct net_device *dev,
                                      __be32 addr);
void ip_rt_multicast_event(struct in_device *);
int ip_rt_ioctl(struct net *, unsigned int cmd, struct rtentry *rt);
/* The result is written through @src. */
void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt);
/* rtable allocation and cloning (declarations only). */
struct rtable *rt_dst_alloc(struct net_device *dev,
                            unsigned int flags, u16 type, bool noxfrm);
struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt);

struct in_ifaddr;
/* FIB notifications for interface address changes (declarations only). */
void fib_add_ifaddr(struct in_ifaddr *);
void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *);
void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric);

/* Uncached route list management (declarations only). */
void rt_add_uncached_list(struct rtable *rt);
void rt_del_uncached_list(struct rtable *rt);

/* @fa_index is an in/out cursor (passed by pointer). */
int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
                       u32 table_id, struct fib_info *fi,
                       int *fa_index, int fa_start, unsigned int flags);

/* Release the reference held on @rt's dst entry. */
static inline void ip_rt_put(struct rtable *rt)
{
        struct dst_entry *dst;

        /* dst_release() accepts a NULL parameter, and because dst is the
         * first member of struct rtable (enforced below) a NULL @rt maps
         * to a NULL dst pointer.
         */
        BUILD_BUG_ON(offsetof(struct rtable, dst) != 0);
        dst = &rt->dst;
        dst_release(dst);
}

/* IPTOS_TOS_MASK with the two lowest bits cleared. */
#define IPTOS_RT_MASK        (IPTOS_TOS_MASK & ~3)

/* 16-entry lookup table, indexed by IPTOS_TOS(tos) >> 1 in
 * rt_tos2priority(); defined elsewhere.
 */
extern const __u8 ip_tos2prio[16];

/* Map an IP TOS byte to a priority value using the ip_tos2prio table; the
 * table index is the TOS field (IPTOS_TOS()) shifted right by one.
 */
static inline char rt_tos2priority(u8 tos)
{
        unsigned int idx = IPTOS_TOS(tos) >> 1;

        return ip_tos2prio[idx];
}

/* ip_route_connect() and ip_route_newports() work in tandem whilst
 * binding a socket for a new outgoing connection.
 *
 * In order to use IPSEC properly, we must, in the end, have a
 * route that was looked up using all available keys including source
 * and destination ports.
 *
 * However, if a source port needs to be allocated (the user specified
 * a wildcard source port) we need to obtain addressing information
 * in order to perform that allocation.
 *
 * So ip_route_connect() looks up a route using wildcarded source and
 * destination ports in the key, simply so that we can get a pair of
 * addresses to use for port allocation.
 *
 * Later, once the ports are allocated, ip_route_newports() will make
 * another route lookup if needed to make sure we catch any IPSEC
 * rules keyed on the port information.
 *
 * The callers allocate the flow key on their stack, and must pass in
 * the same flowi4 object to both the ip_route_connect() and the
 * ip_route_newports() calls.
 */

/* Fill @fl4 for a connect-time lookup on behalf of @sk. Mark, tos, scope and
 * uid come from the socket; FLOWI_FLAG_ANYSRC is set when the socket has the
 * TRANSPARENT bit.
 */
static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst,
                                         __be32 src, int oif, u8 protocol,
                                         __be16 sport, __be16 dport,
                                         const struct sock *sk)
{
        __u8 flow_flags = inet_test_bit(TRANSPARENT, sk) ? FLOWI_FLAG_ANYSRC : 0;

        flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk),
                           ip_sock_rt_scope(sk), protocol, flow_flags, dst,
                           src, dport, sport, sk->sk_uid);
}

/* Route lookup for a new outgoing connection of @sk.
 *
 * When either address is still unset, a preliminary lookup is done first so
 * that @fl4 carries a complete address pair; its route is released and the
 * flow refreshed before the final, socket-aware lookup. An error from the
 * preliminary lookup is returned as-is.
 */
static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst,
                                              __be32 src, int oif, u8 protocol,
                                              __be16 sport, __be16 dport,
                                              const struct sock *sk)
{
        struct net *net = sock_net(sk);

        ip_route_connect_init(fl4, dst, src, oif, protocol, sport, dport, sk);

        if (!(dst && src)) {
                struct rtable *probe = __ip_route_output_key(net, fl4);

                if (IS_ERR(probe))
                        return probe;

                /* only the addresses now stored in fl4 are needed */
                ip_rt_put(probe);
                flowi4_update_output(fl4, oif, fl4->daddr, fl4->saddr);
        }

        security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
        return ip_route_output_flow(net, fl4, sk);
}

/* Redo the route lookup once the final ports are known.
 *
 * If neither port changed, @rt is returned untouched. Otherwise @rt is
 * released, @fl4 is updated with the new ports and the socket's bound
 * device, and the result of a fresh lookup is returned.
 */
static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable *rt,
                                               __be16 orig_sport, __be16 orig_dport,
                                               __be16 sport, __be16 dport,
                                               const struct sock *sk)
{
        if (sport == orig_sport && dport == orig_dport)
                return rt;

        fl4->fl4_dport = dport;
        fl4->fl4_sport = sport;
        ip_rt_put(rt);
        flowi4_update_output(fl4, sk->sk_bound_dev_if, fl4->daddr,
                             fl4->saddr);
        security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));

        return ip_route_output_flow(sock_net(sk), fl4, sk);
}

/* Incoming interface index for @skb: the route's rt_iif when the skb has a
 * route with a non-zero rt_iif, otherwise skb->skb_iif.
 */
static inline int inet_iif(const struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);

        return (rt && rt->rt_iif) ? rt->rt_iif : skb->skb_iif;
}

/* Hop limit (TTL) for @dst: the RTAX_HOPLIMIT metric when it is non-zero,
 * otherwise the sysctl_ip_default_ttl of the netns owning dst->dev.
 */
static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
{
        int ttl = dst_metric_raw(dst, RTAX_HOPLIMIT);
        struct net *net = dev_net(dst->dev);

        if (ttl)
                return ttl;

        return READ_ONCE(net->ipv4.sysctl_ip_default_ttl);
}

/* Find the ARP neighbour entry for IPv4 address @daddr on @dev; when the
 * no-reference lookup finds nothing, __neigh_create() is called on arp_tbl
 * and its result is returned instead.
 */
static inline struct neighbour *ip_neigh_gw4(struct net_device *dev,
                                             __be32 daddr)
{
        struct neighbour *neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)daddr);

        if (unlikely(!neigh))
                return __neigh_create(&arp_tbl, &daddr, dev, false);

        return neigh;
}

/* Resolve the neighbour entry used to reach the next hop of @rt:
 *  - IPv4 gateway: neighbour for rt_gw4;
 *  - IPv6 gateway: neighbour for rt_gw6, and *@is_v6gw is set to true;
 *  - no gateway:   neighbour for the IPv4 destination of @skb.
 * *@is_v6gw is written only in the IPv6 gateway case.
 */
static inline struct neighbour *ip_neigh_for_gw(struct rtable *rt,
                                                struct sk_buff *skb,
                                                bool *is_v6gw)
{
        struct net_device *dev = rt->dst.dev;
        struct neighbour *neigh;

        if (likely(rt->rt_gw_family == AF_INET))
                return ip_neigh_gw4(dev, rt->rt_gw4);

        if (rt->rt_gw_family != AF_INET6)
                return ip_neigh_gw4(dev, ip_hdr(skb)->daddr);

        neigh = ip_neigh_gw6(dev, &rt->rt_gw6);
        *is_v6gw = true;
        return neigh;
}

#endif        /* _ROUTE_H */
























































































































    2 


















    3 
















































    2 


    3 





    2 










    2 





























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
// SPDX-License-Identifier: GPL-2.0
#include <linux/tcp.h>
#include <net/tcp.h>

/* Compute the RACK reordering window for @sk, in the same unit as
 * tcp_min_rtt() and srtt_us >> 3.
 *
 * Returns 0 when no reordering has been seen and either the connection is
 * already in (or past) TCP_CA_Recovery, or sacked_out has reached the
 * reordering threshold and TCP_RACK_NO_DUPTHRESH is not set in
 * sysctl_tcp_recovery.
 */
static u32 tcp_rack_reo_wnd(const struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        u32 settle, cap;

        /* If reordering has not been observed, be aggressive during
         * the recovery or starting the recovery by DUPACK threshold.
         */
        if (!tp->reord_seen &&
            (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery ||
             (tp->sacked_out >= tp->reordering &&
              !(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
                TCP_RACK_NO_DUPTHRESH))))
                return 0;

        /* To be more reordering resilient, allow min_rtt/4 settling delay.
         * Use min_rtt instead of the smoothed RTT because reordering is
         * often a path property and less related to queuing or delayed ACKs.
         * Upon receiving DSACKs, linearly increase the window up to the
         * smoothed RTT.
         */
        settle = (tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps;
        cap = tp->srtt_us >> 3;

        return min(settle, cap);
}

/* Time left before @skb is considered lost by RACK: the RACK RTT plus the
 * reordering window @reo_wnd, minus the time elapsed since @skb was sent
 * (relative to tp->tcp_mstamp). A result <= 0 means the deadline has passed.
 */
s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
{
        u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp,
                                         tcp_skb_timestamp_us(skb));
        u32 budget = tp->rack.rtt_us + reo_wnd;

        return budget - elapsed;
}

/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
 *
 * Marks a packet lost, if some packet sent later has been (s)acked.
 * The underlying idea is similar to the traditional dupthresh and FACK
 * but they look at different metrics:
 *
 * dupthresh: 3 OOO packets delivered (packet count)
 * FACK: sequence delta to highest sacked sequence (sequence space)
 * RACK: sent time delta to the latest delivered packet (time domain)
 *
 * The advantage of RACK is it applies to both original and retransmitted
 * packet and therefore is robust against tail losses. Another advantage
 * is being more resilient to reordering by simply allowing some
 * "settling delay", instead of tweaking the dupthresh.
 *
 * When tcp_rack_detect_loss() detects some packets are lost and we
 * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
 * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
 * make us enter the CA_Recovery state.
 *
 * On return, *@reo_timeout holds the largest remaining wait time among the
 * scanned packets that are not yet considered lost, or 0 if there is none.
 */
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb, *next;
        u32 reo_wnd;

        *reo_timeout = 0;
        reo_wnd = tcp_rack_reo_wnd(sk);
        list_for_each_entry_safe(skb, next, &tp->tsorted_sent_queue,
                                 tcp_tsorted_anchor) {
                struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
                s32 remaining;

                /* Skip ones marked lost but not yet retransmitted */
                if ((scb->sacked & TCPCB_LOST) &&
                    !(scb->sacked & TCPCB_SACKED_RETRANS))
                        continue;

                /* stop at the first packet for which the most recently
                 * delivered one was not sent after it
                 */
                if (!tcp_skb_sent_after(tp->rack.mstamp,
                                        tcp_skb_timestamp_us(skb),
                                        tp->rack.end_seq, scb->end_seq))
                        break;

                /* A packet is lost if it has not been s/acked beyond
                 * the recent RTT plus the reordering window.
                 */
                remaining = tcp_rack_skb_timeout(tp, skb, reo_wnd);
                if (remaining > 0) {
                        /* Record maximum wait time */
                        *reo_timeout = max_t(u32, *reo_timeout, remaining);
                        continue;
                }

                tcp_mark_skb_lost(sk, skb);
                list_del_init(&skb->tcp_tsorted_anchor);
        }
}

/* Run RACK loss detection if the RACK state advanced since the last run.
 *
 * When some packets still have to wait before they can be declared lost,
 * the ICSK_TIME_REO_TIMEOUT timer is (re)armed with that wait plus
 * TCP_TIMEOUT_MIN_US. Returns true when a non-zero timeout (after
 * conversion to jiffies) was programmed, false otherwise.
 */
bool tcp_rack_mark_lost(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 timeout;

        if (!tp->rack.advanced)
                return false;

        /* Clear the flag so the queue is not rescanned needlessly */
        tp->rack.advanced = 0;
        tcp_rack_detect_loss(sk, &timeout);
        if (!timeout)
                return false;

        timeout = usecs_to_jiffies(timeout + TCP_TIMEOUT_MIN_US);
        inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
                                  timeout, inet_csk(sk)->icsk_rto);
        return timeout != 0;
}

/* Record the most recently (re)sent time among the (s)acked packets.
 * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
 * draft-cheng-tcpm-rack-00.txt
 *
 * @sacked:    sacked flags of the (s)acked packet
 * @end_seq:   end sequence of the (s)acked packet
 * @xmit_time: time the packet was last (re)sent
 */
void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
                      u64 xmit_time)
{
        u32 rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);

        if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) {
                /* If the sacked packet was retransmitted, it's ambiguous
                 * whether the retransmission or the original (or the prior
                 * retransmission) was sacked.
                 *
                 * If the original is lost, there is no ambiguity. Otherwise
                 * we assume the original can be delayed up to aRTT + min_rtt.
                 * The aRTT term is bounded by the fast recovery or timeout,
                 * so it's at least one RTT (i.e., retransmission is at least
                 * an RTT later).
                 */
                return;
        }

        tp->rack.advanced = 1;
        tp->rack.rtt_us = rtt_us;

        /* Only move the RACK reference forward, never backward */
        if (!tcp_skb_sent_after(xmit_time, tp->rack.mstamp,
                                end_seq, tp->rack.end_seq))
                return;

        tp->rack.mstamp = xmit_time;
        tp->rack.end_seq = end_seq;
}

/* We have waited long enough to accommodate reordering. Mark the expired
 * packets lost and retransmit them.
 */
void tcp_rack_reo_timeout(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 timeout, prior_inflight;
        u32 lost = tp->lost;

        prior_inflight = tcp_packets_in_flight(tp);
        tcp_rack_detect_loss(sk, &timeout);
        if (prior_inflight != tcp_packets_in_flight(tp)) {
                if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
                        tcp_enter_recovery(sk, false);
                        if (!inet_csk(sk)->icsk_ca_ops->cong_control)
                                tcp_cwnd_reduction(sk, 1, tp->lost - lost, 0);
                }
                tcp_xmit_retransmit_queue(sk);
        }
        if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
                tcp_rearm_rto(sk);
}

/* Update RACK's reo_wnd based on DSACK and the number of recoveries.
 *
 * When a DSACK arrives that looks like it was caused by reordering
 * triggering fast recovery, grow reo_wnd by min_rtt/4 (upper bounded by
 * srtt): the spurious retransmission may have been due to a reordering
 * delay longer than reo_wnd.
 *
 * The current reo_wnd value persists for TCP_RACK_RECOVERY_THRESH (16)
 * successful recoveries (this accounts for a full DSACK-based loss
 * recovery undo). After that it is reset to the default (min_rtt/4).
 *
 * reo_wnd is incremented at most once per rtt, so that the new DSACK we
 * react to stems from a spurious retx (approximately) after the last
 * reo_wnd update.
 *
 * reo_wnd is tracked in steps (of min_rtt/4) rather than as an absolute
 * value, to account for changes in rtt.
 */
void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Nothing to do when the window is configured as static */
        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
            TCP_RACK_STATIC_REO_WND)
                return;
        if (!rs->prior_delivered)
                return;

        /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */
        if (before(rs->prior_delivered, tp->rack.last_delivered))
                tp->rack.dsack_seen = 0;

        if (!tp->rack.dsack_seen) {
                /* No update pending: fall back to one step once the
                 * persistence counter has run out.
                 */
                if (!tp->rack.reo_wnd_persist)
                        tp->rack.reo_wnd_steps = 1;
                return;
        }

        /* Apply the pending adjustment, capped at 0xFF steps */
        tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
                                       tp->rack.reo_wnd_steps + 1);
        tp->rack.dsack_seen = 0;
        tp->rack.last_delivered = tp->delivered;
        tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
}

/* RFC6582 NewReno recovery for non-SACK connections. The head of the
 * retransmit queue is marked lost upon receiving
 * a) three or more DUPACKs to start the fast recovery
 * b) an ACK acknowledging new data during the fast recovery.
 *
 * @snd_una_advanced: true when the ACK being processed acknowledged new data
 */
void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced)
{
        const u8 state = inet_csk(sk)->icsk_ca_state;
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *head;
        bool trigger;
        u32 mss;

        if (state < TCP_CA_Recovery)
                trigger = tp->sacked_out >= tp->reordering;
        else
                trigger = state == TCP_CA_Recovery && snd_una_advanced;
        if (!trigger)
                return;

        head = tcp_rtx_queue_head(sk);
        if (TCP_SKB_CB(head)->sacked & TCPCB_LOST)
                return;

        /* Only the first MSS of a multi-segment skb is marked lost */
        mss = tcp_skb_mss(head);
        if (tcp_skb_pcount(head) > 1 && head->len > mss)
                tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, head,
                             mss, mss, GFP_ATOMIC);

        tcp_mark_skb_lost(sk, head);
}



































































    2 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * x86 KFENCE support.
 *
 * Copyright (C) 2020, Google LLC.
 */

#ifndef _ASM_X86_KFENCE_H
#define _ASM_X86_KFENCE_H

#ifndef MODULE

#include <linux/bug.h>
#include <linux/kfence.h>

#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/tlbflush.h>

/* Force 4K pages for __kfence_pool.
 *
 * Walks the pool one page at a time and splits any mapping that is not
 * already at PG_LEVEL_4K. Returns false if lookup_address() finds no
 * mapping for a pool page, true once the whole pool has been visited.
 */
static inline bool arch_kfence_init_pool(void)
{
        unsigned long addr = (unsigned long)__kfence_pool;

        while (is_kfence_address((void *)addr)) {
                unsigned int level;

                if (!lookup_address(addr, &level))
                        return false;

                if (level != PG_LEVEL_4K)
                        set_memory_4k(addr, 1);

                addr += PAGE_SIZE;
        }

        return true;
}

/* Protect (or unprotect) the given page and flush the local TLB.
 *
 * @addr:    address of the page
 * @protect: true clears _PAGE_PRESENT in the PTE, false sets it
 *
 * Returns false (after a WARN) if the page has no PTE or is not mapped at
 * PG_LEVEL_4K, true otherwise.
 */
static inline bool kfence_protect_page(unsigned long addr, bool protect)
{
        unsigned int level;
        pte_t *pte;

        pte = lookup_address(addr, &level);
        if (WARN_ON(!pte || level != PG_LEVEL_4K))
                return false;

        /*
         * IPIs must be avoided, since KFENCE allocations or faults may
         * happen with interrupts disabled. The update below is therefore
         * best-effort and does not flush TLBs on all CPUs. Some inaccuracy
         * is tolerable; lazy fault handling takes care of faults after the
         * page is PRESENT.
         */
        set_pte(pte, protect ? __pte(pte_val(*pte) & ~_PAGE_PRESENT) :
                               __pte(pte_val(*pte) | _PAGE_PRESENT));

        /*
         * Flush only this CPU's TLB, on the assumption that whoever did the
         * allocation/free is likely to keep running here.
         */
        preempt_disable();
        flush_tlb_one_kernel(addr);
        preempt_enable();
        return true;
}

#endif /* !MODULE */

#endif /* _ASM_X86_KFENCE_H */




























































































































































    1 
    1 

















































    1 




    1 
    1 




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        RAW sockets for IPv6
 *        Linux INET6 implementation
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 *
 *        Adapted from linux/net/ipv4/raw.c
 *
 *        Fixes:
 *        Hideaki YOSHIFUJI        :        sin6_scope_id support
 *        YOSHIFUJI,H.@USAGI        :        raw checksum (RFC2292(bis) compliance)
 *        Kazunori MIYAZAWA @USAGI:        change process style to use ip6_append_data
 */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/slab.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
#include <linux/skbuff.h>
#include <linux/compat.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>

#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/ip6_checksum.h>
#include <net/addrconf.h>
#include <net/transp_v6.h>
#include <net/udp.h>
#include <net/inet_common.h>
#include <net/tcp_states.h>
#if IS_ENABLED(CONFIG_IPV6_MIP6)
#include <net/mip6.h>
#endif
#include <linux/mroute6.h>

#include <net/raw.h>
#include <net/rawv6.h>
#include <net/xfrm.h>

#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/export.h>

#define        ICMPV6_HDRLEN        4        /* ICMPv6 header, RFC 4443 Section 2.1 */

/* Hash table of raw IPv6 sockets. The lookup paths in this file pick a
 * bucket with raw_v6_hashinfo.ht[raw_hashfunc(net, nexthdr)].
 */
struct raw_hashinfo raw_v6_hashinfo;
EXPORT_SYMBOL_GPL(raw_v6_hashinfo);

/* Decide whether raw socket @sk should see a packet.
 *
 * @num:      protocol number the socket must be bound to (inet_num)
 * @loc_addr: local address of the packet
 * @rmt_addr: remote address of the packet
 * @dif/@sdif: interface indexes checked against the socket's bound device
 *
 * The socket matches when protocol, netns, connected peer (if any) and
 * bound device all agree, and its receive address is unspecified, equals
 * @loc_addr, or @loc_addr is a multicast group accepted by inet6_mc_check().
 */
bool raw_v6_match(struct net *net, const struct sock *sk, unsigned short num,
                  const struct in6_addr *loc_addr,
                  const struct in6_addr *rmt_addr, int dif, int sdif)
{
        if (inet_sk(sk)->inet_num != num)
                return false;

        if (!net_eq(sock_net(sk), net))
                return false;

        /* A connected socket only accepts traffic from its peer */
        if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
            !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
                return false;

        if (!raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
                return false;

        if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
                return true;

        if (ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))
                return true;

        return ipv6_addr_is_multicast(loc_addr) &&
               inet6_mc_check(sk, loc_addr, rmt_addr);
}
EXPORT_SYMBOL_GPL(raw_v6_match);

/*
 *        Apply the socket's ICMPv6 type filter to @skb.
 *
 *        0 - deliver
 *        1 - block
 *
 *        A packet whose ICMPv6 header cannot be read is blocked.
 */
static int icmpv6_filter(const struct sock *sk, const struct sk_buff *skb)
{
        struct icmp6hdr _hdr;
        const struct icmp6hdr *hdr;
        unsigned int type;
        __u32 word;

        /* Only the four bytes of the ICMPv6 header are needed, not the
         * additional bytes of message body in "struct icmp6hdr".
         */
        hdr = skb_header_pointer(skb, skb_transport_offset(skb),
                                 ICMPV6_HDRLEN, &_hdr);
        if (!hdr)
                return 1;

        /* One filter bit per type: 32 types per word */
        type = hdr->icmp6_type;
        word = raw6_sk(sk)->filter.data[type >> 5];

        return (word & (1U << (type & 31))) != 0;
}

#if IS_ENABLED(CONFIG_IPV6_MIP6)
/* Signature of the optional filter hook applied to IPPROTO_MH packets. */
typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb);

/* Currently registered MH filter hook, or NULL when none is set.
 * Written by rawv6_mh_filter_register()/rawv6_mh_filter_unregister() and
 * read with rcu_dereference() in ipv6_raw_deliver().
 */
static mh_filter_t __rcu *mh_filter __read_mostly;

/* Publish @filter as the MH filter hook used by ipv6_raw_deliver().
 * Any previously published hook is overwritten. Always returns 0.
 */
int rawv6_mh_filter_register(mh_filter_t filter)
{
        rcu_assign_pointer(mh_filter, filter);
        return 0;
}
EXPORT_SYMBOL(rawv6_mh_filter_register);

/* Clear the MH filter hook, then call synchronize_rcu() before returning.
 * Always returns 0.
 *
 * NOTE(review): @filter is not used; the hook is cleared unconditionally,
 * whichever filter is currently registered.
 */
int rawv6_mh_filter_unregister(mh_filter_t filter)
{
        RCU_INIT_POINTER(mh_filter, NULL);
        synchronize_rcu();
        return 0;
}
EXPORT_SYMBOL(rawv6_mh_filter_unregister);

#endif

/*
 *        demultiplex raw sockets.
 *        (should consider queueing the skb in the sock receive_queue
 *        without calling rawv6.c)
 *
 *        Caller owns SKB so we must make clones.
 *
 *        Walks the raw socket hash bucket for @nexthdr under RCU and hands
 *        a clone of @skb to every matching socket that passes its filter.
 *        Returns true if at least one socket matched and had receive buffer
 *        room, even if its filter then blocked the packet or the clone
 *        allocation failed.
 */
static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
{
        struct net *net = dev_net(skb->dev);
        const struct in6_addr *saddr;
        const struct in6_addr *daddr;
        struct hlist_head *hlist;
        struct sock *sk;
        bool delivered = false;
        __u8 hash;

        saddr = &ipv6_hdr(skb)->saddr;
        /* NOTE(review): relies on daddr directly following saddr in
         * struct ipv6hdr - confirm against the header definition.
         */
        daddr = saddr + 1;

        hash = raw_hashfunc(net, nexthdr);
        hlist = &raw_v6_hashinfo.ht[hash];
        rcu_read_lock();
        sk_for_each_rcu(sk, hlist) {
                int filtered;

                /* For a received packet the local address is daddr */
                if (!raw_v6_match(net, sk, nexthdr, daddr, saddr,
                                  inet6_iif(skb), inet6_sdif(skb)))
                        continue;

                /* Receive buffer already full: count a drop and skip the
                 * clone for this socket.
                 */
                if (atomic_read(&sk->sk_rmem_alloc) >=
                    READ_ONCE(sk->sk_rcvbuf)) {
                        atomic_inc(&sk->sk_drops);
                        continue;
                }

                /* Set before the per-protocol filter is consulted */
                delivered = true;
                switch (nexthdr) {
                case IPPROTO_ICMPV6:
                        filtered = icmpv6_filter(sk, skb);
                        break;

#if IS_ENABLED(CONFIG_IPV6_MIP6)
                case IPPROTO_MH:
                {
                        /* XXX: To validate MH only once for each packet,
                         * this is placed here. It should be after checking
                         * xfrm policy, however it doesn't. The checking xfrm
                         * policy is placed in rawv6_rcv() because it is
                         * required for each socket.
                         */
                        mh_filter_t *filter;

                        filter = rcu_dereference(mh_filter);
                        filtered = filter ? (*filter)(sk, skb) : 0;
                        break;
                }
#endif
                default:
                        filtered = 0;
                        break;
                }

                /* A negative filter result ends the walk over the bucket */
                if (filtered < 0)
                        break;
                if (filtered == 0) {
                        struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);

                        /* Not releasing hash table! */
                        if (clone)
                                rawv6_rcv(sk, clone);
                }
        }
        rcu_read_unlock();
        return delivered;
}

/* Non-static entry point for raw socket delivery: forwards to
 * ipv6_raw_deliver() and returns its result unchanged.
 */
bool raw6_local_deliver(struct sk_buff *skb, int nexthdr)
{
        return ipv6_raw_deliver(skb, nexthdr);
}

/* This cleans up af_inet6 a bit. -DaveM
 *
 * Bind raw IPv6 socket @sk to the local address in @uaddr.
 *
 * Returns 0 on success, or:
 *   -EINVAL        @addr_len is below SIN6_LEN_RFC2133, the family is not
 *                  AF_INET6, the socket is not in TCP_CLOSE state, or the
 *                  address needs a scope id and no interface is bound;
 *   -EADDRNOTAVAIL the address is v4-mapped, or it is a non-multicast
 *                  address that fails ipv6_chk_addr() while non-local bind
 *                  is not permitted;
 *   -ENODEV        sk_bound_dev_if does not resolve to a device.
 */
static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr;
        __be32 v4addr = 0;
        int addr_type;
        int err;

        if (addr_len < SIN6_LEN_RFC2133)
                return -EINVAL;

        if (addr->sin6_family != AF_INET6)
                return -EINVAL;

        addr_type = ipv6_addr_type(&addr->sin6_addr);

        /* Raw sockets are IPv6 only */
        if (addr_type == IPV6_ADDR_MAPPED)
                return -EADDRNOTAVAIL;

        lock_sock(sk);

        /* err stays -EINVAL until a later step overrides it */
        err = -EINVAL;
        if (sk->sk_state != TCP_CLOSE)
                goto out;

        rcu_read_lock();
        /* Check if the address belongs to the host. */
        if (addr_type != IPV6_ADDR_ANY) {
                struct net_device *dev = NULL;

                if (__ipv6_addr_needs_scope_id(addr_type)) {
                        /* sin6_scope_id is only read when the caller passed
                         * a full-size sockaddr_in6.
                         */
                        if (addr_len >= sizeof(struct sockaddr_in6) &&
                            addr->sin6_scope_id) {
                                /* Override any existing binding, if another
                                 * one is supplied by user.
                                 */
                                sk->sk_bound_dev_if = addr->sin6_scope_id;
                        }

                        /* Binding to link-local address requires an interface */
                        if (!sk->sk_bound_dev_if)
                                goto out_unlock;
                }

                if (sk->sk_bound_dev_if) {
                        err = -ENODEV;
                        dev = dev_get_by_index_rcu(sock_net(sk),
                                                   sk->sk_bound_dev_if);
                        if (!dev)
                                goto out_unlock;
                }

                /* ipv4 addr of the socket is invalid.  Only the
                 * unspecified and mapped address have a v4 equivalent.
                 */
                v4addr = LOOPBACK4_IPV6;
                /* Multicast addresses skip the local-address check */
                if (!(addr_type & IPV6_ADDR_MULTICAST) &&
                    !ipv6_can_nonlocal_bind(sock_net(sk), inet)) {
                        err = -EADDRNOTAVAIL;
                        if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr,
                                           dev, 0)) {
                                goto out_unlock;
                        }
                }
        }

        /* All checks passed: record the binding on the socket */
        inet->inet_rcv_saddr = inet->inet_saddr = v4addr;
        sk->sk_v6_rcv_saddr = addr->sin6_addr;
        /* A multicast address is not stored as the source address */
        if (!(addr_type & IPV6_ADDR_MULTICAST))
                np->saddr = addr->sin6_addr;
        err = 0;
out_unlock:
        rcu_read_unlock();
out:
        release_sock(sk);
        return err;
}

/* Report an ICMPv6 error to raw socket @sk.
 *
 * @skb:    the ICMPv6 error packet
 * @type:   ICMPv6 type
 * @code:   ICMPv6 code
 * @offset: added to skb->data to locate the payload passed to
 *          ipv6_icmp_error() when the socket is not in HDRINCL mode
 * @info:   ICMPv6 info field, in network byte order
 */
static void rawv6_err(struct sock *sk, struct sk_buff *skb,
                      u8 type, u8 code, int offset, __be32 info)
{
        bool recverr = inet6_test_bit(RECVERR6, sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        int harderr;
        int err;

        /* Report error on raw socket, if:
         * 1. User requested recverr.
         * 2. Socket is connected (otherwise the error indication
         *    is useless without recverr and error is hard).
         */
        if (!recverr && sk->sk_state != TCP_ESTABLISHED)
                return;

        harderr = icmpv6_err_convert(type, code, &err);
        switch (type) {
        case ICMPV6_PKT_TOOBIG:
                ip6_sk_update_pmtu(skb, sk, info);
                /* Too-big is a hard error only with IPV6_PMTUDISC_DO */
                harderr = (READ_ONCE(np->pmtudisc) == IPV6_PMTUDISC_DO);
                break;
        case NDISC_REDIRECT:
                /* Redirects update routing only; nothing is reported */
                ip6_sk_redirect(skb, sk);
                return;
        default:
                break;
        }

        if (recverr) {
                u8 *payload = inet_test_bit(HDRINCL, sk) ?
                              skb->data : skb->data + offset;

                ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload);
        }

        if (!recverr && !harderr)
                return;

        sk->sk_err = err;
        sk_error_report(sk);
}

/* Dispatch an ICMPv6 error to every raw socket bound to @nexthdr that
 * matches the addresses of the IPv6 header found at skb->data.
 *
 * The header's source address is passed as the local address and its
 * destination as the remote address when matching sockets.
 */
void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
                u8 type, u8 code, int inner_offset, __be32 info)
{
        struct net *net = dev_net(skb->dev);
        struct hlist_head *bucket;
        struct sock *sk;
        int hash;

        hash = raw_hashfunc(net, nexthdr);
        bucket = &raw_v6_hashinfo.ht[hash];

        rcu_read_lock();
        sk_for_each_rcu(sk, bucket) {
                /* Note: ipv6_hdr(skb) != skb->data */
                const struct ipv6hdr *inner = (const struct ipv6hdr *)skb->data;

                if (raw_v6_match(net, sk, nexthdr,
                                 &inner->saddr, &inner->daddr,
                                 inet6_iif(skb), inet6_iif(skb)))
                        rawv6_err(sk, skb, type, code, inner_offset, info);
        }
        rcu_read_unlock();
}

static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        enum skb_drop_reason reason;

        if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) &&
            skb_checksum_complete(skb)) {
                atomic_inc(&sk->sk_drops);
                sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM);
                return NET_RX_DROP;
        }

        /* Charge it to the socket. */
        skb_dst_drop(skb);
        if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
                sk_skb_reason_drop(sk, skb, reason);
                return NET_RX_DROP;
        }

        return 0;
}

/*
 *        This is next to useless...
 *        if we demultiplex in network layer we don't need the extra call
 *        just to queue the skb...
 *        maybe we could have the network decide upon a hint if it
 *        should call raw_rcv for demultiplexing
 *
 *        Per-socket receive: runs the xfrm policy check, prepares the
 *        checksum state of @skb, then queues it via rawv6_rcv_skb().
 *        Returns NET_RX_DROP if the skb is dropped here (policy or
 *        checksum failure), otherwise 0.
 */
int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
{
        struct inet_sock *inet = inet_sk(sk);
        struct raw6_sock *rp = raw6_sk(sk);

        if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
                atomic_inc(&sk->sk_drops);
                sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY);
                return NET_RX_DROP;
        }
        nf_reset_ct(skb);

        /* Socket checksumming is off: mark the skb as needing no check */
        if (!rp->checksum)
                skb->ip_summed = CHECKSUM_UNNECESSARY;

        if (skb->ip_summed == CHECKSUM_COMPLETE) {
                /* Adjust skb->csum for the network header, then test it
                 * against the pseudo-header; a zero result means the
                 * checksum is already known to be good.
                 */
                skb_postpull_rcsum(skb, skb_network_header(skb),
                                   skb_network_header_len(skb));
                if (!csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
                                     &ipv6_hdr(skb)->daddr,
                                     skb->len, inet->inet_num, skb->csum))
                        skb->ip_summed = CHECKSUM_UNNECESSARY;
        }
        /* Otherwise seed skb->csum with the pseudo-header sum for a later
         * full verification.
         */
        if (!skb_csum_unnecessary(skb))
                skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
                                                         &ipv6_hdr(skb)->daddr,
                                                         skb->len,
                                                         inet->inet_num, 0));

        /* HDRINCL sockets get the checksum verified right here */
        if (inet_test_bit(HDRINCL, sk)) {
                if (skb_checksum_complete(skb)) {
                        atomic_inc(&sk->sk_drops);
                        sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM);
                        return NET_RX_DROP;
                }
        }

        /* The result of rawv6_rcv_skb() is deliberately not propagated */
        rawv6_rcv_skb(sk, skb);
        return 0;
}


/*
 *        This should be easy, if there is something there
 *        we return it, otherwise we block.
 *
 *        Returns the number of bytes copied into @msg (or the full
 *        datagram length when MSG_TRUNC is set in @flags), or a negative
 *        errno. MSG_OOB is rejected with -EOPNOTSUPP; MSG_ERRQUEUE and
 *        pending rxpmtu notifications are served by their own helpers.
 *        When @msg carries a name buffer, it is filled with the source
 *        address and *@addr_len is set.
 */

static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                         int flags, int *addr_len)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
        struct sk_buff *skb;
        size_t copied;
        int err;

        if (flags & MSG_OOB)
                return -EOPNOTSUPP;

        if (flags & MSG_ERRQUEUE)
                return ipv6_recv_error(sk, msg, len, addr_len);

        if (np->rxpmtu && np->rxopt.bits.rxpmtu)
                return ipv6_recv_rxpmtu(sk, msg, len, addr_len);

        /* On failure err has been set by skb_recv_datagram() */
        skb = skb_recv_datagram(sk, flags, &err);
        if (!skb)
                goto out;

        /* Clamp to the user buffer and flag truncation */
        copied = skb->len;
        if (copied > len) {
                copied = len;
                msg->msg_flags |= MSG_TRUNC;
        }

        if (skb_csum_unnecessary(skb)) {
                err = skb_copy_datagram_msg(skb, 0, msg, copied);
        } else if (msg->msg_flags&MSG_TRUNC) {
                /* Partial copy: verify the whole datagram first */
                if (__skb_checksum_complete(skb))
                        goto csum_copy_err;
                err = skb_copy_datagram_msg(skb, 0, msg, copied);
        } else {
                /* Full copy: checksum while copying; -EINVAL is handled
                 * as a checksum failure.
                 */
                err = skb_copy_and_csum_datagram_msg(skb, 0, msg);
                if (err == -EINVAL)
                        goto csum_copy_err;
        }
        if (err)
                goto out_free;

        /* Copy the address. */
        if (sin6) {
                sin6->sin6_family = AF_INET6;
                sin6->sin6_port = 0;
                sin6->sin6_addr = ipv6_hdr(skb)->saddr;
                sin6->sin6_flowinfo = 0;
                sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr,
                                                          inet6_iif(skb));
                *addr_len = sizeof(*sin6);
        }

        sock_recv_cmsgs(msg, sk, skb);

        if (np->rxopt.all)
                ip6_datagram_recv_ctl(sk, msg, skb);

        /* With MSG_TRUNC in flags, report the real datagram length */
        err = copied;
        if (flags & MSG_TRUNC)
                err = skb->len;

out_free:
        skb_free_datagram(sk, skb);
out:
        return err;

csum_copy_err:
        skb_kill_datagram(sk, skb, flags);

        /* Error for blocking case is chosen to masquerade
           as some normal condition.
         */
        err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
        goto out;
}

/*
 * Finalise the frames queued on the socket's write queue and hand them to
 * ip6_push_pending_frames().
 *
 * When rp->checksum is set, the 16-bit checksum field located rp->offset
 * bytes into the transport data is computed (the per-skb partial sums are
 * combined and folded with the pseudo-header by csum_ipv6_magic()) and
 * written in place before the frames are pushed.  When it is clear, the
 * queue is pushed untouched.
 *
 * Returns the result of ip6_push_pending_frames(); 0 if checksumming is
 * enabled but the write queue is empty; -EINVAL if a 2-byte field at
 * rp->offset does not fit inside the checksummed length; or the negative
 * error from skb_copy_bits().  On the last two paths the pending frames are
 * flushed.
 */
static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
                                     struct raw6_sock *rp)
{
        struct ipv6_txoptions *opt;
        struct sk_buff *skb;
        int err = 0;
        int offset;
        int len;
        int total_len;
        __wsum tmp_csum;
        __sum16 csum;

        /* Checksumming disabled for this socket: just transmit. */
        if (!rp->checksum)
                goto send;

        skb = skb_peek(&sk->sk_write_queue);
        if (!skb)
                goto out;

        /*
         * Length fed to the pseudo-header: the corked length minus
         * opt_flen when transmit options are corked.
         */
        offset = rp->offset;
        total_len = inet_sk(sk)->cork.base.length;
        opt = inet6_sk(sk)->cork.opt;
        total_len -= opt ? opt->opt_flen : 0;

        /* The two checksum bytes must lie entirely within total_len. */
        if (offset >= total_len - 1) {
                err = -EINVAL;
                ip6_flush_pending_frames(sk);
                goto out;
        }

        /* should be check HW csum miyazawa */
        if (skb_queue_len(&sk->sk_write_queue) == 1) {
                /*
                 * Only one fragment on the socket.
                 */
                tmp_csum = skb->csum;
        } else {
                struct sk_buff *csum_skb = NULL;
                tmp_csum = 0;

                /*
                 * Accumulate every fragment's partial checksum.  Along the
                 * way, locate the first skb whose transport data contains
                 * the checksum field; offset is reduced by the transport
                 * length of each skb skipped before it.
                 */
                skb_queue_walk(&sk->sk_write_queue, skb) {
                        tmp_csum = csum_add(tmp_csum, skb->csum);

                        if (csum_skb)
                                continue;

                        len = skb->len - skb_transport_offset(skb);
                        if (offset >= len) {
                                offset -= len;
                                continue;
                        }

                        csum_skb = skb;
                }

                skb = csum_skb;
        }

        /* Make offset relative to the start of the skb data. */
        offset += skb_transport_offset(skb);
        err = skb_copy_bits(skb, offset, &csum, 2);
        if (err < 0) {
                ip6_flush_pending_frames(sk);
                goto out;
        }

        /*
         * in case cksum was not initialized: a non-zero value currently
         * stored in the field is removed from the running sum.
         */
        if (unlikely(csum))
                tmp_csum = csum_sub(tmp_csum, csum_unfold(csum));

        csum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr,
                               total_len, fl6->flowi6_proto, tmp_csum);

        /* For UDP a computed checksum of zero is sent as CSUM_MANGLED_0. */
        if (csum == 0 && fl6->flowi6_proto == IPPROTO_UDP)
                csum = CSUM_MANGLED_0;

        BUG_ON(skb_store_bits(skb, offset, &csum, 2));

send:
        err = ip6_push_pending_frames(sk);
out:
        return err;
}

/*
 * Transmit one datagram whose IPv6 header is supplied by the caller
 * (rawv6_sendmsg() takes this path when the socket's HDRINCL bit is set).
 *
 * @length bytes are copied verbatim from @msg into a freshly allocated skb;
 * the network and transport header offsets both point at the start of that
 * data.  The skb takes over the route in *@dstp (which is then set to NULL)
 * and is sent through the NF_INET_LOCAL_OUT hook to dst_output().
 *
 * Returns 0 on success, when MSG_PROBE is set, or when l3mdev_ip6_out()
 * returns no skb; -EMSGSIZE if @length exceeds the device MTU (a local
 * error is also queued); -EINVAL if @length is smaller than an IPv6 header;
 * otherwise a negative errno.  -ENOBUFS is reported as 0 unless the socket
 * has RECVERR6 set.
 */
static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
                        struct flowi6 *fl6, struct dst_entry **dstp,
                        unsigned int flags, const struct sockcm_cookie *sockc)
{
        struct net *net = sock_net(sk);
        struct ipv6hdr *iph;
        struct sk_buff *skb;
        int err;
        struct rt6_info *rt = dst_rt6_info(*dstp);
        int hlen = LL_RESERVED_SPACE(rt->dst.dev);
        int tlen = rt->dst.dev->needed_tailroom;

        if (length > rt->dst.dev->mtu) {
                ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu);
                return -EMSGSIZE;
        }
        if (length < sizeof(struct ipv6hdr))
                return -EINVAL;
        /* MSG_PROBE: validation only, nothing is transmitted. */
        if (flags&MSG_PROBE)
                goto out;

        skb = sock_alloc_send_skb(sk,
                                  length + hlen + tlen + 15,
                                  flags & MSG_DONTWAIT, &err);
        if (!skb)
                goto error;
        skb_reserve(skb, hlen);

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = READ_ONCE(sk->sk_priority);
        skb->mark = sockc->mark;
        skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid);

        skb_put(skb, length);
        skb_reset_network_header(skb);
        iph = ipv6_hdr(skb);

        skb->ip_summed = CHECKSUM_NONE;

        skb_setup_tx_timestamp(skb, sockc->tsflags);

        if (flags & MSG_CONFIRM)
                skb_set_dst_pending_confirm(skb, 1);

        /* User data starts at the network header; copy it in unmodified. */
        skb->transport_header = skb->network_header;
        err = memcpy_from_msg(iph, msg, length);
        if (err) {
                err = -EFAULT;
                kfree_skb(skb);
                goto error;
        }

        /* The skb now holds the dst; clear the caller's pointer to it. */
        skb_dst_set(skb, &rt->dst);
        *dstp = NULL;

        /* if egress device is enslaved to an L3 master device pass the
         * skb to its handler for processing
         */
        skb = l3mdev_ip6_out(sk, skb);
        if (unlikely(!skb))
                return 0;

        /* Acquire rcu_read_lock() in case we need to use rt->rt6i_idev
         * in the error path. Since skb has been freed, the dst could
         * have been queued for deletion.
         */
        rcu_read_lock();
        IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
        err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb,
                      NULL, rt->dst.dev, dst_output);
        /* Positive results are translated with net_xmit_errno(). */
        if (err > 0)
                err = net_xmit_errno(err);
        if (err) {
                IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
                rcu_read_unlock();
                goto error_check;
        }
        rcu_read_unlock();
out:
        return 0;

error:
        IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
error_check:
        /* -ENOBUFS is only reported when the socket has RECVERR6 set. */
        if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk))
                err = 0;
        return err;
}

/*
 * Source descriptor handed to raw6_getfrag().
 *
 * rawv6_probe_proto_opt() copies the first hlen bytes of the message into
 * c[]; raw6_getfrag() then serves those bytes from c[] and everything after
 * them from msg.
 */
struct raw6_frag_vec {
        struct msghdr *msg;     /* user message supplying the payload */
        int hlen;               /* number of valid bytes in c[] (0, 2 or 4) */
        char c[4];              /* leading payload bytes already copied from msg */
};

/*
 * Copy the leading bytes of the message into rfv->c and use them to fill
 * the protocol-specific flow keys: ICMPv6 type/code (2 bytes) or the
 * Mobility Header type (third of 4 bytes).  For any other protocol nothing
 * is copied and rfv->hlen is left as the caller set it.
 *
 * Returns 0 on success or the error from memcpy_from_msg().
 */
static int rawv6_probe_proto_opt(struct raw6_frag_vec *rfv, struct flowi6 *fl6)
{
        int rc;

        if (fl6->flowi6_proto == IPPROTO_ICMPV6) {
                rfv->hlen = 2;
                rc = memcpy_from_msg(rfv->c, rfv->msg, rfv->hlen);
                if (rc)
                        return rc;

                fl6->fl6_icmp_type = rfv->c[0];
                fl6->fl6_icmp_code = rfv->c[1];
                return 0;
        }

        if (fl6->flowi6_proto == IPPROTO_MH) {
                rfv->hlen = 4;
                rc = memcpy_from_msg(rfv->c, rfv->msg, rfv->hlen);
                if (rc)
                        return rc;

                fl6->fl6_mh_type = rfv->c[2];
                return 0;
        }

        return 0;
}

static int raw6_getfrag(void *from, char *to, int offset, int len, int odd,
                       struct sk_buff *skb)
{
        struct raw6_frag_vec *rfv = from;

        if (offset < rfv->hlen) {
                int copy = min(rfv->hlen - offset, len);

                if (skb->ip_summed == CHECKSUM_PARTIAL)
                        memcpy(to, rfv->c + offset, copy);
                else
                        skb->csum = csum_block_add(
                                skb->csum,
                                csum_partial_copy_nocheck(rfv->c + offset,
                                                          to, copy),
                                odd);

                odd = 0;
                offset += copy;
                to += copy;
                len -= copy;

                if (!len)
                        return 0;
        }

        offset -= rfv->hlen;

        return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb);
}

/*
 * sendmsg() handler for raw IPv6 sockets (installed as rawv6_prot.sendmsg).
 *
 * Outline:
 *  1. reject oversized @len and MSG_OOB;
 *  2. take destination, protocol and flow label from msg->msg_name, or from
 *     the connected socket state when no address is given;
 *  3. parse ancillary data into a temporary ipv6_txoptions, otherwise fall
 *     back to the socket's options via txopt_get();
 *  4. for non-HDRINCL sockets, read the leading payload bytes to fill the
 *     ICMPv6 / MH flow keys (rawv6_probe_proto_opt());
 *  5. look up the route with ip6_dst_lookup_flow();
 *  6. transmit through rawv6_send_hdrinc() (HDRINCL) or through
 *     ip6_append_data() followed by rawv6_push_pending_frames().
 *
 * Returns @len when no error occurred, otherwise a negative errno.
 */
static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
        struct ipv6_txoptions *opt_to_free = NULL;
        struct ipv6_txoptions opt_space;
        DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
        struct in6_addr *daddr, *final_p, final;
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct raw6_sock *rp = raw6_sk(sk);
        struct ipv6_txoptions *opt = NULL;
        struct ip6_flowlabel *flowlabel = NULL;
        struct dst_entry *dst = NULL;
        struct raw6_frag_vec rfv;
        struct flowi6 fl6;
        struct ipcm6_cookie ipc6;
        int addr_len = msg->msg_namelen;
        int hdrincl;
        u16 proto;
        int err;

        /* Rough check on arithmetic overflow,
           better check is made in ip6_append_data().
         */
        if (len > INT_MAX)
                return -EMSGSIZE;

        /* Mirror BSD error message compatibility */
        if (msg->msg_flags & MSG_OOB)
                return -EOPNOTSUPP;

        hdrincl = inet_test_bit(HDRINCL, sk);

        /*
         *        Get and verify the address.
         */
        memset(&fl6, 0, sizeof(fl6));

        fl6.flowi6_mark = READ_ONCE(sk->sk_mark);
        fl6.flowi6_uid = sk->sk_uid;

        ipcm6_init(&ipc6);
        ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags);
        ipc6.sockc.mark = fl6.flowi6_mark;

        if (sin6) {
                if (addr_len < SIN6_LEN_RFC2133)
                        return -EINVAL;

                if (sin6->sin6_family && sin6->sin6_family != AF_INET6)
                        return -EAFNOSUPPORT;

                /* port is the proto value [0..255] carried in nexthdr */
                proto = ntohs(sin6->sin6_port);

                /*
                 * A zero port selects the socket's own protocol; a different
                 * protocol is only accepted on IPPROTO_RAW sockets.
                 */
                if (!proto)
                        proto = inet->inet_num;
                else if (proto != inet->inet_num &&
                         inet->inet_num != IPPROTO_RAW)
                        return -EINVAL;

                if (proto > 255)
                        return -EINVAL;

                daddr = &sin6->sin6_addr;
                if (inet6_test_bit(SNDFLOW, sk)) {
                        fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
                        if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
                                flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
                                if (IS_ERR(flowlabel))
                                        return -EINVAL;
                        }
                }

                /*
                 * Otherwise it will be difficult to maintain
                 * sk->sk_dst_cache.
                 */
                if (sk->sk_state == TCP_ESTABLISHED &&
                    ipv6_addr_equal(daddr, &sk->sk_v6_daddr))
                        daddr = &sk->sk_v6_daddr;

                /*
                 * sin6_scope_id is only read when the supplied address is at
                 * least a full sockaddr_in6 and the destination type needs a
                 * scope id.
                 */
                if (addr_len >= sizeof(struct sockaddr_in6) &&
                    sin6->sin6_scope_id &&
                    __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr)))
                        fl6.flowi6_oif = sin6->sin6_scope_id;
        } else {
                /* No address given: the socket must be connected. */
                if (sk->sk_state != TCP_ESTABLISHED)
                        return -EDESTADDRREQ;

                proto = inet->inet_num;
                daddr = &sk->sk_v6_daddr;
                fl6.flowlabel = np->flow_label;
        }

        if (fl6.flowi6_oif == 0)
                fl6.flowi6_oif = sk->sk_bound_dev_if;

        if (msg->msg_controllen) {
                /* Parse ancillary data into the on-stack option block. */
                opt = &opt_space;
                memset(opt, 0, sizeof(struct ipv6_txoptions));
                opt->tot_len = sizeof(struct ipv6_txoptions);
                ipc6.opt = opt;

                err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6);
                if (err < 0) {
                        fl6_sock_release(flowlabel);
                        return err;
                }
                /* A flow label may also have been set while parsing. */
                if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
                        flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
                        if (IS_ERR(flowlabel))
                                return -EINVAL;
                }
                /* Nothing option-like was supplied: ignore opt_space. */
                if (!(opt->opt_nflen|opt->opt_flen))
                        opt = NULL;
        }
        if (!opt) {
                /* Fall back to the socket's options; released at out:. */
                opt = txopt_get(np);
                opt_to_free = opt;
        }
        if (flowlabel)
                opt = fl6_merge_options(&opt_space, flowlabel, opt);
        opt = ipv6_fixup_options(&opt_space, opt);

        fl6.flowi6_proto = proto;
        fl6.flowi6_mark = ipc6.sockc.mark;

        if (!hdrincl) {
                rfv.msg = msg;
                rfv.hlen = 0;
                err = rawv6_probe_proto_opt(&rfv, &fl6);
                if (err)
                        goto out;
        }

        if (!ipv6_addr_any(daddr))
                fl6.daddr = *daddr;
        else
                fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
        if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr))
                fl6.saddr = np->saddr;

        final_p = fl6_update_dst(&fl6, opt, &final);

        /* No interface chosen yet: use the socket's mcast/ucast default. */
        if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
                fl6.flowi6_oif = READ_ONCE(np->mcast_oif);
        else if (!fl6.flowi6_oif)
                fl6.flowi6_oif = READ_ONCE(np->ucast_oif);
        security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));

        if (hdrincl)
                fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH;

        /* Negative values mean "not set by ancillary data". */
        if (ipc6.tclass < 0)
                ipc6.tclass = np->tclass;

        fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);

        dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
        if (IS_ERR(dst)) {
                err = PTR_ERR(dst);
                goto out;
        }
        if (ipc6.hlimit < 0)
                ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);

        if (ipc6.dontfrag < 0)
                ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk);

        if (msg->msg_flags&MSG_CONFIRM)
                goto do_confirm;

back_from_confirm:
        if (hdrincl)
                err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst,
                                        msg->msg_flags, &ipc6.sockc);
        else {
                ipc6.opt = opt;
                lock_sock(sk);
                err = ip6_append_data(sk, raw6_getfrag, &rfv,
                        len, 0, &ipc6, &fl6, dst_rt6_info(dst),
                        msg->msg_flags);

                /* With MSG_MORE the data stays queued for a later push. */
                if (err)
                        ip6_flush_pending_frames(sk);
                else if (!(msg->msg_flags & MSG_MORE))
                        err = rawv6_push_pending_frames(sk, &fl6, rp);
                release_sock(sk);
        }
done:
        dst_release(dst);
out:
        fl6_sock_release(flowlabel);
        txopt_put(opt_to_free);
        return err < 0 ? err : len;
do_confirm:
        /*
         * MSG_CONFIRM: with MSG_PROBE the neighbour is confirmed here; a
         * zero-length probe finishes without sending anything.
         */
        if (msg->msg_flags & MSG_PROBE)
                dst_confirm_neigh(dst, &fl6.daddr);
        if (!(msg->msg_flags & MSG_PROBE) || len)
                goto back_from_confirm;
        err = 0;
        goto done;
}

/*
 * Set the socket's ICMPv6 filter from user memory.
 *
 * Only ICMPV6_FILTER is understood; any other option name yields
 * -ENOPROTOOPT.  At most sizeof(struct icmp6_filter) bytes are copied.
 * Returns 0 on success or -EFAULT if the copy fails.
 */
static int rawv6_seticmpfilter(struct sock *sk, int optname,
                               sockptr_t optval, int optlen)
{
        if (optname != ICMPV6_FILTER)
                return -ENOPROTOOPT;

        /* Clamp the copy to the size of the filter. */
        if (optlen > sizeof(struct icmp6_filter))
                optlen = sizeof(struct icmp6_filter);

        if (copy_from_sockptr(&raw6_sk(sk)->filter, optval, optlen))
                return -EFAULT;

        return 0;
}

/*
 * Copy the socket's ICMPv6 filter out to user memory.
 *
 * Only ICMPV6_FILTER is understood; any other option name yields
 * -ENOPROTOOPT.  The user-supplied length is clamped to
 * sizeof(struct icmp6_filter) and written back before the filter is copied.
 * Returns 0 on success, -EINVAL for a negative length, or -EFAULT if a user
 * access fails.
 */
static int rawv6_geticmpfilter(struct sock *sk, int optname,
                               char __user *optval, int __user *optlen)
{
        int ulen;

        if (optname != ICMPV6_FILTER)
                return -ENOPROTOOPT;

        if (get_user(ulen, optlen))
                return -EFAULT;
        if (ulen < 0)
                return -EINVAL;

        if (ulen > sizeof(struct icmp6_filter))
                ulen = sizeof(struct icmp6_filter);

        if (put_user(ulen, optlen) ||
            copy_to_user(optval, &raw6_sk(sk)->filter, ulen))
                return -EFAULT;

        return 0;
}


/*
 * Handle the raw-socket specific options IPV6_HDRINCL and IPV6_CHECKSUM.
 *
 * The option value is always read as an int first, so a short @optlen
 * (-EINVAL) or a faulting copy (-EFAULT) is reported before the option name
 * is examined.  Unknown option names yield -ENOPROTOOPT.
 */
static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
                               sockptr_t optval, unsigned int optlen)
{
        struct raw6_sock *rp = raw6_sk(sk);
        int val;

        if (optlen < sizeof(val))
                return -EINVAL;

        if (copy_from_sockptr(&val, optval, sizeof(val)))
                return -EFAULT;

        if (optname == IPV6_HDRINCL) {
                if (sk->sk_type != SOCK_RAW)
                        return -EINVAL;
                inet_assign_bit(HDRINCL, sk, val);
                return 0;
        }

        if (optname != IPV6_CHECKSUM)
                return -ENOPROTOOPT;

        /*
         * RFC3542 does not allow the IPPROTO_IPV6-level IPV6_CHECKSUM
         * option on ICMPv6 sockets.  The IPPROTO_RAW-level variant
         * (a Linux extension) remains available for them.
         */
        if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 &&
            level == IPPROTO_IPV6)
                return -EINVAL;

        /* Positive odd offsets are refused (see RFC2292bis). */
        if (val > 0 && (val&1))
                return -EINVAL;

        /* A negative value switches checksumming off. */
        if (val >= 0) {
                rp->checksum = 1;
                rp->offset = val;
        } else {
                rp->checksum = 0;
        }

        return 0;
}

/*
 * setsockopt() dispatcher for raw IPv6 sockets.
 *
 *  - SOL_ICMPV6: ICMPv6 filter, only on IPPROTO_ICMPV6 sockets
 *    (-EOPNOTSUPP otherwise);
 *  - SOL_RAW, or SOL_IPV6 with IPV6_CHECKSUM / IPV6_HDRINCL: handled by
 *    do_rawv6_setsockopt();
 *  - everything else is forwarded to ipv6_setsockopt().
 */
static int rawv6_setsockopt(struct sock *sk, int level, int optname,
                            sockptr_t optval, unsigned int optlen)
{
        if (level == SOL_ICMPV6) {
                if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
                        return -EOPNOTSUPP;
                return rawv6_seticmpfilter(sk, optname, optval, optlen);
        }

        if (level == SOL_RAW ||
            (level == SOL_IPV6 &&
             (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL)))
                return do_rawv6_setsockopt(sk, level, optname, optval,
                                           optlen);

        return ipv6_setsockopt(sk, level, optname, optval, optlen);
}

/*
 * Report the raw-socket specific options IPV6_HDRINCL and IPV6_CHECKSUM.
 *
 * IPV6_CHECKSUM reads back as -1 when checksumming is disabled, otherwise
 * as the configured offset.  At most sizeof(int) bytes are returned and the
 * length actually used is written back to @optlen.
 *
 * Returns 0 on success, -ENOPROTOOPT for other option names, or -EFAULT if
 * a user access fails.
 */
static int do_rawv6_getsockopt(struct sock *sk, int level, int optname,
                            char __user *optval, int __user *optlen)
{
        struct raw6_sock *rp = raw6_sk(sk);
        int val, len;

        if (get_user(len, optlen))
                return -EFAULT;

        if (optname == IPV6_HDRINCL) {
                val = inet_test_bit(HDRINCL, sk);
        } else if (optname == IPV6_CHECKSUM) {
                /*
                 * getsockopt() of the IPPROTO_IPV6-level IPV6_CHECKSUM
                 * option is permitted on ICMPv6 sockets too, because
                 * RFC3542 says nothing against it.
                 */
                val = rp->checksum == 0 ? -1 : rp->offset;
        } else {
                return -ENOPROTOOPT;
        }

        len = min_t(unsigned int, sizeof(int), len);

        if (put_user(len, optlen) || copy_to_user(optval, &val, len))
                return -EFAULT;

        return 0;
}

/*
 * getsockopt() dispatcher for raw IPv6 sockets.
 *
 *  - SOL_ICMPV6: ICMPv6 filter, only on IPPROTO_ICMPV6 sockets
 *    (-EOPNOTSUPP otherwise);
 *  - SOL_RAW, or SOL_IPV6 with IPV6_CHECKSUM / IPV6_HDRINCL: handled by
 *    do_rawv6_getsockopt();
 *  - everything else is forwarded to ipv6_getsockopt().
 */
static int rawv6_getsockopt(struct sock *sk, int level, int optname,
                          char __user *optval, int __user *optlen)
{
        if (level == SOL_ICMPV6) {
                if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
                        return -EOPNOTSUPP;
                return rawv6_geticmpfilter(sk, optname, optval, optlen);
        }

        if (level == SOL_RAW ||
            (level == SOL_IPV6 &&
             (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL)))
                return do_rawv6_getsockopt(sk, level, optname, optval,
                                           optlen);

        return ipv6_getsockopt(sk, level, optname, optval, optlen);
}

/*
 * ioctl handler for raw IPv6 sockets.
 *
 *  - SIOCOUTQ: stores sk_wmem_alloc_get(sk) in *@karg;
 *  - SIOCINQ:  stores the length of the skb at the head of the receive
 *    queue (0 if the queue is empty), read under the queue lock;
 *  - anything else goes to ip6mr_ioctl() when CONFIG_IPV6_MROUTE is
 *    enabled, and is -ENOIOCTLCMD otherwise.
 */
static int rawv6_ioctl(struct sock *sk, int cmd, int *karg)
{
        struct sk_buff *head;

        if (cmd == SIOCOUTQ) {
                *karg = sk_wmem_alloc_get(sk);
                return 0;
        }

        if (cmd == SIOCINQ) {
                spin_lock_bh(&sk->sk_receive_queue.lock);
                head = skb_peek(&sk->sk_receive_queue);
                *karg = head ? head->len : 0;
                spin_unlock_bh(&sk->sk_receive_queue.lock);
                return 0;
        }

#ifdef CONFIG_IPV6_MROUTE
        return ip6mr_ioctl(sk, cmd, karg);
#else
        return -ENOIOCTLCMD;
#endif
}

#ifdef CONFIG_COMPAT
/*
 * Compat ioctl handler for raw IPv6 sockets.
 *
 * SIOCOUTQ and SIOCINQ are answered with -ENOIOCTLCMD here.  Every other
 * command goes to ip6mr_compat_ioctl() when CONFIG_IPV6_MROUTE is enabled,
 * and is -ENOIOCTLCMD otherwise.
 */
static int compat_rawv6_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
{
        if (cmd == SIOCOUTQ || cmd == SIOCINQ)
                return -ENOIOCTLCMD;

#ifdef CONFIG_IPV6_MROUTE
        return ip6mr_compat_ioctl(sk, cmd, compat_ptr(arg));
#else
        return -ENOIOCTLCMD;
#endif
}
#endif

/*
 * close handler for raw IPv6 sockets (installed as rawv6_prot.close).
 * @timeout is not used.
 */
static void rawv6_close(struct sock *sk, long timeout)
{
        /*
         * IPPROTO_RAW sockets only.
         * NOTE(review): a selector of -1 presumably drops any router-alert
         * registration held by this socket; confirm in ip6_ra_control().
         */
        if (inet_sk(sk)->inet_num == IPPROTO_RAW)
                ip6_ra_control(sk, -1);
        ip6mr_sk_done(sk);
        sk_common_release(sk);
}

/*
 * destroy handler (installed as rawv6_prot.destroy): flush any frames still
 * pending on the socket, holding the socket lock while doing so.
 */
static void raw6_destroy(struct sock *sk)
{
        lock_sock(sk);
        ip6_flush_pending_frames(sk);
        release_sock(sk);
}

/*
 * init handler (installed as rawv6_prot.init).
 *
 * Enables checksumming by default for the two protocols with a known
 * checksum position: offset 2 for ICMPv6 and offset 4 for the Mobility
 * Header.  Other protocols are left untouched.  Always returns 0.
 */
static int rawv6_init_sk(struct sock *sk)
{
        struct raw6_sock *rp = raw6_sk(sk);
        int proto = inet_sk(sk)->inet_num;
        int csum_offset;

        if (proto == IPPROTO_ICMPV6)
                csum_offset = 2;
        else if (proto == IPPROTO_MH)
                csum_offset = 4;
        else
                return 0;

        rp->checksum = 1;
        rp->offset   = csum_offset;
        return 0;
}

/*
 * Protocol operations for raw IPv6 sockets; rawv6_protosw.prot points here.
 * The useroffset/usersize pair covers the 'filter' member of
 * struct raw6_sock.
 */
struct proto rawv6_prot = {
        .name                   = "RAWv6",
        .owner                   = THIS_MODULE,
        .close                   = rawv6_close,
        .destroy           = raw6_destroy,
        .connect           = ip6_datagram_connect_v6_only,
        .disconnect           = __udp_disconnect,
        .ioctl                   = rawv6_ioctl,
        .init                   = rawv6_init_sk,
        .setsockopt           = rawv6_setsockopt,
        .getsockopt           = rawv6_getsockopt,
        .sendmsg           = rawv6_sendmsg,
        .recvmsg           = rawv6_recvmsg,
        .bind                   = rawv6_bind,
        .backlog_rcv           = rawv6_rcv_skb,
        .hash                   = raw_hash_sk,
        .unhash                   = raw_unhash_sk,
        .obj_size           = sizeof(struct raw6_sock),
        .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6),
        .useroffset           = offsetof(struct raw6_sock, filter),
        .usersize           = sizeof_field(struct raw6_sock, filter),
        .h.raw_hash           = &raw_v6_hashinfo,
#ifdef CONFIG_COMPAT
        .compat_ioctl           = compat_rawv6_ioctl,
#endif
        .diag_destroy           = raw_abort,
};

#ifdef CONFIG_PROC_FS
/*
 * seq_file show callback for the "raw6" proc entry.
 *
 * Emits the header line for SEQ_START_TOKEN; otherwise prints one socket
 * through ip6_dgram_sock_seq_show(), passing inet_num as the source value,
 * 0 as the destination value and the iterator's current bucket.
 * Always returns 0.
 */
static int raw6_seq_show(struct seq_file *seq, void *v)
{
        struct sock *sp;
        __u16 proto_num;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq, IPV6_SEQ_DGRAM_HEADER);
                return 0;
        }

        sp = v;
        proto_num = inet_sk(sp)->inet_num;
        ip6_dgram_sock_seq_show(seq, sp, proto_num, 0,
                                raw_seq_private(seq)->bucket);
        return 0;
}

/*
 * seq_file operations for the "raw6" proc entry: the raw_seq_* iterator
 * callbacks combined with the IPv6-specific raw6_seq_show().
 */
static const struct seq_operations raw6_seq_ops = {
        .start =        raw_seq_start,
        .next =                raw_seq_next,
        .stop =                raw_seq_stop,
        .show =                raw6_seq_show,
};

/*
 * Per-namespace init: create the read-only (0444) "raw6" entry under the
 * namespace's proc_net directory, backed by raw6_seq_ops and
 * raw_v6_hashinfo.  Returns 0 on success or -ENOMEM if the entry could not
 * be created.
 */
static int __net_init raw6_init_net(struct net *net)
{
        if (proc_create_net_data("raw6", 0444, net->proc_net, &raw6_seq_ops,
                                 sizeof(struct raw_iter_state),
                                 &raw_v6_hashinfo))
                return 0;

        return -ENOMEM;
}

/* Per-namespace exit: remove the "raw6" entry created by raw6_init_net(). */
static void __net_exit raw6_exit_net(struct net *net)
{
        remove_proc_entry("raw6", net->proc_net);
}

/* Per-network-namespace hooks that create and remove the "raw6" proc entry. */
static struct pernet_operations raw6_net_ops = {
        .init = raw6_init_net,
        .exit = raw6_exit_net,
};

/*
 * Register raw6_net_ops as a pernet subsystem.
 * Returns the result of register_pernet_subsys().
 */
int __init raw6_proc_init(void)
{
        return register_pernet_subsys(&raw6_net_ops);
}

/* Undo raw6_proc_init(): unregister the raw6_net_ops pernet subsystem. */
void raw6_proc_exit(void)
{
        unregister_pernet_subsys(&raw6_net_ops);
}
#endif        /* CONFIG_PROC_FS */

/*
 * Socket-layer operations for raw IPv6 sockets; rawv6_protosw.ops points
 * here.  Same as inet6_dgram_ops, sans udp_poll (datagram_poll is used
 * instead).
 */
const struct proto_ops inet6_sockraw_ops = {
        .family                   = PF_INET6,
        .owner                   = THIS_MODULE,
        .release           = inet6_release,
        .bind                   = inet6_bind,
        .connect           = inet_dgram_connect,        /* ok                */
        .socketpair           = sock_no_socketpair,        /* a do nothing        */
        .accept                   = sock_no_accept,                /* a do nothing        */
        .getname           = inet6_getname,
        .poll                   = datagram_poll,                /* ok                */
        .ioctl                   = inet6_ioctl,                /* must change  */
        .gettstamp           = sock_gettstamp,
        .listen                   = sock_no_listen,                /* ok                */
        .shutdown           = inet_shutdown,                /* ok                */
        .setsockopt           = sock_common_setsockopt,        /* ok                */
        .getsockopt           = sock_common_getsockopt,        /* ok                */
        .sendmsg           = inet_sendmsg,                /* ok                */
        .recvmsg           = sock_common_recvmsg,        /* ok                */
        .mmap                   = sock_no_mmap,
#ifdef CONFIG_COMPAT
        .compat_ioctl           = inet6_compat_ioctl,
#endif
};

/*
 * Protocol switch entry for SOCK_RAW: ties rawv6_prot to inet6_sockraw_ops.
 * Registered by rawv6_init() and unregistered by rawv6_exit().
 */
static struct inet_protosw rawv6_protosw = {
        .type                = SOCK_RAW,
        .protocol        = IPPROTO_IP,        /* wild card */
        .prot                = &rawv6_prot,
        .ops                = &inet6_sockraw_ops,
        .flags                = INET_PROTOSW_REUSE,
};

/*
 * Register the raw IPv6 protocol switch entry.
 * Returns the result of inet6_register_protosw().
 */
int __init rawv6_init(void)
{
        return inet6_register_protosw(&rawv6_protosw);
}

/* Undo rawv6_init(): unregister the raw IPv6 protocol switch entry. */
void rawv6_exit(void)
{
        inet6_unregister_protosw(&rawv6_protosw);
}












































































































































































































    2 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _MM_PERCPU_INTERNAL_H
#define _MM_PERCPU_INTERNAL_H

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/memcontrol.h>

/*
 * pcpu_block_md is the metadata block struct.
 * Each chunk's bitmap is split into a number of full blocks.
 * All units are in terms of bits.
 *
 * The scan hint is the largest known contiguous area before the contig hint.
 * It is not necessarily the actual largest contig hint though.  There is an
 * invariant that the scan_hint_start > contig_hint_start iff
 * scan_hint == contig_hint.  This is necessary because when scanning forward,
 * we don't know if a new contig hint would be better than the current one.
 */
struct pcpu_block_md {
        int                        scan_hint;        /* scan hint for block */
        int                        scan_hint_start; /* block-relative starting
                                                    position of the scan hint */
        int                     contig_hint;    /* contig hint for block */
        int                     contig_hint_start; /* block-relative starting
                                                      position of the contig hint */
        int                     left_free;      /* size of free space along
                                                   the left side of the block */
        int                     right_free;     /* size of free space along
                                                   the right side of the block */
        int                     first_free;     /* block position of first free */
        int                        nr_bits;        /* total bits this block is
                                                   responsible for */
};

/*
 * Per-object extension record.  Which members exist depends on the kernel
 * configuration; struct pcpu_chunk holds a vector of these in obj_exts when
 * NEED_PCPUOBJ_EXT is defined.
 */
struct pcpuobj_ext {
#ifdef CONFIG_MEMCG_KMEM
        struct obj_cgroup        *cgroup;        /* object cgroup of the object */
#endif
#ifdef CONFIG_MEM_ALLOC_PROFILING
        union codetag_ref        tag;                /* code tag reference */
#endif
};

#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MEM_ALLOC_PROFILING)
#define NEED_PCPUOBJ_EXT
#endif

/*
 * Descriptor of one per-CPU chunk: free-space accounting, the allocation
 * and boundary bitmaps with their per-block metadata, and a trailing
 * flexible bitmap of populated pages.
 */
struct pcpu_chunk {
#ifdef CONFIG_PERCPU_STATS
        int                        nr_alloc;        /* # of allocations */
        size_t                        max_alloc_size; /* largest allocation size */
#endif

        struct list_head        list;                /* linked to pcpu_slot lists */
        int                        free_bytes;        /* free bytes in the chunk */
        struct pcpu_block_md        chunk_md;        /* chunk-wide hint metadata */
        unsigned long                *bound_map;        /* boundary map */

        /*
         * base_addr is the base address of this chunk.
         * To reduce false sharing, the current layout is optimized to make
         * sure base_addr is located in a different cacheline from
         * free_bytes and chunk_md.
         */
        void                        *base_addr ____cacheline_aligned_in_smp;

        unsigned long                *alloc_map;        /* allocation map */
        struct pcpu_block_md        *md_blocks;        /* metadata blocks */

        void                        *data;                /* chunk data */
        bool                        immutable;        /* no [de]population allowed */
        bool                        isolated;        /* isolated from active chunk
                                                   slots */
        int                        start_offset;        /* the overlap with the previous
                                                   region to have a page aligned
                                                   base_addr */
        int                        end_offset;        /* additional area required to
                                                   have the region end page
                                                   aligned */
#ifdef NEED_PCPUOBJ_EXT
        struct pcpuobj_ext        *obj_exts;        /* vector of per-object
                                                   extension records */
#endif

        int                        nr_pages;        /* # of pages served by this chunk */
        int                        nr_populated;        /* # of populated pages */
        int                     nr_empty_pop_pages; /* # of empty populated pages */
        unsigned long                populated[];        /* populated bitmap */
};

/*
 * need_pcpuobj_ext - tell whether per-object extension data is wanted
 *
 * Returns true when CONFIG_MEM_ALLOC_PROFILING is enabled, or when
 * mem_cgroup_kmem_disabled() reports false; returns false otherwise.
 */
static inline bool need_pcpuobj_ext(void)
{
        return IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING) ||
               !mem_cgroup_kmem_disabled();
}

extern spinlock_t pcpu_lock;

extern struct list_head *pcpu_chunk_lists;
extern int pcpu_nr_slots;
extern int pcpu_sidelined_slot;
extern int pcpu_to_depopulate_slot;
extern int pcpu_nr_empty_pop_pages;

extern struct pcpu_chunk *pcpu_first_chunk;
extern struct pcpu_chunk *pcpu_reserved_chunk;

/**
 * pcpu_chunk_nr_blocks - number of md_blocks covering a chunk
 * @chunk: chunk of interest
 *
 * Translates the number of pages served by @chunk into the number of
 * PCPU_BITMAP_BLOCK_SIZE-sized bitmap blocks.
 */
static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk)
{
        int nr_pages = chunk->nr_pages;

        return nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE;
}

/**
 * pcpu_nr_pages_to_map_bits - bitmap size needed for a number of pages
 * @pages: number of physical pages
 *
 * Returns how many bitmap bits are required to describe @pages pages,
 * one bit per PCPU_MIN_ALLOC_SIZE bytes.
 */
static inline int pcpu_nr_pages_to_map_bits(int pages)
{
        const int nr_bits = pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;

        return nr_bits;
}

/**
 * pcpu_chunk_map_bits - bitmap size for the pages served by a chunk
 * @chunk: chunk of interest
 *
 * Convenience wrapper that feeds @chunk->nr_pages to
 * pcpu_nr_pages_to_map_bits().
 */
static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
{
        int nr_pages = chunk->nr_pages;

        return pcpu_nr_pages_to_map_bits(nr_pages);
}

/**
 * pcpu_obj_full_size - total size charged for one accounted object
 * @size: size of area to allocate in bytes
 *
 * The result is @size multiplied by the number of possible CPUs.  When
 * CONFIG_MEMCG_KMEM is set and kmemcg is not disabled, the space used to
 * store one obj_cgroup pointer per PCPU_MIN_ALLOC_SIZE unit of @size is
 * charged on top of that.
 */
static inline size_t pcpu_obj_full_size(size_t size)
{
        size_t memcg_bytes = 0;
        size_t percpu_bytes;

#ifdef CONFIG_MEMCG_KMEM
        if (!mem_cgroup_kmem_disabled()) {
                size_t nr_units = size / PCPU_MIN_ALLOC_SIZE;

                memcg_bytes = nr_units * sizeof(struct obj_cgroup *);
        }
#endif

        percpu_bytes = size * num_possible_cpus();

        return percpu_bytes + memcg_bytes;
}

#ifdef CONFIG_PERCPU_STATS

#include <linux/spinlock.h>

/*
 * Global percpu allocator statistics, only built with CONFIG_PERCPU_STATS.
 * The counters are maintained by the pcpu_stats_*() helpers below.
 */
struct percpu_stats {
        u64 nr_alloc;                /* lifetime # of allocations */
        u64 nr_dealloc;                /* lifetime # of deallocations */
        u64 nr_cur_alloc;        /* current # of allocations */
        u64 nr_max_alloc;        /* max # of live allocations */
        u32 nr_chunks;                /* current # of live chunks */
        u32 nr_max_chunks;        /* max # of live chunks */
        size_t min_alloc_size;        /* min allocation size */
        size_t max_alloc_size;        /* max allocation size */
};

extern struct percpu_stats pcpu_stats;
extern struct pcpu_alloc_info pcpu_stats_ai;

/*
 * For debug purposes. We don't care about the flexible array.
 */
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
{
        memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info));

        /* initialize min_alloc_size to unit_size */
        pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size;
}

/*
 * pcpu_stats_area_alloc - account for a newly allocated area
 * @chunk: the location of the area being allocated
 * @size: size of area to allocate in bytes
 *
 * Updates the global counters and size extremes as well as the per-chunk
 * allocation count and largest allocation size.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
{
        lockdep_assert_held(&pcpu_lock);

        /* global allocation counters */
        pcpu_stats.nr_alloc++;
        pcpu_stats.nr_cur_alloc++;
        if (pcpu_stats.nr_cur_alloc > pcpu_stats.nr_max_alloc)
                pcpu_stats.nr_max_alloc = pcpu_stats.nr_cur_alloc;

        /* global size extremes */
        if (size < pcpu_stats.min_alloc_size)
                pcpu_stats.min_alloc_size = size;
        if (size > pcpu_stats.max_alloc_size)
                pcpu_stats.max_alloc_size = size;

        /* per-chunk bookkeeping */
        chunk->nr_alloc++;
        chunk->max_alloc_size = max(chunk->max_alloc_size, size);
}

/*
 * pcpu_stats_area_dealloc - account for a freed area
 * @chunk: the location of the area being deallocated
 *
 * Bumps the lifetime deallocation count and lowers both the global and
 * the per-chunk count of live allocations.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
{
        lockdep_assert_held(&pcpu_lock);

        pcpu_stats.nr_dealloc += 1;
        pcpu_stats.nr_cur_alloc -= 1;

        chunk->nr_alloc -= 1;
}

/*
 * pcpu_stats_chunk_alloc - account for a newly created chunk
 *
 * Takes pcpu_lock itself (irqsave), increments the live chunk count and
 * raises the recorded maximum when the new count exceeds it.
 */
static inline void pcpu_stats_chunk_alloc(void)
{
        unsigned long irq_flags;

        spin_lock_irqsave(&pcpu_lock, irq_flags);

        pcpu_stats.nr_chunks++;
        if (pcpu_stats.nr_chunks > pcpu_stats.nr_max_chunks)
                pcpu_stats.nr_max_chunks = pcpu_stats.nr_chunks;

        spin_unlock_irqrestore(&pcpu_lock, irq_flags);
}

/*
 * pcpu_stats_chunk_dealloc - account for a destroyed chunk
 *
 * Takes pcpu_lock itself (irqsave) and decrements the live chunk count.
 */
static inline void pcpu_stats_chunk_dealloc(void)
{
        unsigned long irq_flags;

        spin_lock_irqsave(&pcpu_lock, irq_flags);
        pcpu_stats.nr_chunks -= 1;
        spin_unlock_irqrestore(&pcpu_lock, irq_flags);
}

#else

/*
 * No-op variants of the statistics helpers, used when CONFIG_PERCPU_STATS
 * is not set.  They keep the same signatures so callers need no #ifdefs.
 */
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
{
}

static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
{
}

static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
{
}

static inline void pcpu_stats_chunk_alloc(void)
{
}

static inline void pcpu_stats_chunk_dealloc(void)
{
}

#endif /* !CONFIG_PERCPU_STATS */

#endif











































    1 














    1 



    1 





    1 


    1 






    1 


    1 



    1 






















































































































































    1 















    1 






























    1 









    1 

    1 




















    1 














    1 
    1 


















































































































































































    1 














    1 















































    1 



































    1 






    1 






    1 









































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Scatterlist Cryptographic API.
 *
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 * Copyright (c) 2002 David S. Miller (davem@redhat.com)
 * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
 *
 * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no>
 * and Nettle, by Niels Möller.
 */

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/param.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/completion.h>
#include "internal.h"

/*
 * List searched by __crypto_alg_lookup(); crypto_larval_add() inserts
 * larvals into it and crypto_larval_kill() removes them.
 */
LIST_HEAD(crypto_alg_list);
EXPORT_SYMBOL_GPL(crypto_alg_list);
/* rwsem held around the accesses to crypto_alg_list made in this file. */
DECLARE_RWSEM(crypto_alg_sem);
EXPORT_SYMBOL_GPL(crypto_alg_sem);

/* Notifier chain invoked by crypto_probing_notify(). */
BLOCKING_NOTIFIER_HEAD(crypto_chain);
EXPORT_SYMBOL_GPL(crypto_chain);

#ifndef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS
/*
 * Static key, initially false, only defined when the manager self-tests
 * are not disabled.  NOTE(review): presumably what
 * crypto_boot_test_finished() reads -- confirm in internal.h.
 */
DEFINE_STATIC_KEY_FALSE(__crypto_boot_test_finished);
EXPORT_SYMBOL_GPL(__crypto_boot_test_finished);
#endif

static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);

/*
 * crypto_mod_get - take a module reference and an algorithm reference
 * @alg: algorithm to pin
 *
 * Returns the result of crypto_alg_get(@alg) when try_module_get() on
 * @alg->cra_module succeeds, or NULL when it fails.
 */
struct crypto_alg *crypto_mod_get(struct crypto_alg *alg)
{
        if (!try_module_get(alg->cra_module))
                return NULL;

        return crypto_alg_get(alg);
}
EXPORT_SYMBOL_GPL(crypto_mod_get);

/*
 * crypto_mod_put - drop the references taken by crypto_mod_get()
 * @alg: algorithm to release
 *
 * The owning module pointer is sampled before the algorithm reference is
 * dropped and @alg is not touched afterwards (presumably because the put
 * may release @alg -- confirm against crypto_alg_put()).
 */
void crypto_mod_put(struct crypto_alg *alg)
{
        struct module *owner = alg->cra_module;

        crypto_alg_put(alg);
        module_put(owner);
}
EXPORT_SYMBOL_GPL(crypto_mod_put);

/*
 * __crypto_alg_lookup - scan crypto_alg_list for the best match for @name
 * @name: driver name (exact match) or algorithm name (fuzzy match)
 * @type: flag bits the candidate must carry, where selected by @mask
 * @mask: which bits of @type are compared against cra_flags
 *
 * Both callers in this file hold crypto_alg_sem around the call.
 *
 * Returns the chosen entry with a reference taken via crypto_mod_get(),
 * or NULL when nothing matched.
 */
static struct crypto_alg *__crypto_alg_lookup(const char *name, u32 type,
                                              u32 mask)
{
        struct crypto_alg *q, *alg = NULL;
        /*
         * Starts below the -1 priority that crypto_larval_alloc() assigns,
         * so that a larval can still win a fuzzy match.
         */
        int best = -2;

        list_for_each_entry(q, &crypto_alg_list, cra_list) {
                int exact, fuzzy;

                if (crypto_is_moribund(q))
                        continue;

                /* Any masked flag bit that differs from @type rejects q. */
                if ((q->cra_flags ^ type) & mask)
                        continue;

                /* A non-test larval only matches a lookup with its own mask. */
                if (crypto_is_larval(q) &&
                    !crypto_is_test_larval((struct crypto_larval *)q) &&
                    ((struct crypto_larval *)q)->mask != mask)
                        continue;

                exact = !strcmp(q->cra_driver_name, name);
                fuzzy = !strcmp(q->cra_name, name);
                /*
                 * A driver-name match is always accepted; an algorithm-name
                 * match only when it beats the best priority seen so far.
                 */
                if (!exact && !(fuzzy && q->cra_priority > best))
                        continue;

                if (unlikely(!crypto_mod_get(q)))
                        continue;

                /* q is the new candidate: release the one it replaces. */
                best = q->cra_priority;
                if (alg)
                        crypto_mod_put(alg);
                alg = q;

                /* Nothing can beat a driver-name match, stop scanning. */
                if (exact)
                        break;
        }

        return alg;
}

/*
 * crypto_larval_destroy - cra_destroy hook installed by crypto_larval_alloc()
 * @alg: the embedded algorithm of the larval to free
 *
 * Releases the reference held on the adult, if one was attached and it is
 * not an error pointer, then frees the larval.
 */
static void crypto_larval_destroy(struct crypto_alg *alg)
{
        struct crypto_larval *larval = (void *)alg;
        struct crypto_alg *adult;

        BUG_ON(!crypto_is_larval(alg));

        adult = larval->adult;
        if (!IS_ERR_OR_NULL(adult))
                crypto_mod_put(adult);

        kfree(larval);
}

/*
 * crypto_larval_alloc - allocate and initialise a larval
 * @name: algorithm name, copied into cra_name (at most CRYPTO_MAX_ALG_NAME)
 * @type: flag bits recorded in cra_flags next to CRYPTO_ALG_LARVAL
 * @mask: lookup mask recorded in the larval
 *
 * The larval gets priority -1 and crypto_larval_destroy() as its destroy
 * hook.  The reference count is not set here.
 *
 * Returns the new larval, or ERR_PTR(-ENOMEM) if the allocation fails.
 */
struct crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask)
{
        struct crypto_larval *lv = kzalloc(sizeof(*lv), GFP_KERNEL);

        if (!lv)
                return ERR_PTR(-ENOMEM);

        lv->mask = mask;

        lv->alg.cra_flags = CRYPTO_ALG_LARVAL | type;
        lv->alg.cra_priority = -1;
        lv->alg.cra_destroy = crypto_larval_destroy;
        strscpy(lv->alg.cra_name, name, CRYPTO_MAX_ALG_NAME);

        init_completion(&lv->completion);

        return lv;
}
EXPORT_SYMBOL_GPL(crypto_larval_alloc);

/*
 * crypto_larval_add - insert a larval for @name unless a match now exists
 * @name: algorithm name
 * @type: requested flag bits
 * @mask: lookup mask
 *
 * A larval is allocated up front with its reference count set to 2.  Under
 * the write side of crypto_alg_sem the list is searched again:
 *  - nothing found: the new larval is added to crypto_alg_list and returned;
 *  - something found: the new larval is freed and the found entry is
 *    returned, after waiting on it if it is itself a larval.
 *
 * Returns an algorithm or larval, or an error pointer.
 */
static struct crypto_alg *crypto_larval_add(const char *name, u32 type,
                                            u32 mask)
{
        struct crypto_larval *larval;
        struct crypto_alg *found;

        larval = crypto_larval_alloc(name, type, mask);
        if (IS_ERR(larval))
                return ERR_CAST(larval);

        refcount_set(&larval->alg.cra_refcnt, 2);

        down_write(&crypto_alg_sem);
        found = __crypto_alg_lookup(name, type, mask);
        if (!found)
                list_add(&larval->alg.cra_list, &crypto_alg_list);
        up_write(&crypto_alg_sem);

        /* Our larval went onto the list: hand it to the caller. */
        if (!found)
                return &larval->alg;

        /* Somebody else got there first; our larval was never published. */
        kfree(larval);

        if (crypto_is_larval(found))
                found = crypto_larval_wait(found);

        return found;
}

/*
 * crypto_larval_kill - retire a larval
 * @alg: the embedded algorithm of the larval
 *
 * Unlinks the larval from its list under the write side of crypto_alg_sem,
 * completes its completion for all waiters and drops one algorithm
 * reference.
 */
void crypto_larval_kill(struct crypto_alg *alg)
{
        struct crypto_larval *larval = (void *)alg;

        /* unlink under the list lock */
        down_write(&crypto_alg_sem);
        list_del(&alg->cra_list);
        up_write(&crypto_alg_sem);

        /* release everybody blocked on the larval, then drop a reference */
        complete_all(&larval->completion);
        crypto_alg_put(alg);
}
EXPORT_SYMBOL_GPL(crypto_larval_kill);

/*
 * crypto_wait_for_test - announce the adult of a larval and wait for it
 * @larval: larval whose adult is passed to the notifier chain
 *
 * Sends CRYPTO_MSG_ALG_REGISTER for @larval->adult.  If the notification
 * does not end in NOTIFY_STOP a one-time warning is raised and no wait
 * takes place; otherwise the larval's completion is waited for (killable)
 * and a non-zero wait result triggers a warning.  The larval is killed in
 * every case.
 */
void crypto_wait_for_test(struct crypto_larval *larval)
{
        int err;

        err = crypto_probing_notify(CRYPTO_MSG_ALG_REGISTER, larval->adult);
        if (!WARN_ON_ONCE(err != NOTIFY_STOP)) {
                err = wait_for_completion_killable(&larval->completion);
                WARN_ON(err);
        }

        crypto_larval_kill(&larval->alg);
}
EXPORT_SYMBOL_GPL(crypto_wait_for_test);

/*
 * crypto_start_test - start the test for a test larval, at most once
 * @larval: larval to start
 *
 * Does nothing for larvals that are not test larvals or whose test was
 * already started.  test_started is checked once without the lock and
 * again under the write side of crypto_alg_sem; only the caller that
 * flips it to true goes on to crypto_wait_for_test().
 */
static void crypto_start_test(struct crypto_larval *larval)
{
        bool claimed = false;

        if (!crypto_is_test_larval(larval) || larval->test_started)
                return;

        down_write(&crypto_alg_sem);
        if (!larval->test_started) {
                larval->test_started = true;
                claimed = true;
        }
        up_write(&crypto_alg_sem);

        if (claimed)
                crypto_wait_for_test(larval);
}

/*
 * crypto_larval_wait - wait for a larval to complete and return its adult
 * @alg: the embedded algorithm of the larval; the caller's reference on it
 *       is dropped before returning
 *
 * Returns the adult with a reference taken via crypto_mod_get(), the error
 * pointer stored as the adult, or one of:
 *   -EINTR      the wait returned a negative value
 *   -ETIMEDOUT  the wait returned 0 (timeout of 60 * HZ)
 *   -ENOENT     the larval completed without an adult
 *   -EAGAIN     the adult of a test larval lacks CRYPTO_ALG_TESTED, the
 *               adult has CRYPTO_ALG_FIPS_INTERNAL set, or crypto_mod_get()
 *               on the adult failed
 */
static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg)
{
        struct crypto_larval *larval = (void *)alg;
        long time_left;

        /* While boot-time tests are unfinished, try to start the test here. */
        if (!crypto_boot_test_finished())
                crypto_start_test(larval);

        time_left = wait_for_completion_killable_timeout(
                &larval->completion, 60 * HZ);

        /* The checks below are ordered: the first one that applies wins. */
        alg = larval->adult;
        if (time_left < 0)
                alg = ERR_PTR(-EINTR);
        else if (!time_left)
                alg = ERR_PTR(-ETIMEDOUT);
        else if (!alg)
                alg = ERR_PTR(-ENOENT);
        else if (IS_ERR(alg))
                ;        /* hand the stored error pointer back unchanged */
        else if (crypto_is_test_larval(larval) &&
                 !(alg->cra_flags & CRYPTO_ALG_TESTED))
                alg = ERR_PTR(-EAGAIN);
        else if (alg->cra_flags & CRYPTO_ALG_FIPS_INTERNAL)
                alg = ERR_PTR(-EAGAIN);
        else if (!crypto_mod_get(alg))
                alg = ERR_PTR(-EAGAIN);
        /* Drop the reference on the larval itself. */
        crypto_mod_put(&larval->alg);

        return alg;
}

/*
 * crypto_alg_lookup - look up @name under the read side of crypto_alg_sem
 * @name: algorithm or driver name
 * @type: requested flag bits
 * @mask: which bits of @type matter
 *
 * Unless the caller mentions CRYPTO_ALG_TESTED in @type or @mask, the first
 * lookup requires that flag to be set.  CRYPTO_ALG_FIPS_INTERNAL is removed
 * from that first lookup and checked separately on the result.
 *
 * Returns an algorithm or larval with a reference held, NULL when nothing
 * was found, ERR_PTR(-ENOENT) when the FIPS check rejects the match, or
 * ERR_PTR(-ELIBBAD) when only the second lookup (without the added
 * CRYPTO_ALG_TESTED requirement) finds a non-larval entry.
 */
static struct crypto_alg *crypto_alg_lookup(const char *name, u32 type,
                                            u32 mask)
{
        const u32 fips = CRYPTO_ALG_FIPS_INTERNAL;
        struct crypto_alg *alg;
        u32 test = 0;

        /* Caller did not mention TESTED: require it for the first lookup. */
        if (!((type | mask) & CRYPTO_ALG_TESTED))
                test |= CRYPTO_ALG_TESTED;

        down_read(&crypto_alg_sem);
        alg = __crypto_alg_lookup(name, (type | test) & ~fips,
                                  (mask | test) & ~fips);
        if (alg) {
                /*
                 * Reduce mask to the FIPS_INTERNAL bit alone; the bit is
                 * forced on when neither type nor mask had it set.
                 */
                if (((type | mask) ^ fips) & fips)
                        mask |= fips;
                mask &= fips;

                if (!crypto_is_larval(alg) &&
                    ((type ^ alg->cra_flags) & mask)) {
                        /* Algorithm is disallowed in FIPS mode. */
                        crypto_mod_put(alg);
                        alg = ERR_PTR(-ENOENT);
                }
        } else if (test) {
                /* Retry with the caller's own type and mask. */
                alg = __crypto_alg_lookup(name, type, mask);
                if (alg && !crypto_is_larval(alg)) {
                        /* Test failed */
                        crypto_mod_put(alg);
                        alg = ERR_PTR(-ELIBBAD);
                }
        }
        up_read(&crypto_alg_sem);

        return alg;
}

/*
 * crypto_larval_lookup - find @name, loading modules or adding a larval
 * @name: algorithm or driver name; NULL yields ERR_PTR(-ENOENT)
 * @type: requested flag bits
 * @mask: which bits of @type matter
 *
 * CRYPTO_ALG_LARVAL and CRYPTO_ALG_DEAD are stripped from @type and @mask.
 * When the first lookup finds nothing and CRYPTO_NOLOAD is not set in
 * @mask, "crypto-<name>" is requested (and "crypto-<name>-all" unless the
 * request asks for an algorithm without CRYPTO_ALG_NEED_FALLBACK) and the
 * lookup is repeated.
 *
 * A larval result is waited for; when nothing is found a new larval is
 * added; anything else, including an error pointer, is returned as is.
 */
static struct crypto_alg *crypto_larval_lookup(const char *name, u32 type,
                                               u32 mask)
{
        const u32 internal = CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD;
        struct crypto_alg *alg;

        if (!name)
                return ERR_PTR(-ENOENT);

        type &= ~internal;
        mask &= ~internal;

        alg = crypto_alg_lookup(name, type, mask);
        if (!alg && !(mask & CRYPTO_NOLOAD)) {
                request_module("crypto-%s", name);

                if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask &
                      CRYPTO_ALG_NEED_FALLBACK))
                        request_module("crypto-%s-all", name);

                alg = crypto_alg_lookup(name, type, mask);
        }

        if (!alg)
                return crypto_larval_add(name, type, mask);

        if (IS_ERR(alg) || !crypto_is_larval(alg))
                return alg;

        return crypto_larval_wait(alg);
}

/*
 * crypto_probing_notify - run crypto_chain, loading cryptomgr if unanswered
 * @val: notifier event
 * @v: notifier payload
 *
 * If the first pass over the chain returns NOTIFY_DONE, the "cryptomgr"
 * module is requested and the chain is run a second time.
 *
 * Returns the result of the last chain invocation.
 */
int crypto_probing_notify(unsigned long val, void *v)
{
        int ret = blocking_notifier_call_chain(&crypto_chain, val, v);

        if (ret != NOTIFY_DONE)
                return ret;

        request_module("cryptomgr");

        return blocking_notifier_call_chain(&crypto_chain, val, v);
}
EXPORT_SYMBOL_GPL(crypto_probing_notify);

/*
 * crypto_alg_mod_lookup - look up an algorithm, constructing it if needed
 * @name: algorithm or driver name
 * @type: requested flag bits
 * @mask: which bits of @type matter
 *
 * When the lookup yields a larval, CRYPTO_MSG_ALG_REQUEST is sent for it.
 * If the notification ends in NOTIFY_STOP the larval is waited for,
 * otherwise the reference on it is dropped and -ENOENT is reported.  The
 * larval is killed in both cases.
 *
 * Returns an algorithm with a reference held, or an error pointer.
 */
struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask)
{
        struct crypto_alg *larval;
        struct crypto_alg *alg;

        /*
         * An algorithm flagged internal is only handed to callers that ask
         * for it with the internal flag.  A caller that accepts either kind
         * passes type | CRYPTO_ALG_INTERNAL together with
         * !(mask & CRYPTO_ALG_INTERNAL).  Callers that mention the flag in
         * neither word get it added to the mask here.
         */
        if (!((type | mask) & CRYPTO_ALG_INTERNAL))
                mask |= CRYPTO_ALG_INTERNAL;

        larval = crypto_larval_lookup(name, type, mask);
        if (IS_ERR(larval) || !crypto_is_larval(larval))
                return larval;

        if (crypto_probing_notify(CRYPTO_MSG_ALG_REQUEST, larval) !=
            NOTIFY_STOP) {
                crypto_mod_put(larval);
                alg = ERR_PTR(-ENOENT);
        } else {
                alg = crypto_larval_wait(larval);
        }

        crypto_larval_kill(larval);

        return alg;
}
EXPORT_SYMBOL_GPL(crypto_alg_mod_lookup);

/*
 * crypto_exit_ops - invoke the transform's exit hook
 * @tfm: transform being torn down
 *
 * tfm->exit is only called when the algorithm has a cra_type and the hook
 * is set.
 */
static void crypto_exit_ops(struct crypto_tfm *tfm)
{
        if (!tfm->__crt_alg->cra_type)
                return;

        if (tfm->exit)
                tfm->exit(tfm);
}

/*
 * crypto_ctxsize - context size required behind a struct crypto_tfm
 * @alg: algorithm the transform is created for
 * @type: requested flag bits, passed on to the type's ctxsize hook
 * @mask: lookup mask, passed on to the type's ctxsize hook
 *
 * The result starts from the alignment mask bits that lie above the
 * default transform context alignment.  Algorithms with a cra_type add
 * that type's ctxsize(); without one only cipher and compress algorithms
 * are supported and any other type hits BUG().
 */
static unsigned int crypto_ctxsize(struct crypto_alg *alg, u32 type, u32 mask)
{
        const struct crypto_type *frontend = alg->cra_type;
        unsigned int size;

        size = alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1);
        if (frontend)
                return size + frontend->ctxsize(alg, type, mask);

        switch (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) {
        case CRYPTO_ALG_TYPE_CIPHER:
                size += crypto_cipher_ctxsize(alg);
                break;

        case CRYPTO_ALG_TYPE_COMPRESS:
                size += crypto_compress_ctxsize(alg);
                break;

        default:
                BUG();
        }

        return size;
}

/*
 * crypto_shoot_alg - flag an algorithm as dying
 * @alg: algorithm to flag
 *
 * Sets CRYPTO_ALG_DYING in cra_flags under the write side of
 * crypto_alg_sem.
 */
void crypto_shoot_alg(struct crypto_alg *alg)
{
        down_write(&crypto_alg_sem);
        alg->cra_flags = alg->cra_flags | CRYPTO_ALG_DYING;
        up_write(&crypto_alg_sem);
}
EXPORT_SYMBOL_GPL(crypto_shoot_alg);

/*
 * __crypto_alloc_tfmgfp - allocate and initialise a transform for @alg
 * @alg: algorithm backing the transform; no reference is taken here
 * @type: requested flag bits, used for sizing the context
 * @mask: lookup mask, used for sizing the context
 * @gfp: allocation flags
 *
 * The transform is zero-allocated with a reference count of 1 and, when
 * the algorithm provides one, passed to cra_init().  If that fails the
 * exit hook is run, the algorithm is marked dying on -EAGAIN and the
 * transform is freed.
 *
 * Returns the transform, ERR_PTR(-ENOMEM) on allocation failure or the
 * cra_init() error as an error pointer.
 */
struct crypto_tfm *__crypto_alloc_tfmgfp(struct crypto_alg *alg, u32 type,
                                         u32 mask, gfp_t gfp)
{
        struct crypto_tfm *tfm;
        unsigned int tfm_size;
        int err;

        tfm_size = sizeof(*tfm) + crypto_ctxsize(alg, type, mask);
        tfm = kzalloc(tfm_size, gfp);
        if (!tfm)
                return ERR_PTR(-ENOMEM);

        tfm->__crt_alg = alg;
        refcount_set(&tfm->refcnt, 1);

        /* No init hook to run (or an exit hook is already set): done. */
        if (tfm->exit || !alg->cra_init)
                return tfm;

        err = alg->cra_init(tfm);
        if (!err)
                return tfm;

        crypto_exit_ops(tfm);
        if (err == -EAGAIN)
                crypto_shoot_alg(alg);
        kfree(tfm);

        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(__crypto_alloc_tfmgfp);

/*
 * __crypto_alloc_tfm - __crypto_alloc_tfmgfp() with GFP_KERNEL
 * @alg: algorithm backing the transform
 * @type: requested flag bits
 * @mask: lookup mask
 *
 * Returns whatever __crypto_alloc_tfmgfp() returns: the new transform or
 * an error pointer.
 */
struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
                                      u32 mask)
{
        return __crypto_alloc_tfmgfp(alg, type, mask, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(__crypto_alloc_tfm);

/*
 *        crypto_alloc_base - Locate algorithm and allocate transform
 *        @alg_name: Name of algorithm
 *        @type: Type of algorithm
 *        @mask: Mask for type comparison
 *
 *        New algorithm types should not use this function; use
 *        crypto_alloc_tfm instead.
 *
 *        An already loaded algorithm is tried first.  Failing that, and if
 *        the kernel supports loadable modules, a module of the same name or
 *        alias is loaded.  Failing that, any loaded crypto manager is asked
 *        to construct the algorithm on the fly.  The reference grabbed on
 *        the algorithm is then tied to the new transform.
 *
 *        The whole sequence is retried for as long as it fails with -EAGAIN,
 *        unless a fatal signal is pending, in which case -EINTR is reported.
 *
 *        The returned transform is of a non-determinate type.  Most callers
 *        want one of the more specific allocation functions such as
 *        crypto_alloc_skcipher().
 *
 *        In case of error the return value is an error pointer.
 */
struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask)
{
        for (;;) {
                struct crypto_alg *alg;
                struct crypto_tfm *tfm;
                int err;

                alg = crypto_alg_mod_lookup(alg_name, type, mask);
                if (IS_ERR(alg)) {
                        err = PTR_ERR(alg);
                } else {
                        tfm = __crypto_alloc_tfm(alg, type, mask);
                        if (!IS_ERR(tfm))
                                return tfm;

                        /* allocation failed: give the algorithm back */
                        crypto_mod_put(alg);
                        err = PTR_ERR(tfm);
                }

                if (err != -EAGAIN)
                        return ERR_PTR(err);
                if (fatal_signal_pending(current))
                        return ERR_PTR(-EINTR);
        }
}
EXPORT_SYMBOL_GPL(crypto_alloc_base);

/*
 * crypto_alloc_tfmmem - allocate the memory backing a typed transform
 * @alg: algorithm recorded in the embedded struct crypto_tfm
 * @frontend: type giving the leading size (tfmsize) and trailing extsize()
 * @node: NUMA node to allocate on, also recorded in the transform
 * @gfp: allocation flags
 *
 * The zeroed allocation holds frontend->tfmsize bytes, then the struct
 * crypto_tfm, then frontend->extsize(@alg) bytes.  The embedded transform
 * starts with a reference count of 1.
 *
 * Returns the start of the allocation, or ERR_PTR(-ENOMEM).
 */
static void *crypto_alloc_tfmmem(struct crypto_alg *alg,
                                 const struct crypto_type *frontend, int node,
                                 gfp_t gfp)
{
        unsigned int head = frontend->tfmsize;
        unsigned int bytes = head + sizeof(struct crypto_tfm) +
                             frontend->extsize(alg);
        struct crypto_tfm *tfm;
        char *mem;

        mem = kzalloc_node(bytes, gfp, node);
        if (!mem)
                return ERR_PTR(-ENOMEM);

        /* the generic transform sits right behind the frontend's part */
        tfm = (struct crypto_tfm *)(mem + head);
        tfm->__crt_alg = alg;
        tfm->node = node;
        refcount_set(&tfm->refcnt, 1);

        return mem;
}

/*
 * crypto_create_tfm_node - allocate and initialise a typed transform
 * @alg: algorithm backing the transform; no reference is taken here
 * @frontend: type providing the layout and the init_tfm() hook
 * @node: NUMA node to allocate on
 *
 * After allocation the frontend's init_tfm() runs, followed by the
 * algorithm's cra_init() when one exists and init_tfm() left tfm->exit
 * unset.  A cra_init() failure additionally runs the exit hook.  On any
 * initialisation failure the algorithm is marked dying if the error is
 * -EAGAIN, and the memory is freed.
 *
 * Returns the start of the transform memory or an error pointer.
 */
void *crypto_create_tfm_node(struct crypto_alg *alg,
                             const struct crypto_type *frontend,
                             int node)
{
        struct crypto_tfm *tfm;
        char *mem;
        int err;

        mem = crypto_alloc_tfmmem(alg, frontend, node, GFP_KERNEL);
        if (IS_ERR(mem))
                return mem;

        tfm = (struct crypto_tfm *)(mem + frontend->tfmsize);

        err = frontend->init_tfm(tfm);
        if (!err && !tfm->exit && alg->cra_init) {
                err = alg->cra_init(tfm);
                if (err)
                        crypto_exit_ops(tfm);
        }
        if (!err)
                return mem;

        if (err == -EAGAIN)
                crypto_shoot_alg(alg);
        kfree(mem);

        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(crypto_create_tfm_node);

/*
 * crypto_clone_tfm - allocate a new transform modelled on an existing one
 * @frontend: type providing the memory layout
 * @otfm: transform to clone
 *
 * Takes a new reference on the algorithm of @otfm, allocates the transform
 * memory with GFP_ATOMIC on the same node and copies crt_flags and the
 * exit hook across.  No init hook is run here.
 *
 * Returns the start of the new transform memory, ERR_PTR(-ESTALE) if the
 * algorithm reference cannot be taken, or the allocation error pointer
 * (with the algorithm reference dropped again).
 */
void *crypto_clone_tfm(const struct crypto_type *frontend,
                       struct crypto_tfm *otfm)
{
        struct crypto_alg *alg = otfm->__crt_alg;
        struct crypto_tfm *clone;
        char *mem;

        if (unlikely(!crypto_mod_get(alg)))
                return ERR_PTR(-ESTALE);

        mem = crypto_alloc_tfmmem(alg, frontend, otfm->node, GFP_ATOMIC);
        if (IS_ERR(mem)) {
                crypto_mod_put(alg);
                return mem;
        }

        clone = (struct crypto_tfm *)(mem + frontend->tfmsize);
        clone->crt_flags = otfm->crt_flags;
        clone->exit = otfm->exit;

        return mem;
}
EXPORT_SYMBOL_GPL(crypto_clone_tfm);

/*
 * crypto_find_alg - look up an algorithm on behalf of a frontend type
 * @alg_name: algorithm or driver name
 * @frontend: optional type; when given, its maskclear is applied to both
 *            @type and @mask, then its type is OR-ed into @type and its
 *            maskset into @mask
 * @type: requested flag bits
 * @mask: which bits of @type matter
 *
 * Returns the result of crypto_alg_mod_lookup() for the adjusted values.
 */
struct crypto_alg *crypto_find_alg(const char *alg_name,
                                   const struct crypto_type *frontend,
                                   u32 type, u32 mask)
{
        if (frontend) {
                type = (type & frontend->maskclear) | frontend->type;
                mask = (mask & frontend->maskclear) | frontend->maskset;
        }

        return crypto_alg_mod_lookup(alg_name, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_find_alg);

/*
 *        crypto_alloc_tfm_node - Locate algorithm and allocate transform
 *        @alg_name: Name of algorithm
 *        @frontend: Frontend algorithm type
 *        @type: Type of algorithm
 *        @mask: Mask for type comparison
 *        @node: NUMA node in which users desire to put requests, if node is
 *                NUMA_NO_NODE, it means users have no special requirement.
 *
 *        An already loaded algorithm is tried first.  Failing that, and if
 *        the kernel supports loadable modules, a module of the same name or
 *        alias is loaded.  Failing that, any loaded crypto manager is asked
 *        to construct the algorithm on the fly.  The reference grabbed on
 *        the algorithm is then tied to the new transform.
 *
 *        The whole sequence is retried for as long as it fails with -EAGAIN,
 *        unless a fatal signal is pending, in which case -EINTR is reported.
 *
 *        The returned transform is of a non-determinate type.  Most callers
 *        want one of the more specific allocation functions such as
 *        crypto_alloc_skcipher().
 *
 *        In case of error the return value is an error pointer.
 */

void *crypto_alloc_tfm_node(const char *alg_name,
                       const struct crypto_type *frontend, u32 type, u32 mask,
                       int node)
{
        for (;;) {
                struct crypto_alg *alg;
                void *tfm;
                int err;

                alg = crypto_find_alg(alg_name, frontend, type, mask);
                if (IS_ERR(alg)) {
                        err = PTR_ERR(alg);
                } else {
                        tfm = crypto_create_tfm_node(alg, frontend, node);
                        if (!IS_ERR(tfm))
                                return tfm;

                        /* creation failed: give the algorithm back */
                        crypto_mod_put(alg);
                        err = PTR_ERR(tfm);
                }

                if (err != -EAGAIN)
                        return ERR_PTR(err);
                if (fatal_signal_pending(current))
                        return ERR_PTR(-EINTR);
        }
}
EXPORT_SYMBOL_GPL(crypto_alloc_tfm_node);

/*
 *        crypto_destroy_tfm - Free crypto transform
 *        @mem: Start of tfm slab
 *        @tfm: Transform to free
 *
 *        Nothing happens when @mem is NULL or an error pointer, or when
 *        other references to @tfm remain after this one is dropped.
 *        Otherwise the exit hooks are run, the refcount on the associated
 *        algorithm is dropped and the memory is freed with
 *        kfree_sensitive().
 */
void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm)
{
        struct crypto_alg *alg;

        if (IS_ERR_OR_NULL(mem) || !refcount_dec_and_test(&tfm->refcnt))
                return;

        alg = tfm->__crt_alg;

        /* the algorithm's own exit hook only runs when tfm->exit is unset */
        if (!tfm->exit && alg->cra_exit)
                alg->cra_exit(tfm);

        crypto_exit_ops(tfm);
        crypto_mod_put(alg);
        kfree_sensitive(mem);
}
EXPORT_SYMBOL_GPL(crypto_destroy_tfm);

/*
 * crypto_has_alg - check whether an algorithm can be obtained
 * @name: algorithm or driver name
 * @type: requested flag bits
 * @mask: which bits of @type matter
 *
 * Runs a full crypto_alg_mod_lookup() and immediately releases the result.
 *
 * Returns 1 if the lookup succeeded, 0 if it returned an error pointer.
 */
int crypto_has_alg(const char *name, u32 type, u32 mask)
{
        struct crypto_alg *alg = crypto_alg_mod_lookup(name, type, mask);

        if (IS_ERR(alg))
                return 0;

        crypto_mod_put(alg);

        return 1;
}
EXPORT_SYMBOL_GPL(crypto_has_alg);

/*
 * crypto_req_done - completion callback storing the result in a crypto_wait
 * @data: the struct crypto_wait to complete
 * @err: request status
 *
 * A status of -EINPROGRESS is ignored.  Any other status is stored in
 * wait->err and the completion is signalled.
 */
void crypto_req_done(void *data, int err)
{
        struct crypto_wait *wait = data;

        if (err != -EINPROGRESS) {
                wait->err = err;
                complete(&wait->completion);
        }
}
EXPORT_SYMBOL_GPL(crypto_req_done);

MODULE_DESCRIPTION("Cryptographic core API");
MODULE_LICENSE("GPL");





















































































































































































































































































































































































































































    2 






























    2 


    2 





























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/srcu.h>
#include <linux/interval_tree.h>

struct mmu_notifier_subscriptions;
struct mmu_notifier;
struct mmu_notifier_range;
struct mmu_interval_notifier;

/**
 * enum mmu_notifier_event - reason for the mmu notifier callback
 * @MMU_NOTIFY_UNMAP: either a munmap() that unmaps the range or a mremap()
 * that moves the range
 *
 * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like
 * madvise() or replacing a page by another one, ...).
 *
 * @MMU_NOTIFY_PROTECTION_VMA: update is due to a protection change for the
 * range, i.e. using the vma access permission (vm_page_prot) to update the
 * whole range is enough; there is no need to inspect changes to the CPU page
 * table (mprotect() syscall)
 *
 * @MMU_NOTIFY_PROTECTION_PAGE: update is due to a change in the read/write
 * flag for pages in the range, so to mirror those changes the user must
 * inspect the CPU page table (from the end callback).
 *
 * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same
 * access flags). The user should soft dirty the page in the end callback to
 * make sure that anyone relying on soft dirtiness catches pages that might be
 * written through non-CPU mappings.
 *
 * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal
 * that the mm refcount is zero and the range is no longer accessible.
 *
 * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal
 * a device driver to possibly ignore the invalidation if the
 * owner field matches the driver's device private pgmap owner.
 *
 * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no
 * longer have exclusive access to the page. When sent during creation of an
 * exclusive range the owner will be initialised to the value provided by the
 * caller of make_device_exclusive_range(), otherwise the owner will be NULL.
 */
enum mmu_notifier_event {
        MMU_NOTIFY_UNMAP = 0,
        MMU_NOTIFY_CLEAR,
        MMU_NOTIFY_PROTECTION_VMA,
        MMU_NOTIFY_PROTECTION_PAGE,
        MMU_NOTIFY_SOFT_DIRTY,
        MMU_NOTIFY_RELEASE,
        MMU_NOTIFY_MIGRATE,
        MMU_NOTIFY_EXCLUSIVE,
};

/*
 * Bit in mmu_notifier_range.flags. It is set by
 * mmu_notifier_invalidate_range_start(), cleared by
 * mmu_notifier_invalidate_range_start_nonblock() and tested by
 * mmu_notifier_range_blockable().
 */
#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)

struct mmu_notifier_ops {
        /*
         * Called either by mmu_notifier_unregister or when the mm is
         * being destroyed by exit_mmap, always before all pages are
         * freed. This can run concurrently with other mmu notifier
         * methods (the ones invoked outside the mm context) and it
         * should tear down all secondary mmu mappings and freeze the
         * secondary mmu. If this method isn't implemented you have to
         * be sure that nothing could possibly write to the pages
         * through the secondary mmu by the time the last thread with
         * tsk->mm == mm exits.
         *
         * As a side note: the pages freed after ->release returns could
         * be immediately reallocated by the gart at an alias physical
         * address with a different cache model, so if ->release isn't
         * implemented because all _software_ driven memory accesses
         * through the secondary mmu are terminated by the time the
         * last thread of this mm quits, you also have to be sure that
         * speculative _hardware_ operations can't allocate dirty
         * cachelines in the cpu that could not be snooped and made
         * coherent with the other read and write operations happening
         * through the gart alias address, thereby leading to memory
         * corruption.
         */
        void (*release)(struct mmu_notifier *subscription,
                        struct mm_struct *mm);

        /*
         * clear_flush_young is called after the VM has
         * test-and-cleared the young/accessed bitflag in the
         * pte. This way the VM will provide proper aging to the
         * accesses to the page through the secondary MMUs and not
         * only to the ones through the Linux pte.
         * Start-end is necessary in case the secondary MMU is mapping the page
         * at a smaller granularity than the primary MMU.
         */
        int (*clear_flush_young)(struct mmu_notifier *subscription,
                                 struct mm_struct *mm,
                                 unsigned long start,
                                 unsigned long end);

        /*
         * clear_young is a lightweight version of clear_flush_young. Like the
         * latter, it is supposed to test-and-clear the young/accessed bitflag
         * in the secondary pte, but it may omit flushing the secondary tlb.
         */
        int (*clear_young)(struct mmu_notifier *subscription,
                           struct mm_struct *mm,
                           unsigned long start,
                           unsigned long end);

        /*
         * test_young is called to check the young/accessed bitflag in
         * the secondary pte. This is used to know if the page is
         * frequently used without actually clearing the flag or tearing
         * down the secondary mapping on the page.
         */
        int (*test_young)(struct mmu_notifier *subscription,
                          struct mm_struct *mm,
                          unsigned long address);

        /*
         * invalidate_range_start() and invalidate_range_end() must be
         * paired and are called only when the mmap_lock and/or the
         * locks protecting the reverse maps are held. If the subsystem
         * can't guarantee that no additional references are taken to
         * the pages in the range, it has to implement the
         * invalidate_range() notifier to remove any references taken
         * after invalidate_range_start().
         * NOTE(review): this struct has no invalidate_range() member;
         * presumably the sentence above predates a rename and now means
         * arch_invalidate_secondary_tlbs() - confirm.
         *
         * Invalidation of multiple concurrent ranges may be
         * optionally permitted by the driver. Either way the
         * establishment of sptes is forbidden in the range passed to
         * invalidate_range_start()/end() for the whole duration of the
         * invalidate_range_start()/end() critical section.
         *
         * invalidate_range_start() is called when all pages in the
         * range are still mapped and have at least a refcount of one.
         *
         * invalidate_range_end() is called when all pages in the
         * range have been unmapped and the pages have been freed by
         * the VM.
         *
         * The VM will remove the page table entries and potentially
         * the page between invalidate_range_start() and
         * invalidate_range_end(). If the page must not be freed
         * because of pending I/O or other circumstances then the
         * invalidate_range_start() callback (or the initial mapping
         * by the driver) must make sure that the refcount is kept
         * elevated.
         *
         * If the driver increases the refcount when the pages are
         * initially mapped into an address space then either
         * invalidate_range_start() or invalidate_range_end() may
         * decrease the refcount. If the refcount is decreased on
         * invalidate_range_start() then the VM can free pages as page
         * table entries are removed.  If the refcount is only
         * dropped on invalidate_range_end() then the driver itself
         * will drop the last refcount but it must take care to flush
         * any secondary tlb before doing the final free on the
         * page. Pages will no longer be referenced by the linux
         * address space but may still be referenced by sptes until
         * the last refcount is dropped.
         *
         * If the range is not blockable (mmu_notifier_range_blockable(range)
         * is false) then the callback cannot sleep and has to return
         * -EAGAIN if sleeping would be required. 0 should be returned
         * otherwise. Please note that notifiers that can fail
         * invalidate_range_start are not allowed to implement
         * invalidate_range_end, as there is no mechanism for informing the
         * notifier that its start failed.
         */
        int (*invalidate_range_start)(struct mmu_notifier *subscription,
                                      const struct mmu_notifier_range *range);
        void (*invalidate_range_end)(struct mmu_notifier *subscription,
                                     const struct mmu_notifier_range *range);

        /*
         * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB
         * which shares page-tables with the CPU. The
         * invalidate_range_start()/end() callbacks should not be implemented as
         * arch_invalidate_secondary_tlbs() already catches the points in time
         * when an external TLB needs to be flushed.
         *
         * This requires arch_invalidate_secondary_tlbs() to be called while
         * holding the ptl spin-lock and therefore this callback is not allowed
         * to sleep.
         *
         * This is called by architecture code whenever invalidating a TLB
         * entry. It is assumed that any secondary TLB has the same rules for
         * when invalidations are required. If this is not the case architecture
         * code will need to call this explicitly when required for secondary
         * TLB invalidation.
         */
        void (*arch_invalidate_secondary_tlbs)(
                                        struct mmu_notifier *subscription,
                                        struct mm_struct *mm,
                                        unsigned long start,
                                        unsigned long end);

        /*
         * These callbacks are used with the get/put interface to manage the
         * lifetime of the mmu_notifier memory. alloc_notifier() returns a new
         * notifier for use with the mm.
         *
         * free_notifier() is only called after the mmu_notifier has been
         * fully put, calls to any ops callback are prevented and no ops
         * callbacks are currently running. It is called from a SRCU callback
         * and cannot sleep.
         */
        struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
        void (*free_notifier)(struct mmu_notifier *subscription);
};

/*
 * One subscription of a secondary MMU user to the events of an mm.
 *
 * The notifier chains are protected by mmap_lock and/or the reverse map
 * semaphores. Notifier chains are only changed when all reverse maps and
 * the mmap_lock locks are taken.
 *
 * Therefore notifier chains can only be traversed when either
 *
 * 1. mmap_lock is held.
 * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem).
 * 3. No other concurrent thread can access the list (release)
 */
struct mmu_notifier {
        /* presumably the link into the per-mm notifier chain - confirm */
        struct hlist_node hlist;
        /* callbacks invoked for this subscription */
        const struct mmu_notifier_ops *ops;
        struct mm_struct *mm;
        /* presumably used for the deferred free of the get/put interface - confirm */
        struct rcu_head rcu;
        /* presumably the get/put reference count - confirm */
        unsigned int users;
};

/**
 * struct mmu_interval_notifier_ops
 * @invalidate: Upon return the caller must stop using any SPTEs within this
 *              range. This function can sleep. Return false only if sleeping
 *              was required but mmu_notifier_range_blockable(range) is false.
 *              The cur_seq argument is the value to hand to
 *              mmu_interval_set_seq().
 */
struct mmu_interval_notifier_ops {
        bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
                           const struct mmu_notifier_range *range,
                           unsigned long cur_seq);
};

/*
 * A subscription covering a single VA interval of an mm; registered with
 * mmu_interval_notifier_insert() and removed with
 * mmu_interval_notifier_remove().
 */
struct mmu_interval_notifier {
        struct interval_tree_node interval_tree;
        const struct mmu_interval_notifier_ops *ops;
        struct mm_struct *mm;
        /* NOTE(review): purpose not visible in this header - confirm in mm/mmu_notifier.c */
        struct hlist_node deferred_item;
        /*
         * Written by mmu_interval_set_seq(), compared against the caller's
         * sequence by mmu_interval_read_retry()/mmu_interval_check_retry().
         */
        unsigned long invalidate_seq;
};

#ifdef CONFIG_MMU_NOTIFIER

#ifdef CONFIG_LOCKDEP
extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
#endif

/*
 * Describes one invalidation that is handed to the notifier callbacks.
 * Filled in by mmu_notifier_range_init() or mmu_notifier_range_init_owner().
 */
struct mmu_notifier_range {
        struct mm_struct *mm;
        unsigned long start;
        unsigned long end;
        /* bit mask; MMU_NOTIFIER_RANGE_BLOCKABLE is the only bit defined here */
        unsigned flags;
        /* why the invalidation happens, see enum mmu_notifier_event */
        enum mmu_notifier_event event;
        /* only assigned by mmu_notifier_range_init_owner() */
        void *owner;
};

/*
 * Tell whether @mm has notifier subscriptions attached, i.e. whether
 * mm->notifier_subscriptions is non-NULL. Annotated as the unlikely case.
 */
static inline int mm_has_notifiers(struct mm_struct *mm)
{
        return unlikely(mm->notifier_subscriptions != NULL);
}

struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
                                             struct mm_struct *mm);
/*
 * Wrapper around mmu_notifier_get_locked() that takes the mmap write lock of
 * @mm for the duration of the call and passes its return value through.
 */
static inline struct mmu_notifier *
mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
{
        struct mmu_notifier *subscription;

        mmap_write_lock(mm);
        subscription = mmu_notifier_get_locked(ops, mm);
        mmap_write_unlock(mm);

        return subscription;
}
void mmu_notifier_put(struct mmu_notifier *subscription);
void mmu_notifier_synchronize(void);

extern int mmu_notifier_register(struct mmu_notifier *subscription,
                                 struct mm_struct *mm);
extern int __mmu_notifier_register(struct mmu_notifier *subscription,
                                   struct mm_struct *mm);
extern void mmu_notifier_unregister(struct mmu_notifier *subscription,
                                    struct mm_struct *mm);

unsigned long
mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub);
int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
                                 struct mm_struct *mm, unsigned long start,
                                 unsigned long length,
                                 const struct mmu_interval_notifier_ops *ops);
int mmu_interval_notifier_insert_locked(
        struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
        unsigned long start, unsigned long length,
        const struct mmu_interval_notifier_ops *ops);
void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub);

/**
 * mmu_interval_set_seq - Save the invalidation sequence
 * @interval_sub: The subscription passed to invalidate
 * @cur_seq: The cur_seq passed to the invalidate() callback
 *
 * This must be called unconditionally from the invalidate callback of a
 * struct mmu_interval_notifier_ops under the same lock that is used to call
 * mmu_interval_read_retry(). It updates the sequence number for later use by
 * mmu_interval_read_retry(). The provided cur_seq will always be odd.
 *
 * If the caller does not call mmu_interval_read_begin() or
 * mmu_interval_read_retry() then this call is not required.
 */
static inline void
mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub,
                     unsigned long cur_seq)
{
        /* Pairs with the READ_ONCE in mmu_interval_check_retry() */
        WRITE_ONCE(interval_sub->invalidate_seq, cur_seq);
}

/**
 * mmu_interval_read_retry - End a read side critical section against a VA range
 * @interval_sub: The subscription
 * @seq: The return of the paired mmu_interval_read_begin()
 *
 * This MUST be called under a user provided lock that is also held
 * unconditionally by op->invalidate() when it calls mmu_interval_set_seq().
 *
 * Each call should be paired with a single mmu_interval_read_begin() and
 * should be used to conclude the read side.
 *
 * Return: true if an invalidation collided with this critical section, and
 * the caller should retry.
 */
static inline bool
mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub,
                        unsigned long seq)
{
        /* Plain read: the caller holds the lock that serializes the writer */
        return interval_sub->invalidate_seq != seq;
}

/**
 * mmu_interval_check_retry - Test if a collision has occurred
 * @interval_sub: The subscription
 * @seq: The return of the matching mmu_interval_read_begin()
 *
 * This can be used in the critical section between mmu_interval_read_begin()
 * and mmu_interval_read_retry().
 *
 * This call can be used as part of loops and other expensive operations to
 * expedite a retry. It can be called many times and does not have to hold the
 * user provided lock.
 *
 * Return: true indicates an invalidation has collided with this critical
 * region and a future mmu_interval_read_retry() will return true. False is
 * not reliable and only suggests a collision may not have occurred.
 */
static inline bool
mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub,
                         unsigned long seq)
{
        /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
        return READ_ONCE(interval_sub->invalidate_seq) != seq;
}

extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end);
extern int __mmu_notifier_clear_young(struct mm_struct *mm,
                                      unsigned long start,
                                      unsigned long end);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
                                     unsigned long address);
extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r);
extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
                                        unsigned long start, unsigned long end);
extern bool
mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);

/*
 * Tell whether the notifiers handling @range are allowed to sleep, i.e.
 * whether MMU_NOTIFIER_RANGE_BLOCKABLE is set in range->flags.
 */
static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
        return !!(range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE);
}

/* Forward to __mmu_notifier_release() only when @mm has subscriptions. */
static inline void mmu_notifier_release(struct mm_struct *mm)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_release(mm);
}

/*
 * Forward to __mmu_notifier_clear_flush_young() when @mm has subscriptions;
 * otherwise report 0.
 */
static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_clear_flush_young(mm, start, end);
}

/*
 * Forward to __mmu_notifier_clear_young() when @mm has subscriptions;
 * otherwise report 0.
 */
static inline int mmu_notifier_clear_young(struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_clear_young(mm, start, end);
}

/*
 * Forward to __mmu_notifier_test_young() when @mm has subscriptions;
 * otherwise report 0.
 */
static inline int mmu_notifier_test_young(struct mm_struct *mm,
                                          unsigned long address)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_test_young(mm, address);
}

/*
 * Blocking variant of the start notification: marks @range as blockable and
 * runs __mmu_notifier_invalidate_range_start() if the mm has subscriptions.
 */
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
        might_sleep();

        /* lockdep annotation bracketing the notifier invocation */
        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
        if (mm_has_notifiers(range->mm)) {
                range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
                /*
                 * The return value is dropped here; presumably start cannot
                 * fail for a blockable range - confirm against the callee.
                 */
                __mmu_notifier_invalidate_range_start(range);
        }
        lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}

/*
 * Non-blocking counterpart of mmu_notifier_invalidate_range_start(): the
 * MMU_NOTIFIER_RANGE_BLOCKABLE flag is cleared before the notifiers run. It
 * can return an error if a notifier can't proceed without blocking, in which
 * case you're not allowed to modify PTEs in the specified range.
 *
 * This is mainly intended for OOM handling.
 */
static inline int __must_check
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
        int err = 0;

        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
        if (mm_has_notifiers(range->mm)) {
                range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
                err = __mmu_notifier_invalidate_range_start(range);
        }
        lock_map_release(&__mmu_notifier_invalidate_range_start_map);

        return err;
}

/*
 * End notification paired with one of the start variants; forwards to
 * __mmu_notifier_invalidate_range_end() if the mm has subscriptions.
 */
static inline void
mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
        /* Only a range started in blockable mode may sleep here */
        if (mmu_notifier_range_blockable(range))
                might_sleep();

        if (mm_has_notifiers(range->mm))
                __mmu_notifier_invalidate_range_end(range);
}

/*
 * Forward to __mmu_notifier_arch_invalidate_secondary_tlbs() only when @mm
 * has subscriptions.
 */
static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
                                        unsigned long start, unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}

/*
 * Start @mm without subscriptions: mm_has_notifiers() reports false until
 * mm->notifier_subscriptions is set to something else.
 */
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
{
        mm->notifier_subscriptions = NULL;
}

/*
 * Forward to __mmu_notifier_subscriptions_destroy() only when @mm has
 * subscriptions.
 */
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_subscriptions_destroy(mm);
}


/*
 * Fill in @range for an invalidation of @start..@end in @mm.
 *
 * @event says why the invalidation happens and @flags seeds range->flags.
 * range->owner is set to NULL so that a range built without an explicit owner
 * never exposes an indeterminate pointer to notifiers that compare it; use
 * mmu_notifier_range_init_owner() to supply an owner.
 */
static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
                                           enum mmu_notifier_event event,
                                           unsigned flags,
                                           struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        range->event = event;
        range->mm = mm;
        range->start = start;
        range->end = end;
        range->flags = flags;
        range->owner = NULL;
}

/*
 * Same as mmu_notifier_range_init(), but additionally records @owner in
 * range->owner.
 */
static inline void mmu_notifier_range_init_owner(
                        struct mmu_notifier_range *range,
                        enum mmu_notifier_event event, unsigned int flags,
                        struct mm_struct *mm, unsigned long start,
                        unsigned long end, void *owner)
{
        mmu_notifier_range_init(range, event, flags, mm, start, end);
        range->owner = owner;
}

/*
 * Run ptep_clear_flush_young() on the primary pte and OR in the result of
 * mmu_notifier_clear_flush_young() for the PAGE_SIZE range starting at
 * __address. Evaluates to the combined value. __vma and __address are
 * captured in locals, so each is evaluated once.
 */
#define ptep_clear_flush_young_notify(__vma, __address, __ptep)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = ptep_clear_flush_young(___vma, ___address, __ptep);        \
        __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,        \
                                                  ___address,                \
                                                  ___address +                \
                                                        PAGE_SIZE);        \
        __young;                                                        \
})

/*
 * PMD-level counterpart of ptep_clear_flush_young_notify(): runs
 * pmdp_clear_flush_young() and ORs in the result of
 * mmu_notifier_clear_flush_young() for the PMD_SIZE range starting at
 * __address. Evaluates to the combined value.
 */
#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = pmdp_clear_flush_young(___vma, ___address, __pmdp);        \
        __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,        \
                                                  ___address,                \
                                                  ___address +                \
                                                        PMD_SIZE);        \
        __young;                                                        \
})

/*
 * Run ptep_test_and_clear_young() on the primary pte and OR in the result of
 * mmu_notifier_clear_young() for the PAGE_SIZE range starting at __address.
 * Evaluates to the combined value.
 */
#define ptep_clear_young_notify(__vma, __address, __ptep)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
        __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,        \
                                            ___address + PAGE_SIZE);        \
        __young;                                                        \
})

/*
 * PMD-level counterpart of ptep_clear_young_notify(): runs
 * pmdp_test_and_clear_young() and ORs in the result of
 * mmu_notifier_clear_young() for the PMD_SIZE range starting at __address.
 * Evaluates to the combined value.
 */
#define pmdp_clear_young_notify(__vma, __address, __pmdp)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
        __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,        \
                                            ___address + PMD_SIZE);        \
        __young;                                                        \
})

#else /* CONFIG_MMU_NOTIFIER */

/*
 * Reduced range descriptor used when CONFIG_MMU_NOTIFIER is disabled: only
 * the address bounds are kept.
 */
struct mmu_notifier_range {
        unsigned long start;
        unsigned long end;
};

/*
 * Backend of the mmu_notifier_range_init() and
 * mmu_notifier_range_init_owner() macros when CONFIG_MMU_NOTIFIER is
 * disabled: records only the bounds of the range.
 */
static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
                                            unsigned long start,
                                            unsigned long end)
{
        range->end = end;
        range->start = start;
}

/*
 * Without CONFIG_MMU_NOTIFIER the event, flags, mm and owner arguments are
 * dropped (and not evaluated); only start and end are stored.
 */
#define mmu_notifier_range_init(range,event,flags,mm,start,end)  \
        _mmu_notifier_range_init(range, start, end)
#define mmu_notifier_range_init_owner(range, event, flags, mm, start, \
                                        end, owner) \
        _mmu_notifier_range_init(range, start, end)

/* Stub for !CONFIG_MMU_NOTIFIER: every range is reported as blockable. */
static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
        return true;
}

/* Stub for !CONFIG_MMU_NOTIFIER: an mm never has notifiers. */
static inline int mm_has_notifiers(struct mm_struct *mm)
{
        return 0;
}

/* Stub for !CONFIG_MMU_NOTIFIER: nothing to release. */
static inline void mmu_notifier_release(struct mm_struct *mm)
{
}

/* Stub for !CONFIG_MMU_NOTIFIER: always reports 0. */
static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end)
{
        return 0;
}

/* Stub for !CONFIG_MMU_NOTIFIER: always reports 0. */
static inline int mmu_notifier_test_young(struct mm_struct *mm,
                                          unsigned long address)
{
        return 0;
}

/* Stub for !CONFIG_MMU_NOTIFIER: no notifiers to call. */
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
}

/* Stub for !CONFIG_MMU_NOTIFIER: always succeeds (returns 0). */
static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
        return 0;
}

/* Stub for !CONFIG_MMU_NOTIFIER: no notifiers to call. */
static inline
void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
}

/* Stub for !CONFIG_MMU_NOTIFIER: no notifiers to call. */
static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
                                  unsigned long start, unsigned long end)
{
}

/* Stub for !CONFIG_MMU_NOTIFIER: nothing to initialise. */
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
{
}

/* Stub for !CONFIG_MMU_NOTIFIER: nothing to destroy. */
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
}

/* Stub for !CONFIG_MMU_NOTIFIER: always evaluates to false. */
#define mmu_notifier_range_update_to_read_only(r) false

/*
 * Without CONFIG_MMU_NOTIFIER the *_notify helpers are plain aliases of the
 * corresponding page-table primitives.
 */
#define ptep_clear_flush_young_notify ptep_clear_flush_young
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
#define ptep_clear_young_notify ptep_test_and_clear_young
#define pmdp_clear_young_notify pmdp_test_and_clear_young
#define        ptep_clear_flush_notify ptep_clear_flush
#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
#define pudp_huge_clear_flush_notify pudp_huge_clear_flush

/* Stub for !CONFIG_MMU_NOTIFIER: nothing to wait for. */
static inline void mmu_notifier_synchronize(void)
{
}

#endif /* CONFIG_MMU_NOTIFIER */

#endif /* _LINUX_MMU_NOTIFIER_H */






























   12 





   11 








































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LOCAL_LOCK_H
# error "Do not include directly, include linux/local_lock.h"
#endif

#include <linux/percpu-defs.h>
#include <linux/lockdep.h>

#ifndef CONFIG_PREEMPT_RT

/*
 * local_lock_t for !CONFIG_PREEMPT_RT. The structure is empty unless
 * CONFIG_DEBUG_LOCK_ALLOC is set, in which case it carries the lockdep map
 * and the task recorded by local_lock_acquire().
 */
typedef struct {
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map        dep_map;
        struct task_struct        *owner;
#endif
} local_lock_t;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Designated initializers for the debug members of local_lock_t; the lockdep
 * name is the stringified @lockname and the owner starts out as NULL.
 */
# define LOCAL_LOCK_DEBUG_INIT(lockname)                \
        .dep_map = {                                        \
                .name = #lockname,                        \
                .wait_type_inner = LD_WAIT_CONFIG,        \
                .lock_type = LD_LOCK_PERCPU,                \
        },                                                \
        .owner = NULL,

/*
 * Debug bookkeeping for taking @l: tell lockdep about the acquisition, warn
 * if an owner is already recorded, then record current as the owner.
 */
static inline void local_lock_acquire(local_lock_t *l)
{
        lock_map_acquire(&l->dep_map);
        DEBUG_LOCKS_WARN_ON(l->owner);
        l->owner = current;
}

/*
 * Debug bookkeeping for dropping @l: warn if current is not the recorded
 * owner, clear the owner, then tell lockdep about the release.
 */
static inline void local_lock_release(local_lock_t *l)
{
        DEBUG_LOCKS_WARN_ON(l->owner != current);
        l->owner = NULL;
        lock_map_release(&l->dep_map);
}

/* Reset the recorded owner of @l; used by the run-time init macros. */
static inline void local_lock_debug_init(local_lock_t *l)
{
        l->owner = NULL;
}
#else /* CONFIG_DEBUG_LOCK_ALLOC */
/*
 * Without CONFIG_DEBUG_LOCK_ALLOC local_lock_t has no members, so the debug
 * initializer expands to nothing and the debug helpers are empty.
 */
# define LOCAL_LOCK_DEBUG_INIT(lockname)
static inline void local_lock_acquire(local_lock_t *l) { }
static inline void local_lock_release(local_lock_t *l) { }
static inline void local_lock_debug_init(local_lock_t *l) { }
#endif /* !CONFIG_DEBUG_LOCK_ALLOC */

/* Static initializer for a local_lock_t named @lockname. */
#define INIT_LOCAL_LOCK(lockname)        { LOCAL_LOCK_DEBUG_INIT(lockname) }

/*
 * Run-time initialisation of a local_lock_t: registers a per-call-site
 * lockdep class (named after the stringified @lock, lock type
 * LD_LOCK_PERCPU) and clears the recorded owner.
 *
 * @lock is parenthesised wherever it is used in an expression so that a
 * compound argument (e.g. "base + i") is cast and dereferenced as a whole.
 */
#define __local_lock_init(lock)                                        \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        debug_check_no_locks_freed((void *)(lock), sizeof(*(lock)));\
        lockdep_init_map_type(&(lock)->dep_map, #lock, &__key,  \
                              0, LD_WAIT_CONFIG, LD_WAIT_INV,        \
                              LD_LOCK_PERCPU);                        \
        local_lock_debug_init(lock);                                \
} while (0)

/*
 * Same as __local_lock_init(), except that the lockdep class is registered
 * with lock type LD_LOCK_NORMAL instead of LD_LOCK_PERCPU.
 *
 * @lock is parenthesised wherever it is used in an expression so that a
 * compound argument (e.g. "base + i") is cast and dereferenced as a whole.
 */
#define __spinlock_nested_bh_init(lock)                                \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        debug_check_no_locks_freed((void *)(lock), sizeof(*(lock)));\
        lockdep_init_map_type(&(lock)->dep_map, #lock, &__key,  \
                              0, LD_WAIT_CONFIG, LD_WAIT_INV,        \
                              LD_LOCK_NORMAL);                        \
        local_lock_debug_init(lock);                                \
} while (0)

/*
 * !PREEMPT_RT: disable preemption, then do the debug acquire on this CPU's
 * instance of the per-CPU @lock.
 */
#define __local_lock(lock)                                        \
        do {                                                        \
                preempt_disable();                                \
                local_lock_acquire(this_cpu_ptr(lock));                \
        } while (0)

/*
 * !PREEMPT_RT: disable interrupts, then do the debug acquire on this CPU's
 * instance of the per-CPU @lock.
 */
#define __local_lock_irq(lock)                                        \
        do {                                                        \
                local_irq_disable();                                \
                local_lock_acquire(this_cpu_ptr(lock));                \
        } while (0)

/*
 * !PREEMPT_RT: save the interrupt state into @flags and disable interrupts,
 * then do the debug acquire on this CPU's instance of the per-CPU @lock.
 */
#define __local_lock_irqsave(lock, flags)                        \
        do {                                                        \
                local_irq_save(flags);                                \
                local_lock_acquire(this_cpu_ptr(lock));                \
        } while (0)

/* Reverse of __local_lock(): debug release first, then re-enable preemption. */
#define __local_unlock(lock)                                        \
        do {                                                        \
                local_lock_release(this_cpu_ptr(lock));                \
                preempt_enable();                                \
        } while (0)

/*
 * __local_unlock_irq - undo __local_lock_irq(): call local_lock_release()
 * on this CPU's instance of @lock first, then local_irq_enable().
 */
#define __local_unlock_irq(lock)                                \
        do {                                                        \
                local_lock_release(this_cpu_ptr(lock));                \
                local_irq_enable();                                \
        } while (0)

/*
 * __local_unlock_irqrestore - undo __local_lock_irqsave(): call
 * local_lock_release() on this CPU's instance of @lock first, then restore
 * the interrupt state from @flags with local_irq_restore().
 */
#define __local_unlock_irqrestore(lock, flags)                        \
        do {                                                        \
                local_lock_release(this_cpu_ptr(lock));                \
                local_irq_restore(flags);                        \
        } while (0)

/*
 * __local_lock_nested_bh - acquire @lock from a context that is already
 * in softirq.  The macro only asserts that fact with
 * lockdep_assert_in_softirq(); it does not disable preemption, interrupts
 * or bottom halves itself before calling local_lock_acquire() on this
 * CPU's instance of @lock.
 */
#define __local_lock_nested_bh(lock)                                \
        do {                                                        \
                lockdep_assert_in_softirq();                        \
                local_lock_acquire(this_cpu_ptr(lock));        \
        } while (0)

/*
 * __local_unlock_nested_bh - undo __local_lock_nested_bh(): a bare
 * local_lock_release() on this CPU's instance of @lock.  Expands to a
 * single expression (no do/while wrapper, no trailing semicolon).
 */
#define __local_unlock_nested_bh(lock)                                \
        local_lock_release(this_cpu_ptr(lock))

#else /* !CONFIG_PREEMPT_RT */

/*
 * On PREEMPT_RT local_lock maps to a per CPU spinlock, which protects the
 * critical section while staying preemptible.
 *
 * The lock/unlock macros in this branch therefore operate on a spinlock_t
 * obtained with this_cpu_ptr() and use spin_lock()/spin_unlock() on it.
 */
typedef spinlock_t local_lock_t;

/*
 * Static initializer for a local_lock_t in this branch: forwards @lockname
 * to __LOCAL_SPIN_LOCK_UNLOCKED().
 */
#define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))

/*
 * __local_lock_init - run-time initialisation of the lock pointed to by
 * @l; a thin wrapper around local_spin_lock_init().
 */
#define __local_lock_init(l)                                        \
        do {                                                        \
                local_spin_lock_init((l));                        \
        } while (0)

/*
 * __local_lock - disable migration with migrate_disable(), then take the
 * current CPU's instance of @__lock with spin_lock().  Preemption and
 * interrupts are not touched by this macro itself.
 */
#define __local_lock(__lock)                                        \
        do {                                                        \
                migrate_disable();                                \
                spin_lock(this_cpu_ptr((__lock)));                \
        } while (0)

/* In this branch the _irq variant is a plain alias of __local_lock(). */
#define __local_lock_irq(lock)                        __local_lock(lock)

/*
 * __local_lock_irqsave - same locking as __local_lock(); no interrupt state
 * is saved.  @flags is only type-checked as unsigned long and set to 0 so
 * that callers written for the irqsave API still have an initialised value.
 */
#define __local_lock_irqsave(lock, flags)                        \
        do {                                                        \
                typecheck(unsigned long, flags);                \
                flags = 0;                                        \
                __local_lock(lock);                                \
        } while (0)

/*
 * __local_unlock - undo __local_lock(): spin_unlock() the current CPU's
 * instance of @__lock first, then migrate_enable().
 */
#define __local_unlock(__lock)                                        \
        do {                                                        \
                spin_unlock(this_cpu_ptr((__lock)));                \
                migrate_enable();                                \
        } while (0)

/* In this branch the _irq unlock is a plain alias of __local_unlock(). */
#define __local_unlock_irq(lock)                __local_unlock(lock)

/* Alias of __local_unlock(); @flags is accepted but not used. */
#define __local_unlock_irqrestore(lock, flags)        __local_unlock(lock)

/*
 * __local_lock_nested_bh - assert the calling context with
 * lockdep_assert_in_softirq_func(), then spin_lock() the current CPU's
 * instance of @lock.  Unlike __local_lock() in this branch, no
 * migrate_disable() is issued here.
 */
#define __local_lock_nested_bh(lock)                                \
do {                                                                \
        lockdep_assert_in_softirq_func();                        \
        spin_lock(this_cpu_ptr(lock));                                \
} while (0)

/*
 * __local_unlock_nested_bh - undo __local_lock_nested_bh(): spin_unlock()
 * the current CPU's instance of @lock.
 */
#define __local_unlock_nested_bh(lock)                                \
do {                                                                \
        spin_unlock(this_cpu_ptr((lock)));                        \
} while (0)

#endif /* CONFIG_PREEMPT_RT */




































   18 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/fault-inject.h>
#include <linux/fault-inject-usercopy.h>

/*
 * State backing the "fail_usercopy" fault-injection attribute: a single
 * struct fault_attr, statically initialised with FAULT_ATTR_INITIALIZER.
 */
static struct {
        struct fault_attr attr;
} fail_usercopy = {
        .attr = FAULT_ATTR_INITIALIZER,
};

/*
 * Handler for the "fail_usercopy=" boot option: passes the option text to
 * setup_fault_attr() for fail_usercopy.attr and returns whatever that
 * helper returns.
 */
static int __init setup_fail_usercopy(char *str)
{
        int rc = setup_fault_attr(&fail_usercopy.attr, str);

        return rc;
}
__setup("fail_usercopy=", setup_fail_usercopy);

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

/*
 * Late initcall: create the "fail_usercopy" debugfs attribute directory
 * (no parent given) for fail_usercopy.attr.
 *
 * Returns 0 on success, or the error encoded in the pointer returned by
 * fault_create_debugfs_attr().
 */
static int __init fail_usercopy_debugfs(void)
{
        struct dentry *attr_dir = fault_create_debugfs_attr("fail_usercopy",
                                                            NULL,
                                                            &fail_usercopy.attr);

        if (!IS_ERR(attr_dir))
                return 0;

        return PTR_ERR(attr_dir);
}

late_initcall(fail_usercopy_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

/*
 * Ask the fault-injection core whether this call should be failed,
 * using fail_usercopy.attr and a size argument of 1.
 */
bool should_fail_usercopy(void)
{
        bool inject = should_fail(&fail_usercopy.attr, 1);

        return inject;
}
EXPORT_SYMBOL_GPL(should_fail_usercopy);






















    2 

    3 




    3 









    3 

    3 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
// SPDX-License-Identifier: GPL-2.0
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>

/*
 * This is an implementation of the notion of "decrement a
 * reference count, and return locked if it decremented to zero".
 *
 * NOTE NOTE NOTE! This is _not_ equivalent to
 *
 *        if (atomic_dec_and_test(&atomic)) {
 *                spin_lock(&lock);
 *                return 1;
 *        }
 *        return 0;
 *
 * because the spin-lock and the decrement must be
 * "atomic".
 */
int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
{
        int hit_zero;

        /*
         * Lockless fast path: the decrement is applied here as long as the
         * counter was not 1, i.e. as long as it cannot reach zero.
         */
        if (atomic_add_unless(atomic, -1, 1))
                return 0;

        /*
         * Slow path: the counter may reach zero, so decrement under the
         * lock.  The lock stays held only when zero was actually reached.
         */
        spin_lock(lock);
        hit_zero = atomic_dec_and_test(atomic) ? 1 : 0;
        if (!hit_zero)
                spin_unlock(lock);

        /* 1: counter hit zero, @lock is held.  0: @lock is not held. */
        return hit_zero;
}

EXPORT_SYMBOL(_atomic_dec_and_lock);

/*
 * Variant of _atomic_dec_and_lock() that takes @lock with
 * spin_lock_irqsave(), storing the saved state in *@flags.
 *
 * Returns 1 with @lock held when the counter dropped to zero; otherwise
 * returns 0 with @lock released (via spin_unlock_irqrestore() if the slow
 * path was entered).
 */
int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock,
                                 unsigned long *flags)
{
        int hit_zero;

        /* Lockless fast path: decrement unless the counter is currently 1. */
        if (atomic_add_unless(atomic, -1, 1))
                return 0;

        /* Slow path: the final decrement must happen under the lock. */
        spin_lock_irqsave(lock, *flags);
        hit_zero = atomic_dec_and_test(atomic) ? 1 : 0;
        if (!hit_zero)
                spin_unlock_irqrestore(lock, *flags);

        return hit_zero;
}
EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave);

/*
 * raw_spinlock_t flavour of _atomic_dec_and_lock().
 *
 * Returns 1 with @lock held when the counter dropped to zero, 0 with
 * @lock not held otherwise.
 */
int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock)
{
        int hit_zero;

        /* Lockless fast path: decrement unless the counter is currently 1. */
        if (atomic_add_unless(atomic, -1, 1))
                return 0;

        /* Slow path: the final decrement must happen under the lock. */
        raw_spin_lock(lock);
        hit_zero = atomic_dec_and_test(atomic) ? 1 : 0;
        if (!hit_zero)
                raw_spin_unlock(lock);

        return hit_zero;
}
EXPORT_SYMBOL(_atomic_dec_and_raw_lock);

/*
 * raw_spinlock_t flavour of _atomic_dec_and_lock_irqsave(); the saved
 * state is stored in *@flags by raw_spin_lock_irqsave().
 *
 * Returns 1 with @lock held when the counter dropped to zero; otherwise
 * returns 0 with @lock released.
 */
int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock,
                                     unsigned long *flags)
{
        int hit_zero;

        /* Lockless fast path: decrement unless the counter is currently 1. */
        if (atomic_add_unless(atomic, -1, 1))
                return 0;

        /* Slow path: the final decrement must happen under the lock. */
        raw_spin_lock_irqsave(lock, *flags);
        hit_zero = atomic_dec_and_test(atomic) ? 1 : 0;
        if (!hit_zero)
                raw_spin_unlock_irqrestore(lock, *flags);

        return hit_zero;
}
EXPORT_SYMBOL(_atomic_dec_and_raw_lock_irqsave);












































































    1 








































    1 

















    1 
























































































































































































   24 






















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net>
 *
 * Based on the original implementation which is:
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  Parts of the original code have been moved to arch/x86/vdso/vma.c
 *
 * This file implements vsyscall emulation.  vsyscalls are a legacy ABI:
 * Userspace can request certain kernel services by calling fixed
 * addresses.  This concept is problematic:
 *
 * - It interferes with ASLR.
 * - It's awkward to write code that lives in kernel addresses but is
 *   callable by userspace at fixed addresses.
 * - The whole concept is impossible for 32-bit compat userspace.
 * - UML cannot easily virtualize a vsyscall.
 *
 * As of mid-2014, I believe that there is no new userspace code that
 * will use a vsyscall if the vDSO is present.  I hope that there will
 * soon be no new userspace code that will ever use a vsyscall.
 *
 * The code in this file emulates vsyscalls when notified of a page
 * fault to a vsyscall address.
 */

#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/sched/signal.h>
#include <linux/mm_types.h>
#include <linux/syscalls.h>
#include <linux/ratelimit.h>

#include <asm/vsyscall.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/traps.h>
#include <asm/paravirt.h>

#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"

/*
 * Active vsyscall mode.  The build-time default is NONE or XONLY, chosen by
 * CONFIG_LEGACY_VSYSCALL_NONE / CONFIG_LEGACY_VSYSCALL_XONLY; a build with
 * neither option set is rejected with #error.  EMULATE is never a
 * build-time default here and can only be selected at boot through the
 * "vsyscall" parameter handled by vsyscall_setup().
 */
static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
#ifdef CONFIG_LEGACY_VSYSCALL_NONE
        NONE;
#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
        XONLY;
#else
        #error VSYSCALL config is broken
#endif

/*
 * Parse the "vsyscall" early parameter.
 *
 * Accepts exactly "emulate", "xonly" or "none" and stores the matching
 * mode in vsyscall_mode.  Returns 0 on a recognised value and -EINVAL for
 * a missing (NULL) or unrecognised one, leaving vsyscall_mode untouched.
 */
static int __init vsyscall_setup(char *str)
{
        if (!str)
                return -EINVAL;

        if (strcmp("emulate", str) == 0) {
                vsyscall_mode = EMULATE;
                return 0;
        }

        if (strcmp("xonly", str) == 0) {
                vsyscall_mode = XONLY;
                return 0;
        }

        if (strcmp("none", str) == 0) {
                vsyscall_mode = NONE;
                return 0;
        }

        return -EINVAL;
}
early_param("vsyscall", vsyscall_setup);

/*
 * Emit a rate-limited log line describing a rejected vsyscall.
 * @level:   printk level prefix (a KERN_* string)
 * @regs:    user registers; ip, cs, sp, ax, si and di are printed
 * @message: free-form reason
 *
 * Nothing is printed when show_unhandled_signals is zero.
 */
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
                              const char *message)
{
        if (show_unhandled_signals) {
                printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n",
                                   level, current->comm, task_pid_nr(current),
                                   message, regs->ip, regs->cs,
                                   regs->sp, regs->ax, regs->si, regs->di);
        }
}

/*
 * Translate a faulting address into a vsyscall slot number.
 *
 * With bits 10-11 masked off the address must equal VSYSCALL_ADDR; those
 * two bits then select the slot.  Slots 0, 1 and 2 are valid.  Returns the
 * slot number, or -EINVAL for any other address (including slot 3).
 */
static int addr_to_vsyscall_nr(unsigned long addr)
{
        const unsigned long slot_mask = 0xC00UL;
        int slot;

        if ((addr & ~slot_mask) != VSYSCALL_ADDR)
                return -EINVAL;

        slot = (addr & slot_mask) >> 10;

        return slot < 3 ? slot : -EINVAL;
}

/*
 * Check that [@ptr, @ptr + @size) passes access_ok().
 *
 * Returns true when it does.  Otherwise records a user-mode write page
 * fault at @ptr in current->thread (error_code, cr2, trap_nr), sends
 * SIGSEGV/SEGV_MAPERR for that address via force_sig_fault() and returns
 * false.
 */
static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
        struct thread_struct *t;

        if (access_ok((void __user *)ptr, size))
                return true;

        t = &current->thread;
        t->error_code = X86_PF_USER | X86_PF_WRITE;
        t->cr2 = ptr;
        t->trap_nr = X86_TRAP_PF;

        force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr);

        return false;
}

/*
 * emulate_vsyscall - try to handle a page fault as a legacy vsyscall.
 * @error_code: page-fault error code (X86_PF_* bits)
 * @regs:       user register state at the fault
 * @address:    faulting address
 *
 * Returns false when the fault is not consumed here: a write or
 * kernel-privilege fault, a data read of the vsyscall page, or an
 * instruction fetch while vsyscall_mode is NONE.
 *
 * Returns true once the fault has been consumed.  Either the call was
 * emulated (regs->ax holds the result, and regs->ip / regs->sp have been
 * updated as a "ret" would), or a signal has been raised for the task:
 * SIGSEGV for a bad address, bad stack or faulting pointer argument,
 * SIGSYS (via force_exit_sig()) when seccomp altered the syscall number
 * or the instruction pointer.
 */
bool emulate_vsyscall(unsigned long error_code,
                      struct pt_regs *regs, unsigned long address)
{
        unsigned long caller;
        int vsyscall_nr, syscall_nr, tmp;
        long ret;
        unsigned long orig_dx;

        /* Write faults or kernel-privilege faults never get fixed up. */
        if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
                return false;

        if (!(error_code & X86_PF_INSTR)) {
                /* Failed vsyscall read */
                if (vsyscall_mode == EMULATE)
                        return false;

                /*
                 * User code tried and failed to read the vsyscall page.
                 */
                warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
                return false;
        }

        /*
         * No point in checking CS -- the only way to get here is a user mode
         * trap to a high address, which means that we're in 64-bit user code.
         */

        WARN_ON_ONCE(address != regs->ip);

        if (vsyscall_mode == NONE) {
                warn_bad_vsyscall(KERN_INFO, regs,
                                  "vsyscall attempted with vsyscall=none");
                return false;
        }

        /* Negative on an address that is not one of the three entry points. */
        vsyscall_nr = addr_to_vsyscall_nr(address);

        trace_emulate_vsyscall(vsyscall_nr);

        if (vsyscall_nr < 0) {
                warn_bad_vsyscall(KERN_WARNING, regs,
                                  "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
                goto sigsegv;
        }

        /*
         * Read the return address from the top of the user stack; it is
         * what the emulated "ret" at do_ret jumps back to.
         */
        if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
                warn_bad_vsyscall(KERN_WARNING, regs,
                                  "vsyscall with bad stack (exploit attempt?)");
                goto sigsegv;
        }

        /*
         * Check for access_ok violations and find the syscall nr.
         *
         * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
         * 64-bit, so we don't need to special-case it here.  For all the
         * vsyscalls, NULL means "don't write anything" not "write it at
         * address 0".
         */
        switch (vsyscall_nr) {
        case 0:
                if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) ||
                    !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
                        ret = -EFAULT;
                        goto check_fault;
                }

                syscall_nr = __NR_gettimeofday;
                break;

        case 1:
                if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) {
                        ret = -EFAULT;
                        goto check_fault;
                }

                syscall_nr = __NR_time;
                break;

        case 2:
                if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
                    !write_ok_or_segv(regs->si, sizeof(unsigned))) {
                        ret = -EFAULT;
                        goto check_fault;
                }

                syscall_nr = __NR_getcpu;
                break;
        }

        /*
         * Handle seccomp.  regs->ip must be the original value.
         * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst.
         *
         * We could optimize the seccomp disabled case, but performance
         * here doesn't matter.
         */
        regs->orig_ax = syscall_nr;
        regs->ax = -ENOSYS;
        tmp = secure_computing();
        /*
         * A filter that lets the call proceed (tmp == 0) must not have
         * changed the syscall number, and no filter may have moved ip.
         */
        if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
                warn_bad_vsyscall(KERN_DEBUG, regs,
                                  "seccomp tried to change syscall nr or ip");
                force_exit_sig(SIGSYS);
                return true;
        }
        regs->orig_ax = -1;
        if (tmp)
                goto do_ret;  /* skip requested */

        /*
         * With a real vsyscall, page faults cause SIGSEGV.
         */
        ret = -EFAULT;
        switch (vsyscall_nr) {
        case 0:
                /* this decodes regs->di and regs->si on its own */
                ret = __x64_sys_gettimeofday(regs);
                break;

        case 1:
                /* this decodes regs->di on its own */
                ret = __x64_sys_time(regs);
                break;

        case 2:
                /* while we could clobber regs->dx, we didn't in the past... */
                orig_dx = regs->dx;
                regs->dx = 0;
                /* this decodes regs->di, regs->si and regs->dx on its own */
                ret = __x64_sys_getcpu(regs);
                regs->dx = orig_dx;
                break;
        }

check_fault:
        /* Reached both from the access_ok pre-checks and after the call. */
        if (ret == -EFAULT) {
                /* Bad news -- userspace fed a bad pointer to a vsyscall. */
                warn_bad_vsyscall(KERN_INFO, regs,
                                  "vsyscall fault (exploit attempt?)");
                goto sigsegv;
        }

        regs->ax = ret;

do_ret:
        /* Emulate a ret instruction. */
        regs->ip = caller;
        regs->sp += 8;
        return true;

sigsegv:
        force_sig(SIGSEGV);
        return true;
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
/* vm_ops->name callback for the gate VMA; @vma is not inspected. */
static const char *gate_vma_name(struct vm_area_struct *vma)
{
        static const char vsyscall_vma_name[] = "[vsyscall]";

        return vsyscall_vma_name;
}
/* Operations of the gate VMA: only the ->name callback is provided. */
static const struct vm_operations_struct gate_vma_ops = {
        .name = gate_vma_name,
};
/*
 * The pseudo VMA itself: exactly one page at VSYSCALL_ADDR, initially
 * readable and executable.  map_vsyscall() reduces the flags to VM_EXEC
 * when vsyscall_mode is XONLY.
 */
static struct vm_area_struct gate_vma __ro_after_init = {
        .vm_start        = VSYSCALL_ADDR,
        .vm_end                = VSYSCALL_ADDR + PAGE_SIZE,
        .vm_page_prot        = PAGE_READONLY_EXEC,
        .vm_flags        = VM_READ | VM_EXEC,
        .vm_ops                = &gate_vma_ops,
};

/*
 * Return the gate VMA applicable to @mm, or NULL when there is none.
 *
 * With CONFIG_COMPAT, a NULL @mm or an mm without the
 * MM_CONTEXT_HAS_VSYSCALL context flag has no gate VMA.  Independently of
 * that, there is none when vsyscall_mode is NONE.
 */
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_COMPAT
        if (!mm)
                return NULL;
        if (!test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags))
                return NULL;
#endif
        return vsyscall_mode == NONE ? NULL : &gate_vma;
}

/*
 * Return 1 when @addr lies inside the gate VMA of @mm
 * (vm_start <= addr < vm_end), 0 when it does not or when @mm has no
 * gate VMA.
 */
int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *gate = get_gate_vma(mm);

        if (!gate)
                return 0;
        if (addr < gate->vm_start)
                return 0;

        return addr < gate->vm_end;
}

/*
 * Use this when you have no reliable mm, typically from interrupt
 * context. It is less reliable than using a task's mm and may give
 * false positives.
 */
int in_gate_area_no_mm(unsigned long addr)
{
        /* No vsyscall page at all in this mode. */
        if (vsyscall_mode == NONE)
                return 0;

        /* Otherwise: does @addr fall on the page starting at VSYSCALL_ADDR? */
        return (addr & PAGE_MASK) == VSYSCALL_ADDR;
}

/*
 * The VSYSCALL page is the only user-accessible page in the kernel address
 * range.  Normally, the kernel page tables can have _PAGE_USER clear, but
 * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
 * are enabled.
 *
 * Some day we may create a "minimal" vsyscall mode in which we emulate
 * vsyscalls but leave the page not present.  If so, we skip calling
 * this.
 */
void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        /*
         * Walk from @root down to the PMD entry that covers VSYSCALL_ADDR,
         * OR-ing _PAGE_USER into the entry at each level as it is visited.
         * Each level is updated before the next lower table is looked up
         * through it.  The PTE level is not touched here.
         */
        pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
        set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
        p4d = p4d_offset(pgd, VSYSCALL_ADDR);
        /* The P4D entry is only written when built with 5 or more levels. */
#if CONFIG_PGTABLE_LEVELS >= 5
        set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER));
#endif
        pud = pud_offset(p4d, VSYSCALL_ADDR);
        set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
        pmd = pmd_offset(pud, VSYSCALL_ADDR);
        set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
}

/*
 * Boot-time setup of the vsyscall page according to vsyscall_mode:
 *
 *   EMULATE - install __vsyscall_page at the VSYSCALL_PAGE fixmap slot with
 *             PAGE_KERNEL_VVAR protection and set the user bits on the
 *             page-table path in swapper_pg_dir.
 *   XONLY   - map nothing; only reduce the gate VMA's flags to VM_EXEC.
 *   NONE    - nothing to do.
 *
 * Also checks at build time that the fixmap slot resolves to VSYSCALL_ADDR.
 */
void __init map_vsyscall(void)
{
        extern char __vsyscall_page;
        unsigned long vsyscall_pa = __pa_symbol(&__vsyscall_page);

        switch (vsyscall_mode) {
        case EMULATE:
                /* For full emulation, the page needs to exist for real. */
                __set_fixmap(VSYSCALL_PAGE, vsyscall_pa,
                             PAGE_KERNEL_VVAR);
                set_vsyscall_pgtable_user_bits(swapper_pg_dir);
                break;
        case XONLY:
                /* Execute-only: no PTE at all backs the vsyscall page. */
                vm_flags_init(&gate_vma, VM_EXEC);
                break;
        default:
                break;
        }

        BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
                     (unsigned long)VSYSCALL_ADDR);
}


























































































































    1 







    1 



    1 







    1 





    1 

    1 
    1 

























































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
// SPDX-License-Identifier: GPL-2.0
/*
 * IPv6 Address Label subsystem
 * for the IPv6 "Default" Source Address Selection
 *
 * Copyright (C)2007 USAGI/WIDE Project
 */
/*
 * Author:
 *        YOSHIFUJI Hideaki @ USAGI/WIDE Project <yoshfuji@linux-ipv6.org>
 */

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/in6.h>
#include <linux/slab.h>
#include <net/addrconf.h>
#include <linux/if_addrlabel.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

/*
 * Debug trace helper.  As written ("#if 0") ADDRLABEL() expands to an empty
 * statement and its arguments are discarded; switching the condition to a
 * non-zero value routes the arguments to printk() instead.
 */
#if 0
#define ADDRLABEL(x...) printk(x)
#else
#define ADDRLABEL(x...) do { ; } while (0)
#endif

/*
 * Policy Table
 */
/* One row of the per-namespace address label table. */
struct ip6addrlbl_entry {
        struct in6_addr prefix;         /* stored already masked to prefixlen by ip6addrlbl_alloc() */
        int prefixlen;                  /* number of significant bits in prefix */
        int ifindex;                    /* 0 matches any interface in __ip6addrlbl_match() */
        int addrtype;                   /* 0 matches any type; else a MAPPED/COMPATv4/LOOPBACK type bit */
        u32 label;                      /* label reported for a matching address */
        struct hlist_node list;         /* link in net->ipv6.ip6addrlbl_table.head */
        struct rcu_head rcu;            /* used by kfree_rcu() when the entry is removed or replaced */
};

/*
 * Default policy table (RFC6724 + extensions)
 *
 * prefix                addr_type        label
 * -------------------------------------------------------------------------
 * ::1/128                LOOPBACK        0
 * ::/0                        N/A                1
 * 2002::/16                N/A                2
 * ::/96                COMPATv4        3
 * ::ffff:0:0/96        V4MAPPED        4
 * fc00::/7                N/A                5                ULA (RFC 4193)
 * 2001::/32                N/A                6                Teredo (RFC 4380)
 * 2001:10::/28                N/A                7                ORCHID (RFC 4843)
 * fec0::/10                N/A                11                Site-local
 *                                                        (deprecated by RFC3879)
 * 3ffe::/16                N/A                12                6bone
 *
 * Note: 0xffffffff is used if we do not have any policies.
 * Note: Labels for ULA and 6to4 are different from labels listed in RFC6724.
 */

#define IPV6_ADDR_LABEL_DEFAULT        0xffffffffUL

/*
 * Initial contents of the label table, one element per row of the default
 * policy table documented above.  A field left out of an element (for
 * example .prefixlen of the ::/0 row) is zero.
 * NOTE(review): the code that consumes this array is outside this part of
 * the file; presumably it is walked once per network namespace at init.
 */
static const __net_initconst struct ip6addrlbl_init_table
{
        const struct in6_addr *prefix;
        int prefixlen;
        u32 label;
} ip6addrlbl_init_table[] = {
        {        /* ::/0 */
                .prefix = &in6addr_any,
                .label = 1,
        }, {        /* fc00::/7 */
                .prefix = &(struct in6_addr){ { { 0xfc } } } ,
                .prefixlen = 7,
                .label = 5,
        }, {        /* fec0::/10 */
                .prefix = &(struct in6_addr){ { { 0xfe, 0xc0 } } },
                .prefixlen = 10,
                .label = 11,
        }, {        /* 2002::/16 */
                .prefix = &(struct in6_addr){ { { 0x20, 0x02 } } },
                .prefixlen = 16,
                .label = 2,
        }, {        /* 3ffe::/16 */
                .prefix = &(struct in6_addr){ { { 0x3f, 0xfe } } },
                .prefixlen = 16,
                .label = 12,
        }, {        /* 2001::/32 */
                .prefix = &(struct in6_addr){ { { 0x20, 0x01 } } },
                .prefixlen = 32,
                .label = 6,
        }, {        /* 2001:10::/28 */
                .prefix = &(struct in6_addr){ { { 0x20, 0x01, 0x00, 0x10 } } },
                .prefixlen = 28,
                .label = 7,
        }, {        /* ::ffff:0:0/96 */
                .prefix = &(struct in6_addr){ { { [10] = 0xff, [11] = 0xff } } },
                .prefixlen = 96,
                .label = 4,
        }, {        /* ::/96 */
                .prefix = &in6addr_any,
                .prefixlen = 96,
                .label = 3,
        }, {        /* ::1/128 */
                .prefix = &in6addr_loopback,
                .prefixlen = 128,
                .label = 0,
        }
};

/*
 * Does table entry @p apply to @addr?
 *
 * An entry applies when its ifindex is 0 or equals @ifindex, its addrtype
 * is 0 or equals @addrtype, and @addr shares the entry's prefix over
 * p->prefixlen bits.  The checks are evaluated in that order.
 */
static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p,
                               const struct in6_addr *addr,
                               int addrtype, int ifindex)
{
        return (!p->ifindex || p->ifindex == ifindex) &&
               (!p->addrtype || p->addrtype == addrtype) &&
               ipv6_prefix_equal(addr, &p->prefix, p->prefixlen);
}

/*
 * Return the first entry of @net's label table that matches
 * (@addr, @type, @ifindex) according to __ip6addrlbl_match(), or NULL when
 * no entry matches.  The list is traversed with hlist_for_each_entry_rcu().
 */
static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net,
                                                  const struct in6_addr *addr,
                                                  int type, int ifindex)
{
        struct ip6addrlbl_entry *entry;

        hlist_for_each_entry_rcu(entry, &net->ipv6.ip6addrlbl_table.head, list) {
                if (!__ip6addrlbl_match(entry, addr, type, ifindex))
                        continue;
                return entry;
        }

        return NULL;
}

/*
 * Look up the label for @addr in @net's label table.
 *
 * @type is first reduced to the MAPPED / COMPATv4 / LOOPBACK bits.  The
 * lookup runs inside an RCU read-side critical section.  Returns the label
 * of the first matching entry, or IPV6_ADDR_LABEL_DEFAULT when nothing
 * matches.
 */
u32 ipv6_addr_label(struct net *net,
                    const struct in6_addr *addr, int type, int ifindex)
{
        struct ip6addrlbl_entry *hit;
        u32 label = IPV6_ADDR_LABEL_DEFAULT;

        type &= IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK;

        rcu_read_lock();
        hit = __ipv6_addr_label(net, addr, type, ifindex);
        if (hit)
                label = hit->label;
        rcu_read_unlock();

        ADDRLABEL(KERN_DEBUG "%s(addr=%pI6, type=%d, ifindex=%d) => %08x\n",
                  __func__, addr, type, ifindex, label);

        return label;
}

/*
 * Allocate and fill one table entry (not yet linked into any list).
 *
 * The entry's addrtype is derived from @prefix, restricted to the
 * MAPPED / COMPATv4 / LOOPBACK bits, and kept only when @prefixlen is the
 * natural length for that type (96, 96 and 128 respectively); otherwise it
 * is cleared to 0.  A v4-mapped prefix longer than 96 bits is rejected.
 * The stored prefix is @prefix masked to @prefixlen bits.
 *
 * Returns the new entry, ERR_PTR(-EINVAL) for the rejected case or
 * ERR_PTR(-ENOMEM) when the GFP_KERNEL allocation fails.
 */
static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix,
                                                 int prefixlen, int ifindex,
                                                 u32 label)
{
        struct ip6addrlbl_entry *entry;
        int addrtype;

        ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u)\n",
                  __func__, prefix, prefixlen, ifindex, (unsigned int)label);

        addrtype = ipv6_addr_type(prefix);
        addrtype &= IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK;

        if (addrtype == IPV6_ADDR_MAPPED) {
                if (prefixlen > 96)
                        return ERR_PTR(-EINVAL);
                if (prefixlen < 96)
                        addrtype = 0;
        } else if (addrtype == IPV6_ADDR_COMPATv4) {
                if (prefixlen != 96)
                        addrtype = 0;
        } else if (addrtype == IPV6_ADDR_LOOPBACK) {
                if (prefixlen != 128)
                        addrtype = 0;
        }

        entry = kmalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return ERR_PTR(-ENOMEM);

        ipv6_addr_prefix(&entry->prefix, prefix, prefixlen);
        entry->prefixlen = prefixlen;
        entry->ifindex = ifindex;
        entry->addrtype = addrtype;
        entry->label = label;
        INIT_HLIST_NODE(&entry->list);

        return entry;
}

/*
 * Link @newp into @net's label table.  ip6addrlbl_add() calls this with
 * net->ipv6.ip6addrlbl_table.lock held.
 *
 * An existing entry with the same prefix, prefixlen and ifindex is either
 * replaced in place (@replace non-zero; the old entry is freed with
 * kfree_rcu()) or makes the call fail with -EEXIST (@replace zero).
 *
 * Otherwise @newp is inserted in front of the first entry that has a
 * shorter prefixlen, or that has the same prefixlen and no ifindex; if no
 * such entry exists it goes to the tail (or becomes the head of an empty
 * list).  This keeps longer prefixes ahead of shorter ones and, for equal
 * lengths, interface-specific entries ahead of the interface-wildcard one.
 *
 * Returns 0 on success, in which case the table's seq counter is
 * incremented, or -EEXIST as described above; @newp is not linked in the
 * failure case.
 */
static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp,
                            int replace)
{
        struct ip6addrlbl_entry *last = NULL, *p = NULL;
        struct hlist_node *n;
        int ret = 0;

        ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp,
                  replace);

        hlist_for_each_entry_safe(p, n,        &net->ipv6.ip6addrlbl_table.head, list) {
                if (p->prefixlen == newp->prefixlen &&
                    p->ifindex == newp->ifindex &&
                    ipv6_addr_equal(&p->prefix, &newp->prefix)) {
                        /* Exact duplicate of an existing key. */
                        if (!replace) {
                                ret = -EEXIST;
                                goto out;
                        }
                        hlist_replace_rcu(&p->list, &newp->list);
                        kfree_rcu(p, rcu);
                        goto out;
                } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) ||
                           (p->prefixlen < newp->prefixlen)) {
                        /* First entry that must sort after @newp. */
                        hlist_add_before_rcu(&newp->list, &p->list);
                        goto out;
                }
                last = p;
        }
        /* Nothing sorts after @newp: append, or start the list. */
        if (last)
                hlist_add_behind_rcu(&newp->list, &last->list);
        else
                hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head);
out:
        if (!ret)
                WRITE_ONCE(net->ipv6.ip6addrlbl_table.seq,
                           net->ipv6.ip6addrlbl_table.seq + 1);
        return ret;
}

/*
 * Allocate an entry for (@prefix/@prefixlen, @ifindex, @label) and insert
 * it into @net's label table under the table spinlock.
 *
 * Returns 0 on success.  Allocation/validation errors from
 * ip6addrlbl_alloc() are returned as is; when __ip6addrlbl_add() fails the
 * freshly allocated entry is freed and its error is returned.
 */
static int ip6addrlbl_add(struct net *net,
                          const struct in6_addr *prefix, int prefixlen,
                          int ifindex, u32 label, int replace)
{
        struct ip6addrlbl_entry *entry;
        int err;

        ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n",
                  __func__, prefix, prefixlen, ifindex, (unsigned int)label,
                  replace);

        entry = ip6addrlbl_alloc(prefix, prefixlen, ifindex, label);
        if (IS_ERR(entry))
                return PTR_ERR(entry);

        spin_lock(&net->ipv6.ip6addrlbl_table.lock);
        err = __ip6addrlbl_add(net, entry, replace);
        spin_unlock(&net->ipv6.ip6addrlbl_table.lock);

        if (!err)
                return 0;

        /* The entry was never linked into the table, so free it directly. */
        kfree(entry);
        return err;
}

/*
 * Remove the first entry of @net's label list whose prefix length,
 * ifindex and prefix all equal the given values. The entry is unlinked
 * with hlist_del_rcu() and released with kfree_rcu().
 *
 * ip6addrlbl_del() calls this with the table spinlock held.
 *
 * Returns 0 if an entry was removed, -ESRCH if none matched.
 */
static int __ip6addrlbl_del(struct net *net,
                            const struct in6_addr *prefix, int prefixlen,
                            int ifindex)
{
        struct ip6addrlbl_entry *cur = NULL;
        struct hlist_node *next;

        ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n",
                  __func__, prefix, prefixlen, ifindex);

        hlist_for_each_entry_safe(cur, next, &net->ipv6.ip6addrlbl_table.head, list) {
                if (cur->prefixlen != prefixlen ||
                    cur->ifindex != ifindex ||
                    !ipv6_addr_equal(&cur->prefix, prefix))
                        continue;

                hlist_del_rcu(&cur->list);
                kfree_rcu(cur, rcu);
                return 0;
        }

        return -ESRCH;
}

static int ip6addrlbl_del(struct net *net,
                          const struct in6_addr *prefix, int prefixlen,
                          int ifindex)
{
        struct in6_addr prefix_buf;
        int ret;

        ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n",
                  __func__, prefix, prefixlen, ifindex);

        ipv6_addr_prefix(&prefix_buf, prefix, prefixlen);
        spin_lock(&net->ipv6.ip6addrlbl_table.lock);
        ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex);
        spin_unlock(&net->ipv6.ip6addrlbl_table.lock);
        return ret;
}

/*
 * Per-namespace initialisation: set up the table lock and list head, then
 * add every entry of ip6addrlbl_init_table with ifindex 0 and without
 * replacement.
 *
 * If any addition fails, all entries added so far are unlinked and
 * released with kfree_rcu(), and that error is returned. Returns 0 on
 * success.
 */
static int __net_init ip6addrlbl_net_init(struct net *net)
{
        struct ip6addrlbl_entry *cur = NULL;
        struct hlist_node *next;
        int err = 0;
        int i;

        ADDRLABEL(KERN_DEBUG "%s\n", __func__);

        spin_lock_init(&net->ipv6.ip6addrlbl_table.lock);
        INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head);

        for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) {
                err = ip6addrlbl_add(net,
                                     ip6addrlbl_init_table[i].prefix,
                                     ip6addrlbl_init_table[i].prefixlen,
                                     0,
                                     ip6addrlbl_init_table[i].label, 0);
                if (err)
                        break;
        }

        if (!err)
                return 0;

        /* Undo the partial setup before reporting the failure. */
        hlist_for_each_entry_safe(cur, next, &net->ipv6.ip6addrlbl_table.head, list) {
                hlist_del_rcu(&cur->list);
                kfree_rcu(cur, rcu);
        }
        return err;
}

/*
 * Per-namespace teardown: with the table spinlock held, unlink every
 * label entry of @net and release it with kfree_rcu().
 */
static void __net_exit ip6addrlbl_net_exit(struct net *net)
{
        struct ip6addrlbl_entry *entry = NULL;
        struct hlist_node *next;

        spin_lock(&net->ipv6.ip6addrlbl_table.lock);

        /* Drop every label owned by the namespace that is going away. */
        hlist_for_each_entry_safe(entry, next, &net->ipv6.ip6addrlbl_table.head, list) {
                hlist_del_rcu(&entry->list);
                kfree_rcu(entry, rcu);
        }

        spin_unlock(&net->ipv6.ip6addrlbl_table.lock);
}

/* Per-network-namespace init/exit hooks for the address label table. */
static struct pernet_operations ipv6_addr_label_ops = {
        .exit = ip6addrlbl_net_exit,
        .init = ip6addrlbl_net_init,
};

/*
 * Register the address label per-namespace operations.
 * Returns the result of register_pernet_subsys().
 */
int __init ipv6_addr_label_init(void)
{
        int err = register_pernet_subsys(&ipv6_addr_label_ops);

        return err;
}

/*
 * Counterpart of ipv6_addr_label_init(): unregister the address label
 * per-namespace operations.
 */
void ipv6_addr_label_cleanup(void)
{
        unregister_pernet_subsys(&ipv6_addr_label_ops);
}

/*
 * Netlink attribute policy used when parsing address label messages.
 * Neither attribute sets a .type, only a .len.
 * NOTE(review): for an untyped policy entry .len is presumably enforced as
 * the minimum payload length - confirm against the nla_policy documentation.
 */
static const struct nla_policy ifal_policy[IFAL_MAX + 1] = {
        [IFAL_ADDRESS]  = { .len = sizeof(struct in6_addr) },
        [IFAL_LABEL]    = { .len = sizeof(u32) },
};

/*
 * Report whether @net currently has a device with index @ifindex.
 *
 * Only the result of the lookup is kept; the device pointer itself is
 * not used after the RCU read-side section ends.
 */
static bool addrlbl_ifindex_exists(struct net *net, int ifindex)
{
        bool found;

        rcu_read_lock();
        found = dev_get_by_index_rcu(net, ifindex) != NULL;
        rcu_read_unlock();

        return found;
}

/*
 * Netlink handler registered for both RTM_NEWADDRLABEL and
 * RTM_DELADDRLABEL.
 *
 * The request must be for AF_INET6 with a prefix length of at most 128,
 * and must carry both IFAL_ADDRESS and IFAL_LABEL; a label equal to
 * IPV6_ADDR_LABEL_DEFAULT is rejected. These checks apply to both message
 * types. For RTM_NEWADDRLABEL a non-zero interface index must also name an
 * existing device in the socket's namespace.
 *
 * Returns 0 on success, a parse error, -EINVAL for any failed check,
 * the result of ip6addrlbl_add()/ip6addrlbl_del(), or -EOPNOTSUPP for
 * any other message type.
 */
static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *tb[IFAL_MAX+1];
        struct ifaddrlblmsg *ifal;
        struct in6_addr *pfx;
        u32 label;
        int err;

        err = nlmsg_parse_deprecated(nlh, sizeof(*ifal), tb, IFAL_MAX,
                                     ifal_policy, extack);
        if (err < 0)
                return err;

        ifal = nlmsg_data(nlh);
        if (ifal->ifal_family != AF_INET6)
                return -EINVAL;
        if (ifal->ifal_prefixlen > 128)
                return -EINVAL;

        /* Both attributes are mandatory, for deletion as well. */
        if (!tb[IFAL_ADDRESS] || !tb[IFAL_LABEL])
                return -EINVAL;

        pfx = nla_data(tb[IFAL_ADDRESS]);
        label = nla_get_u32(tb[IFAL_LABEL]);
        if (label == IPV6_ADDR_LABEL_DEFAULT)
                return -EINVAL;

        if (nlh->nlmsg_type == RTM_NEWADDRLABEL) {
                if (ifal->ifal_index &&
                    !addrlbl_ifindex_exists(net, ifal->ifal_index))
                        return -EINVAL;

                return ip6addrlbl_add(net, pfx, ifal->ifal_prefixlen,
                                      ifal->ifal_index, label,
                                      nlh->nlmsg_flags & NLM_F_REPLACE);
        }

        if (nlh->nlmsg_type == RTM_DELADDRLABEL)
                return ip6addrlbl_del(net, pfx, ifal->ifal_prefixlen,
                                      ifal->ifal_index);

        return -EOPNOTSUPP;
}

/*
 * Fill in the struct ifaddrlblmsg header carried as the payload of @nlh.
 *
 * The family is always AF_INET6 and the reserved and flags fields are
 * zeroed; @prefixlen, @ifindex and @lseq are copied into the prefix
 * length, interface index and sequence fields respectively.
 */
static void ip6addrlbl_putmsg(struct nlmsghdr *nlh,
                              int prefixlen, int ifindex, u32 lseq)
{
        struct ifaddrlblmsg *ifal = nlmsg_data(nlh);

        ifal->ifal_family = AF_INET6;
        ifal->__ifal_reserved = 0;
        ifal->ifal_prefixlen = prefixlen;
        ifal->ifal_flags = 0;
        ifal->ifal_index = ifindex;
        ifal->ifal_seq = lseq;
}

/*
 * Append one address label message describing @p to @skb.
 *
 * The message consists of a netlink header built from @portid, @seq,
 * @event and @flags, an ifaddrlblmsg header (prefix length, ifindex and
 * @lseq), and the IFAL_ADDRESS and IFAL_LABEL attributes.
 *
 * Returns 0 on success. Returns -EMSGSIZE if the header cannot be added
 * or if either attribute cannot be added; in the attribute case the
 * partially built message is cancelled first.
 */
static int ip6addrlbl_fill(struct sk_buff *skb,
                           const struct ip6addrlbl_entry *p,
                           u32 lseq,
                           u32 portid, u32 seq, int event,
                           unsigned int flags)
{
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(skb, portid, seq, event,
                        sizeof(struct ifaddrlblmsg), flags);
        if (!nlh)
                return -EMSGSIZE;

        ip6addrlbl_putmsg(nlh, p->prefixlen, p->ifindex, lseq);

        if (nla_put_in6_addr(skb, IFAL_ADDRESS, &p->prefix) < 0)
                goto cancel;
        if (nla_put_u32(skb, IFAL_LABEL, p->label) < 0)
                goto cancel;

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Validate a dump request (used by ip6addrlbl_dump() when strict checking
 * is on).
 *
 * The message must be large enough to hold a struct ifaddrlblmsg, every
 * header field other than the family must be zero, and no attribute data
 * may follow the header. Each failure sets an extack message.
 *
 * Returns 0 if the request is acceptable, -EINVAL otherwise.
 */
static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh,
                                     struct netlink_ext_ack *extack)
{
        struct ifaddrlblmsg *hdr;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*hdr))) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid header for address label dump request");
                return -EINVAL;
        }

        hdr = nlmsg_data(nlh);
        if (hdr->__ifal_reserved != 0 || hdr->ifal_prefixlen != 0 ||
            hdr->ifal_flags != 0 || hdr->ifal_index != 0 ||
            hdr->ifal_seq != 0) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address label dump request");
                return -EINVAL;
        }

        if (nlmsg_attrlen(nlh, sizeof(*hdr)) != 0) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid data after header for address label dump request");
                return -EINVAL;
        }

        return 0;
}

/*
 * Dump callback registered for RTM_GETADDRLABEL.
 *
 * Walks the label list of the requesting socket's namespace under
 * rcu_read_lock() and emits one RTM_NEWADDRLABEL message (NLM_F_MULTI)
 * per entry. cb->args[0] holds the list position to start from; entries
 * before it are skipped, and on exit it is set to the position of the
 * first entry not yet emitted so that a later call can continue there.
 *
 * Returns a negative error from request validation or from
 * ip6addrlbl_fill(), otherwise the last value left in err (0 when nothing
 * failed).
 */
static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
        struct ip6addrlbl_entry *entry;
        int start = cb->args[0];
        int pos = 0;
        int err = 0;
        u32 lseq;

        if (cb->strict_check) {
                err = ip6addrlbl_valid_dump_req(nlh, cb->extack);
                if (err < 0)
                        return err;
        }

        rcu_read_lock();
        lseq = READ_ONCE(net->ipv6.ip6addrlbl_table.seq);
        hlist_for_each_entry_rcu(entry, &net->ipv6.ip6addrlbl_table.head, list) {
                /* Already delivered by an earlier pass of this dump. */
                if (pos < start) {
                        pos++;
                        continue;
                }

                err = ip6addrlbl_fill(skb, entry, lseq,
                                      NETLINK_CB(cb->skb).portid,
                                      nlh->nlmsg_seq,
                                      RTM_NEWADDRLABEL,
                                      NLM_F_MULTI);
                /* Leave pos on the failed entry so it is retried. */
                if (err < 0)
                        break;
                pos++;
        }
        rcu_read_unlock();

        cb->args[0] = pos;
        return err;
}

/*
 * Payload size needed for one address label message: the aligned
 * ifaddrlblmsg header plus a 16-byte IFAL_ADDRESS attribute and a 4-byte
 * IFAL_LABEL attribute.
 */
static inline int ip6addrlbl_msgsize(void)
{
        int size = NLMSG_ALIGN(sizeof(struct ifaddrlblmsg));

        size += nla_total_size(16);        /* IFAL_ADDRESS */
        size += nla_total_size(4);        /* IFAL_LABEL */

        return size;
}

/*
 * Validate and parse a get request into @tb.
 *
 * The message must be large enough to hold a struct ifaddrlblmsg. When
 * netlink_strict_get_check() reports that strict checking is off for
 * @skb, the attributes are parsed with nlmsg_parse_deprecated() and its
 * result is returned as is. Otherwise the reserved, flags and seq header
 * fields must be zero, the attributes are parsed with the strict variant,
 * and IFAL_ADDRESS is the only attribute allowed to be present.
 *
 * Returns 0 on success, -EINVAL (with an extack message) for a rejected
 * request, or the parser's error.
 */
static int ip6addrlbl_valid_get_req(struct sk_buff *skb,
                                    const struct nlmsghdr *nlh,
                                    struct nlattr **tb,
                                    struct netlink_ext_ack *extack)
{
        struct ifaddrlblmsg *hdr;
        int attr, err;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*hdr))) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid header for addrlabel get request");
                return -EINVAL;
        }

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(*hdr), tb,
                                              IFAL_MAX, ifal_policy, extack);

        hdr = nlmsg_data(nlh);
        if (hdr->__ifal_reserved || hdr->ifal_flags || hdr->ifal_seq) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for addrlabel get request");
                return -EINVAL;
        }

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*hdr), tb, IFAL_MAX,
                                            ifal_policy, extack);
        if (err)
                return err;

        /* Anything other than IFAL_ADDRESS is refused in strict mode. */
        for (attr = 0; attr <= IFAL_MAX; attr++) {
                if (tb[attr] && attr != IFAL_ADDRESS) {
                        NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in addrlabel get request");
                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * Doit handler registered for RTM_GETADDRLABEL: look up the label entry
 * that applies to a single address and unicast it back to the requester.
 *
 * The request must be for AF_INET6 with a prefix length of exactly 128,
 * must carry IFAL_ADDRESS, and a non-zero interface index must name an
 * existing device. The lookup and message construction run under
 * rcu_read_lock().
 *
 * Returns the result of rtnl_unicast() on success; otherwise a validation
 * error, -EINVAL, -ENOBUFS if the reply cannot be allocated, -ESRCH if no
 * entry is found, or the error from ip6addrlbl_fill(). The reply skb is
 * freed on every failure after allocation.
 */
static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[IFAL_MAX+1];
        struct ip6addrlbl_entry *entry;
        struct ifaddrlblmsg *ifal;
        struct in6_addr *addr;
        struct sk_buff *skb;
        u32 lseq;
        int err;

        err = ip6addrlbl_valid_get_req(in_skb, nlh, tb, extack);
        if (err < 0)
                return err;

        ifal = nlmsg_data(nlh);
        if (ifal->ifal_family != AF_INET6)
                return -EINVAL;
        if (ifal->ifal_prefixlen != 128)
                return -EINVAL;

        if (ifal->ifal_index &&
            !addrlbl_ifindex_exists(net, ifal->ifal_index))
                return -EINVAL;

        if (!tb[IFAL_ADDRESS])
                return -EINVAL;
        addr = nla_data(tb[IFAL_ADDRESS]);

        skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL);
        if (!skb)
                return -ENOBUFS;

        rcu_read_lock();
        entry = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
        lseq = READ_ONCE(net->ipv6.ip6addrlbl_table.seq);
        if (!entry)
                err = -ESRCH;
        else
                err = ip6addrlbl_fill(skb, entry, lseq,
                                      NETLINK_CB(in_skb).portid,
                                      nlh->nlmsg_seq,
                                      RTM_NEWADDRLABEL, 0);
        rcu_read_unlock();

        if (err >= 0)
                return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

        /* The skb was sized by ip6addrlbl_msgsize(); warn if it was too small. */
        WARN_ON(err == -EMSGSIZE);
        kfree_skb(skb);
        return err;
}

/*
 * Register the rtnetlink handlers for address labels in PF_INET6:
 * RTM_NEWADDRLABEL and RTM_DELADDRLABEL both go to ip6addrlbl_newdel(),
 * RTM_GETADDRLABEL goes to ip6addrlbl_get() with ip6addrlbl_dump() as its
 * dump callback.
 *
 * Registration stops at the first failure and that error is returned;
 * handlers registered before the failure are not unregistered here.
 * Returns the result of the last registration otherwise.
 */
int __init ipv6_addr_label_rtnl_register(void)
{
        int err;

        err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDRLABEL,
                                   ip6addrlbl_newdel,
                                   NULL, RTNL_FLAG_DOIT_UNLOCKED);
        if (err < 0)
                return err;

        err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDRLABEL,
                                   ip6addrlbl_newdel,
                                   NULL, RTNL_FLAG_DOIT_UNLOCKED);
        if (err < 0)
                return err;

        return rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDRLABEL,
                                    ip6addrlbl_get,
                                    ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED |
                                                     RTNL_FLAG_DUMP_UNLOCKED);
}





















































































































































































    4 
    4 


























































































































    1 






























    1 






    1 




    1 


















    1 









    1 







    1 










    1 















    1 






    2 





















    2 




    2 











































































































































    1 





    1 



    1 































































































































































































































































































































































































































































    3 











    3 



    4 










































































































































































































































































































































































































































































































































































































































































































    1 







    1 




    1 

















    1 









    1 




    1 





    1 

    1 







    1 









    1 


    1 






















    1 




























    1 









    1 



    1 


    1 

    1 
    1 


























































    1 









    1 


    1 



    1 





    1 




    1 














































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 

    1 













    1 

    1 


    1 





    1 















    1 
    1 
    1 















    1 

    1 

    1 














    1 
    1 










    1 




    1 



    1 


























    1 




    1 











    1 





















    1 



    1 













    1 























































































































































    4 
    2 



































































































































































    3 

    1 







    1 


































    1 

    1 

    1 








    1 





















    1 




    1 












    1 



    1 







    1 












    1 
    1 

    1 




























    1 





    1 

    1 

    1 





    1 


























































































































































































    1 






    1 
























    1 













    1 




































    1 




    1 

























































































































































































































    4 

    2 
































































































































































































































































































































































































































































    3 

    3 










































































    4 





    1 





    1 







    1 







    3 
    1 


    3 





    3 







    3 

    3 


    2 




    3 






















    3 






    4 





    4 




































































































































































    2 






    2 






    2 











    2 




































































































































































    2 





    2 



    1 





























    2 
    2 









































































































































    2 






    1 

































































































































































































































































































    2 









    2 















    1 
    3 





    4 

    1 












    1 











































    4 












    4 



    3 
    1 








    4 
















    3 







    3 
    2 














    3 








































    4 










































    4 






    3 




























    3 




    4 

























    4 






    4 



    4 










    2 
    2 

    3 




    4 



    2 
    2 






    2 



    4 










    4 
























































    4 


    4 



    3 






    1 















































    3 



    3 


    2 

















    2 





    2 

























































    1 







    1 







































































































































































































































































































   20 





   24 















































































































































































































    2 





    2 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467

// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *                Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *                (Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/kmsan.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/memory-tiers.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>

#include <trace/events/kmem.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#include "pgalloc-track.h"
#include "internal.h"
#include "swap.h"

#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

#ifndef CONFIG_NUMA
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

static vm_fault_t do_fault(struct vm_fault *vmf);
static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
static bool vmf_pte_changed(struct vm_fault *vmf);

/*
 * Tell whether the pte snapshot stored in the fault descriptor is a uffd-wp
 * pte marker (meaning the pte was write-protected).
 *
 * The answer is false unless userfaultfd_wp() accepts the faulting VMA and
 * FAULT_FLAG_ORIG_PTE_VALID is set in vmf->flags; only then is vmf->orig_pte
 * examined.
 */
static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
{
        if (userfaultfd_wp(vmf->vma) &&
            (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
                return pte_marker_uffd_wp(vmf->orig_pte);

        return false;
}

/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
                                        1;
#else
                                        2;
#endif

#ifndef arch_wants_old_prefaulted_pte
/*
 * Fallback used when the architecture does not provide its own
 * arch_wants_old_prefaulted_pte().
 *
 * Moving a PTE from 'old' to 'young' can be costly on some architectures,
 * even when the hardware does it.  Returning false here, the default,
 * means prefaulted entries will be 'young'.
 */
static inline bool arch_wants_old_prefaulted_pte(void)
{
        return false;
}
#endif

/*
 * Handler registered through __setup() for the "norandmaps" option: it
 * switches address space randomization off by forcing randomize_va_space
 * to 0.  The option's argument string @s is not looked at.
 *
 * NOTE(review): the return value of 1 presumably tells the __setup()
 * machinery that the option was handled -- confirm against its docs.
 */
static int __init disable_randmaps(char *s)
{
        randomize_va_space = 0;
        return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

unsigned long highest_memmap_pfn __read_mostly;

/*
 * Record the page frame number of ZERO_PAGE(0) in zero_pfn.  Registered
 * with early_initcall(); always returns 0.
 *
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
        zero_pfn = page_to_pfn(ZERO_PAGE(0));
        return 0;
}
early_initcall(init_zero_pfn);

/*
 * Non-inline wrapper that passes @mm and @member straight through to
 * trace_rss_stat().
 *
 * NOTE(review): presumably provided so that callers do not have to pull in
 * the trace event header themselves -- confirm against the callers.
 */
void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
        trace_rss_stat(mm, member);
}

/*
 * Unhook the PTE table referenced by *pmd and queue it on @tlb for freeing,
 * then drop the mm's PTE-table count.
 *
 * Note: the pages that were mapped through this table are not freed here.
 * That has been handled earlier, when all the memory regions were unmapped.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                           unsigned long addr)
{
        /* Fetch the table before the entry pointing at it is wiped. */
        pgtable_t pte_table = pmd_pgtable(*pmd);

        pmd_clear(pmd);
        pte_free_tlb(tlb, pte_table, addr);
        mm_dec_nr_ptes(tlb->mm);
}

/*
 * Free the PTE tables hanging off the PMD entries that cover [addr, end),
 * then free the PMD table itself when the floor/ceiling tests permit.
 *
 * The PMD table is only released if its PUD-aligned start is not below
 * @floor and @end does not reach past the PUD-aligned @ceiling.  As with
 * the other levels, a @ceiling of 0 stands for the top of the address
 * space, which is why the comparison is done on "- 1" values.
 */
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        unsigned long base = addr & PUD_MASK;
        unsigned long next;
        pmd_t *pmd = pmd_offset(pud, addr);

        for (;;) {
                next = pmd_addr_end(addr, end);
                /* Entries rejected by pmd_none_or_clear_bad() are skipped. */
                if (!pmd_none_or_clear_bad(pmd))
                        free_pte_range(tlb, pmd, addr);

                pmd++;
                addr = next;
                if (addr == end)
                        break;
        }

        /* Leave the PMD table alone unless it fits within floor/ceiling. */
        if (base < floor)
                return;
        if (ceiling) {
                ceiling &= PUD_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pmd = pmd_offset(pud, base);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd, base);
        mm_dec_nr_pmds(tlb->mm);
}

/*
 * Run free_pmd_range() over the PUD entries that cover [addr, end), then
 * free the PUD table itself when the floor/ceiling tests permit.
 *
 * The PUD table is only released if its P4D-aligned start is not below
 * @floor and @end does not reach past the P4D-aligned @ceiling.  A
 * @ceiling of 0 stands for the top of the address space, which is why the
 * comparison is done on "- 1" values.
 */
static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        unsigned long base = addr & P4D_MASK;
        unsigned long next;
        pud_t *pud = pud_offset(p4d, addr);

        for (;;) {
                next = pud_addr_end(addr, end);
                /* Entries rejected by pud_none_or_clear_bad() are skipped. */
                if (!pud_none_or_clear_bad(pud))
                        free_pmd_range(tlb, pud, addr, next, floor, ceiling);

                pud++;
                addr = next;
                if (addr == end)
                        break;
        }

        /* Leave the PUD table alone unless it fits within floor/ceiling. */
        if (base < floor)
                return;
        if (ceiling) {
                ceiling &= P4D_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pud = pud_offset(p4d, base);
        p4d_clear(p4d);
        pud_free_tlb(tlb, pud, base);
        mm_dec_nr_puds(tlb->mm);
}

/*
 * Run free_pud_range() over the P4D entries that cover [addr, end), then
 * free the P4D table itself when the floor/ceiling tests permit.
 *
 * The P4D table is only released if its PGDIR-aligned start is not below
 * @floor and @end does not reach past the PGDIR-aligned @ceiling.  A
 * @ceiling of 0 stands for the top of the address space, which is why the
 * comparison is done on "- 1" values.
 */
static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        unsigned long base = addr & PGDIR_MASK;
        unsigned long next;
        p4d_t *p4d = p4d_offset(pgd, addr);

        for (;;) {
                next = p4d_addr_end(addr, end);
                /* Entries rejected by p4d_none_or_clear_bad() are skipped. */
                if (!p4d_none_or_clear_bad(p4d))
                        free_pud_range(tlb, p4d, addr, next, floor, ceiling);

                p4d++;
                addr = next;
                if (addr == end)
                        break;
        }

        /* Leave the P4D table alone unless it fits within floor/ceiling. */
        if (base < floor)
                return;
        if (ceiling) {
                ceiling &= PGDIR_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        p4d = p4d_offset(pgd, base);
        pgd_clear(pgd);
        p4d_free_tlb(tlb, p4d, base);
}

/*
 * This function frees user-level page tables of a process.
 *
 * @tlb:     mmu_gather the freed tables are queued on; tlb->mm supplies the
 *           page tables that are walked.
 * @addr:    start of the range whose page tables are to be freed.
 * @end:     end of that range (0 refers to the top of the address space).
 * @floor:   lower limit used by the tests that decide whether a table may
 *           be freed (0 refers to the bottom of the address space).
 * @ceiling: upper limit used by the same tests (0 refers to the top of the
 *           address space).
 *
 * The range is first trimmed to PMD granularity against @floor and
 * @ceiling; if nothing remains the function returns without touching the
 * page tables.  Otherwise each pgd entry covering the trimmed range is
 * handed to free_p4d_range().
 */
void free_pgd_range(struct mmu_gather *tlb,
                        unsigned long addr, unsigned long end,
                        unsigned long floor, unsigned long ceiling)
{
        pgd_t *pgd;
        unsigned long next;

        /*
         * The next few lines have given us lots of grief...
         *
         * Why are we testing PMD* at this top level?  Because often
         * there will be no work to do at all, and we'd prefer not to
         * go all the way down to the bottom just to discover that.
         *
         * Why all these "- 1"s?  Because 0 represents both the bottom
         * of the address space and the top of it (using -1 for the
         * top wouldn't help much: the masks would do the wrong thing).
         * The rule is that addr 0 and floor 0 refer to the bottom of
         * the address space, but end 0 and ceiling 0 refer to the top.
         * Comparisons need to use "end - 1" and "ceiling - 1" (though
         * that end 0 case should be mythical).
         *
         * Wherever addr is brought up or ceiling brought down, we must
         * be careful to reject "the opposite 0" before it confuses the
         * subsequent tests.  But what about where end is brought down
         * by PMD_SIZE below? no, end can't go down to 0 there.
         *
         * Whereas we round start (addr) and ceiling down, by different
         * masks at different levels, in order to test whether a table
         * now has no other vmas using it, so can be freed, we don't
         * bother to round floor or end up - the tests don't need that.
         */

        /* Round addr down to a PMD boundary; step up one PMD if below floor. */
        addr &= PMD_MASK;
        if (addr < floor) {
                addr += PMD_SIZE;
                /* addr wrapped around to 0: nothing left to free. */
                if (!addr)
                        return;
        }
        /* A non-zero ceiling is rounded down; reaching 0 means no room. */
        if (ceiling) {
                ceiling &= PMD_MASK;
                if (!ceiling)
                        return;
        }
        /* Pull end down by one PMD when it reaches past the ceiling. */
        if (end - 1 > ceiling - 1)
                end -= PMD_SIZE;
        /* The trimmed range is empty. */
        if (addr > end - 1)
                return;
        /*
         * We add page table cache pages with PAGE_SIZE,
         * (see pte_free_tlb()), flush the tlb if we need
         */
        tlb_change_page_size(tlb, PAGE_SIZE);
        pgd = pgd_offset(tlb->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                /* Entries rejected by pgd_none_or_clear_bad() are skipped. */
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
        } while (pgd++, addr = next, addr != end);
}

/*
 * Unlink a run of vmas from rmap and from their file mappings, and free the
 * page tables behind them.
 *
 * Starting at @vma, successors are fetched from @mas with mas_find(), bounded
 * by @ceiling - 1, until none is left.  @floor is handed down unchanged; the
 * ceiling handed down is the start of the following vma when there is one,
 * and @ceiling otherwise.  When @mm_wr_locked is true, every vma is passed to
 * vma_start_write() before being unlinked.
 */
void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                   struct vm_area_struct *vma, unsigned long floor,
                   unsigned long ceiling, bool mm_wr_locked)
{
        /*
         * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
         * be 0.  The subtraction then underflows, and that is okay.
         */
        const unsigned long search_max = ceiling - 1;

        do {
                unsigned long start = vma->vm_start;
                struct vm_area_struct *succ;
                unsigned long limit;

                succ = mas_find(mas, search_max);
                if (unlikely(xa_is_zero(succ)))
                        succ = NULL;

                /*
                 * Hide vma from rmap and truncate_pagecache before freeing
                 * pgtables
                 */
                if (mm_wr_locked)
                        vma_start_write(vma);
                unlink_anon_vmas(vma);
                unlink_file_vma(vma);

                if (is_vm_hugetlb_page(vma)) {
                        limit = succ ? succ->vm_start : ceiling;
                        hugetlb_free_pgd_range(tlb, start, vma->vm_end,
                                               floor, limit);
                        vma = succ;
                        continue;
                }

                /*
                 * Optimization: gather nearby vmas into one call down
                 */
                for (;;) {
                        if (!succ)
                                break;
                        if (succ->vm_start > vma->vm_end + PMD_SIZE)
                                break;
                        if (is_vm_hugetlb_page(succ))
                                break;

                        vma = succ;
                        succ = mas_find(mas, search_max);
                        if (unlikely(xa_is_zero(succ)))
                                succ = NULL;
                        if (mm_wr_locked)
                                vma_start_write(vma);
                        unlink_anon_vmas(vma);
                        unlink_file_vma(vma);
                }

                limit = succ ? succ->vm_start : ceiling;
                free_pgd_range(tlb, start, vma->vm_end, floor, limit);
                vma = succ;
        } while (vma);
}

/*
 * Install the pte table *@pte into @pmd unless the entry is already
 * populated.
 *
 * Runs under pmd_lock().  When the table is installed, the page-table count
 * of @mm is incremented and *@pte is set to NULL; otherwise *@pte is left
 * untouched so the caller can tell the table was not consumed.
 */
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
{
        spinlock_t *lock = pmd_lock(mm, pmd);

        /* Has another populated it ? */
        if (unlikely(!pmd_none(*pmd)))
                goto unlock;

        mm_inc_nr_ptes(mm);
        /*
         * Ensure all pte setup (eg. pte page lock and page clearing) are
         * visible before the pte is made visible to other CPUs by being
         * put into page tables.
         *
         * The other side of the story is the pointer chasing in the page
         * table walking code (when walking the page table without locking;
         * ie. most of the time). Fortunately, these data accesses consist
         * of a chain of data-dependent loads, meaning most CPUs (alpha
         * being the notable exception) will already guarantee loads are
         * seen in-order. See the alpha page table accessors for the
         * smp_rmb() barriers in page table walking code.
         */
        smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
        pmd_populate(mm, pmd, *pte);
        *pte = NULL;

unlock:
        spin_unlock(lock);
}

/*
 * Allocate a pte table for @mm and try to install it into @pmd.
 *
 * Returns -ENOMEM when the table cannot be allocated, 0 otherwise.  A table
 * that pmd_install() did not consume is freed again.
 */
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
        pgtable_t table;

        table = pte_alloc_one(mm);
        if (!table)
                return -ENOMEM;

        pmd_install(mm, pmd, &table);

        /* Still set: pmd_install() found the pmd already populated. */
        if (table)
                pte_free(mm, table);

        return 0;
}

/*
 * Allocate a kernel pte table and install it into @pmd of init_mm, under
 * init_mm.page_table_lock.
 *
 * Returns -ENOMEM when the table cannot be allocated, 0 otherwise.  If the
 * pmd turns out to be populated already, the fresh table is freed.
 */
int __pte_alloc_kernel(pmd_t *pmd)
{
        pte_t *table = pte_alloc_one_kernel(&init_mm);
        bool installed = false;

        if (!table)
                return -ENOMEM;

        spin_lock(&init_mm.page_table_lock);
        /* Has another populated it ? */
        if (likely(pmd_none(*pmd))) {
                smp_wmb(); /* See comment in pmd_install() */
                pmd_populate_kernel(&init_mm, pmd, table);
                installed = true;
        }
        spin_unlock(&init_mm.page_table_lock);

        if (!installed)
                pte_free_kernel(&init_mm, table);

        return 0;
}

/* Zero all NR_MM_COUNTERS entries of the rss accumulator @rss. */
static inline void init_rss_vec(int *rss)
{
        memset(rss, 0, NR_MM_COUNTERS * sizeof(*rss));
}

/*
 * Fold the per-counter deltas accumulated in @rss into the counters of @mm.
 * Zero deltas are skipped.
 */
static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
        int member;

        for (member = 0; member < NR_MM_COUNTERS; member++) {
                int delta = rss[member];

                if (!delta)
                        continue;
                add_mm_counter(mm, member, delta);
        }
}

/*
 * Report a bad pte found at @addr in @vma, for example a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * Reports are rate limited.  @page is dumped as well when it is non-NULL,
 * and the kernel is tainted with TAINT_BAD_PAGE.
 *
 * This only reports: the calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
                          pte_t pte, struct page *page)
{
        /* Rate-limit state, kept across calls. */
        static unsigned long resume;
        static unsigned long nr_shown;
        static unsigned long nr_unshown;

        pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
        p4d_t *p4d = p4d_offset(pgd, addr);
        pud_t *pud = pud_offset(p4d, addr);
        pmd_t *pmd = pmd_offset(pud, addr);
        struct address_space *mapping = NULL;
        pgoff_t index;

        /*
         * Allow a burst of 60 reports, then keep quiet for that minute;
         * or allow a steady drip of one report per second.
         */
        if (nr_shown == 60 && time_before(jiffies, resume)) {
                nr_unshown++;
                return;
        }
        if (nr_shown == 60) {
                /* The quiet period is over: say what was dropped, restart. */
                if (nr_unshown) {
                        pr_alert("BUG: Bad page map: %lu messages suppressed\n",
                                 nr_unshown);
                        nr_unshown = 0;
                }
                nr_shown = 0;
        }
        if (!nr_shown)
                resume = jiffies + 60 * HZ;
        nr_shown++;

        if (vma->vm_file)
                mapping = vma->vm_file->f_mapping;
        index = linear_page_index(vma, addr);

        pr_alert("BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
                 current->comm,
                 (long long)pte_val(pte), (long long)pmd_val(*pmd));
        if (page)
                dump_page(page, "bad pte");
        pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
                 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
        pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
                 vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->fault : NULL,
                 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
                 mapping ? mapping->a_ops->read_folio : NULL);
        dump_stack();
        add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *        pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                            pte_t pte)
{
        unsigned long pfn = pte_pfn(pte);

        if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
                if (unlikely(pte_special(pte))) {
                        if (vma->vm_ops && vma->vm_ops->find_special_page)
                                return vma->vm_ops->find_special_page(vma,
                                                                      addr);
                        if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
                                return NULL;
                        if (is_zero_pfn(pfn))
                                return NULL;
                        if (pte_devmap(pte)) {
                                /*
                                 * NOTE: New users of ZONE_DEVICE will not set
                                 * pte_devmap() and will have refcounts
                                 * incremented on their struct pages when they
                                 * are inserted into PTEs, thus they are safe
                                 * to return here. Legacy ZONE_DEVICE pages
                                 * that set pte_devmap() do not have refcounts.
                                 * Example of legacy ZONE_DEVICE is
                                 * MEMORY_DEVICE_FS_DAX type in pmem or
                                 * virtio_fs drivers.
                                 */
                                return NULL;
                        }

                        /* A special pte that nothing above accounts for. */
                        print_bad_pte(vma, addr, pte, NULL);
                        return NULL;
                }
                goto check_pfn;
        }

        /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */

        if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
                unsigned long off;

                /* VM_MIXEDMAP: a valid pfn is returned without more checks. */
                if (vma->vm_flags & VM_MIXEDMAP)
                        return pfn_valid(pfn) ? pfn_to_page(pfn) : NULL;

                off = (addr - vma->vm_start) >> PAGE_SHIFT;
                if (pfn == vma->vm_pgoff + off ||
                    !is_cow_mapping(vma->vm_flags))
                        return NULL;
        }

        if (is_zero_pfn(pfn))
                return NULL;

check_pfn:
        if (unlikely(pfn > highest_memmap_pfn)) {
                print_bad_pte(vma, addr, pte, NULL);
                return NULL;
        }

        /*
         * NOTE! We still have PageReserved() pages in the page tables.
         * eg. VDSO mappings can cause them to exist.
         */
        return pfn_to_page(pfn);
}

/*
 * Folio flavour of vm_normal_page(): returns the folio of the page that
 * vm_normal_page() reports for @pte, or NULL when it reports no page.
 */
struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
                            pte_t pte)
{
        struct page *page = vm_normal_page(vma, addr, pte);

        return page ? page_folio(page) : NULL;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * pmd counterpart of vm_normal_page(): returns the struct page mapped by
 * @pmd at @addr in @vma, or NULL when the mapping is not treated as normal.
 */
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
                                pmd_t pmd)
{
        unsigned long pfn = pmd_pfn(pmd);

        /*
         * There is no pmd_special() but there may be special pmds, e.g.
         * in a direct-access (dax) mapping, so let's just replicate the
         * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
         */
        if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
                unsigned long off;

                /* VM_MIXEDMAP: a valid pfn is returned without more checks. */
                if (vma->vm_flags & VM_MIXEDMAP)
                        return pfn_valid(pfn) ? pfn_to_page(pfn) : NULL;

                off = (addr - vma->vm_start) >> PAGE_SHIFT;
                if (pfn == vma->vm_pgoff + off ||
                    !is_cow_mapping(vma->vm_flags))
                        return NULL;
        }

        if (pmd_devmap(pmd) || is_huge_zero_pmd(pmd) ||
            unlikely(pfn > highest_memmap_pfn))
                return NULL;

        /*
         * NOTE! We still have PageReserved() pages in the page tables.
         * eg. VDSO mappings can cause them to exist.
         */
        return pfn_to_page(pfn);
}

/*
 * Folio flavour of vm_normal_page_pmd(): returns the folio of the page that
 * vm_normal_page_pmd() reports for @pmd, or NULL when it reports no page.
 */
struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
                                  unsigned long addr, pmd_t pmd)
{
        struct page *page = vm_normal_page_pmd(vma, addr, pmd);

        return page ? page_folio(page) : NULL;
}
#endif

/*
 * Replace the device-exclusive swap entry stored at @ptep with a present pte
 * mapping @page at @address in @vma.
 *
 * The soft-dirty and uffd-wp bits of the old entry are carried over; without
 * uffd-wp, a writable exclusive entry yields a dirty pte that is made
 * writable when maybe_mkwrite() allows it for @vma.
 */
static void restore_exclusive_pte(struct vm_area_struct *vma,
                                  struct page *page, unsigned long address,
                                  pte_t *ptep)
{
        struct folio *folio = page_folio(page);
        pte_t old_pte = ptep_get(ptep);
        swp_entry_t entry = pte_to_swp_entry(old_pte);
        pte_t new_pte;

        new_pte = mk_pte(page, READ_ONCE(vma->vm_page_prot));
        new_pte = pte_mkold(new_pte);
        if (pte_swp_soft_dirty(old_pte))
                new_pte = pte_mksoft_dirty(new_pte);

        if (pte_swp_uffd_wp(old_pte)) {
                new_pte = pte_mkuffd_wp(new_pte);
        } else if (is_writable_device_exclusive_entry(entry)) {
                new_pte = pte_mkdirty(new_pte);
                new_pte = maybe_mkwrite(new_pte, vma);
        }

        VM_BUG_ON_FOLIO(pte_write(new_pte) && (!folio_test_anon(folio) &&
                                               PageAnonExclusive(page)), folio);

        /*
         * No need to take a page reference as one was already
         * created when the swap entry was made.
         */
        if (!folio_test_anon(folio)) {
                /*
                 * Currently device exclusive access only supports anonymous
                 * memory so the entry shouldn't point to a filebacked page.
                 */
                WARN_ON_ONCE(1);
        } else {
                folio_add_anon_rmap_pte(folio, page, vma, address, RMAP_NONE);
        }

        set_pte_at(vma->vm_mm, address, ptep, new_pte);

        /*
         * No need to invalidate - it was non-present before. However
         * secondary CPUs may have mappings that need invalidating.
         */
        update_mmu_cache(vma, address, ptep);
}

/*
 * Restore the device-exclusive entry at @src_pte to a present pte, provided
 * the page lock can be taken without sleeping.
 *
 * Returns 0 when the pte was restored and -EBUSY when trylock_page() failed.
 */
static int
try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
                        unsigned long addr)
{
        swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte));
        struct page *page = pfn_swap_entry_to_page(entry);

        if (!trylock_page(page))
                return -EBUSY;

        restore_exclusive_pte(vma, page, addr, src_pte);
        unlock_page(page);

        return 0;
}

/*
 * Copy one non-present pte (one that decodes to a swp_entry_t) from the
 * source page table to the destination page table.
 *
 * The entry kinds handled are: regular swap entries, migration entries,
 * device-private entries, device-exclusive entries and pte markers.  In
 * several cases the source pte is rewritten too (exclusive bit cleared, or a
 * writable entry downgraded to a readable one for COW mappings).
 *
 * @rss is the caller's per-counter accumulator; the matching counter is
 * bumped for swap, migration and device-private entries.
 *
 * Return values, as produced below:
 *   0       - the entry has been dealt with
 *   -EIO    - swap_duplicate() failed; copy_pte_range() responds by calling
 *             add_swap_count_continuation() and retrying
 *   -EBUSY  - a device-exclusive entry could not be restored
 *   -ENOENT - a device-exclusive entry was restored; the source pte is
 *             expected to be present now and must be copied as such
 *
 * NOTE(review): the return type is unsigned long although negative errno
 * values are returned; the caller in this file stores the result in an int.
 */

static unsigned long
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
        unsigned long vm_flags = dst_vma->vm_flags;
        pte_t orig_pte = ptep_get(src_pte);
        /* pte is what ends up in the destination; it starts as a plain copy. */
        pte_t pte = orig_pte;
        struct folio *folio;
        struct page *page;
        swp_entry_t entry = pte_to_swp_entry(orig_pte);

        if (likely(!non_swap_entry(entry))) {
                /* Regular swap entry. */
                if (swap_duplicate(entry) < 0)
                        return -EIO;

                /* make sure dst_mm is on swapoff's mmlist. */
                if (unlikely(list_empty(&dst_mm->mmlist))) {
                        spin_lock(&mmlist_lock);
                        /* Re-check under the lock before linking. */
                        if (list_empty(&dst_mm->mmlist))
                                list_add(&dst_mm->mmlist,
                                                &src_mm->mmlist);
                        spin_unlock(&mmlist_lock);
                }
                /* Mark the swap entry as shared. */
                if (pte_swp_exclusive(orig_pte)) {
                        pte = pte_swp_clear_exclusive(orig_pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
                rss[MM_SWAPENTS]++;
        } else if (is_migration_entry(entry)) {
                folio = pfn_swap_entry_folio(entry);

                rss[mm_counter(folio)]++;

                if (!is_readable_migration_entry(entry) &&
                                is_cow_mapping(vm_flags)) {
                        /*
                         * COW mappings require pages in both parent and child
                         * to be set to read. A previously exclusive entry is
                         * now shared.
                         */
                        entry = make_readable_migration_entry(
                                                        swp_offset(entry));
                        pte = swp_entry_to_pte(entry);
                        /* Carry soft-dirty and uffd-wp over to the new entry. */
                        if (pte_swp_soft_dirty(orig_pte))
                                pte = pte_swp_mksoft_dirty(pte);
                        if (pte_swp_uffd_wp(orig_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
        } else if (is_device_private_entry(entry)) {
                page = pfn_swap_entry_to_page(entry);
                folio = page_folio(page);

                /*
                 * Update rss count even for unaddressable pages, as
                 * they should treated just like normal pages in this
                 * respect.
                 *
                 * We will likely want to have some new rss counters
                 * for unaddressable pages, at some point. But for now
                 * keep things as they are.
                 */
                folio_get(folio);
                rss[mm_counter(folio)]++;
                /* Cannot fail as these pages cannot get pinned. */
                folio_try_dup_anon_rmap_pte(folio, page, src_vma);

                /*
                 * We do not preserve soft-dirty information, because so
                 * far, checkpoint/restore is the only feature that
                 * requires that. And checkpoint/restore does not work
                 * when a device driver is involved (you cannot easily
                 * save and restore device driver state).
                 */
                if (is_writable_device_private_entry(entry) &&
                    is_cow_mapping(vm_flags)) {
                        /* Downgrade to a readable entry in the source too. */
                        entry = make_readable_device_private_entry(
                                                        swp_offset(entry));
                        pte = swp_entry_to_pte(entry);
                        if (pte_swp_uffd_wp(orig_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
        } else if (is_device_exclusive_entry(entry)) {
                /*
                 * Make device exclusive entries present by restoring the
                 * original entry then copying as for a present pte. Device
                 * exclusive entries currently only support private writable
                 * (ie. COW) mappings.
                 */
                VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
                if (try_restore_exclusive_pte(src_pte, src_vma, addr))
                        return -EBUSY;
                /* Restored: nothing is written to dst_pte on this path. */
                return -ENOENT;
        } else if (is_pte_marker_entry(entry)) {
                pte_marker marker = copy_pte_marker(entry, dst_vma);

                /* Install a marker only if copy_pte_marker() returned one. */
                if (marker)
                        set_pte_at(dst_mm, addr, dst_pte,
                                   make_pte_marker(marker));
                return 0;
        }
        /* Drop the uffd-wp bit when the destination vma is not uffd-wp. */
        if (!userfaultfd_wp(dst_vma))
                pte = pte_swp_clear_uffd_wp(pte);
        set_pte_at(dst_mm, addr, dst_pte, pte);
        return 0;
}

/*
 * Give the child its own copy of a present, normal page.
 *
 * NOTE! This is not what usually happens: in the common case the caller
 * simply takes another reference on the page and reuses the pte.
 *
 * The copy goes into the folio preallocated in *@prealloc.  When there is
 * none, -EAGAIN is returned so that the preallocation code can provide one
 * outside the page table lock.  On success *@prealloc has been consumed
 * (set to NULL), the new pte is installed at @dst_pte and 0 is returned.
 */
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
                  struct folio **prealloc, struct page *page)
{
        struct folio *copy = *prealloc;
        pte_t entry;

        if (!copy)
                return -EAGAIN;

        /*
         * We have a prealloc page, all good!  Take it
         * over and copy the page & arm it.
         */
        *prealloc = NULL;
        copy_user_highpage(&copy->page, page, addr, src_vma);
        __folio_mark_uptodate(copy);
        folio_add_new_anon_rmap(copy, dst_vma, addr);
        folio_add_lru_vma(copy, dst_vma);
        rss[MM_ANONPAGES] += 1;

        /* All done, just insert the new page copy in the child */
        entry = mk_pte(&copy->page, dst_vma->vm_page_prot);
        entry = pte_mkdirty(entry);
        entry = maybe_mkwrite(entry, dst_vma);
        if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) {
                /* Uffd-wp needs to be delivered to dest pte as well */
                entry = pte_mkuffd_wp(entry);
        }
        set_pte_at(dst_vma->vm_mm, addr, dst_pte, entry);

        return 0;
}

/*
 * Write @nr destination ptes starting at @dst_pte / @addr, derived from the
 * source pte value @pte.
 *
 * For a writable pte in a COW mapping the @nr source ptes are
 * write-protected as well.  The destination value is additionally made
 * clean for shared mappings, always made old, and loses its uffd-wp bit
 * unless @dst_vma has userfaultfd write-protect enabled.
 */
static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte,
                pte_t pte, unsigned long addr, int nr)
{
        bool cow_writable = is_cow_mapping(src_vma->vm_flags) && pte_write(pte);

        /* If it's a COW mapping, write protect it both processes. */
        if (cow_writable) {
                wrprotect_ptes(src_vma->vm_mm, addr, src_pte, nr);
                pte = pte_wrprotect(pte);
        }

        /* If it's a shared mapping, mark it clean in the child. */
        if (src_vma->vm_flags & VM_SHARED)
                pte = pte_mkclean(pte);

        pte = pte_mkold(pte);
        if (!userfaultfd_wp(dst_vma))
                pte = pte_clear_uffd_wp(pte);

        set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
}

/*
 * Copy one present PTE, trying to batch-process subsequent PTEs that map
 * consecutive pages of the same folio by copying them as well.
 *
 * @pte is the value already read from @src_pte, @max_nr bounds the size of a
 * batch, @rss is the caller's counter accumulator and @prealloc points to an
 * optional preallocated folio that copy_present_page() may consume.
 *
 * Returns -EAGAIN if one preallocated page is required to copy the next PTE.
 * Otherwise, returns the number of copied PTEs (at least 1).
 */
static inline int
copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                 pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
                 int max_nr, int *rss, struct folio **prealloc)
{
        struct page *page;
        struct folio *folio;
        bool any_writable;
        fpb_t flags = 0;
        int err, nr;

        page = vm_normal_page(src_vma, addr, pte);
        /* No normal page: copy the pte alone, with no refcount, rmap or rss. */
        if (unlikely(!page))
                goto copy_pte;

        folio = page_folio(page);

        /*
         * If we likely have to copy, just don't bother with batching. Make
         * sure that the common "small folio" case is as fast as possible
         * by keeping the batching logic separate.
         */
        if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
                if (src_vma->vm_flags & VM_SHARED)
                        flags |= FPB_IGNORE_DIRTY;
                if (!vma_soft_dirty_enabled(src_vma))
                        flags |= FPB_IGNORE_SOFT_DIRTY;

                nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
                                     &any_writable, NULL, NULL);
                /* One folio reference per pte in the batch. */
                folio_ref_add(folio, nr);
                if (folio_test_anon(folio)) {
                        if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
                                                                  nr, src_vma))) {
                                /* Undo the references taken just above. */
                                folio_ref_sub(folio, nr);
                                return -EAGAIN;
                        }
                        rss[MM_ANONPAGES] += nr;
                        VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
                } else {
                        folio_dup_file_rmap_ptes(folio, page, nr);
                        rss[mm_counter_file(folio)] += nr;
                }
                /*
                 * any_writable was filled in by folio_pte_batch(); when set,
                 * the pte is made writable before __copy_present_ptes()
                 * applies its own write-protect handling.
                 */
                if (any_writable)
                        pte = pte_mkwrite(pte, src_vma);
                __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
                                    addr, nr);
                return nr;
        }

        /* Single-pte path. */
        folio_get(folio);
        if (folio_test_anon(folio)) {
                /*
                 * If this page may have been pinned by the parent process,
                 * copy the page immediately for the child so that we'll always
                 * guarantee the pinned page won't be randomly replaced in the
                 * future.
                 */
                if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
                        /* Page may be pinned, we have to copy. */
                        folio_put(folio);
                        err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
                                                addr, rss, prealloc, page);
                        /* 0 from copy_present_page() means one pte was copied. */
                        return err ? err : 1;
                }
                rss[MM_ANONPAGES]++;
                VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
        } else {
                folio_dup_file_rmap_pte(folio, page);
                rss[mm_counter_file(folio)]++;
        }

copy_pte:
        __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1);
        return 1;
}

/*
 * Allocate an order-0 folio for @addr in @vma and charge it to the memory
 * cgroup of @src_mm.
 *
 * With @need_zero the folio comes from vma_alloc_zeroed_movable_folio();
 * otherwise from vma_alloc_folio() with GFP_HIGHUSER_MOVABLE.  Returns the
 * folio, or NULL when the allocation or the charge fails (the folio is
 * released again in the latter case).
 */
static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
                struct vm_area_struct *vma, unsigned long addr, bool need_zero)
{
        struct folio *folio;

        folio = need_zero ?
                vma_alloc_zeroed_movable_folio(vma, addr) :
                vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
        if (!folio)
                return NULL;

        if (mem_cgroup_charge(folio, src_mm, GFP_KERNEL) != 0) {
                folio_put(folio);
                return NULL;
        }

        folio_throttle_swaprate(folio, GFP_KERNEL);
        return folio;
}

/*
 * Copy the ptes for [addr, end) below one pmd from src_vma's page table
 * into dst_vma's page table.
 *
 * The destination pte lock and the source pte lock are both held while
 * entries are copied.  The copy loop is left, with both locks dropped, when
 * enough progress has been made and a reschedule or lock break is wanted,
 * or when an entry needs work that is done here outside the locks (a swap
 * count continuation, or preallocating a folio).  Unless an error is
 * returned, the function then starts over at "again" from the current addr.
 *
 * Returns 0 on success (also when the source pte table cannot be mapped),
 * -ENOMEM when an allocation fails, or -EBUSY when copy_nonpresent_pte()
 * reported -EBUSY.
 */
static int
copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        pte_t *orig_src_pte, *orig_dst_pte;
        pte_t *src_pte, *dst_pte;
        pte_t ptent;
        spinlock_t *src_ptl, *dst_ptl;
        int progress, max_nr, ret = 0;
        int rss[NR_MM_COUNTERS];
        /* Swap entry whose duplication failed; consumed after unlocking. */
        swp_entry_t entry = (swp_entry_t){0};
        /* Folio allocated outside the locks for copy_present_page(). */
        struct folio *prealloc = NULL;
        int nr;

again:
        progress = 0;
        init_rss_vec(rss);

        /*
         * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
         * error handling here, assume that exclusive mmap_lock on dst and src
         * protects anon from unexpected THP transitions; with shmem and file
         * protected by mmap_lock-less collapse skipping areas with anon_vma
         * (whereas vma_needs_copy() skips areas without anon_vma).  A rework
         * can remove such assumptions later, but this is good enough for now.
         */
        dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
        if (!dst_pte) {
                ret = -ENOMEM;
                goto out;
        }
        src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
        if (!src_pte) {
                pte_unmap_unlock(dst_pte, dst_ptl);
                /* ret == 0 */
                goto out;
        }
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
        /* Remember the mapped bases; the cursors below are advanced. */
        orig_src_pte = src_pte;
        orig_dst_pte = dst_pte;
        arch_enter_lazy_mmu_mode();

        do {
                /* Default step; "continue" paths advance by one pte. */
                nr = 1;

                /*
                 * We are holding two locks at this point - either of them
                 * could generate latencies in another task on another CPU.
                 */
                if (progress >= 32) {
                        progress = 0;
                        if (need_resched() ||
                            spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
                                break;
                }
                ptent = ptep_get(src_pte);
                if (pte_none(ptent)) {
                        progress++;
                        continue;
                }
                if (unlikely(!pte_present(ptent))) {
                        ret = copy_nonpresent_pte(dst_mm, src_mm,
                                                  dst_pte, src_pte,
                                                  dst_vma, src_vma,
                                                  addr, rss);
                        if (ret == -EIO) {
                                /* Keep the entry for the handling after unlock. */
                                entry = pte_to_swp_entry(ptep_get(src_pte));
                                break;
                        } else if (ret == -EBUSY) {
                                break;
                        } else if (!ret) {
                                progress += 8;
                                continue;
                        }
                        ptent = ptep_get(src_pte);
                        VM_WARN_ON_ONCE(!pte_present(ptent));

                        /*
                         * Device exclusive entry restored, continue by copying
                         * the now present pte.
                         */
                        WARN_ON_ONCE(ret != -ENOENT);
                }
                /* copy_present_ptes() will clear `*prealloc' if consumed */
                max_nr = (end - addr) / PAGE_SIZE;
                ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
                                        ptent, addr, max_nr, rss, &prealloc);
                /*
                 * If we need a pre-allocated page for this pte, drop the
                 * locks, allocate, and try again.
                 */
                if (unlikely(ret == -EAGAIN))
                        break;
                if (unlikely(prealloc)) {
                        /*
                         * pre-alloc page cannot be reused by next time so as
                         * to strictly follow mempolicy (e.g., alloc_page_vma()
                         * will allocate page according to address).  This
                         * could only happen if one pinned pte changed.
                         */
                        folio_put(prealloc);
                        prealloc = NULL;
                }
                /* ret is the number of ptes copy_present_ptes() copied. */
                nr = ret;
                progress += 8 * nr;
        } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
                 addr != end);

        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(orig_src_pte, src_ptl);
        add_mm_rss_vec(dst_mm, rss);
        pte_unmap_unlock(orig_dst_pte, dst_ptl);
        cond_resched();

        /* Both locks are dropped: resolve whatever made the loop stop. */
        if (ret == -EIO) {
                VM_WARN_ON_ONCE(!entry.val);
                if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
                        ret = -ENOMEM;
                        goto out;
                }
                entry.val = 0;
        } else if (ret == -EBUSY) {
                goto out;
        } else if (ret ==  -EAGAIN) {
                prealloc = folio_prealloc(src_mm, src_vma, addr, false);
                /* prealloc is NULL here, so there is nothing to release. */
                if (!prealloc)
                        return -ENOMEM;
        } else if (ret < 0) {
                VM_WARN_ON_ONCE(1);
        }

        /* We've captured and resolved the error. Reset, try again. */
        ret = 0;

        if (addr != end)
                goto again;
out:
        /* Release a preallocated folio that was never consumed. */
        if (unlikely(prealloc))
                folio_put(prealloc);
        return ret;
}

/*
 * Copy the PMD entries that cover [addr, end) below @src_pud into the PMD
 * table below @dst_pud, allocating the destination PMD table if needed.
 *
 * Swap, transparent-huge and devmap PMDs are handed to copy_huge_pmd();
 * everything else is copied one PTE table at a time by copy_pte_range().
 *
 * Returns 0 on success, or -ENOMEM if the PMD table cannot be allocated or
 * a lower-level copy reports a failure.
 */
static inline int
copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *to_mm = dst_vma->vm_mm;
        struct mm_struct *from_mm = src_vma->vm_mm;
        pmd_t *dst_pmd = pmd_alloc(to_mm, dst_pud, addr);
        pmd_t *src_pmd;
        unsigned long next;

        if (!dst_pmd)
                return -ENOMEM;

        src_pmd = pmd_offset(src_pud, addr);
        do {
                next = pmd_addr_end(addr, end);

                if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) ||
                    pmd_devmap(*src_pmd)) {
                        int huge_ret;

                        VM_BUG_ON_VMA(next - addr != HPAGE_PMD_SIZE, src_vma);
                        huge_ret = copy_huge_pmd(to_mm, from_mm, dst_pmd,
                                                 src_pmd, addr, dst_vma,
                                                 src_vma);
                        /* Zero: nothing more to do for this entry. */
                        if (huge_ret == 0)
                                continue;
                        if (huge_ret == -ENOMEM)
                                return -ENOMEM;
                        /* Any other result: handle it as a regular PMD. */
                }

                /* Empty (or bad, now cleared) entries are simply skipped. */
                if (!pmd_none_or_clear_bad(src_pmd) &&
                    copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
                                   addr, next))
                        return -ENOMEM;
        } while (dst_pmd++, src_pmd++, addr = next, addr != end);

        return 0;
}

/*
 * Copy the PUD entries that cover [addr, end) below @src_p4d into the PUD
 * table below @dst_p4d, allocating the destination PUD table if needed.
 *
 * Transparent-huge and devmap PUDs go through copy_huge_pud(); all other
 * populated entries are descended into with copy_pmd_range().
 *
 * Returns 0 on success, or -ENOMEM if the PUD table cannot be allocated or
 * a lower-level copy reports a failure.
 */
static inline int
copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *to_mm = dst_vma->vm_mm;
        struct mm_struct *from_mm = src_vma->vm_mm;
        pud_t *dst_pud = pud_alloc(to_mm, dst_p4d, addr);
        pud_t *src_pud;
        unsigned long next;

        if (!dst_pud)
                return -ENOMEM;

        src_pud = pud_offset(src_p4d, addr);
        do {
                next = pud_addr_end(addr, end);

                if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
                        int huge_ret;

                        VM_BUG_ON_VMA(next - addr != HPAGE_PUD_SIZE, src_vma);
                        huge_ret = copy_huge_pud(to_mm, from_mm, dst_pud,
                                                 src_pud, addr, src_vma);
                        /* Zero: nothing more to do for this entry. */
                        if (huge_ret == 0)
                                continue;
                        if (huge_ret == -ENOMEM)
                                return -ENOMEM;
                        /* Any other result: handle it as a regular PUD. */
                }

                /* Empty (or bad, now cleared) entries are simply skipped. */
                if (!pud_none_or_clear_bad(src_pud) &&
                    copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
                                   addr, next))
                        return -ENOMEM;
        } while (dst_pud++, src_pud++, addr = next, addr != end);

        return 0;
}

/*
 * Copy the P4D entries that cover [addr, end) below @src_pgd into the P4D
 * table below @dst_pgd, allocating the destination P4D table if needed.
 *
 * Returns 0 on success, or -ENOMEM if the P4D table cannot be allocated or
 * copy_pud_range() reports a failure.
 */
static inline int
copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
               unsigned long end)
{
        p4d_t *dst_p4d = p4d_alloc(dst_vma->vm_mm, dst_pgd, addr);
        p4d_t *src_p4d;
        unsigned long next;

        if (!dst_p4d)
                return -ENOMEM;

        src_p4d = p4d_offset(src_pgd, addr);
        do {
                next = p4d_addr_end(addr, end);

                /* Empty (or bad, now cleared) entries are simply skipped. */
                if (!p4d_none_or_clear_bad(src_p4d) &&
                    copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
                                   addr, next))
                        return -ENOMEM;
        } while (dst_p4d++, src_p4d++, addr = next, addr != end);

        return 0;
}

/*
 * Tell whether the page tables of @src_vma have to be copied into @dst_vma
 * during this fork().  A false return lets fork() skip the copy, leaving the
 * child to populate the range lazily through page faults when it first
 * touches the memory.
 */
static bool
vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
        /*
         * A copy is required when any of the following holds:
         *
         *  - dst_vma has uffd-wp enabled.  This applies even to file-backed
         *    (e.g. shmem) mappings: the uffd-wp protection information lives
         *    in the page table and cannot be recovered from the page cache,
         *    so skipping the copy would lose it.
         *  - src_vma is a VM_PFNMAP or VM_MIXEDMAP mapping.
         *  - src_vma has an anon_vma.
         *
         * In every other case a later page fault fills the ptes correctly,
         * so nothing is copied.  Fork becomes much lighter with big shared
         * or private read-only mappings; the tradeoff is that
         * copy_page_range() is more efficient than faulting.
         */
        return userfaultfd_wp(dst_vma) ||
               (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) ||
               src_vma->anon_vma;
}

/*
 * Copy the page tables that map the whole of @src_vma (vm_start..vm_end)
 * from src_vma->vm_mm into dst_vma->vm_mm.
 *
 * Returns:
 *   0        nothing needed copying (see vma_needs_copy()) or the copy
 *            succeeded;
 *   -ENOMEM  a lower-level copy (copy_p4d_range()) failed;
 *   other    whatever copy_hugetlb_page_range() returns for hugetlb VMAs, or
 *            the error from track_pfn_copy() for VM_PFNMAP VMAs.
 */
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
        pgd_t *src_pgd, *dst_pgd;
        unsigned long next;
        unsigned long addr = src_vma->vm_start;
        unsigned long end = src_vma->vm_end;
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        struct mmu_notifier_range range;
        bool is_cow;
        int ret;

        /* Fast path: the range can be faulted in lazily later on. */
        if (!vma_needs_copy(dst_vma, src_vma))
                return 0;

        /* hugetlb mappings have their own copy routine. */
        if (is_vm_hugetlb_page(src_vma))
                return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);

        if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
                /*
                 * We do not free on error cases below as remove_vma
                 * gets called on error from higher level routine
                 */
                ret = track_pfn_copy(src_vma);
                if (ret)
                        return ret;
        }

        /*
         * We need to invalidate the secondary MMU mappings only when
         * there could be a permission downgrade on the ptes of the
         * parent mm. And a permission downgrade will only happen if
         * is_cow_mapping() returns true.
         */
        is_cow = is_cow_mapping(src_vma->vm_flags);

        if (is_cow) {
                mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
                                        0, src_mm, addr, end);
                mmu_notifier_invalidate_range_start(&range);
                /*
                 * Disabling preemption is not needed for the write side, as
                 * the read side doesn't spin, but goes to the mmap_lock.
                 *
                 * Use the raw variant of the seqcount_t write API to avoid
                 * lockdep complaining about preemptibility.
                 */
                vma_assert_write_locked(src_vma);
                raw_write_seqcount_begin(&src_mm->write_protect_seq);
        }

        ret = 0;
        dst_pgd = pgd_offset(dst_mm, addr);
        src_pgd = pgd_offset(src_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                /* Nothing mapped under this PGD entry: move to the next one. */
                if (pgd_none_or_clear_bad(src_pgd))
                        continue;
                if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
                                            addr, next))) {
                        /*
                         * Any failure from the lower levels is reported as
                         * -ENOMEM.  Break rather than return so that the
                         * seqcount/notifier bracket below is still closed.
                         * NOTE(review): untrack_pfn_clear() presumably drops
                         * the PFN tracking state of dst_vma - confirm.
                         */
                        untrack_pfn_clear(dst_vma);
                        ret = -ENOMEM;
                        break;
                }
        } while (dst_pgd++, src_pgd++, addr = next, addr != end);

        /* Close the bracket opened above, in reverse order. */
        if (is_cow) {
                raw_write_seqcount_end(&src_mm->write_protect_seq);
                mmu_notifier_invalidate_range_end(&range);
        }
        return ret;
}

/*
 * Should COWed (private) pages be zapped as well?
 *
 * Without @details everything is zapped; otherwise the caller decides
 * through details->even_cows.
 */
static inline bool should_zap_cows(struct zap_details *details)
{
        return !details || details->even_cows;
}

/*
 * Decide whether @folio should be zapped.
 *
 * When COWed pages are being zapped too, the answer is yes without looking
 * at the folio at all; otherwise only non-anonymous folios are zapped.
 */
static inline bool should_zap_folio(struct zap_details *details,
                                    struct folio *folio)
{
        return should_zap_cows(details) || !folio_test_anon(folio);
}

/*
 * True when the caller supplied @details and asked, through
 * ZAP_FLAG_DROP_MARKER, for markers to be dropped as well.
 */
static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
{
        return details && (details->zap_flags & ZAP_FLAG_DROP_MARKER);
}

/*
 * After zapping @nr consecutive ptes starting at @pte / @addr, make sure that
 * each now-none pte is replaced with an uffd-wp swap special pte marker when
 * that is necessary.  Must be called with the pgtable lock held.
 *
 * Nothing is installed for anonymous VMAs, nor when the caller asked for
 * markers to be dropped (see zap_drop_file_uffd_wp()).
 */
static inline void
zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *pte, int nr,
                              struct zap_details *details, pte_t pteval)
{
        /* Zapping an anonymous mapping always means dropping everything. */
        if (vma_is_anonymous(vma) || zap_drop_file_uffd_wp(details))
                return;

        while (true) {
                /* The PFN stored in @pteval is irrelevant here. */
                pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);

                nr--;
                if (!nr)
                        return;

                addr += PAGE_SIZE;
                pte++;
        }
}

/*
 * Zap @nr consecutive present PTEs, starting at @pte / @addr, that map pages
 * of @folio beginning with @page.
 *
 * The PTEs are cleared, the matching rss[] counter is decremented by @nr,
 * the TLB entries are queued for invalidation, uffd-wp markers are installed
 * where required, and the pages are handed to the mmu_gather in @tlb.
 *
 * @ptent is the value of the first PTE as read by the caller; for non-anon
 * folios it is replaced by the value returned from get_and_clear_full_ptes().
 *
 * *force_flush is set when rmap removal is delayed for a dirty non-anon
 * folio, or when __tlb_remove_folio_pages() returns non-zero; in the latter
 * case *force_break is set as well so that the caller stops its loop.
 */
static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
                struct vm_area_struct *vma, struct folio *folio,
                struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
                unsigned long addr, struct zap_details *details, int *rss,
                bool *force_flush, bool *force_break)
{
        struct mm_struct *mm = tlb->mm;
        bool delay_rmap = false;

        if (!folio_test_anon(folio)) {
                /* The dirty/young bits of the cleared PTEs are needed below. */
                ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
                if (pte_dirty(ptent)) {
                        folio_mark_dirty(folio);
                        /*
                         * If the gather wants rmap removal delayed, skip it
                         * here and request a flush from the caller instead.
                         */
                        if (tlb_delay_rmap(tlb)) {
                                delay_rmap = true;
                                *force_flush = true;
                        }
                }
                if (pte_young(ptent) && likely(vma_has_recency(vma)))
                        folio_mark_accessed(folio);
                rss[mm_counter(folio)] -= nr;
        } else {
                /* We don't need up-to-date accessed/dirty bits. */
                clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
                rss[MM_ANONPAGES] -= nr;
        }
        /* Checking a single PTE in a batch is sufficient. */
        arch_check_zapped_pte(vma, ptent);
        tlb_remove_tlb_entries(tlb, pte, nr, addr);
        if (unlikely(userfaultfd_pte_wp(vma, ptent)))
                zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details,
                                              ptent);

        if (!delay_rmap) {
                folio_remove_rmap_ptes(folio, page, nr, vma);

                /* A negative mapcount indicates a corrupted mapping. */
                if (unlikely(folio_mapcount(folio) < 0))
                        print_bad_pte(vma, addr, ptent, page);
        }
        /*
         * NOTE(review): a non-zero return presumably means the gather batch
         * cannot take more pages - confirm against __tlb_remove_folio_pages().
         */
        if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
                *force_flush = true;
                *force_break = true;
        }
}

/*
 * Zap or skip at least one present PTE at @pte / @addr, batching subsequent
 * PTEs (up to @max_nr in total) when they map consecutive pages of the same
 * large folio.
 *
 * Returns how many PTEs were processed, i.e. skipped or zapped; this is
 * always at least 1.
 */
static inline int zap_present_ptes(struct mmu_gather *tlb,
                struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
                unsigned int max_nr, unsigned long addr,
                struct zap_details *details, int *rss, bool *force_flush,
                bool *force_break)
{
        const fpb_t batch_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
        struct page *page = vm_normal_page(vma, addr, ptent);
        struct folio *folio;
        int nr;

        if (!page) {
                /*
                 * No normal page behind this PTE: clear it on its own.
                 * We don't need up-to-date accessed/dirty bits.
                 */
                ptep_get_and_clear_full(tlb->mm, addr, pte, tlb->fullmm);
                arch_check_zapped_pte(vma, ptent);
                tlb_remove_tlb_entry(tlb, pte, addr);
                if (userfaultfd_pte_wp(vma, ptent))
                        zap_install_uffd_wp_if_needed(vma, addr, pte, 1,
                                                      details, ptent);
                ksm_might_unmap_zero_page(tlb->mm, ptent);
                return 1;
        }

        folio = page_folio(page);
        /* The caller does not want this folio zapped: skip one PTE. */
        if (unlikely(!should_zap_folio(details, folio)))
                return 1;

        /*
         * Keep the common "small folio" case as fast as possible by giving
         * it its own call, separate from the batching logic.
         */
        if (likely(!folio_test_large(folio) || max_nr == 1)) {
                zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1,
                                       addr, details, rss, force_flush,
                                       force_break);
                return 1;
        }

        nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, batch_flags,
                             NULL, NULL, NULL);
        zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr,
                               details, rss, force_flush, force_break);
        return nr;
}

/*
 * Zap the PTEs that map [addr, end) in the PTE table below @pmd.
 *
 * Returns the address at which zapping stopped.  This equals @end when the
 * whole range was processed, and is lower when the loop bailed out early
 * (need_resched(), or a forced break requested by zap_present_ptes()).  If
 * the PTE table cannot be mapped and locked, @addr is returned unchanged.
 *
 * rss deltas are accumulated locally in rss[] and applied to the mm once,
 * after the loop.
 */
static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        bool force_flush = false, force_break = false;
        struct mm_struct *mm = tlb->mm;
        int rss[NR_MM_COUNTERS];
        spinlock_t *ptl;
        pte_t *start_pte;
        pte_t *pte;
        swp_entry_t entry;
        int nr;

        tlb_change_page_size(tlb, PAGE_SIZE);
        init_rss_vec(rss);
        start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
                return addr;

        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        do {
                pte_t ptent = ptep_get(pte);
                struct folio *folio;
                struct page *page;
                int max_nr;

                /*
                 * Number of PTEs consumed by this iteration.  Every
                 * "continue" below jumps to the loop condition, which
                 * advances pte and addr by nr entries.
                 */
                nr = 1;
                if (pte_none(ptent))
                        continue;

                /* Bail out with addr pointing at the first unprocessed PTE. */
                if (need_resched())
                        break;

                if (pte_present(ptent)) {
                        max_nr = (end - addr) / PAGE_SIZE;
                        nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
                                              addr, details, rss, &force_flush,
                                              &force_break);
                        if (unlikely(force_break)) {
                                /*
                                 * "break" skips the advance in the loop
                                 * condition, so account for the PTEs that
                                 * were just processed by hand.
                                 */
                                addr += nr * PAGE_SIZE;
                                break;
                        }
                        continue;
                }

                /* Non-present, non-none PTE: decode it as a swap entry. */
                entry = pte_to_swp_entry(ptent);
                if (is_device_private_entry(entry) ||
                    is_device_exclusive_entry(entry)) {
                        page = pfn_swap_entry_to_page(entry);
                        folio = page_folio(page);
                        if (unlikely(!should_zap_folio(details, folio)))
                                continue;
                        /*
                         * Both device private/exclusive mappings should only
                         * work with anonymous page so far, so we don't need to
                         * consider uffd-wp bit when zap. For more information,
                         * see zap_install_uffd_wp_if_needed().
                         */
                        WARN_ON_ONCE(!vma_is_anonymous(vma));
                        rss[mm_counter(folio)]--;
                        /* The rmap is only removed for device-private entries. */
                        if (is_device_private_entry(entry))
                                folio_remove_rmap_pte(folio, page, vma);
                        folio_put(folio);
                } else if (!non_swap_entry(entry)) {
                        max_nr = (end - addr) / PAGE_SIZE;
                        nr = swap_pte_batch(pte, max_nr, ptent);
                        /* Genuine swap entries, hence a private anon pages */
                        if (!should_zap_cows(details))
                                continue;
                        rss[MM_SWAPENTS] -= nr;
                        free_swap_and_cache_nr(entry, nr);
                } else if (is_migration_entry(entry)) {
                        folio = pfn_swap_entry_folio(entry);
                        if (!should_zap_folio(details, folio))
                                continue;
                        rss[mm_counter(folio)]--;
                } else if (pte_marker_entry_uffd_wp(entry)) {
                        /*
                         * For anon: always drop the marker; for file: only
                         * drop the marker if explicitly requested.
                         */
                        if (!vma_is_anonymous(vma) &&
                            !zap_drop_file_uffd_wp(details))
                                continue;
                } else if (is_hwpoison_entry(entry) ||
                           is_poisoned_swp_entry(entry)) {
                        if (!should_zap_cows(details))
                                continue;
                } else {
                        /* We should have covered all the swap entry types */
                        pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
                        WARN_ON_ONCE(1);
                }
                /*
                 * Common tail for every non-present entry that was not
                 * skipped above: clear the nr PTEs and re-install uffd-wp
                 * markers where required.
                 */
                clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
                zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
        } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);

        add_mm_rss_vec(mm, rss);
        arch_leave_lazy_mmu_mode();

        /* Do the actual TLB flush before dropping ptl */
        if (force_flush) {
                tlb_flush_mmu_tlbonly(tlb);
                tlb_flush_rmaps(tlb, vma);
        }
        pte_unmap_unlock(start_pte, ptl);

        /*
         * If we forced a TLB flush (either due to running out of
         * batch buffers or because we needed to flush dirty TLB
         * entries before releasing the ptl), free the batched
         * memory too. Come back again if we didn't do everything.
         */
        if (force_flush)
                tlb_flush_mmu(tlb);

        return addr;
}

/*
 * Zap everything mapped in [addr, end) by the PMD entries below @pud.
 *
 * Swap, transparent-huge and devmap PMDs that are only partially covered by
 * the range are split first; fully covered ones are zapped whole with
 * zap_huge_pmd().  Everything else is passed to zap_pte_range().
 *
 * Returns the address reached when the loop ends, i.e. @end.
 */
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
                        /* Partial coverage: split, then zap at PTE level below. */
                        if (next - addr != HPAGE_PMD_SIZE)
                                __split_huge_pmd(vma, pmd, addr, false, NULL);
                        else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
                                /* The whole huge entry has been handled. */
                                addr = next;
                                continue;
                        }
                        /* fall through */
                } else if (details && details->single_folio &&
                           folio_test_pmd_mappable(details->single_folio) &&
                           next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
                        spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
                        /*
                         * Take and drop THP pmd lock so that we cannot return
                         * prematurely, while zap_huge_pmd() has cleared *pmd,
                         * but not yet decremented compound_mapcount().
                         */
                        spin_unlock(ptl);
                }
                /* Nothing mapped here: step over the entry. */
                if (pmd_none(*pmd)) {
                        addr = next;
                        continue;
                }
                addr = zap_pte_range(tlb, vma, pmd, addr, next, details);
                /*
                 * zap_pte_range() stopped early: cancel the pmd++ in the loop
                 * condition so that the same PMD entry is visited again,
                 * after the cond_resched() in that condition has run.
                 */
                if (addr != next)
                        pmd--;
        } while (pmd++, cond_resched(), addr != end);

        return addr;
}

/*
 * Zap everything mapped in [addr, end) by the PUD entries below @p4d.
 *
 * Transparent-huge and devmap PUDs that are only partially covered by the
 * range are split; fully covered ones are zapped whole with zap_huge_pud().
 * Everything else is passed down to zap_pmd_range().
 *
 * Returns the address reached when the loop ends, i.e. @end.
 */
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, p4d_t *p4d,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(p4d, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
                        if (next - addr != HPAGE_PUD_SIZE) {
                                /* Partial coverage: split, then descend below. */
                                mmap_assert_locked(tlb->mm);
                                split_huge_pud(vma, pud, addr);
                        } else if (zap_huge_pud(tlb, vma, pud, addr))
                                /* Whole huge entry handled; skip the descent. */
                                goto next;
                        /* fall through */
                }
                /*
                 * "continue" goes straight to the loop condition, so the
                 * cond_resched() at the label below is not run for empty
                 * entries.
                 */
                if (pud_none_or_clear_bad(pud))
                        continue;
                next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
                cond_resched();
        } while (pud++, addr = next, addr != end);

        return addr;
}

/*
 * Zap everything mapped in [addr, end) by the P4D entries below @pgd,
 * descending into each populated entry with zap_pud_range().
 *
 * Returns the address reached when the loop ends, i.e. @end.
 */
static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        p4d_t *p4d = p4d_offset(pgd, addr);
        unsigned long next;

        do {
                next = p4d_addr_end(addr, end);

                /* Only populated entries are descended into. */
                if (!p4d_none_or_clear_bad(p4d))
                        next = zap_pud_range(tlb, vma, p4d, addr, next,
                                             details);
        } while (p4d++, addr = next, addr != end);

        return addr;
}

/*
 * Zap the page-table entries of @vma that map [addr, end), walking down from
 * the PGD level.  The walk is bracketed by tlb_start_vma()/tlb_end_vma().
 *
 * @addr must be strictly below @end; anything else is a BUG.
 */
void unmap_page_range(struct mmu_gather *tlb,
                      struct vm_area_struct *vma,
                      unsigned long addr, unsigned long end,
                      struct zap_details *details)
{
        unsigned long next;
        pgd_t *pgd;

        BUG_ON(addr >= end);

        tlb_start_vma(tlb, vma);
        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);

                /* Only populated entries are descended into. */
                if (!pgd_none_or_clear_bad(pgd))
                        next = zap_p4d_range(tlb, vma, pgd, addr, next,
                                             details);
        } while (pgd++, addr = next, addr != end);
        tlb_end_vma(tlb, vma);
}


/*
 * Unmap the part of @vma that intersects [start_addr, end_addr).
 *
 * The requested range is clamped to the VMA; if nothing of the VMA is
 * covered the function does nothing.  @mm_wr_locked is forwarded to
 * untrack_pfn() for VM_PFNMAP mappings.
 */
static void unmap_single_vma(struct mmu_gather *tlb,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr,
                struct zap_details *details, bool mm_wr_locked)
{
        const unsigned long zap_start = max(vma->vm_start, start_addr);
        const unsigned long zap_end = min(vma->vm_end, end_addr);

        /* The requested range lies entirely outside this VMA. */
        if (zap_start >= vma->vm_end || zap_end <= vma->vm_start)
                return;

        if (vma->vm_file)
                uprobe_munmap(vma, zap_start, zap_end);

        if (unlikely(vma->vm_flags & VM_PFNMAP))
                untrack_pfn(vma, 0, 0, mm_wr_locked);

        if (zap_start == zap_end)
                return;

        if (likely(!is_vm_hugetlb_page(vma))) {
                unmap_page_range(tlb, vma, zap_start, zap_end, details);
                return;
        }

        /*
         * Testing vma->vm_file is undesirable, since it should be non-NULL
         * for any valid hugetlb area.  It will however be NULL in the error
         * cleanup path of mmap_region(): when the hugetlbfs ->mmap method
         * fails, mmap_region() nullifies vma->vm_file before calling this
         * function to clean up.  No pte has been set up in that case, so it
         * is safe to do nothing.
         */
        if (vma->vm_file) {
                zap_flags_t zap_flags = details ? details->zap_flags : 0;

                __unmap_hugepage_range(tlb, vma, zap_start, zap_end,
                                       NULL, zap_flags);
        }
}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @mas: the maple state used to find the VMAs following @vma
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 * @tree_end: The maximum index to check; the lookup stops at @tree_end - 1
 * @mm_wr_locked: lock flag, passed through to unmap_single_vma()
 *
 * Unmap all pages in the vma list, starting with @vma and continuing with
 * each VMA returned by mas_find() until none is left or a zero entry is hit.
 *
 * Only addresses between @start_addr and @end_addr will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr, unsigned long tree_end,
                bool mm_wr_locked)
{
        struct zap_details details = {
                .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
                /* Careful - we need to zap private pages too! */
                .even_cows = true,
        };
        struct mmu_notifier_range mn_range;

        mmu_notifier_range_init(&mn_range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
                                start_addr, end_addr);
        mmu_notifier_invalidate_range_start(&mn_range);

        for (;;) {
                /*
                 * Work on per-VMA copies: hugetlb_zap_begin() receives
                 * pointers to the bounds and may adjust them.
                 */
                unsigned long zap_start = start_addr;
                unsigned long zap_end = end_addr;

                hugetlb_zap_begin(vma, &zap_start, &zap_end);
                unmap_single_vma(tlb, vma, zap_start, zap_end, &details,
                                 mm_wr_locked);
                hugetlb_zap_end(vma, &details);

                vma = mas_find(mas, tree_end - 1);
                if (!vma || unlikely(xa_is_zero(vma)))
                        break;
        }

        mmu_notifier_invalidate_range_end(&mn_range);
}

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
                unsigned long size, struct zap_details *details)
{
        const unsigned long end = address + size;
        struct mmu_notifier_range range;
        struct mmu_gather tlb;

        lru_add_drain();
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                address, end);
        hugetlb_zap_begin(vma, &range.start, &range.end);
        tlb_gather_mmu(&tlb, vma->vm_mm);
        update_hiwater_rss(vma->vm_mm);
        mmu_notifier_invalidate_range_start(&range);
        /*
         * unmap 'address-end' not 'range.start-range.end' as range
         * could have been expanded for hugetlb pmd sharing.
         */
        unmap_single_vma(&tlb, vma, address, end, details, false);
        mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb);
        hugetlb_zap_end(vma, details);
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma; if it is
 * not, or if the vma is not VM_PFNMAP, the call silently does nothing.
 */
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
                unsigned long size)
{
        if (range_in_vma(vma, address, address + size) &&
            (vma->vm_flags & VM_PFNMAP))
                zap_page_range_single(vma, address, size, NULL);
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);

/*
 * Walk the page tables of @mm down to the PMD entry covering @addr,
 * allocating the P4D, PUD and PMD levels on the way as needed.
 *
 * Returns the PMD pointer, or NULL if any of the allocations fails.  The
 * resulting PMD must not be a transparent huge one (VM_BUG_ON otherwise).
 */
static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd = pgd_offset(mm, addr);
        p4d_t *p4d = p4d_alloc(mm, pgd, addr);
        pud_t *pud;
        pmd_t *pmd;

        if (!p4d)
                return NULL;

        pud = pud_alloc(mm, p4d, addr);
        if (!pud)
                return NULL;

        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
                return NULL;

        VM_BUG_ON(pmd_trans_huge(*pmd));
        return pmd;
}

/*
 * Return the PTE for @addr in @mm, mapped and locked, allocating any missing
 * page-table levels on the way.  The lock taken is stored in *@ptl.
 *
 * Returns NULL if the walk to the PMD fails, otherwise whatever
 * pte_alloc_map_lock() returns.
 */
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                        spinlock_t **ptl)
{
        pmd_t *pmd = walk_to_pmd(mm, addr);

        return pmd ? pte_alloc_map_lock(mm, pmd, addr, ptl) : NULL;
}

/*
 * Check that @page may be inserted into a user mapping and, if so, flush the
 * data cache for its folio.
 *
 * Returns -EINVAL for anonymous folios, slab folios and pages that carry a
 * page type; 0 otherwise.
 */
static int validate_page_before_insert(struct page *page)
{
        struct folio *folio = page_folio(page);

        if (folio_test_anon(folio))
                return -EINVAL;
        if (folio_test_slab(folio))
                return -EINVAL;
        if (page_has_type(page))
                return -EINVAL;

        flush_dcache_folio(folio);
        return 0;
}

/*
 * Install @page at @pte / @addr in @vma with protection @prot.  As the name
 * says, the caller is expected to hold the PTE lock.
 *
 * Takes a folio reference, bumps the file rss counter of the mm and adds the
 * file rmap before writing the PTE.
 *
 * Returns 0 on success, or -EBUSY if the PTE is already populated.
 */
static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
                        unsigned long addr, struct page *page, pgprot_t prot)
{
        struct mm_struct *mm = vma->vm_mm;
        struct folio *folio = page_folio(page);

        /* Never overwrite an entry that is already in place. */
        if (!pte_none(ptep_get(pte)))
                return -EBUSY;

        /* Ok, finally just insert the thing.. */
        folio_get(folio);
        inc_mm_counter(mm, mm_counter_file(folio));
        folio_add_file_rmap_pte(folio, page, vma);
        set_pte_at(mm, addr, pte, mk_pte(page, prot));

        return 0;
}

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 *
 * Returns 0 on success, the error from validate_page_before_insert() if the
 * page is unsuitable, -ENOMEM if no locked PTE could be obtained, or the
 * error from insert_page_into_pte_locked().
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
                        struct page *page, pgprot_t prot)
{
        spinlock_t *ptl;
        pte_t *pte;
        int err;

        err = validate_page_before_insert(page);
        if (err)
                return err;

        pte = get_locked_pte(vma->vm_mm, addr, &ptl);
        if (!pte)
                return -ENOMEM;

        err = insert_page_into_pte_locked(vma, pte, addr, page, prot);
        pte_unmap_unlock(pte, ptl);

        return err;
}

/*
 * Validate @page and install it at @pte / @addr; intended for callers that
 * insert several pages while holding the PTE lock.
 *
 * Returns -EINVAL for a page whose reference count is zero, the error from
 * validate_page_before_insert() if the page is unsuitable, or otherwise the
 * result of insert_page_into_pte_locked().
 */
static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
                        unsigned long addr, struct page *page, pgprot_t prot)
{
        int err;

        if (page_count(page) == 0)
                return -EINVAL;

        err = validate_page_before_insert(page);
        if (!err)
                err = insert_page_into_pte_locked(vma, pte, addr, page, prot);

        return err;
}

/*
 * insert_pages() amortizes the cost of spinlock operations
 * when inserting pages in a loop.
 *
 * Starting at @addr, the pages in @pages are installed one PTE table at a
 * time. Within a table, the PTE lock is taken once per batch of up to 8
 * pages instead of once per page.
 *
 * On entry *@num is the number of pages to insert; on return it holds the
 * number of pages that were NOT inserted (0 on full success). Pages that
 * were inserted before a failure are left mapped.
 *
 * Returns 0 on success, -EFAULT if walk_to_pmd() fails or the PTE table
 * cannot be mapped, -ENOMEM if pte_alloc() fails, or the error returned by
 * insert_page_in_batch_locked().
 */
static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num, pgprot_t prot)
{
        pmd_t *pmd = NULL;
        pte_t *start_pte, *pte;
        spinlock_t *pte_lock;
        struct mm_struct *const mm = vma->vm_mm;
        unsigned long curr_page_idx = 0;
        unsigned long remaining_pages_total = *num;
        unsigned long pages_to_write_in_pmd;
        int ret;
more:
        /* Each pass from here handles the part of the range under one PMD. */
        ret = -EFAULT;
        pmd = walk_to_pmd(mm, addr);
        if (!pmd)
                goto out;

        /* Clamp to the slots left between addr and the end of this PTE table. */
        pages_to_write_in_pmd = min_t(unsigned long,
                remaining_pages_total, PTRS_PER_PTE - pte_index(addr));

        /* Allocate the PTE if necessary; takes PMD lock once only. */
        ret = -ENOMEM;
        if (pte_alloc(mm, pmd))
                goto out;

        while (pages_to_write_in_pmd) {
                int pte_idx = 0;
                /* Hold the PTE lock for at most 8 insertions at a time. */
                const int batch_size = min_t(int, pages_to_write_in_pmd, 8);

                start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
                if (!start_pte) {
                        ret = -EFAULT;
                        goto out;
                }
                for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
                        int err = insert_page_in_batch_locked(vma, pte,
                                addr, pages[curr_page_idx], prot);
                        if (unlikely(err)) {
                                pte_unmap_unlock(start_pte, pte_lock);
                                ret = err;
                                /* pte_idx pages of this batch did go in. */
                                remaining_pages_total -= pte_idx;
                                goto out;
                        }
                        addr += PAGE_SIZE;
                        ++curr_page_idx;
                }
                pte_unmap_unlock(start_pte, pte_lock);
                pages_to_write_in_pmd -= batch_size;
                remaining_pages_total -= batch_size;
        }
        /* More pages than fit under this PMD: continue with the next one. */
        if (remaining_pages_total)
                goto more;
        ret = 0;
out:
        /* Report how many pages were left unmapped, on success and failure. */
        *num = remaining_pages_total;
        return ret;
}

/**
 * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
 * @vma: user vma to map to
 * @addr: target start user address of these pages
 * @pages: source kernel pages
 * @num: in: number of pages to map. out: number of pages that were *not*
 * mapped. (0 means all pages were successfully mapped).
 *
 * Preferred over vm_insert_page() when inserting multiple pages.
 *
 * In case of error, we may have mapped a subset of the provided
 * pages. It is the caller's responsibility to account for this case.
 *
 * The same restrictions apply as in vm_insert_page().
 */
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num)
{
        const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;

        if (addr < vma->vm_start || end_addr >= vma->vm_end)
                return -EFAULT;
        if (!(vma->vm_flags & VM_MIXEDMAP)) {
                BUG_ON(mmap_read_trylock(vma->vm_mm));
                BUG_ON(vma->vm_flags & VM_PFNMAP);
                vm_flags_set(vma, VM_MIXEDMAP);
        }
        /* Defer page refcount checking till we're about to map that page. */
        return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_pages);

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
                        struct page *page)
{
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
        if (!page_count(page))
                return -EINVAL;
        if (!(vma->vm_flags & VM_MIXEDMAP)) {
                BUG_ON(mmap_read_trylock(vma->vm_mm));
                BUG_ON(vma->vm_flags & VM_PFNMAP);
                vm_flags_set(vma, VM_MIXEDMAP);
        }
        return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);

/*
 * __vm_map_pages - maps range of kernel pages into user vma
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 * @offset: user's requested vm_pgoff
 *
 * This allows drivers to map range of kernel pages into a user vma.
 * Every page of the vma is populated, starting at vma->vm_start, from
 * @pages[@offset] onwards using vm_insert_page().
 *
 * Return: 0 on success and error code otherwise. -ENXIO if @offset is not
 * below @num or if the vma covers more pages than remain after @offset;
 * otherwise the first error returned by vm_insert_page(). Pages inserted
 * before a failure are left in place.
 */
static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num, unsigned long offset)
{
        unsigned long count = vma_pages(vma);
        unsigned long uaddr = vma->vm_start;
        /* Same width as count, so the index cannot overflow before reaching it. */
        unsigned long i;
        int ret;

        /* Fail if the user requested offset is beyond the end of the object */
        if (offset >= num)
                return -ENXIO;

        /* Fail if the user requested size exceeds available object size */
        if (count > num - offset)
                return -ENXIO;

        for (i = 0; i < count; i++) {
                ret = vm_insert_page(vma, uaddr, pages[offset + i]);
                if (ret < 0)
                        return ret;
                uaddr += PAGE_SIZE;
        }

        return 0;
}

/**
 * vm_map_pages - map a range of kernel pages, honouring the user's vm_pgoff
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Maps an object consisting of @num pages, catering for the user's
 * requested vm_pgoff: vma->vm_pgoff is used as the index of the first
 * page in @pages to map.
 *
 * If we fail to insert any page into the vma, the function will return
 * immediately leaving any previously inserted pages present.  Callers
 * from the mmap handler may immediately return the error as their caller
 * will destroy the vma, removing any successfully inserted pages. Other
 * callers should make their own arrangements for calling unmap_region().
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise. -ENXIO is returned when
 * vm_pgoff is not below @num, or when the vma covers more pages than remain
 * in the array after vm_pgoff.
 */
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num)
{
        return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
}
EXPORT_SYMBOL(vm_map_pages);

/**
 * vm_map_pages_zero - map a range of kernel pages starting at offset zero
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Similar to vm_map_pages(), except that it explicitly sets the offset
 * to 0, so vma->vm_pgoff is ignored and mapping always starts at
 * @pages[0]. This function is intended for the drivers that did not
 * consider vm_pgoff.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise. -ENXIO is returned when
 * @num is 0 or the vma covers more than @num pages.
 */
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num)
{
        return __vm_map_pages(vma, pages, num, 0);
}
EXPORT_SYMBOL(vm_map_pages_zero);

/*
 * Install a raw PFN mapping for @addr in @vma.
 *
 * If the PTE is empty a new entry is written: marked devmap when
 * pfn_t_devmap(@pfn) is true and special otherwise, and additionally made
 * young, dirty and (via maybe_mkwrite()) possibly writable when @mkwrite
 * is set.
 *
 * If the PTE is already populated it is left alone, unless @mkwrite is set
 * and the entry maps the same PFN, in which case the existing entry is
 * upgraded in place.
 *
 * Returns VM_FAULT_OOM if the PTE cannot be obtained and VM_FAULT_NOPAGE in
 * every other case, including when an existing entry was left untouched.
 */
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn, pgprot_t prot, bool mkwrite)
{
        struct mm_struct *mm = vma->vm_mm;
        pte_t *pte, entry;
        spinlock_t *ptl;

        pte = get_locked_pte(mm, addr, &ptl);
        if (!pte)
                return VM_FAULT_OOM;
        entry = ptep_get(pte);
        if (!pte_none(entry)) {
                if (mkwrite) {
                        /*
                         * For read faults on private mappings the PFN passed
                         * in may not match the PFN we have mapped if the
                         * mapped PFN is a writeable COW page.  In the mkwrite
                         * case we are creating a writable PTE for a shared
                         * mapping and we expect the PFNs to match. If they
                         * don't match, we are likely racing with block
                         * allocation and mapping invalidation so just skip the
                         * update.
                         */
                        if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
                                WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
                                goto out_unlock;
                        }
                        /* Same PFN: upgrade the existing entry in place. */
                        entry = pte_mkyoung(entry);
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                        if (ptep_set_access_flags(vma, addr, pte, entry, 1))
                                update_mmu_cache(vma, addr, pte);
                }
                /* Populated entry and nothing (more) to do: report success. */
                goto out_unlock;
        }

        /* Ok, finally just insert the thing.. */
        if (pfn_t_devmap(pfn))
                entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
        else
                entry = pte_mkspecial(pfn_t_pte(pfn, prot));

        if (mkwrite) {
                entry = pte_mkyoung(entry);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        }

        set_pte_at(mm, addr, pte, entry);
        update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */

out_unlock:
        pte_unmap_unlock(pte, ptl);
        return VM_FAULT_NOPAGE;
}

/**
 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * Identical to vmf_insert_pfn(), except that the driver chooses the pgprot
 * for each page instead of inheriting @vma->vm_page_prot.
 *
 * This only makes sense for IO mappings, and it makes no sense for COW
 * mappings.  Splitting the range over multiple vmas is generally the
 * preferable design; vmf_insert_pfn_prot should only be used when that is
 * impractical.
 *
 * In practice @pgprot differs from @vma->vm_page_prot only in its caching-
 * and encryption bits, typically because the right caching- or encryption
 * mode is not yet known at mmap() time.
 *
 * That is fine provided the core vm never relies on @vma->vm_page_prot for
 * the caching and encryption bits of such vmas (COW pages excepted). The
 * core vm ensures this by modifying these page table entries only through
 * functions that leave caching- and encryption bits alone, using
 * pte_modify() where needed. (See for example mprotect()).
 *
 * Likewise, new page-table entries are created only through the fault()
 * callback and never from the value of vma->vm_page_prot, except for
 * entries that point to anonymous pages as the result of COW.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn, pgprot_t pgprot)
{
        const unsigned long map_kind =
                vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP);
        const pfn_t dev_pfn = __pfn_to_pfn_t(pfn, PFN_DEV);

        /*
         * Strictly speaking, architectures with pte_special could do without
         * these restrictions (the same goes for remap_pfn_range).  They are
         * kept for everybody so that testing and features stay consistent
         * across architectures.
         *
         * The vma must be exactly one of PFNMAP or MIXEDMAP; a PFNMAP vma
         * must not be COW; a MIXEDMAP vma must not be given a valid pfn here.
         */
        BUG_ON(!map_kind);
        BUG_ON(map_kind == (VM_PFNMAP|VM_MIXEDMAP));
        BUG_ON((map_kind & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
        BUG_ON((map_kind & VM_MIXEDMAP) && pfn_valid(pfn));

        if (addr < vma->vm_start || addr >= vma->vm_end ||
            !pfn_modify_allowed(pfn, pgprot))
                return VM_FAULT_SIGBUS;

        track_pfn_insert(vma, &pgprot, dev_pfn);

        return insert_pfn(vma, addr, dev_pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);

/**
 * vmf_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This is vmf_insert_pfn_prot() called with @vma->vm_page_prot as the
 * page protection.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return the result of this function.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn)
{
        return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vmf_insert_pfn);

/*
 * Decide whether @pfn may be inserted into @vma through the mixed-map path.
 *
 * The conditions mirror the abort conditions in vm_normal_page: the vma is
 * VM_MIXEDMAP, or the pfn is devmap, special, or the zero pfn.
 */
static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
        return (vma->vm_flags & VM_MIXEDMAP) ||
               pfn_t_devmap(pfn) ||
               pfn_t_special(pfn) ||
               is_zero_pfn(pfn_t_to_pfn(pfn));
}

/*
 * Common implementation of vmf_insert_mixed() and
 * vmf_insert_mixed_mkwrite().
 *
 * Inserts @pfn at @addr either as a raw pfn (insert_pfn()) or as a
 * refcounted page (insert_page()), depending on the configuration and the
 * pfn. @mkwrite is only forwarded on the insert_pfn() path.
 *
 * Returns VM_FAULT_SIGBUS if @addr is outside @vma or the pfn may not be
 * mapped with the resulting protection, otherwise the insert_pfn() result,
 * or for the insert_page() path: VM_FAULT_OOM on -ENOMEM, VM_FAULT_SIGBUS
 * on any other error except -EBUSY, and VM_FAULT_NOPAGE otherwise.
 */
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn, bool mkwrite)
{
        pgprot_t pgprot = vma->vm_page_prot;
        int err;

        BUG_ON(!vm_mixed_ok(vma, pfn));

        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        track_pfn_insert(vma, &pgprot, pfn);

        if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
                return VM_FAULT_SIGBUS;

        /*
         * Without pte special we have to fall back on the pfn_valid() based
         * VM_MIXEDMAP scheme (see vm_normal_page), which means the page
         * *must* be refcounted whenever pfn_valid is true (hence insert_page
         * rather than insert_pfn).  A zero_pfn inserted into a VM_MIXEDMAP
         * without pte special would be refcounted there as a normal page.
         *
         * In every other case a raw pfn mapping is used.
         */
        if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) ||
            pfn_t_devmap(pfn) || !pfn_t_valid(pfn))
                return insert_pfn(vma, addr, pfn, pgprot, mkwrite);

        /*
         * From here on we are committed to insert_page(), regardless of
         * whether the caller specified flags that result in
         * pfn_t_has_page() == false.
         */
        err = insert_page(vma, addr, pfn_to_page(pfn_t_to_pfn(pfn)), pgprot);

        if (err == -ENOMEM)
                return VM_FAULT_OOM;
        /* An already-populated PTE (-EBUSY) is treated like success. */
        if (err >= 0 || err == -EBUSY)
                return VM_FAULT_NOPAGE;

        return VM_FAULT_SIGBUS;
}

/*
 * Insert @pfn at @addr in @vma through __vm_insert_mixed() with mkwrite
 * disabled, and return its vm_fault_t result.
 */
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                pfn_t pfn)
{
        return __vm_insert_mixed(vma, addr, pfn, false);
}
EXPORT_SYMBOL(vmf_insert_mixed);

/*
 *  Same as vmf_insert_mixed(), but calls __vm_insert_mixed() with mkwrite
 *  enabled.
 *
 *  If the insertion of PTE failed because someone else already added a
 *  different entry in the mean time, we treat that as success as we assume
 *  the same entry was actually inserted.
 */
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn)
{
        return __vm_insert_mixed(vma, addr, pfn, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);

/*
 * Maps a range of physical memory, starting at @pfn, into the PTEs that
 * cover [addr, end) under @pmd. The PTE table is allocated if necessary.
 *
 * Every target PTE must be empty on entry: a populated entry triggers
 * BUG_ON(), existing mappings are not replaced here. Each new entry is
 * marked special.
 *
 * Returns 0 on success, -ENOMEM if the PTE table cannot be allocated or
 * mapped, or -EACCES as soon as pfn_modify_allowed() rejects a pfn; entries
 * written before that point are left in place.
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        pte_t *pte, *mapped_pte;
        spinlock_t *ptl;
        int err = 0;

        /* Keep the first PTE pointer: pte is advanced by the loop below. */
        mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
                return -ENOMEM;
        arch_enter_lazy_mmu_mode();
        do {
                BUG_ON(!pte_none(ptep_get(pte)));
                if (!pfn_modify_allowed(pfn, prot)) {
                        err = -EACCES;
                        break;
                }
                set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
                pfn++;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(mapped_pte, ptl);
        return err;
}

/*
 * Remap [addr, end) under @pud, one PMD entry at a time, allocating the
 * PMD table if necessary. @pfn is the frame that backs @addr.
 *
 * Returns 0 on success, -ENOMEM if the PMD table cannot be allocated, or
 * the first error returned by remap_pte_range().
 */
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        /*
         * Constant offset between a pfn and its virtual page number in this
         * linear mapping: the pfn for any address in the range is
         * pfn_delta + (address >> PAGE_SHIFT). Unsigned wrap-around cancels.
         */
        const unsigned long pfn_delta = pfn - (addr >> PAGE_SHIFT);
        unsigned long next;
        pmd_t *pmd;
        int err;

        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
                return -ENOMEM;
        VM_BUG_ON(pmd_trans_huge(*pmd));

        do {
                next = pmd_addr_end(addr, end);
                err = remap_pte_range(mm, pmd, addr, next,
                                pfn_delta + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;
                pmd++;
                addr = next;
        } while (addr != end);

        return 0;
}

/*
 * Remap [addr, end) under @p4d, one PUD entry at a time, allocating the
 * PUD table if necessary. @pfn is the frame that backs @addr.
 *
 * Returns 0 on success, -ENOMEM if the PUD table cannot be allocated, or
 * the first error returned by remap_pmd_range().
 */
static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        /*
         * Constant offset between a pfn and its virtual page number in this
         * linear mapping: the pfn for any address in the range is
         * pfn_delta + (address >> PAGE_SHIFT). Unsigned wrap-around cancels.
         */
        const unsigned long pfn_delta = pfn - (addr >> PAGE_SHIFT);
        unsigned long next;
        pud_t *pud;
        int err;

        pud = pud_alloc(mm, p4d, addr);
        if (!pud)
                return -ENOMEM;

        do {
                next = pud_addr_end(addr, end);
                err = remap_pmd_range(mm, pud, addr, next,
                                pfn_delta + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;
                pud++;
                addr = next;
        } while (addr != end);

        return 0;
}

/*
 * Remap [addr, end) under @pgd, one P4D entry at a time, allocating the
 * P4D table if necessary. @pfn is the frame that backs @addr.
 *
 * Returns 0 on success, -ENOMEM if the P4D table cannot be allocated, or
 * the first error returned by remap_pud_range().
 */
static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        /*
         * Constant offset between a pfn and its virtual page number in this
         * linear mapping: the pfn for any address in the range is
         * pfn_delta + (address >> PAGE_SHIFT). Unsigned wrap-around cancels.
         */
        const unsigned long pfn_delta = pfn - (addr >> PAGE_SHIFT);
        unsigned long next;
        p4d_t *p4d;
        int err;

        p4d = p4d_alloc(mm, pgd, addr);
        if (!p4d)
                return -ENOMEM;

        do {
                next = p4d_addr_end(addr, end);
                err = remap_pud_range(mm, p4d, addr, next,
                                pfn_delta + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;
                p4d++;
                addr = next;
        } while (addr != end);

        return 0;
}

/*
 * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
 * must have pre-validated the caching bits of the pgprot_t.
 *
 * Maps PAGE_ALIGN(@size) bytes starting at the page-aligned @addr to the
 * physical frames starting at @pfn, and marks the vma
 * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP.
 *
 * Returns 0 on success, -EINVAL if @addr is not page aligned or if a COW
 * mapping does not cover the whole vma, or the first error from the
 * page-table walk. On such an error the walk stops immediately: entries
 * installed before the failure are not removed here.
 */
int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
                unsigned long pfn, unsigned long size, pgprot_t prot)
{
        pgd_t *pgd;
        unsigned long next;
        unsigned long end = addr + PAGE_ALIGN(size);
        struct mm_struct *mm = vma->vm_mm;
        int err;

        if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
                return -EINVAL;

        /*
         * Physically remapped pages are special. Tell the
         * rest of the world about it:
         *   VM_IO tells people not to look at these pages
         *        (accesses can have side effects).
         *   VM_PFNMAP tells the core MM that the base pages are just
         *        raw PFN mappings, and do not have a "struct page" associated
         *        with them.
         *   VM_DONTEXPAND
         *      Disable vma merging and expanding with mremap().
         *   VM_DONTDUMP
         *      Omit vma from core dump, even when VM_IO turned off.
         *
         * There's a horrible special case to handle copy-on-write
         * behaviour that some programs depend on. We mark the "original"
         * un-COW'ed pages by matching them up with "vma->vm_pgoff".
         * See vm_normal_page() for details.
         */
        if (is_cow_mapping(vma->vm_flags)) {
                if (addr != vma->vm_start || end != vma->vm_end)
                        return -EINVAL;
                vma->vm_pgoff = pfn;
        }

        vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);

        BUG_ON(addr >= end);
        /*
         * Bias pfn by the starting virtual page number so that the pfn for
         * any address in the range is pfn + (address >> PAGE_SHIFT).
         */
        pfn -= addr >> PAGE_SHIFT;
        pgd = pgd_offset(mm, addr);
        flush_cache_range(vma, addr, end);
        do {
                next = pgd_addr_end(addr, end);
                err = remap_p4d_range(mm, pgd, addr, next,
                                pfn + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;
        } while (pgd++, addr = next, addr != end);

        return 0;
}

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target page aligned user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of mapping area
 * @prot: page protection flags for this mapping
 *
 * Registers the range with track_pfn_remap() and then maps it with
 * remap_pfn_range_notrack(). If the mapping step fails, the tracking is
 * dropped again via untrack_pfn().
 *
 * Note: this is only safe if the mm semaphore is held when called.
 *
 * Return: %0 on success, negative error code otherwise. Any failure of
 * track_pfn_remap() is reported as -EINVAL.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
                    unsigned long pfn, unsigned long size, pgprot_t prot)
{
        const unsigned long aligned_size = PAGE_ALIGN(size);
        int ret;

        if (track_pfn_remap(vma, &prot, pfn, addr, aligned_size))
                return -EINVAL;

        ret = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
        if (ret)
                untrack_pfn(vma, pfn, aligned_size, true);

        return ret;
}
EXPORT_SYMBOL(remap_pfn_range);

/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * A simplified io_remap_pfn_range() for common driver use: the driver
 * only supplies the physical memory range, and everything else is derived
 * from the vma. The mapping covers the whole vma and begins
 * vma->vm_pgoff pages into the physical area.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 *
 * Return: %0 on success, negative error code otherwise. -EINVAL is
 * returned when the physical range wraps around, when vm_pgoff lies beyond
 * the area, or when the vma is larger than what remains of the area.
 */
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
        const unsigned long pgoff = vma->vm_pgoff;
        const unsigned long vma_size = vma->vm_end - vma->vm_start;
        unsigned long first_pfn, nr_pages;

        /* The physical range itself must not wrap. */
        if (start + len < start)
                return -EINVAL;

        /*
         * Mapping things that aren't page-aligned is a *really* bad idea,
         * but it has historically been allowed because IO memory might
         * just have smaller alignment. Account for the leading partial
         * page and round the length up to whole pages.
         */
        len += start & ~PAGE_MASK;
        first_pfn = start >> PAGE_SHIFT;
        nr_pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
        if (first_pfn + nr_pages < first_pfn)
                return -EINVAL;

        /* The mapping starts 'vm_pgoff' pages into the area. */
        if (pgoff > nr_pages)
                return -EINVAL;
        first_pfn += pgoff;
        nr_pages -= pgoff;

        /* The whole vma has to fit into what is left of the area. */
        if ((vma_size >> PAGE_SHIFT) > nr_pages)
                return -EINVAL;

        return io_remap_pfn_range(vma, vma->vm_start, first_pfn, vma_size, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);

/*
 * Call @fn on the PTEs covering [addr, end) under @pmd.
 *
 * With @create set the PTE table is allocated if necessary and @fn is
 * called for every slot. Without @create the existing table is used and
 * slots that are pte_none() are skipped. For &init_mm the kernel PTE
 * accessors are used and no PTE lock is taken; for any other mm the table
 * is mapped and locked for the duration of the walk.
 *
 * PGTBL_PTE_MODIFIED is always ORed into *@mask.
 *
 * Returns 0 on success, -ENOMEM if the table cannot be allocated (@create),
 * -EINVAL if it cannot be found (!@create), or the first non-zero value
 * returned by @fn, which stops the walk.
 */
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        pte_t *pte, *mapped_pte;
        int err = 0;
        spinlock_t *ptl;

        if (create) {
                mapped_pte = pte = (mm == &init_mm) ?
                        pte_alloc_kernel_track(pmd, addr, mask) :
                        pte_alloc_map_lock(mm, pmd, addr, &ptl);
                if (!pte)
                        return -ENOMEM;
        } else {
                mapped_pte = pte = (mm == &init_mm) ?
                        pte_offset_kernel(pmd, addr) :
                        pte_offset_map_lock(mm, pmd, addr, &ptl);
                if (!pte)
                        return -EINVAL;
        }

        arch_enter_lazy_mmu_mode();

        if (fn) {
                do {
                        if (create || !pte_none(ptep_get(pte))) {
                                err = fn(pte, addr, data);
                                if (err)
                                        break;
                        }
                        /*
                         * Advance pte together with addr on every iteration,
                         * including skipped pte_none() slots; otherwise the
                         * pointer falls behind the address after the first
                         * empty entry.
                         */
                } while (pte++, addr += PAGE_SIZE, addr != end);
        }
        *mask |= PGTBL_PTE_MODIFIED;

        arch_leave_lazy_mmu_mode();

        /* ptl is only initialised (and held) for non-kernel mms. */
        if (mm != &init_mm)
                pte_unmap_unlock(mapped_pte, ptl);
        return err;
}

/*
 * Walk the PMD entries covering [addr, end) under @pud and hand each one to
 * apply_to_pte_range().
 *
 * With @create set the PMD table is allocated if necessary; without it,
 * empty PMD entries are skipped. A leaf PMD entry triggers a one-time
 * warning and -EINVAL. A bad PMD entry triggers a one-time warning and is
 * skipped, or cleared and repopulated when @create is set.
 *
 * Returns 0 on success, -ENOMEM if the PMD table cannot be allocated,
 * -EINVAL on a leaf entry, or the first error from apply_to_pte_range().
 */
static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        pmd_t *pmd;
        unsigned long next;
        int err = 0;

        BUG_ON(pud_leaf(*pud));

        if (create) {
                pmd = pmd_alloc_track(mm, pud, addr, mask);
                if (!pmd)
                        return -ENOMEM;
        } else {
                pmd = pmd_offset(pud, addr);
        }
        do {
                next = pmd_addr_end(addr, end);
                /* 'continue' still runs the while clause, advancing pmd and addr. */
                if (pmd_none(*pmd) && !create)
                        continue;
                if (WARN_ON_ONCE(pmd_leaf(*pmd)))
                        return -EINVAL;
                if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
                        if (!create)
                                continue;
                        pmd_clear_bad(pmd);
                }
                err = apply_to_pte_range(mm, pmd, addr, next,
                                         fn, data, create, mask);
                if (err)
                        break;
        } while (pmd++, addr = next, addr != end);

        return err;
}

/*
 * Walk the PUD entries covering [addr, end) under @p4d and hand each one to
 * apply_to_pmd_range().
 *
 * With @create set the PUD table is allocated if necessary; without it,
 * empty PUD entries are skipped. A leaf PUD entry triggers a one-time
 * warning and -EINVAL. A bad PUD entry triggers a one-time warning and is
 * skipped, or cleared and repopulated when @create is set.
 *
 * Returns 0 on success, -ENOMEM if the PUD table cannot be allocated,
 * -EINVAL on a leaf entry, or the first error from apply_to_pmd_range().
 */
static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        pud_t *pud;
        unsigned long next;
        int err = 0;

        if (create) {
                pud = pud_alloc_track(mm, p4d, addr, mask);
                if (!pud)
                        return -ENOMEM;
        } else {
                pud = pud_offset(p4d, addr);
        }
        do {
                next = pud_addr_end(addr, end);
                /* 'continue' still runs the while clause, advancing pud and addr. */
                if (pud_none(*pud) && !create)
                        continue;
                if (WARN_ON_ONCE(pud_leaf(*pud)))
                        return -EINVAL;
                if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
                        if (!create)
                                continue;
                        pud_clear_bad(pud);
                }
                err = apply_to_pmd_range(mm, pud, addr, next,
                                         fn, data, create, mask);
                if (err)
                        break;
        } while (pud++, addr = next, addr != end);

        return err;
}

/*
 * Walk the P4D entries covering [addr, end) under @pgd and hand each one to
 * apply_to_pud_range().
 *
 * With @create set the P4D table is allocated if necessary; without it,
 * empty P4D entries are skipped. A leaf P4D entry triggers a one-time
 * warning and -EINVAL. A bad P4D entry triggers a one-time warning and is
 * skipped, or cleared and repopulated when @create is set.
 *
 * Returns 0 on success, -ENOMEM if the P4D table cannot be allocated,
 * -EINVAL on a leaf entry, or the first error from apply_to_pud_range().
 */
static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        p4d_t *p4d;
        unsigned long next;
        int err = 0;

        if (create) {
                p4d = p4d_alloc_track(mm, pgd, addr, mask);
                if (!p4d)
                        return -ENOMEM;
        } else {
                p4d = p4d_offset(pgd, addr);
        }
        do {
                next = p4d_addr_end(addr, end);
                /* 'continue' still runs the while clause, advancing p4d and addr. */
                if (p4d_none(*p4d) && !create)
                        continue;
                if (WARN_ON_ONCE(p4d_leaf(*p4d)))
                        return -EINVAL;
                if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
                        if (!create)
                                continue;
                        p4d_clear_bad(p4d);
                }
                err = apply_to_pud_range(mm, p4d, addr, next,
                                         fn, data, create, mask);
                if (err)
                        break;
        } while (p4d++, addr = next, addr != end);

        return err;
}

/*
 * Common implementation of apply_to_page_range() and
 * apply_to_existing_page_range().
 *
 * Walks the page tables of @mm over [addr, addr + size) from the PGD level
 * down, calling @fn (with @data) at the PTE level. With @create set, missing
 * page tables are allocated on the way down; otherwise empty entries are
 * skipped.
 *
 * If the modification mask collected during the walk intersects
 * ARCH_PAGE_TABLE_SYNC_MASK, arch_sync_kernel_mappings() is called for the
 * whole range after the walk. Note that the early -EINVAL return for a leaf
 * PGD entry bypasses that call.
 *
 * Returns 0 on success, -EINVAL if the range is empty or wraps or a leaf
 * PGD entry is found, or the first error from the lower levels.
 */
static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
                                 unsigned long size, pte_fn_t fn,
                                 void *data, bool create)
{
        pgd_t *pgd;
        unsigned long start = addr, next;
        unsigned long end = addr + size;
        pgtbl_mod_mask mask = 0;
        int err = 0;

        if (WARN_ON(addr >= end))
                return -EINVAL;

        pgd = pgd_offset(mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                /* 'continue' still runs the while clause, advancing pgd and addr. */
                if (pgd_none(*pgd) && !create)
                        continue;
                if (WARN_ON_ONCE(pgd_leaf(*pgd)))
                        return -EINVAL;
                if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
                        if (!create)
                                continue;
                        pgd_clear_bad(pgd);
                }
                err = apply_to_p4d_range(mm, pgd, addr, next,
                                         fn, data, create, &mask);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, start + size);

        return err;
}

/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table entry.
 *
 * @mm is the address space to walk, [@addr, @addr + @size) the region, and
 * @fn the callback, which receives @data. Returns 0 on success or the error
 * from __apply_to_page_range().
 */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
                        unsigned long size, pte_fn_t fn, void *data)
{
        return __apply_to_page_range(mm, addr, size, fn, data, true);
}
EXPORT_SYMBOL_GPL(apply_to_page_range);

/*
 * Scan a region of virtual memory, calling a provided function on
 * leaf page table entries where page tables exist.
 *
 * Unlike apply_to_page_range, this does _not_ fill in page tables
 * where they are absent, and entries that are pte_none() are not
 * passed to @fn.
 *
 * Returns 0 on success or the error from __apply_to_page_range().
 */
int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
                                 unsigned long size, pte_fn_t fn, void *data)
{
        return __apply_to_page_range(mm, addr, size, fn, data, false);
}
EXPORT_SYMBOL_GPL(apply_to_existing_page_range);

/*
 * handle_pte_fault picks a page fault handler based on an entry that was
 * read non-atomically.  On architectures or configurations where such a
 * read can yield a mix of unmatched parts (e.g. i386 with PAE),
 * do_swap_page must re-check the entry under the lock before unmapping the
 * pte and committing to anything (do_wp_page is only called after such a
 * check has already been made, and do_anonymous_page can safely check later
 * on).
 *
 * Unmaps vmf->pte and clears the pointer. Returns non-zero if the entry
 * still equals vmf->orig_pte; the comparison is only performed when pte_t
 * is wider than unsigned long on SMP or preemptible kernels, otherwise 1 is
 * returned.
 */
static inline int pte_unmap_same(struct vm_fault *vmf)
{
        int unchanged = 1;

#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
        if (sizeof(pte_t) > sizeof(unsigned long)) {
                spinlock_t *lock = vmf->ptl;

                spin_lock(lock);
                unchanged = pte_same(ptep_get(vmf->pte), vmf->orig_pte);
                spin_unlock(lock);
        }
#endif
        pte_unmap(vmf->pte);
        vmf->pte = NULL;

        return unchanged;
}

/*
 * Fill @dst with the contents of the page being copied on a write-protect
 * fault.
 *
 * @dst: destination page.
 * @src: source page, or NULL when the source has no "struct page" (PFN
 *       mapping); the data is then read through the faulting user address.
 * @vmf: fault descriptor; supplies the vma, address, pmd and orig_pte.
 *       vmf->pte and vmf->ptl are used here to take the PTE lock.
 *
 * Return:
 *        0:                copy succeeded (or @dst was zero-filled as a fallback)
 *        -EHWPOISON:        copy failed due to hwpoison in source page
 *        -EAGAIN:        nothing was copied because the PTE could not be mapped
 *                        or no longer matches vmf->orig_pte
 */
static inline int __wp_page_copy_user(struct page *dst, struct page *src,
                                      struct vm_fault *vmf)
{
        int ret;
        void *kaddr;
        void __user *uaddr;
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long addr = vmf->address;

        /* Common case: a real source page, copied page to page. */
        if (likely(src)) {
                if (copy_mc_user_highpage(dst, src, addr, vma)) {
                        memory_failure_queue(page_to_pfn(src), 0);
                        return -EHWPOISON;
                }
                return 0;
        }

        /*
         * If the source page was a PFN mapping, we don't have
         * a "struct page" for it. We do a best-effort copy by
         * just copying from the original user address. If that
         * fails, we just zero-fill it. Live with it.
         */
        kaddr = kmap_local_page(dst);
        pagefault_disable();
        uaddr = (void __user *)(addr & PAGE_MASK);

        /*
         * On architectures with software "accessed" bits, we would
         * take a double page fault, so mark it accessed here.
         */
        /* From here on, a non-NULL vmf->pte means the PTE lock is held. */
        vmf->pte = NULL;
        if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
                pte_t entry;

                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
                if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                        /*
                         * Other thread has already handled the fault
                         * and update local tlb only
                         */
                        if (vmf->pte)
                                update_mmu_tlb(vma, addr, vmf->pte);
                        ret = -EAGAIN;
                        goto pte_unlock;
                }

                entry = pte_mkyoung(vmf->orig_pte);
                if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
                        update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1);
        }

        /*
         * This really shouldn't fail, because the page is there
         * in the page tables. But it might just be unreadable,
         * in which case we just give up and fill the result with
         * zeroes.
         */
        if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
                /*
                 * The PTE lock is already held and the PTE was validated
                 * above, so there is nothing to re-check: warn and zero-fill.
                 */
                if (vmf->pte)
                        goto warn;

                /* Re-validate under PTL if the page is still mapped */
                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
                if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                        /* The PTE changed under us, update local tlb */
                        if (vmf->pte)
                                update_mmu_tlb(vma, addr, vmf->pte);
                        ret = -EAGAIN;
                        goto pte_unlock;
                }

                /*
                 * The same page can be mapped back since last copy attempt.
                 * Try to copy again under PTL.
                 */
                if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
                        /*
                         * Warn in case there is some obscure use-case
                         * that ends up here.
                         */
warn:
                        WARN_ON_ONCE(1);
                        clear_page(kaddr);
                }
        }

        ret = 0;

        /* Common exit: undo the PTE lock (if taken), pagefault_disable and kmap. */
pte_unlock:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        pagefault_enable();
        kunmap_local(kaddr);
        flush_dcache_page(dst);

        return ret;
}

/*
 * Pick the gfp mask to use while handling a fault on @vma: the gfp mask of
 * the backing file's mapping with __GFP_FS and __GFP_IO added, or plain
 * GFP_KERNEL when the vma has no file.
 */
static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
{
        struct file *backing = vma->vm_file;

        /*
         * Special mappings (e.g. VDSO) do not have any file so fake
         * a default GFP_KERNEL for them.
         */
        if (!backing)
                return GFP_KERNEL;

        return mapping_gfp_mask(backing->f_mapping) | __GFP_FS | __GFP_IO;
}

/*
 * Notify the address space that the page is about to become writable so that
 * it can prohibit this or wait for the page to get into an appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 *
 * @vmf:   fault descriptor.  vmf->flags is replaced by
 *         FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE only for the duration of the
 *         ->page_mkwrite() call and holds its original value on every return.
 * @folio: folio of the page that is about to be made writable.
 *
 * Return: VM_FAULT_SIGBUS if the vma maps a swapfile; the ->page_mkwrite()
 * result unchanged if it carries VM_FAULT_ERROR or VM_FAULT_NOPAGE bits;
 * 0 if the handler did not lock the folio and the folio turned out to have
 * no mapping once locked here (retry); otherwise the handler result with
 * VM_FAULT_LOCKED set, the folio being locked.
 */
static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio)
{
        vm_fault_t ret;
        unsigned int old_flags = vmf->flags;

        /*
         * Reject swapfiles before vmf->flags is overridden, so that this
         * early return hands the descriptor back with its flags intact.
         */
        if (vmf->vma->vm_file &&
            IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
                return VM_FAULT_SIGBUS;

        vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;

        ret = vmf->vma->vm_ops->page_mkwrite(vmf);
        /* Restore original flags so that caller is not surprised */
        vmf->flags = old_flags;
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
                return ret;
        if (unlikely(!(ret & VM_FAULT_LOCKED))) {
                /* The handler left the folio unlocked: lock and re-validate. */
                folio_lock(folio);
                if (!folio->mapping) {
                        folio_unlock(folio);
                        return 0; /* retry */
                }
                ret |= VM_FAULT_LOCKED;
        } else {
                VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        }
        return ret;
}

/*
 * Dirty the folio behind vmf->page after a write fault on a shared file
 * mapping, then throttle the dirtying task if appropriate.
 *
 * The folio must be locked on entry; it is unlocked here.
 *
 * Returns VM_FAULT_COMPLETED when maybe_unlock_mmap_for_io() handed back a
 * pinned file (which is released here), 0 otherwise.
 */
static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = page_folio(vmf->page);
        const bool has_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
        struct address_space *mapping;
        struct file *fpin;
        bool dirtied;

        dirtied = folio_mark_dirty(folio);
        VM_BUG_ON_FOLIO(folio_test_anon(folio), folio);
        /*
         * Snapshot the address_space now: truncate may zero folio.mapping
         * once the folio is unlocked.  The address_space itself stays pinned
         * by vma->vm_file's reference.  folio_unlock()'s release semantics
         * are relied upon to keep the compiler from undoing this copy.
         */
        mapping = folio_raw_mapping(folio);
        folio_unlock(folio);

        if (!has_mkwrite)
                file_update_time(vma->vm_file);

        /*
         * Throttle page dirtying rate down to writeback speed.
         *
         * mapping may be NULL here because some device drivers do not
         * set page.mapping but still dirty their pages.
         */
        if (!mapping || (!dirtied && !has_mkwrite))
                return 0;

        /*
         * Drop the mmap_lock before waiting on IO, if we can.  The file
         * is pinning the mapping, as per above.
         */
        fpin = maybe_unlock_mmap_for_io(vmf, NULL);
        balance_dirty_pages_ratelimited(mapping);
        if (!fpin)
                return 0;

        fput(fpin);
        return VM_FAULT_COMPLETED;
}

/*
 * Resolve a write fault by making the already-mapped page writable in place.
 *
 * This is used either because the mapping is VM_SHARED or because we hold
 * the last reference to the page.  Either way nothing is copied: the pte is
 * marked young, dirty and (if the vma allows) writable, and the related
 * book-keeping is updated.
 *
 * @folio may be NULL.  The pte is unmapped and vmf->ptl released on return.
 */
static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
        __releases(vmf->ptl)
{
        struct vm_area_struct *vma = vmf->vma;
        const unsigned long address = vmf->address;
        pte_t new_pte;

        VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));

        if (folio) {
                VM_BUG_ON(folio_test_anon(folio) &&
                          !PageAnonExclusive(vmf->page));
                /*
                 * The folio's cpupid information potentially belongs to a
                 * now completely unrelated process, so reset it.
                 */
                folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1);
        }

        flush_cache_page(vma, address, pte_pfn(vmf->orig_pte));

        new_pte = pte_mkdirty(pte_mkyoung(vmf->orig_pte));
        new_pte = maybe_mkwrite(new_pte, vma);
        if (ptep_set_access_flags(vma, address, vmf->pte, new_pte, 1))
                update_mmu_cache_range(vmf, vma, address, vmf->pte, 1);

        pte_unmap_unlock(vmf->pte, vmf->ptl);
        count_vm_event(PGREUSE);
}

/*
 * Decide whether a vm_ops callback may be invoked for this fault.
 *
 * There is no dedicated bitflag for this (one could be added); for now we
 * rely on the fact that all vm_ops providing ->map_pages have been audited
 * and don't need the mmap_lock to be held.
 *
 * Returns 0 if the call may go ahead.  Otherwise vma_end_read() is called
 * on the vma and VM_FAULT_RETRY is returned.
 */
static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;

        if (vma->vm_ops->map_pages)
                return 0;
        if (!(vmf->flags & FAULT_FLAG_VMA_LOCK))
                return 0;

        vma_end_read(vma);
        return VM_FAULT_RETRY;
}

/**
 * vmf_anon_prepare - Prepare to handle an anonymous fault.
 * @vmf: The vm_fault descriptor passed from the fault handler.
 *
 * When preparing to insert an anonymous page into a VMA from a
 * fault handler, call this function rather than anon_vma_prepare().
 * If this vma does not already have an associated anon_vma and we are
 * only protected by the per-VMA lock, the caller must retry with the
 * mmap_lock held.  __anon_vma_prepare() will look at adjacent VMAs to
 * determine if this VMA can share its anon_vma, and that's not safe to
 * do with only the per-VMA lock held for this VMA.
 *
 * Return: 0 if fault handling can proceed.  Any other value should be
 * returned to the caller.
 */
vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret = 0;

        if (likely(vma->anon_vma))
                return 0;
        if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
                if (!mmap_read_trylock(vma->vm_mm)) {
                        vma_end_read(vma);
                        return VM_FAULT_RETRY;
                }
        }
        if (__anon_vma_prepare(vma))
                ret = VM_FAULT_OOM;
        if (vmf->flags & FAULT_FLAG_VMA_LOCK)
                mmap_read_unlock(vma->vm_mm);
        return ret;
}

/*
 * Handle the case of a page which we actually need to copy to a new page,
 * either due to COW or unsharing.
 *
 * Called with mmap_lock locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 *
 * Return: 0 when the copy was installed, when the pte changed under us, or
 * when the copy failed for a reason other than hwpoison (the fault is then
 * simply retried); VM_FAULT_HWPOISON when the copy hit a poisoned source
 * page; VM_FAULT_OOM when no new folio could be allocated; otherwise the
 * non-zero result of vmf_anon_prepare().
 */
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        struct folio *old_folio = NULL;
        struct folio *new_folio = NULL;
        pte_t entry;
        int page_copied = 0;
        struct mmu_notifier_range range;
        vm_fault_t ret;
        bool pfn_is_zero;

        delayacct_wpcopy_start();

        /* vmf->page may be NULL, in which case there is no old folio. */
        if (vmf->page)
                old_folio = page_folio(vmf->page);
        ret = vmf_anon_prepare(vmf);
        if (unlikely(ret))
                goto out;

        pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte));
        new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero);
        if (!new_folio)
                goto oom;

        /* The copy is skipped entirely when the old pte maps the zero pfn. */
        if (!pfn_is_zero) {
                int err;

                err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
                if (err) {
                        /*
                         * COW failed, if the fault was solved by other,
                         * it's fine. If not, userspace would re-fault on
                         * the same address and we will handle the fault
                         * from the second attempt.
                         * The -EHWPOISON case will not be retried.
                         */
                        folio_put(new_folio);
                        if (old_folio)
                                folio_put(old_folio);

                        delayacct_wpcopy_end();
                        return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
                }
                kmsan_copy_page_meta(&new_folio->page, vmf->page);
        }

        __folio_mark_uptodate(new_folio);

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                vmf->address & PAGE_MASK,
                                (vmf->address & PAGE_MASK) + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        /*
         * Re-check the pte - we dropped the lock
         */
        vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
        if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                /* Account the new page as anonymous memory. */
                if (old_folio) {
                        if (!folio_test_anon(old_folio)) {
                                dec_mm_counter(mm, mm_counter_file(old_folio));
                                inc_mm_counter(mm, MM_ANONPAGES);
                        }
                } else {
                        ksm_might_unmap_zero_page(mm, vmf->orig_pte);
                        inc_mm_counter(mm, MM_ANONPAGES);
                }
                flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
                entry = mk_pte(&new_folio->page, vma->vm_page_prot);
                entry = pte_sw_mkyoung(entry);
                if (unlikely(unshare)) {
                        /* Unsharing: carry over the soft-dirty and uffd-wp bits. */
                        if (pte_soft_dirty(vmf->orig_pte))
                                entry = pte_mksoft_dirty(entry);
                        if (pte_uffd_wp(vmf->orig_pte))
                                entry = pte_mkuffd_wp(entry);
                } else {
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                }

                /*
                 * Clear the pte entry and flush it first, before updating the
                 * pte with the new entry, to keep TLBs on different CPUs in
                 * sync. This code used to set the new PTE then flush TLBs, but
                 * that left a window where the new PTE could be loaded into
                 * some TLBs while the old PTE remains in others.
                 */
                ptep_clear_flush(vma, vmf->address, vmf->pte);
                folio_add_new_anon_rmap(new_folio, vma, vmf->address);
                folio_add_lru_vma(new_folio, vma);
                BUG_ON(unshare && pte_write(entry));
                set_pte_at(mm, vmf->address, vmf->pte, entry);
                update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
                if (old_folio) {
                        /*
                         * Only after switching the pte to the new page may
                         * we remove the mapcount here. Otherwise another
                         * process may come and find the rmap count decremented
                         * before the pte is switched to the new page, and
                         * "reuse" the old page writing into it while our pte
                         * here still points into it and can be read by other
                         * threads.
                         *
                         * The critical issue is to order this
                         * folio_remove_rmap_pte() with the ptep_clear_flush
                         * above. Those stores are ordered by (if nothing else,)
                         * the barrier present in the atomic_add_negative
                         * in folio_remove_rmap_pte();
                         *
                         * Then the TLB flush in ptep_clear_flush ensures that
                         * no process can access the old page before the
                         * decremented mapcount is visible. And the old page
                         * cannot be reused until after the decremented
                         * mapcount is visible. So transitively, TLBs to
                         * old page will be flushed before it can be reused.
                         */
                        folio_remove_rmap_pte(old_folio, vmf->page, vma);
                }

                /*
                 * Free the old page: new_folio now aliases old_folio (which
                 * may be NULL), so the folio_put(new_folio) below acts on the
                 * old folio rather than on the one just installed.
                 */
                new_folio = old_folio;
                page_copied = 1;
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        } else if (vmf->pte) {
                /* The pte changed under us: keep it and discard our copy below. */
                update_mmu_tlb(vma, vmf->address, vmf->pte);
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        }

        mmu_notifier_invalidate_range_end(&range);

        if (new_folio)
                folio_put(new_folio);
        if (old_folio) {
                if (page_copied)
                        free_swap_cache(old_folio);
                folio_put(old_folio);
        }

        delayacct_wpcopy_end();
        return 0;
oom:
        ret = VM_FAULT_OOM;
out:
        if (old_folio)
                folio_put(old_folio);

        delayacct_wpcopy_end();
        return ret;
}

/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *                          writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 * @folio: the folio of vmf->page
 *
 * This function handles all that is needed to finish a write page fault in a
 * shared mapping due to PTE being read-only once the mapped page is prepared.
 * It handles locking of PTE and modifying it.
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
 *
 * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
 * we acquired PTE lock.
 */
static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio)
{
        WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
        vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
                                       &vmf->ptl);
        if (!vmf->pte)
                return VM_FAULT_NOPAGE;
        /*
         * We might have raced with another page fault while we released the
         * pte_offset_map_lock.
         */
        if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
                update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return VM_FAULT_NOPAGE;
        }
        wp_page_reuse(vmf, folio);
        return 0;
}

/*
 * Write fault on a VM_SHARED mapping of a VM_MIXEDMAP or VM_PFNMAP vma.
 *
 * Without a ->pfn_mkwrite handler the pte is simply made writable.  With
 * one, the PTE lock is dropped, the handler is called with
 * FAULT_FLAG_MKWRITE set, and the pte is re-validated and made writable by
 * finish_mkwrite_fault().
 */
static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;

        if (!vma->vm_ops || !vma->vm_ops->pfn_mkwrite) {
                wp_page_reuse(vmf, NULL);
                return 0;
        }

        pte_unmap_unlock(vmf->pte, vmf->ptl);

        ret = vmf_can_call_fault(vmf);
        if (ret)
                return ret;

        vmf->flags |= FAULT_FLAG_MKWRITE;
        ret = vma->vm_ops->pfn_mkwrite(vmf);
        if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
                return ret;

        return finish_mkwrite_fault(vmf, NULL);
}

/*
 * Handle a write fault on a page with a struct page in a shared mapping.
 *
 * A reference on @folio is held for the duration of the call.  If the vma
 * has a ->page_mkwrite handler, the PTE lock is dropped, the handler is run
 * through do_page_mkwrite() and the pte is then made writable by
 * finish_mkwrite_fault().  Otherwise the pte is made writable directly and
 * the folio is locked here.  In both cases the folio is locked when
 * fault_dirty_shared_page() is reached.
 *
 * Returns the first non-zero result from vmf_can_call_fault(); the result of
 * do_page_mkwrite() if it is 0 (retry) or has VM_FAULT_ERROR or
 * VM_FAULT_NOPAGE bits set; the result of finish_mkwrite_fault() if it has
 * VM_FAULT_ERROR or VM_FAULT_NOPAGE bits set; otherwise the result of
 * fault_dirty_shared_page().
 */
static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
        __releases(vmf->ptl)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret = 0;

        folio_get(folio);

        if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
                vm_fault_t tmp;

                pte_unmap_unlock(vmf->pte, vmf->ptl);
                tmp = vmf_can_call_fault(vmf);
                if (tmp) {
                        folio_put(folio);
                        return tmp;
                }

                tmp = do_page_mkwrite(vmf, folio);
                /* On these results do_page_mkwrite() did not leave the folio locked by us. */
                if (unlikely(!tmp || (tmp &
                                      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
                        folio_put(folio);
                        return tmp;
                }
                tmp = finish_mkwrite_fault(vmf, folio);
                if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
                        /* The folio is locked at this point; unlock before bailing out. */
                        folio_unlock(folio);
                        folio_put(folio);
                        return tmp;
                }
        } else {
                wp_page_reuse(vmf, folio);
                folio_lock(folio);
        }
        ret |= fault_dirty_shared_page(vmf);
        folio_put(folio);

        return ret;
}

/*
 * Decide whether an anonymous @folio can be reused in place for @vma rather
 * than copied.
 *
 * Returns true only for a small, non-KSM folio whose reference count is
 * exactly 1 when checked under the folio lock; in that case
 * folio_move_anon_rmap() has been called for @vma.  The folio is never left
 * locked.  As a side effect the folio may have been freed from the swap
 * cache and lru_add_drain() may have been called.
 */
static bool wp_can_reuse_anon_folio(struct folio *folio,
                                    struct vm_area_struct *vma)
{
        /*
         * We could currently only reuse a subpage of a large folio if no
         * other subpages of the large folios are still mapped. However,
         * let's just consistently not reuse subpages even if we could
         * reuse in that scenario, and give back a large folio a bit
         * sooner.
         */
        if (folio_test_large(folio))
                return false;

        /*
         * We have to verify under folio lock: these early checks are
         * just an optimization to avoid locking the folio and freeing
         * the swapcache if there is little hope that we can reuse.
         *
         * KSM doesn't necessarily raise the folio refcount.
         */
        if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
                return false;
        if (!folio_test_lru(folio))
                /*
                 * We cannot easily detect+handle references from
                 * remote LRU caches or references to LRU folios.
                 */
                lru_add_drain();
        /* Allow one reference for our mapping plus one if in the swap cache. */
        if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
                return false;
        if (!folio_trylock(folio))
                return false;
        if (folio_test_swapcache(folio))
                folio_free_swap(folio);
        /* Authoritative re-check, now that the folio is locked. */
        if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
                folio_unlock(folio);
                return false;
        }
        /*
         * Ok, we've got the only folio reference from our mapping
         * and the folio is locked, it's dark out, and we're wearing
         * sunglasses. Hit it.
         */
        folio_move_anon_rmap(folio, vma);
        folio_unlock(folio);
        return true;
}

/*
 * This routine handles present pages, when
 * * users try to write to a shared page (FAULT_FLAG_WRITE)
 * * GUP wants to take a R/O pin on a possibly shared anonymous page
 *   (FAULT_FLAG_UNSHARE)
 *
 * It is done by copying the page to a new address and decrementing the
 * shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
 * done any necessary COW.
 *
 * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
 * though the page will change only once the write actually happens. This
 * avoids a few races, and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 *
 * Return: 0 when the page was reused in place or nothing more was needed
 * for an unshare request; otherwise the result of handle_userfault(),
 * wp_pfn_shared(), wp_page_shared() or wp_page_copy(), whichever path
 * was taken.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
        __releases(vmf->ptl)
{
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = NULL;
        pte_t pte;

        if (likely(!unshare)) {
                if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
                        /* Unless uffd-wp is in async mode, hand the fault to userfaultfd. */
                        if (!userfaultfd_wp_async(vma)) {
                                pte_unmap_unlock(vmf->pte, vmf->ptl);
                                return handle_userfault(vmf, VM_UFFD_WP);
                        }

                        /*
                         * Nothing needed (cache flush, TLB invalidations,
                         * etc.) because we're only removing the uffd-wp bit,
                         * which is completely invisible to the user.
                         */
                        pte = pte_clear_uffd_wp(ptep_get(vmf->pte));

                        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
                        /*
                         * Update this to be prepared for following up CoW
                         * handling
                         */
                        vmf->orig_pte = pte;
                }

                /*
                 * Userfaultfd write-protect can defer flushes. Ensure the TLB
                 * is flushed in this case before copying.
                 */
                if (unlikely(userfaultfd_wp(vmf->vma) &&
                             mm_tlb_flush_pending(vmf->vma->vm_mm)))
                        flush_tlb_page(vmf->vma, vmf->address);
        }

        /* vmf->page stays NULL when vm_normal_page() finds no struct page. */
        vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);

        if (vmf->page)
                folio = page_folio(vmf->page);

        /*
         * Shared mapping: we are guaranteed to have VM_WRITE and
         * FAULT_FLAG_WRITE set at this point.
         */
        if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
                /*
                 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
                 * VM_PFNMAP VMA.
                 *
                 * We should not cow pages in a shared writeable mapping.
                 * Just mark the pages writable and/or call ops->pfn_mkwrite.
                 */
                if (!vmf->page)
                        return wp_pfn_shared(vmf);
                return wp_page_shared(vmf, folio);
        }

        /*
         * Private mapping: create an exclusive anonymous page copy if reuse
         * is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
         *
         * If we encounter a page that is marked exclusive, we must reuse
         * the page without further checks.
         */
        if (folio && folio_test_anon(folio) &&
            (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) {
                if (!PageAnonExclusive(vmf->page))
                        SetPageAnonExclusive(vmf->page);
                /* For unshare the pte is left as it is; only the lock is dropped. */
                if (unlikely(unshare)) {
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                        return 0;
                }
                wp_page_reuse(vmf, folio);
                return 0;
        }
        /*
         * Ok, we need to copy. Oh, well..
         */
        /* Take the folio reference while the PTE lock is still held. */
        if (folio)
                folio_get(folio);

        pte_unmap_unlock(vmf->pte, vmf->ptl);
#ifdef CONFIG_KSM
        if (folio && folio_test_ksm(folio))
                count_vm_event(COW_KSM);
#endif
        return wp_page_copy(vmf);
}

/*
 * Zap the address range [@start_addr, @end_addr) of @vma by converting it
 * to the (start, size) form zap_page_range_single() takes.
 */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
                unsigned long start_addr, unsigned long end_addr,
                struct zap_details *details)
{
        const unsigned long size = end_addr - start_addr;

        zap_page_range_single(vma, start_addr, size, details);
}

/*
 * For every vma in the interval tree @root that overlaps the page-offset
 * range [@first_index, @last_index] (inclusive), zap the part of the vma
 * that falls inside that range.
 */
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
                                            pgoff_t first_index,
                                            pgoff_t last_index,
                                            struct zap_details *details)
{
        struct vm_area_struct *vma;

        vma_interval_tree_foreach(vma, root, first_index, last_index) {
                /* First and last page offset covered by this vma. */
                pgoff_t vma_first = vma->vm_pgoff;
                pgoff_t vma_last = vma_first + vma_pages(vma) - 1;
                /* Clamp the requested range to what the vma covers. */
                pgoff_t zap_first = max(first_index, vma_first);
                pgoff_t zap_last = min(last_index, vma_last);
                /* Translate page offsets into addresses within the vma. */
                unsigned long zap_start =
                        ((zap_first - vma_first) << PAGE_SHIFT) + vma->vm_start;
                unsigned long zap_end =
                        ((zap_last - vma_first + 1) << PAGE_SHIFT) + vma->vm_start;

                unmap_mapping_range_vma(vma, zap_start, zap_end, details);
        }
}

/**
 * unmap_mapping_folio() - Unmap single folio from processes.
 * @folio: The locked folio to be unmapped.
 *
 * Unmap this folio from any userspace process which still has it mmaped.
 * Typically, for efficiency, the range of nearby pages has already been
 * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
 * truncation or invalidation holds the lock on a folio, it may find that
 * the page has been remapped again: and then uses unmap_mapping_folio()
 * to unmap it finally.
 */
void unmap_mapping_folio(struct folio *folio)
{
        struct address_space *mapping = folio->mapping;
        struct zap_details details = { };
        pgoff_t        first_index;
        pgoff_t        last_index;

        VM_BUG_ON(!folio_test_locked(folio));

        first_index = folio->index;
        last_index = folio_next_index(folio) - 1;

        details.even_cows = false;
        details.single_folio = folio;
        details.zap_flags = ZAP_FLAG_DROP_MARKER;

        i_mmap_lock_read(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
                unmap_mapping_range_tree(&mapping->i_mmap, first_index,
                                         last_index, &details);
        i_mmap_unlock_read(mapping);
}

/**
 * unmap_mapping_pages() - Unmap pages from processes.
 * @mapping: The address space containing pages to be unmapped.
 * @start: Index of first page to be unmapped.
 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
 * @even_cows: Whether to unmap even private COWed pages.
 *
 * Unmap the pages in this address space from any userspace process which
 * has them mmaped.  As a rule, COWed pages should be removed too when a
 * file is being truncated, but not when pages are merely being invalidated
 * from the page cache.
 */
void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
                pgoff_t nr, bool even_cows)
{
        struct zap_details details = { .even_cows = even_cows };
        const pgoff_t first_index = start;
        pgoff_t last_index = start + nr - 1;

        /* A wrapped end index (e.g. @nr == 0) means "up to the last index". */
        if (last_index < first_index)
                last_index = ULONG_MAX;

        i_mmap_lock_read(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
                unmap_mapping_range_tree(&mapping->i_mmap, first_index,
                                         last_index, &details);
        i_mmap_unlock_read(mapping);
}
EXPORT_SYMBOL_GPL(unmap_mapping_pages);

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified byte range in the underlying
 * file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows)
{
        /*
         * Convert bytes to pages in a 64-bit unsigned type and only then
         * narrow to pgoff_t.  The unsigned type keeps a set top bit in
         * holebegin from being sign-extended by the shift.  The 64-bit
         * width matters when pgoff_t is narrower than loff_t: casting to
         * pgoff_t before shifting would drop the high bits of the byte
         * offset and unmap the wrong pages.
         */
        pgoff_t hba = (pgoff_t)((unsigned long long)holebegin >> PAGE_SHIFT);
        pgoff_t hlen = (pgoff_t)(((unsigned long long)holelen +
                                  PAGE_SIZE - 1) >> PAGE_SHIFT);

        /*
         * Check for overflow: if the last page index of the hole does not
         * fit in pgoff_t, extend the hole to the highest index.
         */
        if (sizeof(holelen) > sizeof(hlen)) {
                long long holeend =
                        (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
                if (holeend & ~(long long)ULONG_MAX)
                        hlen = ULONG_MAX - hba + 1;
        }

        unmap_mapping_pages(mapping, hba, hlen, even_cows);
}
EXPORT_SYMBOL(unmap_mapping_range);

/*
 * Restore a potential device exclusive pte to a working pte entry
 *
 * Expects vmf->page to have been set by the caller to the page the
 * device-exclusive entry refers to.
 *
 * Returns 0 when the entry was restored, when the PTE no longer matches
 * vmf->orig_pte, or when no folio reference could be taken.  Otherwise
 * returns the non-zero result of folio_lock_or_retry().
 */
static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
{
        struct folio *folio = page_folio(vmf->page);
        struct vm_area_struct *vma = vmf->vma;
        struct mmu_notifier_range range;
        vm_fault_t ret;

        /*
         * We need a reference to lock the folio because we don't hold
         * the PTL so a racing thread can remove the device-exclusive
         * entry and unmap it. If the folio is free the entry must
         * have been removed already. If it happens to have already
         * been re-allocated after being freed all we do is lock and
         * unlock it.
         */
        if (!folio_try_get(folio))
                return 0;

        ret = folio_lock_or_retry(folio, vmf);
        if (ret) {
                /* Could not lock the folio: drop our reference and bail. */
                folio_put(folio);
                return ret;
        }
        /* The notifier range covers exactly the page containing the fault. */
        mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
                                vma->vm_mm, vmf->address & PAGE_MASK,
                                (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
        mmu_notifier_invalidate_range_start(&range);

        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                                &vmf->ptl);
        /* Only restore if the PTE is still the one we faulted on. */
        if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);

        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        folio_unlock(folio);
        folio_put(folio);

        mmu_notifier_invalidate_range_end(&range);
        return 0;
}

/*
 * Decide whether the caller should attempt to drop the swapcache entry of
 * @folio while handling a fault with @fault_flags in @vma.
 */
static inline bool should_try_to_free_swap(struct folio *folio,
                                           struct vm_area_struct *vma,
                                           unsigned int fault_flags)
{
        /* Not in the swapcache: there is nothing to free. */
        if (!folio_test_swapcache(folio))
                return false;

        /* Always try when swap is full or the memory is locked. */
        if (mem_cgroup_swap_full(folio))
                return true;
        if (vma->vm_flags & VM_LOCKED)
                return true;
        if (folio_test_mlocked(folio))
                return true;

        /*
         * If we want to map a page that's in the swapcache writable, we
         * have to detect via the refcount if we're really the exclusive
         * user. Try freeing the swapcache to get rid of the swapcache
         * reference only in case it's likely that we'll be the exclusive
         * user.
         */
        if (!(fault_flags & FAULT_FLAG_WRITE))
                return false;
        if (folio_test_ksm(folio))
                return false;

        return folio_ref_count(folio) == 2;
}

/*
 * Clear the pte marker at vmf->address, provided the pte still equals
 * vmf->orig_pte once the page table lock is held.  Always returns 0.
 */
static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
{
        vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
                                       vmf->address, &vmf->ptl);
        if (vmf->pte) {
                /*
                 * Only turn the pte into a none pte if it is unchanged
                 * since the fault was taken; a different value means the
                 * pte was modified concurrently, so leave it for a retry.
                 *
                 * Comparing the full pte (rather than just checking
                 * is_pte_marker()) also covers a marker that changed
                 * quickly from PTE_MARKER_UFFD_WP to PTE_MARKER_POISONED.
                 */
                if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
                        pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        }

        return 0;
}

/*
 * Handle a fault on a missing pte: anonymous vmas go through
 * do_anonymous_page(), everything else through do_fault().
 */
static vm_fault_t do_pte_missing(struct vm_fault *vmf)
{
        return vma_is_anonymous(vmf->vma) ? do_anonymous_page(vmf)
                                          : do_fault(vmf);
}

/*
 * A uffd-wp special pte was found where the page is missing, i.e. the pte
 * was write-protected before it got unmapped.  Treat it as a page-missing
 * access.
 */
static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
{
        if (likely(userfaultfd_wp(vmf->vma)))
                return do_pte_missing(vmf);

        /*
         * The region is no longer registered for uffd-wp, so this is a
         * leftover special pte that can simply be cleared.
         */
        return pte_marker_clear(vmf);
}

/*
 * Dispatch a fault on a pte marker entry according to the marker bits
 * stored in vmf->orig_pte.
 */
static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
{
        const swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
        const unsigned long marker = pte_marker_get(entry);

        /*
         * An empty marker should never exist.  If one shows up anyway,
         * warn once and fail the fault with SIGBUS.
         */
        if (WARN_ON_ONCE(!marker))
                return VM_FAULT_SIGBUS;

        /* Poison wins over uffd-wp: the data is corrupted. */
        if (marker & PTE_MARKER_POISONED)
                return VM_FAULT_HWPOISON;

        /* Anything that is not a uffd-wp marker is unknown to us. */
        if (!pte_marker_entry_uffd_wp(entry))
                return VM_FAULT_SIGBUS;

        return pte_marker_handle_uffd_wp(vmf);
}

/*
 * Handle a fault on a pte that holds a swap entry or another non-present
 * (non-swap) entry such as a migration, device, hwpoison or marker entry.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_lock locked or unlocked in the same cases
 * as does filemap_fault().
 *
 * Exit labels:
 *   unlock      - drop the PT lock if the pte is mapped, then fall to out.
 *   out         - clear the direct-swapin swapcache pin (if taken) and
 *                 release the swap device reference (if taken).
 *   out_nomap   - like out_page, but first drops the PT lock.
 *   out_page    - unlock the folio, then fall to out_release.
 *   out_release - drop the folio reference, unlock and release a distinct
 *                 swapcache folio, then do the same cleanup as out.
 */
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *swapcache, *folio = NULL;
        struct page *page;
        struct swap_info_struct *si = NULL;
        rmap_t rmap_flags = RMAP_NONE;
        bool need_clear_cache = false;
        bool exclusive = false;
        swp_entry_t entry;
        pte_t pte;
        vm_fault_t ret = 0;
        void *shadow = NULL;

        if (!pte_unmap_same(vmf))
                goto out;

        entry = pte_to_swp_entry(vmf->orig_pte);
        /* Non-swap entries are fully handled here and never reach swapin. */
        if (unlikely(non_swap_entry(entry))) {
                if (is_migration_entry(entry)) {
                        migration_entry_wait(vma->vm_mm, vmf->pmd,
                                             vmf->address);
                } else if (is_device_exclusive_entry(entry)) {
                        vmf->page = pfn_swap_entry_to_page(entry);
                        ret = remove_device_exclusive_entry(vmf);
                } else if (is_device_private_entry(entry)) {
                        if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
                                /*
                                 * migrate_to_ram is not yet ready to operate
                                 * under VMA lock.
                                 */
                                vma_end_read(vma);
                                ret = VM_FAULT_RETRY;
                                goto out;
                        }

                        vmf->page = pfn_swap_entry_to_page(entry);
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);
                        if (unlikely(!vmf->pte ||
                                     !pte_same(ptep_get(vmf->pte),
                                                        vmf->orig_pte)))
                                goto unlock;

                        /*
                         * Get a page reference while we know the page can't be
                         * freed.
                         */
                        get_page(vmf->page);
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                        ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
                        put_page(vmf->page);
                } else if (is_hwpoison_entry(entry)) {
                        ret = VM_FAULT_HWPOISON;
                } else if (is_pte_marker_entry(entry)) {
                        ret = handle_pte_marker(vmf);
                } else {
                        print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
                        ret = VM_FAULT_SIGBUS;
                }
                goto out;
        }

        /* Prevent swapoff from happening to us. */
        si = get_swap_device(entry);
        if (unlikely(!si))
                goto out;

        folio = swap_cache_get_folio(entry, vma, vmf->address);
        if (folio)
                page = folio_file_page(folio, swp_offset(entry));
        /* swapcache remembers the swapcache folio (NULL if none was found). */
        swapcache = folio;

        if (!folio) {
                if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
                    __swap_count(entry) == 1) {
                        /*
                         * Prevent parallel swapin from proceeding with
                         * the cache flag. Otherwise, another thread may
                         * finish swapin first, free the entry, and swapout
                         * reusing the same entry. It's undetectable as
                         * pte_same() returns true due to entry reuse.
                         */
                        if (swapcache_prepare(entry)) {
                                /* Relax a bit to prevent rapid repeated page faults */
                                schedule_timeout_uninterruptible(1);
                                goto out;
                        }
                        need_clear_cache = true;

                        /* skip swapcache */
                        folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
                                                vma, vmf->address, false);
                        /*
                         * page is only used when folio is non-NULL; a failed
                         * allocation bails out via the !folio check below.
                         */
                        page = &folio->page;
                        if (folio) {
                                __folio_set_locked(folio);
                                __folio_set_swapbacked(folio);

                                if (mem_cgroup_swapin_charge_folio(folio,
                                                        vma->vm_mm, GFP_KERNEL,
                                                        entry)) {
                                        ret = VM_FAULT_OOM;
                                        goto out_page;
                                }
                                mem_cgroup_swapin_uncharge_swap(entry);

                                shadow = get_shadow_from_swap_cache(entry);
                                if (shadow)
                                        workingset_refault(folio, shadow);

                                folio_add_lru(folio);

                                /* To provide entry to swap_read_folio() */
                                folio->swap = entry;
                                swap_read_folio(folio, true, NULL);
                                folio->private = NULL;
                        }
                } else {
                        page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
                                                vmf);
                        if (page)
                                folio = page_folio(page);
                        swapcache = folio;
                }

                if (!folio) {
                        /*
                         * Back out if somebody else faulted in this pte
                         * while we released the pte lock.
                         */
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);
                        if (likely(vmf->pte &&
                                   pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                                ret = VM_FAULT_OOM;
                        goto unlock;
                }

                /* Had to read the page from swap area: Major fault */
                ret = VM_FAULT_MAJOR;
                count_vm_event(PGMAJFAULT);
                count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
        } else if (PageHWPoison(page)) {
                /*
                 * hwpoisoned dirty swapcache pages are kept for killing
                 * owner processes (which may be unknown at hwpoison time)
                 */
                ret = VM_FAULT_HWPOISON;
                goto out_release;
        }

        /* OR into ret so that a VM_FAULT_MAJOR set above is preserved. */
        ret |= folio_lock_or_retry(folio, vmf);
        if (ret & VM_FAULT_RETRY)
                goto out_release;

        if (swapcache) {
                /*
                 * Make sure folio_free_swap() or swapoff did not release the
                 * swapcache from under us.  The page pin, and pte_same test
                 * below, are not enough to exclude that.  Even if it is still
                 * swapcache, we need to check that the page's swap has not
                 * changed.
                 */
                if (unlikely(!folio_test_swapcache(folio) ||
                             page_swap_entry(page).val != entry.val))
                        goto out_page;

                /*
                 * KSM sometimes has to copy on read faults, for example, if
                 * page->index of !PageKSM() pages would be nonlinear inside the
                 * anon VMA -- PageKSM() is lost on actual swapout.
                 */
                folio = ksm_might_need_to_copy(folio, vma, vmf->address);
                if (unlikely(!folio)) {
                        ret = VM_FAULT_OOM;
                        /*
                         * Point folio back at the swapcache folio so the
                         * error path unlocks and releases it exactly once.
                         */
                        folio = swapcache;
                        goto out_page;
                } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
                        ret = VM_FAULT_HWPOISON;
                        folio = swapcache;
                        goto out_page;
                }
                if (folio != swapcache)
                        page = folio_page(folio, 0);

                /*
                 * If we want to map a page that's in the swapcache writable, we
                 * have to detect via the refcount if we're really the exclusive
                 * owner. Try removing the extra reference from the local LRU
                 * caches if required.
                 */
                if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
                    !folio_test_ksm(folio) && !folio_test_lru(folio))
                        lru_add_drain();
        }

        folio_throttle_swaprate(folio, GFP_KERNEL);

        /*
         * Back out if somebody else already faulted in this pte.
         */
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
        if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                goto out_nomap;

        if (unlikely(!folio_test_uptodate(folio))) {
                ret = VM_FAULT_SIGBUS;
                goto out_nomap;
        }

        /*
         * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte
         * must never point at an anonymous page in the swapcache that is
         * PG_anon_exclusive. Sanity check that this holds and especially, that
         * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity
         * check after taking the PT lock and making sure that nobody
         * concurrently faulted in this page and set PG_anon_exclusive.
         */
        BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
        BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));

        /*
         * Check under PT lock (to protect against concurrent fork() sharing
         * the swap entry concurrently) for certainly exclusive pages.
         */
        if (!folio_test_ksm(folio)) {
                exclusive = pte_swp_exclusive(vmf->orig_pte);
                if (folio != swapcache) {
                        /*
                         * We have a fresh page that is not exposed to the
                         * swapcache -> certainly exclusive.
                         */
                        exclusive = true;
                } else if (exclusive && folio_test_writeback(folio) &&
                          data_race(si->flags & SWP_STABLE_WRITES)) {
                        /*
                         * This is tricky: not all swap backends support
                         * concurrent page modifications while under writeback.
                         *
                         * So if we stumble over such a page in the swapcache
                         * we must not set the page exclusive, otherwise we can
                         * map it writable without further checks and modify it
                         * while still under writeback.
                         *
                         * For these problematic swap backends, simply drop the
                         * exclusive marker: this is perfectly fine as we start
                         * writeback only if we fully unmapped the page and
                         * there are no unexpected references on the page after
                         * unmapping succeeded. After fully unmapped, no
                         * further GUP references (FOLL_GET and FOLL_PIN) can
                         * appear, so dropping the exclusive marker and mapping
                         * it only R/O is fine.
                         */
                        exclusive = false;
                }
        }

        /*
         * Some architectures may have to restore extra metadata to the page
         * when reading from swap. This metadata may be indexed by swap entry
         * so this must be called before swap_free().
         */
        arch_swap_restore(folio_swap(entry, folio), folio);

        /*
         * Remove the swap entry and conditionally try to free up the swapcache.
         * We're already holding a reference on the page but haven't mapped it
         * yet.
         */
        swap_free(entry);
        if (should_try_to_free_swap(folio, vma, vmf->flags))
                folio_free_swap(folio);

        inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
        dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
        pte = mk_pte(page, vma->vm_page_prot);

        /*
         * Same logic as in do_wp_page(); however, optimize for pages that are
         * certainly not shared either because we just allocated them without
         * exposing them to the swapcache or because the swap entry indicates
         * exclusivity.
         */
        if (!folio_test_ksm(folio) &&
            (exclusive || folio_ref_count(folio) == 1)) {
                if (vmf->flags & FAULT_FLAG_WRITE) {
                        pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                        /*
                         * The write is handled here, so clear the flag to
                         * skip the do_wp_page() call further down.
                         */
                        vmf->flags &= ~FAULT_FLAG_WRITE;
                }
                rmap_flags |= RMAP_EXCLUSIVE;
        }
        flush_icache_page(vma, page);
        /* Carry the soft-dirty and uffd-wp bits over from the swap pte. */
        if (pte_swp_soft_dirty(vmf->orig_pte))
                pte = pte_mksoft_dirty(pte);
        if (pte_swp_uffd_wp(vmf->orig_pte))
                pte = pte_mkuffd_wp(pte);
        vmf->orig_pte = pte;

        /* ksm created a completely new copy */
        if (unlikely(folio != swapcache && swapcache)) {
                folio_add_new_anon_rmap(folio, vma, vmf->address);
                folio_add_lru_vma(folio, vma);
        } else {
                folio_add_anon_rmap_pte(folio, page, vma, vmf->address,
                                        rmap_flags);
        }

        VM_BUG_ON(!folio_test_anon(folio) ||
                        (pte_write(pte) && !PageAnonExclusive(page)));
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
        arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);

        folio_unlock(folio);
        if (folio != swapcache && swapcache) {
                /*
                 * Hold the lock to avoid the swap entry to be reused
                 * until we take the PT lock for the pte_same() check
                 * (to avoid false positives from pte_same). For
                 * further safety release the lock after the swap_free
                 * so that the swap count won't change under a
                 * parallel locked swapcache.
                 */
                folio_unlock(swapcache);
                folio_put(swapcache);
        }

        if (vmf->flags & FAULT_FLAG_WRITE) {
                ret |= do_wp_page(vmf);
                /* On error, report only the error bits. */
                if (ret & VM_FAULT_ERROR)
                        ret &= VM_FAULT_ERROR;
                goto out;
        }

        /* No need to invalidate - it was non-present before */
        update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
unlock:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
        /* Clear the swap cache pin for direct swapin after PTL unlock */
        if (need_clear_cache)
                swapcache_clear(si, entry);
        if (si)
                put_swap_device(si);
        return ret;
out_nomap:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
        folio_unlock(folio);
out_release:
        folio_put(folio);
        /* A distinct swapcache folio is still locked and referenced. */
        if (folio != swapcache && swapcache) {
                folio_unlock(swapcache);
                folio_put(swapcache);
        }
        if (need_clear_cache)
                swapcache_clear(si, entry);
        if (si)
                put_swap_device(si);
        return ret;
}

/*
 * Return true if each of the @nr_pages consecutive ptes starting at @pte
 * is pte_none(), false as soon as one is not.  The ptes are read with
 * ptep_get_lockless().
 */
static bool pte_range_none(pte_t *pte, int nr_pages)
{
        int idx = 0;

        while (idx < nr_pages) {
                if (!pte_none(ptep_get_lockless(&pte[idx])))
                        return false;
                idx++;
        }

        return true;
}

/*
 * Allocate a folio for an anonymous fault at vmf->address.
 *
 * With CONFIG_TRANSPARENT_HUGEPAGE, first try a large folio: pick the
 * enabled orders below PMD_ORDER that fit in the vma, find the highest
 * one whose aligned pte range is entirely pte_none(), and try to allocate
 * and memcg-charge that order or a lower remaining one.  In every other
 * case fall back to folio_prealloc() for the faulting address.
 *
 * Returns ERR_PTR(-EAGAIN) if the pte table could not be mapped; otherwise
 * a large folio or whatever folio_prealloc() returns.
 */
static struct folio *alloc_anon_folio(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        unsigned long orders;
        struct folio *folio;
        unsigned long addr;
        pte_t *pte;
        gfp_t gfp;
        int order;

        /*
         * If uffd is active for the vma we need per-page fault fidelity to
         * maintain the uffd semantics.
         */
        if (unlikely(userfaultfd_armed(vma)))
                goto fallback;

        /*
         * Get a list of all the (large) orders below PMD_ORDER that are enabled
         * for this vma. Then filter out the orders that can't be allocated over
         * the faulting address and still be fully contained in the vma.
         */
        orders = thp_vma_allowable_orders(vma, vma->vm_flags,
                        TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
        orders = thp_vma_suitable_orders(vma, vmf->address, orders);

        if (!orders)
                goto fallback;

        /* Map from the PMD-aligned base so pte_index(addr) can index it. */
        pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
        if (!pte)
                return ERR_PTR(-EAGAIN);

        /*
         * Find the highest order where the aligned range is completely
         * pte_none(). Note that all remaining orders will be completely
         * pte_none().
         */
        order = highest_order(orders);
        while (orders) {
                addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
                if (pte_range_none(pte + pte_index(addr), 1 << order))
                        break;
                order = next_order(&orders, order);
        }

        pte_unmap(pte);

        if (!orders)
                goto fallback;

        /* Try allocating the highest of the remaining orders. */
        gfp = vma_thp_gfp_mask(vma);
        while (orders) {
                addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
                folio = vma_alloc_folio(gfp, order, vma, addr, true);
                if (folio) {
                        /* Charge failure: drop this folio, try the next order. */
                        if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
                                count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
                                folio_put(folio);
                                goto next;
                        }
                        folio_throttle_swaprate(folio, gfp);
                        clear_huge_page(&folio->page, vmf->address, 1 << order);
                        return folio;
                }
next:
                /* Reached on both allocation failure and charge failure. */
                count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
                order = next_order(&orders, order);
        }

fallback:
#endif
        return folio_prealloc(vma->vm_mm, vma, vmf->address, true);
}

/*
 * Handle a fault on a not-yet-populated pte in an anonymous vma: map the
 * zero page for reads where allowed, otherwise allocate and map a new
 * (possibly large) anonymous folio.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 *
 * Exit labels:
 *   setpte  - install the prepared pte value(s) and update the MMU cache.
 *   unlock  - drop the PT lock if the pte is mapped and return ret.
 *   release - drop the reference on the new folio, then go to unlock.
 *   oom     - return VM_FAULT_OOM.
 */
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        unsigned long addr = vmf->address;
        struct folio *folio;
        vm_fault_t ret = 0;
        int nr_pages = 1;
        pte_t entry;
        int i;

        /* File mapping without ->vm_ops ? */
        if (vma->vm_flags & VM_SHARED)
                return VM_FAULT_SIGBUS;

        /*
         * Use pte_alloc() instead of pte_alloc_map(), so that OOM can
         * be distinguished from a transient failure of pte_offset_map().
         */
        if (pte_alloc(vma->vm_mm, vmf->pmd))
                return VM_FAULT_OOM;

        /* Use the zero-page for reads */
        if (!(vmf->flags & FAULT_FLAG_WRITE) &&
                        !mm_forbids_zeropage(vma->vm_mm)) {
                entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
                                                vma->vm_page_prot));
                vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                vmf->address, &vmf->ptl);
                if (!vmf->pte)
                        goto unlock;
                /* Somebody else changed the pte: nothing left to do here. */
                if (vmf_pte_changed(vmf)) {
                        update_mmu_tlb(vma, vmf->address, vmf->pte);
                        goto unlock;
                }
                ret = check_stable_address_space(vma->vm_mm);
                if (ret)
                        goto unlock;
                /* Deliver the page fault to userland, check inside PT lock */
                if (userfaultfd_missing(vma)) {
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                        return handle_userfault(vmf, VM_UFFD_MISSING);
                }
                /* nr_pages is still 1 and addr is still vmf->address here. */
                goto setpte;
        }

        /* Allocate our own private page. */
        ret = vmf_anon_prepare(vmf);
        if (ret)
                return ret;
        /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */
        folio = alloc_anon_folio(vmf);
        if (IS_ERR(folio))
                return 0;
        if (!folio)
                goto oom;

        /* A large folio is mapped at its naturally aligned base address. */
        nr_pages = folio_nr_pages(folio);
        addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);

        /*
         * The memory barrier inside __folio_mark_uptodate makes sure that
         * preceding stores to the page contents become visible before
         * the set_pte_at() write.
         */
        __folio_mark_uptodate(folio);

        entry = mk_pte(&folio->page, vma->vm_page_prot);
        entry = pte_sw_mkyoung(entry);
        if (vma->vm_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry), vma);

        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
        if (!vmf->pte)
                goto release;
        /*
         * Back out if the target range got populated meanwhile: a single
         * page is checked against the original pte, a large folio requires
         * every pte in its range to still be none.
         */
        if (nr_pages == 1 && vmf_pte_changed(vmf)) {
                update_mmu_tlb(vma, addr, vmf->pte);
                goto release;
        } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
                for (i = 0; i < nr_pages; i++)
                        update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
                goto release;
        }

        ret = check_stable_address_space(vma->vm_mm);
        if (ret)
                goto release;

        /* Deliver the page fault to userland, check inside PT lock */
        if (userfaultfd_missing(vma)) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                folio_put(folio);
                return handle_userfault(vmf, VM_UFFD_MISSING);
        }

        /* One reference already exists; add one per additional page mapped. */
        folio_ref_add(folio, nr_pages - 1);
        add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
#endif
        folio_add_new_anon_rmap(folio, vma, addr);
        folio_add_lru_vma(folio, vma);
setpte:
        if (vmf_orig_pte_uffd_wp(vmf))
                entry = pte_mkuffd_wp(entry);
        set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
unlock:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        return ret;
release:
        folio_put(folio);
        goto unlock;
oom:
        return VM_FAULT_OOM;
}

/*
 * Call the vma's ->fault() handler and post-process its result.
 *
 * The mmap_lock must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_retry().
 * NOTE(review): __lock_page_retry() is not referenced in the visible code,
 * which uses folio_lock_or_retry() elsewhere - confirm the current name.
 *
 * Returns the ->fault() result directly if it contains an error,
 * VM_FAULT_NOPAGE, VM_FAULT_RETRY or VM_FAULT_DONE_COW.  For a hwpoisoned
 * page, the folio reference is dropped, vmf->page is cleared and
 * VM_FAULT_HWPOISON (or VM_FAULT_NOPAGE if the folio was evicted from the
 * cache) is returned.  Otherwise the folio backing vmf->page is locked on
 * return.
 */
static vm_fault_t __do_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio;
        vm_fault_t ret;

        /*
         * Preallocate pte before we take page_lock because this might lead to
         * deadlocks for memcg reclaim which waits for pages under writeback:
         *                                lock_page(A)
         *                                SetPageWriteback(A)
         *                                unlock_page(A)
         * lock_page(B)
         *                                lock_page(B)
         * pte_alloc_one
         *   shrink_page_list
         *     wait_on_page_writeback(A)
         *                                SetPageWriteback(B)
         *                                unlock_page(B)
         *                                # flush A, B to clear the writeback
         */
        if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
                vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
        }

        ret = vma->vm_ops->fault(vmf);
        /* For these results, return before looking at vmf->page. */
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
                            VM_FAULT_DONE_COW)))
                return ret;

        folio = page_folio(vmf->page);
        if (unlikely(PageHWPoison(vmf->page))) {
                vm_fault_t poisonret = VM_FAULT_HWPOISON;
                /* The folio is only unmapped/evicted if ->fault() locked it. */
                if (ret & VM_FAULT_LOCKED) {
                        if (page_mapped(vmf->page))
                                unmap_mapping_folio(folio);
                        /* Retry if a clean folio was removed from the cache. */
                        if (mapping_evict_folio(folio->mapping, folio))
                                poisonret = VM_FAULT_NOPAGE;
                        folio_unlock(folio);
                }
                folio_put(folio);
                vmf->page = NULL;
                return poisonret;
        }

        /* Take the folio lock here if ->fault() did not return it locked. */
        if (unlikely(!(ret & VM_FAULT_LOCKED)))
                folio_lock(folio);
        else
                VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page);

        return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Deposit the page table preallocated in @vmf against vmf->pmd and account
 * for it in the owning mm.  Called from do_set_pmd() with the pmd lock held.
 */
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
        struct mm_struct *mm = vmf->vma->vm_mm;

        pgtable_trans_huge_deposit(mm, vmf->pmd, vmf->prealloc_pte);

        /* The prealloc table is consumed here, so count it as nr_ptes. */
        mm_inc_nr_ptes(mm);

        /* Clear the pointer so do_fault() does not free it as unused. */
        vmf->prealloc_pte = NULL;
}

/*
 * do_set_pmd - try to map @page with a single huge PMD entry.
 * @vmf: fault description; vmf->pmd is the slot to fill.
 * @page: candidate page; must be the head page of a PMD-order folio.
 *
 * Returns 0 when the PMD was installed, VM_FAULT_OOM when a page table that
 * the architecture needs for deposit could not be allocated, and
 * VM_FAULT_FALLBACK in every other case (VMA not suitable, @page not the
 * head of a PMD-order folio, folio has a hwpoisoned subpage, or the PMD was
 * populated by someone else before the pmd lock was taken).
 */
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
        struct folio *folio = page_folio(page);
        struct vm_area_struct *vma = vmf->vma;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        pmd_t entry;
        vm_fault_t ret = VM_FAULT_FALLBACK;

        if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
                return ret;

        /* Only the head page of an exactly PMD-sized folio can be PMD mapped. */
        if (page != &folio->page || folio_order(folio) != HPAGE_PMD_ORDER)
                return ret;

        /*
         * Just backoff if any subpage of a THP is corrupted otherwise
         * the corrupted page may mapped by PMD silently to escape the
         * check.  This kind of THP just can be PTE mapped.  Access to
         * the corrupted subpage should trigger SIGBUS as expected.
         */
        if (unlikely(folio_test_has_hwpoisoned(folio)))
                return ret;

        /*
         * Archs like ppc64 need additional space to store information
         * related to pte entry. Use the preallocated table for that.
         */
        if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
                vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
        }

        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        /* Re-check under the lock: somebody else may have filled the PMD. */
        if (unlikely(!pmd_none(*vmf->pmd)))
                goto out;

        flush_icache_pages(vma, page, HPAGE_PMD_NR);

        entry = mk_huge_pmd(page, vma->vm_page_prot);
        if (write)
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);

        add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR);
        folio_add_file_rmap_pmd(folio, page, vma);

        /*
         * deposit and withdraw with pmd lock held
         */
        if (arch_needs_pgtable_deposit())
                deposit_prealloc_pte(vmf);

        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);

        update_mmu_cache_pmd(vma, haddr, vmf->pmd);

        /* fault is handled */
        ret = 0;
        count_vm_event(THP_FILE_MAPPED);
out:
        spin_unlock(vmf->ptl);
        return ret;
}
#else
/*
 * Without CONFIG_TRANSPARENT_HUGEPAGE no PMD mapping is ever attempted:
 * always tell the caller to fall back.
 */
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
        return VM_FAULT_FALLBACK;
}
#endif

/**
 * set_pte_range - Set a range of PTEs to point to pages in a folio.
 * @vmf: Fault description.
 * @folio: The folio that contains @page.
 * @page: The first page to create a PTE for.
 * @nr: The number of PTEs to create.
 * @addr: The first address to create a PTE for.
 *
 * The entries are written starting at vmf->pte.  A range that does not
 * contain the faulting address (vmf->address) is a prefault and is made old
 * when the architecture asks for old prefaulted PTEs; a range that contains
 * the faulting address goes through pte_sw_mkyoung() instead.
 */
void set_pte_range(struct vm_fault *vmf, struct folio *folio,
                struct page *page, unsigned int nr, unsigned long addr)
{
        struct vm_area_struct *vma = vmf->vma;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        /*
         * Only a range that does NOT cover the faulting address is a
         * prefault.  Getting this the wrong way round marks the PTE that
         * was actually accessed as old and the speculative ones as young.
         */
        bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE);
        pte_t entry;

        flush_icache_pages(vma, page, nr);
        entry = mk_pte(page, vma->vm_page_prot);

        if (prefault && arch_wants_old_prefaulted_pte())
                entry = pte_mkold(entry);
        else
                entry = pte_sw_mkyoung(entry);

        if (write)
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
                entry = pte_mkuffd_wp(entry);
        /* copy-on-write page */
        if (write && !(vma->vm_flags & VM_SHARED)) {
                /* A private write fault maps exactly one new anon page. */
                VM_BUG_ON_FOLIO(nr != 1, folio);
                folio_add_new_anon_rmap(folio, vma, addr);
                folio_add_lru_vma(folio, vma);
        } else {
                folio_add_file_rmap_ptes(folio, page, nr, vma);
        }
        set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);

        /* no need to invalidate: a not-present page won't be cached */
        update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr);
}

static bool vmf_pte_changed(struct vm_fault *vmf)
{
        if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
                return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);

        return !pte_none(ptep_get(vmf->pte));
}

/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
 * given page, adds reverse page mapping, handles memcg charges and LRU
 * addition.
 *
 * The page that gets mapped is vmf->cow_page for a write fault on a
 * non-shared VMA and vmf->page otherwise.
 *
 * The function expects the page to be locked and on success it consumes a
 * reference of a page being mapped (for the PTE which maps it).
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
vm_fault_t finish_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page *page;
        vm_fault_t ret;
        bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
                      !(vma->vm_flags & VM_SHARED);

        /* Did we COW the page? */
        if (is_cow)
                page = vmf->cow_page;
        else
                page = vmf->page;

        /*
         * check even for read faults because we might have lost our CoWed
         * page
         */
        if (!(vma->vm_flags & VM_SHARED)) {
                ret = check_stable_address_space(vma->vm_mm);
                if (ret)
                        return ret;
        }

        if (pmd_none(*vmf->pmd)) {
                /*
                 * No page table here yet.  For a compound page try a PMD
                 * mapping first; anything but FALLBACK is the final answer.
                 */
                if (PageTransCompound(page)) {
                        ret = do_set_pmd(vmf, page);
                        if (ret != VM_FAULT_FALLBACK)
                                return ret;
                }

                /* Use the preallocated PTE table if there is one. */
                if (vmf->prealloc_pte)
                        pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
                else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
                        return VM_FAULT_OOM;
        }

        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                      vmf->address, &vmf->ptl);
        /* No PTE could be mapped and locked: nothing was installed. */
        if (!vmf->pte)
                return VM_FAULT_NOPAGE;

        /* Re-check under ptl */
        if (likely(!vmf_pte_changed(vmf))) {
                struct folio *folio = page_folio(page);
                int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);

                set_pte_range(vmf, folio, page, 1, vmf->address);
                add_mm_counter(vma->vm_mm, type, 1);
                ret = 0;
        } else {
                /* The PTE changed under us: leave it alone and report NOPAGE. */
                update_mmu_tlb(vma, vmf->address, vmf->pte);
                ret = VM_FAULT_NOPAGE;
        }

        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return ret;
}

/*
 * Number of pages do_fault_around() tries to map around a read fault.
 * Defaults to 64KiB worth of pages; a value of 1 disables fault-around
 * (see should_fault_around()).
 */
static unsigned long fault_around_pages __read_mostly =
        65536 >> PAGE_SHIFT;

#ifdef CONFIG_DEBUG_FS
/* debugfs read hook: report the fault-around window in bytes via @val. */
static int fault_around_bytes_get(void *data, u64 *val)
{
        unsigned long window_bytes = fault_around_pages << PAGE_SHIFT;

        *val = window_bytes;
        return 0;
}

/*
 * debugfs write hook for the fault-around window, given in bytes.
 *
 * do_fault_around() expects a power-of-two number of pages, so the value is
 * rounded down to a power of two before being converted to pages.  Values
 * covering more than PTRS_PER_PTE pages are rejected with -EINVAL.
 */
static int fault_around_bytes_set(void *data, u64 val)
{
        if (val / PAGE_SIZE > PTRS_PER_PTE)
                return -EINVAL;

        /*
         * Clamp to at least one page.  One page means no fault-around at
         * all, see should_fault_around().
         */
        if (val < PAGE_SIZE)
                val = PAGE_SIZE;

        fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT;
        return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
                fault_around_bytes_get, fault_around_bytes_set, "%llu\n");

/*
 * Register the "fault_around_bytes" debugfs file (mode 0644, NULL parent)
 * backed by fault_around_bytes_fops.  Runs as a late initcall and always
 * returns 0.
 */
static int __init fault_around_debugfs(void)
{
        debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
                                   &fault_around_bytes_fops);
        return 0;
}
late_initcall(fault_around_debugfs);
#endif

/*
 * do_fault_around() tries to map a few pages around the fault address. The
 * hope is that the pages will be needed soon and this will lower the number
 * of faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function doesn't cross VMA or page table boundaries, in order to call
 * map_pages() and acquire a PTE lock only once.
 *
 * fault_around_pages defines how many pages we'll try to map.
 * do_fault_around() expects it to be set to a power of two less than or equal
 * to PTRS_PER_PTE.
 *
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_pages * PAGE_SIZE rounded down to the machine page size
 * (and therefore to page order).  This way it's easier to guarantee
 * that we don't cross page table boundaries.
 *
 * Returns VM_FAULT_OOM if a PTE table had to be preallocated and that
 * failed, otherwise whatever ->map_pages() returned.
 */
static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
        /* Snapshot once: the value can be changed through debugfs. */
        pgoff_t nr_pages = READ_ONCE(fault_around_pages);
        pgoff_t pte_off = pte_index(vmf->address);
        /* The page offset of vmf->address within the VMA. */
        pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
        pgoff_t from_pte, to_pte;
        vm_fault_t ret;

        /* The PTE offset of the start address, clamped to the VMA. */
        from_pte = max(ALIGN_DOWN(pte_off, nr_pages),
                       pte_off - min(pte_off, vma_off));

        /* The PTE offset of the end address, clamped to the VMA and PTE. */
        to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE,
                      pte_off + vma_pages(vmf->vma) - vma_off) - 1;

        /*
         * No page table yet: preallocate one for ->map_pages() to use.
         * If it ends up unused, do_fault() frees it.
         */
        if (pmd_none(*vmf->pmd)) {
                vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
        }

        /* Convert the PTE offsets back to file page offsets (inclusive). */
        rcu_read_lock();
        ret = vmf->vma->vm_ops->map_pages(vmf,
                        vmf->pgoff + from_pte - pte_off,
                        vmf->pgoff + to_pte - pte_off);
        rcu_read_unlock();

        return ret;
}

/* Return true if we should do read fault-around, false otherwise */
static inline bool should_fault_around(struct vm_fault *vmf)
{
        /* No ->map_pages?  No way to fault around... */
        if (!vmf->vma->vm_ops->map_pages)
                return false;

        if (uffd_disable_fault_around(vmf->vma))
                return false;

        /* A single page implies no faulting 'around' at all. */
        return fault_around_pages > 1;
}

/*
 * Handle a read fault on a VMA that has a ->fault() method: try
 * fault-around first, then fall back to __do_fault() + finish_fault().
 */
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
        const vm_fault_t bail_mask = VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                                     VM_FAULT_RETRY;
        struct folio *folio;
        vm_fault_t ret;

        /*
         * Give ->map_pages() the first shot; ->fault() is the fallback when
         * the page at this offset is not ready to be mapped (cold cache or
         * something).
         */
        if (should_fault_around(vmf)) {
                ret = do_fault_around(vmf);
                if (ret)
                        return ret;
        }

        ret = vmf_can_call_fault(vmf);
        if (ret)
                return ret;

        ret = __do_fault(vmf);
        if (unlikely(ret & bail_mask))
                return ret;

        ret |= finish_fault(vmf);

        folio = page_folio(vmf->page);
        folio_unlock(folio);
        /* Nothing got mapped: drop the reference the mapping would have kept. */
        if (unlikely(ret & bail_mask))
                folio_put(folio);

        return ret;
}

/*
 * Handle a write fault on a non-shared VMA: preallocate a private folio,
 * have ->fault() provide the source page, copy it and map the copy.
 */
static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
        const vm_fault_t bail_mask = VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                                     VM_FAULT_RETRY;
        struct vm_area_struct *vma = vmf->vma;
        struct folio *cow_folio;
        vm_fault_t ret;

        ret = vmf_can_call_fault(vmf);
        if (ret)
                return ret;

        ret = vmf_anon_prepare(vmf);
        if (ret)
                return ret;

        cow_folio = folio_prealloc(vma->vm_mm, vma, vmf->address, false);
        if (!cow_folio)
                return VM_FAULT_OOM;

        vmf->cow_page = &cow_folio->page;

        ret = __do_fault(vmf);
        if (unlikely(ret & bail_mask))
                goto drop_cow_folio;
        /* ->fault() already completed the COW itself. */
        if (ret & VM_FAULT_DONE_COW)
                return ret;

        copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
        __folio_mark_uptodate(cow_folio);

        ret |= finish_fault(vmf);

        /* The source page was only needed for the copy. */
        unlock_page(vmf->page);
        put_page(vmf->page);

        if (likely(!(ret & bail_mask)))
                return ret;

drop_cow_folio:
        folio_put(cow_folio);
        return ret;
}

/*
 * Handle a write fault on a shared VMA: get the page from ->fault(), give
 * ->page_mkwrite() a chance to object, map it and mark it dirty.
 */
static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
        const vm_fault_t bail_mask = VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                                     VM_FAULT_RETRY;
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio;
        vm_fault_t ret;

        ret = vmf_can_call_fault(vmf);
        if (ret)
                return ret;

        ret = __do_fault(vmf);
        if (unlikely(ret & bail_mask))
                return ret;

        folio = page_folio(vmf->page);

        /*
         * Let the backing address space know that the page is about to
         * become writable, if it asked to be told.
         */
        if (vma->vm_ops->page_mkwrite) {
                vm_fault_t mkwrite_ret;

                folio_unlock(folio);
                mkwrite_ret = do_page_mkwrite(vmf, folio);
                if (unlikely(!mkwrite_ret ||
                             (mkwrite_ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
                        folio_put(folio);
                        return mkwrite_ret;
                }
        }

        ret |= finish_fault(vmf);
        if (unlikely(ret & bail_mask)) {
                folio_unlock(folio);
                folio_put(folio);
                return ret;
        }

        return ret | fault_dirty_shared_page(vmf);
}

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __folio_lock_or_retry().
 * If mmap_lock is released, vma may become invalid (for example
 * by other thread calling munmap()).
 */
static vm_fault_t do_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *vm_mm = vma->vm_mm;
        vm_fault_t ret;

        /*
         * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
         */
        if (!vma->vm_ops->fault) {
                vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
                                               vmf->address, &vmf->ptl);
                if (unlikely(!vmf->pte))
                        ret = VM_FAULT_SIGBUS;
                else {
                        /*
                         * Make sure this is not a temporary clearing of pte
                         * by holding ptl and checking again. A R/M/W update
                         * of pte involves: take ptl, clearing the pte so that
                         * we don't have concurrent modification by hardware
                         * followed by an update.
                         */
                        if (unlikely(pte_none(ptep_get(vmf->pte))))
                                ret = VM_FAULT_SIGBUS;
                        else
                                ret = VM_FAULT_NOPAGE;

                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                }
        } else if (!(vmf->flags & FAULT_FLAG_WRITE))
                ret = do_read_fault(vmf);
        else if (!(vma->vm_flags & VM_SHARED))
                ret = do_cow_fault(vmf);
        else
                ret = do_shared_fault(vmf);

        /* preallocated pagetable is unused: free it */
        if (vmf->prealloc_pte) {
                pte_free(vm_mm, vmf->prealloc_pte);
                vmf->prealloc_pte = NULL;
        }
        return ret;
}

/*
 * Bookkeeping before a possible NUMA migration of @folio: take a folio
 * reference, note the access on the VMA, count the hinting fault (and flag
 * it in @flags when @page_nid is the local node), then return what
 * mpol_misplaced() says about the folio.
 */
int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
                      unsigned long addr, int page_nid, int *flags)
{
        folio_get(folio);

        /* Record the current PID accessing the VMA */
        vma_set_access_pid_bit(vmf->vma);

        count_vm_numa_event(NUMA_HINT_FAULTS);

        if (page_nid == numa_node_id()) {
                /* The folio already sits on the node we are running on. */
                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
                *flags |= TNF_FAULT_LOCAL;
        }

        return mpol_misplaced(folio, vmf, addr);
}

/*
 * Rewrite one PTE with the VMA's page protection, marked young and, when
 * @writable is set, writable.  The update goes through the
 * ptep_modify_prot_start()/ptep_modify_prot_commit() pair.
 */
static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
                                        unsigned long fault_addr, pte_t *fault_pte,
                                        bool writable)
{
        pte_t old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte);
        pte_t new_pte = pte_mkyoung(pte_modify(old_pte, vma->vm_page_prot));

        if (writable)
                new_pte = pte_mkwrite(new_pte, vma);

        ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, new_pte);
        update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1);
}

/*
 * Restore the PTEs that map @folio around the faulting address.
 *
 * @fault_pte is the PTE value for vmf->address; its pfn offset into @folio
 * gives the address where the folio mapping would start.  The walk is
 * limited to the VMA and to the PMD_SIZE-aligned window containing the
 * fault address, and only touches entries that are present, protnone and
 * still point into @folio.
 *
 * With @ignore_writable set every entry is rebuilt non-writable; otherwise
 * writability is decided per entry, allowing an upgrade when
 * @pte_write_upgrade is set and can_change_pte_writable() agrees.
 */
static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
                                       struct folio *folio, pte_t fault_pte,
                                       bool ignore_writable, bool pte_write_upgrade)
{
        /* Index of the faulting page within the folio. */
        int nr = pte_pfn(fault_pte) - folio_pfn(folio);
        unsigned long start, end, addr = vmf->address;
        unsigned long addr_start = addr - (nr << PAGE_SHIFT);
        unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE);
        pte_t *start_ptep;

        /* Stay within the VMA and within the page table. */
        start = max3(addr_start, pt_start, vma->vm_start);
        end = min3(addr_start + folio_size(folio), pt_start + PMD_SIZE,
                   vma->vm_end);
        /* Step back from the faulting PTE to the PTE for 'start'. */
        start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT);

        /* Restore all PTEs' mapping of the large folio */
        for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) {
                pte_t ptent = ptep_get(start_ptep);
                bool writable = false;

                if (!pte_present(ptent) || !pte_protnone(ptent))
                        continue;

                /* Skip entries that no longer map a page of this folio. */
                if (pfn_folio(pte_pfn(ptent)) != folio)
                        continue;

                if (!ignore_writable) {
                        ptent = pte_modify(ptent, vma->vm_page_prot);
                        writable = pte_write(ptent);
                        if (!writable && pte_write_upgrade &&
                            can_change_pte_writable(vma, addr, ptent))
                                writable = true;
                }

                numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable);
        }
}

/*
 * Handle a fault on a present PTE that handle_pte_fault() found to be
 * pte_protnone() in an accessible VMA (a NUMA hinting fault).
 *
 * Under ptl the PTE is re-validated against vmf->orig_pte.  If the folio
 * should move to another node, ptl is dropped and migration is attempted;
 * otherwise (or when migration fails and the PTE is still unchanged) the
 * mapping is made accessible again in place.  The fault is reported through
 * task_numa_fault() whenever a folio node was determined.
 *
 * Always returns 0.
 */
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = NULL;
        int nid = NUMA_NO_NODE;
        bool writable = false, ignore_writable = false;
        bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
        /* Only read at 'out' when nid was set, i.e. after being assigned. */
        int last_cpupid;
        int target_nid;
        pte_t pte, old_pte;
        int flags = 0, nr_pages;

        /*
         * The pte cannot be used safely until we verify, while holding the page
         * table lock, that its contents have not changed during fault handling.
         */
        spin_lock(vmf->ptl);
        /* Read the live PTE from the page tables: */
        old_pte = ptep_get(vmf->pte);

        if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                goto out;
        }

        pte = pte_modify(old_pte, vma->vm_page_prot);

        /*
         * Detect now whether the PTE could be writable; this information
         * is only valid while holding the PT lock.
         */
        writable = pte_write(pte);
        if (!writable && pte_write_upgrade &&
            can_change_pte_writable(vma, vmf->address, pte))
                writable = true;

        folio = vm_normal_folio(vma, vmf->address, pte);
        if (!folio || folio_is_zone_device(folio))
                goto out_map;

        /*
         * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
         * much anyway since they can be in shared cache state. This misses
         * the case where a mapping is writable but the process never writes
         * to it but pte_write gets cleared during protection updates and
         * pte_dirty has unpredictable behaviour between PTE scan updates,
         * background writeback, dirty balancing and application behaviour.
         */
        if (!writable)
                flags |= TNF_NO_GROUP;

        /*
         * Flag if the folio is shared between multiple address spaces. This
         * is later used when determining whether to group tasks together
         */
        if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
                flags |= TNF_SHARED;

        nid = folio_nid(folio);
        nr_pages = folio_nr_pages(folio);
        /*
         * For memory tiering mode, cpupid of slow memory page is used
         * to record page access time.  So use default value.
         */
        if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
            !node_is_toptier(nid))
                last_cpupid = (-1 & LAST_CPUPID_MASK);
        else
                last_cpupid = folio_last_cpupid(folio);
        /* Takes a folio reference; dropped below if we do not migrate. */
        target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags);
        if (target_nid == NUMA_NO_NODE) {
                folio_put(folio);
                goto out_map;
        }
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        /*
         * ptl is dropped, so the writability computed above is stale (see
         * the comment where it was computed): rebuild non-writable if we
         * end up in out_map.
         */
        writable = false;
        ignore_writable = true;

        /* Migrate to the requested node */
        if (migrate_misplaced_folio(folio, vma, target_nid)) {
                nid = target_nid;
                flags |= TNF_MIGRATED;
        } else {
                flags |= TNF_MIGRATE_FAIL;
                /* Re-take ptl and only restore the PTE if it is unchanged. */
                vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                               vmf->address, &vmf->ptl);
                if (unlikely(!vmf->pte))
                        goto out;
                if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                        goto out;
                }
                goto out_map;
        }

out:
        if (nid != NUMA_NO_NODE)
                task_numa_fault(last_cpupid, nid, nr_pages, flags);
        return 0;
out_map:
        /*
         * Make it present again, depending on how arch implements
         * non-accessible ptes, some can allow access by kernel mode.
         */
        if (folio && folio_test_large(folio))
                numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable,
                                           pte_write_upgrade);
        else
                numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
                                            writable);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        goto out;
}

/*
 * Try to satisfy the fault with a huge PMD: anonymous VMAs go to
 * do_huge_pmd_anonymous_page(), other VMAs to their ->huge_fault() method
 * if they have one.  Returns VM_FAULT_FALLBACK when neither applies.
 */
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;

        if (vma_is_anonymous(vma))
                return do_huge_pmd_anonymous_page(vmf);

        if (!vma->vm_ops->huge_fault)
                return VM_FAULT_FALLBACK;

        return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
}

/*
 * Write-protect (or unshare) fault on a huge PMD.
 *
 * Anonymous VMAs are handled by do_huge_pmd_wp_page(), unless userfaultfd
 * write-protection is armed on the PMD: then the fault is passed to
 * handle_userfault(), or, in async mode, the PMD is split.  Shared file
 * VMAs get a chance through ->huge_fault().  Whatever is left is split and
 * reported as VM_FAULT_FALLBACK so it gets handled at PTE level.
 *
 * `inline' is required to avoid gcc 4.1.2 build error
 */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;

        if (vma_is_anonymous(vma)) {
                if (unlikely(unshare) ||
                    !userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd))
                        return do_huge_pmd_wp_page(vmf);

                if (!userfaultfd_wp_async(vmf->vma))
                        return handle_userfault(vmf, VM_UFFD_WP);

                /* Async uffd-wp: fall through to the split below. */
        } else if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
                   vma->vm_ops->huge_fault) {
                vm_fault_t ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);

                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        }

        /* COW or write-notify handled on pte level: split pmd. */
        __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);

        return VM_FAULT_FALLBACK;
}

/*
 * Try to satisfy the fault with a huge PUD through the VMA's ->huge_fault()
 * method.  Returns VM_FAULT_FALLBACK when PUD-sized THP is not configured,
 * the VMA is anonymous, or there is no ->huge_fault().
 */
static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                        \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        struct vm_area_struct *vma = vmf->vma;

        /* No support for anonymous transparent PUD pages yet */
        if (!vma_is_anonymous(vma) && vma->vm_ops->huge_fault)
                return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
        return VM_FAULT_FALLBACK;
}

/*
 * Write-protect fault on a huge PUD.  Shared file VMAs get a chance through
 * ->huge_fault(); everything else, including a FALLBACK answer from it, has
 * the PUD split and is reported as VM_FAULT_FALLBACK.  Without PUD-sized
 * THP support this only returns VM_FAULT_FALLBACK.
 *
 * @orig_pud is not used here.
 */
static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                        \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        struct vm_area_struct *vma = vmf->vma;

        /* No support for anonymous transparent PUD pages yet */
        if (!vma_is_anonymous(vma) &&
            (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
            vma->vm_ops->huge_fault) {
                vm_fault_t ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);

                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        }

        /* COW or write-notify not handled on PUD level: split pud.*/
        __split_huge_pud(vma, vmf->pud, vmf->address);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
        return VM_FAULT_FALLBACK;
}

/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_lock may have been released depending on flags and our return value.
 * See filemap_fault() and __folio_lock_or_retry().
 *
 * Dispatch, in order: no PTE -> do_pte_missing(); non-present PTE ->
 * do_swap_page(); protnone PTE in an accessible VMA -> do_numa_page();
 * write/unshare fault on a read-only PTE -> do_wp_page(); otherwise the
 * accessed/dirty bits are updated in place and 0 is returned.
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
        pte_t entry;

        if (unlikely(pmd_none(*vmf->pmd))) {
                /*
                 * Leave __pte_alloc() until later: because vm_ops->fault may
                 * want to allocate huge page, and if we expose page table
                 * for an instant, it will be difficult to retract from
                 * concurrent faults and from rmap lookups.
                 */
                vmf->pte = NULL;
                vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
        } else {
                /*
                 * A regular pmd is established and it can't morph into a huge
                 * pmd by anon khugepaged, since that takes mmap_lock in write
                 * mode; but shmem or file collapse to THP could still morph
                 * it into a huge pmd: just retry later if so.
                 */
                vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
                                                 vmf->address, &vmf->ptl);
                if (unlikely(!vmf->pte))
                        return 0;
                /* Lockless snapshot; re-validated under ptl before use. */
                vmf->orig_pte = ptep_get_lockless(vmf->pte);
                vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;

                /* An empty PTE is handled like a missing page table. */
                if (pte_none(vmf->orig_pte)) {
                        pte_unmap(vmf->pte);
                        vmf->pte = NULL;
                }
        }

        if (!vmf->pte)
                return do_pte_missing(vmf);

        if (!pte_present(vmf->orig_pte))
                return do_swap_page(vmf);

        if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
                return do_numa_page(vmf);

        spin_lock(vmf->ptl);
        entry = vmf->orig_pte;
        /* The PTE changed since the lockless read: nothing left to do here. */
        if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
                update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
                goto unlock;
        }
        if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
                /* do_wp_page() is entered with ptl held and returns for us. */
                if (!pte_write(entry))
                        return do_wp_page(vmf);
                else if (likely(vmf->flags & FAULT_FLAG_WRITE))
                        entry = pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
        if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
                                vmf->flags & FAULT_FLAG_WRITE)) {
                update_mmu_cache_range(vmf, vmf->vma, vmf->address,
                                vmf->pte, 1);
        } else {
                /* Skip spurious TLB flush for retried page fault */
                if (vmf->flags & FAULT_FLAG_TRIED)
                        goto unlock;
                /*
                 * This is needed only for protection faults but the arch code
                 * is not yet telling us if this is a protection fault or not.
                 * This still avoids useless tlb flushes for .text page faults
                 * with threads.
                 */
                if (vmf->flags & FAULT_FLAG_WRITE)
                        flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
                                                     vmf->pte);
        }
unlock:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return 0;
}

/*
 * On entry, we hold either the VMA lock or the mmap_lock
 * (FAULT_FLAG_VMA_LOCK tells you which).  If VM_FAULT_RETRY is set in
 * the result, the mmap_lock is not held on exit.  See filemap_fault()
 * and __folio_lock_or_retry().
 */
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags)
{
        /*
         * Fault descriptor handed to every lower-level handler.  .address is
         * the faulting address masked with PAGE_MASK, .real_address keeps
         * the unmasked value.
         */
        struct vm_fault vmf = {
                .vma = vma,
                .address = address & PAGE_MASK,
                .real_address = address,
                .flags = flags,
                .pgoff = linear_page_index(vma, address),
                .gfp_mask = __get_fault_gfp_mask(vma),
        };
        struct mm_struct *mm = vma->vm_mm;
        unsigned long vm_flags = vma->vm_flags;
        pgd_t *pgd;
        p4d_t *p4d;
        vm_fault_t ret;

        /* Walk down from the PGD, allocating the P4D and PUD levels on demand. */
        pgd = pgd_offset(mm, address);
        p4d = p4d_alloc(mm, pgd, address);
        if (!p4d)
                return VM_FAULT_OOM;

        vmf.pud = pud_alloc(mm, p4d, address);
        if (!vmf.pud)
                return VM_FAULT_OOM;
retry_pud:
        /*
         * Empty PUD and a PUD-order huge page is allowed for this VMA: try
         * to create one.  Any result without VM_FAULT_FALLBACK is final;
         * with it we continue below with the PMD level.
         */
        if (pud_none(*vmf.pud) &&
            thp_vma_allowable_order(vma, vm_flags,
                                TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
                ret = create_huge_pud(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
                /* All checks below use this one snapshot of the PUD entry. */
                pud_t orig_pud = *vmf.pud;

                /*
                 * NOTE(review): presumably keeps the compiler from
                 * re-reading *vmf.pud after the snapshot - confirm.
                 */
                barrier();
                if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {

                        /*
                         * TODO once we support anonymous PUDs: NUMA case and
                         * FAULT_FLAG_UNSHARE handling.
                         */
                        if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) {
                                /* Write to a non-writable huge PUD. */
                                ret = wp_huge_pud(&vmf, orig_pud);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
                                /* No write-protect work: the fault ends here. */
                                huge_pud_set_accessed(&vmf, orig_pud);
                                return 0;
                        }
                }
        }

        vmf.pmd = pmd_alloc(mm, vmf.pud, address);
        if (!vmf.pmd)
                return VM_FAULT_OOM;

        /* Huge pud page fault raced with pmd_alloc? */
        if (pud_trans_unstable(vmf.pud))
                goto retry_pud;

        /* Same scheme as for the PUD above, one level down. */
        if (pmd_none(*vmf.pmd) &&
            thp_vma_allowable_order(vma, vm_flags,
                                TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) {
                ret = create_huge_pmd(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
                /* Lockless snapshot; later checks use vmf.orig_pmd only. */
                vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);

                if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
                        VM_BUG_ON(thp_migration_supported() &&
                                          !is_pmd_migration_entry(vmf.orig_pmd));
                        /*
                         * Wait for a migration entry, then report success.
                         * NOTE(review): presumably the access is simply
                         * retried by the caller afterwards - confirm.
                         */
                        if (is_pmd_migration_entry(vmf.orig_pmd))
                                pmd_migration_entry_wait(mm, vmf.pmd);
                        return 0;
                }
                if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
                        /* protnone entry in an accessible VMA: NUMA path. */
                        if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
                                return do_huge_pmd_numa_page(&vmf);

                        if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
                            !pmd_write(vmf.orig_pmd)) {
                                /* Write/unshare on a non-writable huge PMD. */
                                ret = wp_huge_pmd(&vmf);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
                                /* No write-protect work: the fault ends here. */
                                huge_pmd_set_accessed(&vmf);
                                return 0;
                        }
                }
        }

        /* Nothing above resolved the fault: handle it at the PTE level. */
        return handle_pte_fault(&vmf);
}

/**
 * mm_account_fault - Do page fault accounting
 * @mm: mm from which memcg should be extracted. It can be NULL.
 * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
 *        of perf event counters, but we'll still do the per-task accounting to
 *        the task who triggered this page fault.
 * @address: the faulted address.
 * @flags: the fault flags.
 * @ret: the fault retcode.
 *
 * This will take care of most of the page fault accounting.  Meanwhile, it
 * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
 * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
 * still be in per-arch page fault handlers at the entry of page fault.
 */
static inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs,
                                    unsigned long address, unsigned int flags,
                                    vm_fault_t ret)
{
        /* A fault that is going to be retried gets accounted when it completes. */
        if (ret & VM_FAULT_RETRY)
                return;

        /*
         * PGFAULT (global and memcg) deliberately includes failed faults,
         * matching what older kernels reported.  The perf counters further
         * down do not.
         */
        count_vm_event(PGFAULT);
        count_memcg_event_mm(mm, PGFAULT);

        /*
         * Everything below only counts faults that were resolved; an
         * erroneous result (e.g. an invalid address) stops here.  This is
         * therefore not a count of hardware page faults.
         */
        if (ret & VM_FAULT_ERROR)
                return;

        /*
         * A fault is "major" when the handler said so (VM_FAULT_MAJOR) or
         * when this is a retry, i.e. an earlier attempt could not complete
         * right away.  The per-task counter is always bumped; the perf
         * software event is only emitted when the caller supplied regs
         * (a NULL regs skips the perf side).
         */
        if ((ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED)) {
                current->maj_flt++;
                if (regs)
                        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs,
                                      address);
        } else {
                current->min_flt++;
                if (regs)
                        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs,
                                      address);
        }
}

#ifdef CONFIG_LRU_GEN
/*
 * Record on the current task whether the fault being entered is on a VMA
 * whose accesses carry recency; the LRU algorithm only applies to those.
 */
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
        bool has_recency = vma_has_recency(vma);

        current->in_lru_fault = has_recency;
}

/* Undo lru_gen_enter_fault(): the current task is no longer in a fault. */
static void lru_gen_exit_fault(void)
{
        current->in_lru_fault = false;
}
#else /* !CONFIG_LRU_GEN */
/* Without CONFIG_LRU_GEN both hooks are empty. */
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
}

static void lru_gen_exit_fault(void)
{
}
#endif /* CONFIG_LRU_GEN */

static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
                                       unsigned int *flags)
{
        if (unlikely(*flags & FAULT_FLAG_UNSHARE)) {
                if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE))
                        return VM_FAULT_SIGSEGV;
                /*
                 * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's
                 * just treat it like an ordinary read-fault otherwise.
                 */
                if (!is_cow_mapping(vma->vm_flags))
                        *flags &= ~FAULT_FLAG_UNSHARE;
        } else if (*flags & FAULT_FLAG_WRITE) {
                /* Write faults on read-only mappings are impossible ... */
                if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)))
                        return VM_FAULT_SIGSEGV;
                /* ... and FOLL_FORCE only applies to COW mappings. */
                if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) &&
                                 !is_cow_mapping(vma->vm_flags)))
                        return VM_FAULT_SIGSEGV;
        }
#ifdef CONFIG_PER_VMA_LOCK
        /*
         * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of
         * the assumption that lock is dropped on VM_FAULT_RETRY.
         */
        if (WARN_ON_ONCE((*flags &
                        (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) ==
                        (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)))
                return VM_FAULT_SIGSEGV;
#endif

        return 0;
}

/*
 * By the time we get here, we already hold either the VMA lock or the
 * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which).
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __folio_lock_or_retry().
 */
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                           unsigned int flags, struct pt_regs *regs)
{
        /* If the fault handler drops the mmap_lock, vma may be freed */
        struct mm_struct *mm = vma->vm_mm;
        vm_fault_t ret;

        __set_current_state(TASK_RUNNING);

        /* Reject impossible flag combinations; a failure is still accounted. */
        ret = sanitize_fault_flags(vma, &flags);
        if (ret)
                goto out;

        /* Architecture-level veto of this access (write/exec/remote). */
        if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
                                            flags & FAULT_FLAG_INSTRUCTION,
                                            flags & FAULT_FLAG_REMOTE)) {
                ret = VM_FAULT_SIGSEGV;
                goto out;
        }

        /*
         * Enable the memcg OOM handling for faults triggered in user
         * space.  Kernel faults are handled more gracefully.
         */
        if (flags & FAULT_FLAG_USER)
                mem_cgroup_enter_user_fault();

        lru_gen_enter_fault(vma);

        /* hugetlb VMAs have their own fault handler. */
        if (unlikely(is_vm_hugetlb_page(vma)))
                ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
        else
                ret = __handle_mm_fault(vma, address, flags);

        /* From here on only mm is used; vma is not touched again. */
        lru_gen_exit_fault();

        if (flags & FAULT_FLAG_USER) {
                mem_cgroup_exit_user_fault();
                /*
                 * The task may have entered a memcg OOM situation but
                 * if the allocation error was handled gracefully (no
                 * VM_FAULT_OOM), there is no need to kill anything.
                 * Just clean up the OOM state peacefully.
                 */
                if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
                        mem_cgroup_oom_synchronize(false);
        }
out:
        /* Accounting runs for both the early-error and the handled paths. */
        mm_account_fault(mm, regs, address, flags, ret);

        return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

/*
 * Take mmap_lock for reading on behalf of a page fault.
 *
 * Returns true with the lock held for reading.  Returns false, without
 * the lock, when a kernel-mode fault comes from an instruction that has no
 * exception-table entry, or when the killable lock attempt fails.
 */
static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
        if (unlikely(!mmap_read_trylock(mm))) {
                /*
                 * Contended.  For a kernel-mode fault only block on the
                 * lock if the faulting instruction is listed in the
                 * exception tables.
                 */
                if (regs && !user_mode(regs) &&
                    !search_exception_tables(exception_ip(regs)))
                        return false;

                return mmap_read_lock_killable(mm) == 0;
        }

        return true;
}

/*
 * Placeholder for an atomic read->write upgrade of mmap_lock.  It always
 * reports failure, so callers must take their drop-and-relock fallback.
 */
static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
        /*
         * We don't have this operation yet.
         *
         * It should be easy enough to do: it's basically a
         *    atomic_long_try_cmpxchg_acquire()
         * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
         * it also needs the proper lockdep magic etc.
         */
        return false;
}

/*
 * Swap a held mmap read lock for the write lock.
 *
 * The read lock is dropped unconditionally first, so this is not atomic.
 * Returns true with the write lock held; returns false with no lock held
 * when a kernel-mode fault comes from an instruction without an
 * exception-table entry, or when the killable write lock fails.
 */
static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
        mmap_read_unlock(mm);

        if (regs && !user_mode(regs) &&
            !search_exception_tables(exception_ip(regs)))
                return false;

        return mmap_write_lock_killable(mm) == 0;
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                        unsigned long addr, struct pt_regs *regs)
{
        struct vm_area_struct *vma;

        /* On failure no lock is held, so there is nothing to undo. */
        if (!get_mmap_lock_carefully(mm, regs))
                return NULL;

        /* Common case: addr is inside the vma; return with the read lock held. */
        vma = find_vma(mm, addr);
        if (likely(vma && (vma->vm_start <= addr)))
                return vma;

        /*
         * Well, dang. We might still be successful, but only
         * if we can extend a vma to do so.
         */
        if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
                mmap_read_unlock(mm);
                return NULL;
        }

        /*
         * We can try to upgrade the mmap lock atomically,
         * in which case we can continue to use the vma
         * we already looked up.
         *
         * Otherwise we'll have to drop the mmap lock and
         * re-take it, and also look up the vma again,
         * re-checking it.
         */
        if (!mmap_upgrade_trylock(mm)) {
                /* On failure the read lock is already gone: just bail out. */
                if (!upgrade_mmap_lock_carefully(mm, regs))
                        return NULL;

                /* The lock was dropped in between, so redo lookup and checks. */
                vma = find_vma(mm, addr);
                if (!vma)
                        goto fail;
                if (vma->vm_start <= addr)
                        goto success;
                if (!(vma->vm_flags & VM_GROWSDOWN))
                        goto fail;
        }

        /* Write lock held: try to grow the vma down to cover addr. */
        if (expand_stack_locked(vma, addr))
                goto fail;

success:
        /* Return with a read lock, like the fast path above. */
        mmap_write_downgrade(mm);
        return vma;

fail:
        /* Only reached with the write lock held. */
        mmap_write_unlock(mm);
        return NULL;
}
#endif

#ifdef CONFIG_PER_VMA_LOCK
/*
 * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
 * stable and not isolated. If the VMA is not found or is being modified the
 * function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                                          unsigned long address)
{
        /* Walk state over mm->mm_mt for the single index @address. */
        MA_STATE(mas, &mm->mm_mt, address, address);
        struct vm_area_struct *vma;

        rcu_read_lock();
retry:
        vma = mas_walk(&mas);
        if (!vma)
                goto inval;

        /* Read-locking the VMA failed: abort without retrying. */
        if (!vma_start_read(vma))
                goto inval;

        /* Check since vm_start/vm_end might change before we lock the VMA */
        if (unlikely(address < vma->vm_start || address >= vma->vm_end))
                goto inval_end_read;

        /* Check if the VMA got isolated after we found it */
        if (vma->detached) {
                vma_end_read(vma);
                count_vm_vma_lock_event(VMA_LOCK_MISS);
                /* The area was replaced with another one */
                goto retry;
        }

        /* Success: the VMA read lock stays held; only RCU is released. */
        rcu_read_unlock();
        return vma;

inval_end_read:
        /* Drop the VMA read lock taken above before bailing out. */
        vma_end_read(vma);
inval:
        rcu_read_unlock();
        count_vm_vma_lock_event(VMA_LOCK_ABORT);
        return NULL;
}
#endif /* CONFIG_PER_VMA_LOCK */

#ifndef __PAGETABLE_P4D_FOLDED
/*
 * Allocate p4d page table.
 * We've already handled the fast-path in-line.
 */
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
        p4d_t *p4d_table;

        /* Allocate before taking the lock; may turn out to be unnecessary. */
        p4d_table = p4d_alloc_one(mm, address);
        if (!p4d_table)
                return -ENOMEM;

        spin_lock(&mm->page_table_lock);
        if (!pgd_present(*pgd)) {
                /* See comment in pmd_install() */
                smp_wmb();
                pgd_populate(mm, pgd, p4d_table);
        } else {
                /* Somebody else populated the entry first: discard ours. */
                p4d_free(mm, p4d_table);
        }
        spin_unlock(&mm->page_table_lock);

        /* Either way the pgd entry is populated now. */
        return 0;
}
#endif /* __PAGETABLE_P4D_FOLDED */

#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
{
        pud_t *pud_table;

        /* Allocate before taking the lock; may turn out to be unnecessary. */
        pud_table = pud_alloc_one(mm, address);
        if (!pud_table)
                return -ENOMEM;

        spin_lock(&mm->page_table_lock);
        if (p4d_present(*p4d)) {
                /* Somebody else populated the entry first: discard ours. */
                pud_free(mm, pud_table);
        } else {
                mm_inc_nr_puds(mm);
                /* See comment in pmd_install() */
                smp_wmb();
                p4d_populate(mm, p4d, pud_table);
        }
        spin_unlock(&mm->page_table_lock);

        /* Either way the p4d entry is populated now. */
        return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */

#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
        pmd_t *pmd_table = pmd_alloc_one(mm, address);
        spinlock_t *lock;

        if (!pmd_table)
                return -ENOMEM;

        /* The pud entry is guarded by the lock pud_lock() hands back. */
        lock = pud_lock(mm, pud);
        if (pud_present(*pud)) {
                /* Somebody else populated the entry first: discard ours. */
                pmd_free(mm, pmd_table);
        } else {
                mm_inc_nr_pmds(mm);
                /* See comment in pmd_install() */
                smp_wmb();
                pud_populate(mm, pud, pmd_table);
        }
        spin_unlock(lock);

        /* Either way the pud entry is populated now. */
        return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */

/**
 * follow_pte - look up PTE at a user virtual address
 * @vma: the memory mapping
 * @address: user virtual address
 * @ptepp: location to store found PTE
 * @ptlp: location to store the lock for the PTE
 *
 * On a successful return, the pointer to the PTE is stored in @ptepp;
 * the corresponding lock is taken and its location is stored in @ptlp.
 *
 * The contents of the PTE are only stable until @ptlp is released using
 * pte_unmap_unlock(). This function will fail if the PTE is non-present.
 * Present PTEs may include PTEs that map refcounted pages, such as
 * anonymous folios in COW mappings.
 *
 * Callers must be careful when relying on PTE content after
 * pte_unmap_unlock(). Especially if the PTE maps a refcounted page,
 * callers must protect against invalidation with MMU notifiers; otherwise
 * access to the PFN at a later point in time can trigger use-after-free.
 *
 * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
 * should be taken for read.
 *
 * This function must not be used to modify PTE content.
 *
 * Return: zero on success, -ve otherwise.
 */
int follow_pte(struct vm_area_struct *vma, unsigned long address,
               pte_t **ptepp, spinlock_t **ptlp)
{
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *ptep;

        mmap_assert_locked(mm);

        /* The address has to lie inside the vma that was passed in. */
        if (unlikely(address < vma->vm_start || address >= vma->vm_end))
                return -EINVAL;

        /* Only IO and raw PFN mappings may be walked this way. */
        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
                return -EINVAL;

        /* Descend level by level; an empty or bad entry ends the walk. */
        pgd = pgd_offset(mm, address);
        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
                return -EINVAL;

        p4d = p4d_offset(pgd, address);
        if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
                return -EINVAL;

        pud = pud_offset(p4d, address);
        if (pud_none(*pud) || unlikely(pud_bad(*pud)))
                return -EINVAL;

        pmd = pmd_offset(pud, address);
        VM_BUG_ON(pmd_trans_huge(*pmd));

        /* Map and lock the PTE; the lock is reported through @ptlp. */
        ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
        if (!ptep)
                return -EINVAL;

        if (pte_present(ptep_get(ptep))) {
                /* Success: return with the PTE mapped and locked. */
                *ptepp = ptep;
                return 0;
        }

        /* Non-present PTE: release what we took and fail. */
        pte_unmap_unlock(ptep, *ptlp);
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(follow_pte);

#ifdef CONFIG_HAVE_IOREMAP_PROT
/**
 * generic_access_phys - generic implementation for iomem mmap access
 * @vma: the vma to access
 * @addr: userspace address, not relative offset within @vma
 * @buf: buffer to read/write
 * @len: length of transfer
 * @write: set to FOLL_WRITE when writing, otherwise reading
 *
 * This is a generic implementation for &vm_operations_struct.access for an
 * iomem mapping. This callback is used by access_process_vm() when the @vma is
 * not page based.
 */
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                        void *buf, int len, int write)
{
        resource_size_t phys_addr;
        unsigned long prot = 0;
        void __iomem *maddr;
        pte_t *ptep, pte;
        spinlock_t *ptl;
        int offset = offset_in_page(addr);
        int ret = -EINVAL;

retry:
        /*
         * Snapshot the PTE under its lock, then drop the lock again so
         * that ioremap_prot() below runs without it.
         */
        if (follow_pte(vma, addr, &ptep, &ptl))
                return -EINVAL;
        pte = ptep_get(ptep);
        pte_unmap_unlock(ptep, ptl);

        /* Protection bits and physical address come from the snapshot. */
        prot = pgprot_val(pte_pgprot(pte));
        phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;

        /* Refuse to write through a PTE that is not writable. */
        if ((write & FOLL_WRITE) && !pte_write(pte))
                return -EINVAL;

        maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
        if (!maddr)
                return -ENOMEM;

        /* Re-take the PTE lock; if the PTE is gone, fail with -EINVAL. */
        if (follow_pte(vma, addr, &ptep, &ptl))
                goto out_unmap;

        /*
         * The PTE changed while it was unlocked: the temporary mapping
         * was built from stale data, so tear it down and start over.
         */
        if (!pte_same(pte, ptep_get(ptep))) {
                pte_unmap_unlock(ptep, ptl);
                iounmap(maddr);

                goto retry;
        }

        /* The copy itself runs with the PTE lock held. */
        if (write)
                memcpy_toio(maddr + offset, buf, len);
        else
                memcpy_fromio(buf, maddr + offset, len);
        ret = len;
        pte_unmap_unlock(ptep, ptl);
out_unmap:
        iounmap(maddr);

        /* @len on success, otherwise a negative errno. */
        return ret;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif

/*
 * Access another process' address space as given in mm.
 */
static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
                              void *buf, int len, unsigned int gup_flags)
{
        /* Start of the buffer; the return value is how far buf advanced. */
        void *old_buf = buf;
        int write = gup_flags & FOLL_WRITE;

        /* Interrupted while waiting for mmap_lock: nothing was transferred. */
        if (mmap_read_lock_killable(mm))
                return 0;

        /* Untag the address before looking up the VMA */
        addr = untagged_addr_remote(mm, addr);

        /*
         * Avoid triggering the temporary warning in __get_user_pages.
         * No unlock on this return: a failing expand_stack() has already
         * dropped mmap_lock (see the note in the loop below).
         */
        if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
                return 0;

        /* ignore errors, just check how much was successfully transferred */
        while (len) {
                int bytes, offset;
                void *maddr;
                struct vm_area_struct *vma = NULL;
                struct page *page = get_user_page_vma_remote(mm, addr,
                                                             gup_flags, &vma);

                if (IS_ERR(page)) {
                        /* We might need to expand the stack to access it */
                        vma = vma_lookup(mm, addr);
                        if (!vma) {
                                vma = expand_stack(mm, addr);

                                /* mmap_lock was dropped on failure */
                                if (!vma)
                                        return buf - old_buf;

                                /* Try again if stack expansion worked */
                                continue;
                        }

                        /*
                         * Check if this is a VM_IO | VM_PFNMAP VMA, which
                         * we can access using slightly different code.
                         */
                        bytes = 0;
#ifdef CONFIG_HAVE_IOREMAP_PROT
                        if (vma->vm_ops && vma->vm_ops->access)
                                bytes = vma->vm_ops->access(vma, addr, buf,
                                                            len, write);
#endif
                        /* No handler or it failed: stop with a short count. */
                        if (bytes <= 0)
                                break;
                } else {
                        /* Copy at most up to the end of the current page. */
                        bytes = len;
                        offset = addr & (PAGE_SIZE-1);
                        if (bytes > PAGE_SIZE-offset)
                                bytes = PAGE_SIZE-offset;

                        maddr = kmap_local_page(page);
                        if (write) {
                                copy_to_user_page(vma, page, addr,
                                                  maddr + offset, buf, bytes);
                                set_page_dirty_lock(page);
                        } else {
                                copy_from_user_page(vma, page, addr,
                                                    buf, maddr + offset, bytes);
                        }
                        unmap_and_put_page(page, maddr);
                }
                /* Advance past what was transferred in this round. */
                len -= bytes;
                buf += bytes;
                addr += bytes;
        }
        mmap_read_unlock(mm);

        return buf - old_buf;
}

/**
 * access_remote_vm - access another process' address space
 * @mm:                the mm_struct of the target address space
 * @addr:        start address to access
 * @buf:        source or destination buffer
 * @len:        number of bytes to transfer
 * @gup_flags:        flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from source to destination.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
                void *buf, int len, unsigned int gup_flags)
{
        /* Thin non-static wrapper; all the work is in __access_remote_vm(). */
        return __access_remote_vm(mm, addr, buf, len, gup_flags);
}

/*
 * Access another process' address space.
 * Source/target buffer must be kernel space,
 * Do not walk the page table directly, use get_user_pages
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
                void *buf, int len, unsigned int gup_flags)
{
        struct mm_struct *mm = get_task_mm(tsk);
        int copied;

        /* No mm to take a reference on: nothing can be transferred. */
        if (!mm)
                return 0;

        copied = __access_remote_vm(mm, addr, buf, len, gup_flags);

        /* Drop the reference taken by get_task_mm(). */
        mmput(mm);

        return copied;
}
EXPORT_SYMBOL_GPL(access_process_vm);

/*
 * Print the name of a VMA.
 */
void print_vma_addr(char *prefix, unsigned long ip)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        struct file *file;
        unsigned long file_off;

        /*
         * The caller may be in atomic context, so sleeping on mmap_lock is
         * not an option: try once and silently give up on contention.
         */
        if (!mmap_read_trylock(mm))
                return;

        /* Only file-backed VMAs are printed. */
        vma = vma_lookup(mm, ip);
        if (!vma || !vma->vm_file)
                goto unlock;

        file = vma->vm_file;
        /* Offset of ip within the vma, shifted by the vma's vm_pgoff. */
        file_off = ip - vma->vm_start + (vma->vm_pgoff << PAGE_SHIFT);

        printk("%s%pD[%lx,%lx+%lx]", prefix, file, file_off,
               vma->vm_start, vma->vm_end - vma->vm_start);
unlock:
        mmap_read_unlock(mm);
}

#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
/*
 * Debug annotation for code that may take a page fault.
 * @file/@line identify the call site and are passed on to __might_sleep().
 */
void __might_fault(const char *file, int line)
{
        /* Skip all checks while page faults are disabled. */
        if (pagefault_disabled())
                return;
        __might_sleep(file, line);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
        /* Annotate a possible read acquisition of mmap_lock, if there is an mm. */
        if (current->mm)
                might_lock_read(&current->mm->mmap_lock);
#endif
}
EXPORT_SYMBOL(__might_fault);
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Process all subpages of the specified huge page with the specified
 * operation.  The target subpage will be processed last to keep its
 * cache lines hot.
 */
static inline int process_huge_page(
        unsigned long addr_hint, unsigned int pages_per_huge_page,
        int (*process_subpage)(unsigned long addr, int idx, void *arg),
        void *arg)
{
        /*
         * n:    index of the target subpage (the one containing addr_hint)
         * base: first index of the window walked by the final loop
         * l:    half the size of that window
         */
        int i, n, base, l, ret;
        /* Start address of the huge page: addr_hint rounded down to its size. */
        unsigned long addr = addr_hint &
                ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);

        /* Process target subpage last to keep its cache lines hot */
        might_sleep();
        n = (addr_hint - addr) / PAGE_SIZE;
        if (2 * n <= pages_per_huge_page) {
                /* If target subpage in first half of huge page */
                base = 0;
                l = n;
                /* Process subpages at the end of huge page */
                for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
                        cond_resched();
                        ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
                        /* The first non-zero result aborts the whole walk. */
                        if (ret)
                                return ret;
                }
        } else {
                /* If target subpage in second half of huge page */
                base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
                l = pages_per_huge_page - n;
                /* Process subpages at the begin of huge page */
                for (i = 0; i < base; i++) {
                        cond_resched();
                        ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
                        if (ret)
                                return ret;
                }
        }
        /*
         * Process remaining subpages in left-right-left-right pattern
         * towards the target subpage
         */
        for (i = 0; i < l; i++) {
                /*
                 * The two indices close in from both ends of
                 * [base, base + 2 * l); the last right_idx is base + l,
                 * which equals n in both cases above.
                 */
                int left_idx = base + i;
                int right_idx = base + 2 * l - 1 - i;

                cond_resched();
                ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
                if (ret)
                        return ret;
                cond_resched();
                ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
                if (ret)
                        return ret;
        }
        return 0;
}

/*
 * Clear every subpage of a huge page in plain ascending order.
 * @addr is the user address of the first subpage; the scheduler gets a
 * chance to run before each subpage is cleared.
 */
static void clear_gigantic_page(struct page *page,
                                unsigned long addr,
                                unsigned int pages_per_huge_page)
{
        unsigned long vaddr = addr;
        int idx;

        might_sleep();
        for (idx = 0; idx < pages_per_huge_page; idx++, vaddr += PAGE_SIZE) {
                struct page *subpage = nth_page(page, idx);

                cond_resched();
                clear_user_highpage(subpage, vaddr);
        }
}

/*
 * process_huge_page() callback: clear subpage @idx of the huge page whose
 * first page is passed in @arg.  Never fails.
 */
static int clear_subpage(unsigned long addr, int idx, void *arg)
{
        struct page *first = arg;
        struct page *subpage = nth_page(first, idx);

        clear_user_highpage(subpage, addr);
        return 0;
}

/*
 * Clear a huge page made of @pages_per_huge_page subpages.  @addr_hint is
 * an address inside the huge page; for pages up to MAX_ORDER_NR_PAGES the
 * subpage containing it is cleared last (see process_huge_page()).
 */
void clear_huge_page(struct page *page,
                     unsigned long addr_hint, unsigned int pages_per_huge_page)
{
        unsigned long huge_size;

        if (likely(pages_per_huge_page <= MAX_ORDER_NR_PAGES)) {
                process_huge_page(addr_hint, pages_per_huge_page,
                                  clear_subpage, page);
                return;
        }

        /* Larger pages are cleared linearly from the aligned base address. */
        huge_size = (unsigned long)pages_per_huge_page << PAGE_SHIFT;
        clear_gigantic_page(page, addr_hint & ~(huge_size - 1),
                            pages_per_huge_page);
}

/*
 * Copy @pages_per_huge_page subpages from @src to @dst in ascending order.
 * @addr is the user address of the first subpage.
 *
 * Returns 0 on success.  If copying a subpage fails, the source pfn is
 * handed to memory_failure_queue() and -EHWPOISON is returned.
 */
static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
                                     unsigned long addr,
                                     struct vm_area_struct *vma,
                                     unsigned int pages_per_huge_page)
{
        unsigned long vaddr = addr;
        int idx;

        for (idx = 0; idx < pages_per_huge_page; idx++, vaddr += PAGE_SIZE) {
                struct page *to = folio_page(dst, idx);
                struct page *from = folio_page(src, idx);

                cond_resched();
                if (!copy_mc_user_highpage(to, from, vaddr, vma))
                        continue;

                memory_failure_queue(page_to_pfn(from), 0);
                return -EHWPOISON;
        }

        return 0;
}

/* Context handed to copy_subpage() through process_huge_page()'s @arg. */
struct copy_subpage_arg {
        /* First page of the destination; subpages are found via nth_page(). */
        struct page *dst;
        /* First page of the source; subpages are found via nth_page(). */
        struct page *src;
        /* VMA passed through to copy_mc_user_highpage(). */
        struct vm_area_struct *vma;
};

/*
 * process_huge_page() callback: copy subpage @idx from source to
 * destination as described by the struct copy_subpage_arg in @arg.
 *
 * Returns 0 on success.  On a failed copy the source pfn is handed to
 * memory_failure_queue() and -EHWPOISON is returned.
 */
static int copy_subpage(unsigned long addr, int idx, void *arg)
{
        struct copy_subpage_arg *ctx = arg;
        struct page *to = nth_page(ctx->dst, idx);
        struct page *from = nth_page(ctx->src, idx);

        if (!copy_mc_user_highpage(to, from, addr, ctx->vma))
                return 0;

        memory_failure_queue(page_to_pfn(from), 0);
        return -EHWPOISON;
}

int copy_user_large_folio(struct folio *dst, struct folio *src,
                          unsigned long addr_hint, struct vm_area_struct *vma)
{
        unsigned int pages_per_huge_page = folio_nr_pages(dst);
        unsigned long addr = addr_hint &
                ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
        struct copy_subpage_arg arg = {
                .dst = &dst->page,
                .src = &src->page,
                .vma = vma,
        };

        if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES))
                return copy_user_gigantic_page(dst, src, addr, vma,
                                               pages_per_huge_page);

        return process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
}

/*
 * Fill a folio from user memory, one page at a time.
 *
 * @dst_folio:       folio to fill
 * @usr_src:         user pointer to the start of the source data
 * @allow_pagefault: when false, each page copy is bracketed by
 *                   pagefault_disable()/pagefault_enable()
 *
 * Each subpage is mapped with kmap_local_page(), filled with
 * copy_from_user() and unmapped again. The loop stops at the first page for
 * which copy_from_user() returns non-zero. Fully copied pages are followed
 * by flush_dcache_page() and cond_resched().
 *
 * Returns the folio size in bytes minus the bytes accounted as copied,
 * treating copy_from_user()'s return value as the count of bytes it could
 * not copy; 0 therefore means the whole folio was filled.
 */
long copy_folio_from_user(struct folio *dst_folio,
                           const void __user *usr_src,
                           bool allow_pagefault)
{
        unsigned int nr_pages = folio_nr_pages(dst_folio);
        unsigned long bytes_left = nr_pages * PAGE_SIZE;
        unsigned long idx;

        for (idx = 0; idx < nr_pages; idx++) {
                struct page *subpage = folio_page(dst_folio, idx);
                void *kaddr = kmap_local_page(subpage);
                unsigned long uncopied;

                if (!allow_pagefault)
                        pagefault_disable();
                uncopied = copy_from_user(kaddr, usr_src + idx * PAGE_SIZE,
                                          PAGE_SIZE);
                if (!allow_pagefault)
                        pagefault_enable();
                kunmap_local(kaddr);

                /* Account for whatever part of this page did get copied. */
                bytes_left -= PAGE_SIZE - uncopied;
                if (uncopied)
                        break;

                flush_dcache_page(subpage);
                cond_resched();
        }

        return bytes_left;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS

/*
 * Slab cache of spinlock_t objects: created by ptlock_cache_init(), used by
 * ptlock_alloc() and ptlock_free() for ptdesc->ptl.
 */
static struct kmem_cache *page_ptl_cachep;

/*
 * Create the "page->ptl" slab cache, sized for one spinlock_t per object,
 * and publish it in page_ptl_cachep. SLAB_PANIC is requested, so the result
 * is not checked here.
 */
void __init ptlock_cache_init(void)
{
        struct kmem_cache *cache;

        cache = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
                                  SLAB_PANIC, NULL);
        page_ptl_cachep = cache;
}

/*
 * Allocate a spinlock from page_ptl_cachep (GFP_KERNEL) and attach it to
 * @ptdesc->ptl.
 *
 * Returns true on success. On allocation failure returns false and leaves
 * @ptdesc->ptl untouched.
 */
bool ptlock_alloc(struct ptdesc *ptdesc)
{
        spinlock_t *lock = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);

        if (lock) {
                ptdesc->ptl = lock;
                return true;
        }
        return false;
}

/*
 * Return the spinlock referenced by @ptdesc->ptl to page_ptl_cachep.
 * @ptdesc->ptl itself is not cleared here.
 */
void ptlock_free(struct ptdesc *ptdesc)
{
        kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
}
#endif

/*
 * Called before walking the page tables of @vma: for hugetlb VMAs take the
 * hugetlb VMA lock for reading; for any other VMA do nothing.
 * Paired with vma_pgtable_walk_end().
 */
void vma_pgtable_walk_begin(struct vm_area_struct *vma)
{
        if (!is_vm_hugetlb_page(vma))
                return;

        hugetlb_vma_lock_read(vma);
}

/*
 * Counterpart of vma_pgtable_walk_begin(): for hugetlb VMAs drop the read
 * side of the hugetlb VMA lock; for any other VMA do nothing.
 */
void vma_pgtable_walk_end(struct vm_area_struct *vma)
{
        if (!is_vm_hugetlb_page(vma))
                return;

        hugetlb_vma_unlock_read(vma);
}

































































    2 


    2 














    2 










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 */

#include <linux/types.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <net/netns/generic.h>
#include <net/route.h>
#include <net/ip.h>

#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
#endif
#include <net/netfilter/nf_conntrack_zones.h>

/*
 * Held by nf_defrag_ipv4_enable() and nf_defrag_ipv4_disable() around their
 * updates of net->nf.defrag_ipv4_users and the matching hook
 * (un)registration.
 */
static DEFINE_MUTEX(defrag4_mutex);

/*
 * Hand @skb to ip_defrag() with bottom halves disabled.
 *
 * Returns ip_defrag()'s result unchanged. When that result is 0,
 * skb->ignore_df is set before returning; on a non-zero result @skb is not
 * touched again here.
 */
static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
                                   u_int32_t user)
{
        int ret;

        local_bh_disable();
        ret = ip_defrag(net, skb, user);
        local_bh_enable();

        if (ret)
                return ret;

        skb->ignore_df = 1;
        return 0;
}

static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
                                              struct sk_buff *skb)
{
        u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        if (skb_nfct(skb)) {
                enum ip_conntrack_info ctinfo;
                const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

                zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
        }
#endif
        if (nf_bridge_in_prerouting(skb))
                return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id;

        if (hooknum == NF_INET_PRE_ROUTING)
                return IP_DEFRAG_CONNTRACK_IN + zone_id;
        else
                return IP_DEFRAG_CONNTRACK_OUT + zone_id;
}

/*
 * Netfilter hook registered through ipv4_defrag_ops: hand IPv4 fragments to
 * the defragmentation code.
 *
 * Returns NF_ACCEPT without defragmenting when
 *  - the skb belongs to a full PF_INET socket with the NODEFRAG bit set,
 *  - (conntrack without NAT) a non-template conntrack is already attached,
 *  - (conntrack) the skb is marked IP_CT_UNTRACKED, or
 *  - the IP header is not a fragment.
 * Otherwise the skb goes to nf_ct_ipv4_gather_frags(); a non-zero result
 * there yields NF_STOLEN (presumably because ip_defrag() has then taken
 * over the skb -- confirm against ip_defrag()), a zero result NF_ACCEPT.
 */
static unsigned int ipv4_conntrack_defrag(void *priv,
                                          struct sk_buff *skb,
                                          const struct nf_hook_state *state)
{
        struct sock *owner = skb->sk;
        enum ip_defrag_users user;

        if (owner && sk_fullsock(owner) && (owner->sk_family == PF_INET) &&
            inet_test_bit(NODEFRAG, owner))
                return NF_ACCEPT;

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#if !IS_ENABLED(CONFIG_NF_NAT)
        /* Already seen (loopback)?  Skip it; this must come before the
           fragment check. */
        if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
                return NF_ACCEPT;
#endif
        if (skb->_nfct == IP_CT_UNTRACKED)
                return NF_ACCEPT;
#endif
        /* Only fragments need gathering. */
        if (!ip_is_fragment(ip_hdr(skb)))
                return NF_ACCEPT;

        user = nf_ct_defrag_user(state->hook, skb);
        if (nf_ct_ipv4_gather_frags(state->net, skb, user))
                return NF_STOLEN;

        return NF_ACCEPT;
}

/*
 * Hook table (un)registered per network namespace by
 * nf_defrag_ipv4_enable(), nf_defrag_ipv4_disable() and defrag4_net_exit().
 * Both entries run ipv4_conntrack_defrag() at NF_IP_PRI_CONNTRACK_DEFRAG
 * priority.
 */
static const struct nf_hook_ops ipv4_defrag_ops[] = {
        /* IPv4 PRE_ROUTING hook */
        {
                .hook                = ipv4_conntrack_defrag,
                .pf                = NFPROTO_IPV4,
                .hooknum        = NF_INET_PRE_ROUTING,
                .priority        = NF_IP_PRI_CONNTRACK_DEFRAG,
        },
        /* IPv4 LOCAL_OUT hook */
        {
                .hook           = ipv4_conntrack_defrag,
                .pf             = NFPROTO_IPV4,
                .hooknum        = NF_INET_LOCAL_OUT,
                .priority       = NF_IP_PRI_CONNTRACK_DEFRAG,
        },
};

/*
 * pernet .exit handler (see defrag4_net_ops): if @net still has defrag
 * users recorded, unregister the ipv4_defrag_ops hooks from it and reset
 * the user count to zero. Does nothing when the count is already zero.
 */
static void __net_exit defrag4_net_exit(struct net *net)
{
        if (!net->nf.defrag_ipv4_users)
                return;

        nf_unregister_net_hooks(net, ipv4_defrag_ops,
                                ARRAY_SIZE(ipv4_defrag_ops));
        net->nf.defrag_ipv4_users = 0;
}

/*
 * Enable/disable entry points of this module; published through
 * nf_defrag_v4_hook by nf_defrag_init() and withdrawn by nf_defrag_fini().
 */
static const struct nf_defrag_hook defrag_hook = {
        .owner = THIS_MODULE,
        .enable = nf_defrag_ipv4_enable,
        .disable = nf_defrag_ipv4_disable,
};

/*
 * Per-network-namespace operations registered by nf_defrag_init(); only an
 * exit handler is provided.
 */
static struct pernet_operations defrag4_net_ops = {
        .exit = defrag4_net_exit,
};

/*
 * Module init: register the per-netns operations and, only if that
 * succeeded, publish defrag_hook through nf_defrag_v4_hook.
 *
 * Returns 0 on success or the error from register_pernet_subsys().
 */
static int __init nf_defrag_init(void)
{
        int ret = register_pernet_subsys(&defrag4_net_ops);

        if (!ret)
                rcu_assign_pointer(nf_defrag_v4_hook, &defrag_hook);

        return ret;
}

/*
 * Module exit: undo nf_defrag_init() in reverse order -- first withdraw the
 * published nf_defrag_v4_hook pointer, then unregister defrag4_net_ops
 * (whose .exit handler is defrag4_net_exit()).
 */
static void __exit nf_defrag_fini(void)
{
        rcu_assign_pointer(nf_defrag_v4_hook, NULL);
        unregister_pernet_subsys(&defrag4_net_ops);
}

/*
 * Take a reference on IPv4 defragmentation for @net.
 *
 * Under defrag4_mutex:
 *  - if the user count is already UINT_MAX, fail with -EOVERFLOW;
 *  - if there are existing users, just bump the count;
 *  - otherwise register the ipv4_defrag_ops hooks and, on success, set the
 *    count to 1.
 *
 * Returns 0 on success, -EOVERFLOW, or the error from
 * nf_register_net_hooks() (in which case the count stays 0).
 */
int nf_defrag_ipv4_enable(struct net *net)
{
        int ret = 0;

        mutex_lock(&defrag4_mutex);

        if (net->nf.defrag_ipv4_users == UINT_MAX) {
                ret = -EOVERFLOW;
        } else if (net->nf.defrag_ipv4_users) {
                net->nf.defrag_ipv4_users++;
        } else {
                ret = nf_register_net_hooks(net, ipv4_defrag_ops,
                                            ARRAY_SIZE(ipv4_defrag_ops));
                if (!ret)
                        net->nf.defrag_ipv4_users = 1;
        }

        mutex_unlock(&defrag4_mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);

/*
 * Drop a reference on IPv4 defragmentation for @net.
 *
 * Under defrag4_mutex: a zero user count is left alone; otherwise the count
 * is decremented and, when it reaches zero, the ipv4_defrag_ops hooks are
 * unregistered from @net.
 */
void nf_defrag_ipv4_disable(struct net *net)
{
        mutex_lock(&defrag4_mutex);

        if (net->nf.defrag_ipv4_users && --net->nf.defrag_ipv4_users == 0)
                nf_unregister_net_hooks(net, ipv4_defrag_ops,
                                        ARRAY_SIZE(ipv4_defrag_ops));

        mutex_unlock(&defrag4_mutex);
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv4_disable);

/* Module entry/exit points and metadata. */
module_init(nf_defrag_init);
module_exit(nf_defrag_fini);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("IPv4 defragmentation support");


















































































































































































































































































































































































































































































































































































































































































































































































































































































    2 


    2 
    2 

    1 
    1 






    2 





































































































































































































































































































































































































































































































































































    1 






    1 










    1 

    1 







    1 

    1 





    1 

























































































    1 





    1 
























































    1 









    1 

























































    1 
    1 



    1 


















    1 


















































    1 














    1 








    1 




















    1 






    1 




















    1 



    1 







































































    1 











    1 





    1 







    1 
    1 
    1 















    1 



    1 


























































    1 







    1 

    1 






















    1 









    1 




    1 









































































































































































































































































































































































    1 






    1 







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  TUN - Universal TUN/TAP device driver.
 *  Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
 *
 *  $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
 */

/*
 *  Changes:
 *
 *  Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
 *    Add TUNSETLINK ioctl to set the link encapsulation
 *
 *  Mark Smith <markzzzsmith@yahoo.com.au>
 *    Use eth_random_addr() for tap MAC address.
 *
 *  Harald Roelle <harald.roelle@ifi.lmu.de>  2004/04/20
 *    Fixes in packet dropping, queue length setting and queue wakeup.
 *    Increased default tx queue length.
 *    Added ethtool API.
 *    Minor cleanups
 *
 *  Daniel Podlejski <underley@underley.eu.org>
 *    Modifications for 2.3.99-pre5 kernel.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#define DRV_NAME        "tun"
#define DRV_VERSION        "1.6"
#define DRV_DESCRIPTION        "Universal TUN/TAP device driver"
#define DRV_COPYRIGHT        "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>"

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/major.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fcntl.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/miscdevice.h>
#include <linux/ethtool.h>
#include <linux/rtnetlink.h>
#include <linux/compat.h>
#include <linux/if.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/if_tun.h>
#include <linux/if_vlan.h>
#include <linux/crc32.h>
#include <linux/math.h>
#include <linux/nsproxy.h>
#include <linux/virtio_net.h>
#include <linux/rcupdate.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <net/xdp.h>
#include <net/ip_tunnels.h>
#include <linux/seq_file.h>
#include <linux/uio.h>
#include <linux/skb_array.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/mutex.h>
#include <linux/ieee802154.h>
#include <linux/if_ltalk.h>
#include <uapi/linux/if_fddi.h>
#include <uapi/linux/if_hippi.h>
#include <uapi/linux/if_fc.h>
#include <net/ax25.h>
#include <net/rose.h>
#include <net/6lowpan.h>
#include <net/rps.h>

#include <linux/uaccess.h>
#include <linux/proc_fs.h>

/* Forward declaration; the definition is not in this part of the file. */
static void tun_default_link_ksettings(struct net_device *dev,
                                       struct ethtool_link_ksettings *cmd);

/* Sum of the two standard skb padding constants.
 * NOTE(review): presumably the headroom reserved in front of received
 * data; the users are outside this part of the file - confirm.
 */
#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

/* TUN device flags */

/* IFF_ATTACH_QUEUE is never stored in device flags,
 * overload it to mean fasync when stored there.
 */
#define TUN_FASYNC        IFF_ATTACH_QUEUE
/* High bits in flags field are unused. */
#define TUN_VNET_LE     0x80000000
#define TUN_VNET_BE     0x40000000

/* The IFF_* bits that tun_net_init() copies from the user's ifreq flags
 * into tun->flags.
 */
#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
                      IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)

/* NOTE(review): users of this constant are outside this part of the file. */
#define GOODCOPY_LEN 128

/* Number of destination addresses that can be matched exactly. */
#define FLT_EXACT_COUNT 8
/* Destination-MAC filter consulted by check_filter()/run_filter().
 * update_filter() stores the first FLT_EXACT_COUNT addresses verbatim in
 * @addr and hashes any further (multicast) addresses into the 64-bit @mask.
 */
struct tap_filter {
        unsigned int    count;    /* Number of addrs. Zero means disabled */
        u32             mask[2];  /* Mask of the hashed addrs */
        unsigned char        addr[FLT_EXACT_COUNT][ETH_ALEN];
};

/* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal
 * to max number of VCPUs in guest. */
#define MAX_TAP_QUEUES 256
/* Upper bound on tun->flow_count; tun_flow_update() stops creating new
 * flow entries once it is reached.
 */
#define MAX_TAP_FLOWS  4096

/* Three seconds expressed in jiffies.
 * NOTE(review): presumably the default for tun->ageing_time; the assignment
 * is outside this part of the file - confirm.
 */
#define TUN_FLOW_EXPIRE (3 * HZ)

/* A tun_file connects an open character device to a tuntap netdevice. It
 * also contains all socket related structures (except sock_fprog and tap_filter)
 * to serve as one transmit queue for tuntap device. The sock_fprog and
 * tap_filter were kept in tun_struct since they were used for filtering for the
 * netdevice not for a specific queue (at least I didn't see the requirement for
 * this).
 *
 * RCU usage:
 * The tun_file and tun_struct are loosely coupled, the pointer from one to the
 * other can only be read while rcu_read_lock or rtnl_lock is held.
 */
struct tun_file {
        struct sock sk;
        struct socket socket;
        /* Device this queue is attached to; NULL once detached for good. */
        struct tun_struct __rcu *tun;
        struct fasync_struct *fasync;
        /* only used for fasync */
        unsigned int flags;
        union {
                /* Slot of this file in tun->tfiles[] while attached. */
                u16 queue_index;
                /* NOTE(review): users are outside this part of the file. */
                unsigned int ifindex;
        };
        struct napi_struct napi;
        /* Set by tun_napi_init(); gates the tun_napi_*() helpers. */
        bool napi_enabled;
        bool napi_frags_enabled;
        struct mutex napi_mutex;        /* Protects access to the above napi */
        /* Link in tun->disabled while the queue is disabled. */
        struct list_head next;
        /* Owning device while disabled (see tun_disable_queue()), else NULL. */
        struct tun_struct *detached;
        /* Packets queued for the reader; entries are skbs or XDP frames
         * (tun_ptr_free() tells them apart).
         */
        struct ptr_ring tx_ring;
        /* Registered in tun_attach(), unregistered on detach. */
        struct xdp_rxq_info xdp_rxq;
};

/* A page pointer paired with a counter.
 * NOTE(review): the users are outside this part of the file; presumably
 * @count tracks references batched against @page - confirm.
 */
struct tun_page {
        struct page *page;
        int count;
};

/* One entry of the per-device flow table (tun->flows[]), keyed by rxhash
 * and remembering which queue last carried that flow.
 */
struct tun_flow_entry {
        struct hlist_node hash_link;    /* link in tun->flows[bucket] */
        struct rcu_head rcu;            /* for kfree_rcu() in tun_flow_delete() */
        struct tun_struct *tun;

        u32 rxhash;                     /* lookup key, see tun_flow_find() */
        u32 rps_rxhash;                 /* hash passed to sock_rps_record_flow_hash() */
        int queue_index;                /* queue recorded by tun_flow_update() */
        /* jiffies of the last update; compared against ageing_time by
         * tun_flow_cleanup().
         */
        unsigned long updated ____cacheline_aligned_in_smp;
};

/* Number of hash buckets in tun->flows[], and the mask tun_hashfn() uses to
 * turn an rxhash into a bucket index.
 */
#define TUN_NUM_FLOW_ENTRIES 1024
#define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1)

/* A BPF program plus an rcu_head; tun_struct keeps its steering and filter
 * programs as RCU pointers to this wrapper.
 */
struct tun_prog {
        struct rcu_head rcu;
        struct bpf_prog *prog;
};

/* Per-netdevice private state.
 *
 * Since the socket was moved to tun_file, the socket filter, sndbuf and vnet
 * header size are kept here so that they can be restored when a file is
 * attached to a persistent device.
 */
struct tun_struct {
        /* Attached queues; slots [0, numqueues) are populated. */
        struct tun_file __rcu        *tfiles[MAX_TAP_QUEUES];
        unsigned int            numqueues;
        unsigned int                 flags;
        kuid_t                        owner;
        kgid_t                        group;

        struct net_device        *dev;
        netdev_features_t        set_features;
#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
                          NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4)

        int                        align;
        int                        vnet_hdr_sz;
        int                        sndbuf;
        struct tap_filter        txflt;
        struct sock_fprog        fprog;
        /* protected by rtnl lock */
        bool                        filter_attached;
        u32                        msg_enable;
        /* Taken around all updates of the flow table below. */
        spinlock_t lock;
        struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
        struct timer_list flow_gc_timer;
        /* Flow lifetime in jiffies, see tun_flow_cleanup(). */
        unsigned long ageing_time;
        /* Length of, and head of, the list of disabled queues. */
        unsigned int numdisabled;
        struct list_head disabled;
        void *security;
        /* Number of entries currently in flows[]. */
        u32 flow_count;
        u32 rx_batched;
        atomic_long_t rx_frame_errors;
        struct bpf_prog __rcu *xdp_prog;
        struct tun_prog __rcu *steering_prog;
        struct tun_prog __rcu *filter_prog;
        struct ethtool_link_ksettings link_ksettings;
        /* init args */
        struct file *file;
        struct ifreq *ifr;
};

/* A big-endian VLAN protocol / TCI pair.
 * NOTE(review): the users are outside this part of the file.
 */
struct veth {
        __be16 h_vlan_proto;
        __be16 h_vlan_TCI;
};

/* Forward declarations: both are called from tun_net_init() before their
 * definitions appear.
 */
static void tun_flow_init(struct tun_struct *tun);
static void tun_flow_uninit(struct tun_struct *tun);

static int tun_napi_receive(struct napi_struct *napi, int budget)
{
        struct tun_file *tfile = container_of(napi, struct tun_file, napi);
        struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
        struct sk_buff_head process_queue;
        struct sk_buff *skb;
        int received = 0;

        __skb_queue_head_init(&process_queue);

        spin_lock(&queue->lock);
        skb_queue_splice_tail_init(queue, &process_queue);
        spin_unlock(&queue->lock);

        while (received < budget && (skb = __skb_dequeue(&process_queue))) {
                napi_gro_receive(napi, skb);
                ++received;
        }

        if (!skb_queue_empty(&process_queue)) {
                spin_lock(&queue->lock);
                skb_queue_splice(&process_queue, queue);
                spin_unlock(&queue->lock);
        }

        return received;
}

/* NAPI poll callback: process up to @budget packets and complete the poll
 * when fewer than @budget were handled.
 */
static int tun_napi_poll(struct napi_struct *napi, int budget)
{
        unsigned int done = tun_napi_receive(napi, budget);

        if (done < budget)
                napi_complete_done(napi, done);

        return done;
}

/* Record the NAPI settings for @tfile and, when NAPI is requested, register
 * and enable its napi instance on @tun's device. Frags mode is only recorded
 * as enabled if NAPI itself is.
 */
static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
                          bool napi_en, bool napi_frags)
{
        tfile->napi_frags_enabled = napi_en && napi_frags;
        tfile->napi_enabled = napi_en;
        if (!napi_en)
                return;

        netif_napi_add_tx(tun->dev, &tfile->napi, tun_napi_poll);
        napi_enable(&tfile->napi);
}

/* Enable @tfile's napi instance; a no-op when NAPI is not in use. */
static void tun_napi_enable(struct tun_file *tfile)
{
        if (!tfile->napi_enabled)
                return;

        napi_enable(&tfile->napi);
}

/* Disable @tfile's napi instance; a no-op when NAPI is not in use. */
static void tun_napi_disable(struct tun_file *tfile)
{
        if (!tfile->napi_enabled)
                return;

        napi_disable(&tfile->napi);
}

/* Unregister @tfile's napi instance; a no-op when NAPI is not in use. */
static void tun_napi_del(struct tun_file *tfile)
{
        if (!tfile->napi_enabled)
                return;

        netif_napi_del(&tfile->napi);
}

/* Report whether NAPI frags mode was enabled for @tfile by tun_napi_init(). */
static bool tun_napi_frags_enabled(const struct tun_file *tfile)
{
        bool frags = tfile->napi_frags_enabled;

        return frags;
}

#ifdef CONFIG_TUN_VNET_CROSS_LE
/* Legacy endianness: TUN_VNET_BE forces big-endian, otherwise defer to the
 * virtio legacy default.
 */
static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
{
        if (tun->flags & TUN_VNET_BE)
                return false;

        return virtio_legacy_is_little_endian();
}

/* Copy the current TUN_VNET_BE setting (0 or 1) to user space. */
static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)
{
        int be = (tun->flags & TUN_VNET_BE) ? 1 : 0;

        return put_user(be, argp) ? -EFAULT : 0;
}

/* Set or clear TUN_VNET_BE according to the int read from user space. */
static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
{
        int be;

        if (get_user(be, argp))
                return -EFAULT;

        if (!be)
                tun->flags &= ~TUN_VNET_BE;
        else
                tun->flags |= TUN_VNET_BE;

        return 0;
}
#else
/* Without cross-endian support only the virtio legacy default applies and
 * the big-endian get/set requests are rejected with -EINVAL.
 */
static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
{
        return virtio_legacy_is_little_endian();
}

static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)
{
        return -EINVAL;
}

static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
{
        return -EINVAL;
}
#endif /* CONFIG_TUN_VNET_CROSS_LE */

static inline bool tun_is_little_endian(struct tun_struct *tun)
{
        return tun->flags & TUN_VNET_LE ||
                tun_legacy_is_little_endian(tun);
}

/* Convert a 16-bit virtio value to CPU order using @tun's endianness. */
static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val)
{
        bool le = tun_is_little_endian(tun);

        return __virtio16_to_cpu(le, val);
}

/* Convert a CPU-order 16-bit value to virtio order using @tun's endianness. */
static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val)
{
        bool le = tun_is_little_endian(tun);

        return __cpu_to_virtio16(le, val);
}

/* Map an rxhash to its bucket index in tun->flows[]. */
static inline u32 tun_hashfn(u32 rxhash)
{
        u32 bucket = rxhash & TUN_MASK_FLOW_ENTRIES;

        return bucket;
}

/* Walk the bucket @head (RCU list traversal) and return the entry whose
 * rxhash equals @rxhash, or NULL if there is none.
 */
static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash)
{
        struct tun_flow_entry *pos;

        hlist_for_each_entry_rcu(pos, head, hash_link) {
                if (pos->rxhash != rxhash)
                        continue;
                return pos;
        }

        return NULL;
}

/* Allocate (GFP_ATOMIC) a flow entry for @rxhash -> @queue_index, insert it
 * at the head of bucket @head and bump tun->flow_count.
 *
 * Returns the new entry, or NULL if the allocation failed.
 */
static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
                                              struct hlist_head *head,
                                              u32 rxhash, u16 queue_index)
{
        struct tun_flow_entry *flow;

        flow = kmalloc(sizeof(*flow), GFP_ATOMIC);
        if (!flow)
                return NULL;

        netif_info(tun, tx_queued, tun->dev,
                   "create flow: hash %u index %u\n",
                   rxhash, queue_index);

        flow->tun = tun;
        flow->rxhash = rxhash;
        flow->rps_rxhash = 0;
        flow->queue_index = queue_index;
        flow->updated = jiffies;

        hlist_add_head_rcu(&flow->hash_link, head);
        tun->flow_count++;

        return flow;
}

/* Unlink flow entry @e from its bucket, free it after an RCU grace period
 * and decrement tun->flow_count.
 */
static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
{
        netif_info(tun, tx_queued, tun->dev, "delete flow: hash %u index %u\n",
                   e->rxhash, e->queue_index);

        hlist_del_rcu(&e->hash_link);
        kfree_rcu(e, rcu);

        tun->flow_count--;
}

/* Remove every entry from the flow table, holding tun->lock (BH-safe). */
static void tun_flow_flush(struct tun_struct *tun)
{
        struct tun_flow_entry *flow;
        struct hlist_node *tmp;
        int bucket;

        spin_lock_bh(&tun->lock);
        for (bucket = 0; bucket < TUN_NUM_FLOW_ENTRIES; bucket++) {
                hlist_for_each_entry_safe(flow, tmp, &tun->flows[bucket],
                                          hash_link)
                        tun_flow_delete(tun, flow);
        }
        spin_unlock_bh(&tun->lock);
}

/* Remove every flow entry that points at @queue_index, holding tun->lock
 * (BH-safe).
 */
static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
{
        struct tun_flow_entry *flow;
        struct hlist_node *tmp;
        int bucket;

        spin_lock_bh(&tun->lock);
        for (bucket = 0; bucket < TUN_NUM_FLOW_ENTRIES; bucket++) {
                hlist_for_each_entry_safe(flow, tmp, &tun->flows[bucket],
                                          hash_link) {
                        if (flow->queue_index != queue_index)
                                continue;
                        tun_flow_delete(tun, flow);
                }
        }
        spin_unlock_bh(&tun->lock);
}

static void tun_flow_cleanup(struct timer_list *t)
{
        struct tun_struct *tun = from_timer(tun, t, flow_gc_timer);
        unsigned long delay = tun->ageing_time;
        unsigned long next_timer = jiffies + delay;
        unsigned long count = 0;
        int i;

        spin_lock(&tun->lock);
        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
                struct tun_flow_entry *e;
                struct hlist_node *n;

                hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
                        unsigned long this_timer;

                        this_timer = e->updated + delay;
                        if (time_before_eq(this_timer, jiffies)) {
                                tun_flow_delete(tun, e);
                                continue;
                        }
                        count++;
                        if (time_before(this_timer, next_timer))
                                next_timer = this_timer;
                }
        }

        if (count)
                mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer));
        spin_unlock(&tun->lock);
}

/* Record that the flow identified by @rxhash was last seen on @tfile's queue.
 *
 * An existing entry is refreshed locklessly under RCU. A missing entry is
 * created under tun->lock (re-checking for a racing insert and honouring
 * MAX_TAP_FLOWS), and the garbage-collection timer is armed if it is not
 * already pending.
 */
static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
                            struct tun_file *tfile)
{
        unsigned long delay = tun->ageing_time;
        u16 queue_index = tfile->queue_index;
        struct hlist_head *bucket = &tun->flows[tun_hashfn(rxhash)];
        struct tun_flow_entry *flow;

        rcu_read_lock();

        flow = tun_flow_find(bucket, rxhash);
        if (unlikely(!flow)) {
                spin_lock_bh(&tun->lock);

                if (!tun_flow_find(bucket, rxhash) &&
                    tun->flow_count < MAX_TAP_FLOWS)
                        tun_flow_create(tun, bucket, rxhash, queue_index);

                if (!timer_pending(&tun->flow_gc_timer))
                        mod_timer(&tun->flow_gc_timer,
                                  round_jiffies_up(jiffies + delay));

                spin_unlock_bh(&tun->lock);
        } else {
                /* TODO: keep queueing to old queue until it's empty? */
                if (READ_ONCE(flow->queue_index) != queue_index)
                        WRITE_ONCE(flow->queue_index, queue_index);
                /* Only dirty the cache line when the value changes. */
                if (flow->updated != jiffies)
                        flow->updated = jiffies;
                sock_rps_record_flow_hash(flow->rps_rxhash);
        }

        rcu_read_unlock();
}

/* Remember the hash seen in the stack receive path in the flow entry.
 * The store is skipped when the value is already current.
 */
static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
{
        if (likely(e->rps_rxhash == hash))
                return;

        e->rps_rxhash = hash;
}

/* We try to identify a flow through its rxhash. The reason that
 * we do not check rxq no. is because some cards(e.g 82599), chooses
 * the rxq based on the txq where the last packet of the flow comes. As
 * the userspace application move between processors, we may get a
 * different rxq no. here.
 *
 * A known flow is sent to the queue recorded in its entry; an unknown one is
 * spread over the current number of queues with reciprocal_scale().
 */
static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
        u32 numqueues = READ_ONCE(tun->numqueues);
        u32 hash = __skb_get_hash_symmetric(skb);
        struct tun_flow_entry *flow;

        flow = tun_flow_find(&tun->flows[tun_hashfn(hash)], hash);
        if (!flow)
                return reciprocal_scale(hash, numqueues);

        tun_flow_save_rps_rxhash(flow, hash);
        return flow->queue_index;
}

/* Pick a queue by running the steering BPF program, if one is set, and
 * reducing its (16-bit truncated) result modulo the number of queues.
 * Returns 0 when there are no queues or no program.
 */
static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
        u32 numqueues = READ_ONCE(tun->numqueues);
        struct tun_prog *steer;
        u16 verdict;

        if (!numqueues)
                return 0;

        steer = rcu_dereference(tun->steering_prog);
        verdict = steer ? bpf_prog_run_clear_cb(steer->prog, skb) : 0;

        return verdict % numqueues;
}

/* Queue selection entry point: use the steering program when one is
 * installed, otherwise the automatic flow-table based selection. Runs under
 * rcu_read_lock().
 */
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
                            struct net_device *sb_dev)
{
        struct tun_struct *tun = netdev_priv(dev);
        u16 queue;

        rcu_read_lock();
        queue = rcu_dereference(tun->steering_prog) ?
                        tun_ebpf_select_queue(tun, skb) :
                        tun_automq_select_queue(tun, skb);
        rcu_read_unlock();

        return queue;
}

static inline bool tun_not_capable(struct tun_struct *tun)
{
        const struct cred *cred = current_cred();
        struct net *net = dev_net(tun->dev);

        return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
                  (gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
                !ns_capable(net->user_ns, CAP_NET_ADMIN);
}

/* Tell the network stack that the device now has tun->numqueues active
 * TX and RX queues.
 */
static void tun_set_real_num_queues(struct tun_struct *tun)
{
        struct net_device *dev = tun->dev;

        netif_set_real_num_tx_queues(dev, tun->numqueues);
        netif_set_real_num_rx_queues(dev, tun->numqueues);
}

/* Park @tfile on @tun's disabled list and remember the owning device in
 * tfile->detached.
 */
static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile)
{
        list_add_tail(&tfile->next, &tun->disabled);
        tun->numdisabled++;
        tfile->detached = tun;
}

/* Undo tun_disable_queue(): take @tfile off the disabled list, clear
 * tfile->detached and return the device it was parked on.
 */
static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
{
        struct tun_struct *owner = tfile->detached;

        list_del_init(&tfile->next);
        tfile->detached = NULL;
        owner->numdisabled--;

        return owner;
}

/* Free one tx_ring entry, which is either an XDP frame or an skb.
 * NULL is accepted and ignored.
 */
void tun_ptr_free(void *ptr)
{
        if (!ptr)
                return;

        if (!tun_is_xdp_frame(ptr)) {
                __skb_array_destroy_skb(ptr);
                return;
        }

        xdp_return_frame(tun_ptr_to_xdp(ptr));
}
EXPORT_SYMBOL_GPL(tun_ptr_free);

static void tun_queue_purge(struct tun_file *tfile)
{
        void *ptr;

        while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
                tun_ptr_free(ptr);

        skb_queue_purge(&tfile->sk.sk_write_queue);
        skb_queue_purge(&tfile->sk.sk_error_queue);
}

/* Detach @tfile from its device.
 *
 * @clean: true for a final detach (the file is going away), false to merely
 *         disable the queue and park it on the device's disabled list.
 *
 * Uses rtnl_dereference(), so the caller is expected to hold RTNL
 * (tun_detach() does).
 */
static void __tun_detach(struct tun_file *tfile, bool clean)
{
        struct tun_file *ntfile;
        struct tun_struct *tun;

        tun = rtnl_dereference(tfile->tun);

        if (tun && clean) {
                /* A disabled queue already had its napi disabled. */
                if (!tfile->detached)
                        tun_napi_disable(tfile);
                tun_napi_del(tfile);
        }

        if (tun && !tfile->detached) {
                u16 index = tfile->queue_index;
                BUG_ON(index >= tun->numqueues);

                /* Keep tfiles[] dense: move the last queue into the slot
                 * being vacated and clear the last slot.
                 */
                rcu_assign_pointer(tun->tfiles[index],
                                   tun->tfiles[tun->numqueues - 1]);
                ntfile = rtnl_dereference(tun->tfiles[index]);
                ntfile->queue_index = index;
                ntfile->xdp_rxq.queue_index = index;
                rcu_assign_pointer(tun->tfiles[tun->numqueues - 1],
                                   NULL);

                --tun->numqueues;
                if (clean) {
                        RCU_INIT_POINTER(tfile->tun, NULL);
                        sock_put(&tfile->sk);
                } else {
                        tun_disable_queue(tun, tfile);
                        tun_napi_disable(tfile);
                }

                synchronize_net();
                /* NOTE(review): numqueues was already decremented above, so
                 * this passes the old queue count, which was never a valid
                 * queue index; confirm which index is intended here.
                 */
                tun_flow_delete_by_queue(tun, tun->numqueues + 1);
                /* Drop read queue */
                tun_queue_purge(tfile);
                tun_set_real_num_queues(tun);
        } else if (tfile->detached && clean) {
                /* Final detach of a queue that was only disabled. */
                tun = tun_enable_queue(tfile);
                sock_put(&tfile->sk);
        }

        if (clean) {
                /* Last queue gone: take the carrier down and, unless the
                 * device is persistent, unregister it.
                 */
                if (tun && tun->numqueues == 0 && tun->numdisabled == 0) {
                        netif_carrier_off(tun->dev);

                        if (!(tun->flags & IFF_PERSIST) &&
                            tun->dev->reg_state == NETREG_REGISTERED)
                                unregister_netdevice(tun->dev);
                }
                if (tun)
                        xdp_rxq_info_unreg(&tfile->xdp_rxq);
                ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
        }
}

/* Locked wrapper around __tun_detach(): takes RTNL, detaches @tfile, and
 * sends a netdev state-change notification if the file was attached to a
 * device. On a clean detach one more socket reference is dropped after RTNL
 * is released.
 */
static void tun_detach(struct tun_file *tfile, bool clean)
{
        struct net_device *dev = NULL;
        struct tun_struct *tun;

        rtnl_lock();
        tun = rtnl_dereference(tfile->tun);
        if (tun)
                dev = tun->dev;
        __tun_detach(tfile, clean);
        if (dev)
                netdev_state_change(dev);
        rtnl_unlock();

        if (!clean)
                return;

        sock_put(&tfile->sk);
}

/* Detach every queue, attached or disabled, from @dev. Called from
 * tun_net_uninit().
 *
 * Works in two passes separated by synchronize_net(): first every file is
 * marked shut down and unlinked from the device, then the per-file resources
 * are released.
 */
static void tun_detach_all(struct net_device *dev)
{
        struct tun_struct *tun = netdev_priv(dev);
        struct tun_file *tfile, *tmp;
        int i, n = tun->numqueues;

        /* Pass 1: stop NAPI, signal RCV_SHUTDOWN to readers and clear the
         * file -> device pointers. tfiles[] itself is left populated so
         * pass 2 can still walk it.
         */
        for (i = 0; i < n; i++) {
                tfile = rtnl_dereference(tun->tfiles[i]);
                BUG_ON(!tfile);
                tun_napi_disable(tfile);
                tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
                tfile->socket.sk->sk_data_ready(tfile->socket.sk);
                RCU_INIT_POINTER(tfile->tun, NULL);
                --tun->numqueues;
        }
        list_for_each_entry(tfile, &tun->disabled, next) {
                tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
                tfile->socket.sk->sk_data_ready(tfile->socket.sk);
                RCU_INIT_POINTER(tfile->tun, NULL);
        }
        BUG_ON(tun->numqueues != 0);

        synchronize_net();
        /* Pass 2: release napi, queued packets, XDP rxq info and the socket
         * reference of every file.
         */
        for (i = 0; i < n; i++) {
                tfile = rtnl_dereference(tun->tfiles[i]);
                tun_napi_del(tfile);
                /* Drop read queue */
                tun_queue_purge(tfile);
                xdp_rxq_info_unreg(&tfile->xdp_rxq);
                sock_put(&tfile->sk);
        }
        list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
                tun_napi_del(tfile);
                /* Also unlinks tfile from tun->disabled. */
                tun_enable_queue(tfile);
                tun_queue_purge(tfile);
                xdp_rxq_info_unreg(&tfile->xdp_rxq);
                sock_put(&tfile->sk);
        }
        BUG_ON(tun->numdisabled != 0);

        /* NOTE(review): presumably balances a module reference taken when
         * IFF_PERSIST was set; that code is outside this part of the file.
         */
        if (tun->flags & IFF_PERSIST)
                module_put(THIS_MODULE);
}

/* Attach the queue behind @file to @tun, either as a brand-new queue or by
 * re-enabling a previously disabled one (tfile->detached set).
 *
 * @skip_filter: do not re-attach the device's saved socket filter
 * @napi:        enable NAPI for a newly attached queue
 * @napi_frags:  enable NAPI frags mode (only effective together with @napi)
 * @publish_tun: also publish tfile->tun; tun_net_init() passes false
 *
 * Uses rtnl_dereference(), so the caller is expected to hold RTNL.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL  the file is already attached and not merely disabled
 *   -EBUSY   a single-queue device already has its queue
 *   -E2BIG   MAX_TAP_QUEUES queues (attached + disabled) already exist
 *   -ENOMEM  the tx ring could not be resized
 *   or whatever the security hook, sk_attach_filter() or XDP rxq
 *   registration returned.
 */
static int tun_attach(struct tun_struct *tun, struct file *file,
                      bool skip_filter, bool napi, bool napi_frags,
                      bool publish_tun)
{
        struct tun_file *tfile = file->private_data;
        struct net_device *dev = tun->dev;
        int err;

        err = security_tun_dev_attach(tfile->socket.sk, tun->security);
        if (err < 0)
                goto out;

        err = -EINVAL;
        if (rtnl_dereference(tfile->tun) && !tfile->detached)
                goto out;

        err = -EBUSY;
        if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1)
                goto out;

        err = -E2BIG;
        if (!tfile->detached &&
            tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
                goto out;

        err = 0;

        /* Re-attach the filter to persist device */
        if (!skip_filter && tun->filter_attached) {
                lock_sock(tfile->socket.sk);
                err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
                release_sock(tfile->socket.sk);
                /* Give up only if the filter could not be attached; on
                 * success carry on and actually attach the queue.
                 */
                if (err)
                        goto out;
        }

        /* A new queue gets its ring sized to the device's tx_queue_len; a
         * re-enabled queue keeps the ring it already has.
         */
        if (!tfile->detached &&
            ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len,
                            GFP_KERNEL, tun_ptr_free)) {
                err = -ENOMEM;
                goto out;
        }

        tfile->queue_index = tun->numqueues;
        tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;

        if (tfile->detached) {
                /* Re-attach detached tfile, updating XDP queue_index */
                WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq));

                if (tfile->xdp_rxq.queue_index != tfile->queue_index)
                        tfile->xdp_rxq.queue_index = tfile->queue_index;
        } else {
                /* Setup XDP RX-queue info, for new tfile getting attached */
                err = xdp_rxq_info_reg(&tfile->xdp_rxq,
                                       tun->dev, tfile->queue_index, 0);
                if (err < 0)
                        goto out;
                err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq,
                                                 MEM_TYPE_PAGE_SHARED, NULL);
                if (err < 0) {
                        xdp_rxq_info_unreg(&tfile->xdp_rxq);
                        goto out;
                }
                err = 0;
        }

        if (tfile->detached) {
                tun_enable_queue(tfile);
                tun_napi_enable(tfile);
        } else {
                /* Reference held on behalf of the device; dropped again on
                 * detach.
                 */
                sock_hold(&tfile->sk);
                tun_napi_init(tun, tfile, napi, napi_frags);
        }

        if (rtnl_dereference(tun->xdp_prog))
                sock_set_flag(&tfile->sk, SOCK_XDP);

        /* device is allowed to go away first, so no need to hold extra
         * refcnt.
         */

        /* Publish tfile->tun and tun->tfiles only after we've fully
         * initialized tfile; otherwise we risk using half-initialized
         * object.
         */
        if (publish_tun)
                rcu_assign_pointer(tfile->tun, tun);
        rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
        tun->numqueues++;
        tun_set_real_num_queues(tun);
out:
        return err;
}

/* Look up the device @tfile is attached to and take a reference on its
 * net_device. Returns NULL when the file is not attached. Pair with
 * tun_put().
 */
static struct tun_struct *tun_get(struct tun_file *tfile)
{
        struct tun_struct *tun;

        rcu_read_lock();
        tun = rcu_dereference(tfile->tun);
        if (!tun)
                goto unlock;
        dev_hold(tun->dev);
unlock:
        rcu_read_unlock();

        return tun;
}

/* Drop the net_device reference taken by tun_get(). */
static void tun_put(struct tun_struct *tun)
{
        struct net_device *dev = tun->dev;

        dev_put(dev);
}

/* TAP filtering */

/* Set the bit for @addr in the 64-bit hash @mask (two u32 words). The bit
 * number is the top six bits of the Ethernet CRC of the address.
 */
static void addr_hash_set(u32 *mask, const u8 *addr)
{
        u32 n = ether_crc(ETH_ALEN, addr) >> 26;

        /* Unsigned constant: bit 31 must not be shifted into a signed int. */
        mask[n >> 5] |= 1U << (n & 31);
}

/* Test the bit for @addr in the 64-bit hash @mask. Returns non-zero (the
 * masked bit itself, not normalized to 1) when the bit is set, 0 otherwise.
 */
static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
{
        u32 n = ether_crc(ETH_ALEN, addr) >> 26;

        /* Unsigned constant: bit 31 must not be shifted into a signed int. */
        return mask[n >> 5] & (1U << (n & 31));
}

/* Replace @filter with the address list supplied by user space.
 *
 * @arg points to a struct tun_filter header immediately followed by
 * uf.count Ethernet addresses. The first FLT_EXACT_COUNT addresses become
 * exact matches; the rest must be multicast and are hashed into the mask.
 *
 * Returns the number of exact filters installed, 0 when filtering ends up
 * disabled (count of zero, or a unicast address among the hashed ones), or a
 * negative errno if the user data could not be copied.
 */
static int update_filter(struct tap_filter *filter, void __user *arg)
{
        struct { u8 u[ETH_ALEN]; } *macs;
        struct tun_filter uf;
        int ret, bytes, i, nexact;

        if (copy_from_user(&uf, arg, sizeof(uf)))
                return -EFAULT;

        if (uf.count == 0) {
                /* Disabled */
                filter->count = 0;
                return 0;
        }

        bytes = ETH_ALEN * uf.count;
        macs = memdup_user(arg + sizeof(uf), bytes);
        if (IS_ERR(macs))
                return PTR_ERR(macs);

        /* No lock is held while the filter is rewritten. It is switched
         * off first, so the worst case is that a few unwanted packets get
         * through while the update is in progress.
         */
        filter->count = 0;
        wmb();

        /* The leading addresses are matched exactly. */
        for (i = 0; i < uf.count && i < FLT_EXACT_COUNT; i++)
                memcpy(filter->addr[i], macs[i].u, ETH_ALEN);

        nexact = i;

        /* Everything after that is hashed and must be multicast; a unicast
         * address here leaves the filter disabled.
         */
        memset(filter->mask, 0, sizeof(filter->mask));
        while (i < uf.count) {
                if (!is_multicast_ether_addr(macs[i].u)) {
                        ret = 0; /* no filter */
                        goto free_macs;
                }
                addr_hash_set(filter->mask, macs[i].u);
                i++;
        }

        /* ALLMULTI overrides the hash computed above with all ones. */
        if (uf.flags & TUN_FLT_ALLMULTI)
                memset(filter->mask, ~0, sizeof(filter->mask));

        /* Publish the new contents before switching the filter back on. */
        wmb();
        filter->count = nexact;

        /* Report how many exact filters are in place. */
        ret = nexact;
free_macs:
        kfree(macs);
        return ret;
}

/* Returns: 0 - drop, !=0 - accept */
static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
{
        /* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect
         * at this point. */
        struct ethhdr *eh = (struct ethhdr *) skb->data;
        int i;

        /* Exact match */
        for (i = 0; i < filter->count; i++)
                if (ether_addr_equal(eh->h_dest, filter->addr[i]))
                        return 1;

        /* Inexact match (multicast only) */
        if (is_multicast_ether_addr(eh->h_dest))
                return addr_hash_test(filter->mask, eh->h_dest);

        return 0;
}

/*
 * Decide whether @skb passes @filter; a filter with a zero count is
 * disabled and accepts everything.
 * Returns: 0 - drop, !=0 - accept
 */
static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
{
        return filter->count ? run_filter(filter, skb) : 1;
}

/* Network device part of the driver */

/* Forward declaration; the initialized definition is not in this part of
 * the file.
 */
static const struct ethtool_ops tun_ethtool_ops;

static int tun_net_init(struct net_device *dev)
{
        struct tun_struct *tun = netdev_priv(dev);
        struct ifreq *ifr = tun->ifr;
        int err;

        spin_lock_init(&tun->lock);

        err = security_tun_dev_alloc_security(&tun->security);
        if (err < 0)
                return err;

        tun_flow_init(tun);

        dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
        dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
                           TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
                           NETIF_F_HW_VLAN_STAG_TX;
        dev->features = dev->hw_features | NETIF_F_LLTX;
        dev->vlan_features = dev->features &
                             ~(NETIF_F_HW_VLAN_CTAG_TX |
                               NETIF_F_HW_VLAN_STAG_TX);

        tun->flags = (tun->flags & ~TUN_FEATURES) |
                      (ifr->ifr_flags & TUN_FEATURES);

        INIT_LIST_HEAD(&tun->disabled);
        err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI,
                         ifr->ifr_flags & IFF_NAPI_FRAGS, false);
        if (err < 0) {
                tun_flow_uninit(tun);
                security_tun_dev_free_security(tun->security);
                return err;
        }
        return 0;
}

/* Net device uninit: detach every queue (file) from the device. */
static void tun_net_uninit(struct net_device *dev)
{
        tun_detach_all(dev);
}

/* Net device open: start all TX queues. Always succeeds. */
static int tun_net_open(struct net_device *dev)
{
        netif_tx_start_all_queues(dev);
        return 0;
}

/* Net device close: stop all TX queues. Always succeeds. */
static int tun_net_close(struct net_device *dev)
{
        netif_tx_stop_all_queues(dev);

        return 0;
}

/* Transmit-path hook for automatic queue selection. With CONFIG_RPS, a
 * single queue and RPS in use, record the packet's hash in its flow entry
 * (if one exists); otherwise do nothing.
 */
static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
{
#ifdef CONFIG_RPS
        struct tun_flow_entry *flow;
        __u32 hash;

        if (tun->numqueues != 1 || !static_branch_unlikely(&rps_needed))
                return;

        /* Select queue was not called for the skbuff, so we extract the
         * RPS hash and save it into the flow_table here.
         */
        hash = __skb_get_hash_symmetric(skb);
        flow = tun_flow_find(&tun->flows[tun_hashfn(hash)], hash);
        if (!flow)
                return;

        tun_flow_save_rps_rxhash(flow, hash);
#endif
}

/* Run the device's BPF filter program on @skb, if one is installed, and
 * return its result; without a program @len is returned unchanged.
 * tun_net_xmit() treats a result of 0 as "drop" and otherwise trims the
 * packet to the returned length.
 */
static unsigned int run_ebpf_filter(struct tun_struct *tun,
                                    struct sk_buff *skb,
                                    int len)
{
        struct tun_prog *filter = rcu_dereference(tun->filter_prog);

        if (!filter)
                return len;

        return bpf_prog_run_clear_cb(filter->prog, skb);
}

/* Net device start xmit (.ndo_start_xmit for both TUN and TAP ops tables).
 *
 * Looks up the tun_file attached to skb->queue_mapping, runs the packet
 * through the TX filter, the socket filter and the eBPF filter program,
 * then queues it on that file's tx_ring and notifies the reader.
 *
 * Returns NETDEV_TX_OK once the skb is on the ring. On every failure path
 * the device tx_dropped counter is incremented, the skb is freed with the
 * recorded drop reason and NET_XMIT_DROP is returned.
 *
 * The whole function body runs inside an RCU read-side critical section.
 */
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct tun_struct *tun = netdev_priv(dev);
        enum skb_drop_reason drop_reason;
        int txq = skb->queue_mapping;
        struct netdev_queue *queue;
        struct tun_file *tfile;
        int len = skb->len;

        rcu_read_lock();
        tfile = rcu_dereference(tun->tfiles[txq]);

        /* Drop packet if interface is not attached */
        if (!tfile) {
                drop_reason = SKB_DROP_REASON_DEV_READY;
                goto drop;
        }

        /* Flow bookkeeping is only done when no steering program is set. */
        if (!rcu_dereference(tun->steering_prog))
                tun_automq_xmit(tun, skb);

        netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len);

        /* Drop if the filter does not like it.
         * This is a noop if the filter is disabled.
         * Filter can be enabled only for the TAP devices. */
        if (!check_filter(&tun->txflt, skb)) {
                drop_reason = SKB_DROP_REASON_TAP_TXFILTER;
                goto drop;
        }

        /* Socket filter attached to the tun_file's socket, if any. */
        if (tfile->socket.sk->sk_filter &&
            sk_filter(tfile->socket.sk, skb)) {
                drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
                goto drop;
        }

        /* The eBPF filter returns the length to keep; 0 means drop. */
        len = run_ebpf_filter(tun, skb, len);
        if (len == 0) {
                drop_reason = SKB_DROP_REASON_TAP_FILTER;
                goto drop;
        }

        /* Trim the skb to the length chosen by the eBPF filter. */
        if (pskb_trim(skb, len)) {
                drop_reason = SKB_DROP_REASON_NOMEM;
                goto drop;
        }

        if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) {
                drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT;
                goto drop;
        }

        skb_tx_timestamp(skb);

        /* Orphan the skb - required as we might hang on to it
         * for indefinite time.
         */
        skb_orphan(skb);

        nf_reset_ct(skb);

        /* A non-zero return means the ring refused the skb. */
        if (ptr_ring_produce(&tfile->tx_ring, skb)) {
                drop_reason = SKB_DROP_REASON_FULL_RING;
                goto drop;
        }

        /* NETIF_F_LLTX requires to do our own update of trans_start */
        queue = netdev_get_tx_queue(dev, txq);
        txq_trans_cond_update(queue);

        /* Notify and wake up reader process */
        if (tfile->flags & TUN_FASYNC)
                kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
        tfile->socket.sk->sk_data_ready(tfile->socket.sk);

        rcu_read_unlock();
        return NETDEV_TX_OK;

drop:
        /* Common failure exit: account, flag the error on the skb, free it. */
        dev_core_stats_tx_dropped_inc(dev);
        skb_tx_error(skb);
        kfree_skb_reason(skb, drop_reason);
        rcu_read_unlock();
        return NET_XMIT_DROP;
}

/* .ndo_set_rx_mode callback of tap_netdev_ops; deliberately does nothing. */
static void tun_net_mclist(struct net_device *dev)
{
        /*
         * This callback is supposed to deal with mc filter in
         * _rx_ path and has nothing to do with the _tx_ path.
         * In rx path we always accept everything userspace gives us.
         */
}

/* .ndo_fix_features callback.
 *
 * Bits covered by TUN_USER_FEATURES survive only if they are also present
 * in tun->set_features; every other requested bit is passed through as is.
 */
static netdev_features_t tun_net_fix_features(struct net_device *dev,
        netdev_features_t features)
{
        struct tun_struct *tun = netdev_priv(dev);
        netdev_features_t allowed_by_user = features & tun->set_features;
        netdev_features_t outside_user_mask = features & ~TUN_USER_FEATURES;

        return allowed_by_user | outside_user_mask;
}

/* .ndo_set_rx_headroom callback: record the requested headroom in
 * tun->align, never going below NET_SKB_PAD.
 */
static void tun_set_headroom(struct net_device *dev, int new_hr)
{
        struct tun_struct *tun = netdev_priv(dev);

        tun->align = (new_hr < NET_SKB_PAD) ? NET_SKB_PAD : new_hr;
}

/* .ndo_get_stats64 callback of tun_netdev_ops.
 *
 * Fills @stats through dev_get_tstats64() and then adds the driver's own
 * rx_frame_errors counter on top.
 */
static void
tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
        struct tun_struct *tun = netdev_priv(dev);
        unsigned long frame_errors;

        dev_get_tstats64(dev, stats);

        frame_errors = (unsigned long)atomic_long_read(&tun->rx_frame_errors);
        stats->rx_frame_errors += frame_errors;
}

/* Install (or, with @prog == NULL, remove) the device XDP program.
 *
 * The previous program, if any, is released with bpf_prog_put(). Afterwards
 * SOCK_XDP is set or cleared on the socket of every attached queue and of
 * every tun_file on the tun->disabled list so that it mirrors whether a
 * program is present. Uses rtnl_dereference() for its pointer reads.
 *
 * Always returns 0.
 */
static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
                       struct netlink_ext_ack *extack)
{
        struct tun_struct *tun = netdev_priv(dev);
        struct bpf_prog *previous;
        struct tun_file *tfile;
        int q;

        previous = rtnl_dereference(tun->xdp_prog);
        rcu_assign_pointer(tun->xdp_prog, prog);
        if (previous)
                bpf_prog_put(previous);

        /* Queues currently attached to the device. */
        for (q = 0; q < tun->numqueues; q++) {
                tfile = rtnl_dereference(tun->tfiles[q]);
                if (!prog)
                        sock_reset_flag(&tfile->sk, SOCK_XDP);
                else
                        sock_set_flag(&tfile->sk, SOCK_XDP);
        }

        /* Queues parked on the disabled list. */
        list_for_each_entry(tfile, &tun->disabled, next) {
                if (!prog)
                        sock_reset_flag(&tfile->sk, SOCK_XDP);
                else
                        sock_set_flag(&tfile->sk, SOCK_XDP);
        }

        return 0;
}

/* .ndo_bpf callback: only XDP_SETUP_PROG is handled, anything else is
 * rejected with -EINVAL.
 */
static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
        if (xdp->command == XDP_SETUP_PROG)
                return tun_xdp_set(dev, xdp->prog, xdp->extack);

        return -EINVAL;
}

/* .ndo_change_carrier callback.
 *
 * Turning the carrier off always succeeds. Turning it on is refused with
 * -EPERM while the device has no queues (tun->numqueues == 0).
 */
static int tun_net_change_carrier(struct net_device *dev, bool new_carrier)
{
        struct tun_struct *tun;

        if (!new_carrier) {
                netif_carrier_off(dev);
                return 0;
        }

        tun = netdev_priv(dev);
        if (!tun->numqueues)
                return -EPERM;

        netif_carrier_on(dev);
        return 0;
}

/* net_device callbacks installed by tun_net_initialize() for IFF_TUN
 * devices.
 */
static const struct net_device_ops tun_netdev_ops = {
        /* Lifecycle */
        .ndo_init               = tun_net_init,
        .ndo_uninit             = tun_net_uninit,
        .ndo_open               = tun_net_open,
        .ndo_stop               = tun_net_close,

        /* Transmit path */
        .ndo_select_queue       = tun_select_queue,
        .ndo_start_xmit         = tun_net_xmit,

        /* Configuration and statistics */
        .ndo_fix_features       = tun_net_fix_features,
        .ndo_set_rx_headroom    = tun_set_headroom,
        .ndo_change_carrier     = tun_net_change_carrier,
        .ndo_get_stats64        = tun_net_get_stats64,
};

static void __tun_xdp_flush_tfile(struct tun_file *tfile)
{
        /* Notify and wake up reader process */
        if (tfile->flags & TUN_FASYNC)
                kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
        tfile->socket.sk->sk_data_ready(tfile->socket.sk);
}

/* .ndo_xdp_xmit callback: queue up to @n XDP frames on one tx_ring.
 *
 * The target queue is picked as smp_processor_id() modulo the current
 * number of queues; the lookup is repeated if that slot turns out to be
 * NULL. Frames are produced under the ring's producer_lock until the ring
 * rejects one, at which point tx_dropped is bumped once and the loop stops.
 *
 * Returns the number of frames queued, -EINVAL for unknown @flags bits, or
 * -ENXIO when the device has no queues.
 */
static int tun_xdp_xmit(struct net_device *dev, int n,
                        struct xdp_frame **frames, u32 flags)
{
        struct tun_struct *tun = netdev_priv(dev);
        struct tun_file *tfile;
        u32 numqueues;
        int nxmit = 0;

        if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
                return -EINVAL;

        rcu_read_lock();

        /* Keep resampling until a populated queue slot is found. */
        for (;;) {
                numqueues = READ_ONCE(tun->numqueues);
                if (!numqueues) {
                        rcu_read_unlock();
                        return -ENXIO; /* Caller will free/return all frames */
                }

                tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
                                                    numqueues]);
                if (likely(tfile))
                        break;
        }

        spin_lock(&tfile->tx_ring.producer_lock);
        while (nxmit < n) {
                /* Encode the XDP flag into lowest bit for consumer to differ
                 * XDP buffer from sk_buff.
                 */
                void *entry = tun_xdp_to_ptr(frames[nxmit]);

                if (__ptr_ring_produce(&tfile->tx_ring, entry)) {
                        dev_core_stats_tx_dropped_inc(dev);
                        break;
                }
                nxmit++;
        }
        spin_unlock(&tfile->tx_ring.producer_lock);

        if (flags & XDP_XMIT_FLUSH)
                __tun_xdp_flush_tfile(tfile);

        rcu_read_unlock();
        return nxmit;
}

/* Transmit a single xdp_buff back out through @dev (XDP_TX handling).
 *
 * Returns -EOVERFLOW when the buff cannot be converted to a frame,
 * otherwise the result of tun_xdp_xmit(). When that result is 0 the frame
 * is handed back via xdp_return_frame_rx_napi().
 */
static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
{
        struct xdp_frame *frame;
        int sent;

        frame = xdp_convert_buff_to_frame(xdp);
        if (unlikely(!frame))
                return -EOVERFLOW;

        sent = tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH);
        if (sent == 0)
                xdp_return_frame_rx_napi(frame);

        return sent;
}

/* net_device callbacks installed by tun_net_initialize() for IFF_TAP
 * devices.
 */
static const struct net_device_ops tap_netdev_ops = {
        /* Lifecycle */
        .ndo_init               = tun_net_init,
        .ndo_uninit             = tun_net_uninit,
        .ndo_open               = tun_net_open,
        .ndo_stop               = tun_net_close,

        /* Transmit path */
        .ndo_select_queue       = tun_select_queue,
        .ndo_features_check     = passthru_features_check,
        .ndo_start_xmit         = tun_net_xmit,

        /* Ethernet address and RX mode handling */
        .ndo_set_mac_address    = eth_mac_addr,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_set_rx_mode        = tun_net_mclist,

        /* Configuration */
        .ndo_fix_features       = tun_net_fix_features,
        .ndo_set_rx_headroom    = tun_set_headroom,
        .ndo_change_carrier     = tun_net_change_carrier,

        /* XDP */
        .ndo_bpf                = tun_xdp,
        .ndo_xdp_xmit           = tun_xdp_xmit,
};

/* Prepare the flow table of @tun: empty every hash bucket, set the ageing
 * time to TUN_FLOW_EXPIRE and arm the flow garbage-collection timer.
 */
static void tun_flow_init(struct tun_struct *tun)
{
        unsigned long first_expiry;
        int bucket;

        for (bucket = 0; bucket < TUN_NUM_FLOW_ENTRIES; bucket++)
                INIT_HLIST_HEAD(&tun->flows[bucket]);

        tun->ageing_time = TUN_FLOW_EXPIRE;
        timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0);

        first_expiry = round_jiffies_up(jiffies + tun->ageing_time);
        mod_timer(&tun->flow_gc_timer, first_expiry);
}

/* Tear down the flow table of @tun: stop the garbage-collection timer
 * (del_timer_sync()) first, then flush the remaining flows.
 */
static void tun_flow_uninit(struct tun_struct *tun)
{
        del_timer_sync(&tun->flow_gc_timer);
        tun_flow_flush(tun);
}

/* MTU bounds applied in tun_net_initialize(): dev->min_mtu is MIN_MTU and
 * dev->max_mtu is MAX_MTU minus the device's hard_header_len.
 */
#define MIN_MTU 68
#define MAX_MTU 65535

/* Initialize net device.
 *
 * Sets up @dev according to the type bits in tun->flags: IFF_TUN gets the
 * headerless point-to-point configuration, IFF_TAP the Ethernet one plus
 * the XDP feature flags. The MTU limits are applied in both cases.
 */
static void tun_net_initialize(struct net_device *dev)
{
        struct tun_struct *tun = netdev_priv(dev);

        if ((tun->flags & TUN_TYPE_MASK) == IFF_TUN) {
                dev->netdev_ops = &tun_netdev_ops;
                dev->header_ops = &ip_tunnel_header_ops;

                /* Point-to-Point TUN Device */
                dev->hard_header_len = 0;
                dev->addr_len = 0;
                dev->mtu = 1500;

                /* Zero header length */
                dev->type = ARPHRD_NONE;
                dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
        } else if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) {
                dev->netdev_ops = &tap_netdev_ops;

                /* Ethernet TAP Device */
                ether_setup(dev);
                dev->priv_flags &= ~IFF_TX_SKB_SHARING;
                dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;

                eth_hw_addr_random(dev);

                /* Currently tun does not support XDP, only tap does. */
                dev->xdp_features = NETDEV_XDP_ACT_BASIC |
                                    NETDEV_XDP_ACT_REDIRECT |
                                    NETDEV_XDP_ACT_NDO_XMIT;
        }

        dev->min_mtu = MIN_MTU;
        dev->max_mtu = MAX_MTU - dev->hard_header_len;
}

static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile)
{
        struct sock *sk = tfile->socket.sk;

        return (tun->dev->flags & IFF_UP) && sock_writeable(sk);
}

/* Character device part */

/* Poll
 *
 * Reports EPOLLIN | EPOLLRDNORM while the file's tx_ring holds entries and
 * EPOLLOUT | EPOLLWRNORM while the queue is writable. Returns EPOLLERR alone
 * when no device is attached or the device is not in NETREG_REGISTERED
 * state.
 */
static __poll_t tun_chr_poll(struct file *file, poll_table *wait)
{
        struct tun_file *tfile = file->private_data;
        struct tun_struct *tun = tun_get(tfile);
        __poll_t events = 0;
        struct sock *sk;
        bool writable;

        if (!tun)
                return EPOLLERR;

        sk = tfile->socket.sk;
        poll_wait(file, sk_sleep(sk), wait);

        if (!ptr_ring_empty(&tfile->tx_ring))
                events |= EPOLLIN | EPOLLRDNORM;

        /* Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable to
         * guarantee EPOLLOUT to be raised by either here or
         * tun_sock_write_space(). Then process could get notification
         * after it writes to a down device and meets -EIO.
         */
        writable = tun_sock_writeable(tun, tfile);
        if (!writable &&
            !test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
                writable = tun_sock_writeable(tun, tfile);
        if (writable)
                events |= EPOLLOUT | EPOLLWRNORM;

        if (tun->dev->reg_state != NETREG_REGISTERED)
                events = EPOLLERR;

        tun_put(tun);
        return events;
}

/* Get the NAPI frags skb of @tfile and shape it after the segments of @it.
 *
 * The first iovec segment becomes the linear area; each further segment
 * i (1 <= i < nr_segs) gets its own page fragment of exactly that segment's
 * length, stored in frag slot i - 1. No data is copied here.
 *
 * @len: total packet length recorded in skb->len.
 *
 * Returns the prepared skb or an ERR_PTR():
 *   -EMSGSIZE  more than MAX_SKB_FRAGS + 1 segments, or @len too large
 *   -ENOMEM    napi_get_frags() or a fragment allocation failed
 *   -EINVAL    a segment beyond the first is empty or larger than PAGE_SIZE
 *   or the error returned by __skb_grow().
 */
static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
                                            size_t len,
                                            const struct iov_iter *it)
{
        struct sk_buff *skb;
        size_t linear;
        int err;
        int i;

        if (it->nr_segs > MAX_SKB_FRAGS + 1 ||
            len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN))
                return ERR_PTR(-EMSGSIZE);

        local_bh_disable();
        skb = napi_get_frags(&tfile->napi);
        local_bh_enable();
        if (!skb)
                return ERR_PTR(-ENOMEM);

        /* Segment 0 is placed in the linear part of the skb. */
        linear = iov_iter_single_seg_count(it);
        err = __skb_grow(skb, linear);
        if (err)
                goto free;

        skb->len = len;
        skb->data_len = len - linear;
        skb->truesize += skb->data_len;

        for (i = 1; i < it->nr_segs; i++) {
                /* Size each fragment from its own segment: iter_iov()
                 * points at segment 0, so it must be offset by i.
                 */
                const struct iovec *iov = iter_iov(it) + i;
                size_t fragsz = iov->iov_len;
                struct page *page;
                void *frag;

                if (fragsz == 0 || fragsz > PAGE_SIZE) {
                        err = -EINVAL;
                        goto free;
                }
                frag = netdev_alloc_frag(fragsz);
                if (!frag) {
                        err = -ENOMEM;
                        goto free;
                }
                page = virt_to_head_page(frag);
                skb_fill_page_desc(skb, i - 1, page,
                                   frag - page_address(page), fragsz);
        }

        return skb;
free:
        /* frees skb and all frags allocated with netdev_alloc_frag() */
        napi_free_frags(&tfile->napi);
        return ERR_PTR(err);
}

/* Allocate an skb charged to the tun_file's socket.
 *
 * @prepad: bytes reserved in front of the data.
 * @len:    packet length after the reserved area.
 * @linear: hint for how much should live in the linear area (usually the
 *          headers). It is overridden to @len for packets that fit in less
 *          than a page, and raised when the paged remainder would exceed
 *          MAX_SKB_FRAGS fragments of (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER).
 *
 * Returns the skb with len/data_len already set, or an ERR_PTR() carrying
 * the error reported by sock_alloc_send_pskb().
 */
static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
                                     size_t prepad, size_t len,
                                     size_t linear, int noblock)
{
        const size_t max_paged = MAX_SKB_FRAGS *
                                 (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
        struct sock *sk = tfile->socket.sk;
        struct sk_buff *skb;
        size_t paged;
        int err;

        /* Under a page?  Don't bother with paged skb. */
        if (prepad + len < PAGE_SIZE)
                linear = len;

        if (len - linear > max_paged)
                linear = len - max_paged;

        paged = len - linear;
        skb = sock_alloc_send_pskb(sk, prepad + linear, paged, noblock,
                                   &err, PAGE_ALLOC_COSTLY_ORDER);
        if (!skb)
                return ERR_PTR(err);

        skb_reserve(skb, prepad);
        skb_put(skb, linear);
        skb->data_len = paged;
        skb->len += paged;

        return skb;
}

/* Deliver @skb to the stack, optionally batching it with earlier packets.
 *
 * Packets are parked on the socket's sk_write_queue while @more is set and
 * the queue is shorter than tun->rx_batched. Once @more is clear or the
 * queue length equals rx_batched, everything queued plus @skb is passed to
 * netif_receive_skb() in order. With batching disabled (rx_batched == 0),
 * or when there is nothing queued and no more to come, @skb is delivered
 * straight away.
 */
static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
                           struct sk_buff *skb, int more)
{
        struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
        struct sk_buff_head process_queue;
        u32 rx_batched = tun->rx_batched;
        struct sk_buff *pending;

        /* Fast path: nothing to batch with. */
        if (!rx_batched || (!more && skb_queue_empty(queue))) {
                local_bh_disable();
                skb_record_rx_queue(skb, tfile->queue_index);
                netif_receive_skb(skb);
                local_bh_enable();
                return;
        }

        spin_lock(&queue->lock);
        if (more && skb_queue_len(queue) != rx_batched) {
                /* Keep accumulating. */
                __skb_queue_tail(queue, skb);
                spin_unlock(&queue->lock);
                return;
        }

        /* Flush: steal the whole backlog onto a private list. */
        __skb_queue_head_init(&process_queue);
        skb_queue_splice_tail_init(queue, &process_queue);
        spin_unlock(&queue->lock);

        local_bh_disable();
        while ((pending = __skb_dequeue(&process_queue))) {
                skb_record_rx_queue(pending, tfile->queue_index);
                netif_receive_skb(pending);
        }
        skb_record_rx_queue(skb, tfile->queue_index);
        netif_receive_skb(skb);
        local_bh_enable();
}

/* Decide whether a write may take the build_skb() fast path.
 *
 * All of the following must hold: the device is a TAP device, the socket
 * send buffer is INT_MAX, the write is non-blocking, zerocopy is not in
 * use, and the packet plus TUN_RX_PAD, XDP_PACKET_HEADROOM and the
 * skb_shared_info trailer fits in a single page.
 */
static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
                              int len, int noblock, bool zerocopy)
{
        if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP ||
            tfile->socket.sk->sk_sndbuf != INT_MAX ||
            !noblock ||
            zerocopy)
                return false;

        return SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) +
               SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) <= PAGE_SIZE;
}

/* Wrap an already filled buffer in an skb.
 *
 * @buf/@buflen describe the region inside @alloc_frag's page, @pad is the
 * headroom in front of the packet and @len the packet length. On success
 * the skb is owned (for write accounting) by the tun_file's socket, an
 * extra reference is taken on the page and the frag offset is advanced
 * past the consumed region.
 *
 * Returns the skb, or ERR_PTR(-ENOMEM) if build_skb() fails; in that case
 * @alloc_frag is left untouched.
 */
static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
                                       struct page_frag *alloc_frag, char *buf,
                                       int buflen, int len, int pad)
{
        struct sk_buff *skb;

        skb = build_skb(buf, buflen);
        if (!skb)
                return ERR_PTR(-ENOMEM);

        /* Lay out headroom and payload, then charge the socket. */
        skb_reserve(skb, pad);
        skb_put(skb, len);
        skb_set_owner_w(skb, tfile->socket.sk);

        /* The skb now shares the page with the per-task frag allocator. */
        get_page(alloc_frag->page);
        alloc_frag->offset += buflen;

        return skb;
}

/* Carry out the XDP verdict @act returned by @xdp_prog for @xdp.
 *
 * Returns @act when the verdict was handled, or the error from
 * xdp_do_redirect() / a negative result from tun_xdp_tx() when the
 * redirect or transmit failed. rx_dropped is incremented on those
 * failures as well as for XDP_DROP, XDP_ABORTED and unknown verdicts.
 */
static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
                       struct xdp_buff *xdp, u32 act)
{
        int err;

        switch (act) {
        case XDP_REDIRECT:
                err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
                if (err) {
                        dev_core_stats_rx_dropped_inc(tun->dev);
                        return err;
                }
                dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data);
                break;
        case XDP_TX:
                /* Only negative results count as failure here; a return of 0
                 * from tun_xdp_tx() is still accounted as received.
                 */
                err = tun_xdp_tx(tun->dev, xdp);
                if (err < 0) {
                        dev_core_stats_rx_dropped_inc(tun->dev);
                        return err;
                }
                dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data);
                break;
        case XDP_PASS:
                break;
        default:
                /* Unknown verdict: warn, then treat it like XDP_ABORTED. */
                bpf_warn_invalid_xdp_action(tun->dev, xdp_prog, act);
                fallthrough;
        case XDP_ABORTED:
                trace_xdp_exception(tun->dev, xdp_prog, act);
                fallthrough;
        case XDP_DROP:
                dev_core_stats_rx_dropped_inc(tun->dev);
                break;
        }

        return act;
}

/* Copy a packet from user space into a page fragment taken from
 * current->task_frag and build an skb around it, running the device's XDP
 * program on the raw buffer first when possible.
 *
 * @from:    source iterator; @len bytes are copied from it.
 * @hdr:     virtio-net header of the packet; a non-zero gso_type skips the
 *           XDP run in this function.
 * @skb_xdp: out parameter. Set to 1 when XDP was not run here (the caller
 *           checks it before running XDP on the skb), cleared to 0 once
 *           this function takes the raw-buffer XDP path.
 *
 * Returns the skb, ERR_PTR(-ENOMEM) / ERR_PTR(-EFAULT) when the page frag
 * cannot be refilled or the copy comes up short, or NULL when no skb is
 * produced because the XDP verdict was not XDP_PASS or acting on it failed.
 */
static struct sk_buff *tun_build_skb(struct tun_struct *tun,
                                     struct tun_file *tfile,
                                     struct iov_iter *from,
                                     struct virtio_net_hdr *hdr,
                                     int len, int *skb_xdp)
{
        struct page_frag *alloc_frag = &current->task_frag;
        struct bpf_prog *xdp_prog;
        int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        char *buf;
        size_t copied;
        int pad = TUN_RX_PAD;
        int err = 0;

        /* Reserve XDP headroom only if a program is attached right now. */
        rcu_read_lock();
        xdp_prog = rcu_dereference(tun->xdp_prog);
        if (xdp_prog)
                pad += XDP_PACKET_HEADROOM;
        buflen += SKB_DATA_ALIGN(len + pad);
        rcu_read_unlock();

        alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
        if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
                return ERR_PTR(-ENOMEM);

        /* The payload lands @pad bytes into the buffer. */
        buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
        copied = copy_page_from_iter(alloc_frag->page,
                                     alloc_frag->offset + pad,
                                     len, from);
        if (copied != len)
                return ERR_PTR(-EFAULT);

        /* There's a small window that XDP may be set after the check
         * of xdp_prog above, this should be rare and for simplicity
         * we do XDP on skb in case the headroom is not enough.
         */
        if (hdr->gso_type || !xdp_prog) {
                *skb_xdp = 1;
                return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
                                       pad);
        }

        *skb_xdp = 0;

        local_bh_disable();
        rcu_read_lock();
        /* Re-read the program: the pointer fetched above is stale here. */
        xdp_prog = rcu_dereference(tun->xdp_prog);
        if (xdp_prog) {
                struct xdp_buff xdp;
                u32 act;

                xdp_init_buff(&xdp, buflen, &tfile->xdp_rxq);
                xdp_prepare_buff(&xdp, buf, pad, len, false);

                act = bpf_prog_run_xdp(xdp_prog, &xdp);
                /* For REDIRECT/TX the buffer leaves this function: take a
                 * page reference and consume the frag region up front.
                 */
                if (act == XDP_REDIRECT || act == XDP_TX) {
                        get_page(alloc_frag->page);
                        alloc_frag->offset += buflen;
                }
                err = tun_xdp_act(tun, xdp_prog, &xdp, act);
                if (err < 0) {
                        /* Undo the reference taken just above. */
                        if (act == XDP_REDIRECT || act == XDP_TX)
                                put_page(alloc_frag->page);
                        goto out;
                }

                if (err == XDP_REDIRECT)
                        xdp_do_flush();
                if (err != XDP_PASS)
                        goto out;

                /* The program may have moved data/data_end; pick up the
                 * resulting headroom and length.
                 */
                pad = xdp.data - xdp.data_hard_start;
                len = xdp.data_end - xdp.data;
        }
        rcu_read_unlock();
        local_bh_enable();

        return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);

out:
        rcu_read_unlock();
        local_bh_enable();
        return NULL;
}

/* Get packet from user space buffer
 *
 * Parses the optional tun_pi and virtio-net headers from @from, builds an
 * skb for the remaining payload (build_skb fast path, NAPI frags, or a
 * regular socket allocation with optional zerocopy), runs XDP on it when
 * required and hands it to the receive path.
 *
 * @msg_control: when non-NULL, a struct ubuf_info used for zerocopy; its
 *               completion callback is invoked here if the data ends up
 *               being copied instead.
 * @noblock:     non-zero for non-blocking allocation.
 * @more:        hint that further packets follow (used for batching).
 *
 * Returns the full byte count of @from as sampled on entry when the packet
 * was accepted or consumed by XDP, otherwise a negative errno.
 */
static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                            void *msg_control, struct iov_iter *from,
                            int noblock, bool more)
{
        struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
        struct sk_buff *skb;
        size_t total_len = iov_iter_count(from);
        size_t len = total_len, align = tun->align, linear;
        struct virtio_net_hdr gso = { 0 };
        int good_linear;
        int copylen;
        bool zerocopy = false;
        int err;
        u32 rxhash = 0;
        int skb_xdp = 1;
        bool frags = tun_napi_frags_enabled(tfile);
        enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;

        /* Unless IFF_NO_PI is set, the data starts with a struct tun_pi. */
        if (!(tun->flags & IFF_NO_PI)) {
                if (len < sizeof(pi))
                        return -EINVAL;
                len -= sizeof(pi);

                if (!copy_from_iter_full(&pi, sizeof(pi), from))
                        return -EFAULT;
        }

        /* Optional virtio-net header. Only sizeof(gso) bytes are parsed;
         * the rest of the configured vnet_hdr_sz is skipped.
         */
        if (tun->flags & IFF_VNET_HDR) {
                int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);

                if (len < vnet_hdr_sz)
                        return -EINVAL;
                len -= vnet_hdr_sz;

                if (!copy_from_iter_full(&gso, sizeof(gso), from))
                        return -EFAULT;

                /* With checksum offload requested, grow hdr_len so that it
                 * covers csum_start + csum_offset + 2.
                 */
                if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
                    tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2 > tun16_to_cpu(tun, gso.hdr_len))
                        gso.hdr_len = cpu_to_tun16(tun, tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2);

                if (tun16_to_cpu(tun, gso.hdr_len) > len)
                        return -EINVAL;
                iov_iter_advance(from, vnet_hdr_sz - sizeof(gso));
        }

        /* TAP frames must hold at least an Ethernet header. */
        if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) {
                align += NET_IP_ALIGN;
                if (unlikely(len < ETH_HLEN ||
                             (gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN)))
                        return -EINVAL;
        }

        good_linear = SKB_MAX_HEAD(align);

        /* Zerocopy is attempted only when the caller supplied msg_control
         * and the part beyond the copied header fits in MAX_SKB_FRAGS pages.
         */
        if (msg_control) {
                struct iov_iter i = *from;

                /* There are 256 bytes to be copied in skb, so there is
                 * enough room for skb expand head in case it is used.
                 * The rest of the buffer is mapped from userspace.
                 */
                copylen = gso.hdr_len ? tun16_to_cpu(tun, gso.hdr_len) : GOODCOPY_LEN;
                if (copylen > good_linear)
                        copylen = good_linear;
                linear = copylen;
                iov_iter_advance(&i, copylen);
                if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
                        zerocopy = true;
        }

        if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
                /* For the packet that is not easy to be processed
                 * (e.g gso or jumbo packet), we will do it at after
                 * skb was created with generic XDP routine.
                 */
                skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp);
                err = PTR_ERR_OR_ZERO(skb);
                if (err)
                        goto drop;
                /* NULL: no skb was produced (XDP path); report success. */
                if (!skb)
                        return total_len;
        } else {
                if (!zerocopy) {
                        copylen = len;
                        if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
                                linear = good_linear;
                        else
                                linear = tun16_to_cpu(tun, gso.hdr_len);
                }

                if (frags) {
                        /* napi_mutex stays held until the frags skb has been
                         * consumed or freed on one of the exits below.
                         */
                        mutex_lock(&tfile->napi_mutex);
                        skb = tun_napi_alloc_frags(tfile, copylen, from);
                        /* tun_napi_alloc_frags() enforces a layout for the skb.
                         * If zerocopy is enabled, then this layout will be
                         * overwritten by zerocopy_sg_from_iter().
                         */
                        zerocopy = false;
                } else {
                        if (!linear)
                                linear = min_t(size_t, good_linear, copylen);

                        skb = tun_alloc_skb(tfile, align, copylen, linear,
                                            noblock);
                }

                err = PTR_ERR_OR_ZERO(skb);
                if (err)
                        goto drop;

                if (zerocopy)
                        err = zerocopy_sg_from_iter(skb, from);
                else
                        err = skb_copy_datagram_from_iter(skb, 0, from, len);

                /* Any copy/map failure is reported to the caller as -EFAULT. */
                if (err) {
                        err = -EFAULT;
                        drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT;
                        goto drop;
                }
        }

        /* A rejected virtio-net header counts as an rx frame error; this
         * exit skips the rx_dropped accounting done under the drop label.
         */
        if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
                atomic_long_inc(&tun->rx_frame_errors);
                err = -EINVAL;
                goto free_skb;
        }

        switch (tun->flags & TUN_TYPE_MASK) {
        case IFF_TUN:
                /* Without a tun_pi header, derive the protocol from the IP
                 * version nibble of the first payload byte.
                 */
                if (tun->flags & IFF_NO_PI) {
                        u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0;

                        switch (ip_version) {
                        case 4:
                                pi.proto = htons(ETH_P_IP);
                                break;
                        case 6:
                                pi.proto = htons(ETH_P_IPV6);
                                break;
                        default:
                                err = -EINVAL;
                                goto drop;
                        }
                }

                skb_reset_mac_header(skb);
                skb->protocol = pi.proto;
                skb->dev = tun->dev;
                break;
        case IFF_TAP:
                if (frags && !pskb_may_pull(skb, ETH_HLEN)) {
                        err = -ENOMEM;
                        drop_reason = SKB_DROP_REASON_HDR_TRUNC;
                        goto drop;
                }
                skb->protocol = eth_type_trans(skb, tun->dev);
                break;
        }

        /* copy skb_ubuf_info for callback when skb has no error */
        if (zerocopy) {
                skb_zcopy_init(skb, msg_control);
        } else if (msg_control) {
                /* Data was copied after all: signal completion right away. */
                struct ubuf_info *uarg = msg_control;
                uarg->ops->complete(NULL, uarg, false);
        }

        skb_reset_network_header(skb);
        skb_probe_transport_header(skb);
        skb_record_rx_queue(skb, tfile->queue_index);

        /* skb_xdp is still 1 unless tun_build_skb() cleared it above. */
        if (skb_xdp) {
                struct bpf_prog *xdp_prog;
                int ret;

                local_bh_disable();
                rcu_read_lock();
                xdp_prog = rcu_dereference(tun->xdp_prog);
                if (xdp_prog) {
                        ret = do_xdp_generic(xdp_prog, &skb);
                        if (ret != XDP_PASS) {
                                /* err is 0 here, so the function reports
                                 * total_len; the skb is not freed on this
                                 * path.
                                 */
                                rcu_read_unlock();
                                local_bh_enable();
                                goto unlock_frags;
                        }
                }
                rcu_read_unlock();
                local_bh_enable();
        }

        /* Compute the costly rx hash only if needed for flow updates.
         * We may get a very small possibility of OOO during switching, not
         * worth to optimize.
         */
        if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 &&
            !tfile->detached)
                rxhash = __skb_get_hash_symmetric(skb);

        rcu_read_lock();
        if (unlikely(!(tun->dev->flags & IFF_UP))) {
                err = -EIO;
                rcu_read_unlock();
                drop_reason = SKB_DROP_REASON_DEV_READY;
                goto drop;
        }

        if (frags) {
                u32 headlen;

                /* Exercise flow dissector code path. */
                skb_push(skb, ETH_HLEN);
                headlen = eth_get_headlen(tun->dev, skb->data,
                                          skb_headlen(skb));

                if (unlikely(headlen > skb_headlen(skb))) {
                        WARN_ON_ONCE(1);
                        err = -ENOMEM;
                        dev_core_stats_rx_dropped_inc(tun->dev);
                /* Shared exit for the frags path: free the frags skb and
                 * drop the RCU read lock and napi_mutex. Also entered via
                 * goto when NAPI is already scheduled (-EBUSY).
                 */
napi_busy:
                        napi_free_frags(&tfile->napi);
                        rcu_read_unlock();
                        mutex_unlock(&tfile->napi_mutex);
                        return err;
                }

                if (likely(napi_schedule_prep(&tfile->napi))) {
                        local_bh_disable();
                        napi_gro_frags(&tfile->napi);
                        napi_complete(&tfile->napi);
                        local_bh_enable();
                } else {
                        err = -EBUSY;
                        goto napi_busy;
                }
                mutex_unlock(&tfile->napi_mutex);
        } else if (tfile->napi_enabled) {
                struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
                int queue_len;

                spin_lock_bh(&queue->lock);

                if (unlikely(tfile->detached)) {
                        spin_unlock_bh(&queue->lock);
                        rcu_read_unlock();
                        err = -EBUSY;
                        goto free_skb;
                }

                __skb_queue_tail(queue, skb);
                queue_len = skb_queue_len(queue);
                /* Plain unlock on purpose: bottom halves stay disabled until
                 * the local_bh_enable() after napi_schedule() below.
                 */
                spin_unlock(&queue->lock);

                if (!more || queue_len > NAPI_POLL_WEIGHT)
                        napi_schedule(&tfile->napi);

                local_bh_enable();
        } else if (!IS_ENABLED(CONFIG_4KSTACKS)) {
                tun_rx_batched(tun, tfile, skb, more);
        } else {
                netif_rx(skb);
        }
        rcu_read_unlock();

        preempt_disable();
        dev_sw_netstats_rx_add(tun->dev, len);
        preempt_enable();

        if (rxhash)
                tun_flow_update(tun, rxhash, tfile);

        return total_len;

        /* Error exits. Each label falls through to the ones below it. */
drop:
        /* -EAGAIN is not accounted as a dropped packet. */
        if (err != -EAGAIN)
                dev_core_stats_rx_dropped_inc(tun->dev);

free_skb:
        /* skb may still hold an ERR_PTR() or NULL from a failed allocation. */
        if (!IS_ERR_OR_NULL(skb))
                kfree_skb_reason(skb, drop_reason);

unlock_frags:
        /* Clear the NAPI frags skb pointer and release napi_mutex. */
        if (frags) {
                tfile->napi.skb = NULL;
                mutex_unlock(&tfile->napi_mutex);
        }

        return err ?: total_len;
}

/*
 * write_iter handler for the tun character device: forwards the caller's
 * iterator to tun_get_user().
 *
 * Returns whatever tun_get_user() returns, or -EBADFD when the file has no
 * tun device behind it.
 */
static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct tun_file *tfile = file->private_data;
        struct tun_struct *tun = tun_get(tfile);
        ssize_t written;
        int noblock;

        if (!tun)
                return -EBADFD;

        /* Either the file flags or the iocb may request a non-blocking write. */
        noblock = (file->f_flags & O_NONBLOCK) ||
                  (iocb->ki_flags & IOCB_NOWAIT);

        written = tun_get_user(tun, tfile, NULL, from, noblock, false);
        tun_put(tun);

        return written;
}

static ssize_t tun_put_user_xdp(struct tun_struct *tun,
                                struct tun_file *tfile,
                                struct xdp_frame *xdp_frame,
                                struct iov_iter *iter)
{
        int vnet_hdr_sz = 0;
        size_t size = xdp_frame->len;
        size_t ret;

        if (tun->flags & IFF_VNET_HDR) {
                struct virtio_net_hdr gso = { 0 };

                vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
                if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
                        return -EINVAL;
                if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
                             sizeof(gso)))
                        return -EFAULT;
                iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
        }

        ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;

        preempt_disable();
        dev_sw_netstats_tx_add(tun->dev, 1, ret);
        preempt_enable();

        return ret;
}

/* Put packet to the user space buffer.
 *
 * Writes, in order: an optional struct tun_pi (unless IFF_NO_PI), an optional
 * virtio-net header padded to tun->vnet_hdr_sz (when IFF_VNET_HDR), and the
 * packet data with the VLAN tag re-inserted when the skb carries one.
 *
 * @tfile is not used by this function.
 *
 * Returns the full length of what should have been written (skb->len plus
 * any VLAN, vnet and pi headers), even when the iterator was too small for
 * the payload. Returns -EINVAL when the iterator cannot hold the pi or vnet
 * header or the GSO type is unexpected, and -EFAULT when a header copy
 * comes up short.
 */
static ssize_t tun_put_user(struct tun_struct *tun,
                            struct tun_file *tfile,
                            struct sk_buff *skb,
                            struct iov_iter *iter)
{
        struct tun_pi pi = { 0, skb->protocol };
        ssize_t total;
        int vlan_offset = 0;
        int vlan_hlen = 0;
        int vnet_hdr_sz = 0;

        if (skb_vlan_tag_present(skb))
                vlan_hlen = VLAN_HLEN;

        if (tun->flags & IFF_VNET_HDR)
                vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);

        total = skb->len + vlan_hlen + vnet_hdr_sz;

        if (!(tun->flags & IFF_NO_PI)) {
                if (iov_iter_count(iter) < sizeof(pi))
                        return -EINVAL;

                total += sizeof(pi);
                if (iov_iter_count(iter) < total) {
                        /* Packet will be truncated; tell the reader via the pi flags */
                        pi.flags |= TUN_PKT_STRIP;
                }

                if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi))
                        return -EFAULT;
        }

        if (vnet_hdr_sz) {
                struct virtio_net_hdr gso;

                if (iov_iter_count(iter) < vnet_hdr_sz)
                        return -EINVAL;

                /* A non-zero return means the skb could not be described by a
                 * virtio-net header: log (rate limited), warn once and bail out.
                 */
                if (virtio_net_hdr_from_skb(skb, &gso,
                                            tun_is_little_endian(tun), true,
                                            vlan_hlen)) {
                        struct skb_shared_info *sinfo = skb_shinfo(skb);

                        if (net_ratelimit()) {
                                netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n",
                                           sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
                                           tun16_to_cpu(tun, gso.hdr_len));
                                print_hex_dump(KERN_ERR, "tun: ",
                                               DUMP_PREFIX_NONE,
                                               16, 1, skb->head,
                                               min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
                        }
                        WARN_ON_ONCE(1);
                        return -EINVAL;
                }

                if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
                        return -EFAULT;

                /* Skip the remainder of the configured vnet header size */
                iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
        }

        if (vlan_hlen) {
                int ret;
                struct veth veth;

                veth.h_vlan_proto = skb->vlan_proto;
                veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));

                /* Bytes of the frame that precede the VLAN tag */
                vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);

                /* Copy the leading bytes, then splice in the tag. Stop early
                 * (without reporting an error) if a copy fails or the
                 * iterator is exhausted.
                 */
                ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
                if (ret || !iov_iter_count(iter))
                        goto done;

                ret = copy_to_iter(&veth, sizeof(veth), iter);
                if (ret != sizeof(veth) || !iov_iter_count(iter))
                        goto done;
        }

        /* Remaining payload; the return value is deliberately not checked */
        skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);

done:
        /* caller is in process context, so preemption is disabled around
         * the stats update
         */
        preempt_disable();
        dev_sw_netstats_tx_add(tun->dev, 1, skb->len + vlan_hlen);
        preempt_enable();

        return total;
}

static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
{
        DECLARE_WAITQUEUE(wait, current);
        void *ptr = NULL;
        int error = 0;

        ptr = ptr_ring_consume(&tfile->tx_ring);
        if (ptr)
                goto out;
        if (noblock) {
                error = -EAGAIN;
                goto out;
        }

        add_wait_queue(&tfile->socket.wq.wait, &wait);

        while (1) {
                set_current_state(TASK_INTERRUPTIBLE);
                ptr = ptr_ring_consume(&tfile->tx_ring);
                if (ptr)
                        break;
                if (signal_pending(current)) {
                        error = -ERESTARTSYS;
                        break;
                }
                if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
                        error = -EFAULT;
                        break;
                }

                schedule();
        }

        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&tfile->socket.wq.wait, &wait);

out:
        *err = error;
        return ptr;
}

/*
 * Deliver one queued entry to the iterator @to.
 *
 * @ptr may carry an entry the caller already dequeued; when it is NULL one is
 * fetched from the ring via tun_ring_recv(). The entry is always released
 * here, whether it is an XDP frame or an skb.
 *
 * Returns 0 (after freeing @ptr) when @to has no room at all, the error from
 * tun_ring_recv() when nothing could be dequeued, and otherwise the result of
 * tun_put_user_xdp() / tun_put_user().
 */
static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
                           struct iov_iter *to,
                           int noblock, void *ptr)
{
        ssize_t copied;

        if (!iov_iter_count(to)) {
                tun_ptr_free(ptr);
                return 0;
        }

        if (!ptr) {
                int err;

                /* Read frames from ring */
                ptr = tun_ring_recv(tfile, noblock, &err);
                if (!ptr)
                        return err;
        }

        if (!tun_is_xdp_frame(ptr)) {
                struct sk_buff *skb = ptr;

                copied = tun_put_user(tun, tfile, skb, to);
                if (unlikely(copied < 0))
                        kfree_skb(skb);
                else
                        consume_skb(skb);
        } else {
                struct xdp_frame *frame = tun_ptr_to_xdp(ptr);

                copied = tun_put_user_xdp(tun, tfile, frame, to);
                xdp_return_frame(frame);
        }

        return copied;
}

/*
 * read_iter handler for the tun character device.
 *
 * Reads one entry through tun_do_read() and clamps the reported length to the
 * space the caller offered. A positive result is also stored in
 * iocb->ki_pos. Returns -EBADFD when the file has no tun device behind it.
 */
static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct tun_file *tfile = file->private_data;
        struct tun_struct *tun = tun_get(tfile);
        ssize_t room = iov_iter_count(to);
        ssize_t ret;
        int noblock;

        if (!tun)
                return -EBADFD;

        noblock = (file->f_flags & O_NONBLOCK) ||
                  (iocb->ki_flags & IOCB_NOWAIT);

        ret = tun_do_read(tun, tfile, to, noblock, NULL);

        /* Never report more than the caller's buffer could hold. */
        if (ret > room)
                ret = room;

        if (ret > 0)
                iocb->ki_pos = ret;

        tun_put(tun);
        return ret;
}

/* RCU callback: destroy the BPF program held by a tun_prog and free the wrapper. */
static void tun_prog_free(struct rcu_head *rcu)
{
        struct tun_prog *wrapper;

        wrapper = container_of(rcu, struct tun_prog, rcu);
        bpf_prog_destroy(wrapper->prog);
        kfree(wrapper);
}

/*
 * Replace the program stored in *@prog_p with a wrapper around @prog, or with
 * NULL when @prog is NULL. The swap happens under tun->lock; the previous
 * wrapper, if any, is released through call_rcu().
 *
 * Returns 0, or -ENOMEM when the new wrapper cannot be allocated.
 */
static int __tun_set_ebpf(struct tun_struct *tun,
                          struct tun_prog __rcu **prog_p,
                          struct bpf_prog *prog)
{
        struct tun_prog *fresh = NULL;
        struct tun_prog *stale;

        if (prog) {
                fresh = kmalloc(sizeof(*fresh), GFP_KERNEL);
                if (!fresh)
                        return -ENOMEM;
                fresh->prog = prog;
        }

        spin_lock_bh(&tun->lock);
        stale = rcu_dereference_protected(*prog_p,
                                          lockdep_is_held(&tun->lock));
        rcu_assign_pointer(*prog_p, fresh);
        spin_unlock_bh(&tun->lock);

        if (stale)
                call_rcu(&stale->rcu, tun_prog_free);

        return 0;
}

/*
 * Private destructor for a tun net_device: tears down the flow table, the
 * security blob and both eBPF program slots. The list of disabled queues
 * must already be empty.
 */
static void tun_free_netdev(struct net_device *dev)
{
        struct tun_struct *priv = netdev_priv(dev);

        BUG_ON(!list_empty(&priv->disabled));

        tun_flow_uninit(priv);
        security_tun_dev_free_security(priv->security);

        /* Passing NULL drops whatever program is currently installed. */
        __tun_set_ebpf(priv, &priv->steering_prog, NULL);
        __tun_set_ebpf(priv, &priv->filter_prog, NULL);
}

/*
 * net_device setup callback: resets owner/group to "invalid", installs the
 * default link settings and ethtool ops, and arranges for tun_free_netdev()
 * to run when the device is freed.
 */
static void tun_setup(struct net_device *dev)
{
        struct tun_struct *priv = netdev_priv(dev);

        priv->owner = INVALID_UID;
        priv->group = INVALID_GID;
        tun_default_link_ksettings(dev, &priv->link_ksettings);

        dev->ethtool_ops = &tun_ethtool_ops;

        dev->needs_free_netdev = true;
        dev->priv_destructor = tun_free_netdev;

        /* We prefer our own queue length */
        dev->tx_queue_len = TUN_READQ_SIZE;
}

/* Trivial set of netlink ops to allow deleting tun or tap
 * device with netlink.
 *
 * The validate hook itself always fails with -EOPNOTSUPP and sets an
 * extended-ack message saying that creation via rtnetlink is unsupported.
 */
static int tun_validate(struct nlattr *tb[], struct nlattr *data[],
                        struct netlink_ext_ack *extack)
{
        NL_SET_ERR_MSG(extack,
                       "tun/tap creation via rtnetlink is not supported.");

        return -EOPNOTSUPP;
}

/*
 * Upper bound on the netlink attribute space tun_fill_info() may need:
 * one attribute slot for each IFLA_TUN_* value it can emit.
 */
static size_t tun_get_size(const struct net_device *dev)
{
        size_t need = 0;

        /* Owner and group are emitted as u32 attributes. */
        BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t));
        BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t));

        need += nla_total_size(sizeof(uid_t));  /* OWNER */
        need += nla_total_size(sizeof(gid_t));  /* GROUP */
        need += nla_total_size(sizeof(u8));     /* TYPE */
        need += nla_total_size(sizeof(u8));     /* PI */
        need += nla_total_size(sizeof(u8));     /* VNET_HDR */
        need += nla_total_size(sizeof(u8));     /* PERSIST */
        need += nla_total_size(sizeof(u8));     /* MULTI_QUEUE */
        need += nla_total_size(sizeof(u32));    /* NUM_QUEUES */
        need += nla_total_size(sizeof(u32));    /* NUM_DISABLED_QUEUES */

        return need;
}

static int tun_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
        struct tun_struct *tun = netdev_priv(dev);

        if (nla_put_u8(skb, IFLA_TUN_TYPE, tun->flags & TUN_TYPE_MASK))
                goto nla_put_failure;
        if (uid_valid(tun->owner) &&
            nla_put_u32(skb, IFLA_TUN_OWNER,
                        from_kuid_munged(current_user_ns(), tun->owner)))
                goto nla_put_failure;
        if (gid_valid(tun->group) &&
            nla_put_u32(skb, IFLA_TUN_GROUP,
                        from_kgid_munged(current_user_ns(), tun->group)))
                goto nla_put_failure;
        if (nla_put_u8(skb, IFLA_TUN_PI, !(tun->flags & IFF_NO_PI)))
                goto nla_put_failure;
        if (nla_put_u8(skb, IFLA_TUN_VNET_HDR, !!(tun->flags & IFF_VNET_HDR)))
                goto nla_put_failure;
        if (nla_put_u8(skb, IFLA_TUN_PERSIST, !!(tun->flags & IFF_PERSIST)))
                goto nla_put_failure;
        if (nla_put_u8(skb, IFLA_TUN_MULTI_QUEUE,
                       !!(tun->flags & IFF_MULTI_QUEUE)))
                goto nla_put_failure;
        if (tun->flags & IFF_MULTI_QUEUE) {
                if (nla_put_u32(skb, IFLA_TUN_NUM_QUEUES, tun->numqueues))
                        goto nla_put_failure;
                if (nla_put_u32(skb, IFLA_TUN_NUM_DISABLED_QUEUES,
                                tun->numdisabled))
                        goto nla_put_failure;
        }

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

/* rtnetlink link operations for tun/tap devices. */
static struct rtnl_link_ops tun_link_ops __read_mostly = {
        .kind      = DRV_NAME,
        .priv_size = sizeof(struct tun_struct),
        .setup     = tun_setup,
        .validate  = tun_validate,
        .get_size  = tun_get_size,
        .fill_info = tun_fill_info,
};

/*
 * Socket write-space callback.
 *
 * Does nothing unless the socket is writeable and SOCKWQ_ASYNC_NOSPACE was
 * set (the bit is cleared here). Otherwise wakes pollers waiting for
 * writability and sends SIGIO/POLL_OUT to fasync listeners.
 */
static void tun_sock_write_space(struct sock *sk)
{
        wait_queue_head_t *waiters;
        struct tun_file *tfile;

        if (!sock_writeable(sk) ||
            !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
                return;

        waiters = sk_sleep(sk);
        if (waiters && waitqueue_active(waiters))
                wake_up_interruptible_sync_poll(waiters, EPOLLOUT |
                                                EPOLLWRNORM | EPOLLWRBAND);

        tfile = container_of(sk, struct tun_file, sk);
        kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
}

/* Drop the tpage->count references accumulated on tpage->page, if a page is set. */
static void tun_put_page(struct tun_page *tpage)
{
        if (!tpage->page)
                return;

        __page_frag_cache_drain(tpage->page, tpage->count);
}

static int tun_xdp_one(struct tun_struct *tun,
                       struct tun_file *tfile,
                       struct xdp_buff *xdp, int *flush,
                       struct tun_page *tpage)
{
        unsigned int datasize = xdp->data_end - xdp->data;
        struct tun_xdp_hdr *hdr = xdp->data_hard_start;
        struct virtio_net_hdr *gso = &hdr->gso;
        struct bpf_prog *xdp_prog;
        struct sk_buff *skb = NULL;
        struct sk_buff_head *queue;
        u32 rxhash = 0, act;
        int buflen = hdr->buflen;
        int ret = 0;
        bool skb_xdp = false;
        struct page *page;

        xdp_prog = rcu_dereference(tun->xdp_prog);
        if (xdp_prog) {
                if (gso->gso_type) {
                        skb_xdp = true;
                        goto build;
                }

                xdp_init_buff(xdp, buflen, &tfile->xdp_rxq);
                xdp_set_data_meta_invalid(xdp);

                act = bpf_prog_run_xdp(xdp_prog, xdp);
                ret = tun_xdp_act(tun, xdp_prog, xdp, act);
                if (ret < 0) {
                        put_page(virt_to_head_page(xdp->data));
                        return ret;
                }

                switch (ret) {
                case XDP_REDIRECT:
                        *flush = true;
                        fallthrough;
                case XDP_TX:
                        return 0;
                case XDP_PASS:
                        break;
                default:
                        page = virt_to_head_page(xdp->data);
                        if (tpage->page == page) {
                                ++tpage->count;
                        } else {
                                tun_put_page(tpage);
                                tpage->page = page;
                                tpage->count = 1;
                        }
                        return 0;
                }
        }

build:
        skb = build_skb(xdp->data_hard_start, buflen);
        if (!skb) {
                ret = -ENOMEM;
                goto out;
        }

        skb_reserve(skb, xdp->data - xdp->data_hard_start);
        skb_put(skb, xdp->data_end - xdp->data);

        if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
                atomic_long_inc(&tun->rx_frame_errors);
                kfree_skb(skb);
                ret = -EINVAL;
                goto out;
        }

        skb->protocol = eth_type_trans(skb, tun->dev);
        skb_reset_network_header(skb);
        skb_probe_transport_header(skb);
        skb_record_rx_queue(skb, tfile->queue_index);

        if (skb_xdp) {
                ret = do_xdp_generic(xdp_prog, &skb);
                if (ret != XDP_PASS) {
                        ret = 0;
                        goto out;
                }
        }

        if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 &&
            !tfile->detached)
                rxhash = __skb_get_hash_symmetric(skb);

        if (tfile->napi_enabled) {
                queue = &tfile->sk.sk_write_queue;
                spin_lock(&queue->lock);

                if (unlikely(tfile->detached)) {
                        spin_unlock(&queue->lock);
                        kfree_skb(skb);
                        return -EBUSY;
                }

                __skb_queue_tail(queue, skb);
                spin_unlock(&queue->lock);
                ret = 1;
        } else {
                netif_receive_skb(skb);
                ret = 0;
        }

        /* No need to disable preemption here since this function is
         * always called with bh disabled
         */
        dev_sw_netstats_rx_add(tun->dev, datasize);

        if (rxhash)
                tun_flow_update(tun, rxhash, tfile);

out:
        return ret;
}

/*
 * sendmsg handler for the tun socket.
 *
 * Two input shapes are accepted:
 *  - a struct tun_msg_ctl of type TUN_MSG_PTR in msg_control, carrying an
 *    array of ctl->num xdp_buffs that are fed to tun_xdp_one() one by one;
 *  - anything else, which is passed to tun_get_user() together with the
 *    message iterator.
 *
 * Returns @total_len for the batch path (individual frame failures are not
 * reported), the result of tun_get_user() otherwise, or -EBADFD when the
 * socket has no tun device behind it.
 */
static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
        struct tun_file *tfile = container_of(sock, struct tun_file, socket);
        struct tun_struct *tun = tun_get(tfile);
        struct tun_msg_ctl *ctl = m->msg_control;
        struct tun_page tpage;
        int flush = 0, queued = 0;
        int ret, i, n;

        if (!tun)
                return -EBADFD;

        /* Not an XDP batch: take the regular copy-from-iterator path. */
        if (m->msg_controllen != sizeof(struct tun_msg_ctl) ||
            !ctl || ctl->type != TUN_MSG_PTR) {
                ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL,
                                   &m->msg_iter,
                                   m->msg_flags & MSG_DONTWAIT,
                                   m->msg_flags & MSG_MORE);
                goto out;
        }

        n = ctl->num;
        memset(&tpage, 0, sizeof(tpage));

        local_bh_disable();
        rcu_read_lock();

        for (i = 0; i < n; i++) {
                struct xdp_buff *xdp = &((struct xdp_buff *)ctl->ptr)[i];
                int one = tun_xdp_one(tun, tfile, xdp, &flush, &tpage);

                /* Positive results count skbs queued for NAPI. */
                if (one > 0)
                        queued += one;
        }

        if (flush)
                xdp_do_flush();

        if (tfile->napi_enabled && queued > 0)
                napi_schedule(&tfile->napi);

        rcu_read_unlock();
        local_bh_enable();

        /* Release page references batched up by tun_xdp_one(). */
        tun_put_page(&tpage);

        ret = total_len;
out:
        tun_put(tun);
        return ret;
}

/*
 * recvmsg handler for the tun socket.
 *
 * msg_control may carry an entry the caller already dequeued; it is handed to
 * tun_do_read(), or freed here when the call fails before reaching it
 * (no device, unsupported flags).
 *
 * With MSG_ERRQUEUE the socket error queue is read instead. When the packet
 * is longer than @total_len, MSG_TRUNC is set in msg_flags and the return
 * value is @total_len unless the caller passed MSG_TRUNC, in which case the
 * full length is returned.
 */
static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
                       int flags)
{
        struct tun_file *tfile = container_of(sock, struct tun_file, socket);
        struct tun_struct *tun = tun_get(tfile);
        void *ptr = m->msg_control;
        int ret;

        if (!tun) {
                tun_ptr_free(ptr);
                return -EBADFD;
        }

        if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
                tun_put(tun);
                tun_ptr_free(ptr);
                return -EINVAL;
        }

        if (flags & MSG_ERRQUEUE) {
                ret = sock_recv_errqueue(sock->sk, m, total_len,
                                         SOL_PACKET, TUN_TX_TIMESTAMP);
        } else {
                ret = tun_do_read(tun, tfile, &m->msg_iter,
                                  flags & MSG_DONTWAIT, ptr);
                if (ret > (ssize_t)total_len) {
                        m->msg_flags |= MSG_TRUNC;
                        if (!(flags & MSG_TRUNC))
                                ret = total_len;
                }
        }

        tun_put(tun);
        return ret;
}

/*
 * Length of a ring entry without consuming it: the frame length for an XDP
 * frame, __skb_array_len_with_tag() for an skb, 0 for a NULL entry.
 */
static int tun_ptr_peek_len(void *ptr)
{
        struct xdp_frame *frame;

        if (unlikely(!ptr))
                return 0;

        if (!tun_is_xdp_frame(ptr))
                return __skb_array_len_with_tag(ptr);

        frame = tun_ptr_to_xdp(ptr);
        return frame->len;
}

/*
 * peek_len socket op: length of the entry at the head of the tx ring, or 0
 * when the socket has no tun device behind it.
 */
static int tun_peek_len(struct socket *sock)
{
        struct tun_file *tfile = container_of(sock, struct tun_file, socket);
        struct tun_struct *tun = tun_get(tfile);
        int len;

        if (!tun)
                return 0;

        len = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
        tun_put(tun);

        return len;
}

/* Ops structure to mimic raw sockets with tun: only peek_len, sendmsg and
 * recvmsg are provided.
 */
static const struct proto_ops tun_socket_ops = {
        .peek_len       = tun_peek_len,
        .sendmsg        = tun_sendmsg,
        .recvmsg        = tun_recvmsg,
};

/* Protocol descriptor for tun sockets; each sock is sized as a struct tun_file. */
static struct proto tun_proto = {
        .name     = "tun",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct tun_file),
};

/* Subset of tun->flags reported to userspace: feature bits, persistence and device type. */
static int tun_flags(struct tun_struct *tun)
{
        return tun->flags &
               (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP);
}

/* sysfs "tun_flags" attribute: the value of tun_flags() in hexadecimal. */
static ssize_t tun_flags_show(struct device *dev, struct device_attribute *attr,
                              char *buf)
{
        struct net_device *netdev = to_net_dev(dev);
        struct tun_struct *tun = netdev_priv(netdev);

        return sysfs_emit(buf, "0x%x\n", tun_flags(tun));
}

/*
 * sysfs "owner" attribute: the owning uid as seen from the reader's user
 * namespace, or "-1" when no valid owner is set.
 */
static ssize_t owner_show(struct device *dev, struct device_attribute *attr,
                          char *buf)
{
        struct tun_struct *tun = netdev_priv(to_net_dev(dev));

        if (!uid_valid(tun->owner))
                return sysfs_emit(buf, "-1\n");

        return sysfs_emit(buf, "%u\n",
                          from_kuid_munged(current_user_ns(), tun->owner));
}

/*
 * sysfs "group" attribute: the owning gid as seen from the reader's user
 * namespace, or "-1" when no valid group is set.
 */
static ssize_t group_show(struct device *dev, struct device_attribute *attr,
                          char *buf)
{
        struct tun_struct *tun = netdev_priv(to_net_dev(dev));

        if (!gid_valid(tun->group))
                return sysfs_emit(buf, "-1\n");

        return sysfs_emit(buf, "%u\n",
                          from_kgid_munged(current_user_ns(), tun->group));
}

/* Read-only sysfs attributes, backed by the *_show() handlers above. */
static DEVICE_ATTR_RO(tun_flags);
static DEVICE_ATTR_RO(owner);
static DEVICE_ATTR_RO(group);

/* NULL-terminated attribute list exposed through tun_attr_group. */
static struct attribute *tun_dev_attrs[] = {
        &dev_attr_tun_flags.attr,
        &dev_attr_owner.attr,
        &dev_attr_group.attr,
        NULL,
};

static const struct attribute_group tun_attr_group = {
        .attrs = tun_dev_attrs,
};

/*
 * Bind @file to the tun/tap device named in @ifr, creating the device when
 * no interface of that name exists in @net.
 *
 * Existing device: the requested type (IFF_TUN/IFF_TAP) and multi-queue
 * setting must match, the caller must pass the ownership and security
 * checks, and the file is attached as a queue. Feature flags are only
 * rewritten when this is the first queue of the device.
 *
 * New device: requires CAP_NET_ADMIN in the namespace's user namespace; a
 * net_device is allocated, initialised, registered and then published in
 * tfile->tun.
 *
 * On success the resolved device name is copied back into ifr->ifr_name and
 * 0 is returned; otherwise a negative errno.
 */
static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
{
        struct tun_struct *tun;
        struct tun_file *tfile = file->private_data;
        struct net_device *dev;
        int err;

        /* A detached queue cannot be bound to a device this way */
        if (tfile->detached)
                return -EINVAL;

        /* NAPI frags mode is privileged and only valid for TAP with IFF_NAPI */
        if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) {
                if (!capable(CAP_NET_ADMIN))
                        return -EPERM;

                if (!(ifr->ifr_flags & IFF_NAPI) ||
                    (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP)
                        return -EINVAL;
        }

        dev = __dev_get_by_name(net, ifr->ifr_name);
        if (dev) {
                /* The caller asked for exclusive creation but the name is taken */
                if (ifr->ifr_flags & IFF_TUN_EXCL)
                        return -EBUSY;
                /* The existing device must be one of ours and of the requested type */
                if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
                        tun = netdev_priv(dev);
                else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
                        tun = netdev_priv(dev);
                else
                        return -EINVAL;

                /* Multi-queue mode cannot be changed on an existing device */
                if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=
                    !!(tun->flags & IFF_MULTI_QUEUE))
                        return -EINVAL;

                if (tun_not_capable(tun))
                        return -EPERM;
                err = security_tun_dev_open(tun->security);
                if (err < 0)
                        return err;

                err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
                                 ifr->ifr_flags & IFF_NAPI,
                                 ifr->ifr_flags & IFF_NAPI_FRAGS, true);
                if (err < 0)
                        return err;

                if (tun->flags & IFF_MULTI_QUEUE &&
                    (tun->numqueues + tun->numdisabled > 1)) {
                        /* One or more queue has already been attached, no need
                         * to initialize the device again.
                         */
                        netdev_state_change(dev);
                        return 0;
                }

                /* First queue: replace the feature bits with the requested ones */
                tun->flags = (tun->flags & ~TUN_FEATURES) |
                              (ifr->ifr_flags & TUN_FEATURES);

                netdev_state_change(dev);
        } else {
                char *name;
                unsigned long flags = 0;
                /* Multi-queue devices get MAX_TAP_QUEUES tx/rx queues, others one */
                int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
                             MAX_TAP_QUEUES : 1;

                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        return -EPERM;
                err = security_tun_dev_create();
                if (err < 0)
                        return err;

                /* Set dev type */
                if (ifr->ifr_flags & IFF_TUN) {
                        /* TUN device */
                        flags |= IFF_TUN;
                        name = "tun%d";
                } else if (ifr->ifr_flags & IFF_TAP) {
                        /* TAP device */
                        flags |= IFF_TAP;
                        name = "tap%d";
                } else
                        return -EINVAL;

                /* A caller-supplied name overrides the "tun%d"/"tap%d" template */
                if (*ifr->ifr_name)
                        name = ifr->ifr_name;

                dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
                                       NET_NAME_UNKNOWN, tun_setup, queues,
                                       queues);

                if (!dev)
                        return -ENOMEM;

                dev_net_set(dev, net);
                dev->rtnl_link_ops = &tun_link_ops;
                dev->ifindex = tfile->ifindex;
                dev->sysfs_groups[0] = &tun_attr_group;

                tun = netdev_priv(dev);
                tun->dev = dev;
                tun->flags = flags;
                tun->txflt.count = 0;
                tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);

                tun->align = NET_SKB_PAD;
                tun->filter_attached = false;
                tun->sndbuf = tfile->socket.sk->sk_sndbuf;
                tun->rx_batched = 0;
                RCU_INIT_POINTER(tun->steering_prog, NULL);

                /* NOTE(review): ifr and file are stashed on the device before
                 * tun_net_initialize()/register_netdevice(); presumably they
                 * are consumed during registration - confirm against the
                 * ndo_init implementation.
                 */
                tun->ifr = ifr;
                tun->file = file;

                tun_net_initialize(dev);

                err = register_netdevice(tun->dev);
                if (err < 0) {
                        free_netdev(dev);
                        return err;
                }
                /* free_netdev() won't check refcnt, to avoid race
                 * with dev_put() we need publish tun after registration.
                 */
                rcu_assign_pointer(tfile->tun, tun);
        }

        /* Carrier state follows the IFF_NO_CARRIER request flag */
        if (ifr->ifr_flags & IFF_NO_CARRIER)
                netif_carrier_off(tun->dev);
        else
                netif_carrier_on(tun->dev);

        /* Make sure persistent devices do not get stuck in
         * xoff state.
         */
        if (netif_running(tun->dev))
                netif_tx_wake_all_queues(tun->dev);

        /* Report the final device name (template expanded) back to the caller */
        strcpy(ifr->ifr_name, tun->dev->name);
        return 0;
}

static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr)
{
        strcpy(ifr->ifr_name, tun->dev->name);

        ifr->ifr_flags = tun_flags(tun);

}

/* This is like a cut-down ethtool ops, except done via tun fd so no
 * privs required. */
static int set_offload(struct tun_struct *tun, unsigned long arg)
{
        netdev_features_t features = 0;

        if (arg & TUN_F_CSUM) {
                features |= NETIF_F_HW_CSUM;
                arg &= ~TUN_F_CSUM;

                if (arg & (TUN_F_TSO4|TUN_F_TSO6)) {
                        if (arg & TUN_F_TSO_ECN) {
                                features |= NETIF_F_TSO_ECN;
                                arg &= ~TUN_F_TSO_ECN;
                        }
                        if (arg & TUN_F_TSO4)
                                features |= NETIF_F_TSO;
                        if (arg & TUN_F_TSO6)
                                features |= NETIF_F_TSO6;
                        arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
                }

                arg &= ~TUN_F_UFO;

                /* TODO: for now USO4 and USO6 should work simultaneously */
                if (arg & TUN_F_USO4 && arg & TUN_F_USO6) {
                        features |= NETIF_F_GSO_UDP_L4;
                        arg &= ~(TUN_F_USO4 | TUN_F_USO6);
                }
        }

        /* This gives the user a way to test for new features in future by
         * trying to set them. */
        if (arg)
                return -EINVAL;

        tun->set_features = features;
        tun->dev->wanted_features &= ~TUN_USER_FEATURES;
        tun->dev->wanted_features |= features;
        netdev_update_features(tun->dev);

        return 0;
}

/*
 * Detach the socket filter from the first @n queues of @tun and mark the
 * device as having no filter attached.
 */
static void tun_detach_filter(struct tun_struct *tun, int n)
{
        int q;

        for (q = 0; q < n; q++) {
                struct tun_file *tfile = rtnl_dereference(tun->tfiles[q]);
                struct sock *sk = tfile->socket.sk;

                lock_sock(sk);
                sk_detach_filter(sk);
                release_sock(sk);
        }

        tun->filter_attached = false;
}

/*
 * Attach tun->fprog to the socket of every active queue.
 *
 * If one attach fails, the filters already attached to the preceding queues
 * are removed again and the error is returned. Returns 0 on success.
 */
static int tun_attach_filter(struct tun_struct *tun)
{
        int q;

        for (q = 0; q < tun->numqueues; q++) {
                struct tun_file *tfile = rtnl_dereference(tun->tfiles[q]);
                struct sock *sk = tfile->socket.sk;
                int err;

                lock_sock(sk);
                err = sk_attach_filter(&tun->fprog, sk);
                release_sock(sk);

                if (err) {
                        /* Roll back queues [0, q) */
                        tun_detach_filter(tun, q);
                        return err;
                }
        }

        tun->filter_attached = true;
        return 0;
}

/* Propagate tun->sndbuf to the socket of every active queue. */
static void tun_set_sndbuf(struct tun_struct *tun)
{
        int q;

        for (q = 0; q < tun->numqueues; q++) {
                struct tun_file *tfile = rtnl_dereference(tun->tfiles[q]);

                tfile->socket.sk->sk_sndbuf = tun->sndbuf;
        }
}

/*
 * Attach or detach @file as a queue, depending on ifr->ifr_flags.
 *
 * IFF_ATTACH_QUEUE re-attaches a previously detached file to the device it
 * was detached from. IFF_DETACH_QUEUE detaches an attached file from a
 * multi-queue device without cleaning it up. Anything else is -EINVAL.
 *
 * Runs under the RTNL lock and signals a netdev state change on success.
 */
static int tun_set_queue(struct file *file, struct ifreq *ifr)
{
        struct tun_file *tfile = file->private_data;
        struct tun_struct *tun;
        int ret = 0;

        rtnl_lock();

        if (ifr->ifr_flags & IFF_ATTACH_QUEUE) {
                /* tfile->detached holds the device of a detached file */
                tun = tfile->detached;
                if (!tun) {
                        ret = -EINVAL;
                        goto out_unlock;
                }

                ret = security_tun_dev_attach_queue(tun->security);
                if (ret < 0)
                        goto out_unlock;

                ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
                                 tun->flags & IFF_NAPI_FRAGS, true);
        } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
                tun = rtnl_dereference(tfile->tun);
                if (tun && (tun->flags & IFF_MULTI_QUEUE) && !tfile->detached)
                        __tun_detach(tfile, false);
                else
                        ret = -EINVAL;
        } else {
                ret = -EINVAL;
        }

        /* tun is only dereferenced here on paths that set it to non-NULL */
        if (ret >= 0)
                netdev_state_change(tun->dev);

out_unlock:
        rtnl_unlock();
        return ret;
}

/*
 * Install or remove an eBPF program in *@prog_p from a user-supplied fd.
 *
 * @data points to an int in user memory: -1 removes the current program, any
 * other value must be the fd of a BPF_PROG_TYPE_SOCKET_FILTER program.
 *
 * Returns -EFAULT if the fd cannot be read, the error from
 * bpf_prog_get_type(), or the result of __tun_set_ebpf().
 */
static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
                        void __user *data)
{
        struct bpf_prog *prog = NULL;
        int fd;

        if (copy_from_user(&fd, data, sizeof(fd)))
                return -EFAULT;

        if (fd != -1) {
                prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
                if (IS_ERR(prog))
                        return PTR_ERR(prog);
        }

        return __tun_set_ebpf(tun, prog_p, prog);
}

/* Return correct value for tun->dev->addr_len based on tun->dev->type.
 * Types without a hardware address, and any type not listed, yield 0.
 */
static unsigned char tun_get_addr_len(unsigned short type)
{
        switch (type) {
        case ARPHRD_ETHER:
                return ETH_ALEN;

        /* IPv4 tunnel flavours: a 4-byte address */
        case ARPHRD_IPGRE:
        case ARPHRD_TUNNEL:
        case ARPHRD_SIT:
                return 4;

        /* IPv6 tunnel flavours */
        case ARPHRD_IP6GRE:
        case ARPHRD_TUNNEL6:
                return sizeof(struct in6_addr);

        case ARPHRD_IEEE802154:
        case ARPHRD_IEEE802154_MONITOR:
                return IEEE802154_EXTENDED_ADDR_LEN;

        case ARPHRD_6LOWPAN:
                return EUI64_ADDR_LEN;

        case ARPHRD_FDDI:
                return FDDI_K_ALEN;

        case ARPHRD_HIPPI:
                return HIPPI_ALEN;

        case ARPHRD_IEEE802:
                return FC_ALEN;

        case ARPHRD_ROSE:
                return ROSE_ADDR_LEN;

        case ARPHRD_NETROM:
                return AX25_ADDR_LEN;

        case ARPHRD_LOCALTLK:
                return LTALK_ALEN;

        case ARPHRD_PHONET_PIPE:
        case ARPHRD_PPP:
        case ARPHRD_NONE:
        default:
                return 0;
        }
}

/*
 * Common ioctl implementation shared by tun_chr_ioctl() and
 * tun_chr_compat_ioctl().
 *
 * @file:      the open tun character device file
 * @cmd:       ioctl command
 * @arg:       ioctl argument; used either as a plain value or as a user
 *             pointer, depending on @cmd
 * @ifreq_len: number of bytes of struct ifreq to copy from/to user space
 *             (the callers pass the native or the compat structure size)
 *
 * TUNGETFEATURES, TUNSETQUEUE and SIOCGSKNS are answered before rtnl_lock()
 * is taken.  Everything else runs under rtnl_lock().  TUNSETIFF and
 * TUNSETIFINDEX require that no device is attached to the file yet; all
 * remaining commands require an attached device and fail with -EBADFD
 * otherwise.
 *
 * Returns the result of the selected command, a negative errno on failure.
 */
static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
                            unsigned long arg, int ifreq_len)
{
        struct tun_file *tfile = file->private_data;
        struct net *net = sock_net(&tfile->sk);
        struct tun_struct *tun;
        void __user* argp = (void __user*)arg;
        unsigned int carrier;
        struct ifreq ifr;
        kuid_t owner;
        kgid_t group;
        int ifindex;
        int sndbuf;
        int vnet_hdr_sz;
        int le;
        int ret;
        bool do_notify = false;

        /* Commands that carry a struct ifreq get it copied in up front;
         * for all others ifr starts out zeroed.
         */
        if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
            (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
                if (copy_from_user(&ifr, argp, ifreq_len))
                        return -EFAULT;
        } else {
                memset(&ifr, 0, sizeof(ifr));
        }
        if (cmd == TUNGETFEATURES) {
                /* Currently this just means: "what IFF flags are valid?".
                 * This is needed because we never checked for invalid flags on
                 * TUNSETIFF.
                 */
                return put_user(IFF_TUN | IFF_TAP | IFF_NO_CARRIER |
                                TUN_FEATURES, (unsigned int __user*)argp);
        } else if (cmd == TUNSETQUEUE) {
                return tun_set_queue(file, &ifr);
        } else if (cmd == SIOCGSKNS) {
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        return -EPERM;
                return open_related_ns(&net->ns, get_net_ns);
        }

        rtnl_lock();

        /* tun is NULL when no device is attached; a non-NULL result is
         * released with tun_put() at the unlock label.
         */
        tun = tun_get(tfile);
        if (cmd == TUNSETIFF) {
                /* Creating/attaching is refused once a device is attached. */
                ret = -EEXIST;
                if (tun)
                        goto unlock;

                ifr.ifr_name[IFNAMSIZ-1] = '\0';

                ret = tun_set_iff(net, file, &ifr);

                if (ret)
                        goto unlock;

                if (copy_to_user(argp, &ifr, ifreq_len))
                        ret = -EFAULT;
                goto unlock;
        }
        if (cmd == TUNSETIFINDEX) {
                /* The index can only be recorded before a device is attached. */
                ret = -EPERM;
                if (tun)
                        goto unlock;

                ret = -EFAULT;
                if (copy_from_user(&ifindex, argp, sizeof(ifindex)))
                        goto unlock;
                ret = -EINVAL;
                if (ifindex < 0)
                        goto unlock;
                ret = 0;
                tfile->ifindex = ifindex;
                goto unlock;
        }

        /* Every command below operates on an attached device. */
        ret = -EBADFD;
        if (!tun)
                goto unlock;

        netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd);

        /* From here on use the namespace of the device itself. */
        net = dev_net(tun->dev);
        ret = 0;
        switch (cmd) {
        case TUNGETIFF:
                tun_get_iff(tun, &ifr);

                if (tfile->detached)
                        ifr.ifr_flags |= IFF_DETACH_QUEUE;
                if (!tfile->socket.sk->sk_filter)
                        ifr.ifr_flags |= IFF_NOFILTER;

                if (copy_to_user(argp, &ifr, ifreq_len))
                        ret = -EFAULT;
                break;

        case TUNSETNOCSUM:
                /* Disable/Enable checksum */

                /* [unimplemented] */
                netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n",
                           arg ? "disabled" : "enabled");
                break;

        case TUNSETPERSIST:
                /* Disable/Enable persist mode. Keep an extra reference to the
                 * module to prevent the module being unprobed.
                 */
                if (arg && !(tun->flags & IFF_PERSIST)) {
                        tun->flags |= IFF_PERSIST;
                        __module_get(THIS_MODULE);
                        do_notify = true;
                }
                if (!arg && (tun->flags & IFF_PERSIST)) {
                        tun->flags &= ~IFF_PERSIST;
                        module_put(THIS_MODULE);
                        do_notify = true;
                }

                netif_info(tun, drv, tun->dev, "persist %s\n",
                           arg ? "enabled" : "disabled");
                break;

        case TUNSETOWNER:
                /* Set owner of the device */
                owner = make_kuid(current_user_ns(), arg);
                if (!uid_valid(owner)) {
                        ret = -EINVAL;
                        break;
                }
                tun->owner = owner;
                do_notify = true;
                netif_info(tun, drv, tun->dev, "owner set to %u\n",
                           from_kuid(&init_user_ns, tun->owner));
                break;

        case TUNSETGROUP:
                /* Set group of the device */
                group = make_kgid(current_user_ns(), arg);
                if (!gid_valid(group)) {
                        ret = -EINVAL;
                        break;
                }
                tun->group = group;
                do_notify = true;
                netif_info(tun, drv, tun->dev, "group set to %u\n",
                           from_kgid(&init_user_ns, tun->group));
                break;

        case TUNSETLINK:
                /* Only allow setting the type when the interface is down */
                if (tun->dev->flags & IFF_UP) {
                        netif_info(tun, drv, tun->dev,
                                   "Linktype set failed because interface is up\n");
                        ret = -EBUSY;
                } else {
                        /* Notifier chain members may veto the type change. */
                        ret = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE,
                                                       tun->dev);
                        ret = notifier_to_errno(ret);
                        if (ret) {
                                netif_info(tun, drv, tun->dev,
                                           "Refused to change device type\n");
                                break;
                        }
                        tun->dev->type = (int) arg;
                        tun->dev->addr_len = tun_get_addr_len(tun->dev->type);
                        netif_info(tun, drv, tun->dev, "linktype set to %d\n",
                                   tun->dev->type);
                        call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE,
                                                 tun->dev);
                }
                break;

        case TUNSETDEBUG:
                tun->msg_enable = (u32)arg;
                break;

        case TUNSETOFFLOAD:
                ret = set_offload(tun, arg);
                break;

        case TUNSETTXFILTER:
                /* Can be set only for TAPs */
                ret = -EINVAL;
                if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
                        break;
                ret = update_filter(&tun->txflt, (void __user *)arg);
                break;

        case SIOCGIFHWADDR:
                /* Get hw address */
                /* NOTE(review): the return value of dev_get_mac_address() is
                 * not checked here; ifr is copied out regardless.
                 */
                dev_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name);
                if (copy_to_user(argp, &ifr, ifreq_len))
                        ret = -EFAULT;
                break;

        case SIOCSIFHWADDR:
                /* Set hw address */
                ret = dev_set_mac_address_user(tun->dev, &ifr.ifr_hwaddr, NULL);
                break;

        case TUNGETSNDBUF:
                sndbuf = tfile->socket.sk->sk_sndbuf;
                if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
                        ret = -EFAULT;
                break;

        case TUNSETSNDBUF:
                if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
                        ret = -EFAULT;
                        break;
                }
                /* Only strictly positive send buffer sizes are accepted. */
                if (sndbuf <= 0) {
                        ret = -EINVAL;
                        break;
                }

                tun->sndbuf = sndbuf;
                tun_set_sndbuf(tun);
                break;

        case TUNGETVNETHDRSZ:
                vnet_hdr_sz = tun->vnet_hdr_sz;
                if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz)))
                        ret = -EFAULT;
                break;

        case TUNSETVNETHDRSZ:
                if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) {
                        ret = -EFAULT;
                        break;
                }
                /* The header may not be smaller than struct virtio_net_hdr. */
                if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) {
                        ret = -EINVAL;
                        break;
                }

                tun->vnet_hdr_sz = vnet_hdr_sz;
                break;

        case TUNGETVNETLE:
                le = !!(tun->flags & TUN_VNET_LE);
                if (put_user(le, (int __user *)argp))
                        ret = -EFAULT;
                break;

        case TUNSETVNETLE:
                if (get_user(le, (int __user *)argp)) {
                        ret = -EFAULT;
                        break;
                }
                if (le)
                        tun->flags |= TUN_VNET_LE;
                else
                        tun->flags &= ~TUN_VNET_LE;
                break;

        case TUNGETVNETBE:
                ret = tun_get_vnet_be(tun, argp);
                break;

        case TUNSETVNETBE:
                ret = tun_set_vnet_be(tun, argp);
                break;

        case TUNATTACHFILTER:
                /* Can be set only for TAPs */
                ret = -EINVAL;
                if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
                        break;
                ret = -EFAULT;
                if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog)))
                        break;

                ret = tun_attach_filter(tun);
                break;

        case TUNDETACHFILTER:
                /* Can be set only for TAPs */
                ret = -EINVAL;
                if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
                        break;
                ret = 0;
                tun_detach_filter(tun, tun->numqueues);
                break;

        case TUNGETFILTER:
                /* Like the other filter commands, only valid for TAPs. */
                ret = -EINVAL;
                if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
                        break;
                ret = -EFAULT;
                if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog)))
                        break;
                ret = 0;
                break;

        case TUNSETSTEERINGEBPF:
                ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
                break;

        case TUNSETFILTEREBPF:
                ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
                break;

        case TUNSETCARRIER:
                ret = -EFAULT;
                if (copy_from_user(&carrier, argp, sizeof(carrier)))
                        goto unlock;

                ret = tun_net_change_carrier(tun->dev, (bool)carrier);
                break;

        case TUNGETDEVNETNS:
                ret = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto unlock;
                ret = open_related_ns(&net->ns, get_net_ns);
                break;

        default:
                ret = -EINVAL;
                break;
        }

        /* Set by the persist/owner/group cases when they changed state. */
        if (do_notify)
                netdev_state_change(tun->dev);

unlock:
        rtnl_unlock();
        if (tun)
                tun_put(tun);
        return ret;
}

/*
 * Native ioctl entry point: forwards to __tun_chr_ioctl() with the size of
 * the native struct ifreq.
 */
static long tun_chr_ioctl(struct file *file,
                          unsigned int cmd, unsigned long arg)
{
        const int native_ifreq_len = sizeof(struct ifreq);

        return __tun_chr_ioctl(file, cmd, arg, native_ifreq_len);
}

#ifdef CONFIG_COMPAT
static long tun_chr_compat_ioctl(struct file *file,
                         unsigned int cmd, unsigned long arg)
{
        switch (cmd) {
        case TUNSETIFF:
        case TUNGETIFF:
        case TUNSETTXFILTER:
        case TUNGETSNDBUF:
        case TUNSETSNDBUF:
        case SIOCGIFHWADDR:
        case SIOCSIFHWADDR:
                arg = (unsigned long)compat_ptr(arg);
                break;
        default:
                arg = (compat_ulong_t)arg;
                break;
        }

        /*
         * compat_ifreq is shorter than ifreq, so we must not access beyond
         * the end of that structure. All fields that are used in this
         * driver are compatible though, we don't need to convert the
         * contents.
         */
        return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq));
}
#endif /* CONFIG_COMPAT */

/*
 * fasync handler: register or unregister the file through fasync_helper()
 * and mirror the result in the TUN_FASYNC bit of tfile->flags.  When
 * enabling, the calling task's thread group is made the owner of the file
 * via __f_setown().
 *
 * Returns 0 on success or the negative value reported by fasync_helper().
 */
static int tun_chr_fasync(int fd, struct file *file, int on)
{
        struct tun_file *tfile = file->private_data;
        int ret;

        ret = fasync_helper(fd, file, on, &tfile->fasync);
        if (ret < 0)
                return ret;

        if (!on) {
                tfile->flags &= ~TUN_FASYNC;
                return 0;
        }

        __f_setown(file, task_pid(current), PIDTYPE_TGID, 0);
        tfile->flags |= TUN_FASYNC;
        return 0;
}

static int tun_chr_open(struct inode *inode, struct file * file)
{
        struct net *net = current->nsproxy->net_ns;
        struct tun_file *tfile;

        tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
                                            &tun_proto, 0);
        if (!tfile)
                return -ENOMEM;
        if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) {
                sk_free(&tfile->sk);
                return -ENOMEM;
        }

        mutex_init(&tfile->napi_mutex);
        RCU_INIT_POINTER(tfile->tun, NULL);
        tfile->flags = 0;
        tfile->ifindex = 0;

        init_waitqueue_head(&tfile->socket.wq.wait);

        tfile->socket.file = file;
        tfile->socket.ops = &tun_socket_ops;

        sock_init_data_uid(&tfile->socket, &tfile->sk, current_fsuid());

        tfile->sk.sk_write_space = tun_sock_write_space;
        tfile->sk.sk_sndbuf = INT_MAX;

        file->private_data = tfile;
        INIT_LIST_HEAD(&tfile->next);

        sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);

        /* tun groks IOCB_NOWAIT just fine, mark it as such */
        file->f_mode |= FMODE_NOWAIT;
        return 0;
}

/*
 * release() handler: hand the tun_file stored in file->private_data to
 * tun_detach() with its second argument set to true.  Always returns 0.
 */
static int tun_chr_close(struct inode *inode, struct file *file)
{
        tun_detach(file->private_data, true);
        return 0;
}

#ifdef CONFIG_PROC_FS
/*
 * show_fdinfo handler: print an "iff:" line with the interface name of the
 * attached device.  If no device is attached the name is printed empty,
 * because ifr is zeroed beforehand and left untouched.
 */
static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file)
{
        struct tun_file *tfile = file->private_data;
        struct tun_struct *tun;
        struct ifreq ifr;

        memset(&ifr, 0, sizeof(ifr));

        rtnl_lock();
        tun = tun_get(tfile);
        if (tun) {
                tun_get_iff(tun, &ifr);
                rtnl_unlock();
                /* Reference is dropped only after rtnl has been released. */
                tun_put(tun);
        } else {
                rtnl_unlock();
        }

        seq_printf(m, "iff:\t%s\n", ifr.ifr_name);
}
#endif

/* File operations of the tun character device. */
static const struct file_operations tun_fops = {
        .owner          = THIS_MODULE,

        /* lifetime */
        .open           = tun_chr_open,
        .release        = tun_chr_close,

        /* data path */
        .llseek         = no_llseek,
        .read_iter      = tun_chr_read_iter,
        .write_iter     = tun_chr_write_iter,
        .poll           = tun_chr_poll,
        .fasync         = tun_chr_fasync,

        /* control path */
        .unlocked_ioctl = tun_chr_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = tun_chr_compat_ioctl,
#endif
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = tun_chr_show_fdinfo,
#endif
};

/* Misc device description: minor TUN_MINOR, node "net/tun", served by tun_fops. */
static struct miscdevice tun_miscdev = {
        .minor          = TUN_MINOR,
        .name           = "tun",
        .nodename       = "net/tun",
        .fops           = &tun_fops,
};

/* ethtool interface */

/*
 * Fill @cmd with the default link settings: empty supported/advertising
 * link modes, SPEED_10000, full duplex, twisted-pair port, PHY address 0
 * and autonegotiation disabled.  @dev is not used.
 */
static void tun_default_link_ksettings(struct net_device *dev,
                                       struct ethtool_link_ksettings *cmd)
{
        ethtool_link_ksettings_zero_link_mode(cmd, supported);
        ethtool_link_ksettings_zero_link_mode(cmd, advertising);

        cmd->base.speed = SPEED_10000;
        cmd->base.duplex = DUPLEX_FULL;
        cmd->base.autoneg = AUTONEG_DISABLE;
        cmd->base.port = PORT_TP;
        cmd->base.phy_address = 0;
}

/* ethtool get_link_ksettings: copy the settings stored in the tun device to @cmd. */
static int tun_get_link_ksettings(struct net_device *dev,
                                  struct ethtool_link_ksettings *cmd)
{
        struct tun_struct *priv = netdev_priv(dev);

        memcpy(cmd, &priv->link_ksettings, sizeof(*cmd));

        return 0;
}

/* ethtool set_link_ksettings: store @cmd verbatim in the tun device. */
static int tun_set_link_ksettings(struct net_device *dev,
                                  const struct ethtool_link_ksettings *cmd)
{
        struct tun_struct *priv = netdev_priv(dev);

        memcpy(&priv->link_ksettings, cmd, sizeof(*cmd));

        return 0;
}

/*
 * ethtool get_drvinfo: report driver name and version, and "tun" or "tap"
 * as bus_info depending on the device type bits in tun->flags.  bus_info is
 * left untouched for any other type value.
 */
static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
        struct tun_struct *tun = netdev_priv(dev);
        unsigned int type = tun->flags & TUN_TYPE_MASK;

        strscpy(info->driver, DRV_NAME, sizeof(info->driver));
        strscpy(info->version, DRV_VERSION, sizeof(info->version));

        if (type == IFF_TUN)
                strscpy(info->bus_info, "tun", sizeof(info->bus_info));
        else if (type == IFF_TAP)
                strscpy(info->bus_info, "tap", sizeof(info->bus_info));
}

static u32 tun_get_msglevel(struct net_device *dev)
{
        struct tun_struct *tun = netdev_priv(dev);

        return tun->msg_enable;
}

/* ethtool set_msglevel: store @value as the device's msg_enable mask. */
static void tun_set_msglevel(struct net_device *dev, u32 value)
{
        struct tun_struct *priv;

        priv = netdev_priv(dev);
        priv->msg_enable = value;
}

static int tun_get_coalesce(struct net_device *dev,
                            struct ethtool_coalesce *ec,
                            struct kernel_ethtool_coalesce *kernel_coal,
                            struct netlink_ext_ack *extack)
{
        struct tun_struct *tun = netdev_priv(dev);

        ec->rx_max_coalesced_frames = tun->rx_batched;

        return 0;
}

/*
 * ethtool set_coalesce: take rx_max_coalesced_frames as the new
 * tun->rx_batched, capped at NAPI_POLL_WEIGHT.  @kernel_coal and @extack
 * are not used.  Always returns 0.
 */
static int tun_set_coalesce(struct net_device *dev,
                            struct ethtool_coalesce *ec,
                            struct kernel_ethtool_coalesce *kernel_coal,
                            struct netlink_ext_ack *extack)
{
        struct tun_struct *priv = netdev_priv(dev);

        priv->rx_batched = ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT ?
                           NAPI_POLL_WEIGHT : ec->rx_max_coalesced_frames;

        return 0;
}

/*
 * ethtool get_channels: the combined count is the current number of queues;
 * the maximum is MAX_TAP_QUEUES for a multi-queue device and 1 otherwise.
 */
static void tun_get_channels(struct net_device *dev,
                             struct ethtool_channels *channels)
{
        struct tun_struct *priv = netdev_priv(dev);

        channels->combined_count = priv->numqueues;

        if (priv->flags & IFF_MULTI_QUEUE)
                channels->max_combined = MAX_TAP_QUEUES;
        else
                channels->max_combined = 1;
}

/* ethtool operations implemented by the tun driver. */
static const struct ethtool_ops tun_ethtool_ops = {
        .supported_coalesce_params      = ETHTOOL_COALESCE_RX_MAX_FRAMES,

        .get_drvinfo                    = tun_get_drvinfo,
        .get_link                       = ethtool_op_get_link,
        .get_ts_info                    = ethtool_op_get_ts_info,
        .get_channels                   = tun_get_channels,

        .get_msglevel                   = tun_get_msglevel,
        .set_msglevel                   = tun_set_msglevel,

        .get_coalesce                   = tun_get_coalesce,
        .set_coalesce                   = tun_set_coalesce,

        .get_link_ksettings             = tun_get_link_ksettings,
        .set_link_ksettings             = tun_set_link_ksettings,
};

/*
 * Resize the tx ring of every queue of @tun, both the active ones in
 * tun->tfiles[] and those on the tun->disabled list, to the device's
 * current tx_queue_len.
 *
 * Returns -ENOMEM if the temporary ring array cannot be allocated,
 * otherwise the result of ptr_ring_resize_multiple().
 */
static int tun_queue_resize(struct tun_struct *tun)
{
        int total = tun->numqueues + tun->numdisabled;
        struct ptr_ring **rings;
        struct tun_file *tfile;
        int filled = 0;
        int ret, q;

        rings = kmalloc_array(total, sizeof(*rings), GFP_KERNEL);
        if (!rings)
                return -ENOMEM;

        /* Active queues first ... */
        for (q = 0; q < tun->numqueues; q++) {
                tfile = rtnl_dereference(tun->tfiles[q]);
                rings[filled++] = &tfile->tx_ring;
        }

        /* ... followed by the disabled ones. */
        list_for_each_entry(tfile, &tun->disabled, next)
                rings[filled++] = &tfile->tx_ring;

        ret = ptr_ring_resize_multiple(rings, total,
                                       tun->dev->tx_queue_len, GFP_KERNEL,
                                       tun_ptr_free);

        kfree(rings);
        return ret;
}

static int tun_device_event(struct notifier_block *unused,
                            unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct tun_struct *tun = netdev_priv(dev);
        int i;

        if (dev->rtnl_link_ops != &tun_link_ops)
                return NOTIFY_DONE;

        switch (event) {
        case NETDEV_CHANGE_TX_QUEUE_LEN:
                if (tun_queue_resize(tun))
                        return NOTIFY_BAD;
                break;
        case NETDEV_UP:
                for (i = 0; i < tun->numqueues; i++) {
                        struct tun_file *tfile;

                        tfile = rtnl_dereference(tun->tfiles[i]);
                        tfile->socket.sk->sk_write_space(tfile->socket.sk);
                }
                break;
        default:
                break;
        }

        return NOTIFY_DONE;
}

/* Notifier block that routes netdevice events to tun_device_event(). */
static struct notifier_block tun_notifier_block __read_mostly = {
        .notifier_call        = tun_device_event,
};

/*
 * Module init: register the rtnl link ops, the misc device and the
 * netdevice notifier, in that order.  If a step fails, the steps already
 * completed are undone in reverse order and the error is returned.
 */
static int __init tun_init(void)
{
        int ret;

        pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);

        ret = rtnl_link_register(&tun_link_ops);
        if (ret) {
                pr_err("Can't register link_ops\n");
                return ret;
        }

        ret = misc_register(&tun_miscdev);
        if (ret) {
                pr_err("Can't register misc device %d\n", TUN_MINOR);
                rtnl_link_unregister(&tun_link_ops);
                return ret;
        }

        ret = register_netdevice_notifier(&tun_notifier_block);
        if (ret) {
                pr_err("Can't register netdevice notifier\n");
                misc_deregister(&tun_miscdev);
                rtnl_link_unregister(&tun_link_ops);
                return ret;
        }

        return 0;
}

/*
 * Module exit: remove the misc device, the rtnl link ops and the netdevice
 * notifier that were registered at module init.
 */
static void __exit tun_cleanup(void)
{
        misc_deregister(&tun_miscdev);
        rtnl_link_unregister(&tun_link_ops);
        unregister_netdevice_notifier(&tun_notifier_block);
}

/*
 * Return the socket embedded in a tun file.
 *
 * Yields ERR_PTR(-EINVAL) when @file is not a tun file (its f_op is not
 * tun_fops) and ERR_PTR(-EBADFD) when it carries no private data.  The
 * returned object works like a packet socket and can be used for
 * sock_sendmsg/sock_recvmsg.  The caller is responsible for holding a
 * reference to the file for as long as the socket is in use.
 */
struct socket *tun_get_socket(struct file *file)
{
        if (file->f_op == &tun_fops) {
                struct tun_file *tfile = file->private_data;

                if (tfile)
                        return &tfile->socket;
                return ERR_PTR(-EBADFD);
        }

        return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(tun_get_socket);

/*
 * Return the tx ring embedded in a tun file.
 *
 * Yields ERR_PTR(-EINVAL) when @file is not a tun file (its f_op is not
 * tun_fops) and ERR_PTR(-EBADFD) when it carries no private data.
 */
struct ptr_ring *tun_get_tx_ring(struct file *file)
{
        if (file->f_op == &tun_fops) {
                struct tun_file *tfile = file->private_data;

                if (tfile)
                        return &tfile->tx_ring;
                return ERR_PTR(-EBADFD);
        }

        return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(tun_get_tx_ring);

/* Module entry/exit hooks, metadata and the aliases for the tun misc device. */
module_init(tun_init);
module_exit(tun_cleanup);
MODULE_DESCRIPTION(DRV_DESCRIPTION);
MODULE_AUTHOR(DRV_COPYRIGHT);
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(TUN_MINOR);
MODULE_ALIAS("devname:net/tun");


































































































































































































































































































































    2 









    2 
























































































































    2 









    2 































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/truncate.c - code for taking down pages from address_spaces
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 10Sep2002        Andrew Morton
 *                Initial version.
 */

#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include "internal.h"

/*
 * Regular page slots are stabilized by the page lock even without the tree
 * itself locked.  These unlocked entries need verification under the tree
 * lock: the slot at @index is only cleared if it still holds @entry.
 */
static inline void __clear_shadow_entry(struct address_space *mapping,
                                pgoff_t index, void *entry)
{
        XA_STATE(xas, &mapping->i_pages, index);

        xas_set_update(&xas, workingset_update_node);
        if (xas_load(&xas) == entry)
                xas_store(&xas, NULL);
}

/*
 * Locked wrapper around __clear_shadow_entry(): takes the host inode's
 * i_lock and the i_pages xarray lock (with interrupts disabled), clears the
 * entry, and puts the inode on the LRU if the mapping is shrinkable
 * afterwards.
 */
static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
                               void *entry)
{
        struct inode *host = mapping->host;

        spin_lock(&host->i_lock);

        xa_lock_irq(&mapping->i_pages);
        __clear_shadow_entry(mapping, index, entry);
        xa_unlock_irq(&mapping->i_pages);

        if (mapping_shrinkable(mapping))
                inode_add_lru(host);

        spin_unlock(&host->i_lock);
}

/*
 * Unconditionally remove exceptional entries. Usually called from truncate
 * path. Note that the folio_batch may be altered by this function by removing
 * exceptional entries similar to what folio_batch_remove_exceptionals() does.
 *
 * On return (for non-shmem mappings that contained a value entry) the batch
 * holds only the non-value folios, in their original order, and fbatch->nr
 * is the count of those.
 */
static void truncate_folio_batch_exceptionals(struct address_space *mapping,
                                struct folio_batch *fbatch, pgoff_t *indices)
{
        int src, keep;
        bool dax;

        /* Handled by shmem itself */
        if (shmem_mapping(mapping))
                return;

        /* Skip the leading run of ordinary folios; they stay in place. */
        keep = 0;
        while (keep < folio_batch_count(fbatch) &&
               !xa_is_value(fbatch->folios[keep]))
                keep++;

        /* Nothing exceptional in the batch. */
        if (keep == folio_batch_count(fbatch))
                return;

        dax = dax_mapping(mapping);
        if (!dax) {
                spin_lock(&mapping->host->i_lock);
                xa_lock_irq(&mapping->i_pages);
        }

        for (src = keep; src < folio_batch_count(fbatch); src++) {
                struct folio *folio = fbatch->folios[src];

                if (!xa_is_value(folio))
                        /* Compact ordinary folios towards the front. */
                        fbatch->folios[keep++] = folio;
                else if (unlikely(dax))
                        dax_delete_mapping_entry(mapping, indices[src]);
                else
                        __clear_shadow_entry(mapping, indices[src], folio);
        }

        if (!dax) {
                xa_unlock_irq(&mapping->i_pages);
                if (mapping_shrinkable(mapping))
                        inode_add_lru(mapping->host);
                spin_unlock(&mapping->host->i_lock);
        }

        fbatch->nr = keep;
}

/*
 * Invalidate exceptional entry if easily possible. This handles exceptional
 * entries for invalidate_inode_pages().  Always returns 1.
 */
static int invalidate_exceptional_entry(struct address_space *mapping,
                                        pgoff_t index, void *entry)
{
        /* shmem handles these itself, and for DAX we do nothing. */
        if (!shmem_mapping(mapping) && !dax_mapping(mapping))
                clear_shadow_entry(mapping, index, entry);

        return 1;
}

/*
 * Exceptional-entry handling for invalidate_inode_pages2(): the entry is
 * invalidated if it is clean, so for DAX only clean entries are evicted.
 *
 * Returns 1 for shmem (nothing done here) and after clearing a shadow entry;
 * for DAX mappings the result of dax_invalidate_mapping_entry_sync() is
 * passed through.
 */
static int invalidate_exceptional_entry2(struct address_space *mapping,
                                         pgoff_t index, void *entry)
{
        /* shmem handles its own entries */
        if (shmem_mapping(mapping))
                return 1;

        if (!dax_mapping(mapping)) {
                clear_shadow_entry(mapping, index, entry);
                return 1;
        }

        return dax_invalidate_mapping_entry_sync(mapping, index);
}

/**
 * folio_invalidate - Invalidate part or all of a folio.
 * @folio: The folio which is affected.
 * @offset: start of the range to invalidate
 * @length: length of the range to invalidate
 *
 * Called when a truncate operation has invalidated all or part of @folio.
 * The work is delegated to the mapping's ->invalidate_folio() operation;
 * if the address space does not provide one, nothing happens.
 *
 * folio_invalidate() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point, because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void folio_invalidate(struct folio *folio, size_t offset, size_t length)
{
        const struct address_space_operations *aops = folio->mapping->a_ops;

        /* The operation is optional */
        if (!aops->invalidate_folio)
                return;

        aops->invalidate_folio(folio, offset, length);
}
EXPORT_SYMBOL_GPL(folio_invalidate);

/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes orphaned.  It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_fault().
 *
 * We need to bail out if page->mapping is no longer equal to the original
 * mapping.  This happens a) when the VM reclaimed the page while we waited on
 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
 */
static void truncate_cleanup_folio(struct folio *folio)
{
        if (folio_mapped(folio))
                unmap_mapping_folio(folio);

        if (folio_has_private(folio))
                folio_invalidate(folio, 0, folio_size(folio));

        /*
         * Some filesystems seem to re-dirty the page even after
         * the VM has canceled the dirty bit (eg ext3 journaling).
         * Hence dirty accounting check is placed after invalidation.
         */
        folio_cancel_dirty(folio);
        folio_clear_mappedtodisk(folio);
}

/*
 * Remove @folio from @mapping's page cache.
 *
 * Returns 0 once the folio has been cleaned up and removed, or -EIO if the
 * folio no longer belongs to @mapping (in which case it is left untouched).
 */
int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
{
        if (folio->mapping == mapping) {
                truncate_cleanup_folio(folio);
                filemap_remove_folio(folio);
                return 0;
        }

        return -EIO;
}

/*
 * Truncate a folio that may be only partly covered by [start, end].
 *
 * A racing split can leave the folio entirely inside the range; in that
 * case it is simply truncated as a whole.  Otherwise the covered part is
 * zeroed and, for a large folio, a split is attempted.  The split is
 * expected to discard pages which now lie beyond i_size; pages inside a
 * newly created hole are left for the caller to discard.
 *
 * Returns false only when the split failed and the folio is dirty, so the
 * caller can avoid discarding the entire folio which is stubbornly unsplit.
 * In every other case returns true.
 */
bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
{
        loff_t pos = folio_pos(folio);
        unsigned int offset = (pos < start) ? start - pos : 0;
        unsigned int length = folio_size(folio);

        /* Clamp the byte count to the part of the folio inside the range */
        if (pos + length > (u64)end)
                length = end + 1 - pos - offset;
        else
                length -= offset;

        folio_wait_writeback(folio);

        /* The whole folio is covered: drop it outright */
        if (length == folio_size(folio)) {
                truncate_inode_folio(folio->mapping, folio);
                return true;
        }

        /*
         * Some of the pages zeroed here may be discarded shortly, but
         * zeroing unconditionally avoids a complex calculation and the
         * zeroing would be needed anyway should the split fail.
         */
        folio_zero_range(folio, offset, length);

        if (folio_has_private(folio))
                folio_invalidate(folio, offset, length);

        /* Small folio, or a large one that split cleanly: done */
        if (!folio_test_large(folio) || !split_folio(folio))
                return true;

        /* Unsplit and dirty: must not be thrown away, tell the caller */
        if (folio_test_dirty(folio))
                return false;

        truncate_inode_folio(folio->mapping, folio);
        return true;
}

/*
 * Used to get rid of pages on hardware memory corruption.
 *
 * Returns -EINVAL when @mapping is NULL, -EIO when the mapping's host is not
 * a regular file, and otherwise whatever truncate_inode_folio() returns.
 */
int generic_error_remove_folio(struct address_space *mapping,
                struct folio *folio)
{
        if (!mapping)
                return -EINVAL;

        /*
         * For now only normal data pages are punched out; other types such
         * as directories would need more auditing.
         */
        if (S_ISREG(mapping->host->i_mode))
                return truncate_inode_folio(mapping, folio);

        return -EIO;
}
EXPORT_SYMBOL(generic_error_remove_folio);

/**
 * mapping_evict_folio() - Remove an unused folio from the page-cache.
 * @mapping: The mapping this folio belongs to.
 * @folio: The folio to remove.
 *
 * Safely remove one folio from the page cache.  Only clean, unused folios
 * are dropped: a folio that is dirty, under writeback, holds extra
 * references or whose private data cannot be released is left in place.
 *
 * Context: Folio must be locked.
 * Return: The number of pages successfully removed.
 */
long mapping_evict_folio(struct address_space *mapping, struct folio *folio)
{
        /* Truncation may have got to the page before it was locked */
        if (!mapping)
                return 0;

        /* Neither dirty data nor in-flight writeback may be thrown away */
        if (folio_test_dirty(folio))
                return 0;
        if (folio_test_writeback(folio))
                return 0;

        /* The refcount will be elevated if any page in the folio is mapped */
        if (folio_ref_count(folio) >
                        folio_nr_pages(folio) + folio_has_private(folio) + 1)
                return 0;

        if (filemap_release_folio(folio, 0))
                return remove_mapping(mapping, folio);

        return 0;
}

/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate (inclusive); -1 means end-of-file
 *
 * Truncate the page cache, removing the pages that are between
 * specified offsets (and zeroing out partial pages
 * if lstart or lend + 1 is not page aligned).
 *
 * Truncate takes two passes - the first pass is nonblocking.  It will not
 * block on page locks and it will not block on writeback.  The second pass
 * will wait.  This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * We pass down the cache-hot hint to the page freeing code.  Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 *
 * Note that since ->invalidate_folio() accepts range to invalidate
 * truncate_inode_pages_range is able to handle cases where lend + 1 is not
 * page aligned properly.
 */
void truncate_inode_pages_range(struct address_space *mapping,
                                loff_t lstart, loff_t lend)
{
        pgoff_t                start;                /* inclusive */
        pgoff_t                end;                /* exclusive */
        struct folio_batch fbatch;
        pgoff_t                indices[PAGEVEC_SIZE];
        pgoff_t                index;
        int                i;
        struct folio        *folio;
        bool                same_folio;

        /* Nothing cached at all: nothing to truncate */
        if (mapping_empty(mapping))
                return;

        /*
         * 'start' and 'end' always cover the range of pages to be fully
         * truncated: 'start' is lstart rounded up to a page boundary and
         * 'end' is lend + 1 rounded down.  Partially covered folios at
         * either edge of the range are handled separately further down
         * via truncate_inode_partial_folio().
         * Note that 'end' is exclusive while 'lend' is inclusive.
         */
        start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (lend == -1)
                /*
                 * lend == -1 indicates end-of-file so we have to set 'end'
                 * to the highest possible pgoff_t and since the type is
                 * unsigned we're using -1.
                 */
                end = -1;
        else
                end = (lend + 1) >> PAGE_SHIFT;

        /*
         * First pass: take whatever find_lock_entries() hands back in
         * batches, strip the exceptional entries, clean up the remaining
         * folios and delete them from the page cache in one go.
         */
        folio_batch_init(&fbatch);
        index = start;
        while (index < end && find_lock_entries(mapping, &index, end - 1,
                        &fbatch, indices)) {
                truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
                for (i = 0; i < folio_batch_count(&fbatch); i++)
                        truncate_cleanup_folio(fbatch.folios[i]);
                delete_from_page_cache_batch(mapping, &fbatch);
                for (i = 0; i < folio_batch_count(&fbatch); i++)
                        folio_unlock(fbatch.folios[i]);
                folio_batch_release(&fbatch);
                cond_resched();
        }

        /*
         * Handle the folio containing lstart.  same_folio starts out as a
         * page-index comparison and is refined once the folio is known,
         * because a large folio can span both lstart and lend.
         */
        same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
        folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0);
        if (!IS_ERR(folio)) {
                same_folio = lend < folio_pos(folio) + folio_size(folio);
                if (!truncate_inode_partial_folio(folio, lstart, lend)) {
                        /*
                         * The folio could not be dealt with: shrink the
                         * [start, end) window so the second pass below
                         * leaves it alone.
                         */
                        start = folio_next_index(folio);
                        if (same_folio)
                                end = folio->index;
                }
                folio_unlock(folio);
                folio_put(folio);
                folio = NULL;
        }

        /* Handle the folio containing lend, unless it was the one above */
        if (!same_folio) {
                folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT,
                                                FGP_LOCK, 0);
                if (!IS_ERR(folio)) {
                        if (!truncate_inode_partial_folio(folio, lstart, lend))
                                end = folio->index;
                        folio_unlock(folio);
                        folio_put(folio);
                }
        }

        /*
         * Second pass: this one waits for folio locks and for writeback,
         * and rescans from 'start' until a lookup from 'start' finds
         * nothing left.
         */
        index = start;
        while (index < end) {
                cond_resched();
                if (!find_get_entries(mapping, &index, end - 1, &fbatch,
                                indices)) {
                        /* If all gone from start onwards, we're done */
                        if (index == start)
                                break;
                        /* Otherwise restart to make sure all gone */
                        index = start;
                        continue;
                }

                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];

                        /* We rely upon deletion not changing page->index */

                        /* Exceptional entries are removed after this loop */
                        if (xa_is_value(folio))
                                continue;

                        folio_lock(folio);
                        VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
                        folio_wait_writeback(folio);
                        truncate_inode_folio(mapping, folio);
                        folio_unlock(folio);
                }
                truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
                folio_batch_release(&fbatch);
        }
}
EXPORT_SYMBOL(truncate_inode_pages_range);

/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Thin wrapper around truncate_inode_pages_range() that passes -1 as the
 * end offset, which that function treats as end-of-file.
 *
 * Called under (and serialised by) inode->i_rwsem and
 * mapping->invalidate_lock.
 *
 * Note: When this function returns, there can be a page in the process of
 * deletion (inside __filemap_remove_folio()) in the specified range.  Thus
 * mapping->nrpages can be non-zero when this function returns even after
 * truncation of the whole mapping.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
        truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);

/**
 * truncate_inode_pages_final - truncate *all* pages before inode dies
 * @mapping: mapping to truncate
 *
 * Marks @mapping as exiting and then truncates it from offset 0.
 *
 * Called under (and serialized by) inode->i_rwsem.
 *
 * Filesystems have to use this in the .evict_inode path to inform the
 * VM that this is the final truncate and the inode is going away.
 */
void truncate_inode_pages_final(struct address_space *mapping)
{
        /*
         * Page reclaim can not participate in regular inode lifetime
         * management (can't call iput()) and thus can race with the
         * inode teardown.  Tell it when the address space is exiting,
         * so that it does not install eviction information after the
         * final truncate has begun.
         */
        mapping_set_exiting(mapping);

        if (!mapping_empty(mapping)) {
                /*
                 * As truncation uses a lockless tree lookup, cycle
                 * the tree lock to make sure any ongoing tree
                 * modification that does not see AS_EXITING is
                 * completed before starting the final truncate.
                 * The lock is taken and released immediately; nothing
                 * is done while it is held.
                 */
                xa_lock_irq(&mapping->i_pages);
                xa_unlock_irq(&mapping->i_pages);
        }

        truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(truncate_inode_pages_final);

/**
 * mapping_try_invalidate - Invalidate all the evictable folios of one inode
 * @mapping: the address_space which holds the folios to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 * @nr_failed: How many folio invalidations failed (may be NULL)
 *
 * This function is similar to invalidate_mapping_pages(), except that it
 * additionally counts, in @nr_failed, the folios which could not be evicted.
 *
 * Return: the accumulated results of invalidate_exceptional_entry() and
 * mapping_evict_folio() over the range.
 */
unsigned long mapping_try_invalidate(struct address_space *mapping,
                pgoff_t start, pgoff_t end, unsigned long *nr_failed)
{
        struct folio_batch fbatch;
        pgoff_t indices[PAGEVEC_SIZE];
        pgoff_t next = start;
        unsigned long invalidated = 0;

        folio_batch_init(&fbatch);
        while (find_lock_entries(mapping, &next, end, &fbatch, indices)) {
                int slot;

                for (slot = 0; slot < folio_batch_count(&fbatch); slot++) {
                        struct folio *folio = fbatch.folios[slot];
                        unsigned long evicted;

                        /* We rely upon deletion not changing folio->index */

                        if (xa_is_value(folio)) {
                                invalidated += invalidate_exceptional_entry(
                                                mapping, indices[slot], folio);
                                continue;
                        }

                        evicted = mapping_evict_folio(mapping, folio);
                        folio_unlock(folio);

                        if (!evicted) {
                                /*
                                 * Invalidation hints that the folio is no
                                 * longer of interest, so try to speed up its
                                 * reclaim.
                                 */
                                deactivate_file_folio(folio);
                                /* Likely in the lru cache of a remote CPU */
                                if (nr_failed)
                                        ++*nr_failed;
                        }
                        invalidated += evicted;
                }

                folio_batch_remove_exceptionals(&fbatch);
                folio_batch_release(&fbatch);
                cond_resched();
        }

        return invalidated;
}

/**
 * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode
 * @mapping: the address_space which holds the cache to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 *
 * This function removes pages that are clean, unmapped and unlocked,
 * as well as shadow entries. It will not block on IO activity.
 *
 * Implemented as mapping_try_invalidate() with a NULL failure counter, so
 * folios that could not be evicted are not counted anywhere.
 *
 * If you want to remove all the pages of one inode, regardless of
 * their use and writeback state, use truncate_inode_pages().
 *
 * Return: The number of indices that had their contents invalidated
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t end)
{
        return mapping_try_invalidate(mapping, start, end, NULL);
}
EXPORT_SYMBOL(invalidate_mapping_pages);

/*
 * A variant of mapping_evict_folio() that does not look at the folio's
 * refcount.  invalidate_inode_pages2() needs stronger invalidation
 * guarantees and cannot afford to leave folios behind just because
 * shrink_page_list() has a temp ref on them, or because they're transiently
 * sitting in the folio_add_lru() caches.
 *
 * Returns 1 if the folio was removed from @mapping and freed, 0 if it no
 * longer belongs to @mapping, could not be released, or is dirty.
 */
static int invalidate_complete_folio2(struct address_space *mapping,
                                        struct folio *folio)
{
        if (folio->mapping != mapping)
                return 0;

        if (!filemap_release_folio(folio, GFP_KERNEL))
                return 0;

        spin_lock(&mapping->host->i_lock);
        xa_lock_irq(&mapping->i_pages);

        /* A dirty folio must stay: back out, dropping both locks */
        if (folio_test_dirty(folio)) {
                xa_unlock_irq(&mapping->i_pages);
                spin_unlock(&mapping->host->i_lock);
                return 0;
        }

        BUG_ON(folio_has_private(folio));
        __filemap_remove_folio(folio, NULL);
        xa_unlock_irq(&mapping->i_pages);

        if (mapping_shrinkable(mapping))
                inode_add_lru(mapping->host);
        spin_unlock(&mapping->host->i_lock);

        filemap_free_folio(mapping, folio);
        return 1;
}

/*
 * Hand a dirty folio to the mapping's ->launder_folio() operation.
 *
 * Returns 0 without calling anything when the folio is clean, no longer
 * belongs to @mapping, or the operation is not provided; otherwise returns
 * what ->launder_folio() returned.
 */
static int folio_launder(struct address_space *mapping, struct folio *folio)
{
        if (folio_test_dirty(folio) && folio->mapping == mapping &&
            mapping->a_ops->launder_folio != NULL)
                return mapping->a_ops->launder_folio(folio);

        return 0;
}

/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Return: 0 on success (including when the mapping is empty), -EBUSY if any
 * pages could not be invalidated, or a negative value returned by the
 * mapping's ->launder_folio() operation.  When several folios fail, the
 * status of the last failure is the one returned.
 */
int invalidate_inode_pages2_range(struct address_space *mapping,
                                  pgoff_t start, pgoff_t end)
{
        pgoff_t indices[PAGEVEC_SIZE];
        struct folio_batch fbatch;
        pgoff_t index;
        int i;
        int ret = 0;
        int ret2 = 0;
        int did_range_unmap = 0;

        if (mapping_empty(mapping))
                return 0;

        folio_batch_init(&fbatch);
        index = start;
        while (find_get_entries(mapping, &index, end, &fbatch, indices)) {
                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];

                        /* We rely upon deletion not changing folio->index */

                        /* Exceptional entry: a zero result means busy */
                        if (xa_is_value(folio)) {
                                if (!invalidate_exceptional_entry2(mapping,
                                                indices[i], folio))
                                        ret = -EBUSY;
                                continue;
                        }

                        /* The range-wide unmap is done at most once */
                        if (!did_range_unmap && folio_mapped(folio)) {
                                /*
                                 * If folio is mapped, before taking its lock,
                                 * zap the rest of the file in one hit.
                                 */
                                unmap_mapping_pages(mapping, indices[i],
                                                (1 + end - indices[i]), false);
                                did_range_unmap = 1;
                        }

                        folio_lock(folio);
                        /* Skip folios that left this mapping meanwhile */
                        if (unlikely(folio->mapping != mapping)) {
                                folio_unlock(folio);
                                continue;
                        }
                        VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
                        folio_wait_writeback(folio);

                        /* Unmap again in case it is (still or again) mapped */
                        if (folio_mapped(folio))
                                unmap_mapping_folio(folio);
                        BUG_ON(folio_mapped(folio));

                        /*
                         * Launder first; only if that reports 0 is removal
                         * attempted, and a failed removal counts as -EBUSY.
                         */
                        ret2 = folio_launder(mapping, folio);
                        if (ret2 == 0) {
                                if (!invalidate_complete_folio2(mapping, folio))
                                        ret2 = -EBUSY;
                        }
                        if (ret2 < 0)
                                ret = ret2;
                        folio_unlock(folio);
                }
                folio_batch_remove_exceptionals(&fbatch);
                folio_batch_release(&fbatch);
                cond_resched();
        }
        /*
         * For DAX we invalidate page tables after invalidating page cache.  We
         * could invalidate page tables while invalidating each entry however
         * that would be expensive. And doing range unmapping before doesn't
         * work as we have no cheap way to find whether page cache entry didn't
         * get remapped later.
         */
        if (dax_mapping(mapping)) {
                unmap_mapping_pages(mapping, start, end - start + 1, false);
        }
        return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);

/**
 * invalidate_inode_pages2 - remove all pages from an address_space
 * @mapping: the address_space
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Thin wrapper that calls invalidate_inode_pages2_range() with a start of 0
 * and an end of -1.
 *
 * Return: whatever invalidate_inode_pages2_range() returns; -EBUSY if any
 * pages could not be invalidated.
 */
int invalidate_inode_pages2(struct address_space *mapping)
{
        return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);

/**
 * truncate_pagecache - unmap and remove pagecache that has been truncated
 * @inode: inode
 * @newsize: new file size
 *
 * The inode's new i_size must already have been written before
 * truncate_pagecache is called.
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks). That way the pagecache always stays logically coherent with
 * the on-disk format, and the filesystem does not have to deal with
 * situations such as writepage being called for a page whose underlying
 * blocks have already been deallocated.
 */
void truncate_pagecache(struct inode *inode, loff_t newsize)
{
        /* Unmapping starts at the first page boundary at or after newsize */
        const loff_t hole_start = round_up(newsize, PAGE_SIZE);
        struct address_space *const mapping = inode->i_mapping;

        /*
         * unmap_mapping_range is called twice.  The first call is purely
         * an optimisation so that truncate_inode_pages does fewer
         * single-page unmaps.  Between that call and the end of
         * truncate_inode_pages, private pages can still be COWed and would
         * survive the truncation, hence the second call is required for
         * correctness.
         */
        unmap_mapping_range(mapping, hole_start, 0, 1);
        truncate_inode_pages(mapping, newsize);
        unmap_mapping_range(mapping, hole_start, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);

/**
 * truncate_setsize - update inode and pagecache for a new file size
 * @inode: inode
 * @newsize: new file size
 *
 * truncate_setsize updates i_size and performs pagecache truncation (if
 * necessary) to @newsize. It will typically be called from the filesystem's
 * setattr function when ATTR_SIZE is passed in.
 *
 * Must be called with a lock serializing truncates and writes (generally
 * i_rwsem but e.g. xfs uses a different lock) and before all filesystem
 * specific block truncation has been performed.
 */
void truncate_setsize(struct inode *inode, loff_t newsize)
{
        const loff_t prev_size = inode->i_size;

        i_size_write(inode, newsize);

        /* Growing the file: let the pagecache know about the extension */
        if (prev_size < newsize)
                pagecache_isize_extended(inode, prev_size, newsize);

        truncate_pagecache(inode, newsize);
}
EXPORT_SYMBOL(truncate_setsize);

/**
 * pagecache_isize_extended - update pagecache after extension of i_size
 * @inode:        inode for which i_size was extended
 * @from:        original inode size
 * @to:                new inode size
 *
 * Handle extension of inode size either caused by extending truncate or
 * by write starting after current i_size.  The page straddling the old
 * i_size is marked RO so that page_mkwrite() is called on the first
 * write access to the page.  The filesystem will update its per-block
 * information before user writes to the page via mmap after the i_size
 * has been changed.
 *
 * The function must be called after i_size is updated so that a page fault
 * coming after we unlock the folio will already see the new i_size.
 * The function must be called while we still hold i_rwsem - this not only
 * makes sure i_size is stable but also that userspace cannot observe the new
 * i_size value before we are prepared to store mmap writes at new inode size.
 */
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
{
        int bsize = i_blocksize(inode);
        loff_t rounded_from;
        struct folio *folio;

        WARN_ON(to > inode->i_size);

        /* Not an extension at all */
        if (from >= to)
                return;
        /* Nothing to do when the block size is at least a page */
        if (bsize >= PAGE_SIZE)
                return;

        /*
         * Will the page straddling @from have any hole block created?
         * Not if the new size stays within @from's block, nor if that
         * block ends exactly on a page boundary.
         */
        rounded_from = round_up(from, bsize);
        if (to <= rounded_from)
                return;
        if ((rounded_from & (PAGE_SIZE - 1)) == 0)
                return;

        folio = filemap_lock_folio(inode->i_mapping, from / PAGE_SIZE);
        /* Folio not cached? Nothing to do */
        if (IS_ERR(folio))
                return;

        /*
         * See folio_clear_dirty_for_io() for details why folio_mark_dirty()
         * is needed.
         */
        if (folio_mkclean(folio))
                folio_mark_dirty(folio);

        folio_unlock(folio);
        folio_put(folio);
}
EXPORT_SYMBOL(pagecache_isize_extended);

/**
 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
 * @inode: inode
 * @lstart: offset of beginning of hole
 * @lend: offset of last byte of hole
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks). That way the pagecache always stays logically coherent with
 * the on-disk format, and the filesystem does not have to deal with
 * situations such as writepage being called for a page whose underlying
 * blocks have already been deallocated.
 */
void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
{
        struct address_space *mapping = inode->i_mapping;
        /*
         * This rounding is currently just for example: unmap_mapping_range
         * expands its hole outwards, whereas we want it to contract the hole
         * inwards.  However, existing callers of truncate_pagecache_range are
         * doing their own page rounding first.  Note that unmap_mapping_range
         * allows holelen 0 for all, and we allow lend -1 for end of file.
         */
        loff_t first_byte = round_up(lstart, PAGE_SIZE);
        loff_t last_byte = round_down(1 + lend, PAGE_SIZE) - 1;

        /*
         * In contrast to truncate_pagecache, unmap_mapping_range is called
         * only once (before truncating pagecache) and without the
         * "even_cows" flag: hole-punching should not remove private COWed
         * pages from the hole.
         */
        if ((u64)first_byte < (u64)last_byte) {
                loff_t holelen = 1 + last_byte - first_byte;

                unmap_mapping_range(mapping, first_byte, holelen, 0);
        }

        truncate_inode_pages_range(mapping, lstart, lend);
}
EXPORT_SYMBOL(truncate_pagecache_range);






























































   23 




























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UNWIND_H
#define _ASM_X86_UNWIND_H

#include <linux/sched.h>
#include <linux/ftrace.h>
#include <linux/rethook.h>
#include <asm/ptrace.h>
#include <asm/stacktrace.h>

/*
 * IRET_FRAME_OFFSET is the byte offset of the 'ip' member inside struct
 * pt_regs; IRET_FRAME_SIZE is the number of bytes from 'ip' to the end of
 * the structure.
 * NOTE(review): the names suggest this tail of pt_regs is the hardware IRET
 * frame - confirm against the struct pt_regs definition.
 */
#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
#define IRET_FRAME_SIZE   (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)

/*
 * State carried across the steps of one stack unwind.
 *
 * The leading fields are always present.  The remainder of the layout is
 * selected at build time: one set of fields for the ORC unwinder, another
 * for the frame pointer unwinder, and a bare stack pointer otherwise.
 */
struct unwind_state {
        /* stack_info.type == STACK_TYPE_UNKNOWN marks the unwind as done */
        struct stack_info stack_info;
        unsigned long stack_mask;
        /* Task handed to ftrace/rethook when recovering return addresses */
        struct task_struct *task;
        /* Cursor passed by pointer to ftrace_graph_ret_addr() */
        int graph_idx;
#if defined(CONFIG_RETHOOK)
        /* Cursor passed by pointer to rethook_find_ret_addr() */
        struct llist_node *kr_cur;
#endif
        /* Reported by unwind_error() */
        bool error;
#if defined(CONFIG_UNWINDER_ORC)
        /* !full_regs is what unwind_get_entry_regs() reports as 'partial' */
        bool signal, full_regs;
        unsigned long sp, bp, ip;
        struct pt_regs *regs, *prev_regs;
#elif defined(CONFIG_UNWINDER_FRAME_POINTER)
        bool got_irq;
        unsigned long *bp, *orig_sp, ip;
        /*
         * If non-NULL: The current frame is incomplete and doesn't contain a
         * valid BP. When looking for the next frame, use this instead of the
         * non-existent saved BP.
         */
        unsigned long *next_bp;
        struct pt_regs *regs;
#else
        unsigned long *sp;
#endif
};

/*
 * Core unwinder entry points.  Only the declarations are visible in this
 * header; the definitions are provided elsewhere (presumably by whichever
 * unwinder is configured - confirm).
 */
void __unwind_start(struct unwind_state *state, struct task_struct *task,
                    struct pt_regs *regs, unsigned long *first_frame);
bool unwind_next_frame(struct unwind_state *state);
unsigned long unwind_get_return_address(struct unwind_state *state);
unsigned long *unwind_get_return_address_ptr(struct unwind_state *state);

/*
 * Returns true once the unwind has finished, which is signalled by the
 * state's stack type being STACK_TYPE_UNKNOWN.
 */
static inline bool unwind_done(struct unwind_state *state)
{
        return state->stack_info.type == STACK_TYPE_UNKNOWN;
}

/* Returns the 'error' flag recorded in the unwind state. */
static inline bool unwind_error(struct unwind_state *state)
{
        return state->error;
}

/*
 * Begin an unwind of @task.  When the caller passes a NULL @first_frame,
 * the value returned by get_stack_pointer(@task, @regs) is used instead.
 */
static inline
void unwind_start(struct unwind_state *state, struct task_struct *task,
                  struct pt_regs *regs, unsigned long *first_frame)
{
        if (!first_frame)
                first_frame = get_stack_pointer(task, regs);

        __unwind_start(state, task, regs, first_frame);
}

#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
/*
 * Return the pt_regs recorded in the unwind state, or NULL once the unwind
 * is done.  @partial is optional; if it is set to true on return, only the
 * iret frame registers are valid.  Only the ORC unwinder ever reports
 * partial registers.
 */
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
                                                    bool *partial)
{
        if (unwind_done(state))
                return NULL;

#ifdef CONFIG_UNWINDER_ORC
        if (partial)
                *partial = !state->full_regs;
#else
        if (partial)
                *partial = false;
#endif

        return state->regs;
}
#else
/* No unwinder that tracks entry registers is configured: always NULL. */
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
                                                    bool *partial)
{
        return NULL;
}
#endif

/*
 * unwind_init() and unwind_module_init() are real functions only when the
 * ORC unwinder is configured.  In every other configuration they are empty
 * inline stubs, so callers can invoke them unconditionally.
 */
#ifdef CONFIG_UNWINDER_ORC
void unwind_init(void);
void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
                        void *orc, size_t orc_size);
#else
static inline void unwind_init(void) {}
static inline
void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
                        void *orc, size_t orc_size) {}
#endif

/*
 * If @addr is the rethook trampoline, look up the return address via
 * rethook_find_ret_addr(); otherwise (or without CONFIG_RETHOOK) hand
 * @addr back unchanged.
 */
static inline
unsigned long unwind_recover_rethook(struct unwind_state *state,
                                     unsigned long addr, unsigned long *addr_p)
{
#ifdef CONFIG_RETHOOK
        if (!is_rethook_trampoline(addr))
                return addr;

        return rethook_find_ret_addr(state->task, (unsigned long)addr_p,
                                     &state->kr_cur);
#else
        return addr;
#endif
}

/*
 * Recover a return address that rethook or ftrace_graph may have modified:
 * the ftrace_graph lookup runs first and its result is then passed through
 * the rethook lookup.
 */
static inline
unsigned long unwind_recover_ret_addr(struct unwind_state *state,
                                     unsigned long addr, unsigned long *addr_p)
{
        unsigned long graph_addr = ftrace_graph_ret_addr(state->task,
                                                         &state->graph_idx,
                                                         addr, addr_p);

        return unwind_recover_rethook(state, graph_addr, addr_p);
}

/*
 * Read the value of x, which lives on the stack of @task, and evaluate to it
 * as an unsigned long.
 *
 * This disables KASAN checking when reading a value from another task's stack,
 * since the other task could be running on another CPU and could have poisoned
 * the stack in the meantime.  Reads for current use the plain READ_ONCE().
 *
 * The task argument is parenthesized so that any pointer expression can be
 * passed, and the temporary uses a name unlikely to collide with identifiers
 * appearing in x.
 */
#define READ_ONCE_TASK_STACK(task, x)                        \
({                                                        \
        unsigned long __val;                                \
        if ((task) == current)                                \
                __val = READ_ONCE(x);                        \
        else                                                \
                __val = READ_ONCE_NOCHECK(x);                \
        __val;                                                \
})

/*
 * True when @task is not current and its on_cpu field is set.  Always false
 * on !CONFIG_SMP builds.
 */
static inline bool task_on_another_cpu(struct task_struct *task)
{
#ifdef CONFIG_SMP
        if (task == current)
                return false;

        return task->on_cpu;
#else
        return false;
#endif
}

#endif /* _ASM_X86_UNWIND_H */


























































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_GFP_H
#define __LINUX_GFP_H

#include <linux/gfp_types.h>

#include <linux/mmzone.h>
#include <linux/topology.h>
#include <linux/alloc_tag.h>
#include <linux/sched.h>

struct vm_area_struct;
struct mempolicy;

/* Convert GFP flags to their corresponding migrate type */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
#define GFP_MOVABLE_SHIFT 3

static inline int gfp_migratetype(const gfp_t gfp_flags)
{
        VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
        BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
        BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);
        BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE);
        BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >>
                      GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC);

        if (unlikely(page_group_by_mobility_disabled))
                return MIGRATE_UNMOVABLE;

        /* Group based on mobility */
        return (__force unsigned long)(gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
}
#undef GFP_MOVABLE_MASK
#undef GFP_MOVABLE_SHIFT

/* True when @gfp_flags has __GFP_DIRECT_RECLAIM set. */
static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
{
        return (gfp_flags & __GFP_DIRECT_RECLAIM) != 0;
}

#ifdef CONFIG_HIGHMEM
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
#else
#define OPT_ZONE_HIGHMEM ZONE_NORMAL
#endif

#ifdef CONFIG_ZONE_DMA
#define OPT_ZONE_DMA ZONE_DMA
#else
#define OPT_ZONE_DMA ZONE_NORMAL
#endif

#ifdef CONFIG_ZONE_DMA32
#define OPT_ZONE_DMA32 ZONE_DMA32
#else
#define OPT_ZONE_DMA32 ZONE_NORMAL
#endif

/*
 * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
 * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT
 * bits long and there are 16 of them to cover all possible combinations of
 * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM.
 *
 * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
 * But GFP_MOVABLE is not only a zone specifier but also an allocation
 * policy. Therefore __GFP_MOVABLE plus another zone selector is valid.
 * Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1".
 *
 *       bit       result
 *       =================
 *       0x0    => NORMAL
 *       0x1    => DMA or NORMAL
 *       0x2    => HIGHMEM or NORMAL
 *       0x3    => BAD (DMA+HIGHMEM)
 *       0x4    => DMA32 or NORMAL
 *       0x5    => BAD (DMA+DMA32)
 *       0x6    => BAD (HIGHMEM+DMA32)
 *       0x7    => BAD (HIGHMEM+DMA32+DMA)
 *       0x8    => NORMAL (MOVABLE+0)
 *       0x9    => DMA or NORMAL (MOVABLE+DMA)
 *       0xa    => MOVABLE (Movable is valid only if HIGHMEM is set too)
 *       0xb    => BAD (MOVABLE+HIGHMEM+DMA)
 *       0xc    => DMA32 or NORMAL (MOVABLE+DMA32)
 *       0xd    => BAD (MOVABLE+DMA32+DMA)
 *       0xe    => BAD (MOVABLE+DMA32+HIGHMEM)
 *       0xf    => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
 *
 * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms.
 */

#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4
/* ZONE_DEVICE is not a valid GFP zone specifier */
#define GFP_ZONES_SHIFT 2
#else
#define GFP_ZONES_SHIFT ZONES_SHIFT
#endif

#if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG
#error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
#endif

/*
 * Each term places one zone number in the GFP_ZONES_SHIFT-wide slot selected
 * by a zone-flag combination.  Only the eight permitted combinations get an
 * explicit entry; the forbidden ones are listed in GFP_ZONE_BAD.
 */
#define GFP_ZONE_TABLE ( \
        (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT)                                       \
        | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT)                       \
        | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT)               \
        | (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT)                       \
        | (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT)                       \
        | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT)    \
        | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\
        | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\
)

/*
 * GFP_ZONE_BAD is a bitmap of the forbidden combinations of __GFP_DMA,
 * __GFP_DMA32, __GFP_HIGHMEM and __GFP_MOVABLE.  The combination's value is
 * the bit index, starting with bit 0; a set bit means the combination is
 * not allowed.
 */
#define GFP_ZONE_BAD ( \
        (1 << (___GFP_DMA | ___GFP_HIGHMEM))                                      \
        | (1 << (___GFP_DMA | ___GFP_DMA32))                                      \
        | (1 << (___GFP_DMA32 | ___GFP_HIGHMEM))                              \
        | (1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM))                      \
        | (1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA))                      \
        | (1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA))                      \
        | (1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM))              \
        | (1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM)) \
)

/*
 * Look up the zone for the zone-selector bits of @flags in GFP_ZONE_TABLE.
 * A combination flagged in GFP_ZONE_BAD trips VM_BUG_ON().
 */
static inline enum zone_type gfp_zone(gfp_t flags)
{
        const int bit = (__force int)(flags & GFP_ZONEMASK);
        const int entry_mask = (1 << GFP_ZONES_SHIFT) - 1;
        enum zone_type z;

        /* Shift the selected slot down to bit 0 and isolate it. */
        z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & entry_mask;
        VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);

        return z;
}

/*
 * There is only one page-allocator function, and two main namespaces to
 * it. The alloc_page*() variants return 'struct page *' and as such
 * can allocate highmem pages, the *get*page*() variants return
 * virtual kernel addresses to the allocated page(s).
 */

static inline int gfp_zonelist(gfp_t flags)
{
        /* The fallback list is the default; NUMA builds may override it. */
        int which = ZONELIST_FALLBACK;

#ifdef CONFIG_NUMA
        if (unlikely(flags & __GFP_THISNODE))
                which = ZONELIST_NOFALLBACK;
#endif
        return which;
}

/*
 * gfp flag masking for nested internal allocations.
 *
 * For code that needs to do allocations inside the public allocation API (e.g.
 * memory allocation tracking code) the allocations need to obey the caller
 * allocation context constraints to prevent allocation context mismatches (e.g.
 * GFP_KERNEL allocations in GFP_NOFS contexts) from potential deadlock
 * situations.
 *
 * It is also assumed that these nested allocations are for internal kernel
 * object storage purposes only and are not going to be used for DMA, etc. Hence
 * we strip out all the zone information and leave just the context information
 * intact.
 *
 * Further, internal allocations must fail before the higher level allocation
 * can fail, so we must make them fail faster and fail silently. We also don't
 * want them to deplete emergency reserves.  Hence nested allocations must be
 * prepared for these allocations to fail.
 */
static inline gfp_t gfp_nested_mask(gfp_t flags)
{
        /* Bits carried over from the caller's mask. */
        const gfp_t inherited = GFP_KERNEL | GFP_ATOMIC | __GFP_NOLOCKDEP;
        /* Bits always set on the nested mask. */
        const gfp_t forced = __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN;

        return (flags & inherited) | forced;
}

/*
 * We get the zone list from the current node and the gfp_mask.
 * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones.
 * There are two zonelists per node, one for all zones with memory and
 * one containing just zones from the node the zonelist belongs to.
 *
 * For the case of non-NUMA systems the NODE_DATA() gets optimized to
 * &contig_page_data at compile-time.
 */
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
        /* gfp_zonelist() picks which of the node's zonelists to hand out. */
        return &NODE_DATA(nid)->node_zonelists[gfp_zonelist(flags)];
}

/*
 * Empty default hooks, used unless HAVE_ARCH_FREE_PAGE or
 * HAVE_ARCH_ALLOC_PAGE respectively is defined.
 */
#ifndef HAVE_ARCH_FREE_PAGE
static inline void arch_free_page(struct page *page, int order) { }
#endif
#ifndef HAVE_ARCH_ALLOC_PAGE
static inline void arch_alloc_page(struct page *page, int order) { }
#endif

struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
                nodemask_t *nodemask);
#define __alloc_pages(...)                        alloc_hooks(__alloc_pages_noprof(__VA_ARGS__))

struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
                nodemask_t *nodemask);
#define __folio_alloc(...)                        alloc_hooks(__folio_alloc_noprof(__VA_ARGS__))

unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
                                nodemask_t *nodemask, int nr_pages,
                                struct list_head *page_list,
                                struct page **page_array);
#define __alloc_pages_bulk(...)                        alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__))

unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
                                unsigned long nr_pages,
                                struct page **page_array);
#define  alloc_pages_bulk_array_mempolicy(...)                                \
        alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__))

/* Bulk allocate order-0 pages */
#define alloc_pages_bulk_list(_gfp, _nr_pages, _list)                        \
        __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _list, NULL)

#define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array)                \
        __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, NULL, _page_array)

/*
 * Bulk-allocate into @page_array with @nid as the preferred node.
 * NUMA_NO_NODE is replaced by numa_mem_id().
 */
static inline unsigned long
alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages,
                                   struct page **page_array)
{
        const int preferred = (nid == NUMA_NO_NODE) ? numa_mem_id() : nid;

        return alloc_pages_bulk_noprof(gfp, preferred, NULL, nr_pages, NULL,
                                       page_array);
}

#define alloc_pages_bulk_array_node(...)                                \
        alloc_hooks(alloc_pages_bulk_array_node_noprof(__VA_ARGS__))

/*
 * Print a warning and a stack dump when @gfp_mask carries both
 * __GFP_THISNODE and __GFP_NOWARN and @this_node is not online.
 */
static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask)
{
        const gfp_t required = __GFP_THISNODE | __GFP_NOWARN;

        if ((gfp_mask & required) == required && !node_online(this_node)) {
                pr_warn("%pGg allocation from offline node %d\n", &gfp_mask, this_node);
                dump_stack();
        }
}

/*
 * Allocate pages with @nid as the preferred node.  The caller must pass a
 * valid, online node: an out-of-range @nid trips VM_BUG_ON() and an offline
 * one may be reported by warn_if_node_offline().  For a more general
 * interface, see alloc_pages_node().
 */
static inline struct page *
__alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order)
{
        struct page *page;

        VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
        warn_if_node_offline(nid, gfp_mask);

        page = __alloc_pages_noprof(gfp_mask, order, nid, NULL);
        return page;
}

#define  __alloc_pages_node(...)                alloc_hooks(__alloc_pages_node_noprof(__VA_ARGS__))

/*
 * Folio counterpart of __alloc_pages_node_noprof(): @nid must be a valid
 * node index and is used as the preferred node.
 */
static inline
struct folio *__folio_alloc_node_noprof(gfp_t gfp, unsigned int order, int nid)
{
        struct folio *folio;

        VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
        warn_if_node_offline(nid, gfp);

        folio = __folio_alloc_noprof(gfp, order, nid, NULL);
        return folio;
}

#define  __folio_alloc_node(...)                alloc_hooks(__folio_alloc_node_noprof(__VA_ARGS__))

/*
 * Allocate pages with @nid as the preferred node.  NUMA_NO_NODE selects
 * numa_mem_id() instead; any other value must name a valid, online node.
 */
static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask,
                                                   unsigned int order)
{
        const int preferred = (nid == NUMA_NO_NODE) ? numa_mem_id() : nid;

        return __alloc_pages_node_noprof(preferred, gfp_mask, order);
}

#define  alloc_pages_node(...)                        alloc_hooks(alloc_pages_node_noprof(__VA_ARGS__))

#ifdef CONFIG_NUMA
struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order);
struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
                struct mempolicy *mpol, pgoff_t ilx, int nid);
struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order);
struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
                unsigned long addr, bool hugepage);
#else
/* !CONFIG_NUMA: allocate with numa_node_id() as the preferred node. */
static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order)
{
        const int nid = numa_node_id();

        return alloc_pages_node_noprof(nid, gfp_mask, order);
}
/* !CONFIG_NUMA: @mpol, @ilx and @nid are ignored; same as alloc_pages_noprof(). */
static inline struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
                struct mempolicy *mpol, pgoff_t ilx, int nid)
{
        struct page *page = alloc_pages_noprof(gfp, order);

        return page;
}
/*
 * !CONFIG_NUMA: allocate a folio with numa_node_id() as the preferred node.
 *
 * This is the _noprof variant, so it must call the _noprof helper directly.
 * The folio_alloc() wrapper already applies alloc_hooks(); going through the
 * __folio_alloc_node() macro here would apply alloc_hooks() a second time
 * from inside this header.
 */
static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
        return __folio_alloc_node_noprof(gfp, order, numa_node_id());
}
#define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage)                \
        folio_alloc_noprof(gfp, order)
#endif

#define alloc_pages(...)                        alloc_hooks(alloc_pages_noprof(__VA_ARGS__))
#define alloc_pages_mpol(...)                        alloc_hooks(alloc_pages_mpol_noprof(__VA_ARGS__))
#define folio_alloc(...)                        alloc_hooks(folio_alloc_noprof(__VA_ARGS__))
#define vma_alloc_folio(...)                        alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__))

#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)

/*
 * Order-0 wrapper around vma_alloc_folio_noprof() that returns the address
 * of the folio's embedded page member.
 */
static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
                struct vm_area_struct *vma, unsigned long addr)
{
        return &vma_alloc_folio_noprof(gfp, 0, vma, addr, false)->page;
}
#define alloc_page_vma(...)                        alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))

extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
#define __get_free_pages(...)                        alloc_hooks(get_free_pages_noprof(__VA_ARGS__))

extern unsigned long get_zeroed_page_noprof(gfp_t gfp_mask);
#define get_zeroed_page(...)                        alloc_hooks(get_zeroed_page_noprof(__VA_ARGS__))

void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) __alloc_size(1);
#define alloc_pages_exact(...)                        alloc_hooks(alloc_pages_exact_noprof(__VA_ARGS__))

void free_pages_exact(void *virt, size_t size);

__meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2);
#define alloc_pages_exact_nid(...)                                        \
        alloc_hooks(alloc_pages_exact_nid_noprof(__VA_ARGS__))

#define __get_free_page(gfp_mask)                                        \
        __get_free_pages((gfp_mask), 0)

#define __get_dma_pages(gfp_mask, order)                                \
        __get_free_pages((gfp_mask) | GFP_DMA, (order))

extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);

struct page_frag_cache;
void page_frag_cache_drain(struct page_frag_cache *nc);
extern void __page_frag_cache_drain(struct page *page, unsigned int count);
void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
                              gfp_t gfp_mask, unsigned int align_mask);

/*
 * Allocate a fragment with the requested alignment.  @align is expected to
 * be a power of two (warned about once otherwise); it is passed on in mask
 * form as -align.
 */
static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
                                          unsigned int fragsz, gfp_t gfp_mask,
                                          unsigned int align)
{
        const unsigned int align_mask = -align;

        WARN_ON_ONCE(!is_power_of_2(align));

        return __page_frag_alloc_align(nc, fragsz, gfp_mask, align_mask);
}

/* Allocate a fragment using an all-ones alignment mask. */
static inline void *page_frag_alloc(struct page_frag_cache *nc,
                             unsigned int fragsz, gfp_t gfp_mask)
{
        const unsigned int all_ones = ~0u;

        return __page_frag_alloc_align(nc, fragsz, gfp_mask, all_ones);
}

extern void page_frag_free(void *addr);

#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr), 0)

void page_alloc_init_cpuhp(void);
int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);

void page_alloc_init_late(void);
void setup_pcp_cacheinfo(unsigned int cpu);

/*
 * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
 * GFP flags are used before interrupts are enabled. Once interrupts are
 * enabled, it is set to __GFP_BITS_MASK while the system is running. During
 * hibernation, it is used by PM to avoid I/O during memory allocation while
 * devices are suspended.
 */
extern gfp_t gfp_allowed_mask;

/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);

/* True only when both __GFP_IO and __GFP_FS are set in @gfp. */
static inline bool gfp_has_io_fs(gfp_t gfp)
{
        const gfp_t io_fs = __GFP_IO | __GFP_FS;

        return (gfp & io_fs) == io_fs;
}

/*
 * Check if the gfp flags allow compaction.  Compaction needs both
 * CONFIG_COMPACTION and __GFP_IO: GFP_NOIO is a really tricky context
 * because the migration might require IO.
 */
static inline bool gfp_compaction_allowed(gfp_t gfp_mask)
{
        if (!IS_ENABLED(CONFIG_COMPACTION))
                return false;

        return !!(gfp_mask & __GFP_IO);
}

extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);

#ifdef CONFIG_CONTIG_ALLOC
/* The below functions must be run on a range from a single zone. */
extern int alloc_contig_range_noprof(unsigned long start, unsigned long end,
                              unsigned migratetype, gfp_t gfp_mask);
#define alloc_contig_range(...)                        alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__))

extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
                                              int nid, nodemask_t *nodemask);
#define alloc_contig_pages(...)                        alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__))

#endif
void free_contig_range(unsigned long pfn, unsigned long nr_pages);

#endif /* __LINUX_GFP_H */












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Cryptographic API for algorithms (i.e., low-level API).
 *
 * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/algapi.h>
#include <crypto/internal/simd.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/fips.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/workqueue.h>

#include "internal.h"

static LIST_HEAD(crypto_template_list);

#ifdef CONFIG_CRYPTO_MANAGER_EXTRA_TESTS
DEFINE_PER_CPU(bool, crypto_simd_disabled_for_test);
EXPORT_PER_CPU_SYMBOL_GPL(crypto_simd_disabled_for_test);
#endif

/*
 * In FIPS mode, panic if @mod is a module whose signature check did not
 * pass.  Nothing is checked when FIPS is off or @mod is NULL.
 */
static inline void crypto_check_module_sig(struct module *mod)
{
        if (!fips_enabled || !mod)
                return;

        if (!module_sig_ok(mod))
                panic("Module %s signature verification failed in FIPS mode\n",
                      module_name(mod));
}

static int crypto_check_alg(struct crypto_alg *alg)
{
        crypto_check_module_sig(alg->cra_module);

        if (!alg->cra_name[0] || !alg->cra_driver_name[0])
                return -EINVAL;

        if (alg->cra_alignmask & (alg->cra_alignmask + 1))
                return -EINVAL;

        /* General maximums for all algs. */
        if (alg->cra_alignmask > MAX_ALGAPI_ALIGNMASK)
                return -EINVAL;

        if (alg->cra_blocksize > MAX_ALGAPI_BLOCKSIZE)
                return -EINVAL;

        /* Lower maximums for specific alg types. */
        if (!alg->cra_type && (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) ==
                               CRYPTO_ALG_TYPE_CIPHER) {
                if (alg->cra_alignmask > MAX_CIPHER_ALIGNMASK)
                        return -EINVAL;

                if (alg->cra_blocksize > MAX_CIPHER_BLOCKSIZE)
                        return -EINVAL;
        }

        if (alg->cra_priority < 0)
                return -EINVAL;

        refcount_set(&alg->cra_refcnt, 1);

        return 0;
}

static void crypto_free_instance(struct crypto_instance *inst)
{
        inst->alg.cra_type->free(inst);
}

/*
 * Work handler: free the instance embedding @w, then drop a reference on
 * its template.
 */
static void crypto_destroy_instance_workfn(struct work_struct *w)
{
        struct crypto_instance *instance;
        struct crypto_template *owner;

        instance = container_of(w, struct crypto_instance, free_work);
        /* Fetch the template before the instance goes to the free callback. */
        owner = instance->tmpl;

        crypto_free_instance(instance);
        crypto_tmpl_put(owner);
}

/*
 * Destroy hook for instance algorithms: the free itself is not done here
 * but queued as a work item running crypto_destroy_instance_workfn().
 */
static void crypto_destroy_instance(struct crypto_alg *alg)
{
        struct crypto_instance *owner;

        owner = container_of(alg, struct crypto_instance, alg);

        INIT_WORK(&owner->free_work, crypto_destroy_instance_workfn);
        schedule_work(&owner->free_work);
}

/*
 * This function adds a spawn to the list secondary_spawns which
 * will be used at the end of crypto_remove_spawns to unregister
 * instances, unless the spawn happens to be one that is depended
 * on by the new algorithm (nalg in crypto_remove_spawns).
 *
 * This function is also responsible for resurrecting any algorithms
 * in the dependency chain of nalg by unsetting n->dead.
 *
 * Returns NULL when stack is empty, top when removing the first entry
 * leaves stack empty, and otherwise the cra_users list of the instance
 * owning the new first entry of stack.  The alg argument is not used.
 */
static struct list_head *crypto_more_spawns(struct crypto_alg *alg,
                                            struct list_head *stack,
                                            struct list_head *top,
                                            struct list_head *secondary_spawns)
{
        struct crypto_spawn *spawn, *n;

        spawn = list_first_entry_or_null(stack, struct crypto_spawn, list);
        if (!spawn)
                return NULL;

        /*
         * spawn is the first entry, so its predecessor is the list head:
         * &n->list aliases stack and n is not a real spawn at this point.
         */
        n = list_prev_entry(spawn, list);
        list_move(&spawn->list, secondary_spawns);

        /* The head being its own last element means stack is now empty. */
        if (list_is_last(&n->list, stack))
                return top;

        /* Step from the head to the new first entry of stack. */
        n = list_next_entry(n, list);
        /* A spawn that is not dead clears the dead mark of the next entry. */
        if (!spawn->dead)
                n->dead = false;

        return &n->inst->alg.cra_users;
}

static void crypto_remove_instance(struct crypto_instance *inst,
                                   struct list_head *list)
{
        struct crypto_template *tmpl = inst->tmpl;

        if (crypto_is_dead(&inst->alg))
                return;

        inst->alg.cra_flags |= CRYPTO_ALG_DEAD;

        if (!tmpl || !crypto_tmpl_get(tmpl))
                return;

        list_move(&inst->alg.cra_list, list);
        hlist_del(&inst->list);
        inst->alg.cra_destroy = crypto_destroy_instance;

        BUG_ON(!list_empty(&inst->alg.cra_users));
}

/*
 * Given an algorithm alg, remove all algorithms that depend on it
 * through spawns.  If nalg is not null, then exempt any algorithm
 * that nalg depends on.  This is useful when nalg itself
 * depends on alg.
 */
void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list,
                          struct crypto_alg *nalg)
{
        /* Flags of nalg when one is given, otherwise those of alg itself. */
        u32 new_type = (nalg ?: alg)->cra_flags;
        struct crypto_spawn *spawn, *n;
        LIST_HEAD(secondary_spawns);
        struct list_head *spawns;
        LIST_HEAD(stack);
        LIST_HEAD(top);

        /*
         * Collect the direct users of alg on top.  A spawn stays on
         * alg->cra_users when new_type differs from the flags of
         * spawn->alg in a bit selected by spawn->mask.
         */
        spawns = &alg->cra_users;
        list_for_each_entry_safe(spawn, n, spawns, list) {
                if ((spawn->alg->cra_flags ^ new_type) & spawn->mask)
                        continue;

                list_move(&spawn->list, &top);
        }

        /*
         * Perform a depth-first walk starting from alg through
         * the cra_users tree.  The list stack records the path
         * from alg to the current spawn.
         */
        spawns = &top;
        do {
                while (!list_empty(spawns)) {
                        struct crypto_instance *inst;

                        spawn = list_first_entry(spawns, struct crypto_spawn,
                                                 list);
                        inst = spawn->inst;

                        list_move(&spawn->list, &stack);
                        /*
                         * Only a registered spawn whose instance is nalg
                         * itself starts out alive; everything else is
                         * marked dead here.
                         */
                        spawn->dead = !spawn->registered || &inst->alg != nalg;

                        /* Unregistered spawns are not descended into. */
                        if (!spawn->registered)
                                break;

                        BUG_ON(&inst->alg == alg);

                        /* Reached nalg: do not walk into its own users. */
                        if (&inst->alg == nalg)
                                break;

                        spawns = &inst->alg.cra_users;

                        /*
                         * Even if spawn->registered is true, the
                         * instance itself may still be unregistered.
                         * This is because it may have failed during
                         * registration.  Therefore we still need to
                         * make the following test.
                         *
                         * We may encounter an unregistered instance here, since
                         * an instance's spawns are set up prior to the instance
                         * being registered.  An unregistered instance will have
                         * NULL ->cra_users.next, since ->cra_users isn't
                         * properly initialized until registration.  But an
                         * unregistered instance cannot have any users, so treat
                         * it the same as ->cra_users being empty.
                         */
                        if (spawns->next == NULL)
                                break;
                }
        } while ((spawns = crypto_more_spawns(alg, &stack, &top,
                                              &secondary_spawns)));

        /*
         * Remove all instances that are marked as dead.  Also
         * complete the resurrection of the others by moving them
         * back to the cra_users list.
         */
        list_for_each_entry_safe(spawn, n, &secondary_spawns, list) {
                if (!spawn->dead)
                        list_move(&spawn->list, &spawn->alg->cra_users);
                else if (spawn->registered)
                        crypto_remove_instance(spawn->inst, list);
        }
}
EXPORT_SYMBOL_GPL(crypto_remove_spawns);

/*
 * Finish registering @alg by walking crypto_alg_list.
 *
 * Larvals that asked for @alg by name or by driver name, have no adult
 * yet and are flag-compatible under their mask receive @alg as adult when
 * @fulfill_requests is set and a module reference can be taken, and
 * ERR_PTR(-EAGAIN) otherwise.  For every other live algorithm with the
 * same cra_name, its spawns are removed unless it is a different driver
 * with a strictly higher priority.  CRYPTO_MSG_ALG_LOADED is sent for
 * @alg at the end.
 */
static void crypto_alg_finish_registration(struct crypto_alg *alg,
                                           bool fulfill_requests,
                                           struct list_head *algs_to_put)
{
        struct crypto_alg *cur;

        list_for_each_entry(cur, &crypto_alg_list, cra_list) {
                struct crypto_larval *waiter;

                if (cur == alg || crypto_is_moribund(cur))
                        continue;

                if (!crypto_is_larval(cur)) {
                        /* Only implementations of the same algorithm matter. */
                        if (strcmp(alg->cra_name, cur->cra_name))
                                continue;

                        /* A different, higher-priority driver keeps its users. */
                        if (strcmp(alg->cra_driver_name, cur->cra_driver_name) &&
                            cur->cra_priority > alg->cra_priority)
                                continue;

                        crypto_remove_spawns(cur, algs_to_put, alg);
                        continue;
                }

                waiter = (void *)cur;

                /*
                 * The larval must have requested either our generic
                 * name or our specific (driver) name.
                 */
                if (strcmp(alg->cra_name, cur->cra_name) &&
                    strcmp(alg->cra_driver_name, cur->cra_name))
                        continue;

                /* Already answered, or incompatible flags under the mask. */
                if (waiter->adult ||
                    ((cur->cra_flags ^ alg->cra_flags) & waiter->mask))
                        continue;

                if (!fulfill_requests || !crypto_mod_get(alg))
                        waiter->adult = ERR_PTR(-EAGAIN);
                else
                        waiter->adult = alg;
        }

        crypto_notify(CRYPTO_MSG_ALG_LOADED, alg);
}

/*
 * Allocate the larval that represents the pending self-test of @alg.
 *
 * Returns NULL when no self-test is needed, an ERR_PTR when allocating
 * the larval fails or no module reference on @alg can be taken
 * (-ENOENT), and otherwise a larval whose adult is @alg and which copies
 * @alg's driver name and priority.
 */
static struct crypto_larval *crypto_alloc_test_larval(struct crypto_alg *alg)
{
        struct crypto_larval *test;

        /* No self-test needed */
        if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER))
                return NULL;
        if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS))
                return NULL;
        if (alg->cra_flags & CRYPTO_ALG_INTERNAL)
                return NULL;

        test = crypto_larval_alloc(alg->cra_name,
                                   alg->cra_flags | CRYPTO_ALG_TESTED, 0);
        if (IS_ERR(test))
                return test;

        test->adult = crypto_mod_get(alg);
        if (!test->adult) {
                kfree(test);
                return ERR_PTR(-ENOENT);
        }

        test->alg.cra_priority = alg->cra_priority;
        memcpy(test->alg.cra_driver_name, alg->cra_driver_name,
               CRYPTO_MAX_ALG_NAME);
        refcount_set(&test->alg.cra_refcnt, 1);

        return test;
}

static struct crypto_larval *
__crypto_register_alg(struct crypto_alg *alg, struct list_head *algs_to_put)
{
        struct crypto_alg *q;
        struct crypto_larval *larval;
        int ret = -EAGAIN;

        if (crypto_is_dead(alg))
                goto err;

        INIT_LIST_HEAD(&alg->cra_users);

        ret = -EEXIST;

        list_for_each_entry(q, &crypto_alg_list, cra_list) {
                if (q == alg)
                        goto err;

                if (crypto_is_moribund(q))
                        continue;

                if (crypto_is_larval(q)) {
                        if (!strcmp(alg->cra_driver_name, q->cra_driver_name))
                                goto err;
                        continue;
                }

                if (!strcmp(q->cra_driver_name, alg->cra_name) ||
                    !strcmp(q->cra_driver_name, alg->cra_driver_name) ||
                    !strcmp(q->cra_name, alg->cra_driver_name))
                        goto err;
        }

        larval = crypto_alloc_test_larval(alg);
        if (IS_ERR(larval))
                goto out;

        list_add(&alg->cra_list, &crypto_alg_list);

        if (larval) {
                /* No cheating! */
                alg->cra_flags &= ~CRYPTO_ALG_TESTED;

                list_add(&larval->alg.cra_list, &crypto_alg_list);
        } else {
                alg->cra_flags |= CRYPTO_ALG_TESTED;
                crypto_alg_finish_registration(alg, true, algs_to_put);
        }

out:
        return larval;

err:
        larval = ERR_PTR(ret);
        goto out;
}

/*
 * Record the self-test result @err for the algorithm whose driver name is
 * @name.
 *
 * The matching live larval on crypto_alg_list is marked dead and its
 * completion is signalled.  If the tested algorithm is still on a list,
 * @err == 0 marks it CRYPTO_ALG_TESTED, @err == -ECANCELED marks it
 * tested and additionally sets CRYPTO_ALG_FIPS_INTERNAL, and any other
 * error leaves the flags alone.  When no larval matches, an error is
 * logged and nothing else happens.
 */
void crypto_alg_tested(const char *name, int err)
{
        struct crypto_larval *test;
        struct crypto_alg *alg;
        struct crypto_alg *q;
        LIST_HEAD(list);
        bool best;

        down_write(&crypto_alg_sem);
        /* Find the live larval whose driver name matches @name. */
        list_for_each_entry(q, &crypto_alg_list, cra_list) {
                if (crypto_is_moribund(q) || !crypto_is_larval(q))
                        continue;

                test = (struct crypto_larval *)q;

                if (!strcmp(q->cra_driver_name, name))
                        goto found;
        }

        pr_err("alg: Unexpected test result for %s: %d\n", name, err);
        goto unlock;

found:
        /* q still points at the matching larval here. */
        q->cra_flags |= CRYPTO_ALG_DEAD;
        alg = test->adult;

        /* The algorithm is no longer on any list: only wake the waiters. */
        if (list_empty(&alg->cra_list))
                goto complete;

        if (err == -ECANCELED)
                alg->cra_flags |= CRYPTO_ALG_FIPS_INTERNAL;
        else if (err)
                goto complete;
        else
                alg->cra_flags &= ~CRYPTO_ALG_FIPS_INTERNAL;

        alg->cra_flags |= CRYPTO_ALG_TESTED;

        /*
         * If a higher-priority implementation of the same algorithm is
         * currently being tested, then don't fulfill request larvals.
         */
        best = true;
        list_for_each_entry(q, &crypto_alg_list, cra_list) {
                if (crypto_is_moribund(q) || !crypto_is_larval(q))
                        continue;

                if (strcmp(alg->cra_name, q->cra_name))
                        continue;

                if (q->cra_priority > alg->cra_priority) {
                        best = false;
                        break;
                }
        }

        crypto_alg_finish_registration(alg, best, &list);

complete:
        complete_all(&test->completion);

unlock:
        up_write(&crypto_alg_sem);

        /* Drop whatever was collected on list, outside the semaphore. */
        crypto_remove_final(&list);
}
EXPORT_SYMBOL_GPL(crypto_alg_tested);

/*
 * Unlink every algorithm queued on @list and drop one reference on each
 * via crypto_alg_put().
 */
void crypto_remove_final(struct list_head *list)
{
        struct crypto_alg *cur, *next;

        list_for_each_entry_safe(cur, next, list, cra_list) {
                list_del_init(&cur->cra_list);
                crypto_alg_put(cur);
        }
}
EXPORT_SYMBOL_GPL(crypto_remove_final);

/*
 * Register @alg.
 *
 * Clears CRYPTO_ALG_DEAD, validates the algorithm with
 * crypto_check_alg() and adds it under crypto_alg_sem.  When a test
 * larval was created and crypto_boot_test_finished() reports true, the
 * self-test is waited for before returning.
 *
 * Returns 0 on success or a negative errno.
 */
int crypto_register_alg(struct crypto_alg *alg)
{
        LIST_HEAD(algs_to_put);
        struct crypto_larval *larval;
        bool wait_for_test = false;
        int err;

        alg->cra_flags &= ~CRYPTO_ALG_DEAD;

        err = crypto_check_alg(alg);
        if (err)
                return err;

        down_write(&crypto_alg_sem);
        larval = __crypto_register_alg(alg, &algs_to_put);
        if (!IS_ERR_OR_NULL(larval)) {
                wait_for_test = crypto_boot_test_finished();
                larval->test_started = wait_for_test;
        }
        up_write(&crypto_alg_sem);

        if (IS_ERR(larval))
                return PTR_ERR(larval);

        if (wait_for_test)
                crypto_wait_for_test(larval);

        crypto_remove_final(&algs_to_put);
        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_alg);

/*
 * Take @alg off its list, mark it dead and remove the spawns that depend
 * on it, collecting them on @list.
 *
 * Returns -ENOENT when @alg is not on any list, 0 otherwise.
 */
static int crypto_remove_alg(struct crypto_alg *alg, struct list_head *list)
{
        /* An algorithm that is on no list has nothing to remove. */
        if (unlikely(list_empty(&alg->cra_list)))
                return -ENOENT;

        list_del_init(&alg->cra_list);
        alg->cra_flags |= CRYPTO_ALG_DEAD;

        crypto_remove_spawns(alg, list, NULL);
        return 0;
}

/*
 * Unregister @alg.
 *
 * Warns and returns early when @alg was not registered or when its
 * reference count is not exactly 1 after removal.  Otherwise the
 * optional cra_destroy hook is called and the dependants collected
 * during removal are released.
 */
void crypto_unregister_alg(struct crypto_alg *alg)
{
        LIST_HEAD(list);
        int err;

        down_write(&crypto_alg_sem);
        err = crypto_remove_alg(alg, &list);
        up_write(&crypto_alg_sem);

        if (WARN(err, "Algorithm %s is not registered", alg->cra_driver_name))
                return;

        /* Somebody still holds a reference: do not tear anything down. */
        if (WARN_ON(refcount_read(&alg->cra_refcnt) != 1))
                return;

        if (alg->cra_destroy)
                alg->cra_destroy(alg);

        crypto_remove_final(&list);
}
EXPORT_SYMBOL_GPL(crypto_unregister_alg);

/*
 * Register the first @count entries of @algs in order.
 *
 * On the first failure every entry registered so far is unregistered
 * again in reverse order and the failing error code is returned.
 * Returns 0 when all registrations succeed.
 */
int crypto_register_algs(struct crypto_alg *algs, int count)
{
        int i, ret;

        for (i = 0; i < count; i++) {
                ret = crypto_register_alg(&algs[i]);
                if (!ret)
                        continue;

                /* Roll back algs[i - 1] down to algs[0]. */
                while (--i >= 0)
                        crypto_unregister_alg(&algs[i]);

                return ret;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_algs);

/* Unregister the first @count entries of @algs, in ascending index order. */
void crypto_unregister_algs(struct crypto_alg *algs, int count)
{
        int i = 0;

        while (i < count) {
                crypto_unregister_alg(&algs[i]);
                i++;
        }
}
EXPORT_SYMBOL_GPL(crypto_unregister_algs);

/*
 * Add @tmpl to crypto_template_list.
 *
 * Returns -EEXIST when this very template is already on the list and 0
 * once it has been added.
 */
int crypto_register_template(struct crypto_template *tmpl)
{
        struct crypto_template *cur;
        int err = 0;

        down_write(&crypto_alg_sem);

        crypto_check_module_sig(tmpl->module);

        list_for_each_entry(cur, &crypto_template_list, list) {
                if (cur == tmpl) {
                        err = -EEXIST;
                        break;
                }
        }

        if (!err)
                list_add(&tmpl->list, &crypto_template_list);

        up_write(&crypto_alg_sem);
        return err;
}
EXPORT_SYMBOL_GPL(crypto_register_template);

/*
 * Register the first @count entries of @tmpls in order.
 *
 * On the first failure the templates registered so far are unregistered
 * in reverse order and the failing error code is returned.  Returns 0
 * when all registrations succeed.
 */
int crypto_register_templates(struct crypto_template *tmpls, int count)
{
        int i, err;

        for (i = 0; i < count; i++) {
                err = crypto_register_template(&tmpls[i]);
                if (!err)
                        continue;

                /* Roll back tmpls[i - 1] down to tmpls[0]. */
                while (--i >= 0)
                        crypto_unregister_template(&tmpls[i]);

                return err;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_templates);

void crypto_unregister_template(struct crypto_template *tmpl)
{
        struct crypto_instance *inst;
        struct hlist_node *n;
        struct hlist_head *list;
        LIST_HEAD(users);

        down_write(&crypto_alg_sem);

        BUG_ON(list_empty(&tmpl->list));
        list_del_init(&tmpl->list);

        list = &tmpl->instances;
        hlist_for_each_entry(inst, list, list) {
                int err = crypto_remove_alg(&inst->alg, &users);

                BUG_ON(err);
        }

        up_write(&crypto_alg_sem);

        hlist_for_each_entry_safe(inst, n, list, list) {
                BUG_ON(refcount_read(&inst->alg.cra_refcnt) != 1);
                crypto_free_instance(inst);
        }
        crypto_remove_final(&users);
}
EXPORT_SYMBOL_GPL(crypto_unregister_template);

/* Unregister the first @count entries of @tmpls, highest index first. */
void crypto_unregister_templates(struct crypto_template *tmpls, int count)
{
        int i;

        for (i = count; i > 0; i--)
                crypto_unregister_template(&tmpls[i - 1]);
}
EXPORT_SYMBOL_GPL(crypto_unregister_templates);

/*
 * Search crypto_template_list for a template called @name and take a
 * reference on it with crypto_tmpl_get().
 *
 * Returns the first matching template for which the reference could be
 * taken, or NULL when there is none.
 */
static struct crypto_template *__crypto_lookup_template(const char *name)
{
        struct crypto_template *cur;
        struct crypto_template *found = NULL;

        down_read(&crypto_alg_sem);
        list_for_each_entry(cur, &crypto_template_list, list) {
                if (strcmp(cur->name, name) == 0) {
                        /* Keep looking when no reference can be taken. */
                        if (unlikely(!crypto_tmpl_get(cur)))
                                continue;

                        found = cur;
                        break;
                }
        }
        up_read(&crypto_alg_sem);

        return found;
}

/*
 * Look up the template called @name with __crypto_lookup_template().
 *
 * The lookup expression is handed to try_then_request_module() together
 * with the module name pattern "crypto-%s".
 * NOTE(review): presumably the macro requests that module and repeats the
 * lookup when the first attempt returns NULL - confirm against the macro
 * definition.  The lookup must stay inline as the macro argument.
 */
struct crypto_template *crypto_lookup_template(const char *name)
{
        return try_then_request_module(__crypto_lookup_template(name),
                                       "crypto-%s", name);
}
EXPORT_SYMBOL_GPL(crypto_lookup_template);

/*
 * Register the instance @inst as an algorithm belonging to template
 * @tmpl.
 *
 * Every spawn of the instance is bound to it and marked registered, and
 * the module reference on the spawn's algorithm is dropped.  The
 * instance gets CRYPTO_ALG_FIPS_INTERNAL if any spawned algorithm
 * carries that flag.  When a test larval is created, the self-test is
 * waited for before returning.
 *
 * Returns 0 on success, the crypto_check_alg() error, -EAGAIN when a
 * spawn is already dead, or the error from __crypto_register_alg().
 */
int crypto_register_instance(struct crypto_template *tmpl,
                             struct crypto_instance *inst)
{
        struct crypto_larval *larval = ERR_PTR(-EAGAIN);
        struct crypto_spawn *spawn, *next;
        u32 spawn_flags = 0;
        LIST_HEAD(algs_to_put);
        int err;

        err = crypto_check_alg(&inst->alg);
        if (err)
                return err;

        inst->alg.cra_flags |= CRYPTO_ALG_INSTANCE;
        inst->alg.cra_module = tmpl->module;

        down_write(&crypto_alg_sem);

        for (spawn = inst->spawns; spawn; spawn = next) {
                /* A dead spawn aborts the registration with -EAGAIN. */
                if (spawn->dead)
                        goto unlock;

                next = spawn->next;

                spawn->registered = true;
                spawn->inst = inst;

                spawn_flags |= spawn->alg->cra_flags;

                crypto_mod_put(spawn->alg);
        }

        inst->alg.cra_flags |= (spawn_flags & CRYPTO_ALG_FIPS_INTERNAL);

        larval = __crypto_register_alg(&inst->alg, &algs_to_put);
        if (IS_ERR(larval))
                goto unlock;

        if (larval)
                larval->test_started = true;

        inst->tmpl = tmpl;
        hlist_add_head(&inst->list, &tmpl->instances);

unlock:
        up_write(&crypto_alg_sem);

        if (IS_ERR(larval))
                return PTR_ERR(larval);

        if (larval)
                crypto_wait_for_test(larval);

        crypto_remove_final(&algs_to_put);
        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_instance);

void crypto_unregister_instance(struct crypto_instance *inst)
{
        LIST_HEAD(list);

        down_write(&crypto_alg_sem);

        crypto_remove_spawns(&inst->alg, &list, NULL);
        crypto_remove_instance(inst, &list);

        up_write(&crypto_alg_sem);

        crypto_remove_final(&list);
}
EXPORT_SYMBOL_GPL(crypto_unregister_instance);

/*
 * Look up the algorithm @name and attach @spawn to it on behalf of
 * @inst.
 *
 * On success the spawn is linked into the algorithm's cra_users list and
 * pushed onto the front of @inst's spawn chain, and the instance
 * inherits the algorithm's CRYPTO_ALG_INHERITED_FLAGS bits.
 *
 * Returns 0 on success, -EINVAL when @inst is NULL, the error encoded in
 * @name or returned by crypto_find_alg(), or -EAGAIN when the algorithm
 * is moribund (its module reference is dropped again in that case).
 */
int crypto_grab_spawn(struct crypto_spawn *spawn, struct crypto_instance *inst,
                      const char *name, u32 type, u32 mask)
{
        struct crypto_alg *alg;

        if (WARN_ON_ONCE(inst == NULL))
                return -EINVAL;

        /* Allow the result of crypto_attr_alg_name() to be passed directly */
        if (IS_ERR(name))
                return PTR_ERR(name);

        alg = crypto_find_alg(name, spawn->frontend,
                              type | CRYPTO_ALG_FIPS_INTERNAL, mask);
        if (IS_ERR(alg))
                return PTR_ERR(alg);

        down_write(&crypto_alg_sem);
        if (crypto_is_moribund(alg)) {
                up_write(&crypto_alg_sem);
                crypto_mod_put(alg);
                return -EAGAIN;
        }

        spawn->alg = alg;
        spawn->mask = mask;
        list_add(&spawn->list, &alg->cra_users);

        spawn->next = inst->spawns;
        inst->spawns = spawn;
        inst->alg.cra_flags |=
                (alg->cra_flags & CRYPTO_ALG_INHERITED_FLAGS);
        up_write(&crypto_alg_sem);

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_grab_spawn);

/*
 * Detach @spawn from its algorithm.
 *
 * A spawn without an algorithm is ignored.  A spawn that is not marked
 * dead is unlinked under crypto_alg_sem.  The module reference on the
 * algorithm is dropped only when the spawn was never registered.
 */
void crypto_drop_spawn(struct crypto_spawn *spawn)
{
        /* not yet initialized? */
        if (!spawn->alg)
                return;

        down_write(&crypto_alg_sem);
        if (!spawn->dead)
                list_del(&spawn->list);
        up_write(&crypto_alg_sem);

        if (spawn->registered)
                return;

        crypto_mod_put(spawn->alg);
}
EXPORT_SYMBOL_GPL(crypto_drop_spawn);

/*
 * Get a module reference on the algorithm behind @spawn.
 *
 * Returns the algorithm on success and ERR_PTR(-EAGAIN) when the spawn
 * is dead or the module reference cannot be taken.  In the latter case
 * the algorithm is pinned with crypto_alg_get() and passed to
 * crypto_shoot_alg() after crypto_alg_sem has been released.
 */
static struct crypto_alg *crypto_spawn_alg(struct crypto_spawn *spawn)
{
        struct crypto_alg *result = ERR_PTR(-EAGAIN);
        struct crypto_alg *target;
        bool must_shoot = false;

        down_read(&crypto_alg_sem);
        if (!spawn->dead) {
                result = spawn->alg;
                if (!crypto_mod_get(result)) {
                        target = crypto_alg_get(result);
                        result = ERR_PTR(-EAGAIN);
                        must_shoot = true;
                }
        }
        up_read(&crypto_alg_sem);

        if (!must_shoot)
                return result;

        /* target is only set on the path that also set must_shoot. */
        crypto_shoot_alg(target);
        crypto_alg_put(target);

        return result;
}

/*
 * Allocate a transform for the algorithm behind @spawn.
 *
 * Returns the new transform, the error from crypto_spawn_alg(),
 * ERR_PTR(-EINVAL) when the algorithm's flags do not match @type under
 * @mask, or the error from __crypto_alloc_tfm().  The module reference
 * taken on the algorithm is dropped again on every failure after it was
 * obtained.
 */
struct crypto_tfm *crypto_spawn_tfm(struct crypto_spawn *spawn, u32 type,
                                    u32 mask)
{
        struct crypto_alg *alg;
        struct crypto_tfm *tfm;

        alg = crypto_spawn_alg(spawn);
        if (IS_ERR(alg))
                return ERR_CAST(alg);

        if (unlikely((alg->cra_flags ^ type) & mask))
                tfm = ERR_PTR(-EINVAL);
        else
                tfm = __crypto_alloc_tfm(alg, type, mask);

        if (IS_ERR(tfm))
                crypto_mod_put(alg);

        return tfm;
}
EXPORT_SYMBOL_GPL(crypto_spawn_tfm);

/*
 * Create a transform for the algorithm behind @spawn using the spawn's
 * frontend.
 *
 * Returns the result of crypto_create_tfm(), or the error from
 * crypto_spawn_alg().  When creation fails, the module reference taken
 * on the algorithm is dropped again.
 */
void *crypto_spawn_tfm2(struct crypto_spawn *spawn)
{
        struct crypto_alg *alg;
        struct crypto_tfm *tfm;

        alg = crypto_spawn_alg(spawn);
        if (IS_ERR(alg))
                return ERR_CAST(alg);

        tfm = crypto_create_tfm(alg, spawn->frontend);
        if (IS_ERR(tfm))
                crypto_mod_put(alg);

        return tfm;
}
EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);

/*
 * Register @nb on crypto_chain and return the result of
 * blocking_notifier_chain_register() unchanged.
 */
int crypto_register_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&crypto_chain, nb);
}
EXPORT_SYMBOL_GPL(crypto_register_notifier);

/*
 * Remove @nb from crypto_chain and return the result of
 * blocking_notifier_chain_unregister() unchanged.
 */
int crypto_unregister_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&crypto_chain, nb);
}
EXPORT_SYMBOL_GPL(crypto_unregister_notifier);

/*
 * Return the struct crypto_attr_type carried by the first attribute in
 * @tb.
 *
 * Returns ERR_PTR(-ENOENT) when tb[0] is NULL and ERR_PTR(-EINVAL) when
 * the payload is too small or the attribute is not of type CRYPTOA_TYPE.
 */
struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb)
{
        struct crypto_attr_type *algt;
        struct rtattr *rta = tb[0];

        if (!rta)
                return ERR_PTR(-ENOENT);

        if (RTA_PAYLOAD(rta) < sizeof(*algt) ||
            rta->rta_type != CRYPTOA_TYPE)
                return ERR_PTR(-EINVAL);

        algt = RTA_DATA(rta);
        return algt;
}
EXPORT_SYMBOL_GPL(crypto_get_attr_type);

/**
 * crypto_check_attr_type() - check algorithm type and compute inherited mask
 * @tb: the template parameters
 * @type: the algorithm type the template would be instantiated as
 * @mask_ret: (output) the mask that should be passed to crypto_grab_*()
 *              to restrict the flags of any inner algorithms
 *
 * Validate that the algorithm type the user requested is compatible with the
 * one the template would actually be instantiated as.  E.g., if the user is
 * doing crypto_alloc_shash("cbc(aes)", ...), this would return an error because
 * the "cbc" template creates an "skcipher" algorithm, not an "shash" algorithm.
 *
 * Also compute the mask to use to restrict the flags of any inner algorithms.
 *
 * Return: 0 on success; -errno on failure
 */
int crypto_check_attr_type(struct rtattr **tb, u32 type, u32 *mask_ret)
{
        struct crypto_attr_type *algt = crypto_get_attr_type(tb);

        if (IS_ERR(algt))
                return PTR_ERR(algt);

        /* The requested type must agree with @type on every masked bit. */
        if (((algt->type ^ type) & algt->mask) != 0)
                return -EINVAL;

        *mask_ret = crypto_algt_inherited_mask(algt);

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_check_attr_type);

/*
 * Return the algorithm name stored in the CRYPTOA_ALG attribute @rta.
 *
 * The last byte of the name buffer is overwritten with 0 so that the
 * returned string is always terminated.  Returns ERR_PTR(-ENOENT) when
 * @rta is NULL and ERR_PTR(-EINVAL) when the payload is too small or the
 * attribute has a different type.
 */
const char *crypto_attr_alg_name(struct rtattr *rta)
{
        struct crypto_attr_alg *alga;

        if (!rta)
                return ERR_PTR(-ENOENT);

        if (RTA_PAYLOAD(rta) < sizeof(*alga) ||
            rta->rta_type != CRYPTOA_ALG)
                return ERR_PTR(-EINVAL);

        alga = RTA_DATA(rta);
        alga->name[CRYPTO_MAX_ALG_NAME - 1] = 0;

        return alga->name;
}
EXPORT_SYMBOL_GPL(crypto_attr_alg_name);

/*
 * Build the instance's names as "<name>(<inner>)": cra_name from @alg's
 * cra_name and cra_driver_name from @alg's cra_driver_name.
 *
 * Returns -ENAMETOOLONG when either result does not fit into
 * CRYPTO_MAX_ALG_NAME bytes, 0 otherwise.
 */
int crypto_inst_setname(struct crypto_instance *inst, const char *name,
                        struct crypto_alg *alg)
{
        int len;

        len = snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
                       name, alg->cra_name);
        if (len >= CRYPTO_MAX_ALG_NAME)
                return -ENAMETOOLONG;

        len = snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME,
                       "%s(%s)", name, alg->cra_driver_name);
        if (len >= CRYPTO_MAX_ALG_NAME)
                return -ENAMETOOLONG;

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_inst_setname);

/*
 * Initialise @queue as empty with room for @max_qlen requests.  The
 * backlog pointer starts out at the list head itself, which the enqueue
 * and dequeue paths treat as "no backlog".
 */
void crypto_init_queue(struct crypto_queue *queue, unsigned int max_qlen)
{
        queue->max_qlen = max_qlen;
        queue->qlen = 0;

        INIT_LIST_HEAD(&queue->list);
        queue->backlog = &queue->list;
}
EXPORT_SYMBOL_GPL(crypto_init_queue);

/*
 * Append @request to the tail of @queue.
 *
 * Returns -EINPROGRESS when the queue had room.  When the queue is full,
 * returns -ENOSPC without queueing unless the request carries
 * CRYPTO_TFM_REQ_MAY_BACKLOG; with that flag the request is queued,
 * becomes the first backlog entry if there was none, and -EBUSY is
 * returned.
 */
int crypto_enqueue_request(struct crypto_queue *queue,
                           struct crypto_async_request *request)
{
        int err = -EINPROGRESS;

        if (unlikely(queue->qlen >= queue->max_qlen)) {
                if (!(request->flags & CRYPTO_TFM_REQ_MAY_BACKLOG))
                        return -ENOSPC;

                if (queue->backlog == &queue->list)
                        queue->backlog = &request->list;
                err = -EBUSY;
        }

        list_add_tail(&request->list, &queue->list);
        queue->qlen++;

        return err;
}
EXPORT_SYMBOL_GPL(crypto_enqueue_request);

/*
 * Insert @request at the head of @queue.  When the queue is already at
 * or above max_qlen, the backlog pointer is moved one entry towards the
 * head first.
 */
void crypto_enqueue_request_head(struct crypto_queue *queue,
                                 struct crypto_async_request *request)
{
        if (unlikely(queue->qlen >= queue->max_qlen))
                queue->backlog = queue->backlog->prev;

        list_add(&request->list, &queue->list);
        queue->qlen++;
}
EXPORT_SYMBOL_GPL(crypto_enqueue_request_head);

/*
 * Remove and return the request at the head of @queue, or NULL when the
 * queue length is zero.  An active backlog pointer is advanced by one
 * entry before the head is unlinked.
 */
struct crypto_async_request *crypto_dequeue_request(struct crypto_queue *queue)
{
        struct list_head *head_entry;

        if (unlikely(!queue->qlen))
                return NULL;

        /* Advance the backlog while the head entry is still linked. */
        if (queue->backlog != &queue->list)
                queue->backlog = queue->backlog->next;

        head_entry = queue->list.next;
        list_del(head_entry);
        queue->qlen--;

        return list_entry(head_entry, struct crypto_async_request, list);
}
EXPORT_SYMBOL_GPL(crypto_dequeue_request);

/*
 * Add one to the @size-byte number at @a, working from the last byte
 * towards the first and stopping as soon as a byte does not wrap to 0.
 */
static inline void crypto_inc_byte(u8 *a, unsigned int size)
{
        while (size--) {
                /* A non-zero result means there is no carry to propagate. */
                if (++a[size])
                        break;
        }
}

/*
 * Add one to the @size-byte big-endian number at @a.
 *
 * When unaligned access is efficient or the end of the buffer is
 * suitably aligned, the number is incremented 32 bits at a time from
 * the end, returning as soon as a word does not wrap.  Whatever is left
 * is handled byte by byte by crypto_inc_byte().
 */
void crypto_inc(u8 *a, unsigned int size)
{
        __be32 *b = (__be32 *)(a + size);
        u32 c;

        if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
            IS_ALIGNED((unsigned long)b, __alignof__(*b))) {
                while (size >= 4) {
                        c = be32_to_cpu(*--b) + 1;
                        *b = cpu_to_be32(c);
                        if (likely(c))
                                return;
                        size -= 4;
                }
        }

        crypto_inc_byte(a, size);
}
EXPORT_SYMBOL_GPL(crypto_inc);

/*
 * Return @alg's context size plus the bits of its alignment mask that
 * lie above crypto_tfm_ctx_alignment().
 */
unsigned int crypto_alg_extsize(struct crypto_alg *alg)
{
        unsigned int extra;

        extra = alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1);

        return alg->cra_ctxsize + extra;
}
EXPORT_SYMBOL_GPL(crypto_alg_extsize);

/*
 * Report whether crypto_find_alg() can find an algorithm for @name,
 * @frontend, @type and @mask.
 *
 * Returns 1 when it can (the reference obtained is dropped again) and 0
 * otherwise.
 */
int crypto_type_has_alg(const char *name, const struct crypto_type *frontend,
                        u32 type, u32 mask)
{
        struct crypto_alg *alg = crypto_find_alg(name, frontend, type, mask);

        if (IS_ERR(alg))
                return 0;

        crypto_mod_put(alg);
        return 1;
}
EXPORT_SYMBOL_GPL(crypto_type_has_alg);

/*
 * Run the self-tests that are still pending on crypto_alg_list.
 *
 * Does nothing when CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is enabled.
 * Otherwise, repeatedly picks the first test larval whose test has not
 * been started, marks it started under crypto_alg_sem and waits for it
 * outside the semaphore.  Once none is left,
 * set_crypto_boot_test_finished() is called.
 */
static void __init crypto_start_tests(void)
{
        struct crypto_larval *larval;

        if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS))
                return;

        do {
                struct crypto_alg *q;

                larval = NULL;

                down_write(&crypto_alg_sem);

                list_for_each_entry(q, &crypto_alg_list, cra_list) {
                        struct crypto_larval *l;

                        if (!crypto_is_larval(q))
                                continue;

                        l = (void *)q;

                        if (!crypto_is_test_larval(l) || l->test_started)
                                continue;

                        l->test_started = true;
                        larval = l;
                        break;
                }

                up_write(&crypto_alg_sem);

                if (larval)
                        crypto_wait_for_test(larval);
        } while (larval);

        set_crypto_boot_test_finished();
}

/*
 * Init hook: call crypto_init_proc(), then run the pending self-tests via
 * crypto_start_tests().  Always returns 0.
 */
static int __init crypto_algapi_init(void)
{
        crypto_init_proc();
        crypto_start_tests();
        return 0;
}

/* Exit hook: undo crypto_algapi_init() by calling crypto_exit_proc(). */
static void __exit crypto_algapi_exit(void)
{
        crypto_exit_proc();
}

/*
 * We run this at late_initcall so that all the built-in algorithms
 * have had a chance to register themselves first.
 */
late_initcall(crypto_algapi_init);
module_exit(crypto_algapi_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Cryptographic algorithms API");
MODULE_SOFTDEP("pre: cryptomgr");

























































































































































    2 








    3 



























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Directory notifications for Linux.
 *
 * Copyright (C) 2000,2001,2002 Stephen Rothwell
 *
 * Copyright (C) 2009 Eric Paris <Red Hat Inc>
 * dnotify was largely rewritten to use the new fsnotify infrastructure
 */
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/dnotify.h>
#include <linux/init.h>
#include <linux/security.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/fdtable.h>
#include <linux/fsnotify_backend.h>

/* Backing value of the "dir-notify-enable" sysctl below; defaults to 1. */
static int dir_notify_enable __read_mostly = 1;
#ifdef CONFIG_SYSCTL
/* sysctl table with a single integer entry backed by dir_notify_enable. */
static struct ctl_table dnotify_sysctls[] = {
        {
                .procname        = "dir-notify-enable",
                .data                = &dir_notify_enable,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
};
/* Register dnotify_sysctls under the "fs" sysctl path. */
static void __init dnotify_sysctl_init(void)
{
        register_sysctl_init("fs", dnotify_sysctls);
}
#else
/* Without CONFIG_SYSCTL there is nothing to register. */
#define dnotify_sysctl_init() do { } while (0)
#endif

/*
 * File-scope objects marked __ro_after_init.  Their initialisation is not
 * visible in this part of the file.
 * NOTE(review): presumably the two caches back struct dnotify_struct and
 * struct dnotify_mark allocations, and dnotify_group is the fsnotify group
 * all dnotify marks belong to - confirm against the init code.
 */
static struct kmem_cache *dnotify_struct_cache __ro_after_init;
static struct kmem_cache *dnotify_mark_cache __ro_after_init;
static struct fsnotify_group *dnotify_group __ro_after_init;

/*
 * dnotify will attach one of these to each inode (i_fsnotify_marks) which
 * is being watched by dnotify.  If multiple userspace applications are watching
 * the same directory with dnotify their information is chained in dn
 */
struct dnotify_mark {
        /* Embedded fsnotify mark; container_of() maps it back to this struct. */
        struct fsnotify_mark fsn_mark;
        /* Head of the watcher list, chained through dn_next. */
        struct dnotify_struct *dn;
};

/*
 * Recompute the set of events a dnotify mark is interested in.
 *
 * The set changes whenever a process starts or stops watching the inode.
 * OR together the masks of every dnotify_struct chained on the mark
 * (FS_DN_MULTISHOT is a per-watcher flag and is left out) and, only if the
 * result differs from the mark's current mask, store it and ask fsnotify to
 * refresh the combined mask of the object the mark is attached to.
 *
 * The caller must hold fsn_mark->lock.
 */
static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
{
        struct dnotify_mark *dn_mark;
        struct dnotify_struct *cur;
        __u32 wanted = 0;

        dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);

        assert_spin_locked(&fsn_mark->lock);

        cur = dn_mark->dn;
        while (cur) {
                wanted |= cur->dn_mask & ~FS_DN_MULTISHOT;
                cur = cur->dn_next;
        }

        /* only touch the mark and the connector when something changed */
        if (fsn_mark->mask != wanted) {
                fsn_mark->mask = wanted;
                fsnotify_recalc_mask(fsn_mark->connector);
        }
}

/*
 * Main fsnotify callback through which events are delivered to dnotify
 * (installed as ->handle_inode_event in dnotify_fsnotify_ops).
 *
 * Walk the dnotify_structs chained on the mark and, for every one whose
 * dn_mask intersects the event, signal the owner of its file with
 * send_sigio().  An entry that was not registered with FS_DN_MULTISHOT is
 * unlinked and freed after its first signal, and the mark's mask is
 * recomputed.
 *
 * @inode_mark: the dnotify mark the event is delivered for
 * @mask:       FS_* event bits; FS_EVENT_ON_CHILD is ignored when matching
 * @dir:        only tested for NULL, together with FS_ISDIR in @mask
 * @inode, @name, @cookie: not used by dnotify
 *
 * Always returns 0.
 */
static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
                                struct inode *inode, struct inode *dir,
                                const struct qstr *name, u32 cookie)
{
        struct dnotify_mark *dn_mark;
        struct dnotify_struct *dn;
        struct dnotify_struct **prev;
        struct fown_struct *fown;
        /*
         * convert_arg() puts FS_EVENT_ON_CHILD into every dn_mask, so it must
         * be stripped from the event or it alone would make every entry match.
         */
        __u32 test_mask = mask & ~FS_EVENT_ON_CHILD;

        /* not a dir, dnotify doesn't care */
        if (!dir && !(mask & FS_ISDIR))
                return 0;

        dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);

        spin_lock(&inode_mark->lock);
        /*
         * prev always points at the link that leads to dn, so an entry can be
         * unlinked without remembering its predecessor.
         */
        prev = &dn_mark->dn;
        while ((dn = *prev) != NULL) {
                /* this watcher did not ask for any of the reported events */
                if ((dn->dn_mask & test_mask) == 0) {
                        prev = &dn->dn_next;
                        continue;
                }
                fown = &dn->dn_filp->f_owner;
                send_sigio(fown, dn->dn_fd, POLL_MSG);
                if (dn->dn_mask & FS_DN_MULTISHOT)
                        prev = &dn->dn_next;
                else {
                        /*
                         * One-shot watch: drop it.  *prev now refers to the
                         * following entry, so prev is deliberately not advanced.
                         */
                        *prev = dn->dn_next;
                        kmem_cache_free(dnotify_struct_cache, dn);
                        dnotify_recalc_inode_mask(inode_mark);
                }
        }

        spin_unlock(&inode_mark->lock);

        return 0;
}

/*
 * ->free_mark callback of dnotify_fsnotify_ops: return a dnotify_mark to its
 * slab cache.  Freeing a mark that still has dnotify_structs chained to it
 * is treated as a fatal bug.
 */
static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
{
        struct dnotify_mark *mark;

        mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);

        /* the watcher list must already be empty at this point */
        BUG_ON(mark->dn);

        kmem_cache_free(dnotify_mark_cache, mark);
}

/*
 * fsnotify callbacks for the dnotify group: event delivery and mark
 * destruction.  Passed to fsnotify_alloc_group() in dnotify_init().
 */
static const struct fsnotify_ops dnotify_fsnotify_ops = {
        .handle_inode_event = dnotify_handle_event,
        .free_mark = dnotify_free_mark,
};

/*
 * Called every time a file is closed, and by fcntl_dirnotify() when a watch
 * is explicitly removed.  Looks first for a dnotify mark on the inode.  If
 * one is found, run all of the ->dn structures attached to that mark looking
 * for the one that belongs to this (filp, id) pair and remove that
 * dnotify_struct.  If that was the last dnotify_struct also remove the
 * fsnotify_mark.
 *
 * @filp: the file whose watch is being dropped; ignored unless a directory
 * @id:   the owner the watch was registered under (dn_owner)
 */
void dnotify_flush(struct file *filp, fl_owner_t id)
{
        struct fsnotify_mark *fsn_mark;
        struct dnotify_mark *dn_mark;
        struct dnotify_struct *dn;
        struct dnotify_struct **prev;
        struct inode *inode;
        bool free = false;

        /* dnotify watches exist only on directories */
        inode = file_inode(filp);
        if (!S_ISDIR(inode->i_mode))
                return;

        /* no dnotify mark on this inode means there is nothing to flush */
        fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group);
        if (!fsn_mark)
                return;
        dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);

        fsnotify_group_lock(dnotify_group);

        spin_lock(&fsn_mark->lock);
        /* prev points at the link leading to dn so dn can be unlinked in place */
        prev = &dn_mark->dn;
        while ((dn = *prev) != NULL) {
                if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
                        *prev = dn->dn_next;
                        kmem_cache_free(dnotify_struct_cache, dn);
                        dnotify_recalc_inode_mask(fsn_mark);
                        /* attach_dn() keeps at most one entry per (id, filp) */
                        break;
                }
                prev = &dn->dn_next;
        }

        spin_unlock(&fsn_mark->lock);

        /* nothing else could have found us thanks to the dnotify_groups
           mark_mutex */
        if (dn_mark->dn == NULL) {
                fsnotify_detach_mark(fsn_mark);
                free = true;
        }

        fsnotify_group_unlock(dnotify_group);

        /* the mark is freed only after the group lock has been released */
        if (free)
                fsnotify_free_mark(fsn_mark);
        /* NOTE(review): presumably drops the reference taken by the lookup above */
        fsnotify_put_mark(fsn_mark);
}

/*
 * Translate the userspace DN_* bits of a dnotify request into the FS_* event
 * mask used internally.  FS_EVENT_ON_CHILD is always part of the result and
 * DN_* bits not handled below are ignored.  This conversion is done only at
 * watch creation.
 */
static __u32 convert_arg(unsigned int arg)
{
        __u32 fs_mask = FS_EVENT_ON_CHILD;

        fs_mask |= (arg & DN_MULTISHOT) ? FS_DN_MULTISHOT : 0;
        /* a delete or create request also covers the matching half of a move */
        fs_mask |= (arg & DN_DELETE) ? (FS_DELETE | FS_MOVED_FROM) : 0;
        fs_mask |= (arg & DN_MODIFY) ? FS_MODIFY : 0;
        fs_mask |= (arg & DN_ACCESS) ? FS_ACCESS : 0;
        fs_mask |= (arg & DN_ATTRIB) ? FS_ATTRIB : 0;
        fs_mask |= (arg & DN_RENAME) ? FS_RENAME : 0;
        fs_mask |= (arg & DN_CREATE) ? (FS_CREATE | FS_MOVED_TO) : 0;

        return fs_mask;
}

/*
 * If multiple processes watch the same inode with dnotify there is only one
 * dnotify mark in inode->i_fsnotify_marks, with one dnotify_struct per
 * watcher chained onto it.
 *
 * If the list already holds a dnotify_struct for this (id, filp) pair, that
 * entry is updated in place: its fd is replaced and @mask is OR-ed into its
 * mask.  @dn is then left untouched and -EEXIST is returned so the caller
 * knows it still owns @dn.  Otherwise @dn is filled in, pushed onto the head
 * of the list and 0 is returned.
 */
static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark,
                     fl_owner_t id, int fd, struct file *filp, __u32 mask)
{
        struct dnotify_struct *cur;

        for (cur = dn_mark->dn; cur; cur = cur->dn_next) {
                if (cur->dn_owner != id || cur->dn_filp != filp)
                        continue;

                /* adding more events to an existing dnotify_struct */
                cur->dn_fd = fd;
                cur->dn_mask |= mask;
                return -EEXIST;
        }

        /* no entry for this watcher yet: initialise dn and link it in first */
        dn->dn_next = dn_mark->dn;
        dn->dn_owner = id;
        dn->dn_filp = filp;
        dn->dn_fd = fd;
        dn->dn_mask = mask;
        dn_mark->dn = dn;

        return 0;
}

/*
 * When a process calls fcntl to attach a dnotify watch to a directory it ends
 * up here.  Allocate both a mark for fsnotify to add and a dnotify_struct to be
 * attached to the fsnotify_mark.
 *
 * @fd:   the descriptor the request was made on; stored in the watch and
 *        re-looked-up below to detect a concurrent close
 * @filp: the file behind @fd
 * @arg:  DN_* bits; a value with no bit set other than DN_MULTISHOT removes
 *        the caller's existing watch instead of adding one
 *
 * Returns 0 on success (including the removal case and the lost-race case
 * described below), -EINVAL when dnotify is disabled, -ENOTDIR when @filp is
 * not a directory, -ENOMEM on allocation failure, or the error reported by
 * security_path_notify() or fsnotify_add_inode_mark_locked().
 */
int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
{
        struct dnotify_mark *new_dn_mark, *dn_mark;
        struct fsnotify_mark *new_fsn_mark, *fsn_mark;
        struct dnotify_struct *dn;
        struct inode *inode;
        /* watches are keyed by the caller's files_struct together with filp */
        fl_owner_t id = current->files;
        struct file *f = NULL;
        int destroy = 0, error = 0;
        __u32 mask;

        /* we use these to tell if we need to kfree */
        new_fsn_mark = NULL;
        dn = NULL;

        if (!dir_notify_enable) {
                error = -EINVAL;
                goto out_err;
        }

        /* a 0 mask means we are explicitly removing the watch */
        if ((arg & ~DN_MULTISHOT) == 0) {
                dnotify_flush(filp, id);
                error = 0;
                goto out_err;
        }

        /* dnotify only works on directories */
        inode = file_inode(filp);
        if (!S_ISDIR(inode->i_mode)) {
                error = -ENOTDIR;
                goto out_err;
        }

        /*
         * convert the userspace DN_* "arg" to the internal FS_*
         * defined in fsnotify
         */
        mask = convert_arg(arg);

        error = security_path_notify(&filp->f_path, mask,
                        FSNOTIFY_OBJ_TYPE_INODE);
        if (error)
                goto out_err;

        /* expect most fcntl to add new rather than augment old */
        dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL);
        if (!dn) {
                error = -ENOMEM;
                goto out_err;
        }

        /* new fsnotify mark, we expect most fcntl calls to add a new mark */
        new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL);
        if (!new_dn_mark) {
                error = -ENOMEM;
                goto out_err;
        }

        /* set up the new_fsn_mark and new_dn_mark */
        new_fsn_mark = &new_dn_mark->fsn_mark;
        fsnotify_init_mark(new_fsn_mark, dnotify_group);
        new_fsn_mark->mask = mask;
        new_dn_mark->dn = NULL;

        /* this is needed to prevent the fcntl/close race described below */
        fsnotify_group_lock(dnotify_group);

        /* add the new_fsn_mark or find an old one. */
        fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group);
        if (fsn_mark) {
                /* reuse the existing mark; new_fsn_mark is released at out_err */
                dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
                spin_lock(&fsn_mark->lock);
        } else {
                error = fsnotify_add_inode_mark_locked(new_fsn_mark, inode, 0);
                if (error) {
                        fsnotify_group_unlock(dnotify_group);
                        goto out_err;
                }
                spin_lock(&new_fsn_mark->lock);
                fsn_mark = new_fsn_mark;
                dn_mark = new_dn_mark;
                /* we used new_fsn_mark, so don't free it */
                new_fsn_mark = NULL;
        }

        /* re-resolve fd to see whether it still refers to filp */
        rcu_read_lock();
        f = lookup_fdget_rcu(fd);
        rcu_read_unlock();

        /* if (f != filp) means that we lost a race and another task/thread
         * actually closed the fd we are still playing with before we grabbed
         * the dnotify_groups mark_mutex and fsn_mark->lock.  Since closing the
         * fd is the only time we clean up the marks we need to get our mark
         * off the list. */
        if (f != filp) {
                /* if we added ourselves, shoot ourselves, it's possible that
                 * the flush actually did shoot this fsn_mark.  That's fine too
                 * since multiple calls to destroy_mark is perfectly safe, if
                 * we found a dn_mark already attached to the inode, just sod
                 * off silently as the flush at close time dealt with it.
                 */
                if (dn_mark == new_dn_mark)
                        destroy = 1;
                error = 0;
                goto out;
        }

        /* signals for this watch go to the calling thread group */
        __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0);

        error = attach_dn(dn, dn_mark, id, fd, filp, mask);
        /* !error means that we attached the dn to the dn_mark, so don't free it */
        if (!error)
                dn = NULL;
        /* -EEXIST means that we didn't add this new dn and used an old one.
         * that isn't an error (and the unused dn should be freed) */
        else if (error == -EEXIST)
                error = 0;

        dnotify_recalc_inode_mask(fsn_mark);
out:
        spin_unlock(&fsn_mark->lock);

        /* detach while the group lock is held, free only after dropping it */
        if (destroy)
                fsnotify_detach_mark(fsn_mark);
        fsnotify_group_unlock(dnotify_group);
        if (destroy)
                fsnotify_free_mark(fsn_mark);
        /*
         * NOTE(review): presumably drops either the reference returned by the
         * lookup or the initial reference of the newly added mark - confirm
         * against the fsnotify mark refcounting rules.
         */
        fsnotify_put_mark(fsn_mark);
out_err:
        /* release whatever was allocated but ended up unused */
        if (new_fsn_mark)
                fsnotify_put_mark(new_fsn_mark);
        if (dn)
                kmem_cache_free(dnotify_struct_cache, dn);
        /* f is non-NULL only if lookup_fdget_rcu() returned a file */
        if (f)
                fput(f);
        return error;
}

/*
 * Boot-time setup for dnotify: create the two slab caches, allocate the
 * fsnotify group used for every dnotify mark, and register the sysctl entry.
 * Panics if the group cannot be allocated; otherwise returns 0.
 */
static int __init dnotify_init(void)
{
        struct fsnotify_group *group;

        /* both caches pass SLAB_PANIC, so their creation is not checked here */
        dnotify_struct_cache = KMEM_CACHE(dnotify_struct,
                                          SLAB_PANIC | SLAB_ACCOUNT);
        dnotify_mark_cache = KMEM_CACHE(dnotify_mark,
                                        SLAB_PANIC | SLAB_ACCOUNT);

        group = fsnotify_alloc_group(&dnotify_fsnotify_ops,
                                     FSNOTIFY_GROUP_NOFS);
        dnotify_group = group;
        if (IS_ERR(group))
                panic("unable to allocate fsnotify group for dnotify\n");

        dnotify_sysctl_init();

        return 0;
}

module_init(dnotify_init)

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 */
#ifndef _LINUX_BPF_H
#define _LINUX_BPF_H 1

#include <uapi/linux/bpf.h>
#include <uapi/linux/filter.h>

#include <linux/workqueue.h>
#include <linux/file.h>
#include <linux/percpu.h>
#include <linux/err.h>
#include <linux/rbtree_latch.h>
#include <linux/numa.h>
#include <linux/mm_types.h>
#include <linux/wait.h>
#include <linux/refcount.h>
#include <linux/mutex.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/capability.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/stddef.h>
#include <linux/bpfptr.h>
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/static_call.h>
#include <linux/memcontrol.h>
#include <linux/cfi.h>

struct bpf_verifier_env;
struct bpf_verifier_log;
struct perf_event;
struct bpf_prog;
struct bpf_prog_aux;
struct bpf_map;
struct bpf_arena;
struct sock;
struct seq_file;
struct btf;
struct btf_type;
struct exception_table_entry;
struct seq_operations;
struct bpf_iter_aux_info;
struct bpf_local_storage;
struct bpf_local_storage_map;
struct kobject;
struct mem_cgroup;
struct module;
struct bpf_func_state;
struct ftrace_ops;
struct cgroup;
struct bpf_token;
struct user_namespace;
struct super_block;
struct inode;

extern struct idr btf_idr;
extern spinlock_t btf_idr_lock;
extern struct kobject *btf_kobj;
extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
extern bool bpf_global_ma_set;

typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
                                        struct bpf_iter_aux_info *aux);
typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
typedef unsigned int (*bpf_func_t)(const void *,
                                   const struct bpf_insn *);
/* Bundles the seq_file operations of a bpf iterator together with the hooks
 * and size describing its private data.
 * NOTE(review): the per-field roles below are read off the typedef
 * signatures and the field names; confirm against the iterator core.
 */
struct bpf_iter_seq_info {
        const struct seq_operations *seq_ops;
        /* int (*)(void *private_data, struct bpf_iter_aux_info *aux) */
        bpf_iter_init_seq_priv_t init_seq_private;
        /* void (*)(void *private_data) */
        bpf_iter_fini_seq_priv_t fini_seq_private;
        /* presumably the size in bytes of private_data - TODO confirm */
        u32 seq_priv_size;
};

/* map is generic key/value storage optionally accessible by eBPF programs
 *
 * struct bpf_map_ops is the table of callbacks a map type provides. Each
 * struct bpf_map points at one such table through its 'ops' member.
 * NOTE(review): which callbacks are mandatory is not visible in this header;
 * check a callback for NULL unless the caller is known to guarantee it
 * (bpf_map_support_seq_show() does so for map_seq_show_elem).
 */
struct bpf_map_ops {
        /* funcs callable from userspace (via syscall) */
        int (*map_alloc_check)(union bpf_attr *attr);
        struct bpf_map *(*map_alloc)(union bpf_attr *attr);
        void (*map_release)(struct bpf_map *map, struct file *map_file);
        void (*map_free)(struct bpf_map *map);
        int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
        void (*map_release_uref)(struct bpf_map *map);
        void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
        int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
                                union bpf_attr __user *uattr);
        int (*map_lookup_and_delete_elem)(struct bpf_map *map, void *key,
                                          void *value, u64 flags);
        int (*map_lookup_and_delete_batch)(struct bpf_map *map,
                                           const union bpf_attr *attr,
                                           union bpf_attr __user *uattr);
        int (*map_update_batch)(struct bpf_map *map, struct file *map_file,
                                const union bpf_attr *attr,
                                union bpf_attr __user *uattr);
        int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr,
                                union bpf_attr __user *uattr);

        /* funcs callable from userspace and from eBPF programs */
        void *(*map_lookup_elem)(struct bpf_map *map, void *key);
        long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
        long (*map_delete_elem)(struct bpf_map *map, void *key);
        long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
        long (*map_pop_elem)(struct bpf_map *map, void *value);
        long (*map_peek_elem)(struct bpf_map *map, void *value);
        void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu);

        /* funcs called by prog_array and perf_event_array map */
        void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
                                int fd);
        /* If need_defer is true, the implementation should guarantee that
         * the to-be-put element is still alive before the bpf program, which
         * may manipulate it, exits.
         */
        void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer);
        int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
        u32 (*map_fd_sys_lookup_elem)(void *ptr);
        void (*map_seq_show_elem)(struct bpf_map *map, void *key,
                                  struct seq_file *m);
        int (*map_check_btf)(const struct bpf_map *map,
                             const struct btf *btf,
                             const struct btf_type *key_type,
                             const struct btf_type *value_type);

        /* Prog poke tracking helpers. */
        int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
        void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
        void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old,
                             struct bpf_prog *new);

        /* Direct value access helpers. */
        int (*map_direct_value_addr)(const struct bpf_map *map,
                                     u64 *imm, u32 off);
        int (*map_direct_value_meta)(const struct bpf_map *map,
                                     u64 imm, u32 *off);
        int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma);
        __poll_t (*map_poll)(struct bpf_map *map, struct file *filp,
                             struct poll_table_struct *pts);
        unsigned long (*map_get_unmapped_area)(struct file *filep, unsigned long addr,
                                               unsigned long len, unsigned long pgoff,
                                               unsigned long flags);

        /* Functions called by bpf_local_storage maps */
        int (*map_local_storage_charge)(struct bpf_local_storage_map *smap,
                                        void *owner, u32 size);
        void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap,
                                           void *owner, u32 size);
        struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);

        /* Misc helpers.*/
        long (*map_redirect)(struct bpf_map *map, u64 key, u64 flags);

        /* map_meta_equal must be implemented for maps that can be
         * used as an inner map.  It is a runtime check to ensure
         * an inner map can be inserted to an outer map.
         *
         * Some properties of the inner map have been used during the
         * verification time.  When inserting an inner map at the runtime,
         * map_meta_equal has to ensure the inserting map has the same
         * properties that the verifier has used earlier.
         */
        bool (*map_meta_equal)(const struct bpf_map *meta0,
                               const struct bpf_map *meta1);


        /* Per-element callback iteration: one hook takes the verifier
         * environment and caller/callee function states, the other takes
         * the callback itself plus its context and flags.
         * NOTE(review): presumably the verifier-time and run-time halves of
         * the same feature - confirm against the verifier.
         */
        int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env,
                                              struct bpf_func_state *caller,
                                              struct bpf_func_state *callee);
        long (*map_for_each_callback)(struct bpf_map *map,
                                     bpf_callback_t callback_fn,
                                     void *callback_ctx, u64 flags);

        u64 (*map_mem_usage)(const struct bpf_map *map);

        /* BTF id of struct allocated by map_alloc */
        int *map_btf_id;

        /* bpf_iter info used to open a seq_file */
        const struct bpf_iter_seq_info *iter_seq_info;
};

/* Limits for special-field tracking, kept as an enum constant rather than a
 * macro.
 */
enum {
        /* Support at most 11 fields in a BTF type.
         * NOTE(review): presumably the upper bound for btf_record::cnt -
         * confirm where the limit is enforced.
         */
        BTF_FIELDS_MAX           = 11,
};

/* Kinds of special fields that can be described by a struct btf_field.
 *
 * Every concrete kind occupies its own bit, so a set of kinds can be kept in
 * a bitmask (btf_record_has_field() tests btf_record::field_mask with '&').
 * BPF_KPTR, BPF_GRAPH_NODE and BPF_GRAPH_ROOT are not kinds of their own:
 * they are multi-bit masks grouping the kinds OR-ed together in their
 * initializers, and the per-kind switch statements in this header do not
 * handle them.
 */
enum btf_field_type {
        BPF_SPIN_LOCK  = (1 << 0),
        BPF_TIMER      = (1 << 1),
        BPF_KPTR_UNREF = (1 << 2),
        BPF_KPTR_REF   = (1 << 3),
        BPF_KPTR_PERCPU = (1 << 4),
        /* group mask: any kptr flavour */
        BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF | BPF_KPTR_PERCPU,
        BPF_LIST_HEAD  = (1 << 5),
        BPF_LIST_NODE  = (1 << 6),
        BPF_RB_ROOT    = (1 << 7),
        BPF_RB_NODE    = (1 << 8),
        /* group masks: list/rbtree nodes and list/rbtree roots */
        BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE,
        BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD,
        BPF_REFCOUNT   = (1 << 9),
        BPF_WORKQUEUE  = (1 << 10),
};

typedef void (*btf_dtor_kfunc_t)(void *);

/* Type information attached to a kptr field (the 'kptr' member of the union
 * in struct btf_field).
 */
struct btf_field_kptr {
        /* BTF object and type id within it.
         * NOTE(review): presumably describe the pointee type - confirm.
         */
        struct btf *btf;
        struct module *module;
        /* dtor used if btf_is_kernel(btf), otherwise the type is
         * program-allocated, dtor is NULL,  and __bpf_obj_drop_impl is used
         */
        btf_dtor_kfunc_t dtor;
        u32 btf_id;
};

/* Type information attached to a graph root field (the 'graph_root' member
 * of the union in struct btf_field).
 * NOTE(review): field meanings below are inferred from names only; confirm
 * against the BTF parsing code.
 */
struct btf_field_graph_root {
        struct btf *btf;
        /* presumably the BTF id of the value type held in the collection */
        u32 value_btf_id;
        /* presumably the offset of the graph node inside that value type */
        u32 node_offset;
        /* presumably the btf_record describing the value type */
        struct btf_record *value_rec;
};

/* Descriptor of one special field inside an object.
 *
 * 'offset' is added to the object's base address to locate the field and
 * 'size' is the number of bytes it occupies: bpf_obj_init_field() zeroes
 * 'size' bytes, and bpf_obj_memcpy()/bpf_obj_memzero() skip
 * [offset, offset + size).
 */
struct btf_field {
        u32 offset;
        u32 size;
        enum btf_field_type type;
        /* Extra per-kind data.
         * NOTE(review): presumably 'kptr' is valid for the BPF_KPTR_* kinds
         * and 'graph_root' for BPF_LIST_HEAD/BPF_RB_ROOT - confirm.
         */
        union {
                struct btf_field_kptr kptr;
                struct btf_field_graph_root graph_root;
        };
};

/* Collection of the special fields found in one object type.
 *
 * Helpers in this header treat a NULL or ERR_PTR record as "no special
 * fields". bpf_obj_memcpy() and bpf_obj_memzero() walk fields[] in array
 * order and compute the gap before each entry with unsigned arithmetic, so
 * they rely on the entries being sorted by ascending, non-overlapping offset.
 */
struct btf_record {
        /* number of valid entries in fields[] */
        u32 cnt;
        /* tested against enum btf_field_type bits by btf_record_has_field() */
        u32 field_mask;
        /* NOTE(review): presumably the object offsets of the corresponding
         * fields; the value used when a field is absent is not visible in
         * this chunk - confirm.
         */
        int spin_lock_off;
        int timer_off;
        int wq_off;
        int refcount_off;
        struct btf_field fields[];
};

/* Non-opaque version of bpf_rb_node in uapi/linux/bpf.h
 *
 * rb_node is the first member, so a pointer to this struct is also a pointer
 * to its rb_node (bpf_obj_init_field() casts the field address to
 * struct rb_node *). The struct is forced to 8-byte alignment.
 */
struct bpf_rb_node_kern {
        struct rb_node rb_node;
        /* NOTE(review): presumably identifies the tree the node is linked into - confirm */
        void *owner;
} __attribute__((aligned(8)));

/* Non-opaque version of bpf_list_node in uapi/linux/bpf.h
 *
 * list_head is the first member, so a pointer to this struct is also a
 * pointer to its list_head (bpf_obj_init_field() casts the field address to
 * struct list_head *). The struct is forced to 8-byte alignment.
 */
struct bpf_list_node_kern {
        struct list_head list_head;
        /* NOTE(review): presumably identifies the list the node is linked into - confirm */
        void *owner;
} __attribute__((aligned(8)));

/* Common state shared by every map type.
 *
 * Type-specific map structures embed a struct bpf_map and recover the outer
 * structure with container_of() (see struct bpf_offloaded_map and
 * map_to_offmap()).
 */
struct bpf_map {
        /* callback table of the map type */
        const struct bpf_map_ops *ops;
        struct bpf_map *inner_map_meta;
#ifdef CONFIG_SECURITY
        void *security;
#endif
        enum bpf_map_type map_type;
        u32 key_size;
        /* number of bytes handled by copy_map_value()/zero_map_value() */
        u32 value_size;
        u32 max_entries;
        u64 map_extra; /* any per-map-type extra fields */
        u32 map_flags;
        u32 id;
        /* special fields of the map value; the helpers in this header
         * accept NULL or an ERR_PTR here
         */
        struct btf_record *record;
        int numa_node;
        u32 btf_key_type_id;
        u32 btf_value_type_id;
        u32 btf_vmlinux_value_type_id;
        struct btf *btf;
#ifdef CONFIG_MEMCG_KMEM
        struct obj_cgroup *objcg;
#endif
        char name[BPF_OBJ_NAME_LEN];
        struct mutex freeze_mutex;
        atomic64_t refcnt;
        atomic64_t usercnt;
        /* rcu is used before freeing and work is only used during freeing */
        union {
                struct work_struct work;
                struct rcu_head rcu;
        };
        atomic64_t writecnt;
        /* 'Ownership' of program-containing map is claimed by the first program
         * that is going to use this map or by the first program which FD is
         * stored in the map to make sure that all callers and callees have the
         * same prog type, JITed flag and xdp_has_frags flag.
         */
        struct {
                spinlock_t lock;
                enum bpf_prog_type type;
                bool jited;
                bool xdp_has_frags;
        } owner;
        bool bypass_spec_v1;
        bool frozen; /* write-once; write-protected by freeze_mutex */
        bool free_after_mult_rcu_gp;
        bool free_after_rcu_gp;
        atomic64_t sleepable_refcnt;
        s64 __percpu *elem_count;
};

/* Translate a single btf_field_type value into a printable name.
 *
 * BPF_KPTR_UNREF and BPF_KPTR_REF share the name "kptr". A value that has no
 * case of its own (the combined masks included) hits WARN_ON_ONCE() and is
 * reported as "unknown".
 */
static inline const char *btf_field_type_name(enum btf_field_type type)
{
        const char *name;

        switch (type) {
        case BPF_SPIN_LOCK:
                name = "bpf_spin_lock";
                break;
        case BPF_TIMER:
                name = "bpf_timer";
                break;
        case BPF_WORKQUEUE:
                name = "bpf_wq";
                break;
        case BPF_REFCOUNT:
                name = "bpf_refcount";
                break;
        case BPF_KPTR_UNREF:
        case BPF_KPTR_REF:
                name = "kptr";
                break;
        case BPF_KPTR_PERCPU:
                name = "percpu_kptr";
                break;
        case BPF_LIST_HEAD:
                name = "bpf_list_head";
                break;
        case BPF_LIST_NODE:
                name = "bpf_list_node";
                break;
        case BPF_RB_ROOT:
                name = "bpf_rb_root";
                break;
        case BPF_RB_NODE:
                name = "bpf_rb_node";
                break;
        default:
                WARN_ON_ONCE(1);
                name = "unknown";
                break;
        }

        return name;
}

/* Return the size in bytes of the structure backing a single field type.
 *
 * All three kptr flavours are sizeof(u64). A value that has no case of its
 * own hits WARN_ON_ONCE() and yields 0.
 */
static inline u32 btf_field_type_size(enum btf_field_type type)
{
        u32 bytes;

        switch (type) {
        case BPF_SPIN_LOCK:
                bytes = sizeof(struct bpf_spin_lock);
                break;
        case BPF_TIMER:
                bytes = sizeof(struct bpf_timer);
                break;
        case BPF_WORKQUEUE:
                bytes = sizeof(struct bpf_wq);
                break;
        case BPF_REFCOUNT:
                bytes = sizeof(struct bpf_refcount);
                break;
        case BPF_KPTR_UNREF:
        case BPF_KPTR_REF:
        case BPF_KPTR_PERCPU:
                bytes = sizeof(u64);
                break;
        case BPF_LIST_HEAD:
                bytes = sizeof(struct bpf_list_head);
                break;
        case BPF_LIST_NODE:
                bytes = sizeof(struct bpf_list_node);
                break;
        case BPF_RB_ROOT:
                bytes = sizeof(struct bpf_rb_root);
                break;
        case BPF_RB_NODE:
                bytes = sizeof(struct bpf_rb_node);
                break;
        default:
                WARN_ON_ONCE(1);
                bytes = 0;
                break;
        }

        return bytes;
}

/* Return the alignment of the structure backing a single field type.
 *
 * All three kptr flavours use the alignment of u64. A value that has no case
 * of its own hits WARN_ON_ONCE() and yields 0.
 */
static inline u32 btf_field_type_align(enum btf_field_type type)
{
        u32 align;

        switch (type) {
        case BPF_SPIN_LOCK:
                align = __alignof__(struct bpf_spin_lock);
                break;
        case BPF_TIMER:
                align = __alignof__(struct bpf_timer);
                break;
        case BPF_WORKQUEUE:
                align = __alignof__(struct bpf_wq);
                break;
        case BPF_REFCOUNT:
                align = __alignof__(struct bpf_refcount);
                break;
        case BPF_KPTR_UNREF:
        case BPF_KPTR_REF:
        case BPF_KPTR_PERCPU:
                align = __alignof__(u64);
                break;
        case BPF_LIST_HEAD:
                align = __alignof__(struct bpf_list_head);
                break;
        case BPF_LIST_NODE:
                align = __alignof__(struct bpf_list_node);
                break;
        case BPF_RB_ROOT:
                align = __alignof__(struct bpf_rb_root);
                break;
        case BPF_RB_NODE:
                align = __alignof__(struct bpf_rb_node);
                break;
        default:
                WARN_ON_ONCE(1);
                align = 0;
                break;
        }

        return align;
}

static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
{
        memset(addr, 0, field->size);

        switch (field->type) {
        case BPF_REFCOUNT:
                refcount_set((refcount_t *)addr, 1);
                break;
        case BPF_RB_NODE:
                RB_CLEAR_NODE((struct rb_node *)addr);
                break;
        case BPF_LIST_HEAD:
        case BPF_LIST_NODE:
                INIT_LIST_HEAD((struct list_head *)addr);
                break;
        case BPF_RB_ROOT:
                /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */
        case BPF_SPIN_LOCK:
        case BPF_TIMER:
        case BPF_WORKQUEUE:
        case BPF_KPTR_UNREF:
        case BPF_KPTR_REF:
        case BPF_KPTR_PERCPU:
                break;
        default:
                WARN_ON_ONCE(1);
                return;
        }
}

/* Report whether @rec contains at least one field whose type bit is set in
 * @type. A NULL or ERR_PTR record contains no fields.
 */
static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type)
{
        return !IS_ERR_OR_NULL(rec) && (rec->field_mask & type) != 0;
}

static inline void bpf_obj_init(const struct btf_record *rec, void *obj)
{
        int i;

        if (IS_ERR_OR_NULL(rec))
                return;
        for (i = 0; i < rec->cnt; i++)
                bpf_obj_init_field(&rec->fields[i], obj + rec->fields[i].offset);
}

/* (Re)initialise the special fields of a map value according to the map's
 * btf_record.
 *
 * 'dst' must be a temporary buffer. It must not point to memory that a bpf
 * program or bpf syscall may be using in parallel, because the
 * reinitialization can corrupt that concurrent access and lead to weird
 * problems. Note that a 'dst' freshly allocated from the bpf memory
 * allocator can still be in use in parallel by a bpf program or bpf syscall.
 */
static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
{
        const struct btf_record *rec = map->record;

        bpf_obj_init(rec, dst);
}

/* memcpy variant that copies one 'long' at a time, to try to copy long
 * counters atomically. Meant for 8-byte aligned pointers and a size that is a
 * multiple of 8; size / sizeof(long) words are copied, so any remainder
 * bytes are not.
 *
 * Best-effort only.  No barriers here, since it _will_ race with concurrent
 * updates from BPF programs. Called from bpf syscall and mostly used with
 * size 8 or 16 bytes, so ask compiler to inline it.
 */
static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
{
        const long *from = src;
        long *to = dst;
        u32 words = size / sizeof(long);
        u32 w;

        for (w = 0; w < words; w++)
                data_race(to[w] = from[w]);
}

/* Copy @size bytes from @src to @dst, leaving out the byte ranges of all
 * special fields listed in @rec (bpf_spin_lock, bpf_timer, kptrs, ...).
 *
 * Without a record (NULL or ERR_PTR) the whole value is copied, using
 * bpf_long_memcpy() on the size rounded up to 8 when @long_memcpy is set and
 * plain memcpy() otherwise. With a record, @long_memcpy is not consulted and
 * the gaps between fields are copied with memcpy().
 */
static inline void bpf_obj_memcpy(struct btf_record *rec,
                                  void *dst, void *src, u32 size,
                                  bool long_memcpy)
{
        u32 done = 0;
        int idx;

        if (IS_ERR_OR_NULL(rec)) {
                if (!long_memcpy) {
                        memcpy(dst, src, size);
                        return;
                }
                bpf_long_memcpy(dst, src, round_up(size, 8));
                return;
        }

        for (idx = 0; idx < rec->cnt; idx++) {
                const struct btf_field *field = &rec->fields[idx];
                u32 gap = field->offset - done;

                /* copy the plain bytes in front of this field ... */
                memcpy(dst + done, src + done, gap);
                /* ... and resume right behind it */
                done = field->offset + field->size;
        }
        /* tail after the last special field */
        memcpy(dst + done, src + done, size - done);
}

static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
{
        bpf_obj_memcpy(map->record, dst, src, map->value_size, false);
}

static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src)
{
        bpf_obj_memcpy(map->record, dst, src, map->value_size, true);
}

/* Zero @size bytes at @dst, leaving the byte ranges of all special fields
 * listed in @rec untouched. Without a record (NULL or ERR_PTR) the whole
 * range is zeroed.
 */
static inline void bpf_obj_memzero(struct btf_record *rec, void *dst, u32 size)
{
        u32 done = 0;
        int idx;

        if (IS_ERR_OR_NULL(rec)) {
                memset(dst, 0, size);
                return;
        }

        for (idx = 0; idx < rec->cnt; idx++) {
                const struct btf_field *field = &rec->fields[idx];
                u32 gap = field->offset - done;

                /* clear the plain bytes in front of this field ... */
                memset(dst + done, 0, gap);
                /* ... and resume right behind it */
                done = field->offset + field->size;
        }
        /* tail after the last special field */
        memset(dst + done, 0, size - done);
}

static inline void zero_map_value(struct bpf_map *map, void *dst)
{
        bpf_obj_memzero(map->record, dst, map->value_size);
}

void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
                           bool lock_src);
void bpf_timer_cancel_and_free(void *timer);
void bpf_wq_cancel_and_free(void *timer);
void bpf_list_head_free(const struct btf_field *field, void *list_head,
                        struct bpf_spin_lock *spin_lock);
void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
                      struct bpf_spin_lock *spin_lock);
u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena);
u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena);
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);

struct bpf_offload_dev;
struct bpf_offloaded_map;

/* Element operations of an offloaded map (see struct bpf_offloaded_map).
 *
 * Unlike the same-named callbacks in struct bpf_map_ops, these take the
 * struct bpf_offloaded_map and all return int; map_lookup_elem receives a
 * 'value' buffer argument instead of returning a pointer.
 */
struct bpf_map_dev_ops {
        int (*map_get_next_key)(struct bpf_offloaded_map *map,
                                void *key, void *next_key);
        int (*map_lookup_elem)(struct bpf_offloaded_map *map,
                               void *key, void *value);
        int (*map_update_elem)(struct bpf_offloaded_map *map,
                               void *key, void *value, u64 flags);
        int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key);
};

/* A map offloaded to a device. Embeds the generic struct bpf_map as the
 * 'map' member; map_to_offmap() converts back from the embedded map.
 */
struct bpf_offloaded_map {
        struct bpf_map map;
        /* NOTE(review): presumably the device the map is offloaded to - confirm */
        struct net_device *netdev;
        const struct bpf_map_dev_ops *dev_ops;
        /* NOTE(review): presumably opaque driver-private state - confirm */
        void *dev_priv;
        struct list_head offloads;
};

/* Recover the enclosing struct bpf_offloaded_map from a pointer to its
 * embedded 'map' member. Only meaningful when @map really is embedded in a
 * struct bpf_offloaded_map.
 */
static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map)
{
        struct bpf_offloaded_map *offmap;

        offmap = container_of(map, struct bpf_offloaded_map, map);
        return offmap;
}

static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
{
        return map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY;
}

static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
{
        return (map->btf_value_type_id || map->btf_vmlinux_value_type_id) &&
                map->ops->map_seq_show_elem;
}

int map_check_no_btf(const struct bpf_map *map,
                     const struct btf *btf,
                     const struct btf_type *key_type,
                     const struct btf_type *value_type);

bool bpf_map_meta_equal(const struct bpf_map *meta0,
                        const struct bpf_map *meta1);

extern const struct bpf_map_ops bpf_map_offload_ops;

/* bpf_type_flag contains a set of flags that are applicable to the values of
 * arg_type, ret_type and reg_type. For example, a pointer value may be null,
 * or a memory is read-only. We classify types into two categories: base types
 * and extended types. Extended types are base types combined with a type flag.
 *
 * Currently there are no more than 32 base types in arg_type, ret_type and
 * reg_types.
 */
#define BPF_BASE_TYPE_BITS        8

/* Type flags. Each flag is a single bit at or above BPF_BASE_TYPE_BITS, so a
 * flag can be OR-ed with a base type without touching the bits that hold the
 * base type.
 */
enum bpf_type_flag {
        /* PTR may be NULL. */
        PTR_MAYBE_NULL                = BIT(0 + BPF_BASE_TYPE_BITS),

        /* MEM is read-only. When applied on bpf_arg, it indicates the arg is
         * compatible with both mutable and immutable memory.
         */
        MEM_RDONLY                = BIT(1 + BPF_BASE_TYPE_BITS),

        /* MEM points to BPF ring buffer reservation. */
        MEM_RINGBUF                = BIT(2 + BPF_BASE_TYPE_BITS),

        /* MEM is in user address space. */
        MEM_USER                = BIT(3 + BPF_BASE_TYPE_BITS),

        /* MEM is a percpu memory. MEM_PERCPU tags PTR_TO_BTF_ID. When tagged
         * with MEM_PERCPU, PTR_TO_BTF_ID _cannot_ be directly accessed. In
         * order to drop this tag, it must be passed into bpf_per_cpu_ptr()
         * or bpf_this_cpu_ptr(), which will return the pointer corresponding
         * to the specified cpu.
         */
        MEM_PERCPU                = BIT(4 + BPF_BASE_TYPE_BITS),

        /* Indicates that the argument will be released. */
        OBJ_RELEASE                = BIT(5 + BPF_BASE_TYPE_BITS),

        /* PTR is not trusted. This is only used with PTR_TO_BTF_ID, to mark
         * unreferenced and referenced kptr loaded from map value using a load
         * instruction, so that they can only be dereferenced but not escape the
         * BPF program into the kernel (i.e. cannot be passed as arguments to
         * kfunc or bpf helpers).
         */
        PTR_UNTRUSTED                = BIT(6 + BPF_BASE_TYPE_BITS),

        /* MEM does not need to be initialized. Combined with ARG_PTR_TO_MEM
         * to form ARG_PTR_TO_UNINIT_MEM; see the comment on that enumerator
         * for what the helper must do with such memory.
         */
        MEM_UNINIT                = BIT(7 + BPF_BASE_TYPE_BITS),

        /* DYNPTR points to memory local to the bpf program. */
        DYNPTR_TYPE_LOCAL        = BIT(8 + BPF_BASE_TYPE_BITS),

        /* DYNPTR points to a kernel-produced ringbuf record. */
        DYNPTR_TYPE_RINGBUF        = BIT(9 + BPF_BASE_TYPE_BITS),

        /* Size is known at compile time. */
        MEM_FIXED_SIZE                = BIT(10 + BPF_BASE_TYPE_BITS),

        /* MEM is of an allocated object of type in program BTF. This is used to
         * tag PTR_TO_BTF_ID allocated using bpf_obj_new.
         */
        MEM_ALLOC                = BIT(11 + BPF_BASE_TYPE_BITS),

        /* PTR was passed from the kernel in a trusted context, and may be
         * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions.
         * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above.
         * PTR_UNTRUSTED refers to a kptr that was read directly from a map
         * without invoking bpf_kptr_xchg(). What we really need to know is
         * whether a pointer is safe to pass to a kfunc or BPF helper function.
         * While PTR_UNTRUSTED pointers are unsafe to pass to kfuncs and BPF
         * helpers, they do not cover all possible instances of unsafe
         * pointers. For example, a pointer that was obtained from walking a
         * struct will _not_ get the PTR_UNTRUSTED type modifier, despite the
         * fact that it may be NULL, invalid, etc. This is due to backwards
         * compatibility requirements, as this was the behavior that was first
         * introduced when kptrs were added. The behavior is now considered
         * deprecated, and PTR_UNTRUSTED will eventually be removed.
         *
         * PTR_TRUSTED, on the other hand, is a pointer that the kernel
         * guarantees to be valid and safe to pass to kfuncs and BPF helpers.
         * For example, pointers passed to tracepoint arguments are considered
         * PTR_TRUSTED, as are pointers that are passed to struct_ops
         * callbacks. As alluded to above, pointers that are obtained from
         * walking PTR_TRUSTED pointers are _not_ trusted. For example, if a
         * struct task_struct *task is PTR_TRUSTED, then accessing
         * task->last_wakee will lose the PTR_TRUSTED modifier when it's stored
         * in a BPF register. Similarly, pointers passed to certain programs
         * types such as kretprobes are not guaranteed to be valid, as they may
         * for example contain an object that was recently freed.
         */
        PTR_TRUSTED                = BIT(12 + BPF_BASE_TYPE_BITS),

        /* MEM is tagged with rcu and memory access needs rcu_read_lock protection. */
        MEM_RCU                        = BIT(13 + BPF_BASE_TYPE_BITS),

        /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning.
         * Currently only valid for linked-list and rbtree nodes. If the nodes
         * have a bpf_refcount_field, they must be tagged MEM_RCU as well.
         */
        NON_OWN_REF                = BIT(14 + BPF_BASE_TYPE_BITS),

        /* DYNPTR points to sk_buff */
        DYNPTR_TYPE_SKB                = BIT(15 + BPF_BASE_TYPE_BITS),

        /* DYNPTR points to xdp_buff */
        DYNPTR_TYPE_XDP                = BIT(16 + BPF_BASE_TYPE_BITS),

        /* __BPF_TYPE_FLAG_MAX takes the implicit next value (last flag + 1),
         * so __BPF_TYPE_LAST_FLAG equals the last flag defined above. New
         * flags must be added before these two for that to remain true.
         */
        __BPF_TYPE_FLAG_MAX,
        __BPF_TYPE_LAST_FLAG        = __BPF_TYPE_FLAG_MAX - 1,
};

#define DYNPTR_TYPE_FLAG_MASK        (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \
                                 | DYNPTR_TYPE_XDP)

/* Max number of base types. */
#define BPF_BASE_TYPE_LIMIT        (1UL << BPF_BASE_TYPE_BITS)

/* Max number of all types. */
#define BPF_TYPE_LIMIT                (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1))

/* function argument constraints
 *
 * The enumerators up to __BPF_ARG_TYPE_MAX are base types. The extended
 * types further down are a base type OR-ed with one bpf_type_flag.
 */
enum bpf_arg_type {
        ARG_DONTCARE = 0,        /* unused argument in helper function */

        /* the following constraints used to prototype
         * bpf_map_lookup/update/delete_elem() functions
         */
        ARG_CONST_MAP_PTR,        /* const argument used as pointer to bpf_map */
        ARG_PTR_TO_MAP_KEY,        /* pointer to stack used as map key */
        ARG_PTR_TO_MAP_VALUE,        /* pointer to stack used as map value */

        /* Used to prototype bpf_memcmp() and other functions that access data
         * on eBPF program stack
         */
        ARG_PTR_TO_MEM,                /* pointer to valid memory (stack, packet, map value) */
        ARG_PTR_TO_ARENA,        /* NOTE(review): presumably a pointer into a bpf arena - confirm */

        ARG_CONST_SIZE,                /* number of bytes accessed from memory */
        ARG_CONST_SIZE_OR_ZERO,        /* number of bytes accessed from memory or 0 */

        ARG_PTR_TO_CTX,                /* pointer to context */
        ARG_ANYTHING,                /* any (initialized) argument is ok */
        ARG_PTR_TO_SPIN_LOCK,        /* pointer to bpf_spin_lock */
        ARG_PTR_TO_SOCK_COMMON,        /* pointer to sock_common */
        ARG_PTR_TO_INT,                /* pointer to int */
        ARG_PTR_TO_LONG,        /* pointer to long */
        ARG_PTR_TO_SOCKET,        /* pointer to bpf_sock (fullsock) */
        ARG_PTR_TO_BTF_ID,        /* pointer to in-kernel struct */
        ARG_PTR_TO_RINGBUF_MEM,        /* pointer to dynamically reserved ringbuf memory */
        ARG_CONST_ALLOC_SIZE_OR_ZERO,        /* number of allocated bytes requested */
        ARG_PTR_TO_BTF_ID_SOCK_COMMON,        /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */
        ARG_PTR_TO_PERCPU_BTF_ID,        /* pointer to in-kernel percpu type */
        ARG_PTR_TO_FUNC,        /* pointer to a bpf program function */
        ARG_PTR_TO_STACK,        /* pointer to stack */
        ARG_PTR_TO_CONST_STR,        /* pointer to a null terminated read-only string */
        ARG_PTR_TO_TIMER,        /* pointer to bpf_timer */
        ARG_PTR_TO_KPTR,        /* pointer to referenced kptr */
        ARG_PTR_TO_DYNPTR,      /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */
        /* number of base types; checked against BPF_BASE_TYPE_LIMIT by the
         * static_assert that follows this enum
         */
        __BPF_ARG_TYPE_MAX,

        /* Extended arg_types. */
        ARG_PTR_TO_MAP_VALUE_OR_NULL        = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE,
        ARG_PTR_TO_MEM_OR_NULL                = PTR_MAYBE_NULL | ARG_PTR_TO_MEM,
        ARG_PTR_TO_CTX_OR_NULL                = PTR_MAYBE_NULL | ARG_PTR_TO_CTX,
        ARG_PTR_TO_SOCKET_OR_NULL        = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET,
        ARG_PTR_TO_STACK_OR_NULL        = PTR_MAYBE_NULL | ARG_PTR_TO_STACK,
        ARG_PTR_TO_BTF_ID_OR_NULL        = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID,
        /* pointer to memory does not need to be initialized, helper function must fill
         * all bytes or clear them in error case.
         */
        ARG_PTR_TO_UNINIT_MEM                = MEM_UNINIT | ARG_PTR_TO_MEM,
        /* Pointer to valid memory of size known at compile time. */
        ARG_PTR_TO_FIXED_SIZE_MEM        = MEM_FIXED_SIZE | ARG_PTR_TO_MEM,

        /* This must be the last entry. Its purpose is to ensure the enum is
         * wide enough to hold the higher bits reserved for bpf_type_flag.
         */
        __BPF_ARG_TYPE_LIMIT        = BPF_TYPE_LIMIT,
};
static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);

/* type of values returned from helper functions
 *
 * The enumerators up to __BPF_RET_TYPE_MAX are base types. The extended
 * types further down are a base type OR-ed with bpf_type_flag bits.
 */
enum bpf_return_type {
        RET_INTEGER,                        /* function returns integer */
        RET_VOID,                        /* function doesn't return anything */
        RET_PTR_TO_MAP_VALUE,                /* returns a pointer to map elem value */
        RET_PTR_TO_SOCKET,                /* returns a pointer to a socket */
        RET_PTR_TO_TCP_SOCK,                /* returns a pointer to a tcp_sock */
        RET_PTR_TO_SOCK_COMMON,                /* returns a pointer to a sock_common */
        RET_PTR_TO_MEM,                        /* returns a pointer to memory */
        RET_PTR_TO_MEM_OR_BTF_ID,        /* returns a pointer to a valid memory or a btf_id */
        RET_PTR_TO_BTF_ID,                /* returns a pointer to a btf_id */
        /* number of base types; checked against BPF_BASE_TYPE_LIMIT by the
         * static_assert that follows this enum
         */
        __BPF_RET_TYPE_MAX,

        /* Extended ret_types. */
        RET_PTR_TO_MAP_VALUE_OR_NULL        = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE,
        RET_PTR_TO_SOCKET_OR_NULL        = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET,
        RET_PTR_TO_TCP_SOCK_OR_NULL        = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK,
        RET_PTR_TO_SOCK_COMMON_OR_NULL        = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON,
        RET_PTR_TO_RINGBUF_MEM_OR_NULL        = PTR_MAYBE_NULL | MEM_RINGBUF | RET_PTR_TO_MEM,
        /* carries no dynptr-specific flag: plain nullable RET_PTR_TO_MEM */
        RET_PTR_TO_DYNPTR_MEM_OR_NULL        = PTR_MAYBE_NULL | RET_PTR_TO_MEM,
        RET_PTR_TO_BTF_ID_OR_NULL        = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID,
        RET_PTR_TO_BTF_ID_TRUSTED        = PTR_TRUSTED         | RET_PTR_TO_BTF_ID,

        /* This must be the last entry. Its purpose is to ensure the enum is
         * wide enough to hold the higher bits reserved for bpf_type_flag.
         */
        __BPF_RET_TYPE_LIMIT        = BPF_TYPE_LIMIT,
};
static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);

/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
 * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL
 * instructions after verifying
 *
 * A helper takes up to five u64 arguments. The per-argument data is laid out
 * in unions so that it can be reached either by name (arg1_type ...
 * arg5_type) or by index (arg_type[0..4]).
 */
struct bpf_func_proto {
        u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
        bool gpl_only;
        bool pkt_access;
        bool might_sleep;
        enum bpf_return_type ret_type;
        union {
                struct {
                        enum bpf_arg_type arg1_type;
                        enum bpf_arg_type arg2_type;
                        enum bpf_arg_type arg3_type;
                        enum bpf_arg_type arg4_type;
                        enum bpf_arg_type arg5_type;
                };
                enum bpf_arg_type arg_type[5];
        };
        /* The BTF id pointers and the sizes share the same storage, so a
         * given argument slot holds one or the other, never both.
         * NOTE(review): presumably the argument's arg_type decides which
         * view applies - confirm in the verifier.
         */
        union {
                struct {
                        u32 *arg1_btf_id;
                        u32 *arg2_btf_id;
                        u32 *arg3_btf_id;
                        u32 *arg4_btf_id;
                        u32 *arg5_btf_id;
                };
                u32 *arg_btf_id[5];
                struct {
                        size_t arg1_size;
                        size_t arg2_size;
                        size_t arg3_size;
                        size_t arg4_size;
                        size_t arg5_size;
                };
                size_t arg_size[5];
        };
        int *ret_btf_id; /* return value btf_id */
        /* NOTE(review): presumably an optional per-program filter deciding
         * whether the helper may be used - confirm how NULL is treated.
         */
        bool (*allowed)(const struct bpf_prog *prog);
};

/* bpf_context is an intentionally undefined structure. A pointer to
 * bpf_context is the first argument to eBPF programs.
 * For socket filters: 'struct bpf_context *' == 'struct sk_buff *'
 */
struct bpf_context;

/* Direction of a memory access: read or write. The two values are distinct
 * non-zero constants (1 and 2).
 */
enum bpf_access_type {
        BPF_READ = 1,
        BPF_WRITE = 2
};

/* types of values stored in eBPF registers */
/* Pointer types represent:
 * pointer
 * pointer + imm
 * pointer + (u16) var
 * pointer + (u16) var + imm
 * if (range > 0) then [ptr, ptr + range - off) is safe to access
 * if (id > 0) means that some 'var' was added
 * if (off > 0) means that 'imm' was added
 *
 * Entries up to __BPF_REG_TYPE_MAX are base types; the "extended" entries
 * after it are a base type OR-ed with PTR_MAYBE_NULL.
 */
enum bpf_reg_type {
        NOT_INIT = 0,                 /* nothing was written into register */
        SCALAR_VALUE,                 /* reg doesn't contain a valid pointer */
        PTR_TO_CTX,                 /* reg points to bpf_context */
        CONST_PTR_TO_MAP,         /* reg points to struct bpf_map */
        PTR_TO_MAP_VALUE,         /* reg points to map element value */
        PTR_TO_MAP_KEY,                 /* reg points to a map element key */
        PTR_TO_STACK,                 /* reg == frame_pointer + offset */
        PTR_TO_PACKET_META,         /* skb->data - meta_len */
        PTR_TO_PACKET,                 /* reg points to skb->data */
        PTR_TO_PACKET_END,         /* skb->data + headlen */
        PTR_TO_FLOW_KEYS,         /* reg points to bpf_flow_keys */
        PTR_TO_SOCKET,                 /* reg points to struct bpf_sock */
        PTR_TO_SOCK_COMMON,         /* reg points to sock_common */
        PTR_TO_TCP_SOCK,         /* reg points to struct tcp_sock */
        PTR_TO_TP_BUFFER,         /* reg points to a writable raw tp's buffer */
        PTR_TO_XDP_SOCK,         /* reg points to struct xdp_sock */
        /* PTR_TO_BTF_ID points to a kernel struct that does not need
         * to be null checked by the BPF program. This does not imply the
         * pointer is _not_ null and in practice this can easily be a null
         * pointer when reading pointer chains. The assumption is program
         * context will handle null pointer dereference typically via fault
         * handling. The verifier must keep this in mind and can make no
         * assumptions about null or non-null when doing branch analysis.
         * Further, when passed into helpers the helpers can not, without
         * additional context, assume the value is non-null.
         */
        PTR_TO_BTF_ID,
        PTR_TO_MEM,                 /* reg points to valid memory region */
        PTR_TO_ARENA,                 /* NOTE(review): presumably a pointer into a bpf arena -- confirm */
        PTR_TO_BUF,                 /* reg points to a read/write buffer */
        PTR_TO_FUNC,                 /* reg points to a bpf program function */
        CONST_PTR_TO_DYNPTR,         /* reg points to a const struct bpf_dynptr */
        __BPF_REG_TYPE_MAX,         /* number of base register types */

        /* Extended reg_types. */
        PTR_TO_MAP_VALUE_OR_NULL        = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE,
        PTR_TO_SOCKET_OR_NULL                = PTR_MAYBE_NULL | PTR_TO_SOCKET,
        PTR_TO_SOCK_COMMON_OR_NULL        = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON,
        PTR_TO_TCP_SOCK_OR_NULL                = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK,
        /* PTR_TO_BTF_ID_OR_NULL points to a kernel struct that has not
         * been checked for null. Used primarily to inform the verifier
         * an explicit null check is required for this struct.
         */
        PTR_TO_BTF_ID_OR_NULL                = PTR_MAYBE_NULL | PTR_TO_BTF_ID,

        /* This must be the last entry. Its purpose is to ensure the enum is
         * wide enough to hold the higher bits reserved for bpf_type_flag.
         */
        __BPF_REG_TYPE_LIMIT        = BPF_TYPE_LIMIT,
};
/* The count of base register types must not exceed BPF_BASE_TYPE_LIMIT. */
static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);

/* The information passed from prog-specific *_is_valid_access
 * back to the verifier.
 */
struct bpf_insn_access_aux {
        enum bpf_reg_type reg_type;
        /* ctx_field_size and the {btf, btf_id} pair share storage, so only
         * one of them is meaningful for a given access.
         * NOTE(review): presumably selected by reg_type -- confirm.
         */
        union {
                int ctx_field_size;
                struct {
                        struct btf *btf;
                        u32 btf_id;
                };
        };
        struct bpf_verifier_log *log; /* for verbose logs */
};

/* Store @size in @aux->ctx_field_size. */
static inline void bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux,
                                             u32 size)
{
        aux->ctx_field_size = size;
}

/* Return true if @insn's opcode is exactly BPF_LD | BPF_IMM | BPF_DW,
 * i.e. it is a 64-bit immediate load.
 *
 * Declared inline, like the other helpers defined in this header, so that
 * translation units which include the header without calling it do not get
 * an unused out-of-line copy.
 */
static inline bool bpf_is_ldimm64(const struct bpf_insn *insn)
{
        return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
}

/* Return true if @insn is a 64-bit immediate load whose src_reg is
 * BPF_PSEUDO_FUNC.
 */
static inline bool bpf_pseudo_func(const struct bpf_insn *insn)
{
        if (!bpf_is_ldimm64(insn))
                return false;

        return insn->src_reg == BPF_PSEUDO_FUNC;
}

/* Per-program-type operations. */
struct bpf_prog_ops {
        /* Test-run callback for @prog. @kattr is the kernel-side attribute
         * union and @uattr the user-space one.
         * NOTE(review): presumably results are written back through @uattr
         * -- confirm against the implementations.
         */
        int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr,
                        union bpf_attr __user *uattr);
};

struct bpf_reg_state;
/* Callbacks a program type provides to the verifier. */
struct bpf_verifier_ops {
        /* return eBPF function prototype for verification */
        const struct bpf_func_proto *
        (*get_func_proto)(enum bpf_func_id func_id,
                          const struct bpf_prog *prog);

        /* return true if 'size' wide access at offset 'off' within bpf_context
         * with 'type' (read or write) is allowed
         */
        bool (*is_valid_access)(int off, int size, enum bpf_access_type type,
                                const struct bpf_prog *prog,
                                struct bpf_insn_access_aux *info);
        /* NOTE(review): the callbacks below emit or rewrite instructions;
         * the meaning of their return values is not visible here -- see
         * the verifier for the exact contract.
         */
        int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
                            const struct bpf_prog *prog);
        int (*gen_ld_abs)(const struct bpf_insn *orig,
                          struct bpf_insn *insn_buf);
        u32 (*convert_ctx_access)(enum bpf_access_type type,
                                  const struct bpf_insn *src,
                                  struct bpf_insn *dst,
                                  struct bpf_prog *prog, u32 *target_size);
        /* Check a 'size' wide access at offset 'off' through register @reg;
         * @log receives verbose output.
         */
        int (*btf_struct_access)(struct bpf_verifier_log *log,
                                 const struct bpf_reg_state *reg,
                                 int off, int size);
};

/* Callbacks for program offload, grouped into verifier hooks, verifier
 * optimization hooks and program management hooks.
 */
struct bpf_prog_offload_ops {
        /* verifier basic callbacks */
        int (*insn_hook)(struct bpf_verifier_env *env,
                         int insn_idx, int prev_insn_idx);
        int (*finalize)(struct bpf_verifier_env *env);
        /* verifier optimization callbacks (called after .finalize) */
        int (*replace_insn)(struct bpf_verifier_env *env, u32 off,
                            struct bpf_insn *insn);
        int (*remove_insns)(struct bpf_verifier_env *env, u32 off, u32 cnt);
        /* program management callbacks */
        int (*prepare)(struct bpf_prog *prog);
        int (*translate)(struct bpf_prog *prog);
        void (*destroy)(struct bpf_prog *prog);
};

/* Offload state attached to a program (see bpf_prog_aux::offload). */
struct bpf_prog_offload {
        struct bpf_prog                *prog;                /* the program this state belongs to */
        struct net_device        *netdev;
        struct bpf_offload_dev        *offdev;
        void                        *dev_priv;        /* NOTE(review): presumably driver-private data -- confirm */
        struct list_head        offloads;        /* list linkage */
        bool                        dev_state;
        bool                        opt_failed;
        void                        *jited_image;
        u32                        jited_len;
};

/* Kinds of cgroup storage. __BPF_CGROUP_STORAGE_MAX is the number of kinds
 * and is exported as MAX_BPF_CGROUP_STORAGE_TYPE for sizing arrays.
 */
enum bpf_cgroup_storage_type {
        BPF_CGROUP_STORAGE_SHARED,
        BPF_CGROUP_STORAGE_PERCPU,
        __BPF_CGROUP_STORAGE_MAX
};

#define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX

/* The longest tracepoint has 12 args.
 * See include/trace/bpf_probe.h
 */
#define MAX_BPF_FUNC_ARGS 12

/* The maximum number of arguments passed through registers
 * a single function may have.
 */
#define MAX_BPF_FUNC_REG_ARGS 5

/* The argument is a structure. */
#define BTF_FMODEL_STRUCT_ARG                BIT(0)

/* The argument is signed. */
#define BTF_FMODEL_SIGNED_ARG                BIT(1)

/* Compact description of a function's return value and arguments: a size
 * and a flags byte for the return value and for each of up to
 * MAX_BPF_FUNC_ARGS arguments.
 * NOTE(review): the flags bytes presumably hold BTF_FMODEL_* bits and
 * nr_args the number of valid array entries -- confirm against users.
 */
struct btf_func_model {
        u8 ret_size;
        u8 ret_flags;
        u8 nr_args;
        u8 arg_size[MAX_BPF_FUNC_ARGS];
        u8 arg_flags[MAX_BPF_FUNC_ARGS];
};

/* BPF trampoline flags (BPF_TRAMP_F_*). Each flag is a distinct bit. */

/* Restore arguments before returning from trampoline to let original function
 * continue executing. This flag is used for fentry progs when there are no
 * fexit progs.
 */
#define BPF_TRAMP_F_RESTORE_REGS        BIT(0)
/* Call original function after fentry progs, but before fexit progs.
 * Makes sense for fentry/fexit, normal calls and indirect calls.
 */
#define BPF_TRAMP_F_CALL_ORIG                BIT(1)
/* Skip current frame and return to parent.  Makes sense for fentry/fexit
 * programs only. Should not be used with normal calls and indirect calls.
 */
#define BPF_TRAMP_F_SKIP_FRAME                BIT(2)
/* Store IP address of the caller on the trampoline stack,
 * so it's available for trampoline's programs.
 */
#define BPF_TRAMP_F_IP_ARG                BIT(3)
/* Return the return value of fentry prog. Only used by bpf_struct_ops. */
#define BPF_TRAMP_F_RET_FENTRY_RET        BIT(4)

/* Get original function from stack instead of from provided direct address.
 * Makes sense for trampolines with fexit or fmod_ret programs.
 */
#define BPF_TRAMP_F_ORIG_STACK                BIT(5)

/* This trampoline is on a function with another ftrace_ops with IPMODIFY,
 * e.g., a live patch. This flag is set and cleared by ftrace callbacks.
 */
#define BPF_TRAMP_F_SHARE_IPMODIFY        BIT(6)

/* Indicate that current trampoline is in a tail call context. Then, it has to
 * cache and restore tail_call_cnt to avoid infinite tail call loop.
 */
#define BPF_TRAMP_F_TAIL_CALL_CTX        BIT(7)

/*
 * Indicate the trampoline should be suitable to receive indirect calls;
 * without this indirectly calling the generated code can result in #UD/#CP,
 * depending on the CFI options.
 *
 * Used by bpf_struct_ops.
 *
 * Incompatible with FENTRY usage, overloads @func_addr argument.
 */
#define BPF_TRAMP_F_INDIRECT                BIT(8)

/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
 * bytes on x86.
 *
 * BPF_MAX_TRAMP_LINKS bounds the links[] array of struct bpf_tramp_links;
 * s390x builds use a smaller limit (27) than all other builds (38).
 */
enum {
#if defined(__s390x__)
        BPF_MAX_TRAMP_LINKS = 27,
#else
        BPF_MAX_TRAMP_LINKS = 38,
#endif
};

/* A fixed-capacity array of trampoline links plus a count.
 * NOTE(review): nr_links is presumably the number of valid entries at the
 * start of links[] -- confirm against users.
 */
struct bpf_tramp_links {
        struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS];
        int nr_links;
};

struct bpf_tramp_run_ctx;

/* Different use cases for BPF trampoline:
 * 1. replace nop at the function entry (kprobe equivalent)
 *    flags = BPF_TRAMP_F_RESTORE_REGS
 *    fentry = a set of programs to run before returning from trampoline
 *
 * 2. replace nop at the function entry (kprobe + kretprobe equivalent)
 *    flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME
 *    orig_call = fentry_ip + MCOUNT_INSN_SIZE
 *    fentry = a set of programs to run before calling original function
 *    fexit = a set of programs to run after original function
 *
 * 3. replace direct call instruction anywhere in the function body
 *    or assign a function pointer for indirect call (like tcp_congestion_ops->cong_avoid)
 *    With flags = 0
 *      fentry = a set of programs to run before returning from trampoline
 *    With flags = BPF_TRAMP_F_CALL_ORIG
 *      orig_call = original callback addr or direct function addr
 *      fentry = a set of programs to run before calling original function
 *      fexit = a set of programs to run after original function
 */
struct bpf_tramp_image;
/* Architecture hooks for trampoline images: generate code into
 * [image, image_end), allocate/free/protect image memory, and compute the
 * size a trampoline would need. Implemented outside this header.
 */
int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
                                const struct btf_func_model *m, u32 flags,
                                struct bpf_tramp_links *tlinks,
                                void *func_addr);
void *arch_alloc_bpf_trampoline(unsigned int size);
void arch_free_bpf_trampoline(void *image, unsigned int size);
int __must_check arch_protect_bpf_trampoline(void *image, unsigned int size);
int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
                             struct bpf_tramp_links *tlinks, void *func_addr);

/* Program enter/exit hooks. The u64 returned by an enter hook has the same
 * type as the 'start' argument of the matching exit hook.
 * NOTE(review): presumably the enter result is passed to exit as 'start'
 * -- confirm against the trampoline generators.
 */
u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
                                             struct bpf_tramp_run_ctx *run_ctx);
void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
                                             struct bpf_tramp_run_ctx *run_ctx);
void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);
/* Signatures of the enter/exit hooks, and selectors returning the hook to
 * use for a given program.
 */
typedef u64 (*bpf_trampoline_enter_t)(struct bpf_prog *prog,
                                      struct bpf_tramp_run_ctx *run_ctx);
typedef void (*bpf_trampoline_exit_t)(struct bpf_prog *prog, u64 start,
                                      struct bpf_tramp_run_ctx *run_ctx);
bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog);
bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog);

/* A named symbol covering the address range given by start and end, linked
 * into both a list (lnode) and a latch tree (tnode).
 * NOTE(review): whether 'end' is inclusive is not visible here -- confirm.
 */
struct bpf_ksym {
        unsigned long                 start;
        unsigned long                 end;
        char                         name[KSYM_NAME_LEN];
        struct list_head         lnode;
        struct latch_tree_node         tnode;
        bool                         prog;        /* NOTE(review): presumably set when the symbol is a BPF program -- confirm */
};

/* Kinds of programs attached to a trampoline. The kinds below BPF_TRAMP_MAX
 * index per-kind arrays (see struct bpf_trampoline); BPF_TRAMP_REPLACE is
 * deliberately placed after BPF_TRAMP_MAX and so has no array slot.
 */
enum bpf_tramp_prog_type {
        BPF_TRAMP_FENTRY,
        BPF_TRAMP_FEXIT,
        BPF_TRAMP_MODIFY_RETURN,
        BPF_TRAMP_MAX,
        BPF_TRAMP_REPLACE, /* more than MAX */
};

/* One generated trampoline image: its memory, size and symbol, plus
 * bookkeeping used while the image is torn down.
 */
struct bpf_tramp_image {
        void *image;
        int size;
        struct bpf_ksym ksym;
        struct percpu_ref pcref;
        void *ip_after_call;
        void *ip_epilogue;
        /* rcu and work share storage.
         * NOTE(review): presumably used at different stages of freeing the
         * image, never at the same time -- confirm.
         */
        union {
                struct rcu_head rcu;
                struct work_struct work;
        };
};

/* State of one BPF trampoline: the function it is attached to, the programs
 * attached per kind, and the currently installed image.
 */
struct bpf_trampoline {
        /* hlist for trampoline_table */
        struct hlist_node hlist;
        struct ftrace_ops *fops;
        /* serializes access to fields of this trampoline */
        struct mutex mutex;
        refcount_t refcnt;
        u32 flags;
        u64 key;
        /* The function this trampoline is attached to. */
        struct {
                struct btf_func_model model;
                void *addr;
                bool ftrace_managed;
        } func;
        /* if !NULL this is BPF_PROG_TYPE_EXT program that extends another BPF
         * program by replacing one of its functions. func.addr is the address
         * of the function it replaced.
         */
        struct bpf_prog *extension_prog;
        /* list of BPF programs using this trampoline */
        struct hlist_head progs_hlist[BPF_TRAMP_MAX];
        /* Number of attached programs. A counter per kind. */
        int progs_cnt[BPF_TRAMP_MAX];
        /* Executable image of trampoline */
        struct bpf_tramp_image *cur_image;
};

/* Description of an attach target: its function model, address, owning
 * module, name and BTF type.
 */
struct bpf_attach_target_info {
        struct btf_func_model fmodel;
        long tgt_addr;
        struct module *tgt_mod;
        const char *tgt_name;
        const struct btf_type *tgt_type;
};

/* Capacity of the progs[] array in struct bpf_dispatcher. */
#define BPF_DISPATCHER_MAX 48 /* Fits in 2048B */

/* One dispatcher slot: a program and a count of its users. */
struct bpf_dispatcher_prog {
        struct bpf_prog *prog;
        refcount_t users;
};

/* A dispatcher: up to BPF_DISPATCHER_MAX programs plus the generated image.
 * Instances are normally created with DEFINE_BPF_DISPATCHER().
 */
struct bpf_dispatcher {
        /* dispatcher mutex */
        struct mutex mutex;
        void *func;                /* address of the dispatcher function */
        struct bpf_dispatcher_prog progs[BPF_DISPATCHER_MAX];
        int num_progs;
        void *image;
        void *rw_image;                /* NOTE(review): presumably a writable staging copy of image -- confirm */
        u32 image_off;
        struct bpf_ksym ksym;
#ifdef CONFIG_HAVE_STATIC_CALL
        /* Static-call key and trampoline, used by __BPF_DISPATCHER_UPDATE(). */
        struct static_call_key *sc_key;
        void *sc_tramp;
#endif
};

/* Default to __nocfi when nothing else has defined __bpfcall. */
#ifndef __bpfcall
#define __bpfcall __nocfi
#endif

/* Pass-through dispatcher: simply calls @bpf_func with @ctx and @insnsi and
 * hands back its result.
 */
static __always_inline __bpfcall unsigned int
bpf_dispatcher_nop_func(const void *ctx, const struct bpf_insn *insnsi,
                        bpf_func_t bpf_func)
{
        unsigned int ret = bpf_func(ctx, insnsi);

        return ret;
}

/* the implementation of the opaque uapi struct bpf_dynptr */
struct bpf_dynptr_kern {
        void *data;
        /* Size represents the number of usable bytes of dynptr data.
         * If for example the offset is at 4 for a local dynptr whose data is
         * of type u64, the number of usable bytes is 4.
         *
         * The upper 8 bits are reserved. It is as follows:
         * Bits 0 - 23 = size
         * Bits 24 - 30 = dynptr type
         * Bit 31 = whether dynptr is read-only
         */
        u32 size;
        u32 offset;
} __aligned(8); /* the struct is 8-byte aligned */

/* Kind of data a dynptr refers to. BPF_DYNPTR_TYPE_INVALID is the zero
 * value.
 */
enum bpf_dynptr_type {
        BPF_DYNPTR_TYPE_INVALID,
        /* Points to memory that is local to the bpf program */
        BPF_DYNPTR_TYPE_LOCAL,
        /* Underlying data is a ringbuf record */
        BPF_DYNPTR_TYPE_RINGBUF,
        /* Underlying data is a sk_buff */
        BPF_DYNPTR_TYPE_SKB,
        /* Underlying data is a xdp_buff */
        BPF_DYNPTR_TYPE_XDP,
};

/* Helpers for struct bpf_dynptr_kern; implemented outside this header.
 * NOTE(review): exact return conventions (error codes, NULL on failure)
 * are not visible here -- confirm against the definitions.
 */
int bpf_dynptr_check_size(u32 size);
u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr);
const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len);
void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len);
bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr);

#ifdef CONFIG_BPF_JIT
/* Trampoline API available when CONFIG_BPF_JIT is enabled; inline stubs are
 * provided in the #else branch for link/unlink/get/put.
 */
int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr);
int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr);
struct bpf_trampoline *bpf_trampoline_get(u64 key,
                                          struct bpf_attach_target_info *tgt_info);
void bpf_trampoline_put(struct bpf_trampoline *tr);
int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs);

/*
 * When the architecture supports STATIC_CALL replace the bpf_dispatcher_fn
 * indirection with a direct call to the bpf program. If the architecture does
 * not have STATIC_CALL, avoid a double-indirection.
 */
#ifdef CONFIG_HAVE_STATIC_CALL

/* Initializer fragment setting the dispatcher's .sc_key and .sc_tramp. */
#define __BPF_DISPATCHER_SC_INIT(_name)                                \
        .sc_key = &STATIC_CALL_KEY(_name),                        \
        .sc_tramp = STATIC_CALL_TRAMP_ADDR(_name),

/* Define the static call, initially targeting bpf_dispatcher_nop_func. */
#define __BPF_DISPATCHER_SC(name)                                \
        DEFINE_STATIC_CALL(bpf_dispatcher_##name##_call, bpf_dispatcher_nop_func)

/* Invoke the static call; 'ctx', 'insnsi' and 'bpf_func' must be in scope
 * at the expansion site.
 */
#define __BPF_DISPATCHER_CALL(name)                                \
        static_call(bpf_dispatcher_##name##_call)(ctx, insnsi, bpf_func)

/* Retarget dispatcher _d's static call to _new. */
#define __BPF_DISPATCHER_UPDATE(_d, _new)                        \
        __static_call_update((_d)->sc_key, (_d)->sc_tramp, (_new))

#else
/* Without static calls the INIT, SC and UPDATE macros expand to nothing and
 * the call macro invokes 'bpf_func' directly.
 */
#define __BPF_DISPATCHER_SC_INIT(name)
#define __BPF_DISPATCHER_SC(name)
#define __BPF_DISPATCHER_CALL(name)                bpf_func(ctx, insnsi)
#define __BPF_DISPATCHER_UPDATE(_d, _new)
#endif

/* Static initializer for a struct bpf_dispatcher variable called _name.
 * It expects a function named _name##_func to exist and uses the stringified
 * _name as the ksym name. Members not listed here (e.g. rw_image) are left
 * to the usual zero-initialization of designated initializers.
 */
#define BPF_DISPATCHER_INIT(_name) {                                \
        .mutex = __MUTEX_INITIALIZER(_name.mutex),                \
        .func = &_name##_func,                                        \
        .progs = {},                                                \
        .num_progs = 0,                                                \
        .image = NULL,                                                \
        .image_off = 0,                                                \
        .ksym = {                                                \
                .name  = #_name,                                \
                .lnode = LIST_HEAD_INIT(_name.ksym.lnode),        \
        },                                                        \
        __BPF_DISPATCHER_SC_INIT(_name##_call)                        \
}

/* Define dispatcher 'name'. This emits, in order:
 *  - the static call (when static calls are available),
 *  - the exported, noinline function bpf_dispatcher_<name>_func(), whose
 *    body is __BPF_DISPATCHER_CALL(name),
 *  - the struct bpf_dispatcher object bpf_dispatcher_<name>.
 */
#define DEFINE_BPF_DISPATCHER(name)                                        \
        __BPF_DISPATCHER_SC(name);                                        \
        noinline __bpfcall unsigned int bpf_dispatcher_##name##_func(        \
                const void *ctx,                                        \
                const struct bpf_insn *insnsi,                                \
                bpf_func_t bpf_func)                                        \
        {                                                                \
                return __BPF_DISPATCHER_CALL(name);                        \
        }                                                                \
        EXPORT_SYMBOL(bpf_dispatcher_##name##_func);                        \
        struct bpf_dispatcher bpf_dispatcher_##name =                        \
                BPF_DISPATCHER_INIT(bpf_dispatcher_##name);

/* Declare the function and object that DEFINE_BPF_DISPATCHER(name) defines,
 * for use from other translation units.
 */
#define DECLARE_BPF_DISPATCHER(name)                                        \
        unsigned int bpf_dispatcher_##name##_func(                        \
                const void *ctx,                                        \
                const struct bpf_insn *insnsi,                                \
                bpf_func_t bpf_func);                                        \
        extern struct bpf_dispatcher bpf_dispatcher_##name;

/* Name of dispatcher 'name's function, and address of its object. */
#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_##name##_func
#define BPF_DISPATCHER_PTR(name) (&bpf_dispatcher_##name)
/* Replace program @from with @to in dispatcher @d (implemented elsewhere). */
void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
                                struct bpf_prog *to);
/* Called only from JIT-enabled code, so there's no need for stubs. */
void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym);
void bpf_image_ksym_del(struct bpf_ksym *ksym);
void bpf_ksym_add(struct bpf_ksym *ksym);
void bpf_ksym_del(struct bpf_ksym *ksym);
int bpf_jit_charge_modmem(u32 size);
void bpf_jit_uncharge_modmem(u32 size);
/* Unlike the functions above, this one has a stub in the #else branch. */
bool bpf_prog_has_trampoline(const struct bpf_prog *prog);
#else
/* Stub for builds without CONFIG_BPF_JIT: linking always fails with
 * -ENOTSUPP.
 */
static inline int
bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
{
        return -ENOTSUPP;
}
/* Stub for builds without CONFIG_BPF_JIT: unlinking always fails with
 * -ENOTSUPP.
 */
static inline int
bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
{
        return -ENOTSUPP;
}
/* Stub for builds without CONFIG_BPF_JIT: no trampoline is ever returned. */
static inline struct bpf_trampoline *
bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info)
{
        return NULL;
}
/* Stubs for builds without CONFIG_BPF_JIT. */

/* Nothing to release: the stub bpf_trampoline_get() never returns one. */
static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {}
/* Defining/declaring a dispatcher expands to nothing; the dispatcher
 * function is the pass-through bpf_dispatcher_nop_func and there is no
 * dispatcher object (NULL).
 */
#define DEFINE_BPF_DISPATCHER(name)
#define DECLARE_BPF_DISPATCHER(name)
#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_nop_func
#define BPF_DISPATCHER_PTR(name) NULL
/* No-op: there is no dispatcher to update. */
static inline void bpf_dispatcher_change_prog(struct bpf_dispatcher *d,
                                              struct bpf_prog *from,
                                              struct bpf_prog *to) {}
/* Stub for builds without CONFIG_BPF_JIT: no @address is ever reported as
 * a BPF image address.
 */
static inline bool
is_bpf_image_address(unsigned long address)
{
        return false;
}
/* Stub for builds without CONFIG_BPF_JIT: no program has a trampoline. */
static inline bool
bpf_prog_has_trampoline(const struct bpf_prog *prog)
{
        return false;
}
#endif

/* Auxiliary per-function information (see bpf_prog_aux::func_info_aux).
 * 'called' and 'verified' are single-bit flags.
 */
struct bpf_func_info_aux {
        u16 linkage;
        bool unreliable;
        bool called : 1;
        bool verified : 1;
};

/* Why a JIT poke descriptor exists; tail calls are the only reason defined. */
enum bpf_jit_poke_reason {
        BPF_POKE_REASON_TAIL_CALL,
};

/* Descriptor of pokes pointing /into/ the JITed image. */
struct bpf_jit_poke_descriptor {
        void *tailcall_target;
        void *tailcall_bypass;
        void *bypass_addr;
        void *aux;
        /* Reason-specific data; only the tail-call variant exists. */
        union {
                struct {
                        struct bpf_map *map;
                        u32 key;
                } tail_call;
        };
        bool tailcall_target_stable;
        u8 adj_off;
        u16 reason;        /* NOTE(review): presumably an enum bpf_jit_poke_reason value -- confirm */
        u32 insn_idx;
};

/* reg_type info for ctx arguments: the register type, and the BTF object
 * and type id, associated with the context field at 'offset'.
 */
struct bpf_ctx_arg_aux {
        u32 offset;
        enum bpf_reg_type reg_type;
        struct btf *btf;
        u32 btf_id;
};

/* Pairs a BTF object with a module (see bpf_prog_aux::used_btfs).
 * NOTE(review): presumably the module that owns the BTF -- confirm.
 */
struct btf_mod_pair {
        struct btf *btf;
        struct module *module;
};

struct bpf_kfunc_desc_tab;

/* Auxiliary state of a BPF program, reached through bpf_prog::aux. It holds
 * reference counting, verifier results, attach target information, JIT and
 * symbol data, used maps/BTFs, and BTF func/line info.
 */
struct bpf_prog_aux {
        atomic64_t refcnt;
        u32 used_map_cnt;
        u32 used_btf_cnt;
        u32 max_ctx_offset;
        u32 max_pkt_offset;
        u32 max_tp_access;
        u32 stack_depth;
        u32 id;
        u32 func_cnt; /* used by non-func prog as the number of func progs */
        u32 real_func_cnt; /* includes hidden progs, only used for JIT and freeing progs */
        u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
        u32 attach_btf_id; /* in-kernel BTF type id to attach to */
        u32 ctx_arg_info_size;
        u32 max_rdonly_access;
        u32 max_rdwr_access;
        struct btf *attach_btf;
        const struct bpf_ctx_arg_aux *ctx_arg_info;
        struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */
        struct bpf_prog *dst_prog;
        struct bpf_trampoline *dst_trampoline;
        enum bpf_prog_type saved_dst_prog_type;
        enum bpf_attach_type saved_dst_attach_type;
        bool verifier_zext; /* Zero extensions has been inserted by verifier. */
        bool dev_bound; /* Program is bound to the netdev. */
        bool offload_requested; /* Program is bound and offloaded to the netdev. */
        bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
        bool attach_tracing_prog; /* true if tracing another tracing program */
        bool func_proto_unreliable;
        bool tail_call_reachable;
        bool xdp_has_frags;
        bool exception_cb;
        bool exception_boundary;
        struct bpf_arena *arena;
        /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
        const struct btf_type *attach_func_proto;
        /* function name for valid attach_btf_id */
        const char *attach_func_name;
        struct bpf_prog **func;
        void *jit_data; /* JIT specific data. arch dependent */
        struct bpf_jit_poke_descriptor *poke_tab;
        struct bpf_kfunc_desc_tab *kfunc_tab;
        struct bpf_kfunc_btf_tab *kfunc_btf_tab;
        u32 size_poke_tab;
#ifdef CONFIG_FINEIBT
        struct bpf_ksym ksym_prefix;
#endif
        struct bpf_ksym ksym;
        const struct bpf_prog_ops *ops;
        struct bpf_map **used_maps;
        struct mutex used_maps_mutex; /* mutex for used_maps and used_map_cnt */
        struct btf_mod_pair *used_btfs;
        struct bpf_prog *prog;
        struct user_struct *user;
        u64 load_time; /* ns since boottime */
        u32 verified_insns;
        int cgroup_atype; /* enum cgroup_bpf_attach_type */
        /* One storage map pointer per enum bpf_cgroup_storage_type. */
        struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
        char name[BPF_OBJ_NAME_LEN];
        u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64);
#ifdef CONFIG_SECURITY
        void *security;
#endif
        struct bpf_token *token;
        struct bpf_prog_offload *offload;
        struct btf *btf;
        struct bpf_func_info *func_info;
        struct bpf_func_info_aux *func_info_aux;
        /* bpf_line_info loaded from userspace.  linfo->insn_off
         * has the xlated insn offset.
         * Both the main and sub prog share the same linfo.
         * The subprog can access its first linfo by
         * using the linfo_idx.
         */
        struct bpf_line_info *linfo;
        /* jited_linfo is the jited addr of the linfo.  It has a
         * one to one mapping to linfo:
         * jited_linfo[i] is the jited addr for the linfo[i]->insn_off.
         * Both the main and sub prog share the same jited_linfo.
         * The subprog can access its first jited_linfo by
         * using the linfo_idx.
         */
        void **jited_linfo;
        u32 func_info_cnt;
        u32 nr_linfo;
        /* subprog can use linfo_idx to access its first linfo and
         * jited_linfo.
         * main prog always has linfo_idx == 0
         */
        u32 linfo_idx;
        struct module *mod;
        u32 num_exentries;
        struct exception_table_entry *extable;
        /* work and rcu share storage.
         * NOTE(review): presumably used at different stages of freeing the
         * program -- confirm.
         */
        union {
                struct work_struct work;
                struct rcu_head        rcu;
        };
};

/* A BPF program: a header of flags and metadata followed by the
 * instructions in a trailing flexible array. The fifteen one-bit flags
 * share a single u16.
 */
struct bpf_prog {
        u16                        pages;                /* Number of allocated pages */
        u16                        jited:1,        /* Is our filter JIT'ed? */
                                jit_requested:1,/* archs need to JIT the prog */
                                gpl_compatible:1, /* Is filter GPL compatible? */
                                cb_access:1,        /* Is control block accessed? */
                                dst_needed:1,        /* Do we need dst entry? */
                                blinding_requested:1, /* needs constant blinding */
                                blinded:1,        /* Was blinded */
                                is_func:1,        /* program is a bpf function */
                                kprobe_override:1, /* Do we override a kprobe? */
                                has_callchain_buf:1, /* callchain buffer allocated? */
                                enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
                                call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
                                call_get_func_ip:1, /* Do we call get_func_ip() */
                                tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
                                sleepable:1;        /* BPF program is sleepable */
        enum bpf_prog_type        type;                /* Type of BPF program */
        enum bpf_attach_type        expected_attach_type; /* For some prog types */
        u32                        len;                /* Number of filter blocks */
        u32                        jited_len;        /* Size of jited insns in bytes */
        u8                        tag[BPF_TAG_SIZE];
        struct bpf_prog_stats __percpu *stats;
        int __percpu                *active;
        /* Entry point: takes the context and the instruction array. */
        unsigned int                (*bpf_func)(const void *ctx,
                                            const struct bpf_insn *insn);
        struct bpf_prog_aux        *aux;                /* Auxiliary fields */
        struct sock_fprog_kern        *orig_prog;        /* Original BPF program */
        /* Instructions for interpreter */
        /* Two typed views (sock_filter / bpf_insn) of the same trailing array. */
        union {
                DECLARE_FLEX_ARRAY(struct sock_filter, insns);
                DECLARE_FLEX_ARRAY(struct bpf_insn, insnsi);
        };
};

/* Auxiliary data for an array map that programs jump into directly. */
struct bpf_array_aux {
        /* Programs with direct jumps into programs part of this array. */
        struct list_head poke_progs;
        struct bpf_map *map;
        struct mutex poke_mutex;        /* NOTE(review): presumably guards poke_progs -- confirm */
        struct work_struct work;
};

/* Common part of every BPF link: reference count, id, type, the operations
 * table and the attached program. Specific link types embed this struct.
 */
struct bpf_link {
        atomic64_t refcnt;
        u32 id;
        enum bpf_link_type type;
        const struct bpf_link_ops *ops;
        struct bpf_prog *prog;
        /* rcu is used before freeing, work can be used to schedule that
         * RCU-based freeing before that, so they never overlap
         */
        union {
                struct rcu_head rcu;
                struct work_struct work;
        };
};

/* Callback table referenced from bpf_link::ops. Which callbacks are optional
 * is decided by the callers and cannot be told from this header.
 */
struct bpf_link_ops {
        void (*release)(struct bpf_link *link);
        /* deallocate link resources callback, called without RCU grace period
         * waiting
         */
        void (*dealloc)(struct bpf_link *link);
        /* deallocate link resources callback, called after RCU grace period;
         * if underlying BPF program is sleepable we go through tasks trace
         * RCU GP and then "classic" RCU GP
         */
        void (*dealloc_deferred)(struct bpf_link *link);
        int (*detach)(struct bpf_link *link);
        /* Receives both the replacement program and the one being replaced. */
        int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog,
                           struct bpf_prog *old_prog);
        /* Reporting hooks: write link details to a seq_file / bpf_link_info. */
        void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq);
        int (*fill_link_info)(const struct bpf_link *link,
                              struct bpf_link_info *info);
        /* Map counterpart of update_prog: takes the new and the old map. */
        int (*update_map)(struct bpf_link *link, struct bpf_map *new_map,
                          struct bpf_map *old_map);
        __poll_t (*poll)(struct file *file, struct poll_table_struct *pts);
};

/* A bpf_link extended with a hash-list node and a 64-bit cookie.
 * NOTE(review): tramp_hlist presumably chains the link on a trampoline's
 * program list - confirm against the trampoline code.
 */
struct bpf_tramp_link {
        struct bpf_link link;
        struct hlist_node tramp_hlist;
        u64 cookie;
};

/* A bpf_tramp_link paired with a pointer to a trampoline. */
struct bpf_shim_tramp_link {
        struct bpf_tramp_link link;
        struct bpf_trampoline *trampoline;
};

/* A bpf_tramp_link carrying the attach type, a trampoline pointer and a
 * target program pointer.
 * NOTE(review): tgt_prog is presumably the BPF program being traced, when
 * the target is itself a BPF program - confirm.
 */
struct bpf_tracing_link {
        struct bpf_tramp_link link;
        enum bpf_attach_type attach_type;
        struct bpf_trampoline *trampoline;
        struct bpf_prog *tgt_prog;
};

/* A bpf_link paired with a raw event map pointer and a 64-bit cookie. */
struct bpf_raw_tp_link {
        struct bpf_link link;
        struct bpf_raw_event_map *btp;
        u64 cookie;
};

/* State passed between bpf_link_prime(), bpf_link_settle() and
 * bpf_link_cleanup() (all three take a pointer to this struct).
 * NOTE(review): file/fd/id are presumably the resources reserved while
 * priming - confirm against the implementation.
 */
struct bpf_link_primer {
        struct bpf_link *link;
        struct file *file;
        int fd;
        u32 id;
};

/* Mount options: owner uid/gid, mode, and four 64-bit delegation masks. */
struct bpf_mount_opts {
        kuid_t uid;
        kgid_t gid;
        umode_t mode;

        /* BPF token-related delegation options */
        u64 delegate_cmds;
        u64 delegate_maps;
        u64 delegate_progs;
        u64 delegate_attachs;
};

/* A BPF token: refcounted object tied to a user namespace and carrying four
 * 64-bit "allowed" masks (commands, map types, program types, attach types).
 * NOTE(review): the masks presumably mirror the delegate_* mount options of
 * struct bpf_mount_opts - confirm.
 */
struct bpf_token {
        struct work_struct work;
        atomic64_t refcnt;
        struct user_namespace *userns;
        u64 allowed_cmds;
        u64 allowed_maps;
        u64 allowed_progs;
        u64 allowed_attachs;
#ifdef CONFIG_SECURITY
        /* Only present with CONFIG_SECURITY; opaque to this header. */
        void *security;
#endif
};

struct bpf_struct_ops_value;
struct btf_member;

/* Number of entries in bpf_struct_ops::func_models. */
#define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64
/**
 * struct bpf_struct_ops - A structure of callbacks allowing a subsystem to
 *			   define a BPF_MAP_TYPE_STRUCT_OPS map type composed
 *			   of BPF_PROG_TYPE_STRUCT_OPS progs.
 * @verifier_ops: A structure of callbacks that are invoked by the verifier
 *		  when determining whether the struct_ops progs in the
 *		  struct_ops map are valid.
 * @init: A callback that is invoked a single time, and before any other
 *	  callback, to initialize the structure. A nonzero return value means
 *	  the subsystem could not be initialized.
 * @check_member: When defined, a callback invoked by the verifier to allow
 *		  the subsystem to determine if an entry in the struct_ops map
 *		  is valid. A nonzero return value means that the map is
 *		  invalid and should be rejected by the verifier.
 * @init_member: A callback that is invoked for each member of the struct_ops
 *		 map to allow the subsystem to initialize the member. A nonzero
 *		 value means the member could not be initialized. This callback
 *		 is exclusive with the type, type_id, value_type, and value_id
 *		 fields (which live in struct bpf_struct_ops_desc, not here).
 * @reg: A callback that is invoked when the struct_ops map has been
 *	 initialized and is being attached to. Zero means the struct_ops map
 *	 has been successfully registered and is live. A nonzero return value
 *	 means the struct_ops map could not be registered.
 * @unreg: A callback that is invoked when the struct_ops map should be
 *	   unregistered.
 * @update: A callback that is invoked when the live struct_ops map is being
 *	    updated to contain new values. This callback is only invoked when
 *	    the struct_ops map is loaded with BPF_F_LINK. If not defined, it
 *	    is assumed that the struct_ops map cannot be updated.
 * @validate: A callback that is invoked after all of the members have been
 *	      initialized. This callback should perform static checks on the
 *	      map, meaning that it should either fail or succeed
 *	      deterministically. A struct_ops map that has been validated may
 *	      not necessarily succeed in being registered if the call to @reg
 *	      fails. For example, a valid struct_ops map may be loaded, but
 *	      then fail to be registered due to there being another active
 *	      struct_ops map on the system in the subsystem already. For this
 *	      reason, if this callback is not defined, the check is skipped as
 *	      the struct_ops map will have final verification performed in
 *	      @reg.
 * @cfi_stubs: NOTE(review): not described in the original documentation;
 *	       presumably stub implementations of the callbacks used for CFI -
 *	       confirm.
 * @owner: NOTE(review): not described in the original documentation;
 *	   presumably the module that provides this struct_ops - confirm.
 * @name: The name of the struct bpf_struct_ops object.
 * @func_models: Func models, one slot per member up to
 *		 BPF_STRUCT_OPS_MAX_NR_MEMBERS.
 *
 * The BTF type, value type and their ids are not members of this structure;
 * they are kept in struct bpf_struct_ops_desc.
 */
struct bpf_struct_ops {
        const struct bpf_verifier_ops *verifier_ops;
        int (*init)(struct btf *btf);
        int (*check_member)(const struct btf_type *t,
                            const struct btf_member *member,
                            const struct bpf_prog *prog);
        int (*init_member)(const struct btf_type *t,
                           const struct btf_member *member,
                           void *kdata, const void *udata);
        int (*reg)(void *kdata, struct bpf_link *link);
        void (*unreg)(void *kdata, struct bpf_link *link);
        int (*update)(void *kdata, void *old_kdata, struct bpf_link *link);
        int (*validate)(void *kdata);
        void *cfi_stubs;
        struct module *owner;
        const char *name;
        struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
};

/* Every member of a struct_ops type has an instance, even if the member is
 * not an operator (function pointer). The "info" field will be assigned to
 * prog->aux->ctx_arg_info of BPF struct_ops programs to provide the
 * argument information required by the verifier to verify the program.
 *
 * btf_ctx_access() will look up prog->aux->ctx_arg_info to find the
 * corresponding entry for a given argument.
 */
struct bpf_struct_ops_arg_info {
        struct bpf_ctx_arg_aux *info;
        /* NOTE(review): presumably the number of entries in @info - confirm. */
        u32 cnt;
};

/* Descriptor wrapping a struct bpf_struct_ops together with its BTF type
 * information (type, value type and their ids) and per-member argument info.
 */
struct bpf_struct_ops_desc {
        struct bpf_struct_ops *st_ops;

        const struct btf_type *type;
        const struct btf_type *value_type;
        u32 type_id;
        u32 value_id;

        /* Collection of argument information for each member */
        struct bpf_struct_ops_arg_info *arg_info;
};

/* State stored in bpf_struct_ops_common_value::state. The transitions between
 * the values are implemented outside this header.
 */
enum bpf_struct_ops_state {
        BPF_STRUCT_OPS_STATE_INIT,
        BPF_STRUCT_OPS_STATE_INUSE,
        BPF_STRUCT_OPS_STATE_TOBEFREE,
        BPF_STRUCT_OPS_STATE_READY,
};

/* Header placed in front of the subsystem's data by the wrapper struct that
 * register_bpf_struct_ops() declares: a reference count plus a state.
 */
struct bpf_struct_ops_common_value {
        refcount_t refcnt;
        enum bpf_struct_ops_state state;
};

#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
/* This macro helps developers register a struct_ops type and generate its
 * type information correctly. Developers should use this macro to register
 * a struct_ops type instead of calling __register_bpf_struct_ops() directly.
 *
 * It declares a local wrapper type struct bpf_struct_ops_<type>, made of a
 * struct bpf_struct_ops_common_value followed by a `struct <type>` marked
 * ____cacheline_aligned_in_smp, passes that wrapper to BTF_TYPE_EMIT(), and
 * evaluates to the result of __register_bpf_struct_ops(st_ops).
 */
#define register_bpf_struct_ops(st_ops, type)                                \
        ({                                                                \
                struct bpf_struct_ops_##type {                                \
                        struct bpf_struct_ops_common_value common;        \
                        struct type data ____cacheline_aligned_in_smp;        \
                };                                                        \
                BTF_TYPE_EMIT(struct bpf_struct_ops_##type);                \
                __register_bpf_struct_ops(st_ops);                        \
        })
/* Sentinel "owner" pointer built from a constant plus POISON_POINTER_DELTA.
 * bpf_try_module_get()/bpf_module_put() compare their owner argument against
 * it and, on a match, call bpf_struct_ops_get()/bpf_struct_ops_put() on the
 * data pointer instead of doing module reference counting.
 */
#define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
/* Reference helpers keyed by a struct_ops kernel data pointer. */
bool bpf_struct_ops_get(const void *kdata);
void bpf_struct_ops_put(const void *kdata);
int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
                                       void *value);
/* Trampoline image helpers; implemented outside this header. */
int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
                                      struct bpf_tramp_link *link,
                                      const struct btf_func_model *model,
                                      void *stub_func,
                                      void **image, u32 *image_off,
                                      bool allow_alloc);
void bpf_struct_ops_image_free(void *image);
/* Take a reference on whatever backs @data.
 *
 * When @owner is the BPF_MODULE_OWNER sentinel the reference is taken with
 * bpf_struct_ops_get(@data); otherwise with try_module_get(@owner). The
 * result of whichever call was made is returned.
 */
static inline bool bpf_try_module_get(const void *data, struct module *owner)
{
        return owner == BPF_MODULE_OWNER ? bpf_struct_ops_get(data)
                                         : try_module_get(owner);
}
/* Drop a reference taken by bpf_try_module_get().
 *
 * A regular @owner is released with module_put(); the BPF_MODULE_OWNER
 * sentinel means the reference lives on @data and is released with
 * bpf_struct_ops_put().
 */
static inline void bpf_module_put(const void *data, struct module *owner)
{
        if (owner != BPF_MODULE_OWNER) {
                module_put(owner);
                return;
        }

        bpf_struct_ops_put(data);
}
int bpf_struct_ops_link_create(union bpf_attr *attr);

#ifdef CONFIG_NET
/* Define it here to avoid the use of forward declaration */
struct bpf_dummy_ops_state {
        int val;
};

/* Dummy struct_ops callback set; each callback takes the state above.
 * NOTE(review): presumably exercised through bpf_struct_ops_test_run() -
 * confirm.
 */
struct bpf_dummy_ops {
        int (*test_1)(struct bpf_dummy_ops_state *cb);
        int (*test_2)(struct bpf_dummy_ops_state *cb, int a1, unsigned short a2,
                      char a3, unsigned long a4);
        int (*test_sleepable)(struct bpf_dummy_ops_state *cb);
};

int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
                            union bpf_attr __user *uattr);
#endif
/* Descriptor lifecycle and map-info helpers; implemented outside this header. */
int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
                             struct btf *btf,
                             struct bpf_verifier_log *log);
void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map);
void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc);
#else
/* Fallbacks used when CONFIG_BPF_JIT and CONFIG_BPF_SYSCALL are not both
 * enabled: registration evaluates to 0 without registering anything, module
 * reference helpers ignore @data, and the remaining entry points either
 * return an error or do nothing.
 */
#define register_bpf_struct_ops(st_ops, type) ({ (void *)(st_ops); 0; })
/* Plain module reference; @data is unused in this configuration. */
static inline bool bpf_try_module_get(const void *data, struct module *owner)
{
        return try_module_get(owner);
}
/* Plain module release; @data is unused in this configuration. */
static inline void bpf_module_put(const void *data, struct module *owner)
{
        module_put(owner);
}
/* Always fails with -EINVAL in this configuration. */
static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map,
                                                     void *key,
                                                     void *value)
{
        return -EINVAL;
}
/* Always fails with -EOPNOTSUPP in this configuration. */
static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
{
        return -EOPNOTSUPP;
}
/* No-op: @info is left untouched. */
static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map)
{
}

/* No-op: nothing to release in this configuration. */
static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc)
{
}

#endif

/* Cgroup shim link/unlink. Real implementations exist only when both
 * CONFIG_CGROUP_BPF and CONFIG_BPF_LSM are enabled; otherwise linking fails
 * with -EOPNOTSUPP and unlinking does nothing.
 */
#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
                                    int cgroup_atype);
void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog);
#else
static inline int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
                                                  int cgroup_atype)
{
        return -EOPNOTSUPP;
}
static inline void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog)
{
}
#endif

/* Array map: the generic map header, element size, an index mask, a pointer
 * to auxiliary state, and trailing storage. The trailing union overlays three
 * flexible arrays (raw bytes, pointers, per-CPU pointers), each 8-byte
 * aligned; which view applies is decided by code outside this header.
 */
struct bpf_array {
        struct bpf_map map;
        u32 elem_size;
        /* NOTE(review): presumably ANDed with an index to bound it - confirm. */
        u32 index_mask;
        struct bpf_array_aux *aux;
        union {
                DECLARE_FLEX_ARRAY(char, value) __aligned(8);
                DECLARE_FLEX_ARRAY(void *, ptrs) __aligned(8);
                DECLARE_FLEX_ARRAY(void __percpu *, pptrs) __aligned(8);
        };
};

#define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
/* NOTE(review): presumably the cap on chained tail calls - confirm. */
#define MAX_TAIL_CALL_CNT 33

/* Maximum number of loops for bpf_loop and bpf_iter_num.
 * It's enum to expose it (and thus make it discoverable) through BTF.
 */
enum {
        BPF_MAX_LOOPS = 8 * 1024 * 1024,
};

/* All four map access-restriction flags: the two generic read-only/write-only
 * flags and their *_PROG counterparts.
 */
#define BPF_F_ACCESS_MASK        (BPF_F_RDONLY |                \
                                 BPF_F_RDONLY_PROG |        \
                                 BPF_F_WRONLY |                \
                                 BPF_F_WRONLY_PROG)

/* Capability bits returned by bpf_map_flags_to_cap(). */
#define BPF_MAP_CAN_READ        BIT(0)
#define BPF_MAP_CAN_WRITE        BIT(1)

/* Maximum number of user-producer ring buffer samples that can be drained in
 * a call to bpf_user_ringbuf_drain().
 */
#define BPF_MAX_USER_RINGBUF_SAMPLES (128 * 1024)

/* Translate a map's *_PROG access flags into BPF_MAP_CAN_* capability bits.
 *
 * BPF_F_RDONLY_PROG yields read-only, BPF_F_WRONLY_PROG yields write-only,
 * and a map with neither flag can be both read and written. Having both
 * flags set at once is not a possible combination (see
 * bpf_map_flags_access_ok()); if it were, the read-only test wins because it
 * is checked first.
 */
static inline u32 bpf_map_flags_to_cap(struct bpf_map *map)
{
        const u32 prog_flags = map->map_flags &
                               (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG);

        if (prog_flags & BPF_F_RDONLY_PROG)
                return BPF_MAP_CAN_READ;

        if (prog_flags & BPF_F_WRONLY_PROG)
                return BPF_MAP_CAN_WRITE;

        return BPF_MAP_CAN_READ | BPF_MAP_CAN_WRITE;
}

/* Return false only when @access_flags has BPF_F_RDONLY_PROG and
 * BPF_F_WRONLY_PROG set simultaneously; any other combination is accepted.
 */
static inline bool bpf_map_flags_access_ok(u32 access_flags)
{
        const u32 exclusive = BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG;

        return (access_flags & exclusive) != exclusive;
}

/* Entry tying a perf event to two files, with an rcu_head.
 * NOTE(review): presumably the element type of perf event array maps, freed
 * via RCU - confirm against the users.
 */
struct bpf_event_entry {
        struct perf_event *event;
        struct file *perf_file;
        struct file *map_file;
        struct rcu_head rcu;
};

/* Return true for the three map types listed below (prog array, devmap and
 * cpumap), false for every other map type.
 */
static inline bool map_type_contains_progs(struct bpf_map *map)
{
        switch (map->map_type) {
        case BPF_MAP_TYPE_PROG_ARRAY:
        case BPF_MAP_TYPE_DEVMAP:
        case BPF_MAP_TYPE_CPUMAP:
                return true;
        default:
                return false;
        }
}

/* Declarations only; the implementations live outside this header. */
bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp);
int bpf_prog_calc_tag(struct bpf_prog *fp);

const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void);

/* Callback that copies @len bytes starting at offset @off of @src into @dst;
 * it is the ctx_copy argument of bpf_event_output().
 * NOTE(review): the meaning of the return value is not visible here.
 */
typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src,
                                        unsigned long off, unsigned long len);
/* Callback that takes a source instruction and writes to a destination
 * instruction buffer and to *target_size.
 * NOTE(review): the u32 result is presumably the number of instructions
 * emitted - confirm.
 */
typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type,
                                        const struct bpf_insn *src,
                                        struct bpf_insn *dst,
                                        struct bpf_prog *prog,
                                        u32 *target_size);

u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
                     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy);

/* an array of programs to be executed under rcu_lock.
 *
 * Typical usage:
 * ret = bpf_prog_run_array(rcu_dereference(&bpf_prog_array), ctx, bpf_prog_run);
 *
 * the structure returned by bpf_prog_array_alloc() should be populated
 * with program pointers and the last pointer must be NULL.
 * The user has to keep refcnt on the program and make sure the program
 * is removed from the array before bpf_prog_put().
 * The 'struct bpf_prog_array *' should only be replaced with xchg()
 * since other cpus are walking the array of pointers in parallel.
 */
struct bpf_prog_array_item {
        /* Program to run; a NULL pointer terminates the array walk. */
        struct bpf_prog *prog;
        /* Per-item payload: the cgroup storage pointers and the cookie share
         * the same storage, so only one of them is meaningful for an item.
         */
        union {
                struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
                u64 bpf_cookie;
        };
};

/* An rcu_head followed by a flexible array of items. The walkers in this
 * header (bpf_prog_run_array*) stop at the first item whose prog is NULL.
 */
struct bpf_prog_array {
        struct rcu_head rcu;
        struct bpf_prog_array_item items[];
};

/* A bpf_prog_array header immediately followed by one program pointer.
 * NOTE(review): null_prog presumably occupies the slot of hdr.items[0].prog
 * and serves as the NULL terminator of an empty array - confirm.
 */
struct bpf_empty_prog_array {
        struct bpf_prog_array hdr;
        struct bpf_prog *null_prog;
};

/* To avoid allocating an empty bpf_prog_array for cgroups that don't have
 * a bpf program attached, use one global 'bpf_empty_prog_array'.
 * It will not be modified by the caller of bpf_prog_array_alloc()
 * (since the caller requested prog_cnt == 0);
 * that pointer should be 'freed' by bpf_prog_array_free().
 */
extern struct bpf_empty_prog_array bpf_empty_prog_array;

/* Allocation, release and inspection of program arrays. */
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
void bpf_prog_array_free(struct bpf_prog_array *progs);
/* Use when traversal over the bpf_prog_array uses tasks_trace rcu */
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs);
int bpf_prog_array_length(struct bpf_prog_array *progs);
bool bpf_prog_array_is_empty(struct bpf_prog_array *array);
int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
                                __u32 __user *prog_ids, u32 cnt);

/* In-place modification of program arrays. */
void bpf_prog_array_delete_safe(struct bpf_prog_array *progs,
                                struct bpf_prog *old_prog);
int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index);
int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
                             struct bpf_prog *prog);
int bpf_prog_array_copy_info(struct bpf_prog_array *array,
                             u32 *prog_ids, u32 request_cnt,
                             u32 *prog_cnt);
/* NOTE(review): presumably builds *new_array from old_array minus
 * exclude_prog plus include_prog (with bpf_cookie) - confirm.
 */
int bpf_prog_array_copy(struct bpf_prog_array *old_array,
                        struct bpf_prog *exclude_prog,
                        struct bpf_prog *include_prog,
                        u64 bpf_cookie,
                        struct bpf_prog_array **new_array);

/* Empty base type. The specialised run contexts below embed it as their
 * first member, and bpf_set_run_ctx() stores a pointer to it in
 * current->bpf_ctx.
 */
struct bpf_run_ctx {};

/* Run context carrying the array item being run and a return value. */
struct bpf_cg_run_ctx {
        struct bpf_run_ctx run_ctx;
        const struct bpf_prog_array_item *prog_item;
        int retval;
};

/* Run context filled in by bpf_prog_run_array() (is_uprobe = false) and
 * bpf_prog_run_array_uprobe() (is_uprobe = true); bpf_cookie is refreshed
 * from the array item before each program is run.
 */
struct bpf_trace_run_ctx {
        struct bpf_run_ctx run_ctx;
        u64 bpf_cookie;
        bool is_uprobe;
};

/* Run context carrying a cookie and a pointer to a saved run context.
 * NOTE(review): saved_run_ctx is presumably the context to restore once the
 * trampoline program finishes - confirm.
 */
struct bpf_tramp_run_ctx {
        struct bpf_run_ctx run_ctx;
        u64 bpf_cookie;
        struct bpf_run_ctx *saved_run_ctx;
};

/* Install @new_ctx as the current task's BPF run context.
 *
 * Returns the context that was installed before, so the caller can hand it
 * back to bpf_reset_run_ctx(). Without CONFIG_BPF_SYSCALL nothing is stored
 * and NULL is returned.
 */
static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx)
{
#ifdef CONFIG_BPF_SYSCALL
        struct bpf_run_ctx *prev_ctx = current->bpf_ctx;

        current->bpf_ctx = new_ctx;
        return prev_ctx;
#else
        return NULL;
#endif
}

/* Put @old_ctx (as returned by bpf_set_run_ctx()) back as the current task's
 * BPF run context. Does nothing without CONFIG_BPF_SYSCALL.
 */
static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx)
{
#if defined(CONFIG_BPF_SYSCALL)
        current->bpf_ctx = old_ctx;
#endif
}

/* NOTE(review): both return flags below are bit 0; presumably they are
 * interpreted by different hooks and never appear together - confirm.
 */
/* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */
#define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE                        (1 << 0)
/* BPF program asks to set CN on the packet. */
#define BPF_RET_SET_CN                                                (1 << 0)

/* Runner callback used by bpf_prog_run_array*(): runs @prog on @ctx and
 * returns a u32 result.
 */
typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx);

/* Run every program in @array on @ctx through @run_prog.
 *
 * The caller must hold the RCU read lock (checked via RCU_LOCKDEP_WARN).
 * Migration is disabled and a bpf_trace_run_ctx (is_uprobe = false) is
 * installed for the duration of the walk; its bpf_cookie is refreshed from
 * each item before that item's program runs. The walk stops at the first
 * item whose prog pointer is NULL.
 *
 * Returns 1 for a NULL @array, otherwise the bitwise AND of 1 and every
 * program's return value.
 */
static __always_inline u32
bpf_prog_run_array(const struct bpf_prog_array *array,
                   const void *ctx, bpf_prog_run_fn run_prog)
{
        struct bpf_trace_run_ctx trace_ctx;
        struct bpf_run_ctx *saved_ctx;
        const struct bpf_prog_array_item *cur;
        const struct bpf_prog *prog;
        u32 verdict = 1;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");

        if (unlikely(!array))
                return verdict;

        trace_ctx.is_uprobe = false;

        migrate_disable();
        saved_ctx = bpf_set_run_ctx(&trace_ctx.run_ctx);
        for (cur = &array->items[0]; (prog = READ_ONCE(cur->prog)); cur++) {
                trace_ctx.bpf_cookie = cur->bpf_cookie;
                verdict &= run_prog(prog, ctx);
        }
        bpf_reset_run_ctx(saved_ctx);
        migrate_enable();
        return verdict;
}

/* Run every program of an RCU-managed array on behalf of a uprobe.
 *
 * RCU scheme, as implemented below:
 *  - The whole walk happens inside a tasks_trace RCU read section with
 *    migration disabled, and the array pointer is dereferenced under that
 *    protection. Arrays walked this way must therefore be released with
 *    bpf_prog_array_free_sleepable() so that the tasks_trace grace period
 *    is used.
 *  - A program that is not sleepable is additionally wrapped in a classic
 *    rcu_read_lock()/rcu_read_unlock() pair for its own run only, so it can
 *    access rcu-protected dynamically sized maps.
 *
 * A bpf_trace_run_ctx with is_uprobe = true is installed for the walk and
 * its bpf_cookie is refreshed from each item before the program runs.
 *
 * Returns 1 when the array is NULL, otherwise the bitwise AND of 1 and every
 * program's return value.
 */
static __always_inline u32
bpf_prog_run_array_uprobe(const struct bpf_prog_array __rcu *array_rcu,
                          const void *ctx, bpf_prog_run_fn run_prog)
{
        struct bpf_trace_run_ctx trace_ctx;
        struct bpf_run_ctx *saved_ctx;
        const struct bpf_prog_array *progs;
        const struct bpf_prog_array_item *cur;
        const struct bpf_prog *prog;
        u32 verdict = 1;

        might_fault();

        rcu_read_lock_trace();
        migrate_disable();

        trace_ctx.is_uprobe = true;

        progs = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held());
        if (unlikely(!progs))
                goto out;

        saved_ctx = bpf_set_run_ctx(&trace_ctx.run_ctx);
        for (cur = &progs->items[0]; (prog = READ_ONCE(cur->prog)); cur++) {
                if (!prog->sleepable)
                        rcu_read_lock();

                trace_ctx.bpf_cookie = cur->bpf_cookie;
                verdict &= run_prog(prog, ctx);

                if (!prog->sleepable)
                        rcu_read_unlock();
        }
        bpf_reset_run_ctx(saved_ctx);
out:
        migrate_enable();
        rcu_read_unlock_trace();
        return verdict;
}

#ifdef CONFIG_BPF_SYSCALL
/* Per-CPU counter bumped by bpf_disable_instrumentation() and dropped by
 * bpf_enable_instrumentation().
 */
DECLARE_PER_CPU(int, bpf_prog_active);
/* NOTE(review): presumably guards changes to the BPF stats-enabled setting -
 * confirm against the users.
 */
extern struct mutex bpf_stats_enabled_mutex;

/*
 * Block execution of BPF programs attached to instrumentation (perf,
 * kprobes, tracepoints) to prevent deadlocks on map operations as any of
 * these events can happen inside a region which holds a map bucket lock
 * and can deadlock on it.
 *
 * Migration is disabled first so that the per-CPU counter incremented here
 * is the same one bpf_enable_instrumentation() later decrements.
 */
static inline void bpf_disable_instrumentation(void)
{
        migrate_disable();
        this_cpu_inc(bpf_prog_active);
}

/* Undo bpf_disable_instrumentation(): drop this CPU's bpf_prog_active count,
 * then re-enable migration (reverse order of the disable side).
 */
static inline void bpf_enable_instrumentation(void)
{
        this_cpu_dec(bpf_prog_active);
        migrate_enable();
}

/* Operation tables defined outside this header. */
extern const struct super_operations bpf_super_ops;
extern const struct file_operations bpf_map_fops;
extern const struct file_operations bpf_prog_fops;
extern const struct file_operations bpf_iter_fops;

/* Expand <linux/bpf_types.h> so that every BPF_PROG_TYPE() entry declares
 * extern <name>_prog_ops and <name>_verifier_ops, every BPF_MAP_TYPE() entry
 * declares its extern map ops, and BPF_LINK_TYPE() entries expand to nothing.
 * The helper macros are undefined again right after the include.
 */
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
        extern const struct bpf_prog_ops _name ## _prog_ops; \
        extern const struct bpf_verifier_ops _name ## _verifier_ops;
#define BPF_MAP_TYPE(_id, _ops) \
        extern const struct bpf_map_ops _ops;
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE

/* Additional ops tables declared by hand rather than via bpf_types.h. */
extern const struct bpf_prog_ops bpf_offload_prog_ops;
extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
extern const struct bpf_verifier_ops xdp_analyzer_ops;

/* Program lookup by fd and program reference counting. Declarations only;
 * exact semantics are defined by the implementations outside this header.
 */
struct bpf_prog *bpf_prog_get(u32 ufd);
struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
                                       bool attach_drv);
void bpf_prog_add(struct bpf_prog *prog, int i);
void bpf_prog_sub(struct bpf_prog *prog, int i);
void bpf_prog_inc(struct bpf_prog *prog);
/* The result must be checked by the caller (__must_check). */
struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog);
void bpf_prog_put(struct bpf_prog *prog);

void bpf_prog_free_id(struct bpf_prog *prog);
void bpf_map_free_id(struct bpf_map *map);

/* btf_record lookup/lifetime helpers and helpers that free special fields of
 * an object described by a btf_record. Declarations only.
 */
struct btf_field *btf_record_find(const struct btf_record *rec,
                                  u32 offset, u32 field_mask);
void btf_record_free(struct btf_record *rec);
void bpf_map_free_record(struct bpf_map *map);
struct btf_record *btf_record_dup(const struct btf_record *rec);
bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj);
void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);

/* Map lookup by fd and map reference counting. Declarations only. */
struct bpf_map *bpf_map_get(u32 ufd);
struct bpf_map *bpf_map_get_with_uref(u32 ufd);
struct bpf_map *__bpf_map_get(struct fd f);
void bpf_map_inc(struct bpf_map *map);
void bpf_map_inc_with_uref(struct bpf_map *map);
struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
/* The result must be checked by the caller (__must_check). */
struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
void bpf_map_put_with_uref(struct bpf_map *map);
void bpf_map_put(struct bpf_map *map);
/* Map memory area helpers; sizes are 64-bit and a NUMA node is passed in. */
void *bpf_map_area_alloc(u64 size, int numa_node);
void *bpf_map_area_mmapable_alloc(u64 size, int numa_node);
void bpf_map_area_free(void *base);
bool bpf_map_write_active(const struct bpf_map *map);
void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
/* Generic batch operations; each takes the kernel and the user copy of attr. */
int  generic_map_lookup_batch(struct bpf_map *map,
                              const union bpf_attr *attr,
                              union bpf_attr __user *uattr);
int  generic_map_update_batch(struct bpf_map *map, struct file *map_file,
                              const union bpf_attr *attr,
                              union bpf_attr __user *uattr);
int  generic_map_delete_batch(struct bpf_map *map,
                              const union bpf_attr *attr,
                              union bpf_attr __user *uattr);
/* Id-based iteration; *id is passed by pointer and may be updated. */
struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);

int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
                        unsigned long nr_pages, struct page **page_array);
/* Map-aware allocators. With CONFIG_MEMCG_KMEM they are real functions that
 * receive the map; otherwise they are macros that drop the map argument and
 * call the plain allocator (kmalloc_node, kzalloc, kvcalloc,
 * __alloc_percpu_gfp) directly.
 */
#ifdef CONFIG_MEMCG_KMEM
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
                           int node);
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags);
void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
                       gfp_t flags);
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
                                    size_t align, gfp_t flags);
#else
#define bpf_map_kmalloc_node(_map, _size, _flags, _node)        \
                kmalloc_node(_size, _flags, _node)
#define bpf_map_kzalloc(_map, _size, _flags)                        \
                kzalloc(_size, _flags)
#define bpf_map_kvcalloc(_map, _n, _size, _flags)                \
                kvcalloc(_n, _size, _flags)
#define bpf_map_alloc_percpu(_map, _size, _align, _flags)        \
                __alloc_percpu_gfp(_size, _align, _flags)
#endif

/* Allocate the per-CPU element counter of @map.
 *
 * The counter is allocated with GFP_USER | __GFP_NOWARN, sized and aligned
 * to sizeof(*map->elem_count), and stored in map->elem_count.
 *
 * Returns 0 on success, -ENOMEM when the allocation fails.
 */
static inline int
bpf_map_init_elem_count(struct bpf_map *map)
{
        const size_t counter_sz = sizeof(*map->elem_count);
        gfp_t gfp = GFP_USER | __GFP_NOWARN;

        map->elem_count = bpf_map_alloc_percpu(map, counter_sz, counter_sz, gfp);

        return map->elem_count ? 0 : -ENOMEM;
}

/* Release the per-CPU element counter allocated by
 * bpf_map_init_elem_count(). The pointer in @map is not cleared.
 */
static inline void
bpf_map_free_elem_count(struct bpf_map *map)
{
        free_percpu(map->elem_count);
}

/* Increment the current CPU's slot of @map's per-CPU element counter. */
static inline void bpf_map_inc_elem_count(struct bpf_map *map)
{
        this_cpu_inc(*map->elem_count);
}

/* Decrement the current CPU's slot of @map's per-CPU element counter. */
static inline void bpf_map_dec_elem_count(struct bpf_map *map)
{
        this_cpu_dec(*map->elem_count);
}

/* NOTE(review): presumably backs the unprivileged_bpf_disabled sysctl -
 * confirm.
 */
extern int sysctl_unprivileged_bpf_disabled;

/* Capability check taking an optional-looking token and a capability number;
 * used by the bpf_allow_*()/bpf_bypass_*() helpers in this header.
 * NOTE(review): whether @token may be NULL is not visible here.
 */
bool bpf_token_capable(const struct bpf_token *token, int cap);

/* True when bpf_token_capable() grants CAP_PERFMON for @token. */
static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token)
{
        return bpf_token_capable(token, CAP_PERFMON);
}

/* True when bpf_token_capable() grants CAP_PERFMON for @token (the same
 * condition as bpf_allow_ptr_leaks()).
 */
static inline bool bpf_allow_uninit_stack(const struct bpf_token *token)
{
        return bpf_token_capable(token, CAP_PERFMON);
}

/* True when CPU mitigations are off, or else when bpf_token_capable() grants
 * CAP_PERFMON for @token. The capability is only consulted when mitigations
 * are on.
 */
static inline bool bpf_bypass_spec_v1(const struct bpf_token *token)
{
        if (cpu_mitigations_off())
                return true;

        return bpf_token_capable(token, CAP_PERFMON);
}

/* True when CPU mitigations are off, or else when bpf_token_capable() grants
 * CAP_PERFMON for @token (the same condition as bpf_bypass_spec_v1()). The
 * capability is only consulted when mitigations are on.
 */
static inline bool bpf_bypass_spec_v4(const struct bpf_token *token)
{
        if (cpu_mitigations_off())
                return true;

        return bpf_token_capable(token, CAP_PERFMON);
}

/* File-descriptor creation for maps and programs. Declarations only. */
int bpf_map_new_fd(struct bpf_map *map, int flags);
int bpf_prog_new_fd(struct bpf_prog *prog);

/* Link lifecycle: initialisation, the prime/settle/cleanup sequence built
 * around struct bpf_link_primer, reference counting and fd/id lookup.
 */
void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
                   const struct bpf_link_ops *ops, struct bpf_prog *prog);
int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer);
int bpf_link_settle(struct bpf_link_primer *primer);
void bpf_link_cleanup(struct bpf_link_primer *primer);
void bpf_link_inc(struct bpf_link *link);
struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link);
void bpf_link_put(struct bpf_link *link);
int bpf_link_new_fd(struct bpf_link *link);
struct bpf_link *bpf_link_get_from_fd(u32 ufd);
struct bpf_link *bpf_link_get_curr_or_next(u32 *id);

/* Token reference counting, creation and fd lookup. */
void bpf_token_inc(struct bpf_token *token);
void bpf_token_put(struct bpf_token *token);
int bpf_token_create(union bpf_attr *attr);
struct bpf_token *bpf_token_get_from_fd(u32 ufd);

/* Token permission queries for commands, map types and program types. */
bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type);
bool bpf_token_allow_prog_type(const struct bpf_token *token,
                               enum bpf_prog_type prog_type,
                               enum bpf_attach_type attach_type);

/* Object pinning/retrieval by user-supplied path, and inode creation. */
int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir,
                            umode_t mode);

/* Name prefix shared by the functions generated by DEFINE_BPF_ITER_FUNC(). */
#define BPF_ITER_FUNC_PREFIX "bpf_iter_"
/* Declare and define an __init function bpf_iter_<target>(args) whose body
 * only returns 0.
 * NOTE(review): the function presumably exists for its type signature rather
 * than to be called - confirm.
 */
#define DEFINE_BPF_ITER_FUNC(target, args...)                        \
        extern int bpf_iter_ ## target(args);                        \
        int __init bpf_iter_ ## target(args) { return 0; }

/*
 * The task type of iterators.
 *
 * BPF task iterators can be parameterized so that they visit only some of
 * the tasks.
 *
 * BPF_TASK_ITER_ALL (default)
 *	Iterate over resources of every task.
 *
 * BPF_TASK_ITER_TID
 *	Iterate over resources of a task/tid.
 *
 * BPF_TASK_ITER_TGID
 *	Iterate over resources of every task of a process / task group.
 */
enum bpf_iter_task_type {
        BPF_TASK_ITER_ALL = 0,
        BPF_TASK_ITER_TID,
        BPF_TASK_ITER_TGID,
};

/* Per-target auxiliary data handed to the iterator attach/detach/show/fill
 * callbacks (see the bpf_iter_*_t typedefs).
 */
struct bpf_iter_aux_info {
        /* for map_elem iter */
        struct bpf_map *map;

        /* for cgroup iter */
        struct {
                struct cgroup *start; /* starting cgroup */
                enum bpf_cgroup_iter_order order;
        } cgroup;
        /* for task iter: which tasks to visit (see enum bpf_iter_task_type).
         * NOTE(review): pid is presumably the tid/tgid selected by @type -
         * confirm.
         */
        struct {
                enum bpf_iter_task_type        type;
                u32 pid;
        } task;
};

/* Callback types stored in struct bpf_iter_reg. */

/* Attach: receives the program, the user-supplied link info and the aux data. */
typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
                                        union bpf_iter_link_info *linfo,
                                        struct bpf_iter_aux_info *aux);
/* Detach: receives the aux data. */
typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux);
/* Reporting: write target details to a seq_file / bpf_link_info. */
typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux,
                                        struct seq_file *seq);
typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux,
                                         struct bpf_link_info *info);
/* Helper lookup: maps a helper function id to its prototype for @prog. */
typedef const struct bpf_func_proto *
(*bpf_iter_get_func_proto_t)(enum bpf_func_id func_id,
                             const struct bpf_prog *prog);

/* Bit flags; struct bpf_iter_reg has a u32 'feature' member.
 * NOTE(review): BPF_ITER_RESCHED presumably asks the iterator core to
 * reschedule between elements - confirm.
 */
enum bpf_iter_feature {
        BPF_ITER_RESCHED        = BIT(0),
};

/* Capacity of bpf_iter_reg::ctx_arg_info. */
#define BPF_ITER_CTX_ARG_MAX 2
/* Registration record for an iterator target, passed to
 * bpf_iter_reg_target()/bpf_iter_unreg_target().
 */
struct bpf_iter_reg {
        /* Target name. */
        const char *target;
        /* Per-target callbacks; see the bpf_iter_*_t typedefs. */
        bpf_iter_attach_target_t attach_target;
        bpf_iter_detach_target_t detach_target;
        bpf_iter_show_fdinfo_t show_fdinfo;
        bpf_iter_fill_link_info_t fill_link_info;
        bpf_iter_get_func_proto_t get_func_proto;
        /* NOTE(review): presumably the number of used ctx_arg_info entries -
         * confirm.
         */
        u32 ctx_arg_info_size;
        /* NOTE(review): presumably a mask of enum bpf_iter_feature bits. */
        u32 feature;
        struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX];
        const struct bpf_iter_seq_info *seq_info;
};

/* Iterator metadata: the seq_file pointer (declared via __bpf_md_ptr), a
 * session id and a sequence number.
 */
struct bpf_iter_meta {
        __bpf_md_ptr(struct seq_file *, seq);
        u64 session_id;
        u64 seq_num;
};

/* Context for map-element iteration: iterator metadata, the map, and
 * pointers to a key and a value, each declared via __bpf_md_ptr.
 */
struct bpf_iter__bpf_map_elem {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        __bpf_md_ptr(struct bpf_map *, map);
        __bpf_md_ptr(void *, key);
        __bpf_md_ptr(void *, value);
};

/*
 * BPF iterator entry points. Only the prototypes are visible here; the
 * definitions live elsewhere.
 */
/* Register / unregister an iterator target description. */
int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info);
void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info);
bool bpf_iter_prog_supported(struct bpf_prog *prog);
const struct bpf_func_proto *
bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog);
int bpf_iter_new_fd(struct bpf_link *link);
bool bpf_link_is_iter(struct bpf_link *link);
struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop);
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx);
/*
 * Map-iterator callbacks; their signatures match bpf_iter_show_fdinfo_t
 * and bpf_iter_fill_link_info_t respectively.
 */
void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux,
                              struct seq_file *seq);
int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux,
                                struct bpf_link_info *info);

/* Verifier hook taking the caller and callee function states. */
int map_set_for_each_callback_args(struct bpf_verifier_env *env,
                                   struct bpf_func_state *caller,
                                   struct bpf_func_state *callee);

/* Per-CPU hash/array map copy and update entry points. */
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
                           u64 flags);
int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
                            u64 flags);

int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value);

/* fd-array and fd-htab maps: update takes the map's file, lookup yields a u32. */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
                                 void *key, void *value, u64 map_flags);
int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
                                void *key, void *value, u64 map_flags);
int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);

int bpf_get_file_flag(int flags);
int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size,
                             size_t actual_size);

/* Verify the correctness of an eBPF program. */
int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size);

/* Declared only when CONFIG_BPF_JIT_ALWAYS_ON is not set. */
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
#endif

struct btf *bpf_get_btf_vmlinux(void);

/*
 * Map specifics: devmap and cpumap entry points. The forward
 * declarations below let the prototypes use these types without their
 * full definitions.
 */
struct xdp_frame;
struct sk_buff;
struct bpf_dtab_netdev;
struct bpf_cpu_map_entry;

/* devmap: flush plus xdp_frame / sk_buff enqueue and redirect variants. */
void __dev_flush(void);
int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
                    struct net_device *dev_rx);
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
                    struct net_device *dev_rx);
int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
                          struct bpf_map *map, bool exclude_ingress);
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
                             struct bpf_prog *xdp_prog);
int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
                           struct bpf_prog *xdp_prog, struct bpf_map *map,
                           bool exclude_ingress);

/* cpumap: flush plus xdp_frame enqueue and sk_buff redirect. */
void __cpu_map_flush(void);
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
                    struct net_device *dev_rx);
int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
                             struct sk_buff *skb);

/*
 * Return the NUMA node userspace requested for a map.
 *
 * When BPF_F_NUMA_NODE is set in @attr->map_flags the result is
 * @attr->numa_node; otherwise it is NUMA_NO_NODE.
 */
static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
{
        if (!(attr->map_flags & BPF_F_NUMA_NODE))
                return NUMA_NO_NODE;

        return attr->numa_node;
}

struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type);
int array_map_alloc_check(union bpf_attr *attr);

/*
 * Test-run entry points, one per program flavour. They all take the
 * program, the kernel-side attr and the user-space attr pointer.
 */
int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
                          union bpf_attr __user *uattr);
int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
                          union bpf_attr __user *uattr);
int bpf_prog_test_run_tracing(struct bpf_prog *prog,
                              const union bpf_attr *kattr,
                              union bpf_attr __user *uattr);
int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
                                     const union bpf_attr *kattr,
                                     union bpf_attr __user *uattr);
int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
                             const union bpf_attr *kattr,
                             union bpf_attr __user *uattr);
int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
                                const union bpf_attr *kattr,
                                union bpf_attr __user *uattr);
int bpf_prog_test_run_nf(struct bpf_prog *prog,
                         const union bpf_attr *kattr,
                         union bpf_attr __user *uattr);
/* BTF-based context access check; called by bpf_tracing_btf_ctx_access(). */
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
                    const struct bpf_prog *prog,
                    struct bpf_insn_access_aux *info);

/*
 * Basic validity check for a tracing program's context access.
 *
 * The access is accepted only when @off is non-negative and below
 * sizeof(__u64) * MAX_BPF_FUNC_ARGS, @type is BPF_READ, and @off is a
 * multiple of @size. The conditions are evaluated in that order with
 * short-circuiting, so the modulus is only computed for an in-range read.
 *
 * Returns true if the access passes all three checks, false otherwise.
 */
static inline bool bpf_tracing_ctx_access(int off, int size,
                                          enum bpf_access_type type)
{
        return !(off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) &&
               type == BPF_READ &&
               off % size == 0;
}

/*
 * Context access check for tracing programs with BTF information.
 *
 * Runs bpf_tracing_ctx_access() first; btf_ctx_access() is consulted only
 * if that basic check passes. Returns true when both accept the access.
 */
static inline bool bpf_tracing_btf_ctx_access(int off, int size,
                                              enum bpf_access_type type,
                                              const struct bpf_prog *prog,
                                              struct bpf_insn_access_aux *info)
{
        return bpf_tracing_ctx_access(off, size, type) &&
               btf_ctx_access(off, size, type, prog, info);
}

/*
 * BTF-related verifier entry points. Only the prototypes are visible
 * here; the definitions live elsewhere.
 */
int btf_struct_access(struct bpf_verifier_log *log,
                      const struct bpf_reg_state *reg,
                      int off, int size, enum bpf_access_type atype,
                      u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name);
bool btf_struct_ids_match(struct bpf_verifier_log *log,
                          const struct btf *btf, u32 id, int off,
                          const struct btf *need_btf, u32 need_type_id,
                          bool strict);

/* Fills the caller-provided struct btf_func_model @m from a BTF func proto. */
int btf_distill_func_proto(struct bpf_verifier_log *log,
                           struct btf *btf,
                           const struct btf_type *func_proto,
                           const char *func_name,
                           struct btf_func_model *m);

struct bpf_reg_state;
int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
                         struct btf *btf, const struct btf_type *t);
/* Decl-tag lookups keyed by @tag_key on component @comp_idx of type @pt. */
const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
                                    int comp_idx, const char *tag_key);
int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
                           int comp_idx, const char *tag_key, int last_id);

/* Look up a program / link object by its numeric id. */
struct bpf_prog *bpf_prog_by_id(u32 id);
struct bpf_link *bpf_link_by_id(u32 id);

const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id,
                                                 const struct bpf_prog *prog);
/* Local-storage teardown hooks for a task and for a cgroup. */
void bpf_task_storage_free(struct task_struct *task);
void bpf_cgrp_storage_free(struct cgroup *cgroup);
/* kfunc support queries used with a program and one of its instructions. */
bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
const struct btf_func_model *
bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
                         const struct bpf_insn *insn);
int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
                       u16 btf_fd_idx, u8 **func_addr);

/*
 * Context handed to bpf_core_apply(): a verifier log and a BTF object.
 */
struct bpf_core_ctx {
        struct bpf_verifier_log *log;
        const struct btf *btf;
};

/* BTF type predicates used with a verifier log; definitions are elsewhere. */
bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
                                const struct bpf_reg_state *reg,
                                const char *field_name, u32 btf_id, const char *suffix);

bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
                               const struct btf *reg_btf, u32 reg_id,
                               const struct btf *arg_btf, u32 arg_id);

/* Apply relocation @relo (index @relo_idx) to @insn using context @ctx. */
int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
                   int relo_idx, void *insn);

/*
 * Report whether unprivileged eBPF is enabled, i.e. whether
 * sysctl_unprivileged_bpf_disabled is zero.
 */
static inline bool unprivileged_ebpf_enabled(void)
{
        return sysctl_unprivileged_bpf_disabled == 0;
}

/*
 * Not every BPF program type sets up a bpf_ctx. For the program types
 * that do initialize it, a kernel function can use this helper to decide
 * whether it is being called from a BPF program.
 *
 * Returns true when current->bpf_ctx is set, false otherwise.
 */
static inline bool has_current_bpf_ctx(void)
{
        return current->bpf_ctx ? true : false;
}

/* Per-program "misses" accounting; declared notrace. */
void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog);

/* Kernel-side dynptr setup helpers operating on struct bpf_dynptr_kern. */
void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
                     enum bpf_dynptr_type type, u32 offset, u32 size);
void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr);

bool dev_check_flush(void);
bool cpu_map_check_flush(void);
#else /* !CONFIG_BPF_SYSCALL */
/* !CONFIG_BPF_SYSCALL stub: always returns ERR_PTR(-EOPNOTSUPP). */
static inline struct bpf_prog *bpf_prog_get(u32 ufd)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/*
 * !CONFIG_BPF_SYSCALL stub: ignores all arguments and always returns
 * ERR_PTR(-EOPNOTSUPP).
 */
static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
                                                     enum bpf_prog_type type,
                                                     bool attach_drv)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/* !CONFIG_BPF_SYSCALL stub: does nothing. */
static inline void bpf_prog_add(struct bpf_prog *prog, int i)
{
}

/* !CONFIG_BPF_SYSCALL stub: does nothing. */
static inline void bpf_prog_sub(struct bpf_prog *prog, int i)
{
}

/* !CONFIG_BPF_SYSCALL stub: does nothing. */
static inline void bpf_prog_put(struct bpf_prog *prog)
{
}

/* !CONFIG_BPF_SYSCALL stub: does nothing. */
static inline void bpf_prog_inc(struct bpf_prog *prog)
{
}

/*
 * !CONFIG_BPF_SYSCALL stub: always returns ERR_PTR(-EOPNOTSUPP). The
 * result is marked __must_check.
 */
static inline struct bpf_prog *__must_check
bpf_prog_inc_not_zero(struct bpf_prog *prog)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/* !CONFIG_BPF_SYSCALL stub: leaves @link untouched and does nothing. */
static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
                                 const struct bpf_link_ops *ops,
                                 struct bpf_prog *prog)
{
}

/* !CONFIG_BPF_SYSCALL stub: always fails with -EOPNOTSUPP. */
static inline int bpf_link_prime(struct bpf_link *link,
                                 struct bpf_link_primer *primer)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_BPF_SYSCALL stub: always fails with -EOPNOTSUPP. */
static inline int bpf_link_settle(struct bpf_link_primer *primer)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_BPF_SYSCALL stub: does nothing. */
static inline void bpf_link_cleanup(struct bpf_link_primer *primer)
{
}

/* !CONFIG_BPF_SYSCALL stub: does nothing. */
static inline void bpf_link_inc(struct bpf_link *link)
{
}

/*
 * !CONFIG_BPF_SYSCALL stub: always returns NULL. Unlike the
 * bpf_prog_inc_not_zero() stub, this one does not return an ERR_PTR().
 */
static inline struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
{
        return NULL;
}

/* !CONFIG_BPF_SYSCALL stub: does nothing. */
static inline void bpf_link_put(struct bpf_link *link)
{
}

/* !CONFIG_BPF_SYSCALL stub: always fails with -EOPNOTSUPP. */
static inline int bpf_obj_get_user(const char __user *pathname, int flags)
{
        return -EOPNOTSUPP;
}

/*
 * !CONFIG_BPF_SYSCALL version of the token capability check; @token is
 * not consulted.
 *
 * Returns true when the caller has @cap. Failing that, for any @cap other
 * than CAP_SYS_ADMIN, returns whether the caller has CAP_SYS_ADMIN.
 */
static inline bool bpf_token_capable(const struct bpf_token *token, int cap)
{
        if (capable(cap))
                return true;

        /* CAP_SYS_ADMIN was just checked above; no point asking again. */
        if (cap == CAP_SYS_ADMIN)
                return false;

        return capable(CAP_SYS_ADMIN);
}

/* !CONFIG_BPF_SYSCALL stub: does nothing. */
static inline void bpf_token_inc(struct bpf_token *token)
{
}

/* !CONFIG_BPF_SYSCALL stub: does nothing. */
static inline void bpf_token_put(struct bpf_token *token)
{
}

/* !CONFIG_BPF_SYSCALL stub: always returns ERR_PTR(-EOPNOTSUPP). */
static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/* !CONFIG_BPF_SYSCALL stub: does nothing. */
static inline void __dev_flush(void)
{
}

struct xdp_frame;
struct bpf_dtab_netdev;
struct bpf_cpu_map_entry;

/* !CONFIG_BPF_SYSCALL stub: ignores the frame and returns 0. */
static inline
int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
                    struct net_device *dev_rx)
{
        return 0;
}

/* !CONFIG_BPF_SYSCALL stub: ignores the frame and returns 0. */
static inline
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
                    struct net_device *dev_rx)
{
        return 0;
}

/* !CONFIG_BPF_SYSCALL stub: ignores the frame and returns 0. */
static inline
int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
                          struct bpf_map *map, bool exclude_ingress)
{
        return 0;
}

struct sk_buff;

/* !CONFIG_BPF_SYSCALL stub: ignores the skb and returns 0. */
static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
                                           struct sk_buff *skb,
                                           struct bpf_prog *xdp_prog)
{
        return 0;
}

/* !CONFIG_BPF_SYSCALL stub: ignores the skb and returns 0. */
static inline
int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
                           struct bpf_prog *xdp_prog, struct bpf_map *map,
                           bool exclude_ingress)
{
        return 0;
}

/* !CONFIG_BPF_SYSCALL stub: does nothing. */
static inline void __cpu_map_flush(void)
{
}

/* !CONFIG_BPF_SYSCALL stub: ignores the frame and returns 0. */
static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
                                  struct xdp_frame *xdpf,
                                  struct net_device *dev_rx)
{
        return 0;
}

/*
 * !CONFIG_BPF_SYSCALL stub: always fails with -EOPNOTSUPP (note that
 * the dev_map_generic_redirect() stub returns 0 instead).
 */
static inline int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
                                           struct sk_buff *skb)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_BPF_SYSCALL stub: always returns ERR_PTR(-EOPNOTSUPP). */
static inline struct bpf_prog *bpf_prog_get_type_path(const char *name,
                                enum bpf_prog_type type)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/* !CONFIG_BPF_SYSCALL stub: always fails with -ENOTSUPP. */
static inline int bpf_prog_test_run_xdp(struct bpf_prog *prog,
                                        const union bpf_attr *kattr,
                                        union bpf_attr __user *uattr)
{
        return -ENOTSUPP;
}

/* !CONFIG_BPF_SYSCALL stub: always fails with -ENOTSUPP. */
static inline int bpf_prog_test_run_skb(struct bpf_prog *prog,
                                        const union bpf_attr *kattr,
                                        union bpf_attr __user *uattr)
{
        return -ENOTSUPP;
}

/* !CONFIG_BPF_SYSCALL stub: always fails with -ENOTSUPP. */
static inline int bpf_prog_test_run_tracing(struct bpf_prog *prog,
                                            const union bpf_attr *kattr,
                                            union bpf_attr __user *uattr)
{
        return -ENOTSUPP;
}

/* !CONFIG_BPF_SYSCALL stub: always fails with -ENOTSUPP. */
static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
                                                   const union bpf_attr *kattr,
                                                   union bpf_attr __user *uattr)
{
        return -ENOTSUPP;
}

/* !CONFIG_BPF_SYSCALL stub: always fails with -ENOTSUPP. */
static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
                                              const union bpf_attr *kattr,
                                              union bpf_attr __user *uattr)
{
        return -ENOTSUPP;
}

/* !CONFIG_BPF_SYSCALL stub: does nothing. */
static inline void bpf_map_put(struct bpf_map *map)
{
}

/* !CONFIG_BPF_SYSCALL stub: always returns ERR_PTR(-ENOTSUPP). */
static inline struct bpf_prog *bpf_prog_by_id(u32 id)
{
        return ERR_PTR(-ENOTSUPP);
}

/*
 * !CONFIG_BPF_SYSCALL stub: rejects every access with -EACCES and
 * writes none of its output parameters.
 */
static inline int btf_struct_access(struct bpf_verifier_log *log,
                                    const struct bpf_reg_state *reg,
                                    int off, int size, enum bpf_access_type atype,
                                    u32 *next_btf_id, enum bpf_type_flag *flag,
                                    const char **field_name)
{
        return -EACCES;
}

/* !CONFIG_BPF_SYSCALL stub: always returns NULL. */
static inline const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        return NULL;
}

/* !CONFIG_BPF_SYSCALL stub: does nothing. */
static inline void bpf_task_storage_free(struct task_struct *task)
{
}

/* !CONFIG_BPF_SYSCALL stub: always returns false. */
static inline bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
{
        return false;
}

/* !CONFIG_BPF_SYSCALL stub: always returns NULL. */
static inline const struct btf_func_model *
bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
                         const struct bpf_insn *insn)
{
        return NULL;
}

/*
 * !CONFIG_BPF_SYSCALL stub: always fails with -ENOTSUPP and does not
 * write *@func_addr.
 */
static inline int
bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
                   u16 btf_fd_idx, u8 **func_addr)
{
        return -ENOTSUPP;
}

/* !CONFIG_BPF_SYSCALL stub: always returns false. */
static inline bool unprivileged_ebpf_enabled(void)
{
        return false;
}

/* !CONFIG_BPF_SYSCALL stub: always returns false. */
static inline bool has_current_bpf_ctx(void)
{
        return false;
}

/* !CONFIG_BPF_SYSCALL stub: does nothing. */
static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog)
{
}

/* !CONFIG_BPF_SYSCALL stub: does nothing. */
static inline void bpf_cgrp_storage_free(struct cgroup *cgroup)
{
}

/* !CONFIG_BPF_SYSCALL stub: leaves @ptr untouched and does nothing. */
static inline void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
                                   enum bpf_dynptr_type type, u32 offset, u32 size)
{
}

/* !CONFIG_BPF_SYSCALL stub: leaves @ptr untouched and does nothing. */
static inline void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
{
}

/* !CONFIG_BPF_SYSCALL stub: leaves @ptr untouched and does nothing. */
static inline void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
{
}
#endif /* CONFIG_BPF_SYSCALL */

/*
 * Read @size bytes from the kernel address @unsafe_ptr into @dst.
 *
 * When CONFIG_BPF_EVENTS is enabled the copy is performed with
 * copy_from_kernel_nofault(); otherwise the read is treated as having
 * failed with -EFAULT. On any negative result @dst is zero-filled for
 * its full @size, so the caller never sees a partially written buffer.
 *
 * Returns the result of the copy, or -EFAULT when no copy was attempted.
 */
static __always_inline int
bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr)
{
        int err;

        if (!IS_ENABLED(CONFIG_BPF_EVENTS))
                err = -EFAULT;
        else
                err = copy_from_kernel_nofault(dst, unsafe_ptr, size);

        if (unlikely(err < 0))
                memset(dst, 0, size);

        return err;
}

/* Takes an array of @len btf_mod_pair entries; defined elsewhere. */
void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
                          struct btf_mod_pair *used_btfs, u32 len);

/*
 * Convenience wrapper around bpf_prog_get_type_dev() that always passes
 * attach_drv = false. Returns whatever bpf_prog_get_type_dev() returns
 * for @ufd and @type.
 */
static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
                                                 enum bpf_prog_type type)
{
        const bool attach_drv = false;

        return bpf_prog_get_type_dev(ufd, type, attach_drv);
}

/* Takes an array of @len bpf_map pointers; defined elsewhere. */
void __bpf_free_used_maps(struct bpf_prog_aux *aux,
                          struct bpf_map **used_maps, u32 len);

bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool);

/* Program offload / device-bound entry points. */
int bpf_prog_offload_compile(struct bpf_prog *prog);
void bpf_prog_dev_bound_destroy(struct bpf_prog *prog);
int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
                               struct bpf_prog *prog);

int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map);

/* Offloaded-map element operations (lookup, update, delete, iterate keys). */
int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value);
int bpf_map_offload_update_elem(struct bpf_map *map,
                                void *key, void *value, u64 flags);
int bpf_map_offload_delete_elem(struct bpf_map *map, void *key);
int bpf_map_offload_get_next_key(struct bpf_map *map,
                                 void *key, void *next_key);

bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map);

/* Offload device object: create/destroy, private data, netdev (un)registration. */
struct bpf_offload_dev *
bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv);
void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev);
void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev);
int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
                                    struct net_device *netdev);
void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
                                       struct net_device *netdev);
bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev);

void unpriv_ebpf_notify(int new_state);

#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
/*
 * Device-bound program support, available when both CONFIG_NET and
 * CONFIG_BPF_SYSCALL are enabled. Definitions live elsewhere.
 */
int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
                              struct bpf_prog_aux *prog_aux);
void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id);
int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr);
int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog);
void bpf_dev_bound_netdev_unregister(struct net_device *dev);

/* Return the dev_bound flag stored in @aux. */
static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux)
{
        return aux->dev_bound;
}

/* Return the offload_requested flag stored in @aux. */
static inline bool bpf_prog_is_offloaded(const struct bpf_prog_aux *aux)
{
        return aux->offload_requested;
}

/* Compare the device binding of two programs; defined elsewhere. */
bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs);

/*
 * A map counts as offloaded when its ops pointer is bpf_map_offload_ops.
 * The comparison is annotated unlikely().
 */
static inline bool bpf_map_is_offloaded(struct bpf_map *map)
{
        return unlikely(map->ops == &bpf_map_offload_ops);
}

/* Offloaded map allocation, release and memory-usage reporting. */
struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr);
void bpf_map_offload_map_free(struct bpf_map *map);
u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map);
int bpf_prog_test_run_syscall(struct bpf_prog *prog,
                              const union bpf_attr *kattr,
                              union bpf_attr __user *uattr);

/* sockmap: program attach/detach/query, element update and link creation. */
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
int sock_map_bpf_prog_query(const union bpf_attr *attr,
                            union bpf_attr __user *uattr);
int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog);

/* sockmap hooks taking the socket being unhashed, destroyed or closed. */
void sock_map_unhash(struct sock *sk);
void sock_map_destroy(struct sock *sk);
void sock_map_close(struct sock *sk, long timeout);
#else
/*
 * Stub used unless both CONFIG_NET and CONFIG_BPF_SYSCALL are enabled:
 * always fails with -EOPNOTSUPP.
 */
static inline int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
                                            struct bpf_prog_aux *prog_aux)
{
        return -EOPNOTSUPP;
}

/*
 * Stub used unless both CONFIG_NET and CONFIG_BPF_SYSCALL are enabled:
 * always returns NULL.
 */
static inline void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog,
                                                u32 func_id)
{
        return NULL;
}

/*
 * Stub used unless both CONFIG_NET and CONFIG_BPF_SYSCALL are enabled:
 * always fails with -EOPNOTSUPP.
 */
static inline int bpf_prog_dev_bound_init(struct bpf_prog *prog,
                                          union bpf_attr *attr)
{
        return -EOPNOTSUPP;
}

/*
 * Stub used unless both CONFIG_NET and CONFIG_BPF_SYSCALL are enabled:
 * always fails with -EOPNOTSUPP.
 */
static inline int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog,
                                             struct bpf_prog *old_prog)
{
        return -EOPNOTSUPP;
}

/*
 * Stub used unless both CONFIG_NET and CONFIG_BPF_SYSCALL are enabled:
 * does nothing.
 */
static inline void bpf_dev_bound_netdev_unregister(struct net_device *dev)
{
}

/*
 * Stub used unless both CONFIG_NET and CONFIG_BPF_SYSCALL are enabled:
 * always returns false without reading @aux.
 */
static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux)
{
        return false;
}

/*
 * Stub used unless both CONFIG_NET and CONFIG_BPF_SYSCALL are enabled:
 * always returns false without reading @aux.
 *
 * @aux is const-qualified to match the non-stub version of this function,
 * so a caller holding a const struct bpf_prog_aux pointer compiles the
 * same way in either configuration.
 */
static inline bool bpf_prog_is_offloaded(const struct bpf_prog_aux *aux)
{
        return false;
}

/*
 * Stub used unless both CONFIG_NET and CONFIG_BPF_SYSCALL are enabled:
 * always returns false.
 */
static inline bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs)
{
        return false;
}

/*
 * Stub used unless both CONFIG_NET and CONFIG_BPF_SYSCALL are enabled:
 * always returns false without reading @map.
 */
static inline bool bpf_map_is_offloaded(struct bpf_map *map)
{
        return false;
}

/*
 * Stub used unless both CONFIG_NET and CONFIG_BPF_SYSCALL are enabled:
 * always returns ERR_PTR(-EOPNOTSUPP).
 */
static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/*
 * Stub used unless both CONFIG_NET and CONFIG_BPF_SYSCALL are enabled:
 * does nothing.
 */
static inline void bpf_map_offload_map_free(struct bpf_map *map)
{
}

/*
 * Stub used unless both CONFIG_NET and CONFIG_BPF_SYSCALL are enabled:
 * always reports 0.
 */
static inline u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map)
{
        return 0;
}

/*
 * Stub used unless both CONFIG_NET and CONFIG_BPF_SYSCALL are enabled:
 * always fails with -ENOTSUPP.
 */
static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog,
                                            const union bpf_attr *kattr,
                                            union bpf_attr __user *uattr)
{
        return -ENOTSUPP;
}

#ifdef CONFIG_BPF_SYSCALL
/*
 * Stub for CONFIG_BPF_SYSCALL builds without CONFIG_NET: always fails
 * with -EINVAL.
 */
static inline int sock_map_get_from_fd(const union bpf_attr *attr,
                                       struct bpf_prog *prog)
{
        return -EINVAL;
}

/*
 * Stub for CONFIG_BPF_SYSCALL builds without CONFIG_NET: always fails
 * with -EOPNOTSUPP.
 */
static inline int sock_map_prog_detach(const union bpf_attr *attr,
                                       enum bpf_prog_type ptype)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for CONFIG_BPF_SYSCALL builds without CONFIG_NET: always fails
 * with -EOPNOTSUPP.
 */
static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
                                           u64 flags)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for CONFIG_BPF_SYSCALL builds without CONFIG_NET: always fails
 * with -EINVAL.
 */
static inline int sock_map_bpf_prog_query(const union bpf_attr *attr,
                                          union bpf_attr __user *uattr)
{
        return -EINVAL;
}

/*
 * Stub for CONFIG_BPF_SYSCALL builds without CONFIG_NET: always fails
 * with -EOPNOTSUPP.
 */
static inline int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
{
        return -EOPNOTSUPP;
}
#endif /* CONFIG_BPF_SYSCALL */
#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */

/*
 * Bump the misses counter of every program in @array.
 *
 * Walks @array->items from the first entry and calls
 * bpf_prog_inc_misses_counter() for each program, stopping at the first
 * entry whose prog pointer reads as NULL. Each prog pointer is loaded
 * with READ_ONCE(). A NULL @array is accepted and is a no-op.
 */
static __always_inline void
bpf_prog_inc_misses_counters(const struct bpf_prog_array *array)
{
        const struct bpf_prog_array_item *cur;
        struct bpf_prog *p;

        if (unlikely(!array))
                return;

        for (cur = &array->items[0]; (p = READ_ONCE(cur->prog)); cur++)
                bpf_prog_inc_misses_counter(p);
}

#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
/*
 * Reuseport support, available when both CONFIG_INET and
 * CONFIG_BPF_SYSCALL are enabled. Definitions live elsewhere.
 */
void bpf_sk_reuseport_detach(struct sock *sk);
int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
                                       void *value);
int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
                                       void *value, u64 map_flags);
#else
/*
 * Stub used unless both CONFIG_INET and CONFIG_BPF_SYSCALL are enabled:
 * does nothing.
 */
static inline void bpf_sk_reuseport_detach(struct sock *sk)
{
}

#ifdef CONFIG_BPF_SYSCALL
/*
 * Stub for CONFIG_BPF_SYSCALL builds without CONFIG_INET: always fails
 * with -EOPNOTSUPP and does not write @value.
 */
static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map,
                                                     void *key, void *value)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for CONFIG_BPF_SYSCALL builds without CONFIG_INET: always fails
 * with -EOPNOTSUPP.
 */
static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map,
                                                     void *key, void *value,
                                                     u64 map_flags)
{
        return -EOPNOTSUPP;
}
#endif /* CONFIG_BPF_SYSCALL */
#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */

/*
 * Verifier prototypes (struct bpf_func_proto objects) for helper functions
 * that can be called from eBPF programs. These are extern declarations
 * only; each object is defined elsewhere.
 */
/* Generic map element helpers. */
extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
extern const struct bpf_func_proto bpf_map_update_elem_proto;
extern const struct bpf_func_proto bpf_map_delete_elem_proto;
extern const struct bpf_func_proto bpf_map_push_elem_proto;
extern const struct bpf_func_proto bpf_map_pop_elem_proto;
extern const struct bpf_func_proto bpf_map_peek_elem_proto;
extern const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto;

/* Everything else, in no particular grouping. */
extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
extern const struct bpf_func_proto bpf_get_numa_node_id_proto;
extern const struct bpf_func_proto bpf_tail_call_proto;
extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto;
extern const struct bpf_func_proto bpf_ktime_get_tai_ns_proto;
extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
extern const struct bpf_func_proto bpf_get_current_comm_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto;
extern const struct bpf_func_proto bpf_get_stack_proto;
extern const struct bpf_func_proto bpf_get_task_stack_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto_pe;
extern const struct bpf_func_proto bpf_get_stack_proto_pe;
extern const struct bpf_func_proto bpf_sock_map_update_proto;
extern const struct bpf_func_proto bpf_sock_hash_update_proto;
extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto;
extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto;
extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
extern const struct bpf_func_proto bpf_spin_lock_proto;
extern const struct bpf_func_proto bpf_spin_unlock_proto;
extern const struct bpf_func_proto bpf_get_local_storage_proto;
extern const struct bpf_func_proto bpf_strtol_proto;
extern const struct bpf_func_proto bpf_strtoul_proto;
extern const struct bpf_func_proto bpf_tcp_sock_proto;
extern const struct bpf_func_proto bpf_jiffies64_proto;
extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto;
extern const struct bpf_func_proto bpf_event_output_data_proto;
extern const struct bpf_func_proto bpf_ringbuf_output_proto;
extern const struct bpf_func_proto bpf_ringbuf_reserve_proto;
extern const struct bpf_func_proto bpf_ringbuf_submit_proto;
extern const struct bpf_func_proto bpf_ringbuf_discard_proto;
extern const struct bpf_func_proto bpf_ringbuf_query_proto;
extern const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto;
extern const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto;
extern const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto;
extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_unix_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto;
extern const struct bpf_func_proto bpf_copy_from_user_proto;
extern const struct bpf_func_proto bpf_snprintf_btf_proto;
extern const struct bpf_func_proto bpf_snprintf_proto;
extern const struct bpf_func_proto bpf_per_cpu_ptr_proto;
extern const struct bpf_func_proto bpf_this_cpu_ptr_proto;
extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto;
extern const struct bpf_func_proto bpf_sock_from_file_proto;
extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto;
extern const struct bpf_func_proto bpf_task_storage_get_recur_proto;
extern const struct bpf_func_proto bpf_task_storage_get_proto;
extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto;
extern const struct bpf_func_proto bpf_task_storage_delete_proto;
extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
extern const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto;
extern const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto;
extern const struct bpf_func_proto bpf_find_vma_proto;
extern const struct bpf_func_proto bpf_loop_proto;
extern const struct bpf_func_proto bpf_copy_from_user_task_proto;
extern const struct bpf_func_proto bpf_set_retval_proto;
extern const struct bpf_func_proto bpf_get_retval_proto;
extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto;
extern const struct bpf_func_proto bpf_cgrp_storage_get_proto;
extern const struct bpf_func_proto bpf_cgrp_storage_delete_proto;

/* Helper-prototype lookup for tracing programs; defined elsewhere. */
const struct bpf_func_proto *tracing_prog_func_proto(
  enum bpf_func_id func_id, const struct bpf_prog *prog);

/*
 * Helpers shared between cBPF and eBPF. The two u64-returning functions
 * take five u64 arguments (r1..r5).
 */
void bpf_user_rnd_init_once(void);
u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);

#if defined(CONFIG_NET)
/*
 * Socket context access checking/conversion and the read-only skb
 * dynptr constructor, available when CONFIG_NET is enabled. Definitions
 * live elsewhere.
 */
bool bpf_sock_common_is_valid_access(int off, int size,
                                     enum bpf_access_type type,
                                     struct bpf_insn_access_aux *info);
bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                              struct bpf_insn_access_aux *info);
u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
                                const struct bpf_insn *si,
                                struct bpf_insn *insn_buf,
                                struct bpf_prog *prog,
                                u32 *target_size);
int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
                               struct bpf_dynptr_kern *ptr);
#else
/* !CONFIG_NET stub: rejects every access by returning false. */
static inline bool bpf_sock_common_is_valid_access(int off, int size,
                                                   enum bpf_access_type type,
                                                   struct bpf_insn_access_aux *info)
{
        return false;
}
/* !CONFIG_NET stub: rejects every access by returning false. */
static inline bool bpf_sock_is_valid_access(int off, int size,
                                            enum bpf_access_type type,
                                            struct bpf_insn_access_aux *info)
{
        return false;
}
/*
 * !CONFIG_NET stub: returns 0 and writes neither @insn_buf nor
 * *@target_size.
 */
static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
                                              const struct bpf_insn *si,
                                              struct bpf_insn *insn_buf,
                                              struct bpf_prog *prog,
                                              u32 *target_size)
{
        return 0;
}
/* !CONFIG_NET stub: always fails with -EOPNOTSUPP and leaves @ptr untouched. */
static inline int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
                                             struct bpf_dynptr_kern *ptr)
{
        return -EOPNOTSUPP;
}
#endif

#ifdef CONFIG_INET
/*
 * Kernel-side state for a reuseport program run (only defined under
 * CONFIG_INET). Field roles below are inferred from their names —
 * confirm against the users.
 */
struct sk_reuseport_kern {
        struct sk_buff *skb;
        struct sock *sk;
        struct sock *selected_sk;       /* presumably the program's choice */
        struct sock *migrating_sk;
        void *data_end;                 /* presumably end of accessible skb data */
        u32 hash;
        u32 reuseport_id;
        bool bind_inany;
};
/*
 * TCP-sock and XDP-sock context access checking and conversion,
 * available under CONFIG_INET. Definitions live elsewhere.
 */
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                                  struct bpf_insn_access_aux *info);

u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
                                    const struct bpf_insn *si,
                                    struct bpf_insn *insn_buf,
                                    struct bpf_prog *prog,
                                    u32 *target_size);

bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                                  struct bpf_insn_access_aux *info);

u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
                                    const struct bpf_insn *si,
                                    struct bpf_insn *insn_buf,
                                    struct bpf_prog *prog,
                                    u32 *target_size);
#else
/* !CONFIG_INET stub: every access is reported as invalid. */
static inline bool bpf_tcp_sock_is_valid_access(int off, int size,
                                                enum bpf_access_type type,
                                                struct bpf_insn_access_aux *info)
{
        return false;
}

/* !CONFIG_INET stub: always returns 0 and writes nothing. */
static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
                                                  const struct bpf_insn *si,
                                                  struct bpf_insn *insn_buf,
                                                  struct bpf_prog *prog,
                                                  u32 *target_size)
{
        return 0;
}
/* !CONFIG_INET stub: every access is reported as invalid. */
static inline bool bpf_xdp_sock_is_valid_access(int off, int size,
                                                enum bpf_access_type type,
                                                struct bpf_insn_access_aux *info)
{
        return false;
}

/* !CONFIG_INET stub: always returns 0 and writes nothing. */
static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
                                                  const struct bpf_insn *si,
                                                  struct bpf_insn *insn_buf,
                                                  struct bpf_prog *prog,
                                                  u32 *target_size)
{
        return 0;
}
#endif /* CONFIG_INET */

/*
 * Selector passed as @t to bpf_arch_text_poke().
 * NOTE(review): presumably distinguishes patching a call from patching a
 * jump instruction - confirm against the arch implementations.
 */
enum bpf_text_poke_type {
        BPF_MOD_CALL,
        BPF_MOD_JUMP,
};

/*
 * Text-patching entry points; declarations only, the implementations are
 * not visible in this file.
 * NOTE(review): the bpf_arch_ prefix suggests per-architecture
 * implementations - confirm.
 */
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
                       void *addr1, void *addr2);

void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
                               struct bpf_prog *new, struct bpf_prog *old);

void *bpf_arch_text_copy(void *dst, void *src, size_t len);
int bpf_arch_text_invalidate(void *dst, size_t len);

/* Forward declaration is enough here: the set is only passed by pointer. */
struct btf_id_set;
bool btf_id_set_contains(const struct btf_id_set *set, u32 id);

/*
 * Limits for the bprintf helpers.
 * NOTE(review): presumably the maximum number of format arguments and the
 * output buffer size in bytes - confirm against bpf_bprintf_prepare().
 */
#define MAX_BPRINTF_VARARGS                12
#define MAX_BPRINTF_BUF                        1024

/*
 * State shared between bpf_bprintf_prepare() and bpf_bprintf_cleanup().
 * NOTE(review): get_bin_args / get_buf look like caller-set request flags
 * selecting which of bin_args / buf should be provided - confirm.
 */
struct bpf_bprintf_data {
        u32 *bin_args;
        char *buf;
        bool get_bin_args;
        bool get_buf;
};

/*
 * Prepare/cleanup pair operating on a struct bpf_bprintf_data; the
 * implementations are not visible in this file.
 */
int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
                        u32 num_args, struct bpf_bprintf_data *data);
void bpf_bprintf_cleanup(struct bpf_bprintf_data *data);

/*
 * cgroup attach-type get/put pair: real functions with CONFIG_BPF_LSM,
 * empty inline no-ops otherwise so callers need no #ifdef.
 */
#ifdef CONFIG_BPF_LSM
void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype);
void bpf_cgroup_atype_put(int cgroup_atype);
#else
static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {}
static inline void bpf_cgroup_atype_put(int cgroup_atype) {}
#endif /* CONFIG_BPF_LSM */

/* Forward declaration so struct bpf_key can hold a pointer to a key. */
struct key;

#ifdef CONFIG_KEYS
/*
 * Wrapper around a struct key pointer, only defined with CONFIG_KEYS.
 * NOTE(review): has_ref presumably records whether a reference on @key is
 * held and must be dropped later - confirm against the users.
 */
struct bpf_key {
        struct key *key;
        bool has_ref;
};
#endif /* CONFIG_KEYS */

/* Return true when the MEM_ALLOC flag is set in @type. */
static inline bool type_is_alloc(u32 type)
{
        return (type & MEM_ALLOC) != 0;
}

/*
 * Return @flags with __GFP_ACCOUNT added when memcg_bpf_enabled() reports
 * true; otherwise return @flags unchanged.
 */
static inline gfp_t bpf_memcg_flags(gfp_t flags)
{
        return memcg_bpf_enabled() ? (flags | __GFP_ACCOUNT) : flags;
}

static inline bool bpf_is_subprog(const struct bpf_prog *prog)
{
        return prog->aux->func_idx != 0;
}

#endif /* _LINUX_BPF_H */







































































































































    2 

    2 




























































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
// SPDX-License-Identifier: GPL-2.0
/*
 * Author: Andrei Vagin <avagin@openvz.org>
 * Author: Dmitry Safonov <dima@arista.com>
 */

#include <linux/time_namespace.h>
#include <linux/user_namespace.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/clocksource.h>
#include <linux/seq_file.h>
#include <linux/proc_ns.h>
#include <linux/export.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/err.h>
#include <linux/mm.h>

#include <vdso/datapage.h>

/*
 * Remove the time namespace offset for @clockid from the absolute time @tim.
 *
 * @clockid:    clock the time refers to
 * @tim:        absolute time as seen inside the time namespace
 * @ns_offsets: offsets of that time namespace
 *
 * Only CLOCK_MONOTONIC and CLOCK_BOOTTIME{,_ALARM} carry an offset; for any
 * other clock @tim is returned unchanged.
 *
 * Return: @tim - offset, clamped to [0, KTIME_MAX].
 */
ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
                                struct timens_offsets *ns_offsets)
{
        ktime_t offset;

        switch (clockid) {
        case CLOCK_MONOTONIC:
                offset = timespec64_to_ktime(ns_offsets->monotonic);
                break;
        case CLOCK_BOOTTIME:
        case CLOCK_BOOTTIME_ALARM:
                offset = timespec64_to_ktime(ns_offsets->boottime);
                break;
        default:
                return tim;
        }

        /*
         * User can specify @tim as an *absolute* value - if it is less than
         * the time namespace's offset, it has already expired.
         */
        if (tim < offset)
                return 0;

        /*
         * With a negative offset, @tim - offset may exceed KTIME_MAX.
         * ktime_t is signed (s64 in <linux/ktime.h>), so the overflow has to
         * be detected before subtracting: comparing the wrapped result
         * against KTIME_MAX can never be true.  KTIME_MAX + offset itself
         * cannot overflow because offset is negative here.
         */
        if (offset < 0 && tim > KTIME_MAX + offset)
                return KTIME_MAX;

        return ktime_sub(tim, offset);
}

/*
 * Charge one UCOUNT_TIME_NAMESPACES unit in @ns to the current effective
 * uid.  Returns whatever inc_ucount() returns; clone_time_ns() treats NULL
 * as failure.
 */
static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
{
        return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES);
}

/* Undo inc_time_namespaces(): release one UCOUNT_TIME_NAMESPACES unit. */
static void dec_time_namespaces(struct ucounts *ucounts)
{
        dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES);
}

/**
 * clone_time_ns - Create a new time namespace as a copy of an existing one
 * @user_ns:        User namespace that will own the new namespace.
 * @old_ns:        Namespace whose offsets are copied.
 *
 * The new namespace starts with a reference count of 1, its own zeroed
 * vvar page, the offsets of @old_ns and offsets that are not yet frozen.
 *
 * Return: The new namespace, or ERR_PTR(-ENOSPC) when the ucount limit is
 * hit, ERR_PTR(-ENOMEM) on allocation failure, or the ns_alloc_inum() error.
 */
static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
                                          struct time_namespace *old_ns)
{
        struct time_namespace *new_ns;
        struct ucounts *counts;
        int ret;

        counts = inc_time_namespaces(user_ns);
        if (!counts)
                return ERR_PTR(-ENOSPC);

        new_ns = kmalloc(sizeof(*new_ns), GFP_KERNEL_ACCOUNT);
        if (!new_ns) {
                ret = -ENOMEM;
                goto err_uncharge;
        }

        refcount_set(&new_ns->ns.count, 1);

        new_ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        if (!new_ns->vvar_page) {
                ret = -ENOMEM;
                goto err_free_ns;
        }

        ret = ns_alloc_inum(&new_ns->ns);
        if (ret)
                goto err_free_page;

        new_ns->ucounts = counts;
        new_ns->ns.ops = &timens_operations;
        new_ns->user_ns = get_user_ns(user_ns);
        new_ns->offsets = old_ns->offsets;
        new_ns->frozen_offsets = false;

        return new_ns;

        /* Unwind in reverse order of acquisition. */
err_free_page:
        __free_page(new_ns->vvar_page);
err_free_ns:
        kfree(new_ns);
err_uncharge:
        dec_time_namespaces(counts);
        return ERR_PTR(ret);
}

/**
 * copy_time_ns - Produce the timens_for_children for a new task
 * @flags:        Cloning flags
 * @user_ns:        User namespace which owns a new namespace.
 * @old_ns:        Namespace to clone or to share
 *
 * With CLONE_NEWTIME in @flags a fresh namespace is cloned from @old_ns;
 * without it @old_ns itself is returned with an extra reference.
 *
 * Return: timens_for_children namespace or ERR_PTR.
 */
struct time_namespace *copy_time_ns(unsigned long flags,
        struct user_namespace *user_ns, struct time_namespace *old_ns)
{
        if (flags & CLONE_NEWTIME)
                return clone_time_ns(user_ns, old_ns);

        return get_time_ns(old_ns);
}

static struct timens_offset offset_from_ts(struct timespec64 off)
{
        struct timens_offset ret;

        ret.sec = off.tv_sec;
        ret.nsec = off.tv_nsec;

        return ret;
}

/*
 * A time namespace VVAR page has the same layout as the VVAR page which
 * contains the system wide VDSO data.
 *
 * For a normal task the VVAR pages are installed in the normal ordering:
 *     VVAR
 *     PVCLOCK
 *     HVCLOCK
 *     TIMENS   <- Not really required
 *
 * Now for a timens task the pages are installed in the following order:
 *     TIMENS
 *     PVCLOCK
 *     HVCLOCK
 *     VVAR
 *
 * The check for vdso_data->clock_mode is in the unlikely path of
 * the seq begin magic. So for the non-timens case most of the time
 * 'seq' is even, so the branch is not taken.
 *
 * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
 * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the
 * update to finish and for 'seq' to become even anyway.
 *
 * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which
 * enforces the time namespace handling path.
 */
static void timens_setup_vdso_data(struct vdso_data *vdata,
                                   struct time_namespace *ns)
{
        struct timens_offset *offset = vdata->offset;
        struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
        struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);

        vdata->seq                        = 1;
        vdata->clock_mode                = VDSO_CLOCKMODE_TIMENS;
        offset[CLOCK_MONOTONIC]                = monotonic;
        offset[CLOCK_MONOTONIC_RAW]        = monotonic;
        offset[CLOCK_MONOTONIC_COARSE]        = monotonic;
        offset[CLOCK_BOOTTIME]                = boottime;
        offset[CLOCK_BOOTTIME_ALARM]        = boottime;
}

/*
 * Return the vvar page of the current task's time namespace for a fault on
 * @vma, or NULL (with a warning) when @vma belongs to another mm.
 */
struct page *find_timens_vvar_page(struct vm_area_struct *vma)
{
        if (unlikely(vma->vm_mm != current->mm)) {
                /*
                 * VM_PFNMAP | VM_IO protect .fault() handler from being
                 * called through interfaces like /proc/$pid/mem or
                 * process_vm_{readv,writev}() as long as there's no
                 * .access() in special_mapping_vmops().
                 * For more details check_vma_flags() and
                 * __access_remote_vm()
                 */
                WARN(1, "vvar_page accessed remotely");
                return NULL;
        }

        return current->nsproxy->time_ns->vvar_page;
}

/*
 * Serializes concurrent writers of a namespace's offsets against each other
 * and against tasks entering the namespace (which freeze the offsets).
 */
static DEFINE_MUTEX(offset_lock);

/*
 * Freeze the offsets of @ns and populate its vvar page, once.
 *
 * The first caller for a given namespace marks the offsets frozen and fills
 * all CS_BASES vdso_data entries of the page; later callers return early.
 * Nothing is done for init_time_ns.
 */
static void timens_set_vvar_page(struct task_struct *task,
                                struct time_namespace *ns)
{
        struct vdso_data *vdata;
        unsigned int base;

        if (ns == &init_time_ns)
                return;

        /* Fast-path, taken by every task in namespace except the first. */
        if (likely(ns->frozen_offsets))
                return;

        mutex_lock(&offset_lock);

        /* Re-check under the lock: another task may have won the race. */
        if (!ns->frozen_offsets) {
                ns->frozen_offsets = true;
                vdata = arch_get_vdso_data(page_address(ns->vvar_page));

                for (base = 0; base < CS_BASES; base++)
                        timens_setup_vdso_data(&vdata[base], ns);
        }

        mutex_unlock(&offset_lock);
}

/*
 * Release everything clone_time_ns() set up for @ns: the ucount charge, the
 * user namespace reference, the inode number, the vvar page and finally the
 * namespace structure itself.
 */
void free_time_ns(struct time_namespace *ns)
{
        dec_time_namespaces(ns->ucounts);
        put_user_ns(ns->user_ns);
        ns_free_inum(&ns->ns);
        __free_page(ns->vvar_page);
        kfree(ns);
}

/* Map an embedded ns_common back to its containing time_namespace. */
static struct time_namespace *to_time_ns(struct ns_common *ns)
{
        return container_of(ns, struct time_namespace, ns);
}

/*
 * Take a reference on @task's current time namespace.
 *
 * Return: the namespace's ns_common, or NULL when @task has no nsproxy.
 */
static struct ns_common *timens_get(struct task_struct *task)
{
        struct time_namespace *time_ns = NULL;
        struct nsproxy *proxy;

        task_lock(task);
        proxy = task->nsproxy;
        if (proxy) {
                time_ns = proxy->time_ns;
                get_time_ns(time_ns);
        }
        task_unlock(task);

        if (!time_ns)
                return NULL;

        return &time_ns->ns;
}

/*
 * Take a reference on the time namespace @task's children will be placed in.
 *
 * Return: the namespace's ns_common, or NULL when @task has no nsproxy.
 */
static struct ns_common *timens_for_children_get(struct task_struct *task)
{
        struct time_namespace *time_ns = NULL;
        struct nsproxy *proxy;

        task_lock(task);
        proxy = task->nsproxy;
        if (proxy) {
                time_ns = proxy->time_ns_for_children;
                get_time_ns(time_ns);
        }
        task_unlock(task);

        if (!time_ns)
                return NULL;

        return &time_ns->ns;
}

/* proc_ns_operations .put hook: drop a reference on the time namespace. */
static void timens_put(struct ns_common *ns)
{
        put_time_ns(to_time_ns(ns));
}

/*
 * Make @ns effective for @tsk: freeze the offsets and set up the vvar page
 * (first entry only), then let the vdso join the namespace.
 */
void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
{
        timens_set_vvar_page(tsk, ns);
        vdso_join_timens(tsk, ns);
}

static int timens_install(struct nsset *nsset, struct ns_common *new)
{
        struct nsproxy *nsproxy = nsset->nsproxy;
        struct time_namespace *ns = to_time_ns(new);

        if (!current_is_single_threaded())
                return -EUSERS;

        if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
            !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        get_time_ns(ns);
        put_time_ns(nsproxy->time_ns);
        nsproxy->time_ns = ns;

        get_time_ns(ns);
        put_time_ns(nsproxy->time_ns_for_children);
        nsproxy->time_ns_for_children = ns;
        return 0;
}

/*
 * On fork, move @tsk into its time_ns_for_children if that differs from the
 * time namespace currently recorded in @nsproxy, and commit the switch.
 */
void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
{
        struct time_namespace *child_ns = nsproxy->time_ns_for_children;

        /* create_new_namespaces() already incremented the ref counter */
        if (nsproxy->time_ns == child_ns)
                return;

        get_time_ns(child_ns);
        put_time_ns(nsproxy->time_ns);
        nsproxy->time_ns = child_ns;

        timens_commit(tsk, child_ns);
}

/* proc_ns_operations .owner hook: the user namespace owning this timens. */
static struct user_namespace *timens_owner(struct ns_common *ns)
{
        return to_time_ns(ns)->user_ns;
}

/*
 * Print one "<clock> <sec> <nsec>" line for @clockid to @m.  Clock ids other
 * than CLOCK_BOOTTIME and CLOCK_MONOTONIC are labelled "unknown".
 */
static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts)
{
        const char *label = "unknown";

        if (clockid == CLOCK_BOOTTIME)
                label = "boottime";
        else if (clockid == CLOCK_MONOTONIC)
                label = "monotonic";

        seq_printf(m, "%-10s %10lld %9ld\n", label, ts->tv_sec, ts->tv_nsec);
}

/*
 * Print the monotonic and boottime offsets of @p's time_ns_for_children to
 * @m.  Prints nothing when @p has no nsproxy.
 */
void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m)
{
        struct ns_common *nsc = timens_for_children_get(p);
        struct time_namespace *children_ns;

        if (!nsc)
                return;

        children_ns = to_time_ns(nsc);
        show_offset(m, CLOCK_MONOTONIC, &children_ns->offsets.monotonic);
        show_offset(m, CLOCK_BOOTTIME, &children_ns->offsets.boottime);

        /* Drop the reference taken by timens_for_children_get(). */
        put_time_ns(children_ns);
}

/*
 * Set clock offsets of @p's time_ns_for_children namespace.
 * @file:     file whose opener's credentials are checked for CAP_SYS_TIME
 * @p:        task whose time_ns_for_children is modified
 * @offsets:  array of (clockid, value) pairs to install
 * @noffsets: number of entries in @offsets
 *
 * Every entry is validated before anything is written, so a failure leaves
 * the namespace's offsets untouched.
 *
 * Return: 0 on success, -ESRCH if @p has no nsproxy, -EPERM without
 * CAP_SYS_TIME over the namespace's user_ns, -EINVAL for a clockid other
 * than CLOCK_MONOTONIC/CLOCK_BOOTTIME, -ERANGE for an out-of-range offset,
 * -EACCES if the offsets are already frozen.
 */
int proc_timens_set_offset(struct file *file, struct task_struct *p,
                           struct proc_timens_offset *offsets, int noffsets)
{
        struct ns_common *ns;
        struct time_namespace *time_ns;
        struct timespec64 tp;
        int i, err;

        /* Takes a reference that is dropped at "out" or on the -EPERM path. */
        ns = timens_for_children_get(p);
        if (!ns)
                return -ESRCH;
        time_ns = to_time_ns(ns);

        if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) {
                put_time_ns(time_ns);
                return -EPERM;
        }

        /* Pass 1: validate every requested offset without modifying state. */
        for (i = 0; i < noffsets; i++) {
                struct proc_timens_offset *off = &offsets[i];

                /* Sample the current host time of the clock being offset. */
                switch (off->clockid) {
                case CLOCK_MONOTONIC:
                        ktime_get_ts64(&tp);
                        break;
                case CLOCK_BOOTTIME:
                        ktime_get_boottime_ts64(&tp);
                        break;
                default:
                        err = -EINVAL;
                        goto out;
                }

                err = -ERANGE;

                if (off->val.tv_sec > KTIME_SEC_MAX ||
                    off->val.tv_sec < -KTIME_SEC_MAX)
                        goto out;

                /* Resulting namespace time = current host time + offset. */
                tp = timespec64_add(tp, off->val);
                /*
                 * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is
                 * still unreachable.
                 */
                if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2)
                        goto out;
        }

        mutex_lock(&offset_lock);
        /* Offsets become immutable once a task has entered the namespace. */
        if (time_ns->frozen_offsets) {
                err = -EACCES;
                goto out_unlock;
        }

        err = 0;
        /* Don't report errors after this line */
        /* Pass 2: commit; clock ids were already validated in pass 1. */
        for (i = 0; i < noffsets; i++) {
                struct proc_timens_offset *off = &offsets[i];
                struct timespec64 *offset = NULL;

                switch (off->clockid) {
                case CLOCK_MONOTONIC:
                        offset = &time_ns->offsets.monotonic;
                        break;
                case CLOCK_BOOTTIME:
                        offset = &time_ns->offsets.boottime;
                        break;
                }

                *offset = off->val;
        }

out_unlock:
        mutex_unlock(&offset_lock);
out:
        put_time_ns(time_ns);

        return err;
}

/* Namespace operations for a task's own time namespace ("time"). */
const struct proc_ns_operations timens_operations = {
        .name                = "time",
        .type                = CLONE_NEWTIME,
        .get                = timens_get,
        .put                = timens_put,
        .install        = timens_install,
        .owner                = timens_owner,
};

/*
 * Namespace operations for the time namespace a task's children will use
 * ("time_for_children").  Differs from timens_operations only in the name,
 * the extra real_ns_name and the .get hook.
 */
const struct proc_ns_operations timens_for_children_operations = {
        .name                = "time_for_children",
        .real_ns_name        = "time",
        .type                = CLONE_NEWTIME,
        .get                = timens_for_children_get,
        .put                = timens_put,
        .install        = timens_install,
        .owner                = timens_owner,
};

/*
 * The initial time namespace: statically allocated, owned by init_user_ns,
 * with offsets left zero-initialized and marked frozen from the start.
 */
struct time_namespace init_time_ns = {
        .ns.count        = REFCOUNT_INIT(3),
        .user_ns        = &init_user_ns,
        .ns.inum        = PROC_TIME_INIT_INO,
        .ns.ops                = &timens_operations,
        .frozen_offsets        = true,
};


























































































    1 















    1 
















    1 



    1 



    1 




    1 





    1 




    1 
    1 










    1 



    1 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * SHA-256, as specified in
 * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
 *
 * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>.
 *
 * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
 * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 * Copyright (c) 2014 Red Hat Inc.
 */

#include <asm/unaligned.h>
#include <crypto/sha256_base.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>

/* The 64 SHA-256 round constants, indexed by round in SHA256_ROUND(). */
static const u32 SHA256_K[] = {
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};

/* "Choose": each result bit comes from y where x is 1 and from z where x is 0. */
static inline u32 Ch(u32 x, u32 y, u32 z)
{
        return (x & y) ^ (~x & z);
}

/* "Majority": each result bit is the value held by at least two inputs. */
static inline u32 Maj(u32 x, u32 y, u32 z)
{
        return (x & y) ^ (x & z) ^ (y & z);
}

/*
 * SHA-256 sigma functions: e0/e1 are used in the round function, s0/s1 in
 * the message schedule.  The argument is parenthesized everywhere so that a
 * compound expression such as s0(a | b) is shifted as a whole.
 */
#define e0(x)       (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22))
#define e1(x)       (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25))
#define s0(x)       (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3))
#define s1(x)       (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10))

/*
 * Load the I-th 32-bit big-endian word of @input into W[I].
 * @input may be unaligned; the pointer stays const-qualified because the
 * message block is only read.
 */
static inline void LOAD_OP(int I, u32 *W, const u8 *input)
{
        W[I] = get_unaligned_be32((const __u32 *)input + I);
}

/*
 * Message schedule expansion: derive W[I] from W[I-2], W[I-7], W[I-15] and
 * W[I-16].  Callers must pass I >= 16.
 */
static inline void BLEND_OP(int I, u32 *W)
{
        const u32 sigma1 = s1(W[I - 2]);
        const u32 sigma0 = s0(W[I - 15]);

        W[I] = sigma1 + W[I - 7] + sigma0 + W[I - 16];
}

/*
 * One SHA-256 round.  Only d and h are written; instead of shifting the
 * eight working variables, the caller rotates the argument order on each
 * successive invocation.  Relies on an array named W being in scope at the
 * expansion site.
 */
#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) do {                \
        u32 t1, t2;                                                \
        t1 = h + e1(e) + Ch(e, f, g) + SHA256_K[i] + W[i];        \
        t2 = e0(a) + Maj(a, b, c);                                \
        d += t1;                                                \
        h = t1 + t2;                                                \
} while (0)

/*
 * Compress one message block into @state.
 * @state: the eight 32-bit chaining values, updated in place
 * @input: the block to process (16 big-endian 32-bit words are read)
 * @W:     caller-provided scratch space for the 64-word message schedule
 */
static void sha256_transform(u32 *state, const u8 *input, u32 *W)
{
        u32 a, b, c, d, e, f, g, h;
        int t;

        /* W[0..15]: the message block itself */
        for (t = 0; t < 16; t++)
                LOAD_OP(t, W, input);

        /* W[16..63]: expanded from the earlier schedule words */
        for (t = 16; t < 64; t++)
                BLEND_OP(t, W);

        /* working variables start out as the current chaining values */
        a = state[0];
        b = state[1];
        c = state[2];
        d = state[3];
        e = state[4];
        f = state[5];
        g = state[6];
        h = state[7];

        /*
         * 64 rounds, eight per iteration: the argument list is rotated by
         * one position each round, so after eight rounds every variable is
         * back in its original slot.
         */
        for (t = 0; t < 64; t += 8) {
                SHA256_ROUND(t + 0, a, b, c, d, e, f, g, h);
                SHA256_ROUND(t + 1, h, a, b, c, d, e, f, g);
                SHA256_ROUND(t + 2, g, h, a, b, c, d, e, f);
                SHA256_ROUND(t + 3, f, g, h, a, b, c, d, e);
                SHA256_ROUND(t + 4, e, f, g, h, a, b, c, d);
                SHA256_ROUND(t + 5, d, e, f, g, h, a, b, c);
                SHA256_ROUND(t + 6, c, d, e, f, g, h, a, b);
                SHA256_ROUND(t + 7, b, c, d, e, f, g, h, a);
        }

        /* fold the result back into the chaining values */
        state[0] += a;
        state[1] += b;
        state[2] += c;
        state[3] += d;
        state[4] += e;
        state[5] += f;
        state[6] += g;
        state[7] += h;
}

/*
 * Compress @blocks consecutive SHA256_BLOCK_SIZE-byte blocks from @input
 * into @sctx->state.
 *
 * A @blocks value of zero or less processes nothing, so a miscounted call
 * cannot run past the end of @input.  The message schedule scratch buffer is
 * wiped before returning so no message-derived data is left on the stack.
 */
static void sha256_transform_blocks(struct sha256_state *sctx,
                                    const u8 *input, int blocks)
{
        u32 W[64];

        while (blocks-- > 0) {
                sha256_transform(sctx->state, input, W);
                input += SHA256_BLOCK_SIZE;
        }

        memzero_explicit(W, sizeof(W));
}

/*
 * Feed @len bytes from @data into the hash state @sctx, using
 * sha256_transform_blocks() as the block function.
 */
void sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len)
{
        lib_sha256_base_do_update(sctx, data, len, sha256_transform_blocks);
}
EXPORT_SYMBOL(sha256_update);

/*
 * Common tail of sha256_final() and sha224_final(): finalize @sctx, then
 * hand @out and @digest_size to lib_sha256_base_finish().
 */
static void __sha256_final(struct sha256_state *sctx, u8 *out, int digest_size)
{
        lib_sha256_base_do_finalize(sctx, sha256_transform_blocks);
        lib_sha256_base_finish(sctx, out, digest_size);
}

/* Finish a SHA-256 computation, requesting a 32-byte digest into @out. */
void sha256_final(struct sha256_state *sctx, u8 *out)
{
        __sha256_final(sctx, out, 32);
}
EXPORT_SYMBOL(sha256_final);

/* Finish a SHA-224 computation, requesting a 28-byte digest into @out. */
void sha224_final(struct sha256_state *sctx, u8 *out)
{
        __sha256_final(sctx, out, 28);
}
EXPORT_SYMBOL(sha224_final);

/*
 * One-shot helper: hash @len bytes at @data and store the SHA-256 digest in
 * @out, using a hash state on the stack.
 */
void sha256(const u8 *data, unsigned int len, u8 *out)
{
        struct sha256_state sctx;

        sha256_init(&sctx);
        sha256_update(&sctx, data, len);
        sha256_final(&sctx, out);
}
EXPORT_SYMBOL(sha256);

MODULE_LICENSE("GPL");



























































































































































































    1 




























    4 





















    2 












    5 
    2 









    1 
    2 










   10 
    3 









    5 




    5 











    4 










    2 
    4 
    2 

    2 
    3 


































































































































    3 
    4 








    1 
    1 










    1 































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * net/dst.h        Protocol independent destination cache definitions.
 *
 * Authors:        Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#ifndef _NET_DST_H
#define _NET_DST_H

#include <net/dst_ops.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/rcupdate.h>
#include <linux/bug.h>
#include <linux/jiffies.h>
#include <linux/refcount.h>
#include <linux/rcuref.h>
#include <net/neighbour.h>
#include <asm/processor.h>
#include <linux/indirect_call_wrapper.h>

struct sk_buff;

/*
 * Protocol independent destination cache entry.
 *
 * Member order is deliberate: as the inline comments explain, __rcuref is
 * kept on a different cache line from input/output/ops, and lwtstate is
 * placed differently on 32-bit and 64-bit builds.  Do not reorder members
 * without re-checking those constraints.
 */
struct dst_entry {
        struct net_device       *dev;
        struct  dst_ops                *ops;
        unsigned long                _metrics;
        unsigned long           expires;
#ifdef CONFIG_XFRM
        struct xfrm_state        *xfrm;
#else
        void                        *__pad1;
#endif
        int                        (*input)(struct sk_buff *);
        int                        (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);

        unsigned short                flags;
#define DST_NOXFRM                0x0002
#define DST_NOPOLICY                0x0004
#define DST_NOCOUNT                0x0008
#define DST_FAKE_RTABLE                0x0010
#define DST_XFRM_TUNNEL                0x0020
#define DST_XFRM_QUEUE                0x0040
#define DST_METADATA                0x0080

        /* A non-zero value of dst->obsolete forces by-hand validation
         * of the route entry.  Positive values are set by the generic
         * dst layer to indicate that the entry has been forcefully
         * destroyed.
         *
         * Negative values are used by the implementation layer code to
         * force invocation of the dst_ops->check() method.
         */
        short                        obsolete;
#define DST_OBSOLETE_NONE        0
#define DST_OBSOLETE_DEAD        2
#define DST_OBSOLETE_FORCE_CHK        -1
#define DST_OBSOLETE_KILL        -2
        unsigned short                header_len;        /* more space at head required */
        unsigned short                trailer_len;        /* space to reserve at tail */

        /*
         * __rcuref wants to be on a different cache line from
         * input/output/ops or performance tanks badly
         */
#ifdef CONFIG_64BIT
        rcuref_t                __rcuref;        /* 64-bit offset 64 */
#endif
        int                        __use;
        unsigned long                lastuse;
        struct rcu_head                rcu_head;
        short                        error;
        short                        __pad;
        __u32                        tclassid;
#ifndef CONFIG_64BIT
        struct lwtunnel_state   *lwtstate;
        rcuref_t                __rcuref;        /* 32-bit offset 64 */
#endif
        netdevice_tracker        dev_tracker;

        /*
         * Used by rtable and rt6_info. Moves lwtstate into the next cache
         * line on 64bit so that lwtstate does not cause false sharing with
         * __rcuref under contention of __rcuref. This also puts the
         * frequently accessed members of rtable and rt6_info out of the
         * __rcuref cache line.
         */
        struct list_head        rt_uncached;
        struct uncached_list        *rt_uncached_list;
#ifdef CONFIG_64BIT
        struct lwtunnel_state   *lwtstate;
#endif
};

/*
 * Metric storage: an array of RTAX_MAX u32 values plus a reference count.
 * The 4-byte alignment guarantees that the two low bits of a pointer to
 * this structure are zero, which is what allows DST_METRICS_FLAGS (0x3)
 * to be stored in them.
 */
struct dst_metrics {
        u32                metrics[RTAX_MAX];
        refcount_t        refcnt;
} __aligned(4);                /* Low pointer bits contain DST_METRICS_FLAGS */
extern const struct dst_metrics dst_default_metrics;

u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old);

/* Flag bits kept in the low bits of the dst_entry _metrics word. */
#define DST_METRICS_READ_ONLY                0x1UL
#define DST_METRICS_REFCOUNTED                0x2UL
/* Mask covering both flag bits above. */
#define DST_METRICS_FLAGS                0x3UL
/* Strip the flag bits from a raw _metrics word, leaving the u32 array pointer. */
#define __DST_METRICS_PTR(Y)        \
        ((u32 *)((Y) & ~DST_METRICS_FLAGS))
/* Same, starting from a pointer to an object that has a _metrics member. */
#define DST_METRICS_PTR(X)        __DST_METRICS_PTR((X)->_metrics)

/* True when the DST_METRICS_READ_ONLY bit is set in @dst's metrics word. */
static inline bool dst_metrics_read_only(const struct dst_entry *dst)
{
        return (dst->_metrics & DST_METRICS_READ_ONLY) != 0;
}

void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old);

/*
 * Hand @dst's metrics to __dst_destroy_metrics_generic(), unless they are
 * flagged read-only, in which case nothing is done.
 */
static inline void dst_destroy_metrics_generic(struct dst_entry *dst)
{
        unsigned long metrics = dst->_metrics;

        if (metrics & DST_METRICS_READ_ONLY)
                return;

        __dst_destroy_metrics_generic(dst, metrics);
}

/*
 * Return a pointer to @dst's metrics array suitable for writing.
 *
 * A zero metrics word is a BUG.  If the metrics are flagged read-only the
 * dst_ops cow_metrics() callback supplies the pointer (and its result is
 * returned as is); otherwise the flag bits are simply masked off.
 */
static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst)
{
        unsigned long raw = dst->_metrics;

        BUG_ON(!raw);

        if (!(raw & DST_METRICS_READ_ONLY))
                return __DST_METRICS_PTR(raw);

        return dst->ops->cow_metrics(dst, raw);
}

/*
 * Install @src_metrics as the metrics pointer of @dst, tagging it with
 * DST_METRICS_READ_ONLY when @read_only is set.
 *
 * This may only be invoked before the entry has reached global
 * visibility.
 */
static inline void dst_init_metrics(struct dst_entry *dst,
                                    const u32 *src_metrics,
                                    bool read_only)
{
        unsigned long tag = 0;

        if (read_only)
                tag = DST_METRICS_READ_ONLY;

        dst->_metrics = (unsigned long)src_metrics | tag;
}

/*
 * Copy all RTAX_MAX metric values from @src into @dest.  Nothing is copied
 * when no writable metrics pointer can be obtained for @dest.
 */
static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src)
{
        u32 *to = dst_metrics_write_ptr(dest);

        if (!to)
                return;

        memcpy(to, DST_METRICS_PTR(src), RTAX_MAX * sizeof(u32));
}

static inline u32 *dst_metrics_ptr(struct dst_entry *dst)
{
        return DST_METRICS_PTR(dst);
}

static inline u32
dst_metric_raw(const struct dst_entry *dst, const int metric)
{
        u32 *p = DST_METRICS_PTR(dst);

        return p[metric-1];
}

/*
 * Read metric @metric of @dst.
 *
 * Warns (once) when asked for RTAX_HOPLIMIT, RTAX_ADVMSS or RTAX_MTU;
 * presumably those are meant to be read through dedicated helpers such
 * as dst_metric_advmss() -- TODO confirm.  The raw value is returned
 * either way.
 */
static inline u32
dst_metric(const struct dst_entry *dst, const int metric)
{
        int special = metric == RTAX_HOPLIMIT ||
                      metric == RTAX_ADVMSS ||
                      metric == RTAX_MTU;

        WARN_ON_ONCE(special);

        return dst_metric_raw(dst, metric);
}

/*
 * Return the RTAX_ADVMSS metric of @dst, falling back to the dst_ops
 * default_advmss() callback when the stored metric is zero.
 */
static inline u32
dst_metric_advmss(const struct dst_entry *dst)
{
        u32 stored = dst_metric_raw(dst, RTAX_ADVMSS);

        if (stored)
                return stored;

        return dst->ops->default_advmss(dst);
}

/*
 * Store @val as metric @metric (1-based) of @dst.  Silently does nothing
 * when no writable metrics pointer is available.
 */
static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val)
{
        u32 *slots = dst_metrics_write_ptr(dst);

        if (!slots)
                return;

        slots[metric - 1] = val;
}

/* Kernel-internal feature bits that are unallocated in user space. */
#define DST_FEATURE_ECN_CA        (1U << 31)

/* All kernel-internal feature bits (currently only DST_FEATURE_ECN_CA). */
#define DST_FEATURE_MASK        (DST_FEATURE_ECN_CA)
/* The internal ECN bit combined with RTAX_FEATURE_ECN. */
#define DST_FEATURE_ECN_MASK        (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN)

/* Return the bits of @feature that are set in @dst's RTAX_FEATURES metric. */
static inline u32
dst_feature(const struct dst_entry *dst, u32 feature)
{
        u32 enabled = dst_metric(dst, RTAX_FEATURES);

        return enabled & feature;
}

INDIRECT_CALLABLE_DECLARE(unsigned int ip6_mtu(const struct dst_entry *));
INDIRECT_CALLABLE_DECLARE(unsigned int ipv4_mtu(const struct dst_entry *));
/*
 * Return the MTU reported by the dst_ops mtu() callback of @dst.  The call
 * goes through INDIRECT_CALL_INET with ip6_mtu/ipv4_mtu as candidates.
 */
static inline u32 dst_mtu(const struct dst_entry *dst)
{
        u32 mtu = INDIRECT_CALL_INET(dst->ops->mtu, ip6_mtu, ipv4_mtu, dst);

        return mtu;
}

/*
 * RTT metrics are stored in milliseconds for user ABI, but used as jiffies:
 * read metric @metric and convert it with msecs_to_jiffies().
 */
static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metric)
{
        unsigned int msecs = dst_metric(dst, metric);

        return msecs_to_jiffies(msecs);
}

/*
 * Non-zero when bit @metric is set in the RTAX_LOCK metric of @dst.
 * The result is the masked bit itself, not a normalised 0/1.
 */
static inline int
dst_metric_locked(const struct dst_entry *dst, int metric)
{
        unsigned int locks = dst_metric(dst, RTAX_LOCK);

        return locks & (1 << metric);
}

/*
 * Take a reference on @dst through its __rcuref.  A failed rcuref_get()
 * only triggers a WARN_ON; the caller gets no indication of the failure.
 */
static inline void dst_hold(struct dst_entry *dst)
{
        /*
         * If your kernel compilation stops here, please check
         * the placement of __rcuref in struct dst_entry
         * (its offset must be a multiple of 64 bytes).
         */
        BUILD_BUG_ON(offsetof(struct dst_entry, __rcuref) & 63);
        WARN_ON(!rcuref_get(&dst->__rcuref));
}

/*
 * Account one use of @dst at timestamp @time without taking a reference.
 * The use counter and lastuse are only touched when @time differs from the
 * recorded lastuse.
 */
static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
{
        if (!unlikely(time != dst->lastuse))
                return;

        dst->__use++;
        dst->lastuse = time;
}

/* NULL-tolerant dst_hold(): takes a reference if @dst is set, returns @dst. */
static inline struct dst_entry *dst_clone(struct dst_entry *dst)
{
        if (!dst)
                return dst;

        dst_hold(dst);
        return dst;
}

void dst_release(struct dst_entry *dst);

void dst_release_immediate(struct dst_entry *dst);

/*
 * Release the dst encoded in @refdst, unless the SKB_DST_NOREF bit says no
 * reference was taken.  The pointer is recovered with SKB_DST_PTRMASK.
 */
static inline void refdst_drop(unsigned long refdst)
{
        if (refdst & SKB_DST_NOREF)
                return;

        dst_release((struct dst_entry *)(refdst & SKB_DST_PTRMASK));
}

/**
 * skb_dst_drop - drops skb dst
 * @skb: buffer
 *
 * Drops dst reference count if a reference was taken, then clears the
 * skb's dst word.  A skb without a dst is left untouched.
 */
static inline void skb_dst_drop(struct sk_buff *skb)
{
        unsigned long refdst = skb->_skb_refdst;

        if (!refdst)
                return;

        refdst_drop(refdst);
        skb->_skb_refdst = 0UL;
}

/*
 * Attach the dst word @refdst to @nskb.  slow_gro is set when @refdst is
 * non-zero, and a reference is taken via dst_clone() unless @refdst carries
 * the SKB_DST_NOREF bit.
 */
static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst)
{
        nskb->slow_gro |= (refdst != 0);
        nskb->_skb_refdst = refdst;

        if (nskb->_skb_refdst & SKB_DST_NOREF)
                return;

        dst_clone(skb_dst(nskb));
}

/* Give @nskb the same dst word as @oskb (see __skb_dst_copy()). */
static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb)
{
        unsigned long refdst = oskb->_skb_refdst;

        __skb_dst_copy(nskb, refdst);
}

/**
 * dst_hold_safe - Take a reference on a dst if possible
 * @dst: pointer to dst entry
 *
 * Returns the result of rcuref_get() on the dst's __rcuref: false means
 * no reference could safely be taken.
 */
static inline bool dst_hold_safe(struct dst_entry *dst)
{
        bool held = rcuref_get(&dst->__rcuref);

        return held;
}

/**
 * skb_dst_force - makes sure skb dst is refcounted
 * @skb: buffer
 *
 * If dst is not yet refcounted and not destroyed, grab a ref on it.
 * When the reference cannot be taken the skb's dst is cleared.
 * Returns true if dst is refcounted.
 */
static inline bool skb_dst_force(struct sk_buff *skb)
{
        struct dst_entry *dst;

        /* Already refcounted (or no dst at all): just report the state. */
        if (!skb_dst_is_noref(skb))
                return skb->_skb_refdst != 0UL;

        dst = skb_dst(skb);

        WARN_ON(!rcu_read_lock_held());
        if (!dst_hold_safe(dst))
                dst = NULL;

        skb->_skb_refdst = (unsigned long)dst;
        skb->slow_gro |= !!dst;

        return skb->_skb_refdst != 0UL;
}


/**
 *        __skb_tunnel_rx - prepare skb for rx reinsert
 *        @skb: buffer
 *        @dev: tunnel device
 *        @net: netns for packet i/o
 *
 *        After decapsulation, packet is going to re-enter (netif_rx()) our stack,
 *        so make some cleanups. (no accounting done)
 */
static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
                                   struct net *net)
{
        skb->dev = dev;

        /*
         * Clear hash so that we can recalculate the hash for the
         * encapsulated packet, unless we have already determined the hash
         * over the L4 4-tuple.
         */
        skb_clear_hash_if_not_l4(skb);
        skb_set_queue_mapping(skb, 0);
        /* Second argument is true only when @net differs from @dev's netns. */
        skb_scrub_packet(skb, !net_eq(net, dev_net(dev)));
}

/**
 *        skb_tunnel_rx - prepare skb for rx reinsert
 *        @skb: buffer
 *        @dev: tunnel device
 *        @net: netns for packet i/o
 *
 *        After decapsulation, packet is going to re-enter (netif_rx()) our stack,
 *        so make some cleanups, and perform accounting.
 *        Note: this accounting is not SMP safe.
 *        NOTE(review): the "not SMP safe" remark may predate the DEV_STATS_*
 *        helpers used below -- confirm whether it still applies.
 */
static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
                                 struct net *net)
{
        /* Account one received packet of skb->len bytes against @dev. */
        DEV_STATS_INC(dev, rx_packets);
        DEV_STATS_ADD(dev, rx_bytes, skb->len);
        __skb_tunnel_rx(skb, dev, net);
}

/*
 * Return the tclassid of the dst attached to @skb.  Yields 0 when the skb
 * has no dst or when CONFIG_IP_ROUTE_CLASSID is not enabled.
 */
static inline u32 dst_tclassid(const struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_CLASSID
        const struct dst_entry *route = skb_dst(skb);

        if (route)
                return route->tclassid;
#endif
        return 0;
}

int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
/*
 * Single-argument wrapper around dst_discard_out(): passes &init_net as the
 * namespace and the skb's own socket.
 */
static inline int dst_discard(struct sk_buff *skb)
{
        struct sock *owner = skb->sk;

        return dst_discard_out(&init_net, owner, skb);
}
void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
                int initial_obsolete, unsigned short flags);
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
              struct net_device *dev, int initial_obsolete,
              unsigned short flags);
void dst_dev_put(struct dst_entry *dst);

/* Intentionally empty: @dst is ignored and nothing is done. */
static inline void dst_confirm(struct dst_entry *dst)
{
}

/*
 * Look up the neighbour entry for address @daddr through the dst_ops
 * neigh_lookup() callback of @dst (called with a NULL skb).
 *
 * Returns the neighbour on success.  Returns NULL when the callback yields
 * an ERR_PTR value, or (after a one-time warning) when the dst_ops does not
 * provide the callback at all.
 */
static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
        struct neighbour *n;

        /*
         * The op may be unset; dst_neigh_lookup_skb() guards against the
         * same case, so fail the lookup instead of calling a NULL pointer.
         */
        if (WARN_ON_ONCE(!dst->ops->neigh_lookup))
                return NULL;

        n = dst->ops->neigh_lookup(dst, NULL, daddr);

        return IS_ERR(n) ? NULL : n;
}

/*
 * Look up the neighbour entry for @skb through the dst_ops neigh_lookup()
 * callback of @dst (called with a NULL address).
 *
 * Returns NULL when the callback is missing (warning once) or when it
 * returns an ERR_PTR value; otherwise returns the neighbour.
 */
static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst,
                                                     struct sk_buff *skb)
{
        struct neighbour *neigh;

        if (WARN_ON_ONCE(!dst->ops->neigh_lookup))
                return NULL;

        neigh = dst->ops->neigh_lookup(dst, skb, NULL);
        if (IS_ERR(neigh))
                return NULL;

        return neigh;
}

/* Invoke the optional dst_ops confirm_neigh() callback for @daddr. */
static inline void dst_confirm_neigh(const struct dst_entry *dst,
                                     const void *daddr)
{
        if (!dst->ops->confirm_neigh)
                return;

        dst->ops->confirm_neigh(dst, daddr);
}

/*
 * Report a link failure for @skb through the link_failure() callback of its
 * dst.  Does nothing if the skb has no dst, the dst has no ops, or the ops
 * lack the callback.
 */
static inline void dst_link_failure(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        if (!dst || !dst->ops || !dst->ops->link_failure)
                return;

        dst->ops->link_failure(skb);
}

/*
 * Set @dst to expire @timeout jiffies from now, but never push an existing
 * expiry later: the new value is stored only when no expiry is recorded
 * (expires == 0) or when it is earlier than the recorded one.
 *
 * Because a stored 0 is treated as "not set", a computed expiry of exactly
 * 0 is bumped to 1.
 */
static inline void dst_set_expires(struct dst_entry *dst, int timeout)
{
        unsigned long when = jiffies + timeout;

        if (!when)
                when = 1;

        /* Keep the current expiry if it is set and not later than @when. */
        if (dst->expires != 0 && !time_before(when, dst->expires))
                return;

        dst->expires = when;
}

INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *,
                                         struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *,
                                         struct sk_buff *));
/* Output packet to network from transport.  */
static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        return INDIRECT_CALL_INET(skb_dst(skb)->output,
                                  ip6_output, ip_output,
                                  net, sk, skb);
}

INDIRECT_CALLABLE_DECLARE(int ip6_input(struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *));
/* Input packet from network to transport.  */
static inline int dst_input(struct sk_buff *skb)
{
        return INDIRECT_CALL_INET(skb_dst(skb)->input,
                                  ip6_input, ip_local_deliver, skb);
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
                                                          u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
                                                           u32));
/*
 * Validate @dst.  An entry whose obsolete field is zero is returned as is;
 * otherwise the dst_ops check() callback decides, and its result (called
 * with @cookie) is returned.
 */
static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
{
        if (!dst->obsolete)
                return dst;

        return INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check,
                                  ipv4_dst_check, dst, cookie);
}

/*
 * Flags for xfrm_lookup flags argument.  Each flag occupies its own bit,
 * so several can be OR-ed together.
 */
enum {
        XFRM_LOOKUP_ICMP = 1 << 0,
        XFRM_LOOKUP_QUEUE = 1 << 1,
        XFRM_LOOKUP_KEEP_DST_REF = 1 << 2,
};

struct flowi;
#ifndef CONFIG_XFRM
/*
 * Stub used when CONFIG_XFRM is not set: every argument except @dst_orig
 * is ignored and @dst_orig is returned unchanged.
 */
static inline struct dst_entry *xfrm_lookup(struct net *net,
                                            struct dst_entry *dst_orig,
                                            const struct flowi *fl,
                                            const struct sock *sk,
                                            int flags)
{
        return dst_orig;
}

/*
 * Stub used when CONFIG_XFRM is not set: every argument except @dst_orig
 * (including @if_id) is ignored and @dst_orig is returned unchanged.
 */
static inline struct dst_entry *
xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig,
                      const struct flowi *fl, const struct sock *sk,
                      int flags, u32 if_id)
{
        return dst_orig;
}

/*
 * Stub used when CONFIG_XFRM is not set: every argument except @dst_orig
 * is ignored and @dst_orig is returned unchanged.
 */
static inline struct dst_entry *xfrm_lookup_route(struct net *net,
                                                  struct dst_entry *dst_orig,
                                                  const struct flowi *fl,
                                                  const struct sock *sk,
                                                  int flags)
{
        return dst_orig;
}

/* Stub used when CONFIG_XFRM is not set: a dst never has an xfrm state. */
static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst)
{
        return NULL;
}

#else
struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
                              const struct flowi *fl, const struct sock *sk,
                              int flags);

struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
                                        struct dst_entry *dst_orig,
                                        const struct flowi *fl,
                                        const struct sock *sk, int flags,
                                        u32 if_id);

struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
                                    const struct flowi *fl, const struct sock *sk,
                                    int flags);

/*
 * Return the xfrm state stored in @dst.
 * skb attached with this dst needs transformation if dst->xfrm is valid.
 */
static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst)
{
        return dst->xfrm;
}
#endif

/*
 * Pass a new path MTU @mtu to the update_pmtu() callback of the dst
 * attached to @skb, asking for neighbour confirmation (last argument true).
 * No-op when the skb has no dst or the ops lack the callback.
 */
static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu)
{
        struct dst_entry *route = skb_dst(skb);

        if (!route || !route->ops->update_pmtu)
                return;

        route->ops->update_pmtu(route, NULL, skb, mtu, true);
}

/*
 * Update the dst pmtu without neighbour confirmation: same as
 * skb_dst_update_pmtu() except that the callback's last argument is false.
 */
static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu)
{
        struct dst_entry *route = skb_dst(skb);

        if (!route || !route->ops->update_pmtu)
                return;

        route->ops->update_pmtu(route, NULL, skb, mtu, false);
}

struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie);
void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
                               struct sk_buff *skb, u32 mtu, bool confirm_neigh);
void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                            struct sk_buff *skb);
u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old);
struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst,
                                             struct sk_buff *skb,
                                             const void *daddr);
unsigned int dst_blackhole_mtu(const struct dst_entry *dst);

#endif /* _NET_DST_H */


































































































































































































































































    6 


















    1 














































































    5 










































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Operations on the network namespace
 */
#ifndef __NET_NET_NAMESPACE_H
#define __NET_NET_NAMESPACE_H

#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/sysctl.h>
#include <linux/uidgid.h>

#include <net/flow.h>
#include <net/netns/core.h>
#include <net/netns/mib.h>
#include <net/netns/unix.h>
#include <net/netns/packet.h>
#include <net/netns/ipv4.h>
#include <net/netns/ipv6.h>
#include <net/netns/nexthop.h>
#include <net/netns/ieee802154_6lowpan.h>
#include <net/netns/sctp.h>
#include <net/netns/netfilter.h>
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#include <net/netns/conntrack.h>
#endif
#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
#include <net/netns/flow_table.h>
#endif
#include <net/netns/nftables.h>
#include <net/netns/xfrm.h>
#include <net/netns/mpls.h>
#include <net/netns/can.h>
#include <net/netns/xdp.h>
#include <net/netns/smc.h>
#include <net/netns/bpf.h>
#include <net/netns/mctp.h>
#include <net/net_trackers.h>
#include <linux/ns_common.h>
#include <linux/idr.h>
#include <linux/skbuff.h>
#include <linux/notifier.h>
#include <linux/xarray.h>

struct user_namespace;
struct proc_dir_entry;
struct net_device;
struct sock;
struct ctl_table_header;
struct net_generic;
struct uevent_sock;
struct netns_ipvs;
struct bpf_prog;


#define NETDEV_HASHBITS    8
#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)

/*
 * State of one network namespace: core bookkeeping first, followed by the
 * per-subsystem netns_* blocks.  Many members exist only when the matching
 * CONFIG option is enabled, and the layout is randomized
 * (__randomize_layout), so code must not rely on member order.
 */
struct net {
        /* First cache line can be often dirtied.
         * Do not place here read-mostly fields.
         */
        refcount_t                passive;        /* To decide when the network
                                                 * namespace should be freed.
                                                 */
        spinlock_t                rules_mod_lock;

        unsigned int                dev_base_seq;        /* protected by rtnl_mutex */
        u32                        ifindex;

        spinlock_t                nsid_lock;
        atomic_t                fnhe_genid;

        struct list_head        list;                /* list of network namespaces */
        struct list_head        exit_list;        /* To be linked to call pernet exit
                                                 * methods on dead net (
                                                 * pernet_ops_rwsem read locked),
                                                 * or to unregister pernet ops
                                                 * (pernet_ops_rwsem write locked).
                                                 */
        struct llist_node        cleanup_list;        /* namespaces on death row */

#ifdef CONFIG_KEYS
        struct key_tag                *key_domain;        /* Key domain of operation tag */
#endif
        struct user_namespace   *user_ns;        /* Owning user namespace */
        struct ucounts                *ucounts;
        struct idr                netns_ids;

        struct ns_common        ns;
        struct ref_tracker_dir  refcnt_tracker;
        struct ref_tracker_dir  notrefcnt_tracker; /* tracker for objects not
                                                    * refcounted against netns
                                                    */
        struct list_head         dev_base_head;
        struct proc_dir_entry         *proc_net;
        struct proc_dir_entry         *proc_net_stat;

#ifdef CONFIG_SYSCTL
        struct ctl_table_set        sysctls;
#endif

        struct sock                 *rtnl;                        /* rtnetlink socket */
        struct sock                *genl_sock;

        struct uevent_sock        *uevent_sock;                /* uevent socket */

        struct hlist_head         *dev_name_head;
        struct hlist_head        *dev_index_head;
        struct xarray                dev_by_index;
        struct raw_notifier_head        netdev_chain;

        /* Note that @hash_mix can be read millions of times per second,
         * it is critical that it is on a read_mostly cache line.
         */
        u32                        hash_mix;

        struct net_device       *loopback_dev;          /* The loopback */

        /* core fib_rules */
        struct list_head        rules_ops;

        struct netns_core        core;
        struct netns_mib        mib;
        struct netns_packet        packet;
#if IS_ENABLED(CONFIG_UNIX)
        struct netns_unix        unx;
#endif
        struct netns_nexthop        nexthop;
        struct netns_ipv4        ipv4;
#if IS_ENABLED(CONFIG_IPV6)
        struct netns_ipv6        ipv6;
#endif
#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
        struct netns_ieee802154_lowpan        ieee802154_lowpan;
#endif
#if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE)
        struct netns_sctp        sctp;
#endif
#ifdef CONFIG_NETFILTER
        struct netns_nf                nf;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        struct netns_ct                ct;
#endif
#if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE)
        struct netns_nftables        nft;
#endif
#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
        struct netns_ft ft;
#endif
#endif
#ifdef CONFIG_WEXT_CORE
        struct sk_buff_head        wext_nlevents;
#endif
        struct net_generic __rcu        *gen;

        /* Used to store attached BPF programs */
        struct netns_bpf        bpf;

        /* Note : following structs are cache line aligned */
#ifdef CONFIG_XFRM
        struct netns_xfrm        xfrm;
#endif

        u64                        net_cookie; /* written once */

#if IS_ENABLED(CONFIG_IP_VS)
        struct netns_ipvs        *ipvs;
#endif
#if IS_ENABLED(CONFIG_MPLS)
        struct netns_mpls        mpls;
#endif
#if IS_ENABLED(CONFIG_CAN)
        struct netns_can        can;
#endif
#ifdef CONFIG_XDP_SOCKETS
        struct netns_xdp        xdp;
#endif
#if IS_ENABLED(CONFIG_MCTP)
        struct netns_mctp        mctp;
#endif
#if IS_ENABLED(CONFIG_CRYPTO_USER)
        struct sock                *crypto_nlsk;
#endif
        struct sock                *diag_nlsk;
#if IS_ENABLED(CONFIG_SMC)
        struct netns_smc        smc;
#endif
} __randomize_layout;

#include <linux/seq_file_net.h>

/* Init's network namespace */
extern struct net init_net;

#ifdef CONFIG_NET_NS
struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns,
                        struct net *old_net);

void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid);

void net_ns_barrier(void);

struct ns_common *get_net_ns(struct ns_common *ns);
struct net *get_net_ns_by_fd(int fd);
#else /* CONFIG_NET_NS */
#include <linux/sched.h>
#include <linux/nsproxy.h>
/*
 * Stub used when CONFIG_NET_NS is not set: a new network namespace cannot
 * be created, so a request carrying CLONE_NEWNET fails with
 * ERR_PTR(-EINVAL); any other request just gets @old_net back.
 */
static inline struct net *copy_net_ns(unsigned long flags,
        struct user_namespace *user_ns, struct net *old_net)
{
        if (!(flags & CLONE_NEWNET))
                return old_net;

        return ERR_PTR(-EINVAL);
}

/*
 * Stub used when CONFIG_NET_NS is not set: @net is ignored and ownership
 * is always reported as GLOBAL_ROOT_UID / GLOBAL_ROOT_GID.
 */
static inline void net_ns_get_ownership(const struct net *net,
                                        kuid_t *uid, kgid_t *gid)
{
        *gid = GLOBAL_ROOT_GID;
        *uid = GLOBAL_ROOT_UID;
}

/* Stub used when CONFIG_NET_NS is not set: nothing to wait for. */
static inline void net_ns_barrier(void) {}

/* Stub used when CONFIG_NET_NS is not set: always fails with -EINVAL. */
static inline struct ns_common *get_net_ns(struct ns_common *ns)
{
        return ERR_PTR(-EINVAL);
}

/* Stub used when CONFIG_NET_NS is not set: always fails with -EINVAL. */
static inline struct net *get_net_ns_by_fd(int fd)
{
        return ERR_PTR(-EINVAL);
}
#endif /* CONFIG_NET_NS */


extern struct list_head net_namespace_list;

struct net *get_net_ns_by_pid(pid_t pid);

/*
 * With CONFIG_SYSCTL the two ipx sysctl hooks are real functions declared
 * here; without it they are empty macros, so call sites expand to nothing.
 */
#ifdef CONFIG_SYSCTL
void ipx_register_sysctl(void);
void ipx_unregister_sysctl(void);
#else
#define ipx_register_sysctl()
#define ipx_unregister_sysctl()
#endif

#ifdef CONFIG_NET_NS
void __put_net(struct net *net);

/*
 * Take a reference on @net by incrementing net->ns.count; returns @net so
 * the call can be chained.
 * Try using get_net_track() instead.
 */
static inline struct net *get_net(struct net *net)
{
        refcount_inc(&net->ns.count);
        return net;
}

/*
 * Try to take a reference on a namespace that is known to exist but for
 * which the caller holds no reference of its own.
 *
 * Returns @net with its count incremented, or NULL if the count had
 * already dropped to zero.
 */
static inline struct net *maybe_get_net(struct net *net)
{
        if (refcount_inc_not_zero(&net->ns.count))
                return net;

        return NULL;
}

/*
 * Drop one reference on @net; when the count reaches zero the namespace is
 * handed to __put_net().
 * Try using put_net_track() instead.
 */
static inline void put_net(struct net *net)
{
        if (!refcount_dec_and_test(&net->ns.count))
                return;

        __put_net(net);
}

/* Return 1 when @net1 and @net2 are the same namespace object, else 0. */
static inline
int net_eq(const struct net *net1, const struct net *net2)
{
        return !(net1 != net2);
}

/* Return 1 while @net still has a non-zero reference count, else 0. */
static inline int check_net(const struct net *net)
{
        if (refcount_read(&net->ns.count) == 0)
                return 0;

        return 1;
}

void net_drop_ns(void *);

#else

/* Stub used when CONFIG_NET_NS is not set: no counting, returns @net. */
static inline struct net *get_net(struct net *net)
{
        return net;
}

/* Stub used when CONFIG_NET_NS is not set: nothing to release. */
static inline void put_net(struct net *net)
{
}

/* Stub used when CONFIG_NET_NS is not set: always succeeds, returns @net. */
static inline struct net *maybe_get_net(struct net *net)
{
        return net;
}

/*
 * Stub used when CONFIG_NET_NS is not set: both arguments are ignored and
 * any two namespaces compare equal.
 */
static inline
int net_eq(const struct net *net1, const struct net *net2)
{
        return 1;
}

/* Stub used when CONFIG_NET_NS is not set: always reports 1. */
static inline int check_net(const struct net *net)
{
        return 1;
}

#define net_drop_ns NULL
#endif


/*
 * Register @tracker with one of @net's reference-tracker directories:
 * refcnt_tracker when @refcounted is true, notrefcnt_tracker otherwise.
 * @gfp is passed through to ref_tracker_alloc().
 *
 * Compiles to an empty function without CONFIG_NET_NS_REFCNT_TRACKER.
 */
static inline void __netns_tracker_alloc(struct net *net,
                                         netns_tracker *tracker,
                                         bool refcounted,
                                         gfp_t gfp)
{
#ifdef CONFIG_NET_NS_REFCNT_TRACKER
        if (refcounted)
                ref_tracker_alloc(&net->refcnt_tracker, tracker, gfp);
        else
                ref_tracker_alloc(&net->notrefcnt_tracker, tracker, gfp);
#endif
}

/* __netns_tracker_alloc() for the refcounted case (refcounted == true). */
static inline void netns_tracker_alloc(struct net *net, netns_tracker *tracker,
                                       gfp_t gfp)
{
        __netns_tracker_alloc(net, tracker, true, gfp);
}

/*
 * Remove @tracker from one of @net's reference-tracker directories:
 * refcnt_tracker when @refcounted is true, notrefcnt_tracker otherwise.
 *
 * Compiles to an empty function without CONFIG_NET_NS_REFCNT_TRACKER.
 */
static inline void __netns_tracker_free(struct net *net,
                                        netns_tracker *tracker,
                                        bool refcounted)
{
#ifdef CONFIG_NET_NS_REFCNT_TRACKER
        if (refcounted)
                ref_tracker_free(&net->refcnt_tracker, tracker);
        else
                ref_tracker_free(&net->notrefcnt_tracker, tracker);
#endif
}

/*
 * get_net() plus tracker registration: takes a reference on @net, records
 * it in @tracker (allocating with @gfp) and returns @net.
 */
static inline struct net *get_net_track(struct net *net,
                                        netns_tracker *tracker, gfp_t gfp)
{
        struct net *held = get_net(net);

        netns_tracker_alloc(held, tracker, gfp);

        return held;
}

/*
 * Counterpart of get_net_track(): the tracker entry is released first,
 * then the reference on @net is dropped.
 */
static inline void put_net_track(struct net *net, netns_tracker *tracker)
{
        __netns_tracker_free(net, tracker, true);
        put_net(net);
}

/*
 * Wrapper for a namespace pointer that only occupies space when
 * CONFIG_NET_NS is enabled; without it the structure is empty.
 * Access it through write_pnet() / read_pnet() / read_pnet_rcu().
 */
typedef struct {
#ifdef CONFIG_NET_NS
        struct net __rcu *net;
#endif
} possible_net_t;

/*
 * Store @net in @pnet using rcu_assign_pointer().  Without CONFIG_NET_NS
 * there is nothing to store and the function body is empty.
 */
static inline void write_pnet(possible_net_t *pnet, struct net *net)
{
#ifdef CONFIG_NET_NS
        rcu_assign_pointer(pnet->net, net);
#endif
}

/*
 * Return the namespace stored in @pnet, or &init_net when CONFIG_NET_NS is
 * not set.
 *
 * The pointer is fetched with rcu_dereference_protected() and a constant
 * `true` protection condition.
 * NOTE(review): presumably callers guarantee the pointer cannot change
 * underneath them; use read_pnet_rcu() otherwise -- confirm.
 */
static inline struct net *read_pnet(const possible_net_t *pnet)
{
#ifdef CONFIG_NET_NS
        return rcu_dereference_protected(pnet->net, true);
#else
        return &init_net;
#endif
}

/*
 * Variant of read_pnet() that fetches the pointer with rcu_dereference().
 * Returns &init_net when CONFIG_NET_NS is not set.
 */
static inline struct net *read_pnet_rcu(possible_net_t *pnet)
{
#ifdef CONFIG_NET_NS
        return rcu_dereference(pnet->net);
#else
        return &init_net;
#endif
}

/*
 * Iterators over net_namespace_list, linked through the `list` member of
 * struct net.  VAR is the struct net cursor.
 * Protected by net_rwsem
 */
#define for_each_net(VAR)                                \
        list_for_each_entry(VAR, &net_namespace_list, list)
/* Continue backwards from the current position of VAR. */
#define for_each_net_continue_reverse(VAR)                \
        list_for_each_entry_continue_reverse(VAR, &net_namespace_list, list)
/* Same walk using the RCU list iterator. */
#define for_each_net_rcu(VAR)                                \
        list_for_each_entry_rcu(VAR, &net_namespace_list, list)

/*
 * Section annotations for per-namespace init/exit code and data.
 * With CONFIG_NET_NS they expand to nothing; without it they map to
 * __init, __ref, __initdata and __initconst respectively.
 */
#ifdef CONFIG_NET_NS
#define __net_init
#define __net_exit
#define __net_initdata
#define __net_initconst
#else
#define __net_init        __init
#define __net_exit        __ref
#define __net_initdata        __initdata
#define __net_initconst        __initconst
#endif

int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp);
int peernet2id(const struct net *net, struct net *peer);
bool peernet_has_id(const struct net *net, struct net *peer);
struct net *get_net_ns_by_id(const struct net *net, int id);

/*
 * Set of per-network-namespace callbacks, registered with
 * register_pernet_subsys() or register_pernet_device().
 */
struct pernet_operations {
        struct list_head list;
        /*
         * Below methods are called without any exclusive locks.
         * More than one net may be constructed and destructed
         * in parallel on several cpus. Every pernet_operations
         * have to keep in mind all other pernet_operations and
         * to introduce a locking, if they share common resources.
         *
         * The only time they are called with exclusive lock is
         * from register_pernet_subsys(), unregister_pernet_subsys()
         * register_pernet_device() and unregister_pernet_device().
         *
         * Exit methods using blocking RCU primitives, such as
         * synchronize_rcu(), should be implemented via exit_batch.
         * Then, destruction of a group of net requires single
         * synchronize_rcu() related to these pernet_operations,
         * instead of separate synchronize_rcu() for every net.
         * Please, avoid synchronize_rcu() at all, where it's possible.
         *
         * Note that a combination of pre_exit() and exit() can
         * be used, since a synchronize_rcu() is guaranteed between
         * the calls.
         */
        int (*init)(struct net *net);
        void (*pre_exit)(struct net *net);
        void (*exit)(struct net *net);
        void (*exit_batch)(struct list_head *net_exit_list);
        /* Following method is called with RTNL held. */
        void (*exit_batch_rtnl)(struct list_head *net_exit_list,
                                struct list_head *dev_kill_list);
        /*
         * NOTE(review): id/size presumably describe per-namespace private
         * data for this set of operations -- confirm against the
         * registration code.
         */
        unsigned int *id;
        size_t size;
};

/*
 * Use these carefully.  If you implement a network device and it
 * needs per network namespace operations use device pernet operations,
 * otherwise use pernet subsys operations.
 *
 * Network interfaces need to be removed from a dying netns _before_
 * subsys notifiers can be called, as most of the network code cleanup
 * (which is done from subsys notifiers) runs with the assumption that
 * dev_remove_pack has been called so no new packets will arrive during
 * and after the cleanup functions have been called.  dev_remove_pack
 * is not per namespace so instead the guarantee of no more packets
 * arriving in a network namespace is provided by ensuring that all
 * network devices and all sockets have left the network namespace
 * before the cleanup methods are called.
 *
 * For the longest time the ipv4 icmp code was registered as a pernet
 * device which caused kernel oops, and panics during network
 * namespace cleanup.   So please don't get this wrong.
 */
int register_pernet_subsys(struct pernet_operations *);
void unregister_pernet_subsys(struct pernet_operations *);
int register_pernet_device(struct pernet_operations *);
void unregister_pernet_device(struct pernet_operations *);

struct ctl_table;

/*
 * register_net_sysctl() forwards to register_net_sysctl_sz(), passing
 * ARRAY_SIZE(table) as the table size; @table must therefore be an actual
 * array, not a pointer.
 */
#define register_net_sysctl(net, path, table)        \
        register_net_sysctl_sz(net, path, table, ARRAY_SIZE(table))
#ifdef CONFIG_SYSCTL
int net_sysctl_init(void);
struct ctl_table_header *register_net_sysctl_sz(struct net *net, const char *path,
                                             struct ctl_table *table, size_t table_size);
void unregister_net_sysctl_table(struct ctl_table_header *header);
#else
/*
 * !CONFIG_SYSCTL stubs: init reports success (0), registration returns a
 * NULL header and unregistration does nothing.
 */
static inline int net_sysctl_init(void) { return 0; }
static inline struct ctl_table_header *register_net_sysctl_sz(struct net *net,
        const char *path, struct ctl_table *table, size_t table_size)
{
        return NULL;
}
static inline void unregister_net_sysctl_table(struct ctl_table_header *header)
{
}
#endif

/* Return the current value of the atomic counter net->ipv4.rt_genid. */
static inline int rt_genid_ipv4(const struct net *net)
{
        const int genid = atomic_read(&net->ipv4.rt_genid);

        return genid;
}

#if IS_ENABLED(CONFIG_IPV6)
/* Return the current value of the atomic counter net->ipv6.fib6_sernum. */
static inline int rt_genid_ipv6(const struct net *net)
{
        const int sernum = atomic_read(&net->ipv6.fib6_sernum);

        return sernum;
}
#endif

/* Atomically increment net->ipv4.rt_genid. */
static inline void rt_genid_bump_ipv4(struct net *net)
{
        atomic_inc(&net->ipv4.rt_genid);
}

extern void (*__fib6_flush_trees)(struct net *net);
/*
 * Invoke the __fib6_flush_trees hook for @net.  The hook is a function
 * pointer that may be NULL, in which case nothing is done.
 */
static inline void rt_genid_bump_ipv6(struct net *net)
{
        if (!__fib6_flush_trees)
                return;

        __fib6_flush_trees(net);
}

#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
/* Return a pointer to the ieee802154_lowpan member embedded in @net. */
static inline struct netns_ieee802154_lowpan *
net_ieee802154_lowpan(struct net *net)
{
        struct netns_ieee802154_lowpan *lowpan = &net->ieee802154_lowpan;

        return lowpan;
}
#endif

/*
 * For callers who don't really care about whether it's IPv4 or IPv6:
 * bump the IPv4 generation id first, then run the IPv6 bump.
 */
static inline void rt_genid_bump_all(struct net *net)
{
        rt_genid_bump_ipv4(net);
        rt_genid_bump_ipv6(net);
}

/* Return the current value of the atomic counter net->fnhe_genid. */
static inline int fnhe_genid(const struct net *net)
{
        const int genid = atomic_read(&net->fnhe_genid);

        return genid;
}

/* Atomically increment net->fnhe_genid. */
static inline void fnhe_genid_bump(struct net *net)
{
        atomic_inc(&net->fnhe_genid);
}

/* net_ns_init() is only a real function with CONFIG_NET; otherwise an empty stub. */
#ifdef CONFIG_NET
void net_ns_init(void);
#else
static inline void net_ns_init(void) {}
#endif

#endif /* __NET_NET_NAMESPACE_H */






































































































































    2 











































    4 






































































































































































































































































    2 







































































































































































































































































































    4 















    4 




















































































































































































































    1 








































































































































































































































































































    2 

















































































































































































































































































































































    4 



    3 
































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_H
#define _ASM_X86_PGTABLE_H

#include <linux/mem_encrypt.h>
#include <asm/page.h>
#include <asm/pgtable_types.h>

/*
 * Macro to mark a page protection value as UC-.
 *
 * The UC- cache-mode bits are only OR-ed in when boot_cpu_data.x86 > 3;
 * otherwise @prot is returned unchanged.
 */
#define pgprot_noncached(prot)                                                \
        ((boot_cpu_data.x86 > 3)                                        \
         ? (__pgprot(pgprot_val(prot) |                                        \
                     cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)))        \
         : (prot))

#ifndef __ASSEMBLY__
#include <linux/spinlock.h>
#include <asm/x86_init.h>
#include <asm/pkru.h>
#include <asm/fpu/api.h>
#include <asm/coco.h>
#include <asm-generic/pgtable_uffd.h>
#include <linux/page_table_check.h>

extern pgd_t early_top_pgt[PTRS_PER_PGD];
bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd);

struct seq_file;
void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm);
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
                                   bool user);
bool ptdump_walk_pgd_level_checkwx(void);
#define ptdump_check_wx ptdump_walk_pgd_level_checkwx
void ptdump_walk_user_pgd_level_checkwx(void);

/*
 * Macros to add or remove encryption attribute
 */
#define pgprot_encrypted(prot)        __pgprot(cc_mkenc(pgprot_val(prot)))
#define pgprot_decrypted(prot)        __pgprot(cc_mkdec(pgprot_val(prot)))

/*
 * debug_checkwx_user() calls ptdump_walk_user_pgd_level_checkwx() when
 * CONFIG_DEBUG_WX is set and expands to an empty statement otherwise.
 */
#ifdef CONFIG_DEBUG_WX
#define debug_checkwx_user()        ptdump_walk_user_pgd_level_checkwx()
#else
#define debug_checkwx_user()        do { } while (0)
#endif

/*
 * ZERO_PAGE is a global shared page that is always zero: used
 * for zero-mapped memory areas etc..
 */
extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
        __visible;
#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page))

extern spinlock_t pgd_lock;
extern struct list_head pgd_list;

extern struct mm_struct *pgd_page_get_mm(struct page *page);

extern pmdval_t early_pmd_flags;

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else  /* !CONFIG_PARAVIRT_XXL */
/*
 * Without CONFIG_PARAVIRT_XXL the generic page-table accessors map
 * directly onto their native_*() counterparts.
 */
#define set_pte(ptep, pte)                native_set_pte(ptep, pte)

#define set_pte_atomic(ptep, pte)                                        \
        native_set_pte_atomic(ptep, pte)

#define set_pmd(pmdp, pmd)                native_set_pmd(pmdp, pmd)

#ifndef __PAGETABLE_P4D_FOLDED
#define set_pgd(pgdp, pgd)                native_set_pgd(pgdp, pgd)
/* Only clears the entry when pgtable_l5_enabled() is true. */
#define pgd_clear(pgd)                        (pgtable_l5_enabled() ? native_pgd_clear(pgd) : 0)
#endif

#ifndef set_p4d
# define set_p4d(p4dp, p4d)                native_set_p4d(p4dp, p4d)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
#define p4d_clear(p4d)                        native_p4d_clear(p4d)
#endif

#ifndef set_pud
# define set_pud(pudp, pud)                native_set_pud(pudp, pud)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
#define pud_clear(pud)                        native_pud_clear(pud)
#endif

#define pte_clear(mm, addr, ptep)        native_pte_clear(mm, addr, ptep)
#define pmd_clear(pmd)                        native_pmd_clear(pmd)

/* Value extractors and constructors for each page-table level. */
#define pgd_val(x)        native_pgd_val(x)
#define __pgd(x)        native_make_pgd(x)

#ifndef __PAGETABLE_P4D_FOLDED
#define p4d_val(x)        native_p4d_val(x)
#define __p4d(x)        native_make_p4d(x)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
#define pud_val(x)        native_pud_val(x)
#define __pud(x)        native_make_pud(x)
#endif

#ifndef __PAGETABLE_PMD_FOLDED
#define pmd_val(x)        native_pmd_val(x)
#define __pmd(x)        native_make_pmd(x)
#endif

#define pte_val(x)        native_pte_val(x)
#define __pte(x)        native_make_pte(x)

/* No work is needed at the end of a context switch in the native case. */
#define arch_end_context_switch(prev)        do {} while(0)
#endif        /* CONFIG_PARAVIRT_XXL */

/*
 * The following only work if pte_present() is true.
 * Undefined behaviour if not..
 */
/* True if any of the _PAGE_DIRTY_BITS flags is set in @pte. */
static inline bool pte_dirty(pte_t pte)
{
        return (pte_flags(pte) & _PAGE_DIRTY_BITS) != 0;
}

/*
 * True if X86_FEATURE_SHSTK is enabled and @pte has the shadow stack
 * encoding: _PAGE_RW clear and _PAGE_DIRTY set.
 */
static inline bool pte_shstk(pte_t pte)
{
        if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
                return false;

        return (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY;
}

/* Non-zero if _PAGE_ACCESSED is set in @pte. */
static inline int pte_young(pte_t pte)
{
        const int accessed = pte_flags(pte) & _PAGE_ACCESSED;

        return accessed;
}

#define pmd_dirty pmd_dirty
/* True if any of the _PAGE_DIRTY_BITS flags is set in @pmd. */
static inline bool pmd_dirty(pmd_t pmd)
{
        return (pmd_flags(pmd) & _PAGE_DIRTY_BITS) != 0;
}

/*
 * True if X86_FEATURE_SHSTK is enabled and @pmd has the shadow stack
 * encoding: _PAGE_RW clear with both _PAGE_DIRTY and _PAGE_PSE set.
 */
static inline bool pmd_shstk(pmd_t pmd)
{
        if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
                return false;

        return (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) ==
               (_PAGE_DIRTY | _PAGE_PSE);
}

#define pmd_young pmd_young
/* Non-zero if _PAGE_ACCESSED is set in @pmd. */
static inline int pmd_young(pmd_t pmd)
{
        const int accessed = pmd_flags(pmd) & _PAGE_ACCESSED;

        return accessed;
}

/* True if any of the _PAGE_DIRTY_BITS flags is set in @pud. */
static inline bool pud_dirty(pud_t pud)
{
        return (pud_flags(pud) & _PAGE_DIRTY_BITS) != 0;
}

/* Non-zero if _PAGE_ACCESSED is set in @pud. */
static inline int pud_young(pud_t pud)
{
        const int accessed = pud_flags(pud) & _PAGE_ACCESSED;

        return accessed;
}

/*
 * Return 1 if @pte is writable, 0 otherwise.  A PTE counts as writable
 * when _PAGE_RW is set, or when it is a shadow stack PTE: those are
 * logically writable even though they do not carry _PAGE_RW.
 */
static inline int pte_write(pte_t pte)
{
        if (pte_flags(pte) & _PAGE_RW)
                return 1;

        return pte_shstk(pte);
}

#define pmd_write pmd_write
/*
 * Return 1 if @pmd is writable, 0 otherwise.  A PMD counts as writable
 * when _PAGE_RW is set, or when it is a shadow stack PMD: those are
 * logically writable even though they do not carry _PAGE_RW.
 */
static inline int pmd_write(pmd_t pmd)
{
        if (pmd_flags(pmd) & _PAGE_RW)
                return 1;

        return pmd_shstk(pmd);
}

#define pud_write pud_write
/* Non-zero if _PAGE_RW is set in @pud. */
static inline int pud_write(pud_t pud)
{
        const int writable = pud_flags(pud) & _PAGE_RW;

        return writable;
}

/* Non-zero if _PAGE_PSE is set in @pte. */
static inline int pte_huge(pte_t pte)
{
        const int pse = pte_flags(pte) & _PAGE_PSE;

        return pse;
}

/* Non-zero if _PAGE_GLOBAL is set in @pte. */
static inline int pte_global(pte_t pte)
{
        const int global = pte_flags(pte) & _PAGE_GLOBAL;

        return global;
}

/* Return 1 if _PAGE_NX is clear in @pte (executable), 0 if it is set. */
static inline int pte_exec(pte_t pte)
{
        return (pte_flags(pte) & _PAGE_NX) == 0;
}

/* Non-zero if _PAGE_SPECIAL is set in @pte. */
static inline int pte_special(pte_t pte)
{
        const int special = pte_flags(pte) & _PAGE_SPECIAL;

        return special;
}

/* Entries that were set to PROT_NONE are inverted */

static inline u64 protnone_mask(u64 val);

#define PFN_PTE_SHIFT        PAGE_SHIFT

/*
 * Extract the page frame number from @pte.  The raw value is first XOR-ed
 * with protnone_mask() of itself (PROT_NONE entries are stored inverted),
 * then masked with PTE_PFN_MASK and shifted down by PAGE_SHIFT.
 */
static inline unsigned long pte_pfn(pte_t pte)
{
        phys_addr_t raw = pte_val(pte);

        raw = raw ^ protnone_mask(raw);

        return (raw & PTE_PFN_MASK) >> PAGE_SHIFT;
}

/*
 * Extract the page frame number from @pmd: undo the PROT_NONE inversion
 * via protnone_mask(), mask with pmd_pfn_mask(@pmd), shift by PAGE_SHIFT.
 */
static inline unsigned long pmd_pfn(pmd_t pmd)
{
        phys_addr_t raw = pmd_val(pmd);

        raw = raw ^ protnone_mask(raw);

        return (raw & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
}

#define pud_pfn pud_pfn
/*
 * Extract the page frame number from @pud: undo the PROT_NONE inversion
 * via protnone_mask(), mask with pud_pfn_mask(@pud), shift by PAGE_SHIFT.
 */
static inline unsigned long pud_pfn(pud_t pud)
{
        phys_addr_t raw = pud_val(pud);

        raw = raw ^ protnone_mask(raw);

        return (raw & pud_pfn_mask(pud)) >> PAGE_SHIFT;
}

/*
 * Extract the page frame number from @p4d: mask the raw value with
 * p4d_pfn_mask(@p4d) and shift down by PAGE_SHIFT.  Unlike the
 * pte/pmd/pud variants, no protnone_mask() inversion is applied here.
 */
static inline unsigned long p4d_pfn(p4d_t p4d)
{
        return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
}

/*
 * Extract the page frame number from @pgd: mask the raw value with
 * PTE_PFN_MASK and shift down by PAGE_SHIFT.
 */
static inline unsigned long pgd_pfn(pgd_t pgd)
{
        return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT;
}

#define p4d_leaf p4d_leaf
/* A p4d entry is never a leaf: there are no 512 GiB pages yet. */
static inline bool p4d_leaf(p4d_t p4d)
{
        return false;
}

#define pte_page(pte)        pfn_to_page(pte_pfn(pte))

#define pmd_leaf pmd_leaf
/* True if the pmd entry has _PAGE_PSE set. */
static inline bool pmd_leaf(pmd_t pte)
{
        return (pmd_flags(pte) & _PAGE_PSE) != 0;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Return 1 if @pmd has _PAGE_PSE set and _PAGE_DEVMAP clear.
 *
 * NOTE: when testing for a huge page, also consider pmd_devmap(), or use
 * pmd_leaf() instead.
 */
static inline int pmd_trans_huge(pmd_t pmd)
{
        const int pse_not_devmap =
                (pmd_val(pmd) & (_PAGE_PSE | _PAGE_DEVMAP)) == _PAGE_PSE;

        return pse_not_devmap;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/* Return 1 if @pud has _PAGE_PSE set and _PAGE_DEVMAP clear. */
static inline int pud_trans_huge(pud_t pud)
{
        const int pse_not_devmap =
                (pud_val(pud) & (_PAGE_PSE | _PAGE_DEVMAP)) == _PAGE_PSE;

        return pse_not_devmap;
}
#endif

#define has_transparent_hugepage has_transparent_hugepage
/* Report whether the boot CPU has X86_FEATURE_PSE. */
static inline int has_transparent_hugepage(void)
{
        const int has_pse = boot_cpu_has(X86_FEATURE_PSE);

        return has_pse;
}

#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
/* Return 1 if _PAGE_DEVMAP is set in @pmd, 0 otherwise. */
static inline int pmd_devmap(pmd_t pmd)
{
        return (pmd_val(pmd) & _PAGE_DEVMAP) != 0;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/* Return 1 if _PAGE_DEVMAP is set in @pud, 0 otherwise. */
static inline int pud_devmap(pud_t pud)
{
        return (pud_val(pud) & _PAGE_DEVMAP) != 0;
}
#else
/* Stub for !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD: never devmap. */
static inline int pud_devmap(pud_t pud)
{
        return 0;
}
#endif

/* A pgd entry is never reported as devmap. */
static inline int pgd_devmap(pgd_t pgd)
{
        return 0;
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/* Return a copy of @pte with the bits in @set OR-ed into its raw value. */
static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
{
        return native_make_pte(native_pte_val(pte) | set);
}

/* Return a copy of @pte with the bits in @clear removed from its raw value. */
static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
{
        return native_make_pte(native_pte_val(pte) & ~clear);
}

/*
 * Write protection operations can result in Dirty=1,Write=0 PTEs. But in the
 * case of X86_FEATURE_USER_SHSTK, these PTEs denote shadow stack memory. So
 * when creating dirty, write-protected memory, a software bit is used:
 * _PAGE_BIT_SAVED_DIRTY. The following functions take a PTE and transition the
 * Dirty bit to SavedDirty, and vice versa.
 *
 * This shifting is only done if needed. In the case of shifting
 * Dirty->SavedDirty, the condition is if the PTE is Write=0. In the case of
 * shifting SavedDirty->Dirty, the condition is Write=1.
 */
/*
 * If @v has Write=0, move its Dirty bit into the SavedDirty bit; if @v has
 * Write=1, return it unchanged.  Done with shifts and masks, no branches.
 */
static inline pgprotval_t mksaveddirty_shift(pgprotval_t v)
{
        /* 1 when _PAGE_BIT_RW is clear in v, 0 otherwise */
        const pgprotval_t ro = (~v >> _PAGE_BIT_RW) & 1;
        /* 1 when Dirty is set and the entry is not writable */
        const pgprotval_t dirty = (v >> _PAGE_BIT_DIRTY) & ro;

        v |= dirty << _PAGE_BIT_SAVED_DIRTY;
        v &= ~(ro << _PAGE_BIT_DIRTY);

        return v;
}

/*
 * If @v has Write=1, move its SavedDirty bit back into the Dirty bit; if @v
 * has Write=0, return it unchanged.  Done with shifts and masks, no branches.
 */
static inline pgprotval_t clear_saveddirty_shift(pgprotval_t v)
{
        /* 1 when _PAGE_BIT_RW is set in v, 0 otherwise */
        const pgprotval_t rw = (v >> _PAGE_BIT_RW) & 1;
        /* 1 when SavedDirty is set and the entry is writable */
        const pgprotval_t saved = (v >> _PAGE_BIT_SAVED_DIRTY) & rw;

        v |= saved << _PAGE_BIT_DIRTY;
        v &= ~(rw << _PAGE_BIT_SAVED_DIRTY);

        return v;
}

/* Apply mksaveddirty_shift() to the raw value of @pte. */
static inline pte_t pte_mksaveddirty(pte_t pte)
{
        return native_make_pte(mksaveddirty_shift(native_pte_val(pte)));
}

/* Apply clear_saveddirty_shift() to the raw value of @pte. */
static inline pte_t pte_clear_saveddirty(pte_t pte)
{
        return native_make_pte(clear_saveddirty_shift(native_pte_val(pte)));
}

/*
 * Write-protect @pte by clearing _PAGE_RW.
 *
 * Clearing _PAGE_RW alone could accidentally produce a shadow stack PTE
 * (Write=0,Dirty=1), so the hardware dirty value, if present, is then
 * moved to the software bit via pte_mksaveddirty().
 */
static inline pte_t pte_wrprotect(pte_t pte)
{
        return pte_mksaveddirty(pte_clear_flags(pte, _PAGE_RW));
}

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Non-zero if _PAGE_UFFD_WP is set in @pte. */
static inline int pte_uffd_wp(pte_t pte)
{
        const int uffd_wp = pte_flags(pte) & _PAGE_UFFD_WP;

        return uffd_wp;
}

/* Set _PAGE_UFFD_WP in @pte, then write-protect it with pte_wrprotect(). */
static inline pte_t pte_mkuffd_wp(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_UFFD_WP);

        return pte_wrprotect(pte);
}

/* Return @pte with _PAGE_UFFD_WP cleared. */
static inline pte_t pte_clear_uffd_wp(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_UFFD_WP);

        return pte;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

/* Return @pte with all _PAGE_DIRTY_BITS cleared. */
static inline pte_t pte_mkclean(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_DIRTY_BITS);

        return pte;
}

/* Return @pte with _PAGE_ACCESSED cleared. */
static inline pte_t pte_mkold(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_ACCESSED);

        return pte;
}

/* Return @pte with _PAGE_NX cleared. */
static inline pte_t pte_mkexec(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_NX);

        return pte;
}

/*
 * Set _PAGE_DIRTY and _PAGE_SOFT_DIRTY in @pte, then run the result
 * through pte_mksaveddirty().
 */
static inline pte_t pte_mkdirty(pte_t pte)
{
        return pte_mksaveddirty(pte_set_flags(pte,
                                              _PAGE_DIRTY | _PAGE_SOFT_DIRTY));
}

/* Give @pte the shadow stack encoding: clear _PAGE_RW, then set _PAGE_DIRTY. */
static inline pte_t pte_mkwrite_shstk(pte_t pte)
{
        return pte_set_flags(pte_clear_flags(pte, _PAGE_RW), _PAGE_DIRTY);
}

/* Return @pte with _PAGE_ACCESSED set. */
static inline pte_t pte_mkyoung(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_ACCESSED);

        return pte;
}

/* Return @pte with _PAGE_RW set. */
static inline pte_t pte_mkwrite_novma(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_RW);

        return pte;
}

struct vm_area_struct;
pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma);
#define pte_mkwrite pte_mkwrite

/* Return @pte with _PAGE_PSE set. */
static inline pte_t pte_mkhuge(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_PSE);

        return pte;
}

/* Return @pte with _PAGE_PSE cleared. */
static inline pte_t pte_clrhuge(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_PSE);

        return pte;
}

/* Return @pte with _PAGE_GLOBAL set. */
static inline pte_t pte_mkglobal(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_GLOBAL);

        return pte;
}

/* Return @pte with _PAGE_GLOBAL cleared. */
static inline pte_t pte_clrglobal(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_GLOBAL);

        return pte;
}

/* Return @pte with _PAGE_SPECIAL set. */
static inline pte_t pte_mkspecial(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_SPECIAL);

        return pte;
}

/* Return @pte with both _PAGE_SPECIAL and _PAGE_DEVMAP set. */
static inline pte_t pte_mkdevmap(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_SPECIAL | _PAGE_DEVMAP);

        return pte;
}

/* Return a copy of @pmd with the bits in @set OR-ed into its raw value. */
static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
{
        return native_make_pmd(native_pmd_val(pmd) | set);
}

/* Return a copy of @pmd with the bits in @clear removed from its raw value. */
static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
{
        return native_make_pmd(native_pmd_val(pmd) & ~clear);
}

/*
 * Apply mksaveddirty_shift() to the raw value of @pmd.
 * See comments above mksaveddirty_shift().
 */
static inline pmd_t pmd_mksaveddirty(pmd_t pmd)
{
        return native_make_pmd(mksaveddirty_shift(native_pmd_val(pmd)));
}

/*
 * Apply clear_saveddirty_shift() to the raw value of @pmd.
 * See comments above mksaveddirty_shift().
 */
static inline pmd_t pmd_clear_saveddirty(pmd_t pmd)
{
        return native_make_pmd(clear_saveddirty_shift(native_pmd_val(pmd)));
}

/*
 * Write-protect @pmd by clearing _PAGE_RW.
 *
 * Clearing _PAGE_RW alone could accidentally produce a shadow stack PMD
 * (RW=0, Dirty=1), so the hardware dirty value is then moved to the
 * software bit via pmd_mksaveddirty().
 */
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
        return pmd_mksaveddirty(pmd_clear_flags(pmd, _PAGE_RW));
}

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Non-zero if _PAGE_UFFD_WP is set in @pmd. */
static inline int pmd_uffd_wp(pmd_t pmd)
{
        const int uffd_wp = pmd_flags(pmd) & _PAGE_UFFD_WP;

        return uffd_wp;
}

/* Set _PAGE_UFFD_WP in @pmd, then write-protect it with pmd_wrprotect(). */
static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_UFFD_WP);

        return pmd_wrprotect(pmd);
}

/* Return @pmd with _PAGE_UFFD_WP cleared. */
static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_UFFD_WP);

        return pmd;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

/* Return @pmd with _PAGE_ACCESSED cleared. */
static inline pmd_t pmd_mkold(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_ACCESSED);

        return pmd;
}

/* Return @pmd with all _PAGE_DIRTY_BITS cleared. */
static inline pmd_t pmd_mkclean(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_DIRTY_BITS);

        return pmd;
}

/*
 * Set _PAGE_DIRTY and _PAGE_SOFT_DIRTY in @pmd, then run the result
 * through pmd_mksaveddirty().
 */
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
        return pmd_mksaveddirty(pmd_set_flags(pmd,
                                              _PAGE_DIRTY | _PAGE_SOFT_DIRTY));
}

/* Clear _PAGE_RW in @pmd, then set _PAGE_DIRTY. */
static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd)
{
        return pmd_set_flags(pmd_clear_flags(pmd, _PAGE_RW), _PAGE_DIRTY);
}

/* Return @pmd with _PAGE_DEVMAP set. */
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_DEVMAP);

        return pmd;
}

/* Return @pmd with _PAGE_PSE set. */
static inline pmd_t pmd_mkhuge(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_PSE);

        return pmd;
}

/* Return @pmd with _PAGE_ACCESSED set. */
static inline pmd_t pmd_mkyoung(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_ACCESSED);

        return pmd;
}

/* Return @pmd with _PAGE_RW set. */
static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_RW);

        return pmd;
}

pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
#define pmd_mkwrite pmd_mkwrite

/* Return a copy of @pud with the bits in @set OR-ed into its raw value. */
static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
{
        return native_make_pud(native_pud_val(pud) | set);
}

/* Return a copy of @pud with the bits in @clear removed from its raw value. */
static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
{
        return native_make_pud(native_pud_val(pud) & ~clear);
}

/*
 * Apply mksaveddirty_shift() to the raw value of @pud.
 * See comments above mksaveddirty_shift().
 */
static inline pud_t pud_mksaveddirty(pud_t pud)
{
        return native_make_pud(mksaveddirty_shift(native_pud_val(pud)));
}

/*
 * Apply clear_saveddirty_shift() to the raw value of @pud.
 * See comments above mksaveddirty_shift().
 */
static inline pud_t pud_clear_saveddirty(pud_t pud)
{
        return native_make_pud(clear_saveddirty_shift(native_pud_val(pud)));
}

/* Return @pud with _PAGE_ACCESSED cleared. */
static inline pud_t pud_mkold(pud_t pud)
{
        pud = pud_clear_flags(pud, _PAGE_ACCESSED);

        return pud;
}

/* Return @pud with all _PAGE_DIRTY_BITS cleared. */
static inline pud_t pud_mkclean(pud_t pud)
{
        pud = pud_clear_flags(pud, _PAGE_DIRTY_BITS);

        return pud;
}

/*
 * Write-protect @pud by clearing _PAGE_RW.
 *
 * Clearing _PAGE_RW alone could accidentally produce a shadow stack PUD
 * (RW=0, Dirty=1), so the hardware dirty value is then moved to the
 * software bit via pud_mksaveddirty().
 */
static inline pud_t pud_wrprotect(pud_t pud)
{
        return pud_mksaveddirty(pud_clear_flags(pud, _PAGE_RW));
}

/*
 * Set _PAGE_DIRTY and _PAGE_SOFT_DIRTY in @pud, then run the result
 * through pud_mksaveddirty().
 */
static inline pud_t pud_mkdirty(pud_t pud)
{
        return pud_mksaveddirty(pud_set_flags(pud,
                                              _PAGE_DIRTY | _PAGE_SOFT_DIRTY));
}

/* Return @pud with _PAGE_DEVMAP set. */
static inline pud_t pud_mkdevmap(pud_t pud)
{
        pud = pud_set_flags(pud, _PAGE_DEVMAP);

        return pud;
}

/* Return @pud with _PAGE_PSE set. */
static inline pud_t pud_mkhuge(pud_t pud)
{
        pud = pud_set_flags(pud, _PAGE_PSE);

        return pud;
}

/* Return @pud with _PAGE_ACCESSED set. */
static inline pud_t pud_mkyoung(pud_t pud)
{
        pud = pud_set_flags(pud, _PAGE_ACCESSED);

        return pud;
}

/*
 * Set _PAGE_RW in @pud, then run the result through
 * pud_clear_saveddirty() so a SavedDirty bit is moved back to Dirty.
 */
static inline pud_t pud_mkwrite(pud_t pud)
{
        return pud_clear_saveddirty(pud_set_flags(pud, _PAGE_RW));
}

#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
/* Non-zero if _PAGE_SOFT_DIRTY is set in @pte. */
static inline int pte_soft_dirty(pte_t pte)
{
        const int soft_dirty = pte_flags(pte) & _PAGE_SOFT_DIRTY;

        return soft_dirty;
}

/* Non-zero if _PAGE_SOFT_DIRTY is set in @pmd. */
static inline int pmd_soft_dirty(pmd_t pmd)
{
        const int soft_dirty = pmd_flags(pmd) & _PAGE_SOFT_DIRTY;

        return soft_dirty;
}

/* Non-zero if _PAGE_SOFT_DIRTY is set in @pud. */
static inline int pud_soft_dirty(pud_t pud)
{
        const int soft_dirty = pud_flags(pud) & _PAGE_SOFT_DIRTY;

        return soft_dirty;
}

/* Return @pte with _PAGE_SOFT_DIRTY set. */
static inline pte_t pte_mksoft_dirty(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_SOFT_DIRTY);

        return pte;
}

/* Return @pmd with _PAGE_SOFT_DIRTY set. */
static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);

        return pmd;
}

/* Return @pud with _PAGE_SOFT_DIRTY set. */
static inline pud_t pud_mksoft_dirty(pud_t pud)
{
        pud = pud_set_flags(pud, _PAGE_SOFT_DIRTY);

        return pud;
}

/* Return @pte with _PAGE_SOFT_DIRTY cleared. */
static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_SOFT_DIRTY);

        return pte;
}

/* Return @pmd with _PAGE_SOFT_DIRTY cleared. */
static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);

        return pmd;
}

/* Return @pud with _PAGE_SOFT_DIRTY cleared. */
static inline pud_t pud_clear_soft_dirty(pud_t pud)
{
        pud = pud_clear_flags(pud, _PAGE_SOFT_DIRTY);

        return pud;
}

#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */

/*
 * Return the raw value of @pgprot with unsupported bits masked out.
 *
 * Only present pgprots (_PAGE_PRESENT set) are filtered through
 * __supported_pte_mask.  Non-present pgprots can use those bits for
 * other purposes, so they are returned untouched.
 */
static inline pgprotval_t massage_pgprot(pgprot_t pgprot)
{
        pgprotval_t protval = pgprot_val(pgprot);

        if (!(protval & _PAGE_PRESENT))
                return protval;

        return protval & __supported_pte_mask;
}

/*
 * Return massage_pgprot(@pgprot).  With CONFIG_DEBUG_VM, additionally warn
 * once if massaging changed the value, i.e. if the caller tried to set
 * bits that __supported_pte_mask does not allow.
 */
static inline pgprotval_t check_pgprot(pgprot_t pgprot)
{
        pgprotval_t supported_val = massage_pgprot(pgprot);

        /* mmdebug.h can not be included here because of dependencies */
#ifdef CONFIG_DEBUG_VM
        WARN_ONCE(pgprot_val(pgprot) != supported_val,
                  "attempted to set unsupported pgprot: %016llx "
                  "bits: %016llx supported: %016llx\n",
                  (u64)pgprot_val(pgprot),
                  (u64)pgprot_val(pgprot) ^ supported_val,
                  (u64)__supported_pte_mask);
#endif

        return supported_val;
}

/*
 * Build a pte from page frame number @page_nr and protection @pgprot.
 * The address bits are XOR-ed with protnone_mask() of the protection
 * value, limited to PTE_PFN_MASK, and combined with check_pgprot(@pgprot).
 */
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
        phys_addr_t paddr = (phys_addr_t)page_nr << PAGE_SHIFT;

        paddr = paddr ^ protnone_mask(pgprot_val(pgprot));
        paddr = paddr & PTE_PFN_MASK;

        return __pte(paddr | check_pgprot(pgprot));
}

/*
 * Build a pmd from page frame number @page_nr and protection @pgprot.
 * The address bits are XOR-ed with protnone_mask() of the protection
 * value, limited to PHYSICAL_PMD_PAGE_MASK, and combined with
 * check_pgprot(@pgprot).
 */
static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
{
        phys_addr_t paddr = (phys_addr_t)page_nr << PAGE_SHIFT;

        paddr = paddr ^ protnone_mask(pgprot_val(pgprot));
        paddr = paddr & PHYSICAL_PMD_PAGE_MASK;

        return __pmd(paddr | check_pgprot(pgprot));
}

/*
 * Build a pud from page frame number @page_nr and protection @pgprot.
 * The address bits are XOR-ed with protnone_mask() of the protection
 * value, limited to PHYSICAL_PUD_PAGE_MASK, and combined with
 * check_pgprot(@pgprot).
 */
static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
{
        phys_addr_t paddr = (phys_addr_t)page_nr << PAGE_SHIFT;

        paddr = paddr ^ protnone_mask(pgprot_val(pgprot));
        paddr = paddr & PHYSICAL_PUD_PAGE_MASK;

        return __pud(paddr | check_pgprot(pgprot));
}

/*
 * Rebuild @pmd from its own pfn and flags, with _PAGE_PRESENT and
 * _PAGE_PROTNONE removed from the flags.
 */
static inline pmd_t pmd_mkinvalid(pmd_t pmd)
{
        unsigned long pfn = pmd_pfn(pmd);

        return pfn_pmd(pfn,
                       __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT | _PAGE_PROTNONE)));
}

static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);

/*
 * Return @pte with its protection bits replaced by those of @newprot.
 *
 * Bits covered by _PAGE_CHG_MASK are kept from the old PTE; every other
 * bit comes from check_pgprot(@newprot).  This chops off the old NX bit
 * (if present) and adds the NX portion of @newprot (if present).
 */
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
        pteval_t oldval = pte_val(pte);
        pteval_t val = oldval & _PAGE_CHG_MASK;
        pte_t pte_result;

        val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK;
        val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);

        pte_result = __pte(val);

        /*
         * pte_modify() must not create Write=0,Dirty=1 PTEs, which means
         * it must avoid:
         *  1. Marking Write=0 PTEs Dirty=1
         *  2. Marking Dirty=1 PTEs Write=0
         *
         * Case 1 cannot happen: _PAGE_CHG_MASK filters out any Dirty bit
         * passed in newprot.  Case 2 is handled by the mksaveddirty
         * exercise, done only when the old value was Write=1 so that
         * Shadow Stack PTEs are left alone.
         */
        if (!(oldval & _PAGE_RW))
                return pte_clear_saveddirty(pte_result);

        return pte_mksaveddirty(pte_result);
}

/*
 * Apply a new protection to an existing PMD.
 *
 * Bits covered by _HPAGE_CHG_MASK, minus _PAGE_DIRTY, are kept from the
 * old entry; bits outside _HPAGE_CHG_MASK come from check_pgprot(newprot).
 * The combined value is then passed through flip_protnone_guard() together
 * with the old value and PHYSICAL_PMD_PAGE_MASK.
 */
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
        const pmdval_t before = pmd_val(pmd);
        pmdval_t after;
        pmd_t updated;

        after = before & (_HPAGE_CHG_MASK & ~_PAGE_DIRTY);
        after |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
        after = flip_protnone_guard(before, after, PHYSICAL_PMD_PAGE_MASK);

        updated = __pmd(after);

        /*
         * Write=0,Dirty=1 PMDs must not be created here.  That could
         * happen in two ways:
         *  1. a Write=0 PMD gets Dirty=1
         *  2. a Dirty=1 PMD gets Write=0
         *
         * (1) is ruled out because a Dirty bit in newprot is filtered by
         * _HPAGE_CHG_MASK above.  (2) is handled by the mksaveddirty
         * step, which is only taken when the old value was Write=1 so
         * that Shadow Stack PMDs are left alone.
         */
        if (!(before & _PAGE_RW))
                return pmd_clear_saveddirty(updated);

        return pmd_mksaveddirty(updated);
}

/*
 * Combine two protection values: the bits of oldprot selected by
 * _PAGE_CHG_MASK are kept and all remaining bits come from newprot.
 *
 * mprotect needs this so that PAT and encryption bits survive an update
 * of vm_page_prot.
 */
#define pgprot_modify pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
        return __pgprot((pgprot_val(oldprot) & _PAGE_CHG_MASK) |
                        (pgprot_val(newprot) & ~_PAGE_CHG_MASK));
}

/* Wrap the result of the per-level *_flags() accessor in a pgprot_t. */
#define pte_pgprot(x) __pgprot(pte_flags(x))
#define pmd_pgprot(x) __pgprot(pmd_flags(x))
#define pud_pgprot(x) __pgprot(pud_flags(x))
#define p4d_pgprot(x) __pgprot(p4d_flags(x))

/* Wrap the result of massage_pgprot(p) in a pgprot_t. */
#define canon_pgprot(p) __pgprot(massage_pgprot(p))

/*
 * Decide whether memory type new_pcm may be returned when pcm was
 * requested for the range starting at paddr and spanning size bytes.
 *
 * Returns 1 when the combination is acceptable and 0 when it is not.
 */
static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
                                         enum page_cache_mode pcm,
                                         enum page_cache_mode new_pcm)
{
        /* Untracked ranges always have PAT type WB, nothing to verify. */
        if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
                return 1;

        /*
         * Write-back must not be returned for an uncached, write-combine
         * or write-through request.
         */
        if (new_pcm == _PAGE_CACHE_MODE_WB) {
                if (pcm == _PAGE_CACHE_MODE_UC_MINUS ||
                    pcm == _PAGE_CACHE_MODE_WC ||
                    pcm == _PAGE_CACHE_MODE_WT)
                        return 0;
        }

        /* Write-combine must not be returned for a write-through request. */
        if (new_pcm == _PAGE_CACHE_MODE_WC && pcm == _PAGE_CACHE_MODE_WT)
                return 0;

        return 1;
}

/* Declarations only; both take a virtual address and return an entry pointer. */
pmd_t *populate_extra_pmd(unsigned long vaddr);
pte_t *populate_extra_pte(unsigned long vaddr);

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd);

/*
 * pti_set_user_pgtbl() receives the PGD slot (pgdp) about to be written
 * and the pgd value destined for it.  With X86_FEATURE_PTI the work is
 * delegated to __pti_set_user_pgtbl(); the value returned is the PGD that
 * must be set in the kernel copy of the page tables.  Without the feature
 * the value is handed back untouched.
 */
static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
        if (static_cpu_has(X86_FEATURE_PTI))
                return __pti_set_user_pgtbl(pgdp, pgd);

        return pgd;
}
#else   /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
/* Page table isolation compiled out: pgd is returned unchanged. */
static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
        return pgd;
}
#endif  /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */

#endif        /* __ASSEMBLY__ */


#ifdef CONFIG_X86_32
# include <asm/pgtable_32.h>
#else
# include <asm/pgtable_64.h>
#endif

#ifndef __ASSEMBLY__
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/log2.h>
#include <asm/fixmap.h>

/*
 * Returns 1 when no bit outside _PAGE_KNL_ERRATUM_MASK is set in the
 * entry, 0 otherwise.
 */
static inline int pte_none(pte_t pte)
{
        return (pte.pte & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
}

#define __HAVE_ARCH_PTE_SAME
/* Returns 1 when both entries hold the same raw value, 0 otherwise. */
static inline int pte_same(pte_t a, pte_t b)
{
        if (a.pte != b.pte)
                return 0;

        return 1;
}

/*
 * Move the pfn stored in 'pte' forward by 'nr' pages.
 *
 * When __pte_needs_invert() reports true for the entry, the step is
 * subtracted from the raw value instead of added.
 */
static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
{
        unsigned long step = nr << PFN_PTE_SHIFT;

        if (__pte_needs_invert(pte_val(pte)))
                return __pte(pte_val(pte) - step);

        return __pte(pte_val(pte) + step);
}
#define pte_advance_pfn        pte_advance_pfn

/* Non-zero when either _PAGE_PRESENT or _PAGE_PROTNONE is set in the flags. */
static inline int pte_present(pte_t a)
{
        int present = pte_flags(a) & (_PAGE_PROTNONE | _PAGE_PRESENT);

        return present;
}

#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
/* Returns 1 when every bit of _PAGE_DEVMAP is set in the flags, else 0. */
static inline int pte_devmap(pte_t a)
{
        if ((pte_flags(a) & _PAGE_DEVMAP) != _PAGE_DEVMAP)
                return 0;

        return 1;
}
#endif

#define pte_accessible pte_accessible
/*
 * A PTE counts as accessible when _PAGE_PRESENT is set, or when
 * _PAGE_PROTNONE is set while mm->tlb_flush_pending reads non-zero.
 */
static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
{
        if (pte_flags(a) & _PAGE_PRESENT)
                return true;

        if (!(pte_flags(a) & _PAGE_PROTNONE))
                return false;

        /* PROTNONE entry: the answer depends on a pending TLB flush. */
        return atomic_read(&mm->tlb_flush_pending) != 0;
}

/*
 * Non-zero when any of _PAGE_PRESENT, _PAGE_PROTNONE or _PAGE_PSE is set.
 */
static inline int pmd_present(pmd_t pmd)
{
        /*
         * _PAGE_PSE has to be part of the test: split_huge_page
         * temporarily clears the present bit, but _PAGE_PSE stays set
         * for as long as _PAGE_PRESENT is clear.
         */
        int present = pmd_flags(pmd) &
                      (_PAGE_PSE | _PAGE_PROTNONE | _PAGE_PRESENT);

        return present;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * The protnone helpers also work without NUMA balancing, but the kernel
 * does not care.  See the comment in include/linux/pgtable.h
 *
 * Returns 1 when _PAGE_PROTNONE is set and _PAGE_PRESENT is clear.
 */
static inline int pte_protnone(pte_t pte)
{
        if ((pte_flags(pte) & (_PAGE_PRESENT | _PAGE_PROTNONE)) != _PAGE_PROTNONE)
                return 0;

        return 1;
}

/* Returns 1 when _PAGE_PROTNONE is set and _PAGE_PRESENT is clear. */
static inline int pmd_protnone(pmd_t pmd)
{
        if ((pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE)) != _PAGE_PROTNONE)
                return 0;

        return 1;
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * Returns 1 when no bit outside _PAGE_KNL_ERRATUM_MASK is set in the
 * entry, 0 otherwise.
 */
static inline int pmd_none(pmd_t pmd)
{
        /*
         * The value is deliberately narrowed to unsigned long: on 32-bit
         * platforms only the low word is checked, since it might be out
         * of sync with the upper half.
         */
        unsigned long low = native_pmd_val(pmd);

        return !(low & ~_PAGE_KNL_ERRATUM_MASK);
}

/*
 * Mask the entry with pmd_pfn_mask() and translate the result with
 * __va(), returning it as an unsigned long.
 */
static inline unsigned long pmd_page_vaddr(pmd_t pmd)
{
        void *vaddr = __va(pmd_val(pmd) & pmd_pfn_mask(pmd));

        return (unsigned long)vaddr;
}

/*
 * Expands to pfn_to_page() applied to pmd_pfn() of the entry.
 *
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define pmd_page(pmd)        pfn_to_page(pmd_pfn(pmd))

/*
 * Conversion functions: convert a page and protection to a page entry,
 * and a page entry and page directory to the page they refer to.
 *
 * mk_pte() evaluates 'pgprot' once into a local, warns once if that
 * protection has _PAGE_DIRTY set without _PAGE_RW, and yields
 * pfn_pte(page_to_pfn(page), ...) as the value of the expression.
 *
 * (Currently stuck as a macro because of indirect forward reference
 * to linux/mm.h:page_to_nid())
 */
#define mk_pte(page, pgprot)                                                  \
({                                                                          \
        pgprot_t __pgprot = pgprot;                                          \
                                                                          \
        WARN_ON_ONCE((pgprot_val(__pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == \
                    _PAGE_DIRTY);                                          \
        pfn_pte(page_to_pfn(page), __pgprot);                                  \
})

/*
 * Returns 0 when the flags, ignoring _PAGE_USER and _PAGE_ACCESSED, equal
 * _KERNPG_TABLE without _PAGE_ACCESSED; returns 1 otherwise.
 */
static inline int pmd_bad(pmd_t pmd)
{
        if ((pmd_flags(pmd) & ~(_PAGE_ACCESSED | _PAGE_USER)) ==
            (_KERNPG_TABLE & ~_PAGE_ACCESSED))
                return 0;

        return 1;
}

/* Convert a page count to MiB by shifting right by (20 - PAGE_SHIFT). */
static inline unsigned long pages_to_mb(unsigned long npg)
{
        unsigned long mb = npg;

        mb >>= (20 - PAGE_SHIFT);

        return mb;
}

#if CONFIG_PGTABLE_LEVELS > 2
/*
 * Returns 1 when no bit outside _PAGE_KNL_ERRATUM_MASK is set in the
 * entry, 0 otherwise.
 */
static inline int pud_none(pud_t pud)
{
        return !(native_pud_val(pud) & ~(_PAGE_KNL_ERRATUM_MASK));
}

/* Non-zero when _PAGE_PRESENT is set in the PUD flags. */
static inline int pud_present(pud_t pud)
{
        int present = pud_flags(pud) & _PAGE_PRESENT;

        return present;
}

/*
 * Mask the entry with pud_pfn_mask(), translate the result with __va()
 * and return it as a pmd_t pointer.
 */
static inline pmd_t *pud_pgtable(pud_t pud)
{
        void *table = __va(pud_val(pud) & pud_pfn_mask(pud));

        return (pmd_t *)table;
}

/*
 * Expands to pfn_to_page() applied to pud_pfn() of the entry.
 *
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define pud_page(pud)        pfn_to_page(pud_pfn(pud))

#define pud_leaf pud_leaf
/* True only when both _PAGE_PSE and _PAGE_PRESENT are set in the entry. */
static inline bool pud_leaf(pud_t pud)
{
        if ((pud_val(pud) & (_PAGE_PRESENT | _PAGE_PSE)) !=
            (_PAGE_PRESENT | _PAGE_PSE))
                return false;

        return true;
}

/*
 * Returns 1 when any flag outside _KERNPG_TABLE and _PAGE_USER is set,
 * 0 otherwise.
 */
static inline int pud_bad(pud_t pud)
{
        return !!(pud_flags(pud) & ~(_PAGE_USER | _KERNPG_TABLE));
}
#endif        /* CONFIG_PGTABLE_LEVELS > 2 */

#if CONFIG_PGTABLE_LEVELS > 3
/*
 * Returns 1 when no bit outside _PAGE_KNL_ERRATUM_MASK is set in the
 * entry, 0 otherwise.
 */
static inline int p4d_none(p4d_t p4d)
{
        return !(native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK));
}

/* Non-zero when _PAGE_PRESENT is set in the P4D flags. */
static inline int p4d_present(p4d_t p4d)
{
        int present = p4d_flags(p4d) & _PAGE_PRESENT;

        return present;
}

/*
 * Mask the entry with p4d_pfn_mask(), translate the result with __va()
 * and return it as a pud_t pointer.
 */
static inline pud_t *p4d_pgtable(p4d_t p4d)
{
        void *table = __va(p4d_val(p4d) & p4d_pfn_mask(p4d));

        return (pud_t *)table;
}

/*
 * Expands to pfn_to_page() applied to p4d_pfn() of the entry.
 *
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define p4d_page(p4d)        pfn_to_page(p4d_pfn(p4d))

/*
 * Returns 1 when the P4D carries a flag that is not tolerated, else 0.
 *
 * _KERNPG_TABLE and _PAGE_USER are always tolerated; _PAGE_NX is added
 * when CONFIG_MITIGATION_PAGE_TABLE_ISOLATION is enabled.
 */
static inline int p4d_bad(p4d_t p4d)
{
        unsigned long tolerated = _KERNPG_TABLE | _PAGE_USER;

        if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
                tolerated |= _PAGE_NX;

        return !!(p4d_flags(p4d) & ~tolerated);
}
#endif  /* CONFIG_PGTABLE_LEVELS > 3 */

/*
 * Index of 'address' within a P4D table: the address shifted right by
 * P4D_SHIFT, reduced with (PTRS_PER_P4D - 1).
 */
static inline unsigned long p4d_index(unsigned long address)
{
        unsigned long slot = address >> P4D_SHIFT;

        return slot & (PTRS_PER_P4D - 1);
}

#if CONFIG_PGTABLE_LEVELS > 4
/*
 * With 5-level paging enabled the result is the _PAGE_PRESENT bit of the
 * PGD flags; otherwise the function always returns 1.
 */
static inline int pgd_present(pgd_t pgd)
{
        if (pgtable_l5_enabled())
                return pgd_flags(pgd) & _PAGE_PRESENT;

        return 1;
}

/*
 * Mask the entry (narrowed to unsigned long) with PTE_PFN_MASK and
 * translate the result with __va(), returning it as an unsigned long.
 */
static inline unsigned long pgd_page_vaddr(pgd_t pgd)
{
        void *vaddr = __va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK);

        return (unsigned long)vaddr;
}

/*
 * Expands to pfn_to_page() applied to pgd_pfn() of the entry.
 *
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define pgd_page(pgd)        pfn_to_page(pgd_pfn(pgd))

/*
 * Find the P4D entry for 'address' below the given PGD entry.
 *
 * Without 5-level paging the PGD pointer itself is returned, cast to
 * p4d_t *.
 */
static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
        p4d_t *table;

        if (!pgtable_l5_enabled())
                return (p4d_t *)pgd;

        table = (p4d_t *)pgd_page_vaddr(*pgd);

        return table + p4d_index(address);
}

/*
 * Without 5-level paging a PGD is never reported bad (returns 0).
 *
 * Otherwise returns 1 unless the flags, ignoring _PAGE_USER (and
 * _PAGE_NX when CONFIG_MITIGATION_PAGE_TABLE_ISOLATION is enabled),
 * equal _KERNPG_TABLE.
 */
static inline int pgd_bad(pgd_t pgd)
{
        unsigned long tolerated;

        if (!pgtable_l5_enabled())
                return 0;

        tolerated = _PAGE_USER;
        if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
                tolerated |= _PAGE_NX;

        return (pgd_flags(pgd) & ~tolerated) != _KERNPG_TABLE;
}

/*
 * Returns 1 only when 5-level paging is enabled and the raw PGD value is
 * zero; returns 0 in every other case.
 */
static inline int pgd_none(pgd_t pgd)
{
        /*
         * No workaround for the KNL stray A/D bit erratum is needed
         * here.  PGDs only point to page tables except on 32-bit
         * non-PAE, which is not supported on KNL.
         */
        return pgtable_l5_enabled() && !native_pgd_val(pgd);
}
#endif        /* CONFIG_PGTABLE_LEVELS > 4 */

#endif        /* __ASSEMBLY__ */

/* PGD index of PAGE_OFFSET. */
#define KERNEL_PGD_BOUNDARY        pgd_index(PAGE_OFFSET)
/* Number of PGD slots from KERNEL_PGD_BOUNDARY up to PTRS_PER_PGD. */
#define KERNEL_PGD_PTRS                (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)

#ifndef __ASSEMBLY__

/* Declarations only; none of these are defined in this part of the file. */
extern int direct_gbpages;
void init_mem_mapping(void);
void early_alloc_pgt_buf(void);
void __init poking_init(void);
unsigned long init_memory_mapping(unsigned long start,
                                  unsigned long end, pgprot_t prot);

#ifdef CONFIG_X86_64
/* Declared for CONFIG_X86_64 builds only. */
extern pgd_t trampoline_pgd_entry;
#endif

/*
 * Read the PTE at ptep, clear the slot and return the value read.
 * Local pte updates need not use xchg for locking.
 */
static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
{
        pte_t old;

        old = *ptep;
        /* The pure native clear needs no mm or addr input. */
        native_pte_clear(NULL, 0, ptep);

        return old;
}

/* Read the PMD at pmdp, clear the slot and return the value read. */
static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
{
        pmd_t old;

        old = *pmdp;
        native_pmd_clear(pmdp);

        return old;
}

/* Read the PUD at pudp, clear the slot and return the value read. */
static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
{
        pud_t old;

        old = *pudp;
        native_pud_clear(pudp);

        return old;
}

/*
 * Store 'pmd' at 'pmdp'.  page_table_check_pmd_set() is called first with
 * the mm, the slot and the new value; 'addr' is not used in the body.
 */
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                              pmd_t *pmdp, pmd_t pmd)
{
        page_table_check_pmd_set(mm, pmdp, pmd);
        set_pmd(pmdp, pmd);
}

/*
 * Store 'pud' at 'pudp' using native_set_pud().  page_table_check_pud_set()
 * is called first with the mm, the slot and the new value; 'addr' is not
 * used in the body.
 */
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
                              pud_t *pudp, pud_t pud)
{
        page_table_check_pud_set(mm, pudp, pud);
        native_set_pud(pudp, pud);
}

/*
 * We only update the dirty/accessed state if we set
 * the dirty bit by hand in the kernel, since the hardware
 * will do the accessed bit for us, and we don't want to
 * race with other CPUs that might be updating the dirty
 * bit at the same time.
 */
struct vm_area_struct;

/* The three PTE helpers below are declared here and defined elsewhere. */
#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pte_t *ptep,
                                 pte_t entry, int dirty);

#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
                                     unsigned long addr, pte_t *ptep);

#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
extern int ptep_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pte_t *ptep);

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
/*
 * Fetch and clear the PTE at ptep through native_ptep_get_and_clear(),
 * report the old value to page_table_check_pte_clear() and return it.
 * 'addr' is not used in the body.
 */
static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                                       pte_t *ptep)
{
        pte_t old = native_ptep_get_and_clear(ptep);

        page_table_check_pte_clear(mm, old);

        return old;
}

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
/*
 * Fetch and clear the PTE at ptep and return the old value.
 *
 * A non-zero 'full' selects the local (non-xchg) clear; otherwise the
 * call is forwarded to ptep_get_and_clear().
 */
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
                                            unsigned long addr, pte_t *ptep,
                                            int full)
{
        pte_t old;

        if (!full)
                return ptep_get_and_clear(mm, addr, ptep);

        /*
         * Full address destruction in progress; paravirt does not
         * care about updates and native needs no locking
         */
        old = native_local_ptep_get_and_clear(ptep);
        page_table_check_pte_clear(mm, old);

        return old;
}

#define __HAVE_ARCH_PTEP_SET_WRPROTECT
/*
 * Replace the PTE at ptep with pte_wrprotect() of its current value.
 * 'mm' and 'addr' are not used in the body.
 */
static inline void ptep_set_wrprotect(struct mm_struct *mm,
                                      unsigned long addr, pte_t *ptep)
{
        /*
         * Avoid accidentally creating shadow stack PTEs
         * (Write=0,Dirty=1).  Use cmpxchg() to prevent races with
         * the hardware setting Dirty=1.
         */
        pte_t old_pte, new_pte;

        /* Single snapshot of the entry used as the first expected value. */
        old_pte = READ_ONCE(*ptep);
        do {
                /*
                 * Recomputed on every pass; try_cmpxchg() is handed
                 * &old_pte, presumably refreshing it when the exchange
                 * fails -- confirm against the try_cmpxchg() contract.
                 */
                new_pte = pte_wrprotect(old_pte);
        } while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte));
}

/* Expands to an empty statement; the arguments are not evaluated. */
#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)

/* Build a PMD entry for 'page' with protection 'pgprot' via pfn_pmd(). */
#define mk_pmd(page, pgprot)   pfn_pmd(page_to_pfn(page), (pgprot))

/*
 * PMD- and PUD-level helpers for access flags and the young bit.
 * Declarations only; the definitions are not in this part of the file.
 */
#define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
                                 pmd_t entry, int dirty);
extern int pudp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pud_t *pudp,
                                 pud_t entry, int dirty);

#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                     unsigned long addr, pmd_t *pmdp);
extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
                                     unsigned long addr, pud_t *pudp);

#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pmd_t *pmdp);


#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
/*
 * Fetch and clear the PMD at pmdp through native_pmdp_get_and_clear(),
 * report the old value to page_table_check_pmd_clear() and return it.
 * 'addr' is not used in the body.
 */
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
                                       pmd_t *pmdp)
{
        pmd_t old;

        old = native_pmdp_get_and_clear(pmdp);
        page_table_check_pmd_clear(mm, old);

        return old;
}

#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
/*
 * Fetch and clear the PUD at pudp through native_pudp_get_and_clear(),
 * report the old value to page_table_check_pud_clear() and return it.
 * 'addr' is not used in the body.
 */
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
                                        unsigned long addr, pud_t *pudp)
{
        pud_t old;

        old = native_pudp_get_and_clear(pudp);
        page_table_check_pud_clear(mm, old);

        return old;
}

#define __HAVE_ARCH_PMDP_SET_WRPROTECT
/*
 * Replace the PMD at pmdp with pmd_wrprotect() of its current value.
 * 'mm' and 'addr' are not used in the body.
 */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long addr, pmd_t *pmdp)
{
        /*
         * Avoid accidentally creating shadow stack PMDs
         * (Write=0,Dirty=1).  Use cmpxchg() to prevent races with
         * the hardware setting Dirty=1.
         */
        pmd_t old_pmd, new_pmd;

        /* Single snapshot of the entry used as the first expected value. */
        old_pmd = READ_ONCE(*pmdp);
        do {
                /*
                 * Recomputed on every pass; try_cmpxchg() is handed
                 * &old_pmd, presumably refreshing it when the exchange
                 * fails -- confirm against the try_cmpxchg() contract.
                 */
                new_pmd = pmd_wrprotect(old_pmd);
        } while (!try_cmpxchg((long *)pmdp, (long *)&old_pmd, *(long *)&new_pmd));
}

#ifndef pmdp_establish
#define pmdp_establish pmdp_establish
/*
 * Install 'pmd' at 'pmdp' and return the value that was stored there
 * before.  page_table_check_pmd_set() is called first with vma->vm_mm;
 * 'address' is not used in the body.
 */
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
        page_table_check_pmd_set(vma->vm_mm, pmdp, pmd);
        if (IS_ENABLED(CONFIG_SMP)) {
                /* SMP build: swap old and new value with xchg(). */
                return xchg(pmdp, pmd);
        } else {
                /* Non-SMP build: plain read followed by WRITE_ONCE(). */
                pmd_t old = *pmdp;
                WRITE_ONCE(*pmdp, pmd);
                return old;
        }
}
#endif

/* Declaration only; pmdp_invalidate_ad() returns a pmd_t and is defined elsewhere. */
#define __HAVE_ARCH_PMDP_INVALIDATE_AD
extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
                                unsigned long address, pmd_t *pmdp);

/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 *
 * Returns true when the PGD slot that __ptr points at lies below
 * PGD_KERNEL_START (it maps userspace) and false otherwise (it maps
 * the kernel).
 */
static inline bool pgdp_maps_userspace(void *__ptr)
{
        /* Byte offset of the slot inside its page-table page. */
        unsigned long offset = (unsigned long)__ptr & ~PAGE_MASK;
        unsigned long slot = offset / sizeof(pgd_t);

        return slot < PGD_KERNEL_START;
}

#define pgd_leaf        pgd_leaf
/* Always false: the argument is ignored. */
static inline bool pgd_leaf(pgd_t pgd)
{
        return false;
}

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
 * All top-level MITIGATION_PAGE_TABLE_ISOLATION page tables are order-1 pages
 * (8k-aligned and 8k in size).  The kernel one is at the beginning 4k and
 * the user one is in the last 4k.  To switch between them, you
 * just need to flip the 12th bit in their addresses.
 *
 * The bit number used for the switch is PAGE_SHIFT.
 */
#define PTI_PGTABLE_SWITCH_BIT        PAGE_SHIFT

/*
 * Return 'ptr' with bit number 'bit' set in its address.
 *
 * This generates better code than the inline assembly in
 * __set_bit().
 */
static inline void *ptr_set_bit(void *ptr, int bit)
{
        return (void *)((unsigned long)ptr | BIT(bit));
}
/* Return 'ptr' with bit number 'bit' cleared in its address. */
static inline void *ptr_clear_bit(void *ptr, int bit)
{
        return (void *)((unsigned long)ptr & ~BIT(bit));
}

/* Set PTI_PGTABLE_SWITCH_BIT in a PGD pointer and return the result. */
static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
{
        void *switched = ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);

        return switched;
}

/* Clear PTI_PGTABLE_SWITCH_BIT in a PGD pointer and return the result. */
static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
{
        void *switched = ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);

        return switched;
}

/* Set PTI_PGTABLE_SWITCH_BIT in a P4D pointer and return the result. */
static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
{
        void *switched = ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);

        return switched;
}

/* Clear PTI_PGTABLE_SWITCH_BIT in a P4D pointer and return the result. */
static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
{
        void *switched = ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);

        return switched;
}
#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */

/*
 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
 *
 *  dst   - pointer to a pgd range anywhere on a pgd page
 *  src   - likewise, the range to copy from
 *  count - the number of pgds to copy.
 *
 * dst and src can be on the same page, but the ranges must not overlap
 * and must not cross a page boundary.
 *
 * With CONFIG_MITIGATION_PAGE_TABLE_ISOLATION built in and
 * X86_FEATURE_PTI present, the same number of entries is also copied
 * between the kernel_to_user_pgdp() counterparts of src and dst.
 */
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
        memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
        /* Clone the user space pgd as well */
        if (static_cpu_has(X86_FEATURE_PTI))
                memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
                       count * sizeof(pgd_t));
#endif
}

#define PTE_SHIFT ilog2(PTRS_PER_PTE)
/*
 * Address shift for a mapping level: (PAGE_SHIFT - PTE_SHIFT) plus
 * PTE_SHIFT for every step of 'level'.
 */
static inline int page_level_shift(enum pg_level level)
{
        int shift = (PAGE_SHIFT - PTE_SHIFT) + level * PTE_SHIFT;

        return shift;
}
/* Size for a mapping level: 1UL shifted left by page_level_shift(level). */
static inline unsigned long page_level_size(enum pg_level level)
{
        int shift = page_level_shift(level);

        return 1UL << shift;
}
/* Mask for a mapping level: the complement of (page_level_size(level) - 1). */
static inline unsigned long page_level_mask(enum pg_level level)
{
        unsigned long size = page_level_size(level);

        return ~(size - 1);
}

/*
 * The x86 doesn't have any external MMU info: the kernel page
 * tables contain all the necessary information.
 *
 * The four update_mmu_cache*() hooks below therefore have empty bodies.
 */
static inline void update_mmu_cache(struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep)
{
}
/* Range variant taking a fault descriptor and an entry count; empty body. */
static inline void update_mmu_cache_range(struct vm_fault *vmf,
                struct vm_area_struct *vma, unsigned long addr,
                pte_t *ptep, unsigned int nr)
{
}
/* PMD variant; empty body. */
static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
                unsigned long addr, pmd_t *pmd)
{
}
/* PUD variant; empty body. */
static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
                unsigned long addr, pud_t *pud)
{
}
/* Return 'pte' with _PAGE_SWP_EXCLUSIVE set through pte_set_flags(). */
static inline pte_t pte_swp_mkexclusive(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE);

        return pte;
}

/* Non-zero when _PAGE_SWP_EXCLUSIVE is set in pte_flags(pte). */
static inline int pte_swp_exclusive(pte_t pte)
{
        int exclusive = pte_flags(pte) & _PAGE_SWP_EXCLUSIVE;

        return exclusive;
}

/* Return 'pte' with _PAGE_SWP_EXCLUSIVE removed through pte_clear_flags(). */
static inline pte_t pte_swp_clear_exclusive(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE);

        return pte;
}

#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
/* Return 'pte' with _PAGE_SWP_SOFT_DIRTY set through pte_set_flags(). */
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);

        return pte;
}

/* Non-zero when _PAGE_SWP_SOFT_DIRTY is set in pte_flags(pte). */
static inline int pte_swp_soft_dirty(pte_t pte)
{
        int soft_dirty = pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;

        return soft_dirty;
}

/* Return 'pte' with _PAGE_SWP_SOFT_DIRTY removed through pte_clear_flags(). */
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);

        return pte;
}

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* Return 'pmd' with _PAGE_SWP_SOFT_DIRTY set through pmd_set_flags(). */
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY);

        return pmd;
}

/* Non-zero when _PAGE_SWP_SOFT_DIRTY is set in pmd_flags(pmd). */
static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        int soft_dirty = pmd_flags(pmd) & _PAGE_SWP_SOFT_DIRTY;

        return soft_dirty;
}

/* Return 'pmd' with _PAGE_SWP_SOFT_DIRTY removed through pmd_clear_flags(). */
static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY);

        return pmd;
}
#endif
#endif

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Return 'pte' with _PAGE_SWP_UFFD_WP set through pte_set_flags(). */
static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
{
        pte = pte_set_flags(pte, _PAGE_SWP_UFFD_WP);

        return pte;
}

/* Non-zero when _PAGE_SWP_UFFD_WP is set in pte_flags(pte). */
static inline int pte_swp_uffd_wp(pte_t pte)
{
        int uffd_wp = pte_flags(pte) & _PAGE_SWP_UFFD_WP;

        return uffd_wp;
}

/* Return 'pte' with _PAGE_SWP_UFFD_WP removed through pte_clear_flags(). */
static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
{
        pte = pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);

        return pte;
}

/* Return 'pmd' with _PAGE_SWP_UFFD_WP set through pmd_set_flags(). */
static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd)
{
        pmd = pmd_set_flags(pmd, _PAGE_SWP_UFFD_WP);

        return pmd;
}

/* Non-zero when _PAGE_SWP_UFFD_WP is set in pmd_flags(pmd). */
static inline int pmd_swp_uffd_wp(pmd_t pmd)
{
        int uffd_wp = pmd_flags(pmd) & _PAGE_SWP_UFFD_WP;

        return uffd_wp;
}

/* Return 'pmd' with _PAGE_SWP_UFFD_WP removed through pmd_clear_flags(). */
static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd)
{
        pmd = pmd_clear_flags(pmd, _PAGE_SWP_UFFD_WP);

        return pmd;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

/*
 * Extract the protection key from a set of PTE flags.
 *
 * Returns 0 when CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS is not set.
 */
static inline u16 pte_flags_pkey(unsigned long pte_flags)
{
        u16 pkey = 0;

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        /* ifdef to avoid doing 59-bit shift on 32-bit values */
        pkey = (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0;
#endif
        return pkey;
}

/*
 * Check 'pkey' against the value returned by read_pkru(): reads must be
 * allowed, and writes too when 'write' is true.
 */
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
        u32 pkru = read_pkru();

        return __pkru_allows_read(pkru, pkey) &&
               (!write || __pkru_allows_write(pkru, pkey));
}

/*
 * 'pteval' can come from a PTE, PMD or PUD.  Only _PAGE_PRESENT,
 * _PAGE_USER and _PAGE_RW are checked in here, and those have the
 * same value on all 3 types.
 *
 * Returns false when a required bit is missing; otherwise the result of
 * the protection-key check for the key encoded in 'pteval'.
 */
static inline bool __pte_access_permitted(unsigned long pteval, bool write)
{
        unsigned long required = _PAGE_PRESENT|_PAGE_USER;

        /*
         * Write=0,Dirty=1 PTEs are shadow stack, which the kernel
         * shouldn't generally allow access to.  They are already
         * Write=0, so requiring _PAGE_RW for writes covers both cases.
         */
        if (write)
                required |= _PAGE_RW;

        if ((pteval & required) != required)
                return false;

        return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
}

#define pte_access_permitted pte_access_permitted
/* PTE front end: forwards pte_val(pte) to __pte_access_permitted(). */
static inline bool pte_access_permitted(pte_t pte, bool write)
{
        bool permitted = __pte_access_permitted(pte_val(pte), write);

        return permitted;
}

#define pmd_access_permitted pmd_access_permitted
/* PMD front end: forwards pmd_val(pmd) to __pte_access_permitted(). */
static inline bool pmd_access_permitted(pmd_t pmd, bool write)
{
        bool permitted = __pte_access_permitted(pmd_val(pmd), write);

        return permitted;
}

#define pud_access_permitted pud_access_permitted
/* PUD front end: forwards pud_val(pud) to __pte_access_permitted(). */
static inline bool pud_access_permitted(pud_t pud, bool write)
{
        bool permitted = __pte_access_permitted(pud_val(pud), write);

        return permitted;
}

/* Declaration only; takes a pfn and a protection and returns bool. */
#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1
extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot);

/* True when the boot CPU reports X86_BUG_L1TF. */
static inline bool arch_has_pfn_modify_check(void)
{
        bool has_l1tf = boot_cpu_has_bug(X86_BUG_L1TF);

        return has_l1tf;
}

/* Declarations only; each takes the vma and the entry value. */
#define arch_check_zapped_pte arch_check_zapped_pte
void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte);

#define arch_check_zapped_pmd arch_check_zapped_pmd
void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd);

#ifdef CONFIG_XEN_PV
#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
static inline bool arch_has_hw_nonleaf_pmd_young(void)
{
        return !cpu_feature_enabled(X86_FEATURE_XENPV);
}
#endif

#ifdef CONFIG_PAGE_TABLE_CHECK
/* True when both _PAGE_PRESENT and _PAGE_USER are set in the PTE value. */
static inline bool pte_user_accessible_page(pte_t pte)
{
        if (!(pte_val(pte) & _PAGE_PRESENT))
                return false;

        return (pte_val(pte) & _PAGE_USER) != 0;
}

/*
 * True when pmd_leaf() holds and both _PAGE_PRESENT and _PAGE_USER are
 * set in the PMD value.
 */
static inline bool pmd_user_accessible_page(pmd_t pmd)
{
        if (!pmd_leaf(pmd))
                return false;

        if (!(pmd_val(pmd) & _PAGE_PRESENT))
                return false;

        return (pmd_val(pmd) & _PAGE_USER) != 0;
}

/*
 * True when pud_leaf() holds and both _PAGE_PRESENT and _PAGE_USER are
 * set in the PUD value.
 */
static inline bool pud_user_accessible_page(pud_t pud)
{
        if (!pud_leaf(pud))
                return false;

        if (!(pud_val(pud) & _PAGE_PRESENT))
                return false;

        return (pud_val(pud) & _PAGE_USER) != 0;
}
#endif

#ifdef CONFIG_X86_SGX
/* Declaration only; takes a pfn and flags and returns int. */
int arch_memory_failure(unsigned long pfn, int flags);
#define arch_memory_failure arch_memory_failure

/* Declaration only; takes a physical address and returns bool. */
bool arch_is_platform_page(u64 paddr);
#define arch_is_platform_page arch_is_platform_page
#endif

#endif        /* __ASSEMBLY__ */

#endif /* _ASM_X86_PGTABLE_H */



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Portions of this file
 * Copyright(c) 2016-2017 Intel Deutschland GmbH
 * Copyright (C) 2018 - 2024 Intel Corporation
 */

#if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ)
#define __MAC80211_DRIVER_TRACE

#include <linux/tracepoint.h>
#include <net/mac80211.h>
#include "ieee80211_i.h"

#undef TRACE_SYSTEM
#define TRACE_SYSTEM mac80211

/*
 * Per-device helpers: record and print the wiphy name of the device an
 * event belongs to.  LOCAL_ASSIGN expects a pointer named "local" in the
 * enclosing scope.
 *
 * MAXNAME is the size in bytes of the recorded name buffer.  It sizes the
 * array in LOCAL_ENTRY and bounds the copy in LOCAL_ASSIGN, so the two can
 * never disagree.
 */
#define MAXNAME                32
#define LOCAL_ENTRY        __array(char, wiphy_name, MAXNAME)
#define LOCAL_ASSIGN        strscpy(__entry->wiphy_name, wiphy_name(local->hw.wiphy), MAXNAME)
#define LOCAL_PR_FMT        "%s"
#define LOCAL_PR_ARG        __entry->wiphy_name

/*
 * Station address helpers.
 *
 * STA_ENTRY reserves ETH_ALEN bytes for the station address.
 * STA_ASSIGN reads a pointer named "sta" from the enclosing scope: a
 * non-NULL station has its addr copied, a NULL station is handed to
 * eth_zero_addr() instead.  The memcpy() result is cast to void so that
 * both arms of the conditional have type void; mixing a void * arm with a
 * void arm is only accepted as a compiler extension.
 * STA_NAMED_ASSIGN(s) copies from an explicitly named station pointer and
 * does not check it for NULL.
 */
#define STA_ENTRY        __array(char, sta_addr, ETH_ALEN)
#define STA_ASSIGN        (sta ? (void)memcpy(__entry->sta_addr, sta->addr, ETH_ALEN) : \
                                eth_zero_addr(__entry->sta_addr))
#define STA_NAMED_ASSIGN(s)        memcpy(__entry->sta_addr, (s)->addr, ETH_ALEN)
#define STA_PR_FMT        " sta:%pM"
#define STA_PR_ARG        __entry->sta_addr

/*
 * Virtual interface helpers.  VIF_ENTRY and VIF_ASSIGN expect a pointer
 * named "sdata" in the enclosing scope and record the interface type, the
 * sdata pointer value, the p2p flag and the interface name.
 *
 * VIF_ASSIGN expands to several statements and has no trailing semicolon;
 * the user supplies it.  The output is " vif:<name>(<type>)", with "/p2p"
 * appended after the type when the p2p flag is set.
 */
#define VIF_ENTRY        __field(enum nl80211_iftype, vif_type) __field(void *, sdata)        \
                        __field(bool, p2p)                                                \
                        __string(vif_name, sdata->name)
#define VIF_ASSIGN        __entry->vif_type = sdata->vif.type; __entry->sdata = sdata;        \
                        __entry->p2p = sdata->vif.p2p;                                        \
                        __assign_str(vif_name)
#define VIF_PR_FMT        " vif:%s(%d%s)"
#define VIF_PR_ARG        __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : ""

/*
 * Channel definition helpers.
 *
 * CHANDEF_ENTRY declares the recorded fields, CHANDEF_ASSIGN(c) fills them
 * from the channel definition pointer c, and CHANDEF_PR_FMT/CHANDEF_PR_ARG
 * print them.  A NULL c records 0 in every field; a NULL c->chan records 0
 * for the control frequency and its offset only.
 *
 * CHANDEF_ASSIGN expands to several statements and supplies its own
 * trailing semicolon.
 */
#define CHANDEF_ENTRY                                                        \
        __field(u32, control_freq)                                        \
        __field(u32, freq_offset)                                        \
        __field(u32, chan_width)                                        \
        __field(u32, center_freq1)                                        \
        __field(u32, freq1_offset)                                        \
        __field(u32, center_freq2)
#define CHANDEF_ASSIGN(c)                                                                \
        __entry->control_freq = ((c) && (c)->chan) ? (c)->chan->center_freq : 0;        \
        __entry->freq_offset = ((c) && (c)->chan) ? (c)->chan->freq_offset : 0;                \
        __entry->chan_width = (c) ? (c)->width : 0;                                        \
        __entry->center_freq1 = (c) ? (c)->center_freq1 : 0;                                \
        __entry->freq1_offset = (c) ? (c)->freq1_offset : 0;                                \
        __entry->center_freq2 = (c) ? (c)->center_freq2 : 0;
#define CHANDEF_PR_FMT                                                                \
        " chandef(%d.%03d MHz,width:%d,center: %d.%03d/%d MHz)"
#define CHANDEF_PR_ARG                                                                \
        __entry->control_freq, __entry->freq_offset,                                \
        __entry->chan_width,                                                        \
        __entry->center_freq1, __entry->freq1_offset,                                \
        __entry->center_freq2

/*
 * Minimum channel definition helpers, printed as " mindef(...)".
 *
 * The field set mirrors the plain channel definition with a min_ prefix.
 * MIN_CHANDEF_ASSIGN(c) dereferences c unconditionally; only c->chan may
 * be NULL, in which case the control frequency and its offset are
 * recorded as 0.  The macro expands to several statements and carries its
 * own trailing semicolon.
 */
#define MIN_CHANDEF_ENTRY                                                \
        __field(u32, min_control_freq)                                        \
        __field(u32, min_freq_offset)                                        \
        __field(u32, min_chan_width)                                        \
        __field(u32, min_center_freq1)                                        \
        __field(u32, min_freq1_offset)                                        \
        __field(u32, min_center_freq2)

#define MIN_CHANDEF_ASSIGN(c)                                                        \
        __entry->min_control_freq = !(c)->chan ? 0 : (c)->chan->center_freq;        \
        __entry->min_freq_offset = !(c)->chan ? 0 : (c)->chan->freq_offset;        \
        __entry->min_chan_width = (c)->width;                                        \
        __entry->min_center_freq1 = (c)->center_freq1;                                \
        __entry->min_freq1_offset = (c)->freq1_offset;                                \
        __entry->min_center_freq2 = (c)->center_freq2;
#define MIN_CHANDEF_PR_FMT                                                        \
        " mindef(%d.%03d MHz,width:%d,center: %d.%03d/%d MHz)"
#define MIN_CHANDEF_PR_ARG                                                        \
        __entry->min_control_freq, __entry->min_freq_offset,                        \
        __entry->min_chan_width,                                                \
        __entry->min_center_freq1, __entry->min_freq1_offset,                        \
        __entry->min_center_freq2

/*
 * AP channel definition helpers, printed as " ap(...)".
 *
 * The field set mirrors the plain channel definition with an ap_ prefix.
 * AP_CHANDEF_ASSIGN(c) dereferences c unconditionally and treats a NULL
 * c->chan as "no definition": every field is then recorded as 0.  The
 * macro expands to several statements and carries its own trailing
 * semicolon.
 */
#define AP_CHANDEF_ENTRY                                                \
        __field(u32, ap_control_freq)                                        \
        __field(u32, ap_freq_offset)                                        \
        __field(u32, ap_chan_width)                                        \
        __field(u32, ap_center_freq1)                                        \
        __field(u32, ap_freq1_offset)                                        \
        __field(u32, ap_center_freq2)

#define AP_CHANDEF_ASSIGN(c)                                                        \
        __entry->ap_control_freq = !(c)->chan ? 0 : (c)->chan->center_freq;        \
        __entry->ap_freq_offset = !(c)->chan ? 0 : (c)->chan->freq_offset;        \
        __entry->ap_chan_width = !(c)->chan ? 0 : (c)->width;                        \
        __entry->ap_center_freq1 = !(c)->chan ? 0 : (c)->center_freq1;                \
        __entry->ap_freq1_offset = !(c)->chan ? 0 : (c)->freq1_offset;                \
        __entry->ap_center_freq2 = !(c)->chan ? 0 : (c)->center_freq2;
#define AP_CHANDEF_PR_FMT                                                        \
        " ap(%d.%03d MHz,width:%d,center: %d.%03d/%d MHz)"
#define AP_CHANDEF_PR_ARG                                                        \
        __entry->ap_control_freq, __entry->ap_freq_offset,                        \
        __entry->ap_chan_width,                                                        \
        __entry->ap_center_freq1, __entry->ap_freq1_offset,                        \
        __entry->ap_center_freq2

/*
 * Channel context helpers.  CHANCTX_ASSIGN expects a pointer named "ctx"
 * in the enclosing scope and records ctx->conf.def, ctx->conf.min_def and
 * ctx->conf.ap through the three channel definition helpers, followed by
 * the static and dynamic RX chain counts.
 *
 * The three *_CHANDEF_ASSIGN invocations are chained without separators,
 * which relies on each of those macros ending in its own semicolon.
 * CHANCTX_ASSIGN itself has no trailing semicolon; the user supplies it.
 */
#define CHANCTX_ENTRY        CHANDEF_ENTRY                                                        \
                        MIN_CHANDEF_ENTRY                                                \
                        AP_CHANDEF_ENTRY                                                \
                        __field(u8, rx_chains_static)                                        \
                        __field(u8, rx_chains_dynamic)
#define CHANCTX_ASSIGN        CHANDEF_ASSIGN(&ctx->conf.def)                                        \
                        MIN_CHANDEF_ASSIGN(&ctx->conf.min_def)                                \
                        AP_CHANDEF_ASSIGN(&ctx->conf.ap)                                \
                        __entry->rx_chains_static = ctx->conf.rx_chains_static;                \
                        __entry->rx_chains_dynamic = ctx->conf.rx_chains_dynamic
#define CHANCTX_PR_FMT        CHANDEF_PR_FMT MIN_CHANDEF_PR_FMT AP_CHANDEF_PR_FMT " chains:%d/%d"
#define CHANCTX_PR_ARG        CHANDEF_PR_ARG,        MIN_CHANDEF_PR_ARG, AP_CHANDEF_PR_ARG,                \
                        __entry->rx_chains_static, __entry->rx_chains_dynamic

/*
 * Key configuration helpers.  KEY_ASSIGN(k) copies cipher, flags, keyidx
 * and hw_key_idx from the key configuration pointer k; it expands to
 * several statements and carries its own trailing semicolon.
 *
 * flags is recorded as u16 so that flag bits above bit 7 are not silently
 * truncated in the trace record.  It is placed directly after the u32
 * cipher so the four fields pack without padding.
 * NOTE(review): assumes the flags member of the key configuration is
 * 16 bits wide - confirm against struct ieee80211_key_conf in this tree.
 * Widening is harmless if it is narrower.
 */
#define KEY_ENTRY        __field(u32, cipher)                                                \
                        __field(u16, flags)                                                \
                        __field(u8, hw_key_idx)                                                \
                        __field(s8, keyidx)
#define KEY_ASSIGN(k)        __entry->cipher = (k)->cipher;                                        \
                        __entry->flags = (k)->flags;                                        \
                        __entry->keyidx = (k)->keyidx;                                        \
                        __entry->hw_key_idx = (k)->hw_key_idx;
#define KEY_PR_FMT        " cipher:0x%x, flags=%#x, keyidx=%d, hw_key_idx=%d"
#define KEY_PR_ARG        __entry->cipher, __entry->flags, __entry->keyidx, __entry->hw_key_idx

/*
 * A-MPDU action helpers.  AMPDU_ACTION_ASSIGN expects a pointer named
 * "params" in the enclosing scope and records the station address, tid,
 * ssn, buf_size, amsdu, timeout and action from it.
 *
 * AMPDU_ACTION_ENTRY declares two fields for the action: the enum-typed
 * ieee80211_ampdu_mlme_action and the u16 action.  Only the u16 one is
 * printed, but both are assigned so that the enum-typed field never
 * carries stale buffer contents in the raw trace record.
 *
 * AMPDU_ACTION_ASSIGN expands to several statements and carries its own
 * trailing semicolon.
 */
#define AMPDU_ACTION_ENTRY        __field(enum ieee80211_ampdu_mlme_action,                \
                                        ieee80211_ampdu_mlme_action)                        \
                                STA_ENTRY                                                \
                                __field(u16, tid)                                        \
                                __field(u16, ssn)                                        \
                                __field(u16, buf_size)                                        \
                                __field(bool, amsdu)                                        \
                                __field(u16, timeout)                                        \
                                __field(u16, action)
#define AMPDU_ACTION_ASSIGN        STA_NAMED_ASSIGN(params->sta);                                \
                                __entry->ieee80211_ampdu_mlme_action = params->action;        \
                                __entry->tid = params->tid;                                \
                                __entry->ssn = params->ssn;                                \
                                __entry->buf_size = params->buf_size;                        \
                                __entry->amsdu = params->amsdu;                                \
                                __entry->timeout = params->timeout;                        \
                                __entry->action = params->action;
#define AMPDU_ACTION_PR_FMT        STA_PR_FMT " tid %d, ssn %d, buf_size %u, amsdu %d, timeout %d action %d"
#define AMPDU_ACTION_PR_ARG        STA_PR_ARG, __entry->tid, __entry->ssn,                        \
                                __entry->buf_size, __entry->amsdu, __entry->timeout,        \
                                __entry->action

/*
 * Tracing for driver callbacks.
 */

/*
 * Event class for tracepoints whose only argument is the ieee80211_local.
 * The entry holds just the LOCAL_ENTRY fields, filled by LOCAL_ASSIGN,
 * and the output is LOCAL_PR_FMT with LOCAL_PR_ARG.
 */
DECLARE_EVENT_CLASS(local_only_evt,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local),
        TP_STRUCT__entry(
                LOCAL_ENTRY
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
        ),
        TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG)
);

/*
 * Event class taking the local and an interface (sdata).
 * In addition to the LOCAL_ENTRY and VIF_ENTRY fields it stores a copy of
 * sdata->vif.addr (ETH_ALEN bytes) and prints it with %pM.
 */
DECLARE_EVENT_CLASS(local_sdata_addr_evt,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __array(char, addr, ETH_ALEN)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                memcpy(__entry->addr, sdata->vif.addr, ETH_ALEN);
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT " addr:%pM",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->addr
        )
);

/*
 * Event class taking the local and a single u32.
 * The u32 is stored in the 'value' field and printed as " value:%d".
 * NOTE(review): the field is u32 but the conversion is the signed %d, so
 * values above INT_MAX appear negative in the output - confirm whether
 * that is intended before changing it, since the format is user visible.
 */
DECLARE_EVENT_CLASS(local_u32_evt,
        TP_PROTO(struct ieee80211_local *local, u32 value),
        TP_ARGS(local, value),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u32, value)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->value = value;
        ),

        TP_printk(
                LOCAL_PR_FMT " value:%d",
                LOCAL_PR_ARG, __entry->value
        )
);

/*
 * Event class taking the local and an interface (sdata).
 * Records only the LOCAL_ENTRY and VIF_ENTRY fields and prints them with
 * LOCAL_PR_FMT followed by VIF_PR_FMT.
 */
DECLARE_EVENT_CLASS(local_sdata_evt,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG
        )
);

/*
 * drv_return_void: instance of local_only_evt; carries no value besides
 * the local.
 */
DEFINE_EVENT(local_only_evt, drv_return_void,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * drv_return_int: records the local plus an int 'ret', printed as
 * " - %d" after the local fragment.
 */
TRACE_EVENT(drv_return_int,
        TP_PROTO(struct ieee80211_local *local, int ret),
        TP_ARGS(local, ret),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(int, ret)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->ret = ret;
        ),
        TP_printk(LOCAL_PR_FMT " - %d", LOCAL_PR_ARG, __entry->ret)
);

/*
 * drv_return_bool: records the local plus a bool 'ret', printed as the
 * literal string "true" or "false".
 */
TRACE_EVENT(drv_return_bool,
        TP_PROTO(struct ieee80211_local *local, bool ret),
        TP_ARGS(local, ret),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(bool, ret)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->ret = ret;
        ),
        TP_printk(LOCAL_PR_FMT " - %s", LOCAL_PR_ARG, (__entry->ret) ?
                  "true" : "false")
);

/*
 * drv_return_u32: records the local plus a u32 'ret', printed as
 * " - %u".
 */
TRACE_EVENT(drv_return_u32,
        TP_PROTO(struct ieee80211_local *local, u32 ret),
        TP_ARGS(local, ret),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u32, ret)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->ret = ret;
        ),
        TP_printk(LOCAL_PR_FMT " - %u", LOCAL_PR_ARG, __entry->ret)
);

/*
 * drv_return_u64: records the local plus a u64 'ret', printed as
 * " - %llu".
 */
TRACE_EVENT(drv_return_u64,
        TP_PROTO(struct ieee80211_local *local, u64 ret),
        TP_ARGS(local, ret),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u64, ret)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->ret = ret;
        ),
        TP_printk(LOCAL_PR_FMT " - %llu", LOCAL_PR_ARG, __entry->ret)
);

/* drv_start: instance of local_only_evt; records only the local. */
DEFINE_EVENT(local_only_evt, drv_start,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * drv_get_et_strings: instance of local_u32_evt. The u32 argument is
 * named 'sset' here; the class stores and prints it as 'value'.
 */
DEFINE_EVENT(local_u32_evt, drv_get_et_strings,
             TP_PROTO(struct ieee80211_local *local, u32 sset),
             TP_ARGS(local, sset)
);

/*
 * drv_get_et_sset_count: instance of local_u32_evt. The u32 argument is
 * named 'sset' here; the class stores and prints it as 'value'.
 */
DEFINE_EVENT(local_u32_evt, drv_get_et_sset_count,
             TP_PROTO(struct ieee80211_local *local, u32 sset),
             TP_ARGS(local, sset)
);

/* drv_get_et_stats: instance of local_only_evt; records only the local. */
DEFINE_EVENT(local_only_evt, drv_get_et_stats,
             TP_PROTO(struct ieee80211_local *local),
             TP_ARGS(local)
);

/* drv_suspend: instance of local_only_evt; records only the local. */
DEFINE_EVENT(local_only_evt, drv_suspend,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/* drv_resume: instance of local_only_evt; records only the local. */
DEFINE_EVENT(local_only_evt, drv_resume,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * drv_set_wakeup: records the local plus the bool 'enabled', printed as
 * " enabled:%d" (so 0 or 1).
 */
TRACE_EVENT(drv_set_wakeup,
        TP_PROTO(struct ieee80211_local *local, bool enabled),
        TP_ARGS(local, enabled),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(bool, enabled)
        ),
        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->enabled = enabled;
        ),
        TP_printk(LOCAL_PR_FMT " enabled:%d", LOCAL_PR_ARG, __entry->enabled)
);

/* drv_stop: instance of local_only_evt; records only the local. */
DEFINE_EVENT(local_only_evt, drv_stop,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * drv_add_interface: instance of local_sdata_addr_evt; records the local,
 * the interface and a copy of the interface address.
 */
DEFINE_EVENT(local_sdata_addr_evt, drv_add_interface,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_change_interface: records the local, the interface, the requested
 * interface type (stored as u32 'new_type') and the 'p2p' flag.
 * The output shows the numeric type and appends "/p2p" when new_p2p is
 * set.
 */
TRACE_EVENT(drv_change_interface,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 enum nl80211_iftype type, bool p2p),

        TP_ARGS(local, sdata, type, p2p),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u32, new_type)
                __field(bool, new_p2p)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->new_type = type;
                __entry->new_p2p = p2p;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT " new type:%d%s",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->new_type,
                __entry->new_p2p ? "/p2p" : ""
        )
);

/*
 * drv_remove_interface: instance of local_sdata_addr_evt; records the
 * local, the interface and a copy of the interface address.
 */
DEFINE_EVENT(local_sdata_addr_evt, drv_remove_interface,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_config: records the 'changed' mask together with a snapshot of
 * local->hw.conf (flags, power_level, dynamic_ps_timeout, listen_interval,
 * long/short frame max tx counts, chandef and smps_mode).
 *
 * Only 'changed' and the channel definition appear in the printed text;
 * the remaining fields are stored in the record but not printed.
 * CHANDEF_ASSIGN is used without a trailing ';' - presumably the macro
 * supplies its own terminator (it is defined elsewhere).
 */
TRACE_EVENT(drv_config,
        TP_PROTO(struct ieee80211_local *local,
                 u32 changed),

        TP_ARGS(local, changed),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u32, changed)
                __field(u32, flags)
                __field(int, power_level)
                __field(int, dynamic_ps_timeout)
                __field(u16, listen_interval)
                __field(u8, long_frame_max_tx_count)
                __field(u8, short_frame_max_tx_count)
                CHANDEF_ENTRY
                __field(int, smps)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->changed = changed;
                __entry->flags = local->hw.conf.flags;
                __entry->power_level = local->hw.conf.power_level;
                __entry->dynamic_ps_timeout = local->hw.conf.dynamic_ps_timeout;
                __entry->listen_interval = local->hw.conf.listen_interval;
                __entry->long_frame_max_tx_count =
                        local->hw.conf.long_frame_max_tx_count;
                __entry->short_frame_max_tx_count =
                        local->hw.conf.short_frame_max_tx_count;
                CHANDEF_ASSIGN(&local->hw.conf.chandef)
                __entry->smps = local->hw.conf.smps_mode;
        ),

        TP_printk(
                LOCAL_PR_FMT " ch:%#x" CHANDEF_PR_FMT,
                LOCAL_PR_ARG, __entry->changed, CHANDEF_PR_ARG
        )
);

/*
 * drv_vif_cfg_changed: records the 64-bit 'changed' mask together with a
 * snapshot of sdata->vif.cfg (assoc, ibss_joined, ibss_creator, aid, ps,
 * s1g, idle, the ARP address list and the SSID).
 *
 * The ARP list is stored in a dynamic array whose element count is
 * arp_addr_cnt clamped to IEEE80211_BSS_ARP_ADDR_LIST_LEN; the same clamp
 * is repeated in the memcpy length so the copy never exceeds the reserved
 * space. The 'arp_addr_cnt' field itself stores the unclamped count.
 * The SSID is stored in a dynamic array of ssid_len bytes.
 *
 * Only 'changed' appears in the printed text; everything else is stored
 * in the record but not printed.
 */
TRACE_EVENT(drv_vif_cfg_changed,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 u64 changed),

        TP_ARGS(local, sdata, changed),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u64, changed)
                __field(bool, assoc)
                __field(bool, ibss_joined)
                __field(bool, ibss_creator)
                __field(u16, aid)
                __dynamic_array(u32, arp_addr_list,
                                sdata->vif.cfg.arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ?
                                        IEEE80211_BSS_ARP_ADDR_LIST_LEN :
                                        sdata->vif.cfg.arp_addr_cnt)
                __field(int, arp_addr_cnt)
                __dynamic_array(u8, ssid, sdata->vif.cfg.ssid_len)
                __field(int, s1g)
                __field(bool, idle)
                __field(bool, ps)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->changed = changed;
                __entry->aid = sdata->vif.cfg.aid;
                __entry->assoc = sdata->vif.cfg.assoc;
                __entry->ibss_joined = sdata->vif.cfg.ibss_joined;
                __entry->ibss_creator = sdata->vif.cfg.ibss_creator;
                __entry->ps = sdata->vif.cfg.ps;

                __entry->arp_addr_cnt = sdata->vif.cfg.arp_addr_cnt;
                memcpy(__get_dynamic_array(arp_addr_list),
                       sdata->vif.cfg.arp_addr_list,
                       sizeof(u32) * (sdata->vif.cfg.arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ?
                                        IEEE80211_BSS_ARP_ADDR_LIST_LEN :
                                        sdata->vif.cfg.arp_addr_cnt));
                memcpy(__get_dynamic_array(ssid),
                       sdata->vif.cfg.ssid,
                       sdata->vif.cfg.ssid_len);
                __entry->s1g = sdata->vif.cfg.s1g;
                __entry->idle = sdata->vif.cfg.idle;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT " changed:%#llx",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->changed
        )
);

/*
 * drv_link_info_changed: records the 64-bit 'changed' mask together with
 * a snapshot of many members of *link_conf (link id, protection/preamble/
 * slot flags, beacon settings, sync values, rates, CQM thresholds, the
 * operating channel width/center frequency from chanreq.oper, qos,
 * hidden_ssid, txpower and the P2P oppps_ctwindow).
 *
 * mcast_rate is copied as a whole array; the copy length is the size of
 * the entry's own array (NUM_NL80211_BANDS ints).
 *
 * Only link_id and 'changed' appear in the printed text; the remaining
 * fields are stored in the record but not printed.
 */
TRACE_EVENT(drv_link_info_changed,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_bss_conf *link_conf,
                 u64 changed),

        TP_ARGS(local, sdata, link_conf, changed),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u64, changed)
                __field(int, link_id)
                __field(bool, cts)
                __field(bool, shortpre)
                __field(bool, shortslot)
                __field(bool, enable_beacon)
                __field(u8, dtimper)
                __field(u16, bcnint)
                __field(u16, assoc_cap)
                __field(u64, sync_tsf)
                __field(u32, sync_device_ts)
                __field(u8, sync_dtim_count)
                __field(u32, basic_rates)
                __array(int, mcast_rate, NUM_NL80211_BANDS)
                __field(u16, ht_operation_mode)
                __field(s32, cqm_rssi_thold)
                __field(s32, cqm_rssi_hyst)
                __field(u32, channel_width)
                __field(u32, channel_cfreq1)
                __field(u32, channel_cfreq1_offset)
                __field(bool, qos)
                __field(bool, hidden_ssid)
                __field(int, txpower)
                __field(u8, p2p_oppps_ctwindow)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->changed = changed;
                __entry->link_id = link_conf->link_id;
                __entry->shortpre = link_conf->use_short_preamble;
                __entry->cts = link_conf->use_cts_prot;
                __entry->shortslot = link_conf->use_short_slot;
                __entry->enable_beacon = link_conf->enable_beacon;
                __entry->dtimper = link_conf->dtim_period;
                __entry->bcnint = link_conf->beacon_int;
                __entry->assoc_cap = link_conf->assoc_capability;
                __entry->sync_tsf = link_conf->sync_tsf;
                __entry->sync_device_ts = link_conf->sync_device_ts;
                __entry->sync_dtim_count = link_conf->sync_dtim_count;
                __entry->basic_rates = link_conf->basic_rates;
                memcpy(__entry->mcast_rate, link_conf->mcast_rate,
                       sizeof(__entry->mcast_rate));
                __entry->ht_operation_mode = link_conf->ht_operation_mode;
                __entry->cqm_rssi_thold = link_conf->cqm_rssi_thold;
                __entry->cqm_rssi_hyst = link_conf->cqm_rssi_hyst;
                __entry->channel_width = link_conf->chanreq.oper.width;
                __entry->channel_cfreq1 = link_conf->chanreq.oper.center_freq1;
                __entry->channel_cfreq1_offset = link_conf->chanreq.oper.freq1_offset;
                __entry->qos = link_conf->qos;
                __entry->hidden_ssid = link_conf->hidden_ssid;
                __entry->txpower = link_conf->txpower;
                __entry->p2p_oppps_ctwindow = link_conf->p2p_noa_attr.oppps_ctwindow;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT " link_id:%d, changed:%#llx",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id,
                __entry->changed
        )
);

/*
 * drv_prepare_multicast: records the local plus the int 'mc_count',
 * printed as " prepare mc (%d)".
 */
TRACE_EVENT(drv_prepare_multicast,
        TP_PROTO(struct ieee80211_local *local, int mc_count),

        TP_ARGS(local, mc_count),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(int, mc_count)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->mc_count = mc_count;
        ),

        TP_printk(
                LOCAL_PR_FMT " prepare mc (%d)",
                LOCAL_PR_ARG, __entry->mc_count
        )
);

/*
 * drv_configure_filter: records the changed flags, the value pointed to
 * by total_flags, and the 64-bit 'multicast' argument.
 *
 * total_flags is dereferenced unconditionally in the assign step, so it
 * must be a valid pointer whenever the tracepoint is enabled.
 * 'multicast' is stored in the record but not printed.
 */
TRACE_EVENT(drv_configure_filter,
        TP_PROTO(struct ieee80211_local *local,
                 unsigned int changed_flags,
                 unsigned int *total_flags,
                 u64 multicast),

        TP_ARGS(local, changed_flags, total_flags, multicast),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(unsigned int, changed)
                __field(unsigned int, total)
                __field(u64, multicast)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->changed = changed_flags;
                __entry->total = *total_flags;
                __entry->multicast = multicast;
        ),

        TP_printk(
                LOCAL_PR_FMT " changed:%#x total:%#x",
                LOCAL_PR_ARG, __entry->changed, __entry->total
        )
);

/*
 * drv_config_iface_filter: records the local, the interface and the two
 * unsigned flag words filter_flags and changed_flags; both are printed in
 * hexadecimal.
 */
TRACE_EVENT(drv_config_iface_filter,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 unsigned int filter_flags,
                 unsigned int changed_flags),

        TP_ARGS(local, sdata, filter_flags, changed_flags),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(unsigned int, filter_flags)
                __field(unsigned int, changed_flags)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->filter_flags = filter_flags;
                __entry->changed_flags = changed_flags;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT
                " filter_flags: %#x changed_flags: %#x",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->filter_flags,
                __entry->changed_flags
        )
);

/*
 * drv_set_tim: records the local, the station (via STA_ENTRY/STA_ASSIGN,
 * which use the argument named 'sta') and the bool 'set', printed as
 * " set:%d".
 */
TRACE_EVENT(drv_set_tim,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta, bool set),

        TP_ARGS(local, sta, set),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                STA_ENTRY
                __field(bool, set)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                STA_ASSIGN;
                __entry->set = set;
        ),

        TP_printk(
                LOCAL_PR_FMT STA_PR_FMT " set:%d",
                LOCAL_PR_ARG, STA_PR_ARG, __entry->set
        )
);

/*
 * drv_set_key: records the local, the interface, the station, the command
 * (stored as u32 'cmd') and the key fields copied by KEY_ASSIGN(key)
 * (cipher, flags, keyidx, hw_key_idx).
 *
 * KEY_ASSIGN already ends in ';', so the extra ';' after it is an empty
 * statement.
 */
TRACE_EVENT(drv_set_key,
        TP_PROTO(struct ieee80211_local *local,
                 enum set_key_cmd cmd, struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta,
                 struct ieee80211_key_conf *key),

        TP_ARGS(local, cmd, sdata, sta, key),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(u32, cmd)
                KEY_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->cmd = cmd;
                KEY_ASSIGN(key);
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " cmd: %d" KEY_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->cmd, KEY_PR_ARG
        )
);

/*
 * drv_update_tkip_key: records the local, the interface, the station and
 * the u32 'iv32', printed in hexadecimal.
 * The 'conf' argument is part of the prototype but nothing from it is
 * stored in the record.
 */
TRACE_EVENT(drv_update_tkip_key,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_key_conf *conf,
                 struct ieee80211_sta *sta, u32 iv32),

        TP_ARGS(local, sdata, conf, sta, iv32),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(u32, iv32)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->iv32 = iv32;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " iv32:%#x",
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->iv32
        )
);

/* drv_hw_scan: instance of local_sdata_evt; records local and interface. */
DEFINE_EVENT(local_sdata_evt, drv_hw_scan,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_cancel_hw_scan: instance of local_sdata_evt; records local and
 * interface.
 */
DEFINE_EVENT(local_sdata_evt, drv_cancel_hw_scan,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_sched_scan_start: instance of local_sdata_evt; records local and
 * interface.
 */
DEFINE_EVENT(local_sdata_evt, drv_sched_scan_start,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_sched_scan_stop: instance of local_sdata_evt; records local and
 * interface.
 */
DEFINE_EVENT(local_sdata_evt, drv_sched_scan_stop,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_sw_scan_start: records the local, the interface and a copy of the
 * ETH_ALEN-byte address passed as mac_addr, printed with %pM.
 *
 * mac_addr is copied unconditionally, so it must point to at least
 * ETH_ALEN readable bytes whenever the tracepoint is enabled.
 * Unlike the other events here, the format joins its fragments with
 * ", " separators.
 */
TRACE_EVENT(drv_sw_scan_start,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 const u8 *mac_addr),

        TP_ARGS(local, sdata, mac_addr),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __array(char, mac_addr, ETH_ALEN)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                memcpy(__entry->mac_addr, mac_addr, ETH_ALEN);
        ),

        TP_printk(LOCAL_PR_FMT ", " VIF_PR_FMT ", addr:%pM",
                  LOCAL_PR_ARG, VIF_PR_ARG, __entry->mac_addr)
);

/*
 * drv_sw_scan_complete: instance of local_sdata_evt; records local and
 * interface.
 */
DEFINE_EVENT(local_sdata_evt, drv_sw_scan_complete,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_get_stats: records 'ret' together with four counters read from
 * *stats (dot11ACKFailureCount, dot11RTSFailureCount, dot11FCSErrorCount,
 * dot11RTSSuccessCount).
 *
 * The counters are copied regardless of the value of 'ret', and stats is
 * dereferenced unconditionally. Only 'ret' is printed; the counters are
 * stored in the record but not printed.
 */
TRACE_EVENT(drv_get_stats,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_low_level_stats *stats,
                 int ret),

        TP_ARGS(local, stats, ret),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(int, ret)
                __field(unsigned int, ackfail)
                __field(unsigned int, rtsfail)
                __field(unsigned int, fcserr)
                __field(unsigned int, rtssucc)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->ret = ret;
                __entry->ackfail = stats->dot11ACKFailureCount;
                __entry->rtsfail = stats->dot11RTSFailureCount;
                __entry->fcserr = stats->dot11FCSErrorCount;
                __entry->rtssucc = stats->dot11RTSSuccessCount;
        ),

        TP_printk(
                LOCAL_PR_FMT " ret:%d",
                LOCAL_PR_ARG, __entry->ret
        )
);

/*
 * drv_get_key_seq: records the local and the key fields copied by
 * KEY_ASSIGN(key) (cipher, flags, keyidx, hw_key_idx), and prints them
 * with KEY_PR_FMT.
 */
TRACE_EVENT(drv_get_key_seq,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_key_conf *key),

        TP_ARGS(local, key),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                KEY_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                KEY_ASSIGN(key);
        ),

        TP_printk(
                LOCAL_PR_FMT KEY_PR_FMT,
                LOCAL_PR_ARG, KEY_PR_ARG
        )
);

/*
 * drv_set_frag_threshold: instance of local_u32_evt; records the local
 * and the u32 'value'.
 */
DEFINE_EVENT(local_u32_evt, drv_set_frag_threshold,
        TP_PROTO(struct ieee80211_local *local, u32 value),
        TP_ARGS(local, value)
);

/*
 * drv_set_rts_threshold: instance of local_u32_evt; records the local
 * and the u32 'value'.
 */
DEFINE_EVENT(local_u32_evt, drv_set_rts_threshold,
        TP_PROTO(struct ieee80211_local *local, u32 value),
        TP_ARGS(local, value)
);

/*
 * drv_set_coverage_class: records the local plus the signed 16-bit
 * 'value', printed as " value:%d".
 */
TRACE_EVENT(drv_set_coverage_class,
        TP_PROTO(struct ieee80211_local *local, s16 value),

        TP_ARGS(local, value),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(s16, value)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->value = value;
        ),

        TP_printk(
                LOCAL_PR_FMT " value:%d",
                LOCAL_PR_ARG, __entry->value
        )
);

/*
 * drv_sta_notify: records the local, the interface, the station and the
 * notify command (stored as u32 'cmd' and printed as a number).
 */
TRACE_EVENT(drv_sta_notify,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 enum sta_notify_cmd cmd,
                 struct ieee80211_sta *sta),

        TP_ARGS(local, sdata, cmd, sta),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(u32, cmd)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->cmd = cmd;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " cmd:%d",
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->cmd
        )
);

/*
 * drv_sta_state: records the local, the interface, the station and a
 * state transition. Both states are stored as u32 and printed as
 * " state: <old>-><new>" using their numeric values.
 */
TRACE_EVENT(drv_sta_state,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta,
                 enum ieee80211_sta_state old_state,
                 enum ieee80211_sta_state new_state),

        TP_ARGS(local, sdata, sta, old_state, new_state),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(u32, old_state)
                __field(u32, new_state)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->old_state = old_state;
                __entry->new_state = new_state;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " state: %d->%d",
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG,
                __entry->old_state, __entry->new_state
        )
);

/*
 * drv_sta_set_txpwr: records the local, the interface, the station and
 * the station's default-link tx power setting, read from
 * sta->deflink.txpwr.power (stored as s16) and sta->deflink.txpwr.type
 * (stored as u8).
 *
 * sta is dereferenced directly here, so it must be non-NULL whenever the
 * tracepoint is enabled.
 */
TRACE_EVENT(drv_sta_set_txpwr,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),

        TP_ARGS(local, sdata, sta),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(s16, txpwr)
                __field(u8, type)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->txpwr = sta->deflink.txpwr.power;
                __entry->type = sta->deflink.txpwr.type;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " txpwr: %d type %d",
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG,
                __entry->txpwr,  __entry->type
        )
);

/*
 * drv_sta_rc_update: records the local, the interface, the station and
 * the u32 'changed' mask, printed in hexadecimal.
 */
TRACE_EVENT(drv_sta_rc_update,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta,
                 u32 changed),

        TP_ARGS(local, sdata, sta, changed),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(u32, changed)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->changed = changed;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " changed: 0x%x",
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->changed
        )
);

/*
 * Event class taking the local, an interface and a station.
 * Records only the LOCAL_ENTRY, VIF_ENTRY and STA_ENTRY fields and prints
 * the three matching format fragments in that order.
 */
DECLARE_EVENT_CLASS(sta_event,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),

        TP_ARGS(local, sdata, sta),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG
        )
);

/*
 * drv_sta_statistics: instance of sta_event; records local, interface and
 * station.
 */
DEFINE_EVENT(sta_event, drv_sta_statistics,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),
        TP_ARGS(local, sdata, sta)
);

/* drv_sta_add: instance of sta_event; records local, interface, station. */
DEFINE_EVENT(sta_event, drv_sta_add,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),
        TP_ARGS(local, sdata, sta)
);

/*
 * drv_sta_remove: instance of sta_event; records local, interface and
 * station.
 */
DEFINE_EVENT(sta_event, drv_sta_remove,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),
        TP_ARGS(local, sdata, sta)
);

/*
 * drv_sta_pre_rcu_remove: instance of sta_event; records local, interface
 * and station.
 */
DEFINE_EVENT(sta_event, drv_sta_pre_rcu_remove,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),
        TP_ARGS(local, sdata, sta)
);

/*
 * drv_sync_rx_queues: instance of sta_event; records local, interface and
 * station.
 */
DEFINE_EVENT(sta_event, drv_sync_rx_queues,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),
        TP_ARGS(local, sdata, sta)
);

/*
 * drv_sta_rate_tbl_update: instance of sta_event; records local,
 * interface and station.
 */
DEFINE_EVENT(sta_event, drv_sta_rate_tbl_update,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),
        TP_ARGS(local, sdata, sta)
);

/*
 * drv_conf_tx: records the local, the interface, the link id, the access
 * category 'ac' and the queue parameters read from *params (txop, cw_min,
 * cw_max, aifs, uapsd).
 *
 * Only link_id and ac appear in the printed text; the queue parameters
 * are stored in the record but not printed.
 * NOTE(review): link_id is unsigned int but is printed with %d.
 */
TRACE_EVENT(drv_conf_tx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 unsigned int link_id,
                 u16 ac, const struct ieee80211_tx_queue_params *params),

        TP_ARGS(local, sdata, link_id, ac, params),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(unsigned int, link_id)
                __field(u16, ac)
                __field(u16, txop)
                __field(u16, cw_min)
                __field(u16, cw_max)
                __field(u8, aifs)
                __field(bool, uapsd)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->link_id = link_id;
                __entry->ac = ac;
                __entry->txop = params->txop;
                __entry->cw_max = params->cw_max;
                __entry->cw_min = params->cw_min;
                __entry->aifs = params->aifs;
                __entry->uapsd = params->uapsd;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  " link_id: %d, AC:%d",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id, __entry->ac
        )
);

/* drv_get_tsf: instance of local_sdata_evt; records local and interface. */
DEFINE_EVENT(local_sdata_evt, drv_get_tsf,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_set_tsf: records the local, the interface and the u64 'tsf' value,
 * printed with %llu (the argument is cast to unsigned long long to match
 * the conversion).
 */
TRACE_EVENT(drv_set_tsf,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 u64 tsf),

        TP_ARGS(local, sdata, tsf),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u64, tsf)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->tsf = tsf;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  " tsf:%llu",
                LOCAL_PR_ARG, VIF_PR_ARG, (unsigned long long)__entry->tsf
        )
);

/*
 * drv_offset_tsf: records the local, the interface and the signed 64-bit
 * TSF offset (stored as s64 'tsf_offset').
 *
 * The offset is printed with the signed conversion %lld, so the argument
 * is cast to (long long). It was previously cast to (unsigned long long),
 * which did not match the conversion specifier.
 */
TRACE_EVENT(drv_offset_tsf,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 s64 offset),

        TP_ARGS(local, sdata, offset),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(s64, tsf_offset)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->tsf_offset = offset;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  " tsf offset:%lld",
                LOCAL_PR_ARG, VIF_PR_ARG,
                (long long)__entry->tsf_offset
        )
);

/*
 * drv_reset_tsf: instance of local_sdata_evt; records local and
 * interface.
 */
DEFINE_EVENT(local_sdata_evt, drv_reset_tsf,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/* drv_tx_last_beacon: instance of local_only_evt; records only the local. */
DEFINE_EVENT(local_only_evt, drv_tx_last_beacon,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * drv_ampdu_action: records the local, the interface and the A-MPDU
 * parameters via the AMPDU_ACTION_* helper macros.
 *
 * AMPDU_ACTION_ASSIGN reads from the argument named 'params' (including
 * params->sta for the station fields), so that name must not change.
 */
TRACE_EVENT(drv_ampdu_action,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_ampdu_params *params),

        TP_ARGS(local, sdata, params),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                AMPDU_ACTION_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                AMPDU_ACTION_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT AMPDU_ACTION_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG, AMPDU_ACTION_PR_ARG
        )
);

/*
 * drv_get_survey: records the local and the survey index (argument _idx,
 * stored as 'idx').
 * The 'survey' argument is part of the prototype but nothing from it is
 * stored in the record.
 */
TRACE_EVENT(drv_get_survey,
        TP_PROTO(struct ieee80211_local *local, int _idx,
                 struct survey_info *survey),

        TP_ARGS(local, _idx, survey),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(int, idx)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->idx = _idx;
        ),

        TP_printk(
                LOCAL_PR_FMT " idx:%d",
                LOCAL_PR_ARG, __entry->idx
        )
);

/*
 * drv_flush: records the local, the u32 'queues' mask (printed in
 * hexadecimal) and the bool 'drop' (printed as 0 or 1).
 */
TRACE_EVENT(drv_flush,
        TP_PROTO(struct ieee80211_local *local,
                 u32 queues, bool drop),

        TP_ARGS(local, queues, drop),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(bool, drop)
                __field(u32, queues)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->drop = drop;
                __entry->queues = queues;
        ),

        TP_printk(
                LOCAL_PR_FMT " queues:0x%x drop:%d",
                LOCAL_PR_ARG, __entry->queues, __entry->drop
        )
);

/*
 * drv_flush_sta: instance of sta_event; records local, interface and
 * station.
 */
DEFINE_EVENT(sta_event, drv_flush_sta,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),
        TP_ARGS(local, sdata, sta)
);

/*
 * Event class for channel-switch tracepoints.
 * Records the local, the interface, the channel definition from
 * ch_switch->chandef, and the timestamp, device_timestamp, block_tx,
 * count and link_id members of *ch_switch. All of them are printed.
 *
 * CHANDEF_ASSIGN is used without a trailing ';' - presumably the macro
 * supplies its own terminator (it is defined elsewhere).
 */
DECLARE_EVENT_CLASS(chanswitch_evt,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_channel_switch *ch_switch),

        TP_ARGS(local, sdata, ch_switch),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                CHANDEF_ENTRY
                __field(u64, timestamp)
                __field(u32, device_timestamp)
                __field(bool, block_tx)
                __field(u8, count)
                __field(u8, link_id)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                CHANDEF_ASSIGN(&ch_switch->chandef)
                __entry->timestamp = ch_switch->timestamp;
                __entry->device_timestamp = ch_switch->device_timestamp;
                __entry->block_tx = ch_switch->block_tx;
                __entry->count = ch_switch->count;
                __entry->link_id = ch_switch->link_id;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT CHANDEF_PR_FMT  " count:%d block_tx:%d timestamp:%llu device_ts:%u link_id:%d",
                LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->count,
                __entry->block_tx, __entry->timestamp,
                __entry->device_timestamp, __entry->link_id
        )
);

/* drv_channel_switch: instance of the chanswitch_evt class. */
DEFINE_EVENT(chanswitch_evt, drv_channel_switch,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_channel_switch *ch_switch),
        TP_ARGS(local, sdata, ch_switch)
);

/*
 * drv_set_antenna tracepoint.
 *
 * Stores the @tx_ant and @rx_ant values together with the result code
 * @ret supplied by the caller.
 *
 * tx_ant and rx_ant are u32 fields, so they are printed with %u; with
 * %d any value that has bit 31 set would show up as a negative number.
 */
TRACE_EVENT(drv_set_antenna,
        TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret),

        TP_ARGS(local, tx_ant, rx_ant, ret),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u32, tx_ant)
                __field(u32, rx_ant)
                __field(int, ret)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->tx_ant = tx_ant;
                __entry->rx_ant = rx_ant;
                __entry->ret = ret;
        ),

        TP_printk(
                LOCAL_PR_FMT " tx_ant:%u rx_ant:%u ret:%d",
                LOCAL_PR_ARG, __entry->tx_ant, __entry->rx_ant, __entry->ret
        )
);

/*
 * drv_get_antenna tracepoint.
 *
 * Stores the @tx_ant and @rx_ant values together with the result code
 * @ret supplied by the caller.
 *
 * tx_ant and rx_ant are u32 fields, so they are printed with %u; with
 * %d any value that has bit 31 set would show up as a negative number.
 */
TRACE_EVENT(drv_get_antenna,
        TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret),

        TP_ARGS(local, tx_ant, rx_ant, ret),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u32, tx_ant)
                __field(u32, rx_ant)
                __field(int, ret)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->tx_ant = tx_ant;
                __entry->rx_ant = rx_ant;
                __entry->ret = ret;
        ),

        TP_printk(
                LOCAL_PR_FMT " tx_ant:%u rx_ant:%u ret:%d",
                LOCAL_PR_ARG, __entry->tx_ant, __entry->rx_ant, __entry->ret
        )
);

/*
 * drv_remain_on_channel tracepoint.
 *
 * Stores center_freq and freq_offset from @chan, plus @duration and
 * @type.  The frequency is printed as "<center_freq>.<freq_offset>MHz"
 * with the offset zero-padded to three digits.
 *
 * duration (unsigned int) and type (u32) are unsigned fields and are
 * therefore printed with %u rather than %d.
 */
TRACE_EVENT(drv_remain_on_channel,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_channel *chan,
                 unsigned int duration,
                 enum ieee80211_roc_type type),

        TP_ARGS(local, sdata, chan, duration, type),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(int, center_freq)
                __field(int, freq_offset)
                __field(unsigned int, duration)
                __field(u32, type)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->center_freq = chan->center_freq;
                __entry->freq_offset = chan->freq_offset;
                __entry->duration = duration;
                __entry->type = type;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " freq:%d.%03dMHz duration:%ums type=%u",
                LOCAL_PR_ARG, VIF_PR_ARG,
                __entry->center_freq, __entry->freq_offset,
                __entry->duration, __entry->type
        )
);

/*
 * drv_cancel_remain_on_channel: instance of the local_sdata_evt class
 * (declared outside this section).
 */
DEFINE_EVENT(local_sdata_evt, drv_cancel_remain_on_channel,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_set_ringparam tracepoint.
 *
 * Stores the @tx and @rx values.  Both are u32 fields and are printed
 * with %u so that large values are not shown as negative numbers.
 */
TRACE_EVENT(drv_set_ringparam,
        TP_PROTO(struct ieee80211_local *local, u32 tx, u32 rx),

        TP_ARGS(local, tx, rx),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u32, tx)
                __field(u32, rx)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->tx = tx;
                __entry->rx = rx;
        ),

        TP_printk(
                LOCAL_PR_FMT " tx:%u rx %u",
                LOCAL_PR_ARG, __entry->tx, __entry->rx
        )
);

/*
 * drv_get_ringparam tracepoint.
 *
 * Takes pointers to the four ring parameters and stores the values
 * they point to (all four pointers are dereferenced unconditionally
 * when the entry is filled in).
 *
 * The stored fields are u32 and are printed with %u so that large
 * values are not shown as negative numbers.
 */
TRACE_EVENT(drv_get_ringparam,
        TP_PROTO(struct ieee80211_local *local, u32 *tx, u32 *tx_max,
                 u32 *rx, u32 *rx_max),

        TP_ARGS(local, tx, tx_max, rx, rx_max),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u32, tx)
                __field(u32, tx_max)
                __field(u32, rx)
                __field(u32, rx_max)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->tx = *tx;
                __entry->tx_max = *tx_max;
                __entry->rx = *rx;
                __entry->rx_max = *rx_max;
        ),

        TP_printk(
                LOCAL_PR_FMT " tx:%u tx_max %u rx %u rx_max %u",
                LOCAL_PR_ARG,
                __entry->tx, __entry->tx_max, __entry->rx, __entry->rx_max
        )
);

/*
 * drv_tx_frames_pending: instance of the local_only_evt class (declared
 * outside this section); takes only the local pointer.
 */
DEFINE_EVENT(local_only_evt, drv_tx_frames_pending,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * drv_offchannel_tx_cancel_wait: instance of the local_only_evt class
 * (declared outside this section); takes only the local pointer.
 */
DEFINE_EVENT(local_only_evt, drv_offchannel_tx_cancel_wait,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * drv_set_bitrate_mask tracepoint.
 *
 * Stores only the .legacy member of the mask's control entries for
 * NL80211_BAND_2GHZ and NL80211_BAND_5GHZ; no other band and no other
 * member of @mask is captured.  Both values are printed in hex.
 */
TRACE_EVENT(drv_set_bitrate_mask,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 const struct cfg80211_bitrate_mask *mask),

        TP_ARGS(local, sdata, mask),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u32, legacy_2g)
                __field(u32, legacy_5g)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->legacy_2g = mask->control[NL80211_BAND_2GHZ].legacy;
                __entry->legacy_5g = mask->control[NL80211_BAND_5GHZ].legacy;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT " 2G Mask:0x%x 5G Mask:0x%x",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->legacy_2g, __entry->legacy_5g
        )
);

/*
 * drv_set_rekey_data tracepoint.
 *
 * Copies kek, kck and replay_ctr from @data into fixed-size arrays in
 * the trace entry.  None of the three arrays is printed; the output
 * carries only the LOCAL and VIF prefixes.
 *
 * NOTE(review): the copies use the fixed NL80211_*_LEN sizes, which
 * assumes each source buffer is at least that long - confirm against
 * the callers.  The names suggest kek/kck are key material that ends
 * up in the trace buffer even though it is never printed - confirm
 * this is intended.
 */
TRACE_EVENT(drv_set_rekey_data,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct cfg80211_gtk_rekey_data *data),

        TP_ARGS(local, sdata, data),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __array(u8, kek, NL80211_KEK_LEN)
                __array(u8, kck, NL80211_KCK_LEN)
                __array(u8, replay_ctr, NL80211_REPLAY_CTR_LEN)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                memcpy(__entry->kek, data->kek, NL80211_KEK_LEN);
                memcpy(__entry->kck, data->kck, NL80211_KCK_LEN);
                memcpy(__entry->replay_ctr, data->replay_ctr,
                       NL80211_REPLAY_CTR_LEN);
        ),

        TP_printk(LOCAL_PR_FMT VIF_PR_FMT,
                  LOCAL_PR_ARG, VIF_PR_ARG)
);

/*
 * drv_event_callback tracepoint.
 *
 * Stores only the type member of @_event; nothing else from the event
 * structure is captured.  The field is a u32 and is printed with %u to
 * match its unsigned type.
 */
TRACE_EVENT(drv_event_callback,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 const struct ieee80211_event *_event),

        TP_ARGS(local, sdata, _event),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u32, type)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->type = _event->type;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " event:%u",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->type
        )
);

/*
 * release_evt: event class shared by the buffered-frame tracepoints
 * defined right after it.
 *
 * Stores @tids, @num_frames, @reason (the enum is kept as an int) and
 * @more_data.  tids is printed as a four-digit hex value, the rest as
 * integers.
 */
DECLARE_EVENT_CLASS(release_evt,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta,
                 u16 tids, int num_frames,
                 enum ieee80211_frame_release_type reason,
                 bool more_data),

        TP_ARGS(local, sta, tids, num_frames, reason, more_data),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                STA_ENTRY
                __field(u16, tids)
                __field(int, num_frames)
                __field(int, reason)
                __field(bool, more_data)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                STA_ASSIGN;
                __entry->tids = tids;
                __entry->num_frames = num_frames;
                __entry->reason = reason;
                __entry->more_data = more_data;
        ),

        TP_printk(
                LOCAL_PR_FMT STA_PR_FMT
                " TIDs:0x%.4x frames:%d reason:%d more:%d",
                LOCAL_PR_ARG, STA_PR_ARG, __entry->tids, __entry->num_frames,
                __entry->reason, __entry->more_data
        )
);

/* drv_release_buffered_frames: instance of the release_evt class. */
DEFINE_EVENT(release_evt, drv_release_buffered_frames,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta,
                 u16 tids, int num_frames,
                 enum ieee80211_frame_release_type reason,
                 bool more_data),

        TP_ARGS(local, sta, tids, num_frames, reason, more_data)
);

/* drv_allow_buffered_frames: instance of the release_evt class. */
DEFINE_EVENT(release_evt, drv_allow_buffered_frames,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta,
                 u16 tids, int num_frames,
                 enum ieee80211_frame_release_type reason,
                 bool more_data),

        TP_ARGS(local, sta, tids, num_frames, reason, more_data)
);

/*
 * mgd_prepare_complete_tx_evt: event class shared by the
 * drv_mgd_prepare_tx and drv_mgd_complete_tx tracepoints.
 *
 * Stores @duration, @subtype and @success.  The u16 duration argument
 * is kept in a u32 field and the bool success in a u8 field.
 */
DECLARE_EVENT_CLASS(mgd_prepare_complete_tx_evt,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 u16 duration, u16 subtype, bool success),

        TP_ARGS(local, sdata, duration, subtype, success),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u32, duration)
                __field(u16, subtype)
                __field(u8, success)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->duration = duration;
                __entry->subtype = subtype;
                __entry->success = success;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " duration: %u, subtype:0x%x, success:%d",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->duration,
                __entry->subtype, __entry->success
        )
);

/* drv_mgd_prepare_tx: instance of the mgd_prepare_complete_tx_evt class. */
DEFINE_EVENT(mgd_prepare_complete_tx_evt, drv_mgd_prepare_tx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 u16 duration, u16 subtype, bool success),

        TP_ARGS(local, sdata, duration, subtype, success)
);

/* drv_mgd_complete_tx: instance of the mgd_prepare_complete_tx_evt class. */
DEFINE_EVENT(mgd_prepare_complete_tx_evt, drv_mgd_complete_tx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 u16 duration, u16 subtype, bool success),

        TP_ARGS(local, sdata, duration, subtype, success)
);

/*
 * drv_mgd_protect_tdls_discover: instance of the local_sdata_evt class
 * (declared outside this section).
 */
DEFINE_EVENT(local_sdata_evt, drv_mgd_protect_tdls_discover,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),

        TP_ARGS(local, sdata)
);

/*
 * local_chanctx: event class for tracepoints that take a local pointer
 * and a channel context.
 *
 * Everything stored and printed comes from the LOCAL_* and CHANCTX_*
 * helper macros, which are defined elsewhere in this file.
 */
DECLARE_EVENT_CLASS(local_chanctx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_chanctx *ctx),

        TP_ARGS(local, ctx),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                CHANCTX_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                CHANCTX_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT CHANCTX_PR_FMT,
                LOCAL_PR_ARG, CHANCTX_PR_ARG
        )
);

/* drv_add_chanctx: instance of the local_chanctx class. */
DEFINE_EVENT(local_chanctx, drv_add_chanctx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_chanctx *ctx),
        TP_ARGS(local, ctx)
);

/* drv_remove_chanctx: instance of the local_chanctx class. */
DEFINE_EVENT(local_chanctx, drv_remove_chanctx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_chanctx *ctx),
        TP_ARGS(local, ctx)
);

/*
 * drv_change_chanctx tracepoint.
 *
 * Stores the same LOCAL and CHANCTX data as the local_chanctx class and
 * adds the @changed value, printed in hex with a 0x prefix (%#x).
 */
TRACE_EVENT(drv_change_chanctx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_chanctx *ctx,
                 u32 changed),

        TP_ARGS(local, ctx, changed),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                CHANCTX_ENTRY
                __field(u32, changed)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                CHANCTX_ASSIGN;
                __entry->changed = changed;
        ),

        TP_printk(
                LOCAL_PR_FMT CHANCTX_PR_FMT " changed:%#x",
                LOCAL_PR_ARG, CHANCTX_PR_ARG, __entry->changed
        )
);

/*
 * Helper types for the drv_switch_vif_chanctx tracepoint.  The
 * __TRACE_VIF_ENTRY guard keeps them from being defined more than once
 * (presumably because this header is read several times by the
 * tracing machinery).
 */
#if !defined(__TRACE_VIF_ENTRY)
#define __TRACE_VIF_ENTRY
/* Interface identity: type, p2p flag and name. */
struct trace_vif_entry {
        enum nl80211_iftype vif_type;
        bool p2p;
        char vif_name[IFNAMSIZ];
} __packed;

/* Snapshot of one channel definition, stored as six u32 values. */
struct trace_chandef_entry {
        u32 control_freq;
        u32 freq_offset;
        u32 chan_width;
        u32 center_freq1;
        u32 freq1_offset;
        u32 center_freq2;
} __packed;

/*
 * One record per switched interface: the interface, its link id and the
 * channel definitions before and after the switch.
 */
struct trace_switch_entry {
        struct trace_vif_entry vif;
        unsigned int link_id;
        struct trace_chandef_entry old_chandef;
        struct trace_chandef_entry new_chandef;
} __packed;

/*
 * Copies vifs[i].from into local_vifs[i].to.  Relies on "local_vifs",
 * "vifs" and "i" being in scope wherever the macro is expanded.
 */
#define SWITCH_ENTRY_ASSIGN(to, from) local_vifs[i].to = vifs[i].from
#endif

/*
 * drv_switch_vif_chanctx tracepoint.
 *
 * Stores @n_vifs and @mode, plus a dynamic array holding one
 * struct trace_switch_entry per element of @vifs.  Each entry gets the
 * interface type, p2p flag, link id, interface name and the old and new
 * channel definitions.  Only n_vifs and mode are printed; the per-vif
 * records are stored in the entry but not formatted.
 *
 * Every element of @vifs is dereferenced (vif, link_conf, old_ctx,
 * new_ctx and their def.chan) without NULL checks.
 */
TRACE_EVENT(drv_switch_vif_chanctx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_vif_chanctx_switch *vifs,
                 int n_vifs, enum ieee80211_chanctx_switch_mode mode),

        TP_ARGS(local, vifs, n_vifs, mode),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(int, n_vifs)
                __field(u32, mode)
                __dynamic_array(u8, vifs,
                                sizeof(struct trace_switch_entry) * n_vifs)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->n_vifs = n_vifs;
                __entry->mode = mode;
                {
                        struct trace_switch_entry *local_vifs =
                                __get_dynamic_array(vifs);
                        int i;

                        for (i = 0; i < n_vifs; i++) {
                                struct ieee80211_sub_if_data *sdata;

                                sdata = container_of(vifs[i].vif,
                                                struct ieee80211_sub_if_data,
                                                vif);

                                SWITCH_ENTRY_ASSIGN(vif.vif_type, vif->type);
                                SWITCH_ENTRY_ASSIGN(vif.p2p, vif->p2p);
                                SWITCH_ENTRY_ASSIGN(link_id, link_conf->link_id);
                                /*
                                 * strscpy_pad() always NUL-terminates
                                 * and zero-fills the rest of the
                                 * buffer, so no stale bytes remain.
                                 */
                                strscpy_pad(local_vifs[i].vif.vif_name,
                                            sdata->name,
                                            sizeof(local_vifs[i].vif.vif_name));
                                SWITCH_ENTRY_ASSIGN(old_chandef.control_freq,
                                                old_ctx->def.chan->center_freq);
                                SWITCH_ENTRY_ASSIGN(old_chandef.freq_offset,
                                                old_ctx->def.chan->freq_offset);
                                SWITCH_ENTRY_ASSIGN(old_chandef.chan_width,
                                                    old_ctx->def.width);
                                SWITCH_ENTRY_ASSIGN(old_chandef.center_freq1,
                                                    old_ctx->def.center_freq1);
                                SWITCH_ENTRY_ASSIGN(old_chandef.freq1_offset,
                                                    old_ctx->def.freq1_offset);
                                SWITCH_ENTRY_ASSIGN(old_chandef.center_freq2,
                                                    old_ctx->def.center_freq2);
                                SWITCH_ENTRY_ASSIGN(new_chandef.control_freq,
                                                new_ctx->def.chan->center_freq);
                                SWITCH_ENTRY_ASSIGN(new_chandef.freq_offset,
                                                new_ctx->def.chan->freq_offset);
                                SWITCH_ENTRY_ASSIGN(new_chandef.chan_width,
                                                    new_ctx->def.width);
                                SWITCH_ENTRY_ASSIGN(new_chandef.center_freq1,
                                                    new_ctx->def.center_freq1);
                                SWITCH_ENTRY_ASSIGN(new_chandef.freq1_offset,
                                                    new_ctx->def.freq1_offset);
                                SWITCH_ENTRY_ASSIGN(new_chandef.center_freq2,
                                                    new_ctx->def.center_freq2);
                        }
                }
        ),

        TP_printk(
                LOCAL_PR_FMT " n_vifs:%d mode:%u",
                LOCAL_PR_ARG, __entry->n_vifs, __entry->mode
        )
);

/*
 * local_sdata_chanctx: event class for tracepoints that tie an
 * interface link to a channel context.
 *
 * On top of the LOCAL, VIF and CHANCTX helper data it stores link_id
 * read from @link_conf.  link_id is an unsigned int field and is
 * printed with %u, as the other link-id tracepoints in this file do.
 */
DECLARE_EVENT_CLASS(local_sdata_chanctx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_bss_conf *link_conf,
                 struct ieee80211_chanctx *ctx),

        TP_ARGS(local, sdata, link_conf, ctx),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                CHANCTX_ENTRY
                __field(unsigned int, link_id)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                CHANCTX_ASSIGN;
                __entry->link_id = link_conf->link_id;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " link_id:%u" CHANCTX_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id, CHANCTX_PR_ARG
        )
);

/* drv_assign_vif_chanctx: instance of the local_sdata_chanctx class. */
DEFINE_EVENT(local_sdata_chanctx, drv_assign_vif_chanctx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_bss_conf *link_conf,
                 struct ieee80211_chanctx *ctx),
        TP_ARGS(local, sdata, link_conf, ctx)
);

/* drv_unassign_vif_chanctx: instance of the local_sdata_chanctx class. */
DEFINE_EVENT(local_sdata_chanctx, drv_unassign_vif_chanctx,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_bss_conf *link_conf,
                 struct ieee80211_chanctx *ctx),
        TP_ARGS(local, sdata, link_conf, ctx)
);

/*
 * drv_start_ap tracepoint.
 *
 * Stores link_id, dtim_period, beacon_int and hidden_ssid from
 * @link_conf, and copies sdata->vif.cfg.ssid_len bytes of the SSID
 * from sdata->vif.cfg.ssid into a dynamic array.
 *
 * Only the link id is printed (after the LOCAL and VIF prefixes); the
 * other fields are stored in the entry but not formatted.
 */
TRACE_EVENT(drv_start_ap,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_bss_conf *link_conf),

        TP_ARGS(local, sdata, link_conf),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u32, link_id)
                __field(u8, dtimper)
                __field(u16, bcnint)
                __dynamic_array(u8, ssid, sdata->vif.cfg.ssid_len)
                __field(bool, hidden_ssid)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->link_id = link_conf->link_id;
                __entry->dtimper = link_conf->dtim_period;
                __entry->bcnint = link_conf->beacon_int;
                __entry->hidden_ssid = link_conf->hidden_ssid;
                /* Raw SSID bytes; no terminator is added. */
                memcpy(__get_dynamic_array(ssid),
                       sdata->vif.cfg.ssid,
                       sdata->vif.cfg.ssid_len);
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT " link id %u",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id
        )
);

/*
 * drv_stop_ap tracepoint.
 *
 * Stores and prints the link_id read from @link_conf, after the LOCAL
 * and VIF prefixes.
 */
TRACE_EVENT(drv_stop_ap,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_bss_conf *link_conf),

        TP_ARGS(local, sdata, link_conf),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u32, link_id)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->link_id = link_conf->link_id;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT " link id %u",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id
        )
);

/*
 * drv_reconfig_complete tracepoint.
 *
 * Stores @reconfig_type; the enum value is kept in a u8 field and
 * printed as an integer.
 */
TRACE_EVENT(drv_reconfig_complete,
        TP_PROTO(struct ieee80211_local *local,
                 enum ieee80211_reconfig_type reconfig_type),
        TP_ARGS(local, reconfig_type),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u8, reconfig_type)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->reconfig_type = reconfig_type;
        ),

        TP_printk(
                LOCAL_PR_FMT  " reconfig_type:%d",
                LOCAL_PR_ARG, __entry->reconfig_type
        )

);

/*
 * drv_ipv6_addr_change: instance of the local_sdata_evt class (declared
 * outside this section).  Only defined when CONFIG_IPV6 is enabled.
 */
#if IS_ENABLED(CONFIG_IPV6)
DEFINE_EVENT(local_sdata_evt, drv_ipv6_addr_change,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);
#endif

/*
 * drv_join_ibss - tracepoint taking (local, sdata, info).
 *
 * Stores the LOCAL_ENTRY and VIF_ENTRY fields, info->dtim_period,
 * info->beacon_int and a copy of sdata->vif.cfg.ssid (ssid_len bytes, in a
 * dynamic array sized from the same length). Only the local/vif fields are
 * printed; dtimper, bcnint and ssid are stored in the record but are not
 * part of the TP_printk output.
 */
TRACE_EVENT(drv_join_ibss,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_bss_conf *info),

        TP_ARGS(local, sdata, info),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u8, dtimper)
                __field(u16, bcnint)
                __dynamic_array(u8, ssid, sdata->vif.cfg.ssid_len)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->dtimper = info->dtim_period;
                __entry->bcnint = info->beacon_int;
                memcpy(__get_dynamic_array(ssid),
                       sdata->vif.cfg.ssid,
                       sdata->vif.cfg.ssid_len);
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG
        )
);

/*
 * drv_leave_ibss - instance of the local_sdata_evt event class,
 * taking (local, sdata).
 */
DEFINE_EVENT(local_sdata_evt, drv_leave_ibss,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_get_expected_throughput - tracepoint taking only a station pointer.
 *
 * Stores and prints just the STA_ENTRY fields; no throughput value is
 * recorded by this event.
 */
TRACE_EVENT(drv_get_expected_throughput,
        TP_PROTO(struct ieee80211_sta *sta),

        TP_ARGS(sta),

        TP_STRUCT__entry(
                STA_ENTRY
        ),

        TP_fast_assign(
                STA_ASSIGN;
        ),

        TP_printk(
                STA_PR_FMT, STA_PR_ARG
        )
);

/*
 * drv_start_nan - tracepoint taking (local, sdata, conf).
 *
 * Stores the LOCAL_ENTRY and VIF_ENTRY fields plus conf->master_pref and
 * conf->bands (each as a u8), and prints both (bands in hex).
 */
TRACE_EVENT(drv_start_nan,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct cfg80211_nan_conf *conf),

        TP_ARGS(local, sdata, conf),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u8, master_pref)
                __field(u8, bands)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->master_pref = conf->master_pref;
                __entry->bands = conf->bands;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT
                ", master preference: %u, bands: 0x%0x",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
                __entry->bands
        )
);

/*
 * drv_stop_nan - tracepoint taking (local, sdata).
 *
 * Stores and prints only the LOCAL_ENTRY and VIF_ENTRY fields.
 */
TRACE_EVENT(drv_stop_nan,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),

        TP_ARGS(local, sdata),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG
        )
);

/*
 * drv_nan_change_conf - tracepoint taking (local, sdata, conf, changes).
 *
 * Stores the LOCAL_ENTRY and VIF_ENTRY fields, conf->master_pref and
 * conf->bands (u8 each) and the u32 changes value; bands and changes are
 * printed in hex.
 */
TRACE_EVENT(drv_nan_change_conf,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct cfg80211_nan_conf *conf,
                 u32 changes),

        TP_ARGS(local, sdata, conf, changes),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u8, master_pref)
                __field(u8, bands)
                __field(u32, changes)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->master_pref = conf->master_pref;
                __entry->bands = conf->bands;
                __entry->changes = changes;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT
                ", master preference: %u, bands: 0x%0x, changes: 0x%x",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref,
                __entry->bands, __entry->changes
        )
);

/*
 * drv_add_nan_func - tracepoint taking (local, sdata, func).
 *
 * Stores the LOCAL_ENTRY and VIF_ENTRY fields plus func->type and
 * func->instance_id (stored as the u8 fields "type" and "inst_id").
 */
TRACE_EVENT(drv_add_nan_func,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 const struct cfg80211_nan_func *func),

        TP_ARGS(local, sdata, func),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u8, type)
                __field(u8, inst_id)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->type = func->type;
                __entry->inst_id = func->instance_id;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT
                ", type: %u, inst_id: %u",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->type, __entry->inst_id
        )
);

/*
 * drv_del_nan_func - tracepoint taking (local, sdata, instance_id).
 *
 * Stores the LOCAL_ENTRY and VIF_ENTRY fields plus the u8 instance_id and
 * prints it as ", instance_id: %u".
 */
TRACE_EVENT(drv_del_nan_func,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 u8 instance_id),

        TP_ARGS(local, sdata, instance_id),
        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u8, instance_id)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->instance_id = instance_id;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT
                ", instance_id: %u",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->instance_id
        )
);

/*
 * drv_start_pmsr - instance of the local_sdata_evt event class,
 * taking (local, sdata).
 */
DEFINE_EVENT(local_sdata_evt, drv_start_pmsr,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_abort_pmsr - instance of the local_sdata_evt event class,
 * taking (local, sdata).
 */
DEFINE_EVENT(local_sdata_evt, drv_abort_pmsr,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * drv_set_default_unicast_key - tracepoint taking (local, sdata, key_idx).
 *
 * Stores the LOCAL_ENTRY and VIF_ENTRY fields plus the signed key_idx and
 * prints it as " key_idx:%d".
 */
TRACE_EVENT(drv_set_default_unicast_key,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 int key_idx),

        TP_ARGS(local, sdata, key_idx),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(int, key_idx)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->key_idx = key_idx;
        ),

        TP_printk(LOCAL_PR_FMT VIF_PR_FMT " key_idx:%d",
                  LOCAL_PR_ARG, VIF_PR_ARG, __entry->key_idx)
);

/*
 * drv_channel_switch_beacon - tracepoint taking (local, sdata, chandef).
 *
 * Stores the LOCAL_ENTRY, VIF_ENTRY and CHANDEF_ENTRY fields (the latter
 * filled by CHANDEF_ASSIGN(chandef)) and prints them with the text
 * " channel switch to " before the channel definition.
 */
TRACE_EVENT(drv_channel_switch_beacon,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct cfg80211_chan_def *chandef),

        TP_ARGS(local, sdata, chandef),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                CHANDEF_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                CHANDEF_ASSIGN(chandef);
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " channel switch to " CHANDEF_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG
        )
);

/*
 * drv_pre_channel_switch - instance of the chanswitch_evt event class,
 * taking (local, sdata, ch_switch).
 */
DEFINE_EVENT(chanswitch_evt, drv_pre_channel_switch,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_channel_switch *ch_switch),
        TP_ARGS(local, sdata, ch_switch)
);

/*
 * drv_post_channel_switch - instance of the local_sdata_evt event class,
 * taking (local, sdata).
 */
DEFINE_EVENT(local_sdata_evt, drv_post_channel_switch,
             TP_PROTO(struct ieee80211_local *local,
                      struct ieee80211_sub_if_data *sdata),
             TP_ARGS(local, sdata)
);

/*
 * drv_abort_channel_switch - instance of the local_sdata_evt event class,
 * taking (local, sdata).
 */
DEFINE_EVENT(local_sdata_evt, drv_abort_channel_switch,
             TP_PROTO(struct ieee80211_local *local,
                      struct ieee80211_sub_if_data *sdata),
             TP_ARGS(local, sdata)
);

/*
 * drv_channel_switch_rx_beacon - instance of the chanswitch_evt event
 * class, taking (local, sdata, ch_switch).
 */
DEFINE_EVENT(chanswitch_evt, drv_channel_switch_rx_beacon,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_channel_switch *ch_switch),
        TP_ARGS(local, sdata, ch_switch)
);

/*
 * drv_get_txpower - tracepoint taking (local, sdata, dbm, ret).
 *
 * Stores the LOCAL_ENTRY and VIF_ENTRY fields plus the two int values and
 * prints them as " dbm:%d ret:%d".
 */
TRACE_EVENT(drv_get_txpower,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 int dbm, int ret),

        TP_ARGS(local, sdata, dbm, ret),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(int, dbm)
                __field(int, ret)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->dbm = dbm;
                __entry->ret = ret;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " dbm:%d ret:%d",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->dbm, __entry->ret
        )
);

/*
 * drv_tdls_channel_switch - tracepoint taking
 * (local, sdata, sta, oper_class, chandef).
 *
 * Stores the LOCAL_ENTRY, VIF_ENTRY, STA_ENTRY and CHANDEF_ENTRY fields
 * plus the u8 oper_class. The printed order differs from the storage
 * order: channel definition, then oper_class, then the station.
 */
TRACE_EVENT(drv_tdls_channel_switch,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta, u8 oper_class,
                 struct cfg80211_chan_def *chandef),

        TP_ARGS(local, sdata, sta, oper_class, chandef),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(u8, oper_class)
                CHANDEF_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->oper_class = oper_class;
                /* no trailing ';' here: presumably supplied by the macro -- confirm */
                CHANDEF_ASSIGN(chandef)
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " tdls channel switch to"
                CHANDEF_PR_FMT  " oper_class:%d " STA_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG, CHANDEF_PR_ARG, __entry->oper_class,
                STA_PR_ARG
        )
);

/*
 * drv_tdls_cancel_channel_switch - tracepoint taking (local, sdata, sta).
 *
 * Stores the LOCAL_ENTRY, VIF_ENTRY and STA_ENTRY fields and prints them
 * with the text " tdls cancel channel switch with " before the station.
 */
TRACE_EVENT(drv_tdls_cancel_channel_switch,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),

        TP_ARGS(local, sdata, sta),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT
                " tdls cancel channel switch with " STA_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG
        )
);

/*
 * drv_tdls_recv_channel_switch - tracepoint taking (local, sdata, params).
 *
 * All event-specific data comes from *params: the station (params->sta,
 * via STA_NAMED_ASSIGN), the channel definition (params->chandef), and the
 * action_code, status, timestamp, switch_time and switch_timeout members.
 * peer_initiator is read from params->sta->tdls_initiator, so params->sta
 * is dereferenced unconditionally here.
 */
TRACE_EVENT(drv_tdls_recv_channel_switch,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_tdls_ch_sw_params *params),

        TP_ARGS(local, sdata, params),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u8, action_code)
                STA_ENTRY
                CHANDEF_ENTRY
                __field(u32, status)
                __field(bool, peer_initiator)
                __field(u32, timestamp)
                __field(u16, switch_time)
                __field(u16, switch_timeout)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_NAMED_ASSIGN(params->sta);
                /* no trailing ';' here: presumably supplied by the macro -- confirm */
                CHANDEF_ASSIGN(params->chandef)
                __entry->peer_initiator = params->sta->tdls_initiator;
                __entry->action_code = params->action_code;
                __entry->status = params->status;
                __entry->timestamp = params->timestamp;
                __entry->switch_time = params->switch_time;
                __entry->switch_timeout = params->switch_timeout;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " received tdls channel switch packet"
                " action:%d status:%d time:%d switch time:%d switch"
                " timeout:%d initiator: %d chan:" CHANDEF_PR_FMT STA_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->action_code, __entry->status,
                __entry->timestamp, __entry->switch_time,
                __entry->switch_timeout, __entry->peer_initiator,
                CHANDEF_PR_ARG, STA_PR_ARG
        )
);

/*
 * drv_wake_tx_queue - tracepoint taking (local, sdata, txq).
 *
 * Stores the LOCAL_ENTRY, VIF_ENTRY and STA_ENTRY fields plus txq->txq.ac
 * and txq->txq.tid (u8 each), printed as " ac:%d tid:%d".
 */
TRACE_EVENT(drv_wake_tx_queue,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct txq_info *txq),

        TP_ARGS(local, sdata, txq),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(u8, ac)
                __field(u8, tid)
        ),

        TP_fast_assign(
                /*
                 * The event has no sta argument; this local is presumably
                 * what STA_ASSIGN reads by name -- confirm against the
                 * macro definition.
                 */
                struct ieee80211_sta *sta = txq->txq.sta;

                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->ac = txq->txq.ac;
                __entry->tid = txq->txq.tid;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " ac:%d tid:%d",
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->ac, __entry->tid
        )
);

/*
 * drv_get_ftm_responder_stats - tracepoint taking (local, sdata, ftm_stats).
 *
 * Only the LOCAL_ENTRY and VIF_ENTRY fields are stored and printed; the
 * ftm_stats argument is accepted but none of its contents are recorded.
 */
TRACE_EVENT(drv_get_ftm_responder_stats,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct cfg80211_ftm_responder_stats *ftm_stats),

        TP_ARGS(local, sdata, ftm_stats),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT,
                LOCAL_PR_ARG, VIF_PR_ARG
        )
);

/*
 * drv_update_vif_offload - instance of the local_sdata_addr_evt event
 * class, taking (local, sdata).
 */
DEFINE_EVENT(local_sdata_addr_evt, drv_update_vif_offload,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * sta_flag_evt - event class for (local, sdata, sta, enabled) tracepoints.
 *
 * Stores the LOCAL_ENTRY, VIF_ENTRY and STA_ENTRY fields plus the boolean
 * "enabled", printed as " enabled:%d". Events are instantiated from this
 * class with DEFINE_EVENT(sta_flag_evt, ...).
 */
DECLARE_EVENT_CLASS(sta_flag_evt,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta, bool enabled),

        TP_ARGS(local, sdata, sta, enabled),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(bool, enabled)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->enabled = enabled;
        ),

        TP_printk(
                LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " enabled:%d",
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->enabled
        )
);

/*
 * drv_sta_set_4addr - instance of the sta_flag_evt event class,
 * taking (local, sdata, sta, enabled).
 */
DEFINE_EVENT(sta_flag_evt, drv_sta_set_4addr,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta, bool enabled),

        TP_ARGS(local, sdata, sta, enabled)
);

/*
 * drv_sta_set_decap_offload - instance of the sta_flag_evt event class,
 * taking (local, sdata, sta, enabled).
 */
DEFINE_EVENT(sta_flag_evt, drv_sta_set_decap_offload,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta, bool enabled),

        TP_ARGS(local, sdata, sta, enabled)
);

/*
 * drv_add_twt_setup - tracepoint taking (local, sta, twt, twt_agrt).
 *
 * Stores the LOCAL_ENTRY and STA_ENTRY fields, twt->dialog_token and
 * twt->control, and from twt_agrt the req_type, twt, min_twt_dur (stored
 * as "duration"), mantissa and channel members.
 *
 * req_type, twt and mantissa are kept in the record as little-endian
 * values (__le16/__le64) and are only converted to CPU byte order at print
 * time via le16_to_cpu()/le64_to_cpu().
 */
TRACE_EVENT(drv_add_twt_setup,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta,
                 struct ieee80211_twt_setup *twt,
                 struct ieee80211_twt_params *twt_agrt),

        TP_ARGS(local, sta, twt, twt_agrt),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                STA_ENTRY
                __field(u8, dialog_token)
                __field(u8, control)
                __field(__le16, req_type)
                __field(__le64, twt)
                __field(u8, duration)
                __field(__le16, mantissa)
                __field(u8, channel)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                STA_ASSIGN;
                __entry->dialog_token = twt->dialog_token;
                __entry->control = twt->control;
                __entry->req_type = twt_agrt->req_type;
                __entry->twt = twt_agrt->twt;
                __entry->duration = twt_agrt->min_twt_dur;
                __entry->mantissa = twt_agrt->mantissa;
                __entry->channel = twt_agrt->channel;
        ),

        TP_printk(
                LOCAL_PR_FMT STA_PR_FMT
                " token:%d control:0x%02x req_type:0x%04x"
                " twt:%llu duration:%d mantissa:%d channel:%d",
                LOCAL_PR_ARG, STA_PR_ARG, __entry->dialog_token,
                __entry->control, le16_to_cpu(__entry->req_type),
                le64_to_cpu(__entry->twt), __entry->duration,
                le16_to_cpu(__entry->mantissa), __entry->channel
        )
);

/*
 * drv_twt_teardown_request - tracepoint taking (local, sta, flowid).
 *
 * Stores the LOCAL_ENTRY and STA_ENTRY fields plus the u8 flowid, printed
 * as " flowid:%d".
 */
TRACE_EVENT(drv_twt_teardown_request,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta, u8 flowid),

        TP_ARGS(local, sta, flowid),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                STA_ENTRY
                __field(u8, flowid)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                STA_ASSIGN;
                __entry->flowid = flowid;
        ),

        TP_printk(
                LOCAL_PR_FMT STA_PR_FMT " flowid:%d",
                LOCAL_PR_ARG, STA_PR_ARG, __entry->flowid
        )
);

/*
 * drv_net_fill_forward_path - instance of the sta_event event class,
 * taking (local, sdata, sta).
 */
DEFINE_EVENT(sta_event, drv_net_fill_forward_path,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta),
        TP_ARGS(local, sdata, sta)
);

/*
 * drv_net_setup_tc - tracepoint taking (local, sdata, type).
 *
 * Stores the LOCAL_ENTRY and VIF_ENTRY fields plus the u8 type, printed
 * as " type:%d".
 */
TRACE_EVENT(drv_net_setup_tc,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 u8 type),

        TP_ARGS(local, sdata, type),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u8, type)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->type = type;
        ),

        /*
         * No trailing "\n" in the format: the trace output code appends
         * the newline itself, so an explicit one leaves a blank line after
         * every record.
         */
        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " type:%d",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->type
        )
);

/*
 * drv_can_activate_links - tracepoint taking (local, sdata, active_links).
 *
 * Stores the LOCAL_ENTRY and VIF_ENTRY fields plus the u16 active_links
 * value, printed as a 4-digit hex number.
 */
TRACE_EVENT(drv_can_activate_links,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 u16 active_links),

        TP_ARGS(local, sdata, active_links),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u16, active_links)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->active_links = active_links;
        ),

        /*
         * No trailing "\n" in the format: the trace output code appends
         * the newline itself, so an explicit one leaves a blank line after
         * every record.
         */
        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " requested active_links:0x%04x",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->active_links
        )
);

/*
 * drv_change_vif_links - tracepoint taking
 * (local, sdata, old_links, new_links).
 *
 * Stores the LOCAL_ENTRY and VIF_ENTRY fields plus both u16 link values,
 * each printed as a 4-digit hex number.
 */
TRACE_EVENT(drv_change_vif_links,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 u16 old_links, u16 new_links),

        TP_ARGS(local, sdata, old_links, new_links),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(u16, old_links)
                __field(u16, new_links)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                __entry->old_links = old_links;
                __entry->new_links = new_links;
        ),

        /*
         * No trailing "\n" in the format: the trace output code appends
         * the newline itself, so an explicit one leaves a blank line after
         * every record.
         */
        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT " old_links:0x%04x, new_links:0x%04x",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->old_links, __entry->new_links
        )
);

/*
 * drv_change_sta_links - tracepoint taking
 * (local, sdata, sta, old_links, new_links).
 *
 * Stores the LOCAL_ENTRY, VIF_ENTRY and STA_ENTRY fields plus both u16
 * link values, each printed as a 4-digit hex number.
 */
TRACE_EVENT(drv_change_sta_links,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_sta *sta,
                 u16 old_links, u16 new_links),

        TP_ARGS(local, sdata, sta, old_links, new_links),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                STA_ENTRY
                __field(u16, old_links)
                __field(u16, new_links)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                STA_ASSIGN;
                __entry->old_links = old_links;
                __entry->new_links = new_links;
        ),

        /*
         * No trailing "\n" in the format: the trace output code appends
         * the newline itself, so an explicit one leaves a blank line after
         * every record.
         */
        TP_printk(
                LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " old_links:0x%04x, new_links:0x%04x",
                LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG,
                __entry->old_links, __entry->new_links
        )
);

/*
 * Tracing for API calls that drivers call.
 */

/*
 * api_start_tx_ba_session - tracepoint taking (sta, tid).
 *
 * Stores the STA_ENTRY fields plus the u16 tid, printed as " tid:%d".
 */
TRACE_EVENT(api_start_tx_ba_session,
        TP_PROTO(struct ieee80211_sta *sta, u16 tid),

        TP_ARGS(sta, tid),

        TP_STRUCT__entry(
                STA_ENTRY
                __field(u16, tid)
        ),

        TP_fast_assign(
                STA_ASSIGN;
                __entry->tid = tid;
        ),

        TP_printk(
                STA_PR_FMT " tid:%d",
                STA_PR_ARG, __entry->tid
        )
);

/*
 * api_start_tx_ba_cb - tracepoint taking (sdata, ra, tid).
 *
 * Stores the VIF_ENTRY fields, a copy of the ETH_ALEN-byte address that
 * ra points to, and the u16 tid. The address is printed with %pM.
 */
TRACE_EVENT(api_start_tx_ba_cb,
        TP_PROTO(struct ieee80211_sub_if_data *sdata, const u8 *ra, u16 tid),

        TP_ARGS(sdata, ra, tid),

        TP_STRUCT__entry(
                VIF_ENTRY
                __array(u8, ra, ETH_ALEN)
                __field(u16, tid)
        ),

        TP_fast_assign(
                VIF_ASSIGN;
                memcpy(__entry->ra, ra, ETH_ALEN);
                __entry->tid = tid;
        ),

        TP_printk(
                VIF_PR_FMT " ra:%pM tid:%d",
                VIF_PR_ARG, __entry->ra, __entry->tid
        )
);

/*
 * api_stop_tx_ba_session - tracepoint taking (sta, tid).
 *
 * Stores the STA_ENTRY fields plus the u16 tid, printed as " tid:%d".
 */
TRACE_EVENT(api_stop_tx_ba_session,
        TP_PROTO(struct ieee80211_sta *sta, u16 tid),

        TP_ARGS(sta, tid),

        TP_STRUCT__entry(
                STA_ENTRY
                __field(u16, tid)
        ),

        TP_fast_assign(
                STA_ASSIGN;
                __entry->tid = tid;
        ),

        TP_printk(
                STA_PR_FMT " tid:%d",
                STA_PR_ARG, __entry->tid
        )
);

/*
 * api_stop_tx_ba_cb - tracepoint taking (sdata, ra, tid).
 *
 * Stores the VIF_ENTRY fields, a copy of the ETH_ALEN-byte address that
 * ra points to, and the u16 tid. The address is printed with %pM.
 */
TRACE_EVENT(api_stop_tx_ba_cb,
        TP_PROTO(struct ieee80211_sub_if_data *sdata, const u8 *ra, u16 tid),

        TP_ARGS(sdata, ra, tid),

        TP_STRUCT__entry(
                VIF_ENTRY
                __array(u8, ra, ETH_ALEN)
                __field(u16, tid)
        ),

        TP_fast_assign(
                VIF_ASSIGN;
                memcpy(__entry->ra, ra, ETH_ALEN);
                __entry->tid = tid;
        ),

        TP_printk(
                VIF_PR_FMT " ra:%pM tid:%d",
                VIF_PR_ARG, __entry->ra, __entry->tid
        )
);

/*
 * api_restart_hw - instance of the local_only_evt event class,
 * taking only (local).
 */
DEFINE_EVENT(local_only_evt, api_restart_hw,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * api_beacon_loss - tracepoint taking only (sdata).
 *
 * Stores and prints just the VIF_ENTRY fields.
 */
TRACE_EVENT(api_beacon_loss,
        TP_PROTO(struct ieee80211_sub_if_data *sdata),

        TP_ARGS(sdata),

        TP_STRUCT__entry(
                VIF_ENTRY
        ),

        TP_fast_assign(
                VIF_ASSIGN;
        ),

        TP_printk(
                VIF_PR_FMT,
                VIF_PR_ARG
        )
);

/*
 * api_connection_loss - tracepoint taking only (sdata).
 *
 * Stores and prints just the VIF_ENTRY fields.
 */
TRACE_EVENT(api_connection_loss,
        TP_PROTO(struct ieee80211_sub_if_data *sdata),

        TP_ARGS(sdata),

        TP_STRUCT__entry(
                VIF_ENTRY
        ),

        TP_fast_assign(
                VIF_ASSIGN;
        ),

        TP_printk(
                VIF_PR_FMT,
                VIF_PR_ARG
        )
);

/*
 * api_disconnect - tracepoint taking (sdata, reconnect).
 *
 * Stores the VIF_ENTRY fields plus the reconnect flag. The argument is a
 * bool but the record field is declared as int; it is printed as
 * " reconnect:%d".
 */
TRACE_EVENT(api_disconnect,
        TP_PROTO(struct ieee80211_sub_if_data *sdata, bool reconnect),

        TP_ARGS(sdata, reconnect),

        TP_STRUCT__entry(
                VIF_ENTRY
                __field(int, reconnect)
        ),

        TP_fast_assign(
                VIF_ASSIGN;
                __entry->reconnect = reconnect;
        ),

        TP_printk(
                VIF_PR_FMT " reconnect:%d",
                VIF_PR_ARG, __entry->reconnect
        )
);

/*
 * api_cqm_rssi_notify - tracepoint taking (sdata, rssi_event, rssi_level).
 *
 * Stores the VIF_ENTRY fields, the rssi_event enum value as a u32 and the
 * signed 32-bit rssi_level, printed as " event:%d rssi:%d".
 */
TRACE_EVENT(api_cqm_rssi_notify,
        TP_PROTO(struct ieee80211_sub_if_data *sdata,
                 enum nl80211_cqm_rssi_threshold_event rssi_event,
                 s32 rssi_level),

        TP_ARGS(sdata, rssi_event, rssi_level),

        TP_STRUCT__entry(
                VIF_ENTRY
                __field(u32, rssi_event)
                __field(s32, rssi_level)
        ),

        TP_fast_assign(
                VIF_ASSIGN;
                __entry->rssi_event = rssi_event;
                __entry->rssi_level = rssi_level;
        ),

        TP_printk(
                VIF_PR_FMT " event:%d rssi:%d",
                VIF_PR_ARG, __entry->rssi_event, __entry->rssi_level
        )
);

/*
 * api_cqm_beacon_loss_notify - instance of the local_sdata_evt event
 * class, taking (local, sdata).
 */
DEFINE_EVENT(local_sdata_evt, api_cqm_beacon_loss_notify,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata),
        TP_ARGS(local, sdata)
);

/*
 * api_scan_completed - tracepoint taking (local, aborted).
 *
 * Stores the LOCAL_ENTRY fields plus the boolean aborted flag, printed as
 * " aborted:%d".
 */
TRACE_EVENT(api_scan_completed,
        TP_PROTO(struct ieee80211_local *local, bool aborted),

        TP_ARGS(local, aborted),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(bool, aborted)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->aborted = aborted;
        ),

        TP_printk(
                LOCAL_PR_FMT " aborted:%d",
                LOCAL_PR_ARG, __entry->aborted
        )
);

/*
 * api_sched_scan_results - tracepoint taking only (local).
 *
 * Stores and prints just the LOCAL_ENTRY fields.
 */
TRACE_EVENT(api_sched_scan_results,
        TP_PROTO(struct ieee80211_local *local),

        TP_ARGS(local),

        TP_STRUCT__entry(
                LOCAL_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT, LOCAL_PR_ARG
        )
);

/*
 * api_sched_scan_stopped - tracepoint taking only (local).
 *
 * Stores and prints just the LOCAL_ENTRY fields.
 */
TRACE_EVENT(api_sched_scan_stopped,
        TP_PROTO(struct ieee80211_local *local),

        TP_ARGS(local),

        TP_STRUCT__entry(
                LOCAL_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT, LOCAL_PR_ARG
        )
);

/*
 * api_sta_block_awake - tracepoint taking (local, sta, block).
 *
 * Stores the LOCAL_ENTRY and STA_ENTRY fields plus the boolean block flag,
 * printed as " block:%d".
 */
TRACE_EVENT(api_sta_block_awake,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta, bool block),

        TP_ARGS(local, sta, block),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                STA_ENTRY
                __field(bool, block)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                STA_ASSIGN;
                __entry->block = block;
        ),

        TP_printk(
                LOCAL_PR_FMT STA_PR_FMT " block:%d",
                LOCAL_PR_ARG, STA_PR_ARG, __entry->block
        )
);

/*
 * api_chswitch_done - tracepoint taking (sdata, success, link_id).
 *
 * Stores the VIF_ENTRY fields, the boolean success flag and the unsigned
 * link_id, printed as " success=%d link_id=%u".
 */
TRACE_EVENT(api_chswitch_done,
        TP_PROTO(struct ieee80211_sub_if_data *sdata, bool success,
                 unsigned int link_id),

        TP_ARGS(sdata, success, link_id),

        TP_STRUCT__entry(
                VIF_ENTRY
                __field(bool, success)
                __field(unsigned int, link_id)
        ),

        TP_fast_assign(
                VIF_ASSIGN;
                __entry->success = success;
                __entry->link_id = link_id;
        ),

        /* link_id is unsigned, so it is formatted with %u rather than %d */
        TP_printk(
                VIF_PR_FMT " success=%d link_id=%u",
                VIF_PR_ARG, __entry->success, __entry->link_id
        )
);

/*
 * api_ready_on_channel - instance of the local_only_evt event class,
 * taking only (local).
 */
DEFINE_EVENT(local_only_evt, api_ready_on_channel,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * api_remain_on_channel_expired - instance of the local_only_evt event
 * class, taking only (local).
 */
DEFINE_EVENT(local_only_evt, api_remain_on_channel_expired,
        TP_PROTO(struct ieee80211_local *local),
        TP_ARGS(local)
);

/*
 * api_gtk_rekey_notify - tracepoint taking (sdata, bssid, replay_ctr).
 *
 * Stores the VIF_ENTRY fields plus copies of the ETH_ALEN-byte bssid and
 * the NL80211_REPLAY_CTR_LEN-byte replay counter. Only the vif fields are
 * printed; bssid and replay_ctr are stored in the record but are not part
 * of the TP_printk output.
 */
TRACE_EVENT(api_gtk_rekey_notify,
        TP_PROTO(struct ieee80211_sub_if_data *sdata,
                 const u8 *bssid, const u8 *replay_ctr),

        TP_ARGS(sdata, bssid, replay_ctr),

        TP_STRUCT__entry(
                VIF_ENTRY
                __array(u8, bssid, ETH_ALEN)
                __array(u8, replay_ctr, NL80211_REPLAY_CTR_LEN)
        ),

        TP_fast_assign(
                VIF_ASSIGN;
                memcpy(__entry->bssid, bssid, ETH_ALEN);
                memcpy(__entry->replay_ctr, replay_ctr, NL80211_REPLAY_CTR_LEN);
        ),

        TP_printk(VIF_PR_FMT, VIF_PR_ARG)
);

/*
 * api_enable_rssi_reports - tracepoint taking
 * (sdata, rssi_min_thold, rssi_max_thold).
 *
 * Stores the VIF_ENTRY fields plus both int thresholds and prints them.
 */
TRACE_EVENT(api_enable_rssi_reports,
        TP_PROTO(struct ieee80211_sub_if_data *sdata,
                 int rssi_min_thold, int rssi_max_thold),

        TP_ARGS(sdata, rssi_min_thold, rssi_max_thold),

        TP_STRUCT__entry(
                VIF_ENTRY
                __field(int, rssi_min_thold)
                __field(int, rssi_max_thold)
        ),

        TP_fast_assign(
                VIF_ASSIGN;
                __entry->rssi_min_thold = rssi_min_thold;
                __entry->rssi_max_thold = rssi_max_thold;
        ),

        TP_printk(
                VIF_PR_FMT " rssi_min_thold =%d, rssi_max_thold = %d",
                VIF_PR_ARG, __entry->rssi_min_thold, __entry->rssi_max_thold
        )
);

/*
 * api_eosp - tracepoint taking (local, sta).
 *
 * Stores and prints just the LOCAL_ENTRY and STA_ENTRY fields.
 */
TRACE_EVENT(api_eosp,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta),

        TP_ARGS(local, sta),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                STA_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                STA_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT STA_PR_FMT,
                LOCAL_PR_ARG, STA_PR_ARG
        )
);

/*
 * api_send_eosp_nullfunc - tracepoint taking (local, sta, tid).
 *
 * Stores the LOCAL_ENTRY and STA_ENTRY fields plus the u8 tid, printed as
 * " tid:%d".
 */
TRACE_EVENT(api_send_eosp_nullfunc,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta,
                 u8 tid),

        TP_ARGS(local, sta, tid),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                STA_ENTRY
                __field(u8, tid)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                STA_ASSIGN;
                __entry->tid = tid;
        ),

        TP_printk(
                LOCAL_PR_FMT STA_PR_FMT " tid:%d",
                LOCAL_PR_ARG, STA_PR_ARG, __entry->tid
        )
);

/*
 * api_sta_set_buffered - tracepoint taking (local, sta, tid, buffered).
 *
 * Stores the LOCAL_ENTRY and STA_ENTRY fields plus the u8 tid and the
 * boolean buffered flag, printed as " tid:%d buffered:%d".
 */
TRACE_EVENT(api_sta_set_buffered,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sta *sta,
                 u8 tid, bool buffered),

        TP_ARGS(local, sta, tid, buffered),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                STA_ENTRY
                __field(u8, tid)
                __field(bool, buffered)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                STA_ASSIGN;
                __entry->tid = tid;
                __entry->buffered = buffered;
        ),

        TP_printk(
                LOCAL_PR_FMT STA_PR_FMT " tid:%d buffered:%d",
                LOCAL_PR_ARG, STA_PR_ARG, __entry->tid, __entry->buffered
        )
);

/*
 * api_radar_detected - tracepoint taking only (local).
 *
 * Stores the LOCAL_ENTRY fields and prints them followed by the fixed
 * text " radar detected".
 */
TRACE_EVENT(api_radar_detected,
        TP_PROTO(struct ieee80211_local *local),

        TP_ARGS(local),

        TP_STRUCT__entry(
                LOCAL_ENTRY
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
        ),

        TP_printk(
                LOCAL_PR_FMT " radar detected",
                LOCAL_PR_ARG
        )
);

/*
 * api_request_smps - tracepoint taking (local, sdata, link, smps_mode).
 *
 * Stores the LOCAL_ENTRY and VIF_ENTRY fields, link->link_id (as an int)
 * and the smps_mode enum value (as a u32), printed as
 * " link:%d, smps_mode:%d".
 */
TRACE_EVENT(api_request_smps,
        TP_PROTO(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_link_data *link,
                 enum ieee80211_smps_mode smps_mode),

        TP_ARGS(local, sdata, link, smps_mode),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                VIF_ENTRY
                __field(int, link_id)
                __field(u32, smps_mode)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                VIF_ASSIGN;
                /* each assignment is its own statement (was joined by ',') */
                __entry->link_id = link->link_id;
                __entry->smps_mode = smps_mode;
        ),

        TP_printk(
                LOCAL_PR_FMT " " VIF_PR_FMT " link:%d, smps_mode:%d",
                LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id, __entry->smps_mode
        )
);

/*
 * Tracing for internal functions
 * (which may also be called in response to driver calls)
 */

/*
 * wake_queue - tracepoint taking (local, queue, reason).
 *
 * Stores the LOCAL_ENTRY fields, the u16 queue number and the reason enum
 * value as a u32, printed as " queue:%d, reason:%d".
 */
TRACE_EVENT(wake_queue,
        TP_PROTO(struct ieee80211_local *local, u16 queue,
                 enum queue_stop_reason reason),

        TP_ARGS(local, queue, reason),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u16, queue)
                __field(u32, reason)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->queue = queue;
                __entry->reason = reason;
        ),

        TP_printk(
                LOCAL_PR_FMT " queue:%d, reason:%d",
                LOCAL_PR_ARG, __entry->queue, __entry->reason
        )
);

/*
 * stop_queue - tracepoint taking (local, queue, reason).
 *
 * Stores the LOCAL_ENTRY fields, the u16 queue number and the reason enum
 * value as a u32, printed as " queue:%d, reason:%d".
 */
TRACE_EVENT(stop_queue,
        TP_PROTO(struct ieee80211_local *local, u16 queue,
                 enum queue_stop_reason reason),

        TP_ARGS(local, queue, reason),

        TP_STRUCT__entry(
                LOCAL_ENTRY
                __field(u16, queue)
                __field(u32, reason)
        ),

        TP_fast_assign(
                LOCAL_ASSIGN;
                __entry->queue = queue;
                __entry->reason = reason;
        ),

        TP_printk(
                LOCAL_PR_FMT " queue:%d, reason:%d",
                LOCAL_PR_ARG, __entry->queue, __entry->reason
        )
);

/*
 * drv_can_neg_ttlm - tracepoint taking (local, sdata, neg_ttlm).
 *
 * Stores the LOCAL_ENTRY and VIF_ENTRY fields plus copies of
 * neg_ttlm->downlink and neg_ttlm->uplink. Only the local/vif fields are
 * printed; the two maps are stored in the record but are not part of the
 * TP_printk output.
 *
 * NOTE(review): __array()'s third argument is presumably an element count,
 * in which case "sizeof(u16) * 8" declares 16 u16 slots per map while the
 * memcpy() calls copy sizeof(neg_ttlm->downlink)/sizeof(neg_ttlm->uplink)
 * bytes. Confirm against struct ieee80211_neg_ttlm and the __array()
 * definition whether the declared length should simply be the number of
 * entries in those arrays.
 */
TRACE_EVENT(drv_can_neg_ttlm,
            TP_PROTO(struct ieee80211_local *local,
                     struct ieee80211_sub_if_data *sdata,
                     struct ieee80211_neg_ttlm *neg_ttlm),

        TP_ARGS(local, sdata, neg_ttlm),

        TP_STRUCT__entry(LOCAL_ENTRY
                         VIF_ENTRY
                         __array(u16, downlink, sizeof(u16) * 8)
                         __array(u16, uplink, sizeof(u16) * 8)
        ),

        TP_fast_assign(LOCAL_ASSIGN;
                       VIF_ASSIGN;
                       memcpy(__entry->downlink, neg_ttlm->downlink,
                              sizeof(neg_ttlm->downlink));
                       memcpy(__entry->uplink, neg_ttlm->uplink,
                              sizeof(neg_ttlm->uplink));
        ),

        TP_printk(LOCAL_PR_FMT ", " VIF_PR_FMT, LOCAL_PR_ARG, VIF_PR_ARG)
);

/*
 * drv_neg_ttlm_res - tracepoint taking (local, sdata, res, neg_ttlm).
 *
 * Stores the LOCAL_ENTRY and VIF_ENTRY fields, the res enum value as a u32
 * and copies of neg_ttlm->downlink and neg_ttlm->uplink. Only the
 * local/vif fields and res are printed.
 *
 * NOTE(review): __array()'s third argument is presumably an element count,
 * in which case "sizeof(u16) * 8" declares 16 u16 slots per map while the
 * memcpy() calls copy sizeof(neg_ttlm->downlink)/sizeof(neg_ttlm->uplink)
 * bytes. Confirm against struct ieee80211_neg_ttlm and the __array()
 * definition; the sizes are left unchanged here.
 */
TRACE_EVENT(drv_neg_ttlm_res,
            TP_PROTO(struct ieee80211_local *local,
                     struct ieee80211_sub_if_data *sdata,
                     enum ieee80211_neg_ttlm_res res,
                     struct ieee80211_neg_ttlm *neg_ttlm),

        TP_ARGS(local, sdata, res, neg_ttlm),

        TP_STRUCT__entry(LOCAL_ENTRY
                         VIF_ENTRY
                         __field(u32, res)
                         __array(u16, downlink, sizeof(u16) * 8)
                         __array(u16, uplink, sizeof(u16) * 8)
        ),

        TP_fast_assign(LOCAL_ASSIGN;
                       VIF_ASSIGN;
                       __entry->res = res;
                       memcpy(__entry->downlink, neg_ttlm->downlink,
                              sizeof(neg_ttlm->downlink));
                       memcpy(__entry->uplink, neg_ttlm->uplink,
                              sizeof(neg_ttlm->uplink));
        ),

        /*
         * No trailing "\n " in the format: the trace output code appends
         * the newline itself, so an explicit one leaves a stray, nearly
         * empty line after every record.
         */
        TP_printk(LOCAL_PR_FMT  VIF_PR_FMT " response: %d",
                  LOCAL_PR_ARG, VIF_PR_ARG, __entry->res
        )
);
#endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>














































    4 

































    1 











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM maple_tree

#if !defined(_TRACE_MM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MM_H


#include <linux/tracepoint.h>

struct ma_state;

/*
 * ma_op - trace event recording a snapshot of a maple state.
 *
 * @fn:  caller-supplied label, printed with %s.
 * @mas: maple state whose min, max, index, last and node fields are copied
 *       into the record.
 *
 * NOTE(review): only the @fn pointer is stored, not a copy of the string, so
 * it is dereferenced when the event is formatted -- assumes callers pass a
 * string with static lifetime (e.g. __func__); confirm at the call sites.
 */
TRACE_EVENT(ma_op,

        TP_PROTO(const char *fn, struct ma_state *mas),

        TP_ARGS(fn, mas),

        TP_STRUCT__entry(
                        __field(const char *, fn)
                        __field(unsigned long, min)
                        __field(unsigned long, max)
                        __field(unsigned long, index)
                        __field(unsigned long, last)
                        __field(void *, node)
        ),

        TP_fast_assign(
                        __entry->fn                = fn;
                        __entry->min                = mas->min;
                        __entry->max                = mas->max;
                        __entry->index                = mas->index;
                        __entry->last                = mas->last;
                        __entry->node                = mas->node;
        ),

        /* Output order: label, node, (min max), then the index-last range. */
        TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu",
                  __entry->fn,
                  (void *) __entry->node,
                  (unsigned long) __entry->min,
                  (unsigned long) __entry->max,
                  (unsigned long) __entry->index,
                  (unsigned long) __entry->last
        )
)
/*
 * ma_read - trace event recording a snapshot of a maple state; it has the
 * same prototype, record layout and output format as ma_op.
 *
 * @fn:  caller-supplied label, printed with %s.
 * @mas: maple state whose min, max, index, last and node fields are copied
 *       into the record.
 *
 * NOTE(review): only the @fn pointer is stored, not a copy of the string, so
 * it is dereferenced when the event is formatted -- assumes callers pass a
 * string with static lifetime (e.g. __func__); confirm at the call sites.
 */
TRACE_EVENT(ma_read,

        TP_PROTO(const char *fn, struct ma_state *mas),

        TP_ARGS(fn, mas),

        TP_STRUCT__entry(
                        __field(const char *, fn)
                        __field(unsigned long, min)
                        __field(unsigned long, max)
                        __field(unsigned long, index)
                        __field(unsigned long, last)
                        __field(void *, node)
        ),

        TP_fast_assign(
                        __entry->fn                = fn;
                        __entry->min                = mas->min;
                        __entry->max                = mas->max;
                        __entry->index                = mas->index;
                        __entry->last                = mas->last;
                        __entry->node                = mas->node;
        ),

        /* Output order: label, node, (min max), then the index-last range. */
        TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu",
                  __entry->fn,
                  (void *) __entry->node,
                  (unsigned long) __entry->min,
                  (unsigned long) __entry->max,
                  (unsigned long) __entry->index,
                  (unsigned long) __entry->last
        )
)

/*
 * ma_write - trace event recording a snapshot of a maple state together with
 * a pivot and a value supplied by the caller.
 *
 * @fn:  caller-supplied label, printed with %s.
 * @mas: maple state whose min, max, index, last and node fields are copied
 *       into the record.
 * @piv: caller-supplied pivot, stored and printed as an unsigned long.
 * @val: caller-supplied pointer, stored and printed with %p.
 *
 * NOTE(review): only the @fn pointer is stored, not a copy of the string, so
 * it is dereferenced when the event is formatted -- assumes callers pass a
 * string with static lifetime (e.g. __func__); confirm at the call sites.
 */
TRACE_EVENT(ma_write,

        TP_PROTO(const char *fn, struct ma_state *mas, unsigned long piv,
                 void *val),

        TP_ARGS(fn, mas, piv, val),

        TP_STRUCT__entry(
                        __field(const char *, fn)
                        __field(unsigned long, min)
                        __field(unsigned long, max)
                        __field(unsigned long, index)
                        __field(unsigned long, last)
                        __field(unsigned long, piv)
                        __field(void *, val)
                        __field(void *, node)
        ),

        TP_fast_assign(
                        __entry->fn                = fn;
                        __entry->min                = mas->min;
                        __entry->max                = mas->max;
                        __entry->index                = mas->index;
                        __entry->last                = mas->last;
                        __entry->piv                = piv;
                        __entry->val                = val;
                        __entry->node                = mas->node;
        ),

        /* Output order: label, node, (min max), index-last range, piv, val. */
        TP_printk("%s\tNode %p (%lu %lu) range:%lu-%lu piv (%lu) val %p",
                  __entry->fn,
                  (void *) __entry->node,
                  (unsigned long) __entry->min,
                  (unsigned long) __entry->max,
                  (unsigned long) __entry->index,
                  (unsigned long) __entry->last,
                  (unsigned long) __entry->piv,
                  (void *) __entry->val
        )
)
#endif /* _TRACE_MM_H */

/* This part must be outside protection */
#include <trace/define_trace.h>














    1 










    1 
















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
// SPDX-License-Identifier: GPL-2.0
#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/pid_namespace.h>
#include "internal.h"

/*
 * /proc/self:
 */
/*
 * Produce the target of the /proc/self symlink: the decimal tgid of the
 * calling task, as numbered in the pid namespace of this procfs instance.
 *
 * @dentry: dentry being followed, or NULL.  With a NULL dentry the buffer is
 *          allocated with GFP_ATOMIC and an allocation failure is reported
 *          as -ECHILD rather than -ENOMEM.
 * @inode:  the symlink inode; its superblock selects the pid namespace.
 * @done:   delayed call that is armed with kfree_link() on the returned
 *          buffer.
 *
 * Returns the freshly allocated link string, or an ERR_PTR(): -ENOENT when
 * the tgid lookup yields 0, -ENOMEM/-ECHILD when the allocation fails.
 */
static const char *proc_self_get_link(struct dentry *dentry,
                                      struct inode *inode,
                                      struct delayed_call *done)
{
        struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
        pid_t tgid = task_tgid_nr_ns(current, ns);
        char *link;

        /* A tgid of 0 means there is nothing to point the link at. */
        if (tgid == 0)
                return ERR_PTR(-ENOENT);

        /* Ten decimal digits cover any unsigned int; one more byte for NUL. */
        link = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
        if (unlikely(!link)) {
                if (dentry)
                        return ERR_PTR(-ENOMEM);
                return ERR_PTR(-ECHILD);
        }

        sprintf(link, "%u", tgid);

        /* Hand ownership of the buffer to the delayed call for freeing. */
        set_delayed_call(done, kfree_link, link);
        return link;
}

/*
 * Inode operations installed on the /proc/self inode by proc_setup_self();
 * only ->get_link is provided.
 */
static const struct inode_operations proc_self_inode_operations = {
        .get_link        = proc_self_get_link,
};

/*
 * Inode number given to every /proc/self inode.  It is filled in by
 * proc_self_init() and only read afterwards (see proc_setup_self()).
 */
static unsigned self_inum __ro_after_init;

/*
 * Create the "self" symlink under the root of procfs superblock @s.
 *
 * A dentry named "self" and a new inode are allocated while the root inode
 * is locked; the inode is set up as a root-owned symlink with mode
 * S_IRWXUGO and attached to the dentry.  On success the dentry is recorded
 * in the superblock's proc_fs_info.
 *
 * Returns 0 on success, or -ENOMEM if either allocation fails (an error
 * message is logged in that case).
 */
int proc_setup_self(struct super_block *s)
{
        struct inode *root_inode = d_inode(s->s_root);
        struct proc_fs_info *fs_info = proc_sb_info(s);
        struct dentry *self;
        struct inode *inode;
        int ret = -ENOMEM;

        inode_lock(root_inode);

        self = d_alloc_name(s->s_root, "self");
        if (!self)
                goto unlock;

        inode = new_inode(s);
        if (!inode) {
                /* Drop the dentry that was just allocated. */
                dput(self);
                goto unlock;
        }

        inode->i_ino = self_inum;
        simple_inode_init_ts(inode);
        inode->i_mode = S_IFLNK | S_IRWXUGO;
        inode->i_uid = GLOBAL_ROOT_UID;
        inode->i_gid = GLOBAL_ROOT_GID;
        inode->i_op = &proc_self_inode_operations;
        d_add(self, inode);
        ret = 0;

unlock:
        inode_unlock(root_inode);

        if (ret) {
                pr_err("proc_fill_super: can't allocate /proc/self\n");
                return ret;
        }

        /* Only publish the dentry once it is fully set up. */
        fs_info->proc_self = self;
        return ret;
}

/*
 * Init-time setup for /proc/self: asks proc_alloc_inum() to fill in
 * self_inum, the inode number later used by proc_setup_self().
 *
 * NOTE(review): any status reported by proc_alloc_inum() is not examined
 * here -- confirm that a failure at this point is either impossible or
 * acceptable.
 */
void __init proc_self_init(void)
{
        proc_alloc_inum(&self_inum);
}



































































































    1 



    1 


    1 





















































































































































































































































































































    1 

    1 
    1 






























































































































































































































































































































































































































































































































































































    1 
    1 




    1 














    1 





















    1 




    1 





    1 




    1 
    1 


    1 





































































































































































































































































































































































































































































































































































































































































    1 

    1 











    1 























    1 




















































    1 
    1 










    1 











    1 

    1 









    1 

    1 










































































    1 











































































































































































    1 

    1 





























































































































































































































































































































































































































































































































































































































































































































































































































































    1 





    1 








    1 






















































































































































































































    1 

    1 
    1 





    1 
































































































































































































    1 
    1 





















































    1 
    1 


    1 



    1 
    1 















    1 
    1 





    1 


























    1 
    1 
    1 





    1 
    1 

    1 



    1 




















































































































































































































































    1 

    1 







    1 





    1 


    1 








    1 




    1 








    1 









    1 


    1 












    1 














































































































































































































    1 






    1 
    1 


    1 






















    1 







































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/atomic.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp_states.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
#include <net/hotdata.h>
#include <net/xfrm.h>
#include <asm/ioctls.h>
#include "protocol.h"
#include "mib.h"

#define CREATE_TRACE_POINTS
#include <trace/events/mptcp.h>

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* IPv6 flavour of the MPTCP socket: the generic msk immediately followed
 * by the IPv6 per-socket info. Only defined when CONFIG_MPTCP_IPV6 is
 * enabled.
 */
struct mptcp6_sock {
        struct mptcp_sock msk;
        struct ipv6_pinfo np;
};
#endif

/* Bit flags identifying control-message kinds.
 * NOTE(review): presumably the cmsgs requested on the receive path
 * (timestamp / in-queue byte count) - the users are outside this chunk.
 */
enum {
        MPTCP_CMSG_TS = BIT(0),
        MPTCP_CMSG_INQ = BIT(1),
};

/* NOTE(review): presumably counts the allocated MPTCP sockets - the users
 * are outside this chunk.
 */
static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp;

/* Forward declarations for helpers defined outside this chunk */
static void __mptcp_destroy_sock(struct sock *sk);
static void mptcp_check_send_data_fin(struct sock *sk);

/* Per-CPU delegated-action state and a file-private net_device.
 * NOTE(review): going by the name, the device backs a NAPI instance -
 * confirm against the init code.
 */
DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
static struct net_device mptcp_napi_dev;

/* Lockless snapshot of msk->wnd_end, the end sequence number of the
 * receiver's advertised window.
 */
static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
{
        u64 wnd_end = READ_ONCE(msk->wnd_end);

        return wnd_end;
}

/* Pick the plain TCP proto_ops matching the protocol of @sk: the IPv6
 * stream ops for tcpv6_prot (when MPTCP IPv6 support is built), the IPv4
 * stream ops otherwise. Warns once if @sk is neither TCPv6 nor TCP.
 */
static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk)
{
        const struct proto *prot = sk->sk_prot;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        if (prot == &tcpv6_prot)
                return &inet6_stream_ops;
#endif
        WARN_ON_ONCE(prot != &tcp_prot);
        return &inet_stream_ops;
}

/* Create the first subflow of @msk and link it to the msk.
 *
 * On success the new subflow sock is published in msk->first, added to
 * msk->conn_list, flagged to request MPTCP and grafted onto the msk's
 * struct socket.
 *
 * Returns 0 on success, or the error returned by
 * mptcp_subflow_create_socket().
 */
static int __mptcp_socket_create(struct mptcp_sock *msk)
{
        struct mptcp_subflow_context *subflow;
        struct sock *sk = (struct sock *)msk;
        struct socket *ssock;
        int err;

        err = mptcp_subflow_create_socket(sk, sk->sk_family, &ssock);
        if (err)
                return err;

        /* start from the scaling ratio of the freshly created TCP sock */
        msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio;
        WRITE_ONCE(msk->first, ssock->sk);
        subflow = mptcp_subflow_ctx(ssock->sk);
        list_add(&subflow->node, &msk->conn_list);
        /* extra reference on the subflow sock.
         * NOTE(review): presumably paired with the conn_list linkage -
         * confirm where it is released.
         */
        sock_hold(ssock->sk);
        subflow->request_mptcp = 1;
        subflow->subflow_id = msk->subflow_id++;

        /* This is the first subflow, always with id 0 */
        WRITE_ONCE(subflow->local_id, 0);
        mptcp_sock_graft(msk->first, sk->sk_socket);
        /* drop the inode reference of the temporary struct socket.
         * NOTE(review): assumes the subflow sock stays reachable through
         * the graft above - confirm.
         */
        iput(SOCK_INODE(ssock));

        return 0;
}

/* Return the first subflow while the MPC handshake has not started yet,
 * creating it on demand.
 *
 * Only valid while the msk is in CLOSE or LISTEN state: any other state
 * yields ERR_PTR(-EINVAL). A failure to create the subflow is returned as
 * an ERR_PTR() as well.
 */
struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk)
{
        const int allowed_states = TCPF_CLOSE | TCPF_LISTEN;
        struct sock *sk = (struct sock *)msk;
        int err;

        if (((1 << sk->sk_state) & allowed_states) == 0)
                return ERR_PTR(-EINVAL);

        if (msk->first)
                return msk->first;

        err = __mptcp_socket_create(msk);
        if (err)
                return ERR_PTR(err);

        return msk->first;
}

/* Account @skb in the drop statistics of @sk, then free it. */
static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
{
        sk_drops_add(sk, skb);
        __kfree_skb(skb);
}

static void mptcp_rmem_fwd_alloc_add(struct sock *sk, int size)
{
        WRITE_ONCE(mptcp_sk(sk)->rmem_fwd_alloc,
                   mptcp_sk(sk)->rmem_fwd_alloc + size);
}

/* Consume @size bytes of the msk forward-allocated receive memory, i.e.
 * decrease rmem_fwd_alloc by @size.
 */
static void mptcp_rmem_charge(struct sock *sk, int size)
{
        mptcp_rmem_fwd_alloc_add(sk, -size);
}

/* Try to merge the payload of @from into @to.
 *
 * Coalescing is refused when @from carries a non-zero MPTCP offset or when
 * skb_try_coalesce() fails. On success @to's end sequence is extended to
 * cover @from, the extra truesize is charged to @sk and @from is released.
 *
 * Returns true if @from was merged (and must not be used by the caller
 * anymore), false if both skbs are left untouched.
 */
static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
                               struct sk_buff *from)
{
        bool fragstolen;
        int delta;

        if (MPTCP_SKB_CB(from)->offset ||
            !skb_try_coalesce(to, from, &fragstolen, &delta))
                return false;

        /* terminate the message: nothing continues it with pr_cont() */
        pr_debug("coalesced seq %llx into %llx new len %d new end seq %llx\n",
                 MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
                 to->len, MPTCP_SKB_CB(from)->end_seq);
        MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;

        /* note the fwd memory can reach a negative value after accounting
         * for the delta, but the later skb free will restore a non
         * negative one
         */
        atomic_add(delta, &sk->sk_rmem_alloc);
        mptcp_rmem_charge(sk, delta);
        kfree_skb_partial(from, fragstolen);

        return true;
}

static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
                                   struct sk_buff *from)
{
        if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq)
                return false;

        return mptcp_try_coalesce((struct sock *)msk, to, from);
}

/* Give back the whole pages contained in @amount: the page-aligned part is
 * removed from the msk forward allocation and the matching number of pages
 * is handed to __sk_mem_reduce_allocated().
 */
static void __mptcp_rmem_reclaim(struct sock *sk, int amount)
{
        int pages = amount >> PAGE_SHIFT;

        mptcp_rmem_charge(sk, pages << PAGE_SHIFT);
        __sk_mem_reduce_allocated(sk, pages);
}

/* Return @size bytes to the msk forward allocation and reclaim whole pages
 * once the amount exceeding the reserved memory reaches PAGE_SIZE.
 */
static void mptcp_rmem_uncharge(struct sock *sk, int size)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        int excess;

        mptcp_rmem_fwd_alloc_add(sk, size);
        excess = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);

        /* see sk_mem_uncharge() for the rationale behind this scheme.
         * NOTE(review): 'excess' is a signed int compared against
         * PAGE_SIZE; if PAGE_SIZE is unsigned here a negative value also
         * takes the reclaim path - confirm this is intended.
         */
        if (likely(!(excess >= PAGE_SIZE)))
                return;

        __mptcp_rmem_reclaim(sk, excess);
}

/* skb destructor installed by mptcp_set_owner_r(): removes the skb
 * truesize from the owning socket's sk_rmem_alloc and returns it to the
 * msk forward allocation.
 */
static void mptcp_rfree(struct sk_buff *skb)
{
        unsigned int len = skb->truesize;
        struct sock *sk = skb->sk;

        atomic_sub(len, &sk->sk_rmem_alloc);
        mptcp_rmem_uncharge(sk, len);
}

/* Make @sk the receive-side owner of @skb: any previous owner is dropped,
 * mptcp_rfree() becomes the destructor and the skb truesize is charged to
 * both sk_rmem_alloc and the msk forward allocation.
 */
void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
        skb_orphan(skb);
        skb->sk = sk;
        skb->destructor = mptcp_rfree;
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
        mptcp_rmem_charge(sk, skb->truesize);
}

/* Queue @skb on the msk out-of-order rbtree, keyed by MPTCP sequence.
 *
 * "inspired" by tcp_data_queue_ofo(), main differences:
 * - use mptcp seqs
 * - don't cope with sacks
 *
 * The skb is dropped when it ends beyond msk->rcv_wnd_sent or when it is
 * fully covered by data already queued; it is merged into an adjacent skb
 * when it directly follows it. Queued skbs entirely covered by @skb are
 * dropped. In every case ownership of @skb is taken by this function.
 */
static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
{
        struct sock *sk = (struct sock *)msk;
        struct rb_node **p, *parent;
        u64 seq, end_seq, max_seq;
        struct sk_buff *skb1;

        seq = MPTCP_SKB_CB(skb)->map_seq;
        end_seq = MPTCP_SKB_CB(skb)->end_seq;
        max_seq = atomic64_read(&msk->rcv_wnd_sent);

        pr_debug("msk=%p seq=%llx limit=%llx empty=%d\n", msk, seq, max_seq,
                 RB_EMPTY_ROOT(&msk->out_of_order_queue));
        if (after64(end_seq, max_seq)) {
                /* out of window */
                mptcp_drop(sk, skb);
                /* compute the distance on the full 64 bits: narrowing
                 * max_seq to unsigned long would truncate it on 32-bit
                 */
                pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
                         (long long)(end_seq - max_seq),
                         (unsigned long long)atomic64_read(&msk->rcv_wnd_sent));
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
                return;
        }

        p = &msk->out_of_order_queue.rb_node;
        MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE);
        if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) {
                rb_link_node(&skb->rbnode, NULL, p);
                rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);
                msk->ooo_last_skb = skb;
                goto end;
        }

        /* with 2 subflows, adding at end of ooo queue is quite likely
         * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
         */
        if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) {
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
                return;
        }

        /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
        if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) {
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL);
                parent = &msk->ooo_last_skb->rbnode;
                p = &parent->rb_right;
                goto insert;
        }

        /* Find place to insert this segment. Handle overlaps on the way. */
        parent = NULL;
        while (*p) {
                parent = *p;
                skb1 = rb_to_skb(parent);
                if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
                        p = &parent->rb_left;
                        continue;
                }
                if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) {
                        if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) {
                                /* All the bits are present. Drop. */
                                mptcp_drop(sk, skb);
                                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
                                return;
                        }
                        if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) {
                                /* partial overlap:
                                 *     |     skb      |
                                 *  |     skb1    |
                                 * continue traversing
                                 */
                        } else {
                                /* skb's seq == skb1's seq and skb covers skb1.
                                 * Replace skb1 with skb.
                                 */
                                rb_replace_node(&skb1->rbnode, &skb->rbnode,
                                                &msk->out_of_order_queue);
                                mptcp_drop(sk, skb1);
                                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
                                goto merge_right;
                        }
                } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) {
                        MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE);
                        return;
                }
                p = &parent->rb_right;
        }

insert:
        /* Insert segment into RB tree. */
        rb_link_node(&skb->rbnode, parent, p);
        rb_insert_color(&skb->rbnode, &msk->out_of_order_queue);

merge_right:
        /* Remove other segments covered by skb. */
        while ((skb1 = skb_rb_next(skb)) != NULL) {
                if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq))
                        break;
                rb_erase(&skb1->rbnode, &msk->out_of_order_queue);
                mptcp_drop(sk, skb1);
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
        }
        /* If there is no skb after us, we are the last_skb ! */
        if (!skb1)
                msk->ooo_last_skb = skb;

end:
        /* the skb stays queued: shrink it if possible and charge it to the msk */
        skb_condense(skb);
        mptcp_set_owner_r(skb, sk);
}

/* Make sure the msk forward allocation can cover @size bytes.
 *
 * When the current forward allocation is too small, the missing amount is
 * requested, rounded up to whole pages, via __sk_mem_raise_allocated()
 * and, on success, added to the forward allocation. @ssk is not used.
 *
 * Returns true if @size bytes are available, false otherwise.
 */
static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        int shortfall, pages;

        if (msk->rmem_fwd_alloc >= size)
                return true;

        shortfall = size - msk->rmem_fwd_alloc;
        pages = sk_mem_pages(shortfall);
        if (!__sk_mem_raise_allocated(sk, shortfall, pages, SK_MEM_RECV))
                return false;

        mptcp_rmem_fwd_alloc_add(sk, pages << PAGE_SHIFT);
        return true;
}

/* Detach @skb from the @ssk receive queue and hand it over to the msk.
 *
 * @offset:   stored in the skb MPTCP control block
 * @copy_len: amount of MPTCP-level data carried by @skb
 *
 * In-sequence data (map_seq == msk->ack_seq) advances msk->ack_seq and is
 * either coalesced with the tail of the msk receive queue or appended to
 * it. Data ahead of msk->ack_seq goes to the out-of-order queue. Anything
 * else, and any skb whose memory can't be scheduled, is dropped.
 *
 * Returns true only when the data landed in sequence in the msk receive
 * queue.
 */
static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
                             struct sk_buff *skb, unsigned int offset,
                             size_t copy_len)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        struct sock *sk = (struct sock *)msk;
        struct sk_buff *tail;
        bool has_rxtstamp;

        __skb_unlink(skb, &ssk->sk_receive_queue);

        skb_ext_reset(skb);
        skb_orphan(skb);

        /* try to fetch required memory from subflow */
        if (!mptcp_rmem_schedule(sk, ssk, skb->truesize))
                goto drop;

        /* fetch the flag via the TCP control block before the MPTCP
         * control block fields are written below.
         * NOTE(review): presumably both views share skb->cb - confirm.
         */
        has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;

        /* the skb map_seq accounts for the skb offset:
         * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
         * value
         */
        MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow);
        MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len;
        MPTCP_SKB_CB(skb)->offset = offset;
        MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp;

        if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
                /* in sequence */
                msk->bytes_received += copy_len;
                WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
                tail = skb_peek_tail(&sk->sk_receive_queue);
                if (tail && mptcp_try_coalesce(sk, tail, skb))
                        return true;

                mptcp_set_owner_r(skb, sk);
                __skb_queue_tail(&sk->sk_receive_queue, skb);
                return true;
        } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
                /* future data: park it, the reader is not woken for it */
                mptcp_data_queue_ofo(msk, skb);
                return false;
        }

        /* old data, keep it simple and drop the whole pkt, sender
         * will retransmit as needed, if needed.
         */
        MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
drop:
        mptcp_drop(sk, skb);
        return false;
}

/* Stop the msk retransmit timer and clear the cached timer interval. */
static void mptcp_stop_rtx_timer(struct sock *sk)
{
        sk_stop_timer(sk, &inet_csk(sk)->icsk_retransmit_timer);
        mptcp_sk(sk)->timer_ival = 0;
}

/* Notify whoever is waiting on @sk about a close-related state change.
 *
 * Nothing is done for dead sockets. Otherwise the state-change callback
 * runs and async waiters are signalled with POLL_HUP when the socket is
 * fully shut down or closed, with POLL_IN in all other cases.
 */
static void mptcp_close_wake_up(struct sock *sk)
{
        int band;

        if (sock_flag(sk, SOCK_DEAD))
                return;

        sk->sk_state_change(sk);

        band = POLL_IN;
        if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
                band = POLL_HUP;

        sk_wake_async(sk, SOCK_WAKE_WAITD, band);
}

/* called under the msk socket lock */
static bool mptcp_pending_data_fin_ack(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        return ((1 << sk->sk_state) &
                (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
               msk->write_seq == READ_ONCE(msk->snd_una);
}

/* React to an acknowledged DATA_FIN: stop advertising it, advance the msk
 * state machine (FIN_WAIT1 -> FIN_WAIT2, CLOSING/LAST_ACK -> CLOSE) and
 * wake up the waiters. No-op when no DATA_FIN ack is pending.
 */
static void mptcp_check_data_fin_ack(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        int state;

        /* Look for an acknowledged DATA_FIN */
        if (!mptcp_pending_data_fin_ack(sk))
                return;

        WRITE_ONCE(msk->snd_data_fin_enable, 0);

        state = sk->sk_state;
        if (state == TCP_FIN_WAIT1)
                mptcp_set_state(sk, TCP_FIN_WAIT2);
        else if (state == TCP_CLOSING || state == TCP_LAST_ACK)
                mptcp_set_state(sk, TCP_CLOSE);

        mptcp_close_wake_up(sk);
}

/* Tell whether a received DATA_FIN is ready to be processed, i.e. one has
 * been recorded, the msk is in ESTABLISHED, FIN_WAIT1 or FIN_WAIT2 state
 * and msk->ack_seq has reached the DATA_FIN sequence number.
 *
 * When true is returned and @seq is not NULL, *@seq is set to the DATA_FIN
 * sequence number. Can be called with no lock acquired.
 */
static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
{
        const int rx_open_states = TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
                                   TCPF_FIN_WAIT2;
        struct mptcp_sock *msk = mptcp_sk(sk);
        u64 fin_seq;

        if (!READ_ONCE(msk->rcv_data_fin))
                return false;

        if (!((1 << inet_sk_state_load(sk)) & rx_open_states))
                return false;

        fin_seq = READ_ONCE(msk->rcv_data_fin_seq);
        if (READ_ONCE(msk->ack_seq) != fin_seq)
                return false;

        if (seq)
                *seq = fin_seq;

        return true;
}

/* Set the msk timer interval to TCP_RTO_MIN doubled once per retransmit
 * recorded in icsk_retransmits, with the shift capped at
 * ilog2(TCP_RTO_MAX / TCP_RTO_MIN).
 */
static void mptcp_set_datafin_timeout(struct sock *sk)
{
        const u32 max_shift = ilog2(TCP_RTO_MAX / TCP_RTO_MIN);
        u32 shift;

        shift = min_t(u32, inet_csk(sk)->icsk_retransmits, max_shift);
        mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << shift;
}

/* Store @tout as the msk timer interval, falling back to TCP_RTO_MIN when
 * @tout is not positive.
 */
static void __mptcp_set_timeout(struct sock *sk, long tout)
{
        if (tout <= 0)
                tout = TCP_RTO_MIN;

        mptcp_sk(sk)->timer_ival = tout;
}

static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subflow)
{
        const struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

        return inet_csk(ssk)->icsk_pending && !subflow->stale_count ?
               inet_csk(ssk)->icsk_timeout - jiffies : 0;
}

static void mptcp_set_timeout(struct sock *sk)
{
        struct mptcp_subflow_context *subflow;
        long tout = 0;

        mptcp_for_each_subflow(mptcp_sk(sk), subflow)
                tout = max(tout, mptcp_timeout_from_subflow(subflow));
        __mptcp_set_timeout(sk, tout);
}

/* An ack can be sent on @ssk unless it is in SYN_SENT, SYN_RECV,
 * TIME_WAIT, CLOSE or LISTEN state.
 */
static inline bool tcp_can_send_ack(const struct sock *ssk)
{
        const int no_ack_states = TCPF_SYN_SENT | TCPF_SYN_RECV |
                                  TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN;

        return ((1 << inet_sk_state_load(ssk)) & no_ack_states) == 0;
}

/* Send a TCP ack on @ssk if its state allows it. This helper takes no
 * lock by itself.
 */
void __mptcp_subflow_send_ack(struct sock *ssk)
{
        if (!tcp_can_send_ack(ssk))
                return;

        tcp_send_ack(ssk);
}

/* Send an ack on @ssk, if its state allows it, while holding the subflow
 * socket lock via lock_sock_fast().
 */
static void mptcp_subflow_send_ack(struct sock *ssk)
{
        bool slow;

        slow = lock_sock_fast(ssk);
        __mptcp_subflow_send_ack(ssk);
        unlock_sock_fast(ssk, slow);
}

static void mptcp_send_ack(struct mptcp_sock *msk)
{
        struct mptcp_subflow_context *subflow;

        mptcp_for_each_subflow(msk, subflow)
                mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow));
}

/* Run tcp_cleanup_rbuf() on @ssk under the subflow socket lock, provided
 * the subflow is in a state where an ack can be sent.
 */
static void mptcp_subflow_cleanup_rbuf(struct sock *ssk)
{
        bool slow;

        slow = lock_sock_fast(ssk);
        if (tcp_can_send_ack(ssk))
                tcp_cleanup_rbuf(ssk, 1);
        unlock_sock_fast(ssk, slow);
}

/* Lockless check telling whether a receive-buffer cleanup on @ssk is
 * worthwhile: an ack must be scheduled and either more than rcv_mss bytes
 * arrived since the last window update (rcv_nxt - rcv_wup), or the msk
 * receive queue is empty (@rx_empty) and the ack is flagged as pushed.
 */
static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty)
{
        const struct inet_connection_sock *icsk = inet_csk(ssk);
        u8 pending = READ_ONCE(icsk->icsk_ack.pending);
        const struct tcp_sock *tp = tcp_sk(ssk);

        if (!(pending & ICSK_ACK_SCHED))
                return false;

        if (READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->rcv_wup) >
            READ_ONCE(icsk->icsk_ack.rcv_mss))
                return true;

        return rx_empty && (pending & (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED));
}

/* Trigger a receive-buffer cleanup on the subflows of @msk.
 *
 * All subflows are processed when the msk free space is positive and at
 * least twice msk->old_wspace; otherwise only those for which
 * mptcp_subflow_could_cleanup() says it is worthwhile.
 */
static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
{
        int prev_space = READ_ONCE(msk->old_wspace);
        struct sock *sk = (struct sock *)msk;
        struct mptcp_subflow_context *subflow;
        int cur_space = __mptcp_space(sk);
        bool force, queue_empty;

        force = cur_space > 0 && cur_space >= (prev_space << 1);
        queue_empty = !__mptcp_rmem(sk);

        mptcp_for_each_subflow(msk, subflow) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

                if (!force && !mptcp_subflow_could_cleanup(ssk, queue_empty))
                        continue;

                mptcp_subflow_cleanup_rbuf(ssk);
        }
}

/* Process a pending DATA_FIN received from the peer, if any.
 *
 * A DATA_FIN received while this side is in ESTABLISHED, FIN_WAIT1 or
 * FIN_WAIT2 state must be acked. msk->rcv_data_fin was set when parsing
 * the incoming options at the subflow level, without the msk lock held,
 * so this is the first opportunity to act on the DATA_FIN and change the
 * msk state.
 *
 * If we are caught up to the sequence number of the incoming DATA_FIN,
 * send the DATA_ACK now and do the state transition. If not caught up, do
 * nothing and let the recv code send the DATA_ACK when catching up.
 *
 * Returns true if a DATA_FIN was processed.
 */
static bool mptcp_check_data_fin(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        u64 rcv_data_fin_seq;

        if (!mptcp_pending_data_fin(sk, &rcv_data_fin_seq))
                return false;

        /* the DATA_FIN consumes one unit of sequence space */
        WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1);
        WRITE_ONCE(msk->rcv_data_fin, 0);

        WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN);
        smp_mb__before_atomic(); /* SHUTDOWN must be visible first */

        switch (sk->sk_state) {
        case TCP_ESTABLISHED:
                mptcp_set_state(sk, TCP_CLOSE_WAIT);
                break;
        case TCP_FIN_WAIT1:
                mptcp_set_state(sk, TCP_CLOSING);
                break;
        case TCP_FIN_WAIT2:
                mptcp_set_state(sk, TCP_CLOSE);
                break;
        default:
                /* Other states not expected */
                WARN_ON_ONCE(1);
                break;
        }

        if (!__mptcp_check_fallback(msk))
                mptcp_send_ack(msk);
        mptcp_close_wake_up(sk);

        return true;
}

static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
                                           struct sock *ssk,
                                           unsigned int *bytes)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        struct sock *sk = (struct sock *)msk;
        unsigned int moved = 0;
        bool more_data_avail;
        struct tcp_sock *tp;
        bool done = false;
        int sk_rbuf;

        sk_rbuf = READ_ONCE(sk->sk_rcvbuf);

        if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
                int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);

                if (unlikely(ssk_rbuf > sk_rbuf)) {
                        WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf);
                        sk_rbuf = ssk_rbuf;
                }
        }

        pr_debug("msk=%p ssk=%p", msk, ssk);
        tp = tcp_sk(ssk);
        do {
                u32 map_remaining, offset;
                u32 seq = tp->copied_seq;
                struct sk_buff *skb;
                bool fin;

                /* try to move as much data as available */
                map_remaining = subflow->map_data_len -
                                mptcp_subflow_get_map_offset(subflow);

                skb = skb_peek(&ssk->sk_receive_queue);
                if (!skb) {
                        /* With racing move_skbs_to_msk() and __mptcp_move_skbs(),
                         * a different CPU can have already processed the pending
                         * data, stop here or we can enter an infinite loop
                         */
                        if (!moved)
                                done = true;
                        break;
                }

                if (__mptcp_check_fallback(msk)) {
                        /* Under fallback skbs have no MPTCP extension and TCP could
                         * collapse them between the dummy map creation and the
                         * current dequeue. Be sure to adjust the map size.
                         */
                        map_remaining = skb->len;
                        subflow->map_data_len = skb->len;
                }

                offset = seq - TCP_SKB_CB(skb)->seq;
                fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
                if (fin) {
                        done = true;
                        seq++;
                }

                if (offset < skb->len) {
                        size_t len = skb->len - offset;

                        if (tp->urg_data)
                                done = true;

                        if (__mptcp_move_skb(msk, ssk, skb, offset, len))
                                moved += len;
                        seq += len;

                        if (WARN_ON_ONCE(map_remaining < len))
                                break;
                } else {
                        WARN_ON_ONCE(!fin);
                        sk_eat_skb(ssk, skb);
                        done = true;
                }

                WRITE_ONCE(tp->copied_seq, seq);
                more_data_avail = mptcp_subflow_data_available(ssk);

                if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) {
                        done = true;
                        break;
                }
        } while (more_data_avail);

        if (moved > 0)
                msk->last_data_recv = tcp_jiffies32;
        *bytes += moved;
        return done;
}

static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
{
        struct sock *sk = (struct sock *)msk;
        struct sk_buff *skb, *tail;
        bool moved = false;
        struct rb_node *p;
        u64 end_seq;

        p = rb_first(&msk->out_of_order_queue);
        pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
        while (p) {
                skb = rb_to_skb(p);
                if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq))
                        break;

                p = rb_next(p);
                rb_erase(&skb->rbnode, &msk->out_of_order_queue);

                if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq,
                                      msk->ack_seq))) {
                        mptcp_drop(sk, skb);
                        MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA);
                        continue;
                }

                end_seq = MPTCP_SKB_CB(skb)->end_seq;
                tail = skb_peek_tail(&sk->sk_receive_queue);
                if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) {
                        int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;

                        /* skip overlapping data, if any */
                        pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
                                 MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
                                 delta);
                        MPTCP_SKB_CB(skb)->offset += delta;
                        MPTCP_SKB_CB(skb)->map_seq += delta;
                        __skb_queue_tail(&sk->sk_receive_queue, skb);
                }
                msk->bytes_received += end_seq - msk->ack_seq;
                WRITE_ONCE(msk->ack_seq, end_seq);
                moved = true;
        }
        return moved;
}

/* Propagate the error pending on subflow @ssk, if any, to the msk @sk.
 *
 * Fetching the error goes through sock_error().
 * NOTE(review): presumably that also clears it on the subflow - confirm.
 *
 * Returns true if an error was propagated to the msk, false if the
 * subflow had no error or the msk is not in a state where subflow errors
 * are propagated.
 */
static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk)
{
        int err = sock_error(ssk);
        int ssk_state;

        if (!err)
                return false;

        /* only propagate errors on fallen-back sockets or
         * on MPC connect
         */
        if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk)))
                return false;

        /* We need to propagate only transition to CLOSE state.
         * Orphaned socket will see such state change via
         * subflow_sched_work_if_closed() and that path will properly
         * destroy the msk as needed.
         */
        ssk_state = inet_sk_state_load(ssk);
        if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
                mptcp_set_state(sk, ssk_state);
        /* the sign is flipped before storing.
         * NOTE(review): presumably sock_error() returns a negative errno
         * while sk_err holds a positive one - confirm.
         */
        WRITE_ONCE(sk->sk_err, -err);

        /* This barrier is coupled with smp_rmb() in mptcp_poll() */
        smp_wmb();
        sk_error_report(sk);
        return true;
}

void __mptcp_error_report(struct sock *sk)
{
        struct mptcp_subflow_context *subflow;
        struct mptcp_sock *msk = mptcp_sk(sk);

        mptcp_for_each_subflow(msk, subflow)
                if (__mptcp_subflow_error_report(sk, mptcp_subflow_tcp_sock(subflow)))
                        break;
}

/* Move the data pending on @ssk into the msk queues, then try to drain
 * the msk out-of-order queue.
 *
 * In most cases we will be able to lock the mptcp socket.  If it's already
 * owned, we need to defer to the work queue to avoid ABBA deadlock.
 * NOTE(review): no locking is attempted in this function; the caller
 * visible in this file holds the msk data lock - confirm this sentence is
 * still accurate.
 *
 * Returns true if in-sequence data was moved from @ssk by this call.
 */
static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
{
        struct sock *sk = (struct sock *)msk;
        unsigned int moved = 0;

        __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
        __mptcp_ofo_queue(msk);
        if (unlikely(ssk->sk_err)) {
                /* report right away only if the user does not own the msk,
                 * otherwise just flag the pending report in cb_flags
                 */
                if (!sock_owned_by_user(sk))
                        __mptcp_error_report(sk);
                else
                        __set_bit(MPTCP_ERROR_REPORT,  &msk->cb_flags);
        }

        /* If the moves have caught up with the DATA_FIN sequence number
         * it's time to ack the DATA_FIN and change socket state, but
         * this is not a good place to change state. Let the workqueue
         * do it.
         */
        if (mptcp_pending_data_fin(sk, NULL))
                mptcp_schedule_work(sk);
        return moved > 0;
}

/* Notify the msk @sk that subflow @ssk has data available.
 *
 * The data is moved to the msk under the msk data lock, and the reader is
 * woken up only when in-sequence data was moved and
 * mptcp_epollin_ready() agrees. Nothing is done for disposable subflows
 * or when the msk receive memory already exceeds the receive buffer.
 */
void mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        struct mptcp_sock *msk = mptcp_sk(sk);
        int sk_rbuf, ssk_rbuf;

        /* The peer can send data while we are shutting down this
         * subflow at msk destruction time, but we must avoid enqueuing
         * more data to the msk receive queue
         */
        if (unlikely(subflow->disposable))
                return;

        /* use the larger of the msk and subflow receive buffer as limit */
        ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf);
        sk_rbuf = READ_ONCE(sk->sk_rcvbuf);
        if (unlikely(ssk_rbuf > sk_rbuf))
                sk_rbuf = ssk_rbuf;

        /* over limit? can't append more skbs to msk, Also, no need to wake-up*/
        if (__mptcp_rmem(sk) > sk_rbuf) {
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
                return;
        }

        /* Wake-up the reader only for in-sequence data */
        mptcp_data_lock(sk);
        if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
                sk->sk_data_ready(sk);
        mptcp_data_unlock(sk);
}

/* Bookkeeping for a subflow that just joined @msk: seed the subflow
 * map_seq with the current msk ack_seq, clear allow_infinite_fallback
 * and emit the MPTCP_EVENT_SUB_ESTABLISHED event.
 */
static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk)
{
        mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq);
        WRITE_ONCE(msk->allow_infinite_fallback, false);
        mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
}

/* Complete the attachment of the joining subflow @ssk to @msk.
 *
 * Returns false, leaving @ssk untouched, when the msk is not in
 * ESTABLISHED state; true once the subflow has been given an id, synced
 * with the msk socket options and accounted as joined.
 */
static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
{
        struct sock *sk = (struct sock *)msk;

        if (sk->sk_state != TCP_ESTABLISHED)
                return false;

        /* attach to msk socket only after we are sure we will deal with it
         * at close time
         */
        if (sk->sk_socket && !ssk->sk_socket)
                mptcp_sock_graft(ssk, sk->sk_socket);

        mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++;
        mptcp_sockopt_sync_locked(msk, ssk);
        mptcp_subflow_joined(msk, ssk);
        mptcp_stop_tout_timer(sk);
        __mptcp_propagate_sndbuf(sk, ssk);
        return true;
}

/* Move every subflow queued on @join_list to the msk conn_list and finish
 * its join under the subflow socket lock; subflows that can't be joined
 * are reset.
 */
static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list)
{
        struct mptcp_subflow_context *tmp, *subflow;
        struct mptcp_sock *msk = mptcp_sk(sk);

        /* _safe variant: each entry is moved off join_list while iterating */
        list_for_each_entry_safe(subflow, tmp, join_list, node) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
                bool slow = lock_sock_fast(ssk);

                list_move_tail(&subflow->node, &msk->conn_list);
                if (!__mptcp_finish_join(msk, ssk))
                        mptcp_subflow_reset(ssk);
                unlock_sock_fast(ssk, slow);
        }
}

static bool mptcp_rtx_timer_pending(struct sock *sk)
{
        return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
}

/* (Re)arm the msk retransmit timer to fire msk->timer_ival jiffies from
 * now. Nothing is done once the msk is in CLOSE state, so that the timer
 * is not rescheduled on close.
 */
static void mptcp_reset_rtx_timer(struct sock *sk)
{
        unsigned long interval;

        if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE))
                return;

        interval = mptcp_sk(sk)->timer_ival;
        sk_reset_timer(sk, &inet_csk(sk)->icsk_retransmit_timer,
                       jiffies + interval);
}

/* Schedule the msk worker for @sk, unless the msk is in CLOSE state.
 *
 * A socket reference is handed over to the scheduled work.
 * NOTE(review): the worker is expected to release it when done - confirm
 * against mptcp_worker().
 *
 * Returns true if the work was queued by this call, false if the msk is
 * closed or the work was already pending.
 */
bool mptcp_schedule_work(struct sock *sk)
{
        if (inet_sk_state_load(sk) == TCP_CLOSE)
                return false;

        /* Grab the reference owned by the work *before* queueing it: the
         * worker may run to completion, dropping its reference, before
         * schedule_work() even returns, so taking the hold afterwards
         * would be too late.
         */
        sock_hold(sk);

        if (schedule_work(&mptcp_sk(sk)->work))
                return true;

        /* the work was already pending and owns its own reference: give
         * back the one taken above
         */
        sock_put(sk);
        return false;
}

/* Return the first subflow of @msk flagged as having data available, or
 * NULL if there is none. The msk socket must be owned by the caller.
 */
static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
{
        struct mptcp_subflow_context *subflow;

        msk_owned_by_me(msk);

        mptcp_for_each_subflow(msk, subflow) {
                if (!READ_ONCE(subflow->data_avail))
                        continue;

                return mptcp_subflow_tcp_sock(subflow);
        }

        return NULL;
}

/* Tell whether new data starting at MPTCP sequence @write_seq can be
 * appended to @skb, whose MPTCP extension is @mpext (possibly NULL).
 */
static bool mptcp_skb_can_collapse_to(u64 write_seq,
                                      const struct sk_buff *skb,
                                      const struct mptcp_ext *mpext)
{
        /* TCP itself must allow appending to this skb */
        if (!tcp_skb_can_collapse_to(skb))
                return false;

        /* a mapping is required, and it must not have been xmitted yet */
        if (!mpext || mpext->frozen)
                return false;

        /* the MPTCP level sequence has to be in order */
        return mpext->data_seq + mpext->data_len == write_seq;
}

/* we can append data to the given data frag if:
 * - there is space available in the backing page_frag
 * - the data frag tail matches the current page_frag free offset
 * - the data frag end sequence number matches the current write seq
 */
static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
                                       const struct page_frag *pfrag,
                                       const struct mptcp_data_frag *df)
{
        return df && pfrag->page == df->page &&
                pfrag->size - pfrag->offset > 0 &&
                pfrag->offset == (df->offset + df->data_len) &&
                df->data_seq + df->data_len == msk->write_seq;
}

/* Give back @len bytes of memory accounting on @sk: uncharge them and
 * subtract them from the queued write memory counter.
 */
static void dfrag_uncharge(struct sock *sk, int len)
{
        sk_mem_uncharge(sk, len);
        sk_wmem_queued_add(sk, -len);
}

/* Unlink @dfrag from the list it sits on, uncharge both its payload and
 * its overhead from @sk, then drop the reference on the backing page.
 */
static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
{
        int charged = dfrag->data_len + dfrag->overhead;

        list_del(&dfrag->list);
        dfrag_uncharge(sk, charged);

        /* keep the page release last: @dfrag is not touched afterwards */
        put_page(dfrag->page);
}

/* Release the rtx queue data covered by msk->snd_una.
 *
 * Data frags ending at or before snd_una are dropped, a partially
 * covered head frag is trimmed in place, recovery mode is left once
 * snd_una moves past recovery_snd_nxt, and the rtx timer is stopped or
 * re-armed according to the resulting state.
 *
 * called under both the msk socket lock and the data lock
 */
static void __mptcp_clean_una(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct mptcp_data_frag *dtmp, *dfrag;
        u64 snd_una;

        snd_una = msk->snd_una;
        /* drop every frag whose end sequence is not after snd_una */
        list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
                if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
                        break;

                if (unlikely(dfrag == msk->first_pending)) {
                        /* in recovery mode can see ack after the current snd head */
                        if (WARN_ON_ONCE(!msk->recovery))
                                break;

                        /* advance the send head before freeing this frag */
                        WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
                }

                dfrag_clear(sk, dfrag);
        }

        /* the (new) rtx head may be partially covered by snd_una: trim it */
        dfrag = mptcp_rtx_head(sk);
        if (dfrag && after64(snd_una, dfrag->data_seq)) {
                u64 delta = snd_una - dfrag->data_seq;

                /* prevent wrap around in recovery mode */
                if (unlikely(delta > dfrag->already_sent)) {
                        if (WARN_ON_ONCE(!msk->recovery))
                                goto out;
                        if (WARN_ON_ONCE(delta > dfrag->data_len))
                                goto out;
                        /* raise already_sent to delta, so the subtraction
                         * below can't underflow
                         */
                        dfrag->already_sent += delta - dfrag->already_sent;
                }

                /* skip the first delta bytes of the frag */
                dfrag->data_seq += delta;
                dfrag->offset += delta;
                dfrag->data_len -= delta;
                dfrag->already_sent -= delta;

                dfrag_uncharge(sk, delta);
        }

        /* all retransmitted data acked, recovery completed */
        if (unlikely(msk->recovery) && after64(msk->snd_una, msk->recovery_snd_nxt))
                msk->recovery = false;

out:
        /* snd_una == snd_nxt == write_seq: stop a pending rtx timer, unless
         * mptcp_data_fin_enabled() reports true; in any other case re-arm it
         */
        if (snd_una == msk->snd_nxt && snd_una == msk->write_seq) {
                if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk))
                        mptcp_stop_rtx_timer(sk);
        } else {
                mptcp_reset_rtx_timer(sk);
        }

        /* hand over to the msk worker when mptcp_pending_data_fin_ack() is set */
        if (mptcp_pending_data_fin_ack(sk))
                mptcp_schedule_work(sk);
}

/* Clean the rtx queue up to snd_una, then invoke mptcp_write_space().
 * The caller must hold the sk_lock.slock spinlock, as asserted below.
 */
static void __mptcp_clean_una_wakeup(struct sock *sk)
{
        lockdep_assert_held_once(&sk->sk_lock.slock);

        __mptcp_clean_una(sk);
        mptcp_write_space(sk);
}

/* Same as __mptcp_clean_una_wakeup(), but acquires and releases the msk
 * data lock around the call.
 */
static void mptcp_clean_una_wakeup(struct sock *sk)
{
        mptcp_data_lock(sk);
        __mptcp_clean_una_wakeup(sk);
        mptcp_data_unlock(sk);
}

static void mptcp_enter_memory_pressure(struct sock *sk)
{
        struct mptcp_subflow_context *subflow;
        struct mptcp_sock *msk = mptcp_sk(sk);
        bool first = true;

        mptcp_for_each_subflow(msk, subflow) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

                if (first)
                        tcp_enter_memory_pressure(ssk);
                sk_stream_moderate_sndbuf(ssk);

                first = false;
        }
        __mptcp_sync_sndbuf(sk);
}

/* Make sure @pfrag can hold a data frag header plus a minimal amount
 * (32 bytes) of payload. On failure enter memory pressure and return
 * false; return true otherwise.
 */
static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
        unsigned int min_size = 32U + sizeof(struct mptcp_data_frag);

        if (unlikely(!skb_page_frag_refill(min_size, pfrag,
                                           sk->sk_allocation))) {
                mptcp_enter_memory_pressure(sk);
                return false;
        }

        return true;
}

/* Initialize a data frag descriptor inside the @pfrag page, at
 * @orig_offset rounded up to sizeof(long) alignment. The new descriptor
 * is empty (no data, nothing sent), starts at the current msk write_seq
 * and places its payload right after the descriptor itself.
 */
static struct mptcp_data_frag *
mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
                      int orig_offset)
{
        int hdr_offset = ALIGN(orig_offset, sizeof(long));
        struct mptcp_data_frag *dfrag;

        dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + hdr_offset);

        dfrag->page = pfrag->page;
        dfrag->data_seq = msk->write_seq;
        dfrag->data_len = 0;
        dfrag->already_sent = 0;

        /* payload begins just past the descriptor */
        dfrag->offset = hdr_offset + sizeof(struct mptcp_data_frag);

        /* alignment padding plus the descriptor size */
        dfrag->overhead = hdr_offset - orig_offset + sizeof(struct mptcp_data_frag);

        return dfrag;
}

/* State shared by the xmit helpers while pushing data on a subflow. */
struct mptcp_sendmsg_info {
        /* value returned by tcp_send_mss() for the subflow in use */
        int mss_now;
        /* size goal filled in by tcp_send_mss() */
        int size_goal;
        /* upper bound for 'sent'; set to the data_len of the dfrag being pushed */
        u16 limit;
        /* bytes of the current dfrag already handed to the subflow */
        u16 sent;
        /* flags forwarded to tcp_send_mss() */
        unsigned int flags;
        /* when true, tx skbs are allocated with GFP_ATOMIC */
        bool data_lock_held;
};

/* Clamp @avail_size to the MPTCP-level send window still available past
 * @data_seq. When the subflow send window is smaller than that, grow it
 * (capped at U32_MAX) and account the event in the SNDWNDSHARED MIB.
 * For a fallback msk, @avail_size is returned unchanged.
 */
static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk,
                                    u64 data_seq, int avail_size)
{
        u64 wnd_end = mptcp_wnd_end(msk);
        struct tcp_sock *tp = tcp_sk(ssk);
        u64 wnd_left;

        if (__mptcp_check_fallback(msk))
                return avail_size;

        wnd_left = wnd_end - data_seq;

        if (unlikely(tp->snd_wnd < wnd_left)) {
                tp->snd_wnd = min_t(u64, U32_MAX, wnd_left);
                MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED);
        }

        return min_t(unsigned int, wnd_left, avail_size);
}

/* Allocate an skb extension with @gfp and attach it to @skb as the
 * SKB_EXT_MPTCP extension. Returns false if the allocation fails.
 */
static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp)
{
        struct skb_ext *ext;

        ext = __skb_ext_alloc(gfp);
        if (ext)
                __skb_ext_set(skb, SKB_EXT_MPTCP, ext);

        return ext != NULL;
}

/* Allocate a tx skb with MAX_TCP_HEADER bytes of reserved headroom and an
 * MPTCP extension attached.
 *
 * Returns the skb, or NULL on failure. Memory pressure is entered only
 * when the skb allocation itself fails; if just the extension can't be
 * allocated the skb is simply freed.
 */
static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
{
        struct sk_buff *skb;

        skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
        if (unlikely(!skb)) {
                mptcp_enter_memory_pressure(sk);
                return NULL;
        }

        if (unlikely(!__mptcp_add_ext(skb, gfp))) {
                __kfree_skb(skb);
                return NULL;
        }

        skb_reserve(skb, MAX_TCP_HEADER);
        skb->ip_summed = CHECKSUM_PARTIAL;
        INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
        return skb;
}

/* Allocate a tx skb and entail it on the @ssk write queue.
 *
 * Returns the queued skb, or NULL when either the allocation or the
 * write memory scheduling on @ssk fails; in the latter case the skb is
 * freed.
 */
static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
{
        struct sk_buff *skb = __mptcp_do_alloc_tx_skb(sk, gfp);

        if (!skb)
                return NULL;

        if (unlikely(!sk_wmem_schedule(ssk, skb->truesize))) {
                tcp_skb_tsorted_anchor_cleanup(skb);
                kfree_skb(skb);
                return NULL;
        }

        tcp_skb_entail(ssk, skb);
        return skb;
}

/* Allocate and queue a tx skb on @ssk, using GFP_ATOMIC when
 * @data_lock_held is set and the msk allocation mask otherwise.
 */
static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
{
        gfp_t gfp;

        if (data_lock_held)
                gfp = GFP_ATOMIC;
        else
                gfp = sk->sk_allocation;

        return __mptcp_alloc_tx_skb(sk, ssk, gfp);
}

/* Fold the checksum of the last @added bytes of @skb into the checksum
 * stored in the skb MPTCP extension. Only the appended range, starting at
 * skb->len - @added, is checksummed here.
 */
static void mptcp_update_data_checksum(struct sk_buff *skb, int added)
{
        struct mptcp_ext *mpext = mptcp_get_ext(skb);
        int offset = skb->len - added;
        __wsum prev, chunk;

        prev = ~csum_unfold(mpext->csum);
        chunk = skb_checksum(skb, offset, added, 0);

        mpext->csum = csum_fold(csum_block_add(prev, chunk, offset));
}

static void mptcp_update_infinite_map(struct mptcp_sock *msk,
                                      struct sock *ssk,
                                      struct mptcp_ext *mpext)
{
        if (!mpext)
                return;

        mpext->infinite_map = 1;
        mpext->data_len = 0;

        MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX);
        mptcp_subflow_ctx(ssk)->send_infinite_map = 0;
        pr_fallback(msk);
        mptcp_do_fallback(ssk);
}

/* Upper bound enforced on the subflow sk_gso_max_size by
 * mptcp_sendmsg_frag(): the legacy GSO max size minus MAX_TCP_HEADER + 1.
 */
#define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1))

/* Queue data from @dfrag, starting @info->sent bytes into it, on the
 * write queue of subflow @ssk, creating or extending the MPTCP mapping
 * carried by the skb extension.
 *
 * The data is either appended to the tail skb of @ssk, when the existing
 * mapping can be extended, or placed into a newly allocated skb.
 *
 * Returns the number of bytes queued when positive; 0 when nothing was
 * queued for later accounting (inconsistent @info, no window available,
 * or a one byte zero window probe was pushed); -EAGAIN when @ssk can't
 * send; -ENOMEM when skb allocation or write memory scheduling fails.
 */
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
                              struct mptcp_data_frag *dfrag,
                              struct mptcp_sendmsg_info *info)
{
        u64 data_seq = dfrag->data_seq + info->sent;
        int offset = dfrag->offset + info->sent;
        struct mptcp_sock *msk = mptcp_sk(sk);
        bool zero_window_probe = false;
        struct mptcp_ext *mpext = NULL;
        bool can_coalesce = false;
        bool reuse_skb = true;
        struct sk_buff *skb;
        size_t copy;
        int i;

        pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u",
                 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);

        /* sanity check: sent <= limit <= dfrag->data_len must hold */
        if (WARN_ON_ONCE(info->sent > info->limit ||
                         info->limit > dfrag->data_len))
                return 0;

        if (unlikely(!__tcp_can_send(ssk)))
                return -EAGAIN;

        /* compute send limit */
        if (unlikely(ssk->sk_gso_max_size > MPTCP_MAX_GSO_SIZE))
                ssk->sk_gso_max_size = MPTCP_MAX_GSO_SIZE;
        info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
        copy = info->size_goal;

        /* try to append to the tail skb while it is below the size goal */
        skb = tcp_write_queue_tail(ssk);
        if (skb && copy > skb->len) {
                /* Limit the write to the size available in the
                 * current skb, if any, so that we create at most a new skb.
                 * Explicitly tells TCP internals to avoid collapsing on later
                 * queue management operation, to avoid breaking the ext <->
                 * SSN association set here
                 */
                mpext = mptcp_get_ext(skb);
                if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) {
                        TCP_SKB_CB(skb)->eor = 1;
                        tcp_mark_push(tcp_sk(ssk), skb);
                        goto alloc_skb;
                }

                /* a new page frag is needed and the skb has no free slot */
                i = skb_shinfo(skb)->nr_frags;
                can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
                if (!can_coalesce && i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) {
                        tcp_mark_push(tcp_sk(ssk), skb);
                        goto alloc_skb;
                }

                copy -= skb->len;
        } else {
alloc_skb:
                /* also reached via goto from the branch above */
                skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held);
                if (!skb)
                        return -ENOMEM;

                i = skb_shinfo(skb)->nr_frags;
                reuse_skb = false;
                mpext = mptcp_get_ext(skb);
        }

        /* Zero window and all data acked? Probe. */
        copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy);
        if (copy == 0) {
                u64 snd_una = READ_ONCE(msk->snd_una);

                /* probe only when snd_una == snd_nxt and the subflow write
                 * queue has no tail skb; otherwise just give up
                 */
                if (snd_una != msk->snd_nxt || tcp_write_queue_tail(ssk)) {
                        tcp_remove_empty_skb(ssk);
                        return 0;
                }

                /* the probe carries one byte mapped at snd_una - 1 */
                zero_window_probe = true;
                data_seq = snd_una - 1;
                copy = 1;
        }

        copy = min_t(size_t, copy, info->limit - info->sent);
        if (!sk_wmem_schedule(ssk, copy)) {
                tcp_remove_empty_skb(ssk);
                return -ENOMEM;
        }

        /* reference the dfrag page from the skb: no payload copy is done */
        if (can_coalesce) {
                skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
        } else {
                get_page(dfrag->page);
                skb_fill_page_desc(skb, i, dfrag->page, offset, copy);
        }

        /* update skb sizes, subflow memory accounting and TCP sequence */
        skb->len += copy;
        skb->data_len += copy;
        skb->truesize += copy;
        sk_wmem_queued_add(ssk, copy);
        sk_mem_charge(ssk, copy);
        WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy);
        TCP_SKB_CB(skb)->end_seq += copy;
        tcp_skb_pcount_set(skb, 0);

        /* on skb reuse we just need to update the DSS len */
        if (reuse_skb) {
                TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
                mpext->data_len += copy;
                goto out;
        }

        /* new skb: initialize a fresh mapping */
        memset(mpext, 0, sizeof(*mpext));
        mpext->data_seq = data_seq;
        mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
        mpext->data_len = copy;
        mpext->use_map = 1;
        mpext->dsn64 = 1;

        pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
                 mpext->data_seq, mpext->subflow_seq, mpext->data_len,
                 mpext->dsn64);

        if (zero_window_probe) {
                /* the frozen mapping can't be extended later; push the probe
                 * at once and report no progress to the caller
                 */
                mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
                mpext->frozen = 1;
                if (READ_ONCE(msk->csum_enabled))
                        mptcp_update_data_checksum(skb, copy);
                tcp_push_pending_frames(ssk);
                return 0;
        }
out:
        if (READ_ONCE(msk->csum_enabled))
                mptcp_update_data_checksum(skb, copy);
        if (mptcp_subflow_ctx(ssk)->send_infinite_map)
                mptcp_update_infinite_map(msk, ssk, mpext);
        trace_mptcp_sendmsg_frag(mpext);
        mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
        return copy;
}

/* Cap for the burst computed by mptcp_subflow_get_send(): 64KB minus the
 * TCP header, the max TCP option space, the IPv6 header and the IPv6
 * fragment header sizes.
 */
#define MPTCP_SEND_BURST_SIZE                ((1 << 16) - \
                                         sizeof(struct tcphdr) - \
                                         MAX_TCP_OPTION_SPACE - \
                                         sizeof(struct ipv6hdr) - \
                                         sizeof(struct frag_hdr))

/* Best candidate found so far by mptcp_subflow_get_send() for a given
 * SSK_MODE_* class.
 */
struct subflow_send_info {
        /* candidate subflow socket, NULL if none has been found */
        struct sock *ssk;
        /* queued write memory divided by the pacing rate, scaled by 2^32 */
        u64 linger_time;
};

/* Clear the stale flag of @subflow, accounting the event in the
 * SUBFLOWRECOVER MIB. No-op if the subflow is not stale.
 */
void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow)
{
        struct sock *ssk;

        if (subflow->stale) {
                subflow->stale = 0;

                ssk = mptcp_subflow_tcp_sock(subflow);
                MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SUBFLOWRECOVER);
        }
}

/* Tell whether @subflow can be used.
 *
 * A stale subflow is reported as not active as long as its TCP rcv_tstamp
 * still equals the recorded stale_rcv_tstamp; once the timestamp changes
 * the stale flag is cleared. In all the other cases the result comes from
 * __mptcp_subflow_active().
 */
bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
        struct sock *ssk;
        u32 rcv_tstamp;

        if (likely(!subflow->stale))
                return __mptcp_subflow_active(subflow);

        ssk = mptcp_subflow_tcp_sock(subflow);
        rcv_tstamp = READ_ONCE(tcp_sk(ssk)->rcv_tstamp);
        if (subflow->stale_rcv_tstamp == rcv_tstamp)
                return false;

        mptcp_subflow_set_active(subflow);
        return __mptcp_subflow_active(subflow);
}

/* Indexes into the send_info[] array of mptcp_subflow_get_send(); the
 * array is indexed by subflow->backup, so the values must match that
 * flag: 0 for regular subflows, 1 for backup ones. SSK_MODE_MAX is the
 * array size.
 */
#define SSK_MODE_ACTIVE        0
#define SSK_MODE_BACKUP        1
#define SSK_MODE_MAX        2

/* implement the mptcp packet scheduler;
 * returns the subflow that will transmit the next DSS
 * additionally updates the rtx timeout
 *
 * Returns NULL when no usable subflow is found or the selected one has
 * no free stream memory. On success msk->snd_burst is set, unless the
 * computed burst is zero.
 */
struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
        struct subflow_send_info send_info[SSK_MODE_MAX];
        struct mptcp_subflow_context *subflow;
        struct sock *sk = (struct sock *)msk;
        u32 pace, burst, wmem;
        int i, nr_active = 0;
        struct sock *ssk;
        u64 linger_time;
        long tout = 0;

        /* pick the subflow with the lower wmem/wspace ratio */
        for (i = 0; i < SSK_MODE_MAX; ++i) {
                send_info[i].ssk = NULL;
                /* -1 converts to the max u64 value: any computed linger
                 * time compares lower
                 */
                send_info[i].linger_time = -1;
        }

        mptcp_for_each_subflow(msk, subflow) {
                trace_mptcp_subflow_get_send(subflow);
                ssk =  mptcp_subflow_tcp_sock(subflow);
                if (!mptcp_subflow_active(subflow))
                        continue;

                /* track the largest timeout among the active subflows */
                tout = max(tout, mptcp_timeout_from_subflow(subflow));
                nr_active += !subflow->backup;
                pace = subflow->avg_pacing_rate;
                if (unlikely(!pace)) {
                        /* init pacing rate from socket */
                        subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate);
                        pace = subflow->avg_pacing_rate;
                        if (!pace)
                                continue;
                }

                /* queued bytes over pacing rate, scaled by 2^32; keep the
                 * lowest value per class (regular or backup)
                 */
                linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace);
                if (linger_time < send_info[subflow->backup].linger_time) {
                        send_info[subflow->backup].ssk = ssk;
                        send_info[subflow->backup].linger_time = linger_time;
                }
        }
        __mptcp_set_timeout(sk, tout);

        /* pick the best backup if no other subflow is active */
        if (!nr_active)
                send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk;

        /* According to the blest algorithm, to avoid HoL blocking for the
         * faster flow, we need to:
         * - estimate the faster flow linger time
         * - use the above to estimate the amount of byte transferred
         *   by the faster flow
         * - check that the amount of queued data is greater than the above,
         *   otherwise do not use the picked, slower, subflow
         * We select the subflow with the shorter estimated time to flush
         * the queued mem, which basically ensure the above. We just need
         * to check that subflow has a non empty cwin.
         */
        ssk = send_info[SSK_MODE_ACTIVE].ssk;
        if (!ssk || !sk_stream_memory_free(ssk))
                return NULL;

        /* burst: remaining MPTCP-level window, capped to the max burst size */
        burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
        wmem = READ_ONCE(ssk->sk_wmem_queued);
        if (!burst)
                return ssk;

        /* weighted average of the previous avg rate (weight: wmem) and the
         * current socket pacing rate (weight: burst)
         */
        subflow = mptcp_subflow_ctx(ssk);
        subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
                                           READ_ONCE(ssk->sk_pacing_rate) * burst,
                                           burst + wmem);
        msk->snd_burst = burst;
        return ssk;
}

/* Push the data queued on @ssk, using the mss and size goal cached in
 * @info, then release the subflow socket lock.
 */
static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info)
{
        tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
        release_sock(ssk);
}

/* Account @sent bytes of @dfrag as handed to a subflow: bump the dfrag
 * already_sent counter, consume the msk burst and, when this moves the
 * send edge forward, update bytes_sent and snd_nxt.
 */
static void mptcp_update_post_push(struct mptcp_sock *msk,
                                   struct mptcp_data_frag *dfrag,
                                   u32 sent)
{
        u64 snd_nxt_new;

        dfrag->already_sent += sent;
        msk->snd_burst -= sent;

        /* The candidate snd_nxt is derived from the dfrag start sequence
         * plus the data handed to the subflow so far. It can be smaller
         * than snd_nxt when mptcp is recovering after a failover and is
         * re-sending old segments: in that case there is nothing to update.
         */
        snd_nxt_new = dfrag->data_seq + dfrag->already_sent;
        if (unlikely(!after64(snd_nxt_new, msk->snd_nxt)))
                return;

        msk->bytes_sent += snd_nxt_new - msk->snd_nxt;
        WRITE_ONCE(msk->snd_nxt, snd_nxt_new);
}

/* If the msk has a send head, flag MPTCP_PUSH_PENDING in cb_flags under
 * the msk data lock.
 */
void mptcp_check_and_set_pending(struct sock *sk)
{
        if (!mptcp_send_head(sk))
                return;

        mptcp_data_lock(sk);
        mptcp_sk(sk)->cb_flags |= BIT(MPTCP_PUSH_PENDING);
        mptcp_data_unlock(sk);
}

/* Push the pending data frags of @sk on subflow @ssk.
 *
 * The loop stops when the send head is empty, when mptcp_sendmsg_frag()
 * makes no progress, or - after completing a dfrag - when the burst is
 * exhausted, @ssk has no free stream memory or is no longer active.
 *
 * Returns the number of bytes pushed if any, otherwise the last
 * mptcp_sendmsg_frag() return value (0 or a negative error). On a
 * positive return msk->last_data_sent is refreshed.
 */
static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
                                  struct mptcp_sendmsg_info *info)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct mptcp_data_frag *dfrag;
        int len, copied = 0, err = 0;

        while ((dfrag = mptcp_send_head(sk))) {
                /* resume from where the previous push on this dfrag stopped */
                info->sent = dfrag->already_sent;
                info->limit = dfrag->data_len;
                len = dfrag->data_len - dfrag->already_sent;
                while (len > 0) {
                        int ret = 0;

                        ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info);
                        if (ret <= 0) {
                                /* report partial progress, if any, over the error */
                                err = copied ? : ret;
                                goto out;
                        }

                        info->sent += ret;
                        copied += ret;
                        len -= ret;

                        mptcp_update_post_push(msk, dfrag, ret);
                }
                /* dfrag fully pushed: move the send head to the next one */
                WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));

                if (msk->snd_burst <= 0 ||
                    !sk_stream_memory_free(ssk) ||
                    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
                        err = copied;
                        goto out;
                }
                mptcp_set_timeout(sk);
        }
        err = copied;

out:
        if (err > 0)
                msk->last_data_sent = tcp_jiffies32;
        return err;
}

/* Push the msk pending data on the subflows picked by the scheduler.
 *
 * Each round asks mptcp_sched_get_send() to flag the subflows to use,
 * then pushes on every flagged subflow while holding its socket lock.
 * The lock of the last used subflow is kept across iterations and handed
 * over only when a different subflow is selected. The loop ends when no
 * data is pending, the scheduler fails, or a round leaves push_count at
 * zero.
 *
 * @flags is forwarded to the xmit helpers via struct mptcp_sendmsg_info.
 */
void __mptcp_push_pending(struct sock *sk, unsigned int flags)
{
        struct sock *prev_ssk = NULL, *ssk = NULL;
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct mptcp_sendmsg_info info = {
                                .flags = flags,
        };
        bool do_check_data_fin = false;
        int push_count = 1;

        while (mptcp_send_head(sk) && (push_count > 0)) {
                struct mptcp_subflow_context *subflow;
                int ret = 0;

                if (mptcp_sched_get_send(msk))
                        break;

                push_count = 0;

                mptcp_for_each_subflow(msk, subflow) {
                        if (READ_ONCE(subflow->scheduled)) {
                                mptcp_subflow_set_scheduled(subflow, false);

                                prev_ssk = ssk;
                                ssk = mptcp_subflow_tcp_sock(subflow);
                                if (ssk != prev_ssk) {
                                        /* First check. If the ssk has changed since
                                         * the last round, release prev_ssk
                                         */
                                        if (prev_ssk)
                                                mptcp_push_release(prev_ssk, &info);

                                        /* Need to lock the new subflow only if different
                                         * from the previous one, otherwise we are still
                                         * holding the relevant lock
                                         */
                                        lock_sock(ssk);
                                }

                                push_count++;

                                ret = __subflow_push_pending(sk, ssk, &info);
                                if (ret <= 0) {
                                        /* no progress: don't count this subflow,
                                         * unless it returned -EAGAIN while not in
                                         * FIN_WAIT1, FIN_WAIT2 or CLOSE state
                                         */
                                        if (ret != -EAGAIN ||
                                            (1 << ssk->sk_state) &
                                             (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE))
                                                push_count--;
                                        continue;
                                }
                                do_check_data_fin = true;
                        }
                }
        }

        /* at this point we held the socket lock for the last subflow we used */
        if (ssk)
                mptcp_push_release(ssk, &info);

        /* ensure the rtx timer is running */
        if (!mptcp_rtx_timer_pending(sk))
                mptcp_reset_rtx_timer(sk);
        if (do_check_data_fin)
                mptcp_check_send_data_fin(sk);
}

/* Push the msk pending data directly on subflow @ssk.
 *
 * When @first is true the first chunk is pushed on @ssk without
 * consulting the scheduler. Afterwards the scheduler is invoked: data is
 * pushed on @ssk if it is still scheduled, while any other scheduled
 * subflow gets an MPTCP_DELEGATE_SEND action delegated and the loop is
 * stopped.
 *
 * tx skbs are allocated with GFP_ATOMIC (info.data_lock_held is set).
 */
static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct mptcp_sendmsg_info info = {
                .data_lock_held = true,
        };
        bool keep_pushing = true;
        struct sock *xmit_ssk;
        int copied = 0;

        info.flags = 0;
        while (mptcp_send_head(sk) && keep_pushing) {
                struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
                int ret = 0;

                /* check for a different subflow usage only after
                 * spooling the first chunk of data
                 */
                if (first) {
                        mptcp_subflow_set_scheduled(subflow, false);
                        ret = __subflow_push_pending(sk, ssk, &info);
                        first = false;
                        if (ret <= 0)
                                break;
                        copied += ret;
                        continue;
                }

                if (mptcp_sched_get_send(msk))
                        goto out;

                /* the scheduler picked this subflow again: keep pushing on it */
                if (READ_ONCE(subflow->scheduled)) {
                        mptcp_subflow_set_scheduled(subflow, false);
                        ret = __subflow_push_pending(sk, ssk, &info);
                        if (ret <= 0)
                                keep_pushing = false;
                        copied += ret;
                }

                /* delegate the xmit to any other subflow the scheduler selected */
                mptcp_for_each_subflow(msk, subflow) {
                        if (READ_ONCE(subflow->scheduled)) {
                                xmit_ssk = mptcp_subflow_tcp_sock(subflow);
                                if (xmit_ssk != ssk) {
                                        mptcp_subflow_delegate(subflow,
                                                               MPTCP_DELEGATE_SEND);
                                        keep_pushing = false;
                                }
                        }
                }
        }

out:
        /* __mptcp_alloc_tx_skb could have released some wmem and we are
         * not going to flush it via release_sock()
         */
        if (copied) {
                tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
                         info.size_goal);
                if (!mptcp_rtx_timer_pending(sk))
                        mptcp_reset_rtx_timer(sk);

                /* only the DATA_FIN sequence is left unsent: let the worker
                 * take care of it
                 */
                if (msk->snd_data_fin_enable &&
                    msk->snd_nxt + 1 == msk->write_seq)
                        mptcp_schedule_work(sk);
        }
}

/* forward declaration, needed by mptcp_sendmsg_fastopen() */
static int mptcp_disconnect(struct sock *sk, int flags);

/* Handle the fastopen/defer_connect part of sendmsg() by running
 * tcp_sendmsg_fastopen() on the first subflow.
 *
 * @copied_syn is passed to tcp_sendmsg_fastopen() and is zeroed here when
 * the blocking connect fails with an error other than -EINPROGRESS,
 * -ERESTARTSYS or -EINTR.
 *
 * Returns the tcp_sendmsg_fastopen() or __inet_stream_connect() result,
 * a negative error if the first subflow can't be created, or -EINVAL
 * when there is no first subflow.
 */
static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
                                  size_t len, int *copied_syn)
{
        unsigned int saved_flags = msg->msg_flags;
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct sock *ssk;
        int ret;

        /* on flags based fastopen the mptcp is supposed to create the
         * first subflow right now. Otherwise we are in the defer_connect
         * path, and the first subflow must be already present.
         * Since the defer_connect flag is cleared after the first successful
         * fastopen attempt, no need to check for additional subflow status.
         */
        if (msg->msg_flags & MSG_FASTOPEN) {
                ssk = __mptcp_nmpc_sk(msk);
                if (IS_ERR(ssk))
                        return PTR_ERR(ssk);
        }
        if (!msk->first)
                return -EINVAL;

        ssk = msk->first;

        /* force a non blocking attempt while holding the ssk socket lock;
         * the caller flags are restored right after
         */
        lock_sock(ssk);
        msg->msg_flags |= MSG_DONTWAIT;
        msk->fastopening = 1;
        ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL);
        msk->fastopening = 0;
        msg->msg_flags = saved_flags;
        release_sock(ssk);

        /* do the blocking bits of inet_stream_connect outside the ssk socket lock */
        if (ret == -EINPROGRESS && !(msg->msg_flags & MSG_DONTWAIT)) {
                ret = __inet_stream_connect(sk->sk_socket, msg->msg_name,
                                            msg->msg_namelen, msg->msg_flags, 1);

                /* Keep the same behaviour of plain TCP: zero the copied bytes in
                 * case of any error, except timeout or signal
                 */
                if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR)
                        *copied_syn = 0;
        } else if (ret && ret != -EINPROGRESS) {
                /* The disconnect() op called by tcp_sendmsg_fastopen()/
                 * __inet_stream_connect() can fail, due to looking check,
                 * see mptcp_disconnect().
                 * Attempt it again outside the problematic scope.
                 */
                if (!mptcp_disconnect(sk, 0))
                        sk->sk_socket->state = SS_UNCONNECTED;
        }
        /* the deferred connect has been consumed, whatever the outcome */
        inet_clear_bit(DEFER_CONNECT, sk);

        return ret;
}

static int do_copy_data_nocache(struct sock *sk, int copy,
                                struct iov_iter *from, char *to)
{
        if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
                if (!copy_from_iter_full_nocache(to, copy, from))
                        return -EFAULT;
        } else if (!copy_from_iter_full(to, copy, from)) {
                return -EFAULT;
        }
        return 0;
}

/* Compute how many bytes the caller may still queue on the msk.
 *
 * This open-codes sk_stream_memory_free() plus the sent limit computation
 * to avoid indirect calls in fast-path. It is called under the msk socket
 * lock, so a bunch of ONCE annotations can be avoided.
 *
 * Returns 0 when the send buffer is full or the not-sent data already
 * reaches the notsent_lowat limit, UINT_MAX when that limit is UINT_MAX,
 * and the room left below the limit otherwise.
 */
static u32 mptcp_send_limit(const struct sock *sk)
{
        const struct mptcp_sock *msk = mptcp_sk(sk);
        u32 lowat, unsent;

        if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf))
                return 0;

        lowat = mptcp_notsent_lowat(sk);
        if (lowat == UINT_MAX)
                return UINT_MAX;

        unsent = msk->write_seq - msk->snd_nxt;

        return unsent < lowat ? lowat - unsent : 0;
}

/* sendmsg() implementation for the MPTCP-level socket.
 *
 * Copies the payload carried by @msg into mptcp_data_frag entries carved
 * from the socket page frag, links each new fragment at the tail of
 * msk->rtx_queue and finally hands the pending data over to
 * __mptcp_push_pending().
 *
 * @sk:  the msk socket; locked here for the whole call
 * @msg: user message; msg_flags is masked down to the supported flags
 * @len: total length, only forwarded to the fastopen helper; the copy loop
 *       is driven by msg_data_left(@msg)
 *
 * Returns the number of bytes accepted if at least one byte was queued
 * (even when a later step failed), otherwise the value produced by
 * sk_stream_error() for the failure.
 */
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct page_frag *pfrag;
        size_t copied = 0;
        int ret = 0;
        long timeo;

        /* silently ignore everything else */
        msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_FASTOPEN;

        lock_sock(sk);

        /* deferred connect or explicit fastopen: let the helper start the
         * connection; bytes it consumed for the SYN count as copied
         */
        if (unlikely(inet_test_bit(DEFER_CONNECT, sk) ||
                     msg->msg_flags & MSG_FASTOPEN)) {
                int copied_syn = 0;

                ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn);
                copied += copied_syn;
                if (ret == -EINPROGRESS && copied_syn > 0)
                        goto out;
                else if (ret)
                        goto do_error;
        }

        timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

        /* data can be queued only in ESTABLISHED or CLOSE_WAIT state: wait
         * for the connection to complete otherwise
         */
        if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
                ret = sk_stream_wait_connect(sk, &timeo);
                if (ret)
                        goto do_error;
        }

        /* pending socket error or send side already shut down */
        ret = -EPIPE;
        if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)))
                goto do_error;

        pfrag = sk_page_frag(sk);

        while (msg_data_left(msg)) {
                int total_ts, frag_truesize = 0;
                struct mptcp_data_frag *dfrag;
                bool dfrag_collapsed;
                size_t psize, offset;
                u32 copy_limit;

                /* ensure fitting the notsent_lowat() constraint */
                copy_limit = mptcp_send_limit(sk);
                if (!copy_limit)
                        goto wait_for_memory;

                /* reuse tail pfrag, if possible, or carve a new one from the
                 * page allocator
                 */
                dfrag = mptcp_pending_tail(sk);
                dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
                if (!dfrag_collapsed) {
                        if (!mptcp_page_frag_refill(sk, pfrag))
                                goto wait_for_memory;

                        dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset);
                        /* a new fragment is charged for its header overhead too */
                        frag_truesize = dfrag->overhead;
                }

                /* we do not bound vs wspace, to allow a single packet.
                 * memory accounting will prevent excessive memory usage
                 * anyway
                 */
                offset = dfrag->offset + dfrag->data_len;
                psize = pfrag->size - offset;
                psize = min_t(size_t, psize, msg_data_left(msg));
                psize = min_t(size_t, psize, copy_limit);
                total_ts = psize + frag_truesize;

                if (!sk_wmem_schedule(sk, total_ts))
                        goto wait_for_memory;

                ret = do_copy_data_nocache(sk, psize, &msg->msg_iter,
                                           page_address(dfrag->page) + offset);
                if (ret)
                        goto do_error;

                /* data successfully copied into the write queue */
                sk_forward_alloc_add(sk, -total_ts);
                copied += psize;
                dfrag->data_len += psize;
                frag_truesize += psize;
                pfrag->offset += frag_truesize;
                WRITE_ONCE(msk->write_seq, msk->write_seq + psize);

                /* charge data on mptcp pending queue to the msk socket
                 * Note: we charge such data both to sk and ssk
                 */
                sk_wmem_queued_add(sk, frag_truesize);
                if (!dfrag_collapsed) {
                        /* the queued fragment holds its own page reference */
                        get_page(dfrag->page);
                        list_add_tail(&dfrag->list, &msk->rtx_queue);
                        if (!msk->first_pending)
                                WRITE_ONCE(msk->first_pending, dfrag);
                }
                pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d", msk,
                         dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
                         !dfrag_collapsed);

                continue;

                /* reached only via goto: flush what is already queued, then
                 * wait for room; on success the loop condition is evaluated
                 * again
                 */
wait_for_memory:
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                __mptcp_push_pending(sk, msg->msg_flags);
                ret = sk_stream_wait_memory(sk, &timeo);
                if (ret)
                        goto do_error;
        }

        if (copied)
                __mptcp_push_pending(sk, msg->msg_flags);

out:
        release_sock(sk);
        return copied;

do_error:
        /* a partial write is reported as success for the bytes accepted */
        if (copied)
                goto out;

        copied = sk_stream_error(sk, msg->msg_flags, ret);
        goto out;
}

/* Move up to @len bytes from msk->receive_queue into @msg.
 *
 * @flags: MSG_TRUNC skips the actual copy but still accounts the bytes;
 *         MSG_PEEK leaves the skbs and their offsets untouched.
 * @tss / @cmsg_flags: updated with the rx timestamps of every visited skb
 *         carrying one; MPTCP_CMSG_TS is then set in @cmsg_flags.
 *
 * Returns the number of bytes accounted, or the negative error reported by
 * skb_copy_datagram_msg() when it fails before any byte was accounted.
 */
static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
                                struct msghdr *msg,
                                size_t len, int flags,
                                struct scm_timestamping_internal *tss,
                                int *cmsg_flags)
{
        struct sk_buff *skb, *tmp;
        int copied = 0;

        skb_queue_walk_safe(&msk->receive_queue, skb, tmp) {
                /* offset: bytes of this skb already consumed by the user */
                u32 offset = MPTCP_SKB_CB(skb)->offset;
                u32 data_len = skb->len - offset;
                u32 count = min_t(size_t, len - copied, data_len);
                int err;

                if (!(flags & MSG_TRUNC)) {
                        err = skb_copy_datagram_msg(skb, offset, msg, count);
                        if (unlikely(err < 0)) {
                                /* report the error only if nothing was read */
                                if (!copied)
                                        return err;
                                break;
                        }
                }

                if (MPTCP_SKB_CB(skb)->has_rxtstamp) {
                        tcp_update_recv_tstamps(skb, tss);
                        *cmsg_flags |= MPTCP_CMSG_TS;
                }

                copied += count;

                /* partial read: the user buffer is exhausted, remember how
                 * far we got inside this skb unless we are just peeking
                 */
                if (count < data_len) {
                        if (!(flags & MSG_PEEK)) {
                                MPTCP_SKB_CB(skb)->offset += count;
                                MPTCP_SKB_CB(skb)->map_seq += count;
                                msk->bytes_consumed += count;
                        }
                        break;
                }

                /* the skb has been fully consumed: drop it from the queue */
                if (!(flags & MSG_PEEK)) {
                        /* we will bulk release the skb memory later */
                        skb->destructor = NULL;
                        WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize);
                        __skb_unlink(skb, &msk->receive_queue);
                        __kfree_skb(skb);
                        msk->bytes_consumed += count;
                }

                if (copied >= len)
                        break;
        }

        return copied;
}

/* receive buffer autotuning.  See tcp_rcv_space_adjust for more information.
 *
 * Only difference: Use highest rtt estimate of the subflows in use.
 *
 * @msk:    the MPTCP socket, which must be owned by the caller
 * @copied: bytes just delivered to user space; values <= 0 are ignored
 *
 * Accumulates @copied into the current measurement window and, once the
 * window has lasted long enough with respect to the rtt estimate, possibly
 * grows sk_rcvbuf of the msk and of every subflow.
 */
static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
{
        struct mptcp_subflow_context *subflow;
        struct sock *sk = (struct sock *)msk;
        u8 scaling_ratio = U8_MAX;
        u32 time, advmss = 1;
        u64 rtt_us, mstamp;

        msk_owned_by_me(msk);

        if (copied <= 0)
                return;

        if (!msk->rcvspace_init)
                mptcp_rcv_space_init(msk, msk->first);

        msk->rcvq_space.copied += copied;

        /* time elapsed since the current measurement window started */
        mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
        time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);

        /* quick exit against the cached rtt, before walking the subflows.
         * NOTE(review): the '>> 3' suggests rtt_us is stored scaled by 8,
         * as done by TCP for rcv_rtt_est - confirm
         */
        rtt_us = msk->rcvq_space.rtt_us;
        if (rtt_us && time < (rtt_us >> 3))
                return;

        /* refresh the aggregates: highest rtt, highest advmss and lowest
         * scaling ratio among all the subflows
         */
        rtt_us = 0;
        mptcp_for_each_subflow(msk, subflow) {
                const struct tcp_sock *tp;
                u64 sf_rtt_us;
                u32 sf_advmss;

                tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));

                sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
                sf_advmss = READ_ONCE(tp->advmss);

                rtt_us = max(sf_rtt_us, rtt_us);
                advmss = max(sf_advmss, advmss);
                scaling_ratio = min(tp->scaling_ratio, scaling_ratio);
        }

        msk->rcvq_space.rtt_us = rtt_us;
        msk->scaling_ratio = scaling_ratio;
        if (time < (rtt_us >> 3) || rtt_us == 0)
                return;

        /* the application did not read more than in the previous window:
         * no need to grow, just start a new measurement
         */
        if (msk->rcvq_space.copied <= msk->rcvq_space.space)
                goto new_measure;

        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
            !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
                u64 rcvwin, grow;
                int rcvbuf;

                /* target window: twice the bytes copied plus 16 segments */
                rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;

                /* add extra room proportional to the relative increase of
                 * the copied bytes versus the previous window
                 */
                grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);

                do_div(grow, msk->rcvq_space.space);
                rcvwin += (grow << 1);

                /* never exceed the tcp_rmem[2] sysctl */
                rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin),
                               READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));

                if (rcvbuf > sk->sk_rcvbuf) {
                        u32 window_clamp;

                        window_clamp = mptcp_win_from_space(sk, rcvbuf);
                        WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);

                        /* Make subflows follow along.  If we do not do this, we
                         * get drops at subflow level if skbs can't be moved to
                         * the mptcp rx queue fast enough (announced rcv_win can
                         * exceed ssk->sk_rcvbuf).
                         */
                        mptcp_for_each_subflow(msk, subflow) {
                                struct sock *ssk;
                                bool slow;

                                ssk = mptcp_subflow_tcp_sock(subflow);
                                slow = lock_sock_fast(ssk);
                                WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
                                WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp);
                                tcp_cleanup_rbuf(ssk, 1);
                                unlock_sock_fast(ssk, slow);
                        }
                }
        }

        /* the bytes copied in this window become the next reference */
        msk->rcvq_space.space = msk->rcvq_space.copied;
new_measure:
        msk->rcvq_space.copied = 0;
        msk->rcvq_space.time = mstamp;
}

/* Flush the receive memory released so far and not yet accounted.
 *
 * msk->rmem_released bytes are subtracted from sk_rmem_alloc and uncharged
 * from the msk, then the counter is reset. Nothing is done when the counter
 * is already zero.
 */
static void __mptcp_update_rmem(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        if (msk->rmem_released) {
                atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
                mptcp_rmem_uncharge(sk, msk->rmem_released);
                WRITE_ONCE(msk->rmem_released, 0);
        }
}

static void __mptcp_splice_receive_queue(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue);
}

/* Pull pending input data from the subflows and from the msk-level
 * sk_receive_queue / out-of-order queue into msk->receive_queue.
 *
 * Returns true if msk->receive_queue is not empty on exit.
 */
static bool __mptcp_move_skbs(struct mptcp_sock *msk)
{
        struct sock *sk = (struct sock *)msk;
        unsigned int moved = 0;
        bool ret, done;

        do {
                struct sock *ssk = mptcp_subflow_recv_lookup(msk);
                bool slowpath;

                /* we can have data pending in the subflows only if the msk
                 * receive buffer was full at subflow_data_ready() time,
                 * that is an unlikely slow path.
                 */
                if (likely(!ssk))
                        break;

                /* lock order: subflow socket first, then the msk data lock */
                slowpath = lock_sock_fast(ssk);
                mptcp_data_lock(sk);
                __mptcp_update_rmem(sk);
                done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
                mptcp_data_unlock(sk);

                /* propagate a subflow error while the subflow is still locked */
                if (unlikely(ssk->sk_err))
                        __mptcp_error_report(sk);
                unlock_sock_fast(ssk, slowpath);
        } while (!done);

        /* acquire the data lock only if some input data is pending */
        ret = moved > 0;
        if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) ||
            !skb_queue_empty_lockless(&sk->sk_receive_queue)) {
                mptcp_data_lock(sk);
                __mptcp_update_rmem(sk);
                ret |= __mptcp_ofo_queue(msk);
                __mptcp_splice_receive_queue(sk);
                mptcp_data_unlock(sk);
        }
        /* new data moved: check whether a DATA_FIN must be processed */
        if (ret)
                mptcp_check_data_fin((struct sock *)msk);
        return !skb_queue_empty(&msk->receive_queue);
}

/* Compute the "in queue" hint for the msk socket.
 *
 * With data in msk->receive_queue the hint is the distance between the
 * current ack_seq and the map_seq of the first queued skb, capped at
 * INT_MAX. With an empty queue the hint is 1 if the socket is closed or the
 * receive side is shut down, 0 otherwise.
 */
static unsigned int mptcp_inq_hint(const struct sock *sk)
{
        const struct mptcp_sock *msk = mptcp_sk(sk);
        const struct sk_buff *head = skb_peek(&msk->receive_queue);
        u64 pending;

        if (!head) {
                if (sk->sk_state == TCP_CLOSE ||
                    (sk->sk_shutdown & RCV_SHUTDOWN))
                        return 1;

                return 0;
        }

        pending = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(head)->map_seq;
        if (pending >= INT_MAX)
                return INT_MAX;

        return (unsigned int)pending;
}

/* recvmsg() implementation for the MPTCP-level socket.
 *
 * Repeatedly drains msk->receive_queue into @msg, refilling it from the
 * subflows when it gets empty, and blocks according to the receive timeout
 * until at least the low-water-mark target has been read or an exit
 * condition is hit.
 *
 * @flags:    MSG_ERRQUEUE is served by inet_recv_error() without taking the
 *            socket lock; MSG_PEEK additionally skips the receive buffer
 *            autotuning.
 * @addr_len: only used on the MSG_ERRQUEUE path.
 *
 * Returns the number of bytes read, or a negative error when nothing was
 * read.
 */
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                         int flags, int *addr_len)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct scm_timestamping_internal tss;
        int copied = 0, cmsg_flags = 0;
        int target;
        long timeo;

        /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */
        if (unlikely(flags & MSG_ERRQUEUE))
                return inet_recv_error(sk, msg, len, addr_len);

        lock_sock(sk);
        if (unlikely(sk->sk_state == TCP_LISTEN)) {
                copied = -ENOTCONN;
                goto out_err;
        }

        timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

        /* the byte counters below are plain int: clamp the request */
        len = min_t(size_t, len, INT_MAX);
        target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

        if (unlikely(msk->recvmsg_inq))
                cmsg_flags = MPTCP_CMSG_INQ;

        while (copied < len) {
                int bytes_read;

                bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags);
                if (unlikely(bytes_read < 0)) {
                        /* report the error only if nothing was read so far */
                        if (!copied)
                                copied = bytes_read;
                        goto out_err;
                }

                copied += bytes_read;

                /* be sure to advertise window change */
                mptcp_cleanup_rbuf(msk);

                /* queue drained: try to refill it before evaluating the
                 * exit conditions
                 */
                if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk))
                        continue;

                /* only the MPTCP socket status is relevant here. The exit
                 * conditions mirror closely tcp_recvmsg()
                 */
                if (copied >= target)
                        break;

                if (copied) {
                        /* some data already read: any of these conditions
                         * ends the call returning the partial count
                         */
                        if (sk->sk_err ||
                            sk->sk_state == TCP_CLOSE ||
                            (sk->sk_shutdown & RCV_SHUTDOWN) ||
                            !timeo ||
                            signal_pending(current))
                                break;
                } else {
                        if (sk->sk_err) {
                                copied = sock_error(sk);
                                break;
                        }

                        if (sk->sk_shutdown & RCV_SHUTDOWN) {
                                /* race breaker: the shutdown could be after the
                                 * previous receive queue check
                                 */
                                if (__mptcp_move_skbs(msk))
                                        continue;
                                break;
                        }

                        if (sk->sk_state == TCP_CLOSE) {
                                copied = -ENOTCONN;
                                break;
                        }

                        if (!timeo) {
                                copied = -EAGAIN;
                                break;
                        }

                        if (signal_pending(current)) {
                                copied = sock_intr_errno(timeo);
                                break;
                        }
                }

                pr_debug("block timeout %ld", timeo);
                sk_wait_data(sk, &timeo, NULL);
        }

out_err:
        /* emit the collected control messages only when no error is returned */
        if (cmsg_flags && copied >= 0) {
                if (cmsg_flags & MPTCP_CMSG_TS)
                        tcp_recv_timestamp(msg, sk, &tss);

                if (cmsg_flags & MPTCP_CMSG_INQ) {
                        unsigned int inq = mptcp_inq_hint(sk);

                        put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
                }
        }

        pr_debug("msk=%p rx queue empty=%d:%d copied=%d",
                 msk, skb_queue_empty_lockless(&sk->sk_receive_queue),
                 skb_queue_empty(&msk->receive_queue), copied);
        /* peeked data is not consumed; the callee ignores copied <= 0 */
        if (!(flags & MSG_PEEK))
                mptcp_rcv_space_adjust(msk, copied);

        release_sock(sk);
        return copied;
}

/* Callback of the msk-level retransmit timer.
 *
 * Retransmission itself is never performed here: when the socket is owned by
 * a user context the MPTCP_RETRANSMIT callback flag is raised, otherwise the
 * MPTCP_WORK_RTX bit is set and the worker is scheduled (only if the bit was
 * not already pending). A socket reference is dropped before returning.
 */
static void mptcp_retransmit_timer(struct timer_list *t)
{
        struct inet_connection_sock *icsk;
        struct mptcp_sock *msk;
        struct sock *sk;

        icsk = from_timer(icsk, t, icsk_retransmit_timer);
        sk = &icsk->icsk_inet.sk;
        msk = mptcp_sk(sk);

        bh_lock_sock(sk);
        if (sock_owned_by_user(sk)) {
                /* the lock owner will handle it via the release callback */
                __set_bit(MPTCP_RETRANSMIT, &msk->cb_flags);
        } else if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags)) {
                /* retransmitting requires process context */
                mptcp_schedule_work(sk);
        }
        bh_unlock_sock(sk);

        sock_put(sk);
}

/* Callback of the msk timeout timer (sk_timer): all the work is deferred to
 * the MPTCP worker, then a socket reference is dropped.
 */
static void mptcp_tout_timer(struct timer_list *t)
{
        struct sock *sk;

        sk = from_timer(sk, t, sk_timer);

        mptcp_schedule_work(sk);
        sock_put(sk);
}

/* Select the subflow to be used for an MPTCP-level retransmission.
 *
 * Only active subflows with no data outstanding at TCP level are eligible.
 * The first eligible non-backup subflow wins. A backup subflow is returned
 * only when no regular one is eligible and none of the busy subflows is
 * still making progress; NULL otherwise.
 *
 * The whole list is always walked, so that the stale check runs on every
 * busy subflow.
 */
struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
{
        struct sock *pick = NULL, *backup = NULL;
        struct mptcp_subflow_context *subflow;
        int min_stale_count = INT_MAX;

        mptcp_for_each_subflow(msk, subflow) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

                if (!__mptcp_subflow_active(subflow))
                        continue;

                if (!tcp_rtx_and_write_queues_empty(ssk)) {
                        /* busy at TCP level: not eligible, just track staleness */
                        mptcp_pm_subflow_chk_stale(msk, ssk);
                        min_stale_count = min_t(int, min_stale_count,
                                                subflow->stale_count);
                        continue;
                }

                /* remember only the first candidate of each kind */
                if (subflow->backup) {
                        if (!backup)
                                backup = ssk;
                } else if (!pick) {
                        pick = ssk;
                }
        }

        /* use backup only if there are no progresses anywhere */
        if (!pick && min_stale_count > 1)
                pick = backup;

        return pick;
}

/* Re-inject the whole MPTCP-level rtx queue for transmission.
 *
 * Used when a subflow goes away while some data in the rtx queue is still
 * untransmitted and/or unacked: rather than tracking what was really sent,
 * the entire queue is scheduled again.
 *
 * Returns false on fallback sockets or when the rtx queue is empty after
 * cleaning the acked data, true when the queue has been re-armed.
 */
bool __mptcp_retransmit_pending_data(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct mptcp_data_frag *frag, *head;

        if (__mptcp_check_fallback(msk))
                return false;

        mptcp_data_lock(sk);
        __mptcp_clean_una_wakeup(sk);
        head = mptcp_rtx_head(sk);
        if (head) {
                /* enter recovery, remembering where regular xmit stopped */
                msk->recovery_snd_nxt = msk->snd_nxt;
                msk->recovery = true;
        }
        mptcp_data_unlock(sk);

        if (!head)
                return false;

        /* restart the transmission from the oldest queued fragment */
        msk->first_pending = head;
        msk->snd_burst = 0;

        /* be sure to clear the "sent status" on all re-injected fragments */
        list_for_each_entry(frag, &msk->rtx_queue, list) {
                if (!frag->already_sent)
                        break;
                frag->already_sent = 0;
        }

        return true;
}

/* flags for __mptcp_close_ssk()
 *
 * MPTCP_CF_PUSH: re-inject the msk-level rtx queue via
 *      __mptcp_retransmit_pending_data() and, if anything was re-armed,
 *      push it once the subflow has been dealt with.
 * MPTCP_CF_FASTCLOSE: on non-fallback sockets force a zero linger time and
 *      mark the subflow with send_fastclose; a subflow that is kept around
 *      is disconnected regardless of its state.
 */
#define MPTCP_CF_PUSH                BIT(1)
#define MPTCP_CF_FASTCLOSE        BIT(2)

/* be sure to send a reset only if the caller asked for it, also
 * clean completely the subflow status when the subflow reaches
 * TCP_CLOSE state
 */
static void __mptcp_subflow_disconnect(struct sock *ssk,
                                       struct mptcp_subflow_context *subflow,
                                       unsigned int flags)
{
        if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
            (flags & MPTCP_CF_FASTCLOSE)) {
                /* The MPTCP code never wait on the subflow sockets, TCP-level
                 * disconnect should never fail
                 */
                WARN_ON_ONCE(tcp_disconnect(ssk, 0));
                mptcp_subflow_ctx_reset(subflow);
        } else {
                tcp_shutdown(ssk, SEND_SHUTDOWN);
        }
}

/* subflow sockets can be either outgoing (connect) or incoming
 * (accept).
 *
 * Outgoing subflows use in-kernel sockets.
 * Incoming subflows do not have their own 'struct socket' allocated,
 * so we need to use tcp_close() after detaching them from the mptcp
 * parent socket.
 *
 * @sk:      the msk socket
 * @ssk:     the subflow socket being closed; locked and released here
 * @subflow: the subflow context bound to @ssk
 * @flags:   combination of the MPTCP_CF_* flags
 *
 * The first subflow is only disconnected and stays attached to the msk
 * unless msk->free_first is set; every other subflow is removed from the
 * subflow list and disposed.
 */
static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
                              struct mptcp_subflow_context *subflow,
                              unsigned int flags)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        bool dispose_it, need_push = false;

        /* If the first subflow moved to a close state before accept, e.g. due
         * to an incoming reset or listener shutdown, the subflow socket is
         * already deleted by inet_child_forget() and the mptcp socket can't
         * survive too.
         */
        if (msk->in_accept_queue && msk->first == ssk &&
            (sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) {
                /* ensure later check in mptcp_worker() will dispose the msk */
                sock_set_flag(sk, SOCK_DEAD);
                /* back-date the close timestamp so the timeout is already expired */
                mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1));
                lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
                mptcp_subflow_drop_ctx(ssk);
                goto out_release;
        }

        dispose_it = msk->free_first || ssk != msk->first;
        if (dispose_it)
                list_del(&subflow->node);

        lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);

        if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) {
                /* be sure to force the tcp_close path
                 * to generate the egress reset
                 */
                ssk->sk_lingertime = 0;
                sock_set_flag(ssk, SOCK_LINGER);
                subflow->send_fastclose = 1;
        }

        need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
        if (!dispose_it) {
                /* the subflow stays attached to the msk: disconnect or shut
                 * it down and skip the disposal steps below
                 */
                __mptcp_subflow_disconnect(ssk, subflow, flags);
                release_sock(ssk);

                goto out;
        }

        subflow->disposable = 1;

        /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops
         * the ssk has been already destroyed, we just need to release the
         * reference owned by msk;
         */
        if (!inet_csk(ssk)->icsk_ulp_ops) {
                WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD));
                kfree_rcu(subflow, rcu);
        } else {
                /* otherwise tcp will dispose of the ssk and subflow ctx */
                __tcp_close(ssk, 0);

                /* close acquired an extra ref */
                __sock_put(ssk);
        }

out_release:
        __mptcp_subflow_error_report(sk, ssk);
        release_sock(ssk);

        sock_put(ssk);

        /* the first subflow is gone: clear the msk pointer to it */
        if (ssk == msk->first)
                WRITE_ONCE(msk->first, NULL);

out:
        __mptcp_sync_sndbuf(sk);
        if (need_push)
                __mptcp_push_pending(sk, 0);

        /* Catch every 'all subflows closed' scenario, including peers silently
         * closing them, e.g. due to timeout.
         * For established sockets, allow an additional timeout before closing,
         * as the protocol can still create more subflows.
         */
        if (list_is_singular(&msk->conn_list) && msk->first &&
            inet_sk_state_load(msk->first) == TCP_CLOSE) {
                if (sk->sk_state != TCP_ESTABLISHED ||
                    msk->in_accept_queue || sock_flag(sk, SOCK_DEAD)) {
                        mptcp_set_state(sk, TCP_CLOSE);
                        mptcp_close_wake_up(sk);
                } else {
                        mptcp_start_tout_timer(sk);
                }
        }
}

/* Close the subflow @ssk belonging to the msk @sk.
 *
 * While the msk is established a SUB_CLOSED event is emitted first; the path
 * manager is then given the chance to create the next subflow, which matters
 * when this one aborted before becoming fully established. The actual close
 * re-injects the pending data (MPTCP_CF_PUSH).
 */
void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
                     struct mptcp_subflow_context *subflow)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        if (sk->sk_state == TCP_ESTABLISHED)
                mptcp_event(MPTCP_EVENT_SUB_CLOSED, msk, ssk, GFP_KERNEL);

        mptcp_pm_subflow_check_next(msk, subflow);

        __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH);
}

/* MSS synchronization stub: @sk and @pmtu are ignored and 0 is always
 * returned.
 * NOTE(review): presumably registered as a callback that the msk socket has
 * to provide even if there is nothing to sync at MPTCP level - confirm at
 * the registration site.
 */
static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
{
        return 0;
}

/* Close every subflow that reached the TCP_CLOSE state and has no data left
 * in its receive queue. May sleep.
 *
 * Closed subflows still holding received data are skipped on purpose:
 * 'subflow_data_ready' will re-sched once rx queue is empty.
 */
static void __mptcp_close_subflow(struct sock *sk)
{
        struct mptcp_subflow_context *subflow, *next;
        struct mptcp_sock *msk = mptcp_sk(sk);

        might_sleep();

        mptcp_for_each_subflow_safe(msk, subflow, next) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

                if (inet_sk_state_load(ssk) == TCP_CLOSE &&
                    skb_queue_empty_lockless(&ssk->sk_receive_queue))
                        mptcp_close_ssk(sk, ssk, subflow);
        }
}

static bool mptcp_close_tout_expired(const struct sock *sk)
{
        if (!inet_csk(sk)->icsk_mtup.probe_timestamp ||
            sk->sk_state == TCP_CLOSE)
                return false;

        return time_after32(tcp_jiffies32,
                  inet_csk(sk)->icsk_mtup.probe_timestamp + mptcp_close_timeout(sk));
}

/* Handle a fastclose received from the peer, if any.
 *
 * Does nothing unless msk->rcv_fastclose is set. Otherwise the token is
 * destroyed, every subflow not yet closed sends an active reset and is moved
 * to TCP_CLOSE, and the msk itself is moved to TCP_CLOSE with both
 * directions shut down and an error code depending on its previous state.
 */
static void mptcp_check_fastclose(struct mptcp_sock *msk)
{
        struct mptcp_subflow_context *subflow, *tmp;
        struct sock *sk = (struct sock *)msk;

        if (likely(!READ_ONCE(msk->rcv_fastclose)))
                return;

        mptcp_token_destroy(msk);

        mptcp_for_each_subflow_safe(msk, subflow, tmp) {
                struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
                bool slow;

                slow = lock_sock_fast(tcp_sk);
                if (tcp_sk->sk_state != TCP_CLOSE) {
                        mptcp_send_active_reset_reason(tcp_sk);
                        tcp_set_state(tcp_sk, TCP_CLOSE);
                }
                unlock_sock_fast(tcp_sk, slow);
        }

        /* Mirror the tcp_reset() error propagation */
        switch (sk->sk_state) {
        case TCP_SYN_SENT:
                WRITE_ONCE(sk->sk_err, ECONNREFUSED);
                break;
        case TCP_CLOSE_WAIT:
                WRITE_ONCE(sk->sk_err, EPIPE);
                break;
        case TCP_CLOSE:
                /* already closed: nothing left to update or notify */
                return;
        default:
                WRITE_ONCE(sk->sk_err, ECONNRESET);
        }

        mptcp_set_state(sk, TCP_CLOSE);
        WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
        smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
        set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags);

        /* the calling mptcp_worker will properly destroy the socket */
        if (sock_flag(sk, SOCK_DEAD))
                return;

        /* wake up whoever waits on the state change and report the error */
        sk->sk_state_change(sk);
        sk_error_report(sk);
}

/* MPTCP-level retransmission.
 *
 * Retransmits the head of the rtx queue on every subflow picked by the
 * scheduler, then re-arms the rtx timer if it is not pending. When the rtx
 * queue is empty but a DATA_FIN is enabled, an ack is sent and the data-fin
 * timeout is updated instead.
 */
static void __mptcp_retrans(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct mptcp_subflow_context *subflow;
        struct mptcp_sendmsg_info info = {};
        struct mptcp_data_frag *dfrag;
        struct sock *ssk;
        int ret, err;
        u16 len = 0;

        mptcp_clean_una_wakeup(sk);

        /* first check ssk: need to kick "stale" logic */
        err = mptcp_sched_get_retrans(msk);
        dfrag = mptcp_rtx_head(sk);
        if (!dfrag) {
                if (mptcp_data_fin_enabled(msk)) {
                        struct inet_connection_sock *icsk = inet_csk(sk);

                        icsk->icsk_retransmits++;
                        mptcp_set_datafin_timeout(sk);
                        mptcp_send_ack(msk);

                        goto reset_timer;
                }

                /* nothing to retransmit and nothing waiting to be sent:
                 * leave the timer alone
                 */
                if (!mptcp_send_head(sk))
                        return;

                goto reset_timer;
        }

        /* the scheduler could not pick any subflow: retry on next timeout */
        if (err)
                goto reset_timer;

        mptcp_for_each_subflow(msk, subflow) {
                if (READ_ONCE(subflow->scheduled)) {
                        u16 copied = 0;

                        mptcp_subflow_set_scheduled(subflow, false);

                        ssk = mptcp_subflow_tcp_sock(subflow);

                        lock_sock(ssk);

                        /* limit retransmission to the bytes already sent on some subflows */
                        info.sent = 0;
                        info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len :
                                                                    dfrag->already_sent;
                        while (info.sent < info.limit) {
                                ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
                                if (ret <= 0)
                                        break;

                                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
                                copied += ret;
                                info.sent += ret;
                        }
                        if (copied) {
                                /* track the longest run sent on any subflow */
                                len = max(copied, len);
                                tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
                                         info.size_goal);
                                WRITE_ONCE(msk->allow_infinite_fallback, false);
                        }

                        release_sock(ssk);
                }
        }

        msk->bytes_retrans += len;
        dfrag->already_sent = max(dfrag->already_sent, len);

reset_timer:
        mptcp_check_and_set_pending(sk);

        if (!mptcp_rtx_timer_pending(sk))
                mptcp_reset_rtx_timer(sk);
}

/* schedule the timeout timer for the relevant event: either close timeout
 * or mp_fail timeout. The close timeout takes precedence on the mp_fail one
 *
 * @msk:       the MPTCP socket owning the timer
 * @fail_tout: expiration of the mp_fail timeout, expressed in jiffies;
 *             0 when no mp_fail timeout is armed
 *
 * Nothing is scheduled when neither timeout is active.
 */
void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout)
{
        struct sock *sk = (struct sock *)msk;
        unsigned long timeout, close_timeout;

        if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp)
                return;

        /* Translate the 32-bit close timestamp into the 'jiffies' domain.
         * The timestamp must be widened before the subtraction: with a
         * timestamp in the past, a 32-bit difference wraps to a huge positive
         * value which, once added to the wider 'jiffies', pushes the expiration
         * far in the future instead of yielding a negative delta.
         */
        close_timeout = (unsigned long)inet_csk(sk)->icsk_mtup.probe_timestamp -
                        tcp_jiffies32 + jiffies + mptcp_close_timeout(sk);

        /* the close timeout takes precedence on the fail one, and here at least one of
         * them is active
         */
        timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout;

        sk_reset_timer(sk, &sk->sk_timer, timeout);
}

/* The mp_fail timeout fired: reset the first subflow, if still present, and
 * clear its fail_tout so that the timeout is no longer armed.
 */
static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
{
        struct sock *first = msk->first;
        bool slow;

        if (first) {
                pr_debug("MP_FAIL doesn't respond, reset the subflow");

                slow = lock_sock_fast(first);
                mptcp_subflow_reset(first);
                WRITE_ONCE(mptcp_subflow_ctx(first)->fail_tout, 0);
                unlock_sock_fast(first, slow);
        }
}

static void mptcp_do_fastclose(struct sock *sk)
{
        struct mptcp_subflow_context *subflow, *tmp;
        struct mptcp_sock *msk = mptcp_sk(sk);

        mptcp_set_state(sk, TCP_CLOSE);
        mptcp_for_each_subflow_safe(msk, subflow, tmp)
                __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow),
                                  subflow, MPTCP_CF_FASTCLOSE);
}

/* msk work item: performs the deferred MPTCP-level housekeeping under the msk
 * socket lock and drops one msk reference before returning.
 */
static void mptcp_worker(struct work_struct *work)
{
        struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
        struct sock *sk = (struct sock *)msk;
        unsigned long fail_tout;
        int state;

        lock_sock(sk);
        state = sk->sk_state;
        /* nothing to do for closed or listening sockets */
        if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN)))
                goto unlock;

        mptcp_check_fastclose(msk);

        mptcp_pm_nl_work(msk);

        mptcp_check_send_data_fin(sk);
        mptcp_check_data_fin_ack(sk);
        mptcp_check_data_fin(sk);

        if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
                __mptcp_close_subflow(sk);

        /* close timeout expired: fastclose the subflows and notify waiters */
        if (mptcp_close_tout_expired(sk)) {
                mptcp_do_fastclose(sk);
                mptcp_close_wake_up(sk);
        }

        /* orphaned and closed msk: dispose of it, nothing else can be done */
        if (sock_flag(sk, SOCK_DEAD) && sk->sk_state == TCP_CLOSE) {
                __mptcp_destroy_sock(sk);
                goto unlock;
        }

        if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
                __mptcp_retrans(sk);

        /* fail_tout is read without holding the first subflow lock, hence
         * READ_ONCE(); a zero value means no fail timeout is pending
         */
        fail_tout = msk->first ? READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0;
        if (fail_tout && time_after(jiffies, fail_tout))
                mptcp_mp_fail_no_response(msk);

unlock:
        release_sock(sk);
        sock_put(sk);
}

/* Initialize the MPTCP-specific part of a socket: lists, queues, worker,
 * default counters and the two timers. Called by mptcp_init_sock() and by
 * mptcp_sk_clone_init() for cloned sockets.
 */
static void __mptcp_init_sock(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        INIT_LIST_HEAD(&msk->conn_list);
        INIT_LIST_HEAD(&msk->join_list);
        INIT_LIST_HEAD(&msk->rtx_queue);
        INIT_WORK(&msk->work, mptcp_worker);
        __skb_queue_head_init(&msk->receive_queue);
        msk->out_of_order_queue = RB_ROOT;
        msk->first_pending = NULL;
        WRITE_ONCE(msk->rmem_fwd_alloc, 0);
        WRITE_ONCE(msk->rmem_released, 0);
        msk->timer_ival = TCP_RTO_MIN;
        msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;

        /* no subflow is attached yet */
        WRITE_ONCE(msk->first, NULL);
        inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
        WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
        WRITE_ONCE(msk->allow_infinite_fallback, true);
        msk->recovery = false;
        msk->subflow_id = 1;
        /* start all the activity timestamps from "now" */
        msk->last_data_sent = tcp_jiffies32;
        msk->last_data_recv = tcp_jiffies32;
        msk->last_ack_recv = tcp_jiffies32;

        mptcp_pm_data_init(msk);

        /* re-use the csk retrans timer for MPTCP-level retrans */
        timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
        /* sk_timer drives the close/mp_fail timeout, see mptcp_reset_tout_timer() */
        timer_setup(&sk->sk_timer, mptcp_tout_timer, 0);
}

static void mptcp_ca_reset(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        tcp_assign_congestion_control(sk);
        strscpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name,
                sizeof(mptcp_sk(sk)->ca_name));

        /* no need to keep a reference to the ops, the name will suffice */
        tcp_cleanup_congestion_control(sk);
        icsk->icsk_ca_ops = NULL;
}

/* Initialize a newly created MPTCP socket.
 *
 * Returns 0 on success, -ENOPROTOOPT when MPTCP is disabled in the socket's
 * netns, -ENOMEM when the MIB cannot be allocated, or the error reported by
 * mptcp_init_sched().
 */
static int mptcp_init_sock(struct sock *sk)
{
        struct net *net = sock_net(sk);
        int err;

        __mptcp_init_sock(sk);

        if (!mptcp_is_enabled(net))
                return -ENOPROTOOPT;

        /* the per-netns MIB is allocated on first use */
        if (unlikely(!net->mib.mptcp_statistics)) {
                if (!mptcp_mib_alloc(net))
                        return -ENOMEM;
        }

        err = mptcp_init_sched(mptcp_sk(sk),
                               mptcp_sched_find(mptcp_get_scheduler(net)));
        if (err)
                return err;

        set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);

        /* the ca name is fetched here rather than in __mptcp_init_sock(), so
         * that cloned sockets propagate the correct value
         */
        mptcp_ca_reset(sk);

        sk_sockets_allocated_inc(sk);

        /* start from the netns default TCP buffer sizes */
        sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]);
        sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]);

        return 0;
}

static void __mptcp_clear_xmit(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct mptcp_data_frag *dtmp, *dfrag;

        WRITE_ONCE(msk->first_pending, NULL);
        list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
                dfrag_clear(sk, dfrag);
}

/* Synchronously cancel the msk worker; when a pending work item was actually
 * cancelled, drop the msk reference that the worker would have released.
 */
void mptcp_cancel_work(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        /* nothing was pending: no reference to give back */
        if (!cancel_work_sync(&msk->work))
                return;

        __sock_put(sk);
}

/* Shut down subflow @ssk on behalf of the msk @sk, according to @how
 * (RCV_SHUTDOWN/SEND_SHUTDOWN bits). Takes and releases the subflow socket
 * lock.
 */
void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
{
        lock_sock(ssk);

        switch (ssk->sk_state) {
        case TCP_LISTEN:
                /* a listener is only torn down on receive shutdown */
                if (!(how & RCV_SHUTDOWN))
                        break;
                fallthrough;
        case TCP_SYN_SENT:
                WARN_ON_ONCE(tcp_disconnect(ssk, O_NONBLOCK));
                break;
        default:
                if (__mptcp_check_fallback(mptcp_sk(sk))) {
                        /* newline-terminated so the message is emitted as a
                         * complete log line
                         */
                        pr_debug("Fallback\n");
                        ssk->sk_shutdown |= how;
                        tcp_shutdown(ssk, how);

                        /* simulate the data_fin ack reception to let the state
                         * machine move forward
                         */
                        WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt);
                        mptcp_schedule_work(sk);
                } else {
                        pr_debug("Sending DATA_FIN on subflow %p\n", ssk);
                        tcp_send_ack(ssk);
                        /* make sure the MPTCP-level rtx timer is running */
                        if (!mptcp_rtx_timer_pending(sk))
                                mptcp_reset_rtx_timer(sk);
                }
                break;
        }

        release_sock(ssk);
}

/* Store the new msk state, keeping the MPTCP_MIB_CURRESTAB counter in sync
 * with the transition being performed.
 */
void mptcp_set_state(struct sock *sk, int state)
{
        int prev = sk->sk_state;

        if (state == TCP_ESTABLISHED) {
                /* account the connection only when actually entering the state */
                if (prev != TCP_ESTABLISHED)
                        MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB);
        } else if (state != TCP_CLOSE_WAIT) {
                /* Moving to TCP_CLOSE_WAIT never touches the counter: unlike
                 * TCP, MPTCP sk would not have the TCP_SYN_RECV state, as
                 * MPTCP "accepted" sockets are created later on, so there is
                 * no TCP_SYN_RECV to TCP_CLOSE_WAIT transition to account.
                 * Any other target state drops the count if the msk was
                 * accounted so far.
                 */
                if (prev == TCP_ESTABLISHED || prev == TCP_CLOSE_WAIT)
                        MPTCP_DEC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB);
        }

        inet_sk_state_store(sk, state);
}

/* Close-time state transition table, indexed by the current msk state. Each
 * entry holds the next state, optionally OR-ed with TCP_ACTION_FIN;
 * mptcp_close_state() splits the two with TCP_STATE_MASK and TCP_ACTION_FIN.
 */
static const unsigned char new_state[16] = {
        /* current state:     new state:      action:        */
        [0 /* (Invalid) */] = TCP_CLOSE,
        [TCP_ESTABLISHED]   = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
        [TCP_SYN_SENT]      = TCP_CLOSE,
        [TCP_SYN_RECV]      = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
        [TCP_FIN_WAIT1]     = TCP_FIN_WAIT1,
        [TCP_FIN_WAIT2]     = TCP_FIN_WAIT2,
        [TCP_TIME_WAIT]     = TCP_CLOSE,        /* should not happen ! */
        [TCP_CLOSE]         = TCP_CLOSE,
        [TCP_CLOSE_WAIT]    = TCP_LAST_ACK  | TCP_ACTION_FIN,
        [TCP_LAST_ACK]      = TCP_LAST_ACK,
        [TCP_LISTEN]        = TCP_CLOSE,
        [TCP_CLOSING]       = TCP_CLOSING,
        [TCP_NEW_SYN_RECV]  = TCP_CLOSE,        /* should not happen ! */
};

/* Advance the msk to the state the new_state[] table prescribes for a close
 * request. Returns non-zero (TCP_ACTION_FIN) when the table entry carries the
 * TCP_ACTION_FIN flag.
 */
static int mptcp_close_state(struct sock *sk)
{
        int entry = (int)new_state[sk->sk_state];

        mptcp_set_state(sk, entry & TCP_STATE_MASK);

        return entry & TCP_ACTION_FIN;
}

/* Once all the queued data has been pushed and a DATA_FIN is requested,
 * advance snd_nxt over the DATA_FIN and shut down the send side of every
 * subflow.
 */
static void mptcp_check_send_data_fin(struct sock *sk)
{
        struct mptcp_subflow_context *subflow;
        struct mptcp_sock *msk = mptcp_sk(sk);

        /* newline-terminated so the message is emitted as a complete log line */
        pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu\n",
                 msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk),
                 msk->snd_nxt, msk->write_seq);

        /* we still need to enqueue subflows or not really shutting down,
         * skip this
         */
        if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq ||
            mptcp_send_head(sk))
                return;

        WRITE_ONCE(msk->snd_nxt, msk->write_seq);

        mptcp_for_each_subflow(msk, subflow) {
                struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);

                mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN);
        }
}

/* Start the MPTCP-level write shutdown: reserve one sequence number for the
 * DATA_FIN, flag it as requested and try to send it right away.
 */
static void __mptcp_wr_shutdown(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        /* newline-terminated so the message is emitted as a complete log line */
        pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d\n",
                 msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state,
                 !!mptcp_send_head(sk));

        /* will be ignored by fallback sockets */
        WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
        WRITE_ONCE(msk->snd_data_fin_enable, 1);

        mptcp_check_send_data_fin(sk);
}

/* Final msk teardown: stop the timers, release the scheduler, run the
 * protocol destroy hook, purge the queues and drop one msk reference.
 * May sleep.
 */
static void __mptcp_destroy_sock(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        /* newline-terminated so the message is emitted as a complete log line */
        pr_debug("msk=%p\n", msk);

        might_sleep();

        mptcp_stop_rtx_timer(sk);
        sk_stop_timer(sk, &sk->sk_timer);
        msk->pm.status = 0;
        mptcp_release_sched(msk);

        sk->sk_prot->destroy(sk);

        /* the destroy hook is expected to leave no rx memory accounted */
        WARN_ON_ONCE(READ_ONCE(msk->rmem_fwd_alloc));
        WARN_ON_ONCE(msk->rmem_released);
        sk_stream_kill_queues(sk);
        xfrm_sk_free_policy(sk);

        sock_put(sk);
}

/* Forcibly dispose of an msk: mark it dead, fastclose all its subflows and
 * destroy it. NOTE(review): the name suggests this is meant for msk sockets
 * that were never accepted - confirm against the callers.
 */
void __mptcp_unaccepted_force_close(struct sock *sk)
{
        sock_set_flag(sk, SOCK_DEAD);
        mptcp_do_fastclose(sk);
        __mptcp_destroy_sock(sk);
}

/* Poll helper: report EPOLLIN | EPOLLRDNORM when mptcp_epollin_ready() says
 * the msk is readable, no event otherwise.
 */
static __poll_t mptcp_check_readable(struct sock *sk)
{
        if (!mptcp_epollin_ready(sk))
                return 0;

        return EPOLLIN | EPOLLRDNORM;
}

/* If the msk is listening, stop the listener on its first subflow: drop the
 * protocol in-use count, close the subflow, clean its accept queue and emit
 * the MPTCP_EVENT_LISTENER_CLOSED event. No-op for non-listening sockets.
 */
static void mptcp_check_listen_stop(struct sock *sk)
{
        struct sock *ssk;

        if (inet_sk_state_load(sk) != TCP_LISTEN)
                return;

        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
        ssk = mptcp_sk(sk)->first;
        /* a listening msk is expected to always have a listening first subflow */
        if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN))
                return;

        /* nested lock annotation: presumably the caller already holds the msk
         * socket lock - confirm against the callers
         */
        lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
        tcp_set_state(ssk, TCP_CLOSE);
        mptcp_subflow_queue_clean(sk, ssk);
        inet_csk_listen_stop(ssk);
        mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
        release_sock(ssk);
}

/* Close the msk: shut it down (gracefully or via fastclose), orphan all the
 * subflows and either destroy the msk right away or arm the close timeout.
 *
 * @timeout: how long to wait for the close to complete; a negative value
 *           requests a fastclose.
 *
 * Returns true when the msk has been destroyed here, in which case
 * mptcp_close() cancels the msk worker after releasing the socket lock.
 */
bool __mptcp_close(struct sock *sk, long timeout)
{
        struct mptcp_subflow_context *subflow;
        struct mptcp_sock *msk = mptcp_sk(sk);
        bool do_cancel_work = false;
        int subflows_alive = 0;

        WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);

        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
                mptcp_check_listen_stop(sk);
                mptcp_set_state(sk, TCP_CLOSE);
                goto cleanup;
        }

        if (mptcp_data_avail(msk) || timeout < 0) {
                /* If the msk has read data, or the caller explicitly ask it,
                 * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose
                 */
                mptcp_do_fastclose(sk);
                timeout = 0;
        } else if (mptcp_close_state(sk)) {
                __mptcp_wr_shutdown(sk);
        }

        sk_stream_wait_close(sk, timeout);

cleanup:
        /* orphan all the subflows */
        mptcp_for_each_subflow(msk, subflow) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
                bool slow = lock_sock_fast_nested(ssk);

                subflows_alive += ssk->sk_state != TCP_CLOSE;

                /* since the close timeout takes precedence on the fail one,
                 * cancel the latter; WRITE_ONCE() pairs with the lockless
                 * READ_ONCE() in mptcp_worker()
                 */
                if (ssk == msk->first)
                        WRITE_ONCE(subflow->fail_tout, 0);

                /* detach from the parent socket, but allow data_ready to
                 * push incoming data into the mptcp stack, to properly ack it
                 */
                ssk->sk_socket = NULL;
                ssk->sk_wq = NULL;
                unlock_sock_fast(ssk, slow);
        }
        sock_orphan(sk);

        /* all the subflows are closed, only timeout can change the msk
         * state, let's not keep resources busy for no reasons
         */
        if (subflows_alive == 0)
                mptcp_set_state(sk, TCP_CLOSE);

        /* this reference is dropped by __mptcp_destroy_sock() */
        sock_hold(sk);
        /* newline-terminated so the message is emitted as a complete log line */
        pr_debug("msk=%p state=%d\n", sk, sk->sk_state);
        if (msk->token)
                mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);

        if (sk->sk_state == TCP_CLOSE) {
                __mptcp_destroy_sock(sk);
                do_cancel_work = true;
        } else {
                mptcp_start_tout_timer(sk);
        }

        return do_cancel_work;
}

/* Close entry point: run __mptcp_close() under the msk socket lock, cancel
 * the worker afterwards when requested, then drop one msk reference.
 */
static void mptcp_close(struct sock *sk, long timeout)
{
        bool cancel;

        lock_sock(sk);
        cancel = __mptcp_close(sk, timeout);
        release_sock(sk);

        /* done only after the socket lock has been released */
        if (cancel)
                mptcp_cancel_work(sk);

        sock_put(sk);
}

static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
        struct ipv6_pinfo *msk6 = inet6_sk(msk);

        msk->sk_v6_daddr = ssk->sk_v6_daddr;
        msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;

        if (msk6 && ssk6) {
                msk6->saddr = ssk6->saddr;
                msk6->flow_label = ssk6->flow_label;
        }
#endif

        inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num;
        inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport;
        inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport;
        inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr;
        inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr;
        inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
}

/* Disconnect the msk: move it to TCP_CLOSE, stop its timers, close the
 * subflows and reset the MPTCP-level state and counters so that the socket
 * can be reused.
 *
 * Returns 0 on success or -EBUSY when invoked while a fastopen is in
 * progress.
 */
static int mptcp_disconnect(struct sock *sk, int flags)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        /* We are on the fastopen error path. We can't call straight into the
         * subflows cleanup code due to lock nesting (we are already under
         * the msk->first socket lock).
         */
        if (msk->fastopening)
                return -EBUSY;

        mptcp_check_listen_stop(sk);
        mptcp_set_state(sk, TCP_CLOSE);

        mptcp_stop_rtx_timer(sk);
        mptcp_stop_tout_timer(sk);

        if (msk->token)
                mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);

        /* msk->subflow is still intact, the following will not free the first
         * subflow
         */
        mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
        /* bring the msk-level flags and negotiated options back to defaults */
        WRITE_ONCE(msk->flags, 0);
        msk->cb_flags = 0;
        msk->recovery = false;
        WRITE_ONCE(msk->can_ack, false);
        WRITE_ONCE(msk->fully_established, false);
        WRITE_ONCE(msk->rcv_data_fin, false);
        WRITE_ONCE(msk->snd_data_fin_enable, false);
        WRITE_ONCE(msk->rcv_fastclose, false);
        WRITE_ONCE(msk->use_64bit_ack, false);
        WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
        mptcp_pm_data_reset(msk);
        mptcp_ca_reset(sk);
        /* restart the byte counters from scratch */
        msk->bytes_consumed = 0;
        msk->bytes_acked = 0;
        msk->bytes_received = 0;
        msk->bytes_sent = 0;
        msk->bytes_retrans = 0;
        msk->rcvspace_init = 0;

        WRITE_ONCE(sk->sk_shutdown, 0);
        sk_error_report(sk);
        return 0;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
        unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);

        return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}

static void mptcp_copy_ip6_options(struct sock *newsk, const struct sock *sk)
{
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6_txoptions *opt;
        struct ipv6_pinfo *newnp;

        newnp = inet6_sk(newsk);

        rcu_read_lock();
        opt = rcu_dereference(np->opt);
        if (opt) {
                opt = ipv6_dup_options(newsk, opt);
                if (!opt)
                        net_warn_ratelimited("%s: Failed to copy ip6 options\n", __func__);
        }
        RCU_INIT_POINTER(newnp->opt, opt);
        rcu_read_unlock();
}
#endif

static void mptcp_copy_ip_options(struct sock *newsk, const struct sock *sk)
{
        struct ip_options_rcu *inet_opt, *newopt = NULL;
        const struct inet_sock *inet = inet_sk(sk);
        struct inet_sock *newinet;

        newinet = inet_sk(newsk);

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt) {
                newopt = sock_kmalloc(newsk, sizeof(*inet_opt) +
                                      inet_opt->opt.optlen, GFP_ATOMIC);
                if (newopt)
                        memcpy(newopt, inet_opt, sizeof(*inet_opt) +
                               inet_opt->opt.optlen);
                else
                        net_warn_ratelimited("%s: Failed to copy ip options\n", __func__);
        }
        RCU_INIT_POINTER(newinet->inet_opt, newopt);
        rcu_read_unlock();
}

/* Create the msk for a passively opened connection by cloning the listener
 * @sk, initialize it from the MPC request @req and the received options
 * @mp_opt, and attach @ssk as its first subflow.
 *
 * Returns the new msk socket, or NULL when the clone cannot be allocated.
 */
struct sock *mptcp_sk_clone_init(const struct sock *sk,
                                 const struct mptcp_options_received *mp_opt,
                                 struct sock *ssk,
                                 struct request_sock *req)
{
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
        struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
        struct mptcp_subflow_context *subflow;
        struct mptcp_sock *msk;

        if (!nsk)
                return NULL;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        /* the cloned pinet6 pointer must reference the clone's own area */
        if (nsk->sk_family == AF_INET6)
                inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
#endif

        __mptcp_init_sock(nsk);

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        if (nsk->sk_family == AF_INET6)
                mptcp_copy_ip6_options(nsk, sk);
        else
#endif
                mptcp_copy_ip_options(nsk, sk);

        msk = mptcp_sk(nsk);
        WRITE_ONCE(msk->local_key, subflow_req->local_key);
        WRITE_ONCE(msk->token, subflow_req->token);
        msk->in_accept_queue = 1;
        WRITE_ONCE(msk->fully_established, false);
        if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD)
                WRITE_ONCE(msk->csum_enabled, true);

        /* data sequence space starts right after the initial DSN */
        WRITE_ONCE(msk->write_seq, subflow_req->idsn + 1);
        WRITE_ONCE(msk->snd_nxt, msk->write_seq);
        WRITE_ONCE(msk->snd_una, msk->write_seq);
        WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd);
        msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
        mptcp_init_sched(msk, mptcp_sk(sk)->sched);

        /* passive msk is created after the first/MPC subflow */
        msk->subflow_id = 2;

        sock_reset_flag(nsk, SOCK_RCU_FREE);
        security_inet_csk_clone(nsk, req);

        /* this can't race with mptcp_close(), as the msk is
         * not yet exposed to user-space
         */
        mptcp_set_state(nsk, TCP_ESTABLISHED);

        /* The msk maintain a ref to each subflow in the connections list */
        WRITE_ONCE(msk->first, ssk);
        subflow = mptcp_subflow_ctx(ssk);
        list_add(&subflow->node, &msk->conn_list);
        sock_hold(ssk);

        /* new mpc subflow takes ownership of the newly
         * created mptcp socket
         */
        mptcp_token_accept(subflow_req, msk);

        /* set msk addresses early to ensure mptcp_pm_get_local_id()
         * uses the correct data
         */
        mptcp_copy_inaddrs(nsk, ssk);
        __mptcp_propagate_sndbuf(nsk, ssk);

        mptcp_rcv_space_init(msk, ssk);

        if (mp_opt->suboptions & OPTION_MPTCP_MPC_ACK)
                __mptcp_subflow_fully_established(msk, subflow, mp_opt);
        /* releases the lock the clone was created with by sk_clone_lock() */
        bh_unlock_sock(nsk);

        /* note: the newly allocated socket refcount is 2 now */
        return nsk;
}

/* Initialize the msk receive-space estimator from subflow @ssk and mark it
 * as initialized.
 */
void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
{
        const struct tcp_sock *tp = tcp_sk(ssk);

        msk->rcvspace_init = 1;

        /* open a new measurement window, starting at the subflow timestamp */
        msk->rcvq_space.time = tp->tcp_mstamp;
        msk->rcvq_space.rtt_us = 0;
        msk->rcvq_space.copied = 0;

        /* initial rcv_space offering made to peer */
        msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
                                      TCP_INIT_CWND * tp->advmss);
        /* never start from an empty space: use the default-MSS based value */
        if (!msk->rcvq_space.space)
                msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
}

/* Teardown shared by mptcp_destroy() and mptcp_disconnect(): drop the pending
 * tx data, close all the subflows with the given @flags, purge the receive
 * queues and release the token and the PM address lists.
 */
void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
{
        struct mptcp_subflow_context *subflow, *tmp;
        struct sock *sk = (struct sock *)msk;

        __mptcp_clear_xmit(sk);

        /* join list will be eventually flushed (with rst) at sock lock release time */
        mptcp_for_each_subflow_safe(msk, subflow, tmp)
                __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags);

        /* splice the msk receive queue into sk_receive_queue and purge it
         * right away, together with the out-of-order queue
         */
        mptcp_data_lock(sk);
        skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
        __skb_queue_purge(&sk->sk_receive_queue);
        skb_rbtree_purge(&msk->out_of_order_queue);
        mptcp_data_unlock(sk);

        /* move all the rx fwd alloc into the sk one: sk_mem_reclaim_final() in
         * inet_sock_destruct() will dispose it
         */
        sk_forward_alloc_add(sk, msk->rmem_fwd_alloc);
        WRITE_ONCE(msk->rmem_fwd_alloc, 0);
        mptcp_token_destroy(msk);
        mptcp_pm_free_anno_list(msk);
        mptcp_free_local_addr_list(msk);
}

/* Protocol destroy hook: tear down the msk, including the initial subflow,
 * and drop it from the allocated sockets accounting.
 */
static void mptcp_destroy(struct sock *sk)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        /* allow the following to close even the initial subflow */
        msk->free_first = 1;
        mptcp_destroy_common(msk, 0);
        /* balances the sk_sockets_allocated_inc() in mptcp_init_sock() */
        sk_sockets_allocated_dec(sk);
}

/* Clean up the acked data right away when the msk is not owned by the user;
 * otherwise flag MPTCP_CLEAN_UNA so that mptcp_release_cb() takes care of it.
 */
void __mptcp_data_acked(struct sock *sk)
{
        if (sock_owned_by_user(sk)) {
                /* deferred to the socket lock release */
                __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags);
                return;
        }

        __mptcp_clean_una(sk);
}

/* When the msk has data waiting to be sent, push it on subflow @ssk right
 * away if the msk is not owned by the user; otherwise flag MPTCP_PUSH_PENDING
 * so that mptcp_release_cb() takes care of it.
 */
void __mptcp_check_push(struct sock *sk, struct sock *ssk)
{
        /* nothing queued: nothing to push */
        if (!mptcp_send_head(sk))
                return;

        if (sock_owned_by_user(sk)) {
                /* deferred to the socket lock release */
                __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
                return;
        }

        __mptcp_subflow_push_pending(sk, ssk, false);
}

/* cb_flags bits whose handlers mptcp_release_cb() runs after dropping the msk
 * socket spinlock
 */
#define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \
                                      BIT(MPTCP_RETRANSMIT) | \
                                      BIT(MPTCP_FLUSH_JOIN_LIST))

/* Socket lock release callback: processes the events deferred via cb_flags
 * while the msk was owned by the user, then updates the rmem accounting.
 * Entered and left with the msk socket spinlock held; the spinlock is
 * temporarily dropped while running the MPTCP_FLAGS_PROCESS_CTX_NEED handlers.
 */
static void mptcp_release_cb(struct sock *sk)
        __must_hold(&sk->sk_lock.slock)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        /* loop until no more such events are pending: new ones can be flagged
         * while the spinlock is released below
         */
        for (;;) {
                unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED);
                struct list_head join_list;

                if (!flags)
                        break;

                /* grab the pending joins while still holding the spinlock */
                INIT_LIST_HEAD(&join_list);
                list_splice_init(&msk->join_list, &join_list);

                /* the following actions acquire the subflow socket lock
                 *
                 * 1) can't be invoked in atomic scope
                 * 2) must avoid ABBA deadlock with msk socket spinlock: the RX
                 *    datapath acquires the msk socket spinlock while holding
                 *    the subflow socket lock
                 */
                msk->cb_flags &= ~flags;
                spin_unlock_bh(&sk->sk_lock.slock);

                if (flags & BIT(MPTCP_FLUSH_JOIN_LIST))
                        __mptcp_flush_join_list(sk, &join_list);
                if (flags & BIT(MPTCP_PUSH_PENDING))
                        __mptcp_push_pending(sk, 0);
                if (flags & BIT(MPTCP_RETRANSMIT))
                        __mptcp_retrans(sk);

                cond_resched();
                spin_lock_bh(&sk->sk_lock.slock);
        }

        /* the remaining events are handled with the spinlock held */
        if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags))
                __mptcp_clean_una_wakeup(sk);
        if (unlikely(msk->cb_flags)) {
                /* be sure to sync the msk state before taking actions
                 * depending on sk_state (MPTCP_ERROR_REPORT)
                 * On sk release avoid actions depending on the first subflow
                 */
                if (__test_and_clear_bit(MPTCP_SYNC_STATE, &msk->cb_flags) && msk->first)
                        __mptcp_sync_state(sk, msk->pending_state);
                if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
                        __mptcp_error_report(sk);
                if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags))
                        __mptcp_sync_sndbuf(sk);
        }

        __mptcp_update_rmem(sk);
}

/* An MP_JOIN client subflow has to wait for the 4th ack before sending any
 * data, and TCP can't schedule the delack timer before the subflow is fully
 * established: MPTCP borrows the delack timer to retransmit the 3rd ack.
 */
static void schedule_3rdack_retransmission(struct sock *ssk)
{
        struct inet_connection_sock *icsk = inet_csk(ssk);
        struct tcp_sock *tp = tcp_sk(ssk);
        unsigned long expires;

        /* no retransmission is needed once the subflow is fully established */
        if (mptcp_subflow_ctx(ssk)->fully_established)
                return;

        /* reschedule with a timeout above RTT, as we must look only for drop;
         * fall back to the initial timeout when no RTT sample is available
         */
        expires = tp->srtt_us ? usecs_to_jiffies(tp->srtt_us >> (3 - 1)) :
                                TCP_TIMEOUT_INIT;
        expires += jiffies;

        WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
        icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
        icsk->icsk_ack.timeout = expires;
        sk_reset_timer(ssk, &icsk->icsk_delack_timer, expires);
}

/* Run the actions delegated to subflow @ssk, as selected by the
 * MPTCP_DELEGATE_* bits set in @status. Actions touching the msk are
 * performed under the msk data lock and deferred via cb_flags when the msk is
 * owned by the user.
 */
void mptcp_subflow_process_delegated(struct sock *ssk, long status)
{
        struct sock *sk = mptcp_subflow_ctx(ssk)->conn;

        if (status & BIT(MPTCP_DELEGATE_SEND)) {
                mptcp_data_lock(sk);
                if (sock_owned_by_user(sk))
                        __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
                else
                        __mptcp_subflow_push_pending(sk, ssk, true);
                mptcp_data_unlock(sk);
        }

        if (status & BIT(MPTCP_DELEGATE_SNDBUF)) {
                mptcp_data_lock(sk);
                if (sock_owned_by_user(sk))
                        __set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags);
                else
                        __mptcp_sync_sndbuf(sk);
                mptcp_data_unlock(sk);
        }

        /* subflow-only action: the msk data lock is not needed */
        if (status & BIT(MPTCP_DELEGATE_ACK))
                schedule_3rdack_retransmission(ssk);
}

/* Hash hook placeholder: warns once if ever invoked and reports success. */
static int mptcp_hash(struct sock *sk)
{
        /* should never be called,
         * we hash the TCP subflows not the MPTCP socket
         */
        WARN_ON_ONCE(1);
        return 0;
}

/* Unhash hook: intentionally empty. */
static void mptcp_unhash(struct sock *sk)
{
        /* called from sk_common_release(), but nothing to do here */
}

/* Bind the msk to port @snum by delegating to the first subflow.
 *
 * Returns the inet_csk_get_port() result, or -EINVAL (with a one-time
 * warning) when the msk has no first subflow.
 */
static int mptcp_get_port(struct sock *sk, unsigned short snum)
{
        struct mptcp_sock *msk = mptcp_sk(sk);

        /* newline-terminated so the message is emitted as a complete log line */
        pr_debug("msk=%p, ssk=%p\n", msk, msk->first);
        if (WARN_ON_ONCE(!msk->first))
                return -EINVAL;

        return inet_csk_get_port(msk->first, snum);
}

/* Complete the MPTCP-level connection on subflow @ssk: initialize the subflow
 * mapping, copy the local key into the msk and notify the path manager of
 * the new connection.
 */
void mptcp_finish_connect(struct sock *ssk)
{
        struct mptcp_subflow_context *subflow;
        struct mptcp_sock *msk;
        struct sock *sk;

        subflow = mptcp_subflow_ctx(ssk);
        sk = subflow->conn;
        msk = mptcp_sk(sk);

        /* newline-terminated so the message is emitted as a complete log line */
        pr_debug("msk=%p, token=%u\n", sk, subflow->token);

        subflow->map_seq = subflow->iasn;
        subflow->map_subflow_seq = 1;

        /* the socket is not connected yet, no msk/subflow ops can access/race
         * accessing the field below
         */
        WRITE_ONCE(msk->local_key, subflow->local_key);

        mptcp_pm_new_connection(msk, ssk, 0);
}

/* Attach @sk to the socket @parent: wait queue, owning socket and uid are
 * all updated under the sk_callback_lock write lock.
 */
void mptcp_sock_graft(struct sock *sk, struct socket *parent)
{
        write_lock_bh(&sk->sk_callback_lock);
        rcu_assign_pointer(sk->sk_wq, &parent->wq);
        sk_set_socket(sk, parent);
        sk->sk_uid = SOCK_INODE(parent)->i_uid;
        write_unlock_bh(&sk->sk_callback_lock);
}

/* Complete an MP_JOIN on subflow @ssk, attaching it to its msk.
 *
 * Returns true when the subflow has been accepted - possibly queued on the
 * join list for later processing - and false otherwise; in the latter case
 * subflow->reset_reason carries the reason.
 */
bool mptcp_finish_join(struct sock *ssk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);
        struct sock *parent = (void *)msk;
        bool ret = true;

        /* newline-terminated so the message is emitted as a complete log line */
        pr_debug("msk=%p, subflow=%p\n", msk, subflow);

        /* mptcp socket already closing? */
        if (!mptcp_is_fully_established(parent)) {
                subflow->reset_reason = MPTCP_RST_EMPTCP;
                return false;
        }

        /* active subflow, already present inside the conn_list */
        if (!list_empty(&subflow->node)) {
                mptcp_subflow_joined(msk, ssk);
                mptcp_propagate_sndbuf(parent, ssk);
                return true;
        }

        if (!mptcp_pm_allow_new_subflow(msk))
                goto err_prohibited;

        /* If we can't acquire msk socket lock here, let the release callback
         * handle it
         */
        mptcp_data_lock(parent);
        if (!sock_owned_by_user(parent)) {
                ret = __mptcp_finish_join(msk, ssk);
                if (ret) {
                        sock_hold(ssk);
                        list_add_tail(&subflow->node, &msk->conn_list);
                }
        } else {
                /* the reference taken here belongs to the join list */
                sock_hold(ssk);
                list_add_tail(&subflow->node, &msk->join_list);
                __set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags);
        }
        mptcp_data_unlock(parent);

        if (!ret)
                goto err_prohibited;

        return true;

err_prohibited:
        subflow->reset_reason = MPTCP_RST_EPROHIBIT;
        return false;
}

/* Protocol shutdown hook: on SEND_SHUTDOWN, advance the msk close state and
 * start the write shutdown when the state machine asks for a FIN.
 */
static void mptcp_shutdown(struct sock *sk, int how)
{
        /* newline-terminated so the message is emitted as a complete log line */
        pr_debug("sk=%p, how=%d\n", sk, how);

        if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk))
                __mptcp_wr_shutdown(sk);
}

/* Report the forward-allocated memory of the msk: the sum of the plain
 * socket counter and the MPTCP rx one.
 */
static int mptcp_forward_alloc_get(const struct sock *sk)
{
        int sk_fwd = READ_ONCE(sk->sk_forward_alloc);
        int rx_fwd = READ_ONCE(mptcp_sk(sk)->rmem_fwd_alloc);

        return sk_fwd + rx_fwd;
}

/* Compute the output queue size for the SIOCOUTQ/SIOCOUTQNSD ioctls as the
 * distance between the msk write_seq and @v, capped at INT_MAX.
 *
 * Returns -EINVAL for listening sockets and 0 while still connecting.
 */
static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v)
{
        const struct sock *sk = (void *)msk;
        u64 queued;

        if (sk->sk_state == TCP_LISTEN)
                return -EINVAL;

        if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
                return 0;

        queued = msk->write_seq - v;
        if (__mptcp_check_fallback(msk) && msk->first) {
                struct sock *ssk = msk->first;
                struct tcp_sock *tp = tcp_sk(ssk);

                /* the first subflow is disconnected after close - see
                 * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq
                 * so ignore that status, too.
                 */
                if (!((1 << ssk->sk_state) &
                      (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)))
                        queued += READ_ONCE(tp->write_seq) - tp->snd_una;
        }

        /* the result is reported as an int: saturate instead of wrapping */
        return (int)min_t(u64, queued, INT_MAX);
}

/* Protocol ioctl hook, handling SIOCINQ, SIOCOUTQ and SIOCOUTQNSD; the
 * result is stored in @karg.
 *
 * Returns 0 on success, -EINVAL for SIOCINQ on a listening socket and
 * -ENOIOCTLCMD for any other command.
 */
static int mptcp_ioctl(struct sock *sk, int cmd, int *karg)
{
        struct mptcp_sock *msk = mptcp_sk(sk);
        bool slow;

        switch (cmd) {
        case SIOCINQ:
                if (sk->sk_state == TCP_LISTEN)
                        return -EINVAL;

                lock_sock(sk);
                /* collect the data pending on the subflows before counting */
                __mptcp_move_skbs(msk);
                *karg = mptcp_inq_hint(sk);
                release_sock(sk);
                return 0;
        case SIOCOUTQ:
                /* bytes not yet acked by the peer */
                slow = lock_sock_fast(sk);
                *karg = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una));
                unlock_sock_fast(sk, slow);
                return 0;
        case SIOCOUTQNSD:
                /* bytes not yet sent */
                slow = lock_sock_fast(sk);
                *karg = mptcp_ioctl_outq(msk, msk->snd_nxt);
                unlock_sock_fast(sk, slow);
                return 0;
        default:
                return -ENOIOCTLCMD;
        }
}

/* Give up on MPTCP before the connection is attempted: clear the subflow's
 * request_mptcp flag, then mark the msk as fallen back via
 * __mptcp_do_fallback().
 */
static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
                                         struct mptcp_subflow_context *subflow)
{
        subflow->request_mptcp = 0;
        __mptcp_do_fallback(msk);
}

/* ->connect() handler: start an active open on the msk's first subflow.
 *
 * The first subflow is obtained via __mptcp_nmpc_sk(), the msk is moved to
 * TCP_SYN_SENT, the msk-level sequence numbers are seeded from the subflow
 * IDSN and the subflow's own ->connect() is invoked.  MPTCP is disabled up
 * front (early fallback) when MD5SIG info is set on the subflow or when
 * mptcp_token_new_connect() returns non-zero.
 *
 * Returns 0 on success, otherwise the error code; -EINVAL is returned when
 * the subflow is not in TCP_CLOSE state.  On failure the token is destroyed
 * and the msk state is set back to TCP_CLOSE.
 */
static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct mptcp_subflow_context *subflow;
        struct mptcp_sock *msk = mptcp_sk(sk);
        int err = -EINVAL;
        struct sock *ssk;

        ssk = __mptcp_nmpc_sk(msk);
        if (IS_ERR(ssk))
                return PTR_ERR(ssk);

        mptcp_set_state(sk, TCP_SYN_SENT);
        subflow = mptcp_subflow_ctx(ssk);
#ifdef CONFIG_TCP_MD5SIG
        /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
         * TCP option space.
         */
        if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info))
                mptcp_subflow_early_fallback(msk, subflow);
#endif
        /* skipped when the MD5SIG check above already cleared request_mptcp */
        if (subflow->request_mptcp && mptcp_token_new_connect(ssk)) {
                MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT);
                mptcp_subflow_early_fallback(msk, subflow);
        }

        /* all msk-level sequence numbers start from the subflow IDSN */
        WRITE_ONCE(msk->write_seq, subflow->idsn);
        WRITE_ONCE(msk->snd_nxt, subflow->idsn);
        WRITE_ONCE(msk->snd_una, subflow->idsn);
        if (likely(!__mptcp_check_fallback(msk)))
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE);

        /* if reaching here via the fastopen/sendmsg path, the caller already
         * acquired the subflow socket lock, too.
         */
        if (!msk->fastopening)
                lock_sock(ssk);

        /* the following mirrors closely a very small chunk of code from
         * __inet_stream_connect()
         */
        if (ssk->sk_state != TCP_CLOSE)
                goto out;

        if (BPF_CGROUP_PRE_CONNECT_ENABLED(ssk)) {
                err = ssk->sk_prot->pre_connect(ssk, uaddr, addr_len);
                if (err)
                        goto out;
        }

        err = ssk->sk_prot->connect(ssk, uaddr, addr_len);
        if (err < 0)
                goto out;

        /* propagate the subflow's DEFER_CONNECT bit to the msk */
        inet_assign_bit(DEFER_CONNECT, sk, inet_test_bit(DEFER_CONNECT, ssk));

out:
        if (!msk->fastopening)
                release_sock(ssk);

        /* on successful connect, the msk state will be moved to established by
         * subflow_finish_connect()
         */
        if (unlikely(err)) {
                /* avoid leaving a dangling token in an unconnected socket */
                mptcp_token_destroy(msk);
                mptcp_set_state(sk, TCP_CLOSE);
                return err;
        }

        mptcp_copy_inaddrs(sk, ssk);
        return 0;
}

/* Protocol descriptor for MPTCP sockets.  The per-socket callbacks point at
 * the mptcp_*() handlers, while the memory accounting fields
 * (memory_allocated, per_cpu_fw_alloc, memory_pressure, sysctl_*) reference
 * the TCP counters and sysctls.  Registered by mptcp_proto_init() and used
 * as the template for mptcp_v6_prot.
 */
static struct proto mptcp_prot = {
        .name                = "MPTCP",
        .owner                = THIS_MODULE,
        .init                = mptcp_init_sock,
        .connect        = mptcp_connect,
        .disconnect        = mptcp_disconnect,
        .close                = mptcp_close,
        .setsockopt        = mptcp_setsockopt,
        .getsockopt        = mptcp_getsockopt,
        .shutdown        = mptcp_shutdown,
        .destroy        = mptcp_destroy,
        .sendmsg        = mptcp_sendmsg,
        .ioctl                = mptcp_ioctl,
        .recvmsg        = mptcp_recvmsg,
        .release_cb        = mptcp_release_cb,
        .hash                = mptcp_hash,
        .unhash                = mptcp_unhash,
        .get_port        = mptcp_get_port,
        .forward_alloc_get        = mptcp_forward_alloc_get,
        .stream_memory_free        = mptcp_stream_memory_free,
        .sockets_allocated        = &mptcp_sockets_allocated,

        /* memory accounting reuses the TCP counters */
        .memory_allocated        = &tcp_memory_allocated,
        .per_cpu_fw_alloc        = &tcp_memory_per_cpu_fw_alloc,

        .memory_pressure        = &tcp_memory_pressure,
        .sysctl_wmem_offset        = offsetof(struct net, ipv4.sysctl_tcp_wmem),
        .sysctl_rmem_offset        = offsetof(struct net, ipv4.sysctl_tcp_rmem),
        .sysctl_mem        = sysctl_tcp_mem,
        .obj_size        = sizeof(struct mptcp_sock),
        .slab_flags        = SLAB_TYPESAFE_BY_RCU,
        .no_autobind        = true,
};

/* ->bind() handler: bind the msk's first subflow and, on success, mirror
 * the resulting addresses on the msk.
 *
 * Returns 0 on success, the error from __mptcp_nmpc_sk() or from the
 * family-specific bind helper, or -EINVAL for an unhandled socket family.
 */
static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sock *sk = sock->sk;
        struct mptcp_sock *msk = mptcp_sk(sk);
        struct sock *ssk;
        int err;

        lock_sock(sk);

        ssk = __mptcp_nmpc_sk(msk);
        if (IS_ERR(ssk)) {
                err = PTR_ERR(ssk);
                goto unlock;
        }

        switch (sk->sk_family) {
        case AF_INET:
                err = inet_bind_sk(ssk, uaddr, addr_len);
                break;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
        case AF_INET6:
                err = inet6_bind_sk(ssk, uaddr, addr_len);
                break;
#endif
        default:
                err = -EINVAL;
                break;
        }

        if (!err)
                mptcp_copy_inaddrs(sk, ssk);

unlock:
        release_sock(sk);
        return err;
}

/* ->listen() handler: put the msk's first subflow into the listening state.
 *
 * @sock:    the MPTCP socket
 * @backlog: passed through to __inet_listen_sk()
 *
 * Returns 0 on success, -EINVAL if the socket is not an unconnected
 * SOCK_STREAM socket, or the error from __mptcp_nmpc_sk() /
 * __inet_listen_sk().  Whatever the outcome of the subflow listen call, the
 * msk state is re-synchronised with the subflow state afterwards.
 */
static int mptcp_listen(struct socket *sock, int backlog)
{
        struct mptcp_sock *msk = mptcp_sk(sock->sk);
        struct sock *sk = sock->sk;
        struct sock *ssk;
        int err;

        /* newline-terminated so the message is emitted as a complete line */
        pr_debug("msk=%p\n", msk);

        lock_sock(sk);

        err = -EINVAL;
        if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
                goto unlock;

        ssk = __mptcp_nmpc_sk(msk);
        if (IS_ERR(ssk)) {
                err = PTR_ERR(ssk);
                goto unlock;
        }

        mptcp_set_state(sk, TCP_LISTEN);
        sock_set_flag(sk, SOCK_RCU_FREE);

        lock_sock(ssk);
        err = __inet_listen_sk(ssk, backlog);
        release_sock(ssk);
        /* follow the subflow: it stays out of LISTEN if the call failed */
        mptcp_set_state(sk, inet_sk_state_load(ssk));

        if (!err) {
                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
                mptcp_copy_inaddrs(sk, ssk);
                mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED);
        }

unlock:
        release_sock(sk);
        return err;
}

/* ->accept() handler for MPTCP listeners.
 *
 * @sock:    the listening MPTCP socket
 * @newsock: the socket that will own the accepted connection
 * @arg:     accept arguments; arg->err carries the error when the subflow
 *           accept yields no socket, arg->kern is copied to sk_kern_sock
 *
 * The connection is accepted on the listener's first subflow.  If the
 * accepted subflow is MPTCP capable, @newsock is attached to the owning msk
 * (subflow->conn); otherwise @newsock is handed over to plain TCP by
 * switching its proto_ops.
 *
 * Returns 0 on success, -EINVAL if the listener has no first subflow, or
 * arg->err when inet_csk_accept() returns no socket.
 */
static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
                               struct proto_accept_arg *arg)
{
        struct mptcp_sock *msk = mptcp_sk(sock->sk);
        struct sock *ssk, *newsk;

        pr_debug("msk=%p\n", msk);

        /* Buggy applications can call accept on socket states other than LISTEN
         * but no need to allocate the first subflow just to error out.
         */
        ssk = READ_ONCE(msk->first);
        if (!ssk)
                return -EINVAL;

        pr_debug("ssk=%p, listener=%p\n", ssk, mptcp_subflow_ctx(ssk));
        newsk = inet_csk_accept(ssk, arg);
        if (!newsk)
                return arg->err;

        pr_debug("newsk=%p, subflow is mptcp=%d\n", newsk, sk_is_mptcp(newsk));
        if (sk_is_mptcp(newsk)) {
                struct mptcp_subflow_context *subflow;
                struct sock *new_mptcp_sock;

                subflow = mptcp_subflow_ctx(newsk);
                new_mptcp_sock = subflow->conn;

                /* is_mptcp should be false if subflow->conn is missing, see
                 * subflow_syn_recv_sock()
                 */
                if (WARN_ON_ONCE(!new_mptcp_sock)) {
                        tcp_sk(newsk)->is_mptcp = 0;
                        goto tcpfallback;
                }

                /* from here on operate on the msk, not on the subflow */
                newsk = new_mptcp_sock;
                MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK);

                newsk->sk_kern_sock = arg->kern;
                lock_sock(newsk);
                __inet_accept(sock, newsock, newsk);

                set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags);
                msk = mptcp_sk(newsk);
                msk->in_accept_queue = 0;

                /* set ssk->sk_socket of accept()ed flows to mptcp socket.
                 * This is needed so NOSPACE flag can be set from tcp stack.
                 */
                mptcp_for_each_subflow(msk, subflow) {
                        /* distinct name: do not shadow the listener's ssk */
                        struct sock *subflow_sk = mptcp_subflow_tcp_sock(subflow);

                        if (!subflow_sk->sk_socket)
                                mptcp_sock_graft(subflow_sk, newsock);
                }

                /* Do late cleanup for the first subflow as necessary. Also
                 * deal with bad peers not doing a complete shutdown.
                 */
                if (unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) {
                        __mptcp_close_ssk(newsk, msk->first,
                                          mptcp_subflow_ctx(msk->first), 0);
                        if (unlikely(list_is_singular(&msk->conn_list)))
                                mptcp_set_state(newsk, TCP_CLOSE);
                }
        } else {
tcpfallback:
                newsk->sk_kern_sock = arg->kern;
                lock_sock(newsk);
                __inet_accept(sock, newsock, newsk);
                /* we are being invoked after accepting a non-mp-capable
                 * flow: sk is a tcp_sk, not an mptcp one.
                 *
                 * Hand the socket over to tcp so all further socket ops
                 * bypass mptcp.
                 */
                WRITE_ONCE(newsock->sk->sk_socket->ops,
                           mptcp_fallback_tcp_ops(newsock->sk));
        }
        release_sock(newsk);

        return 0;
}

/* Poll helper: return EPOLLOUT | EPOLLWRNORM when the msk is writeable,
 * 0 otherwise.  When the first check fails, SOCK_NOSPACE is set and the
 * check is repeated after a full barrier.
 */
static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
{
        const __poll_t writeable = EPOLLOUT | EPOLLWRNORM;
        struct sock *sk = (struct sock *)msk;

        if (!__mptcp_stream_is_writeable(sk, 1)) {
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                smp_mb__after_atomic(); /* NOSPACE is changed by mptcp_write_space() */

                /* re-check now that the NOSPACE flag has been published */
                if (!__mptcp_stream_is_writeable(sk, 1))
                        return 0;
        }

        return writeable;
}

/* ->poll() handler for MPTCP sockets.
 *
 * Registers on the socket wait queue, then builds the event mask from the
 * msk state, the shutdown flags, the readable/writeable helpers and the
 * pending error.  Listening sockets delegate to inet_csk_listen_poll() on
 * the first subflow.
 *
 * Returns the poll event mask.
 */
static __poll_t mptcp_poll(struct file *file, struct socket *sock,
                           struct poll_table_struct *wait)
{
        struct sock *sk = sock->sk;
        struct mptcp_sock *msk;
        __poll_t mask = 0;
        u8 shutdown;
        int state;

        msk = mptcp_sk(sk);
        sock_poll_wait(file, sock, wait);

        state = inet_sk_state_load(sk);
        /* newline-terminated so the message is emitted as a complete line */
        pr_debug("msk=%p state=%d flags=%lx\n", msk, state, msk->flags);
        if (state == TCP_LISTEN) {
                struct sock *ssk = READ_ONCE(msk->first);

                if (WARN_ON_ONCE(!ssk))
                        return 0;

                return inet_csk_listen_poll(ssk);
        }

        shutdown = READ_ONCE(sk->sk_shutdown);
        if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
                mask |= EPOLLHUP;
        if (shutdown & RCV_SHUTDOWN)
                mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

        if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
                mask |= mptcp_check_readable(sk);
                if (shutdown & SEND_SHUTDOWN)
                        mask |= EPOLLOUT | EPOLLWRNORM;
                else
                        mask |= mptcp_check_writeable(msk);
        } else if (state == TCP_SYN_SENT &&
                   inet_test_bit(DEFER_CONNECT, sk)) {
                /* cf tcp_poll() note about TFO */
                mask |= EPOLLOUT | EPOLLWRNORM;
        }

        /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */
        smp_rmb();
        if (READ_ONCE(sk->sk_err))
                mask |= EPOLLERR;

        return mask;
}

/* Socket-level operations for PF_INET MPTCP sockets.  bind, accept, poll,
 * listen and set_rcvlowat use MPTCP-specific handlers; the remaining
 * entries reuse the generic inet/sock helpers.
 */
static const struct proto_ops mptcp_stream_ops = {
        .family                   = PF_INET,
        .owner                   = THIS_MODULE,
        .release           = inet_release,
        .bind                   = mptcp_bind,
        .connect           = inet_stream_connect,
        .socketpair           = sock_no_socketpair,
        .accept                   = mptcp_stream_accept,
        .getname           = inet_getname,
        .poll                   = mptcp_poll,
        .ioctl                   = inet_ioctl,
        .gettstamp           = sock_gettstamp,
        .listen                   = mptcp_listen,
        .shutdown           = inet_shutdown,
        .setsockopt           = sock_common_setsockopt,
        .getsockopt           = sock_common_getsockopt,
        .sendmsg           = inet_sendmsg,
        .recvmsg           = inet_recvmsg,
        .mmap                   = sock_no_mmap,
        .set_rcvlowat           = mptcp_set_rcvlowat,
};

/* Protocol switch entry tying SOCK_STREAM/IPPROTO_MPTCP to mptcp_prot and
 * mptcp_stream_ops; registered from mptcp_proto_init().
 */
static struct inet_protosw mptcp_protosw = {
        .type                = SOCK_STREAM,
        .protocol        = IPPROTO_MPTCP,
        .prot                = &mptcp_prot,
        .ops                = &mptcp_stream_ops,
        .flags                = INET_PROTOSW_ICSK,
};

/* NAPI poll callback processing the subflows queued on the
 * mptcp_delegated_action that embeds @napi.
 *
 * Each dequeued subflow is handled under its socket spinlock: when the
 * socket is not owned by user context the pending delegated actions are
 * fetched (and cleared) with xchg() and processed here; otherwise only the
 * MPTCP_DELEGATE_SCHEDULED bit is cleared.  The subflow socket reference is
 * dropped with sock_put() after each iteration.
 *
 * Returns @budget when that many subflows were processed (NAPI is not
 * completed in that case), otherwise the number of subflows processed.
 */
static int mptcp_napi_poll(struct napi_struct *napi, int budget)
{
        struct mptcp_delegated_action *delegated;
        struct mptcp_subflow_context *subflow;
        int work_done = 0;

        delegated = container_of(napi, struct mptcp_delegated_action, napi);
        while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

                bh_lock_sock_nested(ssk);
                if (!sock_owned_by_user(ssk)) {
                        mptcp_subflow_process_delegated(ssk, xchg(&subflow->delegated_status, 0));
                } else {
                        /* tcp_release_cb_override already processed
                         * the action or will do so at the next release_sock().
                         * In both cases we must dequeue the subflow here - on
                         * the same CPU that scheduled it.
                         */
                        smp_wmb();
                        clear_bit(MPTCP_DELEGATE_SCHEDULED, &subflow->delegated_status);
                }
                bh_unlock_sock(ssk);
                sock_put(ssk);

                if (++work_done == budget)
                        return budget;
        }

        /* always provide a 0 'work_done' argument, so that napi_complete_done
         * will not try accessing the NULL napi->dev ptr
         */
        napi_complete_done(napi, 0);
        return work_done;
}

/* Boot-time initialisation of the MPTCP protocol.
 *
 * Shares the TCP hashinfo, allocates the sockets counter, sets up one
 * delegated-action NAPI instance per possible CPU on a dummy netdev,
 * initialises the subflow, path-manager, scheduler and token subsystems and
 * finally registers the proto and the IPv4 protosw.  Failing to allocate
 * the counter or to register the proto panics.
 */
void __init mptcp_proto_init(void)
{
        struct mptcp_delegated_action *delegated;
        int cpu;

        mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;

        if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
                panic("Failed to allocate MPTCP pcpu counter\n");

        init_dummy_netdev(&mptcp_napi_dev);
        for_each_possible_cpu(cpu) {
                delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu);
                INIT_LIST_HEAD(&delegated->head);
                netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi,
                                  mptcp_napi_poll);
                napi_enable(&delegated->napi);
        }

        mptcp_subflow_init();
        mptcp_pm_init();
        mptcp_sched_init();
        mptcp_token_init();

        if (proto_register(&mptcp_prot, 1) != 0)
                panic("Failed to register MPTCP proto.\n");

        inet_register_protosw(&mptcp_protosw);

        /* the MPTCP control block must fit in the skb control buffer */
        BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* Socket-level operations for PF_INET6 MPTCP sockets.  Same MPTCP-specific
 * handlers as mptcp_stream_ops, with the inet6 variants of release,
 * getname, ioctl, sendmsg and recvmsg, plus a compat ioctl when
 * CONFIG_COMPAT is set.
 */
static const struct proto_ops mptcp_v6_stream_ops = {
        .family                   = PF_INET6,
        .owner                   = THIS_MODULE,
        .release           = inet6_release,
        .bind                   = mptcp_bind,
        .connect           = inet_stream_connect,
        .socketpair           = sock_no_socketpair,
        .accept                   = mptcp_stream_accept,
        .getname           = inet6_getname,
        .poll                   = mptcp_poll,
        .ioctl                   = inet6_ioctl,
        .gettstamp           = sock_gettstamp,
        .listen                   = mptcp_listen,
        .shutdown           = inet_shutdown,
        .setsockopt           = sock_common_setsockopt,
        .getsockopt           = sock_common_getsockopt,
        .sendmsg           = inet6_sendmsg,
        .recvmsg           = inet6_recvmsg,
        .mmap                   = sock_no_mmap,
#ifdef CONFIG_COMPAT
        .compat_ioctl           = inet6_compat_ioctl,
#endif
        .set_rcvlowat           = mptcp_set_rcvlowat,
};

/* IPv6 proto descriptor; filled in by mptcp_proto_v6_init() from a copy of
 * mptcp_prot.
 */
static struct proto mptcp_v6_prot;

/* Protocol switch entry tying SOCK_STREAM/IPPROTO_MPTCP to mptcp_v6_prot
 * and mptcp_v6_stream_ops; registered from mptcp_proto_v6_init().
 */
static struct inet_protosw mptcp_v6_protosw = {
        .type                = SOCK_STREAM,
        .protocol        = IPPROTO_MPTCP,
        .prot                = &mptcp_v6_prot,
        .ops                = &mptcp_v6_stream_ops,
        .flags                = INET_PROTOSW_ICSK,
};

/* Boot-time registration of the IPv6 flavour of the MPTCP protocol.
 *
 * mptcp_v6_prot starts as a copy of mptcp_prot with the name, slab,
 * obj_size and ipv6_pinfo_offset fields overridden.
 *
 * Returns 0 on success or the error from proto_register() /
 * inet6_register_protosw(); in the latter case the proto is unregistered
 * again before returning.
 */
int __init mptcp_proto_v6_init(void)
{
        int ret;

        mptcp_v6_prot = mptcp_prot;
        strscpy(mptcp_v6_prot.name, "MPTCPv6", sizeof(mptcp_v6_prot.name));
        mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np);
        mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);
        /* do not inherit the slab pointer copied from mptcp_prot */
        mptcp_v6_prot.slab = NULL;

        ret = proto_register(&mptcp_v6_prot, 1);
        if (ret)
                return ret;

        ret = inet6_register_protosw(&mptcp_v6_protosw);
        if (!ret)
                return 0;

        /* roll back the proto registration on protosw failure */
        proto_unregister(&mptcp_v6_prot);
        return ret;
}
#endif












































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
/* SPDX-License-Identifier: GPL-2.0-only */
/* include/net/xdp.h
 *
 * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
 */
#ifndef __LINUX_NET_XDP_H__
#define __LINUX_NET_XDP_H__

#include <linux/bitfield.h>
#include <linux/filter.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h> /* skb_shared_info */

/**
 * DOC: XDP RX-queue information
 *
 * The XDP RX-queue info (xdp_rxq_info) is associated with the driver
 * level RX-ring queues.  It is information that is specific to how
 * the driver has configured a given RX-ring queue.
 *
 * Each xdp_buff frame received in the driver carries a (pointer)
 * reference to this xdp_rxq_info structure.  This provides the XDP
 * data-path read-access to RX-info for both kernel and bpf-side
 * (limited subset).
 *
 * For now, direct access is only safe while running in NAPI/softirq
 * context.  Contents are read-mostly and must not be updated during
 * driver NAPI/softirq poll.
 *
 * The driver usage API is a register and unregister API.
 *
 * The struct is not directly tied to the XDP prog.  A new XDP prog
 * can be attached as long as it doesn't change the underlying
 * RX-ring.  If the RX-ring does change significantly, the NIC driver
 * naturally needs to stop the RX-ring before purging and reallocating
 * memory.  In that process the driver MUST call unregister (which
 * also applies for driver shutdown and unload).  The register API is
 * also mandatory during RX-ring setup.
 */

/* Memory model of the buffer backing an XDP frame; the value is stored in
 * xdp_mem_info.type.
 */
enum xdp_mem_type {
        MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
        MEM_TYPE_PAGE_ORDER0,     /* Orig XDP full page model */
        MEM_TYPE_PAGE_POOL,
        MEM_TYPE_XSK_BUFF_POOL,
        MEM_TYPE_MAX,             /* presumably a count/sentinel, not a real type */
};

/* XDP flags for ndo_xdp_xmit.  XDP_XMIT_FLAGS_MASK covers every flag
 * defined here (currently only XDP_XMIT_FLUSH).
 */
#define XDP_XMIT_FLUSH                (1U << 0)        /* doorbell signal consumer */
#define XDP_XMIT_FLAGS_MASK        XDP_XMIT_FLUSH

/* Identifies the memory model (and an id within it) of an XDP buffer. */
struct xdp_mem_info {
        u32 type; /* enum xdp_mem_type, but known size type */
        u32 id;
};

struct page_pool;

/* Per RX-queue information referenced by every xdp_buff received on that
 * queue (see the "XDP RX-queue information" DOC section of this header).
 */
struct xdp_rxq_info {
        struct net_device *dev;
        u32 queue_index;
        u32 reg_state;
        struct xdp_mem_info mem;
        unsigned int napi_id;
        u32 frag_size;
} ____cacheline_aligned; /* perf critical, avoid false-sharing */

/* TX-queue information referenced from xdp_buff.txq; only carries the
 * net_device pointer.
 */
struct xdp_txq_info {
        struct net_device *dev;
};

/* Bit flags stored in xdp_buff.flags and xdp_frame.flags. */
enum xdp_buff_flags {
        XDP_FLAGS_HAS_FRAGS                = BIT(0), /* non-linear xdp buff */
        XDP_FLAGS_FRAGS_PF_MEMALLOC        = BIT(1), /* xdp paged memory is under
                                                   * pressure
                                                   */
};

/* Descriptor of a packet buffer.  The pointer fields are set up by
 * xdp_prepare_buff(): data_hard_start is the start of the buffer, data the
 * start of the packet, data_end its end, and data_meta the start of the
 * metadata (or data + 1 when metadata is not valid).
 */
struct xdp_buff {
        void *data;
        void *data_end;
        void *data_meta;
        void *data_hard_start;
        struct xdp_rxq_info *rxq;
        struct xdp_txq_info *txq;
        u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom */
        u32 flags; /* supported values defined in xdp_buff_flags */
};

/* Return true when XDP_FLAGS_HAS_FRAGS is set on @xdp. */
static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp)
{
        return (xdp->flags & XDP_FLAGS_HAS_FRAGS) != 0;
}

/* Set XDP_FLAGS_HAS_FRAGS on @xdp, leaving the other flags untouched. */
static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp)
{
        xdp->flags = xdp->flags | XDP_FLAGS_HAS_FRAGS;
}

/* Clear XDP_FLAGS_HAS_FRAGS on @xdp, leaving the other flags untouched. */
static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp)
{
        xdp->flags = xdp->flags & ~XDP_FLAGS_HAS_FRAGS;
}

/* Return true when XDP_FLAGS_FRAGS_PF_MEMALLOC is set on @xdp. */
static __always_inline bool xdp_buff_is_frag_pfmemalloc(struct xdp_buff *xdp)
{
        return (xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC) != 0;
}

/* Set XDP_FLAGS_FRAGS_PF_MEMALLOC on @xdp, leaving other flags untouched. */
static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp)
{
        xdp->flags = xdp->flags | XDP_FLAGS_FRAGS_PF_MEMALLOC;
}

/* Initialise the queue-related part of @xdp: record the frame size and the
 * RX-queue info, and reset the flags.  The data pointers are not touched
 * here; they are set by xdp_prepare_buff().
 */
static __always_inline void
xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq)
{
        xdp->flags = 0;
        xdp->rxq = rxq;
        xdp->frame_sz = frame_sz;
}

/* Set up the data pointers of @xdp.
 *
 * @hard_start: start of the buffer
 * @headroom:   offset of the packet data from @hard_start
 * @data_len:   length of the packet data
 * @meta_valid: when false, data_meta is set to data + 1, i.e. the value
 *              xdp_data_meta_unsupported() tests for
 */
static __always_inline void
xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start,
                 int headroom, int data_len, const bool meta_valid)
{
        unsigned char *pkt = hard_start + headroom;
        unsigned char *meta = pkt;

        if (!meta_valid)
                meta = pkt + 1;

        xdp->data_hard_start = hard_start;
        xdp->data = pkt;
        xdp->data_end = pkt + data_len;
        xdp->data_meta = meta;
}

/* Reserve memory area at the end of the data area.
 *
 * This macro reserves tailroom in the XDP buffer by limiting the
 * XDP/BPF data access to data_hard_end.  Notice the same area (and size)
 * is used for XDP_PASS, when constructing the SKB via build_skb().
 *
 * Evaluates to data_hard_start + frame_sz minus the aligned size of
 * struct skb_shared_info; @xdp is evaluated twice.
 */
#define xdp_data_hard_end(xdp)                                \
        ((xdp)->data_hard_start + (xdp)->frame_sz -        \
         SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

/* Return the skb_shared_info area of @xdp, which sits at
 * xdp_data_hard_end(), i.e. in the reserved tailroom of the buffer.
 */
static inline struct skb_shared_info *
xdp_get_shared_info_from_buff(struct xdp_buff *xdp)
{
        void *hard_end = xdp_data_hard_end(xdp);

        return (struct skb_shared_info *)hard_end;
}

/* Return the total length of @xdp: the linear part (data_end - data) plus,
 * when the buffer has frags, the xdp_frags_size recorded in its shared info.
 */
static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp)
{
        unsigned int total = xdp->data_end - xdp->data;

        if (unlikely(xdp_buff_has_frags(xdp))) {
                struct skb_shared_info *sinfo;

                sinfo = xdp_get_shared_info_from_buff(xdp);
                total += sinfo->xdp_frags_size;
        }

        return total;
}

/* Self-contained description of an XDP packet.  For non zero-copy buffers
 * xdp_convert_buff_to_frame() stores it at data_hard_start, i.e. at the top
 * of the packet headroom; headroom counts the bytes between the end of this
 * struct and data.
 */
struct xdp_frame {
        void *data;
        u16 len;
        u16 headroom;
        u32 metasize; /* uses lower 8-bits */
        /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
         * while mem info is valid on remote CPU.
         */
        struct xdp_mem_info mem;
        struct net_device *dev_rx; /* used by cpumap */
        u32 frame_sz;
        u32 flags; /* supported values defined in xdp_buff_flags */
};

/* Return true when XDP_FLAGS_HAS_FRAGS is set on @frame. */
static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame)
{
        return (frame->flags & XDP_FLAGS_HAS_FRAGS) != 0;
}

/* Return true when XDP_FLAGS_FRAGS_PF_MEMALLOC is set on @frame. */
static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame)
{
        return (frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC) != 0;
}

/* Bulk queue holding up to XDP_BULK_QUEUE_SIZE entries in q[]; count is the
 * fill level and xa an opaque pointer reset by xdp_frame_bulk_init().
 * NOTE(review): judging by the prototypes below, this is the argument of
 * xdp_return_frame_bulk()/xdp_flush_frame_bulk() - confirm usage there.
 */
#define XDP_BULK_QUEUE_SIZE        16
struct xdp_frame_bulk {
        int count;
        void *xa;
        void *q[XDP_BULK_QUEUE_SIZE];
};

/* Prepare @bq for use.  Only bq->xa is reset here; bq->count is
 * deliberately left alone (see the comment in the body).
 */
static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq)
{
        /* bq->count will be zero'ed when bq->xa gets updated */
        bq->xa = NULL;
}

/* Return the skb_shared_info area of @frame.  The start of the buffer is
 * recomputed from frame->data, the headroom and the size of the xdp_frame
 * itself; the shared info then sits at the same offset that
 * xdp_data_hard_end() uses for an xdp_buff.
 */
static inline struct skb_shared_info *
xdp_get_shared_info_from_frame(struct xdp_frame *frame)
{
        void *hard_start = frame->data - frame->headroom - sizeof(*frame);
        void *hard_end = hard_start + frame->frame_sz -
                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

        return (struct skb_shared_info *)hard_end;
}

/* Counters for cpumap processing: one per outcome (redirect, pass, drop).
 * NOTE(review): the producer of these counters is outside this header.
 */
struct xdp_cpumap_stats {
        unsigned int redirect;
        unsigned int pass;
        unsigned int drop;
};

/* Wipe the kernel pointers stored in @frame (dev_rx and data). */
static inline void xdp_scrub_frame(struct xdp_frame *frame)
{
        frame->dev_rx = NULL;
        frame->data = NULL;
}

/* Account XDP frags on @skb.
 *
 * @nr_frags:   value written to the skb shared info nr_frags
 * @size:       bytes added to both skb->len and skb->data_len
 * @truesize:   bytes added to skb->truesize
 * @pfmemalloc: OR-ed into skb->pfmemalloc
 */
static inline void
xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags,
                           unsigned int size, unsigned int truesize,
                           bool pfmemalloc)
{
        struct skb_shared_info *sinfo = skb_shinfo(skb);

        sinfo->nr_frags = nr_frags;

        skb->truesize += truesize;
        skb->data_len += size;
        skb->len += size;
        skb->pfmemalloc |= pfmemalloc;
}

/* Avoids inlining WARN macro in fast-path: XDP_WARN() forwards the message
 * together with the caller's function name and line to the out-of-line
 * xdp_warn().
 */
void xdp_warn(const char *msg, const char *func, const int line);
#define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__)

/* Out-of-line frame/skb conversion helpers; only declared in this header.
 * xdp_convert_zc_to_xdp_frame() is the path taken by
 * xdp_convert_buff_to_frame() for MEM_TYPE_XSK_BUFF_POOL buffers.
 */
struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp);
struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
                                           struct sk_buff *skb,
                                           struct net_device *dev);
struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
                                         struct net_device *dev);
int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp);
struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);

/* Fill the data pointers, frame size and flags of @xdp from @frame.  The
 * rxq and txq members of @xdp are not written.
 */
static inline
void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
{
        void *pkt = frame->data;

        xdp->data = pkt;
        xdp->data_end = pkt + frame->len;
        xdp->data_meta = pkt - frame->metasize;
        /* the xdp_frame itself sits right before the recorded headroom */
        xdp->data_hard_start = pkt - frame->headroom - sizeof(*frame);

        xdp->flags = frame->flags;
        xdp->frame_sz = frame->frame_sz;
}

/* Fill @xdp_frame from @xdp.
 *
 * Returns 0 on success, or -ENOSPC when the headroom left after the
 * metadata cannot hold a struct xdp_frame or when data_end lies beyond
 * xdp_data_hard_end() (a warning is emitted in the latter case).  The mem
 * member of @xdp_frame is not written here.
 */
static inline
int xdp_update_frame_from_buff(struct xdp_buff *xdp,
                               struct xdp_frame *xdp_frame)
{
        int headroom = xdp->data - xdp->data_hard_start;
        int metasize = xdp->data - xdp->data_meta;

        /* data_meta past data means "no metadata": treat it as empty */
        if (metasize < 0)
                metasize = 0;

        /* Assure headroom is available for storing info */
        if (unlikely((headroom - metasize) < sizeof(*xdp_frame)))
                return -ENOSPC;

        /* Catch if driver didn't reserve tailroom for skb_shared_info */
        if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) {
                XDP_WARN("Driver BUG: missing reserved tailroom");
                return -ENOSPC;
        }

        xdp_frame->flags = xdp->flags;
        xdp_frame->frame_sz = xdp->frame_sz;
        xdp_frame->metasize = metasize;
        /* headroom excludes the space taken by the xdp_frame itself */
        xdp_frame->headroom = headroom - sizeof(*xdp_frame);
        xdp_frame->len  = xdp->data_end - xdp->data;
        xdp_frame->data = xdp->data;

        return 0;
}

/* Convert @xdp into an xdp_frame.
 *
 * MEM_TYPE_XSK_BUFF_POOL buffers go through xdp_convert_zc_to_xdp_frame().
 * For every other memory type the frame is stored at the top of the packet
 * headroom (data_hard_start) and inherits the mem info of the RX queue.
 *
 * Returns the frame, or NULL when xdp_update_frame_from_buff() fails.
 */
static inline
struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp)
{
        struct xdp_frame *frame;

        if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
                return xdp_convert_zc_to_xdp_frame(xdp);

        /* Store info in top of packet */
        frame = xdp->data_hard_start;
        if (likely(xdp_update_frame_from_buff(xdp, frame) >= 0)) {
                /* rxq only valid until napi_schedule ends, convert to
                 * xdp_mem_info
                 */
                frame->mem = xdp->rxq->mem;
                return frame;
        }

        return NULL;
}

/* Out-of-line helpers releasing XDP buffers/frames; only declared in this
 * header.
 */
void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
                  struct xdp_buff *xdp);
void xdp_return_frame(struct xdp_frame *xdpf);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
void xdp_return_buff(struct xdp_buff *xdp);
void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq);
void xdp_return_frame_bulk(struct xdp_frame *xdpf,
                           struct xdp_frame_bulk *bq);

/* Return the total length of @xdpf: the linear length plus, when the frame
 * has frags, the xdp_frags_size recorded in its shared info.
 */
static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf)
{
        unsigned int total = xdpf->len;

        if (unlikely(xdp_frame_has_frags(xdpf))) {
                struct skb_shared_info *sinfo;

                sinfo = xdp_get_shared_info_from_frame(xdpf);
                total += sinfo->xdp_frags_size;
        }

        return total;
}

/* Register @xdp_rxq; out-of-line, only declared here.  The inline
 * xdp_rxq_info_reg() wrapper calls it with a frag_size of 0.
 */
int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
                       struct net_device *dev, u32 queue_index,
                       unsigned int napi_id, u32 frag_size);
/* Register @xdp_rxq via __xdp_rxq_info_reg() with a frag_size of 0 and
 * return that function's result.
 */
static inline int
xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
                 struct net_device *dev, u32 queue_index,
                 unsigned int napi_id)
{
        const u32 frag_size = 0;

        return __xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id,
                                  frag_size);
}

/* Out-of-line RX-queue info and memory-model registration API; only
 * declared in this header.  See the "XDP RX-queue information" DOC section
 * above for the register/unregister requirements placed on drivers.
 */
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
                               enum xdp_mem_type type, void *allocator);
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq);
int xdp_reg_mem_model(struct xdp_mem_info *mem,
                      enum xdp_mem_type type, void *allocator);
void xdp_unreg_mem_model(struct xdp_mem_info *mem);

/* Drivers not supporting XDP metadata can use this helper, which
 * rejects any room expansion for metadata as a result.
 *
 * data_meta is set to data + 1, the condition that
 * xdp_data_meta_unsupported() checks for.
 */
static __always_inline void
xdp_set_data_meta_invalid(struct xdp_buff *xdp)
{
        xdp->data_meta = xdp->data + 1;
}

/* Return true when @xdp carries no usable metadata area, i.e. data_meta
 * points past data (as set by xdp_set_data_meta_invalid()).  The result is
 * annotated as unlikely.
 */
static __always_inline bool
xdp_data_meta_unsupported(const struct xdp_buff *xdp)
{
        return unlikely(xdp->data_meta > xdp->data);
}

static inline bool xdp_metalen_invalid(unsigned long metalen)
{
        unsigned long meta_max;

        meta_max = type_max(typeof_member(struct skb_shared_info, meta_len));
        BUILD_BUG_ON(!__builtin_constant_p(meta_max));

        return !IS_ALIGNED(metalen, sizeof(u32)) || metalen > meta_max;
}

/* An attached XDP program together with its flags; the argument of
 * xdp_attachment_setup().
 */
struct xdp_attachment_info {
        struct bpf_prog *prog;
        u32 flags;
};

/* struct netdev_bpf is only used through a pointer here, so a forward
 * declaration is sufficient.  xdp_attachment_setup() is implemented out of
 * line.
 */
struct netdev_bpf;
void xdp_attachment_setup(struct xdp_attachment_info *info,
                          struct netdev_bpf *bpf);

#define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE

/* Define the relationship between xdp-rx-metadata kfunc and
 * various other entities. Each XDP_METADATA_KFUNC() entry lists,
 * in this order:
 * - xdp_rx_metadata enum
 * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml)
 * - kfunc name
 * - xdp_metadata_ops field
 *
 * Users define XDP_METADATA_KFUNC() to pick the argument(s) they need,
 * expand XDP_METADATA_KFUNC_xxx, and then #undef it again.
 */
#define XDP_METADATA_KFUNC_xxx        \
        XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \
                           NETDEV_XDP_RX_METADATA_TIMESTAMP, \
                           bpf_xdp_metadata_rx_timestamp, \
                           xmo_rx_timestamp) \
        XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \
                           NETDEV_XDP_RX_METADATA_HASH, \
                           bpf_xdp_metadata_rx_hash, \
                           xmo_rx_hash) \
        XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \
                           NETDEV_XDP_RX_METADATA_VLAN_TAG, \
                           bpf_xdp_metadata_rx_vlan_tag, \
                           xmo_rx_vlan_tag) \

/* One enumerator per XDP_METADATA_KFUNC() entry (its first argument), in
 * list order starting at 0. MAX_XDP_METADATA_KFUNC follows the last entry
 * and therefore equals the number of entries.
 */
enum xdp_rx_metadata {
#define XDP_METADATA_KFUNC(name, _, __, ___) name,
XDP_METADATA_KFUNC_xxx
#undef XDP_METADATA_KFUNC
MAX_XDP_METADATA_KFUNC,
};

/* RSS hash type, handed back through the rss_type argument of the
 * xmo_rx_hash() callback. The first part defines single-bit building
 * blocks; the second part defines named combinations of those bits.
 */
enum xdp_rss_hash_type {
        /* First part: Individual bits for L3/L4 types */
        XDP_RSS_L3_IPV4                = BIT(0),
        XDP_RSS_L3_IPV6                = BIT(1),

        /* The fixed (L3) IPv4 and IPv6 headers can both be followed by
         * variable/dynamic headers, IPv4 called Options and IPv6 called
         * Extension Headers. HW RSS type can contain this info.
         */
        XDP_RSS_L3_DYNHDR        = BIT(2),

        /* When RSS hash covers L4 then drivers MUST set XDP_RSS_L4 bit in
         * addition to the protocol specific bit.  This eases interaction with
         * SKBs and avoids reserving a fixed mask for future L4 protocol bits.
         */
        XDP_RSS_L4                = BIT(3), /* L4 based hash, proto can be unknown */
        XDP_RSS_L4_TCP                = BIT(4),
        XDP_RSS_L4_UDP                = BIT(5),
        XDP_RSS_L4_SCTP                = BIT(6),
        XDP_RSS_L4_IPSEC        = BIT(7), /* L4 based hash includes IPSEC SPI */
        XDP_RSS_L4_ICMP                = BIT(8),

        /* Second part: RSS hash type combinations used for driver HW mapping */
        XDP_RSS_TYPE_NONE            = 0,
        XDP_RSS_TYPE_L2              = XDP_RSS_TYPE_NONE,

        /* L3-only hashes, optionally with dynamic headers present */
        XDP_RSS_TYPE_L3_IPV4         = XDP_RSS_L3_IPV4,
        XDP_RSS_TYPE_L3_IPV6         = XDP_RSS_L3_IPV6,
        XDP_RSS_TYPE_L3_IPV4_OPT     = XDP_RSS_L3_IPV4 | XDP_RSS_L3_DYNHDR,
        XDP_RSS_TYPE_L3_IPV6_EX      = XDP_RSS_L3_IPV6 | XDP_RSS_L3_DYNHDR,

        /* L4 hashes: every combination carries XDP_RSS_L4 plus the
         * protocol specific bit, as required above.
         */
        XDP_RSS_TYPE_L4_ANY          = XDP_RSS_L4,
        XDP_RSS_TYPE_L4_IPV4_TCP     = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_TCP,
        XDP_RSS_TYPE_L4_IPV4_UDP     = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP,
        XDP_RSS_TYPE_L4_IPV4_SCTP    = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP,
        XDP_RSS_TYPE_L4_IPV4_IPSEC   = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC,
        XDP_RSS_TYPE_L4_IPV4_ICMP    = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_ICMP,

        XDP_RSS_TYPE_L4_IPV6_TCP     = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP,
        XDP_RSS_TYPE_L4_IPV6_UDP     = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP,
        XDP_RSS_TYPE_L4_IPV6_SCTP    = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP,
        XDP_RSS_TYPE_L4_IPV6_IPSEC   = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC,
        XDP_RSS_TYPE_L4_IPV6_ICMP    = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_ICMP,

        /* IPv6 L4 hashes with extension headers present */
        XDP_RSS_TYPE_L4_IPV6_TCP_EX  = XDP_RSS_TYPE_L4_IPV6_TCP  | XDP_RSS_L3_DYNHDR,
        XDP_RSS_TYPE_L4_IPV6_UDP_EX  = XDP_RSS_TYPE_L4_IPV6_UDP  | XDP_RSS_L3_DYNHDR,
        XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR,
};

/* Callback table behind the rx-metadata kfuncs; the field names are the
 * ones listed as the last argument of each XDP_METADATA_KFUNC_xxx entry.
 * Each callback receives the XDP context and writes its result through the
 * output pointer(s).
 * NOTE(review): the int return convention (0 / negative errno?) is not
 * visible here -- confirm against the implementations.
 */
struct xdp_metadata_ops {
        int        (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp);
        int        (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash,
                               enum xdp_rss_hash_type *rss_type);
        int        (*xmo_rx_vlan_tag)(const struct xdp_md *ctx, __be16 *vlan_proto,
                                   u16 *vlan_tci);
};

#ifdef CONFIG_NET
u32 bpf_xdp_metadata_kfunc_id(int id);
bool bpf_dev_bound_kfunc_id(u32 btf_id);
void xdp_set_features_flag(struct net_device *dev, xdp_features_t val);
void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg);
void xdp_features_clear_redirect_target(struct net_device *dev);
#else
/* !CONFIG_NET: inert stand-ins so that callers build unchanged. The id
 * lookup yields 0, the bound-kfunc test yields false, and the feature
 * helpers do nothing.
 */
static inline u32 bpf_xdp_metadata_kfunc_id(int id)
{
        return 0;
}

static inline bool bpf_dev_bound_kfunc_id(u32 btf_id)
{
        return false;
}

static inline void xdp_set_features_flag(struct net_device *dev,
                                         xdp_features_t val)
{
}

static inline void xdp_features_set_redirect_target(struct net_device *dev,
                                                    bool support_sg)
{
}

static inline void xdp_features_clear_redirect_target(struct net_device *dev)
{
}
#endif /* CONFIG_NET */

static inline void xdp_clear_features_flag(struct net_device *dev)
{
        xdp_set_features_flag(dev, 0);
}

/* Run @prog on @xdp and return its verdict. When the master-redirect static
 * key is enabled, an XDP_TX verdict received on a bond slave is replaced by
 * the result of xdp_master_redirect().
 */
static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
                                            struct xdp_buff *xdp)
{
        u32 verdict;

        /* Driver XDP hooks run inside a single NAPI poll cycle, i.e. under
         * local_bh_disable(), and that supplies the RCU protection needed
         * for accessing map entries.
         */
        verdict = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp));

        if (!static_branch_unlikely(&bpf_master_redirect_enabled_key))
                return verdict;

        if (verdict == XDP_TX && netif_is_bond_slave(xdp->rxq->dev))
                verdict = xdp_master_redirect(xdp);

        return verdict;
}
#endif /* __LINUX_NET_XDP_H__ */
































































































































































































































































































































































    1 
    1 
    1 
    1 






























































































































































































































































    3 


    4 























































































































































































































































    2 


































































































































































































































































































































































































































































































































    1 


    1 





    1 







    1 


































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* internal.h: mm/ internal definitions
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/tracepoint-defs.h>

struct folio_batch;

/*
 * The set of flags that only affect watermark checking and reclaim
 * behaviour. This is used by the MM to obey the caller constraints
 * about IO, FS and watermark checking while ignoring placement
 * hints such as HIGHMEM usage.
 */
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
                        __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
                        __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
                        __GFP_NOLOCKDEP)

/*
 * The GFP flags allowed during early boot: every bit of __GFP_BITS_MASK
 * except the reclaim, IO and FS ones.
 */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))

/* Control allocation cpuset and node placement constraints */
#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)

/*
 * Do not use these with a slab allocator: DMA32, HIGHMEM, and any bit
 * that lies outside __GFP_BITS_MASK.
 */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)

/*
 * Different from WARN_ON_ONCE(), no warning will be issued
 * when we specify __GFP_NOWARN.
 *
 * @cond: condition to test; evaluated exactly once.
 * @gfp:  GFP flags expression. It is parenthesized in the expansion so a
 *        compound argument such as "a | b" is masked as a whole.
 *
 * Evaluates to the truth value of @cond. The warning fires at most once
 * per call site (tracked by a static flag).
 */
#define WARN_ON_ONCE_GFP(cond, gfp)        ({                                \
        static bool __section(".data.once") __warned;                        \
        int __ret_warn_once = !!(cond);                                        \
                                                                        \
        if (unlikely(!((gfp) & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \
                __warned = true;                                        \
                WARN_ON(1);                                                \
        }                                                                \
        unlikely(__ret_warn_once);                                        \
})

void page_writeback_init(void);

/*
 * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
 * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit
 * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE).  Hugetlb currently
 * leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
 *
 * FOLIO_PAGES_MAPPED is the mask of all bits below ENTIRELY_MAPPED
 * (0x7fffff).
 */
#define ENTIRELY_MAPPED                0x800000
#define FOLIO_PAGES_MAPPED        (ENTIRELY_MAPPED - 1)

/*
 * Flags passed to __show_mem() and show_free_areas() to suppress output in
 * various contexts.
 */
#define SHOW_MEM_FILTER_NODES                (0x0001u)        /* disallowed nodes */

/*
 * Number of individual pages with an elevated _mapcount: the
 * FOLIO_PAGES_MAPPED bits of _nr_pages_mapped, so the folio's
 * entire_mapcount is excluded.
 *
 * Only debugging code should call this function.
 */
static inline int folio_nr_pages_mapped(const struct folio *folio)
{
        const int raw = atomic_read(&folio->_nr_pages_mapped);

        return raw & FOLIO_PAGES_MAPPED;
}

/*
 * Given any swap entry that belongs to @folio, return the folio's first
 * entry by aligning the entry value down to folio_nr_pages(). folio->swap
 * cannot be relied upon here because nothing guarantees that it has been
 * initialized. Used for calling arch_swap_restore()
 */
static inline swp_entry_t folio_swap(swp_entry_t entry,
                const struct folio *folio)
{
        return (swp_entry_t) {
                .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)),
        };
}

/* Return folio->mapping with the PAGE_MAPPING_FLAGS bits masked off. */
static inline void *folio_raw_mapping(const struct folio *folio)
{
        return (void *)((unsigned long)folio->mapping & ~PAGE_MAPPING_FLAGS);
}

#ifdef CONFIG_MMU

/*
 * Flags for folio_pte_batch(). Each flag is a distinct bit, so they can be
 * combined with bitwise OR.
 */
typedef int __bitwise fpb_t;

/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */
#define FPB_IGNORE_DIRTY                ((__force fpb_t)BIT(0))

/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */
#define FPB_IGNORE_SOFT_DIRTY                ((__force fpb_t)BIT(1))

/*
 * Normalise @pte for batch comparison: drop the dirty and/or soft-dirty bit
 * when the matching FPB_IGNORE_* flag is set, then always mark the entry old
 * and write-protected.
 */
static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
{
        if (flags & FPB_IGNORE_DIRTY)
                pte = pte_mkclean(pte);

        if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
                pte = pte_clear_soft_dirty(pte);

        pte = pte_mkold(pte);

        return pte_wrprotect(pte);
}

/**
 * folio_pte_batch - detect a PTE batch for a large folio
 * @folio: The large folio to detect a PTE batch for.
 * @addr: The user virtual address the first page is mapped at.
 * @start_ptep: Page table pointer for the first entry.
 * @pte: Page table entry for the first page.
 * @max_nr: The maximum number of table entries to consider.
 * @flags: Flags to modify the PTE batch semantics.
 * @any_writable: Optional pointer; set to true if any entry other than the
 *                  first one is writable, false otherwise.
 * @any_young: Optional pointer; set to true if any entry other than the
 *                  first one is young, false otherwise.
 * @any_dirty: Optional pointer; set to true if any entry other than the
 *                  first one is dirty, false otherwise.
 *
 * A PTE batch is a run of consecutive (present) PTEs that map consecutive
 * pages of the same large folio.
 *
 * Within a batch all PTEs carry the same PTE bits, not counting the PFN,
 * the accessed bit, the writable bit, the dirty bit (with FPB_IGNORE_DIRTY)
 * and the soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
 *
 * start_ptep must map any page of the folio. max_nr must be at least one and
 * must be limited by the caller so scanning cannot exceed a single page table.
 *
 * Return: the number of table entries in the batch.
 */
static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
                pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
                bool *any_writable, bool *any_young, bool *any_dirty)
{
        const unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
        const pte_t *end_ptep = start_ptep + max_nr;
        bool writable = false, young = false, dirty = false;
        pte_t expected_pte;
        pte_t *ptep;
        int nr;

        /* Optional outputs start out as "not seen". */
        if (any_writable)
                *any_writable = false;
        if (any_young)
                *any_young = false;
        if (any_dirty)
                *any_dirty = false;

        VM_WARN_ON_FOLIO(!pte_present(pte), folio);
        VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
        VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);

        /* The first entry (plus whatever the hint covers) is always taken. */
        nr = pte_batch_hint(start_ptep, pte);
        expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);

        for (ptep = start_ptep + nr; ptep < end_ptep; ptep += nr) {
                pte = ptep_get(ptep);

                /* Sample the bits before they are cleared for comparison. */
                if (any_writable)
                        writable = !!pte_write(pte);
                if (any_young)
                        young = !!pte_young(pte);
                if (any_dirty)
                        dirty = !!pte_dirty(pte);
                pte = __pte_batch_clear_ignored(pte, flags);

                /*
                 * Besides a mismatching entry, the end of the folio also
                 * terminates the batch right away: in corner cases the next
                 * PFN might fall into a different folio.
                 */
                if (!pte_same(pte, expected_pte) ||
                    pte_pfn(pte) >= folio_end_pfn)
                        break;

                if (any_writable)
                        *any_writable |= writable;
                if (any_young)
                        *any_young |= young;
                if (any_dirty)
                        *any_dirty |= dirty;

                nr = pte_batch_hint(ptep, pte);
                expected_pte = pte_advance_pfn(expected_pte, nr);
        }

        /* The last hint may have stepped past max_nr entries; clamp. */
        return min(ptep - start_ptep, max_nr);
}

/**
 * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
 * @pte: The initial pte state; is_swap_pte(pte) must be true and
 *         non_swap_entry() must be false.
 *
 * Builds a pte for the same swap type with the offset advanced by one, and
 * carries over the soft-dirty, exclusive and uffd-wp swp pte bits of @pte.
 * The resulting pte is returned.
 */
static inline pte_t pte_next_swp_offset(pte_t pte)
{
        swp_entry_t cur = pte_to_swp_entry(pte);
        pte_t next = __swp_entry_to_pte(__swp_entry(swp_type(cur),
                                                    (swp_offset(cur) + 1)));

        /* Re-apply each swp pte bit that was set on the input. */
        if (pte_swp_soft_dirty(pte))
                next = pte_swp_mksoft_dirty(next);
        if (pte_swp_exclusive(pte))
                next = pte_swp_mkexclusive(next);
        if (pte_swp_uffd_wp(pte))
                next = pte_swp_mkuffd_wp(next);

        return next;
}

/**
 * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
 * @start_ptep: Page table pointer for the first entry.
 * @max_nr: The maximum number of table entries to consider.
 * @pte: Page table entry for the first entry.
 *
 * A batch of contiguous swap entries is a run of consecutive (non-present)
 * PTEs holding swap entries of the same swap type with consecutive offsets
 * and matching swp pte bits.
 *
 * max_nr must be at least one and must be limited by the caller so scanning
 * cannot exceed a single page table.
 *
 * Return: the number of table entries in the batch.
 */
static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
{
        pte_t expected_pte = pte_next_swp_offset(pte);
        const pte_t *end_ptep = start_ptep + max_nr;
        pte_t *ptep;

        VM_WARN_ON(max_nr < 1);
        VM_WARN_ON(!is_swap_pte(pte));
        VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));

        /* Entry 0 is the caller's @pte; scanning starts at entry 1. */
        for (ptep = start_ptep + 1; ptep < end_ptep; ptep++) {
                pte = ptep_get(ptep);

                if (!pte_same(pte, expected_pte))
                        break;

                expected_pte = pte_next_swp_offset(expected_pte);
        }

        return ptep - start_ptep;
}
#endif /* CONFIG_MMU */

void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
                                                int nr_throttled);

/*
 * Forward to __acct_reclaim_writeback() only when the folio's node reports
 * a non-zero nr_writeback_throttled count; otherwise do nothing.
 */
static inline void acct_reclaim_writeback(struct folio *folio)
{
        pg_data_t *node = folio_pgdat(folio);
        const int throttled = atomic_read(&node->nr_writeback_throttled);

        if (!throttled)
                return;

        __acct_reclaim_writeback(node, folio, throttled);
}

/*
 * Wake the waiters on @pgdat's VMSCAN_THROTTLE_ISOLATED reclaim wait queue,
 * if the queue is active.
 */
static inline void wake_throttle_isolated(pg_data_t *pgdat)
{
        wait_queue_head_t *queue = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];

        if (!waitqueue_active(queue))
                return;

        wake_up(queue);
}

/*
 * Prototypes for mm-internal helpers (fault handling, folio LRU/writeback
 * handling, page-table teardown and readahead). Only the declarations are
 * visible here.
 * NOTE(review): the defining source files are not identified in this
 * section of the header.
 */
vm_fault_t vmf_anon_prepare(struct vm_fault *vmf);
vm_fault_t do_swap_page(struct vm_fault *vmf);
void folio_rotate_reclaimable(struct folio *folio);
bool __folio_end_writeback(struct folio *folio);
void deactivate_file_folio(struct folio *folio);
void folio_activate(struct folio *folio);

void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                   struct vm_area_struct *start_vma, unsigned long floor,
                   unsigned long ceiling, bool mm_wr_locked);
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);

/* Forward declaration: only a pointer to it is needed below. */
struct zap_details;
void unmap_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end,
                             struct zap_details *details);

void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
                unsigned int order);
void force_page_cache_ra(struct readahead_control *, unsigned long nr);

/*
 * Set up a local readahead control for @mapping and @file at @index, using
 * the file's own f_ra state, and pass it to force_page_cache_ra() together
 * with @nr_to_read.
 */
static inline void force_page_cache_readahead(struct address_space *mapping,
                struct file *file, pgoff_t index, unsigned long nr_to_read)
{
        DEFINE_READAHEAD(rac, file, &file->f_ra, mapping, index);

        force_page_cache_ra(&rac, nr_to_read);
}

/*
 * Prototypes for page-cache lookup, truncation and invalidation helpers.
 * Only the declarations are visible here.
 * NOTE(review): return-value semantics are not shown in this section --
 * confirm against the definitions before relying on them.
 */
unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
void filemap_free_folio(struct address_space *mapping, struct folio *folio);
int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
                loff_t end);
long mapping_evict_folio(struct address_space *mapping, struct folio *folio);
unsigned long mapping_try_invalidate(struct address_space *mapping,
                pgoff_t start, pgoff_t end, unsigned long *nr_failed);

/**
 * folio_evictable - Test whether a folio is evictable.
 * @folio: The folio to test.
 *
 * An evictable folio belongs on the active/inactive lists, a non-evictable
 * one on the unevictable list.
 *
 * A folio is reported as not evictable when:
 * 1. its mapping is marked unevictable, or
 * 2. the folio is flagged mlocked (one of its pages is part of an mlocked
 *    VMA).
 */
static inline bool folio_evictable(struct folio *folio)
{
        bool evictable;

        /*
         * The RCU read section keeps the address_space of an inode or of
         * the swap cache from being freed while it is inspected.
         */
        rcu_read_lock();
        evictable = !(mapping_unevictable(folio_mapping(folio)) ||
                      folio_test_mlocked(folio));
        rcu_read_unlock();

        return evictable;
}

/*
 * Give a page that currently has no references (->_refcount == 0) its
 * first reference, i.e. a count of one. Tail pages and pages that already
 * hold a reference trip the VM_BUG_ON_PAGE() checks.
 */
static inline void set_page_refcounted(struct page *page)
{
        const int first_ref = 1;

        VM_BUG_ON_PAGE(PageTail(page), page);
        VM_BUG_ON_PAGE(page_ref_count(page), page);

        set_page_count(page, first_ref);
}

/*
 * Return true if a folio needs ->release_folio() calling upon it.
 */
static inline bool folio_needs_release(struct folio *folio)
{
        struct address_space *mapping = folio_mapping(folio);

        return folio_has_private(folio) ||
                (mapping && mapping_release_always(mapping));
}

extern unsigned long highest_memmap_pfn;

/*
 * Maximum number of reclaim retries without progress before the OOM
 * killer is considered the only way forward.
 */
#define MAX_RECLAIM_RETRIES 16

/*
 * in mm/vmscan.c:
 */
bool isolate_lru_page(struct page *page);
bool folio_isolate_lru(struct folio *folio);
void putback_lru_page(struct page *page);
void folio_putback_lru(struct folio *folio);
extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);

/*
 * in mm/rmap.c:
 */
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);

/*
 * in mm/page_alloc.c
 */
/* Convert a page count to KiB: shift left by (PAGE_SHIFT - 10). */
#define K(x) ((x) << (PAGE_SHIFT-10))

extern char * const zone_names[MAX_NR_ZONES];

/* perform sanity checks on struct pages being allocated or freed */
DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);

extern int min_free_kbytes;

void setup_per_zone_wmarks(void);
void calculate_min_free_kbytes(void);
int __meminit init_per_zone_wmark_min(void);
void page_alloc_sysctl_init(void);

/*
 * Structure for holding the mostly immutable allocation parameters passed
 * between functions involved in allocations, including the alloc_pages*
 * family of functions.
 *
 * nodemask, migratetype and highest_zoneidx are initialized only once in
 * __alloc_pages() and then never change.
 *
 * zonelist, preferred_zoneref and highest_zoneidx are set first in
 * __alloc_pages() for the fast path, and might be later changed
 * in __alloc_pages_slowpath(). All other functions pass the whole structure
 * by a const pointer.
 */
struct alloc_context {
        struct zonelist *zonelist;
        nodemask_t *nodemask;
        struct zoneref *preferred_zoneref;
        int migratetype;

        /*
         * highest_zoneidx represents highest usable zone index of
         * the allocation request. Due to the nature of the zone,
         * memory on lower zone than the highest_zoneidx will be
         * protected by lowmem_reserve[highest_zoneidx].
         *
         * highest_zoneidx is also used by reclaim/compaction to limit
         * the target zone since higher zone than this index cannot be
         * usable for this allocation request.
         */
        enum zone_type highest_zoneidx;
        /* NOTE(review): meaning not documented in this header -- confirm. */
        bool spread_dirty_pages;
};

/*
 * Return the order of a free page in the buddy system, which is kept in
 * page_private(page). In general the caller must hold
 * page_zone(page)->lock so the page cannot be allocated in parallel, which
 * would make the returned order garbage. A caller that does not hold
 * page_zone(page)->lock must guarantee that the page cannot be allocated or
 * merged in parallel. Alternatively, it must handle invalid values
 * gracefully, and use buddy_order_unsafe() below.
 */
static inline unsigned int buddy_order(struct page *page)
{
        /* The caller is responsible for having checked PageBuddy(). */
        unsigned int order = page_private(page);

        return order;
}

/*
 * Variant of buddy_order() for callers who cannot afford to hold the zone
 * lock. PageBuddy() should be checked first by the caller to minimize the
 * race window, and invalid values must be handled gracefully.
 *
 * READ_ONCE is used so that if the caller assigns the result into a local
 * variable and e.g. tests it for valid range before using, the compiler cannot
 * decide to remove the variable and inline the page_private(page) multiple
 * times, potentially observing different values in the tests and the actual
 * use of the result.
 */
#define buddy_order_unsafe(page)        READ_ONCE(page_private(page))

/*
 * This function checks whether a page is free && is the buddy
 * we can coalesce a page and its buddy if
 * (a) the buddy is not in a hole (check before calling!) &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we set PageBuddy.
 * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
static inline bool page_is_buddy(struct page *page, struct page *buddy,
                                 unsigned int order)
{
        /* The candidate must be a guard page or be flagged PageBuddy. */
        if (!(page_is_guard(buddy) || PageBuddy(buddy)))
                return false;

        /*
         * The order comparison comes first; the zone id comparison is done
         * late so that zone/node ids are not calculated for pages that
         * could never merge.
         */
        if (buddy_order(buddy) != order ||
            page_zone_id(page) != page_zone_id(buddy))
                return false;

        /* Debug check: an accepted buddy is expected to have page_count 0. */
        VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

        return true;
}

/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined O(n+1) page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (B1) is #8, its order
 * 1 twin (B2) is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER
 */
static inline unsigned long
__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
{
        /*
         * Flip bit @order of the pfn to get the pfn of the order-@order twin.
         * The mask is built as unsigned long so that it has the same width
         * as the pfn: a plain int "1 << order" is undefined for order >= 31
         * and would sign-extend into the upper pfn bits on 64-bit.
         */
        return page_pfn ^ (1UL << order);
}

/*
 * Find the buddy of @page and validate it.
 * @page: The input page
 * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the
 *       function is used in the performance-critical __free_one_page().
 * @order: The order of the page
 * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to
 *             page_to_pfn().
 *
 * The found buddy can be a non PageBuddy page, be outside @page's zone, or
 * have an order different from @page's. Validation is necessary before using it.
 *
 * Return: the found buddy page or NULL if not found.
 */
static inline struct page *find_buddy_page_pfn(struct page *page,
                        unsigned long pfn, unsigned int order, unsigned long *buddy_pfn)
{
        const unsigned long candidate_pfn = __find_buddy_pfn(pfn, order);
        /* Step from @page by the pfn distance to reach the candidate page. */
        struct page *candidate = page + (candidate_pfn - pfn);

        /* The computed pfn is reported even if validation below fails. */
        if (buddy_pfn)
                *buddy_pfn = candidate_pfn;

        return page_is_buddy(page, candidate, order) ? candidate : NULL;
}

extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
                                unsigned long end_pfn, struct zone *zone);

static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
                                unsigned long end_pfn, struct zone *zone)
{
        /* Zone not marked contiguous: defer to the out-of-line helper. */
        if (!zone->contiguous)
                return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);

        /* Zone marked contiguous: translate the start pfn directly. */
        return pfn_to_page(start_pfn);
}

void set_zone_contiguous(struct zone *zone);

/*
 * Clear the zone's "contiguous" flag, so that pageblock_pfn_to_page() stops
 * taking its direct pfn_to_page() shortcut for this zone.
 */
static inline void clear_zone_contiguous(struct zone *zone)
{
        zone->contiguous = false;
}

extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __putback_isolated_page(struct page *page, unsigned int order,
                                    int mt);
extern void memblock_free_pages(struct page *page, unsigned long pfn,
                                        unsigned int order);
extern void __free_pages_core(struct page *page, unsigned int order);

/*
 * This will have no effect, other than possibly generating a warning, if the
 * caller passes in a non-large folio.
 */
static inline void folio_set_order(struct folio *folio, unsigned int order)
{
        unsigned long flags_1;

        /* Order 0 or a non-large folio: warn once and change nothing. */
        if (WARN_ON_ONCE(!order || !folio_test_large(folio)))
                return;

        /* Replace the low byte of _flags_1 with the order. */
        flags_1 = folio->_flags_1;
        flags_1 &= ~0xffUL;
        flags_1 |= order;
        folio->_flags_1 = flags_1;
#ifdef CONFIG_64BIT
        /* The page count is only recorded on 64-bit builds. */
        folio->_folio_nr_pages = 1U << order;
#endif
}

void folio_undo_large_rmappable(struct folio *folio);

static inline struct folio *page_rmappable_folio(struct page *page)
{
        struct folio *folio = (struct folio *)page;

        /* NULL and non-large folios are handed back unchanged. */
        if (!folio || !folio_test_large(folio))
                return folio;

        folio_set_large_rmappable(folio);
        return folio;
}

static inline void prep_compound_head(struct page *page, unsigned int order)
{
        struct folio *head = (struct folio *)page;

        folio_set_order(head, order);

        /* Both mapcounts start at -1; the mapped and pin counters at 0. */
        atomic_set(&head->_large_mapcount, -1);
        atomic_set(&head->_entire_mapcount, -1);
        atomic_set(&head->_nr_pages_mapped, 0);
        atomic_set(&head->_pincount, 0);

        /* The deferred list is only initialised for orders above 1. */
        if (order <= 1)
                return;

        INIT_LIST_HEAD(&head->_deferred_list);
}

static inline void prep_compound_tail(struct page *head, int tail_idx)
{
        struct page *tail = &head[tail_idx];

        /* Mark as a tail page, link it to its head, and clear its private. */
        tail->mapping = TAIL_MAPPING;
        set_compound_head(tail, head);
        set_page_private(tail, 0);
}

extern void prep_compound_page(struct page *page, unsigned int order);

extern void post_alloc_hook(struct page *page, unsigned int order,
                                        gfp_t gfp_flags);
extern bool free_pages_prepare(struct page *page, unsigned int order);

extern int user_min_free_kbytes;

void free_unref_page(struct page *page, unsigned int order);
void free_unref_folios(struct folio_batch *fbatch);

extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
extern void zone_pcp_enable(struct zone *zone);
extern void zone_pcp_init(struct zone *zone);

extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
                          phys_addr_t min_addr,
                          int nid, bool exact_nid);

void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
                unsigned long, enum meminit_context, struct vmem_altmap *, int);

#if defined CONFIG_COMPACTION || defined CONFIG_CMA

/*
 * in mm/compaction.c
 */
/*
 * compact_control is used to track pages being migrated and the free pages
 * they are being migrated to during memory compaction. The free_pfn starts
 * at the end of a zone and migrate_pfn begins at the start. Movable pages
 * are moved to the end of a zone during a compaction run and the run
 * completes when free_pfn <= migrate_pfn
 */
/*
 * Note: gfp_mask, alloc_flags and highest_zoneidx are const-qualified, so
 * they can only be given a value when the structure is initialised.
 */
struct compact_control {
        struct list_head freepages[NR_PAGE_ORDERS];        /* List of free pages to migrate to */
        struct list_head migratepages;        /* List of pages being migrated */
        unsigned int nr_freepages;        /* Number of isolated free pages */
        unsigned int nr_migratepages;        /* Number of pages to migrate */
        unsigned long free_pfn;                /* isolate_freepages search base */
        /*
         * Acts as an in/out parameter to page isolation for migration.
         * isolate_migratepages uses it as a search base.
         * isolate_migratepages_block will update the value to the next pfn
         * after the last isolated one.
         */
        unsigned long migrate_pfn;
        unsigned long fast_start_pfn;        /* a pfn to start linear scan from */
        struct zone *zone;                /* presumably the zone being compacted - TODO confirm */
        /* presumably running totals of pages scanned by each scanner - TODO confirm */
        unsigned long total_migrate_scanned;
        unsigned long total_free_scanned;
        unsigned short fast_search_fail;/* failures to use free list searches */
        short search_order;                /* order to start a fast search at */
        const gfp_t gfp_mask;                /* gfp mask of a direct compactor */
        int order;                        /* order a direct compactor needs */
        int migratetype;                /* migratetype of direct compactor */
        const unsigned int alloc_flags;        /* alloc flags of a direct compactor */
        const int highest_zoneidx;        /* zone index of a direct compactor */
        enum migrate_mode mode;                /* Async or sync migration mode */
        bool ignore_skip_hint;                /* Scan blocks even if marked skip */
        bool no_set_skip_hint;                /* Don't mark blocks for skipping */
        bool ignore_block_suitable;        /* Scan blocks considered unsuitable */
        bool direct_compaction;                /* False from kcompactd or /proc/... */
        bool proactive_compaction;        /* kcompactd proactive compaction */
        bool whole_zone;                /* Whole zone should/has been scanned */
        bool contended;                        /* Signal lock contention */
        bool finish_pageblock;                /* Scan the remainder of a pageblock. Used
                                         * when there are potentially transient
                                         * isolation or migration failures to
                                         * ensure forward progress.
                                         */
        bool alloc_contig;                /* alloc_contig_range allocation */
};

/*
 * Used in direct compaction when a page should be taken from the freelists
 * immediately when one is created during the free path.
 */
struct capture_control {
        struct compact_control *cc;        /* compaction run this capture belongs to */
        struct page *page;                /* presumably the captured page, if any - TODO confirm */
};

unsigned long
isolate_freepages_range(struct compact_control *cc,
                        unsigned long start_pfn, unsigned long end_pfn);
int
isolate_migratepages_range(struct compact_control *cc,
                           unsigned long low_pfn, unsigned long end_pfn);

int __alloc_contig_migrate_range(struct compact_control *cc,
                                        unsigned long start, unsigned long end,
                                        int migratetype);

/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void init_cma_reserved_pageblock(struct page *page);

#endif /* CONFIG_COMPACTION || CONFIG_CMA */

int find_suitable_fallback(struct free_area *area, unsigned int order,
                        int migratetype, bool only_stealable, bool *can_steal);

/* Report whether @area's free list for @migratetype has no entries. */
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
        struct list_head *head = &area->free_list[migratetype];

        return list_empty(head);
}

/*
 * These three helpers classify VMAs for virtual memory accounting.
 */

/*
 * Executable code area - executable, not writable, not stack
 */
static inline bool is_exec_mapping(vm_flags_t flags)
{
        /* Of exec, write and stack, only the exec bit may be set. */
        const vm_flags_t mask = VM_EXEC | VM_WRITE | VM_STACK;

        return (flags & mask) == VM_EXEC;
}

/*
 * Stack area (including shadow stacks)
 *
 * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
 * do_mmap() forbids all other combinations.
 */
static inline bool is_stack_mapping(vm_flags_t flags)
{
        /* Ordinary stack: every bit of VM_STACK is present. */
        if ((flags & VM_STACK) == VM_STACK)
                return true;

        /* Otherwise only a shadow stack qualifies. */
        return (flags & VM_SHADOW_STACK) != 0;
}

/*
 * Data area - private, writable, not stack
 */
static inline bool is_data_mapping(vm_flags_t flags)
{
        /* Of write, shared and stack, only the write bit may be set. */
        const vm_flags_t mask = VM_WRITE | VM_SHARED | VM_STACK;

        return (flags & mask) == VM_WRITE;
}

/* mm/util.c */
struct anon_vma *folio_anon_vma(struct folio *folio);

#ifdef CONFIG_MMU
void unmap_mapping_folio(struct folio *folio);
extern long populate_vma_page_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end, int *locked);
extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
                unsigned long end, bool write, int *locked);
extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
                               unsigned long bytes);

/*
 * NOTE: This function can't tell whether the folio is "fully mapped" in the
 * range.
 * "Fully mapped" means all the pages of the folio are associated with the
 * page table of the range, while this function just checks whether the folio
 * range is within the range [start, end). The caller needs to do a page table
 * check if it cares about the page table association.
 *
 * Typical usage (like mlock or madvise) is:
 * The caller knows at least 1 page of the folio is associated with the page
 * table of the VMA and the range [start, end) intersects the VMA range. The
 * caller wants to know whether the folio is fully associated with the range.
 * It calls this function to check whether the folio is in the range first,
 * then checks the page table to know whether the folio is fully mapped to the
 * range.
 */
static inline bool
folio_within_range(struct folio *folio, struct vm_area_struct *vma,
                unsigned long start, unsigned long end)
{
        pgoff_t pgoff;
        unsigned long addr;
        unsigned long vma_pglen = vma_pages(vma);

        VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio);
        if (start > end)
                return false;

        /* Clamp the requested range to the vma. */
        if (start < vma->vm_start)
                start = vma->vm_start;

        if (end > vma->vm_end)
                end = vma->vm_end;

        pgoff = folio_pgoff(folio);

        /* if folio start address is not in vma range */
        if (!in_range(pgoff, vma->vm_pgoff, vma_pglen))
                return false;

        /* Address within the vma that corresponds to the folio's pgoff. */
        addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);

        /*
         * The folio must start inside [start, end]. The upper bound check
         * matters when the clamped range ends below the folio (for example
         * a range lying entirely below the vma, or an empty range): without
         * it, "end - addr" below would wrap around and the folio would be
         * reported as being within the range.
         */
        if (addr < start || addr > end)
                return false;

        /* ... and the remainder of the range must hold the whole folio. */
        return end - addr >= folio_size(folio);
}

/*
 * Convenience form of folio_within_range() that uses the whole vma,
 * [vma->vm_start, vma->vm_end), as the range.
 */
static inline bool
folio_within_vma(struct folio *folio, struct vm_area_struct *vma)
{
        return folio_within_range(folio, vma, vma->vm_start, vma->vm_end);
}

/*
 * mlock_vma_folio() and munlock_vma_folio():
 * should be called with vma's mmap_lock held for read or write,
 * under page table lock for the pte/pmd being added or removed.
 *
 * mlock is usually called at the end of folio_add_*_rmap_*(), munlock at
 * the end of folio_remove_rmap_*(); but new anon folios are managed by
 * folio_add_lru_vma() calling mlock_new_folio().
 */
void mlock_folio(struct folio *folio);
static inline void mlock_vma_folio(struct folio *folio,
                                struct vm_area_struct *vma)
{
        /*
         * Only mlock when VM_LOCKED is set and no VM_SPECIAL bit is.
         * The VM_SPECIAL part of the test serves two purposes:
         * 1) the VM_IO check prevents migration from double-counting during
         *    mlock;
         * 2) although mmap_region() and mlock_fixup() take care that
         *    VM_LOCKED is never left set on a VM_SPECIAL vma, there is an
         *    interval while file->f_op->mmap() is using vm_insert_page(s)
         *    when VM_LOCKED may still be set while VM_SPECIAL bits are
         *    added: ignore VM_LOCKED then.
         */
        if (!unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED))
                return;

        mlock_folio(folio);
}

void munlock_folio(struct folio *folio);
static inline void munlock_vma_folio(struct folio *folio,
                                        struct vm_area_struct *vma)
{
        /* Nothing to undo for a vma that is not VM_LOCKED. */
        if (!unlikely(vma->vm_flags & VM_LOCKED))
                return;

        /*
         * Always munlock when called on a locked vma. Ideally this would
         * happen only if a page of the folio was unmapped from the VMA,
         * leaving the folio not fully mapped to it; but that situation is
         * not easy to confirm, so the folio is munlocked unconditionally
         * and page reclaim corrects it if that was wrong.
         */
        munlock_folio(folio);
}

void mlock_new_folio(struct folio *folio);
bool need_mlock_drain(int cpu);
void mlock_drain_local(void);
void mlock_drain_remote(int cpu);

extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);

/**
 * vma_address - Find the virtual address a page range is mapped at
 * @vma: The vma which maps this object.
 * @pgoff: The page offset within its object.
 * @nr_pages: The number of pages to consider.
 *
 * If any page in this range is mapped by this VMA, return the first address
 * where any of these pages appear.  Otherwise, return -EFAULT.
 */
static inline unsigned long vma_address(struct vm_area_struct *vma,
                pgoff_t pgoff, unsigned long nr_pages)
{
        unsigned long address;

        if (pgoff < vma->vm_pgoff) {
                /*
                 * The range starts before the vma: it is mapped from
                 * vm_start if its last page reaches the vma's first pgoff.
                 * Being in this branch avoids the possibility of a wrap
                 * to 0 on 32-bit.
                 */
                if (pgoff + nr_pages - 1 >= vma->vm_pgoff)
                        return vma->vm_start;
                return -EFAULT;
        }

        address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);

        /* Check for address beyond vma (or wrapped through 0?) */
        if (address < vma->vm_start || address >= vma->vm_end)
                return -EFAULT;

        return address;
}

/*
 * Then at what user virtual address will none of the range be found in vma?
 * Assumes that vma_address() already returned a good starting address.
 */
static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw)
{
        struct vm_area_struct *vma = pvmw->vma;
        unsigned long end_addr;
        pgoff_t end_pgoff;

        /* Common case, plus ->pgoff is invalid for KSM */
        if (pvmw->nr_pages == 1)
                return pvmw->address + PAGE_SIZE;

        /* First pgoff past the walked range, converted to an address. */
        end_pgoff = pvmw->pgoff + pvmw->nr_pages;
        end_addr = vma->vm_start + ((end_pgoff - vma->vm_pgoff) << PAGE_SHIFT);

        /* Beyond the vma (or wrapped through 0?): stop at the vma's end. */
        if (end_addr < vma->vm_start || end_addr > vma->vm_end)
                return vma->vm_end;

        return end_addr;
}

static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
                                                    struct file *fpin)
{
        int flags = vmf->flags;

        /* A file was already pinned: hand it back untouched. */
        if (fpin)
                return fpin;

        /*
         * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks
         * or anything, so the file is pinned and the fault lock released
         * only when FAULT_FLAG_ALLOW_RETRY is set, this is the first
         * attempt, and FAULT_FLAG_RETRY_NOWAIT is not set.
         */
        if (!fault_flag_allow_retry_first(flags) ||
            (flags & FAULT_FLAG_RETRY_NOWAIT))
                return fpin;

        fpin = get_file(vmf->vma->vm_file);
        release_fault_lock(vmf);
        return fpin;
}
#else /* !CONFIG_MMU */
/*
 * !CONFIG_MMU stand-ins: each of these is an empty function, and
 * need_mlock_drain() always reports false.
 */
static inline void unmap_mapping_folio(struct folio *folio) { }
static inline void mlock_new_folio(struct folio *folio) { }
static inline bool need_mlock_drain(int cpu) { return false; }
static inline void mlock_drain_local(void) { }
static inline void mlock_drain_remote(int cpu) { }
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
{
}
#endif /* !CONFIG_MMU */

/* Memory initialisation debug and verification */
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
DECLARE_STATIC_KEY_TRUE(deferred_pages);

bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

/*
 * Message levels passed to mminit_dprintk(). With CONFIG_DEBUG_MEMORY_INIT,
 * a message is printed only when its level is below mminit_loglevel; levels
 * up to MMINIT_WARNING are printed with pr_warn(), the others at KERN_DEBUG.
 */
enum mminit_level {
        MMINIT_WARNING,
        MMINIT_VERIFY,
        MMINIT_TRACE
};

#ifdef CONFIG_DEBUG_MEMORY_INIT

extern int mminit_loglevel;

/*
 * Print a "mminit::<prefix> <fmt>" message if @level is below
 * mminit_loglevel. Levels <= MMINIT_WARNING use pr_warn(); anything else is
 * printed at KERN_DEBUG. @prefix and @fmt are pasted into the format string,
 * so both must be string literals. @level is evaluated more than once.
 */
#define mminit_dprintk(level, prefix, fmt, arg...) \
do { \
        if (level < mminit_loglevel) { \
                if (level <= MMINIT_WARNING) \
                        pr_warn("mminit::" prefix " " fmt, ##arg);        \
                else \
                        printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
        } \
} while (0)

extern void mminit_verify_pageflags_layout(void);
extern void mminit_verify_zonelist(void);
#else

/*
 * !CONFIG_DEBUG_MEMORY_INIT stand-ins: all three functions are empty, so the
 * debug output and verification calls have no effect in this configuration.
 */
static inline void mminit_dprintk(enum mminit_level level,
                                const char *prefix, const char *fmt, ...)
{
}

static inline void mminit_verify_pageflags_layout(void)
{
}

static inline void mminit_verify_zonelist(void)
{
}
#endif /* CONFIG_DEBUG_MEMORY_INIT */

/*
 * Presumably the result codes of node_reclaim() (TODO confirm against the
 * implementation); the !CONFIG_NUMA stub below always returns
 * NODE_RECLAIM_NOSCAN.
 */
#define NODE_RECLAIM_NOSCAN        -2
#define NODE_RECLAIM_FULL        -1
#define NODE_RECLAIM_SOME        0
#define NODE_RECLAIM_SUCCESS        1

#ifdef CONFIG_NUMA
extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
extern int find_next_best_node(int node, nodemask_t *used_node_mask);
#else
/* !CONFIG_NUMA stub: ignores its arguments and reports NODE_RECLAIM_NOSCAN. */
static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
                                unsigned int order)
{
        return NODE_RECLAIM_NOSCAN;
}
/* !CONFIG_NUMA stub: ignores its arguments and returns NUMA_NO_NODE. */
static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
{
        return NUMA_NO_NODE;
}
#endif

/*
 * mm/memory-failure.c
 */
void shake_folio(struct folio *folio);
extern int hwpoison_filter(struct page *p);

extern u32 hwpoison_filter_dev_major;
extern u32 hwpoison_filter_dev_minor;
extern u64 hwpoison_filter_flags_mask;
extern u64 hwpoison_filter_flags_value;
extern u64 hwpoison_filter_memcg;
extern u32 hwpoison_filter_enable;

extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
        unsigned long, unsigned long,
        unsigned long, unsigned long);

extern void set_pageblock_order(void);
unsigned long reclaim_pages(struct list_head *folio_list);
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
                                            struct list_head *folio_list);
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN                WMARK_MIN
#define ALLOC_WMARK_LOW                WMARK_LOW
#define ALLOC_WMARK_HIGH        WMARK_HIGH
#define ALLOC_NO_WATERMARKS        0x04 /* don't check watermarks at all */

/*
 * Mask to get the watermark bits: everything below ALLOC_NO_WATERMARKS,
 * i.e. 0x03.
 */
#define ALLOC_WMARK_MASK        (ALLOC_NO_WATERMARKS-1)

/*
 * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
 * cannot assume a reduced access to memory reserves is sufficient for
 * !MMU
 */
#ifdef CONFIG_MMU
#define ALLOC_OOM                0x08
#else
#define ALLOC_OOM                ALLOC_NO_WATERMARKS
#endif

#define ALLOC_NON_BLOCK                 0x10 /* Caller cannot block. Allow access
                                       * to 25% of the min watermark or
                                       * 62.5% if __GFP_HIGH is set.
                                       */
#define ALLOC_MIN_RESERVE         0x20 /* __GFP_HIGH set. Allow access to 50%
                                       * of the min watermark.
                                       */
#define ALLOC_CPUSET                 0x40 /* check for correct cpuset */
#define ALLOC_CMA                 0x80 /* allow allocations from CMA areas */
/* ALLOC_NOFRAGMENT is 0 (never set) unless CONFIG_ZONE_DMA32 is enabled. */
#ifdef CONFIG_ZONE_DMA32
#define ALLOC_NOFRAGMENT        0x100 /* avoid mixing pageblock types */
#else
#define ALLOC_NOFRAGMENT          0x0
#endif
#define ALLOC_HIGHATOMIC        0x200 /* Allows access to MIGRATE_HIGHATOMIC */
/* Note: no flag in this list uses 0x400. */
#define ALLOC_KSWAPD                0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */

/* Flags that allow allocations below the min watermark. */
#define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)

enum ttu_flags;
struct tlbflush_unmap_batch;


/*
 * only for MM internal work items which do not depend on
 * any allocations or locks which might depend on allocations
 */
extern struct workqueue_struct *mm_percpu_wq;

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
void flush_tlb_batched_pending(struct mm_struct *mm);
#else
/*
 * Empty stand-ins used when CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is not
 * set: none of these functions does anything in that configuration.
 */
static inline void try_to_unmap_flush(void)
{
}
static inline void try_to_unmap_flush_dirty(void)
{
}
static inline void flush_tlb_batched_pending(struct mm_struct *mm)
{
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

extern const struct trace_print_flags pageflag_names[];
extern const struct trace_print_flags pagetype_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];

/* Return true if @migratetype is MIGRATE_HIGHATOMIC. */
static inline bool is_migrate_highatomic(enum migratetype migratetype)
{
        return migratetype == MIGRATE_HIGHATOMIC;
}

void setup_zone_pageset(struct zone *zone);

/*
 * Parameters describing where migrated memory should be allocated.
 * NOTE(review): the users of this structure are not visible here; the field
 * notes below other than @nid are assumptions to confirm.
 */
struct migration_target_control {
        int nid;                /* preferred node id */
        nodemask_t *nmask;        /* presumably the set of allowed nodes - TODO confirm */
        gfp_t gfp_mask;                /* presumably the gfp mask for target allocations - TODO confirm */
        enum migrate_reason reason;        /* why the migration is happening */
};

/*
 * mm/filemap.c
 */
size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
                              struct folio *folio, loff_t fpos, size_t size);

/*
 * mm/vmalloc.c
 */
#ifdef CONFIG_MMU
void __init vmalloc_init(void);
int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift);
#else
/* !CONFIG_MMU stub: nothing to initialise. */
static inline void vmalloc_init(void)
{
}

/* !CONFIG_MMU stub: ignores its arguments and always fails with -EINVAL. */
static inline
int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        return -EINVAL;
}
#endif

int __must_check __vmap_pages_range_noflush(unsigned long addr,
                               unsigned long end, pgprot_t prot,
                               struct page **pages, unsigned int page_shift);

void vunmap_range_noflush(unsigned long start, unsigned long end);

void __vunmap_range_noflush(unsigned long start, unsigned long end);

int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
                      unsigned long addr, int page_nid, int *flags);

void free_zone_device_folio(struct folio *folio);
int migrate_device_coherent_page(struct page *page);

/*
 * mm/gup.c
 */
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
int __must_check try_grab_page(struct page *page, unsigned int flags);

/*
 * mm/huge_memory.c
 */
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
               pud_t *pud, bool write);
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
               pmd_t *pmd, bool write);

/*
 * mm/mmap.c
 */
struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
                                        struct vm_area_struct *vma,
                                        unsigned long delta);

/*
 * FOLL_* flags defined in this header. They occupy bits 16 to 22 and are
 * collected into INTERNAL_GUP_FLAGS below.
 */
enum {
        /* mark page accessed */
        FOLL_TOUCH = 1 << 16,
        /* a retry, previous pass started an IO */
        FOLL_TRIED = 1 << 17,
        /* we are working on non-current tsk/mm */
        FOLL_REMOTE = 1 << 18,
        /* pages must be released via unpin_user_page */
        FOLL_PIN = 1 << 19,
        /* gup_fast: prevent fall-back to slow gup */
        FOLL_FAST_ONLY = 1 << 20,
        /* allow unlocking the mmap lock */
        FOLL_UNLOCKABLE = 1 << 21,
        /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */
        FOLL_MADV_POPULATE = 1 << 22,
};

/* Union of every FOLL_* flag defined in the enum directly above. */
#define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \
                            FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \
                            FOLL_MADV_POPULATE)

/*
 * Indicates for which pages that are write-protected in the page table,
 * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the
 * GUP pin will remain consistent with the pages mapped into the page tables
 * of the MM.
 *
 * Temporary unmapping of PageAnonExclusive() pages or clearing of
 * PageAnonExclusive() has to protect against concurrent GUP:
 * * Ordinary GUP: Using the PT lock
 * * GUP-fast and fork(): mm->write_protect_seq
 * * GUP-fast and KSM or temporary unmapping (swap, migration): see
 *    folio_try_share_anon_rmap_*()
 *
 * Must be called with the (sub)page that's actually referenced via the
 * page table entry, which might not necessarily be the head page for a
 * PTE-mapped THP.
 *
 * If the vma is NULL, we're coming from the GUP-fast path and might have
 * to fallback to the slow path just to lookup the vma.
 */
static inline bool gup_must_unshare(struct vm_area_struct *vma,
                                    unsigned int flags, struct page *page)
{
        /*
         * Summary of the decision below:
         *  - anything other than a FOLL_PIN without FOLL_WRITE: false;
         *  - non-anonymous page: true only for FOLL_LONGTERM, and then only
         *    if the vma is unknown (NULL) or is a COW mapping;
         *  - anonymous page: true unless the page is PageAnonExclusive().
         */

        /*
         * FOLL_WRITE is implicitly handled correctly as the page table entry
         * has to be writable -- and if it references (part of) an anonymous
         * folio, that part is required to be marked exclusive.
         */
        if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN)
                return false;
        /*
         * Note: PageAnon(page) is stable until the page is actually getting
         * freed.
         */
        if (!PageAnon(page)) {
                /*
                 * We only care about R/O long-term pinning: R/O short-term
                 * pinning does not have the semantics to observe successive
                 * changes through the process page tables.
                 */
                if (!(flags & FOLL_LONGTERM))
                        return false;

                /* We really need the vma ... */
                if (!vma)
                        return true;

                /*
                 * ... because we only care about writable private ("COW")
                 * mappings where we have to break COW early.
                 */
                return is_cow_mapping(vma->vm_flags);
        }

        /* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_rmb();

        /*
         * Note that PageKsm() pages cannot be exclusive, and consequently,
         * cannot get pinned.
         */
        return !PageAnonExclusive(page);
}

extern bool mirrored_kernelcore;
extern bool memblock_has_mirror(void);

/*
 * Set the three fields that place a vma: its [start, end) address range and
 * its page offset. Nothing else is updated here.
 */
static __always_inline void vma_set_range(struct vm_area_struct *vma,
                                          unsigned long start, unsigned long end,
                                          pgoff_t pgoff)
{
        vma->vm_start = start;
        vma->vm_end = end;
        vma->vm_pgoff = pgoff;
}

static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
{
        /*
         * The config test must come first: when soft-dirty is not compiled
         * in, VM_SOFTDIRTY is defined as 0x0 and !(vm_flags & VM_SOFTDIRTY)
         * would be constantly true.
         *
         * Soft-dirty is kind of special: its tracking is enabled when the
         * vma flag is NOT set.
         */
        return IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) &&
               !(vma->vm_flags & VM_SOFTDIRTY);
}

static inline void vma_iter_config(struct vma_iterator *vmi,
                unsigned long index, unsigned long last)
{
        /*
         * The range handed to the maple state ends at last - 1
         * (presumably because the maple state's end is inclusive).
         */
        const unsigned long last_inclusive = last - 1;

        __mas_set_range(&vmi->mas, index, last_inclusive);
}

/* Reset the iterator's maple state via mas_reset(). */
static inline void vma_iter_reset(struct vma_iterator *vmi)
{
        mas_reset(&vmi->mas);
}

/* Thin wrapper: mas_prev_range() on the iterator's maple state, limited by @min. */
static inline
struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min)
{
        return mas_prev_range(&vmi->mas, min);
}

/* Thin wrapper: mas_next_range() on the iterator's maple state, limited by @max. */
static inline
struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max)
{
        return mas_next_range(&vmi->mas, max);
}

static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min,
                                       unsigned long max, unsigned long size)
{
        /* mas_empty_area() is given max - 1 as its upper bound. */
        const unsigned long max_inclusive = max - 1;

        return mas_empty_area(&vmi->mas, min, max_inclusive, size);
}

static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min,
                                        unsigned long max, unsigned long size)
{
        /* mas_empty_area_rev() is given max - 1 as its upper bound. */
        const unsigned long max_inclusive = max - 1;

        return mas_empty_area_rev(&vmi->mas, min, max_inclusive, size);
}

/*
 * VMA Iterator functions shared between nommu and mmap
 */
/*
 * Preallocate for a later store of @vma by calling mas_preallocate() with
 * GFP_KERNEL; its return value is passed straight through.
 */
static inline int vma_iter_prealloc(struct vma_iterator *vmi,
                struct vm_area_struct *vma)
{
        return mas_preallocate(&vmi->mas, vma, GFP_KERNEL);
}

/* Store NULL at the iterator's current range, using preallocated memory. */
static inline void vma_iter_clear(struct vma_iterator *vmi)
{
        mas_store_prealloc(&vmi->mas, NULL);
}

/* Thin wrapper: return the result of mas_walk() on the iterator's maple state. */
static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
{
        return mas_walk(&vmi->mas);
}

/* Store a VMA with preallocated memory */
static inline void vma_iter_store(struct vma_iterator *vmi,
                                  struct vm_area_struct *vma)
{

#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
        /*
         * Debug builds only: warn when an already-started iterator does not
         * cover vma->vm_start (its index is above it, or its last is below).
         */
        if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
                        vmi->mas.index > vma->vm_start)) {
                pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
                        vmi->mas.index, vma->vm_start, vma->vm_start,
                        vma->vm_end, vmi->mas.index, vmi->mas.last);
        }
        if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
                        vmi->mas.last <  vma->vm_start)) {
                pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
                       vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
                       vmi->mas.index, vmi->mas.last);
        }
#endif

        /* Same condition as the warnings above: invalidate the iterator. */
        if (vmi->mas.status != ma_start &&
            ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
                vma_iter_invalidate(vmi);

        /* Store @vma over [vm_start, vm_end - 1] using preallocated memory. */
        __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
        mas_store_prealloc(&vmi->mas, vma);
}

/*
 * Store @vma through @vmi, passing @gfp to mas_store_gfp().
 *
 * The iterator is invalidated first when it is not in the ma_start state
 * and its current range [index, last] does not contain vma->vm_start.
 *
 * Returns 0 on success, -ENOMEM when the maple state reports an error after
 * the store.
 */
static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
                        struct vm_area_struct *vma, gfp_t gfp)
{
        if (vmi->mas.status != ma_start) {
                if (vmi->mas.index > vma->vm_start ||
                    vmi->mas.last < vma->vm_start)
                        vma_iter_invalidate(vmi);
        }

        __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
        mas_store_gfp(&vmi->mas, vma, gfp);

        return unlikely(mas_is_err(&vmi->mas)) ? -ENOMEM : 0;
}

/*
 * VMA lock generalization
 *
 * Groups the pointers (VMAs, file, address_space, anon_vma) that belong to
 * one operation.
 * NOTE(review): the exact role of each member is defined by the code that
 * fills in and consumes this struct, which is not part of this header -
 * confirm there before relying on the member names alone.
 */
struct vma_prepare {
        struct vm_area_struct *vma;
        struct vm_area_struct *adj_next;
        struct file *file;
        struct address_space *mapping;
        struct anon_vma *anon_vma;
        struct vm_area_struct *insert;
        struct vm_area_struct *remove;
        struct vm_area_struct *remove2;
};

/* Prototype only; the definition is not in this header. */
void __meminit __init_single_page(struct page *page, unsigned long pfn,
                                unsigned long zone, int nid);

/* shrinker related functions */
/* Prototype only; the definition is not in this header. */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
                          int priority);

#ifdef CONFIG_64BIT
/*
 * VM is sealed, in vm_flags.
 * Bit 63; defined only when CONFIG_64BIT is set.
 */
#define VM_SEALED        _BITUL(63)
#endif

#ifdef CONFIG_64BIT
/*
 * Check @flags: no flag bits are accepted.
 * Returns 0 when @flags is zero, -EINVAL otherwise.
 */
static inline int can_do_mseal(unsigned long flags)
{
        return flags ? -EINVAL : 0;
}

/* Implemented out of line; only the prototypes live here. */
bool can_modify_mm(struct mm_struct *mm, unsigned long start,
                unsigned long end);
bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
                unsigned long end, int behavior);
#else
/* Without CONFIG_64BIT every request is refused with -EPERM. */
static inline int can_do_mseal(unsigned long flags)
{
        const int refused = -EPERM;

        return refused;
}

/* Without CONFIG_64BIT this stub always reports true. */
static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start,
                unsigned long end)
{
        const bool allowed = true;

        return allowed;
}

/* Without CONFIG_64BIT this stub always reports true. */
static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
                unsigned long end, int behavior)
{
        const bool allowed = true;

        return allowed;
}
#endif

#ifdef CONFIG_SHRINKER_DEBUG
/*
 * Build the shrinker's name from @fmt/@ap with kvasprintf_const() and store
 * it in shrinker->name.  Returns 0 on success, -ENOMEM when no name was
 * produced.
 */
static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
                        struct shrinker *shrinker, const char *fmt, va_list ap)
{
        shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
        if (!shrinker->name)
                return -ENOMEM;

        return 0;
}

/* Release shrinker->name with kfree_const() and reset the pointer. */
static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
{
        kfree_const(shrinker->name);
        shrinker->name = NULL;
}

/* Implemented out of line when CONFIG_SHRINKER_DEBUG is enabled. */
extern int shrinker_debugfs_add(struct shrinker *shrinker);
extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
                                              int *debugfs_id);
extern void shrinker_debugfs_remove(struct dentry *debugfs_entry,
                                    int debugfs_id);
#else /* CONFIG_SHRINKER_DEBUG */
/* Stub: nothing to add, always reports success. */
static inline int shrinker_debugfs_add(struct shrinker *shrinker)
{
        const int ok = 0;

        return ok;
}

/* Stub: no name is allocated, always reports success. */
static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker,
                                              const char *fmt, va_list ap)
{
        const int ok = 0;

        return ok;
}

/* Stub: nothing to free. */
static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
{
}

/* Stub: reports id -1 through @debugfs_id and returns no dentry. */
static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
                                                     int *debugfs_id)
{
        struct dentry *none = NULL;

        *debugfs_id = -1;

        return none;
}

/* Stub: nothing to remove. */
static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
                                           int debugfs_id)
{
}
#endif /* CONFIG_SHRINKER_DEBUG */

/*
 * Only track the nodes of mappings with shadow entries.
 * Both symbols are declared here only; their definitions are not in this
 * header.
 */
void workingset_update_node(struct xa_node *node);
extern struct list_lru shadow_nodes;

#endif        /* __MM_INTERNAL_H */

















































































































    3 












    3 
    3 

    3 































    3 
    3 


























    3 



















    3 

















    3 



    3 










    3 


    3 













































































































































































































    3 





















































    3 
    3 

































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
// SPDX-License-Identifier: GPL-2.0
/*
 * FPU signal frame handling routines.
 */

#include <linux/compat.h>
#include <linux/cpu.h>
#include <linux/pagemap.h>

#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/xstate.h>

#include <asm/sigframe.h>
#include <asm/trapnr.h>
#include <asm/trace/fpu.h>

#include "context.h"
#include "internal.h"
#include "legacy.h"
#include "xstate.h"

/*
 * Check for the presence of extended state information in the
 * user fpstate pointer in the sigcontext.
 *
 * The software-reserved bytes of the FX frame at @fxbuf are copied into
 * @fx_sw.  When the first magic, the advertised sizes, or the trailing
 * second magic word are not as expected, @fx_sw is rewritten to describe an
 * FX-only frame (magic1 = 0, FXSAVE size, FP|SSE features).
 *
 * Returns false only when reading user memory failed, true otherwise.
 */
static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
                                            struct _fpx_sw_bytes *fx_sw)
{
        const int smallest_xstate = sizeof(struct fxregs_state) +
                                    sizeof(struct xstate_header);
        void __user *frame = fxbuf;
        unsigned int tail_magic;
        bool header_ok;

        if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw)))
                return false;

        /* First magic field plus size sanity checks. */
        header_ok = fx_sw->magic1 == FP_XSTATE_MAGIC1 &&
                    fx_sw->xstate_size >= smallest_xstate &&
                    fx_sw->xstate_size <= current->thread.fpu.fpstate->user_size &&
                    fx_sw->xstate_size <= fx_sw->extended_size;

        if (header_ok) {
                /*
                 * The second magic word sits right behind the xstate area.
                 * It is missing when the user copied only the legacy
                 * fpstate layout without the extended state information.
                 */
                if (__get_user(tail_magic,
                               (__u32 __user *)(frame + fx_sw->xstate_size)))
                        return false;

                if (likely(tail_magic == FP_XSTATE_MAGIC2))
                        return true;
        }

        trace_x86_fpu_xstate_check_failed(&current->thread.fpu);

        /* Fall back to the parameters of an fx-only state. */
        fx_sw->magic1 = 0;
        fx_sw->xstate_size = sizeof(struct fxregs_state);
        fx_sw->xfeatures = XFEATURE_MASK_FPSSE;
        return true;
}

/*
 * Signal frame handlers.
 */
/*
 * Write the fsave-format header at @buf.
 *
 * With FXSR: refresh the task's fxsave image from the registers unless
 * TIF_NEED_FPU_LOAD is set, convert it with convert_from_fxsr(), copy the
 * result to @buf and then store the status word and X86_FXSR_MAGIC.
 * Without FXSR: read the swd field back from the user frame and store it in
 * the status field.
 *
 * Returns true on success, false if any user access failed.
 */
static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf)
{
        struct xregs_state *xsave;
        struct user_i387_ia32_struct env;
        struct _fpstate_32 __user *fp32;

        if (!use_fxsr()) {
                struct fregs_state __user *fp = buf;
                u32 swd;

                if (__get_user(swd, &fp->swd))
                        return false;

                return !__put_user(swd, &fp->status);
        }

        xsave = &tsk->thread.fpu.fpstate->regs.xsave;
        fp32 = buf;

        fpregs_lock();
        if (!test_thread_flag(TIF_NEED_FPU_LOAD))
                fxsave(&tsk->thread.fpu.fpstate->regs.fxsave);
        fpregs_unlock();

        convert_from_fxsr(&env, tsk);

        if (__copy_to_user(buf, &env, sizeof(env)))
                return false;

        if (__put_user(xsave->i387.swd, &fp32->status))
                return false;

        return !__put_user(X86_FXSR_MAGIC, &fp32->magic);
}

/*
 * Prepare the SW reserved portion of the fxsave memory layout, indicating
 * the presence of the extended state information in the memory layout
 * pointed to by the fpstate pointer in the sigcontext.
 * This is saved whenever the FP and extended state context is
 * saved on the user stack during the signal handler delivery to the user.
 */
static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame,
                                 struct fpstate *fpstate)
{
        /* xstate area plus the trailing second magic word. */
        unsigned int extended = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE;

        /* ia32 frames additionally account for a struct fregs_state. */
        if (ia32_frame)
                extended += sizeof(struct fregs_state);

        sw_bytes->magic1 = FP_XSTATE_MAGIC1;
        sw_bytes->xstate_size = fpstate->user_size;
        sw_bytes->xfeatures = fpstate->user_xfeatures;
        sw_bytes->extended_size = extended;
}

/*
 * Finish a saved [f]xsave frame at @buf: fill in the software-reserved
 * bytes and, when XSAVE is in use, append FP_XSTATE_MAGIC2 behind the state
 * and force the FP/SSE bits in the header's xfeatures word.
 *
 * Returns true when every user access succeeded.
 */
static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
                                      struct fpstate *fpstate)
{
        struct xregs_state __user *xs = buf;
        struct _fpx_sw_bytes sw = {};
        u32 features;
        int failed;

        /* Setup the bytes not touched by the [f]xsave and reserved for SW. */
        save_sw_bytes(&sw, ia32_frame, fpstate);
        failed = __copy_to_user(&xs->i387.sw_reserved, &sw, sizeof(sw));

        if (use_xsave()) {
                failed |= __put_user(FP_XSTATE_MAGIC2,
                                     (__u32 __user *)(buf + fpstate->user_size));

                /*
                 * Read back the xfeatures that were copied (directly from
                 * the cpu or from the state in task struct) to the user
                 * buffer.
                 */
                failed |= __get_user(features,
                                     (__u32 __user *)&xs->header.xfeatures);

                /*
                 * For legacy compatibility the FP/SSE bits are always set
                 * in the bit vector while saving the state to the user
                 * context.  This captures any changes (during sigreturn)
                 * to the FP/SSE state made by legacy applications which
                 * don't touch xfeatures in the xsave header.
                 *
                 * xsave aware apps can change the xfeatures in the xsave
                 * header as well as any contents in the memory layout;
                 * xrestore as part of sigreturn will capture all changes.
                 */
                features |= XFEATURE_MASK_FPSSE;

                failed |= __put_user(features,
                                     (__u32 __user *)&xs->header.xfeatures);
        }

        return !failed;
}

/*
 * Save the registers to the user frame at @buf with the widest available
 * mechanism: xsave, else fxsave, else fnsave.  Returns the helper's result.
 */
static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
{
        if (use_xsave())
                return xsave_to_user_sigframe(buf);

        if (!use_fxsr())
                return fnsave_to_user_sigframe((struct fregs_state __user *) buf);

        return fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
}

/*
 * Save the fpu, extended register state to the user signal frame.
 *
 * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save
 *  state is copied.
 *  'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
 *
 *        buf == buf_fx for 64-bit frames and 32-bit fsave frame.
 *        buf != buf_fx for 32-bit frames with fxstate.
 *
 * Save it directly to the user frame with disabled page fault handler. If
 * that faults, try to clear the frame which handles the page fault.
 *
 * If this is a 32-bit frame with fxstate, put a fsave header before
 * the aligned state at 'buf_fx'.
 *
 * For [f]xsave state, update the SW reserved fields in the [f]xsave frame
 * indicating the absence/presence of the extended state to the user.
 */
/*
 * @buf:    start of the user frame
 * @buf_fx: where the [f|fx|x]save image is written
 * @size:   number of bytes checked with access_ok() starting at @buf
 *
 * Returns true on success, false if any user access failed.
 */
bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
{
        struct task_struct *tsk = current;
        struct fpstate *fpstate = tsk->thread.fpu.fpstate;
        bool ia32_fxstate = (buf != buf_fx);
        int ret;

        /* Can only stay true on X86_32 or IA32_EMULATION kernels. */
        ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
                         IS_ENABLED(CONFIG_IA32_EMULATION));

        /* No FPU feature: fetch the state via fpregs_soft_get() and copy it out. */
        if (!static_cpu_has(X86_FEATURE_FPU)) {
                struct user_i387_ia32_struct fp;

                fpregs_soft_get(current, NULL, (struct membuf){.p = &fp,
                                                .left = sizeof(fp)});
                return !copy_to_user(buf, &fp, sizeof(fp));
        }

        if (!access_ok(buf, size))
                return false;

        if (use_xsave()) {
                struct xregs_state __user *xbuf = buf_fx;

                /*
                 * Clear the xsave header first, so that reserved fields are
                 * initialized to zero.
                 */
                if (__clear_user(&xbuf->header, sizeof(xbuf->header)))
                        return false;
        }
retry:
        /*
         * Load the FPU registers if they are not valid for the current task.
         * With a valid FPU state we can attempt to save the state directly to
         * userland's stack frame which will likely succeed. If it does not,
         * resolve the fault in the user memory and try again.
         */
        fpregs_lock();
        if (test_thread_flag(TIF_NEED_FPU_LOAD))
                fpregs_restore_userregs();

        /* The save itself runs with page faults disabled. */
        pagefault_disable();
        ret = copy_fpregs_to_sigframe(buf_fx);
        pagefault_enable();
        fpregs_unlock();

        if (ret) {
                /*
                 * The direct save failed.  Clear user_size bytes of the
                 * destination with page faults enabled; if that succeeds,
                 * try the save again, otherwise give up.
                 */
                if (!__clear_user(buf_fx, fpstate->user_size))
                        goto retry;
                return false;
        }

        /* Save the fsave header for the 32-bit frames. */
        if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf))
                return false;

        /* FXSR frames additionally get the SW-reserved bytes / magic2 epilog. */
        if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate))
                return false;

        return true;
}

/*
 * Restore the registers straight from the user frame at @buf.
 *
 * Without XSAVE this is a plain fxrstor (FXSR) or frstor.  With XSAVE the
 * frame is restored with xrstor for the @xrestore features (or with fxrstor
 * when @fx_only), and on success the features in @ufeatures that were not
 * part of @xrestore are loaded from init_fpstate.
 *
 * Returns the result of the user restore helper.
 */
static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures,
                                      u64 xrestore, bool fx_only)
{
        u64 init_bv;
        int ret;

        if (!use_xsave()) {
                if (use_fxsr())
                        return fxrstor_from_user_sigframe(buf);

                return frstor_from_user_sigframe(buf);
        }

        init_bv = ufeatures & ~xrestore;

        if (unlikely(fx_only))
                ret = fxrstor_from_user_sigframe(buf);
        else
                ret = xrstor_from_user_sigframe(buf, xrestore);

        if (!ret && unlikely(init_bv))
                os_xrstor(&init_fpstate, init_bv);

        return ret;
}

/*
 * Attempt to restore the FPU registers directly from user memory.
 * Pagefaults are handled and any errors returned are fatal.
 *
 * @buf:      user frame to restore from
 * @xrestore: requested feature mask; reduced to the task's user_xfeatures
 * @fx_only:  passed through to __restore_fpregs_from_user()
 *
 * Returns true on success, false otherwise.
 */
static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only)
{
        struct fpu *fpu = &current->thread.fpu;
        int ret;

        /* Restore enabled features only. */
        xrestore &= fpu->fpstate->user_xfeatures;
retry:
        fpregs_lock();
        /* Ensure that XFD is up to date */
        xfd_update_state(fpu->fpstate);
        /* The restore itself runs with page faults disabled. */
        pagefault_disable();
        ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures,
                                         xrestore, fx_only);
        pagefault_enable();

        if (unlikely(ret)) {
                /*
                 * The above did an FPU restore operation, restricted to
                 * the user portion of the registers, and failed, but the
                 * microcode might have modified the FPU registers
                 * nevertheless.
                 *
                 * If the FPU registers do not belong to current, then
                 * invalidate the FPU register state otherwise the task
                 * might preempt current and return to user space with
                 * corrupted FPU registers.
                 */
                if (test_thread_flag(TIF_NEED_FPU_LOAD))
                        __cpu_invalidate_fpregs_state();
                fpregs_unlock();

                /* Try to handle #PF, but anything else is fatal. */
                if (ret != X86_TRAP_PF)
                        return false;

                /* Fault the frame in (page faults enabled again), then retry. */
                if (!fault_in_readable(buf, fpu->fpstate->user_size))
                        goto retry;
                return false;
        }

        /*
         * Restore supervisor states: previous context switch etc has done
         * XSAVES and saved the supervisor states in the kernel buffer from
         * which they can be restored now.
         *
         * It would be optimal to handle this with a single XRSTORS, but
         * this does not work because the rest of the FPU registers have
         * been restored from a user buffer directly.
         */
        if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor())
                os_xrstor_supervisor(fpu->fpstate);

        fpregs_mark_activate();
        fpregs_unlock();
        return true;
}

/*
 * Restore the current task's FPU state from a signal frame.
 *
 * @buf:          start of the user frame
 * @buf_fx:       location of the [f]xsave image inside the frame
 * @ia32_fxstate: true when the frame needs the ia32 FX-state handling below
 *
 * When @ia32_fxstate is false the registers are restored directly from user
 * memory via restore_fpregs_from_user().  Otherwise the legacy env at @buf
 * and the state at @buf_fx are copied into the task's fpstate buffer, the
 * legacy part is folded in with convert_to_fxsr(), and the registers are
 * loaded from that buffer.
 *
 * Returns true on success, false otherwise.
 */
static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
                              bool ia32_fxstate)
{
        struct task_struct *tsk = current;
        struct fpu *fpu = &tsk->thread.fpu;
        struct user_i387_ia32_struct env;
        bool success, fx_only = false;
        union fpregs_state *fpregs;
        u64 user_xfeatures = 0;

        if (use_xsave()) {
                struct _fpx_sw_bytes fx_sw_user;

                if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user))
                        return false;

                /* magic1 is zeroed by the check when the frame is FX-only. */
                fx_only = !fx_sw_user.magic1;
                user_xfeatures = fx_sw_user.xfeatures;
        } else {
                user_xfeatures = XFEATURE_MASK_FPSSE;
        }

        if (likely(!ia32_fxstate)) {
                /* Restore the FPU registers directly from user memory. */
                return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only);
        }

        /*
         * Copy the legacy state because the FP portion of the FX frame has
         * to be ignored for histerical raisins. The legacy state is folded
         * in once the larger state has been copied.
         */
        if (__copy_from_user(&env, buf, sizeof(env)))
                return false;

        /*
         * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is
         * not modified on context switch and that the xstate is considered
         * to be loaded again on return to userland (overriding last_cpu avoids
         * the optimisation).
         */
        fpregs_lock();
        if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
                /*
                 * If supervisor states are available then save the
                 * hardware state in current's fpstate so that the
                 * supervisor state is preserved. Save the full state for
                 * simplicity. There is no point in optimizing this by only
                 * saving the supervisor states and then shuffle them to
                 * the right place in memory. It's ia32 mode. Shrug.
                 */
                if (xfeatures_mask_supervisor())
                        os_xsave(fpu->fpstate);
                set_thread_flag(TIF_NEED_FPU_LOAD);
        }
        __fpu_invalidate_fpregs_state(fpu);
        __cpu_invalidate_fpregs_state();
        fpregs_unlock();

        /* From here on the user frame is copied into the task's buffer. */
        fpregs = &fpu->fpstate->regs;
        if (use_xsave() && !fx_only) {
                if (copy_sigframe_from_user_to_xstate(tsk, buf_fx))
                        return false;
        } else {
                if (__copy_from_user(&fpregs->fxsave, buf_fx,
                                     sizeof(fpregs->fxsave)))
                        return false;

                if (IS_ENABLED(CONFIG_X86_64)) {
                        /* Reject invalid MXCSR values. */
                        if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask)
                                return false;
                } else {
                        /* Mask invalid bits out for historical reasons (broken hardware). */
                        fpregs->fxsave.mxcsr &= mxcsr_feature_mask;
                }

                /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */
                if (use_xsave())
                        fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
        }

        /* Fold the legacy FP storage */
        convert_to_fxsr(&fpregs->fxsave, &env);

        /* Load the registers from the buffer assembled above. */
        fpregs_lock();
        if (use_xsave()) {
                /*
                 * Remove all UABI feature bits not set in user_xfeatures
                 * from the memory xstate header which makes the full
                 * restore below bring them into init state. This works for
                 * fx_only mode as well because that has only FP and SSE
                 * set in user_xfeatures.
                 *
                 * Preserve supervisor states!
                 */
                u64 mask = user_xfeatures | xfeatures_mask_supervisor();

                fpregs->xsave.header.xfeatures &= mask;
                success = !os_xrstor_safe(fpu->fpstate,
                                          fpu_kernel_cfg.max_features);
        } else {
                success = !fxrstor_safe(&fpregs->fxsave);
        }

        if (likely(success))
                fpregs_mark_activate();

        fpregs_unlock();
        return success;
}

/*
 * Size of the state image in a signal frame: the fpstate's user_size, plus
 * room for the trailing second magic word when XSAVE is in use.
 */
static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate)
{
        unsigned int bytes = fpstate->user_size;

        if (use_xsave())
                bytes += FP_XSTATE_MAGIC2_SIZE;

        return bytes;
}

/*
 * Restore FPU state from a sigframe.
 *
 * @buf:        user frame, or NULL to just reset the user states
 * @ia32_frame: non-zero for an ia32 frame; only honoured on kernels built
 *              with X86_32 or IA32_EMULATION
 *
 * Returns true on success.  On any failure the user states are cleared and
 * false is returned.
 */
bool fpu__restore_sig(void __user *buf, int ia32_frame)
{
        struct fpu *fpu = &current->thread.fpu;
        void __user *fx_area = buf;
        bool fx_quirk = false;
        bool success = false;
        unsigned int frame_bytes;

        if (unlikely(!buf)) {
                fpu__clear_user_states(fpu);
                return true;
        }

        frame_bytes = xstate_sigframe_size(fpu->fpstate);

        ia32_frame &= (IS_ENABLED(CONFIG_X86_32) ||
                       IS_ENABLED(CONFIG_IA32_EMULATION));

        /*
         * Only FXSR enabled systems need the FX state quirk.
         * FRSTOR does not need it and can use the fast path.
         */
        if (ia32_frame && use_fxsr()) {
                fx_area = buf + sizeof(struct fregs_state);
                frame_bytes += sizeof(struct fregs_state);
                fx_quirk = true;
        }

        if (access_ok(buf, frame_bytes)) {
                if (IS_ENABLED(CONFIG_X86_64) ||
                    cpu_feature_enabled(X86_FEATURE_FPU)) {
                        success = __fpu_restore_sig(buf, fx_area, fx_quirk);
                } else {
                        success = !fpregs_soft_set(current, NULL, 0,
                                                   sizeof(struct user_i387_ia32_struct),
                                                   NULL, buf);
                }
        }

        if (unlikely(!success))
                fpu__clear_user_states(fpu);

        return success;
}

/*
 * Reserve room for the FPU state below @sp.
 *
 * The state image is placed at a 64-byte aligned address, reported through
 * @buf_fx.  For ia32 frames on FXSR systems a struct fregs_state is placed
 * directly below it.  @size receives the total number of bytes reserved.
 *
 * Returns the new (lower) stack pointer.
 */
unsigned long
fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
                     unsigned long *buf_fx, unsigned long *size)
{
        unsigned long total = xstate_sigframe_size(current->thread.fpu.fpstate);

        sp = round_down(sp - total, 64);
        *buf_fx = sp;

        if (ia32_frame && use_fxsr()) {
                sp -= sizeof(struct fregs_state);
                total += sizeof(struct fregs_state);
        }

        *size = total;

        return sp;
}

/*
 * Worst-case number of bytes the FPU state occupies in a signal frame:
 * fpu_user_cfg.max_size, plus the second magic word when XSAVE is used,
 * plus a struct fregs_state where ia32 frames are possible.
 */
unsigned long __init fpu__get_fpstate_size(void)
{
        const bool ia32_possible = IS_ENABLED(CONFIG_IA32_EMULATION) ||
                                   IS_ENABLED(CONFIG_X86_32);
        unsigned long total = fpu_user_cfg.max_size;

        if (use_xsave())
                total += FP_XSTATE_MAGIC2_SIZE;

        /*
         * This space is needed on (most) 32-bit kernels, or when a 32-bit
         * app is running on a 64-bit kernel. To keep things simple, just
         * assume the worst case and always include space for 'fregs_state',
         * even for 64-bit apps on 64-bit kernels. This wastes a bit of
         * space, but keeps the code simple.
         */
        if (ia32_possible && use_fxsr())
                total += sizeof(struct fregs_state);

        return total;
}





























































































































































































    3 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_64_H
#define _ASM_X86_UACCESS_64_H

/*
 * User space memory access functions
 */
#include <linux/compiler.h>
#include <linux/lockdep.h>
#include <linux/kasan-checks.h>
#include <asm/alternative.h>
#include <asm/cpufeatures.h>
#include <asm/page.h>
#include <asm/percpu.h>

#ifdef CONFIG_ADDRESS_MASKING
/*
 * Mask out tag bits from the address.
 *
 * The asm is an ALTERNATIVE whose default is empty; with X86_FEATURE_LAM it
 * becomes an 'and' of @addr with the per-CPU tlbstate_untag_mask.  Returns
 * the (possibly masked) address.
 */
static inline unsigned long __untagged_addr(unsigned long addr)
{
        asm (ALTERNATIVE("",
                         "and " __percpu_arg([mask]) ", %[addr]", X86_FEATURE_LAM)
             : [addr] "+r" (addr)
             : [mask] "m" (__my_cpu_var(tlbstate_untag_mask)));

        return addr;
}

/*
 * Type-preserving wrapper around __untagged_addr(): @addr is cast to
 * unsigned long, passed through __untagged_addr(), and the result is cast
 * back to the type of @addr.
 */
#define untagged_addr(addr)        ({                                        \
        unsigned long __addr = (__force unsigned long)(addr);                \
        (__force __typeof__(addr))__untagged_addr(__addr);                \
})

/*
 * Mask @addr with the untag mask stored in @mm's context.
 * Asserts, via mmap_assert_locked(), that @mm's mmap lock is held.
 */
static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
                                                   unsigned long addr)
{
        unsigned long mask;

        mmap_assert_locked(mm);
        mask = mm->context.untag_mask;

        return addr & mask;
}

/*
 * Type-preserving wrapper around __untagged_addr_remote(): @addr is cast to
 * unsigned long, masked for @mm, and the result is cast back to the type of
 * @addr.
 */
#define untagged_addr_remote(mm, addr)        ({                                \
        unsigned long __addr = (__force unsigned long)(addr);                \
        (__force __typeof__(addr))__untagged_addr_remote(mm, __addr);        \
})

#endif

/*
 * The virtual address space is logically divided into a kernel
 * half and a user half.  When cast to a signed type, user pointers
 * are positive and kernel pointers are negative.
 *
 * valid_user_address(x) is therefore true when (long)x is non-negative.
 */
#define valid_user_address(x) ((__force long)(x) >= 0)

/*
 * User pointers can have tag bits on x86-64.  This scheme tolerates
 * arbitrary values in those bits rather than masking them off.
 *
 * Enforce two rules:
 * 1. 'ptr' must be in the user half of the address space
 * 2. 'ptr+size' must not overflow into kernel addresses
 *
 * Note that addresses around the sign change are not valid addresses,
 * and will GP-fault even with LAM enabled if the sign bit is set (see
 * "CR3.LAM_SUP" that can narrow the canonicality check if we ever
 * enable it, but not remove it entirely).
 *
 * So the "overflow into kernel addresses" does not imply some sudden
 * exact boundary at the sign bit, and we can allow a lot of slop on the
 * size check.
 *
 * In fact, we could probably remove the size check entirely, since
 * any kernel accesses will be in increasing address order starting
 * at 'ptr', and even if the end might be in kernel space, we'll
 * hit the GP faults for non-canonical accesses before we ever get
 * there.
 *
 * That's a separate optimization, for now just handle the small
 * constant case.
 */
static inline bool __access_ok(const void __user *ptr, unsigned long size)
{
        unsigned long first;
        unsigned long past;

        /* Small compile-time-constant sizes: only the start is checked. */
        if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE)
                return valid_user_address(ptr);

        first = (__force unsigned long)ptr;
        past = size + first;

        /* The end must be a user address and must not have wrapped around. */
        return valid_user_address(past) && past >= first;
}
#define __access_ok __access_ok

/*
 * Copy To/From Userspace
 */

/* Handles exceptions in both to and from, but doesn't do access_ok */
__must_check unsigned long
rep_movs_alternative(void *to, const void *from, unsigned len);

/*
 * Copy @len bytes from @from to @to between stac() and clac().
 *
 * The count lives in the "c" register operand and is updated by the asm in
 * place; its final value is returned.  An exception table entry
 * (_ASM_EXTABLE_UA) redirects a fault at label 1 to label 2, i.e. to the
 * end of the asm.
 */
static __always_inline __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned long len)
{
        stac();
        /*
         * If CPU has FSRM feature, use 'rep movs'.
         * Otherwise, use rep_movs_alternative.
         */
        asm volatile(
                "1:\n\t"
                ALTERNATIVE("rep movsb",
                            "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
                "2:\n"
                _ASM_EXTABLE_UA(1b, 2b)
                :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
                : : "memory", "rax");
        clac();
        return len;
}

/*
 * Copy @size bytes from user pointer @src to @dst via copy_user_generic()
 * and return its result.
 */
static __always_inline __must_check unsigned long
raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
{
        const void *from = (__force void *)src;

        return copy_user_generic(dst, from, size);
}

/*
 * Copy @size bytes from @src to user pointer @dst via copy_user_generic()
 * and return its result.
 */
static __always_inline __must_check unsigned long
raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
{
        void *to = (__force void *)dst;

        return copy_user_generic(to, src, size);
}

/* Declared here only; both are implemented outside this header. */
extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size);
extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);

/*
 * Copy @size bytes from user pointer @src to @dst with
 * __copy_user_nocache(), bracketed by stac()/clac().  @dst is first checked
 * with kasan_check_write().  Returns the helper's result converted to int.
 */
static inline int
__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
                                  unsigned size)
{
        long rc;

        kasan_check_write(dst, size);

        stac();
        rc = __copy_user_nocache(dst, src, size);
        clac();

        return rc;
}

/*
 * Copy @size bytes from user pointer @src to @dst with
 * __copy_user_flushcache() after checking @dst with kasan_check_write().
 * Returns the helper's result converted to int.
 */
static inline int
__copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
{
        long rc;

        kasan_check_write(dst, size);
        rc = __copy_user_flushcache(dst, src, size);

        return rc;
}

/*
 * Zero Userspace.
 */

__must_check unsigned long
rep_stos_alternative(void __user *addr, unsigned long len);

/*
 * __clear_user() - zero @size bytes at the user pointer @addr
 * @addr: user-space destination (bound to the "D" register operand)
 * @size: byte count (bound to the "c" register operand)
 *
 * Does not perform an access check itself; clear_user() does that before
 * calling here. The store runs between stac() and clac() with "a" loaded
 * with 0. @size is an in/out operand and its value after the asm is
 * returned.
 *
 * NOTE(review): the returned value is presumably the number of bytes that
 * could not be zeroed (0 on full success) - confirm against the fault
 * fixup behaviour.
 */
static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
{
        might_fault();
        stac();

        /*
         * No memory constraint because it doesn't change any memory gcc
         * knows about.
         */
        asm volatile(
                "1:\n\t"
                ALTERNATIVE("rep stosb",
                            "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
                "2:\n"
               /* Exception table entry pairing label 1 with label 2. */
               _ASM_EXTABLE_UA(1b, 2b)
               : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
               : "a" (0));

        clac();

        return size;
}

/*
 * clear_user() - access-checked wrapper around __clear_user()
 * @to: user-space destination
 * @n:  number of bytes to zero
 *
 * Return: @n unchanged when __access_ok() rejects the range, otherwise
 * whatever __clear_user() returns.
 */
static __always_inline unsigned long clear_user(void __user *to, unsigned long n)
{
        /* Rejected range: nothing was cleared, report everything as left. */
        if (!__access_ok(to, n))
                return n;

        return __clear_user(to, n);
}
#endif /* _ASM_X86_UACCESS_64_H */



































































   10 



   11 
    1 

   12 
























































































































































































































































































































































































































































































































































































































































































































    2 
    8 















































































































































































































































































































    2 


















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_SEQLOCK_H
#define __LINUX_SEQLOCK_H

/*
 * seqcount_t / seqlock_t - a reader-writer consistency mechanism with
 * lockless readers (read-only retry loops), and no writer starvation.
 *
 * See Documentation/locking/seqlock.rst
 *
 * Copyrights:
 * - Based on x86_64 vsyscall gettimeofday: Keith Owens, Andrea Arcangeli
 * - Sequence counters with associated locks, (C) 2020 Linutronix GmbH
 */

#include <linux/compiler.h>
#include <linux/kcsan-checks.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/preempt.h>
#include <linux/seqlock_types.h>
#include <linux/spinlock.h>

#include <asm/processor.h>

/*
 * The seqlock seqcount_t interface does not prescribe a precise sequence of
 * read begin/retry/end. For readers, typically there is a call to
 * read_seqcount_begin() and read_seqcount_retry(), however, there are more
 * esoteric cases which do not follow this pattern.
 *
 * As a consequence, we take the following best-effort approach for raw usage
 * via seqcount_t under KCSAN: upon beginning a seq-reader critical section,
 * pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as
 * atomics; if there is a matching read_seqcount_retry() call, no following
 * memory operations are considered atomic. Usage of the seqlock_t interface
 * is not affected.
 *
 * KCSAN_SEQLOCK_REGION_MAX is the count handed to kcsan_atomic_next() by
 * __read_seqcount_begin() and raw_read_seqcount(); the retry helper resets
 * it by passing 0.
 */
#define KCSAN_SEQLOCK_REGION_MAX 1000

/*
 * __seqcount_init() - low-level runtime initializer for seqcount_t
 * @s:    Pointer to the seqcount_t instance
 * @name: Name passed to lockdep_init_map() for the dep_map
 * @key:  Lock class key passed to lockdep_init_map()
 *
 * Sets up the lockdep map of @s and resets its sequence number to zero.
 */
static inline void
__seqcount_init(seqcount_t *s, const char *name, struct lock_class_key *key)
{
        /* Lockdep first, so that reinitializing a held lock is caught. */
        lockdep_init_map(&s->dep_map, name, key, 0);

        /* A fresh counter starts out even. */
        s->sequence = 0;
}

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/*
 * Designated-initializer fragment for the dep_map member: names the map
 * after the stringified @lockname.
 */
# define SEQCOUNT_DEP_MAP_INIT(lockname)                                \
                .dep_map = { .name = #lockname }

/**
 * seqcount_init() - runtime initializer for seqcount_t
 * @s: Pointer to the seqcount_t instance
 *
 * Each expansion of this macro declares its own static lock_class_key and
 * passes it, together with the stringified @s expression as the name, to
 * __seqcount_init().
 */
# define seqcount_init(s)                                                \
        do {                                                                \
                static struct lock_class_key __key;                        \
                __seqcount_init((s), #s, &__key);                        \
        } while (0)

/*
 * seqcount_lockdep_reader_access() - lockdep annotation for a read section
 * @s: Pointer to the seqcount_t instance
 *
 * With local interrupts saved and disabled, records a read acquisition of
 * the dep_map of @s and releases it again right away. The sequence number
 * itself is not touched.
 */
static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
{
        unsigned long irqflags;
        /* The lockdep hooks take a non-const map, hence the cast. */
        seqcount_t *sc = (seqcount_t *)s;

        local_irq_save(irqflags);
        seqcount_acquire_read(&sc->dep_map, 0, 0, _RET_IP_);
        seqcount_release(&sc->dep_map, _RET_IP_);
        local_irq_restore(irqflags);
}

#else
/*
 * !CONFIG_DEBUG_LOCK_ALLOC: the dep_map initializer and the reader
 * annotation expand to nothing, and seqcount_init() passes NULL for both
 * the name and the lock class key.
 */
# define SEQCOUNT_DEP_MAP_INIT(lockname)
# define seqcount_init(s) __seqcount_init(s, NULL, NULL)
# define seqcount_lockdep_reader_access(x)
#endif

/**
 * SEQCNT_ZERO() - static initializer for seqcount_t
 * @name: Name of the seqcount_t instance
 *
 * Expands to a brace-enclosed initializer that sets the sequence number to
 * zero and appends SEQCOUNT_DEP_MAP_INIT(@name).
 */
#define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) }

/*
 * Sequence counters with associated locks (seqcount_LOCKNAME_t)
 *
 * A sequence counter which associates the lock used for writer
 * serialization at initialization time. This enables lockdep to validate
 * that the write side critical section is properly serialized.
 *
 * For associated locks which do not implicitly disable preemption,
 * preemption protection is enforced in the write side function.
 *
 * Lockdep is never used in any of the raw write variants.
 *
 * See Documentation/locking/seqlock.rst
 */

/*
 * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated
 * @seqcount:        The real sequence counter
 * @lock:        Pointer to the associated lock
 *
 * A plain sequence counter with external writer synchronization by
 * LOCKNAME @lock. The lock is associated to the sequence counter in the
 * static initializer or init function. This enables lockdep to validate
 * that the write side critical section is properly serialized.
 *
 * LOCKNAME:        raw_spinlock, spinlock, rwlock or mutex
 */

/*
 * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t
 * @s:                Pointer to the seqcount_LOCKNAME_t instance
 * @_lock:        Pointer to the associated lock
 * @lockname:        "LOCKNAME" part of seqcount_LOCKNAME_t; selects the type
 *                that @s is checked against
 *
 * Initializes the embedded seqcount_t with seqcount_init() and records
 * @_lock in the lock member. The assignment is wrapped in __SEQ_LOCK().
 *
 * NOTE(review): __SEQ_LOCK() is defined outside this file; presumably it
 * drops its argument when the lock member is not compiled in - confirm.
 */

#define seqcount_LOCKNAME_init(s, _lock, lockname)                        \
        do {                                                                \
                seqcount_##lockname##_t *____s = (s);                        \
                seqcount_init(&____s->seqcount);                        \
                __SEQ_LOCK(____s->lock = (_lock));                        \
        } while (0)

/*
 * Per-lock-type front ends for seqcount_LOCKNAME_init(): each one fixes
 * the lockname argument to the matching seqcount_LOCKNAME_t variant.
 */
#define seqcount_raw_spinlock_init(s, lock)        seqcount_LOCKNAME_init(s, lock, raw_spinlock)
#define seqcount_spinlock_init(s, lock)                seqcount_LOCKNAME_init(s, lock, spinlock)
#define seqcount_rwlock_init(s, lock)                seqcount_LOCKNAME_init(s, lock, rwlock)
#define seqcount_mutex_init(s, lock)                seqcount_LOCKNAME_init(s, lock, mutex)

/*
 * SEQCOUNT_LOCKNAME()        - Instantiate seqcount_LOCKNAME_t and helpers
 * seqprop_LOCKNAME_*()        - Property accessors for seqcount_LOCKNAME_t
 *
 * @lockname:                "LOCKNAME" part of seqcount_LOCKNAME_t
 * @locktype:                LOCKNAME canonical C data type (not referenced by
 *                        the macro body below)
 * @preemptible:        preemptibility of above locktype
 * @lockbase:                prefix for associated lock/unlock
 *
 * Generated helpers:
 *
 * __seqprop_LOCKNAME_ptr() / __seqprop_LOCKNAME_const_ptr()
 *        Return the address of the embedded seqcount_t.
 *
 * __seqprop_LOCKNAME_sequence()
 *        Read the sequence number with READ_ONCE(). Without
 *        CONFIG_PREEMPT_RT that value is returned as is. With
 *        CONFIG_PREEMPT_RT, if @preemptible is true and the value is odd,
 *        the associated lock is taken and released via
 *        @lockbase##_lock()/@lockbase##_unlock() and the sequence number is
 *        read again.
 *
 * __seqprop_LOCKNAME_preemptible()
 *        Return @preemptible without CONFIG_PREEMPT_RT, false with it.
 *
 * __seqprop_LOCKNAME_assert()
 *        lockdep_assert_held() on the associated lock, wrapped in
 *        __SEQ_LOCK().
 */
#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase)        \
static __always_inline seqcount_t *                                        \
__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s)                        \
{                                                                        \
        return &s->seqcount;                                                \
}                                                                        \
                                                                        \
static __always_inline const seqcount_t *                                \
__seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s)        \
{                                                                        \
        return &s->seqcount;                                                \
}                                                                        \
                                                                        \
static __always_inline unsigned                                                \
__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s)        \
{                                                                        \
        unsigned seq = READ_ONCE(s->seqcount.sequence);                        \
                                                                        \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))                                \
                return seq;                                                \
                                                                        \
        if (preemptible && unlikely(seq & 1)) {                                \
                __SEQ_LOCK(lockbase##_lock(s->lock));                        \
                __SEQ_LOCK(lockbase##_unlock(s->lock));                        \
                                                                        \
                /*                                                        \
                 * Re-read the sequence counter since the (possibly        \
                 * preempted) writer made progress.                        \
                 */                                                        \
                seq = READ_ONCE(s->seqcount.sequence);                        \
        }                                                                \
                                                                        \
        return seq;                                                        \
}                                                                        \
                                                                        \
static __always_inline bool                                                \
__seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s)        \
{                                                                        \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))                                \
                return preemptible;                                        \
                                                                        \
        /* PREEMPT_RT relies on the above LOCK+UNLOCK */                \
        return false;                                                        \
}                                                                        \
                                                                        \
static __always_inline void                                                \
__seqprop_##lockname##_assert(const seqcount_##lockname##_t *s)                \
{                                                                        \
        __SEQ_LOCK(lockdep_assert_held(s->lock));                        \
}

/*
 * __seqprop() for seqcount_t
 */

/* Plain seqcount_t: the object is its own sequence counter. */
static inline seqcount_t *__seqprop_ptr(seqcount_t *s)
{
        return s;
}

/* Const-qualified counterpart of __seqprop_ptr(). */
static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s)
{
        return s;
}

/*
 * Plain seqcount_t: sample the sequence number once with READ_ONCE() and
 * return it as is.
 */
static inline unsigned __seqprop_sequence(const seqcount_t *s)
{
        unsigned cur = READ_ONCE(s->sequence);

        return cur;
}

/*
 * Plain seqcount_t: always reports false, so the write-side macros never
 * disable preemption on its behalf.
 */
static inline bool __seqprop_preemptible(const seqcount_t *s)
{
        return false;
}

/*
 * Plain seqcount_t: there is no associated lock to check, so assert that
 * preemption is disabled instead.
 */
static inline void __seqprop_assert(const seqcount_t *s)
{
        lockdep_assert_preemption_disabled();
}

/* True only when CONFIG_PREEMPT_RT is enabled. */
#define __SEQ_RT        IS_ENABLED(CONFIG_PREEMPT_RT)

/*
 * Instantiate the property accessors for every supported lock type.
 * raw_spinlock is never marked preemptible, mutex always is, and spinlock
 * and rwlock are marked preemptible only when CONFIG_PREEMPT_RT is enabled.
 * Note that rwlock uses the read_lock()/read_unlock() pair as its lockbase.
 */
SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t,  false,    raw_spin)
SEQCOUNT_LOCKNAME(spinlock,     spinlock_t,      __SEQ_RT, spin)
SEQCOUNT_LOCKNAME(rwlock,       rwlock_t,        __SEQ_RT, read)
SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
#undef SEQCOUNT_LOCKNAME

/*
 * SEQCOUNT_LOCKNAME_ZERO - common static initializer behind the
 *                          SEQCNT_<LOCKNAME>_ZERO() macros
 * @seq_name:        Name of the seqcount_LOCKNAME_t instance
 * @assoc_lock:        Pointer to the associated LOCKNAME
 *
 * Initializes the embedded seqcount_t with SEQCNT_ZERO() and sets the lock
 * member to @assoc_lock; the latter is wrapped in __SEQ_LOCK().
 */

#define SEQCOUNT_LOCKNAME_ZERO(seq_name, assoc_lock) {                        \
        .seqcount                = SEQCNT_ZERO(seq_name.seqcount),        \
        __SEQ_LOCK(.lock        = (assoc_lock))                                \
}

/*
 * Per-lock-type static initializers. All of them expand to the same
 * SEQCOUNT_LOCKNAME_ZERO() body.
 */
#define SEQCNT_RAW_SPINLOCK_ZERO(name, lock)        SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_SPINLOCK_ZERO(name, lock)        SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_RWLOCK_ZERO(name, lock)                SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_MUTEX_ZERO(name, lock)                SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_WW_MUTEX_ZERO(name, lock)         SEQCOUNT_LOCKNAME_ZERO(name, lock)

/*
 * One _Generic association: maps seqcount_LOCKNAME_t to the matching
 * __seqprop_LOCKNAME_<prop> accessor. The @s argument is not used in the
 * expansion.
 */
#define __seqprop_case(s, lockname, prop)                                \
        seqcount_##lockname##_t: __seqprop_##lockname##_##prop

/*
 * Select the property accessor for @prop based on the type that @s points
 * to: plain seqcount_t or one of the seqcount_LOCKNAME_t variants. The
 * result is the accessor function itself; the caller applies it to @s.
 */
#define __seqprop(s, prop) _Generic(*(s),                                \
        seqcount_t:                __seqprop_##prop,                        \
        __seqprop_case((s),        raw_spinlock,        prop),                        \
        __seqprop_case((s),        spinlock,        prop),                        \
        __seqprop_case((s),        rwlock,                prop),                        \
        __seqprop_case((s),        mutex,                prop))

/*
 * Type-generic property accessors: pick the accessor with __seqprop() and
 * call it on @s. Note that @s appears twice in each expansion.
 */
#define seqprop_ptr(s)                        __seqprop(s, ptr)(s)
#define seqprop_const_ptr(s)                __seqprop(s, const_ptr)(s)
#define seqprop_sequence(s)                __seqprop(s, sequence)(s)
#define seqprop_preemptible(s)                __seqprop(s, preemptible)(s)
#define seqprop_assert(s)                __seqprop(s, assert)(s)

/**
 * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb()
 * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
 * provided before actually loading any of the variables that are to be
 * protected in this critical section.
 *
 * The sequence number is re-sampled, with cpu_relax() in between, for as
 * long as it is odd; the returned value is therefore always even.
 *
 * Use carefully, only in critical code, and comment how the barrier is
 * provided.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define __read_seqcount_begin(s)                                        \
({                                                                        \
        unsigned __seq;                                                        \
                                                                        \
        while ((__seq = seqprop_sequence(s)) & 1)                        \
                cpu_relax();                                                \
                                                                        \
        kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);                        \
        __seq;                                                                \
})

/**
 * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Equivalent to __read_seqcount_begin() followed by smp_rmb().
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_read_seqcount_begin(s)                                        \
({                                                                        \
        unsigned _seq = __read_seqcount_begin(s);                        \
                                                                        \
        smp_rmb();                                                        \
        _seq;                                                                \
})

/**
 * read_seqcount_begin() - begin a seqcount_t read critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Performs the lockdep reader annotation through
 * seqcount_lockdep_reader_access() and then behaves like
 * raw_read_seqcount_begin().
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define read_seqcount_begin(s)                                                \
({                                                                        \
        seqcount_lockdep_reader_access(seqprop_const_ptr(s));                \
        raw_read_seqcount_begin(s);                                        \
})

/**
 * raw_read_seqcount() - read the raw seqcount_t counter value
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * raw_read_seqcount opens a read critical section of the given
 * seqcount_t, without any lockdep checking, and without checking or
 * masking the sequence counter LSB. Calling code is responsible for
 * handling that.
 *
 * The counter is sampled once and an smp_rmb() is issued afterwards; an
 * odd value is returned as is, without waiting.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_read_seqcount(s)                                                \
({                                                                        \
        unsigned __seq = seqprop_sequence(s);                                \
                                                                        \
        smp_rmb();                                                        \
        kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);                        \
        __seq;                                                                \
})

/**
 * raw_seqcount_begin() - begin a seqcount_t read critical section w/o
 *                        lockdep and w/o counter stabilization
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * raw_seqcount_begin opens a read critical section of the given
 * seqcount_t. Unlike read_seqcount_begin(), this function will not wait
 * for the count to stabilize. If a writer is active when it begins, it
 * will fail the read_seqcount_retry() at the end of the read critical
 * section instead of stabilizing at the beginning of it.
 *
 * This works by clearing the LSB of the value returned by
 * raw_read_seqcount(): an odd sample is turned into the preceding even
 * value, which no longer equals the counter it was sampled from.
 *
 * Use this only in special kernel hot paths where the read section is
 * small and has a high probability of success through other external
 * means. It will save a single branching instruction.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_seqcount_begin(s)                                                \
({                                                                        \
        /*                                                                \
         * If the counter is odd, let read_seqcount_retry() fail        \
         * by decrementing the counter.                                        \
         */                                                                \
        raw_read_seqcount(s) & ~1;                                        \
})

/**
 * __read_seqcount_retry() - end a seqcount_t read section w/o barrier
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @start: count, from read_seqcount_begin()
 *
 * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb()
 * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
 * provided after loading the variables that are protected in this critical
 * section and before calling this function, which is where
 * read_seqcount_retry() issues its barrier.
 *
 * Use carefully, only in critical code, and comment how the barrier is
 * provided.
 *
 * Return: true if a read section retry is required, else false
 */
#define __read_seqcount_retry(s, start)                                        \
        do___read_seqcount_retry(seqprop_const_ptr(s), start)

/*
 * Backend of __read_seqcount_retry(): resets the KCSAN "next accesses are
 * atomic" count to zero, then reports whether the sequence number differs
 * from @start. No barrier is issued here.
 */
static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start)
{
        unsigned now;

        kcsan_atomic_next(0);
        now = READ_ONCE(s->sequence);

        return unlikely(now != start);
}

/**
 * read_seqcount_retry() - end a seqcount_t read critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @start: count, from read_seqcount_begin()
 *
 * read_seqcount_retry closes the read critical section of given
 * seqcount_t.  If the critical section was invalid, it must be ignored
 * (and typically retried).
 *
 * An smp_rmb() is issued before the sequence number is read again and
 * compared against @start.
 *
 * Return: true if a read section retry is required, else false
 */
#define read_seqcount_retry(s, start)                                        \
        do_read_seqcount_retry(seqprop_const_ptr(s), start)

/*
 * Backend of read_seqcount_retry(): read barrier first, then the
 * barrier-less sequence comparison.
 */
static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start)
{
        /* Order the caller's earlier loads before the counter re-read. */
        smp_rmb();
        return do___read_seqcount_retry(s, start);
}

/**
 * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Disables preemption first if seqprop_preemptible() reports true for @s,
 * then increments the sequence number via do_raw_write_seqcount_begin().
 *
 * Context: check write_seqcount_begin()
 */
#define raw_write_seqcount_begin(s)                                        \
do {                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_raw_write_seqcount_begin(seqprop_ptr(s));                        \
} while (0)

/*
 * Backend of raw_write_seqcount_begin(): opens a KCSAN nestable atomic
 * region, bumps the sequence number and issues a write barrier after the
 * increment.
 */
static inline void do_raw_write_seqcount_begin(seqcount_t *s)
{
        kcsan_nestable_atomic_begin();
        s->sequence++;
        /* Order the increment before the writes of the write section. */
        smp_wmb();
}

/**
 * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Increments the sequence number via do_raw_write_seqcount_end(), then
 * re-enables preemption if seqprop_preemptible() reports true for @s.
 *
 * Context: check write_seqcount_end()
 */
#define raw_write_seqcount_end(s)                                        \
do {                                                                        \
        do_raw_write_seqcount_end(seqprop_ptr(s));                        \
                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_enable();                                        \
} while (0)

/*
 * Backend of raw_write_seqcount_end(): issues a write barrier, bumps the
 * sequence number and closes the KCSAN nestable atomic region.
 */
static inline void do_raw_write_seqcount_end(seqcount_t *s)
{
        /* Order the writes of the write section before the increment. */
        smp_wmb();
        s->sequence++;
        kcsan_nestable_atomic_end();
}

/**
 * write_seqcount_begin_nested() - start a seqcount_t write section with
 *                                 custom lockdep nesting level
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @subclass: lockdep nesting level
 *
 * Runs seqprop_assert() on @s, disables preemption if seqprop_preemptible()
 * reports true, and then starts the write section with the given lockdep
 * @subclass.
 *
 * See Documentation/locking/lockdep-design.rst
 * Context: check write_seqcount_begin()
 */
#define write_seqcount_begin_nested(s, subclass)                        \
do {                                                                        \
        seqprop_assert(s);                                                \
                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_write_seqcount_begin_nested(seqprop_ptr(s), subclass);        \
} while (0)

/*
 * Backend of write_seqcount_begin_nested(): lockdep acquire annotation
 * with the given @subclass, followed by the raw write-begin step.
 */
static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass)
{
        seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_);
        do_raw_write_seqcount_begin(s);
}

/**
 * write_seqcount_begin() - start a seqcount_t write side critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Runs seqprop_assert() on @s before anything else.
 *
 * Context: sequence counter write side sections must be serialized and
 * non-preemptible. Preemption will be automatically disabled if and
 * only if the seqcount write serialization lock is associated, and
 * preemptible.  If readers can be invoked from hardirq or softirq
 * context, interrupts or bottom halves must be respectively disabled.
 */
#define write_seqcount_begin(s)                                                \
do {                                                                        \
        seqprop_assert(s);                                                \
                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_write_seqcount_begin(seqprop_ptr(s));                        \
} while (0)

/*
 * Backend of write_seqcount_begin(): same as the nested variant with
 * lockdep subclass 0.
 */
static inline void do_write_seqcount_begin(seqcount_t *s)
{
        do_write_seqcount_begin_nested(s, 0);
}

/**
 * write_seqcount_end() - end a seqcount_t write side critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Ends the write section via do_write_seqcount_end() before preemption is
 * re-enabled.
 *
 * Context: Preemption will be automatically re-enabled if and only if
 * the seqcount write serialization lock is associated, and preemptible.
 */
#define write_seqcount_end(s)                                                \
do {                                                                        \
        do_write_seqcount_end(seqprop_ptr(s));                                \
                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_enable();                                        \
} while (0)

/*
 * Backend of write_seqcount_end(): lockdep release annotation, followed by
 * the raw write-end step.
 */
static inline void do_write_seqcount_end(seqcount_t *s)
{
        seqcount_release(&s->dep_map, _RET_IP_);
        do_raw_write_seqcount_end(s);
}

/**
 * raw_write_seqcount_barrier() - do a seqcount_t write barrier
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * This can be used to provide an ordering guarantee instead of the usual
 * consistency guarantee. It is one wmb cheaper, because it can collapse
 * the two back-to-back wmb()s.
 *
 * Note that writes surrounding the barrier should be declared atomic (e.g.
 * via WRITE_ONCE): a) to ensure the writes become visible to other threads
 * atomically, avoiding compiler optimizations; b) to document which writes are
 * meant to propagate to the reader critical section. This is necessary because
 * neither writes before nor after the barrier are enclosed in a seq-writer
 * critical section that would ensure readers are aware of ongoing writes::
 *
 *        seqcount_t seq;
 *        bool X = true, Y = false;
 *
 *        void read(void)
 *        {
 *                bool x, y;
 *                unsigned s;
 *
 *                do {
 *                        s = read_seqcount_begin(&seq);
 *
 *                        x = X; y = Y;
 *
 *                } while (read_seqcount_retry(&seq, s));
 *
 *                BUG_ON(!x && !y);
 *        }
 *
 *        void write(void)
 *        {
 *                WRITE_ONCE(Y, true);
 *
 *                raw_write_seqcount_barrier(&seq);
 *
 *                WRITE_ONCE(X, false);
 *        }
 */
#define raw_write_seqcount_barrier(s)                                        \
        do_raw_write_seqcount_barrier(seqprop_ptr(s))

/*
 * Backend of raw_write_seqcount_barrier(): two sequence increments with a
 * single write barrier in between, all inside one KCSAN nestable atomic
 * region. The counter ends up advanced by two.
 */
static inline void do_raw_write_seqcount_barrier(seqcount_t *s)
{
        kcsan_nestable_atomic_begin();
        s->sequence++;
        smp_wmb();
        s->sequence++;
        kcsan_nestable_atomic_end();
}

/**
 * write_seqcount_invalidate() - invalidate in-progress seqcount_t read
 *                               side operations
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * After write_seqcount_invalidate, no seqcount_t read side operations
 * will complete successfully and see data older than this.
 */
#define write_seqcount_invalidate(s)                                        \
        do_write_seqcount_invalidate(seqprop_ptr(s))

/*
 * Implementation of write_seqcount_invalidate(): issue a write barrier, then
 * advance the sequence count by two in a single step.
 */
static inline void do_write_seqcount_invalidate(seqcount_t *s)
{
        /* Order the caller's earlier stores before the count update. */
        smp_wmb();
        kcsan_nestable_atomic_begin();
        /* Adding 2 leaves the parity of the count unchanged. */
        s->sequence+=2;
        kcsan_nestable_atomic_end();
}

/*
 * Latch sequence counters (seqcount_latch_t)
 *
 * A sequence counter variant where the counter even/odd value is used to
 * switch between two copies of protected data. This allows the read path,
 * typically NMIs, to safely interrupt the write side critical section.
 *
 * As the write sections are fully preemptible, no special handling for
 * PREEMPT_RT is needed.
 */
typedef struct {
        seqcount_t seqcount;        /* the wrapped plain sequence counter */
} seqcount_latch_t;

/**
 * SEQCNT_LATCH_ZERO() - static initializer for seqcount_latch_t
 * @seq_name: Name of the seqcount_latch_t instance
 *
 * Expands to a designated initializer that sets the embedded seqcount_t
 * through SEQCNT_ZERO().
 */
#define SEQCNT_LATCH_ZERO(seq_name) {                                        \
        .seqcount                = SEQCNT_ZERO(seq_name.seqcount),        \
}

/**
 * seqcount_latch_init() - runtime initializer for seqcount_latch_t
 * @s: Pointer to the seqcount_latch_t instance
 *
 * Forwards to seqcount_init() on the embedded seqcount_t.
 */
#define seqcount_latch_init(s) seqcount_init(&(s)->seqcount)

/**
 * raw_read_seqcount_latch() - pick even/odd latch data copy
 * @s: Pointer to seqcount_latch_t
 *
 * See raw_write_seqcount_latch() for details and a full reader/writer
 * usage example.
 *
 * Return: sequence counter raw value. Use the lowest bit as an index for
 * picking which data copy to read. The full counter must then be checked
 * with raw_read_seqcount_latch_retry().
 */
static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
{
        /*
         * Pairs with the first smp_wmb() in raw_write_seqcount_latch().
         * Due to the dependent load, a full smp_rmb() is not needed.
         *
         * The counter is sampled once with READ_ONCE() and returned
         * unmodified; the caller derives the data copy index from it.
         */
        return READ_ONCE(s->seqcount.sequence);
}

/**
 * raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section
 * @s:                Pointer to seqcount_latch_t
 * @start:        count, from raw_read_seqcount_latch()
 *
 * Return: true if a read section retry is required, else false
 */
static __always_inline int
raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
{
        /* Order the preceding data reads before re-reading the counter. */
        smp_rmb();
        /* Any difference from @start means the read section must be retried. */
        return unlikely(READ_ONCE(s->seqcount.sequence) != start);
}

/**
 * raw_write_seqcount_latch() - redirect latch readers to even/odd copy
 * @s: Pointer to seqcount_latch_t
 *
 * The latch technique is a multiversion concurrency control method that allows
 * queries during non-atomic modifications. If you can guarantee queries never
 * interrupt the modification -- e.g. the concurrency is strictly between CPUs
 * -- you most likely do not need this.
 *
 * Where the traditional RCU/lockless data structures rely on atomic
 * modifications to ensure queries observe either the old or the new state,
 * the latch allows the same for non-atomic updates. The trade-off is doubling the
 * cost of storage; we have to maintain two copies of the entire data
 * structure.
 *
 * Very simply put: we first modify one copy and then the other. This ensures
 * there is always one copy in a stable state, ready to give us an answer.
 *
 * The basic form is a data structure like::
 *
 *        struct latch_struct {
 *                seqcount_latch_t        seq;
 *                struct data_struct        data[2];
 *        };
 *
 * Where a modification, which is assumed to be externally serialized, does the
 * following::
 *
 *        void latch_modify(struct latch_struct *latch, ...)
 *        {
 *                smp_wmb();        // Ensure that the last data[1] update is visible
 *                latch->seq.sequence++;
 *                smp_wmb();        // Ensure that the seqcount update is visible
 *
 *                modify(latch->data[0], ...);
 *
 *                smp_wmb();        // Ensure that the data[0] update is visible
 *                latch->seq.sequence++;
 *                smp_wmb();        // Ensure that the seqcount update is visible
 *
 *                modify(latch->data[1], ...);
 *        }
 *
 * The query will have a form like::
 *
 *        struct entry *latch_query(struct latch_struct *latch, ...)
 *        {
 *                struct entry *entry;
 *                unsigned seq, idx;
 *
 *                do {
 *                        seq = raw_read_seqcount_latch(&latch->seq);
 *
 *                        idx = seq & 0x01;
 *                        entry = data_query(latch->data[idx], ...);
 *
 *                // This includes needed smp_rmb()
 *                } while (raw_read_seqcount_latch_retry(&latch->seq, seq));
 *
 *                return entry;
 *        }
 *
 * So during the modification, queries are first redirected to data[1]. Then we
 * modify data[0]. When that is complete, we redirect queries back to data[0]
 * and we can modify data[1].
 *
 * NOTE:
 *
 *        The non-requirement for atomic modifications does _NOT_ include
 *        the publishing of new entries in the case where data is a dynamic
 *        data structure.
 *
 *        An iteration might start in data[0] and get suspended long enough
 *        to miss an entire modification sequence, once it resumes it might
 *        observe the new entry.
 *
 * NOTE2:
 *
 *        When data is a dynamic data structure; one should use regular RCU
 *        patterns to manage the lifetimes of the objects within.
 */
static inline void raw_write_seqcount_latch(seqcount_latch_t *s)
{
        /*
         * A single increment flips the parity of the counter; the write
         * barriers on either side keep it ordered against the stores that
         * surround the call.
         */
        smp_wmb();        /* prior stores before incrementing "sequence" */
        s->seqcount.sequence++;
        smp_wmb();      /* increment "sequence" before following stores */
}

/*
 * Static initializer for a seqlock_t named @lockname: the embedded seqcount
 * is set up with SEQCNT_SPINLOCK_ZERO(), which is handed the address of the
 * embedded lock, and the lock itself with __SPIN_LOCK_UNLOCKED().
 */
#define __SEQLOCK_UNLOCKED(lockname)                                        \
        {                                                                \
                .seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \
                .lock =        __SPIN_LOCK_UNLOCKED(lockname)                        \
        }

/**
 * seqlock_init() - dynamic initializer for seqlock_t
 * @sl: Pointer to the seqlock_t instance
 *
 * Initializes the embedded spinlock first, then the embedded seqcount,
 * passing it the address of that spinlock.
 */
#define seqlock_init(sl)                                                \
        do {                                                                \
                spin_lock_init(&(sl)->lock);                                \
                seqcount_spinlock_init(&(sl)->seqcount, &(sl)->lock);        \
        } while (0)

/**
 * DEFINE_SEQLOCK(sl) - Define a statically allocated seqlock_t
 * @sl: Name of the seqlock_t instance
 *
 * The instance is initialized with __SEQLOCK_UNLOCKED().
 */
#define DEFINE_SEQLOCK(sl) \
                seqlock_t sl = __SEQLOCK_UNLOCKED(sl)

/**
 * read_seqbegin() - start a seqlock_t read side critical section
 * @sl: Pointer to seqlock_t
 *
 * Return: count, to be passed to read_seqretry()
 */
static inline unsigned read_seqbegin(const seqlock_t *sl)
{
        /* Sample the embedded seqcount; this value is handed back to the caller. */
        unsigned ret = read_seqcount_begin(&sl->seqcount);

        kcsan_atomic_next(0);  /* non-raw usage, assume closing read_seqretry() */
        /* Opens the KCSAN flat atomic region that read_seqretry() closes. */
        kcsan_flat_atomic_begin();
        return ret;
}

/**
 * read_seqretry() - end a seqlock_t read side section
 * @sl: Pointer to seqlock_t
 * @start: count, from read_seqbegin()
 *
 * read_seqretry closes the read side critical section of given seqlock_t.
 * If the critical section was invalid, it must be ignored (and typically
 * retried).
 *
 * Return: true if a read section retry is required, else false
 */
static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
{
        /*
         * Assume not nested: read_seqretry() may be called multiple times
         * when completing a read critical section, so the flat (non-nesting)
         * KCSAN region is ended here rather than a nestable one.
         */
        kcsan_flat_atomic_end();

        /* The verdict itself comes from the embedded seqcount. */
        return read_seqcount_retry(&sl->seqcount, start);
}

/*
 * For all seqlock_t write side functions, use the internal
 * do_write_seqcount_begin() instead of generic write_seqcount_begin().
 * This way, no redundant lockdep_assert_held() checks are added.
 */

/**
 * write_seqlock() - start a seqlock_t write side critical section
 * @sl: Pointer to seqlock_t
 *
 * write_seqlock opens a write side critical section for the given
 * seqlock_t.  It also implicitly acquires the spinlock_t embedded inside
 * that sequential lock. All seqlock_t write side sections are thus
 * automatically serialized and non-preemptible.
 *
 * Context: if the seqlock_t read section, or other write side critical
 * sections, can be invoked from hardirq or softirq contexts, use the
 * _irqsave or _bh variants of this function instead.
 */
static inline void write_seqlock(seqlock_t *sl)
{
        /* Take the embedded spinlock before opening the seqcount write section. */
        spin_lock(&sl->lock);
        /* Internal helper on the inner seqcount_t, not write_seqcount_begin(). */
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock() - end a seqlock_t write side critical section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock closes the (serialized and non-preemptible) write side
 * critical section of given seqlock_t.
 */
static inline void write_sequnlock(seqlock_t *sl)
{
        /* Close the seqcount write section while the spinlock is still held... */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        /* ...and only then release the lock (reverse order of write_seqlock()). */
        spin_unlock(&sl->lock);
}

/**
 * write_seqlock_bh() - start a softirqs-disabled seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * _bh variant of write_seqlock(). Use only if the read side section, or
 * other write side sections, can be invoked from softirq contexts.
 */
static inline void write_seqlock_bh(seqlock_t *sl)
{
        /* _bh lock variant first, then open the seqcount write section. */
        spin_lock_bh(&sl->lock);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock_bh() - end a softirqs-disabled seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock_bh closes the serialized, non-preemptible, and
 * softirqs-disabled, seqlock_t write side critical section opened with
 * write_seqlock_bh().
 */
static inline void write_sequnlock_bh(seqlock_t *sl)
{
        /* End the seqcount write section before dropping the _bh lock. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_bh(&sl->lock);
}

/**
 * write_seqlock_irq() - start a non-interruptible seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * _irq variant of write_seqlock(). Use only if the read side section, or
 * other write sections, can be invoked from hardirq contexts.
 */
static inline void write_seqlock_irq(seqlock_t *sl)
{
        /* _irq lock variant first, then open the seqcount write section. */
        spin_lock_irq(&sl->lock);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock_irq() - end a non-interruptible seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock_irq closes the serialized and non-interruptible
 * seqlock_t write side section opened with write_seqlock_irq().
 */
static inline void write_sequnlock_irq(seqlock_t *sl)
{
        /* End the seqcount write section before dropping the _irq lock. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_irq(&sl->lock);
}

/*
 * Helper behind write_seqlock_irqsave(): takes the lock, opens the seqcount
 * write section, and returns the saved interrupt flags by value so that the
 * macro can assign them to the caller's variable.
 */
static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
{
        unsigned long flags;

        spin_lock_irqsave(&sl->lock, flags);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
        return flags;
}

/**
 * write_seqlock_irqsave() - start a non-interruptible seqlock_t write
 *                           section
 * @lock:  Pointer to seqlock_t
 * @flags: Stack-allocated storage for saving caller's local interrupt
 *         state, to be passed to write_sequnlock_irqrestore().
 *
 * _irqsave variant of write_seqlock(). Use it only if the read side
 * section, or other write sections, can be invoked from hardirq context.
 *
 * @flags is assigned to directly, so it must be an lvalue, not a pointer.
 */
#define write_seqlock_irqsave(lock, flags)                                \
        do { flags = __write_seqlock_irqsave(lock); } while (0)

/**
 * write_sequnlock_irqrestore() - end non-interruptible seqlock_t write
 *                                section
 * @sl:    Pointer to seqlock_t
 * @flags: Caller's saved interrupt state, from write_seqlock_irqsave()
 *
 * write_sequnlock_irqrestore closes the serialized and non-interruptible
 * seqlock_t write section previously opened with write_seqlock_irqsave().
 */
static inline void
write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
{
        /* End the seqcount write section before unlocking with the saved @flags. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_irqrestore(&sl->lock, flags);
}

/**
 * read_seqlock_excl() - begin a seqlock_t locking reader section
 * @sl:        Pointer to seqlock_t
 *
 * read_seqlock_excl opens a seqlock_t locking reader critical section.  A
 * locking reader exclusively locks out *both* other writers *and* other
 * locking readers, but it does not update the embedded sequence number.
 *
 * Locking readers act like a normal spin_lock()/spin_unlock().
 *
 * Context: if the seqlock_t write section, *or other read sections*, can
 * be invoked from hardirq or softirq contexts, use the _irqsave or _bh
 * variant of this function instead.
 *
 * The opened read section must be closed with read_sequnlock_excl().
 */
static inline void read_seqlock_excl(seqlock_t *sl)
{
        /* Only the embedded spinlock is taken; the seqcount is not touched here. */
        spin_lock(&sl->lock);
}

/**
 * read_sequnlock_excl() - end a seqlock_t locking reader critical section
 * @sl: Pointer to seqlock_t
 */
static inline void read_sequnlock_excl(seqlock_t *sl)
{
        /* Counterpart of read_seqlock_excl(): just drops the embedded spinlock. */
        spin_unlock(&sl->lock);
}

/**
 * read_seqlock_excl_bh() - start a seqlock_t locking reader section with
 *                            softirqs disabled
 * @sl: Pointer to seqlock_t
 *
 * _bh variant of read_seqlock_excl(). Use this variant only if the
 * seqlock_t write side section, *or other read sections*, can be invoked
 * from softirq contexts.
 */
static inline void read_seqlock_excl_bh(seqlock_t *sl)
{
        /* _bh lock variant only; the seqcount is not touched here. */
        spin_lock_bh(&sl->lock);
}

/**
 * read_sequnlock_excl_bh() - stop a seqlock_t softirq-disabled locking
 *                              reader section
 * @sl: Pointer to seqlock_t
 */
static inline void read_sequnlock_excl_bh(seqlock_t *sl)
{
        /* Counterpart of read_seqlock_excl_bh(): drops the lock with the _bh variant. */
        spin_unlock_bh(&sl->lock);
}

/**
 * read_seqlock_excl_irq() - start a non-interruptible seqlock_t locking
 *                             reader section
 * @sl: Pointer to seqlock_t
 *
 * _irq variant of read_seqlock_excl(). Use this only if the seqlock_t
 * write side section, *or other read sections*, can be invoked from a
 * hardirq context.
 */
static inline void read_seqlock_excl_irq(seqlock_t *sl)
{
        /* _irq lock variant only; the seqcount is not touched here. */
        spin_lock_irq(&sl->lock);
}

/**
 * read_sequnlock_excl_irq() - end an interrupts-disabled seqlock_t
 *                             locking reader section
 * @sl: Pointer to seqlock_t
 */
static inline void read_sequnlock_excl_irq(seqlock_t *sl)
{
        /* Counterpart of read_seqlock_excl_irq(): drops the lock with the _irq variant. */
        spin_unlock_irq(&sl->lock);
}

/*
 * Helper behind read_seqlock_excl_irqsave(): takes the embedded spinlock
 * with spin_lock_irqsave() and returns the saved interrupt flags by value so
 * that the macro can assign them to the caller's variable.
 */
static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl)
{
        unsigned long flags;

        spin_lock_irqsave(&sl->lock, flags);
        return flags;
}

/**
 * read_seqlock_excl_irqsave() - start a non-interruptible seqlock_t
 *                                 locking reader section
 * @lock:  Pointer to seqlock_t
 * @flags: Stack-allocated storage for saving caller's local interrupt
 *         state, to be passed to read_sequnlock_excl_irqrestore().
 *
 * _irqsave variant of read_seqlock_excl(). Use this only if the seqlock_t
 * write side section, *or other read sections*, can be invoked from a
 * hardirq context.
 *
 * @flags is assigned to directly, so it must be an lvalue, not a pointer.
 */
#define read_seqlock_excl_irqsave(lock, flags)                                \
        do { flags = __read_seqlock_excl_irqsave(lock); } while (0)

/**
 * read_sequnlock_excl_irqrestore() - end non-interruptible seqlock_t
 *                                      locking reader section
 * @sl:    Pointer to seqlock_t
 * @flags: Caller saved interrupt state, from read_seqlock_excl_irqsave()
 */
static inline void
read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags)
{
        /* Drops the embedded spinlock, handing the saved @flags to the unlock. */
        spin_unlock_irqrestore(&sl->lock, flags);
}

/**
 * read_seqbegin_or_lock() - begin a seqlock_t lockless or locking reader
 * @lock: Pointer to seqlock_t
 * @seq: Marker and return parameter. If the passed value is even, the
 * reader will become a *lockless* seqlock_t reader as in read_seqbegin().
 * If the passed value is odd, the reader will become a *locking* reader
 * as in read_seqlock_excl().  In the first call to this function, the
 * caller *must* initialize and pass an even value to @seq; this way, a
 * lockless read can be optimistically tried first.
 *
 * read_seqbegin_or_lock is an API designed to optimistically try a normal
 * lockless seqlock_t read section first.  If an odd counter is found, the
 * lockless read trial has failed, and the next read iteration transforms
 * itself into a full seqlock_t locking reader.
 *
 * This is typically used to avoid seqlock_t lockless readers starvation
 * (too many retry loops) in the case of a sharp spike in write side
 * activity.
 *
 * Context: if the seqlock_t write section, *or other read sections*, can
 * be invoked from hardirq or softirq contexts, use the _irqsave or _bh
 * variant of this function instead.
 *
 * Check Documentation/locking/seqlock.rst for template example code.
 *
 * Return: the encountered sequence counter value, through the @seq
 * parameter, which is overloaded as a return parameter. This returned
 * value must be checked with need_seqretry(). If the read section needs to
 * be retried, this returned value must also be passed as the @seq
 * parameter of the next read_seqbegin_or_lock() iteration.
 */
static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
{
        /* Odd marker: become a locking reader; *seq is left as passed in. */
        if (*seq & 1) {
                read_seqlock_excl(lock);
                return;
        }

        /* Even marker: lockless attempt; hand the sampled count back via *seq. */
        *seq = read_seqbegin(lock);
}

/**
 * need_seqretry() - validate seqlock_t "locking or lockless" read section
 * @lock: Pointer to seqlock_t
 * @seq: sequence count, from read_seqbegin_or_lock()
 *
 * Return: true if a read section retry is required, false otherwise
 */
static inline int need_seqretry(seqlock_t *lock, int seq)
{
        /* An odd @seq marks a locking reader, which never needs a retry. */
        if (seq & 1)
                return 0;

        /* Lockless reader: report the retry verdict as exactly 0 or 1. */
        return read_seqretry(lock, seq) != 0;
}

/**
 * done_seqretry() - end seqlock_t "locking or lockless" reader section
 * @lock: Pointer to seqlock_t
 * @seq: count, from read_seqbegin_or_lock()
 *
 * done_seqretry finishes the seqlock_t read side critical section started
 * with read_seqbegin_or_lock() and validated by need_seqretry().
 */
static inline void done_seqretry(seqlock_t *lock, int seq)
{
        /* Even @seq: the section was lockless, so there is nothing to release. */
        if (!(seq & 1))
                return;

        /* Odd @seq: the section ran as a locking reader; drop the lock. */
        read_sequnlock_excl(lock);
}

/**
 * read_seqbegin_or_lock_irqsave() - begin a seqlock_t lockless reader, or
 *                                   a non-interruptible locking reader
 * @lock: Pointer to seqlock_t
 * @seq:  Marker and return parameter. Check read_seqbegin_or_lock().
 *
 * This is the _irqsave variant of read_seqbegin_or_lock(). Use it only if
 * the seqlock_t write section, *or other read sections*, can be invoked
 * from hardirq context.
 *
 * Note: Interrupts will be disabled only for "locking reader" mode.
 *
 * Return:
 *
 *   1. The saved local interrupts state in case of a locking reader, to
 *      be passed to done_seqretry_irqrestore().
 *
 *   2. The encountered sequence counter value, returned through @seq
 *      overloaded as a return parameter. Check read_seqbegin_or_lock().
 */
static inline unsigned long
read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq)
{
        unsigned long flags = 0;

        if (*seq & 1) {
                /* Odd marker: locking reader; the interrupt state lands in flags. */
                read_seqlock_excl_irqsave(lock, flags);
        } else {
                /* Even marker: lockless reader; flags keeps its initial 0. */
                *seq = read_seqbegin(lock);
        }

        return flags;
}

/**
 * done_seqretry_irqrestore() - end a seqlock_t lockless reader, or a
 *                                non-interruptible locking reader section
 * @lock:  Pointer to seqlock_t
 * @seq:   Count, from read_seqbegin_or_lock_irqsave()
 * @flags: Caller's saved local interrupt state in case of a locking
 *           reader, also from read_seqbegin_or_lock_irqsave()
 *
 * This is the _irqrestore variant of done_seqretry(). The read section
 * must've been opened with read_seqbegin_or_lock_irqsave(), and validated
 * by need_seqretry().
 */
static inline void
done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags)
{
        /* Even @seq: lockless section, no lock held and @flags is unused. */
        if (!(seq & 1))
                return;

        /* Odd @seq: locking reader; unlock and pass the saved @flags along. */
        read_sequnlock_excl_irqrestore(lock, flags);
}
#endif /* __LINUX_SEQLOCK_H */
















































































































































    1 






















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
// SPDX-License-Identifier: GPL-2.0
/*
 * fs/sysfs/symlink.c - sysfs symlink implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
 *
 * Please see Documentation/filesystems/sysfs.rst for more information.
 */

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/kobject.h>
#include <linux/mutex.h>
#include <linux/security.h>

#include "sysfs.h"

static int sysfs_do_create_link_sd(struct kernfs_node *parent,
                                   struct kobject *target_kobj,
                                   const char *name, int warn)
{
        struct kernfs_node *target_kn;
        struct kernfs_node *link;
        int err;

        if (WARN_ON(!name || !parent))
                return -EINVAL;

        /*
         * @target_kobj is not ours and may go away at any moment, so its
         * kernfs node is sampled and pinned under
         * sysfs_symlink_target_lock.  See sysfs_remove_dir() for details.
         */
        spin_lock(&sysfs_symlink_target_lock);
        target_kn = target_kobj->sd;
        if (target_kn)
                kernfs_get(target_kn);
        spin_unlock(&sysfs_symlink_target_lock);

        /* The target has no sysfs node (any more): nothing to link to. */
        if (!target_kn)
                return -ENOENT;

        link = kernfs_create_link(parent, name, target_kn);
        /* Drop the reference taken under the lock above. */
        kernfs_put(target_kn);

        if (!IS_ERR(link))
                return 0;

        /* Failure: optionally report a duplicate name, then pass the error on. */
        err = PTR_ERR(link);
        if (warn && err == -EEXIST)
                sysfs_warn_dup(parent, name);
        return err;
}

/**
 *        sysfs_create_link_sd - create symlink to a given object.
 *        @kn:                directory we're creating the link in.
 *        @target:        object we're pointing to.
 *        @name:                name of the symlink.
 */
int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
                         const char *name)
{
        return sysfs_do_create_link_sd(kn, target, name, 1);
}

static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
                                const char *name, int warn)
{
        /* Without a @kobj the link goes directly under the sysfs root node. */
        struct kernfs_node *dir = kobj ? kobj->sd : sysfs_root_kn;

        /* No directory node to create the link in. */
        if (dir == NULL)
                return -EFAULT;

        return sysfs_do_create_link_sd(dir, target, name, warn);
}

/**
 *        sysfs_create_link - create symlink between two objects.
 *        @kobj:        object whose directory we're creating the link in.
 *        @target:        object we're pointing to.
 *        @name:                name of the symlink.
 */
int sysfs_create_link(struct kobject *kobj, struct kobject *target,
                      const char *name)
{
        /* warn = 1; sysfs_create_link_nowarn() is the same call with warn = 0. */
        return sysfs_do_create_link(kobj, target, name, 1);
}
EXPORT_SYMBOL_GPL(sysfs_create_link);

/**
 *        sysfs_create_link_nowarn - create symlink between two objects.
 *        @kobj:        object whose directory we're creating the link in.
 *        @target:        object we're pointing to.
 *        @name:                name of the symlink.
 *
 *        This function does the same as sysfs_create_link(), but it
 *        doesn't warn if the link already exists.
 */
int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
                             const char *name)
{
        /* warn = 0: the only difference from sysfs_create_link(). */
        return sysfs_do_create_link(kobj, target, name, 0);
}
EXPORT_SYMBOL_GPL(sysfs_create_link_nowarn);

/**
 *        sysfs_delete_link - remove symlink in object's directory.
 *        @kobj:        object we're acting for.
 *        @targ:        object we're pointing to.
 *        @name:        name of the symlink to remove.
 *
 *        Unlike sysfs_remove_link(), sysfs_delete_link() has enough information
 *        to successfully delete symlinks in tagged directories.
 */
void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
                        const char *name)
{
        /* Stays NULL unless a namespace tag is picked up from the target below. */
        const void *ns = NULL;

        /*
         * We don't own @targ and it may be removed at any time.
         * Synchronize using sysfs_symlink_target_lock.  See
         * sysfs_remove_dir() for details.
         */
        spin_lock(&sysfs_symlink_target_lock);
        /* The tag is only taken when namespaces are enabled on @kobj's node. */
        if (targ->sd && kernfs_ns_enabled(kobj->sd))
                ns = targ->sd->ns;
        spin_unlock(&sysfs_symlink_target_lock);
        /* The removal itself runs after the lock has been dropped. */
        kernfs_remove_by_name_ns(kobj->sd, name, ns);
}

/**
 *        sysfs_remove_link - remove symlink in object's directory.
 *        @kobj:        object we're acting for.
 *        @name:        name of the symlink to remove.
 */
void sysfs_remove_link(struct kobject *kobj, const char *name)
{
        /* Without a @kobj the symlink is looked up under the sysfs root node. */
        struct kernfs_node *dir = kobj ? kobj->sd : sysfs_root_kn;

        kernfs_remove_by_name(dir, name);
}
EXPORT_SYMBOL_GPL(sysfs_remove_link);

/**
 *        sysfs_rename_link_ns - rename symlink in object's directory.
 *        @kobj:        object we're acting for.
 *        @targ:        object we're pointing to.
 *        @old:        previous name of the symlink.
 *        @new:        new name of the symlink.
 *        @new_ns: new namespace of the symlink.
 *
 *        A helper function for the common rename symlink idiom.
 */
int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ,
                         const char *old, const char *new, const void *new_ns)
{
        struct kernfs_node *parent, *kn = NULL;
        const void *old_ns = NULL;
        int result;

        if (!kobj)
                parent = sysfs_root_kn;
        else
                parent = kobj->sd;

        if (targ->sd)
                old_ns = targ->sd->ns;

        result = -ENOENT;
        kn = kernfs_find_and_get_ns(parent, old, old_ns);
        if (!kn)
                goto out;

        result = -EINVAL;
        if (kernfs_type(kn) != KERNFS_LINK)
                goto out;
        if (kn->symlink.target_kn->priv != targ)
                goto out;

        result = kernfs_rename_ns(kn, parent, new, new_ns);

out:
        kernfs_put(kn);
        return result;
}
EXPORT_SYMBOL_GPL(sysfs_rename_link_ns);




















































































































































































































    1 


    1 



















































































































































































































































































































































































































































































































































































    2 
    2 



















    2 










    2 





    2 
    2 







    2 

    2 



    1 



    1 







    2 



    2 











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 














    1 











    1 












    1 





































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
 *  applies to SOCK_STREAM sockets only
 *  offers an alternative communication option for TCP-protocol sockets
 *  applicable with RoCE-cards only
 *
 *  Initial restrictions:
 *    - support for alternate links postponed
 *
 *  Copyright IBM Corp. 2016, 2018
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *              based on prototype from Frank Blaschka
 */

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>
#include <linux/if_vlan.h>
#include <linux/rcupdate_wait.h>
#include <linux/ctype.h>
#include <linux/splice.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>
#include <asm/ioctls.h>

#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include "smc_netns.h"

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_ism.h"
#include "smc_pnet.h"
#include "smc_netlink.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"
#include "smc_stats.h"
#include "smc_tracepoint.h"
#include "smc_sysctl.h"
#include "smc_loopback.h"
#include "smc_inet.h"

/* Link group creation is serialized with one mutex per role. */
static DEFINE_MUTEX(smc_server_lgr_pending);        /* serialize link group
                                                 * creation on server
                                                 */
static DEFINE_MUTEX(smc_client_lgr_pending);        /* serialize link group
                                                 * creation on client
                                                 */

/* Workqueues: smc_tcp_ls_wq is private to this file, while smc_hs_wq and
 * smc_close_wq have external linkage.
 */
static struct workqueue_struct        *smc_tcp_ls_wq;        /* wq for tcp listen work */
struct workqueue_struct        *smc_hs_wq;        /* wq for handshake work */
struct workqueue_struct        *smc_close_wq;        /* wq for close work */

/* Forward declarations: smc_sk_init() passes these handlers to INIT_WORK(). */
static void smc_tcp_listen_work(struct work_struct *);
static void smc_connect_work(struct work_struct *);

/* Netlink dump handler reporting the handshake limitation setting of the
 * network namespace of the requesting socket.
 *
 * Emits a single SMC_NETLINK_DUMP_HS_LIMITATION message carrying
 * SMC_NLA_HS_LIMITATION_ENABLED; pos[0] of the dump context records that
 * the message has been written so that later invocations add nothing.
 *
 * Returns skb->len on success, -ENOMEM if the message header could not be
 * added and -EMSGSIZE if the attribute did not fit.
 */
int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct smc_nl_dmp_ctx *ctx = smc_nl_dmp_ctx(cb);
        void *msg_hdr;

        /* the only record was already emitted */
        if (ctx->pos[0])
                return skb->len;

        msg_hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
                              cb->nlh->nlmsg_seq, &smc_gen_nl_family,
                              NLM_F_MULTI, SMC_NETLINK_DUMP_HS_LIMITATION);
        if (!msg_hdr)
                return -ENOMEM;

        if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED,
                       sock_net(skb->sk)->smc.limit_smc_hs)) {
                /* undo the partially built message */
                genlmsg_cancel(skb, msg_hdr);
                return -EMSGSIZE;
        }

        genlmsg_end(skb, msg_hdr);
        ctx->pos[0] = 1;

        return skb->len;
}

int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
{
        sock_net(skb->sk)->smc.limit_smc_hs = true;
        return 0;
}

int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
{
        sock_net(skb->sk)->smc.limit_smc_hs = false;
        return 0;
}

static void smc_set_keepalive(struct sock *sk, int val)
{
        struct smc_sock *smc = smc_sk(sk);

        smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

/* Wrapper around the original syn_recv_sock handler saved in
 * smc->ori_af_ops, called for the listening clcsock @sk.
 *
 * The request is dropped (dst released, listen drop accounted, NULL
 * returned) when
 *  - the clcsock no longer has an SMC owner in its sk_user_data,
 *  - the clcsock accept backlog plus the number of queued SMC handshakes
 *    exceeds sk_max_ack_backlog, or
 *  - the accept queue of the SMC listen socket is full.
 * Otherwise the original handler creates the child socket, which is then
 * stripped of everything SMC specific before it is returned.
 */
static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk,
                                          struct sk_buff *skb,
                                          struct request_sock *req,
                                          struct dst_entry *dst,
                                          struct request_sock *req_unhash,
                                          bool *own_req)
{
        struct smc_sock *smc;
        struct sock *child;

        smc = smc_clcsock_user_data(sk);

        /* sk_user_data is reset to NULL when the callbacks are restored
         * (see smc_fback_restore_callbacks()). Without an owner there is
         * neither a backlog to check nor an original handler to call, so
         * drop the request instead of dereferencing a NULL pointer. This
         * matches the NULL check in smc_hs_congested().
         */
        if (!smc)
                goto drop;

        if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) >
                                sk->sk_max_ack_backlog)
                goto drop;

        if (sk_acceptq_is_full(&smc->sk)) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
                goto drop;
        }

        /* passthrough to original syn recv sock fct */
        child = smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash,
                                               own_req);
        /* child must not inherit smc or its ops */
        if (child) {
                rcu_assign_sk_user_data(child, NULL);

                /* v4-mapped sockets don't inherit parent ops. Don't restore. */
                if (inet_csk(child)->icsk_af_ops == inet_csk(sk)->icsk_af_ops)
                        inet_csk(child)->icsk_af_ops = smc->ori_af_ops;
        }
        return child;

drop:
        dst_release(dst);
        tcp_listendrop(sk);
        return NULL;
}

/* Report whether SMC handshakes for the clcsock @sk count as congested.
 *
 * Returns true when the clcsock has no SMC owner in its sk_user_data or
 * when the handshake workqueue smc_hs_wq is congested, false otherwise.
 */
static bool smc_hs_congested(const struct sock *sk)
{
        const struct smc_sock *owner = smc_clcsock_user_data(sk);

        /* the workqueue is only queried if an owner exists */
        return !owner || workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq);
}

/* Socket table referenced by smc_proto through .h.smc_hash; the rwlock is
 * statically initialized as unlocked.
 */
struct smc_hashinfo smc_v4_hashinfo = {
        .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

/* Socket table referenced by smc_proto6 through .h.smc_hash; the rwlock is
 * statically initialized as unlocked.
 */
struct smc_hashinfo smc_v6_hashinfo = {
        .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};

/* proto->hash callback: add @sk to the socket table of its protocol and
 * account one more socket in use for that protocol. Always returns 0.
 */
int smc_hash_sk(struct sock *sk)
{
        struct smc_hashinfo *hinfo = sk->sk_prot->h.smc_hash;

        /* the list is only modified under the write lock, BHs disabled */
        write_lock_bh(&hinfo->lock);
        sk_add_node(sk, &hinfo->ht);
        write_unlock_bh(&hinfo->lock);

        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

        return 0;
}

void smc_unhash_sk(struct sock *sk)
{
        struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

        write_lock_bh(&h->lock);
        if (sk_del_node_init(sk))
                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
        write_unlock_bh(&h->lock);
}

/* This will be called before user really release sock_lock. So do the
 * work which we didn't do because of user hold the sock_lock in the
 * BH context
 */
void smc_release_cb(struct sock *sk)
{
        struct smc_sock *smc = smc_sk(sk);

        if (smc->conn.tx_in_release_sock) {
                smc_tx_pending(&smc->conn);
                smc->conn.tx_in_release_sock = false;
        }
}

/* Protocol descriptor that smc_sock_alloc() uses for every protocol value
 * other than SMCPROTO_SMC6. Its sockets are kept in smc_v4_hashinfo.
 */
struct proto smc_proto = {
        /* identification and object layout */
        .name           = "SMC",
        .owner          = THIS_MODULE,
        .obj_size       = sizeof(struct smc_sock),
        .slab_flags     = SLAB_TYPESAFE_BY_RCU,

        /* socket table handling */
        .h.smc_hash     = &smc_v4_hashinfo,
        .hash           = smc_hash_sk,
        .unhash         = smc_unhash_sk,

        /* remaining callbacks */
        .keepalive      = smc_set_keepalive,
        .release_cb     = smc_release_cb,
};
EXPORT_SYMBOL_GPL(smc_proto);

/* Protocol descriptor that smc_sock_alloc() uses for SMCPROTO_SMC6. Its
 * sockets are kept in smc_v6_hashinfo; all callbacks are shared with
 * smc_proto.
 */
struct proto smc_proto6 = {
        /* identification and object layout */
        .name           = "SMC6",
        .owner          = THIS_MODULE,
        .obj_size       = sizeof(struct smc_sock),
        .slab_flags     = SLAB_TYPESAFE_BY_RCU,

        /* socket table handling */
        .h.smc_hash     = &smc_v6_hashinfo,
        .hash           = smc_hash_sk,
        .unhash         = smc_unhash_sk,

        /* remaining callbacks */
        .keepalive      = smc_set_keepalive,
        .release_cb     = smc_release_cb,
};
EXPORT_SYMBOL_GPL(smc_proto6);

/* Detach the clcsock from its SMC owner: clear sk_user_data and put the
 * saved state_change, data_ready, write_space and error_report callbacks
 * back in place, all under the clcsock's sk_callback_lock.
 */
static void smc_fback_restore_callbacks(struct smc_sock *smc)
{
        struct sock *csk = smc->clcsock->sk;

        write_lock_bh(&csk->sk_callback_lock);

        csk->sk_user_data = NULL;
        smc_clcsock_restore_cb(&csk->sk_state_change,
                               &smc->clcsk_state_change);
        smc_clcsock_restore_cb(&csk->sk_data_ready,
                               &smc->clcsk_data_ready);
        smc_clcsock_restore_cb(&csk->sk_write_space,
                               &smc->clcsk_write_space);
        smc_clcsock_restore_cb(&csk->sk_error_report,
                               &smc->clcsk_error_report);

        write_unlock_bh(&csk->sk_callback_lock);
}

static void smc_restore_fallback_changes(struct smc_sock *smc)
{
        if (smc->clcsock->file) { /* non-accepted sockets have no file yet */
                smc->clcsock->file->private_data = smc->sk.sk_socket;
                smc->clcsock->file = NULL;
                smc_fback_restore_callbacks(smc);
        }
}

/* Core of the release path for an SMC socket.
 *
 * smc_release() calls this with the sock lock of @smc held; the lock is
 * dropped and re-taken around smc_clcsock_release().
 *
 * Returns the result of smc_close_active() for a non-fallback socket, the
 * result of kernel_sock_shutdown() for a listening fallback socket, and 0
 * in all other cases.
 */
static int __smc_release(struct smc_sock *smc)
{
        struct sock *sk = &smc->sk;
        int rc = 0;

        if (!smc->use_fallback) {
                /* SMC mode: active close, then mark the sock dead and
                 * shut down in both directions
                 */
                rc = smc_close_active(smc);
                smc_sock_set_flag(sk, SOCK_DEAD);
                sk->sk_shutdown |= SHUTDOWN_MASK;
        } else {
                /* fallback mode: move the sock to SMC_CLOSED by hand */
                if (sk->sk_state != SMC_CLOSED) {
                        if (sk->sk_state != SMC_LISTEN &&
                            sk->sk_state != SMC_INIT)
                                sock_put(sk); /* passive closing */
                        if (sk->sk_state == SMC_LISTEN) {
                                /* wake up clcsock accept */
                                rc = kernel_sock_shutdown(smc->clcsock,
                                                          SHUT_RDWR);
                        }
                        sk->sk_state = SMC_CLOSED;
                        sk->sk_state_change(sk);
                }
                smc_restore_fallback_changes(smc);
        }

        /* remove the sock from its protocol's socket table */
        sk->sk_prot->unhash(sk);

        if (sk->sk_state == SMC_CLOSED) {
                if (smc->clcsock) {
                        /* the clcsock is released without the sock lock held */
                        release_sock(sk);
                        smc_clcsock_release(smc);
                        lock_sock(sk);
                }
                /* only non-fallback sockets free their connection here */
                if (!smc->use_fallback)
                        smc_conn_free(&smc->conn);
        }

        return rc;
}

/* Release an SMC socket and detach it from @sock.
 *
 * Returns 0 if @sock has no sk attached, otherwise the value returned by
 * __smc_release().
 */
int smc_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int old_state, rc = 0;

        if (!sk)
                goto out;

        sock_hold(sk); /* sock_put below */
        smc = smc_sk(sk);

        /* snapshot of the state taken before the sock lock is acquired */
        old_state = sk->sk_state;

        /* cleanup for a dangling non-blocking connect */
        if (smc->connect_nonblock && old_state == SMC_INIT)
                tcp_abort(smc->clcsock->sk, ECONNABORTED);

        /* connect work that was still pending and got cancelled will not
         * drop its reference itself
         */
        if (cancel_work_sync(&smc->connect_work))
                sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */

        if (sk->sk_state == SMC_LISTEN)
                /* smc_close_non_accepted() is called and acquires
                 * sock lock for child sockets again
                 */
                lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
        else
                lock_sock(sk);

        /* the socket went from SMC_INIT to SMC_ACTIVE after the snapshot
         * above was taken: abort the SMC connection
         */
        if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE &&
            !smc->use_fallback)
                smc_close_active_abort(smc);

        rc = __smc_release(smc);

        /* detach socket */
        sock_orphan(sk);
        sock->sk = NULL;
        release_sock(sk);

        sock_put(sk); /* sock_hold above */
        sock_put(sk); /* final sock_put */
out:
        return rc;
}

/* sk_destruct callback installed by smc_sk_init(). It only inspects the
 * socket state and the SOCK_DEAD flag and performs no action in either
 * case.
 */
static void smc_destruct(struct sock *sk)
{
        if (sk->sk_state != SMC_CLOSED || !sock_flag(sk, SOCK_DEAD))
                return;
}

/* Initialize the SMC specific state of a sock.
 * @net: namespace providing the wmem/rmem sysctl values and the handshake
 *       limitation setting
 * @sk: sock embedded in a struct smc_sock
 * @protocol: value stored in sk->sk_protocol
 */
void smc_sk_init(struct net *net, struct sock *sk, int protocol)
{
        struct smc_sock *smc = smc_sk(sk);

        sk->sk_state = SMC_INIT;
        sk->sk_destruct = smc_destruct;
        sk->sk_protocol = protocol;
        /* buffer sizes start at twice the per-namespace sysctl values */
        WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem));
        WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem));
        /* work items and the tx worker of the connection */
        INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
        INIT_WORK(&smc->connect_work, smc_connect_work);
        INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
        /* accept queue and locks */
        INIT_LIST_HEAD(&smc->accept_q);
        spin_lock_init(&smc->accept_q_lock);
        spin_lock_init(&smc->conn.send_lock);
        /* hash callback of the sock's protocol; smc_hash_sk() for
         * smc_proto and smc_proto6
         */
        sk->sk_prot->hash(sk);
        mutex_init(&smc->clcsock_release_lock);
        smc_init_saved_callbacks(smc);
        /* handshake limitation setting is copied from the namespace */
        smc->limit_smc_hs = net->smc.limit_smc_hs;
        smc->use_fallback = false; /* assume rdma capability first */
        smc->fallback_rsn = 0;
}

/* Allocate and initialize a PF_SMC sock for @sock in namespace @net.
 *
 * SMCPROTO_SMC6 selects smc_proto6, every other @protocol value selects
 * smc_proto. Returns the new sock, or NULL if the allocation failed.
 */
static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
                                   int protocol)
{
        struct proto *prot = &smc_proto;
        struct sock *sk;

        if (protocol == SMCPROTO_SMC6)
                prot = &smc_proto6;

        sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
        if (sk) {
                sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
                smc_sk_init(net, sk, protocol);
        }

        return sk;
}

/* Bind an SMC socket by binding its internal clcsock.
 *
 * The address checks replicate those of inet_bind(), to be safe wrt.
 * future changes. The reuse and reuseport settings of the SMC sock are
 * copied to the clcsock before it is bound.
 *
 * Returns -EINVAL if @addr_len is shorter than a struct sockaddr_in or the
 * socket is not in state SMC_INIT or has a non-blocking connect pending,
 * -EAFNOSUPPORT for an unsupported address family (including AF_UNSPEC
 * with an address other than INADDR_ANY), otherwise the result of
 * kernel_bind() on the clcsock.
 */
int smc_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
        struct sock *sk = sock->sk;
        struct smc_sock *smc = smc_sk(sk);
        int rc;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        switch (sin->sin_family) {
        case AF_INET:
        case AF_INET6:
                break;
        case AF_UNSPEC:
                /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is
                 * INADDR_ANY
                 */
                if (sin->sin_addr.s_addr != htonl(INADDR_ANY))
                        return -EAFNOSUPPORT;
                break;
        default:
                return -EAFNOSUPPORT;
        }

        lock_sock(sk);

        if (sk->sk_state != SMC_INIT || smc->connect_nonblock) {
                /* socket is already active */
                rc = -EINVAL;
        } else {
                smc->clcsock->sk->sk_reuse = sk->sk_reuse;
                smc->clcsock->sk->sk_reuseport = sk->sk_reuseport;
                rc = kernel_bind(smc->clcsock, uaddr, addr_len);
        }

        release_sock(sk);

        return rc;
}

/* Copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core).
 *
 * SK_FLAGS_SMC_TO_CLC is the sk_flags bit mask used for that direction;
 * smc_copy_sock_settings_to_clc() passes it to smc_copy_sock_settings().
 */

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
                             (1UL << SOCK_KEEPOPEN) | \
                             (1UL << SOCK_LINGER) | \
                             (1UL << SOCK_BROADCAST) | \
                             (1UL << SOCK_TIMESTAMP) | \
                             (1UL << SOCK_DBG) | \
                             (1UL << SOCK_RCVTSTAMP) | \
                             (1UL << SOCK_RCVTSTAMPNS) | \
                             (1UL << SOCK_LOCALROUTE) | \
                             (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
                             (1UL << SOCK_RXQ_OVFL) | \
                             (1UL << SOCK_WIFI_STATUS) | \
                             (1UL << SOCK_NOFCS) | \
                             (1UL << SOCK_FILTER_LOCKED) | \
                             (1UL << SOCK_TSTAMP_NEW))

/* Take over the buffer size settings of @osk into @nsk.
 *
 * sk_userlocks is always copied. A buffer size is only copied when the
 * matching lock bit (SOCK_SNDBUF_LOCK / SOCK_RCVBUF_LOCK) is set in @osk,
 * i.e. the value set by setsockopt() is used if there is one; otherwise
 * @nsk keeps the IPv4 or SMC sysctl value it already has.
 * @mask is not used by this function.
 */
static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk,
                                     unsigned long mask)
{
        const unsigned int locks = osk->sk_userlocks;

        nsk->sk_userlocks = locks;

        if (locks & SOCK_SNDBUF_LOCK)
                nsk->sk_sndbuf = osk->sk_sndbuf;

        if (locks & SOCK_RCVBUF_LOCK)
                nsk->sk_rcvbuf = osk->sk_rcvbuf;
}

/* Copy socket settings from @osk to @nsk.
 *
 * A fixed set of fields is copied unconditionally; of sk_flags only the
 * bits selected by @mask are taken from @osk, all other bits of @nsk are
 * preserved. Buffer sizes are handled by smc_adjust_sock_bufsizes().
 */
static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
                                   unsigned long mask)
{
        /* options we don't get control via setsockopt for */
        nsk->sk_type = osk->sk_type;

        nsk->sk_sndtimeo = osk->sk_sndtimeo;
        nsk->sk_rcvtimeo = osk->sk_rcvtimeo;

        nsk->sk_mark = READ_ONCE(osk->sk_mark);
        nsk->sk_priority = READ_ONCE(osk->sk_priority);

        nsk->sk_rcvlowat = osk->sk_rcvlowat;
        nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
        nsk->sk_err = osk->sk_err;

        /* replace the masked flag bits, keep the rest */
        nsk->sk_flags = (nsk->sk_flags & ~mask) | (osk->sk_flags & mask);

        smc_adjust_sock_bufsizes(nsk, osk, mask);
}

/* Copy settings and the SK_FLAGS_SMC_TO_CLC flags from the smc socket to
 * its clc socket.
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
        struct sock *clcsk = smc->clcsock->sk;
        struct sock *smcsk = &smc->sk;

        smc_copy_sock_settings(clcsk, smcsk, SK_FLAGS_SMC_TO_CLC);
}

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
                             (1UL << SOCK_KEEPOPEN) | \
                             (1UL << SOCK_LINGER) | \
                             (1UL << SOCK_DBG))
/* Copy settings from the clcsock back to the smc socket; of the socket
 * flags, only those listed in SK_FLAGS_CLC_TO_SMC are taken over.
 */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
        struct sock *smcsk = &smc->sk;
        struct sock *clcsk = smc->clcsock->sk;

        smc_copy_sock_settings(smcsk, clcsk, SK_FLAGS_CLC_TO_SMC);
}

/* Register a vzalloced send buffer on every active link of the link group.
 * Returns -EINVAL if the buffer is not flagged is_vm, otherwise 0 or the
 * first error reported by smcr_link_reg_buf().
 */
static int smcr_lgr_reg_sndbufs(struct smc_link *link,
                                struct smc_buf_desc *snd_desc)
{
        struct smc_link_group *lgr = link->lgr;
        int rc = 0;
        int idx;

        if (!snd_desc->is_vm)
                return -EINVAL;

        /* protect against parallel smcr_link_reg_buf() */
        down_write(&lgr->llc_conf_mutex);
        for (idx = 0; idx < SMC_LINKS_PER_LGR_MAX; idx++) {
                struct smc_link *lnk = &lgr->lnk[idx];

                if (smc_link_active(lnk)) {
                        rc = smcr_link_reg_buf(lnk, snd_desc);
                        if (rc)
                                break;
                }
        }
        up_write(&lgr->llc_conf_mutex);
        return rc;
}

/* Register the new rmb on all active links of the link group and confirm
 * its rkey with the peer.
 *
 * @link:     link passed to smc_llc_do_confirm_rkey() for the rkey exchange
 * @rmb_desc: receive buffer descriptor to register
 *
 * An LLC flow of type SMC_LLC_FLOW_RKEY is started first. If the buffer is
 * already registered on every active link, llc_conf_mutex is only taken for
 * reading (fast path). Otherwise it is taken for writing and the buffer is
 * registered on each active link (slow path).
 *
 * Returns 0 on success, the error of smc_llc_flow_initiate() or
 * smcr_link_reg_buf(), or -EFAULT if the rkey confirmation fails.
 */
static int smcr_lgr_reg_rmbs(struct smc_link *link,
                             struct smc_buf_desc *rmb_desc)
{
        struct smc_link_group *lgr = link->lgr;
        bool do_slow = false;
        int i, rc = 0;

        rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
        if (rc)
                return rc;

        down_read(&lgr->llc_conf_mutex);
        for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
                if (!smc_link_active(&lgr->lnk[i]))
                        continue;
                /* Test the link being iterated. Testing only the sending
                 * link would let a buffer that is still unregistered on
                 * another active link take the fast path.
                 */
                if (!rmb_desc->is_reg_mr[lgr->lnk[i].link_idx]) {
                        up_read(&lgr->llc_conf_mutex);
                        goto slow_path;
                }
        }
        /* mr register already */
        goto fast_path;
slow_path:
        do_slow = true;
        /* protect against parallel smc_llc_cli_rkey_exchange() and
         * parallel smcr_link_reg_buf()
         */
        down_write(&lgr->llc_conf_mutex);
        for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
                if (!smc_link_active(&lgr->lnk[i]))
                        continue;
                rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc);
                if (rc)
                        goto out;
        }
fast_path:
        /* exchange confirm_rkey msg with peer */
        rc = smc_llc_do_confirm_rkey(link, rmb_desc);
        if (rc) {
                rc = -EFAULT;
                goto out;
        }
        rmb_desc->is_conf_rkey = true;
out:
        /* release the lock in the mode it is currently held in */
        if (do_slow)
                up_write(&lgr->llc_conf_mutex);
        else
                up_read(&lgr->llc_conf_mutex);
        smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
        return rc;
}

/* Client side confirmation of the first link of a link group.
 *
 * Waits for the server's CONFIRM LINK request, evaluates it, calls
 * smc_ib_modify_qp_rts() for the link, registers the connection's send
 * buffer (only if it is flagged is_vm) and rmb on the link, sends the
 * CONFIRM LINK response and marks the link active. If the link group allows
 * more than one link, it then waits for an ADD LINK request and hands it to
 * smc_llc_cli_add_link().
 *
 * Returns 0 on success, otherwise an SMC_CLC_DECL_* reason code or the
 * return value of smc_clc_wait_msg().
 */
static int smcr_clnt_conf_first_link(struct smc_sock *smc)
{
        struct smc_link *link = smc->conn.lnk;
        struct smc_llc_qentry *qentry;
        int rc;

        /* Receive CONFIRM LINK request from server over RoCE fabric.
         * Increasing the client's timeout by twice as much as the server's
         * timeout by default can temporarily avoid decline messages of
         * both sides crossing or colliding
         */
        qentry = smc_llc_wait(link->lgr, NULL, 2 * SMC_LLC_WAIT_TIME,
                              SMC_LLC_CONFIRM_LINK);
        if (!qentry) {
                struct smc_clc_msg_decline dclc;

                /* no CONFIRM LINK arrived: check for a CLC DECLINE instead;
                 * -EAGAIN from the wait is reported as a client timeout
                 */
                rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
                                      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
                return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
        }
        smc_llc_save_peer_uid(qentry);
        rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ);
        /* the queue entry is removed from the flow before rc is evaluated */
        smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
        if (rc)
                return SMC_CLC_DECL_RMBE_EC;

        rc = smc_ib_modify_qp_rts(link);
        if (rc)
                return SMC_CLC_DECL_ERR_RDYLNK;

        smc_wr_remember_qp_attr(link);

        /* reg the sndbuf if it was vzalloced */
        if (smc->conn.sndbuf_desc->is_vm) {
                if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc))
                        return SMC_CLC_DECL_ERR_REGBUF;
        }

        /* reg the rmb */
        if (smcr_link_reg_buf(link, smc->conn.rmb_desc))
                return SMC_CLC_DECL_ERR_REGBUF;

        /* confirm_rkey is implicit on 1st contact */
        smc->conn.rmb_desc->is_conf_rkey = true;

        /* send CONFIRM LINK response over RoCE fabric */
        rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
        if (rc < 0)
                return SMC_CLC_DECL_TIMEOUT_CL;

        smc_llc_link_active(link);
        smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);

        if (link->lgr->max_links > 1) {
                /* optional 2nd link, receive ADD LINK request from server */
                qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
                                      SMC_LLC_ADD_LINK);
                if (!qentry) {
                        struct smc_clc_msg_decline dclc;

                        rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
                                              SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
                        if (rc == -EAGAIN)
                                rc = 0; /* no DECLINE received, go with one link */
                        return rc;
                }
                smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl);
                /* NOTE(review): the result of smc_llc_cli_add_link() is not
                 * checked; 0 is returned regardless - confirm this is intended
                 */
                smc_llc_cli_add_link(link, qentry);
        }
        return 0;
}

/* Return true if all SMC_MAX_HOSTNAME_LEN bytes of hostname pass isascii(). */
static bool smc_isascii(char *hostname)
{
        char *end = hostname + SMC_MAX_HOSTNAME_LEN;
        char *pos;

        for (pos = hostname; pos < end; pos++) {
                if (!isascii(*pos))
                        return false;
        }
        return true;
}

static void smc_conn_save_peer_info_fce(struct smc_sock *smc,
                                        struct smc_clc_msg_accept_confirm *clc)
{
        struct smc_clc_first_contact_ext *fce;
        int clc_v2_len;

        if (clc->hdr.version == SMC_V1 ||
            !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK))
                return;

        if (smc->conn.lgr->is_smcd) {
                memcpy(smc->conn.lgr->negotiated_eid, clc->d1.eid,
                       SMC_MAX_EID_LEN);
                clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm, d1);
        } else {
                memcpy(smc->conn.lgr->negotiated_eid, clc->r1.eid,
                       SMC_MAX_EID_LEN);
                clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm, r1);
        }
        fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc) + clc_v2_len);
        smc->conn.lgr->peer_os = fce->os_type;
        smc->conn.lgr->peer_smc_release = fce->release;
        if (smc_isascii(fce->hostname))
                memcpy(smc->conn.lgr->peer_hostname, fce->hostname,
                       SMC_MAX_HOSTNAME_LEN);
}

static void smcr_conn_save_peer_info(struct smc_sock *smc,
                                     struct smc_clc_msg_accept_confirm *clc)
{
        int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size);

        smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx;
        smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token);
        smc->conn.peer_rmbe_size = bufsize;
        atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
        smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
}

static void smcd_conn_save_peer_info(struct smc_sock *smc,
                                     struct smc_clc_msg_accept_confirm *clc)
{
        int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size);

        smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx;
        smc->conn.peer_token = ntohll(clc->d0.token);
        /* msg header takes up space in the buffer */
        smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
        atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
        smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
}

/* Save the peer info from an accept/confirm message: the SMC-D or SMC-R
 * specific part first, then the first contact extension data.
 */
static void smc_conn_save_peer_info(struct smc_sock *smc,
                                    struct smc_clc_msg_accept_confirm *clc)
{
        if (!smc->conn.lgr->is_smcd) {
                smcr_conn_save_peer_info(smc, clc);
        } else {
                smcd_conn_save_peer_info(smc, clc);
        }
        smc_conn_save_peer_info_fce(smc, clc);
}

/* Store the peer's link attributes: QP number, PSN and MTU are taken from
 * the r0 part of the accept/confirm message, GID and MAC from ini.
 */
static void smc_link_save_peer_info(struct smc_link *link,
                                    struct smc_clc_msg_accept_confirm *clc,
                                    struct smc_init_info *ini)
{
        /* values carried in the CLC message */
        link->peer_mtu = clc->r0.qp_mtu;
        link->peer_psn = ntoh24(clc->r0.psn);
        link->peer_qpn = ntoh24(clc->r0.qpn);

        /* values collected in the init info */
        memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac));
        memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE);
}

/* Count smc->fallback_rsn in fback_arr. The array is scanned from the
 * start; the first slot that already holds this reason code, or the first
 * slot whose code is still 0 (which then takes the code), gets its counter
 * incremented. If neither is found within SMC_MAX_FBACK_RSN_CNT slots,
 * nothing is counted.
 */
static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc,
                                       struct smc_stats_fback *fback_arr)
{
        int idx;

        for (idx = 0; idx < SMC_MAX_FBACK_RSN_CNT; idx++) {
                struct smc_stats_fback *slot = &fback_arr[idx];

                if (slot->fback_code != smc->fallback_rsn) {
                        /* slot belongs to another reason code */
                        if (slot->fback_code)
                                continue;
                        /* unused slot: claim it for this reason code */
                        slot->fback_code = smc->fallback_rsn;
                }
                slot->count++;
                return;
        }
}

/* Account a fallback in the per-netns statistics, as a server fallback when
 * smc->listen_smc is set and as a client fallback otherwise. The update is
 * done under net->smc.mutex_fback_rsn.
 */
static void smc_stat_fallback(struct smc_sock *smc)
{
        struct net *net = sock_net(&smc->sk);

        mutex_lock(&net->smc.mutex_fback_rsn);
        if (!smc->listen_smc) {
                net->smc.fback_rsn->clnt_fback_cnt++;
                smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt);
        } else {
                net->smc.fback_rsn->srv_fback_cnt++;
                smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv);
        }
        mutex_unlock(&net->smc.mutex_fback_rsn);
}

/* Wake up sleepers on the smc socket's wait queue. A NULL key wakes all
 * interruptible waiters; otherwise the poll flags derived from key select
 * the kind of wakeup. Must be called under rcu read lock.
 */
static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key)
{
        struct socket_wq *wq = rcu_dereference(smc->sk.sk_wq);
        __poll_t flags;

        if (!skwq_has_sleeper(wq))
                return;

        /* wake up smc sk->sk_wq */
        if (!key) {
                /* sk_state_change */
                wake_up_interruptible_all(&wq->wait);
                return;
        }

        flags = key_to_poll(key);
        if (flags & (EPOLLIN | EPOLLOUT)) {
                /* sk_data_ready or sk_write_space */
                wake_up_interruptible_sync_poll(&wq->wait, flags);
        } else if (flags & EPOLLERR) {
                /* sk_error_report */
                wake_up_interruptible_poll(&wq->wait, flags);
        }
}

/* Wait queue callback: record in the enclosing smc_mark_woken that a wakeup
 * happened and which key it carried. Always returns 0.
 */
static int smc_fback_mark_woken(wait_queue_entry_t *wait,
                                unsigned int mode, int sync, void *key)
{
        struct smc_mark_woken *mark;

        mark = container_of(wait, struct smc_mark_woken, wait_entry);
        mark->key = key;
        mark->woken = true;
        return 0;
}

/* Run clcsock_callback on the clcsock while a temporary marker entry sits
 * on the clcsock's wait queue. If the callback woke that entry, the wakeup
 * is forwarded to the smc socket's wait queue with the same key. Nothing is
 * done when the clcsock has no sk_wq.
 */
static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk,
                                     void (*clcsock_callback)(struct sock *sk))
{
        struct smc_mark_woken mark = { .woken = false };
        struct socket_wq *wq;

        init_waitqueue_func_entry(&mark.wait_entry,
                                  smc_fback_mark_woken);
        rcu_read_lock();
        wq = rcu_dereference(clcsk->sk_wq);
        if (wq) {
                add_wait_queue(sk_sleep(clcsk), &mark.wait_entry);
                clcsock_callback(clcsk);
                remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry);

                if (mark.woken)
                        smc_fback_wakeup_waitqueue(smc, mark.key);
        }
        rcu_read_unlock();
}

/* sk_state_change replacement installed on the clcsock after fallback: runs
 * the saved clcsock callback and forwards the wakeup to the smc socket.
 */
static void smc_fback_state_change(struct sock *clcsk)
{
        struct smc_sock *owner;

        read_lock_bh(&clcsk->sk_callback_lock);
        owner = smc_clcsock_user_data(clcsk);
        if (owner) {
                smc_fback_forward_wakeup(owner, clcsk,
                                         owner->clcsk_state_change);
        }
        read_unlock_bh(&clcsk->sk_callback_lock);
}

/* sk_data_ready replacement installed on the clcsock after fallback: runs
 * the saved clcsock callback and forwards the wakeup to the smc socket.
 */
static void smc_fback_data_ready(struct sock *clcsk)
{
        struct smc_sock *owner;

        read_lock_bh(&clcsk->sk_callback_lock);
        owner = smc_clcsock_user_data(clcsk);
        if (owner) {
                smc_fback_forward_wakeup(owner, clcsk,
                                         owner->clcsk_data_ready);
        }
        read_unlock_bh(&clcsk->sk_callback_lock);
}

/* sk_write_space replacement installed on the clcsock after fallback: runs
 * the saved clcsock callback and forwards the wakeup to the smc socket.
 */
static void smc_fback_write_space(struct sock *clcsk)
{
        struct smc_sock *owner;

        read_lock_bh(&clcsk->sk_callback_lock);
        owner = smc_clcsock_user_data(clcsk);
        if (owner) {
                smc_fback_forward_wakeup(owner, clcsk,
                                         owner->clcsk_write_space);
        }
        read_unlock_bh(&clcsk->sk_callback_lock);
}

/* sk_error_report replacement installed on the clcsock after fallback: runs
 * the saved clcsock callback and forwards the wakeup to the smc socket.
 */
static void smc_fback_error_report(struct sock *clcsk)
{
        struct smc_sock *owner;

        read_lock_bh(&clcsk->sk_callback_lock);
        owner = smc_clcsock_user_data(clcsk);
        if (owner) {
                smc_fback_forward_wakeup(owner, clcsk,
                                         owner->clcsk_error_report);
        }
        read_unlock_bh(&clcsk->sk_callback_lock);
}

static void smc_fback_replace_callbacks(struct smc_sock *smc)
{
        struct sock *clcsk = smc->clcsock->sk;

        write_lock_bh(&clcsk->sk_callback_lock);
        clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);

        smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change,
                               &smc->clcsk_state_change);
        smc_clcsock_replace_cb(&clcsk->sk_data_ready, smc_fback_data_ready,
                               &smc->clcsk_data_ready);
        smc_clcsock_replace_cb(&clcsk->sk_write_space, smc_fback_write_space,
                               &smc->clcsk_write_space);
        smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report,
                               &smc->clcsk_error_report);

        write_unlock_bh(&clcsk->sk_callback_lock);
}

/* Switch the smc socket to fallback mode, i.e. mark it as using the clcsock.
 *
 * @smc:         socket to switch
 * @reason_code: stored in smc->fallback_rsn and used for statistics/tracing
 *
 * Runs under smc->clcsock_release_lock. If the smc socket has a struct file
 * attached, that file and the fasync list are handed over to the clcsock
 * and the clcsock callbacks are replaced so that wakeups are forwarded.
 *
 * Returns 0 on success or -EBADF if there is no clcsock.
 */
static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code)
{
        int rc = 0;

        mutex_lock(&smc->clcsock_release_lock);
        if (!smc->clcsock) {
                rc = -EBADF;
                goto out;
        }

        smc->use_fallback = true;
        smc->fallback_rsn = reason_code;
        smc_stat_fallback(smc);
        trace_smc_switch_to_fallback(smc, reason_code);
        if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
                /* let the file of the smc socket refer to the clcsock */
                smc->clcsock->file = smc->sk.sk_socket->file;
                smc->clcsock->file->private_data = smc->clcsock;
                /* move the fasync list over to the clcsock */
                smc->clcsock->wq.fasync_list =
                        smc->sk.sk_socket->wq.fasync_list;
                smc->sk.sk_socket->wq.fasync_list = NULL;

                /* There might be some wait entries remaining
                 * in smc sk->sk_wq and they should be woken up
                 * as clcsock's wait queue is woken up.
                 */
                smc_fback_replace_callbacks(smc);
        }
out:
        mutex_unlock(&smc->clcsock_release_lock);
        return rc;
}

/* Fall back during connect. On success the socket settings are copied to
 * the clcsock and an SMC_INIT socket becomes SMC_ACTIVE. If the switch
 * fails, the client handshake error counter is bumped, the reference of an
 * SMC_INIT socket is dropped and the error is returned.
 */
static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
{
        struct net *net = sock_net(&smc->sk);
        int rc = smc_switch_to_fallback(smc, reason_code);

        if (!rc) {
                smc_copy_sock_settings_to_clc(smc);
                smc->connect_nonblock = 0;
                if (smc->sk.sk_state == SMC_INIT)
                        smc->sk.sk_state = SMC_ACTIVE;
                return 0;
        }

        /* fallback fails */
        this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
        if (smc->sk.sk_state == SMC_INIT)
                sock_put(&smc->sk); /* passive closing */
        return rc;
}

/* Decline and fall back during connect. A negative reason_code is an error
 * for which no fallback is possible. Unless the peer declined itself
 * (SMC_CLC_DECL_PEERDECL), a CLC decline is sent before falling back.
 */
static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code,
                                        u8 version)
{
        struct net *net = sock_net(&smc->sk);
        int rc;

        if (reason_code < 0) {
                /* error, fallback is not possible */
                rc = reason_code;
                goto hshake_err;
        }
        if (reason_code != SMC_CLC_DECL_PEERDECL) {
                rc = smc_clc_send_decline(smc, reason_code, version);
                if (rc < 0)
                        goto hshake_err;
        }
        return smc_connect_fallback(smc, reason_code);

hshake_err:
        this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
        if (smc->sk.sk_state == SMC_INIT)
                sock_put(&smc->sk); /* passive closing */
        return rc;
}

/* Abort a connection setup: free the connection and, if this was a local
 * first contact and the connection had a valid link group before it was
 * freed, clean up that link group early.
 */
static void smc_conn_abort(struct smc_sock *smc, int local_first)
{
        struct smc_connection *conn = &smc->conn;
        /* remember link group and validity before the connection is freed */
        struct smc_link_group *lgr = conn->lgr;
        bool lgr_valid = smc_conn_lgr_valid(conn);

        smc_conn_free(conn);
        if (!local_first || !lgr_valid)
                return;
        smc_lgr_cleanup_early(lgr);
}

/* Check if there is a rdma device available for this connection; called for
 * connect and listen. With ini->check_smcrv2 set the SMC-Rv2 device
 * (ini->smcrv2.ib_dev_v2) is checked, otherwise ini->ib_dev.
 * Returns 0 if the device was found, else SMC_CLC_DECL_NOSMCRDEV.
 */
static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
{
        bool found;

        /* PNET table look up: search active ib_device and port
         * within same PNETID that also contains the ethernet device
         * used for the internal TCP socket
         */
        smc_pnet_find_roce_resource(smc->clcsock->sk, ini);

        if (ini->check_smcrv2)
                found = !!ini->smcrv2.ib_dev_v2;
        else
                found = !!ini->ib_dev;

        return found ? 0 : SMC_CLC_DECL_NOSMCRDEV;
}

/* Check if there is an ISM device available for this connection; called for
 * connect and listen. On success the device sits in ini->ism_dev[0] and its
 * CHID is stored in ini->ism_chid[0].
 * Returns 0 if a device was found, else SMC_CLC_DECL_NOSMCDDEV.
 */
static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
{
        /* Find ISM device with same PNETID as connecting interface  */
        smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
        if (!ini->ism_dev[0])
                return SMC_CLC_DECL_NOSMCDDEV;

        ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]);
        return 0;
}

/* Is chid unique among the ism devices determined so far? Entries below cnt
 * are compared; entry 0 is skipped when ini->ism_dev[0] is not set.
 */
static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini,
                                           int cnt)
{
        int idx = ini->ism_dev[0] ? 0 : 1;

        while (idx < cnt) {
                if (ini->ism_chid[idx] == chid)
                        return false;
                idx++;
        }
        return true;
}

/* determine possible V2 ISM devices (either without PNETID or with PNETID plus
 * PNETID matching net_device)
 *
 * Walks smcd_dev_list under its mutex and stores every eligible device and
 * its CHID in ini->ism_dev[]/ini->ism_chid[], starting at index 1 (index 0
 * is the slot filled by smc_find_ism_device()). The number of stored devices
 * is written to ini->ism_offered_cnt.
 *
 * Returns 0 if at least one device was stored or SMC-D was already indicated
 * for V1, otherwise SMC_CLC_DECL_NOSMCDDEV.
 */
static int smc_find_ism_v2_device_clnt(struct smc_sock *smc,
                                       struct smc_init_info *ini)
{
        int rc = SMC_CLC_DECL_NOSMCDDEV;
        struct smcd_dev *smcd;
        /* i: next free slot in ini->ism_dev[]; entry: running GID-CHID entry
         * count compared against SMCD_CLC_MAX_V2_GID_ENTRIES
         */
        int i = 1, entry = 1;
        bool is_emulated;
        u16 chid;

        if (smcd_indicated(ini->smc_type_v1))
                rc = 0;                /* already initialized for V1 */
        mutex_lock(&smcd_dev_list.mutex);
        list_for_each_entry(smcd, &smcd_dev_list.list, list) {
                /* skip devices going away and the one already in slot 0 */
                if (smcd->going_away || smcd == ini->ism_dev[0])
                        continue;
                chid = smc_ism_get_chid(smcd);
                if (!smc_find_ism_v2_is_unique_chid(chid, ini, i))
                        continue;
                is_emulated = __smc_ism_is_emulated(chid);
                if (!smc_pnet_is_pnetid_set(smcd->pnetid) ||
                    smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) {
                        if (is_emulated && entry == SMCD_CLC_MAX_V2_GID_ENTRIES)
                                /* It's the last GID-CHID entry left in CLC
                                 * Proposal SMC-Dv2 extension, but an Emulated-
                                 * ISM device will take two entries. So give
                                 * up it and try the next potential ISM device.
                                 */
                                continue;
                        ini->ism_dev[i] = smcd;
                        ini->ism_chid[i] = chid;
                        ini->is_smcd = true;
                        rc = 0;
                        i++;
                        /* an emulated device counts as two entries */
                        entry = is_emulated ? entry + 2 : entry + 1;
                        if (entry > SMCD_CLC_MAX_V2_GID_ENTRIES)
                                break;
                }
        }
        mutex_unlock(&smcd_dev_list.mutex);
        ini->ism_offered_cnt = i - 1;
        /* neither a V1 device (slot 0) nor any V2 device (slot 1) found */
        if (!ini->ism_dev[0] && !ini->ism_dev[1])
                ini->smcd_version = 0;

        return rc;
}

/* Check for VLAN ID and register it on ISM device just for CLC handshake.
 * Without a VLAN ID nothing is done. Returns 0, or SMC_CLC_DECL_ISMVLANERR
 * if smc_ism_get_vlan() fails.
 */
static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
                                      struct smc_init_info *ini)
{
        if (!ini->vlan_id)
                return 0;

        if (smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id))
                return SMC_CLC_DECL_ISMVLANERR;

        return 0;
}

/* Determine which SMC variants can be offered in the CLC proposal.
 *
 * For each of SMC-D V1, SMC-R V1, SMC-D V2 and SMC-R V2 the corresponding
 * version bit in ini->smcd_version / ini->smcr_version is cleared when the
 * variant was not requested or no usable device is found. The resulting
 * types are stored in ini->smc_type_v1 and ini->smc_type_v2.
 *
 * Returns 0, or SMC_CLC_DECL_NOSMCDEV if both types end up as SMC_TYPE_N.
 */
static int smc_find_proposal_devices(struct smc_sock *smc,
                                     struct smc_init_info *ini)
{
        int rc = 0;

        /* check if there is an ism device available */
        /* the || chain short-circuits: the device lookup and the VLAN setup
         * only run while all earlier checks have passed
         */
        if (!(ini->smcd_version & SMC_V1) ||
            smc_find_ism_device(smc, ini) ||
            smc_connect_ism_vlan_setup(smc, ini))
                ini->smcd_version &= ~SMC_V1;
        /* else ISM V1 is supported for this connection */

        /* check if there is an rdma device available */
        if (!(ini->smcr_version & SMC_V1) ||
            smc_find_rdma_device(smc, ini))
                ini->smcr_version &= ~SMC_V1;
        /* else RDMA is supported for this connection */

        ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1,
                                              ini->smcr_version & SMC_V1);

        /* check if there is an ism v2 device available */
        if (!(ini->smcd_version & SMC_V2) ||
            !smc_ism_is_v2_capable() ||
            smc_find_ism_v2_device_clnt(smc, ini))
                ini->smcd_version &= ~SMC_V2;

        /* check if there is an rdma v2 device available */
        /* check_smcrv2 makes smc_find_rdma_device() test ib_dev_v2; it is
         * reset right after the lookup
         */
        ini->check_smcrv2 = true;
        ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr;
        if (!(ini->smcr_version & SMC_V2) ||
            smc->clcsock->sk->sk_family != AF_INET ||
            !smc_clc_ueid_count() ||
            smc_find_rdma_device(smc, ini))
                ini->smcr_version &= ~SMC_V2;
        ini->check_smcrv2 = false;

        ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2,
                                              ini->smcr_version & SMC_V2);

        /* if neither ISM nor RDMA are supported, fallback */
        if (ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N)
                rc = SMC_CLC_DECL_NOSMCDEV;

        return rc;
}

/* Cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
 * used, the VLAN ID will be registered again during the connection setup.
 * Nothing is done unless SMC-D is indicated for V1 and a VLAN ID is set.
 * Returns 0, or SMC_CLC_DECL_CNFERR if smc_ism_put_vlan() fails.
 */
static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc,
                                        struct smc_init_info *ini)
{
        if (!smcd_indicated(ini->smc_type_v1))
                return 0;
        if (!ini->vlan_id)
                return 0;

        if (smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id))
                return SMC_CLC_DECL_CNFERR;

        return 0;
}

/* Length passed to smc_clc_wait_msg() when waiting for the CLC accept
 * message: the accept/confirm message itself plus the v2.x first contact
 * extension plus the message trailer.
 */
#define SMC_CLC_MAX_ACCEPT_LEN \
        (sizeof(struct smc_clc_msg_accept_confirm) + \
         sizeof(struct smc_clc_first_contact_ext_v2x) + \
         sizeof(struct smc_clc_msg_trail))

/* CLC handshake during connect: send the proposal and, if that worked, wait
 * for the peer's accept message, which is stored in aclc. Returns the error
 * of smc_clc_send_proposal() or the result of smc_clc_wait_msg().
 */
static int smc_connect_clc(struct smc_sock *smc,
                           struct smc_clc_msg_accept_confirm *aclc,
                           struct smc_init_info *ini)
{
        /* do inband token exchange */
        int rc = smc_clc_send_proposal(smc, ini);

        if (rc)
                return rc;

        /* receive SMC Accept CLC message */
        rc = smc_clc_wait_msg(smc, aclc, SMC_CLC_MAX_ACCEPT_LEN,
                              SMC_CLC_ACCEPT, CLC_WAIT_TIME);
        return rc;
}

/* Fill gidlist with the known GID and, if smc_pnet_find_alt_roce() finds an
 * alternate SMC-Rv2 device for the link group, with that device's GID as
 * well. If the temporary init info cannot be allocated, the list holds the
 * known GID only.
 */
void smc_fill_gid_list(struct smc_link_group *lgr,
                       struct smc_gidlist *gidlist,
                       struct smc_ib_device *known_dev, u8 *known_gid)
{
        struct smc_init_info *alt_ini;

        memset(gidlist, 0, sizeof(*gidlist));
        memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE);

        alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL);
        if (!alt_ini)
                return;

        /* look for an alternate device using the link group's parameters */
        alt_ini->vlan_id = lgr->vlan_id;
        alt_ini->check_smcrv2 = true;
        alt_ini->smcrv2.saddr = lgr->saddr;
        smc_pnet_find_alt_roce(lgr, alt_ini, known_dev);

        if (alt_ini->smcrv2.ib_dev_v2) {
                memcpy(gidlist->list[gidlist->len++],
                       alt_ini->smcrv2.ib_gid_v2, SMC_GID_SIZE);
        }

        kfree(alt_ini);
}

/* Prepare the SMC-Rv2 specific init info from the accept message. Only
 * relevant when the peer signalled first contact and the message is not V1.
 * Determines the next hop MAC and whether a gateway is used, stores the
 * peer's release number and validates the v2.x features.
 * Returns 0 or an SMC_CLC_DECL_* / validation error code.
 */
static int smc_connect_rdma_v2_prepare(struct smc_sock *smc,
                                       struct smc_clc_msg_accept_confirm *aclc,
                                       struct smc_init_info *ini)
{
        struct smc_clc_first_contact_ext *fce =
                smc_get_clc_first_contact_ext(aclc, false);
        struct net *net = sock_net(&smc->sk);

        if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1)
                return 0;

        if (!fce->v2_direct) {
                /* peer claims to be reached indirectly: look up the route */
                if (smc_ib_find_route(net, smc->clcsock->sk->sk_rcv_saddr,
                                      smc_ib_gid_to_ipv4(aclc->r0.lcl.gid),
                                      ini->smcrv2.nexthop_mac,
                                      &ini->smcrv2.uses_gateway))
                        return SMC_CLC_DECL_NOROUTE;
                /* mismatch: peer claims indirect, but it is direct */
                if (!ini->smcrv2.uses_gateway)
                        return SMC_CLC_DECL_NOINDIRECT;
        } else {
                /* direct: the peer's MAC is the next hop */
                memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN);
                ini->smcrv2.uses_gateway = false;
        }

        ini->release_nr = fce->release;
        return smc_clc_clnt_v2x_features_validate(fce, ini);
}

/* setup for RDMA connection of client
 *
 * @smc:  connecting smc socket
 * @aclc: CLC accept message received from the server
 * @ini:  init info; peer data from @aclc is stored in it
 *
 * Creates the connection (under smc_client_lgr_pending), selects the link,
 * creates and registers the buffers, sends the CLC confirm and, on local
 * first contact, runs the first-link confirmation. On success the socket
 * settings are copied to the clcsock and an SMC_INIT socket becomes
 * SMC_ACTIVE.
 *
 * Returns 0 on success, otherwise a reason code; after the connection was
 * created, failures go through smc_conn_abort().
 */
static int smc_connect_rdma(struct smc_sock *smc,
                            struct smc_clc_msg_accept_confirm *aclc,
                            struct smc_init_info *ini)
{
        int i, reason_code = 0;
        struct smc_link *link;
        u8 *eid = NULL;

        /* take over the peer's data from the accept message */
        ini->is_smcd = false;
        ini->ib_clcqpn = ntoh24(aclc->r0.qpn);
        ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
        memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN);
        memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE);
        memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN);
        ini->max_conns = SMC_CONN_PER_LGR_MAX;
        ini->max_links = SMC_LINKS_ADD_LNK_MAX;

        reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini);
        if (reason_code)
                return reason_code;

        /* held until the setup has succeeded or was aborted */
        mutex_lock(&smc_client_lgr_pending);
        reason_code = smc_conn_create(smc, ini);
        if (reason_code) {
                mutex_unlock(&smc_client_lgr_pending);
                return reason_code;
        }

        smc_conn_save_peer_info(smc, aclc);

        if (ini->first_contact_local) {
                link = smc->conn.lnk;
        } else {
                /* set link that was assigned by server */
                link = NULL;
                for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
                        struct smc_link *l = &smc->conn.lgr->lnk[i];

                        /* match on peer QP number and GID; the peer MAC is
                         * only compared for V1 messages
                         */
                        if (l->peer_qpn == ntoh24(aclc->r0.qpn) &&
                            !memcmp(l->peer_gid, &aclc->r0.lcl.gid,
                                    SMC_GID_SIZE) &&
                            (aclc->hdr.version > SMC_V1 ||
                             !memcmp(l->peer_mac, &aclc->r0.lcl.mac,
                                     sizeof(l->peer_mac)))) {
                                link = l;
                                break;
                        }
                }
                if (!link) {
                        reason_code = SMC_CLC_DECL_NOSRVLINK;
                        goto connect_abort;
                }
                smc_switch_link_and_count(&smc->conn, link);
        }

        /* create send buffer and rmb */
        if (smc_buf_create(smc, false)) {
                reason_code = SMC_CLC_DECL_MEM;
                goto connect_abort;
        }

        if (ini->first_contact_local)
                smc_link_save_peer_info(link, aclc, ini);

        if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) {
                reason_code = SMC_CLC_DECL_ERR_RTOK;
                goto connect_abort;
        }

        smc_close_init(smc);
        smc_rx_init(smc);

        if (ini->first_contact_local) {
                if (smc_ib_ready_link(link)) {
                        reason_code = SMC_CLC_DECL_ERR_RDYLNK;
                        goto connect_abort;
                }
        } else {
                /* reg sendbufs if they were vzalloced */
                if (smc->conn.sndbuf_desc->is_vm) {
                        if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) {
                                reason_code = SMC_CLC_DECL_ERR_REGBUF;
                                goto connect_abort;
                        }
                }
                if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) {
                        reason_code = SMC_CLC_DECL_ERR_REGBUF;
                        goto connect_abort;
                }
        }

        /* for messages newer than V1 the EID of the r1 part is passed on to
         * the confirm; the GID list is only filled on local first contact
         */
        if (aclc->hdr.version > SMC_V1) {
                eid = aclc->r1.eid;
                if (ini->first_contact_local)
                        smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist,
                                          link->smcibdev, link->gid);
        }

        reason_code = smc_clc_send_confirm(smc, ini->first_contact_local,
                                           aclc->hdr.version, eid, ini);
        if (reason_code)
                goto connect_abort;

        smc_tx_init(smc);

        if (ini->first_contact_local) {
                /* QP confirmation over RoCE fabric */
                /* NOTE(review): the return value of smc_llc_flow_initiate()
                 * is not checked here - confirm this is intended
                 */
                smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
                reason_code = smcr_clnt_conf_first_link(smc);
                smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
                if (reason_code)
                        goto connect_abort;
        }
        mutex_unlock(&smc_client_lgr_pending);

        smc_copy_sock_settings_to_clc(smc);
        smc->connect_nonblock = 0;
        if (smc->sk.sk_state == SMC_INIT)
                smc->sk.sk_state = SMC_ACTIVE;

        return 0;
connect_abort:
        /* free the connection before the pending mutex is released */
        smc_conn_abort(smc, ini->first_contact_local);
        mutex_unlock(&smc_client_lgr_pending);
        smc->connect_nonblock = 0;

        return reason_code;
}

/* The server has chosen one of the proposed ISM devices for the communication.
 * Determine from the CHID of the received CLC ACCEPT the ISM device chosen.
 */
static int
smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm *aclc,
                               struct smc_init_info *ini)
{
        int i;

        for (i = 0; i < ini->ism_offered_cnt + 1; i++) {
                if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) {
                        ini->ism_selected = i;
                        return 0;
                }
        }

        return -EPROTO;
}

/* setup for ISM connection of client
 *
 * Evaluates the received CLC accept @aclc, creates the connection and its
 * buffers while holding smc_server_lgr_pending, and answers with a CLC
 * confirm.
 *
 * Returns 0 on success (the socket moves from SMC_INIT to SMC_ACTIVE).
 * Otherwise returns the code of the failing step; buffer creation and buffer
 * attach failures are mapped to SMC_CLC_DECL_MAX_DMB / SMC_CLC_DECL_MEM.
 */
static int smc_connect_ism(struct smc_sock *smc,
                           struct smc_clc_msg_accept_confirm *aclc,
                           struct smc_init_info *ini)
{
        u8 *eid = NULL;
        int rc = 0;

        ini->is_smcd = true;
        ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;

        if (aclc->hdr.version == SMC_V2) {
                if (ini->first_contact_peer) {
                        /* first contact: take the release number from the
                         * first contact extension and validate its features
                         */
                        struct smc_clc_first_contact_ext *fce =
                                smc_get_clc_first_contact_ext(aclc, true);

                        ini->release_nr = fce->release;
                        rc = smc_clc_clnt_v2x_features_validate(fce, ini);
                        if (rc)
                                return rc;
                }

                /* sets ini->ism_selected, which is used as index below */
                rc = smc_v2_determine_accepted_chid(aclc, ini);
                if (rc)
                        return rc;

                if (__smc_ism_is_emulated(ini->ism_chid[ini->ism_selected]))
                        ini->ism_peer_gid[ini->ism_selected].gid_ext =
                                                ntohll(aclc->d1.gid_ext);
                /* for non-Emulated-ISM devices, peer gid_ext remains 0. */
        }
        ini->ism_peer_gid[ini->ism_selected].gid = ntohll(aclc->d0.gid);

        /* there is only one lgr role for SMC-D; use server lock */
        mutex_lock(&smc_server_lgr_pending);
        rc = smc_conn_create(smc, ini);
        if (rc) {
                /* nothing was created, so no smc_conn_abort() on this path */
                mutex_unlock(&smc_server_lgr_pending);
                return rc;
        }

        /* Create send and receive buffers */
        rc = smc_buf_create(smc, true);
        if (rc) {
                rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM;
                goto connect_abort;
        }

        smc_conn_save_peer_info(smc, aclc);

        if (smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd)) {
                rc = smcd_buf_attach(smc);
                if (rc) {
                        rc = SMC_CLC_DECL_MEM;        /* try to fallback */
                        goto connect_abort;
                }
        }
        smc_close_init(smc);
        smc_rx_init(smc);
        smc_tx_init(smc);

        /* an EID is only passed to the confirm for versions above V1 */
        if (aclc->hdr.version > SMC_V1)
                eid = aclc->d1.eid;

        rc = smc_clc_send_confirm(smc, ini->first_contact_local,
                                  aclc->hdr.version, eid, ini);
        if (rc)
                goto connect_abort;
        mutex_unlock(&smc_server_lgr_pending);

        smc_copy_sock_settings_to_clc(smc);
        smc->connect_nonblock = 0;
        if (smc->sk.sk_state == SMC_INIT)
                smc->sk.sk_state = SMC_ACTIVE;

        return 0;
connect_abort:
        /* abort the connection before the pending lock is released */
        smc_conn_abort(smc, ini->first_contact_local);
        mutex_unlock(&smc_server_lgr_pending);
        smc->connect_nonblock = 0;

        return rc;
}

/* check if received accept type and version matches a proposed one */
static int smc_connect_check_aclc(struct smc_init_info *ini,
                                  struct smc_clc_msg_accept_confirm *aclc)
{
        if (aclc->hdr.typev1 != SMC_TYPE_R &&
            aclc->hdr.typev1 != SMC_TYPE_D)
                return SMC_CLC_DECL_MODEUNSUPP;

        if (aclc->hdr.version >= SMC_V2) {
                if ((aclc->hdr.typev1 == SMC_TYPE_R &&
                     !smcr_indicated(ini->smc_type_v2)) ||
                    (aclc->hdr.typev1 == SMC_TYPE_D &&
                     !smcd_indicated(ini->smc_type_v2)))
                        return SMC_CLC_DECL_MODEUNSUPP;
        } else {
                if ((aclc->hdr.typev1 == SMC_TYPE_R &&
                     !smcr_indicated(ini->smc_type_v1)) ||
                    (aclc->hdr.typev1 == SMC_TYPE_D &&
                     !smcd_indicated(ini->smc_type_v1)))
                        return SMC_CLC_DECL_MODEUNSUPP;
        }

        return 0;
}

/* perform steps before actually connecting
 *
 * Runs the client side of the SMC handshake on an already connected clcsock:
 * decides whether SMC can be used at all, finds devices to propose, performs
 * the CLC handshake and finally sets up an SMC-R or SMC-D connection
 * depending on the received accept.
 *
 * Returns 0 when the SMC connection is established. On every other path the
 * result of smc_connect_fallback() / smc_connect_decline_fallback() is
 * returned.
 */
static int __smc_connect(struct smc_sock *smc)
{
        u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1;
        struct smc_clc_msg_accept_confirm *aclc;
        struct smc_init_info *ini = NULL;
        u8 *buf = NULL;
        int rc = 0;

        if (smc->use_fallback)
                return smc_connect_fallback(smc, smc->fallback_rsn);

        /* if peer has not signalled SMC-capability, fall back */
        if (!tcp_sk(smc->clcsock->sk)->syn_smc)
                return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);

        /* IPSec connections opt out of SMC optimizations */
        if (using_ipsec(smc))
                return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC,
                                                    version);

        ini = kzalloc(sizeof(*ini), GFP_KERNEL);
        if (!ini)
                return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM,
                                                    version);

        /* start by offering both versions and both modes */
        ini->smcd_version = SMC_V1 | SMC_V2;
        ini->smcr_version = SMC_V1 | SMC_V2;
        ini->smc_type_v1 = SMC_TYPE_B;
        ini->smc_type_v2 = SMC_TYPE_B;

        /* get vlan id from IP device */
        if (smc_vlan_by_tcpsk(smc->clcsock, ini)) {
                /* without a vlan id only SMC-D V2 stays in the offer */
                ini->smcd_version &= ~SMC_V1;
                ini->smcr_version = 0;
                ini->smc_type_v1 = SMC_TYPE_N;
                if (!ini->smcd_version) {
                        rc = SMC_CLC_DECL_GETVLANERR;
                        goto fallback;
                }
        }

        rc = smc_find_proposal_devices(smc, ini);
        if (rc)
                goto fallback;

        /* receive buffer for the CLC accept message */
        buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL);
        if (!buf) {
                rc = SMC_CLC_DECL_MEM;
                goto fallback;
        }
        aclc = (struct smc_clc_msg_accept_confirm *)buf;

        /* perform CLC handshake */
        rc = smc_connect_clc(smc, aclc, ini);
        if (rc) {
                /* -EAGAIN on timeout, see tcp_recvmsg() */
                if (rc == -EAGAIN) {
                        rc = -ETIMEDOUT;
                        smc->sk.sk_err = ETIMEDOUT;
                }
                goto vlan_cleanup;
        }

        /* check if smc modes and versions of CLC proposal and accept match */
        rc = smc_connect_check_aclc(ini, aclc);
        /* from here on, any decline uses the version from the accept */
        version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2;
        if (rc)
                goto vlan_cleanup;

        /* depending on previous steps, connect using rdma or ism */
        if (aclc->hdr.typev1 == SMC_TYPE_R) {
                ini->smcr_version = version;
                rc = smc_connect_rdma(smc, aclc, ini);
        } else if (aclc->hdr.typev1 == SMC_TYPE_D) {
                ini->smcd_version = version;
                rc = smc_connect_ism(smc, aclc, ini);
        }
        if (rc)
                goto vlan_cleanup;

        SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc);
        smc_connect_ism_vlan_cleanup(smc, ini);
        kfree(buf);
        kfree(ini);
        return 0;

vlan_cleanup:
        /* failure after the accept buffer was allocated */
        smc_connect_ism_vlan_cleanup(smc, ini);
        kfree(buf);
fallback:
        /* failure before the accept buffer existed: only ini to free */
        kfree(ini);
        return smc_connect_decline_fallback(smc, rc, version);
}

/* Work function behind smc->connect_work, which smc_connect() queues for
 * O_NONBLOCK connects.
 *
 * Waits for the clcsock TCP connection to be established, then runs
 * __smc_connect(). Errors are reported through smc->sk.sk_err and waiters
 * are woken through the socket callbacks.
 */
static void smc_connect_work(struct work_struct *work)
{
        struct smc_sock *smc = container_of(work, struct smc_sock,
                                            connect_work);
        long timeo = smc->sk.sk_sndtimeo;
        int rc = 0;

        /* a send timeout of 0 means wait without limit */
        if (!timeo)
                timeo = MAX_SCHEDULE_TIMEOUT;
        lock_sock(smc->clcsock->sk);
        if (smc->clcsock->sk->sk_err) {
                smc->sk.sk_err = smc->clcsock->sk->sk_err;
        } else if ((1 << smc->clcsock->sk->sk_state) &
                                        (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
                rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
                /* -EPIPE is not treated as an error if the clcsock is in
                 * ESTABLISHED or CLOSE_WAIT state after the wait
                 */
                if ((rc == -EPIPE) &&
                    ((1 << smc->clcsock->sk->sk_state) &
                                        (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
                        rc = 0;
        }
        release_sock(smc->clcsock->sk);
        lock_sock(&smc->sk);
        if (rc != 0 || smc->sk.sk_err) {
                /* TCP connect failed: close the smc sock and map rc to
                 * a positive sk_err value
                 */
                smc->sk.sk_state = SMC_CLOSED;
                if (rc == -EPIPE || rc == -EAGAIN)
                        smc->sk.sk_err = EPIPE;
                else if (rc == -ECONNREFUSED)
                        smc->sk.sk_err = ECONNREFUSED;
                else if (signal_pending(current))
                        smc->sk.sk_err = -sock_intr_errno(timeo);
                sock_put(&smc->sk); /* passive closing */
                goto out;
        }

        rc = __smc_connect(smc);
        if (rc < 0)
                smc->sk.sk_err = -rc;

out:
        if (!sock_flag(&smc->sk, SOCK_DEAD)) {
                if (smc->sk.sk_err) {
                        smc->sk.sk_state_change(&smc->sk);
                } else { /* allow polling before and after fallback decision */
                        smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
                        smc->sk.sk_write_space(&smc->sk);
                }
        }
        release_sock(&smc->sk);
}

/* Connect an SMC socket to @addr.
 *
 * The TCP connection is started on the internal clcsock; the SMC handshake
 * then runs either synchronously via __smc_connect() or, for O_NONBLOCK,
 * from the connect_work item.
 *
 * Returns 0 when connected, -EINPROGRESS when a non-blocking connect was
 * started, -EALREADY when a non-blocking connect is still pending,
 * -EISCONN / -EINVAL / -ECONNABORTED (or the pending socket error) for
 * unsuitable socket states, or the error from kernel_connect() /
 * __smc_connect().
 */
int smc_connect(struct socket *sock, struct sockaddr *addr,
                int alen, int flags)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int rc = -EINVAL;

        smc = smc_sk(sk);

        /* separate smc parameter checking to be safe */
        if (alen < sizeof(addr->sa_family))
                goto out_err;
        if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
                goto out_err;

        lock_sock(sk);
        /* first check the state of the struct socket ... */
        switch (sock->state) {
        default:
                rc = -EINVAL;
                goto out;
        case SS_CONNECTED:
                rc = sk->sk_state == SMC_ACTIVE ? -EISCONN : -EINVAL;
                goto out;
        case SS_CONNECTING:
                /* an earlier connect has completed in the meantime */
                if (sk->sk_state == SMC_ACTIVE)
                        goto connected;
                break;
        case SS_UNCONNECTED:
                sock->state = SS_CONNECTING;
                break;
        }

        /* ... then the state of the smc sock; only SMC_INIT may proceed */
        switch (sk->sk_state) {
        default:
                goto out;
        case SMC_CLOSED:
                rc = sock_error(sk) ? : -ECONNABORTED;
                sock->state = SS_UNCONNECTED;
                goto out;
        case SMC_ACTIVE:
                rc = -EISCONN;
                goto out;
        case SMC_INIT:
                break;
        }

        smc_copy_sock_settings_to_clc(smc);
        /* set syn_smc on the clcsock before the TCP connect is started */
        tcp_sk(smc->clcsock->sk)->syn_smc = 1;
        if (smc->connect_nonblock) {
                rc = -EALREADY;
                goto out;
        }
        rc = kernel_connect(smc->clcsock, addr, alen, flags);
        if (rc && rc != -EINPROGRESS)
                goto out;

        if (smc->use_fallback) {
                /* in fallback mode no SMC handshake is run */
                sock->state = rc ? SS_CONNECTING : SS_CONNECTED;
                goto out;
        }
        sock_hold(&smc->sk); /* sock put in passive closing */
        if (flags & O_NONBLOCK) {
                if (queue_work(smc_hs_wq, &smc->connect_work))
                        smc->connect_nonblock = 1;
                rc = -EINPROGRESS;
                goto out;
        } else {
                rc = __smc_connect(smc);
                if (rc < 0)
                        goto out;
        }

connected:
        rc = 0;
        sock->state = SS_CONNECTED;
out:
        release_sock(sk);
out_err:
        return rc;
}

/* Accept a new TCP connection on the listen socket's clcsock and wrap it in
 * a newly allocated smc sock.
 *
 * The lock of the listen sock is released on entry and held again on every
 * return path, so the caller is expected to hold it (assumption from the
 * release/lock pairing here; confirm against the caller).
 *
 * On success *@new_smc points to the new smc sock, whose clcsock is the
 * accepted TCP socket. On failure, or if the listen sock was closed in the
 * meantime, *@new_smc is NULL.
 *
 * Returns the result of kernel_accept(), -ENOMEM if no sock could be
 * allocated, or -EINVAL if the listen sock no longer has a clcsock.
 */
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
        struct socket *new_clcsock = NULL;
        struct sock *lsk = &lsmc->sk;
        struct sock *new_sk;
        int rc = -EINVAL;

        release_sock(lsk);
        new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
        if (!new_sk) {
                rc = -ENOMEM;
                lsk->sk_err = ENOMEM;
                *new_smc = NULL;
                lock_sock(lsk);
                goto out;
        }
        *new_smc = smc_sk(new_sk);

        /* clcsock is only accessed under clcsock_release_lock */
        mutex_lock(&lsmc->clcsock_release_lock);
        if (lsmc->clcsock)
                rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
        mutex_unlock(&lsmc->clcsock_release_lock);
        lock_sock(lsk);
        /* -EAGAIN is not recorded as an error on the listen sock */
        if  (rc < 0 && rc != -EAGAIN)
                lsk->sk_err = -rc;
        if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
                /* dispose of the new sock and of any accepted clcsock */
                new_sk->sk_prot->unhash(new_sk);
                if (new_clcsock)
                        sock_release(new_clcsock);
                new_sk->sk_state = SMC_CLOSED;
                smc_sock_set_flag(new_sk, SOCK_DEAD);
                sock_put(new_sk); /* final */
                *new_smc = NULL;
                goto out;
        }

        /* new clcsock has inherited the smc listen-specific sk_data_ready
         * function; switch it back to the original sk_data_ready function
         */
        new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready;

        /* if new clcsock has also inherited the fallback-specific callback
         * functions, switch them back to the original ones.
         */
        if (lsmc->use_fallback) {
                if (lsmc->clcsk_state_change)
                        new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change;
                if (lsmc->clcsk_write_space)
                        new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space;
                if (lsmc->clcsk_error_report)
                        new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report;
        }

        (*new_smc)->clcsock = new_clcsock;
out:
        return rc;
}

/* Append a freshly created sock to the accept queue of its listen sock, so
 * that a later accept call from user space can pick it up.
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
        struct smc_sock *lsmc = smc_sk(parent);
        struct smc_sock *child = smc_sk(sk);

        /* this reference is dropped again in smc_accept_unlink() */
        sock_hold(sk);

        spin_lock(&lsmc->accept_q_lock);
        list_add_tail(&child->accept_q, &lsmc->accept_q);
        spin_unlock(&lsmc->accept_q_lock);

        sk_acceptq_added(parent);
}

/* Take a sock off the accept queue of the listen sock it belongs to and drop
 * the reference that was taken when it was queued.
 */
static void smc_accept_unlink(struct sock *sk)
{
        struct smc_sock *child = smc_sk(sk);
        struct smc_sock *lsmc = child->listen_smc;

        spin_lock(&lsmc->accept_q_lock);
        list_del_init(&child->accept_q);
        spin_unlock(&lsmc->accept_q_lock);

        sk_acceptq_removed(&child->listen_smc->sk);

        /* pairs with the sock_hold in smc_accept_enqueue */
        sock_put(sk);
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 *
 * Entries that are already in state SMC_CLOSED are unlinked and released
 * along the way. If @new_sock is given, the first usable sock is grafted
 * onto it.
 *
 * Returns the dequeued sock, or NULL if the queue holds no usable entry.
 */
struct sock *smc_accept_dequeue(struct sock *parent,
                                struct socket *new_sock)
{
        struct smc_sock *isk, *n;
        struct sock *new_sk;

        /* the _safe variant is used because entries are unlinked in the loop */
        list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
                new_sk = (struct sock *)isk;

                smc_accept_unlink(new_sk);
                if (new_sk->sk_state == SMC_CLOSED) {
                        /* closed before it was accepted: release it and
                         * look at the next entry
                         */
                        new_sk->sk_prot->unhash(new_sk);
                        if (isk->clcsock) {
                                sock_release(isk->clcsock);
                                isk->clcsock = NULL;
                        }
                        sock_put(new_sk); /* final */
                        continue;
                }
                if (new_sock) {
                        sock_graft(new_sk, new_sock);
                        new_sock->state = SS_CONNECTED;
                        if (isk->use_fallback) {
                                /* in fallback mode the clcsock shares the
                                 * file of the new socket
                                 */
                                smc_sk(new_sk)->clcsock->file = new_sock->file;
                                isk->clcsock->file->private_data = isk->clcsock;
                        }
                }
                return new_sk;
        }
        return NULL;
}

/* clean up for a created but never accepted sock
 *
 * Releases @sk via __smc_release() under the socket lock and then drops the
 * final reference to it.
 */
void smc_close_non_accepted(struct sock *sk)
{
        struct smc_sock *smc = smc_sk(sk);

        sock_hold(sk); /* sock_put below */
        lock_sock(sk);
        /* only set a linger time if none is configured yet */
        if (!sk->sk_lingertime)
                /* wait for peer closing */
                WRITE_ONCE(sk->sk_lingertime, SMC_MAX_STREAM_WAIT_TIMEOUT);
        __smc_release(smc);
        release_sock(sk);
        sock_put(sk); /* sock_hold above */
        sock_put(sk); /* final sock_put */
}

/* Server side: confirm the first link of a link group over the RoCE fabric.
 *
 * Registers the connection buffers on the link, sends a CONFIRM LINK request
 * and waits for the client's response. On success the link is activated and,
 * if the link group allows more than one link, a second link is attempted.
 *
 * Returns 0 on success, otherwise an SMC_CLC_DECL_* reason code or the
 * result of smc_clc_wait_msg().
 */
static int smcr_serv_conf_first_link(struct smc_sock *smc)
{
        struct smc_link *link = smc->conn.lnk;
        struct smc_llc_qentry *qentry;
        int rc;

        /* register the sndbuf if it was vzalloced */
        if (smc->conn.sndbuf_desc->is_vm) {
                if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc))
                        return SMC_CLC_DECL_ERR_REGBUF;
        }

        /* reg the rmb */
        if (smcr_link_reg_buf(link, smc->conn.rmb_desc))
                return SMC_CLC_DECL_ERR_REGBUF;

        /* send CONFIRM LINK request to client over the RoCE fabric */
        rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
        if (rc < 0)
                return SMC_CLC_DECL_TIMEOUT_CL;

        /* receive CONFIRM LINK response from client over the RoCE fabric */
        qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME,
                              SMC_LLC_CONFIRM_LINK);
        if (!qentry) {
                /* no LLC response: wait briefly for a CLC decline instead */
                struct smc_clc_msg_decline dclc;

                rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
                                      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
                return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
        }
        smc_llc_save_peer_uid(qentry);
        rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP);
        /* the queue entry is deleted whether or not evaluation succeeded */
        smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
        if (rc)
                return SMC_CLC_DECL_RMBE_EC;

        /* confirm_rkey is implicit on 1st contact */
        smc->conn.rmb_desc->is_conf_rkey = true;

        smc_llc_link_active(link);
        smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);

        if (link->lgr->max_links > 1) {
                down_write(&link->lgr->llc_conf_mutex);
                /* initial contact - try to establish second link */
                smc_llc_srv_add_link(link, NULL);
                up_write(&link->lgr->llc_conf_mutex);
        }
        return 0;
}

/* listen worker: finish
 *
 * Hands @new_smc over to its listen sock. If the listen sock is still in
 * state SMC_LISTEN the new sock is put on its accept queue, otherwise the new
 * sock is closed. In both cases the listen sock's sk_data_ready callback is
 * invoked and one reference to the listen sock is dropped.
 */
static void smc_listen_out(struct smc_sock *new_smc)
{
        struct smc_sock *lsmc = new_smc->listen_smc;
        struct sock *newsmcsk = &new_smc->sk;

        /* queued_smc_hs is only decremented for connections whose clcsock
         * has syn_smc set (presumably matching an earlier increment made
         * when the handshake was queued; confirm against the listen path)
         */
        if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
                atomic_dec(&lsmc->queued_smc_hs);

        if (lsmc->sk.sk_state == SMC_LISTEN) {
                lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
                smc_accept_enqueue(&lsmc->sk, newsmcsk);
                release_sock(&lsmc->sk);
        } else { /* no longer listening */
                smc_close_non_accepted(newsmcsk);
        }

        /* Wake up accept */
        lsmc->sk.sk_data_ready(&lsmc->sk);
        sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
}

/* listen worker: finish a successful handshake. A sock that is still in
 * state SMC_INIT becomes SMC_ACTIVE before it is handed to the listen sock.
 */
static void smc_listen_out_connected(struct smc_sock *new_smc)
{
        if (new_smc->sk.sk_state == SMC_INIT)
                new_smc->sk.sk_state = SMC_ACTIVE;

        smc_listen_out(new_smc);
}

/* listen worker: finish a failed handshake. Counts the error in the per-net
 * statistics, marks the new sock SMC_CLOSED and hands it to the listen sock.
 */
static void smc_listen_out_err(struct smc_sock *new_smc)
{
        struct sock *sk = &new_smc->sk;
        struct net *net = sock_net(sk);

        this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt);

        /* passive closing: drop a reference if the sock never left SMC_INIT */
        if (sk->sk_state == SMC_INIT)
                sock_put(sk);
        sk->sk_state = SMC_CLOSED;

        smc_listen_out(new_smc);
}

/* listen worker: abort the SMC setup and, where possible, continue the
 * connection in fallback mode.
 *
 * A negative @reason_code or a failed switch to fallback ends in the error
 * path. A CLC decline is sent to the peer for any non-zero reason other than
 * SMC_CLC_DECL_PEERDECL.
 */
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
                               int local_first, u8 version)
{
        /* RDMA setup failed, switch back to TCP */
        smc_conn_abort(new_smc, local_first);

        /* error, no fallback possible */
        if (reason_code < 0)
                goto out_err;
        if (smc_switch_to_fallback(new_smc, reason_code))
                goto out_err;

        if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL &&
            smc_clc_send_decline(new_smc, reason_code, version) < 0)
                goto out_err;

        smc_listen_out_connected(new_smc);
        return;

out_err:
        smc_listen_out_err(new_smc);
}

/* listen worker: version checking
 *
 * Derives from the received proposal @pclc which SMC-D / SMC-R versions can
 * be used and stores them in @ini->smcd_version and @ini->smcr_version. The
 * V2 bits are removed again when a V2 prerequisite (extension, capability,
 * EID) is missing.
 *
 * Returns 0 if at least one version (V1 or V2) of either mode remains,
 * otherwise the SMC_CLC_DECL_* reason that was recorded last.
 */
static int smc_listen_v2_check(struct smc_sock *new_smc,
                               struct smc_clc_msg_proposal *pclc,
                               struct smc_init_info *ini)
{
        struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext;
        struct smc_clc_v2_extension *pclc_v2_ext;
        int rc = SMC_CLC_DECL_PEERNOSMC;

        ini->smc_type_v1 = pclc->hdr.typev1;
        ini->smc_type_v2 = pclc->hdr.typev2;
        ini->smcd_version = smcd_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
        ini->smcr_version = smcr_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
        /* typev2 is only evaluated for proposals newer than V1 */
        if (pclc->hdr.version > SMC_V1) {
                if (smcd_indicated(ini->smc_type_v2))
                        ini->smcd_version |= SMC_V2;
                if (smcr_indicated(ini->smc_type_v2))
                        ini->smcr_version |= SMC_V2;
        }
        /* no V2 mode proposed at all: the V2 checks below are skipped */
        if (!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) {
                rc = SMC_CLC_DECL_PEERNOSMC;
                goto out;
        }
        pclc_v2_ext = smc_get_clc_v2_ext(pclc);
        if (!pclc_v2_ext) {
                /* V2 was announced but the V2 extension is missing */
                ini->smcd_version &= ~SMC_V2;
                ini->smcr_version &= ~SMC_V2;
                rc = SMC_CLC_DECL_NOV2EXT;
                goto out;
        }
        pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext);
        if (ini->smcd_version & SMC_V2) {
                if (!smc_ism_is_v2_capable()) {
                        ini->smcd_version &= ~SMC_V2;
                        rc = SMC_CLC_DECL_NOISM2SUPP;
                } else if (!pclc_smcd_v2_ext) {
                        ini->smcd_version &= ~SMC_V2;
                        rc = SMC_CLC_DECL_NOV2DEXT;
                } else if (!pclc_v2_ext->hdr.eid_cnt &&
                           !pclc_v2_ext->hdr.flag.seid) {
                        /* neither an EID list entry nor the seid flag */
                        ini->smcd_version &= ~SMC_V2;
                        rc = SMC_CLC_DECL_NOUEID;
                }
        }
        if (ini->smcr_version & SMC_V2) {
                if (!pclc_v2_ext->hdr.eid_cnt) {
                        ini->smcr_version &= ~SMC_V2;
                        rc = SMC_CLC_DECL_NOUEID;
                }
        }

        /* use the peer's release number, capped at our own SMC_RELEASE */
        ini->release_nr = pclc_v2_ext->hdr.flag.release;
        if (pclc_v2_ext->hdr.flag.release > SMC_RELEASE)
                ini->release_nr = SMC_RELEASE;

out:
        if (!ini->smcd_version && !ini->smcr_version)
                return rc;

        return 0;
}

/* listen worker: compare the prefix from the proposal with the new clcsock.
 *
 * Returns 0 if typev1 of the proposal is SMC_TYPE_N or the prefix matches,
 * SMC_CLC_DECL_DIFFPREFIX otherwise.
 */
static int smc_listen_prfx_check(struct smc_sock *new_smc,
                                 struct smc_clc_msg_proposal *pclc)
{
        struct smc_clc_msg_proposal_prefix *prefix;

        if (pclc->hdr.typev1 == SMC_TYPE_N)
                return 0;

        prefix = smc_clc_proposal_get_prefix(pclc);

        return smc_clc_prfx_match(new_smc->clcsock, prefix) ?
                SMC_CLC_DECL_DIFFPREFIX : 0;
}

/* listen worker: create the connection (and link group) and its buffers for
 * SMC-R.
 *
 * Returns 0 on success, the error of smc_conn_create(), or SMC_CLC_DECL_MEM
 * when the buffers cannot be created (the connection is aborted then).
 */
static int smc_listen_rdma_init(struct smc_sock *new_smc,
                                struct smc_init_info *ini)
{
        int rc = smc_conn_create(new_smc, ini);

        if (rc)
                return rc;

        /* send buffer and rmb */
        if (!smc_buf_create(new_smc, false))
                return 0;

        smc_conn_abort(new_smc, ini->first_contact_local);
        return SMC_CLC_DECL_MEM;
}

/* listen worker: create the connection and its buffers for SMC-D.
 *
 * Returns 0 on success or the error of smc_conn_create(). If the buffers
 * cannot be created, the connection is aborted and SMC_CLC_DECL_MAX_DMB
 * (for -ENOSPC) or SMC_CLC_DECL_MEM is returned.
 */
static int smc_listen_ism_init(struct smc_sock *new_smc,
                               struct smc_init_info *ini)
{
        int err;

        err = smc_conn_create(new_smc, ini);
        if (err)
                return err;

        /* send and receive buffers */
        err = smc_buf_create(new_smc, true);
        if (!err)
                return 0;

        smc_conn_abort(new_smc, ini->first_contact_local);
        if (err == -ENOSPC)
                return SMC_CLC_DECL_MAX_DMB;
        return SMC_CLC_DECL_MEM;
}

/* Tell whether @smcd is already stored in one of the first @matches entries
 * of @ini->ism_dev.
 */
static bool smc_is_already_selected(struct smcd_dev *smcd,
                                    struct smc_init_info *ini,
                                    int matches)
{
        int idx = 0;

        while (idx < matches) {
                if (ini->ism_dev[idx] == smcd)
                        return true;
                idx++;
        }

        return false;
}

/* Search the ISM device list for a device that matches the proposed CHID and
 * passes the smc_ism_cantalk() check for the proposed GID. The first such
 * device that is not going away and not selected yet is recorded in @ini at
 * index *@matches, and *@matches is incremented.
 */
static void smc_check_ism_v2_match(struct smc_init_info *ini,
                                   u16 proposed_chid,
                                   struct smcd_gid *proposed_gid,
                                   unsigned int *matches)
{
        struct smcd_dev *dev;

        list_for_each_entry(dev, &smcd_dev_list.list, list) {
                if (dev->going_away)
                        continue;
                if (smc_is_already_selected(dev, ini, *matches))
                        continue;
                if (smc_ism_get_chid(dev) != proposed_chid)
                        continue;
                if (smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, dev))
                        continue;

                ini->ism_peer_gid[*matches].gid = proposed_gid->gid;
                /* gid_ext is only stored for Emulated-ISM; a
                 * non-Emulated-ISM's peer gid_ext remains 0.
                 */
                if (__smc_ism_is_emulated(proposed_chid))
                        ini->ism_peer_gid[*matches].gid_ext =
                                                proposed_gid->gid_ext;
                ini->ism_dev[*matches] = dev;
                (*matches)++;
                return;
        }
}

/* Remember @rc in @ini->rc unless a non-zero code is already stored there. */
static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini)
{
        if (ini->rc)
                return;

        ini->rc = rc;
}

/* listen worker: look for a usable SMC-D V2 (ISM) device for a proposal.
 *
 * Collects the local ISM devices that match the GID/CHID entries of the
 * proposal, negotiates the EID and then tries to initialize a connection on
 * each match in turn. On success the function returns with
 * @ini->smcd_version == SMC_V2, @ini->is_smcd set and @ini->ism_selected
 * naming the device. Otherwise the V2 bit is cleared from
 * @ini->smcd_version, @ini->ism_dev[0] is reset and @ini->is_smcd is false.
 *
 * NOTE(review): the V2 extensions are dereferenced without a NULL check
 * here; this appears to rely on the SMC_V2 bit in smcd_version being set only
 * when both extensions were found earlier. Confirm against the caller.
 */
static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc,
                                        struct smc_clc_msg_proposal *pclc,
                                        struct smc_init_info *ini)
{
        struct smc_clc_smcd_v2_extension *smcd_v2_ext;
        struct smc_clc_v2_extension *smc_v2_ext;
        struct smc_clc_msg_smcd *pclc_smcd;
        unsigned int matches = 0;
        struct smcd_gid smcd_gid;
        u8 smcd_version;
        u8 *eid = NULL;
        int i, rc;
        u16 chid;

        if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2))
                goto not_found;

        pclc_smcd = smc_get_clc_msg_smcd(pclc);
        smc_v2_ext = smc_get_clc_v2_ext(pclc);
        smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext);

        /* the device list is walked under its mutex */
        mutex_lock(&smcd_dev_list.mutex);
        if (pclc_smcd->ism.chid) {
                /* check for ISM device matching proposed native ISM device */
                smcd_gid.gid = ntohll(pclc_smcd->ism.gid);
                smcd_gid.gid_ext = 0;
                smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid),
                                       &smcd_gid, &matches);
        }
        for (i = 0; i < smc_v2_ext->hdr.ism_gid_cnt; i++) {
                /* check for ISM devices matching proposed non-native ISM
                 * devices
                 */
                smcd_gid.gid = ntohll(smcd_v2_ext->gidchid[i].gid);
                smcd_gid.gid_ext = 0;
                chid = ntohs(smcd_v2_ext->gidchid[i].chid);
                if (__smc_ism_is_emulated(chid)) {
                        if ((i + 1) == smc_v2_ext->hdr.ism_gid_cnt ||
                            chid != ntohs(smcd_v2_ext->gidchid[i + 1].chid))
                                /* each Emulated-ISM device takes two GID-CHID
                                 * entries and CHID of the second entry repeats
                                 * that of the first entry.
                                 *
                                 * So check if the next GID-CHID entry exists
                                 * and both two entries' CHIDs are the same.
                                 */
                                continue;
                        /* the second entry carries gid_ext; ++i consumes it */
                        smcd_gid.gid_ext =
                                ntohll(smcd_v2_ext->gidchid[++i].gid);
                }
                smc_check_ism_v2_match(ini, chid, &smcd_gid, &matches);
        }
        mutex_unlock(&smcd_dev_list.mutex);

        if (!ini->ism_dev[0]) {
                smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini);
                goto not_found;
        }

        smc_ism_get_system_eid(&eid);
        if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext,
                               smcd_v2_ext->system_eid, eid))
                goto not_found;

        /* separate - outside the smcd_dev_list.lock */
        smcd_version = ini->smcd_version;
        for (i = 0; i < matches; i++) {
                ini->smcd_version = SMC_V2;
                ini->is_smcd = true;
                ini->ism_selected = i;
                rc = smc_listen_ism_init(new_smc, ini);
                if (rc) {
                        smc_find_ism_store_rc(rc, ini);
                        /* try next active ISM device */
                        continue;
                }
                return; /* matching and usable V2 ISM device found */
        }
        /* no V2 ISM device could be initialized */
        ini->smcd_version = smcd_version;        /* restore original value */
        ini->negotiated_eid[0] = 0;

not_found:
        ini->smcd_version &= ~SMC_V2;
        ini->ism_dev[0] = NULL;
        ini->is_smcd = false;
}

/* listen worker: look for a usable SMC-D V1 (ISM) device for a proposal.
 *
 * On success the function returns with @ini->is_smcd set and
 * @ini->ism_selected == 0. Otherwise the return code is recorded via
 * smc_find_ism_store_rc(), the V1 bit is cleared from @ini->smcd_version,
 * @ini->ism_dev[0] is reset and @ini->is_smcd is false.
 */
static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc,
                                        struct smc_clc_msg_proposal *pclc,
                                        struct smc_init_info *ini)
{
        struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc);
        int rc = 0;

        /* only try if ISM V1 is available and was proposed */
        if ((ini->smcd_version & SMC_V1) && smcd_indicated(ini->smc_type_v1)) {
                /* prepare ISM check */
                ini->is_smcd = true;
                ini->ism_peer_gid[0].gid = ntohll(pclc_smcd->ism.gid);
                ini->ism_peer_gid[0].gid_ext = 0;

                rc = smc_find_ism_device(new_smc, ini);
                if (!rc) {
                        ini->ism_selected = 0;
                        rc = smc_listen_ism_init(new_smc, ini);
                        if (!rc)
                                return;        /* V1 ISM device found */
                }
        }

        /* not found */
        smc_find_ism_store_rc(rc, ini);
        ini->smcd_version &= ~SMC_V1;
        ini->ism_dev[0] = NULL;
        ini->is_smcd = false;
}

/* listen worker: register the connection's buffers with the link group.
 *
 * Nothing is registered here when @local_first is set.
 *
 * Returns 0 on success, SMC_CLC_DECL_ERR_REGBUF if a registration fails.
 */
static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first)
{
        struct smc_connection *conn = &new_smc->conn;

        if (local_first)
                return 0;

        /* reg sendbufs if they were vzalloced */
        if (conn->sndbuf_desc->is_vm &&
            smcr_lgr_reg_sndbufs(conn->lnk, conn->sndbuf_desc))
                return SMC_CLC_DECL_ERR_REGBUF;

        if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc))
                return SMC_CLC_DECL_ERR_REGBUF;

        return 0;
}

/* listen worker: probe for an SMC-R V2 device matching the proposal.
 *
 * @new_smc: socket being set up by the listen worker
 * @pclc:    received CLC Proposal message
 * @ini:     init info; updated in place
 *
 * On success ini->smcr_version is set to SMC_V2 and the function returns
 * without clearing ini->smcrv2.ib_dev_v2 (the caller tests that pointer).
 * On any failure the V2 bit is removed from ini->smcr_version,
 * ini->smcrv2.ib_dev_v2 is set to NULL and ini->check_smcrv2 is cleared.
 */
static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc,
                                         struct smc_clc_msg_proposal *pclc,
                                         struct smc_init_info *ini)
{
        struct smc_clc_v2_extension *smc_v2_ext;
        u8 smcr_version;
        int rc;

        if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2))
                goto not_found;

        /* NOTE(review): smc_get_clc_v2_ext() is assumed to be able to return
         * NULL (e.g. proposal without a usable V2 extension). The extension
         * is dereferenced below, so treat NULL as "no V2 device".
         */
        smc_v2_ext = smc_get_clc_v2_ext(pclc);
        if (!smc_v2_ext ||
            !smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL))
                goto not_found;

        /* prepare RDMA check */
        memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN);
        memcpy(ini->peer_gid, smc_v2_ext->roce, SMC_GID_SIZE);
        memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN);
        ini->check_smcrv2 = true;
        ini->smcrv2.clc_sk = new_smc->clcsock->sk;
        ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr;
        ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce);
        rc = smc_find_rdma_device(new_smc, ini);
        if (rc) {
                smc_find_ism_store_rc(rc, ini);
                goto not_found;
        }
        /* without a gateway the peer's MAC is used as next hop */
        if (!ini->smcrv2.uses_gateway)
                memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN);

        /* run the init with V2 only; the saved value is restored on failure */
        smcr_version = ini->smcr_version;
        ini->smcr_version = SMC_V2;
        rc = smc_listen_rdma_init(new_smc, ini);
        if (!rc) {
                rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local);
                if (rc)
                        smc_conn_abort(new_smc, ini->first_contact_local);
        }
        if (!rc)
                return;
        ini->smcr_version = smcr_version;
        smc_find_ism_store_rc(rc, ini);

not_found:
        ini->smcr_version &= ~SMC_V2;
        ini->smcrv2.ib_dev_v2 = NULL;
        ini->check_smcrv2 = false;
}

/* listen worker: probe for an SMC-R V1 device matching the proposal.
 *
 * Returns 0 when the RDMA init and buffer registration both succeed,
 * SMC_CLC_DECL_NOSMCDEV when V1 is not enabled/indicated or no RDMA device
 * is found, otherwise the code of the failing init/registration step.
 */
static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc,
                                        struct smc_clc_msg_proposal *pclc,
                                        struct smc_init_info *ini)
{
        int err;

        if (!(ini->smcr_version & SMC_V1))
                return SMC_CLC_DECL_NOSMCDEV;
        if (!smcr_indicated(ini->smc_type_v1))
                return SMC_CLC_DECL_NOSMCDEV;

        /* prepare RDMA check: take the peer's identification from the proposal */
        memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN);
        memcpy(ini->peer_gid, pclc->lcl.gid, SMC_GID_SIZE);
        memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN);

        /* any lookup failure is reported as "no RDMA device found" */
        if (smc_find_rdma_device(new_smc, ini))
                return SMC_CLC_DECL_NOSMCDEV;

        err = smc_listen_rdma_init(new_smc, ini);
        if (!err)
                err = smc_listen_rdma_reg(new_smc, ini->first_contact_local);
        return err;
}

/* determine the local device matching to proposal
 *
 * Probes in this order: ISM V2, ISM V1, RDMA V2, RDMA V1. The V1 variants
 * are only tried when the IP prefix check passed. Returns 0 as soon as one
 * probe leaves a device behind, otherwise a decline reason code.
 */
static int smc_listen_find_device(struct smc_sock *new_smc,
                                  struct smc_clc_msg_proposal *pclc,
                                  struct smc_init_info *ini)
{
        int prfx_rc;
        int v1_rc;

        /* check for ISM device matching V2 proposed device */
        smc_find_ism_v2_device_serv(new_smc, pclc, ini);
        if (ini->ism_dev[0])
                return 0;

        /* check for matching IP prefix and subnet length (V1) */
        prfx_rc = smc_listen_prfx_check(new_smc, pclc);
        if (prfx_rc)
                smc_find_ism_store_rc(prfx_rc, ini);

        /* get vlan id from IP device */
        if (smc_vlan_by_tcpsk(new_smc->clcsock, ini))
                return ini->rc ? ini->rc : SMC_CLC_DECL_GETVLANERR;

        /* check for ISM device matching V1 proposed device */
        if (!prfx_rc)
                smc_find_ism_v1_device_serv(new_smc, pclc, ini);
        if (ini->ism_dev[0])
                return 0;

        /* skip RDMA and decline when the header indicates no SMC-R at all */
        if (!(smcr_indicated(pclc->hdr.typev1) ||
              smcr_indicated(pclc->hdr.typev2)))
                return ini->rc ? ini->rc : SMC_CLC_DECL_NOSMCDDEV;

        /* check if RDMA V2 is available */
        smc_find_rdma_v2_device_serv(new_smc, pclc, ini);
        if (ini->smcrv2.ib_dev_v2)
                return 0;

        /* RDMA V1 needs a passed prefix check */
        if (prfx_rc)
                return prfx_rc;

        /* check if RDMA V1 is available */
        v1_rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini);
        smc_find_ism_store_rc(v1_rc, ini);
        return v1_rc ? ini->rc : 0;
}

/* listen worker: finish RDMA setup
 *
 * Handles the rtoken from the CLC Confirm in @cclc. When @local_first is
 * true it additionally saves the peer info on the link, readies the link and
 * confirms the first link inside an ADD_LINK LLC flow.
 * Returns 0 or a decline reason code.
 */
static int smc_listen_rdma_finish(struct smc_sock *new_smc,
                                  struct smc_clc_msg_accept_confirm *cclc,
                                  bool local_first,
                                  struct smc_init_info *ini)
{
        struct smc_link *lnk = new_smc->conn.lnk;
        int rc;

        if (local_first)
                smc_link_save_peer_info(lnk, cclc, ini);

        if (smc_rmb_rtoken_handling(&new_smc->conn, lnk, cclc))
                return SMC_CLC_DECL_ERR_RTOK;

        /* remaining steps apply to a locally created first contact only */
        if (!local_first)
                return 0;

        if (smc_ib_ready_link(lnk))
                return SMC_CLC_DECL_ERR_RDYLNK;

        /* QP confirmation over RoCE fabric */
        smc_llc_flow_initiate(lnk->lgr, SMC_LLC_FLOW_ADD_LINK);
        rc = smcr_serv_conf_first_link(new_smc);
        smc_llc_flow_stop(lnk->lgr, &lnk->lgr->llc_flow_lcl);

        return rc;
}

/* setup for connection of server
 *
 * Work handler behind smc_sock::smc_listen_work of a newly accepted socket.
 * It receives the peer's CLC Proposal, selects an ISM or RoCE device, sends
 * the CLC Accept and waits for the CLC Confirm. Once past the initial
 * listen-state, fallback and syn_smc checks, every failure ends in
 * smc_listen_decline() with the reason code collected in rc.
 *
 * Locking: smc_server_lgr_pending is taken before the device search. For
 * SMC-D it is dropped right after the Accept was sent, for SMC-R only after
 * smc_listen_rdma_finish() succeeded; error paths that still hold it leave
 * through out_unlock.
 */
static void smc_listen_work(struct work_struct *work)
{
        struct smc_sock *new_smc = container_of(work, struct smc_sock,
                                                smc_listen_work);
        struct socket *newclcsock = new_smc->clcsock;
        struct smc_clc_msg_accept_confirm *cclc;
        struct smc_clc_msg_proposal_area *buf;
        struct smc_clc_msg_proposal *pclc;
        struct smc_init_info *ini = NULL;
        u8 proposal_version = SMC_V1;
        u8 accept_version;
        int rc = 0;

        /* the listening socket is no longer in SMC_LISTEN state: give up */
        if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
                return smc_listen_out_err(new_smc);

        /* already in fallback mode: no CLC handshake needed */
        if (new_smc->use_fallback) {
                smc_listen_out_connected(new_smc);
                return;
        }

        /* check if peer is smc capable */
        if (!tcp_sk(newclcsock->sk)->syn_smc) {
                rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC);
                if (rc)
                        smc_listen_out_err(new_smc);
                else
                        smc_listen_out_connected(new_smc);
                return;
        }

        /* do inband token exchange -
         * wait for and receive SMC Proposal CLC message
         */
        buf = kzalloc(sizeof(*buf), GFP_KERNEL);
        if (!buf) {
                rc = SMC_CLC_DECL_MEM;
                goto out_decl;
        }
        pclc = (struct smc_clc_msg_proposal *)buf;
        rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf),
                              SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
        if (rc)
                goto out_decl;

        /* remember the proposed version for a possible decline */
        if (pclc->hdr.version > SMC_V1)
                proposal_version = SMC_V2;

        /* IPSec connections opt out of SMC optimizations */
        if (using_ipsec(new_smc)) {
                rc = SMC_CLC_DECL_IPSEC;
                goto out_decl;
        }

        ini = kzalloc(sizeof(*ini), GFP_KERNEL);
        if (!ini) {
                rc = SMC_CLC_DECL_MEM;
                goto out_decl;
        }

        /* initial version checking */
        rc = smc_listen_v2_check(new_smc, pclc, ini);
        if (rc)
                goto out_decl;

        rc = smc_clc_srv_v2x_features_validate(new_smc, pclc, ini);
        if (rc)
                goto out_decl;

        mutex_lock(&smc_server_lgr_pending);
        smc_close_init(new_smc);
        smc_rx_init(new_smc);
        smc_tx_init(new_smc);

        /* determine ISM or RoCE device used for connection */
        rc = smc_listen_find_device(new_smc, pclc, ini);
        if (rc)
                goto out_unlock;

        /* send SMC Accept CLC message */
        accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version;
        rc = smc_clc_send_accept(new_smc, ini->first_contact_local,
                                 accept_version, ini->negotiated_eid, ini);
        if (rc)
                goto out_unlock;

        /* SMC-D does not need this lock any more */
        if (ini->is_smcd)
                mutex_unlock(&smc_server_lgr_pending);

        /* receive SMC Confirm CLC message; buf is reused for the reply */
        memset(buf, 0, sizeof(*buf));
        cclc = (struct smc_clc_msg_accept_confirm *)buf;
        rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf),
                              SMC_CLC_CONFIRM, CLC_WAIT_TIME);
        if (rc) {
                /* only the SMC-R path still holds the mutex here */
                if (!ini->is_smcd)
                        goto out_unlock;
                goto out_decl;
        }

        rc = smc_clc_v2x_features_confirm_check(cclc, ini);
        if (rc) {
                /* only the SMC-R path still holds the mutex here */
                if (!ini->is_smcd)
                        goto out_unlock;
                goto out_decl;
        }

        /* fce smc release version is needed in smc_listen_rdma_finish,
         * so save fce info here.
         */
        smc_conn_save_peer_info_fce(new_smc, cclc);

        /* finish worker */
        if (!ini->is_smcd) {
                rc = smc_listen_rdma_finish(new_smc, cclc,
                                            ini->first_contact_local, ini);
                if (rc)
                        goto out_unlock;
                mutex_unlock(&smc_server_lgr_pending);
        }
        smc_conn_save_peer_info(new_smc, cclc);

        /* SMC-D on a device supporting DMB nocopy: attach the buffer */
        if (ini->is_smcd &&
            smc_ism_support_dmb_nocopy(new_smc->conn.lgr->smcd)) {
                rc = smcd_buf_attach(new_smc);
                if (rc)
                        goto out_decl;
        }

        smc_listen_out_connected(new_smc);
        SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini);
        goto out_free;

out_unlock:
        /* reached only while smc_server_lgr_pending is still held */
        mutex_unlock(&smc_server_lgr_pending);
out_decl:
        /* ini is still NULL when failing before its allocation */
        smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0,
                           proposal_version);
out_free:
        /* either pointer may be NULL here */
        kfree(ini);
        kfree(buf);
}

/* Work handler behind smc_sock::tcp_listen_work of a listening socket.
 *
 * While the socket stays in SMC_LISTEN state, accepts connections from the
 * CLC socket and queues one smc_listen_work per new socket on smc_hs_wq.
 * Drops the reference taken in smc_clcsock_data_ready() before returning.
 */
static void smc_tcp_listen_work(struct work_struct *work)
{
        struct smc_sock *lsmc = container_of(work, struct smc_sock,
                                             tcp_listen_work);
        struct sock *lsk = &lsmc->sk;
        struct smc_sock *child;

        lock_sock(lsk);
        while (lsk->sk_state == SMC_LISTEN) {
                /* non-zero: clcsock accept queue empty or error */
                if (smc_clcsock_accept(lsmc, &child))
                        break;
                if (!child)
                        continue;

                /* count handshakes of peers that signalled SMC in the SYN */
                if (tcp_sk(child->clcsock->sk)->syn_smc)
                        atomic_inc(&lsmc->queued_smc_hs);

                child->listen_smc = lsmc;
                child->use_fallback = lsmc->use_fallback;
                child->fallback_rsn = lsmc->fallback_rsn;
                sock_hold(lsk); /* sock_put in smc_listen_work */
                INIT_WORK(&child->smc_listen_work, smc_listen_work);
                smc_copy_sock_settings_to_smc(child);
                sock_hold(&child->sk); /* sock_put in passive closing */
                /* work was already queued: give the extra reference back */
                if (!queue_work(smc_hs_wq, &child->smc_listen_work))
                        sock_put(&child->sk);
        }

        release_sock(lsk);
        sock_put(lsk); /* sock_hold in smc_clcsock_data_ready() */
}

/* sk_data_ready replacement installed on the listening CLC socket.
 *
 * Calls the saved original callback and, while the SMC socket is in
 * SMC_LISTEN state, schedules tcp_listen_work on smc_tcp_ls_wq. Does nothing
 * when no SMC socket is attached to @listen_clcsock.
 */
static void smc_clcsock_data_ready(struct sock *listen_clcsock)
{
        struct smc_sock *lsmc;

        read_lock_bh(&listen_clcsock->sk_callback_lock);
        lsmc = smc_clcsock_user_data(listen_clcsock);
        if (lsmc) {
                lsmc->clcsk_data_ready(listen_clcsock);
                if (lsmc->sk.sk_state == SMC_LISTEN) {
                        /* sock_put in smc_tcp_listen_work() */
                        sock_hold(&lsmc->sk);
                        /* already queued: drop the reference again */
                        if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work))
                                sock_put(&lsmc->sk);
                }
        }
        read_unlock_bh(&listen_clcsock->sk_callback_lock);
}

/* listen() handler for SMC sockets.
 *
 * Allowed only from SMC_INIT or SMC_LISTEN state, on an unconnected socket
 * without a pending nonblocking connect; otherwise -EINVAL is returned.
 * A socket that is already listening only gets its backlog updated.
 * For a first listen the CLC socket is prepared (data_ready callback and
 * af_ops replaced) and put into listen state via kernel_listen().
 *
 * Returns 0 on success, -EINVAL, or the error from kernel_listen().
 */
int smc_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int rc;

        smc = smc_sk(sk);
        lock_sock(sk);

        rc = -EINVAL;
        if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) ||
            smc->connect_nonblock || sock->state != SS_UNCONNECTED)
                goto out;

        rc = 0;
        /* already listening: only the backlog changes */
        if (sk->sk_state == SMC_LISTEN) {
                sk->sk_max_ack_backlog = backlog;
                goto out;
        }
        /* some socket options are handled in core, so we could not apply
         * them to the clc socket -- copy smc socket options to clc socket
         */
        smc_copy_sock_settings_to_clc(smc);
        if (!smc->use_fallback)
                tcp_sk(smc->clcsock->sk)->syn_smc = 1;

        /* save original sk_data_ready function and establish
         * smc-specific sk_data_ready function
         */
        write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
        smc->clcsock->sk->sk_user_data =
                (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
        smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready,
                               smc_clcsock_data_ready, &smc->clcsk_data_ready);
        write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);

        /* save original ops */
        smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops;

        /* work on a private copy so only syn_recv_sock is overridden */
        smc->af_ops = *smc->ori_af_ops;
        smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock;

        inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops;

        if (smc->limit_smc_hs)
                tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;

        rc = kernel_listen(smc->clcsock, backlog);
        if (rc) {
                /* undo the callback and user data setup from above */
                write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
                smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready,
                                       &smc->clcsk_data_ready);
                smc->clcsock->sk->sk_user_data = NULL;
                write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);
                goto out;
        }
        sk->sk_max_ack_backlog = backlog;
        sk->sk_ack_backlog = 0;
        sk->sk_state = SMC_LISTEN;

out:
        release_sock(sk);
        return rc;
}

/* accept() handler for SMC sockets.
 *
 * Waits (subject to the receive timeout and O_NONBLOCK in arg->flags) until
 * smc_accept_dequeue() hands out a new socket for @new_sock. If
 * sockopt_defer_accept is set on the listener and the call is blocking, it
 * additionally waits for data on the new socket, bounded by that value
 * scaled with MSEC_PER_SEC.
 *
 * Returns 0 on success, -EINVAL if @sock is not in SMC_LISTEN state,
 * -EAGAIN when the timeout is exhausted, the sock_intr_errno() value when
 * interrupted by a signal, or the result of sock_error() on the new socket.
 */
int smc_accept(struct socket *sock, struct socket *new_sock,
               struct proto_accept_arg *arg)
{
        struct sock *sk = sock->sk, *nsk;
        DECLARE_WAITQUEUE(wait, current);
        struct smc_sock *lsmc;
        long timeo;
        int rc = 0;

        lsmc = smc_sk(sk);
        sock_hold(sk); /* sock_put below */
        lock_sock(sk);

        if (lsmc->sk.sk_state != SMC_LISTEN) {
                rc = -EINVAL;
                release_sock(sk);
                goto out;
        }

        /* Wait for an incoming connection */
        timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
        add_wait_queue_exclusive(sk_sleep(sk), &wait);
        while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (!timeo) {
                        rc = -EAGAIN;
                        break;
                }
                /* sleep without holding the socket lock */
                release_sock(sk);
                timeo = schedule_timeout(timeo);
                /* wakeup by sk_data_ready in smc_listen_work() */
                sched_annotate_sleep();
                lock_sock(sk);
                if (signal_pending(current)) {
                        rc = sock_intr_errno(timeo);
                        break;
                }
        }
        set_current_state(TASK_RUNNING);
        remove_wait_queue(sk_sleep(sk), &wait);

        /* rc == 0 means the loop ended with a dequeued socket in nsk */
        if (!rc)
                rc = sock_error(nsk);
        release_sock(sk);
        if (rc)
                goto out;

        if (lsmc->sockopt_defer_accept && !(arg->flags & O_NONBLOCK)) {
                /* wait till data arrives on the socket */
                timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
                                                                MSEC_PER_SEC);
                if (smc_sk(nsk)->use_fallback) {
                        /* fallback: data arrives on the CLC socket */
                        struct sock *clcsk = smc_sk(nsk)->clcsock->sk;

                        lock_sock(clcsk);
                        if (skb_queue_empty(&clcsk->sk_receive_queue))
                                sk_wait_data(clcsk, &timeo, NULL);
                        release_sock(clcsk);
                } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
                        lock_sock(nsk);
                        smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
                        release_sock(nsk);
                }
        }

out:
        sock_put(sk); /* sock_hold above */
        return rc;
}

int smc_getname(struct socket *sock, struct sockaddr *addr,
                int peer)
{
        struct smc_sock *smc;

        if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
            (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
                return -ENOTCONN;

        smc = smc_sk(sock->sk);

        return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
}

/* sendmsg() handler for SMC sockets.
 *
 * MSG_FASTOPEN is not supported by SMC: it forces a switch to fallback when
 * the socket is still in SMC_INIT state without a pending nonblocking
 * connect, and yields -EINVAL otherwise. Without MSG_FASTOPEN, sending is
 * only allowed in SMC_ACTIVE, SMC_APPCLOSEWAIT1 and SMC_INIT state (-EPIPE
 * otherwise). Data goes through the CLC socket in fallback mode, else
 * through smc_tx_sendmsg().
 */
int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc = smc_sk(sk);
        int rc;

        lock_sock(sk);

        /* SMC does not support connect with fastopen */
        if (msg->msg_flags & MSG_FASTOPEN) {
                if (sk->sk_state != SMC_INIT || smc->connect_nonblock) {
                        rc = -EINVAL;
                        goto out;
                }
                /* not connected yet, fallback */
                rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
                if (rc)
                        goto out;
        } else if (!(sk->sk_state == SMC_ACTIVE ||
                     sk->sk_state == SMC_APPCLOSEWAIT1 ||
                     sk->sk_state == SMC_INIT)) {
                rc = -EPIPE;
                goto out;
        }

        if (!smc->use_fallback) {
                rc = smc_tx_sendmsg(smc, msg, len);
                SMC_STAT_TX_PAYLOAD(smc, len, rc);
        } else {
                rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
        }
out:
        release_sock(sk);
        return rc;
}

/* recvmsg() handler for SMC sockets.
 *
 * Returns 0 for a closed socket with RCV_SHUTDOWN set and for
 * SMC_PEERFINCLOSEWAIT state, -ENOTCONN in SMC_INIT, SMC_LISTEN and (other)
 * SMC_CLOSED state. Otherwise data is read from the CLC socket in fallback
 * mode or via smc_rx_recvmsg().
 */
int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                int flags)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc = smc_sk(sk);
        int rc;

        lock_sock(sk);
        if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
                /* socket was connected before, no more data to read */
                rc = 0;
        } else if (sk->sk_state == SMC_INIT ||
                   sk->sk_state == SMC_LISTEN ||
                   sk->sk_state == SMC_CLOSED) {
                rc = -ENOTCONN;
        } else if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
                rc = 0;
        } else if (smc->use_fallback) {
                rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
        } else {
                msg->msg_namelen = 0;
                rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
                SMC_STAT_RX_PAYLOAD(smc, rc, rc);
        }
        release_sock(sk);

        return rc;
}

static __poll_t smc_accept_poll(struct sock *parent)
{
        struct smc_sock *isk = smc_sk(parent);
        __poll_t mask = 0;

        spin_lock(&isk->accept_q_lock);
        if (!list_empty(&isk->accept_q))
                mask = EPOLLIN | EPOLLRDNORM;
        spin_unlock(&isk->accept_q_lock);

        return mask;
}

/* poll() handler for SMC sockets.
 *
 * In fallback mode the request is delegated to the CLC socket and its sk_err
 * is mirrored into the SMC socket. Otherwise the event mask is built from
 * the SMC socket and connection state:
 *  - EPOLLERR when sk_err is set,
 *  - EPOLLHUP when fully shut down or closed,
 *  - listening sockets report pending accepts via smc_accept_poll(),
 *  - connected sockets report writability from sndbuf_space and readability
 *    from bytes_to_rcv, plus shutdown and urgent-data indications.
 *
 * Returns the event mask, or EPOLLNVAL when @sock has no sk.
 */
__poll_t smc_poll(struct file *file, struct socket *sock,
                  poll_table *wait)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        __poll_t mask = 0;

        if (!sk)
                return EPOLLNVAL;

        smc = smc_sk(sock->sk);
        if (smc->use_fallback) {
                /* delegate to CLC child sock */
                mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
                sk->sk_err = smc->clcsock->sk->sk_err;
        } else {
                if (sk->sk_state != SMC_CLOSED)
                        sock_poll_wait(file, sock, wait);
                if (sk->sk_err)
                        mask |= EPOLLERR;
                if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
                    (sk->sk_state == SMC_CLOSED))
                        mask |= EPOLLHUP;
                if (sk->sk_state == SMC_LISTEN) {
                        /* woken up by sk_data_ready in smc_listen_work() */
                        mask |= smc_accept_poll(sk);
                } else if (smc->use_fallback) { /* as result of connect_work()*/
                        mask |= smc->clcsock->ops->poll(file, smc->clcsock,
                                                           wait);
                        sk->sk_err = smc->clcsock->sk->sk_err;
                } else {
                        if ((sk->sk_state != SMC_INIT &&
                             atomic_read(&smc->conn.sndbuf_space)) ||
                            sk->sk_shutdown & SEND_SHUTDOWN) {
                                mask |= EPOLLOUT | EPOLLWRNORM;
                        } else {
                                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

                                if (sk->sk_state != SMC_INIT) {
                                        /* Race breaker, as in tcp_poll():
                                         * space may have been freed between
                                         * the check above and setting
                                         * SOCK_NOSPACE, in which case no
                                         * wakeup would follow. Look again
                                         * once the flag is visible.
                                         */
                                        smp_mb__after_atomic();
                                        if (atomic_read(&smc->conn.sndbuf_space))
                                                mask |= EPOLLOUT | EPOLLWRNORM;
                                }
                        }
                        if (atomic_read(&smc->conn.bytes_to_rcv))
                                mask |= EPOLLIN | EPOLLRDNORM;
                        if (sk->sk_shutdown & RCV_SHUTDOWN)
                                mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
                        if (sk->sk_state == SMC_APPCLOSEWAIT1)
                                mask |= EPOLLIN;
                        if (smc->conn.urg_state == SMC_URG_VALID)
                                mask |= EPOLLPRI;
                }
        }

        return mask;
}

/* shutdown() handler for SMC sockets.
 *
 * @how must be within SHUT_RD..SHUT_RDWR, otherwise -EINVAL is returned
 * without taking the socket lock. Sockets outside the connected/closing
 * states listed below yield -ENOTCONN. In fallback mode the request is
 * passed to the CLC socket only. Otherwise the SMC close handling runs
 * first and the CLC socket is shut down afterwards (unless suppressed, see
 * do_shutdown).
 *
 * Returns the SMC-level result if non-zero, else the result of the CLC
 * socket shutdown.
 */
int smc_shutdown(struct socket *sock, int how)
{
        struct sock *sk = sock->sk;
        bool do_shutdown = true;
        struct smc_sock *smc;
        int rc = -EINVAL;
        int old_state;
        int rc1 = 0;

        smc = smc_sk(sk);

        if ((how < SHUT_RD) || (how > SHUT_RDWR))
                return rc;

        lock_sock(sk);

        /* bring a socket state left at SS_CONNECTING in line with sk_state */
        if (sock->state == SS_CONNECTING) {
                if (sk->sk_state == SMC_ACTIVE)
                        sock->state = SS_CONNECTED;
                else if (sk->sk_state == SMC_PEERCLOSEWAIT1 ||
                         sk->sk_state == SMC_PEERCLOSEWAIT2 ||
                         sk->sk_state == SMC_APPCLOSEWAIT1 ||
                         sk->sk_state == SMC_APPCLOSEWAIT2 ||
                         sk->sk_state == SMC_APPFINCLOSEWAIT)
                        sock->state = SS_DISCONNECTING;
        }

        rc = -ENOTCONN;
        if ((sk->sk_state != SMC_ACTIVE) &&
            (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
            (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
            (sk->sk_state != SMC_APPCLOSEWAIT1) &&
            (sk->sk_state != SMC_APPCLOSEWAIT2) &&
            (sk->sk_state != SMC_APPFINCLOSEWAIT))
                goto out;
        if (smc->use_fallback) {
                rc = kernel_sock_shutdown(smc->clcsock, how);
                /* mirror the CLC socket's shutdown state */
                sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
                if (sk->sk_shutdown == SHUTDOWN_MASK) {
                        sk->sk_state = SMC_CLOSED;
                        sk->sk_socket->state = SS_UNCONNECTED;
                        /* NOTE(review): the matching sock_hold() is not
                         * visible in this function - confirm the pairing
                         */
                        sock_put(sk);
                }
                goto out;
        }
        switch (how) {
        case SHUT_RDWR:                /* shutdown in both directions */
                old_state = sk->sk_state;
                rc = smc_close_active(smc);
                /* skip the CLC socket shutdown for this state transition */
                if (old_state == SMC_ACTIVE &&
                    sk->sk_state == SMC_PEERCLOSEWAIT1)
                        do_shutdown = false;
                break;
        case SHUT_WR:
                rc = smc_close_shutdown_write(smc);
                break;
        case SHUT_RD:
                rc = 0;
                /* nothing more to do because peer is not involved */
                break;
        }
        if (do_shutdown && smc->clcsock)
                rc1 = kernel_sock_shutdown(smc->clcsock, how);
        /* map sock_shutdown_cmd constants to sk_shutdown value range */
        sk->sk_shutdown |= how + 1;

        if (sk->sk_state == SMC_CLOSED)
                sock->state = SS_UNCONNECTED;
        else
                sock->state = SS_DISCONNECTING;
out:
        release_sock(sk);
        /* the SMC-level result takes precedence over the CLC socket result */
        return rc ? rc : rc1;
}

/* getsockopt() for SOL_SMC level options.
 *
 * Only SMC_LIMIT_HS is supported. At most sizeof(int) bytes are copied out
 * and the used length is written back to @optlen.
 * Returns 0, -EFAULT on user access errors, -EINVAL for a negative length
 * or -EOPNOTSUPP for any other option.
 */
static int __smc_getsockopt(struct socket *sock, int level, int optname,
                            char __user *optval, int __user *optlen)
{
        struct smc_sock *smc = smc_sk(sock->sk);
        int val, len;

        if (get_user(len, optlen))
                return -EFAULT;

        if (len < 0)
                return -EINVAL;
        /* never copy out more than the int value */
        len = min_t(int, len, sizeof(int));

        if (optname != SMC_LIMIT_HS)
                return -EOPNOTSUPP;
        val = smc->limit_smc_hs;

        if (put_user(len, optlen) || copy_to_user(optval, &val, len))
                return -EFAULT;

        return 0;
}

/* setsockopt() for SOL_SMC level options.
 *
 * Only SMC_LIMIT_HS is supported; any non-zero int value enables it.
 * Returns 0, -EINVAL for a too short @optlen, -EFAULT when the value cannot
 * be copied in, or -EOPNOTSUPP for any other option.
 */
static int __smc_setsockopt(struct socket *sock, int level, int optname,
                            sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc = smc_sk(sk);
        int val, rc;

        lock_sock(sk);
        if (optname != SMC_LIMIT_HS) {
                rc = -EOPNOTSUPP;
        } else if (optlen < sizeof(int)) {
                rc = -EINVAL;
        } else if (copy_from_sockptr(&val, optval, sizeof(int))) {
                rc = -EFAULT;
        } else {
                /* stored as a boolean */
                smc->limit_smc_hs = !!val;
                rc = 0;
        }
        release_sock(sk);

        return rc;
}

/* setsockopt() handler for SMC sockets.
 *
 * TCP_ULP on SOL_TCP is rejected with -EOPNOTSUPP; SOL_SMC options are
 * handled by __smc_setsockopt(). Every other option is first forwarded to
 * the CLC socket (under clcsock_release_lock) and then, if that succeeded
 * and the socket is not in fallback mode, a few TCP options get additional
 * SMC-specific treatment under the socket lock.
 *
 * Returns 0 or a negative errno; -EBADF when the CLC socket is gone.
 */
int smc_setsockopt(struct socket *sock, int level, int optname,
                   sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int val, rc;

        if (level == SOL_TCP && optname == TCP_ULP)
                return -EOPNOTSUPP;
        else if (level == SOL_SMC)
                return __smc_setsockopt(sock, level, optname, optval, optlen);

        smc = smc_sk(sk);

        /* generic setsockopts reaching us here always apply to the
         * CLC socket
         */
        mutex_lock(&smc->clcsock_release_lock);
        if (!smc->clcsock) {
                mutex_unlock(&smc->clcsock_release_lock);
                return -EBADF;
        }
        if (unlikely(!smc->clcsock->ops->setsockopt))
                rc = -EOPNOTSUPP;
        else
                rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
                                                   optval, optlen);
        /* propagate an error pending on the CLC socket to the SMC socket */
        if (smc->clcsock->sk->sk_err) {
                sk->sk_err = smc->clcsock->sk->sk_err;
                sk_error_report(sk);
        }
        mutex_unlock(&smc->clcsock_release_lock);

        /* NOTE(review): these checks run after the option was already
         * forwarded above, so a short or unreadable optval returns
         * -EINVAL/-EFAULT here regardless of the forwarded call's result.
         */
        if (optlen < sizeof(int))
                return -EINVAL;
        if (copy_from_sockptr(&val, optval, sizeof(int)))
                return -EFAULT;

        lock_sock(sk);
        /* no SMC-specific handling after an error or in fallback mode */
        if (rc || smc->use_fallback)
                goto out;
        switch (optname) {
        case TCP_FASTOPEN:
        case TCP_FASTOPEN_CONNECT:
        case TCP_FASTOPEN_KEY:
        case TCP_FASTOPEN_NO_COOKIE:
                /* option not supported by SMC */
                if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
                        rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
                } else {
                        rc = -EINVAL;
                }
                break;
        case TCP_NODELAY:
                if (sk->sk_state != SMC_INIT &&
                    sk->sk_state != SMC_LISTEN &&
                    sk->sk_state != SMC_CLOSED) {
                        /* enabling NODELAY: send pending data right away */
                        if (val) {
                                SMC_STAT_INC(smc, ndly_cnt);
                                smc_tx_pending(&smc->conn);
                                cancel_delayed_work(&smc->conn.tx_work);
                        }
                }
                break;
        case TCP_CORK:
                if (sk->sk_state != SMC_INIT &&
                    sk->sk_state != SMC_LISTEN &&
                    sk->sk_state != SMC_CLOSED) {
                        /* clearing CORK: send pending data right away */
                        if (!val) {
                                SMC_STAT_INC(smc, cork_cnt);
                                smc_tx_pending(&smc->conn);
                                cancel_delayed_work(&smc->conn.tx_work);
                        }
                }
                break;
        case TCP_DEFER_ACCEPT:
                /* remembered for use in smc_accept() */
                smc->sockopt_defer_accept = val;
                break;
        default:
                break;
        }
out:
        release_sock(sk);

        return rc;
}

/* getsockopt() handler for SMC sockets.
 *
 * SOL_SMC options are served by __smc_getsockopt(); everything else is read
 * from the CLC socket while holding clcsock_release_lock.
 * Returns -EBADF when the CLC socket is gone and -EOPNOTSUPP when it has no
 * getsockopt operation.
 */
int smc_getsockopt(struct socket *sock, int level, int optname,
                   char __user *optval, int __user *optlen)
{
        struct smc_sock *smc;
        int rc;

        if (level == SOL_SMC)
                return __smc_getsockopt(sock, level, optname, optval, optlen);

        smc = smc_sk(sock->sk);

        mutex_lock(&smc->clcsock_release_lock);
        if (!smc->clcsock)
                rc = -EBADF;
        else if (unlikely(!smc->clcsock->ops->getsockopt))
                rc = -EOPNOTSUPP;
        else
                /* socket options apply to the CLC socket */
                rc = smc->clcsock->ops->getsockopt(smc->clcsock, level,
                                                   optname, optval, optlen);
        mutex_unlock(&smc->clcsock_release_lock);

        return rc;
}

int smc_ioctl(struct socket *sock, unsigned int cmd,
              unsigned long arg)
{
        union smc_host_cursor cons, urg;
        struct smc_connection *conn;
        struct smc_sock *smc;
        int answ;

        smc = smc_sk(sock->sk);
        conn = &smc->conn;
        lock_sock(&smc->sk);
        if (smc->use_fallback) {
                if (!smc->clcsock) {
                        release_sock(&smc->sk);
                        return -EBADF;
                }
                answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
                release_sock(&smc->sk);
                return answ;
        }
        switch (cmd) {
        case SIOCINQ: /* same as FIONREAD */
                if (smc->sk.sk_state == SMC_LISTEN) {
                        release_sock(&smc->sk);
                        return -EINVAL;
                }
                if (smc->sk.sk_state == SMC_INIT ||
                    smc->sk.sk_state == SMC_CLOSED)
                        answ = 0;
                else
                        answ = atomic_read(&smc->conn.bytes_to_rcv);
                break;
        case SIOCOUTQ:
                /* output queue size (not send + not acked) */
                if (smc->sk.sk_state == SMC_LISTEN) {
                        release_sock(&smc->sk);
                        return -EINVAL;
                }
                if (smc->sk.sk_state == SMC_INIT ||
                    smc->sk.sk_state == SMC_CLOSED)
                        answ = 0;
                else
                        answ = smc->conn.sndbuf_desc->len -
                                        atomic_read(&smc->conn.sndbuf_space);
                break;
        case SIOCOUTQNSD:
                /* output queue size (not send only) */
                if (smc->sk.sk_state == SMC_LISTEN) {
                        release_sock(&smc->sk);
                        return -EINVAL;
                }
                if (smc->sk.sk_state == SMC_INIT ||
                    smc->sk.sk_state == SMC_CLOSED)
                        answ = 0;
                else
                        answ = smc_tx_prepared_sends(&smc->conn);
                break;
        case SIOCATMARK:
                if (smc->sk.sk_state == SMC_LISTEN) {
                        release_sock(&smc->sk);
                        return -EINVAL;
                }
                if (smc->sk.sk_state == SMC_INIT ||
                    smc->sk.sk_state == SMC_CLOSED) {
                        answ = 0;
                } else {
                        smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
                        smc_curs_copy(&urg, &conn->urg_curs, conn);
                        answ = smc_curs_diff(conn->rmb_desc->len,
                                             &cons, &urg) == 1;
                }
                break;
        default:
                release_sock(&smc->sk);
                return -ENOIOCTLCMD;
        }
        release_sock(&smc->sk);

        return put_user(answ, (int __user *)arg);
}

/* Splice received data into a pipe.
 *
 * The affected portions of the rmbe are mapped into an spd, the number of
 * bytes to splice is recorded in conn->splice_pending, and the transfer is
 * started.  Consumer cursor updates are delayed until the respective page has
 * been fully processed, so subsequent recv() calls have to wait until all
 * splice() processing has completed.
 *
 * Returns the number of bytes spliced, 0 when no more data can arrive,
 * -ENOTCONN for a socket that is not connected, or -ESPIPE for a non-zero
 * *@ppos on a non-fallback socket.
 */
ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
                        struct pipe_inode_info *pipe, size_t len,
                        unsigned int flags)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc = smc_sk(sk);
        int rc = -ENOTCONN;

        lock_sock(sk);
        switch (sk->sk_state) {
        case SMC_CLOSED:
                /* socket was connected before, no more data to read */
                if (sk->sk_shutdown & RCV_SHUTDOWN)
                        rc = 0;
                goto out;
        case SMC_INIT:
        case SMC_LISTEN:
                goto out;
        case SMC_PEERFINCLOSEWAIT:
                rc = 0;
                goto out;
        default:
                break;
        }

        if (smc->use_fallback) {
                rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
                                                    pipe, len, flags);
                goto out;
        }

        if (*ppos) {
                rc = -ESPIPE;
                goto out;
        }
        /* translate the splice flag into the recvmsg flag smc_rx_recvmsg takes */
        flags = (flags & SPLICE_F_NONBLOCK) ? MSG_DONTWAIT : 0;
        SMC_STAT_INC(smc, splice_cnt);
        rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
out:
        release_sock(sk);

        return rc;
}

/* Socket-level operations installed on PF_SMC sockets by __smc_create().
 * Must look like tcp.  socketpair and mmap are the generic sock_no_*()
 * stubs; every other operation is an smc_* handler from this file.
 */
static const struct proto_ops smc_sock_ops = {
        .family                = PF_SMC,
        .owner                = THIS_MODULE,
        .release        = smc_release,
        .bind                = smc_bind,
        .connect        = smc_connect,
        .socketpair        = sock_no_socketpair,
        .accept                = smc_accept,
        .getname        = smc_getname,
        .poll                = smc_poll,
        .ioctl                = smc_ioctl,
        .listen                = smc_listen,
        .shutdown        = smc_shutdown,
        .setsockopt        = smc_setsockopt,
        .getsockopt        = smc_getsockopt,
        .sendmsg        = smc_sendmsg,
        .recvmsg        = smc_recvmsg,
        .mmap                = sock_no_mmap,
        .splice_read        = smc_splice_read,
};

/* Create the kernel-internal TCP socket of @sk and store it in smc->clcsock.
 *
 * @net:    network namespace the TCP socket is created in
 * @sk:     the SMC sock that owns the new CLC socket
 * @family: address family passed to sock_create_kern()
 *
 * On failure @sk is released with sk_common_release() and the error from
 * sock_create_kern() is returned.  On success the new socket's sk is switched
 * to hold a tracked reference on @net (sk_net_refcnt = 1) and 0 is returned.
 */
int smc_create_clcsk(struct net *net, struct sock *sk, int family)
{
        struct smc_sock *smc = smc_sk(sk);
        int rc;

        rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
                              &smc->clcsock);
        if (rc) {
                sk_common_release(sk);
                return rc;
        }

        /* smc_clcsock_release() does not wait smc->clcsock->sk's
         * destruction;  its sk_state might not be TCP_CLOSE after
         * smc->sk is close()d, and TCP timers can be fired later,
         * which need net ref.
         */
        /* from here on sk refers to the CLC socket's sock, not the SMC sock */
        sk = smc->clcsock->sk;
        __netns_tracker_free(net, &sk->ns_tracker, false);
        sk->sk_net_refcnt = 1;
        get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
        sock_inuse_add(net, 1);
        return 0;
}

static int __smc_create(struct net *net, struct socket *sock, int protocol,
                        int kern, struct socket *clcsock)
{
        int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
        struct smc_sock *smc;
        struct sock *sk;
        int rc;

        rc = -ESOCKTNOSUPPORT;
        if (sock->type != SOCK_STREAM)
                goto out;

        rc = -EPROTONOSUPPORT;
        if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
                goto out;

        rc = -ENOBUFS;
        sock->ops = &smc_sock_ops;
        sock->state = SS_UNCONNECTED;
        sk = smc_sock_alloc(net, sock, protocol);
        if (!sk)
                goto out;

        /* create internal TCP socket for CLC handshake and fallback */
        smc = smc_sk(sk);

        rc = 0;
        if (clcsock)
                smc->clcsock = clcsock;
        else
                rc = smc_create_clcsk(net, sk, family);
out:
        return rc;
}

/* .create callback of the PF_SMC family: same as __smc_create() with no
 * pre-existing CLC socket, so a new internal TCP socket is created.
 */
static int smc_create(struct net *net, struct socket *sock, int protocol,
                      int kern)
{
        return __smc_create(net, sock, protocol, kern, NULL);
}

/* Protocol family descriptor for PF_SMC; passed to sock_register() in
 * smc_init().
 */
static const struct net_proto_family smc_sock_family_ops = {
        .family        = PF_SMC,
        .owner        = THIS_MODULE,
        .create        = smc_create,
};

/* ULP .init callback: replace an unconnected TCP socket by an SMC socket.
 *
 * A new SMC socket is allocated with the TCP socket as its CLC socket, then
 * the TCP socket's file is re-pointed at the SMC socket.
 *
 * Returns 0 on success, -ESOCKTNOSUPPORT if @sk is not an IPv4/IPv6 TCP
 * stream socket, -ENOTCONN if the socket is not unconnected, has no file, or
 * has fasync users, -ENFILE if no socket can be allocated, or the error from
 * __smc_create().
 */
static int smc_ulp_init(struct sock *sk)
{
        struct socket *tcp = sk->sk_socket;
        struct net *net = sock_net(sk);
        struct socket *smcsock;
        int protocol, ret;

        /* only TCP can be replaced */
        if (tcp->type != SOCK_STREAM)
                return -ESOCKTNOSUPPORT;
        if (sk->sk_protocol != IPPROTO_TCP)
                return -ESOCKTNOSUPPORT;
        if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
                return -ESOCKTNOSUPPORT;

        /* don't handle wq now */
        if (tcp->state != SS_UNCONNECTED)
                return -ENOTCONN;
        if (!tcp->file || tcp->wq.fasync_list)
                return -ENOTCONN;

        protocol = (sk->sk_family == AF_INET) ? SMCPROTO_SMC : SMCPROTO_SMC6;

        smcsock = sock_alloc();
        if (!smcsock)
                return -ENFILE;

        smcsock->type = SOCK_STREAM;
        __module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */
        ret = __smc_create(net, smcsock, protocol, 1, tcp);
        if (ret) {
                sock_release(smcsock); /* module_put() which ops won't be NULL */
                return ret;
        }

        /* replace tcp socket to smc */
        smcsock->file = tcp->file;
        smcsock->file->private_data = smcsock;
        /* replace inode when sock_close */
        smcsock->file->f_inode = SOCK_INODE(smcsock);
        /* dput() in __fput */
        smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock);
        tcp->file = NULL;

        return 0;
}

static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk,
                          const gfp_t priority)
{
        struct inet_connection_sock *icsk = inet_csk(newsk);

        /* don't inherit ulp ops to child when listen */
        icsk->icsk_ulp_ops = NULL;
}

/* TCP upper-layer-protocol descriptor named "smc"; registered with
 * tcp_register_ulp() in smc_init() and unregistered in smc_exit().
 */
static struct tcp_ulp_ops smc_ulp_ops __read_mostly = {
        .name                = "smc",
        .owner                = THIS_MODULE,
        .init                = smc_ulp_init,
        .clone                = smc_ulp_clone,
};

/* pernet id of the SMC per-namespace data; smc_net_ops.id points here */
unsigned int smc_net_id;

/* Per-namespace init: set up the SMC sysctl state, then the pnet state.
 *
 * Returns 0 on success or the error of the step that failed.  If the pnet
 * step fails, the sysctl step is undone here because no exit callback runs
 * for a namespace whose init callback returned an error.
 */
static __net_init int smc_net_init(struct net *net)
{
        int rc;

        rc = smc_sysctl_net_init(net);
        if (rc)
                return rc;

        rc = smc_pnet_net_init(net);
        if (rc)
                smc_sysctl_net_exit(net);
        return rc;
}

/* Per-namespace exit: tear down the sysctl state and the pnet state that
 * smc_net_init() set up.
 */
static void __net_exit smc_net_exit(struct net *net)
{
        smc_sysctl_net_exit(net);
        smc_pnet_net_exit(net);
}

/* Per-namespace init of the SMC statistics; returns smc_stats_init()'s result. */
static __net_init int smc_net_stat_init(struct net *net)
{
        return smc_stats_init(net);
}

/* Per-namespace exit of the SMC statistics. */
static void __net_exit smc_net_stat_exit(struct net *net)
{
        smc_stats_exit(net);
}

/* Pernet operations for the sysctl/pnet state; .id and .size request a
 * per-namespace area of sizeof(struct smc_net) identified by smc_net_id.
 */
static struct pernet_operations smc_net_ops = {
        .init = smc_net_init,
        .exit = smc_net_exit,
        .id   = &smc_net_id,
        .size = sizeof(struct smc_net),
};

/* Pernet operations for the per-namespace SMC statistics. */
static struct pernet_operations smc_net_stat_ops = {
        .init = smc_net_stat_init,
        .exit = smc_net_stat_exit,
};

/* Module init.
 *
 * Brings up the SMC subsystems in a fixed order: pernet ops, ISM, CLC,
 * netlink, pnet, the three workqueues, core, LLC, CDC, the v4/v6 protos, the
 * PF_SMC socket family, the IB client, loopback, the TCP ULP, and the inet
 * part.  On any failure, everything initialised so far is undone through the
 * out_* label chain at the bottom and the error code is returned.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init smc_init(void)
{
        int rc;

        rc = register_pernet_subsys(&smc_net_ops);
        if (rc)
                return rc;

        rc = register_pernet_subsys(&smc_net_stat_ops);
        if (rc)
                goto out_pernet_subsys;

        rc = smc_ism_init();
        if (rc)
                goto out_pernet_subsys_stat;
        smc_clc_init();

        rc = smc_nl_init();
        if (rc)
                goto out_ism;

        rc = smc_pnet_init();
        if (rc)
                goto out_nl;

        /* error code for the three workqueue allocations below */
        rc = -ENOMEM;

        smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0);
        if (!smc_tcp_ls_wq)
                goto out_pnet;

        smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
        if (!smc_hs_wq)
                goto out_alloc_tcp_ls_wq;

        smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
        if (!smc_close_wq)
                goto out_alloc_hs_wq;

        rc = smc_core_init();
        if (rc) {
                pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
                goto out_alloc_wqs;
        }

        rc = smc_llc_init();
        if (rc) {
                pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
                goto out_core;
        }

        rc = smc_cdc_init();
        if (rc) {
                pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
                goto out_core;
        }

        rc = proto_register(&smc_proto, 1);
        if (rc) {
                pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
                goto out_core;
        }

        rc = proto_register(&smc_proto6, 1);
        if (rc) {
                pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
                goto out_proto;
        }

        rc = sock_register(&smc_sock_family_ops);
        if (rc) {
                pr_err("%s: sock_register fails with %d\n", __func__, rc);
                goto out_proto6;
        }
        INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
        INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);

        rc = smc_ib_register_client();
        if (rc) {
                pr_err("%s: ib_register fails with %d\n", __func__, rc);
                goto out_sock;
        }

        rc = smc_loopback_init();
        if (rc) {
                pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc);
                goto out_ib;
        }

        rc = tcp_register_ulp(&smc_ulp_ops);
        if (rc) {
                pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc);
                goto out_lo;
        }
        rc = smc_inet_init();
        if (rc) {
                pr_err("%s: smc_inet_init fails with %d\n", __func__, rc);
                goto out_ulp;
        }
        static_branch_enable(&tcp_have_smc);
        return 0;
        /* error unwinding: each label undoes one step and falls through to
         * the labels of all earlier steps
         */
out_ulp:
        tcp_unregister_ulp(&smc_ulp_ops);
out_lo:
        smc_loopback_exit();
out_ib:
        smc_ib_unregister_client();
out_sock:
        sock_unregister(PF_SMC);
out_proto6:
        proto_unregister(&smc_proto6);
out_proto:
        proto_unregister(&smc_proto);
out_core:
        smc_core_exit();
out_alloc_wqs:
        destroy_workqueue(smc_close_wq);
out_alloc_hs_wq:
        destroy_workqueue(smc_hs_wq);
out_alloc_tcp_ls_wq:
        destroy_workqueue(smc_tcp_ls_wq);
out_pnet:
        smc_pnet_exit();
out_nl:
        smc_nl_exit();
out_ism:
        smc_clc_exit();
        smc_ism_exit();
out_pernet_subsys_stat:
        unregister_pernet_subsys(&smc_net_stat_ops);
out_pernet_subsys:
        unregister_pernet_subsys(&smc_net_ops);

        return rc;
}

/* Module exit: undo everything smc_init() set up.  The static branch is
 * disabled and the inet part, TCP ULP and socket family are unregistered
 * first; the pernet ops are unregistered last, followed by an rcu_barrier().
 */
static void __exit smc_exit(void)
{
        static_branch_disable(&tcp_have_smc);
        smc_inet_exit();
        tcp_unregister_ulp(&smc_ulp_ops);
        sock_unregister(PF_SMC);
        smc_core_exit();
        smc_loopback_exit();
        smc_ib_unregister_client();
        smc_ism_exit();
        destroy_workqueue(smc_close_wq);
        destroy_workqueue(smc_tcp_ls_wq);
        destroy_workqueue(smc_hs_wq);
        proto_unregister(&smc_proto6);
        proto_unregister(&smc_proto);
        smc_pnet_exit();
        smc_nl_exit();
        smc_clc_exit();
        unregister_pernet_subsys(&smc_net_stat_ops);
        unregister_pernet_subsys(&smc_net_ops);
        rcu_barrier();
}

/* module entry and exit points */
module_init(smc_init);
module_exit(smc_exit);

/* module metadata and the aliases under which this module can be requested */
MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);
MODULE_ALIAS_TCP_ULP("smc");
/* 256 for IPPROTO_SMC and 1 for SOCK_STREAM */
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 256, 1);
#if IS_ENABLED(CONFIG_IPV6)
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 256, 1);
#endif /* CONFIG_IPV6 */
MODULE_ALIAS_GENL_FAMILY(SMC_GENL_FAMILY_NAME);













































































































































































































































































































































































































































































































































































































































































































































    1 

    1 























































    1 

























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
/* SPDX-License-Identifier: GPL-2.0+ */
#ifndef _LINUX_MAPLE_TREE_H
#define _LINUX_MAPLE_TREE_H
/*
 * Maple Tree - An RCU-safe adaptive tree for storing ranges
 * Copyright (c) 2018-2022 Oracle
 * Authors:     Liam R. Howlett <Liam.Howlett@Oracle.com>
 *              Matthew Wilcox <willy@infradead.org>
 */

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
/* #define CONFIG_MAPLE_RCU_DISABLED */

/*
 * Allocated nodes are mutable until they have been inserted into the tree,
 * at which time they cannot change their type until they have been removed
 * from the tree and an RCU grace period has passed.
 *
 * Removed nodes have their ->parent set to point to themselves.  RCU readers
 * check ->parent before relying on the value that they loaded from the
 * slots array.  This lets us reuse the slots array for the RCU head.
 *
 * Nodes in the tree point to their parent unless bit 0 is set.
 */
/*
 * Slot counts per node layout.  The values differ between 64-bit and 32-bit
 * builds so that each layout stays within the byte size noted next to it.
 * MAPLE_ALLOC_SLOTS is the slot[] length of struct maple_alloc, which has
 * header fields in front of slot[] (presumably chosen so that struct also
 * fills 256 bytes - TODO confirm against the target ABI).
 */
#if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64)
/* 64bit sizes */
#define MAPLE_NODE_SLOTS        31        /* 256 bytes including ->parent */
#define MAPLE_RANGE64_SLOTS        16        /* 256 bytes */
#define MAPLE_ARANGE64_SLOTS        10        /* 240 bytes */
#define MAPLE_ALLOC_SLOTS        (MAPLE_NODE_SLOTS - 1)
#else
/* 32bit sizes */
#define MAPLE_NODE_SLOTS        63        /* 256 bytes including ->parent */
#define MAPLE_RANGE64_SLOTS        32        /* 256 bytes */
#define MAPLE_ARANGE64_SLOTS        21        /* 240 bytes */
#define MAPLE_ALLOC_SLOTS        (MAPLE_NODE_SLOTS - 2)
#endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */

/* the low 8 bits of a value (255 == 0xff) */
#define MAPLE_NODE_MASK                255UL

/*
 * The node->parent of the root node has bit 0 set and the rest of the pointer
 * is a pointer to the tree itself.  No more bits are available in this pointer
 * (on m68k, the data structure may only be 2-byte aligned).
 *
 * Internal non-root nodes can only have maple_range_* nodes as parents.  The
 * parent pointer is 256B aligned like all other tree nodes.  When storing a 32
 * or 64 bit values, the offset can fit into 4 bits.  The 16 bit values need an
 * extra bit to store the offset.  This extra bit comes from a reuse of the last
 * bit in the node type.  This is possible by using bit 1 to indicate if bit 2
 * is part of the type or the slot.
 *
 * Once the type is decided, the decision of an allocation range type or a range
 * type is done by examining the immutable tree flag for the MAPLE_ALLOC_RANGE
 * flag.
 *
 *  Node types:
 *   0x??1 = Root
 *   0x?00 = 16 bit nodes
 *   0x010 = 32 bit nodes
 *   0x110 = 64 bit nodes
 *
 *  Slot size and location in the parent pointer:
 *   type  : slot location
 *   0x??1 : Root
 *   0x?00 : 16 bit values, type in 0-1, slot in 2-6
 *   0x010 : 32 bit values, type in 0-2, slot in 3-6
 *   0x110 : 64 bit values, type in 0-2, slot in 3-6
 */

/*
 * This metadata is used to optimize the gap updating code and in reverse
 * searching for gaps or any other code that needs to find the end of the data.
 */
struct maple_metadata {
        unsigned char end;        /* presumably the offset of the end of the data - TODO confirm */
        unsigned char gap;        /* presumably the offset of a gap in the node - TODO confirm */
};

/*
 * Leaf nodes do not store pointers to nodes, they store user data.  Users may
 * store almost any bit pattern.  As noted above, the optimisation of storing an
 * entry at 0 in the root pointer cannot be done for data which have the bottom
 * two bits set to '10'.  We also reserve values with the bottom two bits set to
 * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use.  Some APIs
 * return errnos as a negative errno shifted right by two bits and the bottom
 * two bits set to '10', and while choosing to store these values in the array
 * is not an error, it may lead to confusion if you're testing for an error with
 * mas_is_err().
 *
 * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits
 * 3-6), bit 2 is reserved.  That leaves bits 0-1 unused for now.
 *
 * In regular B-Tree terms, pivots are called keys.  The term pivot is used to
 * indicate that the tree is specifying ranges.  Pivots may appear in the
 * subtree with an entry attached to the value whereas keys are unique to a
 * specific position of a B-tree.  Pivot values are inclusive of the slot with
 * the same index.
 */

/*
 * Range node layout: a parent pointer, MAPLE_RANGE64_SLOTS - 1 pivots and
 * MAPLE_RANGE64_SLOTS slots.  The anonymous struct in the union overlays the
 * metadata on the position of the last slot.
 */
struct maple_range_64 {
        struct maple_pnode *parent;
        unsigned long pivot[MAPLE_RANGE64_SLOTS - 1];
        union {
                void __rcu *slot[MAPLE_RANGE64_SLOTS];
                struct {
                        /* covers every slot except the last one */
                        void __rcu *pad[MAPLE_RANGE64_SLOTS - 1];
                        /* shares storage with the last slot */
                        struct maple_metadata meta;
                };
        };
};

/*
 * At tree creation time, the user can specify that they're willing to trade off
 * storing fewer entries in a tree in return for storing more information in
 * each node.
 *
 * The maple tree supports recording the largest range of NULL entries available
 * in this node, also called gaps.  This optimises the tree for allocating a
 * range.
 *
 * Unlike struct maple_range_64, the metadata here is a separate member and
 * does not overlay a slot.
 */
struct maple_arange_64 {
        struct maple_pnode *parent;
        unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1];
        void __rcu *slot[MAPLE_ARANGE64_SLOTS];
        /* one gap value per slot (presumably the largest gap below that slot) */
        unsigned long gap[MAPLE_ARANGE64_SLOTS];
        struct maple_metadata meta;
};

/*
 * Node layout that holds pointers to further struct maple_alloc objects in
 * slot[].  It is one of the views in the struct maple_node union.
 * NOTE(review): the exact meaning of the counters is not visible in this
 * header; the names suggest bookkeeping of allocated/requested nodes.
 */
struct maple_alloc {
        unsigned long total;
        unsigned char node_count;
        unsigned int request_count;
        struct maple_alloc *slot[MAPLE_ALLOC_SLOTS];
};

/*
 * Minimal node view consisting of the parent pointer followed by a link to
 * another encoded node; the link occupies the storage the other node layouts
 * use for their first pivot.
 */
struct maple_topiary {
        struct maple_pnode *parent;
        struct maple_enode *next; /* Overlaps the pivot */
};

/*
 * Node types.  Non-leaf slots store this value in bits 3-6 of the pointer to
 * the child node (see the comment on leaf and non-leaf nodes above).  The
 * maple_range_64 and maple_arange_64 enumerators share their names with the
 * node layout structs of the same name.
 */
enum maple_type {
        maple_dense,
        maple_leaf_64,
        maple_range_64,
        maple_arange_64,
};


/**
 * DOC: Maple tree flags
 *
 * * MT_FLAGS_ALLOC_RANGE        - Track gaps in this tree
 * * MT_FLAGS_USE_RCU                - Operate in RCU mode
 * * MT_FLAGS_HEIGHT_OFFSET        - The position of the tree height in the flags
 * * MT_FLAGS_HEIGHT_MASK        - The mask for the maple tree height value
 * * MT_FLAGS_LOCK_MASK                - How the mt_lock is used
 * * MT_FLAGS_LOCK_IRQ                - Acquired irq-safe
 * * MT_FLAGS_LOCK_BH                - Acquired bh-safe
 * * MT_FLAGS_LOCK_EXTERN        - mt_lock is not used
 * * MT_FLAGS_ALLOC_WRAPPED        - Not described here; presumably set when a
 *   cyclic allocation has wrapped around (TODO confirm)
 *
 * MAPLE_HEIGHT_MAX        The largest height that can be stored
 */
#define MT_FLAGS_ALLOC_RANGE        0x01
#define MT_FLAGS_USE_RCU        0x02
/* a shift count (bit position), not a flag bit: 0x7C >> 2 == MAPLE_HEIGHT_MAX */
#define MT_FLAGS_HEIGHT_OFFSET        0x02
#define MT_FLAGS_HEIGHT_MASK        0x7C
#define MT_FLAGS_LOCK_MASK        0x300
#define MT_FLAGS_LOCK_IRQ        0x100
#define MT_FLAGS_LOCK_BH        0x200
#define MT_FLAGS_LOCK_EXTERN        0x300
#define MT_FLAGS_ALLOC_WRAPPED        0x0800

#define MAPLE_HEIGHT_MAX        31


/* a 4-bit type field shifted up by 3, i.e. bits 3-6 of an encoded pointer */
#define MAPLE_NODE_TYPE_MASK        0x0F
#define MAPLE_NODE_TYPE_SHIFT        0x03

/* upper bound of the internally reserved '10'-tagged values described above */
#define MAPLE_RESERVED_RANGE        4096

/*
 * Lockdep helpers for trees that use an external lock.
 *
 * With CONFIG_LOCKDEP, ma_external_lock points at the lockdep map of the
 * external lock.  The *_is_held() checks evaluate to true when no external
 * lock has been recorded or when lockdep reports the recorded lock as held.
 * mt_on_stack() clears the recorded external lock.
 *
 * Without CONFIG_LOCKDEP, lockdep_map_p is an empty struct, the checks are
 * constant 1 and the setters expand to empty statements.
 */
#ifdef CONFIG_LOCKDEP
typedef struct lockdep_map *lockdep_map_p;
#define mt_lock_is_held(mt)                                             \
        (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock))

#define mt_write_lock_is_held(mt)                                        \
        (!(mt)->ma_external_lock ||                                        \
         lock_is_held_type((mt)->ma_external_lock, 0))

#define mt_set_external_lock(mt, lock)                                        \
        (mt)->ma_external_lock = &(lock)->dep_map

#define mt_on_stack(mt)                        (mt).ma_external_lock = NULL
#else
typedef struct { /* nothing */ } lockdep_map_p;
#define mt_lock_is_held(mt)                1
#define mt_write_lock_is_held(mt)        1
#define mt_set_external_lock(mt, lock)        do { } while (0)
#define mt_on_stack(mt)                        do { } while (0)
#endif

/*
 * If the tree contains a single entry at index 0, it is usually stored in
 * tree->ma_root.  To optimise for the page cache, an entry which ends in '00',
 * '01' or '11' is stored in the root, but an entry which ends in '10' will be
 * stored in a node.  Bits 3-6 are used to store enum maple_type.
 *
 * The flags are used both to store some immutable information about this tree
 * (set at tree creation time) and dynamic information set under the spinlock.
 *
 * Another use of flags are to indicate global states of the tree.  This is the
 * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in
 * RCU mode.  This mode was added to allow the tree to reuse nodes instead of
 * re-allocating and RCU freeing nodes when there is a single user.
 */
/* The tree head: a lock, the MT_FLAGS_* word and the root pointer. */
struct maple_tree {
        union {
                /* the tree's own spinlock, taken by mtree_lock() */
                spinlock_t        ma_lock;
                /* lockdep handle of an external lock (see mt_set_external_lock()) */
                lockdep_map_p        ma_external_lock;
        };
        unsigned int        ma_flags;        /* MT_FLAGS_* bits */
        void __rcu      *ma_root;        /* NULL for an empty tree, see mtree_empty() */
};

/**
 * MTREE_INIT() - Initialize a maple tree
 * @name: The maple tree name
 * @__flags: The maple tree flags
 *
 * Expands to a designated initializer for a struct maple_tree with an
 * unlocked ma_lock, the given flags and a NULL root.  @__flags is
 * parenthesized in the expansion, matching MTREE_INIT_EXT().
 */
#define MTREE_INIT(name, __flags) {                                        \
        .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock),                \
        .ma_flags = (__flags),                                                \
        .ma_root = NULL,                                                \
}

/**
 * MTREE_INIT_EXT() - Initialize a maple tree with an external lock.
 * @name: The tree name
 * @__flags: The maple tree flags
 * @__lock: The external lock
 *
 * With CONFIG_LOCKDEP the lockdep map of @__lock is recorded in
 * ma_external_lock.  Without it, @__lock is ignored and the macro is
 * identical to MTREE_INIT().
 */
#ifdef CONFIG_LOCKDEP
#define MTREE_INIT_EXT(name, __flags, __lock) {                                \
        .ma_external_lock = &(__lock).dep_map,                                \
        .ma_flags = (__flags),                                                \
        .ma_root = NULL,                                                \
}
#else
#define MTREE_INIT_EXT(name, __flags, __lock)        MTREE_INIT(name, __flags)
#endif

/* Define a struct maple_tree called @name, initialized with no flags set. */
#define DEFINE_MTREE(name)                                                \
        struct maple_tree name = MTREE_INIT(name, 0)

/*
 * Take or release the tree's internal spinlock (ma_lock).  @mt is a pointer
 * to the struct maple_tree; @subclass is passed through to spin_lock_nested().
 */
#define mtree_lock(mt)                spin_lock((&(mt)->ma_lock))
#define mtree_lock_nested(mt, subclass) \
                spin_lock_nested((&(mt)->ma_lock), subclass)
#define mtree_unlock(mt)        spin_unlock((&(mt)->ma_lock))

/*
 * The Maple Tree squeezes various bits in at various points which aren't
 * necessarily obvious.  Usually, this is done by observing that pointers are
 * N-byte aligned and thus the bottom log_2(N) bits are available for use.  We
 * don't use the high bits of pointers to store additional information because
 * we don't know what bits are unused on any given architecture.
 *
 * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8
 * low bits for our own purposes.  Nodes are currently of 4 types:
 * 1. Single pointer (Range is 0-0)
 * 2. Non-leaf Allocation Range nodes
 * 3. Non-leaf Range nodes
 * 4. Leaf Range nodes
 *
 * All nodes consist of a number of node slots, pivots, and a parent pointer.
 */

/*
 * The generic node: a union of every view of the same node memory.
 */
struct maple_node {
        union {
                /* plain view: parent pointer followed by the slots */
                struct {
                        struct maple_pnode *parent;
                        void __rcu *slot[MAPLE_NODE_SLOTS];
                };
                /* view that keeps ->parent (as pad) and reuses the slots
                 * array for the RCU head and the fields that follow it
                 */
                struct {
                        void *pad;
                        struct rcu_head rcu;
                        struct maple_enode *piv_parent;
                        unsigned char parent_slot;
                        enum maple_type type;
                        unsigned char slot_len;
                        unsigned int ma_flags;
                };
                struct maple_range_64 mr64;
                struct maple_arange_64 ma64;
                struct maple_alloc alloc;
        };
};

/*
 * More complicated stores can cause two nodes to become one or three and
 * potentially alter the height of the tree.  Either half of the tree may need
 * to be rebalanced against the other.  The ma_topiary struct is used to track
 * which nodes have been 'cut' from the tree so that the change can be done
 * safely at a later date.  This is done to support RCU.
 */
struct ma_topiary {
        /* presumably the first and last node of the list of cut nodes - TODO confirm */
        struct maple_enode *head;
        struct maple_enode *tail;
        struct maple_tree *mtree;        /* presumably the tree the nodes were cut from */
};

/*
 * Tree-level interface: every function below takes the struct maple_tree
 * itself rather than a struct ma_state.  The definitions live outside this
 * header section.
 */

/* Look up the entry stored at @index */
void *mtree_load(struct maple_tree *mt, unsigned long index);

/*
 * Insertion and range allocation.
 * NOTE(review): the difference between insert and store semantics is not
 * visible from these prototypes - confirm against the implementation.
 */
int mtree_insert(struct maple_tree *mt, unsigned long index,
                void *entry, gfp_t gfp);
int mtree_insert_range(struct maple_tree *mt, unsigned long first,
                unsigned long last, void *entry, gfp_t gfp);
int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long size, unsigned long min,
                unsigned long max, gfp_t gfp);
int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long range_lo, unsigned long range_hi,
                unsigned long *next, gfp_t gfp);
int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long size, unsigned long min,
                unsigned long max, gfp_t gfp);

/* Store and erase by index or by inclusive range [first, last] */
int mtree_store_range(struct maple_tree *mt, unsigned long first,
                      unsigned long last, void *entry, gfp_t gfp);
int mtree_store(struct maple_tree *mt, unsigned long index,
                void *entry, gfp_t gfp);
void *mtree_erase(struct maple_tree *mt, unsigned long index);

/*
 * Duplicate @mt into @new.
 * NOTE(review): the __ variants presumably expect the caller to handle
 * locking - confirm.
 */
int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);

/* Tear down a tree */
void mtree_destroy(struct maple_tree *mt);
void __mt_destroy(struct maple_tree *mt);

/**
 * mtree_empty() - Check whether a maple tree holds no present entries.
 * @mt: Maple Tree.
 *
 * The test is made on the root pointer only: a tree whose root is NULL
 * contains nothing but NULL pointers.
 *
 * Context: Any context.
 * Return: %true if the root of @mt is NULL, %false otherwise.
 */
static inline bool mtree_empty(const struct maple_tree *mt)
{
        return !mt->ma_root;
}

/* Advanced API */

/*
 * Maple State Status
 * ma_active means the maple state is pointing to a node and offset and can
 * continue operating on the tree.
 * ma_start means we have not searched the tree.
 * ma_root means we have searched the tree and the entry we found lives in
 * the root of the tree (ie it has index 0, length 1 and is the only entry in
 * the tree).
 * ma_none means we have searched the tree and there is no node in the
 * tree for this entry.  For example, we searched for index 1 in an empty
 * tree.  Or we have a tree which points to a full leaf node and we
 * searched for an entry which is larger than can be contained in that
 * leaf node.
 * ma_pause means the data within the maple state may be stale, restart the
 * operation
 * ma_overflow means the search has reached the upper limit of the search
 * ma_underflow means the search has reached the lower limit of the search
 * ma_error means there was an error, check the node for the error number.
 */
enum maple_status {
        ma_active,        /* Pointing at a node and offset; can continue operating */
        ma_start,        /* The tree has not been searched yet */
        ma_root,        /* The entry found lives in the root of the tree */
        ma_none,        /* Searched, but no node in the tree covers this entry */
        ma_pause,        /* State data may be stale; restart the operation */
        ma_overflow,        /* Search reached its upper limit */
        ma_underflow,        /* Search reached its lower limit */
        ma_error,        /* An error occurred; the node holds the error number */
};

/*
 * The maple state is defined in the struct ma_state and is used to keep track
 * of information during operations, and even between operations when using the
 * advanced API.
 *
 * If state->node has bit 0 set then it references a tree location which is not
 * a node (eg the root).  If bit 1 is set, the rest of the bits are a negative
 * errno.  Bit 2 (the 'unallocated slots' bit) is clear.  Bits 3-6 indicate the
 * node type.
 *
 * state->alloc either has a requested number of nodes or an allocated node.  If
 * state->alloc has a requested number of nodes, the first bit will be set (0x1)
 * and the remaining bits are the value.  If state->alloc is a node, then the
 * node will be of type maple_alloc.  maple_alloc has MAPLE_NODE_SLOTS - 1 for
 * storing more allocated nodes, a total number of nodes allocated, and the
 * node_count in this node.  node_count is the number of allocated nodes in this
 * node.  The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further
 * nodes into state->alloc->slot[0]'s node.  Nodes are taken from state->alloc
 * by removing a node from the state->alloc node until state->alloc->node_count
 * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted
 * to state->alloc.  Nodes are pushed onto state->alloc by putting the current
 * state->alloc into the pushed node's slot[0].
 *
 * The state also contains the implied min/max of the state->node, the depth of
 * this search, and the offset. The implied min/max are either from the parent
 * node or are 0-oo for the root node.  The depth is incremented or decremented
 * every time a node is walked down or up.  The offset is the slot/pivot of
 * interest in the node - either for reading or writing.
 *
 * When returning a value the maple state index and last respectively contain
 * the start and end of the range for the entry.  Ranges are inclusive in the
 * Maple Tree.
 *
 * The status of the state is used to determine how the next action should treat
 * the state.  For instance, if the status is ma_start then the next action
 * should start at the root of the tree and walk down.  If the status is
 * ma_pause then the node may be stale data and should be discarded.  If the
 * status is ma_overflow, then the last action hit the upper limit.
 *
 */
struct ma_state {
        struct maple_tree *tree;        /* The tree we're operating in */
        unsigned long index;                /* The index we're operating on - range start */
        unsigned long last;                /* The last index we're operating on - range end */
        struct maple_enode *node;        /* The node containing this entry */
        unsigned long min;                /* The minimum index of this node - implied pivot min */
        unsigned long max;                /* The maximum index of this node - implied pivot max */
        struct maple_alloc *alloc;        /* Allocated nodes for this operation */
        enum maple_status status;        /* The status of the state (active, start, none, etc) */
        unsigned char depth;                /* depth of tree descent during write */
        unsigned char offset;                /* The slot/pivot of interest in the node */
        unsigned char mas_flags;        /* NOTE(review): per-state flags; meaning not visible here - confirm */
        unsigned char end;                /* The end of the node */
};

/*
 * Write state: pairs a struct ma_state with the entry being written and
 * cached details of the node it refers to.  MA_WR_STATE() sets @mas, @entry
 * and @content; the other members are left zeroed by that initialiser.
 */
struct ma_wr_state {
        struct ma_state *mas;                /* The maple state this write operates on */
        struct maple_node *node;        /* Decoded mas->node */
        unsigned long r_min;                /* range min */
        unsigned long r_max;                /* range max */
        enum maple_type type;                /* mas->node type */
        unsigned char offset_end;        /* The offset where the write ends */
        unsigned long *pivots;                /* mas->node->pivots pointer */
        unsigned long end_piv;                /* The pivot at the offset end */
        void __rcu **slots;                /* mas->node->slots pointer */
        void *entry;                        /* The entry to write */
        void *content;                        /* The existing entry that is being overwritten */
};

/*
 * Locking helpers for the advanced API: each takes or releases the ma_lock
 * spinlock of the tree that @mas points to.  mas_lock_nested() passes
 * @subclass through to spin_lock_nested().
 */
#define mas_lock(mas)           spin_lock(&((mas)->tree->ma_lock))
#define mas_lock_nested(mas, subclass) \
                spin_lock_nested(&((mas)->tree->ma_lock), subclass)
#define mas_unlock(mas)         spin_unlock(&((mas)->tree->ma_lock))

/*
 * Special values for ma_state.node.
 * MA_ERROR represents an errno.  After dropping the lock and attempting
 * to resolve the error, the walk would have to be restarted from the
 * top of the tree as the tree may have been modified.
 *
 * The encoding shifts @err left by two bits and sets bit 1.  @err is
 * parenthesised so that the cast and shift apply to the whole argument even
 * when the caller passes a compound expression.
 */
#define MA_ERROR(err) \
                ((struct maple_enode *)(((unsigned long)(err) << 2) | 2UL))

/*
 * MA_STATE() - Declare and initialise a struct ma_state called @name.
 *
 * The state refers to tree @mt and the inclusive range [@first, @end].  It
 * begins in the ma_start status with no node, no allocations, and the
 * implied node limits 0 and ULONG_MAX.  Members that are not listed are
 * zero-initialised.
 */
#define MA_STATE(name, mt, first, end)                                        \
        struct ma_state name = {                                        \
                .tree = mt,                                                \
                .index = first,                                                \
                .last = end,                                                \
                .node = NULL,                                                \
                .status = ma_start,                                        \
                .min = 0,                                                \
                .max = ULONG_MAX,                                        \
                .alloc = NULL,                                                \
                .mas_flags = 0,                                                \
        }

/*
 * MA_WR_STATE() - Declare and initialise a struct ma_wr_state called @name.
 *
 * @ma_state is the struct ma_state pointer the write operates on and
 * @wr_entry is the entry to write.  The existing content starts out NULL;
 * members that are not listed are zero-initialised.
 */
#define MA_WR_STATE(name, ma_state, wr_entry)                                \
        struct ma_wr_state name = {                                        \
                .mas = ma_state,                                        \
                .content = NULL,                                        \
                .entry = wr_entry,                                        \
        }

/*
 * MA_TOPIARY() - Declare and initialise a struct ma_topiary called @name
 * for @tree, with NULL head and tail.
 */
#define MA_TOPIARY(name, tree)                                                \
        struct ma_topiary name = {                                        \
                .head = NULL,                                                \
                .tail = NULL,                                                \
                .mtree = tree,                                                \
        }

/*
 * Advanced API operations.  Each takes a struct ma_state; the definitions
 * live outside this header section.
 */

/* Walking, storing and erasing at the range held in @mas */
void *mas_walk(struct ma_state *mas);
void *mas_store(struct ma_state *mas, void *entry);
void *mas_erase(struct ma_state *mas);
int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp);
void mas_store_prealloc(struct ma_state *mas, void *entry);

/*
 * Searches.  Forward searches are limited by @max; the reverse (_rev)
 * searches take their limit as @min, named consistently across both
 * reverse variants.
 */
void *mas_find(struct ma_state *mas, unsigned long max);
void *mas_find_range(struct ma_state *mas, unsigned long max);
void *mas_find_rev(struct ma_state *mas, unsigned long min);
void *mas_find_range_rev(struct ma_state *mas, unsigned long min);

/* Preallocation and cyclic allocation */
int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp);
int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp,
                void *entry, unsigned long range_lo, unsigned long range_hi,
                unsigned long *next, gfp_t gfp);

/* State management and subsystem initialisation */
bool mas_nomem(struct ma_state *mas, gfp_t gfp);
void mas_pause(struct ma_state *mas);
void maple_tree_init(void);
void mas_destroy(struct ma_state *mas);
int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries);

/*
 * Stepping to the adjacent entry or range.  The backwards steps take their
 * limit as @min, the forward steps as @max.
 */
void *mas_prev(struct ma_state *mas, unsigned long min);
void *mas_prev_range(struct ma_state *mas, unsigned long min);
void *mas_next(struct ma_state *mas, unsigned long max);
void *mas_next_range(struct ma_state *mas, unsigned long max);

/*
 * Find an empty area of @size within [@min, @max].
 * NOTE(review): presumably searches from the lowest address to the highest,
 * as the counterpart of mas_empty_area_rev() - confirm.
 */
int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max,
                   unsigned long size);
/*
 * This finds an empty area from the highest address to the lowest,
 * AKA the "Topdown" version.
 */
int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
                       unsigned long max, unsigned long size);

/*
 * mas_init() - Initialise a maple state at run time.
 * @mas: Maple state to initialise.
 * @tree: The tree the state will operate on.
 * @addr: Index used for both the start and the end of the range.
 *
 * The whole state is zeroed first, then set up as a not-yet-searched
 * (ma_start) state for the single index @addr with an upper limit of
 * ULONG_MAX.
 */
static inline void mas_init(struct ma_state *mas, struct maple_tree *tree,
                            unsigned long addr)
{
        memset(mas, 0, sizeof(*mas));

        mas->status = ma_start;
        mas->node = NULL;
        mas->tree = tree;
        mas->max = ULONG_MAX;
        mas->last = addr;
        mas->index = addr;
}

static inline bool mas_is_active(struct ma_state *mas)
{
        return mas->status == ma_active;
}

/* Return true when @mas is in the ma_error status. */
static inline bool mas_is_err(struct ma_state *mas)
{
        const bool failed = (mas->status == ma_error);

        return failed;
}

/**
 * mas_reset() - Reset a Maple Tree operation state.
 * @mas: Maple Tree operation state.
 *
 * Drops the cached node and puts @mas back into the ma_start status, which
 * clears any error or walk state so that future walks of the tree start
 * from the root.  Use this if you have dropped the lock and want to reuse
 * the ma_state.
 *
 * Context: Any context.
 */
static __always_inline void mas_reset(struct ma_state *mas)
{
        mas->node = NULL;
        mas->status = ma_start;
}

/**
 * mas_for_each() - Iterate over a range of the maple tree.
 * @__mas: Maple Tree operation state (struct ma_state pointer)
 * @__entry: Entry retrieved from the tree
 * @__max: maximum index to retrieve from the tree
 *
 * Expands to a while loop that calls mas_find() on each pass and stops when
 * it returns NULL.  On each iteration, mas->index and mas->last hold the
 * entire range for the entry.
 *
 * Note: may return the zero entry.
 */
#define mas_for_each(__mas, __entry, __max) \
        while (((__entry) = mas_find((__mas), (__max))) != NULL)

#ifdef CONFIG_DEBUG_MAPLE_TREE
/*
 * Output format selector passed to mt_dump().
 * NOTE(review): presumably decimal vs hexadecimal number formatting, going
 * by the enumerator names - confirm.
 */
enum mt_dump_format {
        mt_dump_dec,
        mt_dump_hex,
};

/*
 * Counters maintained by the debug check macros below: every check
 * increments maple_tree_tests_run, and every check whose condition is false
 * increments maple_tree_tests_passed.
 */
extern atomic_t maple_tree_tests_run;
extern atomic_t maple_tree_tests_passed;

/* Debug helpers used by the check macros below; defined outside this header */
void mt_dump(const struct maple_tree *mt, enum mt_dump_format format);
void mas_dump(const struct ma_state *mas);
void mas_wr_dump(const struct ma_wr_state *wr_mas);
void mt_validate(struct maple_tree *mt);
void mt_cache_shrink(void);
/*
 * MT_BUG_ON() - Debug-build check on a tree.
 *
 * Counts the check as run.  When @__x is true it logs the location and the
 * value of @__x, dumps @__tree in hex format, logs the pass/run counters and
 * dumps the stack; otherwise it counts the check as passed.  Nothing in this
 * expansion stops execution.
 *
 * Note: @__x is expanded a second time when the check fires, so it must not
 * have side effects.
 */
#define MT_BUG_ON(__tree, __x) do {                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__x) {                                                        \
                pr_info("BUG at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mt_dump(__tree, mt_dump_hex);                                \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
} while (0)

/*
 * MAS_BUG_ON() - Debug-build check on a maple state.
 *
 * Same as MT_BUG_ON(), but on failure it also dumps the maple state @__mas
 * before dumping the tree that the state points to.  Nothing in this
 * expansion stops execution.
 *
 * Note: @__x is expanded a second time when the check fires, so it must not
 * have side effects.
 */
#define MAS_BUG_ON(__mas, __x) do {                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__x) {                                                        \
                pr_info("BUG at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mas_dump(__mas);                                        \
                mt_dump((__mas)->tree, mt_dump_hex);                        \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
} while (0)

/*
 * MAS_WR_BUG_ON() - Debug-build check on a maple write state.
 *
 * Same as MAS_BUG_ON(), but on failure it first dumps the write state
 * @__wrmas, then the maple state it wraps, then the tree.  Nothing in this
 * expansion stops execution.
 *
 * Note: @__x is expanded a second time when the check fires, so it must not
 * have side effects.
 */
#define MAS_WR_BUG_ON(__wrmas, __x) do {                                \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__x) {                                                        \
                pr_info("BUG at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mas_wr_dump(__wrmas);                                        \
                mas_dump((__wrmas)->mas);                                \
                mt_dump((__wrmas)->mas->tree, mt_dump_hex);                \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
} while (0)

/*
 * MT_WARN_ON() - Debug-build warning check on a tree.
 *
 * Counts the check as run.  When @__x is true it logs the location and the
 * value of @__x, dumps @__tree in hex format, logs the pass/run counters and
 * dumps the stack; otherwise it counts the check as passed.  The statement
 * expression evaluates to the truth value of @__x.
 *
 * The local is deliberately not called "ret": a caller writing
 * MT_WARN_ON(mt, ret != 0) would otherwise have its own "ret" shadowed by
 * the macro's variable inside its initialiser.
 *
 * Note: @__x is expanded a second time when the warning fires, so it must
 * not have side effects.
 */
#define MT_WARN_ON(__tree, __x)  ({                                        \
        int __mt_warn_ret = !!(__x);                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__mt_warn_ret) {                                                \
                pr_info("WARN at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mt_dump(__tree, mt_dump_hex);                                \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
        unlikely(__mt_warn_ret);                                        \
})

/*
 * MAS_WARN_ON() - Debug-build warning check on a maple state.
 *
 * Counts the check as run.  When @__x is true it logs the location and the
 * value of @__x, dumps the maple state @__mas and the tree it points to,
 * logs the pass/run counters and dumps the stack; otherwise it counts the
 * check as passed.  The statement expression evaluates to the truth value
 * of @__x.
 *
 * The local is deliberately not called "ret": a caller writing
 * MAS_WARN_ON(mas, ret != 0) would otherwise have its own "ret" shadowed by
 * the macro's variable inside its initialiser.
 *
 * Note: @__x is expanded a second time when the warning fires, so it must
 * not have side effects.
 */
#define MAS_WARN_ON(__mas, __x) ({                                        \
        int __mas_warn_ret = !!(__x);                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__mas_warn_ret) {                                                \
                pr_info("WARN at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mas_dump(__mas);                                        \
                mt_dump((__mas)->tree, mt_dump_hex);                        \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
        unlikely(__mas_warn_ret);                                        \
})

/*
 * MAS_WR_WARN_ON() - Debug-build warning check on a maple write state.
 *
 * Counts the check as run.  When @__x is true it logs the location and the
 * value of @__x, dumps the write state @__wrmas, the maple state it wraps
 * and the tree, logs the pass/run counters and dumps the stack; otherwise
 * it counts the check as passed.  The statement expression evaluates to the
 * truth value of @__x.
 *
 * The local is deliberately not called "ret": a caller writing
 * MAS_WR_WARN_ON(wr_mas, ret != 0) would otherwise have its own "ret"
 * shadowed by the macro's variable inside its initialiser.
 *
 * Note: @__x is expanded a second time when the warning fires, so it must
 * not have side effects.
 */
#define MAS_WR_WARN_ON(__wrmas, __x) ({                                        \
        int __mas_wr_warn_ret = !!(__x);                                \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__mas_wr_warn_ret) {                                        \
                pr_info("WARN at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mas_wr_dump(__wrmas);                                        \
                mas_dump((__wrmas)->mas);                                \
                mt_dump((__wrmas)->mas->tree, mt_dump_hex);                \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
        unlikely(__mas_wr_warn_ret);                                        \
})
#else
/*
 * Without CONFIG_DEBUG_MAPLE_TREE the checks map straight onto BUG_ON() and
 * WARN_ON().  The first argument (tree, maple state or write state) is
 * discarded and never evaluated.
 */
#define MT_BUG_ON(__tree, __x)                BUG_ON(__x)
#define MAS_BUG_ON(__mas, __x)                BUG_ON(__x)
#define MAS_WR_BUG_ON(__mas, __x)        BUG_ON(__x)
#define MT_WARN_ON(__tree, __x)                WARN_ON(__x)
#define MAS_WARN_ON(__mas, __x)                WARN_ON(__x)
#define MAS_WR_WARN_ON(__mas, __x)        WARN_ON(__x)
#endif /* CONFIG_DEBUG_MAPLE_TREE */

/**
 * __mas_set_range() - Narrow the Maple Tree operation state to a sub-range
 * of the current location.
 * @mas: Maple Tree operation state.
 * @start: New start of range in the Maple Tree.
 * @last: New end of range in the Maple Tree.
 *
 * Only the index and last values of the state are updated; the state is not
 * reset.  Please use mas_set_range() if you do not know where you are in
 * the tree.
 */
static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
                unsigned long last)
{
        /* An active state must already cover @start: index <= start <= last */
        MAS_WARN_ON(mas, mas_is_active(mas) &&
                   !(mas->index <= start && start <= mas->last));
        mas->last = last;
        mas->index = start;
}

/**
 * mas_set_range() - Point the Maple Tree operation state at a new range.
 * @mas: Maple Tree operation state.
 * @start: New start of range in the Maple Tree.
 * @last: New end of range in the Maple Tree.
 *
 * The state is reset before the range is updated, so the next walk starts
 * from the top of the tree; see mas_next() to move to an adjacent index
 * instead.
 */
static inline void mas_set_range(struct ma_state *mas, unsigned long start,
                                 unsigned long last)
{
        /* Reset first: the state is then no longer active for the range check */
        mas_reset(mas);
        __mas_set_range(mas, start, last);
}

/**
 * mas_set() - Point the Maple Tree operation state at a new index.
 * @mas: Maple Tree operation state.
 * @index: New index into the Maple Tree.
 *
 * Equivalent to mas_set_range() with a range of exactly one index: the
 * state is reset, so the next walk starts from the top of the tree; see
 * mas_next() to move to an adjacent index instead.
 */
static inline void mas_set(struct ma_state *mas, unsigned long index)
{
        /* A single index is the range [index, index] */
        mas_set_range(mas, index, index);
}

/*
 * Return true when the lock-type bits of @mt's flags select
 * MT_FLAGS_LOCK_EXTERN.
 */
static inline bool mt_external_lock(const struct maple_tree *mt)
{
        const bool external =
                (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN;

        return external;
}

/**
 * mt_init_flags() - Initialise an empty maple tree with flags.
 * @mt: Maple Tree
 * @flags: maple tree flags.
 *
 * Use this when the tree needs special flags (eg, an allocation tree).
 * The tree's own spinlock is initialised unless @flags select an external
 * lock, and the root is set to NULL.
 *
 * Context: Any context.
 */
static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags)
{
        bool internal_lock;

        /* The flags must be stored before the lock type can be queried */
        mt->ma_flags = flags;

        internal_lock = !mt_external_lock(mt);
        if (internal_lock)
                spin_lock_init(&mt->ma_lock);

        rcu_assign_pointer(mt->ma_root, NULL);
}

/**
 * mt_init() - Initialise an empty maple tree.
 * @mt: Maple Tree
 *
 * Shorthand for mt_init_flags() with no flags set.
 *
 * Context: Any context.
 */
static inline void mt_init(struct maple_tree *mt)
{
        /* No special flags */
        mt_init_flags(mt, 0);
}

/*
 * Return true when @mt has MT_FLAGS_USE_RCU set.  Always false when
 * CONFIG_MAPLE_RCU_DISABLED is defined.
 */
static inline bool mt_in_rcu(struct maple_tree *mt)
{
#ifdef CONFIG_MAPLE_RCU_DISABLED
        return false;
#else
        return mt->ma_flags & MT_FLAGS_USE_RCU;
#endif
}

/**
 * mt_clear_in_rcu() - Switch the tree to non-RCU mode.
 * @mt: The Maple Tree
 *
 * Does nothing if the tree is not in RCU mode.  With an external lock a
 * warning is raised if the lock is not held; otherwise the tree's own lock
 * is taken around the flag update.
 */
static inline void mt_clear_in_rcu(struct maple_tree *mt)
{
        if (!mt_in_rcu(mt))
                return;

        /* External lock: only check that it is held, do not take it */
        if (mt_external_lock(mt)) {
                WARN_ON(!mt_lock_is_held(mt));
                mt->ma_flags &= ~MT_FLAGS_USE_RCU;
                return;
        }

        mtree_lock(mt);
        mt->ma_flags &= ~MT_FLAGS_USE_RCU;
        mtree_unlock(mt);
}

/**
 * mt_set_in_rcu() - Switch the tree to RCU safe mode.
 * @mt: The Maple Tree
 *
 * Does nothing if the tree is already in RCU mode.  With an external lock a
 * warning is raised if the lock is not held; otherwise the tree's own lock
 * is taken around the flag update.
 */
static inline void mt_set_in_rcu(struct maple_tree *mt)
{
        if (mt_in_rcu(mt))
                return;

        /* External lock: only check that it is held, do not take it */
        if (mt_external_lock(mt)) {
                WARN_ON(!mt_lock_is_held(mt));
                mt->ma_flags |= MT_FLAGS_USE_RCU;
                return;
        }

        mtree_lock(mt);
        mt->ma_flags |= MT_FLAGS_USE_RCU;
        mtree_unlock(mt);
}

/*
 * Return the value of the height field of @mt's flags: the bits selected by
 * MT_FLAGS_HEIGHT_MASK, shifted down by MT_FLAGS_HEIGHT_OFFSET.
 */
static inline unsigned int mt_height(const struct maple_tree *mt)
{
        const unsigned int height =
                (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET;

        return height;
}

/*
 * Tree-level search helpers; defined outside this header.  mt_find() and
 * mt_find_after() take @index by pointer and are the two calls that
 * mt_for_each() is built from.
 */
void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max);
void *mt_find_after(struct maple_tree *mt, unsigned long *index,
                    unsigned long max);
/* Step to the entry before or after @index, limited by @min or @max */
void *mt_prev(struct maple_tree *mt, unsigned long index,  unsigned long min);
void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max);

/**
 * mt_for_each - Iterate over each entry starting at index until max.
 * @__tree: The Maple Tree
 * @__entry: The current entry
 * @__index: The index to start the search from. Subsequently used as iterator.
 * @__max: The maximum limit for @__index
 *
 * Expands to a for loop that starts with mt_find() and advances with
 * mt_find_after(); @__index is passed by address to both.  The loop ends as
 * soon as @__entry is NULL.
 *
 * This iterator skips all entries which resolve to a NULL pointer,
 * e.g. entries which have been reserved with XA_ZERO_ENTRY.
 */
#define mt_for_each(__tree, __entry, __index, __max) \
        for (__entry = mt_find(__tree, &(__index), __max); \
                __entry; __entry = mt_find_after(__tree, &(__index), __max))

#endif /*_LINUX_MAPLE_TREE_H */

















































































































































































    1 



    1 






























































































































    7 













    7 





    7 








    7 

















    3 





























    3 





    1 




















    7 

    7 
















    1 

    2 






















    1 

    1 

    1 



















    8 

    7 
















    1 

    2 





















    1 

    1 

    1 






































    1 

    1 
































































































































































































































































































































































































































































































































































































































































































































































    1 

    1 








































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
#include <linux/highuid.h>
#include <linux/cred.h>
#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/projid.h>
#include <linux/fs_struct.h>
#include <linux/bsearch.h>
#include <linux/sort.h>

/* Slab cache that create_user_ns() and free_user_ns() use for struct user_namespace. */
static struct kmem_cache *user_ns_cachep __ro_after_init;
/*
 * Serializes writes to the id maps (see map_write()); create_user_ns() also
 * holds it while copying the parent's flags into a new namespace.
 */
static DEFINE_MUTEX(userns_state_mutex);

/* Forward declarations for helpers that are used before they are defined. */
static bool new_idmap_permitted(const struct file *file,
                                struct user_namespace *ns, int cap_setid,
                                struct uid_gid_map *map);
static void free_user_ns(struct work_struct *work);

/*
 * Bump the UCOUNT_USER_NAMESPACES counter for (@ns, @uid) through
 * inc_ucount() and hand back whatever it returns.  create_user_ns() treats
 * a NULL result as failure.
 */
static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid)
{
        struct ucounts *counted = inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES);

        return counted;
}

/*
 * Counterpart of inc_user_namespaces(): give back one
 * UCOUNT_USER_NAMESPACES count on @ucounts through dec_ucount().
 */
static void dec_user_namespaces(struct ucounts *ucounts)
{
        /*
         * Plain call, no "return <expr>;": a void function must not return
         * an expression in ISO C, even one of type void.
         */
        dec_ucount(ucounts, UCOUNT_USER_NAMESPACES);
}

/*
 * Reset the capability state of @cred to that of a namespace root and bind
 * the credentials to @user_ns.
 */
static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
{
        /*
         * The sets below match what init starts with, but because they are
         * bound to the new user namespace they are useless for doing
         * anything outside of it.
         */
        cred->cap_bset = CAP_FULL_SET;
        cred->cap_permitted = CAP_FULL_SET;
        cred->cap_effective = CAP_FULL_SET;
        cred->cap_inheritable = CAP_EMPTY_SET;
        cred->cap_ambient = CAP_EMPTY_SET;
        cred->securebits = SECUREBITS_DEFAULT;
#ifdef CONFIG_KEYS
        /* Drop the reference on any request_key auth key, then forget it. */
        key_put(cred->request_key_auth);
        cred->request_key_auth = NULL;
#endif
        /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
        cred->user_ns = user_ns;
}

/*
 * Return the RLIMIT_NPROC value that currently applies to the caller, or
 * RLIM_INFINITY when the limit is not enforced for it.
 */
static unsigned long enforced_nproc_rlimit(void)
{
        /* Not enforced for the global root uid inside the initial user ns. */
        if (uid_eq(current_uid(), GLOBAL_ROOT_UID) &&
            current_user_ns() == &init_user_ns)
                return RLIM_INFINITY;

        return rlimit(RLIMIT_NPROC);
}

/*
 * Create a new user namespace, deriving the creator from the user in the
 * passed credentials, and replacing that user with the new root user for the
 * new namespace.
 *
 * This is called by copy_creds(), which will finish setting the target task's
 * credentials.
 *
 * Returns 0 on success, after @new has been rebound to the new namespace by
 * set_cred_user_ns().  On failure a negative errno is returned before @new
 * is rebound:
 *   -ENOSPC  the parent's nesting level is already above 32, or
 *            inc_user_namespaces() returned NULL;
 *   -EPERM   the caller is chrooted, or the creator's euid/egid has no
 *            mapping in the parent namespace;
 *   -ENOMEM  the namespace allocation or setup_userns_sysctls() failed;
 *   otherwise the error reported by security_create_user_ns() or
 *   ns_alloc_inum().
 */
int create_user_ns(struct cred *new)
{
        struct user_namespace *ns, *parent_ns = new->user_ns;
        kuid_t owner = new->euid;
        kgid_t group = new->egid;
        struct ucounts *ucounts;
        int ret, i;

        /* Refuse to nest user namespaces any deeper than this. */
        ret = -ENOSPC;
        if (parent_ns->level > 32)
                goto fail;

        /*
         * Take a UCOUNT_USER_NAMESPACES count for the creator in the parent.
         * It is given back on the fail_dec path below or, once the namespace
         * exists, by free_user_ns() through ns->ucounts.
         */
        ucounts = inc_user_namespaces(parent_ns, owner);
        if (!ucounts)
                goto fail;

        /*
         * Verify that we can not violate the policy of which files
         * may be accessed that is specified by the root directory,
         * by verifying that the root directory is at the root of the
         * mount namespace which allows all files to be accessed.
         */
        ret = -EPERM;
        if (current_chrooted())
                goto fail_dec;

        /* The creator needs a mapping in the parent user namespace
         * or else we won't be able to reasonably tell userspace who
         * created a user_namespace.
         */
        ret = -EPERM;
        if (!kuid_has_mapping(parent_ns, owner) ||
            !kgid_has_mapping(parent_ns, group))
                goto fail_dec;

        /* Give the security hook a chance to reject the creation. */
        ret = security_create_user_ns(new);
        if (ret < 0)
                goto fail_dec;

        /* Zeroed allocation: fields not set below start out as 0/NULL. */
        ret = -ENOMEM;
        ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
        if (!ns)
                goto fail_dec;

        /*
         * Record whether the creating credentials had CAP_SETFCAP in their
         * effective set; verify_root_map() consults this later.
         */
        ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
        ret = ns_alloc_inum(&ns->ns);
        if (ret)
                goto fail_free;
        ns->ns.ops = &userns_operations;

        refcount_set(&ns->ns.count, 1);
        /* Leave the new->user_ns reference with the new user namespace. */
        ns->parent = parent_ns;
        ns->level = parent_ns->level + 1;
        ns->owner = owner;
        ns->group = group;
        /* __put_user_ns() schedules this work item to run free_user_ns(). */
        INIT_WORK(&ns->work, free_user_ns);
        /* Every ucount maximum starts out at INT_MAX ... */
        for (i = 0; i < UCOUNT_COUNTS; i++) {
                ns->ucount_max[i] = INT_MAX;
        }
        /* ... and the rlimit-backed ones are then seeded from rlimit(). */
        set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit());
        set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
        set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
        set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
        ns->ucounts = ucounts;

        /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
        mutex_lock(&userns_state_mutex);
        ns->flags = parent_ns->flags;
        mutex_unlock(&userns_state_mutex);

#ifdef CONFIG_KEYS
        INIT_LIST_HEAD(&ns->keyring_name_list);
        init_rwsem(&ns->keyring_sem);
#endif
        ret = -ENOMEM;
        if (!setup_userns_sysctls(ns))
                goto fail_keyring;

        /* Point of no return: hand the finished namespace to the creds. */
        set_cred_user_ns(new, ns);
        return 0;
        /* Error unwinding: each label undoes one more step, newest first. */
fail_keyring:
#ifdef CONFIG_PERSISTENT_KEYRINGS
        key_put(ns->persistent_keyring_register);
#endif
        ns_free_inum(&ns->ns);
fail_free:
        kmem_cache_free(user_ns_cachep, ns);
fail_dec:
        dec_user_namespaces(ucounts);
fail:
        return ret;
}

/*
 * unshare() helper for CLONE_NEWUSER.
 *
 * Does nothing and returns 0 when CLONE_NEWUSER is not set in
 * @unshare_flags.  Otherwise prepares a fresh set of credentials, creates a
 * user namespace for them and stores them in *@new_cred.  Returns -ENOMEM
 * if prepare_creds() fails, or the error from create_user_ns(); in both
 * cases *@new_cred is not written.
 */
int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
{
        struct cred *cred;
        int err;

        if (!(unshare_flags & CLONE_NEWUSER))
                return 0;

        cred = prepare_creds();
        if (!cred)
                return -ENOMEM;

        err = create_user_ns(cred);
        if (err) {
                /* The creds never became visible; just drop them. */
                put_cred(cred);
                return err;
        }

        *new_cred = cred;
        return 0;
}

/*
 * Work handler installed by create_user_ns(): tear down the namespace that
 * embeds @work, then keep walking up the parent chain for as long as
 * dropping the reference on the parent brings its count to zero.
 */
static void free_user_ns(struct work_struct *work)
{
        struct user_namespace *ns =
                container_of(work, struct user_namespace, work);

        for (;;) {
                /* Read these first; @ns is freed before they are used. */
                struct ucounts *ucounts = ns->ucounts;
                struct user_namespace *parent = ns->parent;

                /* forward/reverse only exist beyond the base extent count. */
                if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                        kfree(ns->gid_map.forward);
                        kfree(ns->gid_map.reverse);
                }
                if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                        kfree(ns->uid_map.forward);
                        kfree(ns->uid_map.reverse);
                }
                if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                        kfree(ns->projid_map.forward);
                        kfree(ns->projid_map.reverse);
                }
#if IS_ENABLED(CONFIG_BINFMT_MISC)
                kfree(ns->binfmt_misc);
#endif
                retire_userns_sysctls(ns);
                key_free_user_ns(ns);
                ns_free_inum(&ns->ns);
                kmem_cache_free(user_ns_cachep, ns);
                dec_user_namespaces(ucounts);

                /* Stop as soon as the parent still has other references. */
                if (!refcount_dec_and_test(&parent->ns.count))
                        break;
                ns = parent;
        }
}

/*
 * Queue @ns->work with schedule_work().  create_user_ns() initialises that
 * work item to run free_user_ns(), so the actual teardown happens from the
 * workqueue rather than in the caller's context.
 */
void __put_user_ns(struct user_namespace *ns)
{
        schedule_work(&ns->work);
}
EXPORT_SYMBOL(__put_user_ns);

/*
 * struct idmap_key - holds the information necessary to find an idmapping in a
 * sorted idmap array. It is passed to cmp_map_id() as first argument.
 *
 * cmp_map_id() matches the range [id, id + count - 1] against an extent's
 * lower_first when @map_up is true and against its first otherwise.
 */
struct idmap_key {
        bool map_up; /* true  -> id from kid; false -> kid from id */
        u32 id; /* id to find */
        u32 count; /* length of the range to find; map_id_up_max() passes 1 */
};

/*
 * cmp_map_id - bsearch() comparison callback for id map lookups.
 * @k: the struct idmap_key describing the id range being looked up
 * @e: the struct uid_gid_extent currently being probed
 *
 * Returns 0 when the whole key range lies inside the extent, -1 when either
 * end of the key range lies below the extent, and 1 otherwise.
 */
static int cmp_map_id(const void *k, const void *e)
{
        const struct idmap_key *key = k;
        const struct uid_gid_extent *el = e;
        u32 lo, hi, key_last;

        /* Upward lookups compare lower ids, downward lookups upper ids. */
        lo = key->map_up ? el->lower_first : el->first;
        hi = lo + el->count - 1;
        key_last = key->id + key->count - 1;

        if (key->id < lo || key_last < lo)
                return -1;

        if (key->id > hi || key_last > hi)
                return 1;

        return 0;
}

/*
 * map_id_range_down_max - Binary-search the forward array for the extent
 * that contains the whole range [@id, @id + @count - 1].
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Returns the matching extent, or NULL when bsearch() finds none.
 */
static struct uid_gid_extent *
map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
{
        struct idmap_key key = {
                .map_up = false,
                .id = id,
                .count = count,
        };

        return bsearch(&key, map->forward, extents,
                       sizeof(struct uid_gid_extent), cmp_map_id);
}

/*
 * map_id_range_down_base - Linearly scan the static extent array for the
 * extent that contains the whole range [@id, @id + @count - 1].
 * Can only be called if number of mappings is equal or less than
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Returns the first matching extent, or NULL if there is none.
 */
static struct uid_gid_extent *
map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
{
        const u32 id_last = id + count - 1;
        unsigned idx;

        for (idx = 0; idx < extents; idx++) {
                struct uid_gid_extent *cur = &map->extent[idx];
                u32 lo = cur->first;
                u32 hi = lo + cur->count - 1;

                /* Both ends of the requested range must fall in [lo, hi]. */
                if (id < lo || id > hi)
                        continue;
                if (id_last < lo || id_last > hi)
                        continue;

                return cur;
        }
        return NULL;
}

/*
 * Translate @id through @map from the upper (in-namespace) id space to the
 * lower one, requiring that the whole range of @count ids starting at @id
 * sits inside a single extent.  Returns the lower id that corresponds to
 * @id, or (u32) -1 when no extent covers the range.
 */
static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
{
        struct uid_gid_extent *hit;
        unsigned nr = map->nr_extents;

        /* Read the extent count before the extents themselves. */
        smp_rmb();

        hit = (nr <= UID_GID_MAP_MAX_BASE_EXTENTS) ?
                map_id_range_down_base(nr, map, id, count) :
                map_id_range_down_max(nr, map, id, count);
        if (!hit)
                return (u32) -1;

        /* Same offset into the extent, rebased onto the lower range. */
        return (id - hit->first) + hit->lower_first;
}

/*
 * Single-id form of map_id_range_down(): translate @id through @map with a
 * range length of 1.  Returns (u32) -1 when @id is not covered by any
 * extent.
 */
u32 map_id_down(struct uid_gid_map *map, u32 id)
{
        return map_id_range_down(map, id, 1);
}

/*
 * map_id_up_base - Linearly scan the static extent array for the extent
 * whose lower range contains @id.
 * Can only be called if number of mappings is equal or less than
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Returns the first matching extent, or NULL if there is none.
 */
static struct uid_gid_extent *
map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
{
        unsigned idx;

        for (idx = 0; idx < extents; idx++) {
                struct uid_gid_extent *cur = &map->extent[idx];
                u32 lo = cur->lower_first;
                u32 hi = lo + cur->count - 1;

                if (id < lo || id > hi)
                        continue;

                return cur;
        }
        return NULL;
}

/*
 * map_id_up_max - Binary-search the reverse array for the extent whose
 * lower range contains @id.
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Returns the matching extent, or NULL when bsearch() finds none.
 */
static struct uid_gid_extent *
map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
{
        struct idmap_key key = {
                .map_up = true,
                .id = id,
                .count = 1,
        };

        return bsearch(&key, map->reverse, extents,
                       sizeof(struct uid_gid_extent), cmp_map_id);
}

/*
 * Translate @id through @map from the lower id space to the upper
 * (in-namespace) one.  Returns the upper id, or (u32) -1 when no extent's
 * lower range contains @id.
 */
u32 map_id_up(struct uid_gid_map *map, u32 id)
{
        struct uid_gid_extent *hit;
        unsigned nr = map->nr_extents;

        /* Read the extent count before the extents themselves. */
        smp_rmb();

        hit = (nr <= UID_GID_MAP_MAX_BASE_EXTENTS) ?
                map_id_up_base(nr, map, id) :
                map_id_up_max(nr, map, id);
        if (!hit)
                return (u32) -1;

        /* Same offset into the extent, rebased onto the upper range. */
        return (id - hit->lower_first) + hit->first;
}

/**
 *        make_kuid - Map a user-namespace uid pair into a kuid.
 *        @ns:  User namespace that the uid is in
 *        @uid: User identifier
 *
 *        Translates @uid, as seen from inside @ns, through the uid map
 *        of @ns into a kernel internal kuid and returns that kuid.
 *
 *        When @ns has no mapping for @uid, INVALID_UID is returned.
 *        Callers are expected to test for and handle that case;
 *        uid_valid() can be used for the test.
 */
kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
{
        u32 kernel_id = map_id_down(&ns->uid_map, uid);

        return KUIDT_INIT(kernel_id);
}
EXPORT_SYMBOL(make_kuid);

/**
 *        from_kuid - Create a uid from a kuid user-namespace pair.
 *        @targ: The user namespace we want a uid in.
 *        @kuid: The kernel internal uid to start with.
 *
 *        Translates @kuid through the uid map of @targ and returns the
 *        uid it has inside that namespace.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        If @kuid has no mapping in @targ (uid_t)-1 is returned.
 */
uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
{
        uid_t kernel_id = __kuid_val(kuid);

        return map_id_up(&targ->uid_map, kernel_id);
}
EXPORT_SYMBOL(from_kuid);

/**
 *        from_kuid_munged - Create a uid from a kuid user-namespace pair.
 *        @targ: The user namespace we want a uid in.
 *        @kuid: The kernel internal uid to start with.
 *
 *        Translates @kuid through the uid map of @targ and returns the
 *        uid it has inside that namespace.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        Unlike from_kuid, from_kuid_munged never fails and always
 *        returns a valid uid.  That makes it appropriate for syscalls
 *        like stat and getuid, where neither failing the system call
 *        nor failing to provide a valid uid is an option.
 *
 *        If @kuid has no mapping in @targ overflowuid is returned.
 */
uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
{
        uid_t uid = from_kuid(targ, kuid);

        if (uid != (uid_t) -1)
                return uid;

        return overflowuid;
}
EXPORT_SYMBOL(from_kuid_munged);

/**
 *        make_kgid - Map a user-namespace gid pair into a kgid.
 *        @ns:  User namespace that the gid is in
 *        @gid: group identifier
 *
 *        Translates @gid, as seen from inside @ns, through the gid map
 *        of @ns into a kernel internal kgid and returns that kgid.
 *
 *        When @ns has no mapping for @gid, INVALID_GID is returned.
 *        Callers are expected to test for and handle that case;
 *        gid_valid() can be used for the test.
 */
kgid_t make_kgid(struct user_namespace *ns, gid_t gid)
{
        u32 kernel_id = map_id_down(&ns->gid_map, gid);

        return KGIDT_INIT(kernel_id);
}
EXPORT_SYMBOL(make_kgid);

/**
 *        from_kgid - Create a gid from a kgid user-namespace pair.
 *        @targ: The user namespace we want a gid in.
 *        @kgid: The kernel internal gid to start with.
 *
 *        Translates @kgid through the gid map of @targ and returns the
 *        gid it has inside that namespace.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        If @kgid has no mapping in @targ (gid_t)-1 is returned.
 */
gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
{
        gid_t kernel_id = __kgid_val(kgid);

        return map_id_up(&targ->gid_map, kernel_id);
}
EXPORT_SYMBOL(from_kgid);

/**
 *        from_kgid_munged - Create a gid from a kgid user-namespace pair.
 *        @targ: The user namespace we want a gid in.
 *        @kgid: The kernel internal gid to start with.
 *
 *        Translates @kgid through the gid map of @targ and returns the
 *        gid it has inside that namespace.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        Unlike from_kgid, from_kgid_munged never fails and always
 *        returns a valid gid.  That makes it appropriate for syscalls
 *        like stat and getgid, where neither failing the system call
 *        nor failing to provide a valid gid is an option.
 *
 *        If @kgid has no mapping in @targ overflowgid is returned.
 */
gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
{
        gid_t gid = from_kgid(targ, kgid);

        if (gid != (gid_t) -1)
                return gid;

        return overflowgid;
}
EXPORT_SYMBOL(from_kgid_munged);

/**
 *        make_kprojid - Map a user-namespace projid pair into a kprojid.
 *        @ns:  User namespace that the projid is in
 *        @projid: Project identifier
 *
 *        Translates @projid, as seen from inside @ns, through the projid
 *        map of @ns into a kernel internal kprojid and returns that
 *        kprojid.
 *
 *        When @ns has no mapping for @projid, INVALID_PROJID is
 *        returned.  Callers are expected to test for and handle that
 *        case; projid_valid() can be used for the test.
 */
kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
{
        u32 kernel_id = map_id_down(&ns->projid_map, projid);

        return KPROJIDT_INIT(kernel_id);
}
EXPORT_SYMBOL(make_kprojid);

/**
 *        from_kprojid - Create a projid from a kprojid user-namespace pair.
 *        @targ: The user namespace we want a projid in.
 *        @kprojid: The kernel internal project identifier to start with.
 *
 *        Translates @kprojid through the projid map of @targ and returns
 *        the projid it has inside that namespace.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        If @kprojid has no mapping in @targ (projid_t)-1 is returned.
 */
projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid)
{
        projid_t kernel_id = __kprojid_val(kprojid);

        return map_id_up(&targ->projid_map, kernel_id);
}
EXPORT_SYMBOL(from_kprojid);

/**
 *        from_kprojid_munged - Create a projid from a kprojid user-namespace pair.
 *        @targ: The user namespace we want a projid in.
 *        @kprojid: The kernel internal projid to start with.
 *
 *        Translates @kprojid through the projid map of @targ and returns
 *        the projid it has inside that namespace.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        Unlike from_kprojid, from_kprojid_munged never fails and always
 *        returns a valid projid.  That makes it appropriate for syscalls
 *        like stat, where neither failing the system call nor failing to
 *        provide a valid projid is an option.
 *
 *        If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned.
 */
projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
{
        projid_t projid = from_kprojid(targ, kprojid);

        if (projid != (projid_t) -1)
                return projid;

        return OVERFLOW_PROJID;
}
EXPORT_SYMBOL(from_kprojid_munged);


static int uid_m_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        struct uid_gid_extent *extent = v;
        struct user_namespace *lower_ns;
        uid_t lower;

        lower_ns = seq_user_ns(seq);
        if ((lower_ns == ns) && lower_ns->parent)
                lower_ns = lower_ns->parent;

        lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));

        seq_printf(seq, "%10u %10u %10u\n",
                extent->first,
                lower,
                extent->count);

        return 0;
}

static int gid_m_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        struct uid_gid_extent *extent = v;
        struct user_namespace *lower_ns;
        gid_t lower;

        lower_ns = seq_user_ns(seq);
        if ((lower_ns == ns) && lower_ns->parent)
                lower_ns = lower_ns->parent;

        lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first));

        seq_printf(seq, "%10u %10u %10u\n",
                extent->first,
                lower,
                extent->count);

        return 0;
}

static int projid_m_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        struct uid_gid_extent *extent = v;
        struct user_namespace *lower_ns;
        projid_t lower;

        lower_ns = seq_user_ns(seq);
        if ((lower_ns == ns) && lower_ns->parent)
                lower_ns = lower_ns->parent;

        lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first));

        seq_printf(seq, "%10u %10u %10u\n",
                extent->first,
                lower,
                extent->count);

        return 0;
}

/*
 * Shared ->start() helper: return the extent of @map at position *@ppos, or
 * NULL once the position is past the last extent.  Small maps are served
 * from the inline extent array, larger ones from the forward array.
 */
static void *m_start(struct seq_file *seq, loff_t *ppos,
                     struct uid_gid_map *map)
{
        loff_t pos = *ppos;
        unsigned nr = map->nr_extents;

        /* Read the extent count before the extents themselves. */
        smp_rmb();

        if (pos >= nr)
                return NULL;

        return (nr <= UID_GID_MAP_MAX_BASE_EXTENTS) ?
                &map->extent[pos] : &map->forward[pos];
}

/* ->start() for the uid map of the namespace stored in seq->private. */
static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
{
        struct user_namespace *userns = seq->private;

        return m_start(seq, ppos, &userns->uid_map);
}

/* ->start() for the gid map of the namespace stored in seq->private. */
static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
{
        struct user_namespace *userns = seq->private;

        return m_start(seq, ppos, &userns->gid_map);
}

/* ->start() for the projid map of the namespace stored in seq->private. */
static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
{
        struct user_namespace *userns = seq->private;

        return m_start(seq, ppos, &userns->projid_map);
}

/*
 * Shared ->next(): advance the position by one and let the table's own
 * ->start() look up the extent at the new position.
 */
static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;

        return seq->op->start(seq, pos);
}

/* Shared ->stop(): the iteration holds nothing that needs releasing. */
static void m_stop(struct seq_file *seq, void *v)
{
}

/* seq_file operations for walking and printing a namespace's uid map. */
const struct seq_operations proc_uid_seq_operations = {
        .start = uid_m_start,
        .next = m_next,
        .show = uid_m_show,
        .stop = m_stop,
};

/* seq_file operations for walking and printing a namespace's gid map. */
const struct seq_operations proc_gid_seq_operations = {
        .start = gid_m_start,
        .next = m_next,
        .show = gid_m_show,
        .stop = m_stop,
};

/* seq_file operations for walking and printing a namespace's projid map. */
const struct seq_operations proc_projid_seq_operations = {
        .start = projid_m_start,
        .next = m_next,
        .show = projid_m_show,
        .stop = m_stop,
};

static bool mappings_overlap(struct uid_gid_map *new_map,
                             struct uid_gid_extent *extent)
{
        u32 upper_first, lower_first, upper_last, lower_last;
        unsigned idx;

        upper_first = extent->first;
        lower_first = extent->lower_first;
        upper_last = upper_first + extent->count - 1;
        lower_last = lower_first + extent->count - 1;

        for (idx = 0; idx < new_map->nr_extents; idx++) {
                u32 prev_upper_first, prev_lower_first;
                u32 prev_upper_last, prev_lower_last;
                struct uid_gid_extent *prev;

                if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                        prev = &new_map->extent[idx];
                else
                        prev = &new_map->forward[idx];

                prev_upper_first = prev->first;
                prev_lower_first = prev->lower_first;
                prev_upper_last = prev_upper_first + prev->count - 1;
                prev_lower_last = prev_lower_first + prev->count - 1;

                /* Does the upper range intersect a previous extent? */
                if ((prev_upper_first <= upper_last) &&
                    (prev_upper_last >= upper_first))
                        return true;

                /* Does the lower range intersect a previous extent? */
                if ((prev_lower_first <= lower_last) &&
                    (prev_lower_last >= lower_first))
                        return true;
        }
        return false;
}

/*
 * insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
 * Takes care to allocate a 4K block of memory if the number of mappings exceeds
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 */
static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
{
        struct uid_gid_extent *dest;

        if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) {
                struct uid_gid_extent *forward;

                /* Allocate memory for 340 mappings. */
                forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS,
                                        sizeof(struct uid_gid_extent),
                                        GFP_KERNEL);
                if (!forward)
                        return -ENOMEM;

                /* Copy over memory. Only set up memory for the forward pointer.
                 * Defer the memory setup for the reverse pointer.
                 */
                memcpy(forward, map->extent,
                       map->nr_extents * sizeof(map->extent[0]));

                map->forward = forward;
                map->reverse = NULL;
        }

        if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS)
                dest = &map->extent[map->nr_extents];
        else
                dest = &map->forward[map->nr_extents];

        *dest = *extent;
        map->nr_extents++;
        return 0;
}

/* sort() comparison for the forward array: ascending by ->first. */
static int cmp_extents_forward(const void *a, const void *b)
{
        const struct uid_gid_extent *lhs = a;
        const struct uid_gid_extent *rhs = b;

        if (lhs->first == rhs->first)
                return 0;

        return (lhs->first < rhs->first) ? -1 : 1;
}

/* sort() comparison for the reverse array: ascending by ->lower_first. */
static int cmp_extents_reverse(const void *a, const void *b)
{
        const struct uid_gid_extent *lhs = a;
        const struct uid_gid_extent *rhs = b;

        if (lhs->lower_first == rhs->lower_first)
                return 0;

        return (lhs->lower_first < rhs->lower_first) ? -1 : 1;
}

/*
 * sort_idmaps - Sorts an array of idmap entries.
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 */
static int sort_idmaps(struct uid_gid_map *map)
{
        if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                return 0;

        /* Sort forward array. */
        sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent),
             cmp_extents_forward, NULL);

        /* Only copy the memory from forward we actually need. */
        map->reverse = kmemdup(map->forward,
                               map->nr_extents * sizeof(struct uid_gid_extent),
                               GFP_KERNEL);
        if (!map->reverse)
                return -ENOMEM;

        /* Sort reverse array. */
        sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent),
             cmp_extents_reverse, NULL);

        return 0;
}

/**
 * verify_root_map() - check the uid 0 mapping
 * @file: idmapping file
 * @map_ns: user namespace of the target process
 * @new_map: requested idmap
 *
 * If a process requests mapping parent uid 0 into the new ns, verify that the
 * process writing the map had the CAP_SETFCAP capability as the target process
 * will be able to write fscaps that are valid in ancestor user namespaces.
 *
 * Return: true if the mapping is allowed, false if not.
 */
static bool verify_root_map(const struct file *file,
                            struct user_namespace *map_ns,
                            struct uid_gid_map *new_map)
{
        int idx;
        const struct user_namespace *file_ns = file->f_cred->user_ns;
        struct uid_gid_extent *extent0 = NULL;

        for (idx = 0; idx < new_map->nr_extents; idx++) {
                if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                        extent0 = &new_map->extent[idx];
                else
                        extent0 = &new_map->forward[idx];
                if (extent0->lower_first == 0)
                        break;

                extent0 = NULL;
        }

        if (!extent0)
                return true;

        if (map_ns == file_ns) {
                /* The process unshared its ns and is writing to its own
                 * /proc/self/uid_map.  User already has full capabilites in
                 * the new namespace.  Verify that the parent had CAP_SETFCAP
                 * when it unshared.
                 * */
                if (!file_ns->parent_could_setfcap)
                        return false;
        } else {
                /* Process p1 is writing to uid_map of p2, who is in a child
                 * user namespace to p1's.  Verify that the opener of the map
                 * file has CAP_SETFCAP against the parent of the new map
                 * namespace */
                if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP))
                        return false;
        }

        return true;
}

/*
 * map_write() - parse and install an id map written by userspace
 * @file:       the opened map file; its seq_file private data is the user
 *              namespace whose map is being written
 * @buf:        userspace buffer holding lines of "first lower_first count"
 * @count:      number of bytes in @buf; must be smaller than PAGE_SIZE
 * @ppos:       file position; must be 0 on entry, set to @count on success
 * @cap_setid:  capability associated with this kind of map; when it is a
 *              valid capability the opener must also hold CAP_SYS_ADMIN over
 *              the target namespace
 * @map:        the map to populate; must still be empty (nr_extents == 0)
 * @parent_map: map used to translate each extent's lower ids via
 *              map_id_range_down()
 *
 * Return: @count on success, -EINVAL for malformed input, -EPERM when the
 * map was already written or the writer lacks permission, or the error
 * returned by memdup_user_nul(), insert_extent() or sort_idmaps().
 */
static ssize_t map_write(struct file *file, const char __user *buf,
                         size_t count, loff_t *ppos,
                         int cap_setid,
                         struct uid_gid_map *map,
                         struct uid_gid_map *parent_map)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *map_ns = seq->private;
        struct uid_gid_map new_map;
        unsigned idx;
        struct uid_gid_extent extent;
        char *kbuf, *pos, *next_line;
        ssize_t ret;

        /* Only allow < page size writes at the beginning of the file */
        if ((*ppos != 0) || (count >= PAGE_SIZE))
                return -EINVAL;

        /* Slurp in the user data */
        kbuf = memdup_user_nul(buf, count);
        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        /*
         * The userns_state_mutex serializes all writes to any given map.
         *
         * Any map is only ever written once.
         *
         * An id map fits within 1 cache line on most architectures.
         *
         * On read nothing needs to be done unless you are on an
         * architecture with a crazy cache coherency model like alpha.
         *
         * There is a one time data dependency between reading the
         * count of the extents and the values of the extents.  The
         * desired behavior is to see the values of the extents that
         * were written before the count of the extents.
         *
         * To achieve this smp_wmb() is used to guarantee the write
         * order and smp_rmb() guarantees that we don't have crazy
         * architectures returning stale data.
         */
        mutex_lock(&userns_state_mutex);

        /* Build the map in a local copy; it is only published at the end. */
        memset(&new_map, 0, sizeof(struct uid_gid_map));

        ret = -EPERM;
        /* Only allow one successful write to the map */
        if (map->nr_extents != 0)
                goto out;

        /*
         * Adjusting namespace settings requires capabilities on the target.
         */
        if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
                goto out;

        /* Parse the user data */
        ret = -EINVAL;
        pos = kbuf;
        for (; pos; pos = next_line) {

                /* Find the end of line and ensure I don't look past it */
                next_line = strchr(pos, '\n');
                if (next_line) {
                        *next_line = '\0';
                        next_line++;
                        /* A trailing newline does not start another line. */
                        if (*next_line == '\0')
                                next_line = NULL;
                }

                /* Each field is a decimal number followed by whitespace. */
                pos = skip_spaces(pos);
                extent.first = simple_strtoul(pos, &pos, 10);
                if (!isspace(*pos))
                        goto out;

                pos = skip_spaces(pos);
                extent.lower_first = simple_strtoul(pos, &pos, 10);
                if (!isspace(*pos))
                        goto out;

                /* The last field may also be terminated by end of line. */
                pos = skip_spaces(pos);
                extent.count = simple_strtoul(pos, &pos, 10);
                if (*pos && !isspace(*pos))
                        goto out;

                /* Verify there is not trailing junk on the line */
                pos = skip_spaces(pos);
                if (*pos != '\0')
                        goto out;

                /* Verify we have been given valid starting values */
                if ((extent.first == (u32) -1) ||
                    (extent.lower_first == (u32) -1))
                        goto out;

                /* Verify count is not zero and does not cause the
                 * extent to wrap
                 */
                if ((extent.first + extent.count) <= extent.first)
                        goto out;
                if ((extent.lower_first + extent.count) <=
                     extent.lower_first)
                        goto out;

                /* Do the ranges in extent overlap any previous extents? */
                if (mappings_overlap(&new_map, &extent))
                        goto out;

                /* Reject input that continues once the extent limit is hit. */
                if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS &&
                    (next_line != NULL))
                        goto out;

                ret = insert_extent(&new_map, &extent);
                if (ret < 0)
                        goto out;
                /* Restore the default error for the next parsed line. */
                ret = -EINVAL;
        }
        /* Be very certain the new map actually exists */
        if (new_map.nr_extents == 0)
                goto out;

        ret = -EPERM;
        /* Validate the user is allowed to use user id's mapped to. */
        if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
                goto out;

        ret = -EPERM;
        /* Map the lower ids from the parent user namespace to the
         * kernel global id space.
         */
        for (idx = 0; idx < new_map.nr_extents; idx++) {
                struct uid_gid_extent *e;
                u32 lower_first;

                /* Small maps use the inline array, larger ones ->forward. */
                if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                        e = &new_map.extent[idx];
                else
                        e = &new_map.forward[idx];

                lower_first = map_id_range_down(parent_map,
                                                e->lower_first,
                                                e->count);

                /* Fail if we can not map the specified extent to
                 * the kernel global id space.
                 */
                if (lower_first == (u32) -1)
                        goto out;

                e->lower_first = lower_first;
        }

        /*
         * If we want to use binary search for lookup, this clones the extent
         * array and sorts both copies.
         */
        ret = sort_idmaps(&new_map);
        if (ret < 0)
                goto out;

        /* Install the map */
        if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
                memcpy(map->extent, new_map.extent,
                       new_map.nr_extents * sizeof(new_map.extent[0]));
        } else {
                map->forward = new_map.forward;
                map->reverse = new_map.reverse;
        }
        /* Publish the extents before the count that makes them visible. */
        smp_wmb();
        map->nr_extents = new_map.nr_extents;

        *ppos = count;
        ret = count;
out:
        /*
         * On failure with a large map, release the separately allocated
         * forward/reverse arrays and leave the target map empty.
         */
        if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                kfree(new_map.forward);
                kfree(new_map.reverse);
                map->forward = NULL;
                map->reverse = NULL;
                map->nr_extents = 0;
        }

        mutex_unlock(&userns_state_mutex);
        kfree(kbuf);
        return ret;
}

/*
 * Write handler for a namespace's uid map.  Refuses namespaces without a
 * parent and callers whose seq_file namespace is neither the target nor
 * its parent; otherwise defers to map_write() with CAP_SETUID.
 */
ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
                           size_t size, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *map_ns = seq->private;
        struct user_namespace *seq_ns = seq_user_ns(seq);
        struct user_namespace *parent_ns = map_ns->parent;

        /* Without a parent there is nothing to map from. */
        if (!parent_ns)
                return -EPERM;

        if (seq_ns != map_ns && seq_ns != parent_ns)
                return -EPERM;

        return map_write(file, buf, size, ppos, CAP_SETUID,
                         &map_ns->uid_map, &parent_ns->uid_map);
}

/*
 * Write handler for a namespace's gid map.  Refuses namespaces without a
 * parent and callers whose seq_file namespace is neither the target nor
 * its parent; otherwise defers to map_write() with CAP_SETGID.
 */
ssize_t proc_gid_map_write(struct file *file, const char __user *buf,
                           size_t size, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *map_ns = seq->private;
        struct user_namespace *seq_ns = seq_user_ns(seq);
        struct user_namespace *parent_ns = map_ns->parent;

        /* Without a parent there is nothing to map from. */
        if (!parent_ns)
                return -EPERM;

        if (seq_ns != map_ns && seq_ns != parent_ns)
                return -EPERM;

        return map_write(file, buf, size, ppos, CAP_SETGID,
                         &map_ns->gid_map, &parent_ns->gid_map);
}

/*
 * Write handler for a namespace's project id map.  Applies the same
 * namespace checks as the uid/gid handlers, then defers to map_write()
 * with an invalid capability number (-1).
 */
ssize_t proc_projid_map_write(struct file *file, const char __user *buf,
                              size_t size, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *map_ns = seq->private;
        struct user_namespace *seq_ns = seq_user_ns(seq);
        struct user_namespace *parent_ns = map_ns->parent;

        /* Without a parent there is nothing to map from. */
        if (!parent_ns)
                return -EPERM;

        if (seq_ns != map_ns && seq_ns != parent_ns)
                return -EPERM;

        /* Anyone can set any valid project id no capability needed */
        return map_write(file, buf, size, ppos, -1,
                         &map_ns->projid_map, &parent_ns->projid_map);
}

/*
 * Decide whether the opener of @file may install @new_map into @ns.
 * @cap_setid selects the kind of map (CAP_SETUID, CAP_SETGID, or an
 * invalid capability number for maps that need no privilege).
 */
static bool new_idmap_permitted(const struct file *file,
                                struct user_namespace *ns, int cap_setid,
                                struct uid_gid_map *new_map)
{
        const struct cred *cred = file->f_cred;
        bool single_id_by_owner;

        /* uid maps additionally have to pass the uid 0 mapping check. */
        if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
                return false;

        /* Don't allow mappings that would allow anything that wouldn't
         * be allowed without the establishment of unprivileged mappings.
         */
        single_id_by_owner = new_map->nr_extents == 1 &&
                             new_map->extent[0].count == 1 &&
                             uid_eq(ns->owner, cred->euid);
        if (single_id_by_owner) {
                u32 id = new_map->extent[0].lower_first;

                switch (cap_setid) {
                case CAP_SETUID: {
                        kuid_t uid = make_kuid(ns->parent, id);

                        /* The owner may map its own effective uid. */
                        if (uid_eq(uid, cred->euid))
                                return true;
                        break;
                }
                case CAP_SETGID: {
                        kgid_t gid = make_kgid(ns->parent, id);

                        /* Own egid, but only with setgroups disabled. */
                        if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
                            gid_eq(gid, cred->egid))
                                return true;
                        break;
                }
                default:
                        break;
                }
        }

        /* Allow anyone to set a mapping that doesn't require privilege */
        if (!cap_valid(cap_setid))
                return true;

        /* Allow the specified ids if we have the appropriate capability
         * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
         * And the opener of the id file also has the appropriate capability.
         */
        return ns_capable(ns->parent, cap_setid) &&
               file_ns_capable(file, ns->parent, cap_setid);
}

/*
 * Emit "allow" or "deny" depending on whether USERNS_SETGROUPS_ALLOWED
 * is set in the namespace flags.  Always returns 0.
 */
int proc_setgroups_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        unsigned long userns_flags = READ_ONCE(ns->flags);
        const char *state;

        if (userns_flags & USERNS_SETGROUPS_ALLOWED)
                state = "allow";
        else
                state = "deny";

        seq_printf(seq, "%s\n", state);
        return 0;
}

/*
 * Write handler accepting "allow" or "deny" (optionally followed by
 * whitespace) at offset 0.
 *
 * Return: @count on success, -EINVAL for malformed input, -EFAULT if the
 * user buffer cannot be copied, -EPERM if the requested change is refused.
 */
ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *ns = seq->private;
        char kbuf[8], *pos;
        bool setgroups_allowed;
        bool permitted;

        /* Only allow a very narrow range of strings to be written */
        if (*ppos != 0 || count >= sizeof(kbuf))
                return -EINVAL;

        /* What was written? */
        if (copy_from_user(kbuf, buf, count))
                return -EFAULT;
        kbuf[count] = '\0';
        pos = kbuf;

        /* What is being requested? */
        if (!strncmp(pos, "allow", 5)) {
                pos += 5;
                setgroups_allowed = true;
        } else if (!strncmp(pos, "deny", 4)) {
                pos += 4;
                setgroups_allowed = false;
        } else {
                return -EINVAL;
        }

        /* Verify there is not trailing junk on the line */
        pos = skip_spaces(pos);
        if (*pos != '\0')
                return -EINVAL;

        mutex_lock(&userns_state_mutex);
        if (setgroups_allowed) {
                /* Enabling setgroups after setgroups has been disabled
                 * is not allowed.
                 */
                permitted = (ns->flags & USERNS_SETGROUPS_ALLOWED) != 0;
        } else {
                /* Permanently disabling setgroups after setgroups has
                 * been enabled by writing the gid_map is not allowed.
                 */
                permitted = ns->gid_map.nr_extents == 0;
                if (permitted)
                        ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
        }
        mutex_unlock(&userns_state_mutex);

        if (!permitted)
                return -EPERM;

        /* Report a successful write */
        *ppos = count;
        return count;
}

/*
 * Report whether setgroups may be used in @ns: a gid map must already be
 * installed and USERNS_SETGROUPS_ALLOWED must still be set.  Both are read
 * under userns_state_mutex.
 */
bool userns_may_setgroups(const struct user_namespace *ns)
{
        bool allowed;

        mutex_lock(&userns_state_mutex);
        /* It is not safe to use setgroups until a gid mapping in
         * the user namespace has been established; the flag is only
         * consulted once that is the case.
         */
        allowed = ns->gid_map.nr_extents != 0 &&
                  (ns->flags & USERNS_SETGROUPS_ALLOWED);
        mutex_unlock(&userns_state_mutex);

        return allowed;
}

/*
 * Returns true if @child is @ancestor itself or one of its descendants.
 *
 * Walks up from @child through ->parent until the level is no deeper
 * than @ancestor's, then compares the namespace reached with @ancestor.
 */
bool in_userns(const struct user_namespace *ancestor,
               const struct user_namespace *child)
{
        const struct user_namespace *ns = child;

        while (ns->level > ancestor->level)
                ns = ns->parent;

        return ns == ancestor;
}

/*
 * Returns true if the current task's user namespace is @target_ns or a
 * descendant of it, as determined by in_userns().
 */
bool current_in_userns(const struct user_namespace *target_ns)
{
        const struct user_namespace *cur_ns = current_user_ns();

        return in_userns(target_ns, cur_ns);
}
EXPORT_SYMBOL(current_in_userns);

/* Recover the user_namespace that embeds @ns as its ->ns member. */
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
        struct user_namespace *user_ns;

        user_ns = container_of(ns, struct user_namespace, ns);
        return user_ns;
}

/*
 * Take a reference on @task's user namespace (read from its credentials
 * inside an RCU read-side section) and return the embedded ns_common,
 * or NULL if get_user_ns() yielded no namespace.
 */
static struct ns_common *userns_get(struct task_struct *task)
{
        struct user_namespace *user_ns;

        rcu_read_lock();
        user_ns = get_user_ns(__task_cred(task)->user_ns);
        rcu_read_unlock();

        if (!user_ns)
                return NULL;

        return &user_ns->ns;
}

/* Drop a reference on the user namespace that embeds @ns. */
static void userns_put(struct ns_common *ns)
{
        struct user_namespace *user_ns = to_user_ns(ns);

        put_user_ns(user_ns);
}

/*
 * Switch the credentials prepared in @nsset over to the user namespace
 * embedding @ns.
 *
 * Return: 0 on success, -EINVAL if the switch is not allowed in the
 * current state or the credential update fails, -EPERM if the caller
 * lacks CAP_SYS_ADMIN over the target namespace.
 */
static int userns_install(struct nsset *nsset, struct ns_common *ns)
{
        struct user_namespace *target_ns = to_user_ns(ns);
        struct cred *new_cred;

        /*
         * Refuse, in this order:
         *  - re-entering the current user namespace (would allow gaining
         *    capabilities);
         *  - callers with other threads, since tasks that share a thread
         *    group must share a user namespace;
         *  - callers whose fs struct has a user count other than 1.
         */
        if (target_ns == current_user_ns() ||
            !thread_group_empty(current) ||
            current->fs->users != 1)
                return -EINVAL;

        if (!ns_capable(target_ns, CAP_SYS_ADMIN))
                return -EPERM;

        new_cred = nsset_cred(nsset);
        if (!new_cred)
                return -EINVAL;

        /* Swap the namespace reference held by the prepared credentials. */
        put_user_ns(new_cred->user_ns);
        set_cred_user_ns(new_cred, get_user_ns(target_ns));

        return set_cred_ucounts(new_cred) < 0 ? -EINVAL : 0;
}

/*
 * Return a referenced ns_common for the user namespace that owns @ns,
 * provided that owner is the caller's user namespace or a descendant of
 * it; otherwise return ERR_PTR(-EPERM).
 */
struct ns_common *ns_get_owner(struct ns_common *ns)
{
        struct user_namespace *my_user_ns = current_user_ns();
        struct user_namespace *owner = ns->ops->owner(ns);
        struct user_namespace *p;

        /* See if the owner is in the current user namespace */
        for (p = owner; p; p = p->parent) {
                if (p == my_user_ns)
                        return &get_user_ns(owner)->ns;
        }

        /* Ran off the top of the hierarchy without meeting our namespace. */
        return ERR_PTR(-EPERM);
}

static struct user_namespace *userns_owner(struct ns_common *ns)
{
        return to_user_ns(ns)->parent;
}

/*
 * Namespace operations table for user namespaces (CLONE_NEWUSER).
 * Note that .owner reports the parent namespace (userns_owner) and
 * .get_parent is served by ns_get_owner().
 */
const struct proc_ns_operations userns_operations = {
        .name                = "user",
        .type                = CLONE_NEWUSER,
        .get                = userns_get,
        .put                = userns_put,
        .install        = userns_install,
        .owner                = userns_owner,
        .get_parent        = ns_get_owner,
};

/*
 * Create the slab cache used for struct user_namespace objects.
 * SLAB_PANIC is passed, so no failure path is handled here; always
 * returns 0.  Registered as a subsys initcall.
 */
static __init int user_namespaces_init(void)
{
        user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT);
        return 0;
}
subsys_initcall(user_namespaces_init);











































































































































































































































































































































































































































































































































































































    8 





    7 
    7 



    8 
    6 
    7 









    7 

    8 































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
// SPDX-License-Identifier: GPL-2.0-only
/*
 * AppArmor security module
 *
 * This file contains AppArmor mediation of files
 *
 * Copyright (C) 1998-2008 Novell/SUSE
 * Copyright 2009-2010 Canonical Ltd.
 */

#include <linux/tty.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mount.h>

#include "include/apparmor.h"
#include "include/audit.h"
#include "include/cred.h"
#include "include/file.h"
#include "include/match.h"
#include "include/net.h"
#include "include/path.h"
#include "include/policy.h"
#include "include/label.h"

/*
 * Reduce a permission mask to the bits in PERMS_CHRS_MASK, folding
 * getattr into MAY_READ and setattr/chmod/chown into MAY_WRITE.
 */
static u32 map_mask_to_chr_mask(u32 mask)
{
        const u32 write_like = AA_MAY_SETATTR | AA_MAY_CHMOD | AA_MAY_CHOWN;
        u32 chr_mask = mask & PERMS_CHRS_MASK;

        chr_mask |= (mask & AA_MAY_GETATTR) ? MAY_READ : 0;
        chr_mask |= (mask & write_like) ? MAY_WRITE : 0;

        return chr_mask;
}

/**
 * file_audit_cb - call back for file specific audit fields
 * @ab: audit_buffer  (NOT NULL)
 * @va: audit struct to audit values of  (NOT NULL)
 */
static void file_audit_cb(struct audit_buffer *ab, void *va)
{
        struct common_audit_data *sa = va;
        struct apparmor_audit_data *ad = aad(sa);
        kuid_t fsuid = ad->subj_cred ? ad->subj_cred->fsuid : current_fsuid();
        char str[10];

        if (ad->request & AA_AUDIT_FILE_MASK) {
                aa_perm_mask_to_str(str, sizeof(str), aa_file_perm_chrs,
                                    map_mask_to_chr_mask(ad->request));
                audit_log_format(ab, " requested_mask=\"%s\"", str);
        }
        if (ad->denied & AA_AUDIT_FILE_MASK) {
                aa_perm_mask_to_str(str, sizeof(str), aa_file_perm_chrs,
                                    map_mask_to_chr_mask(ad->denied));
                audit_log_format(ab, " denied_mask=\"%s\"", str);
        }
        if (ad->request & AA_AUDIT_FILE_MASK) {
                audit_log_format(ab, " fsuid=%d",
                                 from_kuid(&init_user_ns, fsuid));
                audit_log_format(ab, " ouid=%d",
                                 from_kuid(&init_user_ns, ad->fs.ouid));
        }

        if (ad->peer) {
                audit_log_format(ab, " target=");
                aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->peer,
                                FLAG_VIEW_SUBNS, GFP_KERNEL);
        } else if (ad->fs.target) {
                audit_log_format(ab, " target=");
                audit_log_untrustedstring(ab, ad->fs.target);
        }
}

/**
 * aa_audit_file - handle the auditing of file operations
 * @subj_cred: cred of the subject
 * @profile: the profile being enforced  (NOT NULL)
 * @perms: the permissions computed for the request (NOT NULL)
 * @op: operation being mediated
 * @request: permissions requested
 * @name: name of object being mediated (MAYBE NULL)
 * @target: name of target (MAYBE NULL)
 * @tlabel: target label (MAYBE NULL)
 * @ouid: object uid
 * @info: extra information message (MAYBE NULL)
 * @error: 0 if operation allowed else failure error code
 *
 * When @error is 0 the request is only audited for the permissions that
 * are force audited (@perms->audit, or everything under AUDIT_ALL).  When
 * @error is set, only the denied permissions are reported, minus those
 * quieted by @perms->quiet unless the profile's audit mode overrides it.
 *
 * Returns: %0 or error on failure
 */
int aa_audit_file(const struct cred *subj_cred,
                  struct aa_profile *profile, struct aa_perms *perms,
                  const char *op, u32 request, const char *name,
                  const char *target, struct aa_label *tlabel,
                  kuid_t ouid, const char *info, int error)
{
        int type = AUDIT_APPARMOR_AUTO;
        DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_TASK, AA_CLASS_FILE, op);

        /* Record the request details in the audit data. */
        ad.subj_cred = subj_cred;
        ad.request = request;
        ad.name = name;
        ad.fs.target = target;
        ad.peer = tlabel;
        ad.fs.ouid = ouid;
        ad.info = info;
        ad.error = error;
        ad.common.u.tsk = NULL;

        if (likely(!ad.error)) {
                /* Allowed: audit only what is being force audited. */
                u32 mask = perms->audit;

                if (unlikely(AUDIT_MODE(profile) == AUDIT_ALL))
                        mask = 0xffff;

                /* mask off perms that are not being force audited */
                ad.request &= mask;

                /* Nothing left to audit: allowed silently. */
                if (likely(!ad.request))
                        return 0;
                type = AUDIT_APPARMOR_AUDIT;
        } else {
                /* only report permissions that were denied */
                ad.request = ad.request & ~perms->allow;
                AA_BUG(!ad.request);

                if (ad.request & perms->kill)
                        type = AUDIT_APPARMOR_KILL;

                /* quiet known rejects, assumes quiet and kill do not overlap */
                if ((ad.request & perms->quiet) &&
                    AUDIT_MODE(profile) != AUDIT_NOQUIET &&
                    AUDIT_MODE(profile) != AUDIT_ALL)
                        ad.request &= ~perms->quiet;

                /* Everything denied was quieted: fail without a record. */
                if (!ad.request)
                        return ad.error;
        }

        /* Of what remains to be reported, note the bits not allowed. */
        ad.denied = ad.request & ~perms->allow;
        return aa_audit(type, profile, &ad, file_audit_cb);
}

/**
 * is_deleted - test if a file has been completely unlinked
 * @dentry: dentry of file to test for deletion  (NOT NULL)
 *
 * A file counts as deleted only when the dentry is unlinked and its
 * backing inode has no remaining links.
 *
 * Returns: true if deleted else false
 */
static inline bool is_deleted(struct dentry *dentry)
{
        return d_unlinked(dentry) && d_backing_inode(dentry)->i_nlink == 0;
}

/*
 * Resolve @path into @buffer via aa_path_name(), storing the result in
 * @name.  On failure the error is audited against every confined profile
 * in @label (with no permissions granted) and then returned; on success
 * 0 is returned.
 */
static int path_name(const char *op, const struct cred *subj_cred,
                     struct aa_label *label,
                     const struct path *path, int flags, char *buffer,
                     const char **name, struct path_cond *cond, u32 request)
{
        const char *info = NULL;
        struct aa_profile *profile;
        int error;

        error = aa_path_name(path, flags, buffer, name, &info,
                             labels_profile(label)->disconnected);
        if (!error)
                return 0;

        fn_for_each_confined(label, profile,
                aa_audit_file(subj_cred,
                              profile, &nullperms, op, request, *name,
                              NULL, NULL, cond->uid, info, error));
        return error;
}

/* Empty permission set handed out when a policydb has no perms table. */
struct aa_perms default_perms = {};

/**
 * aa_lookup_fperms - convert dfa compressed perms to internal perms
 * @file_rules: the aa_policydb to lookup perms for  (NOT NULL)
 * @state: state in dfa
 * @cond:  conditions to consider  (NOT NULL)
 *
 * The accept table entry for @state gives the index of the entry used when
 * the current fsuid equals @cond->uid; the following entry is used
 * otherwise.
 *
 * TODO: convert from dfa + state to permission entry
 *
 * Returns: a pointer to a file permission set
 */
struct aa_perms *aa_lookup_fperms(struct aa_policydb *file_rules,
                                 aa_state_t state, struct path_cond *cond)
{
        unsigned int index = ACCEPT_TABLE(file_rules->dfa)[state];

        if (!file_rules->perms)
                return &default_perms;

        /* Non-owners use the entry right after the owner's. */
        if (!uid_eq(current_fsuid(), cond->uid))
                index++;

        return &file_rules->perms[index];
}

/**
 * aa_str_perms - find permission that match @name
 * @file_rules: the aa_policydb to match against  (NOT NULL)
 * @start: state to start matching in
 * @name: string to match against dfa  (NOT NULL)
 * @cond: conditions to consider for permission set computation  (NOT NULL)
 * @perms: Returns - the permissions found when matching @name
 *
 * Returns: the final state in @dfa when beginning @start and walking @name
 */
aa_state_t aa_str_perms(struct aa_policydb *file_rules, aa_state_t start,
                        const char *name, struct path_cond *cond,
                        struct aa_perms *perms)
{
        aa_state_t end_state = aa_dfa_match(file_rules->dfa, start, name);
        struct aa_perms *found = aa_lookup_fperms(file_rules, end_state, cond);

        /* Hand the caller its own copy of the looked-up permission set. */
        *perms = *found;

        return end_state;
}

/*
 * Check @request against the file rules of @profile for the already
 * resolved @name, fill @perms with the matched permissions, and audit
 * the outcome.  Unconfined profiles are allowed without a lookup.
 */
static int __aa_path_perm(const char *op, const struct cred *subj_cred,
                          struct aa_profile *profile, const char *name,
                          u32 request, struct path_cond *cond, int flags,
                          struct aa_perms *perms)
{
        struct aa_ruleset *rules = list_first_entry(&profile->rules,
                                                    typeof(*rules), list);
        int error;

        if (profile_unconfined(profile))
                return 0;

        aa_str_perms(rules->file, rules->file->start[AA_CLASS_FILE],
                     name, cond, perms);

        /* Any requested bit that is not allowed denies the access. */
        error = (request & ~perms->allow) ? -EACCES : 0;

        return aa_audit_file(subj_cred,
                             profile, perms, op, request, name, NULL, NULL,
                             cond->uid, NULL, error);
}


/*
 * Per-profile step of aa_path_perm(): resolve @path to a name using the
 * profile's path flags combined with @flags, then check @request against
 * that name.  Unconfined profiles are allowed immediately.
 */
static int profile_path_perm(const char *op, const struct cred *subj_cred,
                             struct aa_profile *profile,
                             const struct path *path, char *buffer, u32 request,
                             struct path_cond *cond, int flags,
                             struct aa_perms *perms)
{
        const char *name;
        int error;

        if (profile_unconfined(profile))
                return 0;

        error = path_name(op, subj_cred, &profile->label, path,
                          flags | profile->path_flags, buffer, &name, cond,
                          request);
        if (!error)
                error = __aa_path_perm(op, subj_cred, profile, name, request,
                                       cond, flags, perms);

        return error;
}

/**
 * aa_path_perm - do permissions check & audit for @path
 * @op: operation being checked
 * @subj_cred: subject cred
 * @label: profile being enforced  (NOT NULL)
 * @path: path to check permissions of  (NOT NULL)
 * @flags: any additional path flags beyond what the profile specifies
 * @request: requested permissions
 * @cond: conditional info for this request  (NOT NULL)
 *
 * Returns: %0 else error if access denied or other error
 *          (-ENOMEM if no name buffer could be obtained)
 */
int aa_path_perm(const char *op, const struct cred *subj_cred,
                 struct aa_label *label,
                 const struct path *path, int flags, u32 request,
                 struct path_cond *cond)
{
        struct aa_perms perms = {};
        struct aa_profile *profile;
        char *buffer;
        int error;

        flags |= PATH_DELEGATE_DELETED;
        if (S_ISDIR(cond->mode))
                flags |= PATH_IS_DIR;

        buffer = aa_get_buffer(false);
        if (!buffer)
                return -ENOMEM;

        /* Run the check for each confined profile in the label. */
        error = fn_for_each_confined(label, profile,
                        profile_path_perm(op, subj_cred, profile, path, buffer,
                                          request, cond, flags, &perms));

        aa_put_buffer(buffer);

        return error;
}

/**
 * xindex_is_subset - helper for aa_path_link
 * @link: link permission set
 * @target: target permission set
 *
 * test target x permissions are equal OR a subset of link x permissions
 * this is done as part of the subset test, where a hardlink must have
 * a subset of permissions that the target has.
 *
 * Returns: true if subset else false
 */
static inline bool xindex_is_subset(u32 link, u32 target)
{
        const u32 link_index = link & ~AA_X_UNSAFE;
        const u32 target_index = target & ~AA_X_UNSAFE;

        /* Apart from the unsafe bit, both sides have to agree exactly. */
        if (link_index != target_index)
                return false;

        /* The link may only be unsafe when the target is unsafe too. */
        if ((link & AA_X_UNSAFE) && !(target & AA_X_UNSAFE))
                return false;

        return true;
}

/**
 * profile_path_link - check and audit one profile's hard link permission
 * @subj_cred: subject cred
 * @profile: profile whose first ruleset is consulted
 * @link: path of the link being created
 * @buffer: scratch buffer that receives the link name
 * @target: path of the existing object being linked to
 * @buffer2: scratch buffer that receives the target name
 * @cond: conditional info (uid/mode) used for the permission lookups
 *
 * Steps, in order: resolve both names, look up the link permissions from
 * the AA_CLASS_FILE start state, look up the target permissions continuing
 * from the link match, and - only when the target grants AA_LINK_SUBSET -
 * verify that the permissions allowed on the link are a subset of those
 * allowed on the target.  Every exit goes through aa_audit_file().
 *
 * Returns: the value returned by aa_audit_file() for the computed result
 */
static int profile_path_link(const struct cred *subj_cred,
                             struct aa_profile *profile,
                             const struct path *link, char *buffer,
                             const struct path *target, char *buffer2,
                             struct path_cond *cond)
{
        struct aa_ruleset *rules = list_first_entry(&profile->rules,
                                                    typeof(*rules), list);
        const char *lname, *tname = NULL;
        struct aa_perms lperms = {}, perms;
        const char *info = NULL;
        u32 request = AA_MAY_LINK;
        aa_state_t state;
        int error;

        /* resolve the link name into @buffer; lname points into it */
        error = path_name(OP_LINK, subj_cred, &profile->label, link,
                          profile->path_flags,
                          buffer, &lname, cond, AA_MAY_LINK);
        if (error)
                goto audit;

        /* buffer2 freed below, tname is pointer in buffer2 */
        error = path_name(OP_LINK, subj_cred, &profile->label, target,
                          profile->path_flags,
                          buffer2, &tname, cond, AA_MAY_LINK);
        if (error)
                goto audit;

        /* from here on every early exit is a denial unless cleared below */
        error = -EACCES;
        /* aa_str_perms - handles the case of the dfa being NULL */
        state = aa_str_perms(rules->file,
                             rules->file->start[AA_CLASS_FILE], lname,
                             cond, &lperms);

        if (!(lperms.allow & AA_MAY_LINK))
                goto audit;

        /* test to see if target can be paired with link */
        state = aa_dfa_null_transition(rules->file->dfa, state);
        aa_str_perms(rules->file, state, tname, cond, &perms);

        /* force audit/quiet masks for link are stored in the second entry
         * in the link pair.
         */
        lperms.audit = perms.audit;
        lperms.quiet = perms.quiet;
        lperms.kill = perms.kill;

        if (!(perms.allow & AA_MAY_LINK)) {
                info = "target restricted";
                /* audit against the target's permission set */
                lperms = perms;
                goto audit;
        }

        /* done if link subset test is not required */
        if (!(perms.allow & AA_LINK_SUBSET))
                goto done_tests;

        /* Do link perm subset test requiring allowed permission on link are
         * a subset of the allowed permissions on target.
         */
        /* this lookup restarts at the class start state, not the link match */
        aa_str_perms(rules->file, rules->file->start[AA_CLASS_FILE],
                     tname, cond, &perms);

        /* AA_MAY_LINK is not considered in the subset test */
        request = lperms.allow & ~AA_MAY_LINK;
        lperms.allow &= perms.allow | AA_MAY_LINK;

        request |= AA_AUDIT_FILE_MASK & (lperms.allow & ~perms.allow);
        if (request & ~lperms.allow) {
                /* link asked for something the target does not allow */
                goto audit;
        } else if ((lperms.allow & MAY_EXEC) &&
                   !xindex_is_subset(lperms.xindex, perms.xindex)) {
                /* exec transition on the link is not covered by the target */
                lperms.allow &= ~MAY_EXEC;
                request |= MAY_EXEC;
                info = "link not subset of target";
                goto audit;
        }

done_tests:
        error = 0;

audit:
        /* reached on success and on failure; error selects the outcome */
        return aa_audit_file(subj_cred,
                             profile, &lperms, OP_LINK, request, lname, tname,
                             NULL, cond->uid, info, error);
}

/**
 * aa_path_link - Handle hard link permission check
 * @subj_cred: subject cred
 * @label: the label being enforced  (NOT NULL)
 * @old_dentry: the target dentry  (NOT NULL)
 * @new_dir: directory the new link will be created in  (NOT NULL)
 * @new_dentry: the link being created  (NOT NULL)
 *
 * Handle the permission test for a link & target pair.  Permission
 * is encoded as a pair where the link permission is determined
 * first, and if allowed, the target is tested.  The target test
 * is done from the point of the link match (not start of DFA)
 * making the target permission dependent on the link permission match.
 *
 * The subset test if required forces that permissions granted
 * on link are a subset of the permission granted to target.
 *
 * Returns: %0 if allowed else error
 */
int aa_path_link(const struct cred *subj_cred,
                 struct aa_label *label, struct dentry *old_dentry,
                 const struct path *new_dir, struct dentry *new_dentry)
{
        struct path link = { .mnt = new_dir->mnt, .dentry = new_dentry };
        struct path target = { .mnt = new_dir->mnt, .dentry = old_dentry };
        struct path_cond cond = {
                d_backing_inode(old_dentry)->i_uid,
                d_backing_inode(old_dentry)->i_mode
        };
        char *buffer = NULL, *buffer2 = NULL;
        struct aa_profile *profile;
        int error;

        /* buffer freed below, lname is pointer in buffer */
        buffer = aa_get_buffer(false);
        buffer2 = aa_get_buffer(false);
        error = -ENOMEM;
        if (!buffer || !buffer2)
                goto out;

        error = fn_for_each_confined(label, profile,
                        profile_path_link(subj_cred, profile, &link, buffer,
                                          &target, buffer2, &cond));
out:
        aa_put_buffer(buffer);
        aa_put_buffer(buffer2);
        return error;
}

/*
 * update_file_ctx - merge @label into the label cached on @fctx
 *
 * Under fctx->lock, the cached label is merged with @label.  When the merge
 * yields a label, @request is OR-ed into fctx->allow; when it yields a
 * different label than the cached one, the cache is switched to it and the
 * old label is put.  Nothing is changed if the merge returns NULL.
 */
static void update_file_ctx(struct aa_file_ctx *fctx, struct aa_label *label,
                            u32 request)
{
        struct aa_label *cached, *merged;

        /* update caching of label on file_ctx */
        spin_lock(&fctx->lock);
        cached = rcu_dereference_protected(fctx->label,
                                           lockdep_is_held(&fctx->lock));
        merged = aa_label_merge(cached, label, GFP_ATOMIC);
        if (!merged)
                goto unlock;

        if (merged == cached) {
                /* merge handed back the cached label: put that reference */
                aa_put_label(merged);
        } else {
                rcu_assign_pointer(fctx->label, merged);
                aa_put_label(cached);
        }
        fctx->allow |= request;

unlock:
        spin_unlock(&fctx->lock);
}

/**
 * __file_path_perm - revalidate path based access to an open file
 * @op: operation being checked
 * @subj_cred: subject cred
 * @label: label being enforced
 * @flabel: label cached on the file
 * @file: file being revalidated
 * @request: requested permissions
 * @denied: bits of @request not covered by the file's cached allow mask
 * @in_atomic: passed to aa_get_buffer() to select the allocation context
 *
 * Returns: %0 if access is allowed (the file's cache is then updated via
 *          update_file_ctx()), -ENOMEM if no buffer is available, else the
 *          error produced by the profile checks
 */
static int __file_path_perm(const char *op, const struct cred *subj_cred,
                            struct aa_label *label,
                            struct aa_label *flabel, struct file *file,
                            u32 request, u32 denied, bool in_atomic)
{
        struct aa_profile *profile;
        struct aa_perms perms = {};
        /* owner uid as seen through the idmapping of the file's mount */
        vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(file),
                                            file_inode(file));
        struct path_cond cond = {
                .uid = vfsuid_into_kuid(vfsuid),
                .mode = file_inode(file)->i_mode
        };
        char *buffer;
        int flags, error;

        /* revalidation due to label out of date. No revocation at this time */
        if (!denied && aa_label_is_subset(flabel, label))
                /* TODO: check for revocation on stale profiles */
                return 0;

        flags = PATH_DELEGATE_DELETED | (S_ISDIR(cond.mode) ? PATH_IS_DIR : 0);
        buffer = aa_get_buffer(in_atomic);
        if (!buffer)
                return -ENOMEM;

        /* check every profile in task label not in current cache */
        error = fn_for_each_not_in_set(flabel, label, profile,
                        profile_path_perm(op, subj_cred, profile,
                                          &file->f_path, buffer,
                                          request, &cond, flags, &perms));
        if (denied && !error) {
                /*
                 * check every profile in file label that was not tested
                 * in the initial check above.
                 *
                 * TODO: cache full perms so this only happens because of
                 * conditionals
                 * TODO: don't audit here
                 */
                if (label == flabel)
                        error = fn_for_each(label, profile,
                                profile_path_perm(op, subj_cred,
                                                  profile, &file->f_path,
                                                  buffer, request, &cond, flags,
                                                  &perms));
                else
                        error = fn_for_each_not_in_set(label, flabel, profile,
                                profile_path_perm(op, subj_cred,
                                                  profile, &file->f_path,
                                                  buffer, request, &cond, flags,
                                                  &perms));
        }
        /* only a fully successful check extends the file's cached state */
        if (!error)
                update_file_ctx(file_ctx(file), label, request);

        aa_put_buffer(buffer);

        return error;
}

/*
 * __file_sock_perm - revalidate access to an open socket file
 *
 * The socket is taken from file->private_data.  @label is always checked
 * with aa_sock_file_perm(); when @denied is non-zero, @flabel is checked as
 * well and a failure there replaces the result.  On success the file's
 * cached label/allow state is updated.
 *
 * Returns: %0 if access is allowed else error
 */
static int __file_sock_perm(const char *op, const struct cred *subj_cred,
                            struct aa_label *label,
                            struct aa_label *flabel, struct file *file,
                            u32 request, u32 denied)
{
        struct socket *sock = file->private_data;
        int ret;

        AA_BUG(!sock);

        /* revalidation due to label out of date. No revocation at this time */
        if (!denied && aa_label_is_subset(flabel, label))
                return 0;

        /* TODO: improve to skip profiles cached in flabel */
        ret = aa_sock_file_perm(subj_cred, label, op, request, sock);

        /* TODO: improve to skip profiles checked above */
        /* also check every profile in the label cached on the file */
        if (denied)
                last_error(ret, aa_sock_file_perm(subj_cred, flabel, op,
                                                  request, sock));

        if (ret)
                return ret;

        update_file_ctx(file_ctx(file), label, request);
        return 0;
}

/**
 * aa_file_perm - do permission revalidation check & audit for @file
 * @op: operation being checked
 * @subj_cred: subject cred
 * @label: label being enforced   (NOT NULL)
 * @file: file to revalidate access permissions on  (NOT NULL)
 * @request: requested permissions
 * @in_atomic: whether allocations need to be done in atomic context
 *
 * Returns: %0 if access allowed else error
 */
int aa_file_perm(const char *op, const struct cred *subj_cred,
                 struct aa_label *label, struct file *file,
                 u32 request, bool in_atomic)
{
        struct aa_file_ctx *fctx;
        struct aa_label *flabel;
        bool skip;
        u32 denied;
        int error = 0;

        AA_BUG(!label);
        AA_BUG(!file);

        fctx = file_ctx(file);

        rcu_read_lock();
        flabel = rcu_dereference(fctx->label);
        AA_BUG(!flabel);

        /* No revalidation is done when the task label or the label cached
         * on the file is unconfined, or when the request asks for nothing
         * beyond what was already granted and the cached label is a subset
         * of the task label.
         *
         * Note: the unconfined(flabel) test is to handle file
         *       delegation from unconfined tasks
         */
        denied = request & ~fctx->allow;
        skip = unconfined(label) || unconfined(flabel) ||
               (!denied && aa_label_is_subset(flabel, label));
        if (!skip)
                /* take a counted reference before leaving the RCU section */
                flabel = aa_get_newest_label(flabel);
        rcu_read_unlock();

        if (skip)
                return 0;

        /* TODO: label cross check */

        if (file->f_path.mnt && path_mediated_fs(file->f_path.dentry))
                error = __file_path_perm(op, subj_cred, label, flabel, file,
                                         request, denied, in_atomic);
        else if (S_ISSOCK(file_inode(file)->i_mode))
                error = __file_sock_perm(op, subj_cred, label, flabel, file,
                                         request, denied);

        aa_put_label(flabel);
        return error;
}

/*
 * revalidate_tty - recheck access to the current tty for @label
 *
 * Looks at the first entry on the current tty's tty_files list (if any) and
 * runs aa_file_perm() for MAY_READ | MAY_WRITE on its file while holding
 * tty->files_lock.  If that check fails, no_tty() is called after the lock
 * and the tty reference have been released.
 */
static void revalidate_tty(const struct cred *subj_cred, struct aa_label *label)
{
        struct tty_struct *tty = get_current_tty();
        bool drop_tty = false;

        if (!tty)
                return;

        spin_lock(&tty->files_lock);
        if (!list_empty(&tty->tty_files)) {
                struct tty_file_private *priv;

                /* TODO: Revalidate access to controlling tty. */
                priv = list_first_entry(&tty->tty_files,
                                        struct tty_file_private, list);
                drop_tty = aa_file_perm(OP_INHERIT, subj_cred, label,
                                        priv->file, MAY_READ | MAY_WRITE,
                                        IN_ATOMIC) != 0;
        }
        spin_unlock(&tty->files_lock);
        tty_kref_put(tty);

        if (drop_tty)
                no_tty();
}

/*
 * Cookie handed to match_file() through iterate_fd(): the cred and label
 * that inherited files are revalidated against.
 */
struct cred_label {
        const struct cred *cred;        /* passed as subj_cred to aa_file_perm() */
        struct aa_label *label;         /* label being enforced */
};

static int match_file(const void *p, struct file *file, unsigned int fd)
{
        struct cred_label *cl = (struct cred_label *)p;

        if (aa_file_perm(OP_INHERIT, cl->cred, cl->label, file,
                         aa_map_file_to_perms(file), IN_ATOMIC))
                return fd + 1;
        return 0;
}


/*
 * aa_inherit_files - revalidate inherited open files against @cred's label
 * @cred: cred whose newest label is enforced
 * @files: descriptor table to walk
 *
 * The current tty is revalidated first.  Every descriptor for which
 * match_file() reports a denial is then replaced with an O_RDWR open of
 * aa_null; if that open fails, replace_fd() is called with a NULL file
 * instead.
 *
 * based on selinux's flush_unauthorized_files
 */
void aa_inherit_files(const struct cred *cred, struct files_struct *files)
{
        struct aa_label *label = aa_get_newest_cred_label(cred);
        struct cred_label cl = {
                .cred = cred,
                .label = label,
        };
        struct file *devnull;
        unsigned int n;

        revalidate_tty(cred, label);

        /* Revalidate access to inherited open files. */
        n = iterate_fd(files, 0, match_file, &cl);
        if (n) {
                devnull = dentry_open(&aa_null, O_RDWR, cred);
                if (IS_ERR(devnull))
                        devnull = NULL;

                /* replace all the matching ones with this */
                while (n) {
                        /* match_file() returned fd + 1 */
                        replace_fd(n - 1, devnull, 0);
                        n = iterate_fd(files, n, match_file, &cl);
                }

                if (devnull)
                        fput(devnull);
        }

        aa_put_label(label);
}









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Portions Copyright (C) 1992 Drew Eckhardt
 */
#ifndef _LINUX_BLKDEV_H
#define _LINUX_BLKDEV_H

#include <linux/types.h>
#include <linux/blk_types.h>
#include <linux/device.h>
#include <linux/list.h>
#include <linux/llist.h>
#include <linux/minmax.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/wait.h>
#include <linux/bio.h>
#include <linux/gfp.h>
#include <linux/kdev_t.h>
#include <linux/rcupdate.h>
#include <linux/percpu-refcount.h>
#include <linux/blkzoned.h>
#include <linux/sched.h>
#include <linux/sbitmap.h>
#include <linux/uuid.h>
#include <linux/xarray.h>
#include <linux/file.h>

struct module;
struct request_queue;
struct elevator_queue;
struct blk_trace;
struct request;
struct sg_io_hdr;
struct blkcg_gq;
struct blk_flush_queue;
struct kiocb;
struct pr_ops;
struct rq_qos;
struct blk_queue_stats;
struct blk_stat_callback;
struct blk_crypto_profile;

extern const struct device_type disk_type;
extern const struct device_type part_type;
extern const struct class block_class;

/*
 * Maximum number of blkcg policies allowed to be registered concurrently.
 * Defined here to simplify include dependency.
 */
#define BLKCG_MAX_POLS                6

/* NOTE(review): presumably the upper bound on partitions per disk - confirm */
#define DISK_MAX_PARTS                        256
/* size of the gendisk.disk_name buffer */
#define DISK_NAME_LEN                        32

/* size of the partition_meta_info.volname buffer */
#define PARTITION_META_INFO_VOLNAMELTH        64
/*
 * Enough for the string representation of any kind of UUID plus the
 * terminating NUL byte.
 * EFI UUID is 36 characters. MSDOS UUID is 11 characters.
 */
#define PARTITION_META_INFO_UUIDLTH        (UUID_STRING_LEN + 1)

/* Per-partition metadata: a UUID string and a volume name, both fixed size. */
struct partition_meta_info {
        /* UUID in string form; sized for UUID_STRING_LEN characters plus one */
        char uuid[PARTITION_META_INFO_UUIDLTH];
        /* volume name bytes; NOTE(review): encoding/termination not visible here */
        u8 volname[PARTITION_META_INFO_VOLNAMELTH];
};

/**
 * DOC: genhd capability flags
 *
 * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to
 * removable media.  When set, the device remains present even when media is not
 * inserted.  Shall not be set for devices which are removed entirely when the
 * media is removed.
 *
 * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events,
 * doesn't appear in sysfs, and can't be opened from userspace or using
 * blkdev_get*. Used for the underlying components of multipath devices.
 *
 * ``GENHD_FL_NO_PART``: partition support is disabled.  The kernel will not
 * scan for partitions from add_disk, and users can't add partitions manually.
 *
 */
/* Bit values tested against gendisk.flags, e.g. in disk_has_partscan(). */
enum {
        GENHD_FL_REMOVABLE                        = 1 << 0,
        GENHD_FL_HIDDEN                                = 1 << 1,
        GENHD_FL_NO_PART                        = 1 << 2,
};

/*
 * Disk event bits.
 * NOTE(review): presumably the values stored in gendisk.events - confirm.
 */
enum {
        DISK_EVENT_MEDIA_CHANGE                        = 1 << 0, /* media changed */
        DISK_EVENT_EJECT_REQUEST                = 1 << 1, /* eject requested */
};

/*
 * Flags controlling how disk events are processed.
 * NOTE(review): presumably the values stored in gendisk.event_flags - confirm.
 */
enum {
        /* Poll even if events_poll_msecs is unset */
        DISK_EVENT_FLAG_POLL                        = 1 << 0,
        /* Forward events to udev */
        DISK_EVENT_FLAG_UEVENT                        = 1 << 1,
        /* Block event polling when open for exclusive write */
        DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE        = 1 << 2,
};

struct disk_events;
struct badblocks;

/*
 * Integrity configuration: a pointer to an integrity profile plus five
 * byte-sized parameters.
 * NOTE(review): the field meanings below are inferred from the names only;
 * verify against the integrity code before relying on them.
 */
struct blk_integrity {
        const struct blk_integrity_profile        *profile;
        unsigned char                                flags;
        unsigned char                                tuple_size;        /* presumably bytes per integrity tuple */
        unsigned char                                pi_offset;        /* presumably offset of the PI within a tuple */
        unsigned char                                interval_exp;        /* presumably log2 of the protection interval */
        unsigned char                                tag_size;
};

/* Bitmask of BLK_OPEN_* flags describing how a block device is opened. */
typedef unsigned int __bitwise blk_mode_t;

/* open for reading */
#define BLK_OPEN_READ                ((__force blk_mode_t)(1 << 0))
/* open for writing */
#define BLK_OPEN_WRITE                ((__force blk_mode_t)(1 << 1))
/* open exclusively (vs other exclusive openers) */
#define BLK_OPEN_EXCL                ((__force blk_mode_t)(1 << 2))
/* opened with O_NDELAY */
#define BLK_OPEN_NDELAY                ((__force blk_mode_t)(1 << 3))
/* open for "writes" only for ioctls (special hack for floppy.c) */
#define BLK_OPEN_WRITE_IOCTL        ((__force blk_mode_t)(1 << 4))
/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */
#define BLK_OPEN_RESTRICT_WRITES        ((__force blk_mode_t)(1 << 5))
/* return partition scanning errors */
#define BLK_OPEN_STRICT_SCAN        ((__force blk_mode_t)(1 << 6))

/*
 * struct gendisk - per-disk state of the block layer
 *
 * Holds the device numbers and name, the partition table, the request
 * queue and driver operations, open bookkeeping, and optional zoned and
 * CD-ROM state that is only present with the corresponding config options.
 */
struct gendisk {
        /*
         * major/first_minor/minors should not be set by any new driver, the
         * block core will take care of allocating them automatically.
         */
        int major;
        int first_minor;
        int minors;

        char disk_name[DISK_NAME_LEN];        /* name of major driver */

        unsigned short events;                /* supported events */
        unsigned short event_flags;        /* flags related to event processing */

        struct xarray part_tbl;
        /* whole-disk device: disk_openers() and disk_to_dev() go through it */
        struct block_device *part0;

        const struct block_device_operations *fops;
        struct request_queue *queue;
        void *private_data;

        struct bio_set bio_split;

        /* GENHD_FL_* bits, see disk_has_partscan() */
        int flags;
        /* bitmap indexed by the GD_* bit numbers below (used with test_bit()) */
        unsigned long state;
#define GD_NEED_PART_SCAN                0
#define GD_READ_ONLY                        1
#define GD_DEAD                                2
#define GD_NATIVE_CAPACITY                3
#define GD_ADDED                        4
#define GD_SUPPRESS_PART_SCAN                5
#define GD_OWNS_QUEUE                        6

        struct mutex open_mutex;        /* open/close mutex */
        unsigned open_partitions;        /* number of open partitions */

        struct backing_dev_info        *bdi;
        struct kobject queue_kobj;        /* the queue/ directory */
        struct kobject *slave_dir;
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
        struct list_head slave_bdevs;
#endif
        struct timer_rand_state *random;
        atomic_t sync_io;                /* RAID */
        struct disk_events *ev;

#ifdef CONFIG_BLK_DEV_ZONED
        /*
         * Zoned block device information. Reads of this information must be
         * protected with blk_queue_enter() / blk_queue_exit(). Modifying this
         * information is only allowed while no requests are being processed.
         * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue().
         */
        unsigned int                nr_zones;
        unsigned int                zone_capacity;
        unsigned int                last_zone_capacity;
        unsigned long                *conv_zones_bitmap;
        unsigned int            zone_wplugs_hash_bits;
        spinlock_t              zone_wplugs_lock;
        struct mempool_s        *zone_wplugs_pool;
        struct hlist_head       *zone_wplugs_hash;
        struct list_head        zone_wplugs_err_list;
        struct work_struct        zone_wplugs_work;
        struct workqueue_struct *zone_wplugs_wq;
#endif /* CONFIG_BLK_DEV_ZONED */

#if IS_ENABLED(CONFIG_CDROM)
        /* read through the disk_to_cdi() accessor */
        struct cdrom_device_info *cdi;
#endif
        int node_id;
        struct badblocks *bb;
        struct lockdep_map lockdep_map;
        u64 diskseq;
        blk_mode_t open_mode;

        /*
         * Independent sector access ranges. This is always NULL for
         * devices that do not have multiple independent access ranges.
         */
        struct blk_independent_access_ranges *ia_ranges;
};

/**
 * disk_openers - returns how many openers are there for a disk
 * @disk: disk to check
 *
 * This returns the number of openers for a disk.  Note that this value is only
 * stable if disk->open_mutex is held.
 *
 * Note: Due to a quirk in the block layer open code, each open partition is
 * only counted once even if there are multiple openers.
 */
static inline unsigned int disk_openers(struct gendisk *disk)
{
        return atomic_read(&disk->part0->bd_openers);
}

/**
 * disk_has_partscan - return %true if partition scanning is enabled on a disk
 * @disk: disk to check
 *
 * Scanning is disabled when either GENHD_FL_NO_PART or GENHD_FL_HIDDEN is
 * set in @disk->flags, or while the GD_SUPPRESS_PART_SCAN state bit is set.
 *
 * Returns %true if partitions scanning is enabled for @disk, or %false if
 * partition scanning is disabled either permanently or temporarily.
 */
static inline bool disk_has_partscan(struct gendisk *disk)
{
        /* flag based: no partition support, or a hidden disk */
        if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN))
                return false;

        /* state based suppression */
        return !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
}

/*
 * The gendisk is refcounted by the part0 block_device, and the bd_device
 * therein is also used for device model presentation in sysfs.
 *
 * dev_to_disk() maps a struct device to its gendisk through dev_to_bdev();
 * disk_to_dev() yields the address of the bd_device embedded in part0.
 */
#define dev_to_disk(device) \
        (dev_to_bdev(device)->bd_disk)
#define disk_to_dev(disk) \
        (&((disk)->part0->bd_device))

/*
 * disk_to_cdi() - the disk's cdrom_device_info pointer, or NULL when
 * IS_REACHABLE(CONFIG_CDROM) is false for the code using this header.
 */
#if IS_REACHABLE(CONFIG_CDROM)
#define disk_to_cdi(disk)        ((disk)->cdi)
#else
#define disk_to_cdi(disk)        NULL
#endif

/* Device number of @disk, built with MKDEV() from its major and first minor. */
static inline dev_t disk_devt(struct gendisk *disk)
{
        const int major = disk->major;
        const int minor = disk->first_minor;

        return MKDEV(major, minor);
}

/*
 * blk_validate_block_size - check a block size
 * @bsize: candidate block size
 *
 * A size is accepted when it is at least 512, at most PAGE_SIZE, and a
 * power of two.
 *
 * Returns: 0 if @bsize is acceptable, -EINVAL otherwise
 */
static inline int blk_validate_block_size(unsigned long bsize)
{
        /* range check first, then the power-of-two requirement */
        if (bsize < 512)
                return -EINVAL;
        if (bsize > PAGE_SIZE)
                return -EINVAL;
        if (!is_power_of_2(bsize))
                return -EINVAL;

        return 0;
}

/*
 * Return true if the operation part of @op (the bits under REQ_OP_MASK) is
 * REQ_OP_DRV_IN or REQ_OP_DRV_OUT.
 */
static inline bool blk_op_is_passthrough(blk_opf_t op)
{
        const blk_opf_t req_op = op & REQ_OP_MASK;

        if (req_op == REQ_OP_DRV_IN)
                return true;

        return req_op == REQ_OP_DRV_OUT;
}

/*
 * Bounce policy stored in queue_limits.bounce.
 *
 * BLK_BOUNCE_NONE:        never bounce (default)
 * BLK_BOUNCE_HIGH:        bounce all highmem pages
 */
enum blk_bounce {
        BLK_BOUNCE_NONE,        /* first enumerator, so its value is 0 */
        BLK_BOUNCE_HIGH,
};

/*
 * struct queue_limits - the set of limits attached to a request queue.
 *
 * An instance is embedded in struct request_queue as ->limits, and
 * queue_limits_start_update() hands out a by-value copy of it.
 */
struct queue_limits {
        enum blk_bounce                bounce;
        unsigned long                seg_boundary_mask;
        unsigned long                virt_boundary_mask;

        unsigned int                max_hw_sectors;
        unsigned int                max_dev_sectors;
        unsigned int                chunk_sectors;
        unsigned int                max_sectors;
        unsigned int                max_user_sectors;
        unsigned int                max_segment_size;
        unsigned int                physical_block_size;
        unsigned int                logical_block_size;
        unsigned int                alignment_offset;
        unsigned int                io_min;
        unsigned int                io_opt;
        unsigned int                max_discard_sectors;
        unsigned int                max_hw_discard_sectors;
        unsigned int                max_user_discard_sectors;
        unsigned int                max_secure_erase_sectors;
        unsigned int                max_write_zeroes_sectors;
        unsigned int                max_zone_append_sectors;
        unsigned int                discard_granularity;
        unsigned int                discard_alignment;
        unsigned int                zone_write_granularity;

        unsigned short                max_segments;
        unsigned short                max_integrity_segments;
        unsigned short                max_discard_segments;

        unsigned char                misaligned;
        unsigned char                discard_misaligned;
        unsigned char                raid_partial_stripes_expensive;
        /* zoned is what blk_queue_is_zoned() tests */
        bool                        zoned;
        unsigned int                max_open_zones;
        unsigned int                max_active_zones;

        /*
         * Drivers that set dma_alignment to less than 511 must be prepared to
         * handle individual bvec's that are not a multiple of a SECTOR_SIZE
         * due to possible offsets.
         */
        unsigned int                dma_alignment;
};

/*
 * Callback type taken by blkdev_report_zones(): it receives a zone, an
 * index and the opaque @data pointer supplied by the caller.
 */
typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
                               void *data);

void disk_set_zoned(struct gendisk *disk);

/* All-ones unsigned int; presumably passed as nr_zones to mean "every zone". */
#define BLK_ALL_ZONES  ((unsigned int)-1)
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
                unsigned int nr_zones, report_zones_cb cb, void *data);
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
                sector_t sectors, sector_t nr_sectors);
int blk_revalidate_disk_zones(struct gendisk *disk);

/*
 * Independent access ranges: struct blk_independent_access_range describes
 * a range of contiguous sectors that can be accessed using device command
 * execution resources that are independent from the resources used for
 * other access ranges. This is typically found with single-LUN multi-actuator
 * HDDs where each access range is served by a different set of heads.
 * The set of independent ranges supported by the device is defined using
 * struct blk_independent_access_ranges. The independent ranges must not overlap
 * and must include all sectors within the disk capacity (no sector holes
 * allowed).
 * For a device with multiple ranges, requests targeting sectors in different
 * ranges can be executed in parallel. A request can straddle an access range
 * boundary.
 */
struct blk_independent_access_range {
        struct kobject                kobj;
        /* presumably the first sector and the length of the range - confirm */
        sector_t                sector;
        sector_t                nr_sectors;
};

struct blk_independent_access_ranges {
        struct kobject                                kobj;
        bool                                        sysfs_registered;
        /* presumably the number of entries in ia_range[] - confirm */
        unsigned int                                nr_ia_ranges;
        /* flexible array member: must stay last */
        struct blk_independent_access_range        ia_range[];
};

/*
 * struct request_queue - block layer queue state.
 *
 * Several members exist only when the matching CONFIG_* option is enabled
 * (integrity, PM, inline encryption, blk-cgroup, I/O tracing, throttling),
 * so the layout of this structure depends on the kernel configuration.
 */
struct request_queue {
        /*
         * The queue owner gets to use this for whatever they like.
         * ll_rw_blk doesn't touch it.
         */
        void                        *queuedata;

        struct elevator_queue        *elevator;

        const struct blk_mq_ops        *mq_ops;

        /* sw queues */
        struct blk_mq_ctx __percpu        *queue_ctx;

        /*
         * various queue flags, see QUEUE_* below
         */
        unsigned long                queue_flags;

        unsigned int                rq_timeout;

        unsigned int                queue_depth;

        refcount_t                refs;

        /* hw dispatch queues */
        unsigned int                nr_hw_queues;
        struct xarray                hctx_table;

        struct percpu_ref        q_usage_counter;

        struct request                *last_merge;

        spinlock_t                queue_lock;

        int                        quiesce_depth;

        struct gendisk                *disk;

        /*
         * mq queue kobject
         */
        struct kobject *mq_kobj;

        struct queue_limits        limits;

#ifdef  CONFIG_BLK_DEV_INTEGRITY
        struct blk_integrity integrity;
#endif        /* CONFIG_BLK_DEV_INTEGRITY */

#ifdef CONFIG_PM
        struct device                *dev;
        enum rpm_status                rpm_status;
#endif

        /*
         * Number of contexts that have called blk_set_pm_only(). If this
         * counter is above zero then only RQF_PM requests are processed.
         */
        atomic_t                pm_only;

        struct blk_queue_stats        *stats;
        struct rq_qos                *rq_qos;
        struct mutex                rq_qos_mutex;

        /*
         * ida allocated id for this queue.  Used to index queues from
         * ioctx.
         */
        int                        id;

        unsigned int                dma_pad_mask;

        /*
         * queue settings
         */
        unsigned long                nr_requests;        /* Max # of requests */

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
        struct blk_crypto_profile *crypto_profile;
        struct kobject *crypto_kobject;
#endif

        struct timer_list        timeout;
        struct work_struct        timeout_work;

        atomic_t                nr_active_requests_shared_tags;

        struct blk_mq_tags        *sched_shared_tags;

        struct list_head        icq_list;
#ifdef CONFIG_BLK_CGROUP
        DECLARE_BITMAP                (blkcg_pols, BLKCG_MAX_POLS);
        struct blkcg_gq                *root_blkg;
        struct list_head        blkg_list;
        struct mutex                blkcg_mutex;
#endif

        int                        node;

        spinlock_t                requeue_lock;
        struct list_head        requeue_list;
        struct delayed_work        requeue_work;

#ifdef CONFIG_BLK_DEV_IO_TRACE
        struct blk_trace __rcu        *blk_trace;
#endif
        /*
         * for flush operations
         */
        struct blk_flush_queue        *fq;
        struct list_head        flush_list;

        struct mutex                sysfs_lock;
        struct mutex                sysfs_dir_lock;
        /* taken by queue_limits_start_update(), dropped on cancel */
        struct mutex                limits_lock;

        /*
         * for reusing dead hctx instance in case of updating
         * nr_hw_queues
         */
        struct list_head        unused_hctx_list;
        spinlock_t                unused_hctx_lock;

        int                        mq_freeze_depth;

#ifdef CONFIG_BLK_DEV_THROTTLING
        /* Throttle data */
        struct throtl_data *td;
#endif
        struct rcu_head                rcu_head;
        wait_queue_head_t        mq_freeze_wq;
        /*
         * Protect concurrent access to q_usage_counter by
         * percpu_ref_kill() and percpu_ref_reinit().
         */
        struct mutex                mq_freeze_lock;

        struct blk_mq_tag_set        *tag_set;
        struct list_head        tag_set_list;

        struct dentry                *debugfs_dir;
        struct dentry                *sched_debugfs_dir;
        struct dentry                *rqos_debugfs_dir;
        /*
         * Serializes all debugfs metadata operations using the above dentries.
         */
        struct mutex                debugfs_mutex;

        bool                        mq_sysfs_init_done;
};

/*
 * Keep blk_queue_flag_name[] in sync with the definitions below.
 *
 * The values are bit numbers within request_queue.queue_flags (they are
 * used with test_bit() and as shift counts below).  Bits 2, 8, 21 and 23
 * have no definition here, and QUEUE_FLAG_VIRT is an alias for
 * QUEUE_FLAG_NONROT rather than a bit of its own.
 */
#define QUEUE_FLAG_STOPPED        0        /* queue is stopped */
#define QUEUE_FLAG_DYING        1        /* queue being torn down */
#define QUEUE_FLAG_NOMERGES     3        /* disable merge attempts */
#define QUEUE_FLAG_SAME_COMP        4        /* complete on same CPU-group */
#define QUEUE_FLAG_FAIL_IO        5        /* fake timeout */
#define QUEUE_FLAG_NONROT        6        /* non-rotational device (SSD) */
#define QUEUE_FLAG_VIRT                QUEUE_FLAG_NONROT /* paravirt device */
#define QUEUE_FLAG_IO_STAT        7        /* do disk/partitions IO accounting */
#define QUEUE_FLAG_NOXMERGES        9        /* No extended merges */
#define QUEUE_FLAG_ADD_RANDOM        10        /* Contributes to random pool */
#define QUEUE_FLAG_SYNCHRONOUS        11        /* always completes in submit context */
#define QUEUE_FLAG_SAME_FORCE        12        /* force complete on same CPU */
#define QUEUE_FLAG_HW_WC        13        /* Write back caching supported */
#define QUEUE_FLAG_INIT_DONE        14        /* queue is initialized */
#define QUEUE_FLAG_STABLE_WRITES 15        /* don't modify blks until WB is done */
#define QUEUE_FLAG_POLL                16        /* IO polling enabled if set */
#define QUEUE_FLAG_WC                17        /* Write back caching */
#define QUEUE_FLAG_FUA                18        /* device supports FUA writes */
#define QUEUE_FLAG_DAX                19        /* device supports DAX */
#define QUEUE_FLAG_STATS        20        /* track IO start and completion times */
#define QUEUE_FLAG_REGISTERED        22        /* queue has been registered to a disk */
#define QUEUE_FLAG_QUIESCED        24        /* queue has been quiesced */
#define QUEUE_FLAG_PCI_P2PDMA        25        /* device supports PCI p2p requests */
#define QUEUE_FLAG_ZONE_RESETALL 26        /* supports Zone Reset All */
#define QUEUE_FLAG_RQ_ALLOC_TIME 27        /* record rq->alloc_time_ns */
#define QUEUE_FLAG_HCTX_ACTIVE        28        /* at least one blk-mq hctx is active */
#define QUEUE_FLAG_NOWAIT       29        /* device supports NOWAIT */
#define QUEUE_FLAG_SQ_SCHED     30        /* single queue style io dispatch */
#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE        31 /* quiesce_tagset skip the queue*/

/* Bit mask (not a bit number) combining IO_STAT, SAME_COMP and NOWAIT. */
#define QUEUE_FLAG_MQ_DEFAULT        ((1UL << QUEUE_FLAG_IO_STAT) |                \
                                 (1UL << QUEUE_FLAG_SAME_COMP) |        \
                                 (1UL << QUEUE_FLAG_NOWAIT))

void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);

/*
 * Flag test helpers: apart from blk_noretry_request() and
 * blk_queue_pm_only(), each macro tests one QUEUE_FLAG_* bit in
 * (q)->queue_flags with test_bit().
 */
#define blk_queue_stopped(q)        test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
#define blk_queue_dying(q)        test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
#define blk_queue_init_done(q)        test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
#define blk_queue_nomerges(q)        test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
#define blk_queue_noxmerges(q)        \
        test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
#define blk_queue_nonrot(q)        test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
#define blk_queue_stable_writes(q) \
        test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags)
#define blk_queue_io_stat(q)        test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
#define blk_queue_add_random(q)        test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
#define blk_queue_zone_resetall(q)        \
        test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)
#define blk_queue_dax(q)        test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags)
#define blk_queue_pci_p2pdma(q)        \
        test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags)
/* Constant false unless CONFIG_BLK_RQ_ALLOC_TIME is set. */
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
#define blk_queue_rq_alloc_time(q)        \
        test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags)
#else
#define blk_queue_rq_alloc_time(q)        false
#endif

/* Non-zero when any of the three REQ_FAILFAST_* bits is set in rq->cmd_flags. */
#define blk_noretry_request(rq) \
        ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
                             REQ_FAILFAST_DRIVER))
#define blk_queue_quiesced(q)        test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags)
/* Reads the pm_only counter; this is a count, not a flag bit. */
#define blk_queue_pm_only(q)        atomic_read(&(q)->pm_only)
#define blk_queue_registered(q)        test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
#define blk_queue_sq_sched(q)        test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags)
#define blk_queue_skip_tagset_quiesce(q) \
        test_bit(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, &(q)->queue_flags)

extern void blk_set_pm_only(struct request_queue *q);
extern void blk_clear_pm_only(struct request_queue *q);

/* Get the struct request that embeds the list node @ptr as its queuelist. */
#define list_entry_rq(ptr)        list_entry((ptr), struct request, queuelist)

/* Call dma_map_page_attrs() with the page, offset and length taken from @bv. */
#define dma_map_bvec(dev, bv, dir, attrs) \
        dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \
        (dir), (attrs))

/* Return true when @q has a non-NULL mq_ops pointer. */
static inline bool queue_is_mq(struct request_queue *q)
{
        return !!q->mq_ops;
}

/*
 * Runtime PM status of @q.  Without CONFIG_PM the queue has no rpm_status
 * member and RPM_ACTIVE is reported unconditionally.
 */
static inline enum rpm_status queue_rpm_status(struct request_queue *q)
{
#ifdef CONFIG_PM
        return q->rpm_status;
#else
        return RPM_ACTIVE;
#endif
}

/*
 * Return true when CONFIG_BLK_DEV_ZONED is enabled and the limits of @q are
 * marked as zoned.
 */
static inline bool blk_queue_is_zoned(struct request_queue *q)
{
        const bool zoned_enabled = IS_ENABLED(CONFIG_BLK_DEV_ZONED);

        return zoned_enabled && q->limits.zoned;
}

#ifdef CONFIG_BLK_DEV_ZONED
unsigned int bdev_nr_zones(struct block_device *bdev);

/* Number of zones recorded for @disk, or 0 when its queue is not zoned. */
static inline unsigned int disk_nr_zones(struct gendisk *disk)
{
        if (!blk_queue_is_zoned(disk->queue))
                return 0;

        return disk->nr_zones;
}

/*
 * Zone number containing @sector: @sector shifted right by ilog2() of the
 * queue's chunk_sectors limit.  Returns 0 when the queue is not zoned.
 */
static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector)
{
        return blk_queue_is_zoned(disk->queue) ?
                sector >> ilog2(disk->queue->limits.chunk_sectors) : 0;
}

static inline void disk_set_max_open_zones(struct gendisk *disk,
                unsigned int max_open_zones)
{
        disk->queue->limits.max_open_zones = max_open_zones;
}

static inline void disk_set_max_active_zones(struct gendisk *disk,
                unsigned int max_active_zones)
{
        disk->queue->limits.max_active_zones = max_active_zones;
}

/* max_open_zones limit of the queue behind @bdev's disk. */
static inline unsigned int bdev_max_open_zones(struct block_device *bdev)
{
        const unsigned int max_open =
                bdev->bd_disk->queue->limits.max_open_zones;

        return max_open;
}

/* max_active_zones limit of the queue behind @bdev's disk. */
static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
{
        const unsigned int max_active =
                bdev->bd_disk->queue->limits.max_active_zones;

        return max_active;
}

bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs);
#else /* CONFIG_BLK_DEV_ZONED */
/*
 * Stubs used when CONFIG_BLK_DEV_ZONED is disabled: every zone query
 * returns 0 and blk_zone_plug_bio() returns false.
 */
static inline unsigned int bdev_nr_zones(struct block_device *bdev)
{
        return 0;
}

static inline unsigned int disk_nr_zones(struct gendisk *disk)
{
        return 0;
}
static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector)
{
        return 0;
}
static inline unsigned int bdev_max_open_zones(struct block_device *bdev)
{
        return 0;
}

static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
{
        return 0;
}
static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
        return false;
}
#endif /* CONFIG_BLK_DEV_ZONED */

/*
 * Depth of @q: queue_depth when it is non-zero, otherwise nr_requests
 * (converted to unsigned int).
 */
static inline unsigned int blk_queue_depth(struct request_queue *q)
{
        return q->queue_depth ? q->queue_depth : q->nr_requests;
}

/*
 * default timeout for SG_IO if none specified
 */
#define BLK_DEFAULT_SG_TIMEOUT        (60 * HZ)
#define BLK_MIN_SG_TIMEOUT        (7 * HZ)

/*
 * This should not be used directly - use rq_for_each_segment.
 *
 * The loop has no initialiser and advances @_bio itself via ->bi_next, so
 * @_bio must be a modifiable lvalue and is NULL once the loop has run to
 * completion.
 */
#define for_each_bio(_bio)                \
        for (; _bio; _bio = _bio->bi_next)

int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
                                 const struct attribute_group **groups);
/*
 * Shorthand for device_add_disk() with a NULL parent device and NULL
 * attribute groups; the return value is passed through unchanged.
 */
static inline int __must_check add_disk(struct gendisk *disk)
{
        const int err = device_add_disk(NULL, disk, NULL);

        return err;
}
void del_gendisk(struct gendisk *gp);
void invalidate_disk(struct gendisk *disk);
void set_disk_ro(struct gendisk *disk, bool read_only);
void disk_uevent(struct gendisk *disk, enum kobject_action action);

/* Extract the BD_PARTNO bits from @bdev's flag word. */
static inline u8 bdev_partno(const struct block_device *bdev)
{
        const int bd_flags = atomic_read(&bdev->__bd_flags);

        return bd_flags & BD_PARTNO;
}

/* Return true when any bit of @flag is set in @bdev's flag word. */
static inline bool bdev_test_flag(const struct block_device *bdev, unsigned flag)
{
        return (atomic_read(&bdev->__bd_flags) & flag) != 0;
}

/* Atomically OR the bits of @flag into @bdev's flag word. */
static inline void bdev_set_flag(struct block_device *bdev, unsigned flag)
{
        atomic_or(flag, &bdev->__bd_flags);
}

/* Atomically clear the bits of @flag in @bdev's flag word. */
static inline void bdev_clear_flag(struct block_device *bdev, unsigned flag)
{
        atomic_andnot(flag, &bdev->__bd_flags);
}

/*
 * Return 1 when @disk is read-only, i.e. BD_READ_ONLY is set on its part0
 * block device or GD_READ_ONLY is set in the disk state; 0 otherwise.
 */
static inline int get_disk_ro(struct gendisk *disk)
{
        if (bdev_test_flag(disk->part0, BD_READ_ONLY))
                return 1;

        return !!test_bit(GD_READ_ONLY, &disk->state);
}

/*
 * Return 1 when @bdev itself carries BD_READ_ONLY or its disk is read-only
 * according to get_disk_ro(); 0 otherwise.
 */
static inline int bdev_read_only(struct block_device *bdev)
{
        if (bdev_test_flag(bdev, BD_READ_ONLY))
                return 1;

        return !!get_disk_ro(bdev->bd_disk);
}

bool set_capacity_and_notify(struct gendisk *disk, sector_t size);
void disk_force_media_change(struct gendisk *disk);
void bdev_mark_dead(struct block_device *bdev, bool surprise);

void add_disk_randomness(struct gendisk *disk) __latent_entropy;
void rand_initialize_disk(struct gendisk *disk);

/* Start sector recorded for @bdev (bd_start_sect). */
static inline sector_t get_start_sect(struct block_device *bdev)
{
        const sector_t start = bdev->bd_start_sect;

        return start;
}

/* Size of @bdev in sectors (bd_nr_sectors). */
static inline sector_t bdev_nr_sectors(struct block_device *bdev)
{
        const sector_t nr_sectors = bdev->bd_nr_sectors;

        return nr_sectors;
}

static inline loff_t bdev_nr_bytes(struct block_device *bdev)
{
        return (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT;
}

/* Capacity of @disk in sectors, taken from its part0 block device. */
static inline sector_t get_capacity(struct gendisk *disk)
{
        const sector_t capacity = bdev_nr_sectors(disk->part0);

        return capacity;
}

/*
 * Size of @sb's block device expressed in blocks of the superblock's block
 * size: the sector count shifted right by (s_blocksize_bits - SECTOR_SHIFT).
 */
static inline u64 sb_bdev_nr_blocks(struct super_block *sb)
{
        const int sectors_per_block_shift = sb->s_blocksize_bits - SECTOR_SHIFT;

        return bdev_nr_sectors(sb->s_bdev) >> sectors_per_block_shift;
}

int bdev_disk_changed(struct gendisk *disk, bool invalidate);

void put_disk(struct gendisk *disk);
struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node,
                struct lock_class_key *lkclass);

/**
 * blk_alloc_disk - allocate a gendisk structure
 * @lim: queue limits to be used for this disk.
 * @node_id: numa node to allocate on
 *
 * Allocate and pre-initialize a gendisk structure for use with BIO based
 * drivers.
 *
 * This is a macro so that every expansion site gets its own static
 * lock_class_key, which is handed to __blk_alloc_disk() together with @lim
 * and @node_id.
 *
 * Returns an ERR_PTR on error, else the allocated disk.
 *
 * Context: can sleep
 */
#define blk_alloc_disk(lim, node_id)                                        \
({                                                                        \
        static struct lock_class_key __key;                                \
                                                                        \
        __blk_alloc_disk(lim, node_id, &__key);                                \
})

int __register_blkdev(unsigned int major, const char *name,
                void (*probe)(dev_t devt));
#define register_blkdev(major, name) \
        __register_blkdev(major, name, NULL)
void unregister_blkdev(unsigned int major, const char *name);

bool disk_check_media_change(struct gendisk *disk);
void set_capacity(struct gendisk *disk, sector_t size);

/*
 * Without CONFIG_BLOCK_HOLDER_DEPRECATED both helpers are stubs:
 * bd_link_disk_holder() returns 0 and bd_unlink_disk_holder() does nothing.
 */
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk);
#else
static inline int bd_link_disk_holder(struct block_device *bdev,
                                      struct gendisk *disk)
{
        return 0;
}
static inline void bd_unlink_disk_holder(struct block_device *bdev,
                                         struct gendisk *disk)
{
}
#endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */

dev_t part_devt(struct gendisk *disk, u8 partno);
void inc_diskseq(struct gendisk *disk);
void blk_request_module(dev_t devt);

extern int blk_register_queue(struct gendisk *disk);
extern void blk_unregister_queue(struct gendisk *disk);
void submit_bio_noacct(struct bio *bio);
struct bio *bio_split_to_limits(struct bio *bio);

extern int blk_lld_busy(struct request_queue *q);
extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
extern void blk_queue_exit(struct request_queue *q);
extern void blk_sync_queue(struct request_queue *q);

/* Helper to convert REQ_OP_XXX to its string format XXX */
extern const char *blk_op_str(enum req_op op);

int blk_status_to_errno(blk_status_t status);
blk_status_t errno_to_blk_status(int errno);
const char *blk_status_to_str(blk_status_t status);

/* only poll the hardware once, don't continue until a completion was found */
#define BLK_POLL_ONESHOT                (1 << 0)
int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags);
int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
                        unsigned int flags);

/* Request queue of @bdev (its bd_queue pointer). */
static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
{
        struct request_queue *q = bdev->bd_queue;

        return q;        /* this is never NULL */
}

/* Helper to convert BLK_ZONE_COND_XXX to its string format XXX */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond);

/* Zone number of the sector at which @bio currently starts. */
static inline unsigned int bio_zone_no(struct bio *bio)
{
        const unsigned int zno =
                disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);

        return zno;
}

/*
 * Return true when @bio has a non-zero size and the zone of its first
 * sector differs from the zone of its last sector.
 */
static inline bool bio_straddles_zones(struct bio *bio)
{
        /* An empty bio has no last sector to compare against. */
        if (!bio_sectors(bio))
                return false;

        return bio_zone_no(bio) !=
                disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1);
}

/*
 * Return how many sectors remain in the chunk that contains @offset, for
 * chunks of @chunk_sectors sectors.  A power-of-two chunk size is handled
 * with a mask; any other size goes through sector_div().
 */
static inline unsigned int blk_chunk_sectors_left(sector_t offset,
                unsigned int chunk_sectors)
{
        unsigned int used;        /* sectors already consumed in this chunk */

        if (unlikely(!is_power_of_2(chunk_sectors)))
                used = sector_div(offset, chunk_sectors);
        else
                used = offset & (chunk_sectors - 1);

        return chunk_sectors - used;
}

/**
 * queue_limits_start_update - start an atomic update of queue limits
 * @q:                queue to update
 *
 * This function starts an atomic update of the queue limits.  It takes
 * q->limits_lock to prevent other updates and returns a by-value snapshot of
 * the current limits that the caller can modify.  The caller must call
 * queue_limits_commit_update() to finish the update.
 *
 * Context: process context.  The caller must have frozen the queue or ensured
 * that there is no outstanding I/O by other means.
 */
static inline struct queue_limits
queue_limits_start_update(struct request_queue *q)
        __acquires(q->limits_lock)
{
        /* The lock is still held on return; see the kernel-doc above. */
        mutex_lock(&q->limits_lock);
        return q->limits;
}
int queue_limits_commit_update(struct request_queue *q,
                struct queue_limits *lim);
int queue_limits_set(struct request_queue *q, struct queue_limits *lim);

/**
 * queue_limits_cancel_update - cancel an atomic update of queue limits
 * @q:                queue to update
 *
 * This function cancels an atomic update of the queue limits started by
 * queue_limits_start_update() and should be used when an error occurs after
 * starting the update.  It only drops q->limits_lock.
 */
static inline void queue_limits_cancel_update(struct request_queue *q)
{
        mutex_unlock(&q->limits_lock);
}

/*
 * Access functions for manipulating queue properties
 */
extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int);
void blk_queue_max_secure_erase_sectors(struct request_queue *q,
                unsigned int max_sectors);
extern void blk_queue_max_discard_sectors(struct request_queue *q,
                unsigned int max_discard_sectors);
extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
                unsigned int max_write_same_sectors);
extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
                unsigned int max_zone_append_sectors);
extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
void blk_queue_zone_write_granularity(struct request_queue *q,
                                      unsigned int size);
extern void blk_queue_alignment_offset(struct request_queue *q,
                                       unsigned int alignment);
void disk_update_readahead(struct gendisk *disk);
extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
extern void blk_set_stacking_limits(struct queue_limits *lim);
extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
                            sector_t offset);
void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev,
                sector_t offset, const char *pfx);
extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);

struct blk_independent_access_ranges *
disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges);
void disk_set_independent_access_ranges(struct gendisk *disk,
                                struct blk_independent_access_ranges *iars);

bool __must_check blk_get_queue(struct request_queue *);
extern void blk_put_queue(struct request_queue *);

void blk_mark_disk_dead(struct gendisk *disk);

#ifdef CONFIG_BLOCK
/*
 * blk_plug permits building a queue of related requests by holding the I/O
 * fragments for a short period. This allows merging of sequential requests
 * into single larger request. As the requests are moved from a per-task list to
 * the device's request_queue in a batch, this results in improved scalability
 * as the lock contention for request_queue lock is reduced.
 *
 * It is ok not to disable preemption when adding the request to the plug list
 * or when attempting a merge. For details, please see schedule() where
 * blk_flush_plug() is called.
 */
struct blk_plug {
        struct request *mq_list; /* blk-mq requests */

        /*
         * if ios_left is > 1, we can batch tag/rq allocations
         *
         * NOTE(review): this struct has no "ios_left" member; the comment
         * presumably refers to nr_ios below - confirm.
         */
        struct request *cached_rq;
        /* reset to 0 by blk_plug_invalidate_ts() */
        u64 cur_ktime;
        unsigned short nr_ios;

        unsigned short rq_count;

        bool multiple_queues;
        bool has_elevator;

        struct list_head cb_list; /* md requires an unplug callback */
};

/*
 * Plug callback: a list node, the function to call and an opaque data
 * pointer.  The callback receives the blk_plug_cb itself plus a bool.
 */
struct blk_plug_cb;
typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool);
struct blk_plug_cb {
        struct list_head list;
        blk_plug_cb_fn callback;
        void *data;
};
extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug,
                                             void *data, int size);
extern void blk_start_plug(struct blk_plug *);
extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short);
extern void blk_finish_plug(struct blk_plug *);

void __blk_flush_plug(struct blk_plug *plug, bool from_schedule);
/* Flush @plug through __blk_flush_plug(); a NULL @plug is a no-op. */
static inline void blk_flush_plug(struct blk_plug *plug, bool async)
{
        if (!plug)
                return;

        __blk_flush_plug(plug, async);
}

/*
 * Invalidate the timestamp cached in @tsk's plug (if it has one) and clear
 * PF_BLOCK_TS in @tsk's flags.
 *
 * tsk == current here.  The task is accessed only through @tsk so that the
 * plug and the flag always belong to the same task.
 */
static inline void blk_plug_invalidate_ts(struct task_struct *tsk)
{
        struct blk_plug *plug = tsk->plug;

        if (plug)
                plug->cur_ktime = 0;
        tsk->flags &= ~PF_BLOCK_TS;
}

int blkdev_issue_flush(struct block_device *bdev);
long nr_blockdev_pages(void);
#else /* CONFIG_BLOCK */
/*
 * !CONFIG_BLOCK: struct blk_plug is empty, the plug helpers do nothing, and
 * blkdev_issue_flush() and nr_blockdev_pages() return 0.
 */
struct blk_plug {
};

static inline void blk_start_plug_nr_ios(struct blk_plug *plug,
                                         unsigned short nr_ios)
{
}

static inline void blk_start_plug(struct blk_plug *plug)
{
}

static inline void blk_finish_plug(struct blk_plug *plug)
{
}

static inline void blk_flush_plug(struct blk_plug *plug, bool async)
{
}

static inline void blk_plug_invalidate_ts(struct task_struct *tsk)
{
}

static inline int blkdev_issue_flush(struct block_device *bdev)
{
        return 0;
}

static inline long nr_blockdev_pages(void)
{
        return 0;
}
#endif /* CONFIG_BLOCK */

extern void blk_io_schedule(void);

int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask);
int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, struct bio **biop);
int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp);

#define BLKDEV_ZERO_NOUNMAP        (1 << 0)  /* do not free blocks */
#define BLKDEV_ZERO_NOFALLBACK        (1 << 1)  /* don't write explicit zeroes */

extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
                unsigned flags);
extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, unsigned flags);

/*
 * Discard @nr_blocks filesystem blocks starting at @block on @sb's block
 * device.  Block numbers are converted to sectors by shifting with
 * (s_blocksize_bits - SECTOR_SHIFT).  @flags is accepted but not used here.
 */
static inline int sb_issue_discard(struct super_block *sb, sector_t block,
                sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
{
        const int blk_to_sect_shift = sb->s_blocksize_bits - SECTOR_SHIFT;
        const sector_t first_sect = block << blk_to_sect_shift;
        const sector_t nr_sects = nr_blocks << blk_to_sect_shift;

        return blkdev_issue_discard(sb->s_bdev, first_sect, nr_sects,
                                    gfp_mask);
}
/*
 * Zero out @nr_blocks filesystem blocks starting at @block on @sb's block
 * device, with no BLKDEV_ZERO_* flags.  Block numbers are converted to
 * sectors by shifting with (s_blocksize_bits - SECTOR_SHIFT).
 */
static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
                sector_t nr_blocks, gfp_t gfp_mask)
{
        const int blk_to_sect_shift = sb->s_blocksize_bits - SECTOR_SHIFT;
        const sector_t first_sect = block << blk_to_sect_shift;
        const sector_t nr_sects = nr_blocks << blk_to_sect_shift;

        return blkdev_issue_zeroout(sb->s_bdev, first_sect, nr_sects,
                                    gfp_mask, 0);
}

/* Return true when @bdev has a non-zero partition number. */
static inline bool bdev_is_partition(struct block_device *bdev)
{
        return !!bdev_partno(bdev);
}

/* Default values for queue limits (going by the enum name - confirm users). */
enum blk_default_limits {
        BLK_MAX_SEGMENTS        = 128,
        BLK_SAFE_MAX_SECTORS        = 255,
        BLK_MAX_SEGMENT_SIZE        = 65536,
        BLK_SEG_BOUNDARY_MASK        = 0xFFFFFFFFUL,
};

/*
 * Default upper limit for the software max_sectors limit used for
 * regular file system I/O.  This can be increased through sysfs.
 *
 * Not to be confused with the max_hw_sectors limit that is entirely
 * controlled by the driver, usually based on hardware limits.
 */
#define BLK_DEF_MAX_SECTORS_CAP        2560u

/* seg_boundary_mask limit of @q. */
static inline unsigned long queue_segment_boundary(const struct request_queue *q)
{
        const unsigned long mask = q->limits.seg_boundary_mask;

        return mask;
}

/* virt_boundary_mask limit of @q. */
static inline unsigned long queue_virt_boundary(const struct request_queue *q)
{
        const unsigned long mask = q->limits.virt_boundary_mask;

        return mask;
}

/* max_sectors limit of @q. */
static inline unsigned int queue_max_sectors(const struct request_queue *q)
{
        const unsigned int max_sectors = q->limits.max_sectors;

        return max_sectors;
}

/*
 * max_sectors limit of @q converted to bytes (shift by 9).  The sector
 * count is first clamped to INT_MAX >> 9.
 */
static inline unsigned int queue_max_bytes(struct request_queue *q)
{
        const unsigned int capped_sectors =
                min_t(unsigned int, queue_max_sectors(q), INT_MAX >> 9);

        return capped_sectors << 9;
}

/* max_hw_sectors limit of @q. */
static inline unsigned int queue_max_hw_sectors(const struct request_queue *q)
{
        const unsigned int max_hw = q->limits.max_hw_sectors;

        return max_hw;
}

/* max_segments limit of @q. */
static inline unsigned short queue_max_segments(const struct request_queue *q)
{
        const unsigned short max_segs = q->limits.max_segments;

        return max_segs;
}

/* max_discard_segments limit of @q. */
static inline unsigned short queue_max_discard_segments(const struct request_queue *q)
{
        const unsigned short max_segs = q->limits.max_discard_segments;

        return max_segs;
}

/* max_segment_size limit of @q. */
static inline unsigned int queue_max_segment_size(const struct request_queue *q)
{
        const unsigned int seg_size = q->limits.max_segment_size;

        return seg_size;
}

/*
 * Combine @l->max_zone_append_sectors with the smaller of @l->chunk_sectors
 * and @l->max_hw_sectors using min_not_zero().
 */
static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l)
{
        unsigned int hw_cap = min(l->chunk_sectors, l->max_hw_sectors);
        unsigned int append_limit;

        append_limit = min_not_zero(l->max_zone_append_sectors, hw_cap);
        return append_limit;
}

/*
 * Return 0 when blk_queue_is_zoned(@q) is false; otherwise the value computed
 * by queue_limits_max_zone_append_sectors() from @q's limits.
 */
static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q)
{
        return blk_queue_is_zoned(q) ?
                queue_limits_max_zone_append_sectors(&q->limits) : 0;
}

static inline bool queue_emulates_zone_append(struct request_queue *q)
{
        return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors;
}

/* queue_emulates_zone_append() applied to the queue of @bdev. */
static inline bool bdev_emulates_zone_append(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_emulates_zone_append(q);
}

/* queue_max_zone_append_sectors() applied to the queue of @bdev. */
static inline unsigned int
bdev_max_zone_append_sectors(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_max_zone_append_sectors(q);
}

/* queue_max_segments() applied to the queue of @bdev. */
static inline unsigned int bdev_max_segments(struct block_device *bdev)
{
        unsigned int nr_segs = queue_max_segments(bdev_get_queue(bdev));

        return nr_segs;
}

/*
 * Return the logical block size of @q.
 *
 * Falls back to 512 when @q is NULL or when its logical_block_size limit is
 * zero.  The temporary uses the function's unsigned return type so the value
 * never passes through a signed int on its way out.
 */
static inline unsigned queue_logical_block_size(const struct request_queue *q)
{
        unsigned int retval = 512;

        if (q && q->limits.logical_block_size)
                retval = q->limits.logical_block_size;

        return retval;
}

/* queue_logical_block_size() applied to the queue of @bdev. */
static inline unsigned int bdev_logical_block_size(struct block_device *bdev)
{
        unsigned int lbs = queue_logical_block_size(bdev_get_queue(bdev));

        return lbs;
}

/* Return the physical_block_size limit of @q. */
static inline unsigned int queue_physical_block_size(const struct request_queue *q)
{
        unsigned int pbs = q->limits.physical_block_size;

        return pbs;
}

/* queue_physical_block_size() applied to the queue of @bdev. */
static inline unsigned int bdev_physical_block_size(struct block_device *bdev)
{
        unsigned int pbs = queue_physical_block_size(bdev_get_queue(bdev));

        return pbs;
}

/* Return the io_min limit of @q. */
static inline unsigned int queue_io_min(const struct request_queue *q)
{
        unsigned int io_min = q->limits.io_min;

        return io_min;
}

/*
 * queue_io_min() applied to the queue of @bdev.  Note the result is handed
 * back as a signed int although queue_io_min() returns unsigned int.
 */
static inline int bdev_io_min(struct block_device *bdev)
{
        int io_min = queue_io_min(bdev_get_queue(bdev));

        return io_min;
}

/* Return the io_opt limit of @q. */
static inline unsigned int queue_io_opt(const struct request_queue *q)
{
        unsigned int io_opt = q->limits.io_opt;

        return io_opt;
}

/*
 * queue_io_opt() applied to the queue of @bdev.  Note the result is handed
 * back as a signed int although queue_io_opt() returns unsigned int.
 */
static inline int bdev_io_opt(struct block_device *bdev)
{
        int io_opt = queue_io_opt(bdev_get_queue(bdev));

        return io_opt;
}

/* Return the zone_write_granularity limit of @q. */
static inline unsigned int
queue_zone_write_granularity(const struct request_queue *q)
{
        unsigned int granularity = q->limits.zone_write_granularity;

        return granularity;
}

/* queue_zone_write_granularity() applied to the queue of @bdev. */
static inline unsigned int
bdev_zone_write_granularity(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_zone_write_granularity(q);
}

/* Only declared here; the definitions are not part of this section. */
int bdev_alignment_offset(struct block_device *bdev);
unsigned int bdev_discard_alignment(struct block_device *bdev);

static inline unsigned int bdev_max_discard_sectors(struct block_device *bdev)
{
        return bdev_get_queue(bdev)->limits.max_discard_sectors;
}

static inline unsigned int bdev_discard_granularity(struct block_device *bdev)
{
        return bdev_get_queue(bdev)->limits.discard_granularity;
}

static inline unsigned int
bdev_max_secure_erase_sectors(struct block_device *bdev)
{
        return bdev_get_queue(bdev)->limits.max_secure_erase_sectors;
}

/*
 * Return the max_write_zeroes_sectors limit of @bdev's queue, or 0 when
 * bdev_get_queue() yields NULL.
 */
static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
{
        struct request_queue *queue = bdev_get_queue(bdev);

        if (!queue)
                return 0;

        return queue->limits.max_write_zeroes_sectors;
}

/* blk_queue_nonrot() applied to the queue of @bdev. */
static inline bool bdev_nonrot(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return blk_queue_nonrot(q);
}

static inline bool bdev_synchronous(struct block_device *bdev)
{
        return test_bit(QUEUE_FLAG_SYNCHRONOUS,
                        &bdev_get_queue(bdev)->queue_flags);
}

static inline bool bdev_stable_writes(struct block_device *bdev)
{
        return test_bit(QUEUE_FLAG_STABLE_WRITES,
                        &bdev_get_queue(bdev)->queue_flags);
}

static inline bool bdev_write_cache(struct block_device *bdev)
{
        return test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags);
}

static inline bool bdev_fua(struct block_device *bdev)
{
        return test_bit(QUEUE_FLAG_FUA, &bdev_get_queue(bdev)->queue_flags);
}

static inline bool bdev_nowait(struct block_device *bdev)
{
        return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags);
}

/* blk_queue_is_zoned() applied to the queue of @bdev. */
static inline bool bdev_is_zoned(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return blk_queue_is_zoned(q);
}

/* disk_zone_no() for sector @sec on the disk that @bdev belongs to. */
static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec)
{
        unsigned int zone_no = disk_zone_no(bdev->bd_disk, sec);

        return zone_no;
}

/*
 * Return the chunk_sectors limit of @bdev's queue when that queue is zoned,
 * and 0 otherwise.
 */
static inline sector_t bdev_zone_sectors(struct block_device *bdev)
{
        struct request_queue *queue = bdev_get_queue(bdev);

        return blk_queue_is_zoned(queue) ? queue->limits.chunk_sectors : 0;
}

/*
 * Mask @sector with (bdev_zone_sectors(@bdev) - 1).
 * NOTE(review): the masking only yields an in-zone offset if the zone size is
 * a power of two -- confirm this is guaranteed by the callers.
 */
static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev,
                                                   sector_t sector)
{
        sector_t zone_mask = bdev_zone_sectors(bdev) - 1;

        return sector & zone_mask;
}

/* bdev_offset_from_zone_start() for @bio's device and its current sector. */
static inline sector_t bio_offset_from_zone_start(struct bio *bio)
{
        sector_t sector = bio->bi_iter.bi_sector;

        return bdev_offset_from_zone_start(bio->bi_bdev, sector);
}

/* True when bdev_offset_from_zone_start() is zero for @sector. */
static inline bool bdev_is_zone_start(struct block_device *bdev,
                                      sector_t sector)
{
        return !bdev_offset_from_zone_start(bdev, sector);
}

/* Return @q's dma_alignment limit, or 511 when @q is NULL. */
static inline int queue_dma_alignment(const struct request_queue *q)
{
        if (!q)
                return 511;

        return q->limits.dma_alignment;
}

/* queue_dma_alignment() applied to the queue of @bdev. */
static inline unsigned int bdev_dma_alignment(struct block_device *bdev)
{
        unsigned int align = queue_dma_alignment(bdev_get_queue(bdev));

        return align;
}

/*
 * Check @iter with iov_iter_is_aligned(), passing @bdev's DMA alignment and
 * its logical block size minus one as the two mask arguments.
 */
static inline bool bdev_iter_is_aligned(struct block_device *bdev,
                                        struct iov_iter *iter)
{
        unsigned int dma_mask = bdev_dma_alignment(bdev);
        unsigned int lbs_mask = bdev_logical_block_size(bdev) - 1;

        return iov_iter_is_aligned(iter, dma_mask, lbs_mask);
}

/*
 * Return 1 when neither @addr nor @len has any bit set in the combined mask
 * queue_dma_alignment(@q) | @q->dma_pad_mask, and 0 otherwise.
 */
static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
                                 unsigned int len)
{
        unsigned int mask = queue_dma_alignment(q) | q->dma_pad_mask;

        if (addr & mask)
                return 0;
        if (len & mask)
                return 0;
        return 1;
}

/*
 * Compute order_base_2() of @size expressed in sectors and add SECTOR_SHIFT
 * back.  Assumes size > 256.
 */
static inline unsigned int blksize_bits(unsigned int size)
{
        unsigned int nr_sectors = size >> SECTOR_SHIFT;

        return order_base_2(nr_sectors) + SECTOR_SHIFT;
}

/* kblockd work helpers; only declared here, definitions are not in this section. */
int kblockd_schedule_work(struct work_struct *work);
int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);

/*
 * Build MODULE_ALIAS() strings of the form "block-major-<major>-<minor>";
 * the _MAJOR variant uses "*" in place of the minor number.
 */
#define MODULE_ALIAS_BLOCKDEV(major,minor) \
        MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
        MODULE_ALIAS("block-major-" __stringify(major) "-*")

#ifdef CONFIG_BLK_INLINE_ENCRYPTION

/* Real implementation is provided elsewhere when inline encryption is built. */
bool blk_crypto_register(struct blk_crypto_profile *profile,
                         struct request_queue *q);

#else /* CONFIG_BLK_INLINE_ENCRYPTION */

/* Stub for builds without inline encryption: does nothing and reports true. */
static inline bool blk_crypto_register(struct blk_crypto_profile *profile,
                                       struct request_queue *q)
{
        return true;
}

#endif /* CONFIG_BLK_INLINE_ENCRYPTION */

/* Identifier type selector passed to the ->get_unique_id() callback. */
enum blk_unique_id {
        /* these match the Designator Types specified in SPC */
        BLK_UID_T10        = 1,
        BLK_UID_EUI64        = 2,
        BLK_UID_NAA        = 3,
};

/*
 * Table of callbacks a block driver supplies for a disk, plus the owning
 * module and persistent-reservation ops.
 * NOTE(review): which callbacks are optional is not visible from this
 * definition -- check the callers before leaving one NULL.
 */
struct block_device_operations {
        void (*submit_bio)(struct bio *bio);
        int (*poll_bio)(struct bio *bio, struct io_comp_batch *iob,
                        unsigned int flags);
        int (*open)(struct gendisk *disk, blk_mode_t mode);
        void (*release)(struct gendisk *disk);
        int (*ioctl)(struct block_device *bdev, blk_mode_t mode,
                        unsigned cmd, unsigned long arg);
        int (*compat_ioctl)(struct block_device *bdev, blk_mode_t mode,
                        unsigned cmd, unsigned long arg);
        unsigned int (*check_events) (struct gendisk *disk,
                                      unsigned int clearing);
        void (*unlock_native_capacity) (struct gendisk *);
        int (*getgeo)(struct block_device *, struct hd_geometry *);
        int (*set_read_only)(struct block_device *bdev, bool ro);
        void (*free_disk)(struct gendisk *disk);
        /* this callback is with swap_lock and sometimes page table lock held */
        void (*swap_slot_free_notify) (struct block_device *, unsigned long);
        int (*report_zones)(struct gendisk *, sector_t sector,
                        unsigned int nr_zones, report_zones_cb cb, void *data);
        char *(*devnode)(struct gendisk *disk, umode_t *mode);
        /* returns the length of the identifier or a negative errno: */
        int (*get_unique_id)(struct gendisk *disk, u8 id[16],
                        enum blk_unique_id id_type);
        struct module *owner;
        const struct pr_ops *pr_ops;

        /*
         * Special callback for probing GPT entry at a given sector.
         * Needed by Android devices, used by GPT scanner and MMC blk
         * driver.
         */
        int (*alternative_gpt_sector)(struct gendisk *disk, sector_t *sector);
};

/*
 * With CONFIG_COMPAT the helper is a real function; without it the name
 * expands to NULL so it can still be used as a callback initializer.
 */
#ifdef CONFIG_COMPAT
extern int blkdev_compat_ptr_ioctl(struct block_device *, blk_mode_t,
                                      unsigned int, unsigned long);
#else
#define blkdev_compat_ptr_ioctl NULL
#endif

/* Wake @waiter, or just mark it runnable when it is the current task. */
static inline void blk_wake_io_task(struct task_struct *waiter)
{
        if (waiter != current) {
                wake_up_process(waiter);
                return;
        }

        /*
         * We are the waiter ourselves: when polling, the task does its own
         * completions, so no wakeup is required and flagging the task as
         * RUNNING is sufficient.
         */
        __set_current_state(TASK_RUNNING);
}

/*
 * I/O accounting helpers; only declared here.  The bio start helper's return
 * value is what bio_end_io_acct() documents as its start_time argument.
 */
unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op,
                                 unsigned long start_time);
void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
                      unsigned int sectors, unsigned long start_time);

unsigned long bio_start_io_acct(struct bio *bio);
void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
                struct block_device *orig_bdev);

/**
 * bio_end_io_acct - end I/O accounting for bio based drivers
 * @bio:        bio to end account for
 * @start_time:        start time returned by bio_start_io_acct()
 *
 * Thin wrapper around bio_end_io_acct_remapped() that passes the bio's own
 * bi_bdev as the original device.
 */
static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time)
{
        /* Plain call: a void function must not "return" an expression. */
        bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev);
}

/* Only declared here; the definitions are not part of this section. */
int bdev_read_only(struct block_device *bdev);
int set_blocksize(struct file *file, int size);

int lookup_bdev(const char *pathname, dev_t *dev);

void blkdev_show(struct seq_file *seqf, off_t offset);

#define BDEVNAME_SIZE        32        /* Largest string for a blockdev identifier */
#define BDEVT_SIZE        10        /* Largest string for MAJ:MIN for blkdev */
/* BLKDEV_MAJOR_MAX is 512 with CONFIG_BLOCK and 0 without it. */
#ifdef CONFIG_BLOCK
#define BLKDEV_MAJOR_MAX        512
#else
#define BLKDEV_MAJOR_MAX        0
#endif

/*
 * Callbacks supplied by the holder of a block device; passed as @hops to the
 * open/claim helpers declared further down in this header.
 */
struct blk_holder_ops {
        /*
         * NOTE(review): presumably called when the device goes away, with
         * @surprise flagging an unexpected removal -- confirm with callers.
         */
        void (*mark_dead)(struct block_device *bdev, bool surprise);

        /*
         * Sync the file system mounted on the block device.
         */
        void (*sync)(struct block_device *bdev);

        /*
         * Freeze the file system mounted on the block device.
         */
        int (*freeze)(struct block_device *bdev);

        /*
         * Thaw the file system mounted on the block device.
         */
        int (*thaw)(struct block_device *bdev);
};

/*
 * For filesystems using @fs_holder_ops, the @holder argument passed to
 * helpers used to open and claim block devices via
 * bd_prepare_to_claim() must point to a superblock.
 */
extern const struct blk_holder_ops fs_holder_ops;

/*
 * Return the correct open flags for bdev_file_open_by_* for super block flags
 * as stored in sb->s_flags: always BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES,
 * plus BLK_OPEN_WRITE unless SB_RDONLY is set.
 */
#define sb_open_mode(flags) \
        (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \
         (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE))

/*
 * Open/claim helpers.  @holder and @hops identify the claimer; see the
 * fs_holder_ops note above for the file system case.
 */
struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
                const struct blk_holder_ops *hops);
struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
                void *holder, const struct blk_holder_ops *hops);
int bd_prepare_to_claim(struct block_device *bdev, void *holder,
                const struct blk_holder_ops *hops);
void bd_abort_claiming(struct block_device *bdev, void *holder);

/* just for blk-cgroup, don't use elsewhere */
struct block_device *blkdev_get_no_open(dev_t dev);
void blkdev_put_no_open(struct block_device *bdev);

/* Accessors and queries; only declared here. */
struct block_device *I_BDEV(struct inode *inode);
struct block_device *file_bdev(struct file *bdev_file);
bool disk_live(struct gendisk *disk);
unsigned int block_size(struct block_device *bdev);

/*
 * With CONFIG_BLOCK these are real functions.  Without it, the #else branch
 * supplies no-op stubs: the void ones do nothing, the int ones return 0,
 * except early_lookup_bdev() which returns -EINVAL.
 * Note: no stub is provided here for sync_blockdev_range().
 */
#ifdef CONFIG_BLOCK
void invalidate_bdev(struct block_device *bdev);
int sync_blockdev(struct block_device *bdev);
int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend);
int sync_blockdev_nowait(struct block_device *bdev);
void sync_bdevs(bool wait);
void bdev_statx_dioalign(struct inode *inode, struct kstat *stat);
void printk_all_partitions(void);
int __init early_lookup_bdev(const char *pathname, dev_t *dev);
#else
static inline void invalidate_bdev(struct block_device *bdev)
{
}
static inline int sync_blockdev(struct block_device *bdev)
{
        return 0;
}
static inline int sync_blockdev_nowait(struct block_device *bdev)
{
        return 0;
}
static inline void sync_bdevs(bool wait)
{
}
static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
{
}
static inline void printk_all_partitions(void)
{
}
static inline int early_lookup_bdev(const char *pathname, dev_t *dev)
{
        return -EINVAL;
}
#endif /* CONFIG_BLOCK */

/* Only declared here; the definitions are not part of this section. */
int bdev_freeze(struct block_device *bdev);
int bdev_thaw(struct block_device *bdev);
void bdev_fput(struct file *bdev_file);

/*
 * Completion batch handed to the ->poll_bio() callback (see
 * struct block_device_operations).
 * NOTE(review): presumably req_list collects completed requests and
 * ->complete() finishes them in one go -- confirm against the users.
 */
struct io_comp_batch {
        struct request *req_list;
        bool need_ts;
        void (*complete)(struct io_comp_batch *);
};

/* Declare an io_comp_batch called @name with every member zero-initialized. */
#define DEFINE_IO_COMP_BATCH(name)        struct io_comp_batch name = { }

#endif /* _LINUX_BLKDEV_H */






















































































































































































































































































































































    2 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  Definitions for the SMC module (socket related)
 *
 *  Copyright IBM Corp. 2016
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */
#ifndef __SMC_H
#define __SMC_H

#include <linux/socket.h>
#include <linux/types.h>
#include <linux/compiler.h> /* __aligned */
#include <net/genetlink.h>
#include <net/sock.h>

#include "smc_ib.h"

#define SMC_V1                1                /* SMC version V1 */
#define SMC_V2                2                /* SMC version V2 */

/* Release numbers; SMC_RELEASE names the newest one defined here. */
#define SMC_RELEASE_0 0
#define SMC_RELEASE_1 1
#define SMC_RELEASE        SMC_RELEASE_1 /* the latest release version */

#define SMCPROTO_SMC                0        /* SMC protocol, IPv4 */
#define SMCPROTO_SMC6                1        /* SMC protocol, IPv6 */

#define SMC_AUTOCORKING_DEFAULT_SIZE        0x10000        /* 64K by default */

/* Protocol descriptors; the "6" variant pairs with SMCPROTO_SMC6 by name. */
extern struct proto smc_proto;
extern struct proto smc_proto6;

extern struct smc_hashinfo smc_v4_hashinfo;
extern struct smc_hashinfo smc_v6_hashinfo;

/* Entry points operating on a struct sock. */
int smc_hash_sk(struct sock *sk);
void smc_unhash_sk(struct sock *sk);
void smc_release_cb(struct sock *sk);

/* Entry points operating on a struct socket; defined outside this header. */
int smc_release(struct socket *sock);
int smc_bind(struct socket *sock, struct sockaddr *uaddr,
             int addr_len);
int smc_connect(struct socket *sock, struct sockaddr *addr,
                int alen, int flags);
int smc_accept(struct socket *sock, struct socket *new_sock,
               struct proto_accept_arg *arg);
int smc_getname(struct socket *sock, struct sockaddr *addr,
                int peer);
__poll_t smc_poll(struct file *file, struct socket *sock,
                  poll_table *wait);
int smc_ioctl(struct socket *sock, unsigned int cmd,
              unsigned long arg);
int smc_listen(struct socket *sock, int backlog);
int smc_shutdown(struct socket *sock, int how);
int smc_setsockopt(struct socket *sock, int level, int optname,
                   sockptr_t optval, unsigned int optlen);
int smc_getsockopt(struct socket *sock, int level, int optname,
                   char __user *optval, int __user *optlen);
int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len);
int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                int flags);
ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
                        struct pipe_inode_info *pipe, size_t len,
                        unsigned int flags);

/* smc sock initialization */
void smc_sk_init(struct net *net, struct sock *sk, int protocol);
/* clcsock initialization */
int smc_create_clcsk(struct net *net, struct sock *sk, int family);

/*
 * KERNEL_HAS_ATOMIC64 is set when ATOMIC64_INIT is defined.  It selects the
 * atomic64_t form of union smc_host_cursor; without it the cursor is a plain
 * u64 and struct smc_connection carries an acurs_lock spinlock instead.
 */
#ifdef ATOMIC64_INIT
#define KERNEL_HAS_ATOMIC64
#endif

/*
 * Socket states.  The values are explicit and not contiguous, and the
 * "normal close" group is not listed in numeric order (24 precedes 22/23).
 */
enum smc_state {                /* possible states of an SMC socket */
        SMC_ACTIVE        = 1,
        SMC_INIT        = 2,
        SMC_CLOSED        = 7,
        SMC_LISTEN        = 10,
        /* normal close */
        SMC_PEERCLOSEWAIT1        = 20,
        SMC_PEERCLOSEWAIT2        = 21,
        SMC_APPFINCLOSEWAIT        = 24,
        SMC_APPCLOSEWAIT1        = 22,
        SMC_APPCLOSEWAIT2        = 23,
        SMC_PEERFINCLOSEWAIT        = 25,
        /* abnormal close */
        SMC_PEERABORTWAIT        = 26,
        SMC_PROCESSABORT        = 27,
};

/* Bit positions of supplemental features; used with BIT() below. */
enum smc_supplemental_features {
        SMC_SPF_EMULATED_ISM_DEV        = 0,
};

/* Mask with one bit set per feature listed in the enum above. */
#define SMC_FEATURE_MASK \
        (BIT(SMC_SPF_EMULATED_ISM_DEV))

struct smc_link_group;

/*
 * One-byte header.  The byte can be read whole as @type or as two 4-bit
 * fields (llc_version, llc_type); the bitfield order is swapped between the
 * big- and little-endian bitfield layouts.
 */
struct smc_wr_rx_hdr {        /* common prefix part of LLC and CDC to demultiplex */
        union {
                u8 type;
#if defined(__BIG_ENDIAN_BITFIELD)
                struct {
                        u8 llc_version:4,
                           llc_type:4;
                };
#elif defined(__LITTLE_ENDIAN_BITFIELD)
                struct {
                        u8 llc_type:4,
                           llc_version:4;
                };
#endif
        };
} __aligned(1);

/*
 * Connection state flags, 8 bits in total.  The little-endian branch declares
 * the same fields in reverse order.
 */
struct smc_cdc_conn_state_flags {
#if defined(__BIG_ENDIAN_BITFIELD)
        u8        peer_done_writing : 1;        /* Sending done indicator */
        u8        peer_conn_closed : 1;        /* Peer connection closed indicator */
        u8        peer_conn_abort : 1;        /* Abnormal close indicator */
        u8        reserved : 5;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
        u8        reserved : 5;
        u8        peer_conn_abort : 1;
        u8        peer_conn_closed : 1;
        u8        peer_done_writing : 1;
#endif
};

/*
 * Producer flags, 8 bits in total.  The little-endian branch declares the
 * same fields in reverse order.
 */
struct smc_cdc_producer_flags {
#if defined(__BIG_ENDIAN_BITFIELD)
        u8        write_blocked : 1;        /* Writing Blocked, no rx buf space */
        u8        urg_data_pending : 1;        /* Urgent Data Pending */
        u8        urg_data_present : 1;        /* Urgent Data Present */
        u8        cons_curs_upd_req : 1;        /* cursor update requested */
        u8        failover_validation : 1;/* message replay due to failover */
        u8        reserved : 3;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
        u8        reserved : 3;
        u8        failover_validation : 1;
        u8        cons_curs_upd_req : 1;
        u8        urg_data_present : 1;
        u8        urg_data_pending : 1;
        u8        write_blocked : 1;
#endif
};

/*
 * in host byte order
 *
 * The 8-byte cursor can be accessed field by field (reserved/wrap/count) or
 * as one 64-bit value through @acurs, which is an atomic64_t only when
 * KERNEL_HAS_ATOMIC64 is defined.
 */
union smc_host_cursor {        /* SMC cursor - an offset in an RMBE */
        struct {
                u16        reserved;
                u16        wrap;                /* window wrap sequence number */
                u32        count;                /* cursor (= offset) part */
        };
#ifdef KERNEL_HAS_ATOMIC64
        atomic64_t                acurs;        /* for atomic processing */
#else
        u64                        acurs;        /* for atomic processing */
#endif
} __aligned(8);

/*
 * in host byte order, except for flag bitfields in network byte order
 *
 * Host-side form of a CDC message: common header, length, sequence number,
 * token, producer and consumer cursors, and the two flag bytes.
 */
struct smc_host_cdc_msg {                /* Connection Data Control message */
        struct smc_wr_rx_hdr                common; /* .type = 0xFE */
        u8                                len;        /* length = 44 */
        u16                                seqno;        /* connection seq # */
        u32                                token;        /* alert_token */
        union smc_host_cursor                prod;                /* producer cursor */
        union smc_host_cursor                cons;                /* consumer cursor,
                                                         * piggy backed "ack"
                                                         */
        struct smc_cdc_producer_flags        prod_flags;        /* conn. tx/rx status */
        struct smc_cdc_conn_state_flags        conn_state_flags; /* peer conn. status*/
        u8                                reserved[18];
} __aligned(8);

/* State of urgent data; stored in smc_connection.urg_state. */
enum smc_urg_state {
        SMC_URG_VALID        = 1,                        /* data present */
        SMC_URG_NOTYET        = 2,                        /* data pending */
        SMC_URG_READ        = 3,                        /* data was already read */
};

/*
 * Wait-queue entry bundled with a flag and a key.
 * NOTE(review): presumably @woken is set by the entry's wake function and
 * @key records the wakeup key -- confirm against the users.
 */
struct smc_mark_woken {
        bool woken;
        void *key;
        wait_queue_entry_t wait_entry;
};

/*
 * Per-connection state embedded in struct smc_sock (member "conn"): link
 * group/link association, send and receive buffer descriptors, the tx and rx
 * cursors, urgent-data bookkeeping, deferred work items and status bits.
 */
struct smc_connection {
        struct rb_node                alert_node;
        struct smc_link_group        *lgr;                /* link group of connection */
        struct smc_link                *lnk;                /* assigned SMC-R link */
        u32                        alert_token_local; /* unique conn. id */
        u8                        peer_rmbe_idx;        /* from tcp handshake */
        int                        peer_rmbe_size;        /* size of peer rx buffer */
        atomic_t                peer_rmbe_space;/* remaining free bytes in peer
                                                 * rmbe
                                                 */
        int                        rtoken_idx;        /* idx to peer RMB rkey/addr */

        struct smc_buf_desc        *sndbuf_desc;        /* send buffer descriptor */
        struct smc_buf_desc        *rmb_desc;        /* RMBE descriptor */
        int                     rmbe_size_comp; /* compressed notation */
        int                        rmbe_update_limit;
                                                /* lower limit for consumer
                                                 * cursor update
                                                 */

        struct smc_host_cdc_msg        local_tx_ctrl;        /* host byte order staging
                                                 * buffer for CDC msg send
                                                 * .prod cf. TCP snd_nxt
                                                 * .cons cf. TCP sends ack
                                                 */
        union smc_host_cursor        local_tx_ctrl_fin;
                                                /* prod crsr - confirmed by peer
                                                 */
        union smc_host_cursor        tx_curs_prep;        /* tx - prepared data
                                                 * snd_max..wmem_alloc
                                                 */
        union smc_host_cursor        tx_curs_sent;        /* tx - sent data
                                                 * snd_nxt ?
                                                 */
        union smc_host_cursor        tx_curs_fin;        /* tx - confirmed by peer
                                                 * snd-wnd-begin ?
                                                 */
        atomic_t                sndbuf_space;        /* remaining space in sndbuf */
        u16                        tx_cdc_seq;        /* sequence # for CDC send */
        u16                        tx_cdc_seq_fin;        /* sequence # - tx completed */
        spinlock_t                send_lock;        /* protect wr_sends */
        atomic_t                cdc_pend_tx_wr; /* number of pending tx CDC wqe
                                                 * - inc when post wqe,
                                                 * - dec on polled tx cqe
                                                 */
        wait_queue_head_t        cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/
        struct delayed_work        tx_work;        /* retry of smc_cdc_msg_send */
        u32                        tx_off;                /* base offset in peer rmb */

        struct smc_host_cdc_msg        local_rx_ctrl;        /* filled during event_handl.
                                                 * .prod cf. TCP rcv_nxt
                                                 * .cons cf. TCP snd_una
                                                 */
        union smc_host_cursor        rx_curs_confirmed; /* confirmed to peer
                                                    * source of snd_una ?
                                                    */
        union smc_host_cursor        urg_curs;        /* points at urgent byte */
        enum smc_urg_state        urg_state;
        bool                        urg_tx_pend;        /* urgent data staged */
        bool                        urg_rx_skip_pend;
                                                /* indicate urgent oob data
                                                 * read, but previous regular
                                                 * data still pending
                                                 */
        char                        urg_rx_byte;        /* urgent byte */
        bool                        tx_in_release_sock;
                                                /* flush pending tx data in
                                                 * sock release_cb()
                                                 */
        atomic_t                bytes_to_rcv;        /* arrived data,
                                                 * not yet received
                                                 */
        atomic_t                splice_pending;        /* number of spliced bytes
                                                 * pending processing
                                                 */
#ifndef KERNEL_HAS_ATOMIC64
        spinlock_t                acurs_lock;        /* protect cursors */
#endif
        struct work_struct        close_work;        /* peer sent some closing */
        struct work_struct        abort_work;        /* abort the connection */
        struct tasklet_struct        rx_tsklet;        /* Receiver tasklet for SMC-D */
        u8                        rx_off;                /* receive offset:
                                                 * 0 for SMC-R, 32 for SMC-D
                                                 */
        u64                        peer_token;        /* SMC-D token of peer */
        u8                        killed : 1;        /* abnormal termination */
        u8                        freed : 1;        /* normal termination */
        u8                        out_of_sync : 1; /* out of sync with peer */
};

/*
 * SMC socket.  @sk must stay the first member so that smc_sk() can convert a
 * struct sock pointer with container_of_const().  The clcsk_* members hold
 * the internal TCP socket's original callbacks while they are replaced (see
 * smc_clcsock_replace_cb() / smc_clcsock_restore_cb()).
 */
struct smc_sock {                                /* smc sock container */
        struct sock                sk;
        struct socket                *clcsock;        /* internal tcp socket */
        void                        (*clcsk_state_change)(struct sock *sk);
                                                /* original state_change fct. */
        void                        (*clcsk_data_ready)(struct sock *sk);
                                                /* original data_ready fct. */
        void                        (*clcsk_write_space)(struct sock *sk);
                                                /* original write_space fct. */
        void                        (*clcsk_error_report)(struct sock *sk);
                                                /* original error_report fct. */
        struct smc_connection        conn;                /* smc connection */
        struct smc_sock                *listen_smc;        /* listen parent */
        struct work_struct        connect_work;        /* handle non-blocking connect*/
        struct work_struct        tcp_listen_work;/* handle tcp socket accepts */
        struct work_struct        smc_listen_work;/* prepare new accept socket */
        struct list_head        accept_q;        /* sockets to be accepted */
        spinlock_t                accept_q_lock;        /* protects accept_q */
        bool                        limit_smc_hs;        /* put constraint on handshake */
        bool                        use_fallback;        /* fallback to tcp */
        int                        fallback_rsn;        /* reason for fallback */
        u32                        peer_diagnosis; /* decline reason from peer */
        atomic_t                queued_smc_hs;  /* queued smc handshakes */
        struct inet_connection_sock_af_ops                af_ops;
        const struct inet_connection_sock_af_ops        *ori_af_ops;
                                                /* original af ops */
        int                        sockopt_defer_accept;
                                                /* sockopt TCP_DEFER_ACCEPT
                                                 * value
                                                 */
        u8                        wait_close_tx_prepared : 1;
                                                /* shutdown wr or close
                                                 * started, waiting for unsent
                                                 * data to be sent
                                                 */
        u8                        connect_nonblock : 1;
                                                /* non-blocking connect in
                                                 * flight
                                                 */
        struct mutex            clcsock_release_lock;
                                                /* protects clcsock of a listen
                                                 * socket
                                                 */
};

/* Convert a pointer to the 'sk' member into a pointer to the struct smc_sock
 * that contains it, via container_of_const().
 */
#define smc_sk(ptr) container_of_const(ptr, struct smc_sock, sk)

/* Reset all four saved clcsock callback slots of @smc to NULL, i.e. mark
 * that no original callback is currently recorded in any of them.
 */
static inline void smc_init_saved_callbacks(struct smc_sock *smc)
{
        /* the slots are independent of each other; order is irrelevant */
        smc->clcsk_error_report        = NULL;
        smc->clcsk_write_space        = NULL;
        smc->clcsk_data_ready        = NULL;
        smc->clcsk_state_change        = NULL;
}

/* Return the smc_sock pointer kept in @clcsk->sk_user_data, with the
 * SK_USER_DATA_NOCOPY bit(s) masked out of the stored value.
 */
static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk)
{
        uintptr_t raw = (uintptr_t)clcsk->sk_user_data;

        return (struct smc_sock *)(raw & ~SK_USER_DATA_NOCOPY);
}

/* Install @new_cb in *@target_cb. The callback that was in *@target_cb is
 * remembered in *@saved_cb, but only if *@saved_cb is still empty, so a
 * repeated replacement never overwrites the first saved callback.
 */
static inline void smc_clcsock_replace_cb(void (**target_cb)(struct sock *),
                                          void (*new_cb)(struct sock *),
                                          void (**saved_cb)(struct sock *))
{
        if (*saved_cb) {
                /* already saved by an earlier replacement: just swap */
                *target_cb = new_cb;
                return;
        }

        /* first replacement: keep the current callback for a later restore */
        *saved_cb = *target_cb;
        *target_cb = new_cb;
}

/* If a callback is recorded in *@saved_cb, put it back into *@target_cb and
 * clear *@saved_cb; with nothing recorded this is a no-op.
 */
static inline void smc_clcsock_restore_cb(void (**target_cb)(struct sock *),
                                          void (**saved_cb)(struct sock *))
{
        if (*saved_cb) {
                *target_cb = *saved_cb;
                *saved_cb = NULL;
        }
}

extern struct workqueue_struct        *smc_hs_wq;        /* wq for handshake work */
extern struct workqueue_struct        *smc_close_wq;        /* wq for close work */

#define SMC_SYSTEMID_LEN                8

extern u8        local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */

/* 64-bit byte-order helpers: ntohll() expands to be64_to_cpu() and
 * htonll() expands to cpu_to_be64().
 */
#define ntohll(x) be64_to_cpu(x)
#define htonll(x) cpu_to_be64(x)

/* Store the low 24 bits of @host into the 3 byte field @net in network
 * (big endian) byte order; the top 8 bits of @host are dropped.
 */
static inline void hton24(u8 *net, u32 host)
{
        net[0] = (u8)(host >> 16);        /* bits 23..16 go first */
        net[1] = (u8)(host >> 8);
        net[2] = (u8)host;                /* bits 7..0 go last */
}

/* Read the 3 byte network (big endian) order field @net and return its
 * value in host byte order; the top 8 bits of the result are always zero.
 */
static inline u32 ntoh24(u8 *net)
{
        return ((u32)net[0] << 16) | ((u32)net[1] << 8) | (u32)net[2];
}

#ifdef CONFIG_XFRM
/* Return true if either sk_policy[] slot of the clcsock's sk is set. */
static inline bool using_ipsec(struct smc_sock *smc)
{
        if (smc->clcsock->sk->sk_policy[0])
                return true;
        if (smc->clcsock->sk->sk_policy[1])
                return true;
        return false;
}
#else
/* Built without CONFIG_XFRM: always report false. */
static inline bool using_ipsec(struct smc_sock *smc)
{
        return false;
}
#endif

struct smc_gidlist;

struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
void smc_close_non_accepted(struct sock *sk);
void smc_fill_gid_list(struct smc_link_group *lgr,
                       struct smc_gidlist *gidlist,
                       struct smc_ib_device *known_dev, u8 *known_gid);

/* smc handshake limitation interface for netlink */
int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb);
int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info);
int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info);

/* Set bit @flag in @sk->sk_flags by calling set_bit().
 * NOTE(review): set_bit() is presumably used for its atomic
 * read-modify-write, so the flag can be set without the socket lock held;
 * confirm against callers.
 */
static inline void smc_sock_set_flag(struct sock *sk, enum sock_flags flag)
{
        set_bit(flag, &sk->sk_flags);
}

#endif        /* __SMC_H */























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





















    1 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * IEEE 802.11 defines
 *
 * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen
 * <jkmaline@cc.hut.fi>
 * Copyright (c) 2002-2003, Jouni Malinen <jkmaline@cc.hut.fi>
 * Copyright (c) 2005, Devicescape Software, Inc.
 * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
 * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
 * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
 * Copyright (c) 2018 - 2024 Intel Corporation
 */

#ifndef LINUX_IEEE80211_H
#define LINUX_IEEE80211_H

#include <linux/types.h>
#include <linux/if_ether.h>
#include <linux/etherdevice.h>
#include <linux/bitfield.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>

/*
 * DS bit usage
 *
 * TA = transmitter address
 * RA = receiver address
 * DA = destination address
 * SA = source address
 *
 * ToDS    FromDS  A1(RA)  A2(TA)  A3      A4      Use
 * -----------------------------------------------------------------
 *  0       0       DA      SA      BSSID   -       IBSS/DLS
 *  0       1       DA      BSSID   SA      -       AP -> STA
 *  1       0       BSSID   SA      DA      -       AP <- STA
 *  1       1       RA      TA      DA      SA      unspecified (WDS)
 */

/* FCS length, in bytes */
#define FCS_LEN 4

/*
 * Bit masks for the 16-bit frame control field.  The ieee80211_has_*() and
 * ieee80211_is_*() helpers in this file apply them to a little-endian frame
 * control value after converting the mask with cpu_to_le16().
 */
#define IEEE80211_FCTL_VERS                0x0003
#define IEEE80211_FCTL_FTYPE                0x000c
#define IEEE80211_FCTL_STYPE                0x00f0
#define IEEE80211_FCTL_TODS                0x0100
#define IEEE80211_FCTL_FROMDS                0x0200
#define IEEE80211_FCTL_MOREFRAGS        0x0400
#define IEEE80211_FCTL_RETRY                0x0800
#define IEEE80211_FCTL_PM                0x1000
#define IEEE80211_FCTL_MOREDATA                0x2000
#define IEEE80211_FCTL_PROTECTED        0x4000
#define IEEE80211_FCTL_ORDER                0x8000
/*
 * Overlaps the TODS..RETRY bits above; presumably only meaningful for
 * control extension frames (see the IEEE80211_CTL_EXT_* values).
 */
#define IEEE80211_FCTL_CTL_EXT                0x0f00

/* Bit masks for the 16-bit sequence control field */
#define IEEE80211_SCTL_FRAG                0x000F
#define IEEE80211_SCTL_SEQ                0xFFF0

/* Frame type values, to be compared against (fc & IEEE80211_FCTL_FTYPE) */
#define IEEE80211_FTYPE_MGMT                0x0000
#define IEEE80211_FTYPE_CTL                0x0004
#define IEEE80211_FTYPE_DATA                0x0008
#define IEEE80211_FTYPE_EXT                0x000c

/*
 * Frame subtype values, to be compared against (fc & IEEE80211_FCTL_STYPE).
 * The same numeric value is reused by different frame types (e.g. 0x0020 is
 * REASSOC_REQ, TRIGGER and DATA_CFPOLL), so a subtype is only meaningful
 * together with its IEEE80211_FTYPE_* value.
 */

/* management */
#define IEEE80211_STYPE_ASSOC_REQ        0x0000
#define IEEE80211_STYPE_ASSOC_RESP        0x0010
#define IEEE80211_STYPE_REASSOC_REQ        0x0020
#define IEEE80211_STYPE_REASSOC_RESP        0x0030
#define IEEE80211_STYPE_PROBE_REQ        0x0040
#define IEEE80211_STYPE_PROBE_RESP        0x0050
#define IEEE80211_STYPE_BEACON                0x0080
#define IEEE80211_STYPE_ATIM                0x0090
#define IEEE80211_STYPE_DISASSOC        0x00A0
#define IEEE80211_STYPE_AUTH                0x00B0
#define IEEE80211_STYPE_DEAUTH                0x00C0
#define IEEE80211_STYPE_ACTION                0x00D0

/* control */
#define IEEE80211_STYPE_TRIGGER                0x0020
#define IEEE80211_STYPE_CTL_EXT                0x0060
#define IEEE80211_STYPE_BACK_REQ        0x0080
#define IEEE80211_STYPE_BACK                0x0090
#define IEEE80211_STYPE_PSPOLL                0x00A0
#define IEEE80211_STYPE_RTS                0x00B0
#define IEEE80211_STYPE_CTS                0x00C0
#define IEEE80211_STYPE_ACK                0x00D0
#define IEEE80211_STYPE_CFEND                0x00E0
#define IEEE80211_STYPE_CFENDACK        0x00F0

/*
 * data
 *
 * Bit 0x0080 is set in all QOS_* subtypes; bit 0x0040 is set in all subtypes
 * that ieee80211_is_data_present() treats as carrying no data.
 */
#define IEEE80211_STYPE_DATA                        0x0000
#define IEEE80211_STYPE_DATA_CFACK                0x0010
#define IEEE80211_STYPE_DATA_CFPOLL                0x0020
#define IEEE80211_STYPE_DATA_CFACKPOLL                0x0030
#define IEEE80211_STYPE_NULLFUNC                0x0040
#define IEEE80211_STYPE_CFACK                        0x0050
#define IEEE80211_STYPE_CFPOLL                        0x0060
#define IEEE80211_STYPE_CFACKPOLL                0x0070
#define IEEE80211_STYPE_QOS_DATA                0x0080
#define IEEE80211_STYPE_QOS_DATA_CFACK                0x0090
#define IEEE80211_STYPE_QOS_DATA_CFPOLL                0x00A0
#define IEEE80211_STYPE_QOS_DATA_CFACKPOLL        0x00B0
#define IEEE80211_STYPE_QOS_NULLFUNC                0x00C0
#define IEEE80211_STYPE_QOS_CFACK                0x00D0
#define IEEE80211_STYPE_QOS_CFPOLL                0x00E0
#define IEEE80211_STYPE_QOS_CFACKPOLL                0x00F0

/* extension, added by 802.11ad */
#define IEEE80211_STYPE_DMG_BEACON                0x0000
#define IEEE80211_STYPE_S1G_BEACON                0x0010

/*
 * bits unique to S1G beacon
 *
 * Tested against the frame control field by ieee80211_is_s1g_short_beacon().
 */
#define IEEE80211_S1G_BCN_NEXT_TBTT        0x100

/*
 * see 802.11ah-2016 9.9 NDP CMAC frames
 *
 * Size of the 1 MHz and 2 MHz NDP CMAC frame variants, given both in bits
 * and in the number of bytes needed to hold those bits.
 */
#define IEEE80211_S1G_1MHZ_NDP_BITS        25
#define IEEE80211_S1G_1MHZ_NDP_BYTES        4
#define IEEE80211_S1G_2MHZ_NDP_BITS        37
#define IEEE80211_S1G_2MHZ_NDP_BYTES        5

/*
 * NDP CMAC frame type values; presumably these are the values carried in the
 * 3-bit IEEE80211_NDP_FTYPE field.  Note that CTS and CF_END share value 0.
 */
#define IEEE80211_NDP_FTYPE_CTS                        0
#define IEEE80211_NDP_FTYPE_CF_END                0
#define IEEE80211_NDP_FTYPE_PS_POLL                1
#define IEEE80211_NDP_FTYPE_ACK                        2
#define IEEE80211_NDP_FTYPE_PS_POLL_ACK                3
#define IEEE80211_NDP_FTYPE_BA                        4
#define IEEE80211_NDP_FTYPE_BF_REPORT_POLL        5
#define IEEE80211_NDP_FTYPE_PAGING                6
#define IEEE80211_NDP_FTYPE_PREQ                7

/*
 * SM64 - place a value into a 64-bit bit-field
 * @f: field mask macro name; a companion macro named <f>_S must give the
 *     field's shift (bit position of its least significant bit)
 * @v: value to store in the field
 *
 * Evaluates to @v widened to u64, shifted left by <f>_S and masked with @f.
 * @v is parenthesised before the cast so that compound arguments (e.g. a
 * conditional expression) are widened as a whole before being shifted.
 */
#define SM64(f, v)        ((((u64)(v)) << f##_S) & (f))

/*
 * NDP CMAC frame fields
 *
 * Each field is described by a mask macro and a companion <mask>_S macro
 * holding the bit position of the field's least significant bit, which is
 * the naming convention SM64() relies on.
 */
#define IEEE80211_NDP_FTYPE                    0x0000000000000007
#define IEEE80211_NDP_FTYPE_S                  0x0000000000000000

/* 1M Probe Request 11ah 9.9.3.1.1 */
#define IEEE80211_NDP_1M_PREQ_ANO      0x0000000000000008
#define IEEE80211_NDP_1M_PREQ_ANO_S                     3
#define IEEE80211_NDP_1M_PREQ_CSSID    0x00000000000FFFF0
#define IEEE80211_NDP_1M_PREQ_CSSID_S                   4
#define IEEE80211_NDP_1M_PREQ_RTYPE    0x0000000000100000
#define IEEE80211_NDP_1M_PREQ_RTYPE_S                  20
/* reserved bits 21-24; the shift follows directly after RTYPE (bit 20) */
#define IEEE80211_NDP_1M_PREQ_RSV      0x0000000001E00000
#define IEEE80211_NDP_1M_PREQ_RSV_S                    21
/* 2M Probe Request 11ah 9.9.3.1.2 */
#define IEEE80211_NDP_2M_PREQ_ANO      0x0000000000000008
#define IEEE80211_NDP_2M_PREQ_ANO_S                     3
#define IEEE80211_NDP_2M_PREQ_CSSID    0x0000000FFFFFFFF0
#define IEEE80211_NDP_2M_PREQ_CSSID_S                   4
#define IEEE80211_NDP_2M_PREQ_RTYPE    0x0000001000000000
#define IEEE80211_NDP_2M_PREQ_RTYPE_S                  36

#define IEEE80211_ANO_NETTYPE_WILD              15

/*
 * bits unique to S1G beacon
 *
 * NOTE(review): this repeats the definition of IEEE80211_S1G_BCN_NEXT_TBTT
 * found earlier in this file with the same value; the redefinition is benign
 * but redundant.
 */
#define IEEE80211_S1G_BCN_NEXT_TBTT    0x100

/*
 * control extension - for IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTL_EXT
 *
 * All values fall inside the 0xf000 nibble of the frame control field.
 */
#define IEEE80211_CTL_EXT_POLL                0x2000
#define IEEE80211_CTL_EXT_SPR                0x3000
#define IEEE80211_CTL_EXT_GRANT        0x4000
#define IEEE80211_CTL_EXT_DMG_CTS        0x5000
#define IEEE80211_CTL_EXT_DMG_DTS        0x6000
#define IEEE80211_CTL_EXT_SSW                0x8000
#define IEEE80211_CTL_EXT_SSW_FBACK        0x9000
#define IEEE80211_CTL_EXT_SSW_ACK        0xa000


/*
 * Sequence numbers are the IEEE80211_SCTL_SEQ part (0xFFF0) of the sequence
 * control field shifted down by 4, so they range over
 * [0, IEEE80211_MAX_SN] = [0, 0xFFF] and wrap modulo IEEE80211_SN_MODULO
 * (0x1000).
 */
#define IEEE80211_SN_MASK                ((IEEE80211_SCTL_SEQ) >> 4)
#define IEEE80211_MAX_SN                IEEE80211_SN_MASK
#define IEEE80211_SN_MODULO                (IEEE80211_MAX_SN + 1)


/*
 * PV1 Layout IEEE 802.11-2020 9.8.3.1
 *
 * Frame control bit masks for the PV1 layout.  Compared with the
 * IEEE80211_FCTL_* masks, the type field is three bits wide (0x001c), the
 * subtype field is 0x00e0 and the flag bits sit at different positions.
 */
#define IEEE80211_PV1_FCTL_VERS                0x0003
#define IEEE80211_PV1_FCTL_FTYPE        0x001c
#define IEEE80211_PV1_FCTL_STYPE        0x00e0
#define IEEE80211_PV1_FCTL_FROMDS                0x0100
#define IEEE80211_PV1_FCTL_MOREFRAGS        0x0200
#define IEEE80211_PV1_FCTL_PM                0x0400
#define IEEE80211_PV1_FCTL_MOREDATA        0x0800
#define IEEE80211_PV1_FCTL_PROTECTED        0x1000
#define IEEE80211_PV1_FCTL_END_SP       0x2000
#define IEEE80211_PV1_FCTL_RELAYED      0x4000
#define IEEE80211_PV1_FCTL_ACK_POLICY   0x8000
/* overlaps the FROMDS..MOREDATA bits above */
#define IEEE80211_PV1_FCTL_CTL_EXT        0x0f00

/**
 * ieee80211_sn_less - wrap-aware "less than" for sequence numbers
 * @sn1: first sequence number
 * @sn2: second sequence number
 *
 * Return: true if (@sn1 - @sn2), reduced with IEEE80211_SN_MASK, is strictly
 * greater than half of IEEE80211_SN_MODULO, i.e. @sn1 lies behind @sn2 in
 * the circular sequence number space.
 */
static inline bool ieee80211_sn_less(u16 sn1, u16 sn2)
{
        const u16 half_space = IEEE80211_SN_MODULO >> 1;
        const u16 distance = (sn1 - sn2) & IEEE80211_SN_MASK;

        return distance > half_space;
}

/**
 * ieee80211_sn_less_eq - wrap-aware "less than or equal" for sequence numbers
 * @sn1: first sequence number
 * @sn2: second sequence number
 *
 * Return: true if (@sn2 - @sn1), reduced with IEEE80211_SN_MASK, does not
 * exceed half of IEEE80211_SN_MODULO, i.e. @sn2 is at most half the circular
 * sequence number space ahead of @sn1 (equal values included).
 */
static inline bool ieee80211_sn_less_eq(u16 sn1, u16 sn2)
{
        const u16 half_space = IEEE80211_SN_MODULO >> 1;
        const u16 distance = (sn2 - sn1) & IEEE80211_SN_MASK;

        return distance <= half_space;
}

/**
 * ieee80211_sn_add - add two sequence numbers with wrap-around
 * @sn1: first operand
 * @sn2: second operand
 *
 * Return: (@sn1 + @sn2) reduced with IEEE80211_SN_MASK
 */
static inline u16 ieee80211_sn_add(u16 sn1, u16 sn2)
{
        const u16 wrapped = (sn1 + sn2) & IEEE80211_SN_MASK;

        return wrapped;
}

/**
 * ieee80211_sn_inc - advance a sequence number by one with wrap-around
 * @sn: sequence number to advance
 *
 * Return: the result of ieee80211_sn_add() for @sn and 1
 */
static inline u16 ieee80211_sn_inc(u16 sn)
{
        const u16 step = 1;

        return ieee80211_sn_add(sn, step);
}

/**
 * ieee80211_sn_sub - subtract two sequence numbers with wrap-around
 * @sn1: minuend
 * @sn2: subtrahend
 *
 * Return: (@sn1 - @sn2) reduced with IEEE80211_SN_MASK
 */
static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2)
{
        const u16 wrapped = (sn1 - sn2) & IEEE80211_SN_MASK;

        return wrapped;
}

/*
 * Convert between a sequence control value and the sequence number stored in
 * its IEEE80211_SCTL_SEQ bits (the upper 12 bits).  IEEE80211_SN_TO_SEQ()
 * leaves the fragment bits (IEEE80211_SCTL_FRAG) cleared.
 */
#define IEEE80211_SEQ_TO_SN(seq)        (((seq) & IEEE80211_SCTL_SEQ) >> 4)
#define IEEE80211_SN_TO_SEQ(ssn)        (((ssn) << 4) & IEEE80211_SCTL_SEQ)

/* miscellaneous IEEE 802.11 constants */
#define IEEE80211_MAX_FRAG_THRESHOLD        2352
#define IEEE80211_MAX_RTS_THRESHOLD        2353
#define IEEE80211_MAX_AID                2007
#define IEEE80211_MAX_AID_S1G                8191
#define IEEE80211_MAX_TIM_LEN                251
#define IEEE80211_MAX_MESH_PEERINGS        63
/*
 * Maximum size for the MA-UNITDATA primitive, 802.11 standard section
 * 6.2.1.1.2.
 *
 * 802.11e clarifies the figure in section 7.1.2. The frame body is
 * up to 2304 octets long (maximum MSDU size) plus any crypt overhead.
 */
#define IEEE80211_MAX_DATA_LEN                2304
/* 802.11ad extends maximum MSDU size for DMG (freq > 40 GHz) networks
 * to 7920 bytes, see 8.2.3 General frame format
 */
#define IEEE80211_MAX_DATA_LEN_DMG        7920
/*
 * 30 byte 4 addr hdr, 2 byte QoS, 2304 byte MSDU, 12 byte crypt, 4 byte FCS
 * (30 + 2 + 2304 + 12 + 4 = 2352)
 */
#define IEEE80211_MAX_FRAME_LEN                2352

/* Maximal size of an A-MSDU that can be transported in a HT BA session */
#define IEEE80211_MAX_MPDU_LEN_HT_BA                4095

/* Maximal size of an A-MSDU */
#define IEEE80211_MAX_MPDU_LEN_HT_3839                3839
#define IEEE80211_MAX_MPDU_LEN_HT_7935                7935

/* VHT length limits; the name suffix mirrors the value */
#define IEEE80211_MAX_MPDU_LEN_VHT_3895                3895
#define IEEE80211_MAX_MPDU_LEN_VHT_7991                7991
#define IEEE80211_MAX_MPDU_LEN_VHT_11454        11454

#define IEEE80211_MAX_SSID_LEN                32

#define IEEE80211_MAX_MESH_ID_LEN        32

#define IEEE80211_FIRST_TSPEC_TSID        8
#define IEEE80211_NUM_TIDS                16

/* number of user priorities 802.11 uses */
#define IEEE80211_NUM_UPS                8
/* number of ACs */
#define IEEE80211_NUM_ACS                4

/*
 * Length of the QoS control field; it is declared as a __le16 (qos_ctrl) in
 * struct ieee80211_qos_hdr.  The masks and values below apply to that field.
 */
#define IEEE80211_QOS_CTL_LEN                2
/* 1d tag mask */
#define IEEE80211_QOS_CTL_TAG1D_MASK                0x0007
/* TID mask */
#define IEEE80211_QOS_CTL_TID_MASK                0x000f
/* EOSP */
#define IEEE80211_QOS_CTL_EOSP                        0x0010
/* ACK policy: values of the two-bit field selected by ..._ACK_POLICY_MASK */
#define IEEE80211_QOS_CTL_ACK_POLICY_NORMAL        0x0000
#define IEEE80211_QOS_CTL_ACK_POLICY_NOACK        0x0020
#define IEEE80211_QOS_CTL_ACK_POLICY_NO_EXPL        0x0040
#define IEEE80211_QOS_CTL_ACK_POLICY_BLOCKACK        0x0060
#define IEEE80211_QOS_CTL_ACK_POLICY_MASK        0x0060
/* A-MSDU 802.11n */
#define IEEE80211_QOS_CTL_A_MSDU_PRESENT        0x0080
/* Mesh Control 802.11s */
#define IEEE80211_QOS_CTL_MESH_CONTROL_PRESENT  0x0100

/* Mesh Power Save Level */
#define IEEE80211_QOS_CTL_MESH_PS_LEVEL                0x0200
/* Mesh Receiver Service Period Initiated */
#define IEEE80211_QOS_CTL_RSPI                        0x0400

/* U-APSD queue for WMM IEs sent by AP */
#define IEEE80211_WMM_IE_AP_QOSINFO_UAPSD        (1<<7)
#define IEEE80211_WMM_IE_AP_QOSINFO_PARAM_SET_CNT_MASK        0x0f

/* U-APSD queues for WMM IEs sent by STA (one bit per AC, bits 0-3) */
#define IEEE80211_WMM_IE_STA_QOSINFO_AC_VO        (1<<0)
#define IEEE80211_WMM_IE_STA_QOSINFO_AC_VI        (1<<1)
#define IEEE80211_WMM_IE_STA_QOSINFO_AC_BK        (1<<2)
#define IEEE80211_WMM_IE_STA_QOSINFO_AC_BE        (1<<3)
#define IEEE80211_WMM_IE_STA_QOSINFO_AC_MASK        0x0f

/*
 * U-APSD max SP length for WMM IEs sent by STA
 *
 * The SP_* values and SP_MASK are unshifted; presumably they are combined
 * with IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT to reach their position in the
 * QoS info byte - confirm against the users of these macros.
 */
#define IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL        0x00
#define IEEE80211_WMM_IE_STA_QOSINFO_SP_2        0x01
#define IEEE80211_WMM_IE_STA_QOSINFO_SP_4        0x02
#define IEEE80211_WMM_IE_STA_QOSINFO_SP_6        0x03
#define IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK        0x03
#define IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT        5

/* NOTE(review): presumably the HT control field length in bytes - confirm */
#define IEEE80211_HT_CTL_LEN                4

/* trigger type within common_info of trigger frame (low four bits) */
#define IEEE80211_TRIGGER_TYPE_MASK                0xf
#define IEEE80211_TRIGGER_TYPE_BASIC                0x0
#define IEEE80211_TRIGGER_TYPE_BFRP                0x1
#define IEEE80211_TRIGGER_TYPE_MU_BAR                0x2
#define IEEE80211_TRIGGER_TYPE_MU_RTS                0x3
#define IEEE80211_TRIGGER_TYPE_BSRP                0x4
#define IEEE80211_TRIGGER_TYPE_GCR_MU_BAR        0x5
#define IEEE80211_TRIGGER_TYPE_BQRP                0x6
#define IEEE80211_TRIGGER_TYPE_NFRP                0x7

/*
 * UL-bandwidth within common_info of trigger frame
 *
 * The mask selects bits 18-19; the ULBW_* values are unshifted field values,
 * i.e. what is obtained after extracting the masked field.
 */
#define IEEE80211_TRIGGER_ULBW_MASK                0xc0000
#define IEEE80211_TRIGGER_ULBW_20MHZ                0x0
#define IEEE80211_TRIGGER_ULBW_40MHZ                0x1
#define IEEE80211_TRIGGER_ULBW_80MHZ                0x2
#define IEEE80211_TRIGGER_ULBW_160_80P80MHZ        0x3

/*
 * 802.11 header with all four address fields.  See the "DS bit usage" table
 * near the top of this file for what addr1..addr4 carry depending on the
 * ToDS/FromDS bits.  The structure is packed and 2-byte aligned.
 */
struct ieee80211_hdr {
        __le16 frame_control;        /* tested with IEEE80211_FCTL_* masks */
        __le16 duration_id;
        /* addr1..addr3 are also accessible as one group named "addrs" */
        struct_group(addrs,
                u8 addr1[ETH_ALEN];
                u8 addr2[ETH_ALEN];
                u8 addr3[ETH_ALEN];
        );
        __le16 seq_ctrl;        /* see IEEE80211_SCTL_FRAG / IEEE80211_SCTL_SEQ */
        u8 addr4[ETH_ALEN];
} __packed __aligned(2);

/*
 * 802.11 header with three address fields: the same leading members as
 * struct ieee80211_hdr, ending after seq_ctrl (no addr4).  Packed and
 * 2-byte aligned.
 */
struct ieee80211_hdr_3addr {
        __le16 frame_control;
        __le16 duration_id;
        u8 addr1[ETH_ALEN];
        u8 addr2[ETH_ALEN];
        u8 addr3[ETH_ALEN];
        __le16 seq_ctrl;
} __packed __aligned(2);

/*
 * Three-address 802.11 header followed by the 16-bit QoS control field.
 * Packed and 2-byte aligned.
 */
struct ieee80211_qos_hdr {
        __le16 frame_control;
        __le16 duration_id;
        u8 addr1[ETH_ALEN];
        u8 addr2[ETH_ALEN];
        u8 addr3[ETH_ALEN];
        __le16 seq_ctrl;
        __le16 qos_ctrl;        /* see the IEEE80211_QOS_CTL_* definitions */
} __packed __aligned(2);

/*
 * Four-address 802.11 header followed by the 16-bit QoS control field.
 * Packed and 2-byte aligned.
 */
struct ieee80211_qos_hdr_4addr {
        __le16 frame_control;
        __le16 duration_id;
        u8 addr1[ETH_ALEN];
        u8 addr2[ETH_ALEN];
        u8 addr3[ETH_ALEN];
        __le16 seq_ctrl;
        u8 addr4[ETH_ALEN];
        __le16 qos_ctrl;        /* see the IEEE80211_QOS_CTL_* definitions */
} __packed __aligned(2);

/*
 * Fixed leading part of a trigger frame (see ieee80211_is_trigger()).
 * Packed and 2-byte aligned.
 */
struct ieee80211_trigger {
        __le16 frame_control;
        __le16 duration;
        u8 ra[ETH_ALEN];
        u8 ta[ETH_ALEN];
        /* fields selected by IEEE80211_TRIGGER_TYPE_MASK / _ULBW_MASK live here */
        __le64 common_info;
        /* remaining frame contents; layout is not described by this struct */
        u8 variable[];
} __packed __aligned(2);

/**
 * ieee80211_has_tods - test the to-DS bit of a frame control field
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if IEEE80211_FCTL_TODS is set in @fc, false otherwise
 */
static inline bool ieee80211_has_tods(__le16 fc)
{
        const __le16 tods_bit = cpu_to_le16(IEEE80211_FCTL_TODS);

        return (fc & tods_bit) != 0;
}

/**
 * ieee80211_has_fromds - test the from-DS bit of a frame control field
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if IEEE80211_FCTL_FROMDS is set in @fc, false otherwise
 */
static inline bool ieee80211_has_fromds(__le16 fc)
{
        const __le16 fromds_bit = cpu_to_le16(IEEE80211_FCTL_FROMDS);

        return (fc & fromds_bit) != 0;
}

/**
 * ieee80211_has_a4 - test whether both to-DS and from-DS bits are set
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true only if IEEE80211_FCTL_TODS and IEEE80211_FCTL_FROMDS are
 * both set in @fc, which marks a 4-address frame
 */
static inline bool ieee80211_has_a4(__le16 fc)
{
        const __le16 both_ds = cpu_to_le16(IEEE80211_FCTL_TODS |
                                           IEEE80211_FCTL_FROMDS);
        const __le16 ds_bits = fc & both_ds;

        return ds_bits == both_ds;
}

/**
 * ieee80211_has_morefrags - test the more-fragments bit
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if IEEE80211_FCTL_MOREFRAGS is set in @fc, false otherwise
 */
static inline bool ieee80211_has_morefrags(__le16 fc)
{
        const __le16 morefrags_bit = cpu_to_le16(IEEE80211_FCTL_MOREFRAGS);

        return (fc & morefrags_bit) != 0;
}

/**
 * ieee80211_has_retry - test the retry bit
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if IEEE80211_FCTL_RETRY is set in @fc, false otherwise
 */
static inline bool ieee80211_has_retry(__le16 fc)
{
        const __le16 retry_bit = cpu_to_le16(IEEE80211_FCTL_RETRY);

        return (fc & retry_bit) != 0;
}

/**
 * ieee80211_has_pm - test the power management bit
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if IEEE80211_FCTL_PM is set in @fc, false otherwise
 */
static inline bool ieee80211_has_pm(__le16 fc)
{
        const __le16 pm_bit = cpu_to_le16(IEEE80211_FCTL_PM);

        return (fc & pm_bit) != 0;
}

/**
 * ieee80211_has_moredata - test the more-data bit
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if IEEE80211_FCTL_MOREDATA is set in @fc, false otherwise
 */
static inline bool ieee80211_has_moredata(__le16 fc)
{
        const __le16 moredata_bit = cpu_to_le16(IEEE80211_FCTL_MOREDATA);

        return (fc & moredata_bit) != 0;
}

/**
 * ieee80211_has_protected - test the protected bit
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if IEEE80211_FCTL_PROTECTED is set in @fc, false otherwise
 */
static inline bool ieee80211_has_protected(__le16 fc)
{
        const __le16 protected_bit = cpu_to_le16(IEEE80211_FCTL_PROTECTED);

        return (fc & protected_bit) != 0;
}

/**
 * ieee80211_has_order - test the order bit
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if IEEE80211_FCTL_ORDER is set in @fc, false otherwise
 */
static inline bool ieee80211_has_order(__le16 fc)
{
        const __le16 order_bit = cpu_to_le16(IEEE80211_FCTL_ORDER);

        return (fc & order_bit) != 0;
}

/**
 * ieee80211_is_mgmt - test for frame type IEEE80211_FTYPE_MGMT
 * @fc: frame control field in little-endian byteorder
 *
 * Only the type bits (IEEE80211_FCTL_FTYPE) are examined.
 *
 * Return: true if the frame type is management, false otherwise
 */
static inline bool ieee80211_is_mgmt(__le16 fc)
{
        const __le16 type_mask = cpu_to_le16(IEEE80211_FCTL_FTYPE);
        const __le16 mgmt_type = cpu_to_le16(IEEE80211_FTYPE_MGMT);

        return (fc & type_mask) == mgmt_type;
}

/**
 * ieee80211_is_ctl - test for frame type IEEE80211_FTYPE_CTL
 * @fc: frame control field in little-endian byteorder
 *
 * Only the type bits (IEEE80211_FCTL_FTYPE) are examined.
 *
 * Return: true if the frame type is control, false otherwise
 */
static inline bool ieee80211_is_ctl(__le16 fc)
{
        const __le16 type_mask = cpu_to_le16(IEEE80211_FCTL_FTYPE);
        const __le16 ctl_type = cpu_to_le16(IEEE80211_FTYPE_CTL);

        return (fc & type_mask) == ctl_type;
}

/**
 * ieee80211_is_data - test for frame type IEEE80211_FTYPE_DATA
 * @fc: frame control field in little-endian byteorder
 *
 * Only the type bits (IEEE80211_FCTL_FTYPE) are examined.
 *
 * Return: true if the frame type is data, false otherwise
 */
static inline bool ieee80211_is_data(__le16 fc)
{
        const __le16 type_mask = cpu_to_le16(IEEE80211_FCTL_FTYPE);
        const __le16 data_type = cpu_to_le16(IEEE80211_FTYPE_DATA);

        return (fc & type_mask) == data_type;
}

/**
 * ieee80211_is_ext - test for frame type IEEE80211_FTYPE_EXT
 * @fc: frame control field in little-endian byteorder
 *
 * Only the type bits (IEEE80211_FCTL_FTYPE) are examined.
 *
 * Return: true if the frame type is extension, false otherwise
 */
static inline bool ieee80211_is_ext(__le16 fc)
{
        const __le16 type_mask = cpu_to_le16(IEEE80211_FCTL_FTYPE);
        const __le16 ext_type = cpu_to_le16(IEEE80211_FTYPE_EXT);

        return (fc & type_mask) == ext_type;
}


/**
 * ieee80211_is_data_qos - test for a data frame with the QoS subtype bit set
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if the frame type is IEEE80211_FTYPE_DATA and the
 * IEEE80211_STYPE_QOS_DATA bit is set in the subtype, false otherwise
 */
static inline bool ieee80211_is_data_qos(__le16 fc)
{
        /*
         * The QoS property is a single subtype bit, so the mask is built
         * from IEEE80211_STYPE_QOS_DATA instead of the whole
         * IEEE80211_FCTL_STYPE field.
         */
        const __le16 type_and_qos_bit = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                    IEEE80211_STYPE_QOS_DATA);
        const __le16 qos_data = cpu_to_le16(IEEE80211_FTYPE_DATA |
                                            IEEE80211_STYPE_QOS_DATA);

        return (fc & type_and_qos_bit) == qos_data;
}

/**
 * ieee80211_is_data_present - test for a data frame whose subtype has data
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if the frame type is IEEE80211_FTYPE_DATA and subtype bit
 * 0x40 is clear (i.e. it is not one of the null-data subtypes); the QoS
 * subtype bit is not examined
 */
static inline bool ieee80211_is_data_present(__le16 fc)
{
        /*
         * Subtype bit 0x40 is set for every data subtype without payload,
         * so requiring it to be clear leaves only the data-containing
         * subtypes.
         */
        const __le16 type_and_nodata_bit = cpu_to_le16(IEEE80211_FCTL_FTYPE | 0x40);
        const __le16 data_type = cpu_to_le16(IEEE80211_FTYPE_DATA);

        return (fc & type_and_nodata_bit) == data_type;
}

/**
 * ieee80211_is_assoc_req - test for a management / ASSOC_REQ frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_REQ (association request)
 */
static inline bool ieee80211_is_assoc_req(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                          IEEE80211_STYPE_ASSOC_REQ);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_assoc_resp - test for a management / ASSOC_RESP frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_RESP (association response)
 */
static inline bool ieee80211_is_assoc_resp(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                          IEEE80211_STYPE_ASSOC_RESP);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_reassoc_req - test for a management / REASSOC_REQ frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_REQ (reassociation request)
 */
static inline bool ieee80211_is_reassoc_req(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                          IEEE80211_STYPE_REASSOC_REQ);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_reassoc_resp - test for a management / REASSOC_RESP frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_RESP (reassociation
 * response)
 */
static inline bool ieee80211_is_reassoc_resp(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                          IEEE80211_STYPE_REASSOC_RESP);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_probe_req - test for a management / PROBE_REQ frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ (probe request)
 */
static inline bool ieee80211_is_probe_req(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                          IEEE80211_STYPE_PROBE_REQ);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_probe_resp - test for a management / PROBE_RESP frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP (probe response)
 */
static inline bool ieee80211_is_probe_resp(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                          IEEE80211_STYPE_PROBE_RESP);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_beacon - test for a management / BEACON frame
 * @fc: frame control field in little-endian byteorder
 *
 * S1G beacons use the extension frame type and are not matched here; see
 * ieee80211_is_s1g_beacon().
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON
 */
static inline bool ieee80211_is_beacon(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                          IEEE80211_STYPE_BEACON);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_s1g_beacon - test for an extension / S1G_BEACON frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON
 */
static inline bool ieee80211_is_s1g_beacon(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_EXT |
                                          IEEE80211_STYPE_S1G_BEACON);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_s1g_short_beacon - test for an S1G short beacon
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if @fc describes an S1G beacon (ieee80211_is_s1g_beacon())
 * that additionally has the IEEE80211_S1G_BCN_NEXT_TBTT flag set
 */
static inline bool ieee80211_is_s1g_short_beacon(__le16 fc)
{
        const __le16 next_tbtt_bit = cpu_to_le16(IEEE80211_S1G_BCN_NEXT_TBTT);

        return ieee80211_is_s1g_beacon(fc) && (fc & next_tbtt_bit) != 0;
}

/**
 * ieee80211_is_atim - test for a management / ATIM frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ATIM
 */
static inline bool ieee80211_is_atim(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                          IEEE80211_STYPE_ATIM);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_disassoc - test for a management / DISASSOC frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_DISASSOC (disassociation)
 */
static inline bool ieee80211_is_disassoc(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                          IEEE80211_STYPE_DISASSOC);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_auth - test for a management / AUTH frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_AUTH (authentication)
 */
static inline bool ieee80211_is_auth(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                          IEEE80211_STYPE_AUTH);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_deauth - test for a management / DEAUTH frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_DEAUTH (deauthentication)
 */
static inline bool ieee80211_is_deauth(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                          IEEE80211_STYPE_DEAUTH);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_action - test for a management / ACTION frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION
 */
static inline bool ieee80211_is_action(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                          IEEE80211_STYPE_ACTION);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_back_req - test for a control / BACK_REQ frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_CTL | IEEE80211_STYPE_BACK_REQ (block-ACK request)
 */
static inline bool ieee80211_is_back_req(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_CTL |
                                          IEEE80211_STYPE_BACK_REQ);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_back - test for a control / BACK frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_CTL | IEEE80211_STYPE_BACK (block-ACK)
 */
static inline bool ieee80211_is_back(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_CTL |
                                          IEEE80211_STYPE_BACK);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_pspoll - test for a control / PSPOLL frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL (PS-poll)
 */
static inline bool ieee80211_is_pspoll(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_CTL |
                                          IEEE80211_STYPE_PSPOLL);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_rts - test for a control / RTS frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_CTL | IEEE80211_STYPE_RTS
 */
static inline bool ieee80211_is_rts(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_CTL |
                                          IEEE80211_STYPE_RTS);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_cts - test for a control / CTS frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTS
 */
static inline bool ieee80211_is_cts(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_CTL |
                                          IEEE80211_STYPE_CTS);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_ack - test for a control / ACK frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_CTL | IEEE80211_STYPE_ACK
 */
static inline bool ieee80211_is_ack(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_CTL |
                                          IEEE80211_STYPE_ACK);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_cfend - test for a control / CFEND frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CFEND (CF-end)
 */
static inline bool ieee80211_is_cfend(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_CTL |
                                          IEEE80211_STYPE_CFEND);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_cfendack - test for a control / CFENDACK frame
 * @fc: frame control field in little-endian byteorder
 *
 * Return: true if type and subtype of @fc equal
 * IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CFENDACK (CF-end-ack)
 */
static inline bool ieee80211_is_cfendack(__le16 fc)
{
        const __le16 type_subtype = cpu_to_le16(IEEE80211_FCTL_FTYPE |
                                                IEEE80211_FCTL_STYPE);
        const __le16 wanted = cpu_to_le16(IEEE80211_FTYPE_CTL |
                                          IEEE80211_STYPE_CFENDACK);

        return (fc & type_subtype) == wanted;
}

/**
 * ieee80211_is_nullfunc - test for a regular (non-QoS) nullfunc data frame
 * @fc: frame control bytes in little-endian byteorder
 *
 * Only the type and subtype bits of @fc are examined.
 *
 * Return: true if the type is IEEE80211_FTYPE_DATA and the subtype is
 *        IEEE80211_STYPE_NULLFUNC, false otherwise
 */
static inline bool ieee80211_is_nullfunc(__le16 fc)
{
        const __le16 type_mask =
                cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE);
        const __le16 nullfunc_type =
                cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC);

        return (fc & type_mask) == nullfunc_type;
}

/**
 * ieee80211_is_qos_nullfunc - test for a QoS nullfunc data frame
 * @fc: frame control bytes in little-endian byteorder
 *
 * Only the type and subtype bits of @fc are examined.
 *
 * Return: true if the type is IEEE80211_FTYPE_DATA and the subtype is
 *        IEEE80211_STYPE_QOS_NULLFUNC, false otherwise
 */
static inline bool ieee80211_is_qos_nullfunc(__le16 fc)
{
        const __le16 type_mask =
                cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE);
        const __le16 qos_nullfunc_type =
                cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC);

        return (fc & type_mask) == qos_nullfunc_type;
}

/**
 * ieee80211_is_trigger - test whether a frame is a trigger control frame
 * @fc: frame control field in little-endian byteorder
 *
 * Only the type and subtype bits of @fc are examined.
 *
 * Return: true if the type is IEEE80211_FTYPE_CTL and the subtype is
 *        IEEE80211_STYPE_TRIGGER, false otherwise
 */
static inline bool ieee80211_is_trigger(__le16 fc)
{
        const __le16 type_mask =
                cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE);
        const __le16 trigger_type =
                cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_TRIGGER);

        return (fc & type_mask) == trigger_type;
}

/**
 * ieee80211_is_any_nullfunc - test for a nullfunc frame of either flavour
 * @fc: frame control bytes in little-endian byteorder
 *
 * Return: true if ieee80211_is_nullfunc() or ieee80211_is_qos_nullfunc()
 *        accepts @fc, false otherwise
 */
static inline bool ieee80211_is_any_nullfunc(__le16 fc)
{
        if (ieee80211_is_nullfunc(fc))
                return true;

        return ieee80211_is_qos_nullfunc(fc);
}

/**
 * ieee80211_is_first_frag - test that no IEEE80211_SCTL_FRAG bit is set
 * @seq_ctrl: frame sequence control bytes in little-endian byteorder
 *
 * Return: true when all IEEE80211_SCTL_FRAG bits of @seq_ctrl are clear,
 *        i.e. the frame is the first fragment (which also covers a frame
 *        that is not fragmented at all)
 */
static inline bool ieee80211_is_first_frag(__le16 seq_ctrl)
{
        const __le16 frag_bits = seq_ctrl & cpu_to_le16(IEEE80211_SCTL_FRAG);

        return !frag_bits;
}

/**
 * ieee80211_is_frag - check if a frame is a fragment
 * @hdr: 802.11 header of the frame
 * Return: whether or not the frame is a fragment
 */
static inline bool ieee80211_is_frag(struct ieee80211_hdr *hdr)
{
        return ieee80211_has_morefrags(hdr->frame_control) ||
               hdr->seq_ctrl & cpu_to_le16(IEEE80211_SCTL_FRAG);
}

static inline u16 ieee80211_get_sn(struct ieee80211_hdr *hdr)
{
        return le16_get_bits(hdr->seq_ctrl, IEEE80211_SCTL_SEQ);
}

/**
 * struct ieee80211s_hdr - mesh header
 * @flags: flags octet (NOTE(review): presumably carries the MESH_FLAGS_*
 *        bits defined next to this struct - confirm)
 * @ttl: time-to-live octet
 * @seqnum: sequence number, little-endian
 * @eaddr1: first extended address, ETH_ALEN octets
 * @eaddr2: second extended address, ETH_ALEN octets
 *
 * The layout is packed and the struct is aligned to 2 bytes.
 * NOTE(review): presumably the IEEE 802.11s Mesh Control field; confirm
 * against the users of this struct.
 */
struct ieee80211s_hdr {
        u8 flags;
        u8 ttl;
        __le32 seqnum;
        u8 eaddr1[ETH_ALEN];
        u8 eaddr2[ETH_ALEN];
} __packed __aligned(2);

/*
 * Mesh flags
 *
 * MESH_FLAGS_AE (0x3) is the union of MESH_FLAGS_AE_A4 (0x1) and
 * MESH_FLAGS_AE_A5_A6 (0x2), so it can be used as a mask over both.
 * NOTE(review): presumably these apply to ieee80211s_hdr.flags - confirm.
 */
#define MESH_FLAGS_AE_A4         0x1
#define MESH_FLAGS_AE_A5_A6        0x2
#define MESH_FLAGS_AE                0x3
#define MESH_FLAGS_PS_DEEP        0x4

/**
 * enum ieee80211_preq_flags - mesh PREQ element flags
 *
 * @IEEE80211_PREQ_PROACTIVE_PREP_FLAG: proactive PREP subfield
 *        (bit 2, value 0x04)
 */
enum ieee80211_preq_flags {
        IEEE80211_PREQ_PROACTIVE_PREP_FLAG        = 1<<2,
};

/**
 * enum ieee80211_preq_target_flags - mesh PREQ element per target flags
 *
 * @IEEE80211_PREQ_TO_FLAG: target only subfield (bit 0, value 0x01)
 * @IEEE80211_PREQ_USN_FLAG: unknown target HWMP sequence number subfield
 *        (bit 2, value 0x04)
 *
 * Bit 1 is not assigned a name in this enum.
 */
enum ieee80211_preq_target_flags {
        IEEE80211_PREQ_TO_FLAG        = 1<<0,
        IEEE80211_PREQ_USN_FLAG        = 1<<2,
};

/**
 * struct ieee80211_quiet_ie - Quiet element
 * @count: Quiet Count
 * @period: Quiet Period
 * @duration: Quiet Duration (little-endian)
 * @offset: Quiet Offset (little-endian)
 *
 * This structure represents the payload of the "Quiet element" as
 * described in IEEE Std 802.11-2020 section 9.4.2.22.
 *
 * The layout is packed: two single octets followed by two 16-bit
 * fields, six octets in total.
 */
struct ieee80211_quiet_ie {
        u8 count;
        u8 period;
        __le16 duration;
        __le16 offset;
} __packed;

/**
 * struct ieee80211_msrment_ie - Measurement element
 * @token: Measurement Token
 * @mode: Measurement Report Mode
 * @type: Measurement Type
 * @request: Measurement Request or Measurement Report; a flexible array,
 *        so sizeof() of this struct covers only the three fixed octets
 *
 * This structure represents the payload of both the "Measurement
 * Request element" and the "Measurement Report element" as described
 * in IEEE Std 802.11-2020 sections 9.4.2.20 and 9.4.2.21.
 */
struct ieee80211_msrment_ie {
        u8 token;
        u8 mode;
        u8 type;
        u8 request[];
} __packed;

/**
 * struct ieee80211_channel_sw_ie - Channel Switch Announcement element
 * @mode: Channel Switch Mode
 * @new_ch_num: New Channel Number
 * @count: Channel Switch Count
 *
 * This structure represents the payload of the "Channel Switch
 * Announcement element" as described in IEEE Std 802.11-2020 section
 * 9.4.2.18.
 *
 * The payload is three single-octet fields.
 */
struct ieee80211_channel_sw_ie {
        u8 mode;
        u8 new_ch_num;
        u8 count;
} __packed;

/**
 * struct ieee80211_ext_chansw_ie - Extended Channel Switch Announcement element
 * @mode: Channel Switch Mode
 * @new_operating_class: New Operating Class
 * @new_ch_num: New Channel Number
 * @count: Channel Switch Count
 *
 * This structure represents the "Extended Channel Switch Announcement
 * element" as described in IEEE Std 802.11-2020 section 9.4.2.52.
 *
 * The payload is four single-octet fields. It is also embedded directly
 * in the ext_chan_switch action frame body of struct ieee80211_mgmt.
 */
struct ieee80211_ext_chansw_ie {
        u8 mode;
        u8 new_operating_class;
        u8 new_ch_num;
        u8 count;
} __packed;

/**
 * struct ieee80211_sec_chan_offs_ie - secondary channel offset IE
 * @sec_chan_offs: secondary channel offset, uses IEEE80211_HT_PARAM_CHA_SEC_*
 *        values here
 *
 * This structure represents the payload of the "Secondary Channel Offset
 * element", which is a single octet.
 */
struct ieee80211_sec_chan_offs_ie {
        u8 sec_chan_offs;
} __packed;

/**
 * struct ieee80211_mesh_chansw_params_ie - mesh channel switch parameters IE
 * @mesh_ttl: Time To Live
 * @mesh_flags: Flags
 * @mesh_reason: Reason Code (little-endian)
 * @mesh_pre_value: Precedence Value (little-endian)
 *
 * This structure represents the payload of the "Mesh Channel Switch
 * Parameters element" as described in IEEE Std 802.11-2020 section
 * 9.4.2.102.
 *
 * The layout is packed: two single octets followed by two 16-bit
 * fields, six octets in total.
 */
struct ieee80211_mesh_chansw_params_ie {
        u8 mesh_ttl;
        u8 mesh_flags;
        __le16 mesh_reason;
        __le16 mesh_pre_value;
} __packed;

/**
 * struct ieee80211_wide_bw_chansw_ie - wide bandwidth channel switch IE
 * @new_channel_width: New Channel Width
 * @new_center_freq_seg0: New Channel Center Frequency Segment 0
 * @new_center_freq_seg1: New Channel Center Frequency Segment 1
 *
 * This structure represents the payload of the "Wide Bandwidth
 * Channel Switch element" as described in IEEE Std 802.11-2020
 * section 9.4.2.160.
 *
 * The payload is three single-octet fields, in the order listed above.
 */
struct ieee80211_wide_bw_chansw_ie {
        u8 new_channel_width;
        u8 new_center_freq_seg0, new_center_freq_seg1;
} __packed;

/**
 * struct ieee80211_tim_ie - Traffic Indication Map information element
 * @dtim_count: DTIM Count
 * @dtim_period: DTIM Period
 * @bitmap_ctrl: Bitmap Control
 * @required_octet: "Syntactic sugar" to force the struct size to the
 *                  minimum valid size when carried in a non-S1G PPDU
 * @virtual_map: Partial Virtual Bitmap
 *
 * This structure represents the payload of the "TIM element" as
 * described in IEEE Std 802.11-2020 section 9.4.2.5. Note that this
 * definition is only applicable when the element is carried in a
 * non-S1G PPDU. When the TIM is carried in an S1G PPDU, the Bitmap
 * Control and Partial Virtual Bitmap may not be present.
 *
 * @required_octet and @virtual_map share storage through the anonymous
 * union, so @required_octet overlays the first octet of the bitmap.
 */
struct ieee80211_tim_ie {
        u8 dtim_count;
        u8 dtim_period;
        u8 bitmap_ctrl;
        union {
                u8 required_octet;
                DECLARE_FLEX_ARRAY(u8, virtual_map);
        };
} __packed;

/**
 * struct ieee80211_meshconf_ie - Mesh Configuration element
 * @meshconf_psel: Active Path Selection Protocol Identifier
 * @meshconf_pmetric: Active Path Selection Metric Identifier
 * @meshconf_congest: Congestion Control Mode Identifier
 * @meshconf_synch: Synchronization Method Identifier
 * @meshconf_auth: Authentication Protocol Identifier
 * @meshconf_form: Mesh Formation Info
 * @meshconf_cap: Mesh Capability (see &enum mesh_config_capab_flags)
 *
 * This structure represents the payload of the "Mesh Configuration
 * element" as described in IEEE Std 802.11-2020 section 9.4.2.97.
 *
 * The payload is seven single-octet fields.
 */
struct ieee80211_meshconf_ie {
        u8 meshconf_psel;
        u8 meshconf_pmetric;
        u8 meshconf_congest;
        u8 meshconf_synch;
        u8 meshconf_auth;
        u8 meshconf_form;
        u8 meshconf_cap;
} __packed;

/**
 * enum mesh_config_capab_flags - Mesh Configuration IE capability field flags
 *
 * @IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS: STA is willing to establish
 *        additional mesh peerings with other mesh STAs (bit 0)
 * @IEEE80211_MESHCONF_CAPAB_FORWARDING: the STA forwards MSDUs (bit 3)
 * @IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING: TBTT adjustment procedure
 *        is ongoing (bit 5)
 * @IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL: STA is in deep sleep mode or has
 *        neighbors in deep sleep mode (bit 6)
 *
 * Enumerates the "Mesh Capability" as described in IEEE Std
 * 802.11-2020 section 9.4.2.97.7. Bits 1, 2, 4 and 7 have no name here.
 */
enum mesh_config_capab_flags {
        IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS                = 0x01,
        IEEE80211_MESHCONF_CAPAB_FORWARDING                = 0x08,
        IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING                = 0x20,
        IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL        = 0x40,
};

/*
 * Bit 0 flag.
 * NOTE(review): presumably tested against ieee80211_meshconf_ie.meshconf_form
 * ("Mesh Formation Info") - confirm against the users of this define.
 */
#define IEEE80211_MESHCONF_FORM_CONNECTED_TO_GATE 0x1

/*
 * Flag bits of the mesh channel switch parameters element (bits 0 to 2).
 *
 * NOTE(review): presumably these apply to the mesh_flags octet of
 * struct ieee80211_mesh_chansw_params_ie - confirm against the users.
 */
#define WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT BIT(0)
#define WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR BIT(1)
#define WLAN_EID_CHAN_SWITCH_PARAM_REASON BIT(2)

/**
 * struct ieee80211_rann_ie - RANN (root announcement) element
 * @rann_flags: Flags
 * @rann_hopcount: Hop Count
 * @rann_ttl: Element TTL
 * @rann_addr: Root Mesh STA Address
 * @rann_seq: HWMP Sequence Number (little-endian)
 * @rann_interval: Interval (little-endian)
 * @rann_metric: Metric (little-endian)
 *
 * This structure represents the payload of the "RANN element" as
 * described in IEEE Std 802.11-2020 section 9.4.2.111.
 *
 * The layout is packed, so the 32-bit fields directly follow the
 * ETH_ALEN-octet address without alignment padding.
 */
struct ieee80211_rann_ie {
        u8 rann_flags;
        u8 rann_hopcount;
        u8 rann_ttl;
        u8 rann_addr[ETH_ALEN];
        __le32 rann_seq;
        __le32 rann_interval;
        __le32 rann_metric;
} __packed;

/**
 * enum ieee80211_rann_flags - RANN flag bits
 * @RANN_FLAG_IS_GATE: bit 0 (NOTE(review): presumably set in
 *        ieee80211_rann_ie.rann_flags when the root is a gate - confirm)
 */
enum ieee80211_rann_flags {
        RANN_FLAG_IS_GATE = 1 << 0,
};

/**
 * enum ieee80211_ht_chanwidth_values - HT channel width values
 * @IEEE80211_HT_CHANWIDTH_20MHZ: value 0, 20 MHz
 * @IEEE80211_HT_CHANWIDTH_ANY: value 1, any width
 *
 * NOTE(review): presumably the values carried in the ht_notify_cw.chanwidth
 * octet of struct ieee80211_mgmt - confirm against the users.
 */
enum ieee80211_ht_chanwidth_values {
        IEEE80211_HT_CHANWIDTH_20MHZ = 0,
        IEEE80211_HT_CHANWIDTH_ANY = 1,
};

/**
 * enum ieee80211_vht_opmode_bits - VHT operating mode field bits
 * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK: channel width mask
 * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: 20 MHz channel width
 * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: 40 MHz channel width
 * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: 80 MHz channel width
 * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: 160 MHz or 80+80 MHz channel width
 * @IEEE80211_OPMODE_NOTIF_BW_160_80P80: 160 / 80+80 MHz indicator flag
 * @IEEE80211_OPMODE_NOTIF_RX_NSS_MASK: number of spatial streams mask
 *        (the NSS value is the value of this field + 1)
 * @IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT: number of spatial streams shift
 * @IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF: indicates streams in SU-MIMO PPDU
 *        using a beamforming steering matrix
 *
 * This enum mixes three kinds of constants: bit masks and flags
 * (``*_MASK``, ``BW_160_80P80``, ``RX_NSS_TYPE_BF``), field values to
 * compare against the masked channel width (``CHANWIDTH_*MHZ``), and a
 * shift count (``RX_NSS_SHIFT``, the position of the lowest bit of
 * ``RX_NSS_MASK``).
 */
enum ieee80211_vht_opmode_bits {
        IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK        = 0x03,
        IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ        = 0,
        IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ        = 1,
        IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ        = 2,
        IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ        = 3,
        IEEE80211_OPMODE_NOTIF_BW_160_80P80        = 0x04,
        IEEE80211_OPMODE_NOTIF_RX_NSS_MASK        = 0x70,
        IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT        = 4,
        IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF        = 0x80,
};

/**
 * enum ieee80211_s1g_chanwidth - S1G channel widths
 * These are defined in IEEE802.11-2016ah Table 10-20
 * as BSS Channel Width
 *
 * @IEEE80211_S1G_CHANWIDTH_1MHZ: 1MHz operating channel
 * @IEEE80211_S1G_CHANWIDTH_2MHZ: 2MHz operating channel
 * @IEEE80211_S1G_CHANWIDTH_4MHZ: 4MHz operating channel
 * @IEEE80211_S1G_CHANWIDTH_8MHZ: 8MHz operating channel
 * @IEEE80211_S1G_CHANWIDTH_16MHZ: 16MHz operating channel
 *
 * Each enumerator's numeric value is the channel width in MHz minus one.
 */
enum ieee80211_s1g_chanwidth {
        IEEE80211_S1G_CHANWIDTH_1MHZ = 0,
        IEEE80211_S1G_CHANWIDTH_2MHZ = 1,
        IEEE80211_S1G_CHANWIDTH_4MHZ = 3,
        IEEE80211_S1G_CHANWIDTH_8MHZ = 7,
        IEEE80211_S1G_CHANWIDTH_16MHZ = 15,
};

/*
 * Array lengths, in octets, of fixed-size action frame fields in
 * struct ieee80211_mgmt: sa_query.trans_id, vht_group_notif.membership
 * and vht_group_notif.position respectively.
 */
#define WLAN_SA_QUERY_TR_ID_LEN 2
#define WLAN_MEMBERSHIP_LEN 8
#define WLAN_USER_POSITION_LEN 16

/**
 * struct ieee80211_tpc_report_ie - TPC Report element
 * @tx_power: Transmit Power
 * @link_margin: Link Margin
 *
 * This structure represents the payload of the "TPC Report element" as
 * described in IEEE Std 802.11-2020 section 9.4.2.16.
 *
 * The payload is two single-octet fields. It is also embedded directly
 * in the tpc_report action frame body of struct ieee80211_mgmt.
 */
struct ieee80211_tpc_report_ie {
        u8 tx_power;
        u8 link_margin;
} __packed;

/*
 * ADDBA extension bit definitions: bit 0 is the no-fragmentation flag,
 * bits 1-2 the fragmentation level and bits 5-7 the buffer size part.
 *
 * IEEE80211_ADDBA_EXT_FRAG_LEVEL_SHIFT is the position of the lowest bit
 * of its mask. IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT (10) is NOT the bit
 * offset of IEEE80211_ADDBA_EXT_BUF_SIZE_MASK (which starts at bit 5).
 * NOTE(review): presumably the shift is applied to the already extracted
 * field to place it above a 10-bit buffer size from the ADDBA parameter
 * set - confirm against the users before "fixing" it.
 */
#define IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK        GENMASK(2, 1)
#define IEEE80211_ADDBA_EXT_FRAG_LEVEL_SHIFT        1
#define IEEE80211_ADDBA_EXT_NO_FRAG                BIT(0)
#define IEEE80211_ADDBA_EXT_BUF_SIZE_MASK        GENMASK(7, 5)
#define IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT        10

/**
 * struct ieee80211_addba_ext_ie - ADDBA extension element payload
 * @data: the single payload octet (NOTE(review): presumably decoded with
 *        the IEEE80211_ADDBA_EXT_* masks - confirm against the users)
 */
struct ieee80211_addba_ext_ie {
        u8 data;
} __packed;

/**
 * struct ieee80211_s1g_bcn_compat_ie - S1G Beacon Compatibility element
 * @compat_info: Compatibility Information (little-endian)
 * @beacon_int: Beacon Interval (little-endian)
 * @tsf_completion: TSF Completion (little-endian)
 *
 * This structure represents the payload of the "S1G Beacon
 * Compatibility element" as described in IEEE Std 802.11-2020 section
 * 9.4.2.196.
 *
 * The layout is packed: two 16-bit fields followed by one 32-bit field,
 * eight octets in total.
 */
struct ieee80211_s1g_bcn_compat_ie {
        __le16 compat_info;
        __le16 beacon_int;
        __le32 tsf_completion;
} __packed;

/**
 * struct ieee80211_s1g_oper_ie - S1G Operation element
 * @ch_width: S1G Operation Information Channel Width
 * @oper_class: S1G Operation Information Operating Class
 * @primary_ch: S1G Operation Information Primary Channel Number
 * @oper_ch: S1G Operation Information Channel Center Frequency
 * @basic_mcs_nss: Basic S1G-MCS and NSS Set (little-endian)
 *
 * This structure represents the payload of the "S1G Operation
 * element" as described in IEEE Std 802.11-2020 section 9.4.2.212.
 *
 * The layout is packed: four single octets followed by one 16-bit
 * field, six octets in total.
 */
struct ieee80211_s1g_oper_ie {
        u8 ch_width;
        u8 oper_class;
        u8 primary_ch;
        u8 oper_ch;
        __le16 basic_mcs_nss;
} __packed;

/**
 * struct ieee80211_aid_response_ie - AID Response element
 * @aid: AID/Group AID (little-endian)
 * @switch_count: AID Switch Count
 * @response_int: AID Response Interval (little-endian)
 *
 * This structure represents the payload of the "AID Response element"
 * as described in IEEE Std 802.11-2020 section 9.4.2.194.
 *
 * The layout is packed, five octets in total; @response_int therefore
 * sits at the odd offset 3 and is not naturally aligned.
 */
struct ieee80211_aid_response_ie {
        __le16 aid;
        u8 switch_count;
        __le16 response_int;
} __packed;

/**
 * struct ieee80211_s1g_cap - S1G capabilities
 * @capab_info: capability information, 10 octets
 * @supp_mcs_nss: supported MCS and NSS set, 5 octets
 *
 * Packed, 15 octets in total. Both members are raw octet arrays; no
 * bit-level layout is defined here.
 */
struct ieee80211_s1g_cap {
        u8 capab_info[10];
        u8 supp_mcs_nss[5];
} __packed;

/*
 * struct ieee80211_ext - frame layout with S1G beacon bodies
 *
 * frame_control and duration are followed by one of two overlaid bodies.
 * u.s1g_beacon and u.s1g_short_beacon start with the same sa, timestamp
 * and change_seq fields; the short variant additionally carries a
 * 3-octet next_tbtt before the variable-length tail.
 *
 * The variable-length tails are flexible array members (not zero-length
 * arrays), so sizeof() of each body covers the fixed fields only.
 *
 * Multi-byte fields are little-endian; the layout is packed and the
 * struct is aligned to 2 bytes.
 */
struct ieee80211_ext {
        __le16 frame_control;
        __le16 duration;
        union {
                struct {
                        u8 sa[ETH_ALEN];
                        __le32 timestamp;
                        u8 change_seq;
                        u8 variable[];
                } __packed s1g_beacon;
                struct {
                        u8 sa[ETH_ALEN];
                        __le32 timestamp;
                        u8 change_seq;
                        u8 next_tbtt[3];
                        u8 variable[];
                } __packed s1g_short_beacon;
        } u;
} __packed __aligned(2);

/*
 * TWT control bits. All defined bits fit in one octet; bit 2 has no
 * name here.
 * NOTE(review): presumably applied to ieee80211_twt_setup.control - confirm.
 */
#define IEEE80211_TWT_CONTROL_NDP                        BIT(0)
#define IEEE80211_TWT_CONTROL_RESP_MODE                        BIT(1)
#define IEEE80211_TWT_CONTROL_NEG_TYPE_BROADCAST        BIT(3)
#define IEEE80211_TWT_CONTROL_RX_DISABLED                BIT(4)
#define IEEE80211_TWT_CONTROL_WAKE_DUR_UNIT                BIT(5)

/*
 * TWT request type bits. Together these cover all 16 bits (0 to 15)
 * without gaps or overlap.
 * NOTE(review): presumably applied to ieee80211_twt_params.req_type after
 * conversion from little-endian - confirm.
 */
#define IEEE80211_TWT_REQTYPE_REQUEST                        BIT(0)
#define IEEE80211_TWT_REQTYPE_SETUP_CMD                        GENMASK(3, 1)
#define IEEE80211_TWT_REQTYPE_TRIGGER                        BIT(4)
#define IEEE80211_TWT_REQTYPE_IMPLICIT                        BIT(5)
#define IEEE80211_TWT_REQTYPE_FLOWTYPE                        BIT(6)
#define IEEE80211_TWT_REQTYPE_FLOWID                        GENMASK(9, 7)
#define IEEE80211_TWT_REQTYPE_WAKE_INT_EXP                GENMASK(14, 10)
#define IEEE80211_TWT_REQTYPE_PROTECTION                BIT(15)

/*
 * TWT setup commands, numbered implicitly from 0 (TWT_SETUP_CMD_REQUEST)
 * to 7 (TWT_SETUP_CMD_REJECT). The order is therefore significant and
 * must not be changed.
 * NOTE(review): presumably the encoding of the 3-bit
 * IEEE80211_TWT_REQTYPE_SETUP_CMD field - confirm.
 */
enum ieee80211_twt_setup_cmd {
        TWT_SETUP_CMD_REQUEST,
        TWT_SETUP_CMD_SUGGEST,
        TWT_SETUP_CMD_DEMAND,
        TWT_SETUP_CMD_GROUPING,
        TWT_SETUP_CMD_ACCEPT,
        TWT_SETUP_CMD_ALTERNATE,
        TWT_SETUP_CMD_DICTATE,
        TWT_SETUP_CMD_REJECT,
};

/**
 * struct ieee80211_twt_params - TWT parameters
 * @req_type: request type, little-endian (NOTE(review): presumably decoded
 *        with the IEEE80211_TWT_REQTYPE_* masks - confirm)
 * @twt: 64-bit little-endian value (NOTE(review): presumably the target
 *        wake time - confirm)
 * @min_twt_dur: minimum TWT duration octet
 * @mantissa: 16-bit little-endian mantissa
 * @channel: channel octet
 *
 * Packed, 14 octets in total; @twt and @mantissa are not naturally aligned.
 */
struct ieee80211_twt_params {
        __le16 req_type;
        __le64 twt;
        u8 min_twt_dur;
        __le16 mantissa;
        u8 channel;
} __packed;

/**
 * struct ieee80211_twt_setup - TWT setup frame body
 * @dialog_token: dialog token octet
 * @element_id: element ID octet
 * @length: element length octet
 * @control: control octet (NOTE(review): presumably decoded with the
 *        IEEE80211_TWT_CONTROL_* bits - confirm)
 * @params: variable-length parameters; a flexible array, so sizeof() of
 *        this struct covers only the four fixed octets
 *        (NOTE(review): presumably holds a struct ieee80211_twt_params -
 *        confirm)
 */
struct ieee80211_twt_setup {
        u8 dialog_token;
        u8 element_id;
        u8 length;
        u8 control;
        u8 params[];
} __packed;

/*
 * TID-To-Link Mapping (TTLM) constants.
 *
 * The IEEE80211_TTLM_CONTROL_* values are masks within one octet:
 * bits 0-1 direction, bit 2 default link mapping, bit 3 switch time
 * present, bit 4 expected duration present, bit 5 link map size.
 * NOTE(review): presumably applied to ieee80211_ttlm_elem.control - confirm.
 */
#define IEEE80211_TTLM_MAX_CNT                                2
#define IEEE80211_TTLM_CONTROL_DIRECTION                0x03
#define IEEE80211_TTLM_CONTROL_DEF_LINK_MAP                0x04
#define IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT        0x08
#define IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT        0x10
#define IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE                0x20

/* Values of the two-bit IEEE80211_TTLM_CONTROL_DIRECTION field. */
#define IEEE80211_TTLM_DIRECTION_DOWN                0
#define IEEE80211_TTLM_DIRECTION_UP                1
#define IEEE80211_TTLM_DIRECTION_BOTH                2

/**
 * struct ieee80211_ttlm_elem - TID-To-Link Mapping element
 *
 * Defined in section 9.4.2.314 in P802.11be_D4
 *
 * @control: the first part of control field
 * @optional: the second part of control field; a flexible array, so
 *        sizeof() of this struct covers only the @control octet
 */
struct ieee80211_ttlm_elem {
        u8 control;
        u8 optional[];
} __packed;

/**
 * struct ieee80211_bss_load_elem - BSS Load element
 *
 * Defined in section 9.4.2.26 in IEEE 802.11-REVme D4.1
 *
 * @sta_count: total number of STAs currently associated with the AP
 *        (little-endian).
 * @channel_util: Percentage of time that the access point sensed the channel
 *        was busy. This value is in range [0, 255], the highest value means
 *        100% busy.
 * @avail_admission_capa: remaining amount of medium time used for admission
 *        control (little-endian).
 *
 * Packed, five octets in total; @avail_admission_capa sits at the odd
 * offset 3 and is not naturally aligned.
 */
struct ieee80211_bss_load_elem {
        __le16 sta_count;
        u8 channel_util;
        __le16 avail_admission_capa;
} __packed;

/*
 * struct ieee80211_mgmt - management frame layout
 *
 * The fixed header (frame_control, duration, da, sa, bssid, seq_ctrl) is
 * followed by the union u, which overlays the fixed part of each frame
 * body. Bodies that can be followed by further data end in a flexible
 * array member named "variable"; u.body gives untyped access to the
 * whole body.
 *
 * u.action starts with a one-octet category, followed by the nested
 * union u.action.u with one layout per action frame format.
 *
 * Multi-byte fields declared __le16/__le64 are little-endian. The layout
 * is packed and the struct is aligned to 2 bytes.
 */
struct ieee80211_mgmt {
        __le16 frame_control;
        __le16 duration;
        u8 da[ETH_ALEN];
        u8 sa[ETH_ALEN];
        u8 bssid[ETH_ALEN];
        __le16 seq_ctrl;
        union {
                struct {
                        __le16 auth_alg;
                        __le16 auth_transaction;
                        __le16 status_code;
                        /* possibly followed by Challenge text */
                        u8 variable[];
                } __packed auth;
                struct {
                        __le16 reason_code;
                } __packed deauth;
                struct {
                        __le16 capab_info;
                        __le16 listen_interval;
                        /* followed by SSID and Supported rates */
                        u8 variable[];
                } __packed assoc_req;
                struct {
                        __le16 capab_info;
                        __le16 status_code;
                        __le16 aid;
                        /* followed by Supported rates */
                        u8 variable[];
                } __packed assoc_resp, reassoc_resp;
                struct {
                        __le16 capab_info;
                        __le16 status_code;
                        u8 variable[];
                } __packed s1g_assoc_resp, s1g_reassoc_resp;
                struct {
                        __le16 capab_info;
                        __le16 listen_interval;
                        u8 current_ap[ETH_ALEN];
                        /* followed by SSID and Supported rates */
                        u8 variable[];
                } __packed reassoc_req;
                struct {
                        __le16 reason_code;
                } __packed disassoc;
                struct {
                        __le64 timestamp;
                        __le16 beacon_int;
                        __le16 capab_info;
                        /* followed by some of SSID, Supported rates,
                         * FH Params, DS Params, CF Params, IBSS Params, TIM */
                        u8 variable[];
                } __packed beacon;
                struct {
                        /* only variable items: SSID, Supported rates */
                        DECLARE_FLEX_ARRAY(u8, variable);
                } __packed probe_req;
                struct {
                        __le64 timestamp;
                        __le16 beacon_int;
                        __le16 capab_info;
                        /* followed by some of SSID, Supported rates,
                         * FH Params, DS Params, CF Params, IBSS Params */
                        u8 variable[];
                } __packed probe_resp;
                struct {
                        u8 category;
                        union {
                                struct {
                                        u8 action_code;
                                        u8 dialog_token;
                                        u8 status_code;
                                        u8 variable[];
                                } __packed wme_action;
                                struct {
                                        u8 action_code;
                                        u8 variable[];
                                } __packed chan_switch;
                                struct {
                                        u8 action_code;
                                        struct ieee80211_ext_chansw_ie data;
                                        u8 variable[];
                                } __packed ext_chan_switch;
                                struct {
                                        u8 action_code;
                                        u8 dialog_token;
                                        u8 element_id;
                                        u8 length;
                                        struct ieee80211_msrment_ie msr_elem;
                                } __packed measurement;
                                struct {
                                        u8 action_code;
                                        u8 dialog_token;
                                        __le16 capab;
                                        __le16 timeout;
                                        __le16 start_seq_num;
                                        /* followed by BA Extension */
                                        u8 variable[];
                                } __packed addba_req;
                                struct {
                                        u8 action_code;
                                        u8 dialog_token;
                                        __le16 status;
                                        __le16 capab;
                                        __le16 timeout;
                                } __packed addba_resp;
                                struct {
                                        u8 action_code;
                                        __le16 params;
                                        __le16 reason_code;
                                } __packed delba;
                                struct {
                                        u8 action_code;
                                        u8 variable[];
                                } __packed self_prot;
                                struct {
                                        u8 action_code;
                                        u8 variable[];
                                } __packed mesh_action;
                                struct {
                                        u8 action;
                                        u8 trans_id[WLAN_SA_QUERY_TR_ID_LEN];
                                } __packed sa_query;
                                struct {
                                        u8 action;
                                        u8 smps_control;
                                } __packed ht_smps;
                                struct {
                                        u8 action_code;
                                        u8 chanwidth;
                                } __packed ht_notify_cw;
                                struct {
                                        u8 action_code;
                                        u8 dialog_token;
                                        __le16 capability;
                                        /* flexible array, like every other "variable" tail here */
                                        u8 variable[];
                                } __packed tdls_discover_resp;
                                struct {
                                        u8 action_code;
                                        u8 operating_mode;
                                } __packed vht_opmode_notif;
                                struct {
                                        u8 action_code;
                                        u8 membership[WLAN_MEMBERSHIP_LEN];
                                        u8 position[WLAN_USER_POSITION_LEN];
                                } __packed vht_group_notif;
                                struct {
                                        u8 action_code;
                                        u8 dialog_token;
                                        u8 tpc_elem_id;
                                        u8 tpc_elem_length;
                                        struct ieee80211_tpc_report_ie tpc;
                                } __packed tpc_report;
                                struct {
                                        u8 action_code;
                                        u8 dialog_token;
                                        u8 follow_up;
                                        u8 tod[6];
                                        u8 toa[6];
                                        __le16 tod_error;
                                        __le16 toa_error;
                                        u8 variable[];
                                } __packed ftm;
                                struct {
                                        u8 action_code;
                                        u8 variable[];
                                } __packed s1g;
                                struct {
                                        u8 action_code;
                                        u8 dialog_token;
                                        u8 follow_up;
                                        /*
                                         * NOTE(review): tod/toa are plain u32,
                                         * unlike the __le* fields elsewhere in
                                         * this struct - confirm that users
                                         * handle byte order explicitly.
                                         */
                                        u32 tod;
                                        u32 toa;
                                        u8 max_tod_error;
                                        u8 max_toa_error;
                                } __packed wnm_timing_msr;
                                struct {
                                        u8 action_code;
                                        u8 dialog_token;
                                        u8 variable[];
                                } __packed ttlm_req;
                                struct {
                                        u8 action_code;
                                        u8 dialog_token;
                                        u8 status_code;
                                        u8 variable[];
                                } __packed ttlm_res;
                                struct {
                                        u8 action_code;
                                } __packed ttlm_tear_down;
                        } u;
                } __packed action;
                DECLARE_FLEX_ARRAY(u8, body); /* Generic frame body */
        } u;
} __packed __aligned(2);

/*
 * Supported rates membership selectors
 *
 * One value per PHY or feature named in the macro suffix; the values
 * defined here are contiguous, counting down from 127 to 121.
 */
#define BSS_MEMBERSHIP_SELECTOR_HT_PHY        127
#define BSS_MEMBERSHIP_SELECTOR_VHT_PHY        126
#define BSS_MEMBERSHIP_SELECTOR_GLK        125
#define BSS_MEMBERSHIP_SELECTOR_EPS        124
#define BSS_MEMBERSHIP_SELECTOR_SAE_H2E 123
#define BSS_MEMBERSHIP_SELECTOR_HE_PHY        122
#define BSS_MEMBERSHIP_SELECTOR_EHT_PHY        121

/*
 * mgmt header + 1 byte category code
 *
 * Computed as the offset of the per-action union u.action.u, i.e. the
 * number of octets up to and including u.action.category.
 */
#define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u)


/*
 * Management MIC information element (IEEE 802.11w)
 *
 * Unlike the payload-only *_ie structs in this file, this layout includes
 * the element_id and length octets. key_id is little-endian,
 * sequence_number is 6 octets and mic is 8 octets. Packed, 18 octets in
 * total.
 */
struct ieee80211_mmie {
        u8 element_id;
        u8 length;
        __le16 key_id;
        u8 sequence_number[6];
        u8 mic[8];
} __packed;

/*
 * Management MIC information element (IEEE 802.11w) for GMAC and CMAC-256
 *
 * Same layout as struct ieee80211_mmie except that mic is 16 octets
 * instead of 8. Packed, 26 octets in total.
 */
struct ieee80211_mmie_16 {
        u8 element_id;
        u8 length;
        __le16 key_id;
        u8 sequence_number[6];
        u8 mic[16];
} __packed;

/**
 * struct ieee80211_vendor_ie - fixed start of a vendor-specific element
 * @element_id: element ID octet
 * @len: element length octet
 * @oui: 3-octet OUI
 * @oui_type: OUI type octet
 *
 * Packed, six octets. Any vendor-defined data after @oui_type is not
 * described by this struct.
 */
struct ieee80211_vendor_ie {
        u8 element_id;
        u8 len;
        u8 oui[3];
        u8 oui_type;
} __packed;

/*
 * WMM parameters of one access category; four of these form the ac[]
 * array of struct ieee80211_wmm_param_ie. Packed, four octets, with
 * txop_limit little-endian.
 */
struct ieee80211_wmm_ac_param {
        u8 aci_aifsn; /* AIFSN, ACM, ACI */
        u8 cw; /* ECWmin, ECWmax (CW = 2^ECW - 1) */
        __le16 txop_limit;
} __packed;

/*
 * WMM parameter element, including the element_id and len octets.
 *
 * Packed, 26 octets in total: 2 header octets plus 24 octets of content,
 * which matches the "Length: 24" noted on the len field.
 */
struct ieee80211_wmm_param_ie {
        u8 element_id; /* Element ID: 221 (0xdd); */
        u8 len; /* Length: 24 */
        /* required fields for WMM version 1 */
        u8 oui[3]; /* 00:50:f2 */
        u8 oui_type; /* 2 */
        u8 oui_subtype; /* 1 */
        u8 version; /* 1 for WMM version 1.0 */
        u8 qos_info; /* AP/STA specific QoS info */
        u8 reserved; /* 0 */
        /* AC_BE, AC_BK, AC_VI, AC_VO */
        struct ieee80211_wmm_ac_param ac[4];
} __packed;

/* Control frames */

/*
 * RTS frame: frame control and duration (both little-endian) followed by
 * the receiver address (ra) and transmitter address (ta), each ETH_ALEN
 * octets. Packed, aligned to 2 bytes.
 */
struct ieee80211_rts {
        __le16 frame_control;
        __le16 duration;
        u8 ra[ETH_ALEN];
        u8 ta[ETH_ALEN];
} __packed __aligned(2);

/*
 * CTS frame: frame control and duration (both little-endian) followed by
 * the receiver address (ra) only; unlike struct ieee80211_rts there is no
 * transmitter address. Packed, aligned to 2 bytes.
 */
struct ieee80211_cts {
        __le16 frame_control;
        __le16 duration;
        u8 ra[ETH_ALEN];
} __packed __aligned(2);

/*
 * PS-Poll frame: frame control, then a little-endian aid in the position
 * where the RTS/CTS layouts carry the duration, followed by bssid and the
 * transmitter address (ta), each ETH_ALEN octets. Packed, aligned to
 * 2 bytes.
 */
struct ieee80211_pspoll {
        __le16 frame_control;
        __le16 aid;
        u8 bssid[ETH_ALEN];
        u8 ta[ETH_ALEN];
} __packed __aligned(2);

/* TDLS */

/*
 * Channel switch timing: two little-endian 16-bit fields, switch_time
 * followed by switch_timeout. Packed, four octets in total.
 */
struct ieee80211_ch_switch_timing {
        __le16 switch_time;
        __le16 switch_timeout;
} __packed;

/*
 * Link-id information element
 *
 * Includes the element header (ie_type, ie_len), followed by three
 * ETH_ALEN-octet addresses: bssid, init_sta and resp_sta.
 * NOTE(review): init_sta/resp_sta presumably mean the TDLS initiator and
 * responder STA addresses - confirm.
 */
struct ieee80211_tdls_lnkie {
        u8 ie_type; /* Link Identifier IE */
        u8 ie_len;
        u8 bssid[ETH_ALEN];
        u8 init_sta[ETH_ALEN];
        u8 resp_sta[ETH_ALEN];
} __packed;

/*
 * struct ieee80211_tdls_data - TDLS frame carried as data
 *
 * Starts with destination and source addresses and a big-endian
 * ether_type, followed by payload_type, category and action_code octets.
 * The union u then overlays the fixed fields of each frame format; the
 * bytes that follow are reached through each format's "variable"
 * flexible array member.
 *
 * Fields declared __le16 are little-endian; ether_type alone is
 * big-endian. The layout is packed.
 */
struct ieee80211_tdls_data {
        u8 da[ETH_ALEN];
        u8 sa[ETH_ALEN];
        __be16 ether_type;
        u8 payload_type;
        u8 category;
        u8 action_code;
        union {
                struct {
                        u8 dialog_token;
                        __le16 capability;
                        u8 variable[];
                } __packed setup_req;
                struct {
                        __le16 status_code;
                        u8 dialog_token;
                        __le16 capability;
                        u8 variable[];
                } __packed setup_resp;
                struct {
                        __le16 status_code;
                        u8 dialog_token;
                        u8 variable[];
                } __packed setup_cfm;
                struct {
                        __le16 reason_code;
                        u8 variable[];
                } __packed teardown;
                struct {
                        u8 dialog_token;
                        u8 variable[];
                } __packed discover_req;
                struct {
                        u8 target_channel;
                        u8 oper_class;
                        u8 variable[];
                } __packed chan_switch_req;
                struct {
                        __le16 status_code;
                        u8 variable[];
                } __packed chan_switch_resp;
        } u;
} __packed;

/*
 * Peer-to-Peer IE attribute related definitions.
 */
/*
 * enum ieee80211_p2p_attr_id - identifies type of peer-to-peer attribute.
 *
 * The first 19 enumerators are numbered implicitly from 0
 * (IEEE80211_P2P_ATTR_STATUS) to 18 (IEEE80211_P2P_ATTR_INVITE_FLAGS), so
 * their order is significant. IEEE80211_P2P_ATTR_VENDOR_SPECIFIC is pinned
 * to 221, which makes IEEE80211_P2P_ATTR_MAX equal to 222.
 */
enum ieee80211_p2p_attr_id {
        IEEE80211_P2P_ATTR_STATUS = 0,
        IEEE80211_P2P_ATTR_MINOR_REASON,
        IEEE80211_P2P_ATTR_CAPABILITY,
        IEEE80211_P2P_ATTR_DEVICE_ID,
        IEEE80211_P2P_ATTR_GO_INTENT,
        IEEE80211_P2P_ATTR_GO_CONFIG_TIMEOUT,
        IEEE80211_P2P_ATTR_LISTEN_CHANNEL,
        IEEE80211_P2P_ATTR_GROUP_BSSID,
        IEEE80211_P2P_ATTR_EXT_LISTEN_TIMING,
        IEEE80211_P2P_ATTR_INTENDED_IFACE_ADDR,
        IEEE80211_P2P_ATTR_MANAGABILITY,
        IEEE80211_P2P_ATTR_CHANNEL_LIST,
        IEEE80211_P2P_ATTR_ABSENCE_NOTICE,
        IEEE80211_P2P_ATTR_DEVICE_INFO,
        IEEE80211_P2P_ATTR_GROUP_INFO,
        IEEE80211_P2P_ATTR_GROUP_ID,
        IEEE80211_P2P_ATTR_INTERFACE,
        IEEE80211_P2P_ATTR_OPER_CHANNEL,
        IEEE80211_P2P_ATTR_INVITE_FLAGS,
        /* 19 - 220: Reserved */
        IEEE80211_P2P_ATTR_VENDOR_SPECIFIC = 221,

        IEEE80211_P2P_ATTR_MAX
};

/* Notice of Absence attribute - described in P2P spec 4.1.14 */
/*
 * Typical max value used here; this is the number of descriptor slots
 * in the desc[] array of struct ieee80211_p2p_noa_attr.
 */
#define IEEE80211_P2P_NOA_DESC_MAX        4

/*
 * One Notice of Absence descriptor: a one-octet count followed by three
 * little-endian 32-bit values (duration, interval, start time), packed
 * into 13 bytes.
 * NOTE(review): time units are not visible here - confirm against the
 * P2P specification.
 */
struct ieee80211_p2p_noa_desc {
        u8 count;
        __le32 duration;
        __le32 interval;
        __le32 start_time;
} __packed;

/*
 * Notice of Absence attribute body: index, combined OppPS/CTWindow
 * octet, then room for IEEE80211_P2P_NOA_DESC_MAX descriptors.
 */
struct ieee80211_p2p_noa_attr {
        u8 index;
        u8 oppps_ctwindow;
        struct ieee80211_p2p_noa_desc desc[IEEE80211_P2P_NOA_DESC_MAX];
} __packed;

/*
 * Split of a combined OppPS/CTWindow octet: bit 7 is the enable flag,
 * bits 0-6 hold the CT window.
 * NOTE(review): presumably applied to oppps_ctwindow above - confirm.
 */
#define IEEE80211_P2P_OPPPS_ENABLE_BIT                BIT(7)
#define IEEE80211_P2P_OPPPS_CTWINDOW_MASK        0x7F

/**
 * struct ieee80211_bar - Block Ack Request frame format
 * @frame_control: Frame Control
 * @duration: Duration
 * @ra: RA
 * @ta: TA
 * @control: BAR Control
 * @start_seq_num: Starting Sequence Number (see Figure 9-37)
 *
 * This structure represents the "BlockAckReq frame format"
 * as described in IEEE Std 802.11-2020 section 9.3.1.7.
 */
struct ieee80211_bar {
        __le16 frame_control;
        __le16 duration;
        __u8 ra[ETH_ALEN];
        __u8 ta[ETH_ALEN];
        __le16 control;
        __le16 start_seq_num;
} __packed;

/*
 * 802.11 BAR control masks
 *
 * The TID info occupies the top four bits (12-15) of the 16-bit value,
 * hence mask 0xf000 and shift 12.
 */
#define IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL        0x0000
#define IEEE80211_BAR_CTRL_MULTI_TID                0x0002
#define IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA        0x0004
#define IEEE80211_BAR_CTRL_TID_INFO_MASK        0xf000
#define IEEE80211_BAR_CTRL_TID_INFO_SHIFT        12

/* Length in bytes of the rx_mask array in struct ieee80211_mcs_info */
#define IEEE80211_HT_MCS_MASK_LEN                10

/**
 * struct ieee80211_mcs_info - Supported MCS Set field
 * @rx_mask: RX mask
 * @rx_highest: highest supported RX rate. If set represents
 *        the highest supported RX data rate in units of 1 Mbps.
 *        If this field is 0 this value should not be used to
 *        consider the highest RX data rate supported.
 * @tx_params: TX parameters
 * @reserved: Reserved bits
 *
 * This structure represents the "Supported MCS Set field" as
 * described in IEEE Std 802.11-2020 section 9.4.2.55.4.
 *
 * The packed size is 16 bytes: IEEE80211_HT_MCS_MASK_LEN (10) bytes of
 * RX mask, 2 bytes of rx_highest, 1 byte of tx_params and 3 reserved
 * bytes.
 */
struct ieee80211_mcs_info {
        u8 rx_mask[IEEE80211_HT_MCS_MASK_LEN];
        __le16 rx_highest;
        u8 tx_params;
        u8 reserved[3];
} __packed;

/* 802.11n HT capability MCS set */
#define IEEE80211_HT_MCS_RX_HIGHEST_MASK        0x3ff
#define IEEE80211_HT_MCS_TX_DEFINED                0x01
#define IEEE80211_HT_MCS_TX_RX_DIFF                0x02
/* value 0 == 1 stream etc */
#define IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK        0x0C
#define IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT        2
#define                IEEE80211_HT_MCS_TX_MAX_STREAMS        4
#define IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION        0x10

/*
 * Number of chains for an MCS index: 1 for index 32, otherwise
 * 1 + index / 8. The argument is evaluated twice, so do not pass an
 * expression with side effects.
 */
#define IEEE80211_HT_MCS_CHAINS(mcs) ((mcs) == 32 ? 1 : (1 + ((mcs) >> 3)))

/*
 * 802.11n D5.0 20.3.5 / 20.6 says:
 * - indices 0 to 7 and 32 are single spatial stream
 * - 8 to 31 are multiple spatial streams using equal modulation
 *   [8..15 for two streams, 16..23 for three and 24..31 for four]
 * - remainder are multiple spatial streams using unequal modulation
 *
 * The _START_BYTE variant is the index of the byte that contains bit
 * 33 when the MCS indices are laid out as a bitmap (33 / 8 == 4).
 */
#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START 33
#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE \
        (IEEE80211_HT_MCS_UNEQUAL_MODULATION_START / 8)

/**
 * struct ieee80211_ht_cap - HT capabilities element
 * @cap_info: HT Capability Information
 * @ampdu_params_info: A-MPDU Parameters
 * @mcs: Supported MCS Set
 * @extended_ht_cap_info: HT Extended Capabilities
 * @tx_BF_cap_info: Transmit Beamforming Capabilities
 * @antenna_selection_info: ASEL Capability
 *
 * This structure represents the payload of the "HT Capabilities
 * element" as described in IEEE Std 802.11-2020 section 9.4.2.55.
 *
 * The packed size is 26 bytes (2 + 1 + 16 + 2 + 4 + 1).
 */
struct ieee80211_ht_cap {
        __le16 cap_info;
        u8 ampdu_params_info;

        /* 16 bytes MCS information */
        struct ieee80211_mcs_info mcs;

        __le16 extended_ht_cap_info;
        __le32 tx_BF_cap_info;
        u8 antenna_selection_info;
} __packed;

/*
 * 802.11n HT capabilities masks (for cap_info)
 *
 * Multi-bit fields come with a matching _SHIFT: SM_PS is a two-bit
 * field at bits 2-3 and RX_STBC a two-bit field at bits 8-9. All other
 * entries are single-bit flags; together the masks cover all 16 bits.
 */
#define IEEE80211_HT_CAP_LDPC_CODING                0x0001
#define IEEE80211_HT_CAP_SUP_WIDTH_20_40        0x0002
#define IEEE80211_HT_CAP_SM_PS                        0x000C
#define                IEEE80211_HT_CAP_SM_PS_SHIFT        2
#define IEEE80211_HT_CAP_GRN_FLD                0x0010
#define IEEE80211_HT_CAP_SGI_20                        0x0020
#define IEEE80211_HT_CAP_SGI_40                        0x0040
#define IEEE80211_HT_CAP_TX_STBC                0x0080
#define IEEE80211_HT_CAP_RX_STBC                0x0300
#define                IEEE80211_HT_CAP_RX_STBC_SHIFT        8
#define IEEE80211_HT_CAP_DELAY_BA                0x0400
#define IEEE80211_HT_CAP_MAX_AMSDU                0x0800
#define IEEE80211_HT_CAP_DSSSCCK40                0x1000
#define IEEE80211_HT_CAP_RESERVED                0x2000
#define IEEE80211_HT_CAP_40MHZ_INTOLERANT        0x4000
#define IEEE80211_HT_CAP_LSIG_TXOP_PROT                0x8000

/*
 * 802.11n HT extended capabilities masks (for extended_ht_cap_info)
 *
 * PCO_TIME is a two-bit field at bits 1-2, MCS_FB a two-bit field at
 * bits 8-9.
 */
#define IEEE80211_HT_EXT_CAP_PCO                0x0001
#define IEEE80211_HT_EXT_CAP_PCO_TIME                0x0006
#define                IEEE80211_HT_EXT_CAP_PCO_TIME_SHIFT        1
#define IEEE80211_HT_EXT_CAP_MCS_FB                0x0300
#define                IEEE80211_HT_EXT_CAP_MCS_FB_SHIFT        8
#define IEEE80211_HT_EXT_CAP_HTC_SUP                0x0400
#define IEEE80211_HT_EXT_CAP_RD_RESPONDER        0x0800

/*
 * 802.11n HT capability AMPDU settings (for ampdu_params_info)
 *
 * FACTOR is the low two bits, DENSITY a three-bit field at bits 2-4.
 */
#define IEEE80211_HT_AMPDU_PARM_FACTOR                0x03
#define IEEE80211_HT_AMPDU_PARM_DENSITY                0x1C
#define                IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT        2

/*
 * Maximum length of AMPDU that the STA can receive in high-throughput (HT).
 * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets)
 *
 * E.g. exponent 0 gives 8191 octets and exponent 3 gives 65535 octets.
 */
enum ieee80211_max_ampdu_length_exp {
        IEEE80211_HT_MAX_AMPDU_8K = 0,
        IEEE80211_HT_MAX_AMPDU_16K = 1,
        IEEE80211_HT_MAX_AMPDU_32K = 2,
        IEEE80211_HT_MAX_AMPDU_64K = 3
};

/*
 * Maximum length of AMPDU that the STA can receive in VHT.
 * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets)
 *
 * Same formula as for HT, with the exponent range extended to 7
 * (1048575 octets).
 */
enum ieee80211_vht_max_ampdu_length_exp {
        IEEE80211_VHT_MAX_AMPDU_8K = 0,
        IEEE80211_VHT_MAX_AMPDU_16K = 1,
        IEEE80211_VHT_MAX_AMPDU_32K = 2,
        IEEE80211_VHT_MAX_AMPDU_64K = 3,
        IEEE80211_VHT_MAX_AMPDU_128K = 4,
        IEEE80211_VHT_MAX_AMPDU_256K = 5,
        IEEE80211_VHT_MAX_AMPDU_512K = 6,
        IEEE80211_VHT_MAX_AMPDU_1024K = 7
};

/* NOTE(review): presumably the constant 13 of the formulas above - confirm */
#define IEEE80211_HT_MAX_AMPDU_FACTOR 13

/*
 * Minimum MPDU start spacing
 *
 * Values 0-7; from 1 upwards each step doubles the spacing, starting
 * at 1/4 usec.
 */
enum ieee80211_min_mpdu_spacing {
        IEEE80211_HT_MPDU_DENSITY_NONE = 0,        /* No restriction */
        IEEE80211_HT_MPDU_DENSITY_0_25 = 1,        /* 1/4 usec */
        IEEE80211_HT_MPDU_DENSITY_0_5 = 2,        /* 1/2 usec */
        IEEE80211_HT_MPDU_DENSITY_1 = 3,        /* 1 usec */
        IEEE80211_HT_MPDU_DENSITY_2 = 4,        /* 2 usec */
        IEEE80211_HT_MPDU_DENSITY_4 = 5,        /* 4 usec */
        IEEE80211_HT_MPDU_DENSITY_8 = 6,        /* 8 usec */
        IEEE80211_HT_MPDU_DENSITY_16 = 7        /* 16 usec */
};

/**
 * struct ieee80211_ht_operation - HT operation IE
 * @primary_chan: Primary Channel
 * @ht_param: HT Operation Information parameters
 * @operation_mode: HT Operation Information operation mode
 * @stbc_param: HT Operation Information STBC params
 * @basic_set: Basic HT-MCS Set
 *
 * This structure represents the payload of the "HT Operation
 * element" as described in IEEE Std 802.11-2020 section 9.4.2.56.
 *
 * The packed size is 22 bytes (1 + 1 + 2 + 2 + 16).
 */
struct ieee80211_ht_operation {
        u8 primary_chan;
        u8 ht_param;
        __le16 operation_mode;
        __le16 stbc_param;
        u8 basic_set[16];
} __packed;

/*
 * for ht_param
 *
 * CHA_SEC_OFFSET is a two-bit field (bits 0-1); the indented entries
 * are the values that field can take.
 */
#define IEEE80211_HT_PARAM_CHA_SEC_OFFSET                0x03
#define                IEEE80211_HT_PARAM_CHA_SEC_NONE                0x00
#define                IEEE80211_HT_PARAM_CHA_SEC_ABOVE        0x01
#define                IEEE80211_HT_PARAM_CHA_SEC_BELOW        0x03
#define IEEE80211_HT_PARAM_CHAN_WIDTH_ANY                0x04
#define IEEE80211_HT_PARAM_RIFS_MODE                        0x08

/*
 * for operation_mode
 *
 * PROTECTION is a two-bit field (bits 0-1) whose values are the
 * indented entries; CCFS2 is an eight-bit field at bits 5-12.
 */
#define IEEE80211_HT_OP_MODE_PROTECTION                        0x0003
#define                IEEE80211_HT_OP_MODE_PROTECTION_NONE                0
#define                IEEE80211_HT_OP_MODE_PROTECTION_NONMEMBER        1
#define                IEEE80211_HT_OP_MODE_PROTECTION_20MHZ                2
#define                IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED        3
#define IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT                0x0004
#define IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT                0x0010
#define IEEE80211_HT_OP_MODE_CCFS2_SHIFT                5
#define IEEE80211_HT_OP_MODE_CCFS2_MASK                        0x1fe0

/* for stbc_param: single-bit flags at bits 6-11 */
#define IEEE80211_HT_STBC_PARAM_DUAL_BEACON                0x0040
#define IEEE80211_HT_STBC_PARAM_DUAL_CTS_PROT                0x0080
#define IEEE80211_HT_STBC_PARAM_STBC_BEACON                0x0100
#define IEEE80211_HT_STBC_PARAM_LSIG_TXOP_FULLPROT        0x0200
#define IEEE80211_HT_STBC_PARAM_PCO_ACTIVE                0x0400
#define IEEE80211_HT_STBC_PARAM_PCO_PHASE                0x0800


/*
 * block-ack parameters
 *
 * ADDBA: A-MSDU is bit 0, policy bit 1, TID bits 2-5 and buffer size
 * bits 6-15.
 * DELBA: initiator is bit 11 and TID bits 12-15.
 */
#define IEEE80211_ADDBA_PARAM_AMSDU_MASK 0x0001
#define IEEE80211_ADDBA_PARAM_POLICY_MASK 0x0002
#define IEEE80211_ADDBA_PARAM_TID_MASK 0x003C
#define IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK 0xFFC0
#define IEEE80211_DELBA_PARAM_TID_MASK 0xF000
#define IEEE80211_DELBA_PARAM_INITIATOR_MASK 0x0800

/*
 * A-MPDU buffer sizes
 * According to HT size varies from 8 to 64 frames
 * HE adds the ability to have up to 256 frames.
 * EHT adds the ability to have up to 1K frames.
 *
 * In decimal the values below are 8, 64, 256 and 1024.
 */
#define IEEE80211_MIN_AMPDU_BUF                0x8
#define IEEE80211_MAX_AMPDU_BUF_HT        0x40
#define IEEE80211_MAX_AMPDU_BUF_HE        0x100
#define IEEE80211_MAX_AMPDU_BUF_EHT        0x400


/*
 * Spatial Multiplexing Power Save Modes (for capability)
 *
 * Note that the encoding differs from the SM power control values
 * further down: here dynamic is 1 and disabled is 3.
 */
#define WLAN_HT_CAP_SM_PS_STATIC        0
#define WLAN_HT_CAP_SM_PS_DYNAMIC        1
#define WLAN_HT_CAP_SM_PS_INVALID        2
#define WLAN_HT_CAP_SM_PS_DISABLED        3

/* for SM power control field lower two bits (the value 2 is not defined) */
#define WLAN_HT_SMPS_CONTROL_DISABLED        0
#define WLAN_HT_SMPS_CONTROL_STATIC        1
#define WLAN_HT_SMPS_CONTROL_DYNAMIC        3

/**
 * struct ieee80211_vht_mcs_info - VHT MCS information
 * @rx_mcs_map: RX MCS map 2 bits for each stream, total 8 streams
 * @rx_highest: Indicates highest long GI VHT PPDU data rate
 *        STA can receive. Rate expressed in units of 1 Mbps.
 *        If this field is 0 this value should not be used to
 *        consider the highest RX data rate supported.
 *        The top 3 bits of this field indicate the Maximum NSTS,total
 *        (a beamformee capability.)
 * @tx_mcs_map: TX MCS map 2 bits for each stream, total 8 streams
 * @tx_highest: Indicates highest long GI VHT PPDU data rate
 *        STA can transmit. Rate expressed in units of 1 Mbps.
 *        If this field is 0 this value should not be used to
 *        consider the highest TX data rate supported.
 *        The top 2 bits of this field are reserved, the
 *        3rd bit from the top indicates VHT Extended NSS BW
 *        Capability.
 */
struct ieee80211_vht_mcs_info {
        __le16 rx_mcs_map;
        __le16 rx_highest;
        __le16 tx_mcs_map;
        __le16 tx_highest;
} __packed;

/* for rx_highest: Maximum NSTS,total lives in bits 13-15 */
#define IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT        13
#define IEEE80211_VHT_MAX_NSTS_TOTAL_MASK        (7 << IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT)

/* for tx_highest: bit 13, i.e. the 3rd bit from the top */
#define IEEE80211_VHT_EXT_NSS_BW_CAPABLE        (1 << 13)

/**
 * enum ieee80211_vht_mcs_support - VHT MCS support definitions
 * @IEEE80211_VHT_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the
 *        number of streams
 * @IEEE80211_VHT_MCS_SUPPORT_0_8: MCSes 0-8 are supported
 * @IEEE80211_VHT_MCS_SUPPORT_0_9: MCSes 0-9 are supported
 * @IEEE80211_VHT_MCS_NOT_SUPPORTED: This number of streams isn't supported
 *
 * These definitions are used in each 2-bit subfield of the @rx_mcs_map
 * and @tx_mcs_map fields of &struct ieee80211_vht_mcs_info, which are
 * both split into 8 subfields by number of streams. These values indicate
 * which MCSes are supported for the number of streams the value appears
 * for.
 *
 * Since "not supported" is 3 (both bits set), a map in which no stream
 * count is supported reads 0xffff.
 */
enum ieee80211_vht_mcs_support {
        IEEE80211_VHT_MCS_SUPPORT_0_7        = 0,
        IEEE80211_VHT_MCS_SUPPORT_0_8        = 1,
        IEEE80211_VHT_MCS_SUPPORT_0_9        = 2,
        IEEE80211_VHT_MCS_NOT_SUPPORTED        = 3,
};

/**
 * struct ieee80211_vht_cap - VHT capabilities
 *
 * This structure is the "VHT capabilities element" as
 * described in 802.11ac D3.0 8.4.2.160
 * @vht_cap_info: VHT capability info
 * @supp_mcs: VHT MCS supported rates
 *
 * The packed size is 12 bytes: 4 bytes of capability info followed by
 * the 8-byte &struct ieee80211_vht_mcs_info.
 */
struct ieee80211_vht_cap {
        __le32 vht_cap_info;
        struct ieee80211_vht_mcs_info supp_mcs;
} __packed;

/**
 * enum ieee80211_vht_chanwidth - VHT channel width
 * @IEEE80211_VHT_CHANWIDTH_USE_HT: use the HT operation IE to
 *        determine the channel width (20 or 40 MHz)
 * @IEEE80211_VHT_CHANWIDTH_80MHZ: 80 MHz bandwidth
 * @IEEE80211_VHT_CHANWIDTH_160MHZ: 160 MHz bandwidth
 * @IEEE80211_VHT_CHANWIDTH_80P80MHZ: 80+80 MHz bandwidth
 *
 * NOTE(review): presumably these are the encodings carried in the
 * chan_width octet of &struct ieee80211_vht_operation - confirm.
 */
enum ieee80211_vht_chanwidth {
        IEEE80211_VHT_CHANWIDTH_USE_HT                = 0,
        IEEE80211_VHT_CHANWIDTH_80MHZ                = 1,
        IEEE80211_VHT_CHANWIDTH_160MHZ                = 2,
        IEEE80211_VHT_CHANWIDTH_80P80MHZ        = 3,
};

/**
 * struct ieee80211_vht_operation - VHT operation IE
 *
 * This structure is the "VHT operation element" as
 * described in 802.11ac D3.0 8.4.2.161
 * @chan_width: Operating channel width
 * @center_freq_seg0_idx: center freq segment 0 index
 * @center_freq_seg1_idx: center freq segment 1 index
 * @basic_mcs_set: VHT Basic MCS rate set
 *
 * The packed size is 5 bytes, so @basic_mcs_set sits at an odd offset
 * (3) and is not naturally aligned.
 */
struct ieee80211_vht_operation {
        u8 chan_width;
        u8 center_freq_seg0_idx;
        u8 center_freq_seg1_idx;
        __le16 basic_mcs_set;
} __packed;

/**
 * struct ieee80211_he_cap_elem - HE capabilities element
 * @mac_cap_info: HE MAC Capabilities Information
 * @phy_cap_info: HE PHY Capabilities Information
 *
 * This structure represents the fixed fields of the payload of the
 * "HE capabilities element" as described in IEEE Std 802.11ax-2021
 * sections 9.4.2.248.2 and 9.4.2.248.3.
 *
 * The packed size is 17 bytes (6 MAC + 11 PHY capability octets).
 */
struct ieee80211_he_cap_elem {
        u8 mac_cap_info[6];
        u8 phy_cap_info[11];
} __packed;

/* NOTE(review): the unit and the users of this limit are not visible here */
#define IEEE80211_TX_RX_MCS_NSS_DESC_MAX_LEN        5

/**
 * enum ieee80211_he_mcs_support - HE MCS support definitions
 * @IEEE80211_HE_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the
 *        number of streams
 * @IEEE80211_HE_MCS_SUPPORT_0_9: MCSes 0-9 are supported
 * @IEEE80211_HE_MCS_SUPPORT_0_11: MCSes 0-11 are supported
 * @IEEE80211_HE_MCS_NOT_SUPPORTED: This number of streams isn't supported
 *
 * These definitions are used in each 2-bit subfield of the rx_mcs_*
 * and tx_mcs_* fields of &struct ieee80211_he_mcs_nss_supp, which are
 * both split into 8 subfields by number of streams. These values indicate
 * which MCSes are supported for the number of streams the value appears
 * for.
 *
 * Since "not supported" is 3 (both bits set), a map in which no stream
 * count is supported reads 0xffff.
 */
enum ieee80211_he_mcs_support {
        IEEE80211_HE_MCS_SUPPORT_0_7        = 0,
        IEEE80211_HE_MCS_SUPPORT_0_9        = 1,
        IEEE80211_HE_MCS_SUPPORT_0_11        = 2,
        IEEE80211_HE_MCS_NOT_SUPPORTED        = 3,
};

/**
 * struct ieee80211_he_mcs_nss_supp - HE Tx/Rx HE MCS NSS Support Field
 *
 * This structure holds the data required for the Tx/Rx HE MCS NSS Support Field
 * described in P802.11ax_D2.0 section 9.4.2.237.4
 *
 * Each map uses the 2-bit encoding of &enum ieee80211_he_mcs_support.
 * The packed size is 12 bytes (six 16-bit maps).
 *
 * @rx_mcs_80: Rx MCS map 2 bits for each stream, total 8 streams, for channel
 *     widths less than or equal to 80MHz.
 * @tx_mcs_80: Tx MCS map 2 bits for each stream, total 8 streams, for channel
 *     widths less than or equal to 80MHz.
 * @rx_mcs_160: Rx MCS map 2 bits for each stream, total 8 streams, for channel
 *     width 160MHz.
 * @tx_mcs_160: Tx MCS map 2 bits for each stream, total 8 streams, for channel
 *     width 160MHz.
 * @rx_mcs_80p80: Rx MCS map 2 bits for each stream, total 8 streams, for
 *     channel width 80p80MHz.
 * @tx_mcs_80p80: Tx MCS map 2 bits for each stream, total 8 streams, for
 *     channel width 80p80MHz.
 */
struct ieee80211_he_mcs_nss_supp {
        __le16 rx_mcs_80;
        __le16 tx_mcs_80;
        __le16 rx_mcs_160;
        __le16 tx_mcs_160;
        __le16 rx_mcs_80p80;
        __le16 tx_mcs_80p80;
} __packed;

/**
 * struct ieee80211_he_operation - HE Operation element
 * @he_oper_params: HE Operation Parameters + BSS Color Information
 * @he_mcs_nss_set: Basic HE-MCS And NSS Set
 * @optional: Optional fields VHT Operation Information, Max Co-Hosted
 *            BSSID Indicator, and 6 GHz Operation Information
 *
 * This structure represents the payload of the "HE Operation
 * element" as described in IEEE Std 802.11ax-2021 section 9.4.2.249.
 *
 * The fixed part is 6 bytes; @optional is a flexible array member and
 * does not contribute to sizeof().
 */
struct ieee80211_he_operation {
        __le32 he_oper_params;
        __le16 he_mcs_nss_set;
        u8 optional[];
} __packed;

/**
 * struct ieee80211_he_spr - Spatial Reuse Parameter Set element
 * @he_sr_control: SR Control
 * @optional: Optional fields Non-SRG OBSS PD Max Offset, SRG OBSS PD
 *            Min Offset, SRG OBSS PD Max Offset, SRG BSS Color
 *            Bitmap, and SRG Partial BSSID Bitmap
 *
 * This structure represents the payload of the "Spatial Reuse
 * Parameter Set element" as described in IEEE Std 802.11ax-2021
 * section 9.4.2.252.
 *
 * The fixed part is the single SR Control octet; @optional is a
 * flexible array member and does not contribute to sizeof().
 */
struct ieee80211_he_spr {
        u8 he_sr_control;
        u8 optional[];
} __packed;

/**
 * struct ieee80211_he_mu_edca_param_ac_rec - MU AC Parameter Record field
 * @aifsn: ACI/AIFSN
 * @ecw_min_max: ECWmin/ECWmax
 * @mu_edca_timer: MU EDCA Timer
 *
 * This structure represents the "MU AC Parameter Record" as described
 * in IEEE Std 802.11ax-2021 section 9.4.2.251, Figure 9-788p.
 *
 * Each record is three octets long.
 */
struct ieee80211_he_mu_edca_param_ac_rec {
        u8 aifsn;
        u8 ecw_min_max;
        u8 mu_edca_timer;
} __packed;

/**
 * struct ieee80211_mu_edca_param_set - MU EDCA Parameter Set element
 * @mu_qos_info: QoS Info
 * @ac_be: MU AC_BE Parameter Record
 * @ac_bk: MU AC_BK Parameter Record
 * @ac_vi: MU AC_VI Parameter Record
 * @ac_vo: MU AC_VO Parameter Record
 *
 * This structure represents the payload of the "MU EDCA Parameter Set
 * element" as described in IEEE Std 802.11ax-2021 section 9.4.2.251.
 *
 * The packed size is 13 bytes: one QoS Info octet plus four 3-octet
 * records, in the order BE, BK, VI, VO.
 */
struct ieee80211_mu_edca_param_set {
        u8 mu_qos_info;
        struct ieee80211_he_mu_edca_param_ac_rec ac_be;
        struct ieee80211_he_mu_edca_param_ac_rec ac_bk;
        struct ieee80211_he_mu_edca_param_ac_rec ac_vi;
        struct ieee80211_he_mu_edca_param_ac_rec ac_vo;
} __packed;

/*
 * Nibble masks for one octet of the EHT max-NSS structures below:
 * the low nibble is the Rx value, the high nibble the Tx value.
 */
#define IEEE80211_EHT_MCS_NSS_RX 0x0f
#define IEEE80211_EHT_MCS_NSS_TX 0xf0

/**
 * struct ieee80211_eht_mcs_nss_supp_20mhz_only - EHT 20MHz only station max
 * supported NSS for per MCS.
 *
 * For each field below, bits 0 - 3 indicate the maximal number of spatial
 * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams
 * for Tx.
 *
 * The named fields and @rx_tx_max_nss share storage through an anonymous
 * union, so rx_tx_max_nss[0] is @rx_tx_mcs7_max_nss and so on. The
 * structure is four octets long.
 *
 * @rx_tx_mcs7_max_nss: indicates the maximum number of spatial streams
 *     supported for reception and the maximum number of spatial streams
 *     supported for transmission for MCS 0 - 7.
 * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams
 *     supported for reception and the maximum number of spatial streams
 *     supported for transmission for MCS 8 - 9.
 * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams
 *     supported for reception and the maximum number of spatial streams
 *     supported for transmission for MCS 10 - 11.
 * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams
 *     supported for reception and the maximum number of spatial streams
 *     supported for transmission for MCS 12 - 13.
 * @rx_tx_max_nss: array of the previous fields for easier loop access
 */
struct ieee80211_eht_mcs_nss_supp_20mhz_only {
        union {
                struct {
                        u8 rx_tx_mcs7_max_nss;
                        u8 rx_tx_mcs9_max_nss;
                        u8 rx_tx_mcs11_max_nss;
                        u8 rx_tx_mcs13_max_nss;
                };
                u8 rx_tx_max_nss[4];
        };
};

/**
 * struct ieee80211_eht_mcs_nss_supp_bw - EHT max supported NSS per MCS (except
 * 20MHz only stations).
 *
 * For each field below, bits 0 - 3 indicate the maximal number of spatial
 * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams
 * for Tx.
 *
 * The named fields and @rx_tx_max_nss share storage through an anonymous
 * union, so rx_tx_max_nss[0] is @rx_tx_mcs9_max_nss and so on. The
 * structure is three octets long.
 *
 * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams
 *     supported for reception and the maximum number of spatial streams
 *     supported for transmission for MCS 0 - 9.
 * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams
 *     supported for reception and the maximum number of spatial streams
 *     supported for transmission for MCS 10 - 11.
 * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams
 *     supported for reception and the maximum number of spatial streams
 *     supported for transmission for MCS 12 - 13.
 * @rx_tx_max_nss: array of the previous fields for easier loop access
 */
struct ieee80211_eht_mcs_nss_supp_bw {
        union {
                struct {
                        u8 rx_tx_mcs9_max_nss;
                        u8 rx_tx_mcs11_max_nss;
                        u8 rx_tx_mcs13_max_nss;
                };
                u8 rx_tx_max_nss[3];
        };
};

/**
 * struct ieee80211_eht_cap_elem_fixed - EHT capabilities fixed data
 *
 * This structure is the "EHT Capabilities element" fixed fields as
 * described in P802.11be_D2.0 section 9.4.2.313.
 *
 * The packed size is 11 bytes (2 MAC + 9 PHY capability octets).
 *
 * @mac_cap_info: MAC capabilities, see IEEE80211_EHT_MAC_CAP*
 * @phy_cap_info: PHY capabilities, see IEEE80211_EHT_PHY_CAP*
 */
struct ieee80211_eht_cap_elem_fixed {
        u8 mac_cap_info[2];
        u8 phy_cap_info[9];
} __packed;

/**
 * struct ieee80211_eht_cap_elem - EHT capabilities element
 * @fixed: fixed parts, see &ieee80211_eht_cap_elem_fixed
 * @optional: optional parts
 *
 * @optional is a flexible array member, so sizeof() of this structure
 * covers only @fixed.
 */
struct ieee80211_eht_cap_elem {
        struct ieee80211_eht_cap_elem_fixed fixed;

        /*
         * Followed by:
         * Supported EHT-MCS And NSS Set field: 4, 3, 6 or 9 octets.
         * EHT PPE Thresholds field: variable length.
         */
        u8 optional[];
} __packed;

/*
 * Bits of the params octet of struct ieee80211_eht_operation: four
 * single-bit flags (bits 0-3) and a two-bit field at bits 4-5.
 */
#define IEEE80211_EHT_OPER_INFO_PRESENT                                0x01
#define IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT        0x02
#define IEEE80211_EHT_OPER_EHT_DEF_PE_DURATION                        0x04
#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_LIMIT         0x08
#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_EXP_MASK      0x30

/**
 * struct ieee80211_eht_operation - eht operation element
 *
 * This structure is the "EHT Operation Element" fields as
 * described in P802.11be_D2.0 section 9.4.2.311
 *
 * The fixed part is 5 bytes (one @params octet plus the four-octet
 * @basic_mcs_nss); @optional is a flexible array member.
 *
 * @params: EHT operation element parameters. See &IEEE80211_EHT_OPER_*
 * @basic_mcs_nss: indicates the EHT-MCSs for each number of spatial streams in
 *     EHT PPDUs that are supported by all EHT STAs in the BSS in transmit and
 *     receive.
 * @optional: optional parts
 */
struct ieee80211_eht_operation {
        u8 params;
        struct ieee80211_eht_mcs_nss_supp_20mhz_only basic_mcs_nss;
        u8 optional[];
} __packed;

/**
 * struct ieee80211_eht_operation_info - eht operation information
 *
 * The fixed part is three octets; @optional is a flexible array member.
 *
 * @control: EHT operation information control.
 * @ccfs0: defines a channel center frequency for a 20, 40, 80, 160, or 320 MHz
 *     EHT BSS.
 * @ccfs1: defines a channel center frequency for a 160 or 320 MHz EHT BSS.
 * @optional: optional parts
 */
struct ieee80211_eht_operation_info {
        u8 control;
        u8 ccfs0;
        u8 ccfs1;
        u8 optional[];
} __packed;

/*
 * 802.11ac VHT Capabilities
 *
 * Bit layout of the 32-bit value, as given by the masks below:
 *   bits  0-1   max MPDU length (values 0-2)
 *   bits  2-3   supported channel width
 *   bits  4-7   RX LDPC, short GI 80, short GI 160, TX STBC
 *   bits  8-10  RX STBC (values 1-4 are named)
 *   bits 11-12  SU beamformer / beamformee capable
 *   bits 13-15  beamformee STS
 *   bits 16-18  sounding dimensions
 *   bits 19-22  MU beamformer, MU beamformee, VHT TXOP PS, +HTC-VHT
 *   bits 23-25  max A-MPDU length exponent
 *   bits 26-27  link adaptation (values 2 and 3 are named)
 *   bits 28-29  RX / TX antenna pattern
 *   bits 30-31  extended NSS BW
 */
#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895                        0x00000000
#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991                        0x00000001
#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454                        0x00000002
#define IEEE80211_VHT_CAP_MAX_MPDU_MASK                                0x00000003
#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ                0x00000004
#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ        0x00000008
#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK                        0x0000000C
#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_SHIFT                        2
#define IEEE80211_VHT_CAP_RXLDPC                                0x00000010
#define IEEE80211_VHT_CAP_SHORT_GI_80                                0x00000020
#define IEEE80211_VHT_CAP_SHORT_GI_160                                0x00000040
#define IEEE80211_VHT_CAP_TXSTBC                                0x00000080
#define IEEE80211_VHT_CAP_RXSTBC_1                                0x00000100
#define IEEE80211_VHT_CAP_RXSTBC_2                                0x00000200
#define IEEE80211_VHT_CAP_RXSTBC_3                                0x00000300
#define IEEE80211_VHT_CAP_RXSTBC_4                                0x00000400
#define IEEE80211_VHT_CAP_RXSTBC_MASK                                0x00000700
#define IEEE80211_VHT_CAP_RXSTBC_SHIFT                                8
#define IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE                        0x00000800
#define IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE                        0x00001000
#define IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT                  13
#define IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK                        \
                (7 << IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT)
#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT                16
#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK                \
                (7 << IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT)
#define IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE                        0x00080000
#define IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE                        0x00100000
#define IEEE80211_VHT_CAP_VHT_TXOP_PS                                0x00200000
#define IEEE80211_VHT_CAP_HTC_VHT                                0x00400000
#define IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT        23
#define IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK        \
                (7 << IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT)
#define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_UNSOL_MFB        0x08000000
#define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB        0x0c000000
#define IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN                        0x10000000
#define IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN                        0x20000000
#define IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT                        30
#define IEEE80211_VHT_CAP_EXT_NSS_BW_MASK                        0xc0000000

/**
 * ieee80211_get_vht_max_nss - return max NSS for a given bandwidth/MCS
 * @cap: VHT capabilities of the peer
 * @bw: bandwidth to use
 * @mcs: MCS index to use
 * @ext_nss_bw_capable: indicates whether or not the local transmitter
 *        (rate scaling algorithm) can deal with the new logic
 *        (dot11VHTExtendedNSSBWCapable)
 * @max_vht_nss: current maximum NSS as advertised by the STA in
 *        operating mode notification, can be 0 in which case the
 *        capability data will be used to derive this (from MCS support)
 *
 * Due to the VHT Extended NSS Bandwidth Support, the maximum NSS can
 * vary for a given BW/MCS. This function parses the data.
 *
 * Only the prototype lives here; the definition is elsewhere.
 *
 * Note: This function is exported by cfg80211.
 *
 * Return: The maximum NSS that can be used for the given bandwidth/MCS
 *        combination
 */
int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
                              enum ieee80211_vht_chanwidth bw,
                              int mcs, bool ext_nss_bw_capable,
                              unsigned int max_vht_nss);

/**
 * enum ieee80211_ap_reg_power - regulatory power for an Access Point
 *
 * @IEEE80211_REG_UNSET_AP: Access Point has no regulatory power mode
 * @IEEE80211_REG_LPI_AP: Indoor Access Point
 * @IEEE80211_REG_SP_AP: Standard power Access Point
 * @IEEE80211_REG_VLP_AP: Very low power Access Point
 * @IEEE80211_REG_AP_POWER_AFTER_LAST: internal
 * @IEEE80211_REG_AP_POWER_MAX: maximum value
 *
 * New modes must be added before @IEEE80211_REG_AP_POWER_AFTER_LAST so
 * that @IEEE80211_REG_AP_POWER_MAX keeps tracking the last real value
 * (currently equal to @IEEE80211_REG_VLP_AP).
 */
enum ieee80211_ap_reg_power {
        IEEE80211_REG_UNSET_AP,
        IEEE80211_REG_LPI_AP,
        IEEE80211_REG_SP_AP,
        IEEE80211_REG_VLP_AP,
        IEEE80211_REG_AP_POWER_AFTER_LAST,
        IEEE80211_REG_AP_POWER_MAX =
                IEEE80211_REG_AP_POWER_AFTER_LAST - 1,
};

/**
 * enum ieee80211_client_reg_power - regulatory power for a client
 *
 * @IEEE80211_REG_UNSET_CLIENT: Client has no regulatory power mode
 * @IEEE80211_REG_DEFAULT_CLIENT: Default Client
 * @IEEE80211_REG_SUBORDINATE_CLIENT: Subordinate Client
 * @IEEE80211_REG_CLIENT_POWER_AFTER_LAST: internal
 * @IEEE80211_REG_CLIENT_POWER_MAX: maximum value
 *
 * New modes must be added before @IEEE80211_REG_CLIENT_POWER_AFTER_LAST
 * so that @IEEE80211_REG_CLIENT_POWER_MAX keeps tracking the last real
 * value (currently equal to @IEEE80211_REG_SUBORDINATE_CLIENT).
 */
enum ieee80211_client_reg_power {
        IEEE80211_REG_UNSET_CLIENT,
        IEEE80211_REG_DEFAULT_CLIENT,
        IEEE80211_REG_SUBORDINATE_CLIENT,
        IEEE80211_REG_CLIENT_POWER_AFTER_LAST,
        IEEE80211_REG_CLIENT_POWER_MAX =
                IEEE80211_REG_CLIENT_POWER_AFTER_LAST - 1,
};

/*
 * 802.11ax HE MAC capabilities
 *
 * The CAPn part of each name is the index of the HE MAC capabilities byte
 * the constant applies to. A constant with a single bit set is a flag. A
 * run of constants followed by a *_MASK constant lists the possible values
 * of one multi-bit field, already shifted into the bit position covered by
 * that mask.
 */
/* Byte 0: three flags (bits 0-2), DYNAMIC_FRAG field (bits 3-4) and
 * MAX_NUM_FRAG_MSDU field (bits 5-7).
 */
#define IEEE80211_HE_MAC_CAP0_HTC_HE                                0x01
#define IEEE80211_HE_MAC_CAP0_TWT_REQ                                0x02
#define IEEE80211_HE_MAC_CAP0_TWT_RES                                0x04
#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_NOT_SUPP                0x00
#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_1                0x08
#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_2                0x10
#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_3                0x18
#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK                        0x18
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_1                0x00
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_2                0x20
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_4                0x40
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_8                0x60
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_16                0x80
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_32                0xa0
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_64                0xc0
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_UNLIMITED        0xe0
#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_MASK                0xe0

/* Byte 1: MIN_FRAG_SIZE field (bits 0-1), TF_MAC_PAD_DUR field (bits 2-3;
 * the encoding 0x0c has no named constant here) and MULTI_TID_AGG_RX_QOS
 * field (bits 4-6, encoded as the QoS count minus one).
 */
#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_UNLIMITED                0x00
#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_128                        0x01
#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_256                        0x02
#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_512                        0x03
#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_MASK                0x03
#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_0US                0x00
#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_8US                0x04
#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US                0x08
#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_MASK                0x0c
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_1                0x00
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_2                0x10
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_3                0x20
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_4                0x30
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_5                0x40
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_6                0x50
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_7                0x60
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8                0x70
#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_MASK                0x70

/* Link adaptation is a two-bit value split between bytes HE_MAC_CAP1
 * (bit 7) and HE_MAC_CAP2 (bit 0). It should be set only if
 * IEEE80211_HE_MAC_CAP0_HTC_HE is set, in which case the following values
 * apply:
 * 0 = No feedback.
 * 1 = reserved.
 * 2 = Unsolicited feedback.
 * 3 = both
 */
#define IEEE80211_HE_MAC_CAP1_LINK_ADAPTATION                        0x80

/* Byte 2: second link adaptation bit (bit 0) followed by seven
 * single-bit flags (bits 1-7).
 */
#define IEEE80211_HE_MAC_CAP2_LINK_ADAPTATION                        0x01
#define IEEE80211_HE_MAC_CAP2_ALL_ACK                                0x02
#define IEEE80211_HE_MAC_CAP2_TRS                                0x04
#define IEEE80211_HE_MAC_CAP2_BSR                                0x08
#define IEEE80211_HE_MAC_CAP2_BCAST_TWT                                0x10
#define IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP                        0x20
#define IEEE80211_HE_MAC_CAP2_MU_CASCADING                        0x40
#define IEEE80211_HE_MAC_CAP2_ACK_EN                                0x80

/* Byte 3: flags in bits 1-2 and 5-7; bit 0 has no named constant here. */
#define IEEE80211_HE_MAC_CAP3_OMI_CONTROL                        0x02
#define IEEE80211_HE_MAC_CAP3_OFDMA_RA                                0x04

/* The maximum length of an A-MPDU is defined by the combination of the Maximum
 * A-MPDU Length Exponent field in the HT capabilities, VHT capabilities and the
 * same field in the HE capabilities.
 *
 * The HE part is the two-bit extension field in bits 3-4 of byte 3; the
 * EXP_EXT_n constants below are its values 0..3 shifted into place.
 */
#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_0                0x00
#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_1                0x08
#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2                0x10
#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3                0x18
#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK                0x18
#define IEEE80211_HE_MAC_CAP3_AMSDU_FRAG                        0x20
#define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED                        0x40
#define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS                0x80

/* Byte 4: seven single-bit flags (bits 0-6) plus the first Multi TID agg
 * TX bit (bit 7).
 */
#define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG                0x01
#define IEEE80211_HE_MAC_CAP4_QTP                                0x02
#define IEEE80211_HE_MAC_CAP4_BQR                                0x04
#define IEEE80211_HE_MAC_CAP4_PSR_RESP                                0x08
#define IEEE80211_HE_MAC_CAP4_NDP_FB_REP                        0x10
#define IEEE80211_HE_MAC_CAP4_OPS                                0x20
#define IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU                        0x40
/* Multi TID agg TX is split between byte #4 and #5
 * The value is a combination of B39,B40,B41, i.e. bit 7 of byte 4 and
 * bits 0-1 of byte 5 when bits are numbered across the whole field.
 */
#define IEEE80211_HE_MAC_CAP4_MULTI_TID_AGG_TX_QOS_B39                0x80

/* Byte 5: remaining Multi TID agg TX bits (bits 0-1), then single-bit flags. */
#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B40                0x01
#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B41                0x02
#define IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECTIVE_TRANSMISSION        0x04
#define IEEE80211_HE_MAC_CAP5_UL_2x996_TONE_RU                        0x08
#define IEEE80211_HE_MAC_CAP5_OM_CTRL_UL_MU_DATA_DIS_RX                0x10
#define IEEE80211_HE_MAC_CAP5_HE_DYNAMIC_SM_PS                        0x20
#define IEEE80211_HE_MAC_CAP5_PUNCTURED_SOUNDING                0x40
#define IEEE80211_HE_MAC_CAP5_HT_VHT_TRIG_FRAME_RX                0x80

/* Maximum A-MPDU factor constants for the VHT, HT and 6 GHz cases.
 * NOTE(review): presumably base-2 exponents of a maximum A-MPDU length;
 * nothing in this part of the file uses them - confirm against the callers.
 */
#define IEEE80211_HE_VHT_MAX_AMPDU_FACTOR        20
#define IEEE80211_HE_HT_MAX_AMPDU_FACTOR        16
#define IEEE80211_HE_6GHZ_MAX_AMPDU_FACTOR        13

/*
 * 802.11ax HE PHY capabilities
 *
 * IEEE80211_HE_PHY_CAPn_* constants apply to phy_cap_info[n] of
 * struct ieee80211_he_cap_elem (see ieee80211_he_mcs_nss_size(), which
 * tests the CAP0 channel width bits against phy_cap_info[0]).
 */
/* Byte 0: channel width set. MASK_ALL is the OR of the four width flags
 * (bits 1-4); MASK additionally covers the RU mapping flags and bit 7
 * (bits 1-7). Bit 0 has no named constant here.
 */
#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G                0x02
#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G        0x04
#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G                0x08
#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G        0x10
#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL                0x1e

#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G        0x20
#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G        0x40
#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK                        0xfe

/* Byte 1: preamble puncturing RX flags (bits 0-3, covered by the mask),
 * then single-bit flags.
 */
#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ        0x01
#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ        0x02
#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ        0x04
#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ        0x08
#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK                        0x0f
#define IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A                                0x10
#define IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD                        0x20
#define IEEE80211_HE_PHY_CAP1_HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US                0x40
/* Midamble RX/TX Max NSTS is split between PHY capability bytes 1 and 2
 * (zero-based, matching the CAP1/CAP2 constants): bit 7 of byte 1 and
 * bit 0 of byte 2.
 */
#define IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS                        0x80

/* Byte 2: second midamble bit (bit 0), then single-bit flags. */
#define IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_TX_MAX_NSTS                        0x01
#define IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US                        0x02
#define IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ                        0x04
#define IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ                        0x08
#define IEEE80211_HE_PHY_CAP2_DOPPLER_TX                                0x10
#define IEEE80211_HE_PHY_CAP2_DOPPLER_RX                                0x20

/* Note that the meaning of UL MU below is different between an AP and a non-AP
 * sta, where in the AP case it indicates support for Rx and in the non-AP sta
 * case it indicates support for Tx.
 */
#define IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO                        0x40
#define IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO                        0x80

/* Byte 3: DCM_MAX_CONST_TX field (bits 0-1), DCM_MAX_TX_NSS (bit 2),
 * DCM_MAX_CONST_RX field (bits 3-4), DCM_MAX_RX_NSS (bit 5), then two
 * single-bit flags. Field values are already shifted into position.
 */
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM                        0x00
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK                        0x01
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK                        0x02
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM                        0x03
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK                        0x03
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_1                                0x00
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_2                                0x04
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM                        0x00
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK                        0x08
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK                        0x10
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM                        0x18
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK                        0x18
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_1                                0x00
#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_2                                0x20
#define IEEE80211_HE_PHY_CAP3_RX_PARTIAL_BW_SU_IN_20MHZ_MU                0x40
#define IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER                                0x80

/* Byte 4: two flags (bits 0-1) and two three-bit Max STS fields. */
#define IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE                                0x01
#define IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER                                0x02

/* Minimal allowed value of Max STS under 80MHz is 3.
 * Field in bits 2-4; the _N constant is the field value N - 1 shifted into
 * place, so _4 (0x0c) is field value 3.
 */
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_4                0x0c
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_5                0x10
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_6                0x14
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_7                0x18
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_8                0x1c
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_MASK        0x1c

/* Minimal allowed value of Max STS above 80MHz is 3.
 * Field in bits 5-7, encoded the same way as the under-80MHz field.
 */
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_4                0x60
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_5                0x80
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_6                0xa0
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_7                0xc0
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_8                0xe0
#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_MASK        0xe0

/* Byte 5, bits 0-2: NUM_SND_DIM_UNDER_80MHZ field; the _N constant is the
 * field value N - 1.
 */
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_1        0x00
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_2        0x01
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_3        0x02
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_4        0x03
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_5        0x04
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_6        0x05
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_7        0x06
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_8        0x07
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_MASK        0x07

/* Byte 5, bits 3-5: NUM_SND_DIM_ABOVE_80MHZ field; the _N constant is the
 * field value N - 1 shifted left by 3.
 */
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_1        0x00
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2        0x08
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_3        0x10
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_4        0x18
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_5        0x20
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_6        0x28
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_7        0x30
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_8        0x38
#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK        0x38

/* Byte 5, bits 6-7: single-bit flags. */
#define IEEE80211_HE_PHY_CAP5_NG16_SU_FEEDBACK                                0x40
#define IEEE80211_HE_PHY_CAP5_NG16_MU_FEEDBACK                                0x80

/* Byte 6: eight single-bit flags. PPE_THRESHOLD_PRESENT is the bit that
 * ieee80211_he_ppe_size() and ieee80211_he_capa_size_ok() test to decide
 * whether a PPE Thresholds field follows.
 */
#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_42_SU                        0x01
#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU                        0x02
#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMING_FB                        0x04
#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB                0x08
#define IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB                                0x10
#define IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE                        0x20
#define IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO                0x40
#define IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT                        0x80

/* Byte 7: three flags (bits 0-2), MAX_NC field (bits 3-5; the _N constant
 * is N shifted left by 3) and two STBC flags (bits 6-7).
 */
#define IEEE80211_HE_PHY_CAP7_PSR_BASED_SR                                0x01
#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP                        0x02
#define IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI                0x04
#define IEEE80211_HE_PHY_CAP7_MAX_NC_1                                        0x08
#define IEEE80211_HE_PHY_CAP7_MAX_NC_2                                        0x10
#define IEEE80211_HE_PHY_CAP7_MAX_NC_3                                        0x18
#define IEEE80211_HE_PHY_CAP7_MAX_NC_4                                        0x20
#define IEEE80211_HE_PHY_CAP7_MAX_NC_5                                        0x28
#define IEEE80211_HE_PHY_CAP7_MAX_NC_6                                        0x30
#define IEEE80211_HE_PHY_CAP7_MAX_NC_7                                        0x38
#define IEEE80211_HE_PHY_CAP7_MAX_NC_MASK                                0x38
#define IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ                        0x40
#define IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ                        0x80

/* Byte 8: six flags (bits 0-5) and the DCM_MAX_RU field (bits 6-7,
 * values already shifted into position).
 */
#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_PPDU_4XLTF_AND_08_US_GI                0x01
#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G                0x02
#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_160MHZ_HE_PPDU                        0x04
#define IEEE80211_HE_PHY_CAP8_80MHZ_IN_160MHZ_HE_PPDU                        0x08
#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_1XLTF_AND_08_US_GI                0x10
#define IEEE80211_HE_PHY_CAP8_MIDAMBLE_RX_TX_2X_AND_1XLTF                0x20
#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_242                                0x00
#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_484                                0x40
#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_996                                0x80
#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_2x996                                0xc0
#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK                                0xc0

/* Byte 9: six flags (bits 0-5) and the NOMINAL_PKT_PADDING field (bits 6-7).
 * Unlike most fields in this section, the NOMINAL_PKT_PADDING_* values
 * (0x0..0x3) are NOT pre-shifted: shift them left by
 * IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_POS to place them under
 * IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_MASK.
 */
#define IEEE80211_HE_PHY_CAP9_LONGER_THAN_16_SIGB_OFDM_SYM                0x01
#define IEEE80211_HE_PHY_CAP9_NON_TRIGGERED_CQI_FEEDBACK                0x02
#define IEEE80211_HE_PHY_CAP9_TX_1024_QAM_LESS_THAN_242_TONE_RU                0x04
#define IEEE80211_HE_PHY_CAP9_RX_1024_QAM_LESS_THAN_242_TONE_RU                0x08
#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB        0x10
#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB        0x20
#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_0US                        0x0
#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_8US                        0x1
#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_16US                        0x2
#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_RESERVED                0x3
#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_POS                        6
#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_MASK                        0xc0

/* Byte 10: only bit 0 has a named constant here. */
#define IEEE80211_HE_PHY_CAP10_HE_MU_M1RU_MAX_LTF                        0x01

/*
 * 802.11ax HE TX/RX MCS NSS Support
 *
 * Bit positions and masks within a 16-bit value: the TX bitmap occupies
 * bits 6-10 (0x07c0) and the RX bitmap bits 11-15 (0xf800), matching the
 * *_TX_BITMAP_POS / *_RX_BITMAP_POS shift counts. HIGHEST_MCS_POS is the
 * shift count for the highest-MCS subfield (no mask is defined for it
 * here).
 */
#define IEEE80211_TX_RX_MCS_NSS_SUPP_HIGHEST_MCS_POS                        (3)
#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_POS                        (6)
#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_POS                        (11)
#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_MASK                        0x07c0
#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_MASK                        0xf800

/*
 * Encoding of the Highest MCS subfield of the TX/RX HE MCS Support field.
 * The encoded value is the MCS index minus 7.
 */
enum ieee80211_he_highest_mcs_supported_subfield_enc {
        HIGHEST_MCS_SUPPORTED_MCS7  = 0,
        HIGHEST_MCS_SUPPORTED_MCS8  = 1,
        HIGHEST_MCS_SUPPORTED_MCS9  = 2,
        HIGHEST_MCS_SUPPORTED_MCS10 = 3,
        HIGHEST_MCS_SUPPORTED_MCS11 = 4,
};

/*
 * Size in bytes of the Tx/Rx HE MCS NSS Support field of an 802.11ax HE
 * capabilities IE.
 *
 * The field is always at least 4 bytes long; 4 more bytes are present for
 * each of the 160 MHz and 80+80 MHz channel width bits set in
 * he_cap->phy_cap_info[0], giving 4, 8 or 12.
 */
static inline u8
ieee80211_he_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap)
{
        const u8 width_set = he_cap->phy_cap_info[0];
        u8 size = 4;

        if (width_set & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G)
                size += 4;

        if (width_set &
            IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G)
                size += 4;

        return size;
}

/*
 * 802.11ax HE PPE Thresholds
 *
 * Layout of the first (header) byte of the PPE Thresholds field as used by
 * ieee80211_he_ppe_size(): an NSS value in bits 0-2 (NSS_MASK/NSS_POS) and
 * an RU index bitmask in bits 3-6 (RU_INDEX_BITMASK_MASK/_POS).
 * IEEE80211_PPE_THRES_RU_INDEX_BITMASK_2x966_AND_966_RU selects the two
 * upper bits of that bitmask.
 * IEEE80211_PPE_THRES_INFO_PPET_SIZE is the size in bits of one PPET value
 * and IEEE80211_HE_PPE_THRES_INFO_HEADER_SIZE the size in bits of the
 * header (3 NSS bits + 4 RU index bits).
 */
#define IEEE80211_PPE_THRES_NSS_SUPPORT_2NSS                        (1)
#define IEEE80211_PPE_THRES_NSS_POS                                (0)
#define IEEE80211_PPE_THRES_NSS_MASK                                (7)
#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_2x966_AND_966_RU        \
        (BIT(5) | BIT(6))
#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK                0x78
#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS                (3)
#define IEEE80211_PPE_THRES_INFO_PPET_SIZE                        (3)
#define IEEE80211_HE_PPE_THRES_INFO_HEADER_SIZE                        (7)

/*
 * Size in bytes of the PPE Thresholds field of an 802.11ax HE capabilities
 * IE.
 *
 * @ppe_thres_hdr: first (header) byte of the PPE Thresholds field
 * @phy_cap_info: the HE capabilities IE's PHY capability bytes
 *
 * Returns 0 when phy_cap_info[6] does not advertise a PPE Thresholds field.
 * Otherwise the field holds 7 header bits followed by one pair of 3-bit
 * values (6 bits) for every combination of a set RU index bit and a spatial
 * stream (NSS subfield + 1 streams); the total is rounded up to whole bytes.
 */
static inline u8
ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info)
{
        u8 ru_cnt, nss_cnt, bits;

        if (!(phy_cap_info[6] & IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT))
                return 0;

        /* number of RU sizes for which thresholds are present */
        ru_cnt = hweight8(ppe_thres_hdr &
                          IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK);

        /* the NSS subfield stores the stream count minus one */
        nss_cnt = 1 + ((ppe_thres_hdr & IEEE80211_PPE_THRES_NSS_MASK) >>
                       IEEE80211_PPE_THRES_NSS_POS);

        /* at most 4 * 8 pairs of 6 bits plus 7 header bits: fits in a u8 */
        bits = ru_cnt * nss_cnt * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2 + 7;

        return DIV_ROUND_UP(bits, 8);
}

/*
 * Check that an HE capabilities element body of @len bytes at @data is long
 * enough for everything it claims to contain: the fixed
 * struct ieee80211_he_cap_elem, the Tx/Rx HE MCS NSS Support field and,
 * when PHY capability byte 6 announces it, the PPE Thresholds field.
 *
 * Each length check is made before the bytes it covers are read.
 * Returns true if the element is large enough, false otherwise.
 */
static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len)
{
        const struct ieee80211_he_cap_elem *cap = (const void *)data;
        u8 min_len = sizeof(*cap);

        if (len < min_len)
                return false;

        min_len += ieee80211_he_mcs_nss_size(cap);
        if (len < min_len)
                return false;

        /* without PPE thresholds there is nothing more to account for */
        if (!(cap->phy_cap_info[6] &
              IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT))
                return true;

        /* the PPE header byte itself must be inside the element */
        if (len <= min_len)
                return false;

        min_len += ieee80211_he_ppe_size(data[min_len], cap->phy_cap_info);

        return len >= min_len;
}

/*
 * HE Operation defines
 *
 * Masks, flags and shift counts for the 32-bit he_oper_params value of
 * struct ieee80211_he_operation (read with le32_to_cpu() by
 * ieee80211_he_oper_size() and ieee80211_he_6ghz_oper()):
 *   bits 0-2   default PE duration
 *   bit  3     TWT required
 *   bits 4-13  RTS threshold (shift: ..._RTS_THRESHOLD_OFFSET)
 *   bit  14    VHT operation info present
 *   bit  15    co-hosted BSS
 *   bit  16    ER SU disable
 *   bit  17    6 GHz operation info present
 *   bits 24-29 BSS color (shift: ..._BSS_COLOR_OFFSET)
 *   bit  30    partial BSS color
 *   bit  31    BSS color disabled
 */
#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK                0x00000007
#define IEEE80211_HE_OPERATION_TWT_REQUIRED                        0x00000008
#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK                0x00003ff0
#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET                4
#define IEEE80211_HE_OPERATION_VHT_OPER_INFO                        0x00004000
#define IEEE80211_HE_OPERATION_CO_HOSTED_BSS                        0x00008000
#define IEEE80211_HE_OPERATION_ER_SU_DISABLE                        0x00010000
#define IEEE80211_HE_OPERATION_6GHZ_OP_INFO                        0x00020000
#define IEEE80211_HE_OPERATION_BSS_COLOR_MASK                        0x3f000000
#define IEEE80211_HE_OPERATION_BSS_COLOR_OFFSET                        24
#define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR                0x40000000
#define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED                0x80000000

/* 6 GHz regulatory AP type values.
 * NOTE(review): presumably the values carried in the
 * IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO field of
 * struct ieee80211_he_6ghz_oper's control byte - confirm against users.
 */
#define IEEE80211_6GHZ_CTRL_REG_LPI_AP                0
#define IEEE80211_6GHZ_CTRL_REG_SP_AP                1
#define IEEE80211_6GHZ_CTRL_REG_VLP_AP                2
#define IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP        3
#define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP        4

/**
 * struct ieee80211_he_6ghz_oper - HE 6 GHz operation Information field
 * @primary: primary channel
 * @control: control flags; channel width in bits 0-1
 *        (%IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH), duplicate beacon flag in
 *        bit 2 and regulatory info in bits 3-5
 * @ccfs0: channel center frequency segment 0
 * @ccfs1: channel center frequency segment 1
 * @minrate: minimum rate (in 1 Mbps units)
 *
 * Five bytes, packed. ieee80211_he_6ghz_oper() returns a pointer to this
 * structure inside the optional part of an HE operation element.
 */
struct ieee80211_he_6ghz_oper {
        u8 primary;
/* masks for @control; the CHANWIDTH_* values are unshifted field values */
#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH        0x3
#define                IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ        0
#define                IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ        1
#define                IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ        2
#define                IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ        3
#define IEEE80211_HE_6GHZ_OPER_CTRL_DUP_BEACON        0x4
#define IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO        0x38
        u8 control;
        u8 ccfs0;
        u8 ccfs1;
        u8 minrate;
} __packed;

/*
 * Transmit power interpretation type of the transmit power envelope
 * element. These are the values of the IEEE80211_TX_PWR_ENV_INFO_INTERPRET
 * field that ieee80211_valid_tpe_element() accepts.
 */
enum ieee80211_tx_power_intrpt_type {
        IEEE80211_TPE_LOCAL_EIRP          = 0,
        IEEE80211_TPE_LOCAL_EIRP_PSD      = 1,
        IEEE80211_TPE_REG_CLIENT_EIRP     = 2,
        IEEE80211_TPE_REG_CLIENT_EIRP_PSD = 3,
};

/*
 * Category type of the transmit power envelope element: the values of the
 * IEEE80211_TX_PWR_ENV_INFO_CATEGORY field that
 * ieee80211_valid_tpe_element() accepts.
 */
enum ieee80211_tx_power_category_6ghz {
        IEEE80211_TPE_CAT_6GHZ_DEFAULT     = 0,        /* category value 0 */
        IEEE80211_TPE_CAT_6GHZ_SUBORDINATE = 1,        /* category value 1 */
};

/*
 * For IEEE80211_TPE_LOCAL_EIRP / IEEE80211_TPE_REG_CLIENT_EIRP,
 * setting to 63.5 dBm means no constraint.
 * The raw value 127 corresponds to 63.5 dBm, i.e. the field is in units of
 * 0.5 dBm.
 */
#define IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT        127

/*
 * For IEEE80211_TPE_LOCAL_EIRP_PSD / IEEE80211_TPE_REG_CLIENT_EIRP_PSD,
 * setting to 127 indicates no PSD limit for the 20 MHz channel.
 */
#define IEEE80211_TPE_PSD_NO_LIMIT                127

/**
 * struct ieee80211_tx_pwr_env - Transmit Power Envelope
 * @info: Transmit Power Information field; holds the count, interpretation
 *        and category subfields (see the IEEE80211_TX_PWR_ENV_INFO_* masks)
 * @variable: Maximum Transmit Power field; its length depends on @info and
 *        is checked by ieee80211_valid_tpe_element()
 *
 * This structure represents the payload of the "Transmit Power
 * Envelope element" as described in IEEE Std 802.11ax-2021 section
 * 9.4.2.161
 */
struct ieee80211_tx_pwr_env {
        u8 info;
        u8 variable[];
} __packed;

/* Subfield masks for ieee80211_tx_pwr_env.info: count in bits 0-2,
 * interpretation in bits 3-5, category in bits 6-7.
 */
#define IEEE80211_TX_PWR_ENV_INFO_COUNT 0x7
#define IEEE80211_TX_PWR_ENV_INFO_INTERPRET 0x38
#define IEEE80211_TX_PWR_ENV_INFO_CATEGORY 0xC0

/* Mask (low 4 bits) for the extension count byte that may follow the PSD
 * values; see ieee80211_valid_tpe_element().
 */
#define IEEE80211_TX_PWR_ENV_EXT_COUNT        0xF

/*
 * Validate the body of a Transmit Power Envelope element.
 *
 * @data: element body, starting at the Transmit Power Information byte
 * @len: number of bytes available at @data
 *
 * The element is accepted when its category is one of the known 6 GHz
 * categories, its interpretation is one of the four known types, its count
 * is within the range allowed for that interpretation, and @len covers all
 * the power values the count implies. For the PSD interpretations any
 * trailing bytes must additionally form a complete extension (one count
 * byte plus that many values).
 *
 * Returns true if the element is well-formed, false otherwise.
 */
static inline bool ieee80211_valid_tpe_element(const u8 *data, u8 len)
{
        const struct ieee80211_tx_pwr_env *tpe = (const void *)data;
        u8 min_len = sizeof(*tpe);
        u8 cnt, intrpt, cat;
        u8 N; /* number of PSD values, also called N in the spec */
        u8 K; /* number of extension PSD values */

        if (len < min_len)
                return false;

        cnt = u8_get_bits(tpe->info, IEEE80211_TX_PWR_ENV_INFO_COUNT);
        intrpt = u8_get_bits(tpe->info, IEEE80211_TX_PWR_ENV_INFO_INTERPRET);
        cat = u8_get_bits(tpe->info, IEEE80211_TX_PWR_ENV_INFO_CATEGORY);

        /* only the two defined 6 GHz categories are acceptable */
        if (cat != IEEE80211_TPE_CAT_6GHZ_DEFAULT &&
            cat != IEEE80211_TPE_CAT_6GHZ_SUBORDINATE)
                return false;

        switch (intrpt) {
        case IEEE80211_TPE_LOCAL_EIRP:
        case IEEE80211_TPE_REG_CLIENT_EIRP:
                if (cnt > 3)
                        return false;

                /* count == 0 encodes 1 value for 20 MHz, etc. */
                min_len += cnt + 1;

                /*
                 * Extra bytes are fine: there can be extension fields not
                 * accounted for in the count.
                 */
                return len >= min_len;
        case IEEE80211_TPE_LOCAL_EIRP_PSD:
        case IEEE80211_TPE_REG_CLIENT_EIRP_PSD:
                if (cnt > 4)
                        return false;

                /* counts 0 and 1 both mean one value, then 2, 4, 8 */
                N = cnt ? 1 << (cnt - 1) : 1;
                min_len += N;

                if (len < min_len)
                        return false;
                if (len == min_len)
                        return true;

                /* trailing data: extension count byte plus K values */
                K = u8_get_bits(tpe->variable[N],
                                IEEE80211_TX_PWR_ENV_EXT_COUNT);
                min_len += 1 + K;

                return len >= min_len;
        }

        /* unknown interpretation */
        return false;
}

/*
 * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size
 * @he_oper_ie: byte data of the HE Operations IE, starting from the byte
 *        after the ext ID byte. It is assumed that he_oper_ie has at least
 *        sizeof(struct ieee80211_he_operation) bytes, the caller must have
 *        validated this.
 * @return the size of the IE data: the fixed part, every optional field
 *        flagged in he_oper_params (3 bytes of VHT operation info, 1 byte
 *        for a co-hosted BSS, the 6 GHz operation info) plus one byte for
 *        the extension ID; 0 if @he_oper_ie is NULL
 */
static inline u8
ieee80211_he_oper_size(const u8 *he_oper_ie)
{
        const struct ieee80211_he_operation *oper;
        u32 params;
        u8 size;

        if (!he_oper_ie)
                return 0;

        oper = (const void *)he_oper_ie;
        params = le32_to_cpu(oper->he_oper_params);

        /* fixed part plus the first byte (extension ID) */
        size = sizeof(struct ieee80211_he_operation) + 1;

        if (params & IEEE80211_HE_OPERATION_VHT_OPER_INFO)
                size += 3;
        if (params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS)
                size += 1;
        if (params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO)
                size += sizeof(struct ieee80211_he_6ghz_oper);

        return size;
}

/**
 * ieee80211_he_6ghz_oper - obtain 6 GHz operation field
 * @he_oper: HE operation element (must be pre-validated for size)
 *        but may be %NULL
 *
 * The 6 GHz operation field sits in the optional part of the element,
 * after the 3-byte VHT operation info and the 1-byte co-hosted BSS field
 * when those are flagged as present in he_oper_params.
 *
 * Return: a pointer to the 6 GHz operation field, or %NULL if @he_oper is
 * %NULL or the element does not carry 6 GHz operation info
 */
static inline const struct ieee80211_he_6ghz_oper *
ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper)
{
        const u8 *pos;
        u32 params;

        if (!he_oper)
                return NULL;

        params = le32_to_cpu(he_oper->he_oper_params);
        if (!(params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO))
                return NULL;

        /* skip the optional fields that precede the 6 GHz info */
        pos = (const void *)&he_oper->optional;
        if (params & IEEE80211_HE_OPERATION_VHT_OPER_INFO)
                pos += 3;
        if (params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS)
                pos += 1;

        return (const void *)pos;
}

/*
 * HE Spatial Reuse defines
 *
 * Bit flags (bits 0-4) of the one-byte he_sr_control field of
 * struct ieee80211_he_spr. ieee80211_he_spr_size() uses the two *_PRESENT
 * flags to account for the optional fields of the element.
 */
#define IEEE80211_HE_SPR_PSR_DISALLOWED                                BIT(0)
#define IEEE80211_HE_SPR_NON_SRG_OBSS_PD_SR_DISALLOWED                BIT(1)
#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT                        BIT(2)
#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT                BIT(3)
#define IEEE80211_HE_SPR_HESIGA_SR_VAL15_ALLOWED                BIT(4)

/*
 * ieee80211_he_spr_size - calculate 802.11ax HE Spatial Reuse IE size
 * @he_spr_ie: byte data of the HE Spatial Reuse IE, starting from the byte
 *        after the ext ID byte. It is assumed that he_spr_ie has at least
 *        sizeof(struct ieee80211_he_spr) bytes, the caller must have validated
 *        this
 * @return the size of the IE data: the fixed part, 1 byte if the non-SRG
 *        offset is present, 18 bytes if SRG information is present, plus
 *        one byte for the extension ID; 0 if @he_spr_ie is NULL
 */
static inline u8
ieee80211_he_spr_size(const u8 *he_spr_ie)
{
        const struct ieee80211_he_spr *spr;
        u8 ctrl;
        u8 size;

        if (!he_spr_ie)
                return 0;

        spr = (const void *)he_spr_ie;
        ctrl = spr->he_sr_control;

        /* fixed part plus the first byte (extension ID) */
        size = sizeof(struct ieee80211_he_spr) + 1;

        if (ctrl & IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT)
                size += 1;
        if (ctrl & IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT)
                size += 18;

        return size;
}

/*
 * S1G Capabilities Information field
 *
 * S1G_CAPn_* constants apply to byte n of the field (S1G_SUPP_CH_WIDTH_MAX()
 * applies S1G_CAP0_SUPP_CH_WIDTH to cap[0]). BIT() entries are flags,
 * GENMASK() entries are multi-bit fields meant for FIELD_GET()/FIELD_PREP().
 */
/* Length of the field. NOTE(review): presumably in bytes - confirm. */
#define IEEE80211_S1G_CAPABILITY_LEN        15

/* Byte 0: six flags (bits 0-5) and the supported channel width field
 * (bits 6-7).
 */
#define S1G_CAP0_S1G_LONG        BIT(0)
#define S1G_CAP0_SGI_1MHZ        BIT(1)
#define S1G_CAP0_SGI_2MHZ        BIT(2)
#define S1G_CAP0_SGI_4MHZ        BIT(3)
#define S1G_CAP0_SGI_8MHZ        BIT(4)
#define S1G_CAP0_SGI_16MHZ        BIT(5)
#define S1G_CAP0_SUPP_CH_WIDTH        GENMASK(7, 6)

/*
 * Values of the S1G_CAP0_SUPP_CH_WIDTH field. The number in each name is
 * the channel width the value stands for (presumably in MHz, going by the
 * S1G_CAP0_SGI_*MHZ names): width = 2 << value.
 */
#define S1G_SUPP_CH_WIDTH_2        0
#define S1G_SUPP_CH_WIDTH_4        1
#define S1G_SUPP_CH_WIDTH_8        2
#define S1G_SUPP_CH_WIDTH_16        3
/*
 * Widest supported channel width (2, 4, 8 or 16) decoded from byte 0 of the
 * S1G capabilities array @cap.
 *
 * @cap is parenthesized before being subscripted so that any pointer
 * expression, e.g. S1G_SUPP_CH_WIDTH_MAX(elem + 2), indexes the whole
 * expression rather than only its last operand.
 */
#define S1G_SUPP_CH_WIDTH_MAX(cap) ((1 << FIELD_GET(S1G_CAP0_SUPP_CH_WIDTH, \
                                                    (cap)[0])) << 1)

/* Byte 1: five flags (bits 0-4) and a 3-bit field (bits 5-7). */
#define S1G_CAP1_RX_LDPC        BIT(0)
#define S1G_CAP1_TX_STBC        BIT(1)
#define S1G_CAP1_RX_STBC        BIT(2)
#define S1G_CAP1_SU_BFER        BIT(3)
#define S1G_CAP1_SU_BFEE        BIT(4)
#define S1G_CAP1_BFEE_STS        GENMASK(7, 5)

/* Byte 2: 3-bit field (bits 0-2), three flags, 2-bit field (bits 6-7). */
#define S1G_CAP2_SOUNDING_DIMENSIONS        GENMASK(2, 0)
#define S1G_CAP2_MU_BFER                BIT(3)
#define S1G_CAP2_MU_BFEE                BIT(4)
#define S1G_CAP2_PLUS_HTC_VHT                BIT(5)
#define S1G_CAP2_TRAVELING_PILOT        GENMASK(7, 6)

/* Byte 3: three flags, 2-bit field (bits 3-4), 3-bit field (bits 5-7). */
#define S1G_CAP3_RD_RESPONDER                BIT(0)
#define S1G_CAP3_HT_DELAYED_BA                BIT(1)
#define S1G_CAP3_MAX_MPDU_LEN                BIT(2)
#define S1G_CAP3_MAX_AMPDU_LEN_EXP        GENMASK(4, 3)
#define S1G_CAP3_MIN_MPDU_START                GENMASK(7, 5)

/* Byte 4: six flags (bits 0-5) and a 2-bit field (bits 6-7). */
#define S1G_CAP4_UPLINK_SYNC        BIT(0)
#define S1G_CAP4_DYNAMIC_AID        BIT(1)
#define S1G_CAP4_BAT                BIT(2)
#define S1G_CAP4_TIME_ADE        BIT(3)
#define S1G_CAP4_NON_TIM        BIT(4)
#define S1G_CAP4_GROUP_AID        BIT(5)
#define S1G_CAP4_STA_TYPE        GENMASK(7, 6)

/* Byte 5: six flags (bits 0-5) and a 2-bit field (bits 6-7). */
#define S1G_CAP5_CENT_AUTH_CONTROL        BIT(0)
#define S1G_CAP5_DIST_AUTH_CONTROL        BIT(1)
#define S1G_CAP5_AMSDU                        BIT(2)
#define S1G_CAP5_AMPDU                        BIT(3)
#define S1G_CAP5_ASYMMETRIC_BA                BIT(4)
#define S1G_CAP5_FLOW_CONTROL                BIT(5)
#define S1G_CAP5_SECTORIZED_BEAM        GENMASK(7, 6)

/* Byte 6: six flags (bits 0-5) and a 2-bit field (bits 6-7). */
#define S1G_CAP6_OBSS_MITIGATION        BIT(0)
#define S1G_CAP6_FRAGMENT_BA                BIT(1)
#define S1G_CAP6_NDP_PS_POLL                BIT(2)
#define S1G_CAP6_RAW_OPERATION                BIT(3)
#define S1G_CAP6_PAGE_SLICING                BIT(4)
#define S1G_CAP6_TXOP_SHARING_IMP_ACK        BIT(5)
#define S1G_CAP6_VHT_LINK_ADAPT                GENMASK(7, 6)

/* Byte 7: eight single-bit flags. */
#define S1G_CAP7_TACK_AS_PS_POLL                BIT(0)
#define S1G_CAP7_DUP_1MHZ                        BIT(1)
#define S1G_CAP7_MCS_NEGOTIATION                BIT(2)
#define S1G_CAP7_1MHZ_CTL_RESPONSE_PREAMBLE        BIT(3)
#define S1G_CAP7_NDP_BFING_REPORT_POLL                BIT(4)
#define S1G_CAP7_UNSOLICITED_DYN_AID                BIT(5)
#define S1G_CAP7_SECTOR_TRAINING_OPERATION        BIT(6)
#define S1G_CAP7_TEMP_PS_MODE_SWITCH                BIT(7)

/* Byte 8: two flags, 3-bit field (bits 2-4), three flags. */
#define S1G_CAP8_TWT_GROUPING        BIT(0)
#define S1G_CAP8_BDT                BIT(1)
#define S1G_CAP8_COLOR                GENMASK(4, 2)
#define S1G_CAP8_TWT_REQUEST        BIT(5)
#define S1G_CAP8_TWT_RESPOND        BIT(6)
#define S1G_CAP8_PV1_FRAME        BIT(7)

/* Byte 9: only bit 0 has a named constant here. */
#define S1G_CAP9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0)

/* S1G operation channel width: a flag in bit 0 and a 4-bit field in
 * bits 1-4.
 * NOTE(review): the structure member these apply to is not visible in this
 * part of the file - confirm against users.
 */
#define S1G_OPER_CH_WIDTH_PRIMARY_1MHZ        BIT(0)
#define S1G_OPER_CH_WIDTH_OPER                GENMASK(4, 1)

/* EHT MAC capabilities as defined in P802.11be_D2.0 section 9.4.2.313.2
 *
 * CAP0: six single-bit flags (bits 0-5) and the MAX_MPDU_LEN field in
 * bits 6-7. The indented MAX_MPDU_LEN_* constants are unshifted field
 * values (0..2); they must be shifted into place under
 * IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_MASK before being combined with the
 * flags.
 */
#define IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS                        0x01
#define IEEE80211_EHT_MAC_CAP0_OM_CONTROL                        0x02
#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1                0x04
#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE2                0x08
#define IEEE80211_EHT_MAC_CAP0_RESTRICTED_TWT                        0x10
#define IEEE80211_EHT_MAC_CAP0_SCS_TRAFFIC_DESC                        0x20
#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_MASK                0xc0
#define        IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_3895                0
#define        IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_7991                1
#define        IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_11454                2

/* CAP1: only bit 0 has a named constant here. */
#define IEEE80211_EHT_MAC_CAP1_MAX_AMPDU_LEN_MASK                0x01

/* EHT PHY capabilities as defined in P802.11be_D2.0 section 9.4.2.313.3
 *
 * The CAPn part of each name is the index of the capability byte. Fields
 * that straddle a byte boundary have one *_MASK constant per byte.
 * Bit 0 of CAP0 has no named constant here.
 */
#define IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ                        0x02
#define IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ                0x04
#define IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI                0x08
#define IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO                0x10
#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER                        0x20
#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE                        0x40

/* EHT beamformee number of spatial streams <= 80MHz is split:
 * bit 7 of byte 0 and bits 0-1 of byte 1.
 */
#define IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK                0x80
#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK                0x03

/* Byte 1, bits 2-4 (160 MHz) and bits 5-7 (320 MHz). */
#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK        0x1c
#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK        0xe0

/* Byte 2, bits 0-2 (80 MHz) and bits 3-5 (160 MHz). */
#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK                0x07
#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK                0x38

/* EHT number of sounding dimensions for 320MHz is split:
 * bits 6-7 of byte 2 and bit 0 of byte 3. The rest of byte 3 is
 * single-bit flags.
 */
#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK                0xc0
#define IEEE80211_EHT_PHY_CAP3_SOUNDING_DIM_320MHZ_MASK                0x01
#define IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK                0x02
#define IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK                0x04
#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK                0x08
#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK                0x10
#define IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK                        0x20
#define IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK                0x40
#define IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK                        0x80

/* Byte 4: four flags (bits 0-3) and the MAX_NC field (bits 4-7). */
#define IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO                0x01
#define IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP                        0x02
#define IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP                0x04
#define IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI        0x08
#define IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK                        0xf0

/* Byte 5: four flags (bits 0-3) and the COMMON_NOMINAL_PKT_PAD field in
 * bits 4-5. The indented PKT_PAD_* constants are unshifted field values
 * (0..3), not masks.
 */
#define IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK                0x01
#define IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP                0x02
#define IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP                0x04
#define IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT                0x08
#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK        0x30
#define   IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_0US        0
#define   IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_8US        1
#define   IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_16US        2
#define   IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_20US        3

/* Maximum number of supported EHT LTF is split:
 * bits 6-7 of byte 5 and bits 0-2 of byte 6.
 * IEEE80211_EHT_PHY_CAP5_SUPP_EXTRA_EHT_LTF (0x40) is the lower of the two
 * byte-5 bits covered by that mask.
 */
#define IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK        0xc0
#define IEEE80211_EHT_PHY_CAP5_SUPP_EXTRA_EHT_LTF                0x40
#define IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK        0x07

/* Byte 6, MCS15 support: bit 3 (80 MHz), bits 4-5 (160 MHz) and
 * bit 6 (320 MHz).
 */
#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ                        0x08
#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ                0x30
#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ                0x40
#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK                        0x78
#define IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP                0x80

#define IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW        0x01
#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ        0x02
#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ        0x04
#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ        0x08
#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ                0x10
#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ                0x20
#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ                0x40
#define IEEE80211_EHT_PHY_CAP7_TB_SOUNDING_FDBK_RATE_LIMIT        0x80

#define IEEE80211_EHT_PHY_CAP8_RX_1024QAM_WIDER_BW_DL_OFDMA        0x01
#define IEEE80211_EHT_PHY_CAP8_RX_4096QAM_WIDER_BW_DL_OFDMA        0x02

/*
 * EHT operation channel width as defined in P802.11be_D2.0 section 9.4.2.311
 */
#define IEEE80211_EHT_OPER_CHAN_WIDTH                0x7
#define IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ        0
#define IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ        1
#define IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ        2
#define IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ        3
#define IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ        4

/*
 * Size, in octets, of the Tx/Rx EHT-MCS/NSS support field of an 802.11be
 * EHT capabilities element.
 *
 * @he_cap: HE capabilities; phy_cap_info[0] supplies the channel width bits
 * @eht_cap: fixed part of the EHT capabilities; phy_cap_info[0] supplies the
 *        320 MHz bit
 * @from_ap: selects the size used when no width bit is set (3 if true,
 *        4 otherwise)
 *
 * Returns 3 as soon as 40 MHz in 2.4 GHz is advertised. Otherwise returns 3
 * for each of the 40/80 MHz, 160 MHz and 320 MHz bits that is set, or the
 * @from_ap dependent fallback when none of them is.
 */
static inline u8
ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap,
                           const struct ieee80211_eht_cap_elem_fixed *eht_cap,
                           bool from_ap)
{
        const u8 he_phy0 = he_cap->phy_cap_info[0];
        u8 size = 0;

        /* 2.4 GHz with 40 MHz support: the answer is fixed */
        if (he_phy0 & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G)
                return 3;

        /* these width bits are reserved (expected 0) on 2.4 GHz */
        if (he_phy0 & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G)
                size += 3;
        if (he_phy0 & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G)
                size += 3;
        if (eht_cap->phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ)
                size += 3;

        /* no width bit at all: size depends on who sent the element */
        if (!size)
                size = from_ap ? 3 : 4;

        return size;
}

/* 802.11be EHT PPE Thresholds */
#define IEEE80211_EHT_PPE_THRES_NSS_POS                        0
#define IEEE80211_EHT_PPE_THRES_NSS_MASK                0xf
#define IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK        0x1f0
#define IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE                3
#define IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE        9

/*
 * Calculate the size, in octets, of the PPE thresholds field of an 802.11be
 * EHT capabilities element.
 *
 * @ppe_thres_hdr: first 16 bits of the PPE thresholds field (NSS count and
 *        RU index bitmask)
 * @phy_cap_info: EHT PHY capability bytes; byte 5 says whether the field is
 *        present at all
 *
 * Returns 0 when the PPE thresholds are not present.
 */
static inline u8
ieee80211_eht_ppe_size(u16 ppe_thres_hdr, const u8 *phy_cap_info)
{
        u32 num_ru, num_nss, total_bits;

        if (!(phy_cap_info[5] & IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT))
                return 0;

        /* one entry per RU bit set, for each of (NSS field + 1) streams */
        num_ru = hweight16(ppe_thres_hdr &
                           IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK);
        num_nss = 1 + u16_get_bits(ppe_thres_hdr,
                                   IEEE80211_EHT_PPE_THRES_NSS_MASK);

        /* 9 header bits, then a pair of 3-bit values (6 bits) per entry */
        total_bits = num_ru * num_nss *
                     IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE * 2 +
                     IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE;

        return DIV_ROUND_UP(total_bits, 8);
}

/*
 * Check that an EHT capabilities element body is long enough for the fields
 * it advertises.
 *
 * @he_capa: HE capabilities of the same sender (required; NULL fails)
 * @data: EHT capabilities element body
 * @len: number of octets available at @data
 * @from_ap: passed through to ieee80211_eht_mcs_nss_size()
 *
 * The required length is the fixed part, plus the MCS/NSS field, plus the
 * PPE thresholds field when PHY capability byte 5 says it is present.
 */
static inline bool
ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len,
                           bool from_ap)
{
        const struct ieee80211_eht_cap_elem_fixed *fixed = (const void *)data;
        u8 needed = sizeof(*fixed);
        u16 ppe_thres_hdr;

        if (!he_capa || len < needed)
                return false;

        needed += ieee80211_eht_mcs_nss_size((const void *)he_capa,
                                             (const void *)data,
                                             from_ap);
        if (len < needed)
                return false;

        /* nothing optional follows: the length was already verified */
        if (!(fixed->phy_cap_info[5] &
              IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT))
                return true;

        /* the 16-bit PPE header must be readable before it is parsed */
        if (len < needed + sizeof(ppe_thres_hdr))
                return false;

        ppe_thres_hdr = get_unaligned_le16(data + needed);
        needed += ieee80211_eht_ppe_size(ppe_thres_hdr, fixed->phy_cap_info);

        return len >= needed;
}

/*
 * Check that an EHT operation element body is long enough for the optional
 * parts announced in its params field.
 *
 * @data: EHT operation element body
 * @len: number of octets available at @data
 *
 * On top of the fixed structure, 3 octets are required when the operation
 * information is present, and 2 more when it also carries the disabled
 * subchannel bitmap.
 */
static inline bool
ieee80211_eht_oper_size_ok(const u8 *data, u8 len)
{
        const struct ieee80211_eht_operation *oper = (const void *)data;
        u8 needed = sizeof(*oper);

        /* params may only be inspected once the fixed part is known to fit */
        if (len < needed)
                return false;

        if (!(oper->params & IEEE80211_EHT_OPER_INFO_PRESENT))
                return true;

        needed += 3;
        if (oper->params &
            IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)
                needed += 2;

        return len >= needed;
}

/*
 * Read the disabled subchannel bitmap from an EHT operation element.
 *
 * Returns 0 unless both the operation information and the bitmap are flagged
 * as present in @eht_oper->params. No length checking is done here: the
 * caller must have validated the element with ieee80211_eht_oper_size_ok()
 * first.
 */
static inline u16
ieee80211_eht_oper_dis_subchan_bitmap(const struct ieee80211_eht_operation *eht_oper)
{
        const struct ieee80211_eht_operation_info *info;

        if (!(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT) ||
            !(eht_oper->params &
              IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT))
                return 0;

        /* the bitmap is a little-endian 16-bit value in info->optional */
        info = (const void *)eht_oper->optional;

        return get_unaligned_le16(info->optional);
}

#define IEEE80211_BW_IND_DIS_SUBCH_PRESENT        BIT(1)

/*
 * Fixed part of a bandwidth indication element: one params octet followed by
 * an EHT operation information structure. Packed, so it can be overlaid on
 * raw element data.
 */
struct ieee80211_bandwidth_indication {
        /* flag bits, e.g. IEEE80211_BW_IND_DIS_SUBCH_PRESENT */
        u8 params;
        struct ieee80211_eht_operation_info info;
} __packed;

/*
 * Check that a bandwidth indication element body is long enough.
 *
 * @data: bandwidth indication element body
 * @len: number of octets available at @data
 *
 * The fixed structure must always fit; when params has
 * IEEE80211_BW_IND_DIS_SUBCH_PRESENT set, 2 further octets are required.
 */
static inline bool
ieee80211_bandwidth_indication_size_ok(const u8 *data, u8 len)
{
        const struct ieee80211_bandwidth_indication *bwi = (const void *)data;

        /* params is only read once the fixed part is known to fit */
        if (len < sizeof(*bwi))
                return false;

        if (!(bwi->params & IEEE80211_BW_IND_DIS_SUBCH_PRESENT))
                return true;

        return len >= sizeof(*bwi) + 2;
}

#define LISTEN_INT_USF        GENMASK(15, 14)
#define LISTEN_INT_UI        GENMASK(13, 0)

#define IEEE80211_MAX_USF        FIELD_MAX(LISTEN_INT_USF)
#define IEEE80211_MAX_UI        FIELD_MAX(LISTEN_INT_UI)

/* Authentication algorithms */
#define WLAN_AUTH_OPEN 0
#define WLAN_AUTH_SHARED_KEY 1
#define WLAN_AUTH_FT 2
#define WLAN_AUTH_SAE 3
#define WLAN_AUTH_FILS_SK 4
#define WLAN_AUTH_FILS_SK_PFS 5
#define WLAN_AUTH_FILS_PK 6
#define WLAN_AUTH_LEAP 128

#define WLAN_AUTH_CHALLENGE_LEN 128

#define WLAN_CAPABILITY_ESS                (1<<0)
#define WLAN_CAPABILITY_IBSS                (1<<1)

/*
 * A mesh STA sets the ESS and IBSS capability bits to zero.
 * However, the same holds true for P2P probe responses (in the p2p_find
 * phase), so this test alone does not identify a mesh STA.
 */
#define WLAN_CAPABILITY_IS_STA_BSS(cap)        \
        (!((cap) & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS)))

#define WLAN_CAPABILITY_CF_POLLABLE        (1<<2)
#define WLAN_CAPABILITY_CF_POLL_REQUEST        (1<<3)
#define WLAN_CAPABILITY_PRIVACY                (1<<4)
#define WLAN_CAPABILITY_SHORT_PREAMBLE        (1<<5)
#define WLAN_CAPABILITY_PBCC                (1<<6)
#define WLAN_CAPABILITY_CHANNEL_AGILITY        (1<<7)

/* 802.11h */
#define WLAN_CAPABILITY_SPECTRUM_MGMT        (1<<8)
#define WLAN_CAPABILITY_QOS                (1<<9)
#define WLAN_CAPABILITY_SHORT_SLOT_TIME        (1<<10)
#define WLAN_CAPABILITY_APSD                (1<<11)
#define WLAN_CAPABILITY_RADIO_MEASURE        (1<<12)
#define WLAN_CAPABILITY_DSSS_OFDM        (1<<13)
#define WLAN_CAPABILITY_DEL_BACK        (1<<14)
#define WLAN_CAPABILITY_IMM_BACK        (1<<15)

/* DMG (60 GHz) 802.11ad */
/* type - bits 0..1 */
#define WLAN_CAPABILITY_DMG_TYPE_MASK                (3<<0)
#define WLAN_CAPABILITY_DMG_TYPE_IBSS                (1<<0) /* Tx by: STA */
#define WLAN_CAPABILITY_DMG_TYPE_PBSS                (2<<0) /* Tx by: PCP */
#define WLAN_CAPABILITY_DMG_TYPE_AP                (3<<0) /* Tx by: AP */

#define WLAN_CAPABILITY_DMG_CBAP_ONLY                (1<<2)
#define WLAN_CAPABILITY_DMG_CBAP_SOURCE                (1<<3)
#define WLAN_CAPABILITY_DMG_PRIVACY                (1<<4)
#define WLAN_CAPABILITY_DMG_ECPAC                (1<<5)

#define WLAN_CAPABILITY_DMG_SPECTRUM_MGMT        (1<<8)
#define WLAN_CAPABILITY_DMG_RADIO_MEASURE        (1<<12)

/* measurement */
#define IEEE80211_SPCT_MSR_RPRT_MODE_LATE        (1<<0)
#define IEEE80211_SPCT_MSR_RPRT_MODE_INCAPABLE        (1<<1)
#define IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED        (1<<2)

#define IEEE80211_SPCT_MSR_RPRT_TYPE_BASIC        0
#define IEEE80211_SPCT_MSR_RPRT_TYPE_CCA        1
#define IEEE80211_SPCT_MSR_RPRT_TYPE_RPI        2
#define IEEE80211_SPCT_MSR_RPRT_TYPE_LCI        8
#define IEEE80211_SPCT_MSR_RPRT_TYPE_CIVIC        11

/* 802.11g ERP information element */
#define WLAN_ERP_NON_ERP_PRESENT (1<<0)
#define WLAN_ERP_USE_PROTECTION (1<<1)
#define WLAN_ERP_BARKER_PREAMBLE (1<<2)

/*
 * WLAN_ERP_BARKER_PREAMBLE values: the two states of the
 * WLAN_ERP_BARKER_PREAMBLE bit (0 = short preamble, 1 = long preamble).
 */
enum {
        WLAN_ERP_PREAMBLE_SHORT = 0,
        WLAN_ERP_PREAMBLE_LONG = 1,
};

/*
 * Band ID, 802.11ad #8.4.1.45
 *
 * Identifies a frequency band; values are contiguous from 0 to 5.
 */
enum {
        IEEE80211_BANDID_TV_WS = 0, /* TV white spaces */
        IEEE80211_BANDID_SUB1  = 1, /* Sub-1 GHz (excluding TV white spaces) */
        IEEE80211_BANDID_2G    = 2, /* 2.4 GHz */
        IEEE80211_BANDID_3G    = 3, /* 3.6 GHz */
        IEEE80211_BANDID_5G    = 4, /* 4.9 and 5 GHz */
        IEEE80211_BANDID_60G   = 5, /* 60 GHz */
};

/*
 * Status codes
 *
 * Values carried in the Status Code field of management frames, grouped by
 * the amendment that introduced them (so the list is not in numeric order).
 * Some values intentionally have two names: 39 and 47 were given an
 * additional meaning by 802.11ad and are listed under both groups.
 */
enum ieee80211_statuscode {
        WLAN_STATUS_SUCCESS = 0,
        WLAN_STATUS_UNSPECIFIED_FAILURE = 1,
        WLAN_STATUS_CAPS_UNSUPPORTED = 10,
        WLAN_STATUS_REASSOC_NO_ASSOC = 11,
        WLAN_STATUS_ASSOC_DENIED_UNSPEC = 12,
        WLAN_STATUS_NOT_SUPPORTED_AUTH_ALG = 13,
        WLAN_STATUS_UNKNOWN_AUTH_TRANSACTION = 14,
        WLAN_STATUS_CHALLENGE_FAIL = 15,
        WLAN_STATUS_AUTH_TIMEOUT = 16,
        WLAN_STATUS_AP_UNABLE_TO_HANDLE_NEW_STA = 17,
        WLAN_STATUS_ASSOC_DENIED_RATES = 18,
        /* 802.11b */
        WLAN_STATUS_ASSOC_DENIED_NOSHORTPREAMBLE = 19,
        WLAN_STATUS_ASSOC_DENIED_NOPBCC = 20,
        WLAN_STATUS_ASSOC_DENIED_NOAGILITY = 21,
        /* 802.11h */
        WLAN_STATUS_ASSOC_DENIED_NOSPECTRUM = 22,
        WLAN_STATUS_ASSOC_REJECTED_BAD_POWER = 23,
        WLAN_STATUS_ASSOC_REJECTED_BAD_SUPP_CHAN = 24,
        /* 802.11g */
        WLAN_STATUS_ASSOC_DENIED_NOSHORTTIME = 25,
        WLAN_STATUS_ASSOC_DENIED_NODSSSOFDM = 26,
        /* 802.11w */
        WLAN_STATUS_ASSOC_REJECTED_TEMPORARILY = 30,
        WLAN_STATUS_ROBUST_MGMT_FRAME_POLICY_VIOLATION = 31,
        /* 802.11i */
        WLAN_STATUS_INVALID_IE = 40,
        WLAN_STATUS_INVALID_GROUP_CIPHER = 41,
        WLAN_STATUS_INVALID_PAIRWISE_CIPHER = 42,
        WLAN_STATUS_INVALID_AKMP = 43,
        WLAN_STATUS_UNSUPP_RSN_VERSION = 44,
        WLAN_STATUS_INVALID_RSN_IE_CAP = 45,
        WLAN_STATUS_CIPHER_SUITE_REJECTED = 46,
        /* 802.11e */
        WLAN_STATUS_UNSPECIFIED_QOS = 32,
        WLAN_STATUS_ASSOC_DENIED_NOBANDWIDTH = 33,
        WLAN_STATUS_ASSOC_DENIED_LOWACK = 34,
        WLAN_STATUS_ASSOC_DENIED_UNSUPP_QOS = 35,
        WLAN_STATUS_REQUEST_DECLINED = 37,
        WLAN_STATUS_INVALID_QOS_PARAM = 38,
        WLAN_STATUS_CHANGE_TSPEC = 39,
        WLAN_STATUS_WAIT_TS_DELAY = 47,
        WLAN_STATUS_NO_DIRECT_LINK = 48,
        WLAN_STATUS_STA_NOT_PRESENT = 49,
        WLAN_STATUS_STA_NOT_QSTA = 50,
        /* 802.11s */
        WLAN_STATUS_ANTI_CLOG_REQUIRED = 76,
        /*
         * Unsupported finite cyclic group is status 77 in IEEE Std 802.11;
         * it must not share value 78 with "cannot find alternative TBTT".
         */
        WLAN_STATUS_FCG_NOT_SUPP = 77,
        WLAN_STATUS_STA_NO_TBTT = 78,
        /* 802.11ad */
        WLAN_STATUS_REJECTED_WITH_SUGGESTED_CHANGES = 39, /* same as CHANGE_TSPEC */
        WLAN_STATUS_REJECTED_FOR_DELAY_PERIOD = 47, /* same as WAIT_TS_DELAY */
        WLAN_STATUS_REJECT_WITH_SCHEDULE = 83,
        WLAN_STATUS_PENDING_ADMITTING_FST_SESSION = 86,
        WLAN_STATUS_PERFORMING_FST_NOW = 87,
        WLAN_STATUS_PENDING_GAP_IN_BA_WINDOW = 88,
        WLAN_STATUS_REJECT_U_PID_SETTING = 89,
        WLAN_STATUS_REJECT_DSE_BAND = 96,
        WLAN_STATUS_DENIED_WITH_SUGGESTED_BAND_AND_CHANNEL = 99,
        WLAN_STATUS_DENIED_DUE_TO_SPECTRUM_MANAGEMENT = 103,
        /* 802.11ai: FILS status codes are 112 and 113 in IEEE Std 802.11 */
        WLAN_STATUS_FILS_AUTHENTICATION_FAILURE = 112,
        WLAN_STATUS_UNKNOWN_AUTHENTICATION_SERVER = 113,
        WLAN_STATUS_SAE_HASH_TO_ELEMENT = 126,
        WLAN_STATUS_SAE_PK = 127,
        WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING = 133,
        WLAN_STATUS_PREF_TID_TO_LINK_MAPPING_SUGGESTED = 134,
};


/*
 * Reason codes
 *
 * Grouped by the amendment that introduced them. Value 0 and several
 * intermediate values (e.g. 12, 27-31, 40-44, 46-51) have no name here.
 */
enum ieee80211_reasoncode {
        WLAN_REASON_UNSPECIFIED = 1,
        WLAN_REASON_PREV_AUTH_NOT_VALID = 2,
        WLAN_REASON_DEAUTH_LEAVING = 3,
        WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY = 4,
        WLAN_REASON_DISASSOC_AP_BUSY = 5,
        WLAN_REASON_CLASS2_FRAME_FROM_NONAUTH_STA = 6,
        WLAN_REASON_CLASS3_FRAME_FROM_NONASSOC_STA = 7,
        WLAN_REASON_DISASSOC_STA_HAS_LEFT = 8,
        WLAN_REASON_STA_REQ_ASSOC_WITHOUT_AUTH = 9,
        /* 802.11h */
        WLAN_REASON_DISASSOC_BAD_POWER = 10,
        WLAN_REASON_DISASSOC_BAD_SUPP_CHAN = 11,
        /* 802.11i */
        WLAN_REASON_INVALID_IE = 13,
        WLAN_REASON_MIC_FAILURE = 14,
        WLAN_REASON_4WAY_HANDSHAKE_TIMEOUT = 15,
        WLAN_REASON_GROUP_KEY_HANDSHAKE_TIMEOUT = 16,
        WLAN_REASON_IE_DIFFERENT = 17,
        WLAN_REASON_INVALID_GROUP_CIPHER = 18,
        WLAN_REASON_INVALID_PAIRWISE_CIPHER = 19,
        WLAN_REASON_INVALID_AKMP = 20,
        WLAN_REASON_UNSUPP_RSN_VERSION = 21,
        WLAN_REASON_INVALID_RSN_IE_CAP = 22,
        WLAN_REASON_IEEE8021X_FAILED = 23,
        WLAN_REASON_CIPHER_SUITE_REJECTED = 24,
        /* TDLS (802.11z) */
        WLAN_REASON_TDLS_TEARDOWN_UNREACHABLE = 25,
        WLAN_REASON_TDLS_TEARDOWN_UNSPECIFIED = 26,
        /* 802.11e */
        WLAN_REASON_DISASSOC_UNSPECIFIED_QOS = 32,
        WLAN_REASON_DISASSOC_QAP_NO_BANDWIDTH = 33,
        WLAN_REASON_DISASSOC_LOW_ACK = 34,
        WLAN_REASON_DISASSOC_QAP_EXCEED_TXOP = 35,
        WLAN_REASON_QSTA_LEAVE_QBSS = 36,
        WLAN_REASON_QSTA_NOT_USE = 37,
        WLAN_REASON_QSTA_REQUIRE_SETUP = 38,
        WLAN_REASON_QSTA_TIMEOUT = 39,
        WLAN_REASON_QSTA_CIPHER_NOT_SUPP = 45,
        /* 802.11s */
        WLAN_REASON_MESH_PEER_CANCELED = 52,
        WLAN_REASON_MESH_MAX_PEERS = 53,
        WLAN_REASON_MESH_CONFIG = 54,
        WLAN_REASON_MESH_CLOSE = 55,
        WLAN_REASON_MESH_MAX_RETRIES = 56,
        WLAN_REASON_MESH_CONFIRM_TIMEOUT = 57,
        WLAN_REASON_MESH_INVALID_GTK = 58,
        WLAN_REASON_MESH_INCONSISTENT_PARAM = 59,
        WLAN_REASON_MESH_INVALID_SECURITY = 60,
        WLAN_REASON_MESH_PATH_ERROR = 61,
        WLAN_REASON_MESH_PATH_NOFORWARD = 62,
        WLAN_REASON_MESH_PATH_DEST_UNREACHABLE = 63,
        WLAN_REASON_MAC_EXISTS_IN_MBSS = 64,
        WLAN_REASON_MESH_CHAN_REGULATORY = 65,
        WLAN_REASON_MESH_CHAN = 66,
};


/* Information Element IDs */
enum ieee80211_eid {
        WLAN_EID_SSID = 0,
        WLAN_EID_SUPP_RATES = 1,
        WLAN_EID_FH_PARAMS = 2, /* reserved now */
        WLAN_EID_DS_PARAMS = 3,
        WLAN_EID_CF_PARAMS = 4,
        WLAN_EID_TIM = 5,
        WLAN_EID_IBSS_PARAMS = 6,
        WLAN_EID_COUNTRY = 7,
        /* 8, 9 reserved */
        WLAN_EID_REQUEST = 10,
        WLAN_EID_QBSS_LOAD = 11,
        WLAN_EID_EDCA_PARAM_SET = 12,
        WLAN_EID_TSPEC = 13,
        WLAN_EID_TCLAS = 14,
        WLAN_EID_SCHEDULE = 15,
        WLAN_EID_CHALLENGE = 16,
        /* 17-31 reserved for challenge text extension */
        WLAN_EID_PWR_CONSTRAINT = 32,
        WLAN_EID_PWR_CAPABILITY = 33,
        WLAN_EID_TPC_REQUEST = 34,
        WLAN_EID_TPC_REPORT = 35,
        WLAN_EID_SUPPORTED_CHANNELS = 36,
        WLAN_EID_CHANNEL_SWITCH = 37,
        WLAN_EID_MEASURE_REQUEST = 38,
        WLAN_EID_MEASURE_REPORT = 39,
        WLAN_EID_QUIET = 40,
        WLAN_EID_IBSS_DFS = 41,
        WLAN_EID_ERP_INFO = 42,
        WLAN_EID_TS_DELAY = 43,
        WLAN_EID_TCLAS_PROCESSING = 44,
        WLAN_EID_HT_CAPABILITY = 45,
        WLAN_EID_QOS_CAPA = 46,
        /* 47 reserved for Broadcom */
        WLAN_EID_RSN = 48,
        WLAN_EID_802_15_COEX = 49,
        WLAN_EID_EXT_SUPP_RATES = 50,
        WLAN_EID_AP_CHAN_REPORT = 51,
        WLAN_EID_NEIGHBOR_REPORT = 52,
        WLAN_EID_RCPI = 53,
        WLAN_EID_MOBILITY_DOMAIN = 54,
        WLAN_EID_FAST_BSS_TRANSITION = 55,
        WLAN_EID_TIMEOUT_INTERVAL = 56,
        WLAN_EID_RIC_DATA = 57,
        WLAN_EID_DSE_REGISTERED_LOCATION = 58,
        WLAN_EID_SUPPORTED_REGULATORY_CLASSES = 59,
        WLAN_EID_EXT_CHANSWITCH_ANN = 60,
        WLAN_EID_HT_OPERATION = 61,
        WLAN_EID_SECONDARY_CHANNEL_OFFSET = 62,
        WLAN_EID_BSS_AVG_ACCESS_DELAY = 63,
        WLAN_EID_ANTENNA_INFO = 64,
        WLAN_EID_RSNI = 65,
        WLAN_EID_MEASUREMENT_PILOT_TX_INFO = 66,
        WLAN_EID_BSS_AVAILABLE_CAPACITY = 67,
        WLAN_EID_BSS_AC_ACCESS_DELAY = 68,
        WLAN_EID_TIME_ADVERTISEMENT = 69,
        WLAN_EID_RRM_ENABLED_CAPABILITIES = 70,
        WLAN_EID_MULTIPLE_BSSID = 71,
        WLAN_EID_BSS_COEX_2040 = 72,
        WLAN_EID_BSS_INTOLERANT_CHL_REPORT = 73,
        WLAN_EID_OVERLAP_BSS_SCAN_PARAM = 74,
        WLAN_EID_RIC_DESCRIPTOR = 75,
        WLAN_EID_MMIE = 76,
        WLAN_EID_ASSOC_COMEBACK_TIME = 77,
        WLAN_EID_EVENT_REQUEST = 78,
        WLAN_EID_EVENT_REPORT = 79,
        WLAN_EID_DIAGNOSTIC_REQUEST = 80,
        WLAN_EID_DIAGNOSTIC_REPORT = 81,
        WLAN_EID_LOCATION_PARAMS = 82,
        WLAN_EID_NON_TX_BSSID_CAP =  83,
        WLAN_EID_SSID_LIST = 84,
        WLAN_EID_MULTI_BSSID_IDX = 85,
        WLAN_EID_FMS_DESCRIPTOR = 86,
        WLAN_EID_FMS_REQUEST = 87,
        WLAN_EID_FMS_RESPONSE = 88,
        WLAN_EID_QOS_TRAFFIC_CAPA = 89,
        WLAN_EID_BSS_MAX_IDLE_PERIOD = 90,
        WLAN_EID_TSF_REQUEST = 91,
        WLAN_EID_TSF_RESPOSNE = 92,
        WLAN_EID_WNM_SLEEP_MODE = 93,
        WLAN_EID_TIM_BCAST_REQ = 94,
        WLAN_EID_TIM_BCAST_RESP = 95,
        WLAN_EID_COLL_IF_REPORT = 96,
        WLAN_EID_CHANNEL_USAGE = 97,
        WLAN_EID_TIME_ZONE = 98,
        WLAN_EID_DMS_REQUEST = 99,
        WLAN_EID_DMS_RESPONSE = 100,
        WLAN_EID_LINK_ID = 101,
        WLAN_EID_WAKEUP_SCHEDUL = 102,
        /* 103 reserved */
        WLAN_EID_CHAN_SWITCH_TIMING = 104,
        WLAN_EID_PTI_CONTROL = 105,
        WLAN_EID_PU_BUFFER_STATUS = 106,
        WLAN_EID_INTERWORKING = 107,
        WLAN_EID_ADVERTISEMENT_PROTOCOL = 108,
        WLAN_EID_EXPEDITED_BW_REQ = 109,
        WLAN_EID_QOS_MAP_SET = 110,
        WLAN_EID_ROAMING_CONSORTIUM = 111,
        WLAN_EID_EMERGENCY_ALERT = 112,
        WLAN_EID_MESH_CONFIG = 113,
        WLAN_EID_MESH_ID = 114,
        WLAN_EID_LINK_METRIC_REPORT = 115,
        WLAN_EID_CONGESTION_NOTIFICATION = 116,
        WLAN_EID_PEER_MGMT = 117,
        WLAN_EID_CHAN_SWITCH_PARAM = 118,
        WLAN_EID_MESH_AWAKE_WINDOW = 119,
        WLAN_EID_BEACON_TIMING = 120,
        WLAN_EID_MCCAOP_SETUP_REQ = 121,
        WLAN_EID_MCCAOP_SETUP_RESP = 122,
        WLAN_EID_MCCAOP_ADVERT = 123,
        WLAN_EID_MCCAOP_TEARDOWN = 124,
        WLAN_EID_GANN = 125,
        WLAN_EID_RANN = 126,
        WLAN_EID_EXT_CAPABILITY = 127,
        /* 128, 129 reserved for Agere */
        WLAN_EID_PREQ = 130,
        WLAN_EID_PREP = 131,
        WLAN_EID_PERR = 132,
        /* 133-136 reserved for Cisco */
        WLAN_EID_PXU = 137,
        WLAN_EID_PXUC = 138,
        WLAN_EID_AUTH_MESH_PEER_EXCH = 139,
        WLAN_EID_MIC = 140,
        WLAN_EID_DESTINATION_URI = 141,
        WLAN_EID_UAPSD_COEX = 142,
        WLAN_EID_WAKEUP_SCHEDULE = 143,
        WLAN_EID_EXT_SCHEDULE = 144,
        WLAN_EID_STA_AVAILABILITY = 145,
        WLAN_EID_DMG_TSPEC = 146,
        WLAN_EID_DMG_AT = 147,
        WLAN_EID_DMG_CAP = 148,
        /* 149 reserved for Cisco */
        WLAN_EID_CISCO_VENDOR_SPECIFIC = 150,
        WLAN_EID_DMG_OPERATION = 151,
        WLAN_EID_DMG_BSS_PARAM_CHANGE = 152,
        WLAN_EID_DMG_BEAM_REFINEMENT = 153,
        WLAN_EID_CHANNEL_MEASURE_FEEDBACK = 154,
        /* 155-156 reserved for Cisco */
        WLAN_EID_AWAKE_WINDOW = 157,
        WLAN_EID_MULTI_BAND = 158,
        WLAN_EID_ADDBA_EXT = 159,
        WLAN_EID_NEXT_PCP_LIST = 160,
        WLAN_EID_PCP_HANDOVER = 161,
        WLAN_EID_DMG_LINK_MARGIN = 162,
        WLAN_EID_SWITCHING_STREAM = 163,
        WLAN_EID_SESSION_TRANSITION = 164,
        WLAN_EID_DYN_TONE_PAIRING_REPORT = 165,
        WLAN_EID_CLUSTER_REPORT = 166,
        WLAN_EID_RELAY_CAP = 167,
        WLAN_EID_RELAY_XFER_PARAM_SET = 168,
        WLAN_EID_BEAM_LINK_MAINT = 169,
        WLAN_EID_MULTIPLE_MAC_ADDR = 170,
        WLAN_EID_U_PID = 171,
        WLAN_EID_DMG_LINK_ADAPT_ACK = 172,
        /* 173 reserved for Symbol */
        WLAN_EID_MCCAOP_ADV_OVERVIEW = 174,
        WLAN_EID_QUIET_PERIOD_REQ = 175,
        /* 176 reserved for Symbol */
        WLAN_EID_QUIET_PERIOD_RESP = 177,
        /* 178-179 reserved for Symbol */
        /* 180 reserved for ISO/IEC 20011 */
        WLAN_EID_EPAC_POLICY = 182,
        WLAN_EID_CLISTER_TIME_OFF = 183,
        WLAN_EID_INTER_AC_PRIO = 184,
        WLAN_EID_SCS_DESCRIPTOR = 185,
        WLAN_EID_QLOAD_REPORT = 186,
        WLAN_EID_HCCA_TXOP_UPDATE_COUNT = 187,
        WLAN_EID_HL_STREAM_ID = 188,
        WLAN_EID_GCR_GROUP_ADDR = 189,
        WLAN_EID_ANTENNA_SECTOR_ID_PATTERN = 190,
        WLAN_EID_VHT_CAPABILITY = 191,
        WLAN_EID_VHT_OPERATION = 192,
        WLAN_EID_EXTENDED_BSS_LOAD = 193,
        WLAN_EID_WIDE_BW_CHANNEL_SWITCH = 194,
        WLAN_EID_TX_POWER_ENVELOPE = 195,
        WLAN_EID_CHANNEL_SWITCH_WRAPPER = 196,
        WLAN_EID_AID = 197,
        WLAN_EID_QUIET_CHANNEL = 198,
        WLAN_EID_OPMODE_NOTIF = 199,

        WLAN_EID_REDUCED_NEIGHBOR_REPORT = 201,

        WLAN_EID_AID_REQUEST = 210,
        WLAN_EID_AID_RESPONSE = 211,
        WLAN_EID_S1G_BCN_COMPAT = 213,
        WLAN_EID_S1G_SHORT_BCN_INTERVAL = 214,
        WLAN_EID_S1G_TWT = 216,
        WLAN_EID_S1G_CAPABILITIES = 217,
        WLAN_EID_VENDOR_SPECIFIC = 221,
        WLAN_EID_QOS_PARAMETER = 222,
        WLAN_EID_S1G_OPERATION = 232,
        WLAN_EID_CAG_NUMBER = 237,
        WLAN_EID_AP_CSN = 239,
        WLAN_EID_FILS_INDICATION = 240,
        WLAN_EID_DILS = 241,
        WLAN_EID_FRAGMENT = 242,
        WLAN_EID_RSNX = 244,
        WLAN_EID_EXTENSION = 255
};

/*
 * Element ID Extensions for Element ID 255
 *
 * These values identify the element when the element ID itself is
 * WLAN_EID_EXTENSION. The numbering is sparse; only the values named here
 * are defined in this list.
 */
enum ieee80211_eid_ext {
        WLAN_EID_EXT_ASSOC_DELAY_INFO = 1,
        WLAN_EID_EXT_FILS_REQ_PARAMS = 2,
        WLAN_EID_EXT_FILS_KEY_CONFIRM = 3,
        WLAN_EID_EXT_FILS_SESSION = 4,
        WLAN_EID_EXT_FILS_HLP_CONTAINER = 5,
        WLAN_EID_EXT_FILS_IP_ADDR_ASSIGN = 6,
        WLAN_EID_EXT_KEY_DELIVERY = 7,
        WLAN_EID_EXT_FILS_WRAPPED_DATA = 8,
        WLAN_EID_EXT_FILS_PUBLIC_KEY = 12,
        WLAN_EID_EXT_FILS_NONCE = 13,
        WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE = 14,
        /* HE (802.11ax) elements */
        WLAN_EID_EXT_HE_CAPABILITY = 35,
        WLAN_EID_EXT_HE_OPERATION = 36,
        WLAN_EID_EXT_UORA = 37,
        WLAN_EID_EXT_HE_MU_EDCA = 38,
        WLAN_EID_EXT_HE_SPR = 39,
        WLAN_EID_EXT_NDP_FEEDBACK_REPORT_PARAMSET = 41,
        WLAN_EID_EXT_BSS_COLOR_CHG_ANN = 42,
        WLAN_EID_EXT_QUIET_TIME_PERIOD_SETUP = 43,
        WLAN_EID_EXT_ESS_REPORT = 45,
        WLAN_EID_EXT_OPS = 46,
        WLAN_EID_EXT_HE_BSS_LOAD = 47,
        WLAN_EID_EXT_MAX_CHANNEL_SWITCH_TIME = 52,
        WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION = 55,
        WLAN_EID_EXT_NON_INHERITANCE = 56,
        WLAN_EID_EXT_KNOWN_BSSID = 57,
        WLAN_EID_EXT_SHORT_SSID_LIST = 58,
        WLAN_EID_EXT_HE_6GHZ_CAPA = 59,
        WLAN_EID_EXT_UL_MU_POWER_CAPA = 60,
        /* EHT (802.11be) elements */
        WLAN_EID_EXT_EHT_OPERATION = 106,
        WLAN_EID_EXT_EHT_MULTI_LINK = 107,
        WLAN_EID_EXT_EHT_CAPABILITY = 108,
        WLAN_EID_EXT_TID_TO_LINK_MAPPING = 109,
        WLAN_EID_EXT_BANDWIDTH_INDICATION = 135,
};

/* Action category code */
enum ieee80211_category {
        WLAN_CATEGORY_SPECTRUM_MGMT = 0,
        WLAN_CATEGORY_QOS = 1,
        WLAN_CATEGORY_DLS = 2,
        WLAN_CATEGORY_BACK = 3,
        WLAN_CATEGORY_PUBLIC = 4,
        WLAN_CATEGORY_RADIO_MEASUREMENT = 5,
        WLAN_CATEGORY_FAST_BBS_TRANSITION = 6,
        WLAN_CATEGORY_HT = 7,
        WLAN_CATEGORY_SA_QUERY = 8,
        WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION = 9,
        WLAN_CATEGORY_WNM = 10,
        WLAN_CATEGORY_WNM_UNPROTECTED = 11,
        WLAN_CATEGORY_TDLS = 12,
        WLAN_CATEGORY_MESH_ACTION = 13,
        WLAN_CATEGORY_MULTIHOP_ACTION = 14,
        WLAN_CATEGORY_SELF_PROTECTED = 15,
        WLAN_CATEGORY_DMG = 16,
        WLAN_CATEGORY_WMM = 17,
        WLAN_CATEGORY_FST = 18,
        WLAN_CATEGORY_UNPROT_DMG = 20,
        WLAN_CATEGORY_VHT = 21,
        WLAN_CATEGORY_S1G = 22,
        WLAN_CATEGORY_PROTECTED_EHT = 37,
        WLAN_CATEGORY_VENDOR_SPECIFIC_PROTECTED = 126,
        WLAN_CATEGORY_VENDOR_SPECIFIC = 127,
};

/*
 * SPECTRUM_MGMT action code
 *
 * Contiguous values 0-4: measurement request/report, TPC request/report and
 * channel switch.
 */
enum ieee80211_spectrum_mgmt_actioncode {
        WLAN_ACTION_SPCT_MSR_REQ = 0,
        WLAN_ACTION_SPCT_MSR_RPRT = 1,
        WLAN_ACTION_SPCT_TPC_REQ = 2,
        WLAN_ACTION_SPCT_TPC_RPRT = 3,
        WLAN_ACTION_SPCT_CHL_SWITCH = 4,
};

/*
 * HT action codes
 *
 * Contiguous values 0-7.
 */
enum ieee80211_ht_actioncode {
        WLAN_HT_ACTION_NOTIFY_CHANWIDTH = 0,
        WLAN_HT_ACTION_SMPS = 1,
        WLAN_HT_ACTION_PSMP = 2,
        WLAN_HT_ACTION_PCO_PHASE = 3,
        WLAN_HT_ACTION_CSI = 4,
        WLAN_HT_ACTION_NONCOMPRESSED_BF = 5,
        WLAN_HT_ACTION_COMPRESSED_BF = 6,
        WLAN_HT_ACTION_ASEL_IDX_FEEDBACK = 7,
};

/*
 * VHT action codes
 *
 * Contiguous values 0-2.
 */
enum ieee80211_vht_actioncode {
        WLAN_VHT_ACTION_COMPRESSED_BF = 0,
        WLAN_VHT_ACTION_GROUPID_MGMT = 1,
        WLAN_VHT_ACTION_OPMODE_NOTIF = 2,
};

/*
 * Self Protected Action codes
 *
 * Value 0 is reserved; 1-3 are the mesh peering open/confirm/close codes and
 * 4-5 the MGK inform/ack codes.
 */
enum ieee80211_self_protected_actioncode {
        WLAN_SP_RESERVED = 0,
        WLAN_SP_MESH_PEERING_OPEN = 1,
        WLAN_SP_MESH_PEERING_CONFIRM = 2,
        WLAN_SP_MESH_PEERING_CLOSE = 3,
        WLAN_SP_MGK_INFORM = 4,
        WLAN_SP_MGK_ACK = 5,
};

/*
 * Mesh action codes
 *
 * Contiguous values 0-10. The numbers are spelled out so that inserting or
 * reordering an entry cannot silently renumber the others.
 */
enum ieee80211_mesh_actioncode {
        WLAN_MESH_ACTION_LINK_METRIC_REPORT = 0,
        WLAN_MESH_ACTION_HWMP_PATH_SELECTION = 1,
        WLAN_MESH_ACTION_GATE_ANNOUNCEMENT = 2,
        WLAN_MESH_ACTION_CONGESTION_CONTROL_NOTIFICATION = 3,
        WLAN_MESH_ACTION_MCCA_SETUP_REQUEST = 4,
        WLAN_MESH_ACTION_MCCA_SETUP_REPLY = 5,
        WLAN_MESH_ACTION_MCCA_ADVERTISEMENT_REQUEST = 6,
        WLAN_MESH_ACTION_MCCA_ADVERTISEMENT = 7,
        WLAN_MESH_ACTION_MCCA_TEARDOWN = 8,
        WLAN_MESH_ACTION_TBTT_ADJUSTMENT_REQUEST = 9,
        WLAN_MESH_ACTION_TBTT_ADJUSTMENT_RESPONSE = 10,
};

/*
 * Unprotected WNM action codes
 *
 * Only the TIM (0) and timing measurement response (1) codes are named.
 */
enum ieee80211_unprotected_wnm_actioncode {
        WLAN_UNPROTECTED_WNM_ACTION_TIM = 0,
        WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE = 1,
};

/*
 * Protected EHT action codes
 *
 * The three TTLM codes: request (0), response (1) and teardown (2). TTLM
 * presumably stands for TID-to-link mapping.
 */
enum ieee80211_protected_eht_actioncode {
        WLAN_PROTECTED_EHT_ACTION_TTLM_REQ = 0,
        WLAN_PROTECTED_EHT_ACTION_TTLM_RES = 1,
        WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN = 2,
};

/*
 * Security key length
 *
 * Key lengths per cipher, in octets (e.g. WEP40 = 5 octets = 40 bits).
 * Several ciphers share the same length, so the enumerators are not unique
 * values and cannot be used to identify a cipher.
 */
enum ieee80211_key_len {
        WLAN_KEY_LEN_WEP40 = 5,
        WLAN_KEY_LEN_WEP104 = 13,
        WLAN_KEY_LEN_CCMP = 16,
        WLAN_KEY_LEN_CCMP_256 = 32,
        WLAN_KEY_LEN_TKIP = 32,
        WLAN_KEY_LEN_AES_CMAC = 16,
        WLAN_KEY_LEN_SMS4 = 32,
        WLAN_KEY_LEN_GCMP = 16,
        WLAN_KEY_LEN_GCMP_256 = 32,
        WLAN_KEY_LEN_BIP_CMAC_256 = 32,
        WLAN_KEY_LEN_BIP_GMAC_128 = 16,
        WLAN_KEY_LEN_BIP_GMAC_256 = 32,
};

/*
 * S1G action codes
 *
 * Values 0-9 are contiguous; value 10 has no name here and
 * WLAN_S1G_TWT_INFORMATION is 11. The numbers are spelled out so the gap is
 * visible and entries cannot be renumbered by accident.
 */
enum ieee80211_s1g_actioncode {
        WLAN_S1G_AID_SWITCH_REQUEST = 0,
        WLAN_S1G_AID_SWITCH_RESPONSE = 1,
        WLAN_S1G_SYNC_CONTROL = 2,
        WLAN_S1G_STA_INFO_ANNOUNCE = 3,
        WLAN_S1G_EDCA_PARAM_SET = 4,
        WLAN_S1G_EL_OPERATION = 5,
        WLAN_S1G_TWT_SETUP = 6,
        WLAN_S1G_TWT_TEARDOWN = 7,
        WLAN_S1G_SECT_GROUP_ID_LIST = 8,
        WLAN_S1G_SECT_ID_FEEDBACK = 9,
        /* 10 is skipped */
        WLAN_S1G_TWT_INFORMATION = 11,
};

#define IEEE80211_WEP_IV_LEN                4
#define IEEE80211_WEP_ICV_LEN                4
#define IEEE80211_CCMP_HDR_LEN                8
#define IEEE80211_CCMP_MIC_LEN                8
#define IEEE80211_CCMP_PN_LEN                6
#define IEEE80211_CCMP_256_HDR_LEN        8
#define IEEE80211_CCMP_256_MIC_LEN        16
#define IEEE80211_CCMP_256_PN_LEN        6
#define IEEE80211_TKIP_IV_LEN                8
#define IEEE80211_TKIP_ICV_LEN                4
#define IEEE80211_CMAC_PN_LEN                6
#define IEEE80211_GMAC_PN_LEN                6
#define IEEE80211_GCMP_HDR_LEN                8
#define IEEE80211_GCMP_MIC_LEN                16
#define IEEE80211_GCMP_PN_LEN                6

#define FILS_NONCE_LEN                        16
#define FILS_MAX_KEK_LEN                64

#define FILS_ERP_MAX_USERNAME_LEN        16
#define FILS_ERP_MAX_REALM_LEN                253
#define FILS_ERP_MAX_RRK_LEN                64

#define PMK_MAX_LEN                        64
#define SAE_PASSWORD_MAX_LEN                128

/*
 * Public action codes (IEEE Std 802.11-2016, 9.6.8.1, Table 9-307)
 *
 * Contiguous values 0-34.
 */
enum ieee80211_pub_actioncode {
        WLAN_PUB_ACTION_20_40_BSS_COEX = 0,
        WLAN_PUB_ACTION_DSE_ENABLEMENT = 1,
        WLAN_PUB_ACTION_DSE_DEENABLEMENT = 2,
        WLAN_PUB_ACTION_DSE_REG_LOC_ANN = 3,
        WLAN_PUB_ACTION_EXT_CHANSW_ANN = 4,
        WLAN_PUB_ACTION_DSE_MSMT_REQ = 5,
        WLAN_PUB_ACTION_DSE_MSMT_RESP = 6,
        WLAN_PUB_ACTION_MSMT_PILOT = 7,
        WLAN_PUB_ACTION_DSE_PC = 8,
        WLAN_PUB_ACTION_VENDOR_SPECIFIC = 9,
        WLAN_PUB_ACTION_GAS_INITIAL_REQ = 10,
        WLAN_PUB_ACTION_GAS_INITIAL_RESP = 11,
        WLAN_PUB_ACTION_GAS_COMEBACK_REQ = 12,
        WLAN_PUB_ACTION_GAS_COMEBACK_RESP = 13,
        WLAN_PUB_ACTION_TDLS_DISCOVER_RES = 14,
        WLAN_PUB_ACTION_LOC_TRACK_NOTI = 15,
        WLAN_PUB_ACTION_QAB_REQUEST_FRAME = 16,
        WLAN_PUB_ACTION_QAB_RESPONSE_FRAME = 17,
        WLAN_PUB_ACTION_QMF_POLICY = 18,
        WLAN_PUB_ACTION_QMF_POLICY_CHANGE = 19,
        WLAN_PUB_ACTION_QLOAD_REQUEST = 20,
        WLAN_PUB_ACTION_QLOAD_REPORT = 21,
        WLAN_PUB_ACTION_HCCA_TXOP_ADVERT = 22,
        WLAN_PUB_ACTION_HCCA_TXOP_RESPONSE = 23,
        WLAN_PUB_ACTION_PUBLIC_KEY = 24,
        WLAN_PUB_ACTION_CHANNEL_AVAIL_QUERY = 25,
        WLAN_PUB_ACTION_CHANNEL_SCHEDULE_MGMT = 26,
        WLAN_PUB_ACTION_CONTACT_VERI_SIGNAL = 27,
        WLAN_PUB_ACTION_GDD_ENABLEMENT_REQ = 28,
        WLAN_PUB_ACTION_GDD_ENABLEMENT_RESP = 29,
        WLAN_PUB_ACTION_NETWORK_CHANNEL_CONTROL = 30,
        WLAN_PUB_ACTION_WHITE_SPACE_MAP_ANN = 31,
        WLAN_PUB_ACTION_FTM_REQUEST = 32,
        WLAN_PUB_ACTION_FTM_RESPONSE = 33,
        WLAN_PUB_ACTION_FILS_DISCOVERY = 34,
};

/*
 * TDLS action codes
 *
 * Contiguous values 0-10. Note that the TDLS discovery response is not in
 * this list: it is a public action (WLAN_PUB_ACTION_TDLS_DISCOVER_RES).
 */
enum ieee80211_tdls_actioncode {
        WLAN_TDLS_SETUP_REQUEST = 0,
        WLAN_TDLS_SETUP_RESPONSE = 1,
        WLAN_TDLS_SETUP_CONFIRM = 2,
        WLAN_TDLS_TEARDOWN = 3,
        WLAN_TDLS_PEER_TRAFFIC_INDICATION = 4,
        WLAN_TDLS_CHANNEL_SWITCH_REQUEST = 5,
        WLAN_TDLS_CHANNEL_SWITCH_RESPONSE = 6,
        WLAN_TDLS_PEER_PSM_REQUEST = 7,
        WLAN_TDLS_PEER_PSM_RESPONSE = 8,
        WLAN_TDLS_PEER_TRAFFIC_RESPONSE = 9,
        WLAN_TDLS_DISCOVERY_REQUEST = 10,
};

/*
 * Naming convention for the masks below: WLAN_EXT_CAPA<n>_* is a bit mask
 * within the n-th byte (counting from 1) of the @WLAN_EID_EXT_CAPABILITY
 * information element, as the individual comments spell out.
 */

/* Extended Channel Switching capability to be set in the 1st byte of
 * the @WLAN_EID_EXT_CAPABILITY information element
 */
#define WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING        BIT(2)

/* Multiple BSSID capability is set in the 6th bit of 3rd byte of the
 * @WLAN_EID_EXT_CAPABILITY information element
 */
#define WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT        BIT(6)

/* Timing Measurement protocol for time sync is set in the 7th bit of 3rd byte
 * of the @WLAN_EID_EXT_CAPABILITY information element
 */
#define WLAN_EXT_CAPA3_TIMING_MEASUREMENT_SUPPORT        BIT(7)

/* TDLS capabilities in the 4th byte of @WLAN_EID_EXT_CAPABILITY */
#define WLAN_EXT_CAPA4_TDLS_BUFFER_STA                BIT(4)
#define WLAN_EXT_CAPA4_TDLS_PEER_PSM                BIT(5)
#define WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH                BIT(6)

/* Interworking capabilities are set in 7th bit of 4th byte of the
 * @WLAN_EID_EXT_CAPABILITY information element
 */
#define WLAN_EXT_CAPA4_INTERWORKING_ENABLED        BIT(7)

/*
 * TDLS capabilities to be enabled in the 5th byte of the
 * @WLAN_EID_EXT_CAPABILITY information element
 */
#define WLAN_EXT_CAPA5_TDLS_ENABLED        BIT(5)
#define WLAN_EXT_CAPA5_TDLS_PROHIBITED        BIT(6)
#define WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED        BIT(7)

/*
 * TDLS wide-bandwidth and operating-mode-notification bits; going by the
 * CAPA8 prefix these live in the 8th byte of @WLAN_EID_EXT_CAPABILITY.
 */
#define WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED        BIT(5)
#define WLAN_EXT_CAPA8_OPMODE_NOTIF        BIT(6)

/*
 * Defines the maximal number of MSDUs in an A-MSDU. The field straddles a
 * byte boundary: its LSB is bit 7 of the 8th byte and its MSB is bit 0 of
 * the 9th byte.
 */
#define WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB        BIT(7)
#define WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB        BIT(0)

/*
 * Fine Timing Measurement Initiator - bit 71 of @WLAN_EID_EXT_CAPABILITY
 * information element (i.e. bit 7 of the 9th byte: 8 * 8 + 7 = 71)
 */
#define WLAN_EXT_CAPA9_FTM_INITIATOR        BIT(7)

/* Defines support for TWT Requester and TWT Responder */
#define WLAN_EXT_CAPA10_TWT_REQUESTER_SUPPORT        BIT(5)
#define WLAN_EXT_CAPA10_TWT_RESPONDER_SUPPORT        BIT(6)

/*
 * When set, indicates that the AP is able to tolerate 26-tone RU UL
 * OFDMA transmissions using HE TB PPDU from OBSS (not falsely classify the
 * 26-tone RU UL OFDMA transmissions as radar pulses).
 */
#define WLAN_EXT_CAPA10_OBSS_NARROW_BW_RU_TOLERANCE_SUPPORT BIT(7)

/* Defines support for enhanced multi-BSSID advertisement */
#define WLAN_EXT_CAPA11_EMA_SUPPORT        BIT(3)

/*
 * TDLS specific payload type in the LLC/SNAP header; compared against the
 * octet following the ethertype in ieee80211_get_tdls_action().
 */
#define WLAN_TDLS_SNAP_RFTYPE        0x2

/* BSS Coex IE information field bits */
#define WLAN_BSS_COEX_INFORMATION_REQUEST        BIT(0)

/**
 * enum ieee80211_mesh_sync_method - mesh synchronization method identifier
 *
 * @IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET: the default synchronization method
 *        (identifier value 1)
 * @IEEE80211_SYNC_METHOD_VENDOR: a vendor specific synchronization method
 *        that will be specified in a vendor specific information element
 *        (identifier value 255)
 */
enum ieee80211_mesh_sync_method {
        IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET = 1,
        IEEE80211_SYNC_METHOD_VENDOR = 255,
};

/**
 * enum ieee80211_mesh_path_protocol - mesh path selection protocol identifier
 *
 * @IEEE80211_PATH_PROTOCOL_HWMP: the default path selection protocol
 *        (identifier value 1)
 * @IEEE80211_PATH_PROTOCOL_VENDOR: a vendor specific protocol that will
 *        be specified in a vendor specific information element
 *        (identifier value 255)
 */
enum ieee80211_mesh_path_protocol {
        IEEE80211_PATH_PROTOCOL_HWMP = 1,
        IEEE80211_PATH_PROTOCOL_VENDOR = 255,
};

/**
 * enum ieee80211_mesh_path_metric - mesh path selection metric identifier
 *
 * @IEEE80211_PATH_METRIC_AIRTIME: the default path selection metric
 *        (identifier value 1)
 * @IEEE80211_PATH_METRIC_VENDOR: a vendor specific metric that will be
 *        specified in a vendor specific information element
 *        (identifier value 255)
 */
enum ieee80211_mesh_path_metric {
        IEEE80211_PATH_METRIC_AIRTIME = 1,
        IEEE80211_PATH_METRIC_VENDOR = 255,
};

/**
 * enum ieee80211_root_mode_identifier - root mesh STA mode identifier
 *
 * These attributes are used by dot11MeshHWMPRootMode to set the root mesh
 * STA mode.
 *
 * @IEEE80211_ROOTMODE_NO_ROOT: the mesh STA is not a root mesh STA (default)
 * @IEEE80211_ROOTMODE_ROOT: the mesh STA is a root mesh STA if the mode is
 *        greater than this value
 * @IEEE80211_PROACTIVE_PREQ_NO_PREP: the mesh STA is a root mesh STA that
 *        supports the proactive PREQ with the proactive PREP subfield set
 *        to 0
 * @IEEE80211_PROACTIVE_PREQ_WITH_PREP: the mesh STA is a root mesh STA that
 *        supports the proactive PREQ with the proactive PREP subfield set
 *        to 1
 * @IEEE80211_PROACTIVE_RANN: the mesh STA is a root mesh STA that supports
 *        the proactive RANN
 */
enum ieee80211_root_mode_identifier {
        IEEE80211_ROOTMODE_NO_ROOT = 0,
        IEEE80211_ROOTMODE_ROOT = 1,
        IEEE80211_PROACTIVE_PREQ_NO_PREP = 2,
        IEEE80211_PROACTIVE_PREQ_WITH_PREP = 3,
        IEEE80211_PROACTIVE_RANN = 4,
};

/*
 * IEEE 802.11-2007 7.3.2.9 Country information element
 *
 * Minimum length is 8 octets, i.e. len must be evenly
 * divisible by 2
 */

/* Although the spec says 8 I'm seeing 6 in practice */
#define IEEE80211_COUNTRY_IE_MIN_LEN        6

/* The Country String field of the element shall be 3 octets in length */
#define IEEE80211_COUNTRY_STRING_LEN        3

/*
 * For regulatory extension stuff see IEEE 802.11-2007
 * Annex I (page 1141) and Annex J (page 1147). Also
 * review 7.3.2.9.
 *
 * When dot11RegulatoryClassesRequired is true and the
 * first_channel/reg_extension_id is >= 201 then the IE
 * is comprised of the 'ext' struct represented below:
 *
 *  - Regulatory extension ID - when generating IE this just needs
 *    to be monotonically increasing for each triplet passed in
 *    the IE
 *  - Regulatory class - index into set of rules
 *  - Coverage class - index into air propagation time (Table 7-27),
 *    in microseconds, you can compute the air propagation time from
 *    the index by multiplying by 3, so index 10 yields a propagation
 *    of 30 us. Valid values are 0-31, values 32-255 are not defined
 *    yet. A value of 0 indicates air propagation of <= 1 us.
 *
 *  See also Table I.2 for Emission limit sets and table
 *  I.3 for Behavior limit sets. Table J.1 indicates how to map
 *  a reg_class to an emission limit set and behavior limit set.
 */
#define IEEE80211_COUNTRY_EXTENSION_ID 201

/*
 *  Channel numbers in the IE must be monotonically increasing
 *  if dot11RegulatoryClassesRequired is not true.
 *
 *  If dot11RegulatoryClassesRequired is true consecutive
 *  subband triplets following a regulatory triplet shall
 *  have monotonically increasing first_channel number fields.
 *
 *  Channel numbers shall not overlap.
 *
 *  Note that max_power is signed.
 *
 *  The anonymous union gives two views of the same three octets:
 *  'chans' for a subband triplet and 'ext' for a regulatory extension
 *  triplet. Both views start with a one-octet field (first_channel /
 *  reg_extension_id), so the first octet can be inspected through either
 *  view to decide which one applies (see IEEE80211_COUNTRY_EXTENSION_ID).
 */
struct ieee80211_country_ie_triplet {
        union {
                /* subband triplet */
                struct {
                        u8 first_channel;
                        u8 num_channels;
                        s8 max_power;
                } __packed chans;
                /* regulatory extension triplet */
                struct {
                        u8 reg_extension_id;
                        u8 reg_class;
                        u8 coverage_class;
                } __packed ext;
        };
} __packed;

/*
 * Timeout Interval element types; this is the value space of the 'type'
 * member of struct ieee80211_timeout_interval_ie. The trailing comments
 * name the 802.11 amendment each type comes from.
 */
enum ieee80211_timeout_interval_type {
        WLAN_TIMEOUT_REASSOC_DEADLINE = 1 /* 802.11r */,
        WLAN_TIMEOUT_KEY_LIFETIME = 2 /* 802.11r */,
        WLAN_TIMEOUT_ASSOC_COMEBACK = 3 /* 802.11w */,
};

/**
 * struct ieee80211_timeout_interval_ie - Timeout Interval element
 * @type: type, see &enum ieee80211_timeout_interval_type
 * @value: timeout interval value, stored little-endian (units depend on
 *        @type and are not defined here)
 *
 * The structure is packed, so @value directly follows the one-octet @type
 * without padding and may be unaligned.
 */
struct ieee80211_timeout_interval_ie {
        u8 type;
        __le32 value;
} __packed;

/**
 * enum ieee80211_idle_options - BSS idle options
 * @WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE: the station should send an RSN
 *        protected frame to the AP to reset the idle timer at the AP for
 *        the station.
 *
 * These are bit flags (currently only bit 0) for the idle_options member of
 * &struct ieee80211_bss_max_idle_period_ie.
 */
enum ieee80211_idle_options {
        WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE = BIT(0),
};

/**
 * struct ieee80211_bss_max_idle_period_ie - BSS max idle period element struct
 *
 * This structure refers to "BSS Max idle period element"
 *
 * @max_idle_period: indicates the time period during which a station can
 *        refrain from transmitting frames to its associated AP without being
 *        disassociated. In units of 1000 TUs. Stored little-endian.
 * @idle_options: indicates the options associated with the BSS idle capability
 *        as specified in &enum ieee80211_idle_options.
 *
 * Packed: the layout is exactly the two members back to back.
 */
struct ieee80211_bss_max_idle_period_ie {
        __le16 max_idle_period;
        u8 idle_options;
} __packed;

/*
 * BACK (block-ack) action codes: add a block-ack agreement (request and
 * response) and delete one.
 */
enum ieee80211_back_actioncode {
        WLAN_ACTION_ADDBA_REQ = 0,
        WLAN_ACTION_ADDBA_RESP = 1,
        WLAN_ACTION_DELBA = 2,
};

/*
 * BACK (block-ack) parties: identifies which side of a block-ack agreement
 * is meant, the recipient (0) or the initiator (1).
 */
enum ieee80211_back_parties {
        WLAN_BACK_RECIPIENT = 0,
        WLAN_BACK_INITIATOR = 1,
};

/* SA Query action codes: request (0) and response (1) */
enum ieee80211_sa_query_action {
        WLAN_ACTION_SA_QUERY_REQUEST = 0,
        WLAN_ACTION_SA_QUERY_RESPONSE = 1,
};

/**
 * struct ieee80211_bssid_index - multiple BSSID index element structure
 *
 * This structure refers to "Multiple BSSID-index element"
 *
 * @bssid_index: BSSID index
 * @dtim_period: optional, overrides transmitted BSS dtim period
 * @dtim_count: optional, overrides transmitted BSS dtim count
 *
 * Because @dtim_period and @dtim_count are optional, a received element may
 * be shorter than sizeof(struct ieee80211_bssid_index); check the element
 * length before reading them.
 */
struct ieee80211_bssid_index {
        u8 bssid_index;
        u8 dtim_period;
        u8 dtim_count;
};

/**
 * struct ieee80211_multiple_bssid_configuration - multiple BSSID configuration
 *        element structure
 *
 * This structure refers to "Multiple BSSID Configuration element"
 *
 * @bssid_count: total number of active BSSIDs in the set
 * @profile_periodicity: the minimum number of beacon frames that need to be
 *        received in order to discover all the nontransmitted BSSIDs in the
 *        set.
 */
struct ieee80211_multiple_bssid_configuration {
        u8 bssid_count;
        u8 profile_periodicity;
};

/*
 * Build a suite selector from an OUI and a suite type: the OUI is shifted
 * up by one octet and @id is OR-ed into the lowest octet. Both arguments
 * are parenthesised, so expressions are safe to pass.
 */
#define SUITE(oui, id)        (((oui) << 8) | (id))

/* cipher suite selectors */
#define WLAN_CIPHER_SUITE_USE_GROUP        SUITE(0x000FAC, 0)
#define WLAN_CIPHER_SUITE_WEP40                SUITE(0x000FAC, 1)
#define WLAN_CIPHER_SUITE_TKIP                SUITE(0x000FAC, 2)
/* reserved:                                 SUITE(0x000FAC, 3) */
#define WLAN_CIPHER_SUITE_CCMP                SUITE(0x000FAC, 4)
#define WLAN_CIPHER_SUITE_WEP104        SUITE(0x000FAC, 5)
#define WLAN_CIPHER_SUITE_AES_CMAC        SUITE(0x000FAC, 6)
/* note: no selector with type 7 is defined here */
#define WLAN_CIPHER_SUITE_GCMP                SUITE(0x000FAC, 8)
#define WLAN_CIPHER_SUITE_GCMP_256        SUITE(0x000FAC, 9)
#define WLAN_CIPHER_SUITE_CCMP_256        SUITE(0x000FAC, 10)
#define WLAN_CIPHER_SUITE_BIP_GMAC_128        SUITE(0x000FAC, 11)
#define WLAN_CIPHER_SUITE_BIP_GMAC_256        SUITE(0x000FAC, 12)
#define WLAN_CIPHER_SUITE_BIP_CMAC_256        SUITE(0x000FAC, 13)

/* cipher suite under a different OUI (0x001472) */
#define WLAN_CIPHER_SUITE_SMS4                SUITE(0x001472, 1)

/* AKM suite selectors */
#define WLAN_AKM_SUITE_8021X                        SUITE(0x000FAC, 1)
#define WLAN_AKM_SUITE_PSK                        SUITE(0x000FAC, 2)
#define WLAN_AKM_SUITE_FT_8021X                        SUITE(0x000FAC, 3)
#define WLAN_AKM_SUITE_FT_PSK                        SUITE(0x000FAC, 4)
#define WLAN_AKM_SUITE_8021X_SHA256                SUITE(0x000FAC, 5)
#define WLAN_AKM_SUITE_PSK_SHA256                SUITE(0x000FAC, 6)
#define WLAN_AKM_SUITE_TDLS                        SUITE(0x000FAC, 7)
#define WLAN_AKM_SUITE_SAE                        SUITE(0x000FAC, 8)
#define WLAN_AKM_SUITE_FT_OVER_SAE                SUITE(0x000FAC, 9)
#define WLAN_AKM_SUITE_AP_PEER_KEY                SUITE(0x000FAC, 10)
#define WLAN_AKM_SUITE_8021X_SUITE_B                SUITE(0x000FAC, 11)
#define WLAN_AKM_SUITE_8021X_SUITE_B_192        SUITE(0x000FAC, 12)
#define WLAN_AKM_SUITE_FT_8021X_SHA384                SUITE(0x000FAC, 13)
#define WLAN_AKM_SUITE_FILS_SHA256                SUITE(0x000FAC, 14)
#define WLAN_AKM_SUITE_FILS_SHA384                SUITE(0x000FAC, 15)
#define WLAN_AKM_SUITE_FT_FILS_SHA256                SUITE(0x000FAC, 16)
#define WLAN_AKM_SUITE_FT_FILS_SHA384                SUITE(0x000FAC, 17)
#define WLAN_AKM_SUITE_OWE                        SUITE(0x000FAC, 18)
#define WLAN_AKM_SUITE_FT_PSK_SHA384                SUITE(0x000FAC, 19)
#define WLAN_AKM_SUITE_PSK_SHA384                SUITE(0x000FAC, 20)

/*
 * WLAN_OUI_WFA is defined further down in this file; that is fine because
 * the macro body is only expanded where WLAN_AKM_SUITE_WFA_DPP is used.
 */
#define WLAN_AKM_SUITE_WFA_DPP                        SUITE(WLAN_OUI_WFA, 2)

/* Largest key length handled (presumably in octets -- TODO confirm) */
#define WLAN_MAX_KEY_LEN                32

/* PMK-related lengths (presumably in octets) */
#define WLAN_PMK_NAME_LEN                16
#define WLAN_PMKID_LEN                        16
#define WLAN_PMK_LEN_EAP_LEAP                16
#define WLAN_PMK_LEN                        32
#define WLAN_PMK_LEN_SUITE_B_192        48

/*
 * Organizationally unique identifiers (24-bit values) and the vendor
 * element type numbers used under each of them.
 */
#define WLAN_OUI_WFA                        0x506f9a
#define WLAN_OUI_TYPE_WFA_P2P                9
#define WLAN_OUI_TYPE_WFA_DPP                0x1A
#define WLAN_OUI_MICROSOFT                0x0050f2
#define WLAN_OUI_TYPE_MICROSOFT_WPA        1
#define WLAN_OUI_TYPE_MICROSOFT_WMM        2
#define WLAN_OUI_TYPE_MICROSOFT_WPS        4
#define WLAN_OUI_TYPE_MICROSOFT_TPC        8

/*
 * WMM/802.11e Tspec Element
 *
 * NOTE(review): the mask is given unshifted, so these are presumably
 * applied as (tsinfo >> SHIFT) & MASK -- confirm against the users.
 */
#define IEEE80211_WMM_IE_TSPEC_TID_MASK                0x0F
#define IEEE80211_WMM_IE_TSPEC_TID_SHIFT        1

/*
 * TSPEC status codes: admission accepted (0) or ADDTS rejected because of
 * invalid parameters (1).
 */
enum ieee80211_tspec_status_code {
        IEEE80211_TSPEC_STATUS_ADMISS_ACCEPTED = 0,
        IEEE80211_TSPEC_STATUS_ADDTS_INVAL_PARAMS = 0x1,
};

/*
 * WMM TSPEC element as laid out on the wire, including the element header
 * (element_id, len) and the vendor header (oui, oui_type, oui_subtype,
 * version). Packed, so the multi-octet little-endian members may be
 * unaligned; access them with the appropriate unaligned/endian helpers.
 */
struct ieee80211_tspec_ie {
        u8 element_id;
        u8 len;
        u8 oui[3];
        u8 oui_type;
        u8 oui_subtype;
        u8 version;
        /* TS info; see IEEE80211_WMM_IE_TSPEC_TID_MASK / _SHIFT */
        __le16 tsinfo;
        u8 tsinfo_resvd;
        __le16 nominal_msdu;
        __le16 max_msdu;
        __le32 min_service_int;
        __le32 max_service_int;
        __le32 inactivity_int;
        __le32 suspension_int;
        __le32 service_start_time;
        __le32 min_data_rate;
        __le32 mean_data_rate;
        __le32 peak_data_rate;
        __le32 max_burst_size;
        __le32 delay_bound;
        __le32 min_phy_rate;
        __le16 sba;
        __le16 medium_time;
} __packed;

/*
 * HE 6 GHz band capabilities: a single little-endian 16-bit bitfield whose
 * sub-fields are selected with the IEEE80211_HE_6GHZ_CAP_* masks.
 */
struct ieee80211_he_6ghz_capa {
        /* uses IEEE80211_HE_6GHZ_CAP_* below */
        __le16 capa;
} __packed;

/*
 * HE 6 GHz band capabilities
 *
 * Each value is a mask over the (CPU-order) contents of
 * struct ieee80211_he_6ghz_capa.capa; multi-bit masks select a sub-field
 * whose value space is named in the comment above the mask.
 */
/* uses enum ieee80211_min_mpdu_spacing values */
#define IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START        0x0007
/* uses enum ieee80211_vht_max_ampdu_length_exp values */
#define IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP        0x0038
/* uses IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_* values */
#define IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN        0x00c0
/* WLAN_HT_CAP_SM_PS_* values */
#define IEEE80211_HE_6GHZ_CAP_SM_PS                0x0600
/* single-bit flags */
#define IEEE80211_HE_6GHZ_CAP_RD_RESPONDER        0x0800
#define IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS        0x1000
#define IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS        0x2000

/**
 * ieee80211_get_qos_ctl - get pointer to qos control bytes
 * @hdr: the frame
 * Return: a pointer to the QoS control field in the frame header
 *
 * The qos ctrl bytes come after the frame_control, duration, seq_num
 * and 3 or 4 addresses of length ETH_ALEN. Checks frame_control to choose
 * between struct ieee80211_qos_hdr_4addr and struct ieee80211_qos_hdr.
 */
static inline u8 *ieee80211_get_qos_ctl(struct ieee80211_hdr *hdr)
{
        union {
                struct ieee80211_qos_hdr        addr3;
                struct ieee80211_qos_hdr_4addr        addr4;
        } *qos;

        qos = (void *)hdr;
        if (ieee80211_has_a4(qos->addr3.frame_control))
                return (u8 *)&qos->addr4.qos_ctrl;
        else
                return (u8 *)&qos->addr3.qos_ctrl;
}

/**
 * ieee80211_get_tid - get qos TID
 * @hdr: the frame
 * Return: the TID from the QoS control field
 */
static inline u8 ieee80211_get_tid(struct ieee80211_hdr *hdr)
{
        u8 *qc = ieee80211_get_qos_ctl(hdr);

        return qc[0] & IEEE80211_QOS_CTL_TID_MASK;
}

/**
 * ieee80211_get_SA - get pointer to SA
 * @hdr: the frame
 * Return: a pointer to the source address (SA)
 *
 * Selects the address field that holds the source address: addr4 for
 * four-address frames, addr3 for frames with From DS set, addr2 otherwise.
 * Only the frame control field is read; the caller must make sure the
 * header is long enough to actually contain the returned address.
 */
static inline u8 *ieee80211_get_SA(struct ieee80211_hdr *hdr)
{
        u8 *sa;

        if (ieee80211_has_a4(hdr->frame_control))
                sa = hdr->addr4;
        else if (ieee80211_has_fromds(hdr->frame_control))
                sa = hdr->addr3;
        else
                sa = hdr->addr2;

        return sa;
}

/**
 * ieee80211_get_DA - get pointer to DA
 * @hdr: the frame
 * Return: a pointer to the destination address (DA)
 *
 * Selects the address field that holds the destination address: addr3 when
 * To DS is set, addr1 otherwise. Only the frame control field is read; the
 * caller must make sure the header is long enough to actually contain the
 * returned address.
 */
static inline u8 *ieee80211_get_DA(struct ieee80211_hdr *hdr)
{
        return ieee80211_has_tods(hdr->frame_control) ? hdr->addr3 :
                                                        hdr->addr1;
}

/**
 * ieee80211_is_bufferable_mmpdu - check if frame is bufferable MMPDU
 * @skb: the skb to check, starting with the 802.11 header
 * Return: whether or not the MMPDU is bufferable
 *
 * Follows the IEEE 802.11 REVme D2.0 definition of a bufferable MMPDU,
 * ignoring the IBSS special case: disassociation, deauthentication and
 * action frames are bufferable, except FTM request/response action frames
 * in the Public or Protected Dual of Public Action categories.
 */
static inline bool ieee80211_is_bufferable_mmpdu(struct sk_buff *skb)
{
        struct ieee80211_mgmt *frame = (void *)skb->data;
        __le16 fctl = frame->frame_control;

        /* only management frames qualify at all */
        if (!ieee80211_is_mgmt(fctl))
                return false;

        if (ieee80211_is_disassoc(fctl) || ieee80211_is_deauth(fctl))
                return true;

        /* of the remaining management subtypes only action frames count */
        if (!ieee80211_is_action(fctl))
                return false;

        /* too short to carry an action code: cannot be the FTM exception */
        if (skb->len < offsetofend(typeof(*frame), u.action.u.ftm.action_code))
                return true;

        /* the FTM exception only exists in these two categories */
        if (!(frame->u.action.category == WLAN_CATEGORY_PUBLIC ||
              frame->u.action.category == WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION))
                return true;

        return !(frame->u.action.u.ftm.action_code == WLAN_PUB_ACTION_FTM_REQUEST ||
                 frame->u.action.u.ftm.action_code == WLAN_PUB_ACTION_FTM_RESPONSE);
}

/**
 * _ieee80211_is_robust_mgmt_frame - check if frame is a robust management frame
 * @hdr: the frame (buffer must include at least the first octet of payload)
 * Return: whether or not the frame is a robust management frame
 *
 * Disassociation and deauthentication frames are always robust. Action
 * frames are robust unless their category (the octet at offset 24, i.e.
 * the first payload octet) is one of the excluded categories listed below.
 * No length checking is done here; see ieee80211_is_robust_mgmt_frame().
 */
static inline bool _ieee80211_is_robust_mgmt_frame(struct ieee80211_hdr *hdr)
{
        u8 *cat;

        if (ieee80211_is_disassoc(hdr->frame_control) ||
            ieee80211_is_deauth(hdr->frame_control))
                return true;

        if (!ieee80211_is_action(hdr->frame_control))
                return false;

        /*
         * For a Protected frame the category octet may be encrypted, and
         * the sender has already classified the frame as a Robust
         * Management Frame, so do not look at the payload.
         */
        if (ieee80211_has_protected(hdr->frame_control))
                return true;

        cat = ((u8 *) hdr) + 24;

        /* robust unless the category is one of the non-robust ones */
        return !(*cat == WLAN_CATEGORY_PUBLIC ||
                 *cat == WLAN_CATEGORY_HT ||
                 *cat == WLAN_CATEGORY_WNM_UNPROTECTED ||
                 *cat == WLAN_CATEGORY_SELF_PROTECTED ||
                 *cat == WLAN_CATEGORY_UNPROT_DMG ||
                 *cat == WLAN_CATEGORY_VHT ||
                 *cat == WLAN_CATEGORY_S1G ||
                 *cat == WLAN_CATEGORY_VENDOR_SPECIFIC);
}

/**
 * ieee80211_is_robust_mgmt_frame - check if skb contains a robust mgmt frame
 * @skb: the skb containing the frame, length will be checked
 * Return: whether or not the frame is a robust management frame
 *
 * Frames shorter than IEEE80211_MIN_ACTION_SIZE are never considered
 * robust; longer ones are classified by _ieee80211_is_robust_mgmt_frame().
 */
static inline bool ieee80211_is_robust_mgmt_frame(struct sk_buff *skb)
{
        return skb->len >= IEEE80211_MIN_ACTION_SIZE &&
               _ieee80211_is_robust_mgmt_frame((void *)skb->data);
}

/**
 * ieee80211_is_public_action - check if frame is a public action frame
 * @hdr: the frame
 * @len: length of the frame
 * Return: %true only if the frame is at least IEEE80211_MIN_ACTION_SIZE
 *        long, is an action frame, and its category is
 *        WLAN_CATEGORY_PUBLIC
 */
static inline bool ieee80211_is_public_action(struct ieee80211_hdr *hdr,
                                              size_t len)
{
        struct ieee80211_mgmt *frame = (void *)hdr;

        /* evaluated left to right: length, then frame type, then category */
        return len >= IEEE80211_MIN_ACTION_SIZE &&
               ieee80211_is_action(hdr->frame_control) &&
               frame->u.action.category == WLAN_CATEGORY_PUBLIC;
}

/**
 * ieee80211_is_protected_dual_of_public_action - check if skb contains a
 * protected dual of public action management frame
 * @skb: the skb containing the frame, length will be checked
 *
 * Return: true if the skb contains a protected dual of public action
 * management frame, false otherwise.
 */
static inline bool
ieee80211_is_protected_dual_of_public_action(struct sk_buff *skb)
{
        u8 action;

        if (!ieee80211_is_public_action((void *)skb->data, skb->len) ||
            skb->len < IEEE80211_MIN_ACTION_SIZE + 1)
                return false;

        action = *(u8 *)(skb->data + IEEE80211_MIN_ACTION_SIZE);

        return action != WLAN_PUB_ACTION_20_40_BSS_COEX &&
                action != WLAN_PUB_ACTION_DSE_REG_LOC_ANN &&
                action != WLAN_PUB_ACTION_MSMT_PILOT &&
                action != WLAN_PUB_ACTION_TDLS_DISCOVER_RES &&
                action != WLAN_PUB_ACTION_LOC_TRACK_NOTI &&
                action != WLAN_PUB_ACTION_FTM_REQUEST &&
                action != WLAN_PUB_ACTION_FTM_RESPONSE &&
                action != WLAN_PUB_ACTION_FILS_DISCOVERY &&
                action != WLAN_PUB_ACTION_VENDOR_SPECIFIC;
}

/**
 * _ieee80211_is_group_privacy_action - check if frame is a group addressed
 *        privacy action frame
 * @hdr: the frame
 * Return: whether or not the frame is a group addressed privacy action frame
 */
static inline bool _ieee80211_is_group_privacy_action(struct ieee80211_hdr *hdr)
{
        struct ieee80211_mgmt *mgmt = (void *)hdr;

        if (!ieee80211_is_action(hdr->frame_control) ||
            !is_multicast_ether_addr(hdr->addr1))
                return false;

        return mgmt->u.action.category == WLAN_CATEGORY_MESH_ACTION ||
               mgmt->u.action.category == WLAN_CATEGORY_MULTIHOP_ACTION;
}

/**
 * ieee80211_is_group_privacy_action - check if frame is a group addressed
 *        privacy action frame
 * @skb: the skb containing the frame, length will be checked
 * Return: whether or not the frame is a group addressed privacy action frame
 *
 * Frames shorter than IEEE80211_MIN_ACTION_SIZE never match; longer ones
 * are classified by _ieee80211_is_group_privacy_action().
 */
static inline bool ieee80211_is_group_privacy_action(struct sk_buff *skb)
{
        return skb->len >= IEEE80211_MIN_ACTION_SIZE &&
               _ieee80211_is_group_privacy_action((void *)skb->data);
}

/**
 * ieee80211_tu_to_usec - convert time units (TU) to microseconds
 * @tu: the TUs
 * Return: @tu multiplied by 1024 (one TU is 1024 microseconds); the result
 *        wraps modulo the width of unsigned long for very large inputs
 */
static inline unsigned long ieee80211_tu_to_usec(unsigned long tu)
{
        /* 1024 == 1 << 10 */
        return tu << 10;
}

/**
 * ieee80211_check_tim - check if AID bit is set in TIM
 * @tim: the TIM IE
 * @tim_len: length of the TIM IE
 * @aid: the AID to look for
 * Return: whether or not traffic is indicated in the TIM for the given AID
 *
 * Only the low 14 bits of @aid are used. Returns %false if @tim is NULL,
 * if @tim_len is smaller than the fixed part of the element, or if the
 * octet holding the AID's bit lies outside the partial virtual bitmap that
 * is actually present in the element.
 */
static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim,
                                       u8 tim_len, u16 aid)
{
        /*
         * Plain int on purpose: aid / 8 can be as large as 2047 and
         * tim_len + first - 4 as large as 505, neither of which fits into
         * a u8. Truncating them would make out-of-range AIDs alias onto
         * octets that are present (false positives) or make valid octets
         * look absent (false negatives).
         */
        int octet, first, last;
        u8 mask;

        if (unlikely(!tim || tim_len < sizeof(*tim)))
                return false;

        aid &= 0x3fff;
        octet = aid / 8;
        mask  = 1 << (aid & 7);

        /* bitmap offset: number of leading all-zero octets that were omitted */
        first = tim->bitmap_ctrl & 0xfe;
        /*
         * last bitmap octet present in the element (presumably three fixed
         * octets precede the bitmap, hence tim_len - 4 as the last offset)
         */
        last = tim_len + first - 4;

        if (octet < first || octet > last)
                return false;

        return !!(tim->virtual_map[octet - first] & mask);
}

/**
 * ieee80211_get_tdls_action - get TDLS action code
 * @skb: the skb containing the frame, length will not be checked
 * Return: the TDLS action code, or -1 if it's not an encapsulated TDLS action
 *        frame
 *
 * This function assumes the frame is a data frame, and that the network header
 * is in the correct place. Non-linear skbs and skbs that do not extend more
 * than two octets past the network header offset always yield -1.
 */
static inline int ieee80211_get_tdls_action(struct sk_buff *skb)
{
        const u8 *p;

        if (skb_is_nonlinear(skb))
                return -1;

        if (skb->len <= (skb_network_offset(skb) + 2))
                return -1;

        /* the ethertype sits in the two octets before the network header */
        p = skb_network_header(skb) - 2;

        if (get_unaligned_be16(p) != ETH_P_TDLS)
                return -1;

        /* payload type and category must both identify TDLS */
        if (p[2] != WLAN_TDLS_SNAP_RFTYPE)
                return -1;

        if (p[3] != WLAN_CATEGORY_TDLS)
                return -1;

        return p[4];
}

/*
 * convert time units
 *
 * TU_TO_JIFFIES() first scales TUs to microseconds (x * 1024, cf.
 * ieee80211_tu_to_usec()) and then converts to jiffies.
 * TU_TO_EXP_TIME() yields the jiffies value that lies @x TUs after the
 * current value of 'jiffies'.
 */
#define TU_TO_JIFFIES(x)        (usecs_to_jiffies((x) * 1024))
#define TU_TO_EXP_TIME(x)        (jiffies + TU_TO_JIFFIES(x))

/*
 * convert frequencies
 *
 * MHZ_TO_KHZ()/KHZ_TO_MHZ() scale by 1000; KHZ_TO_MHZ() truncates.
 *
 * PR_KHZ(f) expands to TWO comma-separated arguments, the whole-MHz part
 * and the remaining kHz, meant to be consumed by the KHZ_F format string:
 *
 *        printf("freq " KHZ_F " MHz\n", PR_KHZ(freq_khz));
 *
 * @f is evaluated twice, so it must not have side effects. It is
 * parenthesised in both places so that expressions such as
 * PR_KHZ(base + offset) are split correctly.
 */
#define MHZ_TO_KHZ(freq) ((freq) * 1000)
#define KHZ_TO_MHZ(freq) ((freq) / 1000)
#define PR_KHZ(f) KHZ_TO_MHZ(f), (f) % 1000
#define KHZ_F "%d.%03d"

/*
 * convert powers
 *
 * The "M" variants are hundredths of the base unit (factor 100). The
 * *_TO_DB* directions use integer division and therefore truncate.
 */
#define DBI_TO_MBI(gain) ((gain) * 100)
#define MBI_TO_DBI(gain) ((gain) / 100)
#define DBM_TO_MBM(gain) ((gain) * 100)
#define MBM_TO_DBM(gain) ((gain) / 100)

/**
 * ieee80211_action_contains_tpc - checks if the frame contains TPC element
 * @skb: the skb containing the frame, length will be checked
 * Return: %true if the frame contains a TPC element, %false otherwise
 *
 * This function checks if it's either TPC report action frame or Link
 * Measurement report action frame as defined in IEEE Std. 802.11-2012 8.5.2.5
 * and 8.5.7.5 accordingly.
 *
 * The spectrum management tpc_report layout is used for both frame types,
 * since the part inspected here is identical in both of them.
 */
static inline bool ieee80211_action_contains_tpc(struct sk_buff *skb)
{
        struct ieee80211_mgmt *frame = (void *)skb->data;

        return ieee80211_is_action(frame->frame_control) &&
               /* long enough to hold the whole TPC report */
               skb->len >= IEEE80211_MIN_ACTION_SIZE +
                           sizeof(frame->u.action.u.tpc_report) &&
               /* category: Spectrum Management or Radio Measurement */
               (frame->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT ||
                frame->u.action.category == WLAN_CATEGORY_RADIO_MEASUREMENT) &&
               /* both categories use the same action code for the report */
               frame->u.action.u.tpc_report.action_code ==
                       WLAN_ACTION_SPCT_TPC_RPRT &&
               /* embedded element must be a TPC report of the right size */
               frame->u.action.u.tpc_report.tpc_elem_id ==
                       WLAN_EID_TPC_REPORT &&
               frame->u.action.u.tpc_report.tpc_elem_length ==
                       sizeof(struct ieee80211_tpc_report_ie);
}

/**
 * ieee80211_is_timing_measurement - check if frame is timing measurement response
 * @skb: the SKB to check
 * Return: whether or not the frame is a valid timing measurement response
 *
 * A frame qualifies if it is an action frame of category
 * WLAN_CATEGORY_WNM_UNPROTECTED, is long enough to hold the complete
 * wnm_timing_msr body, and carries the timing measurement response action
 * code.
 */
static inline bool ieee80211_is_timing_measurement(struct sk_buff *skb)
{
        struct ieee80211_mgmt *mgmt = (void *)skb->data;

        if (skb->len < IEEE80211_MIN_ACTION_SIZE)
                return false;

        if (!ieee80211_is_action(mgmt->frame_control))
                return false;

        if (mgmt->u.action.category != WLAN_CATEGORY_WNM_UNPROTECTED)
                return false;

        /*
         * Validate the length before touching the action code: the check
         * above does not guarantee that the action code octet is part of
         * the frame, so reading it first could look past the frame data.
         */
        if (skb->len < offsetofend(typeof(*mgmt), u.action.u.wnm_timing_msr))
                return false;

        return mgmt->u.action.u.wnm_timing_msr.action_code ==
                WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE;
}

/**
 * ieee80211_is_ftm - check if frame is FTM response
 * @skb: the SKB to check
 * Return: whether or not the frame is a valid FTM response action frame
 *
 * A frame qualifies if it is a public action frame, is long enough to hold
 * the complete ftm body, and carries WLAN_PUB_ACTION_FTM_RESPONSE as its
 * action code.
 */
static inline bool ieee80211_is_ftm(struct sk_buff *skb)
{
        struct ieee80211_mgmt *mgmt = (void *)skb->data;

        if (!ieee80211_is_public_action((void *)mgmt, skb->len))
                return false;

        /*
         * Validate the length before touching the action code: the public
         * action check only covers the frame up to the category octet, so
         * reading the action code first could look past the frame data.
         */
        if (skb->len < offsetofend(typeof(*mgmt), u.action.u.ftm))
                return false;

        return mgmt->u.action.u.ftm.action_code ==
                WLAN_PUB_ACTION_FTM_RESPONSE;
}

/*
 * Generic information element header as found in a frame: a one-octet ID,
 * a one-octet length, then @datalen octets of payload. @data is a flexible
 * array member, so sizeof(struct element) covers only the two header octets.
 */
struct element {
        u8 id;
        u8 datalen;
        u8 data[];
} __packed;

/*
 * element iteration helpers
 *
 * for_each_element() walks the elements in the buffer [_data, _data +
 * _datalen). The loop continues only while (a) a complete element header
 * and (b) the complete element (header plus _elem->datalen octets) still
 * fit into the remaining data; it then advances to the octet following the
 * current element's payload. A truncated trailing element therefore ends
 * the walk silently; use for_each_element_completed() to detect that.
 *
 * The _id/_extid variants append an 'if' to the loop header, so the
 * statement that follows the macro is the body of that 'if'. Do not attach
 * an 'else' to it. The extid variant matches elements with ID
 * WLAN_EID_EXTENSION whose first payload octet equals @extid.
 *
 * The for_each_subelement*() variants iterate over the payload of an
 * already validated element.
 *
 * All arguments are evaluated multiple times; pass side-effect free
 * expressions only.
 */
#define for_each_element(_elem, _data, _datalen)                        \
        for (_elem = (const struct element *)(_data);                        \
             (const u8 *)(_data) + (_datalen) - (const u8 *)_elem >=        \
                (int)sizeof(*_elem) &&                                        \
             (const u8 *)(_data) + (_datalen) - (const u8 *)_elem >=        \
                (int)sizeof(*_elem) + _elem->datalen;                        \
             _elem = (const struct element *)(_elem->data + _elem->datalen))

#define for_each_element_id(element, _id, data, datalen)                \
        for_each_element(element, data, datalen)                        \
                if (element->id == (_id))

#define for_each_element_extid(element, extid, _data, _datalen)                \
        for_each_element(element, _data, _datalen)                        \
                if (element->id == WLAN_EID_EXTENSION &&                \
                    element->datalen > 0 &&                                \
                    element->data[0] == (extid))

#define for_each_subelement(sub, element)                                \
        for_each_element(sub, (element)->data, (element)->datalen)

#define for_each_subelement_id(sub, id, element)                        \
        for_each_element_id(sub, id, (element)->data, (element)->datalen)

#define for_each_subelement_extid(sub, extid, element)                        \
        for_each_element_extid(sub, extid, (element)->data, (element)->datalen)

/**
 * for_each_element_completed - determine if element parsing consumed all data
 * @element: element pointer after for_each_element() or friends
 * @data: same data pointer as passed to for_each_element() or friends
 * @datalen: same data length as passed to for_each_element() or friends
 * Return: %true if all elements were iterated, %false otherwise; see notes
 *
 * The walk is complete exactly when the iterator ended up on the first
 * octet past the buffer. That is not the case if the loop was left early
 * (so only use this after a loop that cannot be broken out of), nor if the
 * data was malformed, because then the last parsed element does not fill
 * the whole remaining data.
 */
static inline bool for_each_element_completed(const struct element *element,
                                              const void *data, size_t datalen)
{
        const u8 *end = (const u8 *)data + datalen;

        return (const u8 *)element == end;
}

/*
 * RSNX Capabilities:
 * bits 0-3: Field length (n-1)
 * bits 4-5: the capability flags defined below
 */
#define WLAN_RSNX_CAPA_PROTECTED_TWT BIT(4)
#define WLAN_RSNX_CAPA_SAE_H2E BIT(5)

/*
 * reduced neighbor report, based on Draft P802.11ax_D6.1,
 * section 9.4.2.170 and accepted contributions.
 */
/*
 * Bit masks within one octet; together they cover all eight bits.
 * NOTE(review): presumably applied to the tbtt_info_hdr field of
 * struct ieee80211_neighbor_ap_info - confirm against the RNR parser.
 */
#define IEEE80211_AP_INFO_TBTT_HDR_TYPE                                0x03
#define IEEE80211_AP_INFO_TBTT_HDR_FILTERED                        0x04
#define IEEE80211_AP_INFO_TBTT_HDR_COLOC                        0x08
#define IEEE80211_AP_INFO_TBTT_HDR_COUNT                        0xF0
/* NOTE(review): presumably the values of the ..._TBTT_HDR_TYPE bits - confirm */
#define IEEE80211_TBTT_INFO_TYPE_TBTT                                0
#define IEEE80211_TBTT_INFO_TYPE_MLD                                1

/*
 * Single-bit flags within one octet.
 * NOTE(review): presumably applied to the bss_params field of the TBTT
 * information structures - confirm.
 */
#define IEEE80211_RNR_TBTT_PARAMS_OCT_RECOMMENDED                0x01
#define IEEE80211_RNR_TBTT_PARAMS_SAME_SSID                        0x02
#define IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID                        0x04
#define IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID                0x08
#define IEEE80211_RNR_TBTT_PARAMS_COLOC_ESS                        0x10
#define IEEE80211_RNR_TBTT_PARAMS_PROBE_ACTIVE                        0x20
#define IEEE80211_RNR_TBTT_PARAMS_COLOC_AP                        0x40

/*
 * The two extremes of a signed 8-bit value.
 * NOTE(review): presumably special values of the s8 psd_20 field - confirm.
 */
#define IEEE80211_RNR_TBTT_PARAMS_PSD_NO_LIMIT                        127
#define IEEE80211_RNR_TBTT_PARAMS_PSD_RESERVED                        -128

/*
 * Four single-octet fields, packed so that the layout has no padding.
 * NOTE(review): presumably the fixed header of one neighbor AP information
 * entry, with tbtt_info_hdr decoded through the
 * IEEE80211_AP_INFO_TBTT_HDR_* masks - confirm against the RNR parser.
 */
struct ieee80211_neighbor_ap_info {
        u8 tbtt_info_hdr;
        u8 tbtt_info_len;
        u8 op_class;
        u8 channel;
} __packed;

/*
 * Enumerators take the values 0 to 3 in declaration order.
 * NOTE(review): the names suggest an encoding of a "max total LTF" ranging
 * parameter (4, 8, 16 or unspecified) - confirm against the users.
 */
enum ieee80211_range_params_max_total_ltf {
        IEEE80211_RANGE_PARAMS_MAX_TOTAL_LTF_4 = 0,
        IEEE80211_RANGE_PARAMS_MAX_TOTAL_LTF_8,
        IEEE80211_RANGE_PARAMS_MAX_TOTAL_LTF_16,
        IEEE80211_RANGE_PARAMS_MAX_TOTAL_LTF_UNSPECIFIED,
};

/*
 * reduced neighbor report, based on Draft P802.11be_D3.0,
 * section 9.4.2.170.2.
 *
 * One octet followed by a little-endian 16-bit field, packed.
 */
struct ieee80211_rnr_mld_params {
        u8 mld_id;
        __le16 params;
} __packed;

/*
 * Non-overlapping bit masks within a 16-bit value.
 * NOTE(review): presumably applied to the CPU-order value of
 * ieee80211_rnr_mld_params.params - confirm.
 */
#define IEEE80211_RNR_MLD_PARAMS_LINK_ID                        0x000F
#define IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT                0x0FF0
#define IEEE80211_RNR_MLD_PARAMS_UPDATES_INCLUDED                0x1000
#define IEEE80211_RNR_MLD_PARAMS_DISABLED_LINK                        0x2000

/*
 * Format of the TBTT information element if it has 7, 8 or 9 bytes.
 *
 * NOTE(review): assuming ETH_ALEN is 6, tbtt_offset and bssid make up the
 * 7-byte form and each of the two optional octets adds one byte - confirm.
 */
struct ieee80211_tbtt_info_7_8_9 {
        u8 tbtt_offset;
        u8 bssid[ETH_ALEN];

        /* The following elements are optional, structure may not grow */
        u8 bss_params;
        s8 psd_20;
} __packed;

/*
 * Format of the TBTT information element if it has >= 11 bytes.
 *
 * NOTE(review): assuming ETH_ALEN is 6, the fields up to and including
 * short_ssid make up the 11-byte form; the members after it are only
 * present in longer entries - confirm how callers check the length.
 */
struct ieee80211_tbtt_info_ge_11 {
        u8 tbtt_offset;
        u8 bssid[ETH_ALEN];
        __le32 short_ssid;

        /* The following elements are optional, structure may grow */
        u8 bss_params;
        s8 psd_20;
        struct ieee80211_rnr_mld_params mld_params;
} __packed;

/* multi-link device */
#define IEEE80211_MLD_MAX_NUM_LINKS        15

/*
 * Layout of the 16-bit multi-link control field: the low three bits select
 * the element type (one of the IEEE80211_ML_CONTROL_TYPE_* values), the
 * upper twelve bits are covered by the presence mask; bit 3 is in neither.
 */
#define IEEE80211_ML_CONTROL_TYPE                        0x0007
#define IEEE80211_ML_CONTROL_TYPE_BASIC                        0
#define IEEE80211_ML_CONTROL_TYPE_PREQ                        1
#define IEEE80211_ML_CONTROL_TYPE_RECONF                2
#define IEEE80211_ML_CONTROL_TYPE_TDLS                        3
#define IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS                4
#define IEEE80211_ML_CONTROL_PRESENCE_MASK                0xfff0

/*
 * Start of a multi-link element: the little-endian control field followed
 * by type-dependent data (common info, then any sub-elements).
 */
struct ieee80211_multi_link_elem {
        __le16 control;
        u8 variable[];
} __packed;

/*
 * Presence bits in the control field of a BASIC multi-link element. Each
 * set bit adds one optional field after struct
 * ieee80211_mle_basic_common_info, in the order listed here. The sizes
 * checked by ieee80211_mle_size_ok() are 1, 1, 2, 2, 2 and 1 octets.
 */
#define IEEE80211_MLC_BASIC_PRES_LINK_ID                0x0010
#define IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT        0x0020
#define IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY                0x0040
#define IEEE80211_MLC_BASIC_PRES_EML_CAPA                0x0080
#define IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP                0x0100
#define IEEE80211_MLC_BASIC_PRES_MLD_ID                        0x0200

/* Sub-fields of the 16-bit medium synchronization delay value */
#define IEEE80211_MED_SYNC_DELAY_DURATION                0x00ff
#define IEEE80211_MED_SYNC_DELAY_SYNC_OFDM_ED_THRESH        0x0f00
#define IEEE80211_MED_SYNC_DELAY_SYNC_MAX_NUM_TXOPS        0xf000

/*
 * Described in P802.11be_D3.0
 * dot11MSDTimerDuration should default to 5484 (i.e. 171.375)
 * dot11MSDOFDMEDthreshold defaults to -72 (i.e. 0)
 * dot11MSDTXOPMAX defaults to 1
 *
 * Split by the masks above, 0x10ac is duration 0xac, OFDM ED threshold 0
 * and max number of TXOPs 1.
 */
#define IEEE80211_MED_SYNC_DELAY_DEFAULT                0x10ac

/*
 * Bit masks within a 16-bit EML capabilities value. The definitions with an
 * extra leading space appear to be the values carried in the multi-bit
 * field whose mask precedes them.
 * NOTE(review): presumably applied to the value returned by
 * ieee80211_mle_get_eml_cap() - confirm.
 */
#define IEEE80211_EML_CAP_EMLSR_SUPP                        0x0001
#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY                0x000e
#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_0US                0
#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_32US                1
#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_64US                2
#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_128US                3
#define  IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US                4
#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY        0x0070
#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_0US                0
#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_16US                1
#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_32US                2
#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_64US                3
#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_128US                4
#define  IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US                5
#define IEEE80211_EML_CAP_EMLMR_SUPPORT                        0x0080
#define IEEE80211_EML_CAP_EMLMR_DELAY                        0x0700
#define  IEEE80211_EML_CAP_EMLMR_DELAY_0US                        0
#define  IEEE80211_EML_CAP_EMLMR_DELAY_32US                        1
#define  IEEE80211_EML_CAP_EMLMR_DELAY_64US                        2
#define  IEEE80211_EML_CAP_EMLMR_DELAY_128US                        3
#define  IEEE80211_EML_CAP_EMLMR_DELAY_256US                        4
#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT                0x7800
#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_0                        0
#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128US                1
#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_256US                2
#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_512US                3
#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_1TU                4
#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_2TU                5
#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_4TU                6
#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_8TU                7
#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_16TU                8
#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_32TU                9
#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_64TU                10
#define  IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU                11

/*
 * Bit masks within a 16-bit MLD capabilities and operations value. The four
 * ..._TID_TO_LINK_MAP_NEG_* definitions following the 0x0060 mask are the
 * values 0 to 3 of that two-bit field.
 * NOTE(review): presumably applied to the value returned by
 * ieee80211_mle_get_mld_capa_op() - confirm.
 */
#define IEEE80211_MLD_CAP_OP_MAX_SIMUL_LINKS                0x000f
#define IEEE80211_MLD_CAP_OP_SRS_SUPPORT                0x0010
#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP        0x0060
#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_NO_SUPP        0
#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_SAME        1
#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_RESERVED        2
#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_DIFF        3
#define IEEE80211_MLD_CAP_OP_FREQ_SEP_TYPE_IND                0x0f80
#define IEEE80211_MLD_CAP_OP_AAR_SUPPORT                0x1000

/*
 * Fixed start of the common info of a BASIC multi-link element: a length
 * octet and the MLD MAC address. The optional fields selected by the
 * IEEE80211_MLC_BASIC_PRES_* bits follow in @variable.
 */
struct ieee80211_mle_basic_common_info {
        u8 len;
        u8 mld_mac_addr[ETH_ALEN];
        u8 variable[];
} __packed;

/* Presence bit of a PREQ element; size_ok accounts one octet for it */
#define IEEE80211_MLC_PREQ_PRES_MLD_ID                        0x0010

/* Fixed start of the common info of a PREQ element: only the length octet */
struct ieee80211_mle_preq_common_info {
        u8 len;
        u8 variable[];
} __packed;

/* Presence bit of a RECONF element; size_ok accounts ETH_ALEN octets for it */
#define IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR                0x0010

/* no fixed fields in RECONF */

/* Common info of a TDLS element: a length octet and the AP MLD MAC address */
struct ieee80211_mle_tdls_common_info {
        u8 len;
        u8 ap_mld_mac_addr[ETH_ALEN];
} __packed;

/* Presence bit of a PRIO_ACCESS element; it accounts for ETH_ALEN octets */
#define IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR        0x0010

/* no fixed fields in PRIO_ACCESS */

/**
 * ieee80211_mle_common_size - check multi-link element common size
 * @data: multi-link element, must already be checked for size using
 *        ieee80211_mle_size_ok()
 * Return: the size of the multi-link element's "common" subfield
 *
 * For BASIC, PREQ, TDLS and RECONF elements the result is the size of the
 * control field plus the value of the first octet after it, which is taken
 * to be the common info length. For PRIO_ACCESS elements the result is
 * %ETH_ALEN if the AP MLD MAC address presence bit is set and 0 otherwise.
 * Any other type triggers a warning and yields 0.
 *
 * NOTE(review): ieee80211_mle_size_ok() does not validate a length octet
 * for RECONF elements, yet one is read here, and the PRIO_ACCESS result
 * does not include the control field although for_each_mle_subelement()
 * uses the result as an offset from the start of the element. Both look
 * inconsistent - confirm against the current 802.11be element format.
 */
static inline u8 ieee80211_mle_common_size(const u8 *data)
{
        const struct ieee80211_multi_link_elem *mle = (const void *)data;
        u16 control = le16_to_cpu(mle->control);
        u16 type = u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE);

        if (type == IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS) {
                if (control & IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR)
                        return ETH_ALEN;
                return 0;
        }

        if (type != IEEE80211_ML_CONTROL_TYPE_BASIC &&
            type != IEEE80211_ML_CONTROL_TYPE_PREQ &&
            type != IEEE80211_ML_CONTROL_TYPE_TDLS &&
            type != IEEE80211_ML_CONTROL_TYPE_RECONF) {
                WARN_ON(1);
                return 0;
        }

        /* the common info length is the first octet after the control field */
        return sizeof(*mle) + mle->variable[0];
}

/**
 * ieee80211_mle_get_link_id - returns the link ID
 * @data: the basic multi link element
 * Return: the link ID, or -1 if not present
 *
 * The element is assumed to be of the correct type (BASIC) and big enough,
 * this must be checked using ieee80211_mle_type_ok().
 */
static inline int ieee80211_mle_get_link_id(const u8 *data)
{
        const struct ieee80211_multi_link_elem *mle = (const void *)data;
        u16 control = le16_to_cpu(mle->control);
        const u8 *common = mle->variable;

        /* common points now at the beginning of ieee80211_mle_basic_common_info */
        common += sizeof(struct ieee80211_mle_basic_common_info);

        if (!(control & IEEE80211_MLC_BASIC_PRES_LINK_ID))
                return -1;

        return *common;
}

/**
 * ieee80211_mle_get_bss_param_ch_cnt - returns the BSS parameter change count
 * @data: pointer to the basic multi link element
 * Return: the BSS Parameter Change Count field value, or -1 if not present
 *
 * The element is assumed to be of the correct type (BASIC) and big enough,
 * this must be checked using ieee80211_mle_type_ok().
 */
static inline int
ieee80211_mle_get_bss_param_ch_cnt(const u8 *data)
{
        const struct ieee80211_multi_link_elem *mle = (const void *)data;
        u16 control = le16_to_cpu(mle->control);
        const u8 *common = mle->variable;

        /* common points now at the beginning of ieee80211_mle_basic_common_info */
        common += sizeof(struct ieee80211_mle_basic_common_info);

        if (!(control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT))
                return -1;

        if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
                common += 1;

        return *common;
}

/**
 * ieee80211_mle_get_eml_med_sync_delay - returns the medium sync delay
 * @data: pointer to the multi-link element
 * Return: the medium synchronization delay field value from the multi-link
 *        element, or the default value (%IEEE80211_MED_SYNC_DELAY_DEFAULT)
 *        if not present
 *
 * The element is assumed to be of the correct type (BASIC) and big enough,
 * this must be checked using ieee80211_mle_type_ok().
 */
static inline u16 ieee80211_mle_get_eml_med_sync_delay(const u8 *data)
{
        const struct ieee80211_multi_link_elem *mle = (const void *)data;
        u16 control = le16_to_cpu(mle->control);
        const u8 *common = mle->variable;

        /* common points now at the beginning of ieee80211_mle_basic_common_info */
        common += sizeof(struct ieee80211_mle_basic_common_info);

        if (!(control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY))
                return IEEE80211_MED_SYNC_DELAY_DEFAULT;

        if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
                common += 1;
        if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
                common += 1;

        return get_unaligned_le16(common);
}

/**
 * ieee80211_mle_get_eml_cap - returns the EML capability
 * @data: pointer to the multi-link element
 * Return: the EML capability field value from the multi-link element,
 *        or 0 if not present
 *
 * The caller must have verified with ieee80211_mle_type_ok() that the
 * element is of the BASIC type and large enough. The field is a
 * little-endian 16-bit value located after the fixed part of the basic
 * common info and after every optional field that precedes it.
 */
static inline u16 ieee80211_mle_get_eml_cap(const u8 *data)
{
        const struct ieee80211_multi_link_elem *mle = (const void *)data;
        u16 control = le16_to_cpu(mle->control);
        const u8 *pos;

        if (!(control & IEEE80211_MLC_BASIC_PRES_EML_CAPA))
                return 0;

        /* start right after the fixed part of the basic common info */
        pos = mle->variable + sizeof(struct ieee80211_mle_basic_common_info);

        /* step over each optional field that comes first */
        if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
                pos++;
        if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
                pos++;
        if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)
                pos += 2;

        return get_unaligned_le16(pos);
}

/**
 * ieee80211_mle_get_mld_capa_op - returns the MLD capabilities and operations.
 * @data: pointer to the multi-link element
 * Return: the MLD capabilities and operations field value from the multi-link
 *        element, or 0 if not present
 *
 * The element is assumed to be of the correct type (BASIC) and big enough,
 * this must be checked using ieee80211_mle_type_ok().
 */
static inline u16 ieee80211_mle_get_mld_capa_op(const u8 *data)
{
        const struct ieee80211_multi_link_elem *mle = (const void *)data;
        u16 control = le16_to_cpu(mle->control);
        const u8 *common = mle->variable;

        /*
         * common points now at the beginning of
         * ieee80211_mle_basic_common_info
         */
        common += sizeof(struct ieee80211_mle_basic_common_info);

        if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP))
                return 0;

        if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID)
                common += 1;
        if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)
                common += 1;
        if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)
                common += 2;
        if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA)
                common += 2;

        return get_unaligned_le16(common);
}

/**
 * ieee80211_mle_get_mld_id - returns the MLD ID
 * @data: pointer to the multi-link element
 * Return: The MLD ID in the given multi-link element, or 0 if not present
 *
 * The caller must have verified with ieee80211_mle_type_ok() that the
 * element is of the BASIC type and large enough. The MLD ID is a single
 * octet and the last of the optional fields, so every other optional field
 * that is present has to be skipped to reach it.
 */
static inline u8 ieee80211_mle_get_mld_id(const u8 *data)
{
        const struct ieee80211_multi_link_elem *mle = (const void *)data;
        u16 control = le16_to_cpu(mle->control);
        const u8 *pos;

        if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_ID))
                return 0;

        /* start right after the fixed part of the basic common info */
        pos = mle->variable + sizeof(struct ieee80211_mle_basic_common_info);

        pos += (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) ? 1 : 0;
        pos += (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) ? 1 : 0;
        pos += (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) ? 2 : 0;
        pos += (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) ? 2 : 0;
        pos += (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) ? 2 : 0;

        return *pos;
}

/**
 * ieee80211_mle_size_ok - validate multi-link element size
 * @data: pointer to the element data
 * @len: length of the containing element
 * Return: whether or not the multi-link element size is OK
 *
 * The element must hold the control field plus the common info implied by
 * its type and presence bits. For BASIC, PREQ and TDLS elements the first
 * octet after the control field is additionally required to be at least as
 * large as that common info size. A %NULL @data or an element too short for
 * the control field is rejected; an element of an unknown type is accepted
 * without further checks.
 */
static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len)
{
        const struct ieee80211_multi_link_elem *mle = (const void *)data;
        u8 hdr_len = sizeof(*mle);
        u8 need = 0;
        bool has_len_octet = false;
        u16 control, type;

        if (!data || len < hdr_len)
                return false;

        control = le16_to_cpu(mle->control);
        type = u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE);

        if (type == IEEE80211_ML_CONTROL_TYPE_BASIC) {
                has_len_octet = true;
                need += sizeof(struct ieee80211_mle_basic_common_info);
                need += (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) ? 1 : 0;
                need += (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) ? 1 : 0;
                need += (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) ? 2 : 0;
                need += (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) ? 2 : 0;
                need += (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) ? 2 : 0;
                need += (control & IEEE80211_MLC_BASIC_PRES_MLD_ID) ? 1 : 0;
        } else if (type == IEEE80211_ML_CONTROL_TYPE_PREQ) {
                has_len_octet = true;
                need += sizeof(struct ieee80211_mle_preq_common_info);
                need += (control & IEEE80211_MLC_PREQ_PRES_MLD_ID) ? 1 : 0;
        } else if (type == IEEE80211_ML_CONTROL_TYPE_RECONF) {
                if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR)
                        need += ETH_ALEN;
        } else if (type == IEEE80211_ML_CONTROL_TYPE_TDLS) {
                has_len_octet = true;
                need += sizeof(struct ieee80211_mle_tdls_common_info);
        } else if (type == IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS) {
                if (control & IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR)
                        need += ETH_ALEN;
        } else {
                /* we don't know this type */
                return true;
        }

        if (len < hdr_len + need)
                return false;

        /* if present, common length is the first octet there */
        return !has_len_octet || mle->variable[0] >= need;
}

/**
 * ieee80211_mle_type_ok - validate multi-link element type and size
 * @data: pointer to the element data
 * @type: expected type of the element
 * @len: length of the containing element
 * Return: whether or not the multi-link element type matches and size is OK
 *
 * The size is validated with ieee80211_mle_size_ok() first; the control
 * field is only read once that check has passed.
 */
static inline bool ieee80211_mle_type_ok(const u8 *data, u8 type, size_t len)
{
        const struct ieee80211_multi_link_elem *mle = (const void *)data;

        return ieee80211_mle_size_ok(data, len) &&
               u16_get_bits(le16_to_cpu(mle->control),
                            IEEE80211_ML_CONTROL_TYPE) == type;
}

/* Sub-element IDs used inside a multi-link element */
enum ieee80211_mle_subelems {
        IEEE80211_MLE_SUBELEM_PER_STA_PROFILE                = 0,
        IEEE80211_MLE_SUBELEM_FRAGMENT                        = 254,
};

/*
 * Bits of the 16-bit control field of a per-STA profile in a BASIC
 * multi-link element: a four-bit link ID followed by single-bit flags.
 * ieee80211_mle_basic_sta_prof_size_ok() shows how many octets of STA info
 * each of the "present" flags accounts for.
 */
#define IEEE80211_MLE_STA_CONTROL_LINK_ID                        0x000f
#define IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE                0x0010
#define IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT                0x0020
#define IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT                0x0040
#define IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT                0x0080
#define IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT                0x0100
#define IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT        0x0200
#define IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE                0x0400
#define IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT        0x0800

/*
 * Start of a per-STA profile sub-element: the little-endian control field,
 * the STA info length octet and then the variable part.
 */
struct ieee80211_mle_per_sta_profile {
        __le16 control;
        u8 sta_info_len;
        u8 variable[];
} __packed;

/**
 * ieee80211_mle_basic_sta_prof_size_ok - validate basic multi-link element sta
 *        profile size
 * @data: pointer to the sub element data
 * @len: length of the containing sub element
 * Return: %true if the STA profile is large enough, %false otherwise
 *
 * The minimum STA info length is one octet plus the size of every optional
 * field announced in the control field. The profile is accepted if its
 * sta_info_len is at least that minimum and the header together with
 * sta_info_len - 1 further octets fits into @len.
 */
static inline bool ieee80211_mle_basic_sta_prof_size_ok(const u8 *data,
                                                        size_t len)
{
        const struct ieee80211_mle_per_sta_profile *prof = (const void *)data;
        u8 hdr_len = sizeof(*prof);
        u8 min_info_len = 1;
        u16 control;

        if (len < hdr_len)
                return false;

        control = le16_to_cpu(prof->control);

        min_info_len += (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT) ? 6 : 0;
        min_info_len += (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT) ? 2 : 0;
        min_info_len += (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT) ? 8 : 0;
        min_info_len += (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT) ? 2 : 0;

        /* the NSTR bitmap only counts for complete profiles */
        if ((control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE) &&
            (control & IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT))
                min_info_len += (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE) ? 2 : 1;

        min_info_len += (control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT) ? 1 : 0;

        if (prof->sta_info_len < min_info_len)
                return false;

        /* the header already contains the sta_info_len octet, hence the -1 */
        return hdr_len + prof->sta_info_len - 1 <= len;
}

/**
 * ieee80211_mle_basic_sta_prof_bss_param_ch_cnt - get per-STA profile BSS
 *        parameter change count
 * @prof: the per-STA profile, having been checked with
 *        ieee80211_mle_basic_sta_prof_size_ok() for the correct length
 *
 * Return: The BSS parameter change count value if present, 0 otherwise.
 */
static inline u8
ieee80211_mle_basic_sta_prof_bss_param_ch_cnt(const struct ieee80211_mle_per_sta_profile *prof)
{
        u16 control = le16_to_cpu(prof->control);
        const u8 *pos = prof->variable;

        if (!(control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT))
                return 0;

        if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT)
                pos += 6;
        if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT)
                pos += 2;
        if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT)
                pos += 8;
        if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT)
                pos += 2;
        if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE &&
            control & IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT) {
                if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE)
                        pos += 2;
                else
                        pos += 1;
        }

        return *pos;
}

/*
 * Bits of the 16-bit control field of a per-STA profile in a
 * reconfiguration multi-link element, as tested by
 * ieee80211_mle_reconf_sta_prof_size_ok().
 */
#define IEEE80211_MLE_STA_RECONF_CONTROL_LINK_ID                        0x000f
#define IEEE80211_MLE_STA_RECONF_CONTROL_COMPLETE_PROFILE                0x0010
#define IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT                0x0020
#define IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT                0x0040
#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_UPDATE_TYPE                0x0780
#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT        0x0800

/**
 * ieee80211_mle_reconf_sta_prof_size_ok - validate reconfiguration multi-link
 *        element sta profile size.
 * @data: pointer to the sub element data
 * @len: length of the containing sub element
 * Return: %true if the STA profile is large enough, %false otherwise
 *
 * The minimum STA info length is one octet plus the size of every optional
 * field announced in the control field. The profile is accepted if its
 * sta_info_len is at least that minimum and the header together with
 * sta_info_len - 1 further octets fits into @len.
 */
static inline bool ieee80211_mle_reconf_sta_prof_size_ok(const u8 *data,
                                                         size_t len)
{
        const struct ieee80211_mle_per_sta_profile *prof = (const void *)data;
        u8 hdr_len = sizeof(*prof);
        u8 min_info_len = 1;
        u16 control;

        if (len < hdr_len)
                return false;

        control = le16_to_cpu(prof->control);

        min_info_len += (control & IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT) ?
                        ETH_ALEN : 0;
        min_info_len += (control & IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT) ?
                        2 : 0;
        min_info_len += (control & IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT) ?
                        2 : 0;

        if (prof->sta_info_len < min_info_len)
                return false;

        /* the header already contains the sta_info_len octet, hence the -1 */
        return hdr_len + prof->sta_info_len - 1 <= len;
}

/**
 * ieee80211_tid_to_link_map_size_ok - validate TID-to-link mapping element size
 * @data: pointer to the element data
 * @len: length of the element data
 * Return: %true if @len covers the fixed part and every optional part
 *        announced by the control field, %false otherwise
 *
 * The switch time accounts for two octets and the expected duration for
 * three. Unless the default link mapping bit is set, one more octet is
 * required, and each bit set in the first optional octet adds a link
 * mapping of one or two octets, depending on the link map size bit.
 */
static inline bool ieee80211_tid_to_link_map_size_ok(const u8 *data, size_t len)
{
        const struct ieee80211_ttlm_elem *t2l = (const void *)data;
        u8 hdr_len = sizeof(*t2l);
        u8 extra = 0;
        u8 control;

        if (len < hdr_len)
                return false;

        control = t2l->control;

        extra += (control & IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT) ? 2 : 0;
        extra += (control & IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT) ? 3 : 0;

        if (control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP)
                return len >= hdr_len + extra;

        /* make sure the first optional octet can be read before using it */
        extra += 1;
        if (len < hdr_len + extra)
                return false;

        extra += hweight8(t2l->optional[0]) *
                 ((control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE) ? 1 : 2);

        return len >= hdr_len + extra;
}

/*
 * Iterate over the sub-elements that follow the common part of a
 * multi-link element. Nothing is iterated if ieee80211_mle_size_ok()
 * rejects the element.
 *
 * @_data and @_len are evaluated more than once, so they must be free of
 * side effects. They are parenthesized where they take part in arithmetic,
 * so that arguments built from low-precedence operators (shifts,
 * conditionals) are not torn apart. The expansion starts with a bare "if",
 * so an "else" written directly after the loop body would bind to it.
 */
#define for_each_mle_subelement(_elem, _data, _len)                        \
        if (ieee80211_mle_size_ok(_data, _len))                                \
                for_each_element(_elem,                                        \
                                 (_data) + ieee80211_mle_common_size(_data),\
                                 (_len) - ieee80211_mle_common_size(_data))

#endif /* LINUX_IEEE80211_H */





































    2 












































































    2 
























    2 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
// SPDX-License-Identifier: GPL-2.0
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/kasan-checks.h>
#include <linux/thread_info.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>

#include <asm/byteorder.h>
#include <asm/word-at-a-time.h>

/*
 * IS_UNALIGNED() is non-zero when @src or @dst is not aligned to
 * sizeof(long). With CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS it is always 0
 * and the arguments are not evaluated. The arguments are parenthesized
 * before the cast so that any pointer expression can be passed.
 */
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
#define IS_UNALIGNED(src, dst)        0
#else
#define IS_UNALIGNED(src, dst)        \
        (((long) (dst) | (long) (src)) & (sizeof(long) - 1))
#endif

/*
 * Do a strncpy, return length of string without final '\0'.
 * 'count' is the user-supplied count (return 'count' if we
 * hit it), 'max' is the address space maximum (and we return
 * -EFAULT if we hit it).
 *
 * The copy runs one unsigned long at a time for as long as at least
 * sizeof(unsigned long) bytes of 'max' remain, and one byte at a time
 * otherwise. The word loop is skipped altogether when IS_UNALIGNED()
 * reports a misaligned 'src' or 'dst'.
 */
static __always_inline long do_strncpy_from_user(char *dst, const char __user *src,
                                        unsigned long count, unsigned long max)
{
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
        unsigned long res = 0;

        if (IS_UNALIGNED(src, dst))
                goto byte_at_a_time;

        while (max >= sizeof(unsigned long)) {
                unsigned long c, data, mask;

                /* Fall back to byte-at-a-time if we get a page fault */
                unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time);

                /*
                 * Note that we mask out the bytes following the NUL. This is
                 * important to do because string oblivious code may read past
                 * the NUL. For those routines, we don't want to give them
                 * potentially random bytes after the NUL in `src`.
                 *
                 * One example of such code is BPF map keys. BPF treats map keys
                 * as an opaque set of bytes. Without the post-NUL mask, any BPF
                 * maps keyed by strings returned from strncpy_from_user() may
                 * have multiple entries for semantically identical strings.
                 */
                if (has_zero(c, &data, &constants)) {
                        data = prep_zero_mask(c, data, &constants);
                        data = create_zero_mask(data);
                        mask = zero_bytemask(data);
                        *(unsigned long *)(dst+res) = c & mask;
                        /*
                         * NOTE(review): find_zero() presumably yields the
                         * index of the NUL within the word, making this the
                         * string length - confirm in word-at-a-time.h
                         */
                        return res + find_zero(data);
                }

                *(unsigned long *)(dst+res) = c;

                res += sizeof(unsigned long);
                max -= sizeof(unsigned long);
        }

byte_at_a_time:
        /* Copies the remaining bytes; 'res' and 'max' carry over from above */
        while (max) {
                char c;

                /* Unlike in the word loop, a fault here ends in -EFAULT */
                unsafe_get_user(c,src+res, efault);
                dst[res] = c;
                if (!c)
                        return res;
                res++;
                max--;
        }

        /*
         * Uhhuh. We hit 'max'. But was that the user-specified maximum
         * too? If so, that's ok - we got as much as the user asked for.
         */
        if (res >= count)
                return res;

        /*
         * Nope: we hit the address space limit, and we still had more
         * characters the caller would have wanted. That's an EFAULT.
         */
efault:
        return -EFAULT;
}

/**
 * strncpy_from_user: - Copy a NUL terminated string from userspace.
 * @dst:   Destination address, in kernel space.  This buffer must be at
 *         least @count bytes long.
 * @src:   Source address, in user space.
 * @count: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Copies a NUL-terminated string from userspace into a kernel buffer.
 *
 * Returns the length of the string, not counting the trailing NUL, on
 * success. A @count of zero or less copies nothing and returns 0.
 *
 * Returns -EFAULT if userspace cannot be accessed; part of the string may
 * already have been copied by then.
 *
 * If the string is longer than @count, @count bytes are copied and @count
 * is returned.
 */
long strncpy_from_user(char *dst, const char __user *src, long count)
{
        unsigned long limit, start, room;
        long copied;

        might_fault();
        if (should_fail_usercopy())
                return -EFAULT;
        if (unlikely(count <= 0))
                return 0;

        limit = TASK_SIZE_MAX;
        start = (unsigned long)untagged_addr(src);
        if (unlikely(start >= limit))
                return -EFAULT;

        /*
         * Clamp the distance to the end of the user address space to the
         * caller's limit, leaving the copy loop a single bound to check.
         */
        room = limit - start;
        if (room > count)
                room = count;

        kasan_check_write(dst, count);
        check_object_size(dst, count, false);
        if (!user_read_access_begin(src, room))
                return -EFAULT;

        copied = do_strncpy_from_user(dst, src, count, room);
        user_read_access_end();

        return copied;
}
EXPORT_SYMBOL(strncpy_from_user);
























































































































































































































































































   11 



   11 












































































































































































































































































































































































































































































































































































































































































































































































































































































    5 















































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/timer.h>
#include <linux/acpi_pmtmr.h>
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/clocksource.h>
#include <linux/percpu.h>
#include <linux/timex.h>
#include <linux/static_key.h>
#include <linux/static_call.h>

#include <asm/hpet.h>
#include <asm/timer.h>
#include <asm/vgtod.h>
#include <asm/time.h>
#include <asm/delay.h>
#include <asm/hypervisor.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>
#include <asm/geode.h>
#include <asm/apic.h>
#include <asm/cpu_device_id.h>
#include <asm/i8259.h>
#include <asm/uv/uv.h>

/* NOTE(review): presumably the CPU frequency in kHz, going by the name - confirm. */
unsigned int __read_mostly cpu_khz;        /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);

/*
 * TSC frequency in kHz. Used below to scale the cycles->ns conversion
 * and to derive the disturbance threshold in tsc_read_refs().
 */
unsigned int __read_mostly tsc_khz;
EXPORT_SYMBOL(tsc_khz);

#define KHZ        1000

/*
 * TSC can be unstable due to cpufreq or due to unsynced TSCs
 */
static int __read_mostly tsc_unstable;
/* Value of the "tsc_early_khz" early parameter; 0 unless one was given. */
static unsigned int __initdata tsc_early_khz;

/* Tested by native_sched_clock() to choose between the TSC and jiffies. */
static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc);

/* Set to 1 by the "tsc=reliable" boot option. */
int tsc_clocksource_reliable;

/* Set to 1 by the "tsc=recalibrate" boot option. */
static int __read_mostly tsc_force_recalibrate;

/*
 * NOTE(review): presumably the parameters for converting ART values to
 * TSC values; none of them is referenced in the part of the file
 * reviewed here - confirm against their users.
 */
static u32 art_to_tsc_numerator;
static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
static bool have_art;

/*
 * Per-CPU cycles->ns conversion state: two copies of the conversion
 * parameters plus a latch sequence counter. Readers pick the copy
 * indexed by the low bit of the sequence (see __cyc2ns_read()); the
 * writer is __set_cyc2ns_scale().
 */
struct cyc2ns {
        struct cyc2ns_data data[2];        /*  0 + 2*16 = 32 */
        seqcount_latch_t   seq;                /* 32 + 4    = 36 */

}; /* fits one cacheline */

/* One instance per CPU. */
static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);

/*
 * Early-parameter handler for "tsc_early_khz": parse the argument as an
 * unsigned integer (base auto-detected) into tsc_early_khz and pass the
 * parser's return code back to the caller.
 */
static int __init tsc_early_khz_setup(char *buf)
{
        int err = kstrtouint(buf, 0, &tsc_early_khz);

        return err;
}
early_param("tsc_early_khz", tsc_early_khz_setup);

/*
 * Copy the executing CPU's current cycles->ns conversion parameters
 * into @data.
 *
 * The sequence counter's low bit selects which of the two per-CPU
 * copies to read. The read is retried if the sequence changed while
 * the three fields were being copied, so @data always ends up holding
 * one consistent set of parameters.
 *
 * Uses this_cpu_read() throughout; the wrappers in this file call it
 * with preemption disabled.
 */
__always_inline void __cyc2ns_read(struct cyc2ns_data *data)
{
        int seq, idx;

        do {
                /* Sample the sequence first; its low bit picks the copy. */
                seq = this_cpu_read(cyc2ns.seq.seqcount.sequence);
                idx = seq & 1;

                data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
                data->cyc2ns_mul    = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
                data->cyc2ns_shift  = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);

                /* Retry if the sequence moved while the fields were copied. */
        } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence)));
}

/*
 * Disable preemption and copy this CPU's conversion parameters into
 * @data. Preemption is left disabled on return; the caller is expected
 * to re-enable it with cyc2ns_read_end().
 */
__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
{
        preempt_disable_notrace();
        __cyc2ns_read(data);
}

/*
 * Counterpart of cyc2ns_read_begin(): re-enable the preemption that
 * cyc2ns_read_begin() disabled.
 */
__always_inline void cyc2ns_read_end(void)
{
        preempt_enable_notrace();
}

/*
 * Accelerators for sched_clock()
 * convert from cycles(64bits) => nanoseconds (64bits)
 *  basic equation:
 *              ns = cycles / (freq / ns_per_sec)
 *              ns = cycles * (ns_per_sec / freq)
 *              ns = cycles * (10^9 / (cpu_khz * 10^3))
 *              ns = cycles * (10^6 / cpu_khz)
 *
 *      Then we use scaling math (suggested by george@mvista.com) to get:
 *              ns = cycles * (10^6 * SC / cpu_khz) / SC
 *              ns = cycles * cyc2ns_scale / SC
 *
 *      And since SC is a constant power of two, we can convert the div
 *  into a shift. The larger SC is, the more accurate the conversion, but
 *  cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
 *  (64-bit result) can be used.
 *
 *  We can use khz divisor instead of mhz to keep a better precision.
 *  (mathieu.desnoyers@polymtl.ca)
 *
 *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
 */

static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc)
{
        struct cyc2ns_data data;
        unsigned long long ns;

        __cyc2ns_read(&data);

        ns = data.cyc2ns_offset;
        ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);

        return ns;
}

/*
 * Preemption-safe wrapper around __cycles_2_ns(): convert @cyc to
 * nanoseconds with preemption disabled for the duration of the
 * per-CPU parameter read and the conversion.
 */
static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
        unsigned long long ns;
        preempt_disable_notrace();
        ns = __cycles_2_ns(cyc);
        preempt_enable_notrace();
        return ns;
}

/*
 * Install a new cycles->ns conversion for @cpu, for a TSC running at
 * @khz, such that the converted time at @tsc_now is unchanged.
 *
 * @khz:     frequency to derive the new multiplier/shift from
 * @cpu:     CPU whose per-CPU cyc2ns state is rewritten
 * @tsc_now: cycle count at which old and new conversion must agree
 *
 * Note that the "current" time is obtained via cycles_2_ns(), which
 * reads the conversion parameters of the CPU executing this function,
 * not necessarily those of @cpu.
 */
static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
        unsigned long long ns_now;
        struct cyc2ns_data data;
        struct cyc2ns *c2n;

        /* Nanoseconds at tsc_now according to the conversion in effect. */
        ns_now = cycles_2_ns(tsc_now);

        /*
         * Compute a new multiplier as per the above comment and ensure our
         * time function is continuous; see the comment near struct
         * cyc2ns_data.
         */
        clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
                               NSEC_PER_MSEC, 0);

        /*
         * cyc2ns_shift is exported via arch_perf_update_userpage() where it is
         * not expected to be greater than 31 due to the original published
         * conversion algorithm shifting a 32-bit value (now specifies a 64-bit
         * value) - refer perf_event_mmap_page documentation in perf_event.h.
         */
        if (data.cyc2ns_shift == 32) {
                /* Halve the multiplier along with the shift to compensate. */
                data.cyc2ns_shift = 31;
                data.cyc2ns_mul >>= 1;
        }

        /*
         * Choose the offset so that the new parameters yield ns_now at
         * tsc_now, i.e. the conversion does not jump at the switch-over.
         */
        data.cyc2ns_offset = ns_now -
                mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);

        c2n = per_cpu_ptr(&cyc2ns, cpu);

        /*
         * Latch-style update of both copies, one at a time. Presumably
         * each raw_write_seqcount_latch() advances the sequence so that
         * readers (which index by its low bit) use the copy that is not
         * being written - confirm against the seqcount latch API.
         */
        raw_write_seqcount_latch(&c2n->seq);
        c2n->data[0] = data;
        raw_write_seqcount_latch(&c2n->seq);
        c2n->data[1] = data;
}

/*
 * Update the cycles->ns conversion of @cpu for frequency @khz, with
 * local interrupts disabled and bracketed by the sched_clock idle
 * sleep/wakeup notifications.
 *
 * A @khz of 0 skips the conversion update; the sched_clock
 * notifications are still issued.
 */
static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
        unsigned long flags;

        local_irq_save(flags);
        sched_clock_idle_sleep_event();

        if (khz)
                __set_cyc2ns_scale(khz, cpu, tsc_now);

        sched_clock_idle_wakeup_event();
        local_irq_restore(flags);
}

/*
 * Initialize cyc2ns for boot cpu
 *
 * Initializes the latch sequence counter of the executing CPU's cyc2ns
 * state and installs the first conversion, derived from tsc_khz and
 * the current TSC value.
 */
static void __init cyc2ns_init_boot_cpu(void)
{
        struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);

        seqcount_latch_init(&c2n->seq);
        __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc());
}

/*
 * Secondary CPUs do not run through tsc_init(), so set up
 * all the scale factors for all CPUs, assuming the same
 * speed as the bootup CPU.
 *
 * For every possible CPU other than the executing one, initialize that
 * CPU's latch sequence counter and copy both conversion parameter sets
 * from the executing CPU's cyc2ns state.
 */
static void __init cyc2ns_init_secondary_cpus(void)
{
        unsigned int cpu, this_cpu = smp_processor_id();
        struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);
        struct cyc2ns_data *data = c2n->data;

        for_each_possible_cpu(cpu) {
                if (cpu != this_cpu) {
                        /*
                         * Look up the target CPU's state before touching
                         * the sequence counter, so that the counter being
                         * initialized belongs to that CPU and not to the
                         * one handled on the previous pass (or to the
                         * executing CPU, whose counter is already live).
                         */
                        c2n = per_cpu_ptr(&cyc2ns, cpu);
                        seqcount_latch_init(&c2n->seq);
                        c2n->data[0] = data[0];
                        c2n->data[1] = data[1];
                }
        }
}

/*
 * Scheduler clock - returns current time in nanosec units.
 *
 * With the __use_tsc static key enabled the current TSC value is
 * converted via the per-CPU cycles->ns parameters; otherwise the time
 * is derived from the jiffies counter.
 */
noinstr u64 native_sched_clock(void)
{
        /* TSC path: read the counter and return the value in ns. */
        if (static_branch_likely(&__use_tsc))
                return __cycles_2_ns(rdtsc());

        /*
         * Fall back to jiffies if there's no TSC available:
         * ( But note that we still use it if the TSC is marked
         *   unstable. We do this because unlike Time Of Day,
         *   the scheduler clock tolerates small errors and it's
         *   very important for it to be as fast as the platform
         *   can achieve it. )
         *
         * No locking but a rare wrong value is not a big deal.
         */
        return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
}

/*
 * Generate a sched_clock if you already have a TSC value.
 *
 * Converts @tsc to nanoseconds with the executing CPU's conversion
 * parameters; preemption is disabled for the duration of the
 * conversion (see cycles_2_ns()).
 */
u64 native_sched_clock_from_tsc(u64 tsc)
{
        return cycles_2_ns(tsc);
}

/*
 * We need to define a real function for sched_clock, to override the
 * weak default version.
 *
 * With CONFIG_PARAVIRT, sched_clock_noinstr() goes through
 * paravirt_sched_clock() and using_native_sched_clock() reports whether
 * the pv_sched_clock static call currently targets native_sched_clock().
 * Without it, sched_clock_noinstr() is an alias of native_sched_clock()
 * and using_native_sched_clock() is always true.
 */
#ifdef CONFIG_PARAVIRT
noinstr u64 sched_clock_noinstr(void)
{
        return paravirt_sched_clock();
}

bool using_native_sched_clock(void)
{
        return static_call_query(pv_sched_clock) == native_sched_clock;
}
#else
u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));

bool using_native_sched_clock(void) { return true; }
#endif

/*
 * Return the scheduler clock value, obtained from sched_clock_noinstr()
 * with preemption disabled around the call.
 */
notrace u64 sched_clock(void)
{
        u64 now;
        preempt_disable_notrace();
        now = sched_clock_noinstr();
        preempt_enable_notrace();
        return now;
}

/* Return the current value of the file-local tsc_unstable flag. */
int check_tsc_unstable(void)
{
        return tsc_unstable;
}
EXPORT_SYMBOL_GPL(check_tsc_unstable);

/*
 * Handler for the "notsc" boot parameter. @str is unused in both
 * variants, and both return 1.
 */
#ifdef CONFIG_X86_TSC
/* With CONFIG_X86_TSC the TSC is only marked unstable. */
int __init notsc_setup(char *str)
{
        mark_tsc_unstable("boot parameter notsc");
        return 1;
}
#else
/*
 * disable flag for tsc. Takes effect by clearing the TSC cpu flag
 * in cpu/common.c
 */
int __init notsc_setup(char *str)
{
        setup_clear_cpu_cap(X86_FEATURE_TSC);
        return 1;
}
#endif

__setup("notsc", notsc_setup);

static int no_sched_irq_time;
static int no_tsc_watchdog;
static int tsc_as_watchdog;

/*
 * Handler for the "tsc=" boot parameter. Recognized values:
 *
 *   reliable     - set tsc_clocksource_reliable
 *   noirqtime    - set no_sched_irq_time (matched as a prefix)
 *   unstable     - mark the TSC unstable
 *   nowatchdog   - set no_tsc_watchdog and cancel an earlier "watchdog"
 *   recalibrate  - set tsc_force_recalibrate
 *   watchdog     - set tsc_as_watchdog unless "nowatchdog" came earlier
 *
 * The values are mutually exclusive, so at most one branch is taken.
 * Unrecognized values are ignored. Always returns 1.
 */
static int __init tsc_setup(char *str)
{
        if (!strcmp(str, "reliable")) {
                tsc_clocksource_reliable = 1;
        } else if (!strncmp(str, "noirqtime", 9)) {
                no_sched_irq_time = 1;
        } else if (!strcmp(str, "unstable")) {
                mark_tsc_unstable("boot parameter");
        } else if (!strcmp(str, "nowatchdog")) {
                /* The later "nowatchdog" wins over an earlier "watchdog". */
                if (tsc_as_watchdog)
                        pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n",
                                 __func__);
                no_tsc_watchdog = 1;
                tsc_as_watchdog = 0;
        } else if (!strcmp(str, "recalibrate")) {
                tsc_force_recalibrate = 1;
        } else if (!strcmp(str, "watchdog")) {
                /* An earlier "nowatchdog" is not overridden. */
                if (!no_tsc_watchdog)
                        tsc_as_watchdog = 1;
                else
                        pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n",
                                 __func__);
        }

        return 1;
}

__setup("tsc=", tsc_setup);

#define MAX_RETRIES                5
#define TSC_DEFAULT_THRESHOLD        0x20000

/*
 * Read TSC and the reference counters. Take care of any disturbances
 *
 * @p:    receives the reference counter value
 * @hpet: non-zero to read the HPET counter, zero to read the ACPI PM
 *        timer
 *
 * The reference read is bracketed by two TSC reads. A sample is
 * accepted only if the two TSC reads are less than a threshold apart
 * (tsc_khz >> 5 when tsc_khz is known, TSC_DEFAULT_THRESHOLD
 * otherwise); up to MAX_RETRIES attempts are made.
 *
 * Returns the TSC value read after the reference counter, or
 * ULLONG_MAX if no attempt was undisturbed (*p then holds the last
 * value read).
 */
static u64 tsc_read_refs(u64 *p, int hpet)
{
        u64 before, after;
        u64 thresh;
        int attempt;

        if (tsc_khz)
                thresh = tsc_khz >> 5;
        else
                thresh = TSC_DEFAULT_THRESHOLD;

        for (attempt = 0; attempt < MAX_RETRIES; attempt++) {
                before = get_cycles();
                *p = hpet ? (hpet_readl(HPET_COUNTER) & 0xFFFFFFFF)
                          : acpi_pm_read_early();
                after = get_cycles();

                if (after - before < thresh)
                        return after;
        }

        return ULLONG_MAX;
}

/*
 * Calculate the TSC frequency from HPET reference
 *
 * @deltatsc: TSC delta over the measurement interval, as scaled by the
 *            caller
 * @hpet1:    HPET counter value at the start of the interval
 * @hpet2:    HPET counter value at the end of the interval
 *
 * The HPET delta (corrected for one 32-bit wrap) is multiplied by the
 * HPET_PERIOD register value and divided by 1000000; @deltatsc is then
 * divided by that quantity.
 */
static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
{
        u64 elapsed;

        /* The 32-bit counter wrapped between the two reads. */
        if (hpet2 < hpet1)
                hpet2 += 0x100000000ULL;

        elapsed = (hpet2 - hpet1) * hpet_readl(HPET_PERIOD);
        do_div(elapsed, 1000000);

        return (unsigned long) div64_u64(deltatsc, elapsed);
}

/*
 * Calculate the TSC frequency from PMTimer reference
 *
 * @deltatsc: TSC delta over the measurement interval, as scaled by the
 *            caller
 * @pm1:      PM timer value at the start of the interval
 * @pm2:      PM timer value at the end of the interval
 *
 * Returns ULONG_MAX when both timer samples are zero. Otherwise the PM
 * timer delta (corrected for one wrap at ACPI_PM_OVRRUN) is converted
 * to nanoseconds via PMTMR_TICKS_PER_SEC and @deltatsc is divided by
 * that duration.
 */
static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
{
        u64 elapsed_ns;

        if (!pm1 && !pm2)
                return ULONG_MAX;

        /* The timer wrapped between the two reads. */
        if (pm2 < pm1)
                pm2 += (u64)ACPI_PM_OVRRUN;

        elapsed_ns = (pm2 - pm1) * 1000000000LL;
        do_div(elapsed_ns, PMTMR_TICKS_PER_SEC);
        do_div(deltatsc, elapsed_ns);

        return (unsigned long) deltatsc;
}

/* PIT calibration presets: duration in ms, matching latch value, minimum loop count. */
#define CAL_MS                10
#define CAL_LATCH        (PIT_TICK_RATE / (1000 / CAL_MS))
#define CAL_PIT_LOOPS        1000

#define CAL2_MS                50
#define CAL2_LATCH        (PIT_TICK_RATE / (1000 / CAL2_MS))
#define CAL2_PIT_LOOPS        5000


/*
 * Try to calibrate the TSC against the Programmable
 * Interrupt Timer and return the frequency of the TSC
 * in kHz.
 *
 * @latch:   value loaded into PIT channel 2
 * @ms:      duration in milliseconds that @latch corresponds to; the
 *           measured TSC delta is divided by it
 * @loopmin: minimum number of polling iterations that must have
 *           completed for the measurement to be trusted
 *
 * Return ULONG_MAX on failure to calibrate.
 */
static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
{
        u64 tsc, t1, t2, delta;
        unsigned long tscmin, tscmax;
        int pitcnt;

        if (!has_legacy_pic()) {
                /*
                 * Relies on tsc_early_delay_calibrate() to have given us semi
                 * usable udelay(), wait for the same 50ms we would have with
                 * the PIT loop below.
                 */
                udelay(10 * USEC_PER_MSEC);
                udelay(10 * USEC_PER_MSEC);
                udelay(10 * USEC_PER_MSEC);
                udelay(10 * USEC_PER_MSEC);
                udelay(10 * USEC_PER_MSEC);
                return ULONG_MAX;
        }

        /* Set the Gate high, disable speaker */
        outb((inb(0x61) & ~0x02) | 0x01, 0x61);

        /*
         * Setup CTC channel 2 for mode 0 (interrupt on terminal
         * count mode), binary count. Set the latch register to @latch
         * (LSB then MSB) to begin countdown.
         */
        outb(0xb0, 0x43);
        outb(latch & 0xff, 0x42);
        outb(latch >> 8, 0x42);

        /* t1 is the start timestamp; tsc tracks the previous loop read. */
        tsc = t1 = t2 = get_cycles();

        pitcnt = 0;
        tscmax = 0;
        tscmin = ULONG_MAX;
        /* Poll port 0x61 until bit 0x20 becomes set. */
        while ((inb(0x61) & 0x20) == 0) {
                t2 = get_cycles();
                delta = t2 - tsc;
                tsc = t2;
                /* Track the smallest and largest gap between two polls. */
                if ((unsigned long) delta < tscmin)
                        tscmin = (unsigned int) delta;
                if ((unsigned long) delta > tscmax)
                        tscmax = (unsigned int) delta;
                pitcnt++;
        }

        /*
         * Sanity checks:
         *
         * If we were not able to read the PIT more than loopmin
         * times, then we have been hit by a massive SMI
         *
         * If the maximum is 10 times larger than the minimum,
         * then we got hit by an SMI as well.
         */
        if (pitcnt < loopmin || tscmax > 10 * tscmin)
                return ULONG_MAX;

        /* Cycles elapsed over the whole countdown, per millisecond (kHz). */
        delta = t2 - t1;
        do_div(delta, ms);
        return delta;
}

/*
 * This reads the current MSB of the PIT counter, and
 * checks if we are running on sufficiently fast and
 * non-virtualized hardware.
 *
 * Our expectations are:
 *
 *  - the PIT is running at roughly 1.19MHz
 *
 *  - each IO is going to take about 1us on real hardware,
 *    but we allow it to be much faster (by a factor of 10) or
 *    _slightly_ slower (i.e. we allow up to a 2us read+counter
 *    update - anything else implies an unacceptably slow CPU
 *    or PIT for the fast calibration to work).
 *
 *  - with 256 PIT ticks to read the value, we have 214us to
 *    see the same MSB (and overhead like doing a single TSC
 *    read per MSB value etc).
 *
 *  - We're doing 2 reads per loop (LSB, MSB), and we expect
 *    them each to take about a microsecond on real hardware.
 *    So we expect a count value of around 100. But we'll be
 *    generous, and accept anything over 50.
 *
 *  - if the PIT is stuck, and we see *many* more reads, we
 *    return early (and the next caller of pit_expect_msb()
 *    then consider it a failure when they don't see the
 *    next expected value).
 *
 * These expectations mean that we know that we have seen the
 * transition from one expected value to another with a fairly
 * high accuracy, and we didn't miss any events. We can thus
 * use the TSC value at the transitions to calculate a pretty
 * good value for the TSC frequency.
 */
/*
 * Read the 16-bit PIT channel 2 counter (two reads of port 0x42: LSB,
 * then MSB) and return non-zero if the MSB equals @val.
 */
static inline int pit_verify_msb(unsigned char val)
{
        /* Ignore LSB */
        inb(0x42);
        return inb(0x42) == val;
}

/*
 * Spin while the PIT counter MSB equals @val, taking a TSC timestamp
 * after every successful check (at most 50000 checks).
 *
 * @val:    MSB value expected to be seen
 * @tscp:   receives the TSC value taken after the last successful check
 *          (0 if there was none)
 * @deltap: receives the distance from the second-to-last of those
 *          timestamps to a TSC read taken after the loop ended, i.e. an
 *          upper bound on the uncertainty of *tscp
 *
 * Returns non-zero if the MSB matched more than 5 times.
 */
static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
{
        int count;
        u64 tsc = 0, prev_tsc = 0;

        for (count = 0; count < 50000; count++) {
                if (!pit_verify_msb(val))
                        break;
                /* Keep the two most recent timestamps. */
                prev_tsc = tsc;
                tsc = get_cycles();
        }
        *deltap = get_cycles() - prev_tsc;
        *tscp = tsc;

        /*
         * We require _some_ success, but the quality control
         * will be based on the error terms on the TSC values.
         */
        return count > 5;
}

/*
 * How many MSB values do we want to see? We aim for
 * a maximum error rate of 500ppm (in practice the
 * real error is much smaller), but refuse to spend
 * more than 50ms on it.
 */
#define MAX_QUICK_PIT_MS 50
#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)

/*
 * Fast TSC calibration against PIT channel 2.
 *
 * Starts the PIT counting down from 0xffff and timestamps the moments
 * at which the counter's MSB is observed at successive values. Once
 * the combined timestamp uncertainty (d1 + d2) is below 1/2048 of the
 * measured TSC delta, the frequency is computed from that delta and the
 * number of MSB steps seen.
 *
 * Returns the TSC frequency in kHz, or 0 if there is no legacy PIC or
 * the calibration did not succeed.
 */
static unsigned long quick_pit_calibrate(void)
{
        int i;
        u64 tsc, delta;
        unsigned long d1, d2;

        if (!has_legacy_pic())
                return 0;

        /* Set the Gate high, disable speaker */
        outb((inb(0x61) & ~0x02) | 0x01, 0x61);

        /*
         * Counter 2, mode 0 (one-shot), binary count
         *
         * NOTE! Mode 2 decrements by two (and then the
         * output is flipped each time, giving the same
         * final output frequency as a decrement-by-one),
         * so mode 0 is much better when looking at the
         * individual counts.
         */
        outb(0xb0, 0x43);

        /* Start at 0xffff */
        outb(0xff, 0x42);
        outb(0xff, 0x42);

        /*
         * The PIT starts counting at the next edge, so we
         * need to delay for a microsecond. The easiest way
         * to do that is to just read back the 16-bit counter
         * once from the PIT.
         *
         * The result of this read is deliberately ignored.
         */
        pit_verify_msb(0);

        /* 'tsc'/'d1' describe the start point, taken while the MSB is 0xff. */
        if (pit_expect_msb(0xff, &tsc, &d1)) {
                for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
                        /* 'delta' first receives the raw TSC for MSB 0xff-i. */
                        if (!pit_expect_msb(0xff-i, &delta, &d2))
                                break;

                        /* Turn it into the TSC distance from the start point. */
                        delta -= tsc;

                        /*
                         * Extrapolate the error and fail fast if the error will
                         * never be below 500 ppm.
                         */
                        if (i == 1 &&
                            d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
                                return 0;

                        /*
                         * Iterate until the error is less than 500 ppm
                         * (delta >> 11 is 1/2048 of the measured distance).
                         */
                        if (d1+d2 >= delta >> 11)
                                continue;

                        /*
                         * Check the PIT one more time to verify that
                         * all TSC reads were stable wrt the PIT.
                         *
                         * This also guarantees serialization of the
                         * last cycle read ('d2') in pit_expect_msb.
                         *
                         * The MSB is expected to have moved on to the
                         * value following 0xff-i by now.
                         */
                        if (!pit_verify_msb(0xfe - i))
                                break;
                        goto success;
                }
        }
        pr_info("Fast TSC calibration failed\n");
        return 0;

success:
        /*
         * Ok, if we get here, then we've seen the
         * MSB of the PIT decrement 'i' times, and the
         * error has shrunk to less than 500 ppm.
         *
         * As a result, we can depend on there not being
         * any odd delays anywhere, and the TSC reads are
         * reliable (within the error).
         *
         * kHz = ticks / time-in-seconds / 1000;
         * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
         * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
         */
        delta *= PIT_TICK_RATE;
        do_div(delta, i*256*1000);
        pr_info("Fast TSC calibration using PIT\n");
        return delta;
}

/**
 * native_calibrate_tsc - determine TSC frequency
 *
 * Determine TSC frequency via CPUID, else return 0.
 *
 * Only attempted on Intel CPUs that implement CPUID leaf 0x15. The
 * crystal clock is taken from leaf 0x15 when reported, hardcoded for
 * Denverton, or otherwise derived from the leaf 0x16 base frequency;
 * the result is the crystal clock scaled by the leaf 0x15
 * numerator/denominator ratio. As side effects the function may force
 * X86_FEATURE_TSC_KNOWN_FREQ and X86_FEATURE_TSC_RELIABLE and, with
 * CONFIG_X86_LOCAL_APIC, sets lapic_timer_period.
 */
unsigned long native_calibrate_tsc(void)
{
        unsigned int denominator = 0, numerator = 0, crystal_hz = 0, unused = 0;
        unsigned int crystal_khz;

        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
            boot_cpu_data.cpuid_level < 0x15)
                return 0;

        /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
        cpuid(0x15, &denominator, &numerator, &crystal_hz, &unused);

        if (!numerator || !denominator)
                return 0;

        crystal_khz = crystal_hz / 1000;

        /*
         * Denverton SoCs don't report crystal clock, and also don't support
         * CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal
         * clock.
         */
        if (!crystal_khz && boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D)
                crystal_khz = 25000;

        if (crystal_khz) {
                /*
                 * TSC frequency reported directly by CPUID is a "hardware
                 * reported" frequency and is the most accurate one so far
                 * we have. This is considered a known frequency.
                 */
                setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
        } else if (boot_cpu_data.cpuid_level >= 0x16) {
                /*
                 * Some Intel SoCs like Skylake and Kabylake don't report
                 * the crystal clock, but we can easily calculate it to a
                 * high degree of accuracy by considering the crystal ratio
                 * and the CPU speed.
                 */
                unsigned int base_mhz, ebx, ecx, edx;

                cpuid(0x16, &base_mhz, &ebx, &ecx, &edx);
                crystal_khz = base_mhz * 1000 * denominator / numerator;
        }

        if (!crystal_khz)
                return 0;

        /*
         * For Atom SoCs TSC is the only reliable clocksource.
         * Mark TSC reliable so no watchdog on it.
         */
        if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT)
                setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);

#ifdef CONFIG_X86_LOCAL_APIC
        /*
         * The local APIC appears to be fed by the core crystal clock
         * (which sounds entirely sensible). We can set the global
         * lapic_timer_period here to avoid having to calibrate the APIC
         * timer later.
         */
        lapic_timer_period = crystal_khz * 1000 / HZ;
#endif

        return crystal_khz * numerator / denominator;
}

/*
 * Return the base frequency reported in EAX of CPUID leaf 0x16,
 * multiplied by 1000, or 0 if the CPU is not an Intel CPU or does not
 * implement that leaf.
 */
static unsigned long cpu_khz_from_cpuid(void)
{
        unsigned int base_mhz = 0, max_mhz = 0, bus_mhz = 0, unused = 0;

        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
            boot_cpu_data.cpuid_level < 0x16)
                return 0;

        cpuid(0x16, &base_mhz, &max_mhz, &bus_mhz, &unused);

        return base_mhz * 1000;
}

/*
 * Calibrate the CPU frequency using the PIT, cross-checked against the HPET
 * or the PMTIMER when one is available. These methods are available
 * later in boot after ACPI is initialized.
 *
 * Returns the selected calibration value (presumably in kHz, going by the
 * tsc_pit_khz naming), or 0 when neither the PIT nor a reference counter
 * produced a usable result.
 */
static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
{
        u64 tsc1, tsc2, delta, ref1, ref2;
        unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
        unsigned long flags, latch, ms;
        int hpet = is_hpet_enabled(), i, loopmin;

        /*
         * Run up to 3 calibration loops (see the for loop below) to get
         * the lowest frequency value (the best estimate). We use two
         * different calibration modes here:
         *
         * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
         * load a timeout of 50ms. We read the time right after we
         * started the timer and wait until the PIT count down reaches
         * zero. In each wait loop iteration we read the TSC and check
         * the delta to the previous read. We keep track of the min
         * and max values of that delta. The delta is mostly defined
         * by the IO time of the PIT access, so we can detect when
         * any disturbance happened between the two reads. If the
         * maximum time is significantly larger than the minimum time,
         * then we discard the result and have another try.
         *
         * 2) Reference counter. If available we use the HPET or the
         * PMTIMER as a reference to check the sanity of that value.
         * We use separate TSC readouts and check inside of the
         * reference read for any possible disturbance. We discard
         * disturbed values here as well. We do that around the PIT
         * calibration delay loop as we have to wait for a certain
         * amount of time anyway.
         */

        /* Preset PIT loop values */
        latch = CAL_LATCH;
        ms = CAL_MS;
        loopmin = CAL_PIT_LOOPS;

        for (i = 0; i < 3; i++) {
                unsigned long tsc_pit_khz;

                /*
                 * Read the start value and the reference count of
                 * hpet/pmtimer when available. Then do the PIT
                 * calibration, which will take at least 50ms, and
                 * read the end value.
                 */
                local_irq_save(flags);
                tsc1 = tsc_read_refs(&ref1, hpet);
                tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
                tsc2 = tsc_read_refs(&ref2, hpet);
                local_irq_restore(flags);

                /* Pick the lowest PIT TSC calibration so far */
                tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);

                /* hpet or pmtimer available ? */
                if (ref1 == ref2)
                        continue;

                /* Check, whether the sampling was disturbed */
                if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
                        continue;

                /* From here on tsc2 holds the scaled TSC delta, then the reference result */
                tsc2 = (tsc2 - tsc1) * 1000000LL;
                if (hpet)
                        tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
                else
                        tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);

                tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);

                /* Check the reference deviation: PIT result as a percentage of the reference */
                delta = ((u64) tsc_pit_min) * 100;
                do_div(delta, tsc_ref_min);

                /*
                 * If both calibration results are inside a 10% window
                 * then we can be sure, that the calibration
                 * succeeded. We break out of the loop right away. We
                 * use the reference value, as it is more precise.
                 */
                if (delta >= 90 && delta <= 110) {
                        pr_info("PIT calibration matches %s. %d loops\n",
                                hpet ? "HPET" : "PMTIMER", i + 1);
                        return tsc_ref_min;
                }

                /*
                 * Check whether PIT failed more than once. This
                 * happens in virtualized environments. We need to
                 * give the virtual PC a slightly longer timeframe for
                 * the HPET/PMTIMER to make the result precise.
                 */
                if (i == 1 && tsc_pit_min == ULONG_MAX) {
                        latch = CAL2_LATCH;
                        ms = CAL2_MS;
                        loopmin = CAL2_PIT_LOOPS;
                }
        }

        /*
         * Now check the results.
         */
        if (tsc_pit_min == ULONG_MAX) {
                /* PIT gave no useful value */
                pr_warn("Unable to calibrate against PIT\n");

                /* We don't have an alternative source, disable TSC */
                if (!hpet && !ref1 && !ref2) {
                        pr_notice("No reference (HPET/PMTIMER) available\n");
                        return 0;
                }

                /* The alternative source failed as well, disable TSC */
                if (tsc_ref_min == ULONG_MAX) {
                        pr_warn("HPET/PMTIMER calibration failed\n");
                        return 0;
                }

                /* Use the alternative source */
                pr_info("using %s reference calibration\n",
                        hpet ? "HPET" : "PMTIMER");

                return tsc_ref_min;
        }

        /* We don't have an alternative source, use the PIT calibration value */
        if (!hpet && !ref1 && !ref2) {
                pr_info("Using PIT calibration value\n");
                return tsc_pit_min;
        }

        /* The alternative source failed, use the PIT calibration value */
        if (tsc_ref_min == ULONG_MAX) {
                pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");
                return tsc_pit_min;
        }

        /*
         * The calibration values differ too much. In doubt, we use
         * the PIT value as we know that there are PMTIMERs around
         * running at double speed. At least we let the user know:
         */
        pr_warn("PIT calibration deviates from %s: %lu %lu\n",
                hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
        pr_info("Using PIT calibration value\n");
        return tsc_pit_min;
}

/**
 * native_calibrate_cpu_early - calibrate the cpu with early-boot methods
 *
 * Tries, in order, cpu_khz_from_cpuid(), cpu_khz_from_msr() and finally
 * quick_pit_calibrate() (run with local interrupts disabled). The first
 * non-zero result wins.
 *
 * Return: the first non-zero result, or 0 if every method failed.
 */
unsigned long native_calibrate_cpu_early(void)
{
        unsigned long irq_flags, khz;

        khz = cpu_khz_from_cpuid();
        if (khz)
                return khz;

        khz = cpu_khz_from_msr();
        if (khz)
                return khz;

        local_irq_save(irq_flags);
        khz = quick_pit_calibrate();
        local_irq_restore(irq_flags);

        return khz;
}


/**
 * native_calibrate_cpu - calibrate the cpu
 *
 * Uses the early methods first and only falls back to the
 * PIT/HPET/PMTIMER calibration when they yield nothing.
 *
 * Return: the calibrated value, or 0 if all methods failed.
 */
static unsigned long native_calibrate_cpu(void)
{
        unsigned long early_khz = native_calibrate_cpu_early();

        return early_khz ? early_khz : pit_hpet_ptimer_calibrate_cpu();
}

void recalibrate_cpu_khz(void)
{
#ifndef CONFIG_SMP
        unsigned long cpu_khz_old = cpu_khz;

        if (!boot_cpu_has(X86_FEATURE_TSC))
                return;

        cpu_khz = x86_platform.calibrate_cpu();
        tsc_khz = x86_platform.calibrate_tsc();
        if (tsc_khz == 0)
                tsc_khz = cpu_khz;
        else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
                cpu_khz = tsc_khz;
        cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy,
                                                    cpu_khz_old, cpu_khz);
#endif
}
EXPORT_SYMBOL_GPL(recalibrate_cpu_khz);


/* sched_clock() value recorded by tsc_save_sched_clock_state() */
static unsigned long long cyc2ns_suspend;

/*
 * Remember the current sched_clock() reading so that
 * tsc_restore_sched_clock_state() can continue from it. Nothing is saved
 * while sched_clock is not stable.
 */
void tsc_save_sched_clock_state(void)
{
        if (sched_clock_stable())
                cyc2ns_suspend = sched_clock();
}

/*
 * Even on processors with an invariant TSC, the TSC gets reset in some of
 * the ACPI system sleep states, and on some systems the BIOS seems to
 * reinitialize the TSC to an arbitrary value (still synchronized across
 * CPUs) during resume from such states. To cope with this, recompute the
 * cyc2ns_offset for every CPU so that sched_clock() continues from the
 * point where it left off at suspend.
 */
void tsc_restore_sched_clock_state(void)
{
        unsigned long long resume_offset;
        unsigned long irq_flags;
        int cpu;

        if (!sched_clock_stable())
                return;

        local_irq_save(irq_flags);

        /*
         * Coming out of suspend there is no concurrency yet, so skip the
         * RCU niceties and simply write both data slots directly.
         */
        this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
        this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);

        /*
         * Sampled while this CPU's offsets are zero: the distance to the
         * value saved at suspend becomes the new offset.
         */
        resume_offset = cyc2ns_suspend - sched_clock();

        for_each_possible_cpu(cpu) {
                per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = resume_offset;
                per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = resume_offset;
        }

        local_irq_restore(irq_flags);
}

#ifdef CONFIG_CPU_FREQ
/*
 * Frequency scaling support. Adjust the TSC based timer when the CPU frequency
 * changes.
 *
 * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC
 * as unstable and give up in those cases.
 *
 * Should fix up last_tsc too. Currently gettimeofday in the
 * first tick after the change will be slightly wrong.
 */

static unsigned int  ref_freq;
static unsigned long loops_per_jiffy_ref;
static unsigned long tsc_khz_ref;

static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
                                void *data)
{
        struct cpufreq_freqs *freq = data;

        if (num_online_cpus() > 1) {
                mark_tsc_unstable("cpufreq changes on SMP");
                return 0;
        }

        if (!ref_freq) {
                ref_freq = freq->old;
                loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
                tsc_khz_ref = tsc_khz;
        }

        if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
            (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
                boot_cpu_data.loops_per_jiffy =
                        cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);

                tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
                if (!(freq->flags & CPUFREQ_CONST_LOOPS))
                        mark_tsc_unstable("cpufreq changes");

                set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc());
        }

        return 0;
}

/* Notifier block that routes cpufreq events to time_cpufreq_notifier() */
static struct notifier_block time_cpufreq_notifier_block = {
        .notifier_call  = time_cpufreq_notifier
};

/*
 * Register the cpufreq transition notifier, unless there is no TSC at all
 * or the TSC is flagged as constant (X86_FEATURE_CONSTANT_TSC).
 * Always returns 0.
 */
static int __init cpufreq_register_tsc_scaling(void)
{
        if (!boot_cpu_has(X86_FEATURE_TSC) ||
            boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                return 0;

        cpufreq_register_notifier(&time_cpufreq_notifier_block,
                                  CPUFREQ_TRANSITION_NOTIFIER);
        return 0;
}

core_initcall(cpufreq_register_tsc_scaling);

#endif /* CONFIG_CPU_FREQ */

/* CPUID leaf that detect_art() queries for the ART-to-TSC ratio */
#define ART_CPUID_LEAF (0x15)
/* Smallest denominator detect_art() accepts from that leaf */
#define ART_MIN_DENOMINATOR (1)


/*
 * If ART is present, detect the numerator:denominator used to convert it
 * to TSC, record the TSC_ADJUST offset and force X86_FEATURE_ART on.
 */
static void __init detect_art(void)
{
        unsigned int ecx_unused, edx_unused;

        if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
                return;

        /*
         * Don't enable ART in a VM; a non-stop TSC and TSC_ADJUST are
         * required, and TSC counter resets must not occur asynchronously.
         */
        if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
                return;
        if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
                return;
        if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
                return;
        if (tsc_async_resets)
                return;

        /* EAX carries the denominator, EBX the numerator; ECX/EDX are ignored */
        cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
              &art_to_tsc_numerator, &ecx_unused, &edx_unused);

        if (art_to_tsc_denominator < ART_MIN_DENOMINATOR)
                return;

        rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset);

        /* Make this sticky over multiple CPU init calls */
        setup_force_cpu_cap(X86_FEATURE_ART);
}


/* clocksource code */

/*
 * Clocksource ->resume() callback shared by both TSC clocksources:
 * calls tsc_verify_tsc_adjust(true). @cs is unused.
 */
static void tsc_resume(struct clocksource *cs)
{
        tsc_verify_tsc_adjust(true);
}

/*
 * We used to compare the TSC to the cycle_last value in the clocksource
 * structure to avoid a nasty time-warp. This can be observed in a
 * very small window right after one CPU updated cycle_last under
 * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
 * is smaller than the cycle_last reference value due to a TSC which
 * is slightly behind. This delta is nowhere else observable, but in
 * that case it results in a forward time jump in the range of hours
 * due to the unsigned delta calculation of the time keeping core
 * code, which is necessary to support wrapping clocksources like pm
 * timer.
 *
 * This sanity check is now done in the core timekeeping code by
 * checking the result of read_tsc() - cycle_last for being negative.
 * That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
 *
 * Clocksource ->read() callback: returns rdtsc_ordered(). @cs is unused.
 */
static u64 read_tsc(struct clocksource *cs)
{
        return (u64)rdtsc_ordered();
}

/*
 * Clocksource ->mark_unstable() callback. Does the same bookkeeping as
 * mark_tsc_unstable() but does not call clocksource_mark_unstable().
 * Does nothing if the TSC is already marked unstable. @cs is unused.
 */
static void tsc_cs_mark_unstable(struct clocksource *cs)
{
        if (tsc_unstable)
                return;

        tsc_unstable = 1;
        /* Only touch sched_clock stability when the native sched_clock is in use */
        if (using_native_sched_clock())
                clear_sched_clock_stable();
        disable_sched_clock_irqtime();
        pr_info("Marking TSC unstable due to clocksource watchdog\n");
}

/*
 * Clocksource ->tick_stable() callback: forwards to
 * sched_clock_tick_stable() as long as the TSC has not been marked
 * unstable and the native sched_clock is in use. @cs is unused.
 */
static void tsc_cs_tick_stable(struct clocksource *cs)
{
        if (!tsc_unstable && using_native_sched_clock())
                sched_clock_tick_stable();
}

/*
 * Clocksource ->enable() callback: records VDSO_CLOCKMODE_TSC as used via
 * vclocks_set_used(). Always returns 0. @cs is unused.
 */
static int tsc_cs_enable(struct clocksource *cs)
{
        vclocks_set_used(VDSO_CLOCKMODE_TSC);
        return 0;
}

/*
 * Early TSC clocksource, registered by tsc_init() and unregistered once
 * clocksource_tsc takes over or the TSC turns out to be unstable.
 *
 * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
 */
static struct clocksource clocksource_tsc_early = {
        .name                        = "tsc-early",
        /* One below clocksource_tsc (300) */
        .rating                        = 299,
        .uncertainty_margin        = 32 * NSEC_PER_MSEC,
        .read                        = read_tsc,
        .mask                        = CLOCKSOURCE_MASK(64),
        /* Unlike clocksource_tsc, not flagged VALID_FOR_HRES or VERIFY_PERCPU */
        .flags                        = CLOCK_SOURCE_IS_CONTINUOUS |
                                  CLOCK_SOURCE_MUST_VERIFY,
        .id                        = CSID_X86_TSC_EARLY,
        .vdso_clock_mode        = VDSO_CLOCKMODE_TSC,
        .enable                        = tsc_cs_enable,
        .resume                        = tsc_resume,
        .mark_unstable                = tsc_cs_mark_unstable,
        .tick_stable                = tsc_cs_tick_stable,
        .list                        = LIST_HEAD_INIT(clocksource_tsc_early.list),
};

/*
 * The final TSC clocksource.
 *
 * Must mark VALID_FOR_HRES early such that when we unregister tsc_early
 * this one will immediately take over. We will only register if TSC has
 * been found good.
 *
 * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
 */
static struct clocksource clocksource_tsc = {
        .name                        = "tsc",
        /* One above clocksource_tsc_early (299) */
        .rating                        = 300,
        .read                        = read_tsc,
        .mask                        = CLOCKSOURCE_MASK(64),
        /* MUST_VERIFY is cleared by tsc_disable_clocksource_watchdog() */
        .flags                        = CLOCK_SOURCE_IS_CONTINUOUS |
                                  CLOCK_SOURCE_VALID_FOR_HRES |
                                  CLOCK_SOURCE_MUST_VERIFY |
                                  CLOCK_SOURCE_VERIFY_PERCPU,
        .id                        = CSID_X86_TSC,
        .vdso_clock_mode        = VDSO_CLOCKMODE_TSC,
        .enable                        = tsc_cs_enable,
        .resume                        = tsc_resume,
        .mark_unstable                = tsc_cs_mark_unstable,
        .tick_stable                = tsc_cs_tick_stable,
        .list                        = LIST_HEAD_INIT(clocksource_tsc.list),
};

/*
 * Mark the TSC as unstable and propagate that to both TSC clocksources.
 *
 * @reason: human readable cause, included in the log message
 *
 * Only the first call has an effect; later calls return immediately.
 */
void mark_tsc_unstable(char *reason)
{
        if (tsc_unstable)
                return;

        tsc_unstable = 1;
        /* Only touch sched_clock stability when the native sched_clock is in use */
        if (using_native_sched_clock())
                clear_sched_clock_stable();
        disable_sched_clock_irqtime();
        pr_info("Marking TSC unstable due to %s\n", reason);

        clocksource_mark_unstable(&clocksource_tsc_early);
        clocksource_mark_unstable(&clocksource_tsc);
}

EXPORT_SYMBOL_GPL(mark_tsc_unstable);

/*
 * Exempt both TSC clocksources from watchdog verification by clearing
 * CLOCK_SOURCE_MUST_VERIFY in their flags.
 */
static void __init tsc_disable_clocksource_watchdog(void)
{
        clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
        clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
}

bool tsc_clocksource_watchdog_disabled(void)
{
        return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) &&
               tsc_as_watchdog && !no_tsc_watchdog;
}

/*
 * Decide whether the TSC can be trusted as a clocksource: sets
 * tsc_clocksource_reliable where applicable and disables the clocksource
 * watchdog for the TSC on systems meeting the criteria listed below.
 */
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
        if (is_geode_lx()) {
                /* RTSC counts during suspend */
#define RTSC_SUSP 0x100
                unsigned long res_low, res_high;

                rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
                /* Geode_LX - the OLPC CPU has a very reliable TSC */
                if (res_low & RTSC_SUSP)
                        tsc_clocksource_reliable = 1;
        }
#endif
        if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
                tsc_clocksource_reliable = 1;

        /*
         * Disable the clocksource watchdog when the system has:
         *  - TSC running at constant frequency
         *  - TSC which does not stop in C-States
         *  - the TSC_ADJUST register which allows to detect even minimal
         *    modifications
         *  - not more than four sockets. As the number of sockets cannot be
         *    evaluated at the early boot stage where this has to be
         *    invoked, check the number of online memory nodes as a
         *    fallback solution which is a reasonable estimate.
         */
        if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
            boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
            boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
            nr_online_nodes <= 4)
                tsc_disable_clocksource_watchdog();
}

/*
 * Educated guess whether the TSC can be trusted to be synchronized across
 * all CPUs.
 *
 * Returns 1 when the TSC should be treated as unsynchronized, 0 otherwise.
 */
int unsynchronized_tsc(void)
{
        /* No TSC at all, or it was already marked unstable */
        if (tsc_unstable || !boot_cpu_has(X86_FEATURE_TSC))
                return 1;

#ifdef CONFIG_SMP
        if (apic_is_clustered_box())
                return 1;
#endif

        if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) || tsc_clocksource_reliable)
                return 0;

        /*
         * Intel systems are normally all synchronized; exceptions must
         * mark the TSC as unstable. For any other vendor, assume that a
         * system with more than one possible CPU is not synchronized.
         */
        return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
               num_possible_cpus() > 1;
}

/*
 * Convert an ART value to TSC using the numerator/denominator and offset
 * found in detect_art():
 *
 *   tsc = art * numerator / denominator + offset
 *
 * The quotient and the remainder of art / denominator are scaled
 * separately. The returned cs_id is CSID_X86_TSC only when have_art is
 * set, CSID_GENERIC otherwise.
 */
struct system_counterval_t convert_art_to_tsc(u64 art)
{
        u64 frac, cycles, remainder;

        /* do_div() leaves the quotient in @art and returns the remainder */
        remainder = do_div(art, art_to_tsc_denominator);

        frac = remainder * art_to_tsc_numerator;
        do_div(frac, art_to_tsc_denominator);

        cycles = art * art_to_tsc_numerator + frac + art_to_tsc_offset;

        return (struct system_counterval_t) {
                .cs_id        = have_art ? CSID_X86_TSC : CSID_GENERIC,
                .cycles        = cycles,
        };
}
EXPORT_SYMBOL(convert_art_to_tsc);

/**
 * convert_art_ns_to_tsc() - Convert ART in nanoseconds to TSC.
 * @art_ns: ART (Always Running Timer) in unit of nanoseconds
 *
 * PTM requires all timestamps to be in units of nanoseconds. When user
 * software requests a cross-timestamp, this function converts system timestamp
 * to TSC.
 *
 * This is valid when CPU feature flag X86_FEATURE_TSC_KNOWN_FREQ is set
 * indicating the tsc_khz is derived from CPUID[15H]. Drivers should check
 * that this flag is set before conversion to TSC is attempted.
 *
 * Return:
 * struct system_counterval_t - system counter value with the ID of the
 *        corresponding clocksource:
 *        cycles:                System counter value
 *        cs_id:                The clocksource ID for validating comparability
 */

struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns)
{
        u64 frac, cycles, remainder;

        /*
         * cycles = art_ns * tsc_khz / USEC_PER_SEC, computed on the
         * quotient and the remainder separately. do_div() leaves the
         * quotient in @art_ns and returns the remainder.
         */
        remainder = do_div(art_ns, USEC_PER_SEC);

        frac = remainder * tsc_khz;
        do_div(frac, USEC_PER_SEC);

        cycles = art_ns * tsc_khz + frac;

        return (struct system_counterval_t) {
                .cs_id        = have_art ? CSID_X86_TSC : CSID_GENERIC,
                .cycles        = cycles,
        };
}
EXPORT_SYMBOL(convert_art_ns_to_tsc);


static void tsc_refine_calibration_work(struct work_struct *work);
static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
/**
 * tsc_refine_calibration_work - Further refine tsc freq calibration
 * @work: ignored.
 *
 * This function uses delayed work over a period of a
 * second to further refine the TSC freq value. Since this is
 * timer based, instead of loop based, we don't block the boot
 * process while this longer calibration is done.
 *
 * If there are any calibration anomalies (too many SMIs, etc),
 * or the refined calibration is off by 1% of the fast early
 * calibration, we throw out the new calibration and use the
 * early calibration.
 */
static void tsc_refine_calibration_work(struct work_struct *work)
{
        /* Start sample and reference type persist across invocations */
        static u64 tsc_start = ULLONG_MAX, ref_start;
        static int hpet;
        u64 tsc_stop, ref_stop, delta;
        unsigned long freq;
        int cpu;

        /* Don't bother refining TSC on unstable systems */
        if (tsc_unstable)
                goto unreg;

        /*
         * Since the work is started early in boot, we may be
         * delayed the first time we expire. So set the workqueue
         * again once we know timers are working.
         */
        if (tsc_start == ULLONG_MAX) {
restart:
                /*
                 * Only set hpet once, to avoid mixing hardware
                 * if the hpet becomes enabled later.
                 */
                hpet = is_hpet_enabled();
                tsc_start = tsc_read_refs(&ref_start, hpet);
                /* Take the stop sample HZ jiffies (one second) from now */
                schedule_delayed_work(&tsc_irqwork, HZ);
                return;
        }

        tsc_stop = tsc_read_refs(&ref_stop, hpet);

        /* hpet or pmtimer available ? */
        if (ref_start == ref_stop)
                goto out;

        /* Check, whether the sampling was disturbed */
        if (tsc_stop == ULLONG_MAX)
                goto restart;

        delta = tsc_stop - tsc_start;
        delta *= 1000000LL;
        if (hpet)
                freq = calc_hpet_ref(delta, ref_start, ref_stop);
        else
                freq = calc_pmtimer_ref(delta, ref_start, ref_stop);

        /* Will hit this only if tsc_force_recalibrate has been set */
        if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {

                /* Warn if the deviation exceeds 500 ppm */
                if (abs(tsc_khz - freq) > (tsc_khz >> 11)) {
                        pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n");
                        pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n",
                                (unsigned long)tsc_khz / 1000,
                                (unsigned long)tsc_khz % 1000);
                }

                pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n",
                        hpet ? "HPET" : "PM_TIMER",
                        (unsigned long)freq / 1000,
                        (unsigned long)freq % 1000);

                /* Report only: tsc_khz is left untouched on this path */
                return;
        }

        /* Make sure we're within 1% */
        if (abs(tsc_khz - freq) > tsc_khz/100)
                goto out;

        tsc_khz = freq;
        pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
                (unsigned long)tsc_khz / 1000,
                (unsigned long)tsc_khz % 1000);

        /* Inform the TSC deadline clockevent devices about the recalibration */
        lapic_update_tsc_freq();

        /* Update the sched_clock() rate to match the clocksource one */
        for_each_possible_cpu(cpu)
                set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);

out:
        if (tsc_unstable)
                goto unreg;

        if (boot_cpu_has(X86_FEATURE_ART))
                have_art = true;
        clocksource_register_khz(&clocksource_tsc, tsc_khz);
unreg:
        /* The early clocksource is dropped whether or not "tsc" was registered */
        clocksource_unregister(&clocksource_tsc_early);
}


/*
 * Register the final "tsc" clocksource, either directly (known frequency)
 * or via the delayed refinement work. Does nothing without a TSC or
 * without a calibrated tsc_khz. Always returns 0.
 */
static int __init init_tsc_clocksource(void)
{
        if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz)
                return 0;

        /* An unstable TSC never gets the final clocksource; drop the early one */
        if (tsc_unstable) {
                clocksource_unregister(&clocksource_tsc_early);
                return 0;
        }

        if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
                clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;

        /*
         * When TSC frequency is known (retrieved via MSR or CPUID), we skip
         * the refined calibration and directly register it as a clocksource.
         */
        if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
                if (boot_cpu_has(X86_FEATURE_ART))
                        have_art = true;
                clocksource_register_khz(&clocksource_tsc, tsc_khz);
                clocksource_unregister(&clocksource_tsc_early);

                /* With tsc_force_recalibrate set, still run the refinement work */
                if (!tsc_force_recalibrate)
                        return 0;
        }

        schedule_delayed_work(&tsc_irqwork, 0);
        return 0;
}
/*
 * We use device_initcall here, to ensure we run after the hpet
 * is fully initialized, which may occur at fs_initcall time.
 */
device_initcall(init_tsc_clocksource);

/*
 * Establish cpu_khz and tsc_khz.
 *
 * @early: true when called from tsc_early_init(), where the platform
 *         calibrate_cpu()/calibrate_tsc() hooks (or tsc_early_khz, when
 *         set) are used; false when called from tsc_init(), where only
 *         cpu_khz is calibrated via the PIT/HPET/PMTIMER path.
 *
 * Returns true when a non-zero tsc_khz was determined, false otherwise.
 */
static bool __init determine_cpu_tsc_frequencies(bool early)
{
        /* Make sure that cpu and tsc are not already calibrated */
        WARN_ON(cpu_khz || tsc_khz);

        if (early) {
                cpu_khz = x86_platform.calibrate_cpu();
                if (tsc_early_khz)
                        tsc_khz = tsc_early_khz;
                else
                        tsc_khz = x86_platform.calibrate_tsc();
        } else {
                /* We should not be here with non-native cpu calibration */
                WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);
                cpu_khz = pit_hpet_ptimer_calibrate_cpu();
        }

        /*
         * Trust non-zero tsc_khz as authoritative,
         * and use it to sanity check cpu_khz,
         * which will be off if system timer is off.
         */
        if (tsc_khz == 0)
                tsc_khz = cpu_khz;
        else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
                cpu_khz = tsc_khz;

        if (tsc_khz == 0)
                return false;

        pr_info("Detected %lu.%03lu MHz processor\n",
                (unsigned long)cpu_khz / KHZ,
                (unsigned long)cpu_khz % KHZ);

        if (cpu_khz != tsc_khz) {
                /* Terminate the message so it is emitted as a complete line */
                pr_info("Detected %lu.%03lu MHz TSC\n",
                        (unsigned long)tsc_khz / KHZ,
                        (unsigned long)tsc_khz % KHZ);
        }
        return true;
}

/*
 * Compute tsc_khz * KHZ / HZ, i.e. the number of TSC cycles per jiffy,
 * using a 64-bit intermediate for the multiplication.
 */
static unsigned long __init get_loops_per_jiffy(void)
{
        u64 cycles_per_jiffy = (u64)tsc_khz * KHZ;

        /* do_div() stores the quotient back into its first argument */
        do_div(cycles_per_jiffy, HZ);

        return cycles_per_jiffy;
}

/*
 * Switch delay loops and sched_clock over to the TSC once tsc_khz is
 * known: set loops_per_jiffy, select the TSC delay routine, initialize
 * cyc2ns for the boot CPU and enable the __use_tsc static branch.
 */
static void __init tsc_enable_sched_clock(void)
{
        loops_per_jiffy = get_loops_per_jiffy();
        use_tsc_delay();

        /* Sanitize TSC ADJUST before cyc2ns gets initialized */
        tsc_store_and_check_tsc_adjust(true);
        cyc2ns_init_boot_cpu();
        static_branch_enable(&__use_tsc);
}

/*
 * Early TSC setup: determine the CPU/TSC frequencies with the early
 * methods and, on success, enable the TSC based sched_clock.
 */
void __init tsc_early_init(void)
{
        /*
         * Nothing to do without a TSC. On UV systems, don't change the
         * TSC multi-chassis synchronization.
         */
        if (!boot_cpu_has(X86_FEATURE_TSC) || is_early_uv_system())
                return;

        if (determine_cpu_tsc_frequencies(true))
                tsc_enable_sched_clock();
}

/*
 * Main TSC initialization: finish frequency calibration if the early
 * attempt failed, set up cyc2ns for the secondary CPUs, evaluate TSC
 * reliability and register the "tsc-early" clocksource.
 */
void __init tsc_init(void)
{
        /* Without a TSC there can be no TSC deadline timer either */
        if (!cpu_feature_enabled(X86_FEATURE_TSC)) {
                setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
                return;
        }

        /*
         * native_calibrate_cpu_early can only calibrate using methods that are
         * available early in boot.
         */
        if (x86_platform.calibrate_cpu == native_calibrate_cpu_early)
                x86_platform.calibrate_cpu = native_calibrate_cpu;

        if (!tsc_khz) {
                /* We failed to determine frequencies earlier, try again */
                if (!determine_cpu_tsc_frequencies(false)) {
                        mark_tsc_unstable("could not calculate TSC khz");
                        setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
                        return;
                }
                tsc_enable_sched_clock();
        }

        cyc2ns_init_secondary_cpus();

        if (!no_sched_irq_time)
                enable_sched_clock_irqtime();

        lpj_fine = get_loops_per_jiffy();

        check_system_tsc_reliable();

        if (unsynchronized_tsc()) {
                mark_tsc_unstable("TSCs unsynchronized");
                return;
        }

        if (tsc_clocksource_reliable || no_tsc_watchdog)
                tsc_disable_clocksource_watchdog();

        /* The final "tsc" clocksource is registered later by init_tsc_clocksource() */
        clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
        detect_art();
}

#ifdef CONFIG_SMP
/*
 * Check whether existing calibration data can be reused for the CPU this
 * runs on.
 *
 * Returns the loops_per_jiffy value to reuse, or 0 when none is available.
 */
unsigned long calibrate_delay_is_known(void)
{
        int cpu = smp_processor_id();
        int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC);
        const struct cpumask *mask = topology_core_cpumask(cpu);
        int sibling;

        /* Without a constant frequency TSC nothing can be reused */
        if (!constant_tsc)
                return 0;

        /*
         * Constant frequency and synchronized across sockets: reuse the
         * CPU0 calibration.
         */
        if (!tsc_unstable)
                return cpu_data(0).loops_per_jiffy;

        /*
         * Constant frequency but not synchronized across sockets: if this
         * is not the first CPU in the socket, reuse the calibration value
         * of an already online CPU on that socket.
         *
         * This assumes that CONSTANT_TSC is consistent for all CPUs in a
         * socket.
         */
        if (!mask)
                return 0;

        sibling = cpumask_any_but(mask, cpu);

        return sibling < nr_cpu_ids ? cpu_data(sibling).loops_per_jiffy : 0;
}
#endif




















































































































    1 














































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
/*
 * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/in.h>
#include <linux/ipv6.h>
#include <linux/poll.h>
#include <net/sock.h>

#include "rds.h"

/*
 * Global registry of RDS sockets.  rds_sock_lock guards rds_sock_list and
 * rds_sock_count, which the info callbacks in this file walk for stats
 * gathering; rds_ioctl() also takes it around rs_tos accesses.
 */
static DEFINE_SPINLOCK(rds_sock_lock);
static unsigned long rds_sock_count;
static LIST_HEAD(rds_sock_list);
/* Extra wait queue rds_poll() registers on for sockets that have seen congestion. */
DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);

/*
 * This is called as the final descriptor referencing this socket is closed.
 * We have to unbind the socket so that another socket can be bound to the
 * address it was using.
 *
 * We have to be careful about racing with the incoming path.  sock_orphan()
 * sets SOCK_DEAD and we use that as an indicator to the rx path that new
 * messages shouldn't be queued.
 */
static int rds_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct rds_sock *rs;

        /* No sock attached, so there is nothing to tear down. */
        if (!sk)
                return 0;

        rs = rds_sk_to_rs(sk);

        sock_orphan(sk);

        /*
         * Note - rds_clear_recv_queue() grabs rs_recv_lock, which ensures
         * the recv path has finished messing with the socket.
         */
        rds_clear_recv_queue(rs);
        rds_cong_remove_socket(rs);

        rds_remove_bound(rs);

        rds_send_drop_to(rs, NULL);
        rds_rdma_drop_keys(rs);
        rds_notify_queue_get(rs, NULL);
        rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue);

        /* Take the socket off the global list walked by the info callbacks. */
        spin_lock_bh(&rds_sock_lock);
        list_del_init(&rs->rs_item);
        rds_sock_count--;
        spin_unlock_bh(&rds_sock_lock);

        rds_trans_put(rs->rs_transport);

        /* Detach the sock from the socket and drop a reference on it. */
        sock->sk = NULL;
        sock_put(sk);

        return 0;
}

/*
 * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
 * _bh() isn't OK here, we're called from interrupt handlers.  It's probably OK
 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
 * this seems more conservative.
 * NB - normally, one would use sk_callback_lock for this, but we can
 * get here from interrupts, whereas the network code grabs sk_callback_lock
 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
 */
void rds_wake_sk_sleep(struct rds_sock *rs)
{
        struct sock *sk = rds_rs_to_sk(rs);
        unsigned long irq_state;

        /* Wake the sleepers while holding rs_recv_lock with interrupts off. */
        read_lock_irqsave(&rs->rs_recv_lock, irq_state);
        __rds_wake_sk_sleep(sk);
        read_unlock_irqrestore(&rs->rs_recv_lock, irq_state);
}

static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
                       int peer)
{
        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
        struct sockaddr_in6 *sin6;
        struct sockaddr_in *sin;
        int uaddr_len;

        /* racey, don't care */
        if (peer) {
                if (ipv6_addr_any(&rs->rs_conn_addr))
                        return -ENOTCONN;

                if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) {
                        sin = (struct sockaddr_in *)uaddr;
                        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
                        sin->sin_family = AF_INET;
                        sin->sin_port = rs->rs_conn_port;
                        sin->sin_addr.s_addr = rs->rs_conn_addr_v4;
                        uaddr_len = sizeof(*sin);
                } else {
                        sin6 = (struct sockaddr_in6 *)uaddr;
                        sin6->sin6_family = AF_INET6;
                        sin6->sin6_port = rs->rs_conn_port;
                        sin6->sin6_addr = rs->rs_conn_addr;
                        sin6->sin6_flowinfo = 0;
                        /* scope_id is the same as in the bound address. */
                        sin6->sin6_scope_id = rs->rs_bound_scope_id;
                        uaddr_len = sizeof(*sin6);
                }
        } else {
                /* If socket is not yet bound and the socket is connected,
                 * set the return address family to be the same as the
                 * connected address, but with 0 address value.  If it is not
                 * connected, set the family to be AF_UNSPEC (value 0) and
                 * the address size to be that of an IPv4 address.
                 */
                if (ipv6_addr_any(&rs->rs_bound_addr)) {
                        if (ipv6_addr_any(&rs->rs_conn_addr)) {
                                sin = (struct sockaddr_in *)uaddr;
                                memset(sin, 0, sizeof(*sin));
                                sin->sin_family = AF_UNSPEC;
                                return sizeof(*sin);
                        }

#if IS_ENABLED(CONFIG_IPV6)
                        if (!(ipv6_addr_type(&rs->rs_conn_addr) &
                              IPV6_ADDR_MAPPED)) {
                                sin6 = (struct sockaddr_in6 *)uaddr;
                                memset(sin6, 0, sizeof(*sin6));
                                sin6->sin6_family = AF_INET6;
                                return sizeof(*sin6);
                        }
#endif

                        sin = (struct sockaddr_in *)uaddr;
                        memset(sin, 0, sizeof(*sin));
                        sin->sin_family = AF_INET;
                        return sizeof(*sin);
                }
                if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
                        sin = (struct sockaddr_in *)uaddr;
                        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
                        sin->sin_family = AF_INET;
                        sin->sin_port = rs->rs_bound_port;
                        sin->sin_addr.s_addr = rs->rs_bound_addr_v4;
                        uaddr_len = sizeof(*sin);
                } else {
                        sin6 = (struct sockaddr_in6 *)uaddr;
                        sin6->sin6_family = AF_INET6;
                        sin6->sin6_port = rs->rs_bound_port;
                        sin6->sin6_addr = rs->rs_bound_addr;
                        sin6->sin6_flowinfo = 0;
                        sin6->sin6_scope_id = rs->rs_bound_scope_id;
                        uaddr_len = sizeof(*sin6);
                }
        }

        return uaddr_len;
}

/*
 * RDS' poll is without a doubt the least intuitive part of the interface,
 * as EPOLLIN and EPOLLOUT do not behave entirely as you would expect from
 * a network protocol.
 *
 * EPOLLIN is asserted if
 *  - there is data on the receive queue,
 *  - a previously congested destination may have become uncongested, or
 *  - a notification has been queued to the socket (this can be a congestion
 *    update, an RDMA completion, or a MSG_ZEROCOPY completion).
 *
 * EPOLLOUT is asserted if there is room on the send queue. This does not mean
 * however, that the next sendmsg() call will succeed. If the application tries
 * to send to a congested destination, the system call may still fail (and
 * return ENOBUFS).
 */
static __poll_t rds_poll(struct file *file, struct socket *sock,
                             poll_table *wait)
{
        struct sock *sk = sock->sk;
        struct rds_sock *rs = rds_sk_to_rs(sk);
        __poll_t events = 0;
        unsigned long irq_state;

        poll_wait(file, sk_sleep(sk), wait);

        /* Sockets that have seen congestion also register on the global queue. */
        if (rs->rs_seen_congestion)
                poll_wait(file, &rds_poll_waitq, wait);

        read_lock_irqsave(&rs->rs_recv_lock, irq_state);

        if (rs->rs_cong_monitor) {
                /* Monitoring sockets report pending congestion notifications. */
                spin_lock(&rs->rs_lock);
                if (rs->rs_cong_notify)
                        events |= (EPOLLIN | EPOLLRDNORM);
                spin_unlock(&rs->rs_lock);
        } else if (rds_cong_updated_since(&rs->rs_cong_track)) {
                /*
                 * When a congestion map was updated, we signal EPOLLIN for
                 * "historical" reasons. Applications can also poll for
                 * WRBAND instead.
                 */
                events |= (EPOLLIN | EPOLLRDNORM | EPOLLWRBAND);
        }

        /* Readable: received data, notifications or zerocopy cookies pending. */
        if (!list_empty(&rs->rs_recv_queue) ||
            !list_empty(&rs->rs_notify_queue) ||
            !list_empty(&rs->rs_zcookie_queue.zcookie_head))
                events |= (EPOLLIN | EPOLLRDNORM);

        /* Writable: queued send bytes are below the send buffer size. */
        if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
                events |= (EPOLLOUT | EPOLLWRNORM);

        if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
                events |= POLLERR;

        read_unlock_irqrestore(&rs->rs_recv_lock, irq_state);

        /* clear state any time we wake a seen-congested socket */
        if (events)
                rs->rs_seen_congestion = 0;

        return events;
}

static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
        rds_tos_t utos, tos = 0;

        switch (cmd) {
        case SIOCRDSSETTOS:
                if (get_user(utos, (rds_tos_t __user *)arg))
                        return -EFAULT;

                if (rs->rs_transport &&
                    rs->rs_transport->get_tos_map)
                        tos = rs->rs_transport->get_tos_map(utos);
                else
                        return -ENOIOCTLCMD;

                spin_lock_bh(&rds_sock_lock);
                if (rs->rs_tos || rs->rs_conn) {
                        spin_unlock_bh(&rds_sock_lock);
                        return -EINVAL;
                }
                rs->rs_tos = tos;
                spin_unlock_bh(&rds_sock_lock);
                break;
        case SIOCRDSGETTOS:
                spin_lock_bh(&rds_sock_lock);
                tos = rs->rs_tos;
                spin_unlock_bh(&rds_sock_lock);
                if (put_user(tos, (rds_tos_t __user *)arg))
                        return -EFAULT;
                break;
        default:
                return -ENOIOCTLCMD;
        }

        return 0;
}

/*
 * RDS_CANCEL_SENT_TO: read a destination address from @optval and pass it to
 * rds_send_drop_to().  A buffer shorter than sockaddr_in6 is read as a
 * sockaddr_in and converted to a v4-mapped IPv6 address.
 */
static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len)
{
        struct sockaddr_in6 sin6;
        struct sockaddr_in sin;

        /* racing with another thread binding seems ok here */
        if (ipv6_addr_any(&rs->rs_bound_addr))
                return -ENOTCONN; /* XXX not a great errno */

        if (len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (len >= sizeof(struct sockaddr_in6)) {
                if (copy_from_sockptr(&sin6, optval,
                                      sizeof(struct sockaddr_in6)))
                        return -EFAULT;
        } else {
                /* Assume IPv4 */
                if (copy_from_sockptr(&sin, optval,
                                      sizeof(struct sockaddr_in)))
                        return -EFAULT;
                ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
                sin6.sin6_port = sin.sin_port;
        }

        rds_send_drop_to(rs, &sin6);
        return 0;
}

/*
 * Read an int from @optval and store its truth value (0 or 1) in *@optvar.
 * Returns -EINVAL if @optlen is too small for an int and -EFAULT if the
 * copy fails.
 */
static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval,
                               int optlen)
{
        int raw;

        if (optlen < sizeof(int))
                return -EINVAL;

        if (copy_from_sockptr(&raw, optval, sizeof(int)))
                return -EFAULT;

        *optvar = raw ? 1 : 0;
        return 0;
}

/*
 * RDS_CONG_MONITOR: update rs_cong_monitor from user space, then either add
 * the socket to congestion monitoring or remove it and clear its congestion
 * mask and pending notification bits.
 */
static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen)
{
        int err = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);

        if (err)
                return err;

        if (!rs->rs_cong_monitor) {
                rds_cong_remove_socket(rs);
                rs->rs_cong_mask = 0;
                rs->rs_cong_notify = 0;
                return 0;
        }

        rds_cong_add_socket(rs);
        return 0;
}

/*
 * SO_RDS_TRANSPORT: attach the socket to the transport type named by the int
 * in @optval.  Refused with -EOPNOTSUPP once a transport is already set.
 */
static int rds_set_transport(struct rds_sock *rs, sockptr_t optval, int optlen)
{
        int type;

        /* previously attached to transport */
        if (rs->rs_transport)
                return -EOPNOTSUPP;

        if (optlen != sizeof(int))
                return -EINVAL;

        if (copy_from_sockptr(&type, optval, sizeof(type)))
                return -EFAULT;

        if (type < 0 || type >= RDS_TRANS_COUNT)
                return -EINVAL;

        rs->rs_transport = rds_trans_get(type);
        if (!rs->rs_transport)
                return -ENOPROTOOPT;

        return 0;
}

/*
 * SO_TIMESTAMP_OLD / SO_TIMESTAMP_NEW: set or clear SOCK_RCVTSTAMP according
 * to the int in @optval.  SO_TIMESTAMP_NEW additionally sets
 * SOCK_TSTAMP_NEW.  A wrong @optlen or a failed copy both yield -EFAULT.
 */
static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval,
                                 int optlen, int optname)
{
        int enable;

        if (optlen != sizeof(int))
                return -EFAULT;

        if (copy_from_sockptr(&enable, optval, sizeof(int)))
                return -EFAULT;

        if (optname == SO_TIMESTAMP_NEW)
                sock_set_flag(sk, SOCK_TSTAMP_NEW);

        if (!enable)
                sock_reset_flag(sk, SOCK_RCVTSTAMP);
        else
                sock_set_flag(sk, SOCK_RCVTSTAMP);

        return 0;
}

/*
 * SO_RDS_MSG_RXPATH_LATENCY: copy the requested trace count and positions
 * into the socket.  Every validation failure is reported as -EFAULT; an
 * out-of-range position also resets rs_rx_traces to 0.
 */
static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval,
                                  int optlen)
{
        struct rds_rx_trace_so req;
        int idx;

        if (optlen != sizeof(struct rds_rx_trace_so))
                return -EFAULT;

        if (copy_from_sockptr(&req, optval, sizeof(req)))
                return -EFAULT;

        if (req.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
                return -EFAULT;

        rs->rs_rx_traces = req.rx_traces;
        for (idx = 0; idx < rs->rs_rx_traces; idx++) {
                if (req.rx_trace_pos[idx] >= RDS_MSG_RX_DGRAM_TRACE_MAX) {
                        /* Bad position: turn tracing off again. */
                        rs->rs_rx_traces = 0;
                        return -EFAULT;
                }
                rs->rs_rx_trace[idx] = req.rx_trace_pos[idx];
        }

        return 0;
}

/*
 * setsockopt() entry point.  Only SOL_RDS options are accepted; each one is
 * dispatched to its handler.  Unknown levels and options yield -ENOPROTOOPT.
 */
static int rds_setsockopt(struct socket *sock, int level, int optname,
                          sockptr_t optval, unsigned int optlen)
{
        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
        int ret;

        if (level != SOL_RDS)
                return -ENOPROTOOPT;

        switch (optname) {
        case RDS_CANCEL_SENT_TO:
                return rds_cancel_sent_to(rs, optval, optlen);
        case RDS_GET_MR:
                return rds_get_mr(rs, optval, optlen);
        case RDS_GET_MR_FOR_DEST:
                return rds_get_mr_for_dest(rs, optval, optlen);
        case RDS_FREE_MR:
                return rds_free_mr(rs, optval, optlen);
        case RDS_RECVERR:
                return rds_set_bool_option(&rs->rs_recverr, optval, optlen);
        case RDS_CONG_MONITOR:
                return rds_cong_monitor(rs, optval, optlen);
        case SO_RDS_TRANSPORT:
                /* These two handlers run under the socket lock. */
                lock_sock(sock->sk);
                ret = rds_set_transport(rs, optval, optlen);
                release_sock(sock->sk);
                return ret;
        case SO_TIMESTAMP_OLD:
        case SO_TIMESTAMP_NEW:
                lock_sock(sock->sk);
                ret = rds_enable_recvtstamp(sock->sk, optval, optlen, optname);
                release_sock(sock->sk);
                return ret;
        case SO_RDS_MSG_RXPATH_LATENCY:
                return rds_recv_track_latency(rs, optval, optlen);
        default:
                break;
        }

        return -ENOPROTOOPT;
}

/*
 * getsockopt() entry point.  Handles the RDS_INFO_* range, RDS_RECVERR and
 * SO_RDS_TRANSPORT at level SOL_RDS; anything else yields -ENOPROTOOPT.
 */
static int rds_getsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, int __user *optlen)
{
        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
        int len;
        int trans;

        if (level != SOL_RDS)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;

        switch (optname) {
        case RDS_INFO_FIRST ... RDS_INFO_LAST:
                return rds_info_getsockopt(sock, optname, optval, optlen);

        case RDS_RECVERR:
                if (len < sizeof(int))
                        return -EINVAL;
                if (put_user(rs->rs_recverr, (int __user *) optval) ||
                    put_user(sizeof(int), optlen))
                        return -EFAULT;
                return 0;

        case SO_RDS_TRANSPORT:
                if (len < sizeof(int))
                        return -EINVAL;
                /* No transport attached yet is reported as RDS_TRANS_NONE. */
                if (rs->rs_transport)
                        trans = rs->rs_transport->t_type;
                else
                        trans = RDS_TRANS_NONE;
                if (put_user(trans, (int __user *)optval) ||
                    put_user(sizeof(int), optlen))
                        return -EFAULT;
                return 0;

        default:
                break;
        }

        return -ENOPROTOOPT;
}

/*
 * Record the peer address and port for this socket.  IPv4 addresses are
 * stored in v4-mapped form.  Unspecified, broadcast and multicast
 * destinations are rejected.  The checks and updates run under the socket
 * lock.
 */
static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
                       int addr_len, int flags)
{
        struct sock *sk = sock->sk;
        struct rds_sock *rs = rds_sk_to_rs(sk);
        struct sockaddr_in *sin;
        int ret = 0;

        /* The caller must supply at least the address family field. */
        if (addr_len < offsetofend(struct sockaddr, sa_family))
                return -EINVAL;

        lock_sock(sk);

        switch (uaddr->sa_family) {
        case AF_INET:
                sin = (struct sockaddr_in *)uaddr;
                if (addr_len < sizeof(struct sockaddr_in)) {
                        ret = -EINVAL;
                        goto unlock;
                }
                if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
                        ret = -EDESTADDRREQ;
                        goto unlock;
                }
                if (ipv4_is_multicast(sin->sin_addr.s_addr) ||
                    sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) {
                        ret = -EINVAL;
                        goto unlock;
                }
                ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr);
                rs->rs_conn_port = sin->sin_port;
                goto unlock;

#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6: {
                struct sockaddr_in6 *sin6;
                int addr_type;

                sin6 = (struct sockaddr_in6 *)uaddr;
                if (addr_len < sizeof(struct sockaddr_in6)) {
                        ret = -EINVAL;
                        goto unlock;
                }
                addr_type = ipv6_addr_type(&sin6->sin6_addr);
                if (!(addr_type & IPV6_ADDR_UNICAST)) {
                        __be32 addr4;

                        if (!(addr_type & IPV6_ADDR_MAPPED)) {
                                ret = -EPROTOTYPE;
                                goto unlock;
                        }

                        /* It is a mapped address.  Need to do some sanity
                         * checks.
                         */
                        addr4 = sin6->sin6_addr.s6_addr32[3];
                        if (addr4 == htonl(INADDR_ANY) ||
                            addr4 == htonl(INADDR_BROADCAST) ||
                            ipv4_is_multicast(addr4)) {
                                ret = -EPROTOTYPE;
                                goto unlock;
                        }
                }

                if (addr_type & IPV6_ADDR_LINKLOCAL) {
                        /* If socket is already bound to a link local address,
                         * the peer address must be on the same link.
                         */
                        if (sin6->sin6_scope_id == 0 ||
                            (!ipv6_addr_any(&rs->rs_bound_addr) &&
                             rs->rs_bound_scope_id &&
                             sin6->sin6_scope_id != rs->rs_bound_scope_id)) {
                                ret = -EINVAL;
                                goto unlock;
                        }
                        /* Remember the connected address scope ID.  It will
                         * be checked against the binding local address when
                         * the socket is bound.
                         */
                        rs->rs_bound_scope_id = sin6->sin6_scope_id;
                }
                rs->rs_conn_addr = sin6->sin6_addr;
                rs->rs_conn_port = sin6->sin6_port;
                goto unlock;
        }
#endif

        default:
                ret = -EAFNOSUPPORT;
                goto unlock;
        }

unlock:
        release_sock(sk);
        return ret;
}

/*
 * Protocol descriptor passed to sk_alloc() in rds_create(); obj_size makes
 * each allocated sock large enough to hold a struct rds_sock.
 */
static struct proto rds_proto = {
        .name          = "RDS",
        .owner          = THIS_MODULE,
        .obj_size = sizeof(struct rds_sock),
};

/*
 * Socket operations installed on every AF_RDS socket by __rds_create().
 * socketpair, accept, listen, shutdown and mmap are wired to the generic
 * sock_no_*() stubs.
 */
static const struct proto_ops rds_proto_ops = {
        .family =        AF_RDS,
        .owner =        THIS_MODULE,
        .release =        rds_release,
        .bind =                rds_bind,
        .connect =        rds_connect,
        .socketpair =        sock_no_socketpair,
        .accept =        sock_no_accept,
        .getname =        rds_getname,
        .poll =                rds_poll,
        .ioctl =        rds_ioctl,
        .listen =        sock_no_listen,
        .shutdown =        sock_no_shutdown,
        .setsockopt =        rds_setsockopt,
        .getsockopt =        rds_getsockopt,
        .sendmsg =        rds_sendmsg,
        .recvmsg =        rds_recvmsg,
        .mmap =                sock_no_mmap,
};

static void rds_sock_destruct(struct sock *sk)
{
        struct rds_sock *rs = rds_sk_to_rs(sk);

        WARN_ON((&rs->rs_item != rs->rs_item.next ||
                 &rs->rs_item != rs->rs_item.prev));
}

/*
 * Initialise a freshly allocated sock as an RDS socket: attach it to @sock,
 * install the RDS ops and destructor, set up the rds_sock locks, queues and
 * defaults, then add it to the global socket list.  Always returns 0.
 */
static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
{
        struct rds_sock *rs = rds_sk_to_rs(sk);

        sock_init_data(sock, sk);
        sock->ops                = &rds_proto_ops;
        sk->sk_protocol                = protocol;
        sk->sk_destruct                = rds_sock_destruct;

        /* Locks. */
        spin_lock_init(&rs->rs_lock);
        rwlock_init(&rs->rs_recv_lock);
        spin_lock_init(&rs->rs_rdma_lock);

        /* Queues and lists start out empty. */
        INIT_LIST_HEAD(&rs->rs_send_queue);
        INIT_LIST_HEAD(&rs->rs_recv_queue);
        INIT_LIST_HEAD(&rs->rs_notify_queue);
        INIT_LIST_HEAD(&rs->rs_cong_list);
        rds_message_zcopy_queue_init(&rs->rs_zcookie_queue);
        rs->rs_rdma_keys = RB_ROOT;

        /* Defaults: no rx tracing, no TOS, no connection. */
        rs->rs_rx_traces = 0;
        rs->rs_tos = 0;
        rs->rs_conn = NULL;

        /* Make the socket visible to the info callbacks. */
        spin_lock_bh(&rds_sock_lock);
        list_add_tail(&rs->rs_item, &rds_sock_list);
        rds_sock_count++;
        spin_unlock_bh(&rds_sock_lock);

        return 0;
}

static int rds_create(struct net *net, struct socket *sock, int protocol,
                      int kern)
{
        struct sock *sk;

        if (sock->type != SOCK_SEQPACKET || protocol)
                return -ESOCKTNOSUPPORT;

        sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto, kern);
        if (!sk)
                return -ENOMEM;

        return __rds_create(sock, sk, protocol);
}

/* Take a reference on the sock backing @rs. */
void rds_sock_addref(struct rds_sock *rs)
{
        struct sock *sk = rds_rs_to_sk(rs);

        sock_hold(sk);
}

/* Drop a reference on the sock backing @rs. */
void rds_sock_put(struct rds_sock *rs)
{
        struct sock *sk = rds_rs_to_sk(rs);

        sock_put(sk);
}

/*
 * Address-family descriptor for AF_RDS: new sockets are created through
 * rds_create().  rds_exit() unregisters the family by this .family value.
 */
static const struct net_proto_family rds_family_ops = {
        .family =        AF_RDS,
        .create =        rds_create,
        .owner        =        THIS_MODULE,
};

/*
 * Info callback (registered as RDS_INFO_RECV_MESSAGES): report the messages
 * sitting on the receive queues of all IPv4 (v4-mapped) bound sockets.  At
 * most len / sizeof(struct rds_info_message) entries are copied out, but
 * lens->nr always carries the full count.
 */
static void rds_sock_inc_info(struct socket *sock, unsigned int len,
                              struct rds_info_iterator *iter,
                              struct rds_info_lengths *lens)
{
        unsigned int max_msgs = len / sizeof(struct rds_info_message);
        unsigned int seen = 0;
        struct rds_incoming *inc;
        struct rds_sock *rs;

        spin_lock_bh(&rds_sock_lock);

        list_for_each_entry(rs, &rds_sock_list, rs_item) {
                /* This option only supports IPv4 sockets. */
                if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
                        continue;

                read_lock(&rs->rs_recv_lock);

                /* XXX too lazy to maintain counts.. */
                list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
                        /* Keep counting even once the buffer is full. */
                        if (++seen > max_msgs)
                                continue;
                        rds_inc_info_copy(inc, iter,
                                          inc->i_saddr.s6_addr32[3],
                                          rs->rs_bound_addr_v4,
                                          1);
                }

                read_unlock(&rs->rs_recv_lock);
        }

        spin_unlock_bh(&rds_sock_lock);

        lens->nr = seen;
        lens->each = sizeof(struct rds_info_message);
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Info callback (registered as RDS6_INFO_RECV_MESSAGES): report the messages
 * sitting on the receive queues of all sockets in rds6_info_message format.
 * At most len / sizeof(struct rds6_info_message) entries are copied out, but
 * lens->nr always carries the full count.
 */
static void rds6_sock_inc_info(struct socket *sock, unsigned int len,
                               struct rds_info_iterator *iter,
                               struct rds_info_lengths *lens)
{
        unsigned int max_msgs = len / sizeof(struct rds6_info_message);
        unsigned int seen = 0;
        struct rds_incoming *inc;
        struct rds_sock *rs;

        spin_lock_bh(&rds_sock_lock);

        list_for_each_entry(rs, &rds_sock_list, rs_item) {
                read_lock(&rs->rs_recv_lock);

                list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
                        /* Keep counting even once the buffer is full. */
                        if (++seen > max_msgs)
                                continue;
                        rds6_inc_info_copy(inc, iter, &inc->i_saddr,
                                           &rs->rs_bound_addr, 1);
                }

                read_unlock(&rs->rs_recv_lock);
        }

        spin_unlock_bh(&rds_sock_lock);

        lens->nr = seen;
        lens->each = sizeof(struct rds6_info_message);
}
#endif

static void rds_sock_info(struct socket *sock, unsigned int len,
                          struct rds_info_iterator *iter,
                          struct rds_info_lengths *lens)
{
        struct rds_info_socket sinfo;
        unsigned int cnt = 0;
        struct rds_sock *rs;

        len /= sizeof(struct rds_info_socket);

        spin_lock_bh(&rds_sock_lock);

        if (len < rds_sock_count) {
                cnt = rds_sock_count;
                goto out;
        }

        list_for_each_entry(rs, &rds_sock_list, rs_item) {
                /* This option only supports IPv4 sockets. */
                if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
                        continue;
                sinfo.sndbuf = rds_sk_sndbuf(rs);
                sinfo.rcvbuf = rds_sk_rcvbuf(rs);
                sinfo.bound_addr = rs->rs_bound_addr_v4;
                sinfo.connected_addr = rs->rs_conn_addr_v4;
                sinfo.bound_port = rs->rs_bound_port;
                sinfo.connected_port = rs->rs_conn_port;
                sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));

                rds_info_copy(iter, &sinfo, sizeof(sinfo));
                cnt++;
        }

out:
        lens->nr = cnt;
        lens->each = sizeof(struct rds_info_socket);

        spin_unlock_bh(&rds_sock_lock);
}

#if IS_ENABLED(CONFIG_IPV6)
static void rds6_sock_info(struct socket *sock, unsigned int len,
                           struct rds_info_iterator *iter,
                           struct rds_info_lengths *lens)
{
        struct rds6_info_socket sinfo6;
        struct rds_sock *rs;

        len /= sizeof(struct rds6_info_socket);

        spin_lock_bh(&rds_sock_lock);

        if (len < rds_sock_count)
                goto out;

        list_for_each_entry(rs, &rds_sock_list, rs_item) {
                sinfo6.sndbuf = rds_sk_sndbuf(rs);
                sinfo6.rcvbuf = rds_sk_rcvbuf(rs);
                sinfo6.bound_addr = rs->rs_bound_addr;
                sinfo6.connected_addr = rs->rs_conn_addr;
                sinfo6.bound_port = rs->rs_bound_port;
                sinfo6.connected_port = rs->rs_conn_port;
                sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs));

                rds_info_copy(iter, &sinfo6, sizeof(sinfo6));
        }

 out:
        lens->nr = rds_sock_count;
        lens->each = sizeof(struct rds6_info_socket);

        spin_unlock_bh(&rds_sock_lock);
}
#endif

/*
 * Module exit handler (installed with module_exit()).
 *
 * Unregisters the socket family and the protocol, calls the exit hook of
 * each RDS subsystem, destroys the bind lock and finally deregisters the
 * info callbacks that rds_init() registered.
 */
static void rds_exit(void)
{
        /* Unregister the address family and protocol before anything else. */
        sock_unregister(rds_family_ops.family);
        proto_unregister(&rds_proto);
        /* Per-subsystem teardown. */
        rds_conn_exit();
        rds_cong_exit();
        rds_sysctl_exit();
        rds_threads_exit();
        rds_stats_exit();
        rds_page_exit();
        rds_bind_lock_destroy();
        /* Drop the info callbacks; the IPv6 pair exists only with CONFIG_IPV6. */
        rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
        rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
#if IS_ENABLED(CONFIG_IPV6)
        rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info);
        rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
#endif
}
module_exit(rds_exit);

u32 rds_gen_num;

/*
 * Module init handler (installed with module_init()).
 *
 * Seeds rds_gen_num with random bytes, then brings up the bind lock,
 * connections, threads, sysctl and stats, and finally registers the
 * protocol, the socket family and the info callbacks.
 *
 * Returns 0 when every step succeeded.  Otherwise returns the non-zero
 * value from the first failing step, after unwinding the steps that had
 * already succeeded via the out_* labels below.
 */
static int __init rds_init(void)
{
        int ret;

        net_get_random_once(&rds_gen_num, sizeof(rds_gen_num));

        ret = rds_bind_lock_init();
        if (ret)
                goto out;

        ret = rds_conn_init();
        if (ret)
                goto out_bind;

        ret = rds_threads_init();
        if (ret)
                goto out_conn;
        ret = rds_sysctl_init();
        if (ret)
                goto out_threads;
        ret = rds_stats_init();
        if (ret)
                goto out_sysctl;
        ret = proto_register(&rds_proto, 1);
        if (ret)
                goto out_stats;
        ret = sock_register(&rds_family_ops);
        if (ret)
                goto out_proto;

        /* Everything is up: expose the info callbacks (IPv6 pair is optional). */
        rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
        rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
#if IS_ENABLED(CONFIG_IPV6)
        rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info);
        rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
#endif

        /* Success: ret is 0 here; skip the unwind ladder. */
        goto out;

        /*
         * Error unwind: each label undoes the step before the one that
         * failed and falls through to the labels for the earlier steps.
         */
out_proto:
        proto_unregister(&rds_proto);
out_stats:
        rds_stats_exit();
out_sysctl:
        rds_sysctl_exit();
out_threads:
        rds_threads_exit();
out_conn:
        rds_conn_exit();
        /*
         * NOTE(review): no matching cong/page init call is visible in this
         * function; presumably these are set up elsewhere or are safe to
         * call unconditionally - confirm.
         */
        rds_cong_exit();
        rds_page_exit();
out_bind:
        rds_bind_lock_destroy();
out:
        return ret;
}
module_init(rds_init);

#define DRV_VERSION     "4.0"
#define DRV_RELDATE     "Feb 12, 2009"

MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
                   " v" DRV_VERSION " (" DRV_RELDATE ")");
MODULE_VERSION(DRV_VERSION);
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NETPROTO(PF_RDS);
































































    1 































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * File: pep.h
 *
 * Phonet Pipe End Point sockets definitions
 *
 * Copyright (C) 2008 Nokia Corporation.
 */

#ifndef NET_PHONET_PEP_H
#define NET_PHONET_PEP_H

#include <linux/skbuff.h>
#include <net/phonet/phonet.h>

/*
 * Per-socket state of a Phonet pipe end point (PEP) socket.
 *
 * pn_sk is the first member; pep_sk() relies on that when it casts a
 * struct sock pointer straight to struct pep_sock (presumably
 * struct pn_sock itself begins with a struct sock - confirm).
 * The remaining fields are grouped into state used by listening sockets
 * and state used by connected sockets.
 */
struct pep_sock {
        struct pn_sock                pn_sk;

        /* XXX: union-ify listening vs connected stuff ? */
        /* Listening socket stuff: */
        struct hlist_head        hlist;

        /* Connected socket stuff: */
        struct sock                *listener;
        struct sk_buff_head        ctrlreq_queue;
#define PNPIPE_CTRLREQ_MAX        10
        atomic_t                tx_credits;
        int                        ifindex;
        u16                        peer_type;        /* peer type/subtype */
        u8                        pipe_handle;

        u8                        rx_credits;
        u8                        rx_fc;        /* RX flow control */
        u8                        tx_fc;        /* TX flow control */
        u8                        init_enable;        /* auto-enable at creation */
        u8                        aligned;
};

/* Reinterpret a generic socket pointer as the PEP socket it belongs to. */
static inline struct pep_sock *pep_sk(struct sock *sk)
{
        struct pep_sock *pn = (struct pep_sock *)sk;

        return pn;
}

extern const struct proto_ops phonet_stream_ops;

/* Pipe protocol definitions */
/*
 * Header of a pipe message as accessed through pnp_hdr(): three fixed
 * bytes, one byte whose name depends on the kind of message (anonymous
 * union below), then a variable-length payload.
 */
struct pnpipehdr {
        u8                        utid; /* transaction ID */
        u8                        message_id;
        u8                        pipe_handle;
        union {
                u8                state_after_connect;        /* connect request */
                u8                state_after_reset;        /* reset request */
                u8                error_code;                /* any response */
                u8                pep_type;                /* status indication */
                u8                data0;                        /* anything else */
        };
        u8                        data[];
};
/* Alias for the first payload byte, i.e. hdr->other_pep_type is hdr->data[0]. */
#define other_pep_type                data[0]

/* View the transport header of @skb as a pipe message header. */
static inline struct pnpipehdr *pnp_hdr(struct sk_buff *skb)
{
        struct pnpipehdr *hdr = (struct pnpipehdr *)skb_transport_header(skb);

        return hdr;
}

#define MAX_PNPIPE_HEADER (MAX_PHONET_HEADER + 4)

/*
 * Pipe message identifiers (presumably the values carried in
 * pnpipehdr.message_id - confirm against the users of this header).
 * Explicit initialisers start a new numeric range; the entries that
 * follow continue counting from there.
 */
enum {
        /* 0x00..0x03: pipe create/remove requests and responses */
        PNS_PIPE_CREATE_REQ = 0x00,
        PNS_PIPE_CREATE_RESP,
        PNS_PIPE_REMOVE_REQ,
        PNS_PIPE_REMOVE_RESP,

        /* 0x20..0x21: data messages */
        PNS_PIPE_DATA = 0x20,
        PNS_PIPE_ALIGNED_DATA,

        /* 0x40..0x49 and 0x4C..0x4D: PEP requests and responses */
        PNS_PEP_CONNECT_REQ = 0x40,
        PNS_PEP_CONNECT_RESP,
        PNS_PEP_DISCONNECT_REQ,
        PNS_PEP_DISCONNECT_RESP,
        PNS_PEP_RESET_REQ,
        PNS_PEP_RESET_RESP,
        PNS_PEP_ENABLE_REQ,
        PNS_PEP_ENABLE_RESP,
        PNS_PEP_CTRL_REQ,
        PNS_PEP_CTRL_RESP,
        PNS_PEP_DISABLE_REQ = 0x4C,
        PNS_PEP_DISABLE_RESP,

        /* 0x60..0x66: indications (0x62 is not assigned here) */
        PNS_PEP_STATUS_IND = 0x60,
        PNS_PIPE_CREATED_IND,
        PNS_PIPE_RESET_IND = 0x63,
        PNS_PIPE_ENABLED_IND,
        PNS_PIPE_REDIRECTED_IND,
        PNS_PIPE_DISABLED_IND = 0x66,
};

#define PN_PIPE_INVALID_HANDLE        0xff
#define PN_PEP_TYPE_COMMON        0x00

/* Phonet pipe status indication */
enum {
        PN_PEP_IND_FLOW_CONTROL,
        PN_PEP_IND_ID_MCFC_GRANT_CREDITS,
};

/* Phonet pipe error codes */
enum {
        PN_PIPE_NO_ERROR,
        PN_PIPE_ERR_INVALID_PARAM,
        PN_PIPE_ERR_INVALID_HANDLE,
        PN_PIPE_ERR_INVALID_CTRL_ID,
        PN_PIPE_ERR_NOT_ALLOWED,
        PN_PIPE_ERR_PEP_IN_USE,
        PN_PIPE_ERR_OVERLOAD,
        PN_PIPE_ERR_DEV_DISCONNECTED,
        PN_PIPE_ERR_TIMEOUT,
        PN_PIPE_ERR_ALL_PIPES_IN_USE,
        PN_PIPE_ERR_GENERAL,
        PN_PIPE_ERR_NOT_SUPPORTED,
};

/* Phonet pipe states */
enum {
        PN_PIPE_DISABLE,
        PN_PIPE_ENABLE,
};

/* Phonet pipe sub-block types */
enum {
        PN_PIPE_SB_CREATE_REQ_PEP_SUB_TYPE,
        PN_PIPE_SB_CONNECT_REQ_PEP_SUB_TYPE,
        PN_PIPE_SB_REDIRECT_REQ_PEP_SUB_TYPE,
        PN_PIPE_SB_NEGOTIATED_FC,
        PN_PIPE_SB_REQUIRED_FC_TX,
        PN_PIPE_SB_PREFERRED_FC_RX,
        PN_PIPE_SB_ALIGNED_DATA,
};

/* Phonet pipe flow control models */
enum {
        PN_NO_FLOW_CONTROL,
        PN_LEGACY_FLOW_CONTROL,
        PN_ONE_CREDIT_FLOW_CONTROL,
        PN_MULTI_CREDIT_FLOW_CONTROL,
        PN_MAX_FLOW_CONTROL,
};

/*
 * Shifts the model value right by one bit.  With the numbering above this
 * is 0 for PN_NO_FLOW_CONTROL and PN_LEGACY_FLOW_CONTROL and 1 for
 * PN_ONE_CREDIT_FLOW_CONTROL and PN_MULTI_CREDIT_FLOW_CONTROL, so the
 * result is non-zero exactly for the credit-based models.
 */
#define pn_flow_safe(fc) ((fc) >> 1)

/* Phonet pipe flow control states */
enum {
        PEP_IND_EMPTY,
        PEP_IND_BUSY,
        PEP_IND_READY,
};

#endif




























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * This is <linux/capability.h>
 *
 * Andrew G. Morgan <morgan@kernel.org>
 * Alexander Kjeldaas <astor@guardian.no>
 * with help from Aleph1, Roland Buresund and Andrew Main.
 *
 * See here for the libcap library ("POSIX draft" compliance):
 *
 * ftp://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/
 */
#ifndef _LINUX_CAPABILITY_H
#define _LINUX_CAPABILITY_H

#include <uapi/linux/capability.h>
#include <linux/uidgid.h>
#include <linux/bits.h>

#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3

extern int file_caps_enabled;

typedef struct { u64 val; } kernel_cap_t;

/* same as vfs_ns_cap_data but in cpu endian and always filled completely */
/*
 * Filled in through the cpu_caps argument of get_vfs_caps_from_disk()
 * declared later in this header.
 */
struct cpu_vfs_cap_data {
        __u32 magic_etc;        /* NOTE(review): presumably version magic plus flags - confirm */
        kuid_t rootid;
        kernel_cap_t permitted;        /* permitted capability set */
        kernel_cap_t inheritable;        /* inheritable capability set */
};

#define _USER_CAP_HEADER_SIZE  (sizeof(struct __user_cap_header_struct))
#define _KERNEL_CAP_T_SIZE     (sizeof(kernel_cap_t))

struct file;
struct inode;
struct dentry;
struct task_struct;
struct user_namespace;
struct mnt_idmap;

/*
 * CAP_FS_MASK and CAP_NFSD_MASKS:
 *
 * The fs mask is all the privileges that fsuid==0 historically meant.
 * At one time in the past, that included CAP_MKNOD and CAP_LINUX_IMMUTABLE.
 *
 * It has never meant setting security.* and trusted.* xattrs.
 *
 * We could also define fsmask as follows:
 *   1. CAP_FS_MASK is the privilege to bypass all fs-related DAC permissions
 *   2. The security.* and trusted.* xattrs are fs-related MAC permissions
 */

# define CAP_FS_MASK     (BIT_ULL(CAP_CHOWN)                \
                        | BIT_ULL(CAP_MKNOD)                \
                        | BIT_ULL(CAP_DAC_OVERRIDE)        \
                        | BIT_ULL(CAP_DAC_READ_SEARCH)        \
                        | BIT_ULL(CAP_FOWNER)                \
                        | BIT_ULL(CAP_FSETID)                \
                        | BIT_ULL(CAP_MAC_OVERRIDE))
/*
 * Mask with every bit from 0 up to and including CAP_LAST_CAP set
 * (assuming BIT_ULL(n) expands to 1ULL << n, as in <linux/bits.h>).
 */
#define CAP_VALID_MASK         (BIT_ULL(CAP_LAST_CAP+1)-1)

/*
 * Ready-made capability sets, written as kernel_cap_t compound literals:
 * no capabilities, all valid capabilities, CAP_FS_MASK plus
 * CAP_LINUX_IMMUTABLE, and CAP_FS_MASK plus CAP_SYS_RESOURCE.
 */
# define CAP_EMPTY_SET    ((kernel_cap_t) { 0 })
# define CAP_FULL_SET     ((kernel_cap_t) { CAP_VALID_MASK })
# define CAP_FS_SET       ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_LINUX_IMMUTABLE) })
# define CAP_NFSD_SET     ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_SYS_RESOURCE) })

/* Empty the set @c in place; @c must be an lvalue of type kernel_cap_t. */
# define cap_clear(c)         do { (c).val = 0; } while (0)

/*
 * Single-capability helpers on an lvalue set @c: cap_raise() sets the bit
 * for @flag, cap_lower() clears it, cap_raised() evaluates to non-zero
 * when the bit is currently set.
 */
#define cap_raise(c, flag)  ((c).val |= BIT_ULL(flag))
#define cap_lower(c, flag)  ((c).val &= ~BIT_ULL(flag))
#define cap_raised(c, flag) (((c).val & BIT_ULL(flag)) != 0)

static inline kernel_cap_t cap_combine(const kernel_cap_t a,
                                       const kernel_cap_t b)
{
        return (kernel_cap_t) { a.val | b.val };
}

static inline kernel_cap_t cap_intersect(const kernel_cap_t a,
                                         const kernel_cap_t b)
{
        return (kernel_cap_t) { a.val & b.val };
}

static inline kernel_cap_t cap_drop(const kernel_cap_t a,
                                    const kernel_cap_t drop)
{
        return (kernel_cap_t) { a.val &~ drop.val };
}

/* Return true when @a contains no capabilities at all. */
static inline bool cap_isclear(const kernel_cap_t a)
{
        return a.val == 0;
}

/* Return true when @a and @b hold exactly the same capabilities. */
static inline bool cap_isidentical(const kernel_cap_t a, const kernel_cap_t b)
{
        /* The XOR of two sets is zero only when no bit differs. */
        return !(a.val ^ b.val);
}

/*
 * Test whether "a" is a subset of "set".
 *
 * Returns true when every capability in "a" is also present in "set",
 *        e.g. cap_issubset(0101, 1111) is true.
 * Returns false as soon as "a" holds a capability that "set" lacks,
 *        e.g. cap_issubset(1111, 0101) is false.
 */
static inline bool cap_issubset(const kernel_cap_t a, const kernel_cap_t set)
{
        /* Masking "a" with "set" must not remove any of its bits. */
        return (a.val & set.val) == a.val;
}

/* Used to decide between falling back on the old suser() or fsuser(). */

/* Return @a with every capability in CAP_FS_SET removed. */
static inline kernel_cap_t cap_drop_fs_set(const kernel_cap_t a)
{
        const kernel_cap_t fs_caps = CAP_FS_SET;

        return cap_drop(a, fs_caps);
}

/*
 * Return @a extended with those CAP_FS_SET capabilities that are also
 * present in @permitted.
 */
static inline kernel_cap_t cap_raise_fs_set(const kernel_cap_t a,
                                            const kernel_cap_t permitted)
{
        const kernel_cap_t allowed = cap_intersect(permitted, CAP_FS_SET);

        return cap_combine(a, allowed);
}

/* Return @a with every capability in CAP_NFSD_SET removed. */
static inline kernel_cap_t cap_drop_nfsd_set(const kernel_cap_t a)
{
        const kernel_cap_t nfsd_caps = CAP_NFSD_SET;

        return cap_drop(a, nfsd_caps);
}

/*
 * Return @a extended with those CAP_NFSD_SET capabilities that are also
 * present in @permitted.
 */
static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a,
                                              const kernel_cap_t permitted)
{
        const kernel_cap_t allowed = cap_intersect(permitted, CAP_NFSD_SET);

        return cap_combine(a, allowed);
}

/*
 * Capability check entry points.
 *
 * With CONFIG_MULTIUSER they are only declared here and implemented
 * elsewhere.  Without it, each one is an inline stub that ignores its
 * arguments and returns true, so every capability check succeeds.
 */
#ifdef CONFIG_MULTIUSER
extern bool has_capability(struct task_struct *t, int cap);
extern bool has_ns_capability(struct task_struct *t,
                              struct user_namespace *ns, int cap);
extern bool has_capability_noaudit(struct task_struct *t, int cap);
extern bool has_ns_capability_noaudit(struct task_struct *t,
                                      struct user_namespace *ns, int cap);
extern bool capable(int cap);
extern bool ns_capable(struct user_namespace *ns, int cap);
extern bool ns_capable_noaudit(struct user_namespace *ns, int cap);
extern bool ns_capable_setid(struct user_namespace *ns, int cap);
#else
/* !CONFIG_MULTIUSER: all of the stubs below unconditionally return true. */
static inline bool has_capability(struct task_struct *t, int cap)
{
        return true;
}
static inline bool has_ns_capability(struct task_struct *t,
                              struct user_namespace *ns, int cap)
{
        return true;
}
static inline bool has_capability_noaudit(struct task_struct *t, int cap)
{
        return true;
}
static inline bool has_ns_capability_noaudit(struct task_struct *t,
                                      struct user_namespace *ns, int cap)
{
        return true;
}
static inline bool capable(int cap)
{
        return true;
}
static inline bool ns_capable(struct user_namespace *ns, int cap)
{
        return true;
}
static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap)
{
        return true;
}
static inline bool ns_capable_setid(struct user_namespace *ns, int cap)
{
        return true;
}
#endif /* CONFIG_MULTIUSER */
bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
                                 struct mnt_idmap *idmap,
                                 const struct inode *inode);
bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap,
                              const struct inode *inode, int cap);
extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns);
/*
 * Return true when capable() grants CAP_PERFMON or, failing that,
 * CAP_SYS_ADMIN.  CAP_SYS_ADMIN is only checked when CAP_PERFMON is
 * not granted.
 */
static inline bool perfmon_capable(void)
{
        if (capable(CAP_PERFMON))
                return true;

        return capable(CAP_SYS_ADMIN);
}

/*
 * Return true when capable() grants CAP_BPF or, failing that,
 * CAP_SYS_ADMIN.  CAP_SYS_ADMIN is only checked when CAP_BPF is not
 * granted.
 */
static inline bool bpf_capable(void)
{
        if (capable(CAP_BPF))
                return true;

        return capable(CAP_SYS_ADMIN);
}

/*
 * Return true when ns_capable() grants CAP_CHECKPOINT_RESTORE in @ns or,
 * failing that, CAP_SYS_ADMIN in @ns.  CAP_SYS_ADMIN is only checked
 * when CAP_CHECKPOINT_RESTORE is not granted.
 */
static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns)
{
        if (ns_capable(ns, CAP_CHECKPOINT_RESTORE))
                return true;

        return ns_capable(ns, CAP_SYS_ADMIN);
}

/* audit system wants to get cap info from files as well */
int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
                           const struct dentry *dentry,
                           struct cpu_vfs_cap_data *cpu_caps);

int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry,
                      const void **ivalue, size_t size);

#endif /* !_LINUX_CAPABILITY_H */


















































































































































































































































































































































































































































































































































































































































































































































































    1 











    1 

    1 












    1 






















    1 
















































    1 






    1 




















































    1 































































































































































































































































































































































































































































































































































































































































































    1 











































    1 


























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
// SPDX-License-Identifier: GPL-2.0-only
/*
 * IBSS mode implementation
 * Copyright 2003-2008, Jouni Malinen <j@w1.fi>
 * Copyright 2004, Instant802 Networks, Inc.
 * Copyright 2005, Devicescape Software, Inc.
 * Copyright 2006-2007        Jiri Benc <jbenc@suse.cz>
 * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
 * Copyright 2009, Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright(c) 2016 Intel Deutschland GmbH
 * Copyright(c) 2018-2024 Intel Corporation
 */

#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/if_ether.h>
#include <linux/skbuff.h>
#include <linux/if_arp.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <net/mac80211.h>

#include "ieee80211_i.h"
#include "driver-ops.h"
#include "rate.h"

#define IEEE80211_SCAN_INTERVAL (2 * HZ)
#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ)

#define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ)
#define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ)
#define IEEE80211_IBSS_RSN_INACTIVITY_LIMIT (10 * HZ)

#define IEEE80211_IBSS_MAX_STA_ENTRIES 128

/*
 * Build the IBSS probe response template for @sdata.
 *
 * Allocates a struct beacon_data with the frame buffer placed directly
 * behind it (presp->head) and fills in a probe response management frame:
 * broadcast destination, the interface address as source, the IBSS BSSID,
 * then @beacon_int, @tsf and @capability, followed by these elements in
 * order: SSID, Supported Rates (at most 8), DS Params (2 GHz band only),
 * IBSS Params, Channel Switch Announcement (only when @csa_settings is
 * non-NULL), Extended Supported Rates (only when more than 8 rates are
 * usable), the extra IEs stored in ifibss->ie, HT and VHT elements when
 * the channel width and band capabilities allow, and a WMM info element
 * when the hardware has at least IEEE80211_NUM_ACS queues.
 *
 * @basic_rates: bitmap indexed like sband->bitrates[]; matching rates get
 *	the 0x80 flag set in their rate byte.
 * @chandef: selects the band and the rate flags a bitrate must have to
 *	be advertised.
 * @have_higher_than_11mbit: optional output; set to true when any usable
 *	rate has a bitrate value above 110, false otherwise.
 * @csa_settings: optional; supplies the CSA element contents and the
 *	countdown value, whose offset in the frame is recorded in
 *	presp->cntdwn_counter_offsets[0].
 *
 * Returns the new beacon_data, or NULL when the allocation fails or the
 * frame written turns out longer than the computed budget.
 */
static struct beacon_data *
ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata,
                           const int beacon_int, const u32 basic_rates,
                           const u16 capability, u64 tsf,
                           struct cfg80211_chan_def *chandef,
                           bool *have_higher_than_11mbit,
                           struct cfg80211_csa_settings *csa_settings)
{
        struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
        struct ieee80211_local *local = sdata->local;
        int rates_n = 0, i, ri;
        struct ieee80211_mgmt *mgmt;
        u8 *pos;
        struct ieee80211_supported_band *sband;
        u32 rate_flags, rates = 0, rates_added = 0;
        struct beacon_data *presp;
        int frame_len;

        /* Build IBSS probe response */
        /*
         * Worst-case size of everything written below.
         * NOTE(review): there is no explicit term for the WMM info element
         * appended at the end; the budget presumably relies on slack in the
         * other worst-case terms - confirm.
         */
        frame_len = sizeof(struct ieee80211_hdr_3addr) +
                    12 /* struct ieee80211_mgmt.u.beacon */ +
                    2 + IEEE80211_MAX_SSID_LEN /* max SSID */ +
                    2 + 8 /* max Supported Rates */ +
                    3 /* max DS params */ +
                    4 /* IBSS params */ +
                    5 /* Channel Switch Announcement */ +
                    2 + (IEEE80211_MAX_SUPP_RATES - 8) +
                    2 + sizeof(struct ieee80211_ht_cap) +
                    2 + sizeof(struct ieee80211_ht_operation) +
                    2 + sizeof(struct ieee80211_vht_cap) +
                    2 + sizeof(struct ieee80211_vht_operation) +
                    ifibss->ie_len;
        presp = kzalloc(sizeof(*presp) + frame_len, GFP_KERNEL);
        if (!presp)
                return NULL;

        /* The frame lives in the same allocation, right after the struct. */
        presp->head = (void *)(presp + 1);

        mgmt = (void *) presp->head;
        mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                          IEEE80211_STYPE_PROBE_RESP);
        eth_broadcast_addr(mgmt->da);
        memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
        memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN);
        mgmt->u.beacon.beacon_int = cpu_to_le16(beacon_int);
        mgmt->u.beacon.timestamp = cpu_to_le64(tsf);
        mgmt->u.beacon.capab_info = cpu_to_le16(capability);

        /* pos is the write cursor for the variable part of the frame. */
        pos = (u8 *)mgmt + offsetof(struct ieee80211_mgmt, u.beacon.variable);

        *pos++ = WLAN_EID_SSID;
        *pos++ = ifibss->ssid_len;
        memcpy(pos, ifibss->ssid, ifibss->ssid_len);
        pos += ifibss->ssid_len;

        sband = local->hw.wiphy->bands[chandef->chan->band];
        rate_flags = ieee80211_chandef_rate_flags(chandef);
        rates_n = 0;
        if (have_higher_than_11mbit)
                *have_higher_than_11mbit = false;

        /*
         * First pass: collect, as a bitmap over sband->bitrates[], the rates
         * that carry every flag required by the channel definition.
         */
        for (i = 0; i < sband->n_bitrates; i++) {
                if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
                        continue;
                if (sband->bitrates[i].bitrate > 110 &&
                    have_higher_than_11mbit)
                        *have_higher_than_11mbit = true;

                rates |= BIT(i);
                rates_n++;
        }

        /* Second pass: the first (up to) 8 usable rates go into SUPP_RATES. */
        *pos++ = WLAN_EID_SUPP_RATES;
        *pos++ = min_t(int, 8, rates_n);
        for (ri = 0; ri < sband->n_bitrates; ri++) {
                int rate = DIV_ROUND_UP(sband->bitrates[ri].bitrate, 5);
                u8 basic = 0;
                if (!(rates & BIT(ri)))
                        continue;

                if (basic_rates & BIT(ri))
                        basic = 0x80;
                *pos++ = basic | (u8) rate;
                if (++rates_added == 8) {
                        ri++; /* continue at next rate for EXT_SUPP_RATES */
                        break;
                }
        }

        if (sband->band == NL80211_BAND_2GHZ) {
                *pos++ = WLAN_EID_DS_PARAMS;
                *pos++ = 1;
                *pos++ = ieee80211_frequency_to_channel(
                                chandef->chan->center_freq);
        }

        *pos++ = WLAN_EID_IBSS_PARAMS;
        *pos++ = 2;
        /* FIX: set ATIM window based on scan results */
        *pos++ = 0;
        *pos++ = 0;

        if (csa_settings) {
                *pos++ = WLAN_EID_CHANNEL_SWITCH;
                *pos++ = 3;
                *pos++ = csa_settings->block_tx ? 1 : 0;
                *pos++ = ieee80211_frequency_to_channel(
                                csa_settings->chandef.chan->center_freq);
                /* Remember where the countdown byte sits inside the frame. */
                presp->cntdwn_counter_offsets[0] = (pos - presp->head);
                *pos++ = csa_settings->count;
                presp->cntdwn_current_counter = csa_settings->count;
        }

        /* put the remaining rates in WLAN_EID_EXT_SUPP_RATES */
        if (rates_n > 8) {
                *pos++ = WLAN_EID_EXT_SUPP_RATES;
                *pos++ = rates_n - 8;
                /* ri resumes just past the 8th rate written above. */
                for (; ri < sband->n_bitrates; ri++) {
                        int rate = DIV_ROUND_UP(sband->bitrates[ri].bitrate, 5);
                        u8 basic = 0;
                        if (!(rates & BIT(ri)))
                                continue;

                        if (basic_rates & BIT(ri))
                                basic = 0x80;
                        *pos++ = basic | (u8) rate;
                }
        }

        /* Extra IEs stored on the interface are copied verbatim. */
        if (ifibss->ie_len) {
                memcpy(pos, ifibss->ie, ifibss->ie_len);
                pos += ifibss->ie_len;
        }

        /* add HT capability and information IEs */
        if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT &&
            chandef->width != NL80211_CHAN_WIDTH_5 &&
            chandef->width != NL80211_CHAN_WIDTH_10 &&
            sband->ht_cap.ht_supported) {
                struct ieee80211_sta_ht_cap ht_cap;

                /* Work on a copy so the overrides do not touch the band data. */
                memcpy(&ht_cap, &sband->ht_cap, sizeof(ht_cap));
                ieee80211_apply_htcap_overrides(sdata, &ht_cap);

                pos = ieee80211_ie_build_ht_cap(pos, &ht_cap, ht_cap.cap);
                /*
                 * Note: According to 802.11n-2009 9.13.3.1, HT Protection
                 * field and RIFS Mode are reserved in IBSS mode, therefore
                 * keep them at 0
                 */
                pos = ieee80211_ie_build_ht_oper(pos, &sband->ht_cap,
                                                 chandef, 0, false);

                /* add VHT capability and information IEs */
                if (chandef->width != NL80211_CHAN_WIDTH_20 &&
                    chandef->width != NL80211_CHAN_WIDTH_40 &&
                    sband->vht_cap.vht_supported) {
                        pos = ieee80211_ie_build_vht_cap(pos, &sband->vht_cap,
                                                         sband->vht_cap.cap);
                        pos = ieee80211_ie_build_vht_oper(pos, &sband->vht_cap,
                                                          chandef);
                }
        }

        if (local->hw.queues >= IEEE80211_NUM_ACS)
                pos = ieee80211_add_wmm_info_ie(pos, 0); /* U-APSD not in use */

        /* Sanity check after the fact: the frame must fit the budget. */
        presp->head_len = pos - presp->head;
        if (WARN_ON(presp->head_len > frame_len))
                goto error;

        return presp;
error:
        kfree(presp);
        return NULL;
}

/*
 * Join (or create) an IBSS using the given parameters.
 *
 * Sequence performed by this function:
 *  - reset the TSF and, if the BSSID changes, flush the station table;
 *  - if an IBSS is currently joined, tell the driver we are leaving it;
 *  - drop the old beacon/probe response template;
 *  - validate the requested channel definition (falling back to 20 MHz
 *    when beaconing is not permitted on the requested width) and check
 *    the DFS requirements;
 *  - acquire a channel context, build a new template and hand the new
 *    configuration to the driver;
 *  - on success mark the state as joined, arm the merge timer and report
 *    the new BSS to cfg80211.
 *
 * @sdata: interface joining the IBSS
 * @bssid: BSSID to use, copied into the interface's IBSS state
 * @beacon_int: beacon interval stored in bss_conf
 * @req_chandef: requested channel definition; it is copied, the caller's
 *      structure is not modified
 * @basic_rates: basic rate bitmap stored in bss_conf
 * @capability: capability value passed to the template builder
 * @tsf: timestamp passed to the template builder
 * @creator: stored as vif.cfg.ibss_creator
 *
 * The function returns nothing; every failure is only reported through
 * sdata_info(). The wiphy must be held (see lockdep_assert_wiphy()).
 */
static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
                                      const u8 *bssid, const int beacon_int,
                                      struct cfg80211_chan_def *req_chandef,
                                      const u32 basic_rates,
                                      const u16 capability, u64 tsf,
                                      bool creator)
{
        struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_mgmt *mgmt;
        struct cfg80211_bss *bss;
        u64 bss_change;
        struct ieee80211_chan_req chanreq = {};
        struct ieee80211_channel *chan;
        struct beacon_data *presp;
        struct cfg80211_inform_bss bss_meta = {};
        bool have_higher_than_11mbit;
        bool radar_required;
        int err;

        lockdep_assert_wiphy(local->hw.wiphy);

        /* Reset own TSF to allow time synchronization work. */
        drv_reset_tsf(local, sdata);

        /* stations of the previous BSSID are not carried over */
        if (!ether_addr_equal(ifibss->bssid, bssid))
                sta_info_flush(sdata, -1);

        /* if merging, indicate to driver that we leave the old IBSS */
        if (sdata->vif.cfg.ibss_joined) {
                sdata->vif.cfg.ibss_joined = false;
                sdata->vif.cfg.ibss_creator = false;
                sdata->vif.bss_conf.enable_beacon = false;
                netif_carrier_off(sdata->dev);
                ieee80211_bss_info_change_notify(sdata,
                                                 BSS_CHANGED_IBSS |
                                                 BSS_CHANGED_BEACON_ENABLED);
                drv_leave_ibss(local, sdata);
        }

        /* unpublish the old template; it is freed after a grace period */
        presp = sdata_dereference(ifibss->presp, sdata);
        RCU_INIT_POINTER(ifibss->presp, NULL);
        if (presp)
                kfree_rcu(presp, rcu_head);

        /* make a copy of the chandef, it could be modified below. */
        chanreq.oper = *req_chandef;
        chan = chanreq.oper.chan;
        if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chanreq.oper,
                                     NL80211_IFTYPE_ADHOC)) {
                /* widths of 20 MHz or less cannot be downgraded any further */
                if (chanreq.oper.width == NL80211_CHAN_WIDTH_5 ||
                    chanreq.oper.width == NL80211_CHAN_WIDTH_10 ||
                    chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT ||
                    chanreq.oper.width == NL80211_CHAN_WIDTH_20) {
                        sdata_info(sdata,
                                   "Failed to join IBSS, beacons forbidden\n");
                        return;
                }
                /* retry with a 20 MHz channel centered on the control channel */
                chanreq.oper.width = NL80211_CHAN_WIDTH_20;
                chanreq.oper.center_freq1 = chan->center_freq;
                /* check again for downgraded chandef */
                if (!cfg80211_reg_can_beacon(local->hw.wiphy, &chanreq.oper,
                                             NL80211_IFTYPE_ADHOC)) {
                        sdata_info(sdata,
                                   "Failed to join IBSS, beacons forbidden\n");
                        return;
                }
        }

        /* negative: invalid chandef; positive: DFS is required on it */
        err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy,
                                            &chanreq.oper, NL80211_IFTYPE_ADHOC);
        if (err < 0) {
                sdata_info(sdata,
                           "Failed to join IBSS, invalid chandef\n");
                return;
        }
        if (err > 0 && !ifibss->userspace_handles_dfs) {
                sdata_info(sdata,
                           "Failed to join IBSS, DFS channel without control program\n");
                return;
        }

        radar_required = err;

        /* a fixed channel may share a context, otherwise request our own */
        if (ieee80211_link_use_channel(&sdata->deflink, &chanreq,
                                       ifibss->fixed_channel ?
                                        IEEE80211_CHANCTX_SHARED :
                                        IEEE80211_CHANCTX_EXCLUSIVE)) {
                sdata_info(sdata, "Failed to join IBSS, no channel context\n");
                return;
        }
        sdata->deflink.radar_required = radar_required;

        memcpy(ifibss->bssid, bssid, ETH_ALEN);

        presp = ieee80211_ibss_build_presp(sdata, beacon_int, basic_rates,
                                           capability, tsf, &chanreq.oper,
                                           &have_higher_than_11mbit, NULL);
        /*
         * NOTE(review): the channel context acquired above is not released
         * on this path, unlike the drv_join_ibss() failure path below -
         * confirm this is intended.
         */
        if (!presp)
                return;

        rcu_assign_pointer(ifibss->presp, presp);
        mgmt = (void *)presp->head;

        sdata->vif.bss_conf.enable_beacon = true;
        sdata->vif.bss_conf.beacon_int = beacon_int;
        sdata->vif.bss_conf.basic_rates = basic_rates;
        sdata->vif.cfg.ssid_len = ifibss->ssid_len;
        memcpy(sdata->vif.cfg.ssid, ifibss->ssid, ifibss->ssid_len);
        /* collect every change flag; the driver is notified once, below */
        bss_change = BSS_CHANGED_BEACON_INT;
        bss_change |= ieee80211_reset_erp_info(sdata);
        bss_change |= BSS_CHANGED_BSSID;
        bss_change |= BSS_CHANGED_BEACON;
        bss_change |= BSS_CHANGED_BEACON_ENABLED;
        bss_change |= BSS_CHANGED_BASIC_RATES;
        bss_change |= BSS_CHANGED_HT;
        bss_change |= BSS_CHANGED_IBSS;
        bss_change |= BSS_CHANGED_SSID;

        /*
         * In 5 GHz/802.11a, we can always use short slot time.
         * (IEEE 802.11-2012 18.3.8.7)
         *
         * In 2.4GHz, we must always use long slots in IBSS for compatibility
         * reasons.
         * (IEEE 802.11-2012 19.4.5)
         *
         * HT follows these specifications (IEEE 802.11-2012 20.3.18)
         */
        sdata->vif.bss_conf.use_short_slot = chan->band == NL80211_BAND_5GHZ;
        bss_change |= BSS_CHANGED_ERP_SLOT;

        /* cf. IEEE 802.11 9.2.12 */
        sdata->deflink.operating_11g_mode =
                chan->band == NL80211_BAND_2GHZ && have_higher_than_11mbit;

        ieee80211_set_wmm_default(&sdata->deflink, true, false);

        sdata->vif.cfg.ibss_joined = true;
        sdata->vif.cfg.ibss_creator = creator;

        err = drv_join_ibss(local, sdata);
        if (err) {
                /* driver refused: undo the state set up above */
                sdata->vif.cfg.ibss_joined = false;
                sdata->vif.cfg.ibss_creator = false;
                sdata->vif.bss_conf.enable_beacon = false;
                sdata->vif.cfg.ssid_len = 0;
                RCU_INIT_POINTER(ifibss->presp, NULL);
                kfree_rcu(presp, rcu_head);
                ieee80211_link_release_channel(&sdata->deflink);
                sdata_info(sdata, "Failed to join IBSS, driver failure: %d\n",
                           err);
                return;
        }

        ieee80211_bss_info_change_notify(sdata, bss_change);

        ifibss->state = IEEE80211_IBSS_MLME_JOINED;
        mod_timer(&ifibss->timer,
                  round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL));

        /* report our own template to cfg80211 as a received BSS frame */
        bss_meta.chan = chan;
        bss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, mgmt,
                                             presp->head_len, GFP_KERNEL);

        /* only the side effect of the call above is needed, drop the reference */
        cfg80211_put_bss(local->hw.wiphy, bss);
        netif_carrier_on(sdata->dev);
        cfg80211_ibss_joined(sdata->dev, ifibss->bssid, chan, GFP_KERNEL);
}

static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
                                    struct ieee80211_bss *bss)
{
        struct cfg80211_bss *cbss =
                container_of((void *)bss, struct cfg80211_bss, priv);
        struct ieee80211_supported_band *sband;
        struct cfg80211_chan_def chandef;
        u32 basic_rates;
        int i, j;
        u16 beacon_int = cbss->beacon_interval;
        const struct cfg80211_bss_ies *ies;
        enum nl80211_channel_type chan_type;
        u64 tsf;
        u32 rate_flags;

        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        if (beacon_int < 10)
                beacon_int = 10;

        switch (sdata->u.ibss.chandef.width) {
        case NL80211_CHAN_WIDTH_20_NOHT:
        case NL80211_CHAN_WIDTH_20:
        case NL80211_CHAN_WIDTH_40:
                chan_type = cfg80211_get_chandef_type(&sdata->u.ibss.chandef);
                cfg80211_chandef_create(&chandef, cbss->channel, chan_type);
                break;
        case NL80211_CHAN_WIDTH_5:
        case NL80211_CHAN_WIDTH_10:
                cfg80211_chandef_create(&chandef, cbss->channel,
                                        NL80211_CHAN_NO_HT);
                chandef.width = sdata->u.ibss.chandef.width;
                break;
        case NL80211_CHAN_WIDTH_80:
        case NL80211_CHAN_WIDTH_80P80:
        case NL80211_CHAN_WIDTH_160:
                chandef = sdata->u.ibss.chandef;
                chandef.chan = cbss->channel;
                break;
        default:
                /* fall back to 20 MHz for unsupported modes */
                cfg80211_chandef_create(&chandef, cbss->channel,
                                        NL80211_CHAN_NO_HT);
                break;
        }

        sband = sdata->local->hw.wiphy->bands[cbss->channel->band];
        rate_flags = ieee80211_chandef_rate_flags(&sdata->u.ibss.chandef);

        basic_rates = 0;

        for (i = 0; i < bss->supp_rates_len; i++) {
                int rate = bss->supp_rates[i] & 0x7f;
                bool is_basic = !!(bss->supp_rates[i] & 0x80);

                for (j = 0; j < sband->n_bitrates; j++) {
                        int brate;
                        if ((rate_flags & sband->bitrates[j].flags)
                            != rate_flags)
                                continue;

                        brate = DIV_ROUND_UP(sband->bitrates[j].bitrate, 5);
                        if (brate == rate) {
                                if (is_basic)
                                        basic_rates |= BIT(j);
                                break;
                        }
                }
        }

        rcu_read_lock();
        ies = rcu_dereference(cbss->ies);
        tsf = ies->tsf;
        rcu_read_unlock();

        __ieee80211_sta_join_ibss(sdata, cbss->bssid,
                                  beacon_int,
                                  &chandef,
                                  basic_rates,
                                  cbss->capability,
                                  tsf, false);
}

/*
 * Rebuild the IBSS beacon/probe response template during a channel switch.
 *
 * @sdata: interface
 * @csa_settings: channel switch settings handed to the template builder,
 *      may be NULL
 * @changed: BSS_CHANGED_BEACON is OR-ed into this on success
 *
 * Returns 0 on success, -EINVAL when our own BSS is not known to cfg80211
 * and -ENOMEM when no template could be built.
 */
int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata,
                              struct cfg80211_csa_settings *csa_settings,
                              u64 *changed)
{
        struct ieee80211_if_ibss *ibss = &sdata->u.ibss;
        struct beacon_data *new_tmpl, *prev_tmpl;
        struct cfg80211_bss *own_bss;
        u16 capab;
        u64 timestamp;

        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        capab = WLAN_CAPABILITY_IBSS;
        if (ibss->privacy)
                capab |= WLAN_CAPABILITY_PRIVACY;

        /* look up our own BSS entry to obtain its TSF */
        own_bss = cfg80211_get_bss(sdata->local->hw.wiphy, ibss->chandef.chan,
                                   ibss->bssid, ibss->ssid,
                                   ibss->ssid_len, IEEE80211_BSS_TYPE_IBSS,
                                   IEEE80211_PRIVACY(ibss->privacy));
        if (unlikely(!own_bss))
                return -EINVAL;

        rcu_read_lock();
        timestamp = rcu_dereference(own_bss->ies)->tsf;
        rcu_read_unlock();
        cfg80211_put_bss(sdata->local->hw.wiphy, own_bss);

        prev_tmpl = sdata_dereference(ibss->presp, sdata);

        new_tmpl = ieee80211_ibss_build_presp(sdata,
                                              sdata->vif.bss_conf.beacon_int,
                                              sdata->vif.bss_conf.basic_rates,
                                              capab, timestamp, &ibss->chandef,
                                              NULL, csa_settings);
        if (!new_tmpl)
                return -ENOMEM;

        /* publish the new template, release the old one after a grace period */
        rcu_assign_pointer(ibss->presp, new_tmpl);
        if (prev_tmpl)
                kfree_rcu(prev_tmpl, rcu_head);

        *changed |= BSS_CHANGED_BEACON;
        return 0;
}

/*
 * Complete a channel switch on an IBSS interface.
 *
 * Moves the cfg80211 BSS entry and our own IBSS chandef over to the
 * channel stored in the link's CSA request, then regenerates the beacon.
 *
 * Returns -ENOLINK when the interface is not joined, otherwise the result
 * of ieee80211_ibss_csa_beacon().
 */
int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata, u64 *changed)
{
        struct ieee80211_if_ibss *ibss = &sdata->u.ibss;

        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        /* Without a joined network there is nothing to switch. */
        if (ibss->state != IEEE80211_IBSS_MLME_JOINED)
                return -ENOLINK;

        /* point the cfg80211 bss entry at the new channel */
        if (!is_zero_ether_addr(ibss->bssid)) {
                struct cfg80211_bss *entry;

                entry = cfg80211_get_bss(sdata->local->hw.wiphy,
                                         ibss->chandef.chan,
                                         ibss->bssid, ibss->ssid,
                                         ibss->ssid_len,
                                         IEEE80211_BSS_TYPE_IBSS,
                                         IEEE80211_PRIVACY(ibss->privacy));
                /* XXX: cfg80211 data should not really be modified here */
                if (entry) {
                        entry->channel = sdata->deflink.csa.chanreq.oper.chan;
                        cfg80211_put_bss(sdata->local->hw.wiphy, entry);
                }
        }

        ibss->chandef = sdata->deflink.csa.chanreq.oper;

        /* build the beacon for the new channel (no CSA settings anymore) */
        return ieee80211_ibss_csa_beacon(sdata, NULL, changed);
}

/* Cancel the pending "drop connection after failed CSA" work, if any. */
void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata)
{
        wiphy_work_cancel(sdata->local->hw.wiphy,
                          &sdata->u.ibss.csa_connection_drop_work);
}

/*
 * Finish setting up a freshly allocated IBSS station and insert it.
 *
 * The station is moved to AUTH and ASSOC, and also to AUTHORIZED unless
 * the control port is in use, before rate control is initialized and the
 * entry is inserted.
 *
 * Returns the inserted station, or whatever sta_info_get() finds for the
 * same address when the insertion fails. Annotated __acquires(RCU).
 */
static struct sta_info *ieee80211_ibss_finish_sta(struct sta_info *sta)
        __acquires(RCU)
{
        struct ieee80211_sub_if_data *sdata = sta->sdata;
        u8 peer[ETH_ALEN];

        /*
         * Work on a private copy of the address: it is still needed after
         * the insertion attempt (presumably @sta must not be touched once
         * that failed - confirm against sta_info_insert_rcu()).
         */
        memcpy(peer, sta->sta.addr, sizeof(peer));

        ibss_dbg(sdata, "Adding new IBSS station %pM\n", peer);

        sta_info_pre_move_state(sta, IEEE80211_STA_AUTH);
        sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC);

        /*
         * With a control port (RSN protected network) userspace has to
         * authorize the station; otherwise authorize it right away.
         */
        if (!sdata->u.ibss.control_port)
                sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED);

        rate_control_rate_init(sta);

        /* A failed insert may mean another insertion won the race. */
        return sta_info_insert_rcu(sta) ? sta_info_get(sdata, peer) : sta;
}

/*
 * Allocate and insert a station entry for a newly seen IBSS peer.
 *
 * @sdata: interface
 * @bssid: BSSID the peer was seen with; must match our own
 * @addr: peer address
 * @supp_rates: rate bitmap of the peer; mandatory rates are always added
 *
 * Returns the station or NULL. Every return path leaves the RCU read lock
 * held, as announced by the __acquires(RCU) annotation.
 */
static struct sta_info *
ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid,
                       const u8 *addr, u32 supp_rates)
        __acquires(RCU)
{
        struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_chanctx_conf *ctx;
        struct ieee80211_supported_band *sband;
        struct sta_info *sta;
        int band;

        /*
         * XXX: Consider removing the least recently used entry and
         *         allow new one to be added.
         */
        if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) {
                net_info_ratelimited("%s: No room for a new IBSS STA entry %pM\n",
                                    sdata->name, addr);
                goto fail_relock;
        }

        /* no stations while still searching, none from a foreign BSSID */
        if (ifibss->state == IEEE80211_IBSS_MLME_SEARCH ||
            !ether_addr_equal(bssid, ifibss->bssid))
                goto fail_relock;

        rcu_read_lock();
        ctx = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
        if (WARN_ON_ONCE(!ctx))
                return NULL;    /* RCU read lock is already held here */
        band = ctx->def.chan->band;
        rcu_read_unlock();

        sta = sta_info_alloc(sdata, addr, GFP_KERNEL);
        if (!sta)
                goto fail_relock;

        /* make sure mandatory rates are always added */
        sband = local->hw.wiphy->bands[band];
        sta->sta.deflink.supp_rates[band] =
                supp_rates | ieee80211_mandatory_rates(sband);

        return ieee80211_ibss_finish_sta(sta);

fail_relock:
        /* callers expect the RCU read lock to be held on return */
        rcu_read_lock();
        return NULL;
}

/*
 * Tell whether any station of this interface was active within the last
 * IEEE80211_IBSS_MERGE_INTERVAL.
 *
 * Returns 1 if such a station exists, 0 otherwise.
 */
static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_local *local = sdata->local;
        struct sta_info *peer;
        int found = 0;

        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        rcu_read_lock();

        list_for_each_entry_rcu(peer, &local->sta_list, list) {
                unsigned long seen = ieee80211_sta_last_active(peer);

                /* the list holds the stations of every interface */
                if (peer->sdata != sdata)
                        continue;

                if (!time_is_after_jiffies(seen +
                                           IEEE80211_IBSS_MERGE_INTERVAL))
                        continue;

                /* one recently active station is all we need to know */
                found = 1;
                break;
        }

        rcu_read_unlock();

        return found;
}

/*
 * Leave the current IBSS.
 *
 * Unlinks our BSS entry from cfg80211, returns to the SEARCH state, drops
 * all complete and incomplete stations, removes the beacon template,
 * informs the driver and releases the channel context.
 */
static void ieee80211_ibss_disconnect(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
        struct ieee80211_local *local = sdata->local;
        struct beacon_data *old_tmpl;

        lockdep_assert_wiphy(local->hw.wiphy);

        if (!is_zero_ether_addr(ifibss->bssid)) {
                struct cfg80211_bss *own_bss;

                own_bss = cfg80211_get_bss(local->hw.wiphy,
                                           ifibss->chandef.chan,
                                           ifibss->bssid, ifibss->ssid,
                                           ifibss->ssid_len,
                                           IEEE80211_BSS_TYPE_IBSS,
                                           IEEE80211_PRIVACY(ifibss->privacy));
                if (own_bss) {
                        cfg80211_unlink_bss(local->hw.wiphy, own_bss);
                        cfg80211_put_bss(local->hw.wiphy, own_bss);
                }
        }

        ifibss->state = IEEE80211_IBSS_MLME_SEARCH;

        sta_info_flush(sdata, -1);

        /*
         * Free the incomplete stations one at a time; the list lock is
         * only held while an entry is being unlinked, never across
         * sta_info_free().
         */
        for (;;) {
                struct sta_info *pending;

                spin_lock_bh(&ifibss->incomplete_lock);
                if (list_empty(&ifibss->incomplete_stations)) {
                        spin_unlock_bh(&ifibss->incomplete_lock);
                        break;
                }
                pending = list_first_entry(&ifibss->incomplete_stations,
                                           struct sta_info, list);
                list_del(&pending->list);
                spin_unlock_bh(&ifibss->incomplete_lock);

                sta_info_free(local, pending);
        }

        netif_carrier_off(sdata->dev);

        sdata->vif.cfg.ibss_joined = false;
        sdata->vif.cfg.ibss_creator = false;
        sdata->vif.bss_conf.enable_beacon = false;
        sdata->vif.cfg.ssid_len = 0;

        /* unpublish the beacon template and free it after a grace period */
        old_tmpl = sdata_dereference(ifibss->presp, sdata);
        RCU_INIT_POINTER(ifibss->presp, NULL);
        if (old_tmpl)
                kfree_rcu(old_tmpl, rcu_head);

        clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state);
        ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED |
                                                BSS_CHANGED_IBSS);
        drv_leave_ibss(local, sdata);
        ieee80211_link_release_channel(&sdata->deflink);
}

/*
 * Work item: drop the IBSS connection after a channel switch that could
 * not be handled, then kick the interface work so a new network is
 * searched for.
 */
static void ieee80211_csa_connection_drop_work(struct wiphy *wiphy,
                                               struct wiphy_work *work)
{
        struct ieee80211_sub_if_data *sdata;

        sdata = container_of(work, struct ieee80211_sub_if_data,
                             u.ibss.csa_connection_drop_work);

        ieee80211_ibss_disconnect(sdata);

        /* wait for a grace period before discarding the queued frames */
        synchronize_rcu();
        skb_queue_purge(&sdata->skb_queue);

        /* trigger a scan to find another IBSS network to join */
        wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work);
}

/*
 * Report a radar event for the current IBSS channel definition when it
 * requires DFS, so that the channel gets marked as unavailable.
 */
static void ieee80211_ibss_csa_mark_radar(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_if_ibss *ibss = &sdata->u.ibss;

        /* errors and non-DFS channels need no radar report */
        if (cfg80211_chandef_dfs_required(sdata->local->hw.wiphy,
                                          &ibss->chandef,
                                          NL80211_IFTYPE_ADHOC) <= 0)
                return;

        cfg80211_radar_event(sdata->local->hw.wiphy, &ibss->chandef,
                             GFP_ATOMIC);
}

/*
 * Handle a channel switch announcement found in a received frame.
 *
 * @sdata: IBSS interface
 * @elems: parsed elements of the received frame
 * @beacon: not used by the code in this function
 *
 * Parses the channel switch elements, validates the target channel
 * against our own width, the regulatory rules and the DFS requirements,
 * and then starts the switch through ieee80211_channel_switch(). If any
 * of these steps fails the connection drop work is queued instead.
 *
 * Returns false when the frame did not contain a CSA, true in every other
 * case (switch started, identical chandef ignored, or disconnect queued).
 */
static bool
ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata,
                                  struct ieee802_11_elems *elems,
                                  bool beacon)
{
        struct cfg80211_csa_settings params;
        struct ieee80211_csa_ie csa_ie;
        struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
        enum nl80211_channel_type ch_type;
        int err;
        /* default limits for parsing: HT, at most 40 MHz */
        struct ieee80211_conn_settings conn = {
                .mode = IEEE80211_CONN_MODE_HT,
                .bw_limit = IEEE80211_CONN_BW_LIMIT_40,
        };
        u32 vht_cap_info = 0;

        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        /* tighten the limits according to our own operating width */
        switch (ifibss->chandef.width) {
        case NL80211_CHAN_WIDTH_5:
        case NL80211_CHAN_WIDTH_10:
        case NL80211_CHAN_WIDTH_20_NOHT:
                conn.mode = IEEE80211_CONN_MODE_LEGACY;
                fallthrough;
        case NL80211_CHAN_WIDTH_20:
                conn.bw_limit = IEEE80211_CONN_BW_LIMIT_20;
                break;
        default:
                break;
        }

        if (elems->vht_cap_elem)
                vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info);

        memset(&params, 0, sizeof(params));
        err = ieee80211_parse_ch_switch_ie(sdata, elems,
                                           ifibss->chandef.chan->band,
                                           vht_cap_info, &conn,
                                           ifibss->bssid, &csa_ie);
        /* can't switch to destination channel, fail */
        if (err < 0)
                goto disconnect;

        /* did not contain a CSA */
        if (err)
                return false;

        /* channel switch is not supported, disconnect */
        if (!(sdata->local->hw.wiphy->flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH))
                goto disconnect;

        params.count = csa_ie.count;
        params.chandef = csa_ie.chanreq.oper;

        switch (ifibss->chandef.width) {
        case NL80211_CHAN_WIDTH_20_NOHT:
        case NL80211_CHAN_WIDTH_20:
        case NL80211_CHAN_WIDTH_40:
                /* keep our current HT mode (HT20/HT40+/HT40-), even if
                 * another mode  has been announced. The mode is not adopted
                 * within the beacon while doing CSA and we should therefore
                 * keep the mode which we announce.
                 */
                ch_type = cfg80211_get_chandef_type(&ifibss->chandef);
                cfg80211_chandef_create(&params.chandef, params.chandef.chan,
                                        ch_type);
                break;
        case NL80211_CHAN_WIDTH_5:
        case NL80211_CHAN_WIDTH_10:
                /* narrow-band networks can only switch to the same width */
                if (params.chandef.width != ifibss->chandef.width) {
                        sdata_info(sdata,
                                   "IBSS %pM received channel switch from incompatible channel width (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n",
                                   ifibss->bssid,
                                   params.chandef.chan->center_freq,
                                   params.chandef.width,
                                   params.chandef.center_freq1,
                                   params.chandef.center_freq2);
                        goto disconnect;
                }
                break;
        default:
                /* should not happen, conn_flags should prevent VHT modes. */
                /*
                 * NOTE(review): this switch is on our own configured width,
                 * so it is reached whenever that is wider than 40 MHz -
                 * confirm such configurations cannot get here.
                 */
                WARN_ON(1);
                goto disconnect;
        }

        if (!cfg80211_reg_can_beacon(sdata->local->hw.wiphy, &params.chandef,
                                     NL80211_IFTYPE_ADHOC)) {
                sdata_info(sdata,
                           "IBSS %pM switches to unsupported channel (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n",
                           ifibss->bssid,
                           params.chandef.chan->center_freq,
                           params.chandef.width,
                           params.chandef.center_freq1,
                           params.chandef.center_freq2);
                goto disconnect;
        }

        /* negative: invalid chandef; positive: DFS is required on it */
        err = cfg80211_chandef_dfs_required(sdata->local->hw.wiphy,
                                            &params.chandef,
                                            NL80211_IFTYPE_ADHOC);
        if (err < 0)
                goto disconnect;
        if (err > 0 && !ifibss->userspace_handles_dfs) {
                /* IBSS-DFS only allowed with a control program */
                goto disconnect;
        }

        params.radar_required = err;

        /* nothing to do if we already operate on the announced chandef */
        if (cfg80211_chandef_identical(&params.chandef,
                                       &sdata->vif.bss_conf.chanreq.oper)) {
                ibss_dbg(sdata,
                         "received csa with an identical chandef, ignoring\n");
                return true;
        }

        /* all checks done, now perform the channel switch. */
        ibss_dbg(sdata,
                 "received channel switch announcement to go to channel %d MHz\n",
                 params.chandef.chan->center_freq);

        /* any non-zero CSA mode asks for transmissions to be blocked */
        params.block_tx = !!csa_ie.mode;

        if (ieee80211_channel_switch(sdata->local->hw.wiphy, sdata->dev,
                                     &params))
                goto disconnect;

        ieee80211_ibss_csa_mark_radar(sdata);

        return true;
disconnect:
        /* the actual disconnect happens later, from the queued work item */
        ibss_dbg(sdata, "Can't handle channel switch, disconnect\n");
        wiphy_work_queue(sdata->local->hw.wiphy,
                         &ifibss->csa_connection_drop_work);

        ieee80211_ibss_csa_mark_radar(sdata);

        return true;
}

/*
 * Handle a received spectrum management action frame.
 *
 * Only channel switch announcements are processed, and only while no
 * channel switch is already active on the interface. Frames that are too
 * short or carry another action code are ignored.
 */
static void
ieee80211_rx_mgmt_spectrum_mgmt(struct ieee80211_sub_if_data *sdata,
                                struct ieee80211_mgmt *mgmt, size_t len,
                                struct ieee80211_rx_status *rx_status,
                                struct ieee802_11_elems *elems)
{
        /* minimum length of an action frame with a channel switch body */
        const size_t csa_frame_len = IEEE80211_MIN_ACTION_SIZE +
                                     sizeof(mgmt->u.action.u.chan_switch);

        /* the action code must be present before it can be inspected */
        if (len < IEEE80211_MIN_ACTION_SIZE + 1)
                return;

        /* CSA is the only action we handle for now */
        if (mgmt->u.action.u.measurement.action_code !=
            WLAN_ACTION_SPCT_CHL_SWITCH)
                return;

        if (len < csa_frame_len)
                return;

        if (sdata->vif.bss_conf.csa_active)
                return;

        ieee80211_ibss_process_chanswitch(sdata, elems, false);
}

/*
 * Handle a received deauthentication frame on an IBSS interface.
 *
 * @sdata: interface the frame was received on
 * @mgmt: the received management frame
 * @len: length of the frame in bytes
 *
 * Frames shorter than IEEE80211_DEAUTH_FRAME_LEN are ignored. Otherwise
 * the frame is logged and the station entry of the sender is destroyed.
 */
static void ieee80211_rx_mgmt_deauth_ibss(struct ieee80211_sub_if_data *sdata,
                                          struct ieee80211_mgmt *mgmt,
                                          size_t len)
{
        u16 reason;

        /*
         * Validate the length first: the reason code is part of the frame
         * body and must not be read from a truncated frame.
         */
        if (len < IEEE80211_DEAUTH_FRAME_LEN)
                return;

        reason = le16_to_cpu(mgmt->u.deauth.reason_code);

        ibss_dbg(sdata, "RX DeAuth SA=%pM DA=%pM\n", mgmt->sa, mgmt->da);
        ibss_dbg(sdata, "\tBSSID=%pM (reason: %d)\n", mgmt->bssid, reason);
        sta_info_destroy_addr(sdata, mgmt->sa);
}

/*
 * Handle a received authentication frame on an IBSS interface.
 *
 * Frames shorter than 24 + 6 bytes are ignored. An open system
 * authentication request (transaction 1) is answered with transaction 2;
 * anything else is only logged.
 */
static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata,
                                        struct ieee80211_mgmt *mgmt,
                                        size_t len)
{
        u16 alg, transaction;

        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        /* too short to carry the authentication fields read below */
        if (len < 24 + 6)
                return;

        alg = le16_to_cpu(mgmt->u.auth.auth_alg);
        transaction = le16_to_cpu(mgmt->u.auth.auth_transaction);

        ibss_dbg(sdata, "RX Auth SA=%pM DA=%pM\n", mgmt->sa, mgmt->da);
        ibss_dbg(sdata, "\tBSSID=%pM (auth_transaction=%d)\n",
                 mgmt->bssid, transaction);

        /*
         * IEEE 802.11 does not require authentication in IBSS networks
         * and most implementations do not seem to use it. Still, answer
         * an open system request in case a peer has implemented it.
         */
        if (alg == WLAN_AUTH_OPEN && transaction == 1)
                ieee80211_send_auth(sdata, 2, WLAN_AUTH_OPEN, 0, NULL, 0,
                                    mgmt->sa, sdata->u.ibss.bssid, NULL, 0,
                                    0, 0);
}

/*
 * Update (or create) the station entry of the sender of a received
 * beacon/probe response.
 *
 * @sdata: interface the frame was received on
 * @mgmt: the received frame
 * @len: frame length (not used by the code in this function)
 * @rx_status: receive status; supplies the band
 * @elems: parsed elements of the frame
 * @channel: channel the frame was received on
 *
 * Nothing is done unless the interface is an ad-hoc interface and the
 * frame carries our BSSID. Supported rates, WMM, HT and VHT information
 * are taken over from the frame; if anything rate related changed, rate
 * control is re-initialized and the driver is told about the update.
 */
static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata,
                                      struct ieee80211_mgmt *mgmt, size_t len,
                                      struct ieee80211_rx_status *rx_status,
                                      struct ieee802_11_elems *elems,
                                      struct ieee80211_channel *channel)
{
        struct sta_info *sta;
        enum nl80211_band band = rx_status->band;
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_supported_band *sband;
        bool rates_updated = false;
        u32 supp_rates = 0;

        if (sdata->vif.type != NL80211_IFTYPE_ADHOC)
                return;

        if (!ether_addr_equal(mgmt->bssid, sdata->u.ibss.bssid))
                return;

        sband = local->hw.wiphy->bands[band];
        if (WARN_ON(!sband))
                return;

        rcu_read_lock();
        sta = sta_info_get(sdata, mgmt->sa);

        if (elems->supp_rates) {
                supp_rates = ieee80211_sta_get_rates(sdata, elems,
                                                     band, NULL);
                if (sta) {
                        u32 prev_rates;

                        prev_rates = sta->sta.deflink.supp_rates[band];

                        /* mandatory rates are always part of the set */
                        sta->sta.deflink.supp_rates[band] = supp_rates |
                                ieee80211_mandatory_rates(sband);
                        if (sta->sta.deflink.supp_rates[band] != prev_rates) {
                                ibss_dbg(sdata,
                                         "updated supp_rates set for %pM based on beacon/probe_resp (0x%x -> 0x%x)\n",
                                         sta->sta.addr, prev_rates,
                                         sta->sta.deflink.supp_rates[band]);
                                rates_updated = true;
                        }
                } else {
                        /*
                         * Unknown sender: create a station for it. The RCU
                         * read lock is dropped first because
                         * ieee80211_ibss_add_sta() returns with it held
                         * again (see its __acquires(RCU) annotation).
                         */
                        rcu_read_unlock();
                        sta = ieee80211_ibss_add_sta(sdata, mgmt->bssid,
                                                     mgmt->sa, supp_rates);
                }
        }

        /* enable WMM for the peer when it announces it and we have the queues */
        if (sta && !sta->sta.wme &&
            (elems->wmm_info || elems->s1g_capab) &&
            local->hw.queues >= IEEE80211_NUM_ACS) {
                sta->sta.wme = true;
                ieee80211_check_fast_xmit(sta);
        }

        if (sta && elems->ht_operation && elems->ht_cap_elem &&
            sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT &&
            sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_5 &&
            sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_10) {
                /* we both use HT */
                struct ieee80211_ht_cap htcap_ie;
                struct cfg80211_chan_def chandef;
                /* remembered to detect a bandwidth change further down */
                enum ieee80211_sta_rx_bandwidth bw = sta->sta.deflink.bandwidth;

                /* reconstruct the chandef the peer announces */
                cfg80211_chandef_create(&chandef, channel, NL80211_CHAN_NO_HT);
                ieee80211_chandef_ht_oper(elems->ht_operation, &chandef);

                memcpy(&htcap_ie, elems->ht_cap_elem, sizeof(htcap_ie));
                rates_updated |= ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
                                                                   &htcap_ie,
                                                                   &sta->deflink);

                if (elems->vht_operation && elems->vht_cap_elem &&
                    sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_20 &&
                    sdata->u.ibss.chandef.width != NL80211_CHAN_WIDTH_40) {
                        /* we both use VHT */
                        struct ieee80211_vht_cap cap_ie;
                        /* snapshot of the old capabilities for the memcmp() below */
                        struct ieee80211_sta_vht_cap cap = sta->sta.deflink.vht_cap;
                        u32 vht_cap_info =
                                le32_to_cpu(elems->vht_cap_elem->vht_cap_info);

                        ieee80211_chandef_vht_oper(&local->hw, vht_cap_info,
                                                   elems->vht_operation,
                                                   elems->ht_operation,
                                                   &chandef);
                        memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie));
                        ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
                                                            &cap_ie, NULL,
                                                            &sta->deflink);
                        if (memcmp(&cap, &sta->sta.deflink.vht_cap, sizeof(cap)))
                                rates_updated |= true;
                }

                if (bw != sta->sta.deflink.bandwidth)
                        rates_updated |= true;

                /* the peer's chandef is only checked, a mismatch just warns */
                if (!cfg80211_chandef_compatible(&sdata->u.ibss.chandef,
                                                 &chandef))
                        WARN_ON_ONCE(1);
        }

        if (sta && rates_updated) {
                u32 changed = IEEE80211_RC_SUPP_RATES_CHANGED;
                u8 rx_nss = sta->sta.deflink.rx_nss;

                /* Force rx_nss recalculation */
                sta->sta.deflink.rx_nss = 0;
                rate_control_rate_init(sta);
                if (sta->sta.deflink.rx_nss != rx_nss)
                        changed |= IEEE80211_RC_NSS_CHANGED;

                drv_sta_rc_update(local, sdata, &sta->sta, changed);
        }

        /* pairs with rcu_read_lock() above or with the lock add_sta returned with */
        rcu_read_unlock();
}

/*
 * Process a received beacon or probe response on an IBSS interface.
 *
 * Updates the station information and the BSS entry for the sender, then
 * checks whether we should merge into the sender's IBSS.  A merge is only
 * attempted when the frame advertises the IBSS capability, matches our
 * channel (if fixed) and SSID, no channel switch is active or triggered,
 * the BSSID differs from ours while we do not use a fixed BSSID, and the
 * timestamp in the frame is higher than our own RX/TSF timestamp.
 *
 * @sdata: interface the frame was received on
 * @mgmt: the received management frame
 * @len: length of the frame
 * @rx_status: RX metadata (frequency, band, timestamp information)
 * @elems: parsed information elements of the frame
 */
static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
                                  struct ieee80211_mgmt *mgmt, size_t len,
                                  struct ieee80211_rx_status *rx_status,
                                  struct ieee802_11_elems *elems)
{
        struct ieee80211_local *local = sdata->local;
        struct cfg80211_bss *cbss;
        struct ieee80211_bss *bss;
        struct ieee80211_channel *channel;
        u64 beacon_timestamp, rx_timestamp;
        u32 supp_rates = 0;
        enum nl80211_band band = rx_status->band;

        /* ignore frames received on a frequency we have no channel for */
        channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq);
        if (!channel)
                return;

        ieee80211_update_sta_info(sdata, mgmt, len, rx_status, elems, channel);

        bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, channel);
        if (!bss)
                return;

        /* bss is the priv area of a cfg80211_bss; get the enclosing entry */
        cbss = container_of((void *)bss, struct cfg80211_bss, priv);

        /* same for beacon and probe response */
        beacon_timestamp = le64_to_cpu(mgmt->u.beacon.timestamp);

        /* check if we need to merge IBSS */

        /* not an IBSS */
        if (!(cbss->capability & WLAN_CAPABILITY_IBSS))
                goto put_bss;

        /* different channel */
        if (sdata->u.ibss.fixed_channel &&
            sdata->u.ibss.chandef.chan != cbss->channel)
                goto put_bss;

        /* different SSID */
        if (elems->ssid_len != sdata->u.ibss.ssid_len ||
            memcmp(elems->ssid, sdata->u.ibss.ssid,
                                sdata->u.ibss.ssid_len))
                goto put_bss;

        /*
         * process channel switch; the helper is only called when no CSA is
         * already active, and a non-zero return stops further processing
         */
        if (sdata->vif.bss_conf.csa_active ||
            ieee80211_ibss_process_chanswitch(sdata, elems, true))
                goto put_bss;

        /* same BSSID */
        if (ether_addr_equal(cbss->bssid, sdata->u.ibss.bssid))
                goto put_bss;

        /* we use a fixed BSSID */
        if (sdata->u.ibss.fixed_bssid)
                goto put_bss;

        if (ieee80211_have_rx_timestamp(rx_status)) {
                /* time when timestamp field was received */
                rx_timestamp =
                        ieee80211_calculate_rx_timestamp(local, rx_status,
                                                         len + FCS_LEN, 24);
        } else {
                /*
                 * second best option: get current TSF
                 * (will return -1 if not supported)
                 */
                rx_timestamp = drv_get_tsf(local, sdata);
        }

        ibss_dbg(sdata, "RX beacon SA=%pM BSSID=%pM TSF=0x%llx\n",
                 mgmt->sa, mgmt->bssid,
                 (unsigned long long)rx_timestamp);
        ibss_dbg(sdata, "\tBCN=0x%llx diff=%lld @%lu\n",
                 (unsigned long long)beacon_timestamp,
                 (unsigned long long)(rx_timestamp - beacon_timestamp),
                 jiffies);

        /* only merge when the sender's timestamp is ahead of ours */
        if (beacon_timestamp > rx_timestamp) {
                ibss_dbg(sdata,
                         "beacon TSF higher than local TSF - IBSS merge with BSSID %pM\n",
                         mgmt->bssid);
                ieee80211_sta_join_ibss(sdata, bss);
                supp_rates = ieee80211_sta_get_rates(sdata, elems, band, NULL);
                ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa,
                                       supp_rates);
                /*
                 * NOTE(review): no matching rcu_read_lock() is visible in
                 * this function; presumably ieee80211_ibss_add_sta() returns
                 * with the RCU read lock held - confirm against its
                 * definition.
                 */
                rcu_read_unlock();
        }

 put_bss:
        /* drop the bss obtained from ieee80211_bss_info_update() */
        ieee80211_rx_bss_put(local, bss);
}

/*
 * Allocate a station entry for @addr and hand it to the interface work.
 *
 * Nothing is done when the station table is full, while the interface is
 * still searching for an IBSS, or when @bssid is not our current BSSID.
 * Otherwise a new entry is allocated, its supported rates for the current
 * band are set to @supp_rates plus the mandatory rates, and it is put on
 * the incomplete_stations list before sdata->work is queued.
 *
 * @sdata: IBSS interface
 * @bssid: BSSID to compare against our own
 * @addr: address of the station to add
 * @supp_rates: supported-rates bitmap for the station
 */
void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata,
                              const u8 *bssid, const u8 *addr,
                              u32 supp_rates)
{
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
        struct ieee80211_chanctx_conf *conf;
        struct sta_info *new_sta;
        int band;

        /*
         * XXX: Consider removing the least recently used entry and
         *         allow new one to be added.
         */
        if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) {
                net_info_ratelimited("%s: No room for a new IBSS STA entry %pM\n",
                                    sdata->name, addr);
                return;
        }

        /* still searching, or the BSSID is not ours: nothing to add */
        if (ifibss->state == IEEE80211_IBSS_MLME_SEARCH ||
            !ether_addr_equal(bssid, ifibss->bssid))
                return;

        /* the channel context is RCU protected; only the band is needed */
        rcu_read_lock();
        conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
        if (WARN_ON_ONCE(!conf)) {
                rcu_read_unlock();
                return;
        }
        band = conf->def.chan->band;
        rcu_read_unlock();

        new_sta = sta_info_alloc(sdata, addr, GFP_ATOMIC);
        if (!new_sta)
                return;

        /* make sure mandatory rates are always added */
        new_sta->sta.deflink.supp_rates[band] =
                supp_rates |
                ieee80211_mandatory_rates(local->hw.wiphy->bands[band]);

        spin_lock(&ifibss->incomplete_lock);
        list_add(&new_sta->list, &ifibss->incomplete_stations);
        spin_unlock(&ifibss->incomplete_lock);

        wiphy_work_queue(local->hw.wiphy, &sdata->work);
}

/*
 * Remove stations of @sdata that have been inactive for too long.
 *
 * A station expires once it has been inactive for longer than
 * IEEE80211_IBSS_INACTIVITY_LIMIT, or for longer than
 * IEEE80211_IBSS_RSN_INACTIVITY_LIMIT while it is not in the AUTHORIZED
 * state.  Expired stations are sent a deauth frame and destroyed.
 */
static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
        unsigned long idle_limit = IEEE80211_IBSS_INACTIVITY_LIMIT;
        unsigned long rsn_idle_limit = IEEE80211_IBSS_RSN_INACTIVITY_LIMIT;
        struct sta_info *cur, *next;

        lockdep_assert_wiphy(local->hw.wiphy);

        /* entries are destroyed while iterating, hence the _safe variant */
        list_for_each_entry_safe(cur, next, &local->sta_list, list) {
                u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
                unsigned long last_active = ieee80211_sta_last_active(cur);

                /* only look at stations that belong to this interface */
                if (cur->sdata != sdata)
                        continue;

                /* neither of the two inactivity conditions is met: keep */
                if (!time_is_before_jiffies(last_active + idle_limit) &&
                    !(time_is_before_jiffies(last_active + rsn_idle_limit) &&
                      cur->sta_state != IEEE80211_STA_AUTHORIZED))
                        continue;

                sta_dbg(cur->sdata, "expiring inactive %sSTA %pM\n",
                        cur->sta_state != IEEE80211_STA_AUTHORIZED ?
                        "not authorized " : "", cur->sta.addr);

                ieee80211_send_deauth_disassoc(sdata, cur->sta.addr,
                                               ifibss->bssid,
                                               IEEE80211_STYPE_DEAUTH,
                                               WLAN_REASON_DEAUTH_LEAVING,
                                               true, frame_buf);
                WARN_ON(__sta_info_destroy(cur));
        }
}

/*
 * This function is called with state == IEEE80211_IBSS_MLME_JOINED
 *
 * It re-arms the IBSS timer for the next merge interval, expires inactive
 * stations and, when no scan completed within the last merge interval,
 * no IBSS station is active and the channel is not fixed, requests a scan
 * for other IBSS networks with the same SSID.
 */

static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_if_ibss *ibss = &sdata->u.ibss;
        unsigned long next_run;

        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        next_run = round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
        mod_timer(&ibss->timer, next_run);

        ieee80211_ibss_sta_expire(sdata);

        /* any one of these means there is no reason to scan right now */
        if (time_before(jiffies, ibss->last_scan_completed +
                       IEEE80211_IBSS_MERGE_INTERVAL) ||
            ieee80211_sta_active_ibss(sdata) ||
            ibss->fixed_channel)
                return;

        sdata_info(sdata,
                   "No active IBSS STAs - trying to scan for other IBSS networks with same SSID (merge)\n");

        ieee80211_request_ibss_scan(sdata, ibss->ssid, ibss->ssid_len,
                                    NULL, 0);
}

/*
 * Create a new IBSS network on the configured channel definition.
 *
 * Uses the configured BSSID when it is fixed, otherwise generates a
 * random one, then joins the new network through
 * __ieee80211_sta_join_ibss() with the configured beacon interval,
 * basic rates and capability bits.
 */
static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_if_ibss *ibss = &sdata->u.ibss;
        u8 bssid[ETH_ALEN];
        u16 capability;
        int idx;

        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        if (!ibss->fixed_bssid) {
                /* Generate random, not broadcast, locally administered BSSID. Mix in
                 * own MAC address to make sure that devices that do not have proper
                 * random number generator get different BSSID. */
                get_random_bytes(bssid, ETH_ALEN);
                for (idx = 0; idx < ETH_ALEN; idx++)
                        bssid[idx] ^= sdata->vif.addr[idx];
                /* clear bit 0x01 and set bit 0x02 of the first octet */
                bssid[0] = (bssid[0] & ~0x01) | 0x02;
        } else {
                memcpy(bssid, ibss->bssid, ETH_ALEN);
        }

        sdata_info(sdata, "Creating new IBSS network, BSSID %pM\n", bssid);

        capability = WLAN_CAPABILITY_IBSS |
                     (ibss->privacy ? WLAN_CAPABILITY_PRIVACY : 0);

        __ieee80211_sta_join_ibss(sdata, bssid, sdata->vif.bss_conf.beacon_int,
                                  &ibss->chandef, ibss->basic_rates,
                                  capability, 0, true);
}

/*
 * Collect the channels covered by a segment of @width around @center_freq.
 *
 * For widths up to 20 only @center_freq itself is looked up; for wider
 * segments the frequencies from center - width/2 + 10 to
 * center + width/2 - 10 are looked up in steps of 20.  Frequencies without
 * a channel on @wiphy are skipped.
 *
 * @wiphy: wiphy used for the channel lookup
 * @channels: output array
 * @channels_max: capacity of @channels
 * @center_freq: centre frequency of the segment
 * @width: segment width
 *
 * Returns the number of entries stored in @channels (at most
 * @channels_max).
 */
static unsigned int ibss_setup_channels(struct wiphy *wiphy,
                                        struct ieee80211_channel **channels,
                                        unsigned int channels_max,
                                        u32 center_freq, u32 width)
{
        u32 first = center_freq;
        u32 last = center_freq;
        unsigned int count = 0;
        u32 cur;

        if (width > 20) {
                first = center_freq - width / 2 + 10;
                last = center_freq + width / 2 - 10;
        }

        for (cur = first; cur <= last; cur += 20) {
                struct ieee80211_channel *found;

                found = ieee80211_get_channel(wiphy, cur);
                if (!found)
                        continue;

                /* output array is full: stop collecting */
                if (count >= channels_max)
                        break;

                channels[count++] = found;
        }

        return count;
}

/*
 * Build the list of channels to scan for a given channel definition.
 *
 * The width of @chandef is mapped to 40, 80 or 160, with 20 used for every
 * other width.  The channels of the first segment (center_freq1) are
 * collected, followed, for 80+80 definitions, by those of the second
 * segment (center_freq2).
 *
 * @wiphy: wiphy used for the channel lookup
 * @chandef: channel definition to cover
 * @channels: output array
 * @channels_max: capacity of @channels
 *
 * Returns the number of entries stored in @channels.
 */
static unsigned int
ieee80211_ibss_setup_scan_channels(struct wiphy *wiphy,
                                   const struct cfg80211_chan_def *chandef,
                                   struct ieee80211_channel **channels,
                                   unsigned int channels_max)
{
        u32 seg_width = 20;
        u32 second_center = 0;
        unsigned int filled;

        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_40:
                seg_width = 40;
                break;
        case NL80211_CHAN_WIDTH_80P80:
                /* two 80-wide segments; remember the second centre */
                second_center = chandef->center_freq2;
                seg_width = 80;
                break;
        case NL80211_CHAN_WIDTH_80:
                seg_width = 80;
                break;
        case NL80211_CHAN_WIDTH_160:
                seg_width = 160;
                break;
        default:
                break;
        }

        filled = ibss_setup_channels(wiphy, channels, channels_max,
                                     chandef->center_freq1, seg_width);
        if (!second_center)
                return filled;

        /* append the second segment behind the first one */
        return filled + ibss_setup_channels(wiphy, &channels[filled],
                                            channels_max - filled,
                                            second_center, seg_width);
}

/*
 * This function is called with state == IEEE80211_IBSS_MLME_SEARCH
 *
 * Try to find an IBSS to join.  Nothing is done while an IBSS is reported
 * active by ieee80211_sta_active_ibss().  Otherwise the known BSS entries
 * are searched for a matching IBSS (SSID, privacy, and optionally BSSID
 * and channel) and, if one is found, it is joined.  If none is found:
 *  - with both a fixed BSSID and a fixed channel, the IBSS is created
 *    right away;
 *  - if the last scan completed more than IEEE80211_SCAN_INTERVAL ago, a
 *    new scan is requested;
 *  - otherwise the timer is re-armed, and the IBSS is created once the
 *    join request is older than IEEE80211_IBSS_JOIN_TIMEOUT.
 */

static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
        struct ieee80211_local *local = sdata->local;
        struct cfg80211_bss *cbss;
        struct ieee80211_channel *chan = NULL;
        const u8 *bssid = NULL;
        int active_ibss;

        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        active_ibss = ieee80211_sta_active_ibss(sdata);
        ibss_dbg(sdata, "sta_find_ibss (active_ibss=%d)\n", active_ibss);

        if (active_ibss)
                return;

        /*
         * Build the lookup filter: bssid/chan stay NULL unless a fixed
         * BSSID / fixed channel is configured; a non-zero stored BSSID is
         * used as filter as well.
         */
        if (ifibss->fixed_bssid)
                bssid = ifibss->bssid;
        if (ifibss->fixed_channel)
                chan = ifibss->chandef.chan;
        if (!is_zero_ether_addr(ifibss->bssid))
                bssid = ifibss->bssid;
        cbss = cfg80211_get_bss(local->hw.wiphy, chan, bssid,
                                ifibss->ssid, ifibss->ssid_len,
                                IEEE80211_BSS_TYPE_IBSS,
                                IEEE80211_PRIVACY(ifibss->privacy));

        if (cbss) {
                struct ieee80211_bss *bss;

                bss = (void *)cbss->priv;
                ibss_dbg(sdata,
                         "sta_find_ibss: selected %pM current %pM\n",
                         cbss->bssid, ifibss->bssid);
                sdata_info(sdata,
                           "Selected IBSS BSSID %pM based on configured SSID\n",
                           cbss->bssid);

                ieee80211_sta_join_ibss(sdata, bss);
                /*
                 * presumably drops the reference taken by
                 * cfg80211_get_bss() above - TODO confirm
                 */
                ieee80211_rx_bss_put(local, bss);
                return;
        }

        /* if a fixed bssid and a fixed freq have been provided create the IBSS
         * directly and do not waste time scanning
         */
        if (ifibss->fixed_bssid && ifibss->fixed_channel) {
                sdata_info(sdata, "Created IBSS using preconfigured BSSID %pM\n",
                           bssid);
                ieee80211_sta_create_ibss(sdata);
                return;
        }


        ibss_dbg(sdata, "sta_find_ibss: did not try to join ibss\n");

        /* Selected IBSS not found in current scan results - try to scan */
        if (time_after(jiffies, ifibss->last_scan_completed +
                                        IEEE80211_SCAN_INTERVAL)) {
                struct ieee80211_channel *channels[8];
                unsigned int num;

                sdata_info(sdata, "Trigger new scan to find an IBSS to join\n");

                if (ifibss->fixed_channel) {
                        /* restrict the scan to the configured chandef */
                        num = ieee80211_ibss_setup_scan_channels(local->hw.wiphy,
                                                                 &ifibss->chandef,
                                                                 channels,
                                                                 ARRAY_SIZE(channels));
                        ieee80211_request_ibss_scan(sdata, ifibss->ssid,
                                                    ifibss->ssid_len, channels,
                                                    num);
                } else {
                        /* no channel list is passed in this case */
                        ieee80211_request_ibss_scan(sdata, ifibss->ssid,
                                                    ifibss->ssid_len, NULL, 0);
                }
        } else {
                int interval = IEEE80211_SCAN_INTERVAL;

                /* the join request timed out: create our own IBSS */
                if (time_after(jiffies, ifibss->ibss_join_req +
                               IEEE80211_IBSS_JOIN_TIMEOUT))
                        ieee80211_sta_create_ibss(sdata);

                /* re-arm the timer in either case */
                mod_timer(&ifibss->timer,
                          round_jiffies(jiffies + interval));
        }
}

/*
 * Handle a received probe request on an IBSS interface.
 *
 * A probe response (a copy of the stored presp head, with the destination
 * address replaced by the requester's address) is sent only if we are
 * joined, a probe response template exists, the frame is long enough, the
 * BSSID is ours or broadcast, and the SSID element is either empty
 * (wildcard) or equal to our SSID.  Requests to a multicast destination
 * are ignored when drv_tx_last_beacon() returns 0.
 *
 * @sdata: IBSS interface
 * @req: skb holding the probe request frame
 */
static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata,
                                        struct sk_buff *req)
{
        struct ieee80211_mgmt *mgmt = (void *)req->data;
        struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
        struct ieee80211_local *local = sdata->local;
        int tx_last_beacon, len = req->len;
        struct sk_buff *skb;
        struct beacon_data *presp;
        u8 *pos, *end;

        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        presp = sdata_dereference(ifibss->presp, sdata);

        /*
         * presumably 24 is the management header length and 2 the
         * id/length bytes of the SSID element read below - TODO confirm
         */
        if (ifibss->state != IEEE80211_IBSS_MLME_JOINED ||
            len < 24 + 2 || !presp)
                return;

        tx_last_beacon = drv_tx_last_beacon(local);

        ibss_dbg(sdata, "RX ProbeReq SA=%pM DA=%pM\n", mgmt->sa, mgmt->da);
        ibss_dbg(sdata, "\tBSSID=%pM (tx_last_beacon=%d)\n",
                 mgmt->bssid, tx_last_beacon);

        if (!tx_last_beacon && is_multicast_ether_addr(mgmt->da))
                return;

        /* only answer requests for our BSSID or the broadcast BSSID */
        if (!ether_addr_equal(mgmt->bssid, ifibss->bssid) &&
            !is_broadcast_ether_addr(mgmt->bssid))
                return;

        /* the first element must be an SSID element that fits the frame */
        end = ((u8 *) mgmt) + len;
        pos = mgmt->u.probe_req.variable;
        if (pos[0] != WLAN_EID_SSID ||
            pos + 2 + pos[1] > end) {
                ibss_dbg(sdata, "Invalid SSID IE in ProbeReq from %pM\n",
                         mgmt->sa);
                return;
        }
        /* pos[1] is the SSID length; zero means wildcard SSID */
        if (pos[1] != 0 &&
            (pos[1] != ifibss->ssid_len ||
             memcmp(pos + 2, ifibss->ssid, ifibss->ssid_len))) {
                /* Ignore ProbeReq for foreign SSID */
                return;
        }

        /* Reply with ProbeResp */
        skb = dev_alloc_skb(local->tx_headroom + presp->head_len);
        if (!skb)
                return;

        skb_reserve(skb, local->tx_headroom);
        skb_put_data(skb, presp->head, presp->head_len);

        /* address the response to the sender of the request */
        memcpy(((struct ieee80211_mgmt *) skb->data)->da, mgmt->sa, ETH_ALEN);
        ibss_dbg(sdata, "Sending ProbeResp to %pM\n", mgmt->sa);
        IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;

        /* avoid excessive retries for probe request to wildcard SSIDs */
        if (pos[1] == 0)
                IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_NO_ACK;

        ieee80211_tx_skb(sdata, skb);
}

/*
 * Parse the elements of a received beacon or probe response and pass the
 * frame on to ieee80211_rx_bss_info().
 *
 * Frames shorter than the fixed part preceding the elements are dropped,
 * as are frames for which element parsing returns NULL.
 *
 * @sdata: IBSS interface
 * @mgmt: the received management frame
 * @len: length of the frame
 * @rx_status: RX metadata of the frame
 */
static
void ieee80211_rx_mgmt_probe_beacon(struct ieee80211_sub_if_data *sdata,
                                    struct ieee80211_mgmt *mgmt, size_t len,
                                    struct ieee80211_rx_status *rx_status)
{
        struct ieee802_11_elems *parsed;
        size_t fixed_len;

        BUILD_BUG_ON(offsetof(typeof(mgmt->u.probe_resp), variable) !=
                     offsetof(typeof(mgmt->u.beacon), variable));

        /*
         * The frame is either a beacon or a probe response; the assertion
         * above guarantees the elements start at the same offset in both.
         */
        fixed_len = offsetof(struct ieee80211_mgmt, u.probe_resp.variable);
        if (fixed_len > len)
                return;

        parsed = ieee802_11_parse_elems(mgmt->u.probe_resp.variable,
                                        len - fixed_len, false, NULL);
        if (!parsed)
                return;

        ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, parsed);
        kfree(parsed);
}

/*
 * Dispatch a queued management frame received on an IBSS interface.
 *
 * Frames are ignored while no SSID is configured.  Probe requests,
 * beacons/probe responses, auth and deauth frames go to their handlers;
 * action frames are only handled for the spectrum management category,
 * after their elements were parsed without error.
 *
 * @sdata: IBSS interface
 * @skb: the received frame
 */
void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
                                   struct sk_buff *skb)
{
        struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) skb->data;
        struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb);
        struct ieee802_11_elems *elems;
        int ies_len;
        u16 stype;

        if (!sdata->u.ibss.ssid_len)
                return; /* not ready to merge yet */

        stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE;

        if (stype == IEEE80211_STYPE_PROBE_REQ) {
                ieee80211_rx_mgmt_probe_req(sdata, skb);
        } else if (stype == IEEE80211_STYPE_PROBE_RESP ||
                   stype == IEEE80211_STYPE_BEACON) {
                ieee80211_rx_mgmt_probe_beacon(sdata, mgmt, skb->len,
                                               rx_status);
        } else if (stype == IEEE80211_STYPE_AUTH) {
                ieee80211_rx_mgmt_auth_ibss(sdata, mgmt, skb->len);
        } else if (stype == IEEE80211_STYPE_DEAUTH) {
                ieee80211_rx_mgmt_deauth_ibss(sdata, mgmt, skb->len);
        } else if (stype == IEEE80211_STYPE_ACTION &&
                   mgmt->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT) {
                ies_len = skb->len -
                          offsetof(struct ieee80211_mgmt,
                                   u.action.u.chan_switch.variable);

                /* frame too short to carry any elements */
                if (ies_len < 0)
                        return;

                elems = ieee802_11_parse_elems(
                        mgmt->u.action.u.chan_switch.variable,
                        ies_len, true, NULL);

                if (elems && !elems->parse_error)
                        ieee80211_rx_mgmt_spectrum_mgmt(sdata, mgmt,
                                                        skb->len,
                                                        rx_status,
                                                        elems);
                kfree(elems);
        }
}

/*
 * Interface work for IBSS mode.
 *
 * First finishes every station queued on the incomplete_stations list,
 * then runs the state machine step for the current state: search for an
 * IBSS to join (MLME_SEARCH) or do merge housekeeping (MLME_JOINED).
 *
 * @sdata: IBSS interface
 */
void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
        struct sta_info *sta;

        /*
         * Work could be scheduled after scan or similar
         * when we aren't even joined (or trying) with a
         * network.
         */
        if (!ifibss->ssid_len)
                return;

        /*
         * Drain the list one entry at a time: the lock is dropped while an
         * entry is being finished and re-taken before the list is checked
         * again.
         */
        spin_lock_bh(&ifibss->incomplete_lock);
        while (!list_empty(&ifibss->incomplete_stations)) {
                sta = list_first_entry(&ifibss->incomplete_stations,
                                       struct sta_info, list);
                list_del(&sta->list);
                spin_unlock_bh(&ifibss->incomplete_lock);

                ieee80211_ibss_finish_sta(sta);
                /*
                 * NOTE(review): no matching rcu_read_lock() is visible
                 * here; presumably ieee80211_ibss_finish_sta() returns with
                 * the RCU read lock held - confirm against its definition.
                 */
                rcu_read_unlock();
                spin_lock_bh(&ifibss->incomplete_lock);
        }
        spin_unlock_bh(&ifibss->incomplete_lock);

        switch (ifibss->state) {
        case IEEE80211_IBSS_MLME_SEARCH:
                ieee80211_sta_find_ibss(sdata);
                break;
        case IEEE80211_IBSS_MLME_JOINED:
                ieee80211_sta_merge_ibss(sdata);
                break;
        default:
                /* any other state is unexpected here */
                WARN_ON(1);
                break;
        }
}

/*
 * IBSS timer callback: look up the interface that embeds the timer and
 * queue its work item.
 */
static void ieee80211_ibss_timer(struct timer_list *t)
{
        struct ieee80211_sub_if_data *sdata;

        sdata = from_timer(sdata, t, u.ibss.timer);

        wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work);
}

/*
 * Initialise the IBSS part of an interface: the incomplete-stations list
 * and its lock, the IBSS timer and the CSA connection-drop work.
 */
void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_if_ibss *ibss = &sdata->u.ibss;

        INIT_LIST_HEAD(&ibss->incomplete_stations);
        spin_lock_init(&ibss->incomplete_lock);

        timer_setup(&ibss->timer, ieee80211_ibss_timer, 0);
        wiphy_work_init(&ibss->csa_connection_drop_work,
                        ieee80211_csa_connection_drop_work);
}

/*
 * Scan finished notification: record the current jiffies as
 * last_scan_completed on every running ad-hoc interface of @local.
 */
void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local)
{
        struct ieee80211_sub_if_data *iface;

        lockdep_assert_wiphy(local->hw.wiphy);

        list_for_each_entry(iface, &local->interfaces, list) {
                if (ieee80211_sdata_running(iface) &&
                    iface->vif.type == NL80211_IFTYPE_ADHOC)
                        iface->u.ibss.last_scan_completed = jiffies;
        }
}

/*
 * Start joining (or creating) an IBSS as described by @params.
 *
 * Validates the requested channel definition (frequency offset, DFS
 * requirements, interface combinations), copies the join parameters into
 * sdata->u.ibss and the interface's bss_conf, sets the IBSS state to
 * IEEE80211_IBSS_MLME_SEARCH and queues sdata->work.
 *
 * @sdata: IBSS interface
 * @params: join parameters
 *
 * Returns 0 on success, -EOPNOTSUPP if the channel has a frequency offset,
 * -EINVAL if DFS is required but userspace does not handle it, or the
 * negative error returned by cfg80211_chandef_dfs_required() or
 * ieee80211_check_combinations().
 */
int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
                        struct cfg80211_ibss_params *params)
{
        u64 changed = 0;
        u32 rate_flags;
        struct ieee80211_supported_band *sband;
        enum ieee80211_chanctx_mode chanmode;
        struct ieee80211_local *local = sdata->local;
        int radar_detect_width = 0;
        int i;
        int ret;

        lockdep_assert_wiphy(local->hw.wiphy);

        if (params->chandef.chan->freq_offset) {
                /* this may work, but is untested */
                return -EOPNOTSUPP;
        }

        ret = cfg80211_chandef_dfs_required(local->hw.wiphy,
                                            &params->chandef,
                                            sdata->wdev.iftype);
        if (ret < 0)
                return ret;

        /* ret > 0: DFS is required, which needs userspace support */
        if (ret > 0) {
                if (!params->userspace_handles_dfs)
                        return -EINVAL;
                radar_detect_width = BIT(params->chandef.width);
        }

        /* shared context only for a fixed channel that needs no DFS */
        chanmode = (params->channel_fixed && !ret) ?
                IEEE80211_CHANCTX_SHARED : IEEE80211_CHANCTX_EXCLUSIVE;

        ret = ieee80211_check_combinations(sdata, &params->chandef, chanmode,
                                           radar_detect_width);
        if (ret < 0)
                return ret;

        if (params->bssid) {
                memcpy(sdata->u.ibss.bssid, params->bssid, ETH_ALEN);
                sdata->u.ibss.fixed_bssid = true;
        } else
                sdata->u.ibss.fixed_bssid = false;

        sdata->u.ibss.privacy = params->privacy;
        sdata->u.ibss.control_port = params->control_port;
        sdata->u.ibss.userspace_handles_dfs = params->userspace_handles_dfs;
        sdata->u.ibss.basic_rates = params->basic_rates;
        sdata->u.ibss.last_scan_completed = jiffies;

        /* fix basic_rates if channel does not support these rates */
        rate_flags = ieee80211_chandef_rate_flags(&params->chandef);
        sband = local->hw.wiphy->bands[params->chandef.chan->band];
        for (i = 0; i < sband->n_bitrates; i++) {
                if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
                        sdata->u.ibss.basic_rates &= ~BIT(i);
        }
        memcpy(sdata->vif.bss_conf.mcast_rate, params->mcast_rate,
               sizeof(params->mcast_rate));

        sdata->vif.bss_conf.beacon_int = params->beacon_interval;

        sdata->u.ibss.chandef = params->chandef;
        sdata->u.ibss.fixed_channel = params->channel_fixed;

        /*
         * Keep a private copy of the extra IEs.
         * NOTE(review): a kmemdup() failure is not reported; the join then
         * proceeds without recording ie_len - confirm this best-effort
         * behaviour is intended.
         */
        if (params->ie) {
                sdata->u.ibss.ie = kmemdup(params->ie, params->ie_len,
                                           GFP_KERNEL);
                if (sdata->u.ibss.ie)
                        sdata->u.ibss.ie_len = params->ie_len;
        }

        sdata->u.ibss.state = IEEE80211_IBSS_MLME_SEARCH;
        sdata->u.ibss.ibss_join_req = jiffies;

        memcpy(sdata->u.ibss.ssid, params->ssid, params->ssid_len);
        sdata->u.ibss.ssid_len = params->ssid_len;

        memcpy(&sdata->u.ibss.ht_capa, &params->ht_capa,
               sizeof(sdata->u.ibss.ht_capa));
        memcpy(&sdata->u.ibss.ht_capa_mask, &params->ht_capa_mask,
               sizeof(sdata->u.ibss.ht_capa_mask));

        /*
         * 802.11n-2009 9.13.3.1: In an IBSS, the HT Protection field is
         * reserved, but an HT STA shall protect HT transmissions as though
         * the HT Protection field were set to non-HT mixed mode.
         *
         * In an IBSS, the RIFS Mode field of the HT Operation element is
         * also reserved, but an HT STA shall operate as though this field
         * were set to 1.
         */

        sdata->vif.bss_conf.ht_operation_mode |=
                  IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED
                | IEEE80211_HT_PARAM_RIFS_MODE;

        /* notify about the HT and multicast-rate settings made above */
        changed |= BSS_CHANGED_HT | BSS_CHANGED_MCAST_RATE;
        ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed);

        sdata->deflink.smps_mode = IEEE80211_SMPS_OFF;
        sdata->deflink.needed_rx_chains = local->rx_chains;
        sdata->control_port_over_nl80211 = params->control_port_over_nl80211;

        /* queue the interface work now that the state is set up */
        wiphy_work_queue(local->hw.wiphy, &sdata->work);

        return 0;
}

/*
 * Leave the current IBSS.
 *
 * Disconnects, clears the stored SSID length and BSSID, frees the extra
 * IEs, resets the HT capability overrides, waits for an RCU grace period,
 * then purges the interface's skb queue and stops the IBSS timer.
 *
 * @sdata: IBSS interface
 *
 * Always returns 0.
 */
int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_if_ibss *ibss = &sdata->u.ibss;

        ieee80211_ibss_disconnect(sdata);
        ibss->ssid_len = 0;
        eth_zero_addr(ibss->bssid);

        /* remove beacon */
        kfree(ibss->ie);
        ibss->ie = NULL;
        ibss->ie_len = 0;

        /* on the next join, re-program HT parameters */
        memset(&ibss->ht_capa, 0, sizeof(ibss->ht_capa));
        memset(&ibss->ht_capa_mask, 0, sizeof(ibss->ht_capa_mask));

        synchronize_rcu();

        skb_queue_purge(&sdata->skb_queue);

        del_timer_sync(&ibss->timer);

        return 0;
}































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Definitions and Declarations for tuple.
 *
 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
 *        - generalize L3 protocol dependent part.
 *
 * Derived from include/linux/netfilter_ipv4/ip_conntrack_tuple.h
 */

#ifndef _NF_CONNTRACK_TUPLE_H
#define _NF_CONNTRACK_TUPLE_H

#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/nf_conntrack_tuple_common.h>
#include <linux/list_nulls.h>

/* A `tuple' is a structure containing the information to uniquely
 * identify a connection, i.e. if two packets have the same tuple, they
 * are in the same connection; if not, they are not.
 *
 * We divide the structure along "manipulatable" and
 * "non-manipulatable" lines, for the benefit of the NAT code.
 */

/* Number of elements in the `all' array of union nf_inet_addr. */
#define NF_CT_TUPLE_L3SIZE        ARRAY_SIZE(((union nf_inet_addr *)NULL)->all)

/*
 * The manipulable part of the tuple: a layer 3 address, the
 * protocol-specific part (union nf_conntrack_man_proto) and the layer 3
 * protocol number.
 */
struct nf_conntrack_man {
        /* Layer 3 address */
        union nf_inet_addr u3;
        /* Protocol-specific part */
        union nf_conntrack_man_proto u;
        /* Layer 3 protocol */
        u_int16_t l3num;
};

/*
 * This contains the information to distinguish a connection.
 *
 * src is the manipulable part (struct nf_conntrack_man); dst holds the
 * fixed parts together with the protocol number and the direction.
 */
struct nf_conntrack_tuple {
        struct nf_conntrack_man src;

        /* These are the parts of the tuple which are fixed. */
        struct {
                /* Layer 3 address */
                union nf_inet_addr u3;
                /*
                 * Protocol-specific part; `all' overlays the per-protocol
                 * members and is what the comparison helpers use.
                 */
                union {
                        /* Add other protocols here. */
                        __be16 all;

                        struct {
                                __be16 port;
                        } tcp;
                        struct {
                                __be16 port;
                        } udp;
                        struct {
                                u_int8_t type, code;
                        } icmp;
                        struct {
                                __be16 port;
                        } dccp;
                        struct {
                                __be16 port;
                        } sctp;
                        struct {
                                __be16 key;
                        } gre;
                } u;

                /* The protocol. */
                u_int8_t protonum;

                /*
                 * The direction must be ignored for the tuplehash; this
                 * empty member marks the offset where the hashed part ends.
                 */
                struct { } __nfct_hash_offsetend;

                /* The direction (for tuplehash) */
                u_int8_t dir;
        } dst;
};

/*
 * Mask for a tuple.  It only has a src part, made of a layer 3 address
 * and the protocol-specific part; there is no l3num and no dst part.
 */
struct nf_conntrack_tuple_mask {
        struct {
                union nf_inet_addr u3;
                union nf_conntrack_man_proto u;
        } src;
};

/*
 * Debug helper: when DEBUG is defined, print the tuple pointer, the
 * protocol number and the source and destination as IPv4 address plus
 * u.all (converted with ntohs()).  Without DEBUG the body is empty.
 */
static inline void nf_ct_dump_tuple_ip(const struct nf_conntrack_tuple *t)
{
#ifdef DEBUG
        printk("tuple %p: %u %pI4:%hu -> %pI4:%hu\n",
               t, t->dst.protonum,
               &t->src.u3.ip, ntohs(t->src.u.all),
               &t->dst.u3.ip, ntohs(t->dst.u.all));
#endif
}

/*
 * Debug helper: when DEBUG is defined, print the tuple pointer, the
 * protocol number and the source and destination as IPv6 address plus
 * u.all (converted with ntohs()).  Without DEBUG the body is empty.
 */
static inline void nf_ct_dump_tuple_ipv6(const struct nf_conntrack_tuple *t)
{
#ifdef DEBUG
        printk("tuple %p: %u %pI6 %hu -> %pI6 %hu\n",
               t, t->dst.protonum,
               t->src.u3.all, ntohs(t->src.u.all),
               t->dst.u3.all, ntohs(t->dst.u.all));
#endif
}

/*
 * Dump a tuple using the printer that matches its layer-3 family
 * (src.l3num).  Families other than AF_INET and AF_INET6 are ignored.
 */
static inline void nf_ct_dump_tuple(const struct nf_conntrack_tuple *t)
{
        if (t->src.l3num == AF_INET)
                nf_ct_dump_tuple_ip(t);
        else if (t->src.l3num == AF_INET6)
                nf_ct_dump_tuple_ipv6(t);
}

/*
 * If we're the first tuple, it's the original dir.
 *
 * @h must point to an object with a `tuple` member of type
 * struct nf_conntrack_tuple (e.g. struct nf_conntrack_tuple_hash); the macro
 * yields that tuple's dst.dir cast to enum ip_conntrack_dir.
 */
#define NF_CT_DIRECTION(h)                                                \
        ((enum ip_conntrack_dir)(h)->tuple.dst.dir)

/* Connections have two entries in the hash table: one for each way */
struct nf_conntrack_tuple_hash {
        /* Linkage into an hlist_nulls chain. */
        struct hlist_nulls_node hnnode;
        /* The tuple this entry represents; NF_CT_DIRECTION() reads its dst.dir. */
        struct nf_conntrack_tuple tuple;
};

static inline bool __nf_ct_tuple_src_equal(const struct nf_conntrack_tuple *t1,
                                           const struct nf_conntrack_tuple *t2)
{
        return (nf_inet_addr_cmp(&t1->src.u3, &t2->src.u3) &&
                t1->src.u.all == t2->src.u.all &&
                t1->src.l3num == t2->src.l3num);
}

static inline bool __nf_ct_tuple_dst_equal(const struct nf_conntrack_tuple *t1,
                                           const struct nf_conntrack_tuple *t2)
{
        return (nf_inet_addr_cmp(&t1->dst.u3, &t2->dst.u3) &&
                t1->dst.u.all == t2->dst.u.all &&
                t1->dst.protonum == t2->dst.protonum);
}

static inline bool nf_ct_tuple_equal(const struct nf_conntrack_tuple *t1,
                                     const struct nf_conntrack_tuple *t2)
{
        return __nf_ct_tuple_src_equal(t1, t2) &&
               __nf_ct_tuple_dst_equal(t1, t2);
}

static inline bool
nf_ct_tuple_mask_equal(const struct nf_conntrack_tuple_mask *m1,
                       const struct nf_conntrack_tuple_mask *m2)
{
        return (nf_inet_addr_cmp(&m1->src.u3, &m2->src.u3) &&
                m1->src.u.all == m2->src.u.all);
}

/*
 * Compare the source halves of @t1 and @t2 under @mask.
 *
 * Address words and the protocol identifier are compared only on the bits
 * set in @mask; the layer-3 family and the protocol number must match
 * exactly.  Returns true on a match.
 */
static inline bool
nf_ct_tuple_src_mask_cmp(const struct nf_conntrack_tuple *t1,
                         const struct nf_conntrack_tuple *t2,
                         const struct nf_conntrack_tuple_mask *mask)
{
        int word;

        /* Any differing bit that the mask selects is a mismatch. */
        for (word = 0; word < NF_CT_TUPLE_L3SIZE; word++)
                if ((t1->src.u3.all[word] ^ t2->src.u3.all[word]) &
                    mask->src.u3.all[word])
                        return false;

        if ((t1->src.u.all ^ t2->src.u.all) & mask->src.u.all)
                return false;

        /* These two fields are never masked. */
        return t1->src.l3num == t2->src.l3num &&
               t1->dst.protonum == t2->dst.protonum;
}

static inline bool
nf_ct_tuple_mask_cmp(const struct nf_conntrack_tuple *t,
                     const struct nf_conntrack_tuple *tuple,
                     const struct nf_conntrack_tuple_mask *mask)
{
        return nf_ct_tuple_src_mask_cmp(t, tuple, mask) &&
               __nf_ct_tuple_dst_equal(t, tuple);
}

#endif /* _NF_CONNTRACK_TUPLE_H */































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/*
 * include/net/tipc.h: Include file for TIPC message header routines
 *
 * Copyright (c) 2017 Ericsson AB
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _TIPC_HDR_H
#define _TIPC_HDR_H

#include <linux/random.h>

/*
 * Bits tested in host-order header word 0 by tipc_hdr_rps_key(); a message
 * is treated as a keepalive only when all of them are set.
 */
#define KEEPALIVE_MSG_MASK 0x0e080000  /* LINK_PROTOCOL + MSG_IS_KEEPALIVE */

/*
 * Leading four big-endian 32-bit words of a TIPC message header.
 * tipc_hdr_rps_key() inspects w[0] and may return w[3].
 */
struct tipc_basic_hdr {
        __be32 w[4];
};

/*
 * Derive a steering key from a TIPC header.
 *
 * Messages that are not keepalives yield header word 3 (the source node
 * identity) unchanged.  Keepalive messages (PROBE/PROBE_REPLY) get a fresh
 * random key so that they are spread across the cores.
 */
static inline __be32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr)
{
        const bool is_keepalive =
                (ntohl(hdr->w[0]) & KEEPALIVE_MSG_MASK) == KEEPALIVE_MSG_MASK;
        __be32 rnd;

        if (likely(!is_keepalive))
                return hdr->w[3];

        get_random_bytes(&rnd, sizeof(rnd));
        return rnd;
}

#endif



















































































































































































































    1 





    1 









































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/proc/net.c
 *
 *  Copyright (C) 2007
 *
 *  Author: Eric Biederman <ebiederm@xmission.com>
 *
 *  proc net directory handling functions
 */
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/mount.h>
#include <linux/nsproxy.h>
#include <linux/uidgid.h>
#include <net/net_namespace.h>
#include <linux/seq_file.h>

#include "internal.h"

/* Return the ->data pointer of @pde's parent entry as a struct net. */
static inline struct net *PDE_NET(struct proc_dir_entry *pde)
{
        struct proc_dir_entry *dir = pde->parent;

        return dir->data;
}

/*
 * Look up the network namespace behind a proc inode and pass it through
 * maybe_get_net(); callers treat a NULL result as "namespace unavailable".
 */
static struct net *get_proc_net(const struct inode *inode)
{
        struct proc_dir_entry *pde = PDE(inode);

        return maybe_get_net(PDE_NET(pde));
}

static int seq_open_net(struct inode *inode, struct file *file)
{
        unsigned int state_size = PDE(inode)->state_size;
        struct seq_net_private *p;
        struct net *net;

        WARN_ON_ONCE(state_size < sizeof(*p));

        if (file->f_mode & FMODE_WRITE && !PDE(inode)->write)
                return -EACCES;

        net = get_proc_net(inode);
        if (!net)
                return -ENXIO;

        p = __seq_open_private(file, PDE(inode)->seq_ops, state_size);
        if (!p) {
                put_net(net);
                return -ENOMEM;
        }
#ifdef CONFIG_NET_NS
        p->net = net;
        netns_tracker_alloc(net, &p->ns_tracker, GFP_KERNEL);
#endif
        return 0;
}

/*
 * Drop the namespace reference held by an open seq_file: the tracked
 * reference stored in the private state with CONFIG_NET_NS, otherwise a
 * reference on init_net.
 */
static void seq_file_net_put_net(struct seq_file *seq)
{
#ifndef CONFIG_NET_NS
        put_net(&init_net);
#else
        struct seq_net_private *p = seq->private;

        put_net_track(p->net, &p->ns_tracker);
#endif
}

/*
 * ->proc_release counterpart of seq_open_net(): drop the namespace
 * reference, then free the seq_file private state.  Always returns 0.
 */
static int seq_release_net(struct inode *ino, struct file *f)
{
        seq_file_net_put_net(f->private_data);
        seq_release_private(ino, f);

        return 0;
}

/*
 * proc_ops installed by proc_create_net_data() and
 * proc_create_net_data_write(): seq_file based reading with the open and
 * release hooks that manage the network namespace reference.
 */
static const struct proc_ops proc_net_seq_ops = {
        .proc_open        = seq_open_net,
        .proc_read        = seq_read,
        .proc_write        = proc_simple_write,
        .proc_lseek        = seq_lseek,
        .proc_release        = seq_release_net,
};

/*
 * Initialise seq_net_private state for a BPF iterator: with CONFIG_NET_NS,
 * take a tracked reference on the current task's network namespace and
 * store it in @priv_data.  @aux is not used.  Always returns 0.
 */
int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux)
{
#ifdef CONFIG_NET_NS
        struct seq_net_private *priv = priv_data;
        struct net *net = current->nsproxy->net_ns;

        priv->net = get_net_track(net, &priv->ns_tracker, GFP_KERNEL);
#endif
        return 0;
}

/*
 * Undo bpf_iter_init_seq_net(): with CONFIG_NET_NS, release the tracked
 * namespace reference stored in @priv_data.
 */
void bpf_iter_fini_seq_net(void *priv_data)
{
#ifdef CONFIG_NET_NS
        struct seq_net_private *priv = priv_data;

        put_net_track(priv->net, &priv->ns_tracker);
#endif
}

/*
 * Create a read-only, network-namespace aware seq_file proc entry.
 *
 * @ops supplies the seq_file iteration callbacks and @state_size the size of
 * the per-open private state.  Returns the result of proc_register(), or
 * NULL when the entry could not be created.
 */
struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
                struct proc_dir_entry *parent, const struct seq_operations *ops,
                unsigned int state_size, void *data)
{
        struct proc_dir_entry *pde;

        pde = proc_create_reg(name, mode, &parent, data);
        if (pde == NULL)
                return NULL;

        pde_force_lookup(pde);
        pde->state_size = state_size;
        pde->seq_ops = ops;
        pde->proc_ops = &proc_net_seq_ops;

        return proc_register(parent, pde);
}
EXPORT_SYMBOL_GPL(proc_create_net_data);

/**
 * proc_create_net_data_write - Create a writable net_ns-specific proc file
 * @name: The name of the file.
 * @mode: The file's access mode.
 * @parent: The parent directory in which to create.
 * @ops: The seq_file ops with which to read the file.
 * @write: The write method with which to 'modify' the file.
 * @state_size: The size of the per-file private state to allocate.
 * @data: Data for retrieval by pde_data().
 *
 * Creates a network-namespaced proc file called @name, with access mode
 * @mode, inside @parent.  Reading the file presents a series of elements;
 * writing to it has whatever effect @write implements.
 *
 * Reads are served through the seq_file interface: the callbacks in @ops
 * iterate over the items and produce the readable content.
 *
 * @write receives the written data in a kernel-space scratch buffer that has
 * a NUL appended for convenience.  It may modify that buffer and should
 * return 0 on success.
 *
 * From the @show and @write functions, @data can be obtained by calling
 * pde_data() on the file inode.  The network namespace must be obtained by
 * calling seq_file_net() on the seq_file struct.
 *
 * Return: the result of proc_register(), or NULL if the entry could not be
 * created.
 */
struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode,
                                                  struct proc_dir_entry *parent,
                                                  const struct seq_operations *ops,
                                                  proc_write_t write,
                                                  unsigned int state_size, void *data)
{
        struct proc_dir_entry *pde;

        pde = proc_create_reg(name, mode, &parent, data);
        if (pde == NULL)
                return NULL;

        pde_force_lookup(pde);
        pde->write = write;
        pde->state_size = state_size;
        pde->seq_ops = ops;
        pde->proc_ops = &proc_net_seq_ops;

        return proc_register(parent, pde);
}
EXPORT_SYMBOL_GPL(proc_create_net_data_write);

static int single_open_net(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *de = PDE(inode);
        struct net *net;
        int err;

        net = get_proc_net(inode);
        if (!net)
                return -ENXIO;

        err = single_open(file, de->single_show, net);
        if (err)
                put_net(net);
        return err;
}

/*
 * ->proc_release counterpart of single_open_net(): drop the namespace
 * reference kept in the seq_file private pointer, then release the file.
 */
static int single_release_net(struct inode *ino, struct file *f)
{
        struct seq_file *seq = f->private_data;
        struct net *net = seq->private;

        put_net(net);

        return single_release(ino, f);
}

/*
 * proc_ops installed by proc_create_net_single() and
 * proc_create_net_single_write(): single-record seq_file reading with the
 * open and release hooks that manage the network namespace reference.
 */
static const struct proc_ops proc_net_single_ops = {
        .proc_open        = single_open_net,
        .proc_read        = seq_read,
        .proc_write        = proc_simple_write,
        .proc_lseek        = seq_lseek,
        .proc_release        = single_release_net,
};

/*
 * Create a read-only, network-namespace aware proc entry whose content is
 * produced by a single @show callback.  Returns the result of
 * proc_register(), or NULL when the entry could not be created.
 */
struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
                struct proc_dir_entry *parent,
                int (*show)(struct seq_file *, void *), void *data)
{
        struct proc_dir_entry *pde;

        pde = proc_create_reg(name, mode, &parent, data);
        if (pde == NULL)
                return NULL;

        pde_force_lookup(pde);
        pde->single_show = show;
        pde->proc_ops = &proc_net_single_ops;

        return proc_register(parent, pde);
}
EXPORT_SYMBOL_GPL(proc_create_net_single);

/**
 * proc_create_net_single_write - Create a writable net_ns-specific proc file
 * @name: The name of the file.
 * @mode: The file's access mode.
 * @parent: The parent directory in which to create.
 * @show: The seqfile show method with which to read the file.
 * @write: The write method with which to 'modify' the file.
 * @data: Data for retrieval by pde_data().
 *
 * Creates a network-namespaced proc file called @name, with access mode
 * @mode, inside @parent.  Reading the file presents a single element rather
 * than a series; writing to it has whatever effect @write implements.
 *
 * Reads are served through the seq_file interface: @show is called to
 * produce the readable content.
 *
 * @write receives the written data in a kernel-space scratch buffer that has
 * a NUL appended for convenience.  It may modify that buffer and should
 * return 0 on success.
 *
 * From the @show and @write functions, @data can be obtained by calling
 * pde_data() on the file inode.  The network namespace must be obtained by
 * calling seq_file_single_net() on the seq_file struct.
 *
 * Return: the result of proc_register(), or NULL if the entry could not be
 * created.
 */
struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode,
                                                    struct proc_dir_entry *parent,
                                                    int (*show)(struct seq_file *, void *),
                                                    proc_write_t write,
                                                    void *data)
{
        struct proc_dir_entry *pde;

        pde = proc_create_reg(name, mode, &parent, data);
        if (pde == NULL)
                return NULL;

        pde_force_lookup(pde);
        pde->write = write;
        pde->single_show = show;
        pde->proc_ops = &proc_net_single_ops;

        return proc_register(parent, pde);
}
EXPORT_SYMBOL_GPL(proc_create_net_single_write);

/*
 * Return a referenced network namespace for the task that the proc inode
 * @dir belongs to, or NULL when the task is gone or has no nsproxy.
 *
 * The task is looked up under RCU and its nsproxy is read under task_lock().
 * The caller must drop the reference with put_net().
 */
static struct net *get_proc_task_net(struct inode *dir)
{
        struct net *net = NULL;
        struct task_struct *task;

        rcu_read_lock();
        task = pid_task(proc_pid(dir), PIDTYPE_PID);
        if (task) {
                struct nsproxy *ns;

                task_lock(task);
                ns = task->nsproxy;
                if (ns)
                        net = get_net(ns->net_ns);
                task_unlock(task);
        }
        rcu_read_unlock();

        return net;
}

/*
 * ->lookup for the per-task net directory: resolve @dentry inside the proc
 * tree of the task's network namespace.  Returns ERR_PTR(-ENOENT) when no
 * namespace can be obtained for the task.
 */
static struct dentry *proc_tgid_net_lookup(struct inode *dir,
                struct dentry *dentry, unsigned int flags)
{
        struct net *net = get_proc_task_net(dir);
        struct dentry *found;

        if (!net)
                return ERR_PTR(-ENOENT);

        found = proc_lookup_de(dir, dentry, net->proc_net);
        put_net(net);

        return found;
}

/*
 * ->getattr for the per-task net directory: fill @stat generically (using
 * nop_mnt_idmap; @idmap is not consulted) and, when the task's network
 * namespace is available, report the link count of that namespace's proc
 * net entry.  Always returns 0.
 */
static int proc_tgid_net_getattr(struct mnt_idmap *idmap,
                                 const struct path *path, struct kstat *stat,
                                 u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct net *net = get_proc_task_net(inode);

        generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);

        if (!net)
                return 0;

        stat->nlink = net->proc_net->nlink;
        put_net(net);

        return 0;
}

/*
 * Inode operations that resolve lookups and attributes against the network
 * namespace of the owning task (see proc_tgid_net_lookup() and
 * proc_tgid_net_getattr()).
 * NOTE(review): presumably installed on the /proc/<pid>/net directory inode;
 * the users are outside this file - confirm.
 */
const struct inode_operations proc_net_inode_operations = {
        .lookup                = proc_tgid_net_lookup,
        .getattr        = proc_tgid_net_getattr,
        .setattr        = proc_setattr,
};

/*
 * ->iterate_shared for the per-task net directory: list the proc net tree
 * of the task's network namespace.  Returns -EINVAL when no namespace can
 * be obtained for the task.
 */
static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
{
        struct net *net = get_proc_task_net(file_inode(file));
        int ret;

        if (!net)
                return -EINVAL;

        ret = proc_readdir_de(file, ctx, net->proc_net);
        put_net(net);

        return ret;
}

/*
 * Directory file operations whose listing comes from the network namespace
 * of the owning task (see proc_tgid_net_readdir()).
 * NOTE(review): presumably paired with proc_net_inode_operations on the
 * /proc/<pid>/net directory; the users are outside this file - confirm.
 */
const struct file_operations proc_net_operations = {
        .llseek                = generic_file_llseek,
        .read                = generic_read_dir,
        .iterate_shared        = proc_tgid_net_readdir,
};

/*
 * Per-namespace init: build the "net" anchor entry and its "stat"
 * subdirectory, then publish both in @net.
 *
 * Returns 0 on success, -ENOMEM if the anchor cannot be allocated, or
 * -EEXIST if the "stat" directory cannot be created (the anchor is freed
 * again in that case).
 */
static __net_init int proc_net_ns_init(struct net *net)
{
        struct proc_dir_entry *netd, *net_statd;
        kuid_t uid;
        kgid_t gid;

        /*
         * The entry allocated here only anchors the /proc/${pid}/net
         * hierarchy.  No inode is ever instantiated for it
         * (PDE(inode) == net->proc_net never happens), so a fully zeroed
         * allocation is sufficient.  The net->proc_net_stat inode, by
         * contrast, is instantiated normally.
         */
        netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL);
        if (!netd)
                return -ENOMEM;

        netd->subdir = RB_ROOT;
        netd->data = net;
        netd->nlink = 2;
        netd->namelen = 3;
        netd->parent = &proc_root;
        netd->name = netd->inline_name;
        memcpy(netd->name, "net", 4);

        /* Own the entry by the namespace's root, if that id is mappable. */
        uid = make_kuid(net->user_ns, 0);
        if (!uid_valid(uid))
                uid = netd->uid;

        gid = make_kgid(net->user_ns, 0);
        if (!gid_valid(gid))
                gid = netd->gid;

        proc_set_user(netd, uid, gid);

        /* Seed dentry revalidation for /proc/${pid}/net */
        pde_force_lookup(netd);

        net_statd = proc_net_mkdir(net, "stat", netd);
        if (!net_statd) {
                pde_free(netd);
                return -EEXIST;
        }

        net->proc_net = netd;
        net->proc_net_stat = net_statd;

        return 0;
}

/*
 * Per-namespace exit: remove the "stat" directory and free the "net"
 * anchor entry set up by proc_net_ns_init().
 */
static __net_exit void proc_net_ns_exit(struct net *net)
{
        struct proc_dir_entry *netd = net->proc_net;

        remove_proc_entry("stat", netd);
        pde_free(netd);
}

/* Per-namespace init/exit hooks; registered by proc_net_init(). */
static struct pernet_operations __net_initdata proc_net_ns_ops = {
        .init = proc_net_ns_init,
        .exit = proc_net_ns_exit,
};

/*
 * Boot-time setup: create the "net" symlink pointing at "self/net" and
 * register the per-namespace operations.  Returns the result of
 * register_pernet_subsys().
 */
int __init proc_net_init(void)
{
        int err;

        proc_symlink("net", NULL, "self/net");
        err = register_pernet_subsys(&proc_net_ns_ops);

        return err;
}


































































































    1 
    1 








    1 




    1 










    1 















    1 
    1 















    1 











    1 













    1 



































    1 




    1 

































































    1 






















    1 


    1 
    1 






























    1 



































































































































































































































    1 




























    1 










    1 










































    1 

    1 
    1 
    1 

























    1 




    1 
    1 







































   22 





























   23 


   24 
   22 
   23 








































































































































































    1 


























    1 




















































































    1 





















































































    1 



    1 





    1 















    1 
















    1 






    1 

























































































































































































































































































    1 






    1 




















































































































































































































    1 














































































































































































































































































































































































































































































































































































































































































    1 







































    1 





















    1 




    1 
    1 



















    1 

























    1 


    1 








    1 
    1 




























































































































































































































































    1 





    1 

















    1 




    1 


























































































































































































    1 


































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *        Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *        Jay Schulist <jschlst@samba.org>
 *        Alexei Starovoitov <ast@plumgrid.com>
 *        Daniel Borkmann <dborkman@redhat.com>
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <uapi/linux/btf.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/random.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/objtool.h>
#include <linux/overflow.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
#include <linux/bpf_verifier.h>
#include <linux/nodemask.h>
#include <linux/nospec.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <linux/execmem.h>

#include <asm/barrier.h>
#include <asm/unaligned.h>

/* Registers
 *
 * Shorthand for the numbered BPF registers. Every macro expands to an
 * element of an array called 'regs', so it is only usable where such an
 * array is in scope.
 */
#define BPF_R0        regs[BPF_REG_0]
#define BPF_R1        regs[BPF_REG_1]
#define BPF_R2        regs[BPF_REG_2]
#define BPF_R3        regs[BPF_REG_3]
#define BPF_R4        regs[BPF_REG_4]
#define BPF_R5        regs[BPF_REG_5]
#define BPF_R6        regs[BPF_REG_6]
#define BPF_R7        regs[BPF_REG_7]
#define BPF_R8        regs[BPF_REG_8]
#define BPF_R9        regs[BPF_REG_9]
#define BPF_R10        regs[BPF_REG_10]

/* Named registers
 *
 * DST and SRC select the register named by the dst_reg / src_reg field of
 * the instruction that 'insn' points to; OFF and IMM expand to that
 * instruction's off and imm fields. Both 'regs' and 'insn' must be in
 * scope at the point of use.
 */
#define DST        regs[insn->dst_reg]
#define SRC        regs[insn->src_reg]
#define FP        regs[BPF_REG_FP]
#define AX        regs[BPF_REG_AX]
#define ARG1        regs[BPF_REG_ARG1]
#define CTX        regs[BPF_REG_CTX]
#define OFF        insn->off
#define IMM        insn->imm

/* File-scope bpf_mem_alloc instance and its companion flag. Neither is
 * initialised or read in this part of the file.
 * NOTE(review): presumably bpf_global_ma_set records that bpf_global_ma
 * has been set up - confirm against the code that initialises it.
 */
struct bpf_mem_alloc bpf_global_ma;
bool bpf_global_ma_set;

/* Not a fast path.
 *
 * Translate the special offset @k into a pointer inside the skb buffer:
 * offsets from SKF_NET_OFF upwards are relative to the network header,
 * offsets from SKF_LL_OFF up to SKF_NET_OFF are relative to the MAC
 * header. For @k below SKF_LL_OFF no base is selected and the bounds
 * check below is applied to a NULL pointer.
 *
 * Returns the pointer when the @size bytes starting there lie between
 * skb->head and the skb tail pointer, and NULL otherwise (including when
 * a MAC-relative offset is requested but no MAC header was set).
 *
 * Exported for the bpf jit load helper.
 */
void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
{
        u8 *addr;

        if (k >= SKF_NET_OFF) {
                addr = skb_network_header(skb) + k - SKF_NET_OFF;
        } else if (k < SKF_LL_OFF) {
                addr = NULL;
        } else {
                if (unlikely(!skb_mac_header_was_set(skb)))
                        return NULL;
                addr = skb_mac_header(skb) + k - SKF_LL_OFF;
        }

        if (addr < skb->head || addr + size > skb_tail_pointer(skb))
                return NULL;

        return addr;
}

/* Tell BPF programs that include vmlinux.h the kernel's PAGE_SIZE: the
 * enumerator __PAGE_SIZE carries the value of the PAGE_SIZE macro.
 */
enum page_size_enum {
        __PAGE_SIZE = PAGE_SIZE
};

/* Allocate a zeroed bpf_prog of at least @size bytes (rounded up to whole
 * pages) together with its bpf_prog_aux and its per-CPU 'active' counter,
 * but without the per-CPU stats area.
 *
 * @gfp_extra_flags is OR-ed into GFP_KERNEL for each of the allocations.
 *
 * Returns the new program, or NULL if any allocation fails; whatever had
 * been allocated up to that point is released again.
 */
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
        gfp_t zeroed_gfp = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
        struct bpf_prog_aux *aux;
        struct bpf_prog *fp;

        size = round_up(size, __PAGE_SIZE);
        fp = __vmalloc(size, zeroed_gfp);
        if (!fp)
                return NULL;

        aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
        if (!aux)
                goto err_free_prog;

        fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
        if (!fp->active)
                goto err_free_aux;

        /* Link program and aux data to each other and record the JIT wishes */
        fp->pages = size / PAGE_SIZE;
        fp->aux = aux;
        fp->aux->prog = fp;
        fp->jit_requested = ebpf_jit_enabled();
        fp->blinding_requested = bpf_jit_blinding_enabled(fp);
#ifdef CONFIG_CGROUP_BPF
        aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
#endif

        INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
#ifdef CONFIG_FINEIBT
        INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
#endif
        mutex_init(&fp->aux->used_maps_mutex);
        mutex_init(&fp->aux->dst_mutex);

        return fp;

err_free_aux:
        kfree(aux);
err_free_prog:
        vfree(fp);
        return NULL;
}

/* Same as bpf_prog_alloc_no_stats(), but additionally allocates the
 * per-CPU stats area and initialises the u64_stats sync point of every
 * possible CPU.
 *
 * Returns the new program, or NULL on allocation failure after freeing
 * what bpf_prog_alloc_no_stats() had set up.
 */
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
        gfp_t stats_gfp = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
        struct bpf_prog *prog;
        int cpu;

        prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
        if (!prog)
                return NULL;

        prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, stats_gfp);
        if (!prog->stats) {
                free_percpu(prog->active);
                kfree(prog->aux);
                vfree(prog);
                return NULL;
        }

        for_each_possible_cpu(cpu)
                u64_stats_init(&per_cpu_ptr(prog->stats, cpu)->syncp);

        return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);

int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
{
        if (!prog->aux->nr_linfo || !prog->jit_requested)
                return 0;

        prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
                                          sizeof(*prog->aux->jited_linfo),
                                          bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN));
        if (!prog->aux->jited_linfo)
                return -ENOMEM;

        return 0;
}

void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
{
        if (prog->aux->jited_linfo &&
            (!prog->jited || !prog->aux->jited_linfo[0])) {
                kvfree(prog->aux->jited_linfo);
                prog->aux->jited_linfo = NULL;
        }

        kfree(prog->aux->kfunc_tab);
        prog->aux->kfunc_tab = NULL;
}

/* The jit engine is responsible to provide an array
 * for insn_off to the jited_off mapping (insn_to_jit_off).
 *
 * The idx to this array is the insn_off.  Hence, the insn_off
 * here is relative to the prog itself instead of the main prog.
 * This array has one entry for each xlated bpf insn.
 *
 * jited_off is the byte off to the end of the jited insn.
 *
 * Hence, with
 * insn_start:
 *      The first bpf insn off of the prog.  The insn off
 *      here is relative to the main prog.
 *      e.g. if prog is a subprog, insn_start > 0
 * linfo_idx:
 *      The prog's idx to prog->aux->linfo and jited_linfo
 *
 * jited_linfo[linfo_idx] = prog->bpf_func
 *
 * For i > linfo_idx,
 *
 * jited_linfo[i] = prog->bpf_func +
 *        insn_to_jit_off[linfo[i].insn_off - insn_start - 1]
 */
void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
                               const u32 *insn_to_jit_off)
{
        u32 linfo_idx, insn_start, insn_end, nr_linfo, i;
        const struct bpf_line_info *linfo;
        void **jited_linfo;

        if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt)
                /* Userspace did not provide linfo */
                return;

        linfo_idx = prog->aux->linfo_idx;
        linfo = &prog->aux->linfo[linfo_idx];
        insn_start = linfo[0].insn_off;
        insn_end = insn_start + prog->len;

        jited_linfo = &prog->aux->jited_linfo[linfo_idx];
        jited_linfo[0] = prog->bpf_func;

        nr_linfo = prog->aux->nr_linfo - linfo_idx;

        for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++)
                /* The verifier ensures that linfo[i].insn_off is
                 * strictly increasing
                 */
                jited_linfo[i] = prog->bpf_func +
                        insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
}

/* Make sure the program can hold @size bytes (rounded up to whole pages).
 *
 * If @fp_old already spans enough pages it is returned unchanged.
 * Otherwise a new zeroed area is allocated, the old pages are copied
 * over and the old area is freed; aux, stats and active are handed over
 * to the new program instead of being freed with the old one.
 *
 * Returns the program to use from now on, or NULL if the new allocation
 * failed, in which case @fp_old has not been touched.
 */
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
                                  gfp_t gfp_extra_flags)
{
        gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
        struct bpf_prog *fp_new;
        u32 nr_pages;

        size = round_up(size, PAGE_SIZE);
        nr_pages = size / PAGE_SIZE;
        if (fp_old->pages >= nr_pages)
                return fp_old;

        fp_new = __vmalloc(size, gfp_flags);
        if (!fp_new)
                return NULL;

        memcpy(fp_new, fp_old, fp_old->pages * PAGE_SIZE);
        fp_new->pages = nr_pages;
        fp_new->aux->prog = fp_new;

        /* aux, stats and active live on in fp_new; detach them from
         * fp_old so that freeing it does not release them.
         */
        fp_old->aux = NULL;
        fp_old->stats = NULL;
        fp_old->active = NULL;
        __bpf_prog_free(fp_old);

        return fp_new;
}

void __bpf_prog_free(struct bpf_prog *fp)
{
        if (fp->aux) {
                mutex_destroy(&fp->aux->used_maps_mutex);
                mutex_destroy(&fp->aux->dst_mutex);
                kfree(fp->aux->poke_tab);
                kfree(fp->aux);
        }
        free_percpu(fp->stats);
        free_percpu(fp->active);
        vfree(fp);
}

/* Compute the program tag: a SHA-1 digest over a copy of the instruction
 * stream in which the imm fields of map-referencing ld_imm64 pairs are
 * zeroed. The first sizeof(fp->tag) bytes of the digest are stored in
 * fp->tag.
 *
 * Returns 0 on success or -ENOMEM if the scratch buffer cannot be
 * allocated.
 */
int bpf_prog_calc_tag(struct bpf_prog *fp)
{
        /* Offset of the trailing 64-bit length field within one SHA-1 block */
        const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64);
        u32 raw_size = bpf_prog_tag_scratch_size(fp);
        u32 digest[SHA1_DIGEST_WORDS];
        u32 ws[SHA1_WORKSPACE_WORDS];
        u32 i, bsize, psize, blocks;
        struct bpf_insn *dst;
        bool was_ld_map;
        u8 *raw, *todo;
        __be32 *result;
        __be64 *bits;

        raw = vmalloc(raw_size);
        if (!raw)
                return -ENOMEM;

        sha1_init(digest);
        memset(ws, 0, sizeof(ws));

        /* We need to take out the map fd for the digest calculation
         * since they are unstable from user space side.
         */
        dst = (void *)raw;
        for (i = 0, was_ld_map = false; i < fp->len; i++) {
                dst[i] = fp->insnsi[i];
                if (!was_ld_map &&
                    dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
                    (dst[i].src_reg == BPF_PSEUDO_MAP_FD ||
                     dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {
                        /* First slot of a map-referencing ld_imm64 */
                        was_ld_map = true;
                        dst[i].imm = 0;
                } else if (was_ld_map &&
                           dst[i].code == 0 &&
                           dst[i].dst_reg == 0 &&
                           dst[i].src_reg == 0 &&
                           dst[i].off == 0) {
                        /* Slot right after it: blank its imm as well */
                        was_ld_map = false;
                        dst[i].imm = 0;
                } else {
                        was_ld_map = false;
                }
        }

        /* Zero everything behind the instructions and append the 0x80
         * byte; psize now counts the instructions plus that byte.
         */
        psize = bpf_prog_insn_size(fp);
        memset(&raw[psize], 0, raw_size - psize);
        raw[psize++] = 0x80;

        /* The big-endian bit length goes into the last 8 bytes of the final
         * block. If the block holding the 0x80 byte has no room left for
         * it, one more block is hashed and the length is put at its end.
         */
        bsize  = round_up(psize, SHA1_BLOCK_SIZE);
        blocks = bsize / SHA1_BLOCK_SIZE;
        todo   = raw;
        if (bsize - psize >= sizeof(__be64)) {
                bits = (__be64 *)(todo + bsize - sizeof(__be64));
        } else {
                bits = (__be64 *)(todo + bsize + bits_offset);
                blocks++;
        }
        /* psize - 1 is the instruction size in bytes; << 3 converts to bits */
        *bits = cpu_to_be64((psize - 1) << 3);

        while (blocks--) {
                sha1_transform(digest, todo, ws);
                todo += SHA1_BLOCK_SIZE;
        }

        /* Convert the digest words to big-endian in place, then copy the
         * leading sizeof(fp->tag) bytes into the tag.
         */
        result = (__force __be32 *)digest;
        for (i = 0; i < SHA1_DIGEST_WORDS; i++)
                result[i] = cpu_to_be32(digest[i]);
        memcpy(fp->tag, result, sizeof(fp->tag));

        vfree(raw);
        return 0;
}

/* Adjust the imm-encoded relative target of @insn, located at index
 * @curr, for a patch at @pos whose end moved from @end_old to @end_new.
 *
 * The target is shifted by (end_new - end_old) when the instruction sits
 * before the patch and points at or beyond its old end, and shifted back
 * when it sits at or beyond the new end and points before it.
 *
 * Returns -ERANGE if the resulting value does not fit in 32 bits, 0
 * otherwise. With @probe_pass set the instruction is not modified.
 */
static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
                                s32 end_new, s32 curr, const bool probe_pass)
{
        s32 delta = end_new - end_old;
        s64 imm = insn->imm;
        /* Index the instruction refers to, computed in 64 bits */
        s64 target = curr + imm + 1;
        const bool crosses_forward = curr < pos && target >= end_old;
        const bool crosses_backward = curr >= end_new && target < end_new;

        if (crosses_forward)
                imm += delta;
        else if (crosses_backward)
                imm -= delta;

        if (imm < S32_MIN || imm > S32_MAX)
                return -ERANGE;

        if (!probe_pass)
                insn->imm = imm;

        return 0;
}

/* Adjust the relative jump target of @insn, located at index @curr, for
 * a patch at @pos whose end moved from @end_old to @end_new.
 *
 * For BPF_JMP32 | BPF_JA the target lives in the 32-bit imm field, for
 * every other instruction handed in here it lives in the 16-bit off
 * field; the range check follows the field that is used.
 *
 * Returns -ERANGE if the adjusted target does not fit its field, 0
 * otherwise. With @probe_pass set the instruction is not modified.
 */
static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
                                s32 end_new, s32 curr, const bool probe_pass)
{
        const bool wide_jump = insn->code == (BPF_JMP32 | BPF_JA);
        const s64 off_min = wide_jump ? S32_MIN : S16_MIN;
        const s64 off_max = wide_jump ? S32_MAX : S16_MAX;
        s32 delta = end_new - end_old;
        s64 off = wide_jump ? insn->imm : insn->off;

        if (curr < pos && curr + off + 1 >= end_old)
                off += delta;
        else if (curr >= end_new && curr + off + 1 < end_new)
                off -= delta;

        if (off < off_min || off > off_max)
                return -ERANGE;

        if (probe_pass)
                return 0;

        if (wide_jump)
                insn->imm = off;
        else
                insn->off = off;

        return 0;
}

/* Fix up the relative targets of jumps, BPF_PSEUDO_CALL calls and the
 * instructions recognised by bpf_pseudo_func() after the instructions
 * [pos, end_old) have been replaced by [pos, end_new).
 *
 * With @probe_pass set nothing is written: the still unpatched image is
 * only checked for targets that would no longer fit their field.
 *
 * Returns 0 on success, or the error reported by bpf_adj_delta_to_imm()
 * or bpf_adj_delta_to_off() (-ERANGE).
 */
static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
                            s32 end_new, const bool probe_pass)
{
        /* In the probing pass prog->len is still the old length, so the
         * bound is extended by the size change: 'i' counts in the new
         * numbering (see the skip below).
         */
        u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0);
        struct bpf_insn *insn = prog->insnsi;
        int ret = 0;

        for (i = 0; i < insn_cnt; i++, insn++) {
                u8 code;

                /* In the probing pass we still operate on the original,
                 * unpatched image in order to check overflows before we
                 * do any other adjustments. Therefore skip the patchlet.
                 */
                if (probe_pass && i == pos) {
                        i = end_new;
                        insn = prog->insnsi + end_old;
                }
                /* These carry their relative target in imm */
                if (bpf_pseudo_func(insn)) {
                        ret = bpf_adj_delta_to_imm(insn, pos, end_old,
                                                   end_new, i, probe_pass);
                        if (ret)
                                return ret;
                        continue;
                }
                code = insn->code;
                /* Only jump-class instructions other than BPF_EXIT matter */
                if ((BPF_CLASS(code) != BPF_JMP &&
                     BPF_CLASS(code) != BPF_JMP32) ||
                    BPF_OP(code) == BPF_EXIT)
                        continue;
                /* Adjust offset of jmps if we cross patch boundaries. */
                if (BPF_OP(code) == BPF_CALL) {
                        /* Calls other than BPF_PSEUDO_CALL are left alone */
                        if (insn->src_reg != BPF_PSEUDO_CALL)
                                continue;
                        ret = bpf_adj_delta_to_imm(insn, pos, end_old,
                                                   end_new, i, probe_pass);
                } else {
                        ret = bpf_adj_delta_to_off(insn, pos, end_old,
                                                   end_new, i, probe_pass);
                }
                if (ret)
                        break;
        }

        return ret;
}

/* Shift the line-info records that follow a patched instruction.
 *
 * Every record whose insn_off is greater than @off gets @delta added.
 * The scan stops at the first such record and shifts it and everything
 * after it. Nothing happens when there is no line info or @delta is 0.
 */
static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta)
{
        struct bpf_line_info *linfo = prog->aux->linfo;
        u32 nr_linfo = prog->aux->nr_linfo;
        u32 idx = 0;

        if (!nr_linfo || !delta)
                return;

        /* Skip the records located at or before the patched insn */
        while (idx < nr_linfo && off >= linfo[idx].insn_off)
                idx++;

        /* Push all remaining records by delta */
        for (; idx < nr_linfo; idx++)
                linfo[idx].insn_off += delta;
}

/* Replace the single instruction at index @off with the @len instructions
 * found in @patch, growing the program if necessary and fixing up branch
 * targets and line info.
 *
 * Returns the program to use from now on (which may be a reallocated
 * copy of @prog), ERR_PTR() of the error found by the probing pass, or
 * ERR_PTR(-ENOMEM) if the program could not be grown.
 */
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
                                       const struct bpf_insn *patch, u32 len)
{
        /* insn_delta: number of instructions the image grows by */
        u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
        const u32 cnt_max = S16_MAX;
        struct bpf_prog *prog_adj;
        int err;

        /* Since our patchlet doesn't expand the image, we're done. */
        if (insn_delta == 0) {
                memcpy(prog->insnsi + off, patch, sizeof(*patch));
                return prog;
        }

        insn_adj_cnt = prog->len + insn_delta;

        /* Reject anything that would potentially let the insn->off
         * target overflow when we have excessive program expansions.
         * We need to probe here before we do any reallocation where
         * we afterwards may not fail anymore.
         */
        if (insn_adj_cnt > cnt_max &&
            (err = bpf_adj_branches(prog, off, off + 1, off + len, true)))
                return ERR_PTR(err);

        /* Several new instructions need to be inserted. Make room
         * for them. Likely, there's no need for a new allocation as
         * last page could have large enough tailroom.
         */
        prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
                                    GFP_USER);
        if (!prog_adj)
                return ERR_PTR(-ENOMEM);

        prog_adj->len = insn_adj_cnt;

        /* Patching happens in 3 steps:
         *
         * 1) Move over tail of insnsi from next instruction onwards,
         *    so we can patch the single target insn with one or more
         *    new ones (patching is always from 1 to n insns, n > 0).
         * 2) Inject new instructions at the target location.
         * 3) Adjust branch offsets if necessary.
         */
        /* Number of instructions that follow the patched one */
        insn_rest = insn_adj_cnt - off - len;

        memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
                sizeof(*patch) * insn_rest);
        memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);

        /* We are guaranteed to not fail at this point, otherwise
         * the ship has sailed to reverse to the original state. An
         * overflow cannot happen at this point.
         */
        BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false));

        bpf_adj_linfo(prog_adj, off, insn_delta);

        return prog_adj;
}

/* Remove @cnt instructions starting at index @off: the tail of the
 * program is moved down, prog->len is reduced and branch targets are
 * adjusted.
 *
 * Returns the result of WARN_ON_ONCE() applied to the branch adjustment,
 * i.e. non-zero (with a one-time warning) only if that adjustment failed.
 */
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
{
        struct bpf_insn *hole = prog->insnsi + off;
        u32 nr_tail = prog->len - off - cnt;

        /* Branch offsets can't overflow when program is shrinking, no need
         * to call bpf_adj_branches(..., true) here
         */
        memmove(hole, hole + cnt, sizeof(struct bpf_insn) * nr_tail);
        prog->len -= cnt;

        return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false));
}

/* Call bpf_prog_kallsyms_del() on each of the first real_func_cnt entries
 * of fp->aux->func[].
 */
static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
{
        int idx = 0;

        while (idx < fp->aux->real_func_cnt) {
                bpf_prog_kallsyms_del(fp->aux->func[idx]);
                idx++;
        }
}

/* Drop the kallsyms entries of all subprograms listed in fp->aux->func[]
 * first, then the entry of @fp itself.
 */
void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
{
        bpf_prog_kallsyms_del_subprogs(fp);
        bpf_prog_kallsyms_del(fp);
}

#ifdef CONFIG_BPF_JIT
/* All BPF JIT sysctl knobs here.
 *
 * bpf_jit_enable and bpf_jit_kallsyms take their initial value from
 * IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); the remaining knobs have no
 * initializer in this file section.
 */
int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_harden   __read_mostly;
long bpf_jit_limit   __read_mostly;
long bpf_jit_limit_max __read_mostly;

static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
{
        WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));

        prog->aux->ksym.start = (unsigned long) prog->bpf_func;
        prog->aux->ksym.end   = prog->aux->ksym.start + prog->jited_len;
}

static void
bpf_prog_ksym_set_name(struct bpf_prog *prog)
{
        char *sym = prog->aux->ksym.name;
        const char *end = sym + KSYM_NAME_LEN;
        const struct btf_type *type;
        const char *func_name;

        BUILD_BUG_ON(sizeof("bpf_prog_") +
                     sizeof(prog->tag) * 2 +
                     /* name has been null terminated.
                      * We should need +1 for the '_' preceding
                      * the name.  However, the null character
                      * is double counted between the name and the
                      * sizeof("bpf_prog_") above, so we omit
                      * the +1 here.
                      */
                     sizeof(prog->aux->name) > KSYM_NAME_LEN);

        sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
        sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));

        /* prog->aux->name will be ignored if full btf name is available */
        if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) {
                type = btf_type_by_id(prog->aux->btf,
                                      prog->aux->func_info[prog->aux->func_idx].type_id);
                func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
                snprintf(sym, (size_t)(end - sym), "_%s", func_name);
                return;
        }

        if (prog->aux->name[0])
                snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
        else
                *sym = 0;
}

static unsigned long bpf_get_ksym_start(struct latch_tree_node *n)
{
        return container_of(n, struct bpf_ksym, tnode)->start;
}

/* Ordering callback of the ksym tree: @a sorts before @b when its symbol
 * starts at a lower address.
 */
static __always_inline bool bpf_tree_less(struct latch_tree_node *a,
                                          struct latch_tree_node *b)
{
        unsigned long start_a = bpf_get_ksym_start(a);
        unsigned long start_b = bpf_get_ksym_start(b);

        return start_a < start_b;
}

/* Lookup callback of the ksym tree: compare the address passed as @key
 * with the range of the symbol embedding @n.
 *
 * Returns 0 if the address lies in [start, end], -1 if it is below start
 * and 1 if it is above end.
 */
static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
{
        const struct bpf_ksym *ksym = container_of(n, struct bpf_ksym, tnode);
        unsigned long addr = (unsigned long)key;

        /* The end address itself counts as a match so that a return
         * address is still attributed to the program when the final
         * instruction is a call for a program part of the stack trace.
         */
        if (addr >= ksym->start && addr <= ksym->end)
                return 0;

        return addr < ksym->start ? -1 : 1;
}

/* Callbacks handed to the latch tree helpers for bpf_tree: ordering by
 * symbol start address and address-in-range lookup.
 */
static const struct latch_tree_ops bpf_tree_ops = {
        .less        = bpf_tree_less,
        .comp        = bpf_tree_comp,
};

/* bpf_lock is held by bpf_ksym_add() and bpf_ksym_del() while they modify
 * the two containers below: bpf_kallsyms, the list of all registered
 * ksyms, and bpf_tree, the latch tree used to look a ksym up by address.
 */
static DEFINE_SPINLOCK(bpf_lock);
static LIST_HEAD(bpf_kallsyms);
static struct latch_tree_root bpf_tree __cacheline_aligned;

/* Register @ksym: append it to the bpf_kallsyms list and insert it into
 * bpf_tree, both under bpf_lock. Warns once if @ksym's list node is not
 * empty, i.e. it appears to be registered already.
 */
void bpf_ksym_add(struct bpf_ksym *ksym)
{
        spin_lock_bh(&bpf_lock);
        WARN_ON_ONCE(!list_empty(&ksym->lnode));
        list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms);
        latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
        spin_unlock_bh(&bpf_lock);
}

/* Remove @ksym from bpf_tree and from the bpf_kallsyms list. A ksym whose
 * list node is empty is left alone. bpf_ksym_del() calls this with
 * bpf_lock held.
 */
static void __bpf_ksym_del(struct bpf_ksym *ksym)
{
        if (!list_empty(&ksym->lnode)) {
                latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
                list_del_rcu(&ksym->lnode);
        }
}

/* Unregister @ksym: takes bpf_lock around __bpf_ksym_del(), which does
 * nothing if @ksym's list node is empty.
 */
void bpf_ksym_del(struct bpf_ksym *ksym)
{
        spin_lock_bh(&bpf_lock);
        __bpf_ksym_del(ksym);
        spin_unlock_bh(&bpf_lock);
}

static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
{
        return fp->jited && !bpf_prog_was_classic(fp);
}

/* Register the kallsyms entry of @fp.
 *
 * Nothing is done unless the program is a kallsyms candidate and its
 * token passes the CAP_BPF check. With CONFIG_FINEIBT and cfi_mode set to
 * CFI_FINEIBT, a second "__cfi_"-prefixed symbol covering the 16 bytes
 * in front of bpf_func is registered as well.
 */
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
        if (!bpf_prog_kallsyms_candidate(fp))
                return;
        if (!bpf_token_capable(fp->aux->token, CAP_BPF))
                return;

        bpf_prog_ksym_set_addr(fp);
        bpf_prog_ksym_set_name(fp);
        fp->aux->ksym.prog = true;

        bpf_ksym_add(&fp->aux->ksym);

#ifdef CONFIG_FINEIBT
        /*
         * When FineIBT, code in the __cfi_foo() symbols can get executed
         * and hence unwinder needs help.
         */
        if (cfi_mode == CFI_FINEIBT) {
                struct bpf_ksym *prefix = &fp->aux->ksym_prefix;

                snprintf(prefix->name, KSYM_NAME_LEN,
                         "__cfi_%s", fp->aux->ksym.name);

                prefix->start = (unsigned long) fp->bpf_func - 16;
                prefix->end   = (unsigned long) fp->bpf_func;

                bpf_ksym_add(prefix);
        }
#endif
}

/* Unregister the kallsyms entry of @fp, plus the prefix symbol when
 * CONFIG_FINEIBT is set and cfi_mode is CFI_FINEIBT. Programs that are
 * not kallsyms candidates are ignored.
 */
void bpf_prog_kallsyms_del(struct bpf_prog *fp)
{
        if (bpf_prog_kallsyms_candidate(fp)) {
                bpf_ksym_del(&fp->aux->ksym);
#ifdef CONFIG_FINEIBT
                if (cfi_mode == CFI_FINEIBT)
                        bpf_ksym_del(&fp->aux->ksym_prefix);
#endif
        }
}

/* Look @addr up in bpf_tree. Returns the ksym whose range matches
 * according to bpf_tree_comp(), or NULL if there is none.
 */
static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
{
        struct latch_tree_node *node;

        node = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
        if (!node)
                return NULL;

        return container_of(node, struct bpf_ksym, tnode);
}

/* Resolve @addr to a BPF symbol.
 *
 * On a match the symbol name is copied into @sym (at most KSYM_NAME_LEN
 * bytes), *@size receives the length of the symbol's address range and
 * *@off the distance of @addr from its start; @size and @off may be NULL.
 *
 * Returns @sym on a match and NULL otherwise. The lookup runs inside an
 * RCU read-side critical section.
 */
const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
                                 unsigned long *off, char *sym)
{
        unsigned long sym_start, sym_end;
        struct bpf_ksym *ksym;
        char *found = NULL;

        rcu_read_lock();
        ksym = bpf_ksym_find(addr);
        if (!ksym)
                goto unlock;

        sym_start = ksym->start;
        sym_end = ksym->end;

        strscpy(sym, ksym->name, KSYM_NAME_LEN);
        found = sym;

        if (size)
                *size = sym_end - sym_start;
        if (off)
                *off  = addr - sym_start;
unlock:
        rcu_read_unlock();

        return found;
}

/* Tell whether @addr falls into the range of any registered BPF ksym.
 * The lookup is done inside an RCU read-side critical section.
 */
bool is_bpf_text_address(unsigned long addr)
{
        bool found;

        rcu_read_lock();
        found = !!bpf_ksym_find(addr);
        rcu_read_unlock();

        return found;
}

/* Return the program owning the ksym that covers @addr, or NULL when no
 * ksym matches or the matching ksym is not flagged as a program.
 * No RCU read lock is taken here; search_bpf_extables() takes it around
 * the call.
 */
struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
{
        struct bpf_ksym *ksym = bpf_ksym_find(addr);

        if (!ksym || !ksym->prog)
                return NULL;

        return container_of(ksym, struct bpf_prog_aux, ksym)->prog;
}

/* Find the exception table entry for @addr in the BPF program covering
 * that address.
 *
 * Returns NULL when no program covers @addr or the program has no
 * exception entries; otherwise whatever search_extable() finds in the
 * program's table.
 */
const struct exception_table_entry *search_bpf_extables(unsigned long addr)
{
        const struct exception_table_entry *entry = NULL;
        struct bpf_prog *prog;

        rcu_read_lock();
        prog = bpf_prog_ksym_find(addr);
        if (prog && prog->aux->num_exentries)
                entry = search_extable(prog->aux->extable,
                                       prog->aux->num_exentries, addr);
        rcu_read_unlock();

        return entry;
}

/* Fetch the @symnum-th entry (counting from 0) of the bpf_kallsyms list.
 *
 * On success the name is copied into @sym, the start address is stored
 * in *@value and BPF_SYM_ELF_TYPE in *@type.
 *
 * Returns 0 on success, or -ERANGE when bpf_jit_kallsyms_enabled() is
 * false or the list has no entry at that position.
 */
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                    char *sym)
{
        struct bpf_ksym *ksym;
        unsigned int pos = 0;
        int err = -ERANGE;

        if (!bpf_jit_kallsyms_enabled())
                return err;

        rcu_read_lock();
        list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) {
                if (pos++ == symnum) {
                        strscpy(sym, ksym->name, KSYM_NAME_LEN);

                        *value = ksym->start;
                        *type  = BPF_SYM_ELF_TYPE;

                        err = 0;
                        break;
                }
        }
        rcu_read_unlock();

        return err;
}

int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
                                struct bpf_jit_poke_descriptor *poke)
{
        struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
        static const u32 poke_tab_max = 1024;
        u32 slot = prog->aux->size_poke_tab;
        u32 size = slot + 1;

        if (size > poke_tab_max)
                return -ENOSPC;
        if (poke->tailcall_target || poke->tailcall_target_stable ||
            poke->tailcall_bypass || poke->adj_off || poke->bypass_addr)
                return -EINVAL;

        switch (poke->reason) {
        case BPF_POKE_REASON_TAIL_CALL:
                if (!poke->tail_call.map)
                        return -EINVAL;
                break;
        default:
                return -EINVAL;
        }

        tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL);
        if (!tab)
                return -ENOMEM;

        memcpy(&tab[slot], poke, sizeof(*poke));
        prog->aux->size_poke_tab = size;
        prog->aux->poke_tab = tab;

        return slot;
}

/*
 * BPF program pack allocator.
 *
 * Most BPF programs are pretty small. Allocating a whole page for each
 * program is sometimes a waste. Many small BPF programs also add pressure
 * to the instruction TLB. To solve this issue, we introduce a BPF program
 * pack allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on
 * x86) to host BPF programs.
 */
/* Pack memory is handed out in chunks of BPF_PROG_CHUNK_SIZE
 * (1 << BPF_PROG_CHUNK_SHIFT = 64) bytes. BPF_PROG_CHUNK_MASK clears the
 * low bits of an address, i.e. rounds it down to a chunk boundary.
 */
#define BPF_PROG_CHUNK_SHIFT        6
#define BPF_PROG_CHUNK_SIZE        (1 << BPF_PROG_CHUNK_SHIFT)
#define BPF_PROG_CHUNK_MASK        (~(BPF_PROG_CHUNK_SIZE - 1))

/* Bookkeeping for one pack of BPF_PROG_PACK_SIZE bytes that is shared by
 * several allocations.
 */
struct bpf_prog_pack {
        /* Link on pack_list. */
        struct list_head list;
        /* Start of the region obtained from bpf_jit_alloc_exec(). */
        void *ptr;
        /* One bit per BPF_PROG_CHUNK_SIZE chunk; set on allocation,
         * cleared on free.
         */
        unsigned long bitmap[];
};

/* Fill callback that writes @size zero bytes starting at @area. */
void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
{
        memset(area, 0, size);
}

/* Number of BPF_PROG_CHUNK_SIZE chunks (bitmap bits) needed to hold @size
 * bytes, rounding up.
 */
#define BPF_PROG_SIZE_TO_NBITS(size)        (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)

/* pack_mutex is held by bpf_prog_pack_alloc() and bpf_prog_pack_free()
 * while they walk or modify pack_list, the list of all live packs.
 */
static DEFINE_MUTEX(pack_mutex);
static LIST_HEAD(pack_list);

/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
 * CONFIG_MMU=n. Use PAGE_SIZE in these cases.
 */
#ifdef PMD_SIZE
/* PMD_SIZE is really big for some archs. It doesn't make sense to
 * reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to
 * 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be
 * greater than or equal to 2MB.
 */
#define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes())
#else
#define BPF_PROG_PACK_SIZE PAGE_SIZE
#endif

/* Number of chunks, and therefore bitmap bits, in one pack. */
#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)

/* Create a fresh, empty pack and append it to pack_list.
 *
 * The pack's memory is obtained from bpf_jit_alloc_exec(), filled through
 * @bpf_fill_ill_insns and then switched with set_memory_rox(). The
 * allocation bitmap starts out all clear.
 *
 * Return: the new pack, or NULL if any allocation or the permission
 * change fails (everything obtained so far is released again).
 */
static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
        struct bpf_prog_pack *new_pack;

        new_pack = kzalloc(struct_size(new_pack, bitmap,
                                       BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
                           GFP_KERNEL);
        if (!new_pack)
                return NULL;

        new_pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE);
        if (!new_pack->ptr)
                goto err_release;

        /* The fill has to happen before the region is made read-only. */
        bpf_fill_ill_insns(new_pack->ptr, BPF_PROG_PACK_SIZE);
        bitmap_zero(new_pack->bitmap, BPF_PROG_CHUNK_COUNT);

        set_vm_flush_reset_perms(new_pack->ptr);
        if (set_memory_rox((unsigned long)new_pack->ptr,
                           BPF_PROG_PACK_SIZE / PAGE_SIZE))
                goto err_release;

        list_add_tail(&new_pack->list, &pack_list);
        return new_pack;

err_release:
        bpf_jit_free_exec(new_pack->ptr);
        kfree(new_pack);
        return NULL;
}

/* Allocate @size bytes of read-only+executable memory for a BPF program.
 *
 * Requests larger than BPF_PROG_PACK_SIZE bypass the packs: they get a
 * dedicated, page-rounded region that is filled via @bpf_fill_ill_insns
 * and made ROX. All other requests are carved out of the first pack on
 * pack_list with a large enough run of free chunks; if none has room a
 * new pack is created and the allocation starts at its first chunk.
 *
 * Return: address of the allocation, or NULL on failure. pack_mutex is
 * held for the duration of the call.
 */
void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
        struct bpf_prog_pack *pack;
        unsigned int nbits;
        unsigned long pos;
        bool have_room = false;
        void *ptr = NULL;

        mutex_lock(&pack_mutex);

        if (size > BPF_PROG_PACK_SIZE) {
                size = round_up(size, PAGE_SIZE);
                ptr = bpf_jit_alloc_exec(size);
                if (!ptr)
                        goto unlock;

                bpf_fill_ill_insns(ptr, size);
                set_vm_flush_reset_perms(ptr);
                if (set_memory_rox((unsigned long)ptr, size / PAGE_SIZE)) {
                        bpf_jit_free_exec(ptr);
                        ptr = NULL;
                }
                goto unlock;
        }

        nbits = BPF_PROG_SIZE_TO_NBITS(size);

        list_for_each_entry(pack, &pack_list, list) {
                pos = bitmap_find_next_zero_area(pack->bitmap,
                                                 BPF_PROG_CHUNK_COUNT, 0,
                                                 nbits, 0);
                if (pos < BPF_PROG_CHUNK_COUNT) {
                        have_room = true;
                        break;
                }
        }

        if (!have_room) {
                pack = alloc_new_pack(bpf_fill_ill_insns);
                if (!pack)
                        goto unlock;

                /* A brand-new pack is empty, so start at chunk 0. */
                pos = 0;
        }

        bitmap_set(pack->bitmap, pos, nbits);
        ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);

unlock:
        mutex_unlock(&pack_mutex);
        return ptr;
}

/* Release an allocation obtained from bpf_prog_pack_alloc().
 *
 * @size must be the size the allocation was made with: values above
 * BPF_PROG_PACK_SIZE identify a dedicated region that is freed directly.
 * Otherwise the owning pack is located by address, the text is
 * invalidated via bpf_arch_text_invalidate(), the chunks are marked free,
 * and the pack itself is torn down once no chunk is in use any more.
 *
 * Warns once (and does nothing else) if @ptr lies in no known pack.
 */
void bpf_prog_pack_free(void *ptr, u32 size)
{
        struct bpf_prog_pack *owner = NULL, *cur;
        unsigned int nbits;
        unsigned long pos;

        mutex_lock(&pack_mutex);

        if (size > BPF_PROG_PACK_SIZE) {
                bpf_jit_free_exec(ptr);
                goto unlock;
        }

        list_for_each_entry(cur, &pack_list, list) {
                if (ptr < cur->ptr || (cur->ptr + BPF_PROG_PACK_SIZE) <= ptr)
                        continue;

                owner = cur;
                break;
        }

        if (WARN_ONCE(!owner, "bpf_prog_pack bug\n"))
                goto unlock;

        nbits = BPF_PROG_SIZE_TO_NBITS(size);
        pos = ((unsigned long)ptr - (unsigned long)owner->ptr) >> BPF_PROG_CHUNK_SHIFT;

        WARN_ONCE(bpf_arch_text_invalidate(ptr, size),
                  "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");

        bitmap_clear(owner->bitmap, pos, nbits);

        /* A zero run spanning every chunk, found at bit 0, means the pack
         * is completely unused.
         */
        if (bitmap_find_next_zero_area(owner->bitmap, BPF_PROG_CHUNK_COUNT, 0,
                                       BPF_PROG_CHUNK_COUNT, 0) == 0) {
                list_del(&owner->list);
                bpf_jit_free_exec(owner->ptr);
                kfree(owner);
        }

unlock:
        mutex_unlock(&pack_mutex);
}

/* Running total of bytes added by bpf_jit_charge_modmem() and not yet
 * subtracted again by bpf_jit_uncharge_modmem().
 */
static atomic_long_t bpf_jit_current;

/* Size of the address range from which the JIT memory limit is derived:
 * the span MODULES_VADDR..MODULES_END when MODULES_VADDR is defined, the
 * span VMALLOC_START..VMALLOC_END otherwise.
 *
 * Declared __weak so that an arch JIT compiler with a custom, dedicated
 * BPF backend memory area (or one for which neither range applies) can
 * override it.
 */
u64 __weak bpf_jit_alloc_exec_limit(void)
{
#ifdef MODULES_VADDR
        return MODULES_END - MODULES_VADDR;
#else
        return VMALLOC_END - VMALLOC_START;
#endif
}

/* Initialise bpf_jit_limit_max and bpf_jit_limit at pure_initcall time.
 *
 * bpf_jit_limit_max takes the value of bpf_jit_alloc_exec_limit(), which
 * serves only as a heuristic. bpf_jit_limit is half of that, rounded up
 * to a multiple of PAGE_SIZE and capped at LONG_MAX.
 */
static int __init bpf_jit_charge_init(void)
{
        u64 half_limit;

        bpf_jit_limit_max = bpf_jit_alloc_exec_limit();

        half_limit = round_up(bpf_jit_limit_max >> 1, PAGE_SIZE);
        bpf_jit_limit = min_t(u64, half_limit, LONG_MAX);

        return 0;
}
pure_initcall(bpf_jit_charge_init);

/* Account @size bytes against the global JIT memory budget.
 *
 * The charge is always added first. If the new total exceeds
 * bpf_jit_limit and the caller is not bpf_capable(), the charge is
 * rolled back and the request is refused.
 *
 * Return: 0 if the charge stands, -EPERM if it was refused.
 */
int bpf_jit_charge_modmem(u32 size)
{
        if (atomic_long_add_return(size, &bpf_jit_current) <= READ_ONCE(bpf_jit_limit))
                return 0;

        /* Over the limit: only bpf_capable() callers may keep the charge. */
        if (bpf_capable())
                return 0;

        atomic_long_sub(size, &bpf_jit_current);
        return -EPERM;
}

/* Give back @size bytes previously accounted by bpf_jit_charge_modmem(). */
void bpf_jit_uncharge_modmem(u32 size)
{
        atomic_long_sub(size, &bpf_jit_current);
}

/* Default (weak) allocator for JIT memory: @size bytes from execmem using
 * the EXECMEM_BPF type. Returns whatever execmem_alloc() returns.
 */
void *__weak bpf_jit_alloc_exec(unsigned long size)
{
        return execmem_alloc(EXECMEM_BPF, size);
}

/* Default (weak) counterpart of bpf_jit_alloc_exec(): hands @addr back to
 * execmem.
 */
void __weak bpf_jit_free_exec(void *addr)
{
        execmem_free(addr);
}

/* Allocate a JIT image with its own page-granular backing memory.
 *
 * @proglen:            size of the JITed program in bytes
 * @image_ptr:          receives the address where the program is to be placed
 * @alignment:          required alignment of *@image_ptr; must be a power of
 *                      two no larger than BPF_IMAGE_ALIGNMENT (warns once
 *                      otherwise)
 * @bpf_fill_ill_insns: callback used to pre-fill the whole allocation
 *
 * The allocation is charged via bpf_jit_charge_modmem() and the program
 * start is moved to a random, @alignment-aligned offset inside the header's
 * image area.
 *
 * Return: the binary header, or NULL if charging or allocation fails.
 */
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
                     unsigned int alignment,
                     bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
        struct bpf_binary_header *hdr;
        size_t payload = proglen + sizeof(*hdr);
        u32 size, hole, start;

        WARN_ON_ONCE(!is_power_of_2(alignment) ||
                     alignment > BPF_IMAGE_ALIGNMENT);

        /* Most of BPF filters are really small, but if some of them
         * fill a page, allow at least 128 extra bytes to insert a
         * random section of illegal instructions.
         */
        size = round_up(payload + 128, PAGE_SIZE);

        if (bpf_jit_charge_modmem(size))
                return NULL;

        hdr = bpf_jit_alloc_exec(size);
        if (!hdr) {
                bpf_jit_uncharge_modmem(size);
                return NULL;
        }

        /* Fill space with illegal/arch-dep instructions. */
        bpf_fill_ill_insns(hdr, size);
        hdr->size = size;

        /* Pick a random, aligned start offset within the slack space. */
        hole = min_t(unsigned int, size - payload,
                     PAGE_SIZE - sizeof(*hdr));
        start = get_random_u32_below(hole) & ~(alignment - 1);

        /* Leave a random number of instructions before BPF code. */
        *image_ptr = &hdr->image[start];

        return hdr;
}

/* Free a header obtained from bpf_jit_binary_alloc() and drop its charge. */
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
        /* Read the size first: hdr is not accessible once it is freed. */
        u32 size = hdr->size;

        bpf_jit_free_exec(hdr);
        bpf_jit_uncharge_modmem(size);
}

/* Allocate a JIT binary from the bpf_prog_pack allocator.
 *
 * The memory handed out by the pack allocator is RO+X, so the JIT engine
 * cannot write to it directly. A RW buffer of the same size is therefore
 * allocated at the same time. The JIT engine should calculate offsets
 * based on the RO address but write the JITed program into the RW buffer.
 * Once it is done it calls bpf_jit_binary_pack_finalize(), which copies
 * the program into the RO memory.
 *
 * @proglen:            size of the JITed program in bytes
 * @image_ptr:          receives the program start inside the RO region
 * @alignment:          required alignment of the program start; must be a
 *                      power of two no larger than BPF_IMAGE_ALIGNMENT
 * @rw_header:          receives the RW shadow header
 * @rw_image:           receives the program start inside the RW buffer
 * @bpf_fill_ill_insns: callback used to pre-fill the buffers
 *
 * Return: the RO header, or NULL on failure. Only the RW header has its
 * ->size field written here.
 */
struct bpf_binary_header *
bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
                          unsigned int alignment,
                          struct bpf_binary_header **rw_header,
                          u8 **rw_image,
                          bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
        struct bpf_binary_header *ro_header;
        size_t payload = proglen + sizeof(*ro_header);
        u32 size, hole, start;

        WARN_ON_ONCE(!is_power_of_2(alignment) ||
                     alignment > BPF_IMAGE_ALIGNMENT);

        /* add 16 bytes for a random section of illegal instructions */
        size = round_up(payload + 16, BPF_PROG_CHUNK_SIZE);

        if (bpf_jit_charge_modmem(size))
                return NULL;

        ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
        if (!ro_header)
                goto err_uncharge;

        *rw_header = kvmalloc(size, GFP_KERNEL);
        if (!*rw_header)
                goto err_free_ro;

        /* Fill space with illegal/arch-dep instructions. */
        bpf_fill_ill_insns(*rw_header, size);
        (*rw_header)->size = size;

        hole = min_t(unsigned int, size - payload,
                     BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
        start = get_random_u32_below(hole) & ~(alignment - 1);

        /* Both views use the same offset so addresses stay in sync. */
        *image_ptr = &ro_header->image[start];
        *rw_image = &(*rw_header)->image[start];

        return ro_header;

err_free_ro:
        bpf_prog_pack_free(ro_header, size);
err_uncharge:
        bpf_jit_uncharge_modmem(size);
        return NULL;
}

/* Publish the JITed text: copy rw_header->size bytes from @rw_header to
 * its final location @ro_header via bpf_arch_text_copy(), then release the
 * RW buffer.
 *
 * Return: 0 on success. If the copy reports an error, the RO allocation
 * is given back to the pack allocator and the error code is returned.
 * @rw_header is freed in both cases.
 */
int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
                                 struct bpf_binary_header *ro_header,
                                 struct bpf_binary_header *rw_header)
{
        void *copied = bpf_arch_text_copy(ro_header, rw_header,
                                          rw_header->size);

        kvfree(rw_header);

        if (!IS_ERR(copied))
                return 0;

        bpf_prog_pack_free(ro_header, ro_header->size);
        return PTR_ERR(copied);
}

/* bpf_jit_binary_pack_free is called in two different scenarios:
 *   1) when the program is freed after bpf_jit_binary_pack_finalize;
 *   2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
 * For case 2), we need to free both the RO memory and the RW buffer.
 *
 * bpf_jit_binary_pack_free requires proper ro_header->size. However,
 * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
 * must be set with either bpf_jit_binary_pack_finalize (normal path) or
 * bpf_arch_text_copy (when jit fails).
 *
 * @rw_header is handed to kvfree(). The charge of ro_header->size bytes
 * is dropped after the RO memory has been returned to the pack allocator.
 */
void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
                              struct bpf_binary_header *rw_header)
{
        /* Read the size before the RO memory is released. */
        u32 size = ro_header->size;

        bpf_prog_pack_free(ro_header, size);
        kvfree(rw_header);
        bpf_jit_uncharge_modmem(size);
}

/* Derive the binary header address of a pack-allocated program by rounding
 * fp->bpf_func down to a BPF_PROG_CHUNK_SIZE boundary.
 */
struct bpf_binary_header *
bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
{
        unsigned long func_addr = (unsigned long)fp->bpf_func;

        return (void *)(func_addr & BPF_PROG_CHUNK_MASK);
}

/* Derive the binary header address of a program by rounding fp->bpf_func
 * down to a page boundary.
 */
static inline struct bpf_binary_header *
bpf_jit_binary_hdr(const struct bpf_prog *fp)
{
        unsigned long func_addr = (unsigned long)fp->bpf_func;

        return (void *)(func_addr & PAGE_MASK);
}

/* Generic teardown of a BPF program: if it was JITed, release its binary
 * image, then free the program itself.
 *
 * This symbol is only overridden by archs that have different
 * requirements than the usual eBPF JITs, f.e. when they only
 * implement cBPF JIT, do not set images read-only, etc.
 */
void __weak bpf_jit_free(struct bpf_prog *fp)
{
        if (fp->jited) {
                bpf_jit_binary_free(bpf_jit_binary_hdr(fp));
                WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
        }

        bpf_prog_unlock_free(fp);
}

/* Resolve the target address of the call instruction @insn.
 *
 * @prog:            program containing @insn
 * @insn:            the call instruction
 * @extra_pass:      true once subprogram addresses are available in
 *                   prog->aux->func
 * @func_addr:       receives the resolved address
 * @func_addr_fixed: receives false for BPF_PSEUDO_CALL (bpf-to-bpf) calls,
 *                   true for everything else
 *
 * Return: 0 on success, -EINVAL if a bpf-to-bpf call refers to a
 * subprogram that is out of range during the extra pass, or the error
 * from bpf_get_kfunc_addr().
 */
int bpf_jit_get_func_addr(const struct bpf_prog *prog,
                          const struct bpf_insn *insn, bool extra_pass,
                          u64 *func_addr, bool *func_addr_fixed)
{
        bool fixed = insn->src_reg != BPF_PSEUDO_CALL;
        s16 off = insn->off;
        s32 imm = insn->imm;
        u8 *addr;
        int err;

        *func_addr_fixed = fixed;

        if (!fixed) {
                /* Place-holder address till the last pass has collected
                 * all addresses for JITed subprograms in which case we
                 * can pick them up from prog->aux.
                 */
                if (!extra_pass) {
                        addr = NULL;
                } else {
                        if (!prog->aux->func ||
                            off < 0 || off >= prog->aux->real_func_cnt)
                                return -EINVAL;

                        addr = (u8 *)prog->aux->func[off]->bpf_func;
                }
        } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
                   bpf_jit_supports_far_kfunc_call()) {
                err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr);
                if (err)
                        return err;
        } else {
                /* Address of a BPF helper call. Since part of the core
                 * kernel, it's always at a fixed location. __bpf_call_base
                 * and the helper with imm relative to it are both in core
                 * kernel.
                 */
                addr = (u8 *)__bpf_call_base + imm;
        }

        *func_addr = (unsigned long)addr;
        return 0;
}

/* Produce the blinded replacement for a single instruction.
 *
 * For the immediate-carrying opcodes handled below, the constant is loaded
 * into BPF_REG_AX XORed with a fresh random value, XORed back inside AX,
 * and the original operation is then emitted in its register form with AX
 * as the source operand.
 *
 * @from:      instruction to examine
 * @aux:       copy of both halves of a BPF_LD | BPF_IMM | BPF_DW pair, used
 *             when blinding either half
 * @to_buff:   output buffer for the replacement sequence
 * @emit_zext: when set, an explicit zero-extension of AX is emitted while
 *             blinding the second half of a ld_imm64
 *
 * Return: number of instructions written to @to_buff; 0 means @from was
 * not rewritten.
 */
static int bpf_jit_blind_insn(const struct bpf_insn *from,
                              const struct bpf_insn *aux,
                              struct bpf_insn *to_buff,
                              bool emit_zext)
{
        struct bpf_insn *to = to_buff;
        u32 imm_rnd = get_random_u32();
        s16 off;

        BUILD_BUG_ON(BPF_REG_AX  + 1 != MAX_BPF_JIT_REG);
        BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);

        /* Constraints on AX register:
         *
         * AX register is inaccessible from user space. It is mapped in
         * all JITs, and used here for constant blinding rewrites. It is
         * typically "stateless" meaning its contents are only valid within
         * the executed instruction, but not across several instructions.
         * There are a few exceptions however which are further detailed
         * below.
         *
         * Constant blinding is only used by JITs, not in the interpreter.
         * The interpreter uses AX in some occasions as a local temporary
         * register e.g. in DIV or MOD instructions.
         *
         * In restricted circumstances, the verifier can also use the AX
         * register for rewrites as long as they do not interfere with
         * the above cases!
         */
        if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX)
                goto out;

        /* A move of constant 0 is replaced by XORing the register with
         * itself instead of going through AX.
         */
        if (from->imm == 0 &&
            (from->code == (BPF_ALU   | BPF_MOV | BPF_K) ||
             from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
                *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg);
                goto out;
        }

        switch (from->code) {
        case BPF_ALU | BPF_ADD | BPF_K:
        case BPF_ALU | BPF_SUB | BPF_K:
        case BPF_ALU | BPF_AND | BPF_K:
        case BPF_ALU | BPF_OR  | BPF_K:
        case BPF_ALU | BPF_XOR | BPF_K:
        case BPF_ALU | BPF_MUL | BPF_K:
        case BPF_ALU | BPF_MOV | BPF_K:
        case BPF_ALU | BPF_DIV | BPF_K:
        case BPF_ALU | BPF_MOD | BPF_K:
                *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
                break;

        case BPF_ALU64 | BPF_ADD | BPF_K:
        case BPF_ALU64 | BPF_SUB | BPF_K:
        case BPF_ALU64 | BPF_AND | BPF_K:
        case BPF_ALU64 | BPF_OR  | BPF_K:
        case BPF_ALU64 | BPF_XOR | BPF_K:
        case BPF_ALU64 | BPF_MUL | BPF_K:
        case BPF_ALU64 | BPF_MOV | BPF_K:
        case BPF_ALU64 | BPF_DIV | BPF_K:
        case BPF_ALU64 | BPF_MOD | BPF_K:
                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
                break;

        case BPF_JMP | BPF_JEQ  | BPF_K:
        case BPF_JMP | BPF_JNE  | BPF_K:
        case BPF_JMP | BPF_JGT  | BPF_K:
        case BPF_JMP | BPF_JLT  | BPF_K:
        case BPF_JMP | BPF_JGE  | BPF_K:
        case BPF_JMP | BPF_JLE  | BPF_K:
        case BPF_JMP | BPF_JSGT | BPF_K:
        case BPF_JMP | BPF_JSLT | BPF_K:
        case BPF_JMP | BPF_JSGE | BPF_K:
        case BPF_JMP | BPF_JSLE | BPF_K:
        case BPF_JMP | BPF_JSET | BPF_K:
                /* Accommodate for extra offset in case of a backjump. */
                off = from->off;
                if (off < 0)
                        off -= 2;
                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
                break;

        case BPF_JMP32 | BPF_JEQ  | BPF_K:
        case BPF_JMP32 | BPF_JNE  | BPF_K:
        case BPF_JMP32 | BPF_JGT  | BPF_K:
        case BPF_JMP32 | BPF_JLT  | BPF_K:
        case BPF_JMP32 | BPF_JGE  | BPF_K:
        case BPF_JMP32 | BPF_JLE  | BPF_K:
        case BPF_JMP32 | BPF_JSGT | BPF_K:
        case BPF_JMP32 | BPF_JSLT | BPF_K:
        case BPF_JMP32 | BPF_JSGE | BPF_K:
        case BPF_JMP32 | BPF_JSLE | BPF_K:
        case BPF_JMP32 | BPF_JSET | BPF_K:
                /* Accommodate for extra offset in case of a backjump. */
                off = from->off;
                if (off < 0)
                        off -= 2;
                *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX,
                                      off);
                break;

        case BPF_LD | BPF_IMM | BPF_DW:
                /* Part 1: build the upper 32 bits from aux[1].imm in AX,
                 * shift them into place and move the result to the
                 * destination register.
                 */
                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
                *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX);
                break;
        case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
                /* OR the lower 32 bits (aux[0].imm) into the destination. */
                *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
                *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                if (emit_zext)
                        *to++ = BPF_ZEXT_REG(BPF_REG_AX);
                *to++ = BPF_ALU64_REG(BPF_OR,  aux[0].dst_reg, BPF_REG_AX);
                break;

        case BPF_ST | BPF_MEM | BPF_DW:
        case BPF_ST | BPF_MEM | BPF_W:
        case BPF_ST | BPF_MEM | BPF_H:
        case BPF_ST | BPF_MEM | BPF_B:
                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
                break;
        }
out:
        /* Number of instructions emitted; 0 if nothing was rewritten. */
        return to - to_buff;
}

/* Make a byte-for-byte copy of @fp_other in freshly vmalloc'ed memory.
 *
 * The allocation uses GFP_KERNEL | __GFP_ZERO combined with
 * @gfp_extra_flags and spans fp_other->pages pages.
 *
 * Return: the clone, or NULL if the allocation fails.
 */
static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
                                              gfp_t gfp_extra_flags)
{
        unsigned long bytes = fp_other->pages * PAGE_SIZE;
        struct bpf_prog *clone;

        clone = __vmalloc(bytes, GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
        if (!clone)
                return NULL;

        /* aux->prog still points to the fp_other one, so
         * when promoting the clone to the real program,
         * this still needs to be adapted.
         */
        memcpy(clone, fp_other, bytes);

        return clone;
}

/* Free a program copy without touching the state it shares with the copy
 * that is kept: the aux, stats and active pointers are cleared before the
 * program is handed to __bpf_prog_free().
 */
static void bpf_prog_clone_free(struct bpf_prog *fp)
{
        /* aux was stolen by the other clone, so we cannot free
         * it from this path! It will be freed eventually by the
         * other program on release.
         *
         * At this point, we don't need a deferred release since
         * clone is guaranteed to not be locked.
         */
        fp->aux = NULL;
        fp->stats = NULL;
        fp->active = NULL;
        __bpf_prog_free(fp);
}

/* Keep @fp and discard @fp_other, where one of the two is a clone of the
 * other: @fp's aux is pointed back at @fp and @fp_other is freed via
 * bpf_prog_clone_free().
 */
void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
{
        /* We have to repoint aux->prog to self, as we don't
         * know whether fp here is the clone or the original.
         */
        fp->aux->prog = fp;
        bpf_prog_clone_free(fp_other);
}

/* Return a copy of @prog in which constants have been blinded.
 *
 * Every instruction of a clone of @prog is run through
 * bpf_jit_blind_insn(); whenever that yields a replacement sequence, it
 * is patched in with bpf_patch_insn_single().
 *
 * Return:
 *   @prog itself if blinding was not requested or it is already blinded;
 *   ERR_PTR(-ENOMEM) if the clone cannot be allocated;
 *   the error pointer from bpf_patch_insn_single() if patching fails (the
 *   clone is released and @prog's aux is pointed back at @prog);
 *   otherwise the clone, with ->blinded set.
 */
struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
{
        struct bpf_insn insn_buff[16], aux[2];
        struct bpf_prog *clone, *tmp;
        int insn_delta, insn_cnt;
        struct bpf_insn *insn;
        int i, rewritten;

        if (!prog->blinding_requested || prog->blinded)
                return prog;

        clone = bpf_prog_clone_create(prog, GFP_USER);
        if (!clone)
                return ERR_PTR(-ENOMEM);

        insn_cnt = clone->len;
        insn = clone->insnsi;

        for (i = 0; i < insn_cnt; i++, insn++) {
                if (bpf_pseudo_func(insn)) {
                        /* ld_imm64 with an address of bpf subprog is not
                         * a user controlled constant. Don't randomize it,
                         * since it will conflict with jit_subprogs() logic.
                         */
                        insn++;
                        i++;
                        continue;
                }

                /* We temporarily need to hold the original ld64 insn
                 * so that we can still access the first part in the
                 * second blinding run.
                 */
                if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) &&
                    insn[1].code == 0)
                        memcpy(aux, insn, sizeof(aux));

                rewritten = bpf_jit_blind_insn(insn, aux, insn_buff,
                                                clone->aux->verifier_zext);
                if (!rewritten)
                        continue;

                tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
                if (IS_ERR(tmp)) {
                        /* Patching may have repointed aux->prog during
                         * realloc from the original one, so we need to
                         * fix it up here on error.
                         */
                        bpf_jit_prog_release_other(prog, clone);
                        return tmp;
                }

                /* Patching may return a different program; continue with
                 * the returned one. One instruction was replaced by
                 * 'rewritten' instructions, so the program grew by
                 * rewritten - 1.
                 */
                clone = tmp;
                insn_delta = rewritten - 1;

                /* Walk new program and skip insns we just inserted. */
                insn = clone->insnsi + i + insn_delta;
                insn_cnt += insn_delta;
                i        += insn_delta;
        }

        clone->blinded = 1;
        return clone;
}
#endif /* CONFIG_BPF_JIT */

/* Base function for offset calculation. Needs to go into .text section,
 * therefore keeping it non-static as well; will also be used by JITs
 * anyway later on, so do not let the compiler omit it. This also needs
 * to go into kallsyms for correlation from e.g. bpftool, so naming
 * must not change.
 *
 * The body is a stub that ignores its arguments and returns 0;
 * bpf_jit_get_func_addr() adds a call instruction's imm to this function's
 * address to obtain the helper address.
 */
noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
        return 0;
}
EXPORT_SYMBOL_GPL(__bpf_call_base);

/* All UAPI available opcodes.
 *
 * X-macro list: each opcode is written as INSN_2(class, op) or
 * INSN_3(class, op, variant). Users pass their own two- and three-argument
 * macros to turn every entry into a table initializer (see
 * bpf_opcode_in_insntable() and the interpreter jump table).
 */
#define BPF_INSN_MAP(INSN_2, INSN_3)                \
        /* 32 bit ALU operations. */                \
        /*   Register based. */                        \
        INSN_3(ALU, ADD,  X),                        \
        INSN_3(ALU, SUB,  X),                        \
        INSN_3(ALU, AND,  X),                        \
        INSN_3(ALU, OR,   X),                        \
        INSN_3(ALU, LSH,  X),                        \
        INSN_3(ALU, RSH,  X),                        \
        INSN_3(ALU, XOR,  X),                        \
        INSN_3(ALU, MUL,  X),                        \
        INSN_3(ALU, MOV,  X),                        \
        INSN_3(ALU, ARSH, X),                        \
        INSN_3(ALU, DIV,  X),                        \
        INSN_3(ALU, MOD,  X),                        \
        INSN_2(ALU, NEG),                        \
        INSN_3(ALU, END, TO_BE),                \
        INSN_3(ALU, END, TO_LE),                \
        /*   Immediate based. */                \
        INSN_3(ALU, ADD,  K),                        \
        INSN_3(ALU, SUB,  K),                        \
        INSN_3(ALU, AND,  K),                        \
        INSN_3(ALU, OR,   K),                        \
        INSN_3(ALU, LSH,  K),                        \
        INSN_3(ALU, RSH,  K),                        \
        INSN_3(ALU, XOR,  K),                        \
        INSN_3(ALU, MUL,  K),                        \
        INSN_3(ALU, MOV,  K),                        \
        INSN_3(ALU, ARSH, K),                        \
        INSN_3(ALU, DIV,  K),                        \
        INSN_3(ALU, MOD,  K),                        \
        /* 64 bit ALU operations. */                \
        /*   Register based. */                        \
        INSN_3(ALU64, ADD,  X),                        \
        INSN_3(ALU64, SUB,  X),                        \
        INSN_3(ALU64, AND,  X),                        \
        INSN_3(ALU64, OR,   X),                        \
        INSN_3(ALU64, LSH,  X),                        \
        INSN_3(ALU64, RSH,  X),                        \
        INSN_3(ALU64, XOR,  X),                        \
        INSN_3(ALU64, MUL,  X),                        \
        INSN_3(ALU64, MOV,  X),                        \
        INSN_3(ALU64, ARSH, X),                        \
        INSN_3(ALU64, DIV,  X),                        \
        INSN_3(ALU64, MOD,  X),                        \
        INSN_2(ALU64, NEG),                        \
        INSN_3(ALU64, END, TO_LE),                \
        /*   Immediate based. */                \
        INSN_3(ALU64, ADD,  K),                        \
        INSN_3(ALU64, SUB,  K),                        \
        INSN_3(ALU64, AND,  K),                        \
        INSN_3(ALU64, OR,   K),                        \
        INSN_3(ALU64, LSH,  K),                        \
        INSN_3(ALU64, RSH,  K),                        \
        INSN_3(ALU64, XOR,  K),                        \
        INSN_3(ALU64, MUL,  K),                        \
        INSN_3(ALU64, MOV,  K),                        \
        INSN_3(ALU64, ARSH, K),                        \
        INSN_3(ALU64, DIV,  K),                        \
        INSN_3(ALU64, MOD,  K),                        \
        /* Call instruction. */                        \
        INSN_2(JMP, CALL),                        \
        /* Exit instruction. */                        \
        INSN_2(JMP, EXIT),                        \
        /* 32-bit Jump instructions. */                \
        /*   Register based. */                        \
        INSN_3(JMP32, JEQ,  X),                        \
        INSN_3(JMP32, JNE,  X),                        \
        INSN_3(JMP32, JGT,  X),                        \
        INSN_3(JMP32, JLT,  X),                        \
        INSN_3(JMP32, JGE,  X),                        \
        INSN_3(JMP32, JLE,  X),                        \
        INSN_3(JMP32, JSGT, X),                        \
        INSN_3(JMP32, JSLT, X),                        \
        INSN_3(JMP32, JSGE, X),                        \
        INSN_3(JMP32, JSLE, X),                        \
        INSN_3(JMP32, JSET, X),                        \
        /*   Immediate based. */                \
        INSN_3(JMP32, JEQ,  K),                        \
        INSN_3(JMP32, JNE,  K),                        \
        INSN_3(JMP32, JGT,  K),                        \
        INSN_3(JMP32, JLT,  K),                        \
        INSN_3(JMP32, JGE,  K),                        \
        INSN_3(JMP32, JLE,  K),                        \
        INSN_3(JMP32, JSGT, K),                        \
        INSN_3(JMP32, JSLT, K),                        \
        INSN_3(JMP32, JSGE, K),                        \
        INSN_3(JMP32, JSLE, K),                        \
        INSN_3(JMP32, JSET, K),                        \
        /* Jump instructions. */                \
        /*   Register based. */                        \
        INSN_3(JMP, JEQ,  X),                        \
        INSN_3(JMP, JNE,  X),                        \
        INSN_3(JMP, JGT,  X),                        \
        INSN_3(JMP, JLT,  X),                        \
        INSN_3(JMP, JGE,  X),                        \
        INSN_3(JMP, JLE,  X),                        \
        INSN_3(JMP, JSGT, X),                        \
        INSN_3(JMP, JSLT, X),                        \
        INSN_3(JMP, JSGE, X),                        \
        INSN_3(JMP, JSLE, X),                        \
        INSN_3(JMP, JSET, X),                        \
        /*   Immediate based. */                \
        INSN_3(JMP, JEQ,  K),                        \
        INSN_3(JMP, JNE,  K),                        \
        INSN_3(JMP, JGT,  K),                        \
        INSN_3(JMP, JLT,  K),                        \
        INSN_3(JMP, JGE,  K),                        \
        INSN_3(JMP, JLE,  K),                        \
        INSN_3(JMP, JSGT, K),                        \
        INSN_3(JMP, JSLT, K),                        \
        INSN_3(JMP, JSGE, K),                        \
        INSN_3(JMP, JSLE, K),                        \
        INSN_3(JMP, JSET, K),                        \
        INSN_2(JMP, JA),                        \
        INSN_2(JMP32, JA),                        \
        /* Store instructions. */                \
        /*   Register based. */                        \
        INSN_3(STX, MEM,  B),                        \
        INSN_3(STX, MEM,  H),                        \
        INSN_3(STX, MEM,  W),                        \
        INSN_3(STX, MEM,  DW),                        \
        INSN_3(STX, ATOMIC, W),                        \
        INSN_3(STX, ATOMIC, DW),                \
        /*   Immediate based. */                \
        INSN_3(ST, MEM, B),                        \
        INSN_3(ST, MEM, H),                        \
        INSN_3(ST, MEM, W),                        \
        INSN_3(ST, MEM, DW),                        \
        /* Load instructions. */                \
        /*   Register based. */                        \
        INSN_3(LDX, MEM, B),                        \
        INSN_3(LDX, MEM, H),                        \
        INSN_3(LDX, MEM, W),                        \
        INSN_3(LDX, MEM, DW),                        \
        INSN_3(LDX, MEMSX, B),                        \
        INSN_3(LDX, MEMSX, H),                        \
        INSN_3(LDX, MEMSX, W),                        \
        /*   Immediate based. */                \
        INSN_3(LD, IMM, DW)

/* Report whether the 8-bit opcode @code is marked valid in the public
 * instruction table.
 *
 * The table has one bool per possible opcode value. Every slot starts out
 * false; later designated initializers override that default for each opcode
 * enumerated by BPF_INSN_MAP() and for the extra opcodes listed explicitly
 * below.
 *
 * Return: true when @code has an entry in the table, false otherwise.
 */
bool bpf_opcode_in_insntable(u8 code)
{
        /* Expand each BPF_INSN_MAP() entry into "[opcode] = true". */
#define BPF_INSN_2_TBL(x, y)    [BPF_##x | BPF_##y] = true
#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true
        static const bool public_insntable[256] = {
                [0 ... 255] = false,
                /* Now overwrite non-defaults ... */
                BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
                /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */
                [BPF_LD | BPF_ABS | BPF_B] = true,
                [BPF_LD | BPF_ABS | BPF_H] = true,
                [BPF_LD | BPF_ABS | BPF_W] = true,
                [BPF_LD | BPF_IND | BPF_B] = true,
                [BPF_LD | BPF_IND | BPF_H] = true,
                [BPF_LD | BPF_IND | BPF_W] = true,
                [BPF_JMP | BPF_JCOND] = true,
        };
#undef BPF_INSN_3_TBL
#undef BPF_INSN_2_TBL
        /* code is a u8, so it can never index past the 256-entry table. */
        return public_insntable[code];
}

#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/**
 *        ___bpf_prog_run - run eBPF program on a given context
 *        @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
 *        @insn: is the array of eBPF instructions
 *
 * Decode and execute eBPF instructions.
 *
 * Each instruction is dispatched through a 256-entry table of label
 * addresses indexed by insn->code. Every handler ends by advancing @insn
 * and dispatching again, so the function only returns from the JMP_EXIT
 * handler. An opcode without a handler lands on default_label, which logs
 * the opcode and hits BUG_ON().
 *
 * NOTE(review): DST, SRC, OFF, IMM, AX and BPF_R0..BPF_R5 are macros defined
 * outside this block; presumably they resolve to slots of @regs and fields
 * of the current @insn - confirm against their definitions.
 *
 * Return: whatever value is in %BPF_R0 at program exit
 */
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
{
        /* Expand each BPF_INSN_MAP() entry into "[opcode] = &&LABEL", where
         * LABEL is the handler label named after the opcode components.
         */
#define BPF_INSN_2_LBL(x, y)    [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
        static const void * const jumptable[256] __annotate_jump_table = {
                [0 ... 255] = &&default_label,
                /* Now overwrite non-defaults ... */
                BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
                /* Non-UAPI available opcodes. */
                [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
                [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
                [BPF_ST  | BPF_NOSPEC] = &&ST_NOSPEC,
                [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
                [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
                [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
                [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
                [BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B,
                [BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H,
                [BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W,
        };
#undef BPF_INSN_3_LBL
#undef BPF_INSN_2_LBL
        /* Number of tail calls taken so far in this invocation; the
         * JMP_TAIL_CALL handler refuses further tail calls once it reaches
         * MAX_TAIL_CALL_CNT.
         */
        u32 tail_call_cnt = 0;

        /* Step to the next instruction and dispatch it. CONT and CONT_JMP
         * expand to the same code here.
         */
#define CONT         ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })

select_insn:
        goto *jumptable[insn->code];

        /* Explicitly mask the register-based shift amounts with 63 or 31
         * to avoid undefined behavior. Normally this won't affect the
         * generated code, for example, in case of native 64 bit archs such
         * as x86-64 or arm64, the compiler is optimizing the AND away for
         * the interpreter. In case of JITs, each of the JIT backends compiles
         * the BPF shift operations to machine instructions which produce
         * implementation-defined results in such a case; the resulting
         * contents of the register may be arbitrary, but program behaviour
         * as a whole remains defined. In other words, in case of JIT backends,
         * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation.
         */
        /* ALU (shifts) */
        /* SHT() and ALU() each emit four handlers per operation: 64-bit and
         * 32-bit, with a register (_X) or an immediate (_K) source. The
         * 32-bit forms cast both operands to u32 before applying OP. Only
         * the register forms of SHT() mask the shift amount.
         */
#define SHT(OPCODE, OP)                                        \
        ALU64_##OPCODE##_X:                                \
                DST = DST OP (SRC & 63);                \
                CONT;                                        \
        ALU_##OPCODE##_X:                                \
                DST = (u32) DST OP ((u32) SRC & 31);        \
                CONT;                                        \
        ALU64_##OPCODE##_K:                                \
                DST = DST OP IMM;                        \
                CONT;                                        \
        ALU_##OPCODE##_K:                                \
                DST = (u32) DST OP (u32) IMM;                \
                CONT;
        /* ALU (rest) */
#define ALU(OPCODE, OP)                                        \
        ALU64_##OPCODE##_X:                                \
                DST = DST OP SRC;                        \
                CONT;                                        \
        ALU_##OPCODE##_X:                                \
                DST = (u32) DST OP (u32) SRC;                \
                CONT;                                        \
        ALU64_##OPCODE##_K:                                \
                DST = DST OP IMM;                        \
                CONT;                                        \
        ALU_##OPCODE##_K:                                \
                DST = (u32) DST OP (u32) IMM;                \
                CONT;
        ALU(ADD,  +)
        ALU(SUB,  -)
        ALU(AND,  &)
        ALU(OR,   |)
        ALU(XOR,  ^)
        ALU(MUL,  *)
        SHT(LSH, <<)
        SHT(RSH, >>)
#undef SHT
#undef ALU
        ALU_NEG:
                DST = (u32) -DST;
                CONT;
        ALU64_NEG:
                DST = -DST;
                CONT;
        /* Register moves. OFF selects the flavour: 0 is a plain move, any
         * other listed value first narrows SRC to a signed type of that many
         * bits so the value is sign-extended. The 32-bit form then truncates
         * the result to u32. An OFF value that is not listed leaves DST
         * unchanged.
         */
        ALU_MOV_X:
                switch (OFF) {
                case 0:
                        DST = (u32) SRC;
                        break;
                case 8:
                        DST = (u32)(s8) SRC;
                        break;
                case 16:
                        DST = (u32)(s16) SRC;
                        break;
                }
                CONT;
        ALU_MOV_K:
                DST = (u32) IMM;
                CONT;
        ALU64_MOV_X:
                switch (OFF) {
                case 0:
                        DST = SRC;
                        break;
                case 8:
                        DST = (s8) SRC;
                        break;
                case 16:
                        DST = (s16) SRC;
                        break;
                case 32:
                        DST = (s32) SRC;
                        break;
                }
                CONT;
        ALU64_MOV_K:
                DST = IMM;
                CONT;
        /* 64-bit immediate load occupying two instruction slots: insn[0].imm
         * supplies the low 32 bits and insn[1].imm the high 32 bits. The
         * explicit insn++ skips the second slot before CONT advances again.
         */
        LD_IMM_DW:
                DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
                insn++;
                CONT;
        /* Arithmetic right shifts: the value is shifted as a signed type.
         * As with SHT(), only the register forms mask the shift amount.
         */
        ALU_ARSH_X:
                DST = (u64) (u32) (((s32) DST) >> (SRC & 31));
                CONT;
        ALU_ARSH_K:
                DST = (u64) (u32) (((s32) DST) >> IMM);
                CONT;
        ALU64_ARSH_X:
                (*(s64 *) &DST) >>= (SRC & 63);
                CONT;
        ALU64_ARSH_K:
                (*(s64 *) &DST) >>= IMM;
                CONT;
        /* Modulo. OFF == 0 is the unsigned form, OFF == 1 the signed form.
         * The 64-bit signed remainder is computed as DST - (DST / SRC) * SRC.
         * The 32-bit signed remainder is taken on the absolute values and
         * negated when the dividend is negative. AX serves as scratch space.
         * None of the handlers checks for a zero divisor.
         */
        ALU64_MOD_X:
                switch (OFF) {
                case 0:
                        div64_u64_rem(DST, SRC, &AX);
                        DST = AX;
                        break;
                case 1:
                        AX = div64_s64(DST, SRC);
                        DST = DST - AX * SRC;
                        break;
                }
                CONT;
        ALU_MOD_X:
                switch (OFF) {
                case 0:
                        AX = (u32) DST;
                        /* do_div()'s return value (the remainder) is kept */
                        DST = do_div(AX, (u32) SRC);
                        break;
                case 1:
                        AX = abs((s32)DST);
                        AX = do_div(AX, abs((s32)SRC));
                        if ((s32)DST < 0)
                                DST = (u32)-AX;
                        else
                                DST = (u32)AX;
                        break;
                }
                CONT;
        ALU64_MOD_K:
                switch (OFF) {
                case 0:
                        div64_u64_rem(DST, IMM, &AX);
                        DST = AX;
                        break;
                case 1:
                        AX = div64_s64(DST, IMM);
                        DST = DST - AX * IMM;
                        break;
                }
                CONT;
        ALU_MOD_K:
                switch (OFF) {
                case 0:
                        AX = (u32) DST;
                        DST = do_div(AX, (u32) IMM);
                        break;
                case 1:
                        AX = abs((s32)DST);
                        AX = do_div(AX, abs((s32)IMM));
                        if ((s32)DST < 0)
                                DST = (u32)-AX;
                        else
                                DST = (u32)AX;
                        break;
                }
                CONT;
        /* Division. OFF == 0 is the unsigned form, OFF == 1 the signed form.
         * The 32-bit signed quotient is computed on the absolute values and
         * negated when the operands have different signs.
         */
        ALU64_DIV_X:
                switch (OFF) {
                case 0:
                        DST = div64_u64(DST, SRC);
                        break;
                case 1:
                        DST = div64_s64(DST, SRC);
                        break;
                }
                CONT;
        ALU_DIV_X:
                switch (OFF) {
                case 0:
                        AX = (u32) DST;
                        /* do_div() updates AX in place; AX is the quotient */
                        do_div(AX, (u32) SRC);
                        DST = (u32) AX;
                        break;
                case 1:
                        AX = abs((s32)DST);
                        do_div(AX, abs((s32)SRC));
                        if (((s32)DST < 0) == ((s32)SRC < 0))
                                DST = (u32)AX;
                        else
                                DST = (u32)-AX;
                        break;
                }
                CONT;
        ALU64_DIV_K:
                switch (OFF) {
                case 0:
                        DST = div64_u64(DST, IMM);
                        break;
                case 1:
                        DST = div64_s64(DST, IMM);
                        break;
                }
                CONT;
        ALU_DIV_K:
                switch (OFF) {
                case 0:
                        AX = (u32) DST;
                        do_div(AX, (u32) IMM);
                        DST = (u32) AX;
                        break;
                case 1:
                        AX = abs((s32)DST);
                        do_div(AX, abs((s32)IMM));
                        if (((s32)DST < 0) == ((s32)IMM < 0))
                                DST = (u32)AX;
                        else
                                DST = (u32)-AX;
                        break;
                }
                CONT;
        /* Byte-order conversions. IMM selects the operand width (16, 32 or
         * 64 bits); any other IMM value leaves DST unchanged.
         */
        ALU_END_TO_BE:
                switch (IMM) {
                case 16:
                        DST = (__force u16) cpu_to_be16(DST);
                        break;
                case 32:
                        DST = (__force u32) cpu_to_be32(DST);
                        break;
                case 64:
                        DST = (__force u64) cpu_to_be64(DST);
                        break;
                }
                CONT;
        ALU_END_TO_LE:
                switch (IMM) {
                case 16:
                        DST = (__force u16) cpu_to_le16(DST);
                        break;
                case 32:
                        DST = (__force u32) cpu_to_le32(DST);
                        break;
                case 64:
                        DST = (__force u64) cpu_to_le64(DST);
                        break;
                }
                CONT;
        /* Despite the label name, this handler uses the __swab*() helpers
         * rather than the cpu_to_le*() conversions used above.
         */
        ALU64_END_TO_LE:
                switch (IMM) {
                case 16:
                        DST = (__force u16) __swab16(DST);
                        break;
                case 32:
                        DST = (__force u32) __swab32(DST);
                        break;
                case 64:
                        DST = (__force u64) __swab64(DST);
                        break;
                }
                CONT;

        /* CALL */
        JMP_CALL:
                /* Function call scratches BPF_R1-BPF_R5 registers,
                 * preserves BPF_R6-BPF_R9, and stores return value
                 * into BPF_R0.
                 */
                BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
                                                       BPF_R4, BPF_R5);
                CONT;

        /* Same as JMP_CALL but relative to __bpf_call_base_args, and the
         * callee additionally receives a pointer to the instruction at
         * insn + insn->off + 1.
         */
        JMP_CALL_ARGS:
                BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
                                                            BPF_R3, BPF_R4,
                                                            BPF_R5,
                                                            insn + insn->off + 1);
                CONT;

        /* Tail call: BPF_R2 carries the map pointer and BPF_R3 the index
         * into it. On success execution continues at the first instruction
         * of the target program. If the index is out of range, the tail call
         * limit has been reached or the slot is empty, execution falls
         * through to the next instruction instead.
         */
        JMP_TAIL_CALL: {
                struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
                struct bpf_array *array = container_of(map, struct bpf_array, map);
                struct bpf_prog *prog;
                u32 index = BPF_R3;

                if (unlikely(index >= array->map.max_entries))
                        goto out;

                if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
                        goto out;

                /* The counter is bumped before the slot is read, so an empty
                 * slot still counts towards the limit.
                 */
                tail_call_cnt++;

                prog = READ_ONCE(array->ptrs[index]);
                if (!prog)
                        goto out;

                /* ARG1 at this point is guaranteed to point to CTX from
                 * the verifier side due to the fact that the tail call is
                 * handled like a helper, that is, bpf_tail_call_proto,
                 * where arg1_type is ARG_PTR_TO_CTX.
                 */
                insn = prog->insnsi;
                goto select_insn;
out:
                CONT;
        }
        /* Unconditional jumps: the displacement comes from off (JMP) or from
         * imm (JMP32). CONT then steps past the jump instruction itself.
         */
        JMP_JA:
                insn += insn->off;
                CONT;
        JMP32_JA:
                insn += insn->imm;
                CONT;
        JMP_EXIT:
                return BPF_R0;
        /* JMP */
        /* COND_JMP() emits four handlers per comparison: 64-bit (JMP) and
         * 32-bit (JMP32), each with a register (_X) or immediate (_K) right
         * hand side. SIGN is pasted in front of 64/32 to form the type both
         * operands are cast to (u64/s64/u32/s32). When the condition holds
         * the handler adds insn->off before advancing.
         */
#define COND_JMP(SIGN, OPCODE, CMP_OP)                                \
        JMP_##OPCODE##_X:                                        \
                if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) {        \
                        insn += insn->off;                        \
                        CONT_JMP;                                \
                }                                                \
                CONT;                                                \
        JMP32_##OPCODE##_X:                                        \
                if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) {        \
                        insn += insn->off;                        \
                        CONT_JMP;                                \
                }                                                \
                CONT;                                                \
        JMP_##OPCODE##_K:                                        \
                if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) {        \
                        insn += insn->off;                        \
                        CONT_JMP;                                \
                }                                                \
                CONT;                                                \
        JMP32_##OPCODE##_K:                                        \
                if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) {        \
                        insn += insn->off;                        \
                        CONT_JMP;                                \
                }                                                \
                CONT;
        COND_JMP(u, JEQ, ==)
        COND_JMP(u, JNE, !=)
        COND_JMP(u, JGT, >)
        COND_JMP(u, JLT, <)
        COND_JMP(u, JGE, >=)
        COND_JMP(u, JLE, <=)
        COND_JMP(u, JSET, &)
        COND_JMP(s, JSGT, >)
        COND_JMP(s, JSLT, <)
        COND_JMP(s, JSGE, >=)
        COND_JMP(s, JSLE, <=)
#undef COND_JMP
        /* ST, STX and LDX */
        ST_NOSPEC:
                /* Speculation barrier for mitigating Speculative Store Bypass.
                 * In case of arm64, we rely on the firmware mitigation as
                 * controlled via the ssbd kernel parameter. Whenever the
                 * mitigation is enabled, it works for all of the kernel code
                 * with no need to provide any additional instructions here.
                 * In case of x86, we use 'lfence' insn for mitigation. We
                 * reuse preexisting logic from Spectre v1 mitigation that
                 * happens to produce the required code on x86 for v4 as well.
                 */
                barrier_nospec();
                CONT;
        /* LDST() emits, per access size, a register store, an immediate
         * store, a plain load and a PROBE_MEM load. All of them address
         * memory at base register + insn->off. The PROBE_MEM form reads via
         * bpf_probe_read_kernel_common() into DST and then re-reads DST
         * through a SIZE pointer so the result is widened from SIZE to the
         * full register.
         * NOTE(review): bpf_probe_read_kernel_common() is defined elsewhere;
         * its return value is ignored here - confirm its failure behaviour.
         */
#define LDST(SIZEOP, SIZE)                                                \
        STX_MEM_##SIZEOP:                                                \
                *(SIZE *)(unsigned long) (DST + insn->off) = SRC;        \
                CONT;                                                        \
        ST_MEM_##SIZEOP:                                                \
                *(SIZE *)(unsigned long) (DST + insn->off) = IMM;        \
                CONT;                                                        \
        LDX_MEM_##SIZEOP:                                                \
                DST = *(SIZE *)(unsigned long) (SRC + insn->off);        \
                CONT;                                                        \
        LDX_PROBE_MEM_##SIZEOP:                                                \
                bpf_probe_read_kernel_common(&DST, sizeof(SIZE),        \
                              (const void *)(long) (SRC + insn->off));        \
                DST = *((SIZE *)&DST);                                        \
                CONT;

        LDST(B,   u8)
        LDST(H,  u16)
        LDST(W,  u32)
        LDST(DW, u64)
#undef LDST

        /* LDSX() mirrors the two load handlers of LDST() but is instantiated
         * with signed SIZE types, so widening into DST sign-extends.
         */
#define LDSX(SIZEOP, SIZE)                                                \
        LDX_MEMSX_##SIZEOP:                                                \
                DST = *(SIZE *)(unsigned long) (SRC + insn->off);        \
                CONT;                                                        \
        LDX_PROBE_MEMSX_##SIZEOP:                                        \
                bpf_probe_read_kernel_common(&DST, sizeof(SIZE),                \
                                      (const void *)(long) (SRC + insn->off));        \
                DST = *((SIZE *)&DST);                                        \
                CONT;

        LDSX(B,   s8)
        LDSX(H,  s16)
        LDSX(W,  s32)
#undef LDSX

        /* ATOMIC_ALU_OP() emits two switch cases per operation: the plain
         * form discards the previous memory value, the BPF_FETCH form stores
         * it back into SRC. BPF_SIZE(insn->code) == BPF_W selects the 32-bit
         * atomic_*() helpers, anything else the atomic64_*() ones.
         */
#define ATOMIC_ALU_OP(BOP, KOP)                                                \
                case BOP:                                                \
                        if (BPF_SIZE(insn->code) == BPF_W)                \
                                atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \
                                             (DST + insn->off));        \
                        else                                                \
                                atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \
                                               (DST + insn->off));        \
                        break;                                                \
                case BOP | BPF_FETCH:                                        \
                        if (BPF_SIZE(insn->code) == BPF_W)                \
                                SRC = (u32) atomic_fetch_##KOP(                \
                                        (u32) SRC,                        \
                                        (atomic_t *)(unsigned long) (DST + insn->off)); \
                        else                                                \
                                SRC = (u64) atomic64_fetch_##KOP(        \
                                        (u64) SRC,                        \
                                        (atomic64_t *)(unsigned long) (DST + insn->off)); \
                        break;

        /* Both atomic widths share one handler; IMM selects the operation. */
        STX_ATOMIC_DW:
        STX_ATOMIC_W:
                switch (IMM) {
                ATOMIC_ALU_OP(BPF_ADD, add)
                ATOMIC_ALU_OP(BPF_AND, and)
                ATOMIC_ALU_OP(BPF_OR, or)
                ATOMIC_ALU_OP(BPF_XOR, xor)
#undef ATOMIC_ALU_OP

                case BPF_XCHG:
                        if (BPF_SIZE(insn->code) == BPF_W)
                                SRC = (u32) atomic_xchg(
                                        (atomic_t *)(unsigned long) (DST + insn->off),
                                        (u32) SRC);
                        else
                                SRC = (u64) atomic64_xchg(
                                        (atomic64_t *)(unsigned long) (DST + insn->off),
                                        (u64) SRC);
                        break;
                /* Compare-and-exchange: BPF_R0 supplies the expected value
                 * and receives the helper's return value.
                 */
                case BPF_CMPXCHG:
                        if (BPF_SIZE(insn->code) == BPF_W)
                                BPF_R0 = (u32) atomic_cmpxchg(
                                        (atomic_t *)(unsigned long) (DST + insn->off),
                                        (u32) BPF_R0, (u32) SRC);
                        else
                                BPF_R0 = (u64) atomic64_cmpxchg(
                                        (atomic64_t *)(unsigned long) (DST + insn->off),
                                        (u64) BPF_R0, (u64) SRC);
                        break;

                default:
                        goto default_label;
                }
                CONT;

        default_label:
                /* If we ever reach this, we have a bug somewhere. Die hard here
                 * instead of just returning 0; we could be somewhere in a subprog,
                 * so execution could continue otherwise which we do /not/ want.
                 *
                 * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
                 */
                pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n",
                        insn->code, insn->imm);
                BUG_ON(1);
                return 0;
}

/* PROG_NAME(n) names the interpreter entry point that provides n bytes of
 * BPF stack; DEFINE_BPF_PROG_RUN(n) defines that function.
 *
 * Each generated function places an n-byte stack and a zero-initialized
 * register file on its own kernel stack frame, sets FP to the address just
 * past the end of the stack array, stores @ctx in ARG1 and then runs the
 * interpreter. The u64 result of ___bpf_prog_run() is returned as
 * unsigned int.
 *
 * NOTE(review): FP and ARG1 are macros defined outside this block;
 * presumably they select slots of the local regs[] array - confirm.
 */
#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
#define DEFINE_BPF_PROG_RUN(stack_size) \
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
{ \
        u64 stack[stack_size / sizeof(u64)]; \
        u64 regs[MAX_BPF_EXT_REG] = {}; \
\
        kmsan_unpoison_memory(stack, sizeof(stack)); \
        FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
        ARG1 = (u64) (unsigned long) ctx; \
        return ___bpf_prog_run(regs, insn); \
}

/* PROG_NAME_ARGS(n) names the interpreter entry point that takes five
 * register arguments and provides n bytes of BPF stack;
 * DEFINE_BPF_PROG_RUN_ARGS(n) defines that function.
 *
 * Unlike the DEFINE_BPF_PROG_RUN() variant, the register file is not
 * zero-initialized here: only FP and BPF_R1..BPF_R5 are assigned before the
 * interpreter starts, and the full u64 result is returned.
 *
 * NOTE(review): FP and BPF_R1..BPF_R5 are macros defined outside this block;
 * presumably they select slots of the local regs[] array - confirm.
 */
#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \
static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
                                      const struct bpf_insn *insn) \
{ \
        u64 stack[stack_size / sizeof(u64)]; \
        u64 regs[MAX_BPF_EXT_REG]; \
\
        kmsan_unpoison_memory(stack, sizeof(stack)); \
        FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
        BPF_R1 = r1; \
        BPF_R2 = r2; \
        BPF_R3 = r3; \
        BPF_R4 = r4; \
        BPF_R5 = r5; \
        return ___bpf_prog_run(regs, insn); \
}

/* EVALn(FN, a1, ..., an) applies FN to each of its n arguments in turn by
 * peeling off the first argument and handing the rest to EVAL(n-1).
 */
#define EVAL1(FN, X) FN(X)
#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)

/* Instantiate one interpreter entry point per stack size, from 32 to 512
 * bytes in steps of 32 (16 functions).
 */
EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);

/* Same set of stack sizes for the five-argument entry points. */
EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192);
EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384);
EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512);

/* PROG_NAME_LIST() turns a stack size into one array initializer element. */
#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),

/* Interpreter entry points ordered by stack size: entry i provides
 * (i + 1) * 32 bytes of stack. Users index the table with
 * round_up(stack_depth, 32) / 32 - 1.
 */
static unsigned int (*interpreters[])(const void *ctx,
                                      const struct bpf_insn *insn) = {
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST
/* Same layout for the five-argument entry points. */
#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
static __maybe_unused
u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
                           const struct bpf_insn *insn) = {
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST

#ifdef CONFIG_BPF_SYSCALL
/* Rewrite the call instruction @insn into the internal
 * BPF_JMP | BPF_CALL_ARGS form, targeting the args-style interpreter entry
 * point whose stack size covers @stack_depth.
 */
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
{
        /* A depth of zero still selects the smallest (32 byte) entry. */
        u32 depth = max_t(u32, stack_depth, 1);
        u32 slot = round_up(depth, 32) / 32 - 1;

        /* Preserve the previous imm (truncated to 16 bits) in off, which the
         * JMP_CALL_ARGS handler uses to locate the callee's instructions;
         * imm is then reused for the offset of the chosen entry point.
         */
        insn->off = (s16) insn->imm;
        insn->imm = interpreters_args[slot] - __bpf_call_base_args;
        insn->code = BPF_JMP | BPF_CALL_ARGS;
}
#endif
#else
/* Placeholder bpf_func installed when CONFIG_BPF_JIT_ALWAYS_ON is set.
 * Both arguments are ignored; the function warns once and returns 0.
 */
static unsigned int __bpf_prog_ret0_warn(const void *ctx,
                                         const struct bpf_insn *insn)
{
        /* Reaching this point means a program was run through this
         * placeholder, i.e. BPF_JIT_ALWAYS_ON did not work as intended.
         */
        WARN_ON_ONCE(1);

        return 0;
}
#endif

/* Check whether program @fp may be used together with the programs already
 * associated with @map.
 *
 * The first program checked against a map becomes its owner: its type, JIT
 * state and xdp_has_frags flag are recorded. Every later program must match
 * all three recorded properties.
 *
 * Return: true if @fp is compatible (or just became the owner), false
 * otherwise.
 */
bool bpf_prog_map_compatible(struct bpf_map *map,
                             const struct bpf_prog *fp)
{
        enum bpf_prog_type prog_type = resolve_prog_type(fp);
        bool compatible = true;

        /* kprobe-override programs are never accepted. Neither are dev-bound
         * programs: XDP programs inserted into maps are not guaranteed to
         * run on a particular netdev (and can run outside driver context
         * entirely in the case of devmap and cpumap), so until device checks
         * are implemented they are prohibited in program maps.
         */
        if (fp->kprobe_override || bpf_prog_is_dev_bound(fp->aux))
                return false;

        spin_lock(&map->owner.lock);
        if (map->owner.type) {
                compatible = map->owner.type == prog_type &&
                             map->owner.jited == fp->jited &&
                             map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
        } else {
                /* No owner recorded yet, so there is nothing to compare
                 * against: adopt this program's properties.
                 */
                map->owner.type = prog_type;
                map->owner.jited = fp->jited;
                map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
        }
        spin_unlock(&map->owner.lock);

        return compatible;
}

/* Verify that @fp is compatible with every program-holding map it uses.
 *
 * Return: 0 if all such maps accept @fp, -EINVAL at the first one that
 * does not.
 */
static int bpf_check_tail_call(const struct bpf_prog *fp)
{
        struct bpf_prog_aux *aux = fp->aux;
        int err = 0;
        int idx;

        mutex_lock(&aux->used_maps_mutex);
        for (idx = 0; idx < aux->used_map_cnt; idx++) {
                struct bpf_map *map = aux->used_maps[idx];

                /* Only maps that can contain programs are relevant. */
                if (map_type_contains_progs(map) &&
                    !bpf_prog_map_compatible(map, fp)) {
                        err = -EINVAL;
                        break;
                }
        }
        mutex_unlock(&aux->used_maps_mutex);

        return err;
}

static void bpf_prog_select_func(struct bpf_prog *fp)
{
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
        u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);

        fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
#else
        fp->bpf_func = __bpf_prog_ret0_warn;
#endif
}

/**
 *        bpf_prog_select_runtime - pick how a BPF program will be executed
 *        @fp: bpf_prog populated with BPF program
 *        @err: pointer to error variable
 *
 * Attempt to JIT-compile the eBPF program and fall back to the interpreter
 * when no JIT is available. A program that already has bpf_func set skips
 * straight to the final read-only locking and tail call check. The resulting
 * program is run through bpf_prog_run().
 *
 * Return: the &fp argument along with &err set to 0 for success or
 * a negative errno code on failure
 */
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
        /* With BPF to BPF calls the verifier has already done all the
         * JIT-related preparation and bpf_func is set, so only the final
         * steps below remain.
         */
        if (!fp->bpf_func) {
                const bool must_jit = IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) ||
                                      bpf_prog_has_kfunc_call(fp);

                bpf_prog_select_func(fp);

                if (bpf_prog_is_offloaded(fp->aux)) {
                        *err = bpf_prog_offload_compile(fp);
                        if (*err)
                                return fp;
                } else {
                        *err = bpf_prog_alloc_jited_linfo(fp);
                        if (*err)
                                return fp;

                        /* The JIT may rewrite the program when constant
                         * blinding is active. Should blinding fail,
                         * bpf_int_jit_compile() must still hand back a valid
                         * program; it is then simply not JITed and runs on
                         * the interpreter.
                         */
                        fp = bpf_int_jit_compile(fp);
                        bpf_prog_jit_attempt_done(fp);
                        if (must_jit && !fp->jited) {
                                *err = -ENOTSUPP;
                                return fp;
                        }
                }
        }

        *err = bpf_prog_lock_ro(fp);
        if (*err)
                return fp;

        /* Tail call compatibility can only be checked this late: it depends
         * on whether the program ended up JITed or not, and not every eBPF
         * JIT necessarily supports every feature right away.
         */
        *err = bpf_check_tail_call(fp);

        return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);

/* Stub program entry point: ignores both arguments and always returns 1. */
static unsigned int __bpf_prog_ret1(const void *ctx,
                                    const struct bpf_insn *insn)
{
        return 1;
}

/* Placeholder program whose bpf_func is __bpf_prog_ret1(), i.e. it always
 * returns 1. bpf_prog_array_delete_safe() writes its address into an array
 * slot in place of a removed program, and the array length/emptiness/copy
 * helpers skip entries that point at it.
 */
static struct bpf_prog_dummy {
        struct bpf_prog prog;
} dummy_bpf_prog = {
        .prog = {
                .bpf_func = __bpf_prog_ret1,
        },
};

/* Shared, statically allocated program array with no programs.
 * bpf_prog_array_alloc() hands out a pointer to its header for a count of
 * zero, and the free helpers recognise that pointer and leave it alone.
 */
struct bpf_empty_prog_array bpf_empty_prog_array = {
        .null_prog = NULL,
};
EXPORT_SYMBOL(bpf_empty_prog_array);

/* Allocate a zeroed program array with room for @prog_cnt items plus one
 * extra item, using allocation flags @flags.
 *
 * A @prog_cnt of zero allocates nothing and yields the shared
 * bpf_empty_prog_array instead.
 *
 * Return: the array, or whatever kzalloc() returns on failure.
 */
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
        struct bpf_prog_array *p;

        if (!prog_cnt)
                return &bpf_empty_prog_array.hdr;

        p = kzalloc(struct_size(p, items, prog_cnt + 1), flags);

        return p;
}

/* Free @progs through kfree_rcu(). NULL and the shared
 * bpf_empty_prog_array are left untouched.
 */
void bpf_prog_array_free(struct bpf_prog_array *progs)
{
        if (progs && progs != &bpf_empty_prog_array.hdr)
                kfree_rcu(progs, rcu);
}

/* Callback queued by bpf_prog_array_free_sleepable(): releases the
 * bpf_prog_array that embeds @rcu.
 */
static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
{
        struct bpf_prog_array *progs =
                container_of(rcu, struct bpf_prog_array, rcu);

        /* When an RCU Tasks Trace grace period also implies a regular RCU
         * grace period, the array can be freed immediately with kfree();
         * otherwise it goes through kfree_rcu().
         */
        if (!rcu_trace_implies_rcu_gp()) {
                kfree_rcu(progs, rcu);
                return;
        }

        kfree(progs);
}

/* Queue @progs for freeing via call_rcu_tasks_trace(). NULL and the shared
 * bpf_empty_prog_array are left untouched.
 */
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
{
        if (progs && progs != &bpf_empty_prog_array.hdr)
                call_rcu_tasks_trace(&progs->rcu,
                                     __bpf_prog_array_free_sleepable_cb);
}

/* Count the programs in @array, walking items until the first NULL prog and
 * ignoring slots that hold the dummy placeholder program.
 *
 * Return: number of non-dummy programs.
 */
int bpf_prog_array_length(struct bpf_prog_array *array)
{
        struct bpf_prog_array_item *cur = array->items;
        u32 nr_progs = 0;

        while (cur->prog) {
                if (cur->prog != &dummy_bpf_prog.prog)
                        nr_progs++;
                cur++;
        }

        return nr_progs;
}

/* Tell whether @array holds no programs other than dummy placeholders.
 *
 * Return: false as soon as a non-dummy program is found before the
 * terminating NULL prog, true otherwise.
 */
bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
{
        struct bpf_prog_array_item *cur = array->items;

        while (cur->prog) {
                if (cur->prog != &dummy_bpf_prog.prog)
                        return false;
                cur++;
        }

        return true;
}

/* Store the ids of the non-dummy programs of @array into @prog_ids, stopping
 * once @request_cnt ids have been written. Returns true when the walk ended
 * on a slot other than the NULL terminator, i.e. entries were left over
 * after the last id that fit.
 */
static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
                                     u32 *prog_ids,
                                     u32 request_cnt)
{
        struct bpf_prog_array_item *cur = array->items;
        int copied = 0;

        while (cur->prog) {
                if (cur->prog != &dummy_bpf_prog.prog) {
                        prog_ids[copied++] = cur->prog->aux->id;
                        if (copied == request_cnt) {
                                /* Step past the slot just copied so the
                                 * return value reflects what follows it.
                                 */
                                cur++;
                                break;
                        }
                }
                cur++;
        }

        return cur->prog != NULL;
}

/* Copy up to @cnt program ids from @array into the user buffer @prog_ids.
 *
 * Returns 0 on success, -ENOMEM if the bounce buffer cannot be allocated,
 * -EFAULT if the copy to user space fails and -ENOSPC if the array holds
 * more entries than fit into @cnt ids.
 */
int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
                                __u32 __user *prog_ids, u32 cnt)
{
        bool truncated;
        u32 *kbuf;
        int ret = 0;

        /* Callers first obtain cnt from bpf_prog_array_length() and only
         * call in here when it is greater than zero, so kcalloc() is not
         * guarded by an extra cnt > 0 check.
         */
        kbuf = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
        if (!kbuf)
                return -ENOMEM;

        truncated = bpf_prog_array_copy_core(array, kbuf, cnt);

        /* A failed copy takes precedence over the "no space" indication. */
        if (copy_to_user(prog_ids, kbuf, cnt * sizeof(u32)))
                ret = -EFAULT;
        else if (truncated)
                ret = -ENOSPC;

        kfree(kbuf);
        return ret;
}

/* Replace the first slot of @array that holds @old_prog with the dummy
 * placeholder program. The array is modified in place; nothing happens if
 * @old_prog is not present.
 */
void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
                                struct bpf_prog *old_prog)
{
        struct bpf_prog_array_item *cur;

        for (cur = array->items; cur->prog; cur++) {
                if (cur->prog != old_prog)
                        continue;
                WRITE_ONCE(cur->prog, &dummy_bpf_prog.prog);
                return;
        }
}

/**
 * bpf_prog_array_delete_safe_at() - Replaces the program at the given
 *                                   index into the program array with
 *                                   a dummy no-op program.
 * @array: a bpf_prog_array
 * @index: the index of the program to replace
 *
 * Skips over dummy programs, by not counting them, when calculating
 * the position of the program to replace.
 *
 * Implemented as bpf_prog_array_update_at() with the dummy program as the
 * replacement, so the return values are those of that function.
 *
 * Return:
 * * 0       - Success
 * * -EINVAL - Invalid index value. Must be a non-negative integer.
 * * -ENOENT - Index out of range
 */
int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index)
{
        return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog);
}

/**
 * bpf_prog_array_update_at() - Store a program into the slot found at the
 *                              given position of the program array.
 * @array: a bpf_prog_array
 * @index: position of the slot to overwrite, counted over non-dummy slots
 * @prog: the program to store into that slot
 *
 * Slots holding the dummy program are stepped over and do not consume
 * an index position.
 *
 * Return:
 * * 0       - Success
 * * -EINVAL - @index is negative
 * * -ENOENT - fewer than @index + 1 non-dummy programs are in the array
 */
int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
                             struct bpf_prog *prog)
{
        struct bpf_prog_array_item *cur;
        int remaining = index;

        if (unlikely(remaining < 0))
                return -EINVAL;

        for (cur = array->items; cur->prog; cur++) {
                if (cur->prog == &dummy_bpf_prog.prog)
                        continue;
                if (remaining-- == 0) {
                        WRITE_ONCE(cur->prog, prog);
                        return 0;
                }
        }
        return -ENOENT;
}

/* Build a new program array from @old_array (which may be NULL), dropping
 * @exclude_prog and dummy slots and appending @include_prog with
 * @bpf_cookie when it is non-NULL.
 *
 * On success 0 is returned and *@new_array is either the new array or NULL
 * when no program would be left in it. Errors:
 *   -EEXIST  @include_prog is already present in @old_array
 *   -ENOENT  @exclude_prog was given but not found in @old_array
 *   -ENOMEM  the new array could not be allocated
 */
int bpf_prog_array_copy(struct bpf_prog_array *old_array,
                        struct bpf_prog *exclude_prog,
                        struct bpf_prog *include_prog,
                        u64 bpf_cookie,
                        struct bpf_prog_array **new_array)
{
        struct bpf_prog_array_item *src, *dst;
        struct bpf_prog_array *copy;
        bool exclude_seen = false;
        int kept = 0, total;

        /* First pass: count the programs that survive into the new array
         * and detect a duplicate of the program to be added.
         */
        if (old_array) {
                for (src = old_array->items; src->prog; src++) {
                        if (src->prog == exclude_prog) {
                                exclude_seen = true;
                        } else {
                                if (src->prog != &dummy_bpf_prog.prog)
                                        kept++;
                                if (src->prog == include_prog)
                                        return -EEXIST;
                        }
                }
        }

        if (exclude_prog && !exclude_seen)
                return -ENOENT;

        /* Number of non-NULL programs the new array will hold. */
        total = kept + (include_prog ? 1 : 0);
        if (total == 0) {
                /* Nothing left: report "no array" rather than an empty one. */
                *new_array = NULL;
                return 0;
        }

        /* One more slot for the NULL that terminates the array. */
        copy = bpf_prog_array_alloc(total + 1, GFP_KERNEL);
        if (!copy)
                return -ENOMEM;
        dst = copy->items;

        /* Second pass: carry over the surviving programs and cookies. */
        if (kept) {
                for (src = old_array->items; src->prog; src++) {
                        if (src->prog != exclude_prog &&
                            src->prog != &dummy_bpf_prog.prog) {
                                dst->prog = src->prog;
                                dst->bpf_cookie = src->bpf_cookie;
                                dst++;
                        }
                }
        }

        if (include_prog) {
                dst->prog = include_prog;
                dst->bpf_cookie = bpf_cookie;
                dst++;
        }

        dst->prog = NULL;
        *new_array = copy;
        return 0;
}

/* Report the number of programs in @array through @prog_cnt (0 for a NULL
 * array) and, when @request_cnt is non-zero and programs exist, store up to
 * @request_cnt program ids into @prog_ids.
 *
 * Returns 0 on success or -ENOSPC when not every entry fit.
 */
int bpf_prog_array_copy_info(struct bpf_prog_array *array,
                             u32 *prog_ids, u32 request_cnt,
                             u32 *prog_cnt)
{
        u32 cnt = array ? bpf_prog_array_length(array) : 0;

        *prog_cnt = cnt;

        /* Caller asked for the count only, or there is nothing to copy. */
        if (!request_cnt || !cnt)
                return 0;

        /* this function is called under trace/bpf_trace.c: bpf_event_mutex */
        if (bpf_prog_array_copy_core(array, prog_ids, request_cnt))
                return -ENOSPC;
        return 0;
}

/* Drop the references that the program owning @aux holds on the first @len
 * entries of @used_maps. For each map: call its map_poke_untrack op if it
 * has one, decrement sleepable_refcnt if the program is sleepable, then
 * put the map.
 */
void __bpf_free_used_maps(struct bpf_prog_aux *aux,
                          struct bpf_map **used_maps, u32 len)
{
        bool sleepable = aux->prog->sleepable;
        u32 i;

        for (i = 0; i < len; i++) {
                struct bpf_map *map = used_maps[i];

                if (map->ops->map_poke_untrack)
                        map->ops->map_poke_untrack(map, aux);
                if (sleepable)
                        atomic64_dec(&map->sleepable_refcnt);
                bpf_map_put(map);
        }
}

/* Release every map recorded in aux->used_maps (used_map_cnt entries) and
 * then free the used_maps array itself.
 */
static void bpf_free_used_maps(struct bpf_prog_aux *aux)
{
        __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt);
        kfree(aux->used_maps);
}

/* Drop the references held by the first @len entries of @used_btfs: the
 * module reference (when the entry has a module) and the BTF reference.
 * Compiles to an empty function without CONFIG_BPF_SYSCALL.
 */
void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
                          struct btf_mod_pair *used_btfs, u32 len)
{
#ifdef CONFIG_BPF_SYSCALL
        u32 i = 0;

        while (i < len) {
                struct btf_mod_pair *pair = &used_btfs[i++];

                if (pair->module)
                        module_put(pair->module);
                btf_put(pair->btf);
        }
#endif
}

/* Release every entry recorded in aux->used_btfs (used_btf_cnt entries) and
 * then free the used_btfs array itself.
 */
static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
{
        __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt);
        kfree(aux->used_btfs);
}

/* Work callback scheduled by bpf_prog_free(). Recovers the bpf_prog_aux
 * that embeds @work and releases what the program holds, in this order:
 * kfunc BTF table, cgroup attach type, used maps, used BTFs, device-bound
 * state, callchain buffers, destination trampoline, and finally the
 * subprograms and the program itself.
 */
static void bpf_prog_free_deferred(struct work_struct *work)
{
        struct bpf_prog_aux *aux;
        int i;

        aux = container_of(work, struct bpf_prog_aux, work);
#ifdef CONFIG_BPF_SYSCALL
        bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
#endif
#ifdef CONFIG_CGROUP_BPF
        /* Only put the attach type if one was actually taken. */
        if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
                bpf_cgroup_atype_put(aux->cgroup_atype);
#endif
        bpf_free_used_maps(aux);
        bpf_free_used_btfs(aux);
        if (bpf_prog_is_dev_bound(aux))
                bpf_prog_dev_bound_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
        if (aux->prog->has_callchain_buf)
                put_callchain_buffers();
#endif
        if (aux->dst_trampoline)
                bpf_trampoline_put(aux->dst_trampoline);
        for (i = 0; i < aux->real_func_cnt; i++) {
                /* We can just unlink the subprog poke descriptor table as
                 * it was originally linked to the main program and is also
                 * released along with it.
                 */
                aux->func[i]->aux->poke_tab = NULL;
                bpf_jit_free(aux->func[i]);
        }
        /* With subprograms present, each one was freed via bpf_jit_free()
         * above and the main program goes through bpf_prog_unlock_free();
         * otherwise the main program itself is handed to bpf_jit_free().
         */
        if (aux->real_func_cnt) {
                kfree(aux->func);
                bpf_prog_unlock_free(aux->prog);
        } else {
                bpf_jit_free(aux->prog);
        }
}

/* Start freeing @fp: drop the reference on its destination program (if
 * any) and on its token, then hand the remaining teardown to
 * bpf_prog_free_deferred() via schedule_work().
 */
void bpf_prog_free(struct bpf_prog *fp)
{
        struct bpf_prog_aux *prog_aux = fp->aux;
        struct bpf_prog *dst = prog_aux->dst_prog;

        if (dst)
                bpf_prog_put(dst);
        bpf_token_put(prog_aux->token);

        INIT_WORK(&prog_aux->work, bpf_prog_free_deferred);
        schedule_work(&prog_aux->work);
}
EXPORT_SYMBOL_GPL(bpf_prog_free);

/* RNG for unprivileged user space with separated state from prandom_u32(). */
static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);

/* Initialise the per-CPU bpf_user_rnd_state through prandom_init_once(). */
void bpf_user_rnd_init_once(void)
{
        prandom_init_once(&bpf_user_rnd_state);
}

BPF_CALL_0(bpf_user_rnd_u32)
{
        /* Should someone ever have the rather unwise idea to use some
         * of the registers passed into this function, then note that
         * this function is called from native eBPF and classic-to-eBPF
         * transformations. Register assignments from both sides are
         * different, e.g. classic always sets fn(ctx, A, X) here.
         */
        struct rnd_state *cpu_state = &get_cpu_var(bpf_user_rnd_state);
        u32 value = prandom_u32_state(cpu_state);

        /* Pairs with get_cpu_var() above. */
        put_cpu_var(bpf_user_rnd_state);

        return value;
}

/* Helper taking no arguments that returns raw_smp_processor_id(). */
BPF_CALL_0(bpf_get_raw_cpu_id)
{
        return raw_smp_processor_id();
}

/* Weak definitions of helper functions in case we don't have bpf syscall.
 * Each is a zero-initialised bpf_func_proto marked __weak, so a non-weak
 * definition elsewhere replaces it at link time.
 */
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_map_push_elem_proto __weak;
const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak;
const struct bpf_func_proto bpf_spin_lock_proto __weak;
const struct bpf_func_proto bpf_spin_unlock_proto __weak;
const struct bpf_func_proto bpf_jiffies64_proto __weak;

const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak;

const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_local_storage_proto __weak;
const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
const struct bpf_func_proto bpf_set_retval_proto __weak;
const struct bpf_func_proto bpf_get_retval_proto __weak;

/* Weak fallback: no trace_printk helper proto is available, so return NULL. */
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
        return NULL;
}

/* Weak fallback: no trace_vprintk helper proto is available, so return NULL. */
const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
{
        return NULL;
}

/* Weak fallback that ignores all arguments and reports -ENOTSUPP (converted
 * to the u64 return type).
 */
u64 __weak
bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
                 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
        return -ENOTSUPP;
}
EXPORT_SYMBOL_GPL(bpf_event_output);

/* Always built-in helper functions. */
/* Prototype of the tail call helper: arguments are a context pointer, a
 * constant map pointer and an unconstrained value; nothing is returned.
 * NOTE(review): .func is NULL, so presumably the call is handled by the
 * interpreter/JIT instead of being dispatched as a function - confirm.
 */
const struct bpf_func_proto bpf_tail_call_proto = {
        .func                = NULL,
        .gpl_only        = false,
        .ret_type        = RET_VOID,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
};

/* Stub for JITs that only support cBPF. eBPF programs are interpreted.
 * It is encouraged to implement bpf_int_jit_compile() instead, so that
 * eBPF and implicitly also cBPF can get JITed!
 *
 * This weak default performs no compilation and returns @prog unchanged.
 */
struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
{
        return prog;
}

/* Stub for JITs that support eBPF. All cBPF code gets transformed into
 * eBPF by the kernel and is later compiled by bpf_int_jit_compile().
 *
 * This weak default does nothing.
 */
void __weak bpf_jit_compile(struct bpf_prog *prog)
{
}

/* Weak default: reports false for every @func.
 * NOTE(review): presumably overridden where packet-modifying helpers are
 * built in - confirm against the non-weak definition.
 */
bool __weak bpf_helper_changes_pkt_data(void *func)
{
        return false;
}

/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
 * analysis code and wants explicit zero extension inserted by verifier.
 * Otherwise, return FALSE.
 *
 * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if
 * you don't override this. JITs that don't want these extra insns can detect
 * them using insn_is_zext.
 *
 * This weak default returns false.
 */
bool __weak bpf_jit_needs_zext(void)
{
        return false;
}

/* Return true if the JIT inlines the call to the helper corresponding to
 * the imm.
 *
 * The verifier will not patch the insn->imm for the call to the helper if
 * this returns true.
 *
 * This weak default returns false for every @imm.
 */
bool __weak bpf_jit_inlines_helper_call(s32 imm)
{
        return false;
}

/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls.
 * This weak default returns false.
 */
bool __weak bpf_jit_supports_subprog_tailcalls(void)
{
        return false;
}

/* Weak default: returns false.
 * NOTE(review): by its name this reports JIT support for per-CPU
 * instructions - confirm the exact meaning against the callers.
 */
bool __weak bpf_jit_supports_percpu_insn(void)
{
        return false;
}

/* Weak default: returns false.
 * NOTE(review): by its name this reports JIT support for kfunc calls -
 * confirm against the callers.
 */
bool __weak bpf_jit_supports_kfunc_call(void)
{
        return false;
}

/* Weak default: returns false.
 * NOTE(review): by its name this reports JIT support for "far" kfunc
 * calls - confirm what "far" means against the callers.
 */
bool __weak bpf_jit_supports_far_kfunc_call(void)
{
        return false;
}

/* Weak default: returns false.
 * NOTE(review): by its name this reports JIT support for BPF arena -
 * confirm against the callers.
 */
bool __weak bpf_jit_supports_arena(void)
{
        return false;
}

/* Weak default: returns false for every @insn, whatever @in_arena is.
 * NOTE(review): by its name this reports whether the JIT can handle a
 * given instruction - confirm against the callers.
 */
bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
{
        return false;
}

/* Weak default: returns TASK_SIZE on 64-bit configurations that also set
 * CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE, and 0 otherwise.
 */
u64 __weak bpf_arch_uaddress_limit(void)
{
#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
        return TASK_SIZE;
#else
        return 0;
#endif
}

/* Return TRUE if the JIT backend satisfies the following two conditions:
 * 1) JIT backend supports atomic_xchg() on pointer-sized words.
 * 2) Under the specific arch, the implementation of xchg() is the same
 *    as atomic_xchg() on pointer-sized words.
 *
 * This weak default returns false.
 */
bool __weak bpf_jit_supports_ptr_xchg(void)
{
        return false;
}

/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
 * skb_copy_bits(), so provide a weak definition of it for NET-less config.
 *
 * The weak version copies nothing and always fails with -EFAULT.
 */
int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
                         int len)
{
        return -EFAULT;
}

/* Weak default: text poking is not available, always returns -ENOTSUPP. */
int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
                              void *addr1, void *addr2)
{
        return -ENOTSUPP;
}

/* Weak default: copies nothing and returns ERR_PTR(-ENOTSUPP). */
void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
{
        return ERR_PTR(-ENOTSUPP);
}

/* Weak default: invalidates nothing and returns -ENOTSUPP. */
int __weak bpf_arch_text_invalidate(void *dst, size_t len)
{
        return -ENOTSUPP;
}

/* Weak default: returns false.
 * NOTE(review): by its name this reports JIT support for BPF exceptions -
 * confirm against the callers.
 */
bool __weak bpf_jit_supports_exceptions(void)
{
        return false;
}

/* Weak default: walks nothing, so @consume_fn is never invoked. */
void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{
}

/* for configs without MMU or 32-bit */
/* Weak fallbacks: a zero-initialised arena_map_ops and two accessors that
 * report 0 as the user and kernel VM start of any @arena.
 */
__weak const struct bpf_map_ops arena_map_ops;
__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
{
        return 0;
}
__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
{
        return 0;
}

#ifdef CONFIG_BPF_SYSCALL
/* Late initcall: set up the global BPF memory allocator bpf_global_ma and
 * record in bpf_global_ma_set whether that succeeded. Returns the result of
 * bpf_mem_alloc_init().
 */
static int __init bpf_global_ma_init(void)
{
        int err = bpf_mem_alloc_init(&bpf_global_ma, 0, false);

        bpf_global_ma_set = !err;
        return err;
}
late_initcall(bpf_global_ma_init);
#endif

/* Static key that starts out false; exported for use outside this file. */
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);

/* All definitions of tracepoints related to BPF. */
#define CREATE_TRACE_POINTS
#include <linux/bpf_trace.h>

/* Make the xdp_exception and xdp_bulk_tx tracepoints available to GPL
 * modules.
 */
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);



























    5 






















































    2 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BIT_SPINLOCK_H
#define __LINUX_BIT_SPINLOCK_H

#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/atomic.h>
#include <linux/bug.h>

/*
 *  bit-based spin_lock()
 *
 * Acquire the lock represented by bit @bitnum of the word at @addr.
 * Returns with preemption disabled. When neither CONFIG_SMP nor
 * CONFIG_DEBUG_SPINLOCK is set, the bit itself is not touched and only
 * preemption is disabled.
 *
 * Don't use this unless you really need to: spin_lock() and spin_unlock()
 * are significantly faster.
 */
static inline void bit_spin_lock(int bitnum, unsigned long *addr)
{
        /*
         * Assuming the lock is uncontended, this never enters
         * the body of the outer loop. If it is contended, then
         * within the inner loop a non-atomic test is used to
         * busywait with less bus contention for a good time to
         * attempt to acquire the lock bit.
         */
        preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
                /* Allow preemption while waiting for the bit to clear. */
                preempt_enable();
                do {
                        cpu_relax();
                } while (test_bit(bitnum, addr));
                preempt_disable();
        }
#endif
        __acquire(bitlock);
}

/*
 * Single attempt to take the bit lock at bit @bitnum of @addr.
 *
 * Return true if it was acquired; preemption is then left disabled.
 * Returns 0 with preemption re-enabled if the bit was already set.
 * When neither CONFIG_SMP nor CONFIG_DEBUG_SPINLOCK is set the attempt
 * always succeeds without touching the bit.
 */
static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
{
        preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        if (unlikely(test_and_set_bit_lock(bitnum, addr))) {
                preempt_enable();
                return 0;
        }
#endif
        __acquire(bitlock);
        return 1;
}

/*
 *  bit-based spin_unlock()
 *
 * Clears bit @bitnum of @addr with clear_bit_unlock() (SMP or
 * DEBUG_SPINLOCK builds only) and re-enables preemption. With
 * CONFIG_DEBUG_SPINLOCK, hits BUG_ON() if the bit was not set.
 */
static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
        BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        clear_bit_unlock(bitnum, addr);
#endif
        preempt_enable();
        __release(bitlock);
}

/*
 *  bit-based spin_unlock()
 *  non-atomic version, which can be used eg. if the bit lock itself is
 *  protecting the rest of the flags in the word.
 *
 * Same sequence as bit_spin_unlock() except that the bit is cleared with
 * __clear_bit_unlock().
 */
static inline void __bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
        BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        __clear_bit_unlock(bitnum, addr);
#endif
        preempt_enable();
        __release(bitlock);
}

/*
 * Return true if the lock is held.
 *
 * On SMP or DEBUG_SPINLOCK builds this tests bit @bitnum of @addr. Without
 * those, the bit is never set by the lock functions, so the answer is
 * derived from preempt_count() when CONFIG_PREEMPT_COUNT is available and
 * is unconditionally 1 otherwise.
 */
static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
{
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        return test_bit(bitnum, addr);
#elif defined CONFIG_PREEMPT_COUNT
        return preempt_count();
#else
        return 1;
#endif
}

#endif /* __LINUX_BIT_SPINLOCK_H */





























































    2 
    2 









    1 









    2 






























    1 








    2 
    2 


    1 









    1 
    2 
    2 


























































    2 













    2 



    1 


    2 



    2 







    2 
    2 
    2 










    1 









    2 











    2 

    2 

    2 












    2 


    2 







    2 










    2 






















    2 


    2 
    2 












    1 
    1 





    2 






    2 




































































































































































































































































































































































































































































































































































































































































































































































    2 
















    1 






    1 
    1 
    1 
    1 

    1 








    1 








    1 
    1 















    2 

    1 
    2 
    2 

    2 


























    2 

















































    1 






    2 


























































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        IPv6 output functions
 *        Linux INET6 implementation
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 *
 *        Based on linux/net/ipv4/ip_output.c
 *
 *        Changes:
 *        A.N.Kuznetsov        :        arithmetic in fragmentation.
 *                                extension headers are implemented.
 *                                route changes now work.
 *                                ip6_forward does not confuse sniffers.
 *                                etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *        Imran Patel        :        frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                        :       add ip6_append_data and related functions
 *                                for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/gso.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

/*
 * Last IPv6 output step: hand @skb to the neighbour layer of the route's
 * egress device.
 *
 * Along the way this function:
 *  - grows the headroom when the link-layer header would not fit,
 *  - loops a clone of a multicast packet back to local listeners and drops
 *    multicast packets that must not leave the node,
 *  - lets a lightweight tunnel take over transmission when the route asks
 *    for it,
 *  - resolves (or creates) the neighbour entry for the next hop.
 *
 * Returns the result of neigh_output() or lwtunnel_xmit(), 0 when a
 * multicast packet was deliberately discarded, -ENOMEM when the headroom
 * could not be expanded, and -EINVAL when no neighbour entry could be
 * created.  @skb is consumed on every path.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct inet6_dev *idev = ip6_dst_idev(dst);
        unsigned int hh_len = LL_RESERVED_SPACE(dev);
        const struct in6_addr *daddr, *nexthop;
        struct ipv6hdr *hdr;
        struct neighbour *neigh;
        int ret;

        /* Be paranoid, rather than too clever. */
        if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
                /* When skb_expand_head() fails the skb is gone, and with it
                 * the reference that kept dst (and therefore idev) around.
                 * Stay inside an RCU read-side section so that idev is
                 * still valid for the statistics update below.
                 */
                rcu_read_lock();
                skb = skb_expand_head(skb, hh_len);
                if (!skb) {
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                        rcu_read_unlock();
                        return -ENOMEM;
                }
                rcu_read_unlock();
        }

        hdr = ipv6_hdr(skb);
        daddr = &hdr->daddr;
        if (ipv6_addr_is_multicast(daddr)) {
                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_is_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        /* A zero hop limit only reaches local listeners:
                         * the clone was looped back, the original is dropped.
                         */
                        if (hdr->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
                /* Node-local (or narrower) scope must not leave through a
                 * non-loopback device.
                 */
                if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                /* Anything but CONTINUE means the tunnel dealt with the skb */
                if (res != LWTUNNEL_XMIT_CONTINUE)
                        return res;
        }

        IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

        /* The neighbour entry is looked up without taking a reference, so it
         * may only be used inside this RCU read-side section.
         */
        rcu_read_lock();
        nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
        neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

        if (unlikely(IS_ERR_OR_NULL(neigh))) {
                if (unlikely(!neigh))
                        neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
                if (IS_ERR(neigh)) {
                        rcu_read_unlock();
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
                        kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
                        return -EINVAL;
                }
        }
        sock_confirm_neigh(skb, neigh);
        ret = neigh_output(neigh, skb, false);
        rcu_read_unlock();
        return ret;
}

/*
 * Slow path for a GSO skb whose segments do not pass the network-length
 * check against @mtu: segment it in software and transmit every resulting
 * segment on its own, fragmenting only those that are still larger than
 * @mtu.
 *
 * Returns 0 on success, -ENOMEM when segmentation fails, otherwise the
 * first non-zero error reported while sending a segment (later segments
 * are still attempted).
 */
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
                                    struct sk_buff *skb, unsigned int mtu)
{
        struct sk_buff *seg, *next_seg;
        netdev_features_t feats;
        int first_err = 0;

        /* Please see corresponding comment in ip_finish_output_gso
         * describing the cases where GSO segment length exceeds the
         * egress MTU.
         */
        feats = netif_skb_features(skb);
        feats &= ~NETIF_F_GSO_MASK;

        seg = skb_gso_segment(skb, feats);
        if (IS_ERR_OR_NULL(seg)) {
                kfree_skb(skb);
                return -ENOMEM;
        }

        /* The segment list replaces the original skb from here on. */
        consume_skb(skb);

        skb_list_walk_safe(seg, seg, next_seg) {
                int rc;

                skb_mark_not_on_list(seg);

                /* Last GSO segment can be smaller than gso_size (and MTU).
                 * Adding a fragment header would produce an "atomic fragment",
                 * which is considered harmful (RFC-8021). Avoid that.
                 */
                if (seg->len > mtu)
                        rc = ip6_fragment(net, sk, seg, ip6_finish_output2);
                else
                        rc = ip6_finish_output2(net, sk, seg);

                /* Remember only the first failure. */
                if (rc && !first_err)
                        first_err = rc;
        }

        return first_err;
}

/*
 * Output a GSO skb.  It goes straight to ip6_finish_output2() when it is
 * flagged IP6SKB_FAKEJUMBO or when its segments pass the network-length
 * validation against @mtu; otherwise it takes the software segmentation
 * slow path.
 */
static int ip6_finish_output_gso(struct net *net, struct sock *sk,
                                 struct sk_buff *skb, unsigned int mtu)
{
        /* The flag test comes first so that validation is skipped for
         * fake-jumbo packets.
         */
        if ((IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) ||
            skb_gso_validate_network_len(skb, mtu))
                return ip6_finish_output2(net, sk, skb);

        return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
}

static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm) {
                IP6CB(skb)->flags |= IP6SKB_REROUTED;
                return dst_output(net, sk, skb);
        }
#endif

        mtu = ip6_skb_dst_mtu(skb);
        if (skb_is_gso(skb))
                return ip6_finish_output_gso(net, sk, skb, mtu);

        if (skb->len > mtu ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);

        return ip6_finish_output2(net, sk, skb);
}

/*
 * Run the cgroup BPF egress program on @skb and, if it lets the packet
 * through, continue with __ip6_finish_output().
 *
 * Any verdict other than NET_XMIT_SUCCESS or NET_XMIT_CN drops the packet
 * and is returned unchanged.  When the packet is allowed, an error from
 * __ip6_finish_output() takes precedence; otherwise the BPF verdict is
 * returned.
 */
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int verdict, err;

        verdict = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        if (verdict != NET_XMIT_SUCCESS && verdict != NET_XMIT_CN) {
                kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
                return verdict;
        }

        err = __ip6_finish_output(net, sk, skb);
        if (err)
                return err;

        return verdict;
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, indev, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);

/*
 * Tell whether automatic flow labels apply to @sk: the socket's own
 * AUTOFLOWLABEL bit when AUTOFLOWLABEL_SET is set on it, otherwise the
 * per-namespace default.
 */
bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
{
        if (inet6_test_bit(AUTOFLOWLABEL_SET, sk))
                return inet6_test_bit(AUTOFLOWLABEL, sk);

        return ip6_default_np_autolabel(net);
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 *
 * Pushes the extension headers described by @opt (if any), a jumbo
 * hop-by-hop header when the payload exceeds IPV6_MAXPLEN, and the IPv6
 * header built from @fl6, @tclass and the socket's hop limit.  @mark and
 * @priority are stored in the skb.
 *
 * If the packet fits the route MTU, or ignore_df is set, or it is a GSO
 * skb, it goes through l3mdev_ip6_out() and the LOCAL_OUT netfilter hook to
 * dst_output().  Otherwise EMSGSIZE is reported through ipv6_local_error()
 * and the skb is freed.
 *
 * Returns the hook/dst_output() result, 0 when l3mdev_ip6_out() returned
 * no skb, -ENOBUFS when the headroom could not be expanded, and -EMSGSIZE
 * when the packet is too big.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct inet6_dev *idev = ip6_dst_idev(dst);
        struct hop_jumbo_hdr *hop_jumbo;
        int hoplen = sizeof(*hop_jumbo);
        unsigned int head_room;
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        /* Worst case: IPv6 header, jumbo header, link-layer header and all
         * extension headers from @opt.
         */
        head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
        if (opt)
                head_room += opt->opt_nflen + opt->opt_flen;

        if (unlikely(head_room > skb_headroom(skb))) {
                /* When skb_expand_head() fails the skb is gone, and with it
                 * the reference that kept dst (and therefore idev) around.
                 * Stay inside an RCU read-side section so that idev is
                 * still valid for the statistics update below.
                 */
                rcu_read_lock();
                skb = skb_expand_head(skb, head_room);
                if (!skb) {
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                        rcu_read_unlock();
                        return -ENOBUFS;
                }
                rcu_read_unlock();
        }

        if (opt) {
                seg_len += opt->opt_nflen + opt->opt_flen;

                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);

                /* May update first_hop (passed by reference). */
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        if (unlikely(seg_len > IPV6_MAXPLEN)) {
                /* Payload does not fit the 16-bit payload_len field: carry
                 * the real length in a jumbo hop-by-hop option and leave
                 * payload_len at zero.
                 */
                hop_jumbo = skb_push(skb, hoplen);

                hop_jumbo->nexthdr = proto;
                hop_jumbo->hdrlen = 0;
                hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
                hop_jumbo->tlv_len = 4;
                hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);

                proto = IPPROTO_HOPOPTS;
                seg_len = 0;
                IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *        Fill in the IPv6 header
         */
        if (np)
                hlimit = READ_ONCE(np->hop_limit);
        /* A negative socket hop limit falls back to the route's value. */
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                ip6_autoflowlabel(net, sk), fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

                /* if egress device is enslaved to an L3 master device pass the
                 * skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume socket lock is held.
                 * we promote our socket to non const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dev,
                               dst_output);
        }

        skb->dev = dev;
        /* ipv6_local_error() does not require socket lock,
         * we promote our socket to non const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 * Deliver @skb to every socket on the router-alert chain registered for
 * selector @sel.  A socket bound to a device only matches packets whose
 * skb->dev is that device, and a socket with RTALERT_ISOLATE set only
 * matches packets from its own network namespace.
 *
 * Every matching socket except the last one receives a clone; the last one
 * receives @skb itself.  Returns 1 when @skb was handed to a socket, 0 when
 * nothing matched (the caller then still owns @skb).
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct sock *last = NULL;
        struct ip6_ra_chain *ra;
        int delivered = 0;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;

                if (!sk || ra->sel != sel)
                        continue;

                if (sk->sk_bound_dev_if &&
                    sk->sk_bound_dev_if != skb->dev->ifindex)
                        continue;

                if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
                    !net_eq(sock_net(sk), dev_net(skb->dev)))
                        continue;

                /* Delivery lags one match behind, so that the final match
                 * can take the original skb instead of a clone.
                 */
                if (last) {
                        struct sk_buff *copy = skb_clone(skb, GFP_ATOMIC);

                        if (copy)
                                rawv6_rcv(last, copy);
                }
                last = sk;
        }

        if (last) {
                rawv6_rcv(last, skb);
                delivered = 1;
        }
        read_unlock(&ip6_ra_lock);

        return delivered;
}

/*
 * Decide what to do with a packet addressed to a proxied neighbour.
 *
 * Returns:
 *    1  the packet is an ICMPv6 neighbour-discovery message (RS, RA, NS, NA
 *       or redirect) and should go to the local input path;
 *   -1  the destination is link-local and cannot be forwarded; the sender
 *       has been notified through dst_link_failure() and the caller should
 *       drop the packet;
 *    0  otherwise, including when the extension headers cannot be skipped
 *       or the ICMPv6 type byte cannot be pulled.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                /* Only the first byte (the type) is needed in the linear area */
                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                /* pskb_may_pull() may have reallocated the skb head, so the
                 * header pointer cached at function entry must be reloaded
                 * before it is dereferenced again below.
                 */
                hdr = ipv6_hdr(skb);
                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* For reaction involving unicast neighbor discovery
                         * message destined to the proxied address, pass it to
                         * input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
        if (skb->offload_l3_fwd_mark) {
                consume_skb(skb);
                return 0;
        }
#endif

        skb_clear_tstamp(skb);
        return dst_output(net, sk, skb);
}

/*
 * Tell whether a packet being forwarded is too large for @mtu.
 *
 * A packet no longer than @mtu always fits.  A longer one is too big when
 * its recorded frag_max_size exceeds @mtu; failing that, it is let through
 * when ignore_df is set or when it is a GSO skb whose segments pass the
 * network-length validation.  Everything else is too big.
 */
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        unsigned int max_frag;

        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        max_frag = IP6CB(skb)->frag_max_size;
        /* mtu is unsigned, so max_frag > mtu already implies max_frag != 0 */
        if (max_frag > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (!skb_is_gso(skb))
                return true;

        return !skb_gso_validate_network_len(skb, mtu);
}

/*
 * Forward an IPv6 packet that was routed to another host.
 *
 * The packet is dropped unless forwarding is enabled, it was addressed to
 * this host at link level (PACKET_HOST), it has no socket attached, it is
 * not an LRO aggregate and it passes the xfrm forward policy.  Router-alert
 * packets are offered to the RA socket chain first.  A hop limit of 1 or
 * less produces an ICMPv6 time-exceeded error.  Packets for proxied
 * neighbours may be diverted to ip6_input().  A redirect is sent when the
 * packet leaves through the interface it arrived on; otherwise the source
 * address is sanity checked.  Packets larger than the forwarding MTU
 * produce an ICMPv6 packet-too-big error.  Finally the hop limit is
 * decremented and the packet goes through the FORWARD netfilter hook.
 *
 * Returns 0 when the RA chain took the packet, the result of ip6_input() or
 * of the netfilter hook, -ETIMEDOUT, -EMSGSIZE, or -EINVAL when the packet
 * was dropped.  @skb is consumed on every path.
 */
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        struct inet6_dev *idev;
        SKB_DR(reason);
        u32 mtu;

        /* Input-side statistics are charged to the receiving interface */
        idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
        if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
            (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
            !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *        We DO NOT make any processing on
         *        RA packets, pushing them to user level AS IS
         *        without any WARRANTY that application will be able
         *        to interpret them. The reason is that we
         *        cannot make anything clever here.
         *
         *        We are not end-node, so that if packet contains
         *        AH/ESP, we cannot make anything.
         *        Defragmentation also would be mistake, RA packets
         *        cannot be fragmented, because there is no warranty
         *        that different fragments will go along one path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *        check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

                kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0) {
                        /* It's tempting to decrease the hop limit
                         * here by 1, as we do at the end of the
                         * function too.
                         *
                         * But that would be incorrect, as proxying is
                         * not forwarding.  The ip6_input function
                         * will handle this packet locally, and it
                         * depends on the hop limit being unchanged.
                         *
                         * One example is the NDP hop limit, that
                         * always has to stay 255, but other would be
                         * similar checks around RA packets, where the
                         * user can even change the desired limit.
                         */
                        return ip6_input(skb);
                } else if (proxied < 0) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                SKB_DR_SET(reason, XFRM_POLICY);
                goto drop;
        }
        dst = skb_dst(skb);

        /* ip6_forward_proxy_check() may have called pskb_may_pull(), which
         * can reallocate the skb head.  Reload the header pointer cached at
         * function entry before the address checks below dereference it.
         */
        hdr = ipv6_hdr(skb);

        /* IPv6 specs say nothing about it, but it is clear that we cannot
           send redirects to source routed frames.
           We don't send redirects to frames decapsulated from IPsec.
         */
        if (IP6CB(skb)->iif == dst->dev->ifindex &&
            opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *        incoming and outgoing devices are the same
                 *        send a redirect.
                 */

                rt = dst_rt6_info(dst);
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

        /* Never report or enforce an MTU below the IPv6 minimum */
        mtu = ip6_dst_mtu_maybe_forward(dst, true);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        /* skb_cow() may have copied the header: reload before writing */
        hdr = ipv6_hdr(skb);

        /* Mangling hops number delayed to point after skb COW */

        hdr->hop_limit--;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
        SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
        kfree_skb_reason(skb, reason);
        return -EINVAL;
}

/*
 * Make fragment @to carry the same per-packet metadata as @from: route,
 * device, protocol, packet type, priority, mark, hash, traffic-control
 * index (CONFIG_NET_SCHED builds), netfilter state, skb extensions and
 * security mark.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        /* Replace whatever route @to holds with a new reference to @from's. */
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));

        to->dev = from->dev;
        to->protocol = from->protocol;
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->mark = from->mark;
#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif

        skb_copy_hash(to, from);

        nf_copy(to, from);
        skb_ext_copy(to, from);
        skb_copy_secmark(to, from);
}

/*
 * Start fragmenting an skb whose payload already sits in a frag_list, by
 * turning @skb itself into the first fragment.
 *
 * @skb:     packet to fragment; its frag_list is detached and handed to @iter
 * @hlen:    number of header bytes, starting at the network header, that
 *           are repeated in front of every fragment
 * @prevhdr: points at the next-header byte that is rewritten to
 *           NEXTHDR_FRAGMENT
 * @nexthdr: value stored in the fragment header's own next-header field
 * @frag_id: fragment identification, stored as passed in
 * @iter:    iterator state filled in for ip6_fraglist_prepare()
 *
 * Returns 0 on success or -ENOMEM when the header copy cannot be allocated.
 * iter->tmp_hdr is allocated here and is not freed by this function.
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
                      u8 nexthdr, __be32 frag_id,
                      struct ip6_fraglist_iter *iter)
{
        unsigned int first_len;
        struct frag_hdr *fh;

        /* BUILD HEADER */
        /* *prevhdr is rewritten before the headers are duplicated;
         * presumably it lies within those hlen bytes, so the saved copy
         * already chains to the fragment header - TODO confirm with callers.
         */
        *prevhdr = NEXTHDR_FRAGMENT;
        iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
        if (!iter->tmp_hdr)
                return -ENOMEM;

        /* Take the fragment chain off the head skb; the iterator owns it now */
        iter->frag = skb_shinfo(skb)->frag_list;
        skb_frag_list_init(skb);

        iter->offset = 0;
        iter->hlen = hlen;
        iter->frag_id = frag_id;
        iter->nexthdr = nexthdr;

        /* Open a gap for the fragment header between the saved headers and
         * the payload: drop the headers, push the fragment header, push room
         * for the headers again and copy them back from the saved copy.
         */
        __skb_pull(skb, hlen);
        fh = __skb_push(skb, sizeof(struct frag_hdr));
        __skb_push(skb, hlen);
        skb_reset_network_header(skb);
        memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

        /* First fragment: offset zero with the more-fragments bit set */
        fh->nexthdr = nexthdr;
        fh->reserved = 0;
        fh->frag_off = htons(IP6_MF);
        fh->identification = frag_id;

        /* With the frag_list detached, the head skb's length shrinks to its
         * own linear and paged data.
         */
        first_len = skb_pagelen(skb);
        skb->data_len = first_len - skb_headlen(skb);
        skb->len = first_len;
        ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

        return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

/*
 * Turn the iterator's current frag_list member (iter->frag) into a complete
 * fragment: prepend the fragment header and the saved headers, set the
 * fragment offset and payload length, and copy the metadata from @skb.
 *
 * @skb:  the fragment preceding iter->frag; its length advances the
 *        running offset and its metadata is copied to the new fragment
 * @iter: state set up by ip6_fraglist_init()
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
                          struct ip6_fraglist_iter *iter)
{
        struct sk_buff *frag = iter->frag;
        unsigned int hlen = iter->hlen;
        struct frag_hdr *fh;

        frag->ip_summed = CHECKSUM_NONE;
        /* The transport header is marked at the current data start, before
         * any header is pushed in front of it.
         */
        skb_reset_transport_header(frag);
        fh = __skb_push(frag, sizeof(struct frag_hdr));
        __skb_push(frag, hlen);
        skb_reset_network_header(frag);
        memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
        /* Advance by the payload carried in @skb, i.e. its length without
         * the repeated headers and the fragment header.
         */
        iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
        fh->nexthdr = iter->nexthdr;
        fh->reserved = 0;
        fh->frag_off = htons(iter->offset);
        /* Every fragment except the last in the chain has more-fragments set */
        if (frag->next)
                fh->frag_off |= htons(IP6_MF);
        fh->identification = iter->frag_id;
        ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
        ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

/*
 * Set up @state for slow-path fragmentation of @skb with ip6_frag_next().
 *
 * @hlen is the number of header bytes copied in front of every fragment,
 * @mtu the largest data block one fragment may carry, @hdr_room and
 * @needed_tailroom the extra head and tail space allocated per fragment.
 * @prevhdr, @nexthdr and @frag_id are stored for building each fragment
 * header.
 */
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
                   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
                   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
        /* Values that go into every fragment header. */
        state->frag_id = frag_id;
        state->nexthdr = nexthdr;
        state->prevhdr = prevhdr;

        /* Sizes used when allocating and filling a fragment. */
        state->mtu = mtu;
        state->hlen = hlen;
        state->hroom = hdr_room;
        state->troom = needed_tailroom;

        /* Copy cursor: reading starts right after the headers, everything
         * behind them is still to be sent, and the first fragment has
         * offset zero.
         */
        state->ptr = hlen;
        state->left = skb->len - hlen;
        state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

/*
 * Build the next fragment of @skb in a newly allocated skb.
 *
 * The fragment carries a copy of the first state->hlen header bytes, a
 * fragment header, and up to state->mtu bytes of data read from @skb at
 * state->ptr.  @state is advanced past the copied data.
 *
 * Returns the new fragment, or ERR_PTR(-ENOMEM) when it cannot be
 * allocated.  @skb itself is not freed here.
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
        u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
        struct sk_buff *frag;
        struct frag_hdr *fh;
        unsigned int len;

        len = state->left;
        /* IF: it doesn't fit, use 'mtu' - the data space left */
        if (len > state->mtu)
                len = state->mtu;
        /* IF: we are not sending up to and including the packet end
           then align the next start on an eight byte boundary */
        if (len < state->left)
                len &= ~7;

        /* Allocate buffer */
        frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
                         state->hroom + state->troom, GFP_ATOMIC);
        if (!frag)
                return ERR_PTR(-ENOMEM);

        /*
         *        Set up data on packet
         */

        ip6_copy_metadata(frag, skb);
        skb_reserve(frag, state->hroom);
        skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
        skb_reset_network_header(frag);
        /* Layout: [copied headers: hlen][fragment header][data: len] */
        fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
        frag->transport_header = (frag->network_header + state->hlen +
                                  sizeof(struct frag_hdr));

        /*
         *        Charge the memory for the fragment to any owner
         *        it might possess
         */
        if (skb->sk)
                skb_set_owner_w(frag, skb->sk);

        /*
         *        Copy the packet header into the new buffer.
         */
        skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

        /* Patch the copy, not the original: the byte at prevhdr's offset
         * from the original network header is set to NEXTHDR_FRAGMENT.
         */
        fragnexthdr_offset = skb_network_header(frag);
        fragnexthdr_offset += prevhdr - skb_network_header(skb);
        *fragnexthdr_offset = NEXTHDR_FRAGMENT;

        /*
         *        Build fragment header.
         */
        fh->nexthdr = state->nexthdr;
        fh->reserved = 0;
        fh->identification = state->frag_id;

        /*
         *        Copy a block of the IP datagram.
         */
        BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
                             len));
        state->left -= len;

        /* More-fragments is set while data remains after this block */
        fh->frag_off = htons(state->offset);
        if (state->left > 0)
                fh->frag_off |= htons(IP6_MF);
        ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

        /* Advance the read position and the fragment offset together */
        state->ptr += len;
        state->offset += len;

        return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

/*
 * ip6_fragment - split an oversized IPv6 packet into fragments and send them
 * @net: network namespace, passed on to @output and used for the MIB counters
 * @sk: socket handed through to @output
 * @skb: packet to fragment
 * @output: callback invoked once for every fragment produced
 *
 * Two strategies are used. If @skb already carries a frag list whose geometry
 * fits (see the checks below), each list member is turned into a fragment in
 * place. Otherwise the slow path obtains a fresh skb per fragment from
 * ip6_frag_next() and the data is copied.
 *
 * @skb is never handed back to the caller: it is either passed to @output,
 * released with consume_skb() after the slow path succeeds, or freed with
 * kfree_skb() on failure.
 *
 * Returns 0 when all fragments were accepted by @output, -EMSGSIZE when the
 * packet must not or cannot be fragmented to fit the MTU, or the error
 * reported by the step that failed.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
        /* Per-socket settings are only consulted when the skb has an owning
         * socket and dev_recursion_level() is zero.
         */
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        /* Saved so every fragment gets the original delivery time. */
        u8 tstamp_type = skb->tstamp_type;
        struct ip6_frag_state state;
        unsigned int mtu, hlen, nexthdr_offset;
        ktime_t tstamp = skb->tstamp;
        int hroom, err = 0;
        __be32 frag_id;
        u8 *prevhdr, nexthdr = 0;

        /* On success the return value is used as the header length (hlen) and
         * prevhdr points at the next-header byte that will be rewritten.
         */
        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;
        /* Remember prevhdr as an offset so it can be recomputed later. */
        nexthdr_offset = prevhdr - skb_network_header(skb);

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        /* A non-zero per-socket frag_size may only lower the MTU. */
        if (np) {
                u32 frag_size = READ_ONCE(np->frag_size);

                if (frag_size && frag_size < mtu)
                        mtu = frag_size;
        }
        /* Need room for the headers, the fragment header and 8 more bytes. */
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        /* From here on mtu is the space left per fragment after the headers. */
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        /* Resolve a pending checksum before the payload is split up. */
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        /* Recompute prevhdr from the saved offset.
         * NOTE(review): presumably needed because skb_checksum_help() may
         * move the header data - confirm.
         */
        prevhdr = skb_network_header(skb) + nexthdr_offset;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct ip6_fraglist_iter iter;
                struct sk_buff *frag2;

                /* The head must fit, carry a multiple of 8 payload bytes, not
                 * be cloned and have headroom for the fragment header.
                 */
                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        /* Move the accounting for this member from the head
                         * skb to the member itself; undone at slow_path_clean
                         * if a later member fails the checks.
                         */
                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
                                        &iter);
                if (err < 0)
                        goto fail;

                /* We prevent @rt from being freed. */
                rcu_read_lock();

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (iter.frag)
                                ip6_fraglist_prepare(skb, &iter);

                        skb_set_delivery_time(skb, tstamp, tstamp_type);
                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        /* Stop on the first error or after the last member. */
                        if (err || !iter.frag)
                                break;

                        skb = ip6_fraglist_next(&iter);
                }

                kfree(iter.tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        rcu_read_unlock();
                        return 0;
                }

                /* Drop the list members that were not sent yet. */
                kfree_skb_list(iter.frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                rcu_read_unlock();
                return err;

slow_path_clean:
                /* Revert the sk/destructor/truesize changes made to the list
                 * members that were visited before the offending one (frag).
                 */
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        /*
         *        Fragment the datagram.
         */

        ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
                      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
                      &state);

        /*
         *        Keep copying data until we run out.
         */

        while (state.left > 0) {
                frag = ip6_frag_next(skb, &state);
                if (IS_ERR(frag)) {
                        err = PTR_ERR(frag);
                        goto fail;
                }

                /*
                 *        Put this fragment into the sending queue.
                 */
                skb_set_delivery_time(frag, tstamp, tstamp_type);
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        /* All data was copied into the new fragments; the original is done. */
        consume_skb(skb);
        return err;

fail_toobig:
        /* Send an ICMPv6 Packet Too Big carrying the current mtu value. */
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

/*
 * Decide whether a cached route key no longer matches a flow address.
 *
 * Returns 0 when the route may still be used for @fl_addr, i.e. when either
 * @rt_key is a /128 key equal to @fl_addr, or @addr_cache is set and equal
 * to @fl_addr. Returns 1 otherwise.
 */
static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        /* Host route whose key is exactly the flow address. */
        if (rt_key->plen == 128 && ipv6_addr_equal(fl_addr, &rt_key->addr))
                return 0;

        /* Remembered address matches the flow address. */
        if (addr_cache && ipv6_addr_equal(fl_addr, addr_cache))
                return 0;

        return 1;
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = dst_rt6_info(dst);
        /* Yes, checking route validity in not connected
         * case is not very simple. Take into account,
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE                --ANK (980726)
         *
         * 1. ip6_rt_check(): If route was host route,
         *    check that cached destination is current.
         *    If it is network route, we still may
         *    check its validity using saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save whole address now,
         *    (because main consumer of this service
         *    is tcp, which has not this problem),
         *    so that the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

/*
 * ip6_dst_lookup_tail - route lookup shared by the ip6_dst_lookup*() helpers
 * @net: network namespace to look up in
 * @sk: socket providing source-address preferences; may be NULL
 * @dst: in/out slot for the resulting dst entry
 * @fl6: flow to route; its saddr is filled in when it was unspecified
 *
 * Both callers in this part of the file pass *@dst == NULL.
 *
 * Returns 0 with *@dst pointing at the route on success. On failure the dst
 * is released, *@dst is set to NULL and the error is returned; -ENETUNREACH
 * additionally bumps IPSTATS_MIB_OUTNOROUTES.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr)) {
                struct fib6_info *from;
                struct rt6_info *rt;

                *dst = ip6_route_output(net, sk, fl6);
                /* Only a successful lookup contributes a route to the
                 * source address selection below.
                 */
                rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

                rcu_read_lock();
                from = rt ? rcu_dereference(rt->from) : NULL;
                err = ip6_route_get_saddr(net, from, &fl6->daddr,
                                          sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
                                          &fl6->saddr);
                rcu_read_unlock();

                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if ((*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        /* No usable dst yet: look it up, now with a source address set. */
        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = dst_rt6_info(*dst);
        rcu_read_lock();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        /* err is only a flag here: set when a neighbour entry exists and its
         * state has none of the NUD_VALID bits. It is not returned unless the
         * replacement lookup below fails.
         */
        err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                /* Sample the flag before dropping the ifaddr reference. */
                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        /* Same flow, but with the destination zeroed. */
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        /* A v4-mapped source is only accepted together with a v4-mapped or
         * unspecified destination.
         */
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *        ip6_dst_lookup - route lookup for a flow
 *        @net: network namespace in which the lookup is done
 *        @sk: socket supplying route information
 *        @dst: where the resulting dst_entry pointer is stored
 *        @fl6: flow to look up
 *
 *        Clears *@dst and then delegates to ip6_dst_lookup_tail().
 *
 *        Returns zero on success, or the error code from the lookup.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        int rc;

        /* Always start the lookup from an empty result slot. */
        *dst = NULL;
        rc = ip6_dst_lookup_tail(net, sk, dst, fl6);

        return rc;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *        ip6_dst_lookup_flow - route lookup for a flow, followed by xfrm lookup
 *        @net: network namespace in which the lookup is done
 *        @sk: socket supplying route information
 *        @fl6: flow to look up
 *        @final_dst: optional final destination used for the ipsec lookup
 *
 *        Runs ip6_dst_lookup_tail() on @fl6. On success, @fl6->daddr is
 *        overwritten with *@final_dst when one is given, and the route is
 *        passed through xfrm_lookup_route().
 *
 *        Returns a valid dst pointer on success, or an ERR_PTR()-encoded
 *        error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *route = NULL;
        int rc = ip6_dst_lookup_tail(net, sk, &route, fl6);

        if (rc != 0)
                return ERR_PTR(rc);

        /* The xfrm lookup works on the final destination, if there is one. */
        if (final_dst != NULL)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(net, route, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *        ip6_sk_dst_lookup_flow - route lookup that tries the socket's cached dst
 *        @sk: socket providing the dst cache and route information
 *        @fl6: flow to look up
 *        @final_dst: optional final destination used for the ipsec lookup
 *        @connected: true if @sk is connected
 *
 *        The socket's cached route is used when it is still valid for @fl6.
 *        Otherwise a fresh lookup is done through ip6_dst_lookup_flow(); for
 *        a connected socket a successful result is also stored back into
 *        the socket's cache.
 *
 *        The socket dst lock is taken while operating on the dst cache, so
 *        this function may only be called from process context.
 *
 *        Returns a valid dst pointer on success, or an ERR_PTR()-encoded
 *        error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool connected)
{
        struct dst_entry *cached;
        struct dst_entry *dst;

        /* Fast path: the cached entry survives both validity checks. */
        cached = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
        dst = ip6_sk_dst_check(sk, cached, fl6);
        if (dst)
                return dst;

        dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
        if (!connected || IS_ERR(dst))
                return dst;

        /* The cache gets its own reference; the caller keeps the original. */
        ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

/*
 * Duplicate an IPv6 option header with kmemdup().
 *
 * Copies (hdrlen + 1) * 8 bytes starting at @src. Returns NULL when @src is
 * NULL or when the allocation fails.
 */
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        if (!src)
                return NULL;

        return kmemdup(src, (src->hdrlen + 1) * 8, gfp);
}

/*
 * Duplicate an IPv6 routing header with kmemdup().
 *
 * Copies (hdrlen + 1) * 8 bytes starting at @src. Returns NULL when @src is
 * NULL or when the allocation fails.
 */
static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        if (!src)
                return NULL;

        return kmemdup(src, (src->hdrlen + 1) * 8, gfp);
}

/*
 * Recompute *@mtu and *@maxfraglen for the next skb being appended.
 *
 * Nothing is changed when the route has DST_XFRM_TUNNEL set. Otherwise the
 * first fragment (@skb == NULL) has rt->dst.header_len reserved out of
 * @orig_mtu, while later fragments treat that header space as data space and
 * use @orig_mtu as is. *@maxfraglen is then derived from the new *@mtu,
 * rounding the part after @fragheaderlen down to a multiple of 8.
 */
static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                return;

        *mtu = skb ? orig_mtu : orig_mtu - rt->dst.header_len;

        *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                      + fragheaderlen - sizeof(struct frag_hdr);
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu, frag_size;
        struct ipv6_txoptions *nopt, *opt = ipc6->opt;

        /* callers pass dst together with a reference, set it first so
         * ip6_cork_release() can put it down even in case of an error.
         */
        cork->base.dst = &rt->dst;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
                if (unlikely(!nopt))
                        return -ENOBUFS;

                nopt->tot_len = sizeof(*opt);
                nopt->opt_flen = opt->opt_flen;
                nopt->opt_nflen = opt->opt_nflen;

                nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
                if (opt->dst0opt && !nopt->dst0opt)
                        return -ENOBUFS;

                nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
                if (opt->dst1opt && !nopt->dst1opt)
                        return -ENOBUFS;

                nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
                if (opt->hopopt && !nopt->hopopt)
                        return -ENOBUFS;

                nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
                if (opt->srcrt && !nopt->srcrt)
                        return -ENOBUFS;

                /* need source address above miyazawa*/
        }
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
        else
                mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
                        READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));

        frag_size = READ_ONCE(np->frag_size);
        if (frag_size && frag_size < mtu)
                mtu = frag_size;

        cork->base.fragsize = mtu;
        cork->base.gso_size = ipc6->gso_size;
        cork->base.tx_flags = 0;
        cork->base.mark = ipc6->sockc.mark;
        sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

        cork->base.length = 0;
        cork->base.transmit_time = ipc6->sockc.transmit_time;

        return 0;
}

static int __ip6_append_data(struct sock *sk,
                             struct sk_buff_head *queue,
                             struct inet_cork_full *cork_full,
                             struct inet6_cork *v6_cork,
                             struct page_frag *pfrag,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, size_t length, int transhdrlen,
                             unsigned int flags, struct ipcm6_cookie *ipc6)
{
        struct sk_buff *skb, *skb_prev = NULL;
        struct inet_cork *cork = &cork_full->base;
        struct flowi6 *fl6 = &cork_full->fl.u.ip6;
        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
        struct ubuf_info *uarg = NULL;
        int exthdrlen = 0;
        int dst_exthdrlen = 0;
        int hh_len;
        int copy;
        int err;
        int offset = 0;
        bool zc = false;
        u32 tskey = 0;
        struct rt6_info *rt = dst_rt6_info(cork->dst);
        bool paged, hold_tskey, extra_uref = false;
        struct ipv6_txoptions *opt = v6_cork->opt;
        int csummode = CHECKSUM_NONE;
        unsigned int maxnonfragsize, headersize;
        unsigned int wmem_alloc_delta = 0;

        skb = skb_peek_tail(queue);
        if (!skb) {
                exthdrlen = opt ? opt->opt_flen : 0;
                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
        }

        paged = !!cork->gso_size;
        mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
        orig_mtu = mtu;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);

        headersize = sizeof(struct ipv6hdr) +
                     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
                     rt->rt6i_nfheader_len;

        if (mtu <= fragheaderlen ||
            ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
                goto emsgsize;

        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
                     sizeof(struct frag_hdr);

        /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
         * the first fragment
         */
        if (headersize + transhdrlen > mtu)
                goto emsgsize;

        if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
            (sk->sk_protocol == IPPROTO_UDP ||
             sk->sk_protocol == IPPROTO_ICMPV6 ||
             sk->sk_protocol == IPPROTO_RAW)) {
                ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
                                sizeof(struct ipv6hdr));
                goto emsgsize;
        }

        if (ip6_sk_ignore_df(sk))
                maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
        else
                maxnonfragsize = mtu;

        if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
                pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
                ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
                return -EMSGSIZE;
        }

        /* CHECKSUM_PARTIAL only with no extension headers and when
         * we are not going to fragment
         */
        if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
            headersize == sizeof(struct ipv6hdr) &&
            length <= mtu - headersize &&
            (!(flags & MSG_MORE) || cork->gso_size) &&
            rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
                csummode = CHECKSUM_PARTIAL;

        if ((flags & MSG_ZEROCOPY) && length) {
                struct msghdr *msg = from;

                if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
                        if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
                                return -EINVAL;

                        /* Leave uarg NULL if can't zerocopy, callers should
                         * be able to handle it.
                         */
                        if ((rt->dst.dev->features & NETIF_F_SG) &&
                            csummode == CHECKSUM_PARTIAL) {
                                paged = true;
                                zc = true;
                                uarg = msg->msg_ubuf;
                        }
                } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
                        uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
                        if (!uarg)
                                return -ENOBUFS;
                        extra_uref = !skb_zcopy(skb);        /* only ref on new uarg */
                        if (rt->dst.dev->features & NETIF_F_SG &&
                            csummode == CHECKSUM_PARTIAL) {
                                paged = true;
                                zc = true;
                        } else {
                                uarg_to_msgzc(uarg)->zerocopy = 0;
                                skb_zcopy_set(skb, uarg, &extra_uref);
                        }
                }
        } else if ((flags & MSG_SPLICE_PAGES) && length) {
                if (inet_test_bit(HDRINCL, sk))
                        return -EPERM;
                if (rt->dst.dev->features & NETIF_F_SG &&
                    getfrag == ip_generic_getfrag)
                        /* We need an empty buffer to attach stuff to */
                        paged = true;
                else
                        flags &= ~MSG_SPLICE_PAGES;
        }

        hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP &&
                     READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID;
        if (hold_tskey)
                tskey = atomic_inc_return(&sk->sk_tskey) - 1;

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        cork->length += length;
        if (!skb)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen, alloc_extra;
                        unsigned int pagedlen;
alloc_new_skb:
                        /* There's no room in the current skb */
                        if (skb)
                                fraggap = skb->len - maxfraglen;
                        else
                                fraggap = 0;
                        /* update mtu and maxfraglen if necessary */
                        if (!skb || !skb_prev)
                                ip6_append_data_mtu(&mtu, &maxfraglen,
                                                    fragheaderlen, skb, rt,
                                                    orig_mtu);

                        skb_prev = skb;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;

                        if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
                        fraglen = datalen + fragheaderlen;
                        pagedlen = 0;

                        alloc_extra = hh_len;
                        alloc_extra += dst_exthdrlen;
                        alloc_extra += rt->dst.trailer_len;

                        /* We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloc_extra += sizeof(struct frag_hdr);

                        if ((flags & MSG_MORE) &&
                            !(rt->dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else if (!paged &&
                                 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
                                  !(rt->dst.dev->features & NETIF_F_SG)))
                                alloclen = fraglen;
                        else {
                                alloclen = fragheaderlen + transhdrlen;
                                pagedlen = datalen - transhdrlen;
                        }
                        alloclen += alloc_extra;

                        if (datalen != length + fraggap) {
                                /*
                                 * this is not the last fragment, the trailer
                                 * space is regarded as data space.
                                 */
                                datalen += rt->dst.trailer_len;
                        }

                        fraglen = datalen + fragheaderlen;

                        copy = datalen - transhdrlen - fraggap - pagedlen;
                        /* [!] NOTE: copy may be negative if pagedlen>0
                         * because then the equation may reduce to -fraggap.
                         */
                        if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
                                err = -EINVAL;
                                goto error;
                        }
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk, alloclen,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
                                    2 * sk->sk_sndbuf)
                                        skb = alloc_skb(alloclen,
                                                        sk->sk_allocation);
                                if (unlikely(!skb))
                                        err = -ENOBUFS;
                        }
                        if (!skb)
                                goto error;
                        /*
                         *        Fill in the control structures
                         */
                        skb->protocol = htons(ETH_P_IPV6);
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation and ipsec header */
                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
                                    dst_exthdrlen);

                        /*
                         *        Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen - pagedlen);
                        skb_set_network_header(skb, exthdrlen);
                        data += fragheaderlen;
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }
                        if (copy > 0 &&
                            getfrag(from, data + transhdrlen, offset,
                                    copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        } else if (flags & MSG_SPLICE_PAGES) {
                                copy = 0;
                        }

                        offset += copy;
                        length -= copy + transhdrlen;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        dst_exthdrlen = 0;

                        /* Only the initial fragment is time stamped */
                        skb_shinfo(skb)->tx_flags = cork->tx_flags;
                        cork->tx_flags = 0;
                        skb_shinfo(skb)->tskey = tskey;
                        tskey = 0;
                        skb_zcopy_set(skb, uarg, &extra_uref);

                        if ((flags & MSG_CONFIRM) && !skb_prev)
                                skb_set_dst_pending_confirm(skb, 1);

                        /*
                         * Put the packet on the pending queue
                         */
                        if (!skb->destructor) {
                                skb->destructor = sock_wfree;
                                skb->sk = sk;
                                wmem_alloc_delta += skb->truesize;
                        }
                        __skb_queue_tail(queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->dst.dev->features&NETIF_F_SG) &&
                    skb_tailroom(skb) >= copy) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else if (flags & MSG_SPLICE_PAGES) {
                        struct msghdr *msg = from;

                        err = -EIO;
                        if (WARN_ON_ONCE(copy > msg->msg_iter.count))
                                goto error;

                        err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
                                                   sk->sk_allocation);
                        if (err < 0)
                                goto error;
                        copy = err;
                        wmem_alloc_delta += copy;
                } else if (!zc) {
                        int i = skb_shinfo(skb)->nr_frags;

                        err = -ENOMEM;
                        if (!sk_page_frag_refill(sk, pfrag))
                                goto error;

                        skb_zcopy_downgrade_managed(skb);
                        if (!skb_can_coalesce(skb, i, pfrag->page,
                                              pfrag->offset)) {
                                err = -EMSGSIZE;
                                if (i == MAX_SKB_FRAGS)
                                        goto error;

                                __skb_fill_page_desc(skb, i, pfrag->page,
                                                     pfrag->offset, 0);
                                skb_shinfo(skb)->nr_frags = ++i;
                                get_page(pfrag->page);
                        }
                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
                        if (getfrag(from,
                                    page_address(pfrag->page) + pfrag->offset,
                                    offset, copy, skb->len, skb) < 0)
                                goto error_efault;

                        pfrag->offset += copy;
                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        wmem_alloc_delta += copy;
                } else {
                        err = skb_zerocopy_iter_dgram(skb, from, copy);
                        if (err < 0)
                                goto error;
                }
                offset += copy;
                length -= copy;
        }

        if (wmem_alloc_delta)
                refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
        return 0;

error_efault:
        err = -EFAULT;
error:
        net_zcopy_put_abort(uarg, extra_uref);
        cork->length -= length;
        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
        refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
        if (hold_tskey)
                atomic_dec(&sk->sk_tskey);
        return err;
}

/*
 * ip6_append_data - append data to the socket's pending write queue.
 *
 * Returns 0 straight away when MSG_PROBE is set in @flags.
 *
 * If sk->sk_write_queue is empty, a reference is taken on @rt's dst, the
 * cork state is prepared through ip6_setup_cork() (a non-zero result is
 * returned unchanged), @fl6 is copied into the cork, and the options'
 * opt_flen (0 when @ipc6 carries no options) is added to both @length and
 * @transhdrlen.  If the queue already holds data, @transhdrlen is forced
 * to 0 instead.
 *
 * The remaining work is delegated to __ip6_append_data(), whose return
 * value is passed through.
 */
int ip6_append_data(struct sock *sk,
                    int getfrag(void *from, char *to, int offset, int len,
                                int odd, struct sk_buff *skb),
                    void *from, size_t length, int transhdrlen,
                    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
                    struct rt6_info *rt, unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);

        if (flags & MSG_PROBE)
                return 0;

        if (!skb_queue_empty(&sk->sk_write_queue)) {
                /* Appending to an existing queue. */
                transhdrlen = 0;
        } else {
                int opt_flen;
                int err;

                /* First chunk: set up the cork before queueing anything. */
                dst_hold(&rt->dst);
                err = ip6_setup_cork(sk, &inet->cork, &np->cork, ipc6, rt);
                if (err)
                        return err;

                inet->cork.fl.u.ip6 = *fl6;

                opt_flen = ipc6->opt ? ipc6->opt->opt_flen : 0;
                length += opt_flen;
                transhdrlen += opt_flen;
        }

        return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
                                 &np->cork, sk_page_frag(sk), getfrag,
                                 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

/*
 * Move the dst pointer held in @cork over to @skb.  The cork's pointer is
 * cleared first and the very same pointer is then attached with
 * skb_dst_set(); no hold/release is performed in this function.
 */
static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
{
        struct dst_entry *stolen;

        stolen = cork->base.dst;
        cork->base.dst = NULL;

        skb_dst_set(skb, stolen);
}

/*
 * Drop everything the cork still holds:
 *  - the tx options in @v6_cork (each sub-option buffer, then the options
 *    structure itself), after which v6_cork->opt is reset to NULL;
 *  - the dst referenced by @cork, if any, after which cork->base.dst is
 *    reset to NULL.
 * Both parts are skipped when the respective pointer is already NULL.
 */
static void ip6_cork_release(struct inet_cork_full *cork,
                             struct inet6_cork *v6_cork)
{
        struct ipv6_txoptions *txopt = v6_cork->opt;

        if (txopt) {
                kfree(txopt->dst0opt);
                kfree(txopt->dst1opt);
                kfree(txopt->hopopt);
                kfree(txopt->srcrt);
                kfree(txopt);
                v6_cork->opt = NULL;
        }

        if (!cork->base.dst)
                return;

        dst_release(cork->base.dst);
        cork->base.dst = NULL;
}

/*
 * __ip6_make_skb - collapse the queued buffers into one skb and prepend
 * the IPv6 header.
 *
 * The first buffer dequeued from @queue becomes the head skb; every further
 * buffer is chained onto the head's frag_list.  Extension headers from
 * @v6_cork->opt are pushed, then the fixed IPv6 header is pushed and filled
 * from the cork state and the cork's flow.  The cork's dst is moved onto
 * the skb and the cork is released.
 *
 * Returns the head skb, or NULL when @queue was empty (in that case the
 * cork is left untouched).
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
                               struct sk_buff_head *queue,
                               struct inet_cork_full *cork,
                               struct inet6_cork *v6_cork)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr *final_dst;
        struct net *net = sock_net(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = v6_cork->opt;
        struct rt6_info *rt = dst_rt6_info(cork->base.dst);
        struct flowi6 *fl6 = &cork->fl.u.ip6;
        unsigned char proto = fl6->flowi6_proto;

        skb = __skb_dequeue(queue);
        if (!skb)
                goto out;
        /* Further buffers are linked in starting at the head's frag_list. */
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
                /* Strip the per-fragment header room from the follower. */
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                /* Account the follower's size on the head skb ... */
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                /* ... and detach the follower from the socket. */
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Allow local fragmentation. */
        skb->ignore_df = ip6_sk_ignore_df(sk);
        __skb_pull(skb, skb_network_header_len(skb));

        /*
         * final_dst starts as the flow's destination; it is passed by
         * address to ipv6_push_nfrag_opts(), which may change it.
         */
        final_dst = &fl6->daddr;
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

        /* Make room for, and fill in, the fixed IPv6 header. */
        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        ip6_flow_hdr(hdr, v6_cork->tclass,
                     ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                        ip6_autoflowlabel(net, sk), fl6));
        hdr->hop_limit = v6_cork->hop_limit;
        hdr->nexthdr = proto;
        hdr->saddr = fl6->saddr;
        hdr->daddr = *final_dst;

        skb->priority = READ_ONCE(sk->sk_priority);
        skb->mark = cork->base.mark;
        /* TCP sockets use the monotonic clock; others use sk->sk_clockid. */
        if (sk_is_tcp(sk))
                skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
        else
                skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);

        /* The skb takes over the cork's dst pointer from here on. */
        ip6_cork_steal_dst(skb, cork);
        IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
        if (proto == IPPROTO_ICMPV6) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
                u8 icmp6_type;

                /*
                 * For raw sockets without FLOWI_FLAG_KNOWN_NH the type is
                 * taken from the flow; otherwise it is read from the ICMPv6
                 * header in the skb.
                 */
                if (sk->sk_socket->type == SOCK_RAW &&
                   !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
                        icmp6_type = fl6->fl6_icmp_type;
                else
                        icmp6_type = icmp6_hdr(skb)->icmp6_type;
                ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
        }

        /* Frees the tx options; the dst was already moved to the skb. */
        ip6_cork_release(cork, v6_cork);
out:
        return skb;
}

/*
 * ip6_send_skb - hand a fully built skb to ip6_local_out().
 *
 * A positive result from the output path is translated with
 * net_xmit_errno(); if an error remains afterwards, OUTDISCARDS is bumped
 * on the route's inet6_dev.  Returns 0 or a negative errno.
 *
 * @rt is only borrowed from the skb's dst.  Once ip6_local_out() has been
 * called the skb may already have been consumed and its dst reference
 * dropped, so rt->rt6i_idev must not be touched afterwards without
 * protection.  The whole sequence therefore runs inside an RCU read-side
 * critical section, which keeps the route from being freed under us.
 */
int ip6_send_skb(struct sk_buff *skb)
{
        struct net *net = sock_net(skb->sk);
        struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
        int err;

        rcu_read_lock();
        err = ip6_local_out(net, skb->sk, skb);
        if (err) {
                if (err > 0)
                        err = net_xmit_errno(err);
                if (err)
                        IP6_INC_STATS(net, rt->rt6i_idev,
                                      IPSTATS_MIB_OUTDISCARDS);
        }
        rcu_read_unlock();

        return err;
}

/*
 * Build the pending frames of @sk into one skb via ip6_finish_skb() and
 * send it with ip6_send_skb().  Returns 0 when ip6_finish_skb() yields no
 * skb, otherwise the result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *pending = ip6_finish_skb(sk);

        return pending ? ip6_send_skb(pending) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

/*
 * Discard every buffer still sitting on @queue, taking them from the tail.
 * For each buffer that has a dst attached, OUTDISCARDS is incremented on
 * that dst's inet6_dev before the buffer is freed.  Finally the cork state
 * is released.
 */
static void __ip6_flush_pending_frames(struct sock *sk,
                                       struct sk_buff_head *queue,
                                       struct inet_cork_full *cork,
                                       struct inet6_cork *v6_cork)
{
        for (;;) {
                struct sk_buff *victim = __skb_dequeue_tail(queue);
                struct dst_entry *dst;

                if (victim == NULL)
                        break;

                dst = skb_dst(victim);
                if (dst)
                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(dst),
                                      IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(victim);
        }

        ip6_cork_release(cork, v6_cork);
}

/*
 * Socket-level wrapper: flush sk->sk_write_queue and release the cork
 * state stored in the socket's inet and inet6 private areas.
 */
void ip6_flush_pending_frames(struct sock *sk)
{
        struct inet_cork_full *cork = &inet_sk(sk)->cork;
        struct inet6_cork *v6_cork = &inet6_sk(sk)->cork;

        __ip6_flush_pending_frames(sk, &sk->sk_write_queue, cork, v6_cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

/*
 * ip6_make_skb - build a complete skb in one go, using a private queue and
 * a caller-supplied cork instead of the socket's write queue.
 *
 * With MSG_PROBE set, the reference on @rt's dst is dropped and NULL is
 * returned.  Otherwise the cork is initialised, ipc6->dontfrag is filled in
 * from the socket's DONTFRAG bit when it is negative, the data is appended
 * to the private queue and the queue is turned into an skb by
 * __ip6_make_skb().
 *
 * Returns the skb, NULL (MSG_PROBE), or an ERR_PTR() carrying the error of
 * ip6_setup_cork() / __ip6_append_data().  On either error the cork state
 * is released; after an append failure the queued buffers are freed too.
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, size_t length, int transhdrlen,
                             struct ipcm6_cookie *ipc6, struct rt6_info *rt,
                             unsigned int flags, struct inet_cork_full *cork)
{
        int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
        struct sk_buff_head queue;
        struct inet6_cork v6_cork;
        int err;

        if (flags & MSG_PROBE) {
                dst_release(&rt->dst);
                return NULL;
        }

        __skb_queue_head_init(&queue);

        /* Start from a clean cork before ip6_setup_cork() fills it in. */
        cork->base.flags = 0;
        cork->base.addr = 0;
        cork->base.opt = NULL;
        v6_cork.opt = NULL;

        err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
        if (err)
                goto err_release_cork;

        if (ipc6->dontfrag < 0)
                ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk);

        err = __ip6_append_data(sk, &queue, cork, &v6_cork,
                                &current->task_frag, getfrag, from,
                                length + exthdrlen, transhdrlen + exthdrlen,
                                flags, ipc6);
        if (err)
                goto err_flush;

        return __ip6_make_skb(sk, &queue, cork, &v6_cork);

err_flush:
        __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
        return ERR_PTR(err);

err_release_cork:
        ip6_cork_release(cork, &v6_cork);
        return ERR_PTR(err);
}













































    1 

    1 








    1 





    1 
    1 






























    1 

    1 

















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_DST_METADATA_H
#define __NET_DST_METADATA_H 1

#include <linux/skbuff.h>
#include <net/ip_tunnels.h>
#include <net/macsec.h>
#include <net/dst.h>

/*
 * Discriminator stored in metadata_dst.type.  It tells which member of the
 * metadata_dst.u union is in use.
 */
enum metadata_type {
        /* u.tun_info (struct ip_tunnel_info) */
        METADATA_IP_TUNNEL,
        /* u.port_info (struct hw_port_info) */
        METADATA_HW_PORT_MUX,
        /* u.macsec_info (struct macsec_info) */
        METADATA_MACSEC,
        /* u.xfrm_info (struct xfrm_md_info) */
        METADATA_XFRM,
};

/*
 * Payload of a METADATA_HW_PORT_MUX metadata dst (metadata_dst.u.port_info).
 * skb_metadata_dst_cmp() compares two of these with memcmp() over the whole
 * structure.
 */
struct hw_port_info {
        /* NOTE(review): presumably the underlying device - confirm with users */
        struct net_device *lower_dev;
        u32 port_id;
};

/*
 * Payload of a METADATA_MACSEC metadata dst (metadata_dst.u.macsec_info).
 * sci_t comes from <net/macsec.h>.
 */
struct macsec_info {
        sci_t sci;
};

/*
 * Payload of a METADATA_XFRM metadata dst (metadata_dst.u.xfrm_info).  The
 * same structure is also read from the data area of an LWTUNNEL_ENCAP_XFRM
 * lwtunnel state, see lwt_xfrm_info() and skb_xfrm_md_info().
 */
struct xfrm_md_info {
        u32 if_id;
        int link;
        struct dst_entry *dst_orig;
};

/*
 * A dst entry that carries per-packet metadata instead of routing state.
 */
struct metadata_dst {
        /*
         * Kept as the first member: skb_metadata_dst() casts the dst_entry
         * pointer returned by skb_dst() directly to struct metadata_dst.
         */
        struct dst_entry                dst;
        /* Selects the active member of @u. */
        enum metadata_type                type;
        union {
                struct ip_tunnel_info        tun_info;
                struct hw_port_info        port_info;
                struct macsec_info        macsec_info;
                struct xfrm_md_info        xfrm_info;
        } u;
};

/*
 * Return the skb's dst as a metadata_dst when the dst exists and has
 * DST_METADATA set in its flags; NULL otherwise.
 */
static inline struct metadata_dst *skb_metadata_dst(const struct sk_buff *skb)
{
        struct metadata_dst *candidate = (struct metadata_dst *) skb_dst(skb);

        if (!candidate)
                return NULL;

        return (candidate->dst.flags & DST_METADATA) ? candidate : NULL;
}

/*
 * Find the IP tunnel info attached to @skb.
 *
 * Two sources are tried in order: a metadata dst of type
 * METADATA_IP_TUNNEL, then an lwtunnel state of type LWTUNNEL_ENCAP_IP or
 * LWTUNNEL_ENCAP_IP6 on the skb's dst.  Returns NULL when neither applies.
 */
static inline struct ip_tunnel_info *
skb_tunnel_info(const struct sk_buff *skb)
{
        struct metadata_dst *md = skb_metadata_dst(skb);
        struct dst_entry *dst;

        if (md && md->type == METADATA_IP_TUNNEL)
                return &md->u.tun_info;

        dst = skb_dst(skb);
        if (!dst || !dst->lwtstate)
                return NULL;

        switch (dst->lwtstate->type) {
        case LWTUNNEL_ENCAP_IP:
        case LWTUNNEL_ENCAP_IP6:
                return lwt_tun_info(dst->lwtstate);
        default:
                return NULL;
        }
}

/* View the data area of an lwtunnel state as a struct xfrm_md_info. */
static inline struct xfrm_md_info *lwt_xfrm_info(struct lwtunnel_state *lwt)
{
        struct xfrm_md_info *info = (struct xfrm_md_info *)lwt->data;

        return info;
}

/*
 * Find the xfrm metadata attached to @skb.
 *
 * A metadata dst of type METADATA_XFRM wins; failing that, an lwtunnel
 * state of type LWTUNNEL_ENCAP_XFRM on the skb's dst is used.  Returns NULL
 * when neither is present.
 */
static inline struct xfrm_md_info *skb_xfrm_md_info(const struct sk_buff *skb)
{
        struct metadata_dst *md = skb_metadata_dst(skb);
        struct dst_entry *dst;

        if (md && md->type == METADATA_XFRM)
                return &md->u.xfrm_info;

        dst = skb_dst(skb);
        if (!dst)
                return NULL;
        if (!dst->lwtstate)
                return NULL;
        if (dst->lwtstate->type != LWTUNNEL_ENCAP_XFRM)
                return NULL;

        return lwt_xfrm_info(dst->lwtstate);
}

static inline bool skb_valid_dst(const struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        return dst && !(dst->flags & DST_METADATA);
}

/*
 * Compare the metadata dsts of two skbs.
 *
 * Returns 0 when neither skb has a dst reference at all.  Returns 1 when
 * only one of them has a dst, when the metadata types differ, or when the
 * type is not one of the known values.  Otherwise returns the memcmp()
 * result over the payload selected by the type; for IP tunnels the
 * comparison also covers options_len bytes past the tunnel info.
 */
static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a,
                                       const struct sk_buff *skb_b)
{
        const struct metadata_dst *lhs, *rhs;

        if ((skb_a->_skb_refdst | skb_b->_skb_refdst) == 0)
                return 0;

        lhs = (const struct metadata_dst *) skb_dst(skb_a);
        rhs = (const struct metadata_dst *) skb_dst(skb_b);

        /* Exactly one side present. */
        if (!lhs != !rhs)
                return 1;
        if (lhs->type != rhs->type)
                return 1;

        switch (lhs->type) {
        case METADATA_IP_TUNNEL:
                return memcmp(&lhs->u.tun_info, &rhs->u.tun_info,
                              sizeof(lhs->u.tun_info) +
                                         lhs->u.tun_info.options_len);
        case METADATA_HW_PORT_MUX:
                return memcmp(&lhs->u.port_info, &rhs->u.port_info,
                              sizeof(lhs->u.port_info));
        case METADATA_MACSEC:
                return memcmp(&lhs->u.macsec_info, &rhs->u.macsec_info,
                              sizeof(lhs->u.macsec_info));
        case METADATA_XFRM:
                return memcmp(&lhs->u.xfrm_info, &rhs->u.xfrm_info,
                              sizeof(lhs->u.xfrm_info));
        default:
                return 1;
        }
}

void metadata_dst_free(struct metadata_dst *);
struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
                                        gfp_t flags);
void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst);
struct metadata_dst __percpu *
metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags);

/*
 * Allocate an IP-tunnel metadata dst with GFP_ATOMIC, passing @md_size to
 * metadata_dst_alloc(), and reset the tunnel info's options_len and mode
 * to 0.  Returns NULL when the allocation fails.
 */
static inline struct metadata_dst *tun_rx_dst(int md_size)
{
        struct metadata_dst *md;

        md = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC);
        if (md) {
                md->u.tun_info.options_len = 0;
                md->u.tun_info.mode = 0;
        }

        return md;
}

/*
 * Replace the skb's IP-tunnel metadata dst with a private copy.
 *
 * Returns ERR_PTR(-EINVAL) when the skb has no metadata dst of type
 * METADATA_IP_TUNNEL, ERR_PTR(-ENOMEM) when the copy cannot be allocated,
 * or the error from dst_cache_init().  On success the old dst is dropped
 * from the skb, the copy is attached in its place and returned.
 */
static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb)
{
        struct metadata_dst *old_md = skb_metadata_dst(skb);
        struct metadata_dst *copy;
        int opts_len;

        if (!old_md)
                return ERR_PTR(-EINVAL);
        if (old_md->type != METADATA_IP_TUNNEL)
                return ERR_PTR(-EINVAL);

        opts_len = old_md->u.tun_info.options_len;
        copy = metadata_dst_alloc(opts_len, METADATA_IP_TUNNEL, GFP_ATOMIC);
        if (!copy)
                return ERR_PTR(-ENOMEM);

        /* Tunnel info plus the options that trail it. */
        memcpy(&copy->u.tun_info, &old_md->u.tun_info,
               sizeof(struct ip_tunnel_info) + opts_len);
#ifdef CONFIG_DST_CACHE
        /*
         * The memcpy() above also copied the original's dst_cache; when
         * one is present, initialise the copy's dst_cache afresh.
         */
        if (copy->u.tun_info.dst_cache.cache) {
                int rc = dst_cache_init(&copy->u.tun_info.dst_cache,
                                        GFP_ATOMIC);

                if (rc) {
                        metadata_dst_free(copy);
                        return ERR_PTR(rc);
                }
        }
#endif

        skb_dst_drop(skb);
        skb_dst_set(skb, &copy->dst);
        return copy;
}

/*
 * Unclone the skb's tunnel metadata dst via tun_dst_unclone() and return
 * the tunnel info of the new copy.  Any error is reported as NULL.
 */
static inline struct ip_tunnel_info *skb_tunnel_info_unclone(struct sk_buff *skb)
{
        struct metadata_dst *md = tun_dst_unclone(skb);

        return IS_ERR(md) ? NULL : &md->u.tun_info;
}

/*
 * Allocate a tunnel metadata dst with tun_rx_dst() and initialise its
 * tunnel key from the given IPv4 addresses, tos, ttl, destination port,
 * tunnel id and flags.  Returns NULL when the allocation fails.
 */
static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr,
                                                    __be32 daddr,
                                                    __u8 tos, __u8 ttl,
                                                    __be16 tp_dst,
                                                    const unsigned long *flags,
                                                    __be64 tunnel_id,
                                                    int md_size)
{
        struct metadata_dst *md = tun_rx_dst(md_size);

        if (md) {
                /*
                 * NOTE(review): the two literal zeros fill the key fields
                 * this helper has no parameter for - confirm their meaning
                 * against ip_tunnel_key_init().
                 */
                ip_tunnel_key_init(&md->u.tun_info.key,
                                   saddr, daddr, tos, ttl,
                                   0, 0, tp_dst, tunnel_id, flags);
        }

        return md;
}

/*
 * Build a tunnel metadata dst from the IPv4 header of @skb: source and
 * destination address, tos and ttl are taken from the header, the
 * destination port is passed as 0.
 */
static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb,
                                                 const unsigned long *flags,
                                                 __be64 tunnel_id,
                                                 int md_size)
{
        const struct iphdr *ip4h = ip_hdr(skb);

        return __ip_tun_set_dst(ip4h->saddr, ip4h->daddr,
                                ip4h->tos, ip4h->ttl,
                                0, flags, tunnel_id, md_size);
}

/*
 * Allocate a tunnel metadata dst with tun_rx_dst() and fill its tunnel
 * info for IPv6: mode is set to IP_TUNNEL_INFO_IPV6, the key receives the
 * flags, tunnel id, addresses, tos, ttl, label and destination port, and
 * the source port is set to 0.  Returns NULL when the allocation fails.
 */
static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr,
                                                      const struct in6_addr *daddr,
                                                      __u8 tos, __u8 ttl,
                                                      __be16 tp_dst,
                                                      __be32 label,
                                                      const unsigned long *flags,
                                                      __be64 tunnel_id,
                                                      int md_size)
{
        struct metadata_dst *md = tun_rx_dst(md_size);
        struct ip_tunnel_info *ti;

        if (md == NULL)
                return NULL;

        ti = &md->u.tun_info;
        ti->mode = IP_TUNNEL_INFO_IPV6;

        ip_tunnel_flags_copy(ti->key.tun_flags, flags);
        ti->key.tun_id = tunnel_id;

        /* Transport ports. */
        ti->key.tp_src = 0;
        ti->key.tp_dst = tp_dst;

        /* Addresses are copied by value. */
        ti->key.u.ipv6.src = *saddr;
        ti->key.u.ipv6.dst = *daddr;

        ti->key.tos = tos;
        ti->key.ttl = ttl;
        ti->key.label = label;

        return md;
}

/*
 * Build a tunnel metadata dst from the IPv6 header of @skb: addresses,
 * DS field, hop limit and flow label are taken from the header, the
 * destination port is passed as 0.
 */
static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb,
                                                   const unsigned long *flags,
                                                   __be64 tunnel_id,
                                                   int md_size)
{
        const struct ipv6hdr *hdr = ipv6_hdr(skb);

        return __ipv6_tun_set_dst(&hdr->saddr, &hdr->daddr,
                                  ipv6_get_dsfield(hdr), hdr->hop_limit,
                                  0, ip6_flowlabel(hdr), flags, tunnel_id,
                                  md_size);
}
#endif /* __NET_DST_METADATA_H */














































































































































































































































































    1 




















    1 












    1 




    1 

























































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
// SPDX-License-Identifier: GPL-2.0
/*
 * fs/sysfs/file.c - sysfs regular (text) file implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
 *
 * Please see Documentation/filesystems/sysfs.rst for more information.
 */

#include <linux/module.h>
#include <linux/kobject.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
#include <linux/mm.h>

#include "sysfs.h"

/*
 * Return the sysfs_ops of the kobject stored in the parent directory of
 * @kn, or NULL when that kobject has no ktype.  The caller must hold an
 * active reference on @kn; for nodes flagged KERNFS_LOCKDEP this is
 * asserted through lockdep.
 */
static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn)
{
        struct kobject *owner = kn->parent->priv;

        if (kn->flags & KERNFS_LOCKDEP)
                lockdep_assert_held(kn);

        if (!owner->ktype)
                return NULL;

        return owner->ktype->sysfs_ops;
}

/*
 * seq_file show callback for regular sysfs files.  Buffering and seeking
 * are left to seq_file; this function only feeds the output of
 * sysfs_ops->show() into the seq_file buffer.
 *
 * Returns -EINVAL (with a one-time warning) when the ops have no ->show,
 * a negative value returned by ->show() unchanged, and 0 otherwise.
 */
static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
{
        struct kernfs_open_file *of = sf->private;
        struct kobject *kobj = of->kn->parent->priv;
        const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
        ssize_t len;
        char *page;

        if (WARN_ON_ONCE(!ops->show))
                return -EINVAL;

        /*
         * ->show() is handed a zeroed buffer of at least PAGE_SIZE bytes.
         * If seq_file has less room than that, commit -1 and return.
         * NOTE(review): a negative commit presumably makes seq_file retry
         * with a larger buffer - confirm in seq_commit().
         */
        len = seq_get_buf(sf, &page);
        if (len < PAGE_SIZE) {
                seq_commit(sf, -1);
                return 0;
        }
        memset(page, 0, PAGE_SIZE);

        len = ops->show(kobj, of->kn->priv, page);
        if (len < 0)
                return len;

        /*
         * A return of PAGE_SIZE or more most likely means the output was
         * truncated or the buffer overflowed: log it and clamp.
         */
        if (len >= (ssize_t)PAGE_SIZE) {
                printk("fill_read_buffer: %pS returned bad count\n",
                                ops->show);
                /* Try to struggle along */
                len = PAGE_SIZE - 1;
        }

        seq_commit(sf, len);
        return 0;
}

/*
 * kernfs read callback for binary sysfs files.
 *
 * A zero-length request returns 0.  When the inode has a non-zero size,
 * reads at or beyond it return 0 and reads crossing it are shortened to
 * end at it.  Returns -EIO when the attribute has no ->read, otherwise the
 * result of battr->read().
 */
static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf,
                                 size_t count, loff_t pos)
{
        struct bin_attribute *attr = of->kn->priv;
        struct kobject *owner = of->kn->parent->priv;
        loff_t limit = file_inode(of->file)->i_size;

        if (count == 0)
                return 0;

        /* A size of 0 means no limit is applied here. */
        if (limit != 0) {
                if (pos >= limit)
                        return 0;
                if (pos + count > limit)
                        count = limit - pos;
        }

        if (attr->read == NULL)
                return -EIO;

        return attr->read(of->file, owner, attr, buf, pos, count);
}

/*
 * kernfs read callback for regular sysfs files that use a pre-allocated
 * buffer.
 *
 * ->show() always fills the buffer from the start; for a non-zero @pos the
 * result is shifted down so that the bytes from @pos onward sit at the
 * beginning of @buf.  Returns a negative ->show() result unchanged, 0 when
 * @pos is at or past the produced length, else min(@count, bytes left).
 */
static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
                             size_t count, loff_t pos)
{
        const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
        struct kobject *owner = of->kn->parent->priv;
        ssize_t produced;

        /*
         * Only the pre-allocated buffer has a known size; any other
         * buffer cannot safely be passed to ->show().
         */
        if (WARN_ON_ONCE(buf != of->prealloc_buf))
                return 0;

        produced = ops->show(owner, of->kn->priv, buf);
        if (produced < 0)
                return produced;

        if (pos) {
                if (produced <= pos)
                        return 0;
                produced -= pos;
                memmove(buf, buf + pos, produced);
        }

        return min_t(ssize_t, count, produced);
}

/*
 * kernfs write callback for regular sysfs files: a zero-length write
 * returns 0, everything else is forwarded to sysfs_ops->store().  @pos is
 * not used.
 */
static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
                              size_t count, loff_t pos)
{
        const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
        struct kobject *owner = of->kn->parent->priv;

        if (count == 0)
                return 0;

        return ops->store(owner, of->kn->priv, buf, count);
}

/*
 * kernfs write callback for binary sysfs files.
 *
 * When the inode has a non-zero size, writes starting at or beyond it fail
 * with -EFBIG and longer writes are shortened to end at it.  A resulting
 * zero-length write returns 0.  Returns -EIO when the attribute has no
 * ->write, otherwise the result of battr->write().
 */
static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf,
                                  size_t count, loff_t pos)
{
        struct bin_attribute *attr = of->kn->priv;
        struct kobject *owner = of->kn->parent->priv;
        loff_t limit = file_inode(of->file)->i_size;

        if (limit != 0) {
                if (limit <= pos)
                        return -EFBIG;
                count = min_t(ssize_t, count, limit - pos);
        }

        if (count == 0)
                return 0;

        if (attr->write == NULL)
                return -EIO;

        return attr->write(of->file, owner, attr, buf, pos, count);
}

/*
 * kernfs mmap callback for bin sysfs files: forwards the request to the
 * attribute's ->mmap() method and returns its result.
 *
 * ->mmap is not checked for NULL here.  Within this file the callback is
 * only installed through sysfs_bin_kfops_mmap, which
 * sysfs_add_bin_file_mode_ns() selects only when battr->mmap is set.
 */
static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
                             struct vm_area_struct *vma)
{
        struct bin_attribute *battr = of->kn->priv;
        struct kobject *kobj = of->kn->parent->priv;

        return battr->mmap(of->file, kobj, battr, vma);
}

/*
 * kernfs llseek callback for bin sysfs files.
 *
 * Uses the attribute's own ->llseek() when it provides one and falls back
 * to generic_file_llseek() otherwise.
 */
static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset,
                                  int whence)
{
        struct bin_attribute *battr = of->kn->priv;
        struct kobject *kobj = of->kn->parent->priv;

        if (!battr->llseek)
                return generic_file_llseek(of->file, offset, whence);

        return battr->llseek(of->file, kobj, battr, offset, whence);
}

/*
 * kernfs open callback for bin sysfs files.
 *
 * If the attribute supplies an ->f_mapping() hook, its return value replaces
 * the opened file's f_mapping.  Always returns 0.
 */
static int sysfs_kf_bin_open(struct kernfs_open_file *of)
{
        struct bin_attribute *battr = of->kn->priv;

        if (!battr->f_mapping)
                return 0;

        of->file->f_mapping = battr->f_mapping();
        return 0;
}

/*
 * sysfs_notify - call kernfs_notify() on a node below @kobj
 * @kobj: object whose sysfs directory is the starting point
 * @dir: optional name of a child of @kobj's directory to descend into
 * @attr: optional name of the entry to notify on
 *
 * Starts at @kobj->sd, descends into @dir if given, then into @attr if
 * given, and calls kernfs_notify() on the node reached.  Nothing happens if
 * @kobj has no sysfs directory or a lookup fails.  Every reference taken
 * along the way is dropped again before returning.
 */
void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr)
{
        struct kernfs_node *node = kobj->sd;

        /*
         * Either way we end up holding a reference on @node (or @node is
         * NULL).  Note kernfs_get() is also reached with a NULL node here,
         * so it is presumably NULL-tolerant.
         */
        if (node && dir)
                node = kernfs_find_and_get(node, dir);
        else
                kernfs_get(node);

        if (node && attr) {
                struct kernfs_node *leaf = kernfs_find_and_get(node, attr);

                /* Swap the reference on the directory for the leaf's. */
                kernfs_put(node);
                node = leaf;
        }

        if (!node)
                return;

        kernfs_notify(node);
        kernfs_put(node);
}
EXPORT_SYMBOL_GPL(sysfs_notify);

/*
 * kernfs_ops tables used by sysfs files.  sysfs_add_file_mode_ns() and
 * sysfs_add_bin_file_mode_ns() pick one of them depending on which methods
 * the attribute (or its sysfs_ops) provides.
 */

/* No callbacks at all: used when neither a read nor a write method exists. */
static const struct kernfs_ops sysfs_file_kfops_empty = {
};

/* Regular attribute, ->show() only. */
static const struct kernfs_ops sysfs_file_kfops_ro = {
        .seq_show        = sysfs_kf_seq_show,
};

/* Regular attribute, ->store() only. */
static const struct kernfs_ops sysfs_file_kfops_wo = {
        .write                = sysfs_kf_write,
};

/* Regular attribute with both ->show() and ->store(). */
static const struct kernfs_ops sysfs_file_kfops_rw = {
        .seq_show        = sysfs_kf_seq_show,
        .write                = sysfs_kf_write,
};

/*
 * SYSFS_PREALLOC variants: same split as above, but with .prealloc set and
 * reads going through sysfs_kf_read() instead of the seq_file path.
 */
static const struct kernfs_ops sysfs_prealloc_kfops_ro = {
        .read                = sysfs_kf_read,
        .prealloc        = true,
};

static const struct kernfs_ops sysfs_prealloc_kfops_wo = {
        .write                = sysfs_kf_write,
        .prealloc        = true,
};

static const struct kernfs_ops sysfs_prealloc_kfops_rw = {
        .read                = sysfs_kf_read,
        .write                = sysfs_kf_write,
        .prealloc        = true,
};

/* Binary attribute, ->read() only. */
static const struct kernfs_ops sysfs_bin_kfops_ro = {
        .read                = sysfs_kf_bin_read,
};

/* Binary attribute, ->write() only. */
static const struct kernfs_ops sysfs_bin_kfops_wo = {
        .write                = sysfs_kf_bin_write,
};

/* Binary attribute with both ->read() and ->write(). */
static const struct kernfs_ops sysfs_bin_kfops_rw = {
        .read                = sysfs_kf_bin_read,
        .write                = sysfs_kf_bin_write,
};

/*
 * Binary attribute that provides ->mmap().  Read and write are installed
 * unconditionally; sysfs_kf_bin_read()/sysfs_kf_bin_write() return -EIO
 * when the attribute lacks the corresponding method.
 */
static const struct kernfs_ops sysfs_bin_kfops_mmap = {
        .read                = sysfs_kf_bin_read,
        .write                = sysfs_kf_bin_write,
        .mmap                = sysfs_kf_bin_mmap,
        .open                = sysfs_kf_bin_open,
        .llseek                = sysfs_kf_bin_llseek,
};

/*
 * sysfs_add_file_mode_ns - create a regular attribute file under @parent
 * @parent: kernfs directory node; its ->priv is the owning kobject
 * @attr: attribute descriptor
 * @mode: file mode; SYSFS_PREALLOC selects the preallocated-buffer ops and
 *        only the 0777 bits are passed on as permissions
 * @uid: owner uid of the new file
 * @gid: owner gid of the new file
 * @ns: namespace tag passed on to kernfs
 *
 * The kernfs_ops table is chosen from which of ->show()/->store() the
 * kobject's sysfs_ops provide.  Returns 0 on success, -EINVAL (with a
 * warning) if the kobject's ktype has no sysfs_ops, or the error from
 * __kernfs_create_file(); -EEXIST additionally triggers sysfs_warn_dup().
 */
int sysfs_add_file_mode_ns(struct kernfs_node *parent,
                const struct attribute *attr, umode_t mode, kuid_t uid,
                kgid_t gid, const void *ns)
{
        struct kobject *kobj = parent->priv;
        const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
        bool prealloc = mode & SYSFS_PREALLOC;
        struct lock_class_key *key = NULL;
        const struct kernfs_ops *ops;
        struct kernfs_node *kn;

        /* every kobject with an attribute needs a ktype assigned */
        if (WARN(!sysfs_ops, KERN_ERR
                        "missing sysfs attribute operations for kobject: %s\n",
                        kobject_name(kobj)))
                return -EINVAL;

        /* Pick by available methods first, then by preallocation. */
        if (sysfs_ops->show && sysfs_ops->store)
                ops = prealloc ? &sysfs_prealloc_kfops_rw :
                                 &sysfs_file_kfops_rw;
        else if (sysfs_ops->show)
                ops = prealloc ? &sysfs_prealloc_kfops_ro :
                                 &sysfs_file_kfops_ro;
        else if (sysfs_ops->store)
                ops = prealloc ? &sysfs_prealloc_kfops_wo :
                                 &sysfs_file_kfops_wo;
        else
                ops = &sysfs_file_kfops_empty;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        if (!attr->ignore_lockdep)
                key = attr->key ?: (struct lock_class_key *)&attr->skey;
#endif

        kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid,
                                  PAGE_SIZE, ops, (void *)attr, ns, key);
        if (!IS_ERR(kn))
                return 0;

        if (PTR_ERR(kn) == -EEXIST)
                sysfs_warn_dup(parent, attr->name);
        return PTR_ERR(kn);
}

/*
 * sysfs_add_bin_file_mode_ns - create a binary attribute file under @parent
 * @parent: kernfs directory node to create the file in
 * @battr: binary attribute descriptor; @battr->size becomes the file size
 * @mode: file mode, only the 0777 bits are passed on
 * @uid: owner uid of the new file
 * @gid: owner gid of the new file
 * @ns: namespace tag passed on to kernfs
 *
 * The kernfs_ops table is chosen from which of ->mmap()/->read()/->write()
 * the attribute provides, with ->mmap() taking precedence.  Returns 0 on
 * success or the error from __kernfs_create_file(); -EEXIST additionally
 * triggers sysfs_warn_dup().
 */
int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent,
                const struct bin_attribute *battr, umode_t mode,
                kuid_t uid, kgid_t gid, const void *ns)
{
        const struct kernfs_ops *ops = &sysfs_file_kfops_empty;
        const struct attribute *attr = &battr->attr;
        struct lock_class_key *key = NULL;
        struct kernfs_node *kn;

        if (battr->mmap)
                ops = &sysfs_bin_kfops_mmap;
        else if (battr->read && battr->write)
                ops = &sysfs_bin_kfops_rw;
        else if (battr->read)
                ops = &sysfs_bin_kfops_ro;
        else if (battr->write)
                ops = &sysfs_bin_kfops_wo;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        if (!attr->ignore_lockdep)
                key = attr->key ?: (struct lock_class_key *)&attr->skey;
#endif

        kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid,
                                  battr->size, ops, (void *)attr, ns, key);
        if (!IS_ERR(kn))
                return 0;

        if (PTR_ERR(kn) == -EEXIST)
                sysfs_warn_dup(parent, attr->name);
        return PTR_ERR(kn);
}

/**
 * sysfs_create_file_ns - create an attribute file for an object with custom ns
 * @kobj: object we're creating for
 * @attr: attribute descriptor
 * @ns: namespace the new file should belong to
 *
 * The file is created in @kobj's sysfs directory with @attr->mode and the
 * ownership reported by kobject_get_ownership().
 *
 * Return: 0 on success, -EINVAL (with a warning) if @kobj, its sysfs
 * directory or @attr is missing, or the error from sysfs_add_file_mode_ns().
 */
int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
                         const void *ns)
{
        kuid_t owner_uid;
        kgid_t owner_gid;

        if (WARN_ON(!kobj || !kobj->sd || !attr))
                return -EINVAL;

        kobject_get_ownership(kobj, &owner_uid, &owner_gid);

        return sysfs_add_file_mode_ns(kobj->sd, attr, attr->mode,
                                      owner_uid, owner_gid, ns);
}
EXPORT_SYMBOL_GPL(sysfs_create_file_ns);

/**
 * sysfs_create_files - create a NULL-terminated list of attribute files
 * @kobj: object we're creating for
 * @ptr: NULL-terminated array of attribute descriptors
 *
 * Creation stops at the first failure.  The files this call created before
 * the failure are removed again, so on error the directory is left as it
 * was found.
 *
 * Return: 0 on success, or the error returned by the sysfs_create_file()
 * call that failed.
 */
int sysfs_create_files(struct kobject *kobj, const struct attribute * const *ptr)
{
        int err = 0;
        int i;

        for (i = 0; ptr[i]; i++) {
                err = sysfs_create_file(kobj, ptr[i]);
                if (err)
                        break;
        }
        if (!err)
                return 0;

        /*
         * ptr[i] is the entry that failed and was never created by us, so
         * unwind only ptr[0..i-1].  Removing ptr[i] as well would, on
         * -EEXIST, delete a same-named file that belongs to someone else.
         */
        while (--i >= 0)
                sysfs_remove_file(kobj, ptr[i]);

        return err;
}
EXPORT_SYMBOL_GPL(sysfs_create_files);

/**
 * sysfs_add_file_to_group - add an attribute file to a pre-existing group.
 * @kobj: object we're acting for.
 * @attr: attribute descriptor.
 * @group: group name, or NULL to add the file directly to @kobj's directory.
 *
 * Return: 0 on success, -ENOENT if the target directory cannot be found, or
 * the error from sysfs_add_file_mode_ns().
 */
int sysfs_add_file_to_group(struct kobject *kobj,
                const struct attribute *attr, const char *group)
{
        struct kernfs_node *dir = kobj->sd;
        kuid_t uid;
        kgid_t gid;
        int ret;

        /* Both branches leave us holding a reference on a non-NULL @dir. */
        if (group)
                dir = kernfs_find_and_get(kobj->sd, group);
        else
                kernfs_get(dir);

        if (!dir)
                return -ENOENT;

        kobject_get_ownership(kobj, &uid, &gid);
        ret = sysfs_add_file_mode_ns(dir, attr, attr->mode, uid, gid, NULL);
        kernfs_put(dir);

        return ret;
}
EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);

/**
 * sysfs_chmod_file - update the modified mode value on an object attribute.
 * @kobj: object we're acting for.
 * @attr: attribute descriptor.
 * @mode: file permissions.
 *
 * Only the S_IALLUGO bits are taken from @mode; all other mode bits of the
 * existing node are preserved.
 *
 * Return: 0 on success, -ENOENT if @attr has no file under @kobj, or the
 * error from kernfs_setattr().
 */
int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
                     umode_t mode)
{
        struct kernfs_node *kn = kernfs_find_and_get(kobj->sd, attr->name);
        struct iattr newattrs;
        int ret;

        if (!kn)
                return -ENOENT;

        newattrs.ia_valid = ATTR_MODE;
        newattrs.ia_mode = (kn->mode & ~S_IALLUGO) | (mode & S_IALLUGO);

        ret = kernfs_setattr(kn, &newattrs);
        kernfs_put(kn);

        return ret;
}
EXPORT_SYMBOL_GPL(sysfs_chmod_file);

/**
 * sysfs_break_active_protection - break "active" protection
 * @kobj: The kernel object @attr is associated with.
 * @attr: The attribute to break the "active" protection for.
 *
 * With sysfs, just like kernfs, deletion of an attribute is postponed until
 * all active .show() and .store() callbacks have finished unless this function
 * is called. Hence this function is useful in methods that implement self
 * deletion.
 */
struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj,
                                                  const struct attribute *attr)
{
        struct kernfs_node *kn;

        kobject_get(kobj);
        kn = kernfs_find_and_get(kobj->sd, attr->name);
        if (kn)
                kernfs_break_active_protection(kn);
        else
                kobject_put(kobj);
        return kn;
}
EXPORT_SYMBOL_GPL(sysfs_break_active_protection);

/**
 * sysfs_unbreak_active_protection - restore "active" protection
 * @kn: Pointer returned by sysfs_break_active_protection().
 *
 * Undo the effects of sysfs_break_active_protection(). This function calls
 * kernfs_put() on @kn, and the attribute it corresponds to may have been
 * removed between the sysfs_break_active_protection() and
 * sysfs_unbreak_active_protection() calls. It is therefore not safe to
 * access @kn after this function has returned.
 */
void sysfs_unbreak_active_protection(struct kernfs_node *kn)
{
        /* Fetch the kobject first: @kn must not be touched after the put. */
        struct kobject *kobj = kn->parent->priv;

        kernfs_unbreak_active_protection(kn);
        kernfs_put(kn);
        /* Drops the reference taken by sysfs_break_active_protection(). */
        kobject_put(kobj);
}
EXPORT_SYMBOL_GPL(sysfs_unbreak_active_protection);

/**
 * sysfs_remove_file_ns - remove an object attribute with a custom ns tag
 * @kobj: object we're acting for
 * @attr: attribute descriptor
 * @ns: namespace tag of the file to remove
 *
 * Removes the entry named @attr->name with tag @ns from @kobj's sysfs
 * directory via kernfs_remove_by_name_ns(). The result of that call is
 * not reported to the caller.
 */
void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
                          const void *ns)
{
        struct kernfs_node *parent = kobj->sd;

        kernfs_remove_by_name_ns(parent, attr->name, ns);
}
EXPORT_SYMBOL_GPL(sysfs_remove_file_ns);

/**
 * sysfs_remove_file_self - remove an object attribute from its own method
 * @kobj: object we're acting for
 * @attr: attribute descriptor
 *
 * See kernfs_remove_self() for details.
 *
 * Return: the result of kernfs_remove_self(), or false (with a one-time
 * warning) if @attr has no file under @kobj.
 */
bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr)
{
        struct kernfs_node *kn = kernfs_find_and_get(kobj->sd, attr->name);
        bool removed;

        if (WARN_ON_ONCE(!kn))
                return false;

        removed = kernfs_remove_self(kn);
        kernfs_put(kn);

        return removed;
}
EXPORT_SYMBOL_GPL(sysfs_remove_file_self);

/**
 * sysfs_remove_files - remove a NULL-terminated list of attribute files
 * @kobj: object we're acting for
 * @ptr: NULL-terminated array of attribute descriptors
 *
 * Calls sysfs_remove_file() for every entry of @ptr, in array order.
 */
void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *ptr)
{
        const struct attribute * const *cur;

        for (cur = ptr; *cur; cur++)
                sysfs_remove_file(kobj, *cur);
}
EXPORT_SYMBOL_GPL(sysfs_remove_files);

/**
 * sysfs_remove_file_from_group - remove an attribute file from a group.
 * @kobj: object we're acting for.
 * @attr: attribute descriptor.
 * @group: group name, or NULL to remove the file from @kobj's own directory.
 *
 * Does nothing if the target directory cannot be found.
 */
void sysfs_remove_file_from_group(struct kobject *kobj,
                const struct attribute *attr, const char *group)
{
        struct kernfs_node *dir = kobj->sd;

        /* Take a reference on the directory we are going to operate on. */
        if (group)
                dir = kernfs_find_and_get(kobj->sd, group);
        else
                kernfs_get(dir);

        if (!dir)
                return;

        kernfs_remove_by_name(dir, attr->name);
        kernfs_put(dir);
}
EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);

/**
 *        sysfs_create_bin_file - create binary file for object.
 *        @kobj:        object.
 *        @attr:        attribute descriptor.
 *
 * The file is created in @kobj's sysfs directory, without a namespace tag,
 * using @attr->attr.mode and the ownership reported by
 * kobject_get_ownership().
 *
 * Return: 0 on success, -EINVAL (with a warning) if @kobj, its sysfs
 * directory or @attr is missing, or the error from
 * sysfs_add_bin_file_mode_ns().
 */
int sysfs_create_bin_file(struct kobject *kobj,
                          const struct bin_attribute *attr)
{
        kuid_t owner_uid;
        kgid_t owner_gid;

        if (WARN_ON(!kobj || !kobj->sd || !attr))
                return -EINVAL;

        kobject_get_ownership(kobj, &owner_uid, &owner_gid);

        return sysfs_add_bin_file_mode_ns(kobj->sd, attr, attr->attr.mode,
                                          owner_uid, owner_gid, NULL);
}
EXPORT_SYMBOL_GPL(sysfs_create_bin_file);

/**
 *        sysfs_remove_bin_file - remove binary file for object.
 *        @kobj:        object.
 *        @attr:        attribute descriptor.
 *
 * Removes the entry named @attr->attr.name from @kobj's sysfs directory
 * via kernfs_remove_by_name(). The result of that call is not reported
 * to the caller.
 */
void sysfs_remove_bin_file(struct kobject *kobj,
                           const struct bin_attribute *attr)
{
        kernfs_remove_by_name(kobj->sd, attr->attr.name);
}
EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);

/*
 * Set the owner of @kn to @kuid/@kgid through kernfs_setattr() and return
 * its result.  Only the uid and gid are marked valid in the request.
 */
static int internal_change_owner(struct kernfs_node *kn, kuid_t kuid,
                                 kgid_t kgid)
{
        struct iattr newattrs = {};

        newattrs.ia_valid = ATTR_UID | ATTR_GID;
        newattrs.ia_uid = kuid;
        newattrs.ia_gid = kgid;

        return kernfs_setattr(kn, &newattrs);
}

/**
 *        sysfs_link_change_owner - change owner of a sysfs symlink.
 *        @kobj:        object of the kernfs_node the symlink is located in.
 *        @targ:        object of the kernfs_node the symlink points to.
 *        @name:        name of the link.
 *        @kuid:        new owner's kuid
 *        @kgid:        new owner's kgid
 *
 * This function looks up the sysfs symlink entry @name under @kobj and changes
 * the ownership to @kuid/@kgid. The symlink is looked up in the namespace of
 * @targ.
 *
 * Returns 0 on success or error code on failure: -EINVAL if @name is NULL,
 * if either object is not in sysfs, or if the entry found is not a symlink
 * pointing at @targ; -ENOENT if no such entry exists.
 */
int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ,
                            const char *name, kuid_t kuid, kgid_t kgid)
{
        struct kernfs_node *link;
        int error = -ENOENT;

        if (!name || !kobj->state_in_sysfs || !targ->state_in_sysfs)
                return -EINVAL;

        link = kernfs_find_and_get_ns(kobj->sd, name, targ->sd->ns);
        if (!link)
                goto out;

        /* Only act on a symlink that really points at @targ. */
        if (kernfs_type(link) == KERNFS_LINK &&
            link->symlink.target_kn->priv == targ)
                error = internal_change_owner(link, kuid, kgid);
        else
                error = -EINVAL;

out:
        kernfs_put(link);
        return error;
}

/**
 *        sysfs_file_change_owner - change owner of a sysfs file.
 *        @kobj:        object.
 *        @name:        name of the file to change.
 *        @kuid:        new owner's kuid
 *        @kgid:        new owner's kgid
 *
 * This function looks up the sysfs entry @name under @kobj and changes the
 * ownership to @kuid/@kgid.
 *
 * Returns 0 on success or error code on failure: -EINVAL if @name is NULL
 * or @kobj is not in sysfs, -ENOENT if no such entry exists.
 */
int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid,
                            kgid_t kgid)
{
        struct kernfs_node *entry;
        int ret;

        if (!name || !kobj->state_in_sysfs)
                return -EINVAL;

        entry = kernfs_find_and_get(kobj->sd, name);
        if (!entry)
                return -ENOENT;

        ret = internal_change_owner(entry, kuid, kgid);
        kernfs_put(entry);

        return ret;
}
EXPORT_SYMBOL_GPL(sysfs_file_change_owner);

/**
 *        sysfs_change_owner - change owner of the given object.
 *        @kobj:        object.
 *        @kuid:        new owner's kuid
 *        @kgid:        new owner's kgid
 *
 * Change the owner of the default directory and of the default groups of
 * @kobj to @kuid/@kgid. Note that sysfs_change_owner mirrors how the sysfs
 * entries for a kobject are added by driver core. In summary,
 * sysfs_change_owner() takes care of the default directory entry for @kobj
 * and the default groups associated with the ktype of @kobj.
 * Additional properties not added by driver core have to be changed by the
 * driver or subsystem which created them. This is similar to how
 * driver/subsystem specific entries are removed.
 *
 * Returns 0 on success or error code on failure.
 */
int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid)
{
        const struct kobj_type *ktype;
        int ret;

        if (!kobj->state_in_sysfs)
                return -EINVAL;

        /* The kobject's own directory comes first. */
        ret = internal_change_owner(kobj->sd, kuid, kgid);
        if (ret)
                return ret;

        ktype = get_ktype(kobj);
        if (!ktype)
                return 0;

        /* Then the default groups associated with the ktype of @kobj. */
        return sysfs_groups_change_owner(kobj, ktype->default_groups,
                                         kuid, kgid);
}
EXPORT_SYMBOL_GPL(sysfs_change_owner);

/**
 *        sysfs_emit - scnprintf equivalent, aware of PAGE_SIZE buffer.
 *        @buf:        start of PAGE_SIZE buffer.
 *        @fmt:        format
 *        @...:        optional arguments to @fmt
 *
 * Formats into @buf with a size limit of PAGE_SIZE.
 *
 * Returns number of characters written to @buf, or 0 (after a warning) if
 * @buf is NULL or has a non-zero offset_in_page().
 */
int sysfs_emit(char *buf, const char *fmt, ...)
{
        va_list args;
        int len;

        /* Reject a buffer that is not the start of a page. */
        if (WARN(!buf || offset_in_page(buf),
                 "invalid sysfs_emit: buf:%p\n", buf))
                return 0;

        va_start(args, fmt);
        len = vscnprintf(buf, PAGE_SIZE, fmt, args);
        va_end(args);

        return len;
}
EXPORT_SYMBOL_GPL(sysfs_emit);

/**
 *        sysfs_emit_at - scnprintf equivalent, aware of PAGE_SIZE buffer.
 *        @buf:        start of PAGE_SIZE buffer.
 *        @at:        offset in @buf to start write in bytes
 *                @at must be >= 0 && < PAGE_SIZE
 *        @fmt:        format
 *        @...:        optional arguments to @fmt
 *
 * Formats into @buf starting at offset @at, limited to the PAGE_SIZE - @at
 * bytes that remain.
 *
 * Returns number of characters written starting at &@buf[@at], or 0 (after
 * a warning) if @buf is NULL, @buf has a non-zero offset_in_page(), or @at
 * is out of range.
 */
int sysfs_emit_at(char *buf, int at, const char *fmt, ...)
{
        va_list args;
        int len;

        /* Reject a buffer that is not the start of a page, or a bad offset. */
        if (WARN(!buf || offset_in_page(buf) || at < 0 || at >= PAGE_SIZE,
                 "invalid sysfs_emit_at: buf:%p at:%d\n", buf, at))
                return 0;

        va_start(args, fmt);
        len = vscnprintf(buf + at, PAGE_SIZE - at, fmt, args);
        va_end(args);

        return len;
}
EXPORT_SYMBOL_GPL(sysfs_emit_at);

/**
 *        sysfs_bin_attr_simple_read - read callback to simply copy from memory.
 *        @file:        attribute file which is being read.
 *        @kobj:        object to which the attribute belongs.
 *        @attr:        attribute descriptor.
 *        @buf:        destination buffer.
 *        @off:        offset in bytes from which to read.
 *        @count:        maximum number of bytes to read.
 *
 * Simple ->read() callback for bin_attributes backed by a buffer in memory.
 * The @private and @size members in struct bin_attribute must be set to the
 * buffer's location and size before the bin_attribute is created in sysfs.
 *
 * Bounds check for @off and @count is done in sysfs_kf_bin_read().
 * Negative value check for @off is done in vfs_setpos() and default_llseek().
 *
 * Returns number of bytes written to @buf, which is always @count.
 */
ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj,
                                   struct bin_attribute *attr, char *buf,
                                   loff_t off, size_t count)
{
        /* No checks here: the caller has already bounded @off and @count. */
        memcpy(buf, attr->private + off, count);
        return count;
}
EXPORT_SYMBOL_GPL(sysfs_bin_attr_simple_read);




























































































    1 


















    1 



















    1 


    1 
    1 







    1 


    1 
    1 

































    1 


























































































    1 































    1 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Generic TIME_WAIT sockets functions
 *
 *                From code originally in TCP
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>


/**
 *        inet_twsk_bind_unhash - unhash a timewait socket from bind hash
 *        @tw: timewait socket
 *        @hashinfo: hashinfo pointer
 *
 *        unhash a timewait socket from bind hash, if hashed.
 *        bind hash lock must be held by caller.
 *        Does nothing if @tw has no bind bucket. Otherwise the socket is
 *        removed from its bind list, both bucket pointers are cleared, the
 *        buckets are handed to their destroy helpers and one reference on
 *        @tw is dropped with __sock_put().
 */
void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
                          struct inet_hashinfo *hashinfo)
{
        /* Sample both buckets up front; the tw fields are cleared below. */
        struct inet_bind2_bucket *tb2 = tw->tw_tb2;
        struct inet_bind_bucket *tb = tw->tw_tb;

        if (!tb)
                return;

        __sk_del_bind_node((struct sock *)tw);
        tw->tw_tb = NULL;
        tw->tw_tb2 = NULL;
        inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
        inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);

        /* Drop the reference that was held on behalf of the bind hash. */
        __sock_put((struct sock *)tw);
}

/*
 * Tear down a timewait socket: remove it from the ehash chain, detach it
 * from the bind hash, decrement the death row's tw_refcount and drop one
 * reference on @tw.
 *
 * Must be called with locally disabled BHs.
 */
static void inet_twsk_kill(struct inet_timewait_sock *tw)
{
        struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
        spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
        struct inet_bind_hashbucket *bhead, *bhead2;

        /* Unlink from the ehash chain under the ehash bucket lock. */
        spin_lock(lock);
        sk_nulls_del_node_init_rcu((struct sock *)tw);
        spin_unlock(lock);

        /* Disassociate with bind bucket. */
        bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
                        hashinfo->bhash_size)];
        bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw,
                                       twsk_net(tw), tw->tw_num);

        /* Lock order: bhead first, then bhead2; released in reverse. */
        spin_lock(&bhead->lock);
        spin_lock(&bhead2->lock);
        inet_twsk_bind_unhash(tw, hashinfo);
        spin_unlock(&bhead2->lock);
        spin_unlock(&bhead->lock);

        refcount_dec(&tw->tw_dr->tw_refcount);
        inet_twsk_put(tw);
}

/*
 * Release a timewait socket: run twsk_destructor() on it, return the memory
 * to the protocol's timewait slab and drop the module reference on the
 * protocol's owner.
 */
void inet_twsk_free(struct inet_timewait_sock *tw)
{
        /* Read the owner now; @tw is gone by the time module_put() runs. */
        struct module *owner = tw->tw_prot->owner;
        twsk_destructor((struct sock *)tw);
        kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
        module_put(owner);
}

/*
 * Drop one reference on @tw and free the timewait socket once the last
 * reference is gone.
 */
void inet_twsk_put(struct inet_timewait_sock *tw)
{
        if (!refcount_dec_and_test(&tw->tw_refcnt))
                return;

        inet_twsk_free(tw);
}
EXPORT_SYMBOL_GPL(inet_twsk_put);

/* Insert @tw at the head of the nulls list @list using the RCU add helper. */
static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
                                   struct hlist_nulls_head *list)
{
        hlist_nulls_add_head_rcu(&tw->tw_node, list);
}

/*
 * Schedule the timewait timer of @tw with timeout @timeo; shorthand for
 * __inet_twsk_schedule() with rearm set to false.
 */
static void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo)
{
        __inet_twsk_schedule(tw, timeo, false);
}

/*
 * Enter the time wait state.
 * Essentially we whip up a timewait bucket, copy the relevant info into it
 * from the SK, and mess with hash chains and list linkage.
 *
 * @tw:       timewait socket to publish
 * @sk:       socket that @tw replaces in the ehash chain
 * @hashinfo: hash tables both sockets live in
 * @timeo:    timeout handed to inet_twsk_schedule()
 *
 * BHs are disabled for the duration of the hash manipulation and re-enabled
 * before returning.
 *
 * The caller must not access @tw anymore after this function returns.
 */
void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw,
                                  struct sock *sk,
                                  struct inet_hashinfo *hashinfo,
                                  int timeo)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
        spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
        struct inet_bind_hashbucket *bhead, *bhead2;

        /* Step 1: Put TW into bind hash. Original socket stays there too.
           Note, that any socket with inet->num != 0 MUST be bound in
           binding cache, even if it is closed.
         */
        bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
                        hashinfo->bhash_size)];
        bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num);

        local_bh_disable();
        /* Lock order: bhead first, then bhead2; released in reverse. */
        spin_lock(&bhead->lock);
        spin_lock(&bhead2->lock);

        /* TW shares the bind buckets of the original socket. */
        tw->tw_tb = icsk->icsk_bind_hash;
        WARN_ON(!icsk->icsk_bind_hash);

        tw->tw_tb2 = icsk->icsk_bind2_hash;
        WARN_ON(!icsk->icsk_bind2_hash);
        sk_add_bind_node((struct sock *)tw, &tw->tw_tb2->owners);

        spin_unlock(&bhead2->lock);
        spin_unlock(&bhead->lock);

        spin_lock(lock);

        /* Step 2: Hash TW into tcp ehash chain */
        inet_twsk_add_node_rcu(tw, &ehead->chain);

        /* Step 3: Remove SK from hash chain */
        if (__sk_nulls_del_node_init_rcu(sk))
                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);


        /* Ensure above writes are committed into memory before updating the
         * refcount.
         * Provides ordering vs later refcount_inc().
         */
        smp_wmb();
        /* tw_refcnt is set to 3 because we have :
         * - one reference for bhash chain.
         * - one reference for ehash chain.
         * - one reference for timer.
         * Also note that after this point, we lost our implicit reference
         * so we are not allowed to use tw anymore.
         */
        refcount_set(&tw->tw_refcnt, 3);

        /* Arm the timer while still holding the ehash lock. */
        inet_twsk_schedule(tw, timeo);

        spin_unlock(lock);
        local_bh_enable();
}
EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule);

/*
 * Timer callback installed on tw_timer by inet_twsk_alloc(): recover the
 * timewait socket that embeds the timer and kill it.
 */
static void tw_timer_handler(struct timer_list *t)
{
        struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer);

        inet_twsk_kill(tw);
}

struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
                                           struct inet_timewait_death_row *dr,
                                           const int state)
{
        struct inet_timewait_sock *tw;

        if (refcount_read(&dr->tw_refcount) - 1 >=
            READ_ONCE(dr->sysctl_max_tw_buckets))
                return NULL;

        tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
                              GFP_ATOMIC);
        if (tw) {
                const struct inet_sock *inet = inet_sk(sk);

                tw->tw_dr            = dr;
                /* Give us an identity. */
                tw->tw_daddr            = inet->inet_daddr;
                tw->tw_rcv_saddr    = inet->inet_rcv_saddr;
                tw->tw_bound_dev_if = sk->sk_bound_dev_if;
                tw->tw_tos            = inet->tos;
                tw->tw_num            = inet->inet_num;
                tw->tw_state            = TCP_TIME_WAIT;
                tw->tw_substate            = state;
                tw->tw_sport            = inet->inet_sport;
                tw->tw_dport            = inet->inet_dport;
                tw->tw_family            = sk->sk_family;
                tw->tw_reuse            = sk->sk_reuse;
                tw->tw_reuseport    = sk->sk_reuseport;
                tw->tw_hash            = sk->sk_hash;
                tw->tw_ipv6only            = 0;
                tw->tw_transparent  = inet_test_bit(TRANSPARENT, sk);
                tw->tw_prot            = sk->sk_prot_creator;
                atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
                twsk_net_set(tw, sock_net(sk));
                timer_setup(&tw->tw_timer, tw_timer_handler, 0);
                /*
                 * Because we use RCU lookups, we should not set tw_refcnt
                 * to a non null value before everything is setup for this
                 * timewait socket.
                 */
                refcount_set(&tw->tw_refcnt, 0);

                __module_get(tw->tw_prot->owner);
        }

        return tw;
}
EXPORT_SYMBOL_GPL(inet_twsk_alloc);

/* These are always called from BH context.  See callers in
 * tcp_input.c to verify this.
 */

/* This is for handling early-kills of TIME_WAIT sockets.
 * Warning: consumes the caller's reference on @tw.
 * Caller should not access tw anymore.
 */
void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)
{
        struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
        spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);

        /* inet_twsk_purge() walks over all sockets, including tw ones,
         * and removes them via inet_twsk_deschedule_put() after a
         * refcount_inc_not_zero().
         *
         * inet_twsk_hashdance_schedule() must (re)init the refcount before
         * arming the timer, i.e. inet_twsk_purge can obtain a reference to
         * a twsk that did not yet schedule the timer.
         *
         * The ehash lock synchronizes these two:
         * After acquiring the lock, the timer is always scheduled (else
         * timer_shutdown returns false), because hashdance_schedule releases
         * the ehash lock only after completing the timer initialization.
         *
         * Without grabbing the ehash lock, we get:
         * 1) cpu x sets twsk refcount to 3
         * 2) cpu y bumps refcount to 4
         * 3) cpu y calls inet_twsk_deschedule_put() and shuts timer down
         * 4) cpu x tries to start timer, but mod_timer is a noop post-shutdown
         * -> timer refcount is never decremented.
         */
        spin_lock(lock);
        /*  Makes sure hashdance_schedule() has completed */
        spin_unlock(lock);

        /* Kill the socket only if the shutdown reports a non-zero result. */
        if (timer_shutdown_sync(&tw->tw_timer))
                inet_twsk_kill(tw);
        /* Drop the reference the caller handed to us. */
        inet_twsk_put(tw);
}
EXPORT_SYMBOL(inet_twsk_deschedule_put);

/* Arm (@rearm == false) or push back (@rearm == true) the timer of the
 * timewait socket @tw so that it expires @timeo jiffies from now.
 *
 * Background on the timeout value callers are expected to pass:
 *
 * timeout := RTO * 3.5
 *
 * 3.5 = 1+2+0.5 to wait for two retransmits.
 *
 * RATIONALE: if a FIN arrived and we entered TIME-WAIT state, our ACK
 * acking that FIN can be lost. If N subsequent retransmitted FINs (or
 * previous segments) are lost, the probability of such an event is
 * p^(N+1), where p is the probability of losing a single packet, and the
 * time to detect the loss is about RTO*(2^N - 1) with exponential
 * backoff. The normal timewait length is calculated so that we wait at
 * least for one retransmitted FIN (maximal RTO is 120sec).
 * [ BTW Linux, following BSD, violates this requirement by waiting
 *   only for 60sec; we should wait at least for 240 secs.
 *   Well, 240 consumes too much of resources 8)
 * ]
 * This interval is not reduced, in order to catch old duplicates and
 * responses to our wandering segments living for two MSLs.
 * However, if we use PAWS to detect old duplicates, we can reduce the
 * interval to the bounds required by RTO, rather than MSL. So, if the
 * peer understands PAWS, we kill the tw bucket after 3.5*RTO (it is
 * important that this number is greater than the TS tick!) and detect
 * old duplicates with the help of PAWS.
 */
void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
{
        const bool short_wait = timeo <= 4 * HZ;

        if (rearm) {
                /* Re-arm path: no statistics, no extra death-row reference. */
                mod_timer_pending(&tw->tw_timer, jiffies + timeo);
                return;
        }

        /* First arming: count it as TIMEWAITKILLED when the timeout is at
         * most 4*HZ jiffies, as TIMEWAITED otherwise.
         */
        __NET_INC_STATS(twsk_net(tw), short_wait ? LINUX_MIB_TIMEWAITKILLED :
                                                   LINUX_MIB_TIMEWAITED);

        /* A non-zero return from mod_timer() here is treated as fatal. */
        BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo));
        refcount_inc(&tw->tw_dr->tw_refcount);
}
EXPORT_SYMBOL_GPL(__inet_twsk_schedule);

/* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns
 *
 * Walks every bucket of @hashinfo->ehash (slots 0..ehash_mask) under RCU.
 * A socket is reaped only when it is in TIME_WAIT or NEW_SYN_RECV state and
 * the reference count of its netns (ns.count) reads as zero. After each
 * reaped socket the walk of the current bucket starts over from its head.
 */
void inet_twsk_purge(struct inet_hashinfo *hashinfo)
{
        struct inet_ehash_bucket *head = &hashinfo->ehash[0];
        unsigned int ehash_mask = hashinfo->ehash_mask;
        struct hlist_nulls_node *node;
        unsigned int slot;
        struct sock *sk;

        for (slot = 0; slot <= ehash_mask; slot++, head++) {
                if (hlist_nulls_empty(&head->chain))
                        continue;

restart_rcu:
                /* Reached without the RCU read lock held: either first entry
                 * into this bucket or right after a socket was removed.
                 */
                cond_resched();
                rcu_read_lock();
restart:
                sk_nulls_for_each_rcu(sk, node, &head->chain) {
                        int state = inet_sk_state_load(sk);

                        /* Skip everything that is neither TIME_WAIT nor
                         * NEW_SYN_RECV.
                         */
                        if ((1 << state) & ~(TCPF_TIME_WAIT |
                                             TCPF_NEW_SYN_RECV))
                                continue;

                        /* Netns still referenced: leave its sockets alone. */
                        if (refcount_read(&sock_net(sk)->ns.count))
                                continue;

                        /* Socket already on its way out (refcount hit zero). */
                        if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
                                continue;

                        /* Same netns check again, now with a reference held.
                         * NOTE(review): presumably guards against the object
                         * having been reused for a live netns between the
                         * first check and the refcount bump - confirm.
                         */
                        if (refcount_read(&sock_net(sk)->ns.count)) {
                                sock_gen_put(sk);
                                goto restart;
                        }

                        /* Leave the RCU section before tearing the socket
                         * down; both removal helpers are called with bottom
                         * halves disabled and consume the reference taken
                         * above.
                         */
                        rcu_read_unlock();
                        local_bh_disable();
                        if (state == TCP_TIME_WAIT) {
                                inet_twsk_deschedule_put(inet_twsk(sk));
                        } else {
                                struct request_sock *req = inet_reqsk(sk);

                                inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
                                                                  req);
                        }
                        local_bh_enable();
                        goto restart_rcu;
                }
                /* If the nulls value we got at the end of this lookup is
                 * not the expected one, we must restart lookup.
                 * We probably met an item that was moved to another chain.
                 */
                if (get_nulls_value(node) != slot)
                        goto restart;
                rcu_read_unlock();
        }
}
EXPORT_SYMBOL_GPL(inet_twsk_purge);





































































































































































    1 
    1 











    1 
    1 



















    1 








































    1 






    1 







    1 
    1 


















    1 


    1 
































    1 

























    1 











    1 



    1 


    1 


    1 






    1 




    1 

    1 













    1 

    1 


























    1 





































    1 
    1 











    1 



    1 


    1 



    1 






    1 








    1 






    1 



    1 





    1 
    1 

    1 





    1 




    1 













































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Connection tracking protocol helper module for SCTP.
 *
 * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com>
 * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net>
 *
 * SCTP is defined in RFC 2960. References to various sections in this code
 * are to this RFC.
 */

#include <linux/types.h>
#include <linux/timer.h>
#include <linux/netfilter.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/sctp.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <net/sctp/checksum.h>

#include <net/netfilter/nf_log.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_timeout.h>

/* Printable name of each conntrack state, indexed by enum sctp_conntrack.
 * Used for the procfs output and for debug messages in this file.
 */
static const char *const sctp_conntrack_names[] = {
        [SCTP_CONNTRACK_NONE]                        = "NONE",
        [SCTP_CONNTRACK_CLOSED]                        = "CLOSED",
        [SCTP_CONNTRACK_COOKIE_WAIT]                = "COOKIE_WAIT",
        [SCTP_CONNTRACK_COOKIE_ECHOED]                = "COOKIE_ECHOED",
        [SCTP_CONNTRACK_ESTABLISHED]                = "ESTABLISHED",
        [SCTP_CONNTRACK_SHUTDOWN_SENT]                = "SHUTDOWN_SENT",
        [SCTP_CONNTRACK_SHUTDOWN_RECD]                = "SHUTDOWN_RECD",
        [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]        = "SHUTDOWN_ACK_SENT",
        [SCTP_CONNTRACK_HEARTBEAT_SENT]                = "HEARTBEAT_SENT",
};

/* Postfix time-unit helpers: written after a number they expand to a
 * multiplication, e.g. "3 SECS" becomes "3 * HZ" (a value in jiffies).
 */
#define SECS  * HZ
#define MINS  * 60 SECS
#define HOURS * 60 MINS
#define DAYS  * 24 HOURS

/* Timeout for each conntrack state, in jiffies, indexed by
 * enum sctp_conntrack. States without an explicit entry
 * (SCTP_CONNTRACK_NONE) are zero-initialised.
 * NOTE(review): presumably these are the defaults copied into the per-netns
 * timeout table at init time - that code is not visible here.
 */
static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
        [SCTP_CONNTRACK_CLOSED]                        = 10 SECS,
        [SCTP_CONNTRACK_COOKIE_WAIT]                = 3 SECS,
        [SCTP_CONNTRACK_COOKIE_ECHOED]                = 3 SECS,
        [SCTP_CONNTRACK_ESTABLISHED]                = 210 SECS,
        [SCTP_CONNTRACK_SHUTDOWN_SENT]                = 3 SECS,
        [SCTP_CONNTRACK_SHUTDOWN_RECD]                = 3 SECS,
        [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]        = 3 SECS,
        [SCTP_CONNTRACK_HEARTBEAT_SENT]                = 30 SECS,
};

/* Bit in ct->proto.sctp.flags: set when a HEARTBEAT arrives whose vtag does
 * not match the one recorded for its direction; cleared again by the
 * HEARTBEAT / HEARTBEAT_ACK handling in nf_conntrack_sctp_packet().
 */
#define        SCTP_FLAG_HEARTBEAT_VTAG_FAILED        1

/* Short aliases for the conntrack states so the transition table stays
 * readable. sIV (SCTP_CONNTRACK_MAX) marks an invalid transition.
 */
#define sNO SCTP_CONNTRACK_NONE
#define        sCL SCTP_CONNTRACK_CLOSED
#define        sCW SCTP_CONNTRACK_COOKIE_WAIT
#define        sCE SCTP_CONNTRACK_COOKIE_ECHOED
#define        sES SCTP_CONNTRACK_ESTABLISHED
#define        sSS SCTP_CONNTRACK_SHUTDOWN_SENT
#define        sSR SCTP_CONNTRACK_SHUTDOWN_RECD
#define        sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
#define        sHS SCTP_CONNTRACK_HEARTBEAT_SENT
#define        sIV SCTP_CONNTRACK_MAX

/*
        These are the descriptions of the states:

NOTE: These state names are tantalizingly similar to the states of an
SCTP endpoint. But the interpretation of the states is a little different,
considering that these are the states of the connection and not of an end
point. Please note the subtleties. -Kiran

NONE              - Nothing so far.
COOKIE WAIT       - We have seen an INIT chunk in the original direction, or also
                    an INIT_ACK chunk in the reply direction.
COOKIE ECHOED     - We have seen a COOKIE_ECHO chunk in the original direction.
ESTABLISHED       - We have seen a COOKIE_ACK in the reply direction.
SHUTDOWN_SENT     - We have seen a SHUTDOWN chunk in the original direction.
SHUTDOWN_RECD     - We have seen a SHUTDOWN chunk in the reply direction.
SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
                    to that of the SHUTDOWN chunk.
CLOSED            - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
                    the SHUTDOWN chunk. Connection is closed.
HEARTBEAT_SENT    - We have seen a HEARTBEAT in a new flow.
*/

/* TODO
 - I have assumed that the first INIT is in the original direction.
 This messes things when an INIT comes in the reply direction in CLOSED
 state.
 - Check the error type in the reply dir before transitioning from
cookie echoed to closed.
 - Sec 5.2.4 of RFC 2960
 - Full Multi Homing support.
*/

/* SCTP conntrack state transitions
 *
 * Indexed as sctp_conntracks[direction][chunk row][current state] and
 * yielding the next state. The first dimension is the packet direction
 * (first sub-table ORIGINAL, second REPLY), the 11 rows follow the chunk
 * order used by sctp_new_state() (init = 0 ... heartbeat_ack = 10), and the
 * columns are the current state in the order given by the header comment of
 * each sub-table. An entry of sIV means the chunk is invalid in that state.
 */
static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
        {
/*        ORIGINAL        */
/*                  sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */
/* init         */ {sCL, sCL, sCW, sCE, sES, sCL, sCL, sSA, sCW},
/* init_ack     */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},
/* abort        */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
/* shutdown     */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL},
/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA},
/* error        */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/
/* cookie_echo  */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */
/* cookie_ack   */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */
/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL},
/* heartbeat    */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
        },
        {
/*        REPLY        */
/*                  sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */
/* init         */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* INIT in sCL Big TODO */
/* init_ack     */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV},
/* abort        */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV},
/* shutdown     */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV},
/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV},
/* error        */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV},
/* cookie_echo  */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */
/* cookie_ack   */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV},
/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV},
/* heartbeat    */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sES},
        }
};

#ifdef CONFIG_NF_CONNTRACK_PROCFS
/* Emit the SCTP-private part of a conntrack entry into @s: the name of the
 * current state followed by a single space.
 */
static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
{
        const char *state_name = sctp_conntrack_names[ct->proto.sctp.state];

        seq_printf(s, "%s ", state_name);
}
#endif

/* Iterate over the chunk headers of an SCTP packet.
 *
 * @skb:     packet being parsed
 * @sch:     cursor; set to each chunk header in turn (via skb_header_pointer)
 * @_sch:    local struct sctp_chunkhdr used as bounce buffer for @sch
 * @offset:  running byte offset of the current chunk inside @skb; starts
 *           right after the SCTP common header at @dataoff
 * @dataoff: offset of the SCTP common header inside @skb
 * @count:   number of chunks visited so far (0 for the first chunk)
 *
 * The loop ends when @offset reaches skb->len or a chunk header cannot be
 * fetched. Each step advances by the chunk length rounded up to a multiple
 * of 4, so a zero length would never advance:
 * do_basic_checks ensures sch->length > 0, do not use before
 */
#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count)        \
for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0;        \
        (offset) < (skb)->len &&                                        \
        ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch)));        \
        (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++)

/* Some validity checks to make sure the chunks are fine
 *
 * Walks all chunks of the packet and rejects it when
 *  - a COOKIE_ACK or COOKIE_ECHO chunk is not the first chunk,
 *  - an INIT, INIT_ACK or SHUTDOWN_COMPLETE chunk shares the packet with
 *    any other chunk,
 *  - a chunk has a zero length field, or
 *  - no chunk header could be read at all.
 *
 * @ct, @state: only used for logging the reason of a rejection
 * @skb:        packet to inspect
 * @dataoff:    offset of the SCTP common header inside @skb
 * @map:        optional bitmap; when non-NULL, bit <chunk type> is set for
 *              every chunk that passed the checks
 *
 * Returns 0 when the packet is acceptable, non-zero otherwise.
 */
static int do_basic_checks(struct nf_conn *ct,
                           const struct sk_buff *skb,
                           unsigned int dataoff,
                           unsigned long *map,
                           const struct nf_hook_state *state)
{
        u_int32_t offset, count;
        struct sctp_chunkhdr _sch, *sch;
        int flag;

        /* Latched once an "only chunk allowed" type has been seen. */
        flag = 0;

        for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
                if (sch->type == SCTP_CID_INIT ||
                    sch->type == SCTP_CID_INIT_ACK ||
                    sch->type == SCTP_CID_SHUTDOWN_COMPLETE)
                        flag = 1;

                /*
                 * Cookie Ack/Echo chunks not the first OR
                 * Init / Init Ack / Shutdown compl chunks not the only chunks
                 * OR zero-length.
                 */
                if (((sch->type == SCTP_CID_COOKIE_ACK ||
                      sch->type == SCTP_CID_COOKIE_ECHO ||
                      flag) &&
                     count != 0) || !sch->length) {
                        /* The length field is big-endian on the wire;
                         * convert it so the log shows the real value.
                         */
                        nf_ct_l4proto_log_invalid(skb, ct, state,
                                                  "%s failed. chunk num %d, type %d, len %d flag %d\n",
                                                  __func__, count, sch->type,
                                                  ntohs(sch->length), flag);
                        return 1;
                }

                if (map)
                        set_bit(sch->type, map);
        }

        /* A packet without a single readable chunk is rejected as well. */
        return count == 0;
}

/* Map an SCTP chunk type to its row in sctp_conntracks[][][].
 * Returns -1 for chunk types that have no row.
 */
static int sctp_chunk_row(int chunk_type)
{
        switch (chunk_type) {
        case SCTP_CID_INIT:
                return 0;
        case SCTP_CID_INIT_ACK:
                return 1;
        case SCTP_CID_ABORT:
                return 2;
        case SCTP_CID_SHUTDOWN:
                return 3;
        case SCTP_CID_SHUTDOWN_ACK:
                return 4;
        case SCTP_CID_ERROR:
                return 5;
        case SCTP_CID_COOKIE_ECHO:
                return 6;
        case SCTP_CID_COOKIE_ACK:
                return 7;
        case SCTP_CID_SHUTDOWN_COMPLETE:
                return 8;
        case SCTP_CID_HEARTBEAT:
                return 9;
        case SCTP_CID_HEARTBEAT_ACK:
                return 10;
        default:
                return -1;
        }
}

/* Compute the conntrack state that follows @cur_state when a chunk of type
 * @chunk_type is seen in direction @dir.
 *
 * Chunk types without a row in the transition table leave the state
 * unchanged. The result may be SCTP_CONNTRACK_MAX (sIV), which marks the
 * transition as invalid.
 */
static int sctp_new_state(enum ip_conntrack_dir dir,
                          enum sctp_conntrack cur_state,
                          int chunk_type)
{
        const int row = sctp_chunk_row(chunk_type);

        if (row < 0) {
                /* Other chunks like DATA or SACK do not change the state */
                pr_debug("Unknown chunk type %d, Will stay in %s\n",
                         chunk_type, sctp_conntrack_names[cur_state]);
                return cur_state;
        }

        return sctp_conntracks[dir][row][cur_state];
}

/* Set up the SCTP private state of a conntrack from the first packet of a
 * flow. No locking: the conntrack is not in circulation yet.
 *
 * Clears ct->proto.sctp, then walks the chunks of @skb. Every chunk must
 * lead to a valid state when applied to SCTP_CONNTRACK_NONE in the original
 * direction. Depending on the chunk type a verification tag is recorded:
 *  - INIT:         the init_tag from the chunk becomes the reply-direction
 *                  vtag (the packet's own vtag must be zero),
 *  - HEARTBEAT:    the packet vtag becomes the original-direction vtag,
 *  - SHUTDOWN_ACK: the packet vtag becomes the reply-direction vtag.
 *
 * Returns true when the conntrack may be kept, false when the packet is
 * invalid for a new flow.
 */
static noinline bool
sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
         const struct sctphdr *sh, unsigned int dataoff)
{
        const struct sctp_chunkhdr *chunk;
        struct sctp_chunkhdr chunk_buf;
        enum sctp_conntrack next;
        u32 off, nr;

        memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
        next = SCTP_CONNTRACK_MAX;

        for_each_sctp_chunk(skb, chunk, chunk_buf, off, dataoff, nr) {
                next = sctp_new_state(IP_CT_DIR_ORIGINAL,
                                      SCTP_CONNTRACK_NONE, chunk->type);

                /* Invalid: delete conntrack */
                if (next == SCTP_CONNTRACK_MAX ||
                    next == SCTP_CONNTRACK_NONE) {
                        pr_debug("nf_conntrack_sctp: invalid new deleting.\n");
                        return false;
                }

                /* Copy the vtag into the state info */
                switch (chunk->type) {
                case SCTP_CID_INIT: {
                        struct sctp_inithdr init_buf, *init;

                        /* Sec 8.5.1 (A) */
                        if (sh->vtag)
                                return false;

                        init = skb_header_pointer(skb,
                                                  off + sizeof(chunk_buf),
                                                  sizeof(init_buf), &init_buf);
                        if (!init)
                                return false;

                        pr_debug("Setting vtag %x for new conn\n",
                                 init->init_tag);

                        ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = init->init_tag;
                        break;
                }
                case SCTP_CID_HEARTBEAT:
                        pr_debug("Setting vtag %x for secondary conntrack\n",
                                 sh->vtag);
                        ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
                        break;
                case SCTP_CID_SHUTDOWN_ACK:
                        /* If it is a shutdown ack OOTB packet, we expect a
                         * return shutdown complete, otherwise an ABORT
                         * Sec 8.4 (5) and (8)
                         */
                        pr_debug("Setting vtag %x for new conn OOTB\n",
                                 sh->vtag);
                        ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
                        break;
                default:
                        break;
                }

                ct->proto.sctp.state = SCTP_CONNTRACK_NONE;
        }

        return true;
}

static bool sctp_error(struct sk_buff *skb,
                       unsigned int dataoff,
                       const struct nf_hook_state *state)
{
        const struct sctphdr *sh;
        const char *logmsg;

        if (skb->len < dataoff + sizeof(struct sctphdr)) {
                logmsg = "nf_ct_sctp: short packet ";
                goto out_invalid;
        }
        if (state->hook == NF_INET_PRE_ROUTING &&
            state->net->ct.sysctl_checksum &&
            skb->ip_summed == CHECKSUM_NONE) {
                if (skb_ensure_writable(skb, dataoff + sizeof(*sh))) {
                        logmsg = "nf_ct_sctp: failed to read header ";
                        goto out_invalid;
                }
                sh = (const struct sctphdr *)(skb->data + dataoff);
                if (sh->checksum != sctp_compute_cksum(skb, dataoff)) {
                        logmsg = "nf_ct_sctp: bad CRC ";
                        goto out_invalid;
                }
                skb->ip_summed = CHECKSUM_UNNECESSARY;
        }
        return false;
out_invalid:
        nf_l4proto_log_invalid(skb, state, IPPROTO_SCTP, "%s", logmsg);
        return true;
}

/* Returns verdict for packet, or -NF_ACCEPT for invalid. */
int nf_conntrack_sctp_packet(struct nf_conn *ct,
                             struct sk_buff *skb,
                             unsigned int dataoff,
                             enum ip_conntrack_info ctinfo,
                             const struct nf_hook_state *state)
{
        enum sctp_conntrack new_state, old_state;
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        const struct sctphdr *sh;
        struct sctphdr _sctph;
        const struct sctp_chunkhdr *sch;
        struct sctp_chunkhdr _sch;
        u_int32_t offset, count;
        unsigned int *timeouts;
        unsigned long map[256 / sizeof(unsigned long)] = { 0 };
        bool ignore = false;

        if (sctp_error(skb, dataoff, state))
                return -NF_ACCEPT;

        sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
        if (sh == NULL)
                goto out;

        if (do_basic_checks(ct, skb, dataoff, map, state) != 0)
                goto out;

        if (!nf_ct_is_confirmed(ct)) {
                /* If an OOTB packet has any of these chunks discard (Sec 8.4) */
                if (test_bit(SCTP_CID_ABORT, map) ||
                    test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) ||
                    test_bit(SCTP_CID_COOKIE_ACK, map))
                        return -NF_ACCEPT;

                if (!sctp_new(ct, skb, sh, dataoff))
                        return -NF_ACCEPT;
        }

        /* Check the verification tag (Sec 8.5) */
        if (!test_bit(SCTP_CID_INIT, map) &&
            !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
            !test_bit(SCTP_CID_COOKIE_ECHO, map) &&
            !test_bit(SCTP_CID_ABORT, map) &&
            !test_bit(SCTP_CID_SHUTDOWN_ACK, map) &&
            !test_bit(SCTP_CID_HEARTBEAT, map) &&
            !test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
            sh->vtag != ct->proto.sctp.vtag[dir]) {
                nf_ct_l4proto_log_invalid(skb, ct, state,
                                          "verification tag check failed %x vs %x for dir %d",
                                          sh->vtag, ct->proto.sctp.vtag[dir], dir);
                goto out;
        }

        old_state = new_state = SCTP_CONNTRACK_NONE;
        spin_lock_bh(&ct->lock);
        for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
                /* Special cases of Verification tag check (Sec 8.5.1) */
                if (sch->type == SCTP_CID_INIT) {
                        /* (A) vtag MUST be zero */
                        if (sh->vtag != 0)
                                goto out_unlock;
                } else if (sch->type == SCTP_CID_ABORT) {
                        /* (B) vtag MUST match own vtag if T flag is unset OR
                         * MUST match peer's vtag if T flag is set
                         */
                        if ((!(sch->flags & SCTP_CHUNK_FLAG_T) &&
                             sh->vtag != ct->proto.sctp.vtag[dir]) ||
                            ((sch->flags & SCTP_CHUNK_FLAG_T) &&
                             sh->vtag != ct->proto.sctp.vtag[!dir]))
                                goto out_unlock;
                } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
                        /* (C) vtag MUST match own vtag if T flag is unset OR
                         * MUST match peer's vtag if T flag is set
                         */
                        if ((!(sch->flags & SCTP_CHUNK_FLAG_T) &&
                             sh->vtag != ct->proto.sctp.vtag[dir]) ||
                            ((sch->flags & SCTP_CHUNK_FLAG_T) &&
                             sh->vtag != ct->proto.sctp.vtag[!dir]))
                                goto out_unlock;
                } else if (sch->type == SCTP_CID_COOKIE_ECHO) {
                        /* (D) vtag must be same as init_vtag as found in INIT_ACK */
                        if (sh->vtag != ct->proto.sctp.vtag[dir])
                                goto out_unlock;
                } else if (sch->type == SCTP_CID_COOKIE_ACK) {
                        ct->proto.sctp.init[dir] = 0;
                        ct->proto.sctp.init[!dir] = 0;
                } else if (sch->type == SCTP_CID_HEARTBEAT) {
                        if (ct->proto.sctp.vtag[dir] == 0) {
                                pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir);
                                ct->proto.sctp.vtag[dir] = sh->vtag;
                        } else if (sh->vtag != ct->proto.sctp.vtag[dir]) {
                                if (test_bit(SCTP_CID_DATA, map) || ignore)
                                        goto out_unlock;

                                ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
                                ct->proto.sctp.last_dir = dir;
                                ignore = true;
                                continue;
                        } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
                                ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
                        }
                } else if (sch->type == SCTP_CID_HEARTBEAT_ACK) {
                        if (ct->proto.sctp.vtag[dir] == 0) {
                                pr_debug("Setting vtag %x for dir %d\n",
                                         sh->vtag, dir);
                                ct->proto.sctp.vtag[dir] = sh->vtag;
                        } else if (sh->vtag != ct->proto.sctp.vtag[dir]) {
                                if (test_bit(SCTP_CID_DATA, map) || ignore)
                                        goto out_unlock;

                                if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 ||
                                    ct->proto.sctp.last_dir == dir)
                                        goto out_unlock;

                                ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
                                ct->proto.sctp.vtag[dir] = sh->vtag;
                                ct->proto.sctp.vtag[!dir] = 0;
                        } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
                                ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
                        }
                }

                old_state = ct->proto.sctp.state;
                new_state = sctp_new_state(dir, old_state, sch->type);

                /* Invalid */
                if (new_state == SCTP_CONNTRACK_MAX) {
                        nf_ct_l4proto_log_invalid(skb, ct, state,
                                                  "Invalid, old_state %d, dir %d, type %d",
                                                  old_state, dir, sch->type);

                        goto out_unlock;
                }

                /* If it is an INIT or an INIT ACK note down the vtag */
                if (sch->type == SCTP_CID_INIT) {
                        struct sctp_inithdr _ih, *ih;

                        ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih);
                        if (!ih)
                                goto out_unlock;

                        if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir])
                                ct->proto.sctp.init[!dir] = 0;
                        ct->proto.sctp.init[dir] = 1;

                        pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir);
                        ct->proto.sctp.vtag[!dir] = ih->init_tag;

                        /* don't renew timeout on init retransmit so
                         * port reuse by client or NAT middlebox cannot
                         * keep entry alive indefinitely (incl. nat info).
                         */
                        if (new_state == SCTP_CONNTRACK_CLOSED &&
                            old_state == SCTP_CONNTRACK_CLOSED &&
                            nf_ct_is_confirmed(ct))
                                ignore = true;
                } else if (sch->type == SCTP_CID_INIT_ACK) {
                        struct sctp_inithdr _ih, *ih;
                        __be32 vtag;

                        ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih);
                        if (!ih)
                                goto out_unlock;

                        vtag = ct->proto.sctp.vtag[!dir];
                        if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag)
                                goto out_unlock;
                        /* collision */
                        if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] &&
                            vtag != ih->init_tag)
                                goto out_unlock;

                        pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir);
                        ct->proto.sctp.vtag[!dir] = ih->init_tag;
                }

                ct->proto.sctp.state = new_state;
                if (old_state != new_state) {
                        nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
                        if (new_state == SCTP_CONNTRACK_ESTABLISHED &&
                            !test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
                                nf_conntrack_event_cache(IPCT_ASSURED, ct);
                }
        }
        spin_unlock_bh(&ct->lock);

        /* allow but do not refresh timeout */
        if (ignore)
                return NF_ACCEPT;

        timeouts = nf_ct_timeout_lookup(ct);
        if (!timeouts)
                timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts;

        nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);

        return NF_ACCEPT;

out_unlock:
        spin_unlock_bh(&ct->lock);
out:
        return -NF_ACCEPT;
}

/* Returns true only when the tracked SCTP state of @ct is one of the three
 * SHUTDOWN_* states; every other state yields false.
 */
static bool sctp_can_early_drop(const struct nf_conn *ct)
{
        bool shutting_down = false;

        switch (ct->proto.sctp.state) {
        case SCTP_CONNTRACK_SHUTDOWN_SENT:
        case SCTP_CONNTRACK_SHUTDOWN_RECD:
        case SCTP_CONNTRACK_SHUTDOWN_ACK_SENT:
                shutting_down = true;
                break;
        default:
                break;
        }

        return shutting_down;
}

#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

/* Dump the SCTP protocol state of @ct into @skb as a CTA_PROTOINFO_SCTP nest.
 *
 * The state byte is always emitted; the two verification tags are emitted
 * only when @destroy is false.  ct->lock is held while the attributes are
 * written.  Returns 0 on success, -1 if any attribute could not be added.
 */
static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
                          struct nf_conn *ct, bool destroy)
{
        struct nlattr *nest;
        int ret = -1;

        spin_lock_bh(&ct->lock);

        nest = nla_nest_start(skb, CTA_PROTOINFO_SCTP);
        if (!nest)
                goto unlock;

        if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state))
                goto unlock;

        if (!destroy) {
                if (nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL,
                                 ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]))
                        goto unlock;
                if (nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY,
                                 ct->proto.sctp.vtag[IP_CT_DIR_REPLY]))
                        goto unlock;
        }

        ret = 0;
unlock:
        spin_unlock_bh(&ct->lock);
        /* The nest is only closed on success, after the lock is dropped. */
        if (!ret)
                nla_nest_end(skb, nest);

        return ret;
}

/* Netlink validation policy for the attributes nested in CTA_PROTOINFO_SCTP:
 * an 8-bit state and one 32-bit verification tag per direction.  Used by
 * nlattr_to_sctp() when parsing the nest.
 */
static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = {
        [CTA_PROTOINFO_SCTP_STATE]            = { .type = NLA_U8 },
        [CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]  = { .type = NLA_U32 },
        [CTA_PROTOINFO_SCTP_VTAG_REPLY]     = { .type = NLA_U32 },
};

/* Aligned size of three netlink attributes: one with a 1-byte payload and
 * two with 4-byte payloads.  This matches the u8 state and the two be32
 * verification tags that sctp_to_nlattr() emits.
 */
#define SCTP_NLATTR_SIZE ( \
                NLA_ALIGN(NLA_HDRLEN + 1) + \
                NLA_ALIGN(NLA_HDRLEN + 4) + \
                NLA_ALIGN(NLA_HDRLEN + 4))

static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct)
{
        struct nlattr *attr = cda[CTA_PROTOINFO_SCTP];
        struct nlattr *tb[CTA_PROTOINFO_SCTP_MAX+1];
        int err;

        /* updates may not contain the internal protocol info, skip parsing */
        if (!attr)
                return 0;

        err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_SCTP_MAX, attr,
                                          sctp_nla_policy, NULL);
        if (err < 0)
                return err;

        if (!tb[CTA_PROTOINFO_SCTP_STATE] ||
            !tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] ||
            !tb[CTA_PROTOINFO_SCTP_VTAG_REPLY])
                return -EINVAL;

        spin_lock_bh(&ct->lock);
        ct->proto.sctp.state = nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]);
        ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] =
                nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]);
        ct->proto.sctp.vtag[IP_CT_DIR_REPLY] =
                nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]);
        spin_unlock_bh(&ct->lock);

        return 0;
}
#endif

#ifdef CONFIG_NF_CONNTRACK_TIMEOUT

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>

/* Fill a per-state timeout array from netlink timeout attributes.
 *
 * @data is the destination array; when it is NULL the per-netns array of
 * @net is updated in place.  Every slot first receives the per-netns value,
 * then each attribute present in @tb overrides the slot with the same index
 * (the attribute value is big-endian seconds and is scaled by HZ).  Slot 0
 * finally mirrors the CLOSED timeout.  Always returns 0.
 */
static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[],
                                      struct net *net, void *data)
{
        struct nf_sctp_net *sn = nf_sctp_pernet(net);
        unsigned int *dst = data;
        int state;
        int attr;

        if (!dst)
                dst = sn->timeouts;

        /* set default SCTP timeouts. */
        for (state = 0; state < SCTP_CONNTRACK_MAX; state++)
                dst[state] = sn->timeouts[state];

        /* there's a 1:1 mapping between attributes and protocol states. */
        for (attr = CTA_TIMEOUT_SCTP_UNSPEC + 1;
             attr <= CTA_TIMEOUT_SCTP_MAX; attr++) {
                if (!tb[attr])
                        continue;
                dst[attr] = ntohl(nla_get_be32(tb[attr])) * HZ;
        }

        dst[CTA_TIMEOUT_SCTP_UNSPEC] = dst[CTA_TIMEOUT_SCTP_CLOSED];

        return 0;
}

/* Emit one big-endian 32-bit attribute per timeout slot, converting each
 * stored value back to seconds by dividing by HZ.  Returns 0 on success or
 * -ENOSPC as soon as an attribute does not fit into @skb.
 */
static int
sctp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
{
        const unsigned int *timeouts = data;
        int attr = CTA_TIMEOUT_SCTP_UNSPEC + 1;

        while (attr <= CTA_TIMEOUT_SCTP_MAX) {
                if (nla_put_be32(skb, attr, htonl(timeouts[attr] / HZ)))
                        return -ENOSPC;
                attr++;
        }

        return 0;
}

/* Netlink validation policy for the per-state timeout attributes: every
 * attribute carries a 32-bit value, which sctp_timeout_nlattr_to_obj() reads
 * as big-endian seconds.
 */
static const struct nla_policy
sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
        [CTA_TIMEOUT_SCTP_CLOSED]                = { .type = NLA_U32 },
        [CTA_TIMEOUT_SCTP_COOKIE_WAIT]                = { .type = NLA_U32 },
        [CTA_TIMEOUT_SCTP_COOKIE_ECHOED]        = { .type = NLA_U32 },
        [CTA_TIMEOUT_SCTP_ESTABLISHED]                = { .type = NLA_U32 },
        [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT]        = { .type = NLA_U32 },
        [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD]        = { .type = NLA_U32 },
        [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT]        = { .type = NLA_U32 },
        [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT]        = { .type = NLA_U32 },
        [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED]        = { .type = NLA_U32 },
};
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */

/* Seed the per-netns SCTP timeout array of @net from the sctp_timeouts
 * defaults, one slot per conntrack state.
 */
void nf_conntrack_sctp_init_net(struct net *net)
{
        struct nf_sctp_net *sn = nf_sctp_pernet(net);
        int state = 0;

        while (state < SCTP_CONNTRACK_MAX) {
                sn->timeouts[state] = sctp_timeouts[state];
                state++;
        }

        /* timeouts[0] is unused, init it so ->timeouts[0] contains
         * 'new' timeout, like udp or icmp.
         */
        sn->timeouts[0] = sctp_timeouts[SCTP_CONNTRACK_CLOSED];
}

/* Layer-4 protocol descriptor for SCTP connection tracking.  It wires the
 * helpers defined in this file into the conntrack core; the procfs, netlink
 * and timeout-policy hooks are only present when the corresponding config
 * option is enabled.
 */
const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = {
        .l4proto                 = IPPROTO_SCTP,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
        .print_conntrack        = sctp_print_conntrack,
#endif
        .can_early_drop                = sctp_can_early_drop,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
        /* Protocol-info attributes (state + vtags) and port-based tuples. */
        .nlattr_size                = SCTP_NLATTR_SIZE,
        .to_nlattr                = sctp_to_nlattr,
        .from_nlattr                = nlattr_to_sctp,
        .tuple_to_nlattr        = nf_ct_port_tuple_to_nlattr,
        .nlattr_tuple_size        = nf_ct_port_nlattr_tuple_size,
        .nlattr_to_tuple        = nf_ct_port_nlattr_to_tuple,
        .nla_policy                = nf_ct_port_nla_policy,
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
        /* Timeout policy object: one unsigned int per conntrack state. */
        .ctnl_timeout                = {
                .nlattr_to_obj        = sctp_timeout_nlattr_to_obj,
                .obj_to_nlattr        = sctp_timeout_obj_to_nlattr,
                .nlattr_max        = CTA_TIMEOUT_SCTP_MAX,
                .obj_size        = sizeof(unsigned int) * SCTP_CONNTRACK_MAX,
                .nla_policy        = sctp_timeout_nla_policy,
        },
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
};














































































































































































































































































































































    3 






    3 


























    3 




    3 










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/audit.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"
#include <linux/slab.h>

/**
 * tomoyo_print_bprm - Print "struct linux_binprm" for auditing.
 *
 * @bprm: Pointer to "struct linux_binprm".
 * @dump: Pointer to "struct tomoyo_page_dump".
 *
 * Builds a string of the form 'argv[]={ "a" "b" } envp[]={ "X=y" }' from the
 * bytes starting at @bprm->p, reading @bprm->argc strings and then
 * @bprm->envc strings.  If a page cannot be dumped, the placeholder
 * "argv[]={ ... } envp[]= { ... }" is returned instead.
 *
 * Returns the string described above, or NULL if the buffer allocation
 * failed.
 *
 * This function uses kzalloc(), so caller must kfree() if this function
 * didn't return NULL.
 */
static char *tomoyo_print_bprm(struct linux_binprm *bprm,
                               struct tomoyo_page_dump *dump)
{
        static const int tomoyo_buffer_len = 4096 * 2;
        char *buffer = kzalloc(tomoyo_buffer_len, GFP_NOFS);
        char *cp;
        char *last_start;
        int len;
        unsigned long pos = bprm->p;
        int offset = pos % PAGE_SIZE;
        int argv_count = bprm->argc;
        int envp_count = bprm->envc;
        bool truncated = false;

        if (!buffer)
                return NULL;
        len = snprintf(buffer, tomoyo_buffer_len - 1, "argv[]={ ");
        cp = buffer + len;
        /* No arguments at all: close argv[] right away and open envp[]. */
        if (!argv_count) {
                memmove(cp, "} envp[]={ ", 11);
                cp += 11;
        }
        /*
         * last_start marks where the string currently being emitted begins.
         * It triggers the opening quote and is the rewind point when the
         * output had to be truncated.
         */
        last_start = cp;
        while (argv_count || envp_count) {
                if (!tomoyo_dump_page(bprm, pos, dump))
                        goto out;
                /* Advance pos to the start of the next page. */
                pos += PAGE_SIZE - offset;
                /* Read. */
                while (offset < PAGE_SIZE) {
                        const char *kaddr = dump->data;
                        const unsigned char c = kaddr[offset++];

                        /* First byte of a new string: emit the opening quote. */
                        if (cp == last_start)
                                *cp++ = '"';
                        if (cp >= buffer + tomoyo_buffer_len - 32) {
                                /* Reserve some room for "..." string. */
                                truncated = true;
                        } else if (c == '\\') {
                                /* A backslash is doubled. */
                                *cp++ = '\\';
                                *cp++ = '\\';
                        } else if (c > ' ' && c < 127) {
                                /* Printable ASCII other than space: copy as is. */
                                *cp++ = c;
                        } else if (!c) {
                                /* End of one string: closing quote + separator. */
                                *cp++ = '"';
                                *cp++ = ' ';
                                last_start = cp;
                        } else {
                                /*
                                 * Everything else (including space) becomes a
                                 * three-digit octal escape.
                                 */
                                *cp++ = '\\';
                                *cp++ = (c >> 6) + '0';
                                *cp++ = ((c >> 3) & 7) + '0';
                                *cp++ = (c & 7) + '0';
                        }
                        /* Only a NUL byte completes a string and is counted. */
                        if (c)
                                continue;
                        if (argv_count) {
                                if (--argv_count == 0) {
                                        /*
                                         * On truncation, drop the partial
                                         * string and mark the omission.
                                         */
                                        if (truncated) {
                                                cp = last_start;
                                                memmove(cp, "... ", 4);
                                                cp += 4;
                                        }
                                        memmove(cp, "} envp[]={ ", 11);
                                        cp += 11;
                                        last_start = cp;
                                        truncated = false;
                                }
                        } else if (envp_count) {
                                if (--envp_count == 0) {
                                        if (truncated) {
                                                cp = last_start;
                                                memmove(cp, "... ", 4);
                                                cp += 4;
                                        }
                                }
                        }
                        if (!argv_count && !envp_count)
                                break;
                }
                /* Subsequent pages are read from their first byte. */
                offset = 0;
        }
        *cp++ = '}';
        *cp = '\0';
        return buffer;
out:
        /* A page could not be dumped: replace everything with a placeholder. */
        snprintf(buffer, tomoyo_buffer_len - 1,
                 "argv[]={ ... } envp[]= { ... }");
        return buffer;
}

/**
 * tomoyo_filetype - Get string representation of file type.
 *
 * @mode: Mode value for stat().
 *
 * Maps the S_IFMT bits of @mode to the matching entry of
 * tomoyo_condition_keyword[]. A type of 0 is treated like a regular file.
 *
 * Returns file type string.
 */
static inline const char *tomoyo_filetype(const umode_t mode)
{
        int keyword;

        switch (mode & S_IFMT) {
        case S_IFREG:
        case 0:
                keyword = TOMOYO_TYPE_IS_FILE;
                break;
        case S_IFDIR:
                keyword = TOMOYO_TYPE_IS_DIRECTORY;
                break;
        case S_IFLNK:
                keyword = TOMOYO_TYPE_IS_SYMLINK;
                break;
        case S_IFIFO:
                keyword = TOMOYO_TYPE_IS_FIFO;
                break;
        case S_IFSOCK:
                keyword = TOMOYO_TYPE_IS_SOCKET;
                break;
        case S_IFBLK:
                keyword = TOMOYO_TYPE_IS_BLOCK_DEV;
                break;
        case S_IFCHR:
                keyword = TOMOYO_TYPE_IS_CHAR_DEV;
                break;
        default:
                return "unknown"; /* This should not happen. */
        }
        return tomoyo_condition_keyword[keyword];
}

/**
 * tomoyo_print_header - Get header line of audit log.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * The header holds a timestamp, the profile/mode/granted fields, the
 * credentials of the current task and, when @r->obj is set, one
 * "pathN={ ... }" or "pathN.parent={ ... }" group per valid stat entry.
 *
 * Returns string representation, or NULL if the allocation failed or the
 * text did not fit into the 4096-byte buffer.
 *
 * This function uses kmalloc(), so caller must kfree() if this function
 * didn't return NULL.
 */
static char *tomoyo_print_header(struct tomoyo_request_info *r)
{
        static const int tomoyo_buffer_len = 4096;
        const pid_t gpid = task_pid_nr(current);
        struct tomoyo_obj_info *obj = r->obj;
        struct tomoyo_time stamp;
        char *buffer;
        int pos;
        u8 i;

        buffer = kmalloc(tomoyo_buffer_len, GFP_NOFS);
        if (!buffer)
                return NULL;

        tomoyo_convert_time(ktime_get_real_seconds(), &stamp);

        pos = snprintf(buffer, tomoyo_buffer_len - 1,
                       "#%04u/%02u/%02u %02u:%02u:%02u# profile=%u mode=%s granted=%s (global-pid=%u) task={ pid=%u ppid=%u uid=%u gid=%u euid=%u egid=%u suid=%u sgid=%u fsuid=%u fsgid=%u }",
                       stamp.year, stamp.month, stamp.day, stamp.hour,
                       stamp.min, stamp.sec, r->profile, tomoyo_mode[r->mode],
                       str_yes_no(r->granted), gpid, tomoyo_sys_getpid(),
                       tomoyo_sys_getppid(),
                       from_kuid(&init_user_ns, current_uid()),
                       from_kgid(&init_user_ns, current_gid()),
                       from_kuid(&init_user_ns, current_euid()),
                       from_kgid(&init_user_ns, current_egid()),
                       from_kuid(&init_user_ns, current_suid()),
                       from_kgid(&init_user_ns, current_sgid()),
                       from_kuid(&init_user_ns, current_fsuid()),
                       from_kgid(&init_user_ns, current_fsgid()));
        if (!obj)
                goto finish;

        /* Fetch the attributes once per object. */
        if (!obj->validate_done) {
                tomoyo_get_attributes(obj);
                obj->validate_done = true;
        }

        for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) {
                const int path_no = (i >> 1) + 1;
                struct tomoyo_mini_stat *stat;
                unsigned int dev;
                umode_t mode;

                if (!obj->stat_valid[i])
                        continue;
                stat = &obj->stat[i];
                dev = stat->dev;
                mode = stat->mode;

                if (i & 1) {
                        /* Odd slots are printed as "pathN.parent". */
                        pos += snprintf(buffer + pos,
                                        tomoyo_buffer_len - 1 - pos,
                                        " path%u.parent={ uid=%u gid=%u ino=%lu perm=0%o }",
                                        path_no,
                                        from_kuid(&init_user_ns, stat->uid),
                                        from_kgid(&init_user_ns, stat->gid),
                                        (unsigned long)stat->ino,
                                        stat->mode & S_IALLUGO);
                } else {
                        /* Even slots are printed as "pathN". */
                        pos += snprintf(buffer + pos,
                                        tomoyo_buffer_len - 1 - pos,
                                        " path%u={ uid=%u gid=%u ino=%lu major=%u minor=%u perm=0%o type=%s",
                                        path_no,
                                        from_kuid(&init_user_ns, stat->uid),
                                        from_kgid(&init_user_ns, stat->gid),
                                        (unsigned long)stat->ino,
                                        MAJOR(dev), MINOR(dev),
                                        mode & S_IALLUGO,
                                        tomoyo_filetype(mode));
                        /* Device nodes also report the device they refer to. */
                        if (S_ISCHR(mode) || S_ISBLK(mode)) {
                                dev = stat->rdev;
                                pos += snprintf(buffer + pos,
                                                tomoyo_buffer_len - 1 - pos,
                                                " dev_major=%u dev_minor=%u",
                                                MAJOR(dev), MINOR(dev));
                        }
                        pos += snprintf(buffer + pos,
                                        tomoyo_buffer_len - 1 - pos, " }");
                }
        }

finish:
        /* A header that did not fit is discarded. */
        if (pos >= tomoyo_buffer_len - 1) {
                kfree(buffer);
                return NULL;
        }
        return buffer;
}

/**
 * tomoyo_init_log - Allocate buffer for audit logs.
 *
 * @r:    Pointer to "struct tomoyo_request_info".
 * @len:  Buffer size needed for @fmt and @args.
 * @fmt:  The printf()'s format string.
 * @args: va_list structure for @fmt.
 *
 * The log consists of the header line, an optional " exec={ ... }" or
 * " symlink.target=..." part, the domain name on its own line and finally
 * the text produced by @fmt and @args.
 *
 * Returns pointer to allocated memory, or NULL if any allocation failed.
 *
 * This function uses kzalloc(), so caller must kfree() if this function
 * didn't return NULL.
 */
char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt,
                      va_list args)
{
        const char *domain = r->domain->domainname->name;
        const char *link_target = NULL;
        const char *hdr;
        char *exe_path = NULL;
        char *argv_envp = NULL;
        char *log = NULL;
        int pos;

        hdr = tomoyo_print_header(r);
        if (!hdr)
                return NULL;

        /* +10 is for '\n' etc. and '\0'. */
        len += strlen(domain) + strlen(hdr) + 10;

        if (r->ee) {
                struct linux_binprm *bprm = r->ee->bprm;

                exe_path = tomoyo_realpath_from_path(&bprm->file->f_path);
                argv_envp = tomoyo_print_bprm(bprm, &r->ee->dump);
                if (!exe_path || !argv_envp)
                        goto release;
                /* +80 is for " exec={ realpath=\"%s\" argc=%d envc=%d %s }" */
                len += strlen(exe_path) + 80 + strlen(argv_envp);
        } else if (r->obj && r->obj->symlink_target) {
                link_target = r->obj->symlink_target->name;
                /* +18 is for " symlink.target=\"%s\"" */
                len += 18 + strlen(link_target);
        }

        len = kmalloc_size_roundup(len);
        log = kzalloc(len, GFP_NOFS);
        if (!log)
                goto release;

        /* All writes below are bounded to one byte less than was allocated. */
        len--;
        pos = snprintf(log, len, "%s", hdr);
        if (exe_path) {
                struct linux_binprm *bprm = r->ee->bprm;

                pos += snprintf(log + pos, len - pos,
                                " exec={ realpath=\"%s\" argc=%d envc=%d %s }",
                                exe_path, bprm->argc, bprm->envc, argv_envp);
        } else if (link_target) {
                pos += snprintf(log + pos, len - pos,
                                " symlink.target=\"%s\"", link_target);
        }
        pos += snprintf(log + pos, len - pos, "\n%s\n", domain);
        vsnprintf(log + pos, len - pos, fmt, args);

release:
        /* The intermediate strings are no longer needed on any path. */
        kfree(exe_path);
        kfree(argv_envp);
        kfree(hdr);
        return log;
}

/*
 * Wait queue for /sys/kernel/security/tomoyo/audit.
 * Woken by tomoyo_write_log2() after an entry is queued; tomoyo_poll_log()
 * registers pollers on it.
 */
static DECLARE_WAIT_QUEUE_HEAD(tomoyo_log_wait);

/* Structure for audit log. */
struct tomoyo_log {
        /* Link in the "tomoyo_log" list. */
        struct list_head list;
        /* Formatted log text; handed over to the reader's read_buf. */
        char *log;
        /* Bytes accounted against the TOMOYO_MEMORY_AUDIT quota. */
        int size;
};

/* The list for "struct tomoyo_log". Entries are appended at the tail. */
static LIST_HEAD(tomoyo_log);

/* Lock for "struct list_head tomoyo_log". */
static DEFINE_SPINLOCK(tomoyo_log_lock);

/* Length of "struct list_head tomoyo_log". */
static unsigned int tomoyo_log_count;

/**
 * tomoyo_get_audit - Get audit mode.
 *
 * @ns:          Pointer to "struct tomoyo_policy_namespace".
 * @profile:     Profile number.
 * @index:       Index number of functionality.
 * @matched_acl: Pointer to "struct tomoyo_acl_info".
 * @is_granted:  True if granted log, false otherwise.
 *
 * Nothing is audited before the policy is loaded or once the number of
 * queued logs reached the profile's limit. For granted requests an explicit
 * grant_log setting on the matched ACL's condition takes precedence;
 * otherwise the per-functionality, per-category and default configuration
 * are consulted in that order.
 *
 * Returns true if this request should be audited, false otherwise.
 */
static bool tomoyo_get_audit(const struct tomoyo_policy_namespace *ns,
                             const u8 profile, const u8 index,
                             const struct tomoyo_acl_info *matched_acl,
                             const bool is_granted)
{
        const u8 category = tomoyo_index2category[index] +
                TOMOYO_MAX_MAC_INDEX;
        struct tomoyo_profile *p;
        u8 mode;

        if (!tomoyo_policy_loaded)
                return false;

        p = tomoyo_profile(ns, profile);
        if (tomoyo_log_count >= p->pref[TOMOYO_PREF_MAX_AUDIT_LOG])
                return false;

        if (is_granted && matched_acl && matched_acl->cond) {
                if (matched_acl->cond->grant_log != TOMOYO_GRANTLOG_AUTO)
                        return matched_acl->cond->grant_log ==
                                TOMOYO_GRANTLOG_YES;
        }

        /* Fall back from the specific setting to the broader ones. */
        mode = p->config[index];
        if (mode == TOMOYO_CONFIG_USE_DEFAULT) {
                mode = p->config[category];
                if (mode == TOMOYO_CONFIG_USE_DEFAULT)
                        mode = p->default_config;
        }

        return mode & (is_granted ? TOMOYO_CONFIG_WANT_GRANT_LOG :
                                    TOMOYO_CONFIG_WANT_REJECT_LOG);
}

/**
 * tomoyo_write_log2 - Write an audit log.
 *
 * @r:    Pointer to "struct tomoyo_request_info".
 * @len:  Buffer size needed for @fmt and @args.
 * @fmt:  The printf()'s format string.
 * @args: va_list structure for @fmt.
 *
 * Formats the log, queues it at the tail of the audit list and wakes up
 * waiters. The log is silently dropped when auditing is not wanted, when an
 * allocation fails, or when queuing it would reach the audit memory quota.
 *
 * Returns nothing.
 */
void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt,
                       va_list args)
{
        struct tomoyo_log *entry;
        bool accepted;
        char *buf;

        if (!tomoyo_get_audit(r->domain->ns, r->profile, r->type,
                              r->matched_acl, r->granted))
                return;

        buf = tomoyo_init_log(r, len, fmt, args);
        if (!buf)
                return;

        entry = kzalloc(sizeof(*entry), GFP_NOFS);
        if (!entry) {
                kfree(buf);
                return;
        }
        entry->log = buf;
        len = kmalloc_size_roundup(strlen(buf) + 1);
        /*
         * The entry->size is used for memory quota checks.
         * Don't go beyond strlen(entry->log).
         */
        entry->size = len + kmalloc_size_roundup(sizeof(*entry));

        spin_lock(&tomoyo_log_lock);
        /* A quota of 0 means "no limit". */
        accepted = !tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT] ||
                tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] + entry->size <
                tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT];
        if (accepted) {
                tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] += entry->size;
                list_add_tail(&entry->list, &tomoyo_log);
                tomoyo_log_count++;
        }
        spin_unlock(&tomoyo_log_lock);

        if (!accepted) {
                kfree(buf);
                kfree(entry);
                return;
        }
        wake_up(&tomoyo_log_wait);
}

/**
 * tomoyo_write_log - Write an audit log.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @fmt: The printf()'s format string, followed by parameters.
 *
 * Measures the formatted length first, then passes the arguments on to
 * tomoyo_write_log2().
 *
 * Returns nothing.
 */
void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...)
{
        va_list args;
        va_list sizing;
        int len;

        va_start(args, fmt);

        /* Sizing pass on a copy so that @args stays untouched. */
        va_copy(sizing, args);
        len = vsnprintf(NULL, 0, fmt, sizing) + 1;
        va_end(sizing);

        tomoyo_write_log2(r, len, fmt, args);
        va_end(args);
}

/**
 * tomoyo_read_log - Read an audit log.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Does nothing while @head still has pending output. Otherwise the previous
 * read buffer is freed and, if a log is queued, the oldest one is dequeued
 * and its text becomes the new read buffer.
 *
 * Returns nothing.
 */
void tomoyo_read_log(struct tomoyo_io_buffer *head)
{
        struct tomoyo_log *entry;

        if (head->r.w_pos)
                return;

        kfree(head->read_buf);
        head->read_buf = NULL;

        spin_lock(&tomoyo_log_lock);
        if (list_empty(&tomoyo_log)) {
                spin_unlock(&tomoyo_log_lock);
                return;
        }
        entry = list_entry(tomoyo_log.next, typeof(*entry), list);
        list_del(&entry->list);
        tomoyo_log_count--;
        tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] -= entry->size;
        spin_unlock(&tomoyo_log_lock);

        /* The text now belongs to @head; only the list node is freed. */
        head->read_buf = entry->log;
        head->r.w[head->r.w_pos++] = head->read_buf;
        kfree(entry);
}

/**
 * tomoyo_poll_log - Wait for an audit log.
 *
 * @file: Pointer to "struct file".
 * @wait: Pointer to "poll_table". Maybe NULL.
 *
 * The log count is checked both before and after registering on the wait
 * queue.
 *
 * Returns EPOLLIN | EPOLLRDNORM when ready to read an audit log.
 */
__poll_t tomoyo_poll_log(struct file *file, poll_table *wait)
{
        if (!tomoyo_log_count) {
                poll_wait(file, &tomoyo_log_wait, wait);
                if (!tomoyo_log_count)
                        return 0;
        }
        return EPOLLIN | EPOLLRDNORM;
}














































































































































































































































































































































































































































































































































































































































































































































































































   36 


























    6 
   34 

























    3 










    4 


























   14 

















   15 



   15 




































































































































































































    1 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Read-Copy Update mechanism for mutual exclusion
 *
 * Copyright IBM Corporation, 2001
 *
 * Author: Dipankar Sarma <dipankar@in.ibm.com>
 *
 * Based on the original work by Paul McKenney <paulmck@vnet.ibm.com>
 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
 * Papers:
 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *                http://lse.sourceforge.net/locking/rcupdate.html
 *
 */

#ifndef __LINUX_RCUPDATE_H
#define __LINUX_RCUPDATE_H

#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/atomic.h>
#include <linux/irqflags.h>
#include <linux/preempt.h>
#include <linux/bottom_half.h>
#include <linux/lockdep.h>
#include <linux/cleanup.h>
#include <asm/processor.h>
#include <linux/cpumask.h>
#include <linux/context_tracking_irq.h>

/*
 * Wraparound-tolerant ordering tests: the difference (a) - (b) is compared
 * against half of the unsigned long range (ULONG_MAX / 2).
 */
#define ULONG_CMP_GE(a, b)        ((a) - (b) <= ULONG_MAX / 2)
#define ULONG_CMP_LT(a, b)        ((a) - (b) > ULONG_MAX / 2)

/* Exported common interfaces */
void call_rcu(struct rcu_head *head, rcu_callback_t func);
void rcu_barrier_tasks(void);
void rcu_barrier_tasks_rude(void);
void synchronize_rcu(void);

struct rcu_gp_oldstate;
unsigned long get_completed_synchronize_rcu(void);
void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);

// Maximum number of unsigned long values corresponding to
// not-yet-completed RCU grace periods.
#define NUM_ACTIVE_RCU_POLL_OLDSTATE 2

/**
 * same_state_synchronize_rcu - Are two old-state values identical?
 * @oldstate1: First old-state value.
 * @oldstate2: Second old-state value.
 *
 * The two old-state values must have been obtained from either
 * get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or
 * get_completed_synchronize_rcu().  Returns @true if the two values are
 * identical and @false otherwise.  This allows structures whose lifetimes
 * are tracked by old-state values to push these values to a list header,
 * allowing those structures to be slightly smaller.
 */
static inline bool same_state_synchronize_rcu(unsigned long oldstate1, unsigned long oldstate2)
{
        /* The two values denote the same state exactly when no bit differs. */
        return !(oldstate1 ^ oldstate2);
}

#ifdef CONFIG_PREEMPT_RCU

void __rcu_read_lock(void);
void __rcu_read_unlock(void);

/*
 * Defined as a macro as it is a very low level header included from
 * areas that don't even know about current.  This gives the rcu_read_lock()
 * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other
 * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
 */
#define rcu_preempt_depth() READ_ONCE(current->rcu_read_lock_nesting)

#else /* #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_TINY_RCU
#define rcu_read_unlock_strict() do { } while (0)
#else
void rcu_read_unlock_strict(void);
#endif

/*
 * !CONFIG_PREEMPT_RCU variant: entering a read-side critical section
 * consists solely of disabling preemption.
 */
static inline void __rcu_read_lock(void)
{
        preempt_disable();
}

/*
 * !CONFIG_PREEMPT_RCU variant: leaving a read-side critical section
 * re-enables preemption and, only on kernels built with
 * CONFIG_RCU_STRICT_GRACE_PERIOD, then calls rcu_read_unlock_strict().
 */
static inline void __rcu_read_unlock(void)
{
        preempt_enable();
        if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
                return;
        rcu_read_unlock_strict();
}

/*
 * !CONFIG_PREEMPT_RCU variant: the rcu_read_lock() nesting depth is not
 * knowable in this configuration, so 0 is always reported.
 */
static inline int rcu_preempt_depth(void)
{
        return 0;
}

#endif /* #else #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_RCU_LAZY
void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func);
#else
/* Without CONFIG_RCU_LAZY, call_rcu_hurry() simply forwards to call_rcu(). */
static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
{
        call_rcu(head, func);
}
#endif

/* Internal to kernel */
void rcu_init(void);
extern int rcu_scheduler_active;
void rcu_sched_clock_irq(int user);

#ifdef CONFIG_TASKS_RCU_GENERIC
void rcu_init_tasks_generic(void);
#else
/* No-op stub used when CONFIG_TASKS_RCU_GENERIC is not set. */
static inline void rcu_init_tasks_generic(void) { }
#endif

#ifdef CONFIG_RCU_STALL_COMMON
void rcu_sysrq_start(void);
void rcu_sysrq_end(void);
#else /* #ifdef CONFIG_RCU_STALL_COMMON */
/* No-op stubs used when CONFIG_RCU_STALL_COMMON is not set. */
static inline void rcu_sysrq_start(void) { }
static inline void rcu_sysrq_end(void) { }
#endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */

#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
void rcu_irq_work_resched(void);
#else
/* No-op stub for configurations that do not satisfy the #if condition above. */
static inline void rcu_irq_work_resched(void) { }
#endif

#ifdef CONFIG_RCU_NOCB_CPU
void rcu_init_nohz(void);
int rcu_nocb_cpu_offload(int cpu);
int rcu_nocb_cpu_deoffload(int cpu);
void rcu_nocb_flush_deferred_wakeup(void);
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
/*
 * Stubs used when CONFIG_RCU_NOCB_CPU is not set: offloading is rejected
 * with -EINVAL, de-offloading returns 0, and the other two do nothing.
 */
static inline void rcu_init_nohz(void) { }
static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; }
static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; }
static inline void rcu_nocb_flush_deferred_wakeup(void) { }
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */

/*
 * Note a quasi-voluntary context switch for RCU-tasks's benefit.
 * This is a macro rather than an inline function to avoid #include hell.
 */
#ifdef CONFIG_TASKS_RCU_GENERIC

# ifdef CONFIG_TASKS_RCU
/*
 * Clear (t)->rcu_tasks_holdout, but only when @preempt is false and the
 * flag is currently set; otherwise nothing is written.
 */
# define rcu_tasks_classic_qs(t, preempt)                                \
        do {                                                                \
                if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout))        \
                        WRITE_ONCE((t)->rcu_tasks_holdout, false);        \
        } while (0)
void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func);
void synchronize_rcu_tasks(void);
# else
# define rcu_tasks_classic_qs(t, preempt) do { } while (0)
# define call_rcu_tasks call_rcu
# define synchronize_rcu_tasks synchronize_rcu
# endif

# ifdef CONFIG_TASKS_TRACE_RCU
// Bits for ->trc_reader_special.b.need_qs field.
#define TRC_NEED_QS                0x1  // Task needs a quiescent state.
#define TRC_NEED_QS_CHECKED        0x2  // Task has been checked for needing quiescent state.

u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new);
void rcu_tasks_trace_qs_blkd(struct task_struct *t);

/*
 * Snapshot (t)->trc_reader_nesting once, then:
 *  - if ->need_qs equals TRC_NEED_QS and the nesting snapshot is zero, call
 *    rcu_trc_cmpxchg_need_qs() to request the TRC_NEED_QS ->
 *    TRC_NEED_QS_CHECKED transition;
 *  - otherwise, if the snapshot is nonzero and not INT_MIN and ->blocked is
 *    not yet set, call rcu_tasks_trace_qs_blkd().
 */
# define rcu_tasks_trace_qs(t)                                                        \
        do {                                                                        \
                int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting);        \
                                                                                \
                if (unlikely(READ_ONCE((t)->trc_reader_special.b.need_qs) == TRC_NEED_QS) &&        \
                    likely(!___rttq_nesting)) {                                        \
                        rcu_trc_cmpxchg_need_qs((t), TRC_NEED_QS, TRC_NEED_QS_CHECKED);        \
                } else if (___rttq_nesting && ___rttq_nesting != INT_MIN &&        \
                           !READ_ONCE((t)->trc_reader_special.b.blocked)) {        \
                        rcu_tasks_trace_qs_blkd(t);                                \
                }                                                                \
        } while (0)
# else
# define rcu_tasks_trace_qs(t) do { } while (0)
# endif

/*
 * Run the classic Tasks RCU quiescent-state hook followed by the Tasks
 * Trace hook for task @t; only the classic hook looks at @preempt.
 */
#define rcu_tasks_qs(t, preempt)                                        \
do {                                                                        \
        rcu_tasks_classic_qs((t), (preempt));                                \
        rcu_tasks_trace_qs(t);                                                \
} while (0)

# ifdef CONFIG_TASKS_RUDE_RCU
void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func);
void synchronize_rcu_tasks_rude(void);
# endif

#define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false)
void exit_tasks_rcu_start(void);
void exit_tasks_rcu_stop(void);
void exit_tasks_rcu_finish(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
#define rcu_tasks_classic_qs(t, preempt) do { } while (0)
#define rcu_tasks_qs(t, preempt) do { } while (0)
#define rcu_note_voluntary_context_switch(t) do { } while (0)
#define call_rcu_tasks call_rcu
#define synchronize_rcu_tasks synchronize_rcu
/* No-op stubs used when CONFIG_TASKS_RCU_GENERIC is not set. */
static inline void exit_tasks_rcu_start(void) { }
static inline void exit_tasks_rcu_stop(void) { }
static inline void exit_tasks_rcu_finish(void) { }
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */

/**
 * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period?
 *
 * As an accident of implementation, an RCU Tasks Trace grace period also
 * acts as an RCU grace period.  However, this could change at any time.
 * Code relying on this accident must call this function to verify that
 * this accident is still happening.
 *
 * You have been warned!
 */
static inline bool rcu_trace_implies_rcu_gp(void)
{
        /* Currently unconditional; callers must still ask every time. */
        return true;
}

/**
 * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU
 *
 * This macro resembles cond_resched(), except that it is defined to
 * report potential quiescent states to RCU-tasks even if the cond_resched()
 * machinery were to be shut off, as some advocate for PREEMPTION kernels.
 */
#define cond_resched_tasks_rcu_qs() \
do { \
        rcu_tasks_qs(current, false); \
        cond_resched(); \
} while (0)

/**
 * rcu_softirq_qs_periodic - Report RCU and RCU-Tasks quiescent states
 * @old_ts: jiffies at start of processing.
 *
 * This helper is for long-running softirq handlers, such as NAPI threads in
 * networking. The caller should initialize the variable passed in as @old_ts
 * at the beginning of the softirq handler. When invoked frequently, this macro
 * will invoke rcu_softirq_qs() every 100 milliseconds thereafter, which will
 * provide both RCU and RCU-Tasks quiescent states. Note that this macro
 * modifies its old_ts argument.
 *
 * Because regions of code that have disabled softirq act as RCU read-side
 * critical sections, this macro should be invoked with softirq (and
 * preemption) enabled.
 *
 * The macro is not needed when CONFIG_PREEMPT_RT is defined. RT kernels would
 * have more chance to invoke schedule() calls and provide necessary quiescent
 * states. As a contrast, calling cond_resched() only won't achieve the same
 * effect because cond_resched() does not provide RCU-Tasks quiescent states.
 */
#define rcu_softirq_qs_periodic(old_ts) \
do { \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT) && \
            time_after(jiffies, (old_ts) + HZ / 10)) { \
                preempt_disable(); \
                rcu_softirq_qs(); \
                preempt_enable(); \
                (old_ts) = jiffies; \
        } \
} while (0)

/*
 * Infrastructure to implement the synchronize_() primitives in
 * TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
 */

#if defined(CONFIG_TREE_RCU)
#include <linux/rcutree.h>
#elif defined(CONFIG_TINY_RCU)
#include <linux/rcutiny.h>
#else
#error "Unknown RCU implementation specified to kernel configuration"
#endif

/*
 * The init_rcu_head_on_stack() and destroy_rcu_head_on_stack() calls
 * are needed for dynamic initialization and destruction of rcu_head
 * on the stack, and init_rcu_head()/destroy_rcu_head() are needed for
 * dynamic initialization and destruction of statically allocated rcu_head
 * structures.  However, rcu_head structures allocated dynamically in the
 * heap don't need any initialization.
 */
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head);
void destroy_rcu_head(struct rcu_head *head);
void init_rcu_head_on_stack(struct rcu_head *head);
void destroy_rcu_head_on_stack(struct rcu_head *head);
#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
/* No-op stubs used when CONFIG_DEBUG_OBJECTS_RCU_HEAD is not set. */
static inline void init_rcu_head(struct rcu_head *head) { }
static inline void destroy_rcu_head(struct rcu_head *head) { }
static inline void init_rcu_head_on_stack(struct rcu_head *head) { }
static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { }
#endif        /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */

#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU)
bool rcu_lockdep_current_cpu_online(void);
#else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */
/* Unless both CONFIG_HOTPLUG_CPU and CONFIG_PROVE_RCU are set, always report true. */
static inline bool rcu_lockdep_current_cpu_online(void) { return true; }
#endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */

extern struct lockdep_map rcu_lock_map;
extern struct lockdep_map rcu_bh_lock_map;
extern struct lockdep_map rcu_sched_lock_map;
extern struct lockdep_map rcu_callback_map;

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/*
 * Tell lockdep that the RCU pseudo-lock described by @map is being acquired
 * at the caller's instruction pointer.
 * NOTE(review): going by lockdep's lock_acquire() prototype, the literal
 * arguments are presumably subclass=0, trylock=0, read=2, check=0 and
 * nest_lock=NULL -- confirm against <linux/lockdep.h>.
 */
static inline void rcu_lock_acquire(struct lockdep_map *map)
{
        lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_);
}

/*
 * Same as rcu_lock_acquire() except that the third lock_acquire() argument
 * is 1 instead of 0.
 * NOTE(review): that argument is presumably lockdep's trylock flag --
 * confirm against <linux/lockdep.h>.
 */
static inline void rcu_try_lock_acquire(struct lockdep_map *map)
{
        lock_acquire(map, 0, 1, 2, 0, NULL, _THIS_IP_);
}

/* Tell lockdep that @map is being released at the caller's instruction pointer. */
static inline void rcu_lock_release(struct lockdep_map *map)
{
        lock_release(map, _THIS_IP_);
}

int debug_lockdep_rcu_enabled(void);
int rcu_read_lock_held(void);
int rcu_read_lock_bh_held(void);
int rcu_read_lock_sched_held(void);
int rcu_read_lock_any_held(void);

#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

# define rcu_lock_acquire(a)                do { } while (0)
# define rcu_try_lock_acquire(a)        do { } while (0)
# define rcu_lock_release(a)                do { } while (0)

/* !CONFIG_DEBUG_LOCK_ALLOC fallback: unconditionally reports 1 (held). */
static inline int rcu_read_lock_held(void)
{
        return 1;
}

/* !CONFIG_DEBUG_LOCK_ALLOC fallback: unconditionally reports 1 (held). */
static inline int rcu_read_lock_bh_held(void)
{
        return 1;
}

static inline int rcu_read_lock_sched_held(void)
{
        return !preemptible();
}

static inline int rcu_read_lock_any_held(void)
{
        return !preemptible();
}

/* !CONFIG_DEBUG_LOCK_ALLOC fallback: RCU lockdep debugging is reported as disabled. */
static inline int debug_lockdep_rcu_enabled(void)
{
        return 0;
}

#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

#ifdef CONFIG_PROVE_RCU

/**
 * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met
 * @c: condition to check
 * @s: informative message
 *
 * This checks debug_lockdep_rcu_enabled() before checking (c) to
 * prevent early boot splats due to lockdep not yet being initialized,
 * and rechecks it after checking (c) to prevent false-positive splats
 * due to races with lockdep being disabled.  See commit 3066820034b5dd
 * ("rcu: Reject RCU_LOCKDEP_WARN() false positives") for more detail.
 */
#define RCU_LOCKDEP_WARN(c, s)                                                \
        do {                                                                \
                static bool __section(".data.unlikely") __warned;        \
                if (debug_lockdep_rcu_enabled() && (c) &&                \
                    debug_lockdep_rcu_enabled() && !__warned) {                \
                        __warned = true;                                \
                        lockdep_rcu_suspicious(__FILE__, __LINE__, s);        \
                }                                                        \
        } while (0)

#ifndef CONFIG_PREEMPT_RCU
/*
 * Non-preemptible RCU: emit a lockdep warning ("illegal context switch")
 * if rcu_lock_map is held when this check runs.
 */
static inline void rcu_preempt_sleep_check(void)
{
        RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
                         "Illegal context switch in RCU read-side critical section");
}
#else // #ifndef CONFIG_PREEMPT_RCU
/* With CONFIG_PREEMPT_RCU this check is a no-op. */
static inline void rcu_preempt_sleep_check(void) { }
#endif // #else // #ifndef CONFIG_PREEMPT_RCU

/*
 * Run the RCU, RCU-bh and RCU-sched "illegal context switch" lockdep checks
 * in that order.  The RCU-bh check is skipped when CONFIG_PREEMPT_RT is
 * enabled.
 */
#define rcu_sleep_check()                                                \
        do {                                                                \
                rcu_preempt_sleep_check();                                \
                if (!IS_ENABLED(CONFIG_PREEMPT_RT))                        \
                    RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),        \
                                 "Illegal context switch in RCU-bh read-side critical section"); \
                RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),        \
                                 "Illegal context switch in RCU-sched read-side critical section"); \
        } while (0)

#else /* #ifdef CONFIG_PROVE_RCU */

#define RCU_LOCKDEP_WARN(c, s) do { } while (0 && (c))
#define rcu_sleep_check() do { } while (0)

#endif /* #else #ifdef CONFIG_PROVE_RCU */

/*
 * Helper functions for rcu_dereference_check(), rcu_dereference_protected()
 * and rcu_assign_pointer().  Some of these could be folded into their
 * callers, but they are left separate in order to ease introduction of
 * multiple pointer markings to match different RCU implementations
 * (e.g., __srcu), should this make sense in the future.
 */

/*
 * rcu_check_sparse(p, space): when __CHECKER__ is defined, compare @p with
 * itself cast to a pointer qualified by @space and discard the result
 * (presumably so that the checker can flag an address-space mismatch --
 * confirm against the sparse documentation).  Otherwise it expands to
 * nothing.
 */
#ifdef __CHECKER__
#define rcu_check_sparse(p, space) \
        ((void)(((typeof(*p) space *)p) == p))
#else /* #ifdef __CHECKER__ */
#define rcu_check_sparse(p, space)
#endif /* #else #ifdef __CHECKER__ */

/*
 * Implementation of unrcu_pointer(): copy @p into the caller-named
 * temporary @local with a __force cast, run the __rcu sparse check on @p,
 * and yield the temporary cast to a __kernel pointer.
 */
#define __unrcu_pointer(p, local)                                        \
({                                                                        \
        typeof(*p) *local = (typeof(*p) *__force)(p);                        \
        rcu_check_sparse(p, __rcu);                                        \
        ((typeof(*p) __force __kernel *)(local));                         \
})
/**
 * unrcu_pointer - mark a pointer as not being RCU protected
 * @p: pointer needing to lose its __rcu property
 *
 * Converts @p from an __rcu pointer to a __kernel pointer.
 * This allows an __rcu pointer to be used with xchg() and friends.
 */
#define unrcu_pointer(p) __unrcu_pointer(p, __UNIQUE_ID(rcu))

/*
 * Fetch @p exactly once with READ_ONCE() into @local, run the sparse check
 * for address space @space, and yield the value as a __kernel pointer.
 * No RCU_LOCKDEP_WARN() condition is evaluated here.
 */
#define __rcu_access_pointer(p, local, space) \
({ \
        typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(local)); \
})
/*
 * Like __rcu_access_pointer(), but additionally emits a lockdep warning
 * when condition @c is false.  The READ_ONCE() of @p happens before @c is
 * evaluated.
 */
#define __rcu_dereference_check(p, local, c, space) \
({ \
        /* Dependency order vs. p above. */ \
        typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \
        RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(local)); \
})
/*
 * Warn via lockdep when @c is false, run the sparse check, and yield @p
 * itself cast to a __kernel pointer.  @p is read without READ_ONCE(), and
 * the @local parameter is accepted but not used by this expansion.
 */
#define __rcu_dereference_protected(p, local, c, space) \
({ \
        RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(p)); \
})
/*
 * Fetch @p once with READ_ONCE() into @local and yield it as a __kernel
 * pointer.  Neither a lockdep condition nor the sparse check is applied.
 */
#define __rcu_dereference_raw(p, local) \
({ \
        /* Dependency order vs. p above. */ \
        typeof(p) local = READ_ONCE(p); \
        ((typeof(*p) __force __kernel *)(local)); \
})
/* Public entry point: supplies a unique name for the temporary. */
#define rcu_dereference_raw(p) __rcu_dereference_raw(p, __UNIQUE_ID(rcu))

/**
 * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
 * @v: The value to statically initialize with.
 *
 * Expands to @v cast to an __rcu-annotated pointer of the same pointee
 * type.  The whole expansion is parenthesized so that the cast applies to
 * @v even when the macro is followed by a postfix operator (->, []) or is
 * used as the operand of sizeof.
 */
#define RCU_INITIALIZER(v) ((typeof(*(v)) __force __rcu *)(v))

/**
 * rcu_assign_pointer() - assign to RCU-protected pointer
 * @p: pointer to assign to
 * @v: value to assign (publish)
 *
 * Assigns the specified value to the specified RCU-protected
 * pointer, ensuring that any concurrent RCU readers will see
 * any prior initialization.
 *
 * Inserts memory barriers on architectures that require them
 * (which is most of them), and also prevents the compiler from
 * reordering the code that initializes the structure after the pointer
 * assignment.  More importantly, this call documents which pointers
 * will be dereferenced by RCU read-side code.
 *
 * In some special cases, you may use RCU_INIT_POINTER() instead
 * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
 * to the fact that it does not constrain either the CPU or the compiler.
 * That said, using RCU_INIT_POINTER() when you should have used
 * rcu_assign_pointer() is a very bad thing that results in
 * impossible-to-diagnose memory corruption.  So please be careful.
 * See the RCU_INIT_POINTER() comment header for details.
 *
 * Note that rcu_assign_pointer() evaluates each of its arguments only
 * once, appearances notwithstanding.  One of the "extra" evaluations
 * is in typeof() and the other visible only to sparse (__CHECKER__),
 * neither of which actually execute the argument.  As with most cpp
 * macros, this execute-arguments-only-once property is important, so
 * please be careful when making changes to rcu_assign_pointer() and the
 * other macros that it invokes.
 */
#define rcu_assign_pointer(p, v)                                              \
do {                                                                              \
        uintptr_t _r_a_p__v = (uintptr_t)(v);                                      \
        rcu_check_sparse(p, __rcu);                                              \
                                                                              \
        if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)              \
                WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
        else                                                                      \
                smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
} while (0)

/**
 * rcu_replace_pointer() - replace an RCU pointer, returning its old value
 * @rcu_ptr: RCU pointer, whose old value is returned
 * @ptr: regular pointer
 * @c: the lockdep conditions under which the dereference will take place
 *
 * Perform a replacement, where @rcu_ptr is an RCU-annotated
 * pointer and @c is the lockdep argument that is passed to the
 * rcu_dereference_protected() call used to read that pointer.  The old
 * value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr.
 */
#define rcu_replace_pointer(rcu_ptr, ptr, c)                                \
({                                                                        \
        typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c));        \
        rcu_assign_pointer((rcu_ptr), (ptr));                                \
        __tmp;                                                                \
})

/**
 * rcu_access_pointer() - fetch RCU pointer with no dereferencing
 * @p: The pointer to read
 *
 * Return the value of the specified RCU-protected pointer, but omit the
 * lockdep checks for being in an RCU read-side critical section.  This is
 * useful when the value of this pointer is accessed, but the pointer is
 * not dereferenced, for example, when testing an RCU-protected pointer
 * against NULL.  Although rcu_access_pointer() may also be used in cases
 * where update-side locks prevent the value of the pointer from changing,
 * you should instead use rcu_dereference_protected() for this use case.
 * Within an RCU read-side critical section, there is little reason to
 * use rcu_access_pointer().
 *
 * It is usually best to test the rcu_access_pointer() return value
 * directly in order to avoid accidental dereferences being introduced
 * by later inattentive changes.  In other words, assigning the
 * rcu_access_pointer() return value to a local variable results in an
 * accident waiting to happen.
 *
 * It is also permissible to use rcu_access_pointer() when read-side
 * access to the pointer was removed at least one grace period ago, as is
 * the case in the context of the RCU callback that is freeing up the data,
 * or after a synchronize_rcu() returns.  This can be useful when tearing
 * down multi-linked structures after a grace period has elapsed.  However,
 * rcu_dereference_protected() is normally preferred for this use case.
 */
#define rcu_access_pointer(p) __rcu_access_pointer((p), __UNIQUE_ID(rcu), __rcu)

/**
 * rcu_dereference_check() - rcu_dereference with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * Do an rcu_dereference(), but check that the conditions under which the
 * dereference will take place are correct.  Typically the conditions
 * indicate the various locking conditions that should be held at that
 * point.  The check should return true if the conditions are satisfied.
 * An implicit check for being in an RCU read-side critical section
 * (rcu_read_lock()) is included.
 *
 * For example:
 *
 *        bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock));
 *
 * could be used to indicate to lockdep that foo->bar may only be dereferenced
 * if either rcu_read_lock() is held, or that the lock required to replace
 * the bar struct at foo->bar is held.
 *
 * Note that the list of conditions may also include indications of when a lock
 * need not be held, for example during initialisation or destruction of the
 * target struct:
 *
 *        bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) ||
 *                                              atomic_read(&foo->usage) == 0);
 *
 * Inserts memory barriers on architectures that require them
 * (currently only the Alpha), prevents the compiler from refetching
 * (and from merging fetches), and, more importantly, documents exactly
 * which pointers are protected by RCU and checks that the pointer is
 * annotated as __rcu.
 */
#define rcu_dereference_check(p, c) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), \
                                (c) || rcu_read_lock_held(), __rcu)

/**
 * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * This is the RCU-bh counterpart to rcu_dereference_check().  However,
 * please note that starting in v5.0 kernels, vanilla RCU grace periods
 * wait for local_bh_disable() regions of code in addition to regions of
 * code demarked by rcu_read_lock() and rcu_read_unlock().  This means
 * that synchronize_rcu(), call_rcu, and friends all take not only
 * rcu_read_lock() but also rcu_read_lock_bh() into account.
 */
#define rcu_dereference_bh_check(p, c) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), \
                                (c) || rcu_read_lock_bh_held(), __rcu)

/**
 * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * This is the RCU-sched counterpart to rcu_dereference_check().
 * However, please note that starting in v5.0 kernels, vanilla RCU grace
 * periods wait for preempt_disable() regions of code in addition to
 * regions of code demarked by rcu_read_lock() and rcu_read_unlock().
 * This means that synchronize_rcu(), call_rcu, and friends all take not
 * only rcu_read_lock() but also rcu_read_lock_sched() into account.
 */
#define rcu_dereference_sched_check(p, c) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), \
                                (c) || rcu_read_lock_sched_held(), \
                                __rcu)

/*
 * The tracing infrastructure traces RCU (we want that), but unfortunately
 * some of the RCU checks cause tracing to lock up the system.
 *
 * The no-tracing version of rcu_dereference_raw() must not call
 * rcu_read_lock_held().
 */
#define rcu_dereference_raw_check(p) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), 1, __rcu)

/**
 * rcu_dereference_protected() - fetch RCU pointer when updates prevented
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * Return the value of the specified RCU-protected pointer, but omit
 * the READ_ONCE().  This is useful in cases where update-side locks
 * prevent the value of the pointer from changing.  Please note that this
 * primitive does *not* prevent the compiler from repeating this reference
 * or combining it with other references, so it should not be used without
 * protection of appropriate locks.
 *
 * This function is only for update-side use.  Using this function
 * when protected only by rcu_read_lock() will result in infrequent
 * but very ugly failures.
 */
#define rcu_dereference_protected(p, c) \
        __rcu_dereference_protected((p), __UNIQUE_ID(rcu), (c), __rcu)


/**
 * rcu_dereference() - fetch RCU-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * This is a simple wrapper around rcu_dereference_check() that passes 0
 * as the extra condition, leaving rcu_read_lock_held() as the only check.
 */
#define rcu_dereference(p) rcu_dereference_check(p, 0)

/**
 * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * Makes rcu_dereference_bh_check() do the dirty work, passing 0 as the
 * extra condition.
 */
#define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0)

/**
 * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * Makes rcu_dereference_sched_check() do the dirty work, passing 0 as the
 * extra condition.
 */
#define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0)

/**
 * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism
 * @p: The pointer to hand off
 *
 * This is simply an identity function, but it documents where a pointer
 * is handed off from RCU to some other synchronization mechanism, for
 * example, reference counting or locking.  In C11, it would map to
 * kill_dependency().  It could be used as follows::
 *
 *        rcu_read_lock();
 *        p = rcu_dereference(gp);
 *        long_lived = is_long_lived(p);
 *        if (long_lived) {
 *                if (!atomic_inc_not_zero(p->refcnt))
 *                        long_lived = false;
 *                else
 *                        p = rcu_pointer_handoff(p);
 *        }
 *        rcu_read_unlock();
 */
#define rcu_pointer_handoff(p) (p)

/**
 * rcu_read_lock() - mark the beginning of an RCU read-side critical section
 *
 * When synchronize_rcu() is invoked on one CPU while other CPUs
 * are within RCU read-side critical sections, then the
 * synchronize_rcu() is guaranteed to block until after all the other
 * CPUs exit their critical sections.  Similarly, if call_rcu() is invoked
 * on one CPU while other CPUs are within RCU read-side critical
 * sections, invocation of the corresponding RCU callback is deferred
 * until after all the other CPUs exit their critical sections.
 *
 * In v5.0 and later kernels, synchronize_rcu() and call_rcu() also
 * wait for regions of code with preemption disabled, including regions of
 * code with interrupts or softirqs disabled.  In pre-v5.0 kernels, which
 * define synchronize_sched(), only code enclosed within rcu_read_lock()
 * and rcu_read_unlock() are guaranteed to be waited for.
 *
 * Note, however, that RCU callbacks are permitted to run concurrently
 * with new RCU read-side critical sections.  One way that this can happen
 * is via the following sequence of events: (1) CPU 0 enters an RCU
 * read-side critical section, (2) CPU 1 invokes call_rcu() to register
 * an RCU callback, (3) CPU 0 exits the RCU read-side critical section,
 * (4) CPU 2 enters an RCU read-side critical section, (5) the RCU
 * callback is invoked.  This is legal, because the RCU read-side critical
 * section that was running concurrently with the call_rcu() (and which
 * therefore might be referencing something that the corresponding RCU
 * callback would free up) has completed before the corresponding
 * RCU callback is invoked.
 *
 * RCU read-side critical sections may be nested.  Any deferred actions
 * will be deferred until the outermost RCU read-side critical section
 * completes.
 *
 * You can avoid reading and understanding the next paragraph by
 * following this rule: don't put anything in an rcu_read_lock() RCU
 * read-side critical section that would block in a !PREEMPTION kernel.
 * But if you want the full story, read on!
 *
 * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU),
 * it is illegal to block while in an RCU read-side critical section.
 * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION
 * kernel builds, RCU read-side critical sections may be preempted,
 * but explicit blocking is illegal.  Finally, in preemptible RCU
 * implementations in real-time (with -rt patchset) kernel builds, RCU
 * read-side critical sections may be preempted and they may also block, but
 * only when acquiring spinlocks that are subject to priority inheritance.
 */
static __always_inline void rcu_read_lock(void)
{
        /* Enter the critical section first, then update checker/lockdep state. */
        __rcu_read_lock();
        /* NOTE(review): __acquire() is presumably a sparse context annotation -- confirm. */
        __acquire(RCU);
        rcu_lock_acquire(&rcu_lock_map);
        /* Checked last, after rcu_lock_map has been recorded as acquired. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock() used illegally while idle");
}

/*
 * So where is rcu_write_lock()?  It does not exist, as there is no
 * way for writers to lock out RCU readers.  This is a feature, not
 * a bug -- this property is what provides RCU's performance benefits.
 * Of course, writers must coordinate with each other.  The normal
 * spinlock primitives work well for this, but any other technique may be
 * used as well.  RCU does not care how the writers keep out of each
 * others' way, as long as they do so.
 */

/**
 * rcu_read_unlock() - marks the end of an RCU read-side critical section.
 *
 * In almost all situations, rcu_read_unlock() is immune from deadlock.
 * In recent kernels that have consolidated synchronize_sched() and
 * synchronize_rcu_bh() into synchronize_rcu(), this deadlock immunity
 * also extends to the scheduler's runqueue and priority-inheritance
 * spinlocks, courtesy of the quiescent-state deferral that is carried
 * out when rcu_read_unlock() is invoked with interrupts disabled.
 *
 * See rcu_read_lock() for more information.
 */
static inline void rcu_read_unlock(void)
{
        /*
         * Teardown mirrors rcu_read_lock() in reverse order: check first,
         * then drop the map, the context annotation, and finally leave the
         * read-side critical section.
         */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock() used illegally while idle");
        rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
        /* Balances the __acquire(RCU) done in rcu_read_lock(). */
        __release(RCU);
        /* Implementation-specific exit from the read-side critical section. */
        __rcu_read_unlock();
}

/**
 * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
 *
 * This is equivalent to rcu_read_lock(), but also disables softirqs.
 * Note that anything else that disables softirqs can also serve as an RCU
 * read-side critical section.  However, please note that this equivalence
 * applies only to v5.0 and later.  Before v5.0, rcu_read_lock() and
 * rcu_read_lock_bh() were unrelated.
 *
 * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh()
 * must occur in the same context, for example, it is illegal to invoke
 * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh()
 * was invoked from some other task.
 */
static inline void rcu_read_lock_bh(void)
{
        /* Disabling softirqs is what opens the critical section here. */
        local_bh_disable();
        /* Context annotation; balanced by __release(RCU_BH) in rcu_read_unlock_bh(). */
        __acquire(RCU_BH);
        /* Record the acquisition on the dedicated rcu_bh_lock_map. */
        rcu_lock_acquire(&rcu_bh_lock_map);
        /* Complain if the lock is taken while RCU is not watching. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock_bh() used illegally while idle");
}

/**
 * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section
 *
 * See rcu_read_lock_bh() for more information.
 */
static inline void rcu_read_unlock_bh(void)
{
        /* Reverse order of rcu_read_lock_bh(): check, release map, annotation, softirqs. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock_bh() used illegally while idle");
        rcu_lock_release(&rcu_bh_lock_map);
        /* Balances the __acquire(RCU_BH) done in rcu_read_lock_bh(). */
        __release(RCU_BH);
        /* Re-enable softirqs last, closing the critical section. */
        local_bh_enable();
}

/**
 * rcu_read_lock_sched() - mark the beginning of an RCU-sched critical section
 *
 * This is equivalent to rcu_read_lock(), but also disables preemption.
 * Read-side critical sections can also be introduced by anything else that
 * disables preemption, including local_irq_disable() and friends.  However,
 * please note that the equivalence to rcu_read_lock() applies only to
 * v5.0 and later.  Before v5.0, rcu_read_lock() and rcu_read_lock_sched()
 * were unrelated.
 *
 * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched()
 * must occur in the same context, for example, it is illegal to invoke
 * rcu_read_unlock_sched() from process context if the matching
 * rcu_read_lock_sched() was invoked from an NMI handler.
 */
static inline void rcu_read_lock_sched(void)
{
        /* Disabling preemption is what opens the critical section here. */
        preempt_disable();
        /* Context annotation; balanced by __release(RCU_SCHED) in rcu_read_unlock_sched(). */
        __acquire(RCU_SCHED);
        /* Record the acquisition on the dedicated rcu_sched_lock_map. */
        rcu_lock_acquire(&rcu_sched_lock_map);
        /* Complain if the lock is taken while RCU is not watching. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock_sched() used illegally while idle");
}

/*
 * Used by lockdep and tracing: cannot be traced, cannot call lockdep.
 *
 * Compared with rcu_read_lock_sched(), this variant uses the _notrace
 * preemption primitive and omits both the rcu_lock_acquire() bookkeeping
 * and the RCU_LOCKDEP_WARN() check.
 */
static inline notrace void rcu_read_lock_sched_notrace(void)
{
        preempt_disable_notrace();
        /* Context annotation; balanced in rcu_read_unlock_sched_notrace(). */
        __acquire(RCU_SCHED);
}

/**
 * rcu_read_unlock_sched() - marks the end of an RCU-sched critical section
 *
 * See rcu_read_lock_sched() for more information.
 */
static inline void rcu_read_unlock_sched(void)
{
        /* Reverse order of rcu_read_lock_sched(): check, release map, annotation, preemption. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock_sched() used illegally while idle");
        rcu_lock_release(&rcu_sched_lock_map);
        /* Balances the __acquire(RCU_SCHED) done in rcu_read_lock_sched(). */
        __release(RCU_SCHED);
        /* Re-enable preemption last, closing the critical section. */
        preempt_enable();
}

/*
 * Used by lockdep and tracing: cannot be traced, cannot call lockdep.
 *
 * Counterpart of rcu_read_lock_sched_notrace(): drops the context
 * annotation and re-enables preemption with the _notrace primitive,
 * without any rcu_lock_release() or RCU_LOCKDEP_WARN() step.
 */
static inline notrace void rcu_read_unlock_sched_notrace(void)
{
        __release(RCU_SCHED);
        preempt_enable_notrace();
}

/**
 * RCU_INIT_POINTER() - initialize an RCU protected pointer
 * @p: The pointer to be initialized.
 * @v: The value to initialize the pointer to.
 *
 * Initialize an RCU-protected pointer in special cases where readers
 * do not need ordering constraints on the CPU or the compiler.  These
 * special cases are:
 *
 * 1.        This use of RCU_INIT_POINTER() is NULLing out the pointer *or*
 * 2.        The caller has taken whatever steps are required to prevent
 *        RCU readers from concurrently accessing this pointer *or*
 * 3.        The referenced data structure has already been exposed to
 *        readers either at compile time or via rcu_assign_pointer() *and*
 *
 *        a.        You have not made *any* reader-visible changes to
 *                this structure since then *or*
 *        b.        It is OK for readers accessing this structure from its
 *                new location to see the old state of the structure.  (For
 *                example, the changes were to statistical counters or to
 *                other state where exact synchronization is not required.)
 *
 * Failure to follow these rules governing use of RCU_INIT_POINTER() will
 * result in impossible-to-diagnose memory corruption.  As in the structures
 * will look OK in crash dumps, but any concurrent RCU readers might
 * see pre-initialized values of the referenced data structure.  So
 * please be very careful how you use RCU_INIT_POINTER()!!!
 *
 * If you are creating an RCU-protected linked structure that is accessed
 * by a single external-to-structure RCU-protected pointer, then you may
 * use RCU_INIT_POINTER() to initialize the internal RCU-protected
 * pointers, but you must use rcu_assign_pointer() to initialize the
 * external-to-structure pointer *after* you have completely initialized
 * the reader-accessible portions of the linked structure.
 *
 * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no
 * ordering guarantees for either the CPU or the compiler.
 */
#define RCU_INIT_POINTER(p, v) \
        do { \
                rcu_check_sparse(p, __rcu); \
                WRITE_ONCE(p, RCU_INITIALIZER(v)); \
        } while (0)

/**
 * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer
 * @p: The name of the structure field holding the pointer to be initialized.
 * @v: The value to initialize the pointer to.
 *
 * GCC-style initialization for an RCU-protected pointer in a structure field.
 * Expands to the designated initializer ".p = RCU_INITIALIZER(v)", so it is
 * intended for use inside a structure initializer list.
 */
#define RCU_POINTER_INITIALIZER(p, v) \
                .p = RCU_INITIALIZER(v)

/*
 * Does the specified offset indicate that the corresponding rcu_head
 * structure can be handled by kvfree_rcu()?
 *
 * Evaluates to true for offsets below 4096.  kvfree_rcu_arg_2() passes in
 * the offsetof() of the rcu_head field and uses BUILD_BUG_ON() to reject
 * anything larger at build time.
 */
#define __is_kvfree_rcu_offset(offset) ((offset) < 4096)

/**
 * kfree_rcu() - kfree an object after a grace period.
 * @ptr: pointer to kfree for double-argument invocations.
 * @rhf: the name of the struct rcu_head within the type of @ptr.
 *
 * Many RCU callback functions just call kfree() on the base structure.
 * These functions are trivial, but their size adds up, and furthermore
 * when they are used in a kernel module, that module must invoke the
 * high-latency rcu_barrier() function at module-unload time.
 *
 * The kfree_rcu() function handles this issue.  Rather than encoding a
 * function address in the embedded rcu_head structure, kfree_rcu() instead
 * encodes the offset of the rcu_head structure within the base structure.
 * Because functions are not allowed in the low-order 4096 bytes of
 * kernel virtual memory, offsets up to 4095 bytes can be accommodated.
 * If the offset is larger than 4095 bytes, a compile-time error will
 * be generated in kvfree_rcu_arg_2().  If this error is triggered, you can
 * either fall back to use of call_rcu() or rearrange the structure to
 * position the rcu_head structure into the first 4096 bytes.
 *
 * The object to be freed can be allocated either by kmalloc() or
 * kmem_cache_alloc().
 *
 * Note that the allowable offset might decrease in the future.
 *
 * The BUILD_BUG_ON check must not involve any function calls, hence the
 * checks are done in macros here.
 *
 * A NULL @ptr is accepted and results in no call being made.
 *
 * kvfree_rcu() takes the same arguments and expands to the same
 * kvfree_rcu_arg_2() helper as kfree_rcu().
 */
#define kfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf)
#define kvfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf)

/**
 * kfree_rcu_mightsleep() - kfree an object after a grace period.
 * @ptr: pointer to kfree for single-argument invocations.
 *
 * In the head-less variant only one argument is passed: the
 * pointer that has to be freed after a grace period.  The
 * usage is therefore
 *
 *     kfree_rcu_mightsleep(ptr);
 *
 * where @ptr is the pointer to be freed by kvfree().
 *
 * Please note that the head-less way of freeing may only be
 * used from a context that is allowed to sleep, that is, one
 * that satisfies the might_sleep() annotation.  Otherwise,
 * please switch to kfree_rcu() and embed the rcu_head
 * structure within the type of @ptr.
 *
 * A NULL @ptr is accepted and results in no call being made.
 *
 * kvfree_rcu_mightsleep() takes the same argument and expands
 * to the same kvfree_rcu_arg_1() helper.
 */
#define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr)
#define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr)

/*
 * Two-argument backend of kfree_rcu() and kvfree_rcu().
 *
 * @ptr is evaluated once at run time, into the local ___p; its other
 * uses sit inside typeof().  A NULL pointer is silently ignored.
 * Otherwise the address of the embedded rcu_head field @rhf and the
 * object pointer are handed to kvfree_call_rcu().  The BUILD_BUG_ON()
 * breaks the build when the offset of @rhf within the pointed-to type
 * does not satisfy __is_kvfree_rcu_offset().
 */
#define kvfree_rcu_arg_2(ptr, rhf)                                        \
do {                                                                        \
        typeof (ptr) ___p = (ptr);                                        \
                                                                        \
        if (___p) {                                                                        \
                BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf)));        \
                kvfree_call_rcu(&((___p)->rhf), (void *) (___p));                        \
        }                                                                                \
} while (0)

/*
 * Single-argument (head-less) backend of kfree_rcu_mightsleep() and
 * kvfree_rcu_mightsleep().
 *
 * @ptr is evaluated once into the local ___p.  A NULL pointer is
 * silently ignored; otherwise the object pointer is handed to
 * kvfree_call_rcu() with NULL in place of an rcu_head.
 */
#define kvfree_rcu_arg_1(ptr)                                        \
do {                                                                \
        typeof(ptr) ___p = (ptr);                                \
                                                                \
        if (___p)                                                \
                kvfree_call_rcu(NULL, (void *) (___p));                \
} while (0)

/*
 * Place this after a lock-acquisition primitive to guarantee that
 * an UNLOCK+LOCK pair acts as a full barrier.  This guarantee applies
 * if the UNLOCK and LOCK are executed by the same CPU or if the
 * UNLOCK and LOCK operate on the same lock variable.
 *
 * With CONFIG_ARCH_WEAK_RELEASE_ACQUIRE this expands to smp_mb();
 * otherwise it expands to an empty statement.
 */
#ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE
#define smp_mb__after_unlock_lock()        smp_mb()  /* Full ordering for lock. */
#else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
#define smp_mb__after_unlock_lock()        do { } while (0)
#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */


/* Has the specified rcu_head structure been handed to call_rcu()? */

/**
 * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
 * @rhp: The rcu_head structure to initialize.
 *
 * If you intend to invoke rcu_head_after_call_rcu() to test whether a
 * given rcu_head structure has already been passed to call_rcu(), then
 * you must also invoke this rcu_head_init() function on it just after
 * allocating that structure.  Calls to this function must not race with
 * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
 */
static inline void rcu_head_init(struct rcu_head *rhp)
{
        rhp->func = (rcu_callback_t)~0L;
}

/**
 * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()?
 * @rhp: The rcu_head structure to test.
 * @f: The function passed to call_rcu() along with @rhp.
 *
 * Returns @true if @rhp has been passed to call_rcu() with @f, and @false
 * otherwise.  Emits a warning if @rhp->func is neither @f nor the value set
 * by rcu_head_init(), which includes the case where @rhp has already been
 * invoked after a grace period.
 * Calls to this function must not race with callback invocation.  One way
 * to avoid such races is to enclose the call to rcu_head_after_call_rcu()
 * in an RCU read-side critical section that includes a read-side fetch
 * of the pointer to the structure containing @rhp.
 */
static inline bool
rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
{
        /* Sample ->func exactly once; both checks below use this snapshot. */
        rcu_callback_t cur = READ_ONCE(rhp->func);
        bool queued = (cur == f);

        /*
         * Not queued with @f: the only value accepted silently is the
         * all-ones marker written by rcu_head_init().
         */
        if (!queued)
                WARN_ON_ONCE(cur != (rcu_callback_t)~0L);

        return queued;
}

/*
 * kernel/ksysfs.c definitions
 *
 * NOTE(review): these are only declared here; their meaning and valid
 * values are not visible in this header -- consult kernel/ksysfs.c.
 */
extern int rcu_expedited;
extern int rcu_normal;

/*
 * Define a lock guard named "rcu".  The second argument is the acquire
 * step (rcu_read_lock() plus an immediate __release(RCU) for sparse's
 * benefit, as explained below); the third argument, rcu_read_unlock(),
 * is the release step.  Presumably the release step runs automatically
 * when the guard goes out of scope -- confirm against the definition of
 * DEFINE_LOCK_GUARD_0().
 */
DEFINE_LOCK_GUARD_0(rcu,
        do {
                rcu_read_lock();
                /*
                 * sparse doesn't call the cleanup function,
                 * so just release immediately and don't track
                 * the context. We don't need to anyway, since
                 * the whole point of the guard is to not need
                 * the explicit unlock.
                 */
                __release(RCU);
        } while (0),
        rcu_read_unlock())

#endif /* __LINUX_RCUPDATE_H */

































































































































































































































































































































































































































































































































































































































































































    4 



    2 














































































































    2 






    2 




















































































    2 



    2 





    1 


    1 




















































































































































































































































































































































    2 



    2 




























    2 
    1 






























































































































































































































































































    2 



    1 

































































    2 
    2 















































































































    1 


    2 


    2 






















































    1 



















































































































































































































































































































































































































    2 






    2 
    1 
    2 




























    1 



















    1 











    2 






    2 


    2 



    1 
    1 




    2 

























    2 









    1 











    2 




    1 








    1 
    2 











    2 















    1 

















    1 



    2 


    1 




    2 






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
// SPDX-License-Identifier: GPL-2.0-only
/*
 * kernel/workqueue.c - generic async execution with shared worker pool
 *
 * Copyright (C) 2002                Ingo Molnar
 *
 *   Derived from the taskqueue/keventd code by:
 *     David Woodhouse <dwmw2@infradead.org>
 *     Andrew Morton
 *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *     Theodore Ts'o <tytso@mit.edu>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 *
 * Copyright (C) 2010                SUSE Linux Products GmbH
 * Copyright (C) 2010                Tejun Heo <tj@kernel.org>
 *
 * This is the generic async execution mechanism.  Work items are
 * executed in process context.  The worker pool is shared and
 * automatically managed.  There are two worker pools for each CPU (one for
 * normal work items and the other for high priority ones) and some extra
 * pools for workqueues which are not bound to any specific CPU - the
 * number of these backing pools is dynamic.
 *
 * Please read Documentation/core-api/workqueue.rst for details.
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
#include <linux/jhash.h>
#include <linux/hashtable.h>
#include <linux/rculist.h>
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/kvm_para.h>
#include <linux/delay.h>
#include <linux/irq_work.h>

#include "workqueue_internal.h"

/*
 * Flags kept in a worker_pool.
 *
 * POOL_DISASSOCIATED describes whether a bound pool is currently tied to
 * its CPU:
 *
 * - Associated (flag clear): every worker is bound to the CPU, no worker
 *   has %WORKER_UNBOUND set, and concurrency management is in effect.
 *
 * - Disassociated (flag set): the cpu may be offline, every worker has
 *   %WORKER_UNBOUND set, concurrency management is disabled and workers
 *   may be executing on any CPU.  The pool behaves as an unbound one.
 *
 * Flip POOL_DISASSOCIATED only while holding wq_pool_attach_mutex so that
 * the binding state cannot change while worker_attach_to_pool() is in
 * progress.
 *
 * A BH pool is per-CPU and always DISASSOCIATED, as there can only be one
 * concurrent BH execution context per CPU.
 */
enum worker_pool_flags {
        POOL_BH                 = 0x01,         /* is a BH pool */
        POOL_MANAGER_ACTIVE     = 0x02,         /* being managed */
        POOL_DISASSOCIATED      = 0x04,         /* cpu can't serve workers */
        POOL_BH_DRAINING        = 0x08,         /* draining after CPU offline */
};

/*
 * Per-worker flag bits.  Bits 0, 4 and 5 are not assigned by this enum.
 */
enum worker_flags {
        WORKER_DIE              = 0x002,        /* die die die */
        WORKER_IDLE             = 0x004,        /* is idle */
        WORKER_PREP             = 0x008,        /* preparing to run works */
        WORKER_CPU_INTENSIVE    = 0x040,        /* cpu intensive */
        WORKER_UNBOUND          = 0x080,        /* worker is unbound */
        WORKER_REBOUND          = 0x100,        /* worker was rebound */

        /* mask combining PREP, CPU_INTENSIVE, UNBOUND and REBOUND */
        WORKER_NOT_RUNNING      = WORKER_REBOUND | WORKER_UNBOUND |
                                  WORKER_CPU_INTENSIVE | WORKER_PREP,
};

/* Flag bits qualifying a work cancel operation. */
enum work_cancel_flags {
        /* canceling a delayed_work */
        WORK_CANCEL_DELAYED     = 0x1,
        /* canceling to disable */
        WORK_CANCEL_DISABLE     = 0x2,
};

/* Tunables and sizes used internally by the workqueue implementation. */
enum wq_internal_consts {
        NR_STD_WORKER_POOLS        = 2,                /* # standard pools per cpu */

        UNBOUND_POOL_HASH_ORDER        = 6,                /* hashed by pool->attrs */
        BUSY_WORKER_HASH_ORDER        = 6,                /* 64 pointers */

        MAX_IDLE_WORKERS_RATIO        = 4,                /* 1/4 of busy can be idle */
        IDLE_WORKER_TIMEOUT        = 300 * HZ,        /* keep idle ones for 5 mins */

        MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
                                                /* call for help after 10ms
                                                   (min two ticks) */
        MAYDAY_INTERVAL                = HZ / 10,        /* and then every 100ms */
        CREATE_COOLDOWN                = HZ,                /* time to breathe after fail */

        /*
         * Rescue workers are used only on emergencies and shared by
         * all cpus.  Give MIN_NICE.
         */
        RESCUER_NICE_LEVEL        = MIN_NICE,
        HIGHPRI_NICE_LEVEL        = MIN_NICE,

        WQ_NAME_LEN                = 32,                /* size of workqueue_struct->name[] */
};

/*
 * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and
 * MAX_SOFTIRQ_RESTART in kernel/softirq.c. These are macros because
 * msecs_to_jiffies() can't be an initializer.
 */
#define BH_WORKER_JIFFIES        msecs_to_jiffies(2)
#define BH_WORKER_RESTARTS        10

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only for
 *    everyone else.
 *
 * P: Preemption protected.  Disabling preemption is enough and should
 *    only be modified and accessed from the local cpu.
 *
 * L: pool->lock protected.  Access with pool->lock held.
 *
 * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for
 *     reads.
 *
 * K: Only modified by worker while holding pool->lock. Can be safely read by
 *    self, while holding pool->lock or from IRQ context if %current is the
 *    kworker.
 *
 * S: Only modified by worker self.
 *
 * A: wq_pool_attach_mutex protected.
 *
 * PL: wq_pool_mutex protected.
 *
 * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
 *
 * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
 *
 * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
 *      RCU for reads.
 *
 * WQ: wq->mutex protected.
 *
 * WR: wq->mutex protected for writes.  RCU protected for reads.
 *
 * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read
 *     with READ_ONCE() without locking.
 *
 * MD: wq_mayday_lock protected.
 *
 * WD: Used internally by the watchdog.
 */

/* struct worker is defined in workqueue_internal.h */

/*
 * struct worker_pool - one pool of workers together with the work items
 * pending on it.  The letter prefixes in the field comments (I:, L:, A:,
 * PL:, WD:) refer to the exclusion rules listed earlier in this file.
 */
struct worker_pool {
        raw_spinlock_t                lock;                /* the pool lock */
        int                        cpu;                /* I: the associated cpu */
        int                        node;                /* I: the associated node ID */
        int                        id;                /* I: pool ID */
        unsigned int                flags;                /* L: POOL_* flags */

        unsigned long                watchdog_ts;        /* L: watchdog timestamp */
        bool                        cpu_stall;        /* WD: stalled cpu bound pool */

        /*
         * The counter is incremented in a process context on the associated CPU
         * w/ preemption disabled, and decremented or reset in the same context
         * but w/ pool->lock held. The readers grab pool->lock and are
         * guaranteed to see if the counter reached zero.
         */
        int                        nr_running;

        struct list_head        worklist;        /* L: list of pending works */

        int                        nr_workers;        /* L: total number of workers */
        int                        nr_idle;        /* L: currently idle workers */

        struct list_head        idle_list;        /* L: list of idle workers */
        struct timer_list        idle_timer;        /* L: worker idle timeout */
        struct work_struct      idle_cull_work; /* L: worker idle cleanup */

        struct timer_list        mayday_timer;          /* L: SOS timer for workers */

        /* a worker is either on busy_hash or idle_list, or is the manager */
        DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
                                                /* L: hash of busy workers */

        struct worker                *manager;        /* L: purely informational */
        struct list_head        workers;        /* A: attached workers */
        struct list_head        dying_workers;  /* A: workers about to die */
        struct completion        *detach_completion; /* all workers detached */

        struct ida                worker_ida;        /* worker IDs for task name */

        struct workqueue_attrs        *attrs;                /* I: worker attributes */
        struct hlist_node        hash_node;        /* PL: unbound_pool_hash node */
        int                        refcnt;                /* PL: refcnt for unbound pools */

        /*
         * Destruction of pool is RCU protected to allow dereferences
         * from get_work_pool().
         */
        struct rcu_head                rcu;
};

/*
 * Per-pool_workqueue statistics. These can be monitored using
 * tools/workqueue/wq_monitor.py.
 */
enum pool_workqueue_stats {
        PWQ_STAT_STARTED,        /* work items started execution */
        PWQ_STAT_COMPLETED,        /* work items completed execution */
        PWQ_STAT_CPU_TIME,        /* total CPU time consumed */
        PWQ_STAT_CPU_INTENSIVE,        /* wq_cpu_intensive_thresh_us violations */
        PWQ_STAT_CM_WAKEUP,        /* concurrency-management worker wakeups */
        PWQ_STAT_REPATRIATED,        /* unbound workers brought back into scope */
        PWQ_STAT_MAYDAY,        /* maydays to rescuer */
        PWQ_STAT_RESCUED,        /* linked work items executed by rescuer */

        PWQ_NR_STATS,                /* number of counters; sizes pool_workqueue->stats[] */
};

/*
 * The per-pool workqueue.  While queued, bits below WORK_STRUCT_PWQ_SHIFT
 * of work_struct->data are used for flags and the remaining high bits
 * point to the pwq; thus, pwqs need to be aligned at two's power of the
 * number of flag bits (see the __aligned() attribute on the struct).
 */
struct pool_workqueue {
        struct worker_pool        *pool;                /* I: the associated pool */
        struct workqueue_struct *wq;                /* I: the owning workqueue */
        int                        work_color;        /* L: current color */
        int                        flush_color;        /* L: flushing color */
        int                        refcnt;                /* L: reference count */
        int                        nr_in_flight[WORK_NR_COLORS];
                                                /* L: nr of in_flight works */
        bool                        plugged;        /* L: execution suspended */

        /*
         * nr_active management and WORK_STRUCT_INACTIVE:
         *
         * When pwq->nr_active >= max_active, new work item is queued to
         * pwq->inactive_works instead of pool->worklist and marked with
         * WORK_STRUCT_INACTIVE.
         *
         * All work items marked with WORK_STRUCT_INACTIVE do not participate in
         * nr_active and all work items in pwq->inactive_works are marked with
         * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are
         * in pwq->inactive_works. Some of them are ready to run in
         * pool->worklist or worker->scheduled. Those work items are only struct
         * wq_barrier which is used for flush_work() and should not participate
         * in nr_active. For non-barrier work item, it is marked with
         * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
         */
        int                        nr_active;        /* L: nr of active works */
        struct list_head        inactive_works;        /* L: inactive works */
        struct list_head        pending_node;        /* LN: node on wq_node_nr_active->pending_pwqs */
        struct list_head        pwqs_node;        /* WR: node on wq->pwqs */
        struct list_head        mayday_node;        /* MD: node on wq->maydays */

        /* counters indexed by enum pool_workqueue_stats */
        u64                        stats[PWQ_NR_STATS];

        /*
         * Release of unbound pwq is punted to a kthread_worker. See put_pwq()
         * and pwq_release_workfn() for details. pool_workqueue itself is also
         * RCU protected so that the first pwq can be determined without
         * grabbing wq->mutex.
         */
        struct kthread_work        release_work;
        struct rcu_head                rcu;
} __aligned(1 << WORK_STRUCT_PWQ_SHIFT);

/*
 * Structure used to wait for workqueue flush.  It carries the list linkage,
 * the flush color being waited for and the completion to wait on.
 */
struct wq_flusher {
        struct list_head        list;                /* WQ: list of flushers */
        int                        flush_color;        /* WQ: flush color waiting for */
        struct completion        done;                /* flush completion */
};

struct wq_device;

/*
 * Unlike in a per-cpu workqueue where max_active limits its concurrency level
 * on each CPU, in an unbound workqueue, max_active applies to the whole system.
 * As sharing a single nr_active across multiple sockets can be very expensive,
 * the counting and enforcement is per NUMA node.
 *
 * The following struct is used to enforce per-node max_active. When a pwq wants
 * to start executing a work item, it should increment ->nr using
 * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over
 * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish
 * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in
 * round-robin order.
 */
struct wq_node_nr_active {
        int                        max;                /* per-node max_active */
        atomic_t                nr;                /* per-node nr_active */
        raw_spinlock_t                lock;                /* nests inside pool locks */
        /* pwqs are linked here through pool_workqueue->pending_node */
        struct list_head        pending_pwqs;        /* LN: pwqs with inactive works */
};

/*
 * The externally visible workqueue.  It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 */
struct workqueue_struct {
        struct list_head        pwqs;                /* WR: all pwqs of this wq */
        struct list_head        list;                /* PR: list of all workqueues */

        struct mutex                mutex;                /* protects this wq */
        int                        work_color;        /* WQ: current work color */
        int                        flush_color;        /* WQ: current flush color */
        atomic_t                nr_pwqs_to_flush; /* flush in progress */
        struct wq_flusher        *first_flusher;        /* WQ: first flusher */
        struct list_head        flusher_queue;        /* WQ: flush waiters */
        struct list_head        flusher_overflow; /* WQ: flush overflow list */

        struct list_head        maydays;        /* MD: pwqs requesting rescue */
        struct worker                *rescuer;        /* MD: rescue worker */

        int                        nr_drainers;        /* WQ: drain in progress */

        /* See alloc_workqueue() function comment for info on min/max_active */
        int                        max_active;        /* WO: max active works */
        int                        min_active;        /* WO: min active works */
        int                        saved_max_active; /* WQ: saved max_active */
        int                        saved_min_active; /* WQ: saved min_active */

        struct workqueue_attrs        *unbound_attrs;        /* PW: only for unbound wqs */
        struct pool_workqueue __rcu *dfl_pwq;   /* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
        struct wq_device        *wq_dev;        /* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
        /* the following three fields exist only in lockdep-enabled builds */
        char                        *lock_name;
        struct lock_class_key        key;
        struct lockdep_map        lockdep_map;
#endif
        char                        name[WQ_NAME_LEN]; /* I: workqueue name */

        /*
         * Destruction of workqueue_struct is RCU protected to allow walking
         * the workqueues list without grabbing wq_pool_mutex.
         * This is used to dump all workqueues from sysrq.
         */
        struct rcu_head                rcu;

        /* hot fields used during command issue, aligned to cacheline */
        unsigned int                flags ____cacheline_aligned; /* WQ: WQ_* flags */
        struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
        /* flexible array member: must remain the last field */
        struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
};

/*
 * Each pod type describes how CPUs should be grouped for unbound workqueues.
 * See the comment above workqueue_attrs->affn_scope.
 */
struct wq_pod_type {
        int                        nr_pods;        /* number of pods */
        cpumask_var_t                *pod_cpus;        /* pod -> cpus */
        int                        *pod_node;        /* pod -> node */
        int                        *cpu_pod;        /* cpu -> pod */
};

/*
 * Unpacked view of work->data for a work item that carries no pwq pointer
 * (WORK_STRUCT_PWQ clear).  Filled in by work_offqd_unpack().
 */
struct work_offq_data {
        u32                        pool_id;        /* WORK_OFFQ_POOL_BITS wide field */
        u32                        disable;        /* WORK_OFFQ_DISABLE_BITS wide field */
        u32                        flags;                /* data & WORK_OFFQ_FLAG_MASK */
};

/* Human-readable name of each affinity scope, indexed by WQ_AFFN_* value. */
static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
        [WQ_AFFN_DFL]                = "default",
        [WQ_AFFN_CPU]                = "cpu",
        [WQ_AFFN_SMT]                = "smt",
        [WQ_AFFN_CACHE]                = "cache",
        [WQ_AFFN_NUMA]                = "numa",
        [WQ_AFFN_SYSTEM]        = "system",
};

/*
 * Per-cpu work items which run for longer than the following threshold are
 * automatically considered CPU intensive and excluded from concurrency
 * management to prevent them from noticeably delaying other per-cpu work items.
 * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter.
 * The actual value is initialized in wq_cpu_intensive_thresh_init().
 */
static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
static unsigned int wq_cpu_intensive_warning_thresh = 4;
module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh, uint, 0644);
#endif

/* see the comment above the definition of WQ_POWER_EFFICIENT */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);

static bool wq_online;                        /* can kworkers be created yet? */
static bool wq_topo_initialized __read_mostly = false;

static struct kmem_cache *pwq_cache;

static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;

/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *wq_update_pod_attrs_buf;

static DEFINE_MUTEX(wq_pool_mutex);        /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
static DEFINE_RAW_SPINLOCK(wq_mayday_lock);        /* protects wq->maydays list */
/* wait for manager to go away */
static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);

static LIST_HEAD(workqueues);                /* PR: list of all workqueues */
static bool workqueue_freezing;                /* PL: have wqs started freezing? */

/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;

/* PL: user requested unbound cpumask via sysfs */
static cpumask_var_t wq_requested_unbound_cpumask;

/* PL: isolated cpumask to be excluded from unbound cpumask */
static cpumask_var_t wq_isolated_cpumask;

/* to further constrain wq_unbound_cpumask by cmdline parameter */
static struct cpumask wq_cmdline_cpumask __initdata;

/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);

/*
 * Local execution of unbound work items is no longer guaranteed.  The
 * following always forces round-robin CPU selection on unbound work items
 * to uncover usages which depend on it.
 */
#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
static bool wq_debug_force_rr_cpu = true;
#else
static bool wq_debug_force_rr_cpu = false;
#endif
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);

/* to raise softirq for the BH worker pools on other CPUs */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS],
                                     bh_pool_irq_works);

/* the BH worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
                                     bh_worker_pools);

/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
                                     cpu_worker_pools);

static DEFINE_IDR(worker_pool_idr);        /* PR: idr of all pools */

/* PL: hash of all unbound pools keyed by pool->attrs */
static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);

/* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];

/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];

/*
 * I: kthread_worker to release pwq's. pwq release needs to be bounced to a
 * process context while holding a pool lock. Bounce to a dedicated kthread
 * worker to avoid A-A deadlocks.
 */
static struct kthread_worker *pwq_release_worker __ro_after_init;

struct workqueue_struct *system_wq __ro_after_init;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_freezable_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
struct workqueue_struct *system_bh_wq;
EXPORT_SYMBOL_GPL(system_bh_wq);
struct workqueue_struct *system_bh_highpri_wq;
EXPORT_SYMBOL_GPL(system_bh_highpri_wq);

static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
static void show_pwq(struct pool_workqueue *pwq);
static void show_one_worker_pool(struct worker_pool *pool);

#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>

/* Lockdep check: warn unless an RCU read lock or wq_pool_mutex is held. */
#define assert_rcu_or_pool_mutex()                                        \
        RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() &&                        \
                         !lockdep_is_held(&wq_pool_mutex),                \
                         "RCU or wq_pool_mutex should be held")

/*
 * Lockdep check: warn unless an RCU read lock, @wq->mutex or wq_pool_mutex
 * is held.  @wq is parenthesized so that any pointer expression can be
 * passed as the argument.
 */
#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                        \
        RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() &&                        \
                         !lockdep_is_held(&(wq)->mutex) &&                \
                         !lockdep_is_held(&wq_pool_mutex),                \
                         "RCU, wq->mutex or wq_pool_mutex should be held")

/*
 * for_each_bh_worker_pool - walk the NR_STD_WORKER_POOLS entries of the
 * per-cpu bh_worker_pools array belonging to @cpu
 * @pool: iteration cursor
 * @cpu: CPU whose BH worker pools are visited
 */
#define for_each_bh_worker_pool(pool, cpu)                                \
        for ((pool) = &per_cpu(bh_worker_pools, cpu)[0];                \
             (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
             (pool)++)

/*
 * for_each_cpu_worker_pool - walk the NR_STD_WORKER_POOLS entries of the
 * per-cpu cpu_worker_pools array belonging to @cpu
 * @pool: iteration cursor
 * @cpu: CPU whose worker pools are visited
 */
#define for_each_cpu_worker_pool(pool, cpu)                                \
        for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];                \
             (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
             (pool)++)

/**
 * for_each_pool - iterate through all worker_pools in the system
 * @pool: iteration cursor
 * @pi: integer used for iteration
 *
 * This must be called either with wq_pool_mutex held or RCU read
 * locked.  If the pool needs to be used beyond the locking in effect, the
 * caller is responsible for guaranteeing that the pool stays online.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pool(pool, pi)                                                \
        idr_for_each_entry(&worker_pool_idr, pool, pi)                        \
                if (({ assert_rcu_or_pool_mutex(); false; })) { }        \
                else

/**
 * for_each_pool_worker - iterate through all workers of a worker_pool
 * @worker: iteration cursor
 * @pool: worker_pool to iterate workers of
 *
 * This must be called with wq_pool_attach_mutex held.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.
 */
#define for_each_pool_worker(worker, pool)                                \
        list_for_each_entry((worker), &(pool)->workers, node)                \
                if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
                else

/**
 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
 * @pwq: iteration cursor
 * @wq: the target workqueue
 *
 * This must be called either with wq->mutex held or RCU read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * The locking requirement is expressed through the lockdep condition handed
 * to list_for_each_entry_rcu(); @wq is parenthesized on every use so that
 * any pointer expression may be passed.
 */
#define for_each_pwq(pwq, wq)                                                \
        list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,                \
                                 lockdep_is_held(&(wq)->mutex))

#ifdef CONFIG_DEBUG_OBJECTS_WORK

static const struct debug_obj_descr work_debug_descr;

/* debugobjects hint callback: report the work item's function pointer. */
static void *work_debug_hint(void *addr)
{
        struct work_struct *work = addr;

        return work->func;
}

/* debugobjects callback: true when WORK_STRUCT_STATIC_BIT is set in @addr. */
static bool work_is_static_object(void *addr)
{
        struct work_struct *item = addr;
        unsigned long *bits = work_data_bits(item);

        return test_bit(WORK_STRUCT_STATIC_BIT, bits);
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 *
 * For an active object the work is sync-canceled and re-initialized, and
 * true is returned.  Any other state is left alone and false is returned.
 */
static bool work_fixup_init(void *addr, enum debug_obj_state state)
{
        struct work_struct *item = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        cancel_work_sync(item);
        debug_object_init(item, &work_debug_descr);
        return true;
}

/*
 * fixup_free is called when:
 * - an active object is freed
 *
 * For an active object the work is sync-canceled and its debug object is
 * freed, and true is returned.  Any other state yields false.
 */
static bool work_fixup_free(void *addr, enum debug_obj_state state)
{
        struct work_struct *item = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        cancel_work_sync(item);
        debug_object_free(item, &work_debug_descr);
        return true;
}

/* debugobjects descriptor for work_struct, wiring up the callbacks above. */
static const struct debug_obj_descr work_debug_descr = {
        .name                = "work_struct",
        .debug_hint        = work_debug_hint,
        .is_static_object = work_is_static_object,
        .fixup_init        = work_fixup_init,
        .fixup_free        = work_fixup_free,
};

/* Tell debugobjects that @work is becoming active. */
static inline void debug_work_activate(struct work_struct *work)
{
        debug_object_activate(work, &work_debug_descr);
}

/* Tell debugobjects that @work is no longer active. */
static inline void debug_work_deactivate(struct work_struct *work)
{
        debug_object_deactivate(work, &work_debug_descr);
}

/*
 * Register @work with debugobjects.  A non-zero @onstack selects the
 * on-stack initializer, otherwise the regular one is used.
 */
void __init_work(struct work_struct *work, int onstack)
{
        if (!onstack) {
                debug_object_init(work, &work_debug_descr);
                return;
        }

        debug_object_init_on_stack(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(__init_work);

/* Release the debugobjects tracking of @work. */
void destroy_work_on_stack(struct work_struct *work)
{
        debug_object_free(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);

/*
 * Delayed-work counterpart of destroy_work_on_stack(): destroys the embedded
 * timer first, then releases the debugobjects tracking of the embedded work.
 */
void destroy_delayed_work_on_stack(struct delayed_work *work)
{
        destroy_timer_on_stack(&work->timer);
        debug_object_free(&work->work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);

#else
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif

/**
 * worker_pool_assign_id - allocate ID and assign it to @pool
 * @pool: the pool pointer of interest
 *
 * Must be called with wq_pool_mutex held.  The ID is taken from
 * worker_pool_idr in the range [0, WORK_OFFQ_POOL_NONE).
 *
 * Returns 0 after storing the new ID in @pool->id, -errno on failure
 * (in which case @pool->id is left untouched).
 */
static int worker_pool_assign_id(struct worker_pool *pool)
{
        int id;

        lockdep_assert_held(&wq_pool_mutex);

        id = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
                       GFP_KERNEL);
        if (id < 0)
                return id;

        pool->id = id;
        return 0;
}

/*
 * Return the address of the pwq pointer slot for @cpu: the per-cpu slot in
 * @wq->cpu_pwq when @cpu >= 0, or &@wq->dfl_pwq when @cpu is negative.
 */
static struct pool_workqueue __rcu **
unbound_pwq_slot(struct workqueue_struct *wq, int cpu)
{
        if (cpu < 0)
                return &wq->dfl_pwq;

        return per_cpu_ptr(wq->cpu_pwq, cpu);
}

/* @cpu < 0 for dfl_pwq */
static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu)
{
        return rcu_dereference_check(*unbound_pwq_slot(wq, cpu),
                                     lockdep_is_held(&wq_pool_mutex) ||
                                     lockdep_is_held(&wq->mutex));
}

/**
 * unbound_effective_cpumask - effective cpumask of an unbound workqueue
 * @wq: workqueue of interest
 *
 * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which
 * is masked with wq_unbound_cpumask to determine the effective cpumask. The
 * default pwq is always mapped to the pool with the current effective cpumask.
 */
static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq)
{
        return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask;
}

/* Place @color at WORK_STRUCT_COLOR_SHIFT; inverse of get_work_color(). */
static unsigned int work_color_to_flags(int color)
{
        return color << WORK_STRUCT_COLOR_SHIFT;
}

/* Extract the WORK_STRUCT_COLOR_BITS wide color field from @work_data. */
static int get_work_color(unsigned long work_data)
{
        unsigned long shifted = work_data >> WORK_STRUCT_COLOR_SHIFT;

        return shifted & ((1 << WORK_STRUCT_COLOR_BITS) - 1);
}

/* Return the color following @color, wrapping modulo WORK_NR_COLORS. */
static int work_next_color(int color)
{
        int next = color + 1;

        return next % WORK_NR_COLORS;
}

/* OFFQ flags implied by @pool: WORK_OFFQ_BH for a POOL_BH pool, else 0. */
static unsigned long pool_offq_flags(struct worker_pool *pool)
{
        if (pool->flags & POOL_BH)
                return WORK_OFFQ_BH;

        return 0;
}

/*
 * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
 * contain the pointer to the queued pwq.  Once execution starts, the flag
 * is cleared and the high bits contain OFFQ flags and pool ID.
 *
 * set_work_pwq(), set_work_pool_and_clear_pending() and mark_work_canceling()
 * can be used to set the pwq, pool or clear work->data. These functions should
 * only be called while the work is owned - ie. while the PENDING bit is set.
 *
 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
 * corresponding to a work.  Pool is available once the work has been
 * queued anywhere after initialization until it is sync canceled.  pwq is
 * available only while the work item is queued.
 */
/*
 * Store @data, ORed with whatever work_static() reports for @work, into
 * @work->data.  Warns once if @work is not pending.
 */
static inline void set_work_data(struct work_struct *work, unsigned long data)
{
        unsigned long val;

        WARN_ON_ONCE(!work_pending(work));

        val = data | work_static(work);
        atomic_long_set(&work->data, val);
}

/*
 * Point @work->data at @pwq, with WORK_STRUCT_PENDING, WORK_STRUCT_PWQ and
 * the caller supplied @flags ORed in.
 */
static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
                         unsigned long flags)
{
        unsigned long data = (unsigned long)pwq;

        data |= WORK_STRUCT_PENDING;
        data |= WORK_STRUCT_PWQ;
        data |= flags;

        set_work_data(work, data);
}

/*
 * Record @pool_id (shifted to WORK_OFFQ_POOL_SHIFT) and @flags in
 * @work->data while keeping WORK_STRUCT_PENDING set.
 */
static void set_work_pool_and_keep_pending(struct work_struct *work,
                                           int pool_id, unsigned long flags)
{
        unsigned long data = (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT;

        data |= WORK_STRUCT_PENDING;
        data |= flags;

        set_work_data(work, data);
}

/*
 * Record @pool_id (shifted to WORK_OFFQ_POOL_SHIFT) and @flags in
 * @work->data.  Unlike set_work_pool_and_keep_pending(), WORK_STRUCT_PENDING
 * is not ORed into the stored value.  The store is bracketed by smp_wmb()
 * before and smp_mb() after; the reasons are spelled out in the body.
 */
static void set_work_pool_and_clear_pending(struct work_struct *work,
                                            int pool_id, unsigned long flags)
{
        /*
         * The following wmb is paired with the implied mb in
         * test_and_set_bit(PENDING) and ensures all updates to @work made
         * here are visible to and precede any updates by the next PENDING
         * owner.
         */
        smp_wmb();
        set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) |
                      flags);
        /*
         * The following mb guarantees that previous clear of a PENDING bit
         * will not be reordered with any speculative LOADS or STORES from
         * work->current_func, which is executed afterwards.  This possible
         * reordering can lead to a missed execution on attempt to queue
         * the same @work.  E.g. consider this case:
         *
         *   CPU#0                         CPU#1
         *   ----------------------------  --------------------------------
         *
         * 1  STORE event_indicated
         * 2  queue_work_on() {
         * 3    test_and_set_bit(PENDING)
         * 4 }                             set_..._and_clear_pending() {
         * 5                                 set_work_data() # clear bit
         * 6                                 smp_mb()
         * 7                               work->current_func() {
         * 8                                      LOAD event_indicated
         *                                   }
         *
         * Without an explicit full barrier speculative LOAD on line 8 can
         * be executed before CPU#0 does STORE on line 1.  If that happens,
         * CPU#0 observes the PENDING bit is still set and new execution of
         * a @work is not queued in a hope, that CPU#1 will eventually
         * finish the queued @work.  Meanwhile CPU#1 does not see
         * event_indicated is set, because speculative LOAD was executed
         * before actual STORE.
         */
        smp_mb();
}

/* Mask @data with WORK_STRUCT_PWQ_MASK and return it as a pwq pointer. */
static inline struct pool_workqueue *work_struct_pwq(unsigned long data)
{
        unsigned long pwq_bits = data & WORK_STRUCT_PWQ_MASK;

        return (struct pool_workqueue *)pwq_bits;
}

/*
 * Return the pwq encoded in @work->data, or NULL when WORK_STRUCT_PWQ is
 * not set in it.
 */
static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
        unsigned long data = atomic_long_read(&work->data);

        if (!(data & WORK_STRUCT_PWQ))
                return NULL;

        return work_struct_pwq(data);
}

/**
 * get_work_pool - return the worker_pool a given work was associated with
 * @work: the work item of interest
 *
 * Pools are created and destroyed under wq_pool_mutex, and allows read
 * access under RCU read lock.  As such, this function should be
 * called under wq_pool_mutex or inside of a rcu_read_lock() region.
 *
 * All fields of the returned pool are accessible as long as the above
 * mentioned locking is in effect.  If the returned pool needs to be used
 * beyond the critical section, the caller is responsible for ensuring the
 * returned pool is and stays online.
 *
 * Return: The worker_pool @work was last associated with.  %NULL if none.
 */
static struct worker_pool *get_work_pool(struct work_struct *work)
{
        unsigned long data = atomic_long_read(&work->data);
        int pool_id;

        assert_rcu_or_pool_mutex();

        if (data & WORK_STRUCT_PWQ)
                return work_struct_pwq(data)->pool;

        pool_id = data >> WORK_OFFQ_POOL_SHIFT;
        if (pool_id == WORK_OFFQ_POOL_NONE)
                return NULL;

        return idr_find(&worker_pool_idr, pool_id);
}

/*
 * Extract the @bits wide field that starts at bit @shift of @v.
 *
 * The mask is built from an unsigned long constant.  With a plain int
 * constant, "1 << bits" overflows int as soon as @bits reaches 31, which is
 * undefined behaviour and is reported by UBSAN.  @bits must still be
 * smaller than the width of unsigned long.
 */
static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits)
{
        return (v >> shift) & ((1UL << bits) - 1);
}

/*
 * Decode the off-queue @data word into @offqd. The pool ID and disable
 * fields are extracted with shift_and_mask(); the flags are what remains
 * after masking with WORK_OFFQ_FLAG_MASK. Warns once if @data has
 * WORK_STRUCT_PWQ set.
 */
static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data)
{
        WARN_ON_ONCE(data & WORK_STRUCT_PWQ);

        offqd->flags = data & WORK_OFFQ_FLAG_MASK;
        offqd->disable = shift_and_mask(data, WORK_OFFQ_DISABLE_SHIFT,
                                        WORK_OFFQ_DISABLE_BITS);
        offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT,
                                        WORK_OFFQ_POOL_BITS);
}

/*
 * Combine @offqd->disable (shifted into place) and @offqd->flags into a
 * single word. @offqd->pool_id is not part of the result.
 */
static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd)
{
        unsigned long disable = offqd->disable;
        unsigned long flags = offqd->flags;

        return (disable << WORK_OFFQ_DISABLE_SHIFT) | flags;
}

/*
 * Policy functions.  These define the policies on how the global worker
 * pools are managed.  Unless noted otherwise, these functions assume that
 * they're being called with pool->lock held.
 */

/*
 * Need to wake up a worker?  Called from anything but currently
 * running workers.
 *
 * Note that, because unbound workers never contribute to nr_running, this
 * function will always return %true for unbound pools as long as the
 * worklist isn't empty.
 */
static bool need_more_worker(struct worker_pool *pool)
{
        return !list_empty(&pool->worklist) && !pool->nr_running;
}

/*
 * May a busy but !running worker start working?  Only if the pool has at
 * least one idle worker.
 */
static bool may_start_working(struct worker_pool *pool)
{
        return pool->nr_idle != 0;
}

/* Do I need to keep working?  Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
        return !list_empty(&pool->worklist) && (pool->nr_running <= 1);
}

/* Do we need a new worker?  Called from manager. */
static bool need_to_create_worker(struct worker_pool *pool)
{
        return need_more_worker(pool) && !may_start_working(pool);
}

/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
        bool managing = pool->flags & POOL_MANAGER_ACTIVE;
        int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
        int nr_busy = pool->nr_workers - nr_idle;

        return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}

/**
 * worker_set_flags - set worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to set
 *
 * Set @flags in @worker->flags and adjust nr_running accordingly.
 */
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
        struct worker_pool *pool = worker->pool;

        lockdep_assert_held(&pool->lock);

        /* If transitioning into NOT_RUNNING, adjust nr_running. */
        if ((flags & WORKER_NOT_RUNNING) &&
            !(worker->flags & WORKER_NOT_RUNNING)) {
                pool->nr_running--;
        }

        worker->flags |= flags;
}

/**
 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to clear
 *
 * Clear @flags in @worker->flags and adjust nr_running accordingly.
 */
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
        struct worker_pool *pool = worker->pool;
        unsigned int before = worker->flags;

        lockdep_assert_held(&pool->lock);

        worker->flags &= ~flags;

        /*
         * nr_running is only touched when a NOT_RUNNING flag was both
         * requested and previously set.
         */
        if (!(flags & WORKER_NOT_RUNNING) || !(before & WORKER_NOT_RUNNING))
                return;

        /*
         * NOT_RUNNING is a mask of several flags, so clearing one of them
         * doesn't necessarily make the worker running.  Count it only once
         * none of them remain.
         */
        if (!(worker->flags & WORKER_NOT_RUNNING))
                pool->nr_running++;
}

/*
 * Return the worker at the head of @pool->idle_list, or %NULL if the list
 * is empty.  Called with pool->lock held.
 */
static struct worker *first_idle_worker(struct worker_pool *pool)
{
        return list_first_entry_or_null(&pool->idle_list, struct worker, entry);
}

/**
 * worker_enter_idle - enter idle state
 * @worker: worker which is entering idle state
 *
 * @worker is entering idle state.  Update stats and idle timer if
 * necessary.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock).
 */
static void worker_enter_idle(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        /* already idle: warn once and leave everything untouched */
        if (WARN_ON_ONCE(worker->flags & WORKER_IDLE))
                return;

        /* on a list and linked through ->hentry at once: warn and bail */
        if (WARN_ON_ONCE(!list_empty(&worker->entry) &&
                         (worker->hentry.next || worker->hentry.pprev)))
                return;

        /* can't use worker_set_flags(), also called from create_worker() */
        worker->flags |= WORKER_IDLE;
        worker->last_active = jiffies;
        pool->nr_idle++;

        /* idle_list is LIFO */
        list_add(&worker->entry, &pool->idle_list);

        /* arm the idle timer if there's a surplus and it isn't pending yet */
        if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
                mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);

        /* Sanity check: with every worker idle, nr_running must be zero. */
        WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
}

/**
 * worker_leave_idle - leave idle state
 * @worker: worker which is leaving idle state
 *
 * @worker is leaving idle state.  Update stats.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock).
 */
static void worker_leave_idle(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        /* leaving idle without being idle: warn once and do nothing */
        if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
                return;

        worker_clr_flags(worker, WORKER_IDLE);
        list_del_init(&worker->entry);
        pool->nr_idle--;
}

/**
 * find_worker_executing_work - find worker which is executing a work
 * @pool: pool of interest
 * @work: work to find worker for
 *
 * Find a worker which is executing @work on @pool by searching
 * @pool->busy_hash which is keyed by the address of @work.  For a worker
 * to match, its current execution should match the address of @work and
 * its work function.  This is to avoid unwanted dependency between
 * unrelated work executions through a work item being recycled while still
 * being executed.
 *
 * This is a bit tricky.  A work item may be freed once its execution
 * starts and nothing prevents the freed area from being recycled for
 * another work item.  If the same work item address ends up being reused
 * before the original execution finishes, workqueue will identify the
 * recycled work item as currently executing and make it wait until the
 * current execution finishes, introducing an unwanted dependency.
 *
 * This function checks the work item address and work function to avoid
 * false positives.  Note that this isn't complete as one may construct a
 * work function which can introduce dependency onto itself through a
 * recycled work item.  Well, if somebody wants to shoot oneself in the
 * foot that badly, there's only so much we can do, and if such deadlock
 * actually occurs, it should be easy to locate the culprit work function.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 *
 * Return:
 * Pointer to worker which is executing @work if found, %NULL
 * otherwise.
 */
static struct worker *find_worker_executing_work(struct worker_pool *pool,
                                                 struct work_struct *work)
{
        struct worker *worker;

        hash_for_each_possible(pool->busy_hash, worker, hentry,
                               (unsigned long)work)
                if (worker->current_work == work &&
                    worker->current_func == work->func)
                        return worker;

        return NULL;
}

/**
 * move_linked_works - move linked works to a list
 * @work: start of series of works to be scheduled
 * @head: target list to append @work to
 * @nextp: out parameter for nested worklist walking
 *
 * Schedule linked works starting from @work to @head. Work series to be
 * scheduled starts at @work and includes any consecutive work with
 * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on
 * @nextp.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void move_linked_works(struct work_struct *work, struct list_head *head,
                              struct work_struct **nextp)
{
        /* iteration cursor: the entry following the one being moved */
        struct work_struct *n;

        /*
         * Linked worklist will always end before the end of the list,
         * use NULL for list head.
         */
        list_for_each_entry_safe_from(work, n, NULL, entry) {
                list_move_tail(&work->entry, head);
                /* the series ends with the first work that lacks LINKED */
                if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
                        break;
        }

        /*
         * If we're already inside safe list traversal and have moved
         * multiple works to the scheduled queue, the next position
         * needs to be updated.
         */
        if (nextp)
                *nextp = n;
}

/**
 * assign_work - assign a work item and its linked work items to a worker
 * @work: work to assign
 * @worker: worker to assign to
 * @nextp: out parameter for nested worklist walking
 *
 * Assign @work and its linked work items to @worker. If @work is already being
 * executed by another worker in the same pool, it'll be punted there.
 *
 * If @nextp is not NULL, it's updated to point to the next work of the last
 * scheduled work. This allows assign_work() to be nested inside
 * list_for_each_entry_safe().
 *
 * Returns %true if @work was successfully assigned to @worker. %false if @work
 * was punted to another worker already executing it.
 */
static bool assign_work(struct work_struct *work, struct worker *worker,
                        struct work_struct **nextp)
{
        struct worker_pool *pool = worker->pool;
        struct worker *busy;

        lockdep_assert_held(&pool->lock);

        /*
         * A work item must never run on several workers at once.
         * __queue_work() keeps @work from moving to another pool while it
         * is still running in the previous one; within this pool we check
         * here whether some worker is already executing @work.
         */
        busy = find_worker_executing_work(pool, work);
        if (likely(!busy)) {
                move_linked_works(work, &worker->scheduled, nextp);
                return true;
        }

        /* Collision: hand @work over to the worker already executing it. */
        move_linked_works(work, &busy->scheduled, nextp);
        return false;
}

/*
 * Return the irq_work belonging to BH @pool: the per-CPU bh_pool_irq_works
 * entry of @pool->cpu, slot 1 if the pool's nice level is
 * HIGHPRI_NICE_LEVEL and slot 0 otherwise.
 */
static struct irq_work *bh_pool_irq_work(struct worker_pool *pool)
{
        int slot = 0;

        if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
                slot = 1;

        return &per_cpu(bh_pool_irq_works, pool->cpu)[slot];
}

static void kick_bh_pool(struct worker_pool *pool)
{
#ifdef CONFIG_SMP
        /* see drain_dead_softirq_workfn() for BH_DRAINING */
        if (unlikely(pool->cpu != smp_processor_id() &&
                     !(pool->flags & POOL_BH_DRAINING))) {
                irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu);
                return;
        }
#endif
        if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
                raise_softirq_irqoff(HI_SOFTIRQ);
        else
                raise_softirq_irqoff(TASKLET_SOFTIRQ);
}

/**
 * kick_pool - wake up an idle worker if necessary
 * @pool: pool to kick
 *
 * @pool may have pending work items. Wake up worker if necessary. Returns
 * whether a worker was woken up.
 */
static bool kick_pool(struct worker_pool *pool)
{
        /* candidate to wake: head of the idle list, %NULL if none */
        struct worker *worker = first_idle_worker(pool);
        struct task_struct *p;

        lockdep_assert_held(&pool->lock);

        /* nothing to do if no extra worker is needed or none is idle */
        if (!need_more_worker(pool) || !worker)
                return false;

        /* BH pools are kicked through kick_bh_pool(), not by waking a task */
        if (pool->flags & POOL_BH) {
                kick_bh_pool(pool);
                return true;
        }

        p = worker->task;

#ifdef CONFIG_SMP
        /*
         * Idle @worker is about to execute @work and waking up provides an
         * opportunity to migrate @worker at a lower cost by setting the task's
         * wake_cpu field. Let's see if we want to move @worker to improve
         * execution locality.
         *
         * We're waking the worker that went idle the latest and there's some
         * chance that @worker is marked idle but hasn't gone off CPU yet. If
         * so, setting the wake_cpu won't do anything. As this is a best-effort
         * optimization and the race window is narrow, let's leave as-is for
         * now. If this becomes pronounced, we can skip over workers which are
         * still on cpu when picking an idle worker.
         *
         * If @pool has non-strict affinity, @worker might have ended up outside
         * its affinity scope. Repatriate.
         */
        if (!pool->attrs->affn_strict &&
            !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) {
                /* worklist is non-empty here, need_more_worker() checked it */
                struct work_struct *work = list_first_entry(&pool->worklist,
                                                struct work_struct, entry);
                int wake_cpu = cpumask_any_and_distribute(pool->attrs->__pod_cpumask,
                                                          cpu_online_mask);
                /* only redirect if an online CPU in the pod was found */
                if (wake_cpu < nr_cpu_ids) {
                        p->wake_cpu = wake_cpu;
                        /* accounted to the pwq of the first pending work */
                        get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
                }
        }
#endif
        wake_up_process(p);
        return true;
}

#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT

/*
 * Concurrency-managed per-cpu work items that hog CPU for longer than
 * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
 * which prevents them from stalling other concurrency-managed work items. If a
 * work function keeps triggering this mechanism, it's likely that the work item
 * should be using an unbound workqueue instead.
 *
 * wq_cpu_intensive_report() tracks work functions which trigger such conditions
 * and report them so that they can be examined and converted to use unbound
 * workqueues as appropriate. To avoid flooding the console, each violating work
 * function is tracked and reported with exponential backoff.
 */
/* capacity of wci_ents[]; the hash table below is sized from it as well */
#define WCI_MAX_ENTS 128

/* one tracked work function */
struct wci_ent {
        work_func_t                func;                /* the work function being tracked */
        atomic64_t                cnt;                /* bumped by each wq_cpu_intensive_report() */
        struct hlist_node        hash_node;        /* linkage in wci_hash */
};

/* backing storage for the entries; slots are handed out in order */
static struct wci_ent wci_ents[WCI_MAX_ENTS];
/* number of wci_ents[] slots handed out so far */
static int wci_nr_ents;
/* held while a new entry is allocated and added to wci_hash */
static DEFINE_RAW_SPINLOCK(wci_lock);
/* entries keyed by the work function's address */
static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));

/*
 * Look up the wci_ent tracking @func in wci_hash.  Returns the entry, or
 * %NULL if @func isn't tracked.
 */
static struct wci_ent *wci_find_ent(work_func_t func)
{
        struct wci_ent *pos;
        struct wci_ent *found = NULL;

        hash_for_each_possible_rcu(wci_hash, pos, hash_node,
                                   (unsigned long)func) {
                if (pos->func != func)
                        continue;

                found = pos;
                break;
        }

        return found;
}

/*
 * Account one CPU_INTENSIVE violation for @func and print a rate-limited
 * warning.  If @func isn't tracked yet, an entry is allocated for it first
 * and the function restarts to do the accounting.
 */
static void wq_cpu_intensive_report(work_func_t func)
{
        struct wci_ent *ent;

restart:
        ent = wci_find_ent(func);
        if (ent) {
                u64 cnt;

                /*
                 * Start reporting from the warning_thresh and back off
                 * exponentially: a message is printed when
                 * cnt + 1 - thresh is a power of two, i.e. at cnt == thresh,
                 * thresh + 1, thresh + 3, thresh + 7 and so on. A zero
                 * threshold disables the message.
                 */
                cnt = atomic64_inc_return_relaxed(&ent->cnt);
                if (wq_cpu_intensive_warning_thresh &&
                    cnt >= wq_cpu_intensive_warning_thresh &&
                    is_power_of_2(cnt + 1 - wq_cpu_intensive_warning_thresh))
                        printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
                                        ent->func, wq_cpu_intensive_thresh_us,
                                        atomic64_read(&ent->cnt));
                return;
        }

        /*
         * @func is a new violation. Allocate a new entry for it. If wci_ents[]
         * is exhausted, something went really wrong and we probably made enough
         * noise already.
         */
        /* unlocked early-out; rechecked under wci_lock below */
        if (wci_nr_ents >= WCI_MAX_ENTS)
                return;

        raw_spin_lock(&wci_lock);

        if (wci_nr_ents >= WCI_MAX_ENTS) {
                raw_spin_unlock(&wci_lock);
                return;
        }

        /* an entry for @func appeared since the lookup above: use it */
        if (wci_find_ent(func)) {
                raw_spin_unlock(&wci_lock);
                goto restart;
        }

        /* the count starts at 0; the restart pass accounts this violation */
        ent = &wci_ents[wci_nr_ents++];
        ent->func = func;
        atomic64_set(&ent->cnt, 0);
        hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);

        raw_spin_unlock(&wci_lock);

        goto restart;
}

#else        /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
/* no-op variant used when CONFIG_WQ_CPU_INTENSIVE_REPORT is not set */
static void wq_cpu_intensive_report(work_func_t func) {}
#endif        /* CONFIG_WQ_CPU_INTENSIVE_REPORT */

/**
 * wq_worker_running - a worker is running again
 * @task: task waking up
 *
 * This function is called when a worker returns from schedule()
 */
void wq_worker_running(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);

        /* only undo what wq_worker_sleeping() did; it sets ->sleeping */
        if (!READ_ONCE(worker->sleeping))
                return;

        /*
         * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
         * and the nr_running increment below, we may ruin the nr_running reset
         * and leave with an unexpected pool->nr_running == 1 on the newly unbound
         * pool. Protect against such race.
         */
        preempt_disable();
        if (!(worker->flags & WORKER_NOT_RUNNING))
                worker->pool->nr_running++;
        preempt_enable();

        /*
         * CPU intensive auto-detection cares about how long a work item hogged
         * CPU without sleeping. Reset the starting timestamp on wakeup.
         */
        worker->current_at = worker->task->se.sum_exec_runtime;

        /* cleared last, after nr_running has been restored */
        WRITE_ONCE(worker->sleeping, 0);
}

/**
 * wq_worker_sleeping - a worker is going to sleep
 * @task: task going to sleep
 *
 * This function is called from schedule() when a busy worker is
 * going to sleep.
 */
void wq_worker_sleeping(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);
        struct worker_pool *pool;

        /*
         * Rescuers, which may not have all the fields set up like normal
         * workers, also reach here, let's not access anything before
         * checking NOT_RUNNING.
         */
        if (worker->flags & WORKER_NOT_RUNNING)
                return;

        pool = worker->pool;

        /* Return if preempted before wq_worker_running() was reached */
        if (READ_ONCE(worker->sleeping))
                return;

        /* set before taking the lock; wq_worker_running() clears it again */
        WRITE_ONCE(worker->sleeping, 1);
        raw_spin_lock_irq(&pool->lock);

        /*
         * Recheck in case unbind_workers() preempted us. We don't
         * want to decrement nr_running after the worker is unbound
         * and nr_running has been reset.
         */
        if (worker->flags & WORKER_NOT_RUNNING) {
                raw_spin_unlock_irq(&pool->lock);
                return;
        }

        /* stop counting as running, then wake an idle worker if needed */
        pool->nr_running--;
        if (kick_pool(pool))
                worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;

        raw_spin_unlock_irq(&pool->lock);
}

/**
 * wq_worker_tick - a scheduler tick occurred while a kworker is running
 * @task: task currently running
 *
 * Called from sched_tick(). We're in the IRQ context and the current
 * worker's fields which follow the 'K' locking rule can be accessed safely.
 */
void wq_worker_tick(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);
        struct pool_workqueue *pwq = worker->current_pwq;
        struct worker_pool *pool = worker->pool;

        /* no current pwq, nothing to account this tick to */
        if (!pwq)
                return;

        /* charge one tick worth of CPU time to the pwq */
        pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;

        /* a zero threshold turns CPU_INTENSIVE auto-detection off */
        if (!wq_cpu_intensive_thresh_us)
                return;

        /*
         * If the current worker is concurrency managed and hogged the CPU for
         * longer than wq_cpu_intensive_thresh_us, it's automatically marked
         * CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
         *
         * Set @worker->sleeping means that @worker is in the process of
         * switching out voluntarily and won't be contributing to
         * @pool->nr_running until it wakes up. As wq_worker_sleeping() also
         * decrements ->nr_running, setting CPU_INTENSIVE here can lead to
         * double decrements. The task is releasing the CPU anyway. Let's skip.
         * We probably want to make this prettier in the future.
         */
        if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) ||
            worker->task->se.sum_exec_runtime - worker->current_at <
            wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
                return;

        /* plain raw_spin_lock(): we're in IRQ context, see the header comment */
        raw_spin_lock(&pool->lock);

        /* setting CPU_INTENSIVE drops the worker out of nr_running */
        worker_set_flags(worker, WORKER_CPU_INTENSIVE);
        wq_cpu_intensive_report(worker->current_func);
        pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;

        /* with nr_running lowered, another worker may be needed */
        if (kick_pool(pool))
                pwq->stats[PWQ_STAT_CM_WAKEUP]++;

        raw_spin_unlock(&pool->lock);
}

/**
 * wq_worker_last_func - retrieve worker's last work function
 * @task: Task to retrieve last work function of.
 *
 * Determine the last function a worker executed. This is called from
 * the scheduler to get a worker's last known identity.
 *
 * CONTEXT:
 * raw_spin_lock_irq(rq->lock)
 *
 * This function is called during schedule() when a kworker is going
 * to sleep. It's used by psi to identify aggregation workers during
 * dequeuing, to allow periodic aggregation to shut-off when that
 * worker is the last task in the system or cgroup to go to sleep.
 *
 * As this function doesn't involve any workqueue-related locking, it
 * only returns stable values when called from inside the scheduler's
 * queuing and dequeuing paths, when @task, which must be a kworker,
 * is guaranteed to not be processing any works.
 *
 * Return:
 * The last work function @task executed as a worker, %NULL if it
 * hasn't executed any work yet.
 */
work_func_t wq_worker_last_func(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);

        return worker->last_func;
}

/**
 * wq_node_nr_active - Determine wq_node_nr_active to use
 * @wq: workqueue of interest
 * @node: NUMA node, can be %NUMA_NO_NODE
 *
 * Determine wq_node_nr_active to use for @wq on @node. Returns:
 *
 * - %NULL for per-cpu workqueues as they don't need to use shared nr_active.
 *
 * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
 *
 * - Otherwise, node_nr_active[@node].
 */
static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
                                                   int node)
{
        int idx;

        /* only unbound workqueues use node_nr_active[] */
        if (!(wq->flags & WQ_UNBOUND))
                return NULL;

        /* %NUMA_NO_NODE maps to the extra slot at index nr_node_ids */
        idx = (node == NUMA_NO_NODE) ? nr_node_ids : node;

        return wq->node_nr_active[idx];
}

/**
 * wq_update_node_max_active - Update per-node max_actives to use
 * @wq: workqueue to update
 * @off_cpu: CPU that's going down, -1 if a CPU is not going down
 *
 * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is
 * distributed among nodes according to the proportions of numbers of online
 * cpus. The result is always between @wq->min_active and max_active.
 */
static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
{
        struct cpumask *effective = unbound_effective_cpumask(wq);
        int min_active = READ_ONCE(wq->min_active);
        int max_active = READ_ONCE(wq->max_active);
        int total_cpus, node;

        lockdep_assert_held(&wq->mutex);

        /* nothing to distribute until wq_topo_initialized is set */
        if (!wq_topo_initialized)
                return;

        /* an @off_cpu outside of @effective doesn't affect the counts */
        if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective))
                off_cpu = -1;

        /*
         * Count the online CPUs of @wq, minus the one going down.
         * NOTE(review): this presumes @off_cpu is still set in
         * cpu_online_mask when we're called - confirm against callers.
         */
        total_cpus = cpumask_weight_and(effective, cpu_online_mask);
        if (off_cpu >= 0)
                total_cpus--;

        /* If all CPUs of the wq get offline, use the default values */
        if (unlikely(!total_cpus)) {
                for_each_node(node)
                        wq_node_nr_active(wq, node)->max = min_active;

                wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active;
                return;
        }

        for_each_node(node) {
                int node_cpus;

                node_cpus = cpumask_weight_and(effective, cpumask_of_node(node));
                if (off_cpu >= 0 && cpu_to_node(off_cpu) == node)
                        node_cpus--;

                /* proportional share, rounded up, kept within [min, max] */
                wq_node_nr_active(wq, node)->max =
                        clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus),
                              min_active, max_active);
        }

        /* the %NUMA_NO_NODE slot always gets the full max_active */
        wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active;
}

/**
 * get_pwq - get an extra reference on the specified pool_workqueue
 * @pwq: pool_workqueue to get
 *
 * Obtain an extra reference on @pwq.  The caller should guarantee that
 * @pwq has positive refcnt and be holding the matching pool->lock.
 */
static void get_pwq(struct pool_workqueue *pwq)
{
        lockdep_assert_held(&pwq->pool->lock);

        /* a reference may only be added to a pwq that is still referenced */
        WARN_ON_ONCE(pwq->refcnt <= 0);

        pwq->refcnt += 1;
}

/**
 * put_pwq - put a pool_workqueue reference
 * @pwq: pool_workqueue to put
 *
 * Drop a reference of @pwq.  If its refcnt reaches zero, schedule its
 * destruction.  The caller should be holding the matching pool->lock.
 */
static void put_pwq(struct pool_workqueue *pwq)
{
        lockdep_assert_held(&pwq->pool->lock);

        pwq->refcnt--;

        /*
         * Last reference gone.  @pwq can't be released under pool->lock,
         * so hand the release off to the dedicated kthread_worker, which
         * also avoids A-A deadlocks.
         */
        if (unlikely(!pwq->refcnt))
                kthread_queue_work(pwq_release_worker, &pwq->release_work);
}

/**
 * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
 * @pwq: pool_workqueue to put (can be %NULL)
 *
 * put_pwq() with locking.  This function also allows %NULL @pwq.
 */
static void put_pwq_unlocked(struct pool_workqueue *pwq)
{
        /* %NULL is explicitly allowed and is a no-op */
        if (!pwq)
                return;

        /*
         * Both pwqs and pools are RCU protected, which makes the lock
         * operations below safe.
         */
        raw_spin_lock_irq(&pwq->pool->lock);
        put_pwq(pwq);
        raw_spin_unlock_irq(&pwq->pool->lock);
}

static bool pwq_is_empty(struct pool_workqueue *pwq)
{
        return !pwq->nr_active && list_empty(&pwq->inactive_works);
}

static void __pwq_activate_work(struct pool_workqueue *pwq,
                                struct work_struct *work)
{
        unsigned long *wdb = work_data_bits(work);

        WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));
        trace_workqueue_activate_work(work);
        if (list_empty(&pwq->pool->worklist))
                pwq->pool->watchdog_ts = jiffies;
        move_linked_works(work, &pwq->pool->worklist, NULL);
        __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
}

/**
 * pwq_activate_work - Activate a work item if inactive
 * @pwq: pool_workqueue @work belongs to
 * @work: work item to activate
 *
 * Returns %true if activated. %false if already active.
 */
static bool pwq_activate_work(struct pool_workqueue *pwq,
                              struct work_struct *work)
{
        struct worker_pool *pool = pwq->pool;
        struct wq_node_nr_active *nna;
        unsigned long *bits;

        lockdep_assert_held(&pool->lock);

        /* already active, nothing to do */
        bits = work_data_bits(work);
        if (!(*bits & WORK_STRUCT_INACTIVE))
                return false;

        /* account the activation per-node (if applicable) and per-pwq */
        nna = wq_node_nr_active(pwq->wq, pool->node);
        if (nna)
                atomic_inc(&nna->nr);
        pwq->nr_active++;

        __pwq_activate_work(pwq, work);
        return true;
}

/*
 * Try to bump @nna->nr by one without exceeding @nna->max, which is read
 * once up front.  Returns %true if the count was incremented, %false if
 * the limit had already been reached.
 */
static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
{
        const int limit = READ_ONCE(nna->max);
        int cur, seen;

        do {
                cur = atomic_read(&nna->nr);
                if (cur >= limit)
                        return false;

                /* retry if someone changed the count under us */
                seen = atomic_cmpxchg_relaxed(&nna->nr, cur, cur + 1);
        } while (seen != cur);

        return true;
}

/**
 * pwq_tryinc_nr_active - Try to increment nr_active for a pwq
 * @pwq: pool_workqueue of interest
 * @fill: max_active may have increased, try to increase concurrency level
 *
 * Try to increment nr_active for @pwq. Returns %true if an nr_active count is
 * successfully obtained. %false otherwise.
 */
static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill)
{
        struct workqueue_struct *wq = pwq->wq;
        struct worker_pool *pool = pwq->pool;
        /* %NULL unless @wq is unbound, see wq_node_nr_active() */
        struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
        bool obtained = false;

        lockdep_assert_held(&pool->lock);

        if (!nna) {
                /* BH or per-cpu workqueue, pwq->nr_active is sufficient */
                obtained = pwq->nr_active < READ_ONCE(wq->max_active);
                goto out;
        }

        /* a plugged pwq never obtains an nr_active count */
        if (unlikely(pwq->plugged))
                return false;

        /*
         * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is
         * already waiting on $nna, pwq_dec_nr_active() will maintain the
         * concurrency level. Don't jump the line.
         *
         * We need to ignore the pending test after max_active has increased as
         * pwq_dec_nr_active() can only maintain the concurrency level but not
         * increase it. This is indicated by @fill.
         */
        if (!list_empty(&pwq->pending_node) && likely(!fill))
                goto out;

        /* fast path: lockless attempt */
        obtained = tryinc_node_nr_active(nna);
        if (obtained)
                goto out;

        /*
         * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs
         * and try again. The smp_mb() is paired with the implied memory barrier
         * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either
         * we see the decremented $nna->nr or they see non-empty
         * $nna->pending_pwqs.
         */
        raw_spin_lock(&nna->lock);

        if (list_empty(&pwq->pending_node))
                list_add_tail(&pwq->pending_node, &nna->pending_pwqs);
        else if (likely(!fill))
                goto out_unlock;        /* already queued, keep our place */

        smp_mb();

        obtained = tryinc_node_nr_active(nna);

        /*
         * If @fill, @pwq might have already been pending. Being spuriously
         * pending in cold paths doesn't affect anything. Let's leave it be.
         */
        if (obtained && likely(!fill))
                list_del_init(&pwq->pending_node);

out_unlock:
        raw_spin_unlock(&nna->lock);
out:
        /* every successful acquisition is mirrored in the per-pwq count */
        if (obtained)
                pwq->nr_active++;
        return obtained;
}

/**
 * pwq_activate_first_inactive - Activate the first inactive work item on a pwq
 * @pwq: pool_workqueue of interest
 * @fill: max_active may have increased, try to increase concurrency level
 *
 * Activate the first inactive work item of @pwq if available and allowed by
 * max_active limit.
 *
 * Returns %true if an inactive work item has been activated. %false if no
 * inactive work item is found or max_active limit is reached.
 */
static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill)
{
        struct work_struct *first;

        first = list_first_entry_or_null(&pwq->inactive_works,
                                         struct work_struct, entry);

        /* nothing inactive, or no nr_active count could be obtained */
        if (!first || !pwq_tryinc_nr_active(pwq, fill))
                return false;

        __pwq_activate_work(pwq, first);
        return true;
}

/**
 * unplug_oldest_pwq - unplug the oldest pool_workqueue
 * @wq: workqueue_struct where its oldest pwq is to be unplugged
 *
 * This function should only be called for ordered workqueues where only the
 * oldest pwq is unplugged, the others are plugged to suspend execution to
 * ensure proper work item ordering::
 *
 *    dfl_pwq --------------+     [P] - plugged
 *                          |
 *                          v
 *    pwqs -> A -> B [P] -> C [P] (newest)
 *            |    |        |
 *            1    3        5
 *            |    |        |
 *            2    4        6
 *
 * Once the oldest pwq has been drained and removed, call this function so
 * that the next oldest one gets unplugged and starts executing its work
 * items. pwq's are linked into wq->pwqs with the oldest first, so the head of
 * that list is the one to unplug.
 *
 * CONTEXT:
 * Must be called with wq->mutex held. Takes the pwq's pool->lock with IRQs
 * disabled for the duration of the update.
 */
static void unplug_oldest_pwq(struct workqueue_struct *wq)
{
        struct pool_workqueue *oldest;

        lockdep_assert_held(&wq->mutex);

        /* Caller should make sure that pwqs isn't empty before calling */
        oldest = list_first_entry_or_null(&wq->pwqs, struct pool_workqueue,
                                          pwqs_node);

        raw_spin_lock_irq(&oldest->pool->lock);

        /* already unplugged, nothing to do */
        if (!oldest->plugged)
                goto out_unlock;

        oldest->plugged = false;

        /* get the pool going only if something actually became active */
        if (pwq_activate_first_inactive(oldest, true))
                kick_pool(oldest->pool);

out_unlock:
        raw_spin_unlock_irq(&oldest->pool->lock);
}

/**
 * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active
 * @nna: wq_node_nr_active to activate a pending pwq for
 * @caller_pool: worker_pool the caller is locking
 *
 * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked.
 * @caller_pool may be unlocked and relocked to lock other worker_pools.
 *
 * At most one work item is activated per call. On return, @caller_pool->lock
 * is held again and @nna->lock has been released.
 */
static void node_activate_pending_pwq(struct wq_node_nr_active *nna,
                                      struct worker_pool *caller_pool)
{
        /* the pool whose lock we currently hold; starts out as the caller's */
        struct worker_pool *locked_pool = caller_pool;
        struct pool_workqueue *pwq;
        struct work_struct *work;

        lockdep_assert_held(&caller_pool->lock);

        raw_spin_lock(&nna->lock);
retry:
        /* always (re)start from the head of the pending list */
        pwq = list_first_entry_or_null(&nna->pending_pwqs,
                                       struct pool_workqueue, pending_node);
        if (!pwq)
                goto out_unlock;

        /*
         * If @pwq is for a different pool than @locked_pool, we need to lock
         * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock
         * / lock dance. For that, we also need to release @nna->lock as it's
         * nested inside pool locks.
         */
        if (pwq->pool != locked_pool) {
                raw_spin_unlock(&locked_pool->lock);
                locked_pool = pwq->pool;
                if (!raw_spin_trylock(&locked_pool->lock)) {
                        /*
                         * @nna->lock was dropped while waiting for the pool
                         * lock, so re-evaluate the head of pending_pwqs from
                         * scratch.
                         */
                        raw_spin_unlock(&nna->lock);
                        raw_spin_lock(&locked_pool->lock);
                        raw_spin_lock(&nna->lock);
                        goto retry;
                }
        }

        /*
         * @pwq may not have any inactive work items due to e.g. cancellations.
         * Drop it from pending_pwqs and see if there's another one.
         */
        work = list_first_entry_or_null(&pwq->inactive_works,
                                        struct work_struct, entry);
        if (!work) {
                list_del_init(&pwq->pending_node);
                goto retry;
        }

        /*
         * Acquire an nr_active count and activate the inactive work item. If
         * @pwq still has inactive work items, rotate it to the end of the
         * pending_pwqs so that we round-robin through them. This means that
         * inactive work items are not activated in queueing order which is fine
         * given that there has never been any ordering across different pwqs.
         */
        if (likely(tryinc_node_nr_active(nna))) {
                pwq->nr_active++;
                __pwq_activate_work(pwq, work);

                if (list_empty(&pwq->inactive_works))
                        list_del_init(&pwq->pending_node);
                else
                        list_move_tail(&pwq->pending_node, &nna->pending_pwqs);

                /* if activating a foreign pool, make sure it's running */
                if (pwq->pool != caller_pool)
                        kick_pool(pwq->pool);
        }

out_unlock:
        raw_spin_unlock(&nna->lock);
        /* restore the caller's locking state if we switched pools above */
        if (locked_pool != caller_pool) {
                raw_spin_unlock(&locked_pool->lock);
                raw_spin_lock(&caller_pool->lock);
        }
}

/**
 * pwq_dec_nr_active - Retire an active count
 * @pwq: pool_workqueue of interest
 *
 * Drop one count from @pwq's nr_active and then try to put another inactive
 * work item into service. For unbound workqueues, this function may
 * temporarily drop @pwq->pool->lock.
 *
 * CONTEXT:
 * Called with @pwq->pool->lock held.
 */
static void pwq_dec_nr_active(struct pool_workqueue *pwq)
{
        struct worker_pool *pool = pwq->pool;
        struct wq_node_nr_active *nna;

        nna = wq_node_nr_active(pwq->wq, pool->node);

        lockdep_assert_held(&pool->lock);

        /* the per-pwq count is maintained for percpu and unbound alike */
        pwq->nr_active--;

        /*
         * No node-level accounting means a percpu workqueue. The only
         * candidate for activation is the first inactive item on @pwq itself.
         */
        if (!nna) {
                pwq_activate_first_inactive(pwq, false);
                return;
        }

        /*
         * Unbound case. The nr_active count may be shared by multiple pwqs and
         * pools, and a pwq which has to wait for a count parks itself on
         * @nna->pending_pwqs. The memory barrier implied by
         * atomic_dec_return() below pairs with smp_mb() in
         * pwq_tryinc_nr_active() so that either we observe the non-empty
         * pending_pwqs or the other side observes the decremented @nna->nr.
         *
         * @nna->max may move around as CPUs go on/offline and as @pwq->wq's
         * max_active is updated, but it never drops below @pwq->wq->min_active
         * which is above zero unless freezing. That is what keeps forward
         * progress guaranteed.
         */
        if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max))
                return;

        /* a count is free; hand it to a waiter if there is one */
        if (list_empty(&nna->pending_pwqs))
                return;

        node_activate_pending_pwq(nna, pool);
}

/**
 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
 * @pwq: pwq of interest
 * @work_data: work_data of work which left the queue
 *
 * Account for a work item which either finished executing or was pulled off
 * the pending queue: retire its active count if it had one, decrement the
 * in-flight counter of its color, advance workqueue flushing if this was the
 * last item of the color being flushed, and drop a @pwq reference.
 *
 * NOTE:
 * For unbound workqueues, this function may temporarily drop @pwq->pool->lock
 * and thus should be called after all other state updates for the in-flight
 * work item is complete.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data)
{
        int color = get_work_color(work_data);

        /* only work items which had been activated hold an nr_active count */
        if (!(work_data & WORK_STRUCT_INACTIVE))
                pwq_dec_nr_active(pwq);

        pwq->nr_in_flight[color]--;

        /*
         * Flush bookkeeping is needed only when a flush is in progress for
         * this very color and no work item of that color remains in flight.
         */
        if (unlikely(pwq->flush_color == color) && !pwq->nr_in_flight[color]) {
                /* this pwq is done, clear flush_color */
                pwq->flush_color = -1;

                /*
                 * If this was the last pwq, wake up the first flusher.  It
                 * will handle the rest.
                 */
                if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
                        complete(&pwq->wq->first_flusher->done);
        }

        put_pwq(pwq);
}

/**
 * try_to_grab_pending - steal work item from worklist and disable irq
 * @work: work item to steal
 * @cflags: %WORK_CANCEL_ flags
 * @irq_flags: place to store irq state
 *
 * Try to grab PENDING bit of @work.  This function can handle @work in any
 * stable state - idle, on timer or on worklist.
 *
 * Return:
 *
 *  ========        ================================================================
 *  1                if @work was pending and we successfully stole PENDING
 *  0                if @work was idle and we claimed PENDING
 *  -EAGAIN        if PENDING couldn't be grabbed at the moment, safe to busy-retry
 *  ========        ================================================================
 *
 * Note:
 * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting
 * interrupted while holding PENDING and @work off queue, this function
 * disables irq on entry (saving the previous state in *@irq_flags).  This,
 * combined with delayed_work->timer being irqsafe, ensures that we return
 * -EAGAIN for finite short period of time.
 *
 * On successful return, >= 0, irq is disabled and the caller is
 * responsible for releasing it using local_irq_restore(*@irq_flags).
 * On -EAGAIN, the irq state has already been restored before returning.
 *
 * This function is safe to call from any context including IRQ handler.
 */
static int try_to_grab_pending(struct work_struct *work, u32 cflags,
                               unsigned long *irq_flags)
{
        struct worker_pool *pool;
        struct pool_workqueue *pwq;

        local_irq_save(*irq_flags);

        /* try to steal the timer if it exists */
        if (cflags & WORK_CANCEL_DELAYED) {
                struct delayed_work *dwork = to_delayed_work(work);

                /*
                 * dwork->timer is irqsafe.  If del_timer() fails, it's
                 * guaranteed that the timer is not queued anywhere and not
                 * running on the local CPU.
                 */
                if (likely(del_timer(&dwork->timer)))
                        return 1;
        }

        /* try to claim PENDING the normal way */
        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                return 0;

        rcu_read_lock();
        /*
         * The queueing is in progress, or it is already queued. Try to
         * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
         */
        pool = get_work_pool(work);
        if (!pool)
                goto fail;

        raw_spin_lock(&pool->lock);
        /*
         * work->data is guaranteed to point to pwq only while the work
         * item is queued on pwq->wq, and both updating work->data to point
         * to pwq on queueing and to pool on dequeueing are done under
         * pwq->pool->lock.  This in turn guarantees that, if work->data
         * points to pwq which is associated with a locked pool, the work
         * item is currently queued on that pool.
         */
        pwq = get_work_pwq(work);
        if (pwq && pwq->pool == pool) {
                unsigned long work_data;

                debug_work_deactivate(work);

                /*
                 * A cancelable inactive work item must be in the
                 * pwq->inactive_works since a queued barrier can't be
                 * canceled (see the comments in insert_wq_barrier()).
                 *
                 * An inactive work item cannot be grabbed directly because
                 * it might have linked barrier work items which, if left
                 * on the inactive_works list, will confuse pwq->nr_active
                 * management later on and cause stall.  Make sure the work
                 * item is activated before grabbing.
                 */
                pwq_activate_work(pwq, work);

                list_del_init(&work->entry);

                /*
                 * work->data points to pwq iff queued. Let's point to pool. As
                 * this destroys work->data needed by the next step, stash it.
                 * The stash is taken after the activation above so that it
                 * reflects the work item's post-activation flags.
                 */
                work_data = *work_data_bits(work);
                set_work_pool_and_keep_pending(work, pool->id,
                                               pool_offq_flags(pool));

                /* must be the last step, see the function comment */
                pwq_dec_nr_in_flight(pwq, work_data);

                raw_spin_unlock(&pool->lock);
                rcu_read_unlock();
                /* stolen from the worklist; irq stays disabled for the caller */
                return 1;
        }
        raw_spin_unlock(&pool->lock);
fail:
        /* couldn't grab: undo the irq disabling so the caller can busy-retry */
        rcu_read_unlock();
        local_irq_restore(*irq_flags);
        return -EAGAIN;
}

/**
 * work_grab_pending - steal work item from worklist and disable irq
 * @work: work item to steal
 * @cflags: %WORK_CANCEL_ flags
 * @irq_flags: place to store IRQ state
 *
 * Grab PENDING bit of @work. @work can be in any stable state - idle, on timer
 * or on worklist. This keeps calling try_to_grab_pending() until it stops
 * failing, relaxing the CPU between attempts.
 *
 * Can be called from any context. IRQ is disabled on return with IRQ state
 * stored in *@irq_flags. The caller is responsible for re-enabling it using
 * local_irq_restore().
 *
 * Returns %true if @work was pending. %false if idle.
 */
static bool work_grab_pending(struct work_struct *work, u32 cflags,
                              unsigned long *irq_flags)
{
        for (;;) {
                int grabbed = try_to_grab_pending(work, cflags, irq_flags);

                /* negative means "not right now", spin and try again */
                if (grabbed < 0) {
                        cpu_relax();
                        continue;
                }

                /* 1 (was pending) or 0 (was idle) maps directly to bool */
                return grabbed;
        }
}

/**
 * insert_work - insert a work into a pool
 * @pwq: pwq @work belongs to
 * @work: work to insert
 * @head: insertion point
 * @extra_flags: extra WORK_STRUCT_* flags to set
 *
 * Point @work at @pwq with @extra_flags or'd into its flags, link it at the
 * tail of the list headed by @head and take a reference on @pwq for it.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
                        struct list_head *head, unsigned int extra_flags)
{
        debug_work_activate(work);

        /* record the work call stack in order to print it in KASAN reports */
        kasan_record_aux_stack_noalloc(work);

        /* the queued work item pins @pwq */
        get_pwq(pwq);

        /* we own @work, set data and link */
        set_work_pwq(work, pwq, extra_flags);
        list_add_tail(&work->entry, head);
}

/*
 * Test whether @work is being queued from another work executing on the
 * same workqueue.
 */
static bool is_chained_work(struct workqueue_struct *wq)
{
        struct worker *worker;

        worker = current_wq_worker();
        /*
         * Return %true iff I'm a worker executing a work item on @wq.  If
         * I'm @worker, it's safe to dereference it without locking.
         */
        return worker && worker->current_pwq->wq == wq;
}

/*
 * Pick the CPU to queue an unbound work item on. @cpu is used as-is when
 * wq_unbound_cpumask allows it and round-robin isn't being forced for
 * debugging. Otherwise, rotate through the online CPUs in wq_unbound_cpumask,
 * remembering the last pick in the per-cpu wq_rr_cpu_last, to avoid
 * perturbing sensitive tasks. If no online CPU is allowed, fall back to @cpu.
 */
static int wq_select_unbound_cpu(int cpu)
{
        int next;

        if (unlikely(wq_debug_force_rr_cpu)) {
                pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
        } else if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) {
                return cpu;
        }

        /* continue from where this CPU left off the last time */
        next = cpumask_next_and(__this_cpu_read(wq_rr_cpu_last),
                                wq_unbound_cpumask, cpu_online_mask);
        if (unlikely(next >= nr_cpu_ids)) {
                /* ran off the end of the mask, wrap around */
                next = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
                if (unlikely(next >= nr_cpu_ids))
                        return cpu;
        }

        __this_cpu_write(wq_rr_cpu_last, next);
        return next;
}

static void __queue_work(int cpu, struct workqueue_struct *wq,
                         struct work_struct *work)
{
        struct pool_workqueue *pwq;
        struct worker_pool *last_pool, *pool;
        unsigned int work_flags;
        unsigned int req_cpu = cpu;

        /*
         * While a work item is PENDING && off queue, a task trying to
         * steal the PENDING will busy-loop waiting for it to either get
         * queued or lose PENDING.  Grabbing PENDING and queueing should
         * happen with IRQ disabled.
         */
        lockdep_assert_irqs_disabled();

        /*
         * For a draining wq, only works from the same workqueue are
         * allowed. The __WQ_DESTROYING helps to spot the issue that
         * queues a new work item to a wq after destroy_workqueue(wq).
         */
        if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
                     WARN_ON_ONCE(!is_chained_work(wq))))
                return;
        rcu_read_lock();
retry:
        /* pwq which will be used unless @work is executing elsewhere */
        if (req_cpu == WORK_CPU_UNBOUND) {
                if (wq->flags & WQ_UNBOUND)
                        cpu = wq_select_unbound_cpu(raw_smp_processor_id());
                else
                        cpu = raw_smp_processor_id();
        }

        pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu));
        pool = pwq->pool;

        /*
         * If @work was previously on a different pool, it might still be
         * running there, in which case the work needs to be queued on that
         * pool to guarantee non-reentrancy.
         */
        last_pool = get_work_pool(work);
        if (last_pool && last_pool != pool) {
                struct worker *worker;

                raw_spin_lock(&last_pool->lock);

                worker = find_worker_executing_work(last_pool, work);

                if (worker && worker->current_pwq->wq == wq) {
                        pwq = worker->current_pwq;
                        pool = pwq->pool;
                        WARN_ON_ONCE(pool != last_pool);
                } else {
                        /* meh... not running there, queue here */
                        raw_spin_unlock(&last_pool->lock);
                        raw_spin_lock(&pool->lock);
                }
        } else {
                raw_spin_lock(&pool->lock);
        }

        /*
         * pwq is determined and locked. For unbound pools, we could have raced
         * with pwq release and it could already be dead. If its refcnt is zero,
         * repeat pwq selection. Note that unbound pwqs never die without
         * another pwq replacing it in cpu_pwq or while work items are executing
         * on it, so the retrying is guaranteed to make forward-progress.
         */
        if (unlikely(!pwq->refcnt)) {
                if (wq->flags & WQ_UNBOUND) {
                        raw_spin_unlock(&pool->lock);
                        cpu_relax();
                        goto retry;
                }
                /* oops */
                WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
                          wq->name, cpu);
        }

        /* pwq determined, queue */
        trace_workqueue_queue_work(req_cpu, pwq, work);

        if (WARN_ON(!list_empty(&work->entry)))
                goto out;

        pwq->nr_in_flight[pwq->work_color]++;
        work_flags = work_color_to_flags(pwq->work_color);

        /*
         * Limit the number of concurrently active work items to max_active.
         * @work must also queue behind existing inactive work items to maintain
         * ordering when max_active changes. See wq_adjust_max_active().
         */
        if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
                if (list_empty(&pool->worklist))
                        pool->watchdog_ts = jiffies;

                trace_workqueue_activate_work(work);
                insert_work(pwq, work, &pool->worklist, work_flags);
                kick_pool(pool);
        } else {
                work_flags |= WORK_STRUCT_INACTIVE;
                insert_work(pwq, work, &pwq->inactive_works, work_flags);
        }

out:
        raw_spin_unlock(&pool->lock);
        rcu_read_unlock();
}

/*
 * Called by queueing paths right after they set PENDING on @work. If @work is
 * off queue (the PWQ bit is clear) and any of the WORK_OFFQ_DISABLE_MASK bits
 * are set, give PENDING back while preserving the off-queue pool ID and flags
 * and return %true. Otherwise leave @work alone and return %false.
 */
static bool clear_pending_if_disabled(struct work_struct *work)
{
        struct work_offq_data offqd;
        unsigned long data = *work_data_bits(work);

        /* data points to a pwq, the disable bits don't apply */
        if (likely(data & WORK_STRUCT_PWQ))
                return false;

        /* off queue but not disabled */
        if (likely(!(data & WORK_OFFQ_DISABLE_MASK)))
                return false;

        work_offqd_unpack(&offqd, data);
        set_work_pool_and_clear_pending(work, offqd.pool_id,
                                        work_offqd_pack_flags(&offqd));
        return true;
}

/**
 * queue_work_on - queue work on specific cpu
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @work: work to queue
 *
 * We queue the work to a specific CPU, the caller must ensure it
 * can't go away.  Callers that fail to ensure that the specified
 * CPU cannot go away will execute on a randomly chosen CPU.
 * But note well that callers specifying a CPU that never has been
 * online will get a splat.
 *
 * Nothing is queued if @work already has PENDING set or if it is disabled.
 *
 * Return: %false if @work was already on a queue, %true otherwise.
 */
bool queue_work_on(int cpu, struct workqueue_struct *wq,
                   struct work_struct *work)
{
        unsigned long irq_flags;
        bool queued = false;

        /* PENDING must be grabbed and the work queued with IRQs off */
        local_irq_save(irq_flags);

        /* somebody else already owns PENDING */
        if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                goto out;

        /* @work is disabled, PENDING has been handed back */
        if (clear_pending_if_disabled(work))
                goto out;

        __queue_work(cpu, wq, work);
        queued = true;
out:
        local_irq_restore(irq_flags);
        return queued;
}
EXPORT_SYMBOL(queue_work_on);

/**
 * select_numa_node_cpu - Select a CPU based on NUMA node
 * @node: NUMA node ID that we want to select a CPU from
 *
 * This function will attempt to find a "random" cpu available on a given
 * node. If there are no CPUs available on the given node it will return
 * WORK_CPU_UNBOUND indicating that we should just schedule to any
 * available CPU if we need to schedule this work.
 */
static int select_numa_node_cpu(int node)
{
        int here, picked;

        /* Delay binding to CPU if node is not valid or online */
        if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
                return WORK_CPU_UNBOUND;

        /* Use local node/cpu if we are already there */
        here = raw_smp_processor_id();
        if (cpu_to_node(here) == node)
                return here;

        /* Use "random", otherwise known as "first", online CPU of node */
        picked = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);

        /* No usable CPU found on the node, just defer */
        if (picked >= nr_cpu_ids)
                return WORK_CPU_UNBOUND;

        return picked;
}

/**
 * queue_work_node - queue work on a "random" cpu for a given NUMA node
 * @node: NUMA node that we are targeting the work for
 * @wq: workqueue to use
 * @work: work to queue
 *
 * We queue the work to a "random" CPU within a given NUMA node. The basic
 * idea here is to provide a way to somehow associate work with a given
 * NUMA node.
 *
 * This function will only make a best effort attempt at getting this onto
 * the right NUMA node. If no node is requested or the requested node is
 * offline then we just fall back to standard queue_work behavior.
 *
 * Currently the "random" CPU ends up being the first available CPU in the
 * intersection of cpu_online_mask and the cpumask of the node, unless we
 * are running on the node. In that case we just use the current CPU.
 *
 * Return: %false if @work was already on a queue, %true otherwise.
 */
bool queue_work_node(int node, struct workqueue_struct *wq,
                     struct work_struct *work)
{
        unsigned long irq_flags;
        bool queued = false;

        /*
         * This current implementation is specific to unbound workqueues.
         * Specifically we only return the first available CPU for a given
         * node instead of cycling through individual CPUs within the node.
         *
         * If this is used with a per-cpu workqueue then the logic in
         * select_numa_node_cpu() would need to be updated to allow for
         * some round robin type logic.
         */
        WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND));

        local_irq_save(irq_flags);

        /* somebody else already owns PENDING */
        if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                goto out;

        /* @work is disabled, PENDING has been handed back */
        if (clear_pending_if_disabled(work))
                goto out;

        __queue_work(select_numa_node_cpu(node), wq, work);
        queued = true;
out:
        local_irq_restore(irq_flags);
        return queued;
}
EXPORT_SYMBOL_GPL(queue_work_node);

/*
 * Timer callback of delayed work items. Recovers the delayed_work embedding
 * timer @t and queues its work item on the CPU and workqueue recorded in it.
 */
void delayed_work_timer_fn(struct timer_list *t)
{
        struct delayed_work *expired = from_timer(expired, t, timer);
        struct work_struct *work = &expired->work;

        /* should have been called from irqsafe timer with irq already off */
        __queue_work(expired->cpu, expired->wq, work);
}
EXPORT_SYMBOL(delayed_work_timer_fn);

/*
 * Arm @dwork's timer so that its work item gets queued on @wq for @cpu after
 * @delay jiffies, or queue the work item right away when @delay is zero.
 * Warns once if @wq is NULL, if the timer isn't set up with
 * delayed_work_timer_fn(), if the timer is already pending or if the work
 * item is already linked on a list.
 */
static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
                                struct delayed_work *dwork, unsigned long delay)
{
        struct work_struct *work = &dwork->work;
        struct timer_list *timer = &dwork->timer;

        WARN_ON_ONCE(!wq);
        WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
        WARN_ON_ONCE(timer_pending(timer));
        WARN_ON_ONCE(!list_empty(&work->entry));

        /*
         * If @delay is 0, queue @dwork->work immediately.  This is for
         * both optimization and correctness.  The earliest @timer can
         * expire is on the closest next tick and delayed_work users depend
         * on that there's no such delay when @delay is 0.
         */
        if (!delay) {
                __queue_work(cpu, wq, work);
                return;
        }

        /* stash what delayed_work_timer_fn() needs when the timer fires */
        dwork->wq = wq;
        dwork->cpu = cpu;
        timer->expires = jiffies + delay;

        if (housekeeping_enabled(HK_TYPE_TIMER)) {
                /* If the current cpu is a housekeeping cpu, use it. */
                cpu = smp_processor_id();
                if (!housekeeping_test_cpu(cpu, HK_TYPE_TIMER))
                        cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
                add_timer_on(timer, cpu);
        } else if (likely(cpu == WORK_CPU_UNBOUND)) {
                /* no CPU requested */
                add_timer_global(timer);
        } else {
                add_timer_on(timer, cpu);
        }
}

/**
 * queue_delayed_work_on - queue work on specific CPU after delay
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * Nothing is queued if @dwork's work item already has PENDING set or if it
 * is disabled.
 *
 * Return: %false if @work was already on a queue, %true otherwise.  If
 * @delay is zero and @dwork is idle, it will be scheduled for immediate
 * execution.
 */
bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
                           struct delayed_work *dwork, unsigned long delay)
{
        struct work_struct *work = &dwork->work;
        unsigned long irq_flags;
        bool queued = false;

        /* read the comment in __queue_work() */
        local_irq_save(irq_flags);

        /* somebody else already owns PENDING */
        if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                goto out;

        /* @dwork is disabled, PENDING has been handed back */
        if (clear_pending_if_disabled(work))
                goto out;

        __queue_delayed_work(cpu, wq, dwork, delay);
        queued = true;
out:
        local_irq_restore(irq_flags);
        return queued;
}
EXPORT_SYMBOL(queue_delayed_work_on);

/**
 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
 * modify @dwork's timer so that it expires after @delay.  If @delay is
 * zero, @work is guaranteed to be scheduled immediately regardless of its
 * current state.
 *
 * If @dwork is disabled, PENDING is released again and nothing is queued.
 *
 * Return: %false if @dwork was idle and queued, %true if @dwork was
 * pending and its timer was modified.
 *
 * This function is safe to call from any context including IRQ handler.
 * See try_to_grab_pending() for details.
 */
bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
                         struct delayed_work *dwork, unsigned long delay)
{
        struct work_struct *work = &dwork->work;
        unsigned long irq_flags;
        bool was_pending;

        /* returns with PENDING owned and IRQs disabled */
        was_pending = work_grab_pending(work, WORK_CANCEL_DELAYED, &irq_flags);

        if (clear_pending_if_disabled(work))
                goto out;

        __queue_delayed_work(cpu, wq, dwork, delay);
out:
        local_irq_restore(irq_flags);
        return was_pending;
}
EXPORT_SYMBOL_GPL(mod_delayed_work_on);

/*
 * RCU callback installed by queue_rcu_work(). Recovers the rcu_work which
 * embeds @rcu and queues its work item on the workqueue recorded in it,
 * without a CPU preference.
 */
static void rcu_work_rcufn(struct rcu_head *rcu)
{
        struct rcu_work *rw = container_of(rcu, struct rcu_work, rcu);
        struct work_struct *work = &rw->work;

        /* read the comment in __queue_work() */
        local_irq_disable();
        __queue_work(WORK_CPU_UNBOUND, rw->wq, work);
        local_irq_enable();
}

/**
 * queue_rcu_work - queue work after a RCU grace period
 * @wq: workqueue to use
 * @rwork: work to queue
 *
 * Return: %false if @rwork was already pending, %true otherwise.  Note
 * that a full RCU grace period is guaranteed only after a %true return.
 * While @rwork is guaranteed to be executed after a %false return, the
 * execution may happen before a full RCU grace period has passed.
 */
bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
{
        struct work_struct *work = &rwork->work;

        /* already pending, the earlier queueing will run it */
        if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                return false;

        /*
         * rcu_work can't be canceled or disabled. Warn if the user reached
         * inside @rwork and disabled the inner work.
         */
        if (WARN_ON_ONCE(clear_pending_if_disabled(work)))
                return false;

        /* rcu_work_rcufn() reads ->wq when it runs */
        rwork->wq = wq;
        call_rcu_hurry(&rwork->rcu, rcu_work_rcufn);
        return true;
}
EXPORT_SYMBOL(queue_rcu_work);

static struct worker *alloc_worker(int node)
{
        struct worker *worker;

        worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
        if (worker) {
                INIT_LIST_HEAD(&worker->entry);
                INIT_LIST_HEAD(&worker->scheduled);
                INIT_LIST_HEAD(&worker->node);
                /* on creation a worker is in !idle && prep state */
                worker->flags = WORKER_PREP;
        }
        return worker;
}

static cpumask_t *pool_allowed_cpus(struct worker_pool *pool)
{
        if (pool->cpu < 0 && pool->attrs->affn_strict)
                return pool->attrs->__pod_cpumask;
        else
                return pool->attrs->cpumask;
}

/**
 * worker_attach_to_pool() - attach a worker to a pool
 * @worker: worker to be attached
 * @pool: the target pool
 *
 * Attach @worker to @pool.  Once attached, the %WORKER_UNBOUND flag and
 * cpu-binding of @worker are kept coordinated with the pool across
 * cpu-[un]hotplugs.
 *
 * The whole operation runs under wq_pool_attach_mutex.
 */
static void worker_attach_to_pool(struct worker *worker,
                                  struct worker_pool *pool)
{
        mutex_lock(&wq_pool_attach_mutex);

        /*
         * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains stable
         * across this function. See the comments above the flag definition for
         * details. BH workers are, while per-CPU, always DISASSOCIATED.
         */
        if (!(pool->flags & POOL_DISASSOCIATED)) {
                /* an associated pool is never a BH pool */
                WARN_ON_ONCE(pool->flags & POOL_BH);
                kthread_set_per_cpu(worker->task, pool->cpu);
        } else {
                worker->flags |= WORKER_UNBOUND;
        }

        /* a rescuer gets its allowed CPUs set explicitly on attach */
        if (worker->rescue_wq)
                set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool));

        list_add_tail(&worker->node, &pool->workers);
        worker->pool = pool;

        mutex_unlock(&wq_pool_attach_mutex);
}

/**
 * worker_detach_from_pool() - detach a worker from its pool
 * @worker: worker which is attached to its pool
 *
 * Undo the attaching which had been done in worker_attach_to_pool().  The
 * caller worker shouldn't access to the pool after detached except it has
 * other reference to the pool.
 *
 * If @worker was the last one on the pool (both the workers and
 * dying_workers lists are empty afterwards), the pool's detach_completion,
 * if set, is completed.
 */
static void worker_detach_from_pool(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;
        struct completion *done = NULL;

        /* there is one permanent BH worker per CPU which should never detach */
        WARN_ON_ONCE(pool->flags & POOL_BH);

        mutex_lock(&wq_pool_attach_mutex);

        kthread_set_per_cpu(worker->task, -1);
        list_del(&worker->node);
        worker->pool = NULL;

        /* sample detach_completion under the mutex, signal after dropping it */
        if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))
                done = pool->detach_completion;

        mutex_unlock(&wq_pool_attach_mutex);

        /* clear leftover flags without pool->lock after it is detached */
        worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);

        if (!done)
                return;

        complete(done);
}

/**
 * create_worker - create a new workqueue worker
 * @pool: pool the new worker will belong to
 *
 * Create and start a new worker which is attached to @pool.  For pools
 * without %POOL_BH a "kworker/..." kthread is created for the worker; for
 * %POOL_BH pools no kthread is created here.
 *
 * CONTEXT:
 * Might sleep.  Does GFP_KERNEL allocations.
 *
 * Return:
 * Pointer to the newly created worker, or %NULL on failure.
 */
static struct worker *create_worker(struct worker_pool *pool)
{
        struct worker *worker;
        int id;
        char id_buf[23];

        /* ID is needed to determine kthread name */
        id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
        if (id < 0) {
                pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n",
                            ERR_PTR(id));
                return NULL;
        }

        worker = alloc_worker(pool->node);
        if (!worker) {
                pr_err_once("workqueue: Failed to allocate a worker\n");
                goto fail;
        }

        worker->id = id;

        if (!(pool->flags & POOL_BH)) {
                /*
                 * Per-CPU pools are named "<cpu>:<id>[H]" ("H" when the pool's
                 * nice value is negative), all other pools "u<pool id>:<id>".
                 */
                if (pool->cpu >= 0)
                        snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
                                 pool->attrs->nice < 0  ? "H" : "");
                else
                        snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);

                worker->task = kthread_create_on_node(worker_thread, worker,
                                        pool->node, "kworker/%s", id_buf);
                if (IS_ERR(worker->task)) {
                        if (PTR_ERR(worker->task) == -EINTR) {
                                pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n",
                                       id_buf);
                        } else {
                                /* terminate the line like every other message here */
                                pr_err_once("workqueue: Failed to create a worker thread: %pe\n",
                                            worker->task);
                        }
                        goto fail;
                }

                set_user_nice(worker->task, pool->attrs->nice);
                kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
        }

        /* successful, attach the worker to the pool */
        worker_attach_to_pool(worker, pool);

        /* start the newly created worker */
        raw_spin_lock_irq(&pool->lock);

        worker->pool->nr_workers++;
        worker_enter_idle(worker);

        /*
         * @worker is waiting on a completion in kthread() and will trigger hung
         * check if not woken up soon. As kick_pool() is noop if @pool is empty,
         * wake it up explicitly.
         */
        if (worker->task)
                wake_up_process(worker->task);

        raw_spin_unlock_irq(&pool->lock);

        return worker;

fail:
        /* @worker may be NULL here; kfree(NULL) is a no-op */
        ida_free(&pool->worker_ida, id);
        kfree(worker);
        return NULL;
}

/*
 * Undo @worker's per-CPU kthread setting and reset its allowed CPUs.
 * The caller must hold wq_pool_attach_mutex.
 */
static void unbind_worker(struct worker *worker)
{
        struct task_struct *task = worker->task;

        lockdep_assert_held(&wq_pool_attach_mutex);

        kthread_set_per_cpu(task, -1);

        /*
         * Use the unbound cpumask as long as it overlaps with the active
         * CPUs; otherwise fall back to all possible CPUs.
         */
        if (!cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) {
                WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpu_possible_mask) < 0);
                return;
        }

        WARN_ON_ONCE(set_cpus_allowed_ptr(task, wq_unbound_cpumask) < 0);
}

static void wake_dying_workers(struct list_head *cull_list)
{
        struct worker *worker, *tmp;

        list_for_each_entry_safe(worker, tmp, cull_list, entry) {
                list_del_init(&worker->entry);
                unbind_worker(worker);
                /*
                 * If the worker was somehow already running, then it had to be
                 * in pool->idle_list when set_worker_dying() happened or we
                 * wouldn't have gotten here.
                 *
                 * Thus, the worker must either have observed the WORKER_DIE
                 * flag, or have set its state to TASK_IDLE. Either way, the
                 * below will be observed by the worker and is safe to do
                 * outside of pool->lock.
                 */
                wake_up_process(worker->task);
        }
}

/**
 * set_worker_dying - Tag a worker for destruction
 * @worker: worker to be destroyed
 * @list: transfer worker away from its pool->idle_list and into list
 *
 * Tag @worker for destruction and adjust @pool stats accordingly.  The worker
 * should be idle.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void set_worker_dying(struct worker *worker, struct list_head *list)
{
        struct worker_pool *pool = worker->pool;

        lockdep_assert_held(&pool->lock);
        lockdep_assert_held(&wq_pool_attach_mutex);

        /* only a clean, idle worker may be tagged; warn and bail otherwise */
        if (WARN_ON(worker->current_work))
                return;
        if (WARN_ON(!list_empty(&worker->scheduled)))
                return;
        if (WARN_ON(!(worker->flags & WORKER_IDLE)))
                return;

        /* the worker no longer counts toward the pool's totals */
        pool->nr_workers--;
        pool->nr_idle--;

        worker->flags |= WORKER_DIE;

        /* hand it to the caller's list and park it on the dying list */
        list_move(&worker->entry, list);
        list_move(&worker->node, &pool->dying_workers);
}

/**
 * idle_worker_timeout - check if some idle workers can now be deleted.
 * @t: The pool's idle_timer that just expired
 *
 * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in
 * worker_leave_idle(), as a worker flicking between idle and active while its
 * pool is at the too_many_workers() tipping point would cause too much timer
 * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let
 * it expire and re-evaluate things from there.
 */
static void idle_worker_timeout(struct timer_list *t)
{
        struct worker_pool *pool = from_timer(pool, t, idle_timer);
        bool do_cull = false;

        if (work_pending(&pool->idle_cull_work))
                return;

        raw_spin_lock_irq(&pool->lock);

        if (too_many_workers(pool)) {
                struct worker *worker;
                unsigned long expires;

                /* idle_list is kept in LIFO order, check the last one */
                worker = list_last_entry(&pool->idle_list, struct worker, entry);
                expires = worker->last_active + IDLE_WORKER_TIMEOUT;
                do_cull = !time_before(jiffies, expires);

                if (!do_cull)
                        mod_timer(&pool->idle_timer, expires);
        }
        raw_spin_unlock_irq(&pool->lock);

        if (do_cull)
                queue_work(system_unbound_wq, &pool->idle_cull_work);
}

/**
 * idle_cull_fn - cull workers that have been idle for too long.
 * @work: the pool's work for handling these idle workers
 *
 * This goes through a pool's idle workers and gets rid of those that have been
 * idle for at least IDLE_WORKER_TIMEOUT seconds.
 *
 * We don't want to disturb isolated CPUs because of a pcpu kworker being
 * culled, so this also resets worker affinity. This requires a sleepable
 * context, hence the split between timer callback and work item.
 */
static void idle_cull_fn(struct work_struct *work)
{
        struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
        LIST_HEAD(cull_list);

        /*
         * Grabbing wq_pool_attach_mutex here ensures an already-running worker
         * cannot proceed beyond worker_detach_from_pool() in its self-destruct
         * path. This is required as a previously-preempted worker could run after
         * set_worker_dying() has happened but before wake_dying_workers() did.
         */
        mutex_lock(&wq_pool_attach_mutex);
        raw_spin_lock_irq(&pool->lock);

        /* tag expired idle workers, starting from the tail of idle_list */
        while (too_many_workers(pool)) {
                struct worker *worker;
                unsigned long expires;

                worker = list_last_entry(&pool->idle_list, struct worker, entry);
                expires = worker->last_active + IDLE_WORKER_TIMEOUT;

                /* not expired yet: re-arm the timer for it and stop culling */
                if (time_before(jiffies, expires)) {
                        mod_timer(&pool->idle_timer, expires);
                        break;
                }

                set_worker_dying(worker, &cull_list);
        }

        raw_spin_unlock_irq(&pool->lock);
        /* wake the tagged workers with pool->lock dropped, mutex still held */
        wake_dying_workers(&cull_list);
        mutex_unlock(&wq_pool_attach_mutex);
}

/*
 * Ask the rescuer of @work's workqueue for help by putting the work's pwq on
 * wq->maydays, unless it is already queued there.  The caller must hold
 * wq_mayday_lock.
 */
static void send_mayday(struct work_struct *work)
{
        struct pool_workqueue *pwq = get_work_pwq(work);
        struct workqueue_struct *wq = pwq->wq;

        lockdep_assert_held(&wq_mayday_lock);

        /* no rescuer, nobody to call */
        if (!wq->rescuer)
                return;

        /* @pwq is already on a mayday list */
        if (!list_empty(&pwq->mayday_node))
                return;

        /*
         * mayday mayday mayday
         *
         * If @pwq is for an unbound wq, its base ref may be put at any time
         * due to an attribute change.  Pin @pwq until the rescuer is done
         * with it.
         */
        get_pwq(pwq);
        list_add_tail(&pwq->mayday_node, &wq->maydays);
        wake_up_process(wq->rescuer->task);
        pwq->stats[PWQ_STAT_MAYDAY]++;
}

/*
 * Timer callback for pool->mayday_timer.  If the pool still needs a new
 * worker, send a mayday for every work item on its worklist, then re-arm the
 * timer MAYDAY_INTERVAL from now.
 */
static void pool_mayday_timeout(struct timer_list *t)
{
        struct worker_pool *pool = from_timer(pool, t, mayday_timer);
        struct work_struct *work;

        /* wq_mayday_lock nests inside pool->lock here */
        raw_spin_lock_irq(&pool->lock);
        raw_spin_lock(&wq_mayday_lock);                /* for wq->maydays */

        if (need_to_create_worker(pool)) {
                /*
                 * We've been trying to create a new worker but
                 * haven't been successful.  We might be hitting an
                 * allocation deadlock.  Send distress signals to
                 * rescuers.
                 */
                list_for_each_entry(work, &pool->worklist, entry)
                        send_mayday(work);
        }

        raw_spin_unlock(&wq_mayday_lock);
        raw_spin_unlock_irq(&pool->lock);

        /* re-armed unconditionally, whether or not a mayday was sent */
        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}

/**
 * maybe_create_worker - create a new worker if necessary
 * @pool: pool to create a new worker for
 *
 * Create a new worker for @pool if necessary.  @pool is guaranteed to
 * have at least one idle worker on return from this function.  If
 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
 * sent to all rescuers with works scheduled on @pool to resolve
 * possible allocation deadlock.
 *
 * On return, need_to_create_worker() is guaranteed to be %false and
 * may_start_working() %true.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Does GFP_KERNEL allocations.  Called only from
 * manager.
 */
static void maybe_create_worker(struct worker_pool *pool)
__releases(&pool->lock)
__acquires(&pool->lock)
{
restart:
        /* worker creation may sleep, so it runs with pool->lock dropped */
        raw_spin_unlock_irq(&pool->lock);

        /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);

        /*
         * Retry until a worker was created or none is needed anymore, pausing
         * for CREATE_COOLDOWN after each failed attempt.
         */
        while (true) {
                if (create_worker(pool) || !need_to_create_worker(pool))
                        break;

                schedule_timeout_interruptible(CREATE_COOLDOWN);

                if (!need_to_create_worker(pool))
                        break;
        }

        /* stop the mayday timer before retaking pool->lock */
        del_timer_sync(&pool->mayday_timer);
        raw_spin_lock_irq(&pool->lock);
        /*
         * This is necessary even after a new worker was just successfully
         * created as @pool->lock was dropped and the new worker might have
         * already become busy.
         */
        if (need_to_create_worker(pool))
                goto restart;
}

/**
 * manage_workers - manage worker pool
 * @worker: self
 *
 * Assume the manager role and manage the worker pool @worker belongs
 * to.  At any given time, there can be only zero or one manager per
 * pool.  The exclusion is handled automatically by this function.
 *
 * The caller can safely start processing works on false return.  On
 * true return, it's guaranteed that need_to_create_worker() is false
 * and may_start_working() is true.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Does GFP_KERNEL allocations.
 *
 * Return:
 * %false if the pool doesn't need management and the caller can safely
 * start processing works, %true if management function was performed and
 * the conditions that the caller verified before calling the function may
 * no longer be true.
 */
static bool manage_workers(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        if (!(pool->flags & POOL_MANAGER_ACTIVE)) {
                /* take the manager role for the duration of the creation */
                pool->flags |= POOL_MANAGER_ACTIVE;
                pool->manager = worker;

                maybe_create_worker(pool);

                /* step down and wake up whoever waits on manager_wait */
                pool->manager = NULL;
                pool->flags &= ~POOL_MANAGER_ACTIVE;
                rcuwait_wake_up(&manager_wait);
                return true;
        }

        /* the pool already has an active manager */
        return false;
}

/**
 * process_one_work - process single work
 * @worker: self
 * @work: work to process
 *
 * Process @work.  This function contains all the logics necessary to
 * process a single work including synchronization against and
 * interaction with other workers on the same cpu, queueing and
 * flushing.  As long as context requirement is met, any worker can
 * call this function to process a work.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) which is released and regrabbed.
 */
static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
        struct pool_workqueue *pwq = get_work_pwq(work);
        struct worker_pool *pool = worker->pool;
        unsigned long work_data;
        int lockdep_start_depth, rcu_start_depth;
        /* sampled once up front; governs the wq lockdep_map handling below */
        bool bh_draining = pool->flags & POOL_BH_DRAINING;
#ifdef CONFIG_LOCKDEP
        /*
         * It is permissible to free the struct work_struct from
         * inside the function that is called from it, this we need to
         * take into account for lockdep too.  To avoid bogus "held
         * lock freed" warnings as well as problems when looking into
         * work->lockdep_map, make a copy and use that here.
         */
        struct lockdep_map lockdep_map;

        lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
        /* ensure we're on the correct CPU */
        WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
                     raw_smp_processor_id() != pool->cpu);

        /* claim and dequeue */
        debug_work_deactivate(work);
        hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
        worker->current_work = work;
        worker->current_func = work->func;
        worker->current_pwq = pwq;
        /* only workers with a task record the start-of-work runtime */
        if (worker->task)
                worker->current_at = worker->task->se.sum_exec_runtime;
        /* keep a copy of the data word; it is needed after @work has run */
        work_data = *work_data_bits(work);
        worker->current_color = get_work_color(work_data);

        /*
         * Record wq name for cmdline and debug reporting, may get
         * overridden through set_worker_desc().
         */
        strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);

        list_del_init(&work->entry);

        /*
         * CPU intensive works don't participate in concurrency management.
         * They're the scheduler's responsibility.  This takes @worker out
         * of concurrency management and the next code block will chain
         * execution of the pending work items.
         */
        if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
                worker_set_flags(worker, WORKER_CPU_INTENSIVE);

        /*
         * Kick @pool if necessary. It's always noop for per-cpu worker pools
         * since nr_running would always be >= 1 at this point. This is used to
         * chain execution of the pending work items for WORKER_NOT_RUNNING
         * workers such as the UNBOUND and CPU_INTENSIVE ones.
         */
        kick_pool(pool);

        /*
         * Record the last pool and clear PENDING which should be the last
         * update to @work.  Also, do this inside @pool->lock so that
         * PENDING and queued state changes happen together while IRQ is
         * disabled.
         */
        set_work_pool_and_clear_pending(work, pool->id, pool_offq_flags(pool));

        pwq->stats[PWQ_STAT_STARTED]++;
        raw_spin_unlock_irq(&pool->lock);

        /* snapshot nesting depths so that leaks by the work function show up */
        rcu_start_depth = rcu_preempt_depth();
        lockdep_start_depth = lockdep_depth(current);
        /* see drain_dead_softirq_workfn() */
        if (!bh_draining)
                lock_map_acquire(&pwq->wq->lockdep_map);
        lock_map_acquire(&lockdep_map);
        /*
         * Strictly speaking we should mark the invariant state without holding
         * any locks, that is, before these two lock_map_acquire()'s.
         *
         * However, that would result in:
         *
         *   A(W1)
         *   WFC(C)
         *                A(W1)
         *                C(C)
         *
         * Which would create W1->C->W1 dependencies, even though there is no
         * actual deadlock possible. There are two solutions, using a
         * read-recursive acquire on the work(queue) 'locks', but this will then
         * hit the lockdep limitation on recursive locks, or simply discard
         * these locks.
         *
         * AFAICT there is no possible deadlock scenario between the
         * flush_work() and complete() primitives (except for single-threaded
         * workqueues), so hiding them isn't a problem.
         */
        lockdep_invariant_state(true);
        trace_workqueue_execute_start(work);
        /* run the work function, using the pointer saved before unlocking */
        worker->current_func(work);
        /*
         * While we must be careful to not use "work" after this, the trace
         * point will only record its address.
         */
        trace_workqueue_execute_end(work, worker->current_func);
        pwq->stats[PWQ_STAT_COMPLETED]++;
        lock_map_release(&lockdep_map);
        if (!bh_draining)
                lock_map_release(&pwq->wq->lockdep_map);

        /*
         * Complain if the work function returned in atomic context (checked
         * only for workers with a task) or with a different lockdep or RCU
         * nesting depth than it started with.
         */
        if (unlikely((worker->task && in_atomic()) ||
                     lockdep_depth(current) != lockdep_start_depth ||
                     rcu_preempt_depth() != rcu_start_depth)) {
                pr_err("BUG: workqueue leaked atomic, lock or RCU: %s[%d]\n"
                       "     preempt=0x%08x lock=%d->%d RCU=%d->%d workfn=%ps\n",
                       current->comm, task_pid_nr(current), preempt_count(),
                       lockdep_start_depth, lockdep_depth(current),
                       rcu_start_depth, rcu_preempt_depth(),
                       worker->current_func);
                debug_show_held_locks(current);
                dump_stack();
        }

        /*
         * The following prevents a kworker from hogging CPU on !PREEMPTION
         * kernels, where a requeueing work item waiting for something to
         * happen could deadlock with stop_machine as such work item could
         * indefinitely requeue itself while all other CPUs are trapped in
         * stop_machine. At the same time, report a quiescent RCU state so
         * the same condition doesn't freeze RCU.
         */
        if (worker->task)
                cond_resched();

        raw_spin_lock_irq(&pool->lock);

        /*
         * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
         * CPU intensive by wq_worker_tick() if @work hogged CPU longer than
         * wq_cpu_intensive_thresh_us. Clear it.
         */
        worker_clr_flags(worker, WORKER_CPU_INTENSIVE);

        /* tag the worker for identification in schedule() */
        worker->last_func = worker->current_func;

        /* we're done with it, release */
        hash_del(&worker->hentry);
        worker->current_work = NULL;
        worker->current_func = NULL;
        worker->current_pwq = NULL;
        worker->current_color = INT_MAX;

        /* must be the last step, see the function comment */
        pwq_dec_nr_in_flight(pwq, work_data);
}

/**
 * process_scheduled_works - process scheduled works
 * @worker: self
 *
 * Process all scheduled works.  Please note that the scheduled list
 * may change while processing a work, so this function repeatedly
 * fetches a work from the top and executes it.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.
 */
static void process_scheduled_works(struct worker *worker)
{
        struct work_struct *next;
        bool stamped = false;

        for (;;) {
                /* re-fetch the head each round; the list may have changed */
                next = list_first_entry_or_null(&worker->scheduled,
                                                struct work_struct, entry);
                if (!next)
                        break;

                /* refresh the pool's watchdog timestamp once per batch */
                if (!stamped) {
                        worker->pool->watchdog_ts = jiffies;
                        stamped = true;
                }

                process_one_work(worker, next);
        }
}

/*
 * Set (@val true) or clear (@val false) PF_WQ_WORKER on the current task,
 * with wq_pool_attach_mutex held across the update.
 */
static void set_pf_worker(bool val)
{
        mutex_lock(&wq_pool_attach_mutex);

        if (!val)
                current->flags &= ~PF_WQ_WORKER;
        else
                current->flags |= PF_WQ_WORKER;

        mutex_unlock(&wq_pool_attach_mutex);
}

/**
 * worker_thread - the worker thread function
 * @__worker: self
 *
 * The worker thread function.  All workers belong to a worker_pool -
 * either a per-cpu one or dynamic unbound one.  These workers process all
 * work items regardless of their specific target workqueue.  The only
 * exception is work items which belong to workqueues with a rescuer which
 * will be explained in rescuer_thread().
 *
 * Return: 0
 */
static int worker_thread(void *__worker)
{
        struct worker *worker = __worker;
        struct worker_pool *pool = worker->pool;

        /* tell the scheduler that this is a workqueue worker */
        set_pf_worker(true);
woke_up:
        raw_spin_lock_irq(&pool->lock);

        /* am I supposed to die? */
        if (unlikely(worker->flags & WORKER_DIE)) {
                raw_spin_unlock_irq(&pool->lock);
                set_pf_worker(false);

                /*
                 * Self-destruct: rename the task, give the ID back, detach
                 * from the pool and free @worker.  This is the only exit
                 * from the thread function.
                 */
                set_task_comm(worker->task, "kworker/dying");
                ida_free(&pool->worker_ida, worker->id);
                worker_detach_from_pool(worker);
                WARN_ON_ONCE(!list_empty(&worker->entry));
                kfree(worker);
                return 0;
        }

        worker_leave_idle(worker);
recheck:
        /* no more worker necessary? */
        if (!need_more_worker(pool))
                goto sleep;

        /* do we need to manage? */
        if (unlikely(!may_start_working(pool)) && manage_workers(worker))
                goto recheck;

        /*
         * ->scheduled list can only be filled while a worker is
         * preparing to process a work or actually processing it.
         * Make sure nobody diddled with it while I was sleeping.
         */
        WARN_ON_ONCE(!list_empty(&worker->scheduled));

        /*
         * Finish PREP stage.  We're guaranteed to have at least one idle
         * worker or that someone else has already assumed the manager
         * role.  This is where @worker starts participating in concurrency
         * management if applicable and concurrency management is restored
         * after being rebound.  See rebind_workers() for details.
         */
        worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);

        /*
         * Main loop: take the first item off the pool's worklist and, when
         * assign_work() accepts it, run everything on ->scheduled.  Repeat
         * while keep_working() says so.
         */
        do {
                struct work_struct *work =
                        list_first_entry(&pool->worklist,
                                         struct work_struct, entry);

                if (assign_work(work, worker, NULL))
                        process_scheduled_works(worker);
        } while (keep_working(pool));

        worker_set_flags(worker, WORKER_PREP);
sleep:
        /*
         * pool->lock is held and there's no work to process and no need to
         * manage, sleep.  Workers are woken up only while holding
         * pool->lock or from local cpu, so setting the current state
         * before releasing pool->lock is enough to prevent losing any
         * event.
         */
        worker_enter_idle(worker);
        __set_current_state(TASK_IDLE);
        raw_spin_unlock_irq(&pool->lock);
        schedule();
        goto woke_up;
}

/**
 * rescuer_thread - the rescuer thread function
 * @__rescuer: self
 *
 * Workqueue rescuer thread function.  There's one rescuer for each
 * workqueue which has WQ_MEM_RECLAIM set.
 *
 * Regular work processing on a pool may block trying to create a new
 * worker which uses GFP_KERNEL allocation which has slight chance of
 * developing into deadlock if some works currently on the same queue
 * need to be processed to satisfy the GFP_KERNEL allocation.  This is
 * the problem rescuer solves.
 *
 * When such condition is possible, the pool summons rescuers of all
 * workqueues which have works queued on the pool and let them process
 * those works so that forward progress can be guaranteed.
 *
 * This should happen rarely.
 *
 * Return: 0
 */
static int rescuer_thread(void *__rescuer)
{
        struct worker *rescuer = __rescuer;
        struct workqueue_struct *wq = rescuer->rescue_wq;
        bool should_stop;

        set_user_nice(current, RESCUER_NICE_LEVEL);

        /*
         * Mark rescuer as worker too.  As WORKER_PREP is never cleared, it
         * doesn't participate in concurrency management.
         */
        set_pf_worker(true);
repeat:
        set_current_state(TASK_IDLE);

        /*
         * By the time the rescuer is requested to stop, the workqueue
         * shouldn't have any work pending, but @wq->maydays may still have
         * pwq(s) queued.  This can happen by non-rescuer workers consuming
         * all the work items before the rescuer got to them.  Go through
         * @wq->maydays processing before acting on should_stop so that the
         * list is always empty on exit.
         */
        should_stop = kthread_should_stop();

        /* see whether any pwq is asking for help */
        raw_spin_lock_irq(&wq_mayday_lock);

        while (!list_empty(&wq->maydays)) {
                struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
                                        struct pool_workqueue, mayday_node);
                struct worker_pool *pool = pwq->pool;
                struct work_struct *work, *n;

                __set_current_state(TASK_RUNNING);
                list_del_init(&pwq->mayday_node);

                /* wq_mayday_lock is dropped before attaching to the pool */
                raw_spin_unlock_irq(&wq_mayday_lock);

                worker_attach_to_pool(rescuer, pool);

                raw_spin_lock_irq(&pool->lock);

                /*
                 * Slurp in all works issued via this workqueue and
                 * process'em.
                 */
                WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
                list_for_each_entry_safe(work, n, &pool->worklist, entry) {
                        if (get_work_pwq(work) == pwq &&
                            assign_work(work, rescuer, &n))
                                pwq->stats[PWQ_STAT_RESCUED]++;
                }

                if (!list_empty(&rescuer->scheduled)) {
                        process_scheduled_works(rescuer);

                        /*
                         * The above execution of rescued work items could
                         * have created more to rescue through
                         * pwq_activate_first_inactive() or chained
                         * queueing.  Let's put @pwq back on mayday list so
                         * that such back-to-back work items, which may be
                         * being used to relieve memory pressure, don't
                         * incur MAYDAY_INTERVAL delay inbetween.
                         */
                        if (pwq->nr_active && need_to_create_worker(pool)) {
                                raw_spin_lock(&wq_mayday_lock);
                                /*
                                 * Queue iff we aren't racing destruction
                                 * and somebody else hasn't queued it already.
                                 */
                                if (wq->rescuer && list_empty(&pwq->mayday_node)) {
                                        get_pwq(pwq);
                                        list_add_tail(&pwq->mayday_node, &wq->maydays);
                                }
                                raw_spin_unlock(&wq_mayday_lock);
                        }
                }

                /*
                 * Put the reference grabbed by send_mayday().  @pool won't
                 * go away while we're still attached to it.
                 */
                put_pwq(pwq);

                /*
                 * Leave this pool. Notify regular workers; otherwise, we end up
                 * with 0 concurrency and stalling the execution.
                 */
                kick_pool(pool);

                raw_spin_unlock_irq(&pool->lock);

                worker_detach_from_pool(rescuer);

                /* retake wq_mayday_lock before re-checking wq->maydays */
                raw_spin_lock_irq(&wq_mayday_lock);
        }

        raw_spin_unlock_irq(&wq_mayday_lock);

        /* the stop request sampled above is honored only after the drain */
        if (should_stop) {
                __set_current_state(TASK_RUNNING);
                set_pf_worker(false);
                return 0;
        }

        /* rescuers should never participate in concurrency management */
        WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
        schedule();
        goto repeat;
}

/*
 * Run pending work items of @worker's pool in the calling context.  Unlike
 * worker_thread(), this returns once the pool needs no more processing, or
 * after BH_WORKER_RESTARTS rounds or BH_WORKER_JIFFIES jiffies, whichever
 * comes first.
 */
static void bh_worker(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;
        int nr_restarts = BH_WORKER_RESTARTS;
        unsigned long end = jiffies + BH_WORKER_JIFFIES;

        raw_spin_lock_irq(&pool->lock);
        worker_leave_idle(worker);

        /*
         * This function follows the structure of worker_thread(). See there for
         * explanations on each step.
         */
        if (!need_more_worker(pool))
                goto done;

        WARN_ON_ONCE(!list_empty(&worker->scheduled));
        worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);

        /* bounded by both the restart budget and the time limit */
        do {
                struct work_struct *work =
                        list_first_entry(&pool->worklist,
                                         struct work_struct, entry);

                if (assign_work(work, worker, NULL))
                        process_scheduled_works(worker);
        } while (keep_working(pool) &&
                 --nr_restarts && time_before(jiffies, end));

        worker_set_flags(worker, WORKER_PREP);
done:
        worker_enter_idle(worker);
        /* kick the pool again on the way out */
        kick_pool(pool);
        raw_spin_unlock_irq(&pool->lock);
}

/*
 * TODO: Convert all tasklet users to workqueue and use softirq directly.
 *
 * This is currently called from tasklet[_hi]action() and thus is also called
 * whenever there are tasklets to run. Let's do an early exit if there's nothing
 * queued. Once conversion from tasklet is complete, the need_more_worker() test
 * can be dropped.
 *
 * After full conversion, we'll add worker->softirq_action, directly use the
 * softirq action and obtain the worker pointer from the softirq_action pointer.
 */
void workqueue_softirq_action(bool highpri)
{
        struct worker_pool *pool;
        struct worker *bh;

        /* @highpri indexes this CPU's array of BH worker pools */
        pool = &per_cpu(bh_worker_pools, smp_processor_id())[highpri];

        /* early exit when nothing is queued */
        if (!need_more_worker(pool))
                return;

        bh = list_first_entry(&pool->workers, struct worker, node);
        bh_worker(bh);
}

/*
 * On-stack context used by workqueue_softirq_dead() to drain one BH pool of a
 * dead CPU through drain_dead_softirq_workfn().
 */
struct wq_drain_dead_softirq_work {
        /* work item that runs drain_dead_softirq_workfn() */
        struct work_struct        work;
        /* the dead CPU's BH pool to drain */
        struct worker_pool        *pool;
        /* completed once the pool needs no more processing */
        struct completion        done;
};

/*
 * Work function that drains the BH pool recorded in the enclosing
 * wq_drain_dead_softirq_work.  It requeues itself while the pool still has
 * work to process and signals ->done once it does not.
 */
static void drain_dead_softirq_workfn(struct work_struct *work)
{
        struct wq_drain_dead_softirq_work *dead_work =
                container_of(work, struct wq_drain_dead_softirq_work, work);
        struct worker_pool *pool = dead_work->pool;
        bool repeat;

        /*
         * @pool's CPU is dead and we want to execute its still pending work
         * items from this BH work item which is running on a different CPU. As
         * its CPU is dead, @pool can't be kicked and, as work execution path
         * will be nested, a lockdep annotation needs to be suppressed. Mark
         * @pool with %POOL_BH_DRAINING for the special treatments.
         */
        raw_spin_lock_irq(&pool->lock);
        pool->flags |= POOL_BH_DRAINING;
        raw_spin_unlock_irq(&pool->lock);

        /* run the dead CPU's pool through its first attached worker */
        bh_worker(list_first_entry(&pool->workers, struct worker, node));

        raw_spin_lock_irq(&pool->lock);
        pool->flags &= ~POOL_BH_DRAINING;
        repeat = need_more_worker(pool);
        raw_spin_unlock_irq(&pool->lock);

        /*
         * bh_worker() might hit consecutive execution limit and bail. If there
         * still are pending work items, reschedule self and return so that we
         * don't hog this CPU's BH.
         */
        if (repeat) {
                /* requeue on the BH workqueue matching the pool's nice level */
                if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
                        queue_work(system_bh_highpri_wq, work);
                else
                        queue_work(system_bh_wq, work);
        } else {
                complete(&dead_work->done);
        }
}

/*
 * @cpu is dead. Drain the remaining BH work items on the current CPU. It's
 * possible to allocate dead_work per CPU and avoid flushing. However, then we
 * have to worry about draining overlapping with CPU coming back online or
 * nesting (one CPU's dead_work queued on another CPU which is also dead and so
 * on). Let's keep it simple and drain them synchronously. These are BH work
 * items which shouldn't be requeued on the same pool. Shouldn't take long.
 */
void workqueue_softirq_dead(unsigned int cpu)
{
        int idx;

        for (idx = 0; idx < NR_STD_WORKER_POOLS; idx++) {
                struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[idx];
                struct wq_drain_dead_softirq_work dead_work;
                struct workqueue_struct *bh_wq;

                /* this pool has nothing that needs a worker, skip it */
                if (!need_more_worker(pool))
                        continue;

                /* drain on the system BH workqueue of matching priority */
                bh_wq = (pool->attrs->nice == HIGHPRI_NICE_LEVEL) ?
                        system_bh_highpri_wq : system_bh_wq;

                dead_work.pool = pool;
                init_completion(&dead_work.done);
                INIT_WORK_ONSTACK(&dead_work.work, drain_dead_softirq_workfn);

                queue_work(bh_wq, &dead_work.work);

                /*
                 * dead_work lives on this stack frame, so wait until the
                 * work function signals completion before tearing it down.
                 */
                wait_for_completion(&dead_work.done);
                destroy_work_on_stack(&dead_work.work);
        }
}

/**
 * check_flush_dependency - check for flush dependency sanity
 * @target_wq: workqueue being flushed
 * @target_work: work item being flushed (NULL for workqueue flushes)
 *
 * %current is trying to flush the whole @target_wq or @target_work on it.
 * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not
 * reclaiming memory or running on a workqueue which doesn't have
 * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to
 * a deadlock.
 */
static void check_flush_dependency(struct workqueue_struct *target_wq,
                                   struct work_struct *target_work)
{
        work_func_t target_func = NULL;
        struct worker *worker;

        /* nothing to verify when the target itself has WQ_MEM_RECLAIM */
        if (target_wq->flags & WQ_MEM_RECLAIM)
                return;

        /* only used in the messages below; stays NULL for whole-wq flushes */
        if (target_work)
                target_func = target_work->func;

        worker = current_wq_worker();

        /* %current is flagged PF_MEMALLOC while flushing the target */
        WARN_ONCE(current->flags & PF_MEMALLOC,
                  "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",
                  current->pid, current->comm, target_wq->name, target_func);

        /*
         * %current is a worker of a workqueue which has WQ_MEM_RECLAIM set
         * and __WQ_LEGACY clear.
         */
        WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
                              (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
                  "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps",
                  worker->current_pwq->wq->name, worker->current_func,
                  target_wq->name, target_func);
}

/*
 * Barrier work item used by the flush paths. It is set up and queued by
 * insert_wq_barrier(); when it runs, wq_barrier_func() completes @done.
 */
struct wq_barrier {
        struct work_struct        work;        /* executes wq_barrier_func() */
        struct completion        done;        /* completed by wq_barrier_func() */
        struct task_struct        *task;        /* purely informational */
};

/* Work function of a barrier: signal whoever waits on the enclosing barrier. */
static void wq_barrier_func(struct work_struct *work)
{
        complete(&container_of(work, struct wq_barrier, work)->done);
}

/**
 * insert_wq_barrier - insert a barrier work
 * @pwq: pwq to insert barrier into
 * @barr: wq_barrier to insert
 * @target: target work to attach @barr to
 * @worker: worker currently executing @target, NULL if @target is not executing
 *
 * @barr is linked to @target such that @barr is completed only after
 * @target finishes execution.  Please note that the ordering
 * guarantee is observed only with respect to @target and on the local
 * cpu.
 *
 * Currently, a queued barrier can't be canceled.  This is because
 * try_to_grab_pending() can't determine whether the work to be
 * grabbed is at the head of the queue and thus can't clear LINKED
 * flag of the previous work while there must be a valid next work
 * after a work with LINKED flag set.
 *
 * Note that when @worker is non-NULL, @target may be modified
 * underneath us, so we can't reliably determine pwq from @target.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void insert_wq_barrier(struct pool_workqueue *pwq,
                              struct wq_barrier *barr,
                              struct work_struct *target, struct worker *worker)
{
        static __maybe_unused struct lock_class_key bh_key, thr_key;
        unsigned int work_flags = 0;
        unsigned int work_color;
        struct list_head *head;

        /*
         * debugobject calls are safe here even with pool->lock locked
         * as we know for sure that this will not trigger any of the
         * checks and call back into the fixup functions where we
         * might deadlock.
         *
         * BH and threaded workqueues need separate lockdep keys to avoid
         * spuriously triggering "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W}
         * usage".
         */
        INIT_WORK_ONSTACK_KEY(&barr->work, wq_barrier_func,
                              (pwq->wq->flags & WQ_BH) ? &bh_key : &thr_key);
        /* the barrier is inserted directly below, so mark it pending by hand */
        __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));

        /* the completion shares @target's lockdep map */
        init_completion_map(&barr->done, &target->lockdep_map);

        /* record the flushing task */
        barr->task = current;

        /* The barrier work item does not participate in nr_active. */
        work_flags |= WORK_STRUCT_INACTIVE;

        /*
         * If @target is currently being executed, schedule the
         * barrier to the worker; otherwise, put it after @target.
         */
        if (worker) {
                head = worker->scheduled.next;
                work_color = worker->current_color;
        } else {
                unsigned long *bits = work_data_bits(target);

                head = target->entry.next;
                /* there can already be other linked works, inherit and set */
                work_flags |= *bits & WORK_STRUCT_LINKED;
                work_color = get_work_color(*bits);
                __set_bit(WORK_STRUCT_LINKED_BIT, bits);
        }

        /* account the barrier as in flight under the color chosen above */
        pwq->nr_in_flight[work_color]++;
        work_flags |= work_color_to_flags(work_color);

        insert_work(pwq, &barr->work, head, work_flags);
}

/**
 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
 * @wq: workqueue being flushed
 * @flush_color: new flush color, < 0 for no-op
 * @work_color: new work color, < 0 for no-op
 *
 * Prepare pwqs for workqueue flushing.
 *
 * If @flush_color is non-negative, flush_color on all pwqs should be
 * -1.  If no pwq has in-flight commands at the specified color, all
 * pwq->flush_color's stay at -1 and %false is returned.  If any pwq
 * has in flight commands, its pwq->flush_color is set to
 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
 * wakeup logic is armed and %true is returned.
 *
 * The caller should have initialized @wq->first_flusher prior to
 * calling this function with non-negative @flush_color.  If
 * @flush_color is negative, no flush color update is done and %false
 * is returned.
 *
 * If @work_color is non-negative, all pwqs should have the same
 * work_color which is previous to @work_color and all will be
 * advanced to @work_color.
 *
 * CONTEXT:
 * mutex_lock(wq->mutex).
 *
 * Return:
 * %true if @flush_color >= 0 and there's something to flush.  %false
 * otherwise.
 */
static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
                                      int flush_color, int work_color)
{
        bool wait = false;
        struct pool_workqueue *pwq;

        if (flush_color >= 0) {
                WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
                /*
                 * Start the count at 1; this extra reference is dropped by
                 * the atomic_dec_and_test() at the end of this function.
                 * NOTE(review): presumably this keeps the count from hitting
                 * zero while pwqs are still being armed below - confirm
                 * against the code which decrements nr_pwqs_to_flush.
                 */
                atomic_set(&wq->nr_pwqs_to_flush, 1);
        }

        for_each_pwq(pwq, wq) {
                struct worker_pool *pool = pwq->pool;

                /* pwq colors are updated under the owning pool's lock */
                raw_spin_lock_irq(&pool->lock);

                if (flush_color >= 0) {
                        WARN_ON_ONCE(pwq->flush_color != -1);

                        /* arm only pwqs that have work in flight at this color */
                        if (pwq->nr_in_flight[flush_color]) {
                                pwq->flush_color = flush_color;
                                atomic_inc(&wq->nr_pwqs_to_flush);
                                wait = true;
                        }
                }

                if (work_color >= 0) {
                        WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
                        pwq->work_color = work_color;
                }

                raw_spin_unlock_irq(&pool->lock);
        }

        /*
         * Drop the initial count taken above. If that brings the count to
         * zero, complete the first flusher right here.
         */
        if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
                complete(&wq->first_flusher->done);

        return wait;
}

/*
 * Acquire and immediately release @wq's lockdep map, with BH disabled around
 * it for WQ_BH workqueues. Empty unless CONFIG_LOCKDEP is enabled.
 */
static void touch_wq_lockdep_map(struct workqueue_struct *wq)
{
#ifdef CONFIG_LOCKDEP
        const bool bh_wq = wq->flags & WQ_BH;

        if (bh_wq)
                local_bh_disable();

        lock_map_acquire(&wq->lockdep_map);
        lock_map_release(&wq->lockdep_map);

        if (bh_wq)
                local_bh_enable();
#endif
}

/*
 * Acquire and immediately release @work's lockdep map, with BH disabled
 * around it when @wq is a WQ_BH workqueue. Empty unless CONFIG_LOCKDEP is
 * enabled.
 */
static void touch_work_lockdep_map(struct work_struct *work,
                                   struct workqueue_struct *wq)
{
#ifdef CONFIG_LOCKDEP
        const bool bh_wq = wq->flags & WQ_BH;

        if (bh_wq)
                local_bh_disable();

        lock_map_acquire(&work->lockdep_map);
        lock_map_release(&work->lockdep_map);

        if (bh_wq)
                local_bh_enable();
#endif
}

/**
 * __flush_workqueue - ensure that any scheduled work has run to completion.
 * @wq: workqueue to flush
 *
 * This function sleeps until all work items which were queued on entry
 * have finished execution, but it is not livelocked by new incoming ones.
 */
void __flush_workqueue(struct workqueue_struct *wq)
{
        /* on-stack flusher; it is only linked into @wq's lists while we wait */
        struct wq_flusher this_flusher = {
                .list = LIST_HEAD_INIT(this_flusher.list),
                .flush_color = -1,
                .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map),
        };
        int next_color;

        /* warn and bail if workqueues aren't online yet */
        if (WARN_ON(!wq_online))
                return;

        touch_wq_lockdep_map(wq);

        mutex_lock(&wq->mutex);

        /*
         * Start-to-wait phase
         */
        next_color = work_next_color(wq->work_color);

        if (next_color != wq->flush_color) {
                /*
                 * Color space is not full.  The current work_color
                 * becomes our flush_color and work_color is advanced
                 * by one.
                 */
                WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
                this_flusher.flush_color = wq->work_color;
                wq->work_color = next_color;

                if (!wq->first_flusher) {
                        /* no flush in progress, become the first flusher */
                        WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);

                        wq->first_flusher = &this_flusher;

                        if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
                                                       wq->work_color)) {
                                /* nothing to flush, done */
                                wq->flush_color = next_color;
                                wq->first_flusher = NULL;
                                goto out_unlock;
                        }
                } else {
                        /* wait in queue */
                        WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
                        list_add_tail(&this_flusher.list, &wq->flusher_queue);
                        /* -1 flush color: only advance the pwqs' work_color */
                        flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
                }
        } else {
                /*
                 * Oops, color space is full, wait on overflow queue.
                 * The next flush completion will assign us
                 * flush_color and transfer to flusher_queue.
                 */
                list_add_tail(&this_flusher.list, &wq->flusher_overflow);
        }

        /* NULL target work: the whole workqueue is being flushed */
        check_flush_dependency(wq, NULL);

        mutex_unlock(&wq->mutex);

        wait_for_completion(&this_flusher.done);

        /*
         * Wake-up-and-cascade phase
         *
         * First flushers are responsible for cascading flushes and
         * handling overflow.  Non-first flushers can simply return.
         */
        if (READ_ONCE(wq->first_flusher) != &this_flusher)
                return;

        mutex_lock(&wq->mutex);

        /* we might have raced, check again with mutex held */
        if (wq->first_flusher != &this_flusher)
                goto out_unlock;

        WRITE_ONCE(wq->first_flusher, NULL);

        WARN_ON_ONCE(!list_empty(&this_flusher.list));
        WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);

        while (true) {
                struct wq_flusher *next, *tmp;

                /* complete all the flushers sharing the current flush color */
                list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
                        if (next->flush_color != wq->flush_color)
                                break;
                        list_del_init(&next->list);
                        complete(&next->done);
                }

                WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
                             wq->flush_color != work_next_color(wq->work_color));

                /* this flush_color is finished, advance by one */
                wq->flush_color = work_next_color(wq->flush_color);

                /* one color has been freed, handle overflow queue */
                if (!list_empty(&wq->flusher_overflow)) {
                        /*
                         * Assign the same color to all overflowed
                         * flushers, advance work_color and append to
                         * flusher_queue.  This is the start-to-wait
                         * phase for these overflowed flushers.
                         */
                        list_for_each_entry(tmp, &wq->flusher_overflow, list)
                                tmp->flush_color = wq->work_color;

                        wq->work_color = work_next_color(wq->work_color);

                        list_splice_tail_init(&wq->flusher_overflow,
                                              &wq->flusher_queue);
                        flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
                }

                /* no flusher left waiting, the cascade is over */
                if (list_empty(&wq->flusher_queue)) {
                        WARN_ON_ONCE(wq->flush_color != wq->work_color);
                        break;
                }

                /*
                 * Need to flush more colors.  Make the next flusher
                 * the new first flusher and arm pwqs.
                 */
                WARN_ON_ONCE(wq->flush_color == wq->work_color);
                WARN_ON_ONCE(wq->flush_color != next->flush_color);

                list_del_init(&next->list);
                wq->first_flusher = next;

                /* pwqs armed with work in flight: the new first flusher takes over */
                if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
                        break;

                /*
                 * Meh... this color is already done, clear first
                 * flusher and repeat cascading.
                 */
                wq->first_flusher = NULL;
        }

out_unlock:
        mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL(__flush_workqueue);

/**
 * drain_workqueue - drain a workqueue
 * @wq: workqueue to drain
 *
 * Wait until the workqueue becomes empty.  While draining is in progress,
 * only chain queueing is allowed.  IOW, only currently pending or running
 * work items on @wq can queue further work items on it.  @wq is flushed
 * repeatedly until it becomes empty.  The number of flushing is determined
 * by the depth of chaining and should be relatively short.  Whine if it
 * takes too long.
 */
void drain_workqueue(struct workqueue_struct *wq)
{
        unsigned int flush_cnt = 0;
        struct pool_workqueue *pwq;

        /*
         * __queue_work() needs to test whether there are drainers, is much
         * hotter than drain_workqueue() and already looks at @wq->flags.
         * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
         */
        mutex_lock(&wq->mutex);
        if (!wq->nr_drainers++)
                wq->flags |= __WQ_DRAINING;
        mutex_unlock(&wq->mutex);
reflush:
        __flush_workqueue(wq);

        mutex_lock(&wq->mutex);

        for_each_pwq(pwq, wq) {
                bool drained;

                raw_spin_lock_irq(&pwq->pool->lock);
                drained = pwq_is_empty(pwq);
                raw_spin_unlock_irq(&pwq->pool->lock);

                if (drained)
                        continue;

                if (++flush_cnt == 10 ||
                    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
                        pr_warn("workqueue %s: %s() isn't complete after %u tries\n",
                                wq->name, __func__, flush_cnt);

                mutex_unlock(&wq->mutex);
                goto reflush;
        }

        if (!--wq->nr_drainers)
                wq->flags &= ~__WQ_DRAINING;
        mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(drain_workqueue);

/*
 * Queue @barr behind @work so that the caller can wait for @work to finish.
 *
 * @work: work item to flush
 * @barr: caller-provided barrier to insert
 * @from_cancel: when %true, the workqueue lockdep map is not touched
 *
 * Returns %true if @barr was inserted and the caller must wait on
 * @barr->done. Returns %false if @work has no pool, is no longer on the pool
 * it points to, or is neither queued nor executing there; @barr is not
 * inserted in that case.
 */
static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
                             bool from_cancel)
{
        struct worker *worker = NULL;
        struct worker_pool *pool;
        struct pool_workqueue *pwq;
        struct workqueue_struct *wq;

        rcu_read_lock();
        pool = get_work_pool(work);
        if (!pool) {
                rcu_read_unlock();
                return false;
        }

        raw_spin_lock_irq(&pool->lock);
        /* see the comment in try_to_grab_pending() with the same code */
        pwq = get_work_pwq(work);
        if (pwq) {
                if (unlikely(pwq->pool != pool))
                        goto already_gone;
        } else {
                /* not queued; it may still be running on one of @pool's workers */
                worker = find_worker_executing_work(pool, work);
                if (!worker)
                        goto already_gone;
                pwq = worker->current_pwq;
        }

        wq = pwq->wq;
        check_flush_dependency(wq, work);

        insert_wq_barrier(pwq, barr, work, worker);
        raw_spin_unlock_irq(&pool->lock);

        touch_work_lockdep_map(work, wq);

        /*
         * Force a lock recursion deadlock when using flush_work() inside a
         * single-threaded or rescuer equipped workqueue.
         *
         * For single threaded workqueues the deadlock happens when the work
         * is after the work issuing the flush_work(). For rescuer equipped
         * workqueues the deadlock happens when the rescuer stalls, blocking
         * forward progress.
         */
        if (!from_cancel && (wq->saved_max_active == 1 || wq->rescuer))
                touch_wq_lockdep_map(wq);

        rcu_read_unlock();
        return true;
already_gone:
        /* nothing to wait for: release the pool lock and the RCU read lock */
        raw_spin_unlock_irq(&pool->lock);
        rcu_read_unlock();
        return false;
}

/*
 * Common implementation behind flush_work() and the synchronous cancel path.
 *
 * @work: work item to wait for
 * @from_cancel: %true when called from __cancel_work_sync()
 *
 * Inserts an on-stack barrier via start_flush_work() and waits for it.
 *
 * Returns %true if a barrier was inserted and waited for. Returns %false if
 * workqueues aren't online, @work has no function set, or start_flush_work()
 * found nothing to wait for.
 */
static bool __flush_work(struct work_struct *work, bool from_cancel)
{
        struct wq_barrier barr;
        unsigned long data;

        if (WARN_ON(!wq_online))
                return false;

        /* a work item with no function set is a caller bug; warn and bail */
        if (WARN_ON(!work->func))
                return false;

        if (!start_flush_work(work, &barr, from_cancel))
                return false;

        /*
         * start_flush_work() returned %true. If @from_cancel is set, we know
         * that @work must have been executing during start_flush_work() and
         * can't currently be queued. Its data must contain OFFQ bits. If @work
         * was queued on a BH workqueue, we also know that it was running in the
         * BH context and thus can be busy-waited.
         */
        data = *work_data_bits(work);
        if (from_cancel &&
            !WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_BH)) {
                /*
                 * On RT, prevent a live lock when %current preempted soft
                 * interrupt processing or prevents ksoftirqd from running by
                 * keeping flipping BH. If the BH work item runs on a different
                 * CPU then this has no effect other than doing the BH
                 * disable/enable dance for nothing. This is copied from
                 * kernel/softirq.c::tasklet_unlock_spin_wait().
                 */
                while (!try_wait_for_completion(&barr.done)) {
                        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                                local_bh_disable();
                                local_bh_enable();
                        } else {
                                cpu_relax();
                        }
                }
        } else {
                /* regular case: sleep until the barrier has run */
                wait_for_completion(&barr.done);
        }

        /* pairs with the on-stack work initialization in insert_wq_barrier() */
        destroy_work_on_stack(&barr.work);
        return true;
}

/**
 * flush_work - wait for a work to finish executing the last queueing instance
 * @work: the work to flush
 *
 * Wait until @work has finished execution.  @work is guaranteed to be idle
 * on return if it hasn't been requeued since flush started.
 *
 * Return:
 * %true if flush_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_work(struct work_struct *work)
{
        bool waited;

        might_sleep();

        /* plain flush, not called from a cancel path */
        waited = __flush_work(work, false);
        return waited;
}
EXPORT_SYMBOL_GPL(flush_work);

/**
 * flush_delayed_work - wait for a dwork to finish executing the last queueing
 * @dwork: the delayed work to flush
 *
 * Delayed timer is cancelled and the pending work is queued for
 * immediate execution.  Like flush_work(), this function only
 * considers the last queueing instance of @dwork.
 *
 * Return:
 * %true if flush_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_delayed_work(struct delayed_work *dwork)
{
        bool waited;

        /*
         * If the timer was still pending, it has now been deleted: queue the
         * work item right away instead, with local IRQs disabled around it.
         */
        local_irq_disable();
        if (del_timer_sync(&dwork->timer))
                __queue_work(dwork->cpu, dwork->wq, &dwork->work);
        local_irq_enable();

        waited = flush_work(&dwork->work);
        return waited;
}
EXPORT_SYMBOL(flush_delayed_work);

/**
 * flush_rcu_work - wait for a rwork to finish executing the last queueing
 * @rwork: the rcu work to flush
 *
 * Return:
 * %true if flush_rcu_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_rcu_work(struct rcu_work *rwork)
{
        /* not pending: behaves exactly like flush_work() */
        if (!test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work)))
                return flush_work(&rwork->work);

        /*
         * Pending: run rcu_barrier() first, then flush. %true is reported
         * regardless of what flush_work() returns.
         */
        rcu_barrier();
        flush_work(&rwork->work);
        return true;
}
EXPORT_SYMBOL(flush_rcu_work);

/*
 * Increment @offqd's disable count. The count saturates at the largest value
 * representable in %WORK_OFFQ_DISABLE_BITS bits; trying to go past that only
 * triggers a one-time warning.
 */
static void work_offqd_disable(struct work_offq_data *offqd)
{
        const unsigned long max = (1lu << WORK_OFFQ_DISABLE_BITS) - 1;

        if (unlikely(offqd->disable >= max)) {
                WARN_ONCE(true, "workqueue: work disable count overflowed\n");
                return;
        }

        offqd->disable++;
}

/*
 * Decrement @offqd's disable count. A count that is already zero is left
 * alone and only triggers a one-time warning.
 */
static void work_offqd_enable(struct work_offq_data *offqd)
{
        if (unlikely(offqd->disable <= 0)) {
                WARN_ONCE(true, "workqueue: work disable count underflowed\n");
                return;
        }

        offqd->disable--;
}

/*
 * Common implementation of the non-waiting cancel and disable operations.
 *
 * @work: work item to cancel
 * @cflags: WORK_CANCEL_* flags; %WORK_CANCEL_DISABLE additionally bumps the
 *          disable count
 *
 * Returns the result of work_grab_pending() converted to bool.
 */
static bool __cancel_work(struct work_struct *work, u32 cflags)
{
        struct work_offq_data state;
        unsigned long irq_flags;
        int grabbed;

        /* the IRQ state saved in irq_flags is restored at the end */
        grabbed = work_grab_pending(work, cflags, &irq_flags);

        /* unpack the off-queue data and update the disable count if asked */
        work_offqd_unpack(&state, *work_data_bits(work));
        if (cflags & WORK_CANCEL_DISABLE)
                work_offqd_disable(&state);

        /* write the result back and drop PENDING */
        set_work_pool_and_clear_pending(work, state.pool_id,
                                        work_offqd_pack_flags(&state));
        local_irq_restore(irq_flags);

        return grabbed;
}

/*
 * Common implementation of the waiting cancel and disable operations.
 *
 * @work is always disabled for the duration of the flush. If the caller did
 * not pass %WORK_CANCEL_DISABLE, it is re-enabled before returning.
 *
 * Returns what __cancel_work() returned.
 */
static bool __cancel_work_sync(struct work_struct *work, u32 cflags)
{
        bool was_pending = __cancel_work(work, cflags | WORK_CANCEL_DISABLE);

        /*
         * A BH work item may be waited for from atomic context, just not
         * from hardirq. Anything else requires a sleepable context.
         */
        if (!(*work_data_bits(work) & WORK_OFFQ_BH))
                might_sleep();
        else
                WARN_ON_ONCE(in_hardirq());

        /*
         * During early boot @work can't be executing, so there is nothing to
         * flush. Skipping __flush_work() then allows canceling that early.
         */
        if (wq_online)
                __flush_work(work, true);

        /* undo the temporary disable unless the caller wanted it */
        if (!(cflags & WORK_CANCEL_DISABLE))
                enable_work(work);

        return was_pending;
}

/*
 * See cancel_delayed_work()
 */
bool cancel_work(struct work_struct *work)
{
        /* no cancel flags: @work is neither delayed work nor disabled */
        bool was_pending = __cancel_work(work, 0);

        return was_pending;
}
EXPORT_SYMBOL(cancel_work);

/**
 * cancel_work_sync - cancel a work and wait for it to finish
 * @work: the work to cancel
 *
 * Cancel @work and wait for its execution to finish. This function can be used
 * even if the work re-queues itself or migrates to another workqueue. On return
 * from this function, @work is guaranteed to be not pending or executing on any
 * CPU as long as there aren't racing enqueues.
 *
 * cancel_work_sync(&delayed_work->work) must not be used for delayed_work's.
 * Use cancel_delayed_work_sync() instead.
 *
 * Must be called from a sleepable context if @work was last queued on a non-BH
 * workqueue. Can also be called from non-hardirq atomic contexts including BH
 * if @work was last queued on a BH workqueue.
 *
 * Returns %true if @work was pending, %false otherwise.
 */
bool cancel_work_sync(struct work_struct *work)
{
        /* no cancel flags: plain work item, re-enabled once the wait is over */
        bool was_pending = __cancel_work_sync(work, 0);

        return was_pending;
}
EXPORT_SYMBOL_GPL(cancel_work_sync);

/**
 * cancel_delayed_work - cancel a delayed work
 * @dwork: delayed_work to cancel
 *
 * Kill off a pending delayed_work.
 *
 * Return: %true if @dwork was pending and canceled; %false if it wasn't
 * pending.
 *
 * Note:
 * The work callback function may still be running on return, unless
 * it returns %true and the work doesn't re-arm itself.  Explicitly flush or
 * use cancel_delayed_work_sync() to wait on it.
 *
 * This function is safe to call from any context including IRQ handler.
 */
bool cancel_delayed_work(struct delayed_work *dwork)
{
        /* WORK_CANCEL_DELAYED marks the embedded work as a delayed work */
        bool was_pending = __cancel_work(&dwork->work, WORK_CANCEL_DELAYED);

        return was_pending;
}
EXPORT_SYMBOL(cancel_delayed_work);

/**
 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
 * @dwork: the delayed work to cancel
 *
 * This is cancel_work_sync() for delayed works.
 *
 * Return:
 * %true if @dwork was pending, %false otherwise.
 */
bool cancel_delayed_work_sync(struct delayed_work *dwork)
{
        /* WORK_CANCEL_DELAYED marks the embedded work as a delayed work */
        bool was_pending = __cancel_work_sync(&dwork->work, WORK_CANCEL_DELAYED);

        return was_pending;
}
EXPORT_SYMBOL(cancel_delayed_work_sync);

/**
 * disable_work - Disable and cancel a work item
 * @work: work item to disable
 *
 * Disable @work by incrementing its disable count and cancel it if currently
 * pending. As long as the disable count is non-zero, any attempt to queue @work
 * will fail and return %false. The maximum supported disable depth is 2 to the
 * power of %WORK_OFFQ_DISABLE_BITS minus 1, currently 65535.
 *
 * Can be called from any context. Returns %true if @work was pending, %false
 * otherwise.
 */
bool disable_work(struct work_struct *work)
{
        /* WORK_CANCEL_DISABLE bumps the disable count on top of canceling */
        bool was_pending = __cancel_work(work, WORK_CANCEL_DISABLE);

        return was_pending;
}
EXPORT_SYMBOL_GPL(disable_work);

/**
 * disable_work_sync - Disable, cancel and drain a work item
 * @work: work item to disable
 *
 * Similar to disable_work() but also wait for @work to finish if currently
 * executing.
 *
 * Must be called from a sleepable context if @work was last queued on a non-BH
 * workqueue. Can also be called from non-hardirq atomic contexts including BH
 * if @work was last queued on a BH workqueue.
 *
 * Returns %true if @work was pending, %false otherwise.
 */
bool disable_work_sync(struct work_struct *work)
{
        /* WORK_CANCEL_DISABLE: @work stays disabled after the wait */
        bool was_pending = __cancel_work_sync(work, WORK_CANCEL_DISABLE);

        return was_pending;
}
EXPORT_SYMBOL_GPL(disable_work_sync);

/**
 * enable_work - Enable a work item
 * @work: work item to enable
 *
 * Undo disable_work[_sync]() by decrementing @work's disable count. @work can
 * only be queued if its disable count is 0.
 *
 * Can be called from any context. Returns %true if the disable count reached 0.
 * Otherwise, %false.
 */
bool enable_work(struct work_struct *work)
{
        struct work_offq_data offqd;
        unsigned long irq_flags;

        work_grab_pending(work, 0, &irq_flags);

        work_offqd_unpack(&offqd, *work_data_bits(work));
        work_offqd_enable(&offqd);
        set_work_pool_and_clear_pending(work, offqd.pool_id,
                                        work_offqd_pack_flags(&offqd));
        local_irq_restore(irq_flags);

        return !offqd.disable;
}
EXPORT_SYMBOL_GPL(enable_work);

/**
 * disable_delayed_work - Disable and cancel a delayed work item
 * @dwork: delayed work item to disable
 *
 * disable_work() for delayed work items.
 */
bool disable_delayed_work(struct delayed_work *dwork)
{
        /* delayed work variant: cancel and bump the disable count */
        bool was_pending = __cancel_work(&dwork->work,
                                         WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE);

        return was_pending;
}
EXPORT_SYMBOL_GPL(disable_delayed_work);

/**
 * disable_delayed_work_sync - Disable, cancel and drain a delayed work item
 * @dwork: delayed work item to disable
 *
 * disable_work_sync() for delayed work items.
 */
bool disable_delayed_work_sync(struct delayed_work *dwork)
{
        /* delayed work variant: cancel, disable and wait */
        bool was_pending = __cancel_work_sync(&dwork->work,
                                              WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE);

        return was_pending;
}
EXPORT_SYMBOL_GPL(disable_delayed_work_sync);

/**
 * enable_delayed_work - Enable a delayed work item
 * @dwork: delayed work item to enable
 *
 * enable_work() for delayed work items.
 */
bool enable_delayed_work(struct delayed_work *dwork)
{
        /* the disable count lives in the embedded work item */
        bool now_enabled = enable_work(&dwork->work);

        return now_enabled;
}
EXPORT_SYMBOL_GPL(enable_delayed_work);

/**
 * schedule_on_each_cpu - execute a function synchronously on each online CPU
 * @func: the function to call
 *
 * schedule_on_each_cpu() executes @func on each online CPU using the
 * system workqueue and blocks until all CPUs have completed.
 * schedule_on_each_cpu() is very slow.
 *
 * Return:
 * 0 on success, -errno on failure.
 */
int schedule_on_each_cpu(work_func_t func)
{
        struct work_struct __percpu *pcpu_works;
        int cpu;

        /* one work item per possible CPU; only the online ones get used */
        pcpu_works = alloc_percpu(struct work_struct);
        if (!pcpu_works)
                return -ENOMEM;

        /* the CPU hotplug read lock is held across queueing and flushing */
        cpus_read_lock();

        for_each_online_cpu(cpu) {
                struct work_struct *item = per_cpu_ptr(pcpu_works, cpu);

                INIT_WORK(item, func);
                schedule_work_on(cpu, item);
        }

        /* wait for every queued item before the storage is freed */
        for_each_online_cpu(cpu) {
                struct work_struct *item = per_cpu_ptr(pcpu_works, cpu);

                flush_work(item);
        }

        cpus_read_unlock();

        free_percpu(pcpu_works);
        return 0;
}

/**
 * execute_in_process_context - reliably execute the routine with user context
 * @fn:                the function to execute
 * @ew:                guaranteed storage for the execute work structure (must
 *                be available when the work executes)
 *
 * Executes the function immediately if process context is available,
 * otherwise schedules the function for delayed execution.
 *
 * Return:        0 - function was executed
 *                1 - function was scheduled for execution
 */
int execute_in_process_context(work_func_t fn, struct execute_work *ew)
{
        /* in interrupt context: defer @fn to the system workqueue */
        if (in_interrupt()) {
                INIT_WORK(&ew->work, fn);
                schedule_work(&ew->work);
                return 1;
        }

        /* otherwise call @fn directly */
        fn(&ew->work);
        return 0;
}
EXPORT_SYMBOL_GPL(execute_in_process_context);

/**
 * free_workqueue_attrs - free a workqueue_attrs
 * @attrs: workqueue_attrs to free
 *
 * Undo alloc_workqueue_attrs().
 */
void free_workqueue_attrs(struct workqueue_attrs *attrs)
{
        /* freeing NULL is a no-op, which error paths rely on */
        if (!attrs)
                return;

        free_cpumask_var(attrs->cpumask);
        free_cpumask_var(attrs->__pod_cpumask);
        kfree(attrs);
}

/**
 * alloc_workqueue_attrs - allocate a workqueue_attrs
 *
 * Allocate a new workqueue_attrs, initialize with default settings and
 * return it.
 *
 * Return: The allocated new workqueue_attr on success. %NULL on failure.
 */
struct workqueue_attrs *alloc_workqueue_attrs(void)
{
        struct workqueue_attrs *attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);

        if (!attrs)
                return NULL;

        /*
         * Both cpumasks are needed. The || stops at the first failure and
         * free_workqueue_attrs() releases whatever was allocated until then.
         */
        if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL) ||
            !alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL)) {
                free_workqueue_attrs(attrs);
                return NULL;
        }

        /* defaults: every possible CPU allowed, default affinity scope */
        cpumask_copy(attrs->cpumask, cpu_possible_mask);
        attrs->affn_scope = WQ_AFFN_DFL;

        return attrs;
}

/* copy all fields of @from to @to; the cpumasks are copied into @to's masks */
static void copy_workqueue_attrs(struct workqueue_attrs *to,
                                 const struct workqueue_attrs *from)
{
        to->nice = from->nice;
        to->affn_strict = from->affn_strict;

        /*
         * The wq-only fields are copied as well. Hashing and the equality
         * test skip them, but this helper serves both pool and wq attrs;
         * get_unbound_pool() clears them explicitly for pools.
         */
        to->affn_scope = from->affn_scope;
        to->ordered = from->ordered;

        cpumask_copy(to->cpumask, from->cpumask);
        cpumask_copy(to->__pod_cpumask, from->__pod_cpumask);
}

/*
 * Reset the fields of @attrs which only carry meaning for workqueues so that
 * @attrs can describe a worker_pool. See the comments in
 * 'struct workqueue_attrs' definition.
 */
static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs)
{
        /* with strict affinity ->cpumask is reset to all possible CPUs */
        if (attrs->affn_strict)
                cpumask_copy(attrs->cpumask, cpu_possible_mask);

        attrs->ordered = false;
        attrs->affn_scope = WQ_AFFN_NR_TYPES;
}

/* hash value of the content of @attr */
static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
{
        u32 hash = 0;

        hash = jhash_1word(attrs->nice, hash);
        hash = jhash_1word(attrs->affn_strict, hash);
        hash = jhash(cpumask_bits(attrs->__pod_cpumask),
                     BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
        if (!attrs->affn_strict)
                hash = jhash(cpumask_bits(attrs->cpumask),
                             BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
        return hash;
}

/*
 * Content equality test. ->cpumask is compared only when affinity is not
 * strict; ->affn_scope and ->ordered are not compared at all.
 */
static bool wqattrs_equal(const struct workqueue_attrs *a,
                          const struct workqueue_attrs *b)
{
        return a->nice == b->nice &&
               a->affn_strict == b->affn_strict &&
               cpumask_equal(a->__pod_cpumask, b->__pod_cpumask) &&
               (a->affn_strict || cpumask_equal(a->cpumask, b->cpumask));
}

/*
 * Update @attrs->cpumask to the CPUs actually available: its intersection
 * with @unbound_cpumask or, when the two don't overlap, @unbound_cpumask
 * itself.
 */
static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs,
                                      const cpumask_t *unbound_cpumask)
{
        cpumask_t *effective = attrs->cpumask;

        cpumask_and(effective, effective, unbound_cpumask);
        if (likely(!cpumask_empty(effective)))
                return;

        /* no overlap at all, fall back to @unbound_cpumask */
        cpumask_copy(effective, unbound_cpumask);
}

/*
 * Find the wq_pod_type to use for @attrs. %WQ_AFFN_DFL resolves to the
 * current wq_affn_dfl. When the selected type has no pods, or @attrs carries
 * the invalid %WQ_AFFN_NR_TYPES scope (which also warns once), the
 * %WQ_AFFN_SYSTEM type is returned instead.
 */
static const struct wq_pod_type *
wqattrs_pod_type(const struct workqueue_attrs *attrs)
{
        enum wq_affn_scope scope;
        struct wq_pod_type *pt;

        /* to synchronize access to wq_affn_dfl */
        lockdep_assert_held(&wq_pool_mutex);

        scope = attrs->affn_scope == WQ_AFFN_DFL ? wq_affn_dfl :
                                                   attrs->affn_scope;
        pt = &wq_pod_types[scope];

        if (WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) ||
            unlikely(!pt->nr_pods)) {
                /*
                 * Before workqueue_init_topology(), only SYSTEM is available
                 * which is initialized in workqueue_init_early().
                 */
                pt = &wq_pod_types[WQ_AFFN_SYSTEM];
                BUG_ON(!pt->nr_pods);
        }

        return pt;
}

/**
 * init_worker_pool - initialize a newly zalloc'd worker_pool
 * @pool: worker_pool to initialize
 *
 * Set up every field inside @pool proper and then allocate @pool->attrs,
 * which is cleared of its workqueue-only fields.
 *
 * Return: 0 on success, -errno on failure.  Even on failure, all fields
 * inside @pool proper are initialized and put_unbound_pool() can be called
 * on @pool safely to release it.
 */
static int init_worker_pool(struct worker_pool *pool)
{
        /* identity: no id, CPU or node assigned yet, one initial reference */
        pool->id = -1;
        pool->cpu = -1;
        pool->node = NUMA_NO_NODE;
        pool->refcnt = 1;
        pool->flags |= POOL_DISASSOCIATED;
        pool->watchdog_ts = jiffies;

        /* lock and containers */
        raw_spin_lock_init(&pool->lock);
        INIT_LIST_HEAD(&pool->worklist);
        INIT_LIST_HEAD(&pool->idle_list);
        INIT_LIST_HEAD(&pool->workers);
        INIT_LIST_HEAD(&pool->dying_workers);
        INIT_HLIST_NODE(&pool->hash_node);
        hash_init(pool->busy_hash);
        ida_init(&pool->worker_ida);

        /* timers and the idle cull work item */
        timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
        timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
        INIT_WORK(&pool->idle_cull_work, idle_cull_fn);

        /* nothing above is expected to fail; the allocation below can */
        pool->attrs = alloc_workqueue_attrs();
        if (!pool->attrs)
                return -ENOMEM;

        wqattrs_clear_for_pool(pool->attrs);
        return 0;
}

#ifdef CONFIG_LOCKDEP
/*
 * Register @wq->key and initialize @wq->lockdep_map under the name
 * "(wq_completion)<wq name>". If that name can't be allocated, @wq->name is
 * used as the lock name directly.
 */
static void wq_init_lockdep(struct workqueue_struct *wq)
{
        lockdep_register_key(&wq->key);

        wq->lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name);
        if (!wq->lock_name)
                wq->lock_name = wq->name;

        lockdep_init_map(&wq->lockdep_map, wq->lock_name, &wq->key, 0);
}

/* unregister @wq->key, the lockdep key registered by wq_init_lockdep() */
static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
        lockdep_unregister_key(&wq->key);
}

/*
 * Free @wq->lock_name. When it merely aliases @wq->name, which is the
 * fallback wq_init_lockdep() uses on allocation failure, there is nothing to
 * free.
 */
static void wq_free_lockdep(struct workqueue_struct *wq)
{
        if (wq->lock_name == wq->name)
                return;

        kfree(wq->lock_name);
}
#else
/* !CONFIG_LOCKDEP: there is no lockdep state to set up, all three are no-ops */
static void wq_init_lockdep(struct workqueue_struct *wq)
{
}

static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
}

static void wq_free_lockdep(struct workqueue_struct *wq)
{
}
#endif

/*
 * Free every entry of @nna_ar, i.e. one per node plus the fallback entry at
 * [nr_node_ids], and reset the slots to NULL.
 */
static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
{
        int nid;

        /* the fallback slot sits right past the per-node entries */
        kfree(nna_ar[nr_node_ids]);
        nna_ar[nr_node_ids] = NULL;

        for_each_node(nid) {
                kfree(nna_ar[nid]);
                nna_ar[nid] = NULL;
        }
}

/* initialize @nna: no active count, empty pending list, max at the default min */
static void init_node_nr_active(struct wq_node_nr_active *nna)
{
        raw_spin_lock_init(&nna->lock);
        INIT_LIST_HEAD(&nna->pending_pwqs);
        atomic_set(&nna->nr, 0);
        nna->max = WQ_DFL_MIN_ACTIVE;
}

/*
 * Allocate and initialize one wq_node_nr_active per node plus a fallback
 * entry stored at [nr_node_ids]. Each node's nr_active counter will be
 * accessed mostly from its own node and should be allocated in the node.
 *
 * Returns 0 on success. On allocation failure all entries of @nna_ar are
 * freed and -ENOMEM is returned.
 */
static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)
{
        struct wq_node_nr_active *nna;
        int node;

        for_each_node(node) {
                nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node);
                if (!nna) {
                        free_node_nr_active(nna_ar);
                        return -ENOMEM;
                }
                init_node_nr_active(nna);
                nna_ar[node] = nna;
        }

        /* [nr_node_ids] is used as the fallback */
        nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE);
        if (!nna) {
                free_node_nr_active(nna_ar);
                return -ENOMEM;
        }
        init_node_nr_active(nna);
        nna_ar[nr_node_ids] = nna;

        return 0;
}

/* RCU callback: free the workqueue embedding @rcu and everything it owns */
static void rcu_free_wq(struct rcu_head *rcu)
{
        struct workqueue_struct *wq;

        wq = container_of(rcu, struct workqueue_struct, rcu);

        /* release what hangs off @wq first, @wq itself goes last */
        free_workqueue_attrs(wq->unbound_attrs);
        free_percpu(wq->cpu_pwq);
        wq_free_lockdep(wq);
        if (wq->flags & WQ_UNBOUND)
                free_node_nr_active(wq->node_nr_active);

        kfree(wq);
}

/* RCU callback: free the worker_pool embedding @rcu, its attrs and worker ida */
static void rcu_free_pool(struct rcu_head *rcu)
{
        struct worker_pool *pool;

        pool = container_of(rcu, struct worker_pool, rcu);

        free_workqueue_attrs(pool->attrs);
        ida_destroy(&pool->worker_ida);
        kfree(pool);
}

/**
 * put_unbound_pool - put a worker_pool
 * @pool: worker_pool to put
 *
 * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
 * safe manner.  get_unbound_pool() calls this function on its failure path
 * and this function should be able to release pools which went through,
 * successfully or not, init_worker_pool().
 *
 * Destruction removes @pool from worker_pool_idr and the pool hash, takes
 * over the manager role, marks all idle workers as dying, waits for any
 * remaining workers to detach, shuts down the timers and the idle cull work
 * and finally queues @pool for RCU free.
 *
 * Should be called with wq_pool_mutex held.
 */
static void put_unbound_pool(struct worker_pool *pool)
{
        DECLARE_COMPLETION_ONSTACK(detach_completion);
        struct worker *worker;
        LIST_HEAD(cull_list);

        lockdep_assert_held(&wq_pool_mutex);

        /* nothing more to do unless this was the last reference */
        if (--pool->refcnt)
                return;

        /* sanity checks: must not be a per-cpu pool and must have no work */
        if (WARN_ON(!(pool->cpu < 0)) ||
            WARN_ON(!list_empty(&pool->worklist)))
                return;

        /* release id and unhash; id stays -1 if it was never assigned */
        if (pool->id >= 0)
                idr_remove(&worker_pool_idr, pool->id);
        hash_del(&pool->hash_node);

        /*
         * Become the manager and destroy all workers.  This prevents
         * @pool's workers from blocking on attach_mutex.  We're the last
         * manager and @pool gets freed with the flag set.
         *
         * Having a concurrent manager is quite unlikely to happen as we can
         * only get here with
         *   pwq->refcnt == pool->refcnt == 0
         * which implies no work queued to the pool, which implies no worker can
         * become the manager. However a worker could have taken the role of
         * manager before the refcnts dropped to 0, since maybe_create_worker()
         * drops pool->lock
         */
        while (true) {
                rcuwait_wait_event(&manager_wait,
                                   !(pool->flags & POOL_MANAGER_ACTIVE),
                                   TASK_UNINTERRUPTIBLE);

                /*
                 * Recheck under both locks. The loop is left with
                 * wq_pool_attach_mutex and pool->lock held.
                 */
                mutex_lock(&wq_pool_attach_mutex);
                raw_spin_lock_irq(&pool->lock);
                if (!(pool->flags & POOL_MANAGER_ACTIVE)) {
                        pool->flags |= POOL_MANAGER_ACTIVE;
                        break;
                }
                raw_spin_unlock_irq(&pool->lock);
                mutex_unlock(&wq_pool_attach_mutex);
        }

        /* mark every idle worker as dying and collect them on cull_list */
        while ((worker = first_idle_worker(pool)))
                set_worker_dying(worker, &cull_list);
        WARN_ON(pool->nr_workers || pool->nr_idle);
        raw_spin_unlock_irq(&pool->lock);

        wake_dying_workers(&cull_list);

        /*
         * If any worker is still on ->workers or ->dying_workers, publish the
         * on-stack completion and wait on it below.
         * NOTE(review): presumably the last worker to detach completes it;
         * the completing side is not in this function.
         */
        if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))
                pool->detach_completion = &detach_completion;
        mutex_unlock(&wq_pool_attach_mutex);

        if (pool->detach_completion)
                wait_for_completion(pool->detach_completion);

        /* shut down the timers */
        del_timer_sync(&pool->idle_timer);
        cancel_work_sync(&pool->idle_cull_work);
        del_timer_sync(&pool->mayday_timer);

        /* RCU protected to allow dereferences from get_work_pool() */
        call_rcu(&pool->rcu, rcu_free_pool);
}

/**
 * get_unbound_pool - get a worker_pool with the specified attributes
 * @attrs: the attributes of the worker_pool to get
 *
 * Obtain a worker_pool which has the same attributes as @attrs, bump the
 * reference count and return it.  If there already is a matching
 * worker_pool, it will be used; otherwise, this function attempts to
 * create a new one.
 *
 * A new pool is allocated on the NUMA node whose pod contains
 * @attrs->__pod_cpumask, or without node preference if there is no such pod.
 *
 * Should be called with wq_pool_mutex held.
 *
 * Return: On success, a worker_pool with the same attributes as @attrs.
 * On failure, %NULL.
 */
static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
{
        struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA];
        u32 hash = wqattrs_hash(attrs);
        struct worker_pool *pool;
        int pod, node = NUMA_NO_NODE;

        lockdep_assert_held(&wq_pool_mutex);

        /* do we already have a matching pool? */
        hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
                if (wqattrs_equal(pool->attrs, attrs)) {
                        pool->refcnt++;
                        return pool;
                }
        }

        /* If __pod_cpumask is contained inside a NUMA pod, that's our node */
        for (pod = 0; pod < pt->nr_pods; pod++) {
                if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) {
                        node = pt->pod_node[pod];
                        break;
                }
        }

        /* nope, create a new one */
        pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node);
        if (!pool || init_worker_pool(pool) < 0)
                goto fail;

        /* take over @attrs, minus the fields which are workqueue-only */
        pool->node = node;
        copy_workqueue_attrs(pool->attrs, attrs);
        wqattrs_clear_for_pool(pool->attrs);

        if (worker_pool_assign_id(pool) < 0)
                goto fail;

        /* create and start the initial worker */
        if (wq_online && !create_worker(pool))
                goto fail;

        /* install */
        hash_add(unbound_pool_hash, &pool->hash_node, hash);

        return pool;
fail:
        /*
         * init_worker_pool() leaves refcnt at 1 even when it fails, so this
         * put drops the only reference and tears the pool down.
         */
        if (pool)
                put_unbound_pool(pool);
        return NULL;
}

/* RCU callback: return the pool_workqueue embedding @rcu to pwq_cache */
static void rcu_free_pwq(struct rcu_head *rcu)
{
        struct pool_workqueue *pwq;

        pwq = container_of(rcu, struct pool_workqueue, rcu);
        kmem_cache_free(pwq_cache, pwq);
}

/*
 * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero
 * refcnt and needs to be destroyed.
 *
 * Unlinks @pwq from its workqueue, drops the pool reference for unbound
 * workqueues, removes @pwq from its node's pending list and frees it through
 * RCU. If @pwq was the last one linked to the workqueue, the workqueue itself
 * is scheduled for RCU free as well.
 */
static void pwq_release_workfn(struct kthread_work *work)
{
        struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
                                                  release_work);
        struct workqueue_struct *wq = pwq->wq;
        struct worker_pool *pool = pwq->pool;
        bool is_last = false;

        /*
         * When @pwq is not linked, it doesn't hold any reference to the
         * @wq, and @wq is invalid to access.
         */
        if (!list_empty(&pwq->pwqs_node)) {
                mutex_lock(&wq->mutex);
                list_del_rcu(&pwq->pwqs_node);
                is_last = list_empty(&wq->pwqs);

                /*
                 * For ordered workqueue with a plugged dfl_pwq, restart it now.
                 */
                if (!is_last && (wq->flags & __WQ_ORDERED))
                        unplug_oldest_pwq(wq);

                mutex_unlock(&wq->mutex);
        }

        /* only unbound workqueues hold a reference on their pool */
        if (wq->flags & WQ_UNBOUND) {
                mutex_lock(&wq_pool_mutex);
                put_unbound_pool(pool);
                mutex_unlock(&wq_pool_mutex);
        }

        /*
         * Take @pwq off its node's pending list, under that node's lock.
         * NOTE(review): pwq->pool is dereferenced after put_unbound_pool()
         * above may have queued the pool for RCU free; confirm that this
         * access is safe.
         */
        if (!list_empty(&pwq->pending_node)) {
                struct wq_node_nr_active *nna =
                        wq_node_nr_active(pwq->wq, pwq->pool->node);

                raw_spin_lock_irq(&nna->lock);
                list_del_init(&pwq->pending_node);
                raw_spin_unlock_irq(&nna->lock);
        }

        call_rcu(&pwq->rcu, rcu_free_pwq);

        /*
         * If we're the last pwq going away, @wq is already dead and no one
         * is gonna access it anymore.  Schedule RCU free.
         */
        if (is_last) {
                wq_unregister_lockdep(wq);
                call_rcu(&wq->rcu, rcu_free_wq);
        }
}

/*
 * Initialize newly allocated @pwq which is associated with @wq and @pool.
 * @pwq is zeroed first; it starts with one reference and is not linked on
 * any list.
 */
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
                     struct worker_pool *pool)
{
        /* @pwq's address must have no bits set outside WORK_STRUCT_PWQ_MASK */
        BUG_ON((unsigned long)pwq & ~WORK_STRUCT_PWQ_MASK);

        memset(pwq, 0, sizeof(*pwq));

        pwq->wq = wq;
        pwq->pool = pool;
        pwq->refcnt = 1;
        pwq->flush_color = -1;

        INIT_LIST_HEAD(&pwq->pwqs_node);
        INIT_LIST_HEAD(&pwq->mayday_node);
        INIT_LIST_HEAD(&pwq->pending_node);
        INIT_LIST_HEAD(&pwq->inactive_works);

        kthread_init_work(&pwq->release_work, pwq_release_workfn);
}

/*
 * Sync @pwq with the current state of its associated wq and link it at the
 * tail of the wq's pwq list. Must be called with the wq's mutex held.
 */
static void link_pwq(struct pool_workqueue *pwq)
{
        struct workqueue_struct *wq = pwq->wq;

        lockdep_assert_held(&wq->mutex);

        /* may be called multiple times; only a not yet linked pwq is linked */
        if (list_empty(&pwq->pwqs_node)) {
                /* set the matching work_color */
                pwq->work_color = wq->work_color;
                list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs);
        }
}

/*
 * Obtain a pool matching @attrs and create a pwq associating the pool and
 * @wq. The pwq is allocated on the pool's node. Returns NULL on failure, in
 * which case the pool reference is dropped again.
 */
static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
                                        const struct workqueue_attrs *attrs)
{
        struct pool_workqueue *pwq;
        struct worker_pool *pool;

        lockdep_assert_held(&wq_pool_mutex);

        pool = get_unbound_pool(attrs);
        if (!pool)
                return NULL;

        pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
        if (pwq) {
                init_pwq(pwq, wq, pool);
                return pwq;
        }

        put_unbound_pool(pool);
        return NULL;
}

/**
 * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod
 * @attrs: the wq_attrs of the default pwq of the target workqueue
 * @cpu: the target CPU
 * @cpu_going_down: if >= 0, the CPU to consider as offline
 *
 * Calculate the cpumask a workqueue with @attrs should use on @pod. If
 * @cpu_going_down is >= 0, that cpu is considered offline during calculation.
 * The result is stored in @attrs->__pod_cpumask.
 *
 * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled
 * and @pod has online CPUs requested by @attrs, the returned cpumask is the
 * intersection of the possible CPUs of @pod and @attrs->cpumask.
 *
 * The caller is responsible for ensuring that the cpumask of @pod stays stable.
 */
static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu,
                                int cpu_going_down)
{
        const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
        int pod = pt->cpu_pod[cpu];

        /* does @pod have any online CPUs @attrs wants? */
        cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask);
        cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask);
        if (cpu_going_down >= 0)
                cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask);

        if (cpumask_empty(attrs->__pod_cpumask)) {
                cpumask_copy(attrs->__pod_cpumask, attrs->cpumask);
                return;
        }

        /* yeap, return possible CPUs in @pod that @attrs wants */
        cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]);

        if (cpumask_empty(attrs->__pod_cpumask))
                pr_warn_once("WARNING: workqueue cpumask: online intersect > "
                                "possible intersect\n");
}

/*
 * Link @pwq to @wq and store it in the slot for @cpu (@cpu < 0 selects the
 * dfl_pwq slot). Returns the pwq which previously occupied the slot.
 */
static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
                                        int cpu, struct pool_workqueue *pwq)
{
        struct pool_workqueue __rcu **slot;
        struct pool_workqueue *prev;

        lockdep_assert_held(&wq_pool_mutex);
        lockdep_assert_held(&wq->mutex);

        slot = unbound_pwq_slot(wq, cpu);

        /* link_pwq() can handle duplicate calls */
        link_pwq(pwq);

        prev = rcu_access_pointer(*slot);
        rcu_assign_pointer(*slot, pwq);

        return prev;
}

/*
 * Context to store the prepared attrs & pwqs before applying.
 *
 * apply_wqattrs_prepare() fills in the new pwqs and apply_wqattrs_commit()
 * swaps them with the installed ones, so that after commit @dfl_pwq and
 * @pwq_tbl hold the previous pwqs for apply_wqattrs_cleanup() to put.
 */
struct apply_wqattrs_ctx {
        struct workqueue_struct        *wq;                /* target workqueue */
        struct workqueue_attrs        *attrs;                /* attrs to apply */
        struct list_head        list;                /* queued for batching commit */
        struct pool_workqueue        *dfl_pwq;        /* default pwq */
        struct pool_workqueue        *pwq_tbl[];        /* pwqs indexed by CPU, nr_cpu_ids entries */
};

/*
 * Free the resources after success or abort: put every pwq held by @ctx,
 * free its attrs and then @ctx itself. A NULL @ctx is a no-op.
 */
static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
{
        int cpu;

        if (!ctx)
                return;

        for_each_possible_cpu(cpu)
                put_pwq_unlocked(ctx->pwq_tbl[cpu]);
        put_pwq_unlocked(ctx->dfl_pwq);

        free_workqueue_attrs(ctx->attrs);

        kfree(ctx);
}

/*
 * Allocate the attrs and pwqs for later installation.
 *
 * Builds a context holding a default pwq plus one pwq per possible CPU for
 * @wq according to @attrs restricted to @unbound_cpumask. For ordered attrs
 * every CPU shares the default pwq. Nothing is installed here; that is
 * apply_wqattrs_commit()'s job.
 *
 * Returns the context, ERR_PTR(-EINVAL) when @attrs->affn_scope is out of
 * range, or ERR_PTR(-ENOMEM) on allocation failure.
 */
static struct apply_wqattrs_ctx *
apply_wqattrs_prepare(struct workqueue_struct *wq,
                      const struct workqueue_attrs *attrs,
                      const cpumask_var_t unbound_cpumask)
{
        struct apply_wqattrs_ctx *ctx;
        struct workqueue_attrs *new_attrs;
        int cpu;

        lockdep_assert_held(&wq_pool_mutex);

        if (WARN_ON(attrs->affn_scope < 0 ||
                    attrs->affn_scope >= WQ_AFFN_NR_TYPES))
                return ERR_PTR(-EINVAL);

        /* zeroed, so unset pwq slots and ->attrs read as NULL in cleanup */
        ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL);

        new_attrs = alloc_workqueue_attrs();
        if (!ctx || !new_attrs)
                goto out_free;

        /*
         * If something goes wrong during CPU up/down, we'll fall back to
         * the default pwq covering whole @attrs->cpumask.  Always create
         * it even if we don't use it immediately.
         */
        copy_workqueue_attrs(new_attrs, attrs);
        wqattrs_actualize_cpumask(new_attrs, unbound_cpumask);
        cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
        ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
        if (!ctx->dfl_pwq)
                goto out_free;

        for_each_possible_cpu(cpu) {
                if (new_attrs->ordered) {
                        /* ordered: every CPU shares dfl_pwq, one ref per slot */
                        ctx->dfl_pwq->refcnt++;
                        ctx->pwq_tbl[cpu] = ctx->dfl_pwq;
                } else {
                        /* rewrites new_attrs->__pod_cpumask for @cpu's pod */
                        wq_calc_pod_cpumask(new_attrs, cpu, -1);
                        ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs);
                        if (!ctx->pwq_tbl[cpu])
                                goto out_free;
                }
        }

        /* save the user configured attrs and sanitize it. */
        copy_workqueue_attrs(new_attrs, attrs);
        cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
        cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
        ctx->attrs = new_attrs;

        /*
         * For initialized ordered workqueues, there should only be one pwq
         * (dfl_pwq). Set the plugged flag of ctx->dfl_pwq to suspend execution
         * of newly queued work items until execution of older work items in
         * the old pwq's have completed.
         */
        if ((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))
                ctx->dfl_pwq->plugged = true;

        ctx->wq = wq;
        return ctx;

out_free:
        /*
         * ctx->attrs is only assigned on the success path, so @new_attrs has
         * to be freed separately here. Both callees accept NULL.
         */
        free_workqueue_attrs(new_attrs);
        apply_wqattrs_cleanup(ctx);
        return ERR_PTR(-ENOMEM);
}

/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
{
        int cpu;

        /* all pwqs have been created successfully, let's install'em */
        mutex_lock(&ctx->wq->mutex);

        copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);

        /* save the previous pwqs and install the new ones */
        for_each_possible_cpu(cpu)
                ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu,
                                                        ctx->pwq_tbl[cpu]);
        ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq);

        /* update node_nr_active->max */
        wq_update_node_max_active(ctx->wq, -1);

        /* rescuer needs to respect wq cpumask changes */
        if (ctx->wq->rescuer)
                set_cpus_allowed_ptr(ctx->wq->rescuer->task,
                                     unbound_effective_cpumask(ctx->wq));

        mutex_unlock(&ctx->wq->mutex);
}

/*
 * Apply @attrs to the unbound workqueue @wq: prepare a context against
 * wq_unbound_cpumask, commit it and release what the context holds
 * afterwards. Returns 0 on success, -EINVAL if @wq is not unbound, or the
 * error reported by apply_wqattrs_prepare().
 */
static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
                                        const struct workqueue_attrs *attrs)
{
        struct apply_wqattrs_ctx *ctx;

        /* only unbound workqueues can change attributes */
        if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
                return -EINVAL;

        ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);
        if (!IS_ERR(ctx)) {
                /* the ctx has been prepared successfully, let's commit it */
                apply_wqattrs_commit(ctx);
                apply_wqattrs_cleanup(ctx);
                return 0;
        }

        return PTR_ERR(ctx);
}

/**
 * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
 * @wq: the target workqueue
 * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
 *
 * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps
 * a separate pwq to each CPU pod with possible CPUs in @attrs->cpumask so that
 * work items are affine to the pod it was issued on. Older pwqs are released as
 * in-flight work items finish. Note that a work item which repeatedly requeues
 * itself back-to-back will stay on its current pwq.
 *
 * Takes wq_pool_mutex around the update and performs GFP_KERNEL allocations.
 *
 * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock().
 *
 * Return: 0 on success and -errno on failure.
 */
int apply_workqueue_attrs(struct workqueue_struct *wq,
                          const struct workqueue_attrs *attrs)
{
        int err;

        lockdep_assert_cpus_held();

        mutex_lock(&wq_pool_mutex);
        err = apply_workqueue_attrs_locked(wq, attrs);
        mutex_unlock(&wq_pool_mutex);

        return err;
}

/**
 * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug
 * @wq: the target workqueue
 * @cpu: the CPU to update pool association for
 * @hotplug_cpu: the CPU coming up or going down
 * @online: true if @hotplug_cpu is coming up, false if it is going down
 *
 * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
 * %CPU_DOWN_FAILED.  @hotplug_cpu is being hot[un]plugged, update pod
 * affinity of @wq for @cpu accordingly. Nothing is done for workqueues which
 * are not unbound or which are ordered.
 *
 * If pod affinity can't be adjusted due to memory allocation failure, it falls
 * back to @wq->dfl_pwq which may not be optimal but is always correct.
 *
 * Note that when the last allowed CPU of a pod goes offline for a workqueue
 * with a cpumask spanning multiple pods, the workers which were already
 * executing the work items for the workqueue will lose their CPU affinity and
 * may execute on any CPU. This is similar to how per-cpu workqueues behave on
 * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's
 * responsibility to flush the work item from CPU_DOWN_PREPARE.
 */
static void wq_update_pod(struct workqueue_struct *wq, int cpu,
                          int hotplug_cpu, bool online)
{
        struct workqueue_attrs *target_attrs;
        struct pool_workqueue *old_pwq, *pwq;
        int off_cpu;

        lockdep_assert_held(&wq_pool_mutex);

        if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered)
                return;

        /* a CPU on its way down is treated as offline in the calculation */
        off_cpu = online ? -1 : hotplug_cpu;

        /*
         * We don't wanna alloc/free wq_attrs for each wq for each CPU.
         * Let's use a preallocated one.  The following buf is protected by
         * CPU hotplug exclusion.
         */
        target_attrs = wq_update_pod_attrs_buf;

        copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
        wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask);

        /* nothing to do if the target cpumask matches the current pwq */
        wq_calc_pod_cpumask(target_attrs, cpu, off_cpu);
        if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs))
                return;

        /* create a new pwq */
        pwq = alloc_unbound_pwq(wq, target_attrs);
        if (!pwq)
                pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n",
                        wq->name);

        mutex_lock(&wq->mutex);
        if (!pwq) {
                /* fall back to the default pwq, taking a reference for the slot */
                pwq = unbound_pwq(wq, -1);
                raw_spin_lock_irq(&pwq->pool->lock);
                get_pwq(pwq);
                raw_spin_unlock_irq(&pwq->pool->lock);
        }
        old_pwq = install_unbound_pwq(wq, cpu, pwq);
        mutex_unlock(&wq->mutex);

        put_pwq_unlocked(old_pwq);
}

/*
 * Allocate the pwqs of @wq and link them.
 *
 * For a workqueue which is not unbound, one pwq per possible CPU is
 * allocated, bound to that CPU's BH or regular worker pool (the highpri one
 * for %WQ_HIGHPRI) and linked. For an unbound workqueue the standard ordered
 * or unbound attrs are applied through apply_workqueue_attrs().
 *
 * Returns 0 on success and -errno on failure.
 */
static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
        /* used as the index into the two-entry pool and attrs arrays */
        bool highpri = wq->flags & WQ_HIGHPRI;
        int cpu, ret;

        wq->cpu_pwq = alloc_percpu(struct pool_workqueue *);
        if (!wq->cpu_pwq)
                goto enomem;

        if (!(wq->flags & WQ_UNBOUND)) {
                for_each_possible_cpu(cpu) {
                        struct pool_workqueue **pwq_p;
                        struct worker_pool __percpu *pools;
                        struct worker_pool *pool;

                        if (wq->flags & WQ_BH)
                                pools = bh_worker_pools;
                        else
                                pools = cpu_worker_pools;

                        pool = &(per_cpu_ptr(pools, cpu)[highpri]);
                        pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu);

                        *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL,
                                                       pool->node);
                        if (!*pwq_p)
                                goto enomem;

                        init_pwq(*pwq_p, wq, pool);

                        mutex_lock(&wq->mutex);
                        link_pwq(*pwq_p);
                        mutex_unlock(&wq->mutex);
                }
                return 0;
        }

        /* apply_workqueue_attrs() requires CPU hotplug read exclusion */
        cpus_read_lock();
        if (wq->flags & __WQ_ORDERED) {
                struct pool_workqueue *dfl_pwq;

                ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
                /* there should only be single pwq for ordering guarantee */
                dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
                WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||
                              wq->pwqs.prev != &dfl_pwq->pwqs_node),
                     "ordering guarantee broken for workqueue %s\n", wq->name);
        } else {
                ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
        }
        cpus_read_unlock();

        /*
         * For unbound pwqs, flushing pwq_release_worker on failure ensures
         * that pwq_release_workfn() completes before the caller kfree()s @wq.
         */
        if (ret)
                kthread_flush_worker(pwq_release_worker);

        return ret;

enomem:
        /* free the pwqs allocated so far, then the percpu pointer array */
        if (wq->cpu_pwq) {
                for_each_possible_cpu(cpu) {
                        struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);

                        if (pwq)
                                kmem_cache_free(pwq_cache, pwq);
                }
                free_percpu(wq->cpu_pwq);
                wq->cpu_pwq = NULL;
        }
        return -ENOMEM;
}

/*
 * Clamp @max_active to [1, WQ_MAX_ACTIVE] and warn, naming workqueue @name,
 * when the requested value was out of that range. @flags is not used.
 */
static int wq_clamp_max_active(int max_active, unsigned int flags,
                               const char *name)
{
        int clamped = clamp_val(max_active, 1, WQ_MAX_ACTIVE);

        /* the value changes only when the request was out of range */
        if (clamped != max_active)
                pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
                        max_active, name, 1, WQ_MAX_ACTIVE);

        return clamped;
}

/*
 * Workqueues which may be used during memory reclaim should have a rescuer
 * to guarantee forward progress.
 *
 * For a %WQ_MEM_RECLAIM workqueue, allocate a rescuer worker, create its
 * "kworker/R-<name>" kthread, bind it to wq_unbound_cpumask (unbound
 * workqueues) or cpu_possible_mask (all others) and wake it up. Other
 * workqueues get no rescuer.
 *
 * Returns 0 on success or when no rescuer is needed, -ENOMEM if the worker
 * can't be allocated, or the error from kthread_create().
 */
static int init_rescuer(struct workqueue_struct *wq)
{
        struct worker *rescuer;
        int ret;

        if (!(wq->flags & WQ_MEM_RECLAIM))
                return 0;

        rescuer = alloc_worker(NUMA_NO_NODE);
        if (!rescuer) {
                pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n",
                       wq->name);
                return -ENOMEM;
        }

        rescuer->rescue_wq = wq;
        rescuer->task = kthread_create(rescuer_thread, rescuer, "kworker/R-%s", wq->name);
        if (IS_ERR(rescuer->task)) {
                ret = PTR_ERR(rescuer->task);
                /* terminate the line so that a later pr_cont() can't extend it */
                pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe\n",
                       wq->name, ERR_PTR(ret));
                kfree(rescuer);
                return ret;
        }

        /* publish the rescuer and set its affinity before it first runs */
        wq->rescuer = rescuer;
        if (wq->flags & WQ_UNBOUND)
                kthread_bind_mask(rescuer->task, wq_unbound_cpumask);
        else
                kthread_bind_mask(rescuer->task, cpu_possible_mask);
        wake_up_process(rescuer->task);

        return 0;
}

/**
 * wq_adjust_max_active - update a wq's max_active to the current setting
 * @wq: target workqueue
 *
 * If @wq isn't freezing, set @wq->max_active and @wq->min_active to the
 * saved values and activate inactive work items accordingly. If @wq is
 * freezable and workqueues are freezing, clear both to zero.
 *
 * Must be called with @wq->mutex held.
 */
static void wq_adjust_max_active(struct workqueue_struct *wq)
{
        bool activated;
        int new_max, new_min;

        lockdep_assert_held(&wq->mutex);

        if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) {
                new_max = 0;
                new_min = 0;
        } else {
                new_max = wq->saved_max_active;
                new_min = wq->saved_min_active;
        }

        /* already at the target values, nothing to update */
        if (wq->max_active == new_max && wq->min_active == new_min)
                return;

        /*
         * Update @wq->max/min_active and then kick inactive work items if more
         * active work items are allowed. This doesn't break work item ordering
         * because new work items are always queued behind existing inactive
         * work items if there are any.
         */
        WRITE_ONCE(wq->max_active, new_max);
        WRITE_ONCE(wq->min_active, new_min);

        if (wq->flags & WQ_UNBOUND)
                wq_update_node_max_active(wq, -1);

        /* freezing: no work item may be activated */
        if (new_max == 0)
                return;

        /*
         * Round-robin through pwq's activating the first inactive work item
         * until max_active is filled.
         */
        do {
                struct pool_workqueue *pwq;

                /* stop after a full pass which activated nothing */
                activated = false;
                for_each_pwq(pwq, wq) {
                        unsigned long irq_flags;

                        /* can be called during early boot w/ irq disabled */
                        raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags);
                        if (pwq_activate_first_inactive(pwq, true)) {
                                activated = true;
                                kick_pool(pwq->pool);
                        }
                        raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags);
                }
        } while (activated);
}

/**
 * alloc_workqueue - allocate and set up a workqueue
 * @fmt: printf-style format string for the workqueue name
 * @flags: WQ_* flags
 * @max_active: requested max_active; 0 selects %WQ_DFL_ACTIVE. Must be 0 when
 *              %WQ_BH is set.
 * @...: arguments for @fmt
 *
 * The name is formatted into @wq->name and truncated (with a one-time
 * warning) if it does not fit. For non-BH workqueues @max_active is clamped
 * by wq_clamp_max_active(); BH workqueues get %INT_MAX. On success the new
 * workqueue is added to the global workqueues list.
 *
 * Return: pointer to the new workqueue, or %NULL on failure.
 */
__printf(1, 4)
struct workqueue_struct *alloc_workqueue(const char *fmt,
                                         unsigned int flags,
                                         int max_active, ...)
{
        va_list args;
        struct workqueue_struct *wq;
        size_t wq_size;
        int name_len;

        /* BH workqueues accept only a subset of flags and no max_active */
        if (flags & WQ_BH) {
                if (WARN_ON_ONCE(flags & ~__WQ_BH_ALLOWS))
                        return NULL;
                if (WARN_ON_ONCE(max_active))
                        return NULL;
        }

        /* see the comment above the definition of WQ_POWER_EFFICIENT */
        if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
                flags |= WQ_UNBOUND;

        /* allocate wq and format name */
        /* unbound wqs carry a trailing node_nr_active[] array */
        if (flags & WQ_UNBOUND)
                wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1);
        else
                wq_size = sizeof(*wq);

        wq = kzalloc(wq_size, GFP_KERNEL);
        if (!wq)
                return NULL;

        if (flags & WQ_UNBOUND) {
                wq->unbound_attrs = alloc_workqueue_attrs();
                if (!wq->unbound_attrs)
                        goto err_free_wq;
        }

        va_start(args, max_active);
        name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
        va_end(args);

        /* vsnprintf() reports the untruncated length */
        if (name_len >= WQ_NAME_LEN)
                pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
                             wq->name);

        if (flags & WQ_BH) {
                /*
                 * BH workqueues always share a single execution context per CPU
                 * and don't impose any max_active limit.
                 */
                max_active = INT_MAX;
        } else {
                max_active = max_active ?: WQ_DFL_ACTIVE;
                max_active = wq_clamp_max_active(max_active, flags, wq->name);
        }

        /* init wq */
        wq->flags = flags;
        wq->max_active = max_active;
        wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE);
        wq->saved_max_active = wq->max_active;
        wq->saved_min_active = wq->min_active;
        mutex_init(&wq->mutex);
        atomic_set(&wq->nr_pwqs_to_flush, 0);
        INIT_LIST_HEAD(&wq->pwqs);
        INIT_LIST_HEAD(&wq->flusher_queue);
        INIT_LIST_HEAD(&wq->flusher_overflow);
        INIT_LIST_HEAD(&wq->maydays);

        wq_init_lockdep(wq);
        INIT_LIST_HEAD(&wq->list);

        if (flags & WQ_UNBOUND) {
                if (alloc_node_nr_active(wq->node_nr_active) < 0)
                        goto err_unreg_lockdep;
        }

        if (alloc_and_link_pwqs(wq) < 0)
                goto err_free_node_nr_active;

        /*
         * From here on failures are unwound through destroy_workqueue()
         * instead of the manual error labels below.
         */
        if (wq_online && init_rescuer(wq) < 0)
                goto err_destroy;

        if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
                goto err_destroy;

        /*
         * wq_pool_mutex protects global freeze state and workqueues list.
         * Grab it, adjust max_active and add the new @wq to workqueues
         * list.
         */
        mutex_lock(&wq_pool_mutex);

        mutex_lock(&wq->mutex);
        wq_adjust_max_active(wq);
        mutex_unlock(&wq->mutex);

        list_add_tail_rcu(&wq->list, &workqueues);

        mutex_unlock(&wq_pool_mutex);

        return wq;

        /* error unwinding: labels fall through in reverse order of setup */
err_free_node_nr_active:
        if (wq->flags & WQ_UNBOUND)
                free_node_nr_active(wq->node_nr_active);
err_unreg_lockdep:
        wq_unregister_lockdep(wq);
        wq_free_lockdep(wq);
err_free_wq:
        free_workqueue_attrs(wq->unbound_attrs);
        kfree(wq);
        return NULL;
err_destroy:
        destroy_workqueue(wq);
        return NULL;
}
EXPORT_SYMBOL_GPL(alloc_workqueue);

/*
 * Test whether @pwq still has anything outstanding: in-flight work items of
 * any color, a reference count above one on a pwq that is not the wq's
 * dfl_pwq, or a non-empty state as reported by pwq_is_empty().
 */
static bool pwq_busy(struct pool_workqueue *pwq)
{
        int color = 0;

        while (color < WORK_NR_COLORS) {
                if (pwq->nr_in_flight[color])
                        return true;
                color++;
        }

        /* a pwq other than dfl_pwq with extra references counts as busy */
        if (pwq->refcnt > 1 && pwq != rcu_access_pointer(pwq->wq->dfl_pwq))
                return true;

        return !pwq_is_empty(pwq);
}

/**
 * destroy_workqueue - safely terminate a workqueue
 * @wq: target workqueue
 *
 * Safely destroy a workqueue. All work currently pending will be done first.
 *
 * The sequence is: unregister from sysfs, mark @wq as %__WQ_DESTROYING,
 * drain it, stop and free the rescuer if there is one, sanity check that no
 * pwq is still busy, unlink @wq from the workqueues list and finally drop
 * the base references on its pwqs. If a busy pwq is found, a warning and the
 * workqueue state are printed and the function returns early, leaving @wq
 * on the workqueues list with its pwq references intact.
 */
void destroy_workqueue(struct workqueue_struct *wq)
{
        struct pool_workqueue *pwq;
        int cpu;

        /*
         * Remove it from sysfs first so that sanity check failure doesn't
         * lead to sysfs name conflicts.
         */
        workqueue_sysfs_unregister(wq);

        /* mark the workqueue destruction is in progress */
        mutex_lock(&wq->mutex);
        wq->flags |= __WQ_DESTROYING;
        mutex_unlock(&wq->mutex);

        /* drain it before proceeding with destruction */
        drain_workqueue(wq);

        /* kill rescuer, if sanity checks fail, leave it w/o rescuer */
        if (wq->rescuer) {
                struct worker *rescuer = wq->rescuer;

                /* this prevents new queueing */
                raw_spin_lock_irq(&wq_mayday_lock);
                wq->rescuer = NULL;
                raw_spin_unlock_irq(&wq_mayday_lock);

                /* rescuer will empty maydays list before exiting */
                kthread_stop(rescuer->task);
                kfree(rescuer);
        }

        /*
         * Sanity checks - grab all the locks so that we wait for all
         * in-flight operations which may do put_pwq().
         */
        mutex_lock(&wq_pool_mutex);
        mutex_lock(&wq->mutex);
        for_each_pwq(pwq, wq) {
                raw_spin_lock_irq(&pwq->pool->lock);
                if (WARN_ON(pwq_busy(pwq))) {
                        pr_warn("%s: %s has the following busy pwq\n",
                                __func__, wq->name);
                        show_pwq(pwq);
                        /* drop every lock before dumping the whole wq */
                        raw_spin_unlock_irq(&pwq->pool->lock);
                        mutex_unlock(&wq->mutex);
                        mutex_unlock(&wq_pool_mutex);
                        show_one_workqueue(wq);
                        /* bail out without unlinking @wq or putting pwqs */
                        return;
                }
                raw_spin_unlock_irq(&pwq->pool->lock);
        }
        mutex_unlock(&wq->mutex);

        /*
         * wq list is used to freeze wq, remove from list after
         * flushing is complete in case freeze races us.
         */
        list_del_rcu(&wq->list);
        mutex_unlock(&wq_pool_mutex);

        /*
         * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq
         * to put the base refs. @wq will be auto-destroyed from the last
         * pwq_put. RCU read lock prevents @wq from going away from under us.
         */
        rcu_read_lock();

        for_each_possible_cpu(cpu) {
                put_pwq_unlocked(unbound_pwq(wq, cpu));
                RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL);
        }

        /* cpu == -1 addresses the remaining slot after the per-cpu ones */
        put_pwq_unlocked(unbound_pwq(wq, -1));
        RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL);

        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(destroy_workqueue);

/**
 * workqueue_set_max_active - adjust max_active of a workqueue
 * @wq: target workqueue
 * @max_active: new max_active value.
 *
 * Set max_active of @wq to @max_active after clamping it with
 * wq_clamp_max_active(). BH and ordered workqueues are rejected with a
 * warning. For unbound workqueues the saved min_active is lowered when it
 * would otherwise exceed the new max_active. See the alloc_workqueue()
 * function comment.
 *
 * CONTEXT:
 * Don't call from IRQ context.
 */
void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
{
        int clamped;

        /* max_active doesn't mean anything for BH workqueues */
        if (WARN_ON(wq->flags & WQ_BH))
                return;
        /* disallow meddling with max_active for ordered workqueues */
        if (WARN_ON(wq->flags & __WQ_ORDERED))
                return;

        clamped = wq_clamp_max_active(max_active, wq->flags, wq->name);

        mutex_lock(&wq->mutex);

        wq->saved_max_active = clamped;
        /* keep the saved min from exceeding the new max on unbound wqs */
        if (wq->flags & WQ_UNBOUND)
                wq->saved_min_active = min(wq->saved_min_active, clamped);

        wq_adjust_max_active(wq);

        mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(workqueue_set_max_active);

/**
 * workqueue_set_min_active - adjust min_active of an unbound workqueue
 * @wq: target unbound workqueue
 * @min_active: new min_active value
 *
 * Set min_active of an unbound workqueue. Unlike other types of workqueues, an
 * unbound workqueue is not guaranteed to be able to process max_active
 * interdependent work items. Instead, an unbound workqueue is guaranteed to be
 * able to process min_active number of interdependent work items which is
 * %WQ_DFL_MIN_ACTIVE by default.
 *
 * @min_active is clamped between 0 and the saved max_active of @wq. Calling
 * this on a BH, ordered or non-unbound workqueue triggers a warning and
 * changes nothing.
 */
void workqueue_set_min_active(struct workqueue_struct *wq, int min_active)
{
        unsigned int wq_kind = wq->flags & (WQ_BH | WQ_UNBOUND | __WQ_ORDERED);

        /* min_active is only meaningful for non-ordered unbound workqueues */
        if (WARN_ON(wq_kind != WQ_UNBOUND))
                return;

        mutex_lock(&wq->mutex);

        wq->saved_min_active = clamp(min_active, 0, wq->saved_max_active);
        wq_adjust_max_active(wq);

        mutex_unlock(&wq->mutex);
}

/**
 * current_work - retrieve %current task's work struct
 *
 * Tell whether %current is a workqueue worker and, if so, which work item it
 * is executing. Handy for finding out the context %current is running in.
 *
 * Return: work struct if %current task is a workqueue worker, %NULL otherwise.
 */
struct work_struct *current_work(void)
{
        struct worker *worker = current_wq_worker();

        if (!worker)
                return NULL;

        return worker->current_work;
}
EXPORT_SYMBOL(current_work);

/**
 * current_is_workqueue_rescuer - is %current workqueue rescuer?
 *
 * Check whether %current is a workqueue rescuer, i.e. a workqueue worker
 * with a rescue_wq set. Work functions can use this to find out whether
 * they are being run off the rescuer task.
 *
 * Return: %true if %current is a workqueue rescuer. %false otherwise.
 */
bool current_is_workqueue_rescuer(void)
{
        struct worker *worker = current_wq_worker();

        if (!worker)
                return false;

        return worker->rescue_wq != NULL;
}

/**
 * workqueue_congested - test whether a workqueue is congested
 * @cpu: CPU in question
 * @wq: target workqueue
 *
 * Test whether @wq's cpu workqueue for @cpu is congested, i.e. whether its
 * pool_workqueue has inactive work items.  There is no synchronization
 * around this function and the test result is unreliable and only useful
 * as advisory hints or for debugging.
 *
 * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
 *
 * With the exception of ordered workqueues, all workqueues have per-cpu
 * pool_workqueues, each with its own congested state. A workqueue being
 * congested on one CPU doesn't mean that the workqueue is congested on any
 * other CPUs.
 *
 * Return:
 * %true if congested, %false otherwise.
 */
bool workqueue_congested(int cpu, struct workqueue_struct *wq)
{
        struct pool_workqueue *pwq;
        bool congested;
        int target_cpu;

        rcu_read_lock();
        preempt_disable();

        /* the local CPU is looked up only after preemption is disabled */
        target_cpu = (cpu == WORK_CPU_UNBOUND) ? smp_processor_id() : cpu;

        pwq = *per_cpu_ptr(wq->cpu_pwq, target_cpu);
        congested = !list_empty(&pwq->inactive_works);

        preempt_enable();
        rcu_read_unlock();

        return congested;
}
EXPORT_SYMBOL_GPL(workqueue_congested);

/**
 * work_busy - test whether a work is currently pending or running
 * @work: the work to be tested
 *
 * Report whether @work is pending and/or being executed by a worker of the
 * pool it is associated with.  There is no synchronization around this
 * function and the test result is unreliable and only useful as advisory
 * hints or for debugging.
 *
 * Return:
 * OR'd bitmask of WORK_BUSY_* bits.
 */
unsigned int work_busy(struct work_struct *work)
{
        unsigned int busy = 0;
        unsigned long irq_flags;
        struct worker_pool *pool;

        if (work_pending(work))
                busy = WORK_BUSY_PENDING;

        rcu_read_lock();

        pool = get_work_pool(work);
        if (!pool)
                goto out_rcu;

        /* the running check needs the pool lock */
        raw_spin_lock_irqsave(&pool->lock, irq_flags);
        if (find_worker_executing_work(pool, work))
                busy |= WORK_BUSY_RUNNING;
        raw_spin_unlock_irqrestore(&pool->lock, irq_flags);

out_rcu:
        rcu_read_unlock();

        return busy;
}
EXPORT_SYMBOL_GPL(work_busy);

/**
 * set_worker_desc - set description for the current work item
 * @fmt: printf-style format string
 * @...: arguments for the format string
 *
 * A running work function can call this to describe what the work item is
 * about.  If the worker task gets dumped, this information will be printed
 * out together to help debugging.  The description can be at most
 * WORKER_DESC_LEN including the trailing '\0'.  Nothing happens when
 * %current is not a workqueue worker.
 */
void set_worker_desc(const char *fmt, ...)
{
        struct worker *worker = current_wq_worker();
        va_list args;

        if (!worker)
                return;

        va_start(args, fmt);
        vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
        va_end(args);
}
EXPORT_SYMBOL_GPL(set_worker_desc);

/**
 * print_worker_info - print out worker information and description
 * @log_lvl: the log level to use when printing
 * @task: target task
 *
 * If @task is a worker and currently executing a work item, print out the
 * name of the workqueue being serviced and worker description set with
 * set_worker_desc() by the currently executing work item.
 *
 * This function can be safely called on any task as long as the
 * task_struct itself is accessible.  While safe, this function isn't
 * synchronized and may print out mixups or garbages of limited length.
 */
void print_worker_info(const char *log_lvl, struct task_struct *task)
{
        struct workqueue_struct *wq = NULL;
        struct pool_workqueue *pwq = NULL;
        char desc[WORKER_DESC_LEN] = { };
        char name[WQ_NAME_LEN] = { };
        work_func_t *fn = NULL;
        struct worker *worker;

        if (!(task->flags & PF_WQ_WORKER))
                return;

        /*
         * No synchronization protects @task here and it could be in any
         * state, so every dereference below goes through a nofault copy.
         */
        worker = kthread_probe_data(task);

        /*
         * Fetch the work function, then follow worker -> pwq -> wq to get
         * the workqueue name, and finally the description. One byte less
         * than the buffer size is copied so that the zero-initialized last
         * byte keeps terminating the string even if the source is garbage.
         */
        copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn));
        copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq));
        copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq));
        copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1);
        copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1);

        /* stay silent when nothing at all could be retrieved */
        if (!fn && !name[0] && !desc[0])
                return;

        printk("%sWorkqueue: %s %ps", log_lvl, name, fn);
        /* the description is only worth printing when it differs from name */
        if (strcmp(name, desc))
                pr_cont(" (%s)", desc);
        pr_cont("\n");
}

/*
 * Append a summary of @pool to the current printk line: its cpumask, its
 * node unless that is NUMA_NO_NODE, its flags, and either a "bh"/"bh-hi"
 * marker for BH pools or the nice level for all other pools.
 */
static void pr_cont_pool_info(struct worker_pool *pool)
{
        pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
        if (pool->node != NUMA_NO_NODE)
                pr_cont(" node=%d", pool->node);
        pr_cont(" flags=0x%x", pool->flags);

        if (!(pool->flags & POOL_BH)) {
                pr_cont(" nice=%d", pool->attrs->nice);
                return;
        }

        pr_cont(" bh%s",
                pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
}

static void pr_cont_worker_id(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        if (pool->flags & WQ_BH)
                pr_cont("bh%s",
                        pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
        else
                pr_cont("%d%s", task_pid_nr(worker->task),
                        worker->rescue_wq ? "(RESCUER)" : "");
}

/*
 * Run-length state used by pr_cont_work_flush() so that consecutive work
 * items sharing the same function are printed as a single "N*func" entry.
 */
struct pr_cont_work_struct {
        /* comma flag recorded together with @func, used when it is printed */
        bool comma;
        /* work function of the run currently being counted */
        work_func_t func;
        /* length of the current run; 0 means nothing is recorded */
        long ctr;
};

/*
 * Feed @func into the run-length state @pcwsp. If @func continues the
 * recorded run, only the counter is bumped. Otherwise the recorded run, if
 * any, is printed ("func" or "N*func") and a new run is started with @func
 * and @comma. Passing (work_func_t)-1 as @func only flushes: the recorded
 * run is printed and nothing new is recorded.
 */
static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp)
{
        if (pcwsp->ctr) {
                /* same function again: extend the run and print nothing yet */
                if (pcwsp->func == func) {
                        pcwsp->ctr++;
                        return;
                }

                /* a different function ends the run, emit what was recorded */
                if (pcwsp->ctr == 1)
                        pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func);
                else
                        pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func);
                pcwsp->ctr = 0;
        }

        /* the -1 sentinel requests a flush without starting a new run */
        if ((long)func == -1L)
                return;

        pcwsp->comma = comma;
        pcwsp->func = func;
        pcwsp->ctr = 1;
}

static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp)
{
        if (work->func == wq_barrier_func) {
                struct wq_barrier *barr;

                barr = container_of(work, struct wq_barrier, work);

                pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
                pr_cont("%s BAR(%d)", comma ? "," : "",
                        task_pid_nr(barr->task));
        } else {
                if (!comma)
                        pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
                pr_cont_work_flush(comma, work->func, pcwsp);
        }
}

/*
 * Dump the state of @pwq: a header line with the pool info, active count,
 * refcnt and MAYDAY marker, followed by up to three optional lines listing
 * the in-flight, pending and inactive work items belonging to @pwq.
 *
 * The callers visible in this file (destroy_workqueue() and
 * show_one_workqueue()) invoke this with @pwq->pool->lock held.
 */
static void show_pwq(struct pool_workqueue *pwq)
{
        struct pr_cont_work_struct pcws = { .ctr = 0, };
        struct worker_pool *pool = pwq->pool;
        struct work_struct *work;
        struct worker *worker;
        bool has_in_flight = false, has_pending = false;
        int bkt;

        pr_info("  pwq %d:", pool->id);
        pr_cont_pool_info(pool);

        pr_cont(" active=%d refcnt=%d%s\n",
                pwq->nr_active, pwq->refcnt,
                !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");

        /* first pass: is any busy worker of the pool working for @pwq? */
        hash_for_each(pool->busy_hash, bkt, worker, hentry) {
                if (worker->current_pwq == pwq) {
                        has_in_flight = true;
                        break;
                }
        }
        if (has_in_flight) {
                bool comma = false;

                pr_info("    in-flight:");
                hash_for_each(pool->busy_hash, bkt, worker, hentry) {
                        if (worker->current_pwq != pwq)
                                continue;

                        pr_cont(" %s", comma ? "," : "");
                        pr_cont_worker_id(worker);
                        pr_cont(":%ps", worker->current_func);
                        /* work items already scheduled on this worker */
                        list_for_each_entry(work, &worker->scheduled, entry)
                                pr_cont_work(false, work, &pcws);
                        /* emit whatever run is still held in pcws */
                        pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
                        comma = true;
                }
                pr_cont("\n");
        }

        /* the pool worklist is shared, so only items of @pwq are considered */
        list_for_each_entry(work, &pool->worklist, entry) {
                if (get_work_pwq(work) == pwq) {
                        has_pending = true;
                        break;
                }
        }
        if (has_pending) {
                bool comma = false;

                pr_info("    pending:");
                list_for_each_entry(work, &pool->worklist, entry) {
                        if (get_work_pwq(work) != pwq)
                                continue;

                        pr_cont_work(comma, work, &pcws);
                        /* no comma after an item flagged WORK_STRUCT_LINKED */
                        comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
                }
                pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
                pr_cont("\n");
        }

        if (!list_empty(&pwq->inactive_works)) {
                bool comma = false;

                pr_info("    inactive:");
                list_for_each_entry(work, &pwq->inactive_works, entry) {
                        pr_cont_work(comma, work, &pcws);
                        comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
                }
                pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
                pr_cont("\n");
        }
}

/**
 * show_one_workqueue - dump state of specified workqueue
 * @wq: workqueue whose state will be printed
 *
 * Nothing is printed when all pwqs of @wq are empty. Otherwise a header line
 * is followed by the state of every non-empty pwq.
 */
void show_one_workqueue(struct workqueue_struct *wq)
{
        struct pool_workqueue *pwq;
        unsigned long irq_flags;
        bool busy = false;

        /* an idle workqueue has nothing worth printing */
        for_each_pwq(pwq, wq) {
                if (!pwq_is_empty(pwq)) {
                        busy = true;
                        break;
                }
        }
        if (!busy)
                return;

        pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);

        for_each_pwq(pwq, wq) {
                struct worker_pool *pool = pwq->pool;

                raw_spin_lock_irqsave(&pool->lock, irq_flags);
                if (!pwq_is_empty(pwq)) {
                        /*
                         * Printing is deferred because console drivers
                         * may queue work while holding locks that their
                         * write paths take as well, which could deadlock.
                         */
                        printk_deferred_enter();
                        show_pwq(pwq);
                        printk_deferred_exit();
                }
                raw_spin_unlock_irqrestore(&pool->lock, irq_flags);

                /*
                 * A lot may get printed from atomic context, e.g.
                 * sysrq-t -> show_all_workqueues(). Keep the hard
                 * lockup detector from firing.
                 */
                touch_nmi_watchdog();
        }
}

/**
 * show_one_worker_pool - dump state of specified worker pool
 * @pool: worker pool whose state will be printed
 *
 * Pools whose workers are all idle are skipped. For the others a single line
 * with the pool info, the hung time, the worker count, the manager and the
 * idle workers is printed.
 */
static void show_one_worker_pool(struct worker_pool *pool)
{
        unsigned long hung_secs = 0;
        unsigned long irq_flags;
        struct worker *worker;
        bool first = true;

        raw_spin_lock_irqsave(&pool->lock, irq_flags);

        if (pool->nr_workers != pool->nr_idle) {
                /* How long the first pending work is waiting for a worker. */
                if (!list_empty(&pool->worklist))
                        hung_secs = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000;

                /*
                 * Printing is deferred because console drivers may queue
                 * work while holding locks that their write paths take as
                 * well, which could deadlock.
                 */
                printk_deferred_enter();
                pr_info("pool %d:", pool->id);
                pr_cont_pool_info(pool);
                pr_cont(" hung=%lus workers=%d", hung_secs, pool->nr_workers);
                if (pool->manager)
                        pr_cont(" manager: %d",
                                task_pid_nr(pool->manager->task));
                list_for_each_entry(worker, &pool->idle_list, entry) {
                        /* only the first idle worker gets the "idle: " label */
                        pr_cont(" %s", first ? "idle: " : "");
                        pr_cont_worker_id(worker);
                        first = false;
                }
                pr_cont("\n");
                printk_deferred_exit();
        }

        raw_spin_unlock_irqrestore(&pool->lock, irq_flags);

        /*
         * A lot may get printed from atomic context, e.g.
         * sysrq-t -> show_all_workqueues(). Keep the hard lockup
         * detector from firing.
         */
        touch_nmi_watchdog();
}

/**
 * show_all_workqueues - dump workqueue state
 *
 * Called from a sysrq handler and prints out all busy workqueues and pools.
 */
void show_all_workqueues(void)
{
        struct workqueue_struct *wq;
        struct worker_pool *pool;
        int pi;

        rcu_read_lock();

        pr_info("Showing busy workqueues and worker pools:\n");

        list_for_each_entry_rcu(wq, &workqueues, list)
                show_one_workqueue(wq);

        for_each_pool(pool, pi)
                show_one_worker_pool(pool);

        rcu_read_unlock();
}

/**
 * show_freezable_workqueues - dump freezable workqueue state
 *
 * Called from try_to_freeze_tasks() and prints out all freezable workqueues
 * still busy. Workqueues without %WQ_FREEZABLE are skipped.
 */
void show_freezable_workqueues(void)
{
        struct workqueue_struct *wq;

        rcu_read_lock();

        pr_info("Showing freezable workqueues that are still busy:\n");

        list_for_each_entry_rcu(wq, &workqueues, list) {
                if (wq->flags & WQ_FREEZABLE)
                        show_one_workqueue(wq);
        }

        rcu_read_unlock();
}

/*
 * Used to show worker information through /proc/PID/{comm,stat,status}.
 *
 * Copy @task's comm into @buf (at most @size bytes). If @task is a workqueue
 * worker attached to a pool and has a non-empty description, the description
 * is appended, prefixed with '+' while a work item is being executed and
 * with '-' otherwise.
 */
void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
{
        struct worker_pool *pool;
        struct worker *worker;
        int off;

        /* always show the actual comm */
        off = strscpy(buf, task->comm, size);
        if (off < 0)
                return;

        /* stabilize PF_WQ_WORKER and worker pool association */
        mutex_lock(&wq_pool_attach_mutex);

        if (!(task->flags & PF_WQ_WORKER))
                goto out_unlock;

        worker = kthread_data(task);
        pool = worker->pool;
        if (!pool)
                goto out_unlock;

        raw_spin_lock_irq(&pool->lock);
        /*
         * ->desc holds information (wq name or set_worker_desc()) about
         * the latest execution.  '+' marks it as current, '-' as past.
         */
        if (worker->desc[0] != '\0') {
                if (worker->current_work)
                        scnprintf(buf + off, size - off, "+%s",
                                  worker->desc);
                else
                        scnprintf(buf + off, size - off, "-%s",
                                  worker->desc);
        }
        raw_spin_unlock_irq(&pool->lock);

out_unlock:
        mutex_unlock(&wq_pool_attach_mutex);
}

#ifdef CONFIG_SMP

/*
 * CPU hotplug.
 *
 * There are two challenges in supporting CPU hotplug.  Firstly, there
 * are a lot of assumptions on strong associations among work, pwq and
 * pool which make migrating pending and scheduled works very
 * difficult to implement without impacting hot paths.  Secondly,
 * worker pools serve a mix of short, long and very long running works,
 * making blocked draining impractical.
 *
 * This is solved by allowing a pool to be disassociated from its CPU,
 * to keep running as an unbound one, and to be reattached later if the
 * CPU comes back online.
 */

/*
 * Disassociate the worker pools of @cpu from the CPU: for each of them, mark
 * all workers WORKER_UNBOUND and the pool POOL_DISASSOCIATED, reset
 * nr_running, kick the pool and finally call unbind_worker() on every worker.
 * Each pool is processed with wq_pool_attach_mutex held.
 */
static void unbind_workers(int cpu)
{
        struct worker_pool *pool;
        struct worker *worker;

        for_each_cpu_worker_pool(pool, cpu) {
                mutex_lock(&wq_pool_attach_mutex);
                raw_spin_lock_irq(&pool->lock);

                /*
                 * We've blocked all attach/detach operations. Make all workers
                 * unbound and set DISASSOCIATED.  Before this, all workers
                 * must be on the cpu.  After this, they may become diasporas.
                 * And the preemption disabled section in their sched callbacks
                 * are guaranteed to see WORKER_UNBOUND since the code here
                 * is on the same cpu.
                 */
                for_each_pool_worker(worker, pool)
                        worker->flags |= WORKER_UNBOUND;

                pool->flags |= POOL_DISASSOCIATED;

                /*
                 * The handling of nr_running in sched callbacks are disabled
                 * now.  Zap nr_running.  After this, nr_running stays zero and
                 * need_more_worker() and keep_working() are always true as
                 * long as the worklist is not empty.  This pool now behaves as
                 * an unbound (in terms of concurrency management) pool which
                 * are served by workers tied to the pool.
                 */
                pool->nr_running = 0;

                /*
                 * With concurrency management just turned off, a busy
                 * worker blocking could lead to lengthy stalls.  Kick off
                 * unbound chain execution of currently pending work items.
                 */
                kick_pool(pool);

                raw_spin_unlock_irq(&pool->lock);

                /* done after dropping pool->lock, still under the mutex */
                for_each_pool_worker(worker, pool)
                        unbind_worker(worker);

                mutex_unlock(&wq_pool_attach_mutex);
        }
}

/**
 * rebind_workers - rebind all workers of a pool to the associated CPU
 * @pool: pool of interest
 *
 * @pool->cpu is coming online.  Rebind all workers to the CPU. Must be
 * called with wq_pool_attach_mutex held.
 */
static void rebind_workers(struct worker_pool *pool)
{
        struct worker *worker;

        lockdep_assert_held(&wq_pool_attach_mutex);

        /*
         * CPU affinity comes first.  All idle workers should sit on the
         * run-queue of the associated CPU before any local wake-ups for
         * concurrency management happen, so affinity is restored for every
         * worker before UNBOUND gets cleared.  As we're called from
         * CPU_ONLINE, the following shouldn't fail.
         */
        for_each_pool_worker(worker, pool) {
                struct task_struct *task = worker->task;

                kthread_set_per_cpu(task, pool->cpu);
                WARN_ON_ONCE(set_cpus_allowed_ptr(task,
                                                  pool_allowed_cpus(pool)) < 0);
        }

        raw_spin_lock_irq(&pool->lock);

        pool->flags &= ~POOL_DISASSOCIATED;

        for_each_pool_worker(worker, pool) {
                unsigned int old_flags = worker->flags;

                /*
                 * UNBOUND has to go, but neither worker_clr_flags() nor
                 * an nr_running adjustment can be used from here.
                 * Instead UNBOUND is swapped for REBOUND, another
                 * NOT_RUNNING flag, in a single store.  @worker clears
                 * REBOUND itself through worker_clr_flags() when it starts
                 * its next execution cycle, which restores concurrency
                 * management.  When or whether that happens doesn't
                 * affect correctness.
                 *
                 * The store uses WRITE_ONCE() because @worker->flags may
                 * be tested locklessly in wq_worker_running().  Without
                 * it, the NOT_RUNNING test may fail incorrectly and lead
                 * to premature concurrency management operations.
                 */
                WARN_ON_ONCE(!(old_flags & WORKER_UNBOUND));
                WRITE_ONCE(worker->flags,
                           (old_flags | WORKER_REBOUND) & ~WORKER_UNBOUND);
        }

        raw_spin_unlock_irq(&pool->lock);
}

/**
 * restore_unbound_workers_cpumask - restore cpumask of unbound workers
 * @pool: unbound pool of interest
 * @cpu: the CPU which is coming up
 *
 * An unbound pool may end up with a cpumask which doesn't have any online
 * CPUs.  When a worker of such pool get scheduled, the scheduler resets
 * its cpus_allowed.  If @cpu is in @pool's cpumask which didn't have any
 * online CPU before, cpus_allowed of all its workers should be restored.
 */
static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
{
        static cpumask_t cpumask;
        struct worker *worker;

        lockdep_assert_held(&wq_pool_attach_mutex);

        /* is @cpu allowed for @pool? */
        if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
                return;

        cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);

        /* as we're called from CPU_ONLINE, the following shouldn't fail */
        for_each_pool_worker(worker, pool)
                WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
}

/*
 * Make sure each per-cpu worker pool of @cpu has at least one worker.
 *
 * Return: 0 on success, -ENOMEM if create_worker() fails for a pool.
 */
int workqueue_prepare_cpu(unsigned int cpu)
{
        struct worker_pool *pool;

        for_each_cpu_worker_pool(pool, cpu) {
                /* pools which already have workers are left alone */
                if (!pool->nr_workers && !create_worker(pool))
                        return -ENOMEM;
        }

        return 0;
}

/*
 * Handle @cpu coming online: rebind the workers of @cpu's own pools,
 * restore the cpumask of unbound pools' workers and refresh the pod
 * affinity of unbound workqueues.  Always returns 0.
 */
int workqueue_online_cpu(unsigned int cpu)
{
        struct workqueue_struct *wq;
        struct worker_pool *pool;
        int pi;

        mutex_lock(&wq_pool_mutex);

        for_each_pool(pool, pi) {
                /* BH pools aren't affected by hotplug */
                if (pool->flags & POOL_BH)
                        continue;

                mutex_lock(&wq_pool_attach_mutex);
                if (pool->cpu == cpu) {
                        /* per-cpu pool of the CPU which is coming up */
                        rebind_workers(pool);
                } else if (pool->cpu < 0) {
                        /* unbound pool */
                        restore_unbound_workers_cpumask(pool, cpu);
                }
                mutex_unlock(&wq_pool_attach_mutex);
        }

        /* update pod affinity of unbound workqueues */
        list_for_each_entry(wq, &workqueues, list) {
                const struct wq_pod_type *pt;
                int tcpu;

                /* workqueues without unbound_attrs are skipped */
                if (!wq->unbound_attrs)
                        continue;

                pt = wqattrs_pod_type(wq->unbound_attrs);

                /* visit every CPU sharing a pod with @cpu */
                for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
                        wq_update_pod(wq, tcpu, cpu, true);

                mutex_lock(&wq->mutex);
                wq_update_node_max_active(wq, -1);
                mutex_unlock(&wq->mutex);
        }

        mutex_unlock(&wq_pool_mutex);
        return 0;
}

/*
 * Handle @cpu going offline: unbind its per-cpu workers and refresh the
 * pod affinity of unbound workqueues.
 *
 * Return: 0 on success, -1 if not invoked on @cpu itself.
 */
int workqueue_offline_cpu(unsigned int cpu)
{
        struct workqueue_struct *wq;

        /* unbinding per-cpu workers should happen on the local CPU */
        if (WARN_ON(cpu != smp_processor_id()))
                return -1;

        unbind_workers(cpu);

        mutex_lock(&wq_pool_mutex);

        /* update pod affinity of unbound workqueues */
        list_for_each_entry(wq, &workqueues, list) {
                const struct wq_pod_type *pt;
                int tcpu;

                /* workqueues without unbound_attrs are skipped */
                if (!wq->unbound_attrs)
                        continue;

                pt = wqattrs_pod_type(wq->unbound_attrs);

                /* visit every CPU sharing a pod with @cpu */
                for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
                        wq_update_pod(wq, tcpu, cpu, false);

                mutex_lock(&wq->mutex);
                wq_update_node_max_active(wq, cpu);
                mutex_unlock(&wq->mutex);
        }

        mutex_unlock(&wq_pool_mutex);

        return 0;
}

/* Request descriptor used by work_on_cpu_key() to run a function on a CPU. */
struct work_for_cpu {
        struct work_struct work;        /* work item queued on the target CPU */
        long (*fn)(void *);             /* function called by work_for_cpu_fn() */
        void *arg;                      /* argument passed to @fn */
        long ret;                       /* @fn's return value, set by work_for_cpu_fn() */
};

/* Work function: call the requested function and record its return value. */
static void work_for_cpu_fn(struct work_struct *work)
{
        struct work_for_cpu *wfc;

        wfc = container_of(work, struct work_for_cpu, work);
        wfc->ret = wfc->fn(wfc->arg);
}

/**
 * work_on_cpu_key - run a function in thread context on a particular cpu
 * @cpu: the cpu to run on
 * @fn: the function to run
 * @arg: the function arg
 * @key: The lock class key for lock debugging purposes
 *
 * Queues an on-stack work item on @cpu and waits for it to finish.
 * It is up to the caller to ensure that the cpu doesn't go offline.
 * The caller must not hold any locks which would prevent @fn from completing.
 *
 * Return: The value @fn returns.
 */
long work_on_cpu_key(int cpu, long (*fn)(void *),
                     void *arg, struct lock_class_key *key)
{
        struct work_for_cpu wfc = {
                .fn = fn,
                .arg = arg,
        };

        INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);

        /* queue on @cpu and wait until work_for_cpu_fn() has run */
        schedule_work_on(cpu, &wfc.work);
        flush_work(&wfc.work);

        destroy_work_on_stack(&wfc.work);

        return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu_key);

/**
 * work_on_cpu_safe_key - run a function in thread context on a particular cpu
 * @cpu: the cpu to run on
 * @fn:  the function to run
 * @arg: the function argument
 * @key: The lock class key for lock debugging purposes
 *
 * Holds the CPU hotplug read lock and calls work_on_cpu_key() if @cpu is
 * online. The caller must not hold any locks which would prevent @fn from
 * completing.
 *
 * Return: The value @fn returns, or -ENODEV if @cpu is not online.
 */
long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
                          void *arg, struct lock_class_key *key)
{
        long ret;

        cpus_read_lock();
        ret = cpu_online(cpu) ? work_on_cpu_key(cpu, fn, arg, key) : -ENODEV;
        cpus_read_unlock();

        return ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
#endif /* CONFIG_SMP */

#ifdef CONFIG_FREEZER

/**
 * freeze_workqueues_begin - begin freezing workqueues
 *
 * Mark workqueues as freezing and re-adjust max_active of every workqueue.
 * After this function returns, all freezable workqueues will queue new
 * works to their inactive_works list instead of pool->worklist.
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
 */
void freeze_workqueues_begin(void)
{
        struct workqueue_struct *wq;

        mutex_lock(&wq_pool_mutex);

        /* freezing must not already be in progress */
        WARN_ON_ONCE(workqueue_freezing);
        workqueue_freezing = true;

        /* re-adjust each workqueue now that the freezing flag is set */
        list_for_each_entry(wq, &workqueues, list) {
                mutex_lock(&wq->mutex);
                wq_adjust_max_active(wq);
                mutex_unlock(&wq->mutex);
        }

        mutex_unlock(&wq_pool_mutex);
}

/**
 * freeze_workqueues_busy - are freezable workqueues still busy?
 *
 * Check whether freezing is complete.  This function must be called
 * between freeze_workqueues_begin() and thaw_workqueues().
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex.
 *
 * Return:
 * %true if some freezable workqueues are still busy.  %false if freezing
 * is complete.
 */
bool freeze_workqueues_busy(void)
{
        struct workqueue_struct *wq;
        struct pool_workqueue *pwq;
        bool busy = false;

        mutex_lock(&wq_pool_mutex);

        WARN_ON_ONCE(!workqueue_freezing);

        list_for_each_entry(wq, &workqueues, list) {
                if (!(wq->flags & WQ_FREEZABLE))
                        continue;

                /*
                 * nr_active is monotonically decreasing.  It's safe
                 * to peek without lock.
                 */
                rcu_read_lock();
                for_each_pwq(pwq, wq) {
                        WARN_ON_ONCE(pwq->nr_active < 0);
                        if (pwq->nr_active) {
                                busy = true;
                                break;
                        }
                }
                rcu_read_unlock();

                /* one busy pwq is enough to answer the question */
                if (busy)
                        break;
        }

        mutex_unlock(&wq_pool_mutex);
        return busy;
}

/**
 * thaw_workqueues - thaw workqueues
 *
 * Thaw workqueues.  Normal queueing is restored and all collected
 * frozen works are transferred to their respective pool worklists.
 * Does nothing if workqueues are not currently freezing.
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
 */
void thaw_workqueues(void)
{
        struct workqueue_struct *wq;

        mutex_lock(&wq_pool_mutex);

        if (workqueue_freezing) {
                workqueue_freezing = false;

                /* restore max_active and repopulate worklist */
                list_for_each_entry(wq, &workqueues, list) {
                        mutex_lock(&wq->mutex);
                        wq_adjust_max_active(wq);
                        mutex_unlock(&wq->mutex);
                }
        }

        mutex_unlock(&wq_pool_mutex);
}
#endif /* CONFIG_FREEZER */

static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
{
        LIST_HEAD(ctxs);
        int ret = 0;
        struct workqueue_struct *wq;
        struct apply_wqattrs_ctx *ctx, *n;

        lockdep_assert_held(&wq_pool_mutex);

        list_for_each_entry(wq, &workqueues, list) {
                if (!(wq->flags & WQ_UNBOUND) || (wq->flags & __WQ_DESTROYING))
                        continue;

                ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
                if (IS_ERR(ctx)) {
                        ret = PTR_ERR(ctx);
                        break;
                }

                list_add_tail(&ctx->list, &ctxs);
        }

        list_for_each_entry_safe(ctx, n, &ctxs, list) {
                if (!ret)
                        apply_wqattrs_commit(ctx);
                apply_wqattrs_cleanup(ctx);
        }

        if (!ret) {
                mutex_lock(&wq_pool_attach_mutex);
                cpumask_copy(wq_unbound_cpumask, unbound_cpumask);
                mutex_unlock(&wq_pool_attach_mutex);
        }
        return ret;
}

/**
 * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
 * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
 *
 * This function can be called from cpuset code to provide a set of isolated
 * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold
 * either cpus_read_lock or cpus_write_lock.
 *
 * Return: 0 on success, -ENOMEM if the temporary cpumask cannot be
 * allocated, or the error from workqueue_apply_unbound_cpumask().
 */
int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
{
        cpumask_var_t cpumask;
        int ret;

        if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
                return -ENOMEM;

        lockdep_assert_cpus_held();
        mutex_lock(&wq_pool_mutex);

        /* Save the current isolated cpumask & export it via sysfs */
        cpumask_copy(wq_isolated_cpumask, exclude_cpumask);

        /*
         * If excluding the CPUs leaves nothing, fall back to
         * wq_requested_unbound_cpumask which is initially set to the
         * (HK_TYPE_WQ & HK_TYPE_DOMAIN) house keeping mask and rewritten
         * by any subsequent write to workqueue/cpumask sysfs file.
         */
        if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask))
                cpumask_copy(cpumask, wq_requested_unbound_cpumask);

        /* nothing to apply when the effective mask is unchanged */
        if (cpumask_equal(cpumask, wq_unbound_cpumask))
                ret = 0;
        else
                ret = workqueue_apply_unbound_cpumask(cpumask);

        mutex_unlock(&wq_pool_mutex);
        free_cpumask_var(cpumask);
        return ret;
}

/*
 * Translate an affinity scope name into its index in wq_affn_names[].
 * The comparison is case-insensitive and only covers the length of each
 * table entry, so trailing characters in @val are ignored.
 *
 * Return: the matching index, or -EINVAL if no entry matches.
 */
static int parse_affn_scope(const char *val)
{
        int idx = 0;

        while (idx < ARRAY_SIZE(wq_affn_names)) {
                const char *name = wq_affn_names[idx];

                if (strncasecmp(val, name, strlen(name)) == 0)
                        return idx;
                idx++;
        }

        return -EINVAL;
}

static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)
{
        struct workqueue_struct *wq;
        int affn, cpu;

        affn = parse_affn_scope(val);
        if (affn < 0)
                return affn;
        if (affn == WQ_AFFN_DFL)
                return -EINVAL;

        cpus_read_lock();
        mutex_lock(&wq_pool_mutex);

        wq_affn_dfl = affn;

        list_for_each_entry(wq, &workqueues, list) {
                for_each_online_cpu(cpu) {
                        wq_update_pod(wq, cpu, cpu, true);
                }
        }

        mutex_unlock(&wq_pool_mutex);
        cpus_read_unlock();

        return 0;
}

/* Getter for the default affinity scope parameter: prints its name. */
static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp)
{
        const char *name = wq_affn_names[wq_affn_dfl];

        return scnprintf(buffer, PAGE_SIZE, "%s\n", name);
}

/*
 * Parameter ops for "default_affinity_scope": writes are handled by
 * wq_affn_dfl_set() and reads by wq_affn_dfl_get().
 */
static const struct kernel_param_ops wq_affn_dfl_ops = {
        .set        = wq_affn_dfl_set,
        .get        = wq_affn_dfl_get,
};

/* No backing variable is passed (NULL); the ops use wq_affn_dfl directly. */
module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644);

#ifdef CONFIG_SYSFS
/*
 * Workqueues with the WQ_SYSFS flag set are visible to userland via
 * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
 * following attributes.
 *
 *  per_cpu             RO bool : whether the workqueue is per-cpu or unbound
 *  max_active          RW int  : maximum number of in-flight work items
 *
 * Unbound workqueues have the following extra attributes.
 *
 *  nice                RW int  : nice value of the workers
 *  cpumask             RW mask : bitmask of allowed CPUs for the workers
 *  affinity_scope      RW str  : worker CPU affinity scope (cache, numa, none)
 *  affinity_strict     RW bool : worker CPU affinity is strict
 */
struct wq_device {
        struct workqueue_struct                *wq;     /* workqueue this device represents */
        struct device                        dev;       /* embedded device; see dev_to_wq() */
};

/* Map a device embedded in a struct wq_device back to its workqueue. */
static struct workqueue_struct *dev_to_wq(struct device *dev)
{
        return container_of(dev, struct wq_device, dev)->wq;
}

static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);

        return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
}
static DEVICE_ATTR_RO(per_cpu);

static ssize_t max_active_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);

        return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
}

/*
 * sysfs "max_active" write side: accepts a positive decimal integer and
 * passes it to workqueue_set_max_active().  Returns @count on success and
 * -EINVAL on malformed or non-positive input.
 */
static ssize_t max_active_store(struct device *dev,
                                struct device_attribute *attr, const char *buf,
                                size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int val;

        if (sscanf(buf, "%d", &val) != 1)
                return -EINVAL;
        if (val <= 0)
                return -EINVAL;

        workqueue_set_max_active(wq, val);

        return count;
}
static DEVICE_ATTR_RW(max_active);

/*
 * Attributes installed through wq_subsys.dev_groups (wq_sysfs_groups, which
 * ATTRIBUTE_GROUPS() generates from this NULL-terminated array).
 */
static struct attribute *wq_sysfs_attrs[] = {
        &dev_attr_per_cpu.attr,
        &dev_attr_max_active.attr,
        NULL,
};
ATTRIBUTE_GROUPS(wq_sysfs);

/*
 * Take the locks needed for applying workqueue attributes: the CPU hotplug
 * read lock first, then wq_pool_mutex.  Undone by apply_wqattrs_unlock().
 */
static void apply_wqattrs_lock(void)
{
        /* CPUs should stay stable across pwq creations and installations */
        cpus_read_lock();
        mutex_lock(&wq_pool_mutex);
}

/*
 * Release the locks taken by apply_wqattrs_lock(), in the reverse order of
 * their acquisition.
 */
static void apply_wqattrs_unlock(void)
{
        mutex_unlock(&wq_pool_mutex);
        cpus_read_unlock();
}

/* sysfs "nice" read side: prints unbound_attrs->nice under wq->mutex. */
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int len;

        mutex_lock(&wq->mutex);
        len = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
        mutex_unlock(&wq->mutex);

        return len;
}

/*
 * Prepare workqueue_attrs for sysfs store operations: allocate a new attrs
 * and initialize it as a copy of @wq's unbound_attrs.  Returns NULL if the
 * allocation fails.  Must be called with wq_pool_mutex held.
 */
static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
{
        struct workqueue_attrs *attrs;

        lockdep_assert_held(&wq_pool_mutex);

        attrs = alloc_workqueue_attrs();
        if (attrs)
                copy_workqueue_attrs(attrs, wq->unbound_attrs);

        return attrs;
}

/*
 * sysfs "nice" write side: parse a nice value within [MIN_NICE, MAX_NICE]
 * and apply it to a copy of the workqueue's attrs.
 *
 * Returns @count on success, -ENOMEM if the attrs copy can't be allocated,
 * -EINVAL on malformed or out-of-range input, or the error from
 * apply_workqueue_attrs_locked().
 */
static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
                             const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *attrs;
        int ret = -ENOMEM;

        apply_wqattrs_lock();

        attrs = wq_sysfs_prep_attrs(wq);
        if (attrs) {
                if (sscanf(buf, "%d", &attrs->nice) != 1 ||
                    attrs->nice < MIN_NICE || attrs->nice > MAX_NICE)
                        ret = -EINVAL;
                else
                        ret = apply_workqueue_attrs_locked(wq, attrs);
        }

        apply_wqattrs_unlock();
        /* attrs may be NULL here when the allocation failed */
        free_workqueue_attrs(attrs);
        return ret ?: count;
}

/* sysfs "cpumask" read side: prints unbound_attrs->cpumask under wq->mutex. */
static ssize_t wq_cpumask_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int len;

        mutex_lock(&wq->mutex);
        len = scnprintf(buf, PAGE_SIZE, "%*pb\n",
                        cpumask_pr_args(wq->unbound_attrs->cpumask));
        mutex_unlock(&wq->mutex);

        return len;
}

/*
 * sysfs "cpumask" write side: parse @buf into a copy of the workqueue's
 * attrs and apply it.
 *
 * Returns @count on success, -ENOMEM if the attrs copy can't be allocated,
 * or the error from cpumask_parse() / apply_workqueue_attrs_locked().
 */
static ssize_t wq_cpumask_store(struct device *dev,
                                struct device_attribute *attr,
                                const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *attrs;
        int ret = -ENOMEM;

        apply_wqattrs_lock();

        attrs = wq_sysfs_prep_attrs(wq);
        if (attrs) {
                ret = cpumask_parse(buf, attrs->cpumask);
                if (!ret)
                        ret = apply_workqueue_attrs_locked(wq, attrs);
        }

        apply_wqattrs_unlock();
        /* attrs may be NULL here when the allocation failed */
        free_workqueue_attrs(attrs);
        return ret ?: count;
}

/*
 * sysfs "affinity_scope" read side.  Prints the scope name; when the
 * workqueue uses WQ_AFFN_DFL, the name of the current default scope is
 * appended in parentheses.
 */
static ssize_t wq_affn_scope_show(struct device *dev,
                                  struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int scope, len;

        mutex_lock(&wq->mutex);
        scope = wq->unbound_attrs->affn_scope;
        if (scope != WQ_AFFN_DFL)
                len = scnprintf(buf, PAGE_SIZE, "%s\n", wq_affn_names[scope]);
        else
                len = scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
                                wq_affn_names[WQ_AFFN_DFL],
                                wq_affn_names[wq_affn_dfl]);
        mutex_unlock(&wq->mutex);

        return len;
}

/*
 * sysfs "affinity_scope" write side: parse the scope name and apply it to a
 * copy of the workqueue's attrs.
 *
 * Returns @count on success, the parse_affn_scope() error for an unknown
 * name, -ENOMEM if the attrs copy can't be allocated, or the error from
 * apply_workqueue_attrs_locked().
 */
static ssize_t wq_affn_scope_store(struct device *dev,
                                   struct device_attribute *attr,
                                   const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *attrs;
        int affn, ret = -ENOMEM;

        affn = parse_affn_scope(buf);
        if (affn < 0)
                return affn;

        apply_wqattrs_lock();

        attrs = wq_sysfs_prep_attrs(wq);
        if (!attrs)
                goto out_unlock;

        attrs->affn_scope = affn;
        ret = apply_workqueue_attrs_locked(wq, attrs);

out_unlock:
        apply_wqattrs_unlock();
        /* attrs may be NULL here when the allocation failed */
        free_workqueue_attrs(attrs);
        return ret ?: count;
}

/*
 * sysfs "affinity_strict" read side: prints unbound_attrs->affn_strict as
 * 0 or 1.
 *
 * The value is read under wq->mutex, matching wq_nice_show(),
 * wq_cpumask_show() and wq_affn_scope_show(), so that the read can't race
 * with an attribute update.
 */
static ssize_t wq_affinity_strict_show(struct device *dev,
                                       struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int written;

        mutex_lock(&wq->mutex);
        written = scnprintf(buf, PAGE_SIZE, "%d\n",
                            wq->unbound_attrs->affn_strict);
        mutex_unlock(&wq->mutex);

        return written;
}

/*
 * sysfs "affinity_strict" write side: parse an integer, treat any non-zero
 * value as true and apply it to a copy of the workqueue's attrs.
 *
 * Returns @count on success, -EINVAL on malformed input, -ENOMEM if the
 * attrs copy can't be allocated, or the error from
 * apply_workqueue_attrs_locked().
 */
static ssize_t wq_affinity_strict_store(struct device *dev,
                                        struct device_attribute *attr,
                                        const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *attrs;
        int v, ret = -ENOMEM;

        if (sscanf(buf, "%d", &v) != 1)
                return -EINVAL;

        apply_wqattrs_lock();

        attrs = wq_sysfs_prep_attrs(wq);
        if (!attrs)
                goto out_unlock;

        attrs->affn_strict = v != 0;
        ret = apply_workqueue_attrs_locked(wq, attrs);

out_unlock:
        apply_wqattrs_unlock();
        /* attrs may be NULL here when the allocation failed */
        free_workqueue_attrs(attrs);
        return ret ?: count;
}

/*
 * Extra attributes of unbound workqueues.  workqueue_sysfs_register() walks
 * this array and creates one file per entry; the __ATTR_NULL entry ends it.
 */
static struct device_attribute wq_sysfs_unbound_attrs[] = {
        __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
        __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
        __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store),
        __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store),
        __ATTR_NULL,
};

/*
 * The "workqueue" bus.  Devices registered on it get the attributes in
 * wq_sysfs_groups (per_cpu and max_active).
 */
static const struct bus_type wq_subsys = {
        .name                                = "workqueue",
        .dev_groups                        = wq_sysfs_groups,
};

/**
 *  workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
 *  @cpumask: the cpumask to set
 *
 *  The low-level workqueues cpumask is a global cpumask that limits
 *  the affinity of all unbound workqueues.  This function checks the
 *  @cpumask, applies it to all unbound workqueues and updates all of
 *  their pwqs.  Note that @cpumask itself is trimmed to the possible CPUs.
 *
 *  Return:        0        - Success
 *                -EINVAL        - Invalid @cpumask
 *                -ENOMEM        - Failed to allocate memory for attrs or pwqs.
 */
static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
{
        int ret;

        /*
         * Not excluding isolated cpus on purpose.
         * If the user wishes to include them, we allow that.
         */
        cpumask_and(cpumask, cpumask, cpu_possible_mask);
        if (cpumask_empty(cpumask))
                return -EINVAL;

        apply_wqattrs_lock();

        /* remember the request even when it doesn't change the effective mask */
        cpumask_copy(wq_requested_unbound_cpumask, cpumask);

        if (cpumask_equal(cpumask, wq_unbound_cpumask))
                ret = 0;
        else
                ret = workqueue_apply_unbound_cpumask(cpumask);

        apply_wqattrs_unlock();

        return ret;
}

/*
 * Common helper of the subsystem-wide cpumask attributes: prints @mask as
 * a bitmap while holding wq_pool_mutex.
 */
static ssize_t __wq_cpumask_show(struct device *dev,
                struct device_attribute *attr, char *buf, cpumask_var_t mask)
{
        int len;

        mutex_lock(&wq_pool_mutex);
        len = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
        mutex_unlock(&wq_pool_mutex);

        return len;
}

/* sysfs "cpumask_requested": prints wq_requested_unbound_cpumask. */
static ssize_t cpumask_requested_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask);
}
static DEVICE_ATTR_RO(cpumask_requested);

/* sysfs "cpumask_isolated": prints wq_isolated_cpumask. */
static ssize_t cpumask_isolated_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask);
}
static DEVICE_ATTR_RO(cpumask_isolated);

/* sysfs "cpumask" read side (subsystem-wide): prints wq_unbound_cpumask. */
static ssize_t cpumask_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask);
}

/*
 * sysfs "cpumask" write side (subsystem-wide): parse @buf into a temporary
 * cpumask and hand it to workqueue_set_unbound_cpumask().
 *
 * Returns @count on success, -ENOMEM if the temporary cpumask can't be
 * allocated, or the error from parsing / applying the mask.
 */
static ssize_t cpumask_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        cpumask_var_t cpumask;
        int ret;

        if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
                return -ENOMEM;

        ret = cpumask_parse(buf, cpumask);
        if (ret == 0)
                ret = workqueue_set_unbound_cpumask(cpumask);

        free_cpumask_var(cpumask);

        return ret ?: count;
}
static DEVICE_ATTR_RW(cpumask);

/*
 * Subsystem-wide cpumask attributes.  ATTRIBUTE_GROUPS() generates
 * wq_sysfs_cpumask_groups from this NULL-terminated array; wq_sysfs_init()
 * passes those groups to subsys_virtual_register().
 */
static struct attribute *wq_sysfs_cpumask_attrs[] = {
        &dev_attr_cpumask.attr,
        &dev_attr_cpumask_requested.attr,
        &dev_attr_cpumask_isolated.attr,
        NULL,
};
ATTRIBUTE_GROUPS(wq_sysfs_cpumask);

/*
 * Register the workqueue subsystem together with its subsystem-wide cpumask
 * attribute groups.  Runs as a core_initcall and returns the result of
 * subsys_virtual_register().
 */
static int __init wq_sysfs_init(void)
{
        return subsys_virtual_register(&wq_subsys, wq_sysfs_cpumask_groups);
}
core_initcall(wq_sysfs_init);

/* Device release callback: frees the wq_device that embeds @dev. */
static void wq_device_release(struct device *dev)
{
        kfree(container_of(dev, struct wq_device, dev));
}

/**
 * workqueue_sysfs_register - make a workqueue visible in sysfs
 * @wq: the workqueue to register
 *
 * Expose @wq in sysfs under /sys/bus/workqueue/devices.
 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
 * which is the preferred method.
 *
 * Workqueue user should use this function directly iff it wants to apply
 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
 * apply_workqueue_attrs() may race against userland updating the
 * attributes.
 *
 * On failure @wq->wq_dev is left NULL.
 *
 * Return: 0 on success, -errno on failure.
 */
int workqueue_sysfs_register(struct workqueue_struct *wq)
{
        struct wq_device *wq_dev;
        int ret;

        /*
         * Adjusting max_active breaks ordering guarantee.  Disallow exposing
         * ordered workqueues.
         */
        if (WARN_ON(wq->flags & __WQ_ORDERED))
                return -EINVAL;

        wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
        if (!wq_dev)
                return -ENOMEM;

        /* the device lives on the workqueue bus and is named after @wq */
        wq_dev->wq = wq;
        wq_dev->dev.bus = &wq_subsys;
        wq_dev->dev.release = wq_device_release;
        dev_set_name(&wq_dev->dev, "%s", wq->name);

        /*
         * unbound_attrs are created separately.  Suppress uevent until
         * everything is ready.
         */
        dev_set_uevent_suppress(&wq_dev->dev, true);

        ret = device_register(&wq_dev->dev);
        if (ret) {
                /*
                 * Drop the reference instead of freeing wq_dev directly;
                 * wq_device_release() is the callback which kfree()s it.
                 */
                put_device(&wq_dev->dev);
                wq->wq_dev = NULL;
                return ret;
        }

        if (wq->flags & WQ_UNBOUND) {
                struct device_attribute *attr;

                /* create the unbound-only files; the array ends at a NULL name */
                for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
                        ret = device_create_file(&wq_dev->dev, attr);
                        if (ret) {
                                /* the device is registered by now, so unregister it */
                                device_unregister(&wq_dev->dev);
                                wq->wq_dev = NULL;
                                return ret;
                        }
                }
        }

        /* all files exist now; lift the suppression and announce the device */
        dev_set_uevent_suppress(&wq_dev->dev, false);
        kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
        return 0;
}

/**
 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
 * @wq: the workqueue to unregister
 *
 * If @wq is registered to sysfs by workqueue_sysfs_register(), clear
 * @wq->wq_dev and unregister the device.  Otherwise do nothing.
 */
static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
{
        struct wq_device *wq_dev = wq->wq_dev;

        if (wq_dev) {
                wq->wq_dev = NULL;
                device_unregister(&wq_dev->dev);
        }
}
#else        /* CONFIG_SYSFS */
/* Without CONFIG_SYSFS there is nothing to unregister. */
static void workqueue_sysfs_unregister(struct workqueue_struct *wq)        { }
#endif        /* CONFIG_SYSFS */

/*
 * Workqueue watchdog.
 *
 * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
 * flush dependency, a concurrency managed work item which stays RUNNING
 * indefinitely.  Workqueue stalls can be very difficult to debug as the
 * usual warning mechanisms don't trigger and internal workqueue state is
 * largely opaque.
 *
 * Workqueue watchdog monitors all worker pools periodically and dumps
 * state if some pools failed to make forward progress for a while where
 * forward progress is defined as the first item on ->worklist changing.
 *
 * This mechanism is controlled through the kernel parameter
 * "workqueue.watchdog_thresh" which can be updated at runtime through the
 * corresponding sysfs parameter file.
 */
#ifdef CONFIG_WQ_WATCHDOG

/*
 * Stall threshold; multiplied by HZ where it is used, so it is in seconds.
 * A value of 0 keeps the watchdog timer from being armed.
 */
static unsigned long wq_watchdog_thresh = 30;
/* timer which runs wq_watchdog_timer_fn() */
static struct timer_list wq_watchdog_timer;

/*
 * Timestamps (jiffies) of the last watchdog touch.  wq_watchdog_timer_fn()
 * consults the per-cpu one for pools with pool->cpu >= 0 and the global one
 * for the other pools.
 */
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;

/*
 * Show workers that might prevent the processing of pending work items.
 * The only candidates are CPU-bound workers in the running state.
 * Pending work items should be handled by another idle worker
 * in all other situations.
 *
 * Walks @pool's busy workers under pool->lock and dumps the task of each
 * one which is currently running.
 */
static void show_cpu_pool_hog(struct worker_pool *pool)
{
        unsigned long irq_flags;
        struct worker *worker;
        int bkt;

        raw_spin_lock_irqsave(&pool->lock, irq_flags);

        hash_for_each(pool->busy_hash, bkt, worker, hentry) {
                if (!task_is_running(worker->task))
                        continue;

                /*
                 * Defer printing to avoid deadlocks in console
                 * drivers that queue work while holding locks
                 * also taken in their write paths.
                 */
                printk_deferred_enter();

                pr_info("pool %d:\n", pool->id);
                sched_show_task(worker->task);

                printk_deferred_exit();
        }

        raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
}

/*
 * Print a header and then, for every pool flagged with cpu_stall, the
 * running workers of that pool.  Pools are walked under rcu_read_lock().
 */
static void show_cpu_pools_hogs(void)
{
        struct worker_pool *pool;
        int pi;

        pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");

        rcu_read_lock();

        for_each_pool(pool, pi) {
                if (!pool->cpu_stall)
                        continue;

                show_cpu_pool_hog(pool);
        }

        rcu_read_unlock();
}

/*
 * Reset the global and all per-cpu touched timestamps to the current
 * jiffies.  Every possible CPU is covered, not just the online ones.
 */
static void wq_watchdog_reset_touched(void)
{
        int cpu;

        wq_watchdog_touched = jiffies;
        for_each_possible_cpu(cpu)
                per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
}

/*
 * Watchdog timer callback.  Checks every pool with a non-empty worklist
 * and reports those whose latest timestamp (pool->watchdog_ts or the
 * relevant touched timestamp) is older than the threshold.  Afterwards the
 * touched timestamps are reset and the timer is re-armed.
 *
 * If the threshold reads as zero the function returns without re-arming.
 */
static void wq_watchdog_timer_fn(struct timer_list *unused)
{
        unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
        unsigned long now = jiffies;
        bool lockup_detected = false;
        bool cpu_pool_stall = false;
        struct worker_pool *pool;
        int pi;

        if (!thresh)
                return;

        rcu_read_lock();

        for_each_pool(pool, pi) {
                unsigned long pool_ts, touched, ts;

                pool->cpu_stall = false;

                /* a pool with nothing pending can't be stalled */
                if (list_empty(&pool->worklist))
                        continue;

                /*
                 * If a virtual machine is stopped by the host it can look to
                 * the watchdog like a stall.
                 */
                kvm_check_and_clear_guest_paused();

                /* get the latest of pool and touched timestamps */
                touched = pool->cpu >= 0 ?
                        READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)) :
                        READ_ONCE(wq_watchdog_touched);
                pool_ts = READ_ONCE(pool->watchdog_ts);
                ts = time_after(pool_ts, touched) ? pool_ts : touched;

                /* did we stall? */
                if (!time_after(now, ts + thresh))
                        continue;

                lockup_detected = true;

                /* only non-BH pools tied to a CPU get their hogs shown */
                if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) {
                        pool->cpu_stall = true;
                        cpu_pool_stall = true;
                }

                pr_emerg("BUG: workqueue lockup - pool");
                pr_cont_pool_info(pool);
                pr_cont(" stuck for %us!\n",
                        jiffies_to_msecs(now - pool_ts) / 1000);
        }

        rcu_read_unlock();

        if (lockup_detected)
                show_all_workqueues();

        if (cpu_pool_stall)
                show_cpu_pools_hogs();

        wq_watchdog_reset_touched();
        mod_timer(&wq_watchdog_timer, jiffies + thresh);
}

/*
 * Record that the workqueue watchdog has been touched.
 * @cpu: CPU whose per-cpu timestamp should be updated; a negative value
 *       updates only the global timestamp.
 *
 * wq_watchdog_timer_fn() reads both timestamps locklessly with READ_ONCE(),
 * so the stores here are marked with WRITE_ONCE() to pair with those reads
 * and keep the compiler from tearing or otherwise transforming them.
 */
notrace void wq_watchdog_touch(int cpu)
{
        if (cpu >= 0)
                WRITE_ONCE(per_cpu(wq_watchdog_touched_cpu, cpu), jiffies);

        WRITE_ONCE(wq_watchdog_touched, jiffies);
}

/*
 * Change the watchdog threshold to @thresh (scaled by HZ when arming the
 * timer).  The timer is always stopped first and is re-armed only if
 * @thresh is non-zero, so passing 0 leaves the watchdog disabled.
 */
static void wq_watchdog_set_thresh(unsigned long thresh)
{
        /*
         * Zero the threshold before waiting for the timer: a timer callback
         * which reads the threshold from now on returns without re-arming.
         */
        wq_watchdog_thresh = 0;
        del_timer_sync(&wq_watchdog_timer);

        if (thresh) {
                wq_watchdog_thresh = thresh;
                /* start the new period from fresh timestamps */
                wq_watchdog_reset_touched();
                mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
        }
}

/*
 * Setter of the "watchdog_thresh" parameter.  Parses @val as an unsigned
 * long.  When system_wq is not set yet, only the value is recorded;
 * otherwise the threshold is changed through wq_watchdog_set_thresh().
 *
 * Return: 0 on success, the kstrtoul() error otherwise.
 */
static int wq_watchdog_param_set_thresh(const char *val,
                                        const struct kernel_param *kp)
{
        unsigned long thresh;
        int ret;

        ret = kstrtoul(val, 0, &thresh);
        if (ret)
                return ret;

        if (!system_wq) {
                wq_watchdog_thresh = thresh;
                return 0;
        }

        wq_watchdog_set_thresh(thresh);
        return 0;
}

/*
 * Parameter ops for "watchdog_thresh": writes go through
 * wq_watchdog_param_set_thresh(), reads use the generic param_get_ulong().
 */
static const struct kernel_param_ops wq_watchdog_thresh_ops = {
        .set        = wq_watchdog_param_set_thresh,
        .get        = param_get_ulong,
};

/* backed by wq_watchdog_thresh */
module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
                0644);

/*
 * Set up the watchdog timer (deferrable, callback wq_watchdog_timer_fn) and
 * apply whatever threshold is currently stored in wq_watchdog_thresh.
 */
static void wq_watchdog_init(void)
{
        timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE);
        wq_watchdog_set_thresh(wq_watchdog_thresh);
}

#else        /* CONFIG_WQ_WATCHDOG */

/* No-op stand-in used when CONFIG_WQ_WATCHDOG is not enabled. */
static inline void wq_watchdog_init(void) { }

#endif        /* CONFIG_WQ_WATCHDOG */

/*
 * irq_work callback installed for the first (index 0) BH pool of each CPU by
 * workqueue_init_early(): raises TASKLET_SOFTIRQ. @irq_work is unused.
 */
static void bh_pool_kick_normal(struct irq_work *irq_work)
{
        raise_softirq_irqoff(TASKLET_SOFTIRQ);
}

/*
 * irq_work callback installed for the second (index 1, HIGHPRI_NICE_LEVEL) BH
 * pool of each CPU by workqueue_init_early(): raises HI_SOFTIRQ. @irq_work is
 * unused.
 */
static void bh_pool_kick_highpri(struct irq_work *irq_work)
{
        raise_softirq_irqoff(HI_SOFTIRQ);
}

/*
 * Narrow wq_unbound_cpumask to its intersection with @mask.
 *
 * @name: label of the restriction, used only in the warning message
 * @mask: cpumask to intersect with
 *
 * If the intersection would be empty, wq_unbound_cpumask is left untouched
 * and a warning is printed instead.
 */
static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask)
{
        if (cpumask_intersects(wq_unbound_cpumask, mask)) {
                cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask);
                return;
        }

        /* applying @mask would leave no CPU at all; ignore the request */
        pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n",
                cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask));
}

/*
 * Initialize @pool as a pool bound to a single CPU.
 *
 * @pool: worker pool to initialize
 * @cpu:  CPU the pool belongs to
 * @nice: nice value stored in the pool's attrs
 *
 * Any failure of the underlying init or ID assignment triggers BUG_ON().
 */
static void __init init_cpu_worker_pool(struct worker_pool *pool, int cpu, int nice)
{
        BUG_ON(init_worker_pool(pool));
        pool->cpu = cpu;
        /* both the effective and the pod cpumask contain only @cpu */
        cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
        cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu));
        pool->attrs->nice = nice;
        pool->attrs->affn_strict = true;
        pool->node = cpu_to_node(cpu);

        /* alloc pool ID */
        mutex_lock(&wq_pool_mutex);
        BUG_ON(worker_pool_assign_id(pool));
        mutex_unlock(&wq_pool_mutex);
}

/**
 * workqueue_init_early - early init for workqueue subsystem
 *
 * This is the first step of three-staged workqueue subsystem initialization and
 * invoked as soon as the bare basics - memory allocation, cpumasks and idr are
 * up. It sets up all the data structures and system workqueues and allows early
 * boot code to create workqueues and queue/cancel work items. Actual work item
 * execution starts only after kthreads can be created and scheduled right
 * before early initcalls.
 *
 * Every allocation failure in this function is fatal (BUG_ON() / SLAB_PANIC).
 */
void __init workqueue_init_early(void)
{
        struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
        /* nice levels of the standard pools, indexed in pool iteration order */
        int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
        /* irq_work callbacks of the BH pools, indexed the same way as std_nice[] */
        void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal,
                                                       bh_pool_kick_highpri };
        int i, cpu;

        BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));

        BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
        BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
        BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));

        /*
         * Start from all possible CPUs and narrow the unbound cpumask by the
         * housekeeping masks and, if one was given, the command line mask.
         */
        cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
        restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
        restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
        if (!cpumask_empty(&wq_cmdline_cpumask))
                restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);

        /* the boot-time result is also recorded as the requested mask */
        cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);

        pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);

        wq_update_pod_attrs_buf = alloc_workqueue_attrs();
        BUG_ON(!wq_update_pod_attrs_buf);

        /*
         * If nohz_full is enabled, set power efficient workqueue as unbound.
         * This allows workqueue items to be moved to HK CPUs.
         */
        if (housekeeping_enabled(HK_TYPE_TICK))
                wq_power_efficient = true;

        /* initialize WQ_AFFN_SYSTEM pods */
        pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
        pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);
        pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
        BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod);

        BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));

        /*
         * A single pod that spans every possible CPU and has no NUMA node.
         * cpu_pod[] was zero-allocated above, so every entry already maps to
         * pod 0.
         */
        pt->nr_pods = 1;
        cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);
        pt->pod_node[0] = NUMA_NO_NODE;
        pt->cpu_pod[0] = 0;

        /* initialize BH and CPU pools */
        for_each_possible_cpu(cpu) {
                struct worker_pool *pool;

                i = 0;
                for_each_bh_worker_pool(pool, cpu) {
                        init_cpu_worker_pool(pool, cpu, std_nice[i]);
                        pool->flags |= POOL_BH;
                        init_irq_work(bh_pool_irq_work(pool), irq_work_fns[i]);
                        i++;
                }

                i = 0;
                for_each_cpu_worker_pool(pool, cpu)
                        init_cpu_worker_pool(pool, cpu, std_nice[i++]);
        }

        /* create default unbound and ordered wq attrs */
        for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
                struct workqueue_attrs *attrs;

                BUG_ON(!(attrs = alloc_workqueue_attrs()));
                attrs->nice = std_nice[i];
                unbound_std_wq_attrs[i] = attrs;

                /*
                 * An ordered wq should have only one pwq as ordering is
                 * guaranteed by max_active which is enforced by pwqs.
                 */
                BUG_ON(!(attrs = alloc_workqueue_attrs()));
                attrs->nice = std_nice[i];
                attrs->ordered = true;
                ordered_wq_attrs[i] = attrs;
        }

        /* create the system-wide workqueues; all of them must exist */
        system_wq = alloc_workqueue("events", 0, 0);
        system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
        system_long_wq = alloc_workqueue("events_long", 0, 0);
        system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
                                            WQ_MAX_ACTIVE);
        system_freezable_wq = alloc_workqueue("events_freezable",
                                              WQ_FREEZABLE, 0);
        system_power_efficient_wq = alloc_workqueue("events_power_efficient",
                                              WQ_POWER_EFFICIENT, 0);
        system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
                                              WQ_FREEZABLE | WQ_POWER_EFFICIENT,
                                              0);
        system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
        system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
                                               WQ_BH | WQ_HIGHPRI, 0);
        BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
               !system_unbound_wq || !system_freezable_wq ||
               !system_power_efficient_wq ||
               !system_freezable_power_efficient_wq ||
               !system_bh_wq || !system_bh_highpri_wq);
}

/*
 * Create the pool_workqueue release kthread worker and, unless the user has
 * already chosen a value, pick the default CPU-intensive threshold
 * (wq_cpu_intensive_thresh_us) for this machine.
 */
static void __init wq_cpu_intensive_thresh_init(void)
{
        unsigned long bogomips;
        unsigned long scaled_us = 10 * USEC_PER_MSEC;

        pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
        BUG_ON(IS_ERR(pwq_release_worker));

        /* ULONG_MAX means "not set by the user"; anything else is kept as is */
        if (wq_cpu_intensive_thresh_us != ULONG_MAX)
                return;

        /*
         * The 10ms default comes from the observation that most modern (as
         * of 2023) processors get a lot done in 10ms while it stays just
         * below what most people perceive. The kernel also runs on far slower
         * CPUs, microcontrollers included, for which 10ms is way too low.
         *
         * So scale the threshold up, to at most 1 second, when BogoMIPS is
         * below 4000. This is by no means accurate, and it doesn't need to
         * be: the mechanism stays useful even at the fully scaled-up
         * threshold, and because reports usually apply to everyone, a few
         * machines running with longer thresholds won't significantly reduce
         * their usefulness.
         *
         * See init/calibrate.c for the lpj -> BogoMIPS calculation.
         */
        bogomips = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1);
        if (bogomips < 4000)
                scaled_us = min_t(unsigned long, scaled_us * 4000 / bogomips, USEC_PER_SEC);

        pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n",
                 loops_per_jiffy, bogomips, scaled_us);

        wq_cpu_intensive_thresh_us = scaled_us;
}

/**
 * workqueue_init - bring workqueue subsystem fully online
 *
 * This is the second step of three-staged workqueue subsystem initialization
 * and invoked as soon as kthreads can be created and scheduled. Workqueues have
 * been created and work items queued on them, but there are no kworkers
 * executing the work items yet. Populate the worker pools with the initial
 * workers and enable future kworker creations.
 *
 * Failure to create any of the initial workers is fatal (BUG_ON()); failure to
 * create a rescuer only triggers a warning.
 */
void __init workqueue_init(void)
{
        struct workqueue_struct *wq;
        struct worker_pool *pool;
        int cpu, bkt;

        wq_cpu_intensive_thresh_init();

        mutex_lock(&wq_pool_mutex);

        /*
         * Per-cpu pools created earlier could be missing node hint. Fix them
         * up. Also, create a rescuer for workqueues that requested it.
         */
        for_each_possible_cpu(cpu) {
                for_each_bh_worker_pool(pool, cpu)
                        pool->node = cpu_to_node(cpu);
                for_each_cpu_worker_pool(pool, cpu)
                        pool->node = cpu_to_node(cpu);
        }

        list_for_each_entry(wq, &workqueues, list) {
                WARN(init_rescuer(wq),
                     "workqueue: failed to create early rescuer for %s",
                     wq->name);
        }

        mutex_unlock(&wq_pool_mutex);

        /*
         * Create the initial workers. A BH pool has one pseudo worker that
         * represents the shared BH execution context and thus doesn't get
         * affected by hotplug events. Create the BH pseudo workers for all
         * possible CPUs here.
         */
        for_each_possible_cpu(cpu)
                for_each_bh_worker_pool(pool, cpu)
                        BUG_ON(!create_worker(pool));

        /* per-CPU pools of online CPUs: clear DISASSOCIATED, then add a worker */
        for_each_online_cpu(cpu) {
                for_each_cpu_worker_pool(pool, cpu) {
                        pool->flags &= ~POOL_DISASSOCIATED;
                        BUG_ON(!create_worker(pool));
                }
        }

        /* every unbound pool hashed so far gets its first worker as well */
        hash_for_each(unbound_pool_hash, bkt, pool, hash_node)
                BUG_ON(!create_worker(pool));

        /* mark the subsystem online before starting the watchdog */
        wq_online = true;
        wq_watchdog_init();
}

/*
 * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to
 * @cpus_share_pod(). Each subset of CPUs that share a pod is assigned a unique
 * and consecutive pod ID. The rest of @pt is initialized accordingly.
 *
 * @pt:             pod type to fill in
 * @cpus_share_pod: predicate telling whether two CPUs belong to the same pod
 *
 * Allocation failures are fatal (BUG_ON()).
 */
static void __init init_pod_type(struct wq_pod_type *pt,
                                 bool (*cpus_share_pod)(int, int))
{
        int cur, pre, cpu, pod;

        pt->nr_pods = 0;

        /* init @pt->cpu_pod[] according to @cpus_share_pod() */
        pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
        BUG_ON(!pt->cpu_pod);

        for_each_possible_cpu(cur) {
                for_each_possible_cpu(pre) {
                        /*
                         * Reached @cur itself without finding a CPU that
                         * shares its pod: open a new pod for it.
                         */
                        if (pre >= cur) {
                                pt->cpu_pod[cur] = pt->nr_pods++;
                                break;
                        }
                        /* join the pod of the first sharing CPU found */
                        if (cpus_share_pod(cur, pre)) {
                                pt->cpu_pod[cur] = pt->cpu_pod[pre];
                                break;
                        }
                }
        }

        /* init the rest to match @pt->cpu_pod[] */
        pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
        pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL);
        BUG_ON(!pt->pod_cpus || !pt->pod_node);

        for (pod = 0; pod < pt->nr_pods; pod++)
                BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL));

        /*
         * Add each CPU to its pod's mask. pod_node[] is overwritten per CPU,
         * so it ends up holding the node of the last CPU visited in the pod.
         */
        for_each_possible_cpu(cpu) {
                cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]);
                pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu);
        }
}

/*
 * Pod predicate for WQ_AFFN_CPU: no two CPUs ever share a pod, so it always
 * returns false and both arguments are ignored.
 */
static bool __init cpus_dont_share(int cpu0, int cpu1)
{
        return false;
}

/*
 * Pod predicate for WQ_AFFN_SMT: true when @cpu0 is set in the SMT mask of
 * @cpu1. Without CONFIG_SCHED_SMT no CPUs are considered to share.
 */
static bool __init cpus_share_smt(int cpu0, int cpu1)
{
        bool siblings = false;

#ifdef CONFIG_SCHED_SMT
        siblings = cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1));
#endif
        return siblings;
}

/* Pod predicate for WQ_AFFN_NUMA: true when both CPUs map to the same node. */
static bool __init cpus_share_numa(int cpu0, int cpu1)
{
        const int node0 = cpu_to_node(cpu0);
        const int node1 = cpu_to_node(cpu1);

        return node0 == node1;
}

/**
 * workqueue_init_topology - initialize CPU pods for unbound workqueues
 *
 * This is the third step of three-staged workqueue subsystem initialization and
 * invoked after SMP and topology information are fully initialized. It
 * initializes the unbound CPU pods accordingly.
 */
void __init workqueue_init_topology(void)
{
        struct workqueue_struct *wq;
        int cpu;

        /* build the pod layout of each affinity scope from its predicate */
        init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share);
        init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt);
        init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache);
        init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa);

        wq_topo_initialized = true;

        mutex_lock(&wq_pool_mutex);

        /*
         * Workqueues allocated earlier would have all CPUs sharing the default
         * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU
         * combinations to apply per-pod sharing.
         */
        list_for_each_entry(wq, &workqueues, list) {
                for_each_online_cpu(cpu)
                        wq_update_pod(wq, cpu, cpu, true);
                /* unbound workqueues also refresh max_active under wq->mutex */
                if (wq->flags & WQ_UNBOUND) {
                        mutex_lock(&wq->mutex);
                        wq_update_node_max_active(wq, -1);
                        mutex_unlock(&wq->mutex);
                }
        }

        mutex_unlock(&wq_pool_mutex);
}

/*
 * Print a deprecation warning about flushing system-wide workqueues, followed
 * by a stack dump of the current context. Exported for use by modules.
 */
void __warn_flushing_systemwide_wq(void)
{
        pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n");
        dump_stack();
}
EXPORT_SYMBOL(__warn_flushing_systemwide_wq);

/*
 * Handler for the "workqueue.unbound_cpus=" boot parameter.
 *
 * Parses @str as a CPU list into wq_cmdline_cpumask. When parsing fails the
 * mask is cleared and a warning is printed. Always returns 1.
 */
static int __init workqueue_unbound_cpus_setup(char *str)
{
        const int err = cpulist_parse(str, &wq_cmdline_cpumask);

        if (err < 0) {
                /* drop any partial result so the default is used */
                cpumask_clear(&wq_cmdline_cpumask);
                pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n");
        }

        return 1;
}
__setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup);































































































































































































































































































































    2 











    3 















    4 








    3 















    4 



    1 












    2 

























    1 


















































































































    3 

    4 



    1 
























    2 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
 * Copyright (C) 2017-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright Matt Mackall <mpm@selenic.com>, 2003, 2004, 2005
 * Copyright Theodore Ts'o, 1994, 1995, 1996, 1997, 1998, 1999. All rights reserved.
 *
 * This driver produces cryptographically secure pseudorandom data. It is divided
 * into roughly six sections, each with a section header:
 *
 *   - Initialization and readiness waiting.
 *   - Fast key erasure RNG, the "crng".
 *   - Entropy accumulation and extraction routines.
 *   - Entropy collection routines.
 *   - Userspace reader/writer interfaces.
 *   - Sysctl interface.
 *
 * The high level overview is that there is one input pool, into which
 * various pieces of data are hashed. Prior to initialization, some of that
 * data is then "credited" as having a certain number of bits of entropy.
 * When enough bits of entropy are available, the hash is finalized and
 * handed as a key to a stream cipher that expands it indefinitely for
 * various consumers. This key is periodically refreshed as the various
 * entropy collectors, described below, add data to the input pool.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/utsname.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/string.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/nodemask.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/workqueue.h>
#include <linux/irq.h>
#include <linux/ratelimit.h>
#include <linux/syscalls.h>
#include <linux/completion.h>
#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <linux/suspend.h>
#include <linux/siphash.h>
#include <linux/sched/isolation.h>
#include <crypto/chacha.h>
#include <crypto/blake2s.h>
#include <asm/archrandom.h>
#include <asm/processor.h>
#include <asm/irq.h>
#include <asm/irq_regs.h>
#include <asm/io.h>

/*********************************************************************
 *
 * Initialization and readiness waiting.
 *
 * Much of the RNG infrastructure is devoted to various dependencies
 * being able to wait until the RNG has collected enough entropy and
 * is ready for safe consumption.
 *
 *********************************************************************/

/*
 * crng_init is protected by base_crng.lock, and only increases
 * its value (from empty->early->ready).
 */
static enum {
        CRNG_EMPTY = 0, /* Little to no entropy collected */
        CRNG_EARLY = 1, /* At least POOL_EARLY_BITS collected */
        CRNG_READY = 2  /* Fully initialized with POOL_READY_BITS collected */
} crng_init __read_mostly = CRNG_EMPTY;
/* Static key enabled by crng_set_ready(). */
static DEFINE_STATIC_KEY_FALSE(crng_is_ready);
/*
 * True once the static key has been enabled or, failing that, once crng_init
 * has reached CRNG_READY.
 */
#define crng_ready() (static_branch_likely(&crng_is_ready) || crng_init >= CRNG_READY)
/* Various types of waiters for crng_init->CRNG_READY transition. */
static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait);
static struct fasync_struct *fasync;
static ATOMIC_NOTIFIER_HEAD(random_ready_notifier);

/*
 * Control how we warn userspace: a ratelimit state with an interval of HZ
 * jiffies and a burst of 3, and a "ratelimit_disable" parameter (mode 0644)
 * that defaults to on when CONFIG_WARN_ALL_UNSEEDED_RANDOM is enabled.
 */
static struct ratelimit_state urandom_warning =
        RATELIMIT_STATE_INIT_FLAGS("urandom_warning", HZ, 3, RATELIMIT_MSG_ON_RELEASE);
static int ratelimit_disable __read_mostly =
        IS_ENABLED(CONFIG_WARN_ALL_UNSEEDED_RANDOM);
module_param_named(ratelimit_disable, ratelimit_disable, int, 0644);
MODULE_PARM_DESC(ratelimit_disable, "Disable random ratelimit suppression");

/*
 * Report whether the input pool has been seeded, i.e. whether the RNG is
 * guaranteed to hand out cryptographically secure random numbers. This covers
 * the /dev/urandom device, get_random_bytes() and the
 * get_random_{u8,u16,u32,u64,long} family of functions.
 *
 * Returns: true if the input pool has been seeded.
 *          false if the input pool has not been seeded.
 */
bool rng_is_initialized(void)
{
        const bool seeded = crng_ready();

        return seeded;
}
EXPORT_SYMBOL(rng_is_initialized);

/*
 * Work function that enables the crng_is_ready static key, after which
 * crng_ready() is satisfied by the static branch alone. @work is unused.
 */
static void __cold crng_set_ready(struct work_struct *work)
{
        static_branch_enable(&crng_is_ready);
}

/* Used by wait_for_random_bytes(), and considered an entropy collector, below. */
static void try_to_generate_entropy(void);

/*
 * Block until the input pool has been seeded and the RNG is therefore
 * guaranteed to supply cryptographically secure random numbers. This covers
 * the /dev/urandom device, get_random_bytes() and the
 * get_random_{u8,u16,u32,u64,long} family of functions. Calling any of them
 * without calling this function first forfeits the guarantee of security.
 *
 * Returns: 0 if the input pool has been seeded.
 *          -ERESTARTSYS if the function was interrupted by a signal.
 */
int wait_for_random_bytes(void)
{
        int ret;

        for (;;) {
                if (crng_ready())
                        return 0;

                /* try to produce entropy ourselves, then wait up to HZ jiffies */
                try_to_generate_entropy();
                ret = wait_event_interruptible_timeout(crng_init_wait, crng_ready(), HZ);
                if (ret > 0)
                        return 0;
                if (ret < 0)
                        return ret;
                /* ret == 0: timed out and still not ready, go around again */
        }
}
EXPORT_SYMBOL(wait_for_random_bytes);

/*
 * Register a callback to be invoked once the crng is initialised; if it
 * already is, the callback is invoked immediately instead. Only use this if
 * you are absolutely sure it is required. Most users should instead be able
 * to test `rng_is_initialized()` on demand, or make use of
 * `get_random_bytes_wait()`.
 *
 * Returns 0 when the callback was run immediately, otherwise the result of
 * registering @nb on the notifier chain.
 */
int __cold execute_with_initialized_rng(struct notifier_block *nb)
{
        unsigned long irq_flags;
        int err;

        /* the readiness check and the registration happen under the chain lock */
        spin_lock_irqsave(&random_ready_notifier.lock, irq_flags);
        if (!crng_ready()) {
                err = raw_notifier_chain_register((struct raw_notifier_head *)&random_ready_notifier.head, nb);
        } else {
                nb->notifier_call(nb, 0, NULL);
                err = 0;
        }
        spin_unlock_irqrestore(&random_ready_notifier.lock, irq_flags);

        return err;
}

/*
 * Emit a deferred KERN_NOTICE naming the calling function, its return address
 * and the current crng_init value when CONFIG_WARN_ALL_UNSEEDED_RANDOM is
 * enabled and the crng is not ready yet.
 *
 * NOTE(review): this expands to a bare `if` statement rather than a
 * do { } while (0) block, so it must only be used as a stand-alone statement;
 * placing it directly before an `else` would bind that `else` to this `if`.
 */
#define warn_unseeded_randomness() \
        if (IS_ENABLED(CONFIG_WARN_ALL_UNSEEDED_RANDOM) && !crng_ready()) \
                printk_deferred(KERN_NOTICE "random: %s called from %pS with crng_init=%d\n", \
                                __func__, (void *)_RET_IP_, crng_init)


/*********************************************************************
 *
 * Fast key erasure RNG, the "crng".
 *
 * These functions expand entropy from the entropy extractor into
 * long streams for external consumption using the "fast key erasure"
 * RNG described at <https://blog.cr.yp.to/20170723-random.html>.
 *
 * There are a few exported interfaces for use by other drivers:
 *
 *        void get_random_bytes(void *buf, size_t len)
 *        u8 get_random_u8()
 *        u16 get_random_u16()
 *        u32 get_random_u32()
 *        u32 get_random_u32_below(u32 ceil)
 *        u32 get_random_u32_above(u32 floor)
 *        u32 get_random_u32_inclusive(u32 floor, u32 ceil)
 *        u64 get_random_u64()
 *        unsigned long get_random_long()
 *
 * These interfaces will return the requested number of random bytes
 * into the given buffer or as a return value. This is equivalent to
 * a read from /dev/urandom. The u8, u16, u32, u64, long family of
 * functions may be higher performance for one-off random integers,
 * because they do a bit of buffering and do not invoke reseeding
 * until the buffer is emptied.
 *
 *********************************************************************/

/* Reseed intervals in jiffies: one second at the start, one minute normally. */
enum {
        CRNG_RESEED_START_INTERVAL = HZ,
        CRNG_RESEED_INTERVAL = 60 * HZ
};

/*
 * The base crng: the current ChaCha key, a generation counter that
 * crng_reseed() advances on every reseed, and the spinlock guarding both.
 */
static struct {
        u8 key[CHACHA_KEY_SIZE] __aligned(__alignof__(long));
        unsigned long generation;
        spinlock_t lock;
} base_crng = {
        .lock = __SPIN_LOCK_UNLOCKED(base_crng.lock)
};

/* Per-CPU crng state: a key, a generation counter and a local lock. */
struct crng {
        u8 key[CHACHA_KEY_SIZE];
        unsigned long generation;
        local_lock_t lock;
};

/*
 * The per-CPU generation starts at ULONG_MAX, a value crng_reseed() never
 * assigns to base_crng.generation.
 */
static DEFINE_PER_CPU(struct crng, crngs) = {
        .generation = ULONG_MAX,
        .lock = INIT_LOCAL_LOCK(crngs.lock),
};

/*
 * Return the delay, in jiffies, until the next reseeding. Normally that is
 * CRNG_RESEED_INTERVAL; during early boot it is proportional to the uptime
 * instead, with CRNG_RESEED_START_INTERVAL as the lower bound.
 */
static unsigned int crng_reseed_interval(void)
{
        static bool early_boot = true;
        time64_t uptime;

        if (likely(!READ_ONCE(early_boot)))
                return CRNG_RESEED_INTERVAL;

        uptime = ktime_get_seconds();
        if (uptime < CRNG_RESEED_INTERVAL / HZ * 2)
                return max_t(unsigned int, CRNG_RESEED_START_INTERVAL,
                             (unsigned int)uptime / 2 * HZ);

        /* uptime has reached twice the regular interval: early boot is over */
        WRITE_ONCE(early_boot, false);
        return CRNG_RESEED_INTERVAL;
}

/*
 * Forward declaration; the definition lives with the entropy accumulation
 * and extraction routines further down. crng_reseed() and crng_make_state()
 * use it to extract a new seed from the input pool.
 */
static void extract_entropy(void *buf, size_t len);

/*
 * Rekey base_crng with a fresh key extracted from the input pool and arm
 * the next periodic reseed. The work argument is not used by the body;
 * direct callers in this file pass NULL.
 */
static void crng_reseed(struct work_struct *work)
{
        static DECLARE_DELAYED_WORK(next_reseed, crng_reseed);
        u8 fresh_key[CHACHA_KEY_SIZE];
        unsigned long irqflags;
        unsigned long gen;

        /* Arm the next reseed before anything else, so that it fires sooner rather than later. */
        if (likely(system_unbound_wq))
                queue_delayed_work(system_unbound_wq, &next_reseed, crng_reseed_interval());

        extract_entropy(fresh_key, sizeof(fresh_key));

        spin_lock_irqsave(&base_crng.lock, irqflags);

        /* Replace the old base key wholesale. */
        memcpy(base_crng.key, fresh_key, sizeof(base_crng.key));

        /*
         * Advance the generation, stepping over ULONG_MAX: the per-cpu
         * crngs start out at ULONG_MAX, so keeping the base generation
         * away from it forces CPUs that come online to initialize.
         */
        gen = base_crng.generation + 1;
        if (gen == ULONG_MAX)
                ++gen;
        WRITE_ONCE(base_crng.generation, gen);

        if (!static_branch_likely(&crng_is_ready))
                crng_init = CRNG_READY;

        spin_unlock_irqrestore(&base_crng.lock, irqflags);

        /* Do not leave a copy of the new key on the stack. */
        memzero_explicit(fresh_key, sizeof(fresh_key));
}

/*
 * This generates a ChaCha block using the provided key, and then
 * immediately overwrites that key with half the block. It returns
 * the resultant ChaCha state to the user, along with the second
 * half of the block containing 32 bytes of random data that may
 * be used; random_data_len may not be greater than 32.
 *
 * The returned ChaCha state contains within it a copy of the old
 * key value, at index 4, so the state should always be zeroed out
 * immediately after using in order to maintain forward secrecy.
 * If the state cannot be erased in a timely manner, then it is
 * safer to set the random_data parameter to &chacha_state[4] so
 * that this function overwrites it before returning.
 *
 * @key:             in/out; consumed as the ChaCha key, then overwritten
 *                   with the first CHACHA_KEY_SIZE bytes of the block.
 * @chacha_state:    out; the state that was used to generate the block.
 * @random_data:     out; receives random_data_len bytes taken from the
 *                   block after the first CHACHA_KEY_SIZE bytes.
 * @random_data_len: number of bytes to copy out; BUG()s if above 32.
 */
static void crng_fast_key_erasure(u8 key[CHACHA_KEY_SIZE],
                                  u32 chacha_state[CHACHA_STATE_WORDS],
                                  u8 *random_data, size_t random_data_len)
{
        u8 first_block[CHACHA_BLOCK_SIZE];

        BUG_ON(random_data_len > 32);

        /* Build the state: constants, the current key at word 4, words 12..15 zeroed. */
        chacha_init_consts(chacha_state);
        memcpy(&chacha_state[4], key, CHACHA_KEY_SIZE);
        memset(&chacha_state[12], 0, sizeof(u32) * 4);
        chacha20_block(chacha_state, first_block);

        /* The start of the block becomes the next key; what follows is handed out. */
        memcpy(key, first_block, CHACHA_KEY_SIZE);
        memcpy(random_data, first_block + CHACHA_KEY_SIZE, random_data_len);
        /* The block holds both the new key and the output, so wipe it. */
        memzero_explicit(first_block, sizeof(first_block));
}

/*
 * This function returns a ChaCha state that you may use for generating
 * random data. It also returns up to 32 bytes on its own of random data
 * that may be used; random_data_len may not be greater than 32.
 *
 * @chacha_state:    out; filled in by crng_fast_key_erasure().
 * @random_data:     out; receives random_data_len bytes of random data.
 * @random_data_len: number of bytes wanted in random_data; BUG()s if
 *                   above 32.
 *
 * Before the crng is ready, the state is derived straight from base_crng
 * under base_crng.lock. Afterwards it is derived from this CPU's crng
 * under the crngs local lock, rekeying that crng from base_crng first if
 * its generation is out of date.
 */
static void crng_make_state(u32 chacha_state[CHACHA_STATE_WORDS],
                            u8 *random_data, size_t random_data_len)
{
        unsigned long flags;
        struct crng *crng;

        BUG_ON(random_data_len > 32);

        /*
         * For the fast path, we check whether we're ready, unlocked first, and
         * then re-check once locked later. In the case where we're really not
         * ready, we do fast key erasure with the base_crng directly, extracting
         * when crng_init is CRNG_EMPTY.
         */
        if (!crng_ready()) {
                bool ready;

                spin_lock_irqsave(&base_crng.lock, flags);
                ready = crng_ready();
                if (!ready) {
                        if (crng_init == CRNG_EMPTY)
                                extract_entropy(base_crng.key, sizeof(base_crng.key));
                        crng_fast_key_erasure(base_crng.key, chacha_state,
                                              random_data, random_data_len);
                }
                spin_unlock_irqrestore(&base_crng.lock, flags);
                /* Output was already produced from base_crng above. */
                if (!ready)
                        return;
        }

        local_lock_irqsave(&crngs.lock, flags);
        crng = raw_cpu_ptr(&crngs);

        /*
         * If our per-cpu crng is older than the base_crng, then it means
         * somebody reseeded the base_crng. In that case, we do fast key
         * erasure on the base_crng, and use its output as the new key
         * for our per-cpu crng. This brings us up to date with base_crng.
         */
        if (unlikely(crng->generation != READ_ONCE(base_crng.generation))) {
                /* Interrupts are already off via local_lock_irqsave(), hence plain spin_lock(). */
                spin_lock(&base_crng.lock);
                crng_fast_key_erasure(base_crng.key, chacha_state,
                                      crng->key, sizeof(crng->key));
                crng->generation = base_crng.generation;
                spin_unlock(&base_crng.lock);
        }

        /*
         * Finally, when we've made it this far, our per-cpu crng has an up
         * to date key, and we can do fast key erasure with it to produce
         * some random data and a ChaCha state for the caller. All other
         * branches of this function are "unlikely", so most of the time we
         * should wind up here immediately.
         */
        crng_fast_key_erasure(crng->key, chacha_state, random_data, random_data_len);
        local_unlock_irqrestore(&crngs.lock, flags);
}

/*
 * Fill buf with len random bytes. Up to the first 32 bytes come straight
 * out of crng_make_state(); anything beyond that is generated block by
 * block from the ChaCha state it hands back. The state is wiped before
 * returning.
 */
static void _get_random_bytes(void *buf, size_t len)
{
        u32 chacha_state[CHACHA_STATE_WORDS];
        u8 tail[CHACHA_BLOCK_SIZE];
        size_t head_len;

        if (!len)
                return;

        head_len = min_t(size_t, 32, len);
        crng_make_state(chacha_state, buf, head_len);
        buf += head_len;
        len -= head_len;

        /* Whole blocks are generated directly into the caller's buffer. */
        for (; len >= CHACHA_BLOCK_SIZE;
             len -= CHACHA_BLOCK_SIZE, buf += CHACHA_BLOCK_SIZE) {
                chacha20_block(chacha_state, buf);
                /* Carry into the next state word when word 12 wraps to zero. */
                if (unlikely(chacha_state[12] == 0))
                        ++chacha_state[13];
        }

        /* A trailing partial block goes through a scratch buffer that is then wiped. */
        if (len) {
                chacha20_block(chacha_state, tail);
                memcpy(buf, tail, len);
                memzero_explicit(tail, sizeof(tail));
        }

        memzero_explicit(chacha_state, sizeof(chacha_state));
}

/*
 * This returns random bytes in arbitrary quantities. The quality of the
 * random bytes is as good as /dev/urandom. In order to ensure that the
 * randomness provided by this function is okay, the function
 * wait_for_random_bytes() should be called and return 0 at least once
 * at any point prior.
 *
 * @buf: destination buffer.
 * @len: number of bytes to write to buf; 0 is a no-op.
 */
void get_random_bytes(void *buf, size_t len)
{
        warn_unseeded_randomness();
        _get_random_bytes(buf, len);
}
EXPORT_SYMBOL(get_random_bytes);

/*
 * Copy random bytes to the destination described by iter until it is
 * full, a copy comes up short, or a signal is pending at a page boundary.
 * Returns the number of bytes copied, 0 for an empty iter, or -EFAULT if
 * nothing could be copied.
 */
static ssize_t get_random_bytes_user(struct iov_iter *iter)
{
        u32 chacha_state[CHACHA_STATE_WORDS];
        u8 block[CHACHA_BLOCK_SIZE];
        size_t total = 0;

        if (unlikely(!iov_iter_count(iter)))
                return 0;

        /*
         * Have the key slot of the state (index 4) overwritten with random
         * bytes right away: copy_to_iter() below may sleep indefinitely at
         * userspace's whim, and forward secrecy must hold even then.
         */
        crng_make_state(chacha_state, (u8 *)&chacha_state[4], CHACHA_KEY_SIZE);

        /*
         * Short reads (len <= 32) never need the state afterwards, so those
         * random bytes can be handed to the user directly.
         */
        if (iov_iter_count(iter) <= CHACHA_KEY_SIZE) {
                total = copy_to_iter(&chacha_state[4], CHACHA_KEY_SIZE, iter);
                goto out_zero_chacha;
        }

        /* The page-boundary check below relies on blocks tiling a page exactly. */
        BUILD_BUG_ON(PAGE_SIZE % sizeof(block) != 0);

        for (;;) {
                size_t n;

                chacha20_block(chacha_state, block);
                if (unlikely(chacha_state[12] == 0))
                        ++chacha_state[13];

                n = copy_to_iter(block, sizeof(block), iter);
                total += n;
                if (!iov_iter_count(iter) || n != sizeof(block))
                        break;

                /* Only look for signals and reschedule once per page of output. */
                if (total % PAGE_SIZE != 0)
                        continue;
                if (signal_pending(current))
                        break;
                cond_resched();
        }

        memzero_explicit(block, sizeof(block));
out_zero_chacha:
        memzero_explicit(chacha_state, sizeof(chacha_state));
        return total ? total : -EFAULT;
}

/*
 * Batched entropy returns random integers. The quality of the random
 * number is as good as /dev/urandom. In order to ensure that the randomness
 * provided by this function is okay, the function wait_for_random_bytes()
 * should be called and return 0 at least once at any point prior.
 *
 * DEFINE_BATCHED_ENTROPY(type) generates, for the given integer type:
 *   - struct batch_<type>, a buffer of random values with a lock, the
 *     base_crng generation it was filled under, and a read position;
 *   - a per-cpu instance batched_entropy_<type>, whose position starts at
 *     UINT_MAX so that the first use refills it;
 *   - the exported accessor get_random_<type>().
 *
 * The accessor bypasses the batch entirely while the crng is not ready.
 * Otherwise it refills the batch when it is exhausted or when
 * base_crng.generation has changed since the last fill, and zeroes each
 * value in the batch as it is handed out.
 */

#define DEFINE_BATCHED_ENTROPY(type)                                                \
struct batch_ ##type {                                                                \
        /*                                                                        \
         * We make this 1.5x a ChaCha block, so that we get the                        \
         * remaining 32 bytes from fast key erasure, plus one full                \
         * block from the detached ChaCha state. We can increase                \
         * the size of this later if needed so long as we keep the                \
         * formula of (integer_blocks + 0.5) * CHACHA_BLOCK_SIZE.                \
         */                                                                        \
        type entropy[CHACHA_BLOCK_SIZE * 3 / (2 * sizeof(type))];                \
        local_lock_t lock;                                                        \
        unsigned long generation;                                                \
        unsigned int position;                                                        \
};                                                                                \
                                                                                \
static DEFINE_PER_CPU(struct batch_ ##type, batched_entropy_ ##type) = {        \
        .lock = INIT_LOCAL_LOCK(batched_entropy_ ##type.lock),                        \
        .position = UINT_MAX                                                        \
};                                                                                \
                                                                                \
type get_random_ ##type(void)                                                        \
{                                                                                \
        type ret;                                                                \
        unsigned long flags;                                                        \
        struct batch_ ##type *batch;                                                \
        unsigned long next_gen;                                                        \
                                                                                \
        warn_unseeded_randomness();                                                \
                                                                                \
        if  (!crng_ready()) {                                                        \
                _get_random_bytes(&ret, sizeof(ret));                                \
                return ret;                                                        \
        }                                                                        \
                                                                                \
        local_lock_irqsave(&batched_entropy_ ##type.lock, flags);                \
        batch = raw_cpu_ptr(&batched_entropy_##type);                                \
                                                                                \
        next_gen = READ_ONCE(base_crng.generation);                                \
        if (batch->position >= ARRAY_SIZE(batch->entropy) ||                        \
            next_gen != batch->generation) {                                        \
                _get_random_bytes(batch->entropy, sizeof(batch->entropy));        \
                batch->position = 0;                                                \
                batch->generation = next_gen;                                        \
        }                                                                        \
                                                                                \
        ret = batch->entropy[batch->position];                                        \
        batch->entropy[batch->position] = 0;                                        \
        ++batch->position;                                                        \
        local_unlock_irqrestore(&batched_entropy_ ##type.lock, flags);                \
        return ret;                                                                \
}                                                                                \
EXPORT_SYMBOL(get_random_ ##type);

/* Instantiate get_random_u8(), get_random_u16(), get_random_u32() and get_random_u64(). */
DEFINE_BATCHED_ENTROPY(u8)
DEFINE_BATCHED_ENTROPY(u16)
DEFINE_BATCHED_ENTROPY(u32)
DEFINE_BATCHED_ENTROPY(u64)

/*
 * Return a uniformly distributed integer in [0, ceil) for a ceil that is
 * not a compile-time constant. A ceil of 0 yields a plain get_random_u32().
 *
 * This is the slow path for variable ceil, yet it is still fast most of
 * the time: it does a traditional reciprocal multiplication and
 * opportunistically compares the lower half of the product to ceil
 * itself. Only if that fails does it compute the exact rejection bound
 * and redraw samples whose lower half falls below it. `-ceil % ceil` is
 * analogous to `2^32 % ceil`, but is computable in 32 bits.
 */
u32 __get_random_u32_below(u32 ceil)
{
        u32 sample = get_random_u32();
        u32 reject_below;
        u64 product;

        /*
         * Strictly speaking ceil == 0 is undefined, and the constant variant
         * in the header build-bugs on it. For the non-constant case it is
         * convenient to fall through to a straight get_random_u32(), so
         * that get_random_u32_inclusive() can cover its whole range without
         * undefined behavior.
         */
        if (unlikely(ceil == 0))
                return sample;

        product = (u64)ceil * sample;
        if (likely((u32)product >= ceil))
                return product >> 32;

        reject_below = -ceil % ceil;
        while (unlikely((u32)product < reject_below))
                product = (u64)ceil * get_random_u32();

        return product >> 32;
}
EXPORT_SYMBOL(__get_random_u32_below);

#ifdef CONFIG_SMP
/*
 * CPU hotplug callback for a CPU that is coming up. It is registered as
 * CPUHP_RANDOM_PREPARE, which comes before CPUHP_WORKQUEUE_PREP.
 * Always returns 0.
 */
int __cold random_prepare_cpu(unsigned int cpu)
{
        /*
         * Anything this CPU cached before it went away is stale. Mark every
         * batch as exhausted and the per-cpu crng as out of date, so that
         * the CPU serves fresh randomness once it is back online.
         */
        per_cpu_ptr(&batched_entropy_u64, cpu)->position = UINT_MAX;
        per_cpu_ptr(&batched_entropy_u32, cpu)->position = UINT_MAX;
        per_cpu_ptr(&batched_entropy_u16, cpu)->position = UINT_MAX;
        per_cpu_ptr(&batched_entropy_u8, cpu)->position = UINT_MAX;
        per_cpu_ptr(&crngs, cpu)->generation = ULONG_MAX;

        return 0;
}
#endif


/**********************************************************************
 *
 * Entropy accumulation and extraction routines.
 *
 * Callers may add entropy via:
 *
 *     static void mix_pool_bytes(const void *buf, size_t len)
 *
 * After which, if added entropy should be credited:
 *
 *     static void credit_init_bits(size_t bits)
 *
 * Finally, extract entropy via:
 *
 *     static void extract_entropy(void *buf, size_t len)
 *
 **********************************************************************/

/* Entropy accounting thresholds for the input pool, in bits. */
enum {
        /* Size of the pool: one BLAKE2s digest. */
        POOL_BITS = BLAKE2S_HASH_SIZE * 8,
        /* Credited bits at which crng_init->CRNG_READY. */
        POOL_READY_BITS = POOL_BITS,
        /* Credited bits at which crng_init->CRNG_EARLY. */
        POOL_EARLY_BITS = POOL_READY_BITS / 2,
};

/*
 * The input pool: a running BLAKE2s hash that all collected entropy is
 * absorbed into, together with a count of how many bits have been
 * credited so far.
 */
static struct {
        struct blake2s_state hash;
        spinlock_t lock;
        unsigned int init_bits; /* Credited bits, capped at POOL_BITS by _credit_init_bits(). */
} input_pool = {
        /*
         * NOTE(review): presumably the unkeyed BLAKE2s initial state for a
         * BLAKE2S_HASH_SIZE-byte digest, spelled out so that the pool is
         * usable without a runtime init call - confirm against blake2s_init().
         */
        .hash.h = { BLAKE2S_IV0 ^ (0x01010000 | BLAKE2S_HASH_SIZE),
                    BLAKE2S_IV1, BLAKE2S_IV2, BLAKE2S_IV3, BLAKE2S_IV4,
                    BLAKE2S_IV5, BLAKE2S_IV6, BLAKE2S_IV7 },
        .hash.outlen = BLAKE2S_HASH_SIZE,
        .lock = __SPIN_LOCK_UNLOCKED(input_pool.lock),
};

/*
 * Absorb len bytes from buf into the input pool hash. This takes no lock
 * itself: callers either hold input_pool.lock or, like the __init
 * functions in this file, run before interrupts are enabled.
 */
static void _mix_pool_bytes(const void *buf, size_t len)
{
        blake2s_update(&input_pool.hash, buf, len);
}

/*
 * Locked entry point for feeding bytes into the input pool. Only the pool
 * contents change; the initialization bit counter is left alone, so a
 * caller that wants the input credited must call credit_init_bits itself.
 */
static void mix_pool_bytes(const void *buf, size_t len)
{
        unsigned long irqflags;

        spin_lock_irqsave(&input_pool.lock, irqflags);
        blake2s_update(&input_pool.hash, buf, len);
        spin_unlock_irqrestore(&input_pool.lock, irqflags);
}

/*
 * This is an HKDF-like construction for using the hashed collected entropy
 * as a PRF key, that's then expanded block-by-block.
 *
 * @buf: destination for the extracted bytes.
 * @len: number of bytes to produce.
 *
 * Finalizes the input pool hash into a seed, immediately rekeys the pool
 * from that seed so that the pool never goes back to its old state, and
 * then expands the seed into buf in chunks of at most BLAKE2S_HASH_SIZE
 * bytes. The seed and all intermediate material are wiped before
 * returning.
 */
static void extract_entropy(void *buf, size_t len)
{
        unsigned long flags;
        u8 seed[BLAKE2S_HASH_SIZE], next_key[BLAKE2S_HASH_SIZE];
        struct {
                unsigned long rdseed[32 / sizeof(long)];
                size_t counter;
        } block;
        size_t i, longs;

        /*
         * Fill block.rdseed, preferring the arch seed source, then the arch
         * random source, and falling back to random_get_entropy() one long
         * at a time when neither delivers anything.
         */
        for (i = 0; i < ARRAY_SIZE(block.rdseed);) {
                longs = arch_get_random_seed_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i);
                if (longs) {
                        i += longs;
                        continue;
                }
                longs = arch_get_random_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i);
                if (longs) {
                        i += longs;
                        continue;
                }
                block.rdseed[i++] = random_get_entropy();
        }

        spin_lock_irqsave(&input_pool.lock, flags);

        /* seed = HASHPRF(last_key, entropy_input) */
        blake2s_final(&input_pool.hash, seed);

        /* next_key = HASHPRF(seed, RDSEED || 0) */
        block.counter = 0;
        blake2s(next_key, (u8 *)&block, seed, sizeof(next_key), sizeof(block), sizeof(seed));
        blake2s_init_key(&input_pool.hash, BLAKE2S_HASH_SIZE, next_key, sizeof(next_key));

        spin_unlock_irqrestore(&input_pool.lock, flags);
        memzero_explicit(next_key, sizeof(next_key));

        /* The pool is already rekeyed; the expansion below runs without the lock. */
        while (len) {
                i = min_t(size_t, len, BLAKE2S_HASH_SIZE);
                /* output = HASHPRF(seed, RDSEED || ++counter) */
                ++block.counter;
                blake2s(buf, (u8 *)&block, seed, i, sizeof(block), sizeof(seed));
                len -= i;
                buf += i;
        }

        memzero_explicit(seed, sizeof(seed));
        memzero_explicit(&block, sizeof(block));
}

/*
 * Credit @bits of entropy towards crng initialization, but only while the
 * crng is not yet ready. The body is wrapped in do { } while (0) so that
 * the macro expands to exactly one statement: with a bare "if", an "else"
 * following a call site would silently bind to the macro's hidden "if".
 */
#define credit_init_bits(bits) do { if (!crng_ready()) _credit_init_bits(bits); } while (0)

/*
 * Credit bits of entropy to the input pool's init_bits counter, saturating
 * at POOL_BITS, and act on the threshold this particular credit crossed:
 *
 *   - crossing POOL_READY_BITS reseeds the crng (which marks it ready),
 *     then notifies and wakes everything waiting for that;
 *   - crossing only POOL_EARLY_BITS keys base_crng from the pool and moves
 *     crng_init from CRNG_EMPTY to CRNG_EARLY.
 *
 * A credit of zero bits is a no-op.
 */
static void __cold _credit_init_bits(size_t bits)
{
        static DECLARE_WORK(set_ready, crng_set_ready);
        unsigned int new, orig, add;
        unsigned long flags;

        if (!bits)
                return;

        add = min_t(size_t, bits, POOL_BITS);

        /* Lock-free saturating add; orig/new end up describing this caller's own transition. */
        orig = READ_ONCE(input_pool.init_bits);
        do {
                new = min_t(unsigned int, POOL_BITS, orig + add);
        } while (!try_cmpxchg(&input_pool.init_bits, &orig, new));

        if (orig < POOL_READY_BITS && new >= POOL_READY_BITS) {
                crng_reseed(NULL); /* Sets crng_init to CRNG_READY under base_crng.lock. */
                /* Only queue the work once static keys and the workqueue exist. */
                if (static_key_initialized && system_unbound_wq)
                        queue_work(system_unbound_wq, &set_ready);
                atomic_notifier_call_chain(&random_ready_notifier, 0, NULL);
                wake_up_interruptible(&crng_init_wait);
                kill_fasync(&fasync, SIGIO, POLL_IN);
                pr_notice("crng init done\n");
                if (urandom_warning.missed)
                        pr_notice("%d urandom warning(s) missed due to ratelimiting\n",
                                  urandom_warning.missed);
        } else if (orig < POOL_EARLY_BITS && new >= POOL_EARLY_BITS) {
                spin_lock_irqsave(&base_crng.lock, flags);
                /* Check if crng_init is CRNG_EMPTY, to avoid race with crng_reseed(). */
                if (crng_init == CRNG_EMPTY) {
                        extract_entropy(base_crng.key, sizeof(base_crng.key));
                        crng_init = CRNG_EARLY;
                }
                spin_unlock_irqrestore(&base_crng.lock, flags);
        }
}


/**********************************************************************
 *
 * Entropy collection routines.
 *
 * The following exported functions are used for pushing entropy into
 * the above entropy accumulation routines:
 *
 *        void add_device_randomness(const void *buf, size_t len);
 *        void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after);
 *        void add_bootloader_randomness(const void *buf, size_t len);
 *        void add_vmfork_randomness(const void *unique_vm_id, size_t len);
 *        void add_interrupt_randomness(int irq);
 *        void add_input_randomness(unsigned int type, unsigned int code, unsigned int value);
 *        void add_disk_randomness(struct gendisk *disk);
 *
 * add_device_randomness() adds data to the input pool that
 * is likely to differ between two devices (or possibly even per boot).
 * This would be things like MAC addresses or serial numbers, or the
 * read-out of the RTC. This does *not* credit any actual entropy to
 * the pool, but it initializes the pool to different values for devices
 * that might otherwise be identical and have very little entropy
 * available to them (particularly common in the embedded world).
 *
 * add_hwgenerator_randomness() is for true hardware RNGs, and will credit
 * entropy as specified by the caller. If the entropy pool is full it will
 * block until more entropy is needed.
 *
 * add_bootloader_randomness() is called by bootloader drivers, such as EFI
 * and device tree, and credits its input depending on whether or not the
 * command line option 'random.trust_bootloader' is set.
 *
 * add_vmfork_randomness() adds a unique (but not necessarily secret) ID
 * representing the current instance of a VM to the pool, without crediting,
 * and then force-reseeds the crng so that it takes effect immediately.
 *
 * add_interrupt_randomness() uses the interrupt timing as random
 * inputs to the entropy pool. Using the cycle counters and the irq source
 * as inputs, it feeds the input pool roughly once a second or after 64
 * interrupts, crediting 1 bit of entropy for whichever comes first.
 *
 * add_input_randomness() uses the input layer interrupt timing, as well
 * as the event type information from the hardware.
 *
 * add_disk_randomness() uses what amounts to the seek time of block
 * layer request events, on a per-disk_devt basis, as input to the
 * entropy pool. Note that high-speed solid state drives with very low
 * seek times do not make for good sources of entropy, as their seek
 * times are usually fairly consistent.
 *
 * The last two routines try to estimate how many bits of entropy
 * to credit. They do this by keeping track of the first and second
 * order deltas of the event timings.
 *
 **********************************************************************/

/*
 * Boot-time policy switches, both defaulting to true. trust_cpu decides
 * whether random_init_early() credits the bits obtained from the arch
 * random sources; trust_bootloader decides whether
 * add_bootloader_randomness() credits its input. Either way the data is
 * still mixed into the pool.
 */
static bool trust_cpu __initdata = true;
static bool trust_bootloader __initdata = true;
/* Parse the boolean value of "random.trust_cpu"; returns kstrtobool()'s result. */
static int __init parse_trust_cpu(char *arg)
{
        return kstrtobool(arg, &trust_cpu);
}
/* Parse the boolean value of "random.trust_bootloader"; returns kstrtobool()'s result. */
static int __init parse_trust_bootloader(char *arg)
{
        return kstrtobool(arg, &trust_bootloader);
}
early_param("random.trust_cpu", parse_trust_cpu);
early_param("random.trust_bootloader", parse_trust_bootloader);

static int random_pm_notification(struct notifier_block *nb, unsigned long action, void *data)
{
        unsigned long flags, entropy = random_get_entropy();

        /*
         * Encode a representation of how long the system has been suspended,
         * in a way that is distinct from prior system suspends.
         */
        ktime_t stamps[] = { ktime_get(), ktime_get_boottime(), ktime_get_real() };

        spin_lock_irqsave(&input_pool.lock, flags);
        _mix_pool_bytes(&action, sizeof(action));
        _mix_pool_bytes(stamps, sizeof(stamps));
        _mix_pool_bytes(&entropy, sizeof(entropy));
        spin_unlock_irqrestore(&input_pool.lock, flags);

        if (crng_ready() && (action == PM_RESTORE_PREPARE ||
            (action == PM_POST_SUSPEND && !IS_ENABLED(CONFIG_PM_AUTOSLEEP) &&
             !IS_ENABLED(CONFIG_PM_USERSPACE_AUTOSLEEP)))) {
                crng_reseed(NULL);
                pr_notice("crng reseeded on system resumption\n");
        }
        return 0;
}

/* PM notifier block; random_init() registers it with register_pm_notifier(). */
static struct notifier_block pm_notifier = { .notifier_call = random_pm_notification };

/*
 * First stage of RNG initialization. It runs extremely early: arch
 * randomness is available, but time keeping is not, and interrupts are
 * not yet enabled.
 *
 * It mixes in the latent entropy seed (when the plugin is in use), up to
 * one BLAKE2s block worth of arch-provided longs, the utsname and the
 * kernel command line. Afterwards it either reseeds, if the crng is
 * already ready, or credits the arch-provided bits if trust_cpu is set.
 */
void __init random_init_early(const char *command_line)
{
        unsigned long entropy[BLAKE2S_BLOCK_SIZE / sizeof(long)];
        size_t filled = 0, got, arch_bits = sizeof(entropy) * 8;

#if defined(LATENT_ENTROPY_PLUGIN)
        static const u8 compiletime_seed[BLAKE2S_BLOCK_SIZE] __initconst __latent_entropy;
        _mix_pool_bytes(compiletime_seed, sizeof(compiletime_seed));
#endif

        while (filled < ARRAY_SIZE(entropy)) {
                /* Prefer the arch seed source; fall back to the arch random source. */
                got = arch_get_random_seed_longs(entropy, ARRAY_SIZE(entropy) - filled);
                if (!got)
                        got = arch_get_random_longs(entropy, ARRAY_SIZE(entropy) - filled);

                if (got) {
                        _mix_pool_bytes(entropy, sizeof(*entropy) * got);
                        filled += got;
                        continue;
                }

                /* Neither source delivered: this long's worth of bits is not credited. */
                arch_bits -= sizeof(*entropy) * 8;
                ++filled;
        }

        _mix_pool_bytes(init_utsname(), sizeof(*(init_utsname())));
        _mix_pool_bytes(command_line, strlen(command_line));

        /* Reseed if already seeded by earlier phases. */
        if (crng_ready())
                crng_reseed(NULL);
        else if (trust_cpu)
                _credit_init_bits(arch_bits);
}

/*
 * Second stage of RNG initialization, called a little after
 * random_init_early(). Timestamp counters are accessible by now, while
 * interrupts are still not enabled.
 */
void __init random_init(void)
{
        unsigned long entropy = random_get_entropy();
        ktime_t wall_clock = ktime_get_real();

        _mix_pool_bytes(&wall_clock, sizeof(wall_clock));
        _mix_pool_bytes(&entropy, sizeof(entropy));
        add_latent_entropy();

        /*
         * The cpu or the bootloader may have initialized us before jump
         * labels or workqueues existed, in which case the static branch
         * could not be flipped at the time. Both are guaranteed to be
         * initialized here, so enable it now.
         */
        if (!static_branch_likely(&crng_is_ready) && crng_init >= CRNG_READY)
                crng_set_ready(NULL);

        /* Earlier phases may already have seeded us; if so, reseed. */
        if (crng_ready())
                crng_reseed(NULL);

        WARN_ON(register_pm_notifier(&pm_notifier));

        WARN(!entropy, "Missing cycle counter and fallback timer; RNG "
                       "entropy collection will consequently suffer.");
}

/*
 * Mix device- or boot-specific data into the input pool, preceded by a
 * random_get_entropy() sample.
 *
 * No entropy is credited for any of it. The point is only to keep largely
 * identical devices from starting out with similar pool states.
 */
void add_device_randomness(const void *buf, size_t len)
{
        unsigned long irqflags;
        unsigned long stamp = random_get_entropy();

        spin_lock_irqsave(&input_pool.lock, irqflags);
        _mix_pool_bytes(&stamp, sizeof(stamp));
        _mix_pool_bytes(buf, len);
        spin_unlock_irqrestore(&input_pool.lock, irqflags);
}
EXPORT_SYMBOL(add_device_randomness);

/*
 * Entry point for in-kernel drivers of true hardware RNGs. The buffer is
 * mixed into the pool and entropy bits are credited. Because such devices
 * can produce random bits endlessly, the function then sleeps for a
 * reseed interval when sleep_after is true.
 */
void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after)
{
        mix_pool_bytes(buf, len);
        credit_init_bits(entropy);

        if (!sleep_after || kthread_should_stop())
                return;

        /*
         * Writers are throttled to once per reseed interval. The only case
         * that skips the sleep is an uninitialized crng that is actually
         * being credited entropy.
         */
        if (crng_ready() || !entropy)
                schedule_timeout_interruptible(crng_reseed_interval());
}
EXPORT_SYMBOL_GPL(add_hwgenerator_randomness);

/*
 * Take in a random seed handed over by the bootloader. The seed is always
 * mixed into the pool; it is credited, at 8 bits per byte, only when
 * trust_bootloader (command line option 'random.trust_bootloader') is true.
 */
void __init add_bootloader_randomness(const void *buf, size_t len)
{
        mix_pool_bytes(buf, len);

        if (!trust_bootloader)
                return;

        credit_init_bits(len * 8);
}

#if IS_ENABLED(CONFIG_VMGENID)
/* Notifier chain that add_vmfork_randomness() calls for every new VM ID. */
static BLOCKING_NOTIFIER_HEAD(vmfork_chain);

/*
 * Handle a new unique VM ID, which is unique, not secret, so we
 * don't credit it, but we do immediately force a reseed after so
 * that it's used by the crng posthaste.
 *
 * @unique_vm_id: the ID bytes to mix in.
 * @len:          length of the ID in bytes.
 *
 * The reseed only happens if the crng is already ready. The vmfork
 * notifier chain is called in either case.
 */
void __cold add_vmfork_randomness(const void *unique_vm_id, size_t len)
{
        add_device_randomness(unique_vm_id, len);
        if (crng_ready()) {
                crng_reseed(NULL);
                pr_notice("crng reseeded due to virtual machine fork\n");
        }
        blocking_notifier_call_chain(&vmfork_chain, 0, NULL);
}
/* Exported only when the vmgenid driver is built as a module. */
#if IS_MODULE(CONFIG_VMGENID)
EXPORT_SYMBOL_GPL(add_vmfork_randomness);
#endif

/*
 * Add nb to the chain that add_vmfork_randomness() calls. Returns the
 * result of blocking_notifier_chain_register().
 */
int __cold register_random_vmfork_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&vmfork_chain, nb);
}
EXPORT_SYMBOL_GPL(register_random_vmfork_notifier);

/*
 * Remove nb from the chain that add_vmfork_randomness() calls. Returns the
 * result of blocking_notifier_chain_unregister().
 */
int __cold unregister_random_vmfork_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&vmfork_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_random_vmfork_notifier);
#endif

/* Per-cpu accumulator for interrupt-derived entropy (see irq_randomness). */
struct fast_pool {
        unsigned long pool[4]; /* Initialized to the [Half]SipHash constants; presumably the state passed to fast_mix() - confirm. */
        unsigned long last; /* NOTE(review): not used in the visible part of the file; presumably a timestamp - confirm. */
        unsigned int count; /* Reset to 0 by random_online_cpu(). */
        struct timer_list mix; /* Timer whose callback is mix_interrupt_randomness(). */
};

/* Forward declaration, needed by the timer initializer below. */
static void mix_interrupt_randomness(struct timer_list *work);

/*
 * One fast_pool per CPU. The pool words start out as the SipHash constants
 * on 64-bit builds and the HalfSipHash constants otherwise; FASTMIX_PERM
 * is set to the matching permutation for fast_mix() to use.
 */
static DEFINE_PER_CPU(struct fast_pool, irq_randomness) = {
#ifdef CONFIG_64BIT
#define FASTMIX_PERM SIPHASH_PERMUTATION
        .pool = { SIPHASH_CONST_0, SIPHASH_CONST_1, SIPHASH_CONST_2, SIPHASH_CONST_3 },
#else
#define FASTMIX_PERM HSIPHASH_PERMUTATION
        .pool = { HSIPHASH_CONST_0, HSIPHASH_CONST_1, HSIPHASH_CONST_2, HSIPHASH_CONST_3 },
#endif
        .mix = __TIMER_INITIALIZER(mix_interrupt_randomness, 0)
};

/*
 * This is [Half]SipHash-1-x, starting from an empty key. Because
 * the key is fixed, it assumes that its inputs are non-malicious,
 * and therefore this has no security on its own. s represents the
 * four-word SipHash state, while v represents a two-word input.
 *
 * @s:  state, updated in place.
 * @v1: first input word.
 * @v2: second input word.
 */
static void fast_mix(unsigned long s[4], unsigned long v1, unsigned long v2)
{
        /* Absorb v1: XOR into s[3], one permutation round, XOR into s[0]. */
        s[3] ^= v1;
        FASTMIX_PERM(s[0], s[1], s[2], s[3]);
        s[0] ^= v1;
        /* Absorb v2 the same way. */
        s[3] ^= v2;
        FASTMIX_PERM(s[0], s[1], s[2], s[3]);
        s[0] ^= v2;
}

#ifdef CONFIG_SMP
/*
 * This function is called when the CPU has just come online, with
 * entry CPUHP_AP_RANDOM_ONLINE, just after CPUHP_AP_WORKQUEUE_ONLINE.
 *
 * @cpu: the CPU whose fast pool count is reset.
 *
 * Always returns 0.
 */
int __cold random_online_cpu(unsigned int cpu)
{
        /*
         * During CPU shutdown and before CPU onlining, add_interrupt_
         * randomness() may schedule mix_interrupt_randomness(), and
         * set the MIX_INFLIGHT flag. However, because the worker can
         * be scheduled on a different CPU during this period, that
         * flag will never be cleared. For that reason, we zero out
         * the flag here, which runs just after workqueues are onlined
         * for the CPU again. This also has the effect of setting the
         * irq randomness count to zero so that new accumulated irqs
         * are fresh.
         *
         * NOTE(review): in this file the deferred mixer is a timer
         * (fast_pool.mix), so "worker" above presumably refers to that
         * timer callback -- confirm the wording is still accurate.
         */
        per_cpu_ptr(&irq_randomness, cpu)->count = 0;
        return 0;
}
#endif

/*
 * Timer callback that drains one fast pool into the input pool.
 *
 * @work: the timer_list embedded in the struct fast_pool to drain.
 *
 * Part of the fast pool state is copied out with irqs disabled, the
 * pool's count (including the MIX_INFLIGHT bit that
 * add_interrupt_randomness() sets) is reset, and the copy is then
 * mixed and credited with irqs enabled again.
 */
static void mix_interrupt_randomness(struct timer_list *work)
{
        struct fast_pool *fast_pool = container_of(work, struct fast_pool, mix);
        /*
         * The size of the copied stack pool is explicitly 2 longs so that we
         * only ever ingest half of the siphash output each time, retaining
         * the other half as the next "key" that carries over. The entropy is
         * supposed to be sufficiently dispersed between bits so on average
         * we don't wind up "losing" some.
         */
        unsigned long pool[2];
        unsigned int count;

        /* Check to see if we're running on the wrong CPU due to hotplug. */
        local_irq_disable();
        if (fast_pool != this_cpu_ptr(&irq_randomness)) {
                local_irq_enable();
                return;
        }

        /*
         * Copy the pool to the stack so that the mixer always has a
         * consistent view, before we reenable irqs again.
         */
        memcpy(pool, fast_pool->pool, sizeof(pool));
        count = fast_pool->count;
        fast_pool->count = 0;
        fast_pool->last = jiffies;
        local_irq_enable();

        mix_pool_bytes(pool, sizeof(pool));
        /*
         * Credit one bit per 64 counted events (the low 16 bits of count,
         * which excludes MIX_INFLIGHT), clamped to at least 1 and at most
         * the number of bits just mixed.
         */
        credit_init_bits(clamp_t(unsigned int, (count & U16_MAX) / 64, 1, sizeof(pool) * 8));

        /* Don't leave a copy of the pool state on the stack. */
        memzero_explicit(pool, sizeof(pool));
}

/*
 * Fold one interrupt's timing into this CPU's fast pool and, once
 * enough events have accumulated or enough time has passed, arm the
 * pool's timer so that mix_interrupt_randomness() drains it.
 *
 * @irq: the interrupt number, mixed in alongside the cycle counter and
 *       the interrupted (or calling) instruction pointer.
 */
void add_interrupt_randomness(int irq)
{
        enum { MIX_INFLIGHT = 1U << 31 };
        unsigned long cycles = random_get_entropy();
        struct fast_pool *fp = this_cpu_ptr(&irq_randomness);
        struct pt_regs *regs = get_irq_regs();
        unsigned long where = regs ? instruction_pointer(regs) : _RET_IP_;
        unsigned int seen;

        fast_mix(fp->pool, cycles, where ^ swab(irq));
        fp->count++;
        seen = fp->count;

        /* A drain is already queued; nothing more to do. */
        if (seen & MIX_INFLIGHT)
                return;

        /* Drain only after 1024 events, or once a second has elapsed. */
        if (!(seen >= 1024 || time_is_before_jiffies(fp->last + HZ)))
                return;

        fp->count |= MIX_INFLIGHT;
        if (timer_pending(&fp->mix))
                return;

        fp->mix.expires = jiffies;
        add_timer_on(&fp->mix, raw_smp_processor_id());
}
EXPORT_SYMBOL_GPL(add_interrupt_randomness);

/*
 * There is one of these per entropy source. It holds the history that
 * add_timer_randomness() uses to estimate how much entropy an event adds.
 */
struct timer_rand_state {
        /* jiffies value recorded at the previous event. */
        unsigned long last_time;
        /* Previous first-order and second-order time deltas. */
        long last_delta, last_delta2;
};

/*
 * This function adds entropy to the entropy "pool" by using timing
 * delays. It uses the timer_rand_state structure to make an estimate
 * of how many bits of entropy this call has added to the pool. The
 * value "num" is also added to the pool; it should somehow describe
 * the type of event that just happened.
 *
 * @state: per-source delta history, read and updated here.
 * @num:   event descriptor mixed in together with the cycle counter.
 */
static void add_timer_randomness(struct timer_rand_state *state, unsigned int num)
{
        unsigned long entropy = random_get_entropy(), now = jiffies, flags;
        long delta, delta2, delta3;
        unsigned int bits;

        /*
         * If we're in a hard IRQ, add_interrupt_randomness() will be called
         * sometime after, so mix into the fast pool.
         */
        if (in_hardirq()) {
                fast_mix(this_cpu_ptr(&irq_randomness)->pool, entropy, num);
        } else {
                spin_lock_irqsave(&input_pool.lock, flags);
                _mix_pool_bytes(&entropy, sizeof(entropy));
                _mix_pool_bytes(&num, sizeof(num));
                spin_unlock_irqrestore(&input_pool.lock, flags);
        }

        /*
         * Once the crng is ready the sample has still been mixed in above,
         * but the estimation and crediting below are skipped.
         */
        if (crng_ready())
                return;

        /*
         * Calculate number of bits of randomness we probably added.
         * We take into account the first, second and third-order deltas
         * in order to make our estimate.
         */
        delta = now - READ_ONCE(state->last_time);
        WRITE_ONCE(state->last_time, now);

        delta2 = delta - READ_ONCE(state->last_delta);
        WRITE_ONCE(state->last_delta, delta);

        delta3 = delta2 - READ_ONCE(state->last_delta2);
        WRITE_ONCE(state->last_delta2, delta2);

        /* Take absolute values, then keep the smallest of the three in delta. */
        if (delta < 0)
                delta = -delta;
        if (delta2 < 0)
                delta2 = -delta2;
        if (delta3 < 0)
                delta3 = -delta3;
        if (delta > delta2)
                delta = delta2;
        if (delta > delta3)
                delta = delta3;

        /*
         * delta is now minimum absolute delta. Round down by 1 bit
         * on general principles, and limit entropy estimate to 11 bits.
         */
        bits = min(fls(delta >> 1), 11);

        /*
         * As mentioned above, if we're in a hard IRQ, add_interrupt_randomness()
         * will run after this, which uses a different crediting scheme of 1 bit
         * per every 64 interrupts. In order to let that function do accounting
         * close to the one in this function, we credit a full 64/64 bit per bit,
         * and then subtract one to account for the extra one added.
         */
        if (in_hardirq())
                this_cpu_ptr(&irq_randomness)->count += max(1u, bits * 64) - 1;
        else
                _credit_init_bits(bits);
}

/*
 * Feed an input-layer event into the timer-based entropy estimator.
 *
 * @type:  event type.
 * @code:  event code.
 * @value: event value.
 *
 * An event whose @value equals the previous event's value is treated
 * as autorepeat and dropped. Otherwise type, code and value are folded
 * into one word that is passed to add_timer_randomness().
 */
void add_input_randomness(unsigned int type, unsigned int code, unsigned int value)
{
        /*
         * Keep the full width of @value. Storing it in a narrower type
         * would compare @value against a truncated copy, so repeats of
         * values above 255 would never be filtered and unrelated values
         * sharing the same low byte could be dropped by mistake.
         */
        static unsigned int last_value;
        static struct timer_rand_state input_timer_state = { INITIAL_JIFFIES };

        /* Ignore autorepeat and the like. */
        if (value == last_value)
                return;

        last_value = value;
        add_timer_randomness(&input_timer_state,
                             (type << 4) ^ code ^ (code >> 4) ^ value);
}
EXPORT_SYMBOL_GPL(add_input_randomness);

#ifdef CONFIG_BLOCK
/*
 * Feed a disk event into the timer-based entropy estimator. Does
 * nothing when @disk is NULL or has no timer_rand_state attached.
 */
void add_disk_randomness(struct gendisk *disk)
{
        struct timer_rand_state *state;

        if (!disk)
                return;

        state = disk->random;
        if (!state)
                return;

        /* First major is 1, so we get >= 0x200 here. */
        add_timer_randomness(state, 0x100 + disk_devt(disk));
}
EXPORT_SYMBOL_GPL(add_disk_randomness);

/*
 * Allocate and attach the timer_rand_state that add_disk_randomness()
 * uses for @disk.
 */
void __cold rand_initialize_disk(struct gendisk *disk)
{
        struct timer_rand_state *trs = kzalloc(sizeof(*trs), GFP_KERNEL);

        /*
         * A failed allocation is tolerated: disk->random is left alone
         * and this disk simply is not used as an entropy source.
         */
        if (!trs)
                return;

        trs->last_time = INITIAL_JIFFIES;
        disk->random = trs;
}
#endif

/*
 * State shared between try_to_generate_entropy() and its timer callback
 * entropy_timer().
 */
struct entropy_timer_state {
        /* Latest cycle-counter sample taken by the entropy loop. */
        unsigned long entropy;
        /* Timer whose callback is entropy_timer(). */
        struct timer_list timer;
        /* Number of timer callbacks that have run so far. */
        atomic_t samples;
        /* One bit is credited every this many callbacks. */
        unsigned int samples_per_bit;
};

/*
 * Timer callback for try_to_generate_entropy().
 *
 * Every firing is expected to coincide with an unpredictable jump in
 * the cycle counter. That holds even when the timer runs on a different
 * CPU, because the callback touches the stack of the CPU that is
 * generating entropy.
 *
 * The timer is deliberately not re-armed from here. Being scheduled
 * away is welcome, since it only makes the load more complex, but the
 * timer must not keep ticking when the entropy loop is not running.
 * Re-arming is therefore done exclusively by the entropy loop.
 */
static void __cold entropy_timer(struct timer_list *timer)
{
        struct entropy_timer_state *ets = container_of(timer, struct entropy_timer_state, timer);
        unsigned long sample = random_get_entropy();
        unsigned int nsamples;

        mix_pool_bytes(&sample, sizeof(sample));

        /* Credit a single bit once every samples_per_bit callbacks. */
        nsamples = atomic_inc_return(&ets->samples);
        if (nsamples % ets->samples_per_bit)
                return;

        credit_init_bits(1);
}

/*
 * If we have an actual cycle counter, see if we can generate enough entropy
 * with timing noise.
 *
 * The loop runs until the crng is ready or a signal is pending for the
 * current task. It returns early, without looping, if the cycle counter
 * does not change often enough between consecutive reads.
 */
static void __cold try_to_generate_entropy(void)
{
        enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = HZ / 15 };
        /* Over-allocate so the state can be aligned to SMP_CACHE_BYTES. */
        u8 stack_bytes[sizeof(struct entropy_timer_state) + SMP_CACHE_BYTES - 1];
        struct entropy_timer_state *stack = PTR_ALIGN((void *)stack_bytes, SMP_CACHE_BYTES);
        unsigned int i, num_different = 0;
        unsigned long last = random_get_entropy();
        int cpu = -1;

        /* Trial run: count how often consecutive counter reads differ. */
        for (i = 0; i < NUM_TRIAL_SAMPLES - 1; ++i) {
                stack->entropy = random_get_entropy();
                if (stack->entropy != last)
                        ++num_different;
                last = stack->entropy;
        }
        /* The fewer distinct reads were seen, the more samples one bit costs. */
        stack->samples_per_bit = DIV_ROUND_UP(NUM_TRIAL_SAMPLES, num_different + 1);
        if (stack->samples_per_bit > MAX_SAMPLES_PER_BIT)
                return;

        atomic_set(&stack->samples, 0);
        timer_setup_on_stack(&stack->timer, entropy_timer, 0);
        while (!crng_ready() && !signal_pending(current)) {
                /*
                 * Check !timer_pending() and then ensure that any previous callback has finished
                 * executing by checking try_to_del_timer_sync(), before queueing the next one.
                 */
                if (!timer_pending(&stack->timer) && try_to_del_timer_sync(&stack->timer) >= 0) {
                        struct cpumask timer_cpus;
                        unsigned int num_cpus;

                        /*
                         * Preemption must be disabled here, both to read the current CPU number
                         * and to avoid scheduling a timer on a dead CPU.
                         */
                        preempt_disable();

                        /* Only schedule callbacks on timer CPUs that are online. */
                        cpumask_and(&timer_cpus, housekeeping_cpumask(HK_TYPE_TIMER), cpu_online_mask);
                        num_cpus = cpumask_weight(&timer_cpus);
                        /* In very bizarre case of misconfiguration, fallback to all online. */
                        if (unlikely(num_cpus == 0)) {
                                timer_cpus = *cpu_online_mask;
                                num_cpus = cpumask_weight(&timer_cpus);
                        }

                        /* Basic CPU round-robin, which avoids the current CPU. */
                        do {
                                cpu = cpumask_next(cpu, &timer_cpus);
                                if (cpu >= nr_cpu_ids)
                                        cpu = cpumask_first(&timer_cpus);
                        } while (cpu == smp_processor_id() && num_cpus > 1);

                        /* Expiring the timer at `jiffies` means it's the next tick. */
                        stack->timer.expires = jiffies;

                        add_timer_on(&stack->timer, cpu);

                        preempt_enable();
                }
                /* Mix the latest sample, yield, then take a fresh sample. */
                mix_pool_bytes(&stack->entropy, sizeof(stack->entropy));
                schedule();
                stack->entropy = random_get_entropy();
        }
        /* Mix in the final sample taken before the loop exited. */
        mix_pool_bytes(&stack->entropy, sizeof(stack->entropy));

        /* The timer lives on this stack frame, so it must be gone before returning. */
        del_timer_sync(&stack->timer);
        destroy_timer_on_stack(&stack->timer);
}


/**********************************************************************
 *
 * Userspace reader/writer interfaces.
 *
 * getrandom(2) is the primary modern interface into the RNG and should
 * be used in preference to anything else.
 *
 * Reading from /dev/random has the same functionality as calling
 * getrandom(2) with flags=0. In earlier versions, however, it had
 * vastly different semantics and should therefore be avoided, to
 * prevent backwards compatibility issues.
 *
 * Reading from /dev/urandom has the same functionality as calling
 * getrandom(2) with flags=GRND_INSECURE. Because it does not block
 * waiting for the RNG to be ready, it should not be used.
 *
 * Writing to either /dev/random or /dev/urandom adds entropy to
 * the input pool but does not credit it.
 *
 * Polling on /dev/random indicates when the RNG is initialized, on
 * the read side, and when it wants new entropy, on the write side.
 *
 * Both /dev/random and /dev/urandom have the same set of ioctls for
 * adding entropy, getting the entropy count, zeroing the count, and
 * reseeding the crng.
 *
 **********************************************************************/

/*
 * getrandom(2): fill the user buffer @ubuf with @len random bytes.
 *
 * Returns -EINVAL for unknown flag bits or for GRND_INSECURE combined
 * with GRND_RANDOM, -EAGAIN when GRND_NONBLOCK is set and the crng is
 * not ready (unless GRND_INSECURE was given), an error from
 * wait_for_random_bytes() or import_ubuf(), and otherwise whatever
 * get_random_bytes_user() returns.
 */
SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags)
{
        const unsigned int known = GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE;
        const unsigned int conflicting = GRND_INSECURE | GRND_RANDOM;
        struct iov_iter iter;
        int err;

        /*
         * Reject unknown flags, and reject a request for insecure and
         * blocking randomness at the same time, which makes no sense.
         */
        if ((flags & ~known) || (flags & conflicting) == conflicting)
                return -EINVAL;

        if (!crng_ready() && !(flags & GRND_INSECURE)) {
                if (flags & GRND_NONBLOCK)
                        return -EAGAIN;

                err = wait_for_random_bytes();
                if (unlikely(err))
                        return err;
        }

        err = import_ubuf(ITER_DEST, ubuf, len, &iter);
        if (unlikely(err))
                return err;

        return get_random_bytes_user(&iter);
}

/*
 * poll handler: reports readable once the crng is ready and writable
 * until then. Waiters are queued on crng_init_wait.
 */
static __poll_t random_poll(struct file *file, poll_table *wait)
{
        __poll_t mask;

        poll_wait(file, &crng_init_wait, wait);

        if (crng_ready())
                mask = EPOLLIN | EPOLLRDNORM;
        else
                mask = EPOLLOUT | EPOLLWRNORM;

        return mask;
}

/*
 * Mix the data described by @iter into the input pool, one
 * BLAKE2S_BLOCK_SIZE chunk at a time. Nothing is credited here.
 *
 * Returns the number of bytes consumed, 0 if @iter was empty to begin
 * with, or -EFAULT if @iter was non-empty but nothing could be copied.
 */
static ssize_t write_pool_user(struct iov_iter *iter)
{
        u8 block[BLAKE2S_BLOCK_SIZE];
        ssize_t ret = 0;
        size_t copied;

        if (unlikely(!iov_iter_count(iter)))
                return 0;

        for (;;) {
                copied = copy_from_iter(block, sizeof(block), iter);
                ret += copied;
                mix_pool_bytes(block, copied);
                /* Stop when the iterator is drained or a copy came up short. */
                if (!iov_iter_count(iter) || copied != sizeof(block))
                        break;

                /* Once per page of input, check for signals and offer to reschedule. */
                BUILD_BUG_ON(PAGE_SIZE % sizeof(block) != 0);
                if (ret % PAGE_SIZE == 0) {
                        if (signal_pending(current))
                                break;
                        cond_resched();
                }
        }

        /* Don't leave user-supplied seed material on the stack. */
        memzero_explicit(block, sizeof(block));
        return ret ? ret : -EFAULT;
}

/*
 * write_iter handler used by both random_fops and urandom_fops: the
 * written data is mixed in via write_pool_user(). @kiocb is unused.
 */
static ssize_t random_write_iter(struct kiocb *kiocb, struct iov_iter *iter)
{
        return write_pool_user(iter);
}

/*
 * read_iter handler for urandom_fops. Never waits for the crng to become
 * ready; if it is not, a notice may be logged before the bytes are
 * produced anyway. @kiocb is unused.
 */
static ssize_t urandom_read_iter(struct kiocb *kiocb, struct iov_iter *iter)
{
        /* Remaining number of warnings to print while ratelimiting is enabled. */
        static int maxwarn = 10;

        /*
         * Opportunistically attempt to initialize the RNG on platforms that
         * have fast cycle counters, but don't (for now) require it to succeed.
         */
        if (!crng_ready())
                try_to_generate_entropy();

        if (!crng_ready()) {
                /*
                 * With ratelimiting enabled and the warning budget used up,
                 * only count the event as missed. Otherwise print if
                 * ratelimiting is disabled or the ratelimiter allows it.
                 */
                if (!ratelimit_disable && maxwarn <= 0)
                        ++urandom_warning.missed;
                else if (ratelimit_disable || __ratelimit(&urandom_warning)) {
                        --maxwarn;
                        pr_notice("%s: uninitialized urandom read (%zu bytes read)\n",
                                  current->comm, iov_iter_count(iter));
                }
        }

        return get_random_bytes_user(iter);
}

static ssize_t random_read_iter(struct kiocb *kiocb, struct iov_iter *iter)
{
        int ret;

        if (!crng_ready() &&
            ((kiocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) ||
             (kiocb->ki_filp->f_flags & O_NONBLOCK)))
                return -EAGAIN;

        ret = wait_for_random_bytes();
        if (ret != 0)
                return ret;
        return get_random_bytes_user(iter);
}

/*
 * ioctl handler shared by random_fops and urandom_fops.
 *
 * @f:   unused.
 * @cmd: one of the RND* requests handled below.
 * @arg: user pointer. It points to an int for RNDGETENTCNT and
 *       RNDADDTOENTCNT. For RNDADDENTROPY it points to two ints (the
 *       entropy count and the buffer length) followed by the buffer.
 *
 * Every request except RNDGETENTCNT requires CAP_SYS_ADMIN.
 *
 * Returns 0 on success, -EPERM if the capability is missing, -EFAULT
 * on a bad user pointer or a short copy, -ENODATA if a reseed is
 * requested before the crng is ready, and -EINVAL for a negative count
 * or length or for an unknown request.
 */
static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
        int __user *p = (int __user *)arg;
        int ent_count;

        switch (cmd) {
        case RNDGETENTCNT:
                /* Inherently racy, no point locking. */
                if (put_user(input_pool.init_bits, p))
                        return -EFAULT;
                return 0;
        case RNDADDTOENTCNT:
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (get_user(ent_count, p))
                        return -EFAULT;
                if (ent_count < 0)
                        return -EINVAL;
                credit_init_bits(ent_count);
                return 0;
        case RNDADDENTROPY: {
                struct iov_iter iter;
                ssize_t ret;
                int len;

                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (get_user(ent_count, p++))
                        return -EFAULT;
                if (ent_count < 0)
                        return -EINVAL;
                if (get_user(len, p++))
                        return -EFAULT;
                /*
                 * Reject a negative length up front, just like a negative
                 * entropy count. Otherwise it would be converted to a very
                 * large unsigned length when handed to import_ubuf(), and
                 * user memory would be read and mixed only for the request
                 * to fail the ret != len check afterwards.
                 */
                if (len < 0)
                        return -EINVAL;
                ret = import_ubuf(ITER_SOURCE, p, len, &iter);
                if (unlikely(ret))
                        return ret;
                ret = write_pool_user(&iter);
                if (unlikely(ret < 0))
                        return ret;
                /* Since we're crediting, enforce that it was all written into the pool. */
                if (unlikely(ret != len))
                        return -EFAULT;
                credit_init_bits(ent_count);
                return 0;
        }
        case RNDZAPENTCNT:
        case RNDCLEARPOOL:
                /* No longer has any effect. */
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                return 0;
        case RNDRESEEDCRNG:
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (!crng_ready())
                        return -ENODATA;
                crng_reseed(NULL);
                return 0;
        default:
                return -EINVAL;
        }
}

/*
 * fasync handler for both device nodes; forwards to fasync_helper()
 * with the file-level fasync list and returns its result.
 */
static int random_fasync(int fd, struct file *filp, int on)
{
        return fasync_helper(fd, filp, on, &fasync);
}

/*
 * File operations using random_read_iter(), which waits for the crng
 * to be ready, and providing a poll handler.
 */
const struct file_operations random_fops = {
        .read_iter = random_read_iter,
        .write_iter = random_write_iter,
        .poll = random_poll,
        .unlocked_ioctl = random_ioctl,
        .compat_ioctl = compat_ptr_ioctl,
        .fasync = random_fasync,
        .llseek = noop_llseek,
        .splice_read = copy_splice_read,
        .splice_write = iter_file_splice_write,
};

/*
 * File operations using urandom_read_iter(), which never waits for
 * the crng. Same as random_fops otherwise, except that no poll handler
 * is provided.
 */
const struct file_operations urandom_fops = {
        .read_iter = urandom_read_iter,
        .write_iter = random_write_iter,
        .unlocked_ioctl = random_ioctl,
        .compat_ioctl = compat_ptr_ioctl,
        .fasync = random_fasync,
        .llseek = noop_llseek,
        .splice_read = copy_splice_read,
        .splice_write = iter_file_splice_write,
};


/********************************************************************
 *
 * Sysctl interface.
 *
 * These are partly unused legacy knobs with dummy values to not break
 * userspace and partly still useful things. They are usually accessible
 * in /proc/sys/kernel/random/ and are as follows:
 *
 * - boot_id - a UUID representing the current boot.
 *
 * - uuid - a random UUID, different each time the file is read.
 *
 * - poolsize - the number of bits of entropy that the input pool can
 *   hold, tied to the POOL_BITS constant.
 *
 * - entropy_avail - the number of bits of entropy currently in the
 *   input pool. Always <= poolsize.
 *
 * - write_wakeup_threshold - the amount of entropy in the input pool
 *   below which write polls to /dev/random will unblock, requesting
 *   more entropy, tied to the POOL_READY_BITS constant. It is writable
 *   to avoid breaking old userspaces, but writing to it does not
 *   change any behavior of the RNG.
 *
 * - urandom_min_reseed_secs - fixed to the value CRNG_RESEED_INTERVAL.
 *   It is writable to avoid breaking old userspaces, but writing
 *   to it does not change any behavior of the RNG.
 *
 ********************************************************************/

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/* Backing storage for the sysctl entries in random_table. */
/* urandom_min_reseed_secs: CRNG_RESEED_INTERVAL expressed in seconds. */
static int sysctl_random_min_urandom_seed = CRNG_RESEED_INTERVAL / HZ;
/* write_wakeup_threshold: reported as POOL_READY_BITS. */
static int sysctl_random_write_wakeup_bits = POOL_READY_BITS;
/* poolsize: reported as POOL_BITS. */
static int sysctl_poolsize = POOL_BITS;
/* boot_id: filled in lazily by proc_do_uuid() on first read. */
static u8 sysctl_bootid[UUID_SIZE];

/*
 * sysctl handler serving both the boot_id and the uuid entries.
 *
 * When table->data is NULL, a brand new UUID is generated for every
 * read. When it is set, it points at persistent storage that is
 * filled with a random UUID the first time it is read and returned
 * unchanged afterwards.
 *
 * Writes are refused with -EPERM. Reads return the result of
 * proc_dostring() on the formatted UUID string.
 */
static int proc_do_uuid(struct ctl_table *table, int write, void *buf,
                        size_t *lenp, loff_t *ppos)
{
        char text[UUID_STRING_LEN + 1];
        u8 fresh[UUID_SIZE];
        u8 *id = table->data;
        struct ctl_table shim = {
                .data = text,
                .maxlen = UUID_STRING_LEN
        };

        if (write)
                return -EPERM;

        if (id) {
                static DEFINE_SPINLOCK(bootid_spinlock);

                /* Byte 8 still being zero marks the stored UUID as not yet generated. */
                spin_lock(&bootid_spinlock);
                if (!id[8])
                        generate_random_uuid(id);
                spin_unlock(&bootid_spinlock);
        } else {
                id = fresh;
                generate_random_uuid(id);
        }

        snprintf(text, sizeof(text), "%pU", id);
        return proc_dostring(&shim, 0, buf, lenp, ppos);
}

/*
 * Like proc_dointvec(), except that a write is accepted (returns 0)
 * without being applied, so the underlying value never changes.
 */
static int proc_do_rointvec(struct ctl_table *table, int write, void *buf,
                            size_t *lenp, loff_t *ppos)
{
        if (write)
                return 0;

        return proc_dointvec(table, 0, buf, lenp, ppos);
}

/*
 * Entries registered under "kernel/random" by random_sysctls_init().
 * The two 0644 entries use proc_do_rointvec(), so writes to them are
 * accepted but ignored. The "uuid" entry has no .data, which makes
 * proc_do_uuid() generate a new UUID on every read.
 */
static struct ctl_table random_table[] = {
        {
                .procname        = "poolsize",
                .data                = &sysctl_poolsize,
                .maxlen                = sizeof(int),
                .mode                = 0444,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "entropy_avail",
                .data                = &input_pool.init_bits,
                .maxlen                = sizeof(int),
                .mode                = 0444,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "write_wakeup_threshold",
                .data                = &sysctl_random_write_wakeup_bits,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_do_rointvec,
        },
        {
                .procname        = "urandom_min_reseed_secs",
                .data                = &sysctl_random_min_urandom_seed,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_do_rointvec,
        },
        {
                .procname        = "boot_id",
                .data                = &sysctl_bootid,
                .mode                = 0444,
                .proc_handler        = proc_do_uuid,
        },
        {
                .procname        = "uuid",
                .mode                = 0444,
                .proc_handler        = proc_do_uuid,
        },
};

/*
 * random_init() is called before sysctl_init(),
 * so we cannot call register_sysctl_init() in random_init().
 * Instead, random_table is registered under "kernel/random" from a
 * device_initcall. Always returns 0.
 */
static int __init random_sysctls_init(void)
{
        register_sysctl_init("kernel/random", random_table);
        return 0;
}
device_initcall(random_sysctls_init);
#endif



























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_DEBUGREG_H
#define _ASM_X86_DEBUGREG_H

#include <linux/bug.h>
#include <linux/percpu.h>
#include <uapi/asm/debugreg.h>

#include <asm/cpufeature.h>
#include <asm/msr.h>

DECLARE_PER_CPU(unsigned long, cpu_dr7);

#ifndef CONFIG_PARAVIRT_XXL
/*
 * These special macros can be used to get or set a debugging register.
 * They are only defined here when CONFIG_PARAVIRT_XXL is not set, in
 * which case they map straight onto the native accessors below.
 * Note the argument order: the value comes first, the register number
 * second.
 */
#define get_debugreg(var, register)                                \
        (var) = native_get_debugreg(register)
#define set_debugreg(value, register)                                \
        native_set_debugreg(register, value)
#endif

/*
 * Read debug register @regno with a mov from %dbN and return its value.
 * Only 0-3, 6 and 7 are handled; any other number hits BUG().
 */
static __always_inline unsigned long native_get_debugreg(int regno)
{
        unsigned long val = 0;        /* Damn you, gcc! */

        switch (regno) {
        case 0:
                asm("mov %%db0, %0" :"=r" (val));
                break;
        case 1:
                asm("mov %%db1, %0" :"=r" (val));
                break;
        case 2:
                asm("mov %%db2, %0" :"=r" (val));
                break;
        case 3:
                asm("mov %%db3, %0" :"=r" (val));
                break;
        case 6:
                asm("mov %%db6, %0" :"=r" (val));
                break;
        case 7:
                /*
                 * Apply __FORCE_ORDER to DR7 reads to forbid re-ordering them
                 * with other code.
                 *
                 * This is needed because a DR7 access can cause a #VC exception
                 * when running under SEV-ES. Taking a #VC exception is not a
                 * safe thing to do just anywhere in the entry code and
                 * re-ordering might place the access into an unsafe location.
                 *
                 * This happened in the NMI handler, where the DR7 read was
                 * re-ordered to happen before the call to sev_es_ist_enter(),
                 * causing stack recursion.
                 */
                asm volatile("mov %%db7, %0" : "=r" (val) : __FORCE_ORDER);
                break;
        default:
                BUG();
        }
        return val;
}

/*
 * Write @value to debug register @regno with a mov to %dbN.
 * Only 0-3, 6 and 7 are handled; any other number hits BUG().
 */
static __always_inline void native_set_debugreg(int regno, unsigned long value)
{
        switch (regno) {
        case 0:
                asm("mov %0, %%db0"        ::"r" (value));
                break;
        case 1:
                asm("mov %0, %%db1"        ::"r" (value));
                break;
        case 2:
                asm("mov %0, %%db2"        ::"r" (value));
                break;
        case 3:
                asm("mov %0, %%db3"        ::"r" (value));
                break;
        case 6:
                asm("mov %0, %%db6"        ::"r" (value));
                break;
        case 7:
                /*
                 * Apply __FORCE_ORDER to DR7 writes to forbid re-ordering them
                 * with other code.
                 *
                 * While it didn't happen with a DR7 write (see the DR7 read
                 * comment in native_get_debugreg(), which explains where it
                 * happened), add the __FORCE_ORDER here too to avoid similar
                 * problems in the future.
                 */
                asm volatile("mov %0, %%db7"        ::"r" (value), __FORCE_ORDER);
                break;
        default:
                BUG();
        }
}

/*
 * Turn off all hardware breakpoints: DR7 (control) is cleared first,
 * then the four address registers DR0-DR3.
 */
static inline void hw_breakpoint_disable(void)
{
        /* Zero the control register for HW Breakpoint */
        set_debugreg(0UL, 7);

        /* Zero-out the individual HW breakpoint address registers */
        set_debugreg(0UL, 0);
        set_debugreg(0UL, 1);
        set_debugreg(0UL, 2);
        set_debugreg(0UL, 3);
}

/*
 * Returns true if any DR_GLOBAL_ENABLE_MASK bit is set in this CPU's
 * cpu_dr7 per-CPU variable. The hardware register itself is not read.
 */
static __always_inline bool hw_breakpoint_active(void)
{
        return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
}

extern void hw_breakpoint_restore(void);

/*
 * Disable hardware breakpoints on this CPU ahead of a critical section.
 *
 * Returns the DR7 value that was in effect, with bit 0x400 masked off,
 * so that it can later be handed to local_db_restore(). Returns 0 when
 * nothing was enabled.
 */
static __always_inline unsigned long local_db_save(void)
{
        unsigned long dr7;

        /*
         * Under a hypervisor with no breakpoint active, skip touching DR7
         * at all. NOTE(review): presumably because DR7 accesses are costly
         * when virtualized -- confirm.
         */
        if (static_cpu_has(X86_FEATURE_HYPERVISOR) && !hw_breakpoint_active())
                return 0;

        get_debugreg(dr7, 7);
        dr7 &= ~0x400; /* architecturally set bit */
        /* Only write DR7 if something was actually enabled. */
        if (dr7)
                set_debugreg(0, 7);
        /*
         * Ensure the compiler doesn't lower the above statements into
         * the critical section; disabling breakpoints late would not
         * be good.
         */
        barrier();

        return dr7;
}

/*
 * Counterpart of local_db_save(): write @dr7 back to DR7 after the
 * critical section. A zero @dr7 means nothing was disabled, so DR7 is
 * left untouched.
 */
static __always_inline void local_db_restore(unsigned long dr7)
{
        /*
         * Ensure the compiler doesn't raise this statement into
         * the critical section; enabling breakpoints early would
         * not be good.
         */
        barrier();
        if (dr7)
                set_debugreg(dr7, 7);
}

/*
 * With CONFIG_CPU_SUP_AMD the address-mask accessors are implemented
 * elsewhere; without it they are stubs: the setter does nothing and
 * the getter returns 0.
 */
#ifdef CONFIG_CPU_SUP_AMD
extern void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr);
extern unsigned long amd_get_dr_addr_mask(unsigned int dr);
#else
static inline void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) { }
static inline unsigned long amd_get_dr_addr_mask(unsigned int dr)
{
        return 0;
}
#endif

/*
 * Read MSR_IA32_DEBUGCTLMSR and return its value.
 *
 * When CONFIG_X86_DEBUGCTLMSR is not set, the read is skipped and 0 is
 * returned if boot_cpu_data.x86 < 6. NOTE(review): presumably such CPUs
 * lack the MSR -- confirm.
 */
static inline unsigned long get_debugctlmsr(void)
{
        unsigned long debugctlmsr = 0;

#ifndef CONFIG_X86_DEBUGCTLMSR
        if (boot_cpu_data.x86 < 6)
                return 0;
#endif
        rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);

        return debugctlmsr;
}

/*
 * Write @debugctlmsr to MSR_IA32_DEBUGCTLMSR.
 *
 * When CONFIG_X86_DEBUGCTLMSR is not set, the write is skipped if
 * boot_cpu_data.x86 < 6, mirroring get_debugctlmsr().
 */
static inline void update_debugctlmsr(unsigned long debugctlmsr)
{
#ifndef CONFIG_X86_DEBUGCTLMSR
        if (boot_cpu_data.x86 < 6)
                return;
#endif
        wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
}

#endif /* _ASM_X86_DEBUGREG_H */
























































































































































































































































































































































































































































































































































































































































    2 




























































































































    2 







    2 








    2 



















    2 





    2 




    2 









































































































































    1 















    2 




















    1 






    1 













    1 





    1 









    2 


















    2 

















    2 




















    2 























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 */

#include <linux/types.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/udp.h>
#include <linux/tcp.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>

#include <linux/dccp.h>
#include <linux/sctp.h>
#include <net/sctp/checksum.h>

#include <linux/netfilter.h>
#include <net/netfilter/nf_nat.h>

#include <linux/ipv6.h>
#include <linux/netfilter_ipv6.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include <net/ip6_route.h>
#include <net/xfrm.h>
#include <net/ipv6.h>

#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

/* Forward declaration: the per-protocol manip helpers below call this, but
 * its definition (a dispatch on t->src.l3num) only appears further down in
 * this file.
 */
static void nf_csum_update(struct sk_buff *skb,
                           unsigned int iphdroff, __sum16 *check,
                           const struct nf_conntrack_tuple *t,
                           enum nf_nat_manip_type maniptype);

/* Rewrite the source or destination port of a UDP-style header.
 *
 * @hdr must already be writable.  When @do_csum is set the checksum is
 * adjusted for the address change (via nf_csum_update) and for the port
 * change; a resulting checksum of zero is stored as CSUM_MANGLED_0.
 */
static void
__udp_manip_pkt(struct sk_buff *skb,
                unsigned int iphdroff, struct udphdr *hdr,
                const struct nf_conntrack_tuple *tuple,
                enum nf_nat_manip_type maniptype, bool do_csum)
{
        const bool is_src = (maniptype == NF_NAT_MANIP_SRC);
        __be16 *field = is_src ? &hdr->source : &hdr->dest;
        __be16 port = is_src ? tuple->src.u.udp.port : tuple->dst.u.udp.port;

        if (!do_csum) {
                *field = port;
                return;
        }

        nf_csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
        /* *field still holds the old port at this point */
        inet_proto_csum_replace2(&hdr->check, skb, *field, port, false);
        if (!hdr->check)
                hdr->check = CSUM_MANGLED_0;

        *field = port;
}

/* NAT a UDP header located @hdroff bytes into the skb.
 *
 * Returns false if the header cannot be made writable, true otherwise.
 */
static bool udp_manip_pkt(struct sk_buff *skb,
                          unsigned int iphdroff, unsigned int hdroff,
                          const struct nf_conntrack_tuple *tuple,
                          enum nf_nat_manip_type maniptype)
{
        struct udphdr *udph;
        int err;

        err = skb_ensure_writable(skb, hdroff + sizeof(*udph));
        if (err)
                return false;

        udph = (struct udphdr *)(skb->data + hdroff);

        /* Only touch the checksum when the header carries a non-zero one. */
        __udp_manip_pkt(skb, iphdroff, udph, tuple, maniptype,
                        udph->check != 0);
        return true;
}

/* NAT a UDP-Lite header located @hdroff bytes into the skb.
 *
 * Unlike plain UDP, the checksum is always updated.  Without
 * CONFIG_NF_CT_PROTO_UDPLITE this is a no-op that reports success.
 */
static bool udplite_manip_pkt(struct sk_buff *skb,
                              unsigned int iphdroff, unsigned int hdroff,
                              const struct nf_conntrack_tuple *tuple,
                              enum nf_nat_manip_type maniptype)
{
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
        struct udphdr *udph;
        int err;

        err = skb_ensure_writable(skb, hdroff + sizeof(*udph));
        if (err)
                return false;

        udph = (struct udphdr *)(skb->data + hdroff);
        __udp_manip_pkt(skb, iphdroff, udph, tuple, maniptype, true);
#endif
        return true;
}

/* NAT an SCTP header located @hdroff bytes into the skb.
 *
 * Returns false if the header cannot be made writable, true otherwise.
 * Without CONFIG_NF_CT_PROTO_SCTP this is a no-op that reports success.
 */
static bool
sctp_manip_pkt(struct sk_buff *skb,
               unsigned int iphdroff, unsigned int hdroff,
               const struct nf_conntrack_tuple *tuple,
               enum nf_nat_manip_type maniptype)
{
#ifdef CONFIG_NF_CT_PROTO_SCTP
        struct sctphdr *sh;
        bool full_hdr;
        int hdrsize;

        /* This may be an inner header quoted inside an icmp packet.  Then
         * only the first 8 bytes of the transport header are guaranteed
         * to be present, and the checksum field lies beyond them, so it
         * cannot be updated.
         */
        full_hdr = skb->len >= hdroff + sizeof(*sh);
        hdrsize = full_hdr ? sizeof(*sh) : 8;

        if (skb_ensure_writable(skb, hdroff + hdrsize))
                return false;

        sh = (struct sctphdr *)(skb->data + hdroff);

        if (maniptype == NF_NAT_MANIP_SRC)
                sh->source = tuple->src.u.sctp.port;    /* replace src port */
        else
                sh->dest = tuple->dst.u.sctp.port;      /* replace dst port */

        /* Recompute the checksum only for a complete header, and only
         * when the skb is not marked CHECKSUM_PARTIAL.
         */
        if (full_hdr && skb->ip_summed != CHECKSUM_PARTIAL) {
                sh->checksum = sctp_compute_cksum(skb, hdroff);
                skb->ip_summed = CHECKSUM_NONE;
        }

#endif
        return true;
}

/* NAT a TCP header located @hdroff bytes into the skb.
 *
 * Returns false if the header cannot be made writable, true otherwise.
 */
static bool
tcp_manip_pkt(struct sk_buff *skb,
              unsigned int iphdroff, unsigned int hdroff,
              const struct nf_conntrack_tuple *tuple,
              enum nf_nat_manip_type maniptype)
{
        struct tcphdr *th;
        __be16 *field, from, to;
        bool full_hdr;
        int hdrsize;

        /* This may be an inner header quoted inside an icmp packet.  TCP
         * connection tracking guarantees 8 bytes of transport header; the
         * checksum field lies outside of them and cannot be updated if
         * that is all we have.
         */
        full_hdr = skb->len >= hdroff + sizeof(struct tcphdr);
        hdrsize = full_hdr ? sizeof(struct tcphdr) : 8;

        if (skb_ensure_writable(skb, hdroff + hdrsize))
                return false;

        th = (struct tcphdr *)(skb->data + hdroff);

        if (maniptype == NF_NAT_MANIP_SRC) {
                /* replace the source port */
                field = &th->source;
                to = tuple->src.u.tcp.port;
        } else {
                /* replace the destination port */
                field = &th->dest;
                to = tuple->dst.u.tcp.port;
        }

        from = *field;
        *field = to;

        if (full_hdr) {
                nf_csum_update(skb, iphdroff, &th->check, tuple, maniptype);
                inet_proto_csum_replace2(&th->check, skb, from, to, false);
        }
        return true;
}

/* NAT a DCCP header located @hdroff bytes into the skb.
 *
 * Returns false if the header cannot be made writable, true otherwise.
 * Without CONFIG_NF_CT_PROTO_DCCP this is a no-op that reports success.
 */
static bool
dccp_manip_pkt(struct sk_buff *skb,
               unsigned int iphdroff, unsigned int hdroff,
               const struct nf_conntrack_tuple *tuple,
               enum nf_nat_manip_type maniptype)
{
#ifdef CONFIG_NF_CT_PROTO_DCCP
        struct dccp_hdr *dh;
        __be16 *field, from, to;
        bool full_hdr;
        int hdrsize;

        /* DCCP connection tracking guarantees 8 bytes of header; use the
         * whole struct dccp_hdr only when the skb is long enough.
         */
        full_hdr = skb->len >= hdroff + sizeof(struct dccp_hdr);
        hdrsize = full_hdr ? sizeof(struct dccp_hdr) : 8;

        if (skb_ensure_writable(skb, hdroff + hdrsize))
                return false;

        dh = (struct dccp_hdr *)(skb->data + hdroff);

        if (maniptype == NF_NAT_MANIP_SRC) {
                field = &dh->dccph_sport;
                to = tuple->src.u.dccp.port;
        } else {
                field = &dh->dccph_dport;
                to = tuple->dst.u.dccp.port;
        }

        from = *field;
        *field = to;

        /* The checksum is only touched when the full header is there. */
        if (full_hdr) {
                nf_csum_update(skb, iphdroff, &dh->dccph_checksum, tuple,
                               maniptype);
                inet_proto_csum_replace2(&dh->dccph_checksum, skb, from, to,
                                         false);
        }
#endif
        return true;
}

/* NAT the identifier of an ICMP header located @hdroff bytes into the skb.
 *
 * Only the message types listed in the switch below have their id field
 * (un.echo.id) rewritten; every other type is left untouched.  The new id
 * always comes from tuple->src.u.icmp.id; @maniptype is not consulted.
 *
 * Returns false if the header cannot be made writable, true otherwise.
 */
static bool
icmp_manip_pkt(struct sk_buff *skb,
               unsigned int iphdroff, unsigned int hdroff,
               const struct nf_conntrack_tuple *tuple,
               enum nf_nat_manip_type maniptype)
{
        struct icmphdr *ih;

        if (skb_ensure_writable(skb, hdroff + sizeof(*ih)))
                return false;

        ih = (struct icmphdr *)(skb->data + hdroff);

        switch (ih->type) {
        case ICMP_ECHO:
        case ICMP_ECHOREPLY:
        case ICMP_TIMESTAMP:
        case ICMP_TIMESTAMPREPLY:
        case ICMP_INFO_REQUEST:
        case ICMP_INFO_REPLY:
        case ICMP_ADDRESS:
        case ICMP_ADDRESSREPLY:
                inet_proto_csum_replace2(&ih->checksum, skb,
                                         ih->un.echo.id,
                                         tuple->src.u.icmp.id, false);
                ih->un.echo.id = tuple->src.u.icmp.id;
                break;
        default:
                /* no id field handled for this type: nothing to rewrite */
                break;
        }

        return true;
}

/* NAT an ICMPv6 header located @hdroff bytes into the skb.
 *
 * The checksum is always adjusted for the address change.  The identifier
 * is rewritten (from tuple->src.u.icmp.id) for echo request/reply only.
 *
 * Returns false if the header cannot be made writable, true otherwise.
 */
static bool
icmpv6_manip_pkt(struct sk_buff *skb,
                 unsigned int iphdroff, unsigned int hdroff,
                 const struct nf_conntrack_tuple *tuple,
                 enum nf_nat_manip_type maniptype)
{
        struct icmp6hdr *ih6;

        if (skb_ensure_writable(skb, hdroff + sizeof(*ih6)))
                return false;

        ih6 = (struct icmp6hdr *)(skb->data + hdroff);
        nf_csum_update(skb, iphdroff, &ih6->icmp6_cksum, tuple, maniptype);

        switch (ih6->icmp6_type) {
        case ICMPV6_ECHO_REQUEST:
        case ICMPV6_ECHO_REPLY:
                inet_proto_csum_replace2(&ih6->icmp6_cksum, skb,
                                         ih6->icmp6_identifier,
                                         tuple->src.u.icmp.id, false);
                ih6->icmp6_identifier = tuple->src.u.icmp.id;
                break;
        default:
                break;
        }

        return true;
}

/* Manipulate a GRE packet according to @maniptype.
 *
 * Only destination manipulation of version 1 headers changes anything:
 * the call_id is replaced by tuple->dst.u.gre.key.  Returns false if the
 * header cannot be made writable or carries an unknown GRE version.
 * Without CONFIG_NF_CT_PROTO_GRE this is a no-op that reports success.
 */
static bool
gre_manip_pkt(struct sk_buff *skb,
              unsigned int iphdroff, unsigned int hdroff,
              const struct nf_conntrack_tuple *tuple,
              enum nf_nat_manip_type maniptype)
{
#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
        const struct gre_base_hdr *greh;
        struct pptp_gre_header *pgreh;
        __be16 version;

        /* struct pptp_gre_header ends with two optional 32bit fields that
         * need not be present -- hence the magic '8' subtracted here.
         */
        if (skb_ensure_writable(skb, hdroff + sizeof(*pgreh) - 8))
                return false;

        greh = (void *)skb->data + hdroff;
        pgreh = (struct pptp_gre_header *)greh;

        /* There is no 'source key' in the packet itself, so source
         * manipulation has nothing to rewrite.
         */
        if (maniptype != NF_NAT_MANIP_DST)
                return true;

        version = greh->flags & GRE_VERSION;
        if (version == GRE_VERSION_1) {
                pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
                pgreh->call_id = tuple->dst.u.gre.key;
        } else if (version != GRE_VERSION_0) {
                pr_debug("can't nat unknown GRE version\n");
                return false;
        }
        /* GREv0 packets are currently not NATed at all; they pass
         * through unmodified, like "nf_nat_proto_unknown".
         */
#endif
        return true;
}

/* Dispatch to the layer-4 manip helper selected by tuple->dst.protonum.
 *
 * Returns the helper's result; protocols without a helper are not an
 * error and the packet passes unmodified (true).
 */
static bool l4proto_manip_pkt(struct sk_buff *skb,
                              unsigned int iphdroff, unsigned int hdroff,
                              const struct nf_conntrack_tuple *tuple,
                              enum nf_nat_manip_type maniptype)
{
        bool ok = true;

        switch (tuple->dst.protonum) {
        case IPPROTO_TCP:
                ok = tcp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype);
                break;
        case IPPROTO_UDP:
                ok = udp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype);
                break;
        case IPPROTO_UDPLITE:
                ok = udplite_manip_pkt(skb, iphdroff, hdroff, tuple,
                                       maniptype);
                break;
        case IPPROTO_SCTP:
                ok = sctp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype);
                break;
        case IPPROTO_ICMP:
                ok = icmp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype);
                break;
        case IPPROTO_ICMPV6:
                ok = icmpv6_manip_pkt(skb, iphdroff, hdroff, tuple,
                                      maniptype);
                break;
        case IPPROTO_DCCP:
                ok = dccp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype);
                break;
        case IPPROTO_GRE:
                ok = gre_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype);
                break;
        default:
                /* Unknown protocol -- no error, pass it unmodified. */
                break;
        }

        return ok;
}

static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
                                  unsigned int iphdroff,
                                  const struct nf_conntrack_tuple *target,
                                  enum nf_nat_manip_type maniptype)
{
        struct iphdr *iph;
        unsigned int hdroff;

        if (skb_ensure_writable(skb, iphdroff + sizeof(*iph)))
                return false;

        iph = (void *)skb->data + iphdroff;
        hdroff = iphdroff + iph->ihl * 4;

        if (!l4proto_manip_pkt(skb, iphdroff, hdroff, target, maniptype))
                return false;
        iph = (void *)skb->data + iphdroff;

        if (maniptype == NF_NAT_MANIP_SRC) {
                csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
                iph->saddr = target->src.u3.ip;
        } else {
                csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
                iph->daddr = target->dst.u3.ip;
        }
        return true;
}

/* NAT the IPv6 header at @iphdroff and, when reachable, the transport
 * header behind its extension headers.
 *
 * If the extension headers cannot be skipped, only the address is
 * rewritten.  The layer-4 helper is invoked only when the fragment offset
 * bits of frag_off are zero.  Without CONFIG_IPV6 this is a no-op that
 * reports success.
 */
static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb,
                                  unsigned int iphdroff,
                                  const struct nf_conntrack_tuple *target,
                                  enum nf_nat_manip_type maniptype)
{
#if IS_ENABLED(CONFIG_IPV6)
        struct ipv6hdr *ip6h;
        __be16 frag_off;
        u8 nexthdr;
        int l4off;

        if (skb_ensure_writable(skb, iphdroff + sizeof(*ip6h)))
                return false;

        ip6h = (void *)skb->data + iphdroff;
        nexthdr = ip6h->nexthdr;
        l4off = ipv6_skip_exthdr(skb, iphdroff + sizeof(*ip6h),
                                 &nexthdr, &frag_off);

        if (l4off >= 0) {
                if ((frag_off & htons(~0x7)) == 0 &&
                    !l4proto_manip_pkt(skb, iphdroff, l4off, target,
                                       maniptype))
                        return false;

                /* must reload, offset might have changed */
                ip6h = (void *)skb->data + iphdroff;
        }

        if (maniptype == NF_NAT_MANIP_SRC)
                ip6h->saddr = target->src.u3.in6;
        else
                ip6h->daddr = target->dst.u3.in6;

#endif
        return true;
}

unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct,
                              enum nf_nat_manip_type mtype,
                              enum ip_conntrack_dir dir)
{
        struct nf_conntrack_tuple target;

        /* We are aiming to look like inverse of other direction. */
        nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple);

        switch (target.src.l3num) {
        case NFPROTO_IPV6:
                if (nf_nat_ipv6_manip_pkt(skb, 0, &target, mtype))
                        return NF_ACCEPT;
                break;
        case NFPROTO_IPV4:
                if (nf_nat_ipv4_manip_pkt(skb, 0, &target, mtype))
                        return NF_ACCEPT;
                break;
        default:
                WARN_ON_ONCE(1);
                break;
        }

        return NF_DROP;
}

/* Adjust the transport checksum *@check for an IPv4 address change.
 *
 * The old address is read from the IP header at @iphdroff, the new one
 * from tuple @t; @maniptype selects source or destination.
 */
static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
                                    unsigned int iphdroff, __sum16 *check,
                                    const struct nf_conntrack_tuple *t,
                                    enum nf_nat_manip_type maniptype)
{
        const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
        const bool is_src = (maniptype == NF_NAT_MANIP_SRC);
        __be32 from = is_src ? iph->saddr : iph->daddr;
        __be32 to = is_src ? t->src.u3.ip : t->dst.u3.ip;

        inet_proto_csum_replace4(check, skb, from, to, true);
}

/* Adjust the transport checksum *@check for an IPv6 address change.
 *
 * The old address is read from the IPv6 header at @iphdroff, the new one
 * from tuple @t; @maniptype selects source or destination.  Without
 * CONFIG_IPV6 this does nothing.
 */
static void nf_nat_ipv6_csum_update(struct sk_buff *skb,
                                    unsigned int iphdroff, __sum16 *check,
                                    const struct nf_conntrack_tuple *t,
                                    enum nf_nat_manip_type maniptype)
{
#if IS_ENABLED(CONFIG_IPV6)
        const struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + iphdroff);
        const bool is_src = (maniptype == NF_NAT_MANIP_SRC);
        const struct in6_addr *from = is_src ? &ip6h->saddr : &ip6h->daddr;
        const struct in6_addr *to = is_src ? &t->src.u3.in6 : &t->dst.u3.in6;

        inet_proto_csum_replace16(check, skb, from->s6_addr32,
                                  to->s6_addr32, true);
#endif
}

/* Adjust the transport checksum *@check for the address change described
 * by @t and @maniptype, picking the IPv4 or IPv6 variant from
 * t->src.l3num.  Any other l3num is silently ignored.
 */
static void nf_csum_update(struct sk_buff *skb,
                           unsigned int iphdroff, __sum16 *check,
                           const struct nf_conntrack_tuple *t,
                           enum nf_nat_manip_type maniptype)
{
        if (t->src.l3num == NFPROTO_IPV4)
                nf_nat_ipv4_csum_update(skb, iphdroff, check, t, maniptype);
        else if (t->src.l3num == NFPROTO_IPV6)
                nf_nat_ipv6_csum_update(skb, iphdroff, check, t, maniptype);
}

/* Fix up the transport checksum after the payload length changed from
 * @oldlen to @datalen (IPv4).
 *
 * If the skb is already CHECKSUM_PARTIAL only the length contribution is
 * replaced.  Otherwise the skb is switched to CHECKSUM_PARTIAL:
 * csum_start/csum_offset are set up and *@check is seeded with the
 * inverted pseudo-header sum.
 */
static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
                                    u8 proto, void *data, __sum16 *check,
                                    int datalen, int oldlen)
{
        const struct iphdr *iph;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                inet_proto_csum_replace2(check, skb,
                                         htons(oldlen), htons(datalen), true);
                return;
        }

        iph = ip_hdr(skb);

        skb->ip_summed = CHECKSUM_PARTIAL;
        skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
                ip_hdrlen(skb);
        skb->csum_offset = (void *)check - data;
        *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen,
                                    proto, 0);
}

#if IS_ENABLED(CONFIG_IPV6)
/* Fix up the transport checksum after the payload length changed from
 * @oldlen to @datalen (IPv6).
 *
 * If the skb is already CHECKSUM_PARTIAL only the length contribution is
 * replaced.  Otherwise the skb is switched to CHECKSUM_PARTIAL:
 * csum_start/csum_offset are set up and *@check is seeded with the
 * inverted pseudo-header sum.
 */
static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
                                    u8 proto, void *data, __sum16 *check,
                                    int datalen, int oldlen)
{
        const struct ipv6hdr *ip6h;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                inet_proto_csum_replace2(check, skb,
                                         htons(oldlen), htons(datalen), true);
                return;
        }

        ip6h = ipv6_hdr(skb);

        skb->ip_summed = CHECKSUM_PARTIAL;
        skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
                (data - (void *)skb->data);
        skb->csum_offset = (void *)check - data;
        *check = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
                                  datalen, proto, 0);
}
#endif

/* Fix up a transport checksum after a payload length change, picking the
 * IPv4 or IPv6 variant from @nfproto.  Any family not handled here
 * triggers a one-time warning and leaves the packet untouched.
 */
void nf_nat_csum_recalc(struct sk_buff *skb,
                        u8 nfproto, u8 proto, void *data, __sum16 *check,
                        int datalen, int oldlen)
{
        switch (nfproto) {
        case NFPROTO_IPV4:
                nf_nat_ipv4_csum_recalc(skb, proto, data, check,
                                        datalen, oldlen);
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case NFPROTO_IPV6:
                nf_nat_ipv6_csum_recalc(skb, proto, data, check,
                                        datalen, oldlen);
                break;
#endif
        default:
                WARN_ON_ONCE(1);
                break;
        }
}

/* Translate an ICMP packet that is related to connection @ct: first the IP
 * header (and transport header) embedded in the ICMP payload, then the
 * outer IP header.
 *
 * @skb:     packet whose IPv4 header starts at skb->data
 * @ct:      conntrack entry the packet is related to
 * @ctinfo:  expected to be IP_CT_RELATED or IP_CT_RELATED_REPLY (a warning
 *           is emitted otherwise)
 * @hooknum: netfilter hook, used to derive the manip type and passed to
 *           the checksum check
 *
 * Returns 1 if the packet may continue (translated, or no translation was
 * needed) and 0 on failure.
 */
int nf_nat_icmp_reply_translation(struct sk_buff *skb,
                                  struct nf_conn *ct,
                                  enum ip_conntrack_info ctinfo,
                                  unsigned int hooknum)
{
        /* Layout expected right behind the outer IP header: the ICMP
         * header followed by the embedded IP header.
         */
        struct {
                struct icmphdr        icmp;
                struct iphdr        ip;
        } *inside;
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
        unsigned int hdrlen = ip_hdrlen(skb);
        struct nf_conntrack_tuple target;
        unsigned long statusbit;

        WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY);

        if (skb_ensure_writable(skb, hdrlen + sizeof(*inside)))
                return 0;
        /* A non-zero result is treated as failure -- presumably a bad
         * ICMP checksum; confirm against nf_ip_checksum().
         */
        if (nf_ip_checksum(skb, hooknum, hdrlen, IPPROTO_ICMP))
                return 0;

        inside = (void *)skb->data + hdrlen;
        if (inside->icmp.type == ICMP_REDIRECT) {
                /* Redirects are rejected unless NAT setup has completed
                 * for this connection and no NAT bit is set on it.
                 */
                if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
                        return 0;
                if (ct->status & IPS_NAT_MASK)
                        return 0;
        }

        /* Pick the status bit matching the manip type of this hook. */
        if (manip == NF_NAT_MANIP_SRC)
                statusbit = IPS_SRC_NAT;
        else
                statusbit = IPS_DST_NAT;

        /* Invert if this is reply direction */
        if (dir == IP_CT_DIR_REPLY)
                statusbit ^= IPS_NAT_MASK;

        /* The relevant kind of NAT is not active: nothing to do. */
        if (!(ct->status & statusbit))
                return 1;

        /* Rewrite the embedded packet first.  It is handled with the
         * opposite manip type (!manip) and the tuple of the other
         * direction.
         */
        if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
                                   &ct->tuplehash[!dir].tuple, !manip))
                return 0;

        /* The ICMP payload changed, so recompute the ICMP checksum over
         * everything behind the outer IP header (skipped when the skb is
         * CHECKSUM_PARTIAL).
         */
        if (skb->ip_summed != CHECKSUM_PARTIAL) {
                /* Reloading "inside" here since manip_pkt may reallocate */
                inside = (void *)skb->data + hdrlen;
                inside->icmp.checksum = 0;
                inside->icmp.checksum =
                        csum_fold(skb_checksum(skb, hdrlen,
                                               skb->len - hdrlen, 0));
        }

        /* Change outer to look like the reply to an incoming packet */
        nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple);
        /* Force the ICMP handler for the outer packet's layer 4. */
        target.dst.protonum = IPPROTO_ICMP;
        if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip))
                return 0;

        return 1;
}
EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);

static unsigned int
nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
               const struct nf_hook_state *state)
{
        struct nf_conn *ct;
        enum ip_conntrack_info ctinfo;

        ct = nf_ct_get(skb, &ctinfo);
        if (!ct)
                return NF_ACCEPT;

        if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) {
                if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
                        if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
                                                           state->hook))
                                return NF_DROP;
                        else
                                return NF_ACCEPT;
                }
        }

        return nf_nat_inet_fn(priv, skb, state);
}

/* PRE_ROUTING hook: run NAT and, if the packet was accepted with a
 * different destination address than it arrived with, drop the dst
 * attached to the skb.
 */
static unsigned int
nf_nat_ipv4_pre_routing(void *priv, struct sk_buff *skb,
                        const struct nf_hook_state *state)
{
        const __be32 orig_daddr = ip_hdr(skb)->daddr;
        unsigned int verdict = nf_nat_ipv4_fn(priv, skb, state);

        if (verdict != NF_ACCEPT)
                return verdict;

        if (ip_hdr(skb)->daddr != orig_daddr)
                skb_dst_drop(skb);

        return verdict;
}

#ifdef CONFIG_XFRM
/* Redo the xfrm lookup for @skb and attach the resulting dst to it.
 *
 * @net:    network namespace used for the decode and lookup
 * @skb:    packet whose dst is replaced
 * @family: address family passed to xfrm_decode_session()
 *
 * Returns 0 on success or a negative errno: the error from
 * xfrm_decode_session() or xfrm_lookup(), -EHOSTUNREACH if no reference
 * on the current route could be taken, or -ENOMEM if the headroom could
 * not be grown.
 */
static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
{
        struct sock *sk = skb->sk;
        struct dst_entry *dst;
        unsigned int hh_len;
        struct flowi fl;
        int err;

        /* Build the flow description from the packet as it looks now. */
        err = xfrm_decode_session(net, skb, &fl, family);
        if (err < 0)
                return err;

        dst = skb_dst(skb);
        /* If the current dst is an xfrm dst, start from its ->route. */
        if (dst->xfrm)
                dst = ((struct xfrm_dst *)dst)->route;
        /* NOTE(review): presumably this takes the reference that
         * xfrm_lookup() below consumes -- confirm.
         */
        if (!dst_hold_safe(dst))
                return -EHOSTUNREACH;

        /* Do not pass a socket that belongs to another namespace. */
        if (sk && !net_eq(net, sock_net(sk)))
                sk = NULL;

        dst = xfrm_lookup(net, dst, &fl, sk, 0);
        if (IS_ERR(dst))
                return PTR_ERR(dst);

        /* Swap the skb's old dst for the freshly looked-up one. */
        skb_dst_drop(skb);
        skb_dst_set(skb, dst);

        /* Change in oif may mean change in hh_len. */
        hh_len = skb_dst(skb)->dev->hard_header_len;
        if (skb_headroom(skb) < hh_len &&
            pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
                return -ENOMEM;
        return 0;
}
#endif

/* Report whether the reply tuple's destination port of @skb's connection
 * differs from @sport.
 *
 * Always false when the skb has no conntrack, the connection is neither
 * TCP nor UDP, or the packet is not in the original direction.
 */
static bool nf_nat_inet_port_was_mangled(const struct sk_buff *skb, __be16 sport)
{
        enum ip_conntrack_info ctinfo;
        enum ip_conntrack_dir dir;
        const struct nf_conn *ct;
        u8 proto;

        ct = nf_ct_get(skb, &ctinfo);
        if (!ct)
                return false;

        proto = nf_ct_protonum(ct);
        if (proto != IPPROTO_TCP && proto != IPPROTO_UDP)
                return false;

        dir = CTINFO2DIR(ctinfo);
        if (dir != IP_CT_DIR_ORIGINAL)
                return false;

        return ct->tuplehash[!dir].tuple.dst.u.all != sport;
}

/* LOCAL_IN hook: run NAT, then detach a socket that early demux attached
 * to the skb if NAT changed what that socket was matched on.
 */
static unsigned int
nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb,
                     const struct nf_hook_state *state)
{
        const __be32 orig_saddr = ip_hdr(skb)->saddr;
        struct sock *sk = skb->sk;
        unsigned int verdict = nf_nat_ipv4_fn(priv, skb, state);

        /* skb has a socket assigned via tcp edemux. We need to check
         * if nf_nat_ipv4_fn() has mangled the packet in a way that
         * edemux would not have found this socket.
         *
         * Both the source address and the source port may have been
         * rewritten by the call above -- long after tcp/udp early demux
         * might have found a socket for the old (pre-snat) address.
         * Transparent sockets are left alone.
         */
        if (verdict == NF_ACCEPT && sk && !inet_sk_transparent(sk)) {
                if (orig_saddr != ip_hdr(skb)->saddr ||
                    nf_nat_inet_port_was_mangled(skb, sk->sk_dport))
                        skb_orphan(skb); /* TCP edemux obtained wrong socket */
        }

        return verdict;
}

/* POST_ROUTING hook: run NAT and, with CONFIG_XFRM, redo the xfrm lookup
 * when the source address or (for non-ICMP) the source port seen in this
 * direction differs from the destination of the opposite direction.
 *
 * Packets already flagged IPSKB_XFRM_TRANSFORMED are not looked up again.
 */
static unsigned int
nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
                const struct nf_hook_state *state)
{
        unsigned int verdict = nf_nat_ipv4_fn(priv, skb, state);
#ifdef CONFIG_XFRM
        const struct nf_conntrack_tuple *cur, *other;
        enum ip_conntrack_info ctinfo;
        enum ip_conntrack_dir dir;
        const struct nf_conn *ct;
        int err;

        if (verdict != NF_ACCEPT)
                return verdict;

        if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
                return verdict;

        ct = nf_ct_get(skb, &ctinfo);
        if (!ct)
                return verdict;

        dir = CTINFO2DIR(ctinfo);
        cur = &ct->tuplehash[dir].tuple;
        other = &ct->tuplehash[!dir].tuple;

        if (cur->src.u3.ip != other->dst.u3.ip ||
            (cur->dst.protonum != IPPROTO_ICMP &&
             cur->src.u.all != other->dst.u.all)) {
                err = nf_xfrm_me_harder(state->net, skb, AF_INET);
                if (err < 0)
                        verdict = NF_DROP_ERR(err);
        }
#endif
        return verdict;
}

/*
 * IPv4 NAT hook used at NF_INET_LOCAL_OUT (see nf_nat_ipv4_ops).
 *
 * Runs nf_nat_ipv4_fn(); for an accepted packet with a conntrack entry it
 * then calls ip_route_me_harder() when the destination address of this
 * direction differs from the source address of the opposite direction.
 * Otherwise, with CONFIG_XFRM, it calls nf_xfrm_me_harder() for non-ICMP
 * packets not flagged IPSKB_XFRM_TRANSFORMED whose protocol-specific
 * destination field differs.  A negative result from either call becomes
 * an NF_DROP_ERR() verdict.
 */
static unsigned int
nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
                     const struct nf_hook_state *state)
{
        enum ip_conntrack_info ctinfo;
        enum ip_conntrack_dir dir;
        const struct nf_conn *ct;
        unsigned int verdict;
        int rc = 0;

        verdict = nf_nat_ipv4_fn(priv, skb, state);
        if (verdict != NF_ACCEPT)
                return verdict;

        ct = nf_ct_get(skb, &ctinfo);
        if (!ct)
                return verdict;

        dir = CTINFO2DIR(ctinfo);

        if (ct->tuplehash[dir].tuple.dst.u3.ip !=
            ct->tuplehash[!dir].tuple.src.u3.ip) {
                rc = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
        }
#ifdef CONFIG_XFRM
        else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
                 ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
                 ct->tuplehash[dir].tuple.dst.u.all !=
                 ct->tuplehash[!dir].tuple.src.u.all) {
                rc = nf_xfrm_me_harder(state->net, skb, AF_INET);
        }
#endif
        /* rc stays 0 when neither helper ran, leaving the verdict alone. */
        if (rc < 0)
                verdict = NF_DROP_ERR(rc);

        return verdict;
}

/*
 * IPv4 NAT hook table: one entry per netfilter hook point, pairing the
 * handler with the priority it runs at.  The table is handed to
 * nf_nat_register_fn() together with its ARRAY_SIZE().
 */
static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
        /* PRE_ROUTING, destination-NAT priority */
        {
                .pf             = NFPROTO_IPV4,
                .hooknum        = NF_INET_PRE_ROUTING,
                .priority       = NF_IP_PRI_NAT_DST,
                .hook           = nf_nat_ipv4_pre_routing,
        },
        /* POST_ROUTING, source-NAT priority */
        {
                .pf             = NFPROTO_IPV4,
                .hooknum        = NF_INET_POST_ROUTING,
                .priority       = NF_IP_PRI_NAT_SRC,
                .hook           = nf_nat_ipv4_out,
        },
        /* LOCAL_OUT, destination-NAT priority */
        {
                .pf             = NFPROTO_IPV4,
                .hooknum        = NF_INET_LOCAL_OUT,
                .priority       = NF_IP_PRI_NAT_DST,
                .hook           = nf_nat_ipv4_local_fn,
        },
        /* LOCAL_IN, source-NAT priority */
        {
                .pf             = NFPROTO_IPV4,
                .hooknum        = NF_INET_LOCAL_IN,
                .priority       = NF_IP_PRI_NAT_SRC,
                .hook           = nf_nat_ipv4_local_in,
        },
};

/*
 * Register @ops for IPv4 NAT in @net by passing it, the family taken from
 * ops->pf and the nf_nat_ipv4_ops table to nf_nat_register_fn().
 * Returns whatever nf_nat_register_fn() returns.
 */
int nf_nat_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops)
{
        const unsigned int nr_hooks = ARRAY_SIZE(nf_nat_ipv4_ops);

        return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv4_ops,
                                  nr_hooks);
}
EXPORT_SYMBOL_GPL(nf_nat_ipv4_register_fn);

/*
 * Counterpart of nf_nat_ipv4_register_fn(): forwards @ops, its family and
 * the size of the IPv4 hook table to nf_nat_unregister_fn().
 */
void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
{
        const unsigned int nr_hooks = ARRAY_SIZE(nf_nat_ipv4_ops);

        nf_nat_unregister_fn(net, ops->pf, ops, nr_hooks);
}
EXPORT_SYMBOL_GPL(nf_nat_ipv4_unregister_fn);

#if IS_ENABLED(CONFIG_IPV6)
int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
                                    struct nf_conn *ct,
                                    enum ip_conntrack_info ctinfo,
                                    unsigned int hooknum,
                                    unsigned int hdrlen)
{
        struct {
                struct icmp6hdr        icmp6;
                struct ipv6hdr        ip6;
        } *inside;
        enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
        enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
        struct nf_conntrack_tuple target;
        unsigned long statusbit;

        WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY);

        if (skb_ensure_writable(skb, hdrlen + sizeof(*inside)))
                return 0;
        if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6))
                return 0;

        inside = (void *)skb->data + hdrlen;
        if (inside->icmp6.icmp6_type == NDISC_REDIRECT) {
                if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
                        return 0;
                if (ct->status & IPS_NAT_MASK)
                        return 0;
        }

        if (manip == NF_NAT_MANIP_SRC)
                statusbit = IPS_SRC_NAT;
        else
                statusbit = IPS_DST_NAT;

        /* Invert if this is reply direction */
        if (dir == IP_CT_DIR_REPLY)
                statusbit ^= IPS_NAT_MASK;

        if (!(ct->status & statusbit))
                return 1;

        if (!nf_nat_ipv6_manip_pkt(skb, hdrlen + sizeof(inside->icmp6),
                                   &ct->tuplehash[!dir].tuple, !manip))
                return 0;

        if (skb->ip_summed != CHECKSUM_PARTIAL) {
                struct ipv6hdr *ipv6h = ipv6_hdr(skb);

                inside = (void *)skb->data + hdrlen;
                inside->icmp6.icmp6_cksum = 0;
                inside->icmp6.icmp6_cksum =
                        csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
                                        skb->len - hdrlen, IPPROTO_ICMPV6,
                                        skb_checksum(skb, hdrlen,
                                                     skb->len - hdrlen, 0));
        }

        nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple);
        target.dst.protonum = IPPROTO_ICMPV6;
        if (!nf_nat_ipv6_manip_pkt(skb, 0, &target, manip))
                return 0;

        return 1;
}
EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation);

/*
 * Common IPv6 NAT handler shared by the IPv6 hook functions.
 *
 * Packets without a conntrack entry are accepted untouched.  Related
 * packets whose upper-layer protocol (after skipping extension headers) is
 * ICMPv6 go through nf_nat_icmpv6_reply_translation(), which decides
 * between NF_ACCEPT and NF_DROP.  Everything else is left to
 * nf_nat_inet_fn().
 */
static unsigned int
nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
               const struct nf_hook_state *state)
{
        enum ip_conntrack_info ctinfo;
        struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
        __be16 frag_off;
        int hdrlen;
        u8 nexthdr;

        /* Can't track?  It's not due to stress, or conntrack would
         * have dropped it.  Hence it's the user's responsibility to
         * packet filter it out, or implement conntrack/NAT for that
         * protocol. 8) --RR
         */
        if (!ct)
                return NF_ACCEPT;

        switch (ctinfo) {
        case IP_CT_RELATED:
        case IP_CT_RELATED_REPLY:
                nexthdr = ipv6_hdr(skb)->nexthdr;
                hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
                                          &nexthdr, &frag_off);
                if (hdrlen < 0 || nexthdr != IPPROTO_ICMPV6)
                        break;

                return nf_nat_icmpv6_reply_translation(skb, ct, ctinfo,
                                                       state->hook, hdrlen) ?
                        NF_ACCEPT : NF_DROP;
        default:
                break;
        }

        return nf_nat_inet_fn(priv, skb, state);
}

static unsigned int
nf_nat_ipv6_local_in(void *priv, struct sk_buff *skb,
                     const struct nf_hook_state *state)
{
        struct in6_addr saddr = ipv6_hdr(skb)->saddr;
        struct sock *sk = skb->sk;
        unsigned int ret;

        ret = nf_nat_ipv6_fn(priv, skb, state);

        if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk))
                return ret;

        /* see nf_nat_ipv4_local_in */
        if (ipv6_addr_cmp(&saddr, &ipv6_hdr(skb)->saddr) ||
            nf_nat_inet_port_was_mangled(skb, sk->sk_dport))
                skb_orphan(skb);

        return ret;
}

static unsigned int
nf_nat_ipv6_in(void *priv, struct sk_buff *skb,
               const struct nf_hook_state *state)
{
        unsigned int ret, verdict;
        struct in6_addr daddr = ipv6_hdr(skb)->daddr;

        ret = nf_nat_ipv6_fn(priv, skb, state);
        verdict = ret & NF_VERDICT_MASK;
        if (verdict != NF_DROP && verdict != NF_STOLEN &&
            ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr))
                skb_dst_drop(skb);

        return ret;
}

/*
 * IPv6 NAT hook used at NF_INET_POST_ROUTING (see nf_nat_ipv6_ops).
 *
 * Runs nf_nat_ipv6_fn() and returns its verdict.  With CONFIG_XFRM, an
 * accepted packet that is not flagged IP6SKB_XFRM_TRANSFORMED is passed to
 * nf_xfrm_me_harder() when nf_inet_addr_cmp() returns zero for this
 * direction's source address against the opposite direction's destination
 * address, or (for anything but ICMPv6) when the protocol-specific source
 * field differs.  A negative result becomes an NF_DROP_ERR() verdict.
 */
static unsigned int
nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
                const struct nf_hook_state *state)
{
        unsigned int verdict = nf_nat_ipv6_fn(priv, skb, state);
#ifdef CONFIG_XFRM
        enum ip_conntrack_info ctinfo;
        enum ip_conntrack_dir dir;
        const struct nf_conn *ct;
        int src_changed;
        int rc;

        if (verdict != NF_ACCEPT ||
            (IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED))
                return verdict;

        ct = nf_ct_get(skb, &ctinfo);
        if (!ct)
                return verdict;

        dir = CTINFO2DIR(ctinfo);

        /* Address check first ... */
        src_changed = !nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3,
                                        &ct->tuplehash[!dir].tuple.dst.u3);

        /* ... then the protocol-specific part, skipped for ICMPv6. */
        if (!src_changed &&
            ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6)
                src_changed = ct->tuplehash[dir].tuple.src.u.all !=
                              ct->tuplehash[!dir].tuple.dst.u.all;

        if (src_changed) {
                rc = nf_xfrm_me_harder(state->net, skb, AF_INET6);
                if (rc < 0)
                        verdict = NF_DROP_ERR(rc);
        }
#endif

        return verdict;
}

/*
 * IPv6 NAT hook used at NF_INET_LOCAL_OUT (see nf_nat_ipv6_ops).
 *
 * Runs nf_nat_ipv6_fn(); for an accepted packet with a conntrack entry it
 * calls nf_ip6_route_me_harder() when nf_inet_addr_cmp() returns zero for
 * this direction's destination address against the opposite direction's
 * source address.  Otherwise, with CONFIG_XFRM, it calls
 * nf_xfrm_me_harder() for non-ICMPv6 packets not flagged
 * IP6SKB_XFRM_TRANSFORMED whose protocol-specific destination field
 * differs.  A negative result from either call becomes NF_DROP_ERR().
 */
static unsigned int
nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
                     const struct nf_hook_state *state)
{
        enum ip_conntrack_info ctinfo;
        enum ip_conntrack_dir dir;
        const struct nf_conn *ct;
        unsigned int verdict;
        int rc = 0;

        verdict = nf_nat_ipv6_fn(priv, skb, state);
        if (verdict != NF_ACCEPT)
                return verdict;

        ct = nf_ct_get(skb, &ctinfo);
        if (!ct)
                return verdict;

        dir = CTINFO2DIR(ctinfo);

        if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3,
                              &ct->tuplehash[!dir].tuple.src.u3)) {
                rc = nf_ip6_route_me_harder(state->net, state->sk, skb);
        }
#ifdef CONFIG_XFRM
        else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
                 ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 &&
                 ct->tuplehash[dir].tuple.dst.u.all !=
                 ct->tuplehash[!dir].tuple.src.u.all) {
                rc = nf_xfrm_me_harder(state->net, skb, AF_INET6);
        }
#endif
        /* rc stays 0 when neither helper ran, leaving the verdict alone. */
        if (rc < 0)
                verdict = NF_DROP_ERR(rc);

        return verdict;
}

/*
 * IPv6 NAT hook table: one entry per netfilter hook point, pairing the
 * handler with the priority it runs at.  The table is handed to
 * nf_nat_register_fn() together with its ARRAY_SIZE().
 */
static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
        /* PRE_ROUTING, destination-NAT priority */
        {
                .pf             = NFPROTO_IPV6,
                .hooknum        = NF_INET_PRE_ROUTING,
                .priority       = NF_IP6_PRI_NAT_DST,
                .hook           = nf_nat_ipv6_in,
        },
        /* POST_ROUTING, source-NAT priority */
        {
                .pf             = NFPROTO_IPV6,
                .hooknum        = NF_INET_POST_ROUTING,
                .priority       = NF_IP6_PRI_NAT_SRC,
                .hook           = nf_nat_ipv6_out,
        },
        /* LOCAL_OUT, destination-NAT priority */
        {
                .pf             = NFPROTO_IPV6,
                .hooknum        = NF_INET_LOCAL_OUT,
                .priority       = NF_IP6_PRI_NAT_DST,
                .hook           = nf_nat_ipv6_local_fn,
        },
        /* LOCAL_IN, source-NAT priority */
        {
                .pf             = NFPROTO_IPV6,
                .hooknum        = NF_INET_LOCAL_IN,
                .priority       = NF_IP6_PRI_NAT_SRC,
                .hook           = nf_nat_ipv6_local_in,
        },
};

/*
 * Register @ops for IPv6 NAT in @net by passing it, the family taken from
 * ops->pf and the nf_nat_ipv6_ops table to nf_nat_register_fn().
 * Returns whatever nf_nat_register_fn() returns.
 */
int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops)
{
        const unsigned int nr_hooks = ARRAY_SIZE(nf_nat_ipv6_ops);

        return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv6_ops,
                                  nr_hooks);
}
EXPORT_SYMBOL_GPL(nf_nat_ipv6_register_fn);

/*
 * Counterpart of nf_nat_ipv6_register_fn(): forwards @ops, its family and
 * the size of the IPv6 hook table to nf_nat_unregister_fn().
 */
void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
{
        const unsigned int nr_hooks = ARRAY_SIZE(nf_nat_ipv6_ops);

        nf_nat_unregister_fn(net, ops->pf, ops, nr_hooks);
}
EXPORT_SYMBOL_GPL(nf_nat_ipv6_unregister_fn);
#endif /* CONFIG_IPV6 */

#if defined(CONFIG_NF_TABLES_INET) && IS_ENABLED(CONFIG_NFT_NAT)
/*
 * Register @ops, which must have pf == NFPROTO_INET, for both IPv6 and
 * IPv4 NAT in @net.
 *
 * Returns -EINVAL (after a one-time warning) for any other family, the
 * error from nf_nat_register_fn() if either registration fails, and 0 on
 * success.  If the IPv4 step fails, the IPv6 registration made just before
 * is undone, so nothing stays registered on error.
 */
int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops)
{
        int err;

        if (WARN_ON_ONCE(ops->pf != NFPROTO_INET))
                return -EINVAL;

        err = nf_nat_register_fn(net, NFPROTO_IPV6, ops, nf_nat_ipv6_ops,
                                 ARRAY_SIZE(nf_nat_ipv6_ops));
        if (err)
                return err;

        err = nf_nat_register_fn(net, NFPROTO_IPV4, ops, nf_nat_ipv4_ops,
                                 ARRAY_SIZE(nf_nat_ipv4_ops));
        if (!err)
                return 0;

        /* IPv4 failed: roll the IPv6 half back. */
        nf_nat_unregister_fn(net, NFPROTO_IPV6, ops,
                             ARRAY_SIZE(nf_nat_ipv6_ops));
        return err;
}
EXPORT_SYMBOL_GPL(nf_nat_inet_register_fn);

/*
 * Undo nf_nat_inet_register_fn(): unregister @ops for IPv4 first, then
 * for IPv6.
 */
void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
{
        nf_nat_unregister_fn(net, NFPROTO_IPV4, ops,
                             ARRAY_SIZE(nf_nat_ipv4_ops));
        nf_nat_unregister_fn(net, NFPROTO_IPV6, ops,
                             ARRAY_SIZE(nf_nat_ipv6_ops));
}
EXPORT_SYMBOL_GPL(nf_nat_inet_unregister_fn);
#endif /* NFT INET NAT */





























































































































































































































    1 
    1 


    1 



    1 






    1 

    1 




    1 



    1 

    1 

    1 






















    1 





    1 














    1 
    1 


    1 









    1 


    1 

    1 

    1 


    1 


    1 
    1 
    1 



    1 














    1 

    1 







    1 

































































    1 
    1 























    1 



    1 


    1 










































































    1 

    1 







    1 

















    1 







    1 





    1 






    1 
















































    1 






















    1 












    1 

    1 












    1 












































    1 














    1 







    1 








    1 














































































































    1 
    1 




































































    1 
    1 


















































































    1 









    1 























    1 

    1 















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2002, 2004
 * Copyright (c) 2001 Nokia, Inc.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 * Copyright (c) 2002-2003 Intel Corp.
 *
 * This file is part of the SCTP kernel implementation
 *
 * SCTP over IPv6.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    Le Yanqun                    <yanqun.le@nokia.com>
 *    Hui Huang                    <hui.huang@nokia.com>
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Sridhar Samudrala            <sri@us.ibm.com>
 *    Jon Grimm                    <jgrimm@us.ibm.com>
 *    Ardelle Fan            <ardelle.fan@intel.com>
 *
 * Based on:
 *        linux/net/ipv6/tcp_ipv6.c
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/init.h>
#include <linux/ipsec.h>
#include <linux/slab.h>

#include <linux/ipv6.h>
#include <linux/icmpv6.h>
#include <linux/random.h>
#include <linux/seq_file.h>

#include <net/protocol.h>
#include <net/ndisc.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/transp_v6.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
#include <net/sctp/sctp.h>
#include <net/udp_tunnel.h>

#include <linux/uaccess.h>

static inline int sctp_v6_addr_match_len(union sctp_addr *s1,
                                         union sctp_addr *s2);
static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr,
                              __be16 port);
static int sctp_v6_cmp_addr(const union sctp_addr *addr1,
                            const union sctp_addr *addr2);

/* Event handler for inet6 address addition/deletion events.
 * The sctp_local_addr_list needs to be protected by a spin lock since
 * multiple notifiers (say IPv4 and IPv6) may be running at the same
 * time and thus corrupt the list.
 * The reader side is protected with RCU.
 *
 * NETDEV_UP appends a new entry for the address to the per-netns list;
 * NETDEV_DOWN removes the first entry matching both the address and the
 * interface index.  Other events are ignored.  Always returns NOTIFY_DONE.
 */
static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev,
                                void *ptr)
{
        struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
        struct net *net = dev_net(ifa->idev->dev);
        struct sctp_sockaddr_entry *entry, *next;
        struct sctp_sockaddr_entry *victim = NULL;

        switch (ev) {
        case NETDEV_UP:
                /* Allocation failure is silently tolerated. */
                entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
                if (!entry)
                        break;

                entry->a.v6.sin6_family = AF_INET6;
                entry->a.v6.sin6_addr = ifa->addr;
                entry->a.v6.sin6_scope_id = ifa->idev->dev->ifindex;
                entry->valid = 1;

                spin_lock_bh(&net->sctp.local_addr_lock);
                list_add_tail_rcu(&entry->list, &net->sctp.local_addr_list);
                sctp_addr_wq_mgmt(net, entry, SCTP_ADDR_NEW);
                spin_unlock_bh(&net->sctp.local_addr_lock);
                break;
        case NETDEV_DOWN:
                spin_lock_bh(&net->sctp.local_addr_lock);
                list_for_each_entry_safe(entry, next,
                                         &net->sctp.local_addr_list, list) {
                        if (entry->a.sa.sa_family != AF_INET6)
                                continue;
                        if (!ipv6_addr_equal(&entry->a.v6.sin6_addr,
                                             &ifa->addr))
                                continue;
                        if (entry->a.v6.sin6_scope_id !=
                            ifa->idev->dev->ifindex)
                                continue;

                        sctp_addr_wq_mgmt(net, entry, SCTP_ADDR_DEL);
                        entry->valid = 0;
                        list_del_rcu(&entry->list);
                        victim = entry;
                        break;
                }
                spin_unlock_bh(&net->sctp.local_addr_lock);

                /* Freed outside the lock, via RCU. */
                if (victim)
                        kfree_rcu(victim, rcu);
                break;
        }

        return NOTIFY_DONE;
}

/* Notifier block whose callback is sctp_inet6addr_event(). */
static struct notifier_block sctp_inet6addr_notifier = {
        .notifier_call = sctp_inet6addr_event,
};

/* Act on an ICMPv6 error that was matched to transport @t.
 *
 * @t:    transport the error was matched to; its association's socket is
 *        the one acted upon
 * @skb:  the ICMPv6 packet (only passed on for redirects)
 * @type: ICMPv6 type
 * @code: ICMPv6 code
 * @info: type-specific value, passed to sctp_icmp_frag_needed() for
 *        ICMPV6_PKT_TOOBIG
 *
 * Packet-too-big, redirect and "unknown next header" parameter problems are
 * given to their dedicated SCTP handlers.  Every other error is converted
 * with icmpv6_err_convert() and stored on the socket: as sk_err plus an
 * error report when the socket is not owned by user context and has
 * RECVERR6 set, otherwise as sk_err_soft.
 */
static void sctp_v6_err_handle(struct sctp_transport *t, struct sk_buff *skb,
                               __u8 type, __u8 code, __u32 info)
{
        struct sctp_association *asoc = t->asoc;
        struct sock *sk = asoc->base.sk;
        int err = 0;

        if (type == ICMPV6_PKT_TOOBIG) {
                if (ip6_sk_accept_pmtu(sk))
                        sctp_icmp_frag_needed(sk, asoc, t, info);
                return;
        }

        if (type == NDISC_REDIRECT) {
                sctp_icmp_redirect(sk, t, skb);
                return;
        }

        if (type == ICMPV6_PARAMPROB && code == ICMPV6_UNK_NEXTHDR) {
                sctp_icmp_proto_unreachable(sk, asoc, t);
                return;
        }

        icmpv6_err_convert(type, code, &err);

        if (sock_owned_by_user(sk) || !inet6_test_bit(RECVERR6, sk)) {
                WRITE_ONCE(sk->sk_err_soft, err);
                return;
        }

        sk->sk_err = err;
        sk_error_report(sk);
}

/* ICMPv6 error handler for SCTP.
 *
 * @skb:    the ICMPv6 error packet
 * @opt:    unused here
 * @type:   ICMPv6 type
 * @code:   ICMPv6 code
 * @offset: offset used to set the transport header of the embedded packet
 * @info:   type-specific value in network byte order
 *
 * Returns 0 once the error has been handled, or -ENOENT (after bumping
 * ICMP6_MIB_INERRORS) when sctp_err_lookup() finds no socket.
 */
static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                       u8 type, u8 code, int offset, __be32 info)
{
        __u16 saved_nh = skb->network_header;
        __u16 saved_th = skb->transport_header;
        struct net *net = dev_net(skb->dev);
        struct sctp_transport *transport;
        struct sctp_association *asoc;
        struct sock *sk;

        /* Temporarily point the header offsets at the embedded packet so
         * the lookup sees its network and SCTP headers.
         */
        skb_reset_network_header(skb);
        skb_set_transport_header(skb, offset);
        sk = sctp_err_lookup(net, AF_INET6, skb, sctp_hdr(skb), &asoc, &transport);

        /* Restore the original offsets whatever the lookup returned. */
        skb->network_header = saved_nh;
        skb->transport_header = saved_th;

        if (sk) {
                sctp_v6_err_handle(transport, skb, type, code, ntohl(info));
                sctp_err_finish(sk, transport);
                return 0;
        }

        __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
        return -ENOENT;
}

/* ICMPv6 error handler for SCTP carried over UDP.
 *
 * @sk:  incoming value is not used; the variable is reassigned from
 *       sctp_err_lookup()
 * @skb: error packet; its transport header is at the UDP header on entry
 *
 * Returns -ENOENT when no SCTP socket matches (ICMP6_MIB_INERRORS is
 * bumped and the transport header is left advanced past the UDP header),
 * 0 for redirects, which are not handled here, and 1 when the error was
 * handled.
 */
int sctp_udp_v6_err(struct sock *sk, struct sk_buff *skb)
{
        struct net *net = dev_net(skb->dev);
        struct sctp_transport *transport;
        struct sctp_association *asoc;
        struct icmp6hdr *icmph;
        __u32 info = 0;

        /* Step over the UDP header so sctp_hdr() yields the SCTP header. */
        skb->transport_header += sizeof(struct udphdr);
        sk = sctp_err_lookup(net, AF_INET6, skb, sctp_hdr(skb), &asoc, &transport);
        if (!sk) {
                __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
                return -ENOENT;
        }
        skb->transport_header -= sizeof(struct udphdr);

        /* The ICMPv6 header is read from just in front of the network
         * header.
         */
        icmph = (struct icmp6hdr *)(skb_network_header(skb) - sizeof(struct icmp6hdr));

        switch (icmph->icmp6_type) {
        case NDISC_REDIRECT:
                /* can't be handled without outer ip6hdr known, leave it to udpv6_err */
                sctp_err_finish(sk, transport);
                return 0;
        case ICMPV6_PKT_TOOBIG:
                info = ntohl(icmph->icmp6_mtu);
                break;
        default:
                break;
        }

        sctp_v6_err_handle(transport, skb, icmph->icmp6_type, icmph->icmp6_code, info);
        sctp_err_finish(sk, transport);

        return 1;
}

static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t)
{
        struct dst_entry *dst = dst_clone(t->dst);
        struct flowi6 *fl6 = &t->fl.u.ip6;
        struct sock *sk = skb->sk;
        struct ipv6_pinfo *np = inet6_sk(sk);
        __u8 tclass = np->tclass;
        __be32 label;

        pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb,
                 skb->len, &fl6->saddr, &fl6->daddr);

        if (t->dscp & SCTP_DSCP_SET_MASK)
                tclass = t->dscp & SCTP_DSCP_VAL_MASK;

        if (INET_ECN_is_capable(tclass))
                IP6_ECN_flow_xmit(sk, fl6->flowlabel);

        if (!(t->param_flags & SPP_PMTUD_ENABLE))
                skb->ignore_df = 1;

        SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS);

        if (!t->encap_port || !sctp_sk(sk)->udp_port) {
                int res;

                skb_dst_set(skb, dst);
                rcu_read_lock();
                res = ip6_xmit(sk, skb, fl6, sk->sk_mark,
                               rcu_dereference(np->opt),
                               tclass, READ_ONCE(sk->sk_priority));
                rcu_read_unlock();
                return res;
        }

        if (skb_is_gso(skb))
                skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;

        skb->encapsulation = 1;
        skb_reset_inner_mac_header(skb);
        skb_reset_inner_transport_header(skb);
        skb_set_inner_ipproto(skb, IPPROTO_SCTP);
        label = ip6_make_flowlabel(sock_net(sk), skb, fl6->flowlabel, true, fl6);

        return udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr,
                                    &fl6->daddr, tclass, ip6_dst_hoplimit(dst),
                                    label, sctp_sk(sk)->udp_port, t->encap_port, false);
}

/* Find a route (dst cache entry) for transport @t and record it in t->dst.
 *
 * @t:     transport; its peer address (t->ipaddr) is the flow destination
 * @saddr: source address to use for the lookup, or NULL.  When it is NULL
 *         and @t has an association, the source picked by the lookup is
 *         checked against the association's bind address list.
 * @fl:    written with a copy of the local flow each time a dst is stored
 *         in t->dst
 * @sk:    socket; supplies the IPv6 options (np->opt), the SNDFLOW flag
 *         and the network namespace
 *
 * On exit t->dst holds a usable dst and t->dst_cookie has been set from
 * it, or t->dst is NULL when no route was found.
 */
static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
                            struct flowi *fl, struct sock *sk)
{
        struct sctp_association *asoc = t->asoc;
        struct dst_entry *dst = NULL;
        struct flowi _fl;
        struct flowi6 *fl6 = &_fl.u.ip6;
        struct sctp_bind_addr *bp;
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sctp_sockaddr_entry *laddr;
        union sctp_addr *daddr = &t->ipaddr;
        union sctp_addr dst_saddr;
        struct in6_addr *final_p, final;
        enum sctp_scope scope;
        __u8 matchlen = 0;

        /* Build the flow in a local copy; destination address and port come
         * from the peer address.
         */
        memset(&_fl, 0, sizeof(_fl));
        fl6->daddr = daddr->v6.sin6_addr;
        fl6->fl6_dport = daddr->v6.sin6_port;
        fl6->flowi6_proto = IPPROTO_SCTP;
        /* Link-local peers use the peer's scope id as output interface;
         * otherwise use the device the association's socket is bound to.
         */
        if (ipv6_addr_type(&daddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
                fl6->flowi6_oif = daddr->v6.sin6_scope_id;
        else if (asoc)
                fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if;
        if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK)
                fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK);

        /* With SNDFLOW set and a non-zero label, the label must be one that
         * fl6_sock_lookup() accepts for this socket; otherwise give up with
         * no route.  The lookup result is released again right away.
         */
        if (inet6_test_bit(SNDFLOW, sk) &&
            (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) {
                struct ip6_flowlabel *flowlabel;

                flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
                if (IS_ERR(flowlabel))
                        goto out;
                fl6_sock_release(flowlabel);
        }

        pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr);

        if (asoc)
                fl6->fl6_sport = htons(asoc->base.bind_addr.port);

        if (saddr) {
                fl6->saddr = saddr->v6.sin6_addr;
                /* Only take the port from @saddr if none was set above. */
                if (!fl6->fl6_sport)
                        fl6->fl6_sport = saddr->v6.sin6_port;

                pr_debug("src=%pI6 - ", &fl6->saddr);
        }

        rcu_read_lock();
        final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
        rcu_read_unlock();

        dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
        /* No association, or the caller fixed the source address: take the
         * lookup result as is.  An error pointer is turned into a NULL
         * t->dst at the out: label.
         */
        if (!asoc || saddr) {
                t->dst = dst;
                memcpy(fl, &_fl, sizeof(_fl));
                goto out;
        }

        bp = &asoc->base.bind_addr;
        scope = sctp_scope(daddr);
        /* ip6_dst_lookup has filled in the fl6->saddr for us.  Check
         * to see if we can use it.
         */
        if (!IS_ERR(dst)) {
                /* Walk through the bind address list and look for a bind
                 * address that matches the source address of the returned dst.
                 */
                sctp_v6_to_addr(&dst_saddr, &fl6->saddr, htons(bp->port));
                rcu_read_lock();
                list_for_each_entry_rcu(laddr, &bp->address_list, list) {
                        /* Skip invalid or deleted entries, and entries not in
                         * SCTP_ADDR_SRC state unless the association allows
                         * out-of-association sources.
                         */
                        if (!laddr->valid || laddr->state == SCTP_ADDR_DEL ||
                            (laddr->state != SCTP_ADDR_SRC &&
                             !asoc->src_out_of_asoc_ok))
                                continue;

                        /* Do not compare against v4 addrs */
                        if ((laddr->a.sa.sa_family == AF_INET6) &&
                            (sctp_v6_cmp_addr(&dst_saddr, &laddr->a))) {
                                rcu_read_unlock();
                                t->dst = dst;
                                memcpy(fl, &_fl, sizeof(_fl));
                                goto out;
                        }
                }
                rcu_read_unlock();
                /* None of the bound addresses match the source address of the
                 * dst. So release it.
                 */
                dst_release(dst);
                dst = NULL;
        }

        /* Walk through the bind address list and try to get the
         * best source address for a given destination.
         */
        rcu_read_lock();
        list_for_each_entry_rcu(laddr, &bp->address_list, list) {
                struct dst_entry *bdst;
                __u8 bmatchlen;

                /* Candidates are valid IPv6 entries in SCTP_ADDR_SRC state
                 * whose scope value is not below the destination's.
                 */
                if (!laddr->valid ||
                    laddr->state != SCTP_ADDR_SRC ||
                    laddr->a.sa.sa_family != AF_INET6 ||
                    scope > sctp_scope(&laddr->a))
                        continue;

                /* Redo the route lookup with this candidate as source. */
                fl6->saddr = laddr->a.v6.sin6_addr;
                fl6->fl6_sport = laddr->a.v6.sin6_port;
                final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
                bdst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);

                if (IS_ERR(bdst))
                        continue;

                /* NOTE(review): presumably true when the candidate address is
                 * configured on the route's output device - confirm against
                 * ipv6_chk_addr().  Such a route is taken immediately and
                 * replaces any earlier choice.
                 */
                if (ipv6_chk_addr(dev_net(bdst->dev),
                                  &laddr->a.v6.sin6_addr, bdst->dev, 1)) {
                        if (!IS_ERR_OR_NULL(dst))
                                dst_release(dst);
                        dst = bdst;
                        t->dst = dst;
                        memcpy(fl, &_fl, sizeof(_fl));
                        break;
                }

                /* Otherwise rank by sctp_v6_addr_match_len() against the
                 * destination: a candidate scoring below the current best is
                 * dropped, an equal or higher one replaces the current choice.
                 */
                bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
                if (matchlen > bmatchlen) {
                        dst_release(bdst);
                        continue;
                }

                if (!IS_ERR_OR_NULL(dst))
                        dst_release(dst);
                dst = bdst;
                matchlen = bmatchlen;
                t->dst = dst;
                memcpy(fl, &_fl, sizeof(_fl));
        }
        rcu_read_unlock();

out:
        /* dst may be NULL or an error pointer here; both mean "no route". */
        if (!IS_ERR_OR_NULL(dst)) {
                struct rt6_info *rt;

                rt = dst_rt6_info(dst);
                t->dst_cookie = rt6_get_cookie(rt);
                pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n",
                         &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
                         &fl->u.ip6.saddr);
        } else {
                t->dst = NULL;
                pr_debug("no route\n");
        }
}

/* Returns the number of consecutive initial bits that match in the two
 * IPv6 addresses, as computed by ipv6_addr_diff().
 */
static inline int sctp_v6_addr_match_len(union sctp_addr *s1,
                                         union sctp_addr *s2)
{
        int nbits = ipv6_addr_diff(&s1->v6.sin6_addr, &s2->v6.sin6_addr);

        return nbits;
}

/* Record the source address selected for transport @t.
 *
 * If the transport has a route (t->dst is set), the source address held in
 * the IPv6 flow @fl is copied into t->saddr and its family is set to
 * AF_INET6.  Without a route t->saddr is left untouched.  @sk is not used
 * by this implementation.
 */
static void sctp_v6_get_saddr(struct sctp_sock *sk,
                              struct sctp_transport *t,
                              struct flowi *fl)
{
        pr_debug("%s: asoc:%p dst:%p\n", __func__, t->asoc, t->dst);

        if (!t->dst)
                return;

        t->saddr.v6.sin6_family = AF_INET6;
        t->saddr.v6.sin6_addr = fl->u.ip6.saddr;
}

/* Append one sctp_sockaddr_entry to @addrlist for every IPv6 address
 * configured on @dev.
 *
 * Entries are allocated with GFP_ATOMIC because the walk runs under
 * rcu_read_lock() and the inet6_dev read lock.  An address whose entry
 * cannot be allocated is silently skipped.  Nothing is added when @dev has
 * no inet6_dev.
 */
static void sctp_v6_copy_addrlist(struct list_head *addrlist,
                                  struct net_device *dev)
{
        struct sctp_sockaddr_entry *entry;
        struct inet6_ifaddr *ifa;
        struct inet6_dev *idev;

        rcu_read_lock();
        idev = __in6_dev_get(dev);
        if (!idev)
                goto unlock;

        read_lock_bh(&idev->lock);
        list_for_each_entry(ifa, &idev->addr_list, if_list) {
                entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
                if (!entry)
                        continue;

                entry->a.v6.sin6_family = AF_INET6;
                entry->a.v6.sin6_addr = ifa->addr;
                entry->a.v6.sin6_scope_id = dev->ifindex;
                entry->valid = 1;
                INIT_LIST_HEAD(&entry->list);
                list_add_tail(&entry->list, addrlist);
        }
        read_unlock_bh(&idev->lock);

unlock:
        rcu_read_unlock();
}

/* Copy over any ip options */
static void sctp_v6_copy_ip_options(struct sock *sk, struct sock *newsk)
{
        struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
        struct ipv6_txoptions *opt;

        newnp = inet6_sk(newsk);

        rcu_read_lock();
        opt = rcu_dereference(np->opt);
        if (opt) {
                opt = ipv6_dup_options(newsk, opt);
                if (!opt)
                        pr_err("%s: Failed to copy ip options\n", __func__);
        }
        RCU_INIT_POINTER(newnp->opt, opt);
        rcu_read_unlock();
}

/* Account for the IP options */
static int sctp_v6_ip_options_len(struct sock *sk)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6_txoptions *opt;
        int len = 0;

        rcu_read_lock();
        opt = rcu_dereference(np->opt);
        if (opt)
                len = opt->opt_flen + opt->opt_nflen;

        rcu_read_unlock();
        return len;
}

/* Initialize a sockaddr_storage from in incoming skb. */
static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb,
                             int is_saddr)
{
        /* Always called on head skb, so this is safe */
        struct sctphdr *sh = sctp_hdr(skb);
        struct sockaddr_in6 *sa = &addr->v6;

        addr->v6.sin6_family = AF_INET6;
        addr->v6.sin6_flowinfo = 0; /* FIXME */
        addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif;

        if (is_saddr) {
                sa->sin6_port = sh->source;
                sa->sin6_addr = ipv6_hdr(skb)->saddr;
        } else {
                sa->sin6_port = sh->dest;
                sa->sin6_addr = ipv6_hdr(skb)->daddr;
        }
}

/* Initialize an sctp_addr from a socket. */
static void sctp_v6_from_sk(union sctp_addr *addr, struct sock *sk)
{
        addr->v6.sin6_family = AF_INET6;
        addr->v6.sin6_port = 0;
        addr->v6.sin6_addr = sk->sk_v6_rcv_saddr;
}

/* Initialize sk->sk_rcv_saddr from sctp_addr. */
static void sctp_v6_to_sk_saddr(union sctp_addr *addr, struct sock *sk)
{
        if (addr->sa.sa_family == AF_INET) {
                sk->sk_v6_rcv_saddr.s6_addr32[0] = 0;
                sk->sk_v6_rcv_saddr.s6_addr32[1] = 0;
                sk->sk_v6_rcv_saddr.s6_addr32[2] = htonl(0x0000ffff);
                sk->sk_v6_rcv_saddr.s6_addr32[3] =
                        addr->v4.sin_addr.s_addr;
        } else {
                sk->sk_v6_rcv_saddr = addr->v6.sin6_addr;
        }
}

/* Initialize sk->sk_daddr from sctp_addr. */
static void sctp_v6_to_sk_daddr(union sctp_addr *addr, struct sock *sk)
{
        if (addr->sa.sa_family == AF_INET) {
                sk->sk_v6_daddr.s6_addr32[0] = 0;
                sk->sk_v6_daddr.s6_addr32[1] = 0;
                sk->sk_v6_daddr.s6_addr32[2] = htonl(0x0000ffff);
                sk->sk_v6_daddr.s6_addr32[3] = addr->v4.sin_addr.s_addr;
        } else {
                sk->sk_v6_daddr = addr->v6.sin6_addr;
        }
}

/* Build @addr from an IPv6 address parameter.
 *
 * Returns false, leaving @addr untouched, when the parameter's declared
 * length is shorter than struct sctp_ipv6addr_param.  Otherwise fills in the
 * address with the supplied @port and uses @iif as the scope id, and
 * returns true.
 */
static bool sctp_v6_from_addr_param(union sctp_addr *addr,
                                    union sctp_addr_param *param,
                                    __be16 port, int iif)
{
        struct sockaddr_in6 *sin6 = &addr->v6;

        if (ntohs(param->v6.param_hdr.length) < sizeof(struct sctp_ipv6addr_param))
                return false;

        sin6->sin6_family = AF_INET6;
        sin6->sin6_port = port;
        sin6->sin6_flowinfo = 0; /* BUG */
        sin6->sin6_addr = param->v6.addr;
        sin6->sin6_scope_id = iif;

        return true;
}

/* Encode the IPv6 address in @addr as an SCTP_PARAM_IPV6_ADDRESS parameter
 * in @param.  Returns the parameter length in bytes, which is also the
 * value written (in network byte order) to the parameter header.
 */
static int sctp_v6_to_addr_param(const union sctp_addr *addr,
                                 union sctp_addr_param *param)
{
        const int param_len = sizeof(struct sctp_ipv6addr_param);

        param->v6.addr = addr->v6.sin6_addr;
        param->v6.param_hdr.length = htons(param_len);
        param->v6.param_hdr.type = SCTP_PARAM_IPV6_ADDRESS;

        return param_len;
}

/* Build an AF_INET6 sctp_addr from the raw address @saddr and @port, with
 * flowinfo and scope id cleared.
 */
static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr,
                              __be16 port)
{
        struct sockaddr_in6 *sin6 = &addr->v6;

        addr->sa.sa_family = AF_INET6;
        sin6->sin6_port = port;
        sin6->sin6_flowinfo = 0;
        sin6->sin6_addr = *saddr;
        sin6->sin6_scope_id = 0;
}

/* Compare two addresses, ignoring the port.  Returns 1 on match, 0 otherwise.
 *
 * Mixed families match only when one side is AF_INET and the other is a
 * v4-mapped AF_INET6 address carrying the same IPv4 address.  For equal
 * families the IPv6 addresses must be identical; for link-local addresses
 * the scope ids must also agree when both are non-zero.
 */
static int __sctp_v6_cmp_addr(const union sctp_addr *addr1,
                              const union sctp_addr *addr2)
{
        const union sctp_addr *v4, *v6;

        if (addr1->sa.sa_family != addr2->sa.sa_family) {
                if (addr1->sa.sa_family == AF_INET &&
                    addr2->sa.sa_family == AF_INET6) {
                        v4 = addr1;
                        v6 = addr2;
                } else if (addr2->sa.sa_family == AF_INET &&
                           addr1->sa.sa_family == AF_INET6) {
                        v4 = addr2;
                        v6 = addr1;
                } else {
                        return 0;
                }

                return ipv6_addr_v4mapped(&v6->v6.sin6_addr) &&
                       v6->v6.sin6_addr.s6_addr32[3] == v4->v4.sin_addr.s_addr;
        }

        if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr))
                return 0;

        /* Only link-local addresses are further qualified by scope id. */
        if (!(ipv6_addr_type(&addr1->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL))
                return 1;

        /* A zero scope id on either side acts as a wildcard. */
        if (!addr1->v6.sin6_scope_id || !addr2->v6.sin6_scope_id)
                return 1;

        return addr1->v6.sin6_scope_id == addr2->v6.sin6_scope_id;
}

/* Exact comparison: the addresses must match according to
 * __sctp_v6_cmp_addr() (which takes v4-mapped-v6 into account) and the
 * ports must be equal.
 */
static int sctp_v6_cmp_addr(const union sctp_addr *addr1,
                            const union sctp_addr *addr2)
{
        if (!__sctp_v6_cmp_addr(addr1, addr2))
                return 0;

        return addr1->v6.sin6_port == addr2->v6.sin6_port;
}

/* Reset @addr to the IPv6 wildcard address: everything zeroed except the
 * family (AF_INET6) and the given @port.
 */
static void sctp_v6_inaddr_any(union sctp_addr *addr, __be16 port)
{
        memset(addr, 0x00, sizeof(*addr));
        addr->v6.sin6_port = port;
        addr->v6.sin6_family = AF_INET6;
}

/* Non-zero when the IPv6 address in @addr is the unspecified (wildcard)
 * address, as decided by ipv6_addr_any().
 */
static int sctp_v6_is_any(const union sctp_addr *addr)
{
        const struct in6_addr *in6 = &addr->v6.sin6_addr;

        return ipv6_addr_any(in6);
}

/* Should this be available for binding?
 *
 * Returns 1 when @addr may be bound on socket @sp, 0 otherwise:
 *  - the wildcard address is always available;
 *  - a v4-mapped address is rejected on an IPv6-only socket, otherwise it is
 *    converted in place to AF_INET and the decision is delegated to the
 *    AF_INET ->available() handler;
 *  - any other non-unicast address is rejected;
 *  - a unicast address is available when non-local binds are permitted or
 *    the address is configured locally (restricted to the bound device when
 *    the socket has sk_bound_dev_if set).
 */
static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp)
{
        const struct in6_addr *in6 = (const struct in6_addr *)&addr->v6.sin6_addr;
        struct sock *sk = &sp->inet.sk;
        struct net *net = sock_net(sk);
        struct net_device *dev = NULL;
        int bound_dev_if;
        int type;
        int res;

        type = ipv6_addr_type(in6);
        if (IPV6_ADDR_ANY == type)
                return 1;
        if (type == IPV6_ADDR_MAPPED) {
                if (sp && ipv6_only_sock(sctp_opt2sk(sp)))
                        return 0;
                sctp_v6_map_v4(addr);
                return sctp_get_af_specific(AF_INET)->available(addr, sp);
        }
        if (!(type & IPV6_ADDR_UNICAST))
                return 0;

        /* dev_get_by_index_rcu() returns a device without taking a
         * reference, so the lookup and every later use of @dev must sit
         * inside one RCU read-side critical section.  RCU read locks nest,
         * so this is also safe if a caller already holds one.
         */
        rcu_read_lock();
        bound_dev_if = sk->sk_bound_dev_if;
        if (bound_dev_if) {
                dev = dev_get_by_index_rcu(net, bound_dev_if);
                if (!dev) {
                        res = 0;
                        goto out;
                }
        }

        res = ipv6_can_nonlocal_bind(net, &sp->inet) ||
              ipv6_chk_addr(net, in6, dev, 0);

out:
        rcu_read_unlock();
        return res;
}

/* Decide whether @addr is usable as an SCTP address.
 *
 * Returns 1 for a unicast IPv6 address and 0 for anything non-unicast.
 * A v4-mapped address is rejected when @sp is an IPv6-only socket;
 * otherwise it is converted in place to AF_INET and the verdict comes from
 * the AF_INET ->addr_valid() handler.
 */
static int sctp_v6_addr_valid(union sctp_addr *addr,
                              struct sctp_sock *sp,
                              const struct sk_buff *skb)
{
        const int type = ipv6_addr_type(&addr->v6.sin6_addr);

        /* Everything except v4-mapped is valid exactly when it is unicast. */
        if (type != IPV6_ADDR_MAPPED)
                return (type & IPV6_ADDR_UNICAST) ? 1 : 0;

        /* Note: This routine is used in input, so v4-mapped-v6
         * are disallowed here when there is no sctp_sock.
         */
        if (sp && ipv6_only_sock(sctp_opt2sk(sp)))
                return 0;

        sctp_v6_map_v4(addr);
        return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp, skb);
}

/* Map the IPv6 scope of @addr onto the generic SCTP scope levels.
 *
 * The IPv6 scope is really a set of bit fields (see IFA_* in
 * <net/if_inet6.h>).  Host, link and site scope map to loopback, link and
 * private respectively; every other value is treated as global.
 */
static enum sctp_scope sctp_v6_scope(union sctp_addr *addr)
{
        switch (ipv6_addr_scope(&addr->v6.sin6_addr)) {
        case IFA_HOST:
                return SCTP_SCOPE_LOOPBACK;
        case IFA_LINK:
                return SCTP_SCOPE_LINK;
        case IFA_SITE:
                return SCTP_SCOPE_PRIVATE;
        default:
                return SCTP_SCOPE_GLOBAL;
        }
}

/* Create and initialize a new sk for the socket to be returned by accept().
 *
 * @sk:   the socket the new one is derived from
 * @asoc: association whose primary peer address becomes the new socket's
 *        destination address
 * @kern: passed through to sk_alloc()
 *
 * Returns the new socket, or NULL if allocation or the protocol's ->init()
 * fails.
 */
static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
                                             struct sctp_association *asoc,
                                             bool kern)
{
        struct sock *newsk;
        struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
        struct sctp6_sock *newsctp6sk;

        newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, kern);
        if (!newsk)
                goto out;

        sock_init_data(NULL, newsk);

        sctp_copy_sock(newsk, sk, asoc);
        /* NOTE(review): this clears SOCK_ZAPPED on the parent @sk rather
         * than on @newsk — confirm that this is intended.
         */
        sock_reset_flag(sk, SOCK_ZAPPED);

        /* Point the inet layer at the ipv6_pinfo embedded in the new socket. */
        newsctp6sk = (struct sctp6_sock *)newsk;
        inet_sk(newsk)->pinet6 = &newsctp6sk->inet6;

        sctp_sk(newsk)->v4mapped = sctp_sk(sk)->v4mapped;

        newnp = inet6_sk(newsk);

        /* Start from a byte copy of the parent's IPv6 state, then drop the
         * copied multicast, anycast and flow-label list pointers
         * (presumably so the two sockets do not share those lists).
         */
        memcpy(newnp, np, sizeof(struct ipv6_pinfo));
        newnp->ipv6_mc_list = NULL;
        newnp->ipv6_ac_list = NULL;
        newnp->ipv6_fl_list = NULL;

        /* Replaces the option pointer copied by the memcpy() above. */
        sctp_v6_copy_ip_options(sk, newsk);

        /* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname()
         * and getpeername().
         */
        sctp_v6_to_sk_daddr(&asoc->peer.primary_addr, newsk);

        newsk->sk_v6_rcv_saddr = sk->sk_v6_rcv_saddr;

        if (newsk->sk_prot->init(newsk)) {
                sk_common_release(newsk);
                newsk = NULL;
        }

out:
        return newsk;
}

/* Convert @addr, in place, to the form user space expects and return the
 * size of the resulting sockaddr.
 *
 * With v4mapped enabled on @sp (SCTP_I_WANT_MAPPED_V4_ADDR) an AF_INET
 * address becomes v4-mapped AF_INET6; with it disabled a v4-mapped AF_INET6
 * address becomes plain AF_INET.  For an AF_INET result sin_zero is cleared.
 */
static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr)
{
        if (sp->v4mapped) {
                if (addr->sa.sa_family == AF_INET)
                        sctp_v4_map_v6(addr);
        } else if (addr->sa.sa_family == AF_INET6 &&
                   ipv6_addr_v4mapped(&addr->v6.sin6_addr)) {
                sctp_v6_map_v4(addr);
        }

        if (addr->sa.sa_family != AF_INET)
                return sizeof(struct sockaddr_in6);

        memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero));
        return sizeof(struct sockaddr_in);
}

/* Incoming interface index of @skb, as reported by inet6_iif(). */
static int sctp_v6_skb_iif(const struct sk_buff *skb)
{
        int ifindex = inet6_iif(skb);

        return ifindex;
}

/* Value of inet6_sdif() for @skb. */
static int sctp_v6_skb_sdif(const struct sk_buff *skb)
{
        int sdif = inet6_sdif(skb);

        return sdif;
}

/* Was this packet marked by Explicit Congestion Notification? */
static int sctp_v6_is_ce(const struct sk_buff *skb)
{
        return *((__u32 *)(ipv6_hdr(skb))) & (__force __u32)htonl(1 << 20);
}

/* Print the IPv6 address in @addr to @seq in %pI6 form, followed by a
 * single space.
 */
static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr)
{
        struct in6_addr *in6 = &addr->v6.sin6_addr;

        seq_printf(seq, "%pI6 ", in6);
}

static void sctp_v6_ecn_capable(struct sock *sk)
{
        inet6_sk(sk)->tclass |= INET_ECN_ECT_0;
}

/* Initialize a PF_INET msgname from a ulpevent. */
static void sctp_inet6_event_msgname(struct sctp_ulpevent *event,
                                     char *msgname, int *addrlen)
{
        union sctp_addr *addr;
        struct sctp_association *asoc;
        union sctp_addr *paddr;

        if (!msgname)
                return;

        addr = (union sctp_addr *)msgname;
        asoc = event->asoc;
        paddr = &asoc->peer.primary_addr;

        if (paddr->sa.sa_family == AF_INET) {
                addr->v4.sin_family = AF_INET;
                addr->v4.sin_port = htons(asoc->peer.port);
                addr->v4.sin_addr = paddr->v4.sin_addr;
        } else {
                addr->v6.sin6_family = AF_INET6;
                addr->v6.sin6_flowinfo = 0;
                if (ipv6_addr_type(&paddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
                        addr->v6.sin6_scope_id = paddr->v6.sin6_scope_id;
                else
                        addr->v6.sin6_scope_id = 0;
                addr->v6.sin6_port = htons(asoc->peer.port);
                addr->v6.sin6_addr = paddr->v6.sin6_addr;
        }

        *addrlen = sctp_v6_addr_to_user(sctp_sk(asoc->base.sk), addr);
}

/* Initialize a msg_name from an inbound skb. */
static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname,
                                   int *addr_len)
{
        union sctp_addr *addr;
        struct sctphdr *sh;

        if (!msgname)
                return;

        addr = (union sctp_addr *)msgname;
        sh = sctp_hdr(skb);

        if (ip_hdr(skb)->version == 4) {
                addr->v4.sin_family = AF_INET;
                addr->v4.sin_port = sh->source;
                addr->v4.sin_addr.s_addr = ip_hdr(skb)->saddr;
        } else {
                addr->v6.sin6_family = AF_INET6;
                addr->v6.sin6_flowinfo = 0;
                addr->v6.sin6_port = sh->source;
                addr->v6.sin6_addr = ipv6_hdr(skb)->saddr;
                if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
                        addr->v6.sin6_scope_id = sctp_v6_skb_iif(skb);
                else
                        addr->v6.sin6_scope_id = 0;
        }

        *addr_len = sctp_v6_addr_to_user(sctp_sk(skb->sk), addr);
}

/* Is address family @family usable on socket @sp?
 *
 * AF_INET6 always is.  AF_INET (handled as v4-mapped-v6) is usable unless
 * the socket is IPv6-only.  Every other family is rejected.
 */
static int sctp_inet6_af_supported(sa_family_t family, struct sctp_sock *sp)
{
        if (family == AF_INET6)
                return 1;

        if (family == AF_INET && !ipv6_only_sock(sctp_opt2sk(sp)))
                return 1;

        return 0;
}

/* Address matching with wildcards allowed.  This extra level of indirection
 * lets a PF_INET6 socket decide whether v4 addresses may match at all.
 *
 * Returns 0 when either family is unknown, or when the socket is IPv6-only
 * and the families differ.  Otherwise a wildcard on either side matches
 * anything; two AF_INET addresses are compared by IPv4 address; all other
 * combinations go through __sctp_v6_cmp_addr().  Ports are not compared.
 */
static int sctp_inet6_cmp_addr(const union sctp_addr *addr1,
                               const union sctp_addr *addr2,
                               struct sctp_sock *opt)
{
        struct sock *sk = sctp_opt2sk(opt);
        struct sctp_af *first_af = sctp_get_af_specific(addr1->sa.sa_family);
        struct sctp_af *second_af = sctp_get_af_specific(addr2->sa.sa_family);

        if (!first_af || !second_af)
                return 0;

        /* If the socket is IPv6 only, v4 addrs will not match */
        if (first_af != second_af && ipv6_only_sock(sk))
                return 0;

        /* A wildcard covers both AF_INET and AF_INET6. */
        if (sctp_is_any(sk, addr1))
                return 1;
        if (sctp_is_any(sk, addr2))
                return 1;

        if (addr1->sa.sa_family == AF_INET && addr2->sa.sa_family == AF_INET)
                return addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr;

        return __sctp_v6_cmp_addr(addr1, addr2);
}

/* Verify that the provided sockaddr looks bindable.  Common verification,
 * including the address family, has already been done by the caller.
 *
 * Non-AF_INET6 addresses are handed straight to their own family's
 * ->available().  A link-local IPv6 address must carry a scope id naming an
 * existing device, and must either be configured on that device or
 * non-local binds must be permitted.  The final answer comes from the
 * socket's own af ->available().
 */
static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr)
{
        struct net_device *dev;
        struct sctp_af *af;
        struct net *net;
        int usable;

        /* ASSERT: address family has already been verified. */
        if (addr->sa.sa_family != AF_INET6) {
                af = sctp_get_af_specific(addr->sa.sa_family);
                return af->available(addr, opt);
        }

        if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) {
                if (!addr->v6.sin6_scope_id)
                        return 0;

                net = sock_net(&opt->inet.sk);
                rcu_read_lock();
                dev = dev_get_by_index_rcu(net, addr->v6.sin6_scope_id);
                usable = dev &&
                         (ipv6_can_nonlocal_bind(net, &opt->inet) ||
                          ipv6_chk_addr(net, &addr->v6.sin6_addr, dev, 0));
                rcu_read_unlock();

                if (!usable)
                        return 0;
        }

        af = opt->pf->af;
        return af->available(addr, opt);
}

/* Verify that the provided sockaddr looks sendable.  Common verification,
 * including the address family, has already been done by the caller.
 *
 * For a non-AF_INET6 address the result is whether its family has SCTP af
 * operations.  A link-local IPv6 address must carry a scope id that names
 * an existing device.  Otherwise the result is whether the socket's pf has
 * an af attached.
 */
static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr)
{
        struct net_device *dev;

        /* ASSERT: address family has already been verified. */
        if (addr->sa.sa_family != AF_INET6)
                return sctp_get_af_specific(addr->sa.sa_family) != NULL;

        if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) {
                if (!addr->v6.sin6_scope_id)
                        return 0;

                /* Only the existence of the device matters here. */
                rcu_read_lock();
                dev = dev_get_by_index_rcu(sock_net(&opt->inet.sk),
                                           addr->v6.sin6_scope_id);
                rcu_read_unlock();
                if (!dev)
                        return 0;
        }

        return opt->pf->af != NULL;
}

/* Fill in Supported Address Type information for INIT and INIT-ACK chunks.
 *
 * IPv6 is always listed first.  IPv4 is added as a second entry unless @opt
 * is a socket marked IPv6-only; a NULL @opt gets both types.
 * Returns the number of entries written to @types (1 or 2).
 */
static int sctp_inet6_supported_addrs(const struct sctp_sock *opt,
                                      __be16 *types)
{
        types[0] = SCTP_PARAM_IPV6_ADDRESS;

        if (opt && ipv6_only_sock(sctp_opt2sk(opt)))
                return 1;

        types[1] = SCTP_PARAM_IPV4_ADDRESS;
        return 2;
}

/* getsockname()/getpeername() for SCTP over PF_INET6.
 *
 * Obtains the name with inet6_getname() and then rewrites it according to
 * the socket's SCTP_I_WANT_MAPPED_V4_ADDR setting.  Returns the negative
 * error from inet6_getname(), or the length of the converted address.
 */
static int sctp_getname(struct socket *sock, struct sockaddr *uaddr,
                        int peer)
{
        int len = inet6_getname(sock, uaddr, peer);

        if (len < 0)
                return len;

        return sctp_v6_addr_to_user(sctp_sk(sock->sk),
                                    (union sctp_addr *)uaddr);
}

static const struct proto_ops inet6_seqpacket_ops = {
        .family                   = PF_INET6,
        .owner                   = THIS_MODULE,
        .release           = inet6_release,
        .bind                   = inet6_bind,
        .connect           = sctp_inet_connect,
        .socketpair           = sock_no_socketpair,
        .accept                   = inet_accept,
        .getname           = sctp_getname,
        .poll                   = sctp_poll,
        .ioctl                   = inet6_ioctl,
        .gettstamp           = sock_gettstamp,
        .listen                   = sctp_inet_listen,
        .shutdown           = inet_shutdown,
        .setsockopt           = sock_common_setsockopt,
        .getsockopt           = sock_common_getsockopt,
        .sendmsg           = inet_sendmsg,
        .recvmsg           = inet_recvmsg,
        .mmap                   = sock_no_mmap,
#ifdef CONFIG_COMPAT
        .compat_ioctl           = inet6_compat_ioctl,
#endif
};

/* protosw entry for SOCK_SEQPACKET SCTP sockets over IPv6. */
static struct inet_protosw sctpv6_seqpacket_protosw = {
        .type     = SOCK_SEQPACKET,
        .protocol = IPPROTO_SCTP,
        .prot     = &sctpv6_prot,
        .ops      = &inet6_seqpacket_ops,
        .flags    = SCTP_PROTOSW_FLAG,
};
/* protosw entry for SOCK_STREAM SCTP sockets over IPv6; it uses the same
 * proto and proto_ops as the SOCK_SEQPACKET entry.
 */
static struct inet_protosw sctpv6_stream_protosw = {
        .type     = SOCK_STREAM,
        .protocol = IPPROTO_SCTP,
        .prot     = &sctpv6_prot,
        .ops      = &inet6_seqpacket_ops,
        .flags    = SCTP_PROTOSW_FLAG,
};

/* inet6 receive handler for SCTP: clears the encapsulation port in the
 * skb's SCTP control block, then passes the packet to sctp_rcv().  Returns
 * -1 when sctp_rcv() returns non-zero and 0 otherwise.
 */
static int sctp6_rcv(struct sk_buff *skb)
{
        SCTP_INPUT_CB(skb)->encap_port = 0;

        if (sctp_rcv(skb))
                return -1;

        return 0;
}

/* Receive and error handlers registered with the inet6 layer for
 * IPPROTO_SCTP.
 */
static const struct inet6_protocol sctpv6_protocol = {
        .handler     = sctp6_rcv,
        .err_handler = sctp_v6_err,
        .flags       = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
};

static struct sctp_af sctp_af_inet6 = {
        .sa_family           = AF_INET6,
        .sctp_xmit           = sctp_v6_xmit,
        .setsockopt           = ipv6_setsockopt,
        .getsockopt           = ipv6_getsockopt,
        .get_dst           = sctp_v6_get_dst,
        .get_saddr           = sctp_v6_get_saddr,
        .copy_addrlist           = sctp_v6_copy_addrlist,
        .from_skb           = sctp_v6_from_skb,
        .from_sk           = sctp_v6_from_sk,
        .from_addr_param   = sctp_v6_from_addr_param,
        .to_addr_param           = sctp_v6_to_addr_param,
        .cmp_addr           = sctp_v6_cmp_addr,
        .scope                   = sctp_v6_scope,
        .addr_valid           = sctp_v6_addr_valid,
        .inaddr_any           = sctp_v6_inaddr_any,
        .is_any                   = sctp_v6_is_any,
        .available           = sctp_v6_available,
        .skb_iif           = sctp_v6_skb_iif,
        .skb_sdif           = sctp_v6_skb_sdif,
        .is_ce                   = sctp_v6_is_ce,
        .seq_dump_addr           = sctp_v6_seq_dump_addr,
        .ecn_capable           = sctp_v6_ecn_capable,
        .net_header_len           = sizeof(struct ipv6hdr),
        .sockaddr_len           = sizeof(struct sockaddr_in6),
        .ip_options_len           = sctp_v6_ip_options_len,
};

/* PF_INET6 protocol-family operations handed to the SCTP core; .af links
 * them to the AF_INET6 address-family operations.
 */
static struct sctp_pf sctp_pf_inet6 = {
        .event_msgname    = sctp_inet6_event_msgname,
        .skb_msgname      = sctp_inet6_skb_msgname,
        .af_supported     = sctp_inet6_af_supported,
        .cmp_addr         = sctp_inet6_cmp_addr,
        .bind_verify      = sctp_inet6_bind_verify,
        .send_verify      = sctp_inet6_send_verify,
        .supported_addrs  = sctp_inet6_supported_addrs,
        .create_accept_sk = sctp_v6_create_accept_sk,
        .addr_to_user     = sctp_v6_addr_to_user,
        .to_sk_saddr      = sctp_v6_to_sk_saddr,
        .to_sk_daddr      = sctp_v6_to_sk_daddr,
        .copy_ip_options  = sctp_v6_copy_ip_options,
        .af               = &sctp_af_inet6,
};

/* Register SCTP's IPv6 operation tables with the SCTP core: first the
 * PF_INET6 protocol-family operations, then the AF_INET6 address-family
 * operations.  Any result of the two registration calls is ignored here.
 */
void sctp_v6_pf_init(void)
{
        /* Register the SCTP specific PF_INET6 functions. */
        sctp_register_pf(&sctp_pf_inet6, PF_INET6);

        /* Register the SCTP specific AF_INET6 functions. */
        sctp_register_af(&sctp_af_inet6);
}

/* Unlink the AF_INET6 operations from the list they are on (presumably the
 * one sctp_register_af() added them to — confirm).  Nothing here undoes the
 * sctp_register_pf() call made by sctp_v6_pf_init().
 */
void sctp_v6_pf_exit(void)
{
        list_del(&sctp_af_inet6.list);
}

/* Register the SCTPv6 proto and its two protosw entries with the socket
 * layer.
 *
 * Returns 0 on success or a negative errno.  On failure everything this
 * function registered has been unregistered again, so the caller does not
 * need to call sctp_v6_protosw_exit().
 */
int sctp_v6_protosw_init(void)
{
        int rc;

        rc = proto_register(&sctpv6_prot, 1);
        if (rc)
                return rc;

        /* Add SCTPv6 (UDP and TCP style) to the inetsw6 linked list. */
        rc = inet6_register_protosw(&sctpv6_seqpacket_protosw);
        if (rc)
                goto err_unregister_proto;

        rc = inet6_register_protosw(&sctpv6_stream_protosw);
        if (rc)
                goto err_unregister_seqpacket;

        return 0;

err_unregister_seqpacket:
        inet6_unregister_protosw(&sctpv6_seqpacket_protosw);
err_unregister_proto:
        proto_unregister(&sctpv6_prot);
        return rc;
}

/* Undo sctp_v6_protosw_init(): remove both SCTPv6 protosw entries, then
 * unregister the SCTPv6 proto.
 */
void sctp_v6_protosw_exit(void)
{
        inet6_unregister_protosw(&sctpv6_seqpacket_protosw);
        inet6_unregister_protosw(&sctpv6_stream_protosw);
        proto_unregister(&sctpv6_prot);
}


/* Register with inet6 layer.
 *
 * Installs the inet6 address notifier and the IPPROTO_SCTP protocol
 * handler.  Returns 0 on success or -EAGAIN when the protocol handler
 * cannot be added.  On failure the notifier is unregistered again, so
 * nothing is left registered and sctp_v6_del_protocol() need not be called.
 */
int sctp_v6_add_protocol(void)
{
        /* Register notifier for inet6 address additions/deletions. */
        register_inet6addr_notifier(&sctp_inet6addr_notifier);

        if (inet6_add_protocol(&sctpv6_protocol, IPPROTO_SCTP) < 0) {
                /* Do not leave the notifier behind on a failed add. */
                unregister_inet6addr_notifier(&sctp_inet6addr_notifier);
                return -EAGAIN;
        }

        return 0;
}

/* Unregister with inet6 layer: remove the IPPROTO_SCTP protocol handler
 * first, then the inet6 address notifier (the reverse of the order used by
 * sctp_v6_add_protocol()).
 */
void sctp_v6_del_protocol(void)
{
        inet6_del_protocol(&sctpv6_protocol, IPPROTO_SCTP);
        unregister_inet6addr_notifier(&sctp_inet6addr_notifier);
}






































   14 









   16 

   16 











   15 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_KERNEL_FPU_CONTEXT_H
#define __X86_KERNEL_FPU_CONTEXT_H

#include <asm/fpu/xstate.h>
#include <asm/trace/fpu.h>

/* Functions related to FPU context tracking */

/*
 * The in-register FPU state for an FPU context on a CPU is assumed to be
 * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
 * matches the FPU.
 *
 * If the FPU register state is valid, the kernel can skip restoring the
 * FPU state from memory.
 *
 * Any code that clobbers the FPU registers or updates the in-memory
 * FPU state for a task MUST let the rest of the kernel know that the
 * FPU registers are no longer valid for this task.
 *
 * Invalidate a resource you control: CPU if using the CPU for something else
 * (with preemption disabled), FPU for the current task, or a task that
 * is prevented from running by the current task.
 */
/*
 * Invalidate from the CPU side: clear this CPU's fpu_fpregs_owner_ctx, so
 * that fpregs_state_valid() fails for every FPU context on this CPU.
 */
static inline void __cpu_invalidate_fpregs_state(void)
{
        __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
}

/*
 * Invalidate from the FPU-context side: set @fpu's last_cpu to -1, so that
 * the last_cpu comparison in fpregs_state_valid() cannot match a real CPU
 * number.
 */
static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
{
        fpu->last_cpu = -1;
}

/*
 * Non-zero when the in-register FPU state can be used for @fpu on @cpu:
 * this CPU's fpu_fpregs_owner_ctx must be @fpu and @fpu must have last been
 * loaded on @cpu.
 */
static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
{
        if (fpu != this_cpu_read(fpu_fpregs_owner_ctx))
                return 0;

        return cpu == fpu->last_cpu;
}

/*
 * Clear this CPU's fpu_fpregs_owner_ctx and emit the
 * x86_fpu_regs_deactivated trace event for @fpu.
 */
static inline void fpregs_deactivate(struct fpu *fpu)
{
        __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
        trace_x86_fpu_regs_deactivated(fpu);
}

/*
 * Record @fpu as the owner of this CPU's FPU registers and emit the
 * x86_fpu_regs_activated trace event.  fpu->last_cpu is not updated here.
 */
static inline void fpregs_activate(struct fpu *fpu)
{
        __this_cpu_write(fpu_fpregs_owner_ctx, fpu);
        trace_x86_fpu_regs_activated(fpu);
}

/*
 * Internal helper for switch_fpu_return() and signal frame setup.
 *
 * Makes sure the current task's user FPU state is in the registers: if the
 * register contents are not already valid for this task on this CPU they
 * are reloaded from fpu->fpstate and ownership is recorded.  In either case
 * TIF_NEED_FPU_LOAD is cleared afterwards.
 *
 * Warns once and returns without doing anything when current has
 * PF_KTHREAD or PF_USER_WORKER set.
 */
static inline void fpregs_restore_userregs(void)
{
        struct fpu *fpu = &current->thread.fpu;
        int cpu = smp_processor_id();

        if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER)))
                return;

        if (!fpregs_state_valid(fpu, cpu)) {
                /*
                 * This restores _all_ xstate which has not been
                 * established yet.
                 *
                 * If PKRU is enabled, then the PKRU value is already
                 * correct because it was either set in switch_to() or in
                 * flush_thread(). So it is excluded because it might be
                 * not up to date in current->thread.fpu.xsave state.
                 *
                 * XFD state is handled in restore_fpregs_from_fpstate().
                 */
                restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE);

                /* Make the next fpregs_state_valid(fpu, cpu) succeed. */
                fpregs_activate(fpu);
                fpu->last_cpu = cpu;
        }
        clear_thread_flag(TIF_NEED_FPU_LOAD);
}

#endif




















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGE_REF_H
#define _LINUX_PAGE_REF_H

#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/tracepoint-defs.h>

DECLARE_TRACEPOINT(page_ref_set);
DECLARE_TRACEPOINT(page_ref_mod);
DECLARE_TRACEPOINT(page_ref_mod_and_test);
DECLARE_TRACEPOINT(page_ref_mod_and_return);
DECLARE_TRACEPOINT(page_ref_mod_unless);
DECLARE_TRACEPOINT(page_ref_freeze);
DECLARE_TRACEPOINT(page_ref_unfreeze);

#ifdef CONFIG_DEBUG_PAGE_REF

/*
 * Ideally we would want to use the trace_<tracepoint>_enabled() helper
 * functions. But due to include header file issues, that is not
 * feasible. Instead we have to open code the static key functions.
 *
 * See trace_##name##_enabled(void) in include/linux/tracepoint.h
 *
 * NOTE(review): the macro below simply forwards to tracepoint_enabled(),
 * so the remark about open-coding the static key may be out of date --
 * confirm against include/linux/tracepoint-defs.h.
 */
#define page_ref_tracepoint_active(t) tracepoint_enabled(t)

/*
 * Out-of-line hooks, defined outside this header.  Each inline helper
 * below calls the matching hook only after page_ref_tracepoint_active()
 * reports that the corresponding tracepoint is enabled.
 */
extern void __page_ref_set(struct page *page, int v);
extern void __page_ref_mod(struct page *page, int v);
extern void __page_ref_mod_and_test(struct page *page, int v, int ret);
extern void __page_ref_mod_and_return(struct page *page, int v, int ret);
extern void __page_ref_mod_unless(struct page *page, int v, int u);
extern void __page_ref_freeze(struct page *page, int v, int ret);
extern void __page_ref_unfreeze(struct page *page, int v);

#else

#define page_ref_tracepoint_active(t) false

/*
 * Without CONFIG_DEBUG_PAGE_REF the tracepoint test above is the constant
 * false and every hook is an empty inline function, so the callers below
 * still compile unchanged.
 */
static inline void __page_ref_set(struct page *page, int v) { }
static inline void __page_ref_mod(struct page *page, int v) { }
static inline void __page_ref_mod_and_test(struct page *page, int v, int ret) { }
static inline void __page_ref_mod_and_return(struct page *page, int v, int ret) { }
static inline void __page_ref_mod_unless(struct page *page, int v, int u) { }
static inline void __page_ref_freeze(struct page *page, int v, int ret) { }
static inline void __page_ref_unfreeze(struct page *page, int v) { }

#endif

/* Return the value currently held in @page's _refcount. */
static inline int page_ref_count(const struct page *page)
{
        int refs = atomic_read(&page->_refcount);

        return refs;
}

/**
 * folio_ref_count - The reference count on this folio.
 * @folio: The folio.
 *
 * folio_get() normally raises this count and folio_put() normally
 * lowers it.  Typical holders of a folio reference include:
 *
 * - Each reference from a page table
 * - The page cache
 * - Filesystem private data
 * - The LRU list
 * - Pipes
 * - Direct IO which references this page in the process address space
 *
 * Return: The number of references to this folio.
 */
static inline int folio_ref_count(const struct folio *folio)
{
        const struct page *head = &folio->page;

        return page_ref_count(head);
}

/* Reference count of the folio that page_folio() returns for @page. */
static inline int page_count(const struct page *page)
{
        int refs = folio_ref_count(page_folio(page));

        return refs;
}

/*
 * Store @v into @page's _refcount, then report the new value through
 * __page_ref_set() when the page_ref_set tracepoint is active.
 */
static inline void set_page_count(struct page *page, int v)
{
        atomic_set(&page->_refcount, v);

        if (!page_ref_tracepoint_active(page_ref_set))
                return;

        __page_ref_set(page, v);
}

/* Folio flavour of set_page_count(): acts on the folio's embedded page. */
static inline void folio_set_count(struct folio *folio, int v)
{
        struct page *head = &folio->page;

        set_page_count(head, v);
}

/*
 * Give the page its initial count of one before it is handed to the
 * page allocator for the first time (at boot or on memory hotplug).
 */
static inline void init_page_count(struct page *page)
{
        const int initial = 1;

        set_page_count(page, initial);
}

/*
 * Add @nr to @page's _refcount; the change is passed to __page_ref_mod()
 * when the page_ref_mod tracepoint is active.
 */
static inline void page_ref_add(struct page *page, int nr)
{
        atomic_add(nr, &page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;

        __page_ref_mod(page, nr);
}

/* Folio flavour of page_ref_add(): acts on the folio's embedded page. */
static inline void folio_ref_add(struct folio *folio, int nr)
{
        struct page *head = &folio->page;

        page_ref_add(head, nr);
}

/*
 * Subtract @nr from @page's _refcount; the tracepoint hook is given the
 * change as a negative delta.
 */
static inline void page_ref_sub(struct page *page, int nr)
{
        atomic_sub(nr, &page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;

        __page_ref_mod(page, -nr);
}

/* Folio flavour of page_ref_sub(): acts on the folio's embedded page. */
static inline void folio_ref_sub(struct folio *folio, int nr)
{
        struct page *head = &folio->page;

        page_ref_sub(head, nr);
}

/*
 * Subtract @nr from the folio's _refcount and return the resulting value.
 * The tracepoint hook receives the negative delta together with that result.
 */
static inline int folio_ref_sub_return(struct folio *folio, int nr)
{
        int remaining;

        remaining = atomic_sub_return(nr, &folio->_refcount);
        if (page_ref_tracepoint_active(page_ref_mod_and_return))
                __page_ref_mod_and_return(&folio->page, -nr, remaining);

        return remaining;
}

/* Raise @page's _refcount by one, reporting a +1 change when traced. */
static inline void page_ref_inc(struct page *page)
{
        atomic_inc(&page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;

        __page_ref_mod(page, 1);
}

/* Folio flavour of page_ref_inc(): acts on the folio's embedded page. */
static inline void folio_ref_inc(struct folio *folio)
{
        struct page *head = &folio->page;

        page_ref_inc(head);
}

/* Lower @page's _refcount by one, reporting a -1 change when traced. */
static inline void page_ref_dec(struct page *page)
{
        atomic_dec(&page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;

        __page_ref_mod(page, -1);
}

/* Folio flavour of page_ref_dec(): acts on the folio's embedded page. */
static inline void folio_ref_dec(struct folio *folio)
{
        struct page *head = &folio->page;

        page_ref_dec(head);
}

/*
 * Subtract @nr from @page's _refcount and return the result of
 * atomic_sub_and_test().  The tracepoint hook is handed the negative
 * delta and that result.
 */
static inline int page_ref_sub_and_test(struct page *page, int nr)
{
        int tested;

        tested = atomic_sub_and_test(nr, &page->_refcount);
        if (page_ref_tracepoint_active(page_ref_mod_and_test))
                __page_ref_mod_and_test(page, -nr, tested);

        return tested;
}

/* Folio flavour of page_ref_sub_and_test(). */
static inline int folio_ref_sub_and_test(struct folio *folio, int nr)
{
        struct page *head = &folio->page;

        return page_ref_sub_and_test(head, nr);
}

/*
 * Raise @page's _refcount by one and return the new value; the tracepoint
 * hook receives the +1 delta and that value.
 */
static inline int page_ref_inc_return(struct page *page)
{
        int updated;

        updated = atomic_inc_return(&page->_refcount);
        if (page_ref_tracepoint_active(page_ref_mod_and_return))
                __page_ref_mod_and_return(page, 1, updated);

        return updated;
}

/* Folio flavour of page_ref_inc_return(). */
static inline int folio_ref_inc_return(struct folio *folio)
{
        struct page *head = &folio->page;

        return page_ref_inc_return(head);
}

/*
 * Lower @page's _refcount by one and return the result of
 * atomic_dec_and_test().  The tracepoint hook is handed the -1 delta and
 * that result.
 */
static inline int page_ref_dec_and_test(struct page *page)
{
        int tested;

        tested = atomic_dec_and_test(&page->_refcount);
        if (page_ref_tracepoint_active(page_ref_mod_and_test))
                __page_ref_mod_and_test(page, -1, tested);

        return tested;
}

/* Folio flavour of page_ref_dec_and_test(). */
static inline int folio_ref_dec_and_test(struct folio *folio)
{
        struct page *head = &folio->page;

        return page_ref_dec_and_test(head);
}

/*
 * Lower @page's _refcount by one and return the new value; the tracepoint
 * hook receives the -1 delta and that value.
 */
static inline int page_ref_dec_return(struct page *page)
{
        int updated;

        updated = atomic_dec_return(&page->_refcount);
        if (page_ref_tracepoint_active(page_ref_mod_and_return))
                __page_ref_mod_and_return(page, -1, updated);

        return updated;
}

/* Folio flavour of page_ref_dec_return(). */
static inline int folio_ref_dec_return(struct folio *folio)
{
        struct page *head = &folio->page;

        return page_ref_dec_return(head);
}

/*
 * Add @nr to @page's _refcount via atomic_add_unless() with @u as the
 * excluded value, and return what that call reports.  The tracepoint hook
 * receives @nr and the reported result (not @u).
 */
static inline bool page_ref_add_unless(struct page *page, int nr, int u)
{
        bool added;

        added = atomic_add_unless(&page->_refcount, nr, u);
        if (page_ref_tracepoint_active(page_ref_mod_unless))
                __page_ref_mod_unless(page, nr, added);

        return added;
}

/* Folio flavour of page_ref_add_unless(). */
static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u)
{
        struct page *head = &folio->page;

        return page_ref_add_unless(head, nr, u);
}

/**
 * folio_try_get - Attempt to increase the refcount on a folio.
 * @folio: The folio.
 *
 * Use this to try to obtain a reference when you do not already hold
 * one.  The attempt can fail, for instance because the folio was freed
 * after you found a pointer to it, or because it is frozen for the
 * purposes of splitting or migration.
 *
 * Return: True if the reference count was successfully incremented.
 */
static inline bool folio_try_get(struct folio *folio)
{
        bool got = folio_ref_add_unless(folio, 1, 0);

        return got;
}

/*
 * Try to add @count references to @folio.
 *
 * With CONFIG_TINY_RCU the count is added unconditionally after the debug
 * assertions and the function always returns true.  Otherwise the add is
 * refused when the count is zero, and false is returned in that case.
 */
static inline bool folio_ref_try_add_rcu(struct folio *folio, int count)
{
#ifdef CONFIG_TINY_RCU
        /*
         * The caller guarantees the folio will not be freed from interrupt
         * context, so (on !SMP) we only need preemption to be disabled
         * and TINY_RCU does that for us.
         */
# ifdef CONFIG_PREEMPT_COUNT
        VM_BUG_ON(!in_atomic() && !irqs_disabled());
# endif
        VM_BUG_ON_FOLIO(folio_ref_count(folio) == 0, folio);
        folio_ref_add(folio, count);
        return true;
#else
        /* A refused add means the folio has been freed, or will be freed. */
        if (unlikely(!folio_ref_add_unless(folio, count, 0)))
                return false;
        return true;
#endif
}

/**
 * folio_try_get_rcu - Attempt to increase the refcount on a folio.
 * @folio: The folio.
 *
 * A variant of folio_try_get() optimised for non-SMP kernels.  It may be
 * used in place of folio_try_get() when you still hold the
 * rcu_read_lock() taken before looking up the page and you know the
 * page's refcount cannot drop to zero in interrupt context.
 *
 * Example users include get_user_pages_fast() (as pages are not unmapped
 * from interrupt context) and the page cache lookups (as pages are not
 * truncated from interrupt context).  We also know that pages are not
 * frozen in interrupt context for the purposes of splitting or migration.
 *
 * It is also usable while holding a lock that prevents pages being
 * frozen & removed; eg the i_pages lock for the page cache or the
 * mmap_lock or page table lock for page tables.  In that situation it
 * always succeeds and a plain folio_get() would have done, but having one
 * function callable from both locked and RCU-protected contexts is
 * sometimes more convenient.
 *
 * Return: True if the reference count was successfully incremented.
 */
static inline bool folio_try_get_rcu(struct folio *folio)
{
        bool got = folio_ref_try_add_rcu(folio, 1);

        return got;
}

/*
 * Atomically replace @page's _refcount with zero, but only if it currently
 * equals @count.  Returns non-zero when the exchange took place.  The
 * tracepoint hook receives @count and that outcome.
 */
static inline int page_ref_freeze(struct page *page, int count)
{
        int frozen;

        frozen = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
        if (page_ref_tracepoint_active(page_ref_freeze))
                __page_ref_freeze(page, count, frozen);

        return frozen;
}

/* Folio flavour of page_ref_freeze(). */
static inline int folio_ref_freeze(struct folio *folio, int count)
{
        struct page *head = &folio->page;

        return page_ref_freeze(head, count);
}

/*
 * Store @count into @page's _refcount with release ordering.  The debug
 * assertions require that the count is currently zero and that @count is
 * not zero.
 */
static inline void page_ref_unfreeze(struct page *page, int count)
{
        VM_BUG_ON_PAGE(page_count(page) != 0, page);
        VM_BUG_ON(count == 0);

        atomic_set_release(&page->_refcount, count);

        if (!page_ref_tracepoint_active(page_ref_unfreeze))
                return;

        __page_ref_unfreeze(page, count);
}

/* Folio flavour of page_ref_unfreeze(). */
static inline void folio_ref_unfreeze(struct folio *folio, int count)
{
        struct page *head = &folio->page;

        page_ref_unfreeze(head, count);
}
#endif





































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * sysfs.h - definitions for the device driver filesystem
 *
 * Copyright (c) 2001,2002 Patrick Mochel
 * Copyright (c) 2004 Silicon Graphics, Inc.
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
 *
 * Please see Documentation/filesystems/sysfs.rst for more information.
 */

#ifndef _SYSFS_H_
#define _SYSFS_H_

#include <linux/kernfs.h>
#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/kobject_ns.h>
#include <linux/stat.h>
#include <linux/atomic.h>

struct kobject;
struct module;
struct bin_attribute;
enum kobj_ns_type;

/*
 * Base description of a sysfs attribute: a name plus a mode.  The
 * lockdep-related members exist only with CONFIG_DEBUG_LOCK_ALLOC.
 */
struct attribute {
        /* Attribute name; the __ATTR*() initializers fill it via __stringify(). */
        const char                *name;
        /* Mode value; the __ATTR*() initializers store permission bits here. */
        umode_t                        mode;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* Set to true by __ATTR_IGNORE_LOCKDEP(). */
        bool                        ignore_lockdep:1;
        /* Pointed at a per-call-site static key by sysfs_attr_init(). */
        struct lock_class_key        *key;
        /* NOTE(review): embedded key; its user is not visible in this header. */
        struct lock_class_key        skey;
#endif
};

/**
 *        sysfs_attr_init - initialize a dynamically allocated sysfs attribute
 *        @attr: struct attribute to initialize
 *
 *        Initialize a dynamically allocated struct attribute so we can
 *        make lockdep happy.  This is a new requirement for attributes
 *        and initially this is only needed when lockdep is enabled.
 *        Lockdep gives a nice error when your attribute is added to
 *        sysfs if you don't have this.
 *
 *        With CONFIG_DEBUG_LOCK_ALLOC every expansion declares its own
 *        static struct lock_class_key and stores its address in
 *        @attr->key.  Without it the macro expands to an empty statement.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define sysfs_attr_init(attr)                                \
do {                                                        \
        static struct lock_class_key __key;                \
                                                        \
        (attr)->key = &__key;                                \
} while (0)
#else
#define sysfs_attr_init(attr) do {} while (0)
#endif

/**
 * struct attribute_group - data structure used to declare an attribute group.
 * @name:        Optional: Attribute group name
 *                If specified, the attribute group will be created in a
 *                new subdirectory with this name. Additionally when a
 *                group is named, @is_visible and @is_bin_visible may
 *                return SYSFS_GROUP_INVISIBLE to control visibility of
 *                the directory itself.
 * @is_visible:        Optional: Function to return permissions associated with an
 *                attribute of the group. Will be called repeatedly for
 *                each non-binary attribute in the group. Only read/write
 *                permissions as well as SYSFS_PREALLOC are accepted. Must
 *                return 0 if an attribute is not visible. The returned
 *                value will replace static permissions defined in struct
 *                attribute. Use SYSFS_GROUP_VISIBLE() when assigning this
 *                callback to specify separate _group_visible() and
 *                _attr_visible() handlers.
 * @is_bin_visible:
 *                Optional: Function to return permissions associated with a
 *                binary attribute of the group. Will be called repeatedly
 *                for each binary attribute in the group. Only read/write
 *                permissions as well as SYSFS_PREALLOC (and the
 *                visibility flags for named groups) are accepted. Must
 *                return 0 if a binary attribute is not visible. The
 *                returned value will replace static permissions defined
 *                in struct bin_attribute. If @is_visible is not set, use
 *                SYSFS_GROUP_VISIBLE() when assigning this callback to
 *                specify separate _group_visible() and _attr_visible()
 *                handlers.
 * @attrs:        Pointer to NULL terminated list of attributes.
 * @bin_attrs:        Pointer to NULL terminated list of binary attributes.
 *                Either attrs or bin_attrs or both must be provided.
 */
struct attribute_group {
        const char                *name;
        umode_t                        (*is_visible)(struct kobject *,
                                              struct attribute *, int);
        umode_t                        (*is_bin_visible)(struct kobject *,
                                                  struct bin_attribute *, int);
        struct attribute        **attrs;
        struct bin_attribute        **bin_attrs;
};

/*
 * Octal flag values that sit above the 07777 permission bits of a mode.
 * SYSFS_PREALLOC is OR-ed into .mode by __ATTR_PREALLOC();
 * SYSFS_GROUP_INVISIBLE is what the *_GROUP_VISIBLE() helpers return when
 * a named group's directory is to be hidden.
 */
#define SYSFS_PREALLOC                010000
#define SYSFS_GROUP_INVISIBLE        020000

/*
 * DEFINE_SYSFS_GROUP_VISIBLE(name):
 *        A helper macro to pair with the assignment of ".is_visible =
 *        SYSFS_GROUP_VISIBLE(name)", that arranges for the directory
 *        associated with a named attribute_group to optionally be hidden.
 *        This allows for static declaration of attribute_groups, and the
 *        simplification of attribute visibility lifetime that implies,
 *        without polluting sysfs with empty attribute directories.
 * Ex.
 *
 * static umode_t example_attr_visible(struct kobject *kobj,
 *                                   struct attribute *attr, int n)
 * {
 *       if (example_attr_condition)
 *               return 0;
 *       else if (ro_attr_condition)
 *               return 0444;
 *       return attr->mode;
 * }
 *
 * static bool example_group_visible(struct kobject *kobj)
 * {
 *       if (example_group_condition)
 *               return false;
 *       return true;
 * }
 *
 * DEFINE_SYSFS_GROUP_VISIBLE(example);
 *
 * static struct attribute_group example_group = {
 *       .name = "example",
 *       .is_visible = SYSFS_GROUP_VISIBLE(example),
 *       .attrs = &example_attrs,
 * };
 *
 * Note that it expects <name>_attr_visible and <name>_group_visible to
 * be defined. For cases where individual attributes do not need
 * separate visibility consideration, only entire group visibility at
 * once, see DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE().
 *
 * The generated function consults <name>_group_visible() only for the
 * attribute at index 0; every other call goes straight to
 * <name>_attr_visible().
 */
#define DEFINE_SYSFS_GROUP_VISIBLE(name)                             \
        static inline umode_t sysfs_group_visible_##name(            \
                struct kobject *kobj, struct attribute *attr, int n) \
        {                                                            \
                if (n == 0 && !name##_group_visible(kobj))           \
                        return SYSFS_GROUP_INVISIBLE;                \
                return name##_attr_visible(kobj, attr, n);           \
        }

/*
 * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name):
 *        A helper macro to pair with SYSFS_GROUP_VISIBLE() that, like
 *        DEFINE_SYSFS_GROUP_VISIBLE(), controls group visibility, but does
 *        not require the implementation of a per-attribute visibility
 *        callback.  When the group is visible the generated function
 *        returns the attribute's own ->mode.
 * Ex.
 *
 * static bool example_group_visible(struct kobject *kobj)
 * {
 *       if (example_group_condition)
 *               return false;
 *       return true;
 * }
 *
 * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(example);
 *
 * static struct attribute_group example_group = {
 *       .name = "example",
 *       .is_visible = SYSFS_GROUP_VISIBLE(example),
 *       .attrs = &example_attrs,
 * };
 */
#define DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name)                   \
        static inline umode_t sysfs_group_visible_##name(         \
                struct kobject *kobj, struct attribute *a, int n) \
        {                                                         \
                if (n == 0 && !name##_group_visible(kobj))        \
                        return SYSFS_GROUP_INVISIBLE;             \
                return a->mode;                                   \
        }

/*
 * Same as DEFINE_SYSFS_GROUP_VISIBLE, but for groups with only binary
 * attributes. If an attribute_group defines both text and binary
 * attributes, the group visibility is determined by the function
 * specified to is_visible(), not is_bin_visible().
 *
 * The generated function takes a struct bin_attribute and forwards to
 * <name>_attr_visible() with it.
 */
#define DEFINE_SYSFS_BIN_GROUP_VISIBLE(name)                             \
        static inline umode_t sysfs_group_visible_##name(                \
                struct kobject *kobj, struct bin_attribute *attr, int n) \
        {                                                                \
                if (n == 0 && !name##_group_visible(kobj))               \
                        return SYSFS_GROUP_INVISIBLE;                    \
                return name##_attr_visible(kobj, attr, n);               \
        }

/*
 * Binary-attribute counterpart of DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE():
 * only <name>_group_visible() is required, and it is consulted for the
 * attribute at index 0.  Otherwise the generated function returns a->mode
 * of the struct bin_attribute it was given.
 */
#define DEFINE_SIMPLE_SYSFS_BIN_GROUP_VISIBLE(name)                   \
        static inline umode_t sysfs_group_visible_##name(             \
                struct kobject *kobj, struct bin_attribute *a, int n) \
        {                                                             \
                if (n == 0 && !name##_group_visible(kobj))            \
                        return SYSFS_GROUP_INVISIBLE;                 \
                return a->mode;                                       \
        }

/* Name of the function generated by the DEFINE_*_GROUP_VISIBLE(fn) macros. */
#define SYSFS_GROUP_VISIBLE(fn) sysfs_group_visible_##fn

/*
 * Use these macros to make defining attributes easier.
 * See include/linux/device.h for examples.
 */

/*
 * Brace initializer for a structure that has .attr, .show and .store
 * members.  The name is stringified and the mode is passed through
 * VERIFY_OCTAL_PERMISSIONS().
 */
#define __ATTR(_name, _mode, _show, _store) {                                \
        .attr = {.name = __stringify(_name),                                \
                 .mode = VERIFY_OCTAL_PERMISSIONS(_mode) },                \
        .show        = _show,                                                \
        .store        = _store,                                                \
}

/* Like __ATTR(), with SYSFS_PREALLOC OR-ed into the stored mode. */
#define __ATTR_PREALLOC(_name, _mode, _show, _store) {                        \
        .attr = {.name = __stringify(_name),                                \
                 .mode = SYSFS_PREALLOC | VERIFY_OCTAL_PERMISSIONS(_mode) },\
        .show        = _show,                                                \
        .store        = _store,                                                \
}

/*
 * Shorthand initializers that derive the callback names from the
 * attribute name: <_name>_show and/or <_name>_store must exist at the
 * point of use.
 */

/* Mode 0444, .show only. */
#define __ATTR_RO(_name) {                                                \
        .attr        = { .name = __stringify(_name), .mode = 0444 },                \
        .show        = _name##_show,                                                \
}

/* Caller-chosen mode (checked by VERIFY_OCTAL_PERMISSIONS), .show only. */
#define __ATTR_RO_MODE(_name, _mode) {                                        \
        .attr        = { .name = __stringify(_name),                                \
                    .mode = VERIFY_OCTAL_PERMISSIONS(_mode) },                \
        .show        = _name##_show,                                                \
}

/* Caller-chosen mode (checked by VERIFY_OCTAL_PERMISSIONS), .show and .store. */
#define __ATTR_RW_MODE(_name, _mode) {                                        \
        .attr        = { .name = __stringify(_name),                                \
                    .mode = VERIFY_OCTAL_PERMISSIONS(_mode) },                \
        .show        = _name##_show,                                                \
        .store        = _name##_store,                                        \
}

/* Mode 0200, .store only. */
#define __ATTR_WO(_name) {                                                \
        .attr        = { .name = __stringify(_name), .mode = 0200 },                \
        .store        = _name##_store,                                        \
}

/* Mode 0644, .show and .store. */
#define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store)

/* Initializer whose .attr.name is NULL. */
#define __ATTR_NULL { .attr = { .name = NULL } }

/*
 * With CONFIG_DEBUG_LOCK_ALLOC this is an __ATTR()-style initializer that
 * also sets .attr.ignore_lockdep to true.  Note that this variant stores
 * _mode directly, without VERIFY_OCTAL_PERMISSIONS().  Without the config
 * option it is simply an alias for __ATTR.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) {        \
        .attr = {.name = __stringify(_name), .mode = _mode,        \
                        .ignore_lockdep = true },                \
        .show                = _show,                                \
        .store                = _store,                                \
}
#else
#define __ATTR_IGNORE_LOCKDEP        __ATTR
#endif

/*
 * Define <_name>_groups[], a NULL-terminated static array whose single
 * entry points at <_name>_group (which must already be defined).
 */
#define __ATTRIBUTE_GROUPS(_name)                                \
static const struct attribute_group *_name##_groups[] = {        \
        &_name##_group,                                                \
        NULL,                                                        \
}

/*
 * Define <_name>_group with .attrs = <_name>_attrs, followed by the
 * matching <_name>_groups[] array.
 */
#define ATTRIBUTE_GROUPS(_name)                                        \
static const struct attribute_group _name##_group = {                \
        .attrs = _name##_attrs,                                        \
};                                                                \
__ATTRIBUTE_GROUPS(_name)

/*
 * As ATTRIBUTE_GROUPS(), but <_name>_attrs is assigned to .bin_attrs
 * instead of .attrs.
 */
#define BIN_ATTRIBUTE_GROUPS(_name)                                \
static const struct attribute_group _name##_group = {                \
        .bin_attrs = _name##_attrs,                                \
};                                                                \
__ATTRIBUTE_GROUPS(_name)

struct file;
struct vm_area_struct;
struct address_space;

/*
 * Binary sysfs attribute: a struct attribute extended with a size, a
 * private pointer and file-operation style callbacks.  Each callback
 * receives the file, the owning kobject and this bin_attribute.
 */
struct bin_attribute {
        struct attribute        attr;
        /* Filled from the _size argument of the __BIN_ATTR*() initializers. */
        size_t                        size;
        /* NOTE(review): presumably owner-defined data; not used in this header. */
        void                        *private;
        struct address_space *(*f_mapping)(void);
        /* read/write: buffer, offset (loff_t) and byte count (size_t). */
        ssize_t (*read)(struct file *, struct kobject *, struct bin_attribute *,
                        char *, loff_t, size_t);
        ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *,
                         char *, loff_t, size_t);
        loff_t (*llseek)(struct file *, struct kobject *, struct bin_attribute *,
                         loff_t, int);
        int (*mmap)(struct file *, struct kobject *, struct bin_attribute *attr,
                    struct vm_area_struct *vma);
};

/**
 *        sysfs_bin_attr_init - initialize a dynamically allocated bin_attribute
 *        @bin_attr: struct bin_attribute to initialize
 *
 *        Initialize a dynamically allocated struct bin_attribute so we
 *        can make lockdep happy.  This is a new requirement for
 *        attributes and initially this is only needed when lockdep is
 *        enabled.  Lockdep gives a nice error when your attribute is
 *        added to sysfs if you don't have this.
 *
 *        Implemented by applying sysfs_attr_init() to the embedded
 *        ->attr member.
 */
#define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr)

/*
 * Macros that make static binary attributes easier to create.
 *
 * __BIN_ATTR() is the general struct bin_attribute initializer; note that
 * it stores _mode as given, without VERIFY_OCTAL_PERMISSIONS().
 */
#define __BIN_ATTR(_name, _mode, _read, _write, _size) {                \
        .attr = { .name = __stringify(_name), .mode = _mode },                \
        .read        = _read,                                                \
        .write        = _write,                                                \
        .size        = _size,                                                \
}

/* Mode 0444, .read = <_name>_read. */
#define __BIN_ATTR_RO(_name, _size) {                                        \
        .attr        = { .name = __stringify(_name), .mode = 0444 },                \
        .read        = _name##_read,                                                \
        .size        = _size,                                                \
}

/* Mode 0200, .write = <_name>_write. */
#define __BIN_ATTR_WO(_name, _size) {                                        \
        .attr        = { .name = __stringify(_name), .mode = 0200 },                \
        .write        = _name##_write,                                        \
        .size        = _size,                                                \
}

/* Mode 0644, with both <_name>_read and <_name>_write. */
#define __BIN_ATTR_RW(_name, _size)                                        \
        __BIN_ATTR(_name, 0644, _name##_read, _name##_write, _size)

/* Same initializer as __ATTR_NULL (NULL .attr.name). */
#define __BIN_ATTR_NULL __ATTR_NULL

/*
 * Each macro below defines a variable named bin_attr_<_name>, initialized
 * with the corresponding __BIN_ATTR*() initializer.  No storage class is
 * supplied by the macro, so the caller may prefix one (e.g. static).
 */
#define BIN_ATTR(_name, _mode, _read, _write, _size)                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR(_name, _mode, _read,        \
                                        _write, _size)

#define BIN_ATTR_RO(_name, _size)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_RO(_name, _size)

#define BIN_ATTR_WO(_name, _size)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size)

#define BIN_ATTR_RW(_name, _size)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size)


/*
 * Owner-only variants: mode 0400 (read via <_name>_read) and mode 0600
 * (read and write via <_name>_read / <_name>_write).  The BIN_ATTR_ADMIN_*
 * forms define the bin_attr_<_name> variable itself.
 */
#define __BIN_ATTR_ADMIN_RO(_name, _size) {                                        \
        .attr        = { .name = __stringify(_name), .mode = 0400 },                \
        .read        = _name##_read,                                                \
        .size        = _size,                                                \
}

#define __BIN_ATTR_ADMIN_RW(_name, _size)                                        \
        __BIN_ATTR(_name, 0600, _name##_read, _name##_write, _size)

#define BIN_ATTR_ADMIN_RO(_name, _size)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size)

#define BIN_ATTR_ADMIN_RW(_name, _size)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size)

/*
 * Read-only binary attribute whose .read callback is the shared
 * sysfs_bin_attr_simple_read() rather than a per-attribute function.
 * Neither .size nor .private is set here, so the user is expected to
 * fill them in as that helper requires.
 * NOTE(review): sysfs_bin_attr_simple_read() is declared outside the
 * visible part of this header -- confirm its expectations there.
 */
#define __BIN_ATTR_SIMPLE_RO(_name, _mode) {                                \
        .attr        = { .name = __stringify(_name), .mode = _mode },        \
        .read        = sysfs_bin_attr_simple_read,                                \
}

/* bin_attr_<_name> with mode 0444. */
#define BIN_ATTR_SIMPLE_RO(_name)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0444)

/* bin_attr_<_name> with mode 0400. */
#define BIN_ATTR_SIMPLE_ADMIN_RO(_name)                                        \
struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0400)

/*
 * Pair of callbacks for text attributes.  Both receive the kobject and
 * the attribute; show() gets a writable char buffer, store() gets a
 * const buffer and a size_t length.  Both return ssize_t.
 */
struct sysfs_ops {
        ssize_t        (*show)(struct kobject *, struct attribute *, char *);
        ssize_t        (*store)(struct kobject *, struct attribute *, const char *, size_t);
};

#ifdef CONFIG_SYSFS

/*
 * Directory and mount-point entry points, declared when CONFIG_SYSFS is
 * set and defined outside this header.  The __must_check ones return an
 * int that the caller is required to consume.
 */
int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns);
void sysfs_remove_dir(struct kobject *kobj);
int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
                                     const void *new_ns);
int __must_check sysfs_move_dir_ns(struct kobject *kobj,
                                   struct kobject *new_parent_kobj,
                                   const void *new_ns);
int __must_check sysfs_create_mount_point(struct kobject *parent_kobj,
                                          const char *name);
void sysfs_remove_mount_point(struct kobject *parent_kobj,
                              const char *name);

/*
 * Entry points for individual (text) attribute files of a kobject.  The
 * plural forms take a pointer to an array of attribute pointers.
 * NOTE(review): array termination is not visible here -- presumably
 * NULL-terminated like attribute_group.attrs; confirm in the definitions.
 */
int __must_check sysfs_create_file_ns(struct kobject *kobj,
                                      const struct attribute *attr,
                                      const void *ns);
int __must_check sysfs_create_files(struct kobject *kobj,
                                   const struct attribute * const *attr);
int __must_check sysfs_chmod_file(struct kobject *kobj,
                                  const struct attribute *attr, umode_t mode);
struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj,
                                                  const struct attribute *attr);
void sysfs_unbreak_active_protection(struct kernfs_node *kn);
void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
                          const void *ns);
bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr);
void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr);

/* Binary attribute files (CONFIG_SYSFS build). */
int __must_check sysfs_create_bin_file(struct kobject *kobj,
                                       const struct bin_attribute *attr);
void sysfs_remove_bin_file(struct kobject *kobj,
                           const struct bin_attribute *attr);

/* Symbolic links from @kobj to @target (CONFIG_SYSFS build). */
int __must_check sysfs_create_link(struct kobject *kobj, struct kobject *target,
                                   const char *name);
int __must_check sysfs_create_link_nowarn(struct kobject *kobj,
                                          struct kobject *target,
                                          const char *name);
void sysfs_remove_link(struct kobject *kobj, const char *name);

/* Unlike the creators above, this one is not marked __must_check. */
int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *target,
                         const char *old_name, const char *new_name,
                         const void *new_ns);

void sysfs_delete_link(struct kobject *dir, struct kobject *targ,
                        const char *name);

/*
 * Attribute-group helpers (CONFIG_SYSFS build).  The *_groups() variants
 * take an array of group pointers, the *_group() variants a single group.
 */
int __must_check sysfs_create_group(struct kobject *kobj,
                                    const struct attribute_group *grp);
int __must_check sysfs_create_groups(struct kobject *kobj,
                                     const struct attribute_group **groups);
int __must_check sysfs_update_groups(struct kobject *kobj,
                                     const struct attribute_group **groups);
int sysfs_update_group(struct kobject *kobj,
                       const struct attribute_group *grp);
void sysfs_remove_group(struct kobject *kobj,
                        const struct attribute_group *grp);
void sysfs_remove_groups(struct kobject *kobj,
                         const struct attribute_group **groups);
/* Add/remove a single attribute or link inside a group selected by name. */
int sysfs_add_file_to_group(struct kobject *kobj,
                        const struct attribute *attr, const char *group);
void sysfs_remove_file_from_group(struct kobject *kobj,
                        const struct attribute *attr, const char *group);
int sysfs_merge_group(struct kobject *kobj,
                       const struct attribute_group *grp);
void sysfs_unmerge_group(struct kobject *kobj,
                       const struct attribute_group *grp);
int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
                            struct kobject *target, const char *link_name);
void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
                                  const char *link_name);
int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
                                         struct kobject *target_kobj,
                                         const char *target_name,
                                         const char *symlink_name);

/* Notification on attribute @attr, optionally below sub-directory @dir. */
void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr);

/* Subsystem initialisation; the result must be checked by the caller. */
int __must_check sysfs_init(void);

/*
 * Forward @kn to kernfs_enable_ns().
 *
 * @kn: kernfs node handed through unchanged.
 */
static inline void sysfs_enable_ns(struct kernfs_node *kn)
{
        /*
         * Plain call rather than "return kernfs_enable_ns(kn);": ISO C does
         * not permit a return statement with an expression in a function
         * whose return type is void.
         */
        kernfs_enable_ns(kn);
}

/*
 * Ownership helpers (CONFIG_SYSFS build): each takes the new kuid/kgid pair
 * and returns an int status.
 */
int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid,
                            kgid_t kgid);
int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid);
int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ,
                            const char *name, kuid_t kuid, kgid_t kgid);
int sysfs_groups_change_owner(struct kobject *kobj,
                              const struct attribute_group **groups,
                              kuid_t kuid, kgid_t kgid);
int sysfs_group_change_owner(struct kobject *kobj,
                             const struct attribute_group *groups, kuid_t kuid,
                             kgid_t kgid);
/*
 * printf-style formatters into @buf; __printf() lets the compiler check the
 * format string against the variadic arguments.
 */
__printf(2, 3)
int sysfs_emit(char *buf, const char *fmt, ...);
__printf(3, 4)
int sysfs_emit_at(char *buf, int at, const char *fmt, ...);

/* Read callback with the signature used for binary attributes. */
ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj,
                                   struct bin_attribute *attr, char *buf,
                                   loff_t off, size_t count);

#else /* CONFIG_SYSFS */

/*
 * !CONFIG_SYSFS stubs for the directory and mount-point helpers: each one
 * does nothing, and those returning int report 0.
 */
static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
{
        return 0;
}

static inline void sysfs_remove_dir(struct kobject *kobj)
{
}

static inline int sysfs_rename_dir_ns(struct kobject *kobj,
                                      const char *new_name, const void *new_ns)
{
        return 0;
}

static inline int sysfs_move_dir_ns(struct kobject *kobj,
                                    struct kobject *new_parent_kobj,
                                    const void *new_ns)
{
        return 0;
}

static inline int sysfs_create_mount_point(struct kobject *parent_kobj,
                                           const char *name)
{
        return 0;
}

static inline void sysfs_remove_mount_point(struct kobject *parent_kobj,
                                            const char *name)
{
}

/*
 * !CONFIG_SYSFS stubs for the attribute-file helpers.  Creation and chmod
 * report 0, removal is a no-op.
 */
static inline int sysfs_create_file_ns(struct kobject *kobj,
                                       const struct attribute *attr,
                                       const void *ns)
{
        return 0;
}

static inline int sysfs_create_files(struct kobject *kobj,
                                    const struct attribute * const *attr)
{
        return 0;
}

static inline int sysfs_chmod_file(struct kobject *kobj,
                                   const struct attribute *attr, umode_t mode)
{
        return 0;
}

/* Without sysfs there is no kernfs node to hand back, so return NULL. */
static inline struct kernfs_node *
sysfs_break_active_protection(struct kobject *kobj,
                              const struct attribute *attr)
{
        return NULL;
}

static inline void sysfs_unbreak_active_protection(struct kernfs_node *kn)
{
}

static inline void sysfs_remove_file_ns(struct kobject *kobj,
                                        const struct attribute *attr,
                                        const void *ns)
{
}

/* Always reports false in the !CONFIG_SYSFS build. */
static inline bool sysfs_remove_file_self(struct kobject *kobj,
                                          const struct attribute *attr)
{
        return false;
}

static inline void sysfs_remove_files(struct kobject *kobj,
                                     const struct attribute * const *attr)
{
}

/*
 * !CONFIG_SYSFS stubs for binary attributes and symbolic links: creators
 * and the rename helper report 0, removers are empty.
 */
static inline int sysfs_create_bin_file(struct kobject *kobj,
                                        const struct bin_attribute *attr)
{
        return 0;
}

static inline void sysfs_remove_bin_file(struct kobject *kobj,
                                         const struct bin_attribute *attr)
{
}

static inline int sysfs_create_link(struct kobject *kobj,
                                    struct kobject *target, const char *name)
{
        return 0;
}

static inline int sysfs_create_link_nowarn(struct kobject *kobj,
                                           struct kobject *target,
                                           const char *name)
{
        return 0;
}

static inline void sysfs_remove_link(struct kobject *kobj, const char *name)
{
}

static inline int sysfs_rename_link_ns(struct kobject *k, struct kobject *t,
                                       const char *old_name,
                                       const char *new_name, const void *ns)
{
        return 0;
}

static inline void sysfs_delete_link(struct kobject *k, struct kobject *t,
                                     const char *name)
{
}

/*
 * !CONFIG_SYSFS stubs for the attribute-group helpers: every int-returning
 * helper reports 0 and every void helper has an empty body.
 */
static inline int sysfs_create_group(struct kobject *kobj,
                                     const struct attribute_group *grp)
{
        return 0;
}

static inline int sysfs_create_groups(struct kobject *kobj,
                                      const struct attribute_group **groups)
{
        return 0;
}

static inline int sysfs_update_groups(struct kobject *kobj,
                                      const struct attribute_group **groups)
{
        return 0;
}

static inline int sysfs_update_group(struct kobject *kobj,
                                const struct attribute_group *grp)
{
        return 0;
}

static inline void sysfs_remove_group(struct kobject *kobj,
                                      const struct attribute_group *grp)
{
}

static inline void sysfs_remove_groups(struct kobject *kobj,
                                       const struct attribute_group **groups)
{
}

static inline int sysfs_add_file_to_group(struct kobject *kobj,
                const struct attribute *attr, const char *group)
{
        return 0;
}

static inline void sysfs_remove_file_from_group(struct kobject *kobj,
                const struct attribute *attr, const char *group)
{
}

static inline int sysfs_merge_group(struct kobject *kobj,
                       const struct attribute_group *grp)
{
        return 0;
}

static inline void sysfs_unmerge_group(struct kobject *kobj,
                       const struct attribute_group *grp)
{
}

static inline int sysfs_add_link_to_group(struct kobject *kobj,
                const char *group_name, struct kobject *target,
                const char *link_name)
{
        return 0;
}

static inline void sysfs_remove_link_from_group(struct kobject *kobj,
                const char *group_name, const char *link_name)
{
}

static inline int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
                                                       struct kobject *target_kobj,
                                                       const char *target_name,
                                                       const char *symlink_name)
{
        return 0;
}

/* !CONFIG_SYSFS: notification is a no-op. */
static inline void sysfs_notify(struct kobject *kobj, const char *dir,
                                const char *attr)
{
}

/* !CONFIG_SYSFS: nothing to initialise, always reports 0. */
static inline int __must_check sysfs_init(void)
{
        return 0;
}

/* !CONFIG_SYSFS: empty counterpart of the kernfs-backed inline version. */
static inline void sysfs_enable_ns(struct kernfs_node *kn)
{
}

/*
 * !CONFIG_SYSFS stubs for the ownership helpers: the kuid/kgid arguments
 * are ignored and 0 is returned.
 */
static inline int sysfs_file_change_owner(struct kobject *kobj,
                                          const char *name, kuid_t kuid,
                                          kgid_t kgid)
{
        return 0;
}

static inline int sysfs_link_change_owner(struct kobject *kobj,
                                          struct kobject *targ,
                                          const char *name, kuid_t kuid,
                                          kgid_t kgid)
{
        return 0;
}

static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid)
{
        return 0;
}

static inline int sysfs_groups_change_owner(struct kobject *kobj,
                          const struct attribute_group **groups,
                          kuid_t kuid, kgid_t kgid)
{
        return 0;
}

static inline int sysfs_group_change_owner(struct kobject *kobj,
                                           const struct attribute_group *groups,
                                           kuid_t kuid, kgid_t kgid)
{
        return 0;
}

/*
 * !CONFIG_SYSFS stubs for the formatters and the simple binary read
 * callback.  Nothing is written to @buf and 0 is returned; the __printf()
 * annotations are kept so format strings are still checked.
 */
__printf(2, 3)
static inline int sysfs_emit(char *buf, const char *fmt, ...)
{
        return 0;
}

__printf(3, 4)
static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...)
{
        return 0;
}

static inline ssize_t sysfs_bin_attr_simple_read(struct file *file,
                                                 struct kobject *kobj,
                                                 struct bin_attribute *attr,
                                                 char *buf, loff_t off,
                                                 size_t count)
{
        return 0;
}
#endif /* CONFIG_SYSFS */

/*
 * Convenience wrappers around the *_ns() helpers that pass NULL as the
 * namespace argument.  They are defined for both CONFIG_SYSFS settings
 * because they sit after the #endif.
 */
static inline int __must_check sysfs_create_file(struct kobject *kobj,
                                                 const struct attribute *attr)
{
        return sysfs_create_file_ns(kobj, attr, NULL);
}

static inline void sysfs_remove_file(struct kobject *kobj,
                                     const struct attribute *attr)
{
        sysfs_remove_file_ns(kobj, attr, NULL);
}

static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target,
                                    const char *old_name, const char *new_name)
{
        return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL);
}

/*
 * sysfs-named aliases for kernfs node operations; each one forwards its
 * arguments unchanged to the kernfs function called in its body.
 */
static inline void sysfs_notify_dirent(struct kernfs_node *kn)
{
        kernfs_notify(kn);
}

/* Returns whatever kernfs_find_and_get() returns for @name under @parent. */
static inline struct kernfs_node *sysfs_get_dirent(struct kernfs_node *parent,
                                                   const char *name)
{
        return kernfs_find_and_get(parent, name);
}

/* Calls kernfs_get() on @kn and returns @kn so the call can be chained. */
static inline struct kernfs_node *sysfs_get(struct kernfs_node *kn)
{
        kernfs_get(kn);
        return kn;
}

static inline void sysfs_put(struct kernfs_node *kn)
{
        kernfs_put(kn);
}

#endif /* _SYSFS_H_ */





































    1 
    1 







































































    1 




    1 
    1 









    1 
















































































    1 




    1 
    1 











    1 



















































































































































































    1 





    1 
    1 
    1 
    1 


    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
// SPDX-License-Identifier: GPL-2.0-only
/*
 * File: pn_dev.c
 *
 * Phonet network device
 *
 * Copyright (C) 2008 Nokia Corporation.
 *
 * Authors: Sakari Ailus <sakari.ailus@nokia.com>
 *          Rémi Denis-Courmont
 */

#include <linux/kernel.h>
#include <linux/net.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/phonet.h>
#include <linux/proc_fs.h>
#include <linux/if_arp.h>
#include <net/sock.h>
#include <net/netns/generic.h>
#include <net/phonet/pn_dev.h>

/* Per-namespace Phonet routing table. */
struct phonet_routes {
        /* Held around every update of table[] in this file. */
        struct mutex                lock;
        /* Output device per destination, indexed by (address >> 2). */
        struct net_device __rcu        *table[64];
};

/*
 * Per-network-namespace Phonet state; its size is what phonet_net_ops
 * registers as the pernet data size.
 */
struct phonet_net {
        struct phonet_device_list pndevs;        /* devices with Phonet addresses */
        struct phonet_routes routes;                /* routing table */
};

/* Pernet slot identifier; phonet_net_ops.id points at it. */
static unsigned int phonet_net_id __read_mostly;

/* Return the struct phonet_net stored for @net under phonet_net_id. */
static struct phonet_net *phonet_pernet(struct net *net)
{
        return net_generic(net, phonet_net_id);
}

struct phonet_device_list *phonet_device_list(struct net *net)
{
        struct phonet_net *pnn = phonet_pernet(net);
        return &pnn->pndevs;
}

/*
 * Allocate a Phonet device entry for @dev with an empty address bitmap and
 * link it into the device list of @dev's namespace.
 *
 * The list mutex must already be held (enforced with BUG_ON).  Returns the
 * new entry, or NULL if the GFP_ATOMIC allocation fails.
 */
static struct phonet_device *__phonet_device_alloc(struct net_device *dev)
{
        struct phonet_device_list *devlist;
        struct phonet_device *entry;

        devlist = phonet_device_list(dev_net(dev));

        entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
        if (!entry)
                return NULL;

        bitmap_zero(entry->addrs, 64);
        entry->netdev = dev;

        BUG_ON(!mutex_is_locked(&devlist->lock));
        list_add_rcu(&entry->list, &devlist->list);

        return entry;
}

static struct phonet_device *__phonet_get(struct net_device *dev)
{
        struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev));
        struct phonet_device *pnd;

        BUG_ON(!mutex_is_locked(&pndevs->lock));
        list_for_each_entry(pnd, &pndevs->list, list) {
                if (pnd->netdev == dev)
                        return pnd;
        }
        return NULL;
}

static struct phonet_device *__phonet_get_rcu(struct net_device *dev)
{
        struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev));
        struct phonet_device *pnd;

        list_for_each_entry_rcu(pnd, &pndevs->list, list) {
                if (pnd->netdev == dev)
                        return pnd;
        }
        return NULL;
}

static void phonet_device_destroy(struct net_device *dev)
{
        struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev));
        struct phonet_device *pnd;

        ASSERT_RTNL();

        mutex_lock(&pndevs->lock);
        pnd = __phonet_get(dev);
        if (pnd)
                list_del_rcu(&pnd->list);
        mutex_unlock(&pndevs->lock);

        if (pnd) {
                u8 addr;

                for_each_set_bit(addr, pnd->addrs, 64)
                        phonet_address_notify(RTM_DELADDR, dev, addr);
                kfree(pnd);
        }
}

/*
 * Pick the first device on @net's Phonet list that is registered and has
 * IFF_UP set.
 *
 * dev_hold() is called on the result before it is returned.  Returns NULL
 * when no device qualifies.
 */
struct net_device *phonet_device_get(struct net *net)
{
        struct phonet_device_list *devlist = phonet_device_list(net);
        struct net_device *found = NULL;
        struct phonet_device *cur;

        rcu_read_lock();
        list_for_each_entry_rcu(cur, &devlist->list, list) {
                struct net_device *cand = cur->netdev;

                BUG_ON(!cand);

                if (cand->reg_state != NETREG_REGISTERED)
                        continue;
                if ((cand->flags & IFF_UP) != IFF_UP)
                        continue;

                found = cand;
                break;
        }
        dev_hold(found);
        rcu_read_unlock();

        return found;
}

/*
 * Mark Phonet address @addr as configured on @dev.
 *
 * The per-device entry is looked up, or created on first use, under the
 * list mutex.  Addresses are tracked in a bitmap indexed by (@addr >> 2).
 *
 * Returns 0 on success, -ENOMEM if the entry cannot be allocated, or
 * -EEXIST if the address bit was already set.
 */
int phonet_address_add(struct net_device *dev, u8 addr)
{
        struct phonet_device_list *devlist = phonet_device_list(dev_net(dev));
        struct phonet_device *entry;
        int ret;

        mutex_lock(&devlist->lock);

        entry = __phonet_get(dev);
        if (!entry)
                entry = __phonet_device_alloc(dev);

        if (unlikely(!entry)) {
                ret = -ENOMEM;
                goto unlock;
        }

        ret = test_and_set_bit(addr >> 2, entry->addrs) ? -EEXIST : 0;
unlock:
        mutex_unlock(&devlist->lock);
        return ret;
}

/*
 * Clear Phonet address @addr on @dev.
 *
 * When the last address of the device goes away, its entry is unlinked from
 * the list and freed with kfree_rcu() after the mutex is dropped.
 *
 * Returns 0 on success, or -EADDRNOTAVAIL if @dev has no entry or the
 * address bit was not set.
 */
int phonet_address_del(struct net_device *dev, u8 addr)
{
        struct phonet_device_list *devlist = phonet_device_list(dev_net(dev));
        struct phonet_device *stale = NULL;
        struct phonet_device *entry;
        int ret = -EADDRNOTAVAIL;

        mutex_lock(&devlist->lock);
        entry = __phonet_get(dev);
        if (entry && test_and_clear_bit(addr >> 2, entry->addrs)) {
                ret = 0;
                if (bitmap_empty(entry->addrs, 64)) {
                        list_del_rcu(&entry->list);
                        stale = entry;
                }
        }
        mutex_unlock(&devlist->lock);

        if (stale)
                kfree_rcu(stale, rcu);

        return ret;
}

/*
 * Choose a source address for sending to @daddr through @dev.
 *
 * If @dev has Phonet addresses, @daddr itself is used when it is one of
 * them, otherwise the lowest configured address.  If that yields
 * PN_NO_ADDR, the device returned by phonet_device_get() is tried instead,
 * provided it is a different device.
 *
 * Returns the selected address, or PN_NO_ADDR when none was found.
 */
u8 phonet_address_get(struct net_device *dev, u8 daddr)
{
        struct net_device *fallback;
        struct phonet_device *entry;
        u8 saddr = PN_NO_ADDR;

        rcu_read_lock();
        entry = __phonet_get_rcu(dev);
        if (entry) {
                /* An entry on the list is expected to own an address. */
                BUG_ON(bitmap_empty(entry->addrs, 64));

                if (test_bit(daddr >> 2, entry->addrs))
                        saddr = daddr;
                else
                        saddr = find_first_bit(entry->addrs, 64) << 2;
        }
        rcu_read_unlock();

        if (saddr != PN_NO_ADDR)
                return saddr;

        /* Nothing usable on @dev: fall back to another device. */
        fallback = phonet_device_get(dev_net(dev));
        if (!fallback)
                return saddr;

        if (fallback != dev)
                saddr = phonet_address_get(fallback, daddr);
        dev_put(fallback);

        return saddr;
}

/*
 * Check whether @addr is configured on any usable device in @net.
 *
 * Devices that are not in the NETREG_REGISTERED state or lack IFF_UP are
 * skipped.  Returns 0 if some remaining device has the address bit set,
 * -EADDRNOTAVAIL otherwise.
 */
int phonet_address_lookup(struct net *net, u8 addr)
{
        struct phonet_device_list *devlist = phonet_device_list(net);
        struct phonet_device *cur;
        int ret = -EADDRNOTAVAIL;

        rcu_read_lock();
        list_for_each_entry_rcu(cur, &devlist->list, list) {
                const struct net_device *nd = cur->netdev;
                bool usable = nd->reg_state == NETREG_REGISTERED &&
                              (nd->flags & IFF_UP) == IFF_UP;

                if (usable && test_bit(addr >> 2, cur->addrs)) {
                        ret = 0;
                        break;
                }
        }
        rcu_read_unlock();

        return ret;
}

/* automatically configure a Phonet device, if supported */
static int phonet_device_autoconf(struct net_device *dev)
{
        struct if_phonet_req req;
        int ret;

        if (!dev->netdev_ops->ndo_siocdevprivate)
                return -EOPNOTSUPP;

        ret = dev->netdev_ops->ndo_siocdevprivate(dev, (struct ifreq *)&req,
                                                  NULL, SIOCPNGAUTOCONF);
        if (ret < 0)
                return ret;

        ASSERT_RTNL();
        ret = phonet_address_add(dev, req.ifr_phonet_autoconf.device);
        if (ret)
                return ret;
        phonet_address_notify(RTM_NEWADDR, dev,
                                req.ifr_phonet_autoconf.device);
        return 0;
}

static void phonet_route_autodel(struct net_device *dev)
{
        struct phonet_net *pnn = phonet_pernet(dev_net(dev));
        unsigned int i;
        DECLARE_BITMAP(deleted, 64);

        /* Remove left-over Phonet routes */
        bitmap_zero(deleted, 64);
        mutex_lock(&pnn->routes.lock);
        for (i = 0; i < 64; i++)
                if (rcu_access_pointer(pnn->routes.table[i]) == dev) {
                        RCU_INIT_POINTER(pnn->routes.table[i], NULL);
                        set_bit(i, deleted);
                }
        mutex_unlock(&pnn->routes.lock);

        if (bitmap_empty(deleted, 64))
                return; /* short-circuit RCU */
        synchronize_rcu();
        for_each_set_bit(i, deleted, 64) {
                rtm_phonet_notify(RTM_DELROUTE, dev, i);
                dev_put(dev);
        }
}

/*
 * Netdevice notifier callback.
 *
 * NETDEV_REGISTER:   try to autoconfigure devices of type ARPHRD_PHONET
 *                    (the result is deliberately not propagated).
 * NETDEV_UNREGISTER: drop the device's Phonet entry and its routes.
 *
 * Always returns 0.
 */
static int phonet_device_notify(struct notifier_block *me, unsigned long what,
                                void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        if (what == NETDEV_REGISTER) {
                if (dev->type == ARPHRD_PHONET)
                        phonet_device_autoconf(dev);
        } else if (what == NETDEV_UNREGISTER) {
                phonet_device_destroy(dev);
                phonet_route_autodel(dev);
        }

        return 0;
}

/*
 * Notifier block registered in phonet_device_init() and unregistered in
 * phonet_device_exit(); dispatches to phonet_device_notify().
 */
static struct notifier_block phonet_device_notifier = {
        .notifier_call = phonet_device_notify,
        .priority = 0,
};

/*
 * Per-namespace setup: create the "phonet" proc entry, then initialise the
 * device list and the two mutexes.
 *
 * Returns 0 on success or -ENOMEM if the proc entry cannot be created (in
 * which case nothing else is initialised).
 */
static int __net_init phonet_init_net(struct net *net)
{
        struct phonet_net *pnn = phonet_pernet(net);
        struct proc_dir_entry *pde;

        pde = proc_create_net("phonet", 0, net->proc_net, &pn_sock_seq_ops,
                              sizeof(struct seq_net_private));
        if (pde == NULL)
                return -ENOMEM;

        mutex_init(&pnn->routes.lock);
        mutex_init(&pnn->pndevs.lock);
        INIT_LIST_HEAD(&pnn->pndevs.list);

        return 0;
}

/*
 * Per-namespace teardown: remove the "phonet" proc entry and warn (once) if
 * the namespace's Phonet device list is not empty.
 */
static void __net_exit phonet_exit_net(struct net *net)
{
        struct phonet_device_list *devlist = phonet_device_list(net);

        remove_proc_entry("phonet", net->proc_net);
        WARN_ON_ONCE(!list_empty(&devlist->list));
}

/*
 * Pernet registration record: init/exit hooks plus the id slot and size
 * used to hand out one struct phonet_net per namespace.
 */
static struct pernet_operations phonet_net_ops = {
        .init = phonet_init_net,
        .exit = phonet_exit_net,
        .id   = &phonet_net_id,
        .size = sizeof(struct phonet_net),
};

/*
 * Module-level setup of Phonet device handling.
 *
 * Registers the pernet operations, creates the "pnresource" proc entry in
 * init_net, registers the netdevice notifier and finally the Phonet netlink
 * handlers.  If the netlink registration fails, everything is undone via
 * phonet_device_exit().
 *
 * Returns 0 on success, or the error from register_pernet_subsys() or
 * phonet_netlink_register().
 */
int __init phonet_device_init(void)
{
        int err = register_pernet_subsys(&phonet_net_ops);
        if (err)
                return err;

        /*
         * NOTE(review): the results of the next two calls are not checked,
         * so a failure there goes unnoticed.
         */
        proc_create_net("pnresource", 0, init_net.proc_net, &pn_res_seq_ops,
                        sizeof(struct seq_net_private));
        register_netdevice_notifier(&phonet_device_notifier);
        err = phonet_netlink_register();
        if (err)
                phonet_device_exit();
        return err;
}

/*
 * Undo phonet_device_init(): unregister the PF_PHONET rtnetlink handlers,
 * the netdevice notifier and the pernet operations, then remove the
 * "pnresource" proc entry.  Also used as the error path of
 * phonet_device_init().
 */
void phonet_device_exit(void)
{
        rtnl_unregister_all(PF_PHONET);
        unregister_netdevice_notifier(&phonet_device_notifier);
        unregister_pernet_subsys(&phonet_net_ops);
        remove_proc_entry("pnresource", init_net.proc_net);
}

/*
 * Install @dev as the output device for destination @daddr.
 *
 * The table is indexed by (@daddr >> 2).  A reference on @dev is taken for
 * as long as the route exists.
 *
 * Returns 0 on success or -EEXIST if the slot is already occupied.
 */
int phonet_route_add(struct net_device *dev, u8 daddr)
{
        struct phonet_net *pnn = phonet_pernet(dev_net(dev));
        struct phonet_routes *routes = &pnn->routes;
        int err = -EEXIST;

        daddr = daddr >> 2;
        mutex_lock(&routes->lock);
        /*
         * table[] is __rcu-annotated: test the slot through
         * rcu_access_pointer(), as phonet_route_del() does, instead of
         * reading the annotated pointer directly.
         */
        if (!rcu_access_pointer(routes->table[daddr])) {
                rcu_assign_pointer(routes->table[daddr], dev);
                dev_hold(dev);
                err = 0;
        }
        mutex_unlock(&routes->lock);
        return err;
}

/*
 * Remove the route for @daddr if it currently points at @dev.
 *
 * After clearing the slot the function waits for an RCU grace period and
 * then drops the device reference held by the route.
 *
 * Returns 0 on success or -ENOENT if the slot does not hold @dev.
 */
int phonet_route_del(struct net_device *dev, u8 daddr)
{
        struct phonet_routes *routes = &phonet_pernet(dev_net(dev))->routes;
        struct net_device *victim = NULL;
        u8 slot = daddr >> 2;

        mutex_lock(&routes->lock);
        if (rcu_access_pointer(routes->table[slot]) == dev) {
                RCU_INIT_POINTER(routes->table[slot], NULL);
                victim = dev;
        }
        mutex_unlock(&routes->lock);

        if (!victim)
                return -ENOENT;

        synchronize_rcu();
        dev_put(victim);
        return 0;
}

/*
 * Return the routed device for @daddr in @net via rcu_dereference(), or
 * NULL if the slot is empty.  No reference is taken here.
 */
struct net_device *phonet_route_get_rcu(struct net *net, u8 daddr)
{
        struct phonet_routes *routes = &phonet_pernet(net)->routes;

        return rcu_dereference(routes->table[daddr >> 2]);
}

/*
 * Resolve the output device for @daddr in @net.
 *
 * The routing table is consulted first, and dev_hold() is called on the hit
 * inside the RCU read section.  Without a route, the device chosen by
 * phonet_device_get() acts as the default route.  May return NULL.
 */
struct net_device *phonet_route_output(struct net *net, u8 daddr)
{
        struct phonet_routes *routes = &phonet_pernet(net)->routes;
        struct net_device *out;

        rcu_read_lock();
        out = rcu_dereference(routes->table[daddr >> 2]);
        dev_hold(out);
        rcu_read_unlock();

        return out ? out : phonet_device_get(net); /* Default route */
}



















































































































    1 

































    1 


























    1 


























    1 
















































































































































































































    2 






























    1 
    1 




    1 
    1 

    1 


    1 


















    1 















































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * af_alg: User-space algorithm interface
 *
 * This file provides the user-space API for algorithms.
 *
 * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <linux/atomic.h>
#include <crypto/if_alg.h>
#include <linux/crypto.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/key.h>
#include <linux/key-type.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/security.h>
#include <linux/string.h>
#include <keys/user-type.h>
#include <keys/trusted-type.h>
#include <keys/encrypted-type.h>

/*
 * Node of the alg_types list. Each node wraps one registered algorithm type;
 * the node itself is allocated in af_alg_register_type() and freed in
 * af_alg_unregister_type(), the type it points to is not.
 */
struct alg_type_list {
        /* The registered type this node stands for. */
        const struct af_alg_type *type;
        /* Linkage into alg_types. */
        struct list_head list;
};

/*
 * Protocol descriptor handed to sk_alloc() for every PF_ALG sock created in
 * this file; obj_size is set to the size of struct alg_sock.
 */
static struct proto alg_proto = {
        .name                        = "ALG",
        .owner                        = THIS_MODULE,
        .obj_size                = sizeof(struct alg_sock),
};

/*
 * Registry of algorithm types, searched by name. alg_types_sem is taken for
 * reading by alg_get_type() and for writing by af_alg_register_type() and
 * af_alg_unregister_type().
 */
static LIST_HEAD(alg_types);
static DECLARE_RWSEM(alg_types_sem);

/*
 * Find a registered algorithm type by name and take a reference on the module
 * that owns it.
 *
 * Only the first entry whose name matches is considered. If its module
 * reference cannot be taken the lookup fails just as if no entry had matched.
 *
 * Return: the type on success, ERR_PTR(-ENOENT) otherwise. On success the
 * caller holds a module reference on type->owner.
 */
static const struct af_alg_type *alg_get_type(const char *name)
{
        const struct af_alg_type *found = ERR_PTR(-ENOENT);
        struct alg_type_list *entry;

        down_read(&alg_types_sem);
        list_for_each_entry(entry, &alg_types, list) {
                if (strcmp(entry->type->name, name) == 0) {
                        if (try_module_get(entry->type->owner))
                                found = entry->type;
                        break;
                }
        }
        up_read(&alg_types_sem);

        return found;
}

int af_alg_register_type(const struct af_alg_type *type)
{
        struct alg_type_list *node;
        int err = -EEXIST;

        down_write(&alg_types_sem);
        list_for_each_entry(node, &alg_types, list) {
                if (!strcmp(node->type->name, type->name))
                        goto unlock;
        }

        node = kmalloc(sizeof(*node), GFP_KERNEL);
        err = -ENOMEM;
        if (!node)
                goto unlock;

        type->ops->owner = THIS_MODULE;
        if (type->ops_nokey)
                type->ops_nokey->owner = THIS_MODULE;
        node->type = type;
        list_add(&node->list, &alg_types);
        err = 0;

unlock:
        up_write(&alg_types_sem);

        return err;
}
EXPORT_SYMBOL_GPL(af_alg_register_type);

/*
 * Remove an algorithm type from the registry.
 *
 * The entry is matched by name; the first match is unlinked and its list node
 * freed.
 *
 * Return: 0 if an entry was removed, -ENOENT if no entry carries that name.
 */
int af_alg_unregister_type(const struct af_alg_type *type)
{
        struct alg_type_list *entry;
        int ret = -ENOENT;

        down_write(&alg_types_sem);
        list_for_each_entry(entry, &alg_types, list) {
                if (strcmp(entry->type->name, type->name) == 0) {
                        list_del(&entry->list);
                        kfree(entry);
                        ret = 0;
                        /* entry is gone; leave the loop without touching it */
                        break;
                }
        }
        up_write(&alg_types_sem);

        return ret;
}
EXPORT_SYMBOL_GPL(af_alg_unregister_type);

/*
 * Release an algorithm instance and drop the module reference held on its
 * type. Does nothing when @type is NULL.
 */
static void alg_do_release(const struct af_alg_type *type, void *private)
{
        if (type) {
                type->release(private);
                module_put(type->owner);
        }
}

/*
 * release() handler: drop the socket's reference on its sock, if it still has
 * one, and detach it. Always returns 0.
 */
int af_alg_release(struct socket *sock)
{
        struct sock *sk = sock->sk;

        if (!sk)
                return 0;

        sock_put(sk);
        sock->sk = NULL;

        return 0;
}
EXPORT_SYMBOL_GPL(af_alg_release);

void af_alg_release_parent(struct sock *sk)
{
        struct alg_sock *ask = alg_sk(sk);
        unsigned int nokey = atomic_read(&ask->nokey_refcnt);

        sk = ask->parent;
        ask = alg_sk(sk);

        if (nokey)
                atomic_dec(&ask->nokey_refcnt);

        if (atomic_dec_and_test(&ask->refcnt))
                sock_put(sk);
}
EXPORT_SYMBOL_GPL(af_alg_release_parent);

/*
 * alg_bind - bind() handler: attach an algorithm type and instance to a socket
 *
 * @sock: AF_ALG socket being bound
 * @uaddr: caller-supplied address, interpreted as struct sockaddr_alg_new
 * @addr_len: length of @uaddr in bytes
 *
 * Looks up the type named by salg_type (requesting module "algif-<type>" once
 * if the first lookup yields -ENOENT), asks that type to bind salg_name and
 * installs the result on the socket. Whatever type/private pair is left over
 * afterwards - the previous binding on success, the new one on -EBUSY - is
 * released.
 *
 * Return: 0 on success; -EINVAL for a connected socket, a too-short address
 * or feature/mask bits outside CRYPTO_ALG_KERN_DRIVER_ONLY; -EBUSY if the
 * socket's refcnt is non-zero; otherwise the error from the type lookup or
 * from type->bind().
 */
static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        const u32 allowed = CRYPTO_ALG_KERN_DRIVER_ONLY;
        struct sock *sk = sock->sk;
        struct alg_sock *ask = alg_sk(sk);
        struct sockaddr_alg_new *sa = (void *)uaddr;
        const struct af_alg_type *type;
        void *private;
        int err;

        if (sock->state == SS_CONNECTED)
                return -EINVAL;

        /*
         * Compile-time layout checks: salg_name sits at the same offset in
         * both address structures, and that offset equals sizeof(*sa).
         */
        BUILD_BUG_ON(offsetof(struct sockaddr_alg_new, salg_name) !=
                     offsetof(struct sockaddr_alg, salg_name));
        BUILD_BUG_ON(offsetof(struct sockaddr_alg, salg_name) != sizeof(*sa));

        /* Require the fixed header plus at least one byte of name. */
        if (addr_len < sizeof(*sa) + 1)
                return -EINVAL;

        /* If caller uses non-allowed flag, return error. */
        if ((sa->salg_feat & ~allowed) || (sa->salg_mask & ~allowed))
                return -EINVAL;

        /* Force NUL termination of both strings before they are used. */
        sa->salg_type[sizeof(sa->salg_type) - 1] = 0;
        sa->salg_name[addr_len - sizeof(*sa) - 1] = 0;

        type = alg_get_type(sa->salg_type);
        if (PTR_ERR(type) == -ENOENT) {
                /* Not registered: try loading the module, then look again. */
                request_module("algif-%s", sa->salg_type);
                type = alg_get_type(sa->salg_type);
        }

        if (IS_ERR(type))
                return PTR_ERR(type);

        private = type->bind(sa->salg_name, sa->salg_feat, sa->salg_mask);
        if (IS_ERR(private)) {
                /* Drop the module reference taken by alg_get_type(). */
                module_put(type->owner);
                return PTR_ERR(private);
        }

        err = -EBUSY;
        lock_sock(sk);
        if (atomic_read(&ask->refcnt))
                goto unlock;

        /*
         * Install the new binding. After the swaps the locals hold the
         * socket's previous type/private, which are released below.
         */
        swap(ask->type, type);
        swap(ask->private, private);

        err = 0;

unlock:
        release_sock(sk);

        /* Releases the old binding on success, the unused new one on -EBUSY. */
        alg_do_release(type, private);

        return err;
}

/*
 * Copy a key supplied through setsockopt() into a temporary buffer and hand
 * it to the bound type's setkey() callback.
 *
 * The buffer is charged to @sk and is always released through
 * sock_kzfree_s(), whether or not the key was accepted.
 *
 * Return: -ENOMEM if the buffer cannot be allocated, -EFAULT if the copy from
 * @ukey fails, otherwise the result of type->setkey().
 */
static int alg_setkey(struct sock *sk, sockptr_t ukey, unsigned int keylen)
{
        struct alg_sock *ask = alg_sk(sk);
        const struct af_alg_type *type = ask->type;
        u8 *kbuf;
        int ret;

        kbuf = sock_kmalloc(sk, keylen, GFP_KERNEL);
        if (!kbuf)
                return -ENOMEM;

        if (copy_from_sockptr(kbuf, ukey, keylen))
                ret = -EFAULT;
        else
                ret = type->setkey(ask->private, kbuf, keylen);

        sock_kzfree_s(sk, kbuf, keylen);

        return ret;
}

#ifdef CONFIG_KEYS

/*
 * Return a pointer to the payload bytes of a key handled as a user-type key
 * and store the payload length, taken from key->datalen, in @datalen.
 *
 * Return: the payload data, or ERR_PTR(-EKEYREVOKED) if the key has no usable
 * payload. @datalen is only written on success.
 */
static const u8 *key_data_ptr_user(const struct key *key,
                                   unsigned int *datalen)
{
        const struct user_key_payload *payload = user_key_payload_locked(key);

        if (IS_ERR_OR_NULL(payload))
                return ERR_PTR(-EKEYREVOKED);

        *datalen = key->datalen;
        return payload->data;
}

/*
 * Return a pointer to the decrypted data of an encrypted-type key and store
 * its length (decrypted_datalen) in @datalen.
 *
 * Return: the decrypted data, or ERR_PTR(-EKEYREVOKED) if the key has no
 * usable payload. @datalen is only written on success.
 */
static const u8 *key_data_ptr_encrypted(const struct key *key,
                                        unsigned int *datalen)
{
        const struct encrypted_key_payload *payload =
                dereference_key_locked(key);

        if (IS_ERR_OR_NULL(payload))
                return ERR_PTR(-EKEYREVOKED);

        *datalen = payload->decrypted_datalen;
        return payload->decrypted_data;
}

/*
 * Return a pointer to the key bytes of a trusted-type key and store their
 * length (key_len) in @datalen.
 *
 * Return: the key bytes, or ERR_PTR(-EKEYREVOKED) if the key has no usable
 * payload. @datalen is only written on success.
 */
static const u8 *key_data_ptr_trusted(const struct key *key,
                                      unsigned int *datalen)
{
        const struct trusted_key_payload *payload =
                dereference_key_locked(key);

        if (IS_ERR_OR_NULL(payload))
                return ERR_PTR(-EKEYREVOKED);

        *datalen = payload->key_len;
        return payload->key;
}

/*
 * Resolve a key serial number to a struct key via lookup_user_key(), asking
 * for KEY_NEED_SEARCH.
 *
 * Return: the key on success, or the lookup's error re-cast as a key pointer.
 */
static struct key *lookup_key(key_serial_t serial)
{
        key_ref_t ref = lookup_user_key(serial, 0, KEY_NEED_SEARCH);

        if (!IS_ERR(ref))
                return key_ref_to_ptr(ref);

        return ERR_CAST(ref);
}

/*
 * Set the algorithm key from a keyring key named by its serial number.
 *
 * @ask: socket whose bound type receives the key
 * @optval: user buffer holding a key_serial_t
 * @optlen: must be exactly sizeof(key_serial_t)
 *
 * The key payload is copied into a socket-charged buffer while key->sem is
 * held for reading. The semaphore and the key reference are dropped before
 * type->setkey() is called on the copy, and the copy is released through
 * sock_kzfree_s() afterwards.
 *
 * Key types "user" and "logon" are always handled; "encrypted" and "trusted"
 * only when the respective config option is reachable.
 *
 * Return: -EINVAL for a wrong @optlen, -EFAULT if @optval cannot be read, the
 * key lookup error, -ENOPROTOOPT for an unsupported key type, -EKEYREVOKED
 * from the payload accessors, -ENOMEM, or the result of type->setkey().
 */
static int alg_setkey_by_key_serial(struct alg_sock *ask, sockptr_t optval,
                                    unsigned int optlen)
{
        const struct af_alg_type *type = ask->type;
        const char *type_name;
        unsigned int payload_len;
        const u8 *payload;
        key_serial_t serial;
        struct key *key;
        u8 *copy;
        int ret;

        if (optlen != sizeof(serial))
                return -EINVAL;

        if (copy_from_sockptr(&serial, optval, optlen))
                return -EFAULT;

        key = lookup_key(serial);
        if (IS_ERR(key))
                return PTR_ERR(key);

        down_read(&key->sem);

        type_name = key->type->name;
        if (!strcmp(type_name, "user") || !strcmp(type_name, "logon"))
                payload = key_data_ptr_user(key, &payload_len);
        else if (IS_REACHABLE(CONFIG_ENCRYPTED_KEYS) &&
                 !strcmp(type_name, "encrypted"))
                payload = key_data_ptr_encrypted(key, &payload_len);
        else if (IS_REACHABLE(CONFIG_TRUSTED_KEYS) &&
                 !strcmp(type_name, "trusted"))
                payload = key_data_ptr_trusted(key, &payload_len);
        else
                payload = ERR_PTR(-ENOPROTOOPT);

        if (IS_ERR(payload)) {
                ret = PTR_ERR(payload);
                goto out_put_key;
        }

        copy = sock_kmalloc(&ask->sk, payload_len, GFP_KERNEL);
        if (!copy) {
                ret = -ENOMEM;
                goto out_put_key;
        }

        memcpy(copy, payload, payload_len);

        /* The private copy is complete; the key itself is no longer needed. */
        up_read(&key->sem);
        key_put(key);

        ret = type->setkey(ask->private, copy, payload_len);

        sock_kzfree_s(&ask->sk, copy, payload_len);

        return ret;

out_put_key:
        up_read(&key->sem);
        key_put(key);

        return ret;
}

#else

/*
 * Fallback used when CONFIG_KEYS is not set: setting a key by serial number
 * is not available, so the option is always rejected with -ENOPROTOOPT.
 */
static inline int alg_setkey_by_key_serial(struct alg_sock *ask,
                                           sockptr_t optval,
                                           unsigned int optlen)
{
        return -ENOPROTOOPT;
}

#endif

/*
 * setsockopt() handler for bound AF_ALG sockets.
 *
 * Runs under the socket lock. Supported options at level SOL_ALG are
 * ALG_SET_KEY, ALG_SET_KEY_BY_KEY_SERIAL, ALG_SET_AEAD_AUTHSIZE and
 * ALG_SET_DRBG_ENTROPY; each is forwarded to the matching callback of the
 * bound type. For ALG_SET_AEAD_AUTHSIZE the value passed on is @optlen
 * itself, @optval is not read.
 *
 * Return: -EBUSY if refcnt and nokey_refcnt differ; -ENOPROTOOPT for a wrong
 * level, an unbound socket, an unknown option, a connected socket or a type
 * lacking the needed callback; otherwise the callback's result.
 */
static int alg_setsockopt(struct socket *sock, int level, int optname,
                          sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct alg_sock *ask = alg_sk(sk);
        const struct af_alg_type *type;
        int ret;

        lock_sock(sk);

        ret = -EBUSY;
        if (atomic_read(&ask->refcnt) != atomic_read(&ask->nokey_refcnt))
                goto out_unlock;

        /* Every rejection from here on reports -ENOPROTOOPT. */
        ret = -ENOPROTOOPT;
        type = ask->type;
        if (!type || level != SOL_ALG)
                goto out_unlock;

        switch (optname) {
        case ALG_SET_KEY:
        case ALG_SET_KEY_BY_KEY_SERIAL:
                if (sock->state == SS_CONNECTED || !type->setkey)
                        break;

                if (optname == ALG_SET_KEY_BY_KEY_SERIAL)
                        ret = alg_setkey_by_key_serial(ask, optval, optlen);
                else
                        ret = alg_setkey(sk, optval, optlen);
                break;

        case ALG_SET_AEAD_AUTHSIZE:
                if (sock->state == SS_CONNECTED || !type->setauthsize)
                        break;

                ret = type->setauthsize(ask->private, optlen);
                break;

        case ALG_SET_DRBG_ENTROPY:
                if (sock->state == SS_CONNECTED || !type->setentropy)
                        break;

                ret = type->setentropy(ask->private, optval, optlen);
                break;

        default:
                break;
        }

out_unlock:
        release_sock(sk);

        return ret;
}

/*
 * af_alg_accept - create an operation socket from a bound AF_ALG socket
 *
 * @sk: the bound (parent) sock
 * @newsock: socket that receives the new child sock
 * @arg: accept arguments; only arg->kern is used here
 *
 * Allocates a child sock, lets the bound type initialise it through
 * type->accept() and, if that reports -ENOKEY and the type provides one,
 * through type->accept_nokey(). On success the child is linked to the parent,
 * the parent's refcnt (and nokey_refcnt for a no-key child) is raised, and
 * @newsock is marked SS_CONNECTED.
 *
 * Return: 0 on success, -EINVAL if @sk has no bound type, -ENOMEM if the
 * child sock cannot be allocated, otherwise the error from the accept
 * callback.
 */
int af_alg_accept(struct sock *sk, struct socket *newsock,
                  struct proto_accept_arg *arg)
{
        struct alg_sock *ask = alg_sk(sk);
        const struct af_alg_type *type;
        struct sock *sk2;
        unsigned int nokey;
        int err;

        lock_sock(sk);
        type = ask->type;

        err = -EINVAL;
        if (!type)
                goto unlock;

        sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto, arg->kern);
        err = -ENOMEM;
        if (!sk2)
                goto unlock;

        sock_init_data(newsock, sk2);
        security_sock_graft(sk2, newsock);
        security_sk_clone(sk, sk2);

        /*
         * newsock->ops assigned here to allow type->accept call to override
         * them when required.
         */
        newsock->ops = type->ops;
        err = type->accept(ask->private, sk2);

        /* Remember whether the regular accept refused for lack of a key. */
        nokey = err == -ENOKEY;
        if (nokey && type->accept_nokey)
                err = type->accept_nokey(ask->private, sk2);

        if (err)
                goto unlock;

        /* The first child takes a sock reference on the parent. */
        if (atomic_inc_return_relaxed(&ask->refcnt) == 1)
                sock_hold(sk);
        if (nokey) {
                /* Account the no-key child on both parent and child. */
                atomic_inc(&ask->nokey_refcnt);
                atomic_set(&alg_sk(sk2)->nokey_refcnt, 1);
        }
        alg_sk(sk2)->parent = sk;
        alg_sk(sk2)->type = type;

        newsock->state = SS_CONNECTED;

        /* A no-key child gets the type's no-key operations instead. */
        if (nokey)
                newsock->ops = type->ops_nokey;

        err = 0;

unlock:
        release_sock(sk);

        return err;
}
EXPORT_SYMBOL_GPL(af_alg_accept);

/* accept() handler: forwards to af_alg_accept() with the socket's sock. */
static int alg_accept(struct socket *sock, struct socket *newsock,
                      struct proto_accept_arg *arg)
{
        struct sock *parent = sock->sk;

        return af_alg_accept(parent, newsock, arg);
}

/*
 * Socket operations installed by alg_create() on a freshly created PF_ALG
 * socket. Only bind, release, setsockopt and accept are implemented in this
 * file; every other operation is wired to a sock_no_*() handler.
 */
static const struct proto_ops alg_proto_ops = {
        .family                =        PF_ALG,
        .owner                =        THIS_MODULE,

        .connect        =        sock_no_connect,
        .socketpair        =        sock_no_socketpair,
        .getname        =        sock_no_getname,
        .ioctl                =        sock_no_ioctl,
        .listen                =        sock_no_listen,
        .shutdown        =        sock_no_shutdown,
        .mmap                =        sock_no_mmap,
        .sendmsg        =        sock_no_sendmsg,
        .recvmsg        =        sock_no_recvmsg,

        .bind                =        alg_bind,
        .release        =        af_alg_release,
        .setsockopt        =        alg_setsockopt,
        .accept                =        alg_accept,
};

static void alg_sock_destruct(struct sock *sk)
{
        struct alg_sock *ask = alg_sk(sk);

        alg_do_release(ask->type, ask->private);
}

/*
 * Create handler of the PF_ALG family.
 *
 * Only SOCK_SEQPACKET sockets with protocol 0 are accepted. The new sock gets
 * alg_proto_ops as its operations and alg_sock_destruct() as its destructor.
 *
 * Return: 0 on success, -ESOCKTNOSUPPORT for another socket type,
 * -EPROTONOSUPPORT for a non-zero protocol, -ENOMEM if the sock cannot be
 * allocated.
 */
static int alg_create(struct net *net, struct socket *sock, int protocol,
                      int kern)
{
        struct sock *sk;

        if (sock->type != SOCK_SEQPACKET)
                return -ESOCKTNOSUPPORT;
        if (protocol)
                return -EPROTONOSUPPORT;

        sk = sk_alloc(net, PF_ALG, GFP_KERNEL, &alg_proto, kern);
        if (!sk)
                return -ENOMEM;

        sock->ops = &alg_proto_ops;
        sock_init_data(sock, sk);
        sk->sk_destruct = alg_sock_destruct;

        return 0;
}

/* Protocol family descriptor for PF_ALG; sockets are created by alg_create(). */
static const struct net_proto_family alg_family = {
        .family        =        PF_ALG,
        .create        =        alg_create,
        .owner        =        THIS_MODULE,
};

static void af_alg_link_sg(struct af_alg_sgl *sgl_prev,
                           struct af_alg_sgl *sgl_new)
{
        sg_unmark_end(sgl_prev->sgt.sgl + sgl_prev->sgt.nents - 1);
        sg_chain(sgl_prev->sgt.sgl, sgl_prev->sgt.nents + 1, sgl_new->sgt.sgl);
}

void af_alg_free_sg(struct af_alg_sgl *sgl)
{
        int i;

        if (sgl->sgt.sgl) {
                if (sgl->need_unpin)
                        for (i = 0; i < sgl->sgt.nents; i++)
                                unpin_user_page(sg_page(&sgl->sgt.sgl[i]));
                if (sgl->sgt.sgl != sgl->sgl)
                        kvfree(sgl->sgt.sgl);
                sgl->sgt.sgl = NULL;
        }
}
EXPORT_SYMBOL_GPL(af_alg_free_sg);

/*
 * Parse the SOL_ALG control messages of @msg into @con.
 *
 * Control messages of other levels are skipped. Recognised types are
 * ALG_SET_IV (stored as a pointer into the control buffer, after checking
 * that the message is long enough for the header and for the ivlen bytes it
 * announces), ALG_SET_OP and ALG_SET_AEAD_ASSOCLEN (both read as u32).
 *
 * Return: 0 on success, -EINVAL for a malformed or too-short control message
 * or an unknown SOL_ALG message type.
 */
static int af_alg_cmsg_send(struct msghdr *msg, struct af_alg_control *con)
{
        struct cmsghdr *hdr;

        for_each_cmsghdr(hdr, msg) {
                if (!CMSG_OK(msg, hdr))
                        return -EINVAL;

                if (hdr->cmsg_level != SOL_ALG)
                        continue;

                if (hdr->cmsg_type == ALG_SET_IV) {
                        /* First make sure the fixed IV header is present ... */
                        if (hdr->cmsg_len < CMSG_LEN(sizeof(*con->iv)))
                                return -EINVAL;

                        con->iv = (void *)CMSG_DATA(hdr);

                        /* ... then that the announced IV bytes are as well. */
                        if (hdr->cmsg_len < CMSG_LEN(con->iv->ivlen +
                                                     sizeof(*con->iv)))
                                return -EINVAL;
                } else if (hdr->cmsg_type == ALG_SET_OP) {
                        if (hdr->cmsg_len < CMSG_LEN(sizeof(u32)))
                                return -EINVAL;

                        con->op = *(u32 *)CMSG_DATA(hdr);
                } else if (hdr->cmsg_type == ALG_SET_AEAD_ASSOCLEN) {
                        if (hdr->cmsg_len < CMSG_LEN(sizeof(u32)))
                                return -EINVAL;

                        con->aead_assoclen = *(u32 *)CMSG_DATA(hdr);
                } else {
                        return -EINVAL;
                }
        }

        return 0;
}

/**
 * af_alg_alloc_tsgl - allocate the TX SGL
 *
 * @sk: socket of connection to user space
 * Return: 0 upon success, < 0 upon error
 */
static int af_alg_alloc_tsgl(struct sock *sk)
{
        struct alg_sock *ask = alg_sk(sk);
        struct af_alg_ctx *ctx = ask->private;
        struct af_alg_tsgl *sgl;
        struct scatterlist *sg = NULL;

        sgl = list_entry(ctx->tsgl_list.prev, struct af_alg_tsgl, list);
        if (!list_empty(&ctx->tsgl_list))
                sg = sgl->sg;

        if (!sg || sgl->cur >= MAX_SGL_ENTS) {
                sgl = sock_kmalloc(sk,
                                   struct_size(sgl, sg, (MAX_SGL_ENTS + 1)),
                                   GFP_KERNEL);
                if (!sgl)
                        return -ENOMEM;

                sg_init_table(sgl->sg, MAX_SGL_ENTS + 1);
                sgl->cur = 0;

                if (sg)
                        sg_chain(sg, MAX_SGL_ENTS + 1, sgl->sg);

                list_add_tail(&sgl->list, &ctx->tsgl_list);
        }

        return 0;
}

/**
 * af_alg_count_tsgl - Count number of TX SG entries
 *
 * Walks the TX SGL from its beginning and counts the entries needed to cover
 * @bytes. Entries lying entirely within the first @offset bytes are skipped
 * and not counted; since @bytes is measured from the start of the SGL, their
 * length is subtracted from @bytes as well.
 *
 * @sk: socket of connection to user space
 * @bytes: Count the number of SG entries holding given number of bytes.
 * @offset: Start the counting of SG entries from the given offset.
 * Return: Number of TX SG entries found given the constraints
 */
unsigned int af_alg_count_tsgl(struct sock *sk, size_t bytes, size_t offset)
{
        const struct alg_sock *ask = alg_sk(sk);
        const struct af_alg_ctx *ctx = ask->private;
        const struct af_alg_tsgl *sgl;
        unsigned int nr_ents = 0;

        if (bytes == 0)
                return 0;

        list_for_each_entry(sgl, &ctx->tsgl_list, list) {
                const struct scatterlist *sg;
                unsigned int idx;

                for (idx = 0, sg = sgl->sg; idx < sgl->cur; idx++, sg++) {
                        size_t avail;

                        /* Entry lies completely before the offset: skip it. */
                        if (offset >= sg->length) {
                                offset -= sg->length;
                                bytes -= sg->length;
                                continue;
                        }

                        avail = sg->length - offset;
                        offset = 0;
                        nr_ents++;

                        /* This entry covers the rest of the request. */
                        if (avail >= bytes)
                                return nr_ents;

                        bytes -= avail;
                }
        }

        return nr_ents;
}
EXPORT_SYMBOL_GPL(af_alg_count_tsgl);

/**
 * af_alg_pull_tsgl - Release the specified buffers from TX SGL
 *
 * If @dst is non-null, reassign the pages to @dst. The caller must release
 * the pages. If @dst_offset is given only reassign the pages to @dst starting
 * at the @dst_offset (byte). The caller must ensure that @dst is large
 * enough (e.g. by using af_alg_count_tsgl with the same offset).
 *
 * The function returns as soon as an entry is only partially consumed; in
 * that case the trailing ctx->merge / ctx->init update is not performed.
 *
 * @sk: socket of connection to user space
 * @used: Number of bytes to pull from TX SGL
 * @dst: If non-NULL, buffer is reassigned to dst SGL instead of releasing. The
 *         caller must release the buffers in dst.
 * @dst_offset: Reassign the TX SGL from given offset. All buffers before
 *                reaching the offset are released.
 */
void af_alg_pull_tsgl(struct sock *sk, size_t used, struct scatterlist *dst,
                      size_t dst_offset)
{
        struct alg_sock *ask = alg_sk(sk);
        struct af_alg_ctx *ctx = ask->private;
        struct af_alg_tsgl *sgl;
        struct scatterlist *sg;
        /* j indexes the next free entry in dst. */
        unsigned int i, j = 0;

        while (!list_empty(&ctx->tsgl_list)) {
                sgl = list_first_entry(&ctx->tsgl_list, struct af_alg_tsgl,
                                       list);
                sg = sgl->sg;

                for (i = 0; i < sgl->cur; i++) {
                        /* Bytes taken from this entry: at most what is left. */
                        size_t plen = min_t(size_t, used, sg[i].length);
                        struct page *page = sg_page(sg + i);

                        /* Entry was already emptied by an earlier pull. */
                        if (!page)
                                continue;

                        /*
                         * Assumption: caller created af_alg_count_tsgl(len)
                         * SG entries in dst.
                         */
                        if (dst) {
                                if (dst_offset >= plen) {
                                        /* discard page before offset */
                                        dst_offset -= plen;
                                } else {
                                        /* reassign page to dst after offset */
                                        get_page(page);
                                        sg_set_page(dst + j, page,
                                                    plen - dst_offset,
                                                    sg[i].offset + dst_offset);
                                        dst_offset = 0;
                                        j++;
                                }
                        }

                        /* Consume plen bytes from the front of the entry. */
                        sg[i].length -= plen;
                        sg[i].offset += plen;

                        used -= plen;
                        ctx->used -= plen;

                        /* Entry still holds data: the request is satisfied. */
                        if (sg[i].length)
                                return;

                        /* Entry fully consumed: drop the TX SGL's page ref. */
                        put_page(page);
                        sg_assign_page(sg + i, NULL);
                }

                /* Every entry of this block is consumed; free the block. */
                list_del(&sgl->list);
                sock_kfree_s(sk, sgl, struct_size(sgl, sg, MAX_SGL_ENTS + 1));
        }

        /* Nothing buffered any more, so there is no page to merge into. */
        if (!ctx->used)
                ctx->merge = 0;
        ctx->init = ctx->more;
}
EXPORT_SYMBOL_GPL(af_alg_pull_tsgl);

/**
 * af_alg_free_areq_sgls - Release TX and RX SGLs of the request
 *
 * @areq: Request holding the TX and RX SGL
 */
static void af_alg_free_areq_sgls(struct af_alg_async_req *areq)
{
        struct sock *sk = areq->sk;
        struct alg_sock *ask = alg_sk(sk);
        struct af_alg_ctx *ctx = ask->private;
        struct af_alg_rsgl *rsgl, *tmp;
        struct scatterlist *tsgl;
        struct scatterlist *sg;
        unsigned int i;

        list_for_each_entry_safe(rsgl, tmp, &areq->rsgl_list, list) {
                atomic_sub(rsgl->sg_num_bytes, &ctx->rcvused);
                af_alg_free_sg(&rsgl->sgl);
                list_del(&rsgl->list);
                if (rsgl != &areq->first_rsgl)
                        sock_kfree_s(sk, rsgl, sizeof(*rsgl));
        }

        tsgl = areq->tsgl;
        if (tsgl) {
                for_each_sg(tsgl, sg, areq->tsgl_entries, i) {
                        if (!sg_page(sg))
                                continue;
                        put_page(sg_page(sg));
                }

                sock_kfree_s(sk, tsgl, areq->tsgl_entries * sizeof(*tsgl));
        }
}

/**
 * af_alg_wait_for_wmem - wait for availability of writable memory
 *
 * Waits on the socket's wait queue until af_alg_writable(sk) holds. The
 * SOCKWQ_ASYNC_NOSPACE bit is set before waiting and is not cleared by this
 * function.
 *
 * @sk: socket of connection to user space
 * @flags: If MSG_DONTWAIT is set, then only report if function would sleep
 * Return: 0 when writable memory is available, -EAGAIN if MSG_DONTWAIT is
 *         set, -ERESTARTSYS if a signal is pending
 */
static int af_alg_wait_for_wmem(struct sock *sk, unsigned int flags)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        /* Result if the loop is left because of a pending signal. */
        int err = -ERESTARTSYS;
        long timeout;

        if (flags & MSG_DONTWAIT)
                return -EAGAIN;

        sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

        add_wait_queue(sk_sleep(sk), &wait);
        for (;;) {
                if (signal_pending(current))
                        break;
                /* Re-armed on every pass: wait without a time limit. */
                timeout = MAX_SCHEDULE_TIMEOUT;
                if (sk_wait_event(sk, &timeout, af_alg_writable(sk), &wait)) {
                        err = 0;
                        break;
                }
        }
        remove_wait_queue(sk_sleep(sk), &wait);

        return err;
}

/**
 * af_alg_wmem_wakeup - wakeup caller when writable memory is available
 *
 * Does nothing unless af_alg_writable(sk) holds. Otherwise wakes sleepers on
 * the socket's wait queue and sends the SOCK_WAKE_WAITD / POLL_IN async
 * notification.
 *
 * NOTE(review): the wake-up key is the read-side mask (EPOLLIN | EPOLLRDNORM |
 * EPOLLRDBAND) although the condition is write space, which af_alg_poll()
 * reports through EPOLLOUT - confirm this is intentional.
 *
 * @sk: socket of connection to user space
 */
void af_alg_wmem_wakeup(struct sock *sk)
{
        struct socket_wq *wq;

        if (!af_alg_writable(sk))
                return;

        /* sk->sk_wq is dereferenced under RCU. */
        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN |
                                                           EPOLLRDNORM |
                                                           EPOLLRDBAND);
        sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(af_alg_wmem_wakeup);

/**
 * af_alg_wait_for_data - wait for availability of TX data
 *
 * Waits on the socket's wait queue until the context has been initialised
 * (ctx->init) and either no further data is announced (!ctx->more) or, when
 * @min is non-zero, at least @min bytes are buffered (ctx->used >= @min).
 * The SOCKWQ_ASYNC_WAITDATA bit is set while waiting and cleared afterwards.
 *
 * @sk: socket of connection to user space
 * @flags: If MSG_DONTWAIT is set, then only report if function would sleep
 * @min: Set to minimum request size if partial requests are allowed.
 * Return: 0 when the TX data condition is met, -EAGAIN if MSG_DONTWAIT is
 *         set, -ERESTARTSYS if a signal is pending
 */
int af_alg_wait_for_data(struct sock *sk, unsigned flags, unsigned min)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        struct alg_sock *ask = alg_sk(sk);
        struct af_alg_ctx *ctx = ask->private;
        long timeout;
        /* Result if the loop is left because of a pending signal. */
        int err = -ERESTARTSYS;

        if (flags & MSG_DONTWAIT)
                return -EAGAIN;

        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);

        add_wait_queue(sk_sleep(sk), &wait);
        for (;;) {
                if (signal_pending(current))
                        break;
                /* Re-armed on every pass: wait without a time limit. */
                timeout = MAX_SCHEDULE_TIMEOUT;
                if (sk_wait_event(sk, &timeout,
                                  ctx->init && (!ctx->more ||
                                                (min && ctx->used >= min)),
                                  &wait)) {
                        err = 0;
                        break;
                }
        }
        remove_wait_queue(sk_sleep(sk), &wait);

        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);

        return err;
}
EXPORT_SYMBOL_GPL(af_alg_wait_for_data);

/**
 * af_alg_data_wakeup - wakeup caller when new data can be sent to kernel
 *
 * Does nothing while no TX data is buffered (ctx->used == 0). Otherwise wakes
 * sleepers on the socket's wait queue and sends the SOCK_WAKE_SPACE /
 * POLL_OUT async notification.
 *
 * NOTE(review): the wake-up key combines EPOLLOUT with the read-band bits
 * EPOLLRDNORM | EPOLLRDBAND although the condition is buffered TX data, which
 * af_alg_poll() reports through EPOLLIN - confirm this is intentional.
 *
 * @sk: socket of connection to user space
 */
static void af_alg_data_wakeup(struct sock *sk)
{
        struct alg_sock *ask = alg_sk(sk);
        struct af_alg_ctx *ctx = ask->private;
        struct socket_wq *wq;

        if (!ctx->used)
                return;

        /* sk->sk_wq is dereferenced under RCU. */
        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
                                                           EPOLLRDNORM |
                                                           EPOLLRDBAND);
        sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
        rcu_read_unlock();
}

/**
 * af_alg_sendmsg - implementation of sendmsg system call handler
 *
 * The sendmsg system call handler obtains the user data and stores it
 * in ctx->tsgl_list. This implies allocation of the required numbers of
 * struct af_alg_tsgl.
 *
 * In addition, the ctx is filled with the information sent via CMSG.
 *
 * Data is appended in one of two ways: if the last TX page still has room
 * (ctx->merge is non-zero) and the caller is not splicing pages, the new
 * bytes are copied behind the existing ones; otherwise new SG entries are
 * started, either by referencing the caller's pages (MSG_SPLICE_PAGES) or by
 * copying into freshly allocated pages.
 *
 * @sock: socket of connection to user space
 * @msg: message from user space
 * @size: size of message from user space
 * @ivsize: the size of the IV for the cipher operation to verify that the
 *           user-space-provided IV has the right size
 * Return: the number of bytes taken over if that is non-zero, even when an
 *         error stopped the transfer part-way; otherwise 0 or a negative
 *         error code
 */
int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size,
                   unsigned int ivsize)
{
        struct sock *sk = sock->sk;
        struct alg_sock *ask = alg_sk(sk);
        struct af_alg_ctx *ctx = ask->private;
        struct af_alg_tsgl *sgl;
        struct af_alg_control con = {};
        long copied = 0;
        bool enc = false;
        bool init = false;
        int err = 0;

        /* Control data is validated before the socket lock is taken. */
        if (msg->msg_controllen) {
                err = af_alg_cmsg_send(msg, &con);
                if (err)
                        return err;

                init = true;
                switch (con.op) {
                case ALG_OP_ENCRYPT:
                        enc = true;
                        break;
                case ALG_OP_DECRYPT:
                        enc = false;
                        break;
                default:
                        return -EINVAL;
                }

                if (con.iv && con.iv->ivlen != ivsize)
                        return -EINVAL;
        }

        lock_sock(sk);
        if (ctx->init && !ctx->more) {
                /*
                 * The previous request was complete (no MSG_MORE). New data
                 * is refused while that request's data is still buffered.
                 */
                if (ctx->used) {
                        err = -EINVAL;
                        goto unlock;
                }

                pr_info_once(
                        "%s sent an empty control message without MSG_MORE.\n",
                        current->comm);
        }
        ctx->init = true;

        if (init) {
                ctx->enc = enc;
                if (con.iv)
                        memcpy(ctx->iv, con.iv->iv, ivsize);

                ctx->aead_assoclen = con.aead_assoclen;
        }

        while (size) {
                struct scatterlist *sg;
                size_t len = size;
                ssize_t plen;

                /* use the existing memory in an allocated page */
                if (ctx->merge && !(msg->msg_flags & MSG_SPLICE_PAGES)) {
                        sgl = list_entry(ctx->tsgl_list.prev,
                                         struct af_alg_tsgl, list);
                        sg = sgl->sg + sgl->cur - 1;
                        len = min_t(size_t, len,
                                    PAGE_SIZE - sg->offset - sg->length);

                        err = memcpy_from_msg(page_address(sg_page(sg)) +
                                              sg->offset + sg->length,
                                              msg, len);
                        if (err)
                                goto unlock;

                        sg->length += len;
                        /* Zero once the page is filled up to its end. */
                        ctx->merge = (sg->offset + sg->length) &
                                     (PAGE_SIZE - 1);

                        ctx->used += len;
                        copied += len;
                        size -= len;
                        continue;
                }

                /*
                 * From here on new SG entries are started, so the current
                 * tail entry stops being a merge target. Clear the merge
                 * state before any of the failure exits below: they may
                 * leave a freshly appended, still empty tsgl at the end of
                 * the list, and a stale non-zero ctx->merge would make the
                 * next call compute sgl->sg + sgl->cur - 1 with cur == 0.
                 */
                ctx->merge = 0;

                if (!af_alg_writable(sk)) {
                        err = af_alg_wait_for_wmem(sk, msg->msg_flags);
                        if (err)
                                goto unlock;
                }

                /* allocate a new page */
                len = min_t(unsigned long, len, af_alg_sndbuf(sk));

                err = af_alg_alloc_tsgl(sk);
                if (err)
                        goto unlock;

                sgl = list_entry(ctx->tsgl_list.prev, struct af_alg_tsgl,
                                 list);
                sg = sgl->sg;
                /* More entries follow: the old last entry is not the end. */
                if (sgl->cur)
                        sg_unmark_end(sg + sgl->cur - 1);

                if (msg->msg_flags & MSG_SPLICE_PAGES) {
                        struct sg_table sgtable = {
                                .sgl                = sg,
                                .nents                = sgl->cur,
                                .orig_nents        = sgl->cur,
                        };

                        plen = extract_iter_to_sg(&msg->msg_iter, len, &sgtable,
                                                  MAX_SGL_ENTS - sgl->cur, 0);
                        if (plen < 0) {
                                err = plen;
                                goto unlock;
                        }

                        /* Take a page reference for every entry added. */
                        for (; sgl->cur < sgtable.nents; sgl->cur++)
                                get_page(sg_page(&sg[sgl->cur]));
                        len -= plen;
                        ctx->used += plen;
                        copied += plen;
                        size -= plen;
                } else {
                        do {
                                struct page *pg;
                                unsigned int i = sgl->cur;

                                plen = min_t(size_t, len, PAGE_SIZE);

                                pg = alloc_page(GFP_KERNEL);
                                if (!pg) {
                                        err = -ENOMEM;
                                        goto unlock;
                                }

                                sg_assign_page(sg + i, pg);

                                err = memcpy_from_msg(
                                        page_address(sg_page(sg + i)),
                                        msg, plen);
                                if (err) {
                                        /* Undo the entry set up just above. */
                                        __free_page(sg_page(sg + i));
                                        sg_assign_page(sg + i, NULL);
                                        goto unlock;
                                }

                                sg[i].length = plen;
                                len -= plen;
                                ctx->used += plen;
                                copied += plen;
                                size -= plen;
                                sgl->cur++;
                        } while (len && sgl->cur < MAX_SGL_ENTS);

                        /* Non-zero if the last page was only partly filled. */
                        ctx->merge = plen & (PAGE_SIZE - 1);
                }

                if (!size)
                        sg_mark_end(sg + sgl->cur - 1);
        }

        err = 0;

        ctx->more = msg->msg_flags & MSG_MORE;

unlock:
        af_alg_data_wakeup(sk);
        release_sock(sk);

        return copied ?: err;
}
EXPORT_SYMBOL_GPL(af_alg_sendmsg);

/**
 * af_alg_free_resources - tear down a crypto request and its SGLs
 * @areq: Request holding the TX and RX SGL
 *
 * Passes @areq to af_alg_free_areq_sgls(), returns the request memory to the
 * socket via sock_kfree_s() and finally clears the context's inflight flag.
 * @areq must not be used by the caller afterwards.
 */
void af_alg_free_resources(struct af_alg_async_req *areq)
{
        struct sock *sk = areq->sk;
        struct af_alg_ctx *ctx = alg_sk(sk)->private;

        af_alg_free_areq_sgls(areq);
        sock_kfree_s(sk, areq, areq->areqlen);

        /* areq has been freed; do not dereference it past this point. */
        ctx->inflight = false;
}
EXPORT_SYMBOL_GPL(af_alg_free_resources);

/**
 * af_alg_async_cb - AIO callback handler
 * @data: async request completion data (a struct af_alg_async_req)
 * @err: if non-zero, error result to be returned via ki_complete();
 *       otherwise return the AIO output length via ki_complete().
 *
 * Releases the struct af_alg_async_req, drops one reference on its socket
 * and then signals completion on the request's kiocb.
 *
 * areq->outlen must hold the number of bytes produced by the operation
 * before this handler is invoked.
 */
void af_alg_async_cb(void *data, int err)
{
        struct af_alg_async_req *areq = data;
        struct kiocb *iocb = areq->iocb;
        struct sock *sk = areq->sk;
        /* Snapshot everything needed from areq: it is freed just below. */
        unsigned int outlen = areq->outlen;
        int res;

        af_alg_free_resources(areq);
        sock_put(sk);

        if (err)
                res = err;
        else
                res = (int)outlen;

        iocb->ki_complete(iocb, res);
}
EXPORT_SYMBOL_GPL(af_alg_async_cb);

/**
 * af_alg_poll - poll system call handler
 * @file: file pointer
 * @sock: socket to poll
 * @wait: poll_table
 *
 * Return: EPOLLIN | EPOLLRDNORM when the context holds data (ctx->used) or
 * no further data is announced (!ctx->more), combined with
 * EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND when af_alg_writable() reports room.
 */
__poll_t af_alg_poll(struct file *file, struct socket *sock,
                         poll_table *wait)
{
        struct sock *sk = sock->sk;
        struct af_alg_ctx *ctx = alg_sk(sk)->private;
        __poll_t events = 0;

        sock_poll_wait(file, sock, wait);

        if (ctx->used || !ctx->more)
                events |= EPOLLIN | EPOLLRDNORM;

        if (af_alg_writable(sk))
                events |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

        return events;
}
EXPORT_SYMBOL_GPL(af_alg_poll);

/**
 * af_alg_alloc_areq - allocate struct af_alg_async_req
 *
 * @sk: socket of connection to user space
 * @areqlen: size of struct af_alg_async_req + crypto_*_reqsize
 *
 * On success the context is marked as having a request in flight and the
 * request's list head, SGL pointers and bookkeeping fields are initialised.
 *
 * Return: allocated data structure, ERR_PTR(-EBUSY) if a request is already
 * in flight, or ERR_PTR(-ENOMEM) if the allocation fails
 */
struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk,
                                           unsigned int areqlen)
{
        struct af_alg_ctx *ctx = alg_sk(sk)->private;
        struct af_alg_async_req *req;

        /* Only one AIO request can be in flight. */
        if (ctx->inflight)
                return ERR_PTR(-EBUSY);

        req = sock_kmalloc(sk, areqlen, GFP_KERNEL);
        if (unlikely(!req))
                return ERR_PTR(-ENOMEM);

        ctx->inflight = true;

        /* Identity of the request. */
        req->sk = sk;
        req->areqlen = areqlen;

        /* RX side: empty list, first entry's table points at its own array. */
        INIT_LIST_HEAD(&req->rsgl_list);
        req->last_rsgl = NULL;
        req->first_rsgl.sgl.sgt.sgl = req->first_rsgl.sgl.sgl;

        /* TX side: nothing attached yet. */
        req->tsgl_entries = 0;
        req->tsgl = NULL;

        return req;
}
EXPORT_SYMBOL_GPL(af_alg_alloc_areq);

/**
 * af_alg_get_rsgl - create the RX SGL for the output data from the crypto
 *                     operation
 *
 * @sk: socket of connection to user space
 * @msg: user space message
 * @flags: flags used to invoke recvmsg with (not referenced in this function)
 * @areq: instance of the cryptographic request that will hold the RX SGL
 * @maxsize: maximum number of bytes to be pulled from user space
 * @outlen: number of bytes in the RX SGL
 *
 * Repeatedly extracts pages from @msg's iterator into RX SGL entries that
 * are appended to @areq->rsgl_list and chained to one another, until
 * @maxsize bytes are covered, @msg is exhausted, or af_alg_readable()
 * reports that no more buffers may be taken.
 *
 * @outlen is written only on the success path; on error it is left
 * untouched. Entries already placed on @areq->rsgl_list are not released
 * here when an error is returned.
 *
 * Return: 0 on success, < 0 upon error
 */
int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags,
                    struct af_alg_async_req *areq, size_t maxsize,
                    size_t *outlen)
{
        struct alg_sock *ask = alg_sk(sk);
        struct af_alg_ctx *ctx = ask->private;
        size_t len = 0;

        while (maxsize > len && msg_data_left(msg)) {
                struct af_alg_rsgl *rsgl;
                ssize_t err;
                size_t seglen;

                /* limit the amount of readable buffers */
                if (!af_alg_readable(sk))
                        break;

                seglen = min_t(size_t, (maxsize - len),
                               msg_data_left(msg));

                /*
                 * The first entry lives inside the request itself; any
                 * further entries are allocated against the socket.
                 */
                if (list_empty(&areq->rsgl_list)) {
                        rsgl = &areq->first_rsgl;
                } else {
                        rsgl = sock_kmalloc(sk, sizeof(*rsgl), GFP_KERNEL);
                        if (unlikely(!rsgl))
                                return -ENOMEM;
                }

                rsgl->sgl.need_unpin =
                        iov_iter_extract_will_pin(&msg->msg_iter);
                rsgl->sgl.sgt.sgl = rsgl->sgl.sgl;
                rsgl->sgl.sgt.nents = 0;
                rsgl->sgl.sgt.orig_nents = 0;
                /*
                 * Linked into the list before extraction, so the entry is
                 * already on rsgl_list if the extraction below fails.
                 */
                list_add_tail(&rsgl->list, &areq->rsgl_list);

                sg_init_table(rsgl->sgl.sgt.sgl, ALG_MAX_PAGES);
                err = extract_iter_to_sg(&msg->msg_iter, seglen, &rsgl->sgl.sgt,
                                         ALG_MAX_PAGES, 0);
                if (err < 0) {
                        /* Nothing was extracted for this entry. */
                        rsgl->sg_num_bytes = 0;
                        return err;
                }

                sg_mark_end(rsgl->sgl.sgt.sgl + rsgl->sgl.sgt.nents - 1);

                /* chain the new scatterlist with previous one */
                if (areq->last_rsgl)
                        af_alg_link_sg(&areq->last_rsgl->sgl, &rsgl->sgl);

                /* On success err is the number of bytes extracted. */
                areq->last_rsgl = rsgl;
                len += err;
                atomic_add(err, &ctx->rcvused);
                rsgl->sg_num_bytes = err;
        }

        *outlen = len;
        return 0;
}
EXPORT_SYMBOL_GPL(af_alg_get_rsgl);

/*
 * Module init: register the protocol, then the socket family. If the family
 * cannot be registered the protocol registration is rolled back.
 *
 * Returns 0 on success or the error code of the failing registration.
 */
static int __init af_alg_init(void)
{
        int ret;

        ret = proto_register(&alg_proto, 0);
        if (ret)
                return ret;

        ret = sock_register(&alg_family);
        if (ret != 0)
                proto_unregister(&alg_proto);

        return ret;
}

/*
 * Module exit: undo af_alg_init() in reverse order - the socket family is
 * unregistered first, then the protocol.
 */
static void __exit af_alg_exit(void)
{
        sock_unregister(PF_ALG);
        proto_unregister(&alg_proto);
}

/* Module entry/exit hooks, license and the AF_ALG protocol-family alias. */
module_init(af_alg_init);
module_exit(af_alg_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(AF_ALG);


























































   23 





































   23 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 */

#ifndef _ASM_X86_STACKTRACE_H
#define _ASM_X86_STACKTRACE_H

#include <linux/uaccess.h>
#include <linux/ptrace.h>

#include <asm/cpu_entry_area.h>
#include <asm/switch_to.h>

/*
 * Classification of the stack an address belongs to.
 *
 * STACK_TYPE_UNKNOWN is treated as "not on any known stack" by on_stack().
 * The values STACK_TYPE_EXCEPTION .. STACK_TYPE_EXCEPTION_LAST form a
 * contiguous range with one value per exception stack (N_EXCEPTION_STACKS
 * values in total).
 */
enum stack_type {
        STACK_TYPE_UNKNOWN,
        STACK_TYPE_TASK,
        STACK_TYPE_IRQ,
        STACK_TYPE_SOFTIRQ,
        STACK_TYPE_ENTRY,
        STACK_TYPE_EXCEPTION,
        STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
};

/*
 * Description of one stack.
 *
 * @type:    which kind of stack this is (see enum stack_type)
 * @begin:   lowest address of the stack; inclusive as used by on_stack()
 * @end:     end of the stack; exclusive as used by on_stack()
 * @next_sp: NOTE(review): presumably the stack pointer to continue with on
 *           the previously active stack - confirm against get_stack_info()
 */
struct stack_info {
        enum stack_type type;
        unsigned long *begin, *end, *next_sp;
};

bool in_task_stack(unsigned long *stack, struct task_struct *task,
                   struct stack_info *info);

bool in_entry_stack(unsigned long *stack, struct stack_info *info);

int get_stack_info(unsigned long *stack, struct task_struct *task,
                   struct stack_info *info, unsigned long *visit_mask);
bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
                            struct stack_info *info);

/*
 * Decide whether @stack lies in the guard area directly below one of the
 * current task's stacks: the address itself must not be on a stack, while
 * the address one page above it must be.
 *
 * @info is filled in by get_stack_info_noinstr() for whichever lookup ran
 * last. Returns true only for a guard-page hit.
 */
static __always_inline
bool get_stack_guard_info(unsigned long *stack, struct stack_info *info)
{
        bool on_real_stack;

        /* make sure it's not in the stack proper */
        on_real_stack = get_stack_info_noinstr(stack, current, info);
        if (on_real_stack)
                return false;

        /* but if it is in the page below it, we hit a guard */
        return get_stack_info_noinstr((void *)stack + PAGE_SIZE, current, info);
}

const char *stack_type_name(enum stack_type type);

/*
 * Check that the object [@addr, @addr + @len) lies entirely within the stack
 * described by @info.
 *
 * Returns false for STACK_TYPE_UNKNOWN, when @addr is outside
 * [info->begin, info->end), or when the object's end is not within
 * (info->begin, info->end].
 */
static inline bool on_stack(struct stack_info *info, void *addr, size_t len)
{
        void *lo = info->begin;
        void *hi = info->end;
        void *obj_end = addr + len;

        if (info->type == STACK_TYPE_UNKNOWN)
                return false;

        /* The start of the object must be on the stack ... */
        if (addr < lo || addr >= hi)
                return false;

        /* ... and so must its (exclusive) end. */
        return obj_end > lo && obj_end <= hi;
}

/*
 * Number of stack slots per output line: 8 on 32-bit x86, 4 otherwise.
 * NOTE(review): presumably consumed by the stack dump printing code, which
 * is not part of this header - confirm.
 */
#ifdef CONFIG_X86_32
#define STACKSLOTS_PER_LINE 8
#else
#define STACKSLOTS_PER_LINE 4
#endif

#ifdef CONFIG_FRAME_POINTER
/*
 * Pick the frame pointer to start from:
 *  - @regs given:      the bp value saved in @regs
 *  - @task is current: the frame address of the caller's frame
 *  - otherwise:        the address of the bp slot in the inactive task frame
 *                      found at @task's saved stack pointer
 */
static inline unsigned long *
get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
{
        unsigned long *fp;

        if (regs)
                fp = (unsigned long *)regs->bp;
        else if (task == current)
                fp = __builtin_frame_address(0);
        else
                fp = &((struct inactive_task_frame *)task->thread.sp)->bp;

        return fp;
}
#else
/* Without CONFIG_FRAME_POINTER there is no frame pointer to hand out. */
static inline unsigned long *
get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
{
        return NULL;
}
#endif /* CONFIG_FRAME_POINTER */

/*
 * Pick the stack pointer to start from:
 *  - @regs given:      the sp value saved in @regs
 *  - @task is current: the frame address of the caller's frame
 *  - otherwise:        the stack pointer saved in @task->thread
 */
static inline unsigned long *
get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
{
        unsigned long *sp;

        if (regs)
                sp = (unsigned long *)regs->sp;
        else if (task == current)
                sp = __builtin_frame_address(0);
        else
                sp = (unsigned long *)task->thread.sp;

        return sp;
}

/*
 * The form of the top of the frame on the stack: a pointer to the next
 * frame record followed by the return address, both native word sized.
 */
struct stack_frame {
        struct stack_frame *next_frame;
        unsigned long return_address;
};

/*
 * Same layout as struct stack_frame, but with both members fixed at 32 bits.
 * NOTE(review): presumably used for frames of 32-bit tasks - confirm at the
 * users of this type.
 */
struct stack_frame_ia32 {
    u32 next_frame;
    u32 return_address;
};

void show_opcodes(struct pt_regs *regs, const char *loglvl);
void show_ip(struct pt_regs *regs, const char *loglvl);
#endif /* _ASM_X86_STACKTRACE_H */

















































































































    4 


    3 



    2 




















































































































































































































    3 
    4 











































































    1 



    1 





































    1 

    1 
    1 
    1 





    1 

    1 


    1 
    1 






















    6 








    2 
    2 





    6 


    6 
    2 
    3 
    6 
    5 






    2 
    6 























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic pidhash and scalable, time-bounded PID allocator
 *
 * (C) 2002-2003 Nadia Yvette Chambers, IBM
 * (C) 2004 Nadia Yvette Chambers, Oracle
 * (C) 2002-2004 Ingo Molnar, Red Hat
 *
 * pid-structures are backing objects for tasks sharing a given ID to chain
 * against. There is very little to them aside from hashing them and
 * parking tasks using given ID's on a list.
 *
 * The hash is always changed with the tasklist_lock write-acquired,
 * and the hash is only accessed with the tasklist_lock at least
 * read-acquired, so there's no additional SMP locking needed here.
 *
 * We have a list of bitmap pages, which bitmaps represent the PID space.
 * Allocating and freeing PIDs is completely lockless. The worst-case
 * allocation scenario when all but one out of 1 million PIDs possible are
 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
 *
 * Pid namespaces:
 *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
 *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
 *     Many thanks to Oleg Nesterov for comments and help
 *
 */

#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/memblock.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
#include <linux/refcount.h>
#include <linux/anon_inodes.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
#include <linux/pidfs.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>

/*
 * Statically initialised struct pid: number 0 in init_pid_ns at level 0,
 * with a reference count of 1 and empty task lists.
 * NOTE(review): presumably the pid of the boot/idle task - confirm against
 * init_task.
 */
struct pid init_struct_pid = {
        .count                = REFCOUNT_INIT(1),
        .tasks                = {
                { .first = NULL },
                { .first = NULL },
                { .first = NULL },
        },
        .level                = 0,
        .numbers        = { {
                .nr                = 0,
                .ns                = &init_pid_ns,
        }, }
};

/*
 * Upper limit for PID numbers: alloc_pid() rejects requested ids >= pid_max
 * and passes pid_max as the end of the cyclic allocation range. The default
 * may be raised in pid_idr_init() depending on the number of possible CPUs.
 */
int pid_max = PID_MAX_DEFAULT;

/*
 * pid_max_max caps pid_max in pid_idr_init(); pid_max_min may be raised
 * there as well.
 * NOTE(review): presumably these are also the limits enforced when pid_max
 * is changed at runtime - confirm at the sysctl definition.
 */
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
/*
 * Pseudo filesystems start inode numbering after one. We use Reserved
 * PIDs as a natural offset.
 *
 * Incremented and stored into pid->ino by alloc_pid() while holding
 * pidmap_lock.
 */
static u64 pidfs_ino = RESERVED_PIDS;

/*
 * PID-map pages start out as NULL, they get allocated upon
 * first use and are never deallocated. This way a low pid_max
 * value does not cause lots of bitmaps to be allocated, but
 * the scheme scales to up to 4 million PIDs, runtime.
 *
 * NOTE(review): the paragraph above talks about bitmap pages while the
 * initializer below sets up an IDR - confirm whether it is still accurate.
 *
 * init_pid_ns is the level-0 PID namespace: init_task is its child reaper,
 * init_user_ns its owning user namespace, and pid_allocated starts out as
 * PIDNS_ADDING, the flag alloc_pid() tests before making a new pid visible.
 */
struct pid_namespace init_pid_ns = {
        .ns.count = REFCOUNT_INIT(2),
        .idr = IDR_INIT(init_pid_ns.idr),
        .pid_allocated = PIDNS_ADDING,
        .level = 0,
        .child_reaper = &init_task,
        .user_ns = &init_user_ns,
        .ns.inum = PROC_PID_INIT_INO,
#ifdef CONFIG_PID_NS
        .ns.ops = &pidns_operations,
#endif
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
        .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);

/*
 * Note: disable interrupts while the pidmap_lock is held as an
 * interrupt might come in and do read_lock(&tasklist_lock).
 *
 * If we don't disable interrupts there is a nasty deadlock between
 * detach_pid()->free_pid() and another cpu that does
 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
 * read_lock(&tasklist_lock);
 *
 * After we clean up the tasklist_lock and know there are no
 * irq handlers that take it we can leave the interrupts enabled.
 * For now it is easier to be safe than to prove it can't happen.
 *
 * In this file the lock is held around every modification of a namespace's
 * idr and pid_allocated counter (alloc_pid(), free_pid(),
 * disable_pid_allocation()) and around the pidfs_ino increment.
 */

static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);

/*
 * Drop one reference on @pid (NULL is accepted and ignored). When the last
 * reference goes away the pid is returned to its namespace's slab cache and
 * the reference held on that namespace is dropped.
 */
void put_pid(struct pid *pid)
{
        struct pid_namespace *owner;

        if (pid == NULL)
                return;

        /* Read the namespace before the final reference may free @pid. */
        owner = pid->numbers[pid->level].ns;
        if (!refcount_dec_and_test(&pid->count))
                return;

        kmem_cache_free(owner->pid_cachep, pid);
        put_pid_ns(owner);
}
EXPORT_SYMBOL_GPL(put_pid);

/* RCU callback queued by free_pid(): drops the reference on the pid embedding @rhp. */
static void delayed_put_pid(struct rcu_head *rhp)
{
        put_pid(container_of(rhp, struct pid, rcu));
}

/*
 * Release every number @pid holds, one per namespace level from 0 up to
 * pid->level, and schedule the final put_pid() via call_rcu().
 *
 * For each namespace the pid_allocated counter is decremented under
 * pidmap_lock and the number is removed from that namespace's idr.
 */
void free_pid(struct pid *pid)
{
        /* We can be called with write_lock_irq(&tasklist_lock) held */
        int i;
        unsigned long flags;

        spin_lock_irqsave(&pidmap_lock, flags);
        for (i = 0; i <= pid->level; i++) {
                struct upid *upid = pid->numbers + i;
                struct pid_namespace *ns = upid->ns;
                /* The switch acts on the counter value after the decrement. */
                switch (--ns->pid_allocated) {
                case 2:
                case 1:
                        /* When all that is left in the pid namespace
                         * is the reaper wake up the reaper.  The reaper
                         * may be sleeping in zap_pid_ns_processes().
                         */
                        wake_up_process(ns->child_reaper);
                        break;
                case PIDNS_ADDING:
                        /* Handle a fork failure of the first process */
                        WARN_ON(ns->child_reaper);
                        ns->pid_allocated = 0;
                        break;
                }

                idr_remove(&ns->idr, upid->nr);
        }
        spin_unlock_irqrestore(&pidmap_lock, flags);

        /* The struct pid itself is released only via the RCU callback. */
        call_rcu(&pid->rcu, delayed_put_pid);
}

/*
 * alloc_pid - allocate a struct pid with one number per namespace level
 * @ns:           innermost namespace the new pid lives in
 * @set_tid:      optional array of requested numbers, innermost namespace
 *                first
 * @set_tid_size: number of entries in @set_tid; 0 for normal allocation
 *
 * Phase 1 walks from @ns up through its parents and reserves a number in
 * each namespace's idr with a NULL pointer stored, so lookups cannot see the
 * half-initialised pid. Phase 2 initialises the pid and, under pidmap_lock,
 * publishes it in every idr - unless @ns no longer has PIDNS_ADDING set.
 *
 * Return: the new pid, or an ERR_PTR():
 *   -EINVAL  @set_tid_size too large, a requested id outside [1, pid_max),
 *            or an id != 1 requested in a namespace without a child reaper
 *   -EPERM   caller may not choose ids in one of the namespaces
 *   -EEXIST  a requested id is already in use
 *   -EAGAIN  no free number in the range
 *   -ENOMEM  allocation failure, or @ns no longer accepts new pids
 */
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
                      size_t set_tid_size)
{
        struct pid *pid;
        enum pid_type type;
        int i, nr;
        struct pid_namespace *tmp;
        struct upid *upid;
        int retval = -ENOMEM;

        /*
         * set_tid_size contains the size of the set_tid array. Starting at
         * the most nested currently active PID namespace it tells alloc_pid()
         * which PID to set for a process in that most nested PID namespace
         * up to set_tid_size PID namespaces. It does not have to set the PID
         * for a process in all nested PID namespaces but set_tid_size must
         * never be greater than the current ns->level + 1.
         */
        if (set_tid_size > ns->level + 1)
                return ERR_PTR(-EINVAL);

        pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
        if (!pid)
                return ERR_PTR(retval);

        tmp = ns;
        pid->level = ns->level;

        /* Phase 1: reserve a number at every level, innermost first. */
        for (i = ns->level; i >= 0; i--) {
                int tid = 0;

                if (set_tid_size) {
                        tid = set_tid[ns->level - i];

                        retval = -EINVAL;
                        if (tid < 1 || tid >= pid_max)
                                goto out_free;
                        /*
                         * Also fail if a PID != 1 is requested and
                         * no PID 1 exists.
                         */
                        if (tid != 1 && !tmp->child_reaper)
                                goto out_free;
                        retval = -EPERM;
                        if (!checkpoint_restore_ns_capable(tmp->user_ns))
                                goto out_free;
                        set_tid_size--;
                }

                idr_preload(GFP_KERNEL);
                spin_lock_irq(&pidmap_lock);

                if (tid) {
                        nr = idr_alloc(&tmp->idr, NULL, tid,
                                       tid + 1, GFP_ATOMIC);
                        /*
                         * If ENOSPC is returned it means that the PID is
                         * already in use. Return EEXIST in that case.
                         */
                        if (nr == -ENOSPC)
                                nr = -EEXIST;
                } else {
                        int pid_min = 1;
                        /*
                         * init really needs pid 1, but after reaching the
                         * maximum wrap back to RESERVED_PIDS
                         */
                        if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
                                pid_min = RESERVED_PIDS;

                        /*
                         * Store a null pointer so find_pid_ns does not find
                         * a partially initialized PID (see below).
                         */
                        nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
                                              pid_max, GFP_ATOMIC);
                }
                spin_unlock_irq(&pidmap_lock);
                idr_preload_end();

                if (nr < 0) {
                        retval = (nr == -ENOSPC) ? -EAGAIN : nr;
                        goto out_free;
                }

                pid->numbers[i].nr = nr;
                pid->numbers[i].ns = tmp;
                tmp = tmp->parent;
        }

        /*
         * ENOMEM is not the most obvious choice especially for the case
         * where the child subreaper has already exited and the pid
         * namespace denies the creation of any new processes. But ENOMEM
         * is what we have exposed to userspace for a long time and it is
         * documented behavior for pid namespaces. So we can't easily
         * change it even if there were an error code better suited.
         */
        retval = -ENOMEM;

        /* Phase 2: initialise the pid, then publish it under the lock. */
        get_pid_ns(ns);
        refcount_set(&pid->count, 1);
        spin_lock_init(&pid->lock);
        for (type = 0; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_HEAD(&pid->tasks[type]);

        init_waitqueue_head(&pid->wait_pidfd);
        INIT_HLIST_HEAD(&pid->inodes);

        upid = pid->numbers + ns->level;
        spin_lock_irq(&pidmap_lock);
        /* PIDNS_ADDING is cleared by disable_pid_allocation(). */
        if (!(ns->pid_allocated & PIDNS_ADDING))
                goto out_unlock;
        pid->stashed = NULL;
        pid->ino = ++pidfs_ino;
        for ( ; upid >= pid->numbers; --upid) {
                /* Make the PID visible to find_pid_ns. */
                idr_replace(&upid->ns->idr, pid, upid->nr);
                upid->ns->pid_allocated++;
        }
        spin_unlock_irq(&pidmap_lock);

        return pid;

out_unlock:
        spin_unlock_irq(&pidmap_lock);
        put_pid_ns(ns);

out_free:
        /*
         * i is the level that failed (or -1 once the loop has completed), so
         * ++i starts at the first level that already holds a reservation.
         */
        spin_lock_irq(&pidmap_lock);
        while (++i <= ns->level) {
                upid = pid->numbers + i;
                idr_remove(&upid->ns->idr, upid->nr);
        }

        /* On failure to allocate the first pid, reset the state */
        if (ns->pid_allocated == PIDNS_ADDING)
                idr_set_cursor(&ns->idr, 0);

        spin_unlock_irq(&pidmap_lock);

        kmem_cache_free(ns->pid_cachep, pid);
        return ERR_PTR(retval);
}

/*
 * Clear PIDNS_ADDING in @ns under pidmap_lock. alloc_pid() checks this flag
 * before publishing a pid and fails with -ENOMEM once it is cleared.
 */
void disable_pid_allocation(struct pid_namespace *ns)
{
        spin_lock_irq(&pidmap_lock);
        ns->pid_allocated &= ~PIDNS_ADDING;
        spin_unlock_irq(&pidmap_lock);
}

/*
 * Look up number @nr in the idr of @ns. Returns whatever is stored there:
 * the struct pid, or NULL if the number is unused or only reserved by an
 * alloc_pid() that has not published its pid yet.
 */
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
        return idr_find(&ns->idr, nr);
}
EXPORT_SYMBOL_GPL(find_pid_ns);

/* find_pid_ns() in the active PID namespace of the current task. */
struct pid *find_vpid(int nr)
{
        return find_pid_ns(nr, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(find_vpid);

/*
 * Return the address of the slot holding @task's pid of the given @type:
 * PIDTYPE_PID is kept in the task itself, every other type in the task's
 * signal struct.
 */
static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
{
        if (type == PIDTYPE_PID)
                return &task->thread_pid;

        return &task->signal->pids[type];
}

/*
 * attach_pid() must be called with the tasklist_lock write-held.
 *
 * Links @task into the @type task list of the pid currently stored in the
 * task's slot for @type (see task_pid_ptr()); that slot must already be set.
 */
void attach_pid(struct task_struct *task, enum pid_type type)
{
        struct pid *pid = *task_pid_ptr(task, type);
        hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
}

/*
 * Unlink @task from its current pid of the given @type and store @new in the
 * task's slot (without linking @task to @new - see change_pid()).
 *
 * If the old pid is left with no task of any type it is freed.
 */
static void __change_pid(struct task_struct *task, enum pid_type type,
                        struct pid *new)
{
        struct pid **pid_ptr = task_pid_ptr(task, type);
        struct pid *pid;
        int tmp;

        /* Remember the old pid; the slot is overwritten below. */
        pid = *pid_ptr;

        hlist_del_rcu(&task->pid_links[type]);
        *pid_ptr = new;

        if (type == PIDTYPE_PID) {
                /* No task may remain on the old pid's PIDTYPE_PID list. */
                WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
                wake_up_all(&pid->wait_pidfd);
        }

        /* Keep the old pid alive while any type still has a task on it. */
        for (tmp = PIDTYPE_MAX; --tmp >= 0; )
                if (pid_has_task(pid, tmp))
                        return;

        free_pid(pid);
}

/*
 * Unlink @task from its pid of the given @type and clear the task's slot;
 * the pid is freed if no task uses it any more (see __change_pid()).
 */
void detach_pid(struct task_struct *task, enum pid_type type)
{
        __change_pid(task, type, NULL);
}

/*
 * Move @task from its current pid of the given @type to @pid: unlink from
 * the old pid (possibly freeing it), then link into @pid's task list.
 */
void change_pid(struct task_struct *task, enum pid_type type,
                struct pid *pid)
{
        __change_pid(task, type, pid);
        attach_pid(task, type);
}

/*
 * Swap the thread ids of @left and @right: the PIDTYPE_PID list heads of the
 * two pids, the tasks' thread_pid pointers and their cached ->pid numbers.
 *
 * NOTE(review): locking is not visible here; presumably the caller holds
 * tasklist_lock for writing as required by attach_pid() - confirm.
 */
void exchange_tids(struct task_struct *left, struct task_struct *right)
{
        struct pid *pid1 = left->thread_pid;
        struct pid *pid2 = right->thread_pid;
        struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
        struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];

        /* Swap the single entry tid lists */
        hlists_swap_heads_rcu(head1, head2);

        /* Swap the per task_struct pid */
        rcu_assign_pointer(left->thread_pid, pid2);
        rcu_assign_pointer(right->thread_pid, pid1);

        /* Swap the cached value */
        WRITE_ONCE(left->pid, pid_nr(pid2));
        WRITE_ONCE(right->pid, pid_nr(pid1));
}

/*
 * transfer_pid is an optimization of attach_pid(new), detach_pid(old)
 *
 * @new takes over @old's position in the pid's @type task list. Not meant
 * for PIDTYPE_PID: that case triggers a one-time warning.
 */
void transfer_pid(struct task_struct *old, struct task_struct *new,
                           enum pid_type type)
{
        WARN_ON_ONCE(type == PIDTYPE_PID);
        hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}

/*
 * Return the first task on @pid's task list for @type, or NULL when @pid is
 * NULL or the list is empty. No reference is taken on the task; the list
 * head is read under RCU or with tasklist_lock held.
 */
struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
        struct hlist_node *node;

        if (!pid)
                return NULL;

        node = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
                                     lockdep_tasklist_lock_is_held());
        if (!node)
                return NULL;

        return hlist_entry(node, struct task_struct, pid_links[(type)]);
}
EXPORT_SYMBOL(pid_task);

/*
 * Must be called under rcu_read_lock().
 *
 * Returns the task whose PIDTYPE_PID pid has number @nr in @ns, or NULL if
 * there is none. No reference is taken on the returned task.
 */
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "find_task_by_pid_ns() needs rcu_read_lock() protection");
        return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}

/*
 * find_task_by_pid_ns() in the current task's active PID namespace; the
 * same rcu_read_lock() requirement applies.
 */
struct task_struct *find_task_by_vpid(pid_t vnr)
{
        return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}

/*
 * Look up the task with number @nr in the current task's active PID
 * namespace and take a reference on it. Returns NULL if no such task
 * exists; otherwise the caller owns the reference.
 */
struct task_struct *find_get_task_by_vpid(pid_t nr)
{
        struct task_struct *tsk;

        rcu_read_lock();
        tsk = find_task_by_vpid(nr);
        if (tsk != NULL)
                get_task_struct(tsk);
        rcu_read_unlock();

        return tsk;
}

/*
 * Read @task's pid of the given @type under rcu_read_lock() and pass it
 * through get_pid(); the result of get_pid() is returned to the caller.
 */
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
        struct pid *pid;
        rcu_read_lock();
        pid = get_pid(rcu_dereference(*task_pid_ptr(task, type)));
        rcu_read_unlock();
        return pid;
}
EXPORT_SYMBOL_GPL(get_task_pid);

/*
 * Return the first task attached to @pid for @type with a reference taken,
 * or NULL if there is none (see pid_task()). The lookup and the reference
 * grab happen inside one RCU read-side section.
 */
struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
        struct task_struct *tsk;

        rcu_read_lock();
        tsk = pid_task(pid, type);
        if (tsk != NULL)
                get_task_struct(tsk);
        rcu_read_unlock();

        return tsk;
}
EXPORT_SYMBOL_GPL(get_pid_task);

/*
 * Look up number @nr in the current task's active PID namespace under
 * rcu_read_lock() and pass the result through get_pid(). sys_pidfd_open()
 * treats a NULL return as "no such pid" and pairs a non-NULL return with
 * put_pid().
 */
struct pid *find_get_pid(pid_t nr)
{
        struct pid *pid;

        rcu_read_lock();
        pid = get_pid(find_vpid(nr));
        rcu_read_unlock();

        return pid;
}
EXPORT_SYMBOL_GPL(find_get_pid);

/**
 * pid_nr_ns - number of a pid as seen from a given namespace
 * @pid: pid to translate; may be NULL
 * @ns:  namespace to translate into; may be NULL
 *
 * Return: the number @pid has in @ns, or 0 when @pid is NULL, @ns is NULL,
 * @ns is nested deeper than @pid, or the entry of @pid at @ns->level belongs
 * to a different namespace.
 */
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
        struct upid *upid;
        pid_t nr = 0;

        /*
         * Callers in this file hand in task_active_pid_ns(current) without
         * checking it, and that lookup can come back NULL once the task's
         * pid has been detached. Treat a missing namespace like a missing
         * pid instead of dereferencing it.
         */
        if (pid && ns && ns->level <= pid->level) {
                upid = &pid->numbers[ns->level];
                if (upid->ns == ns)
                        nr = upid->nr;
        }
        return nr;
}
EXPORT_SYMBOL_GPL(pid_nr_ns);

/* pid_nr_ns() with the active PID namespace of the current task. */
pid_t pid_vnr(struct pid *pid)
{
        return pid_nr_ns(pid, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(pid_vnr);

/*
 * Number of @task's pid of the given @type as seen from @ns. A NULL @ns
 * selects the active PID namespace of the current task. The pid pointer is
 * read and translated inside one RCU read-side section.
 *
 * Returns whatever pid_nr_ns() yields for that pid and namespace.
 */
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
                        struct pid_namespace *ns)
{
        pid_t nr;

        rcu_read_lock();
        if (ns == NULL)
                ns = task_active_pid_ns(current);
        nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
        rcu_read_unlock();

        return nr;
}
EXPORT_SYMBOL(__task_pid_nr_ns);

/* Namespace of @tsk's pid, obtained as ns_of_pid(task_pid(@tsk)). */
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
        return ns_of_pid(task_pid(tsk));
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);

/*
 * Used by proc to find the first pid that is greater than or equal to nr.
 *
 * If there is a pid at nr this function is exactly the same as find_pid_ns.
 *
 * @nr is passed to idr_get_next() by address, but being a by-value
 * parameter any update to it is not visible to the caller.
 */
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
        return idr_get_next(&ns->idr, &nr);
}
EXPORT_SYMBOL_GPL(find_ge_pid);

struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
        struct fd f;
        struct pid *pid;

        f = fdget(fd);
        if (!f.file)
                return ERR_PTR(-EBADF);

        pid = pidfd_pid(f.file);
        if (!IS_ERR(pid)) {
                get_pid(pid);
                *flags = f.file->f_flags;
        }

        fdput(f);
        return pid;
}

/**
 * pidfd_get_task() - Get the task associated with a pidfd
 *
 * @pidfd: pidfd for which to get the task
 * @flags: on success, receives the file flags of @pidfd; not written on
 *         error
 *
 * Resolves @pidfd to its struct pid and from there to the PIDTYPE_TGID task.
 * The function takes a reference on the returned task. The caller is
 * responsible for releasing that reference.
 *
 * Return: On success, the task_struct associated with the pidfd.
 *           On error, an ERR_PTR(): the error from pidfd_get_pid(), or
 *           -ESRCH if the pid has no such task.
 */
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
{
        unsigned int f_flags;
        struct task_struct *task;
        struct pid *pid = pidfd_get_pid(pidfd, &f_flags);

        if (IS_ERR(pid))
                return ERR_CAST(pid);

        /* The pid reference is only needed for the lookup itself. */
        task = get_pid_task(pid, PIDTYPE_TGID);
        put_pid(pid);

        if (task) {
                *flags = f_flags;
                return task;
        }

        return ERR_PTR(-ESRCH);
}

/**
 * pidfd_create() - Create a new pid file descriptor.
 *
 * @pid:   struct pid that the pidfd will reference
 * @flags: flags to pass
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set.
 * The descriptor and its file come from pidfd_prepare(); on success the
 * file is installed into the caller's fd table here.
 *
 * Note, that this function can only be called after the fd table has
 * been unshared to avoid leaking the pidfd to the new process.
 *
 * This symbol should not be explicitly exported to loadable modules.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned.
 */
static int pidfd_create(struct pid *pid, unsigned int flags)
{
        struct file *file;
        int fd = pidfd_prepare(pid, flags, &file);

        if (fd >= 0)
                fd_install(fd, file);

        return fd;
}

/**
 * sys_pidfd_open() - Open new pid file descriptor.
 *
 * @pid:   pid for which to retrieve a pidfd
 * @flags: flags to pass; only PIDFD_NONBLOCK and PIDFD_THREAD are accepted
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set for
 * the task identified by @pid. Without PIDFD_THREAD flag the target task
 * must be a thread-group leader.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned: -EINVAL for
 *         unknown flag bits or a non-positive @pid, -ESRCH if @pid cannot
 *         be found, or the error from creating the pidfd.
 */
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
        struct pid *target;
        int pidfd;

        /* Reject unknown flag bits and non-positive pids alike. */
        if ((flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD)) || pid <= 0)
                return -EINVAL;

        target = find_get_pid(pid);
        if (!target)
                return -ESRCH;

        pidfd = pidfd_create(target, flags);
        /* Drop the reference taken by find_get_pid(). */
        put_pid(target);

        return pidfd;
}

/*
 * Boot-time setup of pid allocation for the initial pid namespace:
 * scale pid_max and pid_max_min with the number of possible CPUs, then
 * initialise the namespace's IDR and create its "pid" slab cache.
 */
void __init pid_idr_init(void)
{
        const unsigned int nr_cpus = num_possible_cpus();
        int wanted_max;
        int wanted_min;

        /* Verify no one has done anything silly: */
        BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);

        /* Raise the default with the CPU count, capped at pid_max_max. */
        wanted_max = max_t(int, pid_max, PIDS_PER_CPU_DEFAULT * nr_cpus);
        pid_max = min(pid_max_max, wanted_max);

        /* The minimum is only ever raised, never lowered. */
        wanted_min = max_t(int, pid_max_min, PIDS_PER_CPU_MIN * nr_cpus);
        pid_max_min = wanted_min;

        pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);

        idr_init(&init_pid_ns.idr);

        /* Objects are sized for a struct pid with one numbers[] element. */
        init_pid_ns.pid_cachep = kmem_cache_create("pid",
                        struct_size_t(struct pid, numbers, 1),
                        __alignof__(struct pid),
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
                        NULL);
}

/*
 * Fetch the file behind descriptor @fd of @task.
 *
 * The lookup runs with @task's exec_update_lock held for reading and only
 * if the caller passes the PTRACE_MODE_ATTACH_REALCREDS access check.
 *
 * Returns the file on success, otherwise an ERR_PTR(): the error from a
 * killed lock wait, -EPERM when access is denied, and -ESRCH or -EBADF
 * when no file was found (see below).
 */
static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
        struct file *found = ERR_PTR(-EPERM);
        int err = down_read_killable(&task->signal->exec_update_lock);

        if (err)
                return ERR_PTR(err);

        if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
                found = fget_task(task, fd);

        up_read(&task->signal->exec_update_lock);

        if (found)
                return found;

        /*
         * No file came back. The target thread may be exiting, in which
         * case it is in one of three stages:
         * 1. before exit_signals(): a real fd is still returned
         * 2. before exit_files() takes the task_lock(): a real fd is
         *    still returned
         * 3. after exit_files() releases task_lock(): ->files is NULL and
         *    PF_EXITING is set, since exit_signals() set it earlier.
         * Only stage 3 ends up here with PF_EXITING set. The lookup then
         * looks like EBADF, but the task has already freed its files
         * struct, so the honest answer is ESRCH and we report that.
         */
        return ERR_PTR((task->flags & PF_EXITING) ? -ESRCH : -EBADF);
}

/*
 * Copy descriptor @fd of the task identified by @pid into the calling
 * process.
 *
 * Returns the new descriptor number from receive_fd() (requested with
 * O_CLOEXEC), -ESRCH if @pid has no task of type PIDTYPE_PID, or the
 * error reported by __pidfd_fget().
 */
static int pidfd_getfd(struct pid *pid, int fd)
{
        struct task_struct *target = get_pid_task(pid, PIDTYPE_PID);
        struct file *remote;
        int new_fd;

        if (!target)
                return -ESRCH;

        remote = __pidfd_fget(target, fd);
        /* The task is only needed for the lookup itself. */
        put_task_struct(target);
        if (IS_ERR(remote))
                return PTR_ERR(remote);

        new_fd = receive_fd(remote, NULL, O_CLOEXEC);
        /* Release our handle on the file whether or not it was installed. */
        fput(remote);

        return new_fd;
}

/**
 * sys_pidfd_getfd() - Get a file descriptor from another process
 *
 * @pidfd:        the pidfd file descriptor of the process
 * @fd:                the file descriptor number to get
 * @flags:        flags on how to get the fd (reserved, must be zero)
 *
 * Duplicates file descriptor @fd of the process referred to by @pidfd
 * into the caller. The caller needs to be able to ptrace that process.
 * The process whose descriptor is copied is otherwise unaffected.
 *
 * Return: On success, a cloexec file descriptor is returned.
 *         On error, a negative errno number will be returned: -EINVAL if
 *         @flags is non-zero, -EBADF if @pidfd is not an open descriptor,
 *         otherwise the error from resolving the pidfd or fetching @fd.
 */
SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
                unsigned int, flags)
{
        struct fd pidfd_ref;
        struct pid *target;
        int ret;

        /* No flags are defined yet, so any set bit is an error. */
        if (flags)
                return -EINVAL;

        pidfd_ref = fdget(pidfd);
        if (!pidfd_ref.file)
                return -EBADF;

        target = pidfd_pid(pidfd_ref.file);
        ret = IS_ERR(target) ? PTR_ERR(target) : pidfd_getfd(target, fd);

        fdput(pidfd_ref);
        return ret;
}










    2 

























    2 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/uaccess.h>
#include <linux/kernel.h>

#include <asm/vsyscall.h>

#ifdef CONFIG_X86_64
/*
 * Decide whether a nofault kernel read starting at @unsafe_src may go
 * ahead. Only the start address is examined; @size is not used.
 */
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
{
        const unsigned long addr = (unsigned long)unsafe_src;

        /*
         * Refuse two ranges:
         *  - userspace addresses, which covers normal userspace and the
         *    userspace guard page;
         *  - the vsyscall page: it sits above TASK_SIZE_MAX but is
         *    usually considered a user space address, and reading from
         *    it may cause an unhandled fault in certain cases.
         */
        if (addr < TASK_SIZE_MAX + PAGE_SIZE || is_vsyscall_vaddr(addr))
                return false;

        /*
         * Until 'x86_virt_bits' is initialized during early boot,
         * everything else is allowed; early exception handlers need
         * this for instruction decoding. Afterwards the address has to
         * be canonical.
         */
        return !boot_cpu_data.x86_virt_bits ||
               __is_canonical_address(addr, boot_cpu_data.x86_virt_bits);
}
#else
/*
 * Non-64-bit variant: a nofault kernel read is allowed for any start
 * address at or above TASK_SIZE_MAX. @size is not used.
 */
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
{
        const unsigned long addr = (unsigned long)unsafe_src;

        return addr >= TASK_SIZE_MAX;
}
#endif














































































































































































































    1 





    1 







































    1 




    1 









































































    1 




    1 
    1 






    1 




























































































































































































































































































































































    1 




    1 

























    1 



    1 
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 









    1 










































































    1 



    1 











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
/*
 * net/tipc/node.c: TIPC node management routines
 *
 * Copyright (c) 2000-2006, 2012-2016, Ericsson AB
 * Copyright (c) 2005-2006, 2010-2014, Wind River Systems
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "core.h"
#include "link.h"
#include "node.h"
#include "name_distr.h"
#include "socket.h"
#include "bcast.h"
#include "monitor.h"
#include "discover.h"
#include "netlink.h"
#include "trace.h"
#include "crypto.h"

/* NOTE(review): 0x10000 is one past the largest 16-bit value; presumably
 * used as a "no valid signature" marker — confirm against the users.
 */
#define INVALID_NODE_SIG        0x10000
/* NOTE(review): presumably the delay before a down node is deleted
 * (cf. tipc_node::delete_at); unit looks like milliseconds — confirm.
 */
#define NODE_CLEANUP_AFTER        300000

/* Flags used to take different actions according to flag type.
 * Each flag is a distinct single bit, so several can be combined in one
 * bit mask (see tipc_node::action_flags).
 * TIPC_NOTIFY_NODE_DOWN: notify node is down
 * TIPC_NOTIFY_NODE_UP: notify node is up
 * TIPC_NOTIFY_LINK_UP: notify link is up
 * TIPC_NOTIFY_LINK_DOWN: notify link is down
 * NOTE(review): the link flags are presumably where the link state name
 * gets published or withdrawn — confirm against the code consuming
 * action_flags.
 */
enum {
        TIPC_NOTIFY_NODE_DOWN                = (1 << 3),
        TIPC_NOTIFY_NODE_UP                = (1 << 4),
        TIPC_NOTIFY_LINK_UP                = (1 << 6),
        TIPC_NOTIFY_LINK_DOWN                = (1 << 7)
};

/**
 * struct tipc_link_entry - per-bearer link slot of a node
 * @link: the link occupying this slot; handed out by node_active_link()
 * @lock: spinlock, one per link entry
 * @mtu: MTU value reported for this link by tipc_node_get_mtu()
 * @inputq: socket buffer queue (presumably received messages — confirm)
 * @maddr: media address (presumably the peer's address on this bearer —
 *         confirm)
 *
 * struct tipc_node holds MAX_BEARERS of these in links[], indexed by
 * bearer id.
 */
struct tipc_link_entry {
        struct tipc_link *link;
        spinlock_t lock; /* per link */
        u32 mtu;
        struct sk_buff_head inputq;
        struct tipc_media_addr maddr;
};

/**
 * struct tipc_bclink_entry - broadcast link state kept per node
 * @link: the broadcast link
 * @inputq1: socket buffer queue (role not visible here — TODO document)
 * @arrvq: socket buffer queue (presumably arrival queue — confirm)
 * @inputq2: socket buffer queue (role not visible here — TODO document)
 * @namedq: socket buffer queue (presumably name distribution messages —
 *          confirm)
 * @named_rcv_nxt: 16-bit counter (presumably next expected sequence number
 *                 on @namedq — confirm)
 * @named_open: boolean flag (presumably whether @namedq accepts messages —
 *              confirm)
 *
 * Embedded in struct tipc_node as bc_entry.
 */
struct tipc_bclink_entry {
        struct tipc_link *link;
        struct sk_buff_head inputq1;
        struct sk_buff_head arrvq;
        struct sk_buff_head inputq2;
        struct sk_buff_head namedq;
        u16 named_rcv_nxt;
        bool named_open;
};

/**
 * struct tipc_node - TIPC node structure
 * @addr: network address of node
 * @kref: reference counter to node object
 * @lock: rwlock governing access to structure
 * @net: the applicable net namespace
 * @hash: links to adjacent nodes in unsorted hash chain
 * @active_links: bearer ids of active links, used as index into links[] array
 * @links: array containing references to all links to node
 * @bc_entry: broadcast link entry
 * @action_flags: bit mask of different types of node actions
 * @state: connectivity state vs peer node
 * @preliminary: a preliminary node or not
 * @failover_sent: failover sent or not
 * @sync_point: sequence number where synch/failover is finished
 * @list: links to adjacent nodes in sorted list of cluster's nodes
 * @working_links: number of working links to node (both active and standby)
 * @link_cnt: number of links to node
 * @capabilities: bitmap, indicating peer node's functional capabilities
 * @signature: node instance identifier
 * @link_id: local and remote bearer ids of changing link, if any
 * @peer_id: 128-bit ID of peer
 * @peer_id_string: ID string of peer
 * @publ_list: list of publications
 * @conn_sks: list of connections (FIXME)
 * @timer: node's keepalive timer
 * @keepalive_intv: keepalive interval in milliseconds
 * @rcu: rcu struct for tipc_node
 * @delete_at: indicates the time for deleting a down node
 * @peer_net: peer's net namespace
 * @peer_hash_mix: hash for this peer (FIXME)
 * @crypto_rx: RX crypto handler
 */
struct tipc_node {
        u32 addr;
        struct kref kref;
        rwlock_t lock;
        struct net *net;
        struct hlist_node hash;
        /* slot is picked with (selector & 1); may hold INVALID_BEARER_ID */
        int active_links[2];
        /* indexed by bearer id */
        struct tipc_link_entry links[MAX_BEARERS];
        struct tipc_bclink_entry bc_entry;
        int action_flags;
        struct list_head list;
        int state;
        bool preliminary;
        bool failover_sent;
        u16 sync_point;
        int link_cnt;
        u16 working_links;
        u16 capabilities;
        u32 signature;
        u32 link_id;
        u8 peer_id[16];
        char peer_id_string[NODE_ID_STR_LEN];
        struct list_head publ_list;
        struct list_head conn_sks;
        unsigned long keepalive_intv;
        struct timer_list timer;
        struct rcu_head rcu;
        unsigned long delete_at;
        struct net *peer_net;
        u32 peer_hash_mix;
        /* field exists only when CONFIG_TIPC_CRYPTO is enabled */
#ifdef CONFIG_TIPC_CRYPTO
        struct tipc_crypto *crypto_rx;
#endif
};

/* Node FSM states and events:
 *
 * The first enum lists the states. Each name gives the connectivity of
 * this node ("SELF") and of the peer as seen from here.
 * NOTE(review): presumably these are the values kept in tipc_node::state
 * ("connectivity state vs peer node") — confirm.
 */
enum {
        SELF_DOWN_PEER_DOWN    = 0xdd,
        SELF_UP_PEER_UP        = 0xaa,
        SELF_DOWN_PEER_LEAVING = 0xd1,
        SELF_UP_PEER_COMING    = 0xac,
        SELF_COMING_PEER_UP    = 0xca,
        SELF_LEAVING_PEER_DOWN = 0x1d,
        NODE_FAILINGOVER       = 0xf0,
        NODE_SYNCHING          = 0xcc
};

/* Node FSM events.
 * NOTE(review): presumably these are the 'evt' values passed to
 * tipc_node_fsm_evt() — confirm against its definition.
 */
enum {
        SELF_ESTABL_CONTACT_EVT = 0xece,
        SELF_LOST_CONTACT_EVT   = 0x1ce,
        PEER_ESTABL_CONTACT_EVT = 0x9ece,
        PEER_LOST_CONTACT_EVT   = 0x91ce,
        NODE_FAILOVER_BEGIN_EVT = 0xfbe,
        NODE_FAILOVER_END_EVT   = 0xfee,
        NODE_SYNCH_BEGIN_EVT    = 0xcbe,
        NODE_SYNCH_END_EVT      = 0xcee
};

/* Forward declarations of file-local (static) helpers, so that code
 * further up in this file can call them before their definitions.
 */
static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id,
                                  struct sk_buff_head *xmitq,
                                  struct tipc_media_addr **maddr);
static void tipc_node_link_down(struct tipc_node *n, int bearer_id,
                                bool delete);
static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq);
static void tipc_node_delete(struct tipc_node *node);
static void tipc_node_timeout(struct timer_list *t);
static void tipc_node_fsm_evt(struct tipc_node *n, int evt);
static struct tipc_node *tipc_node_find(struct net *net, u32 addr);
static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id);
static bool node_is_up(struct tipc_node *n);
static void tipc_node_delete_from_list(struct tipc_node *node);

/**
 * struct tipc_sock_conn - record of one socket connection
 * @port: port number (presumably the local end — confirm)
 * @peer_port: port number of the peer end
 * @peer_node: node address of the peer end
 * @list: list linkage (presumably into tipc_node::conn_sks — confirm)
 */
struct tipc_sock_conn {
        u32 port;
        u32 peer_port;
        u32 peer_node;
        struct list_head list;
};

/* Return the active link of @n chosen by the lowest bit of @sel, or NULL
 * if that active-link slot holds INVALID_BEARER_ID.
 */
static struct tipc_link *node_active_link(struct tipc_node *n, int sel)
{
        const int slot = sel & 1;
        const int id = n->active_links[slot];

        return unlikely(id == INVALID_BEARER_ID) ? NULL : n->links[id].link;
}

/* tipc_node_get_mtu - MTU to use for messages towards a peer node
 * @net: namespace to look the node up in
 * @addr: address of the peer
 * @sel: selector; its lowest bit picks one of the two active link slots
 * @connected: true when building a connection oriented message
 *
 * Return: MAX_MSG_SIZE if the node is unknown, if the selected slot has no
 * active link, or if the peer lives in a namespace of this kernel and
 * @connected is set; otherwise the MTU stored for the selected bearer.
 */
int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected)
{
        unsigned int mtu = MAX_MSG_SIZE;
        struct tipc_node *peer;
        int id;

        peer = tipc_node_find(net, addr);
        if (unlikely(!peer))
                return mtu;

        /* Allow MAX_MSG_SIZE when building connection oriented message
         * if they are in the same core network; otherwise consult the
         * active link selected by 'sel'.
         */
        if (!(peer->peer_net && connected)) {
                id = peer->active_links[sel & 1];
                if (likely(id != INVALID_BEARER_ID))
                        mtu = peer->links[id].mtu;
        }

        tipc_node_put(peer);
        return mtu;
}

/* tipc_node_get_id - copy the node identity that belongs to an address
 * @net: namespace to look the node up in
 * @addr: address to translate
 * @id: output buffer, receives TIPC_NODEID_LEN bytes
 *
 * Return: true on success, false if @addr is neither the own address nor
 * the address of a known node. When the own node has no identity, true is
 * returned and @id is left untouched.
 */
bool tipc_node_get_id(struct net *net, u32 addr, u8 *id)
{
        u8 *own_id = tipc_own_id(net);
        struct tipc_node *peer;

        if (!own_id)
                return true;

        if (addr != tipc_own_addr(net)) {
                peer = tipc_node_find(net, addr);
                if (!peer)
                        return false;

                memcpy(id, &peer->peer_id, TIPC_NODEID_LEN);
                tipc_node_put(peer);
                return true;
        }

        /* Asking for our own identity */
        memcpy(id, own_id, TIPC_NODEID_LEN);
        return true;
}

/* tipc_node_get_capabilities - capability bits of the node at an address
 * @net: namespace to look the node up in
 * @addr: address of the node
 *
 * Return: the node's stored capabilities, or TIPC_NODE_CAPABILITIES when no
 * node with that address is found.
 */
u16 tipc_node_get_capabilities(struct net *net, u32 addr)
{
        struct tipc_node *peer = tipc_node_find(net, addr);
        u16 caps;

        if (unlikely(!peer))
                return TIPC_NODE_CAPABILITIES;

        /* Read the value while we still hold the lookup reference */
        caps = peer->capabilities;
        tipc_node_put(peer);

        return caps;
}

/* tipc_node_get_addr - address of a node
 * @node: node to query, may be NULL
 *
 * Return: node->addr, or 0 when @node is NULL.
 */
u32 tipc_node_get_addr(struct tipc_node *node)
{
        if (!node)
                return 0;

        return node->addr;
}

/* tipc_node_get_id_str - printable identity of a node
 * @node: node to query; dereferenced unconditionally, so it must not be NULL
 *
 * Return: pointer to the node's peer_id_string, which tipc_node_create()
 * fills in with tipc_nodeid2string(). The string is owned by the node.
 */
char *tipc_node_get_id_str(struct tipc_node *node)
{
        return node->peer_id_string;
}

#ifdef CONFIG_TIPC_CRYPTO
/**
 * tipc_node_crypto_rx - Retrieve crypto RX handle from node
 * @__n: target tipc_node, may be NULL
 * Note: node ref counter must be held first!
 *
 * Return: the node's crypto_rx pointer, or NULL when @__n is NULL
 */
struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n)
{
        if (!__n)
                return NULL;

        return __n->crypto_rx;
}

struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos)
{
        return container_of(pos, struct tipc_node, list)->crypto_rx;
}

/* tipc_node_crypto_rx_by_addr - crypto RX handle of the node at an address
 * @net: namespace to look the node up in
 * @addr: address of the node
 *
 * Return: the node's crypto_rx pointer, or NULL if no such node exists.
 *
 * NOTE(review): the reference taken by tipc_node_find() is not dropped in
 * this function; presumably the caller releases it once it is done with
 * the handle - confirm against the callers.
 */
struct tipc_crypto *tipc_node_crypto_rx_by_addr(struct net *net, u32 addr)
{
        struct tipc_node *peer = tipc_node_find(net, addr);

        if (!peer)
                return NULL;

        return peer->crypto_rx;
}
#endif

/* tipc_node_free - RCU callback doing the final release of a node
 * @rp: the rcu_head embedded in the node
 *
 * Queued with call_rcu() by tipc_node_kref_release(). Stops the node's RX
 * crypto (when CONFIG_TIPC_CRYPTO is enabled) and frees the node memory.
 */
static void tipc_node_free(struct rcu_head *rp)
{
        struct tipc_node *node = container_of(rp, struct tipc_node, rcu);

#ifdef CONFIG_TIPC_CRYPTO
        tipc_crypto_stop(&node->crypto_rx);
#endif
        kfree(node);
}

/* tipc_node_kref_release - kref release function for a node
 * @kref: the kref embedded in the node
 *
 * Passed to kref_put() by tipc_node_put(). Frees the broadcast receive link
 * right away and defers the rest of the teardown to tipc_node_free() via
 * call_rcu().
 */
static void tipc_node_kref_release(struct kref *kref)
{
        struct tipc_node *node = container_of(kref, struct tipc_node, kref);

        kfree(node->bc_entry.link);
        call_rcu(&node->rcu, tipc_node_free);
}

/* tipc_node_put - drop one reference to a node
 * @node: node to release; must not be NULL (it is dereferenced)
 *
 * tipc_node_kref_release() is passed to kref_put() as the release function.
 */
void tipc_node_put(struct tipc_node *node)
{
        kref_put(&node->kref, tipc_node_kref_release);
}

/* tipc_node_get - take an additional reference to a node
 * @node: node to pin; must not be NULL (it is dereferenced)
 */
void tipc_node_get(struct tipc_node *node)
{
        kref_get(&node->kref);
}

/*
 * tipc_node_find - locate specified node object, if it exists
 * @net: namespace whose node hash table is searched
 * @addr: 32-bit address of the wanted node
 *
 * Preliminary nodes are never returned. The search stops at the first
 * non-preliminary node with a matching address; if that node's reference
 * count cannot be raised, the lookup fails.
 *
 * Return: the node with one reference taken for the caller, or NULL.
 */
static struct tipc_node *tipc_node_find(struct net *net, u32 addr)
{
        struct tipc_net *tn = tipc_net(net);
        unsigned int bucket = tipc_hashfn(addr);
        struct tipc_node *found = NULL;
        struct tipc_node *cur;

        rcu_read_lock();
        hlist_for_each_entry_rcu(cur, &tn->node_htable[bucket], hash) {
                if (cur->preliminary || cur->addr != addr)
                        continue;
                /* First match decides, whether or not we can pin it */
                if (kref_get_unless_zero(&cur->kref))
                        found = cur;
                break;
        }
        rcu_read_unlock();

        return found;
}

/* tipc_node_find_by_id - locate specified node object by its 128-bit id
 * @net: namespace whose node list is searched
 * @id: 16-byte node identity to look for
 *
 * Note: this function is called only when a discovery request failed
 * to find the node by its 32-bit id, and is not time critical
 *
 * Each candidate's identity is compared under that node's read lock.
 * Unlike tipc_node_find(), preliminary nodes are not filtered out here.
 *
 * Return: the matching node with one reference taken for the caller, or
 * NULL if no node matched or no matching node could be pinned.
 */
static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id)
{
        struct tipc_net *tn = tipc_net(net);
        struct tipc_node *match = NULL;
        struct tipc_node *cur;

        rcu_read_lock();
        list_for_each_entry_rcu(cur, &tn->node_list, list) {
                read_lock_bh(&cur->lock);
                if (!memcmp(id, cur->peer_id, 16) &&
                    kref_get_unless_zero(&cur->kref))
                        match = cur;
                read_unlock_bh(&cur->lock);
                if (match)
                        break;
        }
        rcu_read_unlock();

        return match;
}

/* tipc_node_read_lock - take the node's rwlock for reading (BH disabled) */
static void tipc_node_read_lock(struct tipc_node *n)
        __acquires(n->lock)
{
        read_lock_bh(&n->lock);
}

/* tipc_node_read_unlock - release a read hold taken by tipc_node_read_lock() */
static void tipc_node_read_unlock(struct tipc_node *n)
        __releases(n->lock)
{
        read_unlock_bh(&n->lock);
}

/* tipc_node_write_lock - take the node's rwlock for writing (BH disabled) */
static void tipc_node_write_lock(struct tipc_node *n)
        __acquires(n->lock)
{
        write_lock_bh(&n->lock);
}

/* tipc_node_write_unlock_fast - release the node write lock, nothing else
 *
 * Unlike tipc_node_write_unlock() this does not look at n->action_flags,
 * so no notifications are issued.
 */
static void tipc_node_write_unlock_fast(struct tipc_node *n)
        __releases(n->lock)
{
        write_unlock_bh(&n->lock);
}

/* tipc_node_write_unlock - release the node write lock and run pending
 * notifications
 * @n: node whose write lock is held by the caller
 *
 * If n->action_flags is zero this is a plain write_unlock_bh(). Otherwise
 * the values the notifications need are copied while the lock is still
 * held, the four TIPC_NOTIFY_* bits handled here are cleared, the lock is
 * dropped, and only then are the name table and monitor functions called.
 */
static void tipc_node_write_unlock(struct tipc_node *n)
        __releases(n->lock)
{
        struct tipc_socket_addr sk;
        struct net *net = n->net;
        /* Snapshot of the flags; decides below which notifications run */
        u32 flags = n->action_flags;
        struct list_head *publ_list;
        struct tipc_uaddr ua;
        u32 bearer_id, node;

        if (likely(!flags)) {
                write_unlock_bh(&n->lock);
                return;
        }

        /* Collect everything needed after the unlock while still locked */
        tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE,
                   TIPC_LINK_STATE, n->addr, n->addr);
        sk.ref = n->link_id;
        sk.node = tipc_own_addr(net);
        node = n->addr;
        /* Only the low 16 bits of link_id are used as the bearer id */
        bearer_id = n->link_id & 0xffff;
        publ_list = &n->publ_list;

        n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
                             TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP);

        write_unlock_bh(&n->lock);

        /* NOTE(review): n->capabilities is read below without the node lock */
        if (flags & TIPC_NOTIFY_NODE_DOWN)
                tipc_publ_notify(net, publ_list, node, n->capabilities);

        if (flags & TIPC_NOTIFY_NODE_UP)
                tipc_named_node_up(net, node, n->capabilities);

        if (flags & TIPC_NOTIFY_LINK_UP) {
                tipc_mon_peer_up(net, node, bearer_id);
                tipc_nametbl_publish(net, &ua, &sk, sk.ref);
        }
        if (flags & TIPC_NOTIFY_LINK_DOWN) {
                tipc_mon_peer_down(net, node, bearer_id);
                tipc_nametbl_withdraw(net, &ua, &sk, sk.ref);
        }
}

/* tipc_node_assign_peer_net - bind a node to a namespace of this kernel
 * @n: node to update
 * @hash_mixes: value received from the peer, compared with the value
 *              computed locally for each candidate namespace
 *
 * Does nothing if n->peer_net is already set. Otherwise walks all network
 * namespaces and picks the first one whose TIPC instance has the same
 * network id and node identity as the peer and whose locally computed hash
 * mix equals @hash_mixes. On a match n->peer_net and n->peer_hash_mix are
 * set; without a match the node is left unchanged.
 */
static void tipc_node_assign_peer_net(struct tipc_node *n, u32 hash_mixes)
{
        int own_net_id = tipc_netid(n->net);
        struct tipc_net *cand_tn;
        struct net *cand;
        u32 hash_chk;

        if (n->peer_net)
                return;

        for_each_net_rcu(cand) {
                cand_tn = tipc_net(cand);
                /* Integrity checking whether node exists in namespace or not */
                if (!cand_tn || cand_tn->net_id != own_net_id)
                        continue;
                if (memcmp(n->peer_id, cand_tn->node_id, NODE_ID_LEN) != 0)
                        continue;
                hash_chk = tipc_net_hash_mixes(cand, cand_tn->random);
                if (hash_chk != hash_mixes)
                        continue;

                /* First matching namespace wins */
                n->peer_net = cand;
                n->peer_hash_mix = hash_mixes;
                return;
        }
}

/* tipc_node_create - look up, refresh or allocate the node object for a peer
 * @net: namespace the node belongs to
 * @addr: 32-bit address of the peer
 * @peer_id: node identity of the peer (16 bytes are copied into the node)
 * @capabilities: capability bits reported by the peer
 * @hash_mixes: value used by tipc_node_assign_peer_net() to detect a peer
 *              that lives in a namespace of this kernel
 * @preliminary: true if the caller only wants a preliminary node entry
 *
 * Runs entirely under tn->node_list_lock. Three cases:
 *  - an existing, non-preliminary node is found: its peer net and
 *    capabilities are refreshed (label "update");
 *  - an existing preliminary node is found: it is returned untouched when
 *    @preliminary is set, otherwise it is promoted to a real node, rehashed
 *    under @addr, re-sorted into the node list and then refreshed as above;
 *  - no node is found: a new one is allocated, initialised, given a timer
 *    and inserted into the hash table and the address-sorted node list.
 *
 * Return: the node, or NULL if memory allocation, crypto start or creation
 * of the broadcast receive link failed. For an existing node the reference
 * taken by tipc_node_find()/tipc_node_find_by_id() is still held on return.
 */
struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id,
                                   u16 capabilities, u32 hash_mixes,
                                   bool preliminary)
{
        struct tipc_net *tn = net_generic(net, tipc_net_id);
        struct tipc_link *l, *snd_l = tipc_bc_sndlink(net);
        struct tipc_node *n, *temp_node;
        unsigned long intv;
        int bearer_id;
        int i;

        spin_lock_bh(&tn->node_list_lock);
        /* Try the address first, fall back to the 128-bit identity */
        n = tipc_node_find(net, addr) ?:
                tipc_node_find_by_id(net, peer_id);
        if (n) {
                if (!n->preliminary)
                        goto update;
                if (preliminary)
                        goto exit;
                /* A preliminary node becomes "real" now, refresh its data */
                tipc_node_write_lock(n);
                if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX,
                                         tipc_link_min_win(snd_l), tipc_link_max_win(snd_l),
                                         n->capabilities, &n->bc_entry.inputq1,
                                         &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) {
                        pr_warn("Broadcast rcv link refresh failed, no memory\n");
                        tipc_node_write_unlock_fast(n);
                        /* Drop the reference obtained by the lookup above */
                        tipc_node_put(n);
                        n = NULL;
                        goto exit;
                }
                n->preliminary = false;
                n->addr = addr;
                /* Move the node to the hash bucket of its new address */
                hlist_del_rcu(&n->hash);
                hlist_add_head_rcu(&n->hash,
                                   &tn->node_htable[tipc_hashfn(addr)]);
                /* Re-insert in front of the first node with a larger address.
                 * If the loop runs to completion, temp_node->list is the list
                 * head itself, so the node is appended at the tail.
                 */
                list_del_rcu(&n->list);
                list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
                        if (n->addr < temp_node->addr)
                                break;
                }
                list_add_tail_rcu(&n->list, &temp_node->list);
                tipc_node_write_unlock_fast(n);

update:
                /* Re-check the peer namespace only if the hash mix differs */
                if (n->peer_hash_mix ^ hash_mixes)
                        tipc_node_assign_peer_net(n, hash_mixes);
                if (n->capabilities == capabilities)
                        goto exit;
                /* Same node may come back with new capabilities */
                tipc_node_write_lock(n);
                n->capabilities = capabilities;
                for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
                        l = n->links[bearer_id].link;
                        if (l)
                                tipc_link_update_caps(l, capabilities);
                }
                tipc_node_write_unlock_fast(n);

                /* Calculate cluster capabilities */
                tn->capabilities = TIPC_NODE_CAPABILITIES;
                list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
                        tn->capabilities &= temp_node->capabilities;
                }

                tipc_bcast_toggle_rcast(net,
                                        (tn->capabilities & TIPC_BCAST_RCAST));

                goto exit;
        }
        /* No existing node: allocate a new one (atomic, spinlock is held) */
        n = kzalloc(sizeof(*n), GFP_ATOMIC);
        if (!n) {
                pr_warn("Node creation failed, no memory\n");
                goto exit;
        }
        tipc_nodeid2string(n->peer_id_string, peer_id);
#ifdef CONFIG_TIPC_CRYPTO
        if (unlikely(tipc_crypto_start(&n->crypto_rx, net, n))) {
                pr_warn("Failed to start crypto RX(%s)!\n", n->peer_id_string);
                /* kref not initialised yet, so free the node directly */
                kfree(n);
                n = NULL;
                goto exit;
        }
#endif
        n->addr = addr;
        n->preliminary = preliminary;
        memcpy(&n->peer_id, peer_id, 16);
        n->net = net;
        n->peer_net = NULL;
        n->peer_hash_mix = 0;
        /* Assign kernel local namespace if exists */
        tipc_node_assign_peer_net(n, hash_mixes);
        n->capabilities = capabilities;
        kref_init(&n->kref);
        rwlock_init(&n->lock);
        INIT_HLIST_NODE(&n->hash);
        INIT_LIST_HEAD(&n->list);
        INIT_LIST_HEAD(&n->publ_list);
        INIT_LIST_HEAD(&n->conn_sks);
        skb_queue_head_init(&n->bc_entry.namedq);
        skb_queue_head_init(&n->bc_entry.inputq1);
        __skb_queue_head_init(&n->bc_entry.arrvq);
        skb_queue_head_init(&n->bc_entry.inputq2);
        for (i = 0; i < MAX_BEARERS; i++)
                spin_lock_init(&n->links[i].lock);
        n->state = SELF_DOWN_PEER_LEAVING;
        n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER);
        n->signature = INVALID_NODE_SIG;
        n->active_links[0] = INVALID_BEARER_ID;
        n->active_links[1] = INVALID_BEARER_ID;
        /* Preliminary nodes get their broadcast receive link when promoted */
        if (!preliminary &&
            !tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX,
                                 tipc_link_min_win(snd_l), tipc_link_max_win(snd_l),
                                 n->capabilities, &n->bc_entry.inputq1,
                                 &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) {
                pr_warn("Broadcast rcv link creation failed, no memory\n");
                /* Drops the initial reference, which releases the node */
                tipc_node_put(n);
                n = NULL;
                goto exit;
        }
        /* Second reference on top of the initial one from kref_init().
         * NOTE(review): presumably one is for the node list and one for the
         * caller - confirm.
         */
        tipc_node_get(n);
        timer_setup(&n->timer, tipc_node_timeout, 0);
        /* Start a slow timer anyway, crypto needs it */
        n->keepalive_intv = 10000;
        intv = jiffies + msecs_to_jiffies(n->keepalive_intv);
        /* Timer was not pending before: it now holds its own reference */
        if (!mod_timer(&n->timer, intv))
                tipc_node_get(n);
        hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]);
        /* Keep the node list sorted by ascending address */
        list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
                if (n->addr < temp_node->addr)
                        break;
        }
        list_add_tail_rcu(&n->list, &temp_node->list);
        /* Calculate cluster capabilities */
        tn->capabilities = TIPC_NODE_CAPABILITIES;
        list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
                tn->capabilities &= temp_node->capabilities;
        }
        tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST));
        trace_tipc_node_create(n, true, " ");
exit:
        spin_unlock_bh(&tn->node_list_lock);
        return n;
}

/* tipc_node_calculate_timer - fold one link's tolerance into the node timer
 * @n: node owning the link
 * @l: link whose tolerance is read
 *
 * The candidate interval is a quarter of the link tolerance, capped at 500.
 * The node's keepalive interval is lowered to it if it is smaller, and the
 * link's abort limit is set to tolerance / keepalive interval.
 */
static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l)
{
        unsigned long tol = tipc_link_tolerance(l);
        unsigned long intv = tol / 4;

        if (intv > 500)
                intv = 500;

        /* Link with lowest tolerance determines timer interval */
        if (intv < n->keepalive_intv)
                n->keepalive_intv = intv;

        /* Ensure link's abort limit corresponds to current tolerance */
        tipc_link_set_abort_limit(l, tol / n->keepalive_intv);
}

/* tipc_node_delete_from_list - unlink a node and drop one reference
 * @node: node to unlink
 *
 * With CONFIG_TIPC_CRYPTO the node's RX crypto keys are flushed first. The
 * node is then removed from the node list and the hash table using the RCU
 * list primitives, and one reference is dropped.
 * NOTE(review): both callers in this file hold tn->node_list_lock;
 * presumably that is required - confirm.
 */
static void tipc_node_delete_from_list(struct tipc_node *node)
{
#ifdef CONFIG_TIPC_CRYPTO
        tipc_crypto_key_flush(node->crypto_rx);
#endif
        list_del_rcu(&node->list);
        hlist_del_rcu(&node->hash);
        tipc_node_put(node);
}

/* tipc_node_delete - unlink a node, stop its timer and drop two references
 * @node: node to delete
 *
 * One reference is dropped by tipc_node_delete_from_list(), a second one
 * after the timer has been stopped with del_timer_sync().
 * NOTE(review): the second put presumably releases the reference the timer
 * held - confirm.
 */
static void tipc_node_delete(struct tipc_node *node)
{
        trace_tipc_node_delete(node, true, " ");
        tipc_node_delete_from_list(node);

        /* Wait for a running timer handler before dropping the reference */
        del_timer_sync(&node->timer);
        tipc_node_put(node);
}

void tipc_node_stop(struct net *net)
{
        struct tipc_net *tn = tipc_net(net);
        struct tipc_node *node, *t_node;

        spin_lock_bh(&tn->node_list_lock);
        list_for_each_entry_safe(node, t_node, &tn->node_list, list)
                tipc_node_delete(node);
        spin_unlock_bh(&tn->node_list_lock);
}

/* tipc_node_subscribe - add an entry to a peer node's publication list
 * @net: namespace to look the node up in
 * @subscr: list element to append to the node's publ_list
 * @addr: address of the node
 *
 * Nothing is done for the own node. An unknown address is reported with a
 * warning and otherwise ignored.
 */
void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr)
{
        struct tipc_node *peer;

        if (in_own_node(net, addr))
                return;

        peer = tipc_node_find(net, addr);
        if (peer) {
                tipc_node_write_lock(peer);
                list_add_tail(subscr, &peer->publ_list);
                tipc_node_write_unlock_fast(peer);
                tipc_node_put(peer);
                return;
        }

        pr_warn("Node subscribe rejected, unknown node 0x%x\n", addr);
}

/* tipc_node_unsubscribe - remove an entry from a peer node's publication list
 * @net: namespace to look the node up in
 * @subscr: list element to unlink; it is re-initialised afterwards
 * @addr: address of the node
 *
 * Nothing is done for the own node. An unknown address is reported with a
 * warning and otherwise ignored. The unlink happens under the node's write
 * lock.
 */
void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr)
{
        struct tipc_node *peer;

        if (in_own_node(net, addr))
                return;

        peer = tipc_node_find(net, addr);
        if (peer) {
                tipc_node_write_lock(peer);
                list_del_init(subscr);
                tipc_node_write_unlock_fast(peer);
                tipc_node_put(peer);
                return;
        }

        pr_warn("Node unsubscribe rejected, unknown node 0x%x\n", addr);
}

/* tipc_node_add_conn - record a socket connection on the peer node
 * @net: namespace to look the node up in
 * @dnode: address of the peer node
 * @port: stored as the 'port' of the new record
 * @peer_port: stored as the 'peer_port' of the new record
 *
 * Return: 0 on success or when @dnode is the own node; -EHOSTUNREACH when
 * the node is unknown or the record cannot be allocated.
 */
int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port)
{
        struct tipc_sock_conn *entry;
        struct tipc_node *peer;
        int err = 0;

        if (in_own_node(net, dnode))
                return 0;

        peer = tipc_node_find(net, dnode);
        if (!peer) {
                pr_warn("Connecting sock to node 0x%x failed\n", dnode);
                return -EHOSTUNREACH;
        }

        entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
        if (entry) {
                entry->peer_node = dnode;
                entry->port = port;
                entry->peer_port = peer_port;

                tipc_node_write_lock(peer);
                list_add_tail(&entry->list, &peer->conn_sks);
                tipc_node_write_unlock(peer);
        } else {
                err = -EHOSTUNREACH;
        }

        /* Release the lookup reference on both outcomes */
        tipc_node_put(peer);
        return err;
}

/* tipc_node_remove_conn - drop the connection records of a port
 * @net: namespace to look the node up in
 * @dnode: address of the peer node
 * @port: every record on the node whose 'port' equals this value is freed
 *
 * Nothing is done for the own node or for an unknown node.
 */
void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port)
{
        struct tipc_sock_conn *cur, *next;
        struct tipc_node *peer;

        if (in_own_node(net, dnode))
                return;

        peer = tipc_node_find(net, dnode);
        if (!peer)
                return;

        tipc_node_write_lock(peer);
        list_for_each_entry_safe(cur, next, &peer->conn_sks, list) {
                if (cur->port == port) {
                        list_del(&cur->list);
                        kfree(cur);
                }
        }
        tipc_node_write_unlock(peer);
        tipc_node_put(peer);
}

/* tipc_node_clear_links - free every link still attached to a node
 * @node: node to strip
 *
 * For each bearer slot holding a link, the link is freed, the slot is
 * cleared and node->link_cnt is decremented.
 */
static void  tipc_node_clear_links(struct tipc_node *node)
{
        struct tipc_link_entry *le;
        int i;

        for (i = 0; i < MAX_BEARERS; i++) {
                le = &node->links[i];
                if (!le->link)
                        continue;

                kfree(le->link);
                le->link = NULL;
                node->link_cnt--;
        }
}

/* tipc_node_cleanup - delete nodes that does not
 * have active links for NODE_CLEANUP_AFTER time
 */
static bool tipc_node_cleanup(struct tipc_node *peer)
{
        struct tipc_node *temp_node;
        struct tipc_net *tn = tipc_net(peer->net);
        bool deleted = false;

        /* If lock held by tipc_node_stop() the node will be deleted anyway */
        if (!spin_trylock_bh(&tn->node_list_lock))
                return false;

        tipc_node_write_lock(peer);

        if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) {
                tipc_node_clear_links(peer);
                tipc_node_delete_from_list(peer);
                deleted = true;
        }
        tipc_node_write_unlock(peer);

        if (!deleted) {
                spin_unlock_bh(&tn->node_list_lock);
                return deleted;
        }

        /* Calculate cluster capabilities */
        tn->capabilities = TIPC_NODE_CAPABILITIES;
        list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
                tn->capabilities &= temp_node->capabilities;
        }
        tipc_bcast_toggle_rcast(peer->net,
                                (tn->capabilities & TIPC_BCAST_RCAST));
        spin_unlock_bh(&tn->node_list_lock);
        return deleted;
}

/* tipc_node_timeout - handle expiration of node timer
 * @t: the expired timer, embedded in a struct tipc_node
 *
 * If the node has no active link and tipc_node_cleanup() removes it, the
 * timer's reference is dropped and the timer is not re-armed. Otherwise
 * the crypto timeout (if configured) and the per-link timeouts are run,
 * any resulting messages are sent on the respective bearer, links that
 * report TIPC_LINK_DOWN_EVT are taken down, and the timer is re-armed with
 * the keepalive interval derived from the links' tolerances.
 */
static void tipc_node_timeout(struct timer_list *t)
{
        struct tipc_node *n = from_timer(n, t, timer);
        struct tipc_link_entry *le;
        struct sk_buff_head xmitq;
        int remains = n->link_cnt;
        int bearer_id;
        int rc;

        trace_tipc_node_timeout(n, false, " ");
        if (!node_is_up(n) && tipc_node_cleanup(n)) {
                /* Removing the reference of Timer */
                tipc_node_put(n);
                return;
        }

#ifdef CONFIG_TIPC_CRYPTO
        /* Take any crypto key related actions first */
        tipc_crypto_timeout(n->crypto_rx);
#endif
        __skb_queue_head_init(&xmitq);

        /* Initial node interval to value larger (10 seconds), then it will be
         * recalculated with link lowest tolerance
         */
        tipc_node_read_lock(n);
        n->keepalive_intv = 10000;
        tipc_node_read_unlock(n);
        for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) {
                /* Start every bearer with a clean result, so that a link-down
                 * event reported for an earlier bearer is never applied to a
                 * bearer that had no link when it was inspected below.
                 */
                rc = 0;
                tipc_node_read_lock(n);
                le = &n->links[bearer_id];
                if (le->link) {
                        spin_lock_bh(&le->lock);
                        /* Link tolerance may change asynchronously: */
                        tipc_node_calculate_timer(n, le->link);
                        rc = tipc_link_timeout(le->link, &xmitq);
                        spin_unlock_bh(&le->lock);
                        remains--;
                }
                tipc_node_read_unlock(n);
                tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr, n);
                if (rc & TIPC_LINK_DOWN_EVT)
                        tipc_node_link_down(n, bearer_id, false);
        }
        mod_timer(&n->timer, jiffies + msecs_to_jiffies(n->keepalive_intv));
}

/**
 * __tipc_node_link_up - handle addition of link
 * @n: target tipc_node
 * @bearer_id: id of the bearer
 * @xmitq: queue for messages to be xmited on
 * Node lock must be held by caller
 * Link becomes active (alone or shared) or standby, depending on its priority.
 *
 * Nothing happens if the bearer has no link, the link is already up, or the
 * link is still not up after LINK_ESTABLISH_EVT has been applied to it.
 */
static void __tipc_node_link_up(struct tipc_node *n, int bearer_id,
                                struct sk_buff_head *xmitq)
{
        int *slot0 = &n->active_links[0];
        int *slot1 = &n->active_links[1];
        /* Link in slot 0 before this call ("old link"), NULL if none */
        struct tipc_link *ol = node_active_link(n, 0);
        /* Link that is coming up ("new link") */
        struct tipc_link *nl = n->links[bearer_id].link;

        if (!nl || tipc_link_is_up(nl))
                return;

        tipc_link_fsm_evt(nl, LINK_ESTABLISH_EVT);
        if (!tipc_link_is_up(nl))
                return;

        n->working_links++;
        /* Acted upon later by tipc_node_write_unlock() */
        n->action_flags |= TIPC_NOTIFY_LINK_UP;
        n->link_id = tipc_link_id(nl);

        /* Leave room for tunnel header when returning 'mtu' to users: */
        n->links[bearer_id].mtu = tipc_link_mss(nl);

        tipc_bearer_add_dest(n->net, bearer_id, n->addr);
        tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id);

        pr_debug("Established link <%s> on network plane %c\n",
                 tipc_link_name(nl), tipc_link_plane(nl));
        trace_tipc_node_link_up(n, true, " ");

        /* Ensure that a STATE message goes first */
        tipc_link_build_state_msg(nl, xmitq);

        /* First link? => give it both slots */
        if (!ol) {
                *slot0 = bearer_id;
                *slot1 = bearer_id;
                tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT);
                n->action_flags |= TIPC_NOTIFY_NODE_UP;
                tipc_link_set_active(nl, true);
                tipc_bcast_add_peer(n->net, nl, xmitq);
                return;
        }

        /* Second link => redistribute slots */
        if (tipc_link_prio(nl) > tipc_link_prio(ol)) {
                /* Higher priority: new link takes over both slots */
                pr_debug("Old link <%s> becomes standby\n", tipc_link_name(ol));
                *slot0 = bearer_id;
                *slot1 = bearer_id;
                tipc_link_set_active(nl, true);
                tipc_link_set_active(ol, false);
        } else if (tipc_link_prio(nl) == tipc_link_prio(ol)) {
                /* Equal priority: share, new link gets slot 1 */
                tipc_link_set_active(nl, true);
                *slot1 = bearer_id;
        } else {
                /* Lower priority: slots are left as they are */
                pr_debug("New link <%s> is standby\n", tipc_link_name(nl));
        }

        /* Prepare synchronization with first link */
        tipc_link_tnl_prepare(ol, nl, SYNCH_MSG, xmitq);
}

/**
 * tipc_node_link_up - handle addition of link
 * @n: target tipc_node
 * @bearer_id: id of the bearer
 * @xmitq: queue for messages to be xmited on
 *
 * Locked wrapper around __tipc_node_link_up(): takes the node write lock,
 * brings the link up, transmits whatever was queued on @xmitq over the
 * bearer's media address and releases the lock through
 * tipc_node_write_unlock().
 *
 * Link becomes active (alone or shared) or standby, depending on its priority.
 */
static void tipc_node_link_up(struct tipc_node *n, int bearer_id,
                              struct sk_buff_head *xmitq)
{
        tipc_node_write_lock(n);
        __tipc_node_link_up(n, bearer_id, xmitq);
        tipc_bearer_xmit(n->net, bearer_id, xmitq,
                         &n->links[bearer_id].maddr, n);
        tipc_node_write_unlock(n);
}

/**
 * tipc_node_link_failover() - start failover in case "half-failover"
 *
 * This function is only called in a very special situation where link
 * failover can be already started on peer node but not on this node.
 * This can happen when e.g.::
 *
 *        1. Both links <1A-2A>, <1B-2B> down
 *        2. Link endpoint 2A up, but 1A still down (e.g. due to network
 *        disturbance, wrong session, etc.)
 *        3. Link <1B-2B> up
 *        4. Link endpoint 2A down (e.g. due to link tolerance timeout)
 *        5. Node 2 starts failover onto link <1B-2B>
 *
 *        ==> Node 1 does never start link/node failover!
 *
 * Nothing is done unless @tnl is up and @l, if given, is in reset state.
 *
 * @n: tipc node structure
 * @l: link peer endpoint failingover (- can be NULL)
 * @tnl: tunnel link
 * @xmitq: queue for messages to be xmited on tnl link later
 */
static void tipc_node_link_failover(struct tipc_node *n, struct tipc_link *l,
                                    struct tipc_link *tnl,
                                    struct sk_buff_head *xmitq)
{
        /* Avoid to be "self-failover" that can never end */
        if (!tipc_link_is_up(tnl))
                return;

        /* Don't rush, failure link may be in the process of resetting */
        if (l && !tipc_link_is_reset(l))
                return;

        /* End any synchronization on the tunnel link and the node first */
        tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT);
        tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT);

        /* Same sync point formula as the failover path of
         * __tipc_node_link_down()
         */
        n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1);
        tipc_link_failover_prepare(l, tnl, xmitq);

        if (l)
                tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT);
        tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT);
}

/**
 * __tipc_node_link_down - handle loss of link
 * @n: target tipc_node
 * @bearer_id: in: id of the bearer whose link went down; out: on the
 *             failover path it is overwritten with the bearer id found in
 *             active slot 0, i.e. the bearer carrying the tunnel link
 * @xmitq: queue for messages to be xmited on
 * @maddr: output media address of the bearer that @xmitq must be sent on;
 *         left untouched when the function returns early
 *
 * Does nothing if the bearer has no link or the link is already reset.
 * NOTE(review): the only caller in view holds the node write lock;
 * presumably that is required - confirm.
 */
static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id,
                                  struct sk_buff_head *xmitq,
                                  struct tipc_media_addr **maddr)
{
        struct tipc_link_entry *le = &n->links[*bearer_id];
        int *slot0 = &n->active_links[0];
        int *slot1 = &n->active_links[1];
        int i, highest = 0, prio;
        struct tipc_link *l, *_l, *tnl;

        l = n->links[*bearer_id].link;
        if (!l || tipc_link_is_reset(l))
                return;

        n->working_links--;
        /* Acted upon later by tipc_node_write_unlock() */
        n->action_flags |= TIPC_NOTIFY_LINK_DOWN;
        n->link_id = tipc_link_id(l);

        tipc_bearer_remove_dest(n->net, *bearer_id, n->addr);

        pr_debug("Lost link <%s> on network plane %c\n",
                 tipc_link_name(l), tipc_link_plane(l));

        /* Select new active link if any available */
        *slot0 = INVALID_BEARER_ID;
        *slot1 = INVALID_BEARER_ID;
        for (i = 0; i < MAX_BEARERS; i++) {
                _l = n->links[i].link;
                if (!_l || !tipc_link_is_up(_l))
                        continue;
                if (_l == l)
                        continue;
                prio = tipc_link_prio(_l);
                if (prio < highest)
                        continue;
                if (prio > highest) {
                        /* New best priority: this link takes both slots */
                        highest = prio;
                        *slot0 = i;
                        *slot1 = i;
                        continue;
                }
                /* Same priority as current best: share via slot 1 */
                *slot1 = i;
        }

        /* No other working link left: contact with the node is lost */
        if (!node_is_up(n)) {
                if (tipc_link_peer_is_down(l))
                        tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT);
                tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT);
                trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down!");
                tipc_link_fsm_evt(l, LINK_RESET_EVT);
                tipc_link_reset(l);
                tipc_link_build_reset_msg(l, xmitq);
                *maddr = &n->links[*bearer_id].maddr;
                node_lost_contact(n, &le->inputq);
                tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id);
                return;
        }
        tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id);

        /* There is still a working link => initiate failover */
        *bearer_id = n->active_links[0];
        tnl = n->links[*bearer_id].link;
        tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT);
        tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT);
        n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1);
        tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq);
        trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down -> failover!");
        tipc_link_reset(l);
        tipc_link_fsm_evt(l, LINK_RESET_EVT);
        tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT);
        tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT);
        /* *bearer_id now names the tunnel bearer, so this is its address */
        *maddr = &n->links[*bearer_id].maddr;
}

/* tipc_node_link_down - take a link down and optionally delete it
 * @n: node owning the link
 * @bearer_id: bearer whose link goes down
 * @delete: if true, the link is freed and removed from the node, and the
 *          peer is removed from the monitor for this bearer
 *
 * Returns at once if the bearer has no link. Otherwise, under the node
 * write lock, an established or resetting link goes through
 * __tipc_node_link_down(), while a link that is still establishing is just
 * reset. After the lock is released, queued messages are transmitted and
 * the link entry's input queue is handed to tipc_sk_rcv().
 */
static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete)
{
        struct tipc_link_entry *le = &n->links[bearer_id];
        struct tipc_media_addr *maddr = NULL;
        /* NOTE(review): le->link is sampled before the node lock is taken */
        struct tipc_link *l = le->link;
        /* __tipc_node_link_down() may redirect bearer_id to the tunnel link */
        int old_bearer_id = bearer_id;
        struct sk_buff_head xmitq;

        if (!l)
                return;

        __skb_queue_head_init(&xmitq);

        tipc_node_write_lock(n);
        if (!tipc_link_is_establishing(l)) {
                __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr);
        } else {
                /* Defuse pending tipc_node_link_up() */
                tipc_link_reset(l);
                tipc_link_fsm_evt(l, LINK_RESET_EVT);
        }
        if (delete) {
                kfree(l);
                le->link = NULL;
                n->link_cnt--;
        }
        trace_tipc_node_link_down(n, true, "node link down or deleted!");
        tipc_node_write_unlock(n);
        if (delete)
                tipc_mon_remove_peer(n->net, n->addr, old_bearer_id);
        /* maddr is only set when __tipc_node_link_down() did its work */
        if (!skb_queue_empty(&xmitq))
                tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr, n);
        tipc_sk_rcv(n->net, &le->inputq);
}

/* node_is_up - true if active link slot 0 of @n refers to a bearer, i.e. is
 * not INVALID_BEARER_ID. No locking is done here.
 */
static bool node_is_up(struct tipc_node *n)
{
        return n->active_links[0] != INVALID_BEARER_ID;
}

/* tipc_node_is_up - tell whether the node at an address is reachable
 * @net: namespace to look the node up in
 * @addr: address of the node
 *
 * Return: true for the own node; false for an unknown node; otherwise the
 * result of node_is_up() for the node found.
 */
bool tipc_node_is_up(struct net *net, u32 addr)
{
        struct tipc_node *peer;
        bool up = false;

        if (in_own_node(net, addr))
                return true;

        peer = tipc_node_find(net, addr);
        if (peer) {
                up = node_is_up(peer);
                tipc_node_put(peer);
        }

        return up;
}

/* tipc_node_suggest_addr - propose an address no known node is using
 * @net: namespace to check against
 * @addr: starting point for the suggestion
 *
 * The start value is @addr XORed with the namespace's random value; it is
 * then incremented until tipc_node_find() no longer finds a node for it.
 *
 * Return: the first such address.
 */
static u32 tipc_node_suggest_addr(struct net *net, u32 addr)
{
        struct tipc_node *taken;
        u32 cand = addr ^ tipc_net(net)->random;

        for (;;) {
                taken = tipc_node_find(net, cand);
                if (!taken)
                        break;
                tipc_node_put(taken);
                cand++;
        }

        return cand;
}

/* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not
 * @net: namespace to check against
 * @id: node identity of the peer asking for the address
 * @addr: address the peer wants to use
 *
 * Returns suggested address if any, otherwise 0
 */
u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr)
{
        struct tipc_net *tn = tipc_net(net);
        struct tipc_node *n;
        bool same_peer, prelim;
        u32 known_addr;

        /* Suggest new address if some other peer is using this one */
        n = tipc_node_find(net, addr);
        if (n) {
                same_peer = !memcmp(n->peer_id, id, NODE_ID_LEN);
                tipc_node_put(n);
                /* Address 0 never leads to a suggestion on this path */
                if (same_peer || !addr)
                        return 0;
                return tipc_node_suggest_addr(net, addr);
        }

        /* Suggest previously used address if peer is known */
        n = tipc_node_find_by_id(net, id);
        if (n) {
                known_addr = n->addr;
                prelim = n->preliminary;
                tipc_node_put(n);
                if (!prelim)
                        return known_addr;
        }

        /* Even this node may be in conflict */
        if (tn->trial_addr != addr)
                return 0;

        return tipc_node_suggest_addr(net, addr);
}

/**
 * tipc_node_check_dest - evaluate a discovery request from a peer node
 * @net: the applicable net namespace
 * @addr: node address used to find or create the peer's node entry
 * @peer_id: node identity of the peer
 * @b: bearer the request relates to; b->identity selects the link entry
 * @capabilities: capabilities handed to tipc_node_create()
 * @signature: signature received from the peer, compared with n->signature
 * @hash_mixes: value handed to tipc_node_create(); also stored as
 *              n->peer_hash_mix when that field is still zero
 * @maddr: media address of the peer, compared with the link entry's address
 * @respond: output, set to true in the cases where the request is answered
 * @dupl_addr: output, set to true when the request is ignored because a
 *             link with a different media address is already up
 *
 * The decision is taken from three flags (signature match, media address
 * match, link up) while holding the node write lock. When the address is
 * accepted and no link exists on this bearer yet, a new link is created,
 * unless the node already has two links. A pending link reset is carried
 * out after the write lock has been released.
 */
void tipc_node_check_dest(struct net *net, u32 addr,
                          u8 *peer_id, struct tipc_bearer *b,
                          u16 capabilities, u32 signature, u32 hash_mixes,
                          struct tipc_media_addr *maddr,
                          bool *respond, bool *dupl_addr)
{
        struct tipc_node *n;
        struct tipc_link *l;
        struct tipc_link_entry *le;
        bool addr_match = false;
        bool sign_match = false;
        bool link_up = false;
        bool link_is_reset = false;
        bool accept_addr = false;
        bool reset = false;
        char *if_name;
        unsigned long intv;
        u16 session;

        /* Default answer: neither respond nor flag a duplicate */
        *dupl_addr = false;
        *respond = false;

        n = tipc_node_create(net, addr, peer_id, capabilities, hash_mixes,
                             false);
        if (!n)
                return;

        tipc_node_write_lock(n);

        le = &n->links[b->identity];

        /* Prepare to validate requesting node's signature and media address */
        l = le->link;
        link_up = l && tipc_link_is_up(l);
        link_is_reset = l && tipc_link_is_reset(l);
        addr_match = l && !memcmp(&le->maddr, maddr, sizeof(*maddr));
        sign_match = (signature == n->signature);

        /* These three flags give us eight permutations: */

        if (sign_match && addr_match && link_up) {
                /* All is fine. Ignore requests. */
                /* Peer node is not a container/local namespace */
                if (!n->peer_hash_mix)
                        n->peer_hash_mix = hash_mixes;
        } else if (sign_match && addr_match && !link_up) {
                /* Respond. The link will come up in due time */
                *respond = true;
        } else if (sign_match && !addr_match && link_up) {
                /* Peer has changed i/f address without rebooting.
                 * If so, the link will reset soon, and the next
                 * discovery will be accepted. So we can ignore it.
                 * It may also be a cloned or malicious peer having
                 * chosen the same node address and signature as an
                 * existing one.
                 * Ignore requests until the link goes down, if ever.
                 */
                *dupl_addr = true;
        } else if (sign_match && !addr_match && !link_up) {
                /* Peer link has changed i/f address without rebooting.
                 * It may also be a cloned or malicious peer; we can't
                 * distinguish between the two.
                 * The signature is correct, so we must accept.
                 */
                accept_addr = true;
                *respond = true;
                reset = true;
        } else if (!sign_match && addr_match && link_up) {
                /* Peer node rebooted. Two possibilities:
                 *  - Delayed re-discovery; this link endpoint has already
                 *    reset and re-established contact with the peer, before
                 *    receiving a discovery message from that node.
                 *    (The peer happened to receive one from this node first).
                 *  - The peer came back so fast that our side has not
                 *    discovered it yet. Probing from this side will soon
                 *    reset the link, since there can be no working link
                 *    endpoint at the peer end, and the link will re-establish.
                 *  Accept the signature, since it comes from a known peer.
                 */
                n->signature = signature;
        } else if (!sign_match && addr_match && !link_up) {
                /*  The peer node has rebooted.
                 *  Accept signature, since it is a known peer.
                 */
                n->signature = signature;
                *respond = true;
        } else if (!sign_match && !addr_match && link_up) {
                /* Peer rebooted with new address, or a new/duplicate peer.
                 * Ignore until the link goes down, if ever.
                 */
                *dupl_addr = true;
        } else if (!sign_match && !addr_match && !link_up) {
                /* Peer rebooted with new address, or it is a new peer.
                 * Accept signature and address.
                 */
                n->signature = signature;
                accept_addr = true;
                *respond = true;
                reset = true;
        }

        /* Only the two "accept address" cases go on to touch the link */
        if (!accept_addr)
                goto exit;

        /* Now create new link if not already existing */
        if (!l) {
                /* The node already has two links: do not add another.
                 * Note that *respond stays true on this path.
                 */
                if (n->link_cnt == 2)
                        goto exit;

                /* The interface name is the part of the bearer name after
                 * the first ':'.
                 * NOTE(review): the strchr() result is not checked; this
                 * assumes b->name always contains ':' - confirm.
                 */
                if_name = strchr(b->name, ':') + 1;
                get_random_bytes(&session, sizeof(u16));
                if (!tipc_link_create(net, if_name, b->identity, b->tolerance,
                                      b->net_plane, b->mtu, b->priority,
                                      b->min_win, b->max_win, session,
                                      tipc_own_addr(net), addr, peer_id,
                                      n->capabilities,
                                      tipc_bc_sndlink(n->net), n->bc_entry.link,
                                      &le->inputq,
                                      &n->bc_entry.namedq, &l)) {
                        /* Link creation failed: withdraw the response */
                        *respond = false;
                        goto exit;
                }
                trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link created!");
                tipc_link_reset(l);
                tipc_link_fsm_evt(l, LINK_RESET_EVT);
                if (n->state == NODE_FAILINGOVER)
                        tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT);
                /* Re-sample for the new link; decides the reset at exit */
                link_is_reset = tipc_link_is_reset(l);
                le->link = l;
                n->link_cnt++;
                tipc_node_calculate_timer(n, l);
                if (n->link_cnt == 1) {
                        /* First link of this node: arm the node timer and
                         * take a node reference when mod_timer() returns 0
                         * (presumably: the timer was not already pending)
                         */
                        intv = jiffies + msecs_to_jiffies(n->keepalive_intv);
                        if (!mod_timer(&n->timer, intv))
                                tipc_node_get(n);
                }
        }
        /* Record the accepted media address for this link entry */
        memcpy(&le->maddr, maddr, sizeof(*maddr));
exit:
        tipc_node_write_unlock(n);
        /* The link is taken down only after the write lock is released */
        if (reset && !link_is_reset)
                tipc_node_link_down(n, b->identity, false);
        tipc_node_put(n);
}

void tipc_node_delete_links(struct net *net, int bearer_id)
{
        struct tipc_net *tn = net_generic(net, tipc_net_id);
        struct tipc_node *n;

        rcu_read_lock();
        list_for_each_entry_rcu(n, &tn->node_list, list) {
                tipc_node_link_down(n, bearer_id, true);
        }
        rcu_read_unlock();
}

/* tipc_node_reset_links - take down the link on every bearer towards @n
 * Logs a warning first; bearers are visited in ascending id order.
 */
static void tipc_node_reset_links(struct tipc_node *n)
{
        int bearer_id = 0;

        pr_warn("Resetting all links to %x\n", n->addr);

        trace_tipc_node_reset_links(n, true, " ");
        while (bearer_id < MAX_BEARERS) {
                tipc_node_link_down(n, bearer_id, false);
                bearer_id++;
        }
}

/* tipc_node_fsm_evt - node finite state machine
 * Determines when contact is allowed with peer node
 *
 * @n: node whose state is to be advanced
 * @evt: event to apply to the current state
 *
 * The outer switch selects on the current state, the inner one on the
 * event. An event either moves the node to a new state, is accepted
 * without a state change, or is rejected as illegal. On an illegal event
 * an error is logged and n->state is left untouched.
 */
static void tipc_node_fsm_evt(struct tipc_node *n, int evt)
{
        /* Work on a local copy; n->state is only written on success */
        int state = n->state;

        switch (state) {
        case SELF_DOWN_PEER_DOWN:
                /* No contact in either direction yet */
                switch (evt) {
                case SELF_ESTABL_CONTACT_EVT:
                        state = SELF_UP_PEER_COMING;
                        break;
                case PEER_ESTABL_CONTACT_EVT:
                        state = SELF_COMING_PEER_UP;
                        break;
                case SELF_LOST_CONTACT_EVT:
                case PEER_LOST_CONTACT_EVT:
                        break;
                case NODE_SYNCH_END_EVT:
                case NODE_SYNCH_BEGIN_EVT:
                case NODE_FAILOVER_BEGIN_EVT:
                case NODE_FAILOVER_END_EVT:
                default:
                        goto illegal_evt;
                }
                break;
        case SELF_UP_PEER_UP:
                /* The only state from which synch or failover can begin */
                switch (evt) {
                case SELF_LOST_CONTACT_EVT:
                        state = SELF_DOWN_PEER_LEAVING;
                        break;
                case PEER_LOST_CONTACT_EVT:
                        state = SELF_LEAVING_PEER_DOWN;
                        break;
                case NODE_SYNCH_BEGIN_EVT:
                        state = NODE_SYNCHING;
                        break;
                case NODE_FAILOVER_BEGIN_EVT:
                        state = NODE_FAILINGOVER;
                        break;
                case SELF_ESTABL_CONTACT_EVT:
                case PEER_ESTABL_CONTACT_EVT:
                case NODE_SYNCH_END_EVT:
                case NODE_FAILOVER_END_EVT:
                        break;
                default:
                        goto illegal_evt;
                }
                break;
        case SELF_DOWN_PEER_LEAVING:
                /* Only PEER_LOST_CONTACT_EVT changes the state here */
                switch (evt) {
                case PEER_LOST_CONTACT_EVT:
                        state = SELF_DOWN_PEER_DOWN;
                        break;
                case SELF_ESTABL_CONTACT_EVT:
                case PEER_ESTABL_CONTACT_EVT:
                case SELF_LOST_CONTACT_EVT:
                        break;
                case NODE_SYNCH_END_EVT:
                case NODE_SYNCH_BEGIN_EVT:
                case NODE_FAILOVER_BEGIN_EVT:
                case NODE_FAILOVER_END_EVT:
                default:
                        goto illegal_evt;
                }
                break;
        case SELF_UP_PEER_COMING:
                /* NODE_SYNCH_END_EVT and NODE_FAILOVER_BEGIN_EVT are
                 * accepted here without a state change
                 */
                switch (evt) {
                case PEER_ESTABL_CONTACT_EVT:
                        state = SELF_UP_PEER_UP;
                        break;
                case SELF_LOST_CONTACT_EVT:
                        state = SELF_DOWN_PEER_DOWN;
                        break;
                case SELF_ESTABL_CONTACT_EVT:
                case PEER_LOST_CONTACT_EVT:
                case NODE_SYNCH_END_EVT:
                case NODE_FAILOVER_BEGIN_EVT:
                        break;
                case NODE_SYNCH_BEGIN_EVT:
                case NODE_FAILOVER_END_EVT:
                default:
                        goto illegal_evt;
                }
                break;
        case SELF_COMING_PEER_UP:
                /* Mirror image of SELF_UP_PEER_COMING for the contact
                 * events; all synch/failover events are illegal
                 */
                switch (evt) {
                case SELF_ESTABL_CONTACT_EVT:
                        state = SELF_UP_PEER_UP;
                        break;
                case PEER_LOST_CONTACT_EVT:
                        state = SELF_DOWN_PEER_DOWN;
                        break;
                case SELF_LOST_CONTACT_EVT:
                case PEER_ESTABL_CONTACT_EVT:
                        break;
                case NODE_SYNCH_END_EVT:
                case NODE_SYNCH_BEGIN_EVT:
                case NODE_FAILOVER_BEGIN_EVT:
                case NODE_FAILOVER_END_EVT:
                default:
                        goto illegal_evt;
                }
                break;
        case SELF_LEAVING_PEER_DOWN:
                /* Only SELF_LOST_CONTACT_EVT changes the state here */
                switch (evt) {
                case SELF_LOST_CONTACT_EVT:
                        state = SELF_DOWN_PEER_DOWN;
                        break;
                case SELF_ESTABL_CONTACT_EVT:
                case PEER_ESTABL_CONTACT_EVT:
                case PEER_LOST_CONTACT_EVT:
                        break;
                case NODE_SYNCH_END_EVT:
                case NODE_SYNCH_BEGIN_EVT:
                case NODE_FAILOVER_BEGIN_EVT:
                case NODE_FAILOVER_END_EVT:
                default:
                        goto illegal_evt;
                }
                break;
        case NODE_FAILINGOVER:
                /* Left via lost contact or NODE_FAILOVER_END_EVT; synch
                 * events are illegal while failing over
                 */
                switch (evt) {
                case SELF_LOST_CONTACT_EVT:
                        state = SELF_DOWN_PEER_LEAVING;
                        break;
                case PEER_LOST_CONTACT_EVT:
                        state = SELF_LEAVING_PEER_DOWN;
                        break;
                case NODE_FAILOVER_END_EVT:
                        state = SELF_UP_PEER_UP;
                        break;
                case NODE_FAILOVER_BEGIN_EVT:
                case SELF_ESTABL_CONTACT_EVT:
                case PEER_ESTABL_CONTACT_EVT:
                        break;
                case NODE_SYNCH_BEGIN_EVT:
                case NODE_SYNCH_END_EVT:
                default:
                        goto illegal_evt;
                }
                break;
        case NODE_SYNCHING:
                /* A failover may begin while synching and takes over;
                 * NODE_FAILOVER_END_EVT is illegal here
                 */
                switch (evt) {
                case SELF_LOST_CONTACT_EVT:
                        state = SELF_DOWN_PEER_LEAVING;
                        break;
                case PEER_LOST_CONTACT_EVT:
                        state = SELF_LEAVING_PEER_DOWN;
                        break;
                case NODE_SYNCH_END_EVT:
                        state = SELF_UP_PEER_UP;
                        break;
                case NODE_FAILOVER_BEGIN_EVT:
                        state = NODE_FAILINGOVER;
                        break;
                case NODE_SYNCH_BEGIN_EVT:
                case SELF_ESTABL_CONTACT_EVT:
                case PEER_ESTABL_CONTACT_EVT:
                        break;
                case NODE_FAILOVER_END_EVT:
                default:
                        goto illegal_evt;
                }
                break;
        default:
                /* Unknown state: log it; the value is written back as is */
                pr_err("Unknown node fsm state %x\n", state);
                break;
        }
        /* Trace old and new state, then commit the transition */
        trace_tipc_node_fsm(n->peer_id, n->state, state, evt);
        n->state = state;
        return;

illegal_evt:
        /* Rejected event: log and trace it, n->state is not modified */
        pr_err("Illegal node fsm evt %x in state %x\n", evt, state);
        trace_tipc_node_fsm(n->peer_id, n->state, state, evt);
}

/* node_lost_contact - clean up after contact with node @n was lost
 * @n: the node contact was lost with
 * @inputq: queue receiving one TIPC_ERR_NO_NODE message per connection
 *          registered on the node
 *
 * Schedules the node for cleanup, clears its broadcast state, ends any
 * link failover, flags the node as down and releases all entries of the
 * node's connection list.
 */
static void node_lost_contact(struct tipc_node *n,
                              struct sk_buff_head *inputq)
{
        struct list_head *conn_list = &n->conn_sks;
        struct tipc_sock_conn *cur, *nxt;
        struct sk_buff *abort_msg;
        struct tipc_link *link;
        uint bearer;

        pr_debug("Lost contact with %x\n", n->addr);
        n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER);
        trace_tipc_node_lost_contact(n, true, " ");

        /* Broadcast state: detach the peer and drop queued name messages */
        tipc_bcast_remove_peer(n->net, n->bc_entry.link);
        skb_queue_purge(&n->bc_entry.namedq);

        /* Every existing link gets a failover-end event */
        for (bearer = 0; bearer < MAX_BEARERS; bearer++) {
                link = n->links[bearer].link;
                if (!link)
                        continue;
                tipc_link_fsm_evt(link, LINK_FAILOVER_END_EVT);
        }

        /* Flag the node as down and forget its peer namespace data */
        n->action_flags |= TIPC_NOTIFY_NODE_DOWN;
        n->peer_net = NULL;
        n->peer_hash_mix = 0;

        /* Tell each connected socket, then free its list entry; a failed
         * message allocation is skipped but the entry is still released
         */
        list_for_each_entry_safe(cur, nxt, conn_list, list) {
                abort_msg = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
                                            TIPC_CONN_MSG, SHORT_H_SIZE, 0,
                                            tipc_own_addr(n->net),
                                            cur->peer_node, cur->port,
                                            cur->peer_port, TIPC_ERR_NO_NODE);
                if (likely(abort_msg))
                        skb_queue_tail(inputq, abort_msg);
                list_del(&cur->list);
                kfree(cur);
        }
}

/**
 * tipc_node_get_linkname - get the name of a link
 *
 * @net: the applicable net namespace
 * @bearer_id: id of the bearer
 * @addr: peer node address
 * @linkname: link name output buffer
 * @len: size of @linkname output buffer
 *
 * The name is copied under the node read lock. The output is always
 * NUL-terminated when @len is non-zero; a name that does not fit is
 * truncated. The remainder of the buffer is zero-filled.
 *
 * Return: 0 on success, -EINVAL if no node is found for @addr, if
 * @bearer_id is out of range, or if the node has no link on that bearer
 */
int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr,
                           char *linkname, size_t len)
{
        struct tipc_link *link;
        int err = -EINVAL;
        struct tipc_node *node = tipc_node_find(net, addr);

        if (!node)
                return err;

        /* Reject ids that would index past the node's link array */
        if (bearer_id >= MAX_BEARERS)
                goto exit;

        tipc_node_read_lock(node);
        link = node->links[bearer_id].link;
        if (link) {
                if (len) {
                        strncpy(linkname, tipc_link_name(link), len);
                        /* strncpy() writes no terminator when the source
                         * is @len bytes or longer, so force one
                         */
                        linkname[len - 1] = '\0';
                }
                err = 0;
        }
        tipc_node_read_unlock(node);
exit:
        /* Drop the reference obtained by tipc_node_find() */
        tipc_node_put(node);
        return err;
}

/* __tipc_nl_add_node - append one TIPC_NL_NODE_GET entry for @node to @msg
 * Caller should hold node lock for the passed node.
 * The entry carries the node address and, when the node is up, the
 * TIPC_NLA_NODE_UP flag.
 * Returns 0 on success, or -EMSGSIZE after cancelling the partial entry.
 */
static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node)
{
        struct nlattr *nest;
        void *genl_hdr;

        genl_hdr = genlmsg_put(msg->skb, msg->portid, msg->seq,
                               &tipc_genl_family, NLM_F_MULTI,
                               TIPC_NL_NODE_GET);
        if (!genl_hdr)
                return -EMSGSIZE;

        nest = nla_nest_start_noflag(msg->skb, TIPC_NLA_NODE);
        if (!nest)
                goto cancel_msg;

        if (nla_put_u32(msg->skb, TIPC_NLA_NODE_ADDR, node->addr))
                goto cancel_nest;
        if (node_is_up(node) && nla_put_flag(msg->skb, TIPC_NLA_NODE_UP))
                goto cancel_nest;

        nla_nest_end(msg->skb, nest);
        genlmsg_end(msg->skb, genl_hdr);

        return 0;

cancel_nest:
        nla_nest_cancel(msg->skb, nest);
cancel_msg:
        genlmsg_cancel(msg->skb, genl_hdr);

        return -EMSGSIZE;
}

static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list)
{
        struct tipc_msg *hdr = buf_msg(skb_peek(list));
        struct sk_buff_head inputq;

        switch (msg_user(hdr)) {
        case TIPC_LOW_IMPORTANCE:
        case TIPC_MEDIUM_IMPORTANCE:
        case TIPC_HIGH_IMPORTANCE:
        case TIPC_CRITICAL_IMPORTANCE:
                if (msg_connected(hdr) || msg_named(hdr) ||
                    msg_direct(hdr)) {
                        tipc_loopback_trace(peer_net, list);
                        spin_lock_init(&list->lock);
                        tipc_sk_rcv(peer_net, list);
                        return;
                }
                if (msg_mcast(hdr)) {
                        tipc_loopback_trace(peer_net, list);
                        skb_queue_head_init(&inputq);
                        tipc_sk_mcast_rcv(peer_net, list, &inputq);
                        __skb_queue_purge(list);
                        skb_queue_purge(&inputq);
                        return;
                }
                return;
        case MSG_FRAGMENTER:
                if (tipc_msg_assemble(list)) {
                        tipc_loopback_trace(peer_net, list);
                        skb_queue_head_init(&inputq);
                        tipc_sk_mcast_rcv(peer_net, list, &inputq);
                        __skb_queue_purge(list);
                        skb_queue_purge(&inputq);
                }
                return;
        case GROUP_PROTOCOL:
        case CONN_MANAGER:
                tipc_loopback_trace(peer_net, list);
                spin_lock_init(&list->lock);
                tipc_sk_rcv(peer_net, list);
                return;
        case LINK_PROTOCOL:
        case NAME_DISTRIBUTOR:
        case TUNNEL_PROTOCOL:
        case BCAST_PROTOCOL:
                return;
        default:
                return;
        }
}

/**
 * tipc_node_xmit() - general link level function for message sending
 * @net: the applicable net namespace
 * @list: chain of buffers containing message
 * @dnode: address of destination node
 * @selector: a number used for deterministic link selection; its lowest
 *            bit picks one of the node's two active-link slots
 * Consumes the buffer chain.
 * Return: 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUFS
 */
int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
                   u32 dnode, int selector)
{
        struct tipc_link_entry *entry;
        struct sk_buff_head txq;
        struct tipc_node *node;
        struct net *container;
        bool consumed = false;
        bool up;
        int bid;
        int rc;

        /* Destination is this node: hand the chain straight to sockets */
        if (in_own_node(net, dnode)) {
                tipc_loopback_trace(net, list);
                spin_lock_init(&list->lock);
                tipc_sk_rcv(net, list);
                return 0;
        }

        node = tipc_node_find(net, dnode);
        if (unlikely(!node)) {
                __skb_queue_purge(list);
                return -EHOSTUNREACH;
        }

        /* Sample node state under the read lock, then try the shortcut
         * into the peer's net namespace while still inside the RCU section
         */
        rcu_read_lock();
        tipc_node_read_lock(node);
        up = node_is_up(node);
        container = node->peer_net;
        tipc_node_read_unlock(node);
        if (up && container && check_net(container)) {
                /* xmit inner linux container */
                tipc_lxc_xmit(container, list);
                consumed = skb_queue_empty(list);
        }
        rcu_read_unlock();

        if (consumed) {
                tipc_node_put(node);
                return 0;
        }

        /* Regular path: pick the active link selected by the lowest bit */
        tipc_node_read_lock(node);
        bid = node->active_links[selector & 1];
        if (unlikely(bid == INVALID_BEARER_ID)) {
                tipc_node_read_unlock(node);
                tipc_node_put(node);
                __skb_queue_purge(list);
                return -EHOSTUNREACH;
        }

        __skb_queue_head_init(&txq);
        entry = &node->links[bid];
        spin_lock_bh(&entry->lock);
        rc = tipc_link_xmit(entry->link, list, &txq);
        spin_unlock_bh(&entry->lock);
        tipc_node_read_unlock(node);

        /* -ENOBUFS takes the link down; any other result sends the queue */
        if (likely(rc != -ENOBUFS))
                tipc_bearer_xmit(net, bid, &txq, &entry->maddr, node);
        else
                tipc_node_link_down(node, bid, false);

        tipc_node_put(node);

        return rc;
}

/* tipc_node_xmit_skb(): send single buffer to destination
 * Buffers sent via this function are generally TIPC_SYSTEM_IMPORTANCE
 * messages, which will not be rejected
 * The only exception is datagram messages rerouted after secondary
 * lookup, which are rare and safe to dispose of anyway.
 * The result of tipc_node_xmit() is deliberately dropped; this function
 * always returns 0.
 */
int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
                       u32 selector)
{
        struct sk_buff_head chain;

        /* Wrap the lone buffer in a one-element chain */
        __skb_queue_head_init(&chain);
        __skb_queue_tail(&chain, skb);

        tipc_node_xmit(net, &chain, dnode, selector);

        return 0;
}

/* tipc_node_distr_xmit(): send single buffer msgs to individual destinations
 * Note: this is only for SYSTEM_IMPORTANCE messages, which cannot be rejected
 * Each buffer is dequeued from @xmitq and sent to the destination node in
 * its header, with the header's origin port as link selector.
 * Always returns 0.
 */
int tipc_node_distr_xmit(struct net *net, struct sk_buff_head *xmitq)
{
        struct tipc_msg *hdr;
        struct sk_buff *skb;
        u32 selector, dnode;

        for (skb = __skb_dequeue(xmitq); skb; skb = __skb_dequeue(xmitq)) {
                hdr = buf_msg(skb);
                selector = msg_origport(hdr);
                dnode = msg_destnode(hdr);
                tipc_node_xmit_skb(net, skb, dnode, selector);
        }
        return 0;
}

void tipc_node_broadcast(struct net *net, struct sk_buff *skb, int rc_dests)
{
        struct sk_buff_head xmitq;
        struct sk_buff *txskb;
        struct tipc_node *n;
        u16 dummy;
        u32 dst;

        /* Use broadcast if all nodes support it */
        if (!rc_dests && tipc_bcast_get_mode(net) != BCLINK_MODE_RCAST) {
                __skb_queue_head_init(&xmitq);
                __skb_queue_tail(&xmitq, skb);
                tipc_bcast_xmit(net, &xmitq, &dummy);
                return;
        }

        /* Otherwise use legacy replicast method */
        rcu_read_lock();
        list_for_each_entry_rcu(n, tipc_nodes(net), list) {
                dst = n->addr;
                if (in_own_node(net, dst))
                        continue;
                if (!node_is_up(n))
                        continue;
                txskb = pskb_copy(skb, GFP_ATOMIC);
                if (!txskb)
                        break;
                msg_set_destnode(buf_msg(txskb), dst);
                tipc_node_xmit_skb(net, txskb, dst, 0);
        }
        rcu_read_unlock();
        kfree_skb(skb);
}

/* tipc_node_mcast_rcv - move buffers from the node's broadcast inputq1 to
 * its arrival queue and hand them to tipc_sk_mcast_rcv()
 */
static void tipc_node_mcast_rcv(struct tipc_node *n)
{
        struct tipc_bclink_entry *bce = &n->bc_entry;
        struct sk_buff_head *arrvq = &bce->arrvq;

        /* 'arrvq' is under inputq2's lock protection, so inputq2's lock
         * is taken first and inputq1's lock nests inside it
         */
        spin_lock_bh(&bce->inputq2.lock);
        spin_lock_bh(&bce->inputq1.lock);
        skb_queue_splice_tail_init(&bce->inputq1, arrvq);
        spin_unlock_bh(&bce->inputq1.lock);
        spin_unlock_bh(&bce->inputq2.lock);

        tipc_sk_mcast_rcv(n->net, arrvq, &bce->inputq2);
}

/* tipc_node_bc_sync_rcv - pass a header to the broadcast link's sync
 * handler and act on the result
 * @n: node the header relates to
 * @hdr: message header passed on to tipc_bcast_sync_rcv()
 * @bearer_id: bearer whose unicast link may carry a STATE message
 * @xmitq: queue collecting messages to be sent
 *
 * A link-down result resets all links to the node. A request to send
 * state builds a STATE message on the unicast link, unless @hdr is a
 * probe.
 */
static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr,
                                  int bearer_id, struct sk_buff_head *xmitq)
{
        struct tipc_link *uc_link;
        int rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr, xmitq);

        if (rc & TIPC_LINK_DOWN_EVT) {
                tipc_node_reset_links(n);
                return;
        }

        /* Nothing to build unless state was requested; and if probe
         * message, a STATE response will be sent anyway
         */
        if (!(rc & TIPC_LINK_SND_STATE) || msg_probe(hdr))
                return;

        /* Produce a STATE message carrying broadcast NACK */
        tipc_node_read_lock(n);
        uc_link = n->links[bearer_id].link;
        if (uc_link)
                tipc_link_build_state_msg(uc_link, xmitq);
        tipc_node_read_unlock(n);
}

/**
 * tipc_node_bc_rcv - process TIPC broadcast packet arriving from off-node
 * @net: the applicable net namespace
 * @skb: TIPC packet
 * @bearer_id: id of bearer message arrived on
 *
 * Invoked with no locks held.
 */
static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id)
{
        int rc;
        struct sk_buff_head xmitq;
        struct tipc_bclink_entry *be;
        struct tipc_link_entry *le;
        struct tipc_msg *hdr = buf_msg(skb);
        int usr = msg_user(hdr);
        u32 dnode = msg_destnode(hdr);
        struct tipc_node *n;

        __skb_queue_head_init(&xmitq);

        /* If NACK for other node, let rcv link for that node peek into it */
        if ((usr == BCAST_PROTOCOL) && (dnode != tipc_own_addr(net)))
                n = tipc_node_find(net, dnode);
        else
                n = tipc_node_find(net, msg_prevnode(hdr));
        if (!n) {
                kfree_skb(skb);
                return;
        }
        be = &n->bc_entry;
        le = &n->links[bearer_id];

        rc = tipc_bcast_rcv(net, be->link, skb);

        /* Broadcast ACKs are sent on a unicast link */
        if (rc & TIPC_LINK_SND_STATE) {
                tipc_node_read_lock(n);
                tipc_link_build_state_msg(le->link, &xmitq);
                tipc_node_read_unlock(n);
        }

        if (!skb_queue_empty(&xmitq))
                tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n);

        if (!skb_queue_empty(&be->inputq1))
                tipc_node_mcast_rcv(n);

        /* Handle NAME_DISTRIBUTOR messages sent from 1.7 nodes */
        if (!skb_queue_empty(&n->bc_entry.namedq))
                tipc_named_rcv(net, &n->bc_entry.namedq,
                               &n->bc_entry.named_rcv_nxt,
                               &n->bc_entry.named_open);

        /* If reassembly or retransmission failure => reset all links to peer */
        if (rc & TIPC_LINK_DOWN_EVT)
                tipc_node_reset_links(n);

        tipc_node_put(n);
}

/**
 * tipc_node_check_state - check and if necessary update node state
 * @n: target tipc_node
 * @skb: TIPC packet
 * @bearer_id: identity of bearer delivering the packet
 * @xmitq: queue for messages to be xmited on
 *
 * Fast path: a node in SELF_UP_PEER_UP receiving anything but a
 * TUNNEL_PROTOCOL packet is accepted at once. Otherwise the packet may
 * drive the node state machine (peer contact, failover, synchronization)
 * before the verdict is given.
 *
 * Return: true if state and msg are ok, otherwise false
 */
static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
                                  int bearer_id, struct sk_buff_head *xmitq)
{
        struct tipc_msg *hdr = buf_msg(skb);
        int usr = msg_user(hdr);
        int mtyp = msg_type(hdr);
        u16 oseqno = msg_seqno(hdr);
        u16 exp_pkts = msg_msgcnt(hdr);
        u16 rcv_nxt, syncpt, dlv_nxt, inputq_len;
        /* Snapshot of the node state on entry; later checks that must see
         * updates made in this function read n->state directly instead
         */
        int state = n->state;
        struct tipc_link *l, *tnl, *pl = NULL;
        struct tipc_media_addr *maddr;
        int pb_id;

        if (trace_tipc_node_check_state_enabled()) {
                trace_tipc_skb_dump(skb, false, "skb for node state check");
                trace_tipc_node_check_state(n, true, " ");
        }
        /* Without a link on the delivering bearer the packet is rejected */
        l = n->links[bearer_id].link;
        if (!l)
                return false;
        rcv_nxt = tipc_link_rcv_nxt(l);


        if (likely((state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL)))
                return true;

        /* Find parallel link, if any */
        for (pb_id = 0; pb_id < MAX_BEARERS; pb_id++) {
                if ((pb_id != bearer_id) && n->links[pb_id].link) {
                        pl = n->links[pb_id].link;
                        break;
                }
        }

        if (!tipc_link_validate_msg(l, hdr)) {
                trace_tipc_skb_dump(skb, false, "PROTO invalid (2)!");
                trace_tipc_link_dump(l, TIPC_DUMP_NONE, "PROTO invalid (2)!");
                return false;
        }

        /* Check and update node accessibility if applicable */
        if (state == SELF_UP_PEER_COMING) {
                if (!tipc_link_is_up(l))
                        return true;
                if (!msg_peer_link_is_up(hdr))
                        return true;
                tipc_node_fsm_evt(n, PEER_ESTABL_CONTACT_EVT);
        }

        if (state == SELF_DOWN_PEER_LEAVING) {
                /* Reject while the header still reports the peer node up */
                if (msg_peer_node_is_up(hdr))
                        return false;
                tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT);
                return true;
        }

        if (state == SELF_LEAVING_PEER_DOWN)
                return false;

        /* Ignore duplicate packets */
        if ((usr != LINK_PROTOCOL) && less(oseqno, rcv_nxt))
                return true;

        /* Initiate or update failover mode if applicable */
        if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) {
                /* Candidate sync point: this packet's seqno plus its
                 * message count, minus one
                 */
                syncpt = oseqno + exp_pkts - 1;
                if (pl && !tipc_link_is_reset(pl)) {
                        /* NOTE(review): maddr is only passed as an output
                         * argument here and is not read afterwards
                         */
                        __tipc_node_link_down(n, &pb_id, xmitq, &maddr);
                        trace_tipc_node_link_down(n, true,
                                                  "node link down <- failover!");
                        tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl),
                                                        tipc_link_inputq(l));
                }

                /* If parallel link was already down, and this happened before
                 * the tunnel link came up, node failover was never started.
                 * Ensure that a FAILOVER_MSG is sent to get peer out of
                 * NODE_FAILINGOVER state, also this node must accept
                 * TUNNEL_MSGs from peer.
                 */
                if (n->state != NODE_FAILINGOVER)
                        tipc_node_link_failover(n, pl, l, xmitq);

                /* If pkts arrive out of order, use lowest calculated syncpt */
                if (less(syncpt, n->sync_point))
                        n->sync_point = syncpt;
        }

        /* Open parallel link when tunnel link reaches synch point */
        if ((n->state == NODE_FAILINGOVER) && tipc_link_is_up(l)) {
                /* Still at or before the sync point: keep failing over */
                if (!more(rcv_nxt, n->sync_point))
                        return true;
                tipc_node_fsm_evt(n, NODE_FAILOVER_END_EVT);
                if (pl)
                        tipc_link_fsm_evt(pl, LINK_FAILOVER_END_EVT);
                return true;
        }

        /* No syncing needed if only one link */
        if (!pl || !tipc_link_is_up(pl))
                return true;

        /* Initiate synch mode if applicable */
        if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) {
                /* The sync point comes from the header when the node has
                 * TIPC_TUNNEL_ENHANCED, else it is derived from the inner
                 * header's seqno and the message count
                 */
                if (n->capabilities & TIPC_TUNNEL_ENHANCED)
                        syncpt = msg_syncpt(hdr);
                else
                        syncpt = msg_seqno(msg_inner_hdr(hdr)) + exp_pkts - 1;
                if (!tipc_link_is_up(l))
                        __tipc_node_link_up(n, bearer_id, xmitq);
                if (n->state == SELF_UP_PEER_UP) {
                        n->sync_point = syncpt;
                        tipc_link_fsm_evt(l, LINK_SYNCH_BEGIN_EVT);
                        tipc_node_fsm_evt(n, NODE_SYNCH_BEGIN_EVT);
                }
        }

        /* Open tunnel link when parallel link reaches synch point */
        if (n->state == NODE_SYNCHING) {
                /* Whichever link is synching is the tunnel link (tnl); the
                 * other one becomes the parallel link (pl)
                 */
                if (tipc_link_is_synching(l)) {
                        tnl = l;
                } else {
                        tnl = pl;
                        pl = l;
                }
                /* Next packet to deliver on pl: its receive-next value
                 * minus what is still waiting in its input queue
                 */
                inputq_len = skb_queue_len(tipc_link_inputq(pl));
                dlv_nxt = tipc_link_rcv_nxt(pl) - inputq_len;
                if (more(dlv_nxt, n->sync_point)) {
                        tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT);
                        tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT);
                        return true;
                }
                /* Still synching: accept packets on the parallel link,
                 * SYNCH tunnel packets and link protocol packets; reject
                 * everything else
                 */
                if (l == pl)
                        return true;
                if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG))
                        return true;
                if (usr == LINK_PROTOCOL)
                        return true;
                return false;
        }
        return true;
}

/**
 * tipc_rcv - process TIPC packets/messages arriving from off-node
 * @net: the applicable net namespace
 * @skb: TIPC packet
 * @b: pointer to bearer message arrived on
 *
 * Invoked with no locks held. Bearer pointer must point to a valid bearer
 * structure (i.e. cannot be NULL), but bearer can be inactive.
 *
 * The buffer never stays with the caller: on every path it is either passed
 * on to a callee (crypto, discovery, broadcast or link receive) or released
 * with kfree_skb() at the 'discard' label.
 */
void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
{
        struct sk_buff_head xmitq;
        struct tipc_link_entry *le;
        struct tipc_msg *hdr;
        struct tipc_node *n;
        int bearer_id = b->identity;
        u32 self = tipc_own_addr(net);
        int usr, rc = 0;
        u16 bc_ack;
#ifdef CONFIG_TIPC_CRYPTO
        struct tipc_ehdr *ehdr;

        /* Check if message must be decrypted first */
        if (TIPC_SKB_CB(skb)->decrypted || !tipc_ehdr_validate(skb))
                goto rcv;

        /* Pick the sending node: by address for ordinary traffic, by node
         * identity for LINK_CONFIG messages. Only the by-address lookup
         * treats an unknown sender as fatal; the by-id lookup may leave
         * 'n' NULL, in which case no RX crypto handle is passed below.
         */
        ehdr = (struct tipc_ehdr *)skb->data;
        if (likely(ehdr->user != LINK_CONFIG)) {
                n = tipc_node_find(net, ntohl(ehdr->addr));
                if (unlikely(!n))
                        goto discard;
        } else {
                n = tipc_node_find_by_id(net, ehdr->id);
        }
        skb_dst_force(skb);
        /* NOTE(review): the node looked up above is not released in this
         * function; presumably tipc_crypto_rcv() takes over that
         * reference - confirm against the crypto code.
         */
        tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b);
        /* The callee may have cleared 'skb'; then nothing is left to do */
        if (!skb)
                return;

rcv:
#endif
        /* Ensure message is well-formed before touching the header */
        if (unlikely(!tipc_msg_validate(&skb)))
                goto discard;
        __skb_queue_head_init(&xmitq);
        hdr = buf_msg(skb);
        usr = msg_user(hdr);
        bc_ack = msg_bcast_ack(hdr);

        /* Handle arrival of discovery or broadcast packet */
        if (unlikely(msg_non_seq(hdr))) {
                if (unlikely(usr == LINK_CONFIG))
                        return tipc_disc_rcv(net, skb, b);
                else
                        return tipc_node_bc_rcv(net, skb, bearer_id);
        }

        /* Discard unicast link messages destined for another node */
        if (unlikely(!msg_short(hdr) && (msg_destnode(hdr) != self)))
                goto discard;

        /* Locate neighboring node that sent packet */
        n = tipc_node_find(net, msg_prevnode(hdr));
        if (unlikely(!n))
                goto discard;
        le = &n->links[bearer_id];

        /* Ensure broadcast reception is in synch with peer's send state */
        if (unlikely(usr == LINK_PROTOCOL)) {
                if (unlikely(skb_linearize(skb))) {
                        tipc_node_put(n);
                        goto discard;
                }
                /* Re-read the header pointer after linearizing the buffer */
                hdr = buf_msg(skb);
                tipc_node_bc_sync_rcv(n, hdr, bearer_id, &xmitq);
        } else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) {
                tipc_bcast_ack_rcv(net, n->bc_entry.link, hdr);
        }

        /* Receive packet directly if conditions permit */
        tipc_node_read_lock(n);
        if (likely((n->state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) {
                spin_lock_bh(&le->lock);
                if (le->link) {
                        rc = tipc_link_rcv(le->link, skb, &xmitq);
                        /* Buffer was handed to the link; mark it as gone */
                        skb = NULL;
                }
                spin_unlock_bh(&le->lock);
        }
        tipc_node_read_unlock(n);

        /* Check/update node state before receiving */
        if (unlikely(skb)) {
                if (unlikely(skb_linearize(skb)))
                        goto out_node_put;
                tipc_node_write_lock(n);
                if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) {
                        if (le->link) {
                                rc = tipc_link_rcv(le->link, skb, &xmitq);
                                skb = NULL;
                        }
                }
                tipc_node_write_unlock(n);
        }

        /* Act on the link events reported by tipc_link_rcv(), with the
         * node lock no longer held by this function
         */
        if (unlikely(rc & TIPC_LINK_UP_EVT))
                tipc_node_link_up(n, bearer_id, &xmitq);

        if (unlikely(rc & TIPC_LINK_DOWN_EVT))
                tipc_node_link_down(n, bearer_id, false);

        /* Drain whatever the receive step left on the node's queues */
        if (unlikely(!skb_queue_empty(&n->bc_entry.namedq)))
                tipc_named_rcv(net, &n->bc_entry.namedq,
                               &n->bc_entry.named_rcv_nxt,
                               &n->bc_entry.named_open);

        if (unlikely(!skb_queue_empty(&n->bc_entry.inputq1)))
                tipc_node_mcast_rcv(n);

        if (!skb_queue_empty(&le->inputq))
                tipc_sk_rcv(net, &le->inputq);

        if (!skb_queue_empty(&xmitq))
                tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n);

out_node_put:
        /* Balances the tipc_node_find() done on the sender address */
        tipc_node_put(n);
discard:
        /* 'skb' is NULL here when a link receive already took the buffer */
        kfree_skb(skb);
}

/**
 * tipc_node_apply_property - apply a changed bearer property to its links
 * @net: the applicable net namespace
 * @b: bearer whose property changed; supplies the bearer id and new values
 * @prop: TIPC_NLA_PROP_TOL or TIPC_NLA_PROP_MTU; other values only refresh
 *        the cached MTU of each link entry
 *
 * Walks every node under RCU and updates the link (if any) that the node
 * has on this bearer. Anything queued on the local transmit queue is sent
 * after the node lock has been released.
 */
void tipc_node_apply_property(struct net *net, struct tipc_bearer *b,
                              int prop)
{
        struct tipc_net *tn = tipc_net(net);
        int bearer_id = b->identity;
        struct tipc_link_entry *entry;
        struct sk_buff_head xmitq;
        struct tipc_node *node;

        __skb_queue_head_init(&xmitq);

        rcu_read_lock();

        list_for_each_entry_rcu(node, &tn->node_list, list) {
                tipc_node_write_lock(node);
                entry = &node->links[bearer_id];
                if (entry->link) {
                        switch (prop) {
                        case TIPC_NLA_PROP_TOL:
                                tipc_link_set_tolerance(entry->link,
                                                        b->tolerance, &xmitq);
                                break;
                        case TIPC_NLA_PROP_MTU:
                                tipc_link_set_mtu(entry->link, b->mtu);
                                break;
                        default:
                                break;
                        }

                        /* Keep the entry's cached MTU in step with the link */
                        entry->mtu = tipc_link_mss(entry->link);
                }
                tipc_node_write_unlock(node);

                tipc_bearer_xmit(net, bearer_id, &xmitq, &entry->maddr, NULL);
        }

        rcu_read_unlock();
}

/**
 * tipc_nl_peer_rm - netlink handler removing a peer node that is down
 * @skb: request buffer; its socket identifies the net namespace
 * @info: generic netlink request info carrying the TIPC_NLA_NET nest
 *
 * The peer is selected either by TIPC_NLA_NET_ADDR or by the 128-bit node
 * identity in TIPC_NLA_NET_NODEID / TIPC_NLA_NET_NODEID_W1, which is hashed
 * down to a 32-bit address. A request carrying neither selector is rejected.
 *
 * Return: 0 on success, -EINVAL on missing or invalid attributes, -ENOTSUPP
 * if in_own_node() matches the address, -ENXIO if no such peer is known,
 * -EBUSY if the peer is in neither of the two accepted "self down" states,
 * or the error reported by attribute parsing.
 */
int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
{
        struct net *net = sock_net(skb->sk);
        struct tipc_net *tn = net_generic(net, tipc_net_id);
        struct nlattr *attrs[TIPC_NLA_NET_MAX + 1];
        struct tipc_node *peer, *temp_node;
        u8 node_id[NODE_ID_LEN];
        u64 *w0 = (u64 *)&node_id[0];
        u64 *w1 = (u64 *)&node_id[8];
        u32 addr;
        int err;

        /* We identify the peer by its net */
        if (!info->attrs[TIPC_NLA_NET])
                return -EINVAL;

        err = nla_parse_nested_deprecated(attrs, TIPC_NLA_NET_MAX,
                                          info->attrs[TIPC_NLA_NET],
                                          tipc_nl_net_policy, info->extack);
        if (err)
                return err;

        /* At least one selector is required; without this check 'addr'
         * would be read uninitialized further down.
         */
        if (!attrs[TIPC_NLA_NET_ADDR] && !attrs[TIPC_NLA_NET_NODEID])
                return -EINVAL;

        /* attrs[TIPC_NLA_NET_NODEID] and attrs[TIPC_NLA_NET_ADDR] are
         * mutually exclusive cases
         */
        if (attrs[TIPC_NLA_NET_ADDR]) {
                addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]);
                if (!addr)
                        return -EINVAL;
        }

        if (attrs[TIPC_NLA_NET_NODEID]) {
                if (!attrs[TIPC_NLA_NET_NODEID_W1])
                        return -EINVAL;
                *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]);
                *w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]);
                addr = hash128to32(node_id);
        }

        if (in_own_node(net, addr))
                return -ENOTSUPP;

        spin_lock_bh(&tn->node_list_lock);
        peer = tipc_node_find(net, addr);
        if (!peer) {
                spin_unlock_bh(&tn->node_list_lock);
                return -ENXIO;
        }

        /* Only a peer towards which we are down may be removed */
        tipc_node_write_lock(peer);
        if (peer->state != SELF_DOWN_PEER_DOWN &&
            peer->state != SELF_DOWN_PEER_LEAVING) {
                tipc_node_write_unlock(peer);
                err = -EBUSY;
                goto err_out;
        }

        tipc_node_clear_links(peer);
        tipc_node_write_unlock(peer);
        tipc_node_delete(peer);

        /* Calculate cluster capabilities */
        tn->capabilities = TIPC_NODE_CAPABILITIES;
        list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
                tn->capabilities &= temp_node->capabilities;
        }
        tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST));
        err = 0;
err_out:
        /* Balances the tipc_node_find() above */
        tipc_node_put(peer);
        spin_unlock_bh(&tn->node_list_lock);

        return err;
}

/**
 * tipc_nl_node_dump - netlink dump callback listing the known peer nodes
 * @skb: dump buffer being filled
 * @cb: dump state; args[0] is the "done" flag and args[1] the address of
 *      the node at which a previous pass had to stop
 *
 * Return: skb->len after a pass, 0 once the dump has already completed, or
 * -EPIPE if the node to resume from can no longer be found.
 */
int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        struct tipc_net *tn = net_generic(net, tipc_net_id);
        int last_addr = cb->args[1];
        int done = cb->args[0];
        struct tipc_nl_msg msg;
        struct tipc_node *node;
        int err;

        if (done)
                return 0;

        msg.skb = skb;
        msg.portid = NETLINK_CB(cb->skb).portid;
        msg.seq = cb->nlh->nlmsg_seq;

        rcu_read_lock();
        if (last_addr) {
                node = tipc_node_find(net, last_addr);
                if (!node) {
                        rcu_read_unlock();
                        /* We never set seq or call nl_dump_check_consistent()
                         * this means that setting prev_seq here will cause the
                         * consistence check to fail in the netlink callback
                         * handler. Resulting in the NLMSG_DONE message having
                         * the NLM_F_DUMP_INTR flag set if the node state
                         * changed while we released the lock.
                         */
                        cb->prev_seq = 1;
                        return -EPIPE;
                }
                tipc_node_put(node);
        }

        list_for_each_entry_rcu(node, &tn->node_list, list) {
                if (node->preliminary)
                        continue;

                /* Skip forward until the resume point has been reached;
                 * the node stored there is emitted again.
                 */
                if (last_addr) {
                        if (node->addr != last_addr)
                                continue;
                        last_addr = 0;
                }

                tipc_node_read_lock(node);
                err = __tipc_nl_add_node(&msg, node);
                if (err)
                        last_addr = node->addr;
                tipc_node_read_unlock(node);

                if (err)
                        goto out;
        }
        done = 1;
out:
        cb->args[0] = done;
        cb->args[1] = last_addr;
        rcu_read_unlock();

        return skb->len;
}

/* tipc_node_find_by_name - locate owner node of link by link's name
 * @net: the applicable net namespace
 * @link_name: pointer to link name string
 * @bearer_id: pointer to index in 'node->links' array where the link was
 *             found; set to 0 when there is no match
 *
 * The node list is walked under RCU and each node is inspected under its
 * read lock. This function itself takes no reference on the node it returns.
 *
 * Returns pointer to node owning the link, or NULL if no matching link is
 * found.
 */
static struct tipc_node *tipc_node_find_by_name(struct net *net,
                                                const char *link_name,
                                                unsigned int *bearer_id)
{
        struct tipc_net *tn = net_generic(net, tipc_net_id);
        struct tipc_node *found_node = NULL;
        struct tipc_node *node;
        struct tipc_link *link;
        int slot;

        *bearer_id = 0;
        rcu_read_lock();
        list_for_each_entry_rcu(node, &tn->node_list, list) {
                tipc_node_read_lock(node);
                for (slot = 0; slot < MAX_BEARERS; slot++) {
                        link = node->links[slot].link;
                        if (!link || strcmp(tipc_link_name(link), link_name))
                                continue;

                        *bearer_id = slot;
                        found_node = node;
                        break;
                }
                tipc_node_read_unlock(node);

                if (found_node)
                        break;
        }
        rcu_read_unlock();

        return found_node;
}

/**
 * tipc_nl_node_set_link - netlink handler changing properties of a link
 * @skb: request buffer; its socket identifies the net namespace
 * @info: generic netlink request info carrying the TIPC_NLA_LINK nest
 *
 * A request naming the broadcast link is forwarded to
 * tipc_nl_bc_link_set(). Otherwise the link is looked up by name and any of
 * the tolerance, priority and window properties present in
 * TIPC_NLA_LINK_PROP are applied. Buffers queued on the local transmit
 * queue while doing so are sent once the node lock has been dropped.
 *
 * Return: 0 on success, -EINVAL if an attribute is missing or the link is
 * not found, or the error reported by attribute parsing.
 */
int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info)
{
        int err;
        int res = 0;
        /* Unsigned to match tipc_node_find_by_name()'s out-parameter */
        unsigned int bearer_id;
        char *name;
        struct tipc_link *link;
        struct tipc_node *node;
        struct sk_buff_head xmitq;
        struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1];
        struct net *net = sock_net(skb->sk);

        __skb_queue_head_init(&xmitq);

        if (!info->attrs[TIPC_NLA_LINK])
                return -EINVAL;

        err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX,
                                          info->attrs[TIPC_NLA_LINK],
                                          tipc_nl_link_policy, info->extack);
        if (err)
                return err;

        if (!attrs[TIPC_NLA_LINK_NAME])
                return -EINVAL;

        name = nla_data(attrs[TIPC_NLA_LINK_NAME]);

        if (strcmp(name, tipc_bclink_name) == 0)
                return tipc_nl_bc_link_set(net, attrs);

        node = tipc_node_find_by_name(net, name, &bearer_id);
        if (!node)
                return -EINVAL;

        tipc_node_read_lock(node);

        /* Re-check under the lock; the link is read again after the lookup */
        link = node->links[bearer_id].link;
        if (!link) {
                res = -EINVAL;
                goto out;
        }

        if (attrs[TIPC_NLA_LINK_PROP]) {
                struct nlattr *props[TIPC_NLA_PROP_MAX + 1];

                err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props);
                if (err) {
                        res = err;
                        goto out;
                }

                if (props[TIPC_NLA_PROP_TOL]) {
                        u32 tol;

                        tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
                        tipc_link_set_tolerance(link, tol, &xmitq);
                }
                if (props[TIPC_NLA_PROP_PRIO]) {
                        u32 prio;

                        prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
                        tipc_link_set_prio(link, prio, &xmitq);
                }
                if (props[TIPC_NLA_PROP_WIN]) {
                        u32 max_win;

                        /* Only the upper window limit is taken from the
                         * request; the lower one is kept as is.
                         */
                        max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
                        tipc_link_set_queue_limits(link,
                                                   tipc_link_min_win(link),
                                                   max_win);
                }
        }

out:
        tipc_node_read_unlock(node);
        tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr,
                         NULL);
        return res;
}

/**
 * tipc_nl_node_get_link - netlink handler returning the data of one link
 * @skb: request buffer (not used for the lookup itself)
 * @info: generic netlink request info carrying the TIPC_NLA_LINK nest
 *
 * Builds a fresh reply message describing either the broadcast link or the
 * named unicast link and sends it back with genlmsg_reply().
 *
 * Return: the result of genlmsg_reply() on success, -EINVAL if an attribute
 * is missing or the link is not found, -ENOMEM if the reply cannot be
 * allocated, or the error reported while parsing or filling the message.
 */
int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info)
{
        struct net *net = genl_info_net(info);
        struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1];
        struct tipc_nl_msg msg;
        char *name;
        int err;

        msg.portid = info->snd_portid;
        msg.seq = info->snd_seq;

        if (!info->attrs[TIPC_NLA_LINK])
                return -EINVAL;

        err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX,
                                          info->attrs[TIPC_NLA_LINK],
                                          tipc_nl_link_policy, info->extack);
        if (err)
                return err;

        if (!attrs[TIPC_NLA_LINK_NAME])
                return -EINVAL;

        name = nla_data(attrs[TIPC_NLA_LINK_NAME]);

        msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!msg.skb)
                return -ENOMEM;

        if (strcmp(name, tipc_bclink_name) == 0) {
                err = tipc_nl_add_bc_link(net, &msg, tipc_net(net)->bcl);
                if (err)
                        goto err_free;
        } else {
                /* Unsigned to match tipc_node_find_by_name()'s
                 * out-parameter
                 */
                unsigned int bearer_id;
                struct tipc_node *node;
                struct tipc_link *link;

                node = tipc_node_find_by_name(net, name, &bearer_id);
                if (!node) {
                        err = -EINVAL;
                        goto err_free;
                }

                /* Re-read the link under the node lock before using it */
                tipc_node_read_lock(node);
                link = node->links[bearer_id].link;
                if (!link) {
                        tipc_node_read_unlock(node);
                        err = -EINVAL;
                        goto err_free;
                }

                err = __tipc_nl_add_link(net, &msg, link, 0);
                tipc_node_read_unlock(node);
                if (err)
                        goto err_free;
        }

        return genlmsg_reply(msg.skb, info);

err_free:
        /* The reply was never sent, so it is still ours to free */
        nlmsg_free(msg.skb);
        return err;
}

/**
 * tipc_nl_node_reset_link_stats - netlink handler clearing link statistics
 * @skb: request buffer; its socket identifies the net namespace
 * @info: generic netlink request info carrying the TIPC_NLA_LINK nest
 *
 * Three kinds of link name are handled, in this order: the broadcast link
 * name itself, a name merely containing the broadcast link name (matched
 * against each node's bc_entry link), and the name of an ordinary link.
 *
 * Return: 0 on success, -EINVAL if an attribute is missing or no link
 * matches, or the error reported by parsing or by the reset helper.
 */
int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1];
        struct net *net = sock_net(skb->sk);
        struct tipc_net *tn = tipc_net(net);
        struct tipc_link_entry *le;
        struct tipc_node *node;
        struct tipc_link *link;
        unsigned int bearer_id;
        char *link_name;
        int err;

        if (!info->attrs[TIPC_NLA_LINK])
                return -EINVAL;

        err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX,
                                          info->attrs[TIPC_NLA_LINK],
                                          tipc_nl_link_policy, info->extack);
        if (err)
                return err;

        if (!attrs[TIPC_NLA_LINK_NAME])
                return -EINVAL;

        link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]);

        /* Exact broadcast link name: reset the broadcast send link */
        if (!strcmp(link_name, tipc_bclink_name))
                return tipc_bclink_reset_stats(net, tipc_bc_sndlink(net));

        /* Name containing the broadcast link name: search the nodes */
        if (strstr(link_name, tipc_bclink_name)) {
                err = -EINVAL;
                rcu_read_lock();
                list_for_each_entry_rcu(node, &tn->node_list, list) {
                        tipc_node_read_lock(node);
                        link = node->bc_entry.link;
                        if (!link || strcmp(link_name, tipc_link_name(link))) {
                                tipc_node_read_unlock(node);
                                continue;
                        }
                        err = tipc_bclink_reset_stats(net, link);
                        tipc_node_read_unlock(node);
                        break;
                }
                rcu_read_unlock();
                return err;
        }

        /* Ordinary link: find its owner, then reset under both locks */
        node = tipc_node_find_by_name(net, link_name, &bearer_id);
        if (!node)
                return -EINVAL;

        le = &node->links[bearer_id];
        tipc_node_read_lock(node);
        spin_lock_bh(&le->lock);
        link = le->link;
        if (link) {
                tipc_link_reset_stats(link);
                err = 0;
        } else {
                err = -EINVAL;
        }
        spin_unlock_bh(&le->lock);
        tipc_node_read_unlock(node);

        return err;
}

/* __tipc_nl_add_node_links - add all links of one node to a dump message
 * @net: the applicable net namespace
 * @msg: netlink message being filled
 * @node: node whose links are dumped
 * @prev_link: in: index to start from; out: index at which filling failed,
 *             or 0 when the node was dumped completely
 * @bc_link: also add the node's bc_entry link after the bearer links
 *
 * Caller should hold node lock.
 *
 * Returns 0 on success, otherwise the error from the failing add helper.
 */
static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg,
                                    struct tipc_node *node, u32 *prev_link,
                                    bool bc_link)
{
        u32 idx = *prev_link;
        int err;

        while (idx < MAX_BEARERS) {
                /* Record progress first so a failure resumes right here */
                *prev_link = idx;

                if (node->links[idx].link) {
                        err = __tipc_nl_add_link(net, msg,
                                                 node->links[idx].link,
                                                 NLM_F_MULTI);
                        if (err)
                                return err;
                }
                idx++;
        }

        if (bc_link) {
                /* idx is past the bearer range, so a retry skips the loop */
                *prev_link = idx;
                err = tipc_nl_add_bc_link(net, msg, node->bc_entry.link);
                if (err)
                        return err;
        }

        *prev_link = 0;

        return 0;
}

/* tipc_nl_node_dump_link - netlink dump callback listing links of all nodes
 * @skb: dump buffer being filled
 * @cb: dump state; args[0] holds the address of the last node that was
 *      dumped completely, args[1] the link index to resume at, args[2] the
 *      "done" flag and args[3] whether bc_entry links are dumped too
 *
 * Returns skb->len after a pass, 0 once the dump has already completed, or
 * an error if the optional TIPC_NLA_LINK request attribute is malformed.
 */
int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs;
        struct nlattr *link[TIPC_NLA_LINK_MAX + 1];
        struct tipc_net *tn = net_generic(net, tipc_net_id);
        struct tipc_node *node;
        struct tipc_nl_msg msg;
        u32 prev_node = cb->args[0];
        u32 prev_link = cb->args[1];
        int done = cb->args[2];
        bool bc_link = cb->args[3];
        int err;

        if (done)
                return 0;

        /* The request attributes are only examined while no node has been
         * recorded yet; afterwards bc_link comes from cb->args[3].
         */
        if (!prev_node) {
                /* Check if broadcast-receiver links dumping is needed */
                if (attrs && attrs[TIPC_NLA_LINK]) {
                        err = nla_parse_nested_deprecated(link,
                                                          TIPC_NLA_LINK_MAX,
                                                          attrs[TIPC_NLA_LINK],
                                                          tipc_nl_link_policy,
                                                          NULL);
                        if (unlikely(err))
                                return err;
                        if (unlikely(!link[TIPC_NLA_LINK_BROADCAST]))
                                return -EINVAL;
                        bc_link = true;
                }
        }

        msg.skb = skb;
        msg.portid = NETLINK_CB(cb->skb).portid;
        msg.seq = cb->nlh->nlmsg_seq;

        rcu_read_lock();
        if (prev_node) {
                node = tipc_node_find(net, prev_node);
                if (!node) {
                        /* We never set seq or call nl_dump_check_consistent()
                         * this means that setting prev_seq here will cause the
                         * consistence check to fail in the netlink callback
                         * handler. Resulting in the last NLMSG_DONE message
                         * having the NLM_F_DUMP_INTR flag set.
                         */
                        cb->prev_seq = 1;
                        goto out;
                }
                /* The pointer is still used as list cursor below, inside
                 * the RCU read section, after the reference is dropped.
                 */
                tipc_node_put(node);

                /* Continue with the entries following the recorded node */
                list_for_each_entry_continue_rcu(node, &tn->node_list,
                                                 list) {
                        tipc_node_read_lock(node);
                        err = __tipc_nl_add_node_links(net, &msg, node,
                                                       &prev_link, bc_link);
                        tipc_node_read_unlock(node);
                        if (err)
                                goto out;

                        prev_node = node->addr;
                }
        } else {
                /* Fresh pass: the broadcast link goes first */
                err = tipc_nl_add_bc_link(net, &msg, tn->bcl);
                if (err)
                        goto out;

                list_for_each_entry_rcu(node, &tn->node_list, list) {
                        tipc_node_read_lock(node);
                        err = __tipc_nl_add_node_links(net, &msg, node,
                                                       &prev_link, bc_link);
                        tipc_node_read_unlock(node);
                        if (err)
                                goto out;

                        /* Only fully dumped nodes are recorded */
                        prev_node = node->addr;
                }
        }
        done = 1;
out:
        rcu_read_unlock();

        /* Save the resume state for the next invocation */
        cb->args[0] = prev_node;
        cb->args[1] = prev_link;
        cb->args[2] = done;
        cb->args[3] = bc_link;

        return skb->len;
}

/**
 * tipc_nl_node_set_monitor - netlink handler setting monitor parameters
 * @skb: request buffer; its socket identifies the net namespace
 * @info: generic netlink request info carrying the TIPC_NLA_MON nest
 *
 * Only TIPC_NLA_MON_ACTIVATION_THRESHOLD is acted upon; a request without
 * it succeeds without changing anything.
 *
 * Return: 0 on success, -EINVAL if TIPC_NLA_MON is missing, or the error
 * reported by attribute parsing or by the threshold setter.
 */
int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr *attrs[TIPC_NLA_MON_MAX + 1];
        struct net *net = sock_net(skb->sk);
        struct nlattr *threshold;
        int err;

        if (!info->attrs[TIPC_NLA_MON])
                return -EINVAL;

        err = nla_parse_nested_deprecated(attrs, TIPC_NLA_MON_MAX,
                                          info->attrs[TIPC_NLA_MON],
                                          tipc_nl_monitor_policy,
                                          info->extack);
        if (err)
                return err;

        threshold = attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD];
        if (!threshold)
                return 0;

        return tipc_nl_monitor_set_threshold(net, nla_get_u32(threshold));
}

/* __tipc_nl_add_monitor_prop - add the monitor properties to a message
 * @net: the applicable net namespace
 * @msg: netlink message (buffer, port id and sequence number) to fill
 *
 * Emits a TIPC_NL_MON_GET message holding a TIPC_NLA_MON nest with the
 * current activation threshold. A partially built message is cancelled.
 *
 * Returns 0 on success or -EMSGSIZE if the message could not be built.
 */
static int __tipc_nl_add_monitor_prop(struct net *net, struct tipc_nl_msg *msg)
{
        struct sk_buff *buf = msg->skb;
        struct nlattr *nest;
        void *hdr;
        u32 val;

        hdr = genlmsg_put(buf, msg->portid, msg->seq, &tipc_genl_family,
                          0, TIPC_NL_MON_GET);
        if (!hdr)
                return -EMSGSIZE;

        nest = nla_nest_start_noflag(buf, TIPC_NLA_MON);
        if (!nest)
                goto cancel_msg;

        val = tipc_nl_monitor_get_threshold(net);
        if (nla_put_u32(buf, TIPC_NLA_MON_ACTIVATION_THRESHOLD, val)) {
                nla_nest_cancel(buf, nest);
                goto cancel_msg;
        }

        nla_nest_end(buf, nest);
        genlmsg_end(buf, hdr);

        return 0;

cancel_msg:
        genlmsg_cancel(buf, hdr);

        return -EMSGSIZE;
}

/**
 * tipc_nl_node_get_monitor - netlink handler returning monitor parameters
 * @skb: request buffer; its socket identifies the net namespace
 * @info: generic netlink request info; supplies reply port id and sequence
 *
 * Return: the result of genlmsg_reply() on success, -ENOMEM if the reply
 * cannot be allocated, or the error from filling the reply.
 */
int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info)
{
        struct net *net = sock_net(skb->sk);
        struct tipc_nl_msg msg;
        int err;

        msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!msg.skb)
                return -ENOMEM;

        msg.portid = info->snd_portid;
        msg.seq = info->snd_seq;

        err = __tipc_nl_add_monitor_prop(net, &msg);
        if (!err)
                return genlmsg_reply(msg.skb, info);

        /* Nothing was sent, so the reply buffer is released here */
        nlmsg_free(msg.skb);
        return err;
}

/**
 * tipc_nl_node_dump_monitor - netlink dump callback listing the monitors
 * @skb: dump buffer being filled
 * @cb: dump state; args[0] is the bearer id to continue from
 *
 * Adds one entry per bearer id, under the RTNL lock, until an add fails or
 * all MAX_BEARERS ids have been visited.
 *
 * Return: skb->len after a pass, or 0 once all bearer ids were visited.
 */
int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        u32 prev_bearer = cb->args[0];
        struct tipc_nl_msg msg;
        int bearer_id;

        if (prev_bearer == MAX_BEARERS)
                return 0;

        msg.skb = skb;
        msg.portid = NETLINK_CB(cb->skb).portid;
        msg.seq = cb->nlh->nlmsg_seq;

        bearer_id = prev_bearer;

        rtnl_lock();
        while (bearer_id < MAX_BEARERS) {
                /* Stop at the first failure; this id is retried next time */
                if (__tipc_nl_add_monitor(net, &msg, bearer_id))
                        break;
                bearer_id++;
        }
        rtnl_unlock();

        cb->args[0] = bearer_id;

        return skb->len;
}

/**
 * tipc_nl_node_dump_monitor_peer - netlink dump callback listing the peers
 *                                  of one bearer's monitor
 * @skb: dump buffer being filled
 * @cb: dump state; args[0] is the "done" flag, args[1] the peer to resume
 *      from and args[2] the bearer id taken from the request
 *
 * While no resume peer is recorded, the bearer id is read from the
 * mandatory TIPC_NLA_MON_REF attribute of the request.
 *
 * Return: skb->len after a pass, 0 once the dump has completed, -EINVAL on
 * missing or out-of-range attributes, or the attribute parse error.
 */
int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb,
                                   struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        int done = cb->args[0];
        u32 prev_node = cb->args[1];
        u32 bearer_id = cb->args[2];
        struct tipc_nl_msg msg;
        int err;

        /* Note: the request is validated before the 'done' test below */
        if (!prev_node) {
                struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs;
                struct nlattr *mon[TIPC_NLA_MON_MAX + 1];

                if (!attrs[TIPC_NLA_MON])
                        return -EINVAL;

                err = nla_parse_nested_deprecated(mon, TIPC_NLA_MON_MAX,
                                                  attrs[TIPC_NLA_MON],
                                                  tipc_nl_monitor_policy,
                                                  NULL);
                if (err)
                        return err;

                if (!mon[TIPC_NLA_MON_REF])
                        return -EINVAL;

                bearer_id = nla_get_u32(mon[TIPC_NLA_MON_REF]);
                if (bearer_id >= MAX_BEARERS)
                        return -EINVAL;
        }

        if (done)
                return 0;

        msg.skb = skb;
        msg.portid = NETLINK_CB(cb->skb).portid;
        msg.seq = cb->nlh->nlmsg_seq;

        rtnl_lock();
        err = tipc_nl_add_monitor_peer(net, &msg, bearer_id, &prev_node);
        rtnl_unlock();

        /* A clean pass means every peer has been emitted */
        if (!err)
                done = 1;

        cb->args[0] = done;
        cb->args[1] = prev_node;
        cb->args[2] = bearer_id;

        return skb->len;
}

#ifdef CONFIG_TIPC_CRYPTO
/* tipc_nl_retrieve_key - fetch the user key from the parsed node attributes
 * @attrs: parsed TIPC_NLA_NODE_* attributes
 * @pkey: set to point at the key inside the attribute payload on success
 *
 * The payload must be large enough for the key header and for the size
 * that tipc_aead_key_size() derives from it, and keylen must not exceed
 * TIPC_AEAD_KEYLEN_MAX.
 *
 * Returns 0 on success, -ENODATA if TIPC_NLA_NODE_KEY is absent, or -EINVAL
 * if the payload fails one of the checks.
 */
static int tipc_nl_retrieve_key(struct nlattr **attrs,
                                struct tipc_aead_key **pkey)
{
        struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY];
        struct tipc_aead_key *key;

        if (!attr)
                return -ENODATA;

        /* The header must be fully present before keylen can be trusted */
        if (nla_len(attr) < sizeof(*key))
                return -EINVAL;

        key = (struct tipc_aead_key *)nla_data(attr);
        if (key->keylen > TIPC_AEAD_KEYLEN_MAX)
                return -EINVAL;
        if (nla_len(attr) < tipc_aead_key_size(key))
                return -EINVAL;

        *pkey = key;
        return 0;
}

/* tipc_nl_retrieve_nodeid - fetch the node id from the parsed attributes
 * @attrs: parsed TIPC_NLA_NODE_* attributes
 * @node_id: set to point at the id inside the attribute payload on success
 *
 * Returns 0 on success, -ENODATA if TIPC_NLA_NODE_ID is absent, or -EINVAL
 * if its payload is shorter than TIPC_NODEID_LEN.
 */
static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id)
{
        struct nlattr *id_attr = attrs[TIPC_NLA_NODE_ID];

        if (!id_attr)
                return -ENODATA;
        if (nla_len(id_attr) < TIPC_NODEID_LEN)
                return -EINVAL;

        *node_id = (u8 *)nla_data(id_attr);

        return 0;
}

/* tipc_nl_retrieve_rekeying - fetch the rekeying value from the attributes
 * @attrs: parsed TIPC_NLA_NODE_* attributes
 * @intv: receives the u32 payload of TIPC_NLA_NODE_REKEYING on success;
 *        left untouched otherwise
 *
 * Returns 0 on success or -ENODATA if the attribute is absent.
 */
static int tipc_nl_retrieve_rekeying(struct nlattr **attrs, u32 *intv)
{
        struct nlattr *rekey_attr = attrs[TIPC_NLA_NODE_REKEYING];

        if (rekey_attr) {
                *intv = nla_get_u32(rekey_attr);
                return 0;
        }

        return -ENODATA;
}

/* __tipc_nl_node_set_key - install a key and/or schedule TX rekeying
 * @skb: request buffer; its socket identifies the net namespace
 * @info: generic netlink request info carrying the TIPC_NLA_NODE nest
 *
 * Without TIPC_NLA_NODE_ID the key is set up in CLUSTER_KEY mode on the own
 * TX crypto handle; with a node id it is set up in PER_NODE_KEY mode, on
 * the RX handle of that node unless the id equals the own id. A request
 * carrying only TIPC_NLA_NODE_REKEYING just schedules rekeying.
 *
 * Returns 0 on success, -EINVAL/-EPERM/-ENOMEM or a helper's error code
 * otherwise. The public wrapper calls this with the RTNL lock held.
 */
static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1];
        struct net *net = sock_net(skb->sk);
        /* 'c' is the crypto handle the key ends up on; TX unless a foreign
         * node id is given below
         */
        struct tipc_crypto *tx = tipc_net(net)->crypto_tx, *c = tx;
        struct tipc_node *n = NULL;
        struct tipc_aead_key *ukey;
        bool rekeying = true, master_key = false;
        u8 *id, *own_id, mode;
        u32 intv = 0;
        int rc = 0;

        if (!info->attrs[TIPC_NLA_NODE])
                return -EINVAL;

        rc = nla_parse_nested(attrs, TIPC_NLA_NODE_MAX,
                              info->attrs[TIPC_NLA_NODE],
                              tipc_nl_node_policy, info->extack);
        if (rc)
                return rc;

        own_id = tipc_own_id(net);
        if (!own_id) {
                GENL_SET_ERR_MSG(info, "not found own node identity (set id?)");
                return -EPERM;
        }

        /* 'rekeying' stays true only if the rekeying attribute is present */
        rc = tipc_nl_retrieve_rekeying(attrs, &intv);
        if (rc == -ENODATA)
                rekeying = false;

        /* No key but a rekeying attribute: skip straight to scheduling */
        rc = tipc_nl_retrieve_key(attrs, &ukey);
        if (rc == -ENODATA && rekeying)
                goto rekeying;
        else if (rc)
                return rc;

        rc = tipc_aead_key_validate(ukey, info);
        if (rc)
                return rc;

        rc = tipc_nl_retrieve_nodeid(attrs, &id);
        switch (rc) {
        case -ENODATA:
                mode = CLUSTER_KEY;
                master_key = !!(attrs[TIPC_NLA_NODE_KEY_MASTER]);
                break;
        case 0:
                mode = PER_NODE_KEY;
                if (memcmp(id, own_id, NODE_ID_LEN)) {
                        /* Use the known node, or create one if the lookup
                         * finds nothing (GNU "?:" shorthand)
                         */
                        n = tipc_node_find_by_id(net, id) ?:
                                tipc_node_create(net, 0, id, 0xffffu, 0, true);
                        if (unlikely(!n))
                                return -ENOMEM;
                        c = n->crypto_rx;
                }
                break;
        default:
                return rc;
        }

        /* Initiate the TX/RX key */
        rc = tipc_crypto_key_init(c, ukey, mode, master_key);
        if (n)
                tipc_node_put(n);

        if (unlikely(rc < 0)) {
                GENL_SET_ERR_MSG(info, "unable to initiate or attach new key");
                return rc;
        } else if (c == tx) {
                /* Distribute TX key but not master one */
                if (!master_key && tipc_crypto_key_distr(tx, rc, NULL))
                        GENL_SET_ERR_MSG(info, "failed to replicate new key");
                /* The label below is also entered by the goto above, which
                 * bypasses the 'c == tx' test of this branch
                 */
rekeying:
                /* Schedule TX rekeying if needed */
                tipc_crypto_rekeying_sched(tx, rekeying, intv);
        }

        return 0;
}

/**
 * tipc_nl_node_set_key - netlink handler for setting a crypto key
 * @skb: request buffer
 * @info: generic netlink request info
 *
 * Runs __tipc_nl_node_set_key() with the RTNL lock held.
 *
 * Return: whatever __tipc_nl_node_set_key() returns.
 */
int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info)
{
        int rc;

        rtnl_lock();
        rc = __tipc_nl_node_set_key(skb, info);
        rtnl_unlock();

        return rc;
}

static int __tipc_nl_node_flush_key(struct sk_buff *skb,
                                    struct genl_info *info)
{
        struct net *net = sock_net(skb->sk);
        struct tipc_net *tn = tipc_net(net);
        struct tipc_node *n;

        tipc_crypto_key_flush(tn->crypto_tx);
        rcu_read_lock();
        list_for_each_entry_rcu(n, &tn->node_list, list)
                tipc_crypto_key_flush(n->crypto_rx);
        rcu_read_unlock();

        return 0;
}

/**
 * tipc_nl_node_flush_key - netlink handler for flushing all crypto keys
 * @skb: request buffer
 * @info: generic netlink request info
 *
 * Runs __tipc_nl_node_flush_key() with the RTNL lock held.
 *
 * Return: whatever __tipc_nl_node_flush_key() returns.
 */
int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info)
{
        int rc;

        rtnl_lock();
        rc = __tipc_nl_node_flush_key(skb, info);
        rtnl_unlock();

        return rc;
}
#endif

/**
 * tipc_node_dump - dump TIPC node data
 * @n: tipc node to be dumped
 * @more: dump more?
 *        - false: dump only tipc node data
 *        - true: dump node link data as well
 * @buf: output buffer receiving the formatted dump text
 *
 * Text formatted directly by this function is bounded by NODE_LMIN, or by
 * NODE_LMAX when @more is set.
 *
 * Return: the accumulated length reported by the formatting helpers.
 */
int tipc_node_dump(struct tipc_node *n, bool more, char *buf)
{
        size_t limit;
        int len = 0;

        if (more)
                limit = NODE_LMAX;
        else
                limit = NODE_LMIN;

        if (!n)
                return scnprintf(buf, limit, "node data: (null)\n");

        /* One-line summary of the node's scalar fields */
        len += scnprintf(buf, limit, "node data: %x", n->addr);
        len += scnprintf(buf + len, limit - len, " %x", n->state);
        len += scnprintf(buf + len, limit - len, " %d", n->active_links[0]);
        len += scnprintf(buf + len, limit - len, " %d", n->active_links[1]);
        len += scnprintf(buf + len, limit - len, " %x", n->action_flags);
        len += scnprintf(buf + len, limit - len, " %u", n->failover_sent);
        len += scnprintf(buf + len, limit - len, " %u", n->sync_point);
        len += scnprintf(buf + len, limit - len, " %d", n->link_cnt);
        len += scnprintf(buf + len, limit - len, " %u", n->working_links);
        len += scnprintf(buf + len, limit - len, " %x", n->capabilities);
        len += scnprintf(buf + len, limit - len, " %lu\n", n->keepalive_intv);

        if (!more)
                return len;

        /* First link entry.  Note that tipc_link_dump() and
         * tipc_list_dump() are only given the write position, not the
         * remaining size.
         */
        len += scnprintf(buf + len, limit - len, "link_entry[0]:\n");
        len += scnprintf(buf + len, limit - len, " mtu: %u\n", n->links[0].mtu);
        len += scnprintf(buf + len, limit - len, " media: ");
        len += tipc_media_addr_printf(buf + len, limit - len,
                                      &n->links[0].maddr);
        len += scnprintf(buf + len, limit - len, "\n");
        len += tipc_link_dump(n->links[0].link, TIPC_DUMP_NONE, buf + len);
        len += scnprintf(buf + len, limit - len, " inputq: ");
        len += tipc_list_dump(&n->links[0].inputq, false, buf + len);

        /* Second link entry, same layout */
        len += scnprintf(buf + len, limit - len, "link_entry[1]:\n");
        len += scnprintf(buf + len, limit - len, " mtu: %u\n", n->links[1].mtu);
        len += scnprintf(buf + len, limit - len, " media: ");
        len += tipc_media_addr_printf(buf + len, limit - len,
                                      &n->links[1].maddr);
        len += scnprintf(buf + len, limit - len, "\n");
        len += tipc_link_dump(n->links[1].link, TIPC_DUMP_NONE, buf + len);
        len += scnprintf(buf + len, limit - len, " inputq: ");
        len += tipc_list_dump(&n->links[1].inputq, false, buf + len);

        /* Broadcast link */
        len += scnprintf(buf + len, limit - len, "bclink:\n ");
        len += tipc_link_dump(n->bc_entry.link, TIPC_DUMP_NONE, buf + len);

        return len;
}

/* For every network namespace other than @exit_net, find the first node
 * whose peer_net points at @exit_net and reset its peer_net and
 * peer_hash_mix under the node write lock.
 */
void tipc_node_pre_cleanup_net(struct net *exit_net)
{
        struct tipc_node *node;
        struct tipc_net *tn;
        struct net *net;

        rcu_read_lock();
        for_each_net_rcu(net) {
                if (net == exit_net)
                        continue;

                tn = tipc_net(net);
                if (!tn)
                        continue;

                spin_lock_bh(&tn->node_list_lock);
                list_for_each_entry_rcu(node, &tn->node_list, list) {
                        /* Skip nodes with no peer namespace or a different one */
                        if (!node->peer_net || node->peer_net != exit_net)
                                continue;

                        tipc_node_write_lock(node);
                        node->peer_net = NULL;
                        node->peer_hash_mix = 0;
                        tipc_node_write_unlock_fast(node);

                        /* Only the first match per namespace is handled */
                        break;
                }
                spin_unlock_bh(&tn->node_list_lock);
        }
        rcu_read_unlock();
}































    1 



    1 


    1 
    1 

    1 

    1 

    1 









































































    1 





    1 





    1 







    1 
    1 




    1 







    1 











    1 























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
// SPDX-License-Identifier: GPL-2.0
/*
 *     SUCS NET3:
 *
 *     Generic stream handling routines. These are generic for most
 *     protocols. Even IP. Tonight 8-).
 *     This is used because TCP, LLC (others too) layer all have mostly
 *     identical sendmsg() and recvmsg() code.
 *     So we (will) share it here.
 *
 *     Authors:        Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *                     (from old tcp.c code)
 *                     Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-))
 */

#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/net.h>
#include <linux/signal.h>
#include <linux/tcp.h>
#include <linux/wait.h>
#include <net/sock.h>

/**
 * sk_stream_write_space - stream socket write_space callback.
 * @sk: socket
 *
 * Does nothing unless the socket is writeable and attached to a struct
 * socket.  Otherwise clears SOCK_NOSPACE, wakes sleepers polling for
 * write readiness and, provided the send side is not shut down, delivers
 * the SOCK_WAKE_SPACE/POLL_OUT async notification.
 */
void sk_stream_write_space(struct sock *sk)
{
        struct socket *sock = sk->sk_socket;
        struct socket_wq *wq;

        if (!__sk_stream_is_writeable(sk, 1) || !sock)
                return;

        clear_bit(SOCK_NOSPACE, &sock->flags);

        /* The wait queue pointer is RCU protected */
        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq)) {
                wake_up_interruptible_poll(&wq->wait,
                                           EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
        }
        if (wq && wq->fasync_list) {
                if (!(sk->sk_shutdown & SEND_SHUTDOWN))
                        sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
        }
        rcu_read_unlock();
}

/**
 * sk_stream_wait_connect - Wait for a socket to get into the connected state
 * @sk: sock to wait on
 * @timeo_p: for how long to wait
 *
 * Must be called with the socket locked.
 *
 * Return: 0 once the wait condition (no socket error and state ESTABLISHED
 * or CLOSE_WAIT) holds; the pending socket error if there is one; -EPIPE if
 * the socket is in any state other than SYN_SENT or SYN_RECV; -EAGAIN if
 * *@timeo_p is zero; sock_intr_errno(*@timeo_p) if a signal is pending; or
 * the negative value returned by sk_wait_event().
 */
int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        struct task_struct *tsk = current;
        int done;

        do {
                /* Bail-out checks are repeated before every sleep */
                int err = sock_error(sk);
                if (err)
                        return err;
                /* Only a socket in SYN_SENT or SYN_RECV can be waited on */
                if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
                        return -EPIPE;
                if (!*timeo_p)
                        return -EAGAIN;
                if (signal_pending(tsk))
                        return sock_intr_errno(*timeo_p);

                /* sk_write_pending is raised only for the duration of the wait */
                add_wait_queue(sk_sleep(sk), &wait);
                sk->sk_write_pending++;
                done = sk_wait_event(sk, timeo_p,
                                     !READ_ONCE(sk->sk_err) &&
                                     !((1 << READ_ONCE(sk->sk_state)) &
                                       ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait);
                remove_wait_queue(sk_sleep(sk), &wait);
                sk->sk_write_pending--;
        } while (!done);
        /* A positive result means the condition held; report that as 0 */
        return done < 0 ? done : 0;
}
EXPORT_SYMBOL(sk_stream_wait_connect);

/**
 * sk_stream_closing - Tell whether the socket is still closing down.
 * @sk: socket to verify
 *
 * Return: a non-zero state mask while the socket is in FIN_WAIT1, CLOSING
 * or LAST_ACK, 0 otherwise.
 */
static int sk_stream_closing(const struct sock *sk)
{
        const int closing_states = TCPF_FIN_WAIT1 | TCPF_CLOSING |
                                   TCPF_LAST_ACK;

        return (1 << READ_ONCE(sk->sk_state)) & closing_states;
}

/* Sleep for up to @timeout until the socket leaves the closing states
 * tested by sk_stream_closing(), a signal becomes pending or the timeout
 * runs out.  A zero @timeout returns immediately.
 */
void sk_stream_wait_close(struct sock *sk, long timeout)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);

        if (!timeout)
                return;

        add_wait_queue(sk_sleep(sk), &wait);

        for (;;) {
                if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk), &wait))
                        break;
                if (signal_pending(current) || !timeout)
                        break;
        }

        remove_wait_queue(sk_sleep(sk), &wait);
}
EXPORT_SYMBOL(sk_stream_wait_close);

/**
 * sk_stream_wait_memory - Wait for more memory for a socket
 * @sk: socket to wait for memory
 * @timeo_p: for how long
 *
 * *@timeo_p is updated with the timeout left after each wait.
 *
 * Return: 0 once sk_stream_memory_free() reports free memory; -EPIPE on a
 * socket error, a send-side shutdown or a negative sk_wait_event() result;
 * -EAGAIN if *@timeo_p is zero; sock_intr_errno(*@timeo_p) if a signal is
 * pending.
 */
int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
{
        int ret, err = 0;
        long vm_wait = 0;
        long current_timeo = *timeo_p;
        DEFINE_WAIT_FUNC(wait, woken_wake_function);

        /* The socket itself already has room: start with a short randomized
         * wait of 2 .. HZ/5 + 1 jiffies instead of the caller's timeout.
         * NOTE(review): presumably the shortage is then not specific to
         * this socket - confirm.
         */
        if (sk_stream_memory_free(sk))
                current_timeo = vm_wait = get_random_u32_below(HZ / 5) + 2;

        add_wait_queue(sk_sleep(sk), &wait);

        while (1) {
                /* ASYNC_NOSPACE stays set on the error/eagain/interrupted
                 * exits and is cleared again otherwise.
                 */
                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

                if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
                        goto do_error;
                if (!*timeo_p)
                        goto do_eagain;
                if (signal_pending(current))
                        goto do_interrupted;
                sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                /* Done once memory is free and the random wait is over */
                if (sk_stream_memory_free(sk) && !vm_wait)
                        break;

                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                sk->sk_write_pending++;
                ret = sk_wait_event(sk, &current_timeo, READ_ONCE(sk->sk_err) ||
                                    (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) ||
                                    (sk_stream_memory_free(sk) && !vm_wait),
                                    &wait);
                sk->sk_write_pending--;
                if (ret < 0)
                        goto do_error;

                if (vm_wait) {
                        /* Assuming sk_wait_event() leaves the unslept
                         * remainder in current_timeo (TODO confirm), vm_wait
                         * becomes the time actually slept; charge it against
                         * the caller's timeout, clamping at zero.
                         */
                        vm_wait -= current_timeo;
                        current_timeo = *timeo_p;
                        if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
                            (current_timeo -= vm_wait) < 0)
                                current_timeo = 0;
                        vm_wait = 0;
                }
                *timeo_p = current_timeo;
        }
out:
        /* The wait entry is left alone when the socket is marked SOCK_DEAD */
        if (!sock_flag(sk, SOCK_DEAD))
                remove_wait_queue(sk_sleep(sk), &wait);
        return err;

do_error:
        err = -EPIPE;
        goto out;
do_eagain:
        /* Make sure that whenever EAGAIN is returned, EPOLLOUT event can
         * be generated later.
         * When TCP receives ACK packets that make room, tcp_check_space()
         * only calls tcp_new_space() if SOCK_NOSPACE is set.
         */
        set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
        err = -EAGAIN;
        goto out;
do_interrupted:
        err = sock_intr_errno(*timeo_p);
        goto out;
}
EXPORT_SYMBOL(sk_stream_wait_memory);

/* Finalize an error code for a stream send path.  An incoming -EPIPE is
 * replaced by the pending socket error when there is one.  If the result
 * is -EPIPE and MSG_NOSIGNAL is not set in @flags, SIGPIPE is sent to the
 * current task.  Returns the resulting error code.
 */
int sk_stream_error(struct sock *sk, int flags, int err)
{
        if (err == -EPIPE) {
                int pending = sock_error(sk);

                if (pending)
                        err = pending;
        }

        if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
                send_sig(SIGPIPE, current, 0);

        return err;
}
EXPORT_SYMBOL(sk_stream_error);

/* Drop everything queued on a stream socket: purge the receive and error
 * queues, warn if the write queue is not already empty, reclaim the
 * socket's accounted memory and warn if sk_wmem_queued is still non-zero.
 */
void sk_stream_kill_queues(struct sock *sk)
{
        /* First the read buffer. */
        __skb_queue_purge(&sk->sk_receive_queue);

        /* Next, the error queue.
         * We need to use queue lock, because other threads might
         * add packets to the queue without socket lock being held.
         */
        skb_queue_purge(&sk->sk_error_queue);

        /* Next, the write queue. It is only checked, not purged. */
        WARN_ON_ONCE(!skb_queue_empty(&sk->sk_write_queue));

        /* Account for returned memory. */
        sk_mem_reclaim_final(sk);

        WARN_ON_ONCE(sk->sk_wmem_queued);

        /* It is _impossible_ for the backlog to contain anything
         * when we get here.  All user references to this socket
         * have gone away, only the net layer can touch it.
         */
}
EXPORT_SYMBOL(sk_stream_kill_queues);





























    1 




    1 























    1 











    1 







    1 













    1 












































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/random.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/string.h>
#include <linux/net.h>
#include <linux/siphash.h>
#include <net/secure_seq.h>

#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
#include <linux/in6.h>
#include <net/tcp.h>

static siphash_aligned_key_t net_secret;
static siphash_aligned_key_t ts_secret;

#define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ)

/* Seed net_secret through net_get_random_once().  The hash helpers in this
 * file call it before using net_secret as a siphash key.
 */
static __always_inline void net_secret_init(void)
{
        net_get_random_once(&net_secret, sizeof(net_secret));
}

/* Seed ts_secret through net_get_random_once().  The timestamp-offset
 * helpers in this file call it before using ts_secret as a siphash key.
 */
static __always_inline void ts_secret_init(void)
{
        net_get_random_once(&ts_secret, sizeof(ts_secret));
}
#endif

#ifdef CONFIG_INET
/* Offset @seq by the real-time clock counted in units of 64 ns. */
static u32 seq_scale(u32 seq)
{
        /*
         *        As close as possible to RFC 793, which
         *        suggests using a 250 kHz clock.
         *        Further reading shows this assumes 2 Mb/s networks.
         *        For 10 Mb/s Ethernet, a 1 MHz clock is appropriate.
         *        For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but
         *        we also need to limit the resolution so that the u32 seq
         *        overlaps less than one time per MSL (2 minutes).
         *        Choosing a clock of 64 ns period is OK. (period of 274 s)
         */
        u64 ticks = ktime_get_real_ns() >> 6;

        /* The sum is truncated to 32 bits by the return type */
        return seq + ticks;
}
#endif

#if IS_ENABLED(CONFIG_IPV6)
u32 secure_tcpv6_ts_off(const struct net *net,
                        const __be32 *saddr, const __be32 *daddr)
{
        const struct {
                struct in6_addr saddr;
                struct in6_addr daddr;
        } __aligned(SIPHASH_ALIGNMENT) combined = {
                .saddr = *(struct in6_addr *)saddr,
                .daddr = *(struct in6_addr *)daddr,
        };

        if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
                return 0;

        ts_secret_init();
        return siphash(&combined, offsetofend(typeof(combined), daddr),
                       &ts_secret);
}
EXPORT_SYMBOL(secure_tcpv6_ts_off);

/* Initial sequence number for an IPv6 TCP connection: siphash of the
 * addresses and ports keyed with net_secret, offset by seq_scale().
 */
u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
                     __be16 sport, __be16 dport)
{
        const struct {
                struct in6_addr saddr;
                struct in6_addr daddr;
                __be16 sport;
                __be16 dport;
        } __aligned(SIPHASH_ALIGNMENT) tuple = {
                .saddr = *(struct in6_addr *)saddr,
                .daddr = *(struct in6_addr *)daddr,
                .sport = sport,
                .dport = dport
        };

        net_secret_init();

        /* The 64-bit hash is truncated to u32 when passed to seq_scale() */
        return seq_scale(siphash(&tuple, offsetofend(typeof(tuple), dport),
                                 &net_secret));
}
EXPORT_SYMBOL(secure_tcpv6_seq);

/* Hash used for IPv6 ephemeral port selection: siphash of both addresses,
 * the destination port and a time seed that changes once per
 * EPHEMERAL_PORT_SHUFFLE_PERIOD, keyed with net_secret.
 */
u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
                               __be16 dport)
{
        const struct {
                struct in6_addr saddr;
                struct in6_addr daddr;
                unsigned int timeseed;
                __be16 dport;
        } __aligned(SIPHASH_ALIGNMENT) input = {
                .saddr = *(struct in6_addr *)saddr,
                .daddr = *(struct in6_addr *)daddr,
                .timeseed = jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD,
                .dport = dport,
        };
        u64 hash;

        net_secret_init();
        hash = siphash(&input, offsetofend(typeof(input), dport),
                       &net_secret);

        return hash;
}
EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#endif

#ifdef CONFIG_INET
/* TCP timestamp offset for an IPv4 address pair: siphash of both addresses
 * keyed with ts_secret.  Returns 0 unless the tcp_timestamps sysctl of
 * @net is exactly 1.
 */
u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr)
{
        if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) == 1) {
                ts_secret_init();
                return siphash_2u32((__force u32)saddr, (__force u32)daddr,
                                    &ts_secret);
        }

        return 0;
}

/* Initial sequence number for an IPv4 TCP connection: siphash of the
 * addresses and the two ports packed into one u32, keyed with net_secret
 * and offset by seq_scale().
 *
 * NOTE(review): an earlier comment here warned that, with sport == 0, this
 * hash would equal secure_ipv4_port_ephemeral(a, b, d).  That function now
 * calls siphash_4u32() with an extra time seed, so the inputs differ;
 * confirm whether the warning is still needed.
 */
u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
                   __be16 sport, __be16 dport)
{
        u32 ports;
        u32 hash;

        net_secret_init();
        ports = ((__force u32)sport << 16) | (__force u32)dport;
        hash = siphash_3u32((__force u32)saddr, (__force u32)daddr, ports,
                            &net_secret);

        return seq_scale(hash);
}
EXPORT_SYMBOL_GPL(secure_tcp_seq);

/* Hash used for IPv4 ephemeral port selection: siphash of both addresses,
 * the destination port and a time seed that changes once per
 * EPHEMERAL_PORT_SHUFFLE_PERIOD, keyed with net_secret.
 */
u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
{
        u32 timeseed;

        net_secret_init();
        timeseed = jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD;

        return siphash_4u32((__force u32)saddr, (__force u32)daddr,
                            (__force u16)dport, timeseed, &net_secret);
}
EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
#endif

#if IS_ENABLED(CONFIG_IP_DCCP)
/* 48-bit initial sequence number for an IPv4 DCCP connection: siphash of
 * the addresses and packed ports keyed with net_secret, plus the real-time
 * clock in nanoseconds, masked to the low 48 bits.
 */
u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
                                __be16 sport, __be16 dport)
{
        const u64 seq_mask = (1ull << 48) - 1;
        u32 ports;
        u64 seq;

        net_secret_init();
        ports = ((__force u32)sport << 16) | (__force u32)dport;
        seq = siphash_3u32((__force u32)saddr, (__force u32)daddr, ports,
                           &net_secret);

        return (seq + ktime_get_real_ns()) & seq_mask;
}
EXPORT_SYMBOL(secure_dccp_sequence_number);

#if IS_ENABLED(CONFIG_IPV6)
/* 48-bit initial sequence number for an IPv6 DCCP connection: siphash of
 * the addresses and ports keyed with net_secret, plus the real-time clock
 * in nanoseconds, masked to the low 48 bits.
 */
u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
                                  __be16 sport, __be16 dport)
{
        const struct {
                struct in6_addr saddr;
                struct in6_addr daddr;
                __be16 sport;
                __be16 dport;
        } __aligned(SIPHASH_ALIGNMENT) tuple = {
                .saddr = *(struct in6_addr *)saddr,
                .daddr = *(struct in6_addr *)daddr,
                .sport = sport,
                .dport = dport
        };
        const u64 seq_mask = (1ull << 48) - 1;
        u64 seq;

        net_secret_init();
        seq = siphash(&tuple, offsetofend(typeof(tuple), dport),
                      &net_secret);

        return (seq + ktime_get_real_ns()) & seq_mask;
}
EXPORT_SYMBOL(secure_dccpv6_sequence_number);
#endif
#endif































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM vsyscall

#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define __VSYSCALL_TRACE_H

#include <linux/tracepoint.h>

/*
 * Tracepoint "emulate_vsyscall": records a single integer, nr, and prints
 * it as "nr = <value>".
 * NOTE(review): nr is presumably the number of the vsyscall being
 * emulated - confirm against the caller.
 */
TRACE_EVENT(emulate_vsyscall,

            TP_PROTO(int nr),

            TP_ARGS(nr),

            TP_STRUCT__entry(__field(int, nr)),

            TP_fast_assign(
                           __entry->nr = nr;
                           ),

            TP_printk("nr = %d", __entry->nr)
);

#endif

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/
#define TRACE_INCLUDE_FILE vsyscall_trace
#include <trace/define_trace.h>















































































































































































































































    3 










    3 
















































    1 














































    2 
    3 

    2 








































































    3 





    3 


    2 



    1 










    3 
    3 

    3 













    2 




























































































    3 
    2 


    3 



















    3 

    3 
















































    2 



    2 
    1 

    1 
    2 



    1 

    1 


    1 













    2 




    1 


    3 















    2 
    2 
    1 


























    1 



























































    2 






    3 




    1 






    3 















    3 

































    1 



    2 


    2 
    1 







    2 















    2 













































    2 


    3 











    2 














































































































































































    2 











    3 

    3 






















































    2 


    2 
    1 
































    3 





    2 

    3 






    3 




    2 
    1 












    3 


    1 




















































































































































































































































































































































































    2 






    1 









    2 
    2 



    1 


    1 
    1 

    1 
























































































    3 












    2 











    2 













    3 










    3 








    2 


















    3 
    2 











    3 





























    2 
























    2 


    3 



    3 





    2 






















































































































































































































































































































































































































    2 





    3 
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009                SUSE Linux Products GmbH
 * Copyright (C) 2009                Tejun Heo <tj@kernel.org>
 *
 * Copyright (C) 2017                Facebook Inc.
 * Copyright (C) 2017                Dennis Zhou <dennis@kernel.org>
 *
 * The percpu allocator handles both static and dynamic areas.  Percpu
 * areas are allocated in chunks which are divided into units.  There is
 * a 1-to-1 mapping for units to possible cpus.  These units are grouped
 * based on NUMA properties of the machine.
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done by offsets into a unit's address space.  I.e., an
 * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
 * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
 * and even sparse.  Access is handled by configuring percpu base
 * registers according to the cpu to unit mappings and offsetting the
 * base address using pcpu_unit_size.
 *
 * There is special consideration for the first chunk which must handle
 * the static percpu variables in the kernel image as allocation services
 * are not online yet.  In short, the first chunk is structured like so:
 *
 *                  <Static | [Reserved] | Dynamic>
 *
 * The static data is copied from the original section managed by the
 * linker.  The reserved section, if non-zero, primarily manages static
 * percpu variables from kernel modules.  Finally, the dynamic section
 * takes care of normal allocations.
 *
 * The allocator organizes chunks into lists according to free size and
 * memcg-awareness.  To make a percpu allocation memcg-aware the __GFP_ACCOUNT
 * flag should be passed.  All memcg-aware allocations are sharing one set
 * of chunks and all unaccounted allocations and allocations performed
 * by processes belonging to the root memory cgroup are using the second set.
 *
 * The allocator tries to allocate from the fullest chunk first. Each chunk
 * is managed by a bitmap with metadata blocks.  The allocation map is updated
 * on every allocation and free to reflect the current state while the boundary
 * map is only updated on allocation.  Each metadata block contains
 * information to help mitigate the need to iterate over large portions
 * of the bitmap.  The reverse mapping from page to chunk is stored in
 * the page's index.  Lastly, units are lazily backed and grow in unison.
 *
 * There is a unique conversion that goes on here between bytes and bits.
 * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
 * tracks the number of pages it is responsible for in nr_pages.  Helper
 * functions are used to convert between bytes, bits, and blocks.
 * All hints are managed in bits unless explicitly stated.
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitmap.h>
#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/memcontrol.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

#define CREATE_TRACE_POINTS
#include <trace/events/percpu.h>

#include "percpu-internal.h"

/*
 * The slots are sorted by the size of the biggest continuous free area.
 * 1-31 bytes share the same slot.
 *
 * NOTE(review): __pcpu_size_to_slot() computes
 * max(fls(size) - PCPU_SLOT_BASE_SHIFT + 2, 1).  If fls() is the usual
 * 1-based "find last set", sizes 1-15 land in slot 1 and 16-31 in slot 2,
 * so the "1-31" range above may be stale - confirm.
 */
#define PCPU_SLOT_BASE_SHIFT                5
/* chunks in slots below this are subject to being sidelined on failed alloc */
#define PCPU_SLOT_FAIL_THRESHOLD        3

/*
 * Low and high watermarks for the number of empty populated pages that the
 * balance work tries to keep available for atomic allocations.
 */
#define PCPU_EMPTY_POP_PAGES_LOW        2
#define PCPU_EMPTY_POP_PAGES_HIGH        4

#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
/*
 * Translate a regular address inside the first chunk's address space into a
 * percpu pointer: rebase it from pcpu_base_addr onto __per_cpu_start.
 */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)                                        \
        (void __percpu *)((unsigned long)(addr) -                        \
                          (unsigned long)pcpu_base_addr        +                \
                          (unsigned long)__per_cpu_start)
#endif
/*
 * Inverse of __addr_to_pcpu_ptr(): rebase a percpu pointer from
 * __per_cpu_start back onto pcpu_base_addr.
 */
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)                                                \
        (void __force *)((unsigned long)(ptr) +                                \
                         (unsigned long)pcpu_base_addr -                \
                         (unsigned long)__per_cpu_start)
#endif
#else        /* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr)        (void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)                (void __force *)(ptr)
#endif        /* CONFIG_SMP */

/*
 * Allocator geometry and slot bookkeeping.
 *
 * - pcpu_unit_pages: per-unit page stride used by pcpu_page_idx().
 * - pcpu_unit_size: size of one unit in bytes; a chunk whose contiguous free
 *   size equals it is filed under pcpu_free_slot by pcpu_size_to_slot().
 * - pcpu_free_slot: slot for chunks whose free size equals pcpu_unit_size.
 * - pcpu_to_depopulate_slot: slot whose list receives chunks isolated by
 *   pcpu_isolate_chunk().
 * - pcpu_nr_units, pcpu_atom_size, pcpu_nr_slots, pcpu_sidelined_slot and
 *   pcpu_chunk_struct_size: NOTE(review): meaning inferred from the names
 *   only (unit count, allocation atom size, number of slots, slot for
 *   sidelined chunks, size of the chunk struct) - confirm at their users.
 */
static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static int pcpu_free_slot __ro_after_init;
int pcpu_sidelined_slot __ro_after_init;
int pcpu_to_depopulate_slot __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;

/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;

static const int *pcpu_unit_map __ro_after_init;                /* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init;        /* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  When the reserved
 * region doesn't exist, the following variable is NULL.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;

DEFINE_SPINLOCK(pcpu_lock);        /* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);        /* chunk create/destroy, [de]pop, map ext */

struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */

/*
 * The number of empty populated pages, protected by pcpu_lock.
 * The reserved chunk doesn't contribute to the count.
 */
int pcpu_nr_empty_pop_pages;

/*
 * The number of populated pages in use by the allocator, protected by
 * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
 * allocated/deallocated, it is allocated/deallocated in all units of a chunk
 * and increments/decrements this count by 1).
 */
static unsigned long pcpu_nr_populated;

/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
 * empty chunk.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
/* gates pcpu_schedule_balance_work(): work is only queued once this is set */
static bool pcpu_async_enabled __read_mostly;
/* NOTE(review): presumably records that an atomic allocation failed - confirm */
static bool pcpu_atomic_alloc_failed;

/*
 * Queue pcpu_balance_work, but only once asynchronous balancing has been
 * enabled; before that this is a no-op.
 */
static void pcpu_schedule_balance_work(void)
{
        if (!pcpu_async_enabled)
                return;

        schedule_work(&pcpu_balance_work);
}

/**
 * pcpu_addr_in_chunk - test whether an address belongs to a chunk
 * @chunk: chunk to test against, may be NULL
 * @addr: percpu address
 *
 * The served range starts start_offset bytes after the chunk's base address
 * and stops end_offset bytes short of the end of its nr_pages pages.
 *
 * RETURNS:
 * True if @addr lies inside that range; false otherwise or if @chunk is NULL.
 */
static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
        void *first, *limit;

        if (!chunk)
                return false;

        first = chunk->base_addr + chunk->start_offset;
        limit = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
                chunk->end_offset;

        /* half-open interval: [first, limit) */
        if (addr < first)
                return false;

        return addr < limit;
}

/*
 * Map a free size in bytes to a slot index, derived from fls(@size) and
 * PCPU_SLOT_BASE_SHIFT.  The result is never lower than 1.
 */
static int __pcpu_size_to_slot(int size)
{
        int slot = fls(size) - PCPU_SLOT_BASE_SHIFT + 2;

        return max(slot, 1);
}

/*
 * Like __pcpu_size_to_slot(), except that a size equal to a whole unit
 * (pcpu_unit_size) maps to the dedicated pcpu_free_slot.
 */
static int pcpu_size_to_slot(int size)
{
        return size == pcpu_unit_size ? pcpu_free_slot :
                                        __pcpu_size_to_slot(size);
}

/*
 * Pick the slot for @chunk from its largest contiguous free area.  Chunks
 * with less than PCPU_MIN_ALLOC_SIZE free bytes, or with no contiguous free
 * area recorded in the chunk metadata, go to slot 0.
 */
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
        if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE)
                return 0;

        if (chunk->chunk_md.contig_hint == 0)
                return 0;

        /* contig_hint is in bits; each bit covers PCPU_MIN_ALLOC_SIZE bytes */
        return pcpu_size_to_slot(chunk->chunk_md.contig_hint *
                                 PCPU_MIN_ALLOC_SIZE);
}

/*
 * set the pointer to a chunk in a page struct
 *
 * The chunk pointer is stored, cast to unsigned long, in page->index; this
 * is the page -> chunk reverse mapping read back by pcpu_get_page_chunk().
 */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
        page->index = (unsigned long)pcpu;
}

/*
 * obtain pointer to a chunk from a page struct
 *
 * Reads page->index back as a chunk pointer; only meaningful for pages whose
 * index was previously set with pcpu_set_page_chunk().
 */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
        return (struct pcpu_chunk *)page->index;
}

/*
 * Flatten (@cpu, @page_idx) into a single page index: the cpu's unit number
 * times the pages per unit, plus the page index inside that unit.
 */
static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
        int unit = pcpu_unit_map[cpu];

        return unit * pcpu_unit_pages + page_idx;
}

/*
 * Byte offset, relative to a chunk's base address, of page @page_idx in the
 * unit that belongs to @cpu.
 */
static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
{
        int page_bytes = page_idx << PAGE_SHIFT;

        return pcpu_unit_offsets[cpu] + page_bytes;
}

/*
 * Address of page @page_idx of @cpu's unit inside @chunk: the chunk base
 * address plus the unit/page offset.
 */
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
                                     unsigned int cpu, int page_idx)
{
        unsigned long base = (unsigned long)chunk->base_addr;

        return base + pcpu_unit_page_offset(cpu, page_idx);
}

/*
 * The following are helper functions to help access bitmaps and convert
 * between bitmap offsets and address offsets.
 */
/*
 * Return a pointer to the word of @chunk's allocation bitmap at which
 * metadata block @index begins.
 */
static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{
        return &chunk->alloc_map[index * PCPU_BITMAP_BLOCK_BITS /
                                 BITS_PER_LONG];
}

/*
 * Convert a chunk bit offset into the index of the metadata block that
 * contains it (each block spans PCPU_BITMAP_BLOCK_BITS bits).
 */
static unsigned long pcpu_off_to_block_index(int off)
{
        return off / PCPU_BITMAP_BLOCK_BITS;
}

/*
 * Convert a chunk bit offset into the offset within its metadata block.
 * Uses a mask, so this presumably relies on PCPU_BITMAP_BLOCK_BITS being a
 * power of two - verify against its definition.
 */
static unsigned long pcpu_off_to_block_off(int off)
{
        return off & (PCPU_BITMAP_BLOCK_BITS - 1);
}

/*
 * Convert a (block index, offset within block) pair back into a chunk bit
 * offset.
 */
static unsigned long pcpu_block_off_to_off(int index, int off)
{
        return index * PCPU_BITMAP_BLOCK_BITS + off;
}

/**
 * pcpu_check_block_hint - check against the contig hint
 * @block: block of interest
 * @bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Tests whether @bits, placed at the first @align-aligned offset at or after
 * the start of the block's contig hint, still fits within that hint.  A
 * chunk uses the same hint layout as a block, so this works on the chunk's
 * metadata as well.
 */
static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
                                  size_t align)
{
        int aligned_start = ALIGN(block->contig_hint_start, align);
        /* bits lost at the front of the hint to satisfy the alignment */
        int pad = aligned_start - block->contig_hint_start;

        return pad + bits <= block->contig_hint;
}

/*
 * pcpu_next_hint - determine which hint to use
 * @block: block of interest
 * @alloc_bits: size of allocation
 *
 * Chooses the offset within @block from which a scan should start.  The
 * default is first_free, which gives first-fit behaviour.  When the
 * scan_hint is known to be too small for the request, scanning can start
 * right after it instead, with the contig_hint as the fallback.
 */
static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
{
        /* no scan hint recorded */
        if (!block->scan_hint)
                return block->first_free;

        /*
         * The contig_hint does not lie after the scan_hint (possible iff
         * contig_hint == scan_hint), so nothing may be skipped.
         */
        if (block->contig_hint_start <= block->scan_hint_start)
                return block->first_free;

        /* the request might still fit inside the scan_hint */
        if (alloc_bits <= block->scan_hint)
                return block->first_free;

        /* skip past the scan_hint area */
        return block->scan_hint_start + block->scan_hint;
}

/**
 * pcpu_next_md_free_region - finds the next hint free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Helper function for pcpu_for_each_md_free_region.  It checks
 * block->contig_hint and performs aggregation across blocks to find the
 * next hint.  It modifies bit_off and bits in-place to be consumed in the
 * loop.
 *
 * On entry *@bit_off is the chunk offset to resume from.  When a region is
 * reported, *@bit_off is its starting chunk offset and *@bits its length.
 */
static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
                                     int *bits)
{
        int i = pcpu_off_to_block_index(*bit_off);
        int block_off = pcpu_off_to_block_off(*bit_off);
        struct pcpu_block_md *block;

        /* nothing carried over from a previous block yet */
        *bits = 0;
        for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
             block++, i++) {
                /* handles contig area across blocks */
                if (*bits) {
                        *bits += block->left_free;
                        /* a fully free block keeps the run going */
                        if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
                                continue;
                        /* the run ends inside this block: report it */
                        return;
                }

                /*
                 * This checks three things.  First is there a contig_hint to
                 * check.  Second, have we checked this hint before by
                 * comparing the block_off.  Third, is this the same as the
                 * right contig hint.  In the last case, it spills over into
                 * the next block and should be handled by the contig area
                 * across blocks code.
                 */
                *bits = block->contig_hint;
                if (*bits && block->contig_hint_start >= block_off &&
                    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
                        *bit_off = pcpu_block_off_to_off(i,
                                        block->contig_hint_start);
                        return;
                }
                /* reset to satisfy the second predicate above */
                block_off = 0;

                /*
                 * Tentatively record this block's trailing free bits; the
                 * next iteration extends them with the following block's
                 * left_free.
                 */
                *bits = block->right_free;
                *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
        }
}

/**
 * pcpu_next_fit_region - finds fit areas for a given allocation request
 * @chunk: chunk of interest
 * @alloc_bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finds the next free region that is viable for use with a given size and
 * alignment.  This only returns if there is a valid area to be used for this
 * allocation.  block->first_free is returned if the allocation request fits
 * within the block to see if the request can be fulfilled prior to the contig
 * hint.
 *
 * If no candidate is found, *@bit_off is set to pcpu_chunk_map_bits(@chunk),
 * which is the termination condition of pcpu_for_each_fit_region.
 */
static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
                                 int align, int *bit_off, int *bits)
{
        int i = pcpu_off_to_block_index(*bit_off);
        int block_off = pcpu_off_to_block_off(*bit_off);
        struct pcpu_block_md *block;

        /* nothing carried over from a previous block yet */
        *bits = 0;
        for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
             block++, i++) {
                /* handles contig area across blocks */
                if (*bits) {
                        /* extend the previous block's trailing free area */
                        *bits += block->left_free;
                        if (*bits >= alloc_bits)
                                return;
                        /* whole block free but still too small: keep going */
                        if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
                                continue;
                        /* otherwise fall through and check this block's hints */
                }

                /* check block->contig_hint */
                /* *bits temporarily holds the padding needed to align the hint */
                *bits = ALIGN(block->contig_hint_start, align) -
                        block->contig_hint_start;
                /*
                 * This uses the block offset to determine if this has been
                 * checked in the prior iteration.
                 */
                if (block->contig_hint &&
                    block->contig_hint_start >= block_off &&
                    block->contig_hint >= *bits + alloc_bits) {
                        int start = pcpu_next_hint(block, alloc_bits);

                        /*
                         * Report the region from @start up to alloc_bits past
                         * the aligned start of the contig hint.
                         */
                        *bits += alloc_bits + block->contig_hint_start -
                                 start;
                        *bit_off = pcpu_block_off_to_off(i, start);
                        return;
                }
                /* reset to satisfy the second predicate above */
                block_off = 0;

                /*
                 * Fall back to the block's trailing free area: take its
                 * aligned start within the block, the bits left from there to
                 * the end of the block, then convert to a chunk offset.
                 */
                *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
                                 align);
                *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
                *bit_off = pcpu_block_off_to_off(i, *bit_off);
                if (*bits >= alloc_bits)
                        return;
        }

        /* no valid offsets were found - fail condition */
        *bit_off = pcpu_chunk_map_bits(chunk);
}

/*
 * Metadata free area iterators.  These perform aggregation of free areas
 * based on the metadata blocks and return the offset @bit_off and size in
 * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
 * a fit is found for the allocation request.
 *
 * @bit_off and @bits must be int lvalues: their addresses are passed to the
 * helper functions and both are evaluated several times per iteration.
 * Iteration stops once @bit_off reaches pcpu_chunk_map_bits(@chunk).
 * The md_free_region variant advances by @bits + 1 between calls, the
 * fit_region variant by @bits.
 */
#define pcpu_for_each_md_free_region(chunk, bit_off, bits)                \
        for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));        \
             (bit_off) < pcpu_chunk_map_bits((chunk));                        \
             (bit_off) += (bits) + 1,                                        \
             pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))

#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
        for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
                                  &(bits));                                      \
             (bit_off) < pcpu_chunk_map_bits((chunk));                              \
             (bit_off) += (bits),                                              \
             pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
                                  &(bits)))

/**
 * pcpu_mem_zalloc - allocate zeroed memory
 * @size: bytes to allocate
 * @gfp: allocation flags
 *
 * Allocate @size bytes.  Requests of up to PAGE_SIZE (inclusive) go through
 * kzalloc(); anything larger uses __vmalloc() with __GFP_ZERO added to @gfp.
 * This is to facilitate passing through whitelisted flags.  The returned
 * memory is always zeroed.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.  NULL is also
 * returned, with a one-time warning, if slab is not available yet.
 */
static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
        if (WARN_ON_ONCE(!slab_is_available()))
                return NULL;

        return size > PAGE_SIZE ? __vmalloc(size, gfp | __GFP_ZERO) :
                                  kzalloc(size, gfp);
}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
 * Memory is released with kvfree(), matching the kzalloc()/__vmalloc() split
 * made by pcpu_mem_zalloc().
 */
static void pcpu_mem_free(void *ptr)
{
        kvfree(ptr);
}

/*
 * Move @chunk onto the list for @slot, at the head if @move_front is true
 * and at the tail otherwise.  The reserved chunk is left untouched.
 */
static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
                              bool move_front)
{
        struct list_head *slot_list;

        if (chunk == pcpu_reserved_chunk)
                return;

        slot_list = &pcpu_chunk_lists[slot];
        if (move_front)
                list_move(&chunk->list, slot_list);
        else
                list_move_tail(&chunk->list, slot_list);
}

/* Move @chunk to the front of the list for @slot (no-op for the reserved chunk). */
static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
{
        __pcpu_chunk_move(chunk, slot, true);
}

/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * Called after an allocation or free has changed @chunk.  The slot matching
 * the chunk's new state is computed and, if it differs from @oslot, the
 * chunk is moved there: to the front of the list when the new slot is higher
 * than @oslot, to the tail otherwise.  Isolated chunks are left in place,
 * and the reserved chunk is never put on chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
        int nslot;

        /* leave isolated chunks in-place */
        if (chunk->isolated)
                return;

        nslot = pcpu_chunk_slot(chunk);
        if (nslot == oslot)
                return;

        __pcpu_chunk_move(chunk, nslot, oslot < nslot);
}

/*
 * Isolate @chunk: on the first call its empty populated pages are removed
 * from the global count and it is flagged isolated.  In every case the chunk
 * is moved to the to-depopulate slot list.  Caller must hold pcpu_lock.
 */
static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
{
        struct list_head *depop_list =
                &pcpu_chunk_lists[pcpu_to_depopulate_slot];

        lockdep_assert_held(&pcpu_lock);

        if (!chunk->isolated) {
                pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
                chunk->isolated = true;
        }

        list_move(&chunk->list, depop_list);
}

/*
 * Undo pcpu_isolate_chunk(): clear the isolated flag, add the chunk's empty
 * populated pages back to the global count and put it on the slot list that
 * matches its current state.  Does nothing for a chunk that is not isolated.
 * Caller must hold pcpu_lock.
 */
static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
{
        lockdep_assert_held(&pcpu_lock);

        if (!chunk->isolated)
                return;

        /* must be cleared first: pcpu_chunk_relocate() skips isolated chunks */
        chunk->isolated = false;
        pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;

        /* -1 as the old slot forces a move, assuming slots are never negative */
        pcpu_chunk_relocate(chunk, -1);
}

/*
 * pcpu_update_empty_pages - update empty page counters
 * @chunk: chunk of interest
 * @nr: nr of empty pages
 *
 * Applies the delta @nr to the chunk's own empty populated page count.  The
 * global count is adjusted too, unless @chunk is the reserved chunk or is
 * currently isolated.  This tracking relies on the premise that a md_block
 * covers a page; the hint update functions recognize when a block is made
 * full or broken and compute the deltas passed in here.
 */
static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
        chunk->nr_empty_pop_pages += nr;

        if (chunk == pcpu_reserved_chunk || chunk->isolated)
                return;

        pcpu_nr_empty_pop_pages += nr;
}

/*
 * pcpu_region_overlap - determines if two regions overlap
 * @a: start of first region, inclusive
 * @b: end of first region, exclusive
 * @x: start of second region, inclusive
 * @y: end of second region, exclusive
 *
 * Reports whether the half-open hint region [a, b) shares at least one
 * position with the half-open allocated region [x, y).
 */
static inline bool pcpu_region_overlap(int a, int b, int x, int y)
{
        /* first region starts at or after the end of the second */
        if (a >= y)
                return false;

        /* otherwise they overlap iff the second starts before the first ends */
        return x < b;
}

/**
 * pcpu_block_update - updates a block given a free area
 * @block: block of interest
 * @start: start offset in block
 * @end: end offset in block
 *
 * Updates a block given a known free area.  The region [start, end) is
 * expected to be the entirety of the free area within a block.  Chooses
 * the best starting offset if the contig hints are equal.
 *
 * first_free, left_free and right_free are refreshed first; the contig_hint
 * and scan_hint are then updated depending on how the size of the region
 * compares with the current contig_hint.
 */
static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
{
        int contig = end - start;

        /* the region may begin before the currently known first free bit */
        block->first_free = min(block->first_free, start);
        /* a region touching the left edge defines left_free */
        if (start == 0)
                block->left_free = contig;

        /* a region touching the right edge defines right_free */
        if (end == block->nr_bits)
                block->right_free = contig;

        if (contig > block->contig_hint) {
                /* promote the old contig_hint to be the new scan_hint */
                if (start > block->contig_hint_start) {
                        if (block->contig_hint > block->scan_hint) {
                                block->scan_hint_start =
                                        block->contig_hint_start;
                                block->scan_hint = block->contig_hint;
                        } else if (start < block->scan_hint_start) {
                                /*
                                 * The old contig_hint == scan_hint.  But, the
                                 * new contig is larger so hold the invariant
                                 * scan_hint_start < contig_hint_start.
                                 */
                                block->scan_hint = 0;
                        }
                } else {
                        /*
                         * The new contig_hint starts at or before the old
                         * one, so the scan_hint is dropped.
                         */
                        block->scan_hint = 0;
                }
                block->contig_hint_start = start;
                block->contig_hint = contig;
        } else if (contig == block->contig_hint) {
                /*
                 * Same size as the contig_hint: prefer the start with the
                 * better alignment, where offset 0 beats any other offset
                 * and otherwise a higher lowest set bit wins.
                 */
                if (block->contig_hint_start &&
                    (!start ||
                     __ffs(start) > __ffs(block->contig_hint_start))) {
                        /* start has a better alignment so use it */
                        block->contig_hint_start = start;
                        if (start < block->scan_hint_start &&
                            block->contig_hint > block->scan_hint)
                                block->scan_hint = 0;
                } else if (start > block->scan_hint_start ||
                           block->contig_hint > block->scan_hint) {
                        /*
                         * Knowing contig == contig_hint, update the scan_hint
                         * if it is farther than or larger than the current
                         * scan_hint.
                         */
                        block->scan_hint_start = start;
                        block->scan_hint = contig;
                }
        } else {
                /*
                 * The region is smaller than the contig_hint.  So only update
                 * the scan_hint if it is larger than or equal and farther than
                 * the current scan_hint.
                 */
                if ((start < block->contig_hint_start &&
                     (contig > block->scan_hint ||
                      (contig == block->scan_hint &&
                       start > block->scan_hint_start)))) {
                        block->scan_hint_start = start;
                        block->scan_hint = contig;
                }
        }
}

/*
 * pcpu_block_update_scan - update a block given a free area from a scan
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * An allocation is placed by first picking a block in pcpu_find_block_fit()
 * and then scanning the bitmap in pcpu_alloc_area().  Alignment
 * requirements can make that scan step over free bits, leaving holes that
 * neither the alloc nor the free path would otherwise report.
 *
 * Given such a hole, feed it to the owning block since it may change the
 * scan_hint.  The start of the hole is extended backwards to the last
 * allocated bit so free bits skipped for alignment are not missed.  Holes
 * that would extend past the end of the block are ignored.
 */
static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
                                   int bits)
{
        int index = pcpu_off_to_block_index(bit_off);
        int area_start = pcpu_off_to_block_off(bit_off);
        int area_end = area_start + bits;
        int last_set;

        if (area_end > PCPU_BITMAP_BLOCK_BITS)
                return;

        /*
         * Look backwards for the closest allocated bit.  Getting the search
         * size back means no bit is set before area_start, so the free area
         * begins at the start of the block.
         */
        last_set = find_last_bit(pcpu_index_alloc_map(chunk, index),
                                 area_start);
        if (last_set == area_start)
                area_start = 0;
        else
                area_start = last_set + 1;

        pcpu_block_update(chunk->md_blocks + index, area_start, area_end);
}

/**
 * pcpu_chunk_refresh_hint - updates metadata about a chunk
 * @chunk: chunk of interest
 * @full_scan: if we should scan from the beginning
 *
 * Walks the free regions reported by the metadata blocks and rebuilds the
 * chunk level contig_hint from them.  On the allocation path, where this is
 * triggered by a broken contig_hint, a full scan can be skipped: the
 * scan_hint is before the contig_hint, or after it if the two are equal, so
 * it can be promoted and the walk resumed right after it.  On the free path
 * a full scan cannot be avoided as the largest area may span blocks.
 */
static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int bit_off, bits = 0;

        if (full_scan || !chunk_md->scan_hint) {
                /* nothing to promote, restart from the first free bit */
                bit_off = chunk_md->first_free;
                chunk_md->contig_hint = 0;
        } else {
                /* promote scan_hint to contig_hint and resume after it */
                chunk_md->contig_hint_start = chunk_md->scan_hint_start;
                chunk_md->contig_hint = chunk_md->scan_hint;
                bit_off = chunk_md->contig_hint_start + chunk_md->contig_hint;
                chunk_md->scan_hint = 0;
        }

        pcpu_for_each_md_free_region(chunk, bit_off, bits)
                pcpu_block_update(chunk_md, bit_off, bit_off + bits);
}

/**
 * pcpu_block_refresh_hint - rebuilds the hints of a metadata block
 * @chunk: chunk of interest
 * @index: index of the metadata block
 *
 * Rescans the block's slice of the allocation map and updates the block
 * metadata accordingly.  If a scan_hint is present it is promoted to the
 * contig_hint and the scan resumes right after it; otherwise the
 * contig_hint is cleared and the scan begins at first_free.  right_free is
 * reset before scanning.
 */
static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
        struct pcpu_block_md *block = chunk->md_blocks + index;
        unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
        unsigned int start, end;        /* bounds of the current free range */

        if (!block->scan_hint) {
                start = block->first_free;
                block->contig_hint = 0;
        } else {
                /* promote scan_hint to contig_hint */
                block->contig_hint_start = block->scan_hint_start;
                block->contig_hint = block->scan_hint;
                start = block->contig_hint_start + block->contig_hint;
                block->scan_hint = 0;
        }

        block->right_free = 0;

        /* feed every remaining free range back into the hints */
        for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS)
                pcpu_block_update(block, start, end);
}

/**
 * pcpu_block_update_hint_alloc - update hint on allocation path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the allocation path.  The metadata only has to be
 * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
 * scans are required if the block's contig hint is broken.
 *
 * Blocks that were entirely free before the allocation are counted and the
 * empty page counters are reduced by that amount.
 */
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
                                         int bits)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int nr_empty_pages = 0;
        struct pcpu_block_md *s_block, *e_block, *block;
        int s_index, e_index;        /* block indexes of the allocation */
        int s_off, e_off;        /* block offsets of the allocation */

        /*
         * Calculate per block offsets.
         * The calculation uses an inclusive range, but the resulting offsets
         * are [start, end).  e_index always points to the last block in the
         * range.
         */
        s_index = pcpu_off_to_block_index(bit_off);
        e_index = pcpu_off_to_block_index(bit_off + bits - 1);
        s_off = pcpu_off_to_block_off(bit_off);
        e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

        s_block = chunk->md_blocks + s_index;
        e_block = chunk->md_blocks + e_index;

        /*
         * Update s_block.
         * A contig_hint covering the whole block means the block was empty
         * before this allocation.
         */
        if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
                nr_empty_pages++;

        /*
         * block->first_free must be updated if the allocation takes its place.
         * If the allocation breaks the contig_hint, a scan is required to
         * restore this hint.
         */
        if (s_off == s_block->first_free)
                s_block->first_free = find_next_zero_bit(
                                        pcpu_index_alloc_map(chunk, s_index),
                                        PCPU_BITMAP_BLOCK_BITS,
                                        s_off + bits);

        /* the scan_hint is dropped if the allocation overlaps it */
        if (pcpu_region_overlap(s_block->scan_hint_start,
                                s_block->scan_hint_start + s_block->scan_hint,
                                s_off,
                                s_off + bits))
                s_block->scan_hint = 0;

        if (pcpu_region_overlap(s_block->contig_hint_start,
                                s_block->contig_hint_start +
                                s_block->contig_hint,
                                s_off,
                                s_off + bits)) {
                /* block contig hint is broken - scan to fix it */
                if (!s_off)
                        s_block->left_free = 0;
                pcpu_block_refresh_hint(chunk, s_index);
        } else {
                /* update left and right contig manually */
                s_block->left_free = min(s_block->left_free, s_off);
                if (s_index == e_index)
                        s_block->right_free = min_t(int, s_block->right_free,
                                        PCPU_BITMAP_BLOCK_BITS - e_off);
                else
                        s_block->right_free = 0;
        }

        /*
         * Update e_block.
         */
        if (s_index != e_index) {
                if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
                        nr_empty_pages++;

                /*
                 * When the allocation is across blocks, the end is along
                 * the left part of the e_block.
                 */
                e_block->first_free = find_next_zero_bit(
                                pcpu_index_alloc_map(chunk, e_index),
                                PCPU_BITMAP_BLOCK_BITS, e_off);

                if (e_off == PCPU_BITMAP_BLOCK_BITS) {
                        /*
                         * The allocation consumes all of e_block.  Advance
                         * the pointer so the loop below resets e_block
                         * together with the in-between blocks.
                         */
                        e_block++;
                } else {
                        if (e_off > e_block->scan_hint_start)
                                e_block->scan_hint = 0;

                        e_block->left_free = 0;
                        if (e_off > e_block->contig_hint_start) {
                                /* contig hint is broken - scan to fix it */
                                pcpu_block_refresh_hint(chunk, e_index);
                        } else {
                                e_block->right_free =
                                        min_t(int, e_block->right_free,
                                              PCPU_BITMAP_BLOCK_BITS - e_off);
                        }
                }

                /* update in-between md_blocks */
                nr_empty_pages += (e_index - s_index - 1);
                for (block = s_block + 1; block < e_block; block++) {
                        block->scan_hint = 0;
                        block->contig_hint = 0;
                        block->left_free = 0;
                        block->right_free = 0;
                }
        }

        /*
         * If the allocation is not atomic, some blocks may not be
         * populated with pages, while we account it here.  The number
         * of pages will be added back with pcpu_chunk_populated()
         * when populating pages.
         */
        if (nr_empty_pages)
                pcpu_update_empty_pages(chunk, -nr_empty_pages);

        /* drop the chunk level scan_hint if the allocation overlaps it */
        if (pcpu_region_overlap(chunk_md->scan_hint_start,
                                chunk_md->scan_hint_start +
                                chunk_md->scan_hint,
                                bit_off,
                                bit_off + bits))
                chunk_md->scan_hint = 0;

        /*
         * The only time a full chunk scan is required is if the chunk
         * contig hint is broken.  Otherwise, it means a smaller space
         * was used and therefore the chunk contig hint is still correct.
         */
        if (pcpu_region_overlap(chunk_md->contig_hint_start,
                                chunk_md->contig_hint_start +
                                chunk_md->contig_hint,
                                bit_off,
                                bit_off + bits))
                pcpu_chunk_refresh_hint(chunk, false);
}

/**
 * pcpu_block_update_hint_free - updates the block hints on the free path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the free path.  This avoids a blind block
 * refresh by making use of the block contig hints.  If this fails, it scans
 * forward and backward to determine the extent of the free area.  This is
 * capped at the boundary of blocks.
 *
 * A chunk update is triggered if a page becomes free, a block becomes free,
 * or the free spans across blocks.  This tradeoff is to minimize iterating
 * over the block metadata to update chunk_md->contig_hint.
 * chunk_md->contig_hint may be off by up to a page, but it will never be more
 * than the available space.  If the contig hint is contained in one block, it
 * will be accurate.
 */
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
                                        int bits)
{
        int nr_empty_pages = 0;
        struct pcpu_block_md *s_block, *e_block, *block;
        int s_index, e_index;        /* block indexes of the freed allocation */
        int s_off, e_off;        /* block offsets of the freed allocation */
        int start, end;                /* start and end of the whole free area */

        /*
         * Calculate per block offsets.
         * The calculation uses an inclusive range, but the resulting offsets
         * are [start, end).  e_index always points to the last block in the
         * range.
         */
        s_index = pcpu_off_to_block_index(bit_off);
        e_index = pcpu_off_to_block_index(bit_off + bits - 1);
        s_off = pcpu_off_to_block_off(bit_off);
        e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

        s_block = chunk->md_blocks + s_index;
        e_block = chunk->md_blocks + e_index;

        /*
         * Check if the freed area aligns with the block->contig_hint.
         * If it does, then the scan to find the beginning/end of the
         * larger free area can be avoided.
         *
         * start and end refer to beginning and end of the free area
         * within each their respective blocks.  This is not necessarily
         * the entire free area as it may span blocks past the beginning
         * or end of the block.
         */
        start = s_off;
        if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
                start = s_block->contig_hint_start;
        } else {
                /*
                 * Scan backwards to find the extent of the free area.
                 * find_last_bit returns the search size when no bit is set,
                 * so getting the start bit back means no allocated bit
                 * precedes it and the free area reaches back to the
                 * beginning of the block.
                 */
                int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
                                          start);
                start = (start == l_bit) ? 0 : l_bit + 1;
        }

        end = e_off;
        if (e_off == e_block->contig_hint_start)
                end = e_block->contig_hint_start + e_block->contig_hint;
        else
                end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
                                    PCPU_BITMAP_BLOCK_BITS, end);

        /* update s_block */
        e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
        if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
                nr_empty_pages++;
        pcpu_block_update(s_block, start, e_off);

        /* the free spans across blocks */
        if (s_index != e_index) {
                /* update e_block */
                if (end == PCPU_BITMAP_BLOCK_BITS)
                        nr_empty_pages++;
                pcpu_block_update(e_block, 0, end);

                /* reset md_blocks in the middle */
                nr_empty_pages += (e_index - s_index - 1);
                for (block = s_block + 1; block < e_block; block++) {
                        block->first_free = 0;
                        block->scan_hint = 0;
                        block->contig_hint_start = 0;
                        block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
                        block->left_free = PCPU_BITMAP_BLOCK_BITS;
                        block->right_free = PCPU_BITMAP_BLOCK_BITS;
                }
        }

        if (nr_empty_pages)
                pcpu_update_empty_pages(chunk, nr_empty_pages);

        /*
         * Refresh chunk metadata when the free makes a block free or spans
         * across blocks.  The contig_hint may be off by up to a page, but if
         * the contig_hint is contained in a block, it will be accurate with
         * the else condition below.
         *
         * In the else branch the free area lies within s_block, so both of
         * its block-relative bounds must be translated to chunk offsets.
         * Passing a block-relative end next to a chunk-relative start would
         * yield a negative size for every block but the first, and the chunk
         * hints would silently not be updated.
         */
        if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
                pcpu_chunk_refresh_hint(chunk, true);
        else
                pcpu_block_update(&chunk->chunk_md,
                                  pcpu_block_off_to_off(s_index, start),
                                  pcpu_block_off_to_off(s_index, end));
}

/**
 * pcpu_is_populated - determines if the region is populated
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of area
 * @next_off: return value for the next offset to start searching
 *
 * For atomic allocations, check if the backing pages are populated.
 *
 * RETURNS:
 * true if every page backing the region is populated.  Otherwise false,
 * with @next_off set to the offset of the next populated page found after
 * the first unpopulated one (or the page-aligned end of the region if there
 * is none) so pcpu_find_block_fit() can skip the unpopulated pages.
 * @next_off is not written when true is returned.
 */
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
                              int *next_off)
{
        unsigned int first_page = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
        unsigned int end_page = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
        unsigned int hole, resume;

        /* look for the first unpopulated page backing the region */
        hole = find_next_zero_bit(chunk->populated, end_page, first_page);
        if (hole >= end_page)
                return true;

        /* report where populated pages pick up again */
        resume = find_next_bit(chunk->populated, end_page, hole + 1);
        *next_off = resume * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;

        return false;
}

/**
 * pcpu_find_block_fit - finds the block index to start searching
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE bytes)
 * @pop_only: use populated regions only
 *
 * Given a chunk and an allocation spec, find the offset at which to begin
 * searching for a free region.  The bitmap metadata blocks are iterated to
 * find an offset that is guaranteed to fit the requirements.  This is not
 * quite first fit: a block or chunk whose contig hint cannot hold the
 * allocation is skipped.  That errs on the side of caution to prevent
 * excess iteration, and poor alignment can cause blocks and chunks with
 * valid free areas to be passed over.
 *
 * RETURNS:
 * The offset in the bitmap to begin searching.
 * -1 if no offset is found.
 */
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
                               size_t align, bool pop_only)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int bit_off, next_off;
        int bits = 0;

        /*
         * Bail out early when the request cannot fit in the chunk level
         * hint.  This avoids scanning on the assumption that there is
         * memory pressure and a new chunk will be created soon.
         */
        if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
                return -1;

        bit_off = pcpu_next_hint(chunk_md, alloc_bits);
        pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
                /* any fitting region will do unless population matters */
                if (!pop_only)
                        break;
                if (pcpu_is_populated(chunk, bit_off, bits, &next_off))
                        break;

                /* resume the search past the unpopulated pages */
                bit_off = next_off;
                bits = 0;
        }

        return (bit_off == pcpu_chunk_map_bits(chunk)) ? -1 : bit_off;
}

/*
 * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
 * @map: the address to base the search on
 * @size: the bitmap size in bits
 * @start: the bitnumber to start searching at
 * @nr: the number of zeroed bits we're looking for
 * @align_mask: alignment mask for zero area
 * @largest_off: offset of the largest area skipped
 * @largest_bits: size of the largest area skipped
 *
 * The @align_mask should be one less than a power of 2.
 *
 * Variant of bitmap_find_next_zero_area_off() that also remembers the
 * largest area it had to skip.  The bookkeeping is imperfect but generally
 * good enough: the remembered region is the largest failed candidate seen,
 * and on a size tie the better aligned offset wins.  Bits stepped over
 * purely because of alignment are not included; pcpu_block_update_scan()
 * scans backwards to try and recover those.  While this can cause scanning
 * to miss earlier possible free areas, smaller allocations will eventually
 * fill those holes.
 *
 * RETURNS:
 * The aligned offset of a zero area of @nr bits, or a value greater than
 * @size if no candidate fits before the end of the bitmap.
 */
static unsigned long pcpu_find_zero_area(unsigned long *map,
                                         unsigned long size,
                                         unsigned long start,
                                         unsigned long nr,
                                         unsigned long align_mask,
                                         unsigned long *largest_off,
                                         unsigned long *largest_bits)
{
        unsigned long candidate, limit, blocker, skipped;

        for (;;) {
                /* next free bit, rounded up to the requested alignment */
                candidate = find_next_zero_bit(map, size, start);
                candidate = __ALIGN_MASK(candidate, align_mask);

                limit = candidate + nr;
                if (limit > size)
                        return limit;

                /* the candidate fits if no bit is set within [candidate, limit) */
                blocker = find_next_bit(map, limit, candidate);
                if (blocker >= limit)
                        return candidate;

                /* remember largest unused area with best alignment */
                skipped = blocker - candidate;
                if (skipped > *largest_bits ||
                    (skipped == *largest_bits && *largest_off &&
                     (!candidate ||
                      __ffs(candidate) > __ffs(*largest_off)))) {
                        *largest_off = candidate;
                        *largest_bits = skipped;
                }

                /* retry just past the bit that got in the way */
                start = blocker + 1;
        }
}

/**
 * pcpu_alloc_area - allocates an area from a pcpu_chunk
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE)
 * @start: bit_off to start searching
 *
 * Searches from @start for room to place @alloc_bits with alignment
 * @align.  The allocation map has to be scanned because when the request
 * fits within the block's contig hint, @start will be block->first_free.
 * This is an attempt to fill the allocation prior to breaking the contig
 * hint.  Once a valid free area is confirmed, the allocation and boundary
 * maps, the free byte count and the hints are updated and the chunk is
 * relocated.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * Allocated addr offset in @chunk on success.
 * -1 if no matching area is found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
                           size_t align, int start)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        unsigned long skipped_off = 0, skipped_bits = 0;
        size_t align_mask = 0;
        int bit_off, search_end, old_slot;

        lockdep_assert_held(&pcpu_lock);

        if (align)
                align_mask = align - 1;

        old_slot = pcpu_chunk_slot(chunk);

        /*
         * Search to find a fit.  The search window is capped at one block
         * past the end of a request placed at @start, and at the end of the
         * chunk's bitmap.
         */
        search_end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
                           pcpu_chunk_map_bits(chunk));
        bit_off = pcpu_find_zero_area(chunk->alloc_map, search_end, start,
                                      alloc_bits, align_mask,
                                      &skipped_off, &skipped_bits);
        if (bit_off >= search_end)
                return -1;

        /* report the largest area the search had to skip */
        if (skipped_bits)
                pcpu_block_update_scan(chunk, skipped_off, skipped_bits);

        /* update alloc map */
        bitmap_set(chunk->alloc_map, bit_off, alloc_bits);

        /* update boundary map: mark both ends, clear everything between */
        set_bit(bit_off, chunk->bound_map);
        bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
        set_bit(bit_off + alloc_bits, chunk->bound_map);

        chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;

        /* update first free bit */
        if (chunk_md->first_free == bit_off)
                chunk_md->first_free = find_next_zero_bit(
                                        chunk->alloc_map,
                                        pcpu_chunk_map_bits(chunk),
                                        bit_off + alloc_bits);

        pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
        pcpu_chunk_relocate(chunk, old_slot);

        return bit_off * PCPU_MIN_ALLOC_SIZE;
}

/**
 * pcpu_free_area - frees the corresponding offset
 * @chunk: chunk of interest
 * @off: addr offset into chunk
 *
 * The size of the allocation is derived from the boundary bitmap: it runs
 * from @off up to the next boundary bit.  The matching bits are cleared in
 * the allocation map, the free byte count and hints are updated and the
 * chunk is relocated.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * Number of freed bytes.
 */
static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int bit_off = off / PCPU_MIN_ALLOC_SIZE;
        int old_slot, bound, nr_bits, freed_bytes;

        lockdep_assert_held(&pcpu_lock);
        pcpu_stats_area_dealloc(chunk);

        old_slot = pcpu_chunk_slot(chunk);

        /* the next boundary bit marks the end of this allocation */
        bound = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
                              bit_off + 1);
        nr_bits = bound - bit_off;
        bitmap_clear(chunk->alloc_map, bit_off, nr_bits);

        /* update metadata */
        freed_bytes = nr_bits * PCPU_MIN_ALLOC_SIZE;
        chunk->free_bytes += freed_bytes;

        /* update first free bit */
        if (bit_off < chunk_md->first_free)
                chunk_md->first_free = bit_off;

        pcpu_block_update_hint_free(chunk, bit_off, nr_bits);
        pcpu_chunk_relocate(chunk, old_slot);

        return freed_bytes;
}

/*
 * pcpu_init_md_block - initialize a metadata block as entirely free
 * @block: block of interest
 * @nr_bits: number of bits covered by the block
 *
 * contig_hint_start and scan_hint_start are not written here.
 */
static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
{
        block->nr_bits = nr_bits;
        block->first_free = 0;

        /* the whole block is a single free region */
        block->contig_hint = nr_bits;
        block->left_free = nr_bits;
        block->right_free = nr_bits;

        block->scan_hint = 0;
}

/*
 * pcpu_init_md_blocks - initialize all block metadata of a chunk
 * @chunk: chunk of interest
 *
 * Initializes the chunk level block to cover the whole allocation map and
 * each entry of chunk->md_blocks to cover PCPU_BITMAP_BLOCK_BITS bits.
 */
static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
{
        int nr_blocks, i;

        /* init the chunk's block */
        pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));

        nr_blocks = pcpu_chunk_nr_blocks(chunk);
        for (i = 0; i < nr_blocks; i++)
                pcpu_init_md_block(chunk->md_blocks + i,
                                   PCPU_BITMAP_BLOCK_BITS);
}

/**
 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * This is responsible for creating the chunks that serve the first chunk.  The
 * base_addr is page aligned down of @tmp_addr while the region end is page
 * aligned up.  Offsets are kept track of to determine the region served. All
 * this is done to appease the bitmap allocator in avoiding partial blocks.
 *
 * The chunk and its bitmaps are allocated with memblock_alloc(); any
 * allocation failure panics.  The parts of the page aligned region that lie
 * outside [@tmp_addr, @tmp_addr + @map_size) are marked as allocated so they
 * are never handed out.
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
 */
static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
                                                         int map_size)
{
        struct pcpu_chunk *chunk;
        unsigned long aligned_addr;
        int start_offset, offset_bits, region_size, region_bits;
        size_t alloc_size;

        /* region calculations */
        aligned_addr = tmp_addr & PAGE_MASK;

        start_offset = tmp_addr - aligned_addr;
        region_size = ALIGN(start_offset + map_size, PAGE_SIZE);

        /* allocate chunk */
        alloc_size = struct_size(chunk, populated,
                                 BITS_TO_LONGS(region_size >> PAGE_SHIFT));
        chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!chunk)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        INIT_LIST_HEAD(&chunk->list);

        /* start_offset/end_offset are the unused head and tail of the region */
        chunk->base_addr = (void *)aligned_addr;
        chunk->start_offset = start_offset;
        chunk->end_offset = region_size - chunk->start_offset - map_size;

        chunk->nr_pages = region_size >> PAGE_SHIFT;
        region_bits = pcpu_chunk_map_bits(chunk);

        alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
        chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!chunk->alloc_map)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        /* one extra bit so a boundary can be set at region_bits itself */
        alloc_size =
                BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
        chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!chunk->bound_map)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
        chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!chunk->md_blocks)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

#ifdef NEED_PCPUOBJ_EXT
        /* first chunk is free to use */
        chunk->obj_exts = NULL;
#endif
        pcpu_init_md_blocks(chunk);

        /* manage populated page bitmap: every page is marked populated */
        chunk->immutable = true;
        bitmap_fill(chunk->populated, chunk->nr_pages);
        chunk->nr_populated = chunk->nr_pages;
        chunk->nr_empty_pop_pages = chunk->nr_pages;

        /* only the served region counts as free space */
        chunk->free_bytes = map_size;

        if (chunk->start_offset) {
                /* hide the beginning of the bitmap */
                offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
                bitmap_set(chunk->alloc_map, 0, offset_bits);
                set_bit(0, chunk->bound_map);
                set_bit(offset_bits, chunk->bound_map);

                chunk->chunk_md.first_free = offset_bits;

                /* account for the hidden head in the block hints */
                pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
        }

        if (chunk->end_offset) {
                /* hide the end of the bitmap */
                offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
                bitmap_set(chunk->alloc_map,
                           pcpu_chunk_map_bits(chunk) - offset_bits,
                           offset_bits);
                set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
                        chunk->bound_map);
                set_bit(region_bits, chunk->bound_map);

                /* account for the hidden tail in the block hints */
                pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
                                             - offset_bits, offset_bits);
        }

        return chunk;
}

/**
 * pcpu_alloc_chunk - allocate a chunk descriptor and its metadata
 * @gfp: allocation flags used for every allocation made here
 *
 * Allocates the chunk structure itself plus its allocation bitmap, its
 * boundary bitmap, its per-block metadata and, when need_pcpuobj_ext()
 * says so, the per-object extension array.  The chunk spans
 * pcpu_unit_pages pages and all of its bytes start out as free.
 *
 * RETURNS:
 * The new chunk on success.  NULL if any allocation failed, in which case
 * everything allocated up to that point has been released again.
 */
static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
{
        struct pcpu_chunk *new_chunk;
        size_t map_bytes;
        int nr_bits;

        new_chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
        if (!new_chunk)
                return NULL;

        INIT_LIST_HEAD(&new_chunk->list);
        new_chunk->nr_pages = pcpu_unit_pages;
        nr_bits = pcpu_chunk_map_bits(new_chunk);

        map_bytes = BITS_TO_LONGS(nr_bits) * sizeof(new_chunk->alloc_map[0]);
        new_chunk->alloc_map = pcpu_mem_zalloc(map_bytes, gfp);
        if (!new_chunk->alloc_map)
                goto free_chunk;

        /* the boundary map is sized for one bit more than the alloc map */
        map_bytes = BITS_TO_LONGS(nr_bits + 1) *
                    sizeof(new_chunk->bound_map[0]);
        new_chunk->bound_map = pcpu_mem_zalloc(map_bytes, gfp);
        if (!new_chunk->bound_map)
                goto free_alloc_map;

        map_bytes = pcpu_chunk_nr_blocks(new_chunk) *
                    sizeof(new_chunk->md_blocks[0]);
        new_chunk->md_blocks = pcpu_mem_zalloc(map_bytes, gfp);
        if (!new_chunk->md_blocks)
                goto free_bound_map;

#ifdef NEED_PCPUOBJ_EXT
        if (need_pcpuobj_ext()) {
                map_bytes = pcpu_chunk_map_bits(new_chunk) *
                            sizeof(struct pcpuobj_ext);
                new_chunk->obj_exts = pcpu_mem_zalloc(map_bytes, gfp);
                if (!new_chunk->obj_exts)
                        goto free_md_blocks;
        }
#endif

        pcpu_init_md_blocks(new_chunk);

        /* a fresh chunk has nothing allocated from it yet */
        new_chunk->free_bytes = new_chunk->nr_pages * PAGE_SIZE;

        return new_chunk;

        /* unwind in the reverse order of allocation */
#ifdef NEED_PCPUOBJ_EXT
free_md_blocks:
        pcpu_mem_free(new_chunk->md_blocks);
#endif
free_bound_map:
        pcpu_mem_free(new_chunk->bound_map);
free_alloc_map:
        pcpu_mem_free(new_chunk->alloc_map);
free_chunk:
        pcpu_mem_free(new_chunk);

        return NULL;
}

/**
 * pcpu_free_chunk - release a chunk descriptor and its metadata
 * @chunk: chunk to free, may be NULL
 *
 * Frees the metadata arrays attached to @chunk and then @chunk itself.
 * A NULL @chunk is silently ignored.
 */
static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
        if (chunk) {
#ifdef NEED_PCPUOBJ_EXT
                pcpu_mem_free(chunk->obj_exts);
#endif
                pcpu_mem_free(chunk->md_blocks);
                pcpu_mem_free(chunk->bound_map);
                pcpu_mem_free(chunk->alloc_map);
                pcpu_mem_free(chunk);
        }
}

/**
 * pcpu_chunk_populated - account for newly populated pages
 * @chunk: pcpu_chunk whose pages were populated
 * @page_start: first populated page
 * @page_end: one past the last populated page
 *
 * Records that pages [@page_start,@page_end) of @chunk are now populated:
 * the chunk's populated bitmap and counter, the global populated counter
 * and the empty-page accounting are all brought up to date.  Call this
 * after every successful population.
 *
 * CONTEXT:
 * pcpu_lock must be held.
 */
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
                                 int page_end)
{
        const int count = page_end - page_start;

        lockdep_assert_held(&pcpu_lock);

        /* per-chunk state */
        chunk->nr_populated += count;
        bitmap_set(chunk->populated, page_start, count);

        /* allocator-wide state */
        pcpu_nr_populated += count;

        pcpu_update_empty_pages(chunk, count);
}

/**
 * pcpu_chunk_depopulated - account for pages that were depopulated
 * @chunk: pcpu_chunk whose pages were depopulated
 * @page_start: first depopulated page
 * @page_end: one past the last depopulated page
 *
 * Records that pages [@page_start,@page_end) of @chunk are no longer
 * populated: the chunk's populated bitmap and counter, the global
 * populated counter and the empty-page accounting are all brought up to
 * date.  Call this after every successful depopulation.
 *
 * CONTEXT:
 * pcpu_lock must be held.
 */
static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
                                   int page_start, int page_end)
{
        const int count = page_end - page_start;

        lockdep_assert_held(&pcpu_lock);

        /* per-chunk state */
        chunk->nr_populated -= count;
        bitmap_clear(chunk->populated, page_start, count);

        /* allocator-wide state */
        pcpu_nr_populated -= count;

        pcpu_update_empty_pages(chunk, -count);
}

/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
 *
 * pcpu_populate_chunk                - populate the specified range of a chunk
 * pcpu_depopulate_chunk        - depopulate the specified range of a chunk
 * pcpu_post_unmap_tlb_flush        - flush tlb for the specified range of a chunk
 * pcpu_create_chunk                - create a new chunk
 * pcpu_destroy_chunk                - destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page                - translate address to its backing struct page
 * pcpu_verify_alloc_info        - check alloc_info is acceptable during init
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
                               int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
                                  int page_start, int page_end);
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
                                      int page_start, int page_end);
static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);

/*
 * Pull in exactly one chunk management implementation, selected at build
 * time: percpu-km.c when CONFIG_NEED_PER_CPU_KM is set, percpu-vm.c
 * otherwise.  The included file is compiled as part of this file.
 */
#ifdef CONFIG_NEED_PER_CPU_KM
#include "percpu-km.c"
#else
#include "percpu-vm.c"
#endif

/**
 * pcpu_chunk_addr_search - find the chunk that owns an address
 * @addr: address whose chunk is wanted
 *
 * Internal lookup that covers everything except static allocations;
 * static percpu address values must never be handed to the allocator.
 *
 * RETURNS:
 * The chunk containing @addr.
 */
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
        void *unit_addr;

        /* the first chunk (dynamic region) and the reserved chunk come first */
        if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
                return pcpu_first_chunk;
        if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
                return pcpu_reserved_chunk;

        /*
         * @addr is relative to unit0, which may be unused and therefore
         * unmapped.  Shift it into the unit of the processor we happen to
         * be running on before resolving the backing page.  Any possible
         * cpu id works for this, so preemption and cpu hotplug are not a
         * concern.
         */
        unit_addr = addr + pcpu_unit_offsets[raw_smp_processor_id()];

        return pcpu_get_page_chunk(pcpu_addr_to_page(unit_addr));
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * Charge a percpu allocation of @size bytes before the area is carved out.
 *
 * Nothing is charged when memcg kmem accounting is offline, when @gfp lacks
 * __GFP_ACCOUNT, or when current_obj_cgroup() returns NULL; those cases
 * succeed and leave *@objcgp untouched.  On a successful charge the
 * obj_cgroup is handed back through @objcgp.
 *
 * Returns false only when the charge itself failed.
 */
static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
                                      struct obj_cgroup **objcgp)
{
        struct obj_cgroup *cg;

        if (!memcg_kmem_online())
                return true;
        if (!(gfp & __GFP_ACCOUNT))
                return true;

        cg = current_obj_cgroup();
        if (!cg)
                return true;

        if (obj_cgroup_charge(cg, gfp, pcpu_obj_full_size(size)) != 0)
                return false;

        *objcgp = cg;
        return true;
}

/*
 * Finish or undo the charge taken by pcpu_memcg_pre_alloc_hook().
 *
 * With a usable @chunk (non-NULL and carrying obj_exts) the obj_cgroup is
 * recorded in the slot for @off, a reference is taken on it, and the
 * MEMCG_PERCPU_B statistic grows by the full object size.  Otherwise the
 * charge is handed back.  A NULL @objcg means nothing was charged, so
 * there is nothing to do.
 */
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
                                       struct pcpu_chunk *chunk, int off,
                                       size_t size)
{
        if (!objcg)
                return;

        /* no place to remember the owner: give the charge back */
        if (unlikely(!chunk || !chunk->obj_exts)) {
                obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
                return;
        }

        obj_cgroup_get(objcg);
        chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg;

        rcu_read_lock();
        mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
                        pcpu_obj_full_size(size));
        rcu_read_unlock();
}

/*
 * Drop the memcg accounting of the area at @off in @chunk when it is freed.
 *
 * Does nothing if the chunk has no obj_exts or the slot has no obj_cgroup
 * recorded.  Otherwise the slot is cleared, the charge is returned, the
 * MEMCG_PERCPU_B statistic shrinks by the full object size, and the
 * reference held on the obj_cgroup is released.
 */
static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
        const int idx = off >> PCPU_MIN_ALLOC_SHIFT;
        struct obj_cgroup *owner;

        if (unlikely(!chunk->obj_exts))
                return;

        owner = chunk->obj_exts[idx].cgroup;
        if (!owner)
                return;

        chunk->obj_exts[idx].cgroup = NULL;

        obj_cgroup_uncharge(owner, pcpu_obj_full_size(size));

        rcu_read_lock();
        mod_memcg_state(obj_cgroup_memcg(owner), MEMCG_PERCPU_B,
                        -pcpu_obj_full_size(size));
        rcu_read_unlock();

        obj_cgroup_put(owner);
}

#else /* CONFIG_MEMCG_KMEM */
/*
 * Stubs used when CONFIG_MEMCG_KMEM is not set: no charging takes place,
 * so the pre-alloc hook always reports success and the other two hooks
 * are empty.
 */
static bool
pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
{
        return true;
}

static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
                                       struct pcpu_chunk *chunk, int off,
                                       size_t size)
{
}

static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
}
#endif /* CONFIG_MEMCG_KMEM */

#ifdef CONFIG_MEM_ALLOC_PROFILING
/*
 * Attribute @size bytes to current->alloc_tag in the obj_exts slot for
 * @off.  Skipped when memory allocation profiling is disabled or the
 * chunk carries no obj_exts.
 */
static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
                                      size_t size)
{
        if (!mem_alloc_profiling_enabled())
                return;
        if (unlikely(!chunk->obj_exts))
                return;

        alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag,
                      current->alloc_tag, size);
}

/*
 * Subtract @size bytes from the tag stored in the obj_exts slot for @off.
 * Skipped when memory allocation profiling is disabled or the chunk
 * carries no obj_exts.
 */
static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
        if (!mem_alloc_profiling_enabled())
                return;
        if (unlikely(!chunk->obj_exts))
                return;

        alloc_tag_sub(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size);
}
#else
/* Empty stubs used when CONFIG_MEM_ALLOC_PROFILING is not set. */
static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
                                      size_t size)
{
}

static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
}
#endif

/**
 * pcpu_alloc_noprof - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
 * then no warning will be triggered on invalid or failed allocation
 * requests.
 *
 * @size is rounded up to a multiple of PCPU_MIN_ALLOC_SIZE and @align is
 * raised to at least PCPU_MIN_ALLOC_SIZE before the request is validated.
 * The returned area is zeroed for every possible CPU.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
                                 gfp_t gfp)
{
        gfp_t pcpu_gfp;
        bool is_atomic;
        bool do_warn;
        struct obj_cgroup *objcg = NULL;
        static int warn_limit = 10;
        struct pcpu_chunk *chunk, *next;
        const char *err;
        int slot, off, cpu, ret;
        unsigned long flags;
        void __percpu *ptr;
        size_t bits, bit_align;

        gfp = current_gfp_context(gfp);
        /* whitelisted flags that can be passed to the backing allocators */
        pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
        is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
        do_warn = !(gfp & __GFP_NOWARN);

        /*
         * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
         * therefore alignment must be a minimum of that many bytes.
         * An allocation may have internal fragmentation from rounding up
         * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
         */
        if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
                align = PCPU_MIN_ALLOC_SIZE;

        /* from here on size and align are expressed in allocation units too */
        size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
        bits = size >> PCPU_MIN_ALLOC_SHIFT;
        bit_align = align >> PCPU_MIN_ALLOC_SHIFT;

        /* validated after rounding, so the warning prints the adjusted values */
        if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
                     !is_power_of_2(align))) {
                WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
                     size, align);
                return NULL;
        }

        /*
         * Charge first.  Every failure path below calls the post-alloc hook
         * with a NULL chunk so that a charge taken here is given back.
         */
        if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg)))
                return NULL;

        if (!is_atomic) {
                /*
                 * pcpu_balance_workfn() allocates memory under this mutex,
                 * and it may wait for memory reclaim. Allow current task
                 * to become OOM victim, in case of memory pressure.
                 */
                if (gfp & __GFP_NOFAIL) {
                        mutex_lock(&pcpu_alloc_mutex);
                } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
                        pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
                        return NULL;
                }
        }

        spin_lock_irqsave(&pcpu_lock, flags);

        /* serve reserved allocations from the reserved chunk if available */
        if (reserved && pcpu_reserved_chunk) {
                chunk = pcpu_reserved_chunk;

                off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
                if (off < 0) {
                        err = "alloc from reserved chunk failed";
                        goto fail_unlock;
                }

                off = pcpu_alloc_area(chunk, bits, bit_align, off);
                if (off >= 0)
                        goto area_found;

                /* reserved requests never fall back to the normal chunks */
                err = "alloc from reserved chunk failed";
                goto fail_unlock;
        }

restart:
        /* search through normal chunks */
        for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
                list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot],
                                         list) {
                        off = pcpu_find_block_fit(chunk, bits, bit_align,
                                                  is_atomic);
                        if (off < 0) {
                                /* no fit: chunks in the low slots are moved to slot 0 */
                                if (slot < PCPU_SLOT_FAIL_THRESHOLD)
                                        pcpu_chunk_move(chunk, 0);
                                continue;
                        }

                        off = pcpu_alloc_area(chunk, bits, bit_align, off);
                        if (off >= 0) {
                                pcpu_reintegrate_chunk(chunk);
                                goto area_found;
                        }
                }
        }

        spin_unlock_irqrestore(&pcpu_lock, flags);

        /* atomic requests never create a chunk; pcpu_lock is already dropped */
        if (is_atomic) {
                err = "atomic alloc failed, no space left";
                goto fail;
        }

        /*
         * No space left.  Create a new chunk, but only if the free slot is
         * empty; if it is not, simply retake the lock and search again.
         */
        if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
                chunk = pcpu_create_chunk(pcpu_gfp);
                if (!chunk) {
                        err = "failed to allocate new chunk";
                        goto fail;
                }

                spin_lock_irqsave(&pcpu_lock, flags);
                pcpu_chunk_relocate(chunk, -1);
        } else {
                spin_lock_irqsave(&pcpu_lock, flags);
        }

        goto restart;

area_found:
        pcpu_stats_area_alloc(chunk, size);
        spin_unlock_irqrestore(&pcpu_lock, flags);

        /* populate if not all pages are already there */
        if (!is_atomic) {
                unsigned int page_end, rs, re;

                rs = PFN_DOWN(off);
                page_end = PFN_UP(off + size);

                /* walk each unpopulated page range covered by the new area */
                for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) {
                        WARN_ON(chunk->immutable);

                        ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);

                        spin_lock_irqsave(&pcpu_lock, flags);
                        if (ret) {
                                /* give the area back; fail_unlock drops pcpu_lock */
                                pcpu_free_area(chunk, off);
                                err = "failed to populate";
                                goto fail_unlock;
                        }
                        pcpu_chunk_populated(chunk, rs, re);
                        spin_unlock_irqrestore(&pcpu_lock, flags);
                }

                mutex_unlock(&pcpu_alloc_mutex);
        }

        /* running low on empty populated pages: ask the balance work to refill */
        if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
                pcpu_schedule_balance_work();

        /* clear the areas and return address relative to base address */
        for_each_possible_cpu(cpu)
                memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);

        ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
        kmemleak_alloc_percpu(ptr, size, gfp);

        trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align,
                                  chunk->base_addr, off, ptr,
                                  pcpu_obj_full_size(size), gfp);

        pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);

        pcpu_alloc_tag_alloc_hook(chunk, off, size);

        return ptr;

        /* fail_unlock: entered with pcpu_lock held; fail: entered without it */
fail_unlock:
        spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
        trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);

        /* warn_limit is static, so at most 10 failures are ever reported */
        if (do_warn && warn_limit) {
                pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
                        size, align, is_atomic, err);
                if (!is_atomic)
                        dump_stack();
                if (!--warn_limit)
                        pr_info("limit reached, disable warning\n");
        }

        if (is_atomic) {
                /* see the flag handling in pcpu_balance_populated() */
                pcpu_atomic_alloc_failed = true;
                pcpu_schedule_balance_work();
        } else {
                /* non-atomic callers still hold pcpu_alloc_mutex here */
                mutex_unlock(&pcpu_alloc_mutex);
        }

        /* NULL chunk: return the charge taken by the pre-alloc hook */
        pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);

        return NULL;
}
EXPORT_SYMBOL_GPL(pcpu_alloc_noprof);

/**
 * pcpu_balance_free - manage the amount of free chunks
 * @empty_only: free chunks only if there are no populated pages
 *
 * If empty_only is %false, reclaim all fully free chunks regardless of the
 * number of populated pages.  Otherwise, only reclaim chunks that have no
 * populated pages.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_balance_free(bool empty_only)
{
        LIST_HEAD(to_free);
        struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
        struct pcpu_chunk *chunk, *next;

        lockdep_assert_held(&pcpu_lock);

        /*
         * There's no reason to keep around multiple unused chunks and VM
         * areas can be scarce.  Destroy all free chunks except for one.
         */
        list_for_each_entry_safe(chunk, next, free_head, list) {
                WARN_ON(chunk->immutable);

                /* spare the first one */
                if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
                        continue;

                if (!empty_only || chunk->nr_empty_pop_pages == 0)
                        list_move(&chunk->list, &to_free);
        }

        if (list_empty(&to_free))
                return;

        spin_unlock_irq(&pcpu_lock);
        list_for_each_entry_safe(chunk, next, &to_free, list) {
                unsigned int rs, re;

                for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
                        pcpu_depopulate_chunk(chunk, rs, re);
                        spin_lock_irq(&pcpu_lock);
                        pcpu_chunk_depopulated(chunk, rs, re);
                        spin_unlock_irq(&pcpu_lock);
                }
                pcpu_destroy_chunk(chunk);
                cond_resched();
        }
        spin_lock_irq(&pcpu_lock);
}

/**
 * pcpu_balance_populated - manage the amount of populated pages
 *
 * Maintain a certain amount of populated pages to satisfy atomic allocations.
 * It is possible that this is called when physical memory is scarce causing
 * OOM killer to be triggered.  We should avoid doing so until an actual
 * allocation causes the failure as it is possible that requests can be
 * serviced from already backed regions.
 *
 * The target is PCPU_EMPTY_POP_PAGES_HIGH empty populated pages.  If the
 * existing chunks cannot absorb the whole amount, a new chunk is created
 * and the pass is repeated; a failure to populate or to create a chunk
 * ends the pass.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_balance_populated(void)
{
        /* gfp flags passed to underlying allocators */
        const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
        struct pcpu_chunk *chunk;
        int slot, nr_to_pop, ret;

        lockdep_assert_held(&pcpu_lock);

        /*
         * Ensure there are certain number of free populated pages for
         * atomic allocs.  Fill up from the most packed so that atomic
         * allocs don't increase fragmentation.  If atomic allocation
         * failed previously, always populate the maximum amount.  This
         * should prevent atomic allocs larger than PAGE_SIZE from keeping
         * failing indefinitely; however, large atomic allocs are not
         * something we support properly and can be highly unreliable and
         * inefficient.
         */
retry_pop:
        if (pcpu_atomic_alloc_failed) {
                nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
                /* best effort anyway, don't worry about synchronization */
                pcpu_atomic_alloc_failed = false;
        } else {
                /* only top up the shortfall, never a negative amount */
                nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
                                  pcpu_nr_empty_pop_pages,
                                  0, PCPU_EMPTY_POP_PAGES_HIGH);
        }

        for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) {
                unsigned int nr_unpop = 0, rs, re;

                if (!nr_to_pop)
                        break;

                /*
                 * Pick the first chunk in this slot that has unpopulated
                 * pages.  nr_unpop stays 0 when there is none, and @chunk
                 * is only meaningful when nr_unpop is non-zero.
                 */
                list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
                        nr_unpop = chunk->nr_pages - chunk->nr_populated;
                        if (nr_unpop)
                                break;
                }

                if (!nr_unpop)
                        continue;

                /* @chunk can't go away while pcpu_alloc_mutex is held */
                for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
                        /* never populate more than is still wanted */
                        int nr = min_t(int, re - rs, nr_to_pop);

                        spin_unlock_irq(&pcpu_lock);
                        ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
                        cond_resched();
                        spin_lock_irq(&pcpu_lock);
                        if (!ret) {
                                nr_to_pop -= nr;
                                pcpu_chunk_populated(chunk, rs, rs + nr);
                        } else {
                                /* population failed: give up on the whole pass */
                                nr_to_pop = 0;
                        }

                        if (!nr_to_pop)
                                break;
                }
        }

        if (nr_to_pop) {
                /* ran out of chunks to populate, create a new one and retry */
                spin_unlock_irq(&pcpu_lock);
                chunk = pcpu_create_chunk(gfp);
                cond_resched();
                spin_lock_irq(&pcpu_lock);
                /* if the chunk could not be created, stop with what we have */
                if (chunk) {
                        pcpu_chunk_relocate(chunk, -1);
                        goto retry_pop;
                }
        }
}

/**
 * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
 *
 * Scan over chunks in the depopulate list and try to release unused populated
 * pages back to the system.  Depopulated chunks are sidelined to prevent
 * repopulating these pages unless required.  Fully free chunks are reintegrated
 * and freed accordingly (1 is kept around).  If we drop below the empty
 * populated pages threshold, reintegrate the chunk if it has empty free pages.
 * Each chunk is scanned in the reverse order to keep populated pages close to
 * the beginning of the chunk.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 *
 */
static void pcpu_reclaim_populated(void)
{
        struct pcpu_chunk *chunk;
        struct pcpu_block_md *block;
        int freed_page_start, freed_page_end;
        int i, end;
        bool reintegrate;

        lockdep_assert_held(&pcpu_lock);

        /*
         * Once a chunk is isolated to the to_depopulate list, the chunk is no
         * longer discoverable to allocations whom may populate pages.  The only
         * other accessor is the free path which only returns area back to the
         * allocator not touching the populated bitmap.
         */
        while ((chunk = list_first_entry_or_null(
                        &pcpu_chunk_lists[pcpu_to_depopulate_slot],
                        struct pcpu_chunk, list))) {
                WARN_ON(chunk->immutable);

                /*
                 * Scan chunk's pages in the reverse order to keep populated
                 * pages close to the beginning of the chunk.
                 *
                 * freed_page_start/freed_page_end track the span covering
                 * every range depopulated from this chunk; they start out
                 * as an empty span (start > end).  @end is the last page of
                 * the range currently being collected, or -1 if none.
                 */
                freed_page_start = chunk->nr_pages;
                freed_page_end = 0;
                reintegrate = false;
                for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
                        /* no more work to do */
                        if (chunk->nr_empty_pop_pages == 0)
                                break;

                        /* reintegrate chunk to prevent atomic alloc failures */
                        if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
                                reintegrate = true;
                                break;
                        }

                        /*
                         * If the page is empty and populated, start or
                         * extend the (i, end) range.  If i == 0, decrease
                         * i and perform the depopulation to cover the last
                         * (first) page in the chunk.
                         */
                        block = chunk->md_blocks + i;
                        if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
                            test_bit(i, chunk->populated)) {
                                if (end == -1)
                                        end = i;
                                if (i > 0)
                                        continue;
                                /* i becomes -1 so that i + 1 below is page 0 */
                                i--;
                        }

                        /* depopulate if there is an active range */
                        if (end == -1)
                                continue;

                        /* page i is not part of the range: it covers [i + 1, end] */
                        spin_unlock_irq(&pcpu_lock);
                        pcpu_depopulate_chunk(chunk, i + 1, end + 1);
                        cond_resched();
                        spin_lock_irq(&pcpu_lock);

                        pcpu_chunk_depopulated(chunk, i + 1, end + 1);
                        freed_page_start = min(freed_page_start, i + 1);
                        freed_page_end = max(freed_page_end, end + 1);

                        /* reset the range and continue */
                        end = -1;
                }

                /* batch tlb flush per chunk to amortize cost */
                if (freed_page_start < freed_page_end) {
                        spin_unlock_irq(&pcpu_lock);
                        pcpu_post_unmap_tlb_flush(chunk,
                                                  freed_page_start,
                                                  freed_page_end);
                        cond_resched();
                        spin_lock_irq(&pcpu_lock);
                }

                /*
                 * Fully free chunks, and chunks we stopped on to protect
                 * atomic allocations, go back to the allocator.  Everything
                 * else is parked on the sidelined slot.
                 */
                if (reintegrate || chunk->free_bytes == pcpu_unit_size)
                        pcpu_reintegrate_chunk(chunk);
                else
                        list_move_tail(&chunk->list,
                                       &pcpu_chunk_lists[pcpu_sidelined_slot]);
        }
}

/**
 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @work: unused
 *
 * Manage the number of fully free chunks and the number of populated
 * pages.  An important thing to consider is when pages are freed and
 * how they contribute to the global counts.
 *
 * The whole pass runs with pcpu_alloc_mutex held.  pcpu_lock is taken
 * around the helpers, each of which may drop it temporarily.
 */
static void pcpu_balance_workfn(struct work_struct *work)
{
        /*
         * pcpu_balance_free() is called twice because the first time we may
         * trim pages in the active pcpu_nr_empty_pop_pages which may cause us
         * to grow other chunks.  This then gives pcpu_reclaim_populated() time
         * to move fully free chunks to the active list to be freed if
         * appropriate.
         */
        mutex_lock(&pcpu_alloc_mutex);
        spin_lock_irq(&pcpu_lock);

        /* first pass: free surplus chunks regardless of populated pages */
        pcpu_balance_free(false);
        pcpu_reclaim_populated();
        pcpu_balance_populated();
        /* second pass: only chunks without empty populated pages */
        pcpu_balance_free(true);

        spin_unlock_irq(&pcpu_lock);
        mutex_unlock(&pcpu_alloc_mutex);
}

/**
 * pcpu_alloc_size - report how large a dynamic percpu allocation is
 * @ptr: percpu pointer of the dynamic allocation, may be NULL
 *
 * The size is read off the owning chunk's bound_map: it is the distance
 * from the bit for @ptr to the next set bit, in units of
 * PCPU_MIN_ALLOC_SIZE.  Statically defined percpu variables have no
 * corresponding chunk->bound_map, so the result is undefined for them.
 *
 * RETURNS:
 * The size of the dynamic percpu area in bytes, or 0 if @ptr is NULL.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
size_t pcpu_alloc_size(void __percpu *ptr)
{
        unsigned long first_bit, next_bound;
        struct pcpu_chunk *owner;
        void *addr;

        if (!ptr)
                return 0;

        addr = __pcpu_ptr_to_addr(ptr);

        /* No pcpu_lock here: ptr has not been freed, so chunk is still alive */
        owner = pcpu_chunk_addr_search(addr);

        first_bit = (addr - owner->base_addr) / PCPU_MIN_ALLOC_SIZE;
        next_bound = find_next_bit(owner->bound_map,
                                   pcpu_chunk_map_bits(owner), first_bit + 1);

        return (next_bound - first_bit) * PCPU_MIN_ALLOC_SIZE;
}

/**
 * free_percpu - release a dynamically allocated percpu area
 * @ptr: percpu pointer of the area to free, may be NULL
 *
 * Returns the area behind @ptr to its chunk, runs the allocation-tag and
 * memcg free hooks, and schedules the balance work when the chunk state
 * calls for it.  A NULL @ptr is ignored.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
void free_percpu(void __percpu *ptr)
{
        struct pcpu_chunk *chunk;
        bool kick_balance = false;
        unsigned long flags;
        int off, freed;
        void *addr;

        if (!ptr)
                return;

        kmemleak_free_percpu(ptr);

        addr = __pcpu_ptr_to_addr(ptr);
        chunk = pcpu_chunk_addr_search(addr);
        off = addr - chunk->base_addr;

        spin_lock_irqsave(&pcpu_lock, flags);
        freed = pcpu_free_area(chunk, off);

        pcpu_alloc_tag_free_hook(chunk, off, freed);
        pcpu_memcg_free_hook(chunk, off, freed);

        /*
         * A chunk that just became fully free only needs the balance work
         * if some other chunk already sits in the free slot.  An isolated
         * chunk may be in the middle of being reclaimed, so its cleanup is
         * left to reclaim.
         */
        if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) {
                struct pcpu_chunk *other;

                list_for_each_entry(other, &pcpu_chunk_lists[pcpu_free_slot], list) {
                        if (other == chunk)
                                continue;

                        kick_balance = true;
                        break;
                }
        } else if (pcpu_should_reclaim_chunk(chunk)) {
                pcpu_isolate_chunk(chunk);
                kick_balance = true;
        }

        trace_percpu_free_percpu(chunk->base_addr, off, ptr);

        spin_unlock_irqrestore(&pcpu_lock, flags);

        /* the work is scheduled only after pcpu_lock has been dropped */
        if (kick_balance)
                pcpu_schedule_balance_work();
}
EXPORT_SYMBOL_GPL(free_percpu);

/*
 * Test whether @addr lies inside the in-kernel static percpu area of any
 * possible CPU.  The area tested for each CPU starts at that CPU's copy of
 * pcpu_base_addr and spans __per_cpu_end - __per_cpu_start bytes.
 *
 * On a match, and if @can_addr is non-NULL, *@can_addr receives the same
 * offset applied to the boot CPU's copy, giving one canonical address for
 * the variable.
 *
 * Returns true on a match, false otherwise.  Without CONFIG_SMP the result
 * is always false.
 */
bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
{
#ifdef CONFIG_SMP
        const size_t static_size = __per_cpu_end - __per_cpu_start;
        void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
        void *va = (void *)addr;
        unsigned int cpu;

        for_each_possible_cpu(cpu) {
                void *unit_start = per_cpu_ptr(base, cpu);

                /* outside this CPU's static area, try the next one */
                if (va < unit_start || va >= unit_start + static_size)
                        continue;

                if (can_addr)
                        *can_addr = (unsigned long)(va - unit_start) +
                                    (unsigned long)
                                        per_cpu_ptr(base, get_boot_cpu_id());

                return true;
        }
#endif
        /* on UP, can't distinguish from other static vars, always false */
        return false;
}

/**
 * is_kernel_percpu_address - test whether address is from static percpu area
 * @addr: address to test
 *
 * Test whether @addr belongs to in-kernel static percpu area.  Module
 * static percpu areas are not considered.  For those, use
 * is_module_percpu_address().
 *
 * RETURNS:
 * %true if @addr is from in-kernel static percpu area, %false otherwise.
 */
bool is_kernel_percpu_address(unsigned long addr)
{
        /* NULL: the caller only wants the yes/no answer, not a canonical address */
        return __is_kernel_percpu_address(addr, NULL);
}

/**
 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
 * @addr: the address to be converted to physical address
 *
 * Given @addr which is dereferenceable address obtained via one of
 * percpu access macros, this function translates it into its physical
 * address.  The caller is responsible for ensuring @addr stays valid
 * until this function finishes.
 *
 * percpu allocator has special setup for the first chunk, which currently
 * supports either embedding in linear address space or vmalloc mapping,
 * and, from the second one, the backing allocator (currently either vm or
 * km) provides translation.
 *
 * The addr can be translated simply without checking if it falls into the
 * first chunk. But the current code reflects better how percpu allocator
 * actually works, and the verification can discover both bugs in percpu
 * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
 * code.
 *
 * RETURNS:
 * The physical address for @addr.
 */
phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
        void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
        const unsigned long uaddr = (unsigned long)addr;
        unsigned long span_lo, span_hi;
        bool hit_first_chunk = false;
        unsigned int cpu;

        /*
         * Cheap range pre-check spanning every unit of the first chunk,
         * from the start of the lowest unit to the end of the highest
         * one.  It isn't strictly necessary but lets addresses outside
         * the first chunk skip the per-cpu scan below.
         *
         * The bounds use full unit sizes.  pcpu_base_addr points to the
         * beginning of the first chunk including the static region.
         * Assumes good intent as the first chunk may not be full
         * (ie. < pcpu_unit_pages in size).
         */
        span_lo = (unsigned long)pcpu_base_addr +
                  pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
        span_hi = (unsigned long)pcpu_base_addr +
                  pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);

        if (uaddr >= span_lo && uaddr < span_hi) {
                for_each_possible_cpu(cpu) {
                        void *unit = per_cpu_ptr(base, cpu);

                        if (addr >= unit && addr < unit + pcpu_unit_size) {
                                hit_first_chunk = true;
                                break;
                        }
                }
        }

        /* everything past the first chunk is translated by the backing allocator */
        if (!hit_first_chunk)
                return page_to_phys(pcpu_addr_to_page(addr)) +
                       offset_in_page(addr);

        /* first chunk mapped through vmalloc space */
        if (is_vmalloc_addr(addr))
                return page_to_phys(vmalloc_to_page(addr)) +
                       offset_in_page(addr);

        /* first chunk embedded in the linear mapping */
        return __pa(addr);
}

/**
 * pcpu_alloc_alloc_info - allocate percpu allocation info
 * @nr_groups: the number of groups
 * @nr_units: the number of units
 *
 * Allocate ai which is large enough for @nr_groups groups containing
 * @nr_units units.  The returned ai's groups[0].cpu_map points to the
 * cpu_map array which is long enough for @nr_units and filled with
 * NR_CPUS.  It's the caller's responsibility to initialize cpu_map
 * pointer of other groups.
 *
 * RETURNS:
 * Pointer to the allocated pcpu_alloc_info on success, NULL on
 * failure.
 */
struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
                                                      int nr_units)
{
        struct pcpu_alloc_info *ai;
        size_t hdr_bytes, total_bytes;
        int i;

        /* header plus groups[], padded so the cpu_map array is aligned */
        hdr_bytes = ALIGN(struct_size(ai, groups, nr_groups),
                          __alignof__(ai->groups[0].cpu_map[0]));
        total_bytes = hdr_bytes + nr_units * sizeof(ai->groups[0].cpu_map[0]);

        ai = memblock_alloc(PFN_ALIGN(total_bytes), PAGE_SIZE);
        if (!ai)
                return NULL;

        /* the cpu_map array sits directly behind the header */
        ai->groups[0].cpu_map = (void *)ai + hdr_bytes;

        /* NR_CPUS marks a unit slot with no cpu assigned */
        for (i = 0; i < nr_units; i++)
                ai->groups[0].cpu_map[i] = NR_CPUS;

        ai->nr_groups = nr_groups;
        /* remember the page-aligned size for pcpu_free_alloc_info() */
        ai->__ai_size = PFN_ALIGN(total_bytes);

        return ai;
}

/**
 * pcpu_free_alloc_info - free percpu allocation info
 * @ai: pcpu_alloc_info to free
 *
 * Free @ai which was allocated by pcpu_alloc_alloc_info().
 */
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
        /* __ai_size is the page-aligned size stored by pcpu_alloc_alloc_info() */
        memblock_free(ai, ai->__ai_size);
}

/**
 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
 * @lvl: loglevel
 * @ai: allocation info to dump
 *
 * Print out information about @ai using loglevel @lvl.
 */
static void pcpu_dump_alloc_info(const char *lvl,
                                 const struct pcpu_alloc_info *ai)
{
        char empty_str[] = "--------";
        int group_width = 1, cpu_width = 1;
        int line_width, upa, apl;        /* units per alloc, allocs per line */
        int alloc = 0, alloc_end = 0;
        int group, n;

        /* count the decimal digits needed for group and cpu numbers */
        for (n = ai->nr_groups; (n /= 10) != 0; )
                group_width++;

        for (n = num_possible_cpus(); (n /= 10) != 0; )
                cpu_width++;

        /* trim the placeholder to the cpu column width (capped at its length) */
        empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';

        upa = ai->alloc_size / ai->unit_size;
        line_width = upa * (cpu_width + 1) + group_width + 3;
        apl = rounddown_pow_of_two(max(60 / line_width, 1));

        printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
               lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
               ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);

        for (group = 0; group < ai->nr_groups; group++) {
                const struct pcpu_group_info *gi = &ai->groups[group];
                int unit = 0;

                BUG_ON(gi->nr_units % upa);

                /* alloc keeps counting across groups so line breaks stay even */
                alloc_end += gi->nr_units / upa;

                while (alloc < alloc_end) {
                        int stop = unit + upa;

                        if (alloc % apl == 0) {
                                pr_cont("\n");
                                printk("%spcpu-alloc: ", lvl);
                        }
                        pr_cont("[%0*d] ", group_width, group);

                        for (; unit < stop; unit++) {
                                if (gi->cpu_map[unit] == NR_CPUS)
                                        pr_cont("%s ", empty_str);
                                else
                                        pr_cont("%0*d ",
                                                cpu_width, gi->cpu_map[unit]);
                        }
                        alloc++;
                }
        }
        pr_cont("\n");
}

/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how the percpu area is shaped
 * @base_addr: mapped address
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  This function is to be called from arch percpu area
 * setup path.
 *
 * @ai contains all information necessary to initialize the first
 * chunk and prime the dynamic percpu allocator.
 *
 * @ai->static_size is the size of static percpu area.
 *
 * @ai->reserved_size, if non-zero, specifies the amount of bytes to
 * reserve after the static area in the first chunk.  This reserves
 * the first chunk such that it's available only through reserved
 * percpu allocation.  This is primarily used to serve module percpu
 * static areas on architectures where the addressing model has
 * limited offset range for symbol relocations to guarantee module
 * percpu symbols fall inside the relocatable range.
 *
 * @ai->dyn_size determines the number of bytes available for dynamic
 * allocation in the first chunk.  The area between @ai->static_size +
 * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
 *
 * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
 * and equal to or larger than @ai->static_size + @ai->reserved_size +
 * @ai->dyn_size.
 *
 * @ai->atom_size is the allocation atom size and used as alignment
 * for vm areas.
 *
 * @ai->alloc_size is the allocation size and always multiple of
 * @ai->atom_size.  This is larger than @ai->atom_size if
 * @ai->unit_size is larger than @ai->atom_size.
 *
 * @ai->nr_groups and @ai->groups describe virtual memory layout of
 * percpu areas.  Units which should be colocated are put into the
 * same group.  Dynamic VM areas will be allocated according to these
 * groupings.  If @ai->nr_groups is zero, a single group containing
 * all units is assumed.
 *
 * The caller should have mapped the first chunk at @base_addr and
 * copied static data to each unit.
 *
 * The first chunk will always contain a static and a dynamic region.
 * However, the static region is not managed by any chunk.  If the first
 * chunk also contains a reserved region, it is served by two chunks -
 * one for the reserved region and one for the dynamic region.  They
 * share the same vm, but use offset regions in the area allocation map.
 * The chunk serving the dynamic region is circulated in the chunk slots
 * and available for dynamic allocation like any other chunk.
 */
void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
                                   void *base_addr)
{
        /* bytes of each unit that are actually described by @ai */
        size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
        size_t static_size, dyn_size;
        unsigned long *group_offsets;
        size_t *group_sizes;
        unsigned long *unit_off;
        unsigned int cpu;
        int *unit_map;
        int group, unit, i;
        unsigned long tmp_addr;
        size_t alloc_size;

/*
 * Local sanity-check helper: on failure, log the stringified condition,
 * the possible-cpu mask and the full @ai layout, then BUG().
 */
#define PCPU_SETUP_BUG_ON(cond)        do {                                        \
        if (unlikely(cond)) {                                                \
                pr_emerg("failed to initialize, %s\n", #cond);                \
                pr_emerg("cpu_possible_mask=%*pb\n",                        \
                         cpumask_pr_args(cpu_possible_mask));                \
                pcpu_dump_alloc_info(KERN_EMERG, ai);                        \
                BUG();                                                        \
        }                                                                \
} while (0)

        /* sanity checks */
        PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
#ifdef CONFIG_SMP
        PCPU_SETUP_BUG_ON(!ai->static_size);
        PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
#endif
        PCPU_SETUP_BUG_ON(!base_addr);
        PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
        PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
        PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
        PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
        PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
        PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
        PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
        PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
                            IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
        PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);

        /* process group information and build config tables accordingly */
        alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
        group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!group_offsets)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
        group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!group_sizes)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
        unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!unit_map)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
        unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!unit_off)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        /* UINT_MAX marks a cpu that has not been given a unit yet */
        for (cpu = 0; cpu < nr_cpu_ids; cpu++)
                unit_map[cpu] = UINT_MAX;

        /* NR_CPUS means "not determined yet" for the low/high unit cpus */
        pcpu_low_unit_cpu = NR_CPUS;
        pcpu_high_unit_cpu = NR_CPUS;

        /*
         * @unit is the index of the current group's first unit across all
         * groups.  The inner loop always runs @i up to gi->nr_units, so
         * "unit += i" advances to the next group's first unit.
         */
        for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
                const struct pcpu_group_info *gi = &ai->groups[group];

                group_offsets[group] = gi->base_offset;
                group_sizes[group] = gi->nr_units * ai->unit_size;

                for (i = 0; i < gi->nr_units; i++) {
                        cpu = gi->cpu_map[i];
                        /* unit slot without a cpu mapped to it */
                        if (cpu == NR_CPUS)
                                continue;

                        /* each cpu must be valid, possible and mapped only once */
                        PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
                        PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
                        PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);

                        unit_map[cpu] = unit + i;
                        unit_off[cpu] = gi->base_offset + i * ai->unit_size;

                        /* determine low/high unit_cpu */
                        if (pcpu_low_unit_cpu == NR_CPUS ||
                            unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
                                pcpu_low_unit_cpu = cpu;
                        if (pcpu_high_unit_cpu == NR_CPUS ||
                            unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
                                pcpu_high_unit_cpu = cpu;
                }
        }
        pcpu_nr_units = unit;

        /* every possible cpu must have ended up with a unit */
        for_each_possible_cpu(cpu)
                PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);

        /* we're done parsing the input, undefine BUG macro and dump config */
#undef PCPU_SETUP_BUG_ON
        pcpu_dump_alloc_info(KERN_DEBUG, ai);

        /* publish the tables built above */
        pcpu_nr_groups = ai->nr_groups;
        pcpu_group_offsets = group_offsets;
        pcpu_group_sizes = group_sizes;
        pcpu_unit_map = unit_map;
        pcpu_unit_offsets = unit_off;

        /* determine basic parameters */
        pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
        pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
        pcpu_atom_size = ai->atom_size;
        /* chunk struct plus a populated bitmap with one bit per unit page */
        pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated,
                                             BITS_TO_LONGS(pcpu_unit_pages));

        pcpu_stats_save_ai(ai);

        /*
         * Allocate chunk slots.  The slots after the active slots are:
         *   sidelined_slot - isolated, depopulated chunks
         *   free_slot - fully free chunks
         *   to_depopulate_slot - isolated, chunks to depopulate
         */
        pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
        pcpu_free_slot = pcpu_sidelined_slot + 1;
        pcpu_to_depopulate_slot = pcpu_free_slot + 1;
        pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
        pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
                                          sizeof(pcpu_chunk_lists[0]),
                                          SMP_CACHE_BYTES);
        if (!pcpu_chunk_lists)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]));

        for (i = 0; i < pcpu_nr_slots; i++)
                INIT_LIST_HEAD(&pcpu_chunk_lists[i]);

        /*
         * The end of the static region needs to be aligned with the
         * minimum allocation size as this offsets the reserved and
         * dynamic region.  The first chunk ends page aligned by
         * expanding the dynamic region, therefore the dynamic region
         * can be shrunk to compensate while still staying above the
         * configured sizes.
         */
        static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
        dyn_size = ai->dyn_size - (static_size - ai->static_size);

        /*
         * Initialize first chunk:
         * This chunk is broken up into 3 parts:
         *                < static | [reserved] | dynamic >
         * - static - there is no backing chunk because these allocations can
         *   never be freed.
         * - reserved (pcpu_reserved_chunk) - exists primarily to serve
         *   allocations from module load.
         * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first
         *   chunk.
         */
        tmp_addr = (unsigned long)base_addr + static_size;
        if (ai->reserved_size)
                pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr,
                                                ai->reserved_size);
        /* the dynamic region starts right after the (possibly empty) reserved one */
        tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size;
        pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size);

        pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
        /* NOTE(review): -1 presumably means "no previous slot" - confirm against pcpu_chunk_relocate() */
        pcpu_chunk_relocate(pcpu_first_chunk, -1);

        /* include all regions of the first chunk */
        pcpu_nr_populated += PFN_DOWN(size_sum);

        pcpu_stats_chunk_alloc();
        trace_percpu_create_chunk(base_addr);

        /* we're done */
        pcpu_base_addr = base_addr;
}

#ifdef CONFIG_SMP

/* Human-readable names of the first chunk allocators, indexed by enum pcpu_fc. */
const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
        [PCPU_FC_AUTO]        = "auto",
        [PCPU_FC_EMBED]        = "embed",
        [PCPU_FC_PAGE]        = "page",
};

/* Selected first chunk allocator; the "percpu_alloc" early param may change it. */
enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;

/*
 * Parse the "percpu_alloc=" early parameter and record the requested first
 * chunk allocator in pcpu_chosen_fc.  Only allocators built into this
 * configuration are recognized; anything else is warned about and ignored.
 *
 * Returns -EINVAL when no value was given, 0 otherwise.
 */
static int __init percpu_alloc_setup(char *str)
{
        if (!str)
                return -EINVAL;

#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
        if (!strcmp(str, "embed")) {
                pcpu_chosen_fc = PCPU_FC_EMBED;
                return 0;
        }
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
        if (!strcmp(str, "page")) {
                pcpu_chosen_fc = PCPU_FC_PAGE;
                return 0;
        }
#endif
        /* not a known allocator name for this build; the choice stays unchanged */
        pr_warn("unknown allocator %s specified\n", str);

        return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);

/*
 * pcpu_embed_first_chunk() is used by the generic percpu setup.
 * Build it if needed by the arch config or the generic setup is going
 * to be used.
 */
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
        !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
#define BUILD_EMBED_FIRST_CHUNK
#endif

/* build pcpu_page_first_chunk() iff needed by the arch config */
#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
#define BUILD_PAGE_FIRST_CHUNK
#endif

/* pcpu_build_alloc_info() is used by both embed and page first chunk */
#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
/**
 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 *
 * This function determines grouping of units, their mappings to cpus
 * and other parameters considering needed percpu size, allocation
 * atom size and distances between CPUs.
 *
 * Groups are always multiples of atom size and CPUs which are of
 * LOCAL_DISTANCE both ways are grouped together and share space for
 * units in the same group.  The returned configuration is guaranteed
 * to have CPUs on different nodes on different groups and >=75% usage
 * of allocated virtual address space.
 *
 * RETURNS:
 * On success, pointer to the new allocation_info is returned.  On
 * failure, ERR_PTR value is returned.
 */
static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
                                size_t reserved_size, size_t dyn_size,
                                size_t atom_size,
                                pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
        /* scratch tables: group_map is indexed by cpu, group_cnt by group */
        static int group_map[NR_CPUS] __initdata;
        static int group_cnt[NR_CPUS] __initdata;
        static struct cpumask mask __initdata;
        const size_t static_size = __per_cpu_end - __per_cpu_start;
        int nr_groups = 1, nr_units = 0;
        size_t size_sum, min_unit_size, alloc_size;
        int upa, max_upa, best_upa;        /* units_per_alloc */
        int last_allocs, group, unit;
        unsigned int cpu, tcpu;
        struct pcpu_alloc_info *ai;
        unsigned int *cpu_map;

        /* this function may be called multiple times */
        memset(group_map, 0, sizeof(group_map));
        memset(group_cnt, 0, sizeof(group_cnt));
        cpumask_clear(&mask);

        /* calculate size_sum and ensure dyn_size is enough for early alloc */
        size_sum = PFN_ALIGN(static_size + reserved_size +
                            max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
        /* the dynamic area absorbs whatever the page alignment added */
        dyn_size = size_sum - static_size - reserved_size;

        /*
         * Determine min_unit_size, alloc_size and max_upa such that
         * alloc_size is multiple of atom_size and is the smallest
         * which can accommodate 4k aligned segments which are equal to
         * or larger than min_unit_size.
         */
        min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

        /* determine the maximum # of units that can fit in an allocation */
        alloc_size = roundup(min_unit_size, atom_size);
        upa = alloc_size / min_unit_size;
        /* shrink upa until alloc_size splits evenly into page-aligned units */
        while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
                upa--;
        max_upa = upa;

        cpumask_copy(&mask, cpu_possible_mask);

        /* group cpus according to their proximity */
        for (group = 0; !cpumask_empty(&mask); group++) {
                /* pop the group's first cpu */
                cpu = cpumask_first(&mask);
                group_map[cpu] = group;
                group_cnt[group]++;
                cpumask_clear_cpu(cpu, &mask);

                /*
                 * Pull every remaining cpu that is LOCAL_DISTANCE in both
                 * directions into this group; without a distance callback
                 * all cpus end up in a single group.
                 */
                for_each_cpu(tcpu, &mask) {
                        if (!cpu_distance_fn ||
                            (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE &&
                             cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) {
                                group_map[tcpu] = group;
                                group_cnt[group]++;
                                cpumask_clear_cpu(tcpu, &mask);
                        }
                }
        }
        nr_groups = group;

        /*
         * Wasted space is caused by a ratio imbalance of upa to group_cnt.
         * Expand the unit_size until we use >= 75% of the units allocated.
         * Related to atom_size, which could be much larger than the unit_size.
         */
        last_allocs = INT_MAX;
        best_upa = 0;
        for (upa = max_upa; upa; upa--) {
                int allocs = 0, wasted = 0;

                /* only consider upa values that yield page-aligned units */
                if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
                        continue;

                /* allocations needed and unit slots left empty at this upa */
                for (group = 0; group < nr_groups; group++) {
                        int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
                        allocs += this_allocs;
                        wasted += this_allocs * upa - group_cnt[group];
                }

                /*
                 * Don't accept if wastage is over 1/3.  The
                 * greater-than comparison ensures upa==1 always
                 * passes the following check.
                 */
                if (wasted > num_possible_cpus() / 3)
                        continue;

                /* and then don't consume more memory */
                if (allocs > last_allocs)
                        break;
                last_allocs = allocs;
                best_upa = upa;
        }
        BUG_ON(!best_upa);
        upa = best_upa;

        /* allocate and fill alloc_info */
        /* each group is padded up to a whole number of allocations */
        for (group = 0; group < nr_groups; group++)
                nr_units += roundup(group_cnt[group], upa);

        ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
        if (!ai)
                return ERR_PTR(-ENOMEM);
        cpu_map = ai->groups[0].cpu_map;

        /* carve the single cpu_map array into consecutive per-group slices */
        for (group = 0; group < nr_groups; group++) {
                ai->groups[group].cpu_map = cpu_map;
                cpu_map += roundup(group_cnt[group], upa);
        }

        ai->static_size = static_size;
        ai->reserved_size = reserved_size;
        ai->dyn_size = dyn_size;
        ai->unit_size = alloc_size / upa;
        ai->atom_size = atom_size;
        ai->alloc_size = alloc_size;

        for (group = 0, unit = 0; group < nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];

                /*
                 * Initialize base_offset as if all groups are located
                 * back-to-back.  The caller should update this to
                 * reflect actual allocation.
                 */
                gi->base_offset = unit * ai->unit_size;

                /*
                 * NOTE(review): relies on gi->nr_units starting at zero,
                 * i.e. on the ai memory being zeroed by the allocator -
                 * confirm.  Padding slots keep the NR_CPUS filler written
                 * by pcpu_alloc_alloc_info().
                 */
                for_each_possible_cpu(cpu)
                        if (group_map[cpu] == group)
                                gi->cpu_map[gi->nr_units++] = cpu;
                gi->nr_units = roundup(gi->nr_units, upa);
                unit += gi->nr_units;
        }
        BUG_ON(unit != nr_units);

        return ai;
}

/*
 * Allocate @size bytes aligned to @align from memblock for @cpu, with the
 * physical address of MAX_DMA_ADDRESS as the minimum-address goal.
 *
 * With CONFIG_NUMA the memory is requested on the node reported by
 * @cpu_to_nd_fn when that node is online and has node data; otherwise
 * (including when no callback is given) a node-agnostic allocation is made.
 *
 * Returns the allocated address as returned by memblock.
 */
static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align,
                                   pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
        const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NUMA
        int node = cpu_to_nd_fn ? cpu_to_nd_fn(cpu) : NUMA_NO_NODE;
        void *ptr;

        if (node != NUMA_NO_NODE && node_online(node) && NODE_DATA(node)) {
                /* usable node: allocate node-local memory */
                ptr = memblock_alloc_try_nid(size, align, goal,
                                             MEMBLOCK_ALLOC_ACCESSIBLE,
                                             node);

                pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n",
                         cpu, size, node, (u64)__pa(ptr));
                return ptr;
        }

        /* no usable node for this cpu: take memory from anywhere */
        ptr = memblock_alloc_from(size, align, goal);
        pr_info("cpu %d has no node %d or node-local memory\n",
                cpu, node);
        pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n",
                 cpu, size, (u64)__pa(ptr));
        return ptr;
#else
        return memblock_alloc_from(size, align, goal);
#endif
}

/* Hand @size bytes starting at @ptr back to memblock. */
static void __init pcpu_fc_free(void *ptr, size_t size)
{
        memblock_free(ptr, size);
}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */

#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @cpu_to_nd_fn: callback to convert cpu to its node, optional
 *
 * This is a helper to ease setting up embedded first percpu chunk and
 * can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to setup the first chunk, it is allocated
 * by calling pcpu_fc_alloc and used as-is without being mapped into
 * vmalloc area.  Allocations are always whole multiples of @atom_size
 * aligned to @atom_size.
 *
 * This enables the first chunk to piggy back on the linear physical
 * mapping which often uses larger page size.  Please note that this
 * can result in very sparse cpu->unit mapping on NUMA machines thus
 * requiring large vmalloc address space.  Don't use this allocator if
 * vmalloc space is not orders of magnitude larger than distances
 * between node memory addresses (ie. 32bit NUMA machines).
 *
 * @dyn_size specifies the minimum dynamic area size.
 *
 * If the needed size is smaller than the minimum or specified unit
 * size, the leftover is returned using pcpu_fc_free.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
                                  size_t atom_size,
                                  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
                                  pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
        /* start at the highest address so min() below finds the lowest area */
        void *base = (void *)ULONG_MAX;
        void **areas = NULL;
        struct pcpu_alloc_info *ai;
        size_t size_sum, areas_size;
        unsigned long max_distance;
        int group, i, highest_group, rc = 0;

        ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
                                   cpu_distance_fn);
        if (IS_ERR(ai))
                return PTR_ERR(ai);

        /* bytes of each unit that are actually used */
        size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
        /* one base pointer per group */
        areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

        areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
        if (!areas) {
                rc = -ENOMEM;
                goto out_free;
        }

        /* allocate, copy and determine base address & max_distance */
        highest_group = 0;
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                unsigned int cpu = NR_CPUS;
                void *ptr;

                /* the group's first mapped cpu is the one passed to the allocator */
                for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
                        cpu = gi->cpu_map[i];
                BUG_ON(cpu == NR_CPUS);

                /* allocate space for the whole group */
                ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn);
                if (!ptr) {
                        rc = -ENOMEM;
                        goto out_free_areas;
                }
                /* kmemleak tracks the percpu allocations separately */
                kmemleak_ignore_phys(__pa(ptr));
                areas[group] = ptr;

                /* track the lowest area (base) and the group with the highest one */
                base = min(ptr, base);
                if (ptr > areas[highest_group])
                        highest_group = group;
        }
        /* distance from the lowest area to the end of the highest group */
        max_distance = areas[highest_group] - base;
        max_distance += ai->unit_size * ai->groups[highest_group].nr_units;

        /* warn if maximum distance is further than 75% of vmalloc space */
        if (max_distance > VMALLOC_TOTAL * 3 / 4) {
                pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
                                max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
                /* and fail if we have fallback */
                rc = -EINVAL;
                goto out_free_areas;
#endif
        }

        /*
         * Copy data and free unused parts.  This should happen after all
         * allocations are complete; otherwise, we may end up with
         * overlapping groups.
         */
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                void *ptr = areas[group];

                for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
                        if (gi->cpu_map[i] == NR_CPUS) {
                                /* unused unit, free whole */
                                pcpu_fc_free(ptr, ai->unit_size);
                                continue;
                        }
                        /* copy and return the unused part */
                        memcpy(ptr, __per_cpu_load, ai->static_size);
                        pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
                }
        }

        /* base address is now known, determine group base offsets */
        for (group = 0; group < ai->nr_groups; group++) {
                ai->groups[group].base_offset = areas[group] - base;
        }

        pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
                PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
                ai->dyn_size, ai->unit_size);

        pcpu_setup_first_chunk(ai, base);
        /* success: rc is still 0, skip the area teardown and free only ai and areas */
        goto out_free;

out_free_areas:
        /*
         * NOTE(review): groups that were never allocated are skipped by the
         * NULL test, which presumably relies on memblock_alloc() returning
         * zeroed memory for areas[] - confirm.
         */
        for (group = 0; group < ai->nr_groups; group++)
                if (areas[group])
                        pcpu_fc_free(areas[group],
                                ai->groups[group].nr_units * ai->unit_size);
out_free:
        pcpu_free_alloc_info(ai);
        if (areas)
                memblock_free(areas, areas_size);
        return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */

#ifdef BUILD_PAGE_FIRST_CHUNK
#include <asm/pgalloc.h>

/*
 * Fallback sizes for the page tables allocated by pcpu_populate_pte().
 * Each level defaults to PAGE_SIZE unless the macro has already been
 * defined; the value is passed as both arguments to memblock_alloc().
 */
#ifndef P4D_TABLE_SIZE
#define P4D_TABLE_SIZE PAGE_SIZE
#endif

#ifndef PUD_TABLE_SIZE
#define PUD_TABLE_SIZE PAGE_SIZE
#endif

#ifndef PMD_TABLE_SIZE
#define PMD_TABLE_SIZE PAGE_SIZE
#endif

#ifndef PTE_TABLE_SIZE
#define PTE_TABLE_SIZE PAGE_SIZE
#endif
/*
 * pcpu_populate_pte - make sure page tables exist down to the PTE level
 * @addr: kernel virtual address to cover
 *
 * Walks the kernel page tables for @addr starting at pgd_offset_k() and,
 * for every level found empty (or, for the PMD entry, not present),
 * allocates a table with memblock_alloc() and hooks it in.  Any failed
 * allocation ends in panic().
 */
void __init __weak pcpu_populate_pte(unsigned long addr)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd)) {
                p4d_t *p4d_table = memblock_alloc(P4D_TABLE_SIZE,
                                                  P4D_TABLE_SIZE);

                if (!p4d_table)
                        goto nomem;
                pgd_populate(&init_mm, pgd, p4d_table);
        }

        p4d = p4d_offset(pgd, addr);
        if (p4d_none(*p4d)) {
                pud_t *pud_table = memblock_alloc(PUD_TABLE_SIZE,
                                                  PUD_TABLE_SIZE);

                if (!pud_table)
                        goto nomem;
                p4d_populate(&init_mm, p4d, pud_table);
        }

        pud = pud_offset(p4d, addr);
        if (pud_none(*pud)) {
                pmd_t *pmd_table = memblock_alloc(PMD_TABLE_SIZE,
                                                  PMD_TABLE_SIZE);

                if (!pmd_table)
                        goto nomem;
                pud_populate(&init_mm, pud, pmd_table);
        }

        pmd = pmd_offset(pud, addr);
        if (!pmd_present(*pmd)) {
                pte_t *pte_table = memblock_alloc(PTE_TABLE_SIZE,
                                                  PTE_TABLE_SIZE);

                if (!pte_table)
                        goto nomem;
                pmd_populate_kernel(&init_mm, pmd, pte_table);
        }

        return;

nomem:
        /* Shared failure exit for every level of the walk. */
        panic("%s: Failed to allocate memory\n", __func__);
}

/**
 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
 * @reserved_size: the size of reserved percpu area in bytes
 * @cpu_to_nd_fn: callback to convert cpu to its node, optional
 *
 * This is a helper to ease setting up page-remapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  Static percpu area is allocated
 * page-by-page into vmalloc area.
 *
 * Failing to allocate the page-pointer array, or failing to map the
 * pages, panics instead of returning an error.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
        static struct vm_struct vm;
        struct pcpu_alloc_info *ai;
        char psize_str[16];
        int unit_pages;
        size_t pages_size;
        struct page **pages;
        int unit, i, j, rc = 0;
        int upa;
        int nr_g0_units;

        /* page size in KiB as text, used only in the log messages below */
        snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

        ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
        if (IS_ERR(ai))
                return PTR_ERR(ai);
        /* this allocator only handles a single group */
        BUG_ON(ai->nr_groups != 1);
        /* units per allocation; group 0 must hold the cpu count rounded up to it */
        upa = ai->alloc_size/ai->unit_size;
        nr_g0_units = roundup(num_possible_cpus(), upa);
        if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
                pcpu_free_alloc_info(ai);
                return -EINVAL;
        }

        unit_pages = ai->unit_size >> PAGE_SHIFT;

        /* unaligned allocations can't be freed, round up to page size */
        pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
                               sizeof(pages[0]));
        pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
        if (!pages)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      pages_size);

        /* allocate pages */
        /* j counts the pages stored so far; the enomem path unwinds from it */
        j = 0;
        for (unit = 0; unit < num_possible_cpus(); unit++) {
                unsigned int cpu = ai->groups[0].cpu_map[unit];
                for (i = 0; i < unit_pages; i++) {
                        void *ptr;

                        ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn);
                        if (!ptr) {
                                pr_warn("failed to allocate %s page for cpu%u\n",
                                                psize_str, cpu);
                                goto enomem;
                        }
                        /* kmemleak tracks the percpu allocations separately */
                        kmemleak_ignore_phys(__pa(ptr));
                        pages[j++] = virt_to_page(ptr);
                }
        }

        /* allocate vm area, map the pages and copy static data */
        vm.flags = VM_ALLOC;
        vm.size = num_possible_cpus() * ai->unit_size;
        vm_area_register_early(&vm, PAGE_SIZE);

        for (unit = 0; unit < num_possible_cpus(); unit++) {
                unsigned long unit_addr =
                        (unsigned long)vm.addr + unit * ai->unit_size;

                /* make sure page tables exist for every page of this unit */
                for (i = 0; i < unit_pages; i++)
                        pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT));

                /* pte already populated, the following shouldn't fail */
                rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
                                      unit_pages);
                if (rc < 0)
                        panic("failed to map percpu area, err=%d\n", rc);

                flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size);

                /* copy static data */
                memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
        }

        /* we're ready, commit */
        pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
                unit_pages, psize_str, ai->static_size,
                ai->reserved_size, ai->dyn_size);

        pcpu_setup_first_chunk(ai, vm.addr);
        /* success: skip the page unwind but still release the temporaries */
        goto out_free_ar;

enomem:
        /* release the pages obtained so far, newest first */
        while (--j >= 0)
                pcpu_fc_free(page_address(pages[j]), PAGE_SIZE);
        rc = -ENOMEM;
out_free_ar:
        /* common exit: the page-pointer array and alloc info are always freed */
        memblock_free(pages, pages_size);
        pcpu_free_alloc_info(ai);
        return rc;
}
#endif /* BUILD_PAGE_FIRST_CHUNK */

#ifndef        CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
 * Generic SMP percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because percpu area can piggy back
 * on the physical linear memory mapping which uses large page
 * mappings on applicable archs.
 */
/* Per-cpu offset table; setup_per_cpu_areas() fills one slot per possible cpu. */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

void __init setup_per_cpu_areas(void)
{
        unsigned long base_shift;
        unsigned int cpu;

        /*
         * The legacy allocator always set aside room for module percpu
         * variables, so keep reserving PERCPU_MODULE_RESERVE here.  A
         * negative return from the embedding helper is fatal.
         */
        if (pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
                                   PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
                                   NULL, NULL) < 0)
                panic("Failed to initialize percpu areas.");

        /* Distance from the linked percpu section to the chunk base. */
        base_shift = (unsigned long)pcpu_base_addr -
                     (unsigned long)__per_cpu_start;

        for_each_possible_cpu(cpu) {
                __per_cpu_offset[cpu] = base_shift + pcpu_unit_offsets[cpu];
        }
}
#endif        /* CONFIG_HAVE_SETUP_PER_CPU_AREA */

#else        /* CONFIG_SMP */

/*
 * UP percpu area setup.
 *
 * UP always uses km-based percpu allocator with identity mapping.
 * Static percpu variables are indistinguishable from the usual static
 * variables and don't require any special preparation.
 */
void __init setup_per_cpu_areas(void)
{
        /* One unit: the larger of the two sizes, rounded up to a power of two. */
        const size_t unit_size =
                roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
                                         PERCPU_DYNAMIC_RESERVE));
        struct pcpu_alloc_info *info = pcpu_alloc_alloc_info(1, 1);
        void *chunk = memblock_alloc_from(unit_size, PAGE_SIZE,
                                          __pa(MAX_DMA_ADDRESS));

        /* Both allocations are attempted first; either one failing is fatal. */
        if (info == NULL || chunk == NULL)
                panic("Failed to allocate memory for percpu areas.");

        /* kmemleak tracks the percpu allocations separately */
        kmemleak_ignore_phys(__pa(chunk));

        /* Single group, single unit, mapped to cpu 0; every size is unit_size. */
        info->groups[0].cpu_map[0] = 0;
        info->groups[0].nr_units = 1;
        info->alloc_size = unit_size;
        info->atom_size = unit_size;
        info->unit_size = unit_size;
        info->dyn_size = unit_size;

        pcpu_setup_first_chunk(info, chunk);
        pcpu_free_alloc_info(info);
}

#endif        /* CONFIG_SMP */

/*
 * pcpu_nr_pages - calculate total number of populated backing pages
 *
 * This reflects the number of pages populated to back chunks.  Metadata is
 * excluded in the number exposed in meminfo as the number of backing pages
 * scales with the number of cpus and can quickly outweigh the memory used for
 * metadata.  It also keeps this calculation nice and simple.
 *
 * RETURNS:
 * Total number of populated backing pages in use by the allocator.
 */
unsigned long pcpu_nr_pages(void)
{
        /*
         * NOTE(review): presumably pcpu_nr_populated counts populated pages
         * per unit, so scaling by pcpu_nr_units gives the total across all
         * units -- confirm against the definitions of both variables.
         */
        return pcpu_nr_populated * pcpu_nr_units;
}

/*
 * Percpu allocator is initialized early during boot when neither slab nor
 * workqueue is available.  Plug async management until everything is up
 * and running.
 *
 * Registered through subsys_initcall() below; it only sets
 * pcpu_async_enabled and always returns 0.
 */
static int __init percpu_enable_async(void)
{
        pcpu_async_enabled = true;
        return 0;
}
subsys_initcall(percpu_enable_async);





































    4 


















    3 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PKRU_H
#define _ASM_X86_PKRU_H

#include <asm/cpufeature.h>

/*
 * Each protection key owns PKRU_BITS_PER_PKEY bits of the PKRU value.
 * PKRU_AD_BIT is the access-disable bit; PKRU_WD_BIT is the extra bit
 * consulted for writes (presumably "write-disable").
 */
#define PKRU_AD_BIT 0x1u
#define PKRU_WD_BIT 0x2u
#define PKRU_BITS_PER_PKEY 2

/*
 * With protection keys configured, the initial PKRU value lives in
 * init_pkru_value and is read with READ_ONCE(); otherwise both the
 * variable name and the accessor collapse to the constant 0.
 */
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
extern u32 init_pkru_value;
#define pkru_get_init_value()        READ_ONCE(init_pkru_value)
#else
#define init_pkru_value        0
#define pkru_get_init_value()        0
#endif

/* True when the access-disable bit belonging to @pkey is clear in @pkru. */
static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
{
        /* Bit offset of this key's field inside the PKRU value. */
        int shift = pkey * PKRU_BITS_PER_PKEY;
        u32 ad_mask = PKRU_AD_BIT << shift;

        return (pkru & ad_mask) == 0;
}

/*
 * True when neither the access-disable nor the write-disable bit of @pkey
 * is set in @pkru.  Access-disable blocks writes as well, hence both bits.
 */
static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
{
        /* Bit offset of this key's field inside the PKRU value. */
        int shift = pkey * PKRU_BITS_PER_PKEY;
        u32 deny_mask = (PKRU_AD_BIT|PKRU_WD_BIT) << shift;

        return (pkru & deny_mask) == 0;
}

/* Return rdpkru() when X86_FEATURE_OSPKE is enabled, 0 otherwise. */
static inline u32 read_pkru(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
                return 0;

        return rdpkru();
}

/*
 * Write @pkru with wrpkru(), but only when X86_FEATURE_OSPKE is enabled
 * and the current rdpkru() value differs.
 */
static inline void write_pkru(u32 pkru)
{
        if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                /*
                 * WRPKRU is relatively expensive compared to RDPKRU, so
                 * skip it when the value would not change.
                 */
                if (pkru != rdpkru())
                        wrpkru(pkru);
        }
}

/*
 * Load pkru_get_init_value() with wrpkru() when X86_FEATURE_OSPKE is
 * enabled; unlike write_pkru() the current value is not compared first.
 */
static inline void pkru_write_default(void)
{
        if (cpu_feature_enabled(X86_FEATURE_OSPKE))
                wrpkru(pkru_get_init_value());
}

#endif































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PERCPU_COUNTER_H
#define _LINUX_PERCPU_COUNTER_H
/*
 * A simple "approximate counter" for use in ext2 and ext3 superblocks.
 *
 * WARNING: these things are HUGE.  4 kbytes per counter on 32-way P4.
 */

#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/list.h>
#include <linux/threads.h>
#include <linux/percpu.h>
#include <linux/types.h>

/* percpu_counter batch for local add or sub */
#define PERCPU_COUNTER_LOCAL_BATCH        INT_MAX

#ifdef CONFIG_SMP

/* SMP counter: a shared s64 total plus a per-cpu array of s32 values. */
struct percpu_counter {
        raw_spinlock_t lock;        /* NOTE(review): presumably guards count -- confirm in the .c file */
        s64 count;                /* value returned by percpu_counter_read() */
#ifdef CONFIG_HOTPLUG_CPU
        struct list_head list;        /* All percpu_counters are on a list */
#endif
        s32 __percpu *counters;        /* NULL means "not initialized", see percpu_counter_initialized() */
};

extern int percpu_counter_batch;

int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
                               gfp_t gfp, u32 nr_counters,
                               struct lock_class_key *key);

/*
 * percpu_counter_init_many() forwards to __percpu_counter_init_many() and
 * supplies a static lock_class_key declared inside the expansion, so each
 * place the macro is used gets its own key.  The statement expression
 * evaluates to the int returned by __percpu_counter_init_many().
 */
#define percpu_counter_init_many(fbc, value, gfp, nr_counters)                \
        ({                                                                \
                static struct lock_class_key __key;                        \
                                                                        \
                __percpu_counter_init_many(fbc, value, gfp, nr_counters,\
                                           &__key);                        \
        })


/* Single-counter form: percpu_counter_init_many() with nr_counters == 1. */
#define percpu_counter_init(fbc, value, gfp)                                \
        percpu_counter_init_many(fbc, value, gfp, 1)

void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters);
/* Destroy a single counter: percpu_counter_destroy_many() with a count of 1. */
static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
        percpu_counter_destroy_many(fbc, 1);
}

void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
                              s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit,
                                  s64 amount, s32 batch);
void percpu_counter_sync(struct percpu_counter *fbc);

/* Compare @fbc against @rhs using the global percpu_counter_batch as batch. */
static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
        return __percpu_counter_compare(fbc, rhs, percpu_counter_batch);
}

/* Add @amount to @fbc using the global percpu_counter_batch as batch. */
static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
        percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
}

/*
 * Forward to __percpu_counter_limited_add() with the global
 * percpu_counter_batch as batch and return its result.
 */
static inline bool
percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
{
        return __percpu_counter_limited_add(fbc, limit, amount,
                                            percpu_counter_batch);
}

/*
 * With percpu_counter_add_local() and percpu_counter_sub_local(), counts
 * are accumulated in the local per-cpu counter and not in fbc->count until
 * the local count overflows PERCPU_COUNTER_LOCAL_BATCH. This makes counter
 * writes efficient.
 * However, percpu_counter_sum(), rather than percpu_counter_read(), must be
 * used to add up the counts from each CPU to account for all the local
 * counts. So percpu_counter_add_local() and percpu_counter_sub_local()
 * should be used when a counter is updated frequently and read rarely.
 */
static inline void
percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
{
        /* same entry point as percpu_counter_add(), only the batch differs */
        percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH);
}

/* Return __percpu_counter_sum() of @fbc, with a negative sum reported as 0. */
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
        s64 total = __percpu_counter_sum(fbc);

        if (total < 0)
                total = 0;

        return total;
}

/* Return __percpu_counter_sum() of @fbc unchanged (it may be negative). */
static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
        return __percpu_counter_sum(fbc);
}

/*
 * Return fbc->count directly, without calling __percpu_counter_sum(); see
 * percpu_counter_sum() for the summed value.
 */
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
        return fbc->count;
}

/*
 * percpu_counter_read() can report a small negative number even for a
 * counter that should never be negative.  This variant reports such a
 * reading as zero.
 */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
        /* One READ_ONCE() snapshot, so fbc->count is not reloaded. */
        const s64 snapshot = READ_ONCE(fbc->count);

        return snapshot < 0 ? 0 : snapshot;
}

/* A counter counts as initialized once its per-cpu array pointer is non-NULL. */
static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
{
        if (fbc->counters == NULL)
                return false;

        return true;
}

#else /* !CONFIG_SMP */

/* !CONFIG_SMP counter: just the plain s64 value, no lock or per-cpu array. */
struct percpu_counter {
        s64 count;
};

/*
 * Set the first @nr_counters entries of the @fbc array to @amount.
 * @gfp is unused here.  Always returns 0.
 */
static inline int percpu_counter_init_many(struct percpu_counter *fbc,
                                           s64 amount, gfp_t gfp,
                                           u32 nr_counters)
{
        struct percpu_counter *cur;

        for (cur = fbc; cur < fbc + nr_counters; cur++)
                cur->count = amount;

        return 0;
}

/* Single-counter form: percpu_counter_init_many() with nr_counters == 1. */
static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount,
                                      gfp_t gfp)
{
        return percpu_counter_init_many(fbc, amount, gfp, 1);
}

/* Empty in the !CONFIG_SMP build: this version does nothing. */
static inline void percpu_counter_destroy_many(struct percpu_counter *fbc,
                                               u32 nr_counters)
{
}

/* Empty in the !CONFIG_SMP build: this version does nothing. */
static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
}

/* Overwrite the counter value with @amount (plain store, IRQs not masked). */
static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
        fbc->count = amount;
}

/*
 * Three-way compare of the counter value against @rhs:
 * 1 if greater, -1 if smaller, 0 if equal.
 */
static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
        if (fbc->count > rhs)
                return 1;

        return fbc->count < rhs ? -1 : 0;
}

/* @batch is ignored in this build; identical to percpu_counter_compare(). */
static inline int
__percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
        return percpu_counter_compare(fbc, rhs);
}

/* Add @amount to the counter with local interrupts disabled around the update. */
static inline void
percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
        unsigned long flags;

        /* the read-modify-write of count runs with local IRQs off */
        local_irq_save(flags);
        fbc->count += amount;
        local_irq_restore(flags);
}

/*
 * Add @amount only if the result stays within @limit: at or below it for a
 * positive @amount, at or above it for a negative one.  Returns true when
 * the add was applied (or @amount is 0), false when it was refused.
 */
static inline bool
percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
{
        unsigned long flags;
        bool applied = false;
        s64 next;

        /* Nothing to add: succeed without touching the counter or IRQ state. */
        if (!amount)
                return true;

        local_irq_save(flags);
        next = fbc->count + amount;
        /* @amount is non-zero here, so the two arms cover both signs. */
        if (amount > 0 ? next <= limit : next >= limit) {
                fbc->count = next;
                applied = true;
        }
        local_irq_restore(flags);

        return applied;
}

/* non-SMP percpu_counter_add_local is the same as percpu_counter_add */
static inline void
percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
{
        percpu_counter_add(fbc, amount);
}

/* @batch is ignored in this build; identical to percpu_counter_add(). */
static inline void
percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
        percpu_counter_add(fbc, amount);
}

/* Return the counter value as stored in fbc->count. */
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
        return fbc->count;
}

/*
 * percpu_counter is intended to track positive numbers. In the UP case the
 * number should never be negative, so the value is returned as is; no
 * clamping to zero is performed here.
 */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
        return fbc->count;
}

/* In this build the sum is just percpu_counter_read_positive(). */
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
        return percpu_counter_read_positive(fbc);
}

/* In this build the sum is just percpu_counter_read(). */
static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
        return percpu_counter_read(fbc);
}

/* Always true in this build; @fbc is not examined. */
static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
{
        return true;
}

/* Empty in the !CONFIG_SMP build: this version does nothing. */
static inline void percpu_counter_sync(struct percpu_counter *fbc)
{
}
#endif        /* CONFIG_SMP */

/* Increment @fbc by one via percpu_counter_add(). */
static inline void percpu_counter_inc(struct percpu_counter *fbc)
{
        percpu_counter_add(fbc, 1);
}

/* Decrement @fbc by one via percpu_counter_add(). */
static inline void percpu_counter_dec(struct percpu_counter *fbc)
{
        percpu_counter_add(fbc, -1);
}

/* Subtract @amount from @fbc by adding its negation with percpu_counter_add(). */
static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount)
{
        percpu_counter_add(fbc, -amount);
}

/* Subtract @amount by adding its negation with percpu_counter_add_local(). */
static inline void
percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount)
{
        percpu_counter_add_local(fbc, -amount);
}

#endif /* _LINUX_PERCPU_COUNTER_H */

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



    1 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
/*
 * Copyright 2002-2005, Instant802 Networks, Inc.
 * Copyright 2005-2006, Devicescape Software, Inc.
 * Copyright 2007        Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2008-2011        Luis R. Rodriguez <mcgrof@qca.qualcomm.com>
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright      2017  Intel Deutschland GmbH
 * Copyright (C) 2018 - 2024 Intel Corporation
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */


/**
 * DOC: Wireless regulatory infrastructure
 *
 * The usual implementation is for a driver to read a device EEPROM to
 * determine which regulatory domain it should be operating under, then
 * looking up the allowable channels in a driver-local table and finally
 * registering those channels in the wiphy structure.
 *
 * Another set of compliance enforcement is for drivers to use their
 * own compliance limits which can be stored on the EEPROM. The host
 * driver or firmware may ensure these are used.
 *
 * In addition to all this we provide an extra layer of regulatory
 * conformance. For drivers which do not have any regulatory
 * information CRDA provides the complete regulatory solution.
 * For others it provides a community effort on further restrictions
 * to enhance compliance.
 *
 * Note: When number of rules --> infinity we will not be able to
 * index on alpha2 any more, instead we'll probably have to
 * rely on some SHA1 checksum of the regdomain for example.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/ctype.h>
#include <linux/nl80211.h>
#include <linux/platform_device.h>
#include <linux/verification.h>
#include <linux/moduleparam.h>
#include <linux/firmware.h>
#include <linux/units.h>

#include <net/cfg80211.h>
#include "core.h"
#include "reg.h"
#include "rdev-ops.h"
#include "nl80211.h"

/*
 * Grace period we give before making sure all current interfaces reside on
 * channels allowed by the current regulatory domain.
 */
#define REG_ENFORCE_GRACE_MS 60000

/**
 * enum reg_request_treatment - how a regulatory request is to be treated
 *
 * @REG_REQ_OK: continue processing the regulatory request
 * @REG_REQ_IGNORE: ignore the regulatory request
 * @REG_REQ_INTERSECT: the regulatory domain resulting from this request should
 *        be intersected with the current one.
 * @REG_REQ_ALREADY_SET: the regulatory request will not change the current
 *        regulatory settings, and no further processing is required.
 */
enum reg_request_treatment {
        REG_REQ_OK,
        REG_REQ_IGNORE,
        REG_REQ_INTERSECT,
        REG_REQ_ALREADY_SET,
};

/*
 * Statically allocated request for the world regulatory domain (alpha2 "00"),
 * marked as initiated by the core and already processed. last_request starts
 * out pointing at it, and reset_regdomains() re-installs it on a full reset.
 * Being static it must never be freed: reg_free_request() and
 * reg_free_last_request() both skip it explicitly.
 */
static struct regulatory_request core_request_world = {
        .initiator = NL80211_REGDOM_SET_BY_CORE,
        .alpha2[0] = '0',
        .alpha2[1] = '0',
        .intersect = false,
        .processed = true,
        .country_ie_env = ENVIRON_ANY,
};

/*
 * Receipt of information from last regulatory request,
 * protected by RTNL (and can be accessed with RCU protection).
 * Initially points at the static core_request_world; it is read through
 * get_last_request() and replaced through reg_update_last_request().
 */
static struct regulatory_request __rcu *last_request =
        (void __force __rcu *)&core_request_world;

/* To trigger userspace events and load firmware */
static struct platform_device *reg_pdev;

/*
 * Central wireless core regulatory domains, we only need two,
 * the current one and a world regulatory domain in case we have no
 * information to give us an alpha2.
 * (protected by RTNL, can be read under RCU)
 */
const struct ieee80211_regdomain __rcu *cfg80211_regdomain;

/*
 * Number of devices that registered to the core
 * that support cellular base station regulatory hints
 * (protected by RTNL)
 */
static int reg_num_devs_support_basehint;

/*
 * State variable indicating if the platform on which the devices
 * are attached is operating in an indoor environment. The state variable
 * is relevant for all registered devices.
 *
 * NOTE(review): reg_indoor_lock presumably serializes access to
 * reg_is_indoor; its users are not part of this excerpt - confirm there.
 */
static bool reg_is_indoor;
static DEFINE_SPINLOCK(reg_indoor_lock);

/* Used to track the userspace process controlling the indoor setting */
static u32 reg_is_indoor_portid;

/* Forward declarations of static helpers defined further down in this file */
static void restore_regulatory_settings(bool reset_user, bool cached);
static void print_regdomain(const struct ieee80211_regdomain *rd);
static void reg_process_hint(struct regulatory_request *reg_request);

/*
 * Fetch the central regulatory domain pointer through
 * rcu_dereference_rtnl(), matching the "protected by RTNL, can be read
 * under RCU" rule documented on cfg80211_regdomain.
 */
static const struct ieee80211_regdomain *get_cfg80211_regdom(void)
{
        const struct ieee80211_regdomain *current_rd;

        current_rd = rcu_dereference_rtnl(cfg80211_regdomain);

        return current_rd;
}

/*
 * Look up the regulatory domain attached to @wiphy.
 *
 * The caller must hold the wiphy mutex or RTNL, or be inside an RCU
 * read-side section; the first two are what the lockdep condition below
 * verifies.
 */
const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy)
{
        const struct ieee80211_regdomain *wiphy_rd;

        wiphy_rd = rcu_dereference_check(wiphy->regd,
                                         lockdep_is_held(&wiphy->mtx) ||
                                         lockdep_rtnl_is_held());

        return wiphy_rd;
}
EXPORT_SYMBOL(get_wiphy_regdom);

/*
 * Map a DFS region to a short printable name. Any value that is not one of
 * the four handled enumerators yields "Unknown".
 */
static const char *reg_dfs_region_str(enum nl80211_dfs_regions dfs_region)
{
        const char *name = "Unknown";

        switch (dfs_region) {
        case NL80211_DFS_UNSET:
                name = "unset";
                break;
        case NL80211_DFS_FCC:
                name = "FCC";
                break;
        case NL80211_DFS_ETSI:
                name = "ETSI";
                break;
        case NL80211_DFS_JP:
                name = "JP";
                break;
        }

        return name;
}

enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy)
{
        const struct ieee80211_regdomain *regd = NULL;
        const struct ieee80211_regdomain *wiphy_regd = NULL;
        enum nl80211_dfs_regions dfs_region;

        rcu_read_lock();
        regd = get_cfg80211_regdom();
        dfs_region = regd->dfs_region;

        if (!wiphy)
                goto out;

        wiphy_regd = get_wiphy_regdom(wiphy);
        if (!wiphy_regd)
                goto out;

        if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) {
                dfs_region = wiphy_regd->dfs_region;
                goto out;
        }

        if (wiphy_regd->dfs_region == regd->dfs_region)
                goto out;

        pr_debug("%s: device specific dfs_region (%s) disagrees with cfg80211's central dfs_region (%s)\n",
                 dev_name(&wiphy->dev),
                 reg_dfs_region_str(wiphy_regd->dfs_region),
                 reg_dfs_region_str(regd->dfs_region));

out:
        rcu_read_unlock();

        return dfs_region;
}

/*
 * Hand a regulatory domain to kfree_rcu() using its embedded rcu_head.
 * A NULL pointer is accepted and ignored. The const qualifier is cast away
 * because the object is being released.
 */
static void rcu_free_regdom(const struct ieee80211_regdomain *r)
{
        struct ieee80211_regdomain *victim = (struct ieee80211_regdomain *)r;

        if (victim)
                kfree_rcu(victim, rcu_head);
}

/*
 * Fetch the most recent regulatory request through rcu_dereference_rtnl(),
 * matching the "protected by RTNL, accessible under RCU" rule documented on
 * last_request.
 */
static struct regulatory_request *get_last_request(void)
{
        struct regulatory_request *latest;

        latest = rcu_dereference_rtnl(last_request);

        return latest;
}

/* Used to queue up regulatory hints */
static LIST_HEAD(reg_requests_list);
static DEFINE_SPINLOCK(reg_requests_lock);

/* Used to queue up beacon hints for review */
static LIST_HEAD(reg_pending_beacons);
static DEFINE_SPINLOCK(reg_pending_beacons_lock);

/* Used to keep track of processed beacon hints */
static LIST_HEAD(reg_beacon_list);

/*
 * List node that carries one channel description.
 * NOTE(review): presumably the element type of the two beacon lists above;
 * the code that fills them is outside this excerpt - confirm there.
 */
struct reg_beacon {
        struct list_head list;
        struct ieee80211_channel chan;
};

/* Delayed work item "reg_check_chans", handled by reg_check_chans_work() */
static void reg_check_chans_work(struct work_struct *work);
static DECLARE_DELAYED_WORK(reg_check_chans, reg_check_chans_work);

/* Work item "reg_work", handled by reg_todo() */
static void reg_todo(struct work_struct *work);
static DECLARE_WORK(reg_work, reg_todo);

/*
 * We keep a static world regulatory domain in case of the absence of CRDA.
 *
 * .n_reg_rules must equal the number of REG_RULE() entries below (currently
 * eight). Frequencies are written as "channel centre -/+ margin" so that the
 * covered channels can be read directly off each rule.
 * NOTE(review): the REG_RULE() argument order is taken to be (start, end,
 * max bandwidth, antenna gain, EIRP, flags) - confirm against the macro
 * definition in the regulatory header.
 */
static const struct ieee80211_regdomain world_regdom = {
        .n_reg_rules = 8,
        .alpha2 =  "00",
        .reg_rules = {
                /* IEEE 802.11b/g, channels 1..11 */
                REG_RULE(2412-10, 2462+10, 40, 6, 20, 0),
                /* IEEE 802.11b/g, channels 12..13. */
                REG_RULE(2467-10, 2472+10, 20, 6, 20,
                        NL80211_RRF_NO_IR | NL80211_RRF_AUTO_BW),
                /* IEEE 802.11 channel 14 - Only JP enables
                 * this and for 802.11b only */
                REG_RULE(2484-10, 2484+10, 20, 6, 20,
                        NL80211_RRF_NO_IR |
                        NL80211_RRF_NO_OFDM),
                /* IEEE 802.11a, channel 36..48 */
                REG_RULE(5180-10, 5240+10, 80, 6, 20,
                        NL80211_RRF_NO_IR |
                        NL80211_RRF_AUTO_BW),

                /* IEEE 802.11a, channel 52..64 - DFS required */
                REG_RULE(5260-10, 5320+10, 80, 6, 20,
                        NL80211_RRF_NO_IR |
                        NL80211_RRF_AUTO_BW |
                        NL80211_RRF_DFS),

                /* IEEE 802.11a, channel 100..144 - DFS required */
                REG_RULE(5500-10, 5720+10, 160, 6, 20,
                        NL80211_RRF_NO_IR |
                        NL80211_RRF_DFS),

                /* IEEE 802.11a, channel 149..165 */
                REG_RULE(5745-10, 5825+10, 80, 6, 20,
                        NL80211_RRF_NO_IR),

                /* IEEE 802.11ad (60GHz), channels 1..3 */
                REG_RULE(56160+2160*1-1080, 56160+2160*3+1080, 2160, 0, 0, 0),
        }
};

/*
 * Currently active world regulatory domain; protected by RTNL.
 * Defaults to the static world_regdom table and is pointed back at it by
 * reset_regdomains(); update_world_regdomain() installs a dynamic one.
 */
static const struct ieee80211_regdomain *cfg80211_world_regdom =
        &world_regdom;

/* Value of the read-only (0444) "ieee80211_regdom" module parameter */
static char *ieee80211_regdom = "00";
/*
 * Cached alpha2 of the last user hint: exactly two characters, not
 * NUL-terminated. is_user_regdom_saved() treats the value "97" as
 * "nothing saved".
 */
static char user_alpha2[2];
/*
 * Copy made by cfg80211_save_user_regdom(); it holds an ERR_PTR() when
 * that copy failed, so check with IS_ERR() before use.
 */
static const struct ieee80211_regdomain *cfg80211_user_regdom;

module_param(ieee80211_regdom, charp, 0444);
MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code");

/*
 * Release @request with kfree() unless it is still referenced elsewhere:
 * the static core_request_world is never freed, and neither is the request
 * currently installed as last_request.
 */
static void reg_free_request(struct regulatory_request *request)
{
        if (request == &core_request_world ||
            request == get_last_request())
                return;

        kfree(request);
}

/*
 * Release the request currently installed as last_request via kfree_rcu().
 * Nothing happens when there is no such request or when it is the static
 * core_request_world. The last_request pointer itself is left untouched.
 */
static void reg_free_last_request(void)
{
        struct regulatory_request *current_req = get_last_request();

        if (current_req == &core_request_world || !current_req)
                return;

        kfree_rcu(current_req, rcu_head);
}

/*
 * Install @request as the new last_request. The previously installed
 * request is released first (see reg_free_last_request()); installing the
 * request that is already current is a no-op.
 */
static void reg_update_last_request(struct regulatory_request *request)
{
        if (get_last_request() == request)
                return;

        reg_free_last_request();
        rcu_assign_pointer(last_request, request);
}

/*
 * Drop the current central and world regulatory domains and install
 * @new_regdom as the central one. Must be called with RTNL held.
 *
 * @full_reset: when true, last_request is additionally reset to the static
 *        core_request_world.
 * @new_regdom: regulatory domain to publish as cfg80211_regdomain.
 *
 * On return cfg80211_world_regdom points at the static world_regdom table
 * again.
 */
static void reset_regdomains(bool full_reset,
                             const struct ieee80211_regdomain *new_regdom)
{
        const struct ieee80211_regdomain *r;

        ASSERT_RTNL();

        r = get_cfg80211_regdom();

        /* avoid freeing static information or freeing something twice */
        /* same object as the world domain: it is released once, below, via the world pointer */
        if (r == cfg80211_world_regdom)
                r = NULL;
        /* the world pointer refers to the static table: nothing to release */
        if (cfg80211_world_regdom == &world_regdom)
                cfg80211_world_regdom = NULL;
        /* the central domain itself is the static table: nothing to release */
        if (r == &world_regdom)
                r = NULL;

        /* rcu_free_regdom() ignores NULL, so the cleared pointers are skipped */
        rcu_free_regdom(r);
        rcu_free_regdom(cfg80211_world_regdom);

        cfg80211_world_regdom = &world_regdom;
        rcu_assign_pointer(cfg80211_regdomain, new_regdom);

        if (!full_reset)
                return;

        reg_update_last_request(&core_request_world);
}

/*
 * Install @rd as both the central and the world regulatory domain.
 *
 * Used for the dynamic world regulatory domain requested by the wireless
 * core upon initialization. A missing last_request only triggers a warning.
 */
static void update_world_regdomain(const struct ieee80211_regdomain *rd)
{
        WARN_ON(!get_last_request());

        /*
         * reset_regdomains() leaves cfg80211_world_regdom pointing at the
         * static table, so the world pointer has to be set afterwards.
         */
        reset_regdomains(false, rd);
        cfg80211_world_regdom = rd;
}

/*
 * Return true when @alpha2 is non-NULL and its first two characters are
 * "00", the code used for the world regulatory domain.
 */
bool is_world_regdom(const char *alpha2)
{
        return alpha2 && alpha2[0] == '0' && alpha2[1] == '0';
}

/*
 * Return true when @alpha2 is non-NULL and neither of its first two
 * characters is NUL.
 */
static bool is_alpha2_set(const char *alpha2)
{
        return alpha2 && alpha2[0] != '\0' && alpha2[1] != '\0';
}

/*
 * Return true when @alpha2 is non-NULL and starts with "99": the special
 * case where the regulatory domain was built by a driver but no specific
 * alpha2 could be determined.
 */
static bool is_unknown_alpha2(const char *alpha2)
{
        return alpha2 && alpha2[0] == '9' && alpha2[1] == '9';
}

/*
 * Return true when @alpha2 is non-NULL and starts with "98": the special
 * case where the regulatory domain is the result of an intersection between
 * two regulatory domain structures.
 */
static bool is_intersected_alpha2(const char *alpha2)
{
        return alpha2 && alpha2[0] == '9' && alpha2[1] == '8';
}

/*
 * Return true when @alpha2 is non-NULL and both of its first two characters
 * are alphabetic according to isalpha().
 */
static bool is_an_alpha2(const char *alpha2)
{
        return alpha2 && isalpha(alpha2[0]) && isalpha(alpha2[1]);
}

/*
 * Compare the first two characters of two alpha2 codes.
 * Returns false when either pointer is NULL.
 */
static bool alpha2_equal(const char *alpha2_x, const char *alpha2_y)
{
        return alpha2_x && alpha2_y &&
               alpha2_x[0] == alpha2_y[0] &&
               alpha2_x[1] == alpha2_y[1];
}

/*
 * Return true when applying @alpha2 would change the central regulatory
 * domain: either no central domain is set, or its alpha2 differs from
 * @alpha2 (alpha2_equal() also reports a NULL @alpha2 as different).
 */
static bool regdom_changes(const char *alpha2)
{
        const struct ieee80211_regdomain *current_rd = get_cfg80211_regdom();

        return !current_rd || !alpha2_equal(current_rd->alpha2, alpha2);
}

/*
 * The alpha2 of NL80211_REGDOM_SET_BY_USER hints is cached in user_alpha2;
 * this tells whether a valid user hint has ever been recorded there.
 *
 * Returns false while user_alpha2 holds the "97" marker. Any other value
 * that is neither the world code nor alphabetic triggers a warning and also
 * yields false.
 */
static bool is_user_regdom_saved(void)
{
        bool is_marker = user_alpha2[0] == '9' && user_alpha2[1] == '7';

        if (is_marker)
                return false;

        /* This would indicate a mistake on the design */
        return !WARN(!(is_world_regdom(user_alpha2) ||
                       is_an_alpha2(user_alpha2)),
                     "Unexpected user alpha2: %c%c\n",
                     user_alpha2[0], user_alpha2[1]);
}

/*
 * Duplicate @src_regd, including all of its rules, into a freshly allocated
 * regulatory domain (GFP_KERNEL).
 *
 * Return: the copy, or ERR_PTR(-ENOMEM) when the allocation fails. NULL is
 * never returned.
 */
static const struct ieee80211_regdomain *
reg_copy_regd(const struct ieee80211_regdomain *src_regd)
{
        struct ieee80211_regdomain *copy;
        unsigned int idx;

        copy = kzalloc(struct_size(copy, reg_rules, src_regd->n_reg_rules),
                       GFP_KERNEL);
        if (!copy)
                return ERR_PTR(-ENOMEM);

        /* fixed-size part first, then the rules one by one */
        memcpy(copy, src_regd, sizeof(*copy));

        for (idx = 0; idx < src_regd->n_reg_rules; idx++)
                memcpy(&copy->reg_rules[idx], &src_regd->reg_rules[idx],
                       sizeof(copy->reg_rules[idx]));

        return copy;
}

/*
 * Replace the cached user regulatory domain with a private copy of @rd.
 * Requires RTNL.
 *
 * The old copy is freed unless it is an ERR_PTR() left behind by an earlier
 * failed copy. If copying @rd fails, cfg80211_user_regdom ends up holding
 * the ERR_PTR() returned by reg_copy_regd().
 */
static void cfg80211_save_user_regdom(const struct ieee80211_regdomain *rd)
{
        const struct ieee80211_regdomain *stale;

        ASSERT_RTNL();

        stale = cfg80211_user_regdom;
        if (!IS_ERR(stale))
                kfree(stale);

        cfg80211_user_regdom = reg_copy_regd(rd);
}

/*
 * One queued "apply this regulatory domain" job: reg_schedule_apply()
 * creates it and reg_regdb_apply() consumes and frees it.
 */
struct reg_regdb_apply_request {
        struct list_head list;                        /* link in reg_regdb_apply_list */
        const struct ieee80211_regdomain *regdom;        /* domain passed on to set_regdom() */
};

/* Pending apply requests; the list is modified with reg_regdb_apply_mutex held */
static LIST_HEAD(reg_regdb_apply_list);
static DEFINE_MUTEX(reg_regdb_apply_mutex);

/*
 * Work handler that drains reg_regdb_apply_list.
 *
 * Runs with RTNL and reg_regdb_apply_mutex held for the whole drain. Each
 * queued regulatory domain is passed to set_regdom() with
 * REGD_SOURCE_INTERNAL_DB, after which the queue entry itself is freed.
 */
static void reg_regdb_apply(struct work_struct *work)
{
        struct reg_regdb_apply_request *entry;

        rtnl_lock();
        mutex_lock(&reg_regdb_apply_mutex);

        for (;;) {
                entry = list_first_entry_or_null(&reg_regdb_apply_list,
                                                 struct reg_regdb_apply_request,
                                                 list);
                if (!entry)
                        break;

                list_del(&entry->list);

                set_regdom(entry->regdom, REGD_SOURCE_INTERNAL_DB);
                kfree(entry);
        }

        mutex_unlock(&reg_regdb_apply_mutex);
        rtnl_unlock();
}

static DECLARE_WORK(reg_regdb_work, reg_regdb_apply);

/*
 * Queue @regdom to be applied from reg_regdb_work.
 *
 * Return: 0 when the request was queued and the work scheduled, -ENOMEM
 * when the queue entry could not be allocated. On failure @regdom is freed
 * here, so the caller must not use it afterwards.
 */
static int reg_schedule_apply(const struct ieee80211_regdomain *regdom)
{
        struct reg_regdb_apply_request *entry;

        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry) {
                kfree(regdom);
                return -ENOMEM;
        }

        entry->regdom = regdom;

        mutex_lock(&reg_regdb_apply_mutex);
        list_add_tail(&entry->list, &reg_regdb_apply_list);
        mutex_unlock(&reg_regdb_apply_mutex);

        schedule_work(&reg_regdb_work);

        return 0;
}

#ifdef CONFIG_CFG80211_CRDA_SUPPORT
/* Max number of consecutive attempts to communicate with CRDA */
#define REG_MAX_CRDA_TIMEOUTS 10

/*
 * Number of CRDA timeouts seen so far: incremented by crda_timeout_work(),
 * cleared by reset_crda_timeouts() and checked by call_crda().
 */
static u32 reg_crda_timeouts;

/* Delayed work item "crda_timeout"; call_crda() queues it after sending the uevent */
static void crda_timeout_work(struct work_struct *work);
static DECLARE_DELAYED_WORK(crda_timeout, crda_timeout_work);

/*
 * Handler of the crda_timeout delayed work: the timeout is counted in
 * reg_crda_timeouts and the regulatory settings are restored via
 * restore_regulatory_settings(true, false), both under RTNL.
 */
static void crda_timeout_work(struct work_struct *work)
{
        pr_debug("Timeout while waiting for CRDA to reply, restoring regulatory settings\n");
        rtnl_lock();
        reg_crda_timeouts++;
        restore_regulatory_settings(true, false);
        rtnl_unlock();
}

/*
 * Cancel a pending crda_timeout work item using cancel_delayed_work(),
 * which per the workqueue API does not wait for a handler that is already
 * running.
 */
static void cancel_crda_timeout(void)
{
        cancel_delayed_work(&crda_timeout);
}

/*
 * Cancel crda_timeout using cancel_delayed_work_sync(), which per the
 * workqueue API also waits for a running handler to finish.
 */
static void cancel_crda_timeout_sync(void)
{
        cancel_delayed_work_sync(&crda_timeout);
}

/* Clear the timeout counter so that call_crda() accepts requests again */
static void reset_crda_timeouts(void)
{
        reg_crda_timeouts = 0;
}

/*
 * Ask userspace (CRDA) for the regulatory data of @alpha2, which lets us
 * keep the regulatory rules themselves in userspace.
 *
 * The request is sent as a KOBJ_CHANGE uevent on the regulatory platform
 * device with "COUNTRY=<alpha2>" in its environment; afterwards the
 * crda_timeout delayed work is queued to fire after 3142 ms.
 *
 * Return: 0 once the uevent was sent and the timeout queued, -EINVAL when
 * reg_crda_timeouts already exceeds REG_MAX_CRDA_TIMEOUTS, otherwise the
 * error returned by kobject_uevent_env().
 */
static int call_crda(const char *alpha2)
{
        char country[12];
        char *env[] = { country, NULL };
        int err;

        snprintf(country, sizeof(country), "COUNTRY=%c%c",
                 alpha2[0], alpha2[1]);

        if (reg_crda_timeouts > REG_MAX_CRDA_TIMEOUTS) {
                pr_debug("Exceeded CRDA call max attempts. Not calling CRDA\n");
                return -EINVAL;
        }

        if (is_world_regdom(alpha2)) {
                pr_debug("Calling CRDA to update world regulatory domain\n");
        } else {
                pr_debug("Calling CRDA for country: %c%c\n",
                         alpha2[0], alpha2[1]);
        }

        err = kobject_uevent_env(&reg_pdev->dev.kobj, KOBJ_CHANGE, env);
        if (err)
                return err;

        queue_delayed_work(system_power_efficient_wq,
                           &crda_timeout, msecs_to_jiffies(3142));

        return 0;
}
#else
/*
 * Stubs used when CONFIG_CFG80211_CRDA_SUPPORT is disabled: the timeout
 * helpers do nothing and call_crda() always fails with -ENODATA.
 */
static inline void cancel_crda_timeout(void) {}
static inline void cancel_crda_timeout_sync(void) {}
static inline void reset_crda_timeouts(void) {}
static inline int call_crda(const char *alpha2)
{
        return -ENODATA;
}
#endif /* CONFIG_CFG80211_CRDA_SUPPORT */

/*
 * code to directly load a firmware database through request_firmware
 *
 * regdb is NULL until a load was attempted, an ERR_PTR() value when loading
 * or validation failed (see regdb_fw_cb()), and otherwise points at a
 * kmemdup()'ed copy of the validated firmware blob.
 */
static const struct fwdb_header *regdb;

/*
 * One entry of the country table that follows the database header.
 *
 * @alpha2: two-character country code, compared with alpha2_equal()
 * @coll_ptr: big-endian offset of the country's fwdb_collection, in units
 *      of 4 bytes from the start of the database; an entry with a zero
 *      coll_ptr ends the table
 */
struct fwdb_country {
        u8 alpha2[2];
        __be16 coll_ptr;
        /* this struct cannot be extended */
} __packed __aligned(4);

/*
 * Per-country collection of rules.
 *
 * @len: length of this header; the rule pointer array starts at
 *      ALIGN(len, 2) bytes from the start of the collection
 * @n_rules: number of entries in the rule pointer array
 * @dfs_region: copied verbatim into the regdomain's dfs_region
 */
struct fwdb_collection {
        u8 len;
        u8 n_rules;
        u8 dfs_region;
        /* no optional data yet */
        /* aligned to 2, then followed by __be16 array of rule pointers */
} __packed __aligned(4);

/*
 * Bits of fwdb_rule.flags. regdb_query_country() translates each of them
 * into the NL80211_RRF_* flag of the same name.
 */
enum fwdb_flags {
        FWDB_FLAG_NO_OFDM        = BIT(0),
        FWDB_FLAG_NO_OUTDOOR        = BIT(1),
        FWDB_FLAG_DFS                = BIT(2),
        FWDB_FLAG_NO_IR                = BIT(3),
        FWDB_FLAG_AUTO_BW        = BIT(4),
};

/*
 * WMM parameters of one access category as stored in the database.
 *
 * @ecw: upper nibble is the exponent for cw_min, lower nibble the exponent
 *      for cw_max (both expanded with ecw2cw())
 * @aifsn: AIFSN value; valid_wmm() rejects values below 1
 * @cot: big-endian value, multiplied by 1000 when loaded by set_wmm_rule()
 */
struct fwdb_wmm_ac {
        u8 ecw;
        u8 aifsn;
        __be16 cot;
} __packed;

/*
 * WMM rule as stored in the database: one fwdb_wmm_ac per access category
 * for clients, followed by one per access category for APs. valid_wmm()
 * relies on this layout to walk all 2 * IEEE80211_NUM_ACS entries as one
 * array.
 */
struct fwdb_wmm_rule {
        struct fwdb_wmm_ac client[IEEE80211_NUM_ACS];
        struct fwdb_wmm_ac ap[IEEE80211_NUM_ACS];
} __packed;

/*
 * One regulatory rule as stored in the database.
 *
 * @len: number of bytes of this rule that are present; fields up to and
 *      including @max_bw are mandatory, later fields are only used when
 *      @len covers them
 * @flags: combination of enum fwdb_flags bits
 * @max_eirp: big-endian, copied into power_rule.max_eirp
 * @start: big-endian, copied into freq_range.start_freq_khz
 * @end: big-endian, copied into freq_range.end_freq_khz
 * @max_bw: big-endian, copied into freq_range.max_bandwidth_khz
 * @cac_timeout: optional, big-endian; multiplied by 1000 to give dfs_cac_ms
 * @wmm_ptr: optional, big-endian offset of a fwdb_wmm_rule in units of
 *      4 bytes from the start of the database
 */
struct fwdb_rule {
        u8 len;
        u8 flags;
        __be16 max_eirp;
        __be32 start, end, max_bw;
        /* start of optional data */
        __be16 cac_timeout;
        __be16 wmm_ptr;
} __packed __aligned(4);

/*
 * Expected header values; valid_regdb() rejects a blob whose big-endian
 * magic or version differs. The magic bytes are ASCII "RGDB".
 */
#define FWDB_MAGIC 0x52474442
#define FWDB_VERSION 20

/*
 * Start of the firmware regulatory database.
 *
 * @magic: big-endian, must equal FWDB_MAGIC
 * @version: big-endian, must equal FWDB_VERSION
 * @country: country table, ended by an entry whose coll_ptr is zero
 */
struct fwdb_header {
        __be32 magic;
        __be32 version;
        struct fwdb_country country[];
} __packed __aligned(4);

/* Expand a contention window exponent: returns 2^ecw - 1. */
static int ecw2cw(int ecw)
{
        int window = 1 << ecw;

        return window - 1;
}

static bool valid_wmm(struct fwdb_wmm_rule *rule)
{
        struct fwdb_wmm_ac *ac = (struct fwdb_wmm_ac *)rule;
        int i;

        for (i = 0; i < IEEE80211_NUM_ACS * 2; i++) {
                u16 cw_min = ecw2cw((ac[i].ecw & 0xf0) >> 4);
                u16 cw_max = ecw2cw(ac[i].ecw & 0x0f);
                u8 aifsn = ac[i].aifsn;

                if (cw_min >= cw_max)
                        return false;

                if (aifsn < 1)
                        return false;
        }

        return true;
}

/*
 * Validate one rule of the firmware database.
 *
 * @data: start of the database blob
 * @size: size of the blob in bytes
 * @rule_ptr: rule offset in units of 4 bytes from @data (already converted
 *      to CPU byte order)
 *
 * The rule must carry at least the mandatory fields (through max_bw), the
 * rule->len bytes it declares must all lie inside the blob, and when it
 * carries a wmm_ptr the referenced WMM rule must lie inside the blob and
 * pass valid_wmm().
 *
 * Returns true if the rule is acceptable.
 */
static bool valid_rule(const u8 *data, unsigned int size, u16 rule_ptr)
{
        struct fwdb_rule *rule = (void *)(data + (rule_ptr << 2));

        /* the length byte itself must be readable */
        if ((u8 *)rule + sizeof(rule->len) > data + size)
                return false;

        /* mandatory fields */
        if (rule->len < offsetofend(struct fwdb_rule, max_bw))
                return false;

        /*
         * Every byte the rule claims to contain must be inside the blob.
         * Without this, a rule placed near the end of the file could make
         * the wmm_ptr read below (and the field reads done later when the
         * rule is converted) run past the end of the buffer.
         */
        if ((u8 *)rule + rule->len > data + size)
                return false;

        if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr)) {
                u32 wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2;
                struct fwdb_wmm_rule *wmm;

                /* the whole referenced WMM rule must fit as well */
                if (wmm_ptr + sizeof(struct fwdb_wmm_rule) > size)
                        return false;

                wmm = (void *)(data + wmm_ptr);

                if (!valid_wmm(wmm))
                        return false;
        }
        return true;
}

/*
 * Validate the collection referenced by one country entry.
 *
 * @data: start of the database blob
 * @size: size of the blob in bytes
 * @country: country entry whose coll_ptr is to be followed
 *
 * Checks that the collection header and its rule pointer array lie inside
 * the blob, that the header is long enough to hold the mandatory fields,
 * and that every referenced rule passes valid_rule().
 */
static bool valid_country(const u8 *data, unsigned int size,
                          const struct fwdb_country *country)
{
        unsigned int coll_off = be16_to_cpu(country->coll_ptr) << 2;
        struct fwdb_collection *coll = (void *)(data + coll_off);
        const u8 *blob_end = data + size;
        __be16 *rule_ptrs;
        unsigned int idx;

        /* len and n_rules must be readable before they are trusted */
        if ((u8 *)coll + offsetofend(typeof(*coll), n_rules) > blob_end)
                return false;

        /* header plus the complete array of rule pointers must fit */
        if ((u8 *)coll + ALIGN(coll->len, 2) +
            (coll->n_rules * 2) > blob_end)
                return false;

        /* the header has to include all mandatory fields */
        if (coll->len < offsetofend(struct fwdb_collection, dfs_region))
                return false;

        rule_ptrs = (void *)((u8 *)coll + ALIGN(coll->len, 2));

        for (idx = 0; idx < coll->n_rules; idx++) {
                if (!valid_rule(data, size, be16_to_cpu(rule_ptrs[idx])))
                        return false;
        }

        return true;
}

#ifdef CONFIG_CFG80211_REQUIRE_SIGNED_REGDB
#include <keys/asymmetric-type.h>

/* Keyring holding the certificates used to verify regulatory.db.p7s. */
static struct key *builtin_regdb_keys;

/*
 * Allocate the ".builtin_regdb_keys" keyring and load the compiled-in
 * X.509 certificates into it.
 *
 * Returns 0 on success or the error from keyring_alloc(). The return
 * values of x509_load_certificate_list() are not checked here.
 */
static int __init load_builtin_regdb_keys(void)
{
        builtin_regdb_keys =
                keyring_alloc(".builtin_regdb_keys",
                              KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
                              ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
                              KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
                              KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
        if (IS_ERR(builtin_regdb_keys))
                return PTR_ERR(builtin_regdb_keys);

        pr_notice("Loading compiled-in X.509 certificates for regulatory database\n");

#ifdef CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS
        x509_load_certificate_list(shipped_regdb_certs,
                                   shipped_regdb_certs_len,
                                   builtin_regdb_keys);
#endif
#ifdef CONFIG_CFG80211_EXTRA_REGDB_KEYDIR
        /* only load the extra certificates if a key directory was configured */
        if (CONFIG_CFG80211_EXTRA_REGDB_KEYDIR[0] != '\0')
                x509_load_certificate_list(extra_regdb_certs,
                                           extra_regdb_certs_len,
                                           builtin_regdb_keys);
#endif

        return 0;
}

MODULE_FIRMWARE("regulatory.db.p7s");

/*
 * Verify the database blob against its detached PKCS#7 signature.
 *
 * The signature is loaded as firmware file "regulatory.db.p7s" and checked
 * with the builtin_regdb_keys keyring.
 *
 * Returns true only if the signature file could be loaded and
 * verify_pkcs7_signature() returned 0.
 */
static bool regdb_has_valid_signature(const u8 *data, unsigned int size)
{
        const struct firmware *sig;
        int err;

        if (request_firmware(&sig, "regulatory.db.p7s", &reg_pdev->dev))
                return false;

        err = verify_pkcs7_signature(data, size, sig->data, sig->size,
                                     builtin_regdb_keys,
                                     VERIFYING_UNSPECIFIED_SIGNATURE,
                                     NULL, NULL);

        release_firmware(sig);

        return err == 0;
}

/* Drop the reference held on the builtin_regdb_keys keyring. */
static void free_regdb_keyring(void)
{
        key_put(builtin_regdb_keys);
}
#else
/*
 * Variants used when signed regulatory databases are not required:
 * there is no keyring to set up or free, and every database is treated
 * as having a valid signature.
 */
static int load_builtin_regdb_keys(void)
{
        return 0;
}

static bool regdb_has_valid_signature(const u8 *data, unsigned int size)
{
        return true;
}

static void free_regdb_keyring(void)
{
}
#endif /* CONFIG_CFG80211_REQUIRE_SIGNED_REGDB */

/*
 * Validate a complete firmware regulatory database.
 *
 * @data: start of the blob
 * @size: size of the blob in bytes
 *
 * The blob must be large enough for the header, carry the expected magic
 * and version, have a valid signature, and contain a country table in
 * which every entry passes valid_country() and which is ended, inside the
 * blob, by an entry with a zero coll_ptr.
 *
 * Returns true if the database may be used.
 */
static bool valid_regdb(const u8 *data, unsigned int size)
{
        const struct fwdb_header *hdr = (void *)data;
        const struct fwdb_country *country;

        if (size < sizeof(*hdr))
                return false;

        if (hdr->magic != cpu_to_be32(FWDB_MAGIC))
                return false;

        if (hdr->version != cpu_to_be32(FWDB_VERSION))
                return false;

        if (!regdb_has_valid_signature(data, size))
                return false;

        for (country = &hdr->country[0];
             (u8 *)(country + 1) <= data + size;
             country++) {
                /* a zero collection pointer terminates the table */
                if (!country->coll_ptr)
                        return true;
                if (!valid_country(data, size, country))
                        return false;
        }

        /*
         * The end of the blob was reached without finding the terminating
         * entry. The lookup code walks the table until it sees a zero
         * coll_ptr without knowing the blob size, so accepting such a
         * database would let those walks run past the end of the buffer.
         */
        return false;
}

/*
 * Convert the WMM rule referenced by a database rule into @rrule.
 *
 * @db: start of the database
 * @country: country the rule belongs to (only used for the error message)
 * @rule: database rule whose wmm_ptr is followed
 * @rrule: destination; its wmm_rule is filled in and has_wmm set
 *
 * If the referenced WMM rule fails valid_wmm(), an error is logged and
 * @rrule is left untouched.
 */
static void set_wmm_rule(const struct fwdb_header *db,
                         const struct fwdb_country *country,
                         const struct fwdb_rule *rule,
                         struct ieee80211_reg_rule *rrule)
{
        struct ieee80211_wmm_rule *wmm_rule = &rrule->wmm_rule;
        struct fwdb_wmm_rule *wmm;
        unsigned int i, wmm_ptr;

        /* wmm_ptr counts 4-byte units from the start of the database */
        wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2;
        wmm = (void *)((u8 *)db + wmm_ptr);

        if (!valid_wmm(wmm)) {
                pr_err("Invalid regulatory WMM rule %u-%u in domain %c%c\n",
                       be32_to_cpu(rule->start), be32_to_cpu(rule->end),
                       country->alpha2[0], country->alpha2[1]);
                return;
        }

        for (i = 0; i < IEEE80211_NUM_ACS; i++) {
                unsigned int client_ecw = wmm->client[i].ecw;
                unsigned int ap_ecw = wmm->ap[i].ecw;

                /* upper nibble: cw_min exponent, lower nibble: cw_max */
                wmm_rule->client[i].cw_min = ecw2cw((client_ecw & 0xf0) >> 4);
                wmm_rule->client[i].cw_max = ecw2cw(client_ecw & 0x0f);
                wmm_rule->client[i].aifsn = wmm->client[i].aifsn;
                wmm_rule->client[i].cot =
                        1000 * be16_to_cpu(wmm->client[i].cot);

                wmm_rule->ap[i].cw_min = ecw2cw((ap_ecw & 0xf0) >> 4);
                wmm_rule->ap[i].cw_max = ecw2cw(ap_ecw & 0x0f);
                wmm_rule->ap[i].aifsn = wmm->ap[i].aifsn;
                wmm_rule->ap[i].cot = 1000 * be16_to_cpu(wmm->ap[i].cot);
        }

        rrule->has_wmm = true;
}

/*
 * Look up the WMM data for @freq within one country's collection.
 *
 * Rules that are too short to carry a wmm_ptr are skipped. The first
 * remaining rule whose start..end range (converted with KHZ_TO_MHZ())
 * contains @freq is handed to set_wmm_rule().
 *
 * Returns 0 if such a rule was found, -ENODATA otherwise.
 */
static int __regdb_query_wmm(const struct fwdb_header *db,
                             const struct fwdb_country *country, int freq,
                             struct ieee80211_reg_rule *rrule)
{
        unsigned int coll_off = be16_to_cpu(country->coll_ptr) << 2;
        struct fwdb_collection *coll = (void *)((u8 *)db + coll_off);
        __be16 *rule_ptrs = (void *)((u8 *)coll + ALIGN(coll->len, 2));
        int idx;

        for (idx = 0; idx < coll->n_rules; idx++) {
                unsigned int rule_off = be16_to_cpu(rule_ptrs[idx]) << 2;
                struct fwdb_rule *rule = (void *)((u8 *)db + rule_off);

                /* no WMM pointer in this rule */
                if (rule->len < offsetofend(struct fwdb_rule, wmm_ptr))
                        continue;

                if (freq < KHZ_TO_MHZ(be32_to_cpu(rule->start)) ||
                    freq > KHZ_TO_MHZ(be32_to_cpu(rule->end)))
                        continue;

                set_wmm_rule(db, country, rule, rrule);
                return 0;
        }

        return -ENODATA;
}

/*
 * Query the loaded regulatory database for the WMM rule that applies to
 * @freq in country @alpha2 and store it in @rule.
 *
 * Returns -ENODATA when no database is loaded, the country is unknown or
 * no matching rule exists; the stored error when loading the database
 * failed; otherwise the result of __regdb_query_wmm().
 */
int reg_query_regdb_wmm(char *alpha2, int freq, struct ieee80211_reg_rule *rule)
{
        const struct fwdb_header *hdr = regdb;
        const struct fwdb_country *country;

        if (!regdb)
                return -ENODATA;

        if (IS_ERR(regdb))
                return PTR_ERR(regdb);

        /* the country table ends at the first entry with a zero coll_ptr */
        for (country = &hdr->country[0]; country->coll_ptr; country++) {
                if (!alpha2_equal(alpha2, country->alpha2))
                        continue;

                return __regdb_query_wmm(regdb, country, freq, rule);
        }

        return -ENODATA;
}
EXPORT_SYMBOL(reg_query_regdb_wmm);

static int regdb_query_country(const struct fwdb_header *db,
                               const struct fwdb_country *country)
{
        unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2;
        struct fwdb_collection *coll = (void *)((u8 *)db + ptr);
        struct ieee80211_regdomain *regdom;
        unsigned int i;

        regdom = kzalloc(struct_size(regdom, reg_rules, coll->n_rules),
                         GFP_KERNEL);
        if (!regdom)
                return -ENOMEM;

        regdom->n_reg_rules = coll->n_rules;
        regdom->alpha2[0] = country->alpha2[0];
        regdom->alpha2[1] = country->alpha2[1];
        regdom->dfs_region = coll->dfs_region;

        for (i = 0; i < regdom->n_reg_rules; i++) {
                __be16 *rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2));
                unsigned int rule_ptr = be16_to_cpu(rules_ptr[i]) << 2;
                struct fwdb_rule *rule = (void *)((u8 *)db + rule_ptr);
                struct ieee80211_reg_rule *rrule = &regdom->reg_rules[i];

                rrule->freq_range.start_freq_khz = be32_to_cpu(rule->start);
                rrule->freq_range.end_freq_khz = be32_to_cpu(rule->end);
                rrule->freq_range.max_bandwidth_khz = be32_to_cpu(rule->max_bw);

                rrule->power_rule.max_antenna_gain = 0;
                rrule->power_rule.max_eirp = be16_to_cpu(rule->max_eirp);

                rrule->flags = 0;
                if (rule->flags & FWDB_FLAG_NO_OFDM)
                        rrule->flags |= NL80211_RRF_NO_OFDM;
                if (rule->flags & FWDB_FLAG_NO_OUTDOOR)
                        rrule->flags |= NL80211_RRF_NO_OUTDOOR;
                if (rule->flags & FWDB_FLAG_DFS)
                        rrule->flags |= NL80211_RRF_DFS;
                if (rule->flags & FWDB_FLAG_NO_IR)
                        rrule->flags |= NL80211_RRF_NO_IR;
                if (rule->flags & FWDB_FLAG_AUTO_BW)
                        rrule->flags |= NL80211_RRF_AUTO_BW;

                rrule->dfs_cac_ms = 0;

                /* handle optional data */
                if (rule->len >= offsetofend(struct fwdb_rule, cac_timeout))
                        rrule->dfs_cac_ms =
                                1000 * be16_to_cpu(rule->cac_timeout);
                if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr))
                        set_wmm_rule(db, country, rule, rrule);
        }

        return reg_schedule_apply(regdom);
}

/*
 * Look up @alpha2 in the loaded database and, if found, build and schedule
 * its regulatory domain via regdb_query_country(). Must be called with the
 * RTNL lock held.
 *
 * Returns the stored error if loading the database failed, -ENODATA if the
 * country is not in the table, otherwise the result of
 * regdb_query_country().
 */
static int query_regdb(const char *alpha2)
{
        const struct fwdb_header *hdr = regdb;
        const struct fwdb_country *country;

        ASSERT_RTNL();

        if (IS_ERR(regdb))
                return PTR_ERR(regdb);

        /* the country table ends at the first entry with a zero coll_ptr */
        for (country = &hdr->country[0]; country->coll_ptr; country++) {
                if (!alpha2_equal(alpha2, country->alpha2))
                        continue;

                return regdb_query_country(regdb, country);
        }

        return -ENODATA;
}

/*
 * Completion callback of the asynchronous "regulatory.db" firmware request
 * issued by query_regdb_file().
 *
 * @fw: the loaded firmware, or NULL if loading failed
 * @context: kmemdup()'ed alpha2 passed by query_regdb_file(); freed here
 *
 * On a load or validation failure the error is recorded in regdb. On
 * success the blob is copied, installed as regdb and queried for @context.
 * Unless that query succeeds, the regulatory settings are restored.
 */
static void regdb_fw_cb(const struct firmware *fw, void *context)
{
        int load_err = 0;
        bool need_restore = true;
        void *copy;

        if (!fw) {
                pr_info("failed to load regulatory.db\n");
                load_err = -ENODATA;
        } else if (!valid_regdb(fw->data, fw->size)) {
                pr_info("loaded regulatory.db is malformed or signature is missing/invalid\n");
                load_err = -EINVAL;
        }

        rtnl_lock();
        /*
         * If a usable database is already installed there is nothing to
         * install: with a failed load that would be a bug, with a good
         * load it can happen when several asynchronous callbacks were
         * queued. In both cases only the restore below is done and the
         * newly loaded data is dropped.
         */
        if (!regdb || IS_ERR(regdb)) {
                if (load_err) {
                        regdb = ERR_PTR(load_err);
                } else if (fw) {
                        copy = kmemdup(fw->data, fw->size, GFP_KERNEL);
                        if (copy) {
                                regdb = copy;
                                /* restore unless the query succeeded */
                                need_restore = context &&
                                               query_regdb(context);
                        }
                }
        }

        if (need_restore)
                restore_regulatory_settings(true, false);

        rtnl_unlock();

        kfree(context);

        release_firmware(fw);
}

MODULE_FIRMWARE("regulatory.db");

/*
 * Query the firmware regulatory database for @alpha2. Must be called with
 * the RTNL lock held.
 *
 * If regdb is already set (loaded or marked as failed) the query is
 * answered by query_regdb(). Otherwise an asynchronous firmware request is
 * started and regdb_fw_cb() performs the query with a private copy of the
 * two alpha2 bytes.
 *
 * Returns the result of query_regdb(), -ENOMEM if the copy cannot be
 * allocated, or the result of request_firmware_nowait().
 */
static int query_regdb_file(const char *alpha2)
{
        void *alpha2_copy;
        int err;

        ASSERT_RTNL();

        if (regdb)
                return query_regdb(alpha2);

        /* the callback runs later, so it needs its own copy */
        alpha2_copy = kmemdup(alpha2, 2, GFP_KERNEL);
        if (!alpha2_copy)
                return -ENOMEM;

        err = request_firmware_nowait(THIS_MODULE, true, "regulatory.db",
                                      &reg_pdev->dev, GFP_KERNEL,
                                      alpha2_copy, regdb_fw_cb);
        /* on failure the callback never runs, so free the copy here */
        if (err)
                kfree(alpha2_copy);

        return err;
}

/*
 * Synchronously reload "regulatory.db", replace the installed database and
 * re-process a core hint for the current regulatory domain's alpha2.
 *
 * Returns 0 on success, the error from request_firmware(), -ENODATA if the
 * new blob fails valid_regdb(), or -ENOMEM on allocation failure. Note that
 * when allocating the request fails, the new database has already been
 * installed.
 */
int reg_reload_regdb(void)
{
        const struct ieee80211_regdomain *cur_rd;
        struct regulatory_request *req;
        const struct firmware *fw;
        void *new_db;
        int err;

        err = request_firmware(&fw, "regulatory.db", &reg_pdev->dev);
        if (err)
                return err;

        if (!valid_regdb(fw->data, fw->size)) {
                err = -ENODATA;
                goto release;
        }

        new_db = kmemdup(fw->data, fw->size, GFP_KERNEL);
        if (!new_db) {
                err = -ENOMEM;
                goto release;
        }

        rtnl_lock();
        /* drop the old copy, unless regdb is NULL or an error marker */
        if (!IS_ERR_OR_NULL(regdb))
                kfree(regdb);
        regdb = new_db;

        /* reset regulatory domain */
        cur_rd = get_cfg80211_regdom();

        req = kzalloc(sizeof(*req), GFP_KERNEL);
        if (!req) {
                err = -ENOMEM;
                goto unlock;
        }

        req->wiphy_idx = WIPHY_IDX_INVALID;
        req->alpha2[0] = cur_rd->alpha2[0];
        req->alpha2[1] = cur_rd->alpha2[1];
        req->initiator = NL80211_REGDOM_SET_BY_CORE;
        req->user_reg_hint_type = NL80211_USER_REG_HINT_USER;

        reg_process_hint(req);

unlock:
        rtnl_unlock();
release:
        release_firmware(fw);
        return err;
}

/*
 * Ask for the regulatory data of @request's alpha2: first through the
 * firmware database, and only if that fails through CRDA.
 *
 * Returns true if either query was started successfully.
 */
static bool reg_query_database(struct regulatory_request *request)
{
        return query_regdb_file(request->alpha2) == 0 ||
               call_crda(request->alpha2) == 0;
}

/*
 * Tell whether @alpha2 matches the last regulatory request and that
 * request has not been processed yet.
 */
bool reg_is_valid_request(const char *alpha2)
{
        struct regulatory_request *lr = get_last_request();

        if (lr && !lr->processed)
                return alpha2_equal(lr->alpha2, alpha2);

        return false;
}

/*
 * Pick the regulatory domain that applies to @wiphy: the wiphy's own
 * domain or the global cfg80211 one.
 */
static const struct ieee80211_regdomain *reg_get_regdomain(struct wiphy *wiphy)
{
        struct regulatory_request *lr = get_last_request();

        /*
         * The driver's regulatory domain, if present, is followed unless
         * the last request came from a country IE or from the user, who may
         * want to help compliance further.
         */
        switch (lr->initiator) {
        case NL80211_REGDOM_SET_BY_COUNTRY_IE:
        case NL80211_REGDOM_SET_BY_USER:
                break;
        default:
                if (wiphy->regd)
                        return get_wiphy_regdom(wiphy);
                break;
        }

        return get_cfg80211_regdom();
}

/*
 * Compute the width of the contiguous frequency span around @rule.
 *
 * Starting at @rule's position in @rd, neighbouring rules are merged to the
 * left while the previous rule's end is not below the current start, and
 * to the right while the next rule's start is not above the current end.
 *
 * Returns the merged span (end - start, in kHz), or 0 if @rule is not one
 * of @rd's rules.
 */
static unsigned int
reg_get_max_bandwidth_from_range(const struct ieee80211_regdomain *rd,
                                 const struct ieee80211_reg_rule *rule)
{
        const struct ieee80211_freq_range *edge;
        const struct ieee80211_freq_range *neigh;
        u32 start_freq, end_freq, idx, pos;

        /* locate @rule inside @rd */
        for (idx = 0; idx < rd->n_reg_rules; idx++)
                if (rule == &rd->reg_rules[idx])
                        break;

        if (idx == rd->n_reg_rules)
                return 0;

        /* extend towards lower frequencies */
        edge = &rule->freq_range;
        for (pos = idx; pos > 0; pos--) {
                neigh = &rd->reg_rules[pos - 1].freq_range;

                if (neigh->end_freq_khz < edge->start_freq_khz)
                        break;

                edge = neigh;
        }
        start_freq = edge->start_freq_khz;

        /* extend towards higher frequencies */
        edge = &rule->freq_range;
        for (pos = idx; pos < rd->n_reg_rules - 1; pos++) {
                neigh = &rd->reg_rules[pos + 1].freq_range;

                if (neigh->start_freq_khz > edge->end_freq_khz)
                        break;

                edge = neigh;
        }
        end_freq = edge->end_freq_khz;

        return end_freq - start_freq;
}

/*
 * Maximum bandwidth (in kHz) usable under @rule: the contiguous span found
 * by reg_get_max_bandwidth_from_range(), capped according to the rule's
 * NO_<width> flags.
 */
unsigned int reg_get_max_bandwidth(const struct ieee80211_regdomain *rd,
                                   const struct ieee80211_reg_rule *rule)
{
        unsigned int limit_khz = reg_get_max_bandwidth_from_range(rd, rule);

        /* each forbidden width caps the result at the next lower width */
        if (rule->flags & NL80211_RRF_NO_320MHZ)
                limit_khz = min_t(unsigned int, limit_khz, MHZ_TO_KHZ(160));
        if (rule->flags & NL80211_RRF_NO_160MHZ)
                limit_khz = min_t(unsigned int, limit_khz, MHZ_TO_KHZ(80));
        if (rule->flags & NL80211_RRF_NO_80MHZ)
                limit_khz = min_t(unsigned int, limit_khz, MHZ_TO_KHZ(40));

        /*
         * HT40+ and HT40- are limited per channel; the bandwidth is only
         * capped here when neither of them is allowed.
         */
        if (rule->flags & NL80211_RRF_NO_HT40MINUS) {
                if (rule->flags & NL80211_RRF_NO_HT40PLUS)
                        limit_khz = min_t(unsigned int, limit_khz,
                                          MHZ_TO_KHZ(20));
        }

        return limit_khz;
}

/*
 * Sanity check on a regulatory rule: both frequencies must be non-zero,
 * the range must not be empty or reversed, and the maximum bandwidth must
 * fit inside the range.
 */
static bool is_valid_reg_rule(const struct ieee80211_reg_rule *rule)
{
        const struct ieee80211_freq_range *range = &rule->freq_range;
        u32 span_khz;

        if (range->start_freq_khz <= 0 || range->end_freq_khz <= 0 ||
            range->start_freq_khz > range->end_freq_khz)
                return false;

        span_khz = range->end_freq_khz - range->start_freq_khz;

        return range->end_freq_khz > range->start_freq_khz &&
               range->max_bandwidth_khz <= span_khz;
}

/*
 * Sanity check on a regulatory domain: it must hold at least one rule, no
 * more than NL80211_MAX_SUPP_REG_RULES (which triggers a warning), and
 * every rule must pass is_valid_reg_rule().
 */
static bool is_valid_rd(const struct ieee80211_regdomain *rd)
{
        unsigned int idx;

        if (!rd->n_reg_rules ||
            WARN_ON(rd->n_reg_rules > NL80211_MAX_SUPP_REG_RULES))
                return false;

        for (idx = 0; idx < rd->n_reg_rules; idx++)
                if (!is_valid_reg_rule(&rd->reg_rules[idx]))
                        return false;

        return true;
}

/**
 * freq_in_rule_band - tells us if a frequency is in a frequency band
 * @freq_range: frequency rule we want to query
 * @freq_khz: frequency we are inquiring about
 *
 * Decides whether a frequency rule is relevant to the band a given
 * frequency lives in. Bands ("2.4 GHz", "5 GHz", "60 GHz") are device
 * specific, artificial definitions, so a distance heuristic is used: the
 * rule belongs to the frequency's band if its start or its end frequency
 * is no further away than 2 GHz, or 20 GHz when the frequency is above
 * 45 GHz. This resolution may need revisiting when regulatory rule support
 * for other "bands" is added.
 *
 * Returns: whether or not the frequency is in the range
 */
static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range,
                              u32 freq_khz)
{
        u32 limit;

        /*
         * From 802.11ad: directional multi-gigabit (DMG):
         * Pertaining to operation in a frequency band containing a channel
         * with the Channel starting frequency above 45 GHz.
         */
        if (freq_khz > 45 * KHZ_PER_GHZ)
                limit = 20 * KHZ_PER_GHZ;
        else
                limit = 2 * KHZ_PER_GHZ;

        return abs(freq_khz - freq_range->start_freq_khz) <= limit ||
               abs(freq_khz - freq_range->end_freq_khz) <= limit;
}

/*
 * Intersect two DFS regions. There is not yet enough information to pick
 * the more restrictive of two different regions, so conflicting regions
 * simply yield NL80211_DFS_UNSET; identical regions are returned as is.
 */
static enum nl80211_dfs_regions
reg_intersect_dfs_region(const enum nl80211_dfs_regions dfs_region1,
                         const enum nl80211_dfs_regions dfs_region2)
{
        return dfs_region1 == dfs_region2 ? dfs_region1 : NL80211_DFS_UNSET;
}

/*
 * Intersect the WMM parameters of one access category: the larger AIFSN,
 * cw_min and cw_max and the smaller cot of the two inputs are stored in
 * @intersect.
 */
static void reg_wmm_rules_intersect(const struct ieee80211_wmm_ac *wmm_ac1,
                                    const struct ieee80211_wmm_ac *wmm_ac2,
                                    struct ieee80211_wmm_ac *intersect)
{
        intersect->aifsn = max_t(u8, wmm_ac1->aifsn, wmm_ac2->aifsn);
        intersect->cot = min_t(u16, wmm_ac1->cot, wmm_ac2->cot);
        intersect->cw_max = max_t(u16, wmm_ac1->cw_max, wmm_ac2->cw_max);
        intersect->cw_min = max_t(u16, wmm_ac1->cw_min, wmm_ac2->cw_min);
}

/*
 * Helper for regdom_intersect(), this does the real
 * mathematical intersection fun
 */
static int reg_rules_intersect(const struct ieee80211_regdomain *rd1,
                               const struct ieee80211_regdomain *rd2,
                               const struct ieee80211_reg_rule *rule1,
                               const struct ieee80211_reg_rule *rule2,
                               struct ieee80211_reg_rule *intersected_rule)
{
        const struct ieee80211_freq_range *freq_range1, *freq_range2;
        struct ieee80211_freq_range *freq_range;
        const struct ieee80211_power_rule *power_rule1, *power_rule2;
        struct ieee80211_power_rule *power_rule;
        const struct ieee80211_wmm_rule *wmm_rule1, *wmm_rule2;
        struct ieee80211_wmm_rule *wmm_rule;
        u32 freq_diff, max_bandwidth1, max_bandwidth2;

        freq_range1 = &rule1->freq_range;
        freq_range2 = &rule2->freq_range;
        freq_range = &intersected_rule->freq_range;

        power_rule1 = &rule1->power_rule;
        power_rule2 = &rule2->power_rule;
        power_rule = &intersected_rule->power_rule;

        wmm_rule1 = &rule1->wmm_rule;
        wmm_rule2 = &rule2->wmm_rule;
        wmm_rule = &intersected_rule->wmm_rule;

        freq_range->start_freq_khz = max(freq_range1->start_freq_khz,
                                         freq_range2->start_freq_khz);
        freq_range->end_freq_khz = min(freq_range1->end_freq_khz,
                                       freq_range2->end_freq_khz);

        max_bandwidth1 = freq_range1->max_bandwidth_khz;
        max_bandwidth2 = freq_range2->max_bandwidth_khz;

        if (rule1->flags & NL80211_RRF_AUTO_BW)
                max_bandwidth1 = reg_get_max_bandwidth(rd1, rule1);
        if (rule2->flags & NL80211_RRF_AUTO_BW)
                max_bandwidth2 = reg_get_max_bandwidth(rd2, rule2);

        freq_range->max_bandwidth_khz = min(max_bandwidth1, max_bandwidth2);

        intersected_rule->flags = rule1->flags | rule2->flags;

        /*
         * In case NL80211_RRF_AUTO_BW requested for both rules
         * set AUTO_BW in intersected rule also. Next we will
         * calculate BW correctly in handle_channel function.
         * In other case remove AUTO_BW flag while we calculate
         * maximum bandwidth correctly and auto calculation is
         * not required.
         */
        if ((rule1->flags & NL80211_RRF_AUTO_BW) &&
            (rule2->flags & NL80211_RRF_AUTO_BW))
                intersected_rule->flags |= NL80211_RRF_AUTO_BW;
        else
                intersected_rule->flags &= ~NL80211_RRF_AUTO_BW;

        freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz;
        if (freq_range->max_bandwidth_khz > freq_diff)
                freq_range->max_bandwidth_khz = freq_diff;

        power_rule->max_eirp = min(power_rule1->max_eirp,
                power_rule2->max_eirp);
        power_rule->max_antenna_gain = min(power_rule1->max_antenna_gain,
                power_rule2->max_antenna_gain);

        intersected_rule->dfs_cac_ms = max(rule1->dfs_cac_ms,
                                           rule2->dfs_cac_ms);

        if (rule1->has_wmm && rule2->has_wmm) {
                u8 ac;

                for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
                        reg_wmm_rules_intersect(&wmm_rule1->client[ac],
                                                &wmm_rule2->client[ac],
                                                &wmm_rule->client[ac]);
                        reg_wmm_rules_intersect(&wmm_rule1->ap[ac],
                                                &wmm_rule2->ap[ac],
                                                &wmm_rule->ap[ac]);
                }

                intersected_rule->has_wmm = true;
        } else if (rule1->has_wmm) {
                *wmm_rule = *wmm_rule1;
                intersected_rule->has_wmm = true;
        } else if (rule2->has_wmm) {
                *wmm_rule = *wmm_rule2;
                intersected_rule->has_wmm = true;
        } else {
                intersected_rule->has_wmm = false;
        }

        if (!is_valid_reg_rule(intersected_rule))
                return -EINVAL;

        return 0;
}

/*
 * Check whether rule @r1 contains rule @r2.
 *
 * For simplicity only rules with identical flags are considered. Beyond
 * that, @r1's power limits (antenna gain and EIRP) must not exceed @r2's,
 * @r2's frequency range must lie within @r1's, and @r1's maximum bandwidth
 * must be at least @r2's.
 */
static bool rule_contains(struct ieee80211_reg_rule *r1,
                          struct ieee80211_reg_rule *r2)
{
        return r1->flags == r2->flags &&
               r1->power_rule.max_antenna_gain <=
               r2->power_rule.max_antenna_gain &&
               r1->power_rule.max_eirp <= r2->power_rule.max_eirp &&
               r1->freq_range.start_freq_khz <=
               r2->freq_range.start_freq_khz &&
               r1->freq_range.end_freq_khz >= r2->freq_range.end_freq_khz &&
               r1->freq_range.max_bandwidth_khz >=
               r2->freq_range.max_bandwidth_khz;
}

/*
 * Add @rule to the array @reg_rules holding *@n_rules entries.
 *
 * Nothing happens if an existing entry already contains @rule. If @rule
 * contains an existing entry, that entry is overwritten with @rule.
 * Otherwise @rule is appended and *@n_rules incremented; the caller must
 * have provided room for it.
 */
static void add_rule(struct ieee80211_reg_rule *rule,
                     struct ieee80211_reg_rule *reg_rules, u32 *n_rules)
{
        struct ieee80211_reg_rule *slot = reg_rules;
        struct ieee80211_reg_rule *end = reg_rules + *n_rules;

        for (; slot < end; slot++) {
                /* already covered by an existing rule */
                if (rule_contains(slot, rule))
                        return;

                /* the new rule covers an existing one: replace it */
                if (rule_contains(rule, slot)) {
                        memcpy(slot, rule, sizeof(*rule));
                        return;
                }
        }

        memcpy(end, rule, sizeof(*rule));
        (*n_rules)++;
}

/**
 * regdom_intersect - do the intersection between two regulatory domains
 * @rd1: first regulatory domain
 * @rd2: second regulatory domain
 *
 * Use this function to get the intersection between two regulatory domains.
 * Once completed we will mark the alpha2 for the rd as intersected, "98",
 * as no one single alpha2 can represent this regulatory domain.
 *
 * The resulting structure is allocated with kzalloc() and holds the
 * intersection of the rules of rd1 and rd2.
 *
 * Returns: the intersected regdomain, or NULL if either argument is NULL,
 * no pair of rules intersects, or the allocation fails
 */
static struct ieee80211_regdomain *
regdom_intersect(const struct ieee80211_regdomain *rd1,
                 const struct ieee80211_regdomain *rd2)
{
        int r;
        unsigned int x, y;
        unsigned int num_rules = 0;
        const struct ieee80211_reg_rule *rule1, *rule2;
        struct ieee80211_reg_rule intersected_rule;
        struct ieee80211_regdomain *rd;

        if (!rd1 || !rd2)
                return NULL;

        /*
         * First we get a count of the rules we'll need, then we actually
         * build them. This is so we can allocate and free a regdomain
         * once. The reason we use reg_rules_intersect() here is that it
         * returns -EINVAL if the rule computed makes no sense.
         * All rules that do check out OK are valid.
         */

        for (x = 0; x < rd1->n_reg_rules; x++) {
                rule1 = &rd1->reg_rules[x];
                for (y = 0; y < rd2->n_reg_rules; y++) {
                        rule2 = &rd2->reg_rules[y];
                        if (!reg_rules_intersect(rd1, rd2, rule1, rule2,
                                                 &intersected_rule))
                                num_rules++;
                }
        }

        if (!num_rules)
                return NULL;

        rd = kzalloc(struct_size(rd, reg_rules, num_rules), GFP_KERNEL);
        if (!rd)
                return NULL;

        for (x = 0; x < rd1->n_reg_rules; x++) {
                rule1 = &rd1->reg_rules[x];
                for (y = 0; y < rd2->n_reg_rules; y++) {
                        rule2 = &rd2->reg_rules[y];
                        /*
                         * intersected_rule lives on the stack and
                         * reg_rules_intersect() does not write every
                         * member (e.g. the WMM data when neither rule has
                         * any). add_rule() copies the whole structure, so
                         * clear it first to keep stack garbage and
                         * leftovers of the previous pair out of the result.
                         */
                        memset(&intersected_rule, 0,
                               sizeof(intersected_rule));
                        r = reg_rules_intersect(rd1, rd2, rule1, rule2,
                                                &intersected_rule);
                        if (r)
                                continue;

                        add_rule(&intersected_rule, rd->reg_rules,
                                 &rd->n_reg_rules);
                }
        }

        rd->alpha2[0] = '9';
        rd->alpha2[1] = '8';
        rd->dfs_region = reg_intersect_dfs_region(rd1->dfs_region,
                                                  rd2->dfs_region);

        return rd;
}

/*
 * Translate regulatory rule flags (NL80211_RRF_*) into channel flags
 * (IEEE80211_CHAN_*).
 *
 * XXX: add support for the rest of enum nl80211_reg_rule_flags, we may
 * want to just have the channel structure use these
 */
static u32 map_regdom_flags(u32 rd_flags)
{
        static const struct {
                u32 rd_flag;
                u32 chan_flag;
        } flag_map[] = {
                { NL80211_RRF_NO_IR_ALL, IEEE80211_CHAN_NO_IR },
                { NL80211_RRF_DFS, IEEE80211_CHAN_RADAR },
                { NL80211_RRF_NO_OFDM, IEEE80211_CHAN_NO_OFDM },
                { NL80211_RRF_NO_OUTDOOR, IEEE80211_CHAN_INDOOR_ONLY },
                { NL80211_RRF_IR_CONCURRENT, IEEE80211_CHAN_IR_CONCURRENT },
                { NL80211_RRF_NO_HT40MINUS, IEEE80211_CHAN_NO_HT40MINUS },
                { NL80211_RRF_NO_HT40PLUS, IEEE80211_CHAN_NO_HT40PLUS },
                { NL80211_RRF_NO_80MHZ, IEEE80211_CHAN_NO_80MHZ },
                { NL80211_RRF_NO_160MHZ, IEEE80211_CHAN_NO_160MHZ },
                { NL80211_RRF_NO_HE, IEEE80211_CHAN_NO_HE },
                { NL80211_RRF_NO_320MHZ, IEEE80211_CHAN_NO_320MHZ },
                { NL80211_RRF_NO_EHT, IEEE80211_CHAN_NO_EHT },
                { NL80211_RRF_DFS_CONCURRENT, IEEE80211_CHAN_DFS_CONCURRENT },
                { NL80211_RRF_NO_6GHZ_VLP_CLIENT,
                  IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT },
                { NL80211_RRF_NO_6GHZ_AFC_CLIENT,
                  IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT },
                { NL80211_RRF_PSD, IEEE80211_CHAN_PSD },
        };
        u32 channel_flags = 0;
        u32 i;

        /* a channel flag is set if any bit of its rule flag mask is set */
        for (i = 0; i < sizeof(flag_map) / sizeof(flag_map[0]); i++)
                if (rd_flags & flag_map[i].rd_flag)
                        channel_flags |= flag_map[i].chan_flag;

        return channel_flags;
}

/*
 * Look up a rule in @regd that can host a channel of width @bw centred on
 * @center_freq.
 *
 * The rules are walked in order. Once any rule has been found to lie in
 * @center_freq's band, the first rule (that one or a later one) whose
 * frequency range fits the bandwidth is returned.
 *
 * Returns ERR_PTR(-EINVAL) when @regd is NULL or when a band match exists
 * but no rule fits, and ERR_PTR(-ERANGE) when no rule is in the band at all.
 */
static const struct ieee80211_reg_rule *
freq_reg_info_regd(u32 center_freq,
                   const struct ieee80211_regdomain *regd, u32 bw)
{
        bool in_band = false;
        int idx;

        if (!regd)
                return ERR_PTR(-EINVAL);

        for (idx = 0; idx < regd->n_reg_rules; idx++) {
                const struct ieee80211_reg_rule *rule = &regd->reg_rules[idx];
                const struct ieee80211_freq_range *range = &rule->freq_range;
                bool fits;

                /*
                 * One rule in center_freq's band is all we need to know
                 * about, so the result is latched once it becomes true.
                 */
                if (!in_band)
                        in_band = freq_in_rule_band(range, center_freq);

                fits = cfg80211_does_bw_fit_range(range, center_freq, bw);
                if (in_band && fits)
                        return rule;
        }

        if (in_band)
                return ERR_PTR(-EINVAL);

        return ERR_PTR(-ERANGE);
}

/*
 * Find the regulatory rule for @center_freq in the regdomain that applies
 * to @wiphy, trying progressively narrower bandwidths.
 *
 * The candidate bandwidths in bws[] (MHz) are tried from widest to
 * narrowest and the search stops at the first one below @min_bw (KHz).
 * The first successful lookup is returned. Otherwise the error pointer
 * from the last attempted lookup is returned, or ERR_PTR(-ERANGE) if no
 * lookup was attempted at all.
 */
static const struct ieee80211_reg_rule *
__freq_reg_info(struct wiphy *wiphy, u32 center_freq, u32 min_bw)
{
        const struct ieee80211_regdomain *regd = reg_get_regdomain(wiphy);
        static const u32 bws[] = {0, 1, 2, 4, 5, 8, 10, 16, 20};
        const struct ieee80211_reg_rule *reg_rule = ERR_PTR(-ERANGE);
        int i;

        /*
         * Walk the table with an explicitly bounded index. The previous
         * "bw = MHZ_TO_KHZ(bws[i--])" loop step repeated the widest
         * bandwidth lookup and relied on min_bw being non-zero to avoid
         * indexing before the start of bws[].
         */
        for (i = ARRAY_SIZE(bws) - 1; i >= 0; i--) {
                u32 bw = MHZ_TO_KHZ(bws[i]);

                /* bws[] is ascending, so nothing further can qualify. */
                if (bw < min_bw)
                        break;

                reg_rule = freq_reg_info_regd(center_freq, regd, bw);
                if (!IS_ERR(reg_rule))
                        return reg_rule;
        }

        return reg_rule;
}

/*
 * Look up the regulatory rule covering @center_freq (in KHz) for @wiphy.
 *
 * Frequencies below 1000 MHz are searched down to a 1 MHz minimum
 * bandwidth, everything else down to 20 MHz. The result is whatever
 * __freq_reg_info() returns: a rule on success, an ERR_PTR() otherwise.
 */
const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy,
                                               u32 center_freq)
{
        u32 min_bw_mhz;

        if (center_freq < MHZ_TO_KHZ(1000))
                min_bw_mhz = 1;
        else
                min_bw_mhz = 20;

        return __freq_reg_info(wiphy, center_freq, MHZ_TO_KHZ(min_bw_mhz));
}
EXPORT_SYMBOL(freq_reg_info);

/*
 * Return a short human-readable name for a regulatory request initiator.
 * An unrecognised value triggers WARN_ON() and yields "bug".
 */
const char *reg_initiator_name(enum nl80211_reg_initiator initiator)
{
        const char *name;

        switch (initiator) {
        case NL80211_REGDOM_SET_BY_CORE:
                name = "core";
                break;
        case NL80211_REGDOM_SET_BY_USER:
                name = "user";
                break;
        case NL80211_REGDOM_SET_BY_DRIVER:
                name = "driver";
                break;
        case NL80211_REGDOM_SET_BY_COUNTRY_IE:
                name = "country element";
                break;
        default:
                WARN_ON(1);
                name = "bug";
                break;
        }

        return name;
}
EXPORT_SYMBOL(reg_initiator_name);

/*
 * Compute the bandwidth-related channel flags that @reg_rule implies for
 * @chan.
 *
 * @regd is only used (via reg_get_max_bandwidth()) when the rule carries
 * NL80211_RRF_AUTO_BW. Returns a mask of IEEE80211_CHAN_* flags.
 */
static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd,
                                          const struct ieee80211_reg_rule *reg_rule,
                                          const struct ieee80211_channel *chan)
{
        const struct ieee80211_freq_range *range = &reg_rule->freq_range;
        u32 max_bw_khz = range->max_bandwidth_khz;
        u32 chan_khz = ieee80211_channel_to_khz(chan);
        u32 bw_flags = 0;
        int edge_offset;
        int ch_bw;

        /* Check if auto calculation requested */
        if (reg_rule->flags & NL80211_RRF_AUTO_BW)
                max_bw_khz = reg_get_max_bandwidth(regd, reg_rule);

        /* If we get a reg_rule we can assume that at least 5Mhz fit */
        if (!cfg80211_does_bw_fit_range(range, chan_khz, MHZ_TO_KHZ(10)))
                bw_flags |= IEEE80211_CHAN_NO_10MHZ;
        if (!cfg80211_does_bw_fit_range(range, chan_khz, MHZ_TO_KHZ(20)))
                bw_flags |= IEEE80211_CHAN_NO_20MHZ;

        if (chan->band != NL80211_BAND_S1GHZ) {
                /* Forbid every width wider than the rule's maximum. */
                if (max_bw_khz < MHZ_TO_KHZ(10))
                        bw_flags |= IEEE80211_CHAN_NO_10MHZ;
                if (max_bw_khz < MHZ_TO_KHZ(20))
                        bw_flags |= IEEE80211_CHAN_NO_20MHZ;
                if (max_bw_khz < MHZ_TO_KHZ(40))
                        bw_flags |= IEEE80211_CHAN_NO_HT40;
                if (max_bw_khz < MHZ_TO_KHZ(80))
                        bw_flags |= IEEE80211_CHAN_NO_80MHZ;
                if (max_bw_khz < MHZ_TO_KHZ(160))
                        bw_flags |= IEEE80211_CHAN_NO_160MHZ;
                if (max_bw_khz < MHZ_TO_KHZ(320))
                        bw_flags |= IEEE80211_CHAN_NO_320MHZ;

                return bw_flags;
        }

        /* S1G is strict about non overlapping channels. We can
         * calculate which bandwidth is allowed per channel by finding
         * the largest bandwidth which cleanly divides the freq_range.
         * Candidates start at the maximum and are halved until one aligns.
         */
        for (ch_bw = max_bw_khz; ch_bw; ch_bw /= 2) {
                edge_offset = (chan_khz - ch_bw / 2) -
                              range->start_freq_khz;
                if (edge_offset % ch_bw != 0)
                        continue;

                switch (KHZ_TO_MHZ(ch_bw)) {
                case 1:
                        bw_flags |= IEEE80211_CHAN_1MHZ;
                        break;
                case 2:
                        bw_flags |= IEEE80211_CHAN_2MHZ;
                        break;
                case 4:
                        bw_flags |= IEEE80211_CHAN_4MHZ;
                        break;
                case 8:
                        bw_flags |= IEEE80211_CHAN_8MHZ;
                        break;
                case 16:
                        bw_flags |= IEEE80211_CHAN_16MHZ;
                        break;
                default:
                        /* If we got here, no bandwidths fit on
                         * this frequency, ie. band edge.
                         */
                        bw_flags |= IEEE80211_CHAN_DISABLED;
                        break;
                }

                /* Only the first (largest) aligned width is recorded. */
                break;
        }

        return bw_flags;
}

/*
 * Apply one matching regulatory rule to @chan.
 *
 * When the last request @lr came from the driver of this very wiphy and
 * that wiphy sets REGULATORY_STRICT_REG, the rule overwrites the channel's
 * orig_* baseline as well as its current values. Otherwise the rule is
 * combined with @flags and the existing orig_* limits.
 */
static void handle_channel_single_rule(struct wiphy *wiphy,
                                       enum nl80211_reg_initiator initiator,
                                       struct ieee80211_channel *chan,
                                       u32 flags,
                                       struct regulatory_request *lr,
                                       struct wiphy *request_wiphy,
                                       const struct ieee80211_reg_rule *reg_rule)
{
        const struct ieee80211_regdomain *regd = reg_get_regdomain(wiphy);
        const struct ieee80211_power_rule *pwr = &reg_rule->power_rule;
        u32 bw_flags = reg_rule_to_chan_bw_flags(regd, reg_rule, chan);

        if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
            request_wiphy && request_wiphy == wiphy &&
            request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
                /*
                 * This guarantees the driver's requested regulatory domain
                 * will always be used as a base for further regulatory
                 * settings
                 */
                chan->orig_flags = map_regdom_flags(reg_rule->flags) | bw_flags;
                chan->flags = chan->orig_flags;

                chan->orig_mag = (int) MBI_TO_DBI(pwr->max_antenna_gain);
                chan->max_antenna_gain = chan->orig_mag;

                chan->orig_mpwr = (int) MBM_TO_DBM(pwr->max_eirp);
                chan->max_power = chan->orig_mpwr;
                chan->max_reg_power = chan->max_power;

                if (chan->flags & IEEE80211_CHAN_RADAR) {
                        /* A zero CAC time in the rule means "use the minimum". */
                        if (reg_rule->dfs_cac_ms)
                                chan->dfs_cac_ms = reg_rule->dfs_cac_ms;
                        else
                                chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;
                }

                if (chan->flags & IEEE80211_CHAN_PSD)
                        chan->psd = reg_rule->psd;

                return;
        }

        /* Reset the DFS and beacon state before applying the rule. */
        chan->dfs_state = NL80211_DFS_USABLE;
        chan->dfs_state_entered = jiffies;
        chan->beacon_found = false;

        chan->flags = flags | bw_flags | map_regdom_flags(reg_rule->flags);
        chan->max_antenna_gain = min_t(int, chan->orig_mag,
                                       MBI_TO_DBI(pwr->max_antenna_gain));
        chan->max_reg_power = (int) MBM_TO_DBM(pwr->max_eirp);

        if (chan->flags & IEEE80211_CHAN_RADAR)
                chan->dfs_cac_ms = reg_rule->dfs_cac_ms ?
                                   reg_rule->dfs_cac_ms :
                                   IEEE80211_DFS_MIN_CAC_TIME_MS;

        if (chan->flags & IEEE80211_CHAN_PSD)
                chan->psd = reg_rule->psd;

        /*
         * The original power limit caps the regulatory one, unless there is
         * no original limit or the device follows country IE power:
         * devices that use REGULATORY_COUNTRY_IE_FOLLOW_POWER
         * will always follow the passed country IE power settings.
         */
        if (chan->orig_mpwr &&
            !(initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE &&
              wiphy->regulatory_flags & REGULATORY_COUNTRY_IE_FOLLOW_POWER))
                chan->max_power = min(chan->orig_mpwr, chan->max_reg_power);
        else
                chan->max_power = chan->max_reg_power;
}

/*
 * Apply two regulatory rules to a channel that is covered by them jointly.
 *
 * @rrule1 and @rrule2 are the two rules and @comb_range is the frequency
 * range built from both by the caller. Flags from both rules are OR-ed
 * together, power and antenna gain limits take the minimum of the two,
 * and the DFS CAC time takes the maximum.
 *
 * As in the single-rule case, a driver-originated request on a wiphy with
 * REGULATORY_STRICT_REG overwrites the channel's orig_* baseline.
 */
static void handle_channel_adjacent_rules(struct wiphy *wiphy,
                                          enum nl80211_reg_initiator initiator,
                                          struct ieee80211_channel *chan,
                                          u32 flags,
                                          struct regulatory_request *lr,
                                          struct wiphy *request_wiphy,
                                          const struct ieee80211_reg_rule *rrule1,
                                          const struct ieee80211_reg_rule *rrule2,
                                          struct ieee80211_freq_range *comb_range)
{
        u32 bw_flags1 = 0;
        u32 bw_flags2 = 0;
        const struct ieee80211_power_rule *power_rule1 = NULL;
        const struct ieee80211_power_rule *power_rule2 = NULL;
        const struct ieee80211_regdomain *regd;

        regd = reg_get_regdomain(wiphy);

        power_rule1 = &rrule1->power_rule;
        power_rule2 = &rrule2->power_rule;
        bw_flags1 = reg_rule_to_chan_bw_flags(regd, rrule1, chan);
        bw_flags2 = reg_rule_to_chan_bw_flags(regd, rrule2, chan);

        if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
            request_wiphy && request_wiphy == wiphy &&
            request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
                /* This guarantees the driver's requested regulatory domain
                 * will always be used as a base for further regulatory
                 * settings
                 */
                chan->flags =
                        map_regdom_flags(rrule1->flags) |
                        map_regdom_flags(rrule2->flags) |
                        bw_flags1 |
                        bw_flags2;
                chan->orig_flags = chan->flags;
                chan->max_antenna_gain =
                        min_t(int, MBI_TO_DBI(power_rule1->max_antenna_gain),
                              MBI_TO_DBI(power_rule2->max_antenna_gain));
                chan->orig_mag = chan->max_antenna_gain;
                chan->max_reg_power =
                        min_t(int, MBM_TO_DBM(power_rule1->max_eirp),
                              MBM_TO_DBM(power_rule2->max_eirp));
                chan->max_power = chan->max_reg_power;
                chan->orig_mpwr = chan->max_reg_power;

                if (chan->flags & IEEE80211_CHAN_RADAR) {
                        chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;
                        if (rrule1->dfs_cac_ms || rrule2->dfs_cac_ms)
                                chan->dfs_cac_ms = max_t(unsigned int,
                                                         rrule1->dfs_cac_ms,
                                                         rrule2->dfs_cac_ms);
                }

                /* PSD only applies when both rules carry it. Otherwise
                 * drop the channel flag again. chan->flags holds
                 * IEEE80211_CHAN_* bits, so the channel-flag constant must
                 * be used here, not the NL80211_RRF_* rule flag.
                 *
                 * NOTE(review): orig_flags was copied above and so still
                 * carries IEEE80211_CHAN_PSD in the else case - confirm
                 * whether it should be cleared there as well.
                 */
                if ((rrule1->flags & NL80211_RRF_PSD) &&
                    (rrule2->flags & NL80211_RRF_PSD))
                        chan->psd = min_t(s8, rrule1->psd, rrule2->psd);
                else
                        chan->flags &= ~IEEE80211_CHAN_PSD;

                return;
        }

        /* Reset the DFS and beacon state before applying the rules. */
        chan->dfs_state = NL80211_DFS_USABLE;
        chan->dfs_state_entered = jiffies;

        chan->beacon_found = false;
        chan->flags = flags | bw_flags1 | bw_flags2 |
                      map_regdom_flags(rrule1->flags) |
                      map_regdom_flags(rrule2->flags);

        /* Each rule on its own may have forbidden 10 and/or 20 MHz in
         * reg_rule_to_chan_bw_flags() (otherwise this would not be the
         * adjacent rule case), so recheck against the combined range.
         */
        if (cfg80211_does_bw_fit_range(comb_range,
                                       ieee80211_channel_to_khz(chan),
                                       MHZ_TO_KHZ(10)))
                chan->flags &= ~IEEE80211_CHAN_NO_10MHZ;
        if (cfg80211_does_bw_fit_range(comb_range,
                                       ieee80211_channel_to_khz(chan),
                                       MHZ_TO_KHZ(20)))
                chan->flags &= ~IEEE80211_CHAN_NO_20MHZ;

        chan->max_antenna_gain =
                min_t(int, chan->orig_mag,
                      min_t(int,
                            MBI_TO_DBI(power_rule1->max_antenna_gain),
                            MBI_TO_DBI(power_rule2->max_antenna_gain)));
        chan->max_reg_power = min_t(int,
                                    MBM_TO_DBM(power_rule1->max_eirp),
                                    MBM_TO_DBM(power_rule2->max_eirp));

        if (chan->flags & IEEE80211_CHAN_RADAR) {
                if (rrule1->dfs_cac_ms || rrule2->dfs_cac_ms)
                        chan->dfs_cac_ms = max_t(unsigned int,
                                                 rrule1->dfs_cac_ms,
                                                 rrule2->dfs_cac_ms);
                else
                        chan->dfs_cac_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;
        }

        /* NOTE(review): unlike the strict path above, chan->psd is not
         * updated here even though IEEE80211_CHAN_PSD may be set in
         * chan->flags from either rule - confirm whether that is intended.
         */

        if (chan->orig_mpwr) {
                /* Devices that use REGULATORY_COUNTRY_IE_FOLLOW_POWER
                 * will always follow the passed country IE power settings.
                 */
                if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE &&
                    wiphy->regulatory_flags & REGULATORY_COUNTRY_IE_FOLLOW_POWER)
                        chan->max_power = chan->max_reg_power;
                else
                        chan->max_power = min(chan->orig_mpwr,
                                              chan->max_reg_power);
        } else {
                chan->max_power = chan->max_reg_power;
        }
}

/* Note that right now we assume the desired channel bandwidth
 * is always 20 MHz for each individual channel (HT40 uses 20 MHz
 * per channel, the primary and the extension channel).
 */
static void handle_channel(struct wiphy *wiphy,
                           enum nl80211_reg_initiator initiator,
                           struct ieee80211_channel *chan)
{
        const u32 orig_chan_freq = ieee80211_channel_to_khz(chan);
        struct regulatory_request *lr = get_last_request();
        struct wiphy *request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx);
        const struct ieee80211_reg_rule *rrule = NULL;
        const struct ieee80211_reg_rule *rrule1 = NULL;
        const struct ieee80211_reg_rule *rrule2 = NULL;

        u32 flags = chan->orig_flags;

        rrule = freq_reg_info(wiphy, orig_chan_freq);
        if (IS_ERR(rrule)) {
                /* check for adjacent match, therefore get rules for
                 * chan - 20 MHz and chan + 20 MHz and test
                 * if reg rules are adjacent
                 */
                rrule1 = freq_reg_info(wiphy,
                                       orig_chan_freq - MHZ_TO_KHZ(20));
                rrule2 = freq_reg_info(wiphy,
                                       orig_chan_freq + MHZ_TO_KHZ(20));
                if (!IS_ERR(rrule1) && !IS_ERR(rrule2)) {
                        struct ieee80211_freq_range comb_range;

                        if (rrule1->freq_range.end_freq_khz !=
                            rrule2->freq_range.start_freq_khz)
                                goto disable_chan;

                        comb_range.start_freq_khz =
                                rrule1->freq_range.start_freq_khz;
                        comb_range.end_freq_khz =
                                rrule2->freq_range.end_freq_khz;
                        comb_range.max_bandwidth_khz =
                                min_t(u32,
                                      rrule1->freq_range.max_bandwidth_khz,
                                      rrule2->freq_range.max_bandwidth_khz);

                        if (!cfg80211_does_bw_fit_range(&comb_range,
                                                        orig_chan_freq,
                                                        MHZ_TO_KHZ(20)))
                                goto disable_chan;

                        handle_channel_adjacent_rules(wiphy, initiator, chan,
                                                      flags, lr, request_wiphy,
                                                      rrule1, rrule2,
                                                      &comb_range);
                        return;
                }

disable_chan:
                /* We will disable all channels that do not match our
                 * received regulatory rule unless the hint is coming
                 * from a Country IE and the Country IE had no information
                 * about a band. The IEEE 802.11 spec allows for an AP
                 * to send only a subset of the regulatory rules allowed,
                 * so an AP in the US that only supports 2.4 GHz may only send
                 * a country IE with information for the 2.4 GHz band
                 * while 5 GHz is still supported.
                 */
                if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE &&
                    PTR_ERR(rrule) == -ERANGE)
                        return;

                if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
                    request_wiphy && request_wiphy == wiphy &&
                    request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
                        pr_debug("Disabling freq %d.%03d MHz for good\n",
                                 chan->center_freq, chan->freq_offset);
                        chan->orig_flags |= IEEE80211_CHAN_DISABLED;
                        chan->flags = chan->orig_flags;
                } else {
                        pr_debug("Disabling freq %d.%03d MHz\n",
                                 chan->center_freq, chan->freq_offset);
                        chan->flags |= IEEE80211_CHAN_DISABLED;
                }
                return;
        }

        handle_channel_single_rule(wiphy, initiator, chan, flags, lr,
                                   request_wiphy, rrule);
}

/*
 * Run handle_channel() over every channel of @sband.
 * A NULL @sband is ignored.
 */
static void handle_band(struct wiphy *wiphy,
                        enum nl80211_reg_initiator initiator,
                        struct ieee80211_supported_band *sband)
{
        unsigned int chan_idx;

        if (!sband)
                return;

        for (chan_idx = 0; chan_idx < sband->n_channels; chan_idx++) {
                struct ieee80211_channel *chan = &sband->channels[chan_idx];

                handle_channel(wiphy, initiator, chan);
        }
}

/*
 * True when @request is a user-initiated hint of the cell base station
 * type, false for any other initiator or hint type.
 */
static bool reg_request_cell_base(struct regulatory_request *request)
{
        return request->initiator == NL80211_REGDOM_SET_BY_USER &&
               request->user_reg_hint_type == NL80211_USER_REG_HINT_CELL_BASE;
}

/* Report whether the last regulatory request was a cell base station hint. */
bool reg_last_request_cell_base(void)
{
        struct regulatory_request *lr = get_last_request();

        return reg_request_cell_base(lr);
}

#ifdef CONFIG_CFG80211_REG_CELLULAR_HINTS
/*
 * Core specific check: decide how a pending cell base station hint is
 * treated. It is ignored when reg_num_devs_support_basehint is zero,
 * reported as already set when the last request was also a cell base hint
 * and the alpha2 does not change, and accepted otherwise.
 */
static enum reg_request_treatment
reg_ignore_cell_hint(struct regulatory_request *pending_request)
{
        struct regulatory_request *lr = get_last_request();
        enum reg_request_treatment treatment = REG_REQ_OK;

        if (!reg_num_devs_support_basehint)
                treatment = REG_REQ_IGNORE;
        else if (reg_request_cell_base(lr) &&
                 !regdom_changes(pending_request->alpha2))
                treatment = REG_REQ_ALREADY_SET;

        return treatment;
}

/*
 * Device specific check: a wiphy ignores cell base station hints unless
 * it advertises NL80211_FEATURE_CELL_BASE_REG_HINTS.
 */
static bool reg_dev_ignore_cell_hint(struct wiphy *wiphy)
{
        if (wiphy->features & NL80211_FEATURE_CELL_BASE_REG_HINTS)
                return false;

        return true;
}
#else
/* Cellular hints are compiled out: every such hint is ignored. */
static enum reg_request_treatment
reg_ignore_cell_hint(struct regulatory_request *pending_request)
{
        return REG_REQ_IGNORE;
}

/* Cellular hints are compiled out: every device ignores them. */
static bool reg_dev_ignore_cell_hint(struct wiphy *wiphy)
{
        return true;
}
#endif

/*
 * True when @wiphy sets REGULATORY_STRICT_REG without also setting
 * REGULATORY_CUSTOM_REG.
 */
static bool wiphy_strict_alpha2_regd(struct wiphy *wiphy)
{
        return (wiphy->regulatory_flags & REGULATORY_STRICT_REG) &&
               !(wiphy->regulatory_flags & REGULATORY_CUSTOM_REG);
}

/*
 * Decide whether a regulatory update from @initiator should be skipped
 * for @wiphy. Returns true when the update must be ignored.
 */
static bool ignore_reg_update(struct wiphy *wiphy,
                              enum nl80211_reg_initiator initiator)
{
        struct regulatory_request *lr = get_last_request();

        /* Self-managed devices are never touched here. */
        if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED)
                return true;

        if (!lr) {
                pr_debug("Ignoring regulatory request set by %s since last_request is not set\n",
                         reg_initiator_name(initiator));
                return true;
        }

        if (initiator == NL80211_REGDOM_SET_BY_CORE &&
            (wiphy->regulatory_flags & REGULATORY_CUSTOM_REG)) {
                pr_debug("Ignoring regulatory request set by %s since the driver uses its own custom regulatory domain\n",
                         reg_initiator_name(initiator));
                return true;
        }

        /*
         * wiphy->regd will be set once the device has its own
         * desired regulatory domain set
         */
        if (wiphy_strict_alpha2_regd(wiphy) && !wiphy->regd &&
            initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
            !is_world_regdom(lr->alpha2)) {
                pr_debug("Ignoring regulatory request set by %s since the driver requires its own regulatory domain to be set first\n",
                         reg_initiator_name(initiator));
                return true;
        }

        /* For cell base hints the device's own capability decides. */
        return reg_request_cell_base(lr) && reg_dev_ignore_cell_hint(wiphy);
}

/*
 * True when @wiphy counts as world roaming: either the cfg80211 regdomain
 * or the wiphy's own regdomain (if it has one) is a world regdomain, or
 * the wiphy uses REGULATORY_CUSTOM_REG and the last request did not come
 * from a country IE.
 */
static bool reg_is_world_roaming(struct wiphy *wiphy)
{
        const struct ieee80211_regdomain *core_rd = get_cfg80211_regdom();
        const struct ieee80211_regdomain *wiphy_rd = get_wiphy_regdom(wiphy);
        struct regulatory_request *lr = get_last_request();

        if (is_world_regdom(core_rd->alpha2))
                return true;

        if (wiphy_rd && is_world_regdom(wiphy_rd->alpha2))
                return true;

        return lr && lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
               (wiphy->regulatory_flags & REGULATORY_CUSTOM_REG);
}

/* Invoke the wiphy's reg_notifier callback with @request, if one is set. */
static void reg_call_notifier(struct wiphy *wiphy,
                              struct regulatory_request *request)
{
        if (!wiphy->reg_notifier)
                return;

        wiphy->reg_notifier(wiphy, request);
}

/*
 * Apply a beacon hint to channel @chan_idx of the band named by
 * @reg_beacon, if that channel is the one the beacon was seen on.
 *
 * The channel is marked beacon_found. If the wiphy is world roaming and
 * does not disable beacon hints, IEEE80211_CHAN_NO_IR is lifted from the
 * channel and the change is announced.
 *
 * Callers must have checked that the wiphy has the band: the band pointer
 * is dereferenced here without a NULL check.
 */
static void handle_reg_beacon(struct wiphy *wiphy, unsigned int chan_idx,
                              struct reg_beacon *reg_beacon)
{
        struct regulatory_request *lr = get_last_request();
        struct ieee80211_supported_band *sband;
        struct ieee80211_channel chan_before;
        struct ieee80211_channel *chan;

        sband = wiphy->bands[reg_beacon->chan.band];
        chan = &sband->channels[chan_idx];

        if (likely(!ieee80211_channel_equal(chan, &reg_beacon->chan)))
                return;

        if (chan->beacon_found)
                return;

        chan->beacon_found = true;

        if (!reg_is_world_roaming(wiphy) ||
            (wiphy->regulatory_flags & REGULATORY_DISABLE_BEACON_HINTS))
                return;

        /* Nothing to lift, so nothing changes and nothing is announced. */
        if (!(chan->flags & IEEE80211_CHAN_NO_IR))
                return;

        /* Keep a snapshot so the event can report before and after. */
        chan_before = *chan;
        chan->flags &= ~IEEE80211_CHAN_NO_IR;

        nl80211_send_beacon_hint_event(wiphy, &chan_before, chan);
        if (wiphy->flags & WIPHY_FLAG_CHANNEL_CHANGE_ON_BEACON)
                reg_call_notifier(wiphy, lr);
}

/*
 * Called when a scan on a wiphy finds a beacon on
 * new channel. Offers the beacon to every channel of the matching band;
 * does nothing when the wiphy lacks that band.
 */
static void wiphy_update_new_beacon(struct wiphy *wiphy,
                                    struct reg_beacon *reg_beacon)
{
        struct ieee80211_supported_band *sband;
        unsigned int chan_idx;

        sband = wiphy->bands[reg_beacon->chan.band];
        if (!sband)
                return;

        for (chan_idx = 0; chan_idx < sband->n_channels; chan_idx++)
                handle_reg_beacon(wiphy, chan_idx, reg_beacon);
}

/*
 * Called upon reg changes or a new wiphy is added. Replays every entry
 * of reg_beacon_list against the channels of the band it belongs to,
 * skipping bands the wiphy does not have.
 */
static void wiphy_update_beacon_reg(struct wiphy *wiphy)
{
        struct ieee80211_supported_band *sband;
        struct reg_beacon *reg_beacon;
        unsigned int chan_idx;

        list_for_each_entry(reg_beacon, &reg_beacon_list, list) {
                sband = wiphy->bands[reg_beacon->chan.band];
                if (!sband)
                        continue;

                for (chan_idx = 0; chan_idx < sband->n_channels; chan_idx++)
                        handle_reg_beacon(wiphy, chan_idx, reg_beacon);
        }
}

/* Reap the advantages of previously found beacons */
static void reg_process_beacons(struct wiphy *wiphy)
{
        /*
         * Without a last_request we are just firing up cfg80211, so no
         * beacons would have been processed yet and there is nothing to do.
         */
        if (last_request)
                wiphy_update_beacon_reg(wiphy);
}

/*
 * True when @chan exists, is not disabled and does not have both HT40
 * directions forbidden. A NULL @chan is never allowed.
 */
static bool is_ht40_allowed(struct ieee80211_channel *chan)
{
        if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED))
                return false;

        /*
         * Both NO_HT40 bits being set is what happens when regulatory
         * rules disallow HT40 completely.
         */
        return (chan->flags & IEEE80211_CHAN_NO_HT40) != IEEE80211_CHAN_NO_HT40;
}

/*
 * Recompute the HT40+/HT40- flags of @channel.
 *
 * If HT40 is not allowed on the channel at all, both directions are
 * forbidden. Otherwise each direction is allowed only when the channel
 * 20 MHz below (HT40-) or above (HT40+) exists in the same band and itself
 * allows HT40, and the wiphy's own regdomain rule does not forbid it.
 */
static void reg_process_ht_flags_channel(struct wiphy *wiphy,
                                         struct ieee80211_channel *channel)
{
        struct ieee80211_supported_band *sband = wiphy->bands[channel->band];
        struct ieee80211_channel *below = NULL;
        struct ieee80211_channel *above = NULL;
        const struct ieee80211_regdomain *regd;
        u32 rule_flags = 0;
        unsigned int idx;

        if (!is_ht40_allowed(channel)) {
                channel->flags |= IEEE80211_CHAN_NO_HT40;
                return;
        }

        /*
         * We need to ensure the extension channels exist to
         * be able to use HT40- or HT40+, this finds them (or not)
         */
        for (idx = 0; idx < sband->n_channels; idx++) {
                struct ieee80211_channel *cand = &sband->channels[idx];

                if (cand->center_freq == (channel->center_freq - 20))
                        below = cand;
                if (cand->center_freq == (channel->center_freq + 20))
                        above = cand;
        }

        /* Rule flags stay 0 without a wiphy regdomain or a matching rule. */
        regd = get_wiphy_regdom(wiphy);
        if (regd) {
                const struct ieee80211_reg_rule *rule;

                rule = freq_reg_info_regd(MHZ_TO_KHZ(channel->center_freq),
                                          regd, MHZ_TO_KHZ(20));
                if (!IS_ERR(rule))
                        rule_flags = rule->flags;
        }

        /*
         * Please note that this assumes target bandwidth is 20 MHz,
         * if that ever changes we also need to change the below logic
         * to include that as well.
         */
        if (is_ht40_allowed(below) &&
            !(rule_flags & NL80211_RRF_NO_HT40MINUS))
                channel->flags &= ~IEEE80211_CHAN_NO_HT40MINUS;
        else
                channel->flags |= IEEE80211_CHAN_NO_HT40MINUS;

        if (is_ht40_allowed(above) &&
            !(rule_flags & NL80211_RRF_NO_HT40PLUS))
                channel->flags &= ~IEEE80211_CHAN_NO_HT40PLUS;
        else
                channel->flags |= IEEE80211_CHAN_NO_HT40PLUS;
}

/*
 * Recompute the HT40 flags of every channel in @sband.
 * A NULL @sband is ignored.
 */
static void reg_process_ht_flags_band(struct wiphy *wiphy,
                                      struct ieee80211_supported_band *sband)
{
        unsigned int chan_idx;

        if (!sband)
                return;

        for (chan_idx = 0; chan_idx < sband->n_channels; chan_idx++) {
                struct ieee80211_channel *chan = &sband->channels[chan_idx];

                reg_process_ht_flags_channel(wiphy, chan);
        }
}

/*
 * Recompute the HT40 flags for all bands of @wiphy.
 * A NULL @wiphy is ignored.
 */
static void reg_process_ht_flags(struct wiphy *wiphy)
{
        enum nl80211_band band;

        if (!wiphy)
                return;

        for (band = 0; band < NUM_NL80211_BANDS; band++) {
                struct ieee80211_supported_band *sband = wiphy->bands[band];

                reg_process_ht_flags_band(wiphy, sband);
        }
}

/*
 * Check whether the channel(s) @wdev currently operates on pass the
 * regulatory checks below.
 *
 * A wdev without a netdev, or whose netdev is not running, is reported as
 * valid. Otherwise the links are walked: when valid_links is zero only
 * link 0 is examined, else only links whose bit is set in valid_links.
 * For every examined link a chandef is selected according to the interface
 * type and then checked:
 *  - AP, P2P GO, IBSS and mesh: cfg80211_reg_can_beacon_relax()
 *  - station and P2P client: cfg80211_chandef_usable(), passing
 *    IEEE80211_CHAN_DISABLED
 *  - every other type: no check is performed
 *
 * Returns false as soon as one link fails its check, true otherwise.
 */
static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev)
{
        struct cfg80211_chan_def chandef = {};
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        enum nl80211_iftype iftype;
        bool ret;
        int link;

        iftype = wdev->iftype;

        /* make sure the interface is active */
        if (!wdev->netdev || !netif_running(wdev->netdev))
                return true;

        for (link = 0; link < ARRAY_SIZE(wdev->links); link++) {
                struct ieee80211_channel *chan;

                /* valid_links == 0: only link 0 is looked at */
                if (!wdev->valid_links && link > 0)
                        break;
                /* otherwise skip links whose bit is not set in valid_links */
                if (wdev->valid_links && !(wdev->valid_links & BIT(link)))
                        continue;
                /*
                 * First pass: pick the chandef to validate for this link.
                 * A "continue" here means there is nothing to validate.
                 */
                switch (iftype) {
                case NL80211_IFTYPE_AP:
                case NL80211_IFTYPE_P2P_GO:
                        if (!wdev->links[link].ap.beacon_interval)
                                continue;
                        chandef = wdev->links[link].ap.chandef;
                        break;
                case NL80211_IFTYPE_MESH_POINT:
                        if (!wdev->u.mesh.beacon_interval)
                                continue;
                        chandef = wdev->u.mesh.chandef;
                        break;
                case NL80211_IFTYPE_ADHOC:
                        if (!wdev->u.ibss.ssid_len)
                                continue;
                        chandef = wdev->u.ibss.chandef;
                        break;
                case NL80211_IFTYPE_STATION:
                case NL80211_IFTYPE_P2P_CLIENT:
                        /* Maybe we could consider disabling that link only? */
                        if (!wdev->links[link].client.current_bss)
                                continue;

                        chan = wdev->links[link].client.current_bss->pub.channel;
                        if (!chan)
                                continue;

                        /*
                         * Without a get_channel op, or when the query
                         * returns non-zero, build a NO_HT chandef on the
                         * BSS channel instead.
                         */
                        if (!rdev->ops->get_channel ||
                            rdev_get_channel(rdev, wdev, link, &chandef))
                                cfg80211_chandef_create(&chandef, chan,
                                                        NL80211_CHAN_NO_HT);
                        break;
                case NL80211_IFTYPE_MONITOR:
                case NL80211_IFTYPE_AP_VLAN:
                case NL80211_IFTYPE_P2P_DEVICE:
                        /* no enforcement required */
                        break;
                case NL80211_IFTYPE_OCB:
                        if (!wdev->u.ocb.chandef.chan)
                                continue;
                        chandef = wdev->u.ocb.chandef;
                        break;
                case NL80211_IFTYPE_NAN:
                        /* we have no info, but NAN is also pretty universal */
                        continue;
                default:
                        /* others not implemented for now */
                        WARN_ON_ONCE(1);
                        break;
                }

                /*
                 * Second pass: validate the chandef. Types not listed here
                 * (including OCB) fall into the default and are accepted.
                 */
                switch (iftype) {
                case NL80211_IFTYPE_AP:
                case NL80211_IFTYPE_P2P_GO:
                case NL80211_IFTYPE_ADHOC:
                case NL80211_IFTYPE_MESH_POINT:
                        ret = cfg80211_reg_can_beacon_relax(wiphy, &chandef,
                                                            iftype);
                        if (!ret)
                                return ret;
                        break;
                case NL80211_IFTYPE_STATION:
                case NL80211_IFTYPE_P2P_CLIENT:
                        ret = cfg80211_chandef_usable(wiphy, &chandef,
                                                      IEEE80211_CHAN_DISABLED);
                        if (!ret)
                                return ret;
                        break;
                default:
                        break;
                }
        }

        return true;
}

static void reg_leave_invalid_chans(struct wiphy *wiphy)
{
        struct wireless_dev *wdev;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);

        wiphy_lock(wiphy);
        list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list)
                if (!reg_wdev_chan_valid(wiphy, wdev))
                        cfg80211_leave(rdev, wdev);
        wiphy_unlock(wiphy);
}

static void reg_check_chans_work(struct work_struct *work)
{
        struct cfg80211_registered_device *rdev;

        pr_debug("Verifying active interfaces after reg change\n");
        rtnl_lock();

        for_each_rdev(rdev)
                reg_leave_invalid_chans(&rdev->wiphy);

        rtnl_unlock();
}

/*
 * (Re)arm the delayed reg_check_chans work on system_power_efficient_wq
 * with a delay of REG_ENFORCE_GRACE_MS.
 */
void reg_check_channels(void)
{
        unsigned long grace = msecs_to_jiffies(REG_ENFORCE_GRACE_MS);

        /*
         * The delay exists so usermode can react more gracefully first
         * (switch channel, disconnect in an orderly way) before a
         * disconnection is forced.
         */
        mod_delayed_work(system_power_efficient_wq, &reg_check_chans, grace);
}

static void wiphy_update_regulatory(struct wiphy *wiphy,
                                    enum nl80211_reg_initiator initiator)
{
        enum nl80211_band band;
        struct regulatory_request *lr = get_last_request();

        if (ignore_reg_update(wiphy, initiator)) {
                /*
                 * Regulatory updates set by CORE are ignored for custom
                 * regulatory cards. Let us notify the changes to the driver,
                 * as some drivers used this to restore its orig_* reg domain.
                 */
                if (initiator == NL80211_REGDOM_SET_BY_CORE &&
                    wiphy->regulatory_flags & REGULATORY_CUSTOM_REG &&
                    !(wiphy->regulatory_flags &
                      REGULATORY_WIPHY_SELF_MANAGED))
                        reg_call_notifier(wiphy, lr);
                return;
        }

        lr->dfs_region = get_cfg80211_regdom()->dfs_region;

        for (band = 0; band < NUM_NL80211_BANDS; band++)
                handle_band(wiphy, initiator, wiphy->bands[band]);

        reg_process_beacons(wiphy);
        reg_process_ht_flags(wiphy);
        reg_call_notifier(wiphy, lr);
}

static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator)
{
        struct cfg80211_registered_device *rdev;
        struct wiphy *wiphy;

        ASSERT_RTNL();

        for_each_rdev(rdev) {
                wiphy = &rdev->wiphy;
                wiphy_update_regulatory(wiphy, initiator);
        }

        reg_check_channels();
}

/*
 * Apply the custom regulatory domain @regd to a single channel.
 *
 * A matching rule is searched starting at a 20 MHz bandwidth and halving
 * it on every miss while it stays >= @min_bw. Without a usable rule the
 * channel is marked IEEE80211_CHAN_DISABLED. Otherwise the channel's DFS
 * state, flags, antenna gain, power limits, CAC time and PSD are filled in
 * from the rule.
 */
static void handle_channel_custom(struct wiphy *wiphy,
                                  struct ieee80211_channel *chan,
                                  const struct ieee80211_regdomain *regd,
                                  u32 min_bw)
{
        const struct ieee80211_reg_rule *rule = NULL;
        const struct ieee80211_power_rule *pwr;
        u32 freq_khz = ieee80211_channel_to_khz(chan);
        u32 width = MHZ_TO_KHZ(20);
        u32 rule_flags;

        while (width >= min_bw) {
                rule = freq_reg_info_regd(freq_khz, regd, width);
                if (!IS_ERR(rule))
                        break;
                width /= 2;
        }

        if (IS_ERR_OR_NULL(rule)) {
                pr_debug("Disabling freq %d.%03d MHz as custom regd has no rule that fits it\n",
                         chan->center_freq, chan->freq_offset);
                if (!(wiphy->regulatory_flags &
                      REGULATORY_WIPHY_SELF_MANAGED)) {
                        /* also record the disable in orig_flags */
                        chan->orig_flags |= IEEE80211_CHAN_DISABLED;
                        chan->flags = chan->orig_flags;
                } else {
                        chan->flags |= IEEE80211_CHAN_DISABLED;
                }
                return;
        }

        pwr = &rule->power_rule;
        rule_flags = reg_rule_to_chan_bw_flags(regd, rule, chan);

        chan->dfs_state_entered = jiffies;
        chan->dfs_state = NL80211_DFS_USABLE;
        chan->beacon_found = false;

        rule_flags |= map_regdom_flags(rule->flags);
        if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED)
                /* self-managed: rebuild the flags from orig_flags */
                chan->flags = chan->orig_flags | rule_flags;
        else
                chan->flags |= rule_flags;

        chan->max_antenna_gain = (int) MBI_TO_DBI(pwr->max_antenna_gain);
        chan->max_power = (int) MBM_TO_DBM(pwr->max_eirp);
        chan->max_reg_power = chan->max_power;

        if (chan->flags & IEEE80211_CHAN_RADAR)
                chan->dfs_cac_ms = rule->dfs_cac_ms ?
                                   rule->dfs_cac_ms :
                                   IEEE80211_DFS_MIN_CAC_TIME_MS;

        if (chan->flags & IEEE80211_CHAN_PSD)
                chan->psd = rule->psd;

        chan->max_power = chan->max_reg_power;
}

/*
 * Apply the custom regulatory domain @regd to every channel of @sband via
 * handle_channel_custom(). A NULL @sband is ignored.
 */
static void handle_band_custom(struct wiphy *wiphy,
                               struct ieee80211_supported_band *sband,
                               const struct ieee80211_regdomain *regd)
{
        unsigned int idx = 0;

        if (!sband)
                return;

        /*
         * A minimum of 20 MHz is assumed here. With a smaller minimum,
         * channel 12 might get enabled if this rule is compatible to US,
         * which permits 2402 - 2472 MHz.
         */
        while (idx < sband->n_channels) {
                handle_channel_custom(wiphy, &sband->channels[idx], regd,
                                      MHZ_TO_KHZ(20));
                idx++;
        }
}

/* Used by drivers prior to wiphy registration */
void wiphy_apply_custom_regulatory(struct wiphy *wiphy,
                                   const struct ieee80211_regdomain *regd)
{
        const struct ieee80211_regdomain *new_regd, *tmp;
        enum nl80211_band band;
        unsigned int bands_set = 0;

        WARN(!(wiphy->regulatory_flags & REGULATORY_CUSTOM_REG),
             "wiphy should have REGULATORY_CUSTOM_REG\n");
        wiphy->regulatory_flags |= REGULATORY_CUSTOM_REG;

        for (band = 0; band < NUM_NL80211_BANDS; band++) {
                if (!wiphy->bands[band])
                        continue;
                handle_band_custom(wiphy, wiphy->bands[band], regd);
                bands_set++;
        }

        /*
         * no point in calling this if it won't have any effect
         * on your device's supported bands.
         */
        WARN_ON(!bands_set);
        new_regd = reg_copy_regd(regd);
        if (IS_ERR(new_regd))
                return;

        rtnl_lock();
        wiphy_lock(wiphy);

        tmp = get_wiphy_regdom(wiphy);
        rcu_assign_pointer(wiphy->regd, new_regd);
        rcu_free_regdom(tmp);

        wiphy_unlock(wiphy);
        rtnl_unlock();
}
EXPORT_SYMBOL(wiphy_apply_custom_regulatory);

/*
 * Mark the last regulatory request as processed, cancel the CRDA timeout
 * and, if further requests are queued, schedule reg_work.
 */
static void reg_set_request_processed(void)
{
        struct regulatory_request *lr = get_last_request();
        bool more_pending;

        lr->processed = true;

        spin_lock(&reg_requests_lock);
        more_pending = !list_empty(&reg_requests_list);
        spin_unlock(&reg_requests_lock);

        cancel_crda_timeout();

        if (more_pending)
                schedule_work(&reg_work);
}

/**
 * reg_process_hint_core - process core regulatory requests
 * @core_request: a pending core regulatory request
 *
 * Handles a regulatory request issued by the regulatory core: the
 * database is queried and, when that succeeds, the request becomes the
 * last request with its intersect and processed fields cleared.
 *
 * Returns: %REG_REQ_OK or %REG_REQ_IGNORE, indicating if the
 *        hint was processed or ignored
 */
static enum reg_request_treatment
reg_process_hint_core(struct regulatory_request *core_request)
{
        if (!reg_query_database(core_request))
                return REG_REQ_IGNORE;

        core_request->processed = false;
        core_request->intersect = false;
        reg_update_last_request(core_request);

        return REG_REQ_OK;
}

/*
 * Decide how a user regulatory request relates to the last request.
 * Returns the treatment to apply; no state is modified here.
 */
static enum reg_request_treatment
__reg_process_hint_user(struct regulatory_request *user_request)
{
        struct regulatory_request *lr = get_last_request();

        if (reg_request_cell_base(user_request))
                return reg_ignore_cell_hint(user_request);

        if (reg_request_cell_base(lr))
                return REG_REQ_IGNORE;

        if (lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE)
                return REG_REQ_INTERSECT;

        if (lr->initiator == NL80211_REGDOM_SET_BY_USER) {
                /*
                 * A user who knows better should have set the regdom to
                 * their country before the IE was picked up. Beyond that,
                 * a previous user request must be fully applied first.
                 */
                if (lr->intersect || regdom_changes(lr->alpha2))
                        return REG_REQ_IGNORE;
        } else if (lr->initiator == NL80211_REGDOM_SET_BY_CORE ||
                   lr->initiator == NL80211_REGDOM_SET_BY_DRIVER) {
                /*
                 * User requests are only processed once previous
                 * driver/core requests have been processed.
                 */
                if (regdom_changes(lr->alpha2))
                        return REG_REQ_IGNORE;
        }

        return regdom_changes(user_request->alpha2) ?
               REG_REQ_OK : REG_REQ_ALREADY_SET;
}

/**
 * reg_process_hint_user - process user regulatory requests
 * @user_request: a pending user regulatory request
 *
 * The wireless subsystem can use this function to process
 * a regulatory request initiated by userspace.
 *
 * Returns: %REG_REQ_OK or %REG_REQ_IGNORE, indicating if the
 *        hint was processed or ignored
 */
static enum reg_request_treatment
reg_process_hint_user(struct regulatory_request *user_request)
{
        enum reg_request_treatment treatment;

        treatment = __reg_process_hint_user(user_request);
        if (treatment == REG_REQ_IGNORE ||
            treatment == REG_REQ_ALREADY_SET)
                return REG_REQ_IGNORE;

        user_request->intersect = treatment == REG_REQ_INTERSECT;
        user_request->processed = false;

        if (reg_query_database(user_request)) {
                reg_update_last_request(user_request);
                user_alpha2[0] = user_request->alpha2[0];
                user_alpha2[1] = user_request->alpha2[1];
                return REG_REQ_OK;
        }

        return REG_REQ_IGNORE;
}

/*
 * Decide how a driver regulatory request relates to the last request.
 * Returns the treatment to apply; no state is modified here.
 */
static enum reg_request_treatment
__reg_process_hint_driver(struct regulatory_request *driver_request)
{
        struct regulatory_request *lr = get_last_request();

        if (lr->initiator == NL80211_REGDOM_SET_BY_CORE)
                return regdom_changes(driver_request->alpha2) ?
                       REG_REQ_OK : REG_REQ_ALREADY_SET;

        if (lr->initiator != NL80211_REGDOM_SET_BY_DRIVER)
                return REG_REQ_INTERSECT;

        if (regdom_changes(driver_request->alpha2))
                return REG_REQ_INTERSECT;

        /*
         * Reached when a card is unplugged and plugged back in, or when a
         * new device is added that agrees with the previously loaded card
         * on the regulatory domain.
         */
        return REG_REQ_ALREADY_SET;
}

/**
 * reg_process_hint_driver - process driver regulatory requests
 * @wiphy: the wireless device for the regulatory request
 * @driver_request: a pending driver regulatory request
 *
 * The wireless subsystem can use this function to process
 * a regulatory request issued by an 802.11 driver.
 *
 * For the intersect and already-set treatments a copy of the current
 * cfg80211 regdomain is installed as @wiphy's private regdomain before
 * anything else happens.
 *
 * Returns: one of the different reg request treatment values:
 * %REG_REQ_IGNORE when the request is classified as ignorable, when the
 * regdomain copy fails or when reg_query_database() returns false;
 * %REG_REQ_ALREADY_SET when the domain was already applied;
 * %REG_REQ_OK otherwise.
 */
static enum reg_request_treatment
reg_process_hint_driver(struct wiphy *wiphy,
                        struct regulatory_request *driver_request)
{
        const struct ieee80211_regdomain *regd, *tmp;
        enum reg_request_treatment treatment;

        treatment = __reg_process_hint_driver(driver_request);

        switch (treatment) {
        case REG_REQ_OK:
                break;
        case REG_REQ_IGNORE:
                return REG_REQ_IGNORE;
        case REG_REQ_INTERSECT:
        case REG_REQ_ALREADY_SET:
                regd = reg_copy_regd(get_cfg80211_regdom());
                if (IS_ERR(regd))
                        return REG_REQ_IGNORE;

                /*
                 * Swap in the copy under the wiphy lock; the previous
                 * wiphy regdomain is released afterwards.
                 */
                tmp = get_wiphy_regdom(wiphy);
                ASSERT_RTNL();
                wiphy_lock(wiphy);
                rcu_assign_pointer(wiphy->regd, regd);
                wiphy_unlock(wiphy);
                rcu_free_regdom(tmp);
        }


        driver_request->intersect = treatment == REG_REQ_INTERSECT;
        driver_request->processed = false;

        /*
         * Since CRDA will not be called in this case as we already
         * have applied the requested regulatory domain before we just
         * inform userspace we have processed the request
         */
        if (treatment == REG_REQ_ALREADY_SET) {
                nl80211_send_reg_change_event(driver_request);
                reg_update_last_request(driver_request);
                reg_set_request_processed();
                return REG_REQ_ALREADY_SET;
        }

        /* only a successfully submitted query becomes the last request */
        if (reg_query_database(driver_request)) {
                reg_update_last_request(driver_request);
                return REG_REQ_OK;
        }

        return REG_REQ_IGNORE;
}

/*
 * Decide how a country IE regulatory request relates to the last request.
 *
 * Returns the treatment to apply; no state is modified here:
 *  - the last request came from a cell base station: %REG_REQ_IGNORE if
 *    the IE would change the regdomain, else %REG_REQ_ALREADY_SET
 *  - @wiphy has REGULATORY_COUNTRY_IE_IGNORE set: %REG_REQ_IGNORE
 *  - the request's alpha2 is not a valid alpha2: %REG_REQ_IGNORE
 *  - the last request was not set by a country IE: %REG_REQ_OK
 *  - the last country IE came from a different wiphy: %REG_REQ_IGNORE if
 *    the regdomain would change, else %REG_REQ_ALREADY_SET
 *  - same wiphy: %REG_REQ_OK if the regdomain would change, else
 *    %REG_REQ_ALREADY_SET
 */
static enum reg_request_treatment
__reg_process_hint_country_ie(struct wiphy *wiphy,
                              struct regulatory_request *country_ie_request)
{
        struct wiphy *last_wiphy;
        struct regulatory_request *lr = get_last_request();

        if (reg_request_cell_base(lr)) {
                /* Trust a Cell base station over the AP's country IE */
                if (regdom_changes(country_ie_request->alpha2))
                        return REG_REQ_IGNORE;
                return REG_REQ_ALREADY_SET;
        }

        if (wiphy->regulatory_flags & REGULATORY_COUNTRY_IE_IGNORE)
                return REG_REQ_IGNORE;

        /*
         * The return type is enum reg_request_treatment, so a negative
         * errno is not a valid result: code switching on the treatment
         * would not recognise it and would carry on as if the request
         * were acceptable. Report a malformed alpha2 as ignorable.
         */
        if (unlikely(!is_an_alpha2(country_ie_request->alpha2)))
                return REG_REQ_IGNORE;

        if (lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE)
                return REG_REQ_OK;

        last_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx);

        if (last_wiphy != wiphy) {
                /*
                 * Two cards with two APs claiming different
                 * Country IE alpha2s. We could
                 * intersect them, but that seems unlikely
                 * to be correct. Reject second one for now.
                 */
                if (regdom_changes(country_ie_request->alpha2))
                        return REG_REQ_IGNORE;
                return REG_REQ_ALREADY_SET;
        }

        if (regdom_changes(country_ie_request->alpha2))
                return REG_REQ_OK;
        return REG_REQ_ALREADY_SET;
}

/**
 * reg_process_hint_country_ie - process regulatory requests from country IEs
 * @wiphy: the wireless device for the regulatory request
 * @country_ie_request: a regulatory request from a country IE
 *
 * Handles a regulatory request that originates from a country
 * Information Element. An already-set request is freed here; any request
 * that is neither ignored, already set nor an intersection goes on to the
 * database query and, on success, becomes the last request.
 *
 * Returns: one of the different reg request treatment values.
 */
static enum reg_request_treatment
reg_process_hint_country_ie(struct wiphy *wiphy,
                            struct regulatory_request *country_ie_request)
{
        enum reg_request_treatment treatment;

        treatment = __reg_process_hint_country_ie(wiphy, country_ie_request);

        if (treatment == REG_REQ_IGNORE)
                return REG_REQ_IGNORE;

        if (treatment == REG_REQ_ALREADY_SET) {
                reg_free_request(country_ie_request);
                return REG_REQ_ALREADY_SET;
        }

        if (treatment == REG_REQ_INTERSECT) {
                /*
                 * Intersection is not produced for country elements so
                 * far, and it is unclear whether it ever should be.
                 */
                WARN_ONCE(1, "Unexpected intersection for country elements");
                return REG_REQ_IGNORE;
        }

        country_ie_request->processed = false;
        country_ie_request->intersect = false;

        if (!reg_query_database(country_ie_request))
                return REG_REQ_IGNORE;

        reg_update_last_request(country_ie_request);
        return REG_REQ_OK;
}

/*
 * Report whether @wiphy1 and @wiphy2 have the same dfs_region. A wiphy
 * without a regdomain of its own is compared using the cfg80211
 * regdomain instead. All pointers are read inside one RCU read-side
 * critical section.
 */
bool reg_dfs_domain_same(struct wiphy *wiphy1, struct wiphy *wiphy2)
{
        const struct ieee80211_regdomain *fallback, *rd1, *rd2;
        bool same;

        rcu_read_lock();

        fallback = rcu_dereference(cfg80211_regdomain);

        rd1 = rcu_dereference(wiphy1->regd);
        if (!rd1)
                rd1 = fallback;

        rd2 = rcu_dereference(wiphy2->regd);
        if (!rd2)
                rd2 = fallback;

        same = (rd1->dfs_region == rd2->dfs_region);

        rcu_read_unlock();

        return same;
}

/*
 * Copy dfs_state and dfs_state_entered from @src_chan to @dst_chan.
 *
 * The copy only happens when both channels carry IEEE80211_CHAN_RADAR,
 * neither is IEEE80211_CHAN_DISABLED, their center frequencies match and
 * the destination is still in NL80211_DFS_USABLE.
 */
static void reg_copy_dfs_chan_state(struct ieee80211_channel *dst_chan,
                                    struct ieee80211_channel *src_chan)
{
        if (!(dst_chan->flags & IEEE80211_CHAN_RADAR))
                return;
        if (!(src_chan->flags & IEEE80211_CHAN_RADAR))
                return;

        if (dst_chan->flags & IEEE80211_CHAN_DISABLED)
                return;
        if (src_chan->flags & IEEE80211_CHAN_DISABLED)
                return;

        if (src_chan->center_freq != dst_chan->center_freq)
                return;
        if (dst_chan->dfs_state != NL80211_DFS_USABLE)
                return;

        dst_chan->dfs_state = src_chan->dfs_state;
        dst_chan->dfs_state_entered = src_chan->dfs_state_entered;
}

/*
 * For every band both wiphys have, offer each source channel's DFS state
 * to each destination channel through reg_copy_dfs_chan_state(). Nothing
 * is done unless reg_dfs_domain_same() holds for the two wiphys.
 */
static void wiphy_share_dfs_chan_state(struct wiphy *dst_wiphy,
                                       struct wiphy *src_wiphy)
{
        struct ieee80211_supported_band *from, *to;
        int band, d, s;

        if (!reg_dfs_domain_same(dst_wiphy, src_wiphy))
                return;

        for (band = 0; band < NUM_NL80211_BANDS; band++) {
                to = dst_wiphy->bands[band];
                from = src_wiphy->bands[band];

                if (to && from) {
                        for (d = 0; d < to->n_channels; d++)
                                for (s = 0; s < from->n_channels; s++)
                                        reg_copy_dfs_chan_state(&to->channels[d],
                                                                &from->channels[s]);
                }
        }
}

/*
 * Pull DFS channel state into @wiphy from the wiphy of every other
 * registered device. Asserts that the RTNL is held.
 */
static void wiphy_all_share_dfs_chan_state(struct wiphy *wiphy)
{
        struct cfg80211_registered_device *other;

        ASSERT_RTNL();

        for_each_rdev(other) {
                if (wiphy != &other->wiphy)
                        wiphy_share_dfs_chan_state(wiphy, &other->wiphy);
        }
}

/*
 * This processes *all* regulatory hints.
 *
 * The request is dispatched to the handler matching its initiator. Driver
 * and country IE requests need a wiphy and are freed when none can be
 * resolved from the request's wiphy_idx. The request is also freed when
 * the initiator is unknown or the handler decides to ignore it.
 */
static void reg_process_hint(struct regulatory_request *reg_request)
{
        enum nl80211_reg_initiator initiator = reg_request->initiator;
        enum reg_request_treatment treatment;
        struct wiphy *wiphy = NULL;

        if (reg_request->wiphy_idx != WIPHY_IDX_INVALID)
                wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx);

        if (initiator == NL80211_REGDOM_SET_BY_CORE) {
                treatment = reg_process_hint_core(reg_request);
        } else if (initiator == NL80211_REGDOM_SET_BY_USER) {
                treatment = reg_process_hint_user(reg_request);
        } else if (initiator == NL80211_REGDOM_SET_BY_DRIVER) {
                if (!wiphy)
                        goto out_free;
                treatment = reg_process_hint_driver(wiphy, reg_request);
        } else if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE) {
                if (!wiphy)
                        goto out_free;
                treatment = reg_process_hint_country_ie(wiphy, reg_request);
        } else {
                WARN(1, "invalid initiator %d\n", initiator);
                goto out_free;
        }

        if (treatment == REG_REQ_IGNORE)
                goto out_free;

        WARN(treatment != REG_REQ_OK && treatment != REG_REQ_ALREADY_SET,
             "unexpected treatment value %d\n", treatment);

        /*
         * Needed so that the orig_* parameters get saved.
         * NOTE: every path reaching this point must have set treatment!
         */
        if (treatment == REG_REQ_ALREADY_SET && wiphy &&
            wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
                wiphy_update_regulatory(wiphy, initiator);
                wiphy_all_share_dfs_chan_state(wiphy);
                reg_check_channels();
        }

        return;

out_free:
        reg_free_request(reg_request);
}

static void notify_self_managed_wiphys(struct regulatory_request *request)
{
        struct cfg80211_registered_device *rdev;
        struct wiphy *wiphy;

        for_each_rdev(rdev) {
                wiphy = &rdev->wiphy;
                if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED &&
                    request->initiator == NL80211_REGDOM_SET_BY_USER)
                        reg_call_notifier(wiphy, request);
        }
}

/*
 * Processes regulatory hints, this is all the NL80211_REGDOM_SET_BY_*
 * Regulatory hints come on a first come first serve basis and we
 * must process each one atomically.
 *
 * At most one request is taken off reg_requests_list per call. Nothing is
 * dequeued while the last request is still unprocessed. After handling
 * the request, reg_work is scheduled again if more requests are queued
 * and the (possibly new) last request is already processed.
 */
static void reg_process_pending_hints(void)
{
        struct regulatory_request *reg_request, *lr;

        lr = get_last_request();

        /* When last_request->processed becomes true this will be rescheduled */
        if (lr && !lr->processed) {
                pr_debug("Pending regulatory request, waiting for it to be processed...\n");
                return;
        }

        spin_lock(&reg_requests_lock);

        if (list_empty(&reg_requests_list)) {
                spin_unlock(&reg_requests_lock);
                return;
        }

        /* detach the oldest queued request while holding the list lock */
        reg_request = list_first_entry(&reg_requests_list,
                                       struct regulatory_request,
                                       list);
        list_del_init(&reg_request->list);

        spin_unlock(&reg_requests_lock);

        notify_self_managed_wiphys(reg_request);

        reg_process_hint(reg_request);

        /* re-read: processing the hint may have replaced the last request */
        lr = get_last_request();

        spin_lock(&reg_requests_lock);
        if (!list_empty(&reg_requests_list) && lr && lr->processed)
                schedule_work(&reg_work);
        spin_unlock(&reg_requests_lock);
}

/* Processes beacon hints -- this has nothing to do with country IEs */
static void reg_process_pending_beacon_hints(void)
{
        struct cfg80211_registered_device *rdev;
        struct reg_beacon *pending_beacon, *tmp;

        /* This goes through the _pending_ beacon list */
        spin_lock_bh(&reg_pending_beacons_lock);

        list_for_each_entry_safe(pending_beacon, tmp,
                                 &reg_pending_beacons, list) {
                list_del_init(&pending_beacon->list);

                /* Applies the beacon hint to current wiphys */
                for_each_rdev(rdev)
                        wiphy_update_new_beacon(&rdev->wiphy, pending_beacon);

                /* Remembers the beacon hint for new wiphys or reg changes */
                list_add_tail(&pending_beacon->list, &reg_beacon_list);
        }

        spin_unlock_bh(&reg_pending_beacons_lock);
}

static void reg_process_self_managed_hint(struct wiphy *wiphy)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        const struct ieee80211_regdomain *tmp;
        const struct ieee80211_regdomain *regd;
        enum nl80211_band band;
        struct regulatory_request request = {};

        ASSERT_RTNL();
        lockdep_assert_wiphy(wiphy);

        spin_lock(&reg_requests_lock);
        regd = rdev->requested_regd;
        rdev->requested_regd = NULL;
        spin_unlock(&reg_requests_lock);

        if (!regd)
                return;

        tmp = get_wiphy_regdom(wiphy);
        rcu_assign_pointer(wiphy->regd, regd);
        rcu_free_regdom(tmp);

        for (band = 0; band < NUM_NL80211_BANDS; band++)
                handle_band_custom(wiphy, wiphy->bands[band], regd);

        reg_process_ht_flags(wiphy);

        request.wiphy_idx = get_wiphy_idx(wiphy);
        request.alpha2[0] = regd->alpha2[0];
        request.alpha2[1] = regd->alpha2[1];
        request.initiator = NL80211_REGDOM_SET_BY_DRIVER;

        if (wiphy->flags & WIPHY_FLAG_NOTIFY_REGDOM_BY_DRIVER)
                reg_call_notifier(wiphy, &request);

        nl80211_send_wiphy_reg_change_event(&request);
}

/*
 * Run reg_process_self_managed_hint() on the wiphy of every registered
 * device, taking each wiphy's lock around the call, then schedule the
 * channel check. Asserts that the RTNL is held.
 */
static void reg_process_self_managed_hints(void)
{
        struct cfg80211_registered_device *dev;

        ASSERT_RTNL();

        for_each_rdev(dev) {
                wiphy_lock(&dev->wiphy);
                reg_process_self_managed_hint(&dev->wiphy);
                wiphy_unlock(&dev->wiphy);
        }

        reg_check_channels();
}

/*
 * Work handler for regulatory processing: with the RTNL held, process
 * pending regulatory hints, then pending beacon hints, then self-managed
 * hints, in that order. @work itself is not used.
 */
static void reg_todo(struct work_struct *work)
{
        rtnl_lock();
        reg_process_pending_hints();
        reg_process_pending_beacon_hints();
        reg_process_self_managed_hints();
        rtnl_unlock();
}

/*
 * Upper-case the two alpha2 characters of @request, append the request to
 * reg_requests_list under reg_requests_lock and schedule reg_work.
 */
static void queue_regulatory_request(struct regulatory_request *request)
{
        char *cc = request->alpha2;

        cc[0] = toupper(cc[0]);
        cc[1] = toupper(cc[1]);

        spin_lock(&reg_requests_lock);
        list_add_tail(&request->list, &reg_requests_list);
        spin_unlock(&reg_requests_lock);

        schedule_work(&reg_work);
}

/*
 * Core regulatory hint -- happens during cfg80211_init()
 * and when we restore regulatory settings.
 *
 * Allocates a NL80211_REGDOM_SET_BY_CORE request for @alpha2 that is not
 * bound to any wiphy and queues it. Returns 0, or -ENOMEM when the
 * allocation fails.
 */
static int regulatory_hint_core(const char *alpha2)
{
        struct regulatory_request *req;

        req = kzalloc(sizeof(*req), GFP_KERNEL);
        if (!req)
                return -ENOMEM;

        req->initiator = NL80211_REGDOM_SET_BY_CORE;
        req->wiphy_idx = WIPHY_IDX_INVALID;
        req->alpha2[0] = alpha2[0];
        req->alpha2[1] = alpha2[1];

        queue_regulatory_request(req);

        return 0;
}

/*
 * User hints.
 *
 * Queues a NL80211_REGDOM_SET_BY_USER request for @alpha2, tagged with
 * @user_reg_hint_type and not bound to any wiphy. Returns 0 on success,
 * -EINVAL when @alpha2 is NULL or neither the world regdomain nor a valid
 * alpha2, and -ENOMEM when the allocation fails.
 */
int regulatory_hint_user(const char *alpha2,
                         enum nl80211_user_reg_hint_type user_reg_hint_type)
{
        struct regulatory_request *req;

        if (WARN_ON(!alpha2))
                return -EINVAL;

        if (!is_world_regdom(alpha2) && !is_an_alpha2(alpha2))
                return -EINVAL;

        req = kzalloc(sizeof(*req), GFP_KERNEL);
        if (!req)
                return -ENOMEM;

        req->initiator = NL80211_REGDOM_SET_BY_USER;
        req->user_reg_hint_type = user_reg_hint_type;
        req->wiphy_idx = WIPHY_IDX_INVALID;
        req->alpha2[0] = alpha2[0];
        req->alpha2[1] = alpha2[1];

        /* Allow calling CRDA again */
        reset_crda_timeouts();

        queue_regulatory_request(req);

        return 0;
}

/*
 * Record the indoor setting reported by the netlink port @portid and, when
 * the setting is cleared, schedule the channel check.
 */
void regulatory_hint_indoor(bool is_indoor, u32 portid)
{
        spin_lock(&reg_indoor_lock);

        /*
         * Several user space processes may try to configure the indoor
         * setting. Whenever one of them does not consider the device to
         * be indoors, the setting is cleared together with its owner.
         * When a process reports indoor operation and no owner is
         * recorded yet, its portid is saved, i.e. it becomes the owner.
         */
        reg_is_indoor = is_indoor;
        if (!is_indoor)
                reg_is_indoor_portid = 0;
        else if (!reg_is_indoor_portid)
                reg_is_indoor_portid = portid;

        spin_unlock(&reg_indoor_lock);

        if (!is_indoor)
                reg_check_channels();
}

/*
 * Netlink notification for @portid: if that port owns the indoor
 * setting, clear both the setting and the ownership and re-check the
 * channels. Any other port is ignored.
 */
void regulatory_netlink_notify(u32 portid)
{
        bool was_owner;

        spin_lock(&reg_indoor_lock);

        was_owner = (reg_is_indoor_portid == portid);
        if (was_owner) {
                reg_is_indoor = false;
                reg_is_indoor_portid = 0;
        }

        spin_unlock(&reg_indoor_lock);

        /* the channel re-check happens outside the lock */
        if (was_owner)
                reg_check_channels();
}

/* Driver hints */
/*
 * Queue a regulatory request issued by the driver of @wiphy.
 *
 * @wiphy: device the hint comes from; its REGULATORY_CUSTOM_REG flag is
 *      cleared before the request is allocated, so it stays cleared even
 *      when -ENOMEM is returned.
 * @alpha2: two-character code copied into the request.
 *
 * Returns 0 once queued, -EINVAL if either argument is NULL and -ENOMEM
 * on allocation failure.
 */
int regulatory_hint(struct wiphy *wiphy, const char *alpha2)
{
        struct regulatory_request *req;

        if (WARN_ON(!alpha2 || !wiphy))
                return -EINVAL;

        wiphy->regulatory_flags &= ~REGULATORY_CUSTOM_REG;

        req = kzalloc(sizeof(*req), GFP_KERNEL);
        if (!req)
                return -ENOMEM;

        req->initiator = NL80211_REGDOM_SET_BY_DRIVER;
        req->wiphy_idx = get_wiphy_idx(wiphy);
        req->alpha2[0] = alpha2[0];
        req->alpha2[1] = alpha2[1];

        /* Allow calling CRDA again */
        reset_crda_timeouts();

        queue_regulatory_request(req);

        return 0;
}
EXPORT_SYMBOL(regulatory_hint);

/*
 * Turn a country information element seen by @wiphy into a queued
 * regulatory request.
 *
 * @country_ie / @country_ie_len: raw IE payload. Odd lengths and lengths
 *      below IEEE80211_COUNTRY_IE_MIN_LEN are ignored. Bytes 0 and 1 are
 *      used as the alpha2, byte 2 selects the environment ('I' indoor,
 *      'O' outdoor, anything else "any").
 * @band: not used by this function.
 *
 * Nothing is queued when there is no last request, or when the last
 * request was itself a country IE hint that is still bound to a wiphy.
 */
void regulatory_hint_country_ie(struct wiphy *wiphy, enum nl80211_band band,
                                const u8 *country_ie, u8 country_ie_len)
{
        struct regulatory_request *req, *last;
        enum environment_cap env;

        /* IE len must be evenly divisible by 2 and not too short */
        if ((country_ie_len & 0x01) ||
            country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN)
                return;

        /* allocate before entering the RCU read-side section below */
        req = kzalloc(sizeof(*req), GFP_KERNEL);
        if (!req)
                return;

        switch (country_ie[2]) {
        case 'I':
                env = ENVIRON_INDOOR;
                break;
        case 'O':
                env = ENVIRON_OUTDOOR;
                break;
        default:
                env = ENVIRON_ANY;
                break;
        }

        rcu_read_lock();
        last = get_last_request();

        /*
         * We will run this only upon a successful connection on cfg80211.
         * We leave conflict resolution to the workqueue, where we can
         * hold the RTNL.
         */
        if (unlikely(!last) ||
            (last->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE &&
             last->wiphy_idx != WIPHY_IDX_INVALID)) {
                kfree(req);
                goto unlock;
        }

        req->initiator = NL80211_REGDOM_SET_BY_COUNTRY_IE;
        req->wiphy_idx = get_wiphy_idx(wiphy);
        req->alpha2[0] = country_ie[0];
        req->alpha2[1] = country_ie[1];
        req->country_ie_env = env;

        /* Allow calling CRDA again */
        reset_crda_timeouts();

        queue_regulatory_request(req);
unlock:
        rcu_read_unlock();
}

/*
 * Pick the alpha2 that a restore should re-apply and store it in @alpha2.
 *
 * @alpha2: output, two characters. "97" is written first and stays in
 *      place when there is nothing to restore.
 * @reset_user: when a user regdomain is saved, true discards it
 *      (user_alpha2 becomes "97") and false selects it.
 *
 * A saved user setting takes precedence over the ieee80211_regdom module
 * parameter; the module parameter is used whenever the user setting is
 * absent or discarded and the parameter is not the world regdomain.
 */
static void restore_alpha2(char *alpha2, bool reset_user)
{
        /* indicates there is no alpha2 to consider for restoration */
        alpha2[0] = '9';
        alpha2[1] = '7';

        if (is_user_regdom_saved()) {
                if (!reset_user) {
                        pr_debug("Restoring regulatory settings while preserving user preference for: %c%c\n",
                                 user_alpha2[0], user_alpha2[1]);
                        alpha2[0] = user_alpha2[0];
                        alpha2[1] = user_alpha2[1];
                        return;
                }

                /* asked to ignore the user preference and reset it */
                pr_debug("Restoring regulatory settings including user preference\n");
                user_alpha2[0] = '9';
                user_alpha2[1] = '7';

                /*
                 * For a full restore the module parameter still has to
                 * be honoured, so fall through to the check below.
                 */
                if (is_world_regdom(ieee80211_regdom))
                        return;
        } else if (is_world_regdom(ieee80211_regdom)) {
                pr_debug("Restoring regulatory settings\n");
                return;
        }

        pr_debug("Keeping preference on module parameter ieee80211_regdom: %c%c\n",
                 ieee80211_regdom[0], ieee80211_regdom[1]);
        alpha2[0] = ieee80211_regdom[0];
        alpha2[1] = ieee80211_regdom[1];
}

/*
 * Put every channel of every band registered on @wiphy back to its
 * original flags, antenna gain and power, and forget that a beacon was
 * seen on it. Bands that the wiphy does not provide are skipped.
 */
static void restore_custom_reg_settings(struct wiphy *wiphy)
{
        enum nl80211_band band;

        for (band = 0; band < NUM_NL80211_BANDS; band++) {
                struct ieee80211_supported_band *sband = wiphy->bands[band];
                int idx;

                if (!sband)
                        continue;

                for (idx = 0; idx < sband->n_channels; idx++) {
                        struct ieee80211_channel *ch = &sband->channels[idx];

                        ch->flags = ch->orig_flags;
                        ch->max_antenna_gain = ch->orig_mag;
                        ch->max_power = ch->orig_mpwr;
                        ch->beacon_found = false;
                }
        }
}

/*
 * Restoring regulatory settings involves ignoring any
 * possibly stale country IE information and user regulatory
 * settings if so desired, this includes any beacon hints
 * learned as we could have traveled outside to another country
 * after disconnection. To restore regulatory settings we do
 * exactly what we did at bootup:
 *
 *   - send a core regulatory hint
 *   - send a user regulatory hint if applicable
 *
 * Device drivers that send a regulatory hint for a specific country
 * keep their own regulatory domain on wiphy->regd so that does
 * not need to be remembered.
 *
 * @reset_user: forwarded to restore_alpha2(), which decides which alpha2
 *      (if any) is re-applied after the world regdomain.
 * @cached: when true, and either no alpha2 is to be restored or
 *      cfg80211_user_regdom holds a usable pointer, the world regdomain
 *      is installed directly here instead of queuing core/user hints.
 *
 * The caller must hold the RTNL (checked by ASSERT_RTNL()). Requests that
 * were already pending are parked on a local list and re-appended after
 * the restore requests, then reg_work is scheduled.
 */
static void restore_regulatory_settings(bool reset_user, bool cached)
{
        char alpha2[2];
        char world_alpha2[2];
        struct reg_beacon *reg_beacon, *btmp;
        LIST_HEAD(tmp_reg_req_list);
        struct cfg80211_registered_device *rdev;

        ASSERT_RTNL();

        /*
         * Clear the indoor setting in case that it is not controlled by user
         * space, as otherwise there is no guarantee that the device is still
         * operating in an indoor environment.
         */
        spin_lock(&reg_indoor_lock);
        if (reg_is_indoor && !reg_is_indoor_portid) {
                reg_is_indoor = false;
                reg_check_channels();
        }
        spin_unlock(&reg_indoor_lock);

        reset_regdomains(true, &world_regdom);
        restore_alpha2(alpha2, reset_user);

        /*
         * If there's any pending requests we simply
         * stash them to a temporary pending queue and
         * add them after we've restored regulatory
         * settings.
         */
        spin_lock(&reg_requests_lock);
        list_splice_tail_init(&reg_requests_list, &tmp_reg_req_list);
        spin_unlock(&reg_requests_lock);

        /* Clear beacon hints */
        spin_lock_bh(&reg_pending_beacons_lock);
        list_for_each_entry_safe(reg_beacon, btmp, &reg_pending_beacons, list) {
                list_del(&reg_beacon->list);
                kfree(reg_beacon);
        }
        spin_unlock_bh(&reg_pending_beacons_lock);

        /* reg_beacon_list is emptied here without taking a spinlock */
        list_for_each_entry_safe(reg_beacon, btmp, &reg_beacon_list, list) {
                list_del(&reg_beacon->list);
                kfree(reg_beacon);
        }

        /* First restore to the basic regulatory settings */
        world_alpha2[0] = cfg80211_world_regdom->alpha2[0];
        world_alpha2[1] = cfg80211_world_regdom->alpha2[1];

        /*
         * Only wiphys with REGULATORY_CUSTOM_REG that are not self-managed
         * get their per-channel settings reset.
         */
        for_each_rdev(rdev) {
                if (rdev->wiphy.regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED)
                        continue;
                if (rdev->wiphy.regulatory_flags & REGULATORY_CUSTOM_REG)
                        restore_custom_reg_settings(&rdev->wiphy);
        }

        if (cached && (!is_an_alpha2(alpha2) ||
                       !IS_ERR_OR_NULL(cfg80211_user_regdom))) {
                /*
                 * Cached path: apply the world regdomain right away and
                 * announce it as a core-initiated change.
                 */
                reset_regdomains(false, cfg80211_world_regdom);
                update_all_wiphy_regulatory(NL80211_REGDOM_SET_BY_CORE);
                print_regdomain(get_cfg80211_regdom());
                nl80211_send_reg_change_event(&core_request_world);
                reg_set_request_processed();

                if (is_an_alpha2(alpha2) &&
                    !regulatory_hint_user(alpha2, NL80211_USER_REG_HINT_USER)) {
                        struct regulatory_request *ureq;

                        /*
                         * Take the last queued request back off the list;
                         * presumably this is the one regulatory_hint_user()
                         * just queued (the list was emptied above) - it is
                         * then processed inline with a copy of the cached
                         * user regdomain.
                         */
                        spin_lock(&reg_requests_lock);
                        ureq = list_last_entry(&reg_requests_list,
                                               struct regulatory_request,
                                               list);
                        list_del(&ureq->list);
                        spin_unlock(&reg_requests_lock);

                        notify_self_managed_wiphys(ureq);
                        reg_update_last_request(ureq);
                        set_regdom(reg_copy_regd(cfg80211_user_regdom),
                                   REGD_SOURCE_CACHED);
                }
        } else {
                regulatory_hint_core(world_alpha2);

                /*
                 * This restores the ieee80211_regdom module parameter
                 * preference or the last user requested regulatory
                 * settings, user regulatory settings takes precedence.
                 */
                if (is_an_alpha2(alpha2))
                        regulatory_hint_user(alpha2, NL80211_USER_REG_HINT_USER);
        }

        /* Re-append the requests that were pending before the restore */
        spin_lock(&reg_requests_lock);
        list_splice_tail_init(&tmp_reg_req_list, &reg_requests_list);
        spin_unlock(&reg_requests_lock);

        pr_debug("Kicking the queue\n");

        schedule_work(&reg_work);
}

/*
 * Return true only if, for every registered device, each wireless_dev on
 * its wdev_list belongs to a wiphy that has @flag set in
 * regulatory_flags. Devices without any wireless_dev do not affect the
 * result. Each device's wdev_list is walked under its wiphy lock.
 */
static bool is_wiphy_all_set_reg_flag(enum ieee80211_regulatory_flags flag)
{
        struct cfg80211_registered_device *rdev;

        for_each_rdev(rdev) {
                struct wireless_dev *wdev;
                bool all_set = true;

                wiphy_lock(&rdev->wiphy);
                list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
                        if (wdev->wiphy->regulatory_flags & flag)
                                continue;

                        all_set = false;
                        break;
                }
                wiphy_unlock(&rdev->wiphy);

                if (!all_set)
                        return false;
        }

        return true;
}

/*
 * React to all devices having disconnected.
 *
 * If some wiphy still honours country IEs, the regulatory settings are
 * restored (user preference kept, cached path allowed). Otherwise no
 * restore is needed, but unless every wiphy also has beacon hints
 * disabled, the pending and processed beacon hint lists are emptied.
 */
void regulatory_hint_disconnect(void)
{
        struct reg_beacon *bcn, *next;

        if (!is_wiphy_all_set_reg_flag(REGULATORY_COUNTRY_IE_IGNORE)) {
                pr_debug("All devices are disconnected, going to restore regulatory settings\n");
                restore_regulatory_settings(false, true);
                return;
        }

        /* Nothing to clear when no wiphy makes use of beacon hints */
        if (is_wiphy_all_set_reg_flag(REGULATORY_DISABLE_BEACON_HINTS))
                return;

        spin_lock_bh(&reg_pending_beacons_lock);
        list_for_each_entry_safe(bcn, next, &reg_pending_beacons, list) {
                list_del(&bcn->list);
                kfree(bcn);
        }
        spin_unlock_bh(&reg_pending_beacons_lock);

        list_for_each_entry_safe(bcn, next, &reg_beacon_list, list) {
                list_del(&bcn->list);
                kfree(bcn);
        }
}

/*
 * Return true if @freq equals the frequency that
 * ieee80211_channel_to_frequency() reports for channel 12, 13 or 14 of
 * the 2 GHz band.
 */
static bool freq_is_chan_12_13_14(u32 freq)
{
        int chan;

        for (chan = 12; chan <= 14; chan++) {
                if (freq == ieee80211_channel_to_frequency(chan,
                                                           NL80211_BAND_2GHZ))
                        return true;
        }

        return false;
}

static bool pending_reg_beacon(struct ieee80211_channel *beacon_chan)
{
        struct reg_beacon *pending_beacon;

        list_for_each_entry(pending_beacon, &reg_pending_beacons, list)
                if (ieee80211_channel_equal(beacon_chan,
                                            &pending_beacon->chan))
                        return true;
        return false;
}

/*
 * Note that a beacon was received on @beacon_chan and queue it as a
 * pending beacon hint for reg_work.
 *
 * @wiphy: only used for the debug message.
 * @beacon_chan: channel the beacon was seen on; a copy is queued.
 * @gfp: allocation flags for the queued entry.
 *
 * The hint is dropped when the channel already has beacon_found set,
 * when it is a radar channel, when it is a 2 GHz channel other than
 * 12/13/14, when an equal channel is already pending, or when the
 * allocation fails.
 */
void regulatory_hint_found_beacon(struct wiphy *wiphy,
                                  struct ieee80211_channel *beacon_chan,
                                  gfp_t gfp)
{
        struct reg_beacon *entry;
        bool already_pending;

        if (beacon_chan->beacon_found)
                return;

        if (beacon_chan->flags & IEEE80211_CHAN_RADAR)
                return;

        if (beacon_chan->band == NL80211_BAND_2GHZ &&
            !freq_is_chan_12_13_14(beacon_chan->center_freq))
                return;

        spin_lock_bh(&reg_pending_beacons_lock);
        already_pending = pending_reg_beacon(beacon_chan);
        spin_unlock_bh(&reg_pending_beacons_lock);

        if (already_pending)
                return;

        entry = kzalloc(sizeof(*entry), gfp);
        if (!entry)
                return;

        pr_debug("Found new beacon on frequency: %d.%03d MHz (Ch %d) on %s\n",
                 beacon_chan->center_freq, beacon_chan->freq_offset,
                 ieee80211_freq_khz_to_channel(
                         ieee80211_channel_to_khz(beacon_chan)),
                 wiphy_name(wiphy));

        memcpy(&entry->chan, beacon_chan, sizeof(*beacon_chan));

        /*
         * Since we can be called from both BH and non-BH context
         * we must use spin_lock_bh()
         */
        spin_lock_bh(&reg_pending_beacons_lock);
        list_add_tail(&entry->list, &reg_pending_beacons);
        spin_unlock_bh(&reg_pending_beacons_lock);

        schedule_work(&reg_work);
}

/*
 * Emit one debug line per rule of @rd: frequency range, bandwidth
 * (plus the value from reg_get_max_bandwidth() for AUTO_BW rules),
 * antenna gain / EIRP and, for DFS rules, the CAC time in seconds.
 */
static void print_rd_rules(const struct ieee80211_regdomain *rd)
{
        char bw[32], cac_time[32];
        unsigned int idx;

        pr_debug("  (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp), (dfs_cac_time)\n");

        for (idx = 0; idx < rd->n_reg_rules; idx++) {
                const struct ieee80211_reg_rule *rule = &rd->reg_rules[idx];
                const struct ieee80211_freq_range *range = &rule->freq_range;
                const struct ieee80211_power_rule *power = &rule->power_rule;

                if (!(rule->flags & NL80211_RRF_AUTO_BW))
                        snprintf(bw, sizeof(bw), "%d KHz",
                                 range->max_bandwidth_khz);
                else
                        snprintf(bw, sizeof(bw), "%d KHz, %u KHz AUTO",
                                 range->max_bandwidth_khz,
                                 reg_get_max_bandwidth(rd, rule));

                if (!(rule->flags & NL80211_RRF_DFS))
                        scnprintf(cac_time, sizeof(cac_time), "N/A");
                else
                        scnprintf(cac_time, sizeof(cac_time), "%u s",
                                  rule->dfs_cac_ms/1000);

                /*
                 * There may not be documentation for max antenna gain
                 * in certain regions
                 */
                if (!power->max_antenna_gain) {
                        pr_debug("  (%d KHz - %d KHz @ %s), (N/A, %d mBm), (%s)\n",
                                range->start_freq_khz,
                                range->end_freq_khz,
                                bw,
                                power->max_eirp,
                                cac_time);
                        continue;
                }

                pr_debug("  (%d KHz - %d KHz @ %s), (%d mBi, %d mBm), (%s)\n",
                        range->start_freq_khz,
                        range->end_freq_khz,
                        bw,
                        power->max_antenna_gain,
                        power->max_eirp,
                        cac_time);
        }
}

/*
 * Return true for the DFS regions this code accepts (UNSET, FCC, ETSI
 * and JP). Any other value is logged at debug level and rejected.
 */
bool reg_supported_dfs_region(enum nl80211_dfs_regions dfs_region)
{
        if (dfs_region == NL80211_DFS_UNSET ||
            dfs_region == NL80211_DFS_FCC ||
            dfs_region == NL80211_DFS_ETSI ||
            dfs_region == NL80211_DFS_JP)
                return true;

        pr_debug("Ignoring unknown DFS master region: %d\n", dfs_region);
        return false;
}

static void print_regdomain(const struct ieee80211_regdomain *rd)
{
        struct regulatory_request *lr = get_last_request();

        if (is_intersected_alpha2(rd->alpha2)) {
                if (lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE) {
                        struct cfg80211_registered_device *rdev;
                        rdev = cfg80211_rdev_by_wiphy_idx(lr->wiphy_idx);
                        if (rdev) {
                                pr_debug("Current regulatory domain updated by AP to: %c%c\n",
                                        rdev->country_ie_alpha2[0],
                                        rdev->country_ie_alpha2[1]);
                        } else
                                pr_debug("Current regulatory domain intersected:\n");
                } else
                        pr_debug("Current regulatory domain intersected:\n");
        } else if (is_world_regdom(rd->alpha2)) {
                pr_debug("World regulatory domain updated:\n");
        } else {
                if (is_unknown_alpha2(rd->alpha2))
                        pr_debug("Regulatory domain changed to driver built-in settings (unknown country)\n");
                else {
                        if (reg_request_cell_base(lr))
                                pr_debug("Regulatory domain changed to country: %c%c by Cell Station\n",
                                        rd->alpha2[0], rd->alpha2[1]);
                        else
                                pr_debug("Regulatory domain changed to country: %c%c\n",
                                        rd->alpha2[0], rd->alpha2[1]);
                }
        }

        pr_debug(" DFS Master region: %s", reg_dfs_region_str(rd->dfs_region));
        print_rd_rules(rd);
}

/*
 * Log the alpha2 of @rd at debug level and then dump its rules.
 */
static void print_regdomain_info(const struct ieee80211_regdomain *rd)
{
        const char first = rd->alpha2[0];
        const char second = rd->alpha2[1];

        pr_debug("Regulatory domain: %c%c\n", first, second);
        print_rd_rules(rd);
}

/*
 * Apply @rd for a core-initiated request. Only a world regulatory
 * domain is accepted; it is passed to update_world_regdomain() and 0 is
 * returned. Anything else yields -EINVAL.
 */
static int reg_set_rd_core(const struct ieee80211_regdomain *rd)
{
        if (is_world_regdom(rd->alpha2)) {
                update_world_regdomain(rd);
                return 0;
        }

        return -EINVAL;
}

/*
 * Apply @rd for a user-initiated request.
 *
 * Returns -EALREADY when regdom_changes() reports no change, -EINVAL
 * when @rd fails validation or the intersection cannot be built, and 0
 * on success. On success without intersection @rd itself is handed to
 * reset_regdomains(); with intersection the intersected domain is
 * handed over and @rd is freed here. On error @rd is left untouched.
 */
static int reg_set_rd_user(const struct ieee80211_regdomain *rd,
                           struct regulatory_request *user_request)
{
        const struct ieee80211_regdomain *merged;

        if (!regdom_changes(rd->alpha2))
                return -EALREADY;

        if (!is_valid_rd(rd)) {
                pr_err("Invalid regulatory domain detected: %c%c\n",
                       rd->alpha2[0], rd->alpha2[1]);
                print_regdomain_info(rd);
                return -EINVAL;
        }

        if (user_request->intersect) {
                merged = regdom_intersect(rd, get_cfg80211_regdom());
                if (!merged)
                        return -EINVAL;

                /* the intersection replaces @rd, which is no longer needed */
                kfree(rd);
                reset_regdomains(false, merged);
        } else {
                reset_regdomains(false, rd);
        }

        return 0;
}

/*
 * Apply @rd for a driver-initiated request.
 *
 * Returns:
 *   -EINVAL   if @rd is a world regdomain, fails is_valid_rd(), or the
 *             intersection with the current regdomain cannot be built
 *   -EALREADY if regdom_changes() reports no change
 *   -ENODEV   if the requesting wiphy can no longer be looked up
 *   the error from reg_copy_regd() if the private copy cannot be made
 *   0         on success
 *
 * Without intersection the requesting wiphy receives a private copy of
 * @rd in wiphy->regd and @rd itself is handed to reset_regdomains().
 * With intersection the wiphy receives @rd itself and the intersected
 * domain is handed to reset_regdomains().
 */
static int reg_set_rd_driver(const struct ieee80211_regdomain *rd,
                             struct regulatory_request *driver_request)
{
        const struct ieee80211_regdomain *regd;
        const struct ieee80211_regdomain *intersected_rd = NULL;
        const struct ieee80211_regdomain *tmp = NULL;
        struct wiphy *request_wiphy;

        if (is_world_regdom(rd->alpha2))
                return -EINVAL;

        if (!regdom_changes(rd->alpha2))
                return -EALREADY;

        if (!is_valid_rd(rd)) {
                pr_err("Invalid regulatory domain detected: %c%c\n",
                       rd->alpha2[0], rd->alpha2[1]);
                print_regdomain_info(rd);
                return -EINVAL;
        }

        request_wiphy = wiphy_idx_to_wiphy(driver_request->wiphy_idx);
        if (!request_wiphy)
                return -ENODEV;

        if (!driver_request->intersect) {
                ASSERT_RTNL();
                wiphy_lock(request_wiphy);
                /* remember the previous private regdomain so it can be freed */
                if (request_wiphy->regd)
                        tmp = get_wiphy_regdom(request_wiphy);

                regd = reg_copy_regd(rd);
                if (IS_ERR(regd)) {
                        wiphy_unlock(request_wiphy);
                        return PTR_ERR(regd);
                }

                /* publish the copy first, then release the old one via RCU */
                rcu_assign_pointer(request_wiphy->regd, regd);
                rcu_free_regdom(tmp);
                wiphy_unlock(request_wiphy);
                reset_regdomains(false, rd);
                return 0;
        }

        intersected_rd = regdom_intersect(rd, get_cfg80211_regdom());
        if (!intersected_rd)
                return -EINVAL;

        /*
         * We can trash what CRDA provided now.
         * However if a driver requested this specific regulatory
         * domain we keep it for its private use
         */
        tmp = get_wiphy_regdom(request_wiphy);
        rcu_assign_pointer(request_wiphy->regd, rd);
        rcu_free_regdom(tmp);

        /* @rd now belongs to the wiphy; drop the local reference */
        rd = NULL;

        reset_regdomains(false, intersected_rd);

        return 0;
}

/*
 * Apply @rd for a request that was triggered by a country IE.
 *
 * Returns -EINVAL when the alpha2 of @rd is rejected by all of
 * is_alpha2_set(), is_an_alpha2() and is_unknown_alpha2(), when @rd
 * fails validation, or when the request asks for intersection;
 * -ENODEV when the requesting wiphy cannot be looked up; 0 after @rd
 * has been handed to reset_regdomains().
 */
static int reg_set_rd_country_ie(const struct ieee80211_regdomain *rd,
                                 struct regulatory_request *country_ie_request)
{
        if (!(is_alpha2_set(rd->alpha2) || is_an_alpha2(rd->alpha2) ||
              is_unknown_alpha2(rd->alpha2)))
                return -EINVAL;

        /*
         * NOTE(review): the comment previously at this spot said "Lets
         * only bother proceeding on the same alpha2 if the current rd is
         * non static (it means CRDA was present and was used last) and
         * the pending request came in from a country IE". No such check
         * is visible in this function - confirm whether it is stale.
         */

        if (!is_valid_rd(rd)) {
                pr_err("Invalid regulatory domain detected: %c%c\n",
                       rd->alpha2[0], rd->alpha2[1]);
                print_regdomain_info(rd);
                return -EINVAL;
        }

        /* the wiphy that sent the hint must still exist */
        if (!wiphy_idx_to_wiphy(country_ie_request->wiphy_idx))
                return -ENODEV;

        if (country_ie_request->intersect)
                return -EINVAL;

        reset_regdomains(false, rd);
        return 0;
}

/*
 * Use this call to set the current regulatory domain. Conflicts with
 * multiple drivers can be ironed out later. Caller must've already
 * kmalloc'd the rd structure.
 *
 * @rd: regulatory domain to apply. An ERR_PTR or NULL value is rejected
 *      with -ENODATA without being freed. On the other error returns
 *      before the final WARN_ON check, @rd is freed here.
 * @regd_src: where @rd came from; REGD_SOURCE_CRDA additionally resets
 *      the CRDA timeouts.
 *
 * The initiator of the last request selects which reg_set_rd_*() helper
 * processes @rd. Returns 0 on success or a negative errno.
 */
int set_regdom(const struct ieee80211_regdomain *rd,
               enum ieee80211_regd_source regd_src)
{
        struct regulatory_request *lr;
        bool user_reset = false;
        int r;

        if (IS_ERR_OR_NULL(rd))
                return -ENODATA;

        if (!reg_is_valid_request(rd->alpha2)) {
                kfree(rd);
                return -EINVAL;
        }

        if (regd_src == REGD_SOURCE_CRDA)
                reset_crda_timeouts();

        lr = get_last_request();

        /* Note that this doesn't update the wiphys, this is done below */
        switch (lr->initiator) {
        case NL80211_REGDOM_SET_BY_CORE:
                r = reg_set_rd_core(rd);
                break;
        case NL80211_REGDOM_SET_BY_USER:
                cfg80211_save_user_regdom(rd);
                r = reg_set_rd_user(rd, lr);
                /* a failed user request also resets the user preference below */
                user_reset = true;
                break;
        case NL80211_REGDOM_SET_BY_DRIVER:
                r = reg_set_rd_driver(rd, lr);
                break;
        case NL80211_REGDOM_SET_BY_COUNTRY_IE:
                r = reg_set_rd_country_ie(rd, lr);
                break;
        default:
                WARN(1, "invalid initiator %d\n", lr->initiator);
                kfree(rd);
                return -EINVAL;
        }

        if (r) {
                switch (r) {
                case -EALREADY:
                        /* nothing changed: just mark the request as done */
                        reg_set_request_processed();
                        break;
                default:
                        /* Back to world regulatory in case of errors */
                        restore_regulatory_settings(user_reset, false);
                }

                kfree(rd);
                return r;
        }

        /* This would make this whole thing pointless */
        if (WARN_ON(!lr->intersect && rd != get_cfg80211_regdom()))
                return -EINVAL;

        /* update all wiphys now with the new established regulatory domain */
        update_all_wiphy_regulatory(lr->initiator);

        print_regdomain(get_cfg80211_regdom());

        nl80211_send_reg_change_event(lr);

        reg_set_request_processed();

        return 0;
}

/*
 * Store a copy of @rd as the requested regulatory domain of the device
 * behind @wiphy; the copy previously stored there (if any) is freed.
 *
 * Returns -EINVAL (with a warning) for NULL arguments or an invalid
 * @rd, -EPERM (with a warning) when @wiphy lacks
 * REGULATORY_WIPHY_SELF_MANAGED, the error from reg_copy_regd() when
 * the copy fails, and 0 on success. @rd itself is never kept.
 */
static int __regulatory_set_wiphy_regd(struct wiphy *wiphy,
                                       struct ieee80211_regdomain *rd)
{
        const struct ieee80211_regdomain *copy, *old;
        struct cfg80211_registered_device *rdev;

        if (WARN_ON(!wiphy || !rd))
                return -EINVAL;

        if (WARN(!(wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED),
                 "wiphy should have REGULATORY_WIPHY_SELF_MANAGED\n"))
                return -EPERM;

        if (WARN(!is_valid_rd(rd),
                 "Invalid regulatory domain detected: %c%c\n",
                 rd->alpha2[0], rd->alpha2[1])) {
                print_regdomain_info(rd);
                return -EINVAL;
        }

        copy = reg_copy_regd(rd);
        if (IS_ERR(copy))
                return PTR_ERR(copy);

        rdev = wiphy_to_rdev(wiphy);

        /* swap the pointers under the lock, free the old one outside it */
        spin_lock(&reg_requests_lock);
        old = rdev->requested_regd;
        rdev->requested_regd = copy;
        spin_unlock(&reg_requests_lock);

        kfree(old);
        return 0;
}

/*
 * Record @rd as the requested regulatory domain for @wiphy and schedule
 * reg_work to process it later. Returns 0 on success or the error from
 * __regulatory_set_wiphy_regd(), in which case no work is scheduled.
 */
int regulatory_set_wiphy_regd(struct wiphy *wiphy,
                              struct ieee80211_regdomain *rd)
{
        int ret;

        ret = __regulatory_set_wiphy_regd(wiphy, rd);
        if (!ret)
                schedule_work(&reg_work);

        return ret;
}
EXPORT_SYMBOL(regulatory_set_wiphy_regd);

/*
 * Record @rd as the requested regulatory domain for @wiphy and process
 * it immediately instead of deferring to reg_work. The caller must hold
 * the RTNL. Returns 0 on success or the error from
 * __regulatory_set_wiphy_regd(), in which case nothing is processed.
 */
int regulatory_set_wiphy_regd_sync(struct wiphy *wiphy,
                                   struct ieee80211_regdomain *rd)
{
        int ret;

        ASSERT_RTNL();

        ret = __regulatory_set_wiphy_regd(wiphy, rd);
        if (!ret) {
                /* process the request immediately */
                reg_process_self_managed_hint(wiphy);
                reg_check_channels();
        }

        return ret;
}
EXPORT_SYMBOL(regulatory_set_wiphy_regd_sync);

void wiphy_regulatory_register(struct wiphy *wiphy)
{
        struct regulatory_request *lr = get_last_request();

        /* self-managed devices ignore beacon hints and country IE */
        if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) {
                wiphy->regulatory_flags |= REGULATORY_DISABLE_BEACON_HINTS |
                                           REGULATORY_COUNTRY_IE_IGNORE;

                /*
                 * The last request may have been received before this
                 * registration call. Call the driver notifier if
                 * initiator is USER.
                 */
                if (lr->initiator == NL80211_REGDOM_SET_BY_USER)
                        reg_call_notifier(wiphy, lr);
        }

        if (!reg_dev_ignore_cell_hint(wiphy))
                reg_num_devs_support_basehint++;

        wiphy_update_regulatory(wiphy, lr->initiator);
        wiphy_all_share_dfs_chan_state(wiphy);
        reg_process_self_managed_hints();
}

/*
 * Regulatory teardown for a wiphy that is going away.
 *
 * Undoes the cell-base-hint accounting, releases the wiphy's private
 * regulatory domain and clears wiphy->regd. If the last request was
 * issued by this very wiphy, the request is detached from it: its
 * wiphy_idx becomes invalid and its environment is reset to "any".
 */
void wiphy_regulatory_deregister(struct wiphy *wiphy)
{
        struct regulatory_request *last = get_last_request();
        struct wiphy *owner;

        if (!reg_dev_ignore_cell_hint(wiphy))
                reg_num_devs_support_basehint--;

        rcu_free_regdom(get_wiphy_regdom(wiphy));
        RCU_INIT_POINTER(wiphy->regd, NULL);

        if (!last)
                return;

        owner = wiphy_idx_to_wiphy(last->wiphy_idx);
        if (owner && owner == wiphy) {
                last->wiphy_idx = WIPHY_IDX_INVALID;
                last->country_ie_env = ENVIRON_ANY;
        }
}

/*
 * Map a frequency to the index of the UNII sub-band that contains it.
 *
 * See FCC notices for UNII band definitions
 *  5GHz: https://www.fcc.gov/document/5-ghz-unlicensed-spectrum-unii
 *  6GHz: https://www.fcc.gov/document/fcc-proposes-more-spectrum-unlicensed-use-0
 *
 * @freq: frequency to classify, in the same unit as the band edges
 *      below (5150, 5250, ...).
 *
 * Returns 0..8 for UNII-1, -2A, -2B, -2C, -3, -5, -6, -7 and -8
 * respectively. The lower edge 5150 belongs to UNII-1; every other band
 * excludes its lower edge and includes its upper edge. Frequencies below
 * 5150, above 7125, or in the gap above 5825 up to and including 5925
 * yield -EINVAL.
 */
int cfg80211_get_unii(int freq)
{
        if (freq < 5150 || freq > 7125)
                return -EINVAL;

        /* The edges are checked in ascending order from here on. */
        if (freq <= 5250)
                return 0;       /* UNII-1 */
        if (freq <= 5350)
                return 1;       /* UNII-2A */
        if (freq <= 5470)
                return 2;       /* UNII-2B */
        if (freq <= 5725)
                return 3;       /* UNII-2C */
        if (freq <= 5825)
                return 4;       /* UNII-3 */
        if (freq <= 5925)
                return -EINVAL; /* no index is assigned to this range */
        if (freq <= 6425)
                return 5;       /* UNII-5 */
        if (freq <= 6525)
                return 6;       /* UNII-6 */
        if (freq <= 6875)
                return 7;       /* UNII-7 */

        return 8;               /* UNII-8 */
}

/*
 * Report the current value of the reg_is_indoor flag. The flag is read
 * here without taking reg_indoor_lock.
 */
bool regulatory_indoor_allowed(void)
{
        return reg_is_indoor;
}

/*
 * Tell whether pre-CAC is allowed for @wiphy.
 *
 * It is allowed only when the DFS region of the global regulatory
 * domain is ETSI and, if the wiphy has a regulatory domain of its own,
 * that one is ETSI as well. Both domains are read under
 * rcu_read_lock().
 */
bool regulatory_pre_cac_allowed(struct wiphy *wiphy)
{
        const struct ieee80211_regdomain *global_rd;
        const struct ieee80211_regdomain *own_rd;
        bool allowed;

        rcu_read_lock();

        global_rd = rcu_dereference(cfg80211_regdomain);
        own_rd = rcu_dereference(wiphy->regd);

        allowed = (global_rd->dfs_region == NL80211_DFS_ETSI);
        if (own_rd)
                allowed = allowed &&
                          own_rd->dfs_region == NL80211_DFS_ETSI;

        rcu_read_unlock();

        return allowed;
}
EXPORT_SYMBOL(regulatory_pre_cac_allowed);

/*
 * End CAC on every wireless_dev of @rdev whose CAC is running on a
 * chandef that is no longer DFS-usable.
 *
 * After a CAC has finished or radar has been detected, any CAC running
 * on the same channels should end. cfg80211_chandef_dfs_usable()
 * returning false covers both situations: either all channels became
 * available (a CAC_FINISHED event affected another wdev's state), or
 * some channel of the wdev's chandef became unavailable (a
 * RADAR_DETECTED event affected another wdev's state).
 */
static void cfg80211_check_and_end_cac(struct cfg80211_registered_device *rdev)
{
        struct wireless_dev *wdev;

        list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
                struct cfg80211_chan_def *chandef;

                if (!wdev->cac_started)
                        continue;

                /* FIXME: radar detection is tied to link 0 for now */
                chandef = wdev_chandef(wdev, 0);
                if (!chandef ||
                    cfg80211_chandef_dfs_usable(&rdev->wiphy, chandef))
                        continue;

                rdev_end_cac(rdev, wdev->netdev);
        }
}

/*
 * Propagate a DFS state change observed on @wiphy to the other registered
 * devices.
 *
 * A device is skipped when it is @wiphy itself, when reg_dfs_domain_same()
 * rejects the pair, or when it has no channel at the center frequency of
 * @chandef's channel.  Every remaining device gets @chandef set to
 * @dfs_state and a radar notification carrying @event.  For
 * NL80211_RADAR_DETECTED and NL80211_RADAR_CAC_FINISHED the DFS channel
 * update is scheduled and affected CACs are ended before the notification.
 *
 * The caller must hold the RTNL.  An invalid @chandef raises a WARN and
 * nothing is propagated.
 */
void regulatory_propagate_dfs_state(struct wiphy *wiphy,
                                    struct cfg80211_chan_def *chandef,
                                    enum nl80211_dfs_state dfs_state,
                                    enum nl80211_radar_event event)
{
        struct cfg80211_registered_device *rdev;

        ASSERT_RTNL();

        if (WARN_ON(!cfg80211_chandef_valid(chandef)))
                return;

        for_each_rdev(rdev) {
                /* Checks are evaluated left to right, cheapest first. */
                if (wiphy == &rdev->wiphy ||
                    !reg_dfs_domain_same(wiphy, &rdev->wiphy) ||
                    !ieee80211_get_channel(&rdev->wiphy,
                                           chandef->chan->center_freq))
                        continue;

                cfg80211_set_dfs_state(&rdev->wiphy, chandef, dfs_state);

                switch (event) {
                case NL80211_RADAR_DETECTED:
                case NL80211_RADAR_CAC_FINISHED:
                        cfg80211_sched_dfs_chan_update(rdev);
                        cfg80211_check_and_end_cac(rdev);
                        break;
                default:
                        break;
                }

                nl80211_radar_notify(rdev, chandef, event, NULL, GFP_KERNEL);
        }
}

/*
 * Database stage of regulatory initialisation: load the built-in regdb
 * keys, send the core hint for the world regdomain and, when the
 * ieee80211_regdom module parameter is not a world regdomain, pass it on as
 * a user hint.
 *
 * Returns 0 on success, -EINVAL when reg_pdev is NULL or an error pointer,
 * or the error from key loading / an -ENOMEM core hint.  In the latter two
 * cases reg_pdev is unregistered before returning.  Any other core-hint
 * error is logged and treated as non-fatal.
 */
static int __init regulatory_init_db(void)
{
        int err;

        /*
         * cfg80211 may never have called regulatory_init() below, or that
         * call may have failed (due to other bugs/issues).  Doing any
         * further work here would only lead to crashes, so bail out.
         */
        if (IS_ERR_OR_NULL(reg_pdev))
                return -EINVAL;

        err = load_builtin_regdb_keys();
        if (err)
                goto unregister_pdev;

        /* We always try to get an update for the static regdomain */
        err = regulatory_hint_core(cfg80211_world_regdom->alpha2);
        if (err == -ENOMEM)
                goto unregister_pdev;

        /*
         * N.B. kobject_uevent_env() mainly fails when we are out of memory,
         * which is propagated above.  It can also fail during a
         * netlink_broadcast() or, in early boot, for call_usermodehelper().
         * For now those failures are only reported.
         */
        if (err)
                pr_err("kobject_uevent_env() was unable to call CRDA during init\n");

        /* Finally, a module parameter set by the user counts as a user hint. */
        if (!is_world_regdom(ieee80211_regdom))
                regulatory_hint_user(ieee80211_regdom,
                                     NL80211_USER_REG_HINT_USER);

        return 0;

unregister_pdev:
        platform_device_unregister(reg_pdev);
        return err;
}
#ifndef MODULE
late_initcall(regulatory_init_db);
#endif

/*
 * First stage of regulatory initialisation.
 *
 * Registers the "regulatory" platform device, publishes
 * cfg80211_world_regdom as the current cfg80211_regdomain and sets the
 * first two bytes of user_alpha2 to '9' and '7'.  When built as a module
 * the database stage, regulatory_init_db(), runs right away and its result
 * is returned; otherwise 0 is returned here.
 *
 * Returns 0 on success or a negative error code.
 */
int __init regulatory_init(void)
{
        int ret = 0;

        reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0);
        if (IS_ERR(reg_pdev))
                return PTR_ERR(reg_pdev);

        rcu_assign_pointer(cfg80211_regdomain, cfg80211_world_regdom);

        user_alpha2[0] = '9';
        user_alpha2[1] = '7';

#ifdef MODULE
        ret = regulatory_init_db();
#endif

        return ret;
}

/*
 * Tear down the regulatory core: cancel outstanding work, reset the
 * regdomains, unregister the platform device and free everything still
 * queued on the beacon and request lists, along with regdb, the user
 * regdomain and the regdb keyring.
 */
void regulatory_exit(void)
{
        struct regulatory_request *reg_request, *tmp;
        struct reg_beacon *reg_beacon, *btmp;

        /* Cancel pending work before any state is released. */
        cancel_work_sync(&reg_work);
        cancel_crda_timeout_sync();
        cancel_delayed_work_sync(&reg_check_chans);

        /* Lock to suppress warnings */
        rtnl_lock();
        reset_regdomains(true, NULL);
        rtnl_unlock();

        /* Suppress uevents for the device before it is unregistered. */
        dev_set_uevent_suppress(&reg_pdev->dev, true);

        platform_device_unregister(reg_pdev);

        /* Drain the beacon and request lists, freeing each entry. */
        list_for_each_entry_safe(reg_beacon, btmp, &reg_pending_beacons, list) {
                list_del(&reg_beacon->list);
                kfree(reg_beacon);
        }

        list_for_each_entry_safe(reg_beacon, btmp, &reg_beacon_list, list) {
                list_del(&reg_beacon->list);
                kfree(reg_beacon);
        }

        list_for_each_entry_safe(reg_request, tmp, &reg_requests_list, list) {
                list_del(&reg_request->list);
                kfree(reg_request);
        }

        /* Either pointer may be NULL or an error pointer; free only real ones. */
        if (!IS_ERR_OR_NULL(regdb))
                kfree(regdb);
        if (!IS_ERR_OR_NULL(cfg80211_user_regdom))
                kfree(cfg80211_user_regdom);

        free_regdb_keyring();
}















































































    1 




















































































    1 






    1 



    1 








































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
// SPDX-License-Identifier: GPL-2.0
#include <linux/compat.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/smp.h>
#include <linux/sem.h>
#include <linux/msg.h>
#include <linux/shm.h>
#include <linux/stat.h>
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/utsname.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/uaccess.h>
#include <linux/elf.h>

#include <asm/elf.h>
#include <asm/ia32.h>

/*
 * Mask used to align a virtual address so that it does not alias in the
 * I$ on AMD F15h.
 *
 * Returns va_align.mask when alignment applies to the current task, and 0
 * when va_align.flags is negative, when the flag bit for the current mmap
 * mode is clear, or when the task does not have PF_RANDOMIZE set.
 */
static unsigned long get_align_mask(void)
{
        if (va_align.flags < 0)
                return 0;

        /*
         * One test covers both the 32- and the 64-bit case: the flag bit
         * examined is 2 - mmap_is_ia32().
         */
        if (!(va_align.flags & (2 - mmap_is_ia32())))
                return 0;

        return (current->flags & PF_RANDOMIZE) ? va_align.mask : 0;
}

/*
 * Random bits that go with get_align_mask().
 *
 * To avoid aliasing in the I$ on AMD F15h, the address bits covered by
 * va_align.bits, [12:upper_bit), are set to a random value rather than
 * zeroed.  That value is computed once per boot, which is why this form of
 * ASLR is called "per-boot ASLR".
 *
 * Callers either add the result to info.align_offset before calling
 * vm_unmapped_area() or OR it directly into the address.  The result is 0
 * whenever get_align_mask() is 0.
 */
static unsigned long get_align_bits(void)
{
        unsigned long mask = get_align_mask();

        return mask & va_align.bits;
}

/*
 * Handler for the "align_va_addr=" boot parameter.
 *
 * Recognised values are "32", "64", "on" (both) and "off".  Any other
 * non-empty value is reported with pr_warn() and changes nothing.  The
 * option is ignored when va_align.flags is negative or the value is empty.
 * Always returns 1.
 */
static int __init control_va_addr_alignment(char *str)
{
        /*
         * A negative flags value guards against enabling this on other CPU
         * families; an empty value carries nothing to parse.
         */
        if (va_align.flags < 0 || *str == 0)
                return 1;

        if (strcmp(str, "32") == 0) {
                va_align.flags = ALIGN_VA_32;
        } else if (strcmp(str, "64") == 0) {
                va_align.flags = ALIGN_VA_64;
        } else if (strcmp(str, "off") == 0) {
                va_align.flags = 0;
        } else if (strcmp(str, "on") == 0) {
                va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
        } else {
                pr_warn("invalid option value: 'align_va_addr=%s'\n", str);
        }

        return 1;
}
__setup("align_va_addr=", control_va_addr_alignment);

/*
 * mmap(2) entry point.  @off is a byte offset and must be page-aligned; it
 * is converted to a page offset before being handed to ksys_mmap_pgoff().
 * Returns -EINVAL for an unaligned @off, otherwise whatever
 * ksys_mmap_pgoff() returns.
 */
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
                unsigned long, prot, unsigned long, flags,
                unsigned long, fd, unsigned long, off)
{
        unsigned long sub_page = off & ~PAGE_MASK;

        if (sub_page)
                return -EINVAL;

        return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
}

/*
 * Choose the [*begin, *end) window for a bottom-up unmapped-area search.
 *
 * Outside a 32-bit syscall, MAP_32BIT limits the window to
 * 0x40000000..0x80000000, with the start randomized for PF_RANDOMIZE tasks.
 * In every other case the window starts at get_mmap_base(1) and ends at
 * the 32-bit or 64-bit task size; the 64-bit size depends on whether @addr
 * lies above DEFAULT_MAP_WINDOW.
 */
static void find_start_end(unsigned long addr, unsigned long flags,
                unsigned long *begin, unsigned long *end)
{
        if (in_32bit_syscall() || !(flags & MAP_32BIT)) {
                *begin = get_mmap_base(1);
                *end = in_32bit_syscall() ?
                        task_size_32bit() :
                        task_size_64bit(addr > DEFAULT_MAP_WINDOW);
                return;
        }

        /*
         * MAP_32BIT is usually needed to map code in the small model, so
         * the mapping has to sit in the first 31 bits; limit it to that.
         * This means the unmapped base has to move down for this case.
         * It can conflict with the heap, but we assume that glibc malloc
         * knows how to fall back to mmap.  Give it 1GB of playground for
         * now. -AK
         */
        *begin = 0x40000000;
        *end = 0x80000000;
        if (current->flags & PF_RANDOMIZE)
                *begin = randomize_page(*begin, 0x02000000);
}

/*
 * Value for vm_unmapped_area_info.start_gap: PAGE_SIZE when
 * VM_SHADOW_STACK is set in @vm_flags, 0 otherwise.
 */
static inline unsigned long stack_guard_placement(vm_flags_t vm_flags)
{
        return (vm_flags & VM_SHADOW_STACK) ? PAGE_SIZE : 0;
}

/*
 * Bottom-up search for a free range of @len bytes.
 *
 * MAP_FIXED requests get @addr back untouched.  A non-zero @addr is a hint:
 * it is page-aligned and returned as-is when the range fits below the upper
 * limit and ends at or before vm_start_gap() of the VMA that find_vma()
 * returns (or there is no such VMA).  Otherwise vm_unmapped_area() searches
 * the window chosen by find_start_end().  File-backed requests also get the
 * va_align mask and bits applied.
 *
 * Returns the selected address, -ENOMEM when @len exceeds the upper limit,
 * or whatever vm_unmapped_area() returns.
 */
unsigned long
arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len,
                       unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_unmapped_area_info info = {};
        unsigned long low, high;

        if (flags & MAP_FIXED)
                return addr;

        find_start_end(addr, flags, &low, &high);

        if (len > high)
                return -ENOMEM;

        if (addr) {
                struct vm_area_struct *next;

                addr = PAGE_ALIGN(addr);
                next = find_vma(mm, addr);

                /* Honour the hint only if the whole range is free. */
                if (high - len >= addr) {
                        if (!next || addr + len <= vm_start_gap(next))
                                return addr;
                }
        }

        info.length = len;
        info.low_limit = low;
        info.high_limit = high;
        info.start_gap = stack_guard_placement(vm_flags);
        info.align_offset = pgoff << PAGE_SHIFT;
        if (filp) {
                info.align_mask = get_align_mask();
                info.align_offset += get_align_bits();
        }

        return vm_unmapped_area(&info);
}

/*
 * Top-down search for a free range of @len bytes, with a bottom-up
 * fallback.
 *
 * MAP_FIXED requests get @addr0 back untouched.  MAP_32BIT requests made
 * outside a 32-bit syscall skip the top-down search and go straight to the
 * bottom-up path.  A non-zero hint is rounded down to a page boundary and
 * returned if mmap_address_hint_valid() accepts it and the range ends at or
 * before vm_start_gap() of the VMA that find_vma() returns (or there is no
 * such VMA).  Otherwise vm_unmapped_area() searches downwards from
 * get_mmap_base(0), and if that fails the bottom-up function is tried with
 * the original hint.
 *
 * Returns the selected address, or -ENOMEM when @len exceeds TASK_SIZE;
 * otherwise the bottom-up function's result is returned.
 */
unsigned long
arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr0,
                          unsigned long len, unsigned long pgoff,
                          unsigned long flags, vm_flags_t vm_flags)
{
        struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
        unsigned long addr = addr0;
        struct vm_unmapped_area_info info = {};

        /* requested length too big for entire address space */
        if (len > TASK_SIZE)
                return -ENOMEM;

        /* No address checking. See comment at mmap_address_hint_valid() */
        if (flags & MAP_FIXED)
                return addr;

        /* for MAP_32BIT mappings we force the legacy mmap base */
        if (!in_32bit_syscall() && (flags & MAP_32BIT))
                goto bottomup;

        /* requesting a specific address */
        if (addr) {
                addr &= PAGE_MASK;
                /* A rejected hint is not an error; fall through to the search. */
                if (!mmap_address_hint_valid(addr, len))
                        goto get_unmapped_area;

                vma = find_vma(mm, addr);
                if (!vma || addr + len <= vm_start_gap(vma))
                        return addr;
        }
get_unmapped_area:

        info.flags = VM_UNMAPPED_AREA_TOPDOWN;
        info.length = len;
        /* MAP_ABOVE4G (outside 32-bit syscalls) keeps the result at or above 4G. */
        if (!in_32bit_syscall() && (flags & MAP_ABOVE4G))
                info.low_limit = SZ_4G;
        else
                info.low_limit = PAGE_SIZE;

        info.high_limit = get_mmap_base(0);
        info.start_gap = stack_guard_placement(vm_flags);

        /*
         * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
         * in the full address space.
         *
         * !in_32bit_syscall() check to avoid high addresses for x32
         * (and make it no op on native i386).
         */
        if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall())
                info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;

        info.align_offset = pgoff << PAGE_SHIFT;
        if (filp) {
                info.align_mask = get_align_mask();
                info.align_offset += get_align_bits();
        }
        addr = vm_unmapped_area(&info);
        /* A page-aligned result is an address; anything else must be -ENOMEM. */
        if (!(addr & ~PAGE_MASK))
                return addr;
        VM_BUG_ON(addr != -ENOMEM);

bottomup:
        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         *
         * NOTE(review): arch_get_unmapped_area() passes vm_flags as 0, so
         * the start_gap derived from @vm_flags is not applied on this
         * path - confirm that is intended.
         */
        return arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
}

/*
 * Bottom-up unmapped-area lookup without VM flags: forwards to
 * arch_get_unmapped_area_vmflags() with vm_flags set to 0.
 */
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
{
        unsigned long area;

        area = arch_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);

        return area;
}

/*
 * Top-down unmapped-area lookup without VM flags: forwards to
 * arch_get_unmapped_area_topdown_vmflags() with vm_flags set to 0.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr,
                          const unsigned long len, const unsigned long pgoff,
                          const unsigned long flags)
{
        unsigned long area;

        area = arch_get_unmapped_area_topdown_vmflags(filp, addr, len, pgoff,
                                                      flags, 0);

        return area;
}






















































    1 


















    1 



    1 


































    1 






    1 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 







    1 













    1 
















    1 
    1 
    1 
    1 





    1 















    1 











    1 





    1 





    1 

    1 












    1 


























    1 









    1 























































































































































































































































































































































































































































































































































































































































































































    1 


    1 

    1 












    1 








    1 














    1 

    1 

    1 





































































































































































































































































































































































































































































































































































































    1 












    1 












    1 










    1 




































































































































































    1 





    1 




    1 





    1 




















    1 





    1 






































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *        (jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/iomap.h>
#include <linux/iversion.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

#include <trace/events/ext4.h>

static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
                              struct ext4_inode_info *ei)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        __u32 csum;
        __u16 dummy_csum = 0;
        int offset = offsetof(struct ext4_inode, i_checksum_lo);
        unsigned int csum_size = sizeof(dummy_csum);

        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
        csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
        offset += csum_size;
        csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
                           EXT4_GOOD_OLD_INODE_SIZE - offset);

        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                offset = offsetof(struct ext4_inode, i_checksum_hi);
                csum = ext4_chksum(sbi, csum, (__u8 *)raw +
                                   EXT4_GOOD_OLD_INODE_SIZE,
                                   offset - EXT4_GOOD_OLD_INODE_SIZE);
                if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
                        csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
                                           csum_size);
                        offset += csum_size;
                }
                csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
                                   EXT4_INODE_SIZE(inode->i_sb) - offset);
        }

        return csum;
}

/*
 * Compare the checksum stored in a raw inode with a freshly computed one.
 *
 * Returns 1 when they match, or when no check is performed (the creator OS
 * is not Linux, or ext4_has_metadata_csum() is false); returns 0 on
 * mismatch.
 */
static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
                                  struct ext4_inode_info *ei)
{
        __u32 stored, expected;

        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
            cpu_to_le32(EXT4_OS_LINUX))
                return 1;
        if (!ext4_has_metadata_csum(inode->i_sb))
                return 1;

        stored = le16_to_cpu(raw->i_checksum_lo);
        expected = ext4_inode_csum(inode, raw, ei);

        if (EXT4_INODE_SIZE(inode->i_sb) <= EXT4_GOOD_OLD_INODE_SIZE ||
            !EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
                /* Only the low 16 bits are stored; compare just those. */
                expected &= 0xFFFF;
        } else {
                /* The high half is stored too; assemble all 32 bits. */
                stored |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
        }

        return stored == expected;
}

/*
 * Compute the checksum of a raw inode and store it in the inode.
 *
 * Does nothing when the creator OS is not Linux or when
 * ext4_has_metadata_csum() is false.  The low 16 bits always go to
 * i_checksum_lo; the high 16 bits go to i_checksum_hi only when the inode
 * is larger than EXT4_GOOD_OLD_INODE_SIZE and that field fits.
 */
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
                         struct ext4_inode_info *ei)
{
        __u32 crc;

        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
            cpu_to_le32(EXT4_OS_LINUX))
                return;
        if (!ext4_has_metadata_csum(inode->i_sb))
                return;

        crc = ext4_inode_csum(inode, raw, ei);
        raw->i_checksum_lo = cpu_to_le16(crc & 0xFFFF);

        if (EXT4_INODE_SIZE(inode->i_sb) <= EXT4_GOOD_OLD_INODE_SIZE)
                return;
        if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
                raw->i_checksum_hi = cpu_to_le16(crc >> 16);
}

/*
 * Emit the trace event and, if the inode has a jinode, hand the truncate
 * to jbd2_journal_begin_ordered_truncate().
 *
 * Returns 0 when there is no jinode, otherwise the jbd2 call's result.
 */
static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                              loff_t new_size)
{
        trace_ext4_begin_ordered_truncate(inode, new_size);

        if (EXT4_I(inode)->jinode)
                return jbd2_journal_begin_ordered_truncate(
                                                   EXT4_JOURNAL(inode),
                                                   EXT4_I(inode)->jinode,
                                                   new_size);

        /*
         * A missing jinode means the file was never opened for writing,
         * so there are no outstanding writes to flush and the jbd2 call
         * can be skipped.
         */
        return 0;
}

/*
 * Forward declaration only; the function is static, so its definition
 * must appear elsewhere in this translation unit.
 */
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
                                  int pextents);

/*
 * Report whether @inode is a fast symlink, i.e. a symlink whose target is
 * kept in ext4_inode_info->i_data instead of in data blocks.
 *
 * Returns 1 for a fast symlink and 0 otherwise.
 */
int ext4_inode_is_fast_symlink(struct inode *inode)
{
        int ea_blocks;

        /* EA_INODE flag set: decide purely from mode and size. */
        if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
                return S_ISLNK(inode->i_mode) && inode->i_size &&
                       (inode->i_size < EXT4_N_BLOCKS * 4);

        if (ext4_has_inline_data(inode))
                return 0;

        /*
         * When i_file_acl is set, one cluster (expressed in 512-byte
         * units) is discounted from i_blocks before testing for zero.
         */
        ea_blocks = 0;
        if (EXT4_I(inode)->i_file_acl)
                ea_blocks = EXT4_CLUSTER_SIZE(inode->i_sb) >> 9;

        return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}

/*
 * Called at the last iput() if i_nlink is zero.
 *
 * Evicts @inode.  When the inode still has links, or is a bad inode, only
 * the in-core state is torn down (the no_delete path).  Otherwise a journal
 * handle is started, i_size is set to 0, the inode is truncated if it has
 * blocks, its xattr references are removed, its orphan record is deleted,
 * i_dtime is stamped and the inode is freed with ext4_free_inode().  If a
 * step fails, the on-disk inode is not freed and only the in-core inode is
 * cleared.
 */
void ext4_evict_inode(struct inode *inode)
{
        handle_t *handle;
        int err;
        /*
         * Credits for final inode cleanup and freeing:
         * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
         * (xattr block freeing), bitmap, group descriptor (inode freeing)
         */
        int extra_credits = 6;
        struct ext4_xattr_inode_array *ea_inode_array = NULL;
        /* True once sb_start_intwrite() was taken here and must be undone. */
        bool freeze_protected = false;

        trace_ext4_evict_inode(inode);

        if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
                ext4_evict_ea_inode(inode);
        /* Still linked: drop the page cache and skip the on-disk deletion. */
        if (inode->i_nlink) {
                truncate_inode_pages_final(&inode->i_data);

                goto no_delete;
        }

        if (is_bad_inode(inode))
                goto no_delete;
        dquot_initialize(inode);

        if (ext4_should_order_data(inode))
                ext4_begin_ordered_truncate(inode, 0);
        truncate_inode_pages_final(&inode->i_data);

        /*
         * For inodes with journalled data, transaction commit could have
         * dirtied the inode. And for inodes with dioread_nolock, unwritten
         * extents converting worker could merge extents and also have dirtied
         * the inode. Flush worker is ignoring it because of I_FREEING flag but
         * we still need to remove the inode from the writeback lists.
         */
        if (!list_empty_careful(&inode->i_io_list))
                inode_io_list_del(inode);

        /*
         * Protect us against freezing - iput() caller didn't have to have any
         * protection against it. When we are in a running transaction though,
         * we are already protected against freezing and we cannot grab further
         * protection due to lock ordering constraints.
         */
        if (!ext4_journal_current_handle()) {
                sb_start_intwrite(inode->i_sb);
                freeze_protected = true;
        }

        if (!IS_NOQUOTA(inode))
                extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);

        /*
         * Block bitmap, group descriptor, and inode are accounted in both
         * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
         */
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
                         ext4_blocks_for_truncate(inode) + extra_credits - 3);
        if (IS_ERR(handle)) {
                ext4_std_error(inode->i_sb, PTR_ERR(handle));
                /*
                 * If we're going to skip the normal cleanup, we still need to
                 * make sure that the in-core orphan linked list is properly
                 * cleaned up.
                 */
                ext4_orphan_del(NULL, inode);
                if (freeze_protected)
                        sb_end_intwrite(inode->i_sb);
                goto no_delete;
        }

        if (IS_SYNC(inode))
                ext4_handle_sync(handle);

        /*
         * Set inode->i_size to 0 before calling ext4_truncate(). We need
         * special handling of symlinks here because i_size is used to
         * determine whether ext4_inode_info->i_data contains symlink data or
         * block mappings. Setting i_size to 0 will remove its fast symlink
         * status. Erase i_data so that it becomes a valid empty block map.
         */
        if (ext4_inode_is_fast_symlink(inode))
                memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data));
        inode->i_size = 0;
        err = ext4_mark_inode_dirty(handle, inode);
        if (err) {
                ext4_warning(inode->i_sb,
                             "couldn't mark inode dirty (err %d)", err);
                goto stop_handle;
        }
        /* Only truncate when the inode actually has blocks accounted. */
        if (inode->i_blocks) {
                err = ext4_truncate(inode);
                if (err) {
                        ext4_error_err(inode->i_sb, -err,
                                       "couldn't truncate inode %lu (err %d)",
                                       inode->i_ino, err);
                        goto stop_handle;
                }
        }

        /* Remove xattr references. */
        err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
                                      extra_credits);
        if (err) {
                ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
                /*
                 * Shared failure exit, also reached by the gotos above:
                 * stop the handle, drop the in-core orphan entry, release
                 * freeze protection and the EA inode array, then fall back
                 * to the in-core-only cleanup.
                 */
stop_handle:
                ext4_journal_stop(handle);
                ext4_orphan_del(NULL, inode);
                if (freeze_protected)
                        sb_end_intwrite(inode->i_sb);
                ext4_xattr_inode_array_free(ea_inode_array);
                goto no_delete;
        }

        /*
         * Kill off the orphan record which ext4_truncate created.
         * AKPM: I think this can be inside the above `if'.
         * Note that ext4_orphan_del() has to be able to cope with the
         * deletion of a non-existent orphan - this is because we don't
         * know if ext4_truncate() actually created an orphan record.
         * (Well, we could do this if we need to, but heck - it works)
         */
        ext4_orphan_del(handle, inode);
        /* Record the deletion time, truncated to 32 bits. */
        EXT4_I(inode)->i_dtime        = (__u32)ktime_get_real_seconds();

        /*
         * One subtle ordering requirement: if anything has gone wrong
         * (transaction abort, IO errors, whatever), then we can still
         * do these next steps (the fs will already have been marked as
         * having errors), but we can't free the inode if the mark_dirty
         * fails.
         */
        if (ext4_mark_inode_dirty(handle, inode))
                /* If that failed, just do the required in-core inode clear. */
                ext4_clear_inode(inode);
        else
                ext4_free_inode(handle, inode);
        ext4_journal_stop(handle);
        if (freeze_protected)
                sb_end_intwrite(inode->i_sb);
        ext4_xattr_inode_array_free(ea_inode_array);
        return;
no_delete:
        /*
         * Catch the case where something else has accidentally dirtied the
         * inode being evicted, which would probably cause inode
         * use-after-free issues later.
         */
        WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list));

        /*
         * The inode is still on a fast-commit list: mark fast commit
         * ineligible for this superblock.
         */
        if (!list_empty(&EXT4_I(inode)->i_fc_list))
                ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
        ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
}

#ifdef CONFIG_QUOTA
qsize_t *ext4_get_reserved_space(struct inode *inode)
{
        return &EXT4_I(inode)->i_reserved_quota;
}
#endif

/*
 * Consume @used blocks from the inode's reserved data block count and
 * settle the matching quota reservation.
 *
 * @quota_claim non-zero: the reservation is claimed via dquot_claim_block();
 * zero: it is released via dquot_release_reservation_block().
 *
 * Must be called with i_data_sem down, which matters because this function
 * may call ext4_discard_preallocations().
 */
void ext4_da_update_reserve_space(struct inode *inode,
                                        int used, int quota_claim)
{
        struct ext4_inode_info *info = EXT4_I(inode);
        struct ext4_sb_info *sb_info = EXT4_SB(inode->i_sb);

        spin_lock(&info->i_block_reservation_lock);
        trace_ext4_da_update_reserve_space(inode, used, quota_claim);

        /* Never consume more than is reserved: warn and clamp. */
        if (unlikely(used > info->i_reserved_data_blocks)) {
                ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
                         "with only %d reserved data blocks",
                         __func__, inode->i_ino, used,
                         info->i_reserved_data_blocks);
                WARN_ON(1);
                used = info->i_reserved_data_blocks;
        }

        /* Per-inode reservation and the global dirty-cluster counter. */
        info->i_reserved_data_blocks -= used;
        percpu_counter_sub(&sb_info->s_dirtyclusters_counter, used);

        spin_unlock(&info->i_block_reservation_lock);

        /* Quota accounting for the data blocks. */
        if (!quota_claim) {
                /*
                 * The range was fallocated over blocks that were already
                 * delayed-allocated, so writeback must not claim quota
                 * for them again; drop the reservation instead.
                 */
                dquot_release_reservation_block(inode,
                                                EXT4_C2B(sb_info, used));
        } else {
                dquot_claim_block(inode, EXT4_C2B(sb_info, used));
        }

        /*
         * Preallocations can be discarded only once every pending block
         * allocation is done and nobody has the inode open for write.
         */
        if (info->i_reserved_data_blocks != 0 ||
            inode_is_open_for_write(inode))
                return;

        ext4_discard_preallocations(inode);
}

static int __check_block_validity(struct inode *inode, const char *func,
                                unsigned int line,
                                struct ext4_map_blocks *map)
{
        if (ext4_has_feature_journal(inode->i_sb) &&
            (inode->i_ino ==
             le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
                return 0;
        if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
                ext4_error_inode(inode, func, line, map->m_pblk,
                                 "lblock %lu mapped to illegal pblock %llu "
                                 "(length %d)", (unsigned long) map->m_lblk,
                                 map->m_pblk, map->m_len);
                return -EFSCORRUPTED;
        }
        return 0;
}

/*
 * Zero @len blocks starting at physical block @pblk (logical block @lblk).
 * Encrypted regular files go through fscrypt_zeroout_range(); everything
 * else uses sb_issue_zeroout(), whose positive return values are folded
 * into 0.
 */
int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
                       ext4_lblk_t len)
{
        int err;

        if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
                return fscrypt_zeroout_range(inode, lblk, pblk, len);

        err = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);

        return err > 0 ? 0 : err;
}

/*
 * Convenience wrapper around __check_block_validity() that passes the
 * calling function's name and line number for use in the error report.
 */
#define check_block_validity(inode, map)        \
        __check_block_validity((inode), __func__, __LINE__, (map))

#ifdef ES_AGGRESSIVE_TEST
/*
 * Debug helper, only built when ES_AGGRESSIVE_TEST is defined.
 *
 * Repeats the block lookup for @map through the extent or indirect mapping
 * code (with i_data_sem held for reading) and prints a message when the
 * logical block, flags or physical block differ from those in @es_map.
 * @map is overwritten with the result of this fresh lookup; @flags is only
 * used in the diagnostic message.
 */
static void ext4_map_blocks_es_recheck(handle_t *handle,
                                       struct inode *inode,
                                       struct ext4_map_blocks *es_map,
                                       struct ext4_map_blocks *map,
                                       int flags)
{
        int retval;

        map->m_flags = 0;
        /*
         * There is a race window in which the result is not the same,
         * e.g. xfstests #223 when dioread_nolock is enabled.  The reason
         * is that we look up a block mapping in the extent status tree
         * without taking i_data_sem.  So in the meantime the unwritten
         * extent could have been converted.
         */
        down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                retval = ext4_ext_map_blocks(handle, inode, map, 0);
        } else {
                retval = ext4_ind_map_blocks(handle, inode, map, 0);
        }
        up_read((&EXT4_I(inode)->i_data_sem));

        /*
         * We don't check m_len because extents will be collapsed in the
         * status tree.  So the m_len values might not be equal.
         */
        if (es_map->m_lblk != map->m_lblk ||
            es_map->m_flags != map->m_flags ||
            es_map->m_pblk != map->m_pblk) {
                printk("ES cache assertion failed for inode: %lu "
                       "es_cached ex [%d/%d/%llu/%x] != "
                       "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
                       inode->i_ino, es_map->m_lblk, es_map->m_len,
                       es_map->m_pblk, es_map->m_flags, map->m_lblk,
                       map->m_len, map->m_pblk, map->m_flags,
                       retval, flags);
        }
}
#endif /* ES_AGGRESSIVE_TEST */

/*
 * The ext4_map_blocks() function tries to look up the requested blocks,
 * and returns if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of the i_data_sem, allocates blocks,
 * stores the allocated blocks in @map and marks it mapped.
 *
 * If the file is extent based, it calls ext4_ext_map_blocks(); otherwise
 * it calls ext4_ind_map_blocks() to handle indirect-mapped files.
 *
 * The extent status tree is consulted first (unless the fast-commit replay
 * mount state is set).  With EXT4_GET_BLOCKS_CACHED_NOWAIT in @flags only
 * that cache is consulted and 0 is returned when nothing is found there.
 *
 * On success, it returns the number of blocks being mapped or allocated.
 * On a plain lookup (no EXT4_GET_BLOCKS_CREATE in @flags) of blocks that are
 * pre-allocated and unwritten, the resulting @map is marked as unwritten.
 * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped.
 *
 * It returns 0 if plain look up failed (blocks have not been allocated), in
 * that case, @map is returned as unmapped but we still do fill map->m_len to
 * indicate the length of a hole starting at map->m_lblk.
 *
 * It returns a negative error in case of allocation failure, and
 * -EFSCORRUPTED when map->m_lblk is not below EXT_MAX_BLOCKS or when the
 * resulting mapping fails check_block_validity().
 */
int ext4_map_blocks(handle_t *handle, struct inode *inode,
                    struct ext4_map_blocks *map, int flags)
{
        struct extent_status es;
        int retval;
        int ret = 0;
#ifdef ES_AGGRESSIVE_TEST
        struct ext4_map_blocks orig_map;

        /* Keep a copy of the request for the debug re-check below. */
        memcpy(&orig_map, map, sizeof(*map));
#endif

        map->m_flags = 0;
        ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n",
                  flags, map->m_len, (unsigned long) map->m_lblk);

        /*
         * ext4_map_blocks returns an int, and m_len is an unsigned int
         */
        if (unlikely(map->m_len > INT_MAX))
                map->m_len = INT_MAX;

        /* We can handle the block number less than EXT_MAX_BLOCKS */
        if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
                return -EFSCORRUPTED;

        /* Lookup extent status tree firstly */
        if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
            ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
                if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
                        /* Cached extent with a physical location. */
                        map->m_pblk = ext4_es_pblock(&es) +
                                        map->m_lblk - es.es_lblk;
                        map->m_flags |= ext4_es_is_written(&es) ?
                                        EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
                        /* Blocks left in the extent, capped at the request. */
                        retval = es.es_len - (map->m_lblk - es.es_lblk);
                        if (retval > map->m_len)
                                retval = map->m_len;
                        map->m_len = retval;
                } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
                        /*
                         * Delayed or hole: no physical block.  m_len still
                         * reports the length, but the return value is 0.
                         */
                        map->m_pblk = 0;
                        map->m_flags |= ext4_es_is_delayed(&es) ?
                                        EXT4_MAP_DELAYED : 0;
                        retval = es.es_len - (map->m_lblk - es.es_lblk);
                        if (retval > map->m_len)
                                retval = map->m_len;
                        map->m_len = retval;
                        retval = 0;
                } else {
                        BUG();
                }

                if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
                        return retval;
#ifdef ES_AGGRESSIVE_TEST
                ext4_map_blocks_es_recheck(handle, inode, map,
                                           &orig_map, flags);
#endif
                goto found;
        }
        /*
         * In the query cache no-wait mode, nothing we can do more if we
         * cannot find extent in the cache.
         */
        if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
                return 0;

        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
         */
        down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                retval = ext4_ext_map_blocks(handle, inode, map, 0);
        } else {
                retval = ext4_ind_map_blocks(handle, inode, map, 0);
        }
        if (retval > 0) {
                unsigned int status;

                if (unlikely(retval != map->m_len)) {
                        ext4_warning(inode->i_sb,
                                     "ES len assertion failed for inode "
                                     "%lu: retval %d != map->m_len %d",
                                     inode->i_ino, retval, map->m_len);
                        WARN_ON(1);
                }

                /* Record what was found in the extent status tree. */
                status = map->m_flags & EXT4_MAP_UNWRITTEN ?
                                EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
                    !(status & EXTENT_STATUS_WRITTEN) &&
                    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
                                       map->m_lblk + map->m_len - 1))
                        status |= EXTENT_STATUS_DELAYED;
                ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
                                      map->m_pblk, status);
        }
        up_read((&EXT4_I(inode)->i_data_sem));

found:
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
                ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
        }

        /* If it is only a block(s) look up */
        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
                return retval;

        /*
         * Returns if the blocks have already allocated
         *
         * Note that if blocks have been preallocated
         * ext4_ext_map_blocks() returns with buffer head unmapped
         */
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
                /*
                 * If we need to convert extent to unwritten
                 * we continue and do the actual work in
                 * ext4_ext_map_blocks()
                 */
                if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
                        return retval;

        /*
         * Here we clear m_flags because after allocating an new extent,
         * it will be set again.
         */
        map->m_flags &= ~EXT4_MAP_FLAGS;

        /*
         * New blocks allocate and/or writing to unwritten extent
         * will possibly result in updating i_data, so we take
         * the write lock of i_data_sem, and call get_block()
         * with create == 1 flag.
         */
        down_write(&EXT4_I(inode)->i_data_sem);

        /*
         * We need to check for EXT4 here because migrate
         * could have changed the inode type in between
         */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                retval = ext4_ext_map_blocks(handle, inode, map, flags);
        } else {
                retval = ext4_ind_map_blocks(handle, inode, map, flags);

                if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
                        /*
                         * We allocated new blocks which will result in
                         * i_data's format changing.  Force the migrate
                         * to fail by clearing migrate flags
                         */
                        ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
                }
        }

        if (retval > 0) {
                unsigned int status;

                if (unlikely(retval != map->m_len)) {
                        ext4_warning(inode->i_sb,
                                     "ES len assertion failed for inode "
                                     "%lu: retval %d != map->m_len %d",
                                     inode->i_ino, retval, map->m_len);
                        WARN_ON(1);
                }

                /*
                 * We have to zeroout blocks before inserting them into extent
                 * status tree. Otherwise someone could look them up there and
                 * use them before they are really zeroed. We also have to
                 * unmap metadata before zeroing as otherwise writeback can
                 * overwrite zeros with stale data from block device.
                 */
                if (flags & EXT4_GET_BLOCKS_ZERO &&
                    map->m_flags & EXT4_MAP_MAPPED &&
                    map->m_flags & EXT4_MAP_NEW) {
                        ret = ext4_issue_zeroout(inode, map->m_lblk,
                                                 map->m_pblk, map->m_len);
                        if (ret) {
                                /* Zeroing failed: report it as the result. */
                                retval = ret;
                                goto out_sem;
                        }
                }

                /*
                 * If the extent has been zeroed out, we don't need to update
                 * extent status tree.
                 */
                if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
                    ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
                        if (ext4_es_is_written(&es))
                                goto out_sem;
                }
                status = map->m_flags & EXT4_MAP_UNWRITTEN ?
                                EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
                    !(status & EXTENT_STATUS_WRITTEN) &&
                    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
                                       map->m_lblk + map->m_len - 1))
                        status |= EXTENT_STATUS_DELAYED;
                ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
                                      map->m_pblk, status);
        }

out_sem:
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
                ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;

                /*
                 * Inodes with freshly allocated blocks where contents will be
                 * visible after transaction commit must be on transaction's
                 * ordered data list.
                 */
                if (map->m_flags & EXT4_MAP_NEW &&
                    !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
                    !(flags & EXT4_GET_BLOCKS_ZERO) &&
                    !ext4_is_quota_file(inode) &&
                    ext4_should_order_data(inode)) {
                        /* Byte range covered by the new mapping. */
                        loff_t start_byte =
                                (loff_t)map->m_lblk << inode->i_blkbits;
                        loff_t length = (loff_t)map->m_len << inode->i_blkbits;

                        if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
                                ret = ext4_jbd2_inode_add_wait(handle, inode,
                                                start_byte, length);
                        else
                                ret = ext4_jbd2_inode_add_write(handle, inode,
                                                start_byte, length);
                        if (ret)
                                return ret;
                }
        }
        /* Tell fast commit about any range that is now mapped or unwritten. */
        if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
                                map->m_flags & EXT4_MAP_MAPPED))
                ext4_fc_track_range(handle, inode, map->m_lblk,
                                        map->m_lblk + map->m_len - 1);
        if (retval < 0)
                ext_debug(inode, "failed with err %d\n", retval);
        return retval;
}

/*
 * Replace the EXT4_MAP_FLAGS bits of bh->b_state with those from @flags.
 * Buffer heads attached to a page may have b_state modified concurrently,
 * so those are updated with a compare-and-exchange loop.
 */
static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
{
        unsigned long cur;
        unsigned long want;

        flags &= EXT4_MAP_FLAGS;

        if (bh->b_page) {
                /*
                 * Someone else may be modifying b_state. Be careful! This
                 * is ugly but once we get rid of using bh as a container
                 * for mapping information to pass to / from get_block
                 * functions, this can go away.
                 */
                cur = READ_ONCE(bh->b_state);
                do {
                        want = (cur & ~EXT4_MAP_FLAGS) | flags;
                } while (unlikely(!try_cmpxchg(&bh->b_state, &cur, want)));
                return;
        }

        /* Dummy buffer_head (no page attached): plain store is enough. */
        bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
}

/*
 * Common get_block implementation: map @iblock of @inode via
 * ext4_map_blocks() with the given @flags and transfer the result into @bh.
 *
 * Returns 0 for both a mapped extent and a hole (bh->b_size tells how far
 * the result extends), -ERANGE for inline-data inodes, or the negative
 * error from ext4_map_blocks().
 */
static int _ext4_get_block(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int flags)
{
        struct ext4_map_blocks map;
        int nr;

        if (ext4_has_inline_data(inode))
                return -ERANGE;

        map.m_lblk = iblock;
        map.m_len = bh->b_size >> inode->i_blkbits;

        nr = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
                             flags);
        if (nr < 0)
                return nr;

        if (nr > 0) {
                map_bh(bh, inode->i_sb, map.m_pblk);
                ext4_update_bh_state(bh, map.m_flags);
        }

        /* Mapped extent or hole: either way report its length in bytes. */
        bh->b_size = inode->i_sb->s_blocksize * map.m_len;
        return 0;
}

/*
 * Standard get_block callback: a non-zero @create requests allocation
 * (EXT4_GET_BLOCKS_CREATE), zero means lookup only.
 */
int ext4_get_block(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh, int create)
{
        int flags = 0;

        if (create)
                flags = EXT4_GET_BLOCKS_CREATE;

        return _ext4_get_block(inode, iblock, bh, flags);
}

/*
 * get_block callback for buffered writes that must create an unwritten
 * extent when the blocks are not allocated yet.  The extent is converted
 * to written once the IO has completed.  @create is only logged; the
 * mapping is always requested with EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT.
 */
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh_result, int create)
{
        int err;

        ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
                   inode->i_ino, create);

        err = _ext4_get_block(inode, iblock, bh_result,
                              EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
        if (err)
                return err;

        /*
         * An unwritten buffer is flagged as new so that partial writes
         * zero it out properly; otherwise stale data could be exposed.
         */
        if (buffer_unwritten(bh_result))
                set_buffer_new(bh_result);

        return 0;
}

/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096

/*
 * Map logical block @block of @inode and return a buffer head for it.
 *
 * `handle' can be NULL if create is zero
 *
 * Returns NULL for a hole on a pure lookup, ERR_PTR(-ENOSPC) if nothing
 * was mapped although creation was requested, ERR_PTR() of any other
 * failure, or the buffer head.  With EXT4_GET_BLOCKS_CACHED_NOWAIT the
 * result of sb_find_get_block() is returned as is.
 */
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
                                ext4_lblk_t block, int map_flags)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_map_blocks map;
        struct buffer_head *bh;
        int create = map_flags & EXT4_GET_BLOCKS_CREATE;
        bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT;
        int err;

        ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                    || handle != NULL || create == 0);
        ASSERT(create == 0 || !nowait);

        map.m_lblk = block;
        map.m_len = 1;
        err = ext4_map_blocks(handle, inode, &map, map_flags);

        if (err < 0)
                return ERR_PTR(err);
        if (err == 0)
                return create ? ERR_PTR(-ENOSPC) : NULL;

        if (nowait)
                return sb_find_get_block(sb, map.m_pblk);

        bh = sb_getblk(sb, map.m_pblk);
        if (unlikely(!bh))
                return ERR_PTR(-ENOMEM);

        if (!(map.m_flags & EXT4_MAP_NEW)) {
                BUFFER_TRACE(bh, "not a new buffer");
                return bh;
        }

        ASSERT(create != 0);
        ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                    || (handle != NULL));

        /*
         * Now that we do not always journal data, we should
         * keep in mind whether this should always journal the
         * new buffer as metadata.  For now, regular file
         * writes use ext4_get_block instead, so it's not a
         * problem.
         */
        lock_buffer(bh);
        BUFFER_TRACE(bh, "call get_create_access");
        err = ext4_journal_get_create_access(handle, sb, bh, EXT4_JTR_NONE);
        if (unlikely(err)) {
                unlock_buffer(bh);
                goto errout;
        }

        /* A freshly allocated block starts out zero-filled. */
        if (!buffer_uptodate(bh)) {
                memset(bh->b_data, 0, sb->s_blocksize);
                set_buffer_uptodate(bh);
        }
        unlock_buffer(bh);

        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_metadata(handle, inode, bh);
        if (unlikely(err))
                goto errout;

        return bh;

errout:
        brelse(bh);
        return ERR_PTR(err);
}

/*
 * Like ext4_getblk(), but additionally reads the block in when its buffer
 * is not up to date.  Returns the buffer head, NULL, or an ERR_PTR().
 */
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
                               ext4_lblk_t block, int map_flags)
{
        struct buffer_head *bh = ext4_getblk(handle, inode, block, map_flags);
        int err;

        /* Error pointer, NULL, or already valid contents: nothing to do. */
        if (IS_ERR(bh) || !bh || ext4_buffer_uptodate(bh))
                return bh;

        err = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true);
        if (!err)
                return bh;

        put_bh(bh);
        return ERR_PTR(err);
}

/*
 * Read a contiguous batch of @bh_count blocks starting at logical block
 * @block into @bhs.  With @wait false the reads are only submitted.
 * Returns 0 on success; on failure every buffer obtained so far is
 * released, its slot set to NULL, and a negative error is returned.
 */
int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
                     bool wait, struct buffer_head **bhs)
{
        int idx, err;

        /* Phase 1: look up a buffer head for every block. */
        for (idx = 0; idx < bh_count; idx++) {
                bhs[idx] = ext4_getblk(NULL, inode, block + idx,
                                       0 /* map_flags */);
                if (IS_ERR(bhs[idx])) {
                        err = PTR_ERR(bhs[idx]);
                        /* Only the entries before this one need releasing. */
                        bh_count = idx;
                        goto out_brelse;
                }
        }

        /* Phase 2: submit reads; NULL entries are valid (holes). */
        for (idx = 0; idx < bh_count; idx++) {
                struct buffer_head *bh = bhs[idx];

                if (!bh || ext4_buffer_uptodate(bh))
                        continue;
                ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, false);
        }

        if (!wait)
                return 0;

        /* Phase 3: wait for everything before checking any result. */
        for (idx = 0; idx < bh_count; idx++) {
                if (bhs[idx])
                        wait_on_buffer(bhs[idx]);
        }

        for (idx = 0; idx < bh_count; idx++) {
                if (!bhs[idx] || buffer_uptodate(bhs[idx]))
                        continue;
                err = -EIO;
                goto out_brelse;
        }
        return 0;

out_brelse:
        for (idx = 0; idx < bh_count; idx++) {
                brelse(bhs[idx]);
                bhs[idx] = NULL;
        }
        return err;
}

/*
 * Walk the circular list of buffers starting at @head and call @fn on each
 * buffer overlapping the byte range [@from, @to).  The walk stops at the
 * first non-zero return from @fn, which becomes the return value.  For
 * buffers outside the range, *@partial (if given) is set to 1 when such a
 * buffer is not up to date.
 */
int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
                           struct buffer_head *head,
                           unsigned from,
                           unsigned to,
                           int *partial,
                           int (*fn)(handle_t *handle, struct inode *inode,
                                     struct buffer_head *bh))
{
        const unsigned blocksize = head->b_size;
        struct buffer_head *bh = head;
        struct buffer_head *next;
        unsigned start = 0;
        unsigned end;
        int ret = 0;

        /* "bh != head || !start" is true until the list wraps around. */
        while (ret == 0 && (bh != head || !start)) {
                /* Fetch the successor before @fn gets to touch the buffer. */
                next = bh->b_this_page;
                end = start + blocksize;

                if (end > from && start < to)
                        ret = (*fn)(handle, inode, bh);
                else if (partial && !buffer_uptodate(bh))
                        *partial = 1;

                start = end;
                bh = next;
        }
        return ret;
}

/*
 * Helper for dirtying journalled data.  Besides handing the buffer to
 * ext4_handle_dirty_metadata(), the folio is marked dirty so that the
 * writeback code knows this page (and inode) contains dirty data.
 * ext4_writepages() then commits the appropriate transaction to make the
 * data stable.
 */
static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
{
        struct folio *folio = bh->b_folio;

        folio_mark_dirty(folio);

        return ext4_handle_dirty_metadata(handle, NULL, bh);
}

/*
 * Get journal write access to @bh.  Unmapped or freed buffers are skipped
 * (returning 0).  A buffer that was dirty on entry is dirtied again through
 * ext4_dirty_journalled_data() once write access has been obtained.
 */
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
                                struct buffer_head *bh)
{
        int was_dirty = buffer_dirty(bh);
        int err;

        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;

        /*
         * __block_write_begin() could have dirtied some buffers. Clean
         * the dirty bit as jbd2_journal_get_write_access() could complain
         * otherwise about fs integrity issues. Setting of the dirty bit
         * by __block_write_begin() isn't a real problem here as we clear
         * the bit before releasing a page lock and thus writeback cannot
         * ever write the buffer.
         */
        if (was_dirty)
                clear_buffer_dirty(bh);

        BUFFER_TRACE(bh, "get write access");
        err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
                                            EXT4_JTR_NONE);
        if (err || !was_dirty)
                return err;

        return ext4_dirty_journalled_data(handle, bh);
}

#ifdef CONFIG_FS_ENCRYPTION
/*
 * Prepare the buffers of a locked @folio for a write of @len bytes at file
 * position @pos (variant built when CONFIG_FS_ENCRYPTION is set).
 *
 * Buffers overlapping the write range that are not yet mapped are mapped
 * through @get_block with create == 1.  Blocks that are only partially
 * covered by the write and are not up to date are read in, and decrypted
 * afterwards when the inode uses fs-layer crypto.
 *
 * Returns 0 on success or a negative error; on error, new buffers in the
 * range are zeroed via folio_zero_new_buffers().
 */
static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
                                  get_block_t *get_block)
{
        /* Byte range of the write, relative to the start of the page. */
        unsigned from = pos & (PAGE_SIZE - 1);
        unsigned to = from + len;
        struct inode *inode = folio->mapping->host;
        unsigned block_start, block_end;
        sector_t block;
        int err = 0;
        unsigned blocksize = inode->i_sb->s_blocksize;
        unsigned bbits;
        /*
         * Buffers with reads in flight.  Only blocks partially covered by
         * [from, to) are read, i.e. those at the two edges of the range.
         */
        struct buffer_head *bh, *head, *wait[2];
        int nr_wait = 0;
        int i;

        BUG_ON(!folio_test_locked(folio));
        BUG_ON(from > PAGE_SIZE);
        BUG_ON(to > PAGE_SIZE);
        BUG_ON(from > to);

        head = folio_buffers(folio);
        if (!head)
                head = create_empty_buffers(folio, blocksize, 0);
        bbits = ilog2(blocksize);
        /* Logical block number of the first buffer in the folio. */
        block = (sector_t)folio->index << (PAGE_SHIFT - bbits);

        for (bh = head, block_start = 0; bh != head || !block_start;
            block++, block_start = block_end, bh = bh->b_this_page) {
                block_end = block_start + blocksize;
                /* Buffer entirely outside the write range. */
                if (block_end <= from || block_start >= to) {
                        if (folio_test_uptodate(folio)) {
                                set_buffer_uptodate(bh);
                        }
                        continue;
                }
                if (buffer_new(bh))
                        clear_buffer_new(bh);
                if (!buffer_mapped(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, 1);
                        if (err)
                                break;
                        if (buffer_new(bh)) {
                                if (folio_test_uptodate(folio)) {
                                        clear_buffer_new(bh);
                                        set_buffer_uptodate(bh);
                                        mark_buffer_dirty(bh);
                                        continue;
                                }
                                /*
                                 * Zero the parts of the new block that the
                                 * write itself will not cover.
                                 */
                                if (block_end > to || block_start < from)
                                        folio_zero_segments(folio, to,
                                                            block_end,
                                                            block_start, from);
                                continue;
                        }
                }
                if (folio_test_uptodate(folio)) {
                        set_buffer_uptodate(bh);
                        continue;
                }
                /*
                 * A partially overwritten block with existing on-disk
                 * contents must be read before it can be modified.
                 */
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                    !buffer_unwritten(bh) &&
                    (block_start < from || block_end > to)) {
                        ext4_read_bh_lock(bh, 0, false);
                        wait[nr_wait++] = bh;
                }
        }
        /*
         * If we issued read requests, let them complete.
         */
        for (i = 0; i < nr_wait; i++) {
                wait_on_buffer(wait[i]);
                if (!buffer_uptodate(wait[i]))
                        err = -EIO;
        }
        if (unlikely(err)) {
                folio_zero_new_buffers(folio, from, to);
        } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
                /* Decrypt in place the blocks that were just read. */
                for (i = 0; i < nr_wait; i++) {
                        int err2;

                        err2 = fscrypt_decrypt_pagecache_blocks(folio,
                                                blocksize, bh_offset(wait[i]));
                        if (err2) {
                                clear_buffer_uptodate(wait[i]);
                                err = err2;
                        }
                }
        }

        return err;
}
#endif

/*
 * Prepare a buffered write of @len bytes at @pos into @mapping.
 *
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext4_get_block()
 * and the ext4_write_end().  So doing the jbd2_journal_start at the start of
 * ext4_write_begin() is the right place.
 *
 * On success returns 0 with *@pagep set to the prepared page (or 0 right
 * away when ext4_try_to_write_inline_data() reports the write as handled).
 * On failure returns a negative error; blocks that were instantiated beyond
 * i_size are truncated again, and an -ENOSPC failure is retried while
 * ext4_should_retry_alloc() allows it.  @file and @fsdata are not used in
 * this function body.
 */
static int ext4_write_begin(struct file *file, struct address_space *mapping,
                            loff_t pos, unsigned len,
                            struct page **pagep, void **fsdata)
{
        struct inode *inode = mapping->host;
        int ret, needed_blocks;
        handle_t *handle;
        int retries = 0;
        struct folio *folio;
        pgoff_t index;
        unsigned from, to;

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

        trace_ext4_write_begin(inode, pos, len);
        /*
         * Reserve one block more for addition to orphan list in case
         * we allocate blocks but write fails for some reason
         */
        needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
        index = pos >> PAGE_SHIFT;
        from = pos & (PAGE_SIZE - 1);
        to = from + len;

        if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
                ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
                                                    pagep);
                if (ret < 0)
                        return ret;
                /* A return of 1 means the inline path took care of it. */
                if (ret == 1)
                        return 0;
        }

        /*
         * __filemap_get_folio() can take a long time if the
         * system is thrashing due to memory pressure, or if the folio
         * is being written back.  So grab it first before we start
         * the transaction handle.  This also allows us to allocate
         * the folio (if needed) without using GFP_NOFS.
         */
retry_grab:
        folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
                                        mapping_gfp_mask(mapping));
        if (IS_ERR(folio))
                return PTR_ERR(folio);
        /*
         * The same as page allocation, we prealloc buffer heads before
         * starting the handle.
         */
        if (!folio_buffers(folio))
                create_empty_buffers(folio, inode->i_sb->s_blocksize, 0);

        /* Drop the folio lock while the journal handle is started. */
        folio_unlock(folio);

retry_journal:
        handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
        if (IS_ERR(handle)) {
                folio_put(folio);
                return PTR_ERR(handle);
        }

        folio_lock(folio);
        if (folio->mapping != mapping) {
                /* The folio got truncated from under us */
                folio_unlock(folio);
                folio_put(folio);
                ext4_journal_stop(handle);
                goto retry_grab;
        }
        /* In case writeback began while the folio was unlocked */
        folio_wait_stable(folio);

        /*
         * Map the buffers for the write; with dioread_nolock the blocks
         * are mapped through ext4_get_block_unwritten() instead.
         */
#ifdef CONFIG_FS_ENCRYPTION
        if (ext4_should_dioread_nolock(inode))
                ret = ext4_block_write_begin(folio, pos, len,
                                             ext4_get_block_unwritten);
        else
                ret = ext4_block_write_begin(folio, pos, len, ext4_get_block);
#else
        if (ext4_should_dioread_nolock(inode))
                ret = __block_write_begin(&folio->page, pos, len,
                                          ext4_get_block_unwritten);
        else
                ret = __block_write_begin(&folio->page, pos, len, ext4_get_block);
#endif
        /* In data=journal mode, get journal write access to each buffer. */
        if (!ret && ext4_should_journal_data(inode)) {
                ret = ext4_walk_page_buffers(handle, inode,
                                             folio_buffers(folio), from, to,
                                             NULL, do_journal_get_write_access);
        }

        if (ret) {
                bool extended = (pos + len > inode->i_size) &&
                                !ext4_verity_in_progress(inode);

                folio_unlock(folio);
                /*
                 * __block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
                 * i_size_read because we hold i_rwsem.
                 *
                 * Add inode to orphan list in case we crash before
                 * truncate finishes
                 */
                if (extended && ext4_can_truncate(inode))
                        ext4_orphan_add(handle, inode);

                ext4_journal_stop(handle);
                if (extended) {
                        ext4_truncate_failed_write(inode);
                        /*
                         * If truncate failed early the inode might
                         * still be on the orphan list; we need to
                         * make sure the inode is removed from the
                         * orphan list in that case.
                         */
                        if (inode->i_nlink)
                                ext4_orphan_del(NULL, inode);
                }

                /*
                 * The folio reference is still held here, so a retry
                 * restarts from the journal handle, not the folio lookup.
                 */
                if (ret == -ENOSPC &&
                    ext4_should_retry_alloc(inode->i_sb, &retries))
                        goto retry_journal;
                folio_put(folio);
                return ret;
        }
        *pagep = &folio->page;
        return ret;
}

/*
 * Per-buffer callback for write_end() in data=journal mode.
 *
 * Buffers that are not mapped, or that have already been freed, are left
 * alone and reported as success.  Every other buffer is marked uptodate
 * and passed to ext4_dirty_journalled_data(); its meta and prio bits are
 * cleared afterwards.  The result of ext4_dirty_journalled_data() is
 * returned to the caller.
 */
static int write_end_fn(handle_t *handle, struct inode *inode,
                        struct buffer_head *bh)
{
        int err = 0;

        if (buffer_mapped(bh) && !buffer_freed(bh)) {
                set_buffer_uptodate(bh);
                err = ext4_dirty_journalled_data(handle, bh);
                clear_buffer_meta(bh);
                clear_buffer_prio(bh);
        }
        return err;
}

/*
 * Finish a buffered write: commit the copied range with block_write_end(),
 * pick up the new inode size and stop the current journal handle.
 * `file' can be NULL - eg, when called from page_symlink().
 *
 * ext4 never places buffers on inode->i_mapping->i_private_list.  metadata
 * buffers are managed internally.
 *
 * Returns the byte count reported by block_write_end(), or the first error
 * coming from ext4_mark_inode_dirty() / ext4_journal_stop().  Inodes that
 * may still hold inline data are handed to ext4_write_inline_data_end().
 */
static int ext4_write_end(struct file *file,
                          struct address_space *mapping,
                          loff_t pos, unsigned len, unsigned copied,
                          struct page *page, void *fsdata)
{
        struct folio *folio = page_folio(page);
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        loff_t prev_size = inode->i_size;
        bool in_verity = ext4_verity_in_progress(inode);
        int isize_dirty = 0;
        int err = 0;
        int stop_err;

        trace_ext4_write_end(inode, pos, len, copied);

        if (ext4_has_inline_data(inode) &&
            ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
                return ext4_write_inline_data_end(inode, pos, len, copied,
                                                  folio);
        }

        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

        /*
         * i_size has to be updated while the folio is still locked,
         * otherwise page writeout could come in and zero beyond i_size.
         *
         * While FS_IOC_ENABLE_VERITY runs on this inode, Merkle tree
         * blocks are written past EOF, hence no i_size update then.
         */
        if (!in_verity)
                isize_dirty = ext4_update_inode_size(inode, pos + copied);
        folio_unlock(folio);
        folio_put(folio);

        if (!in_verity && prev_size < pos)
                pagecache_isize_extended(inode, prev_size, pos);

        /*
         * The inode is dirtied only after the folio lock was dropped: this
         * keeps the folio lock hold time short and avoids forcing a lock
         * ordering between folio lock and transaction start for
         * journaling filesystems.
         */
        if (isize_dirty)
                err = ext4_mark_inode_dirty(handle, inode);

        /*
         * More blocks may have been allocated than data was copied, which
         * leaves blocks outside inode->i_size.  Put the inode on the
         * orphan list now; the truncate happens once the handle is stopped.
         */
        if (!in_verity && pos + len > inode->i_size &&
            ext4_can_truncate(inode))
                ext4_orphan_add(handle, inode);

        stop_err = ext4_journal_stop(handle);
        if (!err)
                err = stop_err;

        if (!in_verity && pos + len > inode->i_size) {
                ext4_truncate_failed_write(inode);
                /*
                 * A truncate that failed early can leave the inode on the
                 * orphan list; take it off again in that case.
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
        }

        if (err)
                return err;
        return copied;
}

/*
 * Private variant of folio_zero_new_buffers() for data=journalled mode.
 * Instead of marking the buffer dirty, each zeroed buffer is passed to
 * write_end_fn(), which calls ext4_dirty_journalled_data().
 *
 * Only buffers flagged new that overlap [@from, @to) are touched; their
 * new flag is cleared whether or not zeroing was needed.
 */
static void ext4_journalled_zero_new_buffers(handle_t *handle,
                                            struct inode *inode,
                                            struct folio *folio,
                                            unsigned from, unsigned to)
{
        struct buffer_head *first = folio_buffers(folio);
        struct buffer_head *cur = first;
        unsigned int bh_start = 0;

        do {
                unsigned int bh_end = bh_start + cur->b_size;

                if (buffer_new(cur) && bh_end > from && bh_start < to) {
                        if (!folio_test_uptodate(folio)) {
                                unsigned zero_from, zero_len;

                                /* Clip the buffer to the requested range */
                                zero_from = max(from, bh_start);
                                zero_len = min(to, bh_end) - zero_from;

                                folio_zero_range(folio, zero_from, zero_len);
                                write_end_fn(handle, inode, cur);
                        }
                        clear_buffer_new(cur);
                }

                bh_start = bh_end;
                cur = cur->b_this_page;
        } while (cur != first);
}

/*
 * write_end() for data=journal mode.
 *
 * Hands the copied part of the folio's buffers to write_end_fn(), updates
 * the inode size (unless fs-verity is being enabled on the inode), records
 * the tid of the handle's transaction in i_datasync_tid and stops the
 * current handle.  When the write range reaches past i_size afterwards,
 * the inode is put on the orphan list (if it can be truncated) and
 * ext4_truncate_failed_write() is called once the handle is stopped.
 *
 * Inodes with inline data are handed to ext4_write_inline_data_end().
 *
 * Returns @copied (reset to 0 for a short copy into a folio that is not
 * uptodate) when no error was recorded, otherwise the first error seen.
 */
static int ext4_journalled_write_end(struct file *file,
                                     struct address_space *mapping,
                                     loff_t pos, unsigned len, unsigned copied,
                                     struct page *page, void *fsdata)
{
        struct folio *folio = page_folio(page);
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        loff_t old_size = inode->i_size;
        int ret = 0, ret2;
        int partial = 0;
        unsigned from, to;
        int size_changed = 0;
        bool verity = ext4_verity_in_progress(inode);

        trace_ext4_journalled_write_end(inode, pos, len, copied);
        /* Byte range of the write, relative to the start of a page */
        from = pos & (PAGE_SIZE - 1);
        to = from + len;

        /* This path cannot work without a valid handle */
        BUG_ON(!ext4_handle_valid(handle));

        if (ext4_has_inline_data(inode))
                return ext4_write_inline_data_end(inode, pos, len, copied,
                                                  folio);

        if (unlikely(copied < len) && !folio_test_uptodate(folio)) {
                /*
                 * Short copy into a folio that is not uptodate: report
                 * nothing as copied and zero the new buffers over the
                 * whole requested range.
                 */
                copied = 0;
                ext4_journalled_zero_new_buffers(handle, inode, folio,
                                                 from, to);
        } else {
                /* Short copy: zero new buffers in the uncopied tail only */
                if (unlikely(copied < len))
                        ext4_journalled_zero_new_buffers(handle, inode, folio,
                                                         from + copied, to);
                ret = ext4_walk_page_buffers(handle, inode,
                                             folio_buffers(folio),
                                             from, from + copied, &partial,
                                             write_end_fn);
                /*
                 * NOTE(review): 'partial' is presumably set by
                 * ext4_walk_page_buffers() when some buffers lie outside
                 * the walked range - confirm against its definition.
                 */
                if (!partial)
                        folio_mark_uptodate(folio);
        }
        /* Size update is done before the folio is unlocked below */
        if (!verity)
                size_changed = ext4_update_inode_size(inode, pos + copied);
        EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
        folio_unlock(folio);
        folio_put(folio);

        if (old_size < pos && !verity)
                pagecache_isize_extended(inode, old_size, pos);

        if (size_changed) {
                ret2 = ext4_mark_inode_dirty(handle, inode);
                if (!ret)
                        ret = ret2;
        }

        if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less. We will have blocks allocated outside
                 * inode->i_size. So truncate them
                 */
                ext4_orphan_add(handle, inode);

        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
        /* The truncate itself runs only after the handle was stopped */
        if (pos + len > inode->i_size && !verity) {
                ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
        }

        return ret ? ret : copied;
}

/*
 * Reserve space for a single cluster.
 *
 * Takes a quota reservation for one cluster worth of blocks and then
 * claims one free cluster under i_block_reservation_lock.  On success the
 * inode's reserved data block count is bumped and 0 is returned.  If the
 * quota reservation fails its error is returned; if no free cluster can
 * be claimed the quota reservation is undone and -ENOSPC is returned.
 */
static int ext4_da_reserve_space(struct inode *inode)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
        int quota_err, claim_err;

        /*
         * Only data is reserved here.  Metadata quota is charged at
         * writeout time, which avoids over-estimating metadata at the
         * price of possibly going over by a small amount in the end.
         */
        quota_err = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
        if (quota_err)
                return quota_err;

        spin_lock(&ei->i_block_reservation_lock);
        claim_err = ext4_claim_free_clusters(sbi, 1, 0);
        if (!claim_err) {
                ei->i_reserved_data_blocks++;
                trace_ext4_da_reserve_space(inode);
        }
        spin_unlock(&ei->i_block_reservation_lock);

        if (claim_err) {
                /* Give the quota reservation back, nothing was claimed */
                dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
                return -ENOSPC;
        }

        return 0;       /* success */
}

/*
 * ext4_da_release_space - give back delayed allocation reservations
 *
 * @inode - inode whose reservation is reduced
 * @to_free - number of reserved units to release; 0 is a no-op
 *
 * Under i_block_reservation_lock, subtracts @to_free from the inode's
 * reserved data block count and from the filesystem's dirty clusters
 * counter.  The matching quota reservation is released after the lock
 * has been dropped.  A request larger than what is reserved is reported
 * and clamped to the reserved amount.
 */
void ext4_da_release_space(struct inode *inode, int to_free)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!to_free)
                return;                /* Nothing to release, exit */

        spin_lock(&ei->i_block_reservation_lock);

        trace_ext4_da_release_space(inode, to_free);
        if (unlikely(to_free > ei->i_reserved_data_blocks)) {
                /*
                 * if there aren't enough reserved blocks, then the
                 * counter is messed up somewhere.  Complain loudly and
                 * release only what is actually reserved, so that the
                 * per-inode counter cannot wrap below zero.
                 */
                ext4_warning(inode->i_sb, "ext4_da_release_space: "
                         "ino %lu, to_free %d with only %u reserved "
                         "data blocks", inode->i_ino, to_free,
                         ei->i_reserved_data_blocks);
                WARN_ON(1);
                to_free = ei->i_reserved_data_blocks;
        }
        ei->i_reserved_data_blocks -= to_free;

        /* update fs dirty data blocks counter */
        percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);

        spin_unlock(&ei->i_block_reservation_lock);

        /* Quota is released with the (possibly clamped) amount */
        dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
}

/*
 * Delayed allocation stuff
 */

/*
 * State shared by the writepages helpers in this file: the inputs handed
 * to ext4_do_writepages(), the page range being walked, and the extent of
 * buffers currently being collected for mapping.
 */
struct mpage_da_data {
        /* These are input fields for ext4_do_writepages() */
        struct inode *inode;
        struct writeback_control *wbc;
        unsigned int can_map:1;        /* Can writepages call map blocks? */

        /* These are internal state of ext4_do_writepages() */
        pgoff_t first_page;        /* The first page to write */
        pgoff_t next_page;        /* Current page to examine */
        pgoff_t last_page;        /* Last page to examine */
        /*
         * Extent to map - this can be after first_page because that can be
         * fully mapped. We somewhat abuse m_flags to store whether the extent
         * is delalloc or unwritten.
         */
        struct ext4_map_blocks map;
        struct ext4_io_submit io_submit;        /* IO submission data */
        /*
         * When clear, mpage_add_bh_to_extent() refuses to start a new
         * extent ("we cannot map unless handle is started").
         */
        unsigned int do_map:1;
        /*
         * Set by mpage_process_page_bufs() once the scan reaches the block
         * limit derived from i_size; cleared again by
         * mpage_release_unused_pages() when it releases pages.
         */
        unsigned int scanned_until_end:1;
        /* NOTE(review): not referenced in the code visible here - confirm */
        unsigned int journalled_more_data:1;
};

/*
 * Unlock the folios in [mpd->first_page, mpd->next_page - 1].
 *
 * @mpd - writeback state describing the page range
 * @invalidate - when true, also drop the range from the extent status
 *               tree and invalidate the folios (clear dirty for mapped
 *               folios, invalidate buffers, clear uptodate)
 *
 * Folios that start before first_page or extend past the end of the range
 * are skipped.  Every processed folio must be locked and must not be under
 * writeback.
 */
static void mpage_release_unused_pages(struct mpage_da_data *mpd,
                                       bool invalidate)
{
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
        struct folio_batch fbatch;
        pgoff_t cur, last_idx;
        unsigned found, n;

        /* This is necessary when next_page == 0. */
        if (mpd->first_page >= mpd->next_page)
                return;

        mpd->scanned_until_end = 0;
        cur = mpd->first_page;
        last_idx = mpd->next_page - 1;

        if (invalidate) {
                ext4_lblk_t start = cur << (PAGE_SHIFT - inode->i_blkbits);
                ext4_lblk_t last = last_idx << (PAGE_SHIFT - inode->i_blkbits);

                /*
                 * i_data_sem keeps this from racing with the extent status
                 * tree scans done by ext4_insert_delayed_block()
                 */
                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_es_remove_extent(inode, start, last - start + 1);
                up_write(&EXT4_I(inode)->i_data_sem);
        }

        folio_batch_init(&fbatch);
        while (cur <= last_idx) {
                found = filemap_get_folios(mapping, &cur, last_idx, &fbatch);
                if (!found)
                        break;

                for (n = 0; n < found; n++) {
                        struct folio *folio = fbatch.folios[n];

                        /* Skip folios not fully inside the range */
                        if (folio->index < mpd->first_page ||
                            folio_next_index(folio) - 1 > last_idx)
                                continue;

                        BUG_ON(!folio_test_locked(folio));
                        BUG_ON(folio_test_writeback(folio));
                        if (invalidate) {
                                if (folio_mapped(folio))
                                        folio_clear_dirty_for_io(folio);
                                block_invalidate_folio(folio, 0,
                                                folio_size(folio));
                                folio_clear_uptodate(folio);
                        }
                        folio_unlock(folio);
                }
                folio_batch_release(&fbatch);
        }
}

/*
 * Log, at KERN_CRIT level, the filesystem's free block count, the summed
 * free and dirty cluster counters converted to blocks, and the number of
 * data blocks currently reserved by @inode.
 */
static void ext4_print_free_blocks(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei = EXT4_I(inode);

        ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
                 EXT4_C2B(sbi, ext4_count_free_clusters(sb)));

        ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
        ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
                 (long long) EXT4_C2B(sbi,
                        percpu_counter_sum(&sbi->s_freeclusters_counter)));
        ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
                 (long long) EXT4_C2B(sbi,
                        percpu_counter_sum(&sbi->s_dirtyclusters_counter)));

        ext4_msg(sb, KERN_CRIT, "Block reservation details");
        ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
                 ei->i_reserved_data_blocks);
}

/*
 * ext4_insert_delayed_block - record a delayed block in the extents status
 *                             tree, reserving a cluster/block for it or
 *                             flagging its cluster as already allocated
 *                             where needed
 *
 * @inode - file containing the newly added block
 * @lblk - logical block to be added
 *
 * Returns 0 on success, negative error code on failure.
 */
static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        bool allocated = false;
        int ret;

        /*
         * Without bigalloc every delayed block needs its own reservation.
         *
         * With bigalloc, a cluster that already holds a delayed, written
         * or unwritten extent has been accounted for and must not be
         * reserved again.  If the cluster is shared with a written or
         * unwritten extent, the block is inserted as "allocated" so that
         * a pending reservation is made for the cluster.  Written and
         * unwritten extents can be purged from the extents status tree
         * under memory pressure, so the extent tree itself is consulted
         * when the status tree search finds nothing.
         */
        if (sbi->s_cluster_ratio == 1) {
                ret = ext4_da_reserve_space(inode);
                if (ret != 0)   /* ENOSPC */
                        return ret;
        } else if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
                /* bigalloc, and no delayed-only block in this cluster */
                if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) {
                        allocated = true;
                } else {
                        ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk));
                        if (ret < 0)
                                return ret;
                        if (ret > 0) {
                                allocated = true;
                        } else {
                                ret = ext4_da_reserve_space(inode);
                                if (ret != 0)   /* ENOSPC */
                                        return ret;
                        }
                }
        }

        ext4_es_insert_delayed_block(inode, lblk, allocated);
        return 0;
}

/*
 * This function grabs code from the very beginning of
 * ext4_map_blocks, but assumes that the caller is from delayed write
 * time. This function looks up the requested blocks and, when no mapping
 * exists, records a delayed block under the protection of i_data_sem and
 * marks the buffer as new and delayed.
 *
 * @inode - inode the block belongs to
 * @iblock - logical block looked up in the extent status tree
 * @map - mapping request; m_lblk and m_len are inputs, m_pblk, m_len and
 *        m_flags are filled in when an existing mapping is found
 * @bh - buffer head that is set up as delayed when there is no mapping
 *
 * Returns a positive number of mapped blocks when a written or unwritten
 * mapping was found, 0 when @bh has been set up as a delayed buffer, or
 * the error returned by the block lookup / delayed block insertion.
 */
static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
                              struct ext4_map_blocks *map,
                              struct buffer_head *bh)
{
        struct extent_status es;
        int retval;
        /* Placeholder block number stored in delayed buffers */
        sector_t invalid_block = ~((sector_t) 0xffff);
#ifdef ES_AGGRESSIVE_TEST
        struct ext4_map_blocks orig_map;

        memcpy(&orig_map, map, sizeof(*map));
#endif

        /* Use ~0 if the default is below the filesystem's block count */
        if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
                invalid_block = ~0;

        map->m_flags = 0;
        ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
                  (unsigned long) map->m_lblk);

        /* Lookup extent status tree firstly */
        if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
                /* Cached hole: go straight to adding a delayed block */
                if (ext4_es_is_hole(&es))
                        goto add_delayed;

                /*
                 * Delayed extent could be allocated by fallocate.
                 * So we need to check it.
                 */
                if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
                        map_bh(bh, inode->i_sb, invalid_block);
                        set_buffer_new(bh);
                        set_buffer_delay(bh);
                        return 0;
                }

                /*
                 * Translate the cached extent into the request: physical
                 * block at the offset of iblock inside the extent, length
                 * clamped to what the caller asked for.
                 */
                map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
                retval = es.es_len - (iblock - es.es_lblk);
                if (retval > map->m_len)
                        retval = map->m_len;
                map->m_len = retval;
                if (ext4_es_is_written(&es))
                        map->m_flags |= EXT4_MAP_MAPPED;
                else if (ext4_es_is_unwritten(&es))
                        map->m_flags |= EXT4_MAP_UNWRITTEN;
                else
                        BUG();

#ifdef ES_AGGRESSIVE_TEST
                ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
#endif
                return retval;
        }

        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
         */
        down_read(&EXT4_I(inode)->i_data_sem);
        /* Inline data inodes are treated as having no mapping here */
        if (ext4_has_inline_data(inode))
                retval = 0;
        else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                retval = ext4_ext_map_blocks(NULL, inode, map, 0);
        else
                retval = ext4_ind_map_blocks(NULL, inode, map, 0);
        if (retval < 0) {
                up_read(&EXT4_I(inode)->i_data_sem);
                return retval;
        }
        if (retval > 0) {
                unsigned int status;

                if (unlikely(retval != map->m_len)) {
                        ext4_warning(inode->i_sb,
                                     "ES len assertion failed for inode "
                                     "%lu: retval %d != map->m_len %d",
                                     inode->i_ino, retval, map->m_len);
                        WARN_ON(1);
                }

                /* Cache the mapping just found in the extent status tree */
                status = map->m_flags & EXT4_MAP_UNWRITTEN ?
                                EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
                                      map->m_pblk, status);
                up_read(&EXT4_I(inode)->i_data_sem);
                return retval;
        }
        up_read(&EXT4_I(inode)->i_data_sem);

add_delayed:
        /* No mapping: insert a delayed block with i_data_sem held for write */
        down_write(&EXT4_I(inode)->i_data_sem);
        retval = ext4_insert_delayed_block(inode, map->m_lblk);
        up_write(&EXT4_I(inode)->i_data_sem);
        if (retval)
                return retval;

        map_bh(bh, inode->i_sb, invalid_block);
        set_buffer_new(bh);
        set_buffer_delay(bh);
        return retval;
}

/*
 * get_block_t callback used by ext4_da_write_begin().  It either maps
 * @bh to an existing block or has space reserved for a single block.
 *
 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
 * We also have b_blocknr = -1 and b_bdev initialized properly
 *
 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
 * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
 * initialized properly.
 *
 * Returns 0 on success or the negative result of ext4_da_map_blocks().
 * @create must be non-zero and @bh must cover exactly one block.
 */
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int create)
{
        struct ext4_map_blocks map;
        int nr_mapped = 0;

        BUG_ON(!create);
        BUG_ON(bh->b_size != inode->i_sb->s_blocksize);

        map.m_lblk = iblock;
        map.m_len = 1;

        /*
         * First find out whether the block is allocated already.
         * Preallocated blocks are unmapped but should be treated
         * the same as allocated blocks.
         */
        nr_mapped = ext4_da_map_blocks(inode, iblock, &map, bh);
        if (nr_mapped <= 0)
                return nr_mapped;

        map_bh(bh, inode->i_sb, map.m_pblk);
        ext4_update_bh_state(bh, map.m_flags);

        if (!buffer_unwritten(bh))
                return 0;

        /*
         * A delayed write to an unwritten bh is marked new and mapped.
         * Mapped keeps us from calling get_block again when the same
         * offset is written once more; new makes sure a partial write
         * gets properly zeroed out.
         */
        set_buffer_new(bh);
        set_buffer_mapped(bh);
        return 0;
}

/*
 * Done with @folio: advance mpd->first_page past all of its pages and
 * drop the folio lock.
 */
static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio)
{
        long nr_pages = folio_nr_pages(folio);

        mpd->first_page += nr_pages;
        folio_unlock(folio);
}

/*
 * Clear the dirty state of @folio and pass it to ext4_bio_write_folio().
 * @folio must be the one at mpd->first_page.  When the folio extends past
 * i_size (and fs-verity is not being enabled), only the part up to i_size
 * is submitted.  Returns the result of ext4_bio_write_folio(); on success
 * wbc->nr_to_write is decremented.
 */
static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
{
        struct inode *inode = mpd->inode;
        loff_t isize;
        size_t wlen;
        int err;

        BUG_ON(folio->index != mpd->first_page);
        folio_clear_dirty_for_io(folio);

        /*
         * Careful: nothing protects the writeback path against i_size
         * changes, and the page may be writeably mapped into page tables,
         * so an application can grow i_size and write data through mmap
         * while writeback runs.  folio_clear_dirty_for_io()
         * write-protects the page in page tables and it cannot be written
         * to again until the folio lock is released.  Hence i_size may be
         * sampled for ext4_bio_write_folio() (which zeroes out the tail of
         * the written page) only after folio_clear_dirty_for_io().  The
         * barrier provided by folio_test_clear_dirty() in
         * folio_clear_dirty_for_io() makes sure i_size is really sampled
         * only after page tables are updated.
         */
        isize = i_size_read(inode);
        wlen = folio_size(folio);
        if (folio_pos(folio) + wlen > isize &&
            !ext4_verity_in_progress(inode))
                wlen = isize & (wlen - 1);

        err = ext4_bio_write_folio(&mpd->io_submit, folio, wlen);
        if (err)
                return err;

        mpd->wbc->nr_to_write--;
        return 0;
}

/*
 * Buffer state bits (unwritten, delay) that mpage_add_bh_to_extent()
 * copies into map->m_flags and compares when merging buffers into the
 * extent to map.
 */
#define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay))

/*
 * mballoc gives us at most this number of blocks...
 * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
 * The rest of mballoc seems to handle chunks up to full group size.
 */
#define MAX_WRITEPAGES_EXTENT_LEN 2048

/*
 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
 *
 * @mpd - extent of blocks
 * @lblk - logical number of the block in the file
 * @bh - buffer head we want to add to the extent
 *
 * Collects contiguous blocks that are in the same state.  A buffer that
 * needs no mapping for writeback yields 'true' as long as no extent has
 * been started (the caller can write it right away) and 'false' once an
 * extent exists.  For a buffer that does need mapping the function
 * returns true if the block has been added to the extent, false if it
 * couldn't be added.
 */
static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
                                   struct buffer_head *bh)
{
        struct ext4_map_blocks *ext = &mpd->map;
        bool needs_mapping;

        /* Only dirty, mapped buffers that are delayed or unwritten qualify */
        needs_mapping = buffer_dirty(bh) && buffer_mapped(bh) &&
                        (buffer_delay(bh) || buffer_unwritten(bh));
        if (!needs_mapping) {
                /* No extent so far => the buffer is written right away */
                return ext->m_len == 0;
        }

        if (ext->m_len == 0) {
                /* First block; mapping is impossible without a handle */
                if (!mpd->do_map)
                        return false;
                ext->m_lblk = lblk;
                ext->m_len = 1;
                ext->m_flags = bh->b_state & BH_FLAGS;
                return true;
        }

        /* Don't go larger than mballoc is willing to allocate */
        if (ext->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
                return false;

        /* Merging needs a directly following block in the same state */
        if (lblk != ext->m_lblk + ext->m_len)
                return false;
        if ((bh->b_state & BH_FLAGS) != ext->m_flags)
                return false;

        ext->m_len++;
        return true;
}

/*
 * mpage_process_page_bufs - submit page buffers for IO or add them to extent
 *
 * @mpd - extent of blocks for mapping
 * @head - the first buffer in the page
 * @bh - buffer we should start processing from
 * @lblk - logical number of the block in the file corresponding to @bh
 *
 * Walk through page buffers from @bh upto @head (exclusive) and either submit
 * the page for IO if all buffers in this page were mapped and there's no
 * accumulated extent of buffers to map or add buffers in the page to the
 * extent of buffers to map. The function returns 1 if the caller can continue
 * by processing the next page, 0 if it should stop adding buffers to the
 * extent to map because we cannot extend it anymore. It can also return value
 * < 0 in case of error during IO submission.
 */
static int mpage_process_page_bufs(struct mpage_da_data *mpd,
                                   struct buffer_head *head,
                                   struct buffer_head *bh,
                                   ext4_lblk_t lblk)
{
        struct inode *inode = mpd->inode;
        int err;
        /* Number of blocks needed to cover i_size, rounded up */
        ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1)
                                                        >> inode->i_blkbits;

        /* While fs-verity is being enabled, do not stop at i_size */
        if (ext4_verity_in_progress(inode))
                blocks = EXT_MAX_BLOCKS;

        do {
                BUG_ON(buffer_locked(bh));

                /* Stop at the block limit or when bh cannot join the extent */
                if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
                        /* Found extent to map? */
                        if (mpd->map.m_len)
                                return 0;
                        /* Buffer needs mapping and handle is not started? */
                        if (!mpd->do_map)
                                return 0;
                        /* Everything mapped so far and we hit EOF */
                        break;
                }
                /* Advance lblk and bh together until we wrap around to head */
        } while (lblk++, (bh = bh->b_this_page) != head);
        /* So far everything mapped? Submit the page for IO. */
        if (mpd->map.m_len == 0) {
                err = mpage_submit_folio(mpd, head->b_folio);
                if (err < 0)
                        return err;
                mpage_folio_done(mpd, head->b_folio);
        }
        /* Reached the block limit: remember it and tell the caller to stop */
        if (lblk >= blocks) {
                mpd->scanned_until_end = 1;
                return 0;
        }
        return 1;
}

/*
 * mpage_process_folio - update folio buffers corresponding to changed extent
 *                         and may submit fully mapped page for IO
 * @mpd: description of extent to map, on return next extent to map
 * @folio: Contains these buffers.
 * @m_lblk: in: logical block number matching the folio's first buffer;
 *          out: logical block number at which the walk stopped.
 * @m_pblk: in: physical block to hand to the next delayed buffer;
 *          out: next physical block that has not been handed out.
 * @map_bh: set on return: true when a buffer past the end of the mapped
 *          extent was reached (the folio may need further mapping), false
 *          when all buffers of the folio were walked.
 *
 * Scan given folio buffers corresponding to changed extent and update buffer
 * state according to new extent state.
 * We map delalloc buffers to their physical location, clear unwritten bits.
 * If the given folio is not fully mapped, we update @mpd to the next extent in
 * the given folio that needs mapping & return @map_bh as true.
 *
 * The number of bytes processed is added to the size of the current
 * io_end_vec of mpd->io_submit.io_end.
 *
 * Return: 0 on success, or a negative error code coming from
 * mpage_process_page_bufs() or ext4_alloc_io_end_vec(). Positive results of
 * mpage_process_page_bufs() are folded into 0.
 */
static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio,
                              ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
                              bool *map_bh)
{
        struct buffer_head *head, *bh;
        ext4_io_end_t *io_end = mpd->io_submit.io_end;
        ext4_lblk_t lblk = *m_lblk;
        ext4_fsblk_t pblock = *m_pblk;
        int err = 0;
        int blkbits = mpd->inode->i_blkbits;
        /* Bytes processed in this folio, not yet added to io_end_vec->size */
        ssize_t io_end_size = 0;
        struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);

        bh = head = folio_buffers(folio);
        do {
                /*
                 * Buffer before the start of the mapped extent: skip it.
                 * Note that 'continue' in a do-while jumps to the loop
                 * condition, so lblk and bh are still advanced.
                 */
                if (lblk < mpd->map.m_lblk)
                        continue;
                if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
                        /*
                         * Buffer after end of mapped extent.
                         * Find next buffer in the folio to map.
                         */
                        mpd->map.m_len = 0;
                        mpd->map.m_flags = 0;
                        /* Account what was processed before looking further */
                        io_end_vec->size += io_end_size;

                        err = mpage_process_page_bufs(mpd, head, bh, lblk);
                        /* Only negative values are errors for our caller */
                        if (err > 0)
                                err = 0;
                        /*
                         * A new extent was found and it starts past the
                         * current block: track it in a new io_end_vec that
                         * begins at the byte offset of that extent.
                         */
                        if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
                                io_end_vec = ext4_alloc_io_end_vec(io_end);
                                if (IS_ERR(io_end_vec)) {
                                        err = PTR_ERR(io_end_vec);
                                        goto out;
                                }
                                io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
                        }
                        *map_bh = true;
                        goto out;
                }
                /* Delayed buffer gets the next physical block of the extent */
                if (buffer_delay(bh)) {
                        clear_buffer_delay(bh);
                        bh->b_blocknr = pblock++;
                }
                clear_buffer_unwritten(bh);
                io_end_size += (1 << blkbits);
        } while (lblk++, (bh = bh->b_this_page) != head);

        /* Walked the whole folio without leaving the mapped extent */
        io_end_vec->size += io_end_size;
        *map_bh = false;
out:
        /* Hand the scan position back to the caller */
        *m_lblk = lblk;
        *m_pblk = pblock;
        return err;
}

/*
 * mpage_map_buffers - update buffers corresponding to changed extent and
 *                       submit fully mapped pages for IO
 *
 * @mpd - description of extent to map, on return next extent to map
 *
 * Scan buffers corresponding to changed extent (we expect corresponding pages
 * to be already locked) and update buffer state according to new extent state.
 * We map delalloc buffers to their physical location, clear unwritten bits,
 * and mark buffers as uninit when we perform writes to unwritten extents
 * and do extent conversion after IO is finished. If the last page is not fully
 * mapped, we update @map to the next extent in the last page that needs
 * mapping. Otherwise we submit the page for IO.
 */
static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
{
        struct folio_batch fbatch;
        unsigned nr, i;
        struct inode *inode = mpd->inode;
        int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
        pgoff_t start, end;
        ext4_lblk_t lblk;
        ext4_fsblk_t pblock;
        int err;
        bool map_bh = false;

        start = mpd->map.m_lblk >> bpp_bits;
        end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
        lblk = start << bpp_bits;
        pblock = mpd->map.m_pblk;

        folio_batch_init(&fbatch);
        while (start <= end) {
                nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch);
                if (nr == 0)
                        break;
                for (i = 0; i < nr; i++) {
                        struct folio *folio = fbatch.folios[i];

                        err = mpage_process_folio(mpd, folio, &lblk, &pblock,
                                                 &map_bh);
                        /*
                         * If map_bh is true, means page may require further bh
                         * mapping, or maybe the page was submitted for IO.
                         * So we return to call further extent mapping.
                         */
                        if (err < 0 || map_bh)
                                goto out;
                        /* Page fully mapped - let IO run! */
                        err = mpage_submit_folio(mpd, folio);
                        if (err < 0)
                                goto out;
                        mpage_folio_done(mpd, folio);
                }
                folio_batch_release(&fbatch);
        }
        /* Extent fully mapped and matches with page boundary. We are done. */
        mpd->map.m_len = 0;
        mpd->map.m_flags = 0;
        return 0;
out:
        folio_batch_release(&fbatch);
        return err;
}

/*
 * mpage_map_one_extent - map the extent described by @mpd->map
 *
 * @handle - handle for journal operations
 * @mpd - extent to map
 *
 * Calls ext4_map_blocks() on @mpd->map with flags suitable for writeback.
 * Returns 0 on success or the negative error from ext4_map_blocks().
 */
static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
{
        struct ext4_map_blocks *map = &mpd->map;
        struct inode *inode = mpd->inode;
        int nolock, ret;
        int flags;

        trace_ext4_da_write_pages_extent(inode, map);

        /*
         * ext4_map_blocks() either allocates delayed allocation blocks or
         * converts an unwritten extent to initialized (when preallocated
         * blocks have been written into). More metadata blocks than were
         * reserved may be needed, yet failing during writeback could lose
         * data, so metadata allocation is allowed to dip into reserved
         * blocks (METADATA_NOFAIL).
         */
        flags = EXT4_GET_BLOCKS_CREATE |
                EXT4_GET_BLOCKS_METADATA_NOFAIL |
                EXT4_GET_BLOCKS_IO_SUBMIT;

        nolock = ext4_should_dioread_nolock(inode);
        if (nolock)
                flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;

        /*
         * For delalloc blocks pass EXT4_GET_BLOCKS_DELALLOC_RESERVE: blocks
         * and quota were already checked when the data was copied into the
         * page cache.
         */
        if (map->m_flags & BIT(BH_Delay))
                flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;

        ret = ext4_map_blocks(handle, inode, map, flags);
        if (ret < 0)
                return ret;

        if (nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
                /* Move the reserved handle over to the io_end, once */
                if (!mpd->io_submit.io_end->handle) {
                        if (ext4_handle_valid(handle)) {
                                mpd->io_submit.io_end->handle =
                                                handle->h_rsv_handle;
                                handle->h_rsv_handle = NULL;
                        }
                }
                ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
        }

        BUG_ON(map->m_len == 0);
        return 0;
}

/*
 * mpage_map_and_submit_extent - map extent starting at mpd->map.m_lblk of
 *                                 length mpd->map.m_len and submit pages
 *                                 underlying it for IO
 *
 * @handle - handle for journal operations
 * @mpd - extent to map
 * @give_up_on_write - we set this to true iff there is a fatal error and there
 *                     is no hope of writing the data. The caller should discard
 *                     dirty pages to avoid infinite loops.
 *
 * The function maps extent starting at mpd->map.m_lblk of length
 * mpd->map.m_len. If it is
 * delayed, blocks are allocated, if it is unwritten, we may need to convert
 * them to initialized or split the described range from larger unwritten
 * extent. Note that we need not map all the described range since allocation
 * can return less blocks or the range is covered by more unwritten extents. We
 * cannot map more because we are limited by reserved transaction credits. On
 * the other hand we always make sure that the last touched page is fully
 * mapped so that it can be written out (and thus forward progress is
 * guaranteed). After mapping we submit all mapped pages for IO.
 *
 * Returns 0 on success or a negative error code. A transient error (-ENOMEM,
 * or -ENOSPC while free clusters remain) is returned without setting
 * @give_up_on_write so that the caller can retry.
 */
static int mpage_map_and_submit_extent(handle_t *handle,
                                       struct mpage_da_data *mpd,
                                       bool *give_up_on_write)
{
        struct inode *inode = mpd->inode;
        struct ext4_map_blocks *map = &mpd->map;
        int err;
        loff_t disksize;
        /* Set once at least one extent has been mapped successfully */
        int progress = 0;
        ext4_io_end_t *io_end = mpd->io_submit.io_end;
        struct ext4_io_end_vec *io_end_vec;

        /* The first io_end_vec starts at the byte offset of the extent */
        io_end_vec = ext4_alloc_io_end_vec(io_end);
        if (IS_ERR(io_end_vec))
                return PTR_ERR(io_end_vec);
        io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
        do {
                err = mpage_map_one_extent(handle, mpd);
                if (err < 0) {
                        struct super_block *sb = inode->i_sb;

                        /* Shut down filesystem: give up without the noise */
                        if (ext4_forced_shutdown(sb))
                                goto invalidate_dirty_pages;
                        /*
                         * Let the upper layers retry transient errors.
                         * In the case of ENOSPC, if ext4_count_free_clusters()
                         * is non-zero, a commit should free up blocks.
                         */
                        if ((err == -ENOMEM) ||
                            (err == -ENOSPC && ext4_count_free_clusters(sb))) {
                                /*
                                 * Something was already mapped and submitted
                                 * in this call: still update the disk size.
                                 */
                                if (progress)
                                        goto update_disksize;
                                return err;
                        }
                        ext4_msg(sb, KERN_CRIT,
                                 "Delayed block allocation failed for "
                                 "inode %lu at logical offset %llu with"
                                 " max blocks %u with error %d",
                                 inode->i_ino,
                                 (unsigned long long)map->m_lblk,
                                 (unsigned)map->m_len, -err);
                        ext4_msg(sb, KERN_CRIT,
                                 "This should not happen!! Data will "
                                 "be lost\n");
                        if (err == -ENOSPC)
                                ext4_print_free_blocks(inode);
                invalidate_dirty_pages:
                        /* Fatal: tell the caller to discard the dirty pages */
                        *give_up_on_write = true;
                        return err;
                }
                progress = 1;
                /*
                 * Update buffer state, submit mapped pages, and get us new
                 * extent to map
                 */
                err = mpage_map_and_submit_buffers(mpd);
                if (err < 0)
                        goto update_disksize;
        } while (map->m_len);

update_disksize:
        /*
         * Update on-disk size after IO is submitted.  Races with
         * truncate are avoided by checking i_size under i_data_sem.
         */
        disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
        /* Unlocked check first; the value is rechecked under i_data_sem */
        if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
                int err2;
                loff_t i_size;

                down_write(&EXT4_I(inode)->i_data_sem);
                i_size = i_size_read(inode);
                /* Never let the on-disk size exceed i_size */
                if (disksize > i_size)
                        disksize = i_size;
                if (disksize > EXT4_I(inode)->i_disksize)
                        EXT4_I(inode)->i_disksize = disksize;
                up_write(&EXT4_I(inode)->i_data_sem);
                err2 = ext4_mark_inode_dirty(handle, inode);
                if (err2) {
                        ext4_error_err(inode->i_sb, -err2,
                                       "Failed to mark inode %lu dirty",
                                       inode->i_ino);
                }
                /* An earlier error takes precedence over err2 */
                if (!err)
                        err = err2;
        }
        return err;
}

/*
 * Compute how many journal credits one writepages iteration has to reserve.
 * Used by the writepages loop: an extent of at most
 * MAX_WRITEPAGES_EXTENT_LEN blocks gets mapped and afterwards mapping of the
 * last, partially covered page is finished. That gives at most
 * MAX_WRITEPAGES_EXTENT_LEN + bpp - 1 blocks spread over bpp extents, where
 * bpp is the value of ext4_journal_blocks_per_page().
 */
static int ext4_da_writepages_trans_blocks(struct inode *inode)
{
        int blocks_per_page = ext4_journal_blocks_per_page(inode);
        int max_blocks = MAX_WRITEPAGES_EXTENT_LEN + blocks_per_page - 1;

        return ext4_meta_trans_blocks(inode, max_blocks, blocks_per_page);
}

/*
 * Journal the first @len bytes worth of buffers attached to @folio.
 *
 * Three steps are always carried out, even if an earlier one failed: get
 * journal write access for the buffers, run write_end_fn on them, and add
 * the byte range to the inode's jbd2 write list. Finally the inode's
 * i_datasync_tid is set to the tid of the handle's transaction.
 *
 * Returns the result of the first step that failed, or 0.
 */
static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio,
                                     size_t len)
{
        struct inode *inode = folio->mapping->host;
        struct buffer_head *bufs = folio_buffers(folio);
        int first_err, rc;

        first_err = ext4_walk_page_buffers(handle, inode, bufs, 0, len,
                                           NULL, do_journal_get_write_access);

        rc = ext4_walk_page_buffers(handle, inode, bufs, 0, len,
                                    NULL, write_end_fn);
        if (!first_err)
                first_err = rc;

        rc = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len);
        if (!first_err)
                first_err = rc;

        EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;

        return first_err;
}

/*
 * Journal the buffers of a folio whose dirtying was delayed (marked checked).
 *
 * Clears the checked flag, charges one page against wbc->nr_to_write and
 * journals the folio's buffers. When the folio extends beyond i_size (and
 * fsverity is not in progress) only the part below i_size is journalled.
 *
 * Returns the result of ext4_journal_folio_buffers().
 */
static int mpage_journal_page_buffers(handle_t *handle,
                                      struct mpage_da_data *mpd,
                                      struct folio *folio)
{
        struct inode *inode = mpd->inode;
        loff_t size = i_size_read(inode);
        size_t len = folio_size(folio);

        folio_clear_checked(folio);
        mpd->wbc->nr_to_write -= 1;

        /* Folio straddles EOF: trim the length to the in-file part */
        if (folio_pos(folio) + len > size) {
                if (!ext4_verity_in_progress(inode))
                        len = size & (len - 1);
        }

        return ext4_journal_folio_buffers(handle, folio, len);
}

/*
 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
 *                                  needing mapping, submit mapped pages
 *
 * @mpd - where to look for pages
 *
 * Walk dirty pages in the mapping. If they are fully mapped, submit them for
 * IO immediately. If we cannot map blocks, we submit just already mapped
 * buffers in the page for IO and keep page dirty. When we can map blocks and
 * we find a page which isn't mapped we start accumulating extent of buffers
 * underlying these pages that needs mapping (formed by either delayed or
 * unwritten buffers). We also lock the pages containing these buffers. The
 * extent found is returned in @mpd structure (starting at mpd->map.m_lblk
 * with length mpd->map.m_len blocks).
 *
 * Note that this function can attach bios to one io_end structure which are
 * neither logically nor physically contiguous. Although it may seem as an
 * unnecessary complication, it is actually inevitable in blocksize < pagesize
 * case as we need to track IO to all buffers underlying a page in one io_end.
 *
 * Returns 0 with mpd->scanned_until_end set when the whole range was walked,
 * 0 when the walk stopped early, or a negative error code.
 */
static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
{
        struct address_space *mapping = mpd->inode->i_mapping;
        struct folio_batch fbatch;
        unsigned int nr_folios;
        pgoff_t index = mpd->first_page;
        pgoff_t end = mpd->last_page;
        xa_mark_t tag;
        int i, err = 0;
        int blkbits = mpd->inode->i_blkbits;
        ext4_lblk_t lblk;
        struct buffer_head *head;
        handle_t *handle = NULL;
        int bpp = ext4_journal_blocks_per_page(mpd->inode);

        /* Integrity / tagged writeback walks the TOWRITE tag instead of DIRTY */
        if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
                tag = PAGECACHE_TAG_TOWRITE;
        else
                tag = PAGECACHE_TAG_DIRTY;

        /* Start with an empty extent */
        mpd->map.m_len = 0;
        mpd->next_page = index;
        /*
         * In data=journal mode a handle is needed here, because folios with
         * pending dirtying are journalled below.
         */
        if (ext4_should_journal_data(mpd->inode)) {
                handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE,
                                            bpp);
                if (IS_ERR(handle))
                        return PTR_ERR(handle);
        }
        folio_batch_init(&fbatch);
        while (index <= end) {
                nr_folios = filemap_get_folios_tag(mapping, &index, end,
                                tag, &fbatch);
                if (nr_folios == 0)
                        break;

                for (i = 0; i < nr_folios; i++) {
                        struct folio *folio = fbatch.folios[i];

                        /*
                         * Accumulated enough dirty pages? This doesn't apply
                         * to WB_SYNC_ALL mode. For integrity sync we have to
                         * keep going because someone may be concurrently
                         * dirtying pages, and we might have synced a lot of
                         * newly appeared dirty pages, but have not synced all
                         * of the old dirty pages.
                         */
                        if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
                            mpd->wbc->nr_to_write <=
                            mpd->map.m_len >> (PAGE_SHIFT - blkbits))
                                goto out;

                        /* If we can't merge this page, we are done. */
                        if (mpd->map.m_len > 0 && mpd->next_page != folio->index)
                                goto out;

                        /*
                         * Make sure the handle has credits for this folio.
                         * NOTE(review): only negative results are treated as
                         * failure; if ext4_journal_ensure_credits() can return
                         * a positive value, err may still be positive when a
                         * later 'goto out' is taken after a 'continue' -
                         * confirm against the helper.
                         */
                        if (handle) {
                                err = ext4_journal_ensure_credits(handle, bpp,
                                                                  0);
                                if (err < 0)
                                        goto out;
                        }

                        folio_lock(folio);
                        /*
                         * If the page is no longer dirty, or its mapping no
                         * longer corresponds to inode we are writing (which
                         * means it has been truncated or invalidated), or the
                         * page is already under writeback and we are not doing
                         * a data integrity writeback, skip the page
                         */
                        if (!folio_test_dirty(folio) ||
                            (folio_test_writeback(folio) &&
                             (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
                            unlikely(folio->mapping != mapping)) {
                                folio_unlock(folio);
                                continue;
                        }

                        folio_wait_writeback(folio);
                        BUG_ON(folio_test_writeback(folio));

                        /*
                         * Should never happen but for buggy code in
                         * other subsystems that call
                         * set_page_dirty() without properly warning
                         * the file system first.  See [1] for more
                         * information.
                         *
                         * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
                         */
                        if (!folio_buffers(folio)) {
                                ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index);
                                folio_clear_dirty(folio);
                                folio_unlock(folio);
                                continue;
                        }

                        /* First folio of a new extent marks where it starts */
                        if (mpd->map.m_len == 0)
                                mpd->first_page = folio->index;
                        mpd->next_page = folio_next_index(folio);
                        /*
                         * Writeout when we cannot modify metadata is simple.
                         * Just submit the page. For data=journal mode we
                         * first handle writeout of the page for checkpoint and
                         * only after that handle delayed page dirtying. This
                         * makes sure current data is checkpointed to the final
                         * location before possibly journalling it again which
                         * is desirable when the page is frequently dirtied
                         * through a pin.
                         */
                        if (!mpd->can_map) {
                                err = mpage_submit_folio(mpd, folio);
                                if (err < 0)
                                        goto out;
                                /* Pending dirtying of journalled data? */
                                if (folio_test_checked(folio)) {
                                        err = mpage_journal_page_buffers(handle,
                                                mpd, folio);
                                        if (err < 0)
                                                goto out;
                                        /* Caller may want to run writeback again */
                                        mpd->journalled_more_data = 1;
                                }
                                mpage_folio_done(mpd, folio);
                        } else {
                                /* Add all dirty buffers to mpd */
                                lblk = ((ext4_lblk_t)folio->index) <<
                                        (PAGE_SHIFT - blkbits);
                                head = folio_buffers(folio);
                                err = mpage_process_page_bufs(mpd, head, head,
                                                lblk);
                                /*
                                 * Zero or negative stops the walk here; a
                                 * positive value means keep scanning.
                                 */
                                if (err <= 0)
                                        goto out;
                                err = 0;
                        }
                }
                folio_batch_release(&fbatch);
                cond_resched();
        }
        /* Ran out of tagged folios in [first_page, last_page] */
        mpd->scanned_until_end = 1;
        if (handle)
                ext4_journal_stop(handle);
        return 0;
out:
        folio_batch_release(&fbatch);
        if (handle)
                ext4_journal_stop(handle);
        return err;
}

/*
 * ext4_do_writepages - write back dirty page cache of mpd->inode
 *
 * @mpd - writeback state; the caller fills in inode, wbc and can_map
 *
 * A first pass (mpd->do_map == 0) submits folios that need no block mapping
 * without starting a transaction. Then, as long as the range has not been
 * scanned to its end and wbc->nr_to_write is positive, each iteration starts
 * a transaction, collects one extent to map, maps it and submits the folios
 * it covers, using a fresh io_end per extent. For range_cyclic writeback that
 * did not start at index 0 the scan is repeated once for the pages before the
 * starting index.
 *
 * Returns 0 on success, -EROFS if the filesystem has been shut down, -ENOMEM
 * if an io_end cannot be allocated, or the error that stopped writeback.
 */
static int ext4_do_writepages(struct mpage_da_data *mpd)
{
        struct writeback_control *wbc = mpd->wbc;
        pgoff_t        writeback_index = 0;
        /* Remember the budget so the tracepoint can report pages written */
        long nr_to_write = wbc->nr_to_write;
        int range_whole = 0;
        /* Cleared when cyclic writeback starts in the middle of the file */
        int cycled = 1;
        handle_t *handle = NULL;
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
        int needed_blocks, rsv_blocks = 0, ret = 0;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
        struct blk_plug plug;
        bool give_up_on_write = false;

        trace_ext4_writepages(inode, wbc);

        /*
         * No pages to write? This is mainly a kludge to avoid starting
         * a transaction for special inodes like journal inode on last iput()
         * because that could violate lock ordering on umount
         */
        if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                goto out_writepages;

        /*
         * If the filesystem has aborted, it is read-only, so return
         * right away instead of dumping stack traces later on that
         * will obscure the real source of the problem.  We test
         * fs shutdown state instead of sb->s_flag's SB_RDONLY because
         * the latter could be true if the filesystem is mounted
         * read-only, and in that case, ext4_writepages should
         * *never* be called, so if that ever happens, we would want
         * the stack trace.
         */
        if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) {
                ret = -EROFS;
                goto out_writepages;
        }

        /*
         * If we have inline data and arrive here, it means that
         * we will soon create the block for the 1st page, so
         * we'd better clear the inline data here.
         */
        if (ext4_has_inline_data(inode)) {
                /* Just inode will be modified... */
                handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out_writepages;
                }
                BUG_ON(ext4_test_inode_state(inode,
                                EXT4_STATE_MAY_INLINE_DATA));
                ext4_destroy_inline_data(handle, inode);
                ext4_journal_stop(handle);
        }

        /*
         * data=journal mode does not do delalloc so we just need to writeout /
         * journal already mapped buffers. On the other hand we need to commit
         * transaction to make data stable. We expect all the data to be
         * already in the journal (the only exception are DMA pinned pages
         * dirtied behind our back) so we commit transaction here and run the
         * writeback loop to checkpoint them. The checkpointing is not actually
         * necessary to make data persistent *but* quite a few places (extent
         * shifting operations, fsverity, ...) depend on being able to drop
         * pagecache pages after calling filemap_write_and_wait() and for that
         * checkpointing needs to happen.
         */
        if (ext4_should_journal_data(inode)) {
                mpd->can_map = 0;
                if (wbc->sync_mode == WB_SYNC_ALL)
                        ext4_fc_commit(sbi->s_journal,
                                       EXT4_I(inode)->i_datasync_tid);
        }
        mpd->journalled_more_data = 0;

        if (ext4_should_dioread_nolock(inode)) {
                /*
                 * We may need to convert up to one extent per block in
                 * the page and we may dirty the inode.
                 */
                rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
                                                PAGE_SIZE >> inode->i_blkbits);
        }

        if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
                range_whole = 1;

        if (wbc->range_cyclic) {
                /* Resume where the previous cyclic writeback stopped */
                writeback_index = mapping->writeback_index;
                if (writeback_index)
                        cycled = 0;
                mpd->first_page = writeback_index;
                mpd->last_page = -1;
        } else {
                mpd->first_page = wbc->range_start >> PAGE_SHIFT;
                mpd->last_page = wbc->range_end >> PAGE_SHIFT;
        }

        ext4_io_submit_init(&mpd->io_submit, wbc);
retry:
        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag_pages_for_writeback(mapping, mpd->first_page,
                                        mpd->last_page);
        blk_start_plug(&plug);

        /*
         * First writeback pages that don't need mapping - we can avoid
         * starting a transaction unnecessarily and also avoid being blocked
         * in the block layer on device congestion while having transaction
         * started.
         */
        mpd->do_map = 0;
        mpd->scanned_until_end = 0;
        mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
        if (!mpd->io_submit.io_end) {
                ret = -ENOMEM;
                goto unplug;
        }
        ret = mpage_prepare_extent_to_map(mpd);
        /* Unlock pages we didn't use */
        mpage_release_unused_pages(mpd, false);
        /* Submit prepared bio */
        ext4_io_submit(&mpd->io_submit);
        ext4_put_io_end_defer(mpd->io_submit.io_end);
        mpd->io_submit.io_end = NULL;
        if (ret < 0)
                goto unplug;

        while (!mpd->scanned_until_end && wbc->nr_to_write > 0) {
                /* For each extent of pages we use new io_end */
                mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
                if (!mpd->io_submit.io_end) {
                        ret = -ENOMEM;
                        break;
                }

                WARN_ON_ONCE(!mpd->can_map);
                /*
                 * We have two constraints: We find one extent to map and we
                 * must always write out whole page (makes a difference when
                 * blocksize < pagesize) so that we don't block on IO when we
                 * try to write out the rest of the page. Journalled mode is
                 * not supported by delalloc.
                 */
                BUG_ON(ext4_should_journal_data(inode));
                needed_blocks = ext4_da_writepages_trans_blocks(inode);

                /* start a new transaction */
                handle = ext4_journal_start_with_reserve(inode,
                                EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
                               "%ld pages, ino %lu; err %d", __func__,
                                wbc->nr_to_write, inode->i_ino, ret);
                        /* Release allocated io_end */
                        ext4_put_io_end(mpd->io_submit.io_end);
                        mpd->io_submit.io_end = NULL;
                        break;
                }
                mpd->do_map = 1;

                trace_ext4_da_write_pages(inode, mpd->first_page, wbc);
                ret = mpage_prepare_extent_to_map(mpd);
                /* Only map when an extent was actually accumulated */
                if (!ret && mpd->map.m_len)
                        ret = mpage_map_and_submit_extent(handle, mpd,
                                        &give_up_on_write);
                /*
                 * Caution: If the handle is synchronous,
                 * ext4_journal_stop() can wait for transaction commit
                 * to finish which may depend on writeback of pages to
                 * complete or on page lock to be released.  In that
                 * case, we have to wait until after we have
                 * submitted all the IO, released page locks we hold,
                 * and dropped io_end reference (for extent conversion
                 * to be able to complete) before stopping the handle.
                 */
                if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
                        ext4_journal_stop(handle);
                        handle = NULL;
                        mpd->do_map = 0;
                }
                /* Unlock pages we didn't use */
                mpage_release_unused_pages(mpd, give_up_on_write);
                /* Submit prepared bio */
                ext4_io_submit(&mpd->io_submit);

                /*
                 * Drop our io_end reference we got from init. We have
                 * to be careful and use deferred io_end finishing if
                 * we are still holding the transaction as we can
                 * release the last reference to io_end which may end
                 * up doing unwritten extent conversion.
                 */
                if (handle) {
                        ext4_put_io_end_defer(mpd->io_submit.io_end);
                        ext4_journal_stop(handle);
                } else
                        ext4_put_io_end(mpd->io_submit.io_end);
                mpd->io_submit.io_end = NULL;

                if (ret == -ENOSPC && sbi->s_journal) {
                        /*
                         * Commit the transaction which would
                         * free blocks released in the transaction
                         * and try again
                         */
                        jbd2_journal_force_commit_nested(sbi->s_journal);
                        ret = 0;
                        continue;
                }
                /* Fatal error - ENOMEM, EIO... */
                if (ret)
                        break;
        }
unplug:
        blk_finish_plug(&plug);
        /*
         * Cyclic writeback that started in the middle of the file and still
         * has budget left: wrap around once and scan the pages before the
         * original starting index.
         */
        if (!ret && !cycled && wbc->nr_to_write > 0) {
                cycled = 1;
                mpd->last_page = writeback_index - 1;
                mpd->first_page = 0;
                goto retry;
        }

        /* Update index */
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                /*
                 * Set the writeback_index so that range_cyclic
                 * mode will write it back later
                 */
                mapping->writeback_index = mpd->first_page;

out_writepages:
        trace_ext4_writepages_result(inode, wbc, ret,
                                     nr_to_write - wbc->nr_to_write);
        return ret;
}

/*
 * ->writepages() entry point for the buffered (non-DAX) ext4 mappings.
 *
 * Fails with -EIO once the filesystem has been forcibly shut down.
 * Otherwise runs ext4_do_writepages() with can_map set, bracketed by
 * ext4_writepages_down_read() / ext4_writepages_up_read().
 *
 * Returns 0 or the negative errno reported by ext4_do_writepages().
 */
static int ext4_writepages(struct address_space *mapping,
                           struct writeback_control *wbc)
{
        struct super_block *sb = mapping->host->i_sb;
        struct mpage_da_data mpd = {
                .inode = mapping->host,
                .wbc = wbc,
                .can_map = 1,
        };
        int alloc_ctx;
        int err;

        if (unlikely(ext4_forced_shutdown(sb)))
                return -EIO;

        alloc_ctx = ext4_writepages_down_read(sb);

        err = ext4_do_writepages(&mpd);
        if (err == 0 && mpd.journalled_more_data) {
                /*
                 * data=journal writeback may have met pages flagged for
                 * delayed dirtying (PageChecked) that were only just added
                 * to the running transaction.  Make one more pass so they
                 * get a chance to reach stable storage.
                 */
                err = ext4_do_writepages(&mpd);
        }

        ext4_writepages_up_read(sb, alloc_ctx);
        return err;
}

int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
                .nr_to_write = LONG_MAX,
                .range_start = jinode->i_dirty_start,
                .range_end = jinode->i_dirty_end,
        };
        struct mpage_da_data mpd = {
                .inode = jinode->i_vfs_inode,
                .wbc = &wbc,
                .can_map = 0,
        };
        return ext4_do_writepages(&mpd);
}

/*
 * ->writepages() for DAX mappings.
 *
 * Fails with -EIO after a forced shutdown.  Otherwise hands the mapping to
 * dax_writeback_mapping_range() using the superblock's s_daxdev, between
 * ext4_writepages_down_read() and ext4_writepages_up_read().  The trace
 * result event reports how far wbc->nr_to_write dropped during the call.
 */
static int ext4_dax_writepages(struct address_space *mapping,
                               struct writeback_control *wbc)
{
        struct inode *inode = mapping->host;
        long budget = wbc->nr_to_write;
        int alloc_ctx;
        int err;

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

        alloc_ctx = ext4_writepages_down_read(inode->i_sb);

        trace_ext4_writepages(inode, wbc);
        err = dax_writeback_mapping_range(mapping,
                                          EXT4_SB(inode->i_sb)->s_daxdev, wbc);
        trace_ext4_writepages_result(inode, wbc, err,
                                     budget - wbc->nr_to_write);

        ext4_writepages_up_read(inode->i_sb, alloc_ctx);
        return err;
}

/*
 * Decide whether a write should bypass delayed allocation.
 *
 * Delalloc needs accurate free-space accounting, but the per-CPU counters
 * read here can lag behind by up to percpu_counter_batch per CPU.  When the
 * filesystem gets close to that error margin we fall back to non-delalloc
 * mode.
 *
 * Side effect: once dirty clusters exceed half of the free clusters,
 * writeback of the superblock's inodes is kicked off.
 *
 * Returns 1 to use the non-delalloc path, 0 to keep using delalloc.
 */
static int ext4_nonda_switch(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        s64 nr_free, nr_dirty;

        nr_free = percpu_counter_read_positive(&sbi->s_freeclusters_counter);
        nr_dirty = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);

        /* Start pushing delalloc when 1/2 of free blocks are dirty. */
        if (nr_dirty && nr_free < 2 * nr_dirty)
                try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);

        /* Free count is below 150% of the dirty count ... */
        if (2 * nr_free < 3 * nr_dirty)
                return 1;

        /* ... or free count has dropped under dirty + watermark. */
        if (nr_free < (nr_dirty + EXT4_FREECLUSTERS_WATERMARK))
                return 1;

        return 0;
}

/*
 * ->write_begin() for delayed-allocation mappings.
 *
 * @pos/@len describe the byte range about to be written.  On success
 * *pagep is set to the locked page backing @pos and *fsdata records which
 * path was taken so that ext4_da_write_end() can mirror it:
 * FALL_BACK_TO_NONDELALLOC when the write was delegated to
 * ext4_write_begin(), 0 for the regular delalloc path (the inline-data
 * helper is also handed @fsdata and may store its own value).
 *
 * Returns 0 on success or a negative errno (-EIO after forced shutdown,
 * folio lookup failure, or block preparation failure).
 */
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                               loff_t pos, unsigned len,
                               struct page **pagep, void **fsdata)
{
        int ret, retries = 0;
        struct folio *folio;
        pgoff_t index;
        struct inode *inode = mapping->host;

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

        index = pos >> PAGE_SHIFT;

        /*
         * Low on free space, or verity is being built: skip delalloc and use
         * the ordinary write_begin path for this write.
         */
        if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
                *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
                return ext4_write_begin(file, mapping, pos,
                                        len, pagep, fsdata);
        }
        *fsdata = (void *)0;
        trace_ext4_da_write_begin(inode, pos, len);

        if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
                ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len,
                                                      pagep, fsdata);
                if (ret < 0)
                        return ret;
                /* A return of 1 means the inline-data helper handled it. */
                if (ret == 1)
                        return 0;
        }

retry:
        folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
                        mapping_gfp_mask(mapping));
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        /* Prepare buffers for the range using the delalloc get_block hook. */
#ifdef CONFIG_FS_ENCRYPTION
        ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep);
#else
        ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep);
#endif
        if (ret < 0) {
                folio_unlock(folio);
                folio_put(folio);
                /*
                 * block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
                 * i_size_read because we hold inode lock.
                 */
                if (pos + len > inode->i_size)
                        ext4_truncate_failed_write(inode);

                /* On ENOSPC, retry from the folio lookup if the helper allows. */
                if (ret == -ENOSPC &&
                    ext4_should_retry_alloc(inode->i_sb, &retries))
                        goto retry;
                return ret;
        }

        *pagep = &folio->page;
        return ret;
}

/*
 * Decide whether i_disksize should be updated for a write that extends the
 * file without needing block allocation.
 *
 * @offset is a byte offset inside @folio; the buffer head covering it is
 * examined.  Returns 1 when that buffer is mapped and is neither delayed nor
 * unwritten, 0 otherwise.
 */
static int ext4_da_should_update_i_disksize(struct folio *folio,
                                            unsigned long offset)
{
        struct inode *inode = folio->mapping->host;
        struct buffer_head *bh = folio_buffers(folio);
        unsigned int skip = offset >> inode->i_blkbits;

        /* Step along the folio's buffer ring to the block holding @offset. */
        while (skip-- > 0)
                bh = bh->b_this_page;

        if (!buffer_mapped(bh))
                return 0;
        if (buffer_delay(bh) || buffer_unwritten(bh))
                return 0;
        return 1;
}

/*
 * Finish a delalloc buffered write into @folio.
 *
 * Commits the copied bytes through block_write_end(), grows i_size (and, when
 * appropriate, i_disksize) while the folio is still locked, then unlocks and
 * releases the folio.
 *
 * Returns the number of bytes accepted by block_write_end(), or a negative
 * errno if a journal handle could not be started to dirty the inode after an
 * i_disksize update.
 */
static int ext4_da_do_write_end(struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct folio *folio)
{
        struct inode *inode = mapping->host;
        loff_t size_before = inode->i_size;
        bool need_journal = false;
        loff_t write_end;
        handle_t *handle;

        /*
         * block_write_end() marks the inode dirty with I_DIRTY_PAGES, which
         * is all that is required to trigger page writeback.
         */
        copied = block_write_end(NULL, mapping, pos, len, copied,
                        &folio->page, NULL);
        write_end = pos + copied;

        /*
         * i_size must be updated while the folio lock is still held,
         * otherwise folio writeout could come in and zero beyond i_size.
         *
         * With the inode lock held we know i_disksize <= i_size, and that
         * i_disksize < i_size implies delalloc writes are pending up to
         * i_size.  A write ending at or below i_size therefore needs no
         * i_disksize update: writeback will push it up eventually.  A write
         * ending beyond i_size inside an already allocated block (as checked
         * by ext4_da_should_update_i_disksize()) must update i_disksize here,
         * because some ext4_writepages() paths do not allocate blocks and so
         * do not update i_disksize.
         */
        if (write_end > inode->i_size) {
                unsigned long last_byte;

                i_size_write(inode, write_end);
                last_byte = (write_end - 1) & (PAGE_SIZE - 1);
                if (copied &&
                    ext4_da_should_update_i_disksize(folio, last_byte)) {
                        ext4_update_i_disksize(inode, write_end);
                        need_journal = true;
                }
        }

        folio_unlock(folio);
        folio_put(folio);

        if (size_before < pos)
                pagecache_isize_extended(inode, size_before, pos);

        if (!need_journal)
                return copied;

        /* i_disksize changed: mark the inode dirty under a journal handle. */
        handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
        if (IS_ERR(handle))
                return PTR_ERR(handle);
        ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);

        return copied;
}

/*
 * ->write_end() for delayed-allocation mappings.
 *
 * @fsdata carries the mode chosen at write_begin time:
 *  - FALL_BACK_TO_NONDELALLOC: finish through ext4_write_end();
 *  - anything other than CONVERT_INLINE_DATA on an inode that may have and
 *    does have inline data: finish through ext4_write_inline_data_end();
 *  - otherwise: finish through ext4_da_do_write_end().
 *
 * A short copy into a folio that is not uptodate is treated as zero bytes
 * copied.
 */
static int ext4_da_write_end(struct file *file,
                             struct address_space *mapping,
                             loff_t pos, unsigned len, unsigned copied,
                             struct page *page, void *fsdata)
{
        struct folio *folio = page_folio(page);
        struct inode *inode = mapping->host;
        int write_mode = (int)(unsigned long)fsdata;

        if (write_mode == FALL_BACK_TO_NONDELALLOC)
                return ext4_write_end(file, mapping, pos,
                                      len, copied, &folio->page, fsdata);

        trace_ext4_da_write_end(inode, pos, len, copied);

        if (write_mode != CONVERT_INLINE_DATA) {
                if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
                    ext4_has_inline_data(inode))
                        return ext4_write_inline_data_end(inode, pos, len,
                                                          copied, folio);
        }

        if (unlikely(copied < len)) {
                if (!folio_test_uptodate(folio))
                        copied = 0;
        }

        return ext4_da_do_write_end(mapping, pos, len, copied, folio);
}

/*
 * Force all delayed allocation blocks to be allocated for a given inode.
 *
 * Nothing is done (and 0 is returned) when the inode has no reserved data
 * blocks.  Otherwise the result of filemap_flush() is returned.
 *
 * We do something simple for now.  filemap_flush() maps the blocks, but it
 * also starts writing the data, which is not strictly necessary (and, for
 * laptop_mode users, not even desirable); it does not wait for that I/O to
 * complete.
 *
 * Avoiding the I/O would mean replicating the ext4_writepages() mapping
 * path.  The comment this replaces described that path as
 * write_cache_pages() -> __mpage_da_writepage() ->
 * mpage_add_bh_to_extent() / mpage_da_map_blocks().  The obstacle is that
 * write_cache_pages() in mm/page-writeback.c marks pages clean in
 * preparation for I/O, which is wrong when no I/O is planned.  Calling it
 * and then redirtying every page with redirty_page_for_writepage() would be
 * ugly in the extreme.  A proper solution would only collect contiguous
 * logical block extents, call the multi-block allocator, and update the
 * buffer heads with the resulting allocations.
 */
int ext4_alloc_da_blocks(struct inode *inode)
{
        trace_ext4_alloc_da_blocks(inode);

        if (EXT4_I(inode)->i_reserved_data_blocks)
                return filemap_flush(inode->i_mapping);

        return 0;
}

/*
 * bmap() is special.  Applications such as lilo, and the swapper, use it to
 * find the on-disk block behind a specific piece of file data.
 *
 * That is dangerous if the block is still sitting in the journal.  Someone
 * who creates a swapfile on an ext4 data-journaling filesystem and enables
 * swap may find the swapped-out data overwritten by the original zeros that
 * were written to the journal earlier and are still awaiting writeback in
 * the buffer cache.
 *
 * So when a bmap call arrives for a file with dirty pages that uses delalloc
 * or data journaling, the file is flushed first.
 *
 * Returns the block number reported by iomap_bmap(), or 0 for inline-data
 * files (reachable through the FIBMAP ioctl).
 */
static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
{
        struct inode *inode = mapping->host;
        sector_t ret = 0;

        inode_lock_shared(inode);

        /* Inline files have no block to report; leave ret at 0. */
        if (!ext4_has_inline_data(inode)) {
                /*
                 * With delalloc or journalled data, sync the file so that
                 * blocks are allocated and the data is in place before the
                 * mapping is reported.
                 */
                if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
                    (test_opt(inode->i_sb, DELALLOC) ||
                     ext4_should_journal_data(inode)))
                        filemap_write_and_wait(mapping);

                ret = iomap_bmap(mapping, block, &ext4_iomap_ops);
        }

        inode_unlock_shared(inode);
        return ret;
}

/*
 * ->read_folio(): read one folio.
 *
 * Inline-data inodes are tried through ext4_readpage_inline() first; if that
 * path is not taken, or it answers -EAGAIN, the folio is read through
 * ext4_mpage_readpages().  @file is unused.
 */
static int ext4_read_folio(struct file *file, struct folio *folio)
{
        struct inode *inode = folio->mapping->host;
        int ret;

        trace_ext4_read_folio(inode, folio);

        if (ext4_has_inline_data(inode)) {
                ret = ext4_readpage_inline(inode, folio);
                if (ret != -EAGAIN)
                        return ret;
        }

        return ext4_mpage_readpages(inode, NULL, folio);
}

/*
 * ->readahead(): issue readahead through ext4_mpage_readpages().
 * Files with inline data are skipped, since there is nothing to read ahead.
 */
static void ext4_readahead(struct readahead_control *rac)
{
        struct inode *inode = rac->mapping->host;

        if (!ext4_has_inline_data(inode))
                ext4_mpage_readpages(inode, rac, NULL);
}

/*
 * ->invalidate_folio() used by ext4_aops and ext4_da_aops.
 *
 * Emits the trace event, warns if the folio's first buffer is a jbd buffer,
 * and delegates the invalidation of [@offset, @offset + @length) to
 * block_invalidate_folio().
 */
static void ext4_invalidate_folio(struct folio *folio, size_t offset,
                                size_t length)
{
        trace_ext4_invalidate_folio(folio, offset, length);

        /* No journalling happens on data buffers when this function is used */
        WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio)));

        block_invalidate_folio(folio, offset, length);
}

/*
 * Invalidate part or all of a folio belonging to a data-journalled inode.
 *
 * Returns the result of jbd2_journal_invalidate_folio().
 */
static int __ext4_journalled_invalidate_folio(struct folio *folio,
                                            size_t offset, size_t length)
{
        journal_t *jnl = EXT4_JOURNAL(folio->mapping->host);

        trace_ext4_journalled_invalidate_folio(folio, offset, length);

        /* Whole-folio truncate: just forget about the pending dirtying. */
        if (!offset && length == folio_size(folio))
                folio_clear_checked(folio);

        return jbd2_journal_invalidate_folio(jnl, folio, offset, length);
}

/*
 * Wrapper for aops: ->invalidate_folio() returns void, so a failure of
 * __ext4_journalled_invalidate_folio() can only be reported with a warning.
 */
static void ext4_journalled_invalidate_folio(struct folio *folio,
                                           size_t offset,
                                           size_t length)
{
        int err = __ext4_journalled_invalidate_folio(folio, offset, length);

        WARN_ON(err < 0);
}

/*
 * ->release_folio(): try to drop the buffers attached to @folio.
 *
 * Returns false when the folio is marked checked (dirty journalled data).
 * Otherwise returns the result of the jbd2 helper when the inode has a
 * journal, or of try_to_free_buffers() when it does not.  @wait is unused.
 */
static bool ext4_release_folio(struct folio *folio, gfp_t wait)
{
        struct inode *inode = folio->mapping->host;
        journal_t *jnl = EXT4_JOURNAL(inode);

        trace_ext4_release_folio(inode, folio);

        /* Page has dirty journalled data -> cannot release */
        if (folio_test_checked(folio))
                return false;

        if (!jnl)
                return try_to_free_buffers(folio);

        return jbd2_journal_try_to_free_buffers(jnl, folio);
}

/*
 * Report whether @inode still has changes that a datasync would need to
 * persist.
 *
 * Without a journal: true if the mapping's i_private_list is non-empty or
 * I_DIRTY_DATASYNC is set in i_state.
 * With a journal: false once the transaction named by i_datasync_tid has
 * committed; otherwise true, except that with JOURNAL_FAST_COMMIT enabled the
 * answer is whether the inode is on a fast-commit list.
 */
static bool ext4_inode_datasync_dirty(struct inode *inode)
{
        journal_t *jnl = EXT4_SB(inode->i_sb)->s_journal;

        if (!jnl) {
                /* Any metadata buffers to write? */
                if (!list_empty(&inode->i_mapping->i_private_list))
                        return true;
                return inode->i_state & I_DIRTY_DATASYNC;
        }

        if (jbd2_transaction_committed(jnl, EXT4_I(inode)->i_datasync_tid))
                return false;

        if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
                return !list_empty(&EXT4_I(inode)->i_fc_list);

        return true;
}

/*
 * Translate an ext4_map_blocks result (@map) into a struct iomap.
 *
 * @offset/@length are the byte range originally requested; they are used
 * only to decide whether the range extends past i_size.  @flags are the
 * IOMAP_* flags of the operation, of which only IOMAP_DAX is examined here.
 */
static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
                           struct ext4_map_blocks *map, loff_t offset,
                           loff_t length, unsigned int flags)
{
        u8 blkbits = inode->i_blkbits;

        iomap->flags = 0;

        /*
         * Writes that span EOF might trigger an I/O size update on
         * completion, so treat them as dirty for O_DSYNC purposes even when
         * no other metadata change is being made or is pending.
         */
        if (ext4_inode_datasync_dirty(inode) ||
            offset + length > i_size_read(inode))
                iomap->flags |= IOMAP_F_DIRTY;

        if (map->m_flags & EXT4_MAP_NEW)
                iomap->flags |= IOMAP_F_NEW;

        if (flags & IOMAP_DAX)
                iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
        else
                iomap->bdev = inode->i_sb->s_bdev;

        iomap->offset = (u64) map->m_lblk << blkbits;
        iomap->length = (u64) map->m_len << blkbits;

        /* Mapped blocks of a non-extent inode are flagged as merged. */
        if ((map->m_flags & EXT4_MAP_MAPPED) &&
            !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                iomap->flags |= IOMAP_F_MERGED;

        /*
         * Direct I/O writes can leave m_flags with both EXT4_MAP_MAPPED and
         * EXT4_MAP_UNWRITTEN set.  For allocated unwritten extents to be
         * converted to written ones in the ->end_io() handler, iomap->type
         * must say "unwritten", so the UNWRITTEN bit takes precedence.
         */
        if (map->m_flags & (EXT4_MAP_UNWRITTEN | EXT4_MAP_MAPPED)) {
                iomap->type = (map->m_flags & EXT4_MAP_UNWRITTEN) ?
                                IOMAP_UNWRITTEN : IOMAP_MAPPED;
                iomap->addr = (u64) map->m_pblk << blkbits;
                if (flags & IOMAP_DAX)
                        iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
        } else {
                /* No physical block: either a delayed extent or a hole. */
                iomap->type = (map->m_flags & EXT4_MAP_DELAYED) ?
                                IOMAP_DELALLOC : IOMAP_HOLE;
                iomap->addr = IOMAP_NULL_ADDR;
        }
}

/*
 * Map, allocating if necessary, the blocks described by @map for an
 * IOMAP_WRITE operation (DAX or direct I/O).
 *
 * The request is capped at DIO_MAX_BLOCKS.  Each attempt runs under its own
 * journal handle; an -ENOSPC result is retried for as long as
 * ext4_should_retry_alloc() permits.
 *
 * Returns the ext4_map_blocks() result, -ENOTBLK when nothing was mapped and
 * no allocation flags were in effect (the caller then falls back to buffered
 * I/O), or a negative errno if the handle could not be started.
 */
static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
                            unsigned int flags)
{
        handle_t *handle;
        u8 blkbits = inode->i_blkbits;
        int ret, dio_credits, m_flags = 0, retries = 0;

        /*
         * Trim the mapping request to the maximum value that we can map at
         * once for direct I/O.
         */
        if (map->m_len > DIO_MAX_BLOCKS)
                map->m_len = DIO_MAX_BLOCKS;
        dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);

        for (;;) {
                /*
                 * Either blocks get allocated without producing an unwritten
                 * extent, in which case enough credits are reserved, or the
                 * blocks are already allocated and unwritten, and the extent
                 * conversion fits into the same credits.
                 */
                handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
                                            dio_credits);
                if (IS_ERR(handle))
                        return PTR_ERR(handle);

                /*
                 * DAX and direct I/O are the only two operations currently
                 * supported with IOMAP_WRITE.
                 */
                WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT)));

                /*
                 * The second test uses i_size rather than i_disksize because
                 * delalloc writeback can complete at any point during the
                 * I/O and push i_disksize out to i_size.  That could be
                 * beyond where direct I/O is happening and so expose
                 * allocated blocks to direct I/O reads.
                 */
                if (flags & IOMAP_DAX)
                        m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
                else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode))
                        m_flags = EXT4_GET_BLOCKS_CREATE;
                else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                        m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;

                ret = ext4_map_blocks(handle, inode, map, m_flags);

                /*
                 * Holes in indirect-tree based inodes cannot be filled here,
                 * as that could expose stale data after a crash.  Use the
                 * magic error code to fall back to buffered I/O.
                 */
                if (!m_flags && !ret)
                        ret = -ENOTBLK;

                ext4_journal_stop(handle);

                if (ret != -ENOSPC ||
                    !ext4_should_retry_alloc(inode->i_sb, &retries))
                        break;
        }

        return ret;
}


/*
 * ->iomap_begin() for ext4_iomap_ops.
 *
 * Maps the byte range [@offset, @offset + @length) and fills @iomap from the
 * result.  For IOMAP_WRITE, a range lying entirely inside i_size is first
 * looked up without a journal handle; only if that does not yield a mapped
 * extent is ext4_iomap_alloc() called.  @srcmap is unused.
 *
 * Returns 0 on success, -EINVAL if @offset lies beyond the last logical
 * block, -ERANGE (with a one-time warning) for inline-data inodes, or the
 * negative errno from the mapping call.
 */
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
                unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
        struct ext4_map_blocks map;
        u8 blkbits = inode->i_blkbits;
        int ret;

        if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
                return -EINVAL;

        if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
                return -ERANGE;

        /* First logical block, and block count up to the (clamped) last. */
        map.m_lblk = offset >> blkbits;
        map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
                          EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;

        if (!(flags & IOMAP_WRITE)) {
                ret = ext4_map_blocks(NULL, inode, &map, 0);
        } else {
                int already_mapped = 0;

                /*
                 * If the blocks are already allocated there is no need to
                 * start a journal transaction; the mapping can be returned
                 * directly.  This can help performance, especially for
                 * multi-threaded overwrites.
                 */
                if (offset + length <= i_size_read(inode)) {
                        ret = ext4_map_blocks(NULL, inode, &map, 0);
                        already_mapped = ret > 0 &&
                                         (map.m_flags & EXT4_MAP_MAPPED);
                }
                if (!already_mapped)
                        ret = ext4_iomap_alloc(inode, &map, flags);
        }

        if (ret < 0)
                return ret;

        /*
         * With inline encryption enabled, I/O to an encrypted file sometimes
         * has to be broken up to guarantee DUN contiguity.  Handle that by
         * limiting the length of the mapping returned.
         */
        map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);

        ext4_set_iomap(inode, iomap, &map, offset, length, flags);

        return 0;
}

/*
 * ->iomap_begin() for ext4_iomap_overwrite_ops.
 *
 * Overwrites need no block allocation, so IOMAP_WRITE is stripped and the
 * request is handled like a read, saving the cost of starting a transaction.
 * Warns once if the lookup succeeds but the extent is not IOMAP_MAPPED.
 */
static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
                loff_t length, unsigned flags, struct iomap *iomap,
                struct iomap *srcmap)
{
        int ret;

        ret = ext4_iomap_begin(inode, offset, length, flags & ~IOMAP_WRITE,
                               iomap, srcmap);
        WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED);

        return ret;
}

/*
 * ->iomap_end() shared by ext4_iomap_ops and ext4_iomap_overwrite_ops.
 *
 * If nothing was written (@written == 0) for an operation carrying
 * IOMAP_WRITE or IOMAP_DIRECT (either flag is sufficient), return the magic
 * -ENOTBLK so that the caller falls back to buffered I/O for the remainder.
 * Blocks already allocated for the direct I/O are reused by the buffered
 * path.  All other cases return 0.
 *
 * Only @written and @flags are examined.
 */
static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
                          ssize_t written, unsigned flags, struct iomap *iomap)
{
        if (written != 0)
                return 0;

        if (!(flags & (IOMAP_WRITE | IOMAP_DIRECT)))
                return 0;

        return -ENOTBLK;
}

/*
 * General-purpose iomap operations: ext4_iomap_begin() allocates blocks for
 * IOMAP_WRITE requests when needed.  Also used by ext4_bmap().
 */
const struct iomap_ops ext4_iomap_ops = {
        .iomap_begin                = ext4_iomap_begin,
        .iomap_end                = ext4_iomap_end,
};

/*
 * iomap operations for pure overwrites: the begin hook drops IOMAP_WRITE so
 * that no allocation (and no transaction) is attempted.
 */
const struct iomap_ops ext4_iomap_overwrite_ops = {
        .iomap_begin                = ext4_iomap_overwrite_begin,
        .iomap_end                = ext4_iomap_end,
};

/*
 * ->iomap_begin() for ext4_iomap_report_ops: describe the existing mapping
 * of [@offset, @offset + @length) without allocating anything.
 *
 * Inline-data inodes are answered by ext4_inline_data_iomap() unless it
 * returns -EAGAIN; an @offset at or past the inline extent gives -ENOENT.
 * @srcmap is unused.
 *
 * Returns 0 on success, -EINVAL if @offset lies beyond the last logical
 * block, or a negative errno from the helpers.
 */
static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
                                   loff_t length, unsigned int flags,
                                   struct iomap *iomap, struct iomap *srcmap)
{
        struct ext4_map_blocks map;
        u8 blkbits = inode->i_blkbits;
        int ret;

        if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
                return -EINVAL;

        if (ext4_has_inline_data(inode)) {
                ret = ext4_inline_data_iomap(inode, iomap);
                if (ret != -EAGAIN) {
                        if (!ret && offset >= iomap->length)
                                return -ENOENT;
                        return ret;
                }
        }

        /* First logical block, and block count up to the (clamped) last. */
        map.m_lblk = offset >> blkbits;
        map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
                          EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;

        /*
         * Fiemap callers may ask about an offset beyond s_bitmap_maxbytes.
         * For non-extent inodes handle that here rather than querying
         * ext4_map_blocks(), which would warn about it and return -EIO.
         * With m_flags cleared, the range is reported as a hole.
         */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
            offset >= EXT4_SB(inode->i_sb)->s_bitmap_maxbytes) {
                map.m_flags = 0;
        } else {
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret < 0)
                        return ret;
        }

        ext4_set_iomap(inode, iomap, &map, offset, length, flags);

        return 0;
}

/*
 * Read-only reporting iomap operations (no ->iomap_end).  Used by
 * ext4_iomap_swap_activate().
 */
const struct iomap_ops ext4_iomap_report_ops = {
        .iomap_begin = ext4_iomap_begin_report,
};

/*
 * For data=journal mode, folio should be marked dirty only when it was
 * writeably mapped. When that happens, it was already attached to the
 * transaction and marked as jbddirty (we take care of this in
 * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings
 * so we should have nothing to do here, except for the case when someone
 * had the page pinned and dirtied the page through this pin (e.g. by doing
 * direct IO to it). In that case we'd need to attach buffers here to the
 * transaction but we cannot due to lock ordering.  We cannot just dirty the
 * folio and leave attached buffers clean, because the buffers' dirty state is
 * "definitive".  We cannot just set the buffers dirty or jbddirty because all
 * the journalling code will explode.  So what we do is to mark the folio
 * "pending dirty" and next time ext4_writepages() is called, attach buffers
 * to the transaction appropriately.
 *
 * Returns the result of filemap_dirty_folio().
 */
static bool ext4_journalled_dirty_folio(struct address_space *mapping,
                struct folio *folio)
{
        /* A folio dirtied through this path is expected to have buffers. */
        WARN_ON_ONCE(!folio_buffers(folio));
        /* Possibly pinned: record the "pending dirty" state described above. */
        if (folio_maybe_dma_pinned(folio))
                folio_set_checked(folio);
        return filemap_dirty_folio(mapping, folio);
}

/*
 * ->dirty_folio() used by ext4_aops and ext4_da_aops.
 *
 * Warns once if the folio is neither locked nor already dirty, or if it has
 * no buffers attached, then delegates to block_dirty_folio() and returns its
 * result.
 */
static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio)
{
        WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio));
        WARN_ON_ONCE(!folio_buffers(folio));
        return block_dirty_folio(mapping, folio);
}

/*
 * ->swap_activate(): delegate to iomap_swapfile_activate() using the
 * read-only ext4_iomap_report_ops, and return its result.
 */
static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
                                    struct file *file, sector_t *span)
{
        return iomap_swapfile_activate(sis, file, span,
                                       &ext4_iomap_report_ops);
}

/*
 * Address-space operations installed by ext4_set_aops() for inodes in
 * ordered or writeback data mode that are neither DAX nor mounted with
 * delayed allocation.
 */
static const struct address_space_operations ext4_aops = {
        .read_folio                = ext4_read_folio,
        .readahead                = ext4_readahead,
        .writepages                = ext4_writepages,
        .write_begin                = ext4_write_begin,
        .write_end                = ext4_write_end,
        .dirty_folio                = ext4_dirty_folio,
        .bmap                        = ext4_bmap,
        .invalidate_folio        = ext4_invalidate_folio,
        .release_folio                = ext4_release_folio,
        .migrate_folio                = buffer_migrate_folio,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_folio        = generic_error_remove_folio,
        .swap_activate                = ext4_iomap_swap_activate,
};

/*
 * Address-space operations installed by ext4_set_aops() for inodes in
 * journalled data mode.  Differs from ext4_aops in write_end, dirty_folio,
 * invalidate_folio and migrate_folio.
 */
static const struct address_space_operations ext4_journalled_aops = {
        .read_folio                = ext4_read_folio,
        .readahead                = ext4_readahead,
        .writepages                = ext4_writepages,
        .write_begin                = ext4_write_begin,
        .write_end                = ext4_journalled_write_end,
        .dirty_folio                = ext4_journalled_dirty_folio,
        .bmap                        = ext4_bmap,
        .invalidate_folio        = ext4_journalled_invalidate_folio,
        .release_folio                = ext4_release_folio,
        .migrate_folio                = buffer_migrate_folio_norefs,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_folio        = generic_error_remove_folio,
        .swap_activate                = ext4_iomap_swap_activate,
};

/*
 * Address-space operations installed by ext4_set_aops() for non-DAX inodes
 * in ordered or writeback data mode when the DELALLOC mount option is set.
 * Differs from ext4_aops only in write_begin and write_end.
 */
static const struct address_space_operations ext4_da_aops = {
        .read_folio                = ext4_read_folio,
        .readahead                = ext4_readahead,
        .writepages                = ext4_writepages,
        .write_begin                = ext4_da_write_begin,
        .write_end                = ext4_da_write_end,
        .dirty_folio                = ext4_dirty_folio,
        .bmap                        = ext4_bmap,
        .invalidate_folio        = ext4_invalidate_folio,
        .release_folio                = ext4_release_folio,
        .migrate_folio                = buffer_migrate_folio,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_folio        = generic_error_remove_folio,
        .swap_activate                = ext4_iomap_swap_activate,
};

/*
 * Address-space operations installed by ext4_set_aops() for DAX inodes in
 * ordered or writeback data mode.  Only writepages, dirty_folio, bmap and
 * swap_activate are provided.
 */
static const struct address_space_operations ext4_dax_aops = {
        .writepages                = ext4_dax_writepages,
        .dirty_folio                = noop_dirty_folio,
        .bmap                        = ext4_bmap,
        .swap_activate                = ext4_iomap_swap_activate,
};

/*
 * Select the address_space_operations table for @inode.
 *
 * Journalled data mode always gets ext4_journalled_aops.  For ordered and
 * writeback modes the choice is, in order: ext4_dax_aops for DAX inodes,
 * ext4_da_aops when the DELALLOC mount option is set, ext4_aops otherwise.
 * Any other journal mode is a BUG().
 */
void ext4_set_aops(struct inode *inode)
{
        switch (ext4_inode_journal_mode(inode)) {
        case EXT4_INODE_JOURNAL_DATA_MODE:
                inode->i_mapping->a_ops = &ext4_journalled_aops;
                return;
        case EXT4_INODE_ORDERED_DATA_MODE:
        case EXT4_INODE_WRITEBACK_DATA_MODE:
                break;
        default:
                BUG();
        }

        if (IS_DAX(inode)) {
                inode->i_mapping->a_ops = &ext4_dax_aops;
                return;
        }

        if (test_opt(inode->i_sb, DELALLOC))
                inode->i_mapping->a_ops = &ext4_da_aops;
        else
                inode->i_mapping->a_ops = &ext4_aops;
}

/*
 * Zero 'length' bytes of pagecache starting at file offset 'from' and dirty
 * the buffer that backs them.
 *
 * Here we can't skip an unwritten buffer even though it usually reads zero
 * because it might have data in pagecache (eg, if called from ext4_zero_range,
 * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a
 * racing writeback can come later and flush the stale pagecache to disk.
 *
 * Returns 0 on success, including the cases where the buffer is marked freed
 * or turns out to be a hole and nothing is zeroed; otherwise a negative error
 * from the folio lookup, the block read, decryption or the journal.
 */
static int __ext4_block_zero_page_range(handle_t *handle,
                struct address_space *mapping, loff_t from, loff_t length)
{
        ext4_fsblk_t index = from >> PAGE_SHIFT;
        unsigned offset = from & (PAGE_SIZE-1);
        unsigned blocksize, pos;
        ext4_lblk_t iblock;
        struct inode *inode = mapping->host;
        struct buffer_head *bh;
        struct folio *folio;
        int err = 0;

        /*
         * Look up (creating if necessary) and lock the folio covering 'from'.
         * __GFP_FS is masked out of the mapping's allocation flags.
         */
        folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT,
                                    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                    mapping_gfp_constraint(mapping, ~__GFP_FS));
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        blocksize = inode->i_sb->s_blocksize;

        /* Logical block number of the first block in this page */
        iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);

        bh = folio_buffers(folio);
        if (!bh)
                bh = create_empty_buffers(folio, blocksize, 0);

        /* Find the buffer that contains "offset" */
        pos = blocksize;
        while (offset >= pos) {
                bh = bh->b_this_page;
                iblock++;
                pos += blocksize;
        }
        if (buffer_freed(bh)) {
                BUFFER_TRACE(bh, "freed: skip");
                goto unlock;
        }
        if (!buffer_mapped(bh)) {
                BUFFER_TRACE(bh, "unmapped");
                /*
                 * Lookup only (create == 0).  The return value is not
                 * checked; the buffer_mapped() test below decides.
                 */
                ext4_get_block(inode, iblock, bh, 0);
                /* unmapped? It's a hole - nothing to do */
                if (!buffer_mapped(bh)) {
                        BUFFER_TRACE(bh, "still unmapped");
                        goto unlock;
                }
        }

        /* Ok, it's mapped. Make sure it's up-to-date */
        if (folio_test_uptodate(folio))
                set_buffer_uptodate(bh);

        if (!buffer_uptodate(bh)) {
                /* Read the block so the bytes outside the range are valid */
                err = ext4_read_bh_lock(bh, 0, true);
                if (err)
                        goto unlock;
                if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
                        /* We expect the key to be set. */
                        BUG_ON(!fscrypt_has_encryption_key(inode));
                        err = fscrypt_decrypt_pagecache_blocks(folio,
                                                               blocksize,
                                                               bh_offset(bh));
                        if (err) {
                                /* Block content is unusable; don't trust it */
                                clear_buffer_uptodate(bh);
                                goto unlock;
                        }
                }
        }
        if (ext4_should_journal_data(inode)) {
                BUFFER_TRACE(bh, "get write access");
                err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
                                                    EXT4_JTR_NONE);
                if (err)
                        goto unlock;
        }
        folio_zero_range(folio, offset, length);
        BUFFER_TRACE(bh, "zeroed end of block");

        /*
         * Data-journalled inodes dirty the buffer through the journal;
         * otherwise the buffer is dirtied directly and, in ordered mode, the
         * range is also registered with jbd2 for this handle.
         */
        if (ext4_should_journal_data(inode)) {
                err = ext4_dirty_journalled_data(handle, bh);
        } else {
                err = 0;
                mark_buffer_dirty(bh);
                if (ext4_should_order_data(inode))
                        err = ext4_jbd2_inode_add_write(handle, inode, from,
                                        length);
        }

unlock:
        /* Drop the lock and the reference taken by __filemap_get_folio() */
        folio_unlock(folio);
        folio_put(folio);
        return err;
}

/*
 * ext4_block_zero_page_range() zeroes 'length' bytes of a mapping starting
 * at file offset 'from'.  The zeroed range must lie within a single block:
 * a negative 'length', or one that would run past the end of the block
 * containing 'from', is clamped so the range stops at the end of that block.
 *
 * DAX inodes are zeroed through dax_zero_range(); every other inode goes
 * through __ext4_block_zero_page_range().
 */
static int ext4_block_zero_page_range(handle_t *handle,
                struct address_space *mapping, loff_t from, loff_t length)
{
        struct inode *inode = mapping->host;
        unsigned blocksize = inode->i_sb->s_blocksize;
        unsigned in_page = from & (PAGE_SIZE-1);
        /* Bytes from 'from' up to the end of the block it falls in */
        unsigned to_block_end = blocksize - (in_page & (blocksize - 1));

        if (length < 0 || length > to_block_end)
                length = to_block_end;

        if (!IS_DAX(inode))
                return __ext4_block_zero_page_range(handle, mapping, from,
                                                    length);

        return dax_zero_range(inode, from, length, NULL, &ext4_iomap_ops);
}

/*
 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
 * up to the end of the block which contains `from'.
 * This is required during truncate: the tail end of that block has to be
 * physically zeroed so it doesn't yield old data if the file is later grown.
 *
 * Returns 0 without doing anything for an encrypted inode whose key is not
 * available; otherwise the result of ext4_block_zero_page_range().
 */
static int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from)
{
        struct inode *inode = mapping->host;
        unsigned blocksize, in_page, tail;

        /* If we are processing an encrypted inode during orphan list handling */
        if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
                return 0;

        in_page = from & (PAGE_SIZE-1);
        blocksize = inode->i_sb->s_blocksize;
        tail = blocksize - (in_page & (blocksize - 1));

        return ext4_block_zero_page_range(handle, mapping, from, tail);
}

/*
 * Zero the partial blocks at the edges of the byte range
 * [lstart, lstart + length - 1].
 *
 * If the range starts and ends in the same block and does not cover that
 * whole block, the range itself is zeroed.  Otherwise the part of the first
 * block from 'lstart' onwards is zeroed when 'lstart' is not block aligned,
 * and the part of the last block up to the end of the range is zeroed when
 * the range does not end on the last byte of a block.
 *
 * Returns 0 on success or the first error from ext4_block_zero_page_range().
 */
int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                             loff_t lstart, loff_t length)
{
        struct super_block *sb = inode->i_sb;
        struct address_space *mapping = inode->i_mapping;
        loff_t byte_end = (lstart + length - 1);
        unsigned partial_start = lstart & (sb->s_blocksize - 1);
        unsigned partial_end = byte_end & (sb->s_blocksize - 1);
        ext4_fsblk_t start = lstart >> sb->s_blocksize_bits;
        ext4_fsblk_t end = byte_end >> sb->s_blocksize_bits;
        /* True when the range stops short of the last byte of its block */
        bool tail_partial = partial_end != sb->s_blocksize - 1;
        int err;

        /* Handle partial zero within the single block */
        if (start == end && (partial_start || tail_partial))
                return ext4_block_zero_page_range(handle, mapping,
                                                  lstart, length);

        /* Handle partial zero out on the start of the range */
        if (partial_start) {
                err = ext4_block_zero_page_range(handle, mapping,
                                                 lstart, sb->s_blocksize);
                if (err)
                        return err;
        }

        /* Handle partial zero out on the end of the range */
        if (!tail_partial)
                return 0;

        return ext4_block_zero_page_range(handle, mapping,
                                          byte_end - partial_end,
                                          partial_end + 1);
}

/*
 * Report whether @inode is of a type that can be truncated: regular files
 * and directories always, symlinks only when they are not fast symlinks.
 * Returns 1 if so, 0 otherwise.
 */
int ext4_can_truncate(struct inode *inode)
{
        if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
                return 1;

        if (!S_ISLNK(inode->i_mode))
                return 0;

        return !ext4_inode_is_fast_symlink(inode);
}

/*
 * i_disksize must be brought up to date before page cache is truncated for
 * hole punching or zero range.  The update may have been deferred to
 * writeback submission, and that will never happen once the page cache is
 * gone, so the update would otherwise be lost.
 *
 * Nothing is done unless the range [offset, offset + len] contains i_size
 * and i_disksize is still below i_size.  Returns 0 in that case and on
 * success, or a negative error from starting the handle or dirtying the
 * inode.
 */
int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
                                      loff_t len)
{
        loff_t isize = i_size_read(inode);
        handle_t *handle;
        int err;

        WARN_ON(!inode_is_locked(inode));

        if (offset > isize || offset + len < isize ||
            EXT4_I(inode)->i_disksize >= isize)
                return 0;

        handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        ext4_update_i_disksize(inode, isize);
        err = ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);

        return err;
}

/*
 * Wait step handed to ___wait_var_event() by ext4_break_layouts(): release
 * the mapping's invalidate lock, call schedule(), and take the lock again
 * before returning to the caller.
 */
static void ext4_wait_dax_page(struct inode *inode)
{
        filemap_invalidate_unlock(inode->i_mapping);
        schedule();
        filemap_invalidate_lock(inode->i_mapping);
}

int ext4_break_layouts(struct inode *inode)
{
        struct page *page;
        int error;

        if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
                return -EINVAL;

        do {
                page = dax_layout_busy_page(inode->i_mapping);
                if (!page)
                        return 0;

                error = ___wait_var_event(&page->_refcount,
                                atomic_read(&page->_refcount) == 1,
                                TASK_INTERRUPTIBLE, 0, 0,
                                ext4_wait_dax_page(inode));
        } while (error == 0);

        return error;
}

/*
 * ext4_punch_hole: punches a hole in a file by releasing the blocks
 * associated with the given offset and length
 *
 * @file:   File whose inode the hole is punched in
 * @offset: The offset where the hole will begin
 * @length: The length of the hole
 *
 * Partial blocks at either end of the range are zeroed rather than released.
 * The range is clipped so that it does not extend past the page containing
 * i_size, nor past s_bitmap_maxbytes minus one block.
 *
 * Returns: 0 on success (including when @offset is at or beyond i_size and
 * nothing is done) or negative on failure
 */

int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;
        ext4_lblk_t first_block, stop_block;
        struct address_space *mapping = inode->i_mapping;
        loff_t first_block_offset, last_block_offset, max_length;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        handle_t *handle;
        unsigned int credits;
        int ret = 0, ret2 = 0;

        trace_ext4_punch_hole(inode, offset, length, 0);

        /*
         * Write out all dirty pages to avoid race conditions
         * Then release them.
         */
        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
                ret = filemap_write_and_wait_range(mapping, offset,
                                                   offset + length - 1);
                if (ret)
                        return ret;
        }

        inode_lock(inode);

        /* No need to punch hole beyond i_size */
        if (offset >= inode->i_size)
                goto out_mutex;

        /*
         * If the hole extends beyond i_size, set the hole
         * to end after the page that contains i_size
         */
        if (offset + length > inode->i_size) {
                length = inode->i_size +
                   PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
                   offset;
        }

        /*
         * For punch hole the length + offset needs to be within one block
         * before last range. Adjust the length if it goes beyond that limit.
         */
        max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
        if (offset + length > max_length)
                length = max_length - offset;

        if (offset & (sb->s_blocksize - 1) ||
            (offset + length) & (sb->s_blocksize - 1)) {
                /*
                 * Attach jinode to inode for jbd2 if we do any zeroing of
                 * partial block
                 */
                ret = ext4_inode_attach_jinode(inode);
                if (ret < 0)
                        goto out_mutex;

        }

        /* Wait all existing dio workers, newcomers will block on i_rwsem */
        inode_dio_wait(inode);

        ret = file_modified(file);
        if (ret)
                goto out_mutex;

        /*
         * Prevent page faults from reinstantiating pages we have released from
         * page cache.
         */
        filemap_invalidate_lock(mapping);

        ret = ext4_break_layouts(inode);
        if (ret)
                goto out_dio;

        /* Byte range of the blocks that lie entirely inside the hole */
        first_block_offset = round_up(offset, sb->s_blocksize);
        last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;

        /* Now release the pages and zero block aligned part of pages*/
        if (last_block_offset > first_block_offset) {
                ret = ext4_update_disksize_before_punch(inode, offset, length);
                if (ret)
                        goto out_dio;
                truncate_pagecache_range(inode, first_block_offset,
                                         last_block_offset);
        }

        /* Transaction size depends on whether the inode is extent mapped */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                credits = ext4_writepage_trans_blocks(inode);
        else
                credits = ext4_blocks_for_truncate(inode);
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                ext4_std_error(sb, ret);
                goto out_dio;
        }

        ret = ext4_zero_partial_blocks(handle, inode, offset,
                                       length);
        if (ret)
                goto out_stop;

        /* Logical block range [first_block, stop_block) to release */
        first_block = (offset + sb->s_blocksize - 1) >>
                EXT4_BLOCK_SIZE_BITS(sb);
        stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);

        /* If there are blocks to remove, do it */
        if (stop_block > first_block) {
                ext4_lblk_t hole_len = stop_block - first_block;

                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_discard_preallocations(inode);

                ext4_es_remove_extent(inode, first_block, hole_len);

                if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                        ret = ext4_ext_remove_space(inode, first_block,
                                                    stop_block - 1);
                else
                        ret = ext4_ind_remove_space(handle, inode, first_block,
                                                    stop_block);
                if (ret) {
                        /*
                         * Some blocks in the range may still be mapped, so
                         * do not cache the range as a hole and do not report
                         * the punch as done.
                         */
                        up_write(&EXT4_I(inode)->i_data_sem);
                        goto out_stop;
                }

                ext4_es_insert_extent(inode, first_block, hole_len, ~0,
                                      EXTENT_STATUS_HOLE);
                up_write(&EXT4_I(inode)->i_data_sem);
        }
        ext4_fc_track_range(handle, inode, first_block, stop_block);
        if (IS_SYNC(inode))
                ext4_handle_sync(handle);

        inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
        ret2 = ext4_mark_inode_dirty(handle, inode);
        if (unlikely(ret2))
                ret = ret2;
        if (ret >= 0)
                ext4_update_inode_fsync_trans(handle, inode, 1);
out_stop:
        ext4_journal_stop(handle);
out_dio:
        filemap_invalidate_unlock(mapping);
out_mutex:
        inode_unlock(inode);
        return ret;
}

/*
 * Make sure @inode has a jbd2_inode attached.
 *
 * Nothing is done when one is already attached or the filesystem has no
 * journal.  The new jbd2_inode is allocated before taking i_lock; if another
 * jinode got attached in the meantime, the spare allocation is freed.
 *
 * Returns 0 on success, or -ENOMEM when the allocation failed and no jinode
 * is attached.
 */
int ext4_inode_attach_jinode(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct jbd2_inode *new_jinode;
        int ret = 0;

        if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
                return 0;

        new_jinode = jbd2_alloc_inode(GFP_KERNEL);

        spin_lock(&inode->i_lock);
        if (ei->jinode) {
                /* Already attached; any spare allocation is freed below */
        } else if (!new_jinode) {
                ret = -ENOMEM;
        } else {
                ei->jinode = new_jinode;
                jbd2_journal_init_jbd_inode(ei->jinode, inode);
                new_jinode = NULL;
        }
        spin_unlock(&inode->i_lock);

        if (unlikely(new_jinode != NULL))
                jbd2_free_inode(new_jinode);
        return ret;
}

/*
 * ext4_truncate()
 *
 * We block out ext4_get_block() block instantiations across the entire
 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
 * simultaneously on behalf of the same inode.
 *
 * As we work through the truncate and commit bits of it to the journal there
 * is one core, guiding principle: the file's tree must always be consistent on
 * disk.  We must be able to restart the truncate after a crash.
 *
 * The file's tree may be transiently inconsistent in memory (although it
 * probably isn't), but whenever we close off and commit a journal transaction,
 * the contents of (the filesystem + the journal) must be consistent and
 * restartable.  It's pretty simple, really: bottom up, right to left (although
 * left-to-right works OK too).
 *
 * Note that at recovery time, journal replay occurs *before* the restart of
 * truncate against the orphan inode list.
 *
 * The committed inode has the new, desired i_size (which is the same as
 * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
 * that this inode's truncate did not complete and it will again call
 * ext4_truncate() to have another go.  So there will be instantiated blocks
 * to the right of the truncation point in a crashed ext4 filesystem.  But
 * that's fine - as long as they are linked from the inode, the post-crash
 * ext4_truncate() run will find them and release them.
 */
int ext4_truncate(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        unsigned int credits;
        int err = 0, err2;
        handle_t *handle;
        struct address_space *mapping = inode->i_mapping;

        /*
         * There is a possibility that we're either freeing the inode
         * or it's a completely new inode. In those cases we might not
         * have i_rwsem locked because it's not necessary.
         */
        if (!(inode->i_state & (I_NEW|I_FREEING)))
                WARN_ON(!inode_is_locked(inode));
        trace_ext4_truncate_enter(inode);

        /* Inode types that cannot be truncated are left alone; returns 0 */
        if (!ext4_can_truncate(inode))
                goto out_trace;

        if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
                ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);

        if (ext4_has_inline_data(inode)) {
                int has_inline = 1;

                /*
                 * Done if the inline truncate failed or the inode still has
                 * inline data afterwards; otherwise fall through to the
                 * block-based truncate below.
                 */
                err = ext4_inline_data_truncate(inode, &has_inline);
                if (err || has_inline)
                        goto out_trace;
        }

        /* If we zero-out tail of the page, we have to create jinode for jbd2 */
        if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
                err = ext4_inode_attach_jinode(inode);
                if (err)
                        goto out_trace;
        }

        /* Transaction size depends on whether the inode is extent mapped */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                credits = ext4_writepage_trans_blocks(inode);
        else
                credits = ext4_blocks_for_truncate(inode);

        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
        if (IS_ERR(handle)) {
                err = PTR_ERR(handle);
                goto out_trace;
        }

        /*
         * Zero the tail of the block containing the new i_size.
         * NOTE(review): the return value is ignored here - presumably a
         * deliberate best effort; confirm.
         */
        if (inode->i_size & (inode->i_sb->s_blocksize - 1))
                ext4_block_truncate_page(handle, mapping, inode->i_size);

        /*
         * We add the inode to the orphan list, so that if this
         * truncate spans multiple transactions, and we crash, we will
         * resume the truncate when the filesystem recovers.  It also
         * marks the inode dirty, to catch the new size.
         *
         * Implication: the file must always be in a sane, consistent
         * truncatable state while each transaction commits.
         */
        err = ext4_orphan_add(handle, inode);
        if (err)
                goto out_stop;

        down_write(&EXT4_I(inode)->i_data_sem);

        ext4_discard_preallocations(inode);

        /* Only the extent-based truncate reports an error */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                err = ext4_ext_truncate(handle, inode);
        else
                ext4_ind_truncate(handle, inode);

        up_write(&ei->i_data_sem);
        if (err)
                goto out_stop;

        if (IS_SYNC(inode))
                ext4_handle_sync(handle);

out_stop:
        /*
         * If this was a simple ftruncate() and the file will remain alive,
         * then we need to clear up the orphan record which we created above.
         * However, if this was a real unlink then we were called by
         * ext4_evict_inode(), and we allow that function to clean up the
         * orphan info for us.
         */
        if (inode->i_nlink)
                ext4_orphan_del(handle, inode);

        /*
         * Timestamps are updated and the inode dirtied on the error paths
         * that reach out_stop as well; an earlier error takes precedence
         * over one from ext4_mark_inode_dirty().
         */
        inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
        err2 = ext4_mark_inode_dirty(handle, inode);
        if (unlikely(err2 && !err))
                err = err2;
        ext4_journal_stop(handle);

out_trace:
        trace_ext4_truncate_exit(inode);
        return err;
}

/*
 * Read @inode's i_version without modifying it.  Inodes flagged
 * EXT4_EA_INODE_FL use the raw accessor; all others use the regular one.
 */
static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
{
        if (likely(!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)))
                return inode_peek_iversion(inode);

        return inode_peek_iversion_raw(inode);
}

/*
 * Encode the in-memory i_blocks count into the i_blocks_lo/i_blocks_high
 * fields of @raw_inode.
 *
 * Counts that fit in 48 bits are stored as-is (in 512-byte units) with
 * EXT4_INODE_HUGE_FILE cleared.  Larger counts set EXT4_INODE_HUGE_FILE and
 * are stored in filesystem-block units instead.
 *
 * Returns 0 on success, or -EFSCORRUPTED when the count needs more than 32
 * bits but the huge_file feature is not enabled.
 */
static int ext4_inode_blocks_set(struct ext4_inode *raw_inode,
                                 struct ext4_inode_info *ei)
{
        struct inode *inode = &(ei->vfs_inode);
        struct super_block *sb = inode->i_sb;
        u64 nblocks = READ_ONCE(inode->i_blocks);

        /*
         * This should never happen since sb->s_maxbytes should not have
         * allowed this, sb->s_maxbytes was set according to the huge_file
         * feature in ext4_fill_super().
         */
        if (nblocks > ~0U && !ext4_has_feature_huge_file(sb))
                return -EFSCORRUPTED;

        if (nblocks > 0xffffffffffffULL) {
                ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
                /* i_block is stored in file system block size */
                nblocks >>= (inode->i_blkbits - 9);
        } else {
                /* Fits in 48 bits as a multiple of 512 bytes */
                ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
        }

        /* For counts that fit in 32 bits the high half is simply zero */
        raw_inode->i_blocks_lo   = cpu_to_le32(nblocks);
        raw_inode->i_blocks_high = cpu_to_le16(nblocks >> 32);
        return 0;
}

/*
 * Fill the on-disk inode @raw_inode from the in-memory state of @inode and
 * set its checksum.
 *
 * Returns the error from ext4_inode_blocks_set() if any, else -EFSCORRUPTED
 * when a non-default project ID is set but the project feature is not
 * enabled, else 0.  The remaining fields and the checksum are written even
 * when an error is returned.
 */
static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        uid_t i_uid;
        gid_t i_gid;
        projid_t i_projid;
        int block;
        int err;

        /* err is carried to the end; filling continues regardless */
        err = ext4_inode_blocks_set(raw_inode, ei);

        raw_inode->i_mode = cpu_to_le16(inode->i_mode);
        i_uid = i_uid_read(inode);
        i_gid = i_gid_read(inode);
        i_projid = from_kprojid(&init_user_ns, ei->i_projid);
        if (!(test_opt(inode->i_sb, NO_UID32))) {
                /* uid/gid are split across the low and high 16-bit fields */
                raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
                raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
                /*
                 * Fix up interoperability with old kernels. Otherwise,
                 * old inodes get re-used with the upper 16 bits of the
                 * uid/gid intact.
                 */
                if (ei->i_dtime && list_empty(&ei->i_orphan)) {
                        raw_inode->i_uid_high = 0;
                        raw_inode->i_gid_high = 0;
                } else {
                        raw_inode->i_uid_high =
                                cpu_to_le16(high_16_bits(i_uid));
                        raw_inode->i_gid_high =
                                cpu_to_le16(high_16_bits(i_gid));
                }
        } else {
                /* NO_UID32: only the low fields are used, high fields cleared */
                raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
                raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
                raw_inode->i_uid_high = 0;
                raw_inode->i_gid_high = 0;
        }
        raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);

        EXT4_INODE_SET_CTIME(inode, raw_inode);
        EXT4_INODE_SET_MTIME(inode, raw_inode);
        EXT4_INODE_SET_ATIME(inode, raw_inode);
        EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);

        raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
        raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
        /* The high ACL bits are not written when HURD_COMPAT is set */
        if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
                raw_inode->i_file_acl_high =
                        cpu_to_le16(ei->i_file_acl >> 32);
        raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
        ext4_isize_set(raw_inode, ei->i_disksize);

        raw_inode->i_generation = cpu_to_le32(inode->i_generation);
        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
                /*
                 * Device nodes: the device number goes in i_block[0] using
                 * the old encoding when it is valid for that encoding, else
                 * in i_block[1] using the new encoding.
                 */
                if (old_valid_dev(inode->i_rdev)) {
                        raw_inode->i_block[0] =
                                cpu_to_le32(old_encode_dev(inode->i_rdev));
                        raw_inode->i_block[1] = 0;
                } else {
                        raw_inode->i_block[0] = 0;
                        raw_inode->i_block[1] =
                                cpu_to_le32(new_encode_dev(inode->i_rdev));
                        raw_inode->i_block[2] = 0;
                }
        } else if (!ext4_has_inline_data(inode)) {
                /* Copy ei->i_data into i_block unchanged */
                for (block = 0; block < EXT4_N_BLOCKS; block++)
                        raw_inode->i_block[block] = ei->i_data[block];
        }

        if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
                u64 ivers = ext4_inode_peek_iversion(inode);

                /*
                 * Low 32 bits of the version are always stored; the high 32
                 * bits only when there is an extra inode area and
                 * i_version_hi fits in it.
                 */
                raw_inode->i_disk_version = cpu_to_le32(ivers);
                if (ei->i_extra_isize) {
                        if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
                                raw_inode->i_version_hi =
                                        cpu_to_le32(ivers >> 32);
                        raw_inode->i_extra_isize =
                                cpu_to_le16(ei->i_extra_isize);
                }
        }

        /* Keep an earlier error; otherwise flag the inconsistent project ID */
        if (i_projid != EXT4_DEF_PROJID &&
            !ext4_has_feature_project(inode->i_sb))
                err = err ?: -EFSCORRUPTED;

        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
            EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
                raw_inode->i_projid = cpu_to_le32(i_projid);

        ext4_inode_csum_set(inode, raw_inode, ei);
        return err;
}

/*
 * ext4_get_inode_loc returns with an extra refcount against the inode's
 * underlying buffer_head on success. If we pass 'inode' and it does not
 * have in-inode xattr, we have all inode data in memory that is needed
 * to recreate the on-disk version of this inode.
 *
 * On success iloc->block_group and iloc->offset locate inode number 'ino'
 * within its inode table block and iloc->bh holds that block's buffer_head.
 * On failure iloc->bh is NULL.  When reading the inode table block fails and
 * 'ret_block' is non-NULL, the failing block number is stored in *ret_block.
 *
 * Returns 0 on success, -EFSCORRUPTED for an out-of-range inode number or
 * inode table block, -EIO if the group descriptor is unavailable or the
 * block cannot be read, and -ENOMEM if no buffer_head could be obtained.
 */
static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
                                struct inode *inode, struct ext4_iloc *iloc,
                                ext4_fsblk_t *ret_block)
{
        struct ext4_group_desc        *gdp;
        struct buffer_head        *bh;
        ext4_fsblk_t                block;
        struct blk_plug                plug;
        int                        inodes_per_block, inode_offset;

        iloc->bh = NULL;
        /* Reject inode numbers outside [EXT4_ROOT_INO, s_inodes_count] */
        if (ino < EXT4_ROOT_INO ||
            ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
                return -EFSCORRUPTED;

        iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
        gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
        if (!gdp)
                return -EIO;

        /*
         * Figure out the offset within the block group inode table
         */
        inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
        inode_offset = ((ino - 1) %
                        EXT4_INODES_PER_GROUP(sb));
        iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);

        /* Sanity check the start of the group's inode table */
        block = ext4_inode_table(sb, gdp);
        if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) ||
            (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) {
                ext4_error(sb, "Invalid inode table block %llu in "
                           "block_group %u", block, iloc->block_group);
                return -EFSCORRUPTED;
        }
        /* Advance to the inode table block that holds this inode */
        block += (inode_offset / inodes_per_block);

        bh = sb_getblk(sb, block);
        if (unlikely(!bh))
                return -ENOMEM;
        if (ext4_buffer_uptodate(bh))
                goto has_buffer;

        lock_buffer(bh);
        if (ext4_buffer_uptodate(bh)) {
                /* Someone brought it uptodate while we waited */
                unlock_buffer(bh);
                goto has_buffer;
        }

        /*
         * If we have all information of the inode in memory and this
         * is the only valid inode in the block, we need not read the
         * block.
         */
        if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
                struct buffer_head *bitmap_bh;
                int i, start;

                /*
                 * Group-relative index of the first inode in this block
                 * (the mask relies on inodes_per_block being a power of two)
                 */
                start = inode_offset & ~(inodes_per_block - 1);

                /* Is the inode bitmap in cache? */
                bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
                if (unlikely(!bitmap_bh))
                        goto make_io;

                /*
                 * If the inode bitmap isn't in cache then the
                 * optimisation may end up performing two reads instead
                 * of one, so skip it.
                 */
                if (!buffer_uptodate(bitmap_bh)) {
                        brelse(bitmap_bh);
                        goto make_io;
                }
                /* Stop at the first other inode in this block that is in use */
                for (i = start; i < start + inodes_per_block; i++) {
                        if (i == inode_offset)
                                continue;
                        if (ext4_test_bit(i, bitmap_bh->b_data))
                                break;
                }
                brelse(bitmap_bh);
                if (i == start + inodes_per_block) {
                        struct ext4_inode *raw_inode =
                                (struct ext4_inode *) (bh->b_data + iloc->offset);

                        /* all other inodes are free, so skip I/O */
                        memset(bh->b_data, 0, bh->b_size);
                        /*
                         * Rebuild the on-disk inode from memory unless the
                         * inode is new; the return value is not checked.
                         */
                        if (!ext4_test_inode_state(inode, EXT4_STATE_NEW))
                                ext4_fill_raw_inode(inode, raw_inode);
                        set_buffer_uptodate(bh);
                        unlock_buffer(bh);
                        goto has_buffer;
                }
        }

make_io:
        /*
         * If we need to do any I/O, try to pre-readahead extra
         * blocks from the inode table.
         */
        blk_start_plug(&plug);
        if (EXT4_SB(sb)->s_inode_readahead_blks) {
                ext4_fsblk_t b, end, table;
                unsigned num;
                __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;

                table = ext4_inode_table(sb, gdp);
                /* s_inode_readahead_blks is always a power of 2 */
                b = block & ~((ext4_fsblk_t) ra_blks - 1);
                /* Do not start before the beginning of the inode table */
                if (table > b)
                        b = table;
                end = b + ra_blks;
                /*
                 * Limit readahead to the part of the table that holds
                 * inodes not counted as unused in the group descriptor
                 */
                num = EXT4_INODES_PER_GROUP(sb);
                if (ext4_has_group_desc_csum(sb))
                        num -= ext4_itable_unused_count(sb, gdp);
                table += num / inodes_per_block;
                if (end > table)
                        end = table;
                while (b <= end)
                        ext4_sb_breadahead_unmovable(sb, b++);
        }

        /*
         * There are other valid inodes in the buffer, this inode
         * has in-inode xattrs, or we don't have this inode in memory.
         * Read the block from disk.
         */
        trace_ext4_load_inode(sb, ino);
        ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
        blk_finish_plug(&plug);
        wait_on_buffer(bh);
        ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO);
        if (!buffer_uptodate(bh)) {
                /* Report which block failed, then drop our reference */
                if (ret_block)
                        *ret_block = block;
                brelse(bh);
                return -EIO;
        }
has_buffer:
        iloc->bh = bh;
        return 0;
}

/*
 * Locate the inode-table buffer for @inode and fill in @iloc, passing NULL
 * instead of the in-memory inode down to __ext4_get_inode_loc().
 *
 * A -EIO result is reported with ext4_error_inode_block() against the block
 * number handed back by __ext4_get_inode_loc(); every result, including
 * -EIO, is returned to the caller unchanged.
 */
static int __ext4_get_inode_loc_noinmem(struct inode *inode,
                                        struct ext4_iloc *iloc)
{
        ext4_fsblk_t failed_blk = 0;
        int rc;

        rc = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc,
                                  &failed_blk);
        if (rc != -EIO)
                return rc;

        ext4_error_inode_block(inode, failed_blk, EIO,
                               "unable to read itable block");
        return rc;
}

/*
 * Locate the inode-table buffer for @inode and fill in @iloc.  Unlike
 * __ext4_get_inode_loc_noinmem(), the in-memory inode itself is passed to
 * __ext4_get_inode_loc().
 *
 * A -EIO result is reported with ext4_error_inode_block() before being
 * returned; all other results are returned as-is.
 */
int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
        ext4_fsblk_t failed_blk = 0;
        int rc;

        rc = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc,
                                  &failed_blk);
        if (rc != -EIO)
                return rc;

        ext4_error_inode_block(inode, failed_blk, EIO,
                               "unable to read itable block");
        return rc;
}


/*
 * Locate inode number @ino on @sb and fill in @iloc.  No in-memory inode is
 * supplied and no failing block number is requested (both arguments to
 * __ext4_get_inode_loc() are NULL), so nothing is logged here on failure.
 */
int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
                          struct ext4_iloc *iloc)
{
        int rc;

        rc = __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL);
        return rc;
}

/*
 * Decide whether S_DAX should be enabled for @inode.
 *
 * Returns false if the DAX_NEVER mount option is set, the inode is not a
 * regular file, it journals its data, has inline data, carries the ENCRYPT
 * or VERITY inode flag, or the superblock lacks EXT4_FLAGS_BDEV_IS_DAX.
 * Otherwise returns true when DAX_ALWAYS is set, else whether the inode
 * has EXT4_INODE_DAX.
 */
static bool ext4_should_enable_dax(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* Any single one of these rules DAX out. */
        if (test_opt2(sb, DAX_NEVER) ||
            !S_ISREG(inode->i_mode) ||
            ext4_should_journal_data(inode) ||
            ext4_has_inline_data(inode) ||
            ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT) ||
            ext4_test_inode_flag(inode, EXT4_INODE_VERITY) ||
            !test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
                return false;

        /* Mount-wide setting first, then the per-inode flag. */
        if (test_opt(sb, DAX_ALWAYS))
                return true;
        return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
}

/*
 * Translate the ext4 flags in EXT4_I(inode)->i_flags into the matching VFS
 * S_* bits and install them on @inode with inode_set_flags().
 *
 * @init: when true, S_DAX may be newly turned on if
 *        ext4_should_enable_dax() agrees.  Calling with @init set on an
 *        inode that is already DAX triggers a one-time warning.
 */
void ext4_set_inode_flags(struct inode *inode, bool init)
{
        const unsigned int ext4_fl = EXT4_I(inode)->i_flags;
        const unsigned int mask = S_SYNC | S_APPEND | S_IMMUTABLE |
                                  S_NOATIME | S_DIRSYNC | S_DAX |
                                  S_ENCRYPTED | S_CASEFOLD | S_VERITY;
        unsigned int vfs_fl;

        WARN_ON_ONCE(IS_DAX(inode) && init);

        vfs_fl  = (ext4_fl & EXT4_SYNC_FL)      ? S_SYNC      : 0;
        vfs_fl |= (ext4_fl & EXT4_APPEND_FL)    ? S_APPEND    : 0;
        vfs_fl |= (ext4_fl & EXT4_IMMUTABLE_FL) ? S_IMMUTABLE : 0;
        vfs_fl |= (ext4_fl & EXT4_NOATIME_FL)   ? S_NOATIME   : 0;
        vfs_fl |= (ext4_fl & EXT4_DIRSYNC_FL)   ? S_DIRSYNC   : 0;
        vfs_fl |= (ext4_fl & EXT4_ENCRYPT_FL)   ? S_ENCRYPTED : 0;
        vfs_fl |= (ext4_fl & EXT4_CASEFOLD_FL)  ? S_CASEFOLD  : 0;
        vfs_fl |= (ext4_fl & EXT4_VERITY_FL)    ? S_VERITY    : 0;

        /*
         * S_DAX is part of the mask handed to inode_set_flags(), so an
         * already-set S_DAX has to be carried over here to survive.
         */
        vfs_fl |= inode->i_flags & S_DAX;
        if (init && ext4_should_enable_dax(inode))
                vfs_fl |= S_DAX;

        inode_set_flags(inode, vfs_fl, mask);
}

/*
 * Decode the block count stored in @raw_inode.
 *
 * Without the huge_file feature only the 32-bit i_blocks_lo is used.  With
 * it, i_blocks_high and i_blocks_lo are combined into a 48-bit value; if
 * the inode also has EXT4_INODE_HUGE_FILE that value is shifted left by
 * (i_blkbits - 9) before being returned.
 */
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
                                  struct ext4_inode_info *ei)
{
        struct inode *inode = &ei->vfs_inode;
        blkcnt_t nblocks;

        if (!ext4_has_feature_huge_file(inode->i_sb))
                return le32_to_cpu(raw_inode->i_blocks_lo);

        /* combined 48-bit field */
        nblocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
                  le32_to_cpu(raw_inode->i_blocks_lo);

        /* count is kept in filesystem-block units for HUGE_FILE inodes */
        if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE))
                nblocks <<= inode->i_blkbits - 9;

        return nblocks;
}

/*
 * Inspect the area following the extra inode fields of @raw_inode.
 *
 * If the inode has room for in-inode xattrs and the xattr magic is present
 * there, mark the inode EXT4_STATE_XATTR and look for inline data; when
 * inline data is found, EXT4_STATE_MAY_INLINE_DATA is set as well.
 * Otherwise i_inline_off is reset to 0.
 *
 * Returns 0, or the error from ext4_find_inline_data_nolock().
 */
static inline int ext4_iget_extra_inode(struct inode *inode,
                                         struct ext4_inode *raw_inode,
                                         struct ext4_inode_info *ei)
{
        __le32 *xattr_magic = (void *)raw_inode +
                              EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
        int rc;

        /* The magic is only dereferenced once xattr space is known to exist. */
        if (!EXT4_INODE_HAS_XATTR_SPACE(inode) ||
            *xattr_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
                EXT4_I(inode)->i_inline_off = 0;
                return 0;
        }

        ext4_set_inode_state(inode, EXT4_STATE_XATTR);
        rc = ext4_find_inline_data_nolock(inode);
        if (rc)
                return rc;

        if (ext4_has_inline_data(inode))
                ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
        return 0;
}

/*
 * Store the project ID of @inode in *@projid.
 *
 * Returns 0 on success, or -EOPNOTSUPP (leaving *@projid untouched) when
 * the filesystem does not have the project feature.
 */
int ext4_get_projid(struct inode *inode, kprojid_t *projid)
{
        if (ext4_has_feature_project(inode->i_sb)) {
                *projid = EXT4_I(inode)->i_projid;
                return 0;
        }

        return -EOPNOTSUPP;
}

/*
 * For EA inodes ext4 manages i_version itself: the lower 32 bits of the
 * refcount are kept there.  Inodes carrying EXT4_EA_INODE_FL therefore get
 * the raw value stored; all other inodes go through the regular
 * inode_set_iversion_queried() path.
 */
static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
{
        if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
                inode_set_iversion_raw(inode, val);
                return;
        }

        inode_set_iversion_queried(inode, val);
}

/*
 * Check that @inode is consistent with the iget @flags it was requested
 * with.
 *
 * Returns NULL when everything is fine, otherwise a static string
 * describing the inconsistency:
 *  - EXT4_IGET_EA_INODE requested but the inode lacks EXT4_EA_INODE_FL, or
 *    it has in-inode xattrs / an xattr block of its own;
 *  - EXT4_IGET_EA_INODE not requested but the inode has EXT4_EA_INODE_FL;
 *  - the inode is bad and EXT4_IGET_BAD was not passed.
 */
static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        const int has_ea_flag = !!(ei->i_flags & EXT4_EA_INODE_FL);

        if (!(flags & EXT4_IGET_EA_INODE)) {
                if (has_ea_flag)
                        return "unexpected EA_INODE flag";
        } else if (!has_ea_flag) {
                return "missing EA_INODE flag";
        } else if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
                   ei->i_file_acl) {
                return "ea_inode with extended attributes";
        }

        if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
                return "unexpected bad inode w/o EXT4_IGET_BAD";

        return NULL;
}

/*
 * __ext4_iget() - obtain the in-memory inode for inode number @ino on @sb.
 *
 * Inode numbers outside [EXT4_ROOT_INO, s_inodes_count] are rejected, as are
 * reserved inodes below EXT4_FIRST_INO (other than the root) and the quota /
 * orphan-file inodes unless EXT4_IGET_SPECIAL is set in @flags.
 *
 * The inode is then obtained with iget_locked().  If it is not I_NEW it is
 * only validated with check_igot_inode() and returned.  Otherwise the raw
 * on-disk inode is located, its checksum verified, the in-memory fields are
 * filled in from it, block references are validated and the inode / file
 * operations are chosen from i_mode.
 *
 * @function and @line are forwarded to the error-reporting helpers.
 *
 * Returns the inode on success (a new inode is released with
 * unlock_new_inode()).  On failure returns an ERR_PTR(): -ENOMEM, -ESTALE,
 * -EFSCORRUPTED, -EFSBADCRC, or an error propagated from
 * __ext4_get_inode_loc_noinmem(), ext4_iget_extra_inode() or the
 * extent / indirect block checks.  A failed new inode goes through
 * iget_failed().
 */
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
                          ext4_iget_flags flags, const char *function,
                          unsigned int line)
{
        struct ext4_iloc iloc;
        struct ext4_inode *raw_inode;
        struct ext4_inode_info *ei;
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
        struct inode *inode;
        const char *err_str;
        journal_t *journal = EXT4_SB(sb)->s_journal;
        long ret;
        loff_t size;
        int block;
        uid_t i_uid;
        gid_t i_gid;
        projid_t i_projid;

        /* Reject inode numbers that may not be looked up with these flags. */
        if ((!(flags & EXT4_IGET_SPECIAL) &&
             ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
              ino == le32_to_cpu(es->s_usr_quota_inum) ||
              ino == le32_to_cpu(es->s_grp_quota_inum) ||
              ino == le32_to_cpu(es->s_prj_quota_inum) ||
              ino == le32_to_cpu(es->s_orphan_file_inum))) ||
            (ino < EXT4_ROOT_INO) ||
            (ino > le32_to_cpu(es->s_inodes_count))) {
                /* For EXT4_IGET_HANDLE lookups this is -ESTALE, not an fs error. */
                if (flags & EXT4_IGET_HANDLE)
                        return ERR_PTR(-ESTALE);
                __ext4_error(sb, function, line, false, EFSCORRUPTED, 0,
                             "inode #%lu: comm %s: iget: illegal inode #",
                             ino, current->comm);
                return ERR_PTR(-EFSCORRUPTED);
        }

        inode = iget_locked(sb, ino);
        if (!inode)
                return ERR_PTR(-ENOMEM);
        /* Inode that is not new: only sanity-check it against @flags. */
        if (!(inode->i_state & I_NEW)) {
                if ((err_str = check_igot_inode(inode, flags)) != NULL) {
                        ext4_error_inode(inode, function, line, 0, err_str);
                        iput(inode);
                        return ERR_PTR(-EFSCORRUPTED);
                }
                return inode;
        }

        ei = EXT4_I(inode);
        /* So that brelse() at bad_inode is safe if the lookup below fails. */
        iloc.bh = NULL;

        ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
        if (ret < 0)
                goto bad_inode;
        raw_inode = ext4_raw_inode(&iloc);

        if ((flags & EXT4_IGET_HANDLE) &&
            (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
                ret = -ESTALE;
                goto bad_inode;
        }

        /* i_extra_isize must fit in the on-disk inode and be 4-byte aligned. */
        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
                if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
                        EXT4_INODE_SIZE(inode->i_sb) ||
                    (ei->i_extra_isize & 3)) {
                        ext4_error_inode(inode, function, line, 0,
                                         "iget: bad extra_isize %u "
                                         "(inode size %u)",
                                         ei->i_extra_isize,
                                         EXT4_INODE_SIZE(inode->i_sb));
                        ret = -EFSCORRUPTED;
                        goto bad_inode;
                }
        } else
                ei->i_extra_isize = 0;

        /* Precompute checksum seed for inode metadata */
        if (ext4_has_metadata_csum(sb)) {
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
                __u32 csum;
                __le32 inum = cpu_to_le32(inode->i_ino);
                __le32 gen = raw_inode->i_generation;
                csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
                                   sizeof(inum));
                ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
                                              sizeof(gen));
        }

        /* A checksum failure is not treated as an error when EXT4_FC_REPLAY is set. */
        if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
            ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
             (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
                ext4_error_inode_err(inode, function, line, 0,
                                EFSBADCRC, "iget: checksum invalid");
                ret = -EFSBADCRC;
                goto bad_inode;
        }

        inode->i_mode = le16_to_cpu(raw_inode->i_mode);
        i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
        i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
        /* Use the on-disk project ID only if the field fits in this inode. */
        if (ext4_has_feature_project(sb) &&
            EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE &&
            EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
                i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid);
        else
                i_projid = EXT4_DEF_PROJID;

        /* Unless NO_UID32 is set, the high 16 bits of uid/gid are used too. */
        if (!(test_opt(inode->i_sb, NO_UID32))) {
                i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
                i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
        }
        i_uid_write(inode, i_uid);
        i_gid_write(inode, i_gid);
        ei->i_projid = make_kprojid(&init_user_ns, i_projid);
        set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));

        ext4_clear_state_flags(ei);        /* Only relevant on 32-bit archs */
        ei->i_inline_off = 0;
        ei->i_dir_start_lookup = 0;
        ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
        /* We now have enough fields to check if the inode was active or not.
         * This is needed because nfsd might try to access dead inodes
         * the test is the same one that e2fsck uses
         * NeilBrown 1999oct15
         */
        if (inode->i_nlink == 0) {
                if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL ||
                     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
                    ino != EXT4_BOOT_LOADER_INO) {
                        /* this inode is deleted or unallocated */
                        if (flags & EXT4_IGET_SPECIAL) {
                                ext4_error_inode(inode, function, line, 0,
                                                 "iget: special inode unallocated");
                                ret = -EFSCORRUPTED;
                        } else
                                ret = -ESTALE;
                        goto bad_inode;
                }
                /* The only unlinked inodes we let through here have
                 * valid i_mode and are being read by the orphan
                 * recovery code: that's fine, we're about to complete
                 * the process of deleting those.
                 * OR it is the EXT4_BOOT_LOADER_INO which is
                 * not initialized on a new filesystem. */
        }
        ei->i_flags = le32_to_cpu(raw_inode->i_flags);
        ext4_set_inode_flags(inode, true);
        inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
        ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
        if (ext4_has_feature_64bit(sb))
                ei->i_file_acl |=
                        ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
        inode->i_size = ext4_isize(sb, raw_inode);
        /* A negative size as seen through loff_t is treated as corruption. */
        if ((size = i_size_read(inode)) < 0) {
                ext4_error_inode(inode, function, line, 0,
                                 "iget: bad i_size value: %lld", size);
                ret = -EFSCORRUPTED;
                goto bad_inode;
        }
        /*
         * If dir_index is not enabled but there's dir with INDEX flag set,
         * we'd normally treat htree data as empty space. But with metadata
         * checksumming that corrupts checksums so forbid that.
         */
        if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
            ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
                ext4_error_inode(inode, function, line, 0,
                         "iget: Dir with htree data on filesystem without dir_index feature.");
                ret = -EFSCORRUPTED;
                goto bad_inode;
        }
        ei->i_disksize = inode->i_size;
#ifdef CONFIG_QUOTA
        ei->i_reserved_quota = 0;
#endif
        inode->i_generation = le32_to_cpu(raw_inode->i_generation);
        ei->i_block_group = iloc.block_group;
        ei->i_last_alloc_group = ~0;
        /*
         * NOTE! The in-memory inode i_data array is in little-endian order
         * even on big-endian machines: we do NOT byteswap the block numbers!
         */
        for (block = 0; block < EXT4_N_BLOCKS; block++)
                ei->i_data[block] = raw_inode->i_block[block];
        INIT_LIST_HEAD(&ei->i_orphan);
        ext4_fc_init_inode(&ei->vfs_inode);

        /*
         * Set transaction id's of transactions that have to be committed
         * to finish f[data]sync. We set them to currently running transaction
         * as we cannot be sure that the inode or some of its metadata isn't
         * part of the transaction - the inode could have been reclaimed and
         * now it is reread from disk.
         */
        if (journal) {
                transaction_t *transaction;
                tid_t tid;

                read_lock(&journal->j_state_lock);
                if (journal->j_running_transaction)
                        transaction = journal->j_running_transaction;
                else
                        transaction = journal->j_committing_transaction;
                if (transaction)
                        tid = transaction->t_tid;
                else
                        tid = journal->j_commit_sequence;
                read_unlock(&journal->j_state_lock);
                ei->i_sync_tid = tid;
                ei->i_datasync_tid = tid;
        }

        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                if (ei->i_extra_isize == 0) {
                        /* The extra space is currently unused. Use it. */
                        BUILD_BUG_ON(sizeof(struct ext4_inode) & 3);
                        ei->i_extra_isize = sizeof(struct ext4_inode) -
                                            EXT4_GOOD_OLD_INODE_SIZE;
                } else {
                        ret = ext4_iget_extra_inode(inode, raw_inode, ei);
                        if (ret)
                                goto bad_inode;
                }
        }

        EXT4_INODE_GET_CTIME(inode, raw_inode);
        EXT4_INODE_GET_ATIME(inode, raw_inode);
        EXT4_INODE_GET_MTIME(inode, raw_inode);
        EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);

        /* i_version: low 32 bits always, high 32 bits only if the field fits. */
        if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
                u64 ivers = le32_to_cpu(raw_inode->i_disk_version);

                if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                        if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
                                ivers |=
                    (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
                }
                ext4_inode_set_iversion_queried(inode, ivers);
        }

        ret = 0;
        if (ei->i_file_acl &&
            !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) {
                ext4_error_inode(inode, function, line, 0,
                                 "iget: bad extended attribute block %llu",
                                 ei->i_file_acl);
                ret = -EFSCORRUPTED;
                goto bad_inode;
        } else if (!ext4_has_inline_data(inode)) {
                /* validate the block references in the inode */
                if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
                        (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
                        (S_ISLNK(inode->i_mode) &&
                        !ext4_inode_is_fast_symlink(inode)))) {
                        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                                ret = ext4_ext_check_inode(inode);
                        else
                                ret = ext4_ind_check_inode(inode);
                }
        }
        if (ret)
                goto bad_inode;

        /* Pick inode and file operations according to the file type. */
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &ext4_file_inode_operations;
                inode->i_fop = &ext4_file_operations;
                ext4_set_aops(inode);
        } else if (S_ISDIR(inode->i_mode)) {
                inode->i_op = &ext4_dir_inode_operations;
                inode->i_fop = &ext4_dir_operations;
        } else if (S_ISLNK(inode->i_mode)) {
                /* VFS does not allow setting these so must be corruption */
                if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
                        ext4_error_inode(inode, function, line, 0,
                                         "iget: immutable or append flags "
                                         "not allowed on symlinks");
                        ret = -EFSCORRUPTED;
                        goto bad_inode;
                }
                if (IS_ENCRYPTED(inode)) {
                        inode->i_op = &ext4_encrypted_symlink_inode_operations;
                } else if (ext4_inode_is_fast_symlink(inode)) {
                        /* Fast symlink: the target is read from ei->i_data. */
                        inode->i_link = (char *)ei->i_data;
                        inode->i_op = &ext4_fast_symlink_inode_operations;
                        nd_terminate_link(ei->i_data, inode->i_size,
                                sizeof(ei->i_data) - 1);
                } else {
                        inode->i_op = &ext4_symlink_inode_operations;
                }
        } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
              S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
                inode->i_op = &ext4_special_inode_operations;
                /* Device number: old encoding in i_block[0], else new in i_block[1]. */
                if (raw_inode->i_block[0])
                        init_special_inode(inode, inode->i_mode,
                           old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
                else
                        init_special_inode(inode, inode->i_mode,
                           new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
        } else if (ino == EXT4_BOOT_LOADER_INO) {
                make_bad_inode(inode);
        } else {
                ret = -EFSCORRUPTED;
                ext4_error_inode(inode, function, line, 0,
                                 "iget: bogus i_mode (%o)", inode->i_mode);
                goto bad_inode;
        }
        if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) {
                ext4_error_inode(inode, function, line, 0,
                                 "casefold flag without casefold feature");
                ret = -EFSCORRUPTED;
                goto bad_inode;
        }
        /* Same flag-consistency check as for an inode that was not new. */
        if ((err_str = check_igot_inode(inode, flags)) != NULL) {
                ext4_error_inode(inode, function, line, 0, err_str);
                ret = -EFSCORRUPTED;
                goto bad_inode;
        }

        brelse(iloc.bh);
        unlock_new_inode(inode);
        return inode;

bad_inode:
        /* Common failure exit: drop the buffer and fail the new inode. */
        brelse(iloc.bh);
        iget_failed(inode);
        return ERR_PTR(ret);
}

/*
 * If inode @ino is present in memory and is dirty only because of
 * timestamps, clear I_DIRTY_TIME and copy its ctime/mtime/atime into
 * @raw_inode, then refresh the raw inode checksum.
 *
 * The dirtytime-only test is done once without i_lock as a cheap filter
 * and repeated under i_lock before the state is changed.  The raw inode is
 * written under i_raw_lock, after i_lock has been dropped.  The lookup
 * uses find_inode_by_ino_rcu(); the visible caller holds rcu_read_lock().
 *
 * @orig_ino is only passed to the tracepoint.
 */
static void __ext4_update_other_inode_time(struct super_block *sb,
                                           unsigned long orig_ino,
                                           unsigned long ino,
                                           struct ext4_inode *raw_inode)
{
        struct ext4_inode_info *ei;
        struct inode *other;

        other = find_inode_by_ino_rcu(sb, ino);
        if (!other || !inode_is_dirtytime_only(other))
                return;

        spin_lock(&other->i_lock);
        if (!inode_is_dirtytime_only(other)) {
                /* State changed before we got the lock; nothing to do. */
                spin_unlock(&other->i_lock);
                return;
        }
        other->i_state &= ~I_DIRTY_TIME;
        spin_unlock(&other->i_lock);

        ei = EXT4_I(other);
        spin_lock(&ei->i_raw_lock);
        EXT4_INODE_SET_CTIME(other, raw_inode);
        EXT4_INODE_SET_MTIME(other, raw_inode);
        EXT4_INODE_SET_ATIME(other, raw_inode);
        ext4_inode_csum_set(other, raw_inode, ei);
        spin_unlock(&ei->i_raw_lock);

        trace_ext4_other_inode_update_time(other, orig_ino);
}

/*
 * Opportunistically push pending timestamp updates of the other inodes
 * that share an inode table block with @orig_ino.
 *
 * @buf points at the start of that inode table block; each slot in it is
 * EXT4_INODE_SIZE(sb) bytes.  @orig_ino itself is skipped.
 */
static void ext4_update_other_inodes_time(struct super_block *sb,
                                          unsigned long orig_ino, char *buf)
{
        int per_block = EXT4_SB(sb)->s_inodes_per_block;
        int isize = EXT4_INODE_SIZE(sb);
        unsigned long first_ino;
        int slot;

        /*
         * Inode numbers are one-based, so the first inode of a block
         * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1).
         */
        first_ino = ((orig_ino - 1) & ~(per_block - 1)) + 1;

        rcu_read_lock();
        for (slot = 0; slot < per_block; slot++) {
                unsigned long ino = first_ino + slot;
                char *raw = buf + slot * isize;

                if (ino != orig_ino)
                        __ext4_update_other_inode_time(sb, orig_ino, ino,
                                                (struct ext4_inode *)raw);
        }
        rcu_read_unlock();
}

/*
 * Post the struct inode info into an on-disk inode location in the
 * buffer-cache.  This gobbles the caller's reference to the
 * buffer_head in the inode location struct.
 *
 * The caller must have write access to iloc->bh.
 *
 * The raw inode is filled in under ei->i_raw_lock and the buffer is then
 * marked dirty through @handle.  If i_disksize exceeds 0x7fffffff and the
 * large_file feature is not set (or the filesystem is at
 * EXT4_GOOD_OLD_REV), the feature is also set in the superblock.
 *
 * Returns 0 on success or a negative error.  iloc->bh is released with
 * brelse() on every path.
 */
static int ext4_do_update_inode(handle_t *handle,
                                struct inode *inode,
                                struct ext4_iloc *iloc)
{
        struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct buffer_head *bh = iloc->bh;
        struct super_block *sb = inode->i_sb;
        int err;
        int need_datasync = 0, set_large_file = 0;

        spin_lock(&ei->i_raw_lock);

        /*
         * For fields not tracked in the in-memory inode, initialise them
         * to zero for new inodes.
         */
        if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
                memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);

        /* Compare against the size still stored in the raw inode. */
        if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode))
                need_datasync = 1;
        /* Only remembered here; the superblock is updated further down. */
        if (ei->i_disksize > 0x7fffffffULL) {
                if (!ext4_has_feature_large_file(sb) ||
                    EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV))
                        set_large_file = 1;
        }

        err = ext4_fill_raw_inode(inode, raw_inode);
        spin_unlock(&ei->i_raw_lock);
        if (err) {
                /* Skips ext4_std_error(): the error was reported just above. */
                EXT4_ERROR_INODE(inode, "corrupted inode contents");
                goto out_brelse;
        }

        /* With lazytime, also flush timestamps of inodes sharing this block. */
        if (inode->i_sb->s_flags & SB_LAZYTIME)
                ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
                                              bh->b_data);

        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_metadata(handle, NULL, bh);
        if (err)
                goto out_error;
        ext4_clear_inode_state(inode, EXT4_STATE_NEW);
        if (set_large_file) {
                BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
                err = ext4_journal_get_write_access(handle, sb,
                                                    EXT4_SB(sb)->s_sbh,
                                                    EXT4_JTR_NONE);
                if (err)
                        goto out_error;
                /* Feature bit and checksum change under the superblock buffer lock. */
                lock_buffer(EXT4_SB(sb)->s_sbh);
                ext4_set_feature_large_file(sb);
                ext4_superblock_csum_set(sb);
                unlock_buffer(EXT4_SB(sb)->s_sbh);
                ext4_handle_sync(handle);
                err = ext4_handle_dirty_metadata(handle, NULL,
                                                 EXT4_SB(sb)->s_sbh);
        }
        ext4_update_inode_fsync_trans(handle, inode, need_datasync);
out_error:
        /* Reached on success too, with err == 0. */
        ext4_std_error(inode->i_sb, err);
out_brelse:
        brelse(bh);
        return err;
}

/*
 * ext4_write_inode()
 *
 * We are called from a few places:
 *
 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
 *   Here, there will be no transaction running. We wait for any running
 *   transaction to commit.
 *
 * - Within flush work (sys_sync(), kupdate and such).
 *   We wait on commit, if told to.
 *
 * - Within iput_final() -> write_inode_now()
 *   We wait on commit, if told to.
 *
 * In all cases it is actually safe for us to return without doing anything,
 * because the inode has been copied into a raw inode buffer in
 * ext4_mark_inode_dirty().  This is a correctness thing for WB_SYNC_ALL
 * writeback.
 *
 * Note that we are absolutely dependent upon all inode dirtiers doing the
 * right thing: they *must* call mark_inode_dirty() after dirtying info in
 * which we are interested.
 *
 * It would be a bug for them to not do this.  The code:
 *
 *        mark_inode_dirty(inode)
 *        stuff();
 *        inode->i_size = expr;
 *
 * is in error because write_inode() could occur while `stuff()' is running,
 * and the new i_size will be lost.  Plus the inode will no longer be on the
 * superblock's dirty inode list.
 */
int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_iloc iloc;
        int err;

        if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
                return 0;

        if (unlikely(ext4_forced_shutdown(sb)))
                return -EIO;

        if (!EXT4_SB(sb)->s_journal) {
                /* No journal: write out the inode table buffer directly. */
                err = __ext4_get_inode_loc_noinmem(inode, &iloc);
                if (err)
                        return err;
                /*
                 * sync(2) will flush the whole buffer cache. No need to do
                 * it here separately for each inode.
                 */
                if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
                        sync_dirty_buffer(iloc.bh);
                if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
                        ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
                                               "IO error syncing inode");
                        err = -EIO;
                }
                brelse(iloc.bh);
                return err;
        }

        /* Journalled case: must not be entered with a handle already open. */
        if (ext4_journal_current_handle()) {
                ext4_debug("called recursively, non-PF_MEMALLOC!\n");
                dump_stack();
                return -EIO;
        }

        /*
         * No need to force transaction in WB_SYNC_NONE mode. Also
         * ext4_sync_fs() will force the commit after everything is
         * written.
         */
        if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
                return 0;

        return ext4_fc_commit(EXT4_SB(sb)->s_journal,
                              EXT4_I(inode)->i_sync_tid);
}

/*
 * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate
 * buffers that are attached to a folio straddling i_size and are undergoing
 * commit. In that case we have to wait for commit to finish and try again.
 */
static void ext4_wait_for_tail_page_commit(struct inode *inode)
{
        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
        unsigned tail_off = inode->i_size & (PAGE_SIZE - 1);

        /*
         * If the folio is fully truncated, we don't need to wait for any commit
         * (and we even should not as __ext4_journalled_invalidate_folio() may
         * strip all buffers from the folio but keep the folio dirty which can then
         * confuse e.g. concurrent ext4_writepages() seeing dirty folio without
         * buffers). Also we don't need to wait for any commit if all buffers in
         * the folio remain valid. This is most beneficial for the common case of
         * blocksize == PAGESIZE.
         */
        if (!tail_off || tail_off > (PAGE_SIZE - i_blocksize(inode)))
                return;

        for (;;) {
                struct folio *folio;
                tid_t tid = 0;
                int rc;

                folio = filemap_lock_folio(inode->i_mapping,
                                           inode->i_size >> PAGE_SHIFT);
                if (IS_ERR(folio))
                        return;

                rc = __ext4_journalled_invalidate_folio(folio, tail_off,
                                                folio_size(folio) - tail_off);
                folio_unlock(folio);
                folio_put(folio);

                /* Anything but -EBUSY ends the retry loop. */
                if (rc != -EBUSY)
                        return;

                /* Wait for the committing transaction, if any, then retry. */
                read_lock(&journal->j_state_lock);
                if (journal->j_committing_transaction)
                        tid = journal->j_committing_transaction->t_tid;
                read_unlock(&journal->j_state_lock);
                if (tid)
                        jbd2_log_wait_commit(journal, tid);
        }
}

/*
 * ext4_setattr()
 *
 * Called from notify_change.
 *
 * We want to trap VFS attempts to truncate the file as soon as
 * possible.  In particular, we want to make sure that when the VFS
 * shrinks i_size, we put the inode on the orphan list and modify
 * i_disksize immediately, so that during the subsequent flushing of
 * dirty pages and freeing of disk blocks, we can guarantee that any
 * commit will leave the blocks being flushed in an unused state on
 * disk.  (On recovery, the inode will get truncated and the blocks will
 * be freed, so we have a strong guarantee that no future commit will
 * leave these blocks visible to the user.)
 *
 * Another thing we have to assure is that if we are in ordered mode
 * and inode is still attached to the committing transaction, we must
 * start writeout of all the dirty pages which are being truncated.
 * This way we are sure that all the data written in the previous
 * transaction are already on disk (truncate waits for pages under
 * writeback).
 *
 * Called with inode->i_rwsem down.
 */
int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                 struct iattr *attr)
{
        struct inode *inode = d_inode(dentry);
        int error, rc = 0;
        int orphan = 0;
        const unsigned int ia_valid = attr->ia_valid;
        bool inc_ivers = true;

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

        if (unlikely(IS_IMMUTABLE(inode)))
                return -EPERM;

        if (unlikely(IS_APPEND(inode) &&
                     (ia_valid & (ATTR_MODE | ATTR_UID |
                                  ATTR_GID | ATTR_TIMES_SET))))
                return -EPERM;

        error = setattr_prepare(idmap, dentry, attr);
        if (error)
                return error;

        error = fscrypt_prepare_setattr(dentry, attr);
        if (error)
                return error;

        error = fsverity_prepare_setattr(dentry, attr);
        if (error)
                return error;

        if (is_quota_modification(idmap, inode, attr)) {
                error = dquot_initialize(inode);
                if (error)
                        return error;
        }

        if (i_uid_needs_update(idmap, attr, inode) ||
            i_gid_needs_update(idmap, attr, inode)) {
                handle_t *handle;

                /* (user+group)*(old+new) structure, inode write (sb,
                 * inode block, ? - but truncate inode update has it) */
                handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
                        (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
                         EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
                if (IS_ERR(handle)) {
                        error = PTR_ERR(handle);
                        goto err_out;
                }

                /* dquot_transfer() calls back ext4_get_inode_usage() which
                 * counts xattr inode references.
                 */
                down_read(&EXT4_I(inode)->xattr_sem);
                error = dquot_transfer(idmap, inode, attr);
                up_read(&EXT4_I(inode)->xattr_sem);

                if (error) {
                        ext4_journal_stop(handle);
                        return error;
                }
                /* Update corresponding info in inode so that everything is in
                 * one transaction */
                i_uid_update(idmap, attr, inode);
                i_gid_update(idmap, attr, inode);
                error = ext4_mark_inode_dirty(handle, inode);
                ext4_journal_stop(handle);
                if (unlikely(error)) {
                        return error;
                }
        }

        if (attr->ia_valid & ATTR_SIZE) {
                handle_t *handle;
                loff_t oldsize = inode->i_size;
                loff_t old_disksize;
                int shrink = (attr->ia_size < inode->i_size);

                if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

                        if (attr->ia_size > sbi->s_bitmap_maxbytes) {
                                return -EFBIG;
                        }
                }
                if (!S_ISREG(inode->i_mode)) {
                        return -EINVAL;
                }

                if (attr->ia_size == inode->i_size)
                        inc_ivers = false;

                if (shrink) {
                        if (ext4_should_order_data(inode)) {
                                error = ext4_begin_ordered_truncate(inode,
                                                            attr->ia_size);
                                if (error)
                                        goto err_out;
                        }
                        /*
                         * Blocks are going to be removed from the inode. Wait
                         * for dio in flight.
                         */
                        inode_dio_wait(inode);
                }

                filemap_invalidate_lock(inode->i_mapping);

                rc = ext4_break_layouts(inode);
                if (rc) {
                        filemap_invalidate_unlock(inode->i_mapping);
                        goto err_out;
                }

                if (attr->ia_size != inode->i_size) {
                        handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
                        if (IS_ERR(handle)) {
                                error = PTR_ERR(handle);
                                goto out_mmap_sem;
                        }
                        if (ext4_handle_valid(handle) && shrink) {
                                error = ext4_orphan_add(handle, inode);
                                orphan = 1;
                        }
                        /*
                         * Update c/mtime on truncate up, ext4_truncate() will
                         * update c/mtime in shrink case below
                         */
                        if (!shrink)
                                inode_set_mtime_to_ts(inode,
                                                      inode_set_ctime_current(inode));

                        if (shrink)
                                ext4_fc_track_range(handle, inode,
                                        (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
                                        inode->i_sb->s_blocksize_bits,
                                        EXT_MAX_BLOCKS - 1);
                        else
                                ext4_fc_track_range(
                                        handle, inode,
                                        (oldsize > 0 ? oldsize - 1 : oldsize) >>
                                        inode->i_sb->s_blocksize_bits,
                                        (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
                                        inode->i_sb->s_blocksize_bits);

                        down_write(&EXT4_I(inode)->i_data_sem);
                        old_disksize = EXT4_I(inode)->i_disksize;
                        EXT4_I(inode)->i_disksize = attr->ia_size;
                        rc = ext4_mark_inode_dirty(handle, inode);
                        if (!error)
                                error = rc;
                        /*
                         * We have to update i_size under i_data_sem together
                         * with i_disksize to avoid races with writeback code
                         * running ext4_wb_update_i_disksize().
                         */
                        if (!error)
                                i_size_write(inode, attr->ia_size);
                        else
                                EXT4_I(inode)->i_disksize = old_disksize;
                        up_write(&EXT4_I(inode)->i_data_sem);
                        ext4_journal_stop(handle);
                        if (error)
                                goto out_mmap_sem;
                        if (!shrink) {
                                pagecache_isize_extended(inode, oldsize,
                                                         inode->i_size);
                        } else if (ext4_should_journal_data(inode)) {
                                ext4_wait_for_tail_page_commit(inode);
                        }
                }

                /*
                 * Truncate pagecache after we've waited for commit
                 * in data=journal mode to make pages freeable.
                 */
                truncate_pagecache(inode, inode->i_size);
                /*
                 * Call ext4_truncate() even if i_size didn't change to
                 * truncate possible preallocated blocks.
                 */
                if (attr->ia_size <= oldsize) {
                        rc = ext4_truncate(inode);
                        if (rc)
                                error = rc;
                }
out_mmap_sem:
                filemap_invalidate_unlock(inode->i_mapping);
        }

        if (!error) {
                if (inc_ivers)
                        inode_inc_iversion(inode);
                setattr_copy(idmap, inode, attr);
                mark_inode_dirty(inode);
        }

        /*
         * If the call to ext4_truncate failed to get a transaction handle at
         * all, we need to clean up the in-core orphan list manually.
         */
        if (orphan && inode->i_nlink)
                ext4_orphan_del(NULL, inode);

        if (!error && (ia_valid & ATTR_MODE))
                rc = posix_acl_chmod(idmap, dentry, inode->i_mode);

err_out:
        if  (error)
                ext4_std_error(inode->i_sb, error);
        if (!error)
                error = rc;
        return error;
}

/*
 * Compute the direct I/O alignment value for @inode.
 *
 * Returns 0 for verity, data-journalled and inline-data inodes, and for
 * encrypted inodes on which fscrypt does not support direct I/O.  Other
 * encrypted inodes get the inode's block size.  Everything else gets 1,
 * which callers treat as "use the iomap defaults".
 */
u32 ext4_dio_alignment(struct inode *inode)
{
        if (fsverity_active(inode) ||
            ext4_should_journal_data(inode) ||
            ext4_has_inline_data(inode))
                return 0;

        if (!IS_ENCRYPTED(inode))
                return 1; /* use the iomap defaults */

        return fscrypt_dio_supported(inode) ? i_blocksize(inode) : 0;
}

/*
 * Fill in the ext4-specific parts of @stat (creation time, direct I/O
 * alignment, inode attribute flags) and then let generic_fillattr()
 * provide the rest.  Always returns 0.
 */
int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
                 struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_inode *raw_inode;
        unsigned int visible;
        u64 attrs = 0;

        /* Creation time is reported only if i_crtime fits in the inode. */
        if ((request_mask & STATX_BTIME) &&
            EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
                stat->result_mask |= STATX_BTIME;
                stat->btime.tv_sec = ei->i_crtime.tv_sec;
                stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
        }

        /*
         * DIO alignment restrictions are returned only on request: for
         * encrypted files they can take a fair bit of work to obtain if the
         * file wasn't opened recently.
         */
        if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
                u32 dio_align = ext4_dio_alignment(inode);

                stat->result_mask |= STATX_DIOALIGN;
                if (dio_align != 1) {
                        stat->dio_mem_align = dio_align;
                        stat->dio_offset_align = dio_align;
                } else {
                        struct block_device *bdev = inode->i_sb->s_bdev;

                        /* iomap defaults */
                        stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
                        stat->dio_offset_align = bdev_logical_block_size(bdev);
                }
        }

        /* Translate the user-visible inode flags into statx attributes. */
        visible = ei->i_flags & EXT4_FL_USER_VISIBLE;
        if (visible & EXT4_APPEND_FL)
                attrs |= STATX_ATTR_APPEND;
        if (visible & EXT4_COMPR_FL)
                attrs |= STATX_ATTR_COMPRESSED;
        if (visible & EXT4_ENCRYPT_FL)
                attrs |= STATX_ATTR_ENCRYPTED;
        if (visible & EXT4_IMMUTABLE_FL)
                attrs |= STATX_ATTR_IMMUTABLE;
        if (visible & EXT4_NODUMP_FL)
                attrs |= STATX_ATTR_NODUMP;
        if (visible & EXT4_VERITY_FL)
                attrs |= STATX_ATTR_VERITY;
        stat->attributes |= attrs;

        /* Advertise every attribute bit this function may set. */
        stat->attributes_mask |= (STATX_ATTR_APPEND |
                                  STATX_ATTR_COMPRESSED |
                                  STATX_ATTR_ENCRYPTED |
                                  STATX_ATTR_IMMUTABLE |
                                  STATX_ATTR_NODUMP |
                                  STATX_ATTR_VERITY);

        generic_fillattr(idmap, request_mask, inode, stat);
        return 0;
}

/*
 * getattr variant that runs ext4_getattr() and then adjusts stat->blocks
 * for inline data and for blocks reserved by delayed allocation.
 * Always returns 0.
 */
int ext4_file_getattr(struct mnt_idmap *idmap,
                      const struct path *path, struct kstat *stat,
                      u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        u64 reserved;

        ext4_getattr(idmap, path, stat, request_mask, query_flags);

        /*
         * An inode holding inline data normally has no data blocks
         * allocated (it may have an external xattr block).  Report at
         * least one sector for such files so that tools like tar and rsync
         * don't incorrectly think the file is completely sparse.
         */
        if (unlikely(ext4_has_inline_data(inode)))
                stat->blocks += (stat->size + 511) >> 9;

        /*
         * i_blocks is only updated together with the real allocation:
         * updating it while allocation is still delayed could leave it
         * inconsistent with the on-disk file blocks after a crash.  So as
         * not to confuse the user, stat nevertheless reports the delayed
         * allocation blocks of this file as well.
         */
        reserved = EXT4_C2B(EXT4_SB(inode->i_sb),
                            EXT4_I(inode)->i_reserved_data_blocks);
        stat->blocks += reserved << (inode->i_sb->s_blocksize_bits - 9);
        return 0;
}

/*
 * Pick the index-block credit estimate that matches the inode's mapping
 * format: the extent helper (driven by @pextents) when the extents flag is
 * set, the indirect-block helper (driven by @lblocks) otherwise.
 */
static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
                                   int pextents)
{
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return ext4_ext_index_trans_blocks(inode, pextents);

        return ext4_ind_trans_blocks(inode, lblocks);
}

/*
 * Account for index blocks, block groups bitmaps and block group
 * descriptor blocks if modify datablocks and index blocks
 * worse case, the indexs blocks spread over different block groups
 *
 * If datablocks are discontiguous, they are possible to spread over
 * different block groups too. If they are contiguous, with flexbg,
 * they could still across block group boundary.
 *
 * Also account for superblock, inode, quota and xattr blocks
 */
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
                                  int pextents)
{
        ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
        int gdpblocks;
        int idxblocks;
        int ret;

        /*
         * How many index blocks need to touch to map @lblocks logical blocks
         * to @pextents physical extents?
         */
        idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);

        ret = idxblocks;

        /*
         * Now let's see how many group bitmaps and group descriptors need
         * to account
         */
        groups = idxblocks + pextents;
        gdpblocks = groups;
        if (groups > ngroups)
                groups = ngroups;
        if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
                gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;

        /* bitmaps and block group descriptor blocks */
        ret += groups + gdpblocks;

        /* Blocks for super block, inode, quota and xattr blocks */
        ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);

        return ret;
}

/*
 * Calculate the total number of credits to reserve so that the
 * modification of a single page fits into a single transaction,
 * which may include multiple chunks of block allocations.
 *
 * This could be called via ext4_write_begin().
 *
 * We need to consider the worst case: one new block per extent.
 */
int ext4_writepage_trans_blocks(struct inode *inode)
{
        int bpp = ext4_journal_blocks_per_page(inode);
        int credits = ext4_meta_trans_blocks(inode, bpp, bpp);

        if (!ext4_should_journal_data(inode))
                return credits;

        /* Account for data blocks for journalled mode */
        return credits + bpp;
}

/*
 * Calculate the journal credits for a chunk of data modification.
 *
 * This is called from DIO, fallocate or whoever calls
 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
 *
 * Journal buffers for data blocks are not included here, as DIO
 * and fallocate do not need to journal data buffers.
 */
int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
{
        /* the whole chunk is accounted as a single physical extent */
        const int pextents = 1;

        return ext4_meta_trans_blocks(inode, nrblocks, pextents);
}

/*
 * The caller must have previously called ext4_reserve_inode_write().
 * Given this, we know that the caller already has write access to iloc->bh.
 *
 * One reference on iloc->bh is dropped on every path, including the
 * forced-shutdown case, which returns -EIO.
 */
int ext4_mark_iloc_dirty(handle_t *handle,
                         struct inode *inode, struct ext4_iloc *iloc)
{
        int ret;

        if (unlikely(ext4_forced_shutdown(inode->i_sb))) {
                put_bh(iloc->bh);
                return -EIO;
        }

        ext4_fc_track_inode(handle, inode);

        /*
         * ext4_do_update_inode() consumes one bh->b_count, so take an extra
         * reference for it; it is also the one that does
         * jbd2_journal_dirty_metadata.
         */
        get_bh(iloc->bh);
        ret = ext4_do_update_inode(handle, inode, iloc);

        put_bh(iloc->bh);
        return ret;
}

/*
 * On success, we end up with an outstanding reference count against
 * iloc->bh.  This _must_ be cleaned up later.
 *
 * If journal write access cannot be obtained, the buffer is released and
 * iloc->bh is reset to NULL.
 */
int
ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
                         struct ext4_iloc *iloc)
{
        int err;

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

        err = ext4_get_inode_loc(inode, iloc);
        if (err)
                goto out;

        BUFFER_TRACE(iloc->bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, inode->i_sb,
                                            iloc->bh, EXT4_JTR_NONE);
        if (err) {
                brelse(iloc->bh);
                iloc->bh = NULL;
        }
out:
        ext4_std_error(inode->i_sb, err);
        return err;
}

/*
 * Grow the inode's i_extra_isize to @new_extra_isize.
 *
 * Without in-inode extended attributes the newly covered bytes are simply
 * zeroed.  Otherwise the work is handed to ext4_expand_extra_isize_ea(),
 * and *@no_expand is set to 1 if that fails.
 *
 * Returns 0 on success, -EFSCORRUPTED for a bad current extra_isize,
 * -EINVAL for an out-of-range @new_extra_isize, -EAGAIN when quotas still
 * need initializing, or the error from ext4_expand_extra_isize_ea().
 */
static int __ext4_expand_extra_isize(struct inode *inode,
                                     unsigned int new_extra_isize,
                                     struct ext4_iloc *iloc,
                                     handle_t *handle, int *no_expand)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
        struct ext4_xattr_ibody_header *header;
        struct ext4_inode *raw_inode;
        bool has_ea;
        int error;

        /* this was checked at iget time, but double check for good measure */
        if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
            (ei->i_extra_isize & 3)) {
                EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
                                 ei->i_extra_isize,
                                 EXT4_INODE_SIZE(inode->i_sb));
                return -EFSCORRUPTED;
        }

        /* Should never happen */
        if ((new_extra_isize < ei->i_extra_isize) ||
            (new_extra_isize < 4) ||
            (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
                return -EINVAL;

        raw_inode = ext4_raw_inode(iloc);
        header = IHDR(inode, raw_inode);

        has_ea = ext4_test_inode_state(inode, EXT4_STATE_XATTR) &&
                 header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC);
        if (!has_ea) {
                /* No extended attributes present: zero the added area */
                void *grow_start = (void *)raw_inode +
                                   EXT4_GOOD_OLD_INODE_SIZE +
                                   ei->i_extra_isize;

                memset(grow_start, 0, new_extra_isize - ei->i_extra_isize);
                ei->i_extra_isize = new_extra_isize;
                return 0;
        }

        /*
         * We may need to allocate an external xattr block, so we need quotas
         * initialized. Here we can be called with various locks held, so we
         * cannot afford to initialize quotas ourselves. So just bail.
         */
        if (dquot_initialize_needed(inode))
                return -EAGAIN;

        /* try to expand with EAs present */
        error = ext4_expand_extra_isize_ea(inode, new_extra_isize,
                                           raw_inode, handle);
        /* Inode size expansion failed; don't try again */
        if (error)
                *no_expand = 1;

        return error;
}

/*
 * Expand an inode by new_extra_isize bytes.
 * Returns 0 on success or negative error number on failure.
 *
 * This is the opportunistic variant: it gives up with -EOVERFLOW if the
 * inode is flagged NO_EXPAND, -ENOSPC if the handle cannot be extended,
 * and -EBUSY if the xattr write lock cannot be taken without waiting.
 */
static int ext4_try_to_expand_extra_isize(struct inode *inode,
                                          unsigned int new_extra_isize,
                                          struct ext4_iloc iloc,
                                          handle_t *handle)
{
        int no_expand;
        int ret;

        if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND))
                return -EOVERFLOW;

        /*
         * In nojournal mode, we can immediately attempt to expand
         * the inode.  When journaled, we first need to obtain extra
         * buffer credits since we may write into the EA block
         * with this same handle. If journal_extend fails, then it will
         * only result in a minor loss of functionality for that inode.
         * If this is felt to be critical, then e2fsck should be run to
         * force a large enough s_min_extra_isize.
         */
        if (ext4_journal_extend(handle,
                                EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0))
                return -ENOSPC;

        if (!ext4_write_trylock_xattr(inode, &no_expand))
                return -EBUSY;

        ret = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc,
                                        handle, &no_expand);
        ext4_write_unlock_xattr(inode, &no_expand);

        return ret;
}

int ext4_expand_extra_isize(struct inode *inode,
                            unsigned int new_extra_isize,
                            struct ext4_iloc *iloc)
{
        handle_t *handle;
        int no_expand;
        int error, rc;

        if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
                brelse(iloc->bh);
                return -EOVERFLOW;
        }

        handle = ext4_journal_start(inode, EXT4_HT_INODE,
                                    EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
        if (IS_ERR(handle)) {
                error = PTR_ERR(handle);
                brelse(iloc->bh);
                return error;
        }

        ext4_write_lock_xattr(inode, &no_expand);

        BUFFER_TRACE(iloc->bh, "get_write_access");
        error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh,
                                              EXT4_JTR_NONE);
        if (error) {
                brelse(iloc->bh);
                goto out_unlock;
        }

        error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc,
                                          handle, &no_expand);

        rc = ext4_mark_iloc_dirty(handle, inode, iloc);
        if (!error)
                error = rc;

out_unlock:
        ext4_write_unlock_xattr(inode, &no_expand);
        ext4_journal_stop(handle);
        return error;
}

/*
 * What we do here is to mark the in-core inode as clean with respect to
 * inode dirtiness (it may still be data-dirty).
 * This means that the in-core inode may be reaped by prune_icache
 * without having to perform any I/O.  This is a very good thing,
 * because *any* task may call prune_icache - even ones which
 * have a transaction open against a different journal.
 *
 * Is this cheating?  Not really.  Sure, we haven't written the
 * inode out, but prune_icache isn't a user-visible syncing function.
 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
 * we start and wait on commits.
 *
 * @func and @line are passed on to the error report when this fails.
 */
int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
                                const char *func, unsigned int line)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_iloc iloc;
        int err;

        might_sleep();
        trace_ext4_mark_inode_dirty(inode, _RET_IP_);

        err = ext4_reserve_inode_write(handle, inode, &iloc);
        if (!err) {
                /*
                 * Try to grow the extra isize up to the wanted size; the
                 * result of this attempt is deliberately not checked.
                 */
                if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize)
                        ext4_try_to_expand_extra_isize(inode,
                                                       sbi->s_want_extra_isize,
                                                       iloc, handle);

                err = ext4_mark_iloc_dirty(handle, inode, &iloc);
        }

        if (unlikely(err))
                ext4_error_inode_err(inode, func, line, 0, err,
                                        "mark_inode_dirty error");
        return err;
}

/*
 * ext4_dirty_inode() is called from __mark_inode_dirty()
 *
 * We're really interested in the case where a file is being extended.
 * i_size has been changed by generic_commit_write() and we thus need
 * to include the updated inode in the current transaction.
 *
 * Also, dquot_alloc_block() will always dirty the inode when blocks
 * are allocated to the file.
 *
 * If the inode is marked synchronous, we don't honour that here - doing
 * so would cause a commit on atime updates, which we don't bother doing.
 * We handle synchronous inodes at the highest possible level.
 *
 * @flags is not used.  Failure to start a handle is silently ignored.
 */
void ext4_dirty_inode(struct inode *inode, int flags)
{
        handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);

        if (!IS_ERR(handle)) {
                ext4_mark_inode_dirty(handle, inode);
                ext4_journal_stop(handle);
        }
}

/*
 * Turn the EXT4_INODE_JOURNAL_DATA flag of @inode on (@val non-zero) or
 * off (@val zero) and switch the inode's address space operations to
 * match.
 *
 * Returns 0 on success (including when the inode has no journal), -EROFS
 * if the journal is aborted, or a negative errno from flushing dirty data,
 * flushing the journal, or starting the final handle.
 */
int ext4_change_inode_journal_flag(struct inode *inode, int val)
{
        journal_t *journal;
        handle_t *handle;
        int err;
        int alloc_ctx;

        /*
         * We have to be very careful here: changing a data block's
         * journaling status dynamically is dangerous.  If we write a
         * data block to the journal, change the status and then delete
         * that block, we risk forgetting to revoke the old log record
         * from the journal and so a subsequent replay can corrupt data.
         * So, first we make sure that the journal is empty and that
         * nobody is changing anything.
         */

        journal = EXT4_JOURNAL(inode);
        /* Nothing to do without a journal. */
        if (!journal)
                return 0;
        if (is_journal_aborted(journal))
                return -EROFS;

        /* Wait for all existing dio workers */
        inode_dio_wait(inode);

        /*
         * Before flushing the journal and switching inode's aops, we have
         * to flush all dirty data the inode has. There can be outstanding
         * delayed allocations, there can be unwritten extents created by
         * fallocate or buffered writes in dioread_nolock mode covered by
         * dirty data which can be converted only after flushing the dirty
         * data (and journalled aops don't know how to handle these cases).
         *
         * This is done only when enabling data journaling; in that case the
         * invalidate lock taken here stays held until after the aops switch.
         */
        if (val) {
                filemap_invalidate_lock(inode->i_mapping);
                err = filemap_write_and_wait(inode->i_mapping);
                if (err < 0) {
                        filemap_invalidate_unlock(inode->i_mapping);
                        return err;
                }
        }

        /*
         * Taken in this order and released in the reverse order below:
         * writepages write lock first, then the journal update lock.
         */
        alloc_ctx = ext4_writepages_down_write(inode->i_sb);
        jbd2_journal_lock_updates(journal);

        /*
         * OK, there are no updates running now, and all cached data is
         * synced to disk.  We are now in a completely consistent state
         * which doesn't have anything in the journal, and we know that
         * no filesystem updates are running, so it is safe to modify
         * the inode's in-core data-journaling state flag now.
         */

        if (val)
                ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
        else {
                /* The journal is flushed only when turning the flag off. */
                err = jbd2_journal_flush(journal, 0);
                if (err < 0) {
                        jbd2_journal_unlock_updates(journal);
                        ext4_writepages_up_write(inode->i_sb, alloc_ctx);
                        return err;
                }
                ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
        }
        /* Re-select the aops now that the flag has its new value. */
        ext4_set_aops(inode);

        jbd2_journal_unlock_updates(journal);
        ext4_writepages_up_write(inode->i_sb, alloc_ctx);

        /* Pairs with the filemap_invalidate_lock() taken above for val. */
        if (val)
                filemap_invalidate_unlock(inode->i_mapping);

        /* Finally we can mark the inode as dirty. */

        handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        /* A journal flag change makes the filesystem fast-commit ineligible. */
        ext4_fc_mark_ineligible(inode->i_sb,
                EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle);
        err = ext4_mark_inode_dirty(handle, inode);
        ext4_handle_sync(handle);
        ext4_journal_stop(handle);
        ext4_std_error(inode->i_sb, err);

        return err;
}

/*
 * Buffer-walk callback: yields 1 for a buffer head that is not mapped
 * and 0 for one that is.  @handle and @inode are ignored.
 */
static int ext4_bh_unmapped(handle_t *handle, struct inode *inode,
                            struct buffer_head *bh)
{
        if (buffer_mapped(bh))
                return 0;
        return 1;
}

/*
 * ext4_page_mkwrite - make the folio behind a faulting page writable
 * @vmf: fault descriptor; supplies the vma (and through it the file and
 *       inode) and the page being faulted.
 *
 * Refuses immutable inodes with VM_FAULT_SIGBUS.  Otherwise, under
 * sb_start_pagefault() and the mapping's invalidate lock (shared), it
 * updates the file times, converts inline data, and then takes one of
 * three routes:
 *   - data journalling: jump straight to the journalled path at retry_alloc;
 *   - delalloc (when not switched off by ext4_nonda_switch()):
 *     block_page_mkwrite() with ext4_da_get_block_prep, retried on -ENOSPC;
 *   - otherwise: return early with VM_FAULT_LOCKED if every buffer in the
 *     range is already mapped, else start a handle and allocate blocks.
 *
 * Returns VM_FAULT_SIGBUS, VM_FAULT_NOPAGE, VM_FAULT_LOCKED, or whatever
 * vmf_fs_error() makes of the last error code.
 */
vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = page_folio(vmf->page);
        loff_t size;
        unsigned long len;
        int err;
        vm_fault_t ret;
        struct file *file = vma->vm_file;
        struct inode *inode = file_inode(file);
        struct address_space *mapping = inode->i_mapping;
        handle_t *handle;
        get_block_t *get_block;
        int retries = 0;

        /* Nothing has been taken yet, so a plain return is fine here. */
        if (unlikely(IS_IMMUTABLE(inode)))
                return VM_FAULT_SIGBUS;

        /* Both of these are released at the "out" label. */
        sb_start_pagefault(inode->i_sb);
        file_update_time(vma->vm_file);

        filemap_invalidate_lock_shared(mapping);

        err = ext4_convert_inline_data(inode);
        if (err)
                goto out_ret;

        /*
         * On data journalling we skip straight to the transaction handle:
         * there's no delalloc; page truncated will be checked later; the
         * early return w/ all buffers mapped (calculates size/len) can't
         * be used; and there's no dioread_nolock, so only ext4_get_block.
         */
        if (ext4_should_journal_data(inode))
                goto retry_alloc;

        /* Delalloc case is easy... */
        if (test_opt(inode->i_sb, DELALLOC) &&
            !ext4_nonda_switch(inode->i_sb)) {
                do {
                        err = block_page_mkwrite(vma, vmf,
                                                   ext4_da_get_block_prep);
                } while (err == -ENOSPC &&
                       ext4_should_retry_alloc(inode->i_sb, &retries));
                goto out_ret;
        }

        folio_lock(folio);
        size = i_size_read(inode);
        /* Page got truncated from under us? */
        if (folio->mapping != mapping || folio_pos(folio) > size) {
                folio_unlock(folio);
                ret = VM_FAULT_NOPAGE;
                goto out;
        }

        /* Clamp the range to i_size when the folio straddles EOF. */
        len = folio_size(folio);
        if (folio_pos(folio) + len > size)
                len = size - folio_pos(folio);
        /*
         * Return if we have all the buffers mapped. This avoids the need to do
         * journal_start/journal_stop which can block and take a long time
         *
         * This cannot be done for data journalling, as we have to add the
         * inode to the transaction's list to writeprotect pages on commit.
         */
        if (folio_buffers(folio)) {
                if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio),
                                            0, len, NULL,
                                            ext4_bh_unmapped)) {
                        /* Wait so that we don't change page under IO */
                        folio_wait_stable(folio);
                        /* The folio is deliberately left locked on this path. */
                        ret = VM_FAULT_LOCKED;
                        goto out;
                }
        }
        folio_unlock(folio);
        /* OK, we need to fill the hole... */
        if (ext4_should_dioread_nolock(inode))
                get_block = ext4_get_block_unwritten;
        else
                get_block = ext4_get_block;
retry_alloc:
        handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
                                    ext4_writepage_trans_blocks(inode));
        if (IS_ERR(handle)) {
                ret = VM_FAULT_SIGBUS;
                goto out;
        }
        /*
         * Data journalling can't use block_page_mkwrite() because it
         * will set_buffer_dirty() before do_journal_get_write_access()
         * thus might hit warning messages for dirty metadata buffers.
         */
        if (!ext4_should_journal_data(inode)) {
                err = block_page_mkwrite(vma, vmf, get_block);
        } else {
                folio_lock(folio);
                size = i_size_read(inode);
                /* Page got truncated from under us? */
                if (folio->mapping != mapping || folio_pos(folio) > size) {
                        ret = VM_FAULT_NOPAGE;
                        goto out_error;
                }

                len = folio_size(folio);
                if (folio_pos(folio) + len > size)
                        len = size - folio_pos(folio);

                err = __block_write_begin(&folio->page, 0, len, ext4_get_block);
                if (!err) {
                        /* ret is only consumed if journalling the buffers fails. */
                        ret = VM_FAULT_SIGBUS;
                        if (ext4_journal_folio_buffers(handle, folio, len))
                                goto out_error;
                        /*
                         * Success: the folio stays locked here.
                         * NOTE(review): presumably vmf_fs_error(0) reports
                         * a locked page to the caller - confirm.
                         */
                } else {
                        folio_unlock(folio);
                }
        }
        ext4_journal_stop(handle);
        if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry_alloc;
out_ret:
        /* Translate the errno-style result into a vm_fault_t. */
        ret = vmf_fs_error(err);
out:
        filemap_invalidate_unlock_shared(mapping);
        sb_end_pagefault(inode->i_sb);
        return ret;
out_error:
        /* Reached with the folio locked and the handle running; ret is preset. */
        folio_unlock(folio);
        ext4_journal_stop(handle);
        goto out;
}
















































































































































































































































































































































































































































































































































































































































































































































































































































































    1 










    1 























































































































































































    1 



    1 


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/fcntl.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/syscalls.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/sched/task.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/capability.h>
#include <linux/dnotify.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/pipe_fs_i.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/signal.h>
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/memfd.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/rw_hint.h>

#include <linux/poll.h>
#include <asm/siginfo.h>
#include <linux/uaccess.h>

#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)

/*
 * Apply the status flags in @arg to @filp.
 *
 * Only the bits covered by SETFL_MASK are replaced in ->f_flags; all other
 * bits keep their current value.  Returns 0 on success, -EPERM or -EINVAL
 * when the change is refused, or the error reported by the file's
 * ->check_flags() / ->fasync() hooks.
 */
static int setfl(int fd, struct file * filp, unsigned int arg)
{
        struct inode *host = file_inode(filp);
        int rc;

        /* Toggling O_APPEND is refused when the inode is append-only. */
        if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(host))
                return -EPERM;

        /* O_NOATIME can only be newly set by the owner or superuser */
        if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME) &&
            !inode_owner_or_capable(file_mnt_idmap(filp), host))
                return -EPERM;

        /* required for strict SunOS emulation: O_NDELAY implies O_NONBLOCK */
        if (O_NONBLOCK != O_NDELAY && (arg & O_NDELAY))
                arg |= O_NONBLOCK;

        /*
         * On a FIFO the O_DIRECT flag controls packetized mode, so the
         * FMODE_CAN_ODIRECT requirement applies to everything else only.
         */
        if ((arg & O_DIRECT) && !S_ISFIFO(host->i_mode) &&
            !(filp->f_mode & FMODE_CAN_ODIRECT))
                return -EINVAL;

        if (filp->f_op->check_flags) {
                rc = filp->f_op->check_flags(arg);
                if (rc)
                        return rc;
        }

        /*
         * ->fasync() is responsible for setting the FASYNC bit.  Only a
         * negative return is a failure; positive values count as success.
         */
        if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) {
                rc = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
                if (rc < 0)
                        return rc;
        }

        spin_lock(&filp->f_lock);
        filp->f_flags = (filp->f_flags & ~SETFL_MASK) | (arg & SETFL_MASK);
        filp->f_iocb_flags = iocb_flags(filp);
        spin_unlock(&filp->f_lock);

        return 0;
}

/*
 * Install @pid / @type as the owner of @filp while holding the owner
 * write lock.  An existing owner is left alone unless @force is set.
 * When @pid is non-NULL the current task's uid and euid are recorded too.
 */
static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
                     int force)
{
        write_lock_irq(&filp->f_owner.lock);
        if (!force && filp->f_owner.pid)
                goto unlock;

        put_pid(filp->f_owner.pid);
        filp->f_owner.pid = get_pid(pid);
        filp->f_owner.pid_type = type;

        if (pid) {
                const struct cred *me = current_cred();

                filp->f_owner.uid = me->uid;
                filp->f_owner.euid = me->euid;
        }
unlock:
        write_unlock_irq(&filp->f_owner.lock);
}

/*
 * __f_setown - set the owner of @filp to an already looked-up pid
 * @filp:  file whose owner is changed
 * @pid:   new owner, may be NULL
 * @type:  pid type stored alongside @pid
 * @force: passed through to f_modown(); when zero an existing owner is kept
 *
 * Calls the security_file_set_fowner() hook first, then f_modown().
 */
void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
                int force)
{
        security_file_set_fowner(filp);
        f_modown(filp, pid, type, force);
}
EXPORT_SYMBOL(__f_setown);

int f_setown(struct file *filp, int who, int force)
{
        enum pid_type type;
        struct pid *pid = NULL;
        int ret = 0;

        type = PIDTYPE_TGID;
        if (who < 0) {
                /* avoid overflow below */
                if (who == INT_MIN)
                        return -EINVAL;

                type = PIDTYPE_PGID;
                who = -who;
        }

        rcu_read_lock();
        if (who) {
                pid = find_vpid(who);
                if (!pid)
                        ret = -ESRCH;
        }

        if (!ret)
                __f_setown(filp, pid, type, force);
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL(f_setown);

/*
 * f_delown - clear the owner of @filp
 *
 * Forces the owner pid to NULL (type PIDTYPE_TGID) via f_modown().
 */
void f_delown(struct file *filp)
{
        f_modown(filp, NULL, PIDTYPE_TGID, 1);
}

/*
 * f_getown - report the owner of @filp as a numeric id
 *
 * Returns 0 when the recorded owner does not resolve to a task.
 * Otherwise returns the owner's pid_vnr() value, negated when the owner
 * is a process group.
 */
pid_t f_getown(struct file *filp)
{
        pid_t owner = 0;

        read_lock_irq(&filp->f_owner.lock);
        rcu_read_lock();
        if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) {
                pid_t vnr = pid_vnr(filp->f_owner.pid);

                owner = (filp->f_owner.pid_type == PIDTYPE_PGID) ? -vnr : vnr;
        }
        rcu_read_unlock();
        read_unlock_irq(&filp->f_owner.lock);

        return owner;
}

static int f_setown_ex(struct file *filp, unsigned long arg)
{
        struct f_owner_ex __user *owner_p = (void __user *)arg;
        struct f_owner_ex owner;
        struct pid *pid;
        int type;
        int ret;

        ret = copy_from_user(&owner, owner_p, sizeof(owner));
        if (ret)
                return -EFAULT;

        switch (owner.type) {
        case F_OWNER_TID:
                type = PIDTYPE_PID;
                break;

        case F_OWNER_PID:
                type = PIDTYPE_TGID;
                break;

        case F_OWNER_PGRP:
                type = PIDTYPE_PGID;
                break;

        default:
                return -EINVAL;
        }

        rcu_read_lock();
        pid = find_vpid(owner.pid);
        if (owner.pid && !pid)
                ret = -ESRCH;
        else
                 __f_setown(filp, pid, type, 1);
        rcu_read_unlock();

        return ret;
}

static int f_getown_ex(struct file *filp, unsigned long arg)
{
        struct f_owner_ex __user *owner_p = (void __user *)arg;
        struct f_owner_ex owner = {};
        int ret = 0;

        read_lock_irq(&filp->f_owner.lock);
        rcu_read_lock();
        if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
                owner.pid = pid_vnr(filp->f_owner.pid);
        rcu_read_unlock();
        switch (filp->f_owner.pid_type) {
        case PIDTYPE_PID:
                owner.type = F_OWNER_TID;
                break;

        case PIDTYPE_TGID:
                owner.type = F_OWNER_PID;
                break;

        case PIDTYPE_PGID:
                owner.type = F_OWNER_PGRP;
                break;

        default:
                WARN_ON(1);
                ret = -EINVAL;
                break;
        }
        read_unlock_irq(&filp->f_owner.lock);

        if (!ret) {
                ret = copy_to_user(owner_p, &owner, sizeof(owner));
                if (ret)
                        ret = -EFAULT;
        }
        return ret;
}

#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * F_GETOWNER_UIDS: store the owner's uid and euid, mapped into the
 * caller's user namespace, as two consecutive uid_t values at user
 * address @arg.  Both stores are always attempted; the return value is
 * the OR of their results.
 */
static int f_getowner_uids(struct file *filp, unsigned long arg)
{
        struct user_namespace *user_ns = current_user_ns();
        uid_t __user *dst = (void __user *)arg;
        uid_t uid, euid;
        int err;

        /* Snapshot both ids under the owner lock, copy out afterwards. */
        read_lock_irq(&filp->f_owner.lock);
        uid = from_kuid(user_ns, filp->f_owner.uid);
        euid = from_kuid(user_ns, filp->f_owner.euid);
        read_unlock_irq(&filp->f_owner.lock);

        err = put_user(uid, dst);
        err |= put_user(euid, dst + 1);

        return err;
}
#else
/* Without CONFIG_CHECKPOINT_RESTORE the command is rejected. */
static int f_getowner_uids(struct file *filp, unsigned long arg)
{
        return -EINVAL;
}
#endif

/*
 * Tell whether @hint is one of the known RWH_WRITE_LIFE_* values.
 *
 * The build-time checks pin each WRITE_LIFE_* constant to its
 * RWH_WRITE_LIFE_* counterpart.
 */
static bool rw_hint_valid(u64 hint)
{
        BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET);
        BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE);
        BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT);
        BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM);
        BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG);
        BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME);

        return hint == RWH_WRITE_LIFE_NOT_SET ||
               hint == RWH_WRITE_LIFE_NONE ||
               hint == RWH_WRITE_LIFE_SHORT ||
               hint == RWH_WRITE_LIFE_MEDIUM ||
               hint == RWH_WRITE_LIFE_LONG ||
               hint == RWH_WRITE_LIFE_EXTREME;
}

static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        struct inode *inode = file_inode(file);
        u64 __user *argp = (u64 __user *)arg;
        u64 hint = READ_ONCE(inode->i_write_hint);

        if (copy_to_user(argp, &hint, sizeof(*argp)))
                return -EFAULT;
        return 0;
}

static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        struct inode *inode = file_inode(file);
        u64 __user *argp = (u64 __user *)arg;
        u64 hint;

        if (copy_from_user(&hint, argp, sizeof(hint)))
                return -EFAULT;
        if (!rw_hint_valid(hint))
                return -EINVAL;

        WRITE_ONCE(inode->i_write_hint, hint);

        /*
         * file->f_mapping->host may differ from inode. As an example,
         * blkdev_open() modifies file->f_mapping.
         */
        if (file->f_mapping->host != inode)
                WRITE_ONCE(file->f_mapping->host->i_write_hint, hint);

        return 0;
}

/*
 * Is the file descriptor a dup of the file?
 *
 * Looks up @fd and compares the resulting file pointer with @filp.
 * Returns 1 when they are the same struct file and 0 otherwise.
 */
static long f_dupfd_query(int fd, struct file *filp)
{
        /*
         * NOTE(review): CLASS(fd_raw, f) presumably declares a scope-bound
         * 'struct fd f' that is released automatically on return - confirm
         * against the class definition.
         */
        CLASS(fd_raw, f)(fd);

        /*
         * We can do the 'fdput()' immediately, as the only thing that
         * matters is the pointer value which isn't changed by the fdput.
         *
         * Technically we didn't need a ref at all, and 'fdget()' was
         * overkill, but given our lockless file pointer lookup, the
         * alternatives are complicated.
         */
        return f.file == filp;
}

/*
 * do_fcntl - dispatch one fcntl command on an already looked-up file
 * @fd:   descriptor number the caller used
 * @cmd:  F_* command
 * @arg:  raw command argument; used as an int (argi) or as a user
 *        pointer (argp) depending on @cmd
 * @filp: file that @fd refers to
 *
 * Returns the command's result.  Commands not handled by the switch
 * return -EINVAL, the initial value of err.  The lock commands return
 * -EFAULT directly when struct flock cannot be copied from or to user space.
 */
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
                struct file *filp)
{
        void __user *argp = (void __user *)arg;
        int argi = (int)arg;
        struct flock flock;
        long err = -EINVAL;

        switch (cmd) {
        case F_DUPFD:
                err = f_dupfd(argi, filp, 0);
                break;
        case F_DUPFD_CLOEXEC:
                err = f_dupfd(argi, filp, O_CLOEXEC);
                break;
        case F_DUPFD_QUERY:
                err = f_dupfd_query(argi, filp);
                break;
        case F_GETFD:
                err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
                break;
        case F_SETFD:
                err = 0;
                set_close_on_exec(fd, argi & FD_CLOEXEC);
                break;
        case F_GETFL:
                err = filp->f_flags;
                break;
        case F_SETFL:
                err = setfl(fd, filp, argi);
                break;
#if BITS_PER_LONG != 32
        /* 32-bit arches must use fcntl64() */
        case F_OFD_GETLK:
#endif
        case F_GETLK:
                if (copy_from_user(&flock, argp, sizeof(flock)))
                        return -EFAULT;
                err = fcntl_getlk(filp, cmd, &flock);
                /* The result is copied back only when the query succeeded. */
                if (!err && copy_to_user(argp, &flock, sizeof(flock)))
                        return -EFAULT;
                break;
#if BITS_PER_LONG != 32
        /* 32-bit arches must use fcntl64() */
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
                fallthrough;
#endif
        case F_SETLK:
        case F_SETLKW:
                if (copy_from_user(&flock, argp, sizeof(flock)))
                        return -EFAULT;
                err = fcntl_setlk(fd, filp, cmd, &flock);
                break;
        case F_GETOWN:
                /*
                 * XXX If f_owner is a process group, the
                 * negative return value will get converted
                 * into an error.  Oops.  If we keep the
                 * current syscall conventions, the only way
                 * to fix this will be in libc.
                 */
                err = f_getown(filp);
                force_successful_syscall_return();
                break;
        case F_SETOWN:
                err = f_setown(filp, argi, 1);
                break;
        case F_GETOWN_EX:
                err = f_getown_ex(filp, arg);
                break;
        case F_SETOWN_EX:
                err = f_setown_ex(filp, arg);
                break;
        case F_GETOWNER_UIDS:
                err = f_getowner_uids(filp, arg);
                break;
        case F_GETSIG:
                err = filp->f_owner.signum;
                break;
        case F_SETSIG:
                /* arg == 0 restores default behaviour. */
                if (!valid_signal(argi)) {
                        /* err is still -EINVAL here */
                        break;
                }
                err = 0;
                filp->f_owner.signum = argi;
                break;
        case F_GETLEASE:
                err = fcntl_getlease(filp);
                break;
        case F_SETLEASE:
                err = fcntl_setlease(fd, filp, argi);
                break;
        case F_NOTIFY:
                err = fcntl_dirnotify(fd, filp, argi);
                break;
        case F_SETPIPE_SZ:
        case F_GETPIPE_SZ:
                err = pipe_fcntl(filp, cmd, argi);
                break;
        case F_ADD_SEALS:
        case F_GET_SEALS:
                err = memfd_fcntl(filp, cmd, argi);
                break;
        case F_GET_RW_HINT:
                err = fcntl_get_rw_hint(filp, cmd, arg);
                break;
        case F_SET_RW_HINT:
                err = fcntl_set_rw_hint(filp, cmd, arg);
                break;
        default:
                /* Unknown command: fall out with err == -EINVAL. */
                break;
        }
        return err;
}

/*
 * Commands that the fcntl entry points still allow on a file opened with
 * FMODE_PATH.  Returns 1 for an allowed command, 0 for anything else.
 */
static int check_fcntl_cmd(unsigned cmd)
{
        return cmd == F_DUPFD ||
               cmd == F_DUPFD_CLOEXEC ||
               cmd == F_DUPFD_QUERY ||
               cmd == F_GETFD ||
               cmd == F_SETFD ||
               cmd == F_GETFL;
}

/*
 * fcntl(2) entry point.
 *
 * Returns -EBADF when @fd does not resolve to a file, or when the file was
 * opened with FMODE_PATH and @cmd is not accepted by check_fcntl_cmd().
 * Otherwise the security hook is consulted and, if it allows the
 * operation, the command is handed to do_fcntl().
 */
SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
        struct fd f = fdget_raw(fd);
        long err;

        if (!f.file)
                return -EBADF;

        if (unlikely(f.file->f_mode & FMODE_PATH) && !check_fcntl_cmd(cmd)) {
                err = -EBADF;
        } else {
                err = security_file_fcntl(f.file, cmd, arg);
                if (!err)
                        err = do_fcntl(fd, cmd, arg, f.file);
        }

        fdput(f);
        return err;
}

#if BITS_PER_LONG == 32
/*
 * fcntl64(2) entry point, built only when BITS_PER_LONG == 32.
 *
 * Handles the struct flock64 based lock commands itself and passes every
 * other command to do_fcntl().  The -EBADF and security-hook checks are
 * the same as for fcntl().
 */
SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
                unsigned long, arg)
{
        void __user *argp = (void __user *)arg;
        struct fd f = fdget_raw(fd);
        struct flock64 flock;
        long err;

        if (!f.file)
                return -EBADF;

        err = -EBADF;
        if (unlikely(f.file->f_mode & FMODE_PATH) && !check_fcntl_cmd(cmd))
                goto out_put;

        err = security_file_fcntl(f.file, cmd, arg);
        if (err)
                goto out_put;

        switch (cmd) {
        case F_GETLK64:
        case F_OFD_GETLK:
                if (copy_from_user(&flock, argp, sizeof(flock))) {
                        err = -EFAULT;
                        break;
                }
                err = fcntl_getlk64(f.file, cmd, &flock);
                /* Copy the answer back only when the query succeeded. */
                if (!err && copy_to_user(argp, &flock, sizeof(flock)))
                        err = -EFAULT;
                break;
        case F_SETLK64:
        case F_SETLKW64:
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
                if (copy_from_user(&flock, argp, sizeof(flock))) {
                        err = -EFAULT;
                        break;
                }
                err = fcntl_setlk64(fd, f.file, cmd, &flock);
                break;
        default:
                err = do_fcntl(fd, cmd, arg, f.file);
                break;
        }

out_put:
        fdput(f);
        return err;
}
#endif

#ifdef CONFIG_COMPAT
/*
 * careful - don't use anywhere else
 *
 * Copy the five lock-description fields (l_type, l_whence, l_start, l_len,
 * l_pid) from *src to *dst.  The two structures may be of different types
 * as long as both have these members.  Both arguments are evaluated more
 * than once.
 *
 * The body is wrapped in do { } while (0) so that the expansion is a single
 * statement and stays correct under an unbraced if/else.
 */
#define copy_flock_fields(dst, src)                \
        do {                                        \
                (dst)->l_type = (src)->l_type;        \
                (dst)->l_whence = (src)->l_whence;        \
                (dst)->l_start = (src)->l_start;        \
                (dst)->l_len = (src)->l_len;        \
                (dst)->l_pid = (src)->l_pid;        \
        } while (0)

/*
 * Read a struct compat_flock from user space at @ufl and copy its lock
 * fields into @kfl.  Returns 0 or -EFAULT.
 */
static int get_compat_flock(struct flock *kfl, const struct compat_flock __user *ufl)
{
        struct compat_flock tmp;

        if (copy_from_user(&tmp, ufl, sizeof(tmp)))
                return -EFAULT;

        copy_flock_fields(kfl, &tmp);
        return 0;
}

/*
 * Read a struct compat_flock64 from user space at @ufl and copy its lock
 * fields into @kfl.  Returns 0 or -EFAULT.
 */
static int get_compat_flock64(struct flock *kfl, const struct compat_flock64 __user *ufl)
{
        struct compat_flock64 tmp;

        if (copy_from_user(&tmp, ufl, sizeof(tmp)))
                return -EFAULT;

        copy_flock_fields(kfl, &tmp);
        return 0;
}

/*
 * Write the lock fields of @kfl to user space at @ufl as a
 * struct compat_flock.  Returns 0 or -EFAULT.
 */
static int put_compat_flock(const struct flock *kfl, struct compat_flock __user *ufl)
{
        struct compat_flock tmp;

        /* Zero the whole structure first so no stale stack bytes are copied out. */
        memset(&tmp, 0, sizeof(tmp));
        copy_flock_fields(&tmp, kfl);

        return copy_to_user(ufl, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}

/*
 * Write the lock fields of @kfl to user space at @ufl as a
 * struct compat_flock64.  Returns 0 or -EFAULT.
 *
 * The build-time checks ensure the compat l_start / l_len fields are at
 * least as wide as the kernel ones.
 */
static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __user *ufl)
{
        struct compat_flock64 tmp;

        BUILD_BUG_ON(sizeof(kfl->l_start) > sizeof(ufl->l_start));
        BUILD_BUG_ON(sizeof(kfl->l_len) > sizeof(ufl->l_len));

        /* Zero the whole structure first so no stale stack bytes are copied out. */
        memset(&tmp, 0, sizeof(tmp));
        copy_flock_fields(&tmp, kfl);

        return copy_to_user(ufl, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
#undef copy_flock_fields

/*
 * Map the *64 lock commands to their plain counterparts; every other
 * command is returned unchanged.
 */
static unsigned int
convert_fcntl_cmd(unsigned int cmd)
{
        if (cmd == F_GETLK64)
                return F_GETLK;
        if (cmd == F_SETLK64)
                return F_SETLK;
        if (cmd == F_SETLKW64)
                return F_SETLKW;

        return cmd;
}

/*
 * A successful GETLK result has to fit into the compat structure before it
 * is returned.
 *
 * l_start should only exceed COMPAT_OFF_T_MAX when the caller's original
 * start + end already did, in which case the application was asking for
 * trouble and gets -EOVERFLOW.  An oversized l_len is simply truncated to
 * COMPAT_OFF_T_MAX, so the application sees the part of the conflicting
 * lock that can make sense to it.
 *
 * Returns 0 on success (with @flock possibly adjusted) or -EOVERFLOW.
 */
static int fixup_compat_flock(struct flock *flock)
{
        if (flock->l_start <= COMPAT_OFF_T_MAX) {
                if (flock->l_len > COMPAT_OFF_T_MAX)
                        flock->l_len = COMPAT_OFF_T_MAX;
                return 0;
        }

        return -EOVERFLOW;
}

/*
 * Common body of the compat fcntl() and fcntl64() entry points.
 *
 * Lock commands are translated between the compat flock layouts and the
 * native struct flock here; everything else is passed to do_fcntl().
 * Returns -EBADF when @fd does not resolve to a file, or when the file was
 * opened with FMODE_PATH and @cmd is not accepted by check_fcntl_cmd().
 */
static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
                             compat_ulong_t arg)
{
        struct fd f = fdget_raw(fd);
        struct flock kfl;
        long ret;

        if (!f.file)
                return -EBADF;

        ret = -EBADF;
        if (unlikely(f.file->f_mode & FMODE_PATH) && !check_fcntl_cmd(cmd))
                goto put;

        ret = security_file_fcntl(f.file, cmd, arg);
        if (ret)
                goto put;

        switch (cmd) {
        case F_GETLK:
                /* Each step runs only if all the previous ones succeeded. */
                ret = get_compat_flock(&kfl, compat_ptr(arg));
                if (!ret)
                        ret = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &kfl);
                if (!ret)
                        ret = fixup_compat_flock(&kfl);
                if (!ret)
                        ret = put_compat_flock(&kfl, compat_ptr(arg));
                break;
        case F_GETLK64:
        case F_OFD_GETLK:
                ret = get_compat_flock64(&kfl, compat_ptr(arg));
                if (!ret)
                        ret = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &kfl);
                if (!ret)
                        ret = put_compat_flock64(&kfl, compat_ptr(arg));
                break;
        case F_SETLK:
        case F_SETLKW:
                ret = get_compat_flock(&kfl, compat_ptr(arg));
                if (!ret)
                        ret = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &kfl);
                break;
        case F_SETLK64:
        case F_SETLKW64:
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
                ret = get_compat_flock64(&kfl, compat_ptr(arg));
                if (!ret)
                        ret = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &kfl);
                break;
        default:
                ret = do_fcntl(fd, cmd, arg, f.file);
                break;
        }

put:
        fdput(f);
        return ret;
}

/*
 * Compat fcntl64() entry point: accepts every command and forwards it
 * unchanged to do_compat_fcntl64().
 */
COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
                       compat_ulong_t, arg)
{
        return do_compat_fcntl64(fd, cmd, arg);
}

/*
 * Compat fcntl() entry point.
 *
 * The 64-bit lock commands and the OFD lock commands are rejected here
 * with -EINVAL; all other commands go to do_compat_fcntl64().
 */
COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
                       compat_ulong_t, arg)
{
        if (cmd == F_GETLK64 || cmd == F_SETLK64 || cmd == F_SETLKW64 ||
            cmd == F_OFD_GETLK || cmd == F_OFD_SETLK || cmd == F_OFD_SETLKW)
                return -EINVAL;

        return do_compat_fcntl64(fd, cmd, arg);
}
#endif

/*
 * Table to convert sigio signal codes into poll band bitmaps.
 *
 * Holds NSIGPOLL entries, one per POLL_* code in the order noted beside
 * each row.
 * NOTE(review): presumably indexed by (code - POLL_IN) - confirm against
 * the users of this table.
 */

static const __poll_t band_table[NSIGPOLL] = {
        EPOLLIN | EPOLLRDNORM,                        /* POLL_IN */
        EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND,        /* POLL_OUT */
        EPOLLIN | EPOLLRDNORM | EPOLLMSG,                /* POLL_MSG */
        EPOLLERR,                                /* POLL_ERR */
        EPOLLPRI | EPOLLRDBAND,                        /* POLL_PRI */
        EPOLLHUP | EPOLLERR                        /* POLL_HUP */
};

/*
 * Decide whether the owner described by @fown may send signal @sig to @p.
 *
 * The credential test passes when the owner's euid is the global root uid,
 * or when the owner's euid or uid equals the target's suid or uid.  Only if
 * that test passes is the security hook consulted; the hook must return 0
 * for the signal to be allowed.
 *
 * Returns 1 when the signal may be sent, 0 otherwise.
 */
static inline int sigio_perm(struct task_struct *p,
                             struct fown_struct *fown, int sig)
{
        const struct cred *tcred;
        int allowed = 0;

        /* the target's credentials are only stable under RCU */
        rcu_read_lock();
        tcred = __task_cred(p);
        if (uid_eq(fown->euid, GLOBAL_ROOT_UID) ||
            uid_eq(fown->euid, tcred->suid) || uid_eq(fown->euid, tcred->uid) ||
            uid_eq(fown->uid,  tcred->suid) || uid_eq(fown->uid,  tcred->uid))
                allowed = !security_file_send_sigiotask(p, fown, sig);
        rcu_read_unlock();

        return allowed;
}

/*
 * Deliver the I/O signal configured in @fown to task @p.
 *
 * @p:      task to signal
 * @fown:   owner record; supplies the signal number and the ids checked by
 *          sigio_perm()
 * @fd:     descriptor number reported in si_fd
 * @reason: POLL_* code; a value outside POLL_IN .. POLL_IN + NSIGPOLL - 1
 *          trips the BUG_ON below
 * @type:   pid type handed through to do_send_sig_info()
 *
 * Nothing is sent when sigio_perm() refuses.  With a signum of 0 a plain
 * SIGIO is sent.  With any other signum a filled-in siginfo is queued, and
 * plain SIGIO is used as the fallback if do_send_sig_info() returns non-zero.
 */
static void send_sigio_to_task(struct task_struct *p,
                               struct fown_struct *fown,
                               int fd, int reason, enum pid_type type)
{
        /*
         * F_SETSIG can change ->signum lockless in parallel, make
         * sure we read it once and use the same value throughout.
         */
        int signum = READ_ONCE(fown->signum);

        if (!sigio_perm(p, fown, signum))
                return;

        switch (signum) {
                default: {
                        kernel_siginfo_t si;

                        /* Queue a rt signal with the appropriate fd as its
                           value.  We use SI_SIGIO as the source, not
                           SI_KERNEL, since kernel signals always get
                           delivered even if we can't queue.  Failure to
                           queue in this case _should_ be reported; we fall
                           back to SIGIO in that case. --sct */
                        clear_siginfo(&si);
                        si.si_signo = signum;
                        si.si_errno = 0;
                        si.si_code  = reason;
                        /*
                         * POSIX defines POLL_IN and friends to be signal
                         * specific si_codes for SIGPOLL.  Linux extended
                         * these si_codes to other signals in a way that is
                         * ambiguous if other signals also have signal
                         * specific si_codes.  In that case use SI_SIGIO instead
                         * to remove the ambiguity.
                         */
                        if ((signum != SIGPOLL) && sig_specific_sicodes(signum))
                                si.si_code = SI_SIGIO;

                        /* Make sure we are called with one of the POLL_*
                           reasons, otherwise we could leak kernel stack into
                           userspace.  */
                        BUG_ON((reason < POLL_IN) || ((reason - POLL_IN) >= NSIGPOLL));
                        /*
                         * NOTE(review): the ~0L branch repeats a condition the
                         * BUG_ON above already rejects, so it looks like a
                         * defensive leftover -- confirm before relying on it.
                         */
                        if (reason - POLL_IN >= NSIGPOLL)
                                si.si_band  = ~0L;
                        else
                                si.si_band = mangle_poll(band_table[reason - POLL_IN]);
                        si.si_fd    = fd;
                        /* a zero return means the signal was queued: done */
                        if (!do_send_sig_info(signum, &si, p, type))
                                break;
                }
                        fallthrough;        /* fall back on the old plain SIGIO signal */
                case 0:
                        do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type);
        }
}

void send_sigio(struct fown_struct *fown, int fd, int band)
{
        struct task_struct *p;
        enum pid_type type;
        unsigned long flags;
        struct pid *pid;
        
        read_lock_irqsave(&fown->lock, flags);

        type = fown->pid_type;
        pid = fown->pid;
        if (!pid)
                goto out_unlock_fown;

        if (type <= PIDTYPE_TGID) {
                rcu_read_lock();
                p = pid_task(pid, PIDTYPE_PID);
                if (p)
                        send_sigio_to_task(p, fown, fd, band, type);
                rcu_read_unlock();
        } else {
                read_lock(&tasklist_lock);
                do_each_pid_task(pid, type, p) {
                        send_sigio_to_task(p, fown, fd, band, type);
                } while_each_pid_task(pid, type, p);
                read_unlock(&tasklist_lock);
        }
 out_unlock_fown:
        read_unlock_irqrestore(&fown->lock, flags);
}

/*
 * Send SIGURG to @p on behalf of @fown, provided sigio_perm() allows it.
 * @type is handed through to do_send_sig_info().
 */
static void send_sigurg_to_task(struct task_struct *p,
                                struct fown_struct *fown, enum pid_type type)
{
        if (!sigio_perm(p, fown, SIGURG))
                return;

        do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type);
}

int send_sigurg(struct fown_struct *fown)
{
        struct task_struct *p;
        enum pid_type type;
        struct pid *pid;
        unsigned long flags;
        int ret = 0;
        
        read_lock_irqsave(&fown->lock, flags);

        type = fown->pid_type;
        pid = fown->pid;
        if (!pid)
                goto out_unlock_fown;

        ret = 1;

        if (type <= PIDTYPE_TGID) {
                rcu_read_lock();
                p = pid_task(pid, PIDTYPE_PID);
                if (p)
                        send_sigurg_to_task(p, fown, type);
                rcu_read_unlock();
        } else {
                read_lock(&tasklist_lock);
                do_each_pid_task(pid, type, p) {
                        send_sigurg_to_task(p, fown, type);
                } while_each_pid_task(pid, type, p);
                read_unlock(&tasklist_lock);
        }
 out_unlock_fown:
        read_unlock_irqrestore(&fown->lock, flags);
        return ret;
}

/* Held by fasync_insert_entry()/fasync_remove_entry() while they edit a fasync list. */
static DEFINE_SPINLOCK(fasync_lock);
/* Slab cache for struct fasync_struct; created in fcntl_init(). */
static struct kmem_cache *fasync_cache __ro_after_init;

/*
 * Drop the fasync entry belonging to @filp from the list at @fapp.
 *
 * Returns 1 if an entry was found and unlinked (FASYNC is then cleared in
 * filp->f_flags), or 0 if @filp had no entry, in which case nothing is
 * changed.
 *
 * NOTE! It is very important that the FASYNC flag always
 * match the state "is the filp on a fasync list".
 */
int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
{
        struct fasync_struct **link = fapp;
        struct fasync_struct *entry;
        int removed = 0;

        spin_lock(&filp->f_lock);
        spin_lock(&fasync_lock);
        while ((entry = *link) != NULL) {
                if (entry->fa_file == filp) {
                        /* detach the file first so kill_fasync_rcu() skips the entry */
                        write_lock_irq(&entry->fa_lock);
                        entry->fa_file = NULL;
                        write_unlock_irq(&entry->fa_lock);

                        /* unlink, then free after an RCU grace period */
                        *link = entry->fa_next;
                        kfree_rcu(entry, fa_rcu);
                        filp->f_flags &= ~FASYNC;
                        removed = 1;
                        break;
                }
                link = &entry->fa_next;
        }
        spin_unlock(&fasync_lock);
        spin_unlock(&filp->f_lock);

        return removed;
}

/*
 * Allocate one fasync_struct from fasync_cache with GFP_KERNEL.
 * Returns whatever kmem_cache_alloc() returns; fasync_add_entry() treats a
 * NULL result as -ENOMEM.
 */
struct fasync_struct *fasync_alloc(void)
{
        return kmem_cache_alloc(fasync_cache, GFP_KERNEL);
}

/*
 * Return an entry to fasync_cache immediately.
 *
 * NOTE! This can be used only for unused fasync entries:
 * entries that actually got inserted on the fasync list
 * need to be released by rcu - see fasync_remove_entry.
 */
void fasync_free(struct fasync_struct *new)
{
        kmem_cache_free(fasync_cache, new);
}

/*
 * Put @filp on the fasync list at @fapp, or refresh its existing entry.
 *
 * If @filp already has an entry, only that entry's fd is updated and the
 * entry is returned; @new is left untouched and remains the caller's to
 * free.  Otherwise @new is initialised, published at the head of the list,
 * FASYNC is set in filp->f_flags and NULL is returned.
 *
 * NOTE! It is very important that the FASYNC flag always
 * match the state "is the filp on a fasync list".
 */
struct fasync_struct *fasync_insert_entry(int fd, struct file *filp,
                                          struct fasync_struct **fapp,
                                          struct fasync_struct *new)
{
        struct fasync_struct *cur;

        spin_lock(&filp->f_lock);
        spin_lock(&fasync_lock);

        /* look for an entry that already belongs to this file */
        for (cur = *fapp; cur != NULL; cur = cur->fa_next) {
                if (cur->fa_file == filp)
                        break;
        }

        if (cur) {
                write_lock_irq(&cur->fa_lock);
                cur->fa_fd = fd;
                write_unlock_irq(&cur->fa_lock);
        } else {
                /* fill in the new entry completely before readers can see it */
                rwlock_init(&new->fa_lock);
                new->magic = FASYNC_MAGIC;
                new->fa_file = filp;
                new->fa_fd = fd;
                new->fa_next = *fapp;
                rcu_assign_pointer(*fapp, new);
                filp->f_flags |= FASYNC;
        }

        spin_unlock(&fasync_lock);
        spin_unlock(&filp->f_lock);
        return cur;
}

/*
 * Register @filp on the fasync list at @fapp.
 *
 * Returns -ENOMEM if no entry could be allocated, 1 if a new entry was
 * added, and 0 if @filp was already listed and only its fd was updated.
 */
static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
{
        struct fasync_struct *fresh = fasync_alloc();
        struct fasync_struct *existing;

        if (!fresh)
                return -ENOMEM;

        /* a non-NULL result is the pre-existing entry: ours was not used */
        existing = fasync_insert_entry(fd, filp, fapp, fresh);
        if (!existing)
                return 1;

        fasync_free(fresh);
        return 0;
}

/*
 * fasync_helper() is used by almost all character device drivers
 * to set up the fasync queue, and for regular files by the file
 * lease code.
 *
 * A non-zero @on adds (or refreshes) the entry for @filp, a zero @on
 * removes it.  Returns negative on error, 0 if nothing was added or
 * removed, and positive if an entry was added or deleted.
 */
int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
{
        return on ? fasync_add_entry(fd, filp, fapp)
                  : fasync_remove_entry(filp, fapp);
}

EXPORT_SYMBOL(fasync_helper);

/*
 * Walk the fasync list starting at @fa and signal each listed file's owner.
 * The caller holds rcu_read_lock().
 *
 * The walk stops with an error message at the first entry whose magic does
 * not match.  Entries whose file pointer has been cleared are skipped.
 */
static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
        for (; fa; fa = rcu_dereference(fa->fa_next)) {
                unsigned long irqflags;

                if (fa->magic != FASYNC_MAGIC) {
                        printk(KERN_ERR "kill_fasync: bad magic number in "
                               "fasync_struct!\n");
                        return;
                }

                read_lock_irqsave(&fa->fa_lock, irqflags);
                if (fa->fa_file) {
                        struct fown_struct *owner = &fa->fa_file->f_owner;

                        /*
                         * SIGURG has its own default signalling mechanism, so
                         * it is only forwarded to owners that have set a
                         * queued signum.
                         */
                        if (sig != SIGURG || owner->signum != 0)
                                send_sigio(owner, fa->fa_fd, band);
                }
                read_unlock_irqrestore(&fa->fa_lock, irqflags);
        }
}

/*
 * Signal every owner registered on the fasync list at @fp.
 * @sig and @band are passed on to kill_fasync_rcu().
 */
void kill_fasync(struct fasync_struct **fp, int sig, int band)
{
        /* Unlocked peek first: usually the list is empty. */
        if (!*fp)
                return;

        rcu_read_lock();
        kill_fasync_rcu(rcu_dereference(*fp), sig, band);
        rcu_read_unlock();
}
EXPORT_SYMBOL(kill_fasync);

/*
 * Init-time setup: a compile-time check on the open-flag bit allocation,
 * then creation of the slab cache used by fasync_alloc()/fasync_free().
 * Always returns 0.
 */
static int __init fcntl_init(void)
{
        /*
         * Please add new bits here to ensure allocation uniqueness.
         * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
         * is defined as O_NONBLOCK on some platforms and not on others.
         *
         * The build fails unless the flags OR-ed together below have
         * exactly 20 distinct bits set.
         */
        BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
                HWEIGHT32(
                        (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
                        __FMODE_EXEC | __FMODE_NONOTIFY));

        /*
         * The result is not checked here.
         * NOTE(review): presumably SLAB_PANIC makes a failed creation fatal,
         * which would explain the missing NULL check -- confirm.
         */
        fasync_cache = kmem_cache_create("fasync_cache",
                                         sizeof(struct fasync_struct), 0,
                                         SLAB_PANIC | SLAB_ACCOUNT, NULL);
        return 0;
}

module_init(fcntl_init)













































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_VMALLOC_H
#define _LINUX_VMALLOC_H

#include <linux/alloc_tag.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/llist.h>
#include <asm/page.h>                /* pgprot_t */
#include <linux/rbtree.h>
#include <linux/overflow.h>

#include <asm/vmalloc.h>

struct vm_area_struct;                /* vma defining user mapping in mm_types.h */
struct notifier_block;                /* in notifier.h */
struct iov_iter;                /* in uio.h */

/* bits in flags of vmalloc's vm_struct below */
#define VM_IOREMAP                0x00000001        /* ioremap() and friends */
#define VM_ALLOC                0x00000002        /* vmalloc() */
#define VM_MAP                        0x00000004        /* vmap()ed pages */
#define VM_USERMAP                0x00000008        /* suitable for remap_vmalloc_range */
#define VM_DMA_COHERENT                0x00000010        /* dma_alloc_coherent */
#define VM_UNINITIALIZED        0x00000020        /* vm_struct is not fully initialized */
#define VM_NO_GUARD                0x00000040      /* ***DANGEROUS*** don't add guard page */
#define VM_KASAN                0x00000080      /* has allocated kasan shadow memory */
#define VM_FLUSH_RESET_PERMS        0x00000100        /* reset direct map and flush TLB on unmap, can't be freed in atomic context */
#define VM_MAP_PUT_PAGES        0x00000200        /* put pages and free array in vfree */
#define VM_ALLOW_HUGE_VMAP        0x00000400      /* Allow for huge pages on archs with HAVE_ARCH_HUGE_VMALLOC */

/*
 * VM_DEFER_KMEMLEAK only occupies a flag bit when generic or software-tag
 * KASAN is enabled without CONFIG_KASAN_VMALLOC.  In every other
 * configuration it is 0, so OR-ing it into a flags word or testing for it
 * has no effect.
 */
#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
        !defined(CONFIG_KASAN_VMALLOC)
#define VM_DEFER_KMEMLEAK        0x00000800        /* defer kmemleak object creation */
#else
#define VM_DEFER_KMEMLEAK        0
#endif
#define VM_SPARSE                0x00001000        /* sparse vm_area. not all pages are present. */

/* bits [20..32] reserved for arch specific ioremap internals */

/*
 * Maximum alignment for ioremap() regions.
 * Can be overridden by arch-specific value.
 */
#ifndef IOREMAP_MAX_ORDER
#define IOREMAP_MAX_ORDER        (7 + PAGE_SHIFT)        /* 128 pages */
#endif

/* Descriptor of one vmalloc-space area; flags takes the VM_* bits above. */
struct vm_struct {
        struct vm_struct        *next;
        void                        *addr;
        /* includes the guard page unless VM_NO_GUARD is set, see get_vm_area_size() */
        unsigned long                size;
        unsigned long                flags;
        struct page                **pages;
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
        /* a value > 0 is what is_vm_area_hugepages() reports as huge */
        unsigned int                page_order;
#endif
        /* NOTE(review): presumably the number of entries in pages[] -- confirm */
        unsigned int                nr_pages;
        phys_addr_t                phys_addr;
        const void                *caller;
};

/*
 * One range of vmalloc address space, linked into both an address-sorted
 * rbtree and an address-sorted list.
 */
struct vmap_area {
        /* NOTE(review): presumably the range is [va_start, va_end) -- confirm */
        unsigned long va_start;
        unsigned long va_end;

        struct rb_node rb_node;         /* address sorted rbtree */
        struct list_head list;          /* address sorted list */

        /*
         * The following two variables can be packed, because
         * a vmap_area object can be either:
         *    1) in "free" tree (root is free_vmap_area_root)
         *    2) or "busy" tree (root is vmap_area_root)
         */
        union {
                unsigned long subtree_max_size; /* in "free" tree */
                struct vm_struct *vm;           /* in "busy" tree */
        };
        unsigned long flags; /* mark type of vm_map_ram area */
};

/* archs that select HAVE_ARCH_HUGE_VMAP should override one or more of these */
#ifndef arch_vmap_p4d_supported
/* Generic fallback when the arch provides no override: always false. */
static inline bool arch_vmap_p4d_supported(pgprot_t prot)
{
        return false;
}
#endif

#ifndef arch_vmap_pud_supported
/* Generic fallback when the arch provides no override: always false. */
static inline bool arch_vmap_pud_supported(pgprot_t prot)
{
        return false;
}
#endif

#ifndef arch_vmap_pmd_supported
/* Generic fallback when the arch provides no override: always false. */
static inline bool arch_vmap_pmd_supported(pgprot_t prot)
{
        return false;
}
#endif

#ifndef arch_vmap_pte_range_map_size
/*
 * Generic fallback when the arch provides no override: ignores all of its
 * arguments and returns PAGE_SIZE.
 */
static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, unsigned long end,
                                                         u64 pfn, unsigned int max_page_shift)
{
        return PAGE_SIZE;
}
#endif

#ifndef arch_vmap_pte_supported_shift
/* Generic fallback when the arch provides no override: PAGE_SHIFT for any @size. */
static inline int arch_vmap_pte_supported_shift(unsigned long size)
{
        return PAGE_SHIFT;
}
#endif

#ifndef arch_vmap_pgprot_tagged
/* Generic fallback when the arch provides no override: returns @prot unchanged. */
static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot)
{
        return prot;
}
#endif

/*
 *        Highlevel APIs for driver use
 */
extern void vm_unmap_ram(const void *mem, unsigned int count);
extern void *vm_map_ram(struct page **pages, unsigned int count, int node);
extern void vm_unmap_aliases(void);

#ifdef CONFIG_MMU
extern unsigned long vmalloc_nr_pages(void);
#else
/* !CONFIG_MMU stub: always reports 0 pages. */
static inline unsigned long vmalloc_nr_pages(void) { return 0; }
#endif

extern void *vmalloc_noprof(unsigned long size) __alloc_size(1);
#define vmalloc(...)                alloc_hooks(vmalloc_noprof(__VA_ARGS__))

extern void *vzalloc_noprof(unsigned long size) __alloc_size(1);
#define vzalloc(...)                alloc_hooks(vzalloc_noprof(__VA_ARGS__))

extern void *vmalloc_user_noprof(unsigned long size) __alloc_size(1);
#define vmalloc_user(...)        alloc_hooks(vmalloc_user_noprof(__VA_ARGS__))

extern void *vmalloc_node_noprof(unsigned long size, int node) __alloc_size(1);
#define vmalloc_node(...)        alloc_hooks(vmalloc_node_noprof(__VA_ARGS__))

extern void *vzalloc_node_noprof(unsigned long size, int node) __alloc_size(1);
#define vzalloc_node(...)        alloc_hooks(vzalloc_node_noprof(__VA_ARGS__))

extern void *vmalloc_32_noprof(unsigned long size) __alloc_size(1);
#define vmalloc_32(...)                alloc_hooks(vmalloc_32_noprof(__VA_ARGS__))

extern void *vmalloc_32_user_noprof(unsigned long size) __alloc_size(1);
#define vmalloc_32_user(...)        alloc_hooks(vmalloc_32_user_noprof(__VA_ARGS__))

extern void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
#define __vmalloc(...)                alloc_hooks(__vmalloc_noprof(__VA_ARGS__))

extern void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
                        unsigned long start, unsigned long end, gfp_t gfp_mask,
                        pgprot_t prot, unsigned long vm_flags, int node,
                        const void *caller) __alloc_size(1);
#define __vmalloc_node_range(...)        alloc_hooks(__vmalloc_node_range_noprof(__VA_ARGS__))

void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask,
                int node, const void *caller) __alloc_size(1);
#define __vmalloc_node(...)        alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__))

void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
#define vmalloc_huge(...)        alloc_hooks(vmalloc_huge_noprof(__VA_ARGS__))

extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
#define __vmalloc_array(...)        alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__))

extern void *vmalloc_array_noprof(size_t n, size_t size) __alloc_size(1, 2);
#define vmalloc_array(...)        alloc_hooks(vmalloc_array_noprof(__VA_ARGS__))

extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
#define __vcalloc(...)                alloc_hooks(__vcalloc_noprof(__VA_ARGS__))

extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2);
#define vcalloc(...)                alloc_hooks(vcalloc_noprof(__VA_ARGS__))

extern void vfree(const void *addr);
extern void vfree_atomic(const void *addr);

extern void *vmap(struct page **pages, unsigned int count,
                        unsigned long flags, pgprot_t prot);
void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot);
extern void vunmap(const void *addr);

extern int remap_vmalloc_range_partial(struct vm_area_struct *vma,
                                       unsigned long uaddr, void *kaddr,
                                       unsigned long pgoff, unsigned long size);

extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
                                                        unsigned long pgoff);

/*
 * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
 * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
 * needs to be called.
 */
#ifndef ARCH_PAGE_TABLE_SYNC_MASK
#define ARCH_PAGE_TABLE_SYNC_MASK 0
#endif

/*
 * There is no default implementation for arch_sync_kernel_mappings(). The
 * compiler is relied upon to optimize the calls out if
 * ARCH_PAGE_TABLE_SYNC_MASK is 0.
 */
void arch_sync_kernel_mappings(unsigned long start, unsigned long end);

/*
 *        Lowlevel-APIs (not for driver use!)
 */

/*
 * Size of @area without its guard page.
 *
 * area->size includes one PAGE_SIZE guard page unless VM_NO_GUARD is set in
 * area->flags; in that case area->size is returned as is.
 */
static inline size_t get_vm_area_size(const struct vm_struct *area)
{
        size_t usable = area->size;

        if (!(area->flags & VM_NO_GUARD))
                usable -= PAGE_SIZE;

        return usable;
}

extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags);
extern struct vm_struct *get_vm_area_caller(unsigned long size,
                                        unsigned long flags, const void *caller);
extern struct vm_struct *__get_vm_area_caller(unsigned long size,
                                        unsigned long flags,
                                        unsigned long start, unsigned long end,
                                        const void *caller);
void free_vm_area(struct vm_struct *area);
extern struct vm_struct *remove_vm_area(const void *addr);
extern struct vm_struct *find_vm_area(const void *addr);
struct vmap_area *find_vmap_area(unsigned long addr);

/*
 * Report whether the vm area containing @addr was allocated with a page
 * order greater than 0.
 *
 * Returns false when CONFIG_HAVE_ARCH_HUGE_VMALLOC is off, and also when
 * find_vm_area() finds no area for @addr.
 */
static inline bool is_vm_area_hugepages(const void *addr)
{
        /*
         * This may not 100% tell if the area is mapped with > PAGE_SIZE
         * page table entries, if for some reason the architecture indicates
         * larger sizes are available but decides not to use them, nothing
         * prevents that. This only indicates the size of the physical page
         * allocated in the vmalloc layer.
         */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
        const struct vm_struct *area = find_vm_area(addr);

        /*
         * find_vm_area() may return NULL (set_vm_flush_reset_perms() checks
         * for that as well); treat "no area" as "not huge" instead of
         * dereferencing a NULL pointer.
         */
        return area && area->page_order > 0;
#else
        return false;
#endif
}

#ifdef CONFIG_MMU
int vm_area_map_pages(struct vm_struct *area, unsigned long start,
                      unsigned long end, struct page **pages);
void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
                         unsigned long end);
void vunmap_range(unsigned long addr, unsigned long end);
/*
 * Set VM_FLUSH_RESET_PERMS on the vm area found for @addr.
 * Does nothing when find_vm_area() finds no area.
 */
static inline void set_vm_flush_reset_perms(void *addr)
{
        struct vm_struct *area = find_vm_area(addr);

        if (!area)
                return;

        area->flags |= VM_FLUSH_RESET_PERMS;
}

#else
/* !CONFIG_MMU stub: does nothing. */
static inline void set_vm_flush_reset_perms(void *addr)
{
}
#endif

/* for /proc/kcore */
extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);

/*
 *        Internals.  Don't use..
 */
extern __init void vm_area_add_early(struct vm_struct *vm);
extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);

#ifdef CONFIG_SMP
# ifdef CONFIG_MMU
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
                                     const size_t *sizes, int nr_vms,
                                     size_t align);

void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms);
# else
/* CONFIG_SMP without CONFIG_MMU: stub that ignores its arguments and returns NULL. */
static inline struct vm_struct **
pcpu_get_vm_areas(const unsigned long *offsets,
                const size_t *sizes, int nr_vms,
                size_t align)
{
        return NULL;
}

/* CONFIG_SMP without CONFIG_MMU: stub that does nothing. */
static inline void
pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
{
}
# endif
#endif

#ifdef CONFIG_MMU
#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
#else
#define VMALLOC_TOTAL 0UL
#endif

int register_vmap_purge_notifier(struct notifier_block *nb);
int unregister_vmap_purge_notifier(struct notifier_block *nb);

#if defined(CONFIG_MMU) && defined(CONFIG_PRINTK)
bool vmalloc_dump_obj(void *object);
#else
/* Stub used unless both CONFIG_MMU and CONFIG_PRINTK are set: always false. */
static inline bool vmalloc_dump_obj(void *object) { return false; }
#endif

#endif /* _LINUX_VMALLOC_H */



































































    1 














    4 






    1 













    1 







































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NUMA memory policies for Linux.
 * Copyright 2003,2004 Andi Kleen SuSE Labs
 */
#ifndef _LINUX_MEMPOLICY_H
#define _LINUX_MEMPOLICY_H 1

#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <uapi/linux/mempolicy.h>

struct mm_struct;

#define NO_INTERLEAVE_INDEX (-1UL)        /* use task il_prev for interleaving */

#ifdef CONFIG_NUMA

/*
 * Describe a memory policy.
 *
 * A mempolicy can be either associated with a process or with a VMA.
 * For VMA related allocations the VMA policy is preferred, otherwise
 * the process policy is used. Interrupts ignore the memory policy
 * of the current process.
 *
 * Locking policy for interleave:
 * In process context there is no locking because only the process accesses
 * its own state. All vma manipulation is somewhat protected by a down_read on
 * mmap_lock.
 *
 * Freeing policy:
 * Mempolicy objects are reference counted.  A mempolicy will be freed when
 * mpol_put() decrements the reference count to zero.
 *
 * Duplicating policy objects:
 * mpol_dup() allocates a new mempolicy and copies the specified mempolicy
 * to the new storage.  The reference count of the new object is initialized
 * to 1, representing the caller of mpol_dup().
 */
struct mempolicy {
        atomic_t refcnt;        /* incremented by mpol_get() */
        unsigned short mode;         /* See MPOL_* above */
        unsigned short flags;        /* See set_mempolicy() MPOL_F_* above */
        nodemask_t nodes;        /* interleave/bind/prefer */
        int home_node;                /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */

        union {
                nodemask_t cpuset_mems_allowed;        /* relative to these nodes */
                nodemask_t user_nodemask;        /* nodemask passed by user */
        } w;
};

/*
 * Support for managing mempolicy data objects (clone, copy, destroy)
 * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
 */

extern void __mpol_put(struct mempolicy *pol);
/* Release a reference on @pol via __mpol_put(); a NULL @pol is ignored. */
static inline void mpol_put(struct mempolicy *pol)
{
        if (!pol)
                return;

        __mpol_put(pol);
}

/*
 * Does mempolicy pol need explicit unref after use?
 * Currently only needed for shared policies.
 *
 * Returns 1 for a non-NULL policy with MPOL_F_SHARED set, otherwise 0.
 */
static inline int mpol_needs_cond_ref(struct mempolicy *pol)
{
        if (!pol)
                return 0;

        return (pol->flags & MPOL_F_SHARED) != 0;
}

/* Release a reference on @pol only when mpol_needs_cond_ref() says one is held. */
static inline void mpol_cond_put(struct mempolicy *pol)
{
        if (!mpol_needs_cond_ref(pol))
                return;

        __mpol_put(pol);
}

extern struct mempolicy *__mpol_dup(struct mempolicy *pol);
/*
 * Duplicate @pol via __mpol_dup().  A NULL @pol is returned unchanged
 * without calling __mpol_dup().
 */
static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
{
        if (!pol)
                return pol;

        return __mpol_dup(pol);
}

/* Take a reference on @pol by incrementing its refcnt; a NULL @pol is ignored. */
static inline void mpol_get(struct mempolicy *pol)
{
        if (!pol)
                return;

        atomic_inc(&pol->refcnt);
}

extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
/*
 * Compare two policies.  Identical pointers (including two NULLs) are equal
 * without further work; anything else is decided by __mpol_equal().
 */
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        return a == b || __mpol_equal(a, b);
}

/*
 * Tree of shared policies for a shared memory region:
 * an rbtree root together with the rwlock stored alongside it.
 */
struct shared_policy {
        struct rb_root root;
        rwlock_t lock;
};
/*
 * NOTE(review): presumably one node of a shared_policy tree, tying a page
 * offset range to a policy -- confirm against the users in mm/mempolicy.c.
 */
struct sp_node {
        struct rb_node nd;
        pgoff_t start, end;
        struct mempolicy *policy;
};

int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
int mpol_set_shared_policy(struct shared_policy *sp,
                           struct vm_area_struct *vma, struct mempolicy *mpol);
void mpol_free_shared_policy(struct shared_policy *sp);
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
                                            pgoff_t idx);

struct mempolicy *get_task_policy(struct task_struct *p);
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
                unsigned long addr, pgoff_t *ilx);
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                unsigned long addr, int order, pgoff_t *ilx);
bool vma_policy_mof(struct vm_area_struct *vma);

extern void numa_default_policy(void);
extern void numa_policy_init(void);
extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new);
extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);

extern int huge_node(struct vm_area_struct *vma,
                                unsigned long addr, gfp_t gfp_flags,
                                struct mempolicy **mpol, nodemask_t **nodemask);
extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
                                const nodemask_t *mask);
extern unsigned int mempolicy_slab_node(void);

extern enum zone_type policy_zone;

/*
 * Raise policy_zone to @k when @k is higher than the current value.
 * ZONE_MOVABLE never becomes the policy zone.
 */
static inline void check_highest_zone(enum zone_type k)
{
        if (k <= policy_zone || k == ZONE_MOVABLE)
                return;

        policy_zone = k;
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags);


#ifdef CONFIG_TMPFS
extern int mpol_parse_str(char *str, struct mempolicy **mpol);
#endif

extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);

/* Check if a vma is migratable */
extern bool vma_migratable(struct vm_area_struct *vma);

int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
                                        unsigned long addr);
extern void mpol_put_task_policy(struct task_struct *);

/* Report whether @pol has its mode set to MPOL_PREFERRED_MANY. */
static inline bool mpol_is_preferred_many(struct mempolicy *pol)
{
        const bool preferred_many = pol->mode == MPOL_PREFERRED_MANY;

        return preferred_many;
}

extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);

#else

/* Empty definition used by the !CONFIG_NUMA branch of this header. */
struct mempolicy {};

/* !CONFIG_NUMA stub: always returns NULL. */
static inline struct mempolicy *get_task_policy(struct task_struct *p)
{
        return NULL;
}

/* !CONFIG_NUMA stub: always reports @a and @b as equal. */
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        return true;
}

/* !CONFIG_NUMA stub: does nothing. */
static inline void mpol_put(struct mempolicy *pol)
{
}

/* !CONFIG_NUMA stub: does nothing. */
static inline void mpol_cond_put(struct mempolicy *pol)
{
}

/* !CONFIG_NUMA stub: does nothing. */
static inline void mpol_get(struct mempolicy *pol)
{
}

/* Empty definition used by the !CONFIG_NUMA branch of this header. */
struct shared_policy {};

/* !CONFIG_NUMA stub: does nothing; @sp and @mpol are left untouched. */
static inline void mpol_shared_policy_init(struct shared_policy *sp,
                                                struct mempolicy *mpol)
{
}

/* !CONFIG_NUMA stub: does nothing. */
static inline void mpol_free_shared_policy(struct shared_policy *sp)
{
}

/* !CONFIG_NUMA stub: always returns NULL, whatever @sp and @idx are. */
static inline struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx)
{
        return NULL;
}

/*
 * !CONFIG_NUMA stub: stores 0 through @ilx and returns NULL.
 * @ilx is dereferenced unconditionally, so it must not be NULL.
 */
static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                                unsigned long addr, int order, pgoff_t *ilx)
{
        *ilx = 0;
        return NULL;
}

/* !CONFIG_NUMA stub: copies nothing and always returns 0. */
static inline int
vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
        return 0;
}

/* !CONFIG_NUMA stub: does nothing. */
static inline void numa_policy_init(void)
{
}

/* !CONFIG_NUMA stub: does nothing. */
static inline void numa_default_policy(void)
{
}

/* !CONFIG_NUMA stub: does nothing. */
static inline void mpol_rebind_task(struct task_struct *tsk,
                                const nodemask_t *new)
{
}

/* !CONFIG_NUMA stub: does nothing. */
static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
}

/*
 * !CONFIG_NUMA stub: stores NULL through both @mpol and @nodemask and
 * returns 0. Both output pointers are dereferenced unconditionally.
 */
static inline int huge_node(struct vm_area_struct *vma,
                                unsigned long addr, gfp_t gfp_flags,
                                struct mempolicy **mpol, nodemask_t **nodemask)
{
        *mpol = NULL;
        *nodemask = NULL;
        return 0;
}

/* !CONFIG_NUMA stub: leaves @m untouched and always returns false. */
static inline bool init_nodemask_of_mempolicy(nodemask_t *m)
{
        return false;
}

/* !CONFIG_NUMA stub: migrates nothing and always returns 0. */
static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                                   const nodemask_t *to, int flags)
{
        return 0;
}

/* !CONFIG_NUMA stub: does nothing. Note that @k is a plain int here. */
static inline void check_highest_zone(int k)
{
}

#ifdef CONFIG_TMPFS
/*
 * !CONFIG_NUMA stub: parses nothing, leaves @mpol untouched and always
 * returns 1 to signal an error.
 */
static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
{
        return 1;        /* error */
}
#endif

/* !CONFIG_NUMA stub: always returns -1, i.e. no node preference. */
static inline int mpol_misplaced(struct folio *folio,
                                 struct vm_fault *vmf,
                                 unsigned long address)
{
        return -1; /* no node preference */
}

/* !CONFIG_NUMA stub: does nothing. */
static inline void mpol_put_task_policy(struct task_struct *task)
{
}

/* !CONFIG_NUMA stub: always returns false; @pol is not dereferenced. */
static inline bool mpol_is_preferred_many(struct mempolicy *pol)
{
        return  false;
}

#endif /* CONFIG_NUMA */
#endif
















































































































    1 



























































    1 










    1 























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli <andrea@suse.de>
  

  linux/include/linux/rbtree.h

  To use rbtrees you'll have to implement your own insert and search cores.
  This lets us avoid callbacks, which would dramatically reduce performance.
  I know it's not the cleanest way, but in C (not in C++) this is how to get
  both performance and genericity...

  See Documentation/core-api/rbtree.rst for documentation and samples.
*/

#ifndef        _LINUX_RBTREE_H
#define        _LINUX_RBTREE_H

#include <linux/container_of.h>
#include <linux/rbtree_types.h>

#include <linux/stddef.h>
#include <linux/rcupdate.h>

/*
 * Extract the parent pointer from a node's __rb_parent_color word by
 * masking off its two low bits.
 */
#define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))

/*
 * Convert @ptr, a pointer to the rb_node @member embedded in a 'type',
 * into a pointer to the containing 'type' (thin wrapper over container_of()).
 */
#define        rb_entry(ptr, type, member) container_of(ptr, type, member)

/* True when @root has no top node; rb_node is sampled with READ_ONCE(). */
#define RB_EMPTY_ROOT(root)  (READ_ONCE((root)->rb_node) == NULL)

/*
 * 'empty' nodes are nodes that are known not to be inserted in an rbtree.
 * A node counts as empty when its __rb_parent_color word holds the node's
 * own address.
 */
#define RB_EMPTY_NODE(node)  \
        ((node)->__rb_parent_color == (unsigned long)(node))
/*
 * Mark @node as 'empty' by storing its own address in __rb_parent_color.
 * @node is evaluated twice.
 */
#define RB_CLEAR_NODE(node)  \
        ((node)->__rb_parent_color = (unsigned long)(node))


extern void rb_insert_color(struct rb_node *, struct rb_root *);
extern void rb_erase(struct rb_node *, struct rb_root *);


/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);

/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);

/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
                            struct rb_root *root);
extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
                                struct rb_root *root);

/*
 * Prepare @node as a childless node whose parent word is @parent, then
 * store it into the link slot *@rb_link with a plain assignment.
 */
static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
                                struct rb_node **rb_link)
{
        node->__rb_parent_color = (unsigned long)parent;
        node->rb_right = NULL;
        node->rb_left = NULL;

        *rb_link = node;
}

/*
 * Same preparation as rb_link_node(), but the node is published into
 * *@rb_link with rcu_assign_pointer() instead of a plain store.
 */
static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
                                    struct rb_node **rb_link)
{
        node->__rb_parent_color = (unsigned long)parent;
        node->rb_right = NULL;
        node->rb_left = NULL;

        rcu_assign_pointer(*rb_link, node);
}

/*
 * NULL-tolerant rb_entry(): evaluates @ptr once and yields NULL when it is
 * NULL, otherwise the containing 'type'.
 */
#define rb_entry_safe(ptr, type, member) \
        ({ typeof(ptr) ____ptr = (ptr); \
           ____ptr ? rb_entry(____ptr, type, member) : NULL; \
        })

/**
 * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
 * given type allowing the backing memory of @pos to be invalidated
 *
 * @pos:        the 'type *' to use as a loop cursor.
 * @n:                another 'type *' to use as temporary storage
 * @root:        'rb_root *' of the rbtree.
 * @field:        the name of the rb_node field within 'type'.
 *
 * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
 * list_for_each_entry_safe() and allows the iteration to continue independent
 * of changes to @pos by the body of the loop.
 *
 * The successor @n is fetched in the loop condition, i.e. before the loop
 * body runs for @pos, and the loop ends when @pos becomes NULL.
 *
 * Note, however, that it cannot handle other modifications that re-order the
 * rbtree it is iterating over. This includes calling rb_erase() on @pos, as
 * rb_erase() may rebalance the tree, causing us to miss some nodes.
 */
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
        for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
             pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
                        typeof(*pos), field); 1; }); \
             pos = n)

/*
 * Same as rb_first(), but O(1): reads the rb_leftmost pointer cached in the
 * rb_root_cached instead of walking the tree.
 */
#define rb_first_cached(root) (root)->rb_leftmost

/*
 * Hand @node to rb_insert_color() for the tree embedded in @root. When the
 * caller says @node is the new leftmost node, the cached rb_leftmost
 * pointer is updated first.
 */
static inline void rb_insert_color_cached(struct rb_node *node,
                                          struct rb_root_cached *root,
                                          bool leftmost)
{
        struct rb_root *plain_root = &root->rb_root;

        if (leftmost)
                root->rb_leftmost = node;

        rb_insert_color(node, plain_root);
}


/*
 * Erase @node from the leftmost-cached tree @root.
 *
 * If @node is the cached leftmost node, the cache is first moved on to
 * rb_next(@node) and that new leftmost node is returned (it may be NULL).
 * Otherwise the cache is left alone and NULL is returned.
 */
static inline struct rb_node *
rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
{
        struct rb_node *new_leftmost = NULL;

        if (node == root->rb_leftmost) {
                new_leftmost = rb_next(node);
                root->rb_leftmost = new_leftmost;
        }

        rb_erase(node, &root->rb_root);

        return new_leftmost;
}

/*
 * Replace @victim with @new in the leftmost-cached tree @root via
 * rb_replace_node(). If @victim was the cached leftmost node, @new takes
 * over that role.
 */
static inline void rb_replace_node_cached(struct rb_node *victim,
                                          struct rb_node *new,
                                          struct rb_root_cached *root)
{
        const bool victim_is_leftmost = root->rb_leftmost == victim;

        if (victim_is_leftmost)
                root->rb_leftmost = new;

        rb_replace_node(victim, new, &root->rb_root);
}

/*
 * The below helper functions use 2 operators with 3 different
 * calling conventions. The operators are related like:
 *
 *        comp(a->key,b) < 0  := less(a,b)
 *        comp(a->key,b) > 0  := less(b,a)
 *        comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
 *
 * If these operators define a partial order on the elements we make no
 * guarantee on which of the elements matching the key is found. See
 * rb_find().
 *
 * The reason for this is to allow the find() interface without requiring an
 * on-stack dummy object, which might not be feasible due to object size.
 */

/**
 * rb_add_cached() - insert @node into the leftmost cached tree @tree
 * @node: node to insert
 * @tree: leftmost cached tree to insert @node into
 * @less: operator defining the (partial) node order
 *
 * Descends from the top of @tree, going left whenever @less(@node, current)
 * holds and right otherwise. @node is the new leftmost only if the descent
 * never went right.
 *
 * Returns @node when it is the new leftmost, or NULL.
 */
static __always_inline struct rb_node *
rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
              bool (*less)(struct rb_node *, const struct rb_node *))
{
        struct rb_node **slot = &tree->rb_root.rb_node;
        struct rb_node *parent = NULL;
        struct rb_node *cur;
        bool is_leftmost = true;

        while ((cur = *slot) != NULL) {
                parent = cur;

                if (!less(node, cur)) {
                        /* One step to the right rules out "leftmost". */
                        is_leftmost = false;
                        slot = &cur->rb_right;
                        continue;
                }

                slot = &cur->rb_left;
        }

        rb_link_node(node, parent, slot);
        rb_insert_color_cached(node, tree, is_leftmost);

        if (!is_leftmost)
                return NULL;

        return node;
}

/**
 * rb_add() - insert @node into @tree
 * @node: node to insert
 * @tree: tree to insert @node into
 * @less: operator defining the (partial) node order
 *
 * Descends from the top of @tree, going left whenever @less(@node, current)
 * holds and right otherwise, then links @node into the empty slot reached.
 */
static __always_inline void
rb_add(struct rb_node *node, struct rb_root *tree,
       bool (*less)(struct rb_node *, const struct rb_node *))
{
        struct rb_node **slot = &tree->rb_node;
        struct rb_node *parent = NULL;
        struct rb_node *cur;

        while ((cur = *slot) != NULL) {
                parent = cur;
                slot = less(node, cur) ? &cur->rb_left : &cur->rb_right;
        }

        rb_link_node(node, parent, slot);
        rb_insert_color(node, tree);
}

/**
 * rb_find_add() - find equivalent @node in @tree, or add @node
 * @node: node to look-for / insert
 * @tree: tree to search / modify
 * @cmp: operator defining the node order
 *
 * @cmp(@node, current) < 0 descends left, > 0 descends right, and 0 stops
 * the search with current as the match; @tree is not modified in that case.
 *
 * Returns the rb_node matching @node, or NULL when no match is found and @node
 * is inserted.
 */
static __always_inline struct rb_node *
rb_find_add(struct rb_node *node, struct rb_root *tree,
            int (*cmp)(struct rb_node *, const struct rb_node *))
{
        struct rb_node **slot = &tree->rb_node;
        struct rb_node *parent = NULL;

        while (*slot) {
                int order;

                parent = *slot;
                order = cmp(node, parent);
                if (!order)
                        return parent;

                slot = order < 0 ? &parent->rb_left : &parent->rb_right;
        }

        rb_link_node(node, parent, slot);
        rb_insert_color(node, tree);

        return NULL;
}

/**
 * rb_find() - find @key in tree @tree
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining the node order
 *
 * @cmp(@key, current) < 0 descends left, > 0 descends right, and 0 ends the
 * search with current as the result.
 *
 * Returns the rb_node matching @key or NULL.
 */
static __always_inline struct rb_node *
rb_find(const void *key, const struct rb_root *tree,
        int (*cmp)(const void *key, const struct rb_node *))
{
        struct rb_node *cur;

        for (cur = tree->rb_node; cur; ) {
                int order = cmp(key, cur);

                if (!order)
                        return cur;

                cur = order < 0 ? cur->rb_left : cur->rb_right;
        }

        return NULL;
}

/**
 * rb_find_first() - find the first @key in @tree
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining node order
 *
 * Unlike rb_find(), the descent does not stop at the first match: a match is
 * remembered and the search continues to its left, so that the last match
 * recorded is the leftmost one.
 *
 * Returns the leftmost node matching @key, or NULL.
 */
static __always_inline struct rb_node *
rb_find_first(const void *key, const struct rb_root *tree,
              int (*cmp)(const void *key, const struct rb_node *))
{
        struct rb_node *cur = tree->rb_node;
        struct rb_node *leftmost_match = NULL;

        while (cur) {
                int order = cmp(key, cur);

                if (order > 0) {
                        cur = cur->rb_right;
                        continue;
                }

                /* order <= 0: remember an exact match, then keep going left */
                if (order == 0)
                        leftmost_match = cur;
                cur = cur->rb_left;
        }

        return leftmost_match;
}

/**
 * rb_next_match() - find the next node matching @key
 * @key: key to match
 * @node: node to continue from
 * @cmp: operator defining node order
 *
 * Looks only at the node rb_next() yields after @node: it is returned when
 * @cmp(@key, next) is 0.
 *
 * Returns the next node matching @key, or NULL.
 */
static __always_inline struct rb_node *
rb_next_match(const void *key, struct rb_node *node,
              int (*cmp)(const void *key, const struct rb_node *))
{
        struct rb_node *next = rb_next(node);

        if (!next || cmp(key, next))
                return NULL;

        return next;
}

/**
 * rb_for_each() - iterates a subtree matching @key
 * @node: iterator
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining node order
 *
 * Starts at rb_find_first() and advances with rb_next_match() until that
 * returns NULL. @node, @key and @cmp are expanded more than once, so they
 * should be free of side effects.
 */
#define rb_for_each(node, key, tree, cmp) \
        for ((node) = rb_find_first((key), (tree), (cmp)); \
             (node); (node) = rb_next_match((key), (node), (cmp)))

#endif        /* _LINUX_RBTREE_H */
































    5 
    5 

    5 





    5 


    6 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * cls_cgroup.h                        Control Group Classifier
 *
 * Authors:        Thomas Graf <tgraf@suug.ch>
 */

#ifndef _NET_CLS_CGROUP_H
#define _NET_CLS_CGROUP_H

#include <linux/cgroup.h>
#include <linux/hardirq.h>
#include <linux/rcupdate.h>
#include <net/sock.h>
#include <net/inet_sock.h>

#ifdef CONFIG_CGROUP_NET_CLASSID
/*
 * Classifier state attached to a cgroup. The embedded css is what
 * container_of() is applied to in task_cls_classid() to get back to this
 * structure.
 */
struct cgroup_cls_state {
        struct cgroup_subsys_state css;
        /* class id reported by the task_*_classid() helpers below */
        u32 classid;
};

struct cgroup_cls_state *task_cls_state(struct task_struct *p);

static inline u32 task_cls_classid(struct task_struct *p)
{
        u32 classid;

        if (in_interrupt())
                return 0;

        rcu_read_lock();
        classid = container_of(task_css(p, net_cls_cgrp_id),
                               struct cgroup_cls_state, css)->classid;
        rcu_read_unlock();

        return classid;
}

/* Store the classid of the current task into the socket cgroup data @skcd. */
static inline void sock_update_classid(struct sock_cgroup_data *skcd)
{
        sock_cgroup_set_classid(skcd, task_cls_classid(current));
}

static inline u32 __task_get_classid(struct task_struct *task)
{
        return task_cls_state(task)->classid;
}

/*
 * Pick the classid to use for @skb.
 *
 * Outside of softirq processing this is the classid of the current task.
 * While serving a softirq `current' is unrelated to the packet, so the
 * classid recorded in the packet's full socket is used instead, or 0 when
 * there is no full socket.
 */
static inline u32 task_get_classid(const struct sk_buff *skb)
{
        u32 classid = __task_get_classid(current);
        struct sock *sk;

        /* Due to the nature of the classifier it is required to ignore all
         * packets originating from softirq context as accessing `current'
         * would lead to false results.
         *
         * This test assumes that all callers of dev_queue_xmit() explicitly
         * disable bh. Knowing this, it is possible to detect softirq based
         * calls by looking at the number of nested bh disable calls because
         * softirqs always disables bh.
         */
        if (!in_serving_softirq())
                return classid;

        sk = skb_to_full_sk(skb);

        /* Without a full socket there is no sock_cgroup_classid to use. */
        if (!sk || !sk_fullsock(sk))
                return 0;

        return sock_cgroup_classid(&sk->sk_cgrp_data);
}
#else /* !CONFIG_CGROUP_NET_CLASSID */
/* !CONFIG_CGROUP_NET_CLASSID stub: does nothing. */
static inline void sock_update_classid(struct sock_cgroup_data *skcd)
{
}

/* !CONFIG_CGROUP_NET_CLASSID stub: every packet gets classid 0. */
static inline u32 task_get_classid(const struct sk_buff *skb)
{
        return 0;
}
#endif /* CONFIG_CGROUP_NET_CLASSID */
#endif  /* _NET_CLS_CGROUP_H */





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 








































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        Handle firewalling
 *        Linux ethernet bridge
 *
 *        Authors:
 *        Lennert Buytenhek                <buytenh@gnu.org>
 *        Bart De Schuymer                <bdschuym@pandora.be>
 *
 *        Lennert dedicates this file to Kerstin Wurdinger.
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <linux/netfilter_bridge.h>
#include <uapi/linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_arp.h>
#include <linux/in_route.h>
#include <linux/rculist.h>
#include <linux/inetdevice.h>

#include <net/ip.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
#include <net/route.h>
#include <net/netfilter/br_netfilter.h>
#include <net/netns/generic.h>

#include <linux/uaccess.h>
#include "br_private.h"
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack_core.h>
#endif

/* Id passed to net_generic() to fetch a namespace's struct brnf_net. */
static unsigned int brnf_net_id __read_mostly;

/*
 * Per-network-namespace settings, fetched with
 * net_generic(net, brnf_net_id).
 */
struct brnf_net {
        bool enabled;

#ifdef CONFIG_SYSCTL
        struct ctl_table_header *ctl_hdr;
#endif

        /* default value is 1 */
        int call_iptables;
        int call_ip6tables;
        int call_arptables;

        /* default value is 0 */
        /* filter_vlan_tagged gates the is_vlan_*() helpers */
        int filter_vlan_tagged;
        /* filter_pppoe_tagged gates the is_pppoe_*() helpers */
        int filter_pppoe_tagged;
        int pass_vlan_indev;
};

/* True when @skb has no VLAN tag present and its protocol is ETH_P_IP. */
#define IS_IP(skb) \
        (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))

/* True when @skb has no VLAN tag present and its protocol is ETH_P_IPV6. */
#define IS_IPV6(skb) \
        (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6))

/* True when @skb has no VLAN tag present and its protocol is ETH_P_ARP. */
#define IS_ARP(skb) \
        (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP))

/*
 * Return the protocol carried inside a VLAN frame:
 *  - skb->protocol when a VLAN tag is present on @skb,
 *  - the encapsulated protocol from the VLAN Ethernet header when
 *    skb->protocol is ETH_P_8021Q,
 *  - 0 otherwise.
 */
static inline __be16 vlan_proto(const struct sk_buff *skb)
{
        if (skb_vlan_tag_present(skb))
                return skb->protocol;

        if (skb->protocol != htons(ETH_P_8021Q))
                return 0;

        return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
}

/*
 * True when @skb is VLAN traffic carrying IPv4 and the namespace has
 * filter_vlan_tagged set.
 */
static inline bool is_vlan_ip(const struct sk_buff *skb, const struct net *net)
{
        struct brnf_net *brnet = net_generic(net, brnf_net_id);

        if (vlan_proto(skb) != htons(ETH_P_IP))
                return false;

        return brnet->filter_vlan_tagged != 0;
}

/*
 * True when @skb is VLAN traffic carrying IPv6 and the namespace has
 * filter_vlan_tagged set.
 */
static inline bool is_vlan_ipv6(const struct sk_buff *skb,
                                const struct net *net)
{
        struct brnf_net *brnet = net_generic(net, brnf_net_id);

        if (vlan_proto(skb) != htons(ETH_P_IPV6))
                return false;

        return brnet->filter_vlan_tagged != 0;
}

/*
 * True when @skb is VLAN traffic carrying ARP and the namespace has
 * filter_vlan_tagged set.
 */
static inline bool is_vlan_arp(const struct sk_buff *skb, const struct net *net)
{
        struct brnf_net *brnet = net_generic(net, brnf_net_id);

        if (vlan_proto(skb) != htons(ETH_P_ARP))
                return false;

        return brnet->filter_vlan_tagged != 0;
}

static inline __be16 pppoe_proto(const struct sk_buff *skb)
{
        return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
                            sizeof(struct pppoe_hdr)));
}

/*
 * True when @skb is a PPPoE session frame carrying IPv4 and the namespace
 * has filter_pppoe_tagged set. The PPP protocol field is only read once
 * skb->protocol is known to be ETH_P_PPP_SES.
 */
static inline bool is_pppoe_ip(const struct sk_buff *skb, const struct net *net)
{
        struct brnf_net *brnet = net_generic(net, brnf_net_id);

        if (skb->protocol != htons(ETH_P_PPP_SES))
                return false;
        if (pppoe_proto(skb) != htons(PPP_IP))
                return false;

        return brnet->filter_pppoe_tagged != 0;
}

/*
 * True when @skb is a PPPoE session frame carrying IPv6 and the namespace
 * has filter_pppoe_tagged set. The PPP protocol field is only read once
 * skb->protocol is known to be ETH_P_PPP_SES.
 */
static inline bool is_pppoe_ipv6(const struct sk_buff *skb,
                                 const struct net *net)
{
        struct brnf_net *brnet = net_generic(net, brnf_net_id);

        if (skb->protocol != htons(ETH_P_PPP_SES))
                return false;
        if (pppoe_proto(skb) != htons(PPP_IPV6))
                return false;

        return brnet->filter_pppoe_tagged != 0;
}

/* largest possible L2 header, see br_nf_dev_queue_xmit():
 * a PPPoE session header plus an Ethernet header.
 */
#define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)

/*
 * Per-CPU scratch data (instantiated as brnf_frag_data_storage).
 * NOTE(review): the users of these fields are outside this excerpt; judging
 * by the names this saves L2 header and VLAN state - confirm against the
 * fragmentation path.
 */
struct brnf_frag_data {
        local_lock_t bh_lock;
        /* sized for the largest L2 header, NF_BRIDGE_MAX_MAC_HEADER_LENGTH */
        char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH];
        u8 encap_size;
        u8 size;
        u16 vlan_tci;
        __be16 vlan_proto;
};

/* One brnf_frag_data per CPU; only its bh_lock is explicitly initialised. */
static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage) = {
        .bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

/* Drop the SKB_EXT_BRIDGE_NF extension from @skb via skb_ext_del(). */
static void nf_bridge_info_free(struct sk_buff *skb)
{
        skb_ext_del(skb, SKB_EXT_BRIDGE_NF);
}

/*
 * Return the bridge device that @dev is a port of, or NULL when
 * br_port_get_rcu() finds no bridge port for @dev.
 */
static inline struct net_device *bridge_parent(const struct net_device *dev)
{
        struct net_bridge_port *port = br_port_get_rcu(dev);

        if (!port)
                return NULL;

        return port->br->dev;
}

/*
 * Return whatever skb_ext_add() yields for the SKB_EXT_BRIDGE_NF extension
 * of @skb.
 * NOTE(review): presumably NULL on allocation failure - confirm against
 * skb_ext_add().
 */
static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
{
        return skb_ext_add(skb, SKB_EXT_BRIDGE_NF);
}

unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
{
        switch (skb->protocol) {
        case __cpu_to_be16(ETH_P_8021Q):
                return VLAN_HLEN;
        case __cpu_to_be16(ETH_P_PPP_SES):
                return PPPOE_SES_HLEN;
        default:
                return 0;
        }
}

/*
 * Pull the encapsulation header (if any) off the front of @skb and move
 * the network header offset forward by the same amount.
 */
static inline void nf_bridge_pull_encap_header(struct sk_buff *skb)
{
        const unsigned int encap_len = nf_bridge_encap_header_len(skb);

        skb_pull(skb, encap_len);
        skb->network_header += encap_len;
}

/*
 * Same as nf_bridge_pull_encap_header(), but the pull is done with
 * skb_pull_rcsum() instead of skb_pull().
 */
static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb)
{
        const unsigned int encap_len = nf_bridge_encap_header_len(skb);

        skb_pull_rcsum(skb, encap_len);
        skb->network_header += encap_len;
}

/* When handing a packet over to the IP layer
 * check whether we have a skb that is in the
 * expected format
 *
 * Returns 0 when the IPv4 header looks sane (the skb is then trimmed to the
 * IP total length and its IPCB is zeroed), or -1 when the packet should be
 * dropped. The matching IPSTATS counter of @net is bumped on every failure
 * path except where noted below.
 */

static int br_validate_ipv4(struct net *net, struct sk_buff *skb)
{
        const struct iphdr *iph;
        u32 len;

        /* Need at least a minimal IPv4 header to look at. */
        if (!pskb_may_pull(skb, sizeof(struct iphdr)))
                goto inhdr_error;

        iph = ip_hdr(skb);

        /* Basic sanity checks */
        if (iph->ihl < 5 || iph->version != 4)
                goto inhdr_error;

        /* Now make sure the full header, options included, is available. */
        if (!pskb_may_pull(skb, iph->ihl*4))
                goto inhdr_error;

        /* Header pointer is re-read after the pull - presumably because
         * pskb_may_pull() may relocate the data; confirm.
         */
        iph = ip_hdr(skb);
        if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
                goto csum_error;

        len = skb_ip_totlen(skb);
        if (skb->len < len) {
                /* skb holds fewer bytes than the header claims */
                __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
                goto drop;
        } else if (len < (iph->ihl*4))
                /* total length smaller than the header itself */
                goto inhdr_error;

        /* Cut the skb down to the IP total length. */
        if (pskb_trim_rcsum(skb, len)) {
                __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
        /* We should really parse IP options here but until
         * somebody who actually uses IP options complains to
         * us we'll just silently ignore the options because
         * we're lazy!
         */
        return 0;

        /* The labels fall through: a checksum error is counted as both a
         * checksum error and a header error.
         */
csum_error:
        __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
        __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
        return -1;
}

/*
 * Restore skb->protocol from the original protocol recorded in the
 * bridge-netfilter info of @skb: ETH_P_8021Q for BRNF_PROTO_8021Q,
 * ETH_P_PPP_SES for BRNF_PROTO_PPPOE. Any other value, including
 * BRNF_PROTO_UNCHANGED, leaves skb->protocol alone.
 *
 * The bridge-netfilter info is dereferenced without a NULL check.
 */
void nf_bridge_update_protocol(struct sk_buff *skb)
{
        const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

        if (nf_bridge->orig_proto == BRNF_PROTO_8021Q)
                skb->protocol = htons(ETH_P_8021Q);
        else if (nf_bridge->orig_proto == BRNF_PROTO_PPPOE)
                skb->protocol = htons(ETH_P_PPP_SES);
}

/* Obtain the correct destination MAC address, while preserving the original
 * source MAC address. If we already know this address, we just copy it. If we
 * don't, we use the neighbour framework to find out. In both cases, we make
 * sure that br_handle_frame_finish() is called afterwards.
 *
 * Returns the result of br_handle_frame_finish() or of the neighbour output
 * function. When the skb has to be dropped (no bridge parent, no neighbour,
 * or no physical input device) it is freed here and 0 is returned.
 */
int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct neighbour *neigh;
        struct dst_entry *dst;

        /* Switch to the bridge device; drop if skb->dev is not a bridge port. */
        skb->dev = bridge_parent(skb->dev);
        if (!skb->dev)
                goto free_skb;
        dst = skb_dst(skb);
        neigh = dst_neigh_lookup_skb(dst, skb);
        if (neigh) {
                struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
                int ret;

                /* Address already known: the neighbour is connected and has
                 * a cached hardware header.
                 */
                if ((READ_ONCE(neigh->nud_state) & NUD_CONNECTED) &&
                    READ_ONCE(neigh->hh.hh_len)) {
                        struct net_device *br_indev;

                        br_indev = nf_bridge_get_physindev(skb, net);
                        if (!br_indev) {
                                /* drop the reference taken by the lookup */
                                neigh_release(neigh);
                                goto free_skb;
                        }

                        neigh_hh_bridge(&neigh->hh, skb);
                        skb->dev = br_indev;

                        ret = br_handle_frame_finish(net, sk, skb);
                } else {
                        /* the neighbour function below overwrites the complete
                         * MAC header, so we save the Ethernet source address and
                         * protocol number.
                         */
                        skb_copy_from_linear_data_offset(skb,
                                                         -(ETH_HLEN-ETH_ALEN),
                                                         nf_bridge->neigh_header,
                                                         ETH_HLEN-ETH_ALEN);
                        /* tell br_dev_xmit to continue with forwarding */
                        nf_bridge->bridged_dnat = 1;
                        /* FIXME Need to refragment */
                        ret = READ_ONCE(neigh->output)(neigh, skb);
                }
                neigh_release(neigh);
                return ret;
        }
free_skb:
        kfree_skb(skb);
        return 0;
}

static inline bool
br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb,
                             const struct nf_bridge_info *nf_bridge)
{
        return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr;
}

/* This requires some explaining. If DNAT has taken place,
 * we will need to fix up the destination Ethernet address.
 * This is also true when SNAT takes place (for the reply direction).
 *
 * There are two cases to consider:
 * 1. The packet was DNAT'ed to a device in the same bridge
 *    port group as it was received on. We can still bridge
 *    the packet.
 * 2. The packet was DNAT'ed to a different device, either
 *    a non-bridged device or another bridge port group.
 *    The packet will need to be routed.
 *
 * The correct way of distinguishing between these two cases is to
 * call ip_route_input() and to look at skb->dst->dev, which is
 * changed to the destination device if ip_route_input() succeeds.
 *
 * Let's first consider the case that ip_route_input() succeeds:
 *
 * If the output device equals the logical bridge device the packet
 * came in on, we can consider this bridging. The corresponding MAC
 * address will be obtained in br_nf_pre_routing_finish_bridge.
 * Otherwise, the packet is considered to be routed and we just
 * change the destination MAC address so that the packet will
 * later be passed up to the IP stack to be routed. For a redirected
 * packet, ip_route_input() will give back the localhost as output device,
 * which differs from the bridge device.
 *
 * Let's now consider the case that ip_route_input() fails:
 *
 * This can be because the destination address is martian, in which case
 * the packet will be dropped.
 * If IP forwarding is disabled, ip_route_input() will fail, while
 * ip_route_output_key() can return success. The source
 * address for ip_route_output_key() is set to zero, so ip_route_output_key()
 * thinks we're handling a locally generated packet and won't care
 * if IP forwarding is enabled. If the output device equals the logical bridge
 * device, we proceed as if ip_route_input() succeeded. If it differs from the
 * logical bridge port or if ip_route_output_key() fails we drop the packet.
 *
 * This function always returns 0: the skb is either freed here or handed on
 * through br_nf_hook_thresh().
 */
static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb->dev, *br_indev;
        struct iphdr *iph = ip_hdr(skb);
        struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
        struct rtable *rt;
        int err;

        /* Without a physical input device the packet cannot be processed
         * any further; free it.
         */
        br_indev = nf_bridge_get_physindev(skb, net);
        if (!br_indev) {
                kfree_skb(skb);
                return 0;
        }

        /* Copy the fragment size limit from the IPv4 control block into
         * nf_bridge.
         */
        nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;

        /* Undo the PACKET_HOST override that setup_pre_routing() applied. */
        if (nf_bridge->pkt_otherhost) {
                skb->pkt_type = PACKET_OTHERHOST;
                nf_bridge->pkt_otherhost = false;
        }
        nf_bridge->in_prerouting = 0;
        if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) {
                if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
                        struct in_device *in_dev = __in_dev_get_rcu(dev);

                        /* If err equals -EHOSTUNREACH the error is due to a
                         * martian destination or due to the fact that
                         * forwarding is disabled. For most martian packets,
                         * ip_route_output_key() will fail. It won't fail for 2 types of
                         * martian destinations: loopback destinations and destination
                         * 0.0.0.0. In both cases the packet will be dropped because the
                         * destination is the loopback device and not the bridge. */
                        if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev))
                                goto free_skb;

                        rt = ip_route_output(net, iph->daddr, 0,
                                             RT_TOS(iph->tos), 0,
                                             RT_SCOPE_UNIVERSE);
                        if (!IS_ERR(rt)) {
                                /* - Bridged-and-DNAT'ed traffic doesn't
                                 *   require ip_forwarding. */
                                if (rt->dst.dev == dev) {
                                        skb_dst_drop(skb);
                                        skb_dst_set(skb, &rt->dst);
                                        goto bridged_dnat;
                                }
                                /* Route leaves through another device: release it
                                 * and fall through to the drop below.
                                 */
                                ip_rt_put(rt);
                        }
free_skb:
                        kfree_skb(skb);
                        return 0;
                } else {
                        if (skb_dst(skb)->dev == dev) {
                                /* Also reached by goto from the
                                 * ip_route_output() success path above.
                                 */
bridged_dnat:
                                skb->dev = br_indev;
                                nf_bridge_update_protocol(skb);
                                nf_bridge_push_encap_header(skb);
                                br_nf_hook_thresh(NF_BR_PRE_ROUTING,
                                                  net, sk, skb, skb->dev,
                                                  NULL,
                                                  br_nf_pre_routing_finish_bridge);
                                return 0;
                        }
                        /* Routed case: address the frame to the device in
                         * 'dev' and mark it as destined for this host.
                         */
                        ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
                        skb->pkt_type = PACKET_HOST;
                }
        } else {
                /* Destination address unchanged: replace the skb's dst with
                 * the rtable returned by bridge_parent_rtable().
                 */
                rt = bridge_parent_rtable(br_indev);
                if (!rt) {
                        kfree_skb(skb);
                        return 0;
                }
                skb_dst_drop(skb);
                skb_dst_set_noref(skb, &rt->dst);
        }

        /* Common exit: restore the input device, protocol and encapsulation
         * header, then run the remaining bridge PRE_ROUTING hooks.
         */
        skb->dev = br_indev;
        nf_bridge_update_protocol(skb);
        nf_bridge_push_encap_header(skb);
        br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL,
                          br_handle_frame_finish);
        return 0;
}

/* Pick the device to present to the IP hooks for a frame seen on @dev.
 *
 * This is the bridge parent of @dev, unless pass_vlan_indev is enabled for
 * the namespace, the skb carries a VLAN tag and a matching VLAN device is
 * found on top of the bridge; then that VLAN device is returned instead.
 */
static struct net_device *brnf_get_logical_dev(struct sk_buff *skb,
                                               const struct net_device *dev,
                                               const struct net *net)
{
        struct brnf_net *brnet = net_generic(net, brnf_net_id);
        struct net_device *bridge = bridge_parent(dev);
        struct net_device *vlan_dev;

        if (!brnet->pass_vlan_indev)
                return bridge;
        if (!skb_vlan_tag_present(skb))
                return bridge;

        vlan_dev = __vlan_find_dev_deep_rcu(bridge, skb->vlan_proto,
                                            skb_vlan_tag_get(skb) & VLAN_VID_MASK);
        if (vlan_dev)
                return vlan_dev;

        return bridge;
}

/* Some common code for IPv4/IPv6 */
struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net)
{
        struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

        if (skb->pkt_type == PACKET_OTHERHOST) {
                skb->pkt_type = PACKET_HOST;
                nf_bridge->pkt_otherhost = true;
        }

        nf_bridge->in_prerouting = 1;
        nf_bridge->physinif = skb->dev->ifindex;
        skb->dev = brnf_get_logical_dev(skb, skb->dev, net);

        if (skb->protocol == htons(ETH_P_8021Q))
                nf_bridge->orig_proto = BRNF_PROTO_8021Q;
        else if (skb->protocol == htons(ETH_P_PPP_SES))
                nf_bridge->orig_proto = BRNF_PROTO_PPPOE;

        /* Must drop socket now because of tproxy. */
        skb_orphan(skb);
        return skb->dev;
}

/* Direct IPv6 traffic to br_nf_pre_routing_ipv6.
 * Replicate the checks that IPv4 does on packet reception.
 * Set skb->dev to the bridge device (i.e. parent of the
 * receiving device) to make netfilter happy, the REDIRECT
 * target in particular.  Save the original destination IP
 * address to be able to detect DNAT afterwards. */
static unsigned int br_nf_pre_routing(void *priv,
                                      struct sk_buff *skb,
                                      const struct nf_hook_state *state)
{
        struct nf_bridge_info *nf_bridge;
        struct net_bridge_port *p;
        struct net_bridge *br;
        __u32 len = nf_bridge_encap_header_len(skb);
        struct brnf_net *brnet;

        if (unlikely(!pskb_may_pull(skb, len)))
                return NF_DROP_REASON(skb, SKB_DROP_REASON_PKT_TOO_SMALL, 0);

        p = br_port_get_rcu(state->in);
        if (p == NULL)
                return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);
        br = p->br;

        brnet = net_generic(state->net, brnf_net_id);
        if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
            is_pppoe_ipv6(skb, state->net)) {
                if (!brnet->call_ip6tables &&
                    !br_opt_get(br, BROPT_NF_CALL_IP6TABLES))
                        return NF_ACCEPT;
                if (!ipv6_mod_enabled()) {
                        pr_warn_once("Module ipv6 is disabled, so call_ip6tables is not supported.");
                        return NF_DROP_REASON(skb, SKB_DROP_REASON_IPV6DISABLED, 0);
                }

                nf_bridge_pull_encap_header_rcsum(skb);
                return br_nf_pre_routing_ipv6(priv, skb, state);
        }

        if (!brnet->call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
                return NF_ACCEPT;

        if (!IS_IP(skb) && !is_vlan_ip(skb, state->net) &&
            !is_pppoe_ip(skb, state->net))
                return NF_ACCEPT;

        nf_bridge_pull_encap_header_rcsum(skb);

        if (br_validate_ipv4(state->net, skb))
                return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);

        if (!nf_bridge_alloc(skb))
                return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);
        if (!setup_pre_routing(skb, state->net))
                return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);

        nf_bridge = nf_bridge_info_get(skb);
        nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr;

        skb->protocol = htons(ETH_P_IP);
        skb->transport_header = skb->network_header + ip_hdr(skb)->ihl * 4;

        NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->net, state->sk, skb,
                skb->dev, NULL,
                br_nf_pre_routing_finish);

        return NF_STOLEN;
}

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* conntrack's nf_confirm logic cannot handle cloned skbs referencing
 * the same nf_conn entry, which will happen for multicast (broadcast)
 * frames on bridges.
 *
 * Example:
 *      macvlan0
 *      br0
 *  ethX  ethY
 *
 * ethX (or Y) receives multicast or broadcast packet containing
 * an IP packet, not yet in conntrack table.
 *
 * 1. skb passes through bridge and fake-ip (br_netfilter) Prerouting.
 *    -> skb->_nfct now references an unconfirmed entry
 * 2. skb is broad/mcast packet. bridge now passes clones out on each bridge
 *    interface.
 * 3. skb gets passed up the stack.
 * 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb
 *    and schedules a work queue to send them out on the lower devices.
 *
 *    The clone skb->_nfct is not a copy, it is the same entry as the
 *    original skb.  The macvlan rx handler then returns RX_HANDLER_PASS.
 * 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb.
 *
 * The Macvlan broadcast worker and normal confirm path will race.
 *
 * This race will not happen if step 2 already confirmed a clone. In that
 * case later steps perform skb_clone() with skb->_nfct already confirmed (in
 * hash table).  This works fine.
 *
 * But such confirmation won't happen when eb/ip/nftables rules dropped the
 * packets before they reached the nf_confirm step in postrouting.
 *
 * Work around this problem by explicit confirmation of the entry at
 * LOCAL_IN time, before upper layer has a chance to clone the unconfirmed
 * entry.
 *
 */
/* NF_BR_LOCAL_IN hook: confirm a still-unconfirmed conntrack entry before
 * the skb is passed further up.
 *
 * Returns NF_ACCEPT when there is nothing to confirm (promiscuous-mode
 * packet, no conntrack attached, PACKET_HOST, entry already confirmed,
 * entry referenced elsewhere, or no conntrack hook registered), NF_STOLEN
 * when the confirm callback stole the skb, and the confirm callback's
 * verdict otherwise.
 */
static unsigned int br_nf_local_in(void *priv,
                                   struct sk_buff *skb,
                                   const struct nf_hook_state *state)
{
        bool promisc = BR_INPUT_SKB_CB(skb)->promisc;
        struct nf_conntrack *nfct = skb_nfct(skb);
        const struct nf_ct_hook *ct_hook;
        struct nf_conn *ct;
        int ret;

        /* Packets flagged promisc in the bridge control block go up
         * without any conntrack attached.
         */
        if (promisc) {
                nf_reset_ct(skb);
                return NF_ACCEPT;
        }

        if (!nfct || skb->pkt_type == PACKET_HOST)
                return NF_ACCEPT;

        ct = container_of(nfct, struct nf_conn, ct_general);
        if (likely(nf_ct_is_confirmed(ct)))
                return NF_ACCEPT;

        /* An unconfirmed entry must be referenced by this skb only. If
         * something else holds a reference too, do not merely warn and
         * confirm it anyway: detach the entry from the skb and let the
         * packet through untracked.
         */
        if (WARN_ON_ONCE(refcount_read(&nfct->use) != 1)) {
                nf_reset_ct(skb);
                return NF_ACCEPT;
        }

        WARN_ON_ONCE(skb_shared(skb));

        /* We can't call nf_confirm here, it would create a dependency
         * on nf_conntrack module.
         */
        ct_hook = rcu_dereference(nf_ct_hook);
        if (!ct_hook) {
                /* No conntrack hook registered: drop our reference. */
                skb->_nfct = 0ul;
                nf_conntrack_put(nfct);
                return NF_ACCEPT;
        }

        /* The confirm callback runs with the encapsulation header pulled;
         * push it back unless the skb was stolen.
         */
        nf_bridge_pull_encap_header(skb);
        ret = ct_hook->confirm(skb);
        switch (ret & NF_VERDICT_MASK) {
        case NF_STOLEN:
                return NF_STOLEN;
        default:
                nf_bridge_push_encap_header(skb);
                break;
        }

        ct = container_of(nfct, struct nf_conn, ct_general);
        WARN_ON_ONCE(!nf_ct_is_confirmed(ct));

        return ret;
}
#endif

/* PF_BRIDGE/FORWARD *************************************************/
/* Continuation after the IP/ARP FORWARD hooks: put the bridge view of the
 * packet back together and run the remaining NF_BR_FORWARD hooks, finishing
 * with br_forward_finish(). Always returns 0.
 */
static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct nf_bridge_info *info = nf_bridge_info_get(skb);
        struct net_device *indev;

        if (IS_ARP(skb) || is_vlan_arp(skb, net)) {
                /* For ARP the input device was stored in skb->cb. */
                indev = *((struct net_device **)(skb->cb));
        } else {
                if (skb->protocol == htons(ETH_P_IP))
                        info->frag_max_size = IPCB(skb)->frag_max_size;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        info->frag_max_size = IP6CB(skb)->frag_max_size;

                indev = nf_bridge_get_physindev(skb, net);
                if (!indev) {
                        kfree_skb(skb);
                        return 0;
                }

                if (info->pkt_otherhost) {
                        skb->pkt_type = PACKET_OTHERHOST;
                        info->pkt_otherhost = false;
                }
                nf_bridge_update_protocol(skb);
        }

        nf_bridge_push_encap_header(skb);

        br_nf_hook_thresh(NF_BR_FORWARD, net, sk, skb, indev, skb->dev,
                          br_forward_finish);
        return 0;
}


/* Run a bridged IPv4 or IPv6 packet (selected by @pf) through the
 * NF_INET_FORWARD hooks.
 *
 * Packets without nf_bridge_info are accepted untouched. Otherwise the
 * encapsulation header is pulled, the IP header validated and the hooks are
 * invoked with the logical input device and the bridge parent of the output
 * device; br_nf_forward_finish() continues from there.
 */
static unsigned int br_nf_forward_ip(struct sk_buff *skb,
                                     const struct nf_hook_state *state,
                                     u8 pf)
{
        struct nf_bridge_info *info = nf_bridge_info_get(skb);
        struct net_device *out_parent;

        if (!info)
                return NF_ACCEPT;

        /* Need exclusive nf_bridge_info since we might have multiple
         * different physoutdevs. */
        if (!nf_bridge_unshare(skb))
                return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);

        /* Fetch the info again now that it has been unshared. */
        info = nf_bridge_info_get(skb);
        if (!info)
                return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);

        out_parent = bridge_parent(state->out);
        if (!out_parent)
                return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);

        nf_bridge_pull_encap_header(skb);

        if (skb->pkt_type == PACKET_OTHERHOST) {
                skb->pkt_type = PACKET_HOST;
                info->pkt_otherhost = true;
        }

        switch (pf) {
        case NFPROTO_IPV4:
                if (br_validate_ipv4(state->net, skb))
                        return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);
                IPCB(skb)->frag_max_size = info->frag_max_size;
                skb->protocol = htons(ETH_P_IP);
                break;
        case NFPROTO_IPV6:
                if (br_validate_ipv6(state->net, skb))
                        return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);
                IP6CB(skb)->frag_max_size = info->frag_max_size;
                skb->protocol = htons(ETH_P_IPV6);
                break;
        default:
                WARN_ON_ONCE(1);
                return NF_DROP;
        }

        info->physoutdev = skb->dev;

        NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb,
                brnf_get_logical_dev(skb, state->in, state->net),
                out_parent, br_nf_forward_finish);

        return NF_STOLEN;
}

/* Run a bridged ARP packet through the NF_ARP_FORWARD hooks when arptables
 * processing is enabled globally or on the bridge.
 *
 * Only ARP packets whose protocol address length is 4 are passed to the
 * hooks; others are accepted as they are. The input device is stored in
 * skb->cb for br_nf_forward_finish().
 */
static unsigned int br_nf_forward_arp(struct sk_buff *skb,
                                      const struct nf_hook_state *state)
{
        struct net_device **cb_indev = (struct net_device **)(skb->cb);
        struct net_bridge_port *port;
        struct brnf_net *brnet;
        struct net_bridge *br;

        port = br_port_get_rcu(state->out);
        if (!port)
                return NF_ACCEPT;
        br = port->br;

        brnet = net_generic(state->net, brnf_net_id);
        if (!(brnet->call_arptables || br_opt_get(br, BROPT_NF_CALL_ARPTABLES)))
                return NF_ACCEPT;

        if (is_vlan_arp(skb, state->net))
                nf_bridge_pull_encap_header(skb);

        if (unlikely(!pskb_may_pull(skb, sizeof(struct arphdr))))
                return NF_DROP_REASON(skb, SKB_DROP_REASON_PKT_TOO_SMALL, 0);

        if (arp_hdr(skb)->ar_pln == 4) {
                *cb_indev = state->in;
                NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->net, state->sk, skb,
                        state->in, state->out, br_nf_forward_finish);

                return NF_STOLEN;
        }

        /* Not handled here: put the encapsulation header back. */
        if (is_vlan_arp(skb, state->net))
                nf_bridge_push_encap_header(skb);

        return NF_ACCEPT;
}

/* NF_BR_FORWARD hook for the 'purely bridged' case.
 *
 * IP packets are passed to netfilter with indev and outdev set to the
 * bridge device; filtering on the 'real' indev/outdev remains possible
 * through the physdev module. For ARP, indev and outdev are the bridge
 * ports. Anything that is neither IPv4, IPv6 nor ARP is accepted.
 */
static unsigned int br_nf_forward(void *priv,
                                  struct sk_buff *skb,
                                  const struct nf_hook_state *state)
{
        const struct net *net = state->net;

        if (IS_IP(skb) || is_vlan_ip(skb, net) || is_pppoe_ip(skb, net))
                return br_nf_forward_ip(skb, state, NFPROTO_IPV4);

        if (IS_IPV6(skb) || is_vlan_ipv6(skb, net) || is_pppoe_ipv6(skb, net))
                return br_nf_forward_ip(skb, state, NFPROTO_IPV6);

        if (IS_ARP(skb) || is_vlan_arp(skb, net))
                return br_nf_forward_arp(skb, state);

        return NF_ACCEPT;
}

/* Output callback used for each fragment produced by refragmentation.
 *
 * Restores the VLAN tag (if one was saved) and the saved link-layer header
 * from the per-CPU brnf_frag_data_storage, releases the nf_bridge_info and
 * transmits the fragment. Returns 0 after freeing the skb when headroom
 * cannot be made available.
 */
static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct brnf_frag_data *saved = this_cpu_ptr(&brnf_frag_data_storage);

        if (skb_cow_head(skb, saved->size)) {
                kfree_skb(skb);
                return 0;
        }

        if (saved->vlan_proto)
                __vlan_hwaccel_put_tag(skb, saved->vlan_proto, saved->vlan_tci);

        /* Write the saved header in front of the data, then expose only the
         * encapsulation part of it.
         */
        skb_copy_to_linear_data_offset(skb, -saved->size, saved->mac, saved->size);
        __skb_push(skb, saved->encap_size);

        nf_bridge_info_free(skb);

        return br_dev_queue_push_xmit(net, sk, skb);
}

static int
br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                  int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        unsigned int mtu = ip_skb_dst_mtu(sk, skb);
        struct iphdr *iph = ip_hdr(skb);

        if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
                     (IPCB(skb)->frag_max_size &&
                      IPCB(skb)->frag_max_size > mtu))) {
                IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        return ip_do_fragment(net, sk, skb, output);
}

/* Number of bytes to reserve out of the MTU for this skb: PPPOE_SES_HLEN
 * when the original protocol was PPPoE, 0 otherwise.
 */
static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb)
{
        const struct nf_bridge_info *info = nf_bridge_info_get(skb);

        return info->orig_proto == BRNF_PROTO_PPPOE ? PPPOE_SES_HLEN : 0;
}

/* Continuation after the IP POST_ROUTING hooks: restore the bridge view of
 * the packet and transmit it, refragmenting IPv4/IPv6 packets that do not
 * fit the effective MTU.
 *
 * Returns the result of br_dev_queue_push_xmit() or of the fragmentation
 * routine, 0 when the packet is dropped because IP header validation
 * failed, and -EMSGSIZE when IPv6 fragmentation is needed but no IPv6 ops
 * are available.
 */
static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
        unsigned int mtu, mtu_reserved;
        int ret;

        mtu_reserved = nf_bridge_mtu_reduction(skb);
        mtu = skb->dev->mtu;

        /* Undo the PACKET_HOST override applied before the IP hooks ran. */
        if (nf_bridge->pkt_otherhost) {
                skb->pkt_type = PACKET_OTHERHOST;
                nf_bridge->pkt_otherhost = false;
        }

        /* A recorded fragment size smaller than the device MTU wins. */
        if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu)
                mtu = nf_bridge->frag_max_size;

        nf_bridge_update_protocol(skb);
        nf_bridge_push_encap_header(skb);

        /* GSO packets and packets that fit are sent as they are. */
        if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) {
                nf_bridge_info_free(skb);
                return br_dev_queue_push_xmit(net, sk, skb);
        }

        /* This is wrong! We should preserve the original fragment
         * boundaries by preserving frag_list rather than refragmenting.
         */
        if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) &&
            skb->protocol == htons(ETH_P_IP)) {
                struct brnf_frag_data *data;

                if (br_validate_ipv4(net, skb))
                        goto drop;

                IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;

                /* The per-CPU storage is read back by br_nf_push_frag_xmit()
                 * for every fragment, so the lock is held until
                 * fragmentation has returned.
                 */
                local_lock_nested_bh(&brnf_frag_data_storage.bh_lock);
                data = this_cpu_ptr(&brnf_frag_data_storage);

                if (skb_vlan_tag_present(skb)) {
                        data->vlan_tci = skb->vlan_tci;
                        data->vlan_proto = skb->vlan_proto;
                } else {
                        data->vlan_proto = 0;
                }

                data->encap_size = nf_bridge_encap_header_len(skb);
                data->size = ETH_HLEN + data->encap_size;

                /* Save the link-layer header that precedes skb->data. */
                skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
                                                 data->size);

                ret = br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit);
                local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
                return ret;
        }
        if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) &&
            skb->protocol == htons(ETH_P_IPV6)) {
                const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
                struct brnf_frag_data *data;

                if (br_validate_ipv6(net, skb))
                        goto drop;

                IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;

                /* NOTE(review): unlike the IPv4 branch above, this branch
                 * does not write data->vlan_proto / data->vlan_tci, although
                 * br_nf_push_frag_xmit() reads vlan_proto. Confirm that a
                 * value left behind by an earlier packet cannot be applied
                 * to these fragments.
                 */
                local_lock_nested_bh(&brnf_frag_data_storage.bh_lock);
                data = this_cpu_ptr(&brnf_frag_data_storage);
                data->encap_size = nf_bridge_encap_header_len(skb);
                data->size = ETH_HLEN + data->encap_size;

                skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
                                                 data->size);

                if (v6ops) {
                        ret = v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit);
                        local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
                        return ret;
                }
                local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);

                /* No IPv6 ops available: the packet cannot be fragmented. */
                kfree_skb(skb);
                return -EMSGSIZE;
        }
        /* Neither refragmentation branch applied: transmit unchanged. */
        nf_bridge_info_free(skb);
        return br_dev_queue_push_xmit(net, sk, skb);
 drop:
        kfree_skb(skb);
        return 0;
}

/* PF_BRIDGE/POST_ROUTING ********************************************/
static unsigned int br_nf_post_routing(void *priv,
                                       struct sk_buff *skb,
                                       const struct nf_hook_state *state)
{
        struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
        struct net_device *realoutdev = bridge_parent(skb->dev);
        u_int8_t pf;

        /* if nf_bridge is set, but ->physoutdev is NULL, this packet came in
         * on a bridge, but was delivered locally and is now being routed:
         *
         * POST_ROUTING was already invoked from the ip stack.
         */
        if (!nf_bridge || !nf_bridge->physoutdev)
                return NF_ACCEPT;

        if (!realoutdev)
                return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);

        if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
            is_pppoe_ip(skb, state->net))
                pf = NFPROTO_IPV4;
        else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
                 is_pppoe_ipv6(skb, state->net))
                pf = NFPROTO_IPV6;
        else
                return NF_ACCEPT;

        if (skb->pkt_type == PACKET_OTHERHOST) {
                skb->pkt_type = PACKET_HOST;
                nf_bridge->pkt_otherhost = true;
        }

        nf_bridge_pull_encap_header(skb);
        if (pf == NFPROTO_IPV4)
                skb->protocol = htons(ETH_P_IP);
        else
                skb->protocol = htons(ETH_P_IPV6);

        NF_HOOK(pf, NF_INET_POST_ROUTING, state->net, state->sk, skb,
                NULL, realoutdev,
                br_nf_dev_queue_xmit);

        return NF_STOLEN;
}

/* IP/SABOTAGE *****************************************************/
/* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING
 * for the second time. */
/* PF_INET(6)/PRE_ROUTING hook registered at first priority.
 *
 * For an skb that carries nf_bridge_info, is not flagged in_prerouting and
 * whose device is neither an L3 master nor an L3 slave, the remaining hooks
 * are skipped: sabotage_in_done is set, okfn is called directly and the skb
 * is reported as stolen. Everything else is accepted.
 */
static unsigned int ip_sabotage_in(void *priv,
                                   struct sk_buff *skb,
                                   const struct nf_hook_state *state)
{
        struct nf_bridge_info *info = nf_bridge_info_get(skb);

        if (!info)
                return NF_ACCEPT;

        if (info->sabotage_in_done)
                return NF_ACCEPT;

        if (info->in_prerouting ||
            netif_is_l3_master(skb->dev) ||
            netif_is_l3_slave(skb->dev))
                return NF_ACCEPT;

        info->sabotage_in_done = 1;
        state->okfn(state->net, state->sk, skb);

        return NF_STOLEN;
}

/* Called when br_netfilter has called into iptables/netfilter and DNAT has
 * taken place on a bridge-forwarded packet.
 *
 * neigh->output has built a new MAC header with the local br0 MAC as source
 * address. Put back the saved source MAC address (and protocol) of the
 * bridged packet, then invoke the bridge forward logic to transmit it.
 */
static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
{
        struct nf_bridge_info *info = nf_bridge_info_get(skb);
        struct net_device *br_indev;

        BUILD_BUG_ON(sizeof(info->neigh_header) != (ETH_HLEN - ETH_ALEN));

        br_indev = nf_bridge_get_physindev(skb, dev_net(skb->dev));
        if (!br_indev) {
                kfree_skb(skb);
                return;
        }

        skb_pull(skb, ETH_HLEN);

        skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN),
                                       info->neigh_header,
                                       ETH_HLEN - ETH_ALEN);

        info->bridged_dnat = 0;
        info->physoutdev = NULL;

        skb->dev = br_indev;
        br_handle_frame_finish(dev_net(br_indev), NULL, skb);
}

/* br_dev_xmit hook: take over skbs flagged bridged_dnat.
 *
 * Returns 1 when the skb was handed to
 * br_nf_pre_routing_finish_bridge_slow(), 0 when it was left alone.
 */
static int br_nf_dev_xmit(struct sk_buff *skb)
{
        const struct nf_bridge_info *info = nf_bridge_info_get(skb);

        if (!info || !info->bridged_dnat)
                return 0;

        br_nf_pre_routing_finish_bridge_slow(skb);
        return 1;
}

/* Bridge callbacks provided by br_netfilter; br_dev_xmit_hook points at
 * br_nf_dev_xmit().
 */
static const struct nf_br_ops br_ops = {
        .br_dev_xmit_hook =        br_nf_dev_xmit,
};

/* Netfilter hooks registered per network namespace by brnf_device_event().
 *
 * For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because
 * br_dev_queue_push_xmit is called afterwards */
static const struct nf_hook_ops br_nf_ops[] = {
        /* Bridge PRE_ROUTING */
        {
                .hook = br_nf_pre_routing,
                .pf = NFPROTO_BRIDGE,
                .hooknum = NF_BR_PRE_ROUTING,
                .priority = NF_BR_PRI_BRNF,
        },
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        /* Bridge LOCAL_IN, only built with conntrack support */
        {
                .hook = br_nf_local_in,
                .pf = NFPROTO_BRIDGE,
                .hooknum = NF_BR_LOCAL_IN,
                .priority = NF_BR_PRI_LAST,
        },
#endif
        /* Bridge FORWARD */
        {
                .hook = br_nf_forward,
                .pf = NFPROTO_BRIDGE,
                .hooknum = NF_BR_FORWARD,
                .priority = NF_BR_PRI_BRNF,
        },
        /* Bridge POST_ROUTING, last priority (see above) */
        {
                .hook = br_nf_post_routing,
                .pf = NFPROTO_BRIDGE,
                .hooknum = NF_BR_POST_ROUTING,
                .priority = NF_BR_PRI_LAST,
        },
        /* IPv4 PRE_ROUTING, first priority */
        {
                .hook = ip_sabotage_in,
                .pf = NFPROTO_IPV4,
                .hooknum = NF_INET_PRE_ROUTING,
                .priority = NF_IP_PRI_FIRST,
        },
        /* IPv6 PRE_ROUTING, first priority */
        {
                .hook = ip_sabotage_in,
                .pf = NFPROTO_IPV6,
                .hooknum = NF_INET_PRE_ROUTING,
                .priority = NF_IP6_PRI_FIRST,
        },
};

/* Netdevice notifier: register the br_netfilter hooks in a network
 * namespace the first time a bridge master device is registered there.
 *
 * Returns NOTIFY_DONE for events that are not of interest, NOTIFY_OK when
 * the hooks are (already) registered and NOTIFY_BAD when registration
 * fails.
 */
static int brnf_device_event(struct notifier_block *unused, unsigned long event,
                             void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct brnf_net *brnet;
        struct net *net;

        if (event != NETDEV_REGISTER)
                return NOTIFY_DONE;
        if (!netif_is_bridge_master(dev))
                return NOTIFY_DONE;

        ASSERT_RTNL();

        net = dev_net(dev);
        brnet = net_generic(net, brnf_net_id);

        if (!brnet->enabled) {
                if (nf_register_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)))
                        return NOTIFY_BAD;

                brnet->enabled = true;
        }

        return NOTIFY_OK;
}

/* Notifier block whose callback is brnf_device_event(). */
static struct notifier_block brnf_notifier __read_mostly = {
        .notifier_call = brnf_device_event,
};

/* recursively invokes nf_hook_slow (again), skipping already-called
 * hooks (< NF_BR_PRI_BRNF).
 *
 * Called with rcu read lock held.
 */
int br_nf_hook_thresh(unsigned int hook, struct net *net,
                      struct sock *sk, struct sk_buff *skb,
                      struct net_device *indev,
                      struct net_device *outdev,
                      int (*okfn)(struct net *, struct sock *,
                                  struct sk_buff *))
{
        const struct nf_hook_entries *e;
        struct nf_hook_state state;
        struct nf_hook_ops **ops;
        unsigned int i;
        int ret;

        e = rcu_dereference(net->nf.hooks_bridge[hook]);
        if (!e)
                return okfn(net, sk, skb);

        ops = nf_hook_entries_get_hook_ops(e);
        for (i = 0; i < e->num_hook_entries; i++) {
                /* These hooks have already been called */
                if (ops[i]->priority < NF_BR_PRI_BRNF)
                        continue;

                /* These hooks have not been called yet, run them. */
                if (ops[i]->priority > NF_BR_PRI_BRNF)
                        break;

                /* take a closer look at NF_BR_PRI_BRNF. */
                if (ops[i]->hook == br_nf_pre_routing) {
                        /* This hook diverted the skb to this function,
                         * hooks after this have not been run yet.
                         */
                        i++;
                        break;
                }
        }

        nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev,
                           sk, net, okfn);

        ret = nf_hook_slow(skb, &state, e, i);
        if (ret == 1)
                ret = okfn(net, sk, skb);

        return ret;
}

#ifdef CONFIG_SYSCTL
/*
 * proc handler shared by all bridge-nf sysctl entries: let proc_dointvec()
 * do the transfer, then, on a write, collapse any non-zero stored value to 1.
 * Returns whatever proc_dointvec() returned.
 */
static
int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
                            void *buffer, size_t *lenp, loff_t *ppos)
{
        int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
        int *value = ctl->data;

        if (!write)
                return ret;

        if (*value)
                *value = 1;

        return ret;
}

/*
 * Template for the bridge-nf sysctl entries.  The .data pointers are filled
 * in per namespace by br_netfilter_sysctl_init_net(), which addresses the
 * entries by index (table[0] .. table[5]), so the order here must stay in
 * sync with that function.
 */
static struct ctl_table brnf_table[] = {
        {
                .procname        = "bridge-nf-call-arptables",
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = brnf_sysctl_call_tables,
        },
        {
                .procname        = "bridge-nf-call-iptables",
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = brnf_sysctl_call_tables,
        },
        {
                .procname        = "bridge-nf-call-ip6tables",
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = brnf_sysctl_call_tables,
        },
        {
                .procname        = "bridge-nf-filter-vlan-tagged",
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = brnf_sysctl_call_tables,
        },
        {
                .procname        = "bridge-nf-filter-pppoe-tagged",
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = brnf_sysctl_call_tables,
        },
        {
                .procname        = "bridge-nf-pass-vlan-input-dev",
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = brnf_sysctl_call_tables,
        },
};

/* Reset every bridge-nf sysctl value stored in @brnf to its default. */
static inline void br_netfilter_sysctl_default(struct brnf_net *brnf)
{
        /* The call-{arp,ip,ip6}tables switches default to 1. */
        brnf->call_arptables = 1;
        brnf->call_iptables = 1;
        brnf->call_ip6tables = 1;

        /* The vlan/pppoe filter and vlan input-dev switches default to 0. */
        brnf->pass_vlan_indev = 0;
        brnf->filter_pppoe_tagged = 0;
        brnf->filter_vlan_tagged = 0;
}

/*
 * Register the "net/bridge" sysctl table for @net.
 *
 * init_net uses brnf_table directly; every other namespace gets its own
 * kmemdup()'ed copy.  The entries are pointed at the namespace's brnf_net
 * fields, which are reset to their defaults before registration.
 *
 * Returns 0 on success or -ENOMEM if the copy or the registration fails.
 */
static int br_netfilter_sysctl_init_net(struct net *net)
{
        struct brnf_net *brnet = net_generic(net, brnf_net_id);
        struct ctl_table *table;

        if (net_eq(net, &init_net)) {
                table = brnf_table;
        } else {
                table = kmemdup(brnf_table, sizeof(brnf_table), GFP_KERNEL);
                if (!table)
                        return -ENOMEM;
        }

        /* Indices follow the entry order of brnf_table. */
        table[0].data = &brnet->call_arptables;
        table[1].data = &brnet->call_iptables;
        table[2].data = &brnet->call_ip6tables;
        table[3].data = &brnet->filter_vlan_tagged;
        table[4].data = &brnet->filter_pppoe_tagged;
        table[5].data = &brnet->pass_vlan_indev;

        br_netfilter_sysctl_default(brnet);

        brnet->ctl_hdr = register_net_sysctl_sz(net, "net/bridge", table,
                                                ARRAY_SIZE(brnf_table));
        if (brnet->ctl_hdr)
                return 0;

        /* Only the duplicated table is ours to free. */
        if (!net_eq(net, &init_net))
                kfree(table);

        return -ENOMEM;
}

/*
 * Unregister the sysctl table of @net and free the table copy that was
 * allocated for namespaces other than init_net.
 */
static void br_netfilter_sysctl_exit_net(struct net *net,
                                         struct brnf_net *brnet)
{
        /*
         * Fetch the table pointer before unregistering.
         * NOTE(review): presumably ctl_hdr must not be dereferenced after
         * unregister_net_sysctl_table() - confirm.
         */
        const struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg;

        unregister_net_sysctl_table(brnet->ctl_hdr);

        /* init_net registered the static brnf_table; nothing to free. */
        if (net_eq(net, &init_net))
                return;

        kfree(table);
}

/* Per-namespace init: only the sysctl table needs to be set up. */
static int __net_init brnf_init_net(struct net *net)
{
        int err = br_netfilter_sysctl_init_net(net);

        return err;
}
#endif

/*
 * Per-namespace exit: remove the netfilter hooks if brnf_device_event()
 * registered them for this namespace, then tear down the sysctl table.
 */
static void __net_exit brnf_exit_net(struct net *net)
{
        struct brnf_net *brnet = net_generic(net, brnf_net_id);

        if (brnet->enabled) {
                nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
                brnet->enabled = false;
        }

#ifdef CONFIG_SYSCTL
        br_netfilter_sysctl_exit_net(net, brnet);
#endif
}

/*
 * Per-network-namespace operations.  .id/.size describe the struct brnf_net
 * instance that the code above looks up with net_generic(net, brnf_net_id).
 * The init hook only sets up sysctls, so it exists only with CONFIG_SYSCTL.
 */
static struct pernet_operations brnf_net_ops __read_mostly = {
#ifdef CONFIG_SYSCTL
        .init = brnf_init_net,
#endif
        .exit = brnf_exit_net,
        .id   = &brnf_net_id,
        .size = sizeof(struct brnf_net),
};

/*
 * Module init: register the pernet subsystem and the netdevice notifier,
 * then publish br_ops through nf_br_ops.  If the notifier cannot be
 * registered the pernet subsystem is unregistered again.
 *
 * Returns 0 on success or the negative error of the failing registration.
 */
static int __init br_netfilter_init(void)
{
        int err;

        err = register_pernet_subsys(&brnf_net_ops);
        if (err < 0)
                goto out;

        err = register_netdevice_notifier(&brnf_notifier);
        if (err < 0)
                goto err_pernet;

        RCU_INIT_POINTER(nf_br_ops, &br_ops);
        printk(KERN_NOTICE "Bridge firewalling registered\n");
        return 0;

err_pernet:
        unregister_pernet_subsys(&brnf_net_ops);
out:
        return err;
}

/*
 * Module exit: undo br_netfilter_init() in reverse order - clear nf_br_ops,
 * then unregister the netdevice notifier and finally the pernet subsystem.
 */
static void __exit br_netfilter_fini(void)
{
        RCU_INIT_POINTER(nf_br_ops, NULL);
        unregister_netdevice_notifier(&brnf_notifier);
        unregister_pernet_subsys(&brnf_net_ops);
}

/* Module entry/exit points and module metadata. */
module_init(br_netfilter_init);
module_exit(br_netfilter_fini);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Lennert Buytenhek <buytenh@gnu.org>");
MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge");





















    3 





















































































































































































    2 

    2 











    3 























































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/srcu.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

/*
 * Clear all of the marks on an inode when it is being evicted from core.
 * Thin exported wrapper around fsnotify_clear_marks_by_inode().
 */
void __fsnotify_inode_delete(struct inode *inode)
{
        fsnotify_clear_marks_by_inode(inode);
}
EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);

/*
 * Clear all of the marks attached to a vfsmount.
 * Thin wrapper around fsnotify_clear_marks_by_mount().
 */
void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
{
        fsnotify_clear_marks_by_mount(mnt);
}

/**
 * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
 * @sb: superblock being unmounted.
 *
 * Called during unmount with no locks held, so needs to be safe against
 * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
 *
 * Every inode on sb->s_inodes that can be pinned gets an FS_UNMOUNT event and
 * then has its marks removed through fsnotify_inode_delete().
 */
static void fsnotify_unmount_inodes(struct super_block *sb)
{
        /* iput_inode: inode pinned in the previous iteration, not yet put */
        struct inode *inode, *iput_inode = NULL;

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                /*
                 * We cannot __iget() an inode in state I_FREEING,
                 * I_WILL_FREE, or I_NEW which is fine because by that point
                 * the inode cannot have any associated watches.
                 */
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                /*
                 * If i_count is zero, the inode cannot have any watches and
                 * doing an __iget/iput with SB_ACTIVE clear would actually
                 * evict all inodes with zero i_count from icache which is
                 * unnecessarily violent and may in fact be illegal to do.
                 * However, we should have been called /after/ evict_inodes
                 * removed all zero refcount inodes, in any case.  Test to
                 * be sure.
                 */
                if (!atomic_read(&inode->i_count)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                /* Pin the inode, then drop both locks for the work below. */
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(&sb->s_inode_list_lock);

                /*
                 * Release the inode pinned in the previous iteration only
                 * now, with no spinlock held and the current inode already
                 * pinned.  iput_inode is NULL on the first pass.
                 */
                iput(iput_inode);

                /* for each watch, send FS_UNMOUNT and then remove it */
                fsnotify_inode(inode, FS_UNMOUNT);

                fsnotify_inode_delete(inode);

                /*
                 * Keep our reference across the retake of the list lock.
                 * NOTE(review): presumably this keeps the list cursor valid
                 * while s_inode_list_lock was dropped - confirm.
                 */
                iput_inode = inode;

                cond_resched();
                spin_lock(&sb->s_inode_list_lock);
        }
        spin_unlock(&sb->s_inode_list_lock);

        /* Put the last inode that was pinned (NULL if none was). */
        iput(iput_inode);
}

/*
 * Remove all fsnotify state hanging off @sb: notify and strip the marks of
 * its inodes, clear the marks on the sb itself and wait until the count of
 * watched objects has dropped to zero.  Does nothing when the sb has no
 * fsnotify info.
 */
void fsnotify_sb_delete(struct super_block *sb)
{
        /* Were any marks ever added to any object on this sb? */
        if (!fsnotify_sb_info(sb))
                return;

        fsnotify_unmount_inodes(sb);
        fsnotify_clear_marks_by_sb(sb);

        /* Wait for outstanding object references from connectors */
        wait_var_event(fsnotify_sb_watched_objects(sb),
                       !atomic_long_read(fsnotify_sb_watched_objects(sb)));

        /* Content and pre-content priority watchers are expected to be gone. */
        WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT));
        WARN_ON(fsnotify_sb_has_priority_watchers(sb,
                                                  FSNOTIFY_PRIO_PRE_CONTENT));
}

/* Free the fsnotify info attached to @sb through sb->s_fsnotify_info. */
void fsnotify_sb_free(struct super_block *sb)
{
        kfree(sb->s_fsnotify_info);
}

/*
 * Bring the DCACHE_FSNOTIFY_PARENT_WATCHED flag of a directory's children in
 * line with whether the directory inode is watching its children.
 *
 * Inotify and dnotify both report child events to the parent.  Caching the
 * parent's interest in a dentry flag lets an event on a child quickly tell
 * whether the parent has to be looked up and notified at all.
 */
void __fsnotify_update_child_dentry_flags(struct inode *inode)
{
        struct dentry *alias;
        int watched;

        /* Only directories have children to flag. */
        if (!S_ISDIR(inode->i_mode))
                return;

        /* determine if the children should tell inode about their events */
        watched = fsnotify_inode_watches_children(inode);

        spin_lock(&inode->i_lock);
        /*
         * Walk the dentries of this inode.  As it is a directory, the list
         * is expected to hold exactly one alias.
         */
        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                struct dentry *child;

                /*
                 * Walk the children of the alias (their parent is the
                 * original inode) and fix up their d_flags.
                 */
                spin_lock(&alias->d_lock);
                hlist_for_each_entry(child, &alias->d_children, d_sib) {
                        /* children without an inode are left alone */
                        if (child->d_inode) {
                                spin_lock_nested(&child->d_lock,
                                                 DENTRY_D_LOCK_NESTED);
                                if (!watched)
                                        child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
                                else
                                        child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
                                spin_unlock(&child->d_lock);
                        }
                }
                spin_unlock(&alias->d_lock);
        }
        spin_unlock(&inode->i_lock);
}

/*
 * Are inode/sb/mount interested in parent and name info with this event?
 *
 * Returns true when @mask is an event on a non-directory and at least one of
 * the inode, its superblock or the mount (@mnt_mask) subscribed to that event
 * with parent/name info.
 */
static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask,
                                        __u32 mask)
{
        __u32 parent_mask;

        /* We only send parent/name to inode/sb/mount for events on non-dir */
        if (mask & FS_ISDIR)
                return false;

        /*
         * All events that are possible on child may also be reported with
         * parent/name info to inode/sb/mount.  Otherwise, a watching parent
         * could result in events reported with unexpected name info to
         * sb/mount.
         */
        BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT);

        /* Did either inode/sb/mount subscribe for events with parent/name? */
        parent_mask = fsnotify_parent_needed_mask(inode->i_fsnotify_mask) |
                      fsnotify_parent_needed_mask(inode->i_sb->s_fsnotify_mask) |
                      fsnotify_parent_needed_mask(mnt_mask);

        /* Did they subscribe for this event with parent/name info? */
        return mask & parent_mask;
}

/*
 * Are there any inode/mount/sb objects that are interested in this event?
 * Only the ALL_FSNOTIFY_EVENTS bits of @mask are considered.
 */
static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
                                           __u32 mask)
{
        __u32 event_bits = mask & ALL_FSNOTIFY_EVENTS;

        return event_bits & (inode->i_fsnotify_mask | mnt_mask |
                             inode->i_sb->s_fsnotify_mask);
}

/*
 * Notify this dentry's parent about a child's events with child name info
 * if parent is watching or if inode/sb/mount are interested in events with
 * parent and name info.
 *
 * Notify only the child without name info if parent is not watching and
 * inode/sb/mount are not interested in events with parent and name info.
 *
 * Returns 0 when nobody is watching, otherwise the result of fsnotify().
 */
int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
                      int data_type)
{
        const struct path *path = fsnotify_data_path(data, data_type);
        __u32 mnt_mask = path ? real_mount(path->mnt)->mnt_fsnotify_mask : 0;
        struct inode *inode = d_inode(dentry);
        struct dentry *parent = NULL;
        bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED;
        bool parent_needed, parent_interested;
        __u32 p_mask;
        struct inode *p_inode = NULL;
        struct name_snapshot name;
        struct qstr *file_name = NULL;
        int ret;

        /* Optimize the likely case of nobody watching this path */
        if (likely(!parent_watched &&
                   !fsnotify_object_watched(inode, mnt_mask, mask)))
                return 0;

        parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask);
        if (parent_watched || parent_needed) {
                /* Does parent inode care about events on children? */
                parent = dget_parent(dentry);
                p_inode = parent->d_inode;
                p_mask = fsnotify_inode_watches_children(p_inode);

                /* Flag says watched but the parent no longer is: resync. */
                if (unlikely(parent_watched && !p_mask))
                        __fsnotify_update_child_dentry_flags(p_inode);

                /*
                 * Include parent/name in notification either if some
                 * notification groups require parent info or the parent is
                 * interested in this event.
                 */
                parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS;
                if (parent_needed || parent_interested) {
                        /* When notifying parent, child should be passed as data */
                        WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type));

                        /* Notify both parent and child with child name info */
                        take_dentry_name_snapshot(&name, dentry);
                        file_name = &name.name;
                        if (parent_interested)
                                mask |= FS_EVENT_ON_CHILD;
                }
        }

        /* p_inode and file_name stay NULL when only the child is notified. */
        ret = fsnotify(mask, data, data_type, p_inode, file_name, inode, 0);

        if (file_name)
                release_dentry_name_snapshot(&name);
        dput(parent);

        return ret;
}
EXPORT_SYMBOL_GPL(__fsnotify_parent);

/*
 * Deliver an event to one inode mark through the group's
 * ->handle_inode_event() callback.
 *
 * Returns 0 without calling the backend when the callback is missing, when
 * neither an inode nor a dir is associated with the event, when the mark has
 * FSNOTIFY_MARK_FLAG_EXCL_UNLINK set and the path's dentry is unlinked, or
 * when the mark is not interested in @mask.  Otherwise returns the backend's
 * result.
 */
static int fsnotify_handle_inode_event(struct fsnotify_group *group,
                                       struct fsnotify_mark *inode_mark,
                                       u32 mask, const void *data, int data_type,
                                       struct inode *dir, const struct qstr *name,
                                       u32 cookie)
{
        const struct path *path = fsnotify_data_path(data, data_type);
        struct inode *inode = fsnotify_data_inode(data, data_type);
        const struct fsnotify_ops *ops = group->ops;

        if (WARN_ON_ONCE(!ops->handle_inode_event))
                return 0;

        if (WARN_ON_ONCE(!inode && !dir))
                return 0;

        /* The mark excludes events on unlinked objects. */
        if (inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) {
                if (path && d_unlinked(path->dentry))
                        return 0;
        }

        /* Check interest of this mark in case event was sent with two marks */
        if (mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS)
                return ops->handle_inode_event(inode_mark, mask, inode, dir,
                                               name, cookie);

        return 0;
}

/*
 * Default event handling for groups without their own ->handle_event():
 * report the event to the parent mark and/or the inode mark selected in
 * @iter_info through fsnotify_handle_inode_event().
 *
 * Returns the first non-zero result from the parent mark, else the result
 * for the inode mark, else 0.
 */
static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask,
                                 const void *data, int data_type,
                                 struct inode *dir, const struct qstr *name,
                                 u32 cookie, struct fsnotify_iter_info *iter_info)
{
        struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
        struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info);

        /* sb and vfsmount marks are not expected on this path */
        if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) ||
            WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info)))
                return 0;

        /*
         * For FS_RENAME, 'dir' is old dir and 'data' is new dentry.
         * The only ->handle_inode_event() backend that supports FS_RENAME is
         * dnotify, where it means file was renamed within same parent.
         */
        if ((mask & FS_RENAME) &&
            dir != fsnotify_data_dentry(data, data_type)->d_parent->d_inode)
                return 0;

        if (parent_mark) {
                int err = fsnotify_handle_inode_event(group, parent_mark, mask,
                                                      data, data_type, dir,
                                                      name, 0);

                if (err)
                        return err;
        }

        if (!inode_mark)
                return 0;

        /*
         * Some events can be sent on both parent dir and child marks
         * (e.g. FS_ATTRIB).  If both parent dir and child are
         * watching, report the event once to parent dir with name (if
         * interested) and once to child without name (if interested).
         * The child watcher is expecting an event without a file name
         * and without the FS_EVENT_ON_CHILD flag.
         */
        if (mask & FS_EVENT_ON_CHILD) {
                name = NULL;
                dir = NULL;
                mask &= ~FS_EVENT_ON_CHILD;
        }

        return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type,
                                           dir, name, cookie);
}

/*
 * Send one event to the group whose marks are currently selected in
 * @iter_info (report_mask).
 *
 * On FS_MODIFY, first clears the ignore mask of every selected mark that
 * does not carry FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY.  The event is then
 * passed to the group's ->handle_event(), or to fsnotify_handle_event() when
 * the group has none, provided the combined mark masks want it and the
 * combined ignore masks do not suppress it.
 *
 * Returns 0 when nothing was reported, otherwise the handler's result.
 */
static int send_to_group(__u32 mask, const void *data, int data_type,
                         struct inode *dir, const struct qstr *file_name,
                         u32 cookie, struct fsnotify_iter_info *iter_info)
{
        struct fsnotify_group *grp = NULL;
        __u32 wanted = (mask & ALL_FSNOTIFY_EVENTS);
        __u32 interest = 0;
        __u32 ignored = 0;
        bool is_dir = mask & FS_ISDIR;
        struct fsnotify_mark *mark;
        int type;

        if (!iter_info->report_mask)
                return 0;

        /* clear ignored on inode modification */
        if (mask & FS_MODIFY) {
                fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
                        if (!(mark->flags &
                              FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
                                mark->ignore_mask = 0;
                }
        }

        /* Are any of the group marks interested in this event? */
        fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
                grp = mark->group;
                interest |= mark->mask;
                ignored |= fsnotify_effective_ignore_mask(mark, is_dir, type);
        }

        pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
                 __func__, grp, mask, interest, ignored,
                 data, data_type, dir, cookie);

        if (!(wanted & interest & ~ignored))
                return 0;

        /* Groups without their own handler use the generic inode handler. */
        if (!grp->ops->handle_event)
                return fsnotify_handle_event(grp, mask, data, data_type, dir,
                                             file_name, cookie, iter_info);

        return grp->ops->handle_event(grp, mask, data, data_type, dir,
                                      file_name, cookie, iter_info);
}

/*
 * Return the first mark on the list of the connector that *connp points to,
 * or NULL when there is no connector or its list is empty.  Both pointers
 * are fetched with srcu_dereference() against fsnotify_mark_srcu.
 */
static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp)
{
        struct fsnotify_mark_connector *conn;
        struct hlist_node *first;

        conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
        if (!conn)
                return NULL;

        first = srcu_dereference(conn->list.first, &fsnotify_mark_srcu);

        return hlist_entry_safe(first, struct fsnotify_mark, obj_list);
}

/*
 * Return the mark following @mark on its object list, or NULL when @mark is
 * NULL or is the last entry.  The next pointer is fetched with
 * srcu_dereference() against fsnotify_mark_srcu.
 */
static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark)
{
        struct hlist_node *next;

        if (!mark)
                return NULL;

        next = srcu_dereference(mark->obj_list.next, &fsnotify_mark_srcu);

        return hlist_entry_safe(next, struct fsnotify_mark, obj_list);
}

/*
 * iter_info is a multi head priority queue of marks.
 * Pick a subset of marks from queue heads, all with the same group
 * and set the report_mask to a subset of the selected marks.
 * Also records the chosen group in iter_info->current_group.
 * Returns false if there are no more groups to iterate.
 */
static bool fsnotify_iter_select_report_types(
                struct fsnotify_iter_info *iter_info)
{
        struct fsnotify_group *best = NULL;
        struct fsnotify_mark *mark;
        int type;

        /* Choose max prio group among groups of all queue heads */
        fsnotify_foreach_iter_type(type) {
                mark = iter_info->marks[type];
                if (!mark)
                        continue;

                if (fsnotify_compare_groups(best, mark->group) > 0)
                        best = mark->group;
        }

        /* Every queue head is empty: iteration is over. */
        if (!best)
                return false;

        /* Set the report mask for marks from same group as max prio group */
        iter_info->current_group = best;
        iter_info->report_mask = 0;
        fsnotify_foreach_iter_type(type) {
                mark = iter_info->marks[type];
                if (!mark || mark->group != iter_info->current_group)
                        continue;

                /*
                 * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode
                 * is watching children and interested in this event,
                 * which is an event possible on child.
                 * But is *this mark* watching children?
                 */
                if (type == FSNOTIFY_ITER_TYPE_PARENT &&
                    !(mark->mask & FS_EVENT_ON_CHILD) &&
                    !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD))
                        continue;

                fsnotify_iter_set_report_type(iter_info, type);
        }

        return true;
}

/*
 * Pop from iter_info multi head queue, the marks that belong to the group of
 * current iteration step: each such queue head is advanced to the next mark
 * on its list.
 */
static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
{
        struct fsnotify_mark *head;
        int type;

        /*
         * We cannot use fsnotify_foreach_iter_mark_type() here because we
         * may need to advance a mark of type X that belongs to current_group
         * but was not selected for reporting.
         */
        fsnotify_foreach_iter_type(type) {
                head = iter_info->marks[type];
                if (!head || head->group != iter_info->current_group)
                        continue;

                iter_info->marks[type] = fsnotify_next_mark(head);
        }
}

/*
 * fsnotify - This is the main call to fsnotify.
 *
 * The VFS calls into hook specific functions in linux/fsnotify.h.
 * Those functions then in turn call here.  Here will call out to all of the
 * registered fsnotify_group.  Those groups can then use the notification event
 * in whatever means they feel necessary.
 *
 * @mask:        event type and flags
 * @data:        object that event happened on
 * @data_type:        type of object for fsnotify_data_XXX() accessors
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to
 * @file_name:        optional file name associated with event
 * @inode:        optional inode associated with event -
 *                If @dir and @inode are both non-NULL, event may be
 *                reported to both.
 * @cookie:        inotify rename cookie
 *
 * Returns 0, except for events in ALL_FSNOTIFY_PERM_EVENTS where the first
 * non-zero result from a group stops the iteration and is returned.
 */
int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
             const struct qstr *file_name, struct inode *inode, u32 cookie)
{
        const struct path *path = fsnotify_data_path(data, data_type);
        struct super_block *sb = fsnotify_data_sb(data, data_type);
        struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
        struct fsnotify_iter_info iter_info = {};
        struct mount *mnt = path ? real_mount(path->mnt) : NULL;
        struct inode *inode2 = NULL;
        int inode2_type;
        __u32 marks_mask;
        int ret = 0;

        if (inode) {
                if (mask & FS_EVENT_ON_CHILD) {
                        /*
                         * Event on child - report on TYPE_PARENT to dir if
                         * it is watching children and on TYPE_INODE to child.
                         */
                        inode2 = dir;
                        inode2_type = FSNOTIFY_ITER_TYPE_PARENT;
                }
        } else {
                /* Dirent event - report on TYPE_INODE to dir */
                inode = dir;
                /* For FS_RENAME, inode is old_dir and inode2 is new_dir */
                if (mask & FS_RENAME) {
                        struct dentry *moved = fsnotify_data_dentry(data,
                                                                    data_type);

                        inode2 = moved->d_parent->d_inode;
                        inode2_type = FSNOTIFY_ITER_TYPE_INODE2;
                }
        }

        /*
         * Optimization: srcu_read_lock() has a memory barrier which can
         * be expensive.  It protects walking the *_fsnotify_marks lists.
         * However, if we do not walk the lists, we do not have to do
         * SRCU because we have no references to any objects and do not
         * need SRCU to keep them "alive".
         */
        if ((!sbinfo || !sbinfo->sb_marks) &&
            (!mnt || !mnt->mnt_fsnotify_marks) &&
            (!inode || !inode->i_fsnotify_marks) &&
            (!inode2 || !inode2->i_fsnotify_marks))
                return 0;

        /* Union of the event masks of every object involved. */
        marks_mask = sb->s_fsnotify_mask;
        if (mnt)
                marks_mask |= mnt->mnt_fsnotify_mask;
        if (inode)
                marks_mask |= inode->i_fsnotify_mask;
        if (inode2)
                marks_mask |= inode2->i_fsnotify_mask;

        /*
         * If this is a modify event we may need to clear some ignore masks.
         * In that case, the object with ignore masks will have the FS_MODIFY
         * event in its mask.
         * Otherwise, return if none of the marks care about this type of event.
         */
        if (!(mask & ALL_FSNOTIFY_EVENTS & marks_mask))
                return 0;

        iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);

        /* Seed one queue head per object that takes part in this event. */
        if (sbinfo)
                iter_info.marks[FSNOTIFY_ITER_TYPE_SB] =
                        fsnotify_first_mark(&sbinfo->sb_marks);
        if (mnt)
                iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] =
                        fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
        if (inode)
                iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] =
                        fsnotify_first_mark(&inode->i_fsnotify_marks);
        if (inode2)
                iter_info.marks[inode2_type] =
                        fsnotify_first_mark(&inode2->i_fsnotify_marks);

        /*
         * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark
         * ignore masks are properly reflected for mount/sb mark notifications.
         * That's why this traversal is so complicated...
         */
        while (fsnotify_iter_select_report_types(&iter_info)) {
                int err = send_to_group(mask, data, data_type, dir, file_name,
                                        cookie, &iter_info);

                /* Only permission events propagate a group's error. */
                if (err && (mask & ALL_FSNOTIFY_PERM_EVENTS)) {
                        ret = err;
                        break;
                }

                fsnotify_iter_next(&iter_info);
        }

        srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx);

        return ret;
}
EXPORT_SYMBOL_GPL(fsnotify);

/*
 * One-time initialisation of the fsnotify core, registered as a core
 * initcall.  Always returns 0; a failure to set up the SRCU structure panics.
 */
static __init int fsnotify_init(void)
{
        /* Build-time check: ALL_FSNOTIFY_BITS must have exactly 23 bits set. */
        BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23);

        if (init_srcu_struct(&fsnotify_mark_srcu))
                panic("initializing fsnotify_mark_srcu");

        /* SLAB_PANIC: the cache creation itself panics on failure. */
        fsnotify_mark_connector_cachep =
                KMEM_CACHE(fsnotify_mark_connector, SLAB_PANIC);

        return 0;
}
core_initcall(fsnotify_init);












































































































































































































































































































































































































































































































































































































































   10 


























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Filesystem access notification for Linux
 *
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

#ifndef __LINUX_FSNOTIFY_BACKEND_H
#define __LINUX_FSNOTIFY_BACKEND_H

#ifdef __KERNEL__

#include <linux/idr.h> /* inotify uses this */
#include <linux/fs.h> /* struct inode */
#include <linux/list.h>
#include <linux/path.h> /* struct path */
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>
#include <linux/mempool.h>
#include <linux/sched/mm.h>

/*
 * IN_* from inotify.h lines up EXACTLY with FS_*, this is so we can easily
 * convert between them.  dnotify only needs conversion at watch creation
 * so no perf loss there.  fanotify isn't defined yet, so it can use the
 * holes if it needs more events.
 */
#define FS_ACCESS                0x00000001        /* File was accessed */
#define FS_MODIFY                0x00000002        /* File was modified */
#define FS_ATTRIB                0x00000004        /* Metadata changed */
#define FS_CLOSE_WRITE                0x00000008        /* Writable file was closed */
#define FS_CLOSE_NOWRITE        0x00000010        /* Unwritable file closed */
#define FS_OPEN                        0x00000020        /* File was opened */
#define FS_MOVED_FROM                0x00000040        /* File was moved from X */
#define FS_MOVED_TO                0x00000080        /* File was moved to Y */
#define FS_CREATE                0x00000100        /* Subfile was created */
#define FS_DELETE                0x00000200        /* Subfile was deleted */
#define FS_DELETE_SELF                0x00000400        /* Self was deleted */
#define FS_MOVE_SELF                0x00000800        /* Self was moved */
#define FS_OPEN_EXEC                0x00001000        /* File was opened for exec */

#define FS_UNMOUNT                0x00002000        /* inode on umount fs */
#define FS_Q_OVERFLOW                0x00004000        /* Event queued overflowed */
#define FS_ERROR                0x00008000        /* Filesystem Error (fanotify) */

/*
 * FS_IN_IGNORED overloads FS_ERROR.  It is only used internally by inotify
 * which does not support FS_ERROR.
 */
#define FS_IN_IGNORED                0x00008000        /* last inotify event here */

#define FS_OPEN_PERM                0x00010000        /* open event in an permission hook */
#define FS_ACCESS_PERM                0x00020000        /* access event in a permissions hook */
#define FS_OPEN_EXEC_PERM        0x00040000        /* open/exec event in a permission hook */

/*
 * Set on inode mark that cares about things that happen to its children.
 * Always set for dnotify and inotify.
 * Set on inode/sb/mount marks that care about parent/name info.
 */
#define FS_EVENT_ON_CHILD        0x08000000

#define FS_RENAME                0x10000000        /* File was renamed */
#define FS_DN_MULTISHOT                0x20000000        /* dnotify multishot */
#define FS_ISDIR                0x40000000        /* event occurred against dir */

#define FS_MOVE                        (FS_MOVED_FROM | FS_MOVED_TO)

/*
 * Directory entry modification events - reported only to directory
 * where entry is modified and not to a watching parent.
 * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event
 * when a directory entry inside a child subdir changes.
 */
#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME)

#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \
                                  FS_OPEN_EXEC_PERM)

/*
 * This is a list of all events that may get sent to a parent that is watching
 * with flag FS_EVENT_ON_CHILD based on fs event on a child of that directory.
 */
#define FS_EVENTS_POSS_ON_CHILD   (ALL_FSNOTIFY_PERM_EVENTS | \
                                   FS_ACCESS | FS_MODIFY | FS_ATTRIB | \
                                   FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \
                                   FS_OPEN | FS_OPEN_EXEC)

/*
 * This is a list of all events that may get sent with the parent inode as the
 * @to_tell argument of fsnotify().
 * It may include events that can be sent to an inode/sb/mount mark, but cannot
 * be sent to a parent watching children.
 */
#define FS_EVENTS_POSS_TO_PARENT (FS_EVENTS_POSS_ON_CHILD)

/*
 * Events that can be reported to backends.
 * FS_IN_IGNORED and FS_ERROR are the same bit (0x00008000), so listing both
 * of them contributes a single bit to this mask.
 */
#define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \
                             FS_EVENTS_POSS_ON_CHILD | \
                             FS_DELETE_SELF | FS_MOVE_SELF | \
                             FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
                             FS_ERROR)

/* Extra flags that may be reported with event or control handling of events */
#define ALL_FSNOTIFY_FLAGS  (FS_ISDIR | FS_EVENT_ON_CHILD | FS_DN_MULTISHOT)

/*
 * Every event and flag bit known to fsnotify.  fsnotify_init() contains a
 * BUILD_BUG_ON() expecting exactly 23 bits to be set here, so that check has
 * to be adjusted whenever a bit is added to or removed from these masks.
 */
#define ALL_FSNOTIFY_BITS   (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS)

struct fsnotify_group;
struct fsnotify_event;
struct fsnotify_mark;
struct fsnotify_event_private_data;
struct fsnotify_fname;
struct fsnotify_iter_info;

struct mem_cgroup;

/*
 * Each group must define these ops.  The fsnotify infrastructure will call
 * these operations for each relevant group.
 *
 * handle_event - main call for a group to handle an fs event
 * @group:        group to notify
 * @mask:        event type and flags
 * @data:        object that event happened on
 * @data_type:        type of object for fsnotify_data_XXX() accessors
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to
 * @file_name:        optional file name associated with event
 * @cookie:        inotify rename cookie
 * @iter_info:        array of marks from this group that are interested in the event
 *
 * handle_inode_event - simple variant of handle_event() for groups that only
 *                have inode marks and don't have ignore mask
 * @mark:        mark to notify
 * @mask:        event type and flags
 * @inode:        inode that event happened on
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to.
 *                Either @inode or @dir must be non-NULL.
 * @file_name:        optional file name associated with event
 * @cookie:        inotify rename cookie
 *
 * free_group_priv - called when a group refcnt hits 0 to clean up the private union
 * freeing_mark - called when a mark is being destroyed for some reason.  The group
 *                MUST be holding a reference on each mark and that reference must be
 *                dropped in this function.  inotify uses this function to send
 *                userspace messages that marks have been removed.
 * free_event - NOTE(review): judging by the signature this presumably releases
 *                @event on behalf of @group; confirm against the callers.
 * free_mark - called on the final put of a mark to free its memory.
 */
struct fsnotify_ops {
        int (*handle_event)(struct fsnotify_group *group, u32 mask,
                            const void *data, int data_type, struct inode *dir,
                            const struct qstr *file_name, u32 cookie,
                            struct fsnotify_iter_info *iter_info);
        int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask,
                            struct inode *inode, struct inode *dir,
                            const struct qstr *file_name, u32 cookie);
        void (*free_group_priv)(struct fsnotify_group *group);
        void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
        void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event);
        /* called on final put+free to free memory */
        void (*free_mark)(struct fsnotify_mark *mark);
};

/*
 * All of the information about the original object we want to now send to
 * a group.  If you want to carry more info from the accessing task to the
 * listener this structure is where you need to be adding fields.
 *
 * The generic part visible here is only a list node.
 */
struct fsnotify_event {
        /* presumably links the event into a group's notification_list - confirm */
        struct list_head list;
};

/*
 * fsnotify group priorities.
 * Events are sent in order from highest priority to lowest priority.
 * The numeric values grow with priority (NORMAL is 0).
 */
enum fsnotify_group_prio {
        FSNOTIFY_PRIO_NORMAL = 0,        /* normal notifiers, no permissions */
        FSNOTIFY_PRIO_CONTENT,                /* fanotify permission events */
        FSNOTIFY_PRIO_PRE_CONTENT,        /* fanotify pre-content events */
        /* Number of priorities; sizes fsnotify_sb_info.watched_objects[] */
        __FSNOTIFY_PRIO_NUM
};

/*
 * A group is a "thing" that wants to receive notification about filesystem
 * events.  The mask holds the subset of event types this group cares about.
 * refcnt on a group is up to the implementor and at any moment if it goes 0
 * everything will be cleaned up.
 *
 * NOTE(review): no "mask" member is visible in this structure; the sentence
 * about the mask above may be stale.
 */
struct fsnotify_group {
        const struct fsnotify_ops *ops;        /* how this group handles things */

        /*
         * How the refcnt is used is up to each group.  When the refcnt hits 0
         * fsnotify will clean up all of the resources associated with this group.
         * As an example, the dnotify group will always have a refcnt=1 and that
         * will never change.  Inotify, on the other hand, has a group per
         * inotify_init() and the refcnt will hit 0 only when that fd has been
         * closed.
         */
        refcount_t refcnt;                /* things with interest in this group */

        /* needed to send notification to userspace */
        spinlock_t notification_lock;                /* protect the notification_list */
        struct list_head notification_list;        /* list of event_holder this group needs to send to userspace */
        wait_queue_head_t notification_waitq;        /* read() on the notification file blocks on this waitq */
        unsigned int q_len;                        /* events on the queue */
        unsigned int max_events;                /* maximum events allowed on the list */
        enum fsnotify_group_prio priority;        /* priority for sending events */
        bool shutdown;                /* group is being shut down, don't queue more events */

#define FSNOTIFY_GROUP_USER        0x01 /* user allocated group */
#define FSNOTIFY_GROUP_DUPS        0x02 /* allow multiple marks per object */
#define FSNOTIFY_GROUP_NOFS        0x04 /* group lock is not direct reclaim safe */
        int flags;                        /* FSNOTIFY_GROUP_* bits defined just above */
        /*
         * Written by fsnotify_group_lock() with the value of
         * memalloc_nofs_save() and handed back to memalloc_nofs_restore() by
         * fsnotify_group_unlock(); only used for FSNOTIFY_GROUP_NOFS groups.
         */
        unsigned int owner_flags;        /* stored flags of mark_mutex owner */

        /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */
        struct mutex mark_mutex;        /* protect marks_list */
        atomic_t user_waits;                /* Number of tasks waiting for user
                                         * response */
        struct list_head marks_list;        /* all inode marks for this group */

        struct fasync_struct *fsn_fa;    /* async notification */

        struct fsnotify_event *overflow_event;        /* Event we queue when the
                                                 * notification list is too
                                                 * full */

        struct mem_cgroup *memcg;        /* memcg to charge allocations */

        /* groups can define private fields here or use the void *private */
        union {
                void *private;
#ifdef CONFIG_INOTIFY_USER
                /* inotify-only state; field meanings are not visible in this header */
                struct inotify_group_private_data {
                        spinlock_t        idr_lock;
                        struct idr      idr;
                        struct ucounts *ucounts;
                } inotify_data;
#endif
#ifdef CONFIG_FANOTIFY
                struct fanotify_group_private_data {
                        /* Hash table of events for merge */
                        struct hlist_head *merge_hash;
                        /* allows a group to block waiting for a userspace response */
                        struct list_head access_list;
                        wait_queue_head_t access_waitq;
                        int flags;           /* flags from fanotify_init() */
                        int f_flags; /* event_f_flags from fanotify_init() */
                        struct ucounts *ucounts;
                        mempool_t error_events_pool;
                } fanotify_data;
#endif /* CONFIG_FANOTIFY */
        };
};

/*
 * Group locking helpers.  They exist to prevent a deadlock when reclaim evicts
 * inodes carrying evictable marks of the very group that is allocating a new
 * mark.
 *
 * fsnotify_group_lock() takes group->mark_mutex.  For FSNOTIFY_GROUP_NOFS
 * groups it additionally calls memalloc_nofs_save() and remembers the result
 * in group->owner_flags.
 */
static inline void fsnotify_group_lock(struct fsnotify_group *group)
{
        mutex_lock(&group->mark_mutex);

        if (!(group->flags & FSNOTIFY_GROUP_NOFS))
                return;

        group->owner_flags = memalloc_nofs_save();
}

static inline void fsnotify_group_unlock(struct fsnotify_group *group)
{
        if (group->flags & FSNOTIFY_GROUP_NOFS)
                memalloc_nofs_restore(group->owner_flags);
        mutex_unlock(&group->mark_mutex);
}

/*
 * Debug check for code that expects the group lock to be held: warn (once) if
 * group->mark_mutex is not locked and, for FSNOTIFY_GROUP_NOFS groups, if the
 * current task does not have PF_MEMALLOC_NOFS set.
 */
static inline void fsnotify_group_assert_locked(struct fsnotify_group *group)
{
        WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex));

        if (!(group->flags & FSNOTIFY_GROUP_NOFS))
                return;

        WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS));
}

/*
 * When calling fsnotify tell it if the data is a path or inode.
 * The per-value notes describe how the fsnotify_data_*() accessors interpret
 * the @data pointer for each type.
 */
enum fsnotify_data_type {
        FSNOTIFY_EVENT_NONE,        /* matched by no accessor; they all return NULL */
        FSNOTIFY_EVENT_PATH,        /* data is a const struct path * */
        FSNOTIFY_EVENT_INODE,        /* data is a struct inode * */
        FSNOTIFY_EVENT_DENTRY,        /* data is a struct dentry * */
        FSNOTIFY_EVENT_ERROR,        /* data is a struct fs_error_report * */
};

/*
 * Event data used with FSNOTIFY_EVENT_ERROR.  The fsnotify_data_*() accessors
 * read ->inode and ->sb from it.
 */
struct fs_error_report {
        int error;                /* error value being reported; sign convention not visible here */
        struct inode *inode;        /* returned by fsnotify_data_inode() */
        struct super_block *sb;        /* returned by fsnotify_data_sb() */
};

/*
 * Extract the inode from event @data according to @data_type.
 *
 * INODE data is the inode itself, DENTRY and PATH data go through d_inode(),
 * ERROR data yields the report's ->inode.  Any other type returns NULL.
 */
static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
{
        if (data_type == FSNOTIFY_EVENT_INODE)
                return (struct inode *)data;

        if (data_type == FSNOTIFY_EVENT_DENTRY)
                return d_inode(data);

        if (data_type == FSNOTIFY_EVENT_PATH)
                return d_inode(((const struct path *)data)->dentry);

        if (data_type == FSNOTIFY_EVENT_ERROR)
                return ((struct fs_error_report *)data)->inode;

        return NULL;
}

/*
 * Extract the dentry from event @data according to @data_type.
 *
 * Only DENTRY and PATH data carry a dentry; every other type returns NULL.
 */
static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type)
{
        if (data_type == FSNOTIFY_EVENT_DENTRY) {
                /* Non const is needed for dget() */
                return (struct dentry *)data;
        }

        if (data_type == FSNOTIFY_EVENT_PATH)
                return ((const struct path *)data)->dentry;

        return NULL;
}

/*
 * Return event @data as a path when @data_type is FSNOTIFY_EVENT_PATH,
 * NULL for every other data type.
 */
static inline const struct path *fsnotify_data_path(const void *data,
                                                    int data_type)
{
        if (data_type != FSNOTIFY_EVENT_PATH)
                return NULL;

        return data;
}

/*
 * Extract the super block from event @data according to @data_type.
 *
 * INODE data uses ->i_sb, DENTRY and PATH data use the dentry's ->d_sb and
 * ERROR data uses the report's ->sb.  Any other type returns NULL.
 */
static inline struct super_block *fsnotify_data_sb(const void *data,
                                                   int data_type)
{
        if (data_type == FSNOTIFY_EVENT_INODE)
                return ((struct inode *)data)->i_sb;

        if (data_type == FSNOTIFY_EVENT_DENTRY)
                return ((struct dentry *)data)->d_sb;

        if (data_type == FSNOTIFY_EVENT_PATH)
                return ((const struct path *)data)->dentry->d_sb;

        if (data_type == FSNOTIFY_EVENT_ERROR)
                return ((struct fs_error_report *) data)->sb;

        return NULL;
}

/*
 * Return event @data as an error report when @data_type is
 * FSNOTIFY_EVENT_ERROR, NULL for every other data type.
 */
static inline struct fs_error_report *fsnotify_data_error_report(
                                                        const void *data,
                                                        int data_type)
{
        if (data_type != FSNOTIFY_EVENT_ERROR)
                return NULL;

        return (struct fs_error_report *) data;
}

/*
 * Index to merged marks iterator array that correlates to a type of watch.
 * The type of watched object can be deduced from the iterator type, but not
 * the other way around, because an event can match different watched objects
 * of the same object type.
 * For example, both parent and child are watching an object of type inode.
 *
 * The values are also used as bit positions in fsnotify_iter_info.report_mask.
 */
enum fsnotify_iter_type {
        FSNOTIFY_ITER_TYPE_INODE,
        FSNOTIFY_ITER_TYPE_VFSMOUNT,
        FSNOTIFY_ITER_TYPE_SB,
        FSNOTIFY_ITER_TYPE_PARENT,
        FSNOTIFY_ITER_TYPE_INODE2,
        /* Number of iterator types; sizes fsnotify_iter_info.marks[] */
        FSNOTIFY_ITER_TYPE_COUNT
};

/*
 * Kind of object a mark is attached to.  The real types are numbered from 0;
 * ANY is -1 and DETACHED shares its value with COUNT, so neither of them is a
 * valid object type index.
 */
enum fsnotify_obj_type {
        FSNOTIFY_OBJ_TYPE_ANY = -1,
        FSNOTIFY_OBJ_TYPE_INODE,
        FSNOTIFY_OBJ_TYPE_VFSMOUNT,
        FSNOTIFY_OBJ_TYPE_SB,
        FSNOTIFY_OBJ_TYPE_COUNT,
        FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT
};

/*
 * Tell whether @obj_type names a real object type, i.e. lies below
 * FSNOTIFY_OBJ_TYPE_COUNT.  The parameter is unsigned, so a negative value
 * such as FSNOTIFY_OBJ_TYPE_ANY wraps to a large number and is rejected.
 */
static inline bool fsnotify_valid_obj_type(unsigned int obj_type)
{
        if (obj_type >= FSNOTIFY_OBJ_TYPE_COUNT)
                return false;

        return true;
}

/*
 * State of one walk over the merged mark lists: one mark slot per iterator
 * type plus a bitmask selecting which slots are reported.
 */
struct fsnotify_iter_info {
        /* one slot per enum fsnotify_iter_type value */
        struct fsnotify_mark *marks[FSNOTIFY_ITER_TYPE_COUNT];
        /* presumably the group currently being notified - confirm against users */
        struct fsnotify_group *current_group;
        /* bit (1U << iter_type) is set for every type that should be reported */
        unsigned int report_mask;
        /* value returned by srcu_read_lock(), later passed to srcu_read_unlock() */
        int srcu_idx;
};

/*
 * Return true when the bit for @iter_type is set in iter_info->report_mask.
 */
static inline bool fsnotify_iter_should_report_type(
                struct fsnotify_iter_info *iter_info, int iter_type)
{
        const unsigned int type_bit = 1U << iter_type;

        return (iter_info->report_mask & type_bit) != 0;
}

/*
 * Set the bit for @iter_type in iter_info->report_mask.
 */
static inline void fsnotify_iter_set_report_type(
                struct fsnotify_iter_info *iter_info, int iter_type)
{
        const unsigned int type_bit = 1U << iter_type;

        iter_info->report_mask = iter_info->report_mask | type_bit;
}

/*
 * Return the mark stored in the @iter_type slot, or NULL when that type is
 * not selected in the report mask.
 */
static inline struct fsnotify_mark *fsnotify_iter_mark(
                struct fsnotify_iter_info *iter_info, int iter_type)
{
        if (!fsnotify_iter_should_report_type(iter_info, iter_type))
                return NULL;

        return iter_info->marks[iter_type];
}

/*
 * Starting at @type, find the first iterator type for which
 * fsnotify_iter_mark() returns a mark.
 *
 * Returns that type with the mark stored in *@markp, or
 * FSNOTIFY_ITER_TYPE_COUNT (or @type itself if it was already out of range)
 * when no such slot remains.  *@markp is written once per slot examined, so
 * it is left untouched when @type is out of range on entry.
 */
static inline int fsnotify_iter_step(struct fsnotify_iter_info *iter, int type,
                                     struct fsnotify_mark **markp)
{
        for (; type < FSNOTIFY_ITER_TYPE_COUNT; type++) {
                *markp = fsnotify_iter_mark(iter, type);
                if (*markp)
                        return type;
        }

        return type;
}

/*
 * Generate fsnotify_iter_<name>_mark(), a shorthand for
 * fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_<NAME>).
 */
#define FSNOTIFY_ITER_FUNCS(name, NAME) \
static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \
                struct fsnotify_iter_info *iter_info) \
{ \
        return fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_##NAME); \
}

/*
 * Instantiated for the inode, parent, vfsmount and sb iterator types; no
 * wrapper is generated here for FSNOTIFY_ITER_TYPE_INODE2.
 */
FSNOTIFY_ITER_FUNCS(inode, INODE)
FSNOTIFY_ITER_FUNCS(parent, PARENT)
FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT)
FSNOTIFY_ITER_FUNCS(sb, SB)

/* Visit every iterator type index from 0 up to FSNOTIFY_ITER_TYPE_COUNT - 1. */
#define fsnotify_foreach_iter_type(type) \
        for (type = 0; type < FSNOTIFY_ITER_TYPE_COUNT; type++)
/*
 * Visit only the iterator types for which fsnotify_iter_mark() returns a
 * mark.  The loop condition is a comma expression: it first lets
 * fsnotify_iter_step() advance @type to the next such slot (storing the mark
 * in @mark) and then tests @type against FSNOTIFY_ITER_TYPE_COUNT.
 */
#define fsnotify_foreach_iter_mark_type(iter, mark, type) \
        for (type = 0; \
             type = fsnotify_iter_step(iter, type, &mark), \
             type < FSNOTIFY_ITER_TYPE_COUNT; \
             type++)

/*
 * Inode/vfsmount/sb point to this structure which tracks all marks attached to
 * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this
 * structure. We destroy this structure when there are no more marks attached
 * to it. The structure is protected by fsnotify_mark_srcu.
 *
 * "[lock]" on a member means it is protected by ->lock of this structure.
 */
struct fsnotify_mark_connector {
        spinlock_t lock;
        /* presumably an enum fsnotify_obj_type value - confirm */
        unsigned char type;        /* Type of object [lock] */
        /* presumably an enum fsnotify_group_prio value - confirm */
        unsigned char prio;        /* Highest priority group */
#define FSNOTIFY_CONN_FLAG_IS_WATCHED        0x01
#define FSNOTIFY_CONN_FLAG_HAS_IREF        0x02
        unsigned short flags;        /* FSNOTIFY_CONN_FLAG_* bits [lock] */
        union {
                /* Object pointer [lock] */
                void *obj;
                /* Links connectors waiting to be freed after the SRCU period expires */
                struct fsnotify_mark_connector *destroy_next;
        };
        /* presumably the head of the list that marks join via ->obj_list - confirm */
        struct hlist_head list;
};

/*
 * Container for per-sb fsnotify state (sb marks and more).
 * Attached lazily on first marked object on the sb and freed when killing sb.
 */
struct fsnotify_sb_info {
        /* connector holding the marks attached to the super block itself */
        struct fsnotify_mark_connector __rcu *sb_marks;
        /*
         * Number of inode/mount/sb objects that are being watched in this sb.
         * Note that inode objects are currently double-accounted.
         *
         * The value in watched_objects[prio] is the number of objects that are
         * watched by groups of priority >= prio, so watched_objects[0] is the
         * total number of watched objects in this sb.
         */
        atomic_long_t watched_objects[__FSNOTIFY_PRIO_NUM];
};

/*
 * Return the fsnotify state attached to @sb, read with READ_ONCE().
 * Without CONFIG_FSNOTIFY this always returns NULL.
 */
static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb)
{
#ifndef CONFIG_FSNOTIFY
        return NULL;
#else
        return READ_ONCE(sb->s_fsnotify_info);
#endif
}

static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb)
{
        return &fsnotify_sb_info(sb)->watched_objects[0];
}

/*
 * A mark is simply an object attached to an in core inode which allows an
 * fsnotify listener to indicate they are either no longer interested in events
 * of a type matching mask or only interested in those events.
 *
 * These are flushed when an inode is evicted from core and may be flushed
 * when the inode is modified (as seen by fsnotify_access).  Some fsnotify
 * users (such as dnotify) will flush these when the open fd is closed and not
 * at inode eviction or modification.
 *
 * Text in brackets is showing the lock(s) protecting modifications of a
 * particular entry. obj_lock means either inode->i_lock or
 * mnt->mnt_root->d_lock depending on the mark type.
 *
 * NOTE(review): none of the members below is annotated with "obj_lock"; the
 * annotations in use are mark->lock, group->mark_mutex, connector->lock and
 * "mark ref", so the obj_lock sentence may be stale.
 */
struct fsnotify_mark {
        /* Mask this mark is for [mark->lock, group->mark_mutex] */
        __u32 mask;
        /* We hold one for presence in g_list. Also one ref for each 'thing'
         * in kernel that found and may be using this mark. */
        refcount_t refcnt;
        /* Group this mark is for. Set on mark creation, stable until last ref
         * is dropped */
        struct fsnotify_group *group;
        /* List of marks by group->marks_list. Also reused for queueing
         * mark into destroy_list when it's waiting for the end of SRCU period
         * before it can be freed. [group->mark_mutex] */
        struct list_head g_list;
        /* Protects inode / mnt pointers, flags, masks */
        spinlock_t lock;
        /* List of marks for inode / vfsmount [connector->lock, mark ref] */
        struct hlist_node obj_list;
        /* Head of list of marks for an object [mark ref] */
        struct fsnotify_mark_connector *connector;
        /* Events types and flags to ignore [mark->lock, group->mark_mutex] */
        __u32 ignore_mask;
        /* General fsnotify mark flags */
#define FSNOTIFY_MARK_FLAG_ALIVE                0x0001
#define FSNOTIFY_MARK_FLAG_ATTACHED                0x0002
        /* inotify mark flags */
#define FSNOTIFY_MARK_FLAG_EXCL_UNLINK                0x0010
#define FSNOTIFY_MARK_FLAG_IN_ONESHOT                0x0020
        /* fanotify mark flags */
#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY        0x0100
#define FSNOTIFY_MARK_FLAG_NO_IREF                0x0200
        /* when set, fsnotify_ignore_mask() returns ignore_mask unmodified */
#define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS        0x0400
#define FSNOTIFY_MARK_FLAG_HAS_FSID                0x0800
#define FSNOTIFY_MARK_FLAG_WEAK_FSID                0x1000
        unsigned int flags;                /* FSNOTIFY_MARK_FLAG_* bits [mark->lock] */
};

#ifdef CONFIG_FSNOTIFY

/* called from the vfs helpers */

/* main fsnotify call to send events */
extern int fsnotify(__u32 mask, const void *data, int data_type,
                    struct inode *dir, const struct qstr *name,
                    struct inode *inode, u32 cookie);
extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
                           int data_type);
extern void __fsnotify_inode_delete(struct inode *inode);
extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
extern void fsnotify_sb_delete(struct super_block *sb);
extern void fsnotify_sb_free(struct super_block *sb);
extern u32 fsnotify_get_cookie(void);

/*
 * Reduce @mask to the events that may be reported with parent/name info.
 *
 * Returns 0 unless FS_EVENT_ON_CHILD is set in @mask (that flag is set on
 * marks that want parent/name info); otherwise returns the bits of @mask that
 * are also in FS_EVENTS_POSS_TO_PARENT.
 */
static inline __u32 fsnotify_parent_needed_mask(__u32 mask)
{
        if (mask & FS_EVENT_ON_CHILD) {
                /*
                 * Some mark may care about parent/name info; keep only the
                 * events that can actually be reported that way.
                 */
                return mask & FS_EVENTS_POSS_TO_PARENT;
        }

        return 0;
}

/*
 * Tell whether @inode is interested in events happening on its children.
 *
 * Returns 0 unless FS_EVENT_ON_CHILD is set in inode->i_fsnotify_mask;
 * otherwise returns the bits of that mask that are also in
 * FS_EVENTS_POSS_ON_CHILD (non-zero when at least one such event is watched).
 */
static inline int fsnotify_inode_watches_children(struct inode *inode)
{
        if (inode->i_fsnotify_mask & FS_EVENT_ON_CHILD) {
                /*
                 * The inode might care about its children; report which of
                 * the events that can occur on a child it watches.
                 */
                return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD;
        }

        return 0;
}

/*
 * Refresh DCACHE_FSNOTIFY_PARENT_WATCHED on @dentry so that it reflects
 * whether the parent directory's inode wants to receive filesystem events
 * that happen to this dentry->d_inode.  The caller must hold dentry->d_lock.
 */
static inline void fsnotify_update_flags(struct dentry *dentry)
{
        assert_spin_locked(&dentry->d_lock);

        /*
         * d_lock serialises every update of PARENT_WATCHED on the dentries.
         * If inotify_inode_watched changes once we hold d_lock, the following
         * __fsnotify_update_child_dentry_flags call will find our entry, spin
         * until we are done here and then update us with the new state.
         */
        if (!fsnotify_inode_watches_children(dentry->d_parent->d_inode)) {
                dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
                return;
        }

        dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
}

/* called from fsnotify listeners, such as fanotify or dnotify */

/* create a new group */
extern struct fsnotify_group *fsnotify_alloc_group(
                                const struct fsnotify_ops *ops,
                                int flags);
/* get reference to a group */
extern void fsnotify_get_group(struct fsnotify_group *group);
/* drop reference on a group from fsnotify_alloc_group */
extern void fsnotify_put_group(struct fsnotify_group *group);
/* group destruction begins, stop queuing new events */
extern void fsnotify_group_stop_queueing(struct fsnotify_group *group);
/* destroy group */
extern void fsnotify_destroy_group(struct fsnotify_group *group);
/* fasync handler function */
extern int fsnotify_fasync(int fd, struct file *file, int on);
/* Free event from memory */
extern void fsnotify_destroy_event(struct fsnotify_group *group,
                                   struct fsnotify_event *event);
/* attach the event to the group notification queue */
extern int fsnotify_insert_event(struct fsnotify_group *group,
                                 struct fsnotify_event *event,
                                 int (*merge)(struct fsnotify_group *,
                                              struct fsnotify_event *),
                                 void (*insert)(struct fsnotify_group *,
                                                struct fsnotify_event *));

/*
 * Attach @event to @group's notification queue without an insert callback:
 * forwards to fsnotify_insert_event() with @merge and a NULL @insert and
 * returns its result unchanged.
 */
static inline int fsnotify_add_event(struct fsnotify_group *group,
                                     struct fsnotify_event *event,
                                     int (*merge)(struct fsnotify_group *,
                                                  struct fsnotify_event *))
{
        return fsnotify_insert_event(group, event, merge, NULL);
}

/*
 * Queue overflow event to a notification group.
 * Queues group->overflow_event with no merge callback; the return value of
 * fsnotify_add_event() is ignored.
 */
static inline void fsnotify_queue_overflow(struct fsnotify_group *group)
{
        fsnotify_add_event(group, group->overflow_event, NULL);
}

/* True when @mask has the FS_Q_OVERFLOW bit set. */
static inline bool fsnotify_is_overflow_event(u32 mask)
{
        return (mask & FS_Q_OVERFLOW) != 0;
}

/*
 * Report whether @group's notification list has no entries.
 *
 * Asserts that group->notification_lock is held before looking at the list.
 */
static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
{
        bool empty;

        assert_spin_locked(&group->notification_lock);
        empty = list_empty(&group->notification_list);

        return empty;
}

/*
 * Notification queue accessors (declarations only).
 *
 * fsnotify_notify_queue_is_empty() is not re-declared here: it is already
 * defined as a static inline in this header, so an additional extern
 * prototype is redundant and only suggests an out-of-line symbol.
 */
/* return, but do not dequeue the first event on the notification queue */
extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group);
/* return AND dequeue the first event on the notification queue */
extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group);
/* Remove event queued in the notification list */
extern void fsnotify_remove_queued_event(struct fsnotify_group *group,
                                         struct fsnotify_event *event);

/* functions used to manipulate the marks attached to inodes */

/*
 * Canonical "ignore mask" including event flags.
 *
 * Unlike the legacy ->ignored_mask, which only said which events to ignore,
 * ->ignore_mask also carries flags describing the type of objects on which
 * events should be ignored.
 *
 * If the mark has FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS set, the stored
 * ->ignore_mask is returned untouched.  Otherwise the legacy behaviour is
 * synthesised:
 * - FS_ISDIR is always set (always ignore events on dir)
 * - FS_EVENT_ON_CHILD is copied from mark->mask (ignore events on child if
 *   parent is watching children)
 */
static inline __u32 fsnotify_ignore_mask(struct fsnotify_mark *mark)
{
        const __u32 stored = mark->ignore_mask;

        if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
                return stored;

        return ((stored | FS_ISDIR) & ~FS_EVENT_ON_CHILD) |
               (mark->mask & FS_EVENT_ON_CHILD);
}

/* Legacy ignored_mask: ->ignore_mask restricted to ALL_FSNOTIFY_EVENTS */
static inline __u32 fsnotify_ignored_events(struct fsnotify_mark *mark)
{
        __u32 events = mark->ignore_mask;

        events &= ALL_FSNOTIFY_EVENTS;

        return events;
}

/*
 * Decide whether @mask (a mask or an ignore mask) applies to an event.
 *
 * The mask is rejected when the victim is a directory and FS_ISDIR is not
 * set in @mask, or when the event is being reported to a watching parent
 * (@iter_type == FSNOTIFY_ITER_TYPE_PARENT) and FS_EVENT_ON_CHILD is not
 * set in @mask.
 */
static inline bool fsnotify_mask_applicable(__u32 mask, bool is_dir,
                                            int iter_type)
{
        const bool dir_ok = !is_dir || (mask & FS_ISDIR);
        const bool child_ok = iter_type != FSNOTIFY_ITER_TYPE_PARENT ||
                              (mask & FS_EVENT_ON_CHILD);

        return dir_ok && child_ok;
}

/*
 * Effective ignore mask for one event, given whether the victim is a
 * directory and whether the event is reported to a watching parent.
 *
 * Returns 0 when the mark ignores no events or when its ignore mask is not
 * applicable to this event; otherwise returns the ignored event bits.
 */
static inline __u32 fsnotify_effective_ignore_mask(struct fsnotify_mark *mark,
                                                   bool is_dir, int iter_type)
{
        const __u32 events = fsnotify_ignored_events(mark);
        __u32 with_flags;

        /*
         * Nothing ignored (events == 0), or a non-dir, non-child event for
         * which the event flags need not be consulted.
         */
        if (!events || (!is_dir && iter_type != FSNOTIFY_ITER_TYPE_PARENT))
                return events;

        with_flags = fsnotify_ignore_mask(mark);

        return fsnotify_mask_applicable(with_flags, is_dir, iter_type) ?
                (with_flags & ALL_FSNOTIFY_EVENTS) : 0;
}

/*
 * Get mask for calculating object interest taking ignore mask into account.
 *
 * With no ignored events this is just mark->mask.  Otherwise the whole
 * ->ignore_mask is folded in (so that fsnotify_parent() notices interest in
 * ignoring events on children), plus FS_MODIFY unless the mark carries
 * FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY, because FS_MODIFY may be needed
 * for clearing the ignore mask.
 */
static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark)
{
        __u32 interest = mark->mask;

        if (fsnotify_ignored_events(mark)) {
                if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
                        interest |= FS_MODIFY;

                interest |= mark->ignore_mask;
        }

        return interest;
}

/* Get mask of events for a list of marks */
extern __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn);
/* Calculate mask of events for a list of marks */
extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn);
extern void fsnotify_init_mark(struct fsnotify_mark *mark,
                               struct fsnotify_group *group);
/*
 * Find mark belonging to given group in the list of marks
 *
 * @obj is an untyped object pointer and @obj_type says what it points to;
 * the inode helper in this header passes a struct inode * together with
 * FSNOTIFY_OBJ_TYPE_INODE.
 */
struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type,
                                         struct fsnotify_group *group);
/*
 * attach the mark to the object
 *
 * Same (@obj, @obj_type) convention as fsnotify_find_mark().  The _locked
 * variant presumably expects the caller to already hold the relevant lock -
 * which lock is not visible from this header.
 */
int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj,
                      unsigned int obj_type, int add_flags);
int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj,
                             unsigned int obj_type, int add_flags);

/*
 * attach the mark to the inode
 *
 * Calls fsnotify_add_mark() with @inode as the object and
 * FSNOTIFY_OBJ_TYPE_INODE as its type; returns that call's result.
 */
static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
                                          struct inode *inode,
                                          int add_flags)
{
        void *obj = inode;

        return fsnotify_add_mark(mark, obj, FSNOTIFY_OBJ_TYPE_INODE, add_flags);
}
/*
 * Inode flavour of fsnotify_add_mark_locked(): passes @inode as the object
 * with type FSNOTIFY_OBJ_TYPE_INODE and returns that call's result.
 */
static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark,
                                                 struct inode *inode,
                                                 int add_flags)
{
        void *obj = inode;

        return fsnotify_add_mark_locked(mark, obj, FSNOTIFY_OBJ_TYPE_INODE,
                                        add_flags);
}

/*
 * Look up @group's mark on @inode by calling fsnotify_find_mark() with
 * object type FSNOTIFY_OBJ_TYPE_INODE; returns whatever that call returns.
 */
static inline struct fsnotify_mark *fsnotify_find_inode_mark(
                                                struct inode *inode,
                                                struct fsnotify_group *group)
{
        struct fsnotify_mark *found;

        found = fsnotify_find_mark(inode, FSNOTIFY_OBJ_TYPE_INODE, group);

        return found;
}

/* given a group and a mark, flag mark to be freed when all references are dropped */
extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
                                  struct fsnotify_group *group);
/* detach mark from inode / mount list, group list, drop inode reference */
extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
/* free mark */
extern void fsnotify_free_mark(struct fsnotify_mark *mark);
/* Wait until all marks queued for destruction are destroyed */
extern void fsnotify_wait_marks_destroyed(void);
/*
 * Clear all of the marks of a group attached to a given object type
 *
 * The inline wrappers in this header call this with
 * FSNOTIFY_OBJ_TYPE_VFSMOUNT, FSNOTIFY_OBJ_TYPE_INODE and
 * FSNOTIFY_OBJ_TYPE_SB as @obj_type.
 */
extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
                                          unsigned int obj_type);
/* run all the marks in a group, and clear all of the vfsmount marks */
static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
{
        const unsigned int obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;

        fsnotify_clear_marks_by_group(group, obj_type);
}
/* run all the marks in a group, and clear all of the inode marks */
static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
{
        const unsigned int obj_type = FSNOTIFY_OBJ_TYPE_INODE;

        fsnotify_clear_marks_by_group(group, obj_type);
}
/* run all the marks in a group, and clear all of the sb marks */
static inline void fsnotify_clear_sb_marks_by_group(struct fsnotify_group *group)
{
        const unsigned int obj_type = FSNOTIFY_OBJ_TYPE_SB;

        fsnotify_clear_marks_by_group(group, obj_type);
}
extern void fsnotify_get_mark(struct fsnotify_mark *mark);
extern void fsnotify_put_mark(struct fsnotify_mark *mark);
extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info);
extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info);

/* Initialise @event's list linkage (event->list) as an empty list head. */
static inline void fsnotify_init_event(struct fsnotify_event *event)
{
        struct list_head *node = &event->list;

        INIT_LIST_HEAD(node);
}

#else

/*
 * Stub from the #else branch of the CONFIG_FSNOTIFY conditional: ignores
 * every argument and returns 0.
 */
static inline int fsnotify(__u32 mask, const void *data, int data_type,
                           struct inode *dir, const struct qstr *name,
                           struct inode *inode, u32 cookie)
{
        return 0;
}

/*
 * Stub from the #else branch of the CONFIG_FSNOTIFY conditional: ignores
 * every argument and returns 0.
 */
static inline int __fsnotify_parent(struct dentry *dentry, __u32 mask,
                                  const void *data, int data_type)
{
        return 0;
}

/* Stub from the #else branch of the CONFIG_FSNOTIFY conditional: no-op. */
static inline void __fsnotify_inode_delete(struct inode *inode)
{}

/* Stub from the #else branch of the CONFIG_FSNOTIFY conditional: no-op. */
static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
{}

/* Stub from the #else branch of the CONFIG_FSNOTIFY conditional: no-op. */
static inline void fsnotify_sb_delete(struct super_block *sb)
{}

/* Stub from the #else branch of the CONFIG_FSNOTIFY conditional: no-op. */
static inline void fsnotify_sb_free(struct super_block *sb)
{}

/* Stub from the #else branch of the CONFIG_FSNOTIFY conditional: no-op. */
static inline void fsnotify_update_flags(struct dentry *dentry)
{}

/*
 * Stub from the #else branch of the CONFIG_FSNOTIFY conditional: always
 * returns 0 as the cookie.
 */
static inline u32 fsnotify_get_cookie(void)
{
        return 0;
}

/* Stub from the #else branch of the CONFIG_FSNOTIFY conditional: no-op. */
static inline void fsnotify_unmount_inodes(struct super_block *sb)
{}

#endif        /* CONFIG_FSNOTIFY */

#endif        /* __KERNEL__ */

#endif        /* __LINUX_FSNOTIFY_BACKEND_H */





















































































































































































































































































































































































































































































    2 


    2 










    2 







    2 




    2 


    2 


    2 





















    1 


















    2 












    2 








































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the IP module.
 *
 * Version:        @(#)ip.h        1.0.2        05/07/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Changes:
 *                Mike McLagan    :       Routing by source
 */
#ifndef _IP_H
#define _IP_H

#include <linux/types.h>
#include <linux/ip.h>
#include <linux/in.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/sockptr.h>
#include <linux/static_key.h>

#include <net/inet_sock.h>
#include <net/route.h>
#include <net/snmp.h>
#include <net/flow.h>
#include <net/flow_dissector.h>
#include <net/netns/hash.h>
#include <net/lwtunnel.h>

#define IPV4_MAX_PMTU                65535U                /* RFC 2675, Section 5.1 */
#define IPV4_MIN_MTU                68                        /* RFC 791 */

extern unsigned int sysctl_fib_sync_mem;
extern unsigned int sysctl_fib_sync_mem_min;
extern unsigned int sysctl_fib_sync_mem_max;

struct sock;

/*
 * IPv4 per-packet control data.  IPCB() overlays this structure on the
 * skb->cb area.
 */
struct inet_skb_parm {
        /* returned by inet_sdif() when IPSKB_L3SLAVE is set in ->flags */
        int                        iif;
        struct ip_options        opt;                /* Compiled IP options                */
        /* bitmask of the IPSKB_* values defined below */
        u16                        flags;

#define IPSKB_FORWARDED                BIT(0)
#define IPSKB_XFRM_TUNNEL_SIZE        BIT(1)
#define IPSKB_XFRM_TRANSFORMED        BIT(2)
#define IPSKB_FRAG_COMPLETE        BIT(3)
#define IPSKB_REROUTED                BIT(4)
#define IPSKB_DOREDIRECT        BIT(5)
#define IPSKB_FRAG_PMTU                BIT(6)
#define IPSKB_L3SLAVE                BIT(7)
#define IPSKB_NOPOLICY                BIT(8)
#define IPSKB_MULTIPATH                BIT(9)

        u16                        frag_max_size;
};

/* True when the IPSKB_L3SLAVE bit is set in @flags. */
static inline bool ipv4_l3mdev_skb(u16 flags)
{
        return (flags & IPSKB_L3SLAVE) != 0;
}

static inline unsigned int ip_hdrlen(const struct sk_buff *skb)
{
        return ip_hdr(skb)->ihl * 4;
}

/*
 * Per-send parameters.  ipcm_init() zeroes every field except @tos, which
 * it sets to -1; get_rttos() treats tos == -1 as "not set" and falls back
 * to the socket's tos.  ipcm_init_sk() additionally seeds sockc.mark,
 * sockc.tsflags, oif, addr and protocol from the socket.
 */
struct ipcm_cookie {
        struct sockcm_cookie        sockc;
        __be32                        addr;
        int                        oif;
        struct ip_options_rcu        *opt;
        __u8                        protocol;
        __u8                        ttl;
        __s16                        tos;
        char                        priority;
        __u16                        gso_size;
};

static inline void ipcm_init(struct ipcm_cookie *ipcm)
{
        *ipcm = (struct ipcm_cookie) { .tos = -1 };
}

static inline void ipcm_init_sk(struct ipcm_cookie *ipcm,
                                const struct inet_sock *inet)
{
        ipcm_init(ipcm);

        ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark);
        ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags);
        ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if);
        ipcm->addr = inet->inet_saddr;
        ipcm->protocol = inet->inet_num;
}

/*
 * Two views of the same skb->cb storage: IPCB() interprets it as a
 * struct inet_skb_parm, PKTINFO_SKB_CB() as a struct in_pktinfo.
 */
#define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
#define PKTINFO_SKB_CB(skb) ((struct in_pktinfo *)((skb)->cb))

/*
 * return enslaved device index if relevant
 *
 * Yields IPCB(skb)->iif when CONFIG_NET_L3_MASTER_DEV is enabled, @skb is
 * non-NULL and its IPSKB_L3SLAVE flag is set; 0 in every other case.
 */
static inline int inet_sdif(const struct sk_buff *skb)
{
        int sdif = 0;

#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        if (skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
                sdif = IPCB(skb)->iif;
#endif
        return sdif;
}

/* Special input handler for packets caught by the router alert option.
   They are selected only by the protocol field and then processed like
   local ones, but only if someone wants them! Otherwise, a router
   not running rsvpd would kill RSVP.

   What user level does with them is its own problem.
   I have no idea how it will masquerade or NAT them (it is a joke, joke :-)),
   but the receiver should be clever enough, e.g., to forward mtrace requests
   sent to a multicast group so that they reach the destination designated router.
 */

/*
 * One entry of a singly linked, RCU-annotated chain (->next is __rcu and
 * the entry embeds an rcu_head).  Each entry refers to a socket.
 */
struct ip_ra_chain {
        struct ip_ra_chain __rcu *next;
        struct sock                *sk;
        /*
         * destructor and saved_sk share storage; which member is valid at
         * a given time is decided by code outside this header.
         */
        union {
                void                        (*destructor)(struct sock *);
                struct sock                *saved_sk;
        };
        struct rcu_head                rcu;
};

/*
 * IP flags.
 *
 * Host-order constants: code in this header converts them with htons()
 * before testing them against iph->frag_off (see ip_is_fragment()).
 */
#define IP_CE                0x8000                /* Flag: "Congestion"                */
#define IP_DF                0x4000                /* Flag: "Don't Fragment"        */
#define IP_MF                0x2000                /* Flag: "More Fragments"        */
#define IP_OFFSET        0x1FFF                /* "Fragment Offset" part        */

/* expressed in jiffies (multiple of HZ) */
#define IP_FRAG_TIME        (30 * HZ)                /* fragment lifetime        */

struct msghdr;
struct net_device;
struct packet_type;
struct rtable;
struct sockaddr;

int igmp_mc_init(void);

/*
 *        Functions provided by ip.c
 */

int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
                          __be32 saddr, __be32 daddr,
                          struct ip_options_rcu *opt, u8 tos);
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
           struct net_device *orig_dev);
void ip_list_rcv(struct list_head *head, struct packet_type *pt,
                 struct net_device *orig_dev);
int ip_local_deliver(struct sk_buff *skb);
void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto);
int ip_mr_input(struct sk_buff *skb);
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb);
/*
 * @output is a caller-supplied callback taking (net, sk, skb) - the same
 * parameter list as ip_output() and ip_mc_output() above.
 */
int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                   int (*output)(struct net *, struct sock *, struct sk_buff *));

/*
 * Iterator state shared by ip_fraglist_init(), ip_fraglist_prepare() and
 * ip_fraglist_next().
 */
struct ip_fraglist_iter {
        /* skb that the next ip_fraglist_next() call will hand out */
        struct sk_buff        *frag;
        struct iphdr        *iph;
        int                offset;
        unsigned int        hlen;
};

void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
                      unsigned int hlen, struct ip_fraglist_iter *iter);
void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter);

/*
 * Hand out the current fragment of @iter and advance the iterator.
 *
 * iter->frag is moved on to the returned skb's ->next, and the returned skb
 * is passed through skb_mark_not_on_list().  iter->frag is dereferenced
 * without a NULL check.
 */
static inline struct sk_buff *ip_fraglist_next(struct ip_fraglist_iter *iter)
{
        struct sk_buff *current_frag = iter->frag;

        iter->frag = current_frag->next;
        skb_mark_not_on_list(current_frag);

        return current_frag;
}

/*
 * Fragmentation state filled in by ip_frag_init() and consumed by
 * ip_frag_next().
 *
 * DF, hlen, ll_rs and mtu have the same names and types as ip_frag_init()
 * parameters - presumably they are stored from them (TODO confirm in the
 * implementation).
 */
struct ip_frag_state {
        bool                DF;
        unsigned int        hlen;
        unsigned int        ll_rs;
        unsigned int        mtu;
        unsigned int        left;
        int                offset;
        int                ptr;
        __be16                not_last_frag;
};

void ip_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int ll_rs,
                  unsigned int mtu, bool DF, struct ip_frag_state *state);
struct sk_buff *ip_frag_next(struct sk_buff *skb,
                             struct ip_frag_state *state);

void ip_send_check(struct iphdr *ip);
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);

int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
                    __u8 tos);
void ip_init(void);
/*
 * @getfrag is a caller-supplied callback; ip_generic_getfrag() declared
 * below has a matching signature.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int len, int protolen,
                   struct ipcm_cookie *ipc,
                   struct rtable **rt,
                   unsigned int flags);
int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd,
                       struct sk_buff *skb);
/* ip_finish_skb() calls this with the socket's own write queue and cork */
struct sk_buff *__ip_make_skb(struct sock *sk, struct flowi4 *fl4,
                              struct sk_buff_head *queue,
                              struct inet_cork *cork);
int ip_send_skb(struct net *net, struct sk_buff *skb);
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4);
void ip_flush_pending_frames(struct sock *sk);
/* takes the same kind of @getfrag callback as ip_append_data() */
struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4,
                            int getfrag(void *from, char *to, int offset,
                                        int len, int odd, struct sk_buff *skb),
                            void *from, int length, int transhdrlen,
                            struct ipcm_cookie *ipc, struct rtable **rtp,
                            struct inet_cork *cork, unsigned int flags);

int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl);

/*
 * Call __ip_make_skb() on @sk's own write queue (sk->sk_write_queue) and
 * its inet cork (inet_sk(sk)->cork.base); returns that call's result.
 */
static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4)
{
        struct sk_buff_head *queue = &sk->sk_write_queue;
        struct inet_cork *cork = &inet_sk(sk)->cork.base;

        return __ip_make_skb(sk, fl4, queue, cork);
}

/*
 * Get the route scope that should be used when sending a packet.
 *
 * RT_SCOPE_LINK is chosen when the socket has SOCK_LOCALROUTE set, when
 * @msg carries MSG_DONTROUTE, or when @ipc has options with is_strictroute
 * set; otherwise RT_SCOPE_UNIVERSE.
 */
static inline u8 ip_sendmsg_scope(const struct inet_sock *inet,
                                  const struct ipcm_cookie *ipc,
                                  const struct msghdr *msg)
{
        if (sock_flag(&inet->sk, SOCK_LOCALROUTE))
                return RT_SCOPE_LINK;

        if (msg->msg_flags & MSG_DONTROUTE)
                return RT_SCOPE_LINK;

        if (ipc->opt && ipc->opt->opt.is_strictroute)
                return RT_SCOPE_LINK;

        return RT_SCOPE_UNIVERSE;
}

/*
 * Routing TOS for a send: RT_TOS() of ipc->tos when it has been set
 * (anything other than -1), otherwise RT_TOS() of the socket's tos, which
 * is read with READ_ONCE().
 */
static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet)
{
        if (ipc->tos != -1)
                return RT_TOS(ipc->tos);

        return RT_TOS(READ_ONCE(inet->tos));
}

/* datagram.c */
int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);

void ip4_datagram_release_cb(struct sock *sk);

/* Argument bundle taken by ip_send_unicast_reply(). */
struct ip_reply_arg {
        struct kvec iov[1];
        /* IP_REPLY_ARG_* bits; tested by ip_reply_arg_flowi_flags() */
        int            flags;
        __wsum             csum;
        int            csumoffset; /* u16 offset of csum in iov[0].iov_base */
                                /* -1 if not needed */
        int            bound_dev_if;
        u8              tos;
        kuid_t            uid;
};

/* ->flags bit: makes ip_reply_arg_flowi_flags() return FLOWI_FLAG_ANYSRC */
#define IP_REPLY_ARG_NOSRCCHECK 1

/*
 * Flow flags for a reply: FLOWI_FLAG_ANYSRC when @arg has
 * IP_REPLY_ARG_NOSRCCHECK set, 0 otherwise.
 */
static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
{
        if (arg->flags & IP_REPLY_ARG_NOSRCCHECK)
                return FLOWI_FLAG_ANYSRC;

        return 0;
}

void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
                           const struct ip_options *sopt,
                           __be32 daddr, __be32 saddr,
                           const struct ip_reply_arg *arg,
                           unsigned int len, u64 transmit_time, u32 txhash);

/*
 * Statistics helpers.  The IP_* forms act on (net)->mib.ip_statistics via
 * the *_STATS64 SNMP macros; the NET_* forms act on
 * (net)->mib.net_statistics via the plain SNMP macros.  Each has a
 * double-underscore twin that maps to the corresponding __SNMP_* macro.
 */
#define IP_INC_STATS(net, field)        SNMP_INC_STATS64((net)->mib.ip_statistics, field)
#define __IP_INC_STATS(net, field)        __SNMP_INC_STATS64((net)->mib.ip_statistics, field)
#define IP_ADD_STATS(net, field, val)        SNMP_ADD_STATS64((net)->mib.ip_statistics, field, val)
#define __IP_ADD_STATS(net, field, val) __SNMP_ADD_STATS64((net)->mib.ip_statistics, field, val)
#define IP_UPD_PO_STATS(net, field, val) SNMP_UPD_PO_STATS64((net)->mib.ip_statistics, field, val)
#define __IP_UPD_PO_STATS(net, field, val) __SNMP_UPD_PO_STATS64((net)->mib.ip_statistics, field, val)
#define NET_INC_STATS(net, field)        SNMP_INC_STATS((net)->mib.net_statistics, field)
#define __NET_INC_STATS(net, field)        __SNMP_INC_STATS((net)->mib.net_statistics, field)
#define NET_ADD_STATS(net, field, adnd)        SNMP_ADD_STATS((net)->mib.net_statistics, field, adnd)
#define __NET_ADD_STATS(net, field, adnd) __SNMP_ADD_STATS((net)->mib.net_statistics, field, adnd)

/*
 * Read one counter from @cpu's copy of the per-cpu block @mib.
 *
 * The block is viewed as an array of unsigned long and element @offt is
 * returned (widened to u64).
 */
static inline u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt)
{
        unsigned long *fields = (unsigned long *)per_cpu_ptr(mib, cpu);

        return fields[offt];
}

unsigned long snmp_fold_field(void __percpu *mib, int offt);
/*
 * When BITS_PER_LONG is 32 the 64-bit accessors are out-of-line functions
 * that take an extra syncp offset; in the other branch they are inline
 * wrappers that ignore that argument.
 */
#if BITS_PER_LONG==32
u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct,
                         size_t syncp_offset);
u64 snmp_fold_field64(void __percpu *mib, int offt, size_t sync_off);
#else
/*
 * Variant used when BITS_PER_LONG is not 32: forwards to
 * snmp_get_cpu_field(); @syncp_offset is accepted but unused.
 */
static inline u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct,
                                       size_t syncp_offset)
{
        u64 value = snmp_get_cpu_field(mib, cpu, offct);

        return value;
}

/*
 * Variant used when BITS_PER_LONG is not 32: forwards to
 * snmp_fold_field(); @syncp_off is accepted but unused.
 */
static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_off)
{
        u64 folded = snmp_fold_field(mib, offt);

        return folded;
}
#endif

/*
 * For every possible CPU, walk @stats_list until an entry whose .name is
 * zero/NULL and add snmp_get_cpu_field64(@mib_statistic, cpu,
 * stats_list[i].entry, @offset) into buff64[i].
 *
 * Values are accumulated with +=, so the caller supplies the starting
 * contents of @buff64.  The expansion is a bare { } block that declares its
 * own locals named i and c; macro arguments are used unparenthesised and
 * are evaluated on every iteration.
 */
#define snmp_get_cpu_field64_batch(buff64, stats_list, mib_statistic, offset) \
{ \
        int i, c; \
        for_each_possible_cpu(c) { \
                for (i = 0; stats_list[i].name; i++) \
                        buff64[i] += snmp_get_cpu_field64( \
                                        mib_statistic, \
                                        c, stats_list[i].entry, \
                                        offset); \
        } \
}

/*
 * For every possible CPU, walk @stats_list until an entry whose .name is
 * zero/NULL and add snmp_get_cpu_field(@mib_statistic, cpu,
 * stats_list[i].entry) into buff[i].
 *
 * Values are accumulated with +=, so the caller supplies the starting
 * contents of @buff.  The expansion is a bare { } block that declares its
 * own locals named i and c; macro arguments are used unparenthesised and
 * are evaluated on every iteration.
 */
#define snmp_get_cpu_field_batch(buff, stats_list, mib_statistic) \
{ \
        int i, c; \
        for_each_possible_cpu(c) { \
                for (i = 0; stats_list[i].name; i++) \
                        buff[i] += snmp_get_cpu_field( \
                                                mib_statistic, \
                                                c, stats_list[i].entry); \
        } \
}

/*
 * Unpack @net's local port range.
 *
 * net->ipv4.ip_local_ports.range is read once with READ_ONCE(); its low
 * 16 bits are stored to *@low and its upper bits (value >> 16) to *@high.
 */
static inline void inet_get_local_port_range(const struct net *net, int *low, int *high)
{
        const u32 packed = READ_ONCE(net->ipv4.ip_local_ports.range);
        const u32 lo = packed & 0xffff;
        const u32 hi = packed >> 16;

        *low = lo;
        *high = hi;
}
bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);

#ifdef CONFIG_SYSCTL
/*
 * True when @port's bit is set in net->ipv4.sysctl_local_reserved_ports;
 * false when that bitmap pointer is NULL.
 */
static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port)
{
        return net->ipv4.sysctl_local_reserved_ports &&
               test_bit(port, net->ipv4.sysctl_local_reserved_ports);
}

/*
 * A device name is allowed unless it is exactly "default" or exactly "all"
 * (case-sensitive, whole-string comparison).
 */
static inline bool sysctl_dev_name_is_allowed(const char *name)
{
        if (!strcmp(name, "default"))
                return false;

        return strcmp(name, "all") != 0;
}

/*
 * True when @port is below net->ipv4.sysctl_ip_prot_sock (read with
 * READ_ONCE()).
 */
static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port)
{
        return READ_ONCE(net->ipv4.sysctl_ip_prot_sock) > port;
}

#else
/* Variant for builds without CONFIG_SYSCTL: no port is ever reported as reserved. */
static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port)
{
        return false;
}

/*
 * Variant for builds without CONFIG_SYSCTL: the threshold is the
 * compile-time constant PROT_SOCK; @net is unused.
 */
static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port)
{
        return PROT_SOCK > port;
}
#endif

__be32 inet_current_timestamp(void);

/* From inetpeer.c */
extern int inet_peer_threshold;
extern int inet_peer_minttl;
extern int inet_peer_maxttl;

void ipfrag_init(void);

void ip_static_sysctl_init(void);

/*
 * Evaluates to @mark when (net)->ipv4.sysctl_fwmark_reflect, read with
 * READ_ONCE(), is non-zero; otherwise evaluates to 0.
 */
#define IP4_REPLY_MARK(net, mark) \
        (READ_ONCE((net)->ipv4.sysctl_fwmark_reflect) ? (mark) : 0)

static inline bool ip_is_fragment(const struct iphdr *iph)
{
        return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0;
}

#ifdef CONFIG_INET
#include <net/dst.h>

/* The function in 2.2 was invalid, producing wrong result for
 * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */
/*
 * Decrement iph->ttl and patch iph->check in place rather than recomputing
 * it.  Returns the TTL value after the decrement.
 */
static inline
int ip_decrease_ttl(struct iphdr *iph)
{
        u32 check = (__force u32)iph->check;
        /*
         * Add htons(0x0100) to the stored checksum - presumably the
         * contribution of one TTL unit within its 16-bit header word
         * (TODO confirm against the struct iphdr layout).
         */
        check += (__force u32)htons(0x0100);
        /*
         * (check>=0xFFFF) adds 1 when the sum reached 0xFFFF or more; the
         * cast then narrows the result back to the checksum type.
         */
        iph->check = (__force __sum16)(check + (check>=0xFFFF));
        return --iph->ttl;
}

static inline int ip_mtu_locked(const struct dst_entry *dst)
{
        const struct rtable *rt = dst_rtable(dst);

        return rt->rt_mtu_locked || dst_metric_locked(dst, RTAX_MTU);
}

/*
 * Returns 1 when the socket's pmtudisc setting (read once with READ_ONCE())
 * is IP_PMTUDISC_DO, or is IP_PMTUDISC_WANT while the MTU of @dst is not
 * locked; returns 0 otherwise.
 */
static inline
int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
{
        const u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc);

        if (pmtudisc == IP_PMTUDISC_DO)
                return 1;

        if (pmtudisc != IP_PMTUDISC_WANT)
                return 0;

        return !ip_mtu_locked(dst);
}

/*
 * False when the socket's pmtudisc setting is IP_PMTUDISC_INTERFACE or
 * IP_PMTUDISC_OMIT, true for every other value.
 */
static inline bool ip_sk_accept_pmtu(const struct sock *sk)
{
        const u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc);

        return !(pmtudisc == IP_PMTUDISC_INTERFACE ||
                 pmtudisc == IP_PMTUDISC_OMIT);
}

/* True when the socket's pmtudisc setting is below IP_PMTUDISC_PROBE. */
static inline bool ip_sk_use_pmtu(const struct sock *sk)
{
        return IP_PMTUDISC_PROBE > READ_ONCE(inet_sk(sk)->pmtudisc);
}

/*
 * True when the socket's pmtudisc setting is IP_PMTUDISC_OMIT or is below
 * IP_PMTUDISC_DO.
 */
static inline bool ip_sk_ignore_df(const struct sock *sk)
{
        const u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc);

        if (pmtudisc == IP_PMTUDISC_OMIT)
                return true;

        return pmtudisc < IP_PMTUDISC_DO;
}

/*
 * Pick the MTU to use for @dst, optionally on behalf of forwarding.
 *
 * Candidates are tried in this order:
 *  1. rt->rt_pmtu, but only when the ip_fwd_use_pmtu sysctl is set, the MTU
 *     is locked, or @forwarding is false - and only while the value is
 *     non-zero and jiffies is still before rt->dst.expires;
 *  2. the raw RTAX_MTU route metric, when non-zero;
 *  3. the device MTU, capped at 576 when the MTU is locked and the route
 *     uses a gateway.
 * The chosen value is clamped to IP_MAX_MTU and reduced by the lwtunnel
 * headroom reported for dst->lwtstate.
 */
static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
                                                    bool forwarding)
{
        const struct rtable *rt = dst_rtable(dst);
        struct net *net = dev_net(dst->dev);
        unsigned int mtu;

        if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) ||
            ip_mtu_locked(dst) ||
            !forwarding) {
                mtu = rt->rt_pmtu;
                if (mtu && time_before(jiffies, rt->dst.expires))
                        goto out;
        }

        /* 'forwarding = true' case should always honour route mtu */
        mtu = dst_metric_raw(dst, RTAX_MTU);
        if (mtu)
                goto out;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

out:
        /* common exit: clamp, then account for lwtunnel headroom */
        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

/*
 * MTU to apply to @skb on its attached dst.
 *
 * For a full socket whose pmtudisc setting makes ip_sk_use_pmtu() false,
 * the device MTU (capped at IP_MAX_MTU) minus the lwtunnel headroom is
 * used.  In every other case (no socket, not a full socket, or the socket
 * uses PMTU) the decision is delegated to ip_dst_mtu_maybe_forward(), with
 * the forwarding hint taken from the skb's IPSKB_FORWARDED flag.
 */
static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
                                          const struct sk_buff *skb)
{
        unsigned int mtu;

        if (sk && sk_fullsock(sk) && !ip_sk_use_pmtu(sk)) {
                mtu = min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
                return mtu - lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu);
        }

        return ip_dst_mtu_maybe_forward(skb_dst(skb),
                                        IPCB(skb)->flags & IPSKB_FORWARDED);
}

struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len,
                                        struct netlink_ext_ack *extack);
/*
 * Drop one reference on @fib_metrics and kfree() it when that was the last
 * one.  The shared dst_default_metrics object is never touched.
 */
static inline void ip_fib_metrics_put(struct dst_metrics *fib_metrics)
{
        if (fib_metrics == &dst_default_metrics)
                return;

        if (refcount_dec_and_test(&fib_metrics->refcnt))
                kfree(fib_metrics);
}

/*
 * ipv4 and ipv6 both use refcounted metrics if it is not the default.
 *
 * Points @dst at fib_metrics->metrics (read-only) and, unless @fib_metrics
 * is dst_default_metrics, marks the dst as DST_METRICS_REFCOUNTED and takes
 * a reference on @fib_metrics.
 */
static inline
void ip_dst_init_metrics(struct dst_entry *dst, struct dst_metrics *fib_metrics)
{
        dst_init_metrics(dst, fib_metrics->metrics, true);

        if (fib_metrics == &dst_default_metrics)
                return;

        dst->_metrics |= DST_METRICS_REFCOUNTED;
        refcount_inc(&fib_metrics->refcnt);
}

/*
 * Drop the reference @dst holds on its metrics block (DST_METRICS_PTR(dst))
 * and kfree() the block when that was the last reference.  Nothing happens
 * when the block is dst_default_metrics.
 */
static inline
void ip_dst_metrics_put(struct dst_entry *dst)
{
        struct dst_metrics *metrics = (struct dst_metrics *)DST_METRICS_PTR(dst);

        if (metrics == &dst_default_metrics)
                return;

        if (refcount_dec_and_test(&metrics->refcnt))
                kfree(metrics);
}

/*
 * Out-of-line IP ID generator.  ip_select_ident_segs() falls back to it
 * when there is no socket with a destination address and the packet is not
 * a DF packet that honours DF.
 */
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs);

static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb,
                                        struct sock *sk, int segs)
{
        struct iphdr *iph = ip_hdr(skb);

        /* We had many attacks based on IPID, use the private
         * generator as much as we can.
         */
        if (sk && inet_sk(sk)->inet_daddr) {
                int val;

                /* avoid atomic operations for TCP,
                 * as we hold socket lock at this point.
                 */
                if (sk_is_tcp(sk)) {
                        sock_owned_by_me(sk);
                        val = atomic_read(&inet_sk(sk)->inet_id);
                        atomic_set(&inet_sk(sk)->inet_id, val + segs);
                } else {
                        val = atomic_add_return(segs, &inet_sk(sk)->inet_id);
                }
                iph->id = htons(val);
                return;
        }
        if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) {
                iph->id = 0;
        } else {
                /* Unfortunately we need the big hammer to get a suitable IPID */
                __ip_select_ident(net, iph, segs);
        }
}

/* Convenience wrapper: ip_select_ident_segs() with a segment count of 1. */
static inline void ip_select_ident(struct net *net, struct sk_buff *skb,
                                   struct sock *sk)
{
        ip_select_ident_segs(net, skb, sk, 1);
}

static inline __wsum inet_compute_pseudo(struct sk_buff *skb, int proto)
{
        return csum_tcpudp_nofold(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                                  skb->len, proto, 0);
}

/*
 * Copy the IPv4 source and destination addresses from @iph into @flow and
 * mark the flow as carrying IPv4 addresses.
 *
 * Equivalent to:
 *        flow->addrs.v4addrs.src = iph->saddr;
 *        flow->addrs.v4addrs.dst = iph->daddr;
 * but done as one memcpy(), possibly using a 64bit load/store.
 */
static inline void iph_to_flow_copy_v4addrs(struct flow_keys *flow,
                                            const struct iphdr *iph)
{
        /* The single copy below is only valid if dst directly follows src. */
        BUILD_BUG_ON(offsetof(typeof(flow->addrs), v4addrs.src) +
                     sizeof(flow->addrs.v4addrs.src) !=
                     offsetof(typeof(flow->addrs), v4addrs.dst));

        flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
        memcpy(&flow->addrs.v4addrs, &iph->addrs, sizeof(flow->addrs.v4addrs));
}

/*
 *        Map a multicast IP onto multicast MAC for type ethernet.
 *
 *        @buf receives six bytes: the fixed prefix 01:00:5e followed by the
 *        low 23 bits of the address (taken in host byte order).
 */
static inline void ip_eth_mc_map(__be32 naddr, char *buf)
{
        const __u32 host_addr = ntohl(naddr);

        buf[0] = 0x01;
        buf[1] = 0x00;
        buf[2] = 0x5e;
        buf[3] = (host_addr >> 16) & 0x7F;
        buf[4] = (host_addr >> 8) & 0xFF;
        buf[5] = host_addr & 0xFF;
}

/*
 *        Map a multicast IP onto multicast MAC for type IP-over-InfiniBand.
 *
 *        Writes 20 bytes to @buf.  The scope nibble comes from broadcast[5],
 *        the two P_Key bytes are copied from broadcast[8..9], and the low
 *        28 bits of the address (host byte order) land in bytes 16..19.
 */
static inline void ip_ib_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf)
{
        const unsigned char scope = broadcast[5] & 0xF;
        const __u32 addr = ntohl(naddr);

        buf[0]  = 0;                        /* Reserved */
        buf[1]  = 0xff;                     /* Multicast QPN */
        buf[2]  = 0xff;
        buf[3]  = 0xff;

        buf[4]  = 0xff;
        buf[5]  = 0x10 | scope;             /* scope from broadcast address */
        buf[6]  = 0x40;                     /* IPv4 signature */
        buf[7]  = 0x1b;
        buf[8]  = broadcast[8];             /* P_Key */
        buf[9]  = broadcast[9];

        memset(&buf[10], 0, 6);             /* bytes 10..15 are zero */

        buf[16] = (addr >> 24) & 0x0f;
        buf[17] = (addr >> 16) & 0xff;
        buf[18] = (addr >> 8) & 0xff;
        buf[19] = addr & 0xff;
}

/*
 * Fill the 4-byte @buf for an IP-over-GRE device: use the first four bytes
 * of @broadcast when any of them is non-zero, otherwise use @naddr itself
 * (copied as-is, i.e. still in network byte order).
 */
static inline void ip_ipgre_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf)
{
        const bool bcast_unset = !broadcast[0] && !broadcast[1] &&
                                 !broadcast[2] && !broadcast[3];

        if (bcast_unset)
                memcpy(buf, &naddr, sizeof(naddr));
        else
                memcpy(buf, broadcast, 4);
}

#if IS_ENABLED(CONFIG_IPV6)
#include <linux/ipv6.h>
#endif

/*
 * Clear the source addresses recorded on @sk: the IPv4 inet_saddr and
 * inet_rcv_saddr always, and for PF_INET6 sockets (when IPv6 is enabled)
 * also the IPv6 saddr and sk_v6_rcv_saddr.
 */
static __inline__ void inet_reset_saddr(struct sock *sk)
{
        inet_sk(sk)->inet_saddr = 0;
        inet_sk(sk)->inet_rcv_saddr = 0;
#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == PF_INET6) {
                struct ipv6_pinfo *pinfo = inet6_sk(sk);

                memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr));
                memset(&pinfo->saddr, 0, sizeof(pinfo->saddr));
        }
#endif
}

#endif

/*
 * Hash of an IPv4 address: simply the raw __be32 value reinterpreted as an
 * unsigned int (no byte swapping, no mixing).
 */
static inline unsigned int ipv4_addr_hash(__be32 ip)
{
        return (__force unsigned int) ip;
}

/*
 * Hash an (address, port) pair within @net: the address is run through
 * jhash_1word() seeded with net_hash_mix(@net), and @port is XORed into the
 * result.
 */
static inline u32 ipv4_portaddr_hash(const struct net *net,
                                     __be32 saddr,
                                     unsigned int port)
{
        u32 addr_hash = jhash_1word((__force u32)saddr, net_hash_mix(net));

        return addr_hash ^ port;
}

bool ip_call_ra_chain(struct sk_buff *skb);

/*
 *        Functions provided by ip_fragment.c
 */

/*
 * Identifiers passed as the 'user' argument of ip_defrag() and
 * ip_check_defrag().
 *
 * Each of the three CONNTRACK entries is followed by a marker set to
 * "base + USHRT_MAX", so every one of them reserves a contiguous range of
 * USHRT_MAX + 1 values; the next enumerator starts right after that range.
 */
enum ip_defrag_users {
        IP_DEFRAG_LOCAL_DELIVER,
        IP_DEFRAG_CALL_RA_CHAIN,
        /* start of the CONNTRACK_IN range */
        IP_DEFRAG_CONNTRACK_IN,
        __IP_DEFRAG_CONNTRACK_IN_END        = IP_DEFRAG_CONNTRACK_IN + USHRT_MAX,
        /* start of the CONNTRACK_OUT range */
        IP_DEFRAG_CONNTRACK_OUT,
        __IP_DEFRAG_CONNTRACK_OUT_END        = IP_DEFRAG_CONNTRACK_OUT + USHRT_MAX,
        /* start of the CONNTRACK_BRIDGE_IN range */
        IP_DEFRAG_CONNTRACK_BRIDGE_IN,
        /* NOTE(review): end-of-range marker, but named without the "_END"
         * suffix its two siblings carry.
         */
        __IP_DEFRAG_CONNTRACK_BRIDGE_IN = IP_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
        IP_DEFRAG_VS_IN,
        IP_DEFRAG_VS_OUT,
        IP_DEFRAG_VS_FWD,
        IP_DEFRAG_AF_PACKET,
        IP_DEFRAG_MACVLAN,
};

/* Return true when 'lower_bond' <= 'user' <= 'upper_bond', i.e. when 'user'
 * lies inside the closed range delimited by the two bounds.
 */
static inline bool ip_defrag_user_in_between(u32 user,
                                             enum ip_defrag_users lower_bond,
                                             enum ip_defrag_users upper_bond)
{
        if (user < lower_bond)
                return false;

        return user <= upper_bond;
}

int ip_defrag(struct net *net, struct sk_buff *skb, u32 user);
#ifdef CONFIG_INET
struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user);
#else
/* !CONFIG_INET stub: hands @skb back unchanged; @net and @user are ignored. */
static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
        return skb;
}
#endif

/*
 *        Functions provided by ip_forward.c
 */

int ip_forward(struct sk_buff *skb);

/*
 *        Functions provided by ip_options.c
 */

void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
                      __be32 daddr, struct rtable *rt);

int __ip_options_echo(struct net *net, struct ip_options *dopt,
                      struct sk_buff *skb, const struct ip_options *sopt);
/*
 * Wrapper around __ip_options_echo() that uses the options stored in the
 * control block of @skb as the source options.
 */
static inline int ip_options_echo(struct net *net, struct ip_options *dopt,
                                  struct sk_buff *skb)
{
        const struct ip_options *sopt = &IPCB(skb)->opt;

        return __ip_options_echo(net, dopt, skb, sopt);
}

void ip_options_fragment(struct sk_buff *skb);
int __ip_options_compile(struct net *net, struct ip_options *opt,
                         struct sk_buff *skb, __be32 *info);
int ip_options_compile(struct net *net, struct ip_options *opt,
                       struct sk_buff *skb);
int ip_options_get(struct net *net, struct ip_options_rcu **optp,
                   sockptr_t data, int optlen);
void ip_options_undo(struct ip_options *opt);
void ip_forward_options(struct sk_buff *skb);
int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev);

/*
 *        Functions provided by ip_sockglue.c
 */

void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb, bool drop_dst);
void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
                         struct sk_buff *skb, int tlen, int offset);
int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
                 struct ipcm_cookie *ipc, bool allow_ipv6);
DECLARE_STATIC_KEY_FALSE(ip4_min_ttl);
int do_ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                     unsigned int optlen);
int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                  unsigned int optlen);
int do_ip_getsockopt(struct sock *sk, int level, int optname,
                     sockptr_t optval, sockptr_t optlen);
int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
                  int __user *optlen);
int ip_ra_control(struct sock *sk, unsigned char on,
                  void (*destructor)(struct sock *));

int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len);
void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port,
                   u32 info, u8 *payload);
void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,
                    u32 info);

/*
 * Shorthand for ip_cmsg_recv_offset() using the socket attached to @skb
 * (skb->sk) and zero for both tlen and offset.
 */
static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
{
        ip_cmsg_recv_offset(msg, skb->sk, skb, 0, 0);
}

bool icmp_global_allow(void);
extern int sysctl_icmp_msgs_per_sec;
extern int sysctl_icmp_msgs_burst;

#ifdef CONFIG_PROC_FS
int ip_misc_proc_init(void);
#endif

int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family,
                                struct netlink_ext_ack *extack);

/*
 * Return true when @mtu is at least IPV4_MIN_MTU.  The likely() annotation
 * marks a valid MTU as the expected case.
 */
static inline bool inetdev_valid_mtu(unsigned int mtu)
{
        return likely(mtu >= IPV4_MIN_MTU);
}

void ip_sock_set_freebind(struct sock *sk);
int ip_sock_set_mtu_discover(struct sock *sk, int val);
void ip_sock_set_pktinfo(struct sock *sk);
void ip_sock_set_recverr(struct sock *sk);
void ip_sock_set_tos(struct sock *sk, int val);
void  __ip_sock_set_tos(struct sock *sk, int val);

#endif        /* _IP_H */














    1 



    1 



    1 
    1 

    1 





























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
#include <linux/bpf.h>
#include <linux/vmalloc.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/idr.h>
#include <linux/namei.h>
#include <linux/user_namespace.h>
#include <linux/security.h>

static bool bpf_ns_capable(struct user_namespace *ns, int cap)
{
        return ns_capable(ns, cap) || (cap != CAP_SYS_ADMIN && ns_capable(ns, CAP_SYS_ADMIN));
}

/*
 * Check whether the caller holds @cap, optionally through @token.
 *
 * A BPF token allows ns_capable() level of capabilities: with a token the
 * check is made against the token's user namespace and must additionally be
 * accepted by security_bpf_token_capable(); without one it is made against
 * init_user_ns.
 */
bool bpf_token_capable(const struct bpf_token *token, int cap)
{
        struct user_namespace *userns = &init_user_ns;

        if (token)
                userns = token->userns;

        if (!bpf_ns_capable(userns, cap))
                return false;

        return !token || security_bpf_token_capable(token, cap) >= 0;
}

/* Take an additional reference on @token (must not be NULL: it is dereferenced). */
void bpf_token_inc(struct bpf_token *token)
{
        atomic64_inc(&token->refcnt);
}

/*
 * Tear down @token: let the security hook release its state, drop the
 * reference on the token's user namespace, then free the token memory.
 * The reference count is not consulted here; callers decide when to free.
 */
static void bpf_token_free(struct bpf_token *token)
{
        security_bpf_token_free(token);
        put_user_ns(token->userns);
        kfree(token);
}

/*
 * Work item callback queued by bpf_token_put(): recover the token that
 * embeds @work and free it.
 */
static void bpf_token_put_deferred(struct work_struct *work)
{
        bpf_token_free(container_of(work, struct bpf_token, work));
}

/*
 * Drop one reference on @token (NULL is accepted and ignored).  When the
 * last reference goes away the actual free is not done inline; it is handed
 * to a work item that runs bpf_token_put_deferred().
 */
void bpf_token_put(struct bpf_token *token)
{
        if (token && atomic64_dec_and_test(&token->refcnt)) {
                INIT_WORK(&token->work, bpf_token_put_deferred);
                schedule_work(&token->work);
        }
}

/*
 * ->release handler of a token file: drop the reference stored in
 * filp->private_data.  Always reports success.
 */
static int bpf_token_release(struct inode *inode, struct file *filp)
{
        bpf_token_put(filp->private_data);

        return 0;
}

static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
{
        struct bpf_token *token = filp->private_data;
        u64 mask;

        BUILD_BUG_ON(__MAX_BPF_CMD >= 64);
        mask = BIT_ULL(__MAX_BPF_CMD) - 1;
        if ((token->allowed_cmds & mask) == mask)
                seq_printf(m, "allowed_cmds:\tany\n");
        else
                seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds);

        BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64);
        mask = BIT_ULL(__MAX_BPF_MAP_TYPE) - 1;
        if ((token->allowed_maps & mask) == mask)
                seq_printf(m, "allowed_maps:\tany\n");
        else
                seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps);

        BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64);
        mask = BIT_ULL(__MAX_BPF_PROG_TYPE) - 1;
        if ((token->allowed_progs & mask) == mask)
                seq_printf(m, "allowed_progs:\tany\n");
        else
                seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs);

        BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64);
        mask = BIT_ULL(__MAX_BPF_ATTACH_TYPE) - 1;
        if ((token->allowed_attachs & mask) == mask)
                seq_printf(m, "allowed_attachs:\tany\n");
        else
                seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs);
}

#define BPF_TOKEN_INODE_NAME "bpf-token"

/* Token inodes provide no inode operations of their own. */
static const struct inode_operations bpf_token_iops = { };

/*
 * File operations of a token fd.  The address of this table is also what
 * bpf_token_get_from_fd() compares f_op against to recognise a token file.
 */
static const struct file_operations bpf_token_fops = {
        .release        = bpf_token_release,
        .show_fdinfo        = bpf_token_show_fdinfo,
};

/*
 * Create a BPF token from a BPF FS instance and return a new fd for it.
 *
 * @attr: attr->token_create.bpffs_fd must be an fd referring to the root of
 *        a BPF FS mount.
 *
 * The token copies the delegate_{cmds,maps,progs,attachs} masks from the
 * mount options of that BPF FS instance and remembers its user namespace.
 *
 * Return: a new O_CLOEXEC file descriptor on success, or a negative errno:
 *   -EBADF       bpffs_fd is not a valid fd
 *   -EINVAL      the fd is not the root dentry of a BPF FS superblock
 *   -EPERM       caller is in a different user namespace than the BPF FS
 *                instance, or lacks CAP_BPF in it
 *   -EOPNOTSUPP  caller is in init_user_ns
 *   -ENOENT      the BPF FS instance has no delegation configured
 *   -ENOMEM      token allocation failed
 *   or whatever path_permission(), bpf_get_inode(), alloc_file_pseudo(),
 *   security_bpf_token_create() or get_unused_fd_flags() reported.
 */
int bpf_token_create(union bpf_attr *attr)
{
        struct bpf_mount_opts *mnt_opts;
        struct bpf_token *token = NULL;
        struct user_namespace *userns;
        struct inode *inode;
        struct file *file;
        struct path path;
        struct fd f;
        umode_t mode;
        int err, fd;

        f = fdget(attr->token_create.bpffs_fd);
        if (!f.file)
                return -EBADF;

        /* Keep our own reference on the path so the fd can be dropped now. */
        path = f.file->f_path;
        path_get(&path);
        fdput(f);

        /* Only the root of the mount is accepted ... */
        if (path.dentry != path.mnt->mnt_sb->s_root) {
                err = -EINVAL;
                goto out_path;
        }
        /* ... and the superblock must belong to BPF FS. */
        if (path.mnt->mnt_sb->s_op != &bpf_super_ops) {
                err = -EINVAL;
                goto out_path;
        }
        err = path_permission(&path, MAY_ACCESS);
        if (err)
                goto out_path;

        userns = path.dentry->d_sb->s_user_ns;
        /*
         * Enforce that creators of BPF tokens are in the same user
         * namespace as the BPF FS instance. This makes reasoning about
         * permissions a lot easier and we can always relax this later.
         */
        if (current_user_ns() != userns) {
                err = -EPERM;
                goto out_path;
        }
        if (!ns_capable(userns, CAP_BPF)) {
                err = -EPERM;
                goto out_path;
        }

        /* Creating BPF token in init_user_ns doesn't make much sense. */
        if (current_user_ns() == &init_user_ns) {
                err = -EOPNOTSUPP;
                goto out_path;
        }

        mnt_opts = path.dentry->d_sb->s_fs_info;
        if (mnt_opts->delegate_cmds == 0 &&
            mnt_opts->delegate_maps == 0 &&
            mnt_opts->delegate_progs == 0 &&
            mnt_opts->delegate_attachs == 0) {
                err = -ENOENT; /* no BPF token delegation is set up */
                goto out_path;
        }

        /* Regular file, owner read/write, filtered through the caller's umask. */
        mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
        inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto out_path;
        }

        inode->i_op = &bpf_token_iops;
        inode->i_fop = &bpf_token_fops;
        clear_nlink(inode); /* make sure it is unlinked */

        file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops);
        if (IS_ERR(file)) {
                /* This is the only path that drops the inode directly;
                 * later error paths go through fput(file) instead.
                 */
                iput(inode);
                err = PTR_ERR(file);
                goto out_path;
        }

        token = kzalloc(sizeof(*token), GFP_USER);
        if (!token) {
                err = -ENOMEM;
                goto out_file;
        }

        /* The initial reference is the one handed to the file below. */
        atomic64_set(&token->refcnt, 1);

        /* remember bpffs owning userns for future ns_capable() checks */
        token->userns = get_user_ns(userns);

        token->allowed_cmds = mnt_opts->delegate_cmds;
        token->allowed_maps = mnt_opts->delegate_maps;
        token->allowed_progs = mnt_opts->delegate_progs;
        token->allowed_attachs = mnt_opts->delegate_attachs;

        err = security_bpf_token_create(token, attr, &path);
        if (err)
                goto out_token;

        fd = get_unused_fd_flags(O_CLOEXEC);
        if (fd < 0) {
                err = fd;
                goto out_token;
        }

        /* Nothing can fail past this point: publish the token and the fd. */
        file->private_data = token;
        fd_install(fd, file);

        path_put(&path);
        return fd;

        /*
         * Error unwinding, in reverse order of acquisition.  The token is
         * freed directly here, before it was ever attached to the file.
         * NOTE(review): this relies on file->private_data still being NULL
         * when fput() ends up in bpf_token_release() - confirm that
         * alloc_file_pseudo() guarantees it.
         */
out_token:
        bpf_token_free(token);
out_file:
        fput(file);
out_path:
        path_put(&path);
        return err;
}

/*
 * Look up the BPF token behind file descriptor @ufd and take a reference
 * on it.
 *
 * Return: the token on success; ERR_PTR(-EBADF) when @ufd is not an open
 * fd; ERR_PTR(-EINVAL) when the file is not a token file (its f_op is not
 * bpf_token_fops).
 */
struct bpf_token *bpf_token_get_from_fd(u32 ufd)
{
        struct bpf_token *token;
        struct fd f;

        f = fdget(ufd);
        if (!f.file)
                return ERR_PTR(-EBADF);

        if (f.file->f_op == &bpf_token_fops) {
                token = f.file->private_data;
                /* Grab our reference before the fd reference is dropped. */
                bpf_token_inc(token);
        } else {
                token = ERR_PTR(-EINVAL);
        }

        fdput(f);
        return token;
}

bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
{
        if (!token)
                return false;
        if (!(token->allowed_cmds & BIT_ULL(cmd)))
                return false;
        return security_bpf_token_cmd(token, cmd) == 0;
}

/*
 * Check whether @token delegates creation of maps of @type.
 *
 * Return: false for a NULL token or a type at/above __MAX_BPF_MAP_TYPE;
 * otherwise whether the matching bit is set in token->allowed_maps.
 */
bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type)
{
        if (!token)
                return false;
        if (type >= __MAX_BPF_MAP_TYPE)
                return false;

        return (token->allowed_maps & BIT_ULL(type)) != 0;
}

/*
 * Check whether @token delegates loading programs of @prog_type with
 * @attach_type.
 *
 * Return: false for a NULL token or when either value is at/above its
 * __MAX_BPF_* limit; otherwise true only when both the program-type bit in
 * token->allowed_progs and the attach-type bit in token->allowed_attachs
 * are set.
 */
bool bpf_token_allow_prog_type(const struct bpf_token *token,
                               enum bpf_prog_type prog_type,
                               enum bpf_attach_type attach_type)
{
        if (!token)
                return false;
        if (prog_type >= __MAX_BPF_PROG_TYPE)
                return false;
        if (attach_type >= __MAX_BPF_ATTACH_TYPE)
                return false;

        if (!(token->allowed_progs & BIT_ULL(prog_type)))
                return false;

        return (token->allowed_attachs & BIT_ULL(attach_type)) != 0;
}





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
// SPDX-License-Identifier: GPL-2.0

// Generated by scripts/atomic/gen-atomic-long.sh
// DO NOT MODIFY THIS FILE DIRECTLY

#ifndef _LINUX_ATOMIC_LONG_H
#define _LINUX_ATOMIC_LONG_H

#include <linux/compiler.h>
#include <asm/types.h>

/*
 * atomic_long_t is an alias selected at build time: atomic64_t when
 * CONFIG_64BIT is defined, atomic_t otherwise.  The init and cond_read
 * helpers are mapped to the matching family in the same way.
 */
#ifdef CONFIG_64BIT
typedef atomic64_t atomic_long_t;
#define ATOMIC_LONG_INIT(i)                ATOMIC64_INIT(i)
#define atomic_long_cond_read_acquire        atomic64_cond_read_acquire
#define atomic_long_cond_read_relaxed        atomic64_cond_read_relaxed
#else
typedef atomic_t atomic_long_t;
#define ATOMIC_LONG_INIT(i)                ATOMIC_INIT(i)
#define atomic_long_cond_read_acquire        atomic_cond_read_acquire
#define atomic_long_cond_read_relaxed        atomic_cond_read_relaxed
#endif

/**
 * raw_atomic_long_read() - atomic load with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically loads the value of @v with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic_long_read() elsewhere.
 *
 * Return: The value loaded from @v.
 */
static __always_inline long
raw_atomic_long_read(const atomic_long_t *v)
{
#ifdef CONFIG_64BIT
        /* atomic_long_t is atomic64_t here: forward to the atomic64 op. */
        return raw_atomic64_read(v);
#else
        /* atomic_long_t is atomic_t here: forward to the atomic_t op. */
        return raw_atomic_read(v);
#endif
}

/**
 * raw_atomic_long_read_acquire() - atomic load with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically loads the value of @v with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic_long_read_acquire() elsewhere.
 *
 * Return: The value loaded from @v.
 */
static __always_inline long
raw_atomic_long_read_acquire(const atomic_long_t *v)
{
#ifdef CONFIG_64BIT
        /* atomic_long_t is atomic64_t here: forward to the atomic64 op. */
        return raw_atomic64_read_acquire(v);
#else
        /* atomic_long_t is atomic_t here: forward to the atomic_t op. */
        return raw_atomic_read_acquire(v);
#endif
}

/**
 * raw_atomic_long_set() - atomic set with relaxed ordering
 * @v: pointer to atomic_long_t
 * @i: long value to assign
 *
 * Atomically sets @v to @i with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic_long_set() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_set(atomic_long_t *v, long i)
{
#ifdef CONFIG_64BIT
        /* atomic_long_t is atomic64_t here: forward to the atomic64 op. */
        raw_atomic64_set(v, i);
#else
        /* atomic_long_t is atomic_t here: forward to the atomic_t op. */
        raw_atomic_set(v, i);
#endif
}

/**
 * raw_atomic_long_set_release() - atomic set with release ordering
 * @v: pointer to atomic_long_t
 * @i: long value to assign
 *
 * Atomically sets @v to @i with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic_long_set_release() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_set_release(atomic_long_t *v, long i)
{
#ifdef CONFIG_64BIT
        /* atomic_long_t is atomic64_t here: forward to the atomic64 op. */
        raw_atomic64_set_release(v, i);
#else
        /* atomic_long_t is atomic_t here: forward to the atomic_t op. */
        raw_atomic_set_release(v, i);
#endif
}

/**
 * raw_atomic_long_add() - atomic add with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic_long_add() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_add(long i, atomic_long_t *v)
{
#ifdef CONFIG_64BIT
        /* atomic_long_t is atomic64_t here: forward to the atomic64 op. */
        raw_atomic64_add(i, v);
#else
        /* atomic_long_t is atomic_t here: forward to the atomic_t op. */
        raw_atomic_add(i, v);
#endif
}

/**
 * raw_atomic_long_add_return() - add and return the new value, full ordering
 * @i: long amount to add
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + @i). The operation is fully ordered.
 *
 * Forwards to raw_atomic64_add_return() when CONFIG_64BIT is defined and to
 * raw_atomic_add_return() otherwise.
 *
 * Callable from noinstr code; use atomic_long_add_return() everywhere else.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_add_return(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_add_return(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_add_return(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_add_return_acquire() - add and return the new value, acquire ordering
 * @i: long amount to add
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + @i). The operation has acquire ordering.
 *
 * Forwards to raw_atomic64_add_return_acquire() when CONFIG_64BIT is defined
 * and to raw_atomic_add_return_acquire() otherwise.
 *
 * Callable from noinstr code; use atomic_long_add_return_acquire() everywhere
 * else.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_add_return_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_add_return_acquire(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_add_return_acquire(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_add_return_release() - add and return the new value, release ordering
 * @i: long amount to add
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + @i). The operation has release ordering.
 *
 * Forwards to raw_atomic64_add_return_release() when CONFIG_64BIT is defined
 * and to raw_atomic_add_return_release() otherwise.
 *
 * Callable from noinstr code; use atomic_long_add_return_release() everywhere
 * else.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_add_return_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_add_return_release(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_add_return_release(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_add_return_relaxed() - add and return the new value, relaxed ordering
 * @i: long amount to add
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + @i). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_add_return_relaxed() when CONFIG_64BIT is defined
 * and to raw_atomic_add_return_relaxed() otherwise.
 *
 * Callable from noinstr code; use atomic_long_add_return_relaxed() everywhere
 * else.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_add_return_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_add_return_relaxed(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_add_return_relaxed(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_add() - add and return the old value, full ordering
 * @i: long amount to add
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + @i). The operation is fully ordered.
 *
 * Forwards to raw_atomic64_fetch_add() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_add() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_add() everywhere else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_add(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_add(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_add(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_add_acquire() - add and return the old value, acquire ordering
 * @i: long amount to add
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + @i). The operation has acquire ordering.
 *
 * Forwards to raw_atomic64_fetch_add_acquire() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_add_acquire() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_add_acquire() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_add_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_add_acquire(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_add_acquire(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_add_release() - add and return the old value, release ordering
 * @i: long amount to add
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + @i). The operation has release ordering.
 *
 * Forwards to raw_atomic64_fetch_add_release() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_add_release() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_add_release() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_add_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_add_release(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_add_release(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_add_relaxed() - add and return the old value, relaxed ordering
 * @i: long amount to add
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + @i). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_fetch_add_relaxed() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_add_relaxed() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_add_relaxed() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_add_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_add_relaxed(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_add_relaxed(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_sub() - subtract from an atomic_long_t, relaxed ordering
 * @i: long amount to subtract
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - @i). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_sub() when CONFIG_64BIT is defined and to
 * raw_atomic_sub() otherwise.
 *
 * Callable from noinstr code; use atomic_long_sub() everywhere else.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_sub(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        raw_atomic_sub(i, v);
#else /* CONFIG_64BIT */
        raw_atomic64_sub(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_sub_return() - subtract and return the new value, full ordering
 * @i: long amount to subtract
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - @i). The operation is fully ordered.
 *
 * Forwards to raw_atomic64_sub_return() when CONFIG_64BIT is defined and to
 * raw_atomic_sub_return() otherwise.
 *
 * Callable from noinstr code; use atomic_long_sub_return() everywhere else.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_sub_return(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_sub_return(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_sub_return(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_sub_return_acquire() - subtract and return the new value, acquire ordering
 * @i: long amount to subtract
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - @i). The operation has acquire ordering.
 *
 * Forwards to raw_atomic64_sub_return_acquire() when CONFIG_64BIT is defined
 * and to raw_atomic_sub_return_acquire() otherwise.
 *
 * Callable from noinstr code; use atomic_long_sub_return_acquire() everywhere
 * else.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_sub_return_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_sub_return_acquire(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_sub_return_acquire(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_sub_return_release() - subtract and return the new value, release ordering
 * @i: long amount to subtract
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - @i). The operation has release ordering.
 *
 * Forwards to raw_atomic64_sub_return_release() when CONFIG_64BIT is defined
 * and to raw_atomic_sub_return_release() otherwise.
 *
 * Callable from noinstr code; use atomic_long_sub_return_release() everywhere
 * else.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_sub_return_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_sub_return_release(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_sub_return_release(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_sub_return_relaxed() - subtract and return the new value, relaxed ordering
 * @i: long amount to subtract
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - @i). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_sub_return_relaxed() when CONFIG_64BIT is defined
 * and to raw_atomic_sub_return_relaxed() otherwise.
 *
 * Callable from noinstr code; use atomic_long_sub_return_relaxed() everywhere
 * else.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_sub_return_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_sub_return_relaxed(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_sub_return_relaxed(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_sub() - subtract and return the old value, full ordering
 * @i: long amount to subtract
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - @i). The operation is fully ordered.
 *
 * Forwards to raw_atomic64_fetch_sub() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_sub() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_sub() everywhere else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_sub(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_sub(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_sub(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_sub_acquire() - subtract and return the old value, acquire ordering
 * @i: long amount to subtract
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - @i). The operation has acquire ordering.
 *
 * Forwards to raw_atomic64_fetch_sub_acquire() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_sub_acquire() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_sub_acquire() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_sub_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_sub_acquire(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_sub_acquire(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_sub_release() - subtract and return the old value, release ordering
 * @i: long amount to subtract
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - @i). The operation has release ordering.
 *
 * Forwards to raw_atomic64_fetch_sub_release() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_sub_release() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_sub_release() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_sub_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_sub_release(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_sub_release(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_sub_relaxed() - subtract and return the old value, relaxed ordering
 * @i: long amount to subtract
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - @i). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_fetch_sub_relaxed() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_sub_relaxed() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_sub_relaxed() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_sub_relaxed(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_sub_relaxed(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_inc() - increment an atomic_long_t, relaxed ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + 1). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_inc() when CONFIG_64BIT is defined and to
 * raw_atomic_inc() otherwise.
 *
 * Callable from noinstr code; use atomic_long_inc() everywhere else.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_inc(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        raw_atomic_inc(v);
#else /* CONFIG_64BIT */
        raw_atomic64_inc(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_inc_return() - increment and return the new value, full ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + 1). The operation is fully ordered.
 *
 * Forwards to raw_atomic64_inc_return() when CONFIG_64BIT is defined and to
 * raw_atomic_inc_return() otherwise.
 *
 * Callable from noinstr code; use atomic_long_inc_return() everywhere else.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_inc_return(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_inc_return(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_inc_return(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_inc_return_acquire() - increment and return the new value, acquire ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + 1). The operation has acquire ordering.
 *
 * Forwards to raw_atomic64_inc_return_acquire() when CONFIG_64BIT is defined
 * and to raw_atomic_inc_return_acquire() otherwise.
 *
 * Callable from noinstr code; use atomic_long_inc_return_acquire() everywhere
 * else.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_inc_return_acquire(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_inc_return_acquire(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_inc_return_acquire(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_inc_return_release() - increment and return the new value, release ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + 1). The operation has release ordering.
 *
 * Forwards to raw_atomic64_inc_return_release() when CONFIG_64BIT is defined
 * and to raw_atomic_inc_return_release() otherwise.
 *
 * Callable from noinstr code; use atomic_long_inc_return_release() everywhere
 * else.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_inc_return_release(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_inc_return_release(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_inc_return_release(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_inc_return_relaxed() - increment and return the new value, relaxed ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + 1). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_inc_return_relaxed() when CONFIG_64BIT is defined
 * and to raw_atomic_inc_return_relaxed() otherwise.
 *
 * Callable from noinstr code; use atomic_long_inc_return_relaxed() everywhere
 * else.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_inc_return_relaxed(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_inc_return_relaxed(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_inc_return_relaxed(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_inc() - increment and return the old value, full ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + 1). The operation is fully ordered.
 *
 * Forwards to raw_atomic64_fetch_inc() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_inc() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_inc() everywhere else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_inc(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_inc(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_inc(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_inc_acquire() - increment and return the old value, acquire ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + 1). The operation has acquire ordering.
 *
 * Forwards to raw_atomic64_fetch_inc_acquire() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_inc_acquire() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_inc_acquire() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_inc_acquire(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_inc_acquire(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_inc_acquire(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_inc_release() - increment and return the old value, release ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + 1). The operation has release ordering.
 *
 * Forwards to raw_atomic64_fetch_inc_release() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_inc_release() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_inc_release() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_inc_release(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_inc_release(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_inc_release(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_inc_relaxed() - increment and return the old value, relaxed ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v + 1). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_fetch_inc_relaxed() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_inc_relaxed() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_inc_relaxed() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_inc_relaxed(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_inc_relaxed(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_inc_relaxed(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_dec() - decrement an atomic_long_t, relaxed ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - 1). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_dec() when CONFIG_64BIT is defined and to
 * raw_atomic_dec() otherwise.
 *
 * Callable from noinstr code; use atomic_long_dec() everywhere else.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_dec(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        raw_atomic_dec(v);
#else /* CONFIG_64BIT */
        raw_atomic64_dec(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_dec_return() - decrement and return the new value, full ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - 1). The operation is fully ordered.
 *
 * Forwards to raw_atomic64_dec_return() when CONFIG_64BIT is defined and to
 * raw_atomic_dec_return() otherwise.
 *
 * Callable from noinstr code; use atomic_long_dec_return() everywhere else.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_dec_return(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_dec_return(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_dec_return(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_dec_return_acquire() - decrement and return the new value, acquire ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - 1). The operation has acquire ordering.
 *
 * Forwards to raw_atomic64_dec_return_acquire() when CONFIG_64BIT is defined
 * and to raw_atomic_dec_return_acquire() otherwise.
 *
 * Callable from noinstr code; use atomic_long_dec_return_acquire() everywhere
 * else.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_dec_return_acquire(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_dec_return_acquire(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_dec_return_acquire(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_dec_return_release() - decrement and return the new value, release ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - 1). The operation has release ordering.
 *
 * Forwards to raw_atomic64_dec_return_release() when CONFIG_64BIT is defined
 * and to raw_atomic_dec_return_release() otherwise.
 *
 * Callable from noinstr code; use atomic_long_dec_return_release() everywhere
 * else.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_dec_return_release(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_dec_return_release(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_dec_return_release(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_dec_return_relaxed() - decrement and return the new value, relaxed ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - 1). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_dec_return_relaxed() when CONFIG_64BIT is defined
 * and to raw_atomic_dec_return_relaxed() otherwise.
 *
 * Callable from noinstr code; use atomic_long_dec_return_relaxed() everywhere
 * else.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_dec_return_relaxed(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_dec_return_relaxed(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_dec_return_relaxed(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_dec() - decrement and return the old value, full ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - 1). The operation is fully ordered.
 *
 * Forwards to raw_atomic64_fetch_dec() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_dec() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_dec() everywhere else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_dec(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_dec(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_dec(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_dec_acquire() - decrement and return the old value, acquire ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - 1). The operation has acquire ordering.
 *
 * Forwards to raw_atomic64_fetch_dec_acquire() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_dec_acquire() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_dec_acquire() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_dec_acquire(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_dec_acquire(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_dec_acquire(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_dec_release() - decrement and return the old value, release ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - 1). The operation has release ordering.
 *
 * Forwards to raw_atomic64_fetch_dec_release() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_dec_release() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_dec_release() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_dec_release(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_dec_release(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_dec_release(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_dec_relaxed() - decrement and return the old value, relaxed ordering
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v - 1). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_fetch_dec_relaxed() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_dec_relaxed() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_dec_relaxed() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_dec_relaxed(atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_dec_relaxed(v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_dec_relaxed(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_and() - bitwise AND into an atomic_long_t, relaxed ordering
 * @i: long mask to AND with
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v & @i). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_and() when CONFIG_64BIT is defined and to
 * raw_atomic_and() otherwise.
 *
 * Callable from noinstr code; use atomic_long_and() everywhere else.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_and(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        raw_atomic_and(i, v);
#else /* CONFIG_64BIT */
        raw_atomic64_and(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_and() - bitwise AND and return the old value, full ordering
 * @i: long mask to AND with
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v & @i). The operation is fully ordered.
 *
 * Forwards to raw_atomic64_fetch_and() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_and() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_and() everywhere else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_and(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_and(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_and(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_and_acquire() - bitwise AND and return the old value, acquire ordering
 * @i: long mask to AND with
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v & @i). The operation has acquire ordering.
 *
 * Forwards to raw_atomic64_fetch_and_acquire() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_and_acquire() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_and_acquire() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_and_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_and_acquire(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_and_acquire(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_and_release() - bitwise AND and return the old value, release ordering
 * @i: long mask to AND with
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v & @i). The operation has release ordering.
 *
 * Forwards to raw_atomic64_fetch_and_release() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_and_release() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_and_release() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_and_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_and_release(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_and_release(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_and_relaxed() - bitwise AND and return the old value, relaxed ordering
 * @i: long mask to AND with
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v & @i). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_fetch_and_relaxed() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_and_relaxed() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_and_relaxed() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_and_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_and_relaxed(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_and_relaxed(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_andnot() - bitwise AND NOT into an atomic_long_t, relaxed ordering
 * @i: long mask whose complement is ANDed in
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v & ~@i). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_andnot() when CONFIG_64BIT is defined and to
 * raw_atomic_andnot() otherwise.
 *
 * Callable from noinstr code; use atomic_long_andnot() everywhere else.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_andnot(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        raw_atomic_andnot(i, v);
#else /* CONFIG_64BIT */
        raw_atomic64_andnot(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_andnot() - bitwise AND NOT and return the old value, full ordering
 * @i: long mask whose complement is ANDed in
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v & ~@i). The operation is fully ordered.
 *
 * Forwards to raw_atomic64_fetch_andnot() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_andnot() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_andnot() everywhere else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_andnot(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_andnot(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_andnot(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_andnot_acquire() - bitwise AND NOT and return the old value, acquire ordering
 * @i: long mask whose complement is ANDed in
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v & ~@i). The operation has acquire ordering.
 *
 * Forwards to raw_atomic64_fetch_andnot_acquire() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_andnot_acquire() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_andnot_acquire() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_andnot_acquire(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_andnot_acquire(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_andnot_release() - bitwise AND NOT and return the old value, release ordering
 * @i: long mask whose complement is ANDed in
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v & ~@i). The operation has release ordering.
 *
 * Forwards to raw_atomic64_fetch_andnot_release() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_andnot_release() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_andnot_release() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_andnot_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_andnot_release(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_andnot_release(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_andnot_relaxed() - bitwise AND NOT and return the old value, relaxed ordering
 * @i: long mask whose complement is ANDed in
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v & ~@i). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_fetch_andnot_relaxed() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_andnot_relaxed() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_andnot_relaxed() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_andnot_relaxed(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_andnot_relaxed(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_or() - bitwise OR into an atomic_long_t, relaxed ordering
 * @i: long bits to OR in
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v | @i). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_or() when CONFIG_64BIT is defined and to
 * raw_atomic_or() otherwise.
 *
 * Callable from noinstr code; use atomic_long_or() everywhere else.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_or(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        raw_atomic_or(i, v);
#else /* CONFIG_64BIT */
        raw_atomic64_or(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_or() - bitwise OR and return the old value, full ordering
 * @i: long bits to OR in
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v | @i). The operation is fully ordered.
 *
 * Forwards to raw_atomic64_fetch_or() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_or() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_or() everywhere else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_or(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_or(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_or(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_or_acquire() - bitwise OR and return the old value, acquire ordering
 * @i: long bits to OR in
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v | @i). The operation has acquire ordering.
 *
 * Forwards to raw_atomic64_fetch_or_acquire() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_or_acquire() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_or_acquire() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_or_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_or_acquire(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_or_acquire(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_or_release() - bitwise OR and return the old value, release ordering
 * @i: long bits to OR in
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v | @i). The operation has release ordering.
 *
 * Forwards to raw_atomic64_fetch_or_release() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_or_release() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_or_release() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_or_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_or_release(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_or_release(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_or_relaxed() - bitwise OR and return the old value, relaxed ordering
 * @i: long bits to OR in
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v | @i). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_fetch_or_relaxed() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_or_relaxed() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_or_relaxed() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_or_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_or_relaxed(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_or_relaxed(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_xor() - bitwise XOR into an atomic_long_t, relaxed ordering
 * @i: long bits to XOR in
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v ^ @i). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_xor() when CONFIG_64BIT is defined and to
 * raw_atomic_xor() otherwise.
 *
 * Callable from noinstr code; use atomic_long_xor() everywhere else.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_xor(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        raw_atomic_xor(i, v);
#else /* CONFIG_64BIT */
        raw_atomic64_xor(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_xor() - bitwise XOR and return the old value, full ordering
 * @i: long bits to XOR in
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v ^ @i). The operation is fully ordered.
 *
 * Forwards to raw_atomic64_fetch_xor() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_xor() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_xor() everywhere else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_xor(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_xor(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_xor(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_xor_acquire() - bitwise XOR and return the old value, acquire ordering
 * @i: long bits to XOR in
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v ^ @i). The operation has acquire ordering.
 *
 * Forwards to raw_atomic64_fetch_xor_acquire() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_xor_acquire() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_xor_acquire() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_xor_acquire(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_xor_acquire(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_xor_acquire(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_xor_release() - bitwise XOR and return the old value, release ordering
 * @i: long bits to XOR in
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v ^ @i). The operation has release ordering.
 *
 * Forwards to raw_atomic64_fetch_xor_release() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_xor_release() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_xor_release() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_xor_release(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_xor_release(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_xor_release(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_xor_relaxed() - bitwise XOR and return the old value, relaxed ordering
 * @i: long bits to XOR in
 * @v: atomic_long_t to update
 *
 * Atomically replaces @v with (@v ^ @i). Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_fetch_xor_relaxed() when CONFIG_64BIT is defined
 * and to raw_atomic_fetch_xor_relaxed() otherwise.
 *
 * Callable from noinstr code; use atomic_long_fetch_xor_relaxed() everywhere
 * else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v)
{
#ifndef CONFIG_64BIT
        return raw_atomic_fetch_xor_relaxed(i, v);
#else /* CONFIG_64BIT */
        return raw_atomic64_fetch_xor_relaxed(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_xchg() - swap in a new value, full ordering
 * @v: atomic_long_t to update
 * @new: long value to store
 *
 * Atomically replaces @v with @new. The operation is fully ordered.
 *
 * Forwards to raw_atomic64_xchg() when CONFIG_64BIT is defined and to
 * raw_atomic_xchg() otherwise.
 *
 * Callable from noinstr code; use atomic_long_xchg() everywhere else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_xchg(atomic_long_t *v, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_xchg(v, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_xchg(v, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_xchg_acquire() - swap in a new value, acquire ordering
 * @v: atomic_long_t to update
 * @new: long value to store
 *
 * Atomically replaces @v with @new. The operation has acquire ordering.
 *
 * Forwards to raw_atomic64_xchg_acquire() when CONFIG_64BIT is defined and to
 * raw_atomic_xchg_acquire() otherwise.
 *
 * Callable from noinstr code; use atomic_long_xchg_acquire() everywhere else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_xchg_acquire(atomic_long_t *v, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_xchg_acquire(v, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_xchg_acquire(v, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_xchg_release() - swap in a new value, release ordering
 * @v: atomic_long_t to update
 * @new: long value to store
 *
 * Atomically replaces @v with @new. The operation has release ordering.
 *
 * Forwards to raw_atomic64_xchg_release() when CONFIG_64BIT is defined and to
 * raw_atomic_xchg_release() otherwise.
 *
 * Callable from noinstr code; use atomic_long_xchg_release() everywhere else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_xchg_release(atomic_long_t *v, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_xchg_release(v, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_xchg_release(v, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_xchg_relaxed() - swap in a new value, relaxed ordering
 * @v: atomic_long_t to update
 * @new: long value to store
 *
 * Atomically replaces @v with @new. Only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_xchg_relaxed() when CONFIG_64BIT is defined and to
 * raw_atomic_xchg_relaxed() otherwise.
 *
 * Callable from noinstr code; use atomic_long_xchg_relaxed() everywhere else.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
raw_atomic_long_xchg_relaxed(atomic_long_t *v, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_xchg_relaxed(v, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_xchg_relaxed(v, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_cmpxchg() - conditionally swap in a new value, full ordering
 * @v: atomic_long_t to update
 * @old: long value @v is expected to hold
 * @new: long value to store on a match
 *
 * When (@v == @old), atomically replaces @v with @new and the operation is
 * fully ordered. When they differ, @v is left untouched and only relaxed
 * ordering is provided.
 *
 * Forwards to raw_atomic64_cmpxchg() when CONFIG_64BIT is defined and to
 * raw_atomic_cmpxchg() otherwise.
 *
 * Callable from noinstr code; use atomic_long_cmpxchg() everywhere else.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long
raw_atomic_long_cmpxchg(atomic_long_t *v, long old, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_cmpxchg(v, old, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_cmpxchg(v, old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_cmpxchg_acquire() - conditionally swap in a new value, acquire ordering
 * @v: atomic_long_t to update
 * @old: long value @v is expected to hold
 * @new: long value to store on a match
 *
 * When (@v == @old), atomically replaces @v with @new and the operation has
 * acquire ordering. When they differ, @v is left untouched and only relaxed
 * ordering is provided.
 *
 * Forwards to raw_atomic64_cmpxchg_acquire() when CONFIG_64BIT is defined and
 * to raw_atomic_cmpxchg_acquire() otherwise.
 *
 * Callable from noinstr code; use atomic_long_cmpxchg_acquire() everywhere
 * else.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long
raw_atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new)
{
#ifndef CONFIG_64BIT
        return raw_atomic_cmpxchg_acquire(v, old, new);
#else /* CONFIG_64BIT */
        return raw_atomic64_cmpxchg_acquire(v, old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 * The work is forwarded to raw_atomic64_cmpxchg_release() when CONFIG_64BIT
 * is set and to raw_atomic_cmpxchg_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_cmpxchg_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new)
{
        long prev;

#ifdef CONFIG_64BIT
        prev = raw_atomic64_cmpxchg_release(v, old, new);
#else
        prev = raw_atomic_cmpxchg_release(v, old, new);
#endif
        return prev;
}

/**
 * raw_atomic_long_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 * The work is forwarded to raw_atomic64_cmpxchg_relaxed() when CONFIG_64BIT
 * is set and to raw_atomic_cmpxchg_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_cmpxchg_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new)
{
        long prev;

#ifdef CONFIG_64BIT
        prev = raw_atomic64_cmpxchg_relaxed(v, old, new);
#else
        prev = raw_atomic_cmpxchg_relaxed(v, old, new);
#endif
        return prev;
}

/**
 * raw_atomic_long_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with full ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 * The work is forwarded to raw_atomic64_try_cmpxchg() when CONFIG_64BIT is
 * set and to raw_atomic_try_cmpxchg() otherwise; @old is cast to the pointer
 * type the selected op expects.
 *
 * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new)
{
        bool swapped;

#ifdef CONFIG_64BIT
        swapped = raw_atomic64_try_cmpxchg(v, (s64 *)old, new);
#else
        swapped = raw_atomic_try_cmpxchg(v, (int *)old, new);
#endif
        return swapped;
}

/**
 * raw_atomic_long_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 * The work is forwarded to raw_atomic64_try_cmpxchg_acquire() when
 * CONFIG_64BIT is set and to raw_atomic_try_cmpxchg_acquire() otherwise;
 * @old is cast to the pointer type the selected op expects.
 *
 * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_acquire() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new)
{
        bool swapped;

#ifdef CONFIG_64BIT
        swapped = raw_atomic64_try_cmpxchg_acquire(v, (s64 *)old, new);
#else
        swapped = raw_atomic_try_cmpxchg_acquire(v, (int *)old, new);
#endif
        return swapped;
}

/**
 * raw_atomic_long_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 * The work is forwarded to raw_atomic64_try_cmpxchg_release() when
 * CONFIG_64BIT is set and to raw_atomic_try_cmpxchg_release() otherwise;
 * @old is cast to the pointer type the selected op expects.
 *
 * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_release() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new)
{
        bool swapped;

#ifdef CONFIG_64BIT
        swapped = raw_atomic64_try_cmpxchg_release(v, (s64 *)old, new);
#else
        swapped = raw_atomic_try_cmpxchg_release(v, (int *)old, new);
#endif
        return swapped;
}

/**
 * raw_atomic_long_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 * The work is forwarded to raw_atomic64_try_cmpxchg_relaxed() when
 * CONFIG_64BIT is set and to raw_atomic_try_cmpxchg_relaxed() otherwise;
 * @old is cast to the pointer type the selected op expects.
 *
 * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_relaxed() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new)
{
        bool swapped;

#ifdef CONFIG_64BIT
        swapped = raw_atomic64_try_cmpxchg_relaxed(v, (s64 *)old, new);
#else
        swapped = raw_atomic_try_cmpxchg_relaxed(v, (int *)old, new);
#endif
        return swapped;
}

/**
 * raw_atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with full ordering. The work is
 * forwarded to raw_atomic64_sub_and_test() when CONFIG_64BIT is set and to
 * raw_atomic_sub_and_test() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_sub_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_sub_and_test(long i, atomic_long_t *v)
{
        bool is_zero;

#ifdef CONFIG_64BIT
        is_zero = raw_atomic64_sub_and_test(i, v);
#else
        is_zero = raw_atomic_sub_and_test(i, v);
#endif
        return is_zero;
}

/**
 * raw_atomic_long_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with full ordering. The work is
 * forwarded to raw_atomic64_dec_and_test() when CONFIG_64BIT is set and to
 * raw_atomic_dec_and_test() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_dec_and_test(atomic_long_t *v)
{
        bool is_zero;

#ifdef CONFIG_64BIT
        is_zero = raw_atomic64_dec_and_test(v);
#else
        is_zero = raw_atomic_dec_and_test(v);
#endif
        return is_zero;
}

/**
 * raw_atomic_long_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with full ordering. The work is
 * forwarded to raw_atomic64_inc_and_test() when CONFIG_64BIT is set and to
 * raw_atomic_inc_and_test() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_inc_and_test(atomic_long_t *v)
{
        bool is_zero;

#ifdef CONFIG_64BIT
        is_zero = raw_atomic64_inc_and_test(v);
#else
        is_zero = raw_atomic_inc_and_test(v);
#endif
        return is_zero;
}

/**
 * raw_atomic_long_add_negative() - atomic add and test if negative with full ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with full ordering. The work is
 * forwarded to raw_atomic64_add_negative() when CONFIG_64BIT is set and to
 * raw_atomic_add_negative() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_negative() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_negative(long i, atomic_long_t *v)
{
        bool is_negative;

#ifdef CONFIG_64BIT
        is_negative = raw_atomic64_add_negative(i, v);
#else
        is_negative = raw_atomic_add_negative(i, v);
#endif
        return is_negative;
}

/**
 * raw_atomic_long_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering. The work is
 * forwarded to raw_atomic64_add_negative_acquire() when CONFIG_64BIT is set
 * and to raw_atomic_add_negative_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_negative_acquire() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_negative_acquire(long i, atomic_long_t *v)
{
        bool is_negative;

#ifdef CONFIG_64BIT
        is_negative = raw_atomic64_add_negative_acquire(i, v);
#else
        is_negative = raw_atomic_add_negative_acquire(i, v);
#endif
        return is_negative;
}

/**
 * raw_atomic_long_add_negative_release() - atomic add and test if negative with release ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with release ordering. The work is
 * forwarded to raw_atomic64_add_negative_release() when CONFIG_64BIT is set
 * and to raw_atomic_add_negative_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_negative_release() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_negative_release(long i, atomic_long_t *v)
{
        bool is_negative;

#ifdef CONFIG_64BIT
        is_negative = raw_atomic64_add_negative_release(i, v);
#else
        is_negative = raw_atomic_add_negative_release(i, v);
#endif
        return is_negative;
}

/**
 * raw_atomic_long_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering. The work is
 * forwarded to raw_atomic64_add_negative_relaxed() when CONFIG_64BIT is set
 * and to raw_atomic_add_negative_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_negative_relaxed() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
{
        bool is_negative;

#ifdef CONFIG_64BIT
        is_negative = raw_atomic64_add_negative_relaxed(i, v);
#else
        is_negative = raw_atomic_add_negative_relaxed(i, v);
#endif
        return is_negative;
}

/**
 * raw_atomic_long_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_long_t
 * @a: long value to add
 * @u: long value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 * The work is forwarded to raw_atomic64_fetch_add_unless() when CONFIG_64BIT
 * is set and to raw_atomic_fetch_add_unless() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_add_unless() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
{
        long prev;

#ifdef CONFIG_64BIT
        prev = raw_atomic64_fetch_add_unless(v, a, u);
#else
        prev = raw_atomic_fetch_add_unless(v, a, u);
#endif
        return prev;
}

/**
 * raw_atomic_long_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_long_t
 * @a: long value to add
 * @u: long value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 * The work is forwarded to raw_atomic64_add_unless() when CONFIG_64BIT is
 * set and to raw_atomic_add_unless() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_unless() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_unless(atomic_long_t *v, long a, long u)
{
        bool updated;

#ifdef CONFIG_64BIT
        updated = raw_atomic64_add_unless(v, a, u);
#else
        updated = raw_atomic_add_unless(v, a, u);
#endif
        return updated;
}

/**
 * raw_atomic_long_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 * The work is forwarded to raw_atomic64_inc_not_zero() when CONFIG_64BIT is
 * set and to raw_atomic_inc_not_zero() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_not_zero() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_inc_not_zero(atomic_long_t *v)
{
        bool updated;

#ifdef CONFIG_64BIT
        updated = raw_atomic64_inc_not_zero(v);
#else
        updated = raw_atomic_inc_not_zero(v);
#endif
        return updated;
}

/**
 * raw_atomic_long_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 * The work is forwarded to raw_atomic64_inc_unless_negative() when
 * CONFIG_64BIT is set and to raw_atomic_inc_unless_negative() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_unless_negative() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_inc_unless_negative(atomic_long_t *v)
{
        bool updated;

#ifdef CONFIG_64BIT
        updated = raw_atomic64_inc_unless_negative(v);
#else
        updated = raw_atomic_inc_unless_negative(v);
#endif
        return updated;
}

/**
 * raw_atomic_long_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 * The work is forwarded to raw_atomic64_dec_unless_positive() when
 * CONFIG_64BIT is set and to raw_atomic_dec_unless_positive() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_unless_positive() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_dec_unless_positive(atomic_long_t *v)
{
        bool updated;

#ifdef CONFIG_64BIT
        updated = raw_atomic64_dec_unless_positive(v);
#else
        updated = raw_atomic_dec_unless_positive(v);
#endif
        return updated;
}

/**
 * raw_atomic_long_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v > 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 * The work is forwarded to raw_atomic64_dec_if_positive() when CONFIG_64BIT
 * is set and to raw_atomic_dec_if_positive() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_if_positive() elsewhere.
 *
 * Return: The old value of (@v - 1), regardless of whether @v was updated.
 */
static __always_inline long
raw_atomic_long_dec_if_positive(atomic_long_t *v)
{
        long dec;

#ifdef CONFIG_64BIT
        dec = raw_atomic64_dec_if_positive(v);
#else
        dec = raw_atomic_dec_if_positive(v);
#endif
        return dec;
}

#endif /* _LINUX_ATOMIC_LONG_H */
// eadf183c3600b8b92b91839dd3be6bcc560c752d


























    1 











    1 



















































    1 
















    1 










































































































    1 
















































    1 



    1 











    1 







    1 












































    1 













































    1 



    1 






















































































































































































































































































































    1 






    1 




    1 











    1 






    1 




    1 











    1 






























































































































































































































































































































































































































    1 



    1 

    1 






























































    1 












    1 
    1 
    1 



    1 









    1 

































































































































    1 




    1 






































































































































    1 




    1 








    1 







    1 






    1 



























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
// SPDX-License-Identifier: GPL-2.0
/*
 * This file contains helper code to handle channel
 * settings and keeping track of what is possible at
 * any point in time.
 *
 * Copyright 2009        Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright 2018-2024        Intel Corporation
 */

#include <linux/export.h>
#include <linux/bitfield.h>
#include <net/cfg80211.h>
#include "core.h"
#include "rdev-ops.h"

/*
 * Tell whether @freq falls inside the inclusive range 58320..70200 that
 * this file accepts for 60 GHz operation (presumably MHz, since callers
 * pass a channel center_freq -- confirm against the callers).
 */
static bool cfg80211_valid_60g_freq(u32 freq)
{
        if (freq < 58320)
                return false;

        return freq <= 70200;
}

void cfg80211_chandef_create(struct cfg80211_chan_def *chandef,
                             struct ieee80211_channel *chan,
                             enum nl80211_channel_type chan_type)
{
        if (WARN_ON(!chan))
                return;

        *chandef = (struct cfg80211_chan_def) {
                .chan = chan,
                .freq1_offset = chan->freq_offset,
        };

        switch (chan_type) {
        case NL80211_CHAN_NO_HT:
                chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
                chandef->center_freq1 = chan->center_freq;
                break;
        case NL80211_CHAN_HT20:
                chandef->width = NL80211_CHAN_WIDTH_20;
                chandef->center_freq1 = chan->center_freq;
                break;
        case NL80211_CHAN_HT40PLUS:
                chandef->width = NL80211_CHAN_WIDTH_40;
                chandef->center_freq1 = chan->center_freq + 10;
                break;
        case NL80211_CHAN_HT40MINUS:
                chandef->width = NL80211_CHAN_WIDTH_40;
                chandef->center_freq1 = chan->center_freq - 10;
                break;
        default:
                WARN_ON(1);
        }
}
EXPORT_SYMBOL(cfg80211_chandef_create);

/*
 * One entry of the per-bandwidth puncturing table: the list of puncturing
 * bitmaps that valid_puncturing_bitmap() accepts for a given channel width.
 */
struct cfg80211_per_bw_puncturing_values {
        /* number of entries in valid_values */
        u8 len;
        /* accepted non-zero puncturing bitmaps for this bandwidth */
        const u16 *valid_values;
};

/*
 * Accepted puncturing bitmaps for an 80 MHz channel: exactly one of the
 * four low bits set (valid_puncturing_bitmap() maps each bit to a 20-wide
 * step counted from the low edge of the channel).
 */
static const u16 puncturing_values_80mhz[] = {
        0x8, 0x4, 0x2, 0x1
};

/*
 * Accepted puncturing bitmaps for a 160 MHz channel: any single bit of the
 * low eight, or one aligned pair of adjacent bits (0xc0, 0x30, 0xc, 0x3).
 */
static const u16 puncturing_values_160mhz[] = {
         0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1, 0xc0, 0x30, 0xc, 0x3
};

/*
 * Accepted puncturing bitmaps for a 320 MHz channel.  The entries are, in
 * order: one aligned bit pair, one aligned nibble, and one aligned nibble
 * combined with one aligned bit pair from a different nibble.
 */
static const u16 puncturing_values_320mhz[] = {
        0xc000, 0x3000, 0xc00, 0x300, 0xc0, 0x30, 0xc, 0x3, 0xf000, 0xf00,
        0xf0, 0xf, 0xfc00, 0xf300, 0xf0c0, 0xf030, 0xf00c, 0xf003, 0xc00f,
        0x300f, 0xc0f, 0x30f, 0xcf, 0x3f
};

/*
 * Build a struct cfg80211_per_bw_puncturing_values initializer for the
 * bandwidth @_bw by token-pasting it into the matching
 * puncturing_values_<_bw>mhz array name; len is taken from ARRAY_SIZE() of
 * that array.
 */
#define CFG80211_PER_BW_VALID_PUNCTURING_VALUES(_bw) \
        { \
                .len = ARRAY_SIZE(puncturing_values_ ## _bw ## mhz), \
                .valid_values = puncturing_values_ ## _bw ## mhz \
        }

/*
 * Accepted puncturing bitmaps per bandwidth.  The entry order matters:
 * valid_puncturing_bitmap() indexes this table with 0 for 80 MHz, 1 for
 * 160 MHz and 2 for 320 MHz.
 */
static const struct cfg80211_per_bw_puncturing_values per_bw_puncturing[] = {
        CFG80211_PER_BW_VALID_PUNCTURING_VALUES(80),
        CFG80211_PER_BW_VALID_PUNCTURING_VALUES(160),
        CFG80211_PER_BW_VALID_PUNCTURING_VALUES(320)
};

static bool valid_puncturing_bitmap(const struct cfg80211_chan_def *chandef)
{
        u32 idx, i, start_freq, primary_center = chandef->chan->center_freq;

        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_80:
                idx = 0;
                start_freq = chandef->center_freq1 - 40;
                break;
        case NL80211_CHAN_WIDTH_160:
                idx = 1;
                start_freq = chandef->center_freq1 - 80;
                break;
        case NL80211_CHAN_WIDTH_320:
                idx = 2;
                start_freq = chandef->center_freq1 - 160;
                break;
        default:
                return chandef->punctured == 0;
        }

        if (!chandef->punctured)
                return true;

        /* check if primary channel is punctured */
        if (chandef->punctured & (u16)BIT((primary_center - start_freq) / 20))
                return false;

        for (i = 0; i < per_bw_puncturing[idx].len; i++) {
                if (per_bw_puncturing[idx].valid_values[i] == chandef->punctured)
                        return true;
        }

        return false;
}

/*
 * Basic verification of the EDMG configuration in @chandef according to
 * IEEE P802.11ay/D4.0 section 9.4.2.251.
 *
 * Both edmg.channels and edmg.bw_config must be non-zero and the control
 * channel must pass cfg80211_valid_60g_freq().  The low six bits of
 * edmg.channels are then scanned for the number of enabled channels and the
 * longest run of adjacent enabled channels, and both figures are checked
 * against what bw_config requires.
 */
static bool cfg80211_edmg_chandef_valid(const struct cfg80211_chan_def *chandef)
{
        int longest_run = 0;
        int enabled = 0;
        int run = 0;
        int min_run;
        int bit;

        if (!chandef->edmg.channels || !chandef->edmg.bw_config)
                return false;

        if (!cfg80211_valid_60g_freq(chandef->chan->center_freq))
                return false;

        for (bit = 0; bit < 6; bit++) {
                if (!(chandef->edmg.channels & BIT(bit))) {
                        /* a gap ends the current run */
                        run = 0;
                        continue;
                }

                enabled++;
                run++;
                if (run > longest_run)
                        longest_run = run;
        }

        /* check bw_config against contiguous edmg channels */
        switch (chandef->edmg.bw_config) {
        case IEEE80211_EDMG_BW_CONFIG_4:
        case IEEE80211_EDMG_BW_CONFIG_8:
        case IEEE80211_EDMG_BW_CONFIG_12:
                min_run = 1;
                break;
        case IEEE80211_EDMG_BW_CONFIG_5:
        case IEEE80211_EDMG_BW_CONFIG_9:
        case IEEE80211_EDMG_BW_CONFIG_13:
                min_run = 2;
                break;
        case IEEE80211_EDMG_BW_CONFIG_6:
        case IEEE80211_EDMG_BW_CONFIG_10:
        case IEEE80211_EDMG_BW_CONFIG_14:
                min_run = 3;
                break;
        case IEEE80211_EDMG_BW_CONFIG_7:
        case IEEE80211_EDMG_BW_CONFIG_11:
        case IEEE80211_EDMG_BW_CONFIG_15:
                min_run = 4;
                break;
        default:
                return false;
        }

        if (longest_run < min_run)
                return false;

        /* check bw_config against aggregated (non contiguous) edmg channels */
        switch (chandef->edmg.bw_config) {
        case IEEE80211_EDMG_BW_CONFIG_4:
        case IEEE80211_EDMG_BW_CONFIG_5:
        case IEEE80211_EDMG_BW_CONFIG_6:
        case IEEE80211_EDMG_BW_CONFIG_7:
                return true;
        case IEEE80211_EDMG_BW_CONFIG_8:
        case IEEE80211_EDMG_BW_CONFIG_9:
        case IEEE80211_EDMG_BW_CONFIG_10:
        case IEEE80211_EDMG_BW_CONFIG_11:
                return enabled >= 2;
        case IEEE80211_EDMG_BW_CONFIG_12:
        case IEEE80211_EDMG_BW_CONFIG_13:
        case IEEE80211_EDMG_BW_CONFIG_14:
        case IEEE80211_EDMG_BW_CONFIG_15:
                return enabled >= 4 && longest_run >= 2;
        default:
                return false;
        }
}

/*
 * Translate an nl80211 channel width into its bandwidth in MHz.
 *
 * NL80211_CHAN_WIDTH_20_NOHT maps to 20 and NL80211_CHAN_WIDTH_80P80 maps
 * to 80.  An unrecognised width triggers a one-time warning and yields -1.
 */
int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width)
{
        switch (chan_width) {
        case NL80211_CHAN_WIDTH_1:
                return 1;
        case NL80211_CHAN_WIDTH_2:
                return 2;
        case NL80211_CHAN_WIDTH_4:
                return 4;
        case NL80211_CHAN_WIDTH_8:
                return 8;
        case NL80211_CHAN_WIDTH_16:
                return 16;
        case NL80211_CHAN_WIDTH_5:
                return 5;
        case NL80211_CHAN_WIDTH_10:
                return 10;
        case NL80211_CHAN_WIDTH_20:
        case NL80211_CHAN_WIDTH_20_NOHT:
                return 20;
        case NL80211_CHAN_WIDTH_40:
                return 40;
        case NL80211_CHAN_WIDTH_80P80:
        case NL80211_CHAN_WIDTH_80:
                return 80;
        case NL80211_CHAN_WIDTH_160:
                return 160;
        case NL80211_CHAN_WIDTH_320:
                return 320;
        default:
                WARN_ON_ONCE(1);
                return -1;
        }
}
EXPORT_SYMBOL(nl80211_chan_width_to_mhz);

/*
 * Bandwidth of channel definition @c in MHz, as reported by
 * nl80211_chan_width_to_mhz() for c->width.
 */
static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c)
{
        int mhz = nl80211_chan_width_to_mhz(c->width);

        return mhz;
}

/*
 * Sanity-check a channel definition.
 *
 * Returns true only if the definition has a control channel, its
 * freq1_offset is below 1000, the width-specific rules on center_freq1 /
 * center_freq2 enforced by the two switch statements below hold, the
 * channel-14 restriction is respected, any EDMG configuration is valid
 * and the puncturing bitmap is valid. Returns false otherwise.
 */
bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef)
{
        u32 control_freq, oper_freq;
        int oper_width, control_width;

        if (!chandef->chan)
                return false;

        /* NOTE(review): presumably a sub-MHz offset in kHz -- confirm */
        if (chandef->freq1_offset >= 1000)
                return false;

        control_freq = chandef->chan->center_freq;

        /*
         * First pass: rules that depend on the width class, mostly about
         * whether a second center frequency may or must be present.
         */
        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_5:
        case NL80211_CHAN_WIDTH_10:
        case NL80211_CHAN_WIDTH_20:
        case NL80211_CHAN_WIDTH_20_NOHT:
                /* operating center must coincide with the control channel */
                if (ieee80211_chandef_to_khz(chandef) !=
                    ieee80211_channel_to_khz(chandef->chan))
                        return false;
                if (chandef->center_freq2)
                        return false;
                break;
        case NL80211_CHAN_WIDTH_1:
        case NL80211_CHAN_WIDTH_2:
        case NL80211_CHAN_WIDTH_4:
        case NL80211_CHAN_WIDTH_8:
        case NL80211_CHAN_WIDTH_16:
                /* these widths are only accepted on the S1G band */
                if (chandef->chan->band != NL80211_BAND_S1GHZ)
                        return false;

                /*
                 * control_freq is overwritten with a kHz value here; that is
                 * safe because the second switch below does not use it for
                 * these widths.
                 */
                control_freq = ieee80211_channel_to_khz(chandef->chan);
                oper_freq = ieee80211_chandef_to_khz(chandef);
                control_width = nl80211_chan_width_to_mhz(
                                        ieee80211_s1g_channel_width(
                                                                chandef->chan));
                oper_width = cfg80211_chandef_get_width(chandef);

                if (oper_width < 0 || control_width < 0)
                        return false;
                if (chandef->center_freq2)
                        return false;

                /* upper edge of the control channel must not exceed the operating channel's */
                if (control_freq + MHZ_TO_KHZ(control_width) / 2 >
                    oper_freq + MHZ_TO_KHZ(oper_width) / 2)
                        return false;

                /* lower edge of the control channel must not fall below the operating channel's */
                if (control_freq - MHZ_TO_KHZ(control_width) / 2 <
                    oper_freq - MHZ_TO_KHZ(oper_width) / 2)
                        return false;
                break;
        case NL80211_CHAN_WIDTH_80P80:
                /* 80+80 is the only width that requires a second segment */
                if (!chandef->center_freq2)
                        return false;
                /* adjacent is not allowed -- that's a 160 MHz channel */
                if (chandef->center_freq1 - chandef->center_freq2 == 80 ||
                    chandef->center_freq2 - chandef->center_freq1 == 80)
                        return false;
                break;
        default:
                if (chandef->center_freq2)
                        return false;
                break;
        }

        /*
         * Second pass: center_freq1 must sit at one of the allowed offsets
         * from the control channel. Each case accepts its own offsets and
         * otherwise falls through, so a wider width also accepts every
         * offset listed in the narrower cases beneath it; reaching the
         * default label means no offset matched.
         */
        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_5:
        case NL80211_CHAN_WIDTH_10:
        case NL80211_CHAN_WIDTH_20:
        case NL80211_CHAN_WIDTH_20_NOHT:
        case NL80211_CHAN_WIDTH_1:
        case NL80211_CHAN_WIDTH_2:
        case NL80211_CHAN_WIDTH_4:
        case NL80211_CHAN_WIDTH_8:
        case NL80211_CHAN_WIDTH_16:
                /* all checked above */
                break;
        case NL80211_CHAN_WIDTH_320:
                if (chandef->center_freq1 == control_freq + 150 ||
                    chandef->center_freq1 == control_freq + 130 ||
                    chandef->center_freq1 == control_freq + 110 ||
                    chandef->center_freq1 == control_freq + 90 ||
                    chandef->center_freq1 == control_freq - 90 ||
                    chandef->center_freq1 == control_freq - 110 ||
                    chandef->center_freq1 == control_freq - 130 ||
                    chandef->center_freq1 == control_freq - 150)
                        break;
                fallthrough;
        case NL80211_CHAN_WIDTH_160:
                if (chandef->center_freq1 == control_freq + 70 ||
                    chandef->center_freq1 == control_freq + 50 ||
                    chandef->center_freq1 == control_freq - 50 ||
                    chandef->center_freq1 == control_freq - 70)
                        break;
                fallthrough;
        case NL80211_CHAN_WIDTH_80P80:
        case NL80211_CHAN_WIDTH_80:
                if (chandef->center_freq1 == control_freq + 30 ||
                    chandef->center_freq1 == control_freq - 30)
                        break;
                fallthrough;
        case NL80211_CHAN_WIDTH_40:
                if (chandef->center_freq1 == control_freq + 10 ||
                    chandef->center_freq1 == control_freq - 10)
                        break;
                fallthrough;
        default:
                return false;
        }

        /* channel 14 is only for IEEE 802.11b */
        if (chandef->center_freq1 == 2484 &&
            chandef->width != NL80211_CHAN_WIDTH_20_NOHT)
                return false;

        /* an EDMG definition must additionally pass the EDMG-specific check */
        if (cfg80211_chandef_is_edmg(chandef) &&
            !cfg80211_edmg_chandef_valid(chandef))
                return false;

        /* the puncturing bitmap check has the final say */
        return valid_puncturing_bitmap(chandef);
}
EXPORT_SYMBOL(cfg80211_chandef_valid);

/*
 * Compute the center frequency of the primary sub-channel of the requested
 * width inside chandef @c.
 *
 * The channel is halved repeatedly, each time keeping the half on the side
 * of the control channel, until the requested width is reached. If
 * @punctured is non-NULL it receives @c's puncturing bitmap reduced in step
 * with each halving (shifted down when the upper half is kept, masked when
 * the lower half is kept).
 *
 * Returns the resulting center frequency, or -1 (with a one-time warning)
 * if either width is unknown or the requested width exceeds @c's width.
 */
int cfg80211_chandef_primary(const struct cfg80211_chan_def *c,
                             enum nl80211_chan_width primary_chan_width,
                             u16 *punctured)
{
        int target_mhz = nl80211_chan_width_to_mhz(primary_chan_width);
        int cur_mhz = cfg80211_chandef_get_width(c);
        u32 ctrl_freq = c->chan->center_freq;
        u32 cur_center = c->center_freq1;
        u16 scratch = 0;

        if (WARN_ON_ONCE(target_mhz < 0 || cur_mhz < 0))
                return -1;

        /* not intended to be called this way, can't determine */
        if (WARN_ON_ONCE(target_mhz > cur_mhz))
                return -1;

        /* let callers that do not care about puncturing pass NULL */
        if (!punctured)
                punctured = &scratch;

        *punctured = c->punctured;

        for (; cur_mhz > target_mhz; cur_mhz /= 2) {
                /* number of bitmap bits covering the half being discarded */
                unsigned int half_bits = cur_mhz / 20 / 2;

                if (ctrl_freq > cur_center) {
                        /* control channel is in the upper half */
                        cur_center += cur_mhz / 4;
                        *punctured >>= half_bits;
                } else {
                        /* control channel is in the lower half */
                        cur_center -= cur_mhz / 4;
                        *punctured &= (1 << half_bits) - 1;
                }
        }

        return cur_center;
}
EXPORT_SYMBOL(cfg80211_chandef_primary);

/*
 * Compare the primary sub-channel of width @primary_chan_width of @c1 and
 * @c2, including the puncturing bitmap restricted to that sub-channel.
 *
 * Returns ERR_PTR(-EINVAL) on any mismatch, @c2 if they match and @c1 is
 * exactly that wide (the caller is expected to pass the narrower chandef
 * as @c1), or NULL if they match but wider primaries still need checking.
 */
static const struct cfg80211_chan_def *
check_chandef_primary_compat(const struct cfg80211_chan_def *c1,
                             const struct cfg80211_chan_def *c2,
                             enum nl80211_chan_width primary_chan_width)
{
        u16 punct1 = 0, punct2 = 0;
        int primary1, primary2;

        primary1 = cfg80211_chandef_primary(c1, primary_chan_width, &punct1);
        primary2 = cfg80211_chandef_primary(c2, primary_chan_width, &punct2);

        /* differing primary center or puncturing -> incompatible */
        if (primary1 != primary2 || punct1 != punct2)
                return ERR_PTR(-EINVAL);

        /* c1 is assumed narrower; if its full width was just compared, done */
        if (c1->width == primary_chan_width)
                return c2;

        /* tell the caller to move on to the next width */
        return NULL;
}

/*
 * Core of the chandef compatibility check.
 *
 * Returns @c2 if both definitions are identical, the wider of the two if
 * the primary-channel checks succeed, NULL if they cannot be compatible,
 * or an ERR_PTR() handed back by check_chandef_primary_compat().
 */
static const struct cfg80211_chan_def *
_cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1,
                             const struct cfg80211_chan_def *c2)
{
        const struct cfg80211_chan_def *narrow = c1;
        const struct cfg80211_chan_def *wide = c2;
        const struct cfg80211_chan_def *verdict;

        /* identical definitions are trivially compatible */
        if (cfg80211_chandef_identical(c1, c2))
                return c2;

        /*
         * A different control channel rules compatibility out, and so does
         * an equal width: same width but not identical cannot match.
         */
        if (c1->chan != c2->chan || c1->width == c2->width)
                return NULL;

        /*
         * can't be compatible if one of them is 5/10 MHz or S1G
         * but they don't have the same width.
         */
#define NARROW_OR_S1G(width)        ((width) == NL80211_CHAN_WIDTH_5 || \
                                 (width) == NL80211_CHAN_WIDTH_10 || \
                                 (width) == NL80211_CHAN_WIDTH_1 || \
                                 (width) == NL80211_CHAN_WIDTH_2 || \
                                 (width) == NL80211_CHAN_WIDTH_4 || \
                                 (width) == NL80211_CHAN_WIDTH_8 || \
                                 (width) == NL80211_CHAN_WIDTH_16)

        if (NARROW_OR_S1G(c1->width) || NARROW_OR_S1G(c2->width))
                return NULL;

        /*
         * Order the pair so that "narrow" really is the narrower one; from
         * here on the answer is either NULL/error or "wide".
         */
        if (narrow->width > wide->width)
                swap(narrow, wide);

        /*
         * A 20 MHz (HT or non-HT) narrower side needs no further checks
         * against a wider definition on the same control channel.
         */
        if (narrow->width <= NL80211_CHAN_WIDTH_20)
                return wide;

        verdict = check_chandef_primary_compat(narrow, wide,
                                               NL80211_CHAN_WIDTH_40);
        if (verdict)
                return verdict;

        verdict = check_chandef_primary_compat(narrow, wide,
                                               NL80211_CHAN_WIDTH_80);
        if (verdict)
                return verdict;

        /*
         * An 80+80 narrower side would face 160 or wider, which cannot
         * match; two 80+80 definitions were already settled above by the
         * identical / same-width checks.
         */
        if (narrow->width == NL80211_CHAN_WIDTH_80P80)
                return NULL;

        verdict = check_chandef_primary_compat(narrow, wide,
                                               NL80211_CHAN_WIDTH_160);
        if (verdict)
                return verdict;

        /*
         * Reaching this point would require two non-identical definitions
         * wider than 160 sharing a primary 160; both would have to be 320,
         * which the same-width check excludes.
         */
        WARN_ON_ONCE(1);

        return NULL;
}

/*
 * Public wrapper around _cfg80211_chandef_compatible(): callers only ever
 * see a chandef pointer or NULL, so an ERR_PTR() result is folded into NULL.
 */
const struct cfg80211_chan_def *
cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1,
                            const struct cfg80211_chan_def *c2)
{
        const struct cfg80211_chan_def *compat;

        compat = _cfg80211_chandef_compatible(c1, c2);

        return IS_ERR(compat) ? NULL : compat;
}
EXPORT_SYMBOL(cfg80211_chandef_compatible);

/*
 * Set the DFS state of every radar channel found in one segment.
 *
 * Walks the range [center_freq - bandwidth/2 + 10,
 * center_freq + bandwidth/2 - 10] in steps of 20, looks each frequency up
 * on @wiphy and, for channels flagged IEEE80211_CHAN_RADAR, stores
 * @dfs_state and stamps the time of the change. Frequencies with no
 * channel or without the radar flag are skipped.
 */
static void cfg80211_set_chans_dfs_state(struct wiphy *wiphy, u32 center_freq,
                                         u32 bandwidth,
                                         enum nl80211_dfs_state dfs_state)
{
        const u32 first = center_freq - bandwidth/2 + 10;
        const u32 last = center_freq + bandwidth/2 - 10;
        u32 freq;

        for (freq = first; freq <= last; freq += 20) {
                struct ieee80211_channel *chan;

                chan = ieee80211_get_channel(wiphy, freq);
                if (chan && (chan->flags & IEEE80211_CHAN_RADAR)) {
                        chan->dfs_state = dfs_state;
                        chan->dfs_state_entered = jiffies;
                }
        }
}

/*
 * Apply @dfs_state to the radar channels covered by @chandef.
 *
 * Warns and does nothing if the chandef is invalid, and does nothing if
 * its width is unknown. The segment around center_freq1 is always
 * updated; the segment around center_freq2 only when that field is set.
 */
void cfg80211_set_dfs_state(struct wiphy *wiphy,
                            const struct cfg80211_chan_def *chandef,
                            enum nl80211_dfs_state dfs_state)
{
        int seg_mhz;

        if (WARN_ON(!cfg80211_chandef_valid(chandef)))
                return;

        seg_mhz = cfg80211_chandef_get_width(chandef);
        if (seg_mhz < 0)
                return;

        cfg80211_set_chans_dfs_state(wiphy, chandef->center_freq1,
                                     seg_mhz, dfs_state);

        /* second segment, if the chandef has one */
        if (chandef->center_freq2)
                cfg80211_set_chans_dfs_state(wiphy, chandef->center_freq2,
                                             seg_mhz, dfs_state);
}

/*
 * Center frequency (kHz) of the lowest 20 MHz sub-channel of a segment.
 *
 * @center_freq is taken in kHz and @bandwidth in MHz. For segments of
 * 20 MHz or less the segment center itself is returned.
 */
static u32 cfg80211_get_start_freq(u32 center_freq,
                                   u32 bandwidth)
{
        u32 span_khz = MHZ_TO_KHZ(bandwidth);

        if (span_khz <= MHZ_TO_KHZ(20))
                return center_freq;

        return center_freq - span_khz / 2 + MHZ_TO_KHZ(10);
}

/*
 * Center frequency (kHz) of the highest 20 MHz sub-channel of a segment.
 *
 * @center_freq is taken in kHz and @bandwidth in MHz. For segments of
 * 20 MHz or less the segment center itself is returned.
 */
static u32 cfg80211_get_end_freq(u32 center_freq,
                                 u32 bandwidth)
{
        u32 span_khz = MHZ_TO_KHZ(bandwidth);

        if (span_khz <= MHZ_TO_KHZ(20))
                return center_freq;

        return center_freq + span_khz / 2 - MHZ_TO_KHZ(10);
}

static bool
cfg80211_dfs_permissive_check_wdev(struct cfg80211_registered_device *rdev,
                                   enum nl80211_iftype iftype,
                                   struct wireless_dev *wdev,
                                   struct ieee80211_channel *chan)
{
        unsigned int link_id;

        for_each_valid_link(wdev, link_id) {
                struct ieee80211_channel *other_chan = NULL;
                struct cfg80211_chan_def chandef = {};
                int ret;

                /* In order to avoid daisy chaining only allow BSS STA */
                if (wdev->iftype != NL80211_IFTYPE_STATION ||
                    !wdev->links[link_id].client.current_bss)
                        continue;

                other_chan =
                        wdev->links[link_id].client.current_bss->pub.channel;

                if (!other_chan)
                        continue;

                if (chan == other_chan)
                        return true;

                /* continue if we can't get the channel */
                ret = rdev_get_channel(rdev, wdev, link_id, &chandef);
                if (ret)
                        continue;

                if (cfg80211_is_sub_chan(&chandef, chan, false))
                        return true;
        }

        return false;
}

/*
 * Check if P2P GO is allowed to operate on a DFS channel.
 *
 * This requires the wiphy to advertise
 * NL80211_EXT_FEATURE_DFS_CONCURRENT, the channel to carry
 * IEEE80211_CHAN_DFS_CONCURRENT, the interface type to be P2P GO, and at
 * least one wdev on the wiphy to pass
 * cfg80211_dfs_permissive_check_wdev() for @chan.
 */
static bool cfg80211_dfs_permissive_chan(struct wiphy *wiphy,
                                         enum nl80211_iftype iftype,
                                         struct ieee80211_channel *chan)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct wireless_dev *iter;

        lockdep_assert_held(&rdev->wiphy.mtx);

        if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_DFS_CONCURRENT))
                return false;

        if (!(chan->flags & IEEE80211_CHAN_DFS_CONCURRENT))
                return false;

        /* only valid for P2P GO */
        if (iftype != NL80211_IFTYPE_P2P_GO)
                return false;

        /* allow only if there's a concurrent BSS */
        list_for_each_entry(iter, &rdev->wiphy.wdev_list, list) {
                if (cfg80211_dfs_permissive_check_wdev(rdev, iftype,
                                                       iter, chan))
                        return true;
        }

        return false;
}

/*
 * Determine whether any 20 MHz sub-channel of a segment requires DFS.
 *
 * Returns -EINVAL if a sub-channel does not exist on @wiphy, 1 if a
 * sub-channel is flagged IEEE80211_CHAN_RADAR and is not exempted by
 * cfg80211_dfs_permissive_chan(), and 0 otherwise.
 */
static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy,
                                            u32 center_freq,
                                            u32 bandwidth,
                                            enum nl80211_iftype iftype)
{
        const u32 first_khz = cfg80211_get_start_freq(center_freq, bandwidth);
        const u32 last_khz = cfg80211_get_end_freq(center_freq, bandwidth);
        u32 khz;

        for (khz = first_khz; khz <= last_khz; khz += MHZ_TO_KHZ(20)) {
                struct ieee80211_channel *chan;

                chan = ieee80211_get_channel_khz(wiphy, khz);
                if (!chan)
                        return -EINVAL;

                /* non-radar channels never require DFS */
                if (!(chan->flags & IEEE80211_CHAN_RADAR))
                        continue;

                if (!cfg80211_dfs_permissive_chan(wiphy, iftype, chan))
                        return 1;
        }

        return 0;
}


/*
 * Report whether operating on @chandef with interface type @iftype
 * requires DFS.
 *
 * Returns a negative errno if the chandef is invalid, its width is
 * unknown or a sub-channel lookup fails; BIT(chandef->width) if DFS is
 * required on either segment; and 0 otherwise. Only ad-hoc, AP, P2P GO
 * and mesh point interfaces are examined; the remaining known types
 * return 0, with a warning for WDS, unspecified and the enum sentinel.
 */
int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
                                  const struct cfg80211_chan_def *chandef,
                                  enum nl80211_iftype iftype)
{
        int seg_mhz;
        int rc;

        if (WARN_ON(!cfg80211_chandef_valid(chandef)))
                return -EINVAL;

        switch (iftype) {
        case NL80211_IFTYPE_ADHOC:
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_P2P_GO:
        case NL80211_IFTYPE_MESH_POINT:
                seg_mhz = cfg80211_chandef_get_width(chandef);
                if (seg_mhz < 0)
                        return -EINVAL;

                /* first segment */
                rc = cfg80211_get_chans_dfs_required(wiphy,
                                        ieee80211_chandef_to_khz(chandef),
                                        seg_mhz, iftype);

                /* second segment only if the first was clean and it exists */
                if (!rc && chandef->center_freq2)
                        rc = cfg80211_get_chans_dfs_required(wiphy,
                                        MHZ_TO_KHZ(chandef->center_freq2),
                                        seg_mhz, iftype);

                if (rc < 0)
                        return rc;
                if (rc > 0)
                        return BIT(chandef->width);

                break;
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_OCB:
        case NL80211_IFTYPE_P2P_CLIENT:
        case NL80211_IFTYPE_MONITOR:
        case NL80211_IFTYPE_AP_VLAN:
        case NL80211_IFTYPE_P2P_DEVICE:
        case NL80211_IFTYPE_NAN:
                break;
        case NL80211_IFTYPE_WDS:
        case NL80211_IFTYPE_UNSPECIFIED:
        case NUM_NL80211_IFTYPES:
                WARN_ON(1);
                break;
        }

        return 0;
}
EXPORT_SYMBOL(cfg80211_chandef_dfs_required);

/*
 * Count the sub-channels of a segment that are in state NL80211_DFS_USABLE.
 *
 * Every 20 MHz sub-channel must exist and must not be disabled, and radar
 * channels must not be in state NL80211_DFS_UNAVAILABLE; otherwise -EINVAL
 * is returned. Non-radar channels are permitted and simply not counted.
 */
static int cfg80211_get_chans_dfs_usable(struct wiphy *wiphy,
                                         u32 center_freq,
                                         u32 bandwidth)
{
        const u32 first_khz = cfg80211_get_start_freq(center_freq, bandwidth);
        const u32 last_khz = cfg80211_get_end_freq(center_freq, bandwidth);
        int usable = 0;
        u32 khz;

        for (khz = first_khz; khz <= last_khz; khz += MHZ_TO_KHZ(20)) {
                struct ieee80211_channel *chan;

                chan = ieee80211_get_channel_khz(wiphy, khz);
                if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED))
                        return -EINVAL;

                /* a mix of DFS and non-DFS channels is allowed */
                if (!(chan->flags & IEEE80211_CHAN_RADAR))
                        continue;

                switch (chan->dfs_state) {
                case NL80211_DFS_UNAVAILABLE:
                        return -EINVAL;
                case NL80211_DFS_USABLE:
                        /* still needs CAC */
                        usable++;
                        break;
                default:
                        break;
                }
        }

        return usable;
}

/*
 * Tell whether @chandef contains at least one sub-channel in state
 * NL80211_DFS_USABLE while no sub-channel disqualifies it.
 *
 * Returns false if the chandef is invalid, its width is unknown, or
 * cfg80211_get_chans_dfs_usable() fails for a segment. The second segment
 * is evaluated only for 80+80; a mismatch between the width and the
 * presence of center_freq2 triggers a warning.
 */
bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy,
                                 const struct cfg80211_chan_def *chandef)
{
        int seg_mhz;
        int usable_seg1, usable_seg2 = 0;

        if (WARN_ON(!cfg80211_chandef_valid(chandef)))
                return false;

        seg_mhz = cfg80211_chandef_get_width(chandef);
        if (seg_mhz < 0)
                return false;

        usable_seg1 = cfg80211_get_chans_dfs_usable(wiphy,
                                        MHZ_TO_KHZ(chandef->center_freq1),
                                        seg_mhz);
        if (usable_seg1 < 0)
                return false;

        if (chandef->width == NL80211_CHAN_WIDTH_80P80) {
                WARN_ON(!chandef->center_freq2);
                usable_seg2 = cfg80211_get_chans_dfs_usable(wiphy,
                                        MHZ_TO_KHZ(chandef->center_freq2),
                                        seg_mhz);
                if (usable_seg2 < 0)
                        return false;
        } else {
                WARN_ON(chandef->center_freq2);
        }

        return usable_seg1 + usable_seg2 > 0;
}
EXPORT_SYMBOL(cfg80211_chandef_dfs_usable);

/*
 * Checks if the center frequency of chan falls within the bandwidth
 * range of chandef.
 *
 * A match on the control channel's center frequency always counts. With
 * @primary_only set nothing else is considered. Otherwise, for widths
 * above 20 MHz, every 20 MHz step of the center_freq1 segment (and of the
 * center_freq2 segment, if present) is compared with chan's center
 * frequency.
 */
bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef,
                          struct ieee80211_channel *chan,
                          bool primary_only)
{
        u32 seg_center[2];
        unsigned int nr_segs, seg;
        int width;

        if (!chandef->chan)
                return false;

        if (chandef->chan->center_freq == chan->center_freq)
                return true;

        if (primary_only)
                return false;

        width = cfg80211_chandef_get_width(chandef);
        if (width <= 20)
                return false;

        seg_center[0] = chandef->center_freq1;
        seg_center[1] = chandef->center_freq2;
        /* the second segment only exists when center_freq2 is set */
        nr_segs = seg_center[1] ? 2 : 1;

        for (seg = 0; seg < nr_segs; seg++) {
                const u32 first = seg_center[seg] - width / 2 + 10;
                const u32 last = seg_center[seg] + width / 2 - 10;
                u32 freq;

                for (freq = first; freq <= last; freq += 20) {
                        if (chan->center_freq == freq)
                                return true;
                }
        }

        return false;
}

/*
 * Tell whether @wdev is a beaconing interface that is currently active.
 *
 * AP / P2P GO: true if any valid link has a non-zero beacon interval.
 * Ad-hoc: true if an IBSS SSID length is set.
 * Mesh point: true if a mesh ID length is set.
 * All other interface types yield false; WDS, unspecified and the enum
 * sentinel additionally trigger a warning.
 */
bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev)
{
        unsigned int link_id;

        lockdep_assert_wiphy(wdev->wiphy);

        switch (wdev->iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_P2P_GO:
                for_each_valid_link(wdev, link_id) {
                        if (wdev->links[link_id].ap.beacon_interval)
                                return true;
                }
                return false;
        case NL80211_IFTYPE_ADHOC:
                return wdev->u.ibss.ssid_len != 0;
        case NL80211_IFTYPE_MESH_POINT:
                return wdev->u.mesh.id_len != 0;
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_OCB:
        case NL80211_IFTYPE_P2P_CLIENT:
        case NL80211_IFTYPE_MONITOR:
        case NL80211_IFTYPE_AP_VLAN:
        case NL80211_IFTYPE_P2P_DEVICE:
        /* Can NAN type be considered as beaconing interface? */
        case NL80211_IFTYPE_NAN:
                break;
        case NL80211_IFTYPE_UNSPECIFIED:
        case NL80211_IFTYPE_WDS:
        case NUM_NL80211_IFTYPES:
                WARN_ON(1);
                break;
        }

        return false;
}

/*
 * Check whether @chan is a sub-channel of the chandef @wdev operates on.
 *
 * Ad-hoc and mesh interfaces are checked against their single chandef,
 * AP / P2P GO interfaces against the AP chandef of every valid link.
 * Any other interface type yields false. @primary_only is forwarded to
 * cfg80211_is_sub_chan().
 */
bool cfg80211_wdev_on_sub_chan(struct wireless_dev *wdev,
                               struct ieee80211_channel *chan,
                               bool primary_only)
{
        const enum nl80211_iftype type = wdev->iftype;
        unsigned int link_id;

        if (type == NL80211_IFTYPE_ADHOC)
                return cfg80211_is_sub_chan(&wdev->u.ibss.chandef, chan,
                                            primary_only);

        if (type == NL80211_IFTYPE_MESH_POINT)
                return cfg80211_is_sub_chan(&wdev->u.mesh.chandef, chan,
                                            primary_only);

        if (type != NL80211_IFTYPE_AP && type != NL80211_IFTYPE_P2P_GO)
                return false;

        for_each_valid_link(wdev, link_id) {
                if (cfg80211_is_sub_chan(&wdev->links[link_id].ap.chandef,
                                         chan, primary_only))
                        return true;
        }

        return false;
}

/*
 * Tell whether any active beaconing interface on @wiphy operates on a
 * chandef that contains @chan (secondary sub-channels included).
 */
static bool cfg80211_is_wiphy_oper_chan(struct wiphy *wiphy,
                                        struct ieee80211_channel *chan)
{
        struct wireless_dev *iter;

        lockdep_assert_wiphy(wiphy);

        list_for_each_entry(iter, &wiphy->wdev_list, list) {
                if (cfg80211_beaconing_iface_active(iter) &&
                    cfg80211_wdev_on_sub_chan(iter, chan, false))
                        return true;
        }

        return false;
}

/*
 * Tell whether @rdev has a background radar wdev whose (valid) background
 * radar chandef contains @channel.
 */
static bool
cfg80211_offchan_chain_is_active(struct cfg80211_registered_device *rdev,
                                 struct ieee80211_channel *channel)
{
        return rdev->background_radar_wdev &&
               cfg80211_chandef_valid(&rdev->background_radar_chandef) &&
               cfg80211_is_sub_chan(&rdev->background_radar_chandef, channel,
                                    false);
}

/*
 * Tell whether radar channel @chan is in use on any registered device that
 * shares @wiphy's DFS domain, either by an active beaconing interface or
 * by the background radar chain. Non-radar channels always yield false.
 *
 * Each candidate device is inspected with its wiphy lock held.
 */
bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy,
                                  struct ieee80211_channel *chan)
{
        struct cfg80211_registered_device *rdev;

        ASSERT_RTNL();

        if (!(chan->flags & IEEE80211_CHAN_RADAR))
                return false;

        for_each_rdev(rdev) {
                bool in_use;

                /* only devices in the same DFS domain are relevant */
                if (!reg_dfs_domain_same(wiphy, &rdev->wiphy))
                        continue;

                wiphy_lock(&rdev->wiphy);
                in_use = cfg80211_is_wiphy_oper_chan(&rdev->wiphy, chan);
                if (!in_use)
                        in_use = cfg80211_offchan_chain_is_active(rdev, chan);
                wiphy_unlock(&rdev->wiphy);

                if (in_use)
                        return true;
        }

        return false;
}

/*
 * Check that every 20 MHz sub-channel of a segment may be used right away.
 *
 * Each sub-channel must exist and must not be disabled. A radar channel
 * must be in state NL80211_DFS_AVAILABLE, or in state NL80211_DFS_USABLE
 * when the wiphy advertises NL80211_EXT_FEATURE_DFS_OFFLOAD.
 */
static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy,
                                             u32 center_freq,
                                             u32 bandwidth)
{
        const bool offload = wiphy_ext_feature_isset(wiphy,
                                        NL80211_EXT_FEATURE_DFS_OFFLOAD);
        const u32 first_khz = cfg80211_get_start_freq(center_freq, bandwidth);
        const u32 last_khz = cfg80211_get_end_freq(center_freq, bandwidth);
        u32 khz;

        for (khz = first_khz; khz <= last_khz; khz += MHZ_TO_KHZ(20)) {
                struct ieee80211_channel *chan;

                chan = ieee80211_get_channel_khz(wiphy, khz);
                if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED))
                        return false;

                /* no DFS constraints on non-radar channels */
                if (!(chan->flags & IEEE80211_CHAN_RADAR))
                        continue;

                if (chan->dfs_state == NL80211_DFS_AVAILABLE)
                        continue;

                /* with DFS offload a merely usable channel is acceptable */
                if (chan->dfs_state == NL80211_DFS_USABLE && offload)
                        continue;

                return false;
        }

        return true;
}

static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy,
                                const struct cfg80211_chan_def *chandef)
{
        int width;
        int r;

        if (WARN_ON(!cfg80211_chandef_valid(chandef)))
                return false;

        width = cfg80211_chandef_get_width(chandef);
        if (width < 0)
                return false;

        r = cfg80211_get_chans_dfs_available(wiphy,
                                             MHZ_TO_KHZ(chandef->center_freq1),
                                             width);

        /* If any of channels unavailable for cf1 just return */
        if (!r)
                return r;

        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_80P80:
                WARN_ON(!chandef->center_freq2);
                r = cfg80211_get_chans_dfs_available(wiphy,
                                        MHZ_TO_KHZ(chandef->center_freq2),
                                        width);
                break;
        default:
                WARN_ON(chandef->center_freq2);
                break;
        }

        return r;
}

/*
 * Largest dfs_cac_ms value among the radar sub-channels of a segment.
 *
 * Returns 0 if a sub-channel does not exist or is disabled, or if the
 * segment holds no radar channel with a non-zero dfs_cac_ms.
 */
static unsigned int cfg80211_get_chans_dfs_cac_time(struct wiphy *wiphy,
                                                    u32 center_freq,
                                                    u32 bandwidth)
{
        const u32 first_khz = cfg80211_get_start_freq(center_freq, bandwidth);
        const u32 last_khz = cfg80211_get_end_freq(center_freq, bandwidth);
        unsigned int longest_ms = 0;
        u32 khz;

        for (khz = first_khz; khz <= last_khz; khz += MHZ_TO_KHZ(20)) {
                struct ieee80211_channel *chan;

                chan = ieee80211_get_channel_khz(wiphy, khz);
                if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED))
                        return 0;

                if ((chan->flags & IEEE80211_CHAN_RADAR) &&
                    chan->dfs_cac_ms > longest_ms)
                        longest_ms = chan->dfs_cac_ms;
        }

        return longest_ms;
}

/*
 * CAC time for @chandef: the larger of the per-segment values returned by
 * cfg80211_get_chans_dfs_cac_time(). The second segment is only consulted
 * when center_freq2 is set. Returns 0 for an invalid chandef (with a
 * warning) or an unknown width.
 */
unsigned int
cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy,
                              const struct cfg80211_chan_def *chandef)
{
        unsigned int cac_ms, seg2_ms;
        int seg_mhz;

        if (WARN_ON(!cfg80211_chandef_valid(chandef)))
                return 0;

        seg_mhz = cfg80211_chandef_get_width(chandef);
        if (seg_mhz < 0)
                return 0;

        cac_ms = cfg80211_get_chans_dfs_cac_time(wiphy,
                                        MHZ_TO_KHZ(chandef->center_freq1),
                                        seg_mhz);

        if (chandef->center_freq2) {
                seg2_ms = cfg80211_get_chans_dfs_cac_time(wiphy,
                                        MHZ_TO_KHZ(chandef->center_freq2),
                                        seg_mhz);
                if (seg2_ms > cac_ms)
                        cac_ms = seg2_ms;
        }

        return cac_ms;
}
EXPORT_SYMBOL(cfg80211_chandef_dfs_cac_time);

/*
 * Check that every 20 MHz sub-channel of a segment exists on @wiphy and
 * carries none of @prohibited_flags.
 *
 * When @monitor is set, sub-channels flagged IEEE80211_CHAN_CAN_MONITOR
 * are exempt from the prohibited-flags test.
 */
static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy,
                                        u32 center_freq, u32 bandwidth,
                                        u32 prohibited_flags, bool monitor)
{
        const u32 first_khz = cfg80211_get_start_freq(center_freq, bandwidth);
        const u32 last_khz = cfg80211_get_end_freq(center_freq, bandwidth);
        u32 khz;

        for (khz = first_khz; khz <= last_khz; khz += MHZ_TO_KHZ(20)) {
                struct ieee80211_channel *chan;
                bool exempt;

                chan = ieee80211_get_channel_khz(wiphy, khz);
                if (!chan)
                        return false;

                exempt = monitor && (chan->flags & IEEE80211_CHAN_CAN_MONITOR);
                if (!exempt && (chan->flags & prohibited_flags))
                        return false;
        }

        return true;
}

/*
 * Check if the operating channels are valid and supported.
 *
 * An empty EDMG configuration (no channels and no bw_config) is always
 * usable; setting only one of the two is not. Otherwise the primary
 * channel must be part of @edmg_channels, every requested channel must be
 * supported by @edmg_cap and be an enabled channel on @wiphy, at most four
 * channels may be requested, and @edmg_bw_config must be covered by the
 * capability's bw_config.
 */
static bool cfg80211_edmg_usable(struct wiphy *wiphy, u8 edmg_channels,
                                 enum ieee80211_edmg_bw_config edmg_bw_config,
                                 int primary_channel,
                                 struct ieee80211_edmg *edmg_cap)
{
        const bool has_channels = edmg_channels != 0;
        const bool has_bw_config = edmg_bw_config != 0;
        int nr_requested = 0;
        int idx;

        if (!has_channels && !has_bw_config)
                return true;

        /* channels and bw_config must be given together */
        if (has_channels != has_bw_config)
                return false;

        if (!(edmg_channels & BIT(primary_channel - 1)))
                return false;

        /* 60GHz channels 1..6 */
        for (idx = 0; idx < 6; idx++) {
                struct ieee80211_channel *chan;
                int freq;

                if (!(edmg_channels & BIT(idx)))
                        continue;

                if (!(edmg_cap->channels & BIT(idx)))
                        return false;

                nr_requested++;

                freq = ieee80211_channel_to_frequency(idx + 1,
                                                      NL80211_BAND_60GHZ);
                chan = ieee80211_get_channel(wiphy, freq);
                if (!chan || chan->flags & IEEE80211_CHAN_DISABLED)
                        return false;
        }

        /* IEEE802.11 allows max 4 channels */
        if (nr_requested > 4)
                return false;

        /* check bw_config is a subset of what driver supports
         * (see IEEE P802.11ay/D4.0 section 9.4.2.251, Table 13)
         */
        return (edmg_bw_config % 4) <= (edmg_cap->bw_config % 4) &&
               edmg_bw_config <= edmg_cap->bw_config;
}

/*
 * Check whether @chandef can be used on @wiphy.
 *
 * The chandef is validated first, then the EDMG configuration is checked if
 * the band advertises EDMG channels, then the HT/VHT/EHT capabilities needed
 * for the requested width are checked, and finally
 * cfg80211_secondary_chans_ok() is called for the frequency range(s) covered
 * by the chandef.
 *
 * @prohibited_flags is extended locally with the per-width NO_<n>MHZ flag
 * (for 10, 20, 80, 160 and 320 MHz) and with IEEE80211_CHAN_NO_OFDM for every
 * width other than 20 MHz. @monitor is handed on unchanged to
 * cfg80211_secondary_chans_ok().
 *
 * Returns true if the channel definition is usable, false otherwise.
 */
bool _cfg80211_chandef_usable(struct wiphy *wiphy,
                              const struct cfg80211_chan_def *chandef,
                              u32 prohibited_flags, bool monitor)
{
        struct ieee80211_sta_ht_cap *ht_cap;
        struct ieee80211_sta_vht_cap *vht_cap;
        struct ieee80211_edmg *edmg_cap;
        u32 width, control_freq, cap;
        bool ext_nss_cap, support_80_80 = false, support_320 = false;
        const struct ieee80211_sband_iftype_data *iftd;
        struct ieee80211_supported_band *sband;
        int i;

        if (WARN_ON(!cfg80211_chandef_valid(chandef)))
                return false;

        /*
         * Capabilities of the band the control channel belongs to.
         * NOTE(review): wiphy->bands[...] is used without a NULL check here;
         * presumably chandef->chan always belongs to a band registered on
         * this wiphy - confirm against callers.
         */
        ht_cap = &wiphy->bands[chandef->chan->band]->ht_cap;
        vht_cap = &wiphy->bands[chandef->chan->band]->vht_cap;
        edmg_cap = &wiphy->bands[chandef->chan->band]->edmg_cap;
        ext_nss_cap = __le16_to_cpu(vht_cap->vht_mcs.tx_highest) &
                        IEEE80211_VHT_EXT_NSS_BW_CAPABLE;

        /* EDMG is only validated when the band advertises EDMG channels */
        if (edmg_cap->channels &&
            !cfg80211_edmg_usable(wiphy,
                                  chandef->edmg.channels,
                                  chandef->edmg.bw_config,
                                  chandef->chan->hw_value,
                                  edmg_cap))
                return false;

        control_freq = chandef->chan->center_freq;

        /*
         * Translate the width enum into a number (MHz), add the matching
         * prohibited flag and check the capabilities the width depends on.
         */
        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_1:
                width = 1;
                break;
        case NL80211_CHAN_WIDTH_2:
                width = 2;
                break;
        case NL80211_CHAN_WIDTH_4:
                width = 4;
                break;
        case NL80211_CHAN_WIDTH_8:
                width = 8;
                break;
        case NL80211_CHAN_WIDTH_16:
                width = 16;
                break;
        case NL80211_CHAN_WIDTH_5:
                width = 5;
                break;
        case NL80211_CHAN_WIDTH_10:
                prohibited_flags |= IEEE80211_CHAN_NO_10MHZ;
                width = 10;
                break;
        case NL80211_CHAN_WIDTH_20:
                /* 20 MHz (as opposed to 20_NOHT) needs HT, except on 6 GHz */
                if (!ht_cap->ht_supported &&
                    chandef->chan->band != NL80211_BAND_6GHZ)
                        return false;
                fallthrough;
        case NL80211_CHAN_WIDTH_20_NOHT:
                prohibited_flags |= IEEE80211_CHAN_NO_20MHZ;
                width = 20;
                break;
        case NL80211_CHAN_WIDTH_40:
                width = 40;
                /* the HT capability checks below are skipped on 6 GHz */
                if (chandef->chan->band == NL80211_BAND_6GHZ)
                        break;
                if (!ht_cap->ht_supported)
                        return false;
                if (!(ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) ||
                    ht_cap->cap & IEEE80211_HT_CAP_40MHZ_INTOLERANT)
                        return false;
                /* center below the control channel: honour NO_HT40MINUS */
                if (chandef->center_freq1 < control_freq &&
                    chandef->chan->flags & IEEE80211_CHAN_NO_HT40MINUS)
                        return false;
                /* center above the control channel: honour NO_HT40PLUS */
                if (chandef->center_freq1 > control_freq &&
                    chandef->chan->flags & IEEE80211_CHAN_NO_HT40PLUS)
                        return false;
                break;
        case NL80211_CHAN_WIDTH_80P80:
                /*
                 * Outside 6 GHz, 80+80 needs one of the VHT capability
                 * combinations below; afterwards the 80 MHz checks apply too.
                 */
                cap = vht_cap->cap;
                support_80_80 =
                        (cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) ||
                        (cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ &&
                         cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) ||
                        (ext_nss_cap &&
                         u32_get_bits(cap, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) > 1);
                if (chandef->chan->band != NL80211_BAND_6GHZ && !support_80_80)
                        return false;
                fallthrough;
        case NL80211_CHAN_WIDTH_80:
                prohibited_flags |= IEEE80211_CHAN_NO_80MHZ;
                width = 80;
                /* the VHT check below is skipped on 6 GHz */
                if (chandef->chan->band == NL80211_BAND_6GHZ)
                        break;
                if (!vht_cap->vht_supported)
                        return false;
                break;
        case NL80211_CHAN_WIDTH_160:
                prohibited_flags |= IEEE80211_CHAN_NO_160MHZ;
                width = 160;
                /* the VHT checks below are skipped on 6 GHz */
                if (chandef->chan->band == NL80211_BAND_6GHZ)
                        break;
                if (!vht_cap->vht_supported)
                        return false;
                /*
                 * Accept either supported-width value that includes 160 MHz,
                 * or extended NSS BW support with a non-zero EXT_NSS_BW field.
                 */
                cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK;
                if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ &&
                    cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ &&
                    !(ext_nss_cap &&
                      (vht_cap->cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)))
                        return false;
                break;
        case NL80211_CHAN_WIDTH_320:
                prohibited_flags |= IEEE80211_CHAN_NO_320MHZ;
                width = 320;

                /* 320 MHz is accepted on the 6 GHz band only */
                if (chandef->chan->band != NL80211_BAND_6GHZ)
                        return false;

                sband = wiphy->bands[NL80211_BAND_6GHZ];
                if (!sband)
                        return false;

                /*
                 * At least one interface type must have EHT capabilities
                 * with the 320MHZ_IN_6GHZ PHY capability bit set.
                 */
                for_each_sband_iftype_data(sband, i, iftd) {
                        if (!iftd->eht_cap.has_eht)
                                continue;

                        if (iftd->eht_cap.eht_cap_elem.phy_cap_info[0] &
                            IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) {
                                support_320 = true;
                                break;
                        }
                }

                if (!support_320)
                        return false;
                break;
        default:
                /* unknown width value */
                WARN_ON_ONCE(1);
                return false;
        }

        /*
         * TODO: What if there are only certain 80/160/80+80 MHz channels
         *         allowed by the driver, or only certain combinations?
         *         For 40 MHz the driver can set the NO_HT40 flags, but for
         *         80/160 MHz and in particular 80+80 MHz this isn't really
         *         feasible and we only have NO_80MHZ/NO_160MHZ so far but
         *         no way to cover 80+80 MHz or more complex restrictions.
         *         Note that such restrictions also need to be advertised to
         *         userspace, for example for P2P channel selection.
         */

        /* together with the check below: NO_OFDM unless width is exactly 20 */
        if (width > 20)
                prohibited_flags |= IEEE80211_CHAN_NO_OFDM;

        /* 5 and 10 MHz are only defined for the OFDM PHY */
        if (width < 20)
                prohibited_flags |= IEEE80211_CHAN_NO_OFDM;


        /* check the range described by the chandef itself */
        if (!cfg80211_secondary_chans_ok(wiphy,
                                         ieee80211_chandef_to_khz(chandef),
                                         width, prohibited_flags, monitor))
                return false;

        /* a non-zero center_freq2 gives a second range to check as well */
        if (!chandef->center_freq2)
                return true;
        return cfg80211_secondary_chans_ok(wiphy,
                                           MHZ_TO_KHZ(chandef->center_freq2),
                                           width, prohibited_flags, monitor);
}

/*
 * Exported entry point for the chandef usability check: the same check as
 * _cfg80211_chandef_usable() with its monitor argument fixed to false.
 *
 * Returns true if @chandef is usable on @wiphy given @prohibited_flags.
 */
bool cfg80211_chandef_usable(struct wiphy *wiphy,
                             const struct cfg80211_chan_def *chandef,
                             u32 prohibited_flags)
{
        const bool monitor = false;

        return _cfg80211_chandef_usable(wiphy, chandef, prohibited_flags,
                                        monitor);
}
EXPORT_SYMBOL(cfg80211_chandef_usable);

/*
 * Check whether one existing interface (@wdev) justifies operating a new
 * interface of type @iftype on @chan.
 *
 * For every valid link of @wdev a reference channel is picked (the current
 * BSS channel of a station link, or the operating channel of a beaconing
 * P2P GO link). Returns true if a reference channel is @chan itself, or if
 * both are in the same U-NII band on 5/6 GHz (with the channel 165 exception
 * described below); returns false if no link qualifies.
 */
static bool cfg80211_ir_permissive_check_wdev(enum nl80211_iftype iftype,
                                              struct wireless_dev *wdev,
                                              struct ieee80211_channel *chan)
{
        /*
         * Declared outside the loop and never reset, so a reference channel
         * found on an earlier link is still in effect for later links.
         */
        struct ieee80211_channel *ref_chan = NULL;
        unsigned int link_id;
        int unii_chan, unii_ref;

        for_each_valid_link(wdev, link_id) {
                /* connected station link: use the channel of its BSS */
                if (wdev->iftype == NL80211_IFTYPE_STATION &&
                    wdev->links[link_id].client.current_bss)
                        ref_chan = wdev->links[link_id].client.current_bss->pub.channel;

                /*
                 * A GO that is already beaconing on a GO_CONCURRENT channel
                 * lets another GO (possibly the very same one) beacon too,
                 * even when the station that originally justified it has
                 * disconnected in the meantime. That must not be used to end
                 * up outdoors on an indoor-only channel, though.
                 */
                if (iftype == NL80211_IFTYPE_P2P_GO &&
                    wdev->iftype == NL80211_IFTYPE_P2P_GO &&
                    wdev->links[link_id].ap.beacon_interval &&
                    !(chan->flags & IEEE80211_CHAN_INDOOR_ONLY))
                        ref_chan = wdev->links[link_id].ap.chandef.chan;

                if (!ref_chan)
                        continue;

                if (ref_chan == chan)
                        return true;

                /* U-NII band matching is only done for 5 GHz and 6 GHz */
                if (!(chan->band == NL80211_BAND_5GHZ ||
                      chan->band == NL80211_BAND_6GHZ))
                        continue;

                unii_chan = cfg80211_get_unii(chan->center_freq);
                unii_ref = cfg80211_get_unii(ref_chan->center_freq);

                if (unii_chan == -EINVAL || unii_chan != unii_ref)
                        continue;

                /*
                 * Channels 149-165 form one bundle in some places, while
                 * elsewhere (e.g. Indonesia) only 149-161 do and channel 165
                 * belongs to a different bundle. So a station connected on
                 * channel 165 is taken to allow GO operation on 149-161, but
                 * a station connected on 149-161 does not allow GO operation
                 * on channel 165.
                 */
                if (chan->center_freq == 5825 &&
                    ref_chan->center_freq != 5825)
                        continue;

                return true;
        }

        return false;
}

/*
 * Tell whether @chan may be used under the relaxed rules some regulatory
 * bodies allow: the channel carries IEEE80211_CHAN_IR_CONCURRENT and another
 * interface of this device is associated to an AP on the same channel or in
 * the same UNII band (the AP being assumed to be an authorized master).
 * Independently of that, a channel flagged indoor-only is accepted whenever
 * indoor operation is currently allowed.
 */
static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy,
                                        enum nl80211_iftype iftype,
                                        struct ieee80211_channel *chan)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct wireless_dev *wdev;

        lockdep_assert_held(&rdev->wiphy.mtx);

        /* the relaxation has to be both compiled in and enabled for the wiphy */
        if (!IS_ENABLED(CONFIG_CFG80211_REG_RELAX_NO_IR))
                return false;
        if (!(wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR))
                return false;

        /* only valid for GO and TDLS off-channel (station/p2p-CL) */
        switch (iftype) {
        case NL80211_IFTYPE_P2P_GO:
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_P2P_CLIENT:
                break;
        default:
                return false;
        }

        if (regulatory_indoor_allowed() &&
            (chan->flags & IEEE80211_CHAN_INDOOR_ONLY))
                return true;

        if (!(chan->flags & IEEE80211_CHAN_IR_CONCURRENT))
                return false;

        /*
         * In principle another device/driver could be relied upon for the IR
         * concurrent relaxation. The device may however enforce the
         * relaxation itself (with checks similar to these) and then fail the
         * GO instantiation, so only interfaces of the current registered
         * device are taken into account.
         */
        list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
                if (cfg80211_ir_permissive_check_wdev(iftype, wdev, chan))
                        return true;
        }

        return false;
}

/*
 * Common implementation of the "can beacon" checks.
 *
 * Builds the set of prohibited channel flags and runs
 * cfg80211_chandef_usable() with it. IEEE80211_CHAN_DISABLED is always
 * prohibited. Unless DFS is required and the chandef is DFS-available,
 * IEEE80211_CHAN_NO_IR is added when @check_no_ir is set and
 * IEEE80211_CHAN_RADAR is added when the DFS-required query returns
 * anything other than 0.
 *
 * The call and its result are traced.
 */
static bool _cfg80211_reg_can_beacon(struct wiphy *wiphy,
                                     struct cfg80211_chan_def *chandef,
                                     enum nl80211_iftype iftype,
                                     bool check_no_ir)
{
        u32 prohibited_flags = IEEE80211_CHAN_DISABLED;
        int dfs_required;
        bool usable;

        trace_cfg80211_reg_can_beacon(wiphy, chandef, iftype, check_no_ir);

        dfs_required = cfg80211_chandef_dfs_required(wiphy, chandef, iftype);

        if (dfs_required > 0 &&
            cfg80211_chandef_dfs_available(wiphy, chandef)) {
                /*
                 * DFS is required and the chandef is DFS-available: neither
                 * NO_IR nor RADAR is prohibited, only DISABLED remains.
                 */
        } else {
                if (check_no_ir)
                        prohibited_flags |= IEEE80211_CHAN_NO_IR;
                if (dfs_required != 0)
                        prohibited_flags |= IEEE80211_CHAN_RADAR;
        }

        usable = cfg80211_chandef_usable(wiphy, chandef, prohibited_flags);

        trace_cfg80211_return_bool(usable);
        return usable;
}

/*
 * Strict variant of the beaconing check: channels flagged
 * IEEE80211_CHAN_NO_IR are always taken into account (check_no_ir = true).
 */
bool cfg80211_reg_can_beacon(struct wiphy *wiphy,
                             struct cfg80211_chan_def *chandef,
                             enum nl80211_iftype iftype)
{
        const bool check_no_ir = true;

        return _cfg80211_reg_can_beacon(wiphy, chandef, iftype, check_no_ir);
}
EXPORT_SYMBOL(cfg80211_reg_can_beacon);

bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy,
                                   struct cfg80211_chan_def *chandef,
                                   enum nl80211_iftype iftype)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        bool check_no_ir;

        lockdep_assert_held(&rdev->wiphy.mtx);

        /*
         * Under certain conditions suggested by some regulatory bodies a
         * GO/STA can IR on channels marked with IEEE80211_NO_IR. Set this flag
         * only if such relaxations are not enabled and the conditions are not
         * met.
         */
        check_no_ir = !cfg80211_ir_permissive_chan(wiphy, iftype,
                                                   chandef->chan);

        return _cfg80211_reg_can_beacon(wiphy, chandef, iftype, check_no_ir);
}
EXPORT_SYMBOL(cfg80211_reg_can_beacon_relax);

/*
 * Set the monitor channel of @rdev to @chandef.
 *
 * Returns -EOPNOTSUPP if the driver has no set_monitor_channel operation,
 * -EBUSY if cfg80211_has_monitors_only() is false for the device, and
 * otherwise whatever rdev_set_monitor_channel() returns.
 */
int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev,
                                 struct cfg80211_chan_def *chandef)
{
        int err;

        if (!rdev->ops->set_monitor_channel)
                err = -EOPNOTSUPP;
        else if (!cfg80211_has_monitors_only(rdev))
                err = -EBUSY;
        else
                err = rdev_set_monitor_channel(rdev, chandef);

        return err;
}

/*
 * Report whether any band selected by @sband_mask has at least one channel
 * carrying none of @prohibited_flags. IEEE80211_CHAN_DISABLED is always
 * treated as prohibited. Bands the wiphy does not have are skipped.
 */
bool cfg80211_any_usable_channels(struct wiphy *wiphy,
                                  unsigned long sband_mask,
                                  u32 prohibited_flags)
{
        const u32 blocked = prohibited_flags | IEEE80211_CHAN_DISABLED;
        int band;

        for_each_set_bit(band, &sband_mask, NUM_NL80211_BANDS) {
                struct ieee80211_supported_band *sband = wiphy->bands[band];
                int n;

                if (!sband)
                        continue;

                /* the first channel without a blocked flag settles it */
                for (n = 0; n < sband->n_channels; n++) {
                        if (!(sband->channels[n].flags & blocked))
                                return true;
                }
        }

        return false;
}
EXPORT_SYMBOL(cfg80211_any_usable_channels);

/*
 * Return a pointer to the channel definition stored for @wdev, chosen by
 * interface type: mesh, IBSS and OCB keep it in the per-type state, AP and
 * P2P GO keep it per link (@link_id). Every other interface type yields
 * NULL.
 *
 * Warns if @link_id is not one of the valid links, or is non-zero while the
 * interface has no valid links at all.
 */
struct cfg80211_chan_def *wdev_chandef(struct wireless_dev *wdev,
                                       unsigned int link_id)
{
        struct cfg80211_chan_def *chandef = NULL;

        lockdep_assert_wiphy(wdev->wiphy);

        WARN_ON(wdev->valid_links && !(wdev->valid_links & BIT(link_id)));
        WARN_ON(!wdev->valid_links && link_id > 0);

        switch (wdev->iftype) {
        case NL80211_IFTYPE_MESH_POINT:
                chandef = &wdev->u.mesh.chandef;
                break;
        case NL80211_IFTYPE_ADHOC:
                chandef = &wdev->u.ibss.chandef;
                break;
        case NL80211_IFTYPE_OCB:
                chandef = &wdev->u.ocb.chandef;
                break;
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_P2P_GO:
                chandef = &wdev->links[link_id].ap.chandef;
                break;
        default:
                break;
        }

        return chandef;
}
EXPORT_SYMBOL(wdev_chandef);











































































































































































































































































































































































































































































































































    1 











































































































    1 


































    1 



























































































































































































































































































































































































































































































































































































































































































































































































    1 





    1 

















































































    1 







































































































    1 















    1 












    1 
























    1 




























































    1 


































































































































































































































































































































































    1 










    1 
































    1 











    1 











    1 

    1 



    1 





































    1 






















    1 




































    1 





























    1 















    1 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


    1 










    1 































































































































































































































































































































































    1 





































    1 




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001-2002 Intel Corp.
 * Copyright (c) 2002      Nokia Corp.
 *
 * This is part of the SCTP Linux Kernel Implementation.
 *
 * These are the state functions for the state machine.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson          <karl@athena.chicago.il.us>
 *    Mathew Kotowsky       <kotowsky@sctp.org>
 *    Sridhar Samudrala     <samudrala@us.ibm.com>
 *    Jon Grimm             <jgrimm@us.ibm.com>
 *    Hui Huang             <hui.huang@nokia.com>
 *    Dajiang Zhang             <dajiang.zhang@nokia.com>
 *    Daisy Chang            <daisyc@us.ibm.com>
 *    Ardelle Fan            <ardelle.fan@intel.com>
 *    Ryan Layer            <rmlayer@us.ibm.com>
 *    Kevin Gao                    <kevin.gao@intel.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/proto_memory.h>
#include <net/inet_ecn.h>
#include <linux/skbuff.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <net/sctp/structs.h>

#define CREATE_TRACE_POINTS
#include <trace/events/sctp.h>

/* Forward declarations of file-local (static) helpers and state functions.
 *
 * Most of the state functions declared here share one parameter list:
 * (net, ep, asoc, type, arg, commands).  Exceptions are noted inline.
 */
static struct sctp_packet *sctp_abort_pkt_new(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        struct sctp_chunk *chunk,
                                        const void *payload, size_t paylen);
static int sctp_eat_data(const struct sctp_association *asoc,
                         struct sctp_chunk *chunk,
                         struct sctp_cmd_seq *commands);
static struct sctp_packet *sctp_ootb_pkt_new(
                                        struct net *net,
                                        const struct sctp_association *asoc,
                                        const struct sctp_chunk *chunk);
static void sctp_send_stale_cookie_err(struct net *net,
                                       const struct sctp_endpoint *ep,
                                       const struct sctp_association *asoc,
                                       const struct sctp_chunk *chunk,
                                       struct sctp_cmd_seq *commands,
                                       struct sctp_chunk *err_chunk);
static enum sctp_disposition sctp_sf_do_5_2_6_stale(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands);
static enum sctp_disposition sctp_sf_shut_8_4_5(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands);
static enum sctp_disposition sctp_sf_tabort_8_4_8(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands);
static enum sctp_disposition sctp_sf_new_encap_port(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands);
static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk);

static enum sctp_disposition sctp_stop_t1_and_abort(
                                        struct net *net,
                                        struct sctp_cmd_seq *commands,
                                        __be16 error, int sk_err,
                                        const struct sctp_association *asoc,
                                        struct sctp_transport *transport);

/* Takes a raw payload/paylen pair and no 'type' argument. */
static enum sctp_disposition sctp_sf_abort_violation(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        void *arg,
                                        struct sctp_cmd_seq *commands,
                                        const __u8 *payload,
                                        const size_t paylen);

static enum sctp_disposition sctp_sf_violation_chunklen(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands);

/* Takes an extra 'ext' pointer between 'arg' and 'commands'. */
static enum sctp_disposition sctp_sf_violation_paramlen(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg, void *ext,
                                        struct sctp_cmd_seq *commands);

static enum sctp_disposition sctp_sf_violation_ctsn(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands);

static enum sctp_disposition sctp_sf_violation_chunk(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands);

static enum sctp_ierror sctp_sf_authenticate(
                                        const struct sctp_association *asoc,
                                        struct sctp_chunk *chunk);

static enum sctp_disposition __sctp_sf_do_9_1_abort(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands);

static enum sctp_disposition
__sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_endpoint *ep,
                           const struct sctp_association *asoc,
                           const union sctp_subtype type, void *arg,
                           struct sctp_cmd_seq *commands);

/* Sanity-check the length field of a received chunk.
 *
 * @chunk:           chunk whose header length field is examined
 * @required_length: minimum acceptable length in bytes; callers pass the
 *                   size of the specific chunk type they are testing for
 *
 * Return: true  - the chunk was not previously marked for discard and its
 *                 advertised length is at least @required_length
 *         false - otherwise
 */
static inline bool sctp_chunk_length_valid(struct sctp_chunk *chunk,
                                           __u16 required_length)
{
        /* The header carries the length in network byte order. */
        const __u16 advertised = ntohs(chunk->chunk_hdr->length);

        /* Both rejections are expected to be rare: either the chunk was
         * already flagged for discard, or it is shorter than required.
         */
        if (unlikely(chunk->pdiscard) ||
            unlikely(advertised < required_length))
                return false;

        return true;
}

/* Check the error causes carried by a chunk (used for ABORT chunks) for
 * format errors.
 *
 * Return: true when walking the causes ends exactly at the end of the
 *         chunk, false otherwise.
 */
static inline bool sctp_err_chunk_valid(struct sctp_chunk *chunk)
{
        struct sctp_errhdr *cause;

        /* Empty loop body: we only care where the walk stops. */
        sctp_walk_errors(cause, chunk->chunk_hdr)
                ;

        return (void *)chunk->chunk_end == (void *)cause;
}

/**********************************************************
 * These are the state functions for handling chunk events.
 **********************************************************/

/*
 * Process the final SHUTDOWN COMPLETE.
 *
 * Section: 4 (C) (diagram), 9.2
 * Upon reception of the SHUTDOWN COMPLETE chunk the endpoint will verify
 * that it is in SHUTDOWN-ACK-SENT state, if it is not the chunk should be
 * discarded. If the endpoint is in the SHUTDOWN-ACK-SENT state the endpoint
 * should stop the T2-shutdown timer and remove all knowledge of the
 * association (and thus the association enters the CLOSED state).
 *
 * Verification Tag: 8.5.1(C), sctpimpguide 2.41.
 * C) Rules for packet carrying SHUTDOWN COMPLETE:
 * ...
 * - The receiver of a SHUTDOWN COMPLETE shall accept the packet
 *   if the Verification Tag field of the packet matches its own tag and
 *   the T bit is not set
 *   OR
 *   it is set to its peer's tag and the T bit is set in the Chunk
 *   Flags.
 *   Otherwise, the receiver MUST silently discard the packet
 *   and take no further action.  An endpoint MUST ignore the
 *   SHUTDOWN COMPLETE if it is not in the SHUTDOWN-ACK-SENT state.
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_do_4_C(struct net *net,
                                     const struct sctp_endpoint *ep,
                                     const struct sctp_association *asoc,
                                     const union sctp_subtype type,
                                     void *arg, struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *shutdown_comp = arg;
        struct sctp_ulpevent *notify;

        /* Verification tag rules for SHUTDOWN COMPLETE (8.5.1 C): a
         * mismatch means the packet is dropped without further action.
         */
        if (!sctp_vtag_verify_either(shutdown_comp, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* RFC 2960 6.10 Bundling: SHUTDOWN COMPLETE must travel alone in
         * its packet, just like INIT and INIT ACK.
         */
        if (!shutdown_comp->singleton)
                return sctp_sf_violation_chunk(net, ep, asoc, type, arg,
                                               commands);

        /* The chunk has to be at least a bare chunk header long. */
        if (!sctp_chunk_length_valid(shutdown_comp,
                                     sizeof(struct sctp_chunkhdr)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        /* RFC 2960 10.2 H) SHUTDOWN COMPLETE notification: tell the upper
         * layer that the shutdown procedure has finished.  Failing to
         * allocate the event is not fatal; the teardown proceeds without
         * it.
         */
        notify = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_SHUTDOWN_COMP,
                                                 0, 0, 0, NULL, GFP_ATOMIC);
        if (notify)
                sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
                                SCTP_ULPEVENT(notify));

        /* Section 9.2: stop the T2-shutdown timer (and the T5 shutdown
         * guard), then drop all knowledge of the association by moving
         * it to CLOSED.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_CLOSED));

        /* One more completed shutdown, one fewer established association. */
        SCTP_INC_STATS(net, SCTP_MIB_SHUTDOWNS);
        SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);

        sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());

        return SCTP_DISPOSITION_DELETE_TCB;
}

/*
 * Respond to a normal INIT chunk.
 * We are the side that is being asked for an association.
 *
 * Section: 5.1 Normal Establishment of an Association, B
 * B) "Z" shall respond immediately with an INIT ACK chunk.  The
 *    destination IP address of the INIT ACK MUST be set to the source
 *    IP address of the INIT to which this INIT ACK is responding.  In
 *    the response, besides filling in other parameters, "Z" must set the
 *    Verification Tag field to Tag_A, and also provide its own
 *    Verification Tag (Tag_Z) in the Initiate Tag field.
 *
 * Verification Tag: Must be 0.
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net,
                                           const struct sctp_endpoint *ep,
                                           const struct sctp_association *asoc,
                                           const union sctp_subtype type,
                                           void *arg,
                                           struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg, *repl, *err_chunk;
        struct sctp_unrecognized_param *unk_param;
        struct sctp_association *new_asoc;
        struct sctp_packet *packet;
        int len;

        /* Possible results:
         *   SCTP_DISPOSITION_DELETE_TCB - INIT ACK queued, temporary
         *                                 association scheduled for deletion
         *   SCTP_DISPOSITION_CONSUME    - ABORT with error causes queued
         *   SCTP_DISPOSITION_NOMEM      - an allocation failed
         *   or whatever sctp_sf_pdiscard() / sctp_sf_tabort_8_4_8() return.
         *
         * Ownership: err_chunk, once set by sctp_verify_init(), belongs to
         * this function and must be freed on every exit path.
         */

        /* 6.10 Bundling
         * An endpoint MUST NOT bundle INIT, INIT ACK or
         * SHUTDOWN COMPLETE with any other chunks.
         *
         * IG Section 2.11.2
         * Furthermore, we require that the receiver of an INIT chunk MUST
         * enforce these rules by silently discarding an arriving packet
         * with an INIT chunk that is bundled with other chunks.
         */
        if (!chunk->singleton)
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* Make sure that the INIT chunk has a valid length.
         * Normally, this would cause an ABORT with a Protocol Violation
         * error, but since we don't have an association, we'll
         * just discard the packet.
         */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk)))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* If the packet is an OOTB packet which is temporarily on the
         * control endpoint, respond with an ABORT.
         */
        if (ep == sctp_sk(net->sctp.ctl_sock)->ep) {
                SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES);
                return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);
        }

        /* 3.1 A packet containing an INIT chunk MUST have a zero Verification
         * Tag.
         */
        if (chunk->sctp_hdr->vtag != 0)
                return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);

        /* If the INIT is coming toward a closing socket, we'll send back
         * an ABORT.  Essentially, this catches the race of INIT being
         * backlogged to the socket at the same time as the user issues
         * close().  Since the socket and all its associations are going
         * away, we can treat this as OOTB.
         */
        if (sctp_sstate(ep->base.sk, CLOSING))
                return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);

        /* Verify the INIT chunk before processing it. */
        err_chunk = NULL;
        if (!sctp_verify_init(net, ep, asoc, chunk->chunk_hdr->type,
                              (struct sctp_init_chunk *)chunk->chunk_hdr, chunk,
                              &err_chunk)) {
                /* This chunk contains a fatal error and is to be discarded.
                 * With no recorded causes, answer with a plain ABORT.
                 */
                if (!err_chunk)
                        return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg,
                                                    commands);

                /* Otherwise send an ABORT carrying the causes, i.e.
                 * everything that follows err_chunk's chunk header.
                 */
                packet = sctp_abort_pkt_new(net, ep, asoc, arg,
                                (__u8 *)(err_chunk->chunk_hdr) +
                                sizeof(struct sctp_chunkhdr),
                                ntohs(err_chunk->chunk_hdr->length) -
                                sizeof(struct sctp_chunkhdr));

                sctp_chunk_free(err_chunk);

                if (!packet)
                        return SCTP_DISPOSITION_NOMEM;

                sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
                                SCTP_PACKET(packet));
                SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
                return SCTP_DISPOSITION_CONSUME;
        }

        /* From here on err_chunk may still be non-NULL: verification can
         * succeed while recording non-fatal causes to be echoed back in
         * the INIT ACK.
         */

        /* Grab the INIT header.  */
        chunk->subh.init_hdr = (struct sctp_inithdr *)chunk->skb->data;

        /* Tag the variable length parameters.  */
        chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(struct sctp_inithdr));

        new_asoc = sctp_make_temp_asoc(ep, chunk, GFP_ATOMIC);
        if (!new_asoc)
                goto nomem;

        /* Update socket peer label if first association. */
        if (security_sctp_assoc_request(new_asoc, chunk->skb)) {
                sctp_association_free(new_asoc);
                /* Do not leak the causes collected by sctp_verify_init()
                 * when the security module rejects the request.
                 */
                if (err_chunk)
                        sctp_chunk_free(err_chunk);
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        }

        if (sctp_assoc_set_bind_addr_from_ep(new_asoc,
                                             sctp_scope(sctp_source(chunk)),
                                             GFP_ATOMIC) < 0)
                goto nomem_init;

        /* The call, sctp_process_init(), can fail on memory allocation.  */
        if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk),
                               (struct sctp_init_chunk *)chunk->chunk_hdr,
                               GFP_ATOMIC))
                goto nomem_init;

        /* B) "Z" shall respond immediately with an INIT ACK chunk.  */

        /* If there are errors that need to be reported for unknown
         * parameters, make sure to reserve enough room in the INIT ACK
         * for them.
         */
        len = 0;
        if (err_chunk)
                len = ntohs(err_chunk->chunk_hdr->length) -
                      sizeof(struct sctp_chunkhdr);

        repl = sctp_make_init_ack(new_asoc, chunk, GFP_ATOMIC, len);
        if (!repl)
                goto nomem_init;

        /* If there are errors that need to be reported for unknown
         * parameters, include them in the outgoing INIT ACK as
         * "Unrecognized parameter" parameters.
         */
        if (err_chunk) {
                /* Get the "Unrecognized parameter" parameter(s) out of the
                 * ERROR chunk generated by sctp_verify_init(). Since the
                 * error cause code for "unknown parameter" and the
                 * "Unrecognized parameter" type is the same, we can
                 * construct the parameters in INIT ACK by copying the
                 * ERROR causes over verbatim.
                 */
                unk_param = (struct sctp_unrecognized_param *)
                            ((__u8 *)(err_chunk->chunk_hdr) +
                            sizeof(struct sctp_chunkhdr));
                sctp_addto_chunk(repl, len, unk_param);
                sctp_chunk_free(err_chunk);
        }

        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc));

        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));

        /*
         * Note:  After sending out INIT ACK with the State Cookie parameter,
         * "Z" MUST NOT allocate any resources, nor keep any states for the
         * new association.  Otherwise, "Z" will be vulnerable to resource
         * attacks.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());

        return SCTP_DISPOSITION_DELETE_TCB;

nomem_init:
        sctp_association_free(new_asoc);
nomem:
        if (err_chunk)
                sctp_chunk_free(err_chunk);
        return SCTP_DISPOSITION_NOMEM;
}

/*
 * Respond to a normal INIT ACK chunk.
 * We are the side that is initiating the association.
 *
 * Section: 5.1 Normal Establishment of an Association, C
 * C) Upon reception of the INIT ACK from "Z", "A" shall stop the T1-init
 *    timer and leave COOKIE-WAIT state. "A" shall then send the State
 *    Cookie received in the INIT ACK chunk in a COOKIE ECHO chunk, start
 *    the T1-cookie timer, and enter the COOKIE-ECHOED state.
 *
 *    Note: The COOKIE ECHO chunk can be bundled with any pending outbound
 *    DATA chunks, but it MUST be the first chunk in the packet and
 *    until the COOKIE ACK is returned the sender MUST NOT send any
 *    other packets to the peer.
 *
 * Verification Tag: 3.3.3
 *   If the value of the Initiate Tag in a received INIT ACK chunk is
 *   found to be 0, the receiver MUST treat it as an error and close the
 *   association by transmitting an ABORT.
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_do_5_1C_ack(struct net *net,
                                          const struct sctp_endpoint *ep,
                                          const struct sctp_association *asoc,
                                          const union sctp_subtype type,
                                          void *arg,
                                          struct sctp_cmd_seq *commands)
{
        struct sctp_init_chunk *initchunk;
        struct sctp_chunk *chunk = arg;
        struct sctp_chunk *err_chunk;
        struct sctp_packet *packet;

        if (!sctp_vtag_verify(chunk, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* 6.10 Bundling
         * An endpoint MUST NOT bundle INIT, INIT ACK or
         * SHUTDOWN COMPLETE with any other chunks.
         */
        if (!chunk->singleton)
                return sctp_sf_violation_chunk(net, ep, asoc, type, arg, commands);

        /* Make sure that the INIT-ACK chunk has a valid length */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_initack_chunk)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);
        /* Grab the INIT header.  */
        chunk->subh.init_hdr = (struct sctp_inithdr *)chunk->skb->data;

        /* Verify the INIT chunk before processing it. */
        err_chunk = NULL;
        if (!sctp_verify_init(net, ep, asoc, chunk->chunk_hdr->type,
                              (struct sctp_init_chunk *)chunk->chunk_hdr, chunk,
                              &err_chunk)) {

                enum sctp_error error = SCTP_ERROR_NO_RESOURCE;

                /* This chunk contains fatal error. It is to be discarded.
                 * Send an ABORT, with causes.  If there are no causes,
                 * then there wasn't enough memory.  Just terminate
                 * the association.
                 */
                if (err_chunk) {
                        packet = sctp_abort_pkt_new(net, ep, asoc, arg,
                                        (__u8 *)(err_chunk->chunk_hdr) +
                                        sizeof(struct sctp_chunkhdr),
                                        ntohs(err_chunk->chunk_hdr->length) -
                                        sizeof(struct sctp_chunkhdr));

                        sctp_chunk_free(err_chunk);

                        if (packet) {
                                sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
                                                SCTP_PACKET(packet));
                                SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
                                error = SCTP_ERROR_INV_PARAM;
                        }
                }

                /* SCTP-AUTH, Section 6.3:
                 *    It should be noted that if the receiver wants to tear
                 *    down an association in an authenticated way only, the
                 *    handling of malformed packets should not result in
                 *    tearing down the association.
                 *
                 * This means that if we only want to abort associations
                 * in an authenticated way (i.e AUTH+ABORT), then we
                 * can't destroy this association just because the packet
                 * was malformed.
                 */
                if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc))
                        return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

                SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
                return sctp_stop_t1_and_abort(net, commands, error, ECONNREFUSED,
                                                asoc, chunk->transport);
        }

        /* Tag the variable length parameters.  Note that we never
         * convert the parameters in an INIT chunk.
         */
        chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(struct sctp_inithdr));

        initchunk = (struct sctp_init_chunk *)chunk->chunk_hdr;

        sctp_add_cmd_sf(commands, SCTP_CMD_PEER_INIT,
                        SCTP_PEER_INIT(initchunk));

        /* Reset init error count upon receipt of INIT-ACK.  */
        sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());

        /* 5.1 C) "A" shall stop the T1-init timer and leave
         * COOKIE-WAIT state.  "A" shall then ... start the T1-cookie
         * timer, and enter the COOKIE-ECHOED state.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_COOKIE_ECHOED));

        /* SCTP-AUTH: generate the association shared keys so that
         * we can potentially sign the COOKIE-ECHO.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_SHKEY, SCTP_NULL());

        /* 5.1 C) "A" shall then send the State Cookie received in the
         * INIT ACK chunk in a COOKIE ECHO chunk, ...
         */
        /* If there is any errors to report, send the ERROR chunk generated
         * for unknown parameters as well.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_GEN_COOKIE_ECHO,
                        SCTP_CHUNK(err_chunk));

        return SCTP_DISPOSITION_CONSUME;
}

static bool sctp_auth_chunk_verify(struct net *net, struct sctp_chunk *chunk,
                                   const struct sctp_association *asoc)
{
        struct sctp_chunk auth;

        if (!chunk->auth_chunk)
                return true;

        /* SCTP-AUTH:  auth_chunk pointer is only set when the cookie-echo
         * is supposed to be authenticated and we have to do delayed
         * authentication.  We've just recreated the association using
         * the information in the cookie and now it's much easier to
         * do the authentication.
         */

        /* Make sure that we and the peer are AUTH capable */
        if (!net->sctp.auth_enable || !asoc->peer.auth_capable)
                return false;

        /* set-up our fake chunk so that we can process it */
        auth.skb = chunk->auth_chunk;
        auth.asoc = chunk->asoc;
        auth.sctp_hdr = chunk->sctp_hdr;
        auth.chunk_hdr = (struct sctp_chunkhdr *)
                                skb_push(chunk->auth_chunk,
                                         sizeof(struct sctp_chunkhdr));
        skb_pull(chunk->auth_chunk, sizeof(struct sctp_chunkhdr));
        auth.transport = chunk->transport;

        return sctp_sf_authenticate(asoc, &auth) == SCTP_IERROR_NO_ERROR;
}

/*
 * Respond to a normal COOKIE ECHO chunk.
 * We are the side that is being asked for an association.
 *
 * Section: 5.1 Normal Establishment of an Association, D
 * D) Upon reception of the COOKIE ECHO chunk, Endpoint "Z" will reply
 *    with a COOKIE ACK chunk after building a TCB and moving to
 *    the ESTABLISHED state. A COOKIE ACK chunk may be bundled with
 *    any pending DATA chunks (and/or SACK chunks), but the COOKIE ACK
 *    chunk MUST be the first chunk in the packet.
 *
 *   IMPLEMENTATION NOTE: An implementation may choose to send the
 *   Communication Up notification to the SCTP user upon reception
 *   of a valid COOKIE ECHO chunk.
 *
 * Verification Tag: 8.5.1 Exceptions in Verification Tag Rules
 * D) Rules for packet carrying a COOKIE ECHO
 *
 * - When sending a COOKIE ECHO, the endpoint MUST use the value of the
 *   Initial Tag received in the INIT ACK.
 *
 * - The receiver of a COOKIE ECHO follows the procedures in Section 5.
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net,
                                         const struct sctp_endpoint *ep,
                                         const struct sctp_association *asoc,
                                         const union sctp_subtype type,
                                         void *arg,
                                         struct sctp_cmd_seq *commands)
{
        struct sctp_ulpevent *ev, *ai_ev = NULL, *auth_ev = NULL;
        struct sctp_association *new_asoc;
        struct sctp_init_chunk *peer_init;
        struct sctp_chunk *chunk = arg;
        struct sctp_chunk *err_chk_p;
        struct sctp_chunk *repl;
        struct sock *sk;
        int error = 0;

        /* Possible results:
         *   SCTP_DISPOSITION_CONSUME - association built, COOKIE ACK and
         *                              notifications queued
         *   SCTP_DISPOSITION_NOMEM   - an allocation failed; everything
         *                              created here has been released
         *   or whatever sctp_sf_pdiscard() / sctp_sf_tabort_8_4_8() /
         *   sctp_sf_violation_chunklen() return.
         */

        if (asoc && !sctp_vtag_verify(chunk, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* If the packet is an OOTB packet which is temporarily on the
         * control endpoint, respond with an ABORT.
         */
        if (ep == sctp_sk(net->sctp.ctl_sock)->ep) {
                SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES);
                return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);
        }

        /* Make sure that the COOKIE_ECHO chunk has a valid length.
         * In this case, we check that we have enough for at least a
         * chunk header.  More detailed verification is done
         * in sctp_unpack_cookie().
         */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        /* If the endpoint is not listening or if the number of associations
         * on the TCP-style socket exceeds the max backlog, respond with an
         * ABORT.
         */
        sk = ep->base.sk;
        if (!sctp_sstate(sk, LISTENING) ||
            (sctp_style(sk, TCP) && sk_acceptq_is_full(sk)))
                return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);

        /* "Decode" the chunk.  We have no optional parameters so we
         * are in good shape.
         */
        chunk->subh.cookie_hdr =
                (struct sctp_signed_cookie *)chunk->skb->data;
        if (!pskb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) -
                                         sizeof(struct sctp_chunkhdr)))
                goto nomem;

        /* 5.1 D) Upon reception of the COOKIE ECHO chunk, Endpoint
         * "Z" will reply with a COOKIE ACK chunk after building a TCB
         * and moving to the ESTABLISHED state.
         */
        new_asoc = sctp_unpack_cookie(ep, asoc, chunk, GFP_ATOMIC, &error,
                                      &err_chk_p);

        /* FIXME:
         * If the re-build failed, what is the proper error path
         * from here?
         *
         * [We should abort the association. --piggy]
         */
        if (!new_asoc) {
                /* FIXME: Several errors are possible.  A bad cookie should
                 * be silently discarded, but think about logging it too.
                 */
                switch (error) {
                case -SCTP_IERROR_NOMEM:
                        goto nomem;

                case -SCTP_IERROR_STALE_COOKIE:
                        sctp_send_stale_cookie_err(net, ep, asoc, chunk, commands,
                                                   err_chk_p);
                        return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

                case -SCTP_IERROR_BAD_SIG:
                default:
                        return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
                }
        }

        if (security_sctp_assoc_request(new_asoc, chunk->head_skb ?: chunk->skb)) {
                sctp_association_free(new_asoc);
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        }

        /* Delay state machine commands until later.
         *
         * Re-building the bind address for the association is already
         * done in sctp_unpack_cookie().
         */
        /* This is a brand-new association, so these are not yet side
         * effects--it is safe to run them here.
         */
        peer_init = (struct sctp_init_chunk *)(chunk->subh.cookie_hdr + 1);
        if (!sctp_process_init(new_asoc, chunk,
                               &chunk->subh.cookie_hdr->c.peer_addr,
                               peer_init, GFP_ATOMIC))
                goto nomem_init;

        /* SCTP-AUTH:  Now that we've populated the required fields in
         * sctp_process_init, set up the association shared keys as
         * necessary so that we can potentially authenticate the ACK.
         */
        error = sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC);
        if (error)
                goto nomem_init;

        if (!sctp_auth_chunk_verify(net, chunk, new_asoc)) {
                sctp_association_free(new_asoc);
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        }

        repl = sctp_make_cookie_ack(new_asoc, chunk);
        if (!repl)
                goto nomem_init;

        /* RFC 2960 5.1 Normal Establishment of an Association
         *
         * D) IMPLEMENTATION NOTE: An implementation may choose to
         * send the Communication Up notification to the SCTP user
         * upon reception of a valid COOKIE ECHO chunk.
         */
        ev = sctp_ulpevent_make_assoc_change(new_asoc, 0, SCTP_COMM_UP, 0,
                                             new_asoc->c.sinit_num_ostreams,
                                             new_asoc->c.sinit_max_instreams,
                                             NULL, GFP_ATOMIC);
        if (!ev)
                goto nomem_ev;

        /* Sockets API Draft Section 5.3.1.6
         * When a peer sends an Adaptation Layer Indication parameter, SCTP
         * delivers this notification to inform the application of the
         * peer's requested adaptation layer.
         */
        if (new_asoc->peer.adaptation_ind) {
                ai_ev = sctp_ulpevent_make_adaptation_indication(new_asoc,
                                                            GFP_ATOMIC);
                if (!ai_ev)
                        goto nomem_aiev;
        }

        /* Tell the application when the peer cannot do AUTH. */
        if (!new_asoc->peer.auth_capable) {
                auth_ev = sctp_ulpevent_make_authkey(new_asoc, 0,
                                                     SCTP_AUTH_NO_AUTH,
                                                     GFP_ATOMIC);
                if (!auth_ev)
                        goto nomem_authev;
        }

        /* Add all the state machine commands now since we've created
         * everything.  This way we don't introduce memory corruptions
         * during side-effect processing and correctly count established
         * associations.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc));
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_ESTABLISHED));
        SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB);
        SCTP_INC_STATS(net, SCTP_MIB_PASSIVEESTABS);
        sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL());

        if (new_asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE])
                sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
                                SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));

        /* This will send the COOKIE ACK */
        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));

        /* Queue the ASSOC_CHANGE event */
        sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));

        /* Send up the Adaptation Layer Indication event */
        if (ai_ev)
                sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
                                SCTP_ULPEVENT(ai_ev));

        if (auth_ev)
                sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
                                SCTP_ULPEVENT(auth_ev));

        return SCTP_DISPOSITION_CONSUME;

        /* Error unwinding: each label releases what was created before the
         * failing step, in reverse order of creation.
         */
nomem_authev:
        /* ai_ev is still NULL when the peer sent no adaptation indication,
         * so only free it if it was actually allocated.
         */
        if (ai_ev)
                sctp_ulpevent_free(ai_ev);
nomem_aiev:
        sctp_ulpevent_free(ev);
nomem_ev:
        sctp_chunk_free(repl);
nomem_init:
        sctp_association_free(new_asoc);
nomem:
        return SCTP_DISPOSITION_NOMEM;
}

/*
 * Respond to a normal COOKIE ACK chunk.
 * We are the side that is asking for an association.
 *
 * RFC 2960 5.1 Normal Establishment of an Association
 *
 * E) Upon reception of the COOKIE ACK, endpoint "A" will move from the
 *    COOKIE-ECHOED state to the ESTABLISHED state, stopping the T1-cookie
 *    timer. It may also notify its ULP about the successful
 *    establishment of the association with a Communication Up
 *    notification (see Section 10).
 *
 * Verification Tag:
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_do_5_1E_ca(struct net *net,
                                         const struct sctp_endpoint *ep,
                                         const struct sctp_association *asoc,
                                         const union sctp_subtype type,
                                         void *arg,
                                         struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg;
        struct sctp_ulpevent *ev;

        if (!sctp_vtag_verify(chunk, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* Set peer label for connection. */
        if (security_sctp_assoc_established((struct sctp_association *)asoc,
                                            chunk->head_skb ?: chunk->skb))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* Verify that the chunk length for the COOKIE-ACK is OK.
         * If we don't do this, any bundled chunks may be junked.
         */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        /* Reset init error count upon receipt of COOKIE-ACK,
         * to avoid problems with the management of this
         * counter in stale cookie situations when a transition back
         * from the COOKIE-ECHOED state to the COOKIE-WAIT
         * state is performed.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());

        /* RFC 2960 5.1 Normal Establishment of an Association
         *
         * E) Upon reception of the COOKIE ACK, endpoint "A" will move
         * from the COOKIE-ECHOED state to the ESTABLISHED state,
         * stopping the T1-cookie timer.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_ESTABLISHED));
        SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB);
        SCTP_INC_STATS(net, SCTP_MIB_ACTIVEESTABS);
        sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL());
        if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE])
                sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
                                SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));

        /* It may also notify its ULP about the successful
         * establishment of the association with a Communication Up
         * notification (see Section 10).
         */
        ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_UP,
                                             0, asoc->c.sinit_num_ostreams,
                                             asoc->c.sinit_max_instreams,
                                             NULL, GFP_ATOMIC);

        if (!ev)
                goto nomem;

        sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));

        /* Sockets API Draft Section 5.3.1.6
         * When a peer sends a Adaptation Layer Indication parameter , SCTP
         * delivers this notification to inform the application that of the
         * peers requested adaptation layer.
         */
        if (asoc->peer.adaptation_ind) {
                ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC);
                if (!ev)
                        goto nomem;

                sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
                                SCTP_ULPEVENT(ev));
        }

        if (!asoc->peer.auth_capable) {
                ev = sctp_ulpevent_make_authkey(asoc, 0, SCTP_AUTH_NO_AUTH,
                                                GFP_ATOMIC);
                if (!ev)
                        goto nomem;
                sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
                                SCTP_ULPEVENT(ev));
        }

        return SCTP_DISPOSITION_CONSUME;
nomem:
        return SCTP_DISPOSITION_NOMEM;
}

/* Generate and send out a heartbeat packet.
 *
 * @arg is the transport the HEARTBEAT is built for.  Returns
 * SCTP_DISPOSITION_NOMEM when the chunk cannot be allocated and
 * SCTP_DISPOSITION_CONSUME once the commands have been queued.
 */
static enum sctp_disposition sctp_sf_heartbeat(
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_transport *dest = arg;
        struct sctp_chunk *hb;

        hb = sctp_make_heartbeat(asoc, dest, 0);
        if (hb) {
                /* rto_pending marks that an RTT measurement is started
                 * with this heartbeat chunk.
                 */
                sctp_add_cmd_sf(commands, SCTP_CMD_RTO_PENDING,
                                SCTP_TRANSPORT(dest));

                /* Hand the HEARTBEAT over for transmission. */
                sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(hb));
                return SCTP_DISPOSITION_CONSUME;
        }

        return SCTP_DISPOSITION_NOMEM;
}

/* Generate a HEARTBEAT packet on the given transport.  */
enum sctp_disposition sctp_sf_sendbeat_8_3(struct net *net,
                                           const struct sctp_endpoint *ep,
                                           const struct sctp_association *asoc,
                                           const union sctp_subtype type,
                                           void *arg,
                                           struct sctp_cmd_seq *commands)
{
        struct sctp_transport *transport = (struct sctp_transport *) arg;

        if (asoc->overall_error_count >= asoc->max_retrans) {
                sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                                SCTP_ERROR(ETIMEDOUT));
                /* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */
                sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
                                SCTP_PERR(SCTP_ERROR_NO_ERROR));
                SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
                SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
                return SCTP_DISPOSITION_DELETE_TCB;
        }

        /* Section 3.3.5.
         * The Sender-specific Heartbeat Info field should normally include
         * information about the sender's current time when this HEARTBEAT
         * chunk is sent and the destination transport address to which this
         * HEARTBEAT is sent (see Section 8.3).
         */

        if (transport->param_flags & SPP_HB_ENABLE) {
                if (SCTP_DISPOSITION_NOMEM ==
                                sctp_sf_heartbeat(ep, asoc, type, arg,
                                                  commands))
                        return SCTP_DISPOSITION_NOMEM;

                /* Set transport error counter and association error counter
                 * when sending heartbeat.
                 */
                sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT,
                                SCTP_TRANSPORT(transport));
        }
        sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_IDLE,
                        SCTP_TRANSPORT(transport));
        sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMER_UPDATE,
                        SCTP_TRANSPORT(transport));

        return SCTP_DISPOSITION_CONSUME;
}

/* Resend the association's pending stream-reset chunk (strreset_chunk).
 *
 * @arg is the transport that gets struck for this retransmission.  When
 * the overall error count has reached max_retrans the association is
 * failed with ETIMEDOUT and SCTP_DISPOSITION_DELETE_TCB is returned.
 */
enum sctp_disposition sctp_sf_send_reconf(struct net *net,
                                          const struct sctp_endpoint *ep,
                                          const struct sctp_association *asoc,
                                          const union sctp_subtype type,
                                          void *arg,
                                          struct sctp_cmd_seq *commands)
{
        struct sctp_transport *path = arg;
        struct sctp_chunk *reconf;

        if (asoc->overall_error_count >= asoc->max_retrans) {
                sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                                SCTP_ERROR(ETIMEDOUT));
                /* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */
                sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
                                SCTP_PERR(SCTP_ERROR_NO_ERROR));
                SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
                SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
                return SCTP_DISPOSITION_DELETE_TCB;
        }

        /* A reference is taken on the chunk before it is queued again. */
        reconf = asoc->strreset_chunk;
        sctp_chunk_hold(reconf);
        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reconf));
        sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE, SCTP_TRANSPORT(path));

        return SCTP_DISPOSITION_CONSUME;
}

/* Send a HEARTBEAT chunk with padding for PLPMTUD.
 *
 * @arg is the transport to probe.  Nothing is done (and
 * SCTP_DISPOSITION_CONSUME is returned) when sctp_transport_pl_enabled()
 * reports false for it.  Otherwise sctp_transport_pl_send() is called
 * first, then a HEARTBEAT of the transport's current pl.probe_size is
 * queued and the probe timer is updated.
 */
enum sctp_disposition sctp_sf_send_probe(struct net *net,
                                         const struct sctp_endpoint *ep,
                                         const struct sctp_association *asoc,
                                         const union sctp_subtype type,
                                         void *arg,
                                         struct sctp_cmd_seq *commands)
{
        struct sctp_transport *path = arg;
        struct sctp_chunk *probe;

        if (!sctp_transport_pl_enabled(path))
                return SCTP_DISPOSITION_CONSUME;

        /* Must run before the chunk is built: probe_size is read after it. */
        sctp_transport_pl_send(path);

        probe = sctp_make_heartbeat(asoc, path, path->pl.probe_size);
        if (probe) {
                sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(probe));
                sctp_add_cmd_sf(commands, SCTP_CMD_PROBE_TIMER_UPDATE,
                                SCTP_TRANSPORT(path));
                return SCTP_DISPOSITION_CONSUME;
        }

        return SCTP_DISPOSITION_NOMEM;
}

/*
 * Process a heartbeat request.
 *
 * Section: 8.3 Path Heartbeat
 * The receiver of the HEARTBEAT should immediately respond with a
 * HEARTBEAT ACK that contains the Heartbeat Information field copied
 * from the received HEARTBEAT chunk.
 *
 * Verification Tag:  8.5 Verification Tag [Normal verification]
 * When receiving an SCTP packet, the endpoint MUST ensure that the
 * value in the Verification Tag field of the received SCTP packet
 * matches its own Tag. If the received Verification Tag value does not
 * match the receiver's own tag value, the receiver shall silently
 * discard the packet and shall not process it any further except for
 * those cases listed in Section 8.5.1 below.
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk:
 * SCTP_DISPOSITION_CONSUME when the HEARTBEAT ACK was queued,
 * SCTP_DISPOSITION_NOMEM when the payload cannot be pulled or the reply
 * cannot be allocated, or the result of the discard/violation handler.
 */
enum sctp_disposition sctp_sf_beat_8_3(struct net *net,
                                       const struct sctp_endpoint *ep,
                                       const struct sctp_association *asoc,
                                       const union sctp_subtype type,
                                       void *arg, struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *hb = arg;
        struct sctp_paramhdr *hb_info;
        struct sctp_chunk *hb_ack;
        size_t info_len;

        if (!sctp_vtag_verify(hb, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* The HEARTBEAT chunk must be long enough to be valid. */
        if (!sctp_chunk_length_valid(hb, sizeof(struct sctp_heartbeat_chunk)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        /* 8.3 The receiver of the HEARTBEAT should immediately
         * respond with a HEARTBEAT ACK that contains the Heartbeat
         * Information field copied from the received HEARTBEAT chunk.
         */
        hb->subh.hb_hdr = (struct sctp_heartbeathdr *)hb->skb->data;
        hb_info = (struct sctp_paramhdr *)hb->subh.hb_hdr;

        /* Everything after the chunk header is the heartbeat info. */
        info_len = ntohs(hb->chunk_hdr->length) - sizeof(struct sctp_chunkhdr);

        /* The parameter must not claim more bytes than the chunk holds. */
        if (ntohs(hb_info->length) > info_len)
                return sctp_sf_violation_paramlen(net, ep, asoc, type, arg,
                                                  hb_info, commands);

        if (!pskb_pull(hb->skb, info_len))
                return SCTP_DISPOSITION_NOMEM;

        hb_ack = sctp_make_heartbeat_ack(asoc, hb, hb_info, info_len);
        if (!hb_ack)
                return SCTP_DISPOSITION_NOMEM;

        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(hb_ack));
        return SCTP_DISPOSITION_CONSUME;
}

/*
 * Process the returning HEARTBEAT ACK.
 *
 * Section: 8.3 Path Heartbeat
 * Upon the receipt of the HEARTBEAT ACK, the sender of the HEARTBEAT
 * should clear the error counter of the destination transport
 * address to which the HEARTBEAT was sent, and mark the destination
 * transport address as active if it is not so marked. The endpoint may
 * optionally report to the upper layer when an inactive destination
 * address is marked as active due to the reception of the latest
 * HEARTBEAT ACK. The receiver of the HEARTBEAT ACK must also
 * clear the association overall error count as well (as defined
 * in section 8.1).
 *
 * The receiver of the HEARTBEAT ACK should also perform an RTT
 * measurement for that destination transport address using the time
 * value carried in the HEARTBEAT ACK chunk.
 *
 * Verification Tag:  8.5 Verification Tag [Normal verification]
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_backbeat_8_3(struct net *net,
                                           const struct sctp_endpoint *ep,
                                           const struct sctp_association *asoc,
                                           const union sctp_subtype type,
                                           void *arg,
                                           struct sctp_cmd_seq *commands)
{
        struct sctp_sender_hb_info *hbinfo;
        struct sctp_chunk *chunk = arg;
        struct sctp_transport *link;
        unsigned long max_interval;
        union sctp_addr from_addr;

        if (!sctp_vtag_verify(chunk, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* Make sure that the HEARTBEAT-ACK chunk has a valid length.  */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr) +
                                            sizeof(*hbinfo)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        hbinfo = (struct sctp_sender_hb_info *)chunk->skb->data;
        /* Make sure that the length of the parameter is what we expect */
        if (ntohs(hbinfo->param_hdr.length) != sizeof(*hbinfo))
                return SCTP_DISPOSITION_DISCARD;

        from_addr = hbinfo->daddr;
        link = sctp_assoc_lookup_paddr(asoc, &from_addr);

        /* This should never happen, but lets log it if so.  */
        if (unlikely(!link)) {
                if (from_addr.sa.sa_family == AF_INET6) {
                        net_warn_ratelimited("%s association %p could not find address %pI6\n",
                                             __func__,
                                             asoc,
                                             &from_addr.v6.sin6_addr);
                } else {
                        net_warn_ratelimited("%s association %p could not find address %pI4\n",
                                             __func__,
                                             asoc,
                                             &from_addr.v4.sin_addr.s_addr);
                }
                return SCTP_DISPOSITION_DISCARD;
        }

        /* Validate the 64-bit random nonce. */
        if (hbinfo->hb_nonce != link->hb_nonce)
                return SCTP_DISPOSITION_DISCARD;

        if (hbinfo->probe_size) {
                if (hbinfo->probe_size != link->pl.probe_size ||
                    !sctp_transport_pl_enabled(link))
                        return SCTP_DISPOSITION_DISCARD;

                if (sctp_transport_pl_recv(link))
                        return SCTP_DISPOSITION_CONSUME;

                return sctp_sf_send_probe(net, ep, asoc, type, link, commands);
        }

        max_interval = link->hbinterval + link->rto;

        /* Check if the timestamp looks valid.  */
        if (time_after(hbinfo->sent_at, jiffies) ||
            time_after(jiffies, hbinfo->sent_at + max_interval)) {
                pr_debug("%s: HEARTBEAT ACK with invalid timestamp received "
                         "for transport:%p\n", __func__, link);

                return SCTP_DISPOSITION_DISCARD;
        }

        /* 8.3 Upon the receipt of the HEARTBEAT ACK, the sender of
         * the HEARTBEAT should clear the error counter of the
         * destination transport address to which the HEARTBEAT was
         * sent and mark the destination transport address as active if
         * it is not so marked.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_ON, SCTP_TRANSPORT(link));

        return SCTP_DISPOSITION_CONSUME;
}

/* Helper function to send out an ABORT for the restart condition.
 *
 * Builds a SCTP_ERROR_RESTART cause carrying @ssa as an address
 * parameter, wraps it in an ABORT packet built against @init and queues
 * that packet, followed by a command that discards the rest of the
 * inbound packet.
 *
 * Always returns 0, including when the ABORT packet cannot be allocated.
 */
static int sctp_sf_send_restart_abort(struct net *net, union sctp_addr *ssa,
                                      struct sctp_chunk *init,
                                      struct sctp_cmd_seq *commands)
{
        struct sctp_af *af = sctp_get_af_specific(ssa->v4.sin_family);
        char cause[sizeof(struct sctp_errhdr) + sizeof(union sctp_addr_param)];
        struct sctp_errhdr *errhdr = (struct sctp_errhdr *)cause;
        union sctp_addr_param *addrparm;
        struct sctp_packet *abort_pkt;
        struct sctp_endpoint *ctl_ep;
        int cause_len;

        /* The error cause lives on the stack: the header comes first and
         * the address parameter sits directly behind it.
         */
        addrparm = (union sctp_addr_param *)(errhdr + 1);

        /* Convert the address into parameter format. */
        cause_len = af->to_addr_param(ssa, addrparm);
        cause_len += sizeof(*errhdr);

        errhdr->cause = SCTP_ERROR_RESTART;
        errhdr->length = htons(cause_len);

        /* The ABORT is sent through the control socket's endpoint. */
        ctl_ep = sctp_sk(net->sctp.ctl_sock)->ep;

        /* Association is NULL since this may be a restart attack and we
         * want to send back the attacker's vtag.
         */
        abort_pkt = sctp_abort_pkt_new(net, ctl_ep, NULL, init, errhdr,
                                       cause_len);
        if (abort_pkt) {
                sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
                                SCTP_PACKET(abort_pkt));

                SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);

                /* Discard the rest of the inbound packet. */
                sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET,
                                SCTP_NULL());
        }

        /* Even if there is no memory, treat as a failure so
         * the packet will get dropped.
         */
        return 0;
}

static bool list_has_sctp_addr(const struct list_head *list,
                               union sctp_addr *ipaddr)
{
        struct sctp_transport *addr;

        list_for_each_entry(addr, list, transports) {
                if (sctp_cmp_addr_exact(ipaddr, &addr->ipaddr))
                        return true;
        }

        return false;
}
/* A restart is occurring, check to make sure no new addresses
 * are being added as we may be under a takeover attack.
 *
 * Returns 1 when every peer address of @new_asoc is already a peer
 * address of @asoc.  Returns 0 at the first address that is not, after
 * sctp_sf_send_restart_abort() has been called for it.
 */
static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc,
                                       const struct sctp_association *asoc,
                                       struct sctp_chunk *init,
                                       struct sctp_cmd_seq *commands)
{
        struct net *net = new_asoc->base.net;
        struct sctp_transport *candidate;

        /* Implementor's Guide - Section 5.2.2
         * ...
         * Before responding the endpoint MUST check to see if the
         * unexpected INIT adds new addresses to the association. If new
         * addresses are added to the association, the endpoint MUST respond
         * with an ABORT..
         */

        /* Each address offered by the new association has to be known
         * to the existing one already.
         */
        list_for_each_entry(candidate, &new_asoc->peer.transport_addr_list,
                            transports) {
                if (list_has_sctp_addr(&asoc->peer.transport_addr_list,
                                       &candidate->ipaddr))
                        continue;

                /* Unknown address: answer with an ABORT and stop looking. */
                sctp_sf_send_restart_abort(net, &candidate->ipaddr, init,
                                           commands);
                return 0;
        }

        /* Return success if all addresses were found. */
        return 1;
}

/* Populate the verification/tie tags based on overlapping INIT
 * scenario.
 *
 * Note: Do not use in CLOSED or SHUTDOWN-ACK-SENT state.
 */
static void sctp_tietags_populate(struct sctp_association *new_asoc,
                                  const struct sctp_association *asoc)
{
        switch (asoc->state) {

        /* 5.2.1 INIT received in COOKIE-WAIT or COOKIE-ECHOED State */

        case SCTP_STATE_COOKIE_WAIT:
                new_asoc->c.my_vtag     = asoc->c.my_vtag;
                new_asoc->c.my_ttag     = asoc->c.my_vtag;
                new_asoc->c.peer_ttag   = 0;
                break;

        case SCTP_STATE_COOKIE_ECHOED:
                new_asoc->c.my_vtag     = asoc->c.my_vtag;
                new_asoc->c.my_ttag     = asoc->c.my_vtag;
                new_asoc->c.peer_ttag   = asoc->c.peer_vtag;
                break;

        /* 5.2.2 Unexpected INIT in States Other than CLOSED, COOKIE-ECHOED,
         * COOKIE-WAIT and SHUTDOWN-ACK-SENT
         */
        default:
                new_asoc->c.my_ttag   = asoc->c.my_vtag;
                new_asoc->c.peer_ttag = asoc->c.peer_vtag;
                break;
        }

        /* Other parameters for the endpoint SHOULD be copied from the
         * existing parameters of the association (e.g. number of
         * outbound streams) into the INIT ACK and cookie.
         */
        new_asoc->rwnd                  = asoc->rwnd;
        new_asoc->c.sinit_num_ostreams  = asoc->c.sinit_num_ostreams;
        new_asoc->c.sinit_max_instreams = asoc->c.sinit_max_instreams;
        new_asoc->c.initial_tsn         = asoc->c.initial_tsn;
}

/*
 * Compare vtag/tietag values to determine unexpected COOKIE-ECHO
 * handling action.
 *
 * RFC 2960 5.2.4 Handle a COOKIE ECHO when a TCB exists.
 *
 * Returns value representing action to be taken.   These action values
 * correspond to Action/Description values in RFC 2960, Table 2.
 */
static char sctp_tietags_compare(struct sctp_association *new_asoc,
                                 const struct sctp_association *asoc)
{
        /* In this case, the peer may have restarted.  */
        if ((asoc->c.my_vtag != new_asoc->c.my_vtag) &&
            (asoc->c.peer_vtag != new_asoc->c.peer_vtag) &&
            (asoc->c.my_vtag == new_asoc->c.my_ttag) &&
            (asoc->c.peer_vtag == new_asoc->c.peer_ttag))
                return 'A';

        /* Collision case B. */
        if ((asoc->c.my_vtag == new_asoc->c.my_vtag) &&
            ((asoc->c.peer_vtag != new_asoc->c.peer_vtag) ||
             (0 == asoc->c.peer_vtag))) {
                return 'B';
        }

        /* Collision case D. */
        if ((asoc->c.my_vtag == new_asoc->c.my_vtag) &&
            (asoc->c.peer_vtag == new_asoc->c.peer_vtag))
                return 'D';

        /* Collision case C. */
        if ((asoc->c.my_vtag != new_asoc->c.my_vtag) &&
            (asoc->c.peer_vtag == new_asoc->c.peer_vtag) &&
            (0 == new_asoc->c.my_ttag) &&
            (0 == new_asoc->c.peer_ttag))
                return 'C';

        /* No match to any of the special cases; discard this packet. */
        return 'E';
}

/* Common helper routine for both duplicate and simultaneous INIT
 * chunk handling.
 *
 * Validates the INIT chunk passed in @arg, builds a temporary association
 * from it, and queues an INIT ACK followed by the deletion of that
 * temporary TCB.  The existing association @asoc is only read.
 *
 * sctp_verify_init() may hand back an ERROR chunk through err_chunk even
 * when verification succeeds.  That chunk is owned by this function and
 * is released on every path once it has been allocated, including the
 * successful one and the security-module rejection.
 *
 * Returns SCTP_DISPOSITION_CONSUME when a reply (INIT ACK or ABORT) was
 * queued, SCTP_DISPOSITION_NOMEM on allocation failure, or the result of
 * the discard/abort handler invoked for a rejected chunk.
 */
static enum sctp_disposition sctp_sf_do_unexpected_init(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg, *repl, *err_chunk;
        struct sctp_unrecognized_param *unk_param;
        struct sctp_association *new_asoc;
        enum sctp_disposition retval;
        struct sctp_packet *packet;
        int len;

        /* 6.10 Bundling
         * An endpoint MUST NOT bundle INIT, INIT ACK or
         * SHUTDOWN COMPLETE with any other chunks.
         *
         * IG Section 2.11.2
         * Furthermore, we require that the receiver of an INIT chunk MUST
         * enforce these rules by silently discarding an arriving packet
         * with an INIT chunk that is bundled with other chunks.
         */
        if (!chunk->singleton)
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* Make sure that the INIT chunk has a valid length. */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk)))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* 3.1 A packet containing an INIT chunk MUST have a zero Verification
         * Tag.
         */
        if (chunk->sctp_hdr->vtag != 0)
                return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);

        if (SCTP_INPUT_CB(chunk->skb)->encap_port != chunk->transport->encap_port)
                return sctp_sf_new_encap_port(net, ep, asoc, type, arg, commands);

        /* Grab the INIT header.  */
        chunk->subh.init_hdr = (struct sctp_inithdr *)chunk->skb->data;

        /* Tag the variable length parameters.  */
        chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(struct sctp_inithdr));

        /* Verify the INIT chunk before processing it. */
        err_chunk = NULL;
        if (!sctp_verify_init(net, ep, asoc, chunk->chunk_hdr->type,
                              (struct sctp_init_chunk *)chunk->chunk_hdr, chunk,
                              &err_chunk)) {
                /* This chunk contains fatal error. It is to be discarded.
                 * Send an ABORT, with causes if there is any.
                 */
                if (err_chunk) {
                        packet = sctp_abort_pkt_new(net, ep, asoc, arg,
                                        (__u8 *)(err_chunk->chunk_hdr) +
                                        sizeof(struct sctp_chunkhdr),
                                        ntohs(err_chunk->chunk_hdr->length) -
                                        sizeof(struct sctp_chunkhdr));

                        if (packet) {
                                sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
                                                SCTP_PACKET(packet));
                                SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
                                retval = SCTP_DISPOSITION_CONSUME;
                        } else {
                                retval = SCTP_DISPOSITION_NOMEM;
                        }
                        goto cleanup;
                } else {
                        return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg,
                                                    commands);
                }
        }

        /*
         * Other parameters for the endpoint SHOULD be copied from the
         * existing parameters of the association (e.g. number of
         * outbound streams) into the INIT ACK and cookie.
         * FIXME:  We are copying parameters from the endpoint not the
         * association.
         */
        new_asoc = sctp_make_temp_asoc(ep, chunk, GFP_ATOMIC);
        if (!new_asoc)
                goto nomem;

        /* Update socket peer label if first association. */
        if (security_sctp_assoc_request(new_asoc, chunk->skb)) {
                sctp_association_free(new_asoc);
                retval = sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
                /* err_chunk may be set for non-fatal parameter errors;
                 * release it instead of returning directly.
                 */
                goto cleanup;
        }

        if (sctp_assoc_set_bind_addr_from_ep(new_asoc,
                                sctp_scope(sctp_source(chunk)), GFP_ATOMIC) < 0)
                goto nomem;

        /* In the outbound INIT ACK the endpoint MUST copy its current
         * Verification Tag and Peers Verification tag into a reserved
         * place (local tie-tag and per tie-tag) within the state cookie.
         */
        if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk),
                               (struct sctp_init_chunk *)chunk->chunk_hdr,
                               GFP_ATOMIC))
                goto nomem;

        /* Make sure no new addresses are being added during the
         * restart.   Do not do this check for COOKIE-WAIT state,
         * since there are no peer addresses to check against.
         * Upon return an ABORT will have been sent if needed.
         */
        if (!sctp_state(asoc, COOKIE_WAIT)) {
                if (!sctp_sf_check_restart_addrs(new_asoc, asoc, chunk,
                                                 commands)) {
                        retval = SCTP_DISPOSITION_CONSUME;
                        goto nomem_retval;
                }
        }

        sctp_tietags_populate(new_asoc, asoc);

        /* B) "Z" shall respond immediately with an INIT ACK chunk.  */

        /* If there are errors need to be reported for unknown parameters,
         * make sure to reserve enough room in the INIT ACK for them.
         */
        len = 0;
        if (err_chunk) {
                len = ntohs(err_chunk->chunk_hdr->length) -
                      sizeof(struct sctp_chunkhdr);
        }

        repl = sctp_make_init_ack(new_asoc, chunk, GFP_ATOMIC, len);
        if (!repl)
                goto nomem;

        /* If there are errors need to be reported for unknown parameters,
         * include them in the outgoing INIT ACK as "Unrecognized parameter"
         * parameter.
         */
        if (err_chunk) {
                /* Get the "Unrecognized parameter" parameter(s) out of the
                 * ERROR chunk generated by sctp_verify_init(). Since the
                 * error cause code for "unknown parameter" and the
                 * "Unrecognized parameter" type is the same, we can
                 * construct the parameters in INIT ACK by copying the
                 * ERROR causes over.
                 */
                unk_param = (struct sctp_unrecognized_param *)
                            ((__u8 *)(err_chunk->chunk_hdr) +
                            sizeof(struct sctp_chunkhdr));
                sctp_addto_chunk(repl, len, unk_param);

                /* The causes now live in the INIT ACK; the ERROR chunk is
                 * no longer referenced and would leak if kept.
                 */
                sctp_chunk_free(err_chunk);
        }

        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc));
        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));

        /*
         * Note: After sending out INIT ACK with the State Cookie parameter,
         * "Z" MUST NOT allocate any resources for this new association.
         * Otherwise, "Z" will be vulnerable to resource attacks.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());

        return SCTP_DISPOSITION_CONSUME;

nomem:
        retval = SCTP_DISPOSITION_NOMEM;
nomem_retval:
        if (new_asoc)
                sctp_association_free(new_asoc);
cleanup:
        if (err_chunk)
                sctp_chunk_free(err_chunk);
        return retval;
}

/*
 * Handle simultaneous INIT.
 * This means we started an INIT and then we got an INIT request from
 * our peer.
 *
 * Section: 5.2.1 INIT received in COOKIE-WAIT or COOKIE-ECHOED State (Item B)
 * This usually indicates an initialization collision, i.e., each
 * endpoint is attempting, at about the same time, to establish an
 * association with the other endpoint.
 *
 * Upon receipt of an INIT in the COOKIE-WAIT or COOKIE-ECHOED state, an
 * endpoint MUST respond with an INIT ACK using the same parameters it
 * sent in its original INIT chunk (including its Verification Tag,
 * unchanged). These original parameters are combined with those from the
 * newly received INIT chunk. The endpoint shall also generate a State
 * Cookie with the INIT ACK. The endpoint uses the parameters sent in its
 * INIT to calculate the State Cookie.
 *
 * After that, the endpoint MUST NOT change its state, the T1-init
 * timer shall be left running and the corresponding TCB MUST NOT be
 * destroyed. The normal procedures for handling State Cookies when
 * a TCB exists will resolve the duplicate INITs to a single association.
 *
 * For an endpoint that is in the COOKIE-ECHOED state it MUST populate
 * its Tie-Tags with the Tag information of itself and its peer (see
 * section 5.2.2 for a description of the Tie-Tags).
 *
 * Verification Tag: Not explicit, but an INIT can not have a valid
 * verification tag, so we skip the check.
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_do_5_2_1_siminit(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        /* Call helper to do the real work for both simultaneous and
         * duplicate INIT chunk handling.
         */
        return sctp_sf_do_unexpected_init(net, ep, asoc, type, arg, commands);
}

/*
 * Handle duplicated INIT messages.  These are usually delayed
 * retransmissions.
 *
 * Section: 5.2.2 Unexpected INIT in States Other than CLOSED,
 * COOKIE-ECHOED and COOKIE-WAIT
 *
 * Unless otherwise stated, upon reception of an unexpected INIT for
 * this association, the endpoint shall generate an INIT ACK with a
 * State Cookie.  In the outbound INIT ACK the endpoint MUST copy its
 * current Verification Tag and peer's Verification Tag into a reserved
 * place within the state cookie.  We shall refer to these locations as
 * the Peer's-Tie-Tag and the Local-Tie-Tag.  The outbound SCTP packet
 * containing this INIT ACK MUST carry a Verification Tag value equal to
 * the Initiation Tag found in the unexpected INIT.  And the INIT ACK
 * MUST contain a new Initiation Tag (randomly generated see Section
 * 5.3.1).  Other parameters for the endpoint SHOULD be copied from the
 * existing parameters of the association (e.g. number of outbound
 * streams) into the INIT ACK and cookie.
 *
 * After sending out the INIT ACK, the endpoint shall take no further
 * actions, i.e., the existing association, including its current state,
 * and the corresponding TCB MUST NOT be changed.
 *
 * Note: Only when a TCB exists and the association is not in a COOKIE-
 * WAIT state are the Tie-Tags populated.  For a normal association INIT
 * (i.e. the endpoint is in a COOKIE-WAIT state), the Tie-Tags MUST be
 * set to 0 (indicating that no previous TCB existed).  The INIT ACK and
 * State Cookie are populated as specified in section 5.2.1.
 *
 * Verification Tag: Not specified, but an INIT has no way of knowing
 * what the verification tag could be, so we ignore it.
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_do_5_2_2_dupinit(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        /* Call helper to do the real work for both simultaneous and
         * duplicate INIT chunk handling.
         */
        return sctp_sf_do_unexpected_init(net, ep, asoc, type, arg, commands);
}


/*
 * Unexpected INIT-ACK handler.
 *
 * Section 5.2.3
 * If an INIT ACK received by an endpoint in any state other than the
 * COOKIE-WAIT state, the endpoint should discard the INIT ACK chunk.
 * An unexpected INIT ACK usually indicates the processing of an old or
 * duplicated INIT chunk.
*/
enum sctp_disposition sctp_sf_do_5_2_3_initack(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        const struct sctp_endpoint *ctl_ep = sctp_sk(net->sctp.ctl_sock)->ep;

        /* An INIT ACK that arrived on any endpoint other than the one
         * owned by the per-net control socket is simply discarded, as
         * the section quoted above requires.
         */
        if (ep != ctl_ep)
                return sctp_sf_discard_chunk(net, ep, asoc, type, arg,
                                             commands);

        /* The chunk landed on the control socket's endpoint: handle it
         * as an out-of-the-blue packet.
         */
        return sctp_sf_ootb(net, ep, asoc, type, arg, commands);
}

static int sctp_sf_do_assoc_update(struct sctp_association *asoc,
                                   struct sctp_association *new,
                                   struct sctp_cmd_seq *cmds)
{
        struct net *net = asoc->base.net;
        struct sctp_chunk *abort_chunk;
        int err;

        /* Fold the contents of 'new' into 'asoc'.  A zero return from
         * sctp_assoc_update() means there is nothing left to do here.
         */
        err = sctp_assoc_update(asoc, new);
        if (!err)
                return 0;

        /* The update failed.  Try to tell the peer with an ABORT carrying
         * an "out of resource" cause; if even that chunk cannot be built
         * the reply is skipped, but the association is still failed.
         */
        abort_chunk = sctp_make_abort(asoc, NULL, sizeof(struct sctp_errhdr));
        if (abort_chunk) {
                sctp_init_cause(abort_chunk, SCTP_ERROR_RSRC_LOW, 0);
                sctp_add_cmd_sf(cmds, SCTP_CMD_REPLY, SCTP_CHUNK(abort_chunk));
        }

        /* Report ECONNABORTED on the socket and fail the association. */
        sctp_add_cmd_sf(cmds, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNABORTED));
        sctp_add_cmd_sf(cmds, SCTP_CMD_ASSOC_FAILED,
                        SCTP_PERR(SCTP_ERROR_RSRC_LOW));

        /* Account for the aborted, no longer established association. */
        SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
        SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);

        return -ENOMEM;
}

/* Unexpected COOKIE-ECHO handler for peer restart (Table 2, action 'A')
 *
 * Section 5.2.4
 *  A)  In this case, the peer may have restarted.
 *
 * @asoc is the existing association and @new_asoc is the temporary
 * association that sctp_sf_do_5_2_4_dupcook() unpacked from the cookie.
 * @new_asoc is initialised from the INIT carried in the cookie and, if all
 * checks pass, its contents are folded into @asoc.
 *
 * Return values:
 *   SCTP_DISPOSITION_NOMEM   - INIT processing, auth key setup, the
 *                              SHUTDOWN ACK resend, the association update,
 *                              or the COOKIE ACK / ulpevent allocation failed.
 *   SCTP_DISPOSITION_DISCARD - sctp_auth_chunk_verify() rejected the chunk.
 *   SCTP_DISPOSITION_CONSUME - sctp_sf_check_restart_addrs() refused the
 *                              restart, the SHUTDOWN-ACK-SENT case was
 *                              handled, or the restart was accepted.
 *   When the socket is closing, the disposition of
 *   sctp_sf_do_9_2_start_shutdown() is returned instead.
 */
static enum sctp_disposition sctp_sf_do_dupcook_a(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        struct sctp_chunk *chunk,
                                        struct sctp_cmd_seq *commands,
                                        struct sctp_association *new_asoc)
{
        struct sctp_init_chunk *peer_init;
        enum sctp_disposition disposition;
        struct sctp_ulpevent *ev;
        struct sctp_chunk *repl;
        struct sctp_chunk *err;

        /* new_asoc is a brand-new association, so these are not yet
         * side effects--it is safe to run them here.
         */
        /* The peer's INIT chunk is read from directly behind the first
         * cookie_hdr element of the chunk.
         */
        peer_init = (struct sctp_init_chunk *)(chunk->subh.cookie_hdr + 1);
        if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk), peer_init,
                               GFP_ATOMIC))
                goto nomem;

        if (sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC))
                goto nomem;

        if (!sctp_auth_chunk_verify(net, chunk, new_asoc))
                return SCTP_DISPOSITION_DISCARD;

        /* Make sure no new addresses are being added during the
         * restart.  Though this is a pretty complicated attack
         * since you'd have to get inside the cookie.
         */
        if (!sctp_sf_check_restart_addrs(new_asoc, asoc, chunk, commands))
                return SCTP_DISPOSITION_CONSUME;

        /* If the endpoint is in the SHUTDOWN-ACK-SENT state and recognizes
         * the peer has restarted (Action A), it MUST NOT setup a new
         * association but instead resend the SHUTDOWN ACK and send an ERROR
         * chunk with a "Cookie Received while Shutting Down" error cause to
         * its peer.
         */
        if (sctp_state(asoc, SHUTDOWN_ACK_SENT)) {
                disposition = __sctp_sf_do_9_2_reshutack(net, ep, asoc,
                                                         SCTP_ST_CHUNK(chunk->chunk_hdr->type),
                                                         chunk, commands);
                if (SCTP_DISPOSITION_NOMEM == disposition)
                        goto nomem;

                /* The ERROR chunk is best effort: if it cannot be built
                 * the reply is simply not queued and the chunk is still
                 * consumed.
                 */
                err = sctp_make_op_error(asoc, chunk,
                                         SCTP_ERROR_COOKIE_IN_SHUTDOWN,
                                         NULL, 0, 0);
                if (err)
                        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                        SCTP_CHUNK(err));

                return SCTP_DISPOSITION_CONSUME;
        }

        /* For now, stop pending T3-rtx and SACK timers, fail any unsent/unacked
         * data. Consider the optional choice of resending of this data.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_T3_RTX_TIMERS_STOP, SCTP_NULL());
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
        sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_OUTQUEUE, SCTP_NULL());

        /* Stop pending T4-rto timer, teardown ASCONF queue, ASCONF-ACK queue
         * and ASCONF-ACK cache.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
        sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_ASCONF_QUEUE, SCTP_NULL());

        /* Update the content of current association. */
        /* The const qualifier on asoc is cast away here because the helper
         * takes a writable association pointer.
         */
        if (sctp_sf_do_assoc_update((struct sctp_association *)asoc, new_asoc, commands))
                goto nomem;

        repl = sctp_make_cookie_ack(asoc, chunk);
        if (!repl)
                goto nomem;

        /* Report association restart to upper layer. */
        ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_RESTART, 0,
                                             asoc->c.sinit_num_ostreams,
                                             asoc->c.sinit_max_instreams,
                                             NULL, GFP_ATOMIC);
        if (!ev)
                goto nomem_ev;

        sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));
        if ((sctp_state(asoc, SHUTDOWN_PENDING) ||
             sctp_state(asoc, SHUTDOWN_SENT)) &&
            (sctp_sstate(asoc->base.sk, CLOSING) ||
             sock_flag(asoc->base.sk, SOCK_DEAD))) {
                /* If the socket has been closed by user, don't
                 * transition to ESTABLISHED. Instead trigger SHUTDOWN
                 * bundled with COOKIE_ACK.
                 */
                sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
                return sctp_sf_do_9_2_start_shutdown(net, ep, asoc,
                                                     SCTP_ST_CHUNK(0), repl,
                                                     commands);
        } else {
                sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                                SCTP_STATE(SCTP_STATE_ESTABLISHED));
                sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
        }
        return SCTP_DISPOSITION_CONSUME;

nomem_ev:
        /* The COOKIE ACK was built but never queued, so release it here. */
        sctp_chunk_free(repl);
nomem:
        return SCTP_DISPOSITION_NOMEM;
}

/* Unexpected COOKIE-ECHO handler for setup collision (Table 2, action 'B')
 *
 * Section 5.2.4
 *   B) In this case, both sides may be attempting to start an association
 *      at about the same time but the peer endpoint started its INIT
 *      after responding to the local endpoint's INIT
 */
/* This case represents an initialization collision.  */
static enum sctp_disposition sctp_sf_do_dupcook_b(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        struct sctp_chunk *chunk,
                                        struct sctp_cmd_seq *commands,
                                        struct sctp_association *new_asoc)
{
        struct sctp_init_chunk *peer_init;
        struct sctp_chunk *repl;

        /* new_asoc is a brand-new association, so these are not yet
         * side effects--it is safe to run them here.
         */
        peer_init = (struct sctp_init_chunk *)(chunk->subh.cookie_hdr + 1);
        if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk), peer_init,
                               GFP_ATOMIC))
                goto nomem;

        if (sctp_auth_asoc_init_active_key(new_asoc, GFP_ATOMIC))
                goto nomem;

        if (!sctp_auth_chunk_verify(net, chunk, new_asoc))
                return SCTP_DISPOSITION_DISCARD;

        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_ESTABLISHED));
        if (asoc->state < SCTP_STATE_ESTABLISHED)
                SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB);
        sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL());

        /* Update the content of current association.  */
        if (sctp_sf_do_assoc_update((struct sctp_association *)asoc, new_asoc, commands))
                goto nomem;

        repl = sctp_make_cookie_ack(asoc, chunk);
        if (!repl)
                goto nomem;

        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));

        /* RFC 2960 5.1 Normal Establishment of an Association
         *
         * D) IMPLEMENTATION NOTE: An implementation may choose to
         * send the Communication Up notification to the SCTP user
         * upon reception of a valid COOKIE ECHO chunk.
         *
         * Sadly, this needs to be implemented as a side-effect, because
         * we are not guaranteed to have set the association id of the real
         * association and so these notifications need to be delayed until
         * the association id is allocated.
         */

        sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_CHANGE, SCTP_U8(SCTP_COMM_UP));

        /* Sockets API Draft Section 5.3.1.6
         * When a peer sends a Adaptation Layer Indication parameter , SCTP
         * delivers this notification to inform the application that of the
         * peers requested adaptation layer.
         *
         * This also needs to be done as a side effect for the same reason as
         * above.
         */
        if (asoc->peer.adaptation_ind)
                sctp_add_cmd_sf(commands, SCTP_CMD_ADAPTATION_IND, SCTP_NULL());

        if (!asoc->peer.auth_capable)
                sctp_add_cmd_sf(commands, SCTP_CMD_PEER_NO_AUTH, SCTP_NULL());

        return SCTP_DISPOSITION_CONSUME;

nomem:
        return SCTP_DISPOSITION_NOMEM;
}

/* Unexpected COOKIE-ECHO handler for setup collision (Table 2, action 'C')
 *
 * Section 5.2.4
 *  C) In this case, the local endpoint's cookie has arrived late.
 *     Before it arrived, the local endpoint sent an INIT and received an
 *     INIT-ACK and finally sent a COOKIE ECHO with the peer's same tag
 *     but a new tag of its own.
 */
/* This case represents an initialization collision.  */
static enum sctp_disposition sctp_sf_do_dupcook_c(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        struct sctp_chunk *chunk,
                                        struct sctp_cmd_seq *commands,
                                        struct sctp_association *new_asoc)
{
        /* The cookie should be silently discarded.
         * The endpoint SHOULD NOT change states and should leave
         * any timers running.
         */
        return SCTP_DISPOSITION_DISCARD;
}

/* Unexpected COOKIE-ECHO handler for lost chunk (Table 2, action 'D')
 *
 * Section 5.2.4
 *
 * D) When both local and remote tags match the endpoint should always
 *    enter the ESTABLISHED state, if it has not already done so.
 */
/* This case represents an initialization collision.  */
static enum sctp_disposition sctp_sf_do_dupcook_d(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        struct sctp_chunk *chunk,
                                        struct sctp_cmd_seq *commands,
                                        struct sctp_association *new_asoc)
{
        struct sctp_ulpevent *comm_up_ev = NULL;
        struct sctp_ulpevent *adapt_ev = NULL;
        struct sctp_ulpevent *no_auth_ev = NULL;
        struct sctp_chunk *cookie_ack;

        /* Implementor's Guide clarification of case D: when both the
         * local and the remote tag match, an endpoint that is in the
         * COOKIE-ECHOED state should enter ESTABLISHED, stop any running
         * cookie timer and send a COOKIE ACK.
         */
        if (!sctp_auth_chunk_verify(net, chunk, asoc))
                return SCTP_DISPOSITION_DISCARD;

        /* Only associations that have not reached ESTABLISHED yet are
         * moved there; anything further along keeps its state.
         */
        if (asoc->state < SCTP_STATE_ESTABLISHED) {
                sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                                SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
                sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                                SCTP_STATE(SCTP_STATE_ESTABLISHED));
                SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB);
                sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START,
                                SCTP_NULL());

                /* RFC 2960 5.1 Normal Establishment of an Association
                 *
                 * D) IMPLEMENTATION NOTE: An implementation may choose
                 * to send the Communication Up notification to the
                 * SCTP user upon reception of a valid COOKIE
                 * ECHO chunk.
                 */
                comm_up_ev = sctp_ulpevent_make_assoc_change(asoc, 0,
                                                SCTP_COMM_UP, 0,
                                                asoc->c.sinit_num_ostreams,
                                                asoc->c.sinit_max_instreams,
                                                NULL, GFP_ATOMIC);
                if (!comm_up_ev)
                        goto nomem;

                /* Sockets API Draft Section 5.3.1.6
                 * When a peer sends an Adaptation Layer Indication
                 * parameter, SCTP delivers this notification to inform
                 * the application of the peer's requested adaptation
                 * layer.
                 */
                if (asoc->peer.adaptation_ind) {
                        adapt_ev = sctp_ulpevent_make_adaptation_indication(
                                                        asoc, GFP_ATOMIC);
                        if (!adapt_ev)
                                goto nomem;
                }

                /* Peer is not AUTH capable: prepare a NO_AUTH event. */
                if (!asoc->peer.auth_capable) {
                        no_auth_ev = sctp_ulpevent_make_authkey(asoc, 0,
                                                        SCTP_AUTH_NO_AUTH,
                                                        GFP_ATOMIC);
                        if (!no_auth_ev)
                                goto nomem;
                }
        }

        cookie_ack = sctp_make_cookie_ack(asoc, chunk);
        if (!cookie_ack)
                goto nomem;

        /* Queue the COOKIE ACK first, then whichever events were built. */
        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(cookie_ack));

        if (comm_up_ev)
                sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
                                SCTP_ULPEVENT(comm_up_ev));
        if (adapt_ev)
                sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
                                SCTP_ULPEVENT(adapt_ev));
        if (no_auth_ev)
                sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
                                SCTP_ULPEVENT(no_auth_ev));

        return SCTP_DISPOSITION_CONSUME;

nomem:
        /* None of the events has been queued at this point, so every one
         * that was allocated is released here, newest first.
         */
        if (no_auth_ev)
                sctp_ulpevent_free(no_auth_ev);
        if (adapt_ev)
                sctp_ulpevent_free(adapt_ev);
        if (comm_up_ev)
                sctp_ulpevent_free(comm_up_ev);
        return SCTP_DISPOSITION_NOMEM;
}

/*
 * Handle a duplicate COOKIE-ECHO.  This usually means a cookie-carrying
 * chunk was retransmitted and then delayed in the network.
 *
 * Section: 5.2.4 Handle a COOKIE ECHO when a TCB exists
 *
 * Verification Tag: None.  Do cookie validation.
 *
 * The cookie is unpacked into a temporary association, its tie-tags are
 * compared against the existing association, and the resulting action
 * ('A'..'D') selects one of the sctp_sf_do_dupcook_*() helpers.  Commands
 * to delete the temporary association are queued whatever the helper
 * returns.
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_do_5_2_4_dupcook(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_association *new_asoc;
        struct sctp_chunk *chunk = arg;
        enum sctp_disposition retval;
        struct sctp_chunk *err_chk_p;
        int error = 0;
        char action;

        /* Make sure that the chunk has a valid length from the protocol
         * perspective.  In this case check to make sure we have at least
         * enough for the chunk header.  Cookie length verification is
         * done later.
         */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) {
                /* If the verification tag does not match this association,
                 * report the violation without an association.
                 */
                if (!sctp_vtag_verify(chunk, asoc))
                        asoc = NULL;
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands);
        }

        /* "Decode" the chunk.  We have no optional parameters so we
         * are in good shape.
         */
        /* Remember where the cookie starts, then pull the chunk payload
         * (chunk length minus the chunk header) off the skb.
         */
        chunk->subh.cookie_hdr = (struct sctp_signed_cookie *)chunk->skb->data;
        if (!pskb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) -
                                        sizeof(struct sctp_chunkhdr)))
                goto nomem;

        /* In RFC 2960 5.2.4 3, if both Verification Tags in the State Cookie
         * of a duplicate COOKIE ECHO match the Verification Tags of the
         * current association, consider the State Cookie valid even if
         * the lifespan is exceeded.
         */
        new_asoc = sctp_unpack_cookie(ep, asoc, chunk, GFP_ATOMIC, &error,
                                      &err_chk_p);

        /* FIXME:
         * If the re-build failed, what is the proper error path
         * from here?
         *
         * [We should abort the association. --piggy]
         */
        if (!new_asoc) {
                /* FIXME: Several errors are possible.  A bad cookie should
                 * be silently discarded, but think about logging it too.
                 */
                switch (error) {
                case -SCTP_IERROR_NOMEM:
                        goto nomem;

                case -SCTP_IERROR_STALE_COOKIE:
                        /* Send the stale-cookie error, then drop the packet. */
                        sctp_send_stale_cookie_err(net, ep, asoc, chunk, commands,
                                                   err_chk_p);
                        return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
                case -SCTP_IERROR_BAD_SIG:
                default:
                        return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
                }
        }

        /* Update socket peer label if first association. */
        /* A non-zero result from the security hook rejects the request:
         * free the temporary association directly and drop the packet.
         */
        if (security_sctp_assoc_request(new_asoc, chunk->head_skb ?: chunk->skb)) {
                sctp_association_free(new_asoc);
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        }

        /* Set temp so that it won't be added into hashtable */
        new_asoc->temp = 1;

        /* Compare the tie_tag in cookie with the verification tag of
         * current association.
         */
        action = sctp_tietags_compare(new_asoc, asoc);

        switch (action) {
        case 'A': /* Association restart. */
                retval = sctp_sf_do_dupcook_a(net, ep, asoc, chunk, commands,
                                              new_asoc);
                break;

        case 'B': /* Collision case B. */
                retval = sctp_sf_do_dupcook_b(net, ep, asoc, chunk, commands,
                                              new_asoc);
                break;

        case 'C': /* Collision case C. */
                retval = sctp_sf_do_dupcook_c(net, ep, asoc, chunk, commands,
                                              new_asoc);
                break;

        case 'D': /* Collision case D. */
                retval = sctp_sf_do_dupcook_d(net, ep, asoc, chunk, commands,
                                              new_asoc);
                break;

        default: /* Discard packet for all others. */
                retval = sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
                break;
        }

        /* Delete the temporary new association. */
        /* These commands are queued whatever retval the action returned:
         * SET_ASOC to new_asoc first, then DELETE_TCB.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_SET_ASOC, SCTP_ASOC(new_asoc));
        sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());

        /* Restore association pointer to provide SCTP command interpreter
         * with a valid context in case it needs to manipulate
         * the queues.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_SET_ASOC,
                         SCTP_ASOC((struct sctp_association *)asoc));

        return retval;

nomem:
        return SCTP_DISPOSITION_NOMEM;
}

/*
 * Process an ABORT.  (SHUTDOWN-PENDING state)
 *
 * See sctp_sf_do_9_1_abort().
 */
enum sctp_disposition sctp_sf_shutdown_pending_abort(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg;

        /* The verification tag check comes first. */
        if (!sctp_vtag_verify_either(chunk, asoc))
                goto discard;

        /* RFC 2960, Section 3.3.7:
         *    If an endpoint receives an ABORT with a format error or for an
         *    association that doesn't exist, it MUST silently discard it.
         * With an invalid length the real extent of the chunk is unknown,
         * so the only safe choice is to drop the whole packet.
         */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_abort_chunk)))
                goto discard;

        /* ADD-IP: Special case for ABORT chunks
         * F4)  One special consideration is that ABORT Chunks arriving
         * destined to the IP address being deleted MUST be
         * ignored (see Section 5.3.1 for further details).
         */
        if (sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest) ==
            SCTP_ADDR_DEL)
                goto discard;

        if (!sctp_err_chunk_valid(chunk))
                goto discard;

        return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands);

discard:
        return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
}

/*
 * Process an ABORT.  (SHUTDOWN-SENT state)
 *
 * See sctp_sf_do_9_1_abort().
 */
enum sctp_disposition sctp_sf_shutdown_sent_abort(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg;

        /* The verification tag check comes first. */
        if (!sctp_vtag_verify_either(chunk, asoc))
                goto discard;

        /* RFC 2960, Section 3.3.7:
         *    If an endpoint receives an ABORT with a format error or for an
         *    association that doesn't exist, it MUST silently discard it.
         * With an invalid length the real extent of the chunk is unknown,
         * so the only safe choice is to drop the whole packet.
         */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_abort_chunk)))
                goto discard;

        /* ADD-IP: Special case for ABORT chunks
         * F4)  One special consideration is that ABORT Chunks arriving
         * destined to the IP address being deleted MUST be
         * ignored (see Section 5.3.1 for further details).
         */
        if (sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest) ==
            SCTP_ADDR_DEL)
                goto discard;

        if (!sctp_err_chunk_valid(chunk))
                goto discard;

        /* The ABORT is accepted: stop the T2-shutdown timer and the
         * T5-shutdown guard timer before running the common ABORT code.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));

        return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands);

discard:
        return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
}

/*
 * Process an ABORT.  (SHUTDOWN-ACK-SENT state)
 *
 * See sctp_sf_do_9_1_abort().
 */
enum sctp_disposition sctp_sf_shutdown_ack_sent_abort(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        /* The same T2 timer, so we should be able to use
         * common function with the SHUTDOWN-SENT state.
         */
        return sctp_sf_shutdown_sent_abort(net, ep, asoc, type, arg, commands);
}

/*
 * Handle an Error received in COOKIE_ECHOED state.
 *
 * Only handle the error type of stale COOKIE Error, the other errors will
 * be ignored.
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_cookie_echoed_err(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg;
        struct sctp_errhdr *err;

        if (!sctp_vtag_verify(chunk, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* Make sure that the ERROR chunk has a valid length.
         * The parameter walking depends on this as well.
         */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_operr_chunk)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        /* Process the error here */
        /* FUTURE FIXME:  When PR-SCTP related and other optional
         * parms are emitted, this will have to change to handle multiple
         * errors.
         */
        sctp_walk_errors(err, chunk->chunk_hdr) {
                if (SCTP_ERROR_STALE_COOKIE == err->cause)
                        return sctp_sf_do_5_2_6_stale(net, ep, asoc, type,
                                                        arg, commands);
        }

        /* It is possible to have malformed error causes, and that
         * will cause us to end the walk early.  However, since
         * we are discarding the packet, there should be no adverse
         * affects.
         */
        return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
}

/*
 * Handle a Stale COOKIE Error
 *
 * Section: 5.2.6 Handle Stale COOKIE Error
 * If the association is in the COOKIE-ECHOED state, the endpoint may elect
 * one of the following three alternatives.
 * ...
 * 3) Send a new INIT chunk to the endpoint, adding a Cookie
 *    Preservative parameter requesting an extension to the lifetime of
 *    the State Cookie. When calculating the time extension, an
 *    implementation SHOULD use the RTT information measured based on the
 *    previous COOKIE ECHO / ERROR exchange, and should add no more
 *    than 1 second beyond the measured RTT, due to long State Cookie
 *    lifetimes making the endpoint more subject to a replay attack.
 *
 * Verification Tag:  Not explicit, but safe to ignore.
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
static enum sctp_disposition sctp_sf_do_5_2_6_stale(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        int attempts = asoc->init_err_counter + 1;
        struct sctp_chunk *chunk = arg, *reply;
        struct sctp_cookie_preserve_param bht;
        struct sctp_bind_addr *bp;
        struct sctp_errhdr *err;
        u32 stale;

        if (attempts > asoc->max_init_attempts) {
                sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                                SCTP_ERROR(ETIMEDOUT));
                sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
                                SCTP_PERR(SCTP_ERROR_STALE_COOKIE));
                return SCTP_DISPOSITION_DELETE_TCB;
        }

        err = (struct sctp_errhdr *)(chunk->skb->data);

        /* When calculating the time extension, an implementation
         * SHOULD use the RTT information measured based on the
         * previous COOKIE ECHO / ERROR exchange, and should add no
         * more than 1 second beyond the measured RTT, due to long
         * State Cookie lifetimes making the endpoint more subject to
         * a replay attack.
         * Measure of Staleness's unit is usec. (1/1000000 sec)
         * Suggested Cookie Life-span Increment's unit is msec.
         * (1/1000 sec)
         * In general, if you use the suggested cookie life, the value
         * found in the field of measure of staleness should be doubled
         * to give ample time to retransmit the new cookie and thus
         * yield a higher probability of success on the reattempt.
         */
        stale = ntohl(*(__be32 *)((u8 *)err + sizeof(*err)));
        stale = (stale * 2) / 1000;

        bht.param_hdr.type = SCTP_PARAM_COOKIE_PRESERVATIVE;
        bht.param_hdr.length = htons(sizeof(bht));
        bht.lifespan_increment = htonl(stale);

        /* Build that new INIT chunk.  */
        bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
        reply = sctp_make_init(asoc, bp, GFP_ATOMIC, sizeof(bht));
        if (!reply)
                goto nomem;

        sctp_addto_chunk(reply, sizeof(bht), &bht);

        /* Clear peer's init_tag cached in assoc as we are sending a new INIT */
        sctp_add_cmd_sf(commands, SCTP_CMD_CLEAR_INIT_TAG, SCTP_NULL());

        /* Stop pending T3-rtx and heartbeat timers */
        sctp_add_cmd_sf(commands, SCTP_CMD_T3_RTX_TIMERS_STOP, SCTP_NULL());
        sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_STOP, SCTP_NULL());

        /* Delete non-primary peer ip addresses since we are transitioning
         * back to the COOKIE-WAIT state
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_DEL_NON_PRIMARY, SCTP_NULL());

        /* If we've sent any data bundled with COOKIE-ECHO we will need to
         * resend
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_T1_RETRAN,
                        SCTP_TRANSPORT(asoc->peer.primary_path));

        /* Cast away the const modifier, as we want to just
         * rerun it through as a sideffect.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_INC, SCTP_NULL());

        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_COOKIE_WAIT));
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));

        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));

        return SCTP_DISPOSITION_CONSUME;

nomem:
        return SCTP_DISPOSITION_NOMEM;
}

/*
 * Process an ABORT.
 *
 * Section: 9.1
 * After checking the Verification Tag, the receiving endpoint shall
 * remove the association from its record, and shall report the
 * termination to its upper layer.
 *
 * Verification Tag: 8.5.1 Exceptions in Verification Tag Rules
 * B) Rules for packet carrying ABORT:
 *
 *  - The endpoint shall always fill in the Verification Tag field of the
 *    outbound packet with the destination endpoint's tag value if it
 *    is known.
 *
 *  - If the ABORT is sent in response to an OOTB packet, the endpoint
 *    MUST follow the procedure described in Section 8.4.
 *
 *  - The receiver MUST accept the packet if the Verification Tag
 *    matches either its own tag, OR the tag of its peer. Otherwise, the
 *    receiver MUST silently discard the packet and take no further
 *    action.
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_do_9_1_abort(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *abort_chunk = arg;

        /* Each of the following failures ends in the same packet discard;
         * they are evaluated in this order and stop at the first hit:
         *
         *  - the verification tag matches neither tag of the association;
         *
         *  - the ABORT chunk length is invalid.  RFC 2960, Section 3.3.7:
         *      If an endpoint receives an ABORT with a format error or
         *      for an association that doesn't exist, it MUST silently
         *      discard it.
         *    With a bad length the true chunk extent is unknown, so the
         *    whole packet is dropped rather than just the chunk;
         *
         *  - ADD-IP F4): ABORT chunks arriving destined to the IP address
         *    being deleted MUST be ignored (see Section 5.3.1);
         *
         *  - the error causes carried by the chunk do not validate.
         */
        if (!sctp_vtag_verify_either(abort_chunk, asoc) ||
            !sctp_chunk_length_valid(abort_chunk,
                                     sizeof(struct sctp_abort_chunk)) ||
            sctp_bind_addr_state(&asoc->base.bind_addr,
                                 &abort_chunk->dest) == SCTP_ADDR_DEL ||
            !sctp_err_chunk_valid(abort_chunk))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands);
}

static enum sctp_disposition __sctp_sf_do_9_1_abort(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        __be16 error = SCTP_ERROR_NO_ERROR;
        struct sctp_chunk *chunk = arg;
        unsigned int len;

        /* See if we have an error cause code in the chunk.  */
        len = ntohs(chunk->chunk_hdr->length);
        if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr))
                error = ((struct sctp_errhdr *)chunk->skb->data)->cause;

        sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNRESET));
        /* ASSOC_FAILED will DELETE_TCB. */
        sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, SCTP_PERR(error));
        SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
        SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);

        return SCTP_DISPOSITION_ABORT;
}

/*
 * Process an ABORT.  (COOKIE-WAIT state)
 *
 * See sctp_sf_do_9_1_abort() above.
 */
enum sctp_disposition sctp_sf_cookie_wait_abort(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *abort_chunk = arg;
        __be16 cause = SCTP_ERROR_NO_ERROR;

        /* Drop the packet when the verification tag matches neither tag
         * of the association, or when the ABORT chunk length is invalid.
         * RFC 2960, Section 3.3.7:
         *    If an endpoint receives an ABORT with a format error or for an
         *    association that doesn't exist, it MUST silently discard it.
         * With an invalid length the real extent of the chunk is unknown,
         * so the safe choice is to discard the whole packet.
         */
        if (!sctp_vtag_verify_either(abort_chunk, asoc) ||
            !sctp_chunk_length_valid(abort_chunk,
                                     sizeof(struct sctp_abort_chunk)))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* Pick up the cause code if the chunk is long enough to hold one. */
        if (ntohs(abort_chunk->chunk_hdr->length) >=
            sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) {
                struct sctp_errhdr *first_cause;

                first_cause = (struct sctp_errhdr *)abort_chunk->skb->data;
                cause = first_cause->cause;
        }

        return sctp_stop_t1_and_abort(net, commands, cause, ECONNREFUSED, asoc,
                                      abort_chunk->transport);
}

/*
 * Process an incoming ICMP as an ABORT.  (COOKIE-WAIT state)
 */
enum sctp_disposition
sctp_sf_cookie_wait_icmp_abort(struct net *net,
                               const struct sctp_endpoint *ep,
                               const struct sctp_association *asoc,
                               const union sctp_subtype type, void *arg,
                               struct sctp_cmd_seq *commands)
{
        /* For this event the opaque argument is used as the transport. */
        struct sctp_transport *transport = arg;

        /* No SCTP error cause is available; the socket sees ENOPROTOOPT. */
        return sctp_stop_t1_and_abort(net, commands, SCTP_ERROR_NO_ERROR,
                                      ENOPROTOOPT, asoc, transport);
}

/*
 * Process an ABORT.  (COOKIE-ECHOED state)
 */
enum sctp_disposition
sctp_sf_cookie_echoed_abort(struct net *net,
                            const struct sctp_endpoint *ep,
                            const struct sctp_association *asoc,
                            const union sctp_subtype type, void *arg,
                            struct sctp_cmd_seq *commands)
{
        /* Only one T1 timer exists, so the COOKIE-WAIT handler can be
         * reused unchanged for the COOKIE-ECHOED state.
         */
        return sctp_sf_cookie_wait_abort(net, ep, asoc, type, arg, commands);
}

/*
 * Stop T1 timer and abort association with "INIT failed".
 *
 * This is common code called by several sctp_sf_*_abort() functions above.
 */
static enum sctp_disposition sctp_stop_t1_and_abort(
                                        struct net *net,
                                        struct sctp_cmd_seq *commands,
                                        __be16 error, int sk_err,
                                        const struct sctp_association *asoc,
                                        struct sctp_transport *transport)
{
        pr_debug("%s: ABORT received (INIT)\n", __func__);

        /* Count the abort up front; this is independent of the commands
         * queued below.
         */
        SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);

        /* Move to CLOSED and stop the T1-init timer. */
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_CLOSED));
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));

        /* Hand the caller-chosen errno to the socket, then fail the INIT
         * with the given cause.  CMD_INIT_FAILED will DELETE_TCB.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(sk_err));
        sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, SCTP_PERR(error));

        return SCTP_DISPOSITION_ABORT;
}

/*
 * sctp_sf_do_9_2_shut
 *
 * Section: 9.2
 * Upon the reception of the SHUTDOWN, the peer endpoint shall
 *  - enter the SHUTDOWN-RECEIVED state,
 *
 *  - stop accepting new data from its SCTP user
 *
 *  - verify, by checking the Cumulative TSN Ack field of the chunk,
 *    that all its outstanding DATA chunks have been received by the
 *    SHUTDOWN sender.
 *
 * Once an endpoint has reached the SHUTDOWN-RECEIVED state it MUST NOT
 * send a SHUTDOWN in response to a ULP request. And should discard
 * subsequent SHUTDOWN chunks.
 *
 * If there are still outstanding DATA chunks left, the SHUTDOWN
 * receiver shall continue to follow normal data transmission
 * procedures defined in Section 6 until all outstanding DATA chunks
 * are acknowledged; however, the SHUTDOWN receiver MUST NOT accept
 * new data from its SCTP user.
 *
 * Verification Tag:  8.5 Verification Tag [Normal verification]
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_do_9_2_shutdown(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        enum sctp_disposition disposition;
        struct sctp_chunk *chunk = arg;
        struct sctp_shutdownhdr *sdh;
        struct sctp_ulpevent *ev;
        __u32 ctsn;

        if (!sctp_vtag_verify(chunk, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* Make sure that the SHUTDOWN chunk has a valid length. */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_shutdown_chunk)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        /* Convert the elaborate header.  */
        sdh = (struct sctp_shutdownhdr *)chunk->skb->data;
        skb_pull(chunk->skb, sizeof(*sdh));
        chunk->subh.shutdown_hdr = sdh;
        ctsn = ntohl(sdh->cum_tsn_ack);

        if (TSN_lt(ctsn, asoc->ctsn_ack_point)) {
                pr_debug("%s: ctsn:%x, ctsn_ack_point:%x\n", __func__, ctsn,
                         asoc->ctsn_ack_point);

                return SCTP_DISPOSITION_DISCARD;
        }

        /* If Cumulative TSN Ack beyond the max tsn currently
         * send, terminating the association and respond to the
         * sender with an ABORT.
         */
        if (!TSN_lt(ctsn, asoc->next_tsn))
                return sctp_sf_violation_ctsn(net, ep, asoc, type, arg, commands);

        /* API 5.3.1.5 SCTP_SHUTDOWN_EVENT
         * When a peer sends a SHUTDOWN, SCTP delivers this notification to
         * inform the application that it should cease sending data.
         */
        ev = sctp_ulpevent_make_shutdown_event(asoc, 0, GFP_ATOMIC);
        if (!ev) {
                disposition = SCTP_DISPOSITION_NOMEM;
                goto out;
        }
        sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));

        /* Upon the reception of the SHUTDOWN, the peer endpoint shall
         *  - enter the SHUTDOWN-RECEIVED state,
         *  - stop accepting new data from its SCTP user
         *
         * [This is implicit in the new state.]
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_SHUTDOWN_RECEIVED));
        disposition = SCTP_DISPOSITION_CONSUME;

        if (sctp_outq_is_empty(&asoc->outqueue)) {
                disposition = sctp_sf_do_9_2_shutdown_ack(net, ep, asoc, type,
                                                          arg, commands);
        }

        if (SCTP_DISPOSITION_NOMEM == disposition)
                goto out;

        /*  - verify, by checking the Cumulative TSN Ack field of the
         *    chunk, that all its outstanding DATA chunks have been
         *    received by the SHUTDOWN sender.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_CTSN,
                        SCTP_BE32(chunk->subh.shutdown_hdr->cum_tsn_ack));

out:
        return disposition;
}

/*
 * sctp_sf_do_9_2_shut_ctsn
 *
 * Once an endpoint has reached the SHUTDOWN-RECEIVED state,
 * it MUST NOT send a SHUTDOWN in response to a ULP request.
 * The Cumulative TSN Ack of the received SHUTDOWN chunk
 * MUST be processed.
 */
enum sctp_disposition sctp_sf_do_9_2_shut_ctsn(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg;
        struct sctp_shutdownhdr *shut_hdr;
        __u32 peer_ctsn;

        if (!sctp_vtag_verify(chunk, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* A SHUTDOWN chunk with a bad length is a protocol violation. */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_shutdown_chunk)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        /* Read the header in place; the skb is not advanced here. */
        shut_hdr = (struct sctp_shutdownhdr *)chunk->skb->data;
        peer_ctsn = ntohl(shut_hdr->cum_tsn_ack);

        /* A Cumulative TSN Ack behind our ack point is simply dropped. */
        if (TSN_lt(peer_ctsn, asoc->ctsn_ack_point)) {
                pr_debug("%s: ctsn:%x, ctsn_ack_point:%x\n", __func__,
                         peer_ctsn, asoc->ctsn_ack_point);

                return SCTP_DISPOSITION_DISCARD;
        }

        /* A Cumulative TSN Ack at or beyond the next TSN we would send
         * acknowledges data that was never sent: terminate the association
         * and respond to the sender with an ABORT.
         */
        if (!TSN_lt(peer_ctsn, asoc->next_tsn))
                return sctp_sf_violation_ctsn(net, ep, asoc, type, arg, commands);

        /* Process the Cumulative TSN Ack so that outstanding DATA chunks
         * received by the SHUTDOWN sender are accounted for.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_CTSN,
                        SCTP_BE32(shut_hdr->cum_tsn_ack));

        return SCTP_DISPOSITION_CONSUME;
}

/* RFC 2960 9.2
 * If an endpoint is in SHUTDOWN-ACK-SENT state and receives an INIT chunk
 * (e.g., if the SHUTDOWN COMPLETE was lost) with source and destination
 * transport addresses (either in the IP addresses or in the INIT chunk)
 * that belong to this association, it should discard the INIT chunk and
 * retransmit the SHUTDOWN ACK chunk.
 */
static enum sctp_disposition
__sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_endpoint *ep,
                           const struct sctp_association *asoc,
                           const union sctp_subtype type, void *arg,
                           struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *rcvd = arg;
        struct sctp_chunk *shut_ack;

        /* The chunk must at least hold a complete chunk header. */
        if (!sctp_chunk_length_valid(rcvd, sizeof(struct sctp_chunkhdr)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        /* The received chunk is not processed any further, so its inner
         * boundaries are not verified.  Just generate the SHUTDOWN ACK.
         */
        shut_ack = sctp_make_shutdown_ack(asoc, rcvd);
        if (!shut_ack)
                return SCTP_DISPOSITION_NOMEM;

        /* Set the transport for the SHUTDOWN ACK chunk and the timeout for
         * the T2-SHUTDOWN timer.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(shut_ack));

        /* Restart the T2-shutdown timer, then send the reply. */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(shut_ack));

        return SCTP_DISPOSITION_CONSUME;
}

/*
 * Screen the incoming chunk before retransmitting the SHUTDOWN ACK: it
 * must be a singleton of at least INIT chunk size (otherwise the packet
 * is discarded) and the packet must carry a zero verification tag
 * (otherwise sctp_sf_tabort_8_4_8() handles it).
 */
enum sctp_disposition
sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_endpoint *ep,
                         const struct sctp_association *asoc,
                         const union sctp_subtype type, void *arg,
                         struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *rcvd = arg;

        /* Bundled or undersized chunks: drop the whole packet. */
        if (!rcvd->singleton ||
            !sctp_chunk_length_valid(rcvd, sizeof(struct sctp_init_chunk)))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* A non-zero verification tag is answered with an ABORT. */
        if (rcvd->sctp_hdr->vtag != 0)
                return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);

        return __sctp_sf_do_9_2_reshutack(net, ep, asoc, type, arg, commands);
}

/*
 * sctp_sf_do_ecn_cwr
 *
 * Section:  Appendix A: Explicit Congestion Notification
 *
 * CWR:
 *
 * RFC 2481 details a specific bit for a sender to send in the header of
 * its next outbound TCP segment to indicate to its peer that it has
 * reduced its congestion window.  This is termed the CWR bit.  For
 * SCTP the same indication is made by including the CWR chunk.
 * This chunk contains one data element, i.e. the TSN number that
 * was sent in the ECNE chunk.  This element represents the lowest
 * TSN number in the datagram that was originally marked with the
 * CE bit.
 *
 * Verification Tag: 8.5 Verification Tag [Normal verification]
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_do_ecn_cwr(struct net *net,
                                         const struct sctp_endpoint *ep,
                                         const struct sctp_association *asoc,
                                         const union sctp_subtype type,
                                         void *arg,
                                         struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg;
        struct sctp_cwrhdr *cwr_hdr;
        u32 acked_tsn;

        if (!sctp_vtag_verify(chunk, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* The length is validated against the ECNE chunk layout. */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_ecne_chunk)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        /* Consume the CWR header and extract the TSN it reports. */
        cwr_hdr = (struct sctp_cwrhdr *)chunk->skb->data;
        skb_pull(chunk->skb, sizeof(*cwr_hdr));
        acked_tsn = ntohl(cwr_hdr->lowest_tsn);

        /* Nothing to do unless this CWR acks the last congestion
         * notification that was sent.
         */
        if (!TSN_lte(asoc->last_ecne_tsn, acked_tsn))
                return SCTP_DISPOSITION_CONSUME;

        /* Stop sending ECNE. */
        sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CWR, SCTP_U32(acked_tsn));

        return SCTP_DISPOSITION_CONSUME;
}

/*
 * sctp_sf_do_ecne
 *
 * Section:  Appendix A: Explicit Congestion Notification
 *
 * ECN-Echo
 *
 * RFC 2481 details a specific bit for a receiver to send back in its
 * TCP acknowledgements to notify the sender of the Congestion
 * Experienced (CE) bit having arrived from the network.  For SCTP this
 * same indication is made by including the ECNE chunk.  This chunk
 * contains one data element, i.e. the lowest TSN associated with the IP
 * datagram marked with the CE bit.....
 *
 * Verification Tag: 8.5 Verification Tag [Normal verification]
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_do_ecne(struct net *net,
                                      const struct sctp_endpoint *ep,
                                      const struct sctp_association *asoc,
                                      const union sctp_subtype type,
                                      void *arg, struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg;
        struct sctp_ecnehdr *ecne;

        if (!sctp_vtag_verify(chunk, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_ecne_chunk)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        ecne = (struct sctp_ecnehdr *)chunk->skb->data;
        skb_pull(chunk->skb, sizeof(*ecne));

        /* If this is a newer ECNE than the last CWR packet we sent out */
        sctp_add_cmd_sf(commands, SCTP_CMD_ECN_ECNE,
                        SCTP_U32(ntohl(ecne->lowest_tsn)));

        return SCTP_DISPOSITION_CONSUME;
}

/*
 * Section: 6.2  Acknowledgement on Reception of DATA Chunks
 *
 * The SCTP endpoint MUST always acknowledge the reception of each valid
 * DATA chunk.
 *
 * The guidelines on delayed acknowledgement algorithm specified in
 * Section 4.2 of [RFC2581] SHOULD be followed. Specifically, an
 * acknowledgement SHOULD be generated for at least every second packet
 * (not every second DATA chunk) received, and SHOULD be generated within
 * 200 ms of the arrival of any unacknowledged DATA chunk. In some
 * situations it may be beneficial for an SCTP transmitter to be more
 * conservative than the algorithms detailed in this document allow.
 * However, an SCTP transmitter MUST NOT be more aggressive than the
 * following algorithms allow.
 *
 * A SCTP receiver MUST NOT generate more than one SACK for every
 * incoming packet, other than to update the offered window as the
 * receiving application consumes new data.
 *
 * Verification Tag:  8.5 Verification Tag [Normal verification]
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_eat_data_6_2(struct net *net,
                                           const struct sctp_endpoint *ep,
                                           const struct sctp_association *asoc,
                                           const union sctp_subtype type,
                                           void *arg,
                                           struct sctp_cmd_seq *commands)
{
        union sctp_arg force = SCTP_NOFORCE();
        struct sctp_chunk *chunk = arg;
        int error;

        if (!sctp_vtag_verify(chunk, asoc)) {
                sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
                                SCTP_NULL());
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        }

        if (!sctp_chunk_length_valid(chunk, sctp_datachk_len(&asoc->stream)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        error = sctp_eat_data(asoc, chunk, commands);
        switch (error) {
        case SCTP_IERROR_NO_ERROR:
                break;
        case SCTP_IERROR_HIGH_TSN:
        case SCTP_IERROR_BAD_STREAM:
                SCTP_INC_STATS(net, SCTP_MIB_IN_DATA_CHUNK_DISCARDS);
                goto discard_noforce;
        case SCTP_IERROR_DUP_TSN:
        case SCTP_IERROR_IGNORE_TSN:
                SCTP_INC_STATS(net, SCTP_MIB_IN_DATA_CHUNK_DISCARDS);
                goto discard_force;
        case SCTP_IERROR_NO_DATA:
                return SCTP_DISPOSITION_ABORT;
        case SCTP_IERROR_PROTO_VIOLATION:
                return sctp_sf_abort_violation(net, ep, asoc, chunk, commands,
                                               (u8 *)chunk->subh.data_hdr,
                                               sctp_datahdr_len(&asoc->stream));
        default:
                BUG();
        }

        if (chunk->chunk_hdr->flags & SCTP_DATA_SACK_IMM)
                force = SCTP_FORCE();

        if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) {
                sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
                                SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));
        }

        /* If this is the last chunk in a packet, we need to count it
         * toward sack generation.  Note that we need to SACK every
         * OTHER packet containing data chunks, EVEN IF WE DISCARD
         * THEM.  We elect to NOT generate SACK's if the chunk fails
         * the verification tag test.
         *
         * RFC 2960 6.2 Acknowledgement on Reception of DATA Chunks
         *
         * The SCTP endpoint MUST always acknowledge the reception of
         * each valid DATA chunk.
         *
         * The guidelines on delayed acknowledgement algorithm
         * specified in  Section 4.2 of [RFC2581] SHOULD be followed.
         * Specifically, an acknowledgement SHOULD be generated for at
         * least every second packet (not every second DATA chunk)
         * received, and SHOULD be generated within 200 ms of the
         * arrival of any unacknowledged DATA chunk.  In some
         * situations it may be beneficial for an SCTP transmitter to
         * be more conservative than the algorithms detailed in this
         * document allow. However, an SCTP transmitter MUST NOT be
         * more aggressive than the following algorithms allow.
         */
        if (chunk->end_of_packet)
                sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, force);

        return SCTP_DISPOSITION_CONSUME;

discard_force:
        /* RFC 2960 6.2 Acknowledgement on Reception of DATA Chunks
         *
         * When a packet arrives with duplicate DATA chunk(s) and with
         * no new DATA chunk(s), the endpoint MUST immediately send a
         * SACK with no delay.  If a packet arrives with duplicate
         * DATA chunk(s) bundled with new DATA chunks, the endpoint
         * MAY immediately send a SACK.  Normally receipt of duplicate
         * DATA chunks will occur when the original SACK chunk was lost
         * and the peer's RTO has expired.  The duplicate TSN number(s)
         * SHOULD be reported in the SACK as duplicate.
         */
        /* In our case, we split the MAY SACK advice up whether or not
         * the last chunk is a duplicate.'
         */
        if (chunk->end_of_packet)
                sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE());
        return SCTP_DISPOSITION_DISCARD;

discard_noforce:
        if (chunk->end_of_packet)
                sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, force);

        return SCTP_DISPOSITION_DISCARD;
}

/*
 * sctp_sf_eat_data_fast_4_4
 *
 * Section: 4 (4)
 * (4) In SHUTDOWN-SENT state the endpoint MUST acknowledge any received
 *    DATA chunks without delay.
 *
 * Verification Tag:  8.5 Verification Tag [Normal verification]
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_eat_data_fast_4_4(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg;
        int error;

        if (!sctp_vtag_verify(chunk, asoc)) {
                sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
                                SCTP_NULL());
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        }

        if (!sctp_chunk_length_valid(chunk, sctp_datachk_len(&asoc->stream)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        error = sctp_eat_data(asoc, chunk, commands);
        switch (error) {
        case SCTP_IERROR_NO_ERROR:
        case SCTP_IERROR_HIGH_TSN:
        case SCTP_IERROR_DUP_TSN:
        case SCTP_IERROR_IGNORE_TSN:
        case SCTP_IERROR_BAD_STREAM:
                break;
        case SCTP_IERROR_NO_DATA:
                return SCTP_DISPOSITION_ABORT;
        case SCTP_IERROR_PROTO_VIOLATION:
                return sctp_sf_abort_violation(net, ep, asoc, chunk, commands,
                                               (u8 *)chunk->subh.data_hdr,
                                               sctp_datahdr_len(&asoc->stream));
        default:
                BUG();
        }

        /* Go a head and force a SACK, since we are shutting down. */

        /* Implementor's Guide.
         *
         * While in SHUTDOWN-SENT state, the SHUTDOWN sender MUST immediately
         * respond to each received packet containing one or more DATA chunk(s)
         * with a SACK, a SHUTDOWN chunk, and restart the T2-shutdown timer
         */
        if (chunk->end_of_packet) {
                /* We must delay the chunk creation since the cumulative
                 * TSN has not been updated yet.
                 */
                sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SHUTDOWN, SCTP_NULL());
                sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE());
                sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
                                SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
        }

        return SCTP_DISPOSITION_CONSUME;
}

/*
 * Section: 6.2  Processing a Received SACK
 * D) Any time a SACK arrives, the endpoint performs the following:
 *
 *     i) If Cumulative TSN Ack is less than the Cumulative TSN Ack Point,
 *     then drop the SACK.   Since Cumulative TSN Ack is monotonically
 *     increasing, a SACK whose Cumulative TSN Ack is less than the
 *     Cumulative TSN Ack Point indicates an out-of-order SACK.
 *
 *     ii) Set rwnd equal to the newly received a_rwnd minus the number
 *     of bytes still outstanding after processing the Cumulative TSN Ack
 *     and the Gap Ack Blocks.
 *
 *     iii) If the SACK is missing a TSN that was previously
 *     acknowledged via a Gap Ack Block (e.g., the data receiver
 *     reneged on the data), then mark the corresponding DATA chunk
 *     as available for retransmit:  Mark it as missing for fast
 *     retransmit as described in Section 7.2.4 and if no retransmit
 *     timer is running for the destination address to which the DATA
 *     chunk was originally transmitted, then T3-rtx is started for
 *     that destination address.
 *
 * Verification Tag:  8.5 Verification Tag [Normal verification]
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net,
                                           const struct sctp_endpoint *ep,
                                           const struct sctp_association *asoc,
                                           const union sctp_subtype type,
                                           void *arg,
                                           struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg;
        struct sctp_sackhdr *sackh;
        __u32 ctsn;

        if (!sctp_vtag_verify(chunk, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* Make sure that the SACK chunk has a valid length. */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_sack_chunk)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        /* Pull the SACK chunk from the data buffer */
        sackh = sctp_sm_pull_sack(chunk);
        /* Was this a bogus SACK? */
        if (!sackh)
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        chunk->subh.sack_hdr = sackh;
        ctsn = ntohl(sackh->cum_tsn_ack);

        /* If Cumulative TSN Ack beyond the max tsn currently
         * send, terminating the association and respond to the
         * sender with an ABORT.
         */
        if (TSN_lte(asoc->next_tsn, ctsn))
                return sctp_sf_violation_ctsn(net, ep, asoc, type, arg, commands);

        trace_sctp_probe(ep, asoc, chunk);

        /* i) If Cumulative TSN Ack is less than the Cumulative TSN
         *     Ack Point, then drop the SACK.  Since Cumulative TSN
         *     Ack is monotonically increasing, a SACK whose
         *     Cumulative TSN Ack is less than the Cumulative TSN Ack
         *     Point indicates an out-of-order SACK.
         */
        if (TSN_lt(ctsn, asoc->ctsn_ack_point)) {
                pr_debug("%s: ctsn:%x, ctsn_ack_point:%x\n", __func__, ctsn,
                         asoc->ctsn_ack_point);

                return SCTP_DISPOSITION_DISCARD;
        }

        /* Return this SACK for further processing.  */
        sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, SCTP_CHUNK(chunk));

        /* Note: We do the rest of the work on the PROCESS_SACK
         * sideeffect.
         */
        return SCTP_DISPOSITION_CONSUME;
}

/*
 * Generate an ABORT in response to a packet.
 *
 * Section: 8.4 Handle "Out of the blue" Packets, sctpimpguide 2.41
 *
 * 8) The receiver should respond to the sender of the OOTB packet with
 *    an ABORT.  When sending the ABORT, the receiver of the OOTB packet
 *    MUST fill in the Verification Tag field of the outbound packet
 *    with the value found in the Verification Tag field of the OOTB
 *    packet and set the T-bit in the Chunk Flags to indicate that the
 *    Verification Tag is reflected.  After sending this ABORT, the
 *    receiver of the OOTB packet shall discard the OOTB packet and take
 *    no further action.
 *
 * Verification Tag:
 *
 * The return value is the disposition of the chunk.
*/
static enum sctp_disposition sctp_sf_tabort_8_4_8(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *ootb_chunk = arg;
        struct sctp_chunk *abort_chunk;
        struct sctp_packet *pkt;

        pkt = sctp_ootb_pkt_new(net, asoc, ootb_chunk);
        if (!pkt)
                return SCTP_DISPOSITION_NOMEM;

        /* Make an ABORT. The T bit will be set if the asoc
         * is NULL.
         */
        abort_chunk = sctp_make_abort(asoc, ootb_chunk, 0);
        if (!abort_chunk) {
                /* The reply packet is not queued yet, so release it here. */
                sctp_ootb_pkt_free(pkt);
                return SCTP_DISPOSITION_NOMEM;
        }

        /* Reflect vtag if T-Bit is set */
        if (sctp_test_T_bit(abort_chunk))
                pkt->vtag = ntohl(ootb_chunk->sctp_hdr->vtag);

        /* Set the skb to the belonging sock for accounting.  */
        abort_chunk->skb->sk = ep->base.sk;

        sctp_packet_append_chunk(pkt, abort_chunk);

        sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, SCTP_PACKET(pkt));

        SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);

        /* The result of the discard is deliberately not propagated:
         * this function always reports the chunk as consumed.
         */
        sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        return SCTP_DISPOSITION_CONSUME;
}

/* Handling of SCTP Packets Containing an INIT Chunk Matching an
 * Existing Associations when the UDP encap port is incorrect.
 *
 * From Section 4 at draft-tuexen-tsvwg-sctp-udp-encaps-cons-03.
 */
static enum sctp_disposition sctp_sf_new_encap_port(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *init_chunk = arg;
        struct sctp_chunk *abort_chunk;
        struct sctp_packet *pkt;

        pkt = sctp_ootb_pkt_new(net, asoc, init_chunk);
        if (!pkt)
                return SCTP_DISPOSITION_NOMEM;

        /* Build the chunk that goes into the reply packet. */
        abort_chunk = sctp_make_new_encap_port(asoc, init_chunk);
        if (!abort_chunk) {
                /* The reply packet is not queued yet, so release it here. */
                sctp_ootb_pkt_free(pkt);
                return SCTP_DISPOSITION_NOMEM;
        }

        /* Attach the reply skb to the endpoint's socket. */
        abort_chunk->skb->sk = ep->base.sk;

        sctp_packet_append_chunk(pkt, abort_chunk);

        sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, SCTP_PACKET(pkt));

        SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);

        /* The result of the discard is deliberately not propagated:
         * this function always reports the chunk as consumed.
         */
        sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        return SCTP_DISPOSITION_CONSUME;
}

/*
 * Received an ERROR chunk from peer.  Generate SCTP_REMOTE_ERROR
 * event as ULP notification for each cause included in the chunk.
 *
 * API 5.3.1.3 - SCTP_REMOTE_ERROR
 *
 * The return value is the disposition of the chunk.
*/
enum sctp_disposition sctp_sf_operr_notify(struct net *net,
                                           const struct sctp_endpoint *ep,
                                           const struct sctp_association *asoc,
                                           const union sctp_subtype type,
                                           void *arg,
                                           struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *operr = arg;
        struct sctp_errhdr *cause;

        if (!sctp_vtag_verify(operr, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* Make sure that the ERROR chunk has a valid length. */
        if (!sctp_chunk_length_valid(operr, sizeof(struct sctp_operr_chunk)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        /* Walk all error causes without acting on them: only the position
         * where the walk stops matters.  Anything other than the exact end
         * of the chunk is reported as a parameter length violation.
         */
        sctp_walk_errors(cause, operr->chunk_hdr);
        if ((void *)operr->chunk_end != (void *)cause)
                return sctp_sf_violation_paramlen(net, ep, asoc, type, arg,
                                                  (void *)cause, commands);

        /* The causes themselves are handled by the PROCESS_OPERR command. */
        sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR, SCTP_CHUNK(operr));

        return SCTP_DISPOSITION_CONSUME;
}

/*
 * Process an inbound SHUTDOWN ACK.
 *
 * From Section 9.2:
 * Upon the receipt of the SHUTDOWN ACK, the SHUTDOWN sender shall
 * stop the T2-shutdown timer, send a SHUTDOWN COMPLETE chunk to its
 * peer, and remove all record of the association.
 *
 * The return value is the disposition.
 */
enum sctp_disposition sctp_sf_do_9_2_final(struct net *net,
                                           const struct sctp_endpoint *ep,
                                           const struct sctp_association *asoc,
                                           const union sctp_subtype type,
                                           void *arg,
                                           struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg;
        struct sctp_chunk *reply;
        struct sctp_ulpevent *ev;

        if (!sctp_vtag_verify(chunk, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* Make sure that the SHUTDOWN_ACK chunk has a valid length. */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);
        /* 10.2 H) SHUTDOWN COMPLETE notification
         *
         * When SCTP completes the shutdown procedures (section 9.2) this
         * notification is passed to the upper layer.
         */
        ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_SHUTDOWN_COMP,
                                             0, 0, 0, NULL, GFP_ATOMIC);
        if (!ev)
                goto nomem;

        /* ...send a SHUTDOWN COMPLETE chunk to its peer, */
        reply = sctp_make_shutdown_complete(asoc, chunk);
        if (!reply)
                goto nomem_chunk;

        /* Do all the commands now (after allocation), so that we
         * have consistent state if memory allocation fails
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev));

        /* Upon the receipt of the SHUTDOWN ACK, the SHUTDOWN sender shall
         * stop the T2-shutdown timer,
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));

        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));

        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_CLOSED));
        SCTP_INC_STATS(net, SCTP_MIB_SHUTDOWNS);
        SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));

        /* ...and remove all record of the association. */
        sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());
        return SCTP_DISPOSITION_DELETE_TCB;

nomem_chunk:
        sctp_ulpevent_free(ev);
nomem:
        return SCTP_DISPOSITION_NOMEM;
}

/*
 * RFC 2960, 8.4 - Handle "Out of the blue" Packets, sctpimpguide 2.41.
 *
 * 5) If the packet contains a SHUTDOWN ACK chunk, the receiver should
 *    respond to the sender of the OOTB packet with a SHUTDOWN COMPLETE.
 *    When sending the SHUTDOWN COMPLETE, the receiver of the OOTB
 *    packet must fill in the Verification Tag field of the outbound
 *    packet with the Verification Tag received in the SHUTDOWN ACK and
 *    set the T-bit in the Chunk Flags to indicate that the Verification
 *    Tag is reflected.
 *
 * 8) The receiver should respond to the sender of the OOTB packet with
 *    an ABORT.  When sending the ABORT, the receiver of the OOTB packet
 *    MUST fill in the Verification Tag field of the outbound packet
 *    with the value found in the Verification Tag field of the OOTB
 *    packet and set the T-bit in the Chunk Flags to indicate that the
 *    Verification Tag is reflected.  After sending this ABORT, the
 *    receiver of the OOTB packet shall discard the OOTB packet and take
 *    no further action.
 */
enum sctp_disposition sctp_sf_ootb(struct net *net,
                                   const struct sctp_endpoint *ep,
                                   const struct sctp_association *asoc,
                                   const union sctp_subtype type,
                                   void *arg, struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg;
        struct sk_buff *skb = chunk->skb;
        struct sctp_chunkhdr *ch;
        struct sctp_errhdr *err;
        int ootb_cookie_ack = 0;
        int ootb_shut_ack = 0;
        __u8 *ch_end;

        SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES);

        /* An association whose verification tag does not match the
         * packet is not used: carry on as if there were none.
         */
        if (asoc && !sctp_vtag_verify(chunk, asoc))
                asoc = NULL;

        /* Walk the chunks bundled in the packet, starting with the one
         * being processed, to find out which of the OOTB rules applies.
         */
        ch = (struct sctp_chunkhdr *)chunk->chunk_hdr;
        do {
                /* Report violation if the chunk is less than minimal */
                if (ntohs(ch->length) < sizeof(*ch))
                        return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

                /* Report violation if chunk len overflows */
                ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length));
                if (ch_end > skb_tail_pointer(skb))
                        return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

                /* Now that we know we at least have a chunk header,
                 * do things that are type appropriate.
                 */
                if (SCTP_CID_SHUTDOWN_ACK == ch->type)
                        ootb_shut_ack = 1;

                /* RFC 2960, Section 3.3.7
                 *   Moreover, under any circumstances, an endpoint that
                 *   receives an ABORT  MUST NOT respond to that ABORT by
                 *   sending an ABORT of its own.
                 */
                if (SCTP_CID_ABORT == ch->type)
                        return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

                /* RFC 8.4, 7) If the packet contains a "Stale cookie" ERROR
                 * or a COOKIE ACK the SCTP Packet should be silently
                 * discarded.
                 */

                if (SCTP_CID_COOKIE_ACK == ch->type)
                        ootb_cookie_ack = 1;

                if (SCTP_CID_ERROR == ch->type) {
                        sctp_walk_errors(err, ch) {
                                if (SCTP_ERROR_STALE_COOKIE == err->cause) {
                                        ootb_cookie_ack = 1;
                                        break;
                                }
                        }
                }

                ch = (struct sctp_chunkhdr *)ch_end;

                /* Only continue while a complete chunk header fits in
                 * front of the tail.  With fewer than sizeof(*ch) bytes
                 * left, reading ch->length at the top of the loop would
                 * access memory beyond the received data, so such
                 * trailing bytes are ignored.
                 */
        } while (ch_end + sizeof(*ch) <= skb_tail_pointer(skb));

        /* A SHUTDOWN ACK takes precedence over a COOKIE ACK / stale
         * cookie ERROR; everything else is answered with an ABORT.
         */
        if (ootb_shut_ack)
                return sctp_sf_shut_8_4_5(net, ep, asoc, type, arg, commands);
        else if (ootb_cookie_ack)
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        else
                return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands);
}

/*
 * Handle an "Out of the blue" SHUTDOWN ACK.
 *
 * Section: 8.4 5, sctpimpguide 2.41.
 *
 * 5) If the packet contains a SHUTDOWN ACK chunk, the receiver should
 *    respond to the sender of the OOTB packet with a SHUTDOWN COMPLETE.
 *    When sending the SHUTDOWN COMPLETE, the receiver of the OOTB
 *    packet must fill in the Verification Tag field of the outbound
 *    packet with the Verification Tag received in the SHUTDOWN ACK and
 *    set the T-bit in the Chunk Flags to indicate that the Verification
 *    Tag is reflected.
 *
 * Inputs
 * (endpoint, asoc, type, arg, commands)
 *
 * Outputs
 * (enum sctp_disposition)
 *
 * The return value is the disposition of the chunk.
 */
static enum sctp_disposition sctp_sf_shut_8_4_5(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *shut_ack = arg;
        struct sctp_chunk *shut_complete;
        struct sctp_packet *pkt;

        pkt = sctp_ootb_pkt_new(net, asoc, shut_ack);
        if (!pkt)
                return SCTP_DISPOSITION_NOMEM;

        /* Make an SHUTDOWN_COMPLETE.
         * The T bit will be set if the asoc is NULL.
         */
        shut_complete = sctp_make_shutdown_complete(asoc, shut_ack);
        if (!shut_complete) {
                /* The reply packet is not queued yet, so release it here. */
                sctp_ootb_pkt_free(pkt);
                return SCTP_DISPOSITION_NOMEM;
        }

        /* Reflect vtag if T-Bit is set */
        if (sctp_test_T_bit(shut_complete))
                pkt->vtag = ntohl(shut_ack->sctp_hdr->vtag);

        /* Set the skb to the belonging sock for accounting.  */
        shut_complete->skb->sk = ep->base.sk;

        sctp_packet_append_chunk(pkt, shut_complete);

        sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, SCTP_PACKET(pkt));

        SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);

        /* We need to discard the rest of the packet to prevent
         * potential bombing attacks from additional bundled chunks.
         * This is documented in SCTP Threats ID.
         */
        return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
}

/*
 * Handle SHUTDOWN ACK in COOKIE_ECHOED or COOKIE_WAIT state.
 *
 * Verification Tag:  8.5.1 E) Rules for packet carrying a SHUTDOWN ACK
 *   If the receiver is in COOKIE-ECHOED or COOKIE-WAIT state the
 *   procedures in section 8.4 SHOULD be followed, in other words it
 *   should be treated as an Out Of The Blue packet.
 *   [This means that we do NOT check the Verification Tag on these
 *   chunks. --piggy ]
 *
 */
enum sctp_disposition sctp_sf_do_8_5_1_E_sa(struct net *net,
                                            const struct sctp_endpoint *ep,
                                            const struct sctp_association *asoc,
                                            const union sctp_subtype type,
                                            void *arg,
                                            struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *shut_ack = arg;

        /* A verification tag mismatch does not drop the packet here; it
         * only decides whether the association is handed to the length
         * violation handler below.
         */
        asoc = sctp_vtag_verify(shut_ack, asoc) ? asoc : NULL;

        /* Make sure that the SHUTDOWN_ACK chunk has a valid length. */
        if (!sctp_chunk_length_valid(shut_ack, sizeof(struct sctp_chunkhdr)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        /* Although we do have an association in this case, it corresponds
         * to a restarted association. So the packet is treated as an OOTB
         * packet and the state function that handles OOTB SHUTDOWN_ACK is
         * called with a NULL association.
         */
        SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES);

        return sctp_sf_shut_8_4_5(net, ep, NULL, type, arg, commands);
}

/* ADDIP Section 4.2 Upon reception of an ASCONF Chunk.  */
enum sctp_disposition sctp_sf_do_asconf(struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_paramhdr *err_param = NULL;
        struct sctp_chunk *asconf_ack = NULL;
        struct sctp_chunk *chunk = arg;
        struct sctp_addiphdr *hdr;
        __u32 serial;

        if (!sctp_vtag_verify(chunk, asoc)) {
                sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
                                SCTP_NULL());
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        }

        /* Make sure that the ASCONF ADDIP chunk has a valid length.  */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_addip_chunk)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        /* ADD-IP: Section 4.1.1
         * This chunk MUST be sent in an authenticated way by using
         * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk
         * is received unauthenticated it MUST be silently discarded as
         * described in [I-D.ietf-tsvwg-sctp-auth].
         */
        if (!asoc->peer.asconf_capable ||
            (!net->sctp.addip_noauth && !chunk->auth))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        hdr = (struct sctp_addiphdr *)chunk->skb->data;
        serial = ntohl(hdr->serial);

        /* Verify the ASCONF chunk before processing it. */
        if (!sctp_verify_asconf(asoc, chunk, true, &err_param))
                return sctp_sf_violation_paramlen(net, ep, asoc, type, arg,
                                                  (void *)err_param, commands);

        /* ADDIP 5.2 E1) Compare the value of the serial number to the value
         * the endpoint stored in a new association variable
         * 'Peer-Serial-Number'.
         */
        if (serial == asoc->peer.addip_serial + 1) {
                /* If this is the first instance of ASCONF in the packet,
                 * we can clean our old ASCONF-ACKs.
                 */
                if (!chunk->has_asconf)
                        sctp_assoc_clean_asconf_ack_cache(asoc);

                /* ADDIP 5.2 E4) When the Sequence Number matches the next one
                 * expected, process the ASCONF as described below and after
                 * processing the ASCONF Chunk, append an ASCONF-ACK Chunk to
                 * the response packet and cache a copy of it (in the event it
                 * later needs to be retransmitted).
                 *
                 * Essentially, do V1-V5.
                 */
                asconf_ack = sctp_process_asconf((struct sctp_association *)
                                                 asoc, chunk);
                if (!asconf_ack)
                        return SCTP_DISPOSITION_NOMEM;
        } else if (serial < asoc->peer.addip_serial + 1) {
                /* ADDIP 5.2 E2)
                 * If the value found in the Sequence Number is less than the
                 * ('Peer- Sequence-Number' + 1), simply skip to the next
                 * ASCONF, and include in the outbound response packet
                 * any previously cached ASCONF-ACK response that was
                 * sent and saved that matches the Sequence Number of the
                 * ASCONF.  Note: It is possible that no cached ASCONF-ACK
                 * Chunk exists.  This will occur when an older ASCONF
                 * arrives out of order.  In such a case, the receiver
                 * should skip the ASCONF Chunk and not include ASCONF-ACK
                 * Chunk for that chunk.
                 */
                asconf_ack = sctp_assoc_lookup_asconf_ack(asoc, hdr->serial);
                if (!asconf_ack)
                        return SCTP_DISPOSITION_DISCARD;

                /* Reset the transport so that we select the correct one
                 * this time around.  This is to make sure that we don't
                 * accidentally use a stale transport that's been removed.
                 */
                asconf_ack->transport = NULL;
        } else {
                /* ADDIP 5.2 E5) Otherwise, the ASCONF Chunk is discarded since
                 * it must be either a stale packet or from an attacker.
                 */
                return SCTP_DISPOSITION_DISCARD;
        }

        /* ADDIP 5.2 E6)  The destination address of the SCTP packet
         * containing the ASCONF-ACK Chunks MUST be the source address of
         * the SCTP packet that held the ASCONF Chunks.
         *
         * To do this properly, we'll set the destination address of the chunk
         * and at the transmit time, will try look up the transport to use.
         * Since ASCONFs may be bundled, the correct transport may not be
         * created until we process the entire packet, thus this workaround.
         */
        asconf_ack->dest = chunk->source;
        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(asconf_ack));
        if (asoc->new_transport) {
                sctp_sf_heartbeat(ep, asoc, type, asoc->new_transport, commands);
                ((struct sctp_association *)asoc)->new_transport = NULL;
        }

        return SCTP_DISPOSITION_CONSUME;
}

static enum sctp_disposition sctp_send_next_asconf(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *asconf;
        struct list_head *entry;

        if (list_empty(&asoc->addip_chunk_list))
                return SCTP_DISPOSITION_CONSUME;

        entry = asoc->addip_chunk_list.next;
        asconf = list_entry(entry, struct sctp_chunk, list);

        list_del_init(entry);
        sctp_chunk_hold(asconf);
        asoc->addip_last_asconf = asconf;

        return sctp_sf_do_prm_asconf(net, ep, asoc, type, asconf, commands);
}

/*
 * ADDIP Section 4.3 General rules for address manipulation
 * When building TLV parameters for the ASCONF Chunk that will add or
 * delete IP addresses the D0 to D13 rules should be applied:
 */
enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net,
                                            const struct sctp_endpoint *ep,
                                            const struct sctp_association *asoc,
                                            const union sctp_subtype type,
                                            void *arg,
                                            struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *last_asconf = asoc->addip_last_asconf;
        struct sctp_paramhdr *err_param = NULL;
        struct sctp_chunk *asconf_ack = arg;
        struct sctp_addiphdr *addip_hdr;
        __u32 sent_serial, rcvd_serial;
        struct sctp_chunk *abort;

        if (!sctp_vtag_verify(asconf_ack, asoc)) {
                sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
                                SCTP_NULL());
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        }

        /* Make sure that the ADDIP chunk has a valid length.  */
        if (!sctp_chunk_length_valid(asconf_ack,
                                     sizeof(struct sctp_addip_chunk)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        /* ADD-IP, Section 4.1.2:
         * This chunk MUST be sent in an authenticated way by using
         * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk
         * is received unauthenticated it MUST be silently discarded as
         * described in [I-D.ietf-tsvwg-sctp-auth].
         */
        if (!asoc->peer.asconf_capable ||
            (!net->sctp.addip_noauth && !asconf_ack->auth))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        addip_hdr = (struct sctp_addiphdr *)asconf_ack->skb->data;
        rcvd_serial = ntohl(addip_hdr->serial);

        /* Verify the ASCONF-ACK chunk before processing it. */
        if (!sctp_verify_asconf(asoc, asconf_ack, false, &err_param))
                return sctp_sf_violation_paramlen(net, ep, asoc, type, arg,
                           (void *)err_param, commands);

        if (last_asconf) {
                addip_hdr = last_asconf->subh.addip_hdr;
                sent_serial = ntohl(addip_hdr->serial);
        } else {
                sent_serial = asoc->addip_serial - 1;
        }

        /* D0) If an endpoint receives an ASCONF-ACK that is greater than or
         * equal to the next serial number to be used but no ASCONF chunk is
         * outstanding the endpoint MUST ABORT the association. Note that a
         * sequence number is greater than if it is no more than 2^^31-1
         * larger than the current sequence number (using serial arithmetic).
         */
        if (ADDIP_SERIAL_gte(rcvd_serial, sent_serial + 1) &&
            !(asoc->addip_last_asconf)) {
                abort = sctp_make_abort(asoc, asconf_ack,
                                        sizeof(struct sctp_errhdr));
                if (abort) {
                        sctp_init_cause(abort, SCTP_ERROR_ASCONF_ACK, 0);
                        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                        SCTP_CHUNK(abort));
                }
                /* We are going to ABORT, so we might as well stop
                 * processing the rest of the chunks in the packet.
                 */
                sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                                SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
                sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL());
                sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                                SCTP_ERROR(ECONNABORTED));
                sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
                                SCTP_PERR(SCTP_ERROR_ASCONF_ACK));
                SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
                SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
                return SCTP_DISPOSITION_ABORT;
        }

        if ((rcvd_serial == sent_serial) && asoc->addip_last_asconf) {
                sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                                SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));

                if (!sctp_process_asconf_ack((struct sctp_association *)asoc,
                                             asconf_ack))
                        return sctp_send_next_asconf(net, ep,
                                        (struct sctp_association *)asoc,
                                                        type, commands);

                abort = sctp_make_abort(asoc, asconf_ack,
                                        sizeof(struct sctp_errhdr));
                if (abort) {
                        sctp_init_cause(abort, SCTP_ERROR_RSRC_LOW, 0);
                        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                        SCTP_CHUNK(abort));
                }
                /* We are going to ABORT, so we might as well stop
                 * processing the rest of the chunks in the packet.
                 */
                sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL());
                sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                                SCTP_ERROR(ECONNABORTED));
                sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
                                SCTP_PERR(SCTP_ERROR_ASCONF_ACK));
                SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
                SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
                return SCTP_DISPOSITION_ABORT;
        }

        return SCTP_DISPOSITION_DISCARD;
}

/* RE-CONFIG Section 5.2 Upon reception of an RECONF Chunk.
 *
 * After the verification tag, the minimum chunk length and the parameter
 * layout have been checked, every parameter in the chunk is dispatched to
 * the matching stream-reset handler.  Any event a handler produces is
 * queued for the upper layer and any reply chunk it produces is queued
 * for transmission.  Parameters of other types are ignored.
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_do_reconf(struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        /* The stream-reset handlers take a non-const association. */
        struct sctp_association *rw_asoc = (struct sctp_association *)asoc;
        struct sctp_paramhdr *bad_param = NULL;
        struct sctp_chunk *reconf = arg;
        struct sctp_reconf_chunk *hdr;
        union sctp_params param;

        if (!sctp_vtag_verify(reconf, asoc)) {
                sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
                                SCTP_NULL());
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        }

        /* A RECONF chunk must be at least as large as its fixed header. */
        if (!sctp_chunk_length_valid(reconf, sizeof(*hdr)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        if (!sctp_verify_reconf(asoc, reconf, &bad_param))
                return sctp_sf_violation_paramlen(net, ep, asoc, type, arg,
                                                  (void *)bad_param, commands);

        hdr = (struct sctp_reconf_chunk *)reconf->chunk_hdr;
        sctp_walk_params(param, hdr) {
                struct sctp_ulpevent *ev = NULL;
                struct sctp_chunk *reply;

                switch (param.p->type) {
                case SCTP_PARAM_RESET_OUT_REQUEST:
                        reply = sctp_process_strreset_outreq(rw_asoc, param,
                                                             &ev);
                        break;
                case SCTP_PARAM_RESET_IN_REQUEST:
                        reply = sctp_process_strreset_inreq(rw_asoc, param,
                                                            &ev);
                        break;
                case SCTP_PARAM_RESET_TSN_REQUEST:
                        reply = sctp_process_strreset_tsnreq(rw_asoc, param,
                                                             &ev);
                        break;
                case SCTP_PARAM_RESET_ADD_OUT_STREAMS:
                        reply = sctp_process_strreset_addstrm_out(rw_asoc,
                                                                  param, &ev);
                        break;
                case SCTP_PARAM_RESET_ADD_IN_STREAMS:
                        reply = sctp_process_strreset_addstrm_in(rw_asoc,
                                                                 param, &ev);
                        break;
                case SCTP_PARAM_RESET_RESPONSE:
                        reply = sctp_process_strreset_resp(rw_asoc, param,
                                                           &ev);
                        break;
                default:
                        /* Not a stream-reset parameter: nothing to do. */
                        reply = NULL;
                        break;
                }

                /* Queue the event first, then the reply. */
                if (ev)
                        sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
                                        SCTP_ULPEVENT(ev));

                if (reply)
                        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                        SCTP_CHUNK(reply));
        }

        return SCTP_DISPOSITION_CONSUME;
}

/*
 * PR-SCTP Section 3.6 Receiver Side Implementation of PR-SCTP
 *
 * When a FORWARD TSN chunk arrives, the data receiver MUST first update
 * its cumulative TSN point to the value carried in the FORWARD TSN
 * chunk, and then MUST further advance its cumulative TSN point locally
 * if possible.
 * After the above processing, the data receiver MUST stop reporting any
 * missing TSNs earlier than or equal to the new cumulative TSN point.
 *
 * Verification Tag:  8.5 Verification Tag [Normal verification]
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net,
                                          const struct sctp_endpoint *ep,
                                          const struct sctp_association *asoc,
                                          const union sctp_subtype type,
                                          void *arg,
                                          struct sctp_cmd_seq *commands)
{
        struct sctp_fwdtsn_hdr *fwdtsn_hdr;
        struct sctp_chunk *chunk = arg;
        __u16 len;
        __u32 tsn;

        if (!sctp_vtag_verify(chunk, asoc)) {
                sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
                                SCTP_NULL());
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        }

        if (!asoc->peer.prsctp_capable)
                return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);

        /* Make sure that the FORWARD_TSN chunk has valid length.  */
        if (!sctp_chunk_length_valid(chunk, sctp_ftsnchk_len(&asoc->stream)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        fwdtsn_hdr = (struct sctp_fwdtsn_hdr *)chunk->skb->data;
        chunk->subh.fwdtsn_hdr = fwdtsn_hdr;
        len = ntohs(chunk->chunk_hdr->length);
        len -= sizeof(struct sctp_chunkhdr);
        skb_pull(chunk->skb, len);

        tsn = ntohl(fwdtsn_hdr->new_cum_tsn);
        pr_debug("%s: TSN 0x%x\n", __func__, tsn);

        /* The TSN is too high--silently discard the chunk and count on it
         * getting retransmitted later.
         */
        if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0)
                goto discard_noforce;

        if (!asoc->stream.si->validate_ftsn(chunk))
                goto discard_noforce;

        sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn));
        if (len > sctp_ftsnhdr_len(&asoc->stream))
                sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN,
                                SCTP_CHUNK(chunk));

        /* Count this as receiving DATA. */
        if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) {
                sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
                                SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));
        }

        /* FIXME: For now send a SACK, but DATA processing may
         * send another.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());

        return SCTP_DISPOSITION_CONSUME;

discard_noforce:
        return SCTP_DISPOSITION_DISCARD;
}

/*
 * FORWARD TSN handling that always answers with SHUTDOWN + forced SACK.
 *
 * Performs the same verification tag, PR-SCTP capability and length
 * checks as sctp_sf_eat_fwd_tsn().  Whether or not the new cumulative TSN
 * is accepted, a SHUTDOWN and a forced SACK are generated and the
 * T2-shutdown timer is restarted.
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_eat_fwd_tsn_fast(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg;
        struct sctp_fwdtsn_hdr *hdr;
        __u32 new_cum_tsn;
        __u16 body_len;

        if (!sctp_vtag_verify(chunk, asoc)) {
                sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
                                SCTP_NULL());
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        }

        if (!asoc->peer.prsctp_capable)
                return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);

        /* Make sure that the FORWARD_TSN chunk has a valid length.  */
        if (!sctp_chunk_length_valid(chunk, sctp_ftsnchk_len(&asoc->stream)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        /* Record where the FORWARD_TSN header sits, then pull everything
         * that follows the generic chunk header off the skb.
         */
        hdr = (struct sctp_fwdtsn_hdr *)chunk->skb->data;
        chunk->subh.fwdtsn_hdr = hdr;
        body_len = ntohs(chunk->chunk_hdr->length) -
                   sizeof(struct sctp_chunkhdr);
        skb_pull(chunk->skb, body_len);

        new_cum_tsn = ntohl(hdr->new_cum_tsn);
        pr_debug("%s: TSN 0x%x\n", __func__, new_cum_tsn);

        /* Report the new cumulative TSN only when it is acceptable to the
         * TSN map and the chunk validates for the current stream mode;
         * otherwise skip straight to the SHUTDOWN/SACK response and count
         * on the chunk getting retransmitted later.
         */
        if (sctp_tsnmap_check(&asoc->peer.tsn_map, new_cum_tsn) >= 0 &&
            asoc->stream.si->validate_ftsn(chunk)) {
                sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN,
                                SCTP_U32(new_cum_tsn));
                if (body_len > sctp_ftsnhdr_len(&asoc->stream))
                        sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN,
                                        SCTP_CHUNK(chunk));
        }

        /* Go ahead and force a SACK, since we are shutting down.
         *
         * Implementor's Guide.
         *
         * While in SHUTDOWN-SENT state, the SHUTDOWN sender MUST immediately
         * respond to each received packet containing one or more DATA chunk(s)
         * with a SACK, a SHUTDOWN chunk, and restart the T2-shutdown timer
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SHUTDOWN, SCTP_NULL());
        sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE());
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));

        return SCTP_DISPOSITION_CONSUME;
}

/*
 * SCTP-AUTH Section 6.3 Receiving authenticated chunks
 *
 *    The receiver MUST use the HMAC algorithm indicated in the HMAC
 *    Identifier field.  If this algorithm was not specified by the
 *    receiver in the HMAC-ALGO parameter in the INIT or INIT-ACK chunk
 *    during association setup, the AUTH chunk and all chunks after it MUST
 *    be discarded and an ERROR chunk SHOULD be sent with the error cause
 *    defined in Section 4.1.
 *
 *    If an endpoint with no shared key receives a Shared Key Identifier
 *    other than 0, it MUST silently discard all authenticated chunks.  If
 *    the endpoint has at least one endpoint pair shared key for the peer,
 *    it MUST use the key specified by the Shared Key Identifier if a
 *    key has been configured for that Shared Key Identifier.  If no
 *    endpoint pair shared key has been configured for that Shared Key
 *    Identifier, all authenticated chunks MUST be silently discarded.
 *
 * Verification Tag:  8.5 Verification Tag [Normal verification]
 *
 * The return value is the disposition of the chunk.
 */
static enum sctp_ierror sctp_sf_authenticate(
                                        const struct sctp_association *asoc,
                                        struct sctp_chunk *chunk)
{
        struct sctp_shared_key *sh_key = NULL;
        struct sctp_authhdr *auth_hdr;
        __u8 *save_digest, *digest;
        struct sctp_hmac *hmac;
        unsigned int sig_len;
        __u16 key_id;

        /* Pull in the auth header, so we can do some more verification */
        auth_hdr = (struct sctp_authhdr *)chunk->skb->data;
        chunk->subh.auth_hdr = auth_hdr;
        skb_pull(chunk->skb, sizeof(*auth_hdr));

        /* Make sure that we support the HMAC algorithm from the auth
         * chunk.
         */
        if (!sctp_auth_asoc_verify_hmac_id(asoc, auth_hdr->hmac_id))
                return SCTP_IERROR_AUTH_BAD_HMAC;

        /* Make sure that the provided shared key identifier has been
         * configured
         */
        key_id = ntohs(auth_hdr->shkey_id);
        if (key_id != asoc->active_key_id) {
                sh_key = sctp_auth_get_shkey(asoc, key_id);
                if (!sh_key)
                        return SCTP_IERROR_AUTH_BAD_KEYID;
        }

        /* Make sure that the length of the signature matches what
         * we expect.
         */
        sig_len = ntohs(chunk->chunk_hdr->length) -
                  sizeof(struct sctp_auth_chunk);
        hmac = sctp_auth_get_hmac(ntohs(auth_hdr->hmac_id));
        if (sig_len != hmac->hmac_len)
                return SCTP_IERROR_PROTO_VIOLATION;

        /* Now that we've done validation checks, we can compute and
         * verify the hmac.  The steps involved are:
         *  1. Save the digest from the chunk.
         *  2. Zero out the digest in the chunk.
         *  3. Compute the new digest
         *  4. Compare saved and new digests.
         */
        digest = (u8 *)(auth_hdr + 1);
        skb_pull(chunk->skb, sig_len);

        save_digest = kmemdup(digest, sig_len, GFP_ATOMIC);
        if (!save_digest)
                goto nomem;

        memset(digest, 0, sig_len);

        sctp_auth_calculate_hmac(asoc, chunk->skb,
                                 (struct sctp_auth_chunk *)chunk->chunk_hdr,
                                 sh_key, GFP_ATOMIC);

        /* Discard the packet if the digests do not match */
        if (memcmp(save_digest, digest, sig_len)) {
                kfree(save_digest);
                return SCTP_IERROR_BAD_SIG;
        }

        kfree(save_digest);
        chunk->auth = 1;

        return SCTP_IERROR_NO_ERROR;
nomem:
        return SCTP_IERROR_NOMEM;
}

enum sctp_disposition sctp_sf_eat_auth(struct net *net,
                                       const struct sctp_endpoint *ep,
                                       const struct sctp_association *asoc,
                                       const union sctp_subtype type,
                                       void *arg, struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg;
        struct sctp_authhdr *auth_hdr;
        struct sctp_chunk *err_chunk;
        enum sctp_ierror error;

        /* Make sure that the peer has AUTH capable */
        if (!asoc->peer.auth_capable)
                return sctp_sf_unk_chunk(net, ep, asoc, type, arg, commands);

        if (!sctp_vtag_verify(chunk, asoc)) {
                sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG,
                                SCTP_NULL());
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        }

        /* Make sure that the AUTH chunk has valid length.  */
        if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_auth_chunk)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        auth_hdr = (struct sctp_authhdr *)chunk->skb->data;
        error = sctp_sf_authenticate(asoc, chunk);
        switch (error) {
        case SCTP_IERROR_AUTH_BAD_HMAC:
                /* Generate the ERROR chunk and discard the rest
                 * of the packet
                 */
                err_chunk = sctp_make_op_error(asoc, chunk,
                                               SCTP_ERROR_UNSUP_HMAC,
                                               &auth_hdr->hmac_id,
                                               sizeof(__u16), 0);
                if (err_chunk) {
                        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                        SCTP_CHUNK(err_chunk));
                }
                fallthrough;
        case SCTP_IERROR_AUTH_BAD_KEYID:
        case SCTP_IERROR_BAD_SIG:
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        case SCTP_IERROR_PROTO_VIOLATION:
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        case SCTP_IERROR_NOMEM:
                return SCTP_DISPOSITION_NOMEM;

        default:                        /* Prevent gcc warnings */
                break;
        }

        if (asoc->active_key_id != ntohs(auth_hdr->shkey_id)) {
                struct sctp_ulpevent *ev;

                ev = sctp_ulpevent_make_authkey(asoc, ntohs(auth_hdr->shkey_id),
                                    SCTP_AUTH_NEW_KEY, GFP_ATOMIC);

                if (!ev)
                        return SCTP_DISPOSITION_NOMEM;

                sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
                                SCTP_ULPEVENT(ev));
        }

        return SCTP_DISPOSITION_CONSUME;
}

/*
 * Process an unknown chunk.
 *
 * Section: 3.2. Also, 2.1 in the implementor's guide.
 *
 * Chunk Types are encoded such that the highest-order two bits specify
 * the action that must be taken if the processing endpoint does not
 * recognize the Chunk Type.
 *
 * 00 - Stop processing this SCTP packet and discard it, do not process
 *      any further chunks within it.
 *
 * 01 - Stop processing this SCTP packet and discard it, do not process
 *      any further chunks within it, and report the unrecognized
 *      chunk in an 'Unrecognized Chunk Type'.
 *
 * 10 - Skip this chunk and continue processing.
 *
 * 11 - Skip this chunk and continue processing, but report in an ERROR
 *      Chunk using the 'Unrecognized Chunk Type' cause of error.
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_unk_chunk(struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *unk_chunk = arg;
        struct sctp_chunk *err_chunk;
        struct sctp_chunkhdr *hdr;

        pr_debug("%s: processing unknown chunk id:%d\n", __func__, type.chunk);

        if (!sctp_vtag_verify(unk_chunk, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* Make sure that the chunk has a valid length.
         * Since we don't know the chunk type, we use a general
         * chunkhdr structure to make a comparison.
         */
        if (!sctp_chunk_length_valid(unk_chunk, sizeof(*hdr)))
                return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
                                                  commands);

        switch (type.chunk & SCTP_CID_ACTION_MASK) {
        case SCTP_CID_ACTION_DISCARD:
                /* Discard the packet.  */
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
        case SCTP_CID_ACTION_DISCARD_ERR:
                /* Generate an ERROR chunk as response. */
                hdr = unk_chunk->chunk_hdr;
                err_chunk = sctp_make_op_error(asoc, unk_chunk,
                                               SCTP_ERROR_UNKNOWN_CHUNK, hdr,
                                               SCTP_PAD4(ntohs(hdr->length)),
                                               0);
                if (err_chunk) {
                        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                        SCTP_CHUNK(err_chunk));
                }

                /* Discard the packet.  */
                sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
                return SCTP_DISPOSITION_CONSUME;
        case SCTP_CID_ACTION_SKIP:
                /* Skip the chunk.  */
                return SCTP_DISPOSITION_DISCARD;
        case SCTP_CID_ACTION_SKIP_ERR:
                /* Generate an ERROR chunk as response. */
                hdr = unk_chunk->chunk_hdr;
                err_chunk = sctp_make_op_error(asoc, unk_chunk,
                                               SCTP_ERROR_UNKNOWN_CHUNK, hdr,
                                               SCTP_PAD4(ntohs(hdr->length)),
                                               0);
                if (err_chunk) {
                        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                        SCTP_CHUNK(err_chunk));
                }
                /* Skip the chunk.  */
                return SCTP_DISPOSITION_CONSUME;
        default:
                break;
        }

        return SCTP_DISPOSITION_DISCARD;
}

/*
 * Discard the chunk.
 *
 * Section: 0.2, 5.2.3, 5.2.5, 6.0, 8.4.6, 8.5.1c, 9.2
 * [Too numerous to mention...]
 * Verification Tag: No verification needed.
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * When an association is supplied and its verification tag check fails,
 * the whole packet is discarded instead.  A chunk shorter than a generic
 * chunk header is handled as a length violation.
 *
 * The return value is the disposition of the chunk.
 */
enum sctp_disposition sctp_sf_discard_chunk(struct net *net,
                                            const struct sctp_endpoint *ep,
                                            const struct sctp_association *asoc,
                                            const union sctp_subtype type,
                                            void *arg,
                                            struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg;

        /* The tag is only checked when there is an association. */
        if (asoc && !sctp_vtag_verify(chunk, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* The chunk type is irrelevant here, so compare against the size
         * of the generic chunk header.
         */
        if (sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) {
                pr_debug("%s: chunk:%d is discarded\n", __func__,
                         type.chunk);
                return SCTP_DISPOSITION_DISCARD;
        }

        return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands);
}

/*
 * Discard the whole packet.
 *
 * Section: 8.4 2)
 *
 * 2) If the OOTB packet contains an ABORT chunk, the receiver MUST
 *    silently discard the OOTB packet and take no further action.
 *
 * Verification Tag: No verification necessary
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * Only @net and @commands are used: the SCTP_MIB_IN_PKT_DISCARDS counter
 * is incremented and an SCTP_CMD_DISCARD_PACKET command is queued.
 *
 * The return value is the disposition of the chunk; it is always
 * SCTP_DISPOSITION_CONSUME.
 */
enum sctp_disposition sctp_sf_pdiscard(struct net *net,
                                       const struct sctp_endpoint *ep,
                                       const struct sctp_association *asoc,
                                       const union sctp_subtype type,
                                       void *arg, struct sctp_cmd_seq *commands)
{
        /* Account for the dropped packet, then ask for it to be dropped. */
        SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_DISCARDS);
        sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL());

        return SCTP_DISPOSITION_CONSUME;
}


/*
 * The other end is violating protocol.
 *
 * Section: Not specified
 * Verification Tag: Not specified
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (asoc, reply_msg, msg_up, timers, counters)
 *
 * We simply tag the chunk as a violation.  The state machine will log
 * the violation and continue.
 *
 * A chunk with a bad verification tag causes the packet to be discarded
 * instead, and a chunk shorter than a generic chunk header is handled as
 * a length violation.
 */
enum sctp_disposition sctp_sf_violation(struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *offender = arg;

        if (!sctp_vtag_verify(offender, asoc))
                return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);

        /* A well-formed chunk is simply flagged as a violation. */
        if (sctp_chunk_length_valid(offender, sizeof(struct sctp_chunkhdr)))
                return SCTP_DISPOSITION_VIOLATION;

        /* Too short to even hold a chunk header. */
        return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands);
}

/*
 * Common function to handle a protocol violation.
 *
 * Builds an ABORT chunk carrying @payload (@paylen bytes) for the
 * offending chunk in @arg and queues the commands that send it and tear
 * the association down.  When there is no association (@asoc is NULL) the
 * ABORT is sent in a freshly built out-of-the-blue packet instead.
 *
 * When sctp_auth_recv_cid(SCTP_CID_ABORT, asoc) is true, no ABORT is built
 * and the packet is only discarded.
 *
 * Returns SCTP_DISPOSITION_ABORT, or SCTP_DISPOSITION_NOMEM when the ABORT
 * chunk or the out-of-the-blue packet cannot be allocated.
 */
static enum sctp_disposition sctp_sf_abort_violation(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        void *arg,
                                        struct sctp_cmd_seq *commands,
                                        const __u8 *payload,
                                        const size_t paylen)
{
        struct sctp_packet *packet = NULL;
        struct sctp_chunk *chunk =  arg;
        struct sctp_chunk *abort = NULL;

        /* SCTP-AUTH, Section 6.3:
         *    It should be noted that if the receiver wants to tear
         *    down an association in an authenticated way only, the
         *    handling of malformed packets should not result in
         *    tearing down the association.
         *
         * This means that if we only want to abort associations
         * in an authenticated way (i.e AUTH+ABORT), then we
         * can't destroy this association just because the packet
         * was malformed.
         */
        if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc))
                goto discard;

        /* Make the abort chunk. */
        abort = sctp_make_abort_violation(asoc, chunk, payload, paylen);
        if (!abort)
                goto nomem;

        if (asoc) {
                /* Treat INIT-ACK as a special case during COOKIE-WAIT. */
                if (chunk->chunk_hdr->type == SCTP_CID_INIT_ACK &&
                    !asoc->peer.i.init_tag) {
                        struct sctp_initack_chunk *initack;

                        initack = (struct sctp_initack_chunk *)chunk->chunk_hdr;
                        /* Too short to carry an init tag: set the T bit on
                         * the ABORT.  Otherwise record the tag from the
                         * INIT-ACK.
                         */
                        if (!sctp_chunk_length_valid(chunk, sizeof(*initack)))
                                abort->chunk_hdr->flags |= SCTP_CHUNK_FLAG_T;
                        else {
                                unsigned int inittag;

                                inittag = ntohl(initack->init_hdr.init_tag);
                                sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_INITTAG,
                                                SCTP_U32(inittag));
                        }
                }

                sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
                SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);

                /* States up to and including COOKIE-ECHOED are reported as
                 * a failed INIT (ECONNREFUSED); later states as a failed
                 * association (ECONNABORTED).
                 */
                if (asoc->state <= SCTP_STATE_COOKIE_ECHOED) {
                        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                                        SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
                        sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                                        SCTP_ERROR(ECONNREFUSED));
                        sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
                                        SCTP_PERR(SCTP_ERROR_PROTO_VIOLATION));
                } else {
                        sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                                        SCTP_ERROR(ECONNABORTED));
                        sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
                                        SCTP_PERR(SCTP_ERROR_PROTO_VIOLATION));
                        SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
                }
        } else {
                /* No association: send the ABORT in its own packet. */
                packet = sctp_ootb_pkt_new(net, asoc, chunk);

                if (!packet)
                        goto nomem_pkt;

                /* With the T bit set, reuse the vtag of the received
                 * packet.
                 */
                if (sctp_test_T_bit(abort))
                        packet->vtag = ntohl(chunk->sctp_hdr->vtag);

                abort->skb->sk = ep->base.sk;

                sctp_packet_append_chunk(packet, abort);

                sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
                        SCTP_PACKET(packet));

                SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
        }

        SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);

discard:
        /* The offending packet is dropped on every non-error path. */
        sctp_sf_pdiscard(net, ep, asoc, SCTP_ST_CHUNK(0), arg, commands);
        return SCTP_DISPOSITION_ABORT;

nomem_pkt:
        /* The ABORT was never queued, so release it here. */
        sctp_chunk_free(abort);
nomem:
        return SCTP_DISPOSITION_NOMEM;
}

/*
 * Handle a protocol violation when the chunk length is invalid.
 * "Invalid" length is identified as smaller than the minimal length a
 * given chunk can be.  For example, a SACK chunk has invalid length
 * if its length is set to be smaller than the size of struct sctp_sack_chunk.
 *
 * We inform the other end by sending an ABORT with a Protocol Violation
 * error code.
 *
 * Section: Not specified
 * Verification Tag:  Nothing to do
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Outputs
 * (reply_msg, msg_up, counters)
 *
 * Generate an ABORT chunk and terminate the association.  This is a thin
 * wrapper that passes a fixed explanatory string, including its
 * terminating NUL, to sctp_sf_abort_violation().
 */
static enum sctp_disposition sctp_sf_violation_chunklen(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        static const char reason[] = "The following chunk had invalid length:";

        return sctp_sf_abort_violation(net, ep, asoc, arg, commands,
                                       reason, sizeof(reason));
}

/*
 * Handle a protocol violation when the parameter length is invalid.
 * If the length is smaller than the minimum length of a given parameter,
 * or accumulated length in multi parameters exceeds the end of the chunk,
 * the length is considered as invalid.
 */
static enum sctp_disposition sctp_sf_violation_paramlen(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg, void *ext,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *bad_chunk = arg;
        struct sctp_paramhdr *bad_param = ext;
        struct sctp_chunk *reply;

        /* When sctp_auth_recv_cid() reports true for ABORT, no reply is
         * built and nothing is torn down: the packet is only discarded
         * below.
         */
        if (!sctp_auth_recv_cid(SCTP_CID_ABORT, asoc)) {
                /* Build the ABORT describing the offending parameter. */
                reply = sctp_make_violation_paramlen(asoc, bad_chunk,
                                                     bad_param);
                if (!reply)
                        return SCTP_DISPOSITION_NOMEM;

                sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
                SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);

                /* Report ECONNABORTED on the socket and fail the
                 * association with a protocol-violation cause.
                 */
                sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                                SCTP_ERROR(ECONNABORTED));
                sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
                                SCTP_PERR(SCTP_ERROR_PROTO_VIOLATION));
                SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
                SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
        }

        /* Both paths end by discarding the packet. */
        sctp_sf_pdiscard(net, ep, asoc, SCTP_ST_CHUNK(0), arg, commands);
        return SCTP_DISPOSITION_ABORT;
}

/* Handle a protocol violation when the peer is trying to advance the
 * cumulative tsn ack to a point beyond the max tsn currently sent.
 *
 * We inform the other end by sending an ABORT with a Protocol Violation
 * error code.
 */
static enum sctp_disposition sctp_sf_violation_ctsn(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        static const char err_str[] = "The cumulative tsn ack beyond the max tsn currently sent:";

        return sctp_sf_abort_violation(net, ep, asoc, arg, commands, err_str,
                                       sizeof(err_str));
}

/* Handle protocol violation of an invalid chunk bundling.  For example,
 * when we have an association and we receive bundled INIT-ACK, or
 * SHUTDOWN-COMPLETE, our peer is clearly violating the "MUST NOT bundle"
 * statement from the specs.  Additionally, there might be an attacker
 * on the path and we may not want to continue this communication.
 */
static enum sctp_disposition sctp_sf_violation_chunk(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        static const char err_str[] = "The following chunk violates protocol:";

        return sctp_sf_abort_violation(net, ep, asoc, arg, commands, err_str,
                                       sizeof(err_str));
}
/***************************************************************************
 * These are the state functions for handling primitive (Section 10) events.
 ***************************************************************************/
/*
 * sctp_sf_do_prm_asoc
 *
 * Section: 10.1 ULP-to-SCTP
 * B) Associate
 *
 * Format: ASSOCIATE(local SCTP instance name, destination transport addr,
 * outbound stream count)
 * -> association id [,destination transport addr list] [,outbound stream
 * count]
 *
 * This primitive allows the upper layer to initiate an association to a
 * specific peer endpoint.
 *
 * The peer endpoint shall be specified by one of the transport addresses
 * which defines the endpoint (see Section 1.4).  If the local SCTP
 * instance has not been initialized, the ASSOCIATE is considered an
 * error.
 * [This is not relevant for the kernel implementation since we do all
 * initialization at boot time.  If we hadn't initialized we wouldn't
 * get anywhere near this code.]
 *
 * An association id, which is a local handle to the SCTP association,
 * will be returned on successful establishment of the association. If
 * SCTP is not able to open an SCTP association with the peer endpoint,
 * an error is returned.
 * [In the kernel implementation, the struct sctp_association needs to
 * be created BEFORE causing this primitive to run.]
 *
 * Other association parameters may be returned, including the
 * complete destination transport addresses of the peer as well as the
 * outbound stream count of the local endpoint. One of the transport
 * address from the returned destination addresses will be selected by
 * the local endpoint as default primary path for sending SCTP packets
 * to this peer.  The returned "destination transport addr list" can
 * be used by the ULP to change the default primary path or to force
 * sending a packet to a specific transport address.  [All of this
 * stuff happens when the INIT ACK arrives.  This is a NON-BLOCKING
 * function.]
 *
 * Mandatory attributes:
 *
 * o local SCTP instance name - obtained from the INITIALIZE operation.
 *   [This is the argument asoc.]
 * o destination transport addr - specified as one of the transport
 * addresses of the peer endpoint with which the association is to be
 * established.
 *  [This is asoc->peer.active_path.]
 * o outbound stream count - the number of outbound streams the ULP
 * would like to open towards this peer endpoint.
 * [BUG: This is not currently implemented.]
 * Optional attributes:
 *
 * None.
 *
 * The return value is a disposition.
 */
enum sctp_disposition sctp_sf_do_prm_asoc(struct net *net,
                                          const struct sctp_endpoint *ep,
                                          const struct sctp_association *asoc,
                                          const union sctp_subtype type,
                                          void *arg,
                                          struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *init;

        /* The RFC text quoted further down has the endpoint enter
         * COOKIE-WAIT after the INIT is sent; this implementation queues
         * the state change first because the RFC ordering does not work
         * here.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_COOKIE_WAIT));

        /* RFC 2960 5.1 Normal Establishment of an Association
         *
         * A) "A" first sends an INIT chunk to "Z".  In the INIT, "A"
         * must provide its Verification Tag (Tag_A) in the Initiate
         * Tag field.  Tag_A SHOULD be a random number in the range of
         * 1 to 4294967295 (see 5.3.1 for Tag value selection). ...
         */
        init = sctp_make_init(asoc, &asoc->base.bind_addr, GFP_ATOMIC, 0);
        if (!init)
                return SCTP_DISPOSITION_NOMEM;

        /* Pick the transport the INIT will go out on. */
        sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
                        SCTP_CHUNK(init));

        /* The association is handed to a side effect command, which needs
         * a non-const pointer, so the const qualifier is dropped here.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC,
                        SCTP_ASOC((struct sctp_association *)asoc));

        /* Start the T1-init timer, then queue the INIT itself. */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(init));

        return SCTP_DISPOSITION_CONSUME;
}

/*
 * Process the SEND primitive.
 *
 * Section: 10.1 ULP-to-SCTP
 * E) Send
 *
 * Format: SEND(association id, buffer address, byte count [,context]
 *         [,stream id] [,life time] [,destination transport address]
 *         [,unorder flag] [,no-bundle flag] [,payload protocol-id] )
 * -> result
 *
 * This is the main method to send user data via SCTP.
 *
 * Mandatory attributes:
 *
 *  o association id - local handle to the SCTP association
 *
 *  o buffer address - the location where the user message to be
 *    transmitted is stored;
 *
 *  o byte count - The size of the user data in number of bytes;
 *
 * Optional attributes:
 *
 *  o context - an optional 32 bit integer that will be carried in the
 *    sending failure notification to the ULP if the transportation of
 *    this User Message fails.
 *
 *  o stream id - to indicate which stream to send the data on. If not
 *    specified, stream 0 will be used.
 *
 *  o life time - specifies the life time of the user data. The user data
 *    will not be sent by SCTP after the life time expires. This
 *    parameter can be used to avoid efforts to transmit stale
 *    user messages. SCTP notifies the ULP if the data cannot be
 *    initiated to transport (i.e. sent to the destination via SCTP's
 *    send primitive) within the life time variable. However, the
 *    user data will be transmitted if SCTP has attempted to transmit a
 *    chunk before the life time expired.
 *
 *  o destination transport address - specified as one of the destination
 *    transport addresses of the peer endpoint to which this packet
 *    should be sent. Whenever possible, SCTP should use this destination
 *    transport address for sending the packets, instead of the current
 *    primary path.
 *
 *  o unorder flag - this flag, if present, indicates that the user
 *    would like the data delivered in an unordered fashion to the peer
 *    (i.e., the U flag is set to 1 on all DATA chunks carrying this
 *    message).
 *
 *  o no-bundle flag - instructs SCTP not to bundle this user data with
 *    other outbound DATA chunks. SCTP MAY still bundle even when
 *    this flag is present, when faced with network congestion.
 *
 *  o payload protocol-id - A 32 bit unsigned integer that is to be
 *    passed to the peer indicating the type of payload protocol data
 *    being transmitted. This value is passed as opaque data by SCTP.
 *
 * The return value is the disposition.
 */
enum sctp_disposition sctp_sf_do_prm_send(struct net *net,
                                          const struct sctp_endpoint *ep,
                                          const struct sctp_association *asoc,
                                          const union sctp_subtype type,
                                          void *arg,
                                          struct sctp_cmd_seq *commands)
{
        /* The primitive's argument is the message to transmit. */
        struct sctp_datamsg *datamsg = arg;

        /* Nothing is sent here; the message is only queued as a
         * SEND_MSG side effect command.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_SEND_MSG,
                        SCTP_DATAMSG(datamsg));

        return SCTP_DISPOSITION_CONSUME;
}

/*
 * Process the SHUTDOWN primitive.
 *
 * Section: 10.1:
 * C) Shutdown
 *
 * Format: SHUTDOWN(association id)
 * -> result
 *
 * Gracefully closes an association. Any locally queued user data
 * will be delivered to the peer. The association will be terminated only
 * after the peer acknowledges all the SCTP packets sent.  A success code
 * will be returned on successful termination of the association. If
 * attempting to terminate the association results in a failure, an error
 * code shall be returned.
 *
 * Mandatory attributes:
 *
 *  o association id - local handle to the SCTP association
 *
 * Optional attributes:
 *
 * None.
 *
 * The return value is the disposition.
 */
enum sctp_disposition sctp_sf_do_9_2_prm_shutdown(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        /* From 9.2 Shutdown of an Association
         * Upon receipt of the SHUTDOWN primitive from its upper
         * layer, the endpoint enters SHUTDOWN-PENDING state and
         * remains there until all outstanding data has been
         * acknowledged by its peer. The endpoint accepts no new data
         * from its upper layer, but retransmits data to the far end
         * if necessary to fill gaps.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_SHUTDOWN_PENDING));

        /* While the outqueue still holds data, only the state change
         * above is queued.
         */
        if (!sctp_outq_is_empty(&asoc->outqueue))
                return SCTP_DISPOSITION_CONSUME;

        /* Outqueue is empty: start the shutdown right away and return
         * whatever disposition that handler produces.
         */
        return sctp_sf_do_9_2_start_shutdown(net, ep, asoc, type,
                                             arg, commands);
}

/*
 * Process the ABORT primitive.
 *
 * Section: 10.1:
 * D) Abort
 *
 * Format: Abort(association id [, cause code])
 * -> result
 *
 * Ungracefully closes an association. Any locally queued user data
 * will be discarded and an ABORT chunk is sent to the peer.  A success code
 * will be returned on successful abortion of the association. If
 * attempting to abort the association results in a failure, an error
 * code shall be returned.
 *
 * Mandatory attributes:
 *
 *  o association id - local handle to the SCTP association
 *
 * Optional attributes:
 *
 *  o cause code - reason of the abort to be passed to the peer
 *
 * The return value is the disposition.
 */
enum sctp_disposition sctp_sf_do_9_1_prm_abort(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        /* The primitive's argument is the ABORT chunk to send; it may be
         * NULL, in which case no reply is queued.
         */
        struct sctp_chunk *abort_chunk = arg;

        if (abort_chunk)
                sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                SCTP_CHUNK(abort_chunk));

        /* Even if we can't send the ABORT due to low memory delete the
         * TCB.  This is a departure from our typical NOMEM handling.
         */

        /* Report ECONNABORTED on the socket. */
        sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                        SCTP_ERROR(ECONNABORTED));

        /* Fail the established association with a user-abort cause. */
        sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
                        SCTP_PERR(SCTP_ERROR_USER_ABORT));

        /* One more abort, one less currently established association. */
        SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
        SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);

        return SCTP_DISPOSITION_ABORT;
}

/* We tried an illegal operation on an association which is closed.  */
enum sctp_disposition sctp_sf_error_closed(struct net *net,
                                           const struct sctp_endpoint *ep,
                                           const struct sctp_association *asoc,
                                           const union sctp_subtype type,
                                           void *arg,
                                           struct sctp_cmd_seq *commands)
{
        /* Error code reported through the REPORT_ERROR command. */
        const int closed_err = -EINVAL;

        sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_ERROR,
                        SCTP_ERROR(closed_err));

        return SCTP_DISPOSITION_CONSUME;
}

/* We tried an illegal operation on an association which is shutting
 * down.
 */
enum sctp_disposition sctp_sf_error_shutdown(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        /* Error code reported through the REPORT_ERROR command. */
        const int shutdown_err = -ESHUTDOWN;

        sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_ERROR,
                        SCTP_ERROR(shutdown_err));

        return SCTP_DISPOSITION_CONSUME;
}

/*
 * sctp_sf_cookie_wait_prm_shutdown
 *
 * Section: 4 Note: 2
 * Verification Tag:
 * Inputs
 * (endpoint, asoc)
 *
 * The RFC does not explicitly address this issue, but is the route through the
 * state table when someone issues a shutdown while in COOKIE_WAIT state.
 *
 * Outputs
 * (timers)
 */
enum sctp_disposition sctp_sf_cookie_wait_prm_shutdown(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        /* Stop the T1-init timer. */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));

        /* Go to CLOSED and then delete the TCB, in that order. */
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_CLOSED));
        sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL());

        /* Counted as a shutdown; the counter does not touch the command
         * sequence.
         */
        SCTP_INC_STATS(net, SCTP_MIB_SHUTDOWNS);

        return SCTP_DISPOSITION_DELETE_TCB;
}

/*
 * sctp_sf_cookie_echoed_prm_shutdown
 *
 * Section: 4 Note: 2
 * Verification Tag:
 * Inputs
 * (endpoint, asoc)
 *
 * The RFC does not explicitly address this issue, but is the route through the
 * state table when someone issues a shutdown while in COOKIE_ECHOED state.
 *
 * Outputs
 * (timers)
 */
enum sctp_disposition sctp_sf_cookie_echoed_prm_shutdown(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        /* There is a single T1 timer, so we should be able to use
         * common function with the COOKIE-WAIT state.
         */
        return sctp_sf_cookie_wait_prm_shutdown(net, ep, asoc, type, arg, commands);
}

/*
 * sctp_sf_cookie_wait_prm_abort
 *
 * Section: 4 Note: 2
 * Verification Tag:
 * Inputs
 * (endpoint, asoc)
 *
 * The RFC does not explicitly address this issue, but is the route through the
 * state table when someone issues an abort while in COOKIE_WAIT state.
 *
 * Outputs
 * (timers)
 */
enum sctp_disposition sctp_sf_cookie_wait_prm_abort(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        /* The primitive's argument is the ABORT chunk to send; it may be
         * NULL, in which case no reply is queued.
         */
        struct sctp_chunk *abort_chunk = arg;

        /* Stop the T1-init timer. */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));

        if (abort_chunk)
                sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                SCTP_CHUNK(abort_chunk));

        /* Move to CLOSED and count the abort. */
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_CLOSED));

        SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);

        /* Even if we can't send the ABORT due to low memory delete the
         * TCB.  This is a departure from our typical NOMEM handling.
         */

        /* Report ECONNREFUSED on the socket. */
        sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                        SCTP_ERROR(ECONNREFUSED));

        /* Fail the association setup with a user-abort cause. */
        sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
                        SCTP_PERR(SCTP_ERROR_USER_ABORT));

        return SCTP_DISPOSITION_ABORT;
}

/*
 * sctp_sf_cookie_echoed_prm_abort
 *
 * Section: 4 Note: 3
 * Verification Tag:
 * Inputs
 * (endpoint, asoc)
 *
 * The RFC does not explicitly address this issue, but is the route through the
 * state table when someone issues an abort while in COOKIE_ECHOED state.
 *
 * Outputs
 * (timers)
 */
enum sctp_disposition sctp_sf_cookie_echoed_prm_abort(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        /* There is a single T1 timer, so we should be able to use
         * common function with the COOKIE-WAIT state.
         */
        return sctp_sf_cookie_wait_prm_abort(net, ep, asoc, type, arg, commands);
}

/*
 * sctp_sf_shutdown_pending_prm_abort
 *
 * Inputs
 * (endpoint, asoc)
 *
 * The RFC does not explicitly address this issue, but is the route through the
 * state table when someone issues an abort while in SHUTDOWN-PENDING state.
 *
 * Outputs
 * (timers)
 */
enum sctp_disposition sctp_sf_shutdown_pending_prm_abort(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        /* Stop the T5-shutdown guard timer.  */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));

        return sctp_sf_do_9_1_prm_abort(net, ep, asoc, type, arg, commands);
}

/*
 * sctp_sf_shutdown_sent_prm_abort
 *
 * Inputs
 * (endpoint, asoc)
 *
 * The RFC does not explicitly address this issue, but is the route through the
 * state table when someone issues an abort while in SHUTDOWN-SENT state.
 *
 * Outputs
 * (timers)
 */
enum sctp_disposition sctp_sf_shutdown_sent_prm_abort(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        /* Stop the T2-shutdown timer.  */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));

        /* Stop the T5-shutdown guard timer.  */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));

        return sctp_sf_do_9_1_prm_abort(net, ep, asoc, type, arg, commands);
}

/*
 * sctp_sf_shutdown_ack_sent_prm_abort
 *
 * Inputs
 * (endpoint, asoc)
 *
 * The RFC does not explicitly address this issue, but is the route through the
 * state table when someone issues an abort while in SHUTDOWN-ACK-SENT state.
 *
 * Outputs
 * (timers)
 */
enum sctp_disposition sctp_sf_shutdown_ack_sent_prm_abort(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        /* The same T2 timer, so we should be able to use
         * common function with the SHUTDOWN-SENT state.
         */
        return sctp_sf_shutdown_sent_prm_abort(net, ep, asoc, type, arg, commands);
}

/*
 * Process the REQUESTHEARTBEAT primitive
 *
 * 10.1 ULP-to-SCTP
 * J) Request Heartbeat
 *
 * Format: REQUESTHEARTBEAT(association id, destination transport address)
 *
 * -> result
 *
 * Instructs the local endpoint to perform a HeartBeat on the specified
 * destination transport address of the given association. The returned
 * result should indicate whether the transmission of the HEARTBEAT
 * chunk to the destination address is successful.
 *
 * Mandatory attributes:
 *
 * o association id - local handle to the SCTP association
 *
 * o destination transport address - the transport address of the
 *   association on which a heartbeat should be issued.
 */
enum sctp_disposition sctp_sf_do_prm_requestheartbeat(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        if (SCTP_DISPOSITION_NOMEM == sctp_sf_heartbeat(ep, asoc, type,
                                      (struct sctp_transport *)arg, commands))
                return SCTP_DISPOSITION_NOMEM;

        /*
         * RFC 2960 (bis), section 8.3
         *
         *    D) Request an on-demand HEARTBEAT on a specific destination
         *    transport address of a given association.
         *
         *    The endpoint should increment the respective error  counter of
         *    the destination transport address each time a HEARTBEAT is sent
         *    to that address and not acknowledged within one RTO.
         *
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT,
                        SCTP_TRANSPORT(arg));
        return SCTP_DISPOSITION_CONSUME;
}

/*
 * ADDIP Section 4.1 ASCONF Chunk Procedures
 * When an endpoint has an ASCONF signaled change to be sent to the
 * remote endpoint it should do A1 to A9
 */
enum sctp_disposition sctp_sf_do_prm_asconf(struct net *net,
                                            const struct sctp_endpoint *ep,
                                            const struct sctp_association *asoc,
                                            const union sctp_subtype type,
                                            void *arg,
                                            struct sctp_cmd_seq *commands)
{
        /* The primitive's argument is the chunk to send. */
        struct sctp_chunk *asconf = arg;

        /* Queue T4 setup for this chunk, start the T4-RTO timer ... */
        sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T4, SCTP_CHUNK(asconf));
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));

        /* ... and finally queue the chunk itself as the reply. */
        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(asconf));

        return SCTP_DISPOSITION_CONSUME;
}

/* RE-CONFIG Section 5.1 RECONF Chunk Procedures */
enum sctp_disposition sctp_sf_do_prm_reconf(struct net *net,
                                            const struct sctp_endpoint *ep,
                                            const struct sctp_association *asoc,
                                            const union sctp_subtype type,
                                            void *arg,
                                            struct sctp_cmd_seq *commands)
{
        /* The primitive's argument is the chunk to send. */
        struct sctp_chunk *reconf = arg;

        /* The only work here is queueing the chunk as a reply. */
        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reconf));

        return SCTP_DISPOSITION_CONSUME;
}

/*
 * Ignore the primitive event
 *
 * The return value is the disposition of the primitive.
 */
enum sctp_disposition sctp_sf_ignore_primitive(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        /* No command is queued; the primitive is only logged at debug
         * level and then dropped.
         */
        pr_debug("%s: primitive type:%d is ignored\n",
                 __func__, type.primitive);

        return SCTP_DISPOSITION_DISCARD;
}

/***************************************************************************
 * These are the state functions for the OTHER events.
 ***************************************************************************/

/*
 * When the SCTP stack has no more user data to send or retransmit, this
 * notification is given to the user. Also, at the time when a user app
 * subscribes to this event, if there is no data to be sent or
 * retransmit, the stack will immediately send up this notification.
 */
enum sctp_disposition sctp_sf_do_no_pending_tsn(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_ulpevent *dry_event;

        /* Build the sender-dry notification with an atomic allocation. */
        dry_event = sctp_ulpevent_make_sender_dry_event(asoc, GFP_ATOMIC);
        if (dry_event) {
                /* Queue the event for delivery to the upper layer. */
                sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
                                SCTP_ULPEVENT(dry_event));
                return SCTP_DISPOSITION_CONSUME;
        }

        /* The event could not be created. */
        return SCTP_DISPOSITION_NOMEM;
}

/*
 * Start the shutdown negotiation.
 *
 * From Section 9.2:
 * Once all its outstanding data has been acknowledged, the endpoint
 * shall send a SHUTDOWN chunk to its peer including in the Cumulative
 * TSN Ack field the last sequential TSN it has received from the peer.
 * It shall then start the T2-shutdown timer and enter the SHUTDOWN-SENT
 * state. If the timer expires, the endpoint must re-send the SHUTDOWN
 * with the updated last sequential TSN received from its peer.
 *
 * @arg is passed straight through to sctp_make_shutdown(); all other
 * work is expressed as commands appended to @commands.
 *
 * The return value is the disposition: SCTP_DISPOSITION_NOMEM if the
 * SHUTDOWN chunk cannot be created, SCTP_DISPOSITION_CONSUME otherwise.
 */
enum sctp_disposition sctp_sf_do_9_2_start_shutdown(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *shutdown;

        /* Build the SHUTDOWN chunk first; without it nothing else below
         * may be queued.
         */
        shutdown = sctp_make_shutdown(asoc, arg);
        if (!shutdown)
                return SCTP_DISPOSITION_NOMEM;

        /* Pick the transport for the SHUTDOWN chunk and set up the
         * timeout of the T2-shutdown timer.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(shutdown));

        /* Section 9.2: "It shall then start the T2-shutdown timer". */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));

        /* RFC 4960 Section 9.2
         * The sender of the SHUTDOWN MAY also start an overall guard timer
         * 'T5-shutdown-guard' to bound the overall time for shutdown sequence.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));

        /* Only stop the autoclose timer when a timeout is configured. */
        if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE])
                sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                                SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));

        /* Section 9.2: "... and enter the SHUTDOWN-SENT state". */
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_SHUTDOWN_SENT));

        /* sctp-implguide 2.10 Issues with Heartbeating and failover
         *
         * HEARTBEAT ... is discontinued after sending either SHUTDOWN
         * or SHUTDOWN-ACK.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_STOP, SCTP_NULL());

        /* Finally queue the SHUTDOWN chunk itself. */
        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(shutdown));

        return SCTP_DISPOSITION_CONSUME;
}

/*
 * Generate a SHUTDOWN ACK now that everything is SACK'd.
 *
 * From Section 9.2:
 *
 * If it has no more outstanding DATA chunks, the SHUTDOWN receiver
 * shall send a SHUTDOWN ACK and start a T2-shutdown timer of its own,
 * entering the SHUTDOWN-ACK-SENT state. If the timer expires, the
 * endpoint must re-send the SHUTDOWN ACK.
 *
 * The return value is the disposition.
 */
enum sctp_disposition sctp_sf_do_9_2_shutdown_ack(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = arg;
        struct sctp_chunk *reply;

        /* There are 2 ways of getting here:
         *    1) called in response to a SHUTDOWN chunk
         *    2) called when SCTP_EVENT_NO_PENDING_TSN event is issued.
         *
         * For the case (2), the arg parameter is set to NULL.  We need
         * to check that we have a chunk before accessing it's fields.
         */
        if (chunk) {
                if (!sctp_vtag_verify(chunk, asoc))
                        return sctp_sf_pdiscard(net, ep, asoc, type, arg,
                                                commands);

                /* Make sure that the SHUTDOWN chunk has a valid length. */
                if (!sctp_chunk_length_valid(
                                chunk, sizeof(struct sctp_shutdown_chunk)))
                        return sctp_sf_violation_chunklen(net, ep, asoc, type,
                                                          arg, commands);
        }

        /* If it has no more outstanding DATA chunks, the SHUTDOWN receiver
         * shall send a SHUTDOWN ACK ...
         */
        reply = sctp_make_shutdown_ack(asoc, chunk);
        if (!reply)
                goto nomem;

        /* Set the transport for the SHUTDOWN ACK chunk and the timeout for
         * the T2-shutdown timer.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply));

        /* and start/restart a T2-shutdown timer of its own, */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));

        if (asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE])
                sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                                SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE));

        /* Enter the SHUTDOWN-ACK-SENT state.  */
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_SHUTDOWN_ACK_SENT));

        /* sctp-implguide 2.10 Issues with Heartbeating and failover
         *
         * HEARTBEAT ... is discontinued after sending either SHUTDOWN
         * or SHUTDOWN-ACK.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_STOP, SCTP_NULL());

        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));

        return SCTP_DISPOSITION_CONSUME;

nomem:
        return SCTP_DISPOSITION_NOMEM;
}

/*
 * Ignore an event of the "other" class.
 *
 * Only emits a debug message naming the ignored event type; no command
 * is added to @commands.
 *
 * The return value is the disposition of the event, always
 * SCTP_DISPOSITION_DISCARD.
 */
enum sctp_disposition sctp_sf_ignore_other(struct net *net,
                                           const struct sctp_endpoint *ep,
                                           const struct sctp_association *asoc,
                                           const union sctp_subtype type,
                                           void *arg,
                                           struct sctp_cmd_seq *commands)
{
        const int ignored = type.other;

        pr_debug("%s: the event other type:%d is ignored\n", __func__,
                 ignored);

        return SCTP_DISPOSITION_DISCARD;
}

/************************************************************
 * These are the state functions for handling timeout events.
 ************************************************************/

/*
 * RTX Timeout
 *
 * Section: 6.3.3 Handle T3-rtx Expiration
 *
 * Whenever the retransmission timer T3-rtx expires for a destination
 * address, do the following:
 * [See below]
 *
 * @arg is the transport whose T3-rtx timer expired.
 *
 * The return value is the disposition of the chunk:
 * SCTP_DISPOSITION_DELETE_TCB when the association is failed because
 * the error count reached max_retrans, SCTP_DISPOSITION_CONSUME
 * otherwise.
 */
enum sctp_disposition sctp_sf_do_6_3_3_rtx(struct net *net,
                                           const struct sctp_endpoint *ep,
                                           const struct sctp_association *asoc,
                                           const union sctp_subtype type,
                                           void *arg,
                                           struct sctp_cmd_seq *commands)
{
        struct sctp_transport *expired_tp = arg;

        SCTP_INC_STATS(net, SCTP_MIB_T3_RTX_EXPIREDS);

        if (asoc->overall_error_count >= asoc->max_retrans) {
                /* Unless the peer announced a zero window while we are
                 * in SHUTDOWN-PENDING, the association is given up.
                 */
                if (!asoc->peer.zero_window_announced ||
                    asoc->state != SCTP_STATE_SHUTDOWN_PENDING) {
                        sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                                        SCTP_ERROR(ETIMEDOUT));
                        /* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */
                        sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
                                        SCTP_PERR(SCTP_ERROR_NO_ERROR));
                        SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
                        SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
                        return SCTP_DISPOSITION_DELETE_TCB;
                }

                /*
                 * We are here likely because the receiver had its rwnd
                 * closed for a while and we have not been able to
                 * transmit the locally queued data within the maximum
                 * retransmission attempts limit.  Start the T5
                 * shutdown guard timer to give the receiver one last
                 * chance and some additional time to recover before
                 * aborting.
                 */
                sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START_ONCE,
                                SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD));
        }

        /* E1) For the destination address for which the timer
         * expires, adjust its ssthresh with rules defined in Section
         * 7.2.3 and set the cwnd <- MTU.
         */

        /* E2) For the destination address for which the timer
         * expires, set RTO <- RTO * 2 ("back off the timer").  The
         * maximum value discussed in rule C7 above (RTO.max) may be
         * used to provide an upper bound to this doubling operation.
         */

        /* E3) Determine how many of the earliest (i.e., lowest TSN)
         * outstanding DATA chunks for the address for which the
         * T3-rtx has expired will fit into a single packet, subject
         * to the MTU constraint for the path corresponding to the
         * destination transport address to which the retransmission
         * is being sent (this may be different from the address for
         * which the timer expires [see Section 6.4]).  Call this
         * value K. Bundle and retransmit those K DATA chunks in a
         * single packet to the destination endpoint.
         *
         * Note: Any DATA chunks that were sent to the address for
         * which the T3-rtx timer expired but did not fit in one MTU
         * (rule E3 above), should be marked for retransmission and
         * sent as soon as cwnd allows (normally when a SACK arrives).
         */

        /* Failure management (Section 8.2) for the expired transport. */
        sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE, SCTP_TRANSPORT(expired_tp));

        /* NB: Rules E4 and F1 are implicit in R1.  */
        sctp_add_cmd_sf(commands, SCTP_CMD_RETRAN, SCTP_TRANSPORT(expired_tp));

        return SCTP_DISPOSITION_CONSUME;
}

/*
 * Generate delayed SACK on timeout
 *
 * Section: 6.2  Acknowledgement on Reception of DATA Chunks
 *
 * The guidelines on delayed acknowledgement algorithm specified in
 * Section 4.2 of [RFC2581] SHOULD be followed.  Specifically, an
 * acknowledgement SHOULD be generated for at least every second packet
 * (not every second DATA chunk) received, and SHOULD be generated
 * within 200 ms of the arrival of any unacknowledged DATA chunk.  In
 * some situations it may be beneficial for an SCTP transmitter to be
 * more conservative than the algorithms detailed in this document
 * allow. However, an SCTP transmitter MUST NOT be more aggressive than
 * the following algorithms allow.
 *
 * Always returns SCTP_DISPOSITION_CONSUME.
 */
enum sctp_disposition sctp_sf_do_6_2_sack(struct net *net,
                                          const struct sctp_endpoint *ep,
                                          const struct sctp_association *asoc,
                                          const union sctp_subtype type,
                                          void *arg,
                                          struct sctp_cmd_seq *commands)
{
        /* Account for the expiry of the delayed-SACK timer. */
        SCTP_INC_STATS(net, SCTP_MIB_DELAY_SACK_EXPIREDS);

        /* Request SACK generation with the "force" argument. */
        sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE());

        return SCTP_DISPOSITION_CONSUME;
}

/*
 * sctp_sf_t1_init_timer_expire
 *
 * Section: 4 Note: 2
 * Verification Tag:
 * Inputs
 * (endpoint, asoc)
 *
 *  RFC 2960 Section 4 Notes
 *  2) If the T1-init timer expires, the endpoint MUST retransmit INIT
 *     and re-start the T1-init timer without changing state.  This MUST
 *     be repeated up to 'Max.Init.Retransmits' times.  After that, the
 *     endpoint MUST abort the initialization process and report the
 *     error to SCTP user.
 *
 * Outputs
 * (timers, events)
 *
 */
enum sctp_disposition sctp_sf_t1_init_timer_expire(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        int attempts = asoc->init_err_counter + 1;
        struct sctp_chunk *repl = NULL;
        struct sctp_bind_addr *bp;

        pr_debug("%s: timer T1 expired (INIT)\n", __func__);

        SCTP_INC_STATS(net, SCTP_MIB_T1_INIT_EXPIREDS);

        if (attempts <= asoc->max_init_attempts) {
                bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
                repl = sctp_make_init(asoc, bp, GFP_ATOMIC, 0);
                if (!repl)
                        return SCTP_DISPOSITION_NOMEM;

                /* Choose transport for INIT. */
                sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
                                SCTP_CHUNK(repl));

                /* Issue a sideeffect to do the needed accounting. */
                sctp_add_cmd_sf(commands, SCTP_CMD_INIT_RESTART,
                                SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));

                sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
        } else {
                pr_debug("%s: giving up on INIT, attempts:%d "
                         "max_init_attempts:%d\n", __func__, attempts,
                         asoc->max_init_attempts);

                sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                                SCTP_ERROR(ETIMEDOUT));
                sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
                                SCTP_PERR(SCTP_ERROR_NO_ERROR));
                return SCTP_DISPOSITION_DELETE_TCB;
        }

        return SCTP_DISPOSITION_CONSUME;
}

/*
 * sctp_sf_t1_cookie_timer_expire
 *
 * Section: 4 Note: 2
 * Verification Tag:
 * Inputs
 * (endpoint, asoc)
 *
 *  RFC 2960 Section 4 Notes
 *  3) If the T1-cookie timer expires, the endpoint MUST retransmit
 *     COOKIE ECHO and re-start the T1-cookie timer without changing
 *     state.  This MUST be repeated up to 'Max.Init.Retransmits' times.
 *     After that, the endpoint MUST abort the initialization process and
 *     report the error to SCTP user.
 *
 * Outputs
 * (timers, events)
 *
 * Returns SCTP_DISPOSITION_DELETE_TCB once the attempt count exceeds
 * max_init_attempts, SCTP_DISPOSITION_NOMEM if a new COOKIE ECHO cannot
 * be created, and SCTP_DISPOSITION_CONSUME when it is re-queued.
 */
enum sctp_disposition sctp_sf_t1_cookie_timer_expire(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        int attempts = asoc->init_err_counter + 1;
        struct sctp_chunk *cookie_echo;

        pr_debug("%s: timer T1 expired (COOKIE-ECHO)\n", __func__);

        SCTP_INC_STATS(net, SCTP_MIB_T1_COOKIE_EXPIREDS);

        /* Out of attempts: report the failure and tear the TCB down. */
        if (attempts > asoc->max_init_attempts) {
                sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                                SCTP_ERROR(ETIMEDOUT));
                sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
                                SCTP_PERR(SCTP_ERROR_NO_ERROR));
                return SCTP_DISPOSITION_DELETE_TCB;
        }

        cookie_echo = sctp_make_cookie_echo(asoc, NULL);
        if (!cookie_echo)
                return SCTP_DISPOSITION_NOMEM;

        sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
                        SCTP_CHUNK(cookie_echo));

        /* Issue a sideeffect to do the needed accounting. */
        sctp_add_cmd_sf(commands, SCTP_CMD_COOKIEECHO_RESTART,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));

        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(cookie_echo));

        return SCTP_DISPOSITION_CONSUME;
}

/* RFC2960 9.2 If the timer expires, the endpoint must re-send the SHUTDOWN
 * with the updated last sequential TSN received from its peer.
 *
 * An endpoint should limit the number of retransmission of the
 * SHUTDOWN chunk to the protocol parameter 'Association.Max.Retrans'.
 * If this threshold is exceeded the endpoint should destroy the TCB and
 * MUST report the peer endpoint unreachable to the upper layer (and
 * thus the association enters the CLOSED state).  The reception of any
 * packet from its peer (i.e. as the peer sends all of its queued DATA
 * chunks) should clear the endpoint's retransmission count and restart
 * the T2-Shutdown timer,  giving its peer ample opportunity to transmit
 * all of its queued DATA chunks that have not yet been sent.
 */
enum sctp_disposition sctp_sf_t2_timer_expire(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *reply = NULL;

        pr_debug("%s: timer T2 expired\n", __func__);

        SCTP_INC_STATS(net, SCTP_MIB_T2_SHUTDOWN_EXPIREDS);

        ((struct sctp_association *)asoc)->shutdown_retries++;

        if (asoc->overall_error_count >= asoc->max_retrans) {
                sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                                SCTP_ERROR(ETIMEDOUT));
                /* Note:  CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */
                sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
                                SCTP_PERR(SCTP_ERROR_NO_ERROR));
                SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
                SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
                return SCTP_DISPOSITION_DELETE_TCB;
        }

        switch (asoc->state) {
        case SCTP_STATE_SHUTDOWN_SENT:
                reply = sctp_make_shutdown(asoc, NULL);
                break;

        case SCTP_STATE_SHUTDOWN_ACK_SENT:
                reply = sctp_make_shutdown_ack(asoc, NULL);
                break;

        default:
                BUG();
                break;
        }

        if (!reply)
                goto nomem;

        /* Do some failure management (Section 8.2).
         * If we remove the transport an SHUTDOWN was last sent to, don't
         * do failure management.
         */
        if (asoc->shutdown_last_sent_to)
                sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE,
                                SCTP_TRANSPORT(asoc->shutdown_last_sent_to));

        /* Set the transport for the SHUTDOWN/ACK chunk and the timeout for
         * the T2-shutdown timer.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply));

        /* Restart the T2-shutdown timer.  */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
        return SCTP_DISPOSITION_CONSUME;

nomem:
        return SCTP_DISPOSITION_NOMEM;
}

/*
 * ADDIP Section 4.1 ASCONF Chunk Procedures
 * If the T4 RTO timer expires the endpoint should do B1 to B5
 */
enum sctp_disposition sctp_sf_t4_timer_expire(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *chunk = asoc->addip_last_asconf;
        struct sctp_transport *transport = chunk->transport;

        SCTP_INC_STATS(net, SCTP_MIB_T4_RTO_EXPIREDS);

        /* ADDIP 4.1 B1) Increment the error counters and perform path failure
         * detection on the appropriate destination address as defined in
         * RFC2960 [5] section 8.1 and 8.2.
         */
        if (transport)
                sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE,
                                SCTP_TRANSPORT(transport));

        /* Reconfig T4 timer and transport. */
        sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T4, SCTP_CHUNK(chunk));

        /* ADDIP 4.1 B2) Increment the association error counters and perform
         * endpoint failure detection on the association as defined in
         * RFC2960 [5] section 8.1 and 8.2.
         * association error counter is incremented in SCTP_CMD_STRIKE.
         */
        if (asoc->overall_error_count >= asoc->max_retrans) {
                sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
                                SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
                sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                                SCTP_ERROR(ETIMEDOUT));
                sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
                                SCTP_PERR(SCTP_ERROR_NO_ERROR));
                SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
                SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
                return SCTP_DISPOSITION_ABORT;
        }

        /* ADDIP 4.1 B3) Back-off the destination address RTO value to which
         * the ASCONF chunk was sent by doubling the RTO timer value.
         * This is done in SCTP_CMD_STRIKE.
         */

        /* ADDIP 4.1 B4) Re-transmit the ASCONF Chunk last sent and if possible
         * choose an alternate destination address (please refer to RFC2960
         * [5] section 6.4.1). An endpoint MUST NOT add new parameters to this
         * chunk, it MUST be the same (including its serial number) as the last
         * ASCONF sent.
         */
        sctp_chunk_hold(asoc->addip_last_asconf);
        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                        SCTP_CHUNK(asoc->addip_last_asconf));

        /* ADDIP 4.1 B5) Restart the T-4 RTO timer. Note that if a different
         * destination is selected, then the RTO used will be that of the new
         * destination address.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
                        SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));

        return SCTP_DISPOSITION_CONSUME;
}

/* sctpimpguide-05 Section 2.12.2
 * The sender of the SHUTDOWN MAY also start an overall guard timer
 * 'T5-shutdown-guard' to bound the overall time for shutdown sequence.
 * At the expiration of this timer the sender SHOULD abort the association
 * by sending an ABORT chunk.
 *
 * Returns SCTP_DISPOSITION_NOMEM when the ABORT chunk cannot be created,
 * SCTP_DISPOSITION_DELETE_TCB otherwise.
 */
enum sctp_disposition sctp_sf_t5_timer_expire(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        struct sctp_chunk *abort_chunk;

        pr_debug("%s: timer T5 expired\n", __func__);

        SCTP_INC_STATS(net, SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS);

        abort_chunk = sctp_make_abort(asoc, NULL, 0);
        if (!abort_chunk)
                return SCTP_DISPOSITION_NOMEM;

        /* Queue the ABORT, then record the timeout error and fail the
         * association.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort_chunk));
        sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                        SCTP_ERROR(ETIMEDOUT));
        sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
                        SCTP_PERR(SCTP_ERROR_NO_ERROR));

        SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
        SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);

        return SCTP_DISPOSITION_DELETE_TCB;
}

/* Handle expiration of AUTOCLOSE timer.  When the autoclose timer expires,
 * the association is automatically closed by starting the shutdown process.
 * The work that needs to be done is same as when SHUTDOWN is initiated by
 * the user.  So this routine looks same as sctp_sf_do_9_2_prm_shutdown().
 *
 * Returns SCTP_DISPOSITION_CONSUME while the outqueue still holds data,
 * otherwise whatever sctp_sf_do_9_2_start_shutdown() returns.
 */
enum sctp_disposition sctp_sf_autoclose_timer_expire(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        SCTP_INC_STATS(net, SCTP_MIB_AUTOCLOSE_EXPIREDS);

        /* From 9.2 Shutdown of an Association
         * Upon receipt of the SHUTDOWN primitive from its upper
         * layer, the endpoint enters SHUTDOWN-PENDING state and
         * remains there until all outstanding data has been
         * acknowledged by its peer. The endpoint accepts no new data
         * from its upper layer, but retransmits data to the far end
         * if necessary to fill gaps.
         */
        sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
                        SCTP_STATE(SCTP_STATE_SHUTDOWN_PENDING));

        /* Data still queued: stay in SHUTDOWN-PENDING for now. */
        if (!sctp_outq_is_empty(&asoc->outqueue))
                return SCTP_DISPOSITION_CONSUME;

        /* Nothing left to send, so the SHUTDOWN can go out right away. */
        return sctp_sf_do_9_2_start_shutdown(net, ep, asoc, type,
                                             NULL, commands);
}

/*****************************************************************************
 * These are state functions which could apply to all types of events.
 ****************************************************************************/

/*
 * Handler for a state table entry that is not implemented.
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * None of the arguments are used.  The return value is the disposition
 * of the chunk, always SCTP_DISPOSITION_NOT_IMPL.
 */
enum sctp_disposition sctp_sf_not_impl(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        return SCTP_DISPOSITION_NOT_IMPL;
}

/*
 * Handler for a state table entry that represents a bug.
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * None of the arguments are used.  The return value is the disposition
 * of the chunk, always SCTP_DISPOSITION_BUG.
 */
enum sctp_disposition sctp_sf_bug(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const union sctp_subtype type,
                                        void *arg,
                                        struct sctp_cmd_seq *commands)
{
        return SCTP_DISPOSITION_BUG;
}

/*
 * This table entry represents the firing of a timer in the wrong state.
 * Since timer deletion cannot be guaranteed a timer 'may' end up firing
 * when the association is in the wrong state.   This event should
 * be ignored, so as to prevent any rearming of the timer.
 *
 * Inputs
 * (endpoint, asoc, chunk)
 *
 * Only a debug message is emitted; no command is added to @commands.
 * The return value is the disposition of the chunk, always
 * SCTP_DISPOSITION_CONSUME.
 */
enum sctp_disposition sctp_sf_timer_ignore(struct net *net,
                                           const struct sctp_endpoint *ep,
                                           const struct sctp_association *asoc,
                                           const union sctp_subtype type,
                                           void *arg,
                                           struct sctp_cmd_seq *commands)
{
        /* NOTE(review): the value is read through the 'chunk' member of
         * the subtype union although this handles a timer event -
         * presumably the members share storage; confirm.
         */
        const int ignored = type.chunk;

        pr_debug("%s: timer %d ignored\n", __func__, ignored);

        return SCTP_DISPOSITION_CONSUME;
}

/********************************************************************
 * 2nd Level Abstractions
 ********************************************************************/

/* Pull the SACK chunk based on the SACK header.
 *
 * The SACK header is read at the current skb data pointer.  The number
 * of bytes to pull is the header size plus sizeof(__u32) for every gap
 * ack block and duplicate TSN the header advertises.
 *
 * Returns the SACK header after pulling those bytes off the skb, or
 * NULL (skb untouched) when the advertised size exceeds the skb length.
 */
static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk)
{
        struct sctp_sackhdr *hdr = (struct sctp_sackhdr *) chunk->skb->data;
        __u16 gap_blocks = ntohs(hdr->num_gap_ack_blocks);
        __u16 dup_tsns = ntohs(hdr->num_dup_tsns);
        unsigned int pull_len = sizeof(struct sctp_sackhdr);

        pull_len += (gap_blocks + dup_tsns) * sizeof(__u32);

        /* Protect ourselves from reading too far into
         * the skb from a bogus sender.
         */
        if (pull_len <= chunk->skb->len) {
                skb_pull(chunk->skb, pull_len);
                return hdr;
        }

        return NULL;
}

/* Create an ABORT packet to be sent as a response, with the specified
 * error causes.
 *
 * @payload / @paylen describe the error causes appended to the ABORT
 * chunk.  Returns the new packet, or NULL when either the packet or
 * the ABORT chunk cannot be allocated (the packet is freed in the
 * latter case).
 */
static struct sctp_packet *sctp_abort_pkt_new(
                                        struct net *net,
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        struct sctp_chunk *chunk,
                                        const void *payload, size_t paylen)
{
        struct sctp_packet *pkt;
        struct sctp_chunk *abrt;

        pkt = sctp_ootb_pkt_new(net, asoc, chunk);
        if (!pkt)
                return NULL;

        /* Make an ABORT.
         * The T bit will be set if the asoc is NULL.
         */
        abrt = sctp_make_abort(asoc, chunk, paylen);
        if (!abrt) {
                sctp_ootb_pkt_free(pkt);
                return NULL;
        }

        /* Reflect vtag if T-Bit is set */
        if (sctp_test_T_bit(abrt))
                pkt->vtag = ntohl(chunk->sctp_hdr->vtag);

        /* Add specified error causes, i.e., payload, to the
         * end of the chunk.
         */
        sctp_addto_chunk(abrt, paylen, payload);

        /* Set the skb to the belonging sock for accounting.  */
        abrt->skb->sk = ep->base.sk;

        sctp_packet_append_chunk(pkt, abrt);

        return pkt;
}

/* Allocate a packet for responding in the OOTB conditions.
 *
 * A new transport is allocated (GFP_ATOMIC) towards the source of @chunk
 * and the returned packet is the one embedded in that transport, so it
 * must be released with sctp_ootb_pkt_free().  The reply ports are the
 * inbound ports swapped.
 *
 * Returns the initialised packet, or NULL if the transport allocation
 * failed.
 */
static struct sctp_packet *sctp_ootb_pkt_new(
                                        struct net *net,
                                        const struct sctp_association *asoc,
                                        const struct sctp_chunk *chunk)
{
        struct sctp_transport *tp;
        struct sctp_packet *pkt;
        __u16 sport, dport;
        __u32 vtag;

        /* The reply goes back the way the inbound packet came: its
         * destination port is our source port and vice versa.
         */
        sport = ntohs(chunk->sctp_hdr->dest);
        dport = ntohs(chunk->sctp_hdr->source);

        /* The V-tag is going to be the same as the inbound packet if no
         * association exists, otherwise, use the peer's vtag.
         */
        if (asoc) {
                /* Special case the INIT and INIT-ACK as there is no
                 * peer's vtag yet: take the initiate tag from the chunk.
                 */
                if (chunk->chunk_hdr->type == SCTP_CID_INIT ||
                    chunk->chunk_hdr->type == SCTP_CID_INIT_ACK) {
                        struct sctp_initack_chunk *initack;

                        initack = (struct sctp_initack_chunk *)chunk->chunk_hdr;
                        vtag = ntohl(initack->init_hdr.init_tag);
                } else {
                        vtag = asoc->peer.i.init_tag;
                }
        } else {
                /* Special case the INIT and stale COOKIE_ECHO as there is
                 * no vtag yet.
                 */
                if (chunk->chunk_hdr->type == SCTP_CID_INIT) {
                        struct sctp_init_chunk *init;

                        init = (struct sctp_init_chunk *)chunk->chunk_hdr;
                        vtag = ntohl(init->init_hdr.init_tag);
                } else {
                        vtag = ntohl(chunk->sctp_hdr->vtag);
                }
        }

        /* Make a transport for the bucket, Eliza... */
        tp = sctp_transport_new(net, sctp_source(chunk), GFP_ATOMIC);
        if (!tp)
                return NULL;

        tp->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port;

        /* Cache a route for the transport with the chunk's destination as
         * the source address.
         */
        sctp_transport_route(tp, (union sctp_addr *)&chunk->dest,
                             sctp_sk(net->sctp.ctl_sock));

        /* Hand out the packet that is embedded in the transport. */
        pkt = &tp->packet;
        sctp_packet_init(pkt, tp, sport, dport);
        sctp_packet_config(pkt, vtag, 0);

        return pkt;
}

/* Free the packet allocated earlier for responding in the OOTB condition.
 *
 * sctp_ootb_pkt_new() hands out the packet embedded in a transport it
 * allocated, so the release is done by freeing packet->transport.
 */
void sctp_ootb_pkt_free(struct sctp_packet *packet)
{
        struct sctp_transport *owner = packet->transport;

        sctp_transport_free(owner);
}

/* Send a stale cookie error when an invalid COOKIE ECHO chunk is found.
 *
 * Does nothing when @err_chunk is NULL.  Otherwise @err_chunk is either
 * appended to a new OOTB packet that is queued for sending on @commands,
 * or freed if that packet cannot be allocated.
 */
static void sctp_send_stale_cookie_err(struct net *net,
                                       const struct sctp_endpoint *ep,
                                       const struct sctp_association *asoc,
                                       const struct sctp_chunk *chunk,
                                       struct sctp_cmd_seq *commands,
                                       struct sctp_chunk *err_chunk)
{
        struct sctp_signed_cookie *cookie;
        struct sctp_packet *pkt;

        if (!err_chunk)
                return;

        pkt = sctp_ootb_pkt_new(net, asoc, chunk);
        if (!pkt) {
                sctp_chunk_free(err_chunk);
                return;
        }

        /* Override the OOTB vtag from the cookie. */
        cookie = chunk->subh.cookie_hdr;
        pkt->vtag = cookie->c.peer_vtag;

        /* Set the skb to the belonging sock for accounting. */
        err_chunk->skb->sk = ep->base.sk;
        sctp_packet_append_chunk(pkt, err_chunk);

        sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, SCTP_PACKET(pkt));
        SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
}


/* Process a data chunk.
 *
 * Pulls the DATA header off chunk->skb, then performs ECN-CE handling, the
 * TSN map check, receive window and memory pressure checks, the "no user
 * data" check and stream validation.  Side effects are not executed here;
 * they are queued on @commands in the order they are added below.
 *
 * Returns:
 *   SCTP_IERROR_NO_ERROR        - chunk accepted; a deliver (or renege)
 *                                 command was queued for it
 *   SCTP_IERROR_HIGH_TSN        - sctp_tsnmap_check() returned < 0
 *   SCTP_IERROR_DUP_TSN         - sctp_tsnmap_check() returned > 0
 *   SCTP_IERROR_IGNORE_TSN      - no room in the receive window and the
 *                                 chunk does not qualify for reneging
 *   SCTP_IERROR_NO_DATA         - chunk carries no user data; the
 *                                 association is being aborted
 *   SCTP_IERROR_BAD_STREAM      - stream id is >= asoc->stream.incnt
 *   SCTP_IERROR_PROTO_VIOLATION - the stream interleave validate_data()
 *                                 hook rejected the chunk
 */
static int sctp_eat_data(const struct sctp_association *asoc,
                         struct sctp_chunk *chunk,
                         struct sctp_cmd_seq *commands)
{
        /* Non-const alias of asoc->peer.tsn_map (asoc itself is const). */
        struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map;
        struct sock *sk = asoc->base.sk;
        struct net *net = sock_net(sk);
        struct sctp_datahdr *data_hdr;
        struct sctp_chunk *err;
        enum sctp_verb deliver;
        size_t datalen;
        __u32 tsn;
        int tmp;

        /* Remember where the DATA header is, then advance skb->data
         * past it.
         */
        data_hdr = (struct sctp_datahdr *)chunk->skb->data;
        chunk->subh.data_hdr = data_hdr;
        skb_pull(chunk->skb, sctp_datahdr_len(&asoc->stream));

        tsn = ntohl(data_hdr->tsn);
        pr_debug("%s: TSN 0x%x\n", __func__, tsn);

        /* ASSERT:  Now skb->data is really the user data.  */

        /* Process ECN based congestion.
         *
         * Since the chunk structure is reused for all chunks within
         * a packet, we use ecn_ce_done to track if we've already
         * done CE processing for this packet.
         *
         * We need to do ECN processing even if we plan to discard the
         * chunk later.
         */

        if (asoc->peer.ecn_capable && !chunk->ecn_ce_done) {
                struct sctp_af *af = SCTP_INPUT_CB(chunk->skb)->af;
                chunk->ecn_ce_done = 1;

                if (af->is_ce(sctp_gso_headskb(chunk->skb))) {
                        /* Do real work as side effect. */
                        sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE,
                                        SCTP_U32(tsn));
                }
        }

        /* Classify the TSN: negative means too high, positive means
         * duplicate, zero means a new TSN.
         */
        tmp = sctp_tsnmap_check(&asoc->peer.tsn_map, tsn);
        if (tmp < 0) {
                /* The TSN is too high--silently discard the chunk and
                 * count on it getting retransmitted later.
                 */
                if (chunk->asoc)
                        chunk->asoc->stats.outofseqtsns++;
                return SCTP_IERROR_HIGH_TSN;
        } else if (tmp > 0) {
                /* This is a duplicate.  Record it.  */
                sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_DUP, SCTP_U32(tsn));
                return SCTP_IERROR_DUP_TSN;
        }

        /* This is a new TSN.  */

        /* Discard if there is no room in the receive window.
         * Actually, allow a little bit of overflow (up to a MTU).
         *
         * datalen is the chunk length field minus the DATA chunk header
         * length.  NOTE(review): this is an unsigned subtraction; it
         * presumably relies on the caller having validated the chunk
         * length already -- confirm against the callers.
         */
        datalen = ntohs(chunk->chunk_hdr->length);
        datalen -= sctp_datachk_len(&asoc->stream);

        /* Default action for an accepted chunk; may become
         * SCTP_CMD_RENEGE below.
         */
        deliver = SCTP_CMD_CHUNK_ULP;

        /* Think about partial delivery. */
        if ((datalen >= asoc->rwnd) && (!asoc->ulpq.pd_mode)) {

                /* Even if we don't accept this chunk there is
                 * memory pressure.
                 */
                sctp_add_cmd_sf(commands, SCTP_CMD_PART_DELIVER, SCTP_NULL());
        }

        /* Spill over rwnd a little bit.  Note: While allowed, this spill over
         * seems a bit troublesome in that frag_point varies based on
         * PMTU.  In cases, such as loopback, this might be a rather
         * large spill over.
         */
        if ((!chunk->data_accepted) && (!asoc->rwnd || asoc->rwnd_over ||
            (datalen > asoc->rwnd + asoc->frag_point))) {

                /* If this is the next TSN, consider reneging to make
                 * room.   Note: Playing nice with a confused sender.  A
                 * malicious sender can still eat up all our buffer
                 * space and in the future we may want to detect and
                 * do more drastic reneging.
                 */
                if (sctp_tsnmap_has_gap(map) &&
                    (sctp_tsnmap_get_ctsn(map) + 1) == tsn) {
                        pr_debug("%s: reneging for tsn:%u\n", __func__, tsn);
                        deliver = SCTP_CMD_RENEGE;
                } else {
                        pr_debug("%s: discard tsn:%u len:%zu, rwnd:%d\n",
                                 __func__, tsn, datalen, asoc->rwnd);

                        return SCTP_IERROR_IGNORE_TSN;
                }
        }

        /*
         * Also try to renege to limit our memory usage in the event that
         * we are under memory pressure.
         * If we can't renege, don't worry about it, the sk_rmem_schedule
         * in sctp_ulpevent_make_rcvmsg will drop the frame if we grow our
         * memory usage too much.
         */
        if (sk_under_memory_pressure(sk)) {
                if (sctp_tsnmap_has_gap(map) &&
                    (sctp_tsnmap_get_ctsn(map) + 1) == tsn) {
                        pr_debug("%s: under pressure, reneging for tsn:%u\n",
                                 __func__, tsn);
                        deliver = SCTP_CMD_RENEGE;
                }
        }

        /*
         * Section 3.3.10.9 No User Data (9)
         *
         * Cause of error
         * ---------------
         * No User Data:  This error cause is returned to the originator of a
         * DATA chunk if a received DATA chunk has no user data.
         */
        if (unlikely(0 == datalen)) {
                err = sctp_make_abort_no_data(asoc, chunk, tsn);
                if (err) {
                        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                        SCTP_CHUNK(err));
                }
                /* We are going to ABORT, so we might as well stop
                 * processing the rest of the chunks in the packet.
                 */
                sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL());
                sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
                                SCTP_ERROR(ECONNABORTED));
                sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED,
                                SCTP_PERR(SCTP_ERROR_NO_DATA));
                SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
                SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
                return SCTP_IERROR_NO_DATA;
        }

        /* From here on the chunk counts as accepted; the rwnd spill-over
         * check above is skipped for chunks that already have this set.
         */
        chunk->data_accepted = 1;

        /* Note: Some chunks may get overcounted, either if we drop them
         * or if we renege and the chunk arrives again.
         */
        if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) {
                SCTP_INC_STATS(net, SCTP_MIB_INUNORDERCHUNKS);
                if (chunk->asoc)
                        chunk->asoc->stats.iuodchunks++;
        } else {
                SCTP_INC_STATS(net, SCTP_MIB_INORDERCHUNKS);
                if (chunk->asoc)
                        chunk->asoc->stats.iodchunks++;
        }

        /* RFC 2960 6.5 Stream Identifier and Stream Sequence Number
         *
         * If an endpoint receive a DATA chunk with an invalid stream
         * identifier, it shall acknowledge the reception of the DATA chunk
         * following the normal procedure, immediately send an ERROR chunk
         * with cause set to "Invalid Stream Identifier" (See Section 3.3.10)
         * and discard the DATA chunk.
         */
        if (ntohs(data_hdr->stream) >= asoc->stream.incnt) {
                /* Mark tsn as received even though we drop it */
                sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn));

                /* The offending stream id is passed as the error payload. */
                err = sctp_make_op_error(asoc, chunk, SCTP_ERROR_INV_STRM,
                                         &data_hdr->stream,
                                         sizeof(data_hdr->stream),
                                         sizeof(u16));
                if (err)
                        sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
                                        SCTP_CHUNK(err));
                return SCTP_IERROR_BAD_STREAM;
        }

        /* Check to see if the SSN is possible for this TSN.
         * The biggest gap we can record is 4K wide.  Since SSNs wrap
         * at an unsigned short, there is no way that an SSN can
         * wrap and for a valid TSN.  We can simply check if the current
         * SSN is smaller then the next expected one.  If it is, it wrapped
         * and is invalid.
         *
         * The check itself is delegated to the validate_data() hook of
         * the association's stream interleave ops.
         */
        if (!asoc->stream.si->validate_data(chunk))
                return SCTP_IERROR_PROTO_VIOLATION;

        /* Send the data up to the user.  Note:  Schedule  the
         * SCTP_CMD_CHUNK_ULP cmd before the SCTP_CMD_GEN_SACK, as the SACK
         * chunk needs the updated rwnd.
         */
        sctp_add_cmd_sf(commands, deliver, SCTP_CHUNK(chunk));

        return SCTP_IERROR_NO_ERROR;
}





















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Events for filesystem locks
 *
 * Copyright 2013 Jeff Layton <jlayton@poochiereds.net>
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM filelock

#if !defined(_TRACE_FILELOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FILELOCK_H

#include <linux/tracepoint.h>
#include <linux/fs.h>
#include <linux/device.h>
#include <linux/kdev_t.h>

/*
 * Render a file lock flags bitmask for trace output: each set bit listed
 * below is printed by name, with names joined by "|".
 */
#define show_fl_flags(val)                                                \
        __print_flags(val, "|",                                         \
                { FL_POSIX,                "FL_POSIX" },                        \
                { FL_FLOCK,                "FL_FLOCK" },                        \
                { FL_DELEG,                "FL_DELEG" },                        \
                { FL_ACCESS,                "FL_ACCESS" },                        \
                { FL_EXISTS,                "FL_EXISTS" },                        \
                { FL_LEASE,                "FL_LEASE" },                        \
                { FL_CLOSE,                "FL_CLOSE" },                        \
                { FL_SLEEP,                "FL_SLEEP" },                        \
                { FL_DOWNGRADE_PENDING,        "FL_DOWNGRADE_PENDING" },        \
                { FL_UNLOCK_PENDING,        "FL_UNLOCK_PENDING" },                \
                { FL_OFDLCK,                "FL_OFDLCK" })

/*
 * Render a lock type value for trace output as its symbolic name
 * (F_RDLCK, F_WRLCK or F_UNLCK).
 */
#define show_fl_type(val)                                \
        __print_symbolic(val,                                \
                        { F_RDLCK, "F_RDLCK" },                \
                        { F_WRLCK, "F_WRLCK" },                \
                        { F_UNLCK, "F_UNLCK" })

/*
 * locks_get_lock_context tracepoint.
 *
 * Records the device and inode number of @inode, the lock @type and the
 * file_lock_context pointer @ctx.  The device is printed as major:minor
 * and the type via show_fl_type().
 */
TRACE_EVENT(locks_get_lock_context,
        TP_PROTO(struct inode *inode, int type, struct file_lock_context *ctx),

        TP_ARGS(inode, type, ctx),

        TP_STRUCT__entry(
                __field(unsigned long, i_ino)
                __field(dev_t, s_dev)
                __field(unsigned char, type)
                __field(struct file_lock_context *, ctx)
        ),

        TP_fast_assign(
                __entry->s_dev = inode->i_sb->s_dev;
                __entry->i_ino = inode->i_ino;
                __entry->type = type;
                __entry->ctx = ctx;
        ),

        TP_printk("dev=0x%x:0x%x ino=0x%lx type=%s ctx=%p",
                  MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                  __entry->i_ino, show_fl_type(__entry->type), __entry->ctx)
);

/*
 * Event class for tracepoints that take (inode, file_lock, ret).
 *
 * Records the inode's device and inode number, the return value @ret and
 * a snapshot of the lock's core fields (blocker, owner, pid, flags, type)
 * plus its start/end range.  @fl may be NULL: pointer fields are then
 * recorded as NULL and numeric fields as 0.  @inode is dereferenced
 * unconditionally.
 */
DECLARE_EVENT_CLASS(filelock_lock,
        TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),

        TP_ARGS(inode, fl, ret),

        TP_STRUCT__entry(
                __field(struct file_lock *, fl)
                __field(unsigned long, i_ino)
                __field(dev_t, s_dev)
                __field(struct file_lock_core *, blocker)
                __field(fl_owner_t, owner)
                __field(unsigned int, pid)
                __field(unsigned int, flags)
                __field(unsigned char, type)
                __field(loff_t, fl_start)
                __field(loff_t, fl_end)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->fl = fl ? fl : NULL;
                __entry->s_dev = inode->i_sb->s_dev;
                __entry->i_ino = inode->i_ino;
                __entry->blocker = fl ? fl->c.flc_blocker : NULL;
                __entry->owner = fl ? fl->c.flc_owner : NULL;
                __entry->pid = fl ? fl->c.flc_pid : 0;
                __entry->flags = fl ? fl->c.flc_flags : 0;
                __entry->type = fl ? fl->c.flc_type : 0;
                __entry->fl_start = fl ? fl->fl_start : 0;
                __entry->fl_end = fl ? fl->fl_end : 0;
                __entry->ret = ret;
        ),

        TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d",
                __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                __entry->i_ino, __entry->blocker, __entry->owner,
                __entry->pid, show_fl_flags(__entry->flags),
                show_fl_type(__entry->type),
                __entry->fl_start, __entry->fl_end, __entry->ret)
);

/*
 * Tracepoints instantiated from the filelock_lock class.  All of them
 * take (inode, fl, ret) and share the class's record layout and format.
 */
DEFINE_EVENT(filelock_lock, posix_lock_inode,
                TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
                TP_ARGS(inode, fl, ret));

DEFINE_EVENT(filelock_lock, fcntl_setlk,
                TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
                TP_ARGS(inode, fl, ret));

DEFINE_EVENT(filelock_lock, locks_remove_posix,
                TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
                TP_ARGS(inode, fl, ret));

DEFINE_EVENT(filelock_lock, flock_lock_inode,
                TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
                TP_ARGS(inode, fl, ret));

/*
 * Event class for tracepoints that take (inode, file_lease).
 *
 * Records the inode's device and inode number and a snapshot of the
 * lease's core fields (blocker, owner, flags, type) plus its break and
 * downgrade times.  @fl may be NULL: pointer fields are then recorded as
 * NULL and numeric fields as 0.  @inode is dereferenced unconditionally.
 */
DECLARE_EVENT_CLASS(filelock_lease,
        TP_PROTO(struct inode *inode, struct file_lease *fl),

        TP_ARGS(inode, fl),

        TP_STRUCT__entry(
                __field(struct file_lease *, fl)
                __field(unsigned long, i_ino)
                __field(dev_t, s_dev)
                __field(struct file_lock_core *, blocker)
                __field(fl_owner_t, owner)
                __field(unsigned int, flags)
                __field(unsigned char, type)
                __field(unsigned long, break_time)
                __field(unsigned long, downgrade_time)
        ),

        TP_fast_assign(
                __entry->fl = fl ? fl : NULL;
                __entry->s_dev = inode->i_sb->s_dev;
                __entry->i_ino = inode->i_ino;
                __entry->blocker = fl ? fl->c.flc_blocker : NULL;
                __entry->owner = fl ? fl->c.flc_owner : NULL;
                __entry->flags = fl ? fl->c.flc_flags : 0;
                __entry->type = fl ? fl->c.flc_type : 0;
                __entry->break_time = fl ? fl->fl_break_time : 0;
                __entry->downgrade_time = fl ? fl->fl_downgrade_time : 0;
        ),

        TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu",
                __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                __entry->i_ino, __entry->blocker, __entry->owner,
                show_fl_flags(__entry->flags),
                show_fl_type(__entry->type),
                __entry->break_time, __entry->downgrade_time)
);

/*
 * Tracepoints instantiated from the filelock_lease class.  All of them
 * take (inode, fl) and share the class's record layout and format.
 */
DEFINE_EVENT(filelock_lease, break_lease_noblock,
                TP_PROTO(struct inode *inode, struct file_lease *fl),
                TP_ARGS(inode, fl));

DEFINE_EVENT(filelock_lease, break_lease_block,
                TP_PROTO(struct inode *inode, struct file_lease *fl),
                TP_ARGS(inode, fl));

DEFINE_EVENT(filelock_lease, break_lease_unblock,
                TP_PROTO(struct inode *inode, struct file_lease *fl),
                TP_ARGS(inode, fl));

DEFINE_EVENT(filelock_lease, generic_delete_lease,
                TP_PROTO(struct inode *inode, struct file_lease *fl),
                TP_ARGS(inode, fl));

DEFINE_EVENT(filelock_lease, time_out_leases,
                TP_PROTO(struct inode *inode, struct file_lease *fl),
                TP_ARGS(inode, fl));

/*
 * generic_add_lease tracepoint.
 *
 * Records the inode's device and inode number, its write, read and
 * reference counts (read atomically from i_writecount, i_readcount and
 * i_count), and the lease's owner, flags and type.
 *
 * Unlike the filelock_lease event class, @fl is dereferenced without a
 * NULL check here, so callers must pass a valid lease.
 */
TRACE_EVENT(generic_add_lease,
        TP_PROTO(struct inode *inode, struct file_lease *fl),

        TP_ARGS(inode, fl),

        TP_STRUCT__entry(
                __field(unsigned long, i_ino)
                __field(int, wcount)
                __field(int, rcount)
                __field(int, icount)
                __field(dev_t, s_dev)
                __field(fl_owner_t, owner)
                __field(unsigned int, flags)
                __field(unsigned char, type)
        ),

        TP_fast_assign(
                __entry->s_dev = inode->i_sb->s_dev;
                __entry->i_ino = inode->i_ino;
                __entry->wcount = atomic_read(&inode->i_writecount);
                __entry->rcount = atomic_read(&inode->i_readcount);
                __entry->icount = atomic_read(&inode->i_count);
                __entry->owner = fl->c.flc_owner;
                __entry->flags = fl->c.flc_flags;
                __entry->type = fl->c.flc_type;
        ),

        TP_printk("dev=0x%x:0x%x ino=0x%lx wcount=%d rcount=%d icount=%d fl_owner=%p fl_flags=%s fl_type=%s",
                MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                __entry->i_ino, __entry->wcount, __entry->rcount,
                __entry->icount, __entry->owner,
                show_fl_flags(__entry->flags),
                show_fl_type(__entry->type))
);

/*
 * leases_conflict tracepoint.
 *
 * Records the @conflict result together with the pointer, flags and type
 * of both the existing @lease and the @breaker.
 *
 * Both @lease and @breaker are dereferenced without a NULL check, so
 * callers must pass valid pointers for each.
 */
TRACE_EVENT(leases_conflict,
        TP_PROTO(bool conflict, struct file_lease *lease, struct file_lease *breaker),

        TP_ARGS(conflict, lease, breaker),

        TP_STRUCT__entry(
                __field(void *, lease)
                __field(void *, breaker)
                __field(unsigned int, l_fl_flags)
                __field(unsigned int, b_fl_flags)
                __field(unsigned char, l_fl_type)
                __field(unsigned char, b_fl_type)
                __field(bool, conflict)
        ),

        TP_fast_assign(
                __entry->lease = lease;
                __entry->l_fl_flags = lease->c.flc_flags;
                __entry->l_fl_type = lease->c.flc_type;
                __entry->breaker = breaker;
                __entry->b_fl_flags = breaker->c.flc_flags;
                __entry->b_fl_type = breaker->c.flc_type;
                __entry->conflict = conflict;
        ),

        TP_printk("conflict %d: lease=%p fl_flags=%s fl_type=%s; breaker=%p fl_flags=%s fl_type=%s",
                __entry->conflict,
                __entry->lease,
                show_fl_flags(__entry->l_fl_flags),
                show_fl_type(__entry->l_fl_type),
                __entry->breaker,
                show_fl_flags(__entry->b_fl_flags),
                show_fl_type(__entry->b_fl_type))
);

#endif /* _TRACE_FILELOCK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>

















































































































































































































    1 
























    1 
    1 

    1 





























    1 










































































































































































































    1 

    1 






    1 







































































































































































































    1 































































































































    1 
    1 



    1 








    1 










































    1 



























































































































































































































    1 
    1 
    1 

    1 





















































    1 






















































































    1 













    1 


    1 







    1 





























    1 









    1 





    1 
    1 

















    2 










    2 














































































































    1 





    2 






















































































    1 










    1 


























    1 


    2 

















    1 



















    2 





    2 
















































































































































































































































































































































































































    1 







    1 

    1 


    1 



    1 































    1 












    1 



















    1 










    1 















































































































































    1 















    1 





























































    1 





































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
 *
 *  High-resolution kernel timers
 *
 *  In contrast to the low-resolution timeout API, aka timer wheel,
 *  hrtimers provide finer resolution and accuracy depending on system
 *  configuration and capabilities.
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 *
 *  Credits:
 *        Based on the original timer wheel code
 *
 *        Help, testing, suggestions, bugfixes, improvements were
 *        provided by:
 *
 *        George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
 *        et. al.
 */

#include <linux/cpu.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/hrtimer.h>
#include <linux/notifier.h>
#include <linux/syscalls.h>
#include <linux/interrupt.h>
#include <linux/tick.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
#include <linux/timer.h>
#include <linux/freezer.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include <trace/events/timer.h>

#include "tick-internal.h"

/*
 * Masks for selecting the soft and hard context timers from
 * cpu_base->active_bases.
 *
 * MASK_SHIFT is HRTIMER_BASE_MONOTONIC_SOFT, the first soft entry in
 * hrtimer_bases.clock_base[]. HRTIMER_ACTIVE_HARD has the MASK_SHIFT low
 * bits set, HRTIMER_ACTIVE_SOFT is that same bit pattern shifted up by
 * MASK_SHIFT, and HRTIMER_ACTIVE_ALL is the union of both.
 */
#define MASK_SHIFT                (HRTIMER_BASE_MONOTONIC_SOFT)
#define HRTIMER_ACTIVE_HARD        ((1U << MASK_SHIFT) - 1)
#define HRTIMER_ACTIVE_SOFT        (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
#define HRTIMER_ACTIVE_ALL        (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)

/*
 * The timer bases:
 *
 * There are more clockids than hrtimer bases. Thus, we index
 * into the timer bases by the hrtimer_base_type enum. When trying
 * to reach a base using a clockid, hrtimer_clockid_to_base()
 * is used to convert from clockid to the proper hrtimer_base_type.
 *
 * Each clock (MONOTONIC, REALTIME, BOOTTIME, TAI) appears twice below:
 * once as a hard context base and once as a *_SOFT base. Both entries of
 * a pair use the same clockid and the same get_time() accessor.
 */
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
        .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
        .clock_base =
        {
                /* Hard context bases */
                {
                        .index = HRTIMER_BASE_MONOTONIC,
                        .clockid = CLOCK_MONOTONIC,
                        .get_time = &ktime_get,
                },
                {
                        .index = HRTIMER_BASE_REALTIME,
                        .clockid = CLOCK_REALTIME,
                        .get_time = &ktime_get_real,
                },
                {
                        .index = HRTIMER_BASE_BOOTTIME,
                        .clockid = CLOCK_BOOTTIME,
                        .get_time = &ktime_get_boottime,
                },
                {
                        .index = HRTIMER_BASE_TAI,
                        .clockid = CLOCK_TAI,
                        .get_time = &ktime_get_clocktai,
                },
                /* Soft context bases, mirroring the four entries above */
                {
                        .index = HRTIMER_BASE_MONOTONIC_SOFT,
                        .clockid = CLOCK_MONOTONIC,
                        .get_time = &ktime_get,
                },
                {
                        .index = HRTIMER_BASE_REALTIME_SOFT,
                        .clockid = CLOCK_REALTIME,
                        .get_time = &ktime_get_real,
                },
                {
                        .index = HRTIMER_BASE_BOOTTIME_SOFT,
                        .clockid = CLOCK_BOOTTIME,
                        .get_time = &ktime_get_boottime,
                },
                {
                        .index = HRTIMER_BASE_TAI_SOFT,
                        .clockid = CLOCK_TAI,
                        .get_time = &ktime_get_clocktai,
                },
        }
};

/*
 * Lookup table from clockid to hrtimer base index. Every slot is first
 * filled with HRTIMER_MAX_CLOCK_BASES; only the four clockids listed
 * explicitly map to a real (hard context) base.
 */
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
        /* Make sure we catch unsupported clockids */
        [0 ... MAX_CLOCKS - 1]        = HRTIMER_MAX_CLOCK_BASES,

        [CLOCK_REALTIME]        = HRTIMER_BASE_REALTIME,
        [CLOCK_MONOTONIC]        = HRTIMER_BASE_MONOTONIC,
        [CLOCK_BOOTTIME]        = HRTIMER_BASE_BOOTTIME,
        [CLOCK_TAI]                = HRTIMER_BASE_TAI,
};

/*
 * Functions and macros which are different for UP/SMP systems are kept in a
 * single place
 */
#ifdef CONFIG_SMP

/*
 * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
 * such that hrtimer_callback_running() can unconditionally dereference
 * timer->base->cpu_base
 *
 * Only clock_base[0] is initialized explicitly: its cpu_base points back
 * at migration_cpu_base itself and its seqcount is tied to
 * migration_cpu_base.lock.
 */
static struct hrtimer_cpu_base migration_cpu_base = {
        .clock_base = { {
                .cpu_base = &migration_cpu_base,
                .seq      = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
                                                     &migration_cpu_base.lock),
        }, },
};

/* Placeholder clock base a timer points at while it is being migrated. */
#define migration_base        migration_cpu_base.clock_base[0]

/*
 * Tell whether @base is the placeholder base used while a timer is in
 * transit between two CPU bases.
 */
static inline bool is_migration_base(struct hrtimer_clock_base *base)
{
        return &migration_base == base;
}

/*
 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
 * means that all timers which are tied to this base via timer->base are
 * locked, and the base itself is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found on the lists/queues.
 *
 * When the timer's base is locked, and the timer removed from list, it is
 * possible to set timer->base = &migration_base and drop the lock: the timer
 * remains locked.
 *
 * lock_hrtimer_base() returns the clock base @timer is currently attached
 * to, with that base's cpu_base->lock held and the previous interrupt state
 * saved in *@flags. It loops until it has locked a base that is still the
 * timer's base.
 *
 * NOTE(review): the __acquires() annotation names timer->base->lock while
 * the lock taken below is base->cpu_base->lock (the !CONFIG_SMP variant
 * annotates timer->base->cpu_base->lock) - confirm which one is intended.
 */
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
                                             unsigned long *flags)
        __acquires(&timer->base->lock)
{
        struct hrtimer_clock_base *base;

        for (;;) {
                /* Snapshot the base; it may change until the lock is held. */
                base = READ_ONCE(timer->base);
                /* A timer in transit has no lockable base yet: spin. */
                if (likely(base != &migration_base)) {
                        raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
                        /* Recheck under the lock that the snapshot is still valid. */
                        if (likely(base == timer->base))
                                return base;
                        /* The timer has migrated to another CPU: */
                        raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
                }
                cpu_relax();
        }
}

/*
 * Decide whether @timer must stay off the target base @new_base.
 *
 * A timer which would expire before the next event already programmed on
 * the target CPU is not migrated: with high resolution enabled the remote
 * hardware cannot be reprogrammed from here, so the timer would fire late.
 * For simplicity the same rule is applied whether or not high resolution
 * mode is enabled.
 *
 * Must be called with the target's cpu_base->lock held.
 *
 * Returns non-zero when the timer's expiry (adjusted by the base offset)
 * lies before new_base->cpu_base->expires_next, zero otherwise.
 */
static int
hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
{
        ktime_t target_next = new_base->cpu_base->expires_next;
        ktime_t timer_exp = ktime_sub(hrtimer_get_expires(timer),
                                      new_base->offset);

        return timer_exp < target_next;
}

/*
 * Pick the CPU base a timer should be queued on.
 *
 * With CONFIG_SMP and CONFIG_NO_HZ_COMMON, an unpinned timer is directed to
 * the per-CPU base of the CPU returned by get_nohz_timer_target(), provided
 * timer migration is enabled. In every other case @base is returned as is.
 */
static inline
struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
                                         int pinned)
{
        struct hrtimer_cpu_base *target = base;

#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
        if (static_branch_likely(&timers_migration_enabled) && !pinned)
                target = &per_cpu(hrtimer_bases, get_nohz_timer_target());
#endif
        return target;
}

/*
 * We switch the timer base to a power-optimized selected CPU target,
 * if:
 *        - NO_HZ_COMMON is enabled
 *        - timer migration is enabled
 *        - the timer callback is not running
 *        - the timer is not the first expiring timer on the new target
 *
 * If one of the above requirements is not fulfilled we move the timer
 * to the current CPU or leave it on the previously assigned CPU if
 * the timer callback is currently running.
 *
 * @timer:  timer to (possibly) move
 * @base:   clock base the timer is currently attached to; its
 *          cpu_base->lock is released below, so it must be held on entry
 * @pinned: non-zero to keep the timer on the current CPU's base
 *
 * Returns the clock base the timer ends up on. The cpu_base->lock of the
 * returned base is held on return.
 */
static inline struct hrtimer_clock_base *
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
                    int pinned)
{
        struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
        struct hrtimer_clock_base *new_base;
        int basenum = base->index;

        this_cpu_base = this_cpu_ptr(&hrtimer_bases);
        new_cpu_base = get_target_base(this_cpu_base, pinned);
again:
        /* Same clock type, but on the candidate CPU. */
        new_base = &new_cpu_base->clock_base[basenum];

        if (base != new_base) {
                /*
                 * We are trying to move timer to new_base.
                 * However we can't change timer's base while it is running,
                 * so we keep it on the same CPU. No hassle vs. reprogramming
                 * the event source in the high resolution case. The softirq
                 * code will take care of this when the timer function has
                 * completed. There is no conflict as we hold the lock until
                 * the timer is enqueued.
                 */
                if (unlikely(hrtimer_callback_running(timer)))
                        return base;

                /* See the comment in lock_hrtimer_base() */
                WRITE_ONCE(timer->base, &migration_base);
                raw_spin_unlock(&base->cpu_base->lock);
                raw_spin_lock(&new_base->cpu_base->lock);

                /*
                 * A remote target is rejected when the timer would expire
                 * before that CPU's next programmed event: undo the lock
                 * hand-over and retry with the local CPU as target.
                 */
                if (new_cpu_base != this_cpu_base &&
                    hrtimer_check_target(timer, new_base)) {
                        raw_spin_unlock(&new_base->cpu_base->lock);
                        raw_spin_lock(&base->cpu_base->lock);
                        new_cpu_base = this_cpu_base;
                        WRITE_ONCE(timer->base, base);
                        goto again;
                }
                WRITE_ONCE(timer->base, new_base);
        } else {
                /*
                 * The timer already sits on the candidate base, so no lock
                 * hand-over is needed; only the remote-target check applies.
                 */
                if (new_cpu_base != this_cpu_base &&
                    hrtimer_check_target(timer, new_base)) {
                        new_cpu_base = this_cpu_base;
                        goto again;
                }
        }
        return new_base;
}

#else /* CONFIG_SMP */

/*
 * !CONFIG_SMP variant: no migration base is defined in this configuration,
 * so no clock base can ever match it.
 */
static inline bool is_migration_base(struct hrtimer_clock_base *base)
{
        return false;
}

/*
 * !CONFIG_SMP variant: lock the cpu_base of the clock base @timer is
 * attached to, saving the interrupt state in *@flags, and return that
 * clock base. No retry loop is used here.
 */
static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
        __acquires(&timer->base->cpu_base->lock)
{
        struct hrtimer_clock_base *clock_base;

        clock_base = timer->base;
        raw_spin_lock_irqsave(&clock_base->cpu_base->lock, *flags);

        return clock_base;
}

/* !CONFIG_SMP: the timer always stays on the base passed in. */
# define switch_hrtimer_base(t, b, p)        (b)

#endif        /* !CONFIG_SMP */

/*
 * Functions for the union type storage format of ktime_t which are
 * too large for inlining:
 */
#if BITS_PER_LONG < 64
/*
 * Divide a ktime value by a nanosecond value.
 *
 * @kt:  dividend
 * @div: divisor in nanoseconds
 *
 * The division is carried out on the magnitude of @kt and the sign is put
 * back afterwards. do_div() is used with a 32-bit divisor, so a divisor of
 * 2^32 or more is halved until it fits and the dividend is shifted right by
 * the same number of bits, which trades precision for range.
 */
s64 __ktime_divns(const ktime_t kt, s64 div)
{
        const s64 ns = ktime_to_ns(kt);
        const bool negative = ns < 0;
        u64 quot = negative ? -ns : ns;
        int shift;

        /* Make sure the divisor is less than 2^32: */
        for (shift = 0; div >> 32; shift++)
                div >>= 1;

        quot >>= shift;
        do_div(quot, (u32) div);

        return negative ? -quot : quot;
}
EXPORT_SYMBOL_GPL(__ktime_divns);
#endif /* BITS_PER_LONG < 64 */

/*
 * Add two ktime values, saturating instead of overflowing.
 *
 * When the raw sum is negative or smaller than either operand, the result
 * is replaced by ktime_set(KTIME_SEC_MAX, 0), the maximum timeout which can
 * be returned to user space in a timespec.
 */
ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
{
        const ktime_t sum = ktime_add_unsafe(lhs, rhs);

        if (sum >= 0 && sum >= lhs && sum >= rhs)
                return sum;

        return ktime_set(KTIME_SEC_MAX, 0);
}

EXPORT_SYMBOL_GPL(ktime_add_safe);

#ifdef CONFIG_DEBUG_OBJECTS_TIMERS

/* Defined further down; the fixup callbacks need to refer to it. */
static const struct debug_obj_descr hrtimer_debug_descr;

/*
 * debug_hint callback: identify a tracked hrtimer by its callback
 * function pointer.
 */
static void *hrtimer_debug_hint(void *addr)
{
        struct hrtimer *timer = addr;

        return timer->function;
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 *
 * In that state the timer is cancelled and re-registered with the debug
 * object core, and true is returned. Any other state is left alone and
 * false is returned.
 */
static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
{
        struct hrtimer *timer = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        hrtimer_cancel(timer);
        debug_object_init(timer, &hrtimer_debug_descr);
        return true;
}

/*
 * fixup_activate is called when:
 * - an active object is activated
 * - an unknown non-static object is activated
 *
 * Nothing is repaired here: activating an already active timer triggers a
 * warning, and false is returned in every case.
 */
static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
{
        if (state == ODEBUG_STATE_ACTIVE)
                WARN_ON(1);

        return false;
}

/*
 * fixup_free is called when:
 * - an active object is freed
 *
 * In that state the timer is cancelled and released from debug object
 * tracking, and true is returned. Any other state is left alone and false
 * is returned.
 */
static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
{
        struct hrtimer *timer = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        hrtimer_cancel(timer);
        debug_object_free(timer, &hrtimer_debug_descr);
        return true;
}

/*
 * Debug object descriptor for hrtimers: names the object type and wires up
 * the hint and fixup callbacks defined above.
 */
static const struct debug_obj_descr hrtimer_debug_descr = {
        .name                = "hrtimer",
        .debug_hint        = hrtimer_debug_hint,
        .fixup_init        = hrtimer_fixup_init,
        .fixup_activate        = hrtimer_fixup_activate,
        .fixup_free        = hrtimer_fixup_free,
};

/* Register @timer with the debug object core as initialized. */
static inline void debug_hrtimer_init(struct hrtimer *timer)
{
        debug_object_init(timer, &hrtimer_debug_descr);
}

/*
 * Tell the debug object core that @timer is being activated.
 * @mode is accepted for symmetry with the callers but not used here.
 */
static inline void debug_hrtimer_activate(struct hrtimer *timer,
                                          enum hrtimer_mode mode)
{
        debug_object_activate(timer, &hrtimer_debug_descr);
}

/* Tell the debug object core that @timer is being deactivated. */
static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
{
        debug_object_deactivate(timer, &hrtimer_debug_descr);
}

/* Forward declaration; the definition is not in this part of the file. */
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
                           enum hrtimer_mode mode);

/*
 * Initialize an hrtimer that lives on the stack: the timer is registered
 * with the debug object core as an on-stack object first, then set up via
 * __hrtimer_init() with the given @clock_id and @mode.
 */
void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
                           enum hrtimer_mode mode)
{
        debug_object_init_on_stack(timer, &hrtimer_debug_descr);
        __hrtimer_init(timer, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);

/* Forward declaration; the definition is not in this part of the file. */
static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
                                   clockid_t clock_id, enum hrtimer_mode mode);

/*
 * Initialize an hrtimer_sleeper that lives on the stack: the embedded
 * timer is registered with the debug object core as an on-stack object
 * first, then the sleeper is set up via __hrtimer_init_sleeper().
 */
void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
                                   clockid_t clock_id, enum hrtimer_mode mode)
{
        debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
        __hrtimer_init_sleeper(sl, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);

/*
 * Release the debug object tracking of an on-stack hrtimer. Only the
 * debug object state is touched here; the timer itself is not cancelled.
 */
void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
        debug_object_free(timer, &hrtimer_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);

#else

/* No-op variants used when CONFIG_DEBUG_OBJECTS_TIMERS is not set. */
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
static inline void debug_hrtimer_activate(struct hrtimer *timer,
                                          enum hrtimer_mode mode) { }
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
#endif

/*
 * Common hook for timer initialization: updates the debug object state
 * and emits the hrtimer_init trace event.
 */
static inline void
debug_init(struct hrtimer *timer, clockid_t clockid,
           enum hrtimer_mode mode)
{
        debug_hrtimer_init(timer);
        trace_hrtimer_init(timer, clockid, mode);
}

/*
 * Common hook for timer activation: updates the debug object state and
 * emits the hrtimer_start trace event.
 */
static inline void debug_activate(struct hrtimer *timer,
                                  enum hrtimer_mode mode)
{
        debug_hrtimer_activate(timer, mode);
        trace_hrtimer_start(timer, mode);
}

/*
 * Common hook for timer deactivation: updates the debug object state and
 * emits the hrtimer_cancel trace event.
 */
static inline void debug_deactivate(struct hrtimer *timer)
{
        debug_hrtimer_deactivate(timer);
        trace_hrtimer_cancel(timer);
}

/*
 * Pop one clock base off the @active bitmask.
 *
 * The bit found by __ffs() is cleared from *@active and the matching entry
 * of @cpu_base->clock_base[] is returned. Returns NULL once *@active is
 * empty.
 */
static struct hrtimer_clock_base *
__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
{
        unsigned int pending = *active;
        unsigned int idx;

        if (pending == 0)
                return NULL;

        idx = __ffs(pending);
        *active = pending & ~(1U << idx);

        return &cpu_base->clock_base[idx];
}

/*
 * Iterate @base over the clock bases of @cpu_base whose bits are set in
 * @active. @active must be a modifiable lvalue: its address is handed to
 * __next_base(), which clears one bit per iteration, so the mask is zero
 * when the loop runs to completion.
 */
#define for_each_active_base(base, cpu_base, active)        \
        while ((base = __next_base((cpu_base), &(active))))

/*
 * Find the earliest expiry among the first timers of the given bases.
 *
 * @cpu_base:     per-CPU base to scan
 * @exclude:      timer to skip, or NULL; when it is the first timer of a
 *                base, the next timer in that queue is considered instead
 * @active:       bitmask of the clock bases to look at
 * @expires_next: starting value; only earlier expiries replace it
 *
 * Expiry times are compared after subtracting the base offset. When
 * @exclude is NULL, cpu_base->next_timer or cpu_base->softirq_next_timer
 * (depending on timer->is_soft) is updated whenever a new minimum is found.
 *
 * Returns the earliest expiry found, clamped to 0 if it is negative.
 */
static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
                                         const struct hrtimer *exclude,
                                         unsigned int active,
                                         ktime_t expires_next)
{
        struct hrtimer_clock_base *base;
        ktime_t expires;

        for_each_active_base(base, cpu_base, active) {
                struct timerqueue_node *next;
                struct hrtimer *timer;

                /*
                 * NOTE(review): presumably a base whose bit is set in
                 * @active always has at least one queued timer, so next is
                 * not NULL here - confirm against the enqueue/dequeue code.
                 */
                next = timerqueue_getnext(&base->active);
                timer = container_of(next, struct hrtimer, node);
                if (timer == exclude) {
                        /* Get to the next timer in the queue. */
                        next = timerqueue_iterate_next(next);
                        if (!next)
                                continue;

                        timer = container_of(next, struct hrtimer, node);
                }
                expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
                if (expires < expires_next) {
                        expires_next = expires;

                        /* Skip cpu_base update if a timer is being excluded. */
                        if (exclude)
                                continue;

                        if (timer->is_soft)
                                cpu_base->softirq_next_timer = timer;
                        else
                                cpu_base->next_timer = timer;
                }
        }
        /*
         * clock_was_set() might have changed base->offset of any of
         * the clock bases so the result might be negative. Fix it up
         * to prevent a false positive in clockevents_program_event().
         */
        if (expires_next < 0)
                expires_next = 0;
        return expires_next;
}

/*
 * Recomputes cpu_base::*next_timer and returns the earliest expires_next
 * but does not set cpu_base::*expires_next, that is done by
 * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating
 * cpu_base::*expires_next right away, reprogramming logic would no longer
 * work.
 *
 * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
 * those timers will get run whenever the softirq gets handled, at the end of
 * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
 *
 * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
 * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
 * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
 *
 * @active_mask must be one of:
 *  - HRTIMER_ACTIVE_ALL,
 *  - HRTIMER_ACTIVE_SOFT, or
 *  - HRTIMER_ACTIVE_HARD.
 *
 * Returns KTIME_MAX when none of the selected bases has a timer queued.
 */
static ktime_t
__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
{
        unsigned int active;
        struct hrtimer *next_timer = NULL;
        ktime_t expires_next = KTIME_MAX;

        /* Soft bases are only scanned while no softirq has been activated. */
        if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
                active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
                cpu_base->softirq_next_timer = NULL;
                expires_next = __hrtimer_next_event_base(cpu_base, NULL,
                                                         active, KTIME_MAX);

                next_timer = cpu_base->softirq_next_timer;
        }

        if (active_mask & HRTIMER_ACTIVE_HARD) {
                active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
                /*
                 * Seed next_timer with the soft result (or NULL); the hard
                 * scan only overrides it when a hard timer expires earlier
                 * than the soft minimum passed in as expires_next.
                 */
                cpu_base->next_timer = next_timer;
                expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
                                                         expires_next);
        }

        return expires_next;
}

/*
 * Re-evaluate the soft and hard bases of @cpu_base separately and return
 * the earlier of the two next expiry times.
 *
 * Side effects: cpu_base->softirq_expires_next is refreshed (unless the
 * softirq is already activated), and cpu_base->next_timer ends up pointing
 * at whichever timer, hard or soft, expires first. cpu_base->expires_next
 * itself is not written here.
 */
static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
{
        ktime_t expires_next, soft = KTIME_MAX;

        /*
         * If the soft interrupt has already been activated, ignore the
         * soft bases. They will be handled in the already raised soft
         * interrupt.
         */
        if (!cpu_base->softirq_activated) {
                soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
                /*
                 * Update the soft expiry time. clock_settime() might have
                 * affected it.
                 */
                cpu_base->softirq_expires_next = soft;
        }

        expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
        /*
         * If a softirq timer is expiring first, update cpu_base->next_timer
         * and program the hardware with the soft expiry time.
         */
        if (expires_next > soft) {
                cpu_base->next_timer = cpu_base->softirq_next_timer;
                expires_next = soft;
        }

        return expires_next;
}

/*
 * Refresh the clock offsets stored in @base.
 *
 * ktime_get_update_offsets_now() fills in the REALTIME, BOOTTIME and TAI
 * offsets of the hard bases; the values are then copied to the matching
 * *_SOFT bases so both sets agree.
 *
 * Returns the time value handed back by ktime_get_update_offsets_now().
 */
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
        struct hrtimer_clock_base *cb = base->clock_base;
        ktime_t now;

        now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
                                           &cb[HRTIMER_BASE_REALTIME].offset,
                                           &cb[HRTIMER_BASE_BOOTTIME].offset,
                                           &cb[HRTIMER_BASE_TAI].offset);

        cb[HRTIMER_BASE_REALTIME_SOFT].offset = cb[HRTIMER_BASE_REALTIME].offset;
        cb[HRTIMER_BASE_BOOTTIME_SOFT].offset = cb[HRTIMER_BASE_BOOTTIME].offset;
        cb[HRTIMER_BASE_TAI_SOFT].offset = cb[HRTIMER_BASE_TAI].offset;

        return now;
}

/*
 * Is the high resolution mode active ?
 *
 * Always 0 when CONFIG_HIGH_RES_TIMERS is not enabled; otherwise the
 * hres_active state of @cpu_base.
 */
static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
{
        if (!IS_ENABLED(CONFIG_HIGH_RES_TIMERS))
                return 0;

        return cpu_base->hres_active;
}

/*
 * Record @expires_next in @cpu_base and, when appropriate, program the
 * tick device with it.
 *
 * @next_timer is not used by this function.
 */
static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
                                struct hrtimer *next_timer,
                                ktime_t expires_next)
{
        cpu_base->expires_next = expires_next;

        /*
         * The hardware is only touched when high resolution mode is
         * active and no hang was detected in the last timer interrupt.
         *
         * Without hres there is nothing to reprogram yet. After a detected
         * hang the hang delay is left armed so the system can make
         * progress. This also avoids the following situation:
         *
         *   T1 expires 50ms from now
         *   T2 expires 5s from now
         *
         * T1 gets removed, this function runs and would program the
         * hardware to 5s from now. Any later hrtimer_start would not
         * reprogram the hardware because hang_detected is set, so all
         * timers would effectively be blocked until the T2 event fires.
         */
        if (hrtimer_hres_active(cpu_base) && !cpu_base->hang_detected)
                tick_program_event(expires_next, 1);
}

/*
 * Re-evaluate all queues for the next event and reprogram the event
 * source accordingly.
 *
 * With @skip_equal set, nothing is reprogrammed when the newly computed
 * expiry equals the one already stored in cpu_base->expires_next.
 *
 * Called with interrupts disabled and base->lock held
 */
static void
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
        const ktime_t next = hrtimer_update_next_event(cpu_base);

        if (!skip_equal || next != cpu_base->expires_next)
                __hrtimer_reprogram(cpu_base, cpu_base->next_timer, next);
}

/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS

/*
 * High resolution timer enabled ?
 *
 * hrtimer_hres_enabled defaults to true and can be overridden with the
 * "highres=" boot parameter. hrtimer_resolution starts out as LOW_RES_NSEC
 * and is raised to HIGH_RES_NSEC by hrtimer_switch_to_hres().
 */
static bool hrtimer_hres_enabled __read_mostly  = true;
unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
EXPORT_SYMBOL_GPL(hrtimer_resolution);

/*
 * Enable / Disable high resolution mode
 *
 * Handler for the "highres=" boot parameter: @str is parsed as a boolean
 * into hrtimer_hres_enabled. Returns 1 when parsing succeeded and 0 when
 * it failed.
 */
static int __init setup_hrtimer_hres(char *str)
{
        int err = kstrtobool(str, &hrtimer_hres_enabled);

        return !err;
}

__setup("highres=", setup_hrtimer_hres);

/*
 * hrtimer_is_hres_enabled - query, if the highres mode is enabled
 *
 * Reports the hrtimer_hres_enabled setting (default or "highres=" boot
 * parameter), not whether a CPU has actually switched to high resolution
 * mode.
 */
static inline int hrtimer_is_hres_enabled(void)
{
        return hrtimer_hres_enabled;
}

/* Defined after the CONFIG_HIGH_RES_TIMERS section below. */
static void retrigger_next_event(void *arg);

/*
 * Switch to high resolution mode
 *
 * Operates on the current CPU's hrtimer base. If tick_init_highres()
 * reports a failure, a warning is printed and nothing else changes.
 * Otherwise the base is marked hres_active, the global hrtimer_resolution
 * is set to HIGH_RES_NSEC, the sched tick timer is set up and the next
 * event is retriggered.
 */
static void hrtimer_switch_to_hres(void)
{
        struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);

        if (tick_init_highres()) {
                pr_warn("Could not switch to high resolution mode on CPU %u\n",
                        base->cpu);
                return;
        }
        base->hres_active = 1;
        hrtimer_resolution = HIGH_RES_NSEC;

        tick_setup_sched_timer(true);
        /* "Retrigger" the interrupt to get things going */
        retrigger_next_event(NULL);
}

#else

/* !CONFIG_HIGH_RES_TIMERS: highres is never enabled and switching is a no-op. */
static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline void hrtimer_switch_to_hres(void) { }

#endif /* CONFIG_HIGH_RES_TIMERS */
/*
 * Retrigger next event is called after clock was set with interrupts
 * disabled through an SMP function call or directly from low level
 * resume code.
 *
 * This is only invoked when:
 *        - CONFIG_HIGH_RES_TIMERS is enabled.
 *        - CONFIG_NO_HZ_COMMON is enabled
 *
 * For the other cases this function is empty and because the call sites
 * are optimized out it vanishes as well, i.e. no need for lots of
 * #ifdeffery.
 *
 * @arg is unused; the function always operates on the current CPU's
 * hrtimer base.
 */
static void retrigger_next_event(void *arg)
{
        struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);

        /*
         * When high resolution mode or nohz is active, then the offsets of
         * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the
         * next tick will take care of that.
         *
         * If high resolution mode is active then the next expiring timer
         * must be reevaluated and the clock event device reprogrammed if
         * necessary.
         *
         * In the NOHZ case the update of the offset and the reevaluation
         * of the next expiring timer is enough. The return from the SMP
         * function call will take care of the reprogramming in case the
         * CPU was in a NOHZ idle sleep.
         */
        if (!hrtimer_hres_active(base) && !tick_nohz_active)
                return;

        /* Plain lock: per the comment above, interrupts are already off. */
        raw_spin_lock(&base->lock);
        hrtimer_update_base(base);
        if (hrtimer_hres_active(base))
                hrtimer_force_reprogram(base, 0);
        else
                hrtimer_update_next_event(base);
        raw_spin_unlock(&base->lock);
}

/*
 * A freshly enqueued timer which expires before all other enqueued timers
 * may also expire before the event the clock event device is currently
 * armed for. Check that and rearm the device if required.
 *
 * Called with interrupts disabled and base->cpu_base.lock held
 */
static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
{
        struct hrtimer_clock_base *clock_base = timer->base;
        struct hrtimer_cpu_base *owner = clock_base->cpu_base;
        struct hrtimer_cpu_base *local = this_cpu_ptr(&hrtimer_bases);
        ktime_t expires;

        expires = ktime_sub(hrtimer_get_expires(timer), clock_base->offset);

        WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);

        /*
         * An absolute CLOCK_REALTIME expiry may lie before the base
         * offset, which makes the result negative. Clamp it to 0.
         */
        if (expires < 0)
                expires = 0;

        if (timer->is_soft) {
                /*
                 * A soft hrtimer can be started on a remote CPU, in which
                 * case softirq_expires_next of that remote CPU has to be
                 * updated. The soft hrtimer cannot expire before the first
                 * hard hrtimer of the remote CPU because
                 * hrtimer_check_target() prevents that.
                 */
                if (owner->softirq_activated)
                        return;

                if (!ktime_before(expires, owner->softirq_expires_next))
                        return;

                owner->softirq_next_timer = timer;
                owner->softirq_expires_next = expires;

                if (!reprogram ||
                    !ktime_before(expires, owner->expires_next))
                        return;
        }

        /*
         * Nothing to do when:
         *  - the timer lives on another CPU, whose clock event device
         *    cannot be reprogrammed from here
         *  - the timer does not expire before the currently armed event
         *  - the hrtimer interrupt is running, as it reevaluates the clock
         *    bases and reprograms the device itself
         */
        if (owner != local || expires >= local->expires_next ||
            local->in_hrtirq)
                return;

        local->next_timer = timer;

        __hrtimer_reprogram(local, timer, expires);
}

/*
 * update_needs_ipi - refresh the clock offsets of @cpu_base and decide
 *                    whether the owning CPU has to be interrupted
 * @cpu_base:        the per CPU base to update and inspect
 * @active:        bitmask of the clock bases affected by the clock change
 *
 * The caller holds cpu_base->lock.
 *
 * Returns true when the first expiring timer of an affected clock base now
 * expires before the event (hard or soft) that @cpu_base has recorded as
 * its next one, false otherwise.
 */
static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
                             unsigned int active)
{
        struct hrtimer_clock_base *base;
        unsigned int seq;
        ktime_t expires;

        /*
         * Update the base offsets unconditionally so that the following
         * checks whether the SMP function call is required work.
         *
         * The update is safe even when the remote CPU is in the hrtimer
         * interrupt or the hrtimer soft interrupt and expiring affected
         * bases. Either it will see the update before handling a base or
         * it will see it when it finishes the processing and reevaluates
         * the next expiring timer.
         */
        seq = cpu_base->clock_was_set_seq;
        hrtimer_update_base(cpu_base);

        /*
         * If the sequence did not change over the update then the
         * remote CPU already handled it.
         */
        if (seq == cpu_base->clock_was_set_seq)
                return false;

        /*
         * If the remote CPU is currently handling an hrtimer interrupt, it
         * will reevaluate the first expiring timer of all clock bases
         * before reprogramming. Nothing to do here.
         */
        if (cpu_base->in_hrtirq)
                return false;

        /*
         * Walk the affected clock bases and check whether the first expiring
         * timer in a clock base is moving ahead of the first expiring timer of
         * @cpu_base. If so, the IPI must be invoked because per CPU clock
         * event devices cannot be remotely reprogrammed.
         */
        active &= cpu_base->active_bases;

        for_each_active_base(base, cpu_base, active) {
                struct timerqueue_node *next;

                next = timerqueue_getnext(&base->active);
                expires = ktime_sub(next->expires, base->offset);
                if (expires < cpu_base->expires_next)
                        return true;

                /*
                 * Extra check for softirq clock bases.
                 *
                 * Whether a base is a softirq base is determined by its
                 * position in cpu_base->clock_base[], i.e. base->index.
                 * base->clockid holds the CLOCK_* id, which lives in a
                 * different number space than HRTIMER_BASE_*_SOFT, so it
                 * must not be used for this comparison.
                 */
                if (base->index < HRTIMER_BASE_MONOTONIC_SOFT)
                        continue;
                if (cpu_base->softirq_activated)
                        continue;
                if (expires < cpu_base->softirq_expires_next)
                        return true;
        }
        return false;
}

/*
 * The clock was set, which can affect CLOCK_REALTIME, CLOCK_TAI and
 * CLOCK_BOOTTIME (the latter for late sleep time injection).
 *
 * The offsets of these clocks vs. CLOCK_MONOTONIC have to be updated.
 * With high resolution timers enabled the per CPU clock event devices
 * may have to be reprogrammed as well, namely when the change moves an
 * affected timer ahead of the first expiring timer on that CPU. Remote
 * per CPU clock event devices cannot be reprogrammed from here, hence
 * the IPI. The second reason for an IPI is !HIGH_RES combined with NOHZ
 * mode: there the offsets are updated in the tick, which might be
 * stopped, so a remote CPU which sleeps in idle has to be brought out to
 * get this sorted.
 *
 * @bases is the mask of affected clock bases handed to update_needs_ipi().
 * timerfd is notified in every case.
 */
void clock_was_set(unsigned int bases)
{
        struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases);
        cpumask_var_t mask;
        int cpu;

        if (hrtimer_hres_active(cpu_base) || tick_nohz_active) {
                if (zalloc_cpumask_var(&mask, GFP_KERNEL)) {
                        /* Avoid interrupting CPUs if possible */
                        cpus_read_lock();
                        for_each_online_cpu(cpu) {
                                struct hrtimer_cpu_base *remote;
                                unsigned long flags;

                                remote = &per_cpu(hrtimer_bases, cpu);
                                raw_spin_lock_irqsave(&remote->lock, flags);

                                if (update_needs_ipi(remote, bases))
                                        cpumask_set_cpu(cpu, mask);

                                raw_spin_unlock_irqrestore(&remote->lock, flags);
                        }

                        preempt_disable();
                        smp_call_function_many(mask, retrigger_next_event, NULL, 1);
                        preempt_enable();
                        cpus_read_unlock();
                        free_cpumask_var(mask);
                } else {
                        /* No cpumask available: retrigger on every CPU */
                        on_each_cpu(retrigger_next_event, NULL, 1);
                }
        }

        timerfd_clock_was_set();
}

/*
 * Work callback of hrtimer_work: invokes clock_was_set() with the
 * CLOCK_SET_WALL base mask. @work is not used.
 */
static void clock_was_set_work(struct work_struct *work)
{
        clock_was_set(CLOCK_SET_WALL);
}

/* Work item which clock_was_set_delayed() hands to schedule_work() */
static DECLARE_WORK(hrtimer_work, clock_was_set_work);

/*
 * Called from timekeeping code to reprogram the hrtimer interrupt device
 * on all cpus and to notify timerfd.
 *
 * Nothing is done synchronously here: the request is only queued as
 * hrtimer_work, whose callback performs the clock_was_set() call.
 */
void clock_was_set_delayed(void)
{
        schedule_work(&hrtimer_work);
}

/*
 * Called during resume either directly via timekeeping_resume() or, in
 * the case of s2idle, from tick_unfreeze() to ensure that the hrtimers
 * are up to date.
 *
 * Must be called with interrupts disabled; only the local CPU is handled.
 */
void hrtimers_resume_local(void)
{
        lockdep_assert_irqs_disabled();
        /* Retrigger on the local CPU */
        retrigger_next_event(NULL);
}

/*
 * Counterpart to lock_hrtimer_base above:
 *
 * Drops the lock of the cpu_base that @timer's clock base belongs to and
 * restores the interrupt state saved in *@flags.
 */
static inline
void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
        __releases(&timer->base->cpu_base->lock)
{
        raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
}

/**
 * hrtimer_forward() - forward the timer expiry
 * @timer:        hrtimer to forward
 * @now:        forward past this time
 * @interval:        the interval to forward
 *
 * Move the expiry of @timer forward in multiples of @interval until it
 * lies in the future relative to @now. An @interval smaller than
 * hrtimer_resolution is raised to hrtimer_resolution.
 *
 * .. note::
 *  This only updates the timer expiry value and does not requeue the timer.
 *
 * There is also a variant of the function hrtimer_forward_now().
 *
 * Context: Can be safely called from the callback function of @timer. If called
 *          from other contexts @timer must neither be enqueued nor running the
 *          callback and the caller needs to take care of serialization.
 *
 * Return: The number of overruns. 0 is returned when the expiry already
 *         lies after @now or when @timer is enqueued (which also warns).
 */
u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
{
        ktime_t behind = ktime_sub(now, hrtimer_get_expires(timer));
        u64 overruns;
        s64 step;

        /* The expiry is still ahead of @now: nothing to forward */
        if (behind < 0)
                return 0;

        if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
                return 0;

        if (interval < hrtimer_resolution)
                interval = hrtimer_resolution;

        /* Common case: a single interval moves the expiry past @now */
        if (likely(behind < interval)) {
                hrtimer_add_expires(timer, interval);
                return 1;
        }

        /* More than one interval was missed: skip all of them in one go */
        step = ktime_to_ns(interval);
        overruns = ktime_divns(behind, step);
        hrtimer_add_expires_ns(timer, step * overruns);
        if (hrtimer_get_expires_tv64(timer) > now)
                return overruns;

        /*
         * The division landed exactly on (or before) @now. One more
         * interval, counted as one more overrun, is the correction.
         */
        hrtimer_add_expires(timer, interval);

        return overruns + 1;
}
EXPORT_SYMBOL_GPL(hrtimer_forward);

/*
 * enqueue_hrtimer - internal function to (re)start a timer
 *
 * The timer is inserted in expiry order. Insertion into the
 * red black tree is O(log(n)). Must hold the base lock.
 *
 * Returns 1 when the new timer is the leftmost timer in the tree.
 */
static int enqueue_hrtimer(struct hrtimer *timer,
                           struct hrtimer_clock_base *base,
                           enum hrtimer_mode mode)
{
        debug_activate(timer, mode);
        WARN_ON_ONCE(!base->cpu_base->online);

        base->cpu_base->active_bases |= 1 << base->index;

        /* Pairs with the lockless read in hrtimer_is_queued() */
        WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED);

        return timerqueue_add(&base->active, &timer->node);
}

/*
 * __remove_hrtimer - internal function to remove a timer
 *
 * Sets the state of @timer to @newstate and, if it was enqueued, takes it
 * off the queue of @base. Caller must hold the base lock.
 *
 * In high resolution timer mode the clock event device is reprogrammed
 * when the removed timer is the one which expires next. A caller which
 * reprograms anyway (e.g. the timer interrupt) can suppress that by
 * passing @reprogram as zero.
 */
static void __remove_hrtimer(struct hrtimer *timer,
                             struct hrtimer_clock_base *base,
                             u8 newstate, int reprogram)
{
        struct hrtimer_cpu_base *cpu_base = base->cpu_base;
        bool was_queued = timer->state & HRTIMER_STATE_ENQUEUED;

        /* Pairs with the lockless read in hrtimer_is_queued() */
        WRITE_ONCE(timer->state, newstate);
        if (!was_queued)
                return;

        /* Clear the active bit once the queue of this base ran empty */
        if (!timerqueue_del(&base->active, &timer->node))
                cpu_base->active_bases &= ~(1 << base->index);

        /*
         * Note: With @reprogram == 0 cpu_base->next_timer is left alone.
         * That is the case when the first timer on a remote cpu is
         * removed. It does no harm because cpu_base->next_timer is never
         * dereferenced. The worst outcome is a superfluous call to
         * hrtimer_force_reprogram() on the remote cpu later on, if the
         * same timer gets enqueued again.
         */
        if (!reprogram)
                return;

        if (timer == cpu_base->next_timer)
                hrtimer_force_reprogram(cpu_base, 1);
}

/*
 * remove hrtimer, called with base lock held
 *
 * Returns 1 when @timer was enqueued and has been removed, 0 otherwise.
 */
static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
               bool restart, bool keep_local)
{
        u8 state = timer->state;
        bool reprogram;

        if (!(state & HRTIMER_STATE_ENQUEUED))
                return 0;

        /*
         * The removal forces a reprogram when high resolution mode is
         * active and the timer is queued on the current CPU. For a timer
         * on another CPU the reprogramming is skipped: the interrupt event
         * on that CPU fires and the interrupt handler reprograms. That is
         * a rare case and cheaper than a smp call.
         */
        debug_deactivate(timer);
        reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);

        /*
         * A local timer which is about to be restarted would be programmed
         * twice: on removal and a moment later on requeue. @keep_local
         * suppresses the first one. A timer which is not restarted becomes
         * inactive and a local one always requires the reprogram.
         */
        if (restart)
                reprogram &= !keep_local;
        else
                state = HRTIMER_STATE_INACTIVE;

        __remove_hrtimer(timer, base, state, reprogram);
        return 1;
}

/*
 * Adjust the expiry @tim of a relative timer for CONFIG_TIME_LOW_RES.
 *
 * CONFIG_TIME_LOW_RES indicates that the system has no way to return
 * granular time values. To prevent short timeouts hrtimer_resolution
 * (i.e. one jiffy) is added to relative timers, and the relative property
 * is recorded in timer->is_rel. Without CONFIG_TIME_LOW_RES @tim is
 * returned unchanged and @timer is not touched.
 */
static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
                                            const enum hrtimer_mode mode)
{
        if (!IS_ENABLED(CONFIG_TIME_LOW_RES))
                return tim;

        timer->is_rel = mode & HRTIMER_MODE_REL;

        return timer->is_rel ? ktime_add_safe(tim, hrtimer_resolution) : tim;
}

/*
 * Look up the next expiring soft hrtimer of @cpu_base and hand it to
 * hrtimer_reprogram(). Does nothing when no soft timer is pending.
 */
static void
hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
{
        /* Find the next SOFT expiration */
        ktime_t next_soft = __hrtimer_get_next_event(cpu_base,
                                                     HRTIMER_ACTIVE_SOFT);

        /*
         * Unless there is no soft timer at all, the reprogram path has to
         * be taken even when the next soft hrtimer expires at the same
         * time as the next hard hrtimer, because
         * cpu_base->softirq_expires_next needs to be updated!
         *
         * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
         * cpu_base->*expires_next is only set by hrtimer_reprogram()
         */
        if (next_soft != KTIME_MAX)
                hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
}

/*
 * (Re)start @timer with expiry @tim and slack @delta_ns on @base or, when
 * the timer is allowed to move, on the base picked by switch_hrtimer_base().
 *
 * Returns nonzero when the timer became the first expiring timer of the
 * base it was queued on and the caller still has to reprogram. Returns 0
 * otherwise, including the case where the hardware was already force
 * reprogrammed here.
 */
static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
                                    u64 delta_ns, const enum hrtimer_mode mode,
                                    struct hrtimer_clock_base *base)
{
        struct hrtimer_cpu_base *this_base = this_cpu_ptr(&hrtimer_bases);
        struct hrtimer_clock_base *target;
        bool stay_local, first;

        /*
         * A timer which sits on the local cpu base and is the first
         * expiring one could cause the hardware to be reprogrammed twice
         * (on removal and on enqueue). Prevent that by skipping the
         * reprogram on removal, keeping the timer local to the current CPU
         * and enforcing a reprogram after it is queued, no matter whether
         * it is the new first expiring timer again or not.
         */
        stay_local = base->cpu_base == this_base &&
                     base->cpu_base->next_timer == timer;

        /*
         * Remove an active timer from the queue. If it is not queued on
         * the current CPU, remove_hrtimer() has to update the remote data
         * correctly. If it is local and first expiring, the reprogram is
         * skipped here and enforced below.
         */
        remove_hrtimer(timer, base, true, stay_local);

        if (mode & HRTIMER_MODE_REL)
                tim = ktime_add_safe(tim, base->get_time());

        tim = hrtimer_update_lowres(timer, tim, mode);

        hrtimer_set_expires_range_ns(timer, tim, delta_ns);

        /* Switch the timer base, if necessary: */
        target = stay_local ? base :
                 switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);

        first = enqueue_hrtimer(timer, target, mode);
        if (stay_local) {
                /*
                 * The timer was forced to stay on the current CPU to avoid
                 * reprogramming on removal and enqueue. Force reprogram
                 * the hardware by evaluating the new first expiring timer.
                 */
                hrtimer_force_reprogram(target->cpu_base, 1);
                return 0;
        }

        return first;
}

/**
 * hrtimer_start_range_ns - (re)start an hrtimer
 * @timer:        the timer to be added
 * @tim:        expiry time
 * @delta_ns:        "slack" range for the timer
 * @mode:        timer mode: absolute (HRTIMER_MODE_ABS) or
 *                relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
 *                softirq based mode is considered for debug purpose only!
 */
void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
                            u64 delta_ns, const enum hrtimer_mode mode)
{
        struct hrtimer_clock_base *base;
        unsigned long flags;

        /*
         * Debug check: with PREEMPT_RT the hard expiry mode has to match
         * hrtimer.is_hard, because unmarked timers are moved to softirq
         * expiry there. Without PREEMPT_RT the HRTIMER_MODE_SOFT bit has
         * to match hrtimer.is_soft.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) != !timer->is_hard);
        else
                WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) != !timer->is_soft);

        base = lock_hrtimer_base(timer, &flags);

        /* Nonzero means: new first expiring timer, reprogram required */
        if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
                hrtimer_reprogram(timer, true);

        unlock_hrtimer_base(timer, &flags);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);

/**
 * hrtimer_try_to_cancel - try to deactivate a timer
 * @timer:        hrtimer to stop
 *
 * Returns:
 *
 *  *  0 when the timer was not active
 *  *  1 when the timer was active
 *  * -1 when the timer is currently executing the callback function and
 *    cannot be stopped
 */
int hrtimer_try_to_cancel(struct hrtimer *timer)
{
        struct hrtimer_clock_base *base;
        unsigned long flags;
        int ret;

        /*
         * Lockless check first: a timer which is not active (neither
         * enqueued nor running the callback) needs no work. The base lock
         * does not serialize against a concurrent enqueue, so there is no
         * point in taking it for this case.
         */
        if (!hrtimer_active(timer))
                return 0;

        base = lock_hrtimer_base(timer, &flags);

        if (hrtimer_callback_running(timer))
                ret = -1;
        else
                ret = remove_hrtimer(timer, base, false, false);

        unlock_hrtimer_base(timer, &flags);

        return ret;
}
EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);

#ifdef CONFIG_PREEMPT_RT
/* PREEMPT_RT: initialize the softirq expiry lock of @base */
static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base)
{
        spin_lock_init(&base->softirq_expiry_lock);
}

/* PREEMPT_RT: acquire the softirq expiry lock of @base */
static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
{
        spin_lock(&base->softirq_expiry_lock);
}

/* PREEMPT_RT: release the softirq expiry lock of @base */
static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
{
        spin_unlock(&base->softirq_expiry_lock);
}

/*
 * The counterpart to hrtimer_cancel_wait_running().
 *
 * If there is a waiter for cpu_base->softirq_expiry_lock, then it was
 * waiting for the timer callback to finish. Drop softirq_expiry_lock and
 * reacquire it. That allows the waiter to acquire the lock and make
 * progress.
 *
 * Entered with cpu_base->lock and cpu_base->softirq_expiry_lock held; both
 * are held again on return.
 */
static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
                                      unsigned long flags)
{
        /* Only go through the unlock/lock cycle when somebody is waiting */
        if (atomic_read(&cpu_base->timer_waiters)) {
                /* cpu_base->lock is dropped first and retaken last */
                raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
                spin_unlock(&cpu_base->softirq_expiry_lock);
                spin_lock(&cpu_base->softirq_expiry_lock);
                raw_spin_lock_irq(&cpu_base->lock);
        }
}

/*
 * This function is called on PREEMPT_RT kernels when the fast path
 * deletion of a timer failed because the timer callback function was
 * running (see the retry loop in hrtimer_cancel()).
 *
 * This prevents priority inversion: if the soft irq thread is preempted
 * in the middle of a timer callback, then spin waiting for the callback
 * in the cancel path can lead to two issues:
 *
 *  - If the caller is on a remote CPU then it has to spin wait for the timer
 *    handler to complete. This can result in unbound priority inversion.
 *
 *  - If the caller originates from the task which preempted the timer
 *    handler on the same CPU, then spin waiting for the timer handler to
 *    complete is never going to end.
 */
void hrtimer_cancel_wait_running(const struct hrtimer *timer)
{
        /* Lockless read. Prevent the compiler from reloading it below */
        struct hrtimer_clock_base *base = READ_ONCE(timer->base);

        /*
         * Just relax if the timer expires in hard interrupt context or if
         * it is currently on the migration base.
         */
        if (!timer->is_soft || is_migration_base(base)) {
                cpu_relax();
                return;
        }

        /*
         * Mark the base as contended and grab the expiry lock, which is
         * held by the softirq across the timer callback. Drop the lock
         * immediately so the softirq can expire the next timer. In theory
         * the timer could already be running again, but that's more than
         * unlikely and just causes another wait loop.
         *
         * timer_waiters is what hrtimer_sync_wait_running() checks to
         * decide whether it has to drop the expiry lock.
         */
        atomic_inc(&base->cpu_base->timer_waiters);
        spin_lock_bh(&base->cpu_base->softirq_expiry_lock);
        atomic_dec(&base->cpu_base->timer_waiters);
        spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
}
#else
/*
 * Empty !CONFIG_PREEMPT_RT variants of the expiry lock helpers: no expiry
 * lock is initialized, taken or released and there is nothing to sync.
 */
static inline void
hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
static inline void
hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
static inline void
hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base,
                                             unsigned long flags) { }
#endif

/**
 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
 * @timer:        the timer to be cancelled
 *
 * Retries hrtimer_try_to_cancel() for as long as it reports that the
 * callback of @timer is running.
 *
 * Returns:
 *  0 when the timer was not active
 *  1 when the timer was active
 */
int hrtimer_cancel(struct hrtimer *timer)
{
        int ret;

        for (;;) {
                ret = hrtimer_try_to_cancel(timer);
                if (ret >= 0)
                        return ret;

                /* The callback is running: wait for it, then try again */
                hrtimer_cancel_wait_running(timer);
        }
}
EXPORT_SYMBOL_GPL(hrtimer_cancel);

/**
 * __hrtimer_get_remaining - get remaining time for the timer
 * @timer:        the timer to read
 * @adjust:        adjust relative timers when CONFIG_TIME_LOW_RES=y
 *
 * The remaining time is read with the timer base locked.
 */
ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
{
        bool lowres_adjust = IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust;
        unsigned long flags;
        ktime_t rem;

        lock_hrtimer_base(timer, &flags);
        rem = lowres_adjust ? hrtimer_expires_remaining_adjusted(timer) :
                              hrtimer_expires_remaining(timer);
        unlock_hrtimer_base(timer, &flags);

        return rem;
}
EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);

#ifdef CONFIG_NO_HZ_COMMON
/**
 * hrtimer_get_next_event - get the expiry time of the next event
 *
 * Returns the next expiry time of the local CPU or KTIME_MAX if no timer
 * is pending. KTIME_MAX is also returned when high resolution mode is
 * active, as the timer queues are not evaluated in that case.
 */
u64 hrtimer_get_next_event(void)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        unsigned long flags;
        u64 next;

        raw_spin_lock_irqsave(&cpu_base->lock, flags);

        if (hrtimer_hres_active(cpu_base))
                next = KTIME_MAX;
        else
                next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);

        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        return next;
}

/**
 * hrtimer_next_event_without - next expiry event w/o one timer
 * @exclude:        timer to exclude
 *
 * Returns the next expiry time over all timers of the local CPU except for
 * the @exclude one, or KTIME_MAX if none of them is pending. The timer
 * queues are only evaluated when high resolution mode is active, otherwise
 * KTIME_MAX is returned.
 */
u64 hrtimer_next_event_without(const struct hrtimer *exclude)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        unsigned int soft, hard;
        unsigned long flags;
        u64 next = KTIME_MAX;

        raw_spin_lock_irqsave(&cpu_base->lock, flags);

        if (!hrtimer_hres_active(cpu_base))
                goto unlock;

        /* Soft bases are left out while softirq_activated is set */
        if (!cpu_base->softirq_activated) {
                soft = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
                next = __hrtimer_next_event_base(cpu_base, exclude, soft,
                                                 KTIME_MAX);
        }

        /* The hard bases are evaluated against the result so far */
        hard = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
        next = __hrtimer_next_event_base(cpu_base, exclude, hard, next);

unlock:
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        return next;
}
#endif

/*
 * Translate @clock_id into an hrtimer clock base index by means of
 * hrtimer_clock_to_base_table[].
 *
 * A clock id which is out of range or has no hrtimer base assigned in the
 * table triggers a warning and falls back to HRTIMER_BASE_MONOTONIC.
 */
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
        /*
         * clock_id is a signed quantity (note the %d below), so the lower
         * bound has to be checked as well. Otherwise a negative id would
         * pass the MAX_CLOCKS comparison and index in front of the table.
         */
        if (likely(clock_id >= 0 && clock_id < MAX_CLOCKS)) {
                int base = hrtimer_clock_to_base_table[clock_id];

                /* HRTIMER_MAX_CLOCK_BASES marks "no base for this clock" */
                if (likely(base != HRTIMER_MAX_CLOCK_BASES))
                        return base;
        }
        WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
        return HRTIMER_BASE_MONOTONIC;
}

/*
 * Zero @timer and attach it to the clock base of the current CPU which
 * matches @clock_id and the soft/hard expiry mode selected by @mode.
 */
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
                           enum hrtimer_mode mode)
{
        bool hard = !!(mode & HRTIMER_MODE_HARD);
        bool soft = !!(mode & HRTIMER_MODE_SOFT);
        struct hrtimer_cpu_base *cpu_base;
        int idx;

        /*
         * PREEMPT_RT kernels move every hrtimer which is not explicitly
         * marked for hard interrupt expiry into soft interrupt context.
         * That is done for latency reasons and because the callbacks can
         * invoke functions which might sleep on RT, e.g. spin_lock().
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && !hard)
                soft = true;

        memset(timer, 0, sizeof(*timer));

        cpu_base = raw_cpu_ptr(&hrtimer_bases);

        /*
         * POSIX magic: Relative CLOCK_REALTIME timers are not affected by
         * clock modifications, so they need to become CLOCK_MONOTONIC to
         * ensure POSIX compliance.
         */
        if (clock_id == CLOCK_REALTIME && (mode & HRTIMER_MODE_REL))
                clock_id = CLOCK_MONOTONIC;

        /* The soft bases occupy the second half of clock_base[] */
        idx = hrtimer_clockid_to_base(clock_id);
        if (soft)
                idx += HRTIMER_MAX_CLOCK_BASES / 2;

        timer->is_soft = soft;
        timer->is_hard = hard;
        timer->base = &cpu_base->clock_base[idx];
        timerqueue_init(&timer->node);
}

/**
 * hrtimer_init - initialize a timer to the given clock
 * @timer:        the timer to be initialized
 * @clock_id:        the clock to be used
 * @mode:       The modes which are relevant for initialization:
 *              HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
 *              HRTIMER_MODE_REL_SOFT
 *
 *              The PINNED variants of the above can be handed in,
 *              but the PINNED bit is ignored as pinning happens
 *              when the hrtimer is started
 *
 * Invokes the debug hook and then does the actual initialization in
 * __hrtimer_init().
 */
void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
                  enum hrtimer_mode mode)
{
        debug_init(timer, clock_id, mode);
        __hrtimer_init(timer, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init);

/*
 * A timer is active, when it is enqueued into the rbtree or the
 * callback function is running or it's in the state of being migrated
 * to another cpu.
 *
 * It is important for this function to not return a false negative.
 *
 * The check is lockless: it is repeated until timer->state and
 * base->running were sampled without the sequence count of the base or
 * timer->base itself changing in between.
 */
bool hrtimer_active(const struct hrtimer *timer)
{
        struct hrtimer_clock_base *base;
        unsigned int seq;

        do {
                base = READ_ONCE(timer->base);
                seq = raw_read_seqcount_begin(&base->seq);

                /* Either condition is sufficient, no need to validate */
                if (timer->state != HRTIMER_STATE_INACTIVE ||
                    base->running == timer)
                        return true;

                /*
                 * "Not active" is only trusted when nothing changed while
                 * looking: neither the sequence count nor the base the
                 * timer is assigned to.
                 */
        } while (read_seqcount_retry(&base->seq, seq) ||
                 base != READ_ONCE(timer->base));

        return false;
}
EXPORT_SYMBOL_GPL(hrtimer_active);

/*
 * The raw_write_seqcount_barrier()s in __run_hrtimer() split the thing
 * into 3 distinct sections:
 *
 *  - queued:        the timer is queued
 *  - callback:        the timer is being run
 *  - post:        the timer is inactive or (re)queued
 *
 * On the read side we ensure we observe timer->state and base->running
 * from the same section; if anything changed while we looked at it, we retry.
 * This includes timer->base changing because sequence numbers alone are
 * insufficient for that.
 *
 * The sequence numbers are required because otherwise we could still observe
 * a false negative if the read side got smeared over multiple consecutive
 * __run_hrtimer() invocations.
 */

/*
 * Expire one timer: take @timer off @base, invoke its callback with
 * cpu_base->lock dropped and requeue the timer when the callback asks for
 * a restart.
 *
 * @now is only handed to the expiry tracepoint. @flags is the interrupt
 * state which is restored when the lock is dropped for the callback.
 *
 * Called with cpu_base->lock held. The lock is held again on return, but
 * it is reacquired with raw_spin_lock_irq().
 */
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
                          struct hrtimer_clock_base *base,
                          struct hrtimer *timer, ktime_t *now,
                          unsigned long flags) __must_hold(&cpu_base->lock)
{
        enum hrtimer_restart (*fn)(struct hrtimer *);
        bool expires_in_hardirq;
        int restart;

        lockdep_assert_held(&cpu_base->lock);

        debug_deactivate(timer);
        base->running = timer;

        /*
         * Separate the ->running assignment from the ->state assignment.
         *
         * As with a regular write barrier, this ensures the read side in
         * hrtimer_active() cannot observe base->running == NULL &&
         * timer->state == INACTIVE.
         */
        raw_write_seqcount_barrier(&base->seq);

        /* No reprogramming on removal (last argument is 0) */
        __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
        /* Pick up the callback while the lock is still held */
        fn = timer->function;

        /*
         * Clear the 'is relative' flag for the TIME_LOW_RES case. If the
         * timer is restarted with a period then it becomes an absolute
         * timer. If it's not restarted it does not matter.
         */
        if (IS_ENABLED(CONFIG_TIME_LOW_RES))
                timer->is_rel = false;

        /*
         * The timer is marked as running in the CPU base, so it is
         * protected against migration to a different CPU even if the lock
         * is dropped.
         */
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
        trace_hrtimer_expire_entry(timer, now);
        expires_in_hardirq = lockdep_hrtimer_enter(timer);

        restart = fn(timer);

        lockdep_hrtimer_exit(expires_in_hardirq);
        trace_hrtimer_expire_exit(timer);
        raw_spin_lock_irq(&cpu_base->lock);

        /*
         * Note: We clear the running state after enqueue_hrtimer and
         * we do not reprogram the event hardware. Happens either in
         * hrtimer_start_range_ns() or in hrtimer_interrupt()
         *
         * Note: Because we dropped the cpu_base->lock above,
         * hrtimer_start_range_ns() can have popped in and enqueued the timer
         * for us already.
         */
        if (restart != HRTIMER_NORESTART &&
            !(timer->state & HRTIMER_STATE_ENQUEUED))
                enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);

        /*
         * Separate the ->running assignment from the ->state assignment.
         *
         * As with a regular write barrier, this ensures the read side in
         * hrtimer_active() cannot observe base->running == NULL &&
         * timer->state == INACTIVE.
         */
        raw_write_seqcount_barrier(&base->seq);

        WARN_ON_ONCE(base->running != timer);
        base->running = NULL;
}

/*
 * Expire the due timers on every clock base of @cpu_base that is both
 * active and selected by @active_mask.
 *
 * @now is the time reference before the per-base offset is applied and
 * @flags is handed through to __run_hrtimer() and
 * hrtimer_sync_wait_running().
 */
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
                                 unsigned long flags, unsigned int active_mask)
{
        unsigned int pending = cpu_base->active_bases & active_mask;
        struct hrtimer_clock_base *base;

        for_each_active_base(base, cpu_base, pending) {
                /* Current time as seen by this particular clock base. */
                ktime_t base_now = ktime_add(now, base->offset);

                for (;;) {
                        struct timerqueue_node *first;
                        struct hrtimer *timer;

                        first = timerqueue_getnext(&base->active);
                        if (!first)
                                break;

                        timer = container_of(first, struct hrtimer, node);

                        /*
                         * Soft expiry exists to minimize wakeups, not to
                         * run a timer at the earliest interrupt after its
                         * soft expiration. That lets us keep the simple
                         * BST we already have instead of a Priority
                         * Search Tree answering stabbing queries for
                         * overlapping intervals.
                         * Delaying timers queued right-of a timer which has
                         * not expired yet adds no wakeups, because that
                         * timer has to trigger a wakeup anyway.
                         */
                        if (hrtimer_get_softexpires_tv64(timer) > base_now)
                                break;

                        __run_hrtimer(cpu_base, base, timer, &base_now, flags);

                        /* Only done after a callback run from the soft pass. */
                        if (active_mask == HRTIMER_ACTIVE_SOFT)
                                hrtimer_sync_wait_running(cpu_base, flags);
                }
        }
}

/*
 * HRTIMER_SOFTIRQ handler: expire the soft timers of the local CPU base.
 *
 * The expiry lock is taken around the whole run and cpu_base->lock is held
 * with interrupts saved/disabled while the queues are processed. @h is
 * not used.
 */
static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
{
        unsigned long irqflags;
        struct hrtimer_cpu_base *local_base = this_cpu_ptr(&hrtimer_bases);

        hrtimer_cpu_base_lock_expiry(local_base);
        raw_spin_lock_irqsave(&local_base->lock, irqflags);

        /* Refresh the base time, then run whatever soft timers are due. */
        __hrtimer_run_queues(local_base, hrtimer_update_base(local_base),
                             irqflags, HRTIMER_ACTIVE_SOFT);

        /* The soft pass is over; re-evaluate the next soft timer. */
        local_base->softirq_activated = 0;
        hrtimer_update_softirq_timer(local_base, true);

        raw_spin_unlock_irqrestore(&local_base->lock, irqflags);
        hrtimer_cpu_base_unlock_expiry(local_base);
}

#ifdef CONFIG_HIGH_RES_TIMERS

/*
 * High resolution timer interrupt
 * Called with interrupts disabled
 *
 * Runs the expired hard timers of the local CPU base, raises
 * HRTIMER_SOFTIRQ when soft timers are due, and reprograms the event
 * device for the next expiry. If programming fails because the next expiry
 * has already passed, the queues are re-run; after three passes the
 * interrupt is declared hung and the next event is pushed out by the time
 * spent in here (capped at 100ms).
 *
 * @dev: the clock event device; only its ->next_event is written here.
 */
void hrtimer_interrupt(struct clock_event_device *dev)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        ktime_t expires_next, now, entry_time, delta;
        unsigned long flags;
        int retries = 0;

        /* This path must only be entered once high resolution mode is on. */
        BUG_ON(!cpu_base->hres_active);
        cpu_base->nr_events++;
        dev->next_event = KTIME_MAX;

        raw_spin_lock_irqsave(&cpu_base->lock, flags);
        /* entry_time is kept to measure the total time spent in here. */
        entry_time = now = hrtimer_update_base(cpu_base);
retry:
        cpu_base->in_hrtirq = 1;
        /*
         * We set expires_next to KTIME_MAX here with cpu_base->lock
         * held to prevent that a timer is enqueued in our queue via
         * the migration code. This does not affect enqueueing of
         * timers which run their callback and need to be requeued on
         * this CPU.
         */
        cpu_base->expires_next = KTIME_MAX;

        /*
         * Soft timers are due: mark the soft expiry as activated and raise
         * HRTIMER_SOFTIRQ so they get run from the softirq handler.
         */
        if (!ktime_before(now, cpu_base->softirq_expires_next)) {
                cpu_base->softirq_expires_next = KTIME_MAX;
                cpu_base->softirq_activated = 1;
                raise_softirq_irqoff(HRTIMER_SOFTIRQ);
        }

        /* Only the hard timers are expired directly in interrupt context. */
        __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);

        /* Reevaluate the clock bases for the [soft] next expiry */
        expires_next = hrtimer_update_next_event(cpu_base);
        /*
         * Store the new expiry value so the migration code can verify
         * against it.
         */
        cpu_base->expires_next = expires_next;
        cpu_base->in_hrtirq = 0;
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        /* Reprogramming necessary ? */
        if (!tick_program_event(expires_next, 0)) {
                /* Programming succeeded: clear any earlier hang indication. */
                cpu_base->hang_detected = 0;
                return;
        }

        /*
         * The next timer was already expired due to:
         * - tracing
         * - long lasting callbacks
         * - being scheduled away when running in a VM
         *
         * We need to prevent that we loop forever in the hrtimer
         * interrupt routine. We give it 3 attempts to avoid
         * overreacting on some spurious event.
         *
         * Acquire base lock for updating the offsets and retrieving
         * the current time.
         */
        raw_spin_lock_irqsave(&cpu_base->lock, flags);
        now = hrtimer_update_base(cpu_base);
        cpu_base->nr_retries++;
        if (++retries < 3)
                goto retry;
        /*
         * Give the system a chance to do something else than looping
         * here. We stored the entry time, so we know exactly how long
         * we spent here. We schedule the next event this amount of
         * time away.
         */
        cpu_base->nr_hangs++;
        cpu_base->hang_detected = 1;
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        delta = ktime_sub(now, entry_time);
        /*
         * NOTE(review): delta is truncated to unsigned int for the
         * max_hang_time statistic, so a hang longer than UINT_MAX ns would
         * be recorded modulo 2^32 - presumably acceptable, confirm.
         */
        if ((unsigned int)delta > cpu_base->max_hang_time)
                cpu_base->max_hang_time = (unsigned int) delta;
        /*
         * Limit it to a sensible value as we enforce a longer
         * delay. Give the CPU at least 100ms to catch up.
         */
        if (delta > 100 * NSEC_PER_MSEC)
                expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
        else
                expires_next = ktime_add(now, delta);
        /* Second argument is 1 here (0 above) - presumably "force"; confirm. */
        tick_program_event(expires_next, 1);
        pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
}
#endif /* CONFIG_HIGH_RES_TIMERS */

/*
 * Called from run_local_timers in hardirq context every jiffy.
 *
 * Does nothing when high resolution mode is active on this CPU. Otherwise
 * it first checks whether a switch to high resolution mode is due; if not,
 * it expires the hard timers of the local CPU base and raises
 * HRTIMER_SOFTIRQ when soft timers are due.
 */
void hrtimer_run_queues(void)
{
        struct hrtimer_cpu_base *local_base = this_cpu_ptr(&hrtimer_bases);
        unsigned long irqflags;
        ktime_t cur;

        if (hrtimer_hres_active(local_base))
                return;

        /*
         * Ugly, but needed: we have to poll here whether we can switch
         * to highres and / or nohz mode. The clocksource switch happens
         * with xtime_lock held, and the notification from there only sets
         * the check bit in the tick_oneshot code because anything more
         * might deadlock vs. xtime_lock.
         */
        if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
                hrtimer_switch_to_hres();
                return;
        }

        raw_spin_lock_irqsave(&local_base->lock, irqflags);
        cur = hrtimer_update_base(local_base);

        /* Soft timers are due: hand them over to the softirq. */
        if (!ktime_before(cur, local_base->softirq_expires_next)) {
                local_base->softirq_expires_next = KTIME_MAX;
                local_base->softirq_activated = 1;
                raise_softirq_irqoff(HRTIMER_SOFTIRQ);
        }

        __hrtimer_run_queues(local_base, cur, irqflags, HRTIMER_ACTIVE_HARD);
        raw_spin_unlock_irqrestore(&local_base->lock, irqflags);
}

/*
 * Sleep related functions:
 */
/*
 * Timer callback of a hrtimer_sleeper: clear the sleeper's task pointer
 * and wake that task, if there was one. The timer is never restarted.
 */
static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
{
        struct hrtimer_sleeper *sleeper;
        struct task_struct *sleeping_task;

        sleeper = container_of(timer, struct hrtimer_sleeper, timer);

        /* Clearing ->task is how the sleeping side sees that we fired. */
        sleeping_task = sleeper->task;
        sleeper->task = NULL;

        if (sleeping_task != NULL)
                wake_up_process(sleeping_task);

        return HRTIMER_NORESTART;
}

/**
 * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer
 * @sl:         sleeper to be started
 * @mode:       timer mode abs/rel
 *
 * Thin wrapper around hrtimer_start_expires() for hrtimer_sleeper based
 * timers, so that PREEMPT_RT can adjust the delivery mode (soft/hardirq
 * context) in one place.
 */
void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
                                   enum hrtimer_mode mode)
{
        /*
         * On RT the enqueue code checks the delivery mode. A sleeper that
         * was initialized for hard interrupt delivery therefore gets the
         * mode bit forced here. hrtimer_sleepers are special in this
         * respect: hrtimer_init_sleeper() picks the delivery mode on RT,
         * which keeps the call sites free of that decision.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                if (sl->timer.is_hard)
                        mode |= HRTIMER_MODE_HARD;
        }

        hrtimer_start_expires(&sl->timer, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);

/*
 * Set up @sl: initialize its timer for @clock_id / @mode, install
 * hrtimer_wakeup() as the callback and record the current task as the one
 * to be woken.
 */
static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
                                   clockid_t clock_id, enum hrtimer_mode mode)
{
        /*
         * PREEMPT_RT moves every hrtimer that is not explicitly marked for
         * hard interrupt expiry into soft interrupt context, either for
         * latency reasons or because its callback takes regular spinlocks
         * or calls other functions that are unsuitable for hard interrupt
         * context on PREEMPT_RT.
         *
         * The hrtimer_sleeper callback itself is fine in hard interrupt
         * context on RT, but latency is a concern: untrusted userspace can
         * spawn many threads that all arm timers for the same expiry time
         * on the same CPU, and waking a gazillion threads at once causes a
         * latency spike.
         *
         * Privileged real-time user space applications, OTOH, depend on
         * the low latency of hard interrupt wakeups. So when the current
         * task is in a real-time scheduling class and soft mode was not
         * requested, mark the mode for hard interrupt expiry.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && task_is_realtime(current) &&
            !(mode & HRTIMER_MODE_SOFT))
                mode |= HRTIMER_MODE_HARD;

        __hrtimer_init(&sl->timer, clock_id, mode);
        sl->timer.function = hrtimer_wakeup;
        sl->task = current;
}

/**
 * hrtimer_init_sleeper - initialize sleeper to the given clock
 * @sl:         sleeper to be initialized
 * @clock_id:   the clock to be used
 * @mode:       timer mode abs/rel
 *
 * Runs the debug init hook for the embedded timer and then performs the
 * actual sleeper setup, which binds the sleeper to the current task.
 */
void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
                          enum hrtimer_mode mode)
{
        debug_init(&sl->timer, clock_id, mode);
        __hrtimer_init_sleeper(sl, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);

/*
 * Copy the remaining time @ts out to the user buffer recorded in @restart,
 * using the layout selected by restart->nanosleep.type.
 *
 * Returns -EFAULT when the copy fails and -ERESTART_RESTARTBLOCK
 * otherwise. Any type other than TT_NATIVE (and TT_COMPAT where compat
 * time is configured) hits BUG().
 */
int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
{
        int failed = 0;

        switch (restart->nanosleep.type) {
#ifdef CONFIG_COMPAT_32BIT_TIME
        case TT_COMPAT:
                failed = put_old_timespec32(ts, restart->nanosleep.compat_rmtp);
                break;
#endif
        case TT_NATIVE:
                failed = put_timespec64(ts, restart->nanosleep.rmtp);
                break;
        default:
                BUG();
        }

        if (failed)
                return -EFAULT;

        return -ERESTART_RESTARTBLOCK;
}

/*
 * Sleep on the sleeper @t until its timer fires or a signal is pending.
 *
 * Returns 0 when the timer expired (the timer callback cleared t->task),
 * or when no time remains. Otherwise the sleep was cut short by a signal:
 * if a remaining-time buffer is registered in current->restart_block the
 * remaining time is copied out and nanosleep_copyout()'s result is
 * returned, else -ERESTART_RESTARTBLOCK.
 */
static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
{
        struct restart_block *restart;

        do {
                /*
                 * The task state is set before the timer is started -
                 * presumably so a wakeup from the timer callback arriving
                 * before schedule() is not lost; confirm against the
                 * scheduler's wakeup rules.
                 */
                set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
                hrtimer_sleeper_start_expires(t, mode);

                /* t->task is cleared by hrtimer_wakeup() once the timer fired. */
                if (likely(t->task))
                        schedule();

                hrtimer_cancel(&t->timer);
                /*
                 * Any further pass restarts the timer in absolute mode -
                 * presumably because the expiry stored in the timer is
                 * absolute after the first start; confirm.
                 */
                mode = HRTIMER_MODE_ABS;

        } while (t->task && !signal_pending(current));

        __set_current_state(TASK_RUNNING);

        /* Timer expired: the sleep completed normally. */
        if (!t->task)
                return 0;

        restart = &current->restart_block;
        if (restart->nanosleep.type != TT_NONE) {
                /* A remaining-time buffer was registered: report what is left. */
                ktime_t rem = hrtimer_expires_remaining(&t->timer);
                struct timespec64 rmt;

                if (rem <= 0)
                        return 0;
                rmt = ktime_to_timespec64(rem);

                return nanosleep_copyout(restart, &rmt);
        }
        return -ERESTART_RESTARTBLOCK;
}

/*
 * Restart function installed by hrtimer_nanosleep(): resume the sleep on
 * the clock and absolute expiry time saved in @restart and return
 * do_nanosleep()'s result.
 */
static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
{
        struct hrtimer_sleeper sleeper;
        int rc;

        hrtimer_init_sleeper_on_stack(&sleeper, restart->nanosleep.clockid,
                                      HRTIMER_MODE_ABS);
        hrtimer_set_expires_tv64(&sleeper.timer, restart->nanosleep.expires);

        rc = do_nanosleep(&sleeper, HRTIMER_MODE_ABS);

        destroy_hrtimer_on_stack(&sleeper.timer);
        return rc;
}

/*
 * Sleep until @rqtp on @clockid, interpreted according to @mode.
 *
 * The current task's timer slack is applied unless it is an rt task, in
 * which case no slack is used. Returns do_nanosleep()'s result, except
 * that an interrupted sleep which would need a restart block is turned
 * into -ERESTARTNOHAND for absolute sleeps; for other modes the restart
 * block is filled in and hrtimer_nanosleep_restart() is installed.
 */
long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
                       const clockid_t clockid)
{
        struct hrtimer_sleeper sleeper;
        u64 slack;
        int ret;

        slack = rt_task(current) ? 0 : current->timer_slack_ns;

        hrtimer_init_sleeper_on_stack(&sleeper, clockid, mode);
        hrtimer_set_expires_range_ns(&sleeper.timer, rqtp, slack);

        ret = do_nanosleep(&sleeper, mode);
        if (ret == -ERESTART_RESTARTBLOCK) {
                if (mode == HRTIMER_MODE_ABS) {
                        /* Absolute timers do not update the rmtp value and restart: */
                        ret = -ERESTARTNOHAND;
                } else {
                        struct restart_block *restart = &current->restart_block;

                        /* Remember clock and expiry for the restart function. */
                        restart->nanosleep.clockid = sleeper.timer.base->clockid;
                        restart->nanosleep.expires =
                                hrtimer_get_expires_tv64(&sleeper.timer);
                        set_restart_fn(restart, hrtimer_nanosleep_restart);
                }
        }

        destroy_hrtimer_on_stack(&sleeper.timer);
        return ret;
}

#ifdef CONFIG_64BIT

/*
 * nanosleep(2) with native timespec layout: relative sleep on
 * CLOCK_MONOTONIC. Fails with -EFAULT if @rqtp cannot be read and with
 * -EINVAL if the requested time is not a valid timespec64. A non-NULL
 * @rmtp is registered so the remaining time can be copied out.
 */
SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
                struct __kernel_timespec __user *, rmtp)
{
        struct restart_block *rb = &current->restart_block;
        struct timespec64 req;

        if (get_timespec64(&req, rqtp))
                return -EFAULT;
        if (!timespec64_valid(&req))
                return -EINVAL;

        rb->fn = do_no_restart_syscall;
        rb->nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
        rb->nanosleep.rmtp = rmtp;

        return hrtimer_nanosleep(timespec64_to_ktime(req), HRTIMER_MODE_REL,
                                 CLOCK_MONOTONIC);
}

#endif

#ifdef CONFIG_COMPAT_32BIT_TIME

/*
 * nanosleep(2) variant taking the old 32-bit timespec layout: relative
 * sleep on CLOCK_MONOTONIC. Fails with -EFAULT if @rqtp cannot be read and
 * with -EINVAL if the requested time is not a valid timespec64. A non-NULL
 * @rmtp is registered as a compat buffer for the remaining time.
 */
SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
                       struct old_timespec32 __user *, rmtp)
{
        struct restart_block *rb = &current->restart_block;
        struct timespec64 req;

        if (get_old_timespec32(&req, rqtp))
                return -EFAULT;
        if (!timespec64_valid(&req))
                return -EINVAL;

        rb->fn = do_no_restart_syscall;
        rb->nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
        rb->nanosleep.compat_rmtp = rmtp;

        return hrtimer_nanosleep(timespec64_to_ktime(req), HRTIMER_MODE_REL,
                                 CLOCK_MONOTONIC);
}
#endif

/*
 * Functions related to boot-time initialization:
 */
/*
 * Bring the per-CPU hrtimer base of @cpu into its initial state: link every
 * clock base back to the CPU base, initialize its seqcount and timer
 * queue, reset the bookkeeping fields and mark the base online.
 *
 * Always returns 0.
 */
int hrtimers_prepare_cpu(unsigned int cpu)
{
        struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
        int idx;

        for (idx = 0; idx < HRTIMER_MAX_CLOCK_BASES; idx++) {
                struct hrtimer_clock_base *cb = &cpu_base->clock_base[idx];

                cb->cpu_base = cpu_base;
                seqcount_raw_spinlock_init(&cb->seq, &cpu_base->lock);
                timerqueue_init_head(&cb->active);
        }

        cpu_base->cpu = cpu;

        /* Nothing is queued yet, so there is no next expiry of any kind. */
        cpu_base->active_bases = 0;
        cpu_base->next_timer = NULL;
        cpu_base->softirq_next_timer = NULL;
        cpu_base->expires_next = KTIME_MAX;
        cpu_base->softirq_expires_next = KTIME_MAX;

        cpu_base->hres_active = 0;
        cpu_base->hang_detected = 0;
        cpu_base->online = 1;

        hrtimer_cpu_base_init_expiry_lock(cpu_base);
        return 0;
}

#ifdef CONFIG_HOTPLUG_CPU

/*
 * Move every timer queued on @old_base over to @new_base. None of the
 * timers may have its callback running.
 */
static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
                                struct hrtimer_clock_base *new_base)
{
        struct timerqueue_node *head;

        for (head = timerqueue_getnext(&old_base->active); head;
             head = timerqueue_getnext(&old_base->active)) {
                struct hrtimer *timer;

                timer = container_of(head, struct hrtimer, node);
                BUG_ON(hrtimer_callback_running(timer));
                debug_deactivate(timer);

                /*
                 * Keep the state at ENQUEUED rather than INACTIVE: a
                 * timer that looks !active could just vanish away under
                 * us on another CPU.
                 */
                __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
                timer->base = new_base;

                /*
                 * Queue the timer on the new cpu. The event device is not
                 * reprogrammed here even if this timer expires before the
                 * earliest one on this CPU; hrtimer_interrupt is run once
                 * everything has been migrated, which sorts out already
                 * expired timers and reprograms the event device.
                 */
                enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
        }
}

/*
 * CPU hotplug callback: migrate all hrtimers queued on the local CPU base
 * to the base of another CPU, ask that CPU to retrigger its next event and
 * mark the local base offline.
 *
 * @dying_cpu is not referenced in the body; the source base is the base of
 * the CPU this runs on (this_cpu_ptr()). Always returns 0.
 */
int hrtimers_cpu_dying(unsigned int dying_cpu)
{
        /*
         * Target: any CPU that is both active and in the HK_TYPE_TIMER
         * housekeeping mask.
         * NOTE(review): the result is used as a per_cpu() index without a
         * range check - presumably such a CPU always exists here; confirm.
         */
        int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
        struct hrtimer_cpu_base *old_base, *new_base;

        old_base = this_cpu_ptr(&hrtimer_bases);
        new_base = &per_cpu(hrtimer_bases, ncpu);

        /*
         * The caller is globally serialized and nobody else
         * takes two locks at once, deadlock is not possible.
         */
        raw_spin_lock(&old_base->lock);
        raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);

        /* Move the timers clock base by clock base, same index on both sides. */
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                migrate_hrtimer_list(&old_base->clock_base[i],
                                     &new_base->clock_base[i]);
        }

        /*
         * The migration might have changed the first expiring softirq
         * timer on this CPU. Update it.
         */
        __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
        /* Tell the other CPU to retrigger the next event */
        smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);

        raw_spin_unlock(&new_base->lock);
        /* Cleared while old_base->lock is still held. */
        old_base->online = 0;
        raw_spin_unlock(&old_base->lock);

        return 0;
}

#endif /* CONFIG_HOTPLUG_CPU */

/*
 * Init-time setup: prepare the hrtimer base of the CPU this runs on and
 * register hrtimer_run_softirq() as the HRTIMER_SOFTIRQ handler.
 */
void __init hrtimers_init(void)
{
        unsigned int this_cpu = smp_processor_id();

        hrtimers_prepare_cpu(this_cpu);
        open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
}

/**
 * schedule_hrtimeout_range_clock - sleep until timeout
 * @expires:    timeout value (ktime_t); NULL means sleep without a timeout
 * @delta:      slack in expires timeout (u64, handed to
 *              hrtimer_set_expires_range_ns()) for SCHED_OTHER tasks
 * @mode:       timer mode
 * @clock_id:   timer clock to be used
 *
 * Return: 0 when the timeout is zero or the timer has expired, -EINTR when
 * the task was woken up by something else (always the case for a NULL
 * @expires). The task state is TASK_RUNNING on return in every case where
 * a timeout value was supplied.
 */
int __sched
schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
                               const enum hrtimer_mode mode, clockid_t clock_id)
{
        struct hrtimer_sleeper sleeper;

        /*
         * A NULL parameter means "infinite"
         */
        if (!expires) {
                schedule();
                return -EINTR;
        }

        /*
         * Shortcut for a zero timeout value; whether it was meant as an
         * absolute or a relative time makes no difference.
         */
        if (*expires == 0) {
                __set_current_state(TASK_RUNNING);
                return 0;
        }

        /*
         * Override any slack passed by the user if under
         * rt constraints.
         */
        if (rt_task(current))
                delta = 0;

        hrtimer_init_sleeper_on_stack(&sleeper, clock_id, mode);
        hrtimer_set_expires_range_ns(&sleeper.timer, *expires, delta);
        hrtimer_sleeper_start_expires(&sleeper, mode);

        /* The timer callback clears ->task once the timer has fired. */
        if (likely(sleeper.task))
                schedule();

        hrtimer_cancel(&sleeper.timer);
        destroy_hrtimer_on_stack(&sleeper.timer);

        __set_current_state(TASK_RUNNING);

        return sleeper.task ? -EINTR : 0;
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);

/**
 * schedule_hrtimeout_range - sleep until timeout
 * @expires:    timeout value (ktime_t)
 * @delta:      slack in expires timeout (u64) for SCHED_OTHER tasks
 * @mode:       timer mode
 *
 * Put the current task to sleep until the given expiry time has elapsed,
 * using CLOCK_MONOTONIC. The routine returns immediately unless the
 * current task state has been set beforehand (see set_current_state()).
 *
 * @delta lets the kernel place the actual wakeup at a time that is both
 * power and performance friendly for regular (non RT/DL) tasks. The
 * kernel gives the normal best effort behavior for "@expires+@delta", but
 * may decide to fire the timer earlier, though never earlier than
 * @expires.
 *
 * The task state can be set as follows -
 *
 * %TASK_UNINTERRUPTIBLE - at least @expires time is guaranteed to pass
 * before the routine returns, unless the current task is explicitly woken
 * up (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be TASK_RUNNING when this
 * routine returns.
 *
 * Return: 0 when the timer has expired. If the task was woken before the
 * timer expired, either by a signal (only possible in state
 * TASK_INTERRUPTIBLE) or by an explicit wakeup, -EINTR is returned.
 */
int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
                                     const enum hrtimer_mode mode)
{
        const clockid_t clock = CLOCK_MONOTONIC;

        return schedule_hrtimeout_range_clock(expires, delta, mode, clock);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);

/**
 * schedule_hrtimeout - sleep until timeout
 * @expires:    timeout value (ktime_t)
 * @mode:       timer mode
 *
 * Put the current task to sleep until the given expiry time has elapsed,
 * without any slack. The routine returns immediately unless the current
 * task state has been set beforehand (see set_current_state()).
 *
 * The task state can be set as follows -
 *
 * %TASK_UNINTERRUPTIBLE - at least @expires time is guaranteed to pass
 * before the routine returns, unless the current task is explicitly woken
 * up (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be TASK_RUNNING when this
 * routine returns.
 *
 * Return: 0 when the timer has expired. If the task was woken before the
 * timer expired, either by a signal (only possible in state
 * TASK_INTERRUPTIBLE) or by an explicit wakeup, -EINTR is returned.
 */
int __sched schedule_hrtimeout(ktime_t *expires,
                               const enum hrtimer_mode mode)
{
        const u64 no_slack = 0;

        return schedule_hrtimeout_range(expires, no_slack, mode);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout);






































































    1 


    1 







    1 







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 Nokia, Inc.
 *
 * This file is part of the SCTP kernel implementation
 *
 * These are the state tables for the SCTP state machine.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson          <karl@athena.chicago.il.us>
 *    Jon Grimm             <jgrimm@us.ibm.com>
 *    Hui Huang                    <hui.huang@nokia.com>
 *    Daisy Chang            <daisyc@us.ibm.com>
 *    Ardelle Fan            <ardelle.fan@intel.com>
 *    Sridhar Samudrala            <sri@us.ibm.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/skbuff.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

/*
 * Forward declarations of the per-event-type state tables. Each table is
 * indexed as [event subtype][association state] (see DO_LOOKUP below).
 */
static const struct sctp_sm_table_entry
primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES];
static const struct sctp_sm_table_entry
other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES];
static const struct sctp_sm_table_entry
timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES];

/* Chunk events are not looked up via DO_LOOKUP but via this helper. */
static const struct sctp_sm_table_entry *sctp_chunk_event_lookup(
                                                struct net *net,
                                                enum sctp_cid cid,
                                                enum sctp_state state);


/*
 * Fallback entry handed out when a lookup cannot be satisfied: an event
 * subtype beyond the table maximum or an illegal event type.
 */
static const struct sctp_sm_table_entry bug = {
        .fn = sctp_sf_bug,
        .name = "sctp_sf_bug"
};

/*
 * DO_LOOKUP - fetch the state table entry for the current event subtype.
 * @_max:   largest valid value of the subtype
 * @_type:  member of the event_subtype union to use as the row index
 * @_table: two dimensional table indexed [subtype][state]
 *
 * Relies on 'event_subtype' and 'state' being in scope at the point of
 * use. Evaluates to &bug (after a warning) when the subtype exceeds @_max,
 * otherwise to the address of the matching table entry.
 */
#define DO_LOOKUP(_max, _type, _table) \
({ \
        const struct sctp_sm_table_entry *rtn = &bug; \
        if (event_subtype._type > (_max)) \
                pr_warn("table %p possible attack: event %d exceeds max %d\n", \
                        _table, event_subtype._type, _max); \
        else \
                rtn = &_table[event_subtype._type][(int)state]; \
        rtn; \
})

/*
 * Find the state machine table entry for an event.
 *
 * @net:           passed through to the chunk lookup only
 * @event_type:    selects which table is consulted
 * @state:         current state, used as the column index
 * @event_subtype: subtype within @event_type, used as the row index
 *
 * Chunk events go through sctp_chunk_event_lookup(); timeout, other and
 * primitive events are bounds checked and indexed via DO_LOOKUP(). An
 * unknown @event_type yields &bug.
 */
const struct sctp_sm_table_entry *sctp_sm_lookup_event(
                                        struct net *net,
                                        enum sctp_event_type event_type,
                                        enum sctp_state state,
                                        union sctp_subtype event_subtype)
{
        const struct sctp_sm_table_entry *entry;

        switch (event_type) {
        case SCTP_EVENT_T_CHUNK:
                entry = sctp_chunk_event_lookup(net, event_subtype.chunk,
                                                state);
                break;
        case SCTP_EVENT_T_TIMEOUT:
                entry = DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout,
                                  timeout_event_table);
                break;
        case SCTP_EVENT_T_OTHER:
                entry = DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other,
                                  other_event_table);
                break;
        case SCTP_EVENT_T_PRIMITIVE:
                entry = DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive,
                                  primitive_event_table);
                break;
        default:
                /* Yikes!  We got an illegal event type.  */
                entry = &bug;
                break;
        }

        return entry;
}

/*
 * Build one table entry initializer: the handler function plus its own
 * name as a string (stringified from the macro argument).
 */
#define TYPE_SCTP_FUNC(func) {.fn = func, .name = #func}

/* Row of chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_DATA { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_ootb), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_eat_data_6_2), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_eat_data_6_2), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_eat_data_fast_4_4), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
} /* TYPE_SCTP_DATA */

/* Row of chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_INIT { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_do_5_1B_init), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_do_5_2_1_siminit), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_do_5_2_1_siminit), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_do_5_2_2_dupinit), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_do_9_2_reshutack), \
} /* TYPE_SCTP_INIT */

/* Row of chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_INIT_ACK { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_do_5_2_3_initack), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_do_5_1C_ack), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
} /* TYPE_SCTP_INIT_ACK */

/* Row of chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_SACK { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_ootb), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_eat_sack_6_2), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
} /* TYPE_SCTP_SACK */

/* Row of chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_HEARTBEAT { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_ootb), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
        /* This should not happen, but we are nice.  */ \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_beat_8_3), \
} /* TYPE_SCTP_HEARTBEAT */

/* Row of chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_HEARTBEAT_ACK { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_ootb), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_violation), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_backbeat_8_3), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
} /* TYPE_SCTP_HEARTBEAT_ACK */

/* Row of chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_ABORT { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_pdiscard), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_cookie_wait_abort), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_abort), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_9_1_abort), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_shutdown_pending_abort), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_shutdown_sent_abort), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_do_9_1_abort), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_shutdown_ack_sent_abort), \
} /* TYPE_SCTP_ABORT */

/* Row of chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_SHUTDOWN { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_ootb), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_do_9_2_shut_ctsn), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
} /* TYPE_SCTP_SHUTDOWN */

/* Row of chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_SHUTDOWN_ACK { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_ootb), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_do_8_5_1_E_sa), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_do_8_5_1_E_sa), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_violation), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_violation), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_do_9_2_final), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_violation), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_do_9_2_final), \
} /* TYPE_SCTP_SHUTDOWN_ACK */

/* Row of chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_ERROR { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_ootb), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_err), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_operr_notify), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_operr_notify), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_operr_notify), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
} /* TYPE_SCTP_ERROR */

/* Row of chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_COOKIE_ECHO { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_do_5_1D_ce), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_do_5_2_4_dupcook), \
} /* TYPE_SCTP_COOKIE_ECHO */

/* Row of chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_COOKIE_ACK { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_do_5_1E_ca), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
} /* TYPE_SCTP_COOKIE_ACK */

/* Row of chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_ECN_ECNE { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_do_ecne), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_ecne), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_do_ecne), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_do_ecne), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_do_ecne), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
} /* TYPE_SCTP_ECN_ECNE */

/* Row of chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_ECN_CWR { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_do_ecn_cwr), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
} /* TYPE_SCTP_ECN_CWR */

/* Row of chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_SHUTDOWN_COMPLETE { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_do_4_C), \
} /* TYPE_SCTP_SHUTDOWN_COMPLETE */

/* State table for the base protocol (RFC 2960) chunk types.
 *
 * The primary index for this table is the chunk type.
 * The secondary index for this table is the state.
 *
 * Rows are positional: the order of the TYPE_SCTP_* row macros below fixes
 * the primary index each row occupies, so rows must not be reordered.  Each
 * row macro supplies one entry per state.
 */
static const struct sctp_sm_table_entry
chunk_event_table[SCTP_NUM_BASE_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
        TYPE_SCTP_DATA,
        TYPE_SCTP_INIT,
        TYPE_SCTP_INIT_ACK,
        TYPE_SCTP_SACK,
        TYPE_SCTP_HEARTBEAT,
        TYPE_SCTP_HEARTBEAT_ACK,
        TYPE_SCTP_ABORT,
        TYPE_SCTP_SHUTDOWN,
        TYPE_SCTP_SHUTDOWN_ACK,
        TYPE_SCTP_ERROR,
        TYPE_SCTP_COOKIE_ECHO,
        TYPE_SCTP_COOKIE_ACK,
        TYPE_SCTP_ECN_ECNE,
        TYPE_SCTP_ECN_CWR,
        TYPE_SCTP_SHUTDOWN_COMPLETE,
}; /* chunk_event_table[][] */

/* Row of addip_chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_ASCONF { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_asconf), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_do_asconf), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_do_asconf), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_do_asconf), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
} /* TYPE_SCTP_ASCONF */

/* Row of addip_chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_ASCONF_ACK { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_do_asconf_ack), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
} /* TYPE_SCTP_ASCONF_ACK */

/* State table for the ASCONF and ASCONF_ACK row macros.
 *
 * The primary index for this table is the chunk type.
 * The secondary index for this table is the state.
 *
 * Rows are positional: TYPE_SCTP_ASCONF is row 0 and TYPE_SCTP_ASCONF_ACK is
 * row 1.
 * NOTE(review): the code that maps a chunk type to a row index lives in the
 * chunk lookup routine; confirm it there before reordering rows.
 */
static const struct sctp_sm_table_entry
addip_chunk_event_table[SCTP_NUM_ADDIP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
        TYPE_SCTP_ASCONF,
        TYPE_SCTP_ASCONF_ACK,
}; /* addip_chunk_event_table[][] */

/* Row of prsctp_chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_FWD_TSN { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_ootb), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_eat_fwd_tsn_fast), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
} /* TYPE_SCTP_FWD_TSN */

/* State table holding the single TYPE_SCTP_FWD_TSN row.
 *
 * The primary index for this table is the chunk type.
 * The secondary index for this table is the state.
 *
 * NOTE(review): the code that maps a chunk type to a row index lives in the
 * chunk lookup routine; confirm it there before adding rows.
 */
static const struct sctp_sm_table_entry
prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
        TYPE_SCTP_FWD_TSN,
}; /* prsctp_chunk_event_table[][] */

/* Row of reconf_chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_RECONF { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_reconf), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_do_reconf), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
} /* TYPE_SCTP_RECONF */

/* State table holding the single TYPE_SCTP_RECONF row.
 *
 * The primary index for this table is the chunk type.
 * The secondary index for this table is the state.
 *
 * NOTE(review): the code that maps a chunk type to a row index lives in the
 * chunk lookup routine; confirm it there before adding rows.
 */
static const struct sctp_sm_table_entry
reconf_chunk_event_table[SCTP_NUM_RECONF_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
        TYPE_SCTP_RECONF,
}; /* reconf_chunk_event_table[][] */

/* Row of auth_chunk_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_AUTH { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_ootb), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_eat_auth), \
} /* TYPE_SCTP_AUTH */

/* State table holding the single TYPE_SCTP_AUTH row.
 *
 * The primary index for this table is the chunk type.
 * The secondary index for this table is the state.
 *
 * NOTE(review): the code that maps a chunk type to a row index lives in the
 * chunk lookup routine; confirm it there before adding rows.
 */
static const struct sctp_sm_table_entry
auth_chunk_event_table[SCTP_NUM_AUTH_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = {
        TYPE_SCTP_AUTH,
}; /* auth_chunk_event_table[][] */

static const struct sctp_sm_table_entry
pad_chunk_event_table[SCTP_STATE_NUM_STATES] = {
        /* SCTP_STATE_CLOSED */
        TYPE_SCTP_FUNC(sctp_sf_discard_chunk),
        /* SCTP_STATE_COOKIE_WAIT */
        TYPE_SCTP_FUNC(sctp_sf_discard_chunk),
        /* SCTP_STATE_COOKIE_ECHOED */
        TYPE_SCTP_FUNC(sctp_sf_discard_chunk),
        /* SCTP_STATE_ESTABLISHED */
        TYPE_SCTP_FUNC(sctp_sf_discard_chunk),
        /* SCTP_STATE_SHUTDOWN_PENDING */
        TYPE_SCTP_FUNC(sctp_sf_discard_chunk),
        /* SCTP_STATE_SHUTDOWN_SENT */
        TYPE_SCTP_FUNC(sctp_sf_discard_chunk),
        /* SCTP_STATE_SHUTDOWN_RECEIVED */
        TYPE_SCTP_FUNC(sctp_sf_discard_chunk),
        /* SCTP_STATE_SHUTDOWN_ACK_SENT */
        TYPE_SCTP_FUNC(sctp_sf_discard_chunk),
};        /* chunk pad */

static const struct sctp_sm_table_entry
chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = {
        /* SCTP_STATE_CLOSED */
        TYPE_SCTP_FUNC(sctp_sf_ootb),
        /* SCTP_STATE_COOKIE_WAIT */
        TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
        /* SCTP_STATE_COOKIE_ECHOED */
        TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
        /* SCTP_STATE_ESTABLISHED */
        TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
        /* SCTP_STATE_SHUTDOWN_PENDING */
        TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
        /* SCTP_STATE_SHUTDOWN_SENT */
        TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
        /* SCTP_STATE_SHUTDOWN_RECEIVED */
        TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
        /* SCTP_STATE_SHUTDOWN_ACK_SENT */
        TYPE_SCTP_FUNC(sctp_sf_unk_chunk),
};        /* chunk unknown */


/* Row of primitive_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_PRIMITIVE_ASSOCIATE { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_do_prm_asoc), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_not_impl), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_not_impl), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_not_impl), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_not_impl), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_not_impl), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_not_impl), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_not_impl), \
} /* TYPE_SCTP_PRIMITIVE_ASSOCIATE */

/* Row of primitive_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_PRIMITIVE_SHUTDOWN { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_error_closed), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_cookie_wait_prm_shutdown), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_prm_shutdown), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_9_2_prm_shutdown), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_ignore_primitive), \
} /* TYPE_SCTP_PRIMITIVE_SHUTDOWN */

/* Row of primitive_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_PRIMITIVE_ABORT { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_error_closed), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_cookie_wait_prm_abort), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_cookie_echoed_prm_abort), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_9_1_prm_abort), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_shutdown_pending_prm_abort), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_shutdown_sent_prm_abort), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_do_9_1_prm_abort), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_shutdown_ack_sent_prm_abort), \
} /* TYPE_SCTP_PRIMITIVE_ABORT */

/* Row of primitive_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_PRIMITIVE_SEND { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_error_closed), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_prm_send), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
} /* TYPE_SCTP_PRIMITIVE_SEND */

/* Row of primitive_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_error_closed), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_do_prm_requestheartbeat), \
} /* TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT */

/* Row of primitive_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_PRIMITIVE_ASCONF { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_error_closed), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_error_closed), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_error_closed), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_do_prm_asconf), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
} /* TYPE_SCTP_PRIMITIVE_ASCONF */

/* Row of primitive_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_PRIMITIVE_RECONF { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_error_closed), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_error_closed), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_error_closed), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
} /* TYPE_SCTP_PRIMITIVE_RECONF */

/* State table for primitive events.
 *
 * The primary index for this table is the primitive type.
 * The secondary index for this table is the state.
 *
 * sctp_sm_lookup_event() indexes this table through DO_LOOKUP() with
 * event_subtype.primitive after checking it against
 * SCTP_EVENT_PRIMITIVE_MAX.  Rows are positional, so the order of the row
 * macros below fixes the primitive value each row serves.
 */
static const struct sctp_sm_table_entry
primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES] = {
        TYPE_SCTP_PRIMITIVE_ASSOCIATE,
        TYPE_SCTP_PRIMITIVE_SHUTDOWN,
        TYPE_SCTP_PRIMITIVE_ABORT,
        TYPE_SCTP_PRIMITIVE_SEND,
        TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT,
        TYPE_SCTP_PRIMITIVE_ASCONF,
        TYPE_SCTP_PRIMITIVE_RECONF,
};

/* Row of other_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_OTHER_NO_PENDING_TSN { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_no_pending_tsn), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_do_9_2_start_shutdown), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
}

/* Row of other_event_table: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_cookie_wait_icmp_abort), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_ignore_other), \
}

/* State table for "other" events.
 *
 * sctp_sm_lookup_event() indexes this table through DO_LOOKUP() with
 * event_subtype.other (checked against SCTP_EVENT_OTHER_MAX) as the primary
 * index and the state as the secondary index.  Rows are positional, so the
 * order of the row macros below fixes the subtype value each row serves.
 */
static const struct sctp_sm_table_entry
other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES] = {
        TYPE_SCTP_OTHER_NO_PENDING_TSN,
        TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH,
};

/* Per-state row initializer: the state function to run in each state.
 * Every state maps to sctp_sf_bug.  Entries are keyed by enum sctp_state
 * designator.
 */
#define TYPE_SCTP_EVENT_TIMEOUT_NONE { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_bug), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_bug), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_bug), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_bug), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_bug), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_bug), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_bug), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_bug), \
}

/* Per-state row initializer: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_bug), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_t1_cookie_timer_expire), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
}

/* Per-state row initializer: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_EVENT_TIMEOUT_T1_INIT { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_t1_init_timer_expire), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
}

/* Per-state row initializer: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_t2_timer_expire), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_t2_timer_expire), \
}

/* Per-state row initializer: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_EVENT_TIMEOUT_T3_RTX { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_do_6_3_3_rtx), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
}

/* Per-state row initializer: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_EVENT_TIMEOUT_T4_RTO { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_t4_timer_expire), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
}

/* Per-state row initializer: the state function to run in each state.
 * Entries are keyed by enum sctp_state designator.
 */
#define TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD { \
        [SCTP_STATE_CLOSED]            = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_COOKIE_WAIT]       = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_COOKIE_ECHOED]     = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_ESTABLISHED]       = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_PENDING]  = TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \
        [SCTP_STATE_SHUTDOWN_SENT]     = TYPE_SCTP_FUNC(sctp_sf_t5_timer_expire), \
        [SCTP_STATE_SHUTDOWN_RECEIVED] = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        [SCTP_STATE_SHUTDOWN_ACK_SENT] = TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
}

/*
 * State handlers for the heartbeat timeout: one TYPE_SCTP_FUNC() entry
 * per state, positionally ordered as named by the inline comments.
 * SCTP_STATE_ESTABLISHED, SCTP_STATE_SHUTDOWN_PENDING and
 * SCTP_STATE_SHUTDOWN_RECEIVED run sctp_sf_sendbeat_8_3; all other
 * states ignore the expiry.  Used as one row of timeout_event_table.
 */
#define TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT { \
        /* SCTP_STATE_CLOSED */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_COOKIE_WAIT */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_COOKIE_ECHOED */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_ESTABLISHED */ \
        TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \
        /* SCTP_STATE_SHUTDOWN_PENDING */ \
        TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \
        /* SCTP_STATE_SHUTDOWN_SENT */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
        TYPE_SCTP_FUNC(sctp_sf_sendbeat_8_3), \
        /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
}

/*
 * State handlers for the SACK timeout: one TYPE_SCTP_FUNC() entry per
 * state, positionally ordered as named by the inline comments.
 * SCTP_STATE_ESTABLISHED, SCTP_STATE_SHUTDOWN_PENDING and
 * SCTP_STATE_SHUTDOWN_SENT run sctp_sf_do_6_2_sack; all other states
 * ignore the expiry.  Used as one row of timeout_event_table.
 */
#define TYPE_SCTP_EVENT_TIMEOUT_SACK { \
        /* SCTP_STATE_CLOSED */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_COOKIE_WAIT */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_COOKIE_ECHOED */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_ESTABLISHED */ \
        TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \
        /* SCTP_STATE_SHUTDOWN_PENDING */ \
        TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \
        /* SCTP_STATE_SHUTDOWN_SENT */ \
        TYPE_SCTP_FUNC(sctp_sf_do_6_2_sack), \
        /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
}

/*
 * State handlers for the autoclose timeout: one TYPE_SCTP_FUNC() entry
 * per state, positionally ordered as named by the inline comments.  Only
 * SCTP_STATE_ESTABLISHED reacts (sctp_sf_autoclose_timer_expire); every
 * other state ignores the expiry.  Used as one row of timeout_event_table.
 */
#define TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE { \
        /* SCTP_STATE_CLOSED */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_COOKIE_WAIT */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_COOKIE_ECHOED */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_ESTABLISHED */ \
        TYPE_SCTP_FUNC(sctp_sf_autoclose_timer_expire), \
        /* SCTP_STATE_SHUTDOWN_PENDING */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_SHUTDOWN_SENT */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
}

/*
 * State handlers for the RECONF timeout: one TYPE_SCTP_FUNC() entry per
 * state, positionally ordered as named by the inline comments.  Only
 * SCTP_STATE_ESTABLISHED reacts (sctp_sf_send_reconf); every other state
 * ignores the expiry.  Used as one row of timeout_event_table.
 */
#define TYPE_SCTP_EVENT_TIMEOUT_RECONF { \
        /* SCTP_STATE_CLOSED */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_COOKIE_WAIT */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_COOKIE_ECHOED */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_ESTABLISHED */ \
        TYPE_SCTP_FUNC(sctp_sf_send_reconf), \
        /* SCTP_STATE_SHUTDOWN_PENDING */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_SHUTDOWN_SENT */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
}

/*
 * State handlers for the PROBE timeout: one TYPE_SCTP_FUNC() entry per
 * state, positionally ordered as named by the inline comments.  Only
 * SCTP_STATE_ESTABLISHED reacts (sctp_sf_send_probe); every other state
 * ignores the expiry.  Used as one row of timeout_event_table.
 */
#define TYPE_SCTP_EVENT_TIMEOUT_PROBE { \
        /* SCTP_STATE_CLOSED */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_COOKIE_WAIT */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_COOKIE_ECHOED */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_ESTABLISHED */ \
        TYPE_SCTP_FUNC(sctp_sf_send_probe), \
        /* SCTP_STATE_SHUTDOWN_PENDING */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_SHUTDOWN_SENT */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
        /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
        TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \
}

/*
 * Timeout dispatch table: first index is the timeout type, second index
 * is the state.  Each row is one of the TYPE_SCTP_EVENT_TIMEOUT_* brace
 * initializers.  Rows are positional, so their order here has to follow
 * the numeric order of the timeout type constants.
 * NOTE(review): the timeout type enum is not visible in this chunk;
 * confirm the row order against it before reordering or adding rows.
 */
static const struct sctp_sm_table_entry
timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = {
        TYPE_SCTP_EVENT_TIMEOUT_NONE,
        TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE,
        TYPE_SCTP_EVENT_TIMEOUT_T1_INIT,
        TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN,
        TYPE_SCTP_EVENT_TIMEOUT_T3_RTX,
        TYPE_SCTP_EVENT_TIMEOUT_T4_RTO,
        TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD,
        TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT,
        TYPE_SCTP_EVENT_TIMEOUT_RECONF,
        TYPE_SCTP_EVENT_TIMEOUT_PROBE,
        TYPE_SCTP_EVENT_TIMEOUT_SACK,
        TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE,
};

/*
 * Find the state-table entry for chunk type @cid in state @state.
 *
 * - A @state above SCTP_STATE_MAX yields the shared "bug" entry.
 * - SCTP_CID_I_DATA is looked up as SCTP_CID_DATA.
 * - Chunk types up to SCTP_CID_BASE_MAX index chunk_event_table directly.
 * - The extension chunk types listed in the switch each have their own
 *   table; anything else falls back to chunk_event_table_unknown.
 *
 * @net is not used by this function.
 */
static const struct sctp_sm_table_entry *sctp_chunk_event_lookup(
                                                struct net *net,
                                                enum sctp_cid cid,
                                                enum sctp_state state)
{
        const struct sctp_sm_table_entry *entry;

        if (state > SCTP_STATE_MAX)
                return &bug;

        /* I-DATA shares the DATA row of the base table */
        if (cid == SCTP_CID_I_DATA)
                cid = SCTP_CID_DATA;

        if (cid <= SCTP_CID_BASE_MAX)
                return &chunk_event_table[cid][state];

        switch ((u16)cid) {
        case SCTP_CID_FWD_TSN:
        case SCTP_CID_I_FWD_TSN:
                entry = &prsctp_chunk_event_table[0][state];
                break;

        case SCTP_CID_ASCONF:
                entry = &addip_chunk_event_table[0][state];
                break;

        case SCTP_CID_ASCONF_ACK:
                entry = &addip_chunk_event_table[1][state];
                break;

        case SCTP_CID_RECONF:
                entry = &reconf_chunk_event_table[0][state];
                break;

        case SCTP_CID_AUTH:
                entry = &auth_chunk_event_table[0][state];
                break;

        case SCTP_CID_PAD:
                entry = &pad_chunk_event_table[state];
                break;

        default:
                entry = &chunk_event_table_unknown[state];
                break;
        }

        return entry;
}


































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_GENERIC_NETLINK_H
#define __NET_GENERIC_NETLINK_H

#include <linux/net.h>
#include <net/netlink.h>
#include <net/net_namespace.h>
#include <uapi/linux/genetlink.h>

/* NLMSG_DEFAULT_SIZE less the room taken by the generic netlink header */
#define GENLMSG_DEFAULT_SIZE (NLMSG_DEFAULT_SIZE - GENL_HDRLEN)

/* Non-parallel generic netlink requests are serialized by a global lock. */
void genl_lock(void);
void genl_unlock(void);

/*
 * Declare a module alias for a generic netlink family.  @family must be a
 * string literal: it is pasted after "-family-" by literal concatenation.
 */
#define MODULE_ALIAS_GENL_FAMILY(family) \
 MODULE_ALIAS_NET_PF_PROTO_NAME(PF_NETLINK, NETLINK_GENERIC, "-family-" family)

/* Binding to multicast group requires %CAP_NET_ADMIN */
#define GENL_MCAST_CAP_NET_ADMIN        BIT(0)
/* Binding to multicast group requires %CAP_SYS_ADMIN */
#define GENL_MCAST_CAP_SYS_ADMIN        BIT(1)

/**
 * struct genl_multicast_group - generic netlink multicast group
 * @name: name of the multicast group, names are per-family;
 *        stored inline in a buffer of GENL_NAMSIZ bytes
 * @flags: GENL_MCAST_* flags (%GENL_MCAST_CAP_NET_ADMIN,
 *        %GENL_MCAST_CAP_SYS_ADMIN)
 */
struct genl_multicast_group {
        char                        name[GENL_NAMSIZ];
        u8                        flags;
};

struct genl_split_ops;
struct genl_info;

/**
 * struct genl_family - generic netlink family
 * @hdrsize: length of user specific header in bytes
 * @name: name of family
 * @version: protocol version
 * @maxattr: maximum number of attributes supported
 * @policy: netlink policy
 * @netnsok: set to true if the family can handle network
 *        namespaces and should be presented in all of them
 * @parallel_ops: operations can be called in parallel and aren't
 *        synchronized by the core genetlink code
 * @pre_doit: called before an operation's doit callback, it may
 *        do additional, common, filtering and return an error
 * @post_doit: called after an operation's doit callback, it may
 *        undo operations done by pre_doit, for example release locks
 * @bind: called when family multicast group is added to a netlink socket
 * @unbind: called when family multicast group is removed from a netlink socket
 * @module: pointer to the owning module (set to THIS_MODULE)
 * @mcgrps: multicast groups used by this family
 * @n_mcgrps: number of multicast groups
 * @resv_start_op: first operation for which reserved fields of the header
 *        can be validated and policies are required (see below);
 *        new families should leave this field at zero
 * @ops: the operations supported by this family
 * @n_ops: number of operations supported by this family
 * @small_ops: the small-struct operations supported by this family
 * @n_small_ops: number of small-struct operations supported by this family
 * @split_ops: the split do/dump form of operation definition
 * @n_split_ops: number of entries in @split_ops, note that with split do/dump
 *        ops the number of entries is not the same as number of commands
 * @sock_priv_size: the size of per-socket private memory
 * @sock_priv_init: the per-socket private memory initializer
 * @sock_priv_destroy: the per-socket private memory destructor
 *
 * Attribute policies (the combination of @policy and @maxattr fields)
 * can be attached at the family level or at the operation level.
 * If both are present the per-operation policy takes precedence.
 * For operations before @resv_start_op lack of policy means that the core
 * will perform no attribute parsing or validation. For newer operations
 * if policy is not provided core will reject all TLV attributes.
 *
 * The @n_ops, @n_small_ops, @n_split_ops and @n_mcgrps counters are u8,
 * so each of the corresponding arrays can describe at most 255 entries.
 */
struct genl_family {
        unsigned int                hdrsize;
        char                        name[GENL_NAMSIZ];
        unsigned int                version;
        unsigned int                maxattr;
        u8                        netnsok:1;
        u8                        parallel_ops:1;
        u8                        n_ops;
        u8                        n_small_ops;
        u8                        n_split_ops;
        u8                        n_mcgrps;
        u8                        resv_start_op;
        const struct nla_policy *policy;
        int                        (*pre_doit)(const struct genl_split_ops *ops,
                                            struct sk_buff *skb,
                                            struct genl_info *info);
        void                        (*post_doit)(const struct genl_split_ops *ops,
                                             struct sk_buff *skb,
                                             struct genl_info *info);
        int                        (*bind)(int mcgrp);
        void                        (*unbind)(int mcgrp);
        const struct genl_ops *        ops;
        const struct genl_small_ops *small_ops;
        const struct genl_split_ops *split_ops;
        const struct genl_multicast_group *mcgrps;
        struct module                *module;

        size_t                        sock_priv_size;
        void                        (*sock_priv_init)(void *priv);
        void                        (*sock_priv_destroy)(void *priv);

/* private: internal use only */
        /* protocol family identifier */
        int                        id;
        /* starting number of multicast group IDs in this family */
        unsigned int                mcgrp_offset;
        /* list of per-socket privs */
        struct xarray                *sock_privs;
};

/**
 * struct genl_info - receiving information
 * @snd_seq: sending sequence number
 * @snd_portid: netlink portid of sender
 * @family: generic netlink family
 * @nlhdr: netlink message header; left NULL by genl_info_init_ntf(),
 *        which is what genl_info_is_ntf() tests for
 * @genlhdr: generic netlink message header
 * @attrs: netlink attributes
 * @_net: network namespace, access via genl_info_net() and
 *        genl_info_net_set()
 * @user_ptr: user pointers; genl_info_init_ntf() reuses the storage of
 *        user_ptr[0] to hold the header that @genlhdr points at
 * @extack: extended ACK report struct
 */
struct genl_info {
        u32                        snd_seq;
        u32                        snd_portid;
        const struct genl_family *family;
        const struct nlmsghdr *        nlhdr;
        struct genlmsghdr *        genlhdr;
        struct nlattr **        attrs;
        possible_net_t                _net;
        void *                        user_ptr[2];
        struct netlink_ext_ack *extack;
};

/**
 * genl_info_net - read the network namespace stored in a genl_info
 * @info: genl_info to query
 *
 * Returns whatever read_pnet() yields for the @_net member of @info.
 */
static inline struct net *genl_info_net(const struct genl_info *info)
{
        struct net *net = read_pnet(&info->_net);

        return net;
}

/**
 * genl_info_net_set - store a network namespace in a genl_info
 * @info: genl_info to update
 * @net: network namespace to record
 *
 * Counterpart of genl_info_net(); writes @net into the @_net member of
 * @info through write_pnet().
 */
static inline void genl_info_net_set(struct genl_info *info, struct net *net)
{
        write_pnet(&(info->_net), net);
}

static inline void *genl_info_userhdr(const struct genl_info *info)
{
        return (u8 *)info->genlhdr + GENL_HDRLEN;
}

/* Forward @msg to NL_SET_ERR_MSG() using the extack of @info */
#define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg)

/* Formatted variant: forwards @msg and @args to NL_SET_ERR_MSG_FMT() */
#define GENL_SET_ERR_MSG_FMT(info, msg, args...) \
        NL_SET_ERR_MSG_FMT((info)->extack, msg, ##args)

/*
 * Report that a root attribute is missing.
 * @info is evaluated once (cached in __info); the check itself is done by
 * NL_REQ_ATTR_CHECK() on the request's extack and attrs, with NULL passed
 * as its second argument, and its value is the value of this expression.
 */
#define GENL_REQ_ATTR_CHECK(info, attr) ({                                \
        const struct genl_info *__info = (info);                        \
                                                                        \
        NL_REQ_ATTR_CHECK(__info->extack, NULL, __info->attrs, (attr)); \
})

/*
 * Bit flags for the @validate member of struct genl_ops,
 * struct genl_small_ops and struct genl_split_ops.
 * NOTE(review): the names suggest each bit switches off one kind of
 * validation; the code acting on them is not in this header - confirm
 * against the genetlink core before relying on exact semantics.
 */
enum genl_validate_flags {
        GENL_DONT_VALIDATE_STRICT                = BIT(0),
        GENL_DONT_VALIDATE_DUMP                        = BIT(1),
        GENL_DONT_VALIDATE_DUMP_STRICT                = BIT(2),
};

/**
 * struct genl_small_ops - generic netlink operations (small version)
 * @cmd: command identifier
 * @internal_flags: flags used by the family
 * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM)
 * @validate: validation flags from enum genl_validate_flags
 * @doit: standard command callback
 * @dumpit: callback for dumpers
 *
 * This is a cut-down version of struct genl_ops for users who don't need
 * most of the ancillary infra and want to save space. Compared with
 * struct genl_ops it has no @start/@done dump callbacks and no
 * per-operation @policy/@maxattr.
 */
struct genl_small_ops {
        int        (*doit)(struct sk_buff *skb, struct genl_info *info);
        int        (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb);
        u8        cmd;
        u8        internal_flags;
        u8        flags;
        u8        validate;
};

/**
 * struct genl_ops - generic netlink operations
 * @cmd: command identifier
 * @internal_flags: flags used by the family
 * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM)
 * @maxattr: maximum number of attributes supported
 * @policy: netlink policy (takes precedence over family policy)
 * @validate: validation flags from enum genl_validate_flags
 * @doit: standard command callback
 * @start: start callback for dumps
 * @dumpit: callback for dumpers
 * @done: completion callback for dumps
 *
 * A single entry carries both the do callback and the dump callbacks of
 * a command; see struct genl_split_ops for the form that separates them
 * and struct genl_small_ops for the reduced form.
 */
struct genl_ops {
        int                       (*doit)(struct sk_buff *skb,
                                       struct genl_info *info);
        int                       (*start)(struct netlink_callback *cb);
        int                       (*dumpit)(struct sk_buff *skb,
                                         struct netlink_callback *cb);
        int                       (*done)(struct netlink_callback *cb);
        const struct nla_policy *policy;
        unsigned int                maxattr;
        u8                        cmd;
        u8                        internal_flags;
        u8                        flags;
        u8                        validate;
};

/**
 * struct genl_split_ops - generic netlink operations (do/dump split version)
 * @cmd: command identifier
 * @internal_flags: flags used by the family
 * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM)
 * @validate: validation flags from enum genl_validate_flags
 * @policy: netlink policy (takes precedence over family policy)
 * @maxattr: maximum number of attributes supported
 *
 * Do callbacks:
 * @pre_doit: called before an operation's @doit callback, it may
 *        do additional, common, filtering and return an error
 * @doit: standard command callback
 * @post_doit: called after an operation's @doit callback, it may
 *        undo operations done by pre_doit, for example release locks
 *
 * Dump callbacks:
 * @start: start callback for dumps
 * @dumpit: callback for dumpers
 * @done: completion callback for dumps
 *
 * Do callbacks can be used if %GENL_CMD_CAP_DO is set in @flags.
 * Dump callbacks can be used if %GENL_CMD_CAP_DUMP is set in @flags.
 * Exactly one of those flags must be set.
 */
struct genl_split_ops {
        /*
         * The do and dump callback sets share storage, so one entry can
         * only describe one of the two; @flags tells which set is valid.
         */
        union {
                struct {
                        int (*pre_doit)(const struct genl_split_ops *ops,
                                        struct sk_buff *skb,
                                        struct genl_info *info);
                        int (*doit)(struct sk_buff *skb,
                                    struct genl_info *info);
                        void (*post_doit)(const struct genl_split_ops *ops,
                                          struct sk_buff *skb,
                                          struct genl_info *info);
                };
                struct {
                        int (*start)(struct netlink_callback *cb);
                        int (*dumpit)(struct sk_buff *skb,
                                      struct netlink_callback *cb);
                        int (*done)(struct netlink_callback *cb);
                };
        };
        const struct nla_policy *policy;
        unsigned int                maxattr;
        u8                        cmd;
        u8                        internal_flags;
        u8                        flags;
        u8                        validate;
};

/**
 * struct genl_dumpit_info - info that is available during dumpit op call
 * @op: generic netlink ops - for internal genl code usage
 * @info: struct genl_info describing the request; the request's
 *        attributes are reached through its @attrs member
 */
struct genl_dumpit_info {
        struct genl_split_ops op;
        struct genl_info info;
};

/**
 * genl_dumpit_info - fetch the dump info attached to a netlink callback
 * @cb: netlink callback of the running dump
 *
 * Returns @cb->data interpreted as a struct genl_dumpit_info pointer.
 */
static inline const struct genl_dumpit_info *
genl_dumpit_info(struct netlink_callback *cb)
{
        const struct genl_dumpit_info *dump_info = cb->data;

        return dump_info;
}

static inline const struct genl_info *
genl_info_dump(struct netlink_callback *cb)
{
        return &genl_dumpit_info(cb)->info;
}

/**
 * genl_info_init_ntf() - initialize genl_info for notifications
 * @info:   genl_info struct to set up
 * @family: pointer to the genetlink family
 * @cmd:    command to be used in the notification
 *
 * Prepares a locally declared struct genl_info so it can be handed to
 * APIs that expect one, for use when building notifications.  The whole
 * struct is zeroed, @family is recorded, and @info->genlhdr is pointed at
 * the storage of @info->user_ptr[0], where @cmd is then written.
 */
static inline void
genl_info_init_ntf(struct genl_info *info, const struct genl_family *family,
                   u8 cmd)
{
        memset(info, 0, sizeof(*info));

        info->family = family;
        /* no real message exists: borrow user_ptr[0] as header storage */
        info->genlhdr = (void *) &info->user_ptr[0];
        info->genlhdr->cmd = cmd;
}

/**
 * genl_info_is_ntf - test whether a genl_info describes a notification
 * @info: genl_info to test
 *
 * Returns true when @info has no netlink message header attached, which
 * is the state genl_info_init_ntf() leaves it in.
 */
static inline bool genl_info_is_ntf(const struct genl_info *info)
{
        return info->nlhdr == NULL;
}

/*
 * Out-of-line genetlink API; the implementations are not part of this
 * header.  genlmsg_put() starts a message in @skb and its return value is
 * the "user header" pointer that genlmsg_nlhdr(), genlmsg_end() and
 * genlmsg_cancel() expect.
 */
void *__genl_sk_priv_get(struct genl_family *family, struct sock *sk);
void *genl_sk_priv_get(struct genl_family *family, struct sock *sk);
int genl_register_family(struct genl_family *family);
int genl_unregister_family(const struct genl_family *family);
void genl_notify(const struct genl_family *family, struct sk_buff *skb,
                 struct genl_info *info, u32 group, gfp_t flags);

void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
                  const struct genl_family *family, int flags, u8 cmd);

static inline void *
__genlmsg_iput(struct sk_buff *skb, const struct genl_info *info, int flags)
{
        return genlmsg_put(skb, info->snd_portid, info->snd_seq, info->family,
                           flags, info->genlhdr->cmd);
}

/**
 * genlmsg_iput - start genetlink message based on genl_info
 * @skb: skb in which message header will be placed
 * @info: genl_info as provided to do/dump handlers
 *
 * Convenience wrapper around __genlmsg_iput() with no netlink flags.
 * @info is either the struct the genetlink core passed to a do/dump
 * handler (when building the reply to that request) or one prepared with
 * genl_info_init_ntf() (when building a notification).
 *
 * Returns pointer to new genetlink header.
 */
static inline void *
genlmsg_iput(struct sk_buff *skb, const struct genl_info *info)
{
        /* flags == 0 */
        return __genlmsg_iput(skb, info, 0);
}

/**
 * genlmsg_nlhdr - Obtain netlink header from user specified header
 * @user_hdr: user header as returned from genlmsg_put()
 *
 * Steps back over the generic netlink header (GENL_HDRLEN) and then the
 * netlink header (NLMSG_HDRLEN).
 *
 * Returns pointer to netlink header.
 */
static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr)
{
        char *genl_start = (char *)user_hdr - GENL_HDRLEN;

        return (struct nlmsghdr *)(genl_start - NLMSG_HDRLEN);
}

/**
 * genlmsg_parse_deprecated - parse attributes of a genetlink message
 * @nlh: netlink message header
 * @family: genetlink message family
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Same as genlmsg_parse() except that NL_VALIDATE_LIBERAL is requested.
 * Attributes are expected after the generic netlink header plus the
 * family's user header (@family->hdrsize).
 *
 * Returns the result of __nlmsg_parse().
 */
static inline int genlmsg_parse_deprecated(const struct nlmsghdr *nlh,
                                           const struct genl_family *family,
                                           struct nlattr *tb[], int maxtype,
                                           const struct nla_policy *policy,
                                           struct netlink_ext_ack *extack)
{
        int hdrlen = family->hdrsize + GENL_HDRLEN;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy,
                             NL_VALIDATE_LIBERAL, extack);
}

/**
 * genlmsg_parse - parse attributes of a genetlink message
 * @nlh: netlink message header
 * @family: genetlink message family
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Parses with NL_VALIDATE_STRICT.  Attributes are expected after the
 * generic netlink header plus the family's user header
 * (@family->hdrsize).
 *
 * Returns the result of __nlmsg_parse().
 */
static inline int genlmsg_parse(const struct nlmsghdr *nlh,
                                const struct genl_family *family,
                                struct nlattr *tb[], int maxtype,
                                const struct nla_policy *policy,
                                struct netlink_ext_ack *extack)
{
        int hdrlen = family->hdrsize + GENL_HDRLEN;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy,
                             NL_VALIDATE_STRICT, extack);
}

/**
 * genl_dump_check_consistent - check if sequence is consistent and advertise if not
 * @cb: netlink callback structure that stores the sequence number
 * @user_hdr: user header as returned from genlmsg_put()
 *
 * Thin wrapper around nl_dump_check_consistent() that derives the netlink
 * header from @user_hdr, so generic netlink users need not do it.
 */
static inline void genl_dump_check_consistent(struct netlink_callback *cb,
                                              void *user_hdr)
{
        struct nlmsghdr *nlh = genlmsg_nlhdr(user_hdr);

        nl_dump_check_consistent(cb, nlh);
}

/**
 * genlmsg_put_reply - Add generic netlink header to a reply message
 * @skb: socket buffer holding the message
 * @info: receiver info; supplies the portid and sequence number
 * @family: generic netlink family
 * @flags: netlink message flags
 * @cmd: generic netlink command
 *
 * Returns pointer to user specific header
 */
static inline void *genlmsg_put_reply(struct sk_buff *skb,
                                      struct genl_info *info,
                                      const struct genl_family *family,
                                      int flags, u8 cmd)
{
        u32 portid = info->snd_portid;
        u32 seq = info->snd_seq;

        return genlmsg_put(skb, portid, seq, family, flags, cmd);
}

/**
 * genlmsg_end - Finalize a generic netlink message
 * @skb: socket buffer the message is stored in
 * @hdr: user specific header
 *
 * Steps back from @hdr to the netlink header and passes it to
 * nlmsg_end().
 */
static inline void genlmsg_end(struct sk_buff *skb, void *hdr)
{
        void *nlh = hdr - GENL_HDRLEN - NLMSG_HDRLEN;

        nlmsg_end(skb, nlh);
}

/**
 * genlmsg_cancel - Cancel construction of a generic netlink message
 * @skb: socket buffer the message is stored in
 * @hdr: user specific header; may be NULL, in which case nothing is done
 *
 * Steps back from @hdr to the netlink header and passes it to
 * nlmsg_cancel().
 */
static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr)
{
        if (!hdr)
                return;

        nlmsg_cancel(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN);
}

/**
 * genlmsg_multicast_netns_filtered - multicast a netlink message
 *                                      to a specific netns with filter
 *                                      function
 * @family: the generic netlink family
 * @net: the net namespace
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: offset of multicast group in groups array
 * @flags: allocation flags
 * @filter: filter function
 * @filter_data: filter function private data
 *
 * @group is family-relative; it is translated by adding the family's
 * @mcgrp_offset before the message is handed to
 * nlmsg_multicast_filtered().  An out-of-range @group triggers a one-time
 * warning.
 *
 * Return: 0 on success, negative error code for failure.
 */
static inline int
genlmsg_multicast_netns_filtered(const struct genl_family *family,
                                 struct net *net, struct sk_buff *skb,
                                 u32 portid, unsigned int group, gfp_t flags,
                                 netlink_filter_fn filter,
                                 void *filter_data)
{
        unsigned int nl_group;

        if (WARN_ON_ONCE(group >= family->n_mcgrps))
                return -EINVAL;

        nl_group = family->mcgrp_offset + group;

        return nlmsg_multicast_filtered(net->genl_sock, skb, portid, nl_group,
                                        flags, filter, filter_data);
}

/**
 * genlmsg_multicast_netns - multicast a netlink message to a specific netns
 * @family: the generic netlink family
 * @net: the net namespace
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: offset of multicast group in groups array
 * @flags: allocation flags
 *
 * Unfiltered form of genlmsg_multicast_netns_filtered(): no filter
 * function and no filter data are supplied.
 */
static inline int genlmsg_multicast_netns(const struct genl_family *family,
                                          struct net *net, struct sk_buff *skb,
                                          u32 portid, unsigned int group, gfp_t flags)
{
        int rc;

        rc = genlmsg_multicast_netns_filtered(family, net, skb, portid,
                                              group, flags, NULL, NULL);
        return rc;
}

/**
 * genlmsg_multicast - multicast a netlink message to the default netns
 * @family: the generic netlink family
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: offset of multicast group in groups array
 * @flags: allocation flags
 *
 * Calls genlmsg_multicast_netns() with &init_net as the namespace.
 */
static inline int genlmsg_multicast(const struct genl_family *family,
                                    struct sk_buff *skb, u32 portid,
                                    unsigned int group, gfp_t flags)
{
        struct net *net = &init_net;

        return genlmsg_multicast_netns(family, net, skb, portid, group, flags);
}

/**
 * genlmsg_multicast_allns - multicast a netlink message to all net namespaces
 * @family: the generic netlink family
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: offset of multicast group in groups array
 * @flags: allocation flags
 *
 * The caller must hold the RTNL or rcu_read_lock().
 */
int genlmsg_multicast_allns(const struct genl_family *family,
                            struct sk_buff *skb, u32 portid,
                            unsigned int group, gfp_t flags);

/**
 * genlmsg_unicast - unicast a netlink message
 * @net: network namespace to look up @portid in
 * @skb: netlink message as socket buffer
 * @portid: netlink portid of the destination socket
 *
 * Sends through the generic netlink socket of @net.
 * Returns the result of nlmsg_unicast().
 */
static inline int genlmsg_unicast(struct net *net, struct sk_buff *skb, u32 portid)
{
        int rc = nlmsg_unicast(net->genl_sock, skb, portid);

        return rc;
}

/**
 * genlmsg_reply - reply to a request
 * @skb: netlink message to be sent back
 * @info: receiver information
 *
 * Unicasts @skb to the sender recorded in @info (@snd_portid), in the
 * network namespace recorded in @info.
 */
static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info)
{
        struct net *net = genl_info_net(info);

        return genlmsg_unicast(net, skb, info->snd_portid);
}

/**
 * genlmsg_data - head of message payload
 * @gnlh: genetlink message header
 *
 * Returns the address GENL_HDRLEN bytes past @gnlh.
 */
static inline void *genlmsg_data(const struct genlmsghdr *gnlh)
{
        unsigned char *base = (unsigned char *) gnlh;

        return base + GENL_HDRLEN;
}

/**
 * genlmsg_len - length of message payload
 * @gnlh: genetlink message header
 *
 * Reads nlmsg_len from the netlink header located NLMSG_HDRLEN bytes
 * before @gnlh and subtracts both header lengths from it.
 */
static inline int genlmsg_len(const struct genlmsghdr *gnlh)
{
        unsigned char *nl_start = (unsigned char *)gnlh - NLMSG_HDRLEN;
        struct nlmsghdr *nlh = (struct nlmsghdr *)nl_start;

        return nlh->nlmsg_len - GENL_HDRLEN - NLMSG_HDRLEN;
}

/**
 * genlmsg_msg_size - length of genetlink message not including padding
 * @payload: length of message payload
 *
 * Returns @payload plus GENL_HDRLEN.
 */
static inline int genlmsg_msg_size(int payload)
{
        int size = GENL_HDRLEN + payload;

        return size;
}

/**
 * genlmsg_total_size - length of genetlink message including padding
 * @payload: length of message payload
 *
 * Returns genlmsg_msg_size(@payload) rounded by NLMSG_ALIGN().
 */
static inline int genlmsg_total_size(int payload)
{
        int unpadded = genlmsg_msg_size(payload);

        return NLMSG_ALIGN(unpadded);
}

/**
 * genlmsg_new - Allocate a new generic netlink message
 * @payload: size of the message payload
 * @flags: the type of memory to allocate.
 *
 * Asks nlmsg_new() for genlmsg_total_size(@payload) bytes and returns its
 * result.
 */
static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags)
{
        int total = genlmsg_total_size(payload);

        return nlmsg_new(total, flags);
}

/**
 * genl_set_err - report error to genetlink broadcast listeners
 * @family: the generic netlink family
 * @net: the network namespace to report the error to
 * @portid: the PORTID of a process that we want to skip (if any)
 * @group: the broadcast group that will notice the error
 *         (this is the offset of the multicast group in the groups array)
 * @code: error code, must be negative (as usual in kernelspace)
 *
 * This function returns the number of broadcast listeners that have set the
 * NETLINK_RECV_NO_ENOBUFS socket option.  If @group is not below the
 * family's @n_mcgrps it warns once and returns -EINVAL instead.
 */
static inline int genl_set_err(const struct genl_family *family,
                               struct net *net, u32 portid,
                               u32 group, int code)
{
        u32 nl_group;

        if (WARN_ON_ONCE(group >= family->n_mcgrps))
                return -EINVAL;

        /* translate the family-relative offset */
        nl_group = family->mcgrp_offset + group;

        return netlink_set_err(net->genl_sock, portid, nl_group, code);
}

/**
 * genl_has_listeners - query listeners of a family's multicast group
 * @family: the generic netlink family
 * @net: the network namespace whose genl_sock is queried
 * @group: offset of the multicast group in the family's groups array
 *
 * Return: -EINVAL (after a one-time warning) if @group is not smaller than
 * family->n_mcgrps, otherwise the result of netlink_has_listeners().
 */
static inline int genl_has_listeners(const struct genl_family *family,
                                     struct net *net, unsigned int group)
{
        unsigned int abs_group;

        if (WARN_ON_ONCE(group >= family->n_mcgrps))
                return -EINVAL;

        /* Translate the family-relative index into the family's group offset. */
        abs_group = family->mcgrp_offset + group;

        return netlink_has_listeners(net->genl_sock, abs_group);
}
#endif        /* __NET_GENERIC_NETLINK_H */
























    4 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Global definitions for the Ethernet IEEE 802.3 interface.
 *
 * Version:        @(#)if_ether.h        1.0.1a        02/08/94
 *
 * Author:        Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Donald Becker, <becker@super.org>
 *                Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *                Steve Whitehouse, <gw7rrm@eeshack3.swan.ac.uk>
 */
#ifndef _LINUX_IF_ETHER_H
#define _LINUX_IF_ETHER_H

#include <linux/skbuff.h>
#include <uapi/linux/if_ether.h>

/* View the location returned by skb_mac_header() as an Ethernet header. */
static inline struct ethhdr *eth_hdr(const struct sk_buff *skb)
{
        struct ethhdr *eth;

        eth = (struct ethhdr *)skb_mac_header(skb);
        return eth;
}

/*
 * View skb->data as an Ethernet header.
 *
 * Prefer this version in TX path, instead of
 * skb_reset_mac_header() + eth_hdr()
 */
static inline struct ethhdr *skb_eth_hdr(const struct sk_buff *skb)
{
        struct ethhdr *eth;

        eth = (struct ethhdr *)skb->data;
        return eth;
}

/* View the location returned by skb_inner_mac_header() as an Ethernet header. */
static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb)
{
        struct ethhdr *eth;

        eth = (struct ethhdr *)skb_inner_mac_header(skb);
        return eth;
}

/*
 * Defined out of line; not visible from this header.
 * NOTE(review): presumably copies a hardware address taken from @skb into
 * @haddr -- confirm against the definition.
 */
int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr);

/*
 * Defined out of line; not visible from this header.
 * NOTE(review): presumably formats the @len bytes at @addr as text into
 * @buf -- confirm against the definition.
 */
ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len);

#endif        /* _LINUX_IF_ETHER_H */























































































































































































































































































































































































    1 





















































































    1 


















































































    1 







    1 









































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* include/asm-generic/tlb.h
 *
 *        Generic TLB shootdown code
 *
 * Copyright 2001 Red Hat, Inc.
 * Based on code from mm/memory.c Copyright Linus Torvalds and others.
 *
 * Copyright 2011 Red Hat, Inc., Peter Zijlstra
 */
#ifndef _ASM_GENERIC__TLB_H
#define _ASM_GENERIC__TLB_H

#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/hugetlb_inline.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>

/*
 * Blindly accessing user memory from NMI context can be dangerous
 * if we're in the middle of switching the current user task or switching
 * the loaded mm.
 *
 * An architecture may provide its own nmi_uaccess_okay(); when it does not,
 * the fallback below unconditionally evaluates to true.
 */
#ifndef nmi_uaccess_okay
# define nmi_uaccess_okay() true
#endif

#ifdef CONFIG_MMU

/*
 * Generic MMU-gather implementation.
 *
 * The mmu_gather data structure is used by the mm code to implement the
 * correct and efficient ordering of freeing pages and TLB invalidations.
 *
 * This correct ordering is:
 *
 *  1) unhook page
 *  2) TLB invalidate page
 *  3) free page
 *
 * That is, we must never free a page before we have ensured there are no live
 * translations left to it. Otherwise it might be possible to observe (or
 * worse, change) the page content after it has been reused.
 *
 * The mmu_gather API consists of:
 *
 *  - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_finish_mmu()
 *
 *    start and finish a mmu_gather
 *
 *    Finish in particular will issue a (final) TLB invalidate and free
 *    all (remaining) queued pages.
 *
 *  - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA
 *
 *    Defaults to flushing at tlb_end_vma() to reset the range; helps when
 *    there's large holes between the VMAs.
 *
 *  - tlb_remove_table()
 *
 *    tlb_remove_table() is the basic primitive to free page-table directories
 *    (__p*_free_tlb()).  In its most primitive form it is an alias for
 *    tlb_remove_page() below, for when page directories are pages and have no
 *    additional constraints.
 *
 *    See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE.
 *
 *  - tlb_remove_page() / __tlb_remove_page()
 *  - tlb_remove_page_size() / __tlb_remove_page_size()
 *  - __tlb_remove_folio_pages()
 *
 *    __tlb_remove_page_size() is the basic primitive that queues a page for
 *    freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a
 *    boolean indicating if the queue is (now) full and a call to
 *    tlb_flush_mmu() is required.
 *
 *    tlb_remove_page() and tlb_remove_page_size() imply the call to
 *    tlb_flush_mmu() when required and have no return value.
 *
 *    __tlb_remove_folio_pages() is similar to __tlb_remove_page(), however,
 *    instead of removing a single page, remove the given number of consecutive
 *    pages that are all part of the same (large) folio: just like calling
 *    __tlb_remove_page() on each page individually.
 *
 *  - tlb_change_page_size()
 *
 *    call before __tlb_remove_page*() to set the current page-size; implies a
 *    possible tlb_flush_mmu() call.
 *
 *  - tlb_flush_mmu() / tlb_flush_mmu_tlbonly()
 *
 *    tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets
 *                              related state, like the range)
 *
 *    tlb_flush_mmu() - in addition to the above TLB invalidate, also frees
 *                        whatever pages are still batched.
 *
 *  - mmu_gather::fullmm
 *
 *    A flag set by tlb_gather_mmu_fullmm() to indicate we're going to free
 *    the entire mm; this allows a number of optimizations.
 *
 *    - We can ignore tlb_{start,end}_vma(); because we don't
 *      care about ranges. Everything will be shot down.
 *
 *    - (RISC) architectures that use ASIDs can cycle to a new ASID
 *      and delay the invalidation until ASID space runs out.
 *
 *  - mmu_gather::need_flush_all
 *
 *    A flag that can be set by the arch code if it wants to force
 *    flush the entire TLB irrespective of the range. For instance
 *    x86-PAE needs this when changing top-level entries.
 *
 * And allows the architecture to provide and implement tlb_flush():
 *
 * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make
 * use of:
 *
 *  - mmu_gather::start / mmu_gather::end
 *
 *    which provides the range that needs to be flushed to cover the pages to
 *    be freed.
 *
 *  - mmu_gather::freed_tables
 *
 *    set when we freed page table pages
 *
 *  - tlb_get_unmap_shift() / tlb_get_unmap_size()
 *
 *    returns the smallest TLB entry size unmapped in this range.
 *
 * If an architecture does not provide tlb_flush() a default implementation
 * based on flush_tlb_range() will be used, unless MMU_GATHER_NO_RANGE is
 * specified, in which case we'll default to flush_tlb_mm().
 *
 * Additionally there are a few opt-in features:
 *
 *  MMU_GATHER_PAGE_SIZE
 *
 *  This ensures we call tlb_flush() every time tlb_change_page_size() actually
 *  changes the size and provides mmu_gather::page_size to tlb_flush().
 *
 *  This might be useful if your architecture has size specific TLB
 *  invalidation instructions.
 *
 *  MMU_GATHER_TABLE_FREE
 *
 *  This provides tlb_remove_table(), to be used instead of tlb_remove_page()
 *  for page directories (__p*_free_tlb()).
 *
 *  Useful if your architecture has non-page page directories.
 *
 *  When used, an architecture is expected to provide __tlb_remove_table()
 *  which does the actual freeing of these pages.
 *
 *  MMU_GATHER_RCU_TABLE_FREE
 *
 *  Like MMU_GATHER_TABLE_FREE, and adds semi-RCU semantics to the free (see
 *  comment below).
 *
 *  Useful if your architecture doesn't use IPIs for remote TLB invalidates
 *  and therefore doesn't naturally serialize with software page-table walkers.
 *
 *  MMU_GATHER_NO_FLUSH_CACHE
 *
 *  Indicates the architecture has flush_cache_range() but it needs *NOT* be called
 *  before unmapping a VMA.
 *
 *  NOTE: strictly speaking we shouldn't have this knob and instead rely on
 *          flush_cache_range() being a NOP, except Sparc64 seems to be
 *          different here.
 *
 *  MMU_GATHER_MERGE_VMAS
 *
 *  Indicates the architecture wants to merge ranges over VMAs; typical when
 *  multiple range invalidates are more expensive than a full invalidate.
 *
 *  MMU_GATHER_NO_RANGE
 *
 *  Use this if your architecture lacks an efficient flush_tlb_range(). This
 *  option implies MMU_GATHER_MERGE_VMAS above.
 *
 *  MMU_GATHER_NO_GATHER
 *
 *  If the option is set the mmu_gather will not track individual pages for
 *  delayed page free anymore. A platform that enables the option needs to
 *  provide its own implementation of the __tlb_remove_page_size() function to
 *  free pages.
 *
 *  This is useful if your architecture already flushes TLB entries in the
 *  various ptep_get_and_clear() functions.
 */

#ifdef CONFIG_MMU_GATHER_TABLE_FREE

/*
 * Header followed by a flexible array of table pointers. The rcu member is
 * only present with CONFIG_MMU_GATHER_RCU_TABLE_FREE.
 * NOTE(review): @nr looks like the number of used slots in tables[] -- the
 * code maintaining it is outside this header.
 */
struct mmu_table_batch {
#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
        struct rcu_head                rcu;
#endif
        unsigned int                nr;
        void                        *tables[];
};

/* Number of table pointers that fit in one page after the batch header. */
#define MAX_TABLE_BATCH                \
        ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *))

/* Out-of-line implementation; not visible from this header. */
extern void tlb_remove_table(struct mmu_gather *tlb, void *table);

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

/*
 * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based
 * page directories and we can use the normal page batching to free them.
 */
#define tlb_remove_table(tlb, page) tlb_remove_page((tlb), (page))

#endif /* CONFIG_MMU_GATHER_TABLE_FREE */

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
/*
 * This allows an architecture that does not use the linux page-tables for
 * hardware to skip the TLBI when freeing page tables.
 *
 * Unless the architecture overrides it, the hook evaluates to true.
 */
#ifndef tlb_needs_table_invalidate
#define tlb_needs_table_invalidate() (true)
#endif

/* Out-of-line implementation; not visible from this header. */
void tlb_remove_table_sync_one(void);

#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/* Overriding the hook without RCU table freeing is rejected at build time. */
#ifdef tlb_needs_table_invalidate
#error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
#endif

/* Empty stub so callers need no #ifdef of their own. */
static inline void tlb_remove_table_sync_one(void) { }

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */


#ifndef CONFIG_MMU_GATHER_NO_GATHER
/*
 * If we can't allocate a page to make a big batch of page pointers
 * to work on, then just handle a few from the on-stack structure.
 *
 * MMU_GATHER_BUNDLE sizes the __pages[] array embedded in struct mmu_gather.
 */
#define MMU_GATHER_BUNDLE        8

/*
 * Header followed by a flexible array of encoded page pointers; batches can
 * be linked through @next.
 * NOTE(review): @nr / @max look like the used / available slot counts of
 * encoded_pages[] -- the code maintaining them is outside this header.
 */
struct mmu_gather_batch {
        struct mmu_gather_batch        *next;
        unsigned int                nr;
        unsigned int                max;
        struct encoded_page        *encoded_pages[];
};

/* Number of pointers that fit in one page after the batch header. */
#define MAX_GATHER_BATCH        \
        ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *))

/*
 * Limit the maximum number of mmu_gather batches to reduce a risk of soft
 * lockups for non-preemptible kernels on huge machines when a lot of memory
 * is zapped during unmapping.
 * 10K pages freed at once should be safe even without a preemption point.
 */
#define MAX_GATHER_BATCH_COUNT        (10000UL/MAX_GATHER_BATCH)

/*
 * Queueing primitives, implemented out of line. Per the API description at
 * the top of this header, __tlb_remove_page_size() returns true when the
 * queue is (now) full and a call to tlb_flush_mmu() is required.
 */
extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
                bool delay_rmap, int page_size);
bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
                unsigned int nr_pages, bool delay_rmap);

#ifdef CONFIG_SMP
/*
 * This both sets 'delayed_rmap', and returns true. It would be an inline
 * function, except we define it before the 'struct mmu_gather'.
 */
#define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true)
extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma);
#endif /* CONFIG_SMP */

#endif /* !CONFIG_MMU_GATHER_NO_GATHER */

/*
 * We have a no-op version of the rmap removal that doesn't
 * delay anything. That is used on S390, which flushes remote
 * TLBs synchronously, and on UP, which doesn't have any
 * remote TLBs to flush and is not preemptible due to this
 * all happening under the page table lock.
 *
 * The fallback is selected whenever tlb_delay_rmap has not been defined
 * earlier: it evaluates to false and tlb_flush_rmaps() becomes an empty stub.
 */
#ifndef tlb_delay_rmap
#define tlb_delay_rmap(tlb) (false)
static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
#endif

/*
 * struct mmu_gather is an opaque type used by the mm code for passing around
 * any data needed by arch specific code for tlb_remove_page.
 */
struct mmu_gather {
        /* address space being operated on; handed to flush_tlb_mm() */
        struct mm_struct        *mm;

#ifdef CONFIG_MMU_GATHER_TABLE_FREE
        /* NOTE(review): presumably the current batch of queued page tables */
        struct mmu_table_batch        *batch;
#endif

        /*
         * range that needs to be flushed; grown by __tlb_adjust_range()
         * and reinitialised by __tlb_reset_range()
         */
        unsigned long                start;
        unsigned long                end;
        /*
         * we are in the middle of an operation to clear
         * a full mm and can make some optimizations
         */
        unsigned int                fullmm : 1;

        /*
         * we have performed an operation which
         * requires a complete flush of the tlb
         */
        unsigned int                need_flush_all : 1;

        /*
         * we have removed page directories
         */
        unsigned int                freed_tables : 1;

        /*
         * Do we have pending delayed rmap removals?
         */
        unsigned int                delayed_rmap : 1;

        /*
         * at which levels have we cleared entries?
         */
        unsigned int                cleared_ptes : 1;
        unsigned int                cleared_pmds : 1;
        unsigned int                cleared_puds : 1;
        unsigned int                cleared_p4ds : 1;

        /*
         * tracks VM_EXEC | VM_HUGETLB in tlb_start_vma; vma_pfn additionally
         * records VM_PFNMAP | VM_MIXEDMAP (see tlb_update_vma_flags())
         */
        unsigned int                vma_exec : 1;
        unsigned int                vma_huge : 1;
        unsigned int                vma_pfn  : 1;

        /* NOTE(review): presumably bounded by MAX_GATHER_BATCH_COUNT -- confirm */
        unsigned int                batch_count;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        /*
         * NOTE(review): page batching state; 'local' followed directly by
         * '__pages' looks like the embedded on-stack batch that the
         * MMU_GATHER_BUNDLE comment refers to -- confirm in the users.
         */
        struct mmu_gather_batch *active;
        struct mmu_gather_batch        local;
        struct page                *__pages[MMU_GATHER_BUNDLE];

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        /* current page size, maintained by tlb_change_page_size() */
        unsigned int page_size;
#endif
#endif
};

/*
 * Implemented out of line. Per the API description at the top of this
 * header: does the TLB invalidate and also frees whatever pages are still
 * batched.
 */
void tlb_flush_mmu(struct mmu_gather *tlb);

/*
 * Grow the pending flush range of @tlb so that it also covers
 * [@address, @address + @range_size).
 *
 * @range_size is an unsigned long: the tlb_flush_*_range() helpers forward an
 * unsigned long size (up to a whole huge page), and a narrower parameter
 * would silently truncate sizes of 4 GiB and above, leaving part of the
 * unmapped range outside [start, end).
 */
static inline void __tlb_adjust_range(struct mmu_gather *tlb,
                                      unsigned long address,
                                      unsigned long range_size)
{
        tlb->start = min(tlb->start, address);
        tlb->end = max(tlb->end, address + range_size);
}

/*
 * Put @tlb back into its "nothing pending" state: reinitialise the flush
 * range and forget which page-table levels were touched.
 */
static inline void __tlb_reset_range(struct mmu_gather *tlb)
{
        if (!tlb->fullmm) {
                /*
                 * Inverted range, so that the min()/max() updates done by
                 * __tlb_adjust_range() start from scratch.
                 */
                tlb->start = TASK_SIZE;
                tlb->end = 0;
        } else {
                tlb->end = ~0;
                tlb->start = tlb->end;
        }

        tlb->cleared_p4ds = 0;
        tlb->cleared_puds = 0;
        tlb->cleared_pmds = 0;
        tlb->cleared_ptes = 0;
        tlb->freed_tables = 0;
        /*
         * The mmu_gather::vma_* fields are deliberately left alone:
         * tlb_start_vma() is not called again to set them if there is an
         * intermediate flush.
         */
}

#ifdef CONFIG_MMU_GATHER_NO_RANGE

#if defined(tlb_flush)
#error MMU_GATHER_NO_RANGE relies on default tlb_flush()
#endif

/*
 * When an architecture does not have efficient means of range flushing TLBs
 * there is no point in doing intermediate flushes on tlb_end_vma() to keep the
 * range small. We equally don't have to worry about page granularity or other
 * things.
 *
 * All we need to do is issue a full flush for any !0 range.
 */
static inline void tlb_flush(struct mmu_gather *tlb)
{
        /* end == 0 means nothing was recorded since the last reset. */
        if (!tlb->end)
                return;

        flush_tlb_mm(tlb->mm);
}

#else /* CONFIG_MMU_GATHER_NO_RANGE */

#ifndef tlb_flush
/*
 * When an architecture does not provide its own tlb_flush() implementation
 * but does have a reasonably efficient flush_tlb_range() implementation
 * use that.
 */
static inline void tlb_flush(struct mmu_gather *tlb)
{
        /* Whole-mm teardown or a forced full flush: ignore the range. */
        if (tlb->fullmm || tlb->need_flush_all) {
                flush_tlb_mm(tlb->mm);
                return;
        }

        if (tlb->end) {
                /*
                 * flush_tlb_range() wants a VMA; hand it a temporary one
                 * carrying only the mm and the flags recorded in @tlb.
                 */
                struct vm_area_struct vma = {
                        .vm_mm = tlb->mm,
                        .vm_flags = (tlb->vma_exec ? VM_EXEC    : 0) |
                                    (tlb->vma_huge ? VM_HUGETLB : 0),
                };

                flush_tlb_range(&vma, tlb->start, tlb->end);
        }
}
#endif

#endif /* CONFIG_MMU_GATHER_NO_RANGE */

/* Record in @tlb the properties of @vma that later flushes depend on. */
static inline void
tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        /*
         * flush_tlb_range() implementations that look at VM_HUGETLB (tile,
         * mips-4k) flush only large pages.
         *
         * flush_tlb_range() implementations that flush I-TLB also flush D-TLB
         * (tile, xtensa, arm), so it's ok to just add VM_EXEC to an existing
         * range.
         *
         * We rely on tlb_end_vma() to issue a flush, such that when we reset
         * these values the batch is empty.
         */
        tlb->vma_huge = is_vm_hugetlb_page(vma);
        tlb->vma_exec = (vma->vm_flags & VM_EXEC) ? 1 : 0;
        tlb->vma_pfn  = (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ? 1 : 0;
}

/* Invalidate the TLB for the recorded range, then reset the range state. */
static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
        /*
         * Anything calling __tlb_adjust_range() also sets at least one of
         * these bits, so all of them clear means there is nothing to flush.
         */
        if (!tlb->freed_tables && !tlb->cleared_ptes && !tlb->cleared_pmds &&
            !tlb->cleared_puds && !tlb->cleared_p4ds)
                return;

        tlb_flush(tlb);
        __tlb_reset_range(tlb);
}

/*
 * Queue @page (of size @page_size) without delayed rmap handling, and call
 * tlb_flush_mmu() when __tlb_remove_page_size() reports that it is needed.
 */
static inline void tlb_remove_page_size(struct mmu_gather *tlb,
                                        struct page *page, int page_size)
{
        if (!__tlb_remove_page_size(tlb, page, false, page_size))
                return;

        tlb_flush_mmu(tlb);
}

/*
 * PAGE_SIZE flavour of __tlb_remove_page_size(); the return value is passed
 * through unchanged.
 */
static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb,
                struct page *page, bool delay_rmap)
{
        bool queue_full;

        queue_full = __tlb_remove_page_size(tlb, page, delay_rmap, PAGE_SIZE);
        return queue_full;
}

/* tlb_remove_page
 *        Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when
 *        required. Forwards to tlb_remove_page_size() with PAGE_SIZE and has
 *        no return value.
 */
static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
        /*
         * Plain call, not "return f();": ISO C does not allow a return
         * statement with an expression in a function returning void.
         */
        tlb_remove_page_size(tlb, page, PAGE_SIZE);
}

/* Hand page table @pt to tlb_remove_table(); no other work is done here. */
static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt)
{
        tlb_remove_table(tlb, pt);
}

/*
 * Like tlb_remove_ptdesc, but for page-like page directories: @pt is
 * converted with ptdesc_page() and queued through tlb_remove_page().
 */
static inline void tlb_remove_page_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt)
{
        tlb_remove_page(tlb, ptdesc_page(pt));
}

/*
 * Set the page size used for subsequently queued pages. Only does anything
 * with CONFIG_MMU_GATHER_PAGE_SIZE: when a non-zero size was already set and
 * differs from @page_size, tlb_flush_mmu() is called first, unless a full-mm
 * or forced full flush is pending anyway.
 */
static inline void tlb_change_page_size(struct mmu_gather *tlb,
                                        unsigned int page_size)
{
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        if (tlb->page_size && tlb->page_size != page_size &&
            !tlb->fullmm && !tlb->need_flush_all)
                tlb_flush_mmu(tlb);

        tlb->page_size = page_size;
#endif
}

/*
 * Return the shift of the lowest page-table level at which entries were
 * cleared since the last reset, or PAGE_SHIFT when none were.
 */
static inline unsigned long tlb_get_unmap_shift(struct mmu_gather *tlb)
{
        unsigned long shift = PAGE_SHIFT;

        /* Checked from the smallest level upwards; the first hit wins. */
        if (tlb->cleared_ptes)
                shift = PAGE_SHIFT;
        else if (tlb->cleared_pmds)
                shift = PMD_SHIFT;
        else if (tlb->cleared_puds)
                shift = PUD_SHIFT;
        else if (tlb->cleared_p4ds)
                shift = P4D_SHIFT;

        return shift;
}

/* Byte-size counterpart of tlb_get_unmap_shift(): 1UL << shift. */
static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb)
{
        unsigned long shift = tlb_get_unmap_shift(tlb);

        return 1UL << shift;
}

/*
 * In the case of tlb vma handling, we can optimise these away in the
 * case where we're doing a full MM flush.  When we're doing a munmap,
 * the vmas are adjusted to only cover the region to be torn down.
 */
static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        if (!tlb->fullmm) {
                tlb_update_vma_flags(tlb, vma);
#ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE
                /* Skipped when the architecture opts out via the config knob. */
                flush_cache_range(vma, vma->vm_start, vma->vm_end);
#endif
        }
}

/*
 * Called when a VMA has been processed; outside of full-mm teardown it may
 * flush the TLB and reset the gathered range.
 */
static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        if (tlb->fullmm)
                return;

        /*
         * With MMU_GATHER_MERGE_VMAS, ranges are merged across VMAs and no
         * flush is issued here -- except for VM_PFNMAP, which is more fragile
         * because the core mm will not track the page mapcount: there might
         * not be page-frames for these PFNs after all. Force flush TLBs for
         * such ranges to avoid munmap() vs unmap_mapping_range() races.
         */
        if (!tlb->vma_pfn && IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
                return;

        /*
         * Do a TLB flush and reset the range at VMA boundaries; this avoids
         * the ranges growing with the unused space between consecutive VMAs.
         */
        tlb_flush_mmu_tlbonly(tlb);
}

/*
 * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end,
 * and set corresponding cleared_*.
 */
static inline void tlb_flush_pte_range(struct mmu_gather *tlb,
                                     unsigned long address, unsigned long size)
{
        /* Note the PTE level, then widen the range by [address, address + size). */
        tlb->cleared_ptes = 1;
        __tlb_adjust_range(tlb, address, size);
}

/* Note the PMD level, then widen the range by [address, address + size). */
static inline void tlb_flush_pmd_range(struct mmu_gather *tlb,
                                     unsigned long address, unsigned long size)
{
        tlb->cleared_pmds = 1;
        __tlb_adjust_range(tlb, address, size);
}

/* Note the PUD level, then widen the range by [address, address + size). */
static inline void tlb_flush_pud_range(struct mmu_gather *tlb,
                                     unsigned long address, unsigned long size)
{
        tlb->cleared_puds = 1;
        __tlb_adjust_range(tlb, address, size);
}

/* Note the P4D level, then widen the range by [address, address + size). */
static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
                                     unsigned long address, unsigned long size)
{
        tlb->cleared_p4ds = 1;
        __tlb_adjust_range(tlb, address, size);
}

/*
 * Default per-entry hook: does nothing. When __tlb_remove_tlb_entry is
 * already defined, this fallback is compiled out.
 */
#ifndef __tlb_remove_tlb_entry
static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address)
{
}
#endif

/**
 * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation.
 *
 * Record the fact that pte's were really unmapped by updating the range,
 * so we can later optimise away the tlb invalidate.   This helps when
 * userspace is unmapping already-unmapped pages, which happens quite a lot.
 *
 * The range is extended by PAGE_SIZE at @address before the per-entry hook
 * __tlb_remove_tlb_entry() is invoked. Being a macro, @tlb and @address are
 * expanded twice, so pass expressions without side effects.
 */
#define tlb_remove_tlb_entry(tlb, ptep, address)                \
        do {                                                        \
                tlb_flush_pte_range(tlb, address, PAGE_SIZE);        \
                __tlb_remove_tlb_entry(tlb, ptep, address);        \
        } while (0)

/**
 * tlb_remove_tlb_entries - remember unmapping of multiple consecutive ptes for
 *                            later tlb invalidation.
 * @tlb: the mmu_gather to record into
 * @ptep: first of the @nr consecutive ptes
 * @nr: number of consecutive ptes
 * @address: address belonging to the first pte
 *
 * Similar to tlb_remove_tlb_entry(), but remember unmapping of multiple
 * consecutive ptes instead of only a single one.
 *
 * A count of zero records nothing: the range is left untouched and the
 * per-entry hook is not invoked (a decrement-then-test loop would wrap the
 * unsigned counter and run far past the ptes).
 */
static inline void tlb_remove_tlb_entries(struct mmu_gather *tlb,
                pte_t *ptep, unsigned int nr, unsigned long address)
{
        unsigned int i;

        if (!nr)
                return;

        tlb_flush_pte_range(tlb, address, PAGE_SIZE * nr);

        /* One hook invocation per pte, each with its own address. */
        for (i = 0; i < nr; i++)
                __tlb_remove_tlb_entry(tlb, ptep + i,
                                       address + i * PAGE_SIZE);
}

/*
 * tlb_remove_huge_tlb_entry - remember the unmapping of a hugetlb entry.
 *
 * The range is extended by huge_page_size(h) at @address, and the level that
 * gets marked as cleared is picked by comparing that size against P4D_SIZE,
 * PUD_SIZE and PMD_SIZE (largest first), falling back to the pte level.
 * Afterwards the per-entry hook __tlb_remove_tlb_entry() is invoked.
 * Being a macro, @tlb and @address are expanded more than once.
 */
#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)        \
        do {                                                        \
                unsigned long _sz = huge_page_size(h);                \
                if (_sz >= P4D_SIZE)                                \
                        tlb_flush_p4d_range(tlb, address, _sz);        \
                else if (_sz >= PUD_SIZE)                        \
                        tlb_flush_pud_range(tlb, address, _sz);        \
                else if (_sz >= PMD_SIZE)                        \
                        tlb_flush_pmd_range(tlb, address, _sz);        \
                else                                                \
                        tlb_flush_pte_range(tlb, address, _sz);        \
                __tlb_remove_tlb_entry(tlb, ptep, address);        \
        } while (0)

/**
 * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation
 *
 * The range is always extended by HPAGE_PMD_SIZE at @address. Only the
 * per-entry hook __tlb_remove_pmd_tlb_entry() is a nop so far, because only
 * x86 needs it; the default below is used unless the hook is already defined.
 */
#ifndef __tlb_remove_pmd_tlb_entry
#define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0)
#endif

#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address)                        \
        do {                                                                \
                tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE);        \
                __tlb_remove_pmd_tlb_entry(tlb, pmdp, address);                \
        } while (0)

/**
 * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb
 * invalidation.
 *
 * The range is always extended by HPAGE_PUD_SIZE at @address. Only the
 * per-entry hook __tlb_remove_pud_tlb_entry() is a nop so far, because only
 * x86 needs it; the default below is used unless the hook is already defined.
 */
#ifndef __tlb_remove_pud_tlb_entry
#define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0)
#endif

#define tlb_remove_pud_tlb_entry(tlb, pudp, address)                        \
        do {                                                                \
                tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE);        \
                __tlb_remove_pud_tlb_entry(tlb, pudp, address);                \
        } while (0)

/*
 * For things like page tables caches (ie caching addresses "inside" the
 * page tables, like x86 does), for legacy reasons, flushing an
 * individual page had better flush the page table caches behind it. This
 * is definitely how x86 works, for example. And if you have an
 * architected non-legacy page table cache (which I'm not aware of
 * anybody actually doing), you're going to have some architecturally
 * explicit flushing for that, likely *separate* from a regular TLB entry
 * flush, and thus you'd need more than just some range expansion..
 *
 * So if we ever find an architecture
 * that would want something that odd, I think it is up to that
 * architecture to do its own odd thing, not cause pain for others
 * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com
 *
 * For now w.r.t page table cache, mark the range_size as PAGE_SIZE
 */

/*
 * pte_free_tlb - free a pte-level page table through the mmu_gather.
 *
 * Extends the range by PAGE_SIZE at @address at the pmd level, flags that
 * page tables were freed and hands the table to __pte_free_tlb().
 * @tlb is parenthesised before being dereferenced so that any pointer
 * expression may be passed; @tlb and @address are expanded more than once.
 */
#ifndef pte_free_tlb
#define pte_free_tlb(tlb, ptep, address)                        \
        do {                                                        \
                tlb_flush_pmd_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __pte_free_tlb(tlb, ptep, address);                \
        } while (0)
#endif

/*
 * pmd_free_tlb - free a pmd-level page table through the mmu_gather.
 *
 * Extends the range by PAGE_SIZE at @address at the pud level, flags that
 * page tables were freed and hands the table to __pmd_free_tlb().
 * @tlb is parenthesised before being dereferenced so that any pointer
 * expression may be passed; @tlb and @address are expanded more than once.
 */
#ifndef pmd_free_tlb
#define pmd_free_tlb(tlb, pmdp, address)                        \
        do {                                                        \
                tlb_flush_pud_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __pmd_free_tlb(tlb, pmdp, address);                \
        } while (0)
#endif

/*
 * pud_free_tlb - free a pud-level page table through the mmu_gather.
 *
 * Extends the range by PAGE_SIZE at @address at the p4d level, flags that
 * page tables were freed and hands the table to __pud_free_tlb().
 * @tlb is parenthesised before being dereferenced so that any pointer
 * expression may be passed; @tlb and @address are expanded more than once.
 */
#ifndef pud_free_tlb
#define pud_free_tlb(tlb, pudp, address)                        \
        do {                                                        \
                tlb_flush_p4d_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __pud_free_tlb(tlb, pudp, address);                \
        } while (0)
#endif

/*
 * p4d_free_tlb - free a p4d-level page table through the mmu_gather.
 *
 * Extends the range by PAGE_SIZE at @address (no cleared_* level is marked
 * here, only freed_tables) and hands the table to __p4d_free_tlb().
 * @tlb is parenthesised before being dereferenced so that any pointer
 * expression may be passed; @tlb and @address are expanded more than once.
 */
#ifndef p4d_free_tlb
#define p4d_free_tlb(tlb, p4dp, address)                        \
        do {                                                        \
                __tlb_adjust_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __p4d_free_tlb(tlb, p4dp, address);                \
        } while (0)
#endif

/*
 * Fallback used when pte_needs_flush is not already defined: returns true
 * unconditionally, regardless of @oldpte and @newpte.
 */
#ifndef pte_needs_flush
static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte)
{
        return true;
}
#endif

/*
 * Fallback used when huge_pmd_needs_flush is not already defined: returns
 * true unconditionally, regardless of @oldpmd and @newpmd.
 */
#ifndef huge_pmd_needs_flush
static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
{
        return true;
}
#endif

#endif /* CONFIG_MMU */

#endif /* _ASM_GENERIC__TLB_H */































































































































































































































    1 


    1 

    1 



    1 









    1 
































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
/*
 * net/tipc/trace.h: TIPC tracepoints
 *
 * Copyright (c) 2018, Ericsson AB
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#undef TRACE_SYSTEM
#define TRACE_SYSTEM tipc

#if !defined(_TIPC_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TIPC_TRACE_H

#include <linux/tracepoint.h>
#include "core.h"
#include "link.h"
#include "socket.h"
#include "node.h"

/*
 * Sizes, in chars, of the dump buffers that the event classes in this
 * header reserve with __dynamic_array().  Each object type has a small
 * (*_LMIN) and a large (*_LMAX) size; all are multiples of SKB_LMIN.
 */
#define SKB_LMIN        (100)
#define SKB_LMAX        (SKB_LMIN * 2)
#define LIST_LMIN        (SKB_LMIN * 3)
#define LIST_LMAX        (SKB_LMIN * 11)
#define SK_LMIN                (SKB_LMIN * 2)
#define SK_LMAX                (SKB_LMIN * 11)
#define LINK_LMIN        (SKB_LMIN)
#define LINK_LMAX        (SKB_LMIN * 16)
#define NODE_LMIN        (SKB_LMIN)
#define NODE_LMAX        (SKB_LMIN * 11)

#ifndef __TIPC_TRACE_ENUM
#define __TIPC_TRACE_ENUM
/*
 * Queue-dump selector flags.  Bits 0-4 name link-level queues and the
 * wakeup list, bits 8-10 name socket queues; TIPC_DUMP_ALL sets all 16 bits.
 * NOTE(review): presumably these are the values passed as the u16 "dqueues"
 * argument of the sk/link events below - confirm against the callers.
 * The __TIPC_TRACE_ENUM guard keeps the enum from being defined twice when
 * this header is read more than once (see TRACE_HEADER_MULTI_READ).
 */
enum {
        TIPC_DUMP_NONE                = 0,

        TIPC_DUMP_TRANSMQ        = 1,
        TIPC_DUMP_BACKLOGQ        = (1 << 1),
        TIPC_DUMP_DEFERDQ        = (1 << 2),
        TIPC_DUMP_INPUTQ        = (1 << 3),
        TIPC_DUMP_WAKEUP        = (1 << 4),

        TIPC_DUMP_SK_SNDQ        = (1 << 8),
        TIPC_DUMP_SK_RCVQ        = (1 << 9),
        TIPC_DUMP_SK_BKLGQ        = (1 << 10),
        TIPC_DUMP_ALL                = 0xffffu
};
#endif

/*
 * Link & Node FSM states:
 * state_sym(val) expands to a __print_symbolic() call that turns a state
 * value into its name for TP_printk().  The table lists numeric values
 * rather than enum names; "FAILINGOVER" and "SYNCHING" each appear twice
 * with different values.
 * NOTE(review): presumably one entry is the link state and the other the
 * node state - confirm against the FSM definitions.
 */
#define state_sym(val)                                                          \
        __print_symbolic(val,                                                  \
                        {(0xe),                "ESTABLISHED"                        },\
                        {(0xe << 4),        "ESTABLISHING"                        },\
                        {(0x1 << 8),        "RESET"                                },\
                        {(0x2 << 12),        "RESETTING"                        },\
                        {(0xd << 16),        "PEER_RESET"                        },\
                        {(0xf << 20),        "FAILINGOVER"                        },\
                        {(0xc << 24),        "SYNCHING"                        },\
                        {(0xdd),        "SELF_DOWN_PEER_DOWN"                },\
                        {(0xaa),        "SELF_UP_PEER_UP"                },\
                        {(0xd1),        "SELF_DOWN_PEER_LEAVING"        },\
                        {(0xac),        "SELF_UP_PEER_COMING"                },\
                        {(0xca),        "SELF_COMING_PEER_UP"                },\
                        {(0x1d),        "SELF_LEAVING_PEER_DOWN"        },\
                        {(0xf0),        "FAILINGOVER"                        },\
                        {(0xcc),        "SYNCHING"                        })

/*
 * Link & Node FSM events:
 * evt_sym(val) expands to a __print_symbolic() call that turns an FSM event
 * value into its name for TP_printk().  As with state_sym(), the table uses
 * raw numeric values, and the FAILOVER_* / SYNCH_* names appear twice with
 * different values.
 */
#define evt_sym(val)                                                          \
        __print_symbolic(val,                                                  \
                        {(0xec1ab1e),        "ESTABLISH_EVT"                        },\
                        {(0x9eed0e),        "PEER_RESET_EVT"                },\
                        {(0xfa110e),        "FAILURE_EVT"                        },\
                        {(0x10ca1d0e),        "RESET_EVT"                        },\
                        {(0xfa110bee),        "FAILOVER_BEGIN_EVT"                },\
                        {(0xfa110ede),        "FAILOVER_END_EVT"                },\
                        {(0xc1ccbee),        "SYNCH_BEGIN_EVT"                },\
                        {(0xc1ccede),        "SYNCH_END_EVT"                        },\
                        {(0xece),        "SELF_ESTABL_CONTACT_EVT"        },\
                        {(0x1ce),        "SELF_LOST_CONTACT_EVT"                },\
                        {(0x9ece),        "PEER_ESTABL_CONTACT_EVT"        },\
                        {(0x91ce),        "PEER_LOST_CONTACT_EVT"                },\
                        {(0xfbe),        "FAILOVER_BEGIN_EVT"                },\
                        {(0xfee),        "FAILOVER_END_EVT"                },\
                        {(0xcbe),        "SYNCH_BEGIN_EVT"                },\
                        {(0xcee),        "SYNCH_END_EVT"                        })

/*
 * Bearer, net device events:
 * dev_evt_sym(val) expands to a __print_symbolic() call that maps the
 * NETDEV_* notifier values listed here to their names for TP_printk().
 */
#define dev_evt_sym(val)                                                  \
        __print_symbolic(val,                                                  \
                        {(NETDEV_CHANGE),        "NETDEV_CHANGE"                },\
                        {(NETDEV_GOING_DOWN),        "NETDEV_GOING_DOWN"        },\
                        {(NETDEV_UP),                "NETDEV_UP"                },\
                        {(NETDEV_CHANGEMTU),        "NETDEV_CHANGEMTU"        },\
                        {(NETDEV_CHANGEADDR),        "NETDEV_CHANGEADDR"        },\
                        {(NETDEV_UNREGISTER),        "NETDEV_UNREGISTER"        },\
                        {(NETDEV_CHANGENAME),        "NETDEV_CHANGENAME"        })

/*
 * Helpers defined outside this header.  The tipc_*_dump() functions write
 * a textual dump of their object into @buf; the event classes in this file
 * call them from TP_fast_assign() on a buffer sized with the *_LMIN/*_LMAX
 * macros.  tipc_sk_filtering() is the predicate used in TP_CONDITION() of
 * the socket events.
 */
extern unsigned long sysctl_tipc_sk_filter[5] __read_mostly;

int tipc_skb_dump(struct sk_buff *skb, bool more, char *buf);
int tipc_list_dump(struct sk_buff_head *list, bool more, char *buf);
int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf);
int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf);
int tipc_node_dump(struct tipc_node *n, bool more, char *buf);
bool tipc_sk_filtering(struct sock *sk);

/*
 * Event class for single-skb events.
 * Records the caller-supplied @header string and a dump of @skb produced by
 * tipc_skb_dump(); the dump buffer is SKB_LMAX chars when @more is set and
 * SKB_LMIN otherwise.  Printed as the header on one line followed by the
 * dump.
 */
DECLARE_EVENT_CLASS(tipc_skb_class,

        TP_PROTO(struct sk_buff *skb, bool more, const char *header),

        TP_ARGS(skb, more, header),

        TP_STRUCT__entry(
                __string(header, header)
                __dynamic_array(char, buf, (more) ? SKB_LMAX : SKB_LMIN)
        ),

        TP_fast_assign(
                __assign_str(header);
                tipc_skb_dump(skb, more, __get_str(buf));
        ),

        TP_printk("%s\n%s", __get_str(header), __get_str(buf))
)

/*
 * DEFINE_SKB_EVENT(name): define tracepoint @name as an instance of
 * tipc_skb_class with the class's (skb, more, header) prototype.
 */
#define DEFINE_SKB_EVENT(name) \
DEFINE_EVENT(tipc_skb_class, name, \
        TP_PROTO(struct sk_buff *skb, bool more, const char *header), \
        TP_ARGS(skb, more, header))
DEFINE_SKB_EVENT(tipc_skb_dump);
DEFINE_SKB_EVENT(tipc_proto_build);
DEFINE_SKB_EVENT(tipc_proto_rcv);

/*
 * Event class for skb-list events.
 * Records the caller-supplied @header string and a dump of @list produced
 * by tipc_list_dump(); the dump buffer is LIST_LMAX chars when @more is set
 * and LIST_LMIN otherwise.  Printed as the header on one line followed by
 * the dump.
 */
DECLARE_EVENT_CLASS(tipc_list_class,

        TP_PROTO(struct sk_buff_head *list, bool more, const char *header),

        TP_ARGS(list, more, header),

        TP_STRUCT__entry(
                __string(header, header)
                __dynamic_array(char, buf, (more) ? LIST_LMAX : LIST_LMIN)
        ),

        TP_fast_assign(
                __assign_str(header);
                tipc_list_dump(list, more, __get_str(buf));
        ),

        TP_printk("%s\n%s", __get_str(header), __get_str(buf))
);

/*
 * DEFINE_LIST_EVENT(name): define tracepoint @name as an instance of
 * tipc_list_class with the class's (list, more, header) prototype.
 */
#define DEFINE_LIST_EVENT(name) \
DEFINE_EVENT(tipc_list_class, name, \
        TP_PROTO(struct sk_buff_head *list, bool more, const char *header), \
        TP_ARGS(list, more, header))
DEFINE_LIST_EVENT(tipc_list_dump);

/*
 * Event class for socket events.
 * Records @header, the socket's port id (tipc_sock_get_portid()), a socket
 * dump from tipc_sk_dump() (SK_LMAX chars when @dqueues is non-zero, else
 * SK_LMIN) and, when @skb is non-NULL, a short SKB_LMIN-sized dump of that
 * skb.  With a NULL @skb the skb buffer is a single byte holding an empty
 * string.  Printed as "<portid> header", then the skb dump, then the
 * socket dump.
 */
DECLARE_EVENT_CLASS(tipc_sk_class,

        TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues,
                 const char *header),

        TP_ARGS(sk, skb, dqueues, header),

        TP_STRUCT__entry(
                __string(header, header)
                __field(u32, portid)
                __dynamic_array(char, buf, (dqueues) ? SK_LMAX : SK_LMIN)
                __dynamic_array(char, skb_buf, (skb) ? SKB_LMIN : 1)
        ),

        TP_fast_assign(
                __assign_str(header);
                __entry->portid = tipc_sock_get_portid(sk);
                tipc_sk_dump(sk, dqueues, __get_str(buf));
                if (skb)
                        tipc_skb_dump(skb, false, __get_str(skb_buf));
                else
                        *(__get_str(skb_buf)) = '\0';
        ),

        TP_printk("<%u> %s\n%s%s", __entry->portid, __get_str(header),
                  __get_str(skb_buf), __get_str(buf))
);

/*
 * DEFINE_SK_EVENT_FILTER(name): define tracepoint @name as a conditional
 * instance of tipc_sk_class.  The event is only recorded when
 * tipc_sk_filtering(sk) returns true.
 */
#define DEFINE_SK_EVENT_FILTER(name) \
DEFINE_EVENT_CONDITION(tipc_sk_class, name, \
        TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, \
                 const char *header), \
        TP_ARGS(sk, skb, dqueues, header), \
        TP_CONDITION(tipc_sk_filtering(sk)))
DEFINE_SK_EVENT_FILTER(tipc_sk_dump);
DEFINE_SK_EVENT_FILTER(tipc_sk_create);
DEFINE_SK_EVENT_FILTER(tipc_sk_sendmcast);
DEFINE_SK_EVENT_FILTER(tipc_sk_sendmsg);
DEFINE_SK_EVENT_FILTER(tipc_sk_sendstream);
DEFINE_SK_EVENT_FILTER(tipc_sk_poll);
DEFINE_SK_EVENT_FILTER(tipc_sk_filter_rcv);
DEFINE_SK_EVENT_FILTER(tipc_sk_advance_rx);
DEFINE_SK_EVENT_FILTER(tipc_sk_rej_msg);
DEFINE_SK_EVENT_FILTER(tipc_sk_drop_msg);
DEFINE_SK_EVENT_FILTER(tipc_sk_release);
DEFINE_SK_EVENT_FILTER(tipc_sk_shutdown);

/*
 * DEFINE_SK_EVENT_FILTER_COND(name, cond): like DEFINE_SK_EVENT_FILTER(),
 * but the event is recorded only when tipc_sk_filtering(sk) AND the extra
 * expression @cond are both true.  @cond may refer to the tracepoint
 * arguments (sk, skb, ...), as the two overlimit events below do.
 */
#define DEFINE_SK_EVENT_FILTER_COND(name, cond) \
DEFINE_EVENT_CONDITION(tipc_sk_class, name, \
        TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, \
                 const char *header), \
        TP_ARGS(sk, skb, dqueues, header), \
        TP_CONDITION(tipc_sk_filtering(sk) && (cond)))
DEFINE_SK_EVENT_FILTER_COND(tipc_sk_overlimit1, tipc_sk_overlimit1(sk, skb));
DEFINE_SK_EVENT_FILTER_COND(tipc_sk_overlimit2, tipc_sk_overlimit2(sk, skb));

/*
 * Event class for link events.
 * Records @header, a fixed TIPC_MAX_LINK_NAME-byte copy of the link name
 * (memcpy from tipc_link_name()) and a dump from tipc_link_dump(); the dump
 * buffer is LINK_LMAX chars when @dqueues is non-zero and LINK_LMIN
 * otherwise.  Printed as "<name> header" followed by the dump.
 */
DECLARE_EVENT_CLASS(tipc_link_class,

        TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header),

        TP_ARGS(l, dqueues, header),

        TP_STRUCT__entry(
                __string(header, header)
                __array(char, name, TIPC_MAX_LINK_NAME)
                __dynamic_array(char, buf, (dqueues) ? LINK_LMAX : LINK_LMIN)
        ),

        TP_fast_assign(
                __assign_str(header);
                memcpy(__entry->name, tipc_link_name(l), TIPC_MAX_LINK_NAME);
                tipc_link_dump(l, dqueues, __get_str(buf));
        ),

        TP_printk("<%s> %s\n%s", __entry->name, __get_str(header),
                  __get_str(buf))
);

/*
 * DEFINE_LINK_EVENT(name): define tracepoint @name as an unconditional
 * instance of tipc_link_class with the class's (l, dqueues, header)
 * prototype.
 */
#define DEFINE_LINK_EVENT(name) \
DEFINE_EVENT(tipc_link_class, name, \
        TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), \
        TP_ARGS(l, dqueues, header))
DEFINE_LINK_EVENT(tipc_link_dump);
DEFINE_LINK_EVENT(tipc_link_conges);
DEFINE_LINK_EVENT(tipc_link_timeout);
DEFINE_LINK_EVENT(tipc_link_reset);

/*
 * DEFINE_LINK_EVENT_COND(name, cond): define tracepoint @name as an
 * instance of tipc_link_class that is recorded only when @cond is true.
 * @cond may refer to the tracepoint arguments (l, dqueues, header).
 */
#define DEFINE_LINK_EVENT_COND(name, cond) \
DEFINE_EVENT_CONDITION(tipc_link_class, name, \
        TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), \
        TP_ARGS(l, dqueues, header), \
        TP_CONDITION(cond))
DEFINE_LINK_EVENT_COND(tipc_link_too_silent, tipc_link_too_silent(l));

/*
 * Event class describing a sequence-number range together with the state
 * of a queue.
 * Records the name of link @r, the two u16 values @f and @t (stored as
 * from/to), the length of queue @tq, and the sequence numbers of the first
 * and last buffers in @tq.  Both sequence numbers are stored as 0 when the
 * queue is empty, so skb_peek()/skb_peek_tail() results are only
 * dereferenced for a non-empty queue.
 */
DECLARE_EVENT_CLASS(tipc_link_transmq_class,

        TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq),

        TP_ARGS(r, f, t, tq),

        TP_STRUCT__entry(
                __array(char, name, TIPC_MAX_LINK_NAME)
                __field(u16, from)
                __field(u16, to)
                __field(u32, len)
                __field(u16, fseqno)
                __field(u16, lseqno)
        ),

        TP_fast_assign(
                memcpy(__entry->name, tipc_link_name(r), TIPC_MAX_LINK_NAME);
                __entry->from = f;
                __entry->to = t;
                __entry->len = skb_queue_len(tq);
                __entry->fseqno = __entry->len ?
                                  msg_seqno(buf_msg(skb_peek(tq))) : 0;
                __entry->lseqno = __entry->len ?
                                  msg_seqno(buf_msg(skb_peek_tail(tq))) : 0;
        ),

        TP_printk("<%s> retrans req: [%u-%u] transmq: %u [%u-%u]\n",
                  __entry->name, __entry->from, __entry->to,
                  __entry->len, __entry->fseqno, __entry->lseqno)
);

/*
 * tipc_link_retrans: instance of tipc_link_transmq_class using the class's
 * "retrans req" print format.  Recorded only when less_eq(f, t) is true.
 */
DEFINE_EVENT_CONDITION(tipc_link_transmq_class, tipc_link_retrans,
        TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq),
        TP_ARGS(r, f, t, tq),
        TP_CONDITION(less_eq(f, t))
);

/*
 * tipc_link_bc_ack: instance of tipc_link_transmq_class with its own print
 * format.  The class's from/to fields are labelled "acked" and "gap" in
 * the output instead of being shown as a retransmit range.
 */
DEFINE_EVENT_PRINT(tipc_link_transmq_class, tipc_link_bc_ack,
        TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq),
        TP_ARGS(r, f, t, tq),
        TP_printk("<%s> acked: %u gap: %u transmq: %u [%u-%u]\n",
                  __entry->name, __entry->from, __entry->to,
                  __entry->len, __entry->fseqno, __entry->lseqno)
);

/*
 * Event class for node events.
 * Records @header, the node address from tipc_node_get_addr() and a dump
 * from tipc_node_dump(); the dump buffer is NODE_LMAX chars when @more is
 * set and NODE_LMIN otherwise.  Printed as "<addr-in-hex> header" followed
 * by the dump.
 */
DECLARE_EVENT_CLASS(tipc_node_class,

        TP_PROTO(struct tipc_node *n, bool more, const char *header),

        TP_ARGS(n, more, header),

        TP_STRUCT__entry(
                __string(header, header)
                __field(u32, addr)
                __dynamic_array(char, buf, (more) ? NODE_LMAX : NODE_LMIN)
        ),

        TP_fast_assign(
                __assign_str(header);
                __entry->addr = tipc_node_get_addr(n);
                tipc_node_dump(n, more, __get_str(buf));
        ),

        TP_printk("<%x> %s\n%s", __entry->addr, __get_str(header),
                  __get_str(buf))
);

/*
 * DEFINE_NODE_EVENT(name): define tracepoint @name as an instance of
 * tipc_node_class with the class's (n, more, header) prototype.
 */
#define DEFINE_NODE_EVENT(name) \
DEFINE_EVENT(tipc_node_class, name, \
        TP_PROTO(struct tipc_node *n, bool more, const char *header), \
        TP_ARGS(n, more, header))
DEFINE_NODE_EVENT(tipc_node_dump);
DEFINE_NODE_EVENT(tipc_node_create);
DEFINE_NODE_EVENT(tipc_node_delete);
DEFINE_NODE_EVENT(tipc_node_lost_contact);
DEFINE_NODE_EVENT(tipc_node_timeout);
DEFINE_NODE_EVENT(tipc_node_link_up);
DEFINE_NODE_EVENT(tipc_node_link_down);
DEFINE_NODE_EVENT(tipc_node_reset_links);
DEFINE_NODE_EVENT(tipc_node_check_state);

/*
 * Event class for FSM transitions.
 * Records the object @name, the old state @os, the new state @ns and the
 * triggering event @evt (stored as u32).  Printed as
 * "<name> OLD--(EVENT)->NEW", with the values translated by state_sym()
 * and evt_sym().
 */
DECLARE_EVENT_CLASS(tipc_fsm_class,

        TP_PROTO(const char *name, u32 os, u32 ns, int evt),

        TP_ARGS(name, os, ns, evt),

        TP_STRUCT__entry(
                __string(name, name)
                __field(u32, os)
                __field(u32, ns)
                __field(u32, evt)
        ),

        TP_fast_assign(
                __assign_str(name);
                __entry->os = os;
                __entry->ns = ns;
                __entry->evt = evt;
        ),

        TP_printk("<%s> %s--(%s)->%s\n", __get_str(name),
                  state_sym(__entry->os), evt_sym(__entry->evt),
                  state_sym(__entry->ns))
);

/*
 * DEFINE_FSM_EVENT(fsm_name): define tracepoint @fsm_name as an instance of
 * tipc_fsm_class with the class's (name, os, ns, evt) prototype.
 */
#define DEFINE_FSM_EVENT(fsm_name) \
DEFINE_EVENT(tipc_fsm_class, fsm_name, \
        TP_PROTO(const char *name, u32 os, u32 ns, int evt), \
        TP_ARGS(name, os, ns, evt))
DEFINE_FSM_EVENT(tipc_link_fsm);
DEFINE_FSM_EVENT(tipc_node_fsm);

/*
 * tipc_l2_device_event: a net device event as seen for a bearer.
 * Records the device name, the bearer name, the event code @evt, bit 0 of
 * b->up, and the results of netif_carrier_ok() and netif_oper_up() for
 * @dev.  The event code is printed by name via dev_evt_sym(); the three
 * flags are printed as up/down, ok/notok and up/down respectively.
 */
TRACE_EVENT(tipc_l2_device_event,

        TP_PROTO(struct net_device *dev, struct tipc_bearer *b,
                 unsigned long evt),

        TP_ARGS(dev, b, evt),

        TP_STRUCT__entry(
                __string(dev_name, dev->name)
                __string(b_name, b->name)
                __field(unsigned long, evt)
                __field(u8, b_up)
                __field(u8, carrier)
                __field(u8, oper)
        ),

        TP_fast_assign(
                __assign_str(dev_name);
                __assign_str(b_name);
                __entry->evt = evt;
                __entry->b_up = test_bit(0, &b->up);
                __entry->carrier = netif_carrier_ok(dev);
                __entry->oper = netif_oper_up(dev);
        ),

        TP_printk("%s on: <%s>/<%s> oper: %s carrier: %s bearer: %s\n",
                  dev_evt_sym(__entry->evt), __get_str(dev_name),
                  __get_str(b_name), (__entry->oper) ? "up" : "down",
                  (__entry->carrier) ? "ok" : "notok",
                  (__entry->b_up) ? "up" : "down")
);

#endif /* _TIPC_TRACE_H */

/* This part must be outside protection */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>

























































































    1 























































    1 

































    1 







    1 










    1 

    1 



    1 




    1 

    1 



















































































































































































































































































































































































































































































































































































    1 
    1 

    1 



























































































































































































































    1 















    1 





























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Generic INET transport hashtables
 *
 * Authors:        Lotsa people, from code originally in tcp
 */

#include <linux/module.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
#include <linux/memblock.h>

#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/inet6_hashtables.h>
#endif
#include <net/secure_seq.h>
#include <net/hotdata.h>
#include <net/ip.h>
#include <net/tcp.h>
#include <net/sock_reuseport.h>

/*
 * Hash an IPv4 (laddr, lport, faddr, fport) tuple for @net.
 *
 * inet_ehash_secret is filled in once, on first use, by
 * net_get_random_once(); the seed handed to __inet_ehashfn() is that
 * secret plus net_hash_mix(net).
 */
u32 inet_ehashfn(const struct net *net, const __be32 laddr,
                 const __u16 lport, const __be32 faddr,
                 const __be16 fport)
{
        u32 seed;

        net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));

        /* Must be read after the one-time initialisation above. */
        seed = inet_ehash_secret + net_hash_mix(net);

        return __inet_ehashfn(laddr, lport, faddr, fport, seed);
}
EXPORT_SYMBOL_GPL(inet_ehashfn);

/* Compute the ehash value of @sk from its own address/port fields.
 * Works for inet_sock as well as timewait and request sockets, IPv4 or
 * IPv6: an AF_INET6 socket whose destination is not a v4-mapped address is
 * hashed with inet6_ehashfn(), everything else with inet_ehashfn().
 */
static u32 sk_ehashfn(const struct sock *sk)
{
        struct net *net = sock_net(sk);

#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == AF_INET6 &&
            !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
                return inet6_ehashfn(net,
                                     &sk->sk_v6_rcv_saddr, sk->sk_num,
                                     &sk->sk_v6_daddr, sk->sk_dport);
        }
#endif
        return inet_ehashfn(net,
                            sk->sk_rcv_saddr, sk->sk_num,
                            sk->sk_daddr, sk->sk_dport);
}

/*
 * Allocate and initialize a new local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
                                                 struct net *net,
                                                 struct inet_bind_hashbucket *head,
                                                 const unsigned short snum,
                                                 int l3mdev)
{
        struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

        if (tb) {
                write_pnet(&tb->ib_net, net);
                tb->l3mdev    = l3mdev;
                tb->port      = snum;
                tb->fastreuse = 0;
                tb->fastreuseport = 0;
                INIT_HLIST_HEAD(&tb->bhash2);
                hlist_add_head(&tb->node, &head->chain);
        }
        return tb;
}

/*
 * Unlink @tb from its hash chain and return it to @cachep, but only if no
 * bhash2 bucket hangs off it any more; otherwise do nothing.
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
        /* Still referenced by at least one bind2 bucket: keep it. */
        if (!hlist_empty(&tb->bhash2))
                return;

        __hlist_del(&tb->node);
        kmem_cache_free(cachep, tb);
}

bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net,
                            unsigned short port, int l3mdev)
{
        return net_eq(ib_net(tb), net) && tb->port == port &&
                tb->l3mdev == l3mdev;
}

/*
 * Fill in a freshly allocated bind2 bucket.
 *
 * Port and l3mdev are copied from the port-only bucket @tb, the address
 * from @sk.  With IPv6 enabled, an IPv4 socket's address is stored in
 * v4-mapped form with addr_type set to IPV6_ADDR_MAPPED.  The bucket is
 * then linked into both @head's chain and @tb's bhash2 list.
 */
static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2,
                                   struct net *net,
                                   struct inet_bind_hashbucket *head,
                                   struct inet_bind_bucket *tb,
                                   const struct sock *sk)
{
        write_pnet(&tb2->ib_net, net);
        tb2->port = tb->port;
        tb2->l3mdev = tb->l3mdev;

#if IS_ENABLED(CONFIG_IPV6)
        /* addr_type must be able to hold every value stored below. */
        BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED));
        if (sk->sk_family != AF_INET6) {
                ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr);
                tb2->addr_type = IPV6_ADDR_MAPPED;
        } else {
                tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
                tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
        }
#else
        tb2->rcv_saddr = sk->sk_rcv_saddr;
#endif

        INIT_HLIST_HEAD(&tb2->owners);
        hlist_add_head(&tb2->node, &head->chain);
        hlist_add_head(&tb2->bhash_node, &tb->bhash2);
}

struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
                                                   struct net *net,
                                                   struct inet_bind_hashbucket *head,
                                                   struct inet_bind_bucket *tb,
                                                   const struct sock *sk)
{
        struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC);

        if (tb2)
                inet_bind2_bucket_init(tb2, net, head, tb, sk);

        return tb2;
}

/* Unlink @tb from its hash chain and from its port bucket's bhash2 list and
 * free it to @cachep, but only once it has no owners left.
 * Caller must hold hashbucket lock for this tb with local BH disabled
 */
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
{
        /* Sockets are still bound through this bucket: keep it. */
        if (!hlist_empty(&tb->owners))
                return;

        __hlist_del(&tb->node);
        __hlist_del(&tb->bhash_node);
        kmem_cache_free(cachep, tb);
}

/*
 * Return true when the address stored in @tb2 equals @sk's bound address.
 *
 * With IPv6 enabled, an AF_INET6 socket is compared on the full IPv6
 * address; any other socket only matches a bucket whose addr_type is
 * IPV6_ADDR_MAPPED and whose IPv4 address is equal.
 */
static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
                                         const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == AF_INET6)
                return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);

        return tb2->addr_type == IPV6_ADDR_MAPPED &&
               tb2->rcv_saddr == sk->sk_rcv_saddr;
#else
        return tb2->rcv_saddr == sk->sk_rcv_saddr;
#endif
}

/*
 * Record @port and both bind buckets on @sk, then link @sk into the owners
 * list of @tb2.
 */
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
                    struct inet_bind2_bucket *tb2, unsigned short port)
{
        inet_sk(sk)->inet_num = port;

        inet_csk(sk)->icsk_bind2_hash = tb2;
        inet_csk(sk)->icsk_bind_hash = tb;

        sk_add_bind_node(sk, &tb2->owners);
}

/*
 * Get rid of any references to a local port held by the given sock.
 *
 * Clears inet_num, icsk_bind_hash and icsk_bind2_hash on @sk, removes @sk
 * from its bind node list when it has a bhash2 bucket, and passes both
 * buckets to their destroy helpers.  The bhash bucket lock is taken first
 * and the bhash2 bucket lock nests inside it; both use plain spin_lock(),
 * and inet_put_port() disables BH around the call.
 */
static void __inet_put_port(struct sock *sk)
{
        struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
        struct inet_bind_hashbucket *head, *head2;
        struct net *net = sock_net(sk);
        struct inet_bind_bucket *tb;
        int bhash;

        /* Both bucket heads are looked up from inet_num, so this has to
         * happen before inet_num is zeroed below.
         */
        bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size);
        head = &hashinfo->bhash[bhash];
        head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num);

        spin_lock(&head->lock);
        tb = inet_csk(sk)->icsk_bind_hash;
        inet_csk(sk)->icsk_bind_hash = NULL;
        inet_sk(sk)->inet_num = 0;

        /* The bhash2 lock nests inside the bhash lock. */
        spin_lock(&head2->lock);
        if (inet_csk(sk)->icsk_bind2_hash) {
                struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash;

                __sk_del_bind_node(sk);
                inet_csk(sk)->icsk_bind2_hash = NULL;
                /* Frees tb2 only if sk was its last owner. */
                inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
        }
        spin_unlock(&head2->lock);

        inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
        spin_unlock(&head->lock);
}

/*
 * Release the local port held by @sk.  Wrapper around __inet_put_port()
 * that disables bottom halves for the duration of the call.
 */
void inet_put_port(struct sock *sk)
{
        local_bh_disable();
        __inet_put_port(sk);
        local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);

/*
 * Bind @child to the local port it already carries in inet_num, reusing the
 * bind buckets of the listener @sk when they fit and looking up or creating
 * new ones otherwise.
 *
 * Both the bhash and the bhash2 bucket locks are held (bhash first) while
 * the buckets are inspected and @child is linked in.
 *
 * Returns 0 on success, -ENOENT if @sk has no bhash or bhash2 bucket, and
 * -ENOMEM if a needed bucket could not be allocated.
 */
int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
        struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk);
        unsigned short port = inet_sk(child)->inet_num;
        struct inet_bind_hashbucket *head, *head2;
        bool created_inet_bind_bucket = false;
        struct net *net = sock_net(sk);
        bool update_fastreuse = false;
        struct inet_bind2_bucket *tb2;
        struct inet_bind_bucket *tb;
        int bhash, l3mdev;

        /* Bucket heads are selected from the child's port (and, for bhash2,
         * from the child socket itself), not from the listener.
         */
        bhash = inet_bhashfn(net, port, table->bhash_size);
        head = &table->bhash[bhash];
        head2 = inet_bhashfn_portaddr(table, child, net, port);

        spin_lock(&head->lock);
        spin_lock(&head2->lock);
        tb = inet_csk(sk)->icsk_bind_hash;
        tb2 = inet_csk(sk)->icsk_bind2_hash;
        if (unlikely(!tb || !tb2)) {
                spin_unlock(&head2->lock);
                spin_unlock(&head->lock);
                return -ENOENT;
        }
        if (tb->port != port) {
                l3mdev = inet_sk_bound_l3mdev(sk);

                /* NOTE: using tproxy and redirecting skbs to a proxy
                 * on a different listener port breaks the assumption
                 * that the listener socket's icsk_bind_hash is the same
                 * as that of the child socket. We have to look up or
                 * create a new bind bucket for the child here. */
                inet_bind_bucket_for_each(tb, &head->chain) {
                        if (inet_bind_bucket_match(tb, net, port, l3mdev))
                                break;
                }
                /* tb is NULL here when the walk found no matching bucket. */
                if (!tb) {
                        tb = inet_bind_bucket_create(table->bind_bucket_cachep,
                                                     net, head, port, l3mdev);
                        if (!tb) {
                                spin_unlock(&head2->lock);
                                spin_unlock(&head->lock);
                                return -ENOMEM;
                        }
                        created_inet_bind_bucket = true;
                }
                update_fastreuse = true;

                /* The listener's tb2 cannot be reused either; jump into the
                 * bhash2 lookup below (l3mdev is already set).
                 */
                goto bhash2_find;
        } else if (!inet_bind2_bucket_addr_match(tb2, child)) {
                /* Same port but the listener's bhash2 bucket does not carry
                 * the child's address: find or create one that does.
                 */
                l3mdev = inet_sk_bound_l3mdev(sk);

bhash2_find:
                tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child);
                if (!tb2) {
                        tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
                                                       net, head2, tb, child);
                        if (!tb2)
                                goto error;
                }
        }
        if (update_fastreuse)
                inet_csk_update_fastreuse(tb, child);
        inet_bind_hash(child, tb, tb2, port);
        spin_unlock(&head2->lock);
        spin_unlock(&head->lock);

        return 0;

error:
        /* Only a bhash bucket created by this call is handed back. */
        if (created_inet_bind_bucket)
                inet_bind_bucket_destroy(table->bind_bucket_cachep, tb);
        spin_unlock(&head2->lock);
        spin_unlock(&head->lock);
        return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);

/*
 * Return the lhash2 bucket of @h that @sk hashes to, using the socket's own
 * receive address and local port (IPv6 address for AF_INET6 sockets when
 * IPv6 is enabled, IPv4 address otherwise).
 */
static struct inet_listen_hashbucket *
inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == AF_INET6)
                return inet_lhash2_bucket(h,
                                          ipv6_portaddr_hash(sock_net(sk),
                                                             &sk->sk_v6_rcv_saddr,
                                                             inet_sk(sk)->inet_num));
#endif
        return inet_lhash2_bucket(h,
                                  ipv4_portaddr_hash(sock_net(sk),
                                                     inet_sk(sk)->inet_rcv_saddr,
                                                     inet_sk(sk)->inet_num));
}

/*
 * Rate how well listener @sk fits a lookup for (@net, @daddr, @hnum) arriving
 * on @dif/@sdif.
 *
 * Returns -1 when @sk cannot serve the lookup at all.  Otherwise the score
 * starts at 1 (2 when the socket is bound to a device) and gains one point
 * each for being a PF_INET socket and for sk_incoming_cpu matching the
 * current CPU.
 */
static inline int compute_score(struct sock *sk, struct net *net,
                                const unsigned short hnum, const __be32 daddr,
                                const int dif, const int sdif)
{
        int score;

        if (!net_eq(sock_net(sk), net) || sk->sk_num != hnum ||
            ipv6_only_sock(sk))
                return -1;

        if (sk->sk_rcv_saddr != daddr)
                return -1;

        if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
                return -1;

        score = sk->sk_bound_dev_if ? 2 : 1;
        if (sk->sk_family == PF_INET)
                score++;
        if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
                score++;

        return score;
}

/**
 * inet_lookup_reuseport() - run reuseport selection for an AF_INET socket.
 * @net: network namespace.
 * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP.
 * @skb: context for a potential SK_REUSEPORT program.
 * @doff: header offset.
 * @saddr: source address.
 * @sport: source port.
 * @daddr: destination address.
 * @hnum: destination port in host byte order.
 * @ehashfn: hash function used to generate the fallback hash.
 *
 * The hash is computed only when @sk has SO_REUSEPORT set.
 *
 * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise the result of
 *         reuseport_select_sock(): a pointer to the selected sock or an error.
 */
struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk,
                                   struct sk_buff *skb, int doff,
                                   __be32 saddr, __be16 sport,
                                   __be32 daddr, unsigned short hnum,
                                   inet_ehashfn_t *ehashfn)
{
        u32 phash;

        if (!sk->sk_reuseport)
                return NULL;

        phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn,
                                net, daddr, hnum, saddr, sport);

        return reuseport_select_sock(sk, phash, skb, doff);
}
EXPORT_SYMBOL_GPL(inet_lookup_reuseport);

/*
 * There are some nice properties to exploit here. The BSD API does not
 * allow a listening sock to specify the remote port nor the remote address
 * for the connection, so both are always treated as wildcards during the
 * search since they can never be otherwise.
 */

/*
 * Walk one lhash2 bucket and return the best-scoring listener for the given
 * destination.  As soon as a new best candidate yields a socket through
 * inet_lookup_reuseport(), that socket is returned immediately.
 *
 * called with rcu_read_lock() : No refcount taken on the socket
 */
static struct sock *inet_lhash2_lookup(struct net *net,
                                       struct inet_listen_hashbucket *ilb2,
                                       struct sk_buff *skb, int doff,
                                       const __be32 saddr, __be16 sport,
                                       const __be32 daddr, const unsigned short hnum,
                                       const int dif, const int sdif)
{
        struct hlist_nulls_node *node;
        struct sock *best = NULL;
        struct sock *cand;
        int best_score = 0;
        int score;

        sk_nulls_for_each_rcu(cand, node, &ilb2->nulls_head) {
                score = compute_score(cand, net, hnum, daddr, dif, sdif);
                if (score <= best_score)
                        continue;

                best = inet_lookup_reuseport(net, cand, skb, doff,
                                             saddr, sport, daddr, hnum,
                                             inet_ehashfn);
                if (best)
                        return best;

                best = cand;
                best_score = score;
        }

        return best;
}

/*
 * Run the BPF sk_lookup hook for an IPv4 lookup.
 *
 * The socket chosen by bpf_sk_lookup_run_v4() is returned as is when the
 * hook asks for no reuseport handling or when it is NULL or an error
 * pointer.  Otherwise inet_lookup_reuseport() gets a chance to replace it
 * with a member of its reuseport group.
 */
struct sock *inet_lookup_run_sk_lookup(struct net *net,
                                       int protocol,
                                       struct sk_buff *skb, int doff,
                                       __be32 saddr, __be16 sport,
                                       __be32 daddr, u16 hnum, const int dif,
                                       inet_ehashfn_t *ehashfn)
{
        struct sock *sk, *reuse_sk;

        if (bpf_sk_lookup_run_v4(net, protocol, saddr, sport,
                                 daddr, hnum, dif, &sk))
                return sk;

        if (IS_ERR_OR_NULL(sk))
                return sk;

        reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport,
                                         daddr, hnum, ehashfn);

        return reuse_sk ? reuse_sk : sk;
}

/*
 * Find a listening socket for the given destination.
 *
 * Three sources are tried in order, stopping at the first that yields a
 * result: the BPF sk_lookup redirect (only when enabled and @hashinfo is the
 * namespace's TCP hashinfo), the lhash2 bucket for (@daddr, @hnum), and the
 * lhash2 bucket for (INADDR_ANY, @hnum).  An error-pointer result is
 * reported to the caller as NULL.
 */
struct sock *__inet_lookup_listener(struct net *net,
                                    struct inet_hashinfo *hashinfo,
                                    struct sk_buff *skb, int doff,
                                    const __be32 saddr, __be16 sport,
                                    const __be32 daddr, const unsigned short hnum,
                                    const int dif, const int sdif)
{
        struct inet_listen_hashbucket *ilb2;
        struct sock *found = NULL;
        unsigned int hash2;

        /* Lookup redirect from BPF */
        if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
            hashinfo == net->ipv4.tcp_death_row.hashinfo)
                found = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
                                                  saddr, sport, daddr, hnum, dif,
                                                  inet_ehashfn);

        /* Lookup lhash2 with the exact destination address */
        if (!found) {
                hash2 = ipv4_portaddr_hash(net, daddr, hnum);
                ilb2 = inet_lhash2_bucket(hashinfo, hash2);

                found = inet_lhash2_lookup(net, ilb2, skb, doff,
                                           saddr, sport, daddr, hnum,
                                           dif, sdif);
        }

        /* Lookup lhash2 with INADDR_ANY */
        if (!found) {
                hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
                ilb2 = inet_lhash2_bucket(hashinfo, hash2);

                found = inet_lhash2_lookup(net, ilb2, skb, doff,
                                           saddr, sport, htonl(INADDR_ANY), hnum,
                                           dif, sdif);
        }

        return IS_ERR(found) ? NULL : found;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);

/*
 * Drop one reference on @sk and, when it was the last one, free the object
 * with the destructor that matches its state.
 *
 * All sockets share common refcount, but have different destructors.
 */
void sock_gen_put(struct sock *sk)
{
        if (!refcount_dec_and_test(&sk->sk_refcnt))
                return;

        switch (sk->sk_state) {
        case TCP_TIME_WAIT:
                inet_twsk_free(inet_twsk(sk));
                break;
        case TCP_NEW_SYN_RECV:
                reqsk_free(inet_reqsk(sk));
                break;
        default:
                sk_free(sk);
                break;
        }
}
EXPORT_SYMBOL_GPL(sock_gen_put);

/* Drop the reference on the socket attached to @skb via sock_gen_put(). */
void sock_edemux(struct sk_buff *skb)
{
        sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_edemux);

/*
 * Look up a socket in the ehash table by full 4-tuple and device.
 *
 * The chain is walked with the _rcu iterator and no bucket lock is taken.
 * On a match a reference is taken with refcount_inc_not_zero() and the match
 * is verified again before the socket is returned.
 *
 * Returns the socket with a reference held, or NULL when nothing matches or
 * the matching socket's refcount had already dropped to zero.
 */
struct sock *__inet_lookup_established(struct net *net,
                                  struct inet_hashinfo *hashinfo,
                                  const __be32 saddr, const __be16 sport,
                                  const __be32 daddr, const u16 hnum,
                                  const int dif, const int sdif)
{
        INET_ADDR_COOKIE(acookie, saddr, daddr);
        const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
        struct sock *sk;
        const struct hlist_nulls_node *node;
        /* Optimize here for direct hit, only listening connections can
         * have wildcards anyways.
         */
        unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
        unsigned int slot = hash & hashinfo->ehash_mask;
        struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

begin:
        sk_nulls_for_each_rcu(sk, node, &head->chain) {
                /* Cheap filter before the full tuple comparison. */
                if (sk->sk_hash != hash)
                        continue;
                if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) {
                        /* Refcount already zero: give up and return NULL. */
                        if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
                                goto out;
                        /* Re-validate now that a reference is held; on a
                         * mismatch drop the reference and restart the walk.
                         */
                        if (unlikely(!inet_match(net, sk, acookie,
                                                 ports, dif, sdif))) {
                                sock_gen_put(sk);
                                goto begin;
                        }
                        goto found;
                }
        }
        /*
         * if the nulls value we got at the end of this lookup is
         * not the expected one, we must restart lookup.
         * We probably met an item that was moved to another chain.
         */
        if (get_nulls_value(node) != slot)
                goto begin;
out:
        sk = NULL;
found:
        return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);

/*
 * Check that the connection formed by @sk's addresses, its destination port
 * and local port @lport does not already exist in ehash and, if it is free,
 * insert @sk under that identity.
 *
 * A matching TIME_WAIT socket does not block the insert when @sk is a TCP
 * socket and tcp_twsk_unique() accepts it: it is then removed from the chain
 * and either handed back through @twp or, when @twp is NULL, descheduled
 * here.
 *
 * Returns 0 on success, -EADDRNOTAVAIL if the tuple is already in use.
 *
 * called with local bh disabled
 */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
                                    struct sock *sk, __u16 lport,
                                    struct inet_timewait_sock **twp)
{
        struct inet_hashinfo *hinfo = death_row->hashinfo;
        struct inet_sock *inet = inet_sk(sk);
        /* Note the orientation: daddr is the socket's own receive address and
         * saddr is the peer address, the same argument order the lookup
         * paths in this file pass to inet_ehashfn().
         */
        __be32 daddr = inet->inet_rcv_saddr;
        __be32 saddr = inet->inet_daddr;
        int dif = sk->sk_bound_dev_if;
        struct net *net = sock_net(sk);
        int sdif = l3mdev_master_ifindex_by_index(net, dif);
        INET_ADDR_COOKIE(acookie, saddr, daddr);
        const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
        unsigned int hash = inet_ehashfn(net, daddr, lport,
                                         saddr, inet->inet_dport);
        struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
        spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
        struct sock *sk2;
        const struct hlist_nulls_node *node;
        struct inet_timewait_sock *tw = NULL;

        spin_lock(lock);

        sk_nulls_for_each(sk2, node, &head->chain) {
                if (sk2->sk_hash != hash)
                        continue;

                if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
                        if (sk2->sk_state == TCP_TIME_WAIT) {
                                tw = inet_twsk(sk2);
                                /* Recyclable TIME_WAIT: leave the loop with
                                 * tw set and fall through to the insert.
                                 */
                                if (sk->sk_protocol == IPPROTO_TCP &&
                                    tcp_twsk_unique(sk, sk2, twp))
                                        break;
                        }
                        goto not_unique;
                }
        }

        /* Must record num and sport now. Otherwise we will see
         * in hash table socket with a funny identity.
         */
        inet->inet_num = lport;
        inet->inet_sport = htons(lport);
        sk->sk_hash = hash;
        WARN_ON(!sk_unhashed(sk));
        __sk_nulls_add_node_rcu(sk, &head->chain);
        if (tw) {
                sk_nulls_del_node_init_rcu((struct sock *)tw);
                __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
        }
        spin_unlock(lock);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

        /* Either pass the (possibly NULL) recycled tw to the caller or, when
         * the caller did not ask for it, dispose of it here.
         */
        if (twp) {
                *twp = tw;
        } else if (tw) {
                /* Silly. Should hash-dance instead... */
                inet_twsk_deschedule_put(tw);
        }
        return 0;

not_unique:
        spin_unlock(lock);
        return -EADDRNOTAVAIL;
}

static u64 inet_sk_port_offset(const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);

        return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
                                          inet->inet_daddr,
                                          inet->inet_dport);
}

/* Searches for an existing socket in the ehash bucket list.
 * Returns true if found, false otherwise.
 *
 * The match is made against @sk's own identity: its addresses, ports and
 * bound device, using inet_match() for AF_INET sockets and inet6_match()
 * for AF_INET6 sockets (when IPv6 is enabled).  Only entries whose sk_hash
 * equals @sk's are compared.  The caller in this file, inet_ehash_insert(),
 * holds the ehash bucket lock around the call.
 */
static bool inet_ehash_lookup_by_sk(struct sock *sk,
                                    struct hlist_nulls_head *list)
{
        const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
        /* Both dif and sdif are taken from the socket's bound device. */
        const int sdif = sk->sk_bound_dev_if;
        const int dif = sk->sk_bound_dev_if;
        const struct hlist_nulls_node *node;
        struct net *net = sock_net(sk);
        struct sock *esk;

        INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);

        sk_nulls_for_each_rcu(esk, node, list) {
                if (esk->sk_hash != sk->sk_hash)
                        continue;
                if (sk->sk_family == AF_INET) {
                        if (unlikely(inet_match(net, esk, acookie,
                                                ports, dif, sdif))) {
                                return true;
                        }
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (sk->sk_family == AF_INET6) {
                        if (unlikely(inet6_match(net, esk,
                                                 &sk->sk_v6_daddr,
                                                 &sk->sk_v6_rcv_saddr,
                                                 ports, dif, sdif))) {
                                return true;
                        }
                }
#endif
        }
        return false;
}

/* Insert a socket into ehash, and eventually remove another one
 * (The another one can be a SYN_RECV or TIMEWAIT)
 * If an existing socket already exists, socket sk is not inserted,
 * and sets found_dup_sk parameter to true.
 *
 * @sk:           socket to insert; its sk_hash is (re)computed here.
 * @osk:          optional socket to remove from the same chain first.  When
 *                given, @sk is inserted only if that removal succeeded.
 * @found_dup_sk: optional out parameter, consulted only when @osk is NULL;
 *                receives the result of the duplicate search.
 *
 * Returns true if @sk was added to the chain, false otherwise.
 */
bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
        struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
        struct inet_ehash_bucket *head;
        struct hlist_nulls_head *list;
        spinlock_t *lock;
        bool ret = true;

        WARN_ON_ONCE(!sk_unhashed(sk));

        sk->sk_hash = sk_ehashfn(sk);
        head = inet_ehash_bucket(hashinfo, sk->sk_hash);
        list = &head->chain;
        lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

        spin_lock(lock);
        if (osk) {
                /* The two sockets are expected to hash to the same value. */
                WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
                ret = sk_nulls_del_node_init_rcu(osk);
        } else if (found_dup_sk) {
                *found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
                if (*found_dup_sk)
                        ret = false;
        }

        if (ret)
                __sk_nulls_add_node_rcu(sk, list);

        spin_unlock(lock);

        return ret;
}

/*
 * Insert @sk into ehash through inet_ehash_insert() and account for the
 * outcome.
 *
 * On success the protocol's in-use counter is incremented.  On failure the
 * socket is counted as an orphan, moved to TCP_CLOSE, flagged SOCK_DEAD and
 * destroyed with inet_csk_destroy_sock().
 *
 * Returns the result of inet_ehash_insert().
 */
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
        if (inet_ehash_insert(sk, osk, found_dup_sk)) {
                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
                return true;
        }

        this_cpu_inc(*sk->sk_prot->orphan_count);
        inet_sk_set_state(sk, TCP_CLOSE);
        sock_set_flag(sk, SOCK_DEAD);
        inet_csk_destroy_sock(sk);

        return false;
}
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);

/*
 * Attach @sk to a reuseport group.
 *
 * The listener bucket @ilb is searched for another socket that has the same
 * family, ipv6-only setting, bound device, bind bucket and owner uid, has
 * SO_REUSEPORT set and an equal receive address.  If one is found @sk joins
 * its group via reuseport_add_sock(); otherwise a new group is allocated
 * with reuseport_alloc().
 *
 * Returns the result of whichever of those two helpers was called.
 */
static int inet_reuseport_add_sock(struct sock *sk,
                                   struct inet_listen_hashbucket *ilb)
{
        struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
        const struct hlist_nulls_node *node;
        struct sock *sk2;
        kuid_t uid = sock_i_uid(sk);

        sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
                if (sk2 == sk ||
                    sk2->sk_family != sk->sk_family ||
                    ipv6_only_sock(sk2) != ipv6_only_sock(sk) ||
                    sk2->sk_bound_dev_if != sk->sk_bound_dev_if ||
                    inet_csk(sk2)->icsk_bind_hash != tb ||
                    !sk2->sk_reuseport ||
                    !uid_eq(uid, sock_i_uid(sk2)) ||
                    !inet_rcv_saddr_equal(sk, sk2, false))
                        continue;

                return reuseport_add_sock(sk, sk2, inet_rcv_saddr_any(sk));
        }

        return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}

/*
 * Put @sk into the hash table that matches its state.
 *
 * A socket that is not in TCP_LISTEN goes into ehash through
 * inet_ehash_nolisten() with BH disabled; the result of that insert is not
 * propagated and 0 is returned.  A listening socket is added to its lhash2
 * bucket under the bucket lock, after joining or creating a reuseport group
 * when SO_REUSEPORT is set.
 *
 * Returns 0 on success or the error from inet_reuseport_add_sock().
 */
int __inet_hash(struct sock *sk, struct sock *osk)
{
        struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
        struct inet_listen_hashbucket *ilb2;
        int err = 0;

        if (sk->sk_state != TCP_LISTEN) {
                local_bh_disable();
                inet_ehash_nolisten(sk, osk, NULL);
                local_bh_enable();
                return 0;
        }
        WARN_ON(!sk_unhashed(sk));
        ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);

        spin_lock(&ilb2->lock);
        if (sk->sk_reuseport) {
                err = inet_reuseport_add_sock(sk, ilb2);
                if (err)
                        goto unlock;
        }
        sock_set_flag(sk, SOCK_RCU_FREE);
        /* NOTE(review): IPv6 reuseport listeners are appended at the tail of
         * the bucket, everything else is added at the head; the rationale is
         * not visible in this file section.
         */
        if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
                sk->sk_family == AF_INET6)
                __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
        else
                __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
        spin_unlock(&ilb2->lock);

        return err;
}
EXPORT_SYMBOL(__inet_hash);

/*
 * Hash @sk unless it is in TCP_CLOSE, in which case nothing is done.
 *
 * Returns 0 for a closed socket, otherwise the result of __inet_hash().
 */
int inet_hash(struct sock *sk)
{
        if (sk->sk_state == TCP_CLOSE)
                return 0;

        return __inet_hash(sk, NULL);
}
EXPORT_SYMBOL_GPL(inet_hash);

/*
 * Remove @sk from the hash table it is in, if any.
 *
 * Listening sockets are removed from their lhash2 bucket (stopping reuseport
 * listening first when a reuseport control block is attached); all other
 * sockets are removed from ehash.  In both cases the protocol's in-use
 * counter is decremented.  The unhashed check is repeated under the lock
 * because the first check is done without it.
 */
void inet_unhash(struct sock *sk)
{
        struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);

        if (sk_unhashed(sk))
                return;

        if (sk->sk_state == TCP_LISTEN) {
                struct inet_listen_hashbucket *ilb2;

                ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
                /* Don't disable bottom halves while acquiring the lock to
                 * avoid circular locking dependency on PREEMPT_RT.
                 */
                spin_lock(&ilb2->lock);
                if (sk_unhashed(sk)) {
                        spin_unlock(&ilb2->lock);
                        return;
                }

                if (rcu_access_pointer(sk->sk_reuseport_cb))
                        reuseport_stop_listen_sock(sk);

                __sk_nulls_del_node_init_rcu(sk);
                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
                spin_unlock(&ilb2->lock);
        } else {
                spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

                /* Unlike the listener path, the ehash lock is taken with BH
                 * disabled.
                 */
                spin_lock_bh(lock);
                if (sk_unhashed(sk)) {
                        spin_unlock_bh(lock);
                        return;
                }
                __sk_nulls_del_node_init_rcu(sk);
                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
                spin_unlock_bh(lock);
        }
}
EXPORT_SYMBOL_GPL(inet_unhash);

/*
 * Report whether bhash2 bucket @tb belongs to (@net, @port, @l3mdev) and
 * carries the same receive address as @sk.
 */
static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb,
                                    const struct net *net, unsigned short port,
                                    int l3mdev, const struct sock *sk)
{
        if (!net_eq(ib2_net(tb), net))
                return false;

        if (tb->port != port || tb->l3mdev != l3mdev)
                return false;

        return inet_bind2_bucket_addr_match(tb, sk);
}

/*
 * Report whether bhash2 bucket @tb belongs to (@net, @port, @l3mdev) and
 * holds a wildcard address relevant to @sk.
 *
 * With IPv6 enabled, a bucket of type IPV6_ADDR_ANY always qualifies.  A
 * bucket of type IPV6_ADDR_MAPPED qualifies when its IPv4 address is zero,
 * except for an AF_INET6 socket whose receive address is not v4-mapped.
 * Any other bucket type does not qualify.  Without IPv6 only the zero IPv4
 * address test applies.
 */
bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net,
                                      unsigned short port, int l3mdev, const struct sock *sk)
{
        if (!net_eq(ib2_net(tb), net))
                return false;

        if (tb->port != port || tb->l3mdev != l3mdev)
                return false;

#if IS_ENABLED(CONFIG_IPV6)
        if (tb->addr_type == IPV6_ADDR_ANY)
                return true;

        if (tb->addr_type != IPV6_ADDR_MAPPED ||
            (sk->sk_family == AF_INET6 &&
             !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)))
                return false;
#endif
        return !tb->rcv_saddr;
}

/* The socket's bhash2 hashbucket spinlock must be held when this is called */
struct inet_bind2_bucket *
inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net,
                       unsigned short port, int l3mdev, const struct sock *sk)
{
        struct inet_bind2_bucket *bhash2 = NULL;

        inet_bind_bucket_for_each(bhash2, &head->chain)
                if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk))
                        break;

        return bhash2;
}

/*
 * Return the bhash2 bucket that the wildcard address and @port hash to for
 * @sk's address family (in6addr_any for AF_INET6 sockets when IPv6 is
 * enabled, the zero IPv4 address otherwise).
 */
struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port)
{
        struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
        u32 slot;

#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == AF_INET6)
                slot = ipv6_portaddr_hash(net, &in6addr_any, port);
        else
#endif
                slot = ipv4_portaddr_hash(net, 0, port);

        /* Reduce the hash to a table index. */
        slot &= hinfo->bhash_size - 1;

        return &hinfo->bhash2[slot];
}

static void inet_update_saddr(struct sock *sk, void *saddr, int family)
{
        if (family == AF_INET) {
                inet_sk(sk)->inet_saddr = *(__be32 *)saddr;
                sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr);
        }
#if IS_ENABLED(CONFIG_IPV6)
        else {
                sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr;
        }
#endif
}

/*
 * Change (or, with @reset, clear) the source address of @sk and move the
 * socket to the bhash2 bucket that matches the new address.
 *
 * @sk:     socket to update.
 * @saddr:  new address, interpreted according to @family; unused on reset.
 * @family: address family of @saddr.
 * @reset:  when true, call inet_reset_saddr() instead of storing @saddr.
 *
 * A socket without a bhash2 bucket only has its address changed.  Otherwise
 * the socket is unlinked from its current bhash2 bucket, the address is
 * changed, and the socket is linked to the bucket found (or newly
 * initialised) for the new address, all under the bhash bucket lock.
 *
 * Returns 0 on success, -ENOMEM if the spare bucket cannot be allocated.
 */
static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
{
        struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
        struct inet_bind_hashbucket *head, *head2;
        struct inet_bind2_bucket *tb2, *new_tb2;
        int l3mdev = inet_sk_bound_l3mdev(sk);
        int port = inet_sk(sk)->inet_num;
        struct net *net = sock_net(sk);
        int bhash;

        if (!inet_csk(sk)->icsk_bind2_hash) {
                /* Not bind()ed before. */
                if (reset)
                        inet_reset_saddr(sk);
                else
                        inet_update_saddr(sk, saddr, family);

                return 0;
        }

        /* Allocate a bind2 bucket ahead of time to avoid permanently putting
         * the bhash2 table in an inconsistent state if a new tb2 bucket
         * allocation fails.
         */
        new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC);
        if (!new_tb2) {
                if (reset) {
                        /* The (INADDR_ANY, port) bucket might have already
                         * been freed, then we cannot fixup icsk_bind2_hash,
                         * so we give up and unlink sk from bhash/bhash2 not
                         * to leave inconsistency in bhash2.
                         */
                        inet_put_port(sk);
                        inet_reset_saddr(sk);
                }

                return -ENOMEM;
        }

        bhash = inet_bhashfn(net, port, hinfo->bhash_size);
        head = &hinfo->bhash[bhash];
        head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

        /* If we change saddr locklessly, another thread
         * iterating over bhash might see corrupted address.
         */
        spin_lock_bh(&head->lock);

        /* Step 1: leave the old bhash2 bucket; it is freed if sk was its
         * last owner.
         */
        spin_lock(&head2->lock);
        __sk_del_bind_node(sk);
        inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash);
        spin_unlock(&head2->lock);

        /* Step 2: change the address while still holding the bhash lock. */
        if (reset)
                inet_reset_saddr(sk);
        else
                inet_update_saddr(sk, saddr, family);

        /* Step 3: look the bhash2 head up again now that the address has
         * changed (inet_bhashfn_portaddr() takes sk, presumably hashing its
         * receive address) and join the bucket there.
         */
        head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);

        spin_lock(&head2->lock);
        tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
        if (!tb2) {
                tb2 = new_tb2;
                inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk);
        }
        inet_csk(sk)->icsk_bind2_hash = tb2;
        sk_add_bind_node(sk, &tb2->owners);
        spin_unlock(&head2->lock);

        spin_unlock_bh(&head->lock);

        /* The spare bucket was not needed: an existing one matched. */
        if (tb2 != new_tb2)
                kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2);

        return 0;
}

/*
 * Set the source address of @sk to @saddr (of the given @family) and rehash
 * it in bhash2.  Returns the result of __inet_bhash2_update_saddr() called
 * with reset == false.
 */
int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
{
        return __inet_bhash2_update_saddr(sk, saddr, family, false);
}
EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr);

/*
 * Reset the source address of @sk and rehash it in bhash2, unless the
 * socket has SOCK_BINDADDR_LOCK set in sk_userlocks.  The result of the
 * underlying update is not reported to the caller.
 */
void inet_bhash2_reset_saddr(struct sock *sk)
{
        if (sk->sk_userlocks & SOCK_BINDADDR_LOCK)
                return;

        __inet_bhash2_update_saddr(sk, NULL, 0, true);
}
EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr);

/* RFC 6056 3.3.4.  Algorithm 4: Double-Hash Port Selection Algorithm
 * Note that we use 32bit integers (vs RFC 'short integers')
 * because 2^16 is not a multiple of num_ephemeral and this
 * property might be used by clever attacker.
 *
 * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though
 * attacks were since demonstrated, thus we use 65536 by default instead
 * to really give more isolation and privacy, at the expense of 256kB
 * of kernel memory.
 */
/* Number of entries in table_perturb; a power of two so that an index can be
 * obtained by masking with (INET_TABLE_PERTURB_SIZE - 1).
 */
#define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER)
/* Perturbation table of u32 entries used by __inet_hash_connect(), which
 * fills it through get_random_sleepable_once() and reads one entry per call.
 * The allocation of the table is not visible in this part of the file.
 */
static u32 *table_perturb;

/*
 * Select a local port for an outgoing connection and hash the socket.
 *
 * @death_row:         provides the hash tables (death_row->hashinfo) and is
 *                     handed unchanged to @check_established.
 * @sk:                socket being connected.
 * @port_offset:       64-bit seed: its low bits index table_perturb[] and its
 *                     upper 32 bits are added to the starting offset.
 * @check_established: callback run as check_established(death_row, sk, port,
 *                     &tw); a return of 0 is treated as "this port is usable".
 *
 * If the socket already has a local port (inet_num != 0), only
 * @check_established is run for that port, with BHs disabled, and its result
 * is returned as-is.
 *
 * Otherwise the range reported by inet_sk_get_local_port_range() is walked
 * from a perturbed starting offset.  Reserved ports are skipped.  For every
 * other candidate the bhash chain is searched under head->lock: a matching
 * bucket with fastreuse >= 0 or fastreuseport >= 0 is skipped, any other
 * matching bucket is accepted only if @check_established returns 0, and if
 * no bucket matches a new one is created.  Once a port is accepted the
 * socket is also linked into the bhash2 table.
 *
 * Returns 0 on success, -EADDRNOTAVAIL when the scan finds no usable port,
 * -ENOMEM when a bind bucket or bind2 bucket cannot be allocated, or the
 * value returned by @check_established in the "port already set" case.
 */
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
                struct sock *sk, u64 port_offset,
                int (*check_established)(struct inet_timewait_death_row *,
                        struct sock *, __u16, struct inet_timewait_sock **))
{
        struct inet_hashinfo *hinfo = death_row->hashinfo;
        struct inet_bind_hashbucket *head, *head2;
        struct inet_timewait_sock *tw = NULL;
        int port = inet_sk(sk)->inet_num;
        struct net *net = sock_net(sk);
        struct inet_bind2_bucket *tb2;
        struct inet_bind_bucket *tb;
        bool tb_created = false;
        u32 remaining, offset;
        int ret, i, low, high;
        bool local_ports;
        int step, l3mdev;
        u32 index;

        /* Local port already set: only run the established check on it. */
        if (port) {
                local_bh_disable();
                ret = check_established(death_row, sk, port, NULL);
                local_bh_enable();
                return ret;
        }

        l3mdev = inet_sk_bound_l3mdev(sk);

        /* step 1 visits every port in a single pass; step 2 visits one
         * parity per pass (see other_parity_scan below).
         */
        local_ports = inet_sk_get_local_port_range(sk, &low, &high);
        step = local_ports ? 1 : 2;

        high++; /* [32768, 60999] -> [32768, 61000[ */
        remaining = high - low;
        /* Keep the span even so that wrapping by 'remaining' during a
         * step-2 walk does not change the parity of 'port'.
         */
        if (!local_ports && remaining > 1)
                remaining &= ~1U;

        /* NOTE(review): presumably fills table_perturb[] with random data
         * the first time through only - confirm against the helper.
         */
        get_random_sleepable_once(table_perturb,
                                  INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
        /* Table slot is chosen by the low bits of port_offset. */
        index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);

        /* Starting offset inside the range: table entry + upper 32 bits. */
        offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
        offset %= remaining;

        /* In first pass we try ports of @low parity.
         * inet_csk_get_port() does the opposite choice.
         */
        if (!local_ports)
                offset &= ~1U;
other_parity_scan:
        port = low + offset;
        for (i = 0; i < remaining; i += step, port += step) {
                /* Wrap around to the start of the range. */
                if (unlikely(port >= high))
                        port -= remaining;
                /* No lock is held yet here, so a plain continue is fine. */
                if (inet_is_local_reserved_port(net, port))
                        continue;
                head = &hinfo->bhash[inet_bhashfn(net, port,
                                                  hinfo->bhash_size)];
                spin_lock_bh(&head->lock);

                /* Does not bother with rcv_saddr checks, because
                 * the established check is already unique enough.
                 */
                inet_bind_bucket_for_each(tb, &head->chain) {
                        if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
                                /* Buckets with a non-negative fastreuse or
                                 * fastreuseport are never used here.
                                 */
                                if (tb->fastreuse >= 0 ||
                                    tb->fastreuseport >= 0)
                                        goto next_port;
                                WARN_ON(hlist_empty(&tb->bhash2));
                                /* 0 from the callback means the port can be
                                 * used; it may hand back a timewait socket
                                 * in 'tw'.
                                 */
                                if (!check_established(death_row, sk,
                                                       port, &tw))
                                        goto ok;
                                goto next_port;
                        }
                }

                /* No bucket for this port yet: create one.  tb_created lets
                 * the error path destroy it again.
                 */
                tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
                                             net, head, port, l3mdev);
                if (!tb) {
                        spin_unlock_bh(&head->lock);
                        return -ENOMEM;
                }
                tb_created = true;
                /* Negative values keep this bucket acceptable to the
                 * fastreuse/fastreuseport test in the loop above.
                 */
                tb->fastreuse = -1;
                tb->fastreuseport = -1;
                goto ok;
next_port:
                spin_unlock_bh(&head->lock);
                cond_resched();
        }

        /* After the first step-2 pass 'offset' is even, so the increment
         * makes it odd and the other parity is scanned.  After the second
         * pass the increment makes it even again and the scan gives up.
         */
        if (!local_ports) {
                offset++;
                if ((offset & 1) && remaining > 1)
                        goto other_parity_scan;
        }
        return -EADDRNOTAVAIL;

ok:
        /* Reached with head->lock held and BHs disabled (spin_lock_bh in
         * the loop), hence the plain spin_lock() on head2 below.
         */
        /* Find the corresponding tb2 bucket since we need to
         * add the socket to the bhash2 table as well
         */
        head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
        spin_lock(&head2->lock);

        tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
        if (!tb2) {
                tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net,
                                               head2, tb, sk);
                if (!tb2)
                        goto error;
        }

        /* Here we want to add a little bit of randomness to the next source
         * port that will be chosen. We use a max() with a random here so that
         * on low contention the randomness is maximal and on high contention
         * it may be inexistent.
         */
        i = max_t(int, i, get_random_u32_below(8) * step);
        WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step);

        /* Head lock still held and bh's disabled */
        inet_bind_hash(sk, tb, tb2, port);

        /* Socket not hashed yet: record the source port and insert it. */
        if (sk_unhashed(sk)) {
                inet_sk(sk)->inet_sport = htons(port);
                inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
        }
        if (tw)
                inet_twsk_bind_unhash(tw, hinfo);

        spin_unlock(&head2->lock);
        spin_unlock(&head->lock);

        if (tw)
                inet_twsk_deschedule_put(tw);
        /* Pairs with the spin_lock_bh(&head->lock) taken in the loop. */
        local_bh_enable();
        return 0;

error:
        /* bind2 bucket allocation failed.  If the socket is already hashed
         * (NOTE(review): presumably done by check_established() - confirm),
         * unlink it from its ehash chain and clear the port fields.
         */
        if (sk_hashed(sk)) {
                spinlock_t *lock = inet_ehash_lockp(hinfo, sk->sk_hash);

                sock_prot_inuse_add(net, sk->sk_prot, -1);

                spin_lock(lock);
                __sk_nulls_del_node_init_rcu(sk);
                spin_unlock(lock);

                sk->sk_hash = 0;
                inet_sk(sk)->inet_sport = 0;
                inet_sk(sk)->inet_num = 0;

                if (tw)
                        inet_twsk_bind_unhash(tw, hinfo);
        }

        spin_unlock(&head2->lock);
        /* Only destroy the bind bucket if this call created it. */
        if (tb_created)
                inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
        spin_unlock(&head->lock);

        if (tw)
                inet_twsk_deschedule_put(tw);

        /* Pairs with the spin_lock_bh(&head->lock) taken in the loop. */
        local_bh_enable();

        return -ENOMEM;
}

/*
 * Bind a port for a connect operation and hash it.
 *
 * Wrapper around __inet_hash_connect() that uses
 * __inet_check_established() as the uniqueness check.  A port offset is
 * computed with inet_sk_port_offset() only when the socket has no local
 * port yet (inet_num == 0); otherwise 0 is passed.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
                      struct sock *sk)
{
        u64 port_offset;

        if (inet_sk(sk)->inet_num)
                port_offset = 0;
        else
                port_offset = inet_sk_port_offset(sk);

        return __inet_hash_connect(death_row, sk, port_offset,
                                   __inet_check_established);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);

/*
 * Initialise every lhash2 slot of @h, indices 0..h->lhash2_mask inclusive:
 * set up the slot's spinlock and make its nulls list empty, using
 * index + LISTENING_NULLS_BASE as the nulls value.
 */
static void init_hashinfo_lhash2(struct inet_hashinfo *h)
{
        int slot = 0;

        while (slot <= h->lhash2_mask) {
                spin_lock_init(&h->lhash2[slot].lock);
                INIT_HLIST_NULLS_HEAD(&h->lhash2[slot].nulls_head,
                                      slot + LISTENING_NULLS_BASE);
                slot++;
        }
}

/*
 * Boot-time setup of the lhash2 table of @h and of the global
 * table_perturb[] array.
 *
 * @name, @numentries, @scale, @low_limit and @high_limit are forwarded to
 * alloc_large_system_hash() for the lhash2 allocation; the resulting mask
 * is stored in h->lhash2_mask and every slot is then initialised.
 * table_perturb[] is requested with INET_TABLE_PERTURB_SIZE as entry
 * count, low limit and high limit alike.
 */
void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
                                unsigned long numentries, int scale,
                                unsigned long low_limit,
                                unsigned long high_limit)
{
        const unsigned long perturb_entries = INET_TABLE_PERTURB_SIZE;

        h->lhash2 = alloc_large_system_hash(name, sizeof(*h->lhash2),
                                            numentries, scale, 0, NULL,
                                            &h->lhash2_mask,
                                            low_limit, high_limit);
        init_hashinfo_lhash2(h);

        /* this one is used for source ports of outgoing connections */
        table_perturb = alloc_large_system_hash("Table-perturb",
                                                sizeof(*table_perturb),
                                                perturb_entries, 0, 0,
                                                NULL, NULL,
                                                perturb_entries,
                                                perturb_entries);
}

/*
 * Allocate and initialise an lhash2 table of INET_LHTABLE_SIZE slots for
 * @h using kmalloc_array().
 *
 * Returns 0 on success or -ENOMEM if the allocation fails (h->lhash2 is
 * then NULL).
 */
int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
{
        int err = -ENOMEM;

        h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL);
        if (h->lhash2) {
                h->lhash2_mask = INET_LHTABLE_SIZE - 1;
                /* INET_LHTABLE_SIZE must be a power of 2 */
                BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask);

                init_hashinfo_lhash2(h);
                err = 0;
        }

        return err;
}
EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod);

/*
 * Allocate and initialise the array of ehash spinlocks for @hashinfo and
 * set ehash_locks_mask to (number of locks - 1).
 *
 * When sizeof(spinlock_t) is 0 nothing is allocated and the mask is 0.
 * Otherwise the lock count is a power of two derived from the cache line
 * size and the number of possible CPUs, capped at ehash_mask + 1.
 *
 * Returns 0 on success, -ENOMEM if the lock array cannot be allocated.
 */
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
        unsigned int locksz = sizeof(spinlock_t);
        unsigned int i, nblocks;

        if (locksz == 0) {
                /* Zero-sized locks: behave as a single (absent) lock. */
                hashinfo->ehash_locks_mask = 0;
                return 0;
        }

        /* allocate 2 cache lines or at least one spinlock per cpu */
        nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
        nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());

        /* no more locks than number of hash buckets */
        nblocks = min(nblocks, hashinfo->ehash_mask + 1);

        hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
        if (!hashinfo->ehash_locks)
                return -ENOMEM;

        for (i = 0; i < nblocks; i++)
                spin_lock_init(&hashinfo->ehash_locks[i]);

        hashinfo->ehash_locks_mask = nblocks - 1;
        return 0;
}
EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);

/*
 * Create a per-netns copy of @hashinfo with its own ehash table.
 *
 * The structure is duplicated with kmemdup() (a shallow copy: every field
 * not overwritten below keeps the value it has in @hashinfo).  The copy
 * then gets a fresh ehash table of @ehash_entries buckets, a fresh set of
 * ehash locks, and is flagged as pernet.
 *
 * Returns the new hashinfo, or NULL if any allocation fails (everything
 * allocated so far is released first).
 */
struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
                                                 unsigned int ehash_entries)
{
        struct inet_hashinfo *copy;
        int slot;

        copy = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL);
        if (!copy)
                return NULL;

        copy->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket),
                                   GFP_KERNEL_ACCOUNT);
        if (!copy->ehash)
                goto out_free_copy;

        /* The lock allocation below reads ehash_mask, so set it first. */
        copy->ehash_mask = ehash_entries - 1;

        if (inet_ehash_locks_alloc(copy))
                goto out_free_ehash;

        /* Each chain starts empty, with its own index as nulls value. */
        for (slot = 0; slot < ehash_entries; slot++)
                INIT_HLIST_NULLS_HEAD(&copy->ehash[slot].chain, slot);

        copy->pernet = true;

        return copy;

out_free_ehash:
        vfree(copy->ehash);
out_free_copy:
        kfree(copy);
        return NULL;
}
EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc);

/*
 * Release a hashinfo obtained from inet_pernet_hashinfo_alloc(): its ehash
 * locks, its ehash table and the structure itself.  A hashinfo whose
 * pernet flag is not set is left untouched.
 */
void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo)
{
        if (hashinfo->pernet) {
                inet_ehash_locks_free(hashinfo);
                vfree(hashinfo->ehash);
                kfree(hashinfo);
        }
}
EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free);































































































































































































































    2 



    1 




    3 



    1 




    2 



    2 



















    1 




    1 




    1 




    1 










    3 
    3 


    1 



































































































































































    4 
















    4 












    4 

















    2 

    2 

    3 








    5 






    5 





    5 















    5 






    5 





    5 

    4 



    5 


















































    3 


    3 






















    3 

    3 

    3 
    1 









































































    3 



    3 











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    5 

























    4 





    4 




























    4 








    5 















    5 


    3 












    5 





    3 












    1 



































    4 












    2 




















































    5 


















    4 








    3 
    4 







































    3 
































    4 





























    2 







    3 
    5 

























































    3 




































































    2 































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/lib/vsprintf.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/* vsprintf.c -- Lars Wirzenius & Linus Torvalds. */
/*
 * Wirzenius wrote this portably, Torvalds fucked it up :-)
 */

/*
 * Fri Jul 13 2001 Crutcher Dunnavant <crutcher+kernel@datastacks.com>
 * - changed to provide snprintf and vsnprintf functions
 * So Feb  1 16:51:32 CET 2004 Juergen Quade <quade@hsnr.de>
 * - scnprintf and vscnprintf
 */

#include <linux/stdarg.h>
#include <linux/build_bug.h>
#include <linux/clk.h>
#include <linux/clk-provider.h>
#include <linux/errname.h>
#include <linux/module.h>        /* for KSYM_SYMBOL_LEN */
#include <linux/types.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/kernel.h>
#include <linux/kallsyms.h>
#include <linux/math64.h>
#include <linux/uaccess.h>
#include <linux/ioport.h>
#include <linux/dcache.h>
#include <linux/cred.h>
#include <linux/rtc.h>
#include <linux/sprintf.h>
#include <linux/time.h>
#include <linux/uuid.h>
#include <linux/of.h>
#include <net/addrconf.h>
#include <linux/siphash.h>
#include <linux/compiler.h>
#include <linux/property.h>
#include <linux/notifier.h>
#ifdef CONFIG_BLOCK
#include <linux/blkdev.h>
#endif

#include "../mm/internal.h"        /* For the trace_print_flags arrays */

#include <asm/page.h>                /* for PAGE_SIZE */
#include <asm/byteorder.h>        /* cpu_to_le16 */
#include <asm/unaligned.h>

#include <linux/string_helpers.h>
#include "kstrtox.h"

/* Disable pointer hashing if requested */
bool no_hash_pointers __ro_after_init;
EXPORT_SYMBOL_GPL(no_hash_pointers);

/*
 * Convert at most @max_chars characters of @startp to an unsigned long long.
 *
 * @base is first handed to _parse_integer_fixup_radix(), which may adjust it
 * and returns the position where the digits start. If any characters remain
 * within the limit they are converted by _parse_integer_limit(). When @endp is
 * non-NULL it receives the position where parsing stopped. An overflow
 * reported by the converter is masked out and not propagated to the caller.
 */
noinline
static unsigned long long simple_strntoull(const char *startp, char **endp, unsigned int base, size_t max_chars)
{
        unsigned long long value = 0ULL;
        const char *digits = _parse_integer_fixup_radix(startp, &base);
        const size_t prefix_len = digits - startp;
        const char *stop;

        if (prefix_len >= max_chars) {
                /* Field too short for prefix + digit, skip over without converting */
                stop = startp + max_chars;
        } else {
                unsigned int consumed;

                consumed = _parse_integer_limit(digits, base, &value,
                                                max_chars - prefix_len);
                /* FIXME: the overflow flag is stripped and otherwise ignored */
                stop = digits + (consumed & ~KSTRTOX_OVERFLOW);
        }

        if (endp)
                *endp = (char *)stop;

        return value;
}

/**
 * simple_strtoull - convert a string to an unsigned long long
 * @cp: The start of the string
 * @endp: If non-NULL, receives a pointer to where parsing stopped
 * @base: The number base to use
 *
 * Parses with a length limit of INT_MAX characters, i.e. effectively unbounded.
 *
 * This function has caveats. Please use kstrtoull instead.
 */
noinline
unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
{
        const size_t char_limit = INT_MAX;

        return simple_strntoull(cp, endp, base, char_limit);
}
EXPORT_SYMBOL(simple_strtoull);

/**
 * simple_strtoul - convert a string to an unsigned long
 * @cp: The start of the string
 * @endp: If non-NULL, receives a pointer to where parsing stopped
 * @base: The number base to use
 *
 * The value is parsed as an unsigned long long and then converted to
 * unsigned long on return.
 *
 * This function has caveats. Please use kstrtoul instead.
 */
unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base)
{
        unsigned long long wide = simple_strtoull(cp, endp, base);

        return wide;
}
EXPORT_SYMBOL(simple_strtoul);

/**
 * simple_strtol - convert a string to a signed long
 * @cp: The start of the string
 * @endp: If non-NULL, receives a pointer to where parsing stopped
 * @base: The number base to use
 *
 * A single leading '-' is recognised and negates the parsed magnitude.
 *
 * This function has caveats. Please use kstrtol instead.
 */
long simple_strtol(const char *cp, char **endp, unsigned int base)
{
        const int negative = (*cp == '-');
        unsigned long magnitude;

        /* Skip the sign character, if any, before parsing the digits. */
        magnitude = simple_strtoul(negative ? cp + 1 : cp, endp, base);

        return negative ? -magnitude : magnitude;
}
EXPORT_SYMBOL(simple_strtol);

/*
 * Signed counterpart of simple_strntoull(): a leading '-' consumes one
 * character of the @max_chars budget and negates the result.
 */
noinline
static long long simple_strntoll(const char *cp, char **endp, unsigned int base, size_t max_chars)
{
        /*
         * With max_chars == 0 the content of *cp is irrelevant, so the string
         * is passed through untouched. With cp[0] == '-' and max_chars == 1
         * the unsigned parser is handed max_chars == 0, which it handles
         * safely.
         */
        if (*cp != '-' || max_chars == 0)
                return simple_strntoull(cp, endp, base, max_chars);

        return -simple_strntoull(cp + 1, endp, base, max_chars - 1);
}

/**
 * simple_strtoll - convert a string to a signed long long
 * @cp: The start of the string
 * @endp: If non-NULL, receives a pointer to where parsing stopped
 * @base: The number base to use
 *
 * Parses with a length limit of INT_MAX characters, i.e. effectively unbounded.
 *
 * This function has caveats. Please use kstrtoll instead.
 */
long long simple_strtoll(const char *cp, char **endp, unsigned int base)
{
        const size_t char_limit = INT_MAX;

        return simple_strntoll(cp, endp, base, char_limit);
}
EXPORT_SYMBOL(simple_strtoll);

/*
 * Parse the run of decimal digits at *s into an int and advance *s past it.
 * The first character is consumed without being tested, so the caller is
 * expected to have checked that it is a digit.
 */
static noinline_for_stack
int skip_atoi(const char **s)
{
        const char *cursor = *s;
        int value = 0;

        do {
                value = value * 10 + *cursor++ - '0';
        } while (isdigit(*cursor));

        *s = cursor;
        return value;
}

/*
 * Decimal conversion is by far the most typical, and is used for
 * /proc and /sys data. This directly impacts e.g. top performance
 * with many processes running. We optimize it for speed by emitting
 * two characters at a time, using a 200 byte lookup table. This
 * roughly halves the number of multiplications compared to computing
 * the digits one at a time. Implementation strongly inspired by the
 * previous version, which in turn used ideas described at
 * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission
 * from the author, Douglas W. Jones).
 *
 * It turns out there is precisely one 26 bit fixed-point
 * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32
 * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual
 * range happens to be somewhat larger (x <= 1073741898), but that's
 * irrelevant for our purpose.
 *
 * For dividing a number in the range [10^4, 10^6-1] by 100, we still
 * need a 32x32->64 bit multiply, so we simply use the same constant.
 *
 * For dividing a number in the range [100, 10^4-1] by 100, there are
 * several options. The simplest is (x * 0x147b) >> 19, which is valid
 * for all x <= 43698.
 */

/*
 * decpair[x], for 0 <= x <= 99, is the two-character decimal rendering of x
 * packed into a single u16. The value has the units digit in the low byte and
 * the tens digit in the high byte, with 0x3030 adding '0' to both; it is
 * passed through cpu_to_le16() so that, in memory, the units digit comes
 * first. One 16-bit store of an entry therefore emits two digits in
 * least-significant-first order.
 */
static const u16 decpair[100] = {
#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030)
        _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9),
        _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19),
        _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29),
        _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39),
        _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49),
        _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59),
        _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69),
        _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79),
        _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89),
        _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99),
#undef _
};

/*
 * Write the decimal digits of r (r < 10^8) at buf, least significant digit
 * first and without leading zeros, and return the position just past the
 * last digit that counts. Digits are stored two at a time through a u16
 * pointer, so up to one byte beyond the returned position may be written.
 *
 * This will print a single '0' even if r == 0, since we would
 * immediately jump to out_r where two 0s would be written but only
 * one of them accounted for in buf. This is needed by ip4_string
 * below. All other callers pass a non-zero value of r.
 */
static noinline_for_stack
char *put_dec_trunc8(char *buf, unsigned r)
{
        unsigned q;

        /* 1 <= r < 10^8 */
        if (r < 100)
                goto out_r;

        /* 100 <= r < 10^8 */
        /* q = r / 100 by fixed-point multiply; r - 100*q is then r % 100 */
        q = (r * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;

        /* 1 <= q < 10^6 */
        if (q < 100)
                goto out_q;

        /*  100 <= q < 10^6 */
        /* r and q swap roles at each step: r now holds the running quotient */
        r = (q * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[q - 100*r];
        buf += 2;

        /* 1 <= r < 10^4 */
        if (r < 100)
                goto out_r;

        /* 100 <= r < 10^4 */
        /* cheaper 32-bit divide-by-100 approximation, valid in this range */
        q = (r * 0x147b) >> 19;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;
out_q:
        /* 1 <= q < 100 */
        r = q;
out_r:
        /* 1 <= r < 100 */
        /* Both characters are stored, but a leading '0' is not counted. */
        *((u16 *)buf) = decpair[r];
        buf += r < 10 ? 1 : 2;
        return buf;
}

#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64
/*
 * Write exactly eight decimal digits of r (0 <= r < 10^8) at buf, least
 * significant digit first and zero padded, and return buf + 8. Each step
 * divides by 100 with a fixed-point multiply and stores the remainder as a
 * digit pair from decpair[].
 */
static noinline_for_stack
char *put_dec_full8(char *buf, unsigned r)
{
        unsigned q;

        /* 0 <= r < 10^8 */
        q = (r * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;

        /* 0 <= q < 10^6 */
        r = (q * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[q - 100*r];
        buf += 2;

        /* 0 <= r < 10^4 */
        q = (r * 0x147b) >> 19;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;

        /* 0 <= q < 100 */
        *((u16 *)buf) = decpair[q];
        buf += 2;
        return buf;
}

/*
 * 64-bit variant: write the decimal digits of n at buf, least significant
 * digit first, and return the position past the last digit.
 *
 * do_div() is used here as leaving the quotient in n and yielding the
 * remainder: each time n is at least 10^8 its low eight digits are emitted
 * zero padded, at most twice, after which the remaining value is below 10^8
 * and is written without leading zeros.
 */
static noinline_for_stack
char *put_dec(char *buf, unsigned long long n)
{
        if (n >= 100*1000*1000)
                buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
        /* 1 <= n <= 1.6e11 */
        if (n >= 100*1000*1000)
                buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
        /* 1 <= n < 1e8 */
        return put_dec_trunc8(buf, n);
}

#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64

/*
 * Write exactly four decimal digits of r (0 <= r < 10^4) at buf, least
 * significant digit first and zero padded. Nothing is returned; the caller
 * accounts for the four bytes itself.
 */
static void
put_dec_full4(char *buf, unsigned r)
{
        unsigned q;

        /* 0 <= r < 10^4 */
        q = (r * 0x147b) >> 19;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;
        /* 0 <= q < 100 */
        *((u16 *)buf) = decpair[q];
}

/*
 * Emit x % 10000 as four digits via put_dec_full4() and hand back x / 10000.
 *
 * The quotient is obtained without a division: x / 10000 equals
 * (x * 0x346DC5D7) >> 43 for every x < 1,128,869,999. The largest value this
 * helper will ever be asked to convert is 1,125,520,955 (second call in the
 * put_dec code, assuming n is all-ones).
 */
static noinline_for_stack
unsigned put_dec_helper4(char *buf, unsigned x)
{
        const uint64_t scaled = x * (uint64_t)0x346DC5D7;
        const uint32_t quotient = scaled >> 43;
        const unsigned low4 = x - quotient * 10000;

        put_dec_full4(buf, low4);
        return quotient;
}

/* Based on code by Douglas W. Jones found at
 * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour>
 * (with permission from the author).
 * Performs no 64-bit division and hence should be fast on 32-bit machines.
 *
 * 32-bit variant: write the decimal digits of n at buf, least significant
 * digit first, and return the position past the last digit. n is split into
 * four 16-bit pieces d3..d0; each group of four output digits is the sum of
 * the pieces weighted by the matching four-digit slice of 2^48, 2^32 and
 * 2^16, with the carry passed along in q by put_dec_helper4().
 */
static
char *put_dec(char *buf, unsigned long long n)
{
        uint32_t d3, d2, d1, q, h;

        /* Small values need no splitting. */
        if (n < 100*1000*1000)
                return put_dec_trunc8(buf, n);

        d1  = ((uint32_t)n >> 16); /* implicit "& 0xffff" */
        h   = (n >> 32);
        d2  = (h      ) & 0xffff;
        d3  = (h >> 16); /* implicit "& 0xffff" */

        /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
             = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */
        q   = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
        q = put_dec_helper4(buf, q);

        q += 7671 * d3 + 9496 * d2 + 6 * d1;
        q = put_dec_helper4(buf+4, q);

        q += 4749 * d3 + 42 * d2;
        q = put_dec_helper4(buf+8, q);

        q += 281 * d3;
        buf += 12;
        /*
         * Twelve zero-padded digits are in place. Either the remaining high
         * part follows, or the padding zeros at the top are dropped again.
         */
        if (q)
                buf = put_dec_trunc8(buf, q);
        else while (buf[-1] == '0')
                --buf;

        return buf;
}

#endif

/*
 * Convert passed number to decimal string, right-aligned in a field of at
 * least @width characters (space padded on the left). No terminating NUL is
 * stored.
 *
 * Returns the length of string. If the digits or @width do not fit in @size
 * bytes, nothing is written and 0 is returned.
 *
 * If speed is not important, use snprintf(). It's easy to read the code.
 */
int num_to_str(char *buf, int size, unsigned long long num, unsigned int width)
{
        /* put_dec requires 2-byte alignment of the buffer. */
        char tmp[sizeof(num) * 3] __aligned(2);
        char *out = buf;
        unsigned int pad, left;
        int len, src;

        /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
        if (num > 9) {
                len = put_dec(tmp, num) - tmp;
        } else {
                tmp[0] = '0' + num;
                len = 1;
        }

        if (len > size || width > size)
                return 0;

        /* Leading blanks needed to reach the requested field width. */
        pad = (width > len) ? width - len : 0;
        for (left = pad; left; left--)
                *out++ = ' ';

        /* tmp[] holds the digits least significant first: copy them reversed. */
        for (src = len; src > 0; )
                *out++ = tmp[--src];

        return len + pad;
}

/*
 * Bits of printf_spec.flags, consumed by number(). Some values are relied on
 * arithmetically and are pinned by the static_asserts below: ZEROPAD is added
 * to ' ' to obtain the pad character, and SMALL is ORed into upper-case hex
 * digits and 'X' to lower-case them.
 */
#define SIGN        1                /* unsigned/signed, must be 1 */
#define LEFT        2                /* left justified */
#define PLUS        4                /* show plus */
#define SPACE        8                /* space if plus */
#define ZEROPAD        16                /* pad with zero, must be 16 == '0' - ' ' */
#define SMALL        32                /* use lowercase in hex (must be 32 == 0x20) */
#define SPECIAL        64                /* prefix hex with "0x", octal with "0" */

static_assert(SIGN == 1);
static_assert(ZEROPAD == ('0' - ' '));
static_assert(SMALL == ('a' ^ 'A'));

/*
 * Kind of a format-string piece; stored in the 8-bit printf_spec.type field.
 * NOTE(review): the code that produces and consumes most of these values is
 * outside this part of the file; the entries from FORMAT_TYPE_LONG_LONG
 * onwards appear to name the integer argument type to fetch - confirm against
 * the format decoder.
 */
enum format_type {
        FORMAT_TYPE_NONE, /* Just a string part */
        FORMAT_TYPE_WIDTH,
        FORMAT_TYPE_PRECISION,
        FORMAT_TYPE_CHAR,
        FORMAT_TYPE_STR,
        FORMAT_TYPE_PTR,
        FORMAT_TYPE_PERCENT_CHAR,
        FORMAT_TYPE_INVALID,
        FORMAT_TYPE_LONG_LONG,
        FORMAT_TYPE_ULONG,
        FORMAT_TYPE_LONG,
        FORMAT_TYPE_UBYTE,
        FORMAT_TYPE_BYTE,
        FORMAT_TYPE_USHORT,
        FORMAT_TYPE_SHORT,
        FORMAT_TYPE_UINT,
        FORMAT_TYPE_INT,
        FORMAT_TYPE_SIZE_T,
        FORMAT_TYPE_PTRDIFF
};

/*
 * One decoded conversion specification. The bit-fields add up to 64 bits and
 * the struct is packed, so it occupies exactly 8 bytes (checked below); the
 * formatting helpers in this file take it by value.
 *
 * field_width and precision are signed; -1 is used throughout this file to
 * mean "not specified".
 */
struct printf_spec {
        unsigned int        type:8;                /* format_type enum */
        signed int        field_width:24;        /* width of output field */
        unsigned int        flags:8;        /* flags to number() */
        unsigned int        base:8;                /* number base, 8, 10 or 16 only */
        signed int        precision:16;        /* # of digits/chars */
} __packed;
static_assert(sizeof(struct printf_spec) == 8);

/* Largest positive values representable in the signed bit-fields above. */
#define FIELD_WIDTH_MAX ((1 << 23) - 1)
#define PRECISION_MAX ((1 << 15) - 1)

/*
 * Format the integer @num into [buf, end) as described by @spec.
 *
 * The digits are first generated in reverse order into tmp[]. Output is then
 * produced in this order: leading spaces (unless ZEROPAD or LEFT), the sign,
 * the "0x" / "0" prefix (SPECIAL), zero or space field padding (unless LEFT),
 * extra zeros up to the precision, the digits, and trailing spaces (whatever
 * field width is still left, which only happens with LEFT).
 *
 * A character is stored only while buf < end, but buf is advanced for every
 * character, so the returned pointer is where the output would end given
 * unlimited space.
 */
static noinline_for_stack
char *number(char *buf, char *end, unsigned long long num,
             struct printf_spec spec)
{
        /* put_dec requires 2-byte alignment of the buffer. */
        char tmp[3 * sizeof(num)] __aligned(2);
        char sign;
        char locase;
        int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
        int i;
        bool is_zero = num == 0LL;
        int field_width = spec.field_width;
        int precision = spec.precision;

        /* locase = 0 or 0x20. ORing digits or letters with 'locase'
         * produces same digits or (maybe lowercased) letters */
        locase = (spec.flags & SMALL);
        /* Left justification overrides zero padding. */
        if (spec.flags & LEFT)
                spec.flags &= ~ZEROPAD;
        sign = 0;
        /* The sign character, if any, takes one column of the field. */
        if (spec.flags & SIGN) {
                if ((signed long long)num < 0) {
                        sign = '-';
                        num = -(signed long long)num;
                        field_width--;
                } else if (spec.flags & PLUS) {
                        sign = '+';
                        field_width--;
                } else if (spec.flags & SPACE) {
                        sign = ' ';
                        field_width--;
                }
        }
        /* Reserve room for "0x" (always) or the octal "0" (non-zero only). */
        if (need_pfx) {
                if (spec.base == 16)
                        field_width -= 2;
                else if (!is_zero)
                        field_width--;
        }

        /* generate full string in tmp[], in reverse order */
        i = 0;
        if (num < spec.base)
                tmp[i++] = hex_asc_upper[num] | locase;
        else if (spec.base != 10) { /* 8 or 16 */
                int mask = spec.base - 1;
                int shift = 3;

                if (spec.base == 16)
                        shift = 4;
                do {
                        tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase);
                        num >>= shift;
                } while (num);
        } else { /* base 10 */
                i = put_dec(tmp, num) - tmp;
        }

        /* printing 100 using %2d gives "100", not "00" */
        if (i > precision)
                precision = i;
        /* leading space padding */
        field_width -= precision;
        if (!(spec.flags & (ZEROPAD | LEFT))) {
                while (--field_width >= 0) {
                        if (buf < end)
                                *buf = ' ';
                        ++buf;
                }
        }
        /* sign */
        if (sign) {
                if (buf < end)
                        *buf = sign;
                ++buf;
        }
        /* "0x" / "0" prefix */
        if (need_pfx) {
                if (spec.base == 16 || !is_zero) {
                        if (buf < end)
                                *buf = '0';
                        ++buf;
                }
                if (spec.base == 16) {
                        if (buf < end)
                                *buf = ('X' | locase);
                        ++buf;
                }
        }
        /* zero or space padding */
        if (!(spec.flags & LEFT)) {
                /* ZEROPAD == '0' - ' ', so this yields '0' or ' ' */
                char c = ' ' + (spec.flags & ZEROPAD);

                while (--field_width >= 0) {
                        if (buf < end)
                                *buf = c;
                        ++buf;
                }
        }
        /* hmm even more zero padding? */
        while (i <= --precision) {
                if (buf < end)
                        *buf = '0';
                ++buf;
        }
        /* actual digits of result */
        while (--i >= 0) {
                if (buf < end)
                        *buf = tmp[i];
                ++buf;
        }
        /* trailing space padding */
        while (--field_width >= 0) {
                if (buf < end)
                        *buf = ' ';
                ++buf;
        }

        return buf;
}

static noinline_for_stack
char *special_hex_number(char *buf, char *end, unsigned long long num, int size)
{
        struct printf_spec spec;

        spec.type = FORMAT_TYPE_PTR;
        spec.field_width = 2 + 2 * size;        /* 0x + hex */
        spec.flags = SPECIAL | SMALL | ZEROPAD;
        spec.base = 16;
        spec.precision = -1;

        return number(buf, end, num, spec);
}

/*
 * Shift the @len bytes at @buf right by @spaces positions and fill the gap
 * with blanks, never touching memory at or beyond @end. Bytes that would be
 * pushed past @end are dropped; if the gap alone fills the available room,
 * the room is simply blanked.
 */
static void move_right(char *buf, char *end, unsigned len, unsigned spaces)
{
        size_t room, keep;

        /* nowhere to put anything */
        if (buf >= end)
                return;

        room = end - buf;
        if (spaces >= room) {
                memset(buf, ' ', room);
                return;
        }

        /* At least one byte of room remains after the gap. */
        keep = room - spaces;
        if (len < keep)
                keep = len;
        if (keep)
                memmove(buf + spaces, buf, keep);

        memset(buf, ' ', spaces);
}

/*
 * Pad an already emitted string out to the requested field width.
 * @buf: current buffer position (just past the string)
 * @n: length of string
 * @end: end of output buffer
 * @spec: for field width and flags
 * Returns: new buffer position after padding.
 *
 * With LEFT the blanks are appended; otherwise the string is shifted right
 * inside the buffer and the blanks are put in front of it.
 */
static noinline_for_stack
char *widen_string(char *buf, int n, char *end, struct printf_spec spec)
{
        unsigned pad;
        char *stop;

        if (likely(n >= spec.field_width))
                return buf;

        /* we want to pad the sucker */
        pad = spec.field_width - n;
        stop = buf + pad;

        if (spec.flags & LEFT) {
                /* Store only what fits, but always advance. */
                for (; buf != stop; ++buf) {
                        if (buf < end)
                                *buf = ' ';
                }
                return stop;
        }

        move_right(buf - n, end, n, pad);
        return stop;
}

/*
 * Handle string from a well known address: @s is dereferenced without any
 * validity check. At most spec.precision characters are copied (a negative
 * precision means no practical limit), then the result is padded to the
 * field width.
 */
static char *string_nocheck(char *buf, char *end, const char *s,
                            struct printf_spec spec)
{
        int copied = 0;
        int budget;

        for (budget = spec.precision; budget != 0; budget--) {
                char ch = s[copied];

                if (ch == '\0')
                        break;
                /* Store only what fits, but keep counting. */
                if (buf < end)
                        *buf = ch;
                buf++;
                copied++;
        }

        return widen_string(buf, copied, end, spec);
}

/*
 * Print an error pointer: as the symbolic name returned by errname() when
 * there is one, otherwise as a signed decimal number.
 */
static char *err_ptr(char *buf, char *end, void *ptr,
                     struct printf_spec spec)
{
        int err = PTR_ERR(ptr);
        const char *sym = errname(err);

        if (!sym) {
                /*
                 * Somebody passed ERR_PTR(-1234) or some other non-existing
                 * Efoo - or perhaps CONFIG_SYMBOLIC_ERRNAME=n. Fall back to
                 * printing it as its decimal representation.
                 */
                spec.base = 10;
                spec.flags |= SIGN;
                return number(buf, end, err, spec);
        }

        return string_nocheck(buf, end, sym, spec);
}

/* Be careful: error messages must fit into the given buffer. */
static char *error_string(char *buf, char *end, const char *s,
                          struct printf_spec spec)
{
        /*
         * Hard limit to avoid completely insane messages: when no precision
         * was given, cap the text at 2 * sizeof(void *) characters. That
         * works pretty well because most error messages are produced by the
         * many pointer format modifiers.
         */
        const int default_limit = 2 * sizeof(void *);

        if (spec.precision == -1)
                spec.precision = default_limit;

        return string_nocheck(buf, end, s, spec);
}

/*
 * Classify a pointer before it gets dereferenced: returns "(null)" for NULL,
 * "(efault)" for an address inside the first page or in the error-value
 * range, and NULL when the pointer looks usable.
 *
 * Do not call any complex external code here. Nested printk()/vsprintf()
 * might cause infinite loops. Failures might break printk() and would
 * be hard to debug.
 */
static const char *check_pointer_msg(const void *ptr)
{
        const unsigned long addr = (unsigned long)ptr;

        if (!ptr)
                return "(null)";

        return (addr < PAGE_SIZE || IS_ERR_VALUE(ptr)) ? "(efault)" : NULL;
}

/*
 * Validate @ptr with check_pointer_msg(). For a bad pointer the matching
 * message is printed at *buf, *buf is advanced past it and -EFAULT is
 * returned; otherwise nothing is printed and 0 is returned.
 */
static int check_pointer(char **buf, char *end, const void *ptr,
                         struct printf_spec spec)
{
        const char *err_msg = check_pointer_msg(ptr);

        if (!err_msg)
                return 0;

        *buf = error_string(*buf, end, err_msg, spec);
        return -EFAULT;
}

/*
 * Print the string @s after validating the pointer; a bad pointer produces
 * the corresponding error text instead of being dereferenced.
 */
static noinline_for_stack
char *string(char *buf, char *end, const char *s,
             struct printf_spec spec)
{
        if (!check_pointer(&buf, end, s, spec))
                buf = string_nocheck(buf, end, s, spec);

        return buf;
}

/*
 * Print the raw value of @ptr in lower-case hex. When no field width was
 * requested (-1) the value is zero padded to two digits per pointer byte.
 */
static char *pointer_string(char *buf, char *end,
                            const void *ptr,
                            struct printf_spec spec)
{
        const unsigned long int addr = (unsigned long int)ptr;

        if (spec.field_width == -1) {
                spec.flags |= ZEROPAD;
                spec.field_width = 2 * sizeof(ptr);
        }
        spec.flags |= SMALL;
        spec.base = 16;

        return number(buf, end, addr, spec);
}

/*
 * Make pointers available for printing early in the boot sequence.
 * When set, ptr_to_id() hashes pointers with hash_long() instead of waiting
 * for the random siphash key.
 */
static int debug_boot_weak_hash __ro_after_init;

/*
 * Handler registered for the "debug_boot_weak_hash" early parameter: sets the
 * flag above and logs that it was enabled. The parameter's value (@str) is
 * ignored.
 */
static int __init debug_boot_weak_hash_enable(char *str)
{
        debug_boot_weak_hash = 1;
        pr_info("debug_boot_weak_hash enabled\n");
        return 0;
}
early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable);

/*
 * Key used to hash pointers, and a flag telling readers that the key has
 * been filled in. The flag is published only after the key itself.
 */
static bool filled_random_ptr_key __read_mostly;
static siphash_key_t ptr_key __read_mostly;

/*
 * Notifier callback: fill ptr_key with random bytes and then mark it as
 * available. None of the notifier arguments are used.
 */
static int fill_ptr_key(struct notifier_block *nb, unsigned long action, void *data)
{
        get_random_bytes(&ptr_key, sizeof(ptr_key));

        /* Pairs with smp_rmb() before reading ptr_key. */
        smp_wmb();
        WRITE_ONCE(filled_random_ptr_key, true);
        return NOTIFY_DONE;
}

/*
 * Initcall: hand a notifier block for fill_ptr_key() to
 * execute_with_initialized_rng(), presumably so the pointer hashing key is
 * filled once the RNG is initialized - confirm against that helper.
 */
static int __init vsprintf_init_hashval(void)
{
        static struct notifier_block ptr_key_nb = {
                .notifier_call = fill_ptr_key,
        };

        execute_with_initialized_rng(&ptr_key_nb);
        return 0;
}
subsys_initcall(vsprintf_init_hashval)

/*
 * Maps a pointer to a 32 bit unique identifier.
 *
 * Returns 0 and stores the identifier in *hashval_out, or -EBUSY (leaving
 * *hashval_out untouched) while the random key has not been filled in yet.
 */
static inline int __ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
{
        unsigned long hashval;

        if (!READ_ONCE(filled_random_ptr_key))
                return -EBUSY;

        /* Pairs with smp_wmb() after writing ptr_key. */
        smp_rmb();

#ifdef CONFIG_64BIT
        hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key);
        /*
         * Keep only the low 32 bits of the hash, this makes explicit that we
         * have modified the address (and 32 bits is plenty for a unique ID).
         */
        hashval = hashval & 0xffffffff;
#else
        hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key);
#endif
        *hashval_out = hashval;
        return 0;
}

/*
 * Non-static entry point for __ptr_to_hashval(): returns 0 with the hashed
 * identifier of @ptr in *hashval_out, or the helper's error code.
 */
int ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
{
        int err = __ptr_to_hashval(ptr, hashval_out);

        return err;
}

/*
 * Print a hashed identifier in place of the real value of @ptr. NULL and
 * error pointers are printed as they are. While no hash can be computed yet,
 * a fixed "(ptrval)" style placeholder is printed instead.
 */
static char *ptr_to_id(char *buf, char *end, const void *ptr,
                       struct printf_spec spec)
{
        unsigned long id;

        /*
         * Print the real pointer value for NULL and error pointers,
         * as they are not actual addresses.
         */
        if (IS_ERR_OR_NULL(ptr))
                return pointer_string(buf, end, ptr, spec);

        if (unlikely(debug_boot_weak_hash)) {
                /* When debugging early boot use non-cryptographically secure hash. */
                id = hash_long((unsigned long)ptr, 32);
        } else if (__ptr_to_hashval(ptr, &id)) {
                const char *placeholder =
                        sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)";

                spec.field_width = 2 * sizeof(ptr);
                /* string length must be less than default_width */
                return error_string(buf, end, placeholder, spec);
        }

        return pointer_string(buf, end, (const void *)id, spec);
}

/*
 * Plain pointer printing. The default is to _not_ leak addresses, so the
 * pointer is hashed before printing, unless no_hash_pointers is specified on
 * the command line, in which case the raw value is printed.
 */
static char *default_pointer(char *buf, char *end, const void *ptr,
                             struct printf_spec spec)
{
        return unlikely(no_hash_pointers) ?
                pointer_string(buf, end, ptr, spec) :
                ptr_to_id(buf, end, ptr, spec);
}

int kptr_restrict __read_mostly;

/*
 * Print a pointer according to the kptr_restrict setting:
 *   0     - handled like a plain pointer (hashed by default);
 *   1     - real value only for sufficiently privileged callers, else 0's;
 *   other - always 0's.
 */
static noinline_for_stack
char *restricted_pointer(char *buf, char *end, const void *ptr,
                         struct printf_spec spec)
{
        const int policy = kptr_restrict;
        const struct cred *cred;

        /* Handle as %p, hash and do _not_ leak addresses. */
        if (policy == 0)
                return default_pointer(buf, end, ptr, spec);

        /* Always print 0's for %pK */
        if (policy != 1)
                return pointer_string(buf, end, NULL, spec);

        /*
         * kptr_restrict==1 cannot be used in IRQ context
         * because its test for CAP_SYSLOG would be meaningless.
         */
        if (in_hardirq() || in_serving_softirq() || in_nmi()) {
                if (spec.field_width == -1)
                        spec.field_width = 2 * sizeof(ptr);
                return error_string(buf, end, "pK-error", spec);
        }

        /*
         * Only print the real pointer value if the current
         * process has CAP_SYSLOG and is running with the
         * same credentials it started with. This is because
         * access to files is checked at open() time, but %pK
         * checks permission at read() time. We don't want to
         * leak pointer values if a binary opens a file using
         * %pK and then elevates privileges before reading it.
         */
        cred = current_cred();
        if (!(has_capability_noaudit(current, CAP_SYSLOG) &&
              uid_eq(cred->euid, cred->uid) &&
              gid_eq(cred->egid, cred->gid)))
                ptr = NULL;

        return pointer_string(buf, end, ptr, spec);
}

/*
 * Print the name of dentry @d, optionally preceded by up to three parent
 * names separated by '/'.
 *
 * fmt[1] of '2', '3' or '4' selects how many trailing path components to
 * print; anything else means one. The names are collected while walking up
 * d_parent under rcu_read_lock(), using READ_ONCE() for the lockless reads,
 * and are then emitted outermost first. At most spec.precision characters
 * are produced, and the result is padded to the field width.
 */
static noinline_for_stack
char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec,
                  const char *fmt)
{
        const char *array[4], *s;
        const struct dentry *p;
        int depth;
        int i, n;

        switch (fmt[1]) {
                case '2': case '3': case '4':
                        depth = fmt[1] - '0';
                        break;
                default:
                        depth = 1;
        }

        rcu_read_lock();
        for (i = 0; i < depth; i++, d = p) {
                /* A bad dentry pointer prints its error text and ends here. */
                if (check_pointer(&buf, end, d, spec)) {
                        rcu_read_unlock();
                        return buf;
                }

                p = READ_ONCE(d->d_parent);
                array[i] = READ_ONCE(d->d_name.name);
                /*
                 * A dentry that is its own parent ends the walk. If it was
                 * reached as a parent (i > 0), its name is replaced by "".
                 */
                if (p == d) {
                        if (i)
                                array[i] = "";
                        i++;
                        break;
                }
        }
        /* Emit from the outermost collected component down to array[0]. */
        s = array[--i];
        for (n = 0; n != spec.precision; n++, buf++) {
                char c = *s++;
                if (!c) {
                        /* End of one component: done, or move on to the next. */
                        if (!i)
                                break;
                        c = '/';
                        s = array[--i];
                }
                if (buf < end)
                        *buf = c;
        }
        rcu_read_unlock();
        return widen_string(buf, n, end, spec);
}

/*
 * Print the dentry name of the file @f; see dentry_name() for the meaning of
 * @fmt. A bad @f pointer produces the corresponding error text.
 */
static noinline_for_stack
char *file_dentry_name(char *buf, char *end, const struct file *f,
                        struct printf_spec spec, const char *fmt)
{
        if (!check_pointer(&buf, end, f, spec))
                buf = dentry_name(buf, end, f->f_path.dentry, spec, fmt);

        return buf;
}
#ifdef CONFIG_BLOCK
/*
 * Print the name of a block device: the disk name, followed for a partition
 * by the partition number. A 'p' separates the two when the disk name itself
 * ends in a digit.
 */
static noinline_for_stack
char *bdev_name(char *buf, char *end, struct block_device *bdev,
                struct printf_spec spec, const char *fmt)
{
        struct gendisk *hd;

        if (check_pointer(&buf, end, bdev, spec))
                return buf;

        hd = bdev->bd_disk;
        buf = string(buf, end, hd->disk_name, spec);

        /* Whole-disk devices are done after the disk name. */
        if (!bdev_is_partition(bdev))
                return buf;

        if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) {
                if (buf < end)
                        *buf = 'p';
                buf++;
        }

        return number(buf, end, bdev_partno(bdev), spec);
}
#endif

/*
 * Print the address @ptr symbolically.
 *
 * With CONFIG_KALLSYMS the text is produced by one of the sprint_*()
 * helpers, chosen from the format characters in this order:
 *   "Bb"                 - sprint_backtrace_build_id()
 *   'B'                  - sprint_backtrace()
 *   'S' + 'b' or "Rb"    - sprint_symbol_build_id()
 *   anything but 's'     - sprint_symbol()
 *   's'                  - sprint_symbol_no_offset()
 * Without CONFIG_KALLSYMS the address is printed as a "0x"-prefixed hex
 * number instead.
 *
 * An 'R' as the second format character first passes @ptr through
 * __builtin_extract_return_addr().
 */
static noinline_for_stack
char *symbol_string(char *buf, char *end, void *ptr,
                    struct printf_spec spec, const char *fmt)
{
        unsigned long value;
#ifdef CONFIG_KALLSYMS
        char sym[KSYM_SYMBOL_LEN];
#endif

        if (fmt[1] == 'R')
                ptr = __builtin_extract_return_addr(ptr);
        value = (unsigned long)ptr;

#ifdef CONFIG_KALLSYMS
        if (*fmt == 'B' && fmt[1] == 'b')
                sprint_backtrace_build_id(sym, value);
        else if (*fmt == 'B')
                sprint_backtrace(sym, value);
        else if (*fmt == 'S' && (fmt[1] == 'b' || (fmt[1] == 'R' && fmt[2] == 'b')))
                sprint_symbol_build_id(sym, value);
        else if (*fmt != 's')
                sprint_symbol(sym, value);
        else
                sprint_symbol_no_offset(sym, value);

        return string_nocheck(buf, end, sym, spec);
#else
        return special_hex_number(buf, end, value, sizeof(void *));
#endif
}

/*
 * Ready-made specifications for helpers that print fixed-format pieces.
 * Fields that are not listed are zero.
 */

/* A string with no field width and no precision limit. */
static const struct printf_spec default_str_spec = {
        .field_width = -1,
        .precision = -1,
};

/* Lower-case hex with a "0x" prefix. */
static const struct printf_spec default_flag_spec = {
        .base = 16,
        .precision = -1,
        .flags = SPECIAL | SMALL,
};

/* Plain decimal. */
static const struct printf_spec default_dec_spec = {
        .base = 10,
        .precision = -1,
};

/* Decimal, zero padded to at least two digits. */
static const struct printf_spec default_dec02_spec = {
        .base = 10,
        .field_width = 2,
        .precision = -1,
        .flags = ZEROPAD,
};

/* Decimal, zero padded to at least four digits. */
static const struct printf_spec default_dec04_spec = {
        .base = 10,
        .field_width = 4,
        .precision = -1,
        .flags = ZEROPAD,
};

/*
 * Format a struct resource as "[<type> <start>[-<end>] <attributes>]".
 *
 * The resource type is taken from the first matching IORESOURCE_* type bit
 * (io, mem, irq, dma, bus, in that order); anything else is printed as
 * "???".  With fmt[0] == 'R' and a recognised type the attribute bits are
 * decoded to words (and an IORESOURCE_UNSET resource is shown as
 * "size <n>"); otherwise the raw flags word is appended as " flags 0x...".
 *
 * The text is assembled in a local buffer and then emitted through
 * string_nocheck() so that the caller's spec applies to the whole string.
 */
static noinline_for_stack
char *resource_string(char *buf, char *end, struct resource *res,
                      struct printf_spec spec, const char *fmt)
{
#ifndef IO_RSRC_PRINTK_SIZE
#define IO_RSRC_PRINTK_SIZE        6
#endif

#ifndef MEM_RSRC_PRINTK_SIZE
#define MEM_RSRC_PRINTK_SIZE        10
#endif
        static const struct printf_spec io_spec = {
                .base = 16,
                .field_width = IO_RSRC_PRINTK_SIZE,
                .precision = -1,
                .flags = SPECIAL | SMALL | ZEROPAD,
        };
        static const struct printf_spec mem_spec = {
                .base = 16,
                .field_width = MEM_RSRC_PRINTK_SIZE,
                .precision = -1,
                .flags = SPECIAL | SMALL | ZEROPAD,
        };
        static const struct printf_spec bus_spec = {
                .base = 16,
                .field_width = 2,
                .precision = -1,
                .flags = SMALL | ZEROPAD,
        };
        static const struct printf_spec str_spec = {
                .field_width = -1,
                .precision = 10,
                .flags = LEFT,
        };
        /* Resource types, tested in this order; the first hit wins. */
        static const struct {
                unsigned long bit;
                const char *label;
                const struct printf_spec *numspec;
        } kinds[] = {
                { IORESOURCE_IO,  "io  ", &io_spec },
                { IORESOURCE_MEM, "mem ", &mem_spec },
                { IORESOURCE_IRQ, "irq ", &default_dec_spec },
                { IORESOURCE_DMA, "dma ", &default_dec_spec },
                { IORESOURCE_BUS, "bus ", &bus_spec },
        };
        /* Attribute words appended, in this order, when decoding. */
        static const struct {
                unsigned long bit;
                const char *word;
        } attrs[] = {
                { IORESOURCE_MEM_64,   " 64bit" },
                { IORESOURCE_PREFETCH, " pref" },
                { IORESOURCE_WINDOW,   " window" },
                { IORESOURCE_DISABLED, " disabled" },
        };

        /* 32-bit res (sizeof==4): 10 chars in dec, 10 in hex ("0x" + 8)
         * 64-bit res (sizeof==8): 20 chars in dec, 18 in hex ("0x" + 16) */
#define RSRC_BUF_SIZE                ((2 * sizeof(resource_size_t)) + 4)
#define FLAG_BUF_SIZE                (2 * sizeof(res->flags))
#define DECODED_BUF_SIZE        sizeof("[mem - 64bit pref window disabled]")
#define RAW_BUF_SIZE                sizeof("[mem - flags 0x]")
        char sym[max(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE,
                     2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)];

        char *out = sym;
        char *const out_end = sym + sizeof(sym);
        /* Defaults describe an unrecognised resource type. */
        const struct printf_spec *numspec = &mem_spec;
        const char *label = "??? ";
        bool decode = false;
        unsigned int k;

        if (check_pointer(&buf, end, res, spec))
                return buf;

        for (k = 0; k < ARRAY_SIZE(kinds); k++) {
                if (!(res->flags & kinds[k].bit))
                        continue;
                label = kinds[k].label;
                numspec = kinds[k].numspec;
                /* Decoding is only ever done for a recognised type. */
                decode = (fmt[0] == 'R');
                break;
        }

        *out++ = '[';
        out = string_nocheck(out, out_end, label, str_spec);

        if (decode && (res->flags & IORESOURCE_UNSET)) {
                out = string_nocheck(out, out_end, "size ", str_spec);
                out = number(out, out_end, resource_size(res), *numspec);
        } else {
                out = number(out, out_end, res->start, *numspec);
                if (res->start != res->end) {
                        *out++ = '-';
                        out = number(out, out_end, res->end, *numspec);
                }
        }

        if (!decode) {
                out = string_nocheck(out, out_end, " flags ", str_spec);
                out = number(out, out_end, res->flags, default_flag_spec);
        } else {
                for (k = 0; k < ARRAY_SIZE(attrs); k++) {
                        if (res->flags & attrs[k].bit)
                                out = string_nocheck(out, out_end,
                                                     attrs[k].word, str_spec);
                }
        }

        *out++ = ']';
        *out = '\0';

        return string_nocheck(buf, end, sym, spec);
}

/*
 * Dump a small byte buffer as hex pairs.
 *
 * The number of bytes comes from spec.field_width: zero prints nothing, a
 * negative value falls back to a single byte, and positive values are
 * capped at 64.  fmt[1] selects the separator between bytes: 'C' gives
 * ':', 'D' gives '-', 'N' gives none, anything else a space.
 */
static noinline_for_stack
char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
                 const char *fmt)
{
        char separator;
        int count;
        int n;

        /* nothing to print */
        if (spec.field_width == 0)
                return buf;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] == 'C')
                separator = ':';
        else if (fmt[1] == 'D')
                separator = '-';
        else if (fmt[1] == 'N')
                separator = 0;
        else
                separator = ' ';

        /* '%ph[CDN]' without a width leaves it negative: default to 1 byte */
        count = spec.field_width > 0 ? min_t(int, spec.field_width, 64) : 1;

        for (n = 0; n < count; n++) {
                const u8 byte = addr[n];

                /* separators go between bytes, never before the first */
                if (n && separator) {
                        if (buf < end)
                                *buf = separator;
                        ++buf;
                }

                if (buf < end)
                        *buf = hex_asc_hi(byte);
                ++buf;
                if (buf < end)
                        *buf = hex_asc_lo(byte);
                ++buf;
        }

        return buf;
}

/*
 * Print a bitmap as comma-separated, zero-padded hex chunks of up to 32
 * bits each, highest-numbered chunk first.  The number of bits is taken
 * from spec.field_width (negative values are treated as 0, which prints
 * nothing).  The top chunk may be narrower than 32 bits; its width in hex
 * digits is rounded up.
 */
static noinline_for_stack
char *bitmap_string(char *buf, char *end, const unsigned long *bitmap,
                    struct printf_spec spec, const char *fmt)
{
        const int CHUNKSZ = 32;
        int nbits = max_t(int, spec.field_width, 0);
        bool need_comma = false;
        int width;
        int pos;

        if (check_pointer(&buf, end, bitmap, spec))
                return buf;

        /* from here on spec is only used to print the chunk values */
        spec = (struct printf_spec){ .flags = SMALL | ZEROPAD, .base = 16 };

        /* width of the top (possibly partial) chunk */
        width = nbits & (CHUNKSZ - 1);
        if (!width)
                width = CHUNKSZ;

        for (pos = ALIGN(nbits, CHUNKSZ) - CHUNKSZ; pos >= 0; pos -= CHUNKSZ) {
                const u32 mask = ((1ULL << width) - 1);
                const int word = pos / BITS_PER_LONG;
                const int bit = pos % BITS_PER_LONG;
                const u32 val = (bitmap[word] >> bit) & mask;

                if (need_comma) {
                        if (buf < end)
                                *buf = ',';
                        buf++;
                }
                need_comma = true;

                spec.field_width = DIV_ROUND_UP(width, 4);
                buf = number(buf, end, val, spec);

                /* every chunk after the first is a full one */
                width = CHUNKSZ;
        }
        return buf;
}

static noinline_for_stack
char *bitmap_list_string(char *buf, char *end, const unsigned long *bitmap,
                         struct printf_spec spec, const char *fmt)
{
        int nr_bits = max_t(int, spec.field_width, 0);
        bool first = true;
        int rbot, rtop;

        if (check_pointer(&buf, end, bitmap, spec))
                return buf;

        for_each_set_bitrange(rbot, rtop, bitmap, nr_bits) {
                if (!first) {
                        if (buf < end)
                                *buf = ',';
                        buf++;
                }
                first = false;

                buf = number(buf, end, rbot, default_dec_spec);
                if (rtop == rbot + 1)
                        continue;

                if (buf < end)
                        *buf = '-';
                buf = number(++buf, end, rtop - 1, default_dec_spec);
        }
        return buf;
}

/*
 * Format six bytes as a MAC address.
 *
 * Separators are only emitted when fmt[0] is 'M'.  fmt[1] == 'F' selects
 * '-' as the separator (':' otherwise) and fmt[1] == 'R' prints the bytes
 * in reverse order.
 */
static noinline_for_stack
char *mac_address_string(char *buf, char *end, u8 *addr,
                         struct printf_spec spec, const char *fmt)
{
        char mac_addr[sizeof("xx:xx:xx:xx:xx:xx")];
        char *out = mac_addr;
        bool reversed, with_separators;
        char separator;
        int n;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        reversed = (fmt[1] == 'R');
        separator = (fmt[1] == 'F') ? '-' : ':';
        with_separators = (fmt[0] == 'M');

        for (n = 0; n < 6; n++) {
                /* separators sit between bytes only */
                if (n && with_separators)
                        *out++ = separator;
                out = hex_byte_pack(out, addr[reversed ? 5 - n : n]);
        }
        *out = '\0';

        return string_nocheck(buf, end, mac_addr, spec);
}

static noinline_for_stack
char *ip4_string(char *p, const u8 *addr, const char *fmt)
{
        int i;
        bool leading_zeros = (fmt[0] == 'i');
        int index;
        int step;

        switch (fmt[2]) {
        case 'h':
#ifdef __BIG_ENDIAN
                index = 0;
                step = 1;
#else
                index = 3;
                step = -1;
#endif
                break;
        case 'l':
                index = 3;
                step = -1;
                break;
        case 'n':
        case 'b':
        default:
                index = 0;
                step = 1;
                break;
        }
        for (i = 0; i < 4; i++) {
                char temp[4] __aligned(2);        /* hold each IP quad in reverse order */
                int digits = put_dec_trunc8(temp, addr[index]) - temp;
                if (leading_zeros) {
                        if (digits < 3)
                                *p++ = '0';
                        if (digits < 2)
                                *p++ = '0';
                }
                /* reverse the digits in the quad */
                while (digits--)
                        *p++ = temp[digits];
                if (i < 3)
                        *p++ = '.';
                index += step;
        }
        *p = '\0';

        return p;
}

/*
 * Write the 16-byte IPv6 address at @addr into @p in compressed notation
 * and NUL-terminate it.  Returns a pointer to the terminating NUL.
 *
 * The longest run of two or more all-zero 16-bit groups (the first such
 * run on a tie) is replaced by "::" and leading zeros inside a group are
 * dropped.  For v4-mapped and ISATAP addresses only the first six groups
 * are printed in hex; the last four bytes follow as a dotted quad.
 *
 * No bounds are checked; the caller provides a large enough buffer.
 */
static noinline_for_stack
char *ip6_compressed_string(char *p, const char *addr)
{
        unsigned char zerolength[8] = { 0 };
        struct in6_addr in6;
        bool needcolon = false;
        bool useIPv4;
        int longest = 1;
        int colonpos = -1;
        int range;
        int i;

        memcpy(&in6, addr, sizeof(struct in6_addr));

        useIPv4 = ipv6_addr_v4mapped(&in6) || ipv6_addr_is_isatap(&in6);
        range = useIPv4 ? 6 : 8;

        /*
         * zerolength[i] is the number of consecutive zero groups starting
         * at group i (within the first 'range' groups).  Scanning from the
         * back lets each entry build on its successor.
         */
        for (i = range - 1; i >= 0; i--) {
                if (in6.s6_addr16[i] != 0)
                        continue;
                zerolength[i] = 1;
                if (i + 1 < range)
                        zerolength[i] += zerolength[i + 1];
        }

        /*
         * Pick the first longest run; since 'longest' starts at 1, a lone
         * zero group never qualifies and colonpos stays at -1.
         */
        for (i = 0; i < range; i++) {
                if (zerolength[i] > longest) {
                        longest = zerolength[i];
                        colonpos = i;
                }
        }

        /* emit address */
        i = 0;
        while (i < range) {
                u16 word;
                u8 hi, lo;

                if (i == colonpos) {
                        if (needcolon || i == 0)
                                *p++ = ':';
                        *p++ = ':';
                        needcolon = false;
                        i += longest;
                        continue;
                }

                if (needcolon) {
                        *p++ = ':';
                        needcolon = false;
                }

                /* hex u16 without leading 0s */
                word = ntohs(in6.s6_addr16[i]);
                hi = word >> 8;
                lo = word & 0xff;

                if (hi > 0x0f)
                        p = hex_byte_pack(p, hi);
                else if (hi)
                        *p++ = hex_asc_lo(hi);

                /* the low byte is printed in full once anything precedes it */
                if (hi || lo > 0x0f)
                        p = hex_byte_pack(p, lo);
                else
                        *p++ = hex_asc_lo(lo);

                needcolon = true;
                i++;
        }

        if (useIPv4) {
                if (needcolon)
                        *p++ = ':';
                p = ip4_string(p, &in6.s6_addr[12], "I4");
        }
        *p = '\0';

        return p;
}

static noinline_for_stack
char *ip6_string(char *p, const char *addr, const char *fmt)
{
        int i;

        for (i = 0; i < 8; i++) {
                p = hex_byte_pack(p, *addr++);
                p = hex_byte_pack(p, *addr++);
                if (fmt[0] == 'I' && i != 7)
                        *p++ = ':';
        }
        *p = '\0';

        return p;
}

static noinline_for_stack
char *ip6_addr_string(char *buf, char *end, const u8 *addr,
                      struct printf_spec spec, const char *fmt)
{
        char ip6_addr[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255")];

        if (fmt[0] == 'I' && fmt[2] == 'c')
                ip6_compressed_string(ip6_addr, addr);
        else
                ip6_string(ip6_addr, addr, fmt);

        return string_nocheck(buf, end, ip6_addr, spec);
}

/*
 * Format a raw IPv4 address: build the dotted quad in a local buffer with
 * ip4_string() and emit it under the caller's spec.
 */
static noinline_for_stack
char *ip4_addr_string(char *buf, char *end, const u8 *addr,
                      struct printf_spec spec, const char *fmt)
{
        char quad[sizeof("255.255.255.255")];

        (void)ip4_string(quad, addr, fmt);

        return string_nocheck(buf, end, quad, spec);
}

/*
 * Format a struct sockaddr_in6.
 *
 * Modifiers after the two leading format characters: 'p' appends
 * ":<port>", 'f' appends "/<flowinfo>", 's' appends "%<scope id>" and 'c'
 * (with fmt[0] == 'I') selects the compressed address notation.  When any
 * of p/f/s is given the address itself is wrapped in '[' ... ']'.
 *
 * The numeric suffixes are printed with the caller's spec, so a field
 * width or precision in the format can make number() advance past the end
 * of the local buffer (number() never writes there but still returns the
 * advanced pointer).  Every write that follows a number() call is
 * therefore bounds-checked, and the terminator is always placed inside
 * the buffer, truncating the text in that case.
 */
static noinline_for_stack
char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa,
                         struct printf_spec spec, const char *fmt)
{
        bool have_p = false, have_s = false, have_f = false, have_c = false;
        char ip6_addr[sizeof("[xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255]") +
                      sizeof(":12345") + sizeof("/123456789") +
                      sizeof("%1234567890")];
        char *p = ip6_addr, *pend = ip6_addr + sizeof(ip6_addr);
        const u8 *addr = (const u8 *) &sa->sin6_addr;
        char fmt6[2] = { fmt[0], '6' };
        u8 off = 0;

        /* skip the two leading format characters, then collect modifiers */
        fmt++;
        while (isalpha(*++fmt)) {
                switch (*fmt) {
                case 'p':
                        have_p = true;
                        break;
                case 'f':
                        have_f = true;
                        break;
                case 's':
                        have_s = true;
                        break;
                case 'c':
                        have_c = true;
                        break;
                }
        }

        if (have_p || have_s || have_f) {
                *p = '[';
                off = 1;
        }

        /* the bare address always fits into the buffer */
        if (fmt6[0] == 'I' && have_c)
                p = ip6_compressed_string(ip6_addr + off, addr);
        else
                p = ip6_string(ip6_addr + off, addr, fmt6);

        if (have_p || have_s || have_f)
                *p++ = ']';

        if (have_p) {
                if (p < pend)
                        *p = ':';
                p++;
                p = number(p, pend, ntohs(sa->sin6_port), spec);
        }
        if (have_f) {
                if (p < pend)
                        *p = '/';
                p++;
                p = number(p, pend, ntohl(sa->sin6_flowinfo &
                                          IPV6_FLOWINFO_MASK), spec);
        }
        if (have_s) {
                if (p < pend)
                        *p = '%';
                p++;
                p = number(p, pend, sa->sin6_scope_id, spec);
        }

        /* terminate inside the buffer even if the text was truncated */
        if (p < pend)
                *p = '\0';
        else
                pend[-1] = '\0';

        return string_nocheck(buf, end, ip6_addr, spec);
}

/*
 * Format a struct sockaddr_in.
 *
 * Modifiers after the two leading format characters: 'p' appends
 * ":<port>", and 'h', 'l', 'n' or 'b' are forwarded to ip4_string() as the
 * byte-order selector.
 *
 * The port is printed with the caller's spec, so a field width or
 * precision in the format can make number() return a pointer past the end
 * of the local buffer (it never writes there, but still advances).  The
 * terminator is therefore clamped into the buffer, truncating the text in
 * that case instead of writing outside the array.
 */
static noinline_for_stack
char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa,
                         struct printf_spec spec, const char *fmt)
{
        bool have_p = false;
        char *p, ip4_addr[sizeof("255.255.255.255") + sizeof(":12345")];
        char *pend = ip4_addr + sizeof(ip4_addr);
        const u8 *addr = (const u8 *) &sa->sin_addr.s_addr;
        char fmt4[3] = { fmt[0], '4', 0 };

        /* skip the two leading format characters, then collect modifiers */
        fmt++;
        while (isalpha(*++fmt)) {
                switch (*fmt) {
                case 'p':
                        have_p = true;
                        break;
                case 'h':
                case 'l':
                case 'n':
                case 'b':
                        fmt4[2] = *fmt;
                        break;
                }
        }

        /* the dotted quad always fits, so the ':' below is in bounds */
        p = ip4_string(ip4_addr, addr, fmt4);
        if (have_p) {
                *p++ = ':';
                p = number(p, pend, ntohs(sa->sin_port), spec);
        }

        /* keep the terminator inside the buffer */
        if (p >= pend)
                p = pend - 1;
        *p = '\0';

        return string_nocheck(buf, end, ip4_addr, spec);
}

/*
 * Dispatcher for the IP address formats.  fmt[1] selects the input kind:
 * '6' and '4' take a raw address, 'S' takes a sockaddr whose sa_family
 * decides between the IPv4 and IPv6 sockaddr printers (any other family
 * prints "(einval)").  An unknown fmt[1] yields "(%pi?)" or "(%pI?)"
 * depending on fmt[0].
 */
static noinline_for_stack
char *ip_addr_string(char *buf, char *end, const void *ptr,
                     struct printf_spec spec, const char *fmt)
{
        if (check_pointer(&buf, end, ptr, spec))
                return buf;

        if (fmt[1] == '6')
                return ip6_addr_string(buf, end, ptr, spec, fmt);

        if (fmt[1] == '4')
                return ip4_addr_string(buf, end, ptr, spec, fmt);

        if (fmt[1] == 'S') {
                const union {
                        struct sockaddr                raw;
                        struct sockaddr_in        v4;
                        struct sockaddr_in6        v6;
                } *sa = ptr;

                if (sa->raw.sa_family == AF_INET)
                        return ip4_addr_string_sa(buf, end, &sa->v4, spec, fmt);
                if (sa->raw.sa_family == AF_INET6)
                        return ip6_addr_string_sa(buf, end, &sa->v6, spec, fmt);

                return error_string(buf, end, "(einval)", spec);
        }

        if (fmt[0] == 'i')
                return error_string(buf, end, "(%pi?)", spec);

        return error_string(buf, end, "(%pI?)", spec);
}

/*
 * Print a byte buffer with escaping applied by string_escape_mem().
 *
 * The input length is spec.field_width: zero prints nothing and a negative
 * value means a single byte.  The characters following fmt[0] select
 * escape classes ('a', 'c', 'h', 'n', 'o', 'p', 's'); parsing stops at the
 * first character that is not one of these.  With no class selected
 * ESCAPE_ANY_NP is used.
 */
static noinline_for_stack
char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
                     const char *fmt)
{
        unsigned int flags = 0;
        const char *opt;
        size_t room;
        int len;

        if (spec.field_width == 0)
                return buf;                                /* nothing to print */

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        for (opt = fmt + 1; ; opt++) {
                switch (*opt) {
                case 'a':
                        flags |= ESCAPE_ANY;
                        continue;
                case 'c':
                        flags |= ESCAPE_SPECIAL;
                        continue;
                case 'h':
                        flags |= ESCAPE_HEX;
                        continue;
                case 'n':
                        flags |= ESCAPE_NULL;
                        continue;
                case 'o':
                        flags |= ESCAPE_OCTAL;
                        continue;
                case 'p':
                        flags |= ESCAPE_NP;
                        continue;
                case 's':
                        flags |= ESCAPE_SPACE;
                        continue;
                }
                /* not a known class character: end of the modifier list */
                break;
        }

        if (!flags)
                flags = ESCAPE_ANY_NP;

        len = spec.field_width < 0 ? 1 : spec.field_width;

        /*
         * string_escape_mem() writes as many characters as it can to
         * the given buffer, and returns the total size of the output
         * had the buffer been big enough.
         */
        room = buf < end ? end - buf : 0;
        buf += string_escape_mem(addr, len, buf, room, flags, NULL);

        return buf;
}

static char *va_format(char *buf, char *end, struct va_format *va_fmt,
                       struct printf_spec spec, const char *fmt)
{
        va_list va;

        if (check_pointer(&buf, end, va_fmt, spec))
                return buf;

        va_copy(va, *va_fmt->va);
        buf += vsnprintf(buf, end > buf ? end - buf : 0, va_fmt->fmt, va);
        va_end(va);

        return buf;
}

/*
 * Format 16 bytes as a UUID/GUID string with dashes after the 4th, 6th,
 * 8th and 10th byte.
 *
 * fmt[1] selects the variant: 'l' and 'L' read the bytes through
 * guid_index instead of uuid_index, while 'L' and 'B' print upper-case
 * hex digits.
 */
static noinline_for_stack
char *uuid_string(char *buf, char *end, const u8 *addr,
                  struct printf_spec spec, const char *fmt)
{
        char uuid[UUID_STRING_LEN + 1];
        char *out = uuid;
        const u8 *order;
        bool upper;
        int n;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        upper = (fmt[1] == 'L' || fmt[1] == 'B');
        order = (fmt[1] == 'L' || fmt[1] == 'l') ? guid_index : uuid_index;

        for (n = 0; n < 16; n++) {
                const u8 byte = addr[order[n]];

                out = upper ? hex_byte_pack_upper(out, byte)
                            : hex_byte_pack(out, byte);

                if (n == 3 || n == 5 || n == 7 || n == 9)
                        *out++ = '-';
        }
        *out = 0;

        return string_nocheck(buf, end, uuid, spec);
}

/*
 * Print network-device bit fields.  Only fmt[1] == 'F' is handled: the
 * netdev_features_t that @addr points to is printed via
 * special_hex_number().  Any other fmt[1] yields "(%pN?)".
 */
static noinline_for_stack
char *netdev_bits(char *buf, char *end, const void *addr,
                  struct printf_spec spec,  const char *fmt)
{
        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] != 'F')
                return error_string(buf, end, "(%pN?)", spec);

        return special_hex_number(buf, end, *(const netdev_features_t *)addr,
                                  sizeof(netdev_features_t));
}

/*
 * Print a FourCC code as "<4 chars> <endianness> (0x<hex>)".
 *
 * Only the "4cc" format is accepted (fmt[1] and fmt[2] must both be 'c');
 * anything else prints "(%p4?)".  Bit 31 of the value selects the
 * "big-endian" label and is masked off before the four characters are
 * extracted, lowest byte first.  The hex value shown is the original,
 * unmasked code.
 */
static noinline_for_stack
char *fourcc_string(char *buf, char *end, const u32 *fourcc,
                    struct printf_spec spec, const char *fmt)
{
        char output[sizeof("0123 little-endian (0x01234567)")];
        char *out = output;
        const char *endianness;
        unsigned int shift;
        u32 orig, val;

        if (fmt[1] != 'c' || fmt[2] != 'c')
                return error_string(buf, end, "(%p4?)", spec);

        if (check_pointer(&buf, end, fourcc, spec))
                return buf;

        orig = get_unaligned(fourcc);
        val = orig & ~BIT(31);

        for (shift = 0; shift < sizeof(u32) * 8; shift += 8) {
                const unsigned char c = val >> shift;

                /* Print non-control ASCII characters as-is, dot otherwise */
                if (isascii(c) && isprint(c))
                        *out++ = c;
                else
                        *out++ = '.';
        }

        *out++ = ' ';
        endianness = orig & BIT(31) ? "big-endian" : "little-endian";
        strcpy(out, endianness);
        out += strlen(endianness);

        *out++ = ' ';
        *out++ = '(';
        /* leave room for the closing ")" and the terminating NUL */
        out = special_hex_number(out, output + sizeof(output) - 2, orig,
                                 sizeof(u32));
        *out++ = ')';
        *out = '\0';

        return string(buf, end, output, spec);
}

/*
 * Print the address value that @addr points to.  fmt[1] == 'd' reads a
 * dma_addr_t; every other modifier (including 'p') reads a phys_addr_t.
 * The value is printed via special_hex_number() using the size of the
 * selected type.
 */
static noinline_for_stack
char *address_val(char *buf, char *end, const void *addr,
                  struct printf_spec spec, const char *fmt)
{
        unsigned long long value;
        int bytes;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] == 'd') {
                value = *(const dma_addr_t *)addr;
                bytes = sizeof(dma_addr_t);
        } else {
                value = *(const phys_addr_t *)addr;
                bytes = sizeof(phys_addr_t);
        }

        return special_hex_number(buf, end, value, bytes);
}

/*
 * Print the date part of @tm as YYYY-MM-DD.  Unless @r ("raw") is set,
 * 1900 is added to tm_year and 1 to tm_mon before printing.
 */
static noinline_for_stack
char *date_str(char *buf, char *end, const struct rtc_time *tm, bool r)
{
        const int year_offset = r ? 0 : 1900;
        const int mon_offset = r ? 0 : 1;

        buf = number(buf, end, tm->tm_year + year_offset, default_dec04_spec);
        if (buf < end)
                *buf = '-';
        ++buf;

        buf = number(buf, end, tm->tm_mon + mon_offset, default_dec02_spec);
        if (buf < end)
                *buf = '-';
        ++buf;

        buf = number(buf, end, tm->tm_mday, default_dec02_spec);

        return buf;
}

/*
 * Print the time part of @tm as HH:MM:SS, each field zero-padded to two
 * digits.  @r is accepted for symmetry with date_str() but is not used.
 */
static noinline_for_stack
char *time_str(char *buf, char *end, const struct rtc_time *tm, bool r)
{
        buf = number(buf, end, tm->tm_hour, default_dec02_spec);
        if (buf < end)
                *buf = ':';
        ++buf;

        buf = number(buf, end, tm->tm_min, default_dec02_spec);
        if (buf < end)
                *buf = ':';
        ++buf;

        buf = number(buf, end, tm->tm_sec, default_dec02_spec);

        return buf;
}

/*
 * Print a struct rtc_time as date and/or time.
 *
 * Modifiers start at fmt[2]: an optional 'd' (date only) or 't' (time
 * only), followed by any mix of 'r' (raw: no 1900/1 offsets on year and
 * month) and 's' (use a space instead of 'T' between date and time).
 * Parsing stops at the first other character.
 */
static noinline_for_stack
char *rtc_str(char *buf, char *end, const struct rtc_time *tm,
              struct printf_spec spec, const char *fmt)
{
        bool want_date = true, want_time = true;
        bool raw = false;
        char separator = 'T';
        const char *opt = fmt + 2;

        if (check_pointer(&buf, end, tm, spec))
                return buf;

        if (*opt == 'd') {
                want_time = false;
                opt++;
        } else if (*opt == 't') {
                want_date = false;
                opt++;
        }

        for (;; opt++) {
                if (*opt == 'r')
                        raw = true;
                else if (*opt == 's')
                        separator = ' ';
                else
                        break;
        }

        if (want_date) {
                buf = date_str(buf, end, tm, raw);
                if (want_time) {
                        if (buf < end)
                                *buf = separator;
                        buf++;
                }
        }
        if (want_time)
                buf = time_str(buf, end, tm, raw);

        return buf;
}

static noinline_for_stack
char *time64_str(char *buf, char *end, const time64_t time,
                 struct printf_spec spec, const char *fmt)
{
        struct rtc_time rtc_time;
        struct tm tm;

        time64_to_tm(time, 0, &tm);

        rtc_time.tm_sec = tm.tm_sec;
        rtc_time.tm_min = tm.tm_min;
        rtc_time.tm_hour = tm.tm_hour;
        rtc_time.tm_mday = tm.tm_mday;
        rtc_time.tm_mon = tm.tm_mon;
        rtc_time.tm_year = tm.tm_year;
        rtc_time.tm_wday = tm.tm_wday;
        rtc_time.tm_yday = tm.tm_yday;

        rtc_time.tm_isdst = 0;

        return rtc_str(buf, end, &rtc_time, spec, fmt);
}

/*
 * Dispatcher for the time formats.  fmt[1] selects the input type: 'R'
 * takes a struct rtc_time pointer and 'T' a pointer to a time64_t.  Any
 * other fmt[1] prints "(%pt?)".
 */
static noinline_for_stack
char *time_and_date(char *buf, char *end, void *ptr, struct printf_spec spec,
                    const char *fmt)
{
        switch (fmt[1]) {
        case 'R':
                /* rtc_str() validates the pointer itself */
                return rtc_str(buf, end, (const struct rtc_time *)ptr, spec, fmt);
        case 'T':
                /*
                 * The time64_t is read right here, so the pointer has to
                 * be validated before the dereference; otherwise a NULL
                 * or bogus argument would fault instead of printing an
                 * error string.
                 */
                if (check_pointer(&buf, end, ptr, spec))
                        return buf;
                return time64_str(buf, end, *(const time64_t *)ptr, spec, fmt);
        default:
                return error_string(buf, end, "(%pt?)", spec);
        }
}

/*
 * Print a struct clk.  Without CONFIG_HAVE_CLK the result is "(%pC?)".
 * Every modifier (fmt[1], including 'n') is handled the same way: with
 * CONFIG_COMMON_CLK the name from __clk_get_name() is printed, otherwise
 * the pointer is printed through ptr_to_id().
 */
static noinline_for_stack
char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec,
            const char *fmt)
{
        if (!IS_ENABLED(CONFIG_HAVE_CLK))
                return error_string(buf, end, "(%pC?)", spec);

        if (check_pointer(&buf, end, clk, spec))
                return buf;

        /* no modifier currently changes the output */
#ifdef CONFIG_COMMON_CLK
        return string(buf, end, __clk_get_name(clk), spec);
#else
        return ptr_to_id(buf, end, clk, spec);
#endif
}

/*
 * Print @flags symbolically using the table @names, which is terminated
 * by an entry with a NULL name.  Each entry whose mask is fully contained
 * in the remaining flags is printed by name and its bits are cleared;
 * names are joined with '|'.  Bits left over once the table is exhausted
 * are appended as a hex number.
 */
static
char *format_flags(char *buf, char *end, unsigned long flags,
                                        const struct trace_print_flags *names)
{
        const struct trace_print_flags *entry;

        for (entry = names; flags && entry->name; entry++) {
                const unsigned long mask = entry->mask;

                if ((flags & mask) == mask) {
                        buf = string(buf, end, entry->name, default_str_spec);

                        flags &= ~mask;
                        /* a separator is only needed if something follows */
                        if (flags) {
                                if (buf < end)
                                        *buf = '|';
                                buf++;
                        }
                }
        }

        if (flags)
                buf = number(buf, end, flags, default_flag_spec);

        return buf;
}

/*
 * Describes one numeric field packed into a page's flags word, as printed
 * by format_page_flags().  Note that pff[] initializes these members
 * positionally, so their order must not change.
 */
struct page_flags_fields {
        int width;                        /* a width of 0 makes format_page_flags() skip the field */
        int shift;                        /* right shift applied to the flags word */
        int mask;                        /* mask applied after shifting */
        const struct printf_spec *spec;        /* how the field value is printed */
        const char *name;                /* label printed before '=' */
};

/*
 * Fields of the page flags word that format_page_flags() prints as
 * "<name>=<value>", in output order.
 */
static const struct page_flags_fields pff[] = {
        {
                .width = SECTIONS_WIDTH,
                .shift = SECTIONS_PGSHIFT,
                .mask  = SECTIONS_MASK,
                .spec  = &default_dec_spec,
                .name  = "section",
        },
        {
                .width = NODES_WIDTH,
                .shift = NODES_PGSHIFT,
                .mask  = NODES_MASK,
                .spec  = &default_dec_spec,
                .name  = "node",
        },
        {
                .width = ZONES_WIDTH,
                .shift = ZONES_PGSHIFT,
                .mask  = ZONES_MASK,
                .spec  = &default_dec_spec,
                .name  = "zone",
        },
        {
                .width = LAST_CPUPID_WIDTH,
                .shift = LAST_CPUPID_PGSHIFT,
                .mask  = LAST_CPUPID_MASK,
                .spec  = &default_flag_spec,
                .name  = "lastcpupid",
        },
        {
                .width = KASAN_TAG_WIDTH,
                .shift = KASAN_TAG_PGSHIFT,
                .mask  = KASAN_TAG_MASK,
                .spec  = &default_flag_spec,
                .name  = "kasantag",
        },
};

static
char *format_page_flags(char *buf, char *end, unsigned long flags)
{
        unsigned long main_flags = flags & PAGEFLAGS_MASK;
        bool append = false;
        int i;

        buf = number(buf, end, flags, default_flag_spec);
        if (buf < end)
                *buf = '(';
        buf++;

        /* Page flags from the main area. */
        if (main_flags) {
                buf = format_flags(buf, end, main_flags, pageflag_names);
                append = true;
        }

        /* Page flags from the fields area */
        for (i = 0; i < ARRAY_SIZE(pff); i++) {
                /* Skip undefined fields. */
                if (!pff[i].width)
                        continue;

                /* Format: Flag Name + '=' (equals sign) + Number + '|' (separator) */
                if (append) {
                        if (buf < end)
                                *buf = '|';
                        buf++;
                }

                buf = string(buf, end, pff[i].name, default_str_spec);
                if (buf < end)
                        *buf = '=';
                buf++;
                buf = number(buf, end, (flags >> pff[i].shift) & pff[i].mask,
                             *pff[i].spec);

                append = true;
        }
        if (buf < end)
                *buf = ')';
        buf++;

        return buf;
}

static
char *format_page_type(char *buf, char *end, unsigned int page_type)
{
        buf = number(buf, end, page_type, default_flag_spec);

        if (buf < end)
                *buf = '(';
        buf++;

        if (page_type_has_type(page_type))
                buf = format_flags(buf, end, ~page_type, pagetype_names);

        if (buf < end)
                *buf = ')';
        buf++;

        return buf;
}

/*
 * Dispatcher for the flag formats.  fmt[1] selects what @flags_ptr points
 * to: 'p' page flags (unsigned long), 't' page type (unsigned int), 'v'
 * VMA flags (unsigned long), 'g' gfp_t flags.  Any other fmt[1] prints
 * "(%pG?)".
 */
static noinline_for_stack
char *flags_string(char *buf, char *end, void *flags_ptr,
                   struct printf_spec spec, const char *fmt)
{
        if (check_pointer(&buf, end, flags_ptr, spec))
                return buf;

        if (fmt[1] == 'p')
                return format_page_flags(buf, end, *(unsigned long *)flags_ptr);

        if (fmt[1] == 't')
                return format_page_type(buf, end, *(unsigned int *)flags_ptr);

        if (fmt[1] == 'v')
                return format_flags(buf, end, *(unsigned long *)flags_ptr,
                                    vmaflag_names);

        if (fmt[1] == 'g')
                return format_flags(buf, end,
                                    (__force unsigned long)(*(gfp_t *)flags_ptr),
                                    gfpflag_names);

        return error_string(buf, end, "(%pG?)", spec);
}

/*
 * Print the full name of @fwnode: for every node from the most distant
 * parent down to @fwnode itself, the node's name prefix followed by its
 * name.
 */
static noinline_for_stack
char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf,
                              char *end)
{
        int level = fwnode_count_parents(fwnode);

        /* Walk from the root node (highest level) down to level 0. */
        while (level >= 0) {
                struct fwnode_handle *node;

                /*
                 * Only get a reference for other nodes (i.e. parent nodes).
                 * fwnode refcount may be 0 here.
                 */
                if (level)
                        node = fwnode_get_nth_parent(fwnode, level);
                else
                        node = fwnode;

                buf = string(buf, end, fwnode_get_name_prefix(node),
                             default_str_spec);
                buf = string(buf, end, fwnode_get_name(node),
                             default_str_spec);

                /* drop the reference taken for a parent node */
                if (level)
                        fwnode_handle_put(node);

                level--;
        }

        return buf;
}

/*
 * Format a device tree node for '%pOF' and its modifiers.
 *
 * @fmt points at the character following the 'O' (pointer() passes fmt + 1),
 * so fmt[0] must be 'F'. Each following character from the set "fnpPFcC"
 * selects one component to print; every component after the first is
 * preceded by ':'. With no modifier, or when the first character after 'F'
 * is not in that set, the full name ('f') is printed.
 *
 * Components printed through str_spec have field_width reset to -1; the
 * width from @spec is applied once to the whole output by widen_string().
 */
static noinline_for_stack
char *device_node_string(char *buf, char *end, struct device_node *dn,
                         struct printf_spec spec, const char *fmt)
{
        /* Holds the four flag characters written by the 'F' case plus NUL. */
        char tbuf[sizeof("xxxx") + 1];
        const char *p;
        int ret;
        char *buf_start = buf;
        struct property *prop;
        bool has_mult, pass;

        struct printf_spec str_spec = spec;
        str_spec.field_width = -1;

        if (fmt[0] != 'F')
                return error_string(buf, end, "(%pO?)", spec);

        if (!IS_ENABLED(CONFIG_OF))
                return error_string(buf, end, "(%pOF?)", spec);

        if (check_pointer(&buf, end, dn, spec))
                return buf;

        /* simple case without any more format specifiers */
        fmt++;
        /* strcspn() > 0 means the first character is not a known modifier. */
        if (fmt[0] == '\0' || strcspn(fmt,"fnpPFcC") > 0)
                fmt = "f";

        /*
         * Consume modifiers one at a time for as long as the current
         * character is a known one; 'pass' is true from the second
         * component on and triggers the ':' separator.
         */
        for (pass = false; strspn(fmt,"fnpPFcC"); fmt++, pass = true) {
                int precision;
                if (pass) {
                        if (buf < end)
                                *buf = ':';
                        buf++;
                }

                switch (*fmt) {
                case 'f':        /* full_name */
                        buf = fwnode_full_name_string(of_fwnode_handle(dn), buf,
                                                      end);
                        break;
                case 'n':        /* name */
                        p = fwnode_get_name(of_fwnode_handle(dn));
                        /*
                         * Temporarily limit the precision so that only the
                         * part before the first '@' (or the whole string if
                         * there is none) is printed, then restore it.
                         */
                        precision = str_spec.precision;
                        str_spec.precision = strchrnul(p, '@') - p;
                        buf = string(buf, end, p, str_spec);
                        str_spec.precision = precision;
                        break;
                case 'p':        /* phandle */
                        buf = number(buf, end, (unsigned int)dn->phandle, default_dec_spec);
                        break;
                case 'P':        /* path-spec */
                        p = fwnode_get_name(of_fwnode_handle(dn));
                        /* A name with no second character is shown as "/". */
                        if (!p[1])
                                p = "/";
                        buf = string(buf, end, p, str_spec);
                        break;
                case 'F':        /* flags */
                        /* One character per flag, '-' when the flag is clear. */
                        tbuf[0] = of_node_check_flag(dn, OF_DYNAMIC) ? 'D' : '-';
                        tbuf[1] = of_node_check_flag(dn, OF_DETACHED) ? 'd' : '-';
                        tbuf[2] = of_node_check_flag(dn, OF_POPULATED) ? 'P' : '-';
                        tbuf[3] = of_node_check_flag(dn, OF_POPULATED_BUS) ? 'B' : '-';
                        tbuf[4] = 0;
                        buf = string_nocheck(buf, end, tbuf, str_spec);
                        break;
                case 'c':        /* major compatible string */
                        /* Nothing is printed if the property cannot be read. */
                        ret = of_property_read_string(dn, "compatible", &p);
                        if (!ret)
                                buf = string(buf, end, p, str_spec);
                        break;
                case 'C':        /* full compatible string */
                        /* Each entry is double-quoted; entries are comma-separated. */
                        has_mult = false;
                        of_property_for_each_string(dn, "compatible", prop, p) {
                                if (has_mult)
                                        buf = string_nocheck(buf, end, ",", str_spec);
                                buf = string_nocheck(buf, end, "\"", str_spec);
                                buf = string(buf, end, p, str_spec);
                                buf = string_nocheck(buf, end, "\"", str_spec);

                                has_mult = true;
                        }
                        break;
                default:
                        /* The loop condition only admits the characters handled above. */
                        break;
                }
        }

        /* Apply the caller's field width to everything written since buf_start. */
        return widen_string(buf, buf - buf_start, end, spec);
}

static noinline_for_stack
char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode,
                    struct printf_spec spec, const char *fmt)
{
        struct printf_spec str_spec = spec;
        char *buf_start = buf;

        str_spec.field_width = -1;

        if (*fmt != 'w')
                return error_string(buf, end, "(%pf?)", spec);

        if (check_pointer(&buf, end, fwnode, spec))
                return buf;

        fmt++;

        switch (*fmt) {
        case 'P':        /* name */
                buf = string(buf, end, fwnode_get_name(fwnode), str_spec);
                break;
        case 'f':        /* full_name */
        default:
                buf = fwnode_full_name_string(fwnode, buf, end);
                break;
        }

        return widen_string(buf, buf - buf_start, end, spec);
}

/*
 * Handler for the "no_hash_pointers" early parameter. The first call sets
 * no_hash_pointers and prints a warning banner; later calls do nothing.
 * @str is not used. Always returns 0.
 */
int __init no_hash_pointers_enable(char *str)
{
        if (!no_hash_pointers) {
                no_hash_pointers = true;

                pr_warn("**********************************************************\n");
                pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
                pr_warn("**                                                      **\n");
                pr_warn("** This system shows unhashed kernel memory addresses   **\n");
                pr_warn("** via the console, logs, and other interfaces. This    **\n");
                pr_warn("** might reduce the security of your system.            **\n");
                pr_warn("**                                                      **\n");
                pr_warn("** If you see this message and you are not debugging    **\n");
                pr_warn("** the kernel, report this immediately to your system   **\n");
                pr_warn("** administrator!                                       **\n");
                pr_warn("**                                                      **\n");
                pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
                pr_warn("**********************************************************\n");
        }

        return 0;
}
early_param("no_hash_pointers", no_hash_pointers_enable);

/*
 * Used for Rust formatting ('%pA'). pointer() calls this for '%pA' only
 * when CONFIG_RUST is enabled.
 */
char *rust_fmt_argument(char *buf, char *end, void *ptr);

/*
 * Show a '%p' thing.  A kernel extension is that the '%p' is followed
 * by an extra set of alphanumeric characters that are extended format
 * specifiers.
 *
 * Please update scripts/checkpatch.pl when adding/removing conversion
 * characters.  (Search for "check for vsprintf extension").
 *
 * Right now we handle:
 *
 * - 'S' For symbolic direct pointers (or function descriptors) with offset
 * - 's' For symbolic direct pointers (or function descriptors) without offset
 * - '[Ss]R' as above with __builtin_extract_return_addr() translation
 * - 'S[R]b' as above with module build ID (for use in backtraces)
 * - '[Ff]' %pf and %pF were obsoleted and later removed in favor of
 *            %ps and %pS. Be careful when re-using these specifiers.
 * - 'B' For backtraced symbolic direct pointers with offset
 * - 'Bb' as above with module build ID (for use in backtraces)
 * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref]
 * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201]
 * - 'b[l]' For a bitmap, the number of bits is determined by the field
 *       width which must be explicitly specified either as part of the
 *       format string '%32b[l]' or through '%*b[l]', [l] selects
 *       range-list format instead of hex format
 * - 'M' For a 6-byte MAC address, it prints the address in the
 *       usual colon-separated hex notation
 * - 'm' For a 6-byte MAC address, it prints the hex address without colons
 * - 'MF' For a 6-byte MAC FDDI address, it prints the address
 *       with a dash-separated hex notation
 * - '[mM]R' For a 6-byte MAC address, Reverse order (Bluetooth)
 * - 'I' [46] for IPv4/IPv6 addresses printed in the usual way
 *       IPv4 uses dot-separated decimal without leading 0's (1.2.3.4)
 *       IPv6 uses colon separated network-order 16 bit hex with leading 0's
 *       [S][pfs]
 *       Generic IPv4/IPv6 address (struct sockaddr *) that falls back to
 *       [4] or [6] and is able to print port [p], flowinfo [f], scope [s]
 * - 'i' [46] for 'raw' IPv4/IPv6 addresses
 *       IPv6 omits the colons (01020304...0f)
 *       IPv4 uses dot-separated decimal with leading 0's (010.123.045.006)
 *       [S][pfs]
 *       Generic IPv4/IPv6 address (struct sockaddr *) that falls back to
 *       [4] or [6] and is able to print port [p], flowinfo [f], scope [s]
 * - '[Ii][4S][hnbl]' IPv4 addresses in host, network, big or little endian order
 * - 'I[6S]c' for IPv6 addresses printed as specified by
 *       https://tools.ietf.org/html/rfc5952
 * - 'E[achnops]' For an escaped buffer, where rules are defined by combination
 *                of the following flags (see string_escape_mem() for the
 *                details):
 *                  a - ESCAPE_ANY
 *                  c - ESCAPE_SPECIAL
 *                  h - ESCAPE_HEX
 *                  n - ESCAPE_NULL
 *                  o - ESCAPE_OCTAL
 *                  p - ESCAPE_NP
 *                  s - ESCAPE_SPACE
 *                By default ESCAPE_ANY_NP is used.
 * - 'U' For a 16 byte UUID/GUID, it prints the UUID/GUID in the form
 *       "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
 *       Options for %pU are:
 *         b big endian lower case hex (default)
 *         B big endian UPPER case hex
 *         l little endian lower case hex
 *         L little endian UPPER case hex
 *           big endian output byte order is:
 *             [0][1][2][3]-[4][5]-[6][7]-[8][9]-[10][11][12][13][14][15]
 *           little endian output byte order is:
 *             [3][2][1][0]-[5][4]-[7][6]-[8][9]-[10][11][12][13][14][15]
 * - 'V' For a struct va_format which contains a format string * and va_list *,
 *       call vsnprintf(->format, *->va_list).
 *       Implements a "recursive vsnprintf".
 *       Do not use this feature without some mechanism to verify the
 *       correctness of the format string and va_list arguments.
 * - 'K' For a kernel pointer that should be hidden from unprivileged users.
 *       Use only for procfs, sysfs and similar files, not printk(); please
 *       read the documentation (path below) first.
 * - 'NF' For a netdev_features_t
 * - '4cc' V4L2 or DRM FourCC code, with endianness and raw numerical value.
 * - 'h[CDN]' For a variable-length buffer, it prints it as a hex string with
 *            a certain separator (' ' by default):
 *              C colon
 *              D dash
 *              N no separator
 *            The maximum supported length is 64 bytes of the input. Consider
 *            to use print_hex_dump() for the larger input.
 * - 'a[pd]' For address types [p] phys_addr_t, [d] dma_addr_t and derivatives
 *           (default assumed to be phys_addr_t, passed by reference)
 * - 'd[234]' For a dentry name (optionally 2-4 last components)
 * - 'D[234]' Same as 'd' but for a struct file
 * - 'g' For block_device name (gendisk + partition number)
 * - 't[RT][dt][r][s]' For time and date as represented by:
 *      R    struct rtc_time
 *      T    time64_t
 * - 'C' For a clock, it prints the name (Common Clock Framework) or address
 *       (legacy clock framework) of the clock
 * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address
 *        (legacy clock framework) of the clock
 * - 'G' For flags to be printed as a collection of symbolic strings that would
 *       construct the specific value. Supported flags given by option:
 *       p page flags (see struct page) given as pointer to unsigned long
 *       t page type given as pointer to unsigned int
 *       g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t
 *       v vma flags (VM_*) given as pointer to unsigned long
 * - 'OF[fnpPcCF]'  For a device tree object
 *                  Without any optional arguments prints the full_name
 *                  f device node full_name
 *                  n device node name
 *                  p device node phandle
 *                  P device node path spec (name + @unit)
 *                  F device node flags
 *                  c major compatible string
 *                  C full compatible string
 * - 'fw[fP]'        For a firmware node (struct fwnode_handle) pointer
 *                Without an option prints the full name of the node
 *                f full name
 *                P node name, including a possible unit address
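 * - 'x' For printing the address unmodified. Equivalent to "%lx".
 *       Please read the documentation (path below) before using!
 * - 'e' For an ERR_PTR() error pointer. A pointer that is not an error
 *       pointer is printed like plain '%p'.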
 * - '[ku]s' For a BPF/tracing related format specifier, e.g. used out of
 *           bpf_trace_printk() where [ku] prefix specifies either kernel (k)
 *           or user (u) memory to probe, and:
 *              s a string, equivalent to "%s" on direct vsnprintf() use
 *
 * ** When making changes please also update:
 *        Documentation/core-api/printk-formats.rst
 *
 * Note: The default behaviour (unadorned %p) is to hash the address,
 * rendering it useful as a unique identifier.
 *
 * There is also a '%pA' format specifier, but it is only intended to be used
 * from Rust code to format core::fmt::Arguments. Do *not* use it from C.
 * See rust/kernel/print.rs for details.
 */
static noinline_for_stack
char *pointer(const char *fmt, char *buf, char *end, void *ptr,
              struct printf_spec spec)
{
        /*
         * @fmt points at the first character after the 'p'. Dispatch on it;
         * helpers that take @fmt examine further modifier characters
         * themselves.
         */
        switch (*fmt) {
        case 'S':
        case 's':
                /* Only 'S'/'s' dereference a function descriptor; 'B' does not. */
                ptr = dereference_symbol_descriptor(ptr);
                fallthrough;
        case 'B':
                return symbol_string(buf, end, ptr, spec, fmt);
        case 'R':
        case 'r':
                return resource_string(buf, end, ptr, spec, fmt);
        case 'h':
                return hex_string(buf, end, ptr, spec, fmt);
        case 'b':
                /* 'bl' selects the range-list form, anything else the hex form. */
                switch (fmt[1]) {
                case 'l':
                        return bitmap_list_string(buf, end, ptr, spec, fmt);
                default:
                        return bitmap_string(buf, end, ptr, spec, fmt);
                }
        case 'M':                        /* Colon separated: 00:01:02:03:04:05 */
        case 'm':                        /* Contiguous: 000102030405 */
                                        /* [mM]F (FDDI) */
                                        /* [mM]R (Reverse order; Bluetooth) */
                return mac_address_string(buf, end, ptr, spec, fmt);
        case 'I':                        /* Formatted IP supported
                                         * 4:        1.2.3.4
                                         * 6:        0001:0203:...:0708
                                         * 6c:        1::708 or 1::1.2.3.4
                                         */
        case 'i':                        /* Contiguous:
                                         * 4:        001.002.003.004
                                         * 6:   000102...0f
                                         */
                return ip_addr_string(buf, end, ptr, spec, fmt);
        case 'E':
                return escaped_string(buf, end, ptr, spec, fmt);
        case 'U':
                return uuid_string(buf, end, ptr, spec, fmt);
        case 'V':
                return va_format(buf, end, ptr, spec, fmt);
        case 'K':
                return restricted_pointer(buf, end, ptr, spec);
        case 'N':
                return netdev_bits(buf, end, ptr, spec, fmt);
        case '4':
                return fourcc_string(buf, end, ptr, spec, fmt);
        case 'a':
                return address_val(buf, end, ptr, spec, fmt);
        case 'd':
                return dentry_name(buf, end, ptr, spec, fmt);
        case 't':
                return time_and_date(buf, end, ptr, spec, fmt);
        case 'C':
                return clock(buf, end, ptr, spec, fmt);
        case 'D':
                return file_dentry_name(buf, end, ptr, spec, fmt);
#ifdef CONFIG_BLOCK
        /* Without CONFIG_BLOCK, 'g' falls to the default case below. */
        case 'g':
                return bdev_name(buf, end, ptr, spec, fmt);
#endif

        case 'G':
                return flags_string(buf, end, ptr, spec, fmt);
        case 'O':
                /* Skip the 'O'; device_node_string() checks the next character. */
                return device_node_string(buf, end, ptr, spec, fmt + 1);
        case 'f':
                /* Skip the 'f'; fwnode_string() checks the next character. */
                return fwnode_string(buf, end, ptr, spec, fmt + 1);
        case 'A':
                /* '%pA' is reserved for Rust code; warn once when Rust is off. */
                if (!IS_ENABLED(CONFIG_RUST)) {
                        WARN_ONCE(1, "Please remove %%pA from non-Rust code\n");
                        return error_string(buf, end, "(%pA?)", spec);
                }
                return rust_fmt_argument(buf, end, ptr);
        case 'x':
                return pointer_string(buf, end, ptr, spec);
        case 'e':
                /* %pe with a non-ERR_PTR gets treated as plain %p */
                if (!IS_ERR(ptr))
                        return default_pointer(buf, end, ptr, spec);
                return err_ptr(buf, end, ptr, spec);
        case 'u':
        case 'k':
                /* Only '%pus' and '%pks' are valid; both print @ptr as a string. */
                switch (fmt[1]) {
                case 's':
                        return string(buf, end, ptr, spec);
                default:
                        return error_string(buf, end, "(einval)", spec);
                }
        default:
                /* Unadorned '%p' and unrecognised modifiers. */
                return default_pointer(buf, end, ptr, spec);
        }
}

/*
 * Helper function to decode printf style format.
 * Each call decodes one token from the format and returns the
 * number of characters read (or likely the delta where it wants
 * to go on the next call).
 * The decoded token is returned through @spec.
 *
 * 'h', 'l', or 'L' for integer fields
 * 'z' support added 23/7/1999 S.H.
 * 'z' changed to 'Z' --davidm 1/25/99
 * 'Z' changed to 'z' --adobriyan 2017-01-25
 * 't' added for ptrdiff_t
 *
 * @fmt: the format string
 * @spec: receives the decoded token:
 *   type: type of the token returned
 *   flags: various flags such as +, -, # tokens..
 *   field_width: overwritten width
 *   base: base of the number (octal, hex, ...)
 *   precision: precision of a number
 *
 * The length qualifier of a number (long, size_t, ...) is not stored
 * separately; it is folded into the integer type written to spec->type.
 *
 * A '*' width or precision ends the call early with type
 * FORMAT_TYPE_WIDTH or FORMAT_TYPE_PRECISION. The caller stores the
 * argument in @spec and calls again with the same @spec, and decoding
 * resumes at the precision or qualifier step respectively.
 */
static noinline_for_stack
int format_decode(const char *fmt, struct printf_spec *spec)
{
        const char *start = fmt;
        char qualifier;

        /* we finished early by reading the field width */
        if (spec->type == FORMAT_TYPE_WIDTH) {
                /* A negative '*' width means left-justify with the absolute width. */
                if (spec->field_width < 0) {
                        spec->field_width = -spec->field_width;
                        spec->flags |= LEFT;
                }
                spec->type = FORMAT_TYPE_NONE;
                goto precision;
        }

        /* we finished early by reading the precision */
        if (spec->type == FORMAT_TYPE_PRECISION) {
                /* A negative '*' precision is treated as 0. */
                if (spec->precision < 0)
                        spec->precision = 0;

                spec->type = FORMAT_TYPE_NONE;
                goto qualifier;
        }

        /* By default */
        spec->type = FORMAT_TYPE_NONE;

        /* Scan forward to the next '%' or the end of the string. */
        for (; *fmt ; ++fmt) {
                if (*fmt == '%')
                        break;
        }

        /* Return the current non-format string */
        if (fmt != start || !*fmt)
                return fmt - start;

        /* Process flags */
        spec->flags = 0;

        while (1) { /* this also skips first '%' */
                bool found = true;

                ++fmt;

                switch (*fmt) {
                case '-': spec->flags |= LEFT;    break;
                case '+': spec->flags |= PLUS;    break;
                case ' ': spec->flags |= SPACE;   break;
                case '#': spec->flags |= SPECIAL; break;
                case '0': spec->flags |= ZEROPAD; break;
                default:  found = false;
                }

                if (!found)
                        break;
        }

        /* get field width */
        spec->field_width = -1;

        if (isdigit(*fmt))
                spec->field_width = skip_atoi(&fmt);
        else if (*fmt == '*') {
                /* it's the next argument */
                spec->type = FORMAT_TYPE_WIDTH;
                return ++fmt - start;
        }

precision:
        /* get the precision */
        spec->precision = -1;
        if (*fmt == '.') {
                ++fmt;
                if (isdigit(*fmt)) {
                        spec->precision = skip_atoi(&fmt);
                        if (spec->precision < 0)
                                spec->precision = 0;
                } else if (*fmt == '*') {
                        /* it's the next argument */
                        spec->type = FORMAT_TYPE_PRECISION;
                        return ++fmt - start;
                }
        }

qualifier:
        /* get the conversion qualifier */
        qualifier = 0;
        /* _tolower() makes both 'l' and 'L' match here. */
        if (*fmt == 'h' || _tolower(*fmt) == 'l' ||
            *fmt == 'z' || *fmt == 't') {
                qualifier = *fmt++;
                /* A doubled qualifier: "ll" is recorded as 'L', "hh" as 'H'. */
                if (unlikely(qualifier == *fmt)) {
                        if (qualifier == 'l') {
                                qualifier = 'L';
                                ++fmt;
                        } else if (qualifier == 'h') {
                                qualifier = 'H';
                                ++fmt;
                        }
                }
        }

        /* default base */
        spec->base = 10;
        switch (*fmt) {
        case 'c':
                spec->type = FORMAT_TYPE_CHAR;
                return ++fmt - start;

        case 's':
                spec->type = FORMAT_TYPE_STR;
                return ++fmt - start;

        case 'p':
                spec->type = FORMAT_TYPE_PTR;
                return ++fmt - start;

        case '%':
                spec->type = FORMAT_TYPE_PERCENT_CHAR;
                return ++fmt - start;

        /* integer number formats - set up the flags and "break" */
        case 'o':
                spec->base = 8;
                break;

        case 'x':
                spec->flags |= SMALL;
                fallthrough;

        case 'X':
                spec->base = 16;
                break;

        case 'd':
        case 'i':
                spec->flags |= SIGN;
                break;
        case 'u':
                break;

        case 'n':
                /*
                 * Since %n poses a greater security risk than
                 * utility, treat it as any other invalid or
                 * unsupported format specifier.
                 */
                fallthrough;

        default:
                WARN_ONCE(1, "Please remove unsupported %%%c in format string\n", *fmt);
                spec->type = FORMAT_TYPE_INVALID;
                /* The offending character itself is not consumed. */
                return fmt - start;
        }

        /*
         * Pick the integer type from the qualifier. Where the type is
         * computed by adding the SIGN flag bit to the unsigned type, the
         * BUILD_BUG_ON() checks that this yields the signed type.
         */
        if (qualifier == 'L')
                spec->type = FORMAT_TYPE_LONG_LONG;
        else if (qualifier == 'l') {
                BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG);
                spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN);
        } else if (qualifier == 'z') {
                spec->type = FORMAT_TYPE_SIZE_T;
        } else if (qualifier == 't') {
                spec->type = FORMAT_TYPE_PTRDIFF;
        } else if (qualifier == 'H') {
                BUILD_BUG_ON(FORMAT_TYPE_UBYTE + SIGN != FORMAT_TYPE_BYTE);
                spec->type = FORMAT_TYPE_UBYTE + (spec->flags & SIGN);
        } else if (qualifier == 'h') {
                BUILD_BUG_ON(FORMAT_TYPE_USHORT + SIGN != FORMAT_TYPE_SHORT);
                spec->type = FORMAT_TYPE_USHORT + (spec->flags & SIGN);
        } else {
                BUILD_BUG_ON(FORMAT_TYPE_UINT + SIGN != FORMAT_TYPE_INT);
                spec->type = FORMAT_TYPE_UINT + (spec->flags & SIGN);
        }

        return ++fmt - start;
}

/*
 * Store @width in spec->field_width. The stored value is read back; if it
 * differs from @width (presumably because the field is narrower than int -
 * confirm against struct printf_spec), warn once and store @width clamped to
 * [-FIELD_WIDTH_MAX, FIELD_WIDTH_MAX] instead.
 */
static void
set_field_width(struct printf_spec *spec, int width)
{
        spec->field_width = width;
        if (!WARN_ONCE(spec->field_width != width, "field width %d too large", width))
                return;

        spec->field_width = clamp(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX);
}

/*
 * Store @prec in spec->precision. The stored value is read back; if it
 * differs from @prec (presumably because the field is narrower than int -
 * confirm against struct printf_spec), warn once and store @prec clamped to
 * [0, PRECISION_MAX] instead.
 */
static void
set_precision(struct printf_spec *spec, int prec)
{
        spec->precision = prec;
        if (!WARN_ONCE(spec->precision != prec, "precision %d too large", prec))
                return;

        spec->precision = clamp(prec, 0, PRECISION_MAX);
}

/**
 * vsnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * This function generally follows C99 vsnprintf, but has some
 * extensions and a few limitations:
 *
 *  - ``%n`` is unsupported
 *  - ``%p*`` is handled by pointer()
 *
 * See pointer() or Documentation/core-api/printk-formats.rst for more
 * extensive description.
 *
 * **Please update the documentation in both places when making changes**
 *
 * The return value is the number of characters which would
 * be generated for the given input, excluding the trailing
 * '\0', as per ISO C99. If you want to have the exact
 * number of characters written into @buf as return value
 * (not including the trailing '\0'), use vscnprintf(). If the
 * return is greater than or equal to @size, the resulting
 * string is truncated.
 *
 * A @size larger than INT_MAX triggers a one-time warning and makes the
 * function return 0 without writing anything. Formatting stops at the
 * first invalid conversion specifier.
 *
 * If you're not already dealing with a va_list consider using snprintf().
 */
int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
{
        unsigned long long num;
        char *str, *end;
        struct printf_spec spec = {0};

        /* Reject out-of-range values early.  Large positive sizes are
           used for unknown buffer sizes. */
        if (WARN_ON_ONCE(size > INT_MAX))
                return 0;

        /* str is the write position; it may run past end, writes may not. */
        str = buf;
        end = buf + size;

        /* Make sure end is always >= buf */
        if (end < buf) {
                end = ((void *)-1);
                size = end - buf;
        }

        while (*fmt) {
                const char *old_fmt = fmt;
                int read = format_decode(fmt, &spec);

                fmt += read;

                switch (spec.type) {
                case FORMAT_TYPE_NONE: {
                        /*
                         * Literal text: copy what fits, but advance str by
                         * the full length so the return value stays exact.
                         */
                        int copy = read;
                        if (str < end) {
                                if (copy > end - str)
                                        copy = end - str;
                                memcpy(str, old_fmt, copy);
                        }
                        str += read;
                        break;
                }

                /*
                 * '*' width/precision: fetch the int argument; the next
                 * format_decode() call resumes the same conversion.
                 */
                case FORMAT_TYPE_WIDTH:
                        set_field_width(&spec, va_arg(args, int));
                        break;

                case FORMAT_TYPE_PRECISION:
                        set_precision(&spec, va_arg(args, int));
                        break;

                case FORMAT_TYPE_CHAR: {
                        char c;

                        /* Pad on the left unless left-justified ... */
                        if (!(spec.flags & LEFT)) {
                                while (--spec.field_width > 0) {
                                        if (str < end)
                                                *str = ' ';
                                        ++str;

                                }
                        }
                        c = (unsigned char) va_arg(args, int);
                        if (str < end)
                                *str = c;
                        ++str;
                        /* ... and on the right with whatever width is left. */
                        while (--spec.field_width > 0) {
                                if (str < end)
                                        *str = ' ';
                                ++str;
                        }
                        break;
                }

                case FORMAT_TYPE_STR:
                        str = string(str, end, va_arg(args, char *), spec);
                        break;

                case FORMAT_TYPE_PTR:
                        /* fmt already points past the 'p', at any modifiers. */
                        str = pointer(fmt, str, end, va_arg(args, void *),
                                      spec);
                        /* Skip the alphanumeric modifier characters after 'p'. */
                        while (isalnum(*fmt))
                                fmt++;
                        break;

                case FORMAT_TYPE_PERCENT_CHAR:
                        if (str < end)
                                *str = '%';
                        ++str;
                        break;

                case FORMAT_TYPE_INVALID:
                        /*
                         * Presumably the arguments passed gcc's type
                         * checking, but there is no safe or sane way
                         * for us to continue parsing the format and
                         * fetching from the va_list; the remaining
                         * specifiers and arguments would be out of
                         * sync.
                         */
                        goto out;

                default:
                        /*
                         * Integer conversions: fetch the argument with the
                         * type that format_decode() selected, widen it to
                         * unsigned long long and let number() print it.
                         */
                        switch (spec.type) {
                        case FORMAT_TYPE_LONG_LONG:
                                num = va_arg(args, long long);
                                break;
                        case FORMAT_TYPE_ULONG:
                                num = va_arg(args, unsigned long);
                                break;
                        case FORMAT_TYPE_LONG:
                                num = va_arg(args, long);
                                break;
                        case FORMAT_TYPE_SIZE_T:
                                if (spec.flags & SIGN)
                                        num = va_arg(args, ssize_t);
                                else
                                        num = va_arg(args, size_t);
                                break;
                        case FORMAT_TYPE_PTRDIFF:
                                num = va_arg(args, ptrdiff_t);
                                break;
                        /* Types narrower than int are passed as int by the caller. */
                        case FORMAT_TYPE_UBYTE:
                                num = (unsigned char) va_arg(args, int);
                                break;
                        case FORMAT_TYPE_BYTE:
                                num = (signed char) va_arg(args, int);
                                break;
                        case FORMAT_TYPE_USHORT:
                                num = (unsigned short) va_arg(args, int);
                                break;
                        case FORMAT_TYPE_SHORT:
                                num = (short) va_arg(args, int);
                                break;
                        case FORMAT_TYPE_INT:
                                num = (int) va_arg(args, int);
                                break;
                        default:
                                num = va_arg(args, unsigned int);
                        }

                        str = number(str, end, num, spec);
                }
        }

out:
        /*
         * Terminate the output. If it was truncated, the terminator
         * replaces the last byte of the buffer.
         */
        if (size > 0) {
                if (str < end)
                        *str = '\0';
                else
                        end[-1] = '\0';
        }

        /* the trailing null byte doesn't count towards the total */
        return str-buf;

}
EXPORT_SYMBOL(vsnprintf);

/**
 * vscnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * Unlike vsnprintf(), the return value is the number of characters actually
 * written into @buf, not including the trailing '\0'. When the output is
 * truncated this is @size - 1. If @size is == 0 the function returns 0
 * without calling vsnprintf().
 *
 * If you're not already dealing with a va_list consider using scnprintf().
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
{
        int len;

        if (unlikely(!size))
                return 0;

        len = vsnprintf(buf, size, fmt, args);

        /* Truncated: everything but the terminator's slot was filled. */
        if (unlikely(len >= size))
                return size - 1;

        return len;
}
EXPORT_SYMBOL(vscnprintf);

/**
 * snprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * Variadic front end for vsnprintf(). The return value is the number of
 * characters which would be generated for the given input, excluding the
 * trailing null, as per ISO C99. A return value greater than or equal to
 * @size means the resulting string was truncated.
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int snprintf(char *buf, size_t size, const char *fmt, ...)
{
        va_list ap;
        int len;

        va_start(ap, fmt);
        len = vsnprintf(buf, size, fmt, ap);
        va_end(ap);

        return len;
}
EXPORT_SYMBOL(snprintf);

/**
 * scnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * Variadic front end for vscnprintf(). The return value is the number of
 * characters written into @buf not including the trailing '\0'. If @size
 * is == 0 the function returns 0.
 */
int scnprintf(char *buf, size_t size, const char *fmt, ...)
{
        va_list ap;
        int written;

        va_start(ap, fmt);
        written = vscnprintf(buf, size, fmt, ap);
        va_end(ap);

        return written;
}
EXPORT_SYMBOL(scnprintf);

/**
 * vsprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * No buffer size is taken from the caller; the output is only limited to
 * INT_MAX bytes. The function returns the number of characters written
 * into @buf. Use vsnprintf() or vscnprintf() in order to avoid
 * buffer overflows.
 *
 * If you're not already dealing with a va_list consider using sprintf().
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int vsprintf(char *buf, const char *fmt, va_list args)
{
        /* The caller gives no bound, so use the largest one vsnprintf() takes. */
        const size_t unbounded = INT_MAX;

        return vsnprintf(buf, unbounded, fmt, args);
}
EXPORT_SYMBOL(vsprintf);

/**
 * sprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * Variadic front end for vsprintf(). The function returns the number of
 * characters written into @buf. Use snprintf() or scnprintf() in order to
 * avoid buffer overflows.
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int sprintf(char *buf, const char *fmt, ...)
{
        va_list ap;
        int len;

        va_start(ap, fmt);
        /* vsprintf() is vsnprintf() with an INT_MAX size limit. */
        len = vsprintf(buf, fmt, ap);
        va_end(ap);

        return len;
}
EXPORT_SYMBOL(sprintf);

#ifdef CONFIG_BINARY_PRINTF
/*
 * bprintf service:
 * vbin_printf() - VA arguments to binary data
 * bstr_printf() - Binary data to text string
 */

/**
 * vbin_printf - Parse a format string and place args' binary value in a buffer
 * @bin_buf: The buffer to place args' binary value
 * @size: The size of the buffer(by words(32bits), not characters)
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * The format follows C99 vsnprintf, except %n is ignored, and its argument
 * is skipped.
 *
 * The return value is the number of words(32bits) which would be generated for
 * the given input.
 *
 * NOTE:
 * If the return value is greater than @size, the resulting bin_buf is NOT
 * valid for bstr_printf().
 */
int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args)
{
        struct printf_spec spec = {0};
        char *str, *end;
        int width;

        /* str is a byte cursor into bin_buf; end is one past its last word. */
        str = (char *)bin_buf;
        end = (char *)(bin_buf + size);

/*
 * save_arg(type): fetch the next variadic argument and store it at the
 * (aligned) cursor if it fits in front of 'end'. The cursor is advanced by
 * sizeof(type) whether or not the store happened, so the final cursor
 * reflects the space that would have been needed. Evaluates to the fetched
 * value.
 *
 * 8-byte types are fetched as unsigned long long, aligned only to
 * sizeof(u32) and stored as two u32 halves. Every other type is fetched as
 * int and stored with its own size and alignment.
 */
#define save_arg(type)                                                        \
({                                                                        \
        unsigned long long value;                                        \
        if (sizeof(type) == 8) {                                        \
                unsigned long long val8;                                \
                str = PTR_ALIGN(str, sizeof(u32));                        \
                val8 = va_arg(args, unsigned long long);                \
                if (str + sizeof(type) <= end) {                        \
                        *(u32 *)str = *(u32 *)&val8;                        \
                        *(u32 *)(str + 4) = *((u32 *)&val8 + 1);        \
                }                                                        \
                value = val8;                                                \
        } else {                                                        \
                unsigned int val4;                                        \
                str = PTR_ALIGN(str, sizeof(type));                        \
                val4 = va_arg(args, int);                                \
                if (str + sizeof(type) <= end)                                \
                        *(typeof(type) *)str = (type)(long)val4;        \
                value = (unsigned long long)val4;                        \
        }                                                                \
        str += sizeof(type);                                                \
        value;                                                                \
})

        while (*fmt) {
                int read = format_decode(fmt, &spec);

                fmt += read;

                switch (spec.type) {
                /* Literal text and "%%" consume no argument. */
                case FORMAT_TYPE_NONE:
                case FORMAT_TYPE_PERCENT_CHAR:
                        break;
                case FORMAT_TYPE_INVALID:
                        goto out;

                /* '*' width or precision: the int argument is saved too. */
                case FORMAT_TYPE_WIDTH:
                case FORMAT_TYPE_PRECISION:
                        width = (int)save_arg(int);
                        /* Pointers may require the width */
                        if (*fmt == 'p')
                                set_field_width(&spec, width);
                        break;

                case FORMAT_TYPE_CHAR:
                        save_arg(char);
                        break;

                case FORMAT_TYPE_STR: {
                        const char *save_str = va_arg(args, char *);
                        const char *err_msg;
                        size_t len;

                        /* Store the message instead when the check reports one. */
                        err_msg = check_pointer_msg(save_str);
                        if (err_msg)
                                save_str = err_msg;

                        /*
                         * The string contents, terminator included, are
                         * copied into the buffer. The cursor advances even
                         * when the copy is skipped for lack of room.
                         */
                        len = strlen(save_str) + 1;
                        if (str + len < end)
                                memcpy(str, save_str, len);
                        str += len;
                        break;
                }

                case FORMAT_TYPE_PTR:
                        /* Dereferenced pointers must be done now */
                        switch (*fmt) {
                        /* Dereference of functions is still OK */
                        case 'S':
                        case 's':
                        case 'x':
                        case 'K':
                        case 'e':
                                save_arg(void *);
                                break;
                        default:
                                /* Plain %p: only the pointer value is saved. */
                                if (!isalnum(*fmt)) {
                                        save_arg(void *);
                                        break;
                                }
                                /*
                                 * Any other suffix: render the text now with
                                 * pointer() and keep it as a NUL-terminated
                                 * string in the buffer.
                                 */
                                str = pointer(fmt, str, end, va_arg(args, void *),
                                              spec);
                                if (str + 1 < end)
                                        *str++ = '\0';
                                else
                                        end[-1] = '\0'; /* Must be nul terminated */
                        }
                        /* skip all alphanumeric pointer suffixes */
                        while (isalnum(*fmt))
                                fmt++;
                        break;

                /* Everything else is an integer conversion of some width. */
                default:
                        switch (spec.type) {

                        case FORMAT_TYPE_LONG_LONG:
                                save_arg(long long);
                                break;
                        case FORMAT_TYPE_ULONG:
                        case FORMAT_TYPE_LONG:
                                save_arg(unsigned long);
                                break;
                        case FORMAT_TYPE_SIZE_T:
                                save_arg(size_t);
                                break;
                        case FORMAT_TYPE_PTRDIFF:
                                save_arg(ptrdiff_t);
                                break;
                        case FORMAT_TYPE_UBYTE:
                        case FORMAT_TYPE_BYTE:
                                save_arg(char);
                                break;
                        case FORMAT_TYPE_USHORT:
                        case FORMAT_TYPE_SHORT:
                                save_arg(short);
                                break;
                        default:
                                save_arg(int);
                        }
                }
        }

out:
        /* Round the byte cursor up to a whole u32 and report it in words. */
        return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf;
#undef save_arg
}
EXPORT_SYMBOL_GPL(vbin_printf);

/**
 * bstr_printf - Format a string from binary arguments and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @bin_buf: Binary arguments for the format string
 *
 * This function is like C99 vsnprintf, but the difference is that vsnprintf
 * gets arguments from the stack, while bstr_printf gets arguments from
 * @bin_buf, which is a binary buffer that was generated by vbin_printf.
 *
 * The format follows C99 vsnprintf, but has some extensions:
 *  see vsnprintf comment for details.
 *
 * The return value is the number of characters which would
 * be generated for the given input, excluding the trailing
 * '\0', as per ISO C99. If you want to have the exact
 * number of characters written into @buf as return value
 * (not including the trailing '\0'), use vscnprintf(). If the
 * return is greater than or equal to @size, the resulting
 * string is truncated.
 */
int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
{
        struct printf_spec spec = {0};
        char *str, *end;
        /* Byte cursor over the packed arguments written by vbin_printf(). */
        const char *args = (const char *)bin_buf;

        if (WARN_ON_ONCE(size > INT_MAX))
                return 0;

        str = buf;
        end = buf + size;

/*
 * get_arg(type): read the next packed argument of the given type from
 * 'args' and advance the cursor past it. 8-byte types are aligned only to
 * sizeof(u32) and read as two u32 halves; other types use their own size
 * and alignment.
 */
#define get_arg(type)                                                        \
({                                                                        \
        typeof(type) value;                                                \
        if (sizeof(type) == 8) {                                        \
                args = PTR_ALIGN(args, sizeof(u32));                        \
                *(u32 *)&value = *(u32 *)args;                                \
                *((u32 *)&value + 1) = *(u32 *)(args + 4);                \
        } else {                                                        \
                args = PTR_ALIGN(args, sizeof(type));                        \
                value = *(typeof(type) *)args;                                \
        }                                                                \
        args += sizeof(type);                                                \
        value;                                                                \
})

        /* Make sure end is always >= buf */
        if (end < buf) {
                end = ((void *)-1);
                size = end - buf;
        }

        while (*fmt) {
                const char *old_fmt = fmt;
                int read = format_decode(fmt, &spec);

                fmt += read;

                switch (spec.type) {
                case FORMAT_TYPE_NONE: {
                        /* Literal text: copy what fits, count all of it. */
                        int copy = read;
                        if (str < end) {
                                if (copy > end - str)
                                        copy = end - str;
                                memcpy(str, old_fmt, copy);
                        }
                        str += read;
                        break;
                }

                case FORMAT_TYPE_WIDTH:
                        set_field_width(&spec, get_arg(int));
                        break;

                case FORMAT_TYPE_PRECISION:
                        set_precision(&spec, get_arg(int));
                        break;

                case FORMAT_TYPE_CHAR: {
                        char c;

                        /* Right-justified: the padding goes in front. */
                        if (!(spec.flags & LEFT)) {
                                while (--spec.field_width > 0) {
                                        if (str < end)
                                                *str = ' ';
                                        ++str;
                                }
                        }
                        c = (unsigned char) get_arg(char);
                        if (str < end)
                                *str = c;
                        ++str;
                        /* Left-justified: whatever width remains goes behind. */
                        while (--spec.field_width > 0) {
                                if (str < end)
                                        *str = ' ';
                                ++str;
                        }
                        break;
                }

                case FORMAT_TYPE_STR: {
                        /* The string itself is stored inline in the arguments. */
                        const char *str_arg = args;
                        args += strlen(str_arg) + 1;
                        str = string(str, end, (char *)str_arg, spec);
                        break;
                }

                case FORMAT_TYPE_PTR: {
                        bool process = false;
                        int copy, len;
                        /* Non function dereferences were already done */
                        switch (*fmt) {
                        case 'S':
                        case 's':
                        case 'x':
                        case 'K':
                        case 'e':
                                process = true;
                                break;
                        default:
                                if (!isalnum(*fmt)) {
                                        process = true;
                                        break;
                                }
                                /*
                                 * Pointer dereference was already processed:
                                 * vbin_printf() stored the rendered text as a
                                 * NUL-terminated string. Always consume it
                                 * and account for its length, even when @buf
                                 * is already full. Otherwise later arguments
                                 * would be read from the wrong offset and the
                                 * return value would stop reporting the
                                 * would-be length.
                                 */
                                len = strlen(args);
                                if (str < end) {
                                        copy = len;
                                        if (copy > end - str)
                                                copy = end - str;
                                        memcpy(str, args, copy);
                                }
                                str += len;
                                args += len + 1;
                        }
                        if (process)
                                str = pointer(fmt, str, end, get_arg(void *), spec);

                        /* skip all alphanumeric pointer suffixes */
                        while (isalnum(*fmt))
                                fmt++;
                        break;
                }

                case FORMAT_TYPE_PERCENT_CHAR:
                        if (str < end)
                                *str = '%';
                        ++str;
                        break;

                case FORMAT_TYPE_INVALID:
                        goto out;

                default: {
                        unsigned long long num;

                        /* Integer conversions: read back with the stored width. */
                        switch (spec.type) {

                        case FORMAT_TYPE_LONG_LONG:
                                num = get_arg(long long);
                                break;
                        case FORMAT_TYPE_ULONG:
                        case FORMAT_TYPE_LONG:
                                num = get_arg(unsigned long);
                                break;
                        case FORMAT_TYPE_SIZE_T:
                                num = get_arg(size_t);
                                break;
                        case FORMAT_TYPE_PTRDIFF:
                                num = get_arg(ptrdiff_t);
                                break;
                        case FORMAT_TYPE_UBYTE:
                                num = get_arg(unsigned char);
                                break;
                        case FORMAT_TYPE_BYTE:
                                num = get_arg(signed char);
                                break;
                        case FORMAT_TYPE_USHORT:
                                num = get_arg(unsigned short);
                                break;
                        case FORMAT_TYPE_SHORT:
                                num = get_arg(short);
                                break;
                        case FORMAT_TYPE_UINT:
                                num = get_arg(unsigned int);
                                break;
                        default:
                                num = get_arg(int);
                        }

                        str = number(str, end, num, spec);
                } /* default: */
                } /* switch(spec.type) */
        } /* while(*fmt) */

out:
        /* Terminate inside the buffer, overwriting the last byte if truncated. */
        if (size > 0) {
                if (str < end)
                        *str = '\0';
                else
                        end[-1] = '\0';
        }

#undef get_arg

        /* the trailing null byte doesn't count towards the total */
        return str - buf;
}
EXPORT_SYMBOL_GPL(bstr_printf);

/**
 * bprintf - Parse a format string and place args' binary value in a buffer
 * @bin_buf: The buffer to place args' binary value
 * @size: The size of the buffer(by words(32bits), not characters)
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * Variadic front end for vbin_printf(); the value returned is exactly what
 * vbin_printf() returns, i.e. a count of words(u32) for @bin_buf.
 */
int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...)
{
        va_list ap;
        int words;

        va_start(ap, fmt);
        words = vbin_printf(bin_buf, size, fmt, ap);
        va_end(ap);

        return words;
}
EXPORT_SYMBOL_GPL(bprintf);

#endif /* CONFIG_BINARY_PRINTF */

/**
 * vsscanf - Unformat a buffer into a list of arguments
 * @buf:        input buffer
 * @fmt:        format of buffer
 * @args:        arguments
 *
 * Return: the number of conversions that were matched and stored through
 * @args. '%n', '%%' and suppressed ('%*') conversions are not counted.
 */
int vsscanf(const char *buf, const char *fmt, va_list args)
{
        const char *str = buf;        /* read cursor in the input */
        char *next;
        char digit;
        int num = 0;                /* conversions stored so far */
        u8 qualifier;
        unsigned int base;
        union {
                long long s;
                unsigned long long u;
        } val;
        s16 field_width;        /* -1 when the conversion gives no width */
        bool is_sign;

        while (*fmt) {
                /* skip any white space in format */
                /* white space in format matches any amount of
                 * white space, including none, in the input.
                 */
                if (isspace(*fmt)) {
                        fmt = skip_spaces(++fmt);
                        str = skip_spaces(str);
                }

                /* anything that is not a conversion must match exactly */
                if (*fmt != '%' && *fmt) {
                        if (*fmt++ != *str++)
                                break;
                        continue;
                }

                if (!*fmt)
                        break;
                ++fmt;

                /* skip this conversion.
                 * advance both strings to next white space
                 */
                if (*fmt == '*') {
                        if (!*str)
                                break;
                        while (!isspace(*fmt) && *fmt != '%' && *fmt) {
                                /* '%*[' not yet supported, invalid format */
                                if (*fmt == '[')
                                        return num;
                                fmt++;
                        }
                        while (!isspace(*str) && *str)
                                str++;
                        continue;
                }

                /* get field width */
                field_width = -1;
                if (isdigit(*fmt)) {
                        field_width = skip_atoi(&fmt);
                        /* a width that is zero or negative ends the scan */
                        if (field_width <= 0)
                                break;
                }

                /* get conversion qualifier */
                qualifier = -1;
                if (*fmt == 'h' || _tolower(*fmt) == 'l' ||
                    *fmt == 'z') {
                        qualifier = *fmt++;
                        /* a doubled letter: "hh" is recorded as 'H', "ll" as 'L' */
                        if (unlikely(qualifier == *fmt)) {
                                if (qualifier == 'h') {
                                        qualifier = 'H';
                                        fmt++;
                                } else if (qualifier == 'l') {
                                        qualifier = 'L';
                                        fmt++;
                                }
                        }
                }

                if (!*fmt)
                        break;

                if (*fmt == 'n') {
                        /* return number of characters read so far */
                        *va_arg(args, int *) = str - buf;
                        ++fmt;
                        continue;
                }

                /* every remaining conversion needs input to consume */
                if (!*str)
                        break;

                base = 10;
                is_sign = false;

                switch (*fmt++) {
                case 'c':
                {
                        /*
                         * Copies field_width characters (default 1), stopping
                         * early at the end of input. No '\0' is appended.
                         */
                        char *s = (char *)va_arg(args, char*);
                        if (field_width == -1)
                                field_width = 1;
                        do {
                                *s++ = *str++;
                        } while (--field_width > 0 && *str);
                        num++;
                }
                continue;
                case 's':
                {
                        char *s = (char *)va_arg(args, char *);
                        if (field_width == -1)
                                field_width = SHRT_MAX;
                        /* first, skip leading white space in buffer */
                        str = skip_spaces(str);

                        /* now copy until next white space */
                        while (*str && !isspace(*str) && field_width--)
                                *s++ = *str++;
                        *s = '\0';
                        num++;
                }
                continue;
                /*
                 * Warning: This implementation of the '[' conversion specifier
                 * deviates from its glibc counterpart in the following ways:
                 * (1) It does NOT support ranges i.e. '-' is NOT a special
                 *     character
                 * (2) It cannot match the closing bracket ']' itself
                 * (3) A field width is required
                 * (4) '%*[' (discard matching input) is currently not supported
                 *
                 * Example usage:
                 * ret = sscanf("00:0a:95","%2[^:]:%2[^:]:%2[^:]",
                 *                buf1, buf2, buf3);
                 * if (ret < 3)
                 *    // etc..
                 */
                case '[':
                {
                        char *s = (char *)va_arg(args, char *);
                        /* one bit per possible byte value */
                        DECLARE_BITMAP(set, 256) = {0};
                        unsigned int len = 0;
                        bool negate = (*fmt == '^');

                        /* field width is required */
                        if (field_width == -1)
                                return num;

                        if (negate)
                                ++fmt;

                        /* mark every character listed up to the closing ']' */
                        for ( ; *fmt && *fmt != ']'; ++fmt, ++len)
                                __set_bit((u8)*fmt, set);

                        /* no ']' or no character set found */
                        if (!*fmt || !len)
                                return num;
                        ++fmt;

                        if (negate) {
                                bitmap_complement(set, set, 256);
                                /* exclude null '\0' byte */
                                __clear_bit(0, set);
                        }

                        /* match must be non-empty */
                        if (!test_bit((u8)*str, set))
                                return num;

                        while (test_bit((u8)*str, set) && field_width--)
                                *s++ = *str++;
                        *s = '\0';
                        ++num;
                }
                continue;
                case 'o':
                        base = 8;
                        break;
                case 'x':
                case 'X':
                        base = 16;
                        break;
                case 'i':
                        /* base 0 is handed through to the number parser */
                        base = 0;
                        fallthrough;
                case 'd':
                        is_sign = true;
                        fallthrough;
                case 'u':
                        break;
                case '%':
                        /* looking for '%' in str */
                        if (*str++ != '%')
                                return num;
                        continue;
                default:
                        /* invalid format; stop here */
                        return num;
                }

                /* have some sort of integer conversion.
                 * first, skip white space in buffer.
                 */
                str = skip_spaces(str);

                /* look at the first digit, past a leading '-' if signed */
                digit = *str;
                if (is_sign && digit == '-') {
                        /* a width of 1 leaves no room for a digit after '-' */
                        if (field_width == 1)
                                break;

                        digit = *(str + 1);
                }

                /* stop unless that character is a valid digit for the base */
                if (!digit
                    || (base == 16 && !isxdigit(digit))
                    || (base == 10 && !isdigit(digit))
                    || (base == 8 && !isodigit(digit))
                    || (base == 0 && !isdigit(digit)))
                        break;

                if (is_sign)
                        val.s = simple_strntoll(str, &next, base,
                                                field_width >= 0 ? field_width : INT_MAX);
                else
                        val.u = simple_strntoull(str, &next, base,
                                                 field_width >= 0 ? field_width : INT_MAX);

                /* store through a pointer whose type the qualifier selects */
                switch (qualifier) {
                case 'H':        /* that's 'hh' in format */
                        if (is_sign)
                                *va_arg(args, signed char *) = val.s;
                        else
                                *va_arg(args, unsigned char *) = val.u;
                        break;
                case 'h':
                        if (is_sign)
                                *va_arg(args, short *) = val.s;
                        else
                                *va_arg(args, unsigned short *) = val.u;
                        break;
                case 'l':
                        if (is_sign)
                                *va_arg(args, long *) = val.s;
                        else
                                *va_arg(args, unsigned long *) = val.u;
                        break;
                case 'L':
                        if (is_sign)
                                *va_arg(args, long long *) = val.s;
                        else
                                *va_arg(args, unsigned long long *) = val.u;
                        break;
                case 'z':
                        *va_arg(args, size_t *) = val.u;
                        break;
                default:
                        if (is_sign)
                                *va_arg(args, int *) = val.s;
                        else
                                *va_arg(args, unsigned int *) = val.u;
                        break;
                }
                num++;

                if (!next)
                        break;
                /* continue after the characters the number parser consumed */
                str = next;
        }

        return num;
}
EXPORT_SYMBOL(vsscanf);

/**
 * sscanf - Unformat a buffer into a list of arguments
 * @buf:        input buffer
 * @fmt:        formatting of buffer
 * @...:        resulting arguments
 *
 * Variadic front end for vsscanf(); returns whatever vsscanf() returns.
 */
int sscanf(const char *buf, const char *fmt, ...)
{
        va_list ap;
        int converted;

        va_start(ap, fmt);
        converted = vsscanf(buf, fmt, ap);
        va_end(ap);

        return converted;
}
EXPORT_SYMBOL(sscanf);


















































    1 
    4 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_USER_H
#define _LINUX_SCHED_USER_H

#include <linux/uidgid.h>
#include <linux/atomic.h>
#include <linux/percpu_counter.h>
#include <linux/refcount.h>
#include <linux/ratelimit.h>

/*
 * Some day this will be a full-fledged user tracking system..
 *
 * Per-user record, reference counted through __count (see get_uid() and
 * free_uid()). Several members only exist when the corresponding kernel
 * configuration options are enabled.
 */
struct user_struct {
        refcount_t __count;        /* reference count */
#ifdef CONFIG_EPOLL
        struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */
#endif
        unsigned long unix_inflight;        /* How many files in flight in unix sockets */
        atomic_long_t pipe_bufs;  /* how many pages are allocated in pipe buffers */

        /* Hash table maintenance information */
        struct hlist_node uidhash_node;
        kuid_t uid;                /* NOTE(review): presumably the lookup key used by find_user() - confirm */

        /* NOTE(review): presumably locked memory charged to this user - confirm against users */
#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
        defined(CONFIG_NET) || defined(CONFIG_IO_URING) || \
        defined(CONFIG_VFIO_PCI_ZDEV_KVM) || IS_ENABLED(CONFIG_IOMMUFD)
        atomic_long_t locked_vm;
#endif
#ifdef CONFIG_WATCH_QUEUE
        atomic_t nr_watches;        /* The number of watches this user currently has */
#endif

        /* Miscellaneous per-user rate limit */
        struct ratelimit_state ratelimit;
};

extern int uids_sysfs_init(void);

extern struct user_struct *find_user(kuid_t);

extern struct user_struct root_user;
#define INIT_USER (&root_user)


/* per-UID process charging. */
extern struct user_struct * alloc_uid(kuid_t);
static inline struct user_struct *get_uid(struct user_struct *u)
{
        refcount_inc(&u->__count);
        return u;
}
extern void free_uid(struct user_struct *);

#endif /* _LINUX_SCHED_USER_H */























































































































    2 



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/ns_common.h>
#include <linux/fs_pin.h>

/*
 * A mount namespace.  Mounts that are part of the namespace sit in the
 * @mounts red-black tree, linked through mount::mnt_node (move_from_ns()
 * erases them from it).
 */
struct mnt_namespace {
        struct ns_common        ns;        /* ns.count is the refcount bumped by get_mnt_ns() */
        struct mount *        root;
        struct rb_root                mounts; /* Protected by namespace_sem */
        struct user_namespace        *user_ns;
        struct ucounts                *ucounts;
        u64                        seq;        /* Sequence number to prevent loops; 0 means anonymous, see is_anon_ns() */
        wait_queue_head_t poll;        /* NOTE(review): presumably woken when @event changes - confirm */
        u64 event;
        unsigned int                nr_mounts; /* # of mounts in the namespace */
        unsigned int                pending_mounts;
} __randomize_layout;

/*
 * Pair of counters kept per CPU: struct mount holds a __percpu pointer to
 * one of these when CONFIG_SMP is set, and carries plain mnt_count /
 * mnt_writers integers of its own otherwise.
 */
struct mnt_pcp {
        int mnt_count;
        int mnt_writers;
};

/*
 * Bookkeeping for a dentry that is used as a mountpoint; struct mount refers
 * to one through its mnt_mp member ("where is it mounted").
 * NOTE(review): the member roles below are inferred from their names and
 * from mount::mnt_mp_list - confirm against the users of this struct.
 */
struct mountpoint {
        struct hlist_node m_hash;        /* presumably the link in a mountpoint hash table */
        struct dentry *m_dentry;        /* presumably the dentry acting as the mountpoint */
        struct hlist_head m_list;        /* presumably heads the mount::mnt_mp_list entries */
        int m_count;                        /* presumably a use count */
};

/*
 * VFS-internal representation of a mount.  The public struct vfsmount is
 * embedded as @mnt; real_mount() converts a vfsmount pointer back into the
 * enclosing struct mount.
 */
struct mount {
        struct hlist_node mnt_hash;
        struct mount *mnt_parent;        /* points to the mount itself when there is no parent, see mnt_has_parent() */
        struct dentry *mnt_mountpoint;
        struct vfsmount mnt;                /* embedded public part, see real_mount() */
        union {
                struct rcu_head mnt_rcu;
                struct llist_node mnt_llist;
        };
        /* Counters: per-CPU (struct mnt_pcp) on SMP builds, plain ints otherwise. */
#ifdef CONFIG_SMP
        struct mnt_pcp __percpu *mnt_pcp;
#else
        int mnt_count;
        int mnt_writers;
#endif
        struct list_head mnt_mounts;        /* list of children, anchored here */
        struct list_head mnt_child;        /* and going through their mnt_child */
        struct list_head mnt_instance;        /* mount instance on sb->s_mounts */
        const char *mnt_devname;        /* Name of device e.g. /dev/dsk/hda1 */
        /*
         * Shared storage: while MNT_ONRB is set the mount is linked into
         * ns->mounts through mnt_node; move_from_ns() erases it from the tree
         * and then reuses the same bytes as mnt_list.
         */
        union {
                struct rb_node mnt_node;        /* Under ns->mounts */
                struct list_head mnt_list;
        };
        struct list_head mnt_expire;        /* link in fs-specific expiry list */
        struct list_head mnt_share;        /* circular list of shared mounts */
        struct list_head mnt_slave_list;/* list of slave mounts */
        struct list_head mnt_slave;        /* slave list entry */
        struct mount *mnt_master;        /* slave is on master->mnt_slave_list */
        struct mnt_namespace *mnt_ns;        /* containing namespace */
        struct mountpoint *mnt_mp;        /* where is it mounted */
        union {
                struct hlist_node mnt_mp_list;        /* list mounts with the same mountpoint */
                struct hlist_node mnt_umount;
        };
        struct list_head mnt_umounting; /* list entry for umount propagation */
#ifdef CONFIG_FSNOTIFY
        struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
        __u32 mnt_fsnotify_mask;
#endif
        int mnt_id;                        /* mount identifier, reused */
        u64 mnt_id_unique;                /* mount ID unique until reboot */
        int mnt_group_id;                /* peer group identifier */
        int mnt_expiry_mark;                /* true if marked for expiry */
        struct hlist_head mnt_pins;
        struct hlist_head mnt_stuck_children;
} __randomize_layout;

/*
 * Error-pointer sentinel for a mount's mnt_ns (presumably stored for
 * internal mounts - the assignment is not in this header).  Being an error
 * pointer, is_mounted() rejects it together with NULL via IS_ERR_OR_NULL().
 */
#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */

/*
 * Map a vfsmount pointer back to the struct mount that embeds it as its
 * "mnt" member.  Pure pointer arithmetic: @mnt must really be embedded in a
 * struct mount for the result to be meaningful.
 */
static inline struct mount *real_mount(struct vfsmount *mnt)
{
        return container_of(mnt, struct mount, mnt);
}

/*
 * Return non-zero when @mnt has a parent mount.  A mount without a parent
 * is represented by mnt_parent pointing back at the mount itself.
 */
static inline int mnt_has_parent(struct mount *mnt)
{
        return mnt != mnt->mnt_parent;
}

/*
 * Return non-zero when @mnt belongs to a mount namespace, i.e. when the
 * mnt_ns of its struct mount is neither NULL nor an error pointer.
 */
static inline int is_mounted(struct vfsmount *mnt)
{
        struct mnt_namespace *ns = real_mount(mnt)->mnt_ns;

        /* neither detached nor internal? */
        return !IS_ERR_OR_NULL(ns);
}

extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);

extern int __legitimize_mnt(struct vfsmount *, unsigned);

static inline bool __path_is_mountpoint(const struct path *path)
{
        struct mount *m = __lookup_mnt(path->mnt, path->dentry);
        return m && likely(!(m->mnt.mnt_flags & MNT_SYNC_UMOUNT));
}

extern void __detach_mounts(struct dentry *dentry);

/*
 * Inline fast path in front of __detach_mounts(): the out-of-line call is
 * made only when d_mountpoint() reports @dentry as a mountpoint.
 */
static inline void detach_mounts(struct dentry *dentry)
{
        if (d_mountpoint(dentry))
                __detach_mounts(dentry);
}

/*
 * Take a reference on mount namespace @ns by incrementing the refcount in
 * its embedded ns_common.  @ns must not be NULL.
 */
static inline void get_mnt_ns(struct mnt_namespace *ns)
{
        refcount_inc(&ns->ns.count);
}

extern seqlock_t mount_lock;

/*
 * State for a mount-listing seq_file.
 * NOTE(review): presumably the private data behind the mounts_op iterator
 * (the /proc mount files) - the users are not visible in this header.
 */
struct proc_mounts {
        struct mnt_namespace *ns;        /* namespace being listed */
        struct path root;
        int (*show)(struct seq_file *, struct vfsmount *);        /* formats one mount into the seq_file */
};

extern const struct seq_operations mounts_op;

extern bool __is_local_mountpoint(struct dentry *dentry);
/*
 * Inline fast path in front of __is_local_mountpoint(): a dentry that
 * d_mountpoint() rejects is answered with false without the out-of-line
 * call; otherwise the result of __is_local_mountpoint() is returned.
 */
static inline bool is_local_mountpoint(struct dentry *dentry)
{
        return d_mountpoint(dentry) && __is_local_mountpoint(dentry);
}

/*
 * Return true for an anonymous mount namespace, which is identified by a
 * sequence number of zero.
 */
static inline bool is_anon_ns(struct mnt_namespace *ns)
{
        return ns->seq == 0;
}

/*
 * Take @mnt out of its namespace's red-black tree of mounts and append it
 * to the list @dt_list instead.
 *
 * Warns if MNT_ONRB was not set on entry, then clears the flag.
 * NOTE(review): no locking is done here; callers presumably hold whatever
 * protects ns->mounts - confirm.
 */
static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
{
        WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB));
        mnt->mnt.mnt_flags &= ~MNT_ONRB;
        /*
         * mnt_node and mnt_list share one union in struct mount, so the
         * tree node has to be erased before the storage is reused as a
         * list entry.
         */
        rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts);
        list_add_tail(&mnt->mnt_list, dt_list);
}

extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 








































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 Nokia, Inc.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 *
 * This file is part of the SCTP kernel implementation
 *
 * Initialization/cleanup for SCTP protocol support.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson <karl@athena.chicago.il.us>
 *    Jon Grimm <jgrimm@us.ibm.com>
 *    Sridhar Samudrala <sri@us.ibm.com>
 *    Daisy Chang <daisyc@us.ibm.com>
 *    Ardelle Fan <ardelle.fan@intel.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/init.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/seq_file.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/route.h>
#include <net/sctp/sctp.h>
#include <net/addrconf.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
#include <net/udp_tunnel.h>

/* 64 * 1024 = 65536.
 * NOTE(review): presumably the upper bound on the port hash table size;
 * the user of this constant is outside this part of the file.
 */
#define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024)

/* Global data structures. */
struct sctp_globals sctp_globals __read_mostly;

/* NOTE(review): presumably the ID allocator for associations and the
 * spinlock guarding it - the users are not visible here.
 */
struct idr sctp_assocs_id;
DEFINE_SPINLOCK(sctp_assocs_id_lock);

/* File-local pointers to the per-family PF and AF operation tables.
 * NOTE(review): they are assigned outside this part of the file.
 */
static struct sctp_pf *sctp_pf_inet6_specific;
static struct sctp_pf *sctp_pf_inet_specific;
static struct sctp_af *sctp_af_v4_specific;
static struct sctp_af *sctp_af_v6_specific;

/* Slab caches.
 * NOTE(review): presumably for chunks and bind buckets, going by the names.
 */
struct kmem_cache *sctp_chunk_cachep __read_mostly;
struct kmem_cache *sctp_bucket_cachep __read_mostly;

/* Three-element limit arrays.
 * NOTE(review): presumably the min/pressure/max style sysctl triples for
 * memory, receive and send buffers - confirm against the sysctl table.
 */
long sysctl_sctp_mem[3];
int sysctl_sctp_rmem[3];
int sysctl_sctp_wmem[3];

/* Private helper to extract ipv4 address and stash them in
 * the protocol structure.
 */
static void sctp_v4_copy_addrlist(struct list_head *addrlist,
                                  struct net_device *dev)
{
        struct in_device *in_dev;
        struct in_ifaddr *ifa;
        struct sctp_sockaddr_entry *addr;

        rcu_read_lock();
        if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
                rcu_read_unlock();
                return;
        }

        in_dev_for_each_ifa_rcu(ifa, in_dev) {
                /* Add the address to the local list.  */
                addr = kzalloc(sizeof(*addr), GFP_ATOMIC);
                if (addr) {
                        addr->a.v4.sin_family = AF_INET;
                        addr->a.v4.sin_addr.s_addr = ifa->ifa_local;
                        addr->valid = 1;
                        INIT_LIST_HEAD(&addr->list);
                        list_add_tail(&addr->list, addrlist);
                }
        }

        rcu_read_unlock();
}

/* Extract our IP addresses from the system and stash them in the
 * protocol structure.
 */
static void sctp_get_local_addr_list(struct net *net)
{
        struct net_device *dev;
        struct list_head *pos;
        struct sctp_af *af;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev) {
                list_for_each(pos, &sctp_address_families) {
                        af = list_entry(pos, struct sctp_af, list);
                        af->copy_addrlist(&net->sctp.local_addr_list, dev);
                }
        }
        rcu_read_unlock();
}

/* Empty net->sctp.local_addr_list: unlink every entry and kfree() it.
 * The _safe iterator is used because the current node is removed inside
 * the loop.
 */
static void sctp_free_local_addr_list(struct net *net)
{
        struct list_head *cur, *next;

        list_for_each_safe(cur, next, &net->sctp.local_addr_list) {
                struct sctp_sockaddr_entry *entry =
                        list_entry(cur, struct sctp_sockaddr_entry, list);

                list_del(cur);
                kfree(entry);
        }
}

/* Copy the local addresses of @net that are usable for @scope into the
 * bind address list @bp.
 *
 * An entry is copied only if it is marked valid, passes sctp_in_scope(),
 * belongs to a family that @copy_flags reports as both allowed locally and
 * supported by the peer, and is not yet known to @bp (sctp_bind_addr_state()
 * returns -1 for it).
 *
 * The list is walked under rcu_read_lock(), so additions are made with
 * GFP_ATOMIC; the @gfp argument is not used.
 *
 * Returns 0 on success, or the first error from sctp_add_bind_addr(), in
 * which case the copy stops early.
 */
int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
                              enum sctp_scope scope, gfp_t gfp, int copy_flags)
{
        const int v4_usable = (copy_flags & SCTP_ADDR4_ALLOWED) &&
                              (copy_flags & SCTP_ADDR4_PEERSUPP);
        const int v6_usable = (copy_flags & SCTP_ADDR6_ALLOWED) &&
                              (copy_flags & SCTP_ADDR6_PEERSUPP);
        struct sctp_sockaddr_entry *entry;
        union sctp_addr candidate;
        int err = 0;

        rcu_read_lock();
        list_for_each_entry_rcu(entry, &net->sctp.local_addr_list, list) {
                if (!entry->valid || !sctp_in_scope(net, &entry->a, scope))
                        continue;

                /* Now that the address is in scope, check to see if
                 * the address type is really supported by the local
                 * sock as well as the remote peer.
                 */
                if (entry->a.sa.sa_family == AF_INET && !v4_usable)
                        continue;
                if (entry->a.sa.sa_family == AF_INET6 && !v6_usable)
                        continue;

                /* Look the address up with the bind port filled in; the
                 * v4 port member also works for setting the ipv6 port.
                 */
                candidate = entry->a;
                candidate.v4.sin_port = htons(bp->port);
                if (sctp_bind_addr_state(bp, &candidate) != -1)
                        continue;

                err = sctp_add_bind_addr(bp, &entry->a, sizeof(entry->a),
                                         SCTP_ADDR_SRC, GFP_ATOMIC);
                if (err)
                        break;
        }
        rcu_read_unlock();

        return err;
}

/* Give @newsk its own copy of the IP options attached to @sk.
 *
 * The source options are read under rcu_read_lock() and duplicated with
 * sock_kmalloc() on @newsk.  If @sk has no options, or the allocation
 * fails (which is logged), @newsk ends up with a NULL inet_opt.
 */
static void sctp_v4_copy_ip_options(struct sock *sk, struct sock *newsk)
{
        struct ip_options_rcu *src_opt, *dup_opt = NULL;
        struct inet_sock *newinet = inet_sk(newsk);
        struct inet_sock *inet = inet_sk(sk);

        rcu_read_lock();
        src_opt = rcu_dereference(inet->inet_opt);
        if (src_opt) {
                /* Header plus the variable-length option bytes. */
                dup_opt = sock_kmalloc(newsk, sizeof(*src_opt) +
                                       src_opt->opt.optlen, GFP_ATOMIC);
                if (!dup_opt)
                        pr_err("%s: Failed to copy ip options\n", __func__);
                else
                        memcpy(dup_opt, src_opt, sizeof(*src_opt) +
                               src_opt->opt.optlen);
        }
        RCU_INIT_POINTER(newinet->inet_opt, dup_opt);
        rcu_read_unlock();
}

/* Account for the IP options */
static int sctp_v4_ip_options_len(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options_rcu *inet_opt;
        int len = 0;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt)
                len = inet_opt->opt.optlen;

        rcu_read_unlock();
        return len;
}

/* Initialize a sctp_addr from an incoming skb.
 *
 * With @is_saddr non-zero the source port and source IP address of the
 * packet are used, otherwise the destination port and address.  The
 * padding in sin_zero is cleared.
 */
static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb,
                             int is_saddr)
{
        /* Always called on head skb, so this is safe */
        struct sctphdr *sh = sctp_hdr(skb);
        struct sockaddr_in *sa = &addr->v4;

        sa->sin_family = AF_INET;
        sa->sin_port = is_saddr ? sh->source : sh->dest;
        sa->sin_addr.s_addr = is_saddr ? ip_hdr(skb)->saddr
                                       : ip_hdr(skb)->daddr;
        memset(sa->sin_zero, 0, sizeof(sa->sin_zero));
}

/* Initialize an sctp_addr from a socket. */
static void sctp_v4_from_sk(union sctp_addr *addr, struct sock *sk)
{
        addr->v4.sin_family = AF_INET;
        addr->v4.sin_port = 0;
        addr->v4.sin_addr.s_addr = inet_sk(sk)->inet_rcv_saddr;
        memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero));
}

/* Set the socket's bound receive address (inet_rcv_saddr) to the IPv4
 * address held in @addr.  The port in @addr is ignored.
 */
static void sctp_v4_to_sk_saddr(union sctp_addr *addr, struct sock *sk)
{
        inet_sk(sk)->inet_rcv_saddr = addr->v4.sin_addr.s_addr;
}

/* Set the socket's destination address (inet_daddr) to the IPv4 address
 * held in @addr.  The port in @addr is ignored.
 */
static void sctp_v4_to_sk_daddr(union sctp_addr *addr, struct sock *sk)
{
        inet_sk(sk)->inet_daddr = addr->v4.sin_addr.s_addr;
}

/* Initialize a sctp_addr from an IPv4 address parameter.
 *
 * @port is stored as given (network byte order); @iif is not used for
 * IPv4.  Returns false, leaving @addr untouched, when the length in the
 * parameter header is smaller than an IPv4 address parameter; true
 * otherwise.
 */
static bool sctp_v4_from_addr_param(union sctp_addr *addr,
                                    union sctp_addr_param *param,
                                    __be16 port, int iif)
{
        struct sockaddr_in *sin = &addr->v4;

        /* Reject a parameter too short to carry an IPv4 address. */
        if (ntohs(param->v4.param_hdr.length) < sizeof(struct sctp_ipv4addr_param))
                return false;

        sin->sin_family = AF_INET;
        sin->sin_port = port;
        sin->sin_addr.s_addr = param->v4.addr.s_addr;
        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));

        return true;
}

/* Fill @param with an IPv4 address parameter describing @addr: the type
 * is SCTP_PARAM_IPV4_ADDRESS and the length is stored in network byte
 * order.  Returns the length of the parameter in bytes (host order).
 */
static int sctp_v4_to_addr_param(const union sctp_addr *addr,
                                 union sctp_addr_param *param)
{
        const int param_len = sizeof(struct sctp_ipv4addr_param);

        param->v4.param_hdr.type = SCTP_PARAM_IPV4_ADDRESS;
        param->v4.param_hdr.length = htons(param_len);
        param->v4.addr.s_addr = addr->v4.sin_addr.s_addr;

        return param_len;
}

/* Initialize a sctp_addr from the source address recorded in the IPv4
 * flow key @fl4, using @port (network byte order) as the port.
 */
static void sctp_v4_dst_saddr(union sctp_addr *saddr, struct flowi4 *fl4,
                              __be16 port)
{
        struct sockaddr_in *sin = &saddr->v4;

        sin->sin_family = AF_INET;
        sin->sin_port = port;
        sin->sin_addr.s_addr = fl4->saddr;
        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
}

/* Compare two addresses exactly: returns 1 when family, port and IPv4
 * address all match, 0 otherwise.
 */
static int sctp_v4_cmp_addr(const union sctp_addr *addr1,
                            const union sctp_addr *addr2)
{
        return addr1->sa.sa_family == addr2->sa.sa_family &&
               addr1->v4.sin_port == addr2->v4.sin_port &&
               addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr;
}

/* Initialize @addr to the IPv4 wildcard address (INADDR_ANY) with the
 * given @port (network byte order) and zeroed padding.
 */
static void sctp_v4_inaddr_any(union sctp_addr *addr, __be16 port)
{
        struct sockaddr_in *sin = &addr->v4;

        sin->sin_family = AF_INET;
        sin->sin_addr.s_addr = htonl(INADDR_ANY);
        sin->sin_port = port;
        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
}

/* Is this a wildcard address?  Returns non-zero when the IPv4 address in
 * @addr equals INADDR_ANY; family and port are not looked at.
 */
static int sctp_v4_is_any(const union sctp_addr *addr)
{
        return htonl(INADDR_ANY) == addr->v4.sin_addr.s_addr;
}

/* Check whether @addr is a valid address to be used for SCTP binding.
 *
 * @sp and @skb are both optional (may be NULL).
 *
 * Output:
 * Return 0 - If @sp is an IPv6-only socket, if the address is a
 *            non-unicast or otherwise unusable address, or if @skb was
 *            received on a broadcast route.
 * Return 1 - If the address is a usable unicast address.
 */
static int sctp_v4_addr_valid(union sctp_addr *addr,
                              struct sctp_sock *sp,
                              const struct sk_buff *skb)
{
        /* IPv4 addresses not allowed */
        if (sp && ipv6_only_sock(sctp_opt2sk(sp)))
                return 0;

        /* Is this a non-unicast address or an unusable SCTP address? */
        if (IS_IPV4_UNUSABLE_ADDRESS(addr->v4.sin_addr.s_addr))
                return 0;

        /* Valid unless the packet came in via a broadcast route. */
        return !(skb && (skb_rtable(skb)->rt_flags & RTCF_BROADCAST));
}

/* Should this address be available for binding on socket @sp?
 *
 * The address type is looked up in the FIB table of the L3 master device
 * the socket is bound to, falling back to RT_TABLE_LOCAL when
 * l3mdev_fib_table_by_index() returns 0.  A non-wildcard address that is
 * not local is refused unless the socket has FREEBIND set or the
 * ip_nonlocal_bind sysctl is enabled.  IPv6-only sockets never accept an
 * IPv4 address.
 *
 * Returns 1 if the address may be bound, 0 otherwise.
 */
static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp)
{
        struct sock *sk = &sp->inet.sk;
        struct net *net = sock_net(sk);
        int table, addr_type;

        table = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if);
        if (!table)
                table = RT_TABLE_LOCAL;

        addr_type = inet_addr_type_table(net, addr->v4.sin_addr.s_addr, table);
        if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) &&
            addr_type != RTN_LOCAL &&
            !inet_test_bit(FREEBIND, sk) &&
            !READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind))
                return 0;

        return !ipv6_only_sock(sctp_opt2sk(sp));
}

/* Classify an IPv4 address into an SCTP scope.  The loopback, private and
 * other address scopes follow RFC 1918; the IPv4 scoping is based on the
 * draft for SCTP IPv4 scoping <draft-stewart-tsvwg-sctp-ipv4-00.txt>.
 *
 * Level 0 - unusable SCTP addresses
 * Level 1 - loopback address
 * Level 2 - link-local addresses
 * Level 3 - private addresses.
 * Level 4 - global addresses
 * For INIT and INIT-ACK address list, let L be the level of
 * requested destination address, sender and receiver
 * SHOULD include all of its addresses with level greater
 * than or equal to L.
 *
 * IPv4 scoping can be controlled through sysctl option
 * net.sctp.addr_scope_policy
 *
 * The checks run from the most restrictive level down; the first match
 * decides, and anything left over is global.
 */
static enum sctp_scope sctp_v4_scope(union sctp_addr *addr)
{
        /* Check for unusable SCTP addresses. */
        if (IS_IPV4_UNUSABLE_ADDRESS(addr->v4.sin_addr.s_addr))
                return SCTP_SCOPE_UNUSABLE;

        if (ipv4_is_loopback(addr->v4.sin_addr.s_addr))
                return SCTP_SCOPE_LOOPBACK;

        if (ipv4_is_linklocal_169(addr->v4.sin_addr.s_addr))
                return SCTP_SCOPE_LINK;

        if (ipv4_is_private_10(addr->v4.sin_addr.s_addr) ||
            ipv4_is_private_172(addr->v4.sin_addr.s_addr) ||
            ipv4_is_private_192(addr->v4.sin_addr.s_addr) ||
            ipv4_is_test_198(addr->v4.sin_addr.s_addr))
                return SCTP_SCOPE_PRIVATE;

        return SCTP_SCOPE_GLOBAL;
}

/* Look up an IPv4 route towards the peer address of transport @t and cache
 * it in t->dst.  Nothing is returned: on success t->dst holds the route and
 * *fl the flow key it was found with; if no route is found t->dst is set to
 * NULL and *fl is not written.
 *
 * @saddr, if given, is the source address to route from.  Without it, and
 * if the transport belongs to an association, the function tries to get a
 * dst entry whose source address matches an address in the association's
 * bind address list.
 */
static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
                                struct flowi *fl, struct sock *sk)
{
        struct sctp_association *asoc = t->asoc;
        struct rtable *rt;
        struct flowi _fl;
        struct flowi4 *fl4 = &_fl.u.ip4;
        struct sctp_bind_addr *bp;
        struct sctp_sockaddr_entry *laddr;
        struct dst_entry *dst = NULL;
        union sctp_addr *daddr = &t->ipaddr;
        union sctp_addr dst_saddr;
        u8 tos = READ_ONCE(inet_sk(sk)->tos);

        /* A DSCP value configured on the transport overrides the socket's. */
        if (t->dscp & SCTP_DSCP_SET_MASK)
                tos = t->dscp & SCTP_DSCP_VAL_MASK;
        /* Build the flow key in a local copy; it is copied to *fl only when
         * a route is actually adopted.
         */
        memset(&_fl, 0x0, sizeof(_fl));
        fl4->daddr  = daddr->v4.sin_addr.s_addr;
        fl4->fl4_dport = daddr->v4.sin_port;
        fl4->flowi4_proto = IPPROTO_SCTP;
        if (asoc) {
                fl4->flowi4_tos = RT_TOS(tos);
                fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk);
                fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if;
                fl4->fl4_sport = htons(asoc->base.bind_addr.port);
        }
        if (saddr) {
                fl4->saddr = saddr->v4.sin_addr.s_addr;
                /* The association's bind port, if set above, wins. */
                if (!fl4->fl4_sport)
                        fl4->fl4_sport = saddr->v4.sin_port;
        }

        pr_debug("%s: dst:%pI4, src:%pI4 - ", __func__, &fl4->daddr,
                 &fl4->saddr);

        rt = ip_route_output_key(sock_net(sk), fl4);
        if (!IS_ERR(rt)) {
                dst = &rt->dst;
                t->dst = dst;
                memcpy(fl, &_fl, sizeof(_fl));
        }

        /* If there is no association or if a source address is passed, no
         * more validation is required.
         */
        if (!asoc || saddr)
                goto out;

        bp = &asoc->base.bind_addr;

        if (dst) {
                /* Walk through the bind address list and look for a bind
                 * address that matches the source address of the returned dst.
                 */
                sctp_v4_dst_saddr(&dst_saddr, fl4, htons(bp->port));
                rcu_read_lock();
                list_for_each_entry_rcu(laddr, &bp->address_list, list) {
                        if (!laddr->valid || (laddr->state == SCTP_ADDR_DEL) ||
                            (laddr->state != SCTP_ADDR_SRC &&
                            !asoc->src_out_of_asoc_ok))
                                continue;
                        /* Match: keep the route found above. */
                        if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a))
                                goto out_unlock;
                }
                rcu_read_unlock();

                /* None of the bound addresses match the source address of the
                 * dst. So release it.
                 */
                dst_release(dst);
                dst = NULL;
        }

        /* Walk through the bind address list and try to get a dst that
         * matches a bind address as the source address.
         */
        rcu_read_lock();
        list_for_each_entry_rcu(laddr, &bp->address_list, list) {
                struct net_device *odev;

                if (!laddr->valid)
                        continue;
                if (laddr->state != SCTP_ADDR_SRC ||
                    AF_INET != laddr->a.sa.sa_family)
                        continue;

                fl4->fl4_sport = laddr->a.v4.sin_port;
                flowi4_update_output(fl4, asoc->base.sk->sk_bound_dev_if,
                                     daddr->v4.sin_addr.s_addr,
                                     laddr->a.v4.sin_addr.s_addr);

                rt = ip_route_output_key(sock_net(sk), fl4);
                if (IS_ERR(rt))
                        continue;

                /* Ensure the src address belongs to the output
                 * interface.
                 */
                odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr,
                                     false);
                if (!odev || odev->ifindex != fl4->flowi4_oif) {
                        /* Interface mismatch: remember the first such route
                         * as a fallback, drop any later ones, and keep
                         * looking for an exact match.
                         */
                        if (!dst) {
                                dst = &rt->dst;
                                t->dst = dst;
                                memcpy(fl, &_fl, sizeof(_fl));
                        } else {
                                dst_release(&rt->dst);
                        }
                        continue;
                }

                /* Exact match: it replaces a fallback route, if any. */
                dst_release(dst);
                dst = &rt->dst;
                t->dst = dst;
                memcpy(fl, &_fl, sizeof(_fl));
                break;
        }

out_unlock:
        rcu_read_unlock();
out:
        if (dst) {
                pr_debug("rt_dst:%pI4, rt_src:%pI4\n",
                         &fl->u.ip4.daddr, &fl->u.ip4.saddr);
        } else {
                /* Also clears a t->dst left pointing at a released route. */
                t->dst = NULL;
                pr_debug("no route\n");
        }
}

/* Record the source address to use for transport @t.
 *
 * For v4 the source address is taken from the flow key @fl: when
 * dst_rtable(t->dst) yields a non-NULL rtable, t->saddr is set to AF_INET
 * with fl->u.ip4.saddr as the address.  Otherwise t->saddr is left
 * untouched.  The port of t->saddr is not written and @sk is not used.
 */
static void sctp_v4_get_saddr(struct sctp_sock *sk,
                              struct sctp_transport *t,
                              struct flowi *fl)
{
        union sctp_addr *saddr = &t->saddr;
        struct rtable *rt = dst_rtable(t->dst);

        if (rt) {
                saddr->v4.sin_family = AF_INET;
                saddr->v4.sin_addr.s_addr = fl->u.ip4.saddr;
        }
}

/* What interface did this skb arrive on?
 *
 * Thin wrapper: returns whatever inet_iif() reports for the skb.
 */
static int sctp_v4_skb_iif(const struct sk_buff *skb)
{
        return inet_iif(skb);
}

/* Companion to sctp_v4_skb_iif(): returns whatever inet_sdif() reports for
 * the skb.
 */
static int sctp_v4_skb_sdif(const struct sk_buff *skb)
{
        return inet_sdif(skb);
}

/* Was this packet marked by Explicit Congestion Notification?
 *
 * Feeds the TOS byte of the skb's IPv4 header to INET_ECN_is_ce() and
 * returns its result.
 */
static int sctp_v4_is_ce(const struct sk_buff *skb)
{
        return INET_ECN_is_ce(ip_hdr(skb)->tos);
}

/* Build the socket that accept() hands back for an IPv4 association.
 *
 * A new PF_INET sock is allocated with the listening socket's protocol,
 * initialised, populated from @sk/@asoc via sctp_copy_sock(), given the
 * parent's IP options, and pointed at the peer's primary address.  The
 * protocol's init hook runs last; if it reports failure the new sock is
 * released.
 *
 * Returns the new sock, or NULL when allocation or init fails.
 */
static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
                                             struct sctp_association *asoc,
                                             bool kern)
{
        struct sock *newsk;

        newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, sk->sk_prot, kern);
        if (!newsk)
                return NULL;

        sock_init_data(NULL, newsk);

        sctp_copy_sock(newsk, sk, asoc);
        sock_reset_flag(newsk, SOCK_ZAPPED);

        sctp_v4_copy_ip_options(sk, newsk);

        /* The accepted socket's destination is the peer's primary address. */
        inet_sk(newsk)->inet_daddr =
                asoc->peer.primary_addr.v4.sin_addr.s_addr;

        if (!newsk->sk_prot->init(newsk))
                return newsk;

        /* Protocol init failed: drop the half-built socket. */
        sk_common_release(newsk);
        return NULL;
}

/* Normalize an IPv4 sockaddr: clear the sin_zero padding in place and
 * return the length of a struct sockaddr_in.  The sp argument is not used.
 */
static int sctp_v4_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr)
{
        /* No address mapping for V4 sockets */
        memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero));
        return sizeof(struct sockaddr_in);
}

/* Dump the v4 addr to the seq file.
 *
 * Prints the address in dotted-quad form (%pI4) followed by one space.
 */
static void sctp_v4_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr)
{
        seq_printf(seq, "%pI4 ", &addr->v4.sin_addr);
}

/* Apply INET_ECN_xmit() to the socket.
 * NOTE(review): presumably this marks the socket's outgoing packets as
 * ECN-capable -- confirm against the INET_ECN_xmit() definition.
 */
static void sctp_v4_ecn_capable(struct sock *sk)
{
        INET_ECN_xmit(sk);
}

/* Timer callback for net->sctp.addr_wq_timer: drain the address wait queue.
 *
 * The whole walk runs under addr_wq_lock.  Each queued entry is offered,
 * through sctp_asconf_mgmt(), to every socket on auto_asconf_splist that is
 * bound to all addresses, and is then unlinked and freed.
 *
 * Two IPv6 special cases apply when CONFIG_IPV6 is enabled:
 *   - link-local addresses are dropped without being offered to any socket;
 *   - an SCTP_ADDR_NEW entry for which ipv6_chk_addr() returns 0 stops the
 *     walk and re-arms the timer, so that entry and everything queued
 *     behind it are retried on the next expiry.
 */
static void sctp_addr_wq_timeout_handler(struct timer_list *t)
{
        struct net *net = from_timer(net, t, sctp.addr_wq_timer);
        struct sctp_sockaddr_entry *addrw, *temp;
        struct sctp_sock *sp;

        spin_lock_bh(&net->sctp.addr_wq_lock);

        /* _safe variant: the current entry is freed at the end of each pass */
        list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) {
                pr_debug("%s: the first ent in wq:%p is addr:%pISc for cmd:%d at "
                         "entry:%p\n", __func__, &net->sctp.addr_waitq, &addrw->a.sa,
                         addrw->state, addrw);

#if IS_ENABLED(CONFIG_IPV6)
                /* Now we send an ASCONF for each association */
                /* Note: we currently don't handle link local IPv6 addresses */
                if (addrw->a.sa.sa_family == AF_INET6) {
                        struct in6_addr *in6;

                        if (ipv6_addr_type(&addrw->a.v6.sin6_addr) &
                            IPV6_ADDR_LINKLOCAL)
                                goto free_next;

                        in6 = (struct in6_addr *)&addrw->a.v6.sin6_addr;
                        /* New address that ipv6_chk_addr() does not (yet)
                         * report: per the debug message below it is taken
                         * to be still in DAD, so back off and retry later.
                         */
                        if (ipv6_chk_addr(net, in6, NULL, 0) == 0 &&
                            addrw->state == SCTP_ADDR_NEW) {
                                unsigned long timeo_val;

                                pr_debug("%s: this is on DAD, trying %d sec "
                                         "later\n", __func__,
                                         SCTP_ADDRESS_TICK_DELAY);

                                timeo_val = jiffies;
                                timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY);
                                mod_timer(&net->sctp.addr_wq_timer, timeo_val);
                                /* leave this entry and the rest queued */
                                break;
                        }
                }
#endif
                list_for_each_entry(sp, &net->sctp.auto_asconf_splist, auto_asconf_list) {
                        struct sock *sk;

                        sk = sctp_opt2sk(sp);
                        /* ignore bound-specific endpoints */
                        if (!sctp_is_ep_boundall(sk))
                                continue;
                        bh_lock_sock(sk);
                        /* a failure is only logged; remaining sockets still run */
                        if (sctp_asconf_mgmt(sp, addrw) < 0)
                                pr_debug("%s: sctp_asconf_mgmt failed\n", __func__);
                        bh_unlock_sock(sk);
                }
#if IS_ENABLED(CONFIG_IPV6)
free_next:
#endif
                /* entry handled (or skipped): remove it from the queue */
                list_del(&addrw->list);
                kfree(addrw);
        }
        spin_unlock_bh(&net->sctp.addr_wq_lock);
}

/* Tear down the per-net address wait queue: under addr_wq_lock, cancel the
 * queue timer with del_timer() and free every entry still queued.
 */
static void sctp_free_addr_wq(struct net *net)
{
        struct sctp_sockaddr_entry *entry, *next;

        spin_lock_bh(&net->sctp.addr_wq_lock);

        del_timer(&net->sctp.addr_wq_timer);

        /* entries are freed while walking, hence the _safe iterator */
        list_for_each_entry_safe(entry, next, &net->sctp.addr_waitq, list) {
                list_del(&entry->list);
                kfree(entry);
        }

        spin_unlock_bh(&net->sctp.addr_wq_lock);
}

/* Find the addr_waitq entry that carries the same address as @addr.
 *
 * Only the family and the raw IPv4/IPv6 address are compared.  Returns the
 * first matching entry, or NULL when none matches.
 * The caller MUST hold the address wait queue lock.
 */
static struct sctp_sockaddr_entry *sctp_addr_wq_lookup(struct net *net,
                                        struct sctp_sockaddr_entry *addr)
{
        struct sctp_sockaddr_entry *cur;

        list_for_each_entry(cur, &net->sctp.addr_waitq, list) {
                if (cur->a.sa.sa_family != addr->a.sa.sa_family)
                        continue;

                switch (cur->a.sa.sa_family) {
                case AF_INET:
                        if (cur->a.v4.sin_addr.s_addr ==
                            addr->a.v4.sin_addr.s_addr)
                                return cur;
                        break;
                case AF_INET6:
                        if (ipv6_addr_equal(&cur->a.v6.sin6_addr,
                                            &addr->a.v6.sin6_addr))
                                return cur;
                        break;
                default:
                        /* other families never match */
                        break;
                }
        }

        return NULL;
}

/* Queue an address add/delete event (@cmd) for @addr on the per-net wait
 * queue, or cancel it against an event that is already pending.
 *
 * If the queue already holds an entry for the same address:
 *   - with a different command, the two offset each other and the queued
 *     entry is removed (a DHCP client may add and delete an address a
 *     couple of times before it settles);
 *   - with the same command, nothing changes.
 * Otherwise a copy of @addr is appended with state @cmd, and the queue
 * timer is armed unless it is already pending.  A failed allocation
 * silently drops the event.
 */
void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cmd)
{
        struct sctp_sockaddr_entry *queued;

        spin_lock_bh(&net->sctp.addr_wq_lock);

        /* Does a pending event for this address already exist? */
        queued = sctp_addr_wq_lookup(net, addr);
        if (queued) {
                if (queued->state != cmd) {
                        pr_debug("%s: offsets existing entry for %d, addr:%pISc "
                                 "in wq:%p\n", __func__, queued->state, &queued->a.sa,
                                 &net->sctp.addr_waitq);

                        list_del(&queued->list);
                        kfree(queued);
                }
                goto unlock;
        }

        /* Nothing pending: enqueue a private copy of the address. */
        queued = kmemdup(addr, sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC);
        if (!queued)
                goto unlock;

        queued->state = cmd;
        list_add_tail(&queued->list, &net->sctp.addr_waitq);

        pr_debug("%s: add new entry for cmd:%d, addr:%pISc in wq:%p\n",
                 __func__, queued->state, &queued->a.sa, &net->sctp.addr_waitq);

        if (!timer_pending(&net->sctp.addr_wq_timer)) {
                unsigned long expires;

                expires = jiffies + msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY);
                mod_timer(&net->sctp.addr_wq_timer, expires);
        }

unlock:
        spin_unlock_bh(&net->sctp.addr_wq_lock);
}

/* Event handler for inet address addition/deletion events.
 * The sctp_local_addr_list needs to be protected by a spin lock since
 * multiple notifiers (say IPv4 and IPv6) may be running at the same
 * time and thus corrupt the list.
 * The reader side is protected with RCU.
 *
 * NETDEV_UP:   allocate an entry for ifa->ifa_local, append it to the
 *              per-net local address list and queue an SCTP_ADDR_NEW event.
 *              An allocation failure is silently ignored.
 * NETDEV_DOWN: unlink the first AF_INET entry matching ifa->ifa_local,
 *              queue an SCTP_ADDR_DEL event and free the entry via
 *              kfree_rcu().
 * Other events are ignored.  Always returns NOTIFY_DONE.
 */
static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
                               void *ptr)
{
        struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
        struct sctp_sockaddr_entry *addr = NULL;
        struct sctp_sockaddr_entry *temp;
        struct net *net = dev_net(ifa->ifa_dev->dev);
        int found = 0;

        switch (ev) {
        case NETDEV_UP:
                /* GFP_ATOMIC: no sleeping allocation in this handler */
                addr = kzalloc(sizeof(*addr), GFP_ATOMIC);
                if (addr) {
                        addr->a.v4.sin_family = AF_INET;
                        addr->a.v4.sin_addr.s_addr = ifa->ifa_local;
                        addr->valid = 1;
                        spin_lock_bh(&net->sctp.local_addr_lock);
                        list_add_tail_rcu(&addr->list, &net->sctp.local_addr_list);
                        /* the wait queue stores its own copy of addr */
                        sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_NEW);
                        spin_unlock_bh(&net->sctp.local_addr_lock);
                }
                break;
        case NETDEV_DOWN:
                spin_lock_bh(&net->sctp.local_addr_lock);
                list_for_each_entry_safe(addr, temp,
                                        &net->sctp.local_addr_list, list) {
                        if (addr->a.sa.sa_family == AF_INET &&
                                        addr->a.v4.sin_addr.s_addr ==
                                        ifa->ifa_local) {
                                sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_DEL);
                                found = 1;
                                /* mark stale for readers that still see it */
                                addr->valid = 0;
                                list_del_rcu(&addr->list);
                                break;
                        }
                }
                spin_unlock_bh(&net->sctp.local_addr_lock);
                /* addr is only meaningful here when the loop broke out early */
                if (found)
                        kfree_rcu(addr, rcu);
                break;
        }

        return NOTIFY_DONE;
}

/*
 * Create the per-net SCTP control socket (net->sctp.ctl_sock).  Per the
 * original note, this endpoint is reserved exclusively for OOTB processing.
 *
 * A PF_INET6 socket is preferred when the IPv6 PF table is registered; if
 * that creation fails, an IPv4 socket is tried instead.  Returns 0 on
 * success, or the negative error of the last attempt after logging it.
 */
static int sctp_ctl_sock_init(struct net *net)
{
        sa_family_t family;
        int err;

        family = sctp_get_pf_specific(PF_INET6) ? PF_INET6 : PF_INET;

        err = inet_ctl_sock_create(&net->sctp.ctl_sock, family,
                                   SOCK_SEQPACKET, IPPROTO_SCTP, net);
        if (err >= 0)
                return 0;

        /* If IPv6 socket could not be created, try the IPv4 socket */
        if (family == PF_INET6) {
                err = inet_ctl_sock_create(&net->sctp.ctl_sock, AF_INET,
                                           SOCK_SEQPACKET, IPPROTO_SCTP,
                                           net);
                if (err >= 0)
                        return 0;
        }

        pr_err("Failed to create the SCTP control socket\n");
        return err;
}

/* encap_rcv hook for the SCTP-over-UDP tunnel sockets (installed by
 * sctp_udp_sock_start()).
 *
 * Remembers the UDP source port of the packet as the encapsulation port,
 * moves the transport header past the UDP header, and passes the skb to
 * sctp_rcv().  Always returns 0; the result of sctp_rcv() is not propagated.
 */
static int sctp_udp_rcv(struct sock *sk, struct sk_buff *skb)
{
        SCTP_INPUT_CB(skb)->encap_port = udp_hdr(skb)->source;

        /* transport header now points just behind the UDP header */
        skb_set_transport_header(skb, sizeof(struct udphdr));
        sctp_rcv(skb);
        return 0;
}

/* Create the UDP sockets used for SCTP-over-UDP tunneling in @net.
 *
 * An IPv4 UDP socket bound to INADDR_ANY and net->sctp.udp_port is created,
 * configured as a tunnel socket with sctp_udp_rcv() as its receive hook and
 * stored in net->sctp.udp4_sock.  With CONFIG_IPV6 enabled, a v6-only
 * socket on the same port is set up the same way and stored in
 * net->sctp.udp6_sock; if that fails the IPv4 socket is released again.
 *
 * Returns 0 on success or the error from udp_sock_create().
 */
int sctp_udp_sock_start(struct net *net)
{
        struct udp_tunnel_sock_cfg tuncfg = {NULL};
        struct udp_port_cfg udp_conf = {0};
        struct socket *sock;
        int err;

        udp_conf.family = AF_INET;
        udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
        udp_conf.local_udp_port = htons(net->sctp.udp_port);
        err = udp_sock_create(net, &udp_conf, &sock);
        if (err) {
                pr_err("Failed to create the SCTP UDP tunneling v4 sock\n");
                return err;
        }

        /* NOTE(review): encap_type 1 is a bare constant here; its meaning is
         * defined by the UDP tunnel code, not by this file.
         */
        tuncfg.encap_type = 1;
        tuncfg.encap_rcv = sctp_udp_rcv;
        tuncfg.encap_err_lookup = sctp_udp_v4_err;
        setup_udp_tunnel_sock(net, sock, &tuncfg);
        net->sctp.udp4_sock = sock->sk;

#if IS_ENABLED(CONFIG_IPV6)
        /* reuse udp_conf for the IPv6 socket, starting from a clean slate */
        memset(&udp_conf, 0, sizeof(udp_conf));

        udp_conf.family = AF_INET6;
        udp_conf.local_ip6 = in6addr_any;
        udp_conf.local_udp_port = htons(net->sctp.udp_port);
        udp_conf.use_udp6_rx_checksums = true;
        udp_conf.ipv6_v6only = true;
        err = udp_sock_create(net, &udp_conf, &sock);
        if (err) {
                pr_err("Failed to create the SCTP UDP tunneling v6 sock\n");
                /* undo the IPv4 half so no socket is left behind */
                udp_tunnel_sock_release(net->sctp.udp4_sock->sk_socket);
                net->sctp.udp4_sock = NULL;
                return err;
        }

        tuncfg.encap_type = 1;
        tuncfg.encap_rcv = sctp_udp_rcv;
        tuncfg.encap_err_lookup = sctp_udp_v6_err;
        setup_udp_tunnel_sock(net, sock, &tuncfg);
        net->sctp.udp6_sock = sock->sk;
#endif

        return 0;
}

/* Release the UDP tunneling sockets recorded in net->sctp.udp4_sock and
 * net->sctp.udp6_sock.
 *
 * Each pointer is released only when set and is cleared afterwards, so a
 * repeated call does nothing further.
 */
void sctp_udp_sock_stop(struct net *net)
{
        if (net->sctp.udp4_sock) {
                udp_tunnel_sock_release(net->sctp.udp4_sock->sk_socket);
                net->sctp.udp4_sock = NULL;
        }
        if (net->sctp.udp6_sock) {
                udp_tunnel_sock_release(net->sctp.udp6_sock->sk_socket);
                net->sctp.udp6_sock = NULL;
        }
}

/* Register the address-family specific function table @af.
 *
 * Only AF_INET and AF_INET6 are accepted, and each family can be registered
 * once.  On success the table is remembered for sctp_get_af_specific() and
 * linked onto sctp_address_families.
 *
 * Returns 1 when the table was registered, 0 when the family is unsupported
 * or already has a table.
 */
int sctp_register_af(struct sctp_af *af)
{
        if (af->sa_family == AF_INET) {
                if (sctp_af_v4_specific)
                        return 0;
                sctp_af_v4_specific = af;
        } else if (af->sa_family == AF_INET6) {
                if (sctp_af_v6_specific)
                        return 0;
                sctp_af_v6_specific = af;
        } else {
                return 0;
        }

        INIT_LIST_HEAD(&af->list);
        list_add_tail(&af->list, &sctp_address_families);
        return 1;
}

/* Look up the function table registered for an address family.
 *
 * Returns the AF_INET or AF_INET6 table as stored by sctp_register_af()
 * (which may still be unset), or NULL for any other family.
 */
struct sctp_af *sctp_get_af_specific(sa_family_t family)
{
        if (family == AF_INET)
                return sctp_af_v4_specific;

        if (family == AF_INET6)
                return sctp_af_v6_specific;

        return NULL;
}

/* Shared setup for an AF_INET msg_name buffer.
 *
 * Treats @msgname as a struct sockaddr_in, stores its size in *@addr_len,
 * sets the family to AF_INET and clears the sin_zero padding.  Port and
 * address are left for the caller to fill in.
 */
static void sctp_inet_msgname(char *msgname, int *addr_len)
{
        struct sockaddr_in *sin = (struct sockaddr_in *)msgname;

        *addr_len = sizeof(struct sockaddr_in);
        sin->sin_family = AF_INET;
        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
}

/* Fill @msgname with the peer's primary address for an event.
 *
 * Does nothing when @msgname is NULL.  Otherwise the buffer is initialised
 * as an AF_INET sockaddr (which also sets *@addr_len), then given the
 * association's peer port in network byte order and the IPv4 address of
 * the peer's primary address.
 */
static void sctp_inet_event_msgname(struct sctp_ulpevent *event, char *msgname,
                                    int *addr_len)
{
        struct sctp_association *asoc;
        struct sockaddr_in *sin;

        if (!msgname)
                return;

        asoc = event->asoc;
        sctp_inet_msgname(msgname, addr_len);

        sin = (struct sockaddr_in *)msgname;
        sin->sin_port = htons(asoc->peer.port);
        sin->sin_addr.s_addr = asoc->peer.primary_addr.v4.sin_addr.s_addr;
}

/* Fill @msgname with the sender of an inbound skb.
 *
 * Does nothing when @msgname is NULL.  Otherwise the buffer is initialised
 * as an AF_INET sockaddr (which also sets *@len), then given the source
 * port from the SCTP header and the source address from the IPv4 header.
 */
static void sctp_inet_skb_msgname(struct sk_buff *skb, char *msgname, int *len)
{
        struct sockaddr_in *sin;

        if (!msgname)
                return;

        sctp_inet_msgname(msgname, len);

        sin = (struct sockaddr_in *)msgname;
        sin->sin_port = sctp_hdr(skb)->source;
        sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
}

/* Do we support this AF?
 *
 * Returns non-zero only for AF_INET; the sp argument is not consulted.
 */
static int sctp_inet_af_supported(sa_family_t family, struct sctp_sock *sp)
{
        /* PF_INET only supports AF_INET addresses. */
        return AF_INET == family;
}

/* Compare two addresses, treating INADDR_ANY as a wildcard.
 *
 * Returns 1 when both addresses share a family and either one of them is
 * INADDR_ANY or their IPv4 addresses are equal; returns 0 otherwise.
 * Ports are not compared and @opt is not used.
 */
static int sctp_inet_cmp_addr(const union sctp_addr *addr1,
                              const union sctp_addr *addr2,
                              struct sctp_sock *opt)
{
        /* PF_INET only supports AF_INET addresses. */
        if (addr1->sa.sa_family != addr2->sa.sa_family)
                return 0;

        return addr1->v4.sin_addr.s_addr == htonl(INADDR_ANY) ||
               addr2->v4.sin_addr.s_addr == htonl(INADDR_ANY) ||
               addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr;
}

/* Verify that provided sockaddr looks bindable.  Common verification has
 * already been taken care of.
 *
 * Delegates entirely to sctp_v4_available() and returns its result.
 */
static int sctp_inet_bind_verify(struct sctp_sock *opt, union sctp_addr *addr)
{
        return sctp_v4_available(addr, opt);
}

/* Verify that sockaddr looks sendable.  Common verification has already
 * been taken care of.
 *
 * No IPv4-specific check is performed: this always returns 1 and neither
 * argument is examined.
 */
static int sctp_inet_send_verify(struct sctp_sock *opt, union sctp_addr *addr)
{
        return 1;
}

/* Fill in Supported Address Type information for INIT and INIT-ACK
 * chunks.  Returns number of addresses supported.
 *
 * For PF_INET that is a single entry, SCTP_PARAM_IPV4_ADDRESS, written to
 * types[0]; the caller must provide room for at least one element.
 */
static int sctp_inet_supported_addrs(const struct sctp_sock *opt,
                                     __be16 *types)
{
        types[0] = SCTP_PARAM_IPV4_ADDRESS;
        return 1;
}

/* Wrapper routine that calls the ip transmit routine.
 *
 * Takes its own reference on the transport's route (dst_clone), picks the
 * DSCP value (the transport's when SCTP_DSCP_SET_MASK is set, otherwise the
 * socket's tos), updates the socket's pmtudisc setting from the transport's
 * SPP_PMTUD_ENABLE flag and bumps the SCTP_MIB_OUTSCTPPACKS counter.
 *
 * Plain path: when the transport has no encap port or the socket has no
 * UDP port, the skb is sent with __ip_queue_xmit() and its result returned.
 * Encapsulated path: otherwise the skb is marked as encapsulated and sent
 * through udp_tunnel_xmit_skb(); this path always returns 0.
 */
static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t)
{
        struct dst_entry *dst = dst_clone(t->dst);
        struct flowi4 *fl4 = &t->fl.u.ip4;
        struct sock *sk = skb->sk;
        struct inet_sock *inet = inet_sk(sk);
        __u8 dscp = READ_ONCE(inet->tos);
        __be16 df = 0;

        pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb,
                 skb->len, &fl4->saddr, &fl4->daddr);

        /* a per-transport DSCP, when set, overrides the socket's tos */
        if (t->dscp & SCTP_DSCP_SET_MASK)
                dscp = t->dscp & SCTP_DSCP_VAL_MASK;

        inet->pmtudisc = t->param_flags & SPP_PMTUD_ENABLE ? IP_PMTUDISC_DO
                                                           : IP_PMTUDISC_DONT;
        SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS);

        /* UDP encapsulation needs both a remote and a local port */
        if (!t->encap_port || !sctp_sk(sk)->udp_port) {
                skb_dst_set(skb, dst);
                return __ip_queue_xmit(sk, skb, &t->fl, dscp);
        }

        if (skb_is_gso(skb))
                skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;

        /* set DF unless fragmentation is allowed for this socket/skb */
        if (ip_dont_fragment(sk, dst) && !skb->ignore_df)
                df = htons(IP_DF);

        skb->encapsulation = 1;
        skb_reset_inner_mac_header(skb);
        skb_reset_inner_transport_header(skb);
        skb_set_inner_ipproto(skb, IPPROTO_SCTP);
        udp_tunnel_xmit_skb(dst_rtable(dst), sk, skb, fl4->saddr,
                            fl4->daddr, dscp, ip4_dst_hoplimit(dst), df,
                            sctp_sk(sk)->udp_port, t->encap_port, false, false);
        return 0;
}

/* Tentative declaration: sctp_pf_inet below takes the address of
 * sctp_af_inet, whose initializer appears further down in this file.
 */
static struct sctp_af sctp_af_inet;

/* PF_INET protocol-family operations table; handed to sctp_register_pf()
 * by sctp_v4_pf_init().
 */
static struct sctp_pf sctp_pf_inet = {
        .event_msgname = sctp_inet_event_msgname,
        .skb_msgname   = sctp_inet_skb_msgname,
        .af_supported  = sctp_inet_af_supported,
        .cmp_addr      = sctp_inet_cmp_addr,
        .bind_verify   = sctp_inet_bind_verify,
        .send_verify   = sctp_inet_send_verify,
        .supported_addrs = sctp_inet_supported_addrs,
        .create_accept_sk = sctp_v4_create_accept_sk,
        .addr_to_user  = sctp_v4_addr_to_user,
        .to_sk_saddr   = sctp_v4_to_sk_saddr,
        .to_sk_daddr   = sctp_v4_to_sk_daddr,
        .copy_ip_options = sctp_v4_copy_ip_options,
        .af            = &sctp_af_inet
};

/* Notifier for inetaddr addition/deletion events.  Registered in
 * sctp_v4_add_protocol() and removed in sctp_v4_del_protocol(); events are
 * delivered to sctp_inetaddr_event().
 */
static struct notifier_block sctp_inetaddr_notifier = {
        .notifier_call = sctp_inetaddr_event,
};

/* Socket operations.
 *
 * Shared by both protosw entries below (SOCK_SEQPACKET and SOCK_STREAM).
 * Only connect, poll and listen use SCTP-specific handlers; the rest are
 * the generic inet/sock implementations.
 */
static const struct proto_ops inet_seqpacket_ops = {
        .family                   = PF_INET,
        .owner                   = THIS_MODULE,
        .release           = inet_release,        /* Needs to be wrapped... */
        .bind                   = inet_bind,
        .connect           = sctp_inet_connect,
        .socketpair           = sock_no_socketpair,
        .accept                   = inet_accept,
        .getname           = inet_getname,        /* Semantics are different.  */
        .poll                   = sctp_poll,
        .ioctl                   = inet_ioctl,
        .gettstamp           = sock_gettstamp,
        .listen                   = sctp_inet_listen,
        .shutdown           = inet_shutdown,        /* Looks harmless.  */
        .setsockopt           = sock_common_setsockopt, /* IP_SOL IP_OPTION is a problem */
        .getsockopt           = sock_common_getsockopt,
        .sendmsg           = inet_sendmsg,
        .recvmsg           = inet_recvmsg,
        .mmap                   = sock_no_mmap,
};

/* Registration with AF_INET family: the SOCK_SEQPACKET flavour of
 * IPPROTO_SCTP.  Registered by sctp_v4_protosw_init().
 */
static struct inet_protosw sctp_seqpacket_protosw = {
        .type       = SOCK_SEQPACKET,
        .protocol   = IPPROTO_SCTP,
        .prot       = &sctp_prot,
        .ops        = &inet_seqpacket_ops,
        .flags      = SCTP_PROTOSW_FLAG
};
/* The SOCK_STREAM flavour of IPPROTO_SCTP.  Apart from .type it is
 * identical to sctp_seqpacket_protosw, including the shared ops table.
 */
static struct inet_protosw sctp_stream_protosw = {
        .type       = SOCK_STREAM,
        .protocol   = IPPROTO_SCTP,
        .prot       = &sctp_prot,
        .ops        = &inet_seqpacket_ops,
        .flags      = SCTP_PROTOSW_FLAG
};

/* Receive handler for SCTP carried directly in IPv4 (see sctp_protocol).
 *
 * Resets the encapsulation port to 0 -- in contrast to sctp_udp_rcv(),
 * which records the UDP source port -- and hands the skb to sctp_rcv(),
 * returning its result.
 */
static int sctp4_rcv(struct sk_buff *skb)
{
        SCTP_INPUT_CB(skb)->encap_port = 0;
        return sctp_rcv(skb);
}

/* Register with IP layer.
 *
 * Protocol descriptor passed to inet_add_protocol() for IPPROTO_SCTP:
 * inbound packets go to sctp4_rcv(), errors to sctp_v4_err().
 */
static const struct net_protocol sctp_protocol = {
        .handler     = sctp4_rcv,
        .err_handler = sctp_v4_err,
        .no_policy   = 1,
        .icmp_strict_tag_validation = 1,
};

/* IPv4 address related functions.
 *
 * AF_INET operations table; registered through sctp_register_af() by
 * sctp_v4_pf_init() and referenced from sctp_pf_inet.af.
 */
static struct sctp_af sctp_af_inet = {
        .sa_family           = AF_INET,
        .sctp_xmit           = sctp_v4_xmit,
        .setsockopt           = ip_setsockopt,
        .getsockopt           = ip_getsockopt,
        .get_dst           = sctp_v4_get_dst,
        .get_saddr           = sctp_v4_get_saddr,
        .copy_addrlist           = sctp_v4_copy_addrlist,
        .from_skb           = sctp_v4_from_skb,
        .from_sk           = sctp_v4_from_sk,
        .from_addr_param   = sctp_v4_from_addr_param,
        .to_addr_param           = sctp_v4_to_addr_param,
        .cmp_addr           = sctp_v4_cmp_addr,
        .addr_valid           = sctp_v4_addr_valid,
        .inaddr_any           = sctp_v4_inaddr_any,
        .is_any                   = sctp_v4_is_any,
        .available           = sctp_v4_available,
        .scope                   = sctp_v4_scope,
        .skb_iif           = sctp_v4_skb_iif,
        .skb_sdif           = sctp_v4_skb_sdif,
        .is_ce                   = sctp_v4_is_ce,
        .seq_dump_addr           = sctp_v4_seq_dump_addr,
        .ecn_capable           = sctp_v4_ecn_capable,
        .net_header_len           = sizeof(struct iphdr),
        .sockaddr_len           = sizeof(struct sockaddr_in),
        .ip_options_len           = sctp_v4_ip_options_len,
};

/* Look up the protocol-family specific function table.
 *
 * Returns the PF_INET or PF_INET6 table as stored by sctp_register_pf()
 * (which may still be unset), or NULL for any other family.
 */
struct sctp_pf *sctp_get_pf_specific(sa_family_t family)
{
        if (family == PF_INET)
                return sctp_pf_inet_specific;

        if (family == PF_INET6)
                return sctp_pf_inet6_specific;

        return NULL;
}

/* Register the PF specific function table @pf for @family.
 *
 * Only PF_INET and PF_INET6 are accepted, and each family keeps the first
 * table registered for it.  Returns 1 when the table was stored, 0 when the
 * family is unsupported or already has a table.
 */
int sctp_register_pf(struct sctp_pf *pf, sa_family_t family)
{
        if (family == PF_INET) {
                if (sctp_pf_inet_specific)
                        return 0;
                sctp_pf_inet_specific = pf;
                return 1;
        }

        if (family == PF_INET6) {
                if (sctp_pf_inet6_specific)
                        return 0;
                sctp_pf_inet6_specific = pf;
                return 1;
        }

        return 0;
}

/* Allocate the per-cpu SCTP MIB counters for @net.
 * Returns 0 on success or -ENOMEM when the allocation fails.
 */
static inline int init_sctp_mibs(struct net *net)
{
        net->sctp.sctp_statistics = alloc_percpu(struct sctp_mib);

        return net->sctp.sctp_statistics ? 0 : -ENOMEM;
}

/* Free the per-cpu MIB counters allocated by init_sctp_mibs().  The pointer
 * in net->sctp.sctp_statistics is not reset afterwards.
 */
static inline void cleanup_sctp_mibs(struct net *net)
{
        free_percpu(net->sctp.sctp_statistics);
}

/* Register the IPv4 PF and AF function tables.  The return values of both
 * registration calls are ignored here.
 */
static void sctp_v4_pf_init(void)
{
        /* Initialize the SCTP specific PF functions. */
        sctp_register_pf(&sctp_pf_inet, PF_INET);
        sctp_register_af(&sctp_af_inet);
}

/* Unlink sctp_af_inet from the list it was put on by sctp_register_af().
 * Only the list linkage is undone; the registered PF/AF table pointers are
 * not cleared by this function.
 */
static void sctp_v4_pf_exit(void)
{
        list_del(&sctp_af_inet.list);
}

/* Register sctp_prot with the socket layer and, if that succeeds, both
 * IPv4 protosw entries.  Returns 0 on success or the proto_register()
 * error, in which case nothing else has been registered.
 */
static int sctp_v4_protosw_init(void)
{
        int err = proto_register(&sctp_prot, 1);

        if (!err) {
                /* Register SCTP(UDP and TCP style) with socket layer.  */
                inet_register_protosw(&sctp_seqpacket_protosw);
                inet_register_protosw(&sctp_stream_protosw);
        }

        return err;
}

/* Undo sctp_v4_protosw_init(): unregister both protosw entries and then
 * sctp_prot, in the reverse of the registration order.
 */
static void sctp_v4_protosw_exit(void)
{
        inet_unregister_protosw(&sctp_stream_protosw);
        inet_unregister_protosw(&sctp_seqpacket_protosw);
        proto_unregister(&sctp_prot);
}

/* Hook SCTP into the IPv4 stack: register the inetaddr notifier and the
 * IPPROTO_SCTP protocol handler.
 *
 * Returns 0 on success.  If the protocol handler cannot be added, the
 * notifier registered a moment earlier is removed again so that a failed
 * call leaves nothing registered, and -EAGAIN is returned.
 */
static int sctp_v4_add_protocol(void)
{
        /* Register notifier for inet address additions/deletions. */
        register_inetaddr_notifier(&sctp_inetaddr_notifier);

        /* Register SCTP with inet layer.  */
        if (inet_add_protocol(&sctp_protocol, IPPROTO_SCTP) < 0) {
                /* Roll back: do not leave the notifier behind on failure. */
                unregister_inetaddr_notifier(&sctp_inetaddr_notifier);
                return -EAGAIN;
        }

        return 0;
}

/* Undo sctp_v4_add_protocol(): remove the IPPROTO_SCTP handler first, then
 * the inetaddr notifier (reverse of the registration order).
 */
static void sctp_v4_del_protocol(void)
{
        inet_del_protocol(&sctp_protocol, IPPROTO_SCTP);
        unregister_inetaddr_notifier(&sctp_inetaddr_notifier);
}

/* Per-net init (sctp_defaults_ops.init): set the default SCTP parameters
 * for @net, then bring up the sysctl table, the MIB counters, the proc
 * directory (with CONFIG_PROC_FS), the local address list and the address
 * wait queue with its timer.
 *
 * Returns 0 on success.  If sysctl, MIB or proc setup fails, the steps that
 * already succeeded are undone in reverse order and the error is returned.
 */
static int __net_init sctp_defaults_init(struct net *net)
{
        int status;

        /*
         * 14. Suggested SCTP Protocol Parameter Values
         */
        /* The following protocol parameters are RECOMMENDED:  */
        /* RTO.Initial              - 3  seconds */
        net->sctp.rto_initial                        = SCTP_RTO_INITIAL;
        /* RTO.Min                  - 1  second */
        net->sctp.rto_min                         = SCTP_RTO_MIN;
        /* RTO.Max                 -  60 seconds */
        net->sctp.rto_max                         = SCTP_RTO_MAX;
        /* RTO.Alpha                - 1/8 */
        net->sctp.rto_alpha                        = SCTP_RTO_ALPHA;
        /* RTO.Beta                 - 1/4 */
        net->sctp.rto_beta                        = SCTP_RTO_BETA;

        /* Valid.Cookie.Life        - 60  seconds */
        net->sctp.valid_cookie_life                = SCTP_DEFAULT_COOKIE_LIFE;

        /* Whether Cookie Preservative is enabled(1) or not(0) */
        net->sctp.cookie_preserve_enable         = 1;

        /* Default cookie hmac alg is chosen at build time: md5, sha1, or
         * none (NULL) when neither config option is set.
         */
#if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5)
        net->sctp.sctp_hmac_alg                        = "md5";
#elif defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1)
        net->sctp.sctp_hmac_alg                        = "sha1";
#else
        net->sctp.sctp_hmac_alg                        = NULL;
#endif

        /* Max.Burst                    - 4 */
        net->sctp.max_burst                        = SCTP_DEFAULT_MAX_BURST;

        /* Disable of Primary Path Switchover by default */
        net->sctp.ps_retrans = SCTP_PS_RETRANS_MAX;

        /* Enable pf state by default */
        net->sctp.pf_enable = 1;

        /* Ignore pf exposure feature by default */
        net->sctp.pf_expose = SCTP_PF_EXPOSE_UNSET;

        /* Association.Max.Retrans  - 10 attempts
         * Path.Max.Retrans         - 5  attempts (per destination address)
         * Max.Init.Retransmits     - 8  attempts
         */
        net->sctp.max_retrans_association        = 10;
        net->sctp.max_retrans_path                = 5;
        net->sctp.max_retrans_init                = 8;

        /* Sendbuffer growth            - do per-socket accounting */
        net->sctp.sndbuf_policy                        = 0;

        /* Rcvbuffer growth            - do per-socket accounting */
        net->sctp.rcvbuf_policy                        = 0;

        /* HB.interval              - 30 seconds */
        net->sctp.hb_interval                        = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;

        /* delayed SACK timeout */
        net->sctp.sack_timeout                        = SCTP_DEFAULT_TIMEOUT_SACK;

        /* Disable ADDIP by default. */
        net->sctp.addip_enable = 0;
        net->sctp.addip_noauth = 0;
        net->sctp.default_auto_asconf = 0;

        /* Enable PR-SCTP by default. */
        net->sctp.prsctp_enable = 1;

        /* Disable RECONF by default. */
        net->sctp.reconf_enable = 0;

        /* Disable AUTH by default. */
        net->sctp.auth_enable = 0;

        /* Enable ECN by default. */
        net->sctp.ecn_enable = 1;

        /* Set UDP tunneling listening port to 0 by default */
        net->sctp.udp_port = 0;

        /* Set remote encap port to 0 by default */
        net->sctp.encap_port = 0;

        /* Set SCOPE policy to enabled */
        net->sctp.scope_policy = SCTP_SCOPE_POLICY_ENABLE;

        /* Set the default rwnd update threshold */
        net->sctp.rwnd_upd_shift = SCTP_DEFAULT_RWND_SHIFT;

        /* Initialize maximum autoclose timeout. */
        net->sctp.max_autoclose                = INT_MAX / HZ;

#ifdef CONFIG_NET_L3_MASTER_DEV
        net->sctp.l3mdev_accept = 1;
#endif

        status = sctp_sysctl_net_register(net);
        if (status)
                goto err_sysctl_register;

        /* Allocate and initialise sctp mibs.  */
        status = init_sctp_mibs(net);
        if (status)
                goto err_init_mibs;

#ifdef CONFIG_PROC_FS
        /* Initialize proc fs directory.  */
        status = sctp_proc_init(net);
        if (status)
                goto err_init_proc;
#endif

        sctp_dbg_objcnt_init(net);

        /* Initialize the local address list. */
        INIT_LIST_HEAD(&net->sctp.local_addr_list);
        spin_lock_init(&net->sctp.local_addr_lock);
        sctp_get_local_addr_list(net);

        /* Initialize the address event list */
        INIT_LIST_HEAD(&net->sctp.addr_waitq);
        INIT_LIST_HEAD(&net->sctp.auto_asconf_splist);
        spin_lock_init(&net->sctp.addr_wq_lock);
        net->sctp.addr_wq_timer.expires = 0;
        timer_setup(&net->sctp.addr_wq_timer, sctp_addr_wq_timeout_handler, 0);

        return 0;

        /* Error unwinding: each label undoes the step before the one that
         * failed and falls through to the labels below it.
         */
#ifdef CONFIG_PROC_FS
err_init_proc:
        cleanup_sctp_mibs(net);
#endif
err_init_mibs:
        sctp_sysctl_net_unregister(net);
err_sysctl_register:
        return status;
}

/* Per-net exit (sctp_defaults_ops.exit): release what sctp_defaults_init()
 * set up -- the address wait queue and its timer, the local address list,
 * the "sctp" proc subtree (with CONFIG_PROC_FS), the MIB counters and the
 * sysctl table.
 */
static void __net_exit sctp_defaults_exit(struct net *net)
{
        /* Free the address wait queue and the local address list */
        sctp_free_addr_wq(net);
        sctp_free_local_addr_list(net);

#ifdef CONFIG_PROC_FS
        remove_proc_subtree("sctp", net->proc_net);
        net->sctp.proc_net_sctp = NULL;
#endif
        cleanup_sctp_mibs(net);
        sctp_sysctl_net_unregister(net);
}

/* Per-net hooks that set up and tear down the SCTP defaults and per-net
 * state (see sctp_defaults_init() / sctp_defaults_exit()).
 */
static struct pernet_operations sctp_defaults_ops = {
        .init = sctp_defaults_init,
        .exit = sctp_defaults_exit,
};

/* Per-net init (sctp_ctrlsock_ops.init): create the control socket used
 * for handling OOTB packets.  Returns the result of sctp_ctl_sock_init(),
 * logging an error when it is non-zero.
 */
static int __net_init sctp_ctrlsock_init(struct net *net)
{
        int err = sctp_ctl_sock_init(net);

        if (err)
                pr_err("Failed to initialize the SCTP control sock\n");

        return err;
}

/* Per-net exit (sctp_ctrlsock_ops.exit): destroy the control socket that
 * sctp_ctl_sock_init() stored in net->sctp.ctl_sock.
 */
static void __net_exit sctp_ctrlsock_exit(struct net *net)
{
        /* Free the control endpoint.  */
        inet_ctl_sock_destroy(net->sctp.ctl_sock);
}

/* Per-net hooks that create and destroy the SCTP control socket (see
 * sctp_ctrlsock_init() / sctp_ctrlsock_exit()).
 */
static struct pernet_operations sctp_ctrlsock_ops = {
        .init = sctp_ctrlsock_init,
        .exit = sctp_ctrlsock_exit,
};

/* Initialize the universe into something sensible.
 *
 * Module init for SCTP.  In order, this:
 *   - creates the bind-bucket and chunk slab caches and the socket counter,
 *   - derives the default sysctl memory limits from nr_free_buffer_pages(),
 *   - allocates the endpoint, port and transport hash tables,
 *   - registers sysctls, address families, pernet operations, protosw
 *     entries and the IPv4/IPv6 protocol handlers.
 *
 * Returns 0 on success or a negative errno.  On failure, everything that
 * was set up is released in reverse order of acquisition; each error label
 * undoes exactly the steps that completed before the failing one.
 */
static __init int sctp_init(void)
{
        unsigned long nr_pages = totalram_pages();
        unsigned long limit;
        unsigned long goal;
        int max_entry_order;
        int num_entries;
        int max_share;
        int status;
        int order;
        int i;

        sock_skb_cb_check_size(sizeof(struct sctp_ulpevent));

        /* Allocate bind_bucket and chunk caches. */
        status = -ENOBUFS;
        sctp_bucket_cachep = KMEM_CACHE(sctp_bind_bucket, SLAB_HWCACHE_ALIGN);
        if (!sctp_bucket_cachep)
                goto out;

        sctp_chunk_cachep = KMEM_CACHE(sctp_chunk, SLAB_HWCACHE_ALIGN);
        if (!sctp_chunk_cachep)
                goto err_chunk_cachep;

        status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL);
        if (status)
                goto err_percpu_counter_init;

        /* Implementation specific variables. */

        /* Initialize default stream count setup information. */
        sctp_max_instreams                    = SCTP_DEFAULT_INSTREAMS;
        sctp_max_outstreams                   = SCTP_DEFAULT_OUTSTREAMS;

        /* Initialize handle used for association ids. */
        idr_init(&sctp_assocs_id);

        limit = nr_free_buffer_pages() / 8;
        limit = max(limit, 128UL);
        sysctl_sctp_mem[0] = limit / 4 * 3;
        sysctl_sctp_mem[1] = limit;
        sysctl_sctp_mem[2] = sysctl_sctp_mem[0] * 2;

        /* Set per-socket limits to no more than 1/128 the pressure threshold*/
        limit = (sysctl_sctp_mem[1]) << (PAGE_SHIFT - 7);
        max_share = min(4UL*1024*1024, limit);

        sysctl_sctp_rmem[0] = PAGE_SIZE; /* give each asoc 1 page min */
        sysctl_sctp_rmem[1] = 1500 * SKB_TRUESIZE(1);
        sysctl_sctp_rmem[2] = max(sysctl_sctp_rmem[1], max_share);

        sysctl_sctp_wmem[0] = PAGE_SIZE;
        sysctl_sctp_wmem[1] = 16*1024;
        sysctl_sctp_wmem[2] = max(64*1024, max_share);

        /* Size and allocate the association hash table.
         * The methodology is similar to that of the tcp hash tables.
         * Though not identical.  Start by getting a goal size
         */
        if (nr_pages >= (128 * 1024))
                goal = nr_pages >> (22 - PAGE_SHIFT);
        else
                goal = nr_pages >> (24 - PAGE_SHIFT);

        /* Then compute the page order for said goal */
        order = get_order(goal);

        /* Now compute the required page order for the maximum sized table we
         * want to create
         */
        max_entry_order = get_order(MAX_SCTP_PORT_HASH_ENTRIES *
                                    sizeof(struct sctp_bind_hashbucket));

        /* Limit the page order by that maximum hash table size */
        order = min(order, max_entry_order);

        /* Allocate and initialize the endpoint hash table.  */
        sctp_ep_hashsize = 64;
        sctp_ep_hashtable =
                kmalloc_array(64, sizeof(struct sctp_hashbucket), GFP_KERNEL);
        if (!sctp_ep_hashtable) {
                pr_err("Failed endpoint_hash alloc\n");
                status = -ENOMEM;
                goto err_ehash_alloc;
        }
        for (i = 0; i < sctp_ep_hashsize; i++) {
                rwlock_init(&sctp_ep_hashtable[i].lock);
                INIT_HLIST_HEAD(&sctp_ep_hashtable[i].chain);
        }

        /* Allocate and initialize the SCTP port hash table.
         * Note that order is initialized to start at the max sized
         * table we want to support.  If we can't get that many pages
         * reduce the order and try again
         */
        do {
                sctp_port_hashtable = (struct sctp_bind_hashbucket *)
                        __get_free_pages(GFP_KERNEL | __GFP_NOWARN, order);
        } while (!sctp_port_hashtable && --order > 0);

        if (!sctp_port_hashtable) {
                pr_err("Failed bind hash alloc\n");
                status = -ENOMEM;
                goto err_bhash_alloc;
        }

        /* Now compute the number of entries that will fit in the
         * port hash space we allocated
         */
        num_entries = (1UL << order) * PAGE_SIZE /
                      sizeof(struct sctp_bind_hashbucket);

        /* And finish by rounding it down to the nearest power of two.
         * This wastes some memory of course, but it's needed because
         * the hash function operates based on the assumption that
         * the number of entries is a power of two.
         */
        sctp_port_hashsize = rounddown_pow_of_two(num_entries);

        for (i = 0; i < sctp_port_hashsize; i++) {
                spin_lock_init(&sctp_port_hashtable[i].lock);
                INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain);
        }

        status = sctp_transport_hashtable_init();
        if (status)
                goto err_thash_alloc;

        pr_info("Hash tables configured (bind %d/%d)\n", sctp_port_hashsize,
                num_entries);

        sctp_sysctl_register();

        INIT_LIST_HEAD(&sctp_address_families);
        sctp_v4_pf_init();
        sctp_v6_pf_init();
        sctp_sched_ops_init();

        status = register_pernet_subsys(&sctp_defaults_ops);
        if (status)
                goto err_register_defaults;

        status = sctp_v4_protosw_init();
        if (status)
                goto err_protosw_init;

        status = sctp_v6_protosw_init();
        if (status)
                goto err_v6_protosw_init;

        status = register_pernet_subsys(&sctp_ctrlsock_ops);
        if (status)
                goto err_register_ctrlsock;

        status = sctp_v4_add_protocol();
        if (status)
                goto err_add_protocol;

        /* Register SCTP with inet6 layer.  */
        status = sctp_v6_add_protocol();
        if (status)
                goto err_v6_add_protocol;

        /* Offload is optional: a failure is reported but does not fail init. */
        if (sctp_offload_init() < 0)
                pr_crit("%s: Cannot add SCTP protocol offload\n", __func__);

out:
        return status;
err_v6_add_protocol:
        sctp_v4_del_protocol();
err_add_protocol:
        unregister_pernet_subsys(&sctp_ctrlsock_ops);
err_register_ctrlsock:
        sctp_v6_protosw_exit();
err_v6_protosw_init:
        sctp_v4_protosw_exit();
err_protosw_init:
        unregister_pernet_subsys(&sctp_defaults_ops);
err_register_defaults:
        sctp_v4_pf_exit();
        sctp_v6_pf_exit();
        sctp_sysctl_unregister();
        /* The hash tables are torn down in the reverse of their allocation
         * order (endpoint -> port -> transport), so that a failure at any
         * step neither destroys a table that was never set up nor leaks
         * one that was.
         */
        sctp_transport_hashtable_destroy();
err_thash_alloc:
        /* Free with the order that was actually allocated; recomputing it
         * from the rounded-down entry count could yield a smaller order.
         */
        free_pages((unsigned long)sctp_port_hashtable, order);
err_bhash_alloc:
        kfree(sctp_ep_hashtable);
err_ehash_alloc:
        percpu_counter_destroy(&sctp_sockets_allocated);
err_percpu_counter_init:
        kmem_cache_destroy(sctp_chunk_cachep);
err_chunk_cachep:
        kmem_cache_destroy(sctp_bucket_cachep);
        goto out;
}

/* Exit handler for the SCTP protocol.
 *
 * Undoes the registrations and allocations made by sctp_init(): protocol
 * handlers, pernet operations, protosw entries, address families, sysctls,
 * the three hash tables, the socket counter and finally the slab caches.
 */
static __exit void sctp_exit(void)
{
        /* BUG.  This should probably do something useful like clean
         * up all the remaining associations and all that memory.
         */

        /* Unregister with inet6/inet layers. */
        sctp_v6_del_protocol();
        sctp_v4_del_protocol();

        unregister_pernet_subsys(&sctp_ctrlsock_ops);

        /* Free protosw registrations */
        sctp_v6_protosw_exit();
        sctp_v4_protosw_exit();

        unregister_pernet_subsys(&sctp_defaults_ops);

        /* Unregister with socket layer. */
        sctp_v6_pf_exit();
        sctp_v4_pf_exit();

        sctp_sysctl_unregister();

        /* NOTE(review): the page order is recomputed from the rounded-down
         * sctp_port_hashsize rather than remembered from allocation time;
         * confirm it always equals the order passed to __get_free_pages().
         */
        free_pages((unsigned long)sctp_port_hashtable,
                   get_order(sctp_port_hashsize *
                             sizeof(struct sctp_bind_hashbucket)));
        kfree(sctp_ep_hashtable);
        sctp_transport_hashtable_destroy();

        percpu_counter_destroy(&sctp_sockets_allocated);

        rcu_barrier(); /* Wait for completion of call_rcu()'s */

        /* The caches are destroyed only after the RCU barrier above. */
        kmem_cache_destroy(sctp_chunk_cachep);
        kmem_cache_destroy(sctp_bucket_cachep);
}

module_init(sctp_init);
module_exit(sctp_exit);

/*
 * __stringify doesn't like enums, so use IPPROTO_SCTP value (132) directly.
 */
MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132");
MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132");
MODULE_AUTHOR("Linux Kernel SCTP developers <linux-sctp@vger.kernel.org>");
MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)");
module_param_named(no_checksums, sctp_checksum_disable, bool, 0644);
MODULE_PARM_DESC(no_checksums, "Disable checksums computing and verification");
MODULE_LICENSE("GPL");















































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * workqueue.h --- work queue handling for Linux.
 */

#ifndef _LINUX_WORKQUEUE_H
#define _LINUX_WORKQUEUE_H

#include <linux/timer.h>
#include <linux/linkage.h>
#include <linux/bitops.h>
#include <linux/lockdep.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cpumask.h>
#include <linux/rcupdate.h>
#include <linux/workqueue_types.h>

/*
 * The first word is the work queue pointer and the flags rolled into
 * one.
 *
 * work_data_bits() yields the address of @work's ->data member cast to
 * unsigned long *, the form expected by bit helpers such as the
 * test_bit() call in work_pending().
 */
#define work_data_bits(work) ((unsigned long *)(&(work)->data))

/*
 * Bit positions and field widths used to pack state into the data word of
 * a work item (see work_data_bits()). The low WORK_STRUCT_FLAG_BITS bits
 * hold the WORK_STRUCT_* flags; the meaning of the bits above them depends
 * on whether WORK_STRUCT_PWQ is set, as described by the two layouts below.
 * The number of flag bits grows by one when CONFIG_DEBUG_OBJECTS_WORK is
 * enabled, which shifts every field defined after it.
 */
enum work_bits {
        WORK_STRUCT_PENDING_BIT        = 0,        /* work item is pending execution */
        WORK_STRUCT_INACTIVE_BIT,        /* work item is inactive */
        WORK_STRUCT_PWQ_BIT,                /* data points to pwq */
        WORK_STRUCT_LINKED_BIT,                /* next work is linked to this one */
#ifdef CONFIG_DEBUG_OBJECTS_WORK
        WORK_STRUCT_STATIC_BIT,                /* static initializer (debugobjects) */
#endif
        WORK_STRUCT_FLAG_BITS,

        /* color for workqueue flushing */
        WORK_STRUCT_COLOR_SHIFT        = WORK_STRUCT_FLAG_BITS,
        WORK_STRUCT_COLOR_BITS        = 4,

        /*
         * When WORK_STRUCT_PWQ is set, reserve 8 bits off of pwq pointer w/
         * debugobjects turned off. This makes pwqs aligned to 256 bytes (512
         * bytes w/ DEBUG_OBJECTS_WORK) and allows 16 workqueue flush colors.
         *
         * MSB
         * [ pwq pointer ] [ flush color ] [ STRUCT flags ]
         *                     4 bits        4 or 5 bits
         */
        WORK_STRUCT_PWQ_SHIFT        = WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS,

        /*
         * data contains off-queue information when !WORK_STRUCT_PWQ.
         *
         * MSB
         * [ pool ID ] [ disable depth ] [ OFFQ flags ] [ STRUCT flags ]
         *                  16 bits          1 bit        4 or 5 bits
         */
        WORK_OFFQ_FLAG_SHIFT        = WORK_STRUCT_FLAG_BITS,
        WORK_OFFQ_BH_BIT        = WORK_OFFQ_FLAG_SHIFT,
        WORK_OFFQ_FLAG_END,
        WORK_OFFQ_FLAG_BITS        = WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT,

        WORK_OFFQ_DISABLE_SHIFT        = WORK_OFFQ_FLAG_SHIFT + WORK_OFFQ_FLAG_BITS,
        WORK_OFFQ_DISABLE_BITS        = 16,

        /*
         * When a work item is off queue, the high bits encode off-queue flags
         * and the last pool it was on. Cap pool ID to 31 bits and use the
         * highest number to indicate that no pool is associated.
         */
        WORK_OFFQ_POOL_SHIFT        = WORK_OFFQ_DISABLE_SHIFT + WORK_OFFQ_DISABLE_BITS,
        WORK_OFFQ_LEFT                = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT,
        WORK_OFFQ_POOL_BITS        = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31,
};

/*
 * Mask values for the WORK_STRUCT_*_BIT positions defined in enum
 * work_bits. WORK_STRUCT_STATIC is 0 when CONFIG_DEBUG_OBJECTS_WORK is
 * disabled, so OR-ing or testing it is then a no-op.
 */
enum work_flags {
        WORK_STRUCT_PENDING        = 1 << WORK_STRUCT_PENDING_BIT,
        WORK_STRUCT_INACTIVE        = 1 << WORK_STRUCT_INACTIVE_BIT,
        WORK_STRUCT_PWQ                = 1 << WORK_STRUCT_PWQ_BIT,
        WORK_STRUCT_LINKED        = 1 << WORK_STRUCT_LINKED_BIT,
#ifdef CONFIG_DEBUG_OBJECTS_WORK
        WORK_STRUCT_STATIC        = 1 << WORK_STRUCT_STATIC_BIT,
#else
        WORK_STRUCT_STATIC        = 0,
#endif
};

/* Miscellaneous workqueue constants that are not bit positions or masks. */
enum wq_misc_consts {
        /* number of flush colors representable in WORK_STRUCT_COLOR_BITS */
        WORK_NR_COLORS                = (1 << WORK_STRUCT_COLOR_BITS),

        /* not bound to any CPU, prefer the local CPU */
        WORK_CPU_UNBOUND        = NR_CPUS,

        /* bit mask for work_busy() return values */
        WORK_BUSY_PENDING        = 1 << 0,
        WORK_BUSY_RUNNING        = 1 << 1,

        /* maximum string length for set_worker_desc() */
        WORKER_DESC_LEN                = 24,
};

/*
 * Convenience constants - of type 'unsigned long', not 'enum'!
 *
 * These are built with 1ul shifts from the positions and widths in enum
 * work_bits: the BH flag, masks covering the off-queue flag and disable
 * depth fields, the all-ones pool ID that stands for "no pool" (both as a
 * bare ID and shifted into place), and the mask selecting the pwq pointer
 * bits at and above WORK_STRUCT_PWQ_SHIFT.
 */
#define WORK_OFFQ_BH                (1ul << WORK_OFFQ_BH_BIT)
#define WORK_OFFQ_FLAG_MASK        (((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT)
#define WORK_OFFQ_DISABLE_MASK        (((1ul << WORK_OFFQ_DISABLE_BITS) - 1) << WORK_OFFQ_DISABLE_SHIFT)
#define WORK_OFFQ_POOL_NONE        ((1ul << WORK_OFFQ_POOL_BITS) - 1)
#define WORK_STRUCT_NO_POOL        (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT)
#define WORK_STRUCT_PWQ_MASK        (~((1ul << WORK_STRUCT_PWQ_SHIFT) - 1))

/*
 * Initial ->data values: both start with no pool associated; the static
 * variant additionally sets WORK_STRUCT_STATIC.
 */
#define WORK_DATA_INIT()        ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL)
#define WORK_DATA_STATIC_INIT()        \
        ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC))

/*
 * A work item paired with a timer. The initializers in this header arm
 * ->timer with delayed_work_timer_fn; ->wq and ->cpu record where ->work
 * is to be queued by that timer.
 */
struct delayed_work {
        struct work_struct work;
        struct timer_list timer;

        /* target workqueue and CPU ->timer uses to queue ->work */
        struct workqueue_struct *wq;
        int cpu;
};

/*
 * A work item paired with an rcu_head; ->wq records the workqueue that
 * ->rcu uses to queue ->work.
 */
struct rcu_work {
        struct work_struct work;
        struct rcu_head rcu;

        /* target workqueue ->rcu uses to queue ->work */
        struct workqueue_struct *wq;
};

/*
 * Affinity scopes for unbound workqueues, i.e. how CPUs are grouped into
 * pods (see workqueue_attrs.affn_scope). WQ_AFFN_NR_TYPES is the number of
 * scopes, not a scope itself.
 */
enum wq_affn_scope {
        WQ_AFFN_DFL,                        /* use system default */
        WQ_AFFN_CPU,                        /* one pod per CPU */
        WQ_AFFN_SMT,                        /* one pod per SMT */
        WQ_AFFN_CACHE,                        /* one pod per LLC */
        WQ_AFFN_NUMA,                        /* one pod per NUMA node */
        WQ_AFFN_SYSTEM,                        /* one pod across the whole system */

        WQ_AFFN_NR_TYPES,
};

/**
 * struct workqueue_attrs - A struct for workqueue attributes.
 *
 * This can be used to change attributes of an unbound workqueue.
 */
struct workqueue_attrs {
        /**
         * @nice: nice level
         */
        int nice;

        /**
         * @cpumask: allowed CPUs
         *
         * Work items in this workqueue are affine to these CPUs and not allowed
         * to execute on other CPUs. A pool serving a workqueue must have the
         * same @cpumask.
         */
        cpumask_var_t cpumask;

        /**
         * @__pod_cpumask: internal attribute used to create per-pod pools
         *
         * Internal use only.
         *
         * Per-pod unbound worker pools are used to improve locality. Always a
         * subset of ->cpumask. A workqueue can be associated with multiple
         * worker pools with disjoint @__pod_cpumask's. Whether the enforcement
         * of a pool's @__pod_cpumask is strict depends on @affn_strict.
         */
        cpumask_var_t __pod_cpumask;

        /**
         * @affn_strict: affinity scope is strict
         *
         * If clear, workqueue will make a best-effort attempt at starting the
         * worker inside @__pod_cpumask but the scheduler is free to migrate it
         * outside.
         *
         * If set, workers are only allowed to run inside @__pod_cpumask.
         */
        bool affn_strict;

        /*
         * The fields below aren't properties of a worker_pool. They only
         * modify how :c:func:`apply_workqueue_attrs` selects pools and thus
         * don't participate in pool hash calculations or equality comparisons.
         *
         * If @affn_strict is set, @cpumask isn't a property of a worker_pool
         * either.
         */

        /**
         * @affn_scope: unbound CPU affinity scope
         *
         * CPU pods are used to improve execution locality of unbound work
         * items. There are multiple pod types, one for each wq_affn_scope, and
         * every CPU in the system belongs to one pod in every pod type. CPUs
         * that belong to the same pod share the worker pool. For example,
         * selecting %WQ_AFFN_NUMA makes the workqueue use a separate worker
         * pool for each NUMA node.
         */
        enum wq_affn_scope affn_scope;

        /**
         * @ordered: work items must be executed one by one in queueing order
         */
        bool ordered;
};

/* Map an embedded work_struct back to the delayed_work that contains it. */
static inline struct delayed_work *to_delayed_work(struct work_struct *work)
{
        struct delayed_work *dwork;

        dwork = container_of(work, struct delayed_work, work);
        return dwork;
}

/* Map an embedded work_struct back to the rcu_work that contains it. */
static inline struct rcu_work *to_rcu_work(struct work_struct *work)
{
        struct rcu_work *rwork;

        rwork = container_of(work, struct rcu_work, work);
        return rwork;
}

/*
 * Wrapper around a single work_struct.
 * NOTE(review): its users are not visible in this part of the header;
 * presumably it is the argument type of execute_in_process_context() --
 * confirm against the declaration further down the file.
 */
struct execute_work {
        struct work_struct work;
};

#ifdef CONFIG_LOCKDEP
/*
 * NB: because we have to copy the lockdep_map, setting _key
 * here is required, otherwise it could get initialised to the
 * copy of the lockdep_map!
 *
 * Expands to a designated initializer for ->lockdep_map (including the
 * trailing comma) for name @n and key @k; expands to nothing when
 * CONFIG_LOCKDEP is disabled.
 */
#define __WORK_INIT_LOCKDEP_MAP(n, k) \
        .lockdep_map = STATIC_LOCKDEP_MAP_INIT(n, k),
#else
#define __WORK_INIT_LOCKDEP_MAP(n, k)
#endif

/*
 * Static initializer for a work_struct named @n running @f: ->data starts
 * as WORK_DATA_STATIC_INIT(), ->entry points at itself (an empty list), and
 * the lockdep map, if any, is keyed on @n itself.
 */
#define __WORK_INITIALIZER(n, f) {                                        \
        .data = WORK_DATA_STATIC_INIT(),                                \
        .entry        = { &(n).entry, &(n).entry },                                \
        .func = (f),                                                        \
        __WORK_INIT_LOCKDEP_MAP(#n, &(n))                                \
        }

/*
 * Static initializer for a delayed_work named @n: initializes the embedded
 * work item as above and sets up ->timer with delayed_work_timer_fn.
 * TIMER_IRQSAFE is always OR-ed into the caller-supplied @tflags.
 */
#define __DELAYED_WORK_INITIALIZER(n, f, tflags) {                        \
        .work = __WORK_INITIALIZER((n).work, (f)),                        \
        .timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\
                                     (tflags) | TIMER_IRQSAFE),                \
        }

/* Define and statically initialize a work_struct @n that runs @f. */
#define DECLARE_WORK(n, f)                                                \
        struct work_struct n = __WORK_INITIALIZER(n, f)

/* Define and statically initialize a delayed_work @n with no extra timer flags. */
#define DECLARE_DELAYED_WORK(n, f)                                        \
        struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0)

/* Same as DECLARE_DELAYED_WORK() but the timer is created with TIMER_DEFERRABLE. */
#define DECLARE_DEFERRABLE_WORK(n, f)                                        \
        struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, TIMER_DEFERRABLE)

/*
 * debugobjects hooks for work items. With CONFIG_DEBUG_OBJECTS_WORK the
 * init/destroy helpers are provided out of line and work_static() reports
 * whether WORK_STRUCT_STATIC is set in the work's data word; without it
 * they are empty inline stubs and work_static() always returns 0.
 */
#ifdef CONFIG_DEBUG_OBJECTS_WORK
extern void __init_work(struct work_struct *work, int onstack);
extern void destroy_work_on_stack(struct work_struct *work);
extern void destroy_delayed_work_on_stack(struct delayed_work *work);
static inline unsigned int work_static(struct work_struct *work)
{
        return *work_data_bits(work) & WORK_STRUCT_STATIC;
}
#else
static inline void __init_work(struct work_struct *work, int onstack) { }
static inline void destroy_work_on_stack(struct work_struct *work) { }
static inline void destroy_delayed_work_on_stack(struct delayed_work *work) { }
static inline unsigned int work_static(struct work_struct *work) { return 0; }
#endif

/*
 * initialize all of a work item in one go
 *
 * NOTE! No point in using "atomic_long_set()": using a direct
 * assignment of the work data initializer allows the compiler
 * to generate better code.
 *
 * @_work:    work item to initialize
 * @_func:    function stored in ->func
 * @_onstack: passed through to __init_work()
 * @_key:     lock class key for the lockdep map; only used when
 *            CONFIG_LOCKDEP is enabled
 *
 * Both variants call __init_work(), reset ->data to WORK_DATA_INIT(),
 * make ->entry an empty list and set ->func; the CONFIG_LOCKDEP variant
 * additionally initializes ->lockdep_map.
 */
#ifdef CONFIG_LOCKDEP
#define __INIT_WORK_KEY(_work, _func, _onstack, _key)                        \
        do {                                                                \
                __init_work((_work), _onstack);                                \
                (_work)->data = (atomic_long_t) WORK_DATA_INIT();        \
                lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, (_key), 0); \
                INIT_LIST_HEAD(&(_work)->entry);                        \
                (_work)->func = (_func);                                \
        } while (0)
#else
#define __INIT_WORK_KEY(_work, _func, _onstack, _key)                        \
        do {                                                                \
                __init_work((_work), _onstack);                                \
                (_work)->data = (atomic_long_t) WORK_DATA_INIT();        \
                INIT_LIST_HEAD(&(_work)->entry);                        \
                (_work)->func = (_func);                                \
        } while (0)
#endif

/*
 * Runtime initialization with an implicit lock class key: each expansion
 * site gets its own static lock_class_key, which is handed to
 * __INIT_WORK_KEY().
 */
#define __INIT_WORK(_work, _func, _onstack)                                \
        do {                                                                \
                static __maybe_unused struct lock_class_key __key;        \
                                                                        \
                __INIT_WORK_KEY(_work, _func, _onstack, &__key);        \
        } while (0)

/* Initialize @_work to run @_func (passes onstack = 0). */
#define INIT_WORK(_work, _func)                                                \
        __INIT_WORK((_work), (_func), 0)

/* As INIT_WORK(), but passes onstack = 1 to __init_work(). */
#define INIT_WORK_ONSTACK(_work, _func)                                        \
        __INIT_WORK((_work), (_func), 1)

/* As INIT_WORK_ONSTACK(), with a caller-supplied lock class key. */
#define INIT_WORK_ONSTACK_KEY(_work, _func, _key)                        \
        __INIT_WORK_KEY((_work), (_func), 1, _key)

/*
 * Initialize a delayed_work: set up the embedded work item and its timer,
 * which runs delayed_work_timer_fn. TIMER_IRQSAFE is always OR-ed into
 * @_tflags.
 */
#define __INIT_DELAYED_WORK(_work, _func, _tflags)                        \
        do {                                                                \
                INIT_WORK(&(_work)->work, (_func));                        \
                __init_timer(&(_work)->timer,                                \
                             delayed_work_timer_fn,                        \
                             (_tflags) | TIMER_IRQSAFE);                \
        } while (0)

/* On-stack variant: uses INIT_WORK_ONSTACK() and __init_timer_on_stack(). */
#define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags)                \
        do {                                                                \
                INIT_WORK_ONSTACK(&(_work)->work, (_func));                \
                __init_timer_on_stack(&(_work)->timer,                        \
                                      delayed_work_timer_fn,                \
                                      (_tflags) | TIMER_IRQSAFE);        \
        } while (0)

/* Delayed work with no extra timer flags. */
#define INIT_DELAYED_WORK(_work, _func)                                        \
        __INIT_DELAYED_WORK(_work, _func, 0)

#define INIT_DELAYED_WORK_ONSTACK(_work, _func)                                \
        __INIT_DELAYED_WORK_ONSTACK(_work, _func, 0)

/* Delayed work whose timer is created with TIMER_DEFERRABLE. */
#define INIT_DEFERRABLE_WORK(_work, _func)                                \
        __INIT_DELAYED_WORK(_work, _func, TIMER_DEFERRABLE)

#define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func)                        \
        __INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE)

/* An rcu_work only needs its embedded work item initialized here. */
#define INIT_RCU_WORK(_work, _func)                                        \
        INIT_WORK(&(_work)->work, (_func))

#define INIT_RCU_WORK_ONSTACK(_work, _func)                                \
        INIT_WORK_ONSTACK(&(_work)->work, (_func))

/**
 * work_pending - Find out whether a work item is currently pending
 * @work: The work item in question
 *
 * Tests WORK_STRUCT_PENDING_BIT in the work item's data word.
 */
#define work_pending(work) \
        test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))

/**
 * delayed_work_pending - Find out whether a delayable work item is currently
 * pending
 * @w: The work item in question
 *
 * Applies work_pending() to the work item embedded in @w.
 */
#define delayed_work_pending(w) \
        work_pending(&(w)->work)

/*
 * Workqueue flags and constants.  For details, please refer to
 * Documentation/core-api/workqueue.rst.
 *
 * The WQ_* values are the flags passed as @flags to alloc_workqueue();
 * the __WQ_* values are marked internal below.
 */
enum wq_flags {
        WQ_BH                        = 1 << 0, /* execute in bottom half (softirq) context */
        WQ_UNBOUND                = 1 << 1, /* not bound to any cpu */
        WQ_FREEZABLE                = 1 << 2, /* freeze during suspend */
        WQ_MEM_RECLAIM                = 1 << 3, /* may be used for memory reclaim */
        WQ_HIGHPRI                = 1 << 4, /* high priority */
        WQ_CPU_INTENSIVE        = 1 << 5, /* cpu intensive workqueue */
        WQ_SYSFS                = 1 << 6, /* visible in sysfs, see workqueue_sysfs_register() */

        /*
         * Per-cpu workqueues are generally preferred because they tend to
         * show better performance thanks to cache locality.  Per-cpu
         * workqueues exclude the scheduler from choosing the CPU to
         * execute the worker threads, which has an unfortunate side effect
         * of increasing power consumption.
         *
         * The scheduler considers a CPU idle if it doesn't have any task
         * to execute and tries to keep idle cores idle to conserve power;
         * however, for example, a per-cpu work item scheduled from an
         * interrupt handler on an idle CPU will force the scheduler to
         * execute the work item on that CPU breaking the idleness, which in
         * turn may lead to more scheduling choices which are sub-optimal
         * in terms of power consumption.
         *
         * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default
         * but become unbound if workqueue.power_efficient kernel param is
         * specified.  Per-cpu workqueues which are identified to
         * contribute significantly to power-consumption are identified and
         * marked with this flag and enabling the power_efficient mode
         * leads to noticeable power saving at the cost of small
         * performance disadvantage.
         *
         * http://thread.gmane.org/gmane.linux.kernel/1480396
         */
        WQ_POWER_EFFICIENT        = 1 << 7,

        __WQ_DESTROYING                = 1 << 15, /* internal: workqueue is destroying */
        __WQ_DRAINING                = 1 << 16, /* internal: workqueue is draining */
        __WQ_ORDERED                = 1 << 17, /* internal: workqueue is ordered */
        __WQ_LEGACY                = 1 << 18, /* internal: create*_workqueue() */

        /* BH wq only allows the following flags */
        __WQ_BH_ALLOWS                = WQ_BH | WQ_HIGHPRI,
};

/*
 * Limits and defaults for the max_active / min_active settings of a
 * workqueue. The unbound limit equals WQ_MAX_ACTIVE and the default is
 * half of WQ_MAX_ACTIVE.
 */
enum wq_consts {
        WQ_MAX_ACTIVE                = 512,          /* I like 512, better ideas? */
        WQ_UNBOUND_MAX_ACTIVE        = WQ_MAX_ACTIVE,
        WQ_DFL_ACTIVE                = WQ_MAX_ACTIVE / 2,

        /*
         * Per-node default cap on min_active. Unless explicitly set, min_active
         * is set to min(max_active, WQ_DFL_MIN_ACTIVE). For more details, see
         * workqueue_struct->min_active definition.
         */
        WQ_DFL_MIN_ACTIVE        = 8,
};

/*
 * System-wide workqueues which are always present.
 *
 * system_wq is the one used by schedule[_delayed]_work[_on]().
 * Multi-CPU multi-threaded.  There are users which expect relatively
 * short queue flush time.  Don't queue works which can run for too
 * long.
 *
 * system_highpri_wq is similar to system_wq but for work items which
 * require WQ_HIGHPRI.
 *
 * system_long_wq is similar to system_wq but may host long running
 * works.  Queue flushing might take relatively long.
 *
 * system_unbound_wq is unbound workqueue.  Workers are not bound to
 * any specific CPU, not concurrency managed, and all queued works are
 * executed immediately as long as max_active limit is not reached and
 * resources are available.
 *
 * system_freezable_wq is equivalent to system_wq except that it's
 * freezable.
 *
 * *_power_efficient_wq are inclined towards saving power and converted
 * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise,
 * they are same as their non-power-efficient counterparts - e.g.
 * system_power_efficient_wq is identical to system_wq if
 * 'wq_power_efficient' is disabled.  See WQ_POWER_EFFICIENT for more info.
 *
 * system_bh[_highpri]_wq are convenience interface to softirq. BH work items
 * are executed in the queueing CPU's BH context in the queueing order.
 */
extern struct workqueue_struct *system_wq;
extern struct workqueue_struct *system_highpri_wq;
extern struct workqueue_struct *system_long_wq;
extern struct workqueue_struct *system_unbound_wq;
extern struct workqueue_struct *system_freezable_wq;
extern struct workqueue_struct *system_power_efficient_wq;
extern struct workqueue_struct *system_freezable_power_efficient_wq;
extern struct workqueue_struct *system_bh_wq;
extern struct workqueue_struct *system_bh_highpri_wq;

void workqueue_softirq_action(bool highpri);
void workqueue_softirq_dead(unsigned int cpu);

/**
 * alloc_workqueue - allocate a workqueue
 * @fmt: printf format for the name of the workqueue
 * @flags: WQ_* flags
 * @max_active: max in-flight work items, 0 for default
 * @...: args for @fmt
 *
 * For a per-cpu workqueue, @max_active limits the number of in-flight work
 * items for each CPU. e.g. @max_active of 1 indicates that each CPU can be
 * executing at most one work item for the workqueue.
 *
 * For unbound workqueues, @max_active limits the number of in-flight work items
 * for the whole system. e.g. @max_active of 16 indicates that there can be
 * at most 16 work items executing for the workqueue in the whole system.
 *
 * As sharing the same active counter for an unbound workqueue across multiple
 * NUMA nodes can be expensive, @max_active is distributed to each NUMA node
 * according to the proportion of the number of online CPUs and enforced
 * independently.
 *
 * Depending on online CPU distribution, a node may end up with per-node
 * max_active which is significantly lower than @max_active, which can lead to
 * deadlocks if the per-node concurrency limit is lower than the maximum number
 * of interdependent work items for the workqueue.
 *
 * To guarantee forward progress regardless of online CPU distribution, the
 * concurrency limit on every node is guaranteed to be equal to or greater than
 * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means
 * that the sum of per-node max_active's may be larger than @max_active.
 *
 * For detailed information on %WQ_* flags, please refer to
 * Documentation/core-api/workqueue.rst.
 *
 * RETURNS:
 * Pointer to the allocated workqueue on success, %NULL on failure.
 */
__printf(1, 4) struct workqueue_struct *
alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...);

/**
 * alloc_ordered_workqueue - allocate an ordered workqueue
 * @fmt: printf format for the name of the workqueue
 * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful)
 * @args: args for @fmt
 *
 * Allocate an ordered workqueue.  An ordered workqueue executes at
 * most one work item at any given time in the queued order.  They are
 * implemented as unbound workqueues with @max_active of one.
 *
 * The macro expands to alloc_workqueue() with %WQ_UNBOUND and %__WQ_ORDERED
 * OR'ed into @flags and a fixed max_active argument of 1.
 *
 * RETURNS:
 * Pointer to the allocated workqueue on success, %NULL on failure.
 */
#define alloc_ordered_workqueue(fmt, flags, args...)                        \
        alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args)

/*
 * Legacy creation helpers.  Each one expands to alloc_workqueue() (directly
 * or through alloc_ordered_workqueue()) with a "%s" name format, with
 * __WQ_LEGACY and WQ_MEM_RECLAIM set, and with max_active of 1:
 *
 * create_workqueue()              - no further flags.
 * create_freezable_workqueue()    - additionally WQ_FREEZABLE | WQ_UNBOUND.
 * create_singlethread_workqueue() - ordered, via alloc_ordered_workqueue().
 */
#define create_workqueue(name)                                                \
        alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))
#define create_freezable_workqueue(name)                                \
        alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND |        \
                        WQ_MEM_RECLAIM, 1, (name))
#define create_singlethread_workqueue(name)                                \
        alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)

/*
 * from_work - get the structure that embeds a work member
 * @var: pointer whose pointed-to type is the containing structure type;
 *       it is only used as the operand of typeof()
 * @callback_work: pointer to the embedded member
 * @work_fieldname: name of that member inside the containing structure
 *
 * Expands to container_of() with typeof(*@var) as the container type.
 */
#define from_work(var, callback_work, work_fieldname)        \
        container_of(callback_work, typeof(*var), work_fieldname)

extern void destroy_workqueue(struct workqueue_struct *wq);

struct workqueue_attrs *alloc_workqueue_attrs(void);
void free_workqueue_attrs(struct workqueue_attrs *attrs);
int apply_workqueue_attrs(struct workqueue_struct *wq,
                          const struct workqueue_attrs *attrs);
extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask);

extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
                        struct work_struct *work);
extern bool queue_work_node(int node, struct workqueue_struct *wq,
                            struct work_struct *work);
extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
                        struct delayed_work *work, unsigned long delay);
extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
                        struct delayed_work *dwork, unsigned long delay);
extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork);

extern void __flush_workqueue(struct workqueue_struct *wq);
extern void drain_workqueue(struct workqueue_struct *wq);

extern int schedule_on_each_cpu(work_func_t func);

int execute_in_process_context(work_func_t fn, struct execute_work *);

extern bool flush_work(struct work_struct *work);
extern bool cancel_work(struct work_struct *work);
extern bool cancel_work_sync(struct work_struct *work);

extern bool flush_delayed_work(struct delayed_work *dwork);
extern bool cancel_delayed_work(struct delayed_work *dwork);
extern bool cancel_delayed_work_sync(struct delayed_work *dwork);

extern bool disable_work(struct work_struct *work);
extern bool disable_work_sync(struct work_struct *work);
extern bool enable_work(struct work_struct *work);

extern bool disable_delayed_work(struct delayed_work *dwork);
extern bool disable_delayed_work_sync(struct delayed_work *dwork);
extern bool enable_delayed_work(struct delayed_work *dwork);

extern bool flush_rcu_work(struct rcu_work *rwork);

extern void workqueue_set_max_active(struct workqueue_struct *wq,
                                     int max_active);
extern void workqueue_set_min_active(struct workqueue_struct *wq,
                                     int min_active);
extern struct work_struct *current_work(void);
extern bool current_is_workqueue_rescuer(void);
extern bool workqueue_congested(int cpu, struct workqueue_struct *wq);
extern unsigned int work_busy(struct work_struct *work);
extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
extern void print_worker_info(const char *log_lvl, struct task_struct *task);
extern void show_all_workqueues(void);
extern void show_freezable_workqueues(void);
extern void show_one_workqueue(struct workqueue_struct *wq);
extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task);

/**
 * queue_work - queue work on a workqueue
 * @wq: workqueue to use
 * @work: work to queue
 *
 * Returns %false if @work was already on a queue, %true otherwise.
 *
 * We queue the work to the CPU on which it was submitted, but if the CPU dies
 * it can be processed by another CPU.
 *
 * Memory-ordering properties:  If it returns %true, guarantees that all stores
 * preceding the call to queue_work() in the program order will be visible from
 * the CPU which will execute @work by the time such work executes, e.g.,
 *
 * { x is initially 0 }
 *
 *   CPU0                                CPU1
 *
 *   WRITE_ONCE(x, 1);                        [ @work is being executed ]
 *   r0 = queue_work(wq, work);                  r1 = READ_ONCE(x);
 *
 * Forbids: r0 == true && r1 == 0
 */
static inline bool queue_work(struct workqueue_struct *wq,
                              struct work_struct *work)
{
        /* Delegate to the _on() variant with WORK_CPU_UNBOUND as the CPU. */
        const int cpu = WORK_CPU_UNBOUND;

        return queue_work_on(cpu, wq, work);
}

/**
 * queue_delayed_work - queue work on a workqueue after delay
 * @wq: workqueue to use
 * @dwork: delayable work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
 */
static inline bool queue_delayed_work(struct workqueue_struct *wq,
                                      struct delayed_work *dwork,
                                      unsigned long delay)
{
        /* Delegate to the _on() variant with WORK_CPU_UNBOUND as the CPU. */
        const int cpu = WORK_CPU_UNBOUND;

        return queue_delayed_work_on(cpu, wq, dwork, delay);
}

/**
 * mod_delayed_work - modify delay of or queue a delayed work
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * mod_delayed_work_on() on local CPU.
 */
static inline bool mod_delayed_work(struct workqueue_struct *wq,
                                    struct delayed_work *dwork,
                                    unsigned long delay)
{
        /* Delegate to the _on() variant with WORK_CPU_UNBOUND as the CPU. */
        const int cpu = WORK_CPU_UNBOUND;

        return mod_delayed_work_on(cpu, wq, dwork, delay);
}

/**
 * schedule_work_on - put work task on a specific cpu
 * @cpu: cpu to put the work task on
 * @work: job to be done
 *
 * This puts a job on a specific cpu
 */
static inline bool schedule_work_on(int cpu, struct work_struct *work)
{
        /* The target is always system_wq; only the CPU is caller-chosen. */
        struct workqueue_struct *wq = system_wq;

        return queue_work_on(cpu, wq, work);
}

/**
 * schedule_work - put work task in global workqueue
 * @work: job to be done
 *
 * Returns %false if @work was already on the kernel-global workqueue and
 * %true otherwise.
 *
 * This puts a job in the kernel-global workqueue if it was not already
 * queued and leaves it in the same position on the kernel-global
 * workqueue otherwise.
 *
 * Shares the same memory-ordering properties of queue_work(), cf. the
 * DocBook header of queue_work().
 */
static inline bool schedule_work(struct work_struct *work)
{
        /* Queue on system_wq through the generic queue_work() helper. */
        struct workqueue_struct *wq = system_wq;

        return queue_work(wq, work);
}

/**
 * enable_and_queue_work - Enable and queue a work item on a specific workqueue
 * @wq: The target workqueue
 * @work: The work item to be enabled and queued
 *
 * This function combines the operations of enable_work() and queue_work(),
 * providing a convenient way to enable and queue a work item in a single call.
 * It invokes enable_work() on @work and then queues it if the disable depth
 * reached 0. Returns %true if the disable depth reached 0 and @work is queued,
 * and %false otherwise.
 *
 * Note that @work is always queued when disable depth reaches zero. If the
 * desired behavior is queueing only if certain events took place while @work is
 * disabled, the user should implement the necessary state tracking and perform
 * explicit conditional queueing after enable_work().
 */
static inline bool enable_and_queue_work(struct workqueue_struct *wq,
                                         struct work_struct *work)
{
        /* enable_work() returned false: report that and do not queue. */
        if (!enable_work(work))
                return false;

        /*
         * The return value of queue_work() is not propagated; %true is
         * returned whenever enable_work() succeeded.
         */
        queue_work(wq, work);
        return true;
}

/*
 * Detect attempt to flush system-wide workqueues at compile time when possible.
 * Warn attempt to flush system-wide workqueues at runtime.
 *
 * See https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp
 * for reasons and steps for converting system-wide workqueues into local workqueues.
 */
extern void __warn_flushing_systemwide_wq(void)
        __compiletime_warning("Please avoid flushing system-wide workqueues.");

/*
 * Please stop using this macro; it will be removed in the near future.
 *
 * Every expansion references __warn_flushing_systemwide_wq() unconditionally
 * and then flushes system_wq through __flush_workqueue().
 */
#define flush_scheduled_work()                                                \
({                                                                        \
        __warn_flushing_systemwide_wq();                                \
        __flush_workqueue(system_wq);                                        \
})

/*
 * flush_workqueue - flush @wq, flagging flushes of system-wide workqueues
 *
 * @wq is evaluated once into a local.  For each of system_wq,
 * system_highpri_wq, system_long_wq, system_unbound_wq, system_freezable_wq,
 * system_power_efficient_wq and system_freezable_power_efficient_wq: if the
 * compiler can resolve the comparison with @wq as a constant and it is true,
 * __warn_flushing_systemwide_wq() is called.  __flush_workqueue() is then
 * called unconditionally.  system_bh_wq and system_bh_highpri_wq are not part
 * of this check.
 */
#define flush_workqueue(wq)                                                \
({                                                                        \
        struct workqueue_struct *_wq = (wq);                                \
                                                                        \
        if ((__builtin_constant_p(_wq == system_wq) &&                        \
             _wq == system_wq) ||                                        \
            (__builtin_constant_p(_wq == system_highpri_wq) &&                \
             _wq == system_highpri_wq) ||                                \
            (__builtin_constant_p(_wq == system_long_wq) &&                \
             _wq == system_long_wq) ||                                        \
            (__builtin_constant_p(_wq == system_unbound_wq) &&                \
             _wq == system_unbound_wq) ||                                \
            (__builtin_constant_p(_wq == system_freezable_wq) &&        \
             _wq == system_freezable_wq) ||                                \
            (__builtin_constant_p(_wq == system_power_efficient_wq) &&        \
             _wq == system_power_efficient_wq) ||                        \
            (__builtin_constant_p(_wq == system_freezable_power_efficient_wq) && \
             _wq == system_freezable_power_efficient_wq))                \
                __warn_flushing_systemwide_wq();                        \
        __flush_workqueue(_wq);                                                \
})

/**
 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
 * @cpu: cpu to use
 * @dwork: job to be done
 * @delay: number of jiffies to wait
 *
 * After waiting for a given time this puts a job in the kernel-global
 * workqueue on the specified CPU.
 */
static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
                                            unsigned long delay)
{
        /* The target is always system_wq; CPU and delay are caller-chosen. */
        struct workqueue_struct *wq = system_wq;

        return queue_delayed_work_on(cpu, wq, dwork, delay);
}

/**
 * schedule_delayed_work - put work task in global workqueue after delay
 * @dwork: job to be done
 * @delay: number of jiffies to wait or 0 for immediate execution
 *
 * After waiting for a given time this puts a job in the kernel-global
 * workqueue.
 */
static inline bool schedule_delayed_work(struct delayed_work *dwork,
                                         unsigned long delay)
{
        /* Queue on system_wq through the generic queue_delayed_work() helper. */
        struct workqueue_struct *wq = system_wq;

        return queue_delayed_work(wq, dwork, delay);
}

#ifndef CONFIG_SMP
static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
{
        /* !CONFIG_SMP variant: @cpu is ignored and @fn is called directly. */
        long ret = fn(arg);

        return ret;
}
static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
{
        /* !CONFIG_SMP variant: @cpu is ignored and @fn is called directly. */
        long ret = fn(arg);

        return ret;
}
#else
long work_on_cpu_key(int cpu, long (*fn)(void *),
                     void *arg, struct lock_class_key *key);
/*
 * A new key is defined for each caller to make sure the work
 * associated with the function doesn't share its locking class.
 *
 * The statement expression declares one static lock_class_key per expansion
 * site and evaluates to the long returned by work_on_cpu_key().
 */
#define work_on_cpu(_cpu, _fn, _arg)                        \
({                                                        \
        static struct lock_class_key __key;                \
                                                        \
        work_on_cpu_key(_cpu, _fn, _arg, &__key);        \
})

long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
                          void *arg, struct lock_class_key *key);

/*
 * A new key is defined for each caller to make sure the work
 * associated with the function doesn't share its locking class.
 *
 * The statement expression declares one static lock_class_key per expansion
 * site and evaluates to the long returned by work_on_cpu_safe_key().
 */
#define work_on_cpu_safe(_cpu, _fn, _arg)                \
({                                                        \
        static struct lock_class_key __key;                \
                                                        \
        work_on_cpu_safe_key(_cpu, _fn, _arg, &__key);        \
})
#endif /* CONFIG_SMP */

#ifdef CONFIG_FREEZER
extern void freeze_workqueues_begin(void);
extern bool freeze_workqueues_busy(void);
extern void thaw_workqueues(void);
#endif /* CONFIG_FREEZER */

#ifdef CONFIG_SYSFS
int workqueue_sysfs_register(struct workqueue_struct *wq);
#else        /* CONFIG_SYSFS */
/* !CONFIG_SYSFS stub: nothing to register, always reports success (0). */
static inline int workqueue_sysfs_register(struct workqueue_struct *wq)
{
        return 0;
}
#endif        /* CONFIG_SYSFS */

#ifdef CONFIG_WQ_WATCHDOG
void wq_watchdog_touch(int cpu);
#else        /* CONFIG_WQ_WATCHDOG */
/* !CONFIG_WQ_WATCHDOG stub: intentionally does nothing. */
static inline void wq_watchdog_touch(int cpu)
{
}
#endif        /* CONFIG_WQ_WATCHDOG */

#ifdef CONFIG_SMP
int workqueue_prepare_cpu(unsigned int cpu);
int workqueue_online_cpu(unsigned int cpu);
int workqueue_offline_cpu(unsigned int cpu);
#endif

void __init workqueue_init_early(void);
void __init workqueue_init(void);
void __init workqueue_init_topology(void);

#endif

























































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Copyright (C) 2006 Nick Piggin
 * Copyright (C) 2012 Konstantin Khlebnikov
 */
#ifndef _LINUX_RADIX_TREE_H
#define _LINUX_RADIX_TREE_H

#include <linux/bitops.h>
#include <linux/gfp_types.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/math.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/xarray.h>
#include <linux/local_lock.h>

/* Keep unconverted code working */
#define radix_tree_root                xarray
#define radix_tree_node                xa_node

/*
 * Preload state.  One instance per CPU is declared right after the structure
 * as radix_tree_preloads.
 */
struct radix_tree_preload {
        /* Local lock; radix_tree_preload_end() releases it with local_unlock(). */
        local_lock_t lock;
        /* NOTE(review): presumably the number of nodes on @nodes - confirm against lib/radix-tree.c */
        unsigned nr;
        /* nodes->parent points to next preallocated node */
        struct radix_tree_node *nodes;
};
DECLARE_PER_CPU(struct radix_tree_preload, radix_tree_preloads);

/*
 * The bottom two bits of the slot determine how the remaining bits in the
 * slot are interpreted:
 *
 * 00 - data pointer
 * 10 - internal entry
 * x1 - value entry
 *
 * The internal entry may be a pointer to the next level in the tree, a
 * sibling entry, or an indicator that the entry in this slot has been moved
 * to another location in the tree and the lookup should be restarted.  While
 * NULL fits the 'data pointer' pattern, it means that there is no entry in
 * the tree for this index (no matter what level of the tree it is found at).
 * This means that storing a NULL entry in the tree is the same as deleting
 * the entry from the tree.
 */
#define RADIX_TREE_ENTRY_MASK                3UL
#define RADIX_TREE_INTERNAL_NODE        2UL

/*
 * Report whether @ptr is tagged as an internal entry, i.e. whether its two
 * low-order bits are exactly 10.
 */
static inline bool radix_tree_is_internal_node(void *ptr)
{
        unsigned long tag_bits = (unsigned long)ptr & RADIX_TREE_ENTRY_MASK;

        return tag_bits == RADIX_TREE_INTERNAL_NODE;
}

/*** radix-tree API starts here ***/

#define RADIX_TREE_MAP_SHIFT        XA_CHUNK_SHIFT
#define RADIX_TREE_MAP_SIZE        (1UL << RADIX_TREE_MAP_SHIFT)
#define RADIX_TREE_MAP_MASK        (RADIX_TREE_MAP_SIZE-1)

#define RADIX_TREE_MAX_TAGS        XA_MAX_MARKS
#define RADIX_TREE_TAG_LONGS        XA_MARK_LONGS

#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
                                          RADIX_TREE_MAP_SHIFT))

/* The IDR tag is stored in the low bits of xa_flags */
#define ROOT_IS_IDR        ((__force gfp_t)4)
/* The top bits of xa_flags are used to store the root tags */
#define ROOT_TAG_SHIFT        (__GFP_BITS_SHIFT)

#define RADIX_TREE_INIT(name, mask)        XARRAY_INIT(name, mask)

#define RADIX_TREE(name, mask) \
        struct radix_tree_root name = RADIX_TREE_INIT(name, mask)

#define INIT_RADIX_TREE(root, mask) xa_init_flags(root, mask)

/* A tree is empty exactly when its root has no head entry. */
static inline bool radix_tree_empty(const struct radix_tree_root *root)
{
        return !root->xa_head;
}

/**
 * struct radix_tree_iter - radix tree iterator state
 *
 * @index:        index of current slot
 * @next_index:        one beyond the last index for this chunk
 * @tags:        bit-mask for tag-iterating; left uninitialized by
 *                radix_tree_iter_init() and filled in by
 *                radix_tree_next_chunk() on a successful tagged chunk lookup
 * @node:        node that contains current slot
 *
 * This radix tree iterator works in terms of "chunks" of slots.  A chunk is a
 * subinterval of slots contained within one radix tree leaf node.  It is
 * described by a pointer to its first slot and a struct radix_tree_iter
 * which holds the chunk's position in the tree and its size.  For tagged
 * iteration radix_tree_iter also holds the slots' bit-mask for one chosen
 * radix tree tag.
 *
 * The size of the current chunk is @next_index - @index, as computed by
 * radix_tree_chunk_size().
 */
struct radix_tree_iter {
        unsigned long        index;
        unsigned long        next_index;
        unsigned long        tags;
        struct radix_tree_node *node;
};

/**
 * Radix-tree synchronization
 *
 * The radix-tree API requires that users provide all synchronisation (with
 * specific exceptions, noted below).
 *
 * Synchronization of access to the data items being stored in the tree, and
 * management of their lifetimes must be completely managed by API users.
 *
 * For API usage, in general,
 * - any function _modifying_ the tree or tags (inserting or deleting
 *   items, setting or clearing tags) must exclude other modifications, and
 *   exclude any functions reading the tree.
 * - any function _reading_ the tree or tags (looking up items or tags,
 *   gang lookups) must exclude modifications to the tree, but may occur
 *   concurrently with other readers.
 *
 * The notable exceptions to this rule are the following functions:
 * __radix_tree_lookup
 * radix_tree_lookup
 * radix_tree_lookup_slot
 * radix_tree_tag_get
 * radix_tree_gang_lookup
 * radix_tree_gang_lookup_tag
 * radix_tree_gang_lookup_tag_slot
 * radix_tree_tagged
 *
 * The first 7 functions are able to be called locklessly, using RCU. The
 * caller must ensure calls to these functions are made within rcu_read_lock()
 * regions. Other readers (lock-free or otherwise) and modifications may be
 * running concurrently.
 *
 * It is still required that the caller manage the synchronization and lifetimes
 * of the items. So if RCU lock-free lookups are used, typically this would mean
 * that the items have their own locks, or are amenable to lock-free access; and
 * that the items are freed by RCU (or only freed after having been deleted from
 * the radix tree *and* a synchronize_rcu() grace period).
 *
 * (Note, rcu_assign_pointer and rcu_dereference are not needed to control
 * access to data items when inserting into or looking up from the radix tree)
 *
 * Note that the value returned by radix_tree_tag_get() may not be relied upon
 * if only the RCU read lock is held.  Functions to set/clear tags and to
 * delete nodes running concurrently with it may affect its result such that
 * two consecutive reads in the same locked section may return different
 * values.  If reliability is required, modification functions must also be
 * excluded from concurrency.
 *
 * radix_tree_tagged is able to be called without locking or RCU.
 */

/**
 * radix_tree_deref_slot - dereference a slot
 * @slot: slot pointer, returned by radix_tree_lookup_slot
 *
 * For use with radix_tree_lookup_slot().  Caller must hold tree at least read
 * locked across slot lookup and dereference. Not required if write lock is
 * held (ie. items cannot be concurrently inserted).
 *
 * radix_tree_deref_retry must be used to confirm validity of the pointer if
 * only the read lock is held.
 *
 * Return: entry stored in that slot.
 */
static inline void *radix_tree_deref_slot(void __rcu **slot)
{
        /* Load the slot contents through the RCU dereference primitive. */
        void *entry = rcu_dereference(*slot);

        return entry;
}

/**
 * radix_tree_deref_slot_protected - dereference a slot with tree lock held
 * @slot: slot pointer, returned by radix_tree_lookup_slot
 *
 * Similar to radix_tree_deref_slot.  The caller does not hold the RCU read
 * lock but it must hold the tree lock to prevent parallel updates.
 *
 * Return: entry stored in that slot.
 */
static inline void *radix_tree_deref_slot_protected(void __rcu **slot,
                                                        spinlock_t *treelock)
{
        /* @treelock is only consumed by the lockdep_is_held() condition. */
        void *entry = rcu_dereference_protected(*slot,
                                                lockdep_is_held(treelock));

        return entry;
}

/**
 * radix_tree_deref_retry        - check radix_tree_deref_slot
 * @arg:        pointer returned by radix_tree_deref_slot
 * Returns:        0 if retry is not required, otherwise retry is required
 *
 * radix_tree_deref_retry must be used with radix_tree_deref_slot.
 */
static inline int radix_tree_deref_retry(void *arg)
{
        /* A retry is needed when the slot held an internal entry. */
        bool is_internal = radix_tree_is_internal_node(arg);

        return unlikely(is_internal);
}

/**
 * radix_tree_exception        - radix_tree_deref_slot returned either exception?
 * @arg:        value returned by radix_tree_deref_slot
 * Returns:        0 if well-aligned pointer, non-0 if either kind of exception.
 */
static inline int radix_tree_exception(void *arg)
{
        /* Any set bit among the two low-order tag bits marks an exception. */
        unsigned long tag_bits = (unsigned long)arg & RADIX_TREE_ENTRY_MASK;

        return unlikely(tag_bits);
}

int radix_tree_insert(struct radix_tree_root *, unsigned long index,
                        void *);
void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index,
                          struct radix_tree_node **nodep, void __rcu ***slotp);
void *radix_tree_lookup(const struct radix_tree_root *, unsigned long);
void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *,
                                        unsigned long index);
void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *,
                          void __rcu **slot, void *entry);
void radix_tree_iter_replace(struct radix_tree_root *,
                const struct radix_tree_iter *, void __rcu **slot, void *entry);
void radix_tree_replace_slot(struct radix_tree_root *,
                             void __rcu **slot, void *entry);
void radix_tree_iter_delete(struct radix_tree_root *,
                        struct radix_tree_iter *iter, void __rcu **slot);
void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
void *radix_tree_delete(struct radix_tree_root *, unsigned long);
unsigned int radix_tree_gang_lookup(const struct radix_tree_root *,
                        void **results, unsigned long first_index,
                        unsigned int max_items);
int radix_tree_preload(gfp_t gfp_mask);
int radix_tree_maybe_preload(gfp_t gfp_mask);
void radix_tree_init(void);
void *radix_tree_tag_set(struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
void *radix_tree_tag_clear(struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
int radix_tree_tag_get(const struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
void radix_tree_iter_tag_clear(struct radix_tree_root *,
                const struct radix_tree_iter *iter, unsigned int tag);
unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *,
                void **results, unsigned long first_index,
                unsigned int max_items, unsigned int tag);
unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *,
                void __rcu ***results, unsigned long first_index,
                unsigned int max_items, unsigned int tag);
int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag);

/*
 * Release the local lock embedded in the per-CPU radix_tree_preloads.
 * NOTE(review): presumably pairs with a successful radix_tree_preload() or
 * radix_tree_maybe_preload(), whose bodies are not visible here - confirm.
 */
static inline void radix_tree_preload_end(void)
{
        local_unlock(&radix_tree_preloads.lock);
}

void __rcu **idr_get_free(struct radix_tree_root *root,
                              struct radix_tree_iter *iter, gfp_t gfp,
                              unsigned long max);

/*
 * Bits of the @flags argument taken by radix_tree_next_chunk() and
 * radix_tree_next_slot(): a tag index in the low nybble plus mode bits.
 */
enum {
        RADIX_TREE_ITER_TAG_MASK = 0x0f,        /* tag index in lower nybble */
        RADIX_TREE_ITER_TAGGED   = 0x10,        /* lookup tagged slots */
        RADIX_TREE_ITER_CONTIG   = 0x20,        /* stop at first hole */
};

/**
 * radix_tree_iter_init - initialize radix tree iterator
 *
 * @iter:        pointer to iterator state
 * @start:        iteration starting index
 * Returns:        NULL
 */
static __always_inline void __rcu **
radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
{
        /* The iteration is set up to start at @start. */
        iter->next_index = start;

        /*
         * A zero index bypasses the next_index overflow protection; the
         * comment in radix_tree_next_chunk() has the details.
         */
        iter->index = 0;

        /*
         * iter->tags is deliberately not written here: radix_tree_next_chunk()
         * fills it in when a tagged chunk lookup succeeds, and nobody cares
         * about it after an unsuccessful or non-tagged lookup.
         */
        return NULL;
}

/**
 * radix_tree_next_chunk - find next chunk of slots for iteration
 *
 * @root:        radix tree root
 * @iter:        iterator state
 * @flags:        RADIX_TREE_ITER_* flags and tag index
 * Returns:        pointer to chunk first slot, or NULL if there are no more left
 *
 * This function looks up the next chunk in the radix tree starting from
 * @iter->next_index.  It returns a pointer to the chunk's first slot.
 * Also it fills @iter with data about chunk: position in the tree (index),
 * its end (next_index), and constructs a bit mask for tagged iterating (tags).
 */
void __rcu **radix_tree_next_chunk(const struct radix_tree_root *,
                             struct radix_tree_iter *iter, unsigned flags);

/**
 * radix_tree_iter_lookup - look up an index in the radix tree
 * @root: radix tree root
 * @iter: iterator state
 * @index: key to look up
 *
 * If @index is present in the radix tree, this function returns the slot
 * containing it and updates @iter to describe the entry.  If @index is not
 * present, it returns NULL.
 */
static inline void __rcu **
radix_tree_iter_lookup(const struct radix_tree_root *root,
                        struct radix_tree_iter *iter, unsigned long index)
{
        void __rcu **slot;

        /* Point the iterator at @index, then fetch the chunk starting there. */
        radix_tree_iter_init(iter, index);
        slot = radix_tree_next_chunk(root, iter, RADIX_TREE_ITER_CONTIG);

        return slot;
}

/**
 * radix_tree_iter_retry - retry this chunk of the iteration
 * @iter:        iterator state
 *
 * If we iterate over a tree protected only by the RCU lock, a race
 * against deletion or creation may result in seeing a slot for which
 * radix_tree_deref_retry() returns true.  If so, call this function
 * and continue the iteration.
 */
static inline __must_check
void __rcu **radix_tree_iter_retry(struct radix_tree_iter *iter)
{
        /* Clear the tag mask and make the next lookup start at the current index. */
        iter->tags = 0;
        iter->next_index = iter->index;

        return NULL;
}

static inline unsigned long
__radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
{
        /* Index that lies @slots positions after the iterator's current index. */
        unsigned long advanced = iter->index + slots;

        return advanced;
}

/**
 * radix_tree_iter_resume - resume iterating when the chunk may be invalid
 * @slot: pointer to current slot
 * @iter: iterator state
 * Returns: New slot pointer
 *
 * If the iterator needs to release then reacquire a lock, the chunk may
 * have been invalidated by an insertion or deletion.  Call this function
 * before releasing the lock to continue the iteration from the next index.
 */
void __rcu **__must_check radix_tree_iter_resume(void __rcu **slot,
                                        struct radix_tree_iter *iter);

/**
 * radix_tree_chunk_size - get current chunk size
 *
 * @iter:        pointer to radix tree iterator
 * Returns:        current chunk size, i.e. the distance from @iter->index
 *                to @iter->next_index
 */
static __always_inline long
radix_tree_chunk_size(struct radix_tree_iter *iter)
{
        unsigned long span = iter->next_index - iter->index;

        return span;
}

/**
 * radix_tree_next_slot - find next slot in chunk
 *
 * @slot:        pointer to current slot
 * @iter:        pointer to iterator state
 * @flags:        RADIX_TREE_ITER_*, should be constant
 * Returns:        pointer to next slot, or NULL if there are no more left
 *
 * This function updates @iter->index in the case of a successful lookup.
 * For tagged lookup it also eats @iter->tags.
 *
 * There are several cases where 'slot' can be passed in as NULL to this
 * function.  These cases result from the use of radix_tree_iter_resume() or
 * radix_tree_iter_retry().  In these cases we don't end up dereferencing
 * 'slot' because either:
 * a) we are doing tagged iteration and iter->tags has been set to 0, or
 * b) we are doing non-tagged iteration, and iter->index and iter->next_index
 *    have been set up so that radix_tree_chunk_size() returns 1 or 0.
 */
static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot,
                                struct radix_tree_iter *iter, unsigned flags)
{
        if (flags & RADIX_TREE_ITER_TAGGED) {
                /* Drop the bit belonging to the slot we are leaving. */
                iter->tags >>= 1;
                if (unlikely(!iter->tags))
                        return NULL;
                /* Fast path: the immediately following slot is tagged. */
                if (likely(iter->tags & 1ul)) {
                        iter->index = __radix_tree_iter_add(iter, 1);
                        slot++;
                        goto found;
                }
                /*
                 * Otherwise skip ahead to the first set tag bit, unless the
                 * caller asked for contiguous iteration, in which case we
                 * fall through and return NULL.
                 */
                if (!(flags & RADIX_TREE_ITER_CONTIG)) {
                        unsigned offset = __ffs(iter->tags);

                        /*
                         * Shift by the bit position, then bump offset so it
                         * also counts the step off the current slot.
                         */
                        iter->tags >>= offset++;
                        iter->index = __radix_tree_iter_add(iter, offset);
                        slot += offset;
                        goto found;
                }
        } else {
                long count = radix_tree_chunk_size(iter);

                /* Visit the remaining slots of the chunk, if any. */
                while (--count > 0) {
                        slot++;
                        iter->index = __radix_tree_iter_add(iter, 1);

                        if (likely(*slot))
                                goto found;
                        if (flags & RADIX_TREE_ITER_CONTIG) {
                                /* forbid switching to the next chunk */
                                iter->next_index = 0;
                                break;
                        }
                }
        }
        return NULL;

 found:
        return slot;
}

/**
 * radix_tree_for_each_slot - iterate over non-empty slots
 *
 * @slot:        the void** variable for pointer to slot
 * @root:        the struct radix_tree_root pointer
 * @iter:        the struct radix_tree_iter pointer
 * @start:        iteration starting index
 *
 * @slot points to radix tree slot, @iter->index contains its index.
 *
 * Expands to a for-loop header: @slot is first set from
 * radix_tree_iter_init(); whenever it is NULL a new chunk is requested with
 * radix_tree_next_chunk() (flags 0), and the loop ends when that also
 * returns NULL.  Within a chunk, radix_tree_next_slot() advances @slot.
 */
#define radix_tree_for_each_slot(slot, root, iter, start)                \
        for (slot = radix_tree_iter_init(iter, start) ;                        \
             slot || (slot = radix_tree_next_chunk(root, iter, 0)) ;        \
             slot = radix_tree_next_slot(slot, iter, 0))

/**
 * radix_tree_for_each_tagged - iterate over tagged slots
 *
 * @slot:        the void** variable for pointer to slot
 * @root:        the struct radix_tree_root pointer
 * @iter:        the struct radix_tree_iter pointer
 * @start:        iteration starting index
 * @tag:        tag index
 *
 * @slot points to radix tree slot, @iter->index contains its index.
 *
 * Expands to a for-loop header that behaves like radix_tree_for_each_slot()
 * but passes RADIX_TREE_ITER_TAGGED | @tag as the flags to both
 * radix_tree_next_chunk() and radix_tree_next_slot().
 *
 * @tag is parenthesized in the expansion so that an argument containing an
 * operator that binds less tightly than '|' (for example a conditional
 * expression) is OR-ed into the flags as a whole.
 */
#define radix_tree_for_each_tagged(slot, root, iter, start, tag)        \
        for (slot = radix_tree_iter_init(iter, start) ;                        \
             slot || (slot = radix_tree_next_chunk(root, iter,                \
                              RADIX_TREE_ITER_TAGGED | (tag))) ;        \
             slot = radix_tree_next_slot(slot, iter,                        \
                                RADIX_TREE_ITER_TAGGED | (tag)))

#endif /* _LINUX_RADIX_TREE_H */





































































































































































































































































































































































































































































    1 

    1 
    1 





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate_wait.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/ksm.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "mm_slot.h"

/*
 * Result codes used by the collapse/scan code in this file.
 *
 * The enumerators carry no explicit values, so they are numbered
 * sequentially from SCAN_FAIL == 0; inserting or reordering entries
 * changes the numeric value of everything that follows.
 *
 * NOTE(review): <trace/events/huge_memory.h> is included right after this
 * enum with CREATE_TRACE_POINTS defined and presumably needs a matching
 * list of names - confirm before adding or removing entries.
 */
enum scan_result {
        SCAN_FAIL,
        SCAN_SUCCEED,
        SCAN_PMD_NULL,
        SCAN_PMD_NONE,
        SCAN_PMD_MAPPED,
        SCAN_EXCEED_NONE_PTE,
        SCAN_EXCEED_SWAP_PTE,
        SCAN_EXCEED_SHARED_PTE,
        SCAN_PTE_NON_PRESENT,
        SCAN_PTE_UFFD_WP,
        SCAN_PTE_MAPPED_HUGEPAGE,
        SCAN_PAGE_RO,
        SCAN_LACK_REFERENCED_PAGE,
        SCAN_PAGE_NULL,
        SCAN_SCAN_ABORT,
        SCAN_PAGE_COUNT,
        SCAN_PAGE_LRU,
        SCAN_PAGE_LOCK,
        SCAN_PAGE_ANON,
        SCAN_PAGE_COMPOUND,
        SCAN_ANY_PROCESS,
        SCAN_VMA_NULL,
        SCAN_VMA_CHECK,
        SCAN_ADDRESS_RANGE,
        SCAN_DEL_PAGE_LRU,
        SCAN_ALLOC_HUGE_PAGE_FAIL,
        SCAN_CGROUP_CHARGE_FAIL,
        SCAN_TRUNCATED,
        SCAN_PAGE_HAS_PRIVATE,
        SCAN_STORE_FAILED,
        SCAN_COPY_MC,
        SCAN_PAGE_FILLED,
};

#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>

static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);

/*
 * By default scan 8 * HPAGE_PMD_NR ptes (or vmas) per pass, sleeping
 * 10 seconds (khugepaged_scan_sleep_millisecs) between passes.  The
 * HPAGE_PMD_NR-based defaults are filled in by khugepaged_init().
 */
static unsigned int khugepaged_pages_to_scan __read_mostly;
/* Counters exported read-only through sysfs (pages_collapsed, full_scans). */
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
/* Reset to 0 by the *_sleep_millisecs sysfs store handlers. */
static unsigned long khugepaged_sleep_expire;
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
 * By default, collapse hugepages if there is at least one pte mapped, like
 * it would have happened if the vma was large enough during page
 * fault.
 *
 * Note that these are only respected if collapse was initiated by khugepaged.
 */
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;

/* Hash table of registered mm slots, sized by MM_SLOTS_HASH_BITS. */
#define MM_SLOTS_HASH_BITS 10
static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

/* Slab cache for struct khugepaged_mm_slot, created in khugepaged_init(). */
static struct kmem_cache *mm_slot_cache __ro_after_init;

/*
 * struct collapse_control - parameters and scratch state for a collapse
 * operation.
 */
struct collapse_control {
        /*
         * NOTE(review): presumably true when the collapse is driven by the
         * khugepaged thread - confirm at the initialisers of this struct.
         */
        bool is_khugepaged;

        /* Num pages scanned per node */
        u32 node_load[MAX_NUMNODES];

        /* nodemask for allocation fallback */
        nodemask_t alloc_nmask;
};

/**
 * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
 * @slot: hash lookup from mm to mm_slot
 *
 * Instances are allocated from mm_slot_cache; @slot is what gets inserted
 * into mm_slots_hash and linked on khugepaged_scan.mm_head.
 */
struct khugepaged_mm_slot {
        struct mm_slot slot;
};

/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 *
 * There is only the one khugepaged_scan instance of this cursor structure.
 */
struct khugepaged_scan {
        struct list_head mm_head;
        struct khugepaged_mm_slot *mm_slot;
        unsigned long address;
};

/*
 * The single scan cursor.  The mm list starts out empty; the fields not
 * named in the initializer (@mm_slot, @address) start out zeroed.
 */
static struct khugepaged_scan khugepaged_scan = {
        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};

#ifdef CONFIG_SYSFS
/* sysfs read: print khugepaged_scan_sleep_millisecs as "%u\n". */
static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
                                         struct kobj_attribute *attr,
                                         char *buf)
{
        const unsigned int msecs = khugepaged_scan_sleep_millisecs;

        return sysfs_emit(buf, "%u\n", msecs);
}

/*
 * sysfs write: parse @buf as an unsigned decimal and install it as the new
 * khugepaged_scan_sleep_millisecs.  Returns @count on success, -EINVAL if
 * the text cannot be parsed.
 */
static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
                                          struct kobj_attribute *attr,
                                          const char *buf, size_t count)
{
        unsigned int new_msecs;

        if (kstrtouint(buf, 10, &new_msecs))
                return -EINVAL;

        khugepaged_scan_sleep_millisecs = new_msecs;

        /* Reset the recorded sleep expiry and wake anyone on khugepaged_wait. */
        khugepaged_sleep_expire = 0;
        wake_up_interruptible(&khugepaged_wait);

        return count;
}
static struct kobj_attribute scan_sleep_millisecs_attr =
        __ATTR_RW(scan_sleep_millisecs);

/* sysfs read: print khugepaged_alloc_sleep_millisecs as "%u\n". */
static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
                                          struct kobj_attribute *attr,
                                          char *buf)
{
        const unsigned int msecs = khugepaged_alloc_sleep_millisecs;

        return sysfs_emit(buf, "%u\n", msecs);
}

/*
 * sysfs write: parse @buf as an unsigned decimal and install it as the new
 * khugepaged_alloc_sleep_millisecs.  Returns @count on success, -EINVAL if
 * the text cannot be parsed.
 */
static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
                                           struct kobj_attribute *attr,
                                           const char *buf, size_t count)
{
        unsigned int new_msecs;

        if (kstrtouint(buf, 10, &new_msecs))
                return -EINVAL;

        khugepaged_alloc_sleep_millisecs = new_msecs;

        /* Reset the recorded sleep expiry and wake anyone on khugepaged_wait. */
        khugepaged_sleep_expire = 0;
        wake_up_interruptible(&khugepaged_wait);

        return count;
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
        __ATTR_RW(alloc_sleep_millisecs);

/* sysfs read: print khugepaged_pages_to_scan as "%u\n". */
static ssize_t pages_to_scan_show(struct kobject *kobj,
                                  struct kobj_attribute *attr,
                                  char *buf)
{
        const unsigned int nr_pages = khugepaged_pages_to_scan;

        return sysfs_emit(buf, "%u\n", nr_pages);
}
/*
 * sysfs write: parse @buf as an unsigned decimal and install it as the new
 * khugepaged_pages_to_scan.  Zero and unparsable input are both rejected
 * with -EINVAL; otherwise @count is returned.
 */
static ssize_t pages_to_scan_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count)
{
        unsigned int nr_pages;

        if (kstrtouint(buf, 10, &nr_pages) || nr_pages == 0)
                return -EINVAL;

        khugepaged_pages_to_scan = nr_pages;

        return count;
}
static struct kobj_attribute pages_to_scan_attr =
        __ATTR_RW(pages_to_scan);

/* sysfs read: print the khugepaged_pages_collapsed counter as "%u\n". */
static ssize_t pages_collapsed_show(struct kobject *kobj,
                                    struct kobj_attribute *attr,
                                    char *buf)
{
        const unsigned int collapsed = khugepaged_pages_collapsed;

        return sysfs_emit(buf, "%u\n", collapsed);
}
static struct kobj_attribute pages_collapsed_attr =
        __ATTR_RO(pages_collapsed);

/* sysfs read: print the khugepaged_full_scans counter as "%u\n". */
static ssize_t full_scans_show(struct kobject *kobj,
                               struct kobj_attribute *attr,
                               char *buf)
{
        const unsigned int scans = khugepaged_full_scans;

        return sysfs_emit(buf, "%u\n", scans);
}
static struct kobj_attribute full_scans_attr =
        __ATTR_RO(full_scans);

/*
 * sysfs read for "defrag": delegates to single_hugepage_flag_show() for
 * TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG and returns its result.
 */
static ssize_t defrag_show(struct kobject *kobj,
                           struct kobj_attribute *attr, char *buf)
{
        return single_hugepage_flag_show(kobj, attr, buf,
                                         TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
/*
 * sysfs write for "defrag": delegates to single_hugepage_flag_store() for
 * TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG and returns its result.
 */
static ssize_t defrag_store(struct kobject *kobj,
                            struct kobj_attribute *attr,
                            const char *buf, size_t count)
{
        return single_hugepage_flag_store(kobj, attr, buf, count,
                                 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
/* Read/write attribute backed by defrag_show() and defrag_store(). */
static struct kobj_attribute khugepaged_defrag_attr =
        __ATTR_RW(defrag);

/*
 * max_ptes_none controls if khugepaged should collapse hugepages over
 * any unmapped ptes in turn potentially increasing the memory
 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
 * reduce the available free memory in the system as it
 * runs. Increasing max_ptes_none will instead potentially reduce the
 * free memory in the system during the khugepaged scan.
 */
/* sysfs read: print khugepaged_max_ptes_none as "%u\n". */
static ssize_t max_ptes_none_show(struct kobject *kobj,
                                  struct kobj_attribute *attr,
                                  char *buf)
{
        const unsigned int limit = khugepaged_max_ptes_none;

        return sysfs_emit(buf, "%u\n", limit);
}
/*
 * sysfs write: parse @buf as an unsigned decimal and install it as the new
 * khugepaged_max_ptes_none.  Accepts 0 .. HPAGE_PMD_NR - 1; anything else,
 * or unparsable input, yields -EINVAL.  Returns @count on success.
 */
static ssize_t max_ptes_none_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count)
{
        unsigned long limit;

        if (kstrtoul(buf, 10, &limit))
                return -EINVAL;
        if (limit > HPAGE_PMD_NR - 1)
                return -EINVAL;

        khugepaged_max_ptes_none = limit;

        return count;
}
static struct kobj_attribute khugepaged_max_ptes_none_attr =
        __ATTR_RW(max_ptes_none);

/* sysfs read: print khugepaged_max_ptes_swap as "%u\n". */
static ssize_t max_ptes_swap_show(struct kobject *kobj,
                                  struct kobj_attribute *attr,
                                  char *buf)
{
        const unsigned int limit = khugepaged_max_ptes_swap;

        return sysfs_emit(buf, "%u\n", limit);
}

/*
 * sysfs write: parse @buf as an unsigned decimal and install it as the new
 * khugepaged_max_ptes_swap.  Accepts 0 .. HPAGE_PMD_NR - 1; anything else,
 * or unparsable input, yields -EINVAL.  Returns @count on success.
 */
static ssize_t max_ptes_swap_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count)
{
        unsigned long limit;

        if (kstrtoul(buf, 10, &limit))
                return -EINVAL;
        if (limit > HPAGE_PMD_NR - 1)
                return -EINVAL;

        khugepaged_max_ptes_swap = limit;

        return count;
}

static struct kobj_attribute khugepaged_max_ptes_swap_attr =
        __ATTR_RW(max_ptes_swap);

/* sysfs read: print khugepaged_max_ptes_shared as "%u\n". */
static ssize_t max_ptes_shared_show(struct kobject *kobj,
                                    struct kobj_attribute *attr,
                                    char *buf)
{
        const unsigned int limit = khugepaged_max_ptes_shared;

        return sysfs_emit(buf, "%u\n", limit);
}

/*
 * sysfs write: parse @buf as an unsigned decimal and install it as the new
 * khugepaged_max_ptes_shared.  Accepts 0 .. HPAGE_PMD_NR - 1; anything
 * else, or unparsable input, yields -EINVAL.  Returns @count on success.
 */
static ssize_t max_ptes_shared_store(struct kobject *kobj,
                                     struct kobj_attribute *attr,
                                     const char *buf, size_t count)
{
        unsigned long limit;

        if (kstrtoul(buf, 10, &limit))
                return -EINVAL;
        if (limit > HPAGE_PMD_NR - 1)
                return -EINVAL;

        khugepaged_max_ptes_shared = limit;

        return count;
}

static struct kobj_attribute khugepaged_max_ptes_shared_attr =
        __ATTR_RW(max_ptes_shared);

/*
 * NULL-terminated list of the khugepaged sysfs attributes defined above;
 * referenced by khugepaged_attr_group.
 */
static struct attribute *khugepaged_attr[] = {
        &khugepaged_defrag_attr.attr,
        &khugepaged_max_ptes_none_attr.attr,
        &khugepaged_max_ptes_swap_attr.attr,
        &khugepaged_max_ptes_shared_attr.attr,
        &pages_to_scan_attr.attr,
        &pages_collapsed_attr.attr,
        &full_scans_attr.attr,
        &scan_sleep_millisecs_attr.attr,
        &alloc_sleep_millisecs_attr.attr,
        NULL,
};

/*
 * Attribute group bundling khugepaged_attr under the name "khugepaged".
 * Not static, so it is registered from outside this chunk.
 * NOTE(review): a named group presumably appears as a "khugepaged"
 * subdirectory of whichever kobject it is registered on - confirm at the
 * registration site.
 */
struct attribute_group khugepaged_attr_group = {
        .attrs = khugepaged_attr,
        .name = "khugepaged",
};
#endif /* CONFIG_SYSFS */

/*
 * hugepage_madvise - apply MADV_HUGEPAGE / MADV_NOHUGEPAGE to a vma's flags
 * @vma:      vma the advice applies to
 * @vm_flags: in/out flag word; VM_HUGEPAGE and VM_NOHUGEPAGE are updated here
 * @advice:   MADV_HUGEPAGE or MADV_NOHUGEPAGE; other values leave @vm_flags
 *            untouched
 *
 * Always returns 0.
 */
int hugepage_madvise(struct vm_area_struct *vma,
                     unsigned long *vm_flags, int advice)
{
        if (advice == MADV_HUGEPAGE) {
#ifdef CONFIG_S390
                /*
                 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
                 * can't handle this properly after s390_enable_sie, so we simply
                 * ignore the madvise to prevent qemu from causing a SIGSEGV.
                 */
                if (mm_has_pgste(vma->vm_mm))
                        return 0;
#endif
                *vm_flags = (*vm_flags & ~VM_NOHUGEPAGE) | VM_HUGEPAGE;
                /*
                 * The vma may just have become eligible for khugepaged:
                 * register it right away instead of waiting for a page
                 * fault that may not happen any time soon.
                 */
                khugepaged_enter_vma(vma, *vm_flags);
        } else if (advice == MADV_NOHUGEPAGE) {
                /*
                 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
                 * this vma even if we leave the mm registered in khugepaged if
                 * it got registered before VM_NOHUGEPAGE was set.
                 */
                *vm_flags = (*vm_flags & ~VM_HUGEPAGE) | VM_NOHUGEPAGE;
        }

        return 0;
}

/*
 * khugepaged_init - create the mm_slot slab cache and set default tunables
 *
 * Returns 0 on success or -ENOMEM when the slab cache cannot be created.  On
 * failure the tunables are left untouched.
 */
int __init khugepaged_init(void)
{
        const unsigned int slot_size = sizeof(struct khugepaged_mm_slot);
        const unsigned int slot_align = __alignof__(struct khugepaged_mm_slot);

        mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", slot_size,
                                          slot_align, 0, NULL);
        if (mm_slot_cache == NULL)
                return -ENOMEM;

        /* Every default is expressed as a multiple/fraction of HPAGE_PMD_NR. */
        khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
        khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
        khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
        khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;

        return 0;
}

/* Counterpart of khugepaged_init(): destroy the mm_slot slab cache. */
void __init khugepaged_destroy(void)
{
        kmem_cache_destroy(mm_slot_cache);
}

/* Returns 1 when @mm has no users left (mm_users reads as zero), else 0. */
static inline int hpage_collapse_test_exit(struct mm_struct *mm)
{
        return !atomic_read(&mm->mm_users);
}

/*
 * Returns 1 when @mm has no users left or has MMF_DISABLE_THP set in its
 * flags, else 0.
 */
static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
{
        if (hpage_collapse_test_exit(mm))
                return 1;

        return !!test_bit(MMF_DISABLE_THP, &mm->flags);
}

/*
 * __khugepaged_enter - register @mm with khugepaged
 * @mm: address space to add to the khugepaged scan list
 *
 * Claims MMF_VM_HUGEPAGE in mm->flags (returning immediately if it was
 * already set), allocates a slot, inserts it into the hash and at the tail
 * of the scan list, and takes a reference with mmgrab().  khugepaged is
 * woken up if the scan list was empty before the insertion.
 *
 * If the slot allocation fails, the MMF_VM_HUGEPAGE claim is dropped again.
 * Otherwise the flag would stay set without any slot being queued, and
 * khugepaged_enter_vma() -- which skips mms that already have the flag --
 * would never retry the registration for this mm.
 */
void __khugepaged_enter(struct mm_struct *mm)
{
        struct khugepaged_mm_slot *mm_slot;
        struct mm_slot *slot;
        int wakeup;

        /* __khugepaged_exit() must not run from under us */
        VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
        if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
                return;

        mm_slot = mm_slot_alloc(mm_slot_cache);
        if (!mm_slot) {
                /* Nothing was queued: let a later caller try again. */
                clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
                return;
        }

        slot = &mm_slot->slot;

        spin_lock(&khugepaged_mm_lock);
        mm_slot_insert(mm_slots_hash, mm, slot);
        /*
         * Insert just behind the scanning cursor, to let the area settle
         * down a little.
         */
        wakeup = list_empty(&khugepaged_scan.mm_head);
        list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
        spin_unlock(&khugepaged_mm_lock);

        mmgrab(mm);
        /* Only kick khugepaged when it had nothing to scan before. */
        if (wakeup)
                wake_up_interruptible(&khugepaged_wait);
}

void khugepaged_enter_vma(struct vm_area_struct *vma,
                          unsigned long vm_flags)
{
        if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
            hugepage_flags_enabled()) {
                if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
                                            PMD_ORDER))
                        __khugepaged_enter(vma->vm_mm);
        }
}

void __khugepaged_exit(struct mm_struct *mm)
{
        struct khugepaged_mm_slot *mm_slot;
        struct mm_slot *slot;
        int free = 0;

        spin_lock(&khugepaged_mm_lock);
        slot = mm_slot_lookup(mm_slots_hash, mm);
        mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
        if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
                hash_del(&slot->hash);
                list_del(&slot->mm_node);
                free = 1;
        }
        spin_unlock(&khugepaged_mm_lock);

        if (free) {
                clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
                mm_slot_free(mm_slot_cache, mm_slot);
                mmdrop(mm);
        } else if (mm_slot) {
                /*
                 * This is required to serialize against
                 * hpage_collapse_test_exit() (which is guaranteed to run
                 * under mmap sem read mode). Stop here (after we return all
                 * pagetables will be destroyed) until khugepaged has finished
                 * working on the pagetables under the mmap_lock.
                 */
                mmap_write_lock(mm);
                mmap_write_unlock(mm);
        }
}

/*
 * Undo the isolation of one folio: subtract its pages from the matching
 * NR_ISOLATED_* node counter, unlock it and put it back on the LRU.
 */
static void release_pte_folio(struct folio *folio)
{
        const long nr = folio_nr_pages(folio);
        const int stat_item = NR_ISOLATED_ANON + folio_is_file_lru(folio);

        node_stat_mod_folio(folio, stat_item, -nr);
        folio_unlock(folio);
        folio_putback_lru(folio);
}

/*
 * release_pte_pages - release folios for the PTE range [pte, _pte)
 * @pte:               first PTE of the range
 * @_pte:              one past the last PTE to process
 * @compound_pagelist: large folios collected separately
 *
 * The range is walked backwards.  Empty PTEs, zero-page PTEs and PTEs that
 * map a large folio are skipped; every other folio is released directly.
 * Afterwards each folio on @compound_pagelist is unlinked and released once.
 */
static void release_pte_pages(pte_t *pte, pte_t *_pte,
                struct list_head *compound_pagelist)
{
        struct folio *folio, *next;
        pte_t *cur = _pte;

        while (cur > pte) {
                pte_t entry;
                unsigned long pfn;

                cur--;
                entry = ptep_get(cur);
                if (pte_none(entry))
                        continue;

                pfn = pte_pfn(entry);
                if (is_zero_pfn(pfn))
                        continue;

                folio = pfn_folio(pfn);
                /* Large folios are handled through compound_pagelist below. */
                if (!folio_test_large(folio))
                        release_pte_folio(folio);
        }

        list_for_each_entry_safe(folio, next, compound_pagelist, lru) {
                list_del(&folio->lru);
                release_pte_folio(folio);
        }
}

/*
 * Returns true when the folio's reference count equals exactly what is
 * expected here: its mapcount, plus one reference per page if the folio is
 * in the swap cache.
 */
static bool is_refcount_suitable(struct folio *folio)
{
        int expected = folio_mapcount(folio);

        if (folio_test_swapcache(folio))
                expected += folio_nr_pages(folio);

        return expected == folio_ref_count(folio);
}

/*
 * __collapse_huge_page_isolate - lock and isolate every page under a PTE range
 * @vma:               vma covering the range
 * @address:           virtual address mapped by the first PTE
 * @pte:               first of HPAGE_PMD_NR PTEs to examine
 * @cc:                collapse parameters; the khugepaged_max_ptes_* limits
 *                     and the "referenced" requirement are only enforced
 *                     when cc->is_khugepaged is set
 * @compound_pagelist: list that receives each large folio once it has been
 *                     isolated
 *
 * Every present, non-zero page is trylocked, refcount-checked, isolated from
 * the LRU and accounted in NR_ISOLATED_*.
 *
 * Returns SCAN_SUCCEED when the whole range was processed, at least one PTE
 * is writable and (for khugepaged) at least one page was referenced; the
 * folios are then left locked and isolated.  On any other result everything
 * isolated so far is released via release_pte_pages().
 */
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                                        unsigned long address,
                                        pte_t *pte,
                                        struct collapse_control *cc,
                                        struct list_head *compound_pagelist)
{
        struct page *page = NULL;
        struct folio *folio = NULL;
        pte_t *_pte;
        int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
        bool writable = false;

        for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
             _pte++, address += PAGE_SIZE) {
                pte_t pteval = ptep_get(_pte);
                /* Empty and zero-page PTEs count against max_ptes_none. */
                if (pte_none(pteval) || (pte_present(pteval) &&
                                is_zero_pfn(pte_pfn(pteval)))) {
                        ++none_or_zero;
                        if (!userfaultfd_armed(vma) &&
                            (!cc->is_khugepaged ||
                             none_or_zero <= khugepaged_max_ptes_none)) {
                                continue;
                        } else {
                                result = SCAN_EXCEED_NONE_PTE;
                                count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
                                goto out;
                        }
                }
                if (!pte_present(pteval)) {
                        result = SCAN_PTE_NON_PRESENT;
                        goto out;
                }
                if (pte_uffd_wp(pteval)) {
                        result = SCAN_PTE_UFFD_WP;
                        goto out;
                }
                page = vm_normal_page(vma, address, pteval);
                if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
                        result = SCAN_PAGE_NULL;
                        goto out;
                }

                folio = page_folio(page);
                VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);

                /* See hpage_collapse_scan_pmd(). */
                if (folio_likely_mapped_shared(folio)) {
                        ++shared;
                        if (cc->is_khugepaged &&
                            shared > khugepaged_max_ptes_shared) {
                                result = SCAN_EXCEED_SHARED_PTE;
                                count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
                                goto out;
                        }
                }

                if (folio_test_large(folio)) {
                        struct folio *f;

                        /*
                         * Check if we have dealt with the compound page
                         * already
                         */
                        list_for_each_entry(f, compound_pagelist, lru) {
                                if (folio == f)
                                        goto next;
                        }
                }

                /*
                 * We can do it before isolate_lru_page because the
                 * page can't be freed from under us. NOTE: PG_lock
                 * is needed to serialize against split_huge_page
                 * when invoked from the VM.
                 */
                if (!folio_trylock(folio)) {
                        result = SCAN_PAGE_LOCK;
                        goto out;
                }

                /*
                 * Check if the page has any GUP (or other external) pins.
                 *
                 * The page table that maps the page has been already unlinked
                 * from the page table tree and this process cannot get
                 * an additional pin on the page.
                 *
                 * New pins can come later if the page is shared across fork,
                 * but not from this process. The other process cannot write to
                 * the page, only trigger CoW.
                 */
                if (!is_refcount_suitable(folio)) {
                        folio_unlock(folio);
                        result = SCAN_PAGE_COUNT;
                        goto out;
                }

                /*
                 * Isolate the page to avoid collapsing a hugepage
                 * currently in use by the VM.
                 */
                if (!folio_isolate_lru(folio)) {
                        folio_unlock(folio);
                        result = SCAN_DEL_PAGE_LRU;
                        goto out;
                }
                node_stat_mod_folio(folio,
                                NR_ISOLATED_ANON + folio_is_file_lru(folio),
                                folio_nr_pages(folio));
                VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
                VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

                /*
                 * Remember large folios so that later PTEs mapping the same
                 * folio skip the lock/isolate steps (see the lookup above).
                 */
                if (folio_test_large(folio))
                        list_add_tail(&folio->lru, compound_pagelist);
next:
                /*
                 * If collapse was initiated by khugepaged, check that there is
                 * enough young pte to justify collapsing the page
                 */
                if (cc->is_khugepaged &&
                    (pte_young(pteval) || folio_test_young(folio) ||
                     folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
                                                                     address)))
                        referenced++;

                if (pte_write(pteval))
                        writable = true;
        }

        if (unlikely(!writable)) {
                result = SCAN_PAGE_RO;
        } else if (unlikely(cc->is_khugepaged && !referenced)) {
                result = SCAN_LACK_REFERENCED_PAGE;
        } else {
                result = SCAN_SUCCEED;
                trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
                                                    referenced, writable, result);
                return result;
        }
out:
        /*
         * Failure: release everything in [pte, _pte).  The PTE that caused
         * the failure is not included; every failure path above either never
         * locked its folio or has already unlocked it.
         *
         * NOTE(review): folio is whatever the last iteration that reached
         * page_folio() left behind and may still be NULL here.
         */
        release_pte_pages(pte, _pte, compound_pagelist);
        trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
                                            referenced, writable, result);
        return result;
}

/*
 * __collapse_huge_page_copy_succeeded - tear down the small-page mappings
 * after their contents have been copied into the huge folio
 * @pte:               first of HPAGE_PMD_NR PTEs
 * @vma:               vma covering the range
 * @address:           virtual address mapped by @pte
 * @ptl:               lock taken around each PTE clear
 * @compound_pagelist: large folios isolated earlier
 *
 * Each PTE is cleared (empty ones are left as they are).  For PTEs mapping
 * a real page, the rmap is removed and the page freed; small folios are
 * released from isolation first.  Large folios are released once each, from
 * @compound_pagelist, after the PTE walk.
 */
static void __collapse_huge_page_copy_succeeded(pte_t *pte,
                                                struct vm_area_struct *vma,
                                                unsigned long address,
                                                spinlock_t *ptl,
                                                struct list_head *compound_pagelist)
{
        struct folio *src, *tmp;
        pte_t *_pte;
        pte_t pteval;

        for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
             _pte++, address += PAGE_SIZE) {
                pteval = ptep_get(_pte);
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
                        /*
                         * No anonymous page was mapped by this PTE before.
                         * Presumably the +1 accounts for the sub-page of the
                         * huge folio that now backs this address -- TODO
                         * confirm.
                         */
                        add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
                        if (is_zero_pfn(pte_pfn(pteval))) {
                                /*
                                 * ptl mostly unnecessary.
                                 */
                                spin_lock(ptl);
                                ptep_clear(vma->vm_mm, address, _pte);
                                spin_unlock(ptl);
                                ksm_might_unmap_zero_page(vma->vm_mm, pteval);
                        }
                } else {
                        struct page *src_page = pte_page(pteval);

                        src = page_folio(src_page);
                        /* Large folios are released from the list below. */
                        if (!folio_test_large(src))
                                release_pte_folio(src);
                        /*
                         * ptl mostly unnecessary, but preempt has to
                         * be disabled to update the per-cpu stats
                         * inside folio_remove_rmap_pte().
                         */
                        spin_lock(ptl);
                        ptep_clear(vma->vm_mm, address, _pte);
                        folio_remove_rmap_pte(src, src_page, vma);
                        spin_unlock(ptl);
                        free_page_and_swap_cache(src_page);
                }
        }

        /* Un-isolate, unlock and put back each large folio exactly once. */
        list_for_each_entry_safe(src, tmp, compound_pagelist, lru) {
                list_del(&src->lru);
                node_stat_sub_folio(src, NR_ISOLATED_ANON +
                                folio_is_file_lru(src));
                folio_unlock(src);
                free_swap_cache(src);
                folio_putback_lru(src);
        }
}

/*
 * __collapse_huge_page_copy_failed - roll back after a failed copy
 * @pte:               first of HPAGE_PMD_NR PTEs of the original page table
 * @pmd:               PMD slot to restore
 * @orig_pmd:          PMD value that pointed at the original page table
 * @vma:               vma covering the range
 * @compound_pagelist: large folios isolated earlier
 *
 * Re-installs the original page table under the PMD lock, then releases all
 * folios isolated for the whole PTE range.
 */
static void __collapse_huge_page_copy_failed(pte_t *pte,
                                             pmd_t *pmd,
                                             pmd_t orig_pmd,
                                             struct vm_area_struct *vma,
                                             struct list_head *compound_pagelist)
{
        spinlock_t *pmd_ptl;

        /*
         * Re-establish the PMD to point to the original page table
         * entry. Restoring PMD needs to be done prior to releasing
         * pages. Since pages are still isolated and locked here,
         * acquiring anon_vma_lock_write is unnecessary.
         */
        pmd_ptl = pmd_lock(vma->vm_mm, pmd);
        pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd));
        spin_unlock(pmd_ptl);
        /*
         * Release both raw and compound pages isolated
         * in __collapse_huge_page_isolate.
         */
        release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
}

/*
 * __collapse_huge_page_copy - fill a new huge folio from the small pages
 * mapped by a PTE range.
 *
 * Sub-pages whose PTE is empty or maps the zero page are cleared; all others
 * are copied with the machine-check aware helper.  When every sub-page has
 * been handled the small pages are torn down; if any copy fails, the original
 * page table is restored and the isolated pages are released.
 *
 * Returns SCAN_SUCCEED if copying succeeds, otherwise SCAN_COPY_MC.
 *
 * @pte: first of the PTEs to copy from
 * @folio: the new hugepage to copy contents to
 * @pmd: pointer to the new hugepage's PMD
 * @orig_pmd: the original raw pages' PMD
 * @vma: the original raw pages' virtual memory area
 * @address: starting address to copy
 * @ptl: lock on raw pages' PTEs
 * @compound_pagelist: list that stores compound pages
 */
static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
                pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
                unsigned long address, spinlock_t *ptl,
                struct list_head *compound_pagelist)
{
        unsigned long uaddr = address;
        unsigned int idx;

        /* Any single copy may hit poisoned memory, so check each one. */
        for (idx = 0; idx < HPAGE_PMD_NR; idx++, uaddr += PAGE_SIZE) {
                pte_t entry = ptep_get(pte + idx);
                struct page *dst = folio_page(folio, idx);

                if (pte_none(entry) || is_zero_pfn(pte_pfn(entry))) {
                        clear_user_highpage(dst, uaddr);
                        continue;
                }

                if (copy_mc_user_highpage(dst, pte_page(entry), uaddr,
                                          vma) > 0) {
                        __collapse_huge_page_copy_failed(pte, pmd, orig_pmd,
                                                         vma,
                                                         compound_pagelist);
                        return SCAN_COPY_MC;
                }
        }

        __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
                                            compound_pagelist);
        return SCAN_SUCCEED;
}

/*
 * Sleep for up to khugepaged_alloc_sleep_millisecs on khugepaged_wait.
 * The task sleeps as TASK_INTERRUPTIBLE|TASK_FREEZABLE, so a wake-up on
 * khugepaged_wait (e.g. from __khugepaged_enter()) can end the sleep early.
 */
static void khugepaged_alloc_sleep(void)
{
        DEFINE_WAIT(wait);

        /* Queue first, then change state, so a wake-up cannot be missed. */
        add_wait_queue(&khugepaged_wait, &wait);
        __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
        schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
        remove_wait_queue(&khugepaged_wait, &wait);
}

/*
 * Collapse parameters with is_khugepaged set.  The collapse helpers in this
 * file only enforce the khugepaged_max_ptes_* limits and the sysfs THP
 * policy when cc->is_khugepaged is true.
 */
struct collapse_control khugepaged_collapse_control = {
        .is_khugepaged = true,
};

/*
 * hpage_collapse_scan_abort - should a page on node @nid abort the scan?
 * @nid: node of the page just encountered
 * @cc:  collapse state holding the per-node hit counters
 *
 * Returns true only when node reclaim is enabled, @nid has no hits recorded
 * yet, and some node that does have hits is further from @nid than
 * node_reclaim_distance.
 */
static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
{
        int other;

        /*
         * With node_reclaim_mode disabled no extra effort is made to
         * allocate locally, and a node that already has hits was accepted
         * earlier: in both cases there is nothing to check.
         */
        if (!node_reclaim_enabled() || cc->node_load[nid])
                return false;

        for (other = 0; other < MAX_NUMNODES; other++) {
                if (cc->node_load[other] &&
                    node_distance(nid, other) > node_reclaim_distance)
                        return true;
        }

        return false;
}

/*
 * Non-zero when the TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG bit is set
 * in transparent_hugepage_flags.
 */
#define khugepaged_defrag()                                        \
        (transparent_hugepage_flags &                                \
         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))

/*
 * GFP mask for khugepaged allocations.  With khugepaged defrag enabled the
 * allocation will enter direct reclaim/compaction if necessary; otherwise the
 * lighter mask is used.
 */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
        if (khugepaged_defrag())
                return GFP_TRANSHUGE;

        return GFP_TRANSHUGE_LIGHT;
}

#ifdef CONFIG_NUMA
/*
 * hpage_collapse_find_target_node - choose the node to allocate on
 * @cc: collapse state holding the per-node hit counters
 *
 * Returns the lowest-numbered node with the highest hit count (node 0 when
 * no node has any hits).  Every online node whose count equals that maximum
 * is also added to cc->alloc_nmask.
 */
static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
        int node, best_node = 0, best_load = 0;

        /* Strict comparison keeps the first node that reaches the maximum. */
        for (node = 0; node < MAX_NUMNODES; node++) {
                if (cc->node_load[node] <= best_load)
                        continue;
                best_load = cc->node_load[node];
                best_node = node;
        }

        for_each_online_node(node) {
                if (cc->node_load[node] == best_load)
                        node_set(node, cc->alloc_nmask);
        }

        return best_node;
}
#else
/* Without CONFIG_NUMA the target is always node 0. */
static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
        return 0;
}
#endif

/*
 * hugepage_vma_revalidate - look the vma up again and re-check eligibility
 * @mm:          address space being collapsed
 * @address:     address inside the candidate range
 * @expect_anon: require an anonymous vma that already has an anon_vma
 * @vmap:        out: result of find_vma() (NULL when none was found)
 * @cc:          collapse parameters; the sysfs THP policy is only enforced
 *               when cc->is_khugepaged is set
 *
 * Used after mmap_lock was dropped and re-taken, since the vma may have
 * changed in between.
 *
 * Returns an enum scan_result value, SCAN_SUCCEED when every check passes.
 */
static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
                                   bool expect_anon,
                                   struct vm_area_struct **vmap,
                                   struct collapse_control *cc)
{
        unsigned long tva_flags = 0;
        struct vm_area_struct *vma;

        if (cc->is_khugepaged)
                tva_flags = TVA_ENFORCE_SYSFS;

        if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
                return SCAN_ANY_PROCESS;

        vma = find_vma(mm, address);
        *vmap = vma;
        if (!vma)
                return SCAN_VMA_NULL;

        if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
                return SCAN_ADDRESS_RANGE;
        if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
                return SCAN_VMA_CHECK;

        if (!expect_anon)
                return SCAN_SUCCEED;

        /*
         * An anonymous vma is expected, but the address may have been
         * unmapped and then remapped to a file after the mmap_lock was
         * reacquired.  thp_vma_allowable_order() may return true for
         * qualified file vmas, so check explicitly.
         */
        if (!vma->anon_vma || !vma_is_anonymous(vma))
                return SCAN_PAGE_ANON;

        return SCAN_SUCCEED;
}

/*
 * find_pmd_or_thp_or_none - locate and classify the PMD covering @address
 * @mm:      address space to look in
 * @address: virtual address
 * @pmd:     out: result of mm_find_pmd() (NULL when nothing was found)
 *
 * Returns
 *   SCAN_PMD_NULL   - no PMD, or the entry is not present, devmap or bad
 *   SCAN_PMD_NONE   - the entry is empty
 *   SCAN_PMD_MAPPED - the entry already maps a transparent huge page
 *   SCAN_SUCCEED    - none of the above
 *
 * The entry is read locklessly.
 */
static int find_pmd_or_thp_or_none(struct mm_struct *mm,
                                   unsigned long address,
                                   pmd_t **pmd)
{
        pmd_t *found = mm_find_pmd(mm, address);
        pmd_t val;

        *pmd = found;
        if (found == NULL)
                return SCAN_PMD_NULL;

        val = pmdp_get_lockless(found);
        if (pmd_none(val))
                return SCAN_PMD_NONE;
        if (!pmd_present(val))
                return SCAN_PMD_NULL;
        if (pmd_trans_huge(val))
                return SCAN_PMD_MAPPED;
        if (pmd_devmap(val) || pmd_bad(val))
                return SCAN_PMD_NULL;

        return SCAN_SUCCEED;
}

/*
 * check_pmd_still_valid - confirm @pmd is still the PMD covering @address
 *
 * Repeats the find_pmd_or_thp_or_none() lookup.  Any non-success result is
 * passed through; SCAN_FAIL is returned when the lookup succeeds but yields
 * a different PMD pointer than @pmd.
 */
static int check_pmd_still_valid(struct mm_struct *mm,
                                 unsigned long address,
                                 pmd_t *pmd)
{
        pmd_t *cur_pmd;
        int status;

        status = find_pmd_or_thp_or_none(mm, address, &cur_pmd);
        if (status == SCAN_SUCCEED && cur_pmd != pmd)
                status = SCAN_FAIL;

        return status;
}

/*
 * Bring missing pages in from swap, to complete THP collapse.
 * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
 *
 * @mm:         address space being collapsed
 * @vma:        vma covering the range
 * @haddr:      start address of the HPAGE_PMD_NR-page range
 * @pmd:        PMD covering @haddr
 * @referenced: only forwarded to the tracepoint
 *
 * Called and returns without pte mapped or spinlocks held.
 * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
 */
static int __collapse_huge_page_swapin(struct mm_struct *mm,
                                       struct vm_area_struct *vma,
                                       unsigned long haddr, pmd_t *pmd,
                                       int referenced)
{
        int swapped_in = 0;
        vm_fault_t ret = 0;
        unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
        int result;
        pte_t *pte = NULL;
        spinlock_t *ptl;

        for (address = haddr; address < end; address += PAGE_SIZE) {
                struct vm_fault vmf = {
                        .vma = vma,
                        .address = address,
                        .pgoff = linear_page_index(vma, address),
                        .flags = FAULT_FLAG_ALLOW_RETRY,
                        .pmd = pmd,
                };

                /*
                 * pte is NULL on the first iteration and after every
                 * do_swap_page() call, which unmaps it.  Only advance a
                 * pointer that is actually mapped; never do arithmetic on
                 * the NULL pointer.
                 */
                if (pte) {
                        pte++;
                } else {
                        pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
                        if (!pte) {
                                mmap_read_unlock(mm);
                                result = SCAN_PMD_NULL;
                                goto out;
                        }
                }

                vmf.orig_pte = ptep_get_lockless(pte);
                if (!is_swap_pte(vmf.orig_pte))
                        continue;

                vmf.pte = pte;
                vmf.ptl = ptl;
                ret = do_swap_page(&vmf);
                /* Which unmaps pte (after perhaps re-checking the entry) */
                pte = NULL;

                /*
                 * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
                 * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
                 * we do not retry here and swap entry will remain in pagetable
                 * resulting in later failure.
                 */
                if (ret & VM_FAULT_RETRY) {
                        /* Likely, but not guaranteed, that page lock failed */
                        result = SCAN_PAGE_LOCK;
                        goto out;
                }
                if (ret & VM_FAULT_ERROR) {
                        mmap_read_unlock(mm);
                        result = SCAN_FAIL;
                        goto out;
                }
                swapped_in++;
        }

        /* Still mapped if the last PTE examined was not a swap entry. */
        if (pte)
                pte_unmap(pte);

        /* Drain LRU cache to remove extra pin on the swapped in pages */
        if (swapped_in)
                lru_add_drain();

        result = SCAN_SUCCEED;
out:
        trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
        return result;
}

/*
 * alloc_charge_folio - allocate a PMD-order folio and charge it to @mm
 * @foliop: out: the new folio on success, NULL on any failure
 * @mm:     mm whose memory cgroup is charged
 * @cc:     collapse state; selects the GFP mask and the target node/nodemask
 *
 * Returns SCAN_SUCCEED, SCAN_ALLOC_HUGE_PAGE_FAIL when the allocation fails,
 * or SCAN_CGROUP_CHARGE_FAIL when the charge fails (the folio is put again).
 */
static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
                              struct collapse_control *cc)
{
        struct folio *new_folio;
        gfp_t gfp_mask;
        int nid;

        if (cc->is_khugepaged)
                gfp_mask = alloc_hugepage_khugepaged_gfpmask();
        else
                gfp_mask = GFP_TRANSHUGE;

        nid = hpage_collapse_find_target_node(cc);

        new_folio = __folio_alloc(gfp_mask, HPAGE_PMD_ORDER, nid,
                                  &cc->alloc_nmask);
        if (new_folio == NULL) {
                *foliop = NULL;
                count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                return SCAN_ALLOC_HUGE_PAGE_FAIL;
        }
        count_vm_event(THP_COLLAPSE_ALLOC);

        if (unlikely(mem_cgroup_charge(new_folio, mm, gfp_mask))) {
                folio_put(new_folio);
                *foliop = NULL;
                return SCAN_CGROUP_CHARGE_FAIL;
        }
        count_memcg_folio_events(new_folio, THP_COLLAPSE_ALLOC, 1);

        *foliop = new_folio;
        return SCAN_SUCCEED;
}

/*
 * collapse_huge_page - replace the small pages of one PMD range with a THP
 * @mm:         address space to operate on
 * @address:    HPAGE_PMD-aligned start of the range
 * @referenced: only forwarded to __collapse_huge_page_swapin()
 * @unmapped:   non-zero when the range has swap entries that must be
 *              swapped in before collapsing
 * @cc:         collapse parameters
 *
 * Entered with mmap_lock held for read.  The lock is dropped for the
 * allocation, re-taken for read to revalidate and swap in, then taken for
 * write for the collapse itself.  Every path returns with mmap_lock released.
 *
 * Returns a SCAN_* result.  The new folio is put on every failure path; on
 * success it has been mapped and is not put.
 */
static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
                              int referenced, int unmapped,
                              struct collapse_control *cc)
{
        LIST_HEAD(compound_pagelist);
        pmd_t *pmd, _pmd;
        pte_t *pte;
        pgtable_t pgtable;
        struct folio *folio;
        spinlock_t *pmd_ptl, *pte_ptl;
        int result = SCAN_FAIL;
        struct vm_area_struct *vma;
        struct mmu_notifier_range range;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        /*
         * Before allocating the hugepage, release the mmap_lock read lock.
         * The allocation can take potentially a long time if it involves
         * sync compaction, and we do not need to hold the mmap_lock during
         * that. We will recheck the vma after taking it again in write mode.
         */
        mmap_read_unlock(mm);

        /* On failure alloc_charge_folio() sets folio to NULL. */
        result = alloc_charge_folio(&folio, mm, cc);
        if (result != SCAN_SUCCEED)
                goto out_nolock;

        mmap_read_lock(mm);
        result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
        if (result != SCAN_SUCCEED) {
                mmap_read_unlock(mm);
                goto out_nolock;
        }

        result = find_pmd_or_thp_or_none(mm, address, &pmd);
        if (result != SCAN_SUCCEED) {
                mmap_read_unlock(mm);
                goto out_nolock;
        }

        if (unmapped) {
                /*
                 * __collapse_huge_page_swapin will return with mmap_lock
                 * released when it fails. So we jump out_nolock directly in
                 * that case.  Continuing to collapse causes inconsistency.
                 */
                result = __collapse_huge_page_swapin(mm, vma, address, pmd,
                                                     referenced);
                if (result != SCAN_SUCCEED)
                        goto out_nolock;
        }

        mmap_read_unlock(mm);
        /*
         * Prevent all access to pagetables with the exception of
         * gup_fast later handled by the ptep_clear_flush and the VM
         * handled by the anon_vma lock + PG_lock.
         *
         * UFFDIO_MOVE is prevented to race as well thanks to the
         * mmap_lock.
         */
        mmap_write_lock(mm);
        /* The lock was dropped in between, so vma and pmd may be stale. */
        result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
        if (result != SCAN_SUCCEED)
                goto out_up_write;
        /* check if the pmd is still valid */
        result = check_pmd_still_valid(mm, address, pmd);
        if (result != SCAN_SUCCEED)
                goto out_up_write;

        vma_start_write(vma);
        anon_vma_lock_write(vma->anon_vma);

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
                                address + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
        /*
         * This removes any huge TLB entry from the CPU so we won't allow
         * huge and small TLB entries for the same virtual address to
         * avoid the risk of CPU bugs in that area.
         *
         * Parallel GUP-fast is fine since GUP-fast will back off when
         * it detects PMD is changed.
         */
        _pmd = pmdp_collapse_flush(vma, address, pmd);
        spin_unlock(pmd_ptl);
        mmu_notifier_invalidate_range_end(&range);
        tlb_remove_table_sync_one();

        /*
         * *pmd is now empty (see the BUG_ON checks below); the old page
         * table is reached through the saved value in _pmd instead.
         */
        pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
        if (pte) {
                result = __collapse_huge_page_isolate(vma, address, pte, cc,
                                                      &compound_pagelist);
                spin_unlock(pte_ptl);
        } else {
                result = SCAN_PMD_NULL;
        }

        if (unlikely(result != SCAN_SUCCEED)) {
                if (pte)
                        pte_unmap(pte);
                spin_lock(pmd_ptl);
                BUG_ON(!pmd_none(*pmd));
                /*
                 * We can only use set_pmd_at when establishing
                 * hugepmds and never for establishing regular pmds that
                 * points to regular pagetables. Use pmd_populate for that
                 */
                pmd_populate(mm, pmd, pmd_pgtable(_pmd));
                spin_unlock(pmd_ptl);
                anon_vma_unlock_write(vma->anon_vma);
                goto out_up_write;
        }

        /*
         * All pages are isolated and locked so anon_vma rmap
         * can't run anymore.
         */
        anon_vma_unlock_write(vma->anon_vma);

        /*
         * On failure __collapse_huge_page_copy() has already restored the
         * original page table and released the isolated pages.
         */
        result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
                                           vma, address, pte_ptl,
                                           &compound_pagelist);
        pte_unmap(pte);
        if (unlikely(result != SCAN_SUCCEED))
                goto out_up_write;

        /*
         * The smp_wmb() inside __folio_mark_uptodate() ensures the
         * copy_huge_page writes become visible before the set_pmd_at()
         * write.
         */
        __folio_mark_uptodate(folio);
        /* Extract the old page table before _pmd is reused for the huge PMD. */
        pgtable = pmd_pgtable(_pmd);

        _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot);
        _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);

        spin_lock(pmd_ptl);
        BUG_ON(!pmd_none(*pmd));
        folio_add_new_anon_rmap(folio, vma, address);
        folio_add_lru_vma(folio, vma);
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, address, pmd, _pmd);
        update_mmu_cache_pmd(vma, address, pmd);
        spin_unlock(pmd_ptl);

        /* The folio is mapped now: keep the common exit from putting it. */
        folio = NULL;

        result = SCAN_SUCCEED;
out_up_write:
        mmap_write_unlock(mm);
out_nolock:
        if (folio)
                folio_put(folio);
        trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
        return result;
}

/*
 * Scan the HPAGE_PMD_NR PTEs below the PMD covering @address and decide
 * whether the range qualifies for collapsing into a huge page; if it does,
 * hand over to collapse_huge_page().
 *
 * @mm:          address space being scanned
 * @vma:         VMA containing @address
 * @address:     start of the range; must be huge-PMD aligned (VM_BUG_ON)
 * @mmap_locked: set to false when collapse_huge_page() has been called,
 *               since that function returns with the mmap_lock released
 * @cc:          collapse context; node_load[] and alloc_nmask are reset here
 *               and node_load[] is filled in per scanned folio
 *
 * Returns SCAN_SUCCEED (or whatever collapse_huge_page() returned) when the
 * scan passed, otherwise the SCAN_* code describing why it was rejected.
 */
static int hpage_collapse_scan_pmd(struct mm_struct *mm,
                                   struct vm_area_struct *vma,
                                   unsigned long address, bool *mmap_locked,
                                   struct collapse_control *cc)
{
        pmd_t *pmd;
        pte_t *pte, *_pte;
        int result = SCAN_FAIL, referenced = 0;
        int none_or_zero = 0, shared = 0;
        struct page *page = NULL;
        struct folio *folio = NULL;
        unsigned long _address;
        spinlock_t *ptl;
        int node = NUMA_NO_NODE, unmapped = 0;
        bool writable = false;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        result = find_pmd_or_thp_or_none(mm, address, &pmd);
        if (result != SCAN_SUCCEED)
                goto out;

        memset(cc->node_load, 0, sizeof(cc->node_load));
        nodes_clear(cc->alloc_nmask);
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (!pte) {
                result = SCAN_PMD_NULL;
                goto out;
        }

        for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
             _pte++, _address += PAGE_SIZE) {
                pte_t pteval = ptep_get(_pte);
                if (is_swap_pte(pteval)) {
                        ++unmapped;
                        if (!cc->is_khugepaged ||
                            unmapped <= khugepaged_max_ptes_swap) {
                                /*
                                 * Always be strict with uffd-wp
                                 * enabled swap entries.  Please see
                                 * comment below for pte_uffd_wp().
                                 */
                                if (pte_swp_uffd_wp_any(pteval)) {
                                        result = SCAN_PTE_UFFD_WP;
                                        goto out_unmap;
                                }
                                continue;
                        } else {
                                result = SCAN_EXCEED_SWAP_PTE;
                                count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
                                goto out_unmap;
                        }
                }
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
                        ++none_or_zero;
                        if (!userfaultfd_armed(vma) &&
                            (!cc->is_khugepaged ||
                             none_or_zero <= khugepaged_max_ptes_none)) {
                                continue;
                        } else {
                                result = SCAN_EXCEED_NONE_PTE;
                                count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
                                goto out_unmap;
                        }
                }
                if (pte_uffd_wp(pteval)) {
                        /*
                         * Don't collapse the page if any of the small
                         * PTEs are armed with uffd write protection.
                         * Here we can also mark the new huge pmd as
                         * write protected if any of the small ones is
                         * marked but that could bring unknown
                         * userfault messages that falls outside of
                         * the registered range.  So, just be simple.
                         */
                        result = SCAN_PTE_UFFD_WP;
                        goto out_unmap;
                }
                if (pte_write(pteval))
                        writable = true;

                page = vm_normal_page(vma, _address, pteval);
                if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
                        result = SCAN_PAGE_NULL;
                        goto out_unmap;
                }
                folio = page_folio(page);

                if (!folio_test_anon(folio)) {
                        result = SCAN_PAGE_ANON;
                        goto out_unmap;
                }

                /*
                 * We treat a single page as shared if any part of the THP
                 * is shared. "False negatives" from
                 * folio_likely_mapped_shared() are not expected to matter
                 * much in practice.
                 */
                if (folio_likely_mapped_shared(folio)) {
                        ++shared;
                        if (cc->is_khugepaged &&
                            shared > khugepaged_max_ptes_shared) {
                                result = SCAN_EXCEED_SHARED_PTE;
                                count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
                                goto out_unmap;
                        }
                }

                /*
                 * Record which node the original page is from and save this
                 * information to cc->node_load[].
                 * Khugepaged will allocate the hugepage from the node that
                 * has the max hit record.
                 */
                node = folio_nid(folio);
                if (hpage_collapse_scan_abort(node, cc)) {
                        result = SCAN_SCAN_ABORT;
                        goto out_unmap;
                }
                cc->node_load[node]++;
                if (!folio_test_lru(folio)) {
                        result = SCAN_PAGE_LRU;
                        goto out_unmap;
                }
                if (folio_test_locked(folio)) {
                        result = SCAN_PAGE_LOCK;
                        goto out_unmap;
                }

                /*
                 * Check if the page has any GUP (or other external) pins.
                 *
                 * Here the check may be racy:
                 * it may see folio_mapcount() > folio_ref_count().
                 * But such case is ephemeral we could always retry collapse
                 * later.  However it may report false positive if the page
                 * has excessive GUP pins (i.e. 512).  Anyway the same check
                 * will be done again later the risk seems low.
                 */
                if (!is_refcount_suitable(folio)) {
                        result = SCAN_PAGE_COUNT;
                        goto out_unmap;
                }

                /*
                 * If collapse was initiated by khugepaged, check that there is
                 * enough young pte to justify collapsing the page.
                 *
                 * The secondary-MMU young test must be asked about the page
                 * currently being examined (_address), not about the start
                 * of the PMD range, otherwise only the first page is ever
                 * tested.
                 */
                if (cc->is_khugepaged &&
                    (pte_young(pteval) || folio_test_young(folio) ||
                     folio_test_referenced(folio) ||
                     mmu_notifier_test_young(vma->vm_mm, _address)))
                        referenced++;
        }
        if (!writable) {
                result = SCAN_PAGE_RO;
        } else if (cc->is_khugepaged &&
                   (!referenced ||
                    (unmapped && referenced < HPAGE_PMD_NR / 2))) {
                result = SCAN_LACK_REFERENCED_PAGE;
        } else {
                result = SCAN_SUCCEED;
        }
out_unmap:
        pte_unmap_unlock(pte, ptl);
        if (result == SCAN_SUCCEED) {
                result = collapse_huge_page(mm, address, referenced,
                                            unmapped, cc);
                /* collapse_huge_page will return with the mmap_lock released */
                *mmap_locked = false;
        }
out:
        /*
         * folio is still NULL here if the scan bailed out before looking at
         * any page; otherwise it is the last folio examined.
         */
        trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
                                     none_or_zero, result, unmapped);
        return result;
}

/*
 * Tear down @mm_slot if its mm passes hpage_collapse_test_exit(); otherwise
 * leave it untouched.  Must be called with khugepaged_mm_lock held.
 */
static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
{
        struct mm_slot *base = &mm_slot->slot;
        struct mm_struct *owner = base->mm;

        lockdep_assert_held(&khugepaged_mm_lock);

        /* Nothing to collect while the mm is still in use. */
        if (!hpage_collapse_test_exit(owner))
                return;

        /* Unhash and unlink the slot before freeing it. */
        hash_del(&base->hash);
        list_del(&base->mm_node);

        /*
         * Not strictly needed because the mm exited already.
         *
         * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
         */

        /* khugepaged_mm_lock is actually not necessary for the below */
        mm_slot_free(mm_slot_cache, mm_slot);
        mmdrop(owner);
}

#ifdef CONFIG_SHMEM
/*
 * Map @hpage with a huge PMD at @addr in @vma by way of do_set_pmd().
 *
 * hpage must be locked, and mmap_lock must be held.
 *
 * Returns SCAN_SUCCEED after the PMD was set, in which case an additional
 * reference on @hpage has been taken, or SCAN_FAIL if do_set_pmd() returned
 * non-zero.
 */
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
                        pmd_t *pmdp, struct page *hpage)
{
        struct vm_fault fault = {
                .vma = vma,
                .address = addr,
                .flags = 0,
                .pmd = pmdp,
        };
        int outcome = SCAN_FAIL;

        VM_BUG_ON(!PageTransHuge(hpage));
        mmap_assert_locked(vma->vm_mm);

        if (!do_set_pmd(&fault, hpage)) {
                /* NOTE(review): presumably the reference owned by the new mapping */
                get_page(hpage);
                outcome = SCAN_SUCCEED;
        }

        return outcome;
}

/**
 * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
 * address haddr.
 *
 * @mm: process address space where collapse happens
 * @addr: THP collapse address; it is rounded down to a huge-PMD boundary
 *        (haddr) before use
 * @install_pmd: If a huge PMD should be installed
 *
 * This function checks whether all the PTEs in the PMD are pointing to the
 * right THP. If so, retract the page table so the THP can refault in with
 * as pmd-mapped. Possibly install a huge PMD mapping the THP.
 *
 * Context: The caller must hold the mmap_lock of @mm (asserted below).
 *
 * Return: SCAN_SUCCEED if the page table was retracted (and, when
 * @install_pmd is set, the result of set_huge_pmd()); otherwise a SCAN_*
 * code naming the reason, e.g. SCAN_VMA_CHECK, SCAN_PMD_MAPPED,
 * SCAN_PTE_UFFD_WP, SCAN_PAGE_NULL, SCAN_PAGE_COMPOUND,
 * SCAN_PTE_NON_PRESENT or SCAN_FAIL.
 */
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
                            bool install_pmd)
{
        struct mmu_notifier_range range;
        bool notified = false;
        unsigned long haddr = addr & HPAGE_PMD_MASK;
        struct vm_area_struct *vma = vma_lookup(mm, haddr);
        struct folio *folio;
        pte_t *start_pte, *pte;
        pmd_t *pmd, pgt_pmd;
        spinlock_t *pml = NULL, *ptl;
        int nr_ptes = 0, result = SCAN_FAIL;
        int i;

        mmap_assert_locked(mm);

        /* First check VMA found, in case page tables are being torn down */
        if (!vma || !vma->vm_file ||
            !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
                return SCAN_VMA_CHECK;

        /* Fast check before locking page if already PMD-mapped */
        result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
        if (result == SCAN_PMD_MAPPED)
                return result;

        /*
         * If we are here, we've succeeded in replacing all the native pages
         * in the page cache with a single hugepage. If a mm were to fault-in
         * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
         * and map it by a PMD, regardless of sysfs THP settings. As such, let's
         * analogously elide sysfs THP settings here.
         */
        if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
                return SCAN_VMA_CHECK;

        /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
        if (userfaultfd_wp(vma))
                return SCAN_PTE_UFFD_WP;

        /* From here on every exit must go through drop_folio. */
        folio = filemap_lock_folio(vma->vm_file->f_mapping,
                               linear_page_index(vma, haddr));
        if (IS_ERR(folio))
                return SCAN_PAGE_NULL;

        if (folio_order(folio) != HPAGE_PMD_ORDER) {
                result = SCAN_PAGE_COMPOUND;
                goto drop_folio;
        }

        /* Repeat the PMD lookup now that the folio is locked. */
        result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
        switch (result) {
        case SCAN_SUCCEED:
                break;
        case SCAN_PMD_NONE:
                /*
                 * All pte entries have been removed and pmd cleared.
                 * Skip all the pte checks and just update the pmd mapping.
                 */
                goto maybe_install_pmd;
        default:
                goto drop_folio;
        }

        /* Default for abort paths that do not set a more specific code. */
        result = SCAN_FAIL;
        start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
        if (!start_pte)                /* mmap_lock + page lock should prevent this */
                goto drop_folio;

        /* step 1: check all mapped PTEs are to the right huge page */
        for (i = 0, addr = haddr, pte = start_pte;
             i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
                struct page *page;
                pte_t ptent = ptep_get(pte);

                /* empty pte, skip */
                if (pte_none(ptent))
                        continue;

                /* page swapped out, abort */
                if (!pte_present(ptent)) {
                        result = SCAN_PTE_NON_PRESENT;
                        goto abort;
                }

                page = vm_normal_page(vma, addr, ptent);
                if (WARN_ON_ONCE(page && is_zone_device_page(page)))
                        page = NULL;
                /*
                 * Note that uprobe, debugger, or MAP_PRIVATE may change the
                 * page table, but the new page will not be a subpage of hpage.
                 */
                if (folio_page(folio, i) != page)
                        goto abort;
        }

        /* ptl is dropped here so the notifier start can run without it. */
        pte_unmap_unlock(start_pte, ptl);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                haddr, haddr + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);
        notified = true;

        /*
         * pmd_lock covers a wider range than ptl, and (if split from mm's
         * page_table_lock) ptl nests inside pml. The less time we hold pml,
         * the better; but userfaultfd's mfill_atomic_pte() on a private VMA
         * inserts a valid as-if-COWed PTE without even looking up page cache.
         * So page lock of folio does not protect from it, so we must not drop
         * ptl before pgt_pmd is removed, so uffd private needs pml taken now.
         */
        if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
                pml = pmd_lock(mm, pmd);

        start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
        if (!start_pte)                /* mmap_lock + page lock should prevent this */
                goto abort;
        /* Take ptl unless it is the very lock already held as pml. */
        if (!pml)
                spin_lock(ptl);
        else if (ptl != pml)
                spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

        /* step 2: clear page table and adjust rmap */
        for (i = 0, addr = haddr, pte = start_pte;
             i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
                struct page *page;
                pte_t ptent = ptep_get(pte);

                if (pte_none(ptent))
                        continue;
                /*
                 * We dropped ptl after the first scan, to do the mmu_notifier:
                 * page lock stops more PTEs of the folio being faulted in, but
                 * does not stop write faults COWing anon copies from existing
                 * PTEs; and does not stop those being swapped out or migrated.
                 */
                if (!pte_present(ptent)) {
                        result = SCAN_PTE_NON_PRESENT;
                        goto abort;
                }
                page = vm_normal_page(vma, addr, ptent);
                if (folio_page(folio, i) != page)
                        goto abort;

                /*
                 * Must clear entry, or a racing truncate may re-remove it.
                 * TLB flush can be left until pmdp_collapse_flush() does it.
                 * PTE dirty? Shmem page is already dirty; file is read-only.
                 */
                ptep_clear(mm, addr, pte);
                folio_remove_rmap_pte(folio, page, vma);
                /* Counted so refcount and mm counters can be fixed up later. */
                nr_ptes++;
        }

        pte_unmap(start_pte);
        /* When pml was taken early, ptl stays held until step 4. */
        if (!pml)
                spin_unlock(ptl);

        /* step 3: set proper refcount and mm_counters. */
        if (nr_ptes) {
                folio_ref_sub(folio, nr_ptes);
                add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
        }

        /* step 4: remove empty page table */
        if (!pml) {
                pml = pmd_lock(mm, pmd);
                if (ptl != pml)
                        spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
        }
        pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
        pmdp_get_lockless_sync();
        if (ptl != pml)
                spin_unlock(ptl);
        spin_unlock(pml);

        mmu_notifier_invalidate_range_end(&range);

        mm_dec_nr_ptes(mm);
        page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
        pte_free_defer(mm, pmd_pgtable(pgt_pmd));

maybe_install_pmd:
        /* step 5: install pmd entry */
        result = install_pmd
                        ? set_huge_pmd(vma, haddr, pmd, &folio->page)
                        : SCAN_SUCCEED;
        goto drop_folio;
abort:
        /*
         * Step 2 may already have cleared nr_ptes entries before bailing
         * out: flush the TLB for them and apply the same refcount and
         * counter adjustment that step 3 would have made.
         */
        if (nr_ptes) {
                flush_tlb_mm(mm);
                folio_ref_sub(folio, nr_ptes);
                add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
        }
        if (start_pte)
                pte_unmap_unlock(start_pte, ptl);
        if (pml && pml != ptl)
                spin_unlock(pml);
        /* Pair the invalidate_range_start() issued after step 1. */
        if (notified)
                mmu_notifier_invalidate_range_end(&range);
drop_folio:
        /* Release the folio obtained from filemap_lock_folio() above. */
        folio_unlock(folio);
        folio_put(folio);
        return result;
}

/*
 * Walk every VMA in @mapping's i_mmap interval tree that covers @pgoff and,
 * where the corresponding address is huge-PMD aligned and fully inside the
 * VMA, try to remove the page table hanging off that PMD and free it.
 *
 * VMAs with an anon_vma, VMAs registered for uffd-wp, mms that pass
 * hpage_collapse_test_exit() and ranges whose PMD lookup does not return
 * SCAN_SUCCEED are skipped.  The walk runs under i_mmap_lock_read(); the
 * mmap_lock is not taken here.
 */
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
        struct vm_area_struct *vma;

        i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                struct mmu_notifier_range range;
                struct mm_struct *mm;
                unsigned long addr;
                pmd_t *pmd, pgt_pmd;
                spinlock_t *pml;
                spinlock_t *ptl;
                /* Set when the re-check under ptlock decides to keep the table. */
                bool skipped_uffd = false;

                /*
                 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
                 * got written to. These VMAs are likely not worth removing
                 * page tables from, as PMD-mapping is likely to be split later.
                 */
                if (READ_ONCE(vma->anon_vma))
                        continue;

                /* Virtual address of @pgoff in this VMA; need a whole aligned PMD. */
                addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
                if (addr & ~HPAGE_PMD_MASK ||
                    vma->vm_end < addr + HPAGE_PMD_SIZE)
                        continue;

                mm = vma->vm_mm;
                if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
                        continue;

                if (hpage_collapse_test_exit(mm))
                        continue;
                /*
                 * When a vma is registered with uffd-wp, we cannot recycle
                 * the page table because there may be pte markers installed.
                 * Other vmas can still have the same file mapped hugely, but
                 * skip this one: it will always be mapped in small page size
                 * for uffd-wp registered ranges.
                 */
                if (userfaultfd_wp(vma))
                        continue;

                /* PTEs were notified when unmapped; but now for the PMD? */
                mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                        addr, addr + HPAGE_PMD_SIZE);
                mmu_notifier_invalidate_range_start(&range);

                /* pml first, then ptl nested inside it when they differ. */
                pml = pmd_lock(mm, pmd);
                ptl = pte_lockptr(mm, pmd);
                if (ptl != pml)
                        spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

                /*
                 * Huge page lock is still held, so normally the page table
                 * must remain empty; and we have already skipped anon_vma
                 * and userfaultfd_wp() vmas.  But since the mmap_lock is not
                 * held, it is still possible for a racing userfaultfd_ioctl()
                 * to have inserted ptes or markers.  Now that we hold ptlock,
                 * repeating the anon_vma check protects from one category,
                 * and repeating the userfaultfd_wp() check from another.
                 */
                if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
                        skipped_uffd = true;
                } else {
                        pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
                        pmdp_get_lockless_sync();
                }

                if (ptl != pml)
                        spin_unlock(ptl);
                spin_unlock(pml);

                mmu_notifier_invalidate_range_end(&range);

                /* pgt_pmd is only valid when the PMD was actually cleared. */
                if (!skipped_uffd) {
                        mm_dec_nr_ptes(mm);
                        page_table_check_pte_clear_range(mm, addr, pgt_pmd);
                        pte_free_defer(mm, pmd_pgtable(pgt_pmd));
                }
        }
        i_mmap_unlock_read(mapping);
}

/**
 * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
 *
 * @mm: process address space where collapse happens
 * @addr: virtual collapse start address
 * @file: file that collapse on
 * @start: collapse start address
 * @cc: collapse context and scratchpad
 *
 * Basic scheme is simple, details are more complex:
 *  - allocate and lock a new huge page;
 *  - scan page cache, locking old pages
 *    + swap/gup in pages if necessary;
 *  - copy data to new page
 *  - handle shmem holes
 *    + re-validate that holes weren't filled by someone else
 *    + check for userfaultfd
 *  - finalize updates to the page cache;
 *  - if replacing succeeds:
 *    + unlock huge page;
 *    + free old pages;
 *  - if replacing failed;
 *    + unlock old pages
 *    + unlock and free huge page;
 */
static int collapse_file(struct mm_struct *mm, unsigned long addr,
                         struct file *file, pgoff_t start,
                         struct collapse_control *cc)
{
        struct address_space *mapping = file->f_mapping;
        struct page *dst;
        struct folio *folio, *tmp, *new_folio;
        pgoff_t index = 0, end = start + HPAGE_PMD_NR;
        LIST_HEAD(pagelist);
        XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
        int nr_none = 0, result = SCAN_SUCCEED;
        bool is_shmem = shmem_file(file);

        VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
        VM_BUG_ON(start & (HPAGE_PMD_NR - 1));

        result = alloc_charge_folio(&new_folio, mm, cc);
        if (result != SCAN_SUCCEED)
                goto out;

        __folio_set_locked(new_folio);
        if (is_shmem)
                __folio_set_swapbacked(new_folio);
        new_folio->index = start;
        new_folio->mapping = mapping;

        /*
         * Ensure we have slots for all the pages in the range.  This is
         * almost certainly a no-op because most of the pages must be present
         */
        do {
                xas_lock_irq(&xas);
                xas_create_range(&xas);
                if (!xas_error(&xas))
                        break;
                xas_unlock_irq(&xas);
                if (!xas_nomem(&xas, GFP_KERNEL)) {
                        result = SCAN_FAIL;
                        goto rollback;
                }
        } while (1);

        for (index = start; index < end; index++) {
                xas_set(&xas, index);
                folio = xas_load(&xas);

                VM_BUG_ON(index != xas.xa_index);
                if (is_shmem) {
                        if (!folio) {
                                /*
                                 * Stop if extent has been truncated or
                                 * hole-punched, and is now completely
                                 * empty.
                                 */
                                if (index == start) {
                                        if (!xas_next_entry(&xas, end - 1)) {
                                                result = SCAN_TRUNCATED;
                                                goto xa_locked;
                                        }
                                }
                                nr_none++;
                                continue;
                        }

                        if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
                                xas_unlock_irq(&xas);
                                /* swap in or instantiate fallocated page */
                                if (shmem_get_folio(mapping->host, index,
                                                &folio, SGP_NOALLOC)) {
                                        result = SCAN_FAIL;
                                        goto xa_unlocked;
                                }
                                /* drain lru cache to help isolate_lru_page() */
                                lru_add_drain();
                        } else if (folio_trylock(folio)) {
                                folio_get(folio);
                                xas_unlock_irq(&xas);
                        } else {
                                result = SCAN_PAGE_LOCK;
                                goto xa_locked;
                        }
                } else {        /* !is_shmem */
                        if (!folio || xa_is_value(folio)) {
                                xas_unlock_irq(&xas);
                                page_cache_sync_readahead(mapping, &file->f_ra,
                                                          file, index,
                                                          end - index);
                                /* drain lru cache to help isolate_lru_page() */
                                lru_add_drain();
                                folio = filemap_lock_folio(mapping, index);
                                if (IS_ERR(folio)) {
                                        result = SCAN_FAIL;
                                        goto xa_unlocked;
                                }
                        } else if (folio_test_dirty(folio)) {
                                /*
                                 * khugepaged only works on read-only fd,
                                 * so this page is dirty because it hasn't
                                 * been flushed since first write. There
                                 * won't be new dirty pages.
                                 *
                                 * Trigger async flush here and hope the
                                 * writeback is done when khugepaged
                                 * revisits this page.
                                 *
                                 * This is a one-off situation. We are not
                                 * forcing writeback in loop.
                                 */
                                xas_unlock_irq(&xas);
                                filemap_flush(mapping);
                                result = SCAN_FAIL;
                                goto xa_unlocked;
                        } else if (folio_test_writeback(folio)) {
                                xas_unlock_irq(&xas);
                                result = SCAN_FAIL;
                                goto xa_unlocked;
                        } else if (folio_trylock(folio)) {
                                folio_get(folio);
                                xas_unlock_irq(&xas);
                        } else {
                                result = SCAN_PAGE_LOCK;
                                goto xa_locked;
                        }
                }

                /*
                 * The folio must be locked, so we can drop the i_pages lock
                 * without racing with truncate.
                 */
                VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

                /* make sure the folio is up to date */
                if (unlikely(!folio_test_uptodate(folio))) {
                        result = SCAN_FAIL;
                        goto out_unlock;
                }

                /*
                 * If file was truncated then extended, or hole-punched, before
                 * we locked the first folio, then a THP might be there already.
                 * This will be discovered on the first iteration.
                 */
                if (folio_test_large(folio)) {
                        result = folio_order(folio) == HPAGE_PMD_ORDER &&
                                        folio->index == start
                                        /* Maybe PMD-mapped */
                                        ? SCAN_PTE_MAPPED_HUGEPAGE
                                        : SCAN_PAGE_COMPOUND;
                        goto out_unlock;
                }

                if (folio_mapping(folio) != mapping) {
                        result = SCAN_TRUNCATED;
                        goto out_unlock;
                }

                if (!is_shmem && (folio_test_dirty(folio) ||
                                  folio_test_writeback(folio))) {
                        /*
                         * khugepaged only works on read-only fd, so this
                         * folio is dirty because it hasn't been flushed
                         * since first write.
                         */
                        result = SCAN_FAIL;
                        goto out_unlock;
                }

                if (!folio_isolate_lru(folio)) {
                        result = SCAN_DEL_PAGE_LRU;
                        goto out_unlock;
                }

                if (!filemap_release_folio(folio, GFP_KERNEL)) {
                        result = SCAN_PAGE_HAS_PRIVATE;
                        folio_putback_lru(folio);
                        goto out_unlock;
                }

                if (folio_mapped(folio))
                        try_to_unmap(folio,
                                        TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);

                xas_lock_irq(&xas);

                VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio);

                /*
                 * We control three references to the folio:
                 *  - we hold a pin on it;
                 *  - one reference from page cache;
                 *  - one from lru_isolate_folio;
                 * If those are the only references, then any new usage
                 * of the folio will have to fetch it from the page
                 * cache. That requires locking the folio to handle
                 * truncate, so any new usage will be blocked until we
                 * unlock folio after collapse/during rollback.
                 */
                if (folio_ref_count(folio) != 3) {
                        result = SCAN_PAGE_COUNT;
                        xas_unlock_irq(&xas);
                        folio_putback_lru(folio);
                        goto out_unlock;
                }

                /*
                 * Accumulate the folios that are being collapsed.
                 */
                list_add_tail(&folio->lru, &pagelist);
                continue;
out_unlock:
                folio_unlock(folio);
                folio_put(folio);
                goto xa_unlocked;
        }

        if (!is_shmem) {
                filemap_nr_thps_inc(mapping);
                /*
                 * Paired with smp_mb() in do_dentry_open() to ensure
                 * i_writecount is up to date and the update to nr_thps is
                 * visible. Ensures the page cache will be truncated if the
                 * file is opened writable.
                 */
                smp_mb();
                if (inode_is_open_for_write(mapping->host)) {
                        result = SCAN_FAIL;
                        filemap_nr_thps_dec(mapping);
                }
        }

xa_locked:
        xas_unlock_irq(&xas);
xa_unlocked:

        /*
         * If collapse is successful, flush must be done now before copying.
         * If collapse is unsuccessful, does flush actually need to be done?
         * Do it anyway, to clear the state.
         */
        try_to_unmap_flush();

        if (result == SCAN_SUCCEED && nr_none &&
            !shmem_charge(mapping->host, nr_none))
                result = SCAN_FAIL;
        if (result != SCAN_SUCCEED) {
                nr_none = 0;
                goto rollback;
        }

        /*
         * The old folios are locked, so they won't change anymore.
         */
        index = start;
        dst = folio_page(new_folio, 0);
        list_for_each_entry(folio, &pagelist, lru) {
                while (index < folio->index) {
                        clear_highpage(dst);
                        index++;
                        dst++;
                }
                if (copy_mc_highpage(dst, folio_page(folio, 0)) > 0) {
                        result = SCAN_COPY_MC;
                        goto rollback;
                }
                index++;
                dst++;
        }
        while (index < end) {
                clear_highpage(dst);
                index++;
                dst++;
        }

        if (nr_none) {
                struct vm_area_struct *vma;
                int nr_none_check = 0;

                i_mmap_lock_read(mapping);
                xas_lock_irq(&xas);

                xas_set(&xas, start);
                for (index = start; index < end; index++) {
                        if (!xas_next(&xas)) {
                                xas_store(&xas, XA_RETRY_ENTRY);
                                if (xas_error(&xas)) {
                                        result = SCAN_STORE_FAILED;
                                        goto immap_locked;
                                }
                                nr_none_check++;
                        }
                }

                if (nr_none != nr_none_check) {
                        result = SCAN_PAGE_FILLED;
                        goto immap_locked;
                }

                /*
                 * If userspace observed a missing page in a VMA with
                 * a MODE_MISSING userfaultfd, then it might expect a
                 * UFFD_EVENT_PAGEFAULT for that page. If so, we need to
                 * roll back to avoid suppressing such an event. Since
                 * wp/minor userfaultfds don't give userspace any
                 * guarantees that the kernel doesn't fill a missing
                 * page with a zero page, so they don't matter here.
                 *
                 * Any userfaultfds registered after this point will
                 * not be able to observe any missing pages due to the
                 * previously inserted retry entries.
                 */
                vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
                        if (userfaultfd_missing(vma)) {
                                result = SCAN_EXCEED_NONE_PTE;
                                goto immap_locked;
                        }
                }

immap_locked:
                i_mmap_unlock_read(mapping);
                if (result != SCAN_SUCCEED) {
                        xas_set(&xas, start);
                        for (index = start; index < end; index++) {
                                if (xas_next(&xas) == XA_RETRY_ENTRY)
                                        xas_store(&xas, NULL);
                        }

                        xas_unlock_irq(&xas);
                        goto rollback;
                }
        } else {
                xas_lock_irq(&xas);
        }

        if (is_shmem)
                __lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
        else
                __lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);

        if (nr_none) {
                __lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
                /* nr_none is always 0 for non-shmem. */
                __lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
        }

        /*
         * Mark new_folio as uptodate before inserting it into the
         * page cache so that it isn't mistaken for an fallocated but
         * unwritten page.
         */
        folio_mark_uptodate(new_folio);
        folio_ref_add(new_folio, HPAGE_PMD_NR - 1);

        if (is_shmem)
                folio_mark_dirty(new_folio);
        folio_add_lru(new_folio);

        /* Join all the small entries into a single multi-index entry. */
        xas_set_order(&xas, start, HPAGE_PMD_ORDER);
        xas_store(&xas, new_folio);
        WARN_ON_ONCE(xas_error(&xas));
        xas_unlock_irq(&xas);

        /*
         * Remove pte page tables, so we can re-fault the page as huge.
         * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
         */
        retract_page_tables(mapping, start);
        if (cc && !cc->is_khugepaged)
                result = SCAN_PTE_MAPPED_HUGEPAGE;
        folio_unlock(new_folio);

        /*
         * The collapse has succeeded, so free the old folios.
         */
        list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
                list_del(&folio->lru);
                folio->mapping = NULL;
                folio_clear_active(folio);
                folio_clear_unevictable(folio);
                folio_unlock(folio);
                folio_put_refs(folio, 3);
        }

        goto out;

rollback:
        /* Something went wrong: roll back page cache changes */
        if (nr_none) {
                xas_lock_irq(&xas);
                mapping->nrpages -= nr_none;
                xas_unlock_irq(&xas);
                shmem_uncharge(mapping->host, nr_none);
        }

        list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
                list_del(&folio->lru);
                folio_unlock(folio);
                folio_putback_lru(folio);
                folio_put(folio);
        }
        /*
         * Undo the updates of filemap_nr_thps_inc for non-SHMEM
         * file only. This undo is not needed unless failure is
         * due to SCAN_COPY_MC.
         */
        if (!is_shmem && result == SCAN_COPY_MC) {
                filemap_nr_thps_dec(mapping);
                /*
                 * Paired with smp_mb() in do_dentry_open() to
                 * ensure the update to nr_thps is visible.
                 */
                smp_mb();
        }

        new_folio->mapping = NULL;

        folio_unlock(new_folio);
        folio_put(new_folio);
out:
        VM_BUG_ON(!list_empty(&pagelist));
        trace_mm_khugepaged_collapse_file(mm, new_folio, index, is_shmem, addr, file, HPAGE_PMD_NR, result);
        return result;
}

/*
 * Scan the page cache of @file for the HPAGE_PMD_NR-page range starting at
 * index @start and, if the range looks suitable, hand it to collapse_file().
 *
 * @mm, @addr: passed through to collapse_file() and the tracepoint.
 * @file:      file whose f_mapping is walked.
 * @start:     first page cache index of the candidate range.
 * @cc:        collapse control; its node_load[] and alloc_nmask are reset
 *             here and node_load[] is updated for every folio accepted.
 *
 * The walk runs under rcu_read_lock().  Value entries are counted in
 * 'swap', folios that pass all checks are counted in 'present'.
 *
 * Returns a SCAN_* code: the first failure found during the walk,
 * SCAN_EXCEED_NONE_PTE if khugepaged found too few present pages, or
 * otherwise whatever collapse_file() returns.
 */
static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
                                    struct file *file, pgoff_t start,
                                    struct collapse_control *cc)
{
        struct folio *folio = NULL;
        struct address_space *mapping = file->f_mapping;
        XA_STATE(xas, &mapping->i_pages, start);
        int present, swap;
        int node = NUMA_NO_NODE;
        int result = SCAN_SUCCEED;

        present = 0;
        swap = 0;
        /* Start every scan with clean per-node statistics. */
        memset(cc->node_load, 0, sizeof(cc->node_load));
        nodes_clear(cc->alloc_nmask);
        rcu_read_lock();
        xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) {
                if (xas_retry(&xas, folio))
                        continue;

                /*
                 * Value (non-pointer) entries are tallied as 'swap'; only
                 * khugepaged enforces the khugepaged_max_ptes_swap limit.
                 */
                if (xa_is_value(folio)) {
                        ++swap;
                        if (cc->is_khugepaged &&
                            swap > khugepaged_max_ptes_swap) {
                                result = SCAN_EXCEED_SWAP_PTE;
                                count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
                                break;
                        }
                        continue;
                }

                /*
                 * TODO: khugepaged should compact smaller compound pages
                 * into a PMD sized page
                 */
                if (folio_test_large(folio)) {
                        result = folio_order(folio) == HPAGE_PMD_ORDER &&
                                        folio->index == start
                                        /* Maybe PMD-mapped */
                                        ? SCAN_PTE_MAPPED_HUGEPAGE
                                        : SCAN_PAGE_COMPOUND;
                        /*
                         * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
                         * by the caller won't touch the page cache, and so
                         * it's safe to skip LRU and refcount checks before
                         * returning.
                         */
                        break;
                }

                /* Account the folio to its NUMA node unless told to abort. */
                node = folio_nid(folio);
                if (hpage_collapse_scan_abort(node, cc)) {
                        result = SCAN_SCAN_ABORT;
                        break;
                }
                cc->node_load[node]++;

                if (!folio_test_lru(folio)) {
                        result = SCAN_PAGE_LRU;
                        break;
                }

                /*
                 * Any reference beyond one base reference, the mappings and
                 * the private flag is treated as an unexpected extra user.
                 * NOTE(review): the "1" is presumably the page cache
                 * reference - confirm.
                 */
                if (folio_ref_count(folio) !=
                    1 + folio_mapcount(folio) + folio_test_private(folio)) {
                        result = SCAN_PAGE_COUNT;
                        break;
                }

                /*
                 * We probably should check if the folio is referenced
                 * here, but nobody would transfer pte_young() to
                 * folio_test_referenced() for us.  And rmap walk here
                 * is just too costly...
                 */

                present++;

                /* Pause the xarray walk before yielding inside the loop. */
                if (need_resched()) {
                        xas_pause(&xas);
                        cond_resched_rcu();
                }
        }
        rcu_read_unlock();

        /*
         * The walk found no blocker.  khugepaged additionally requires a
         * minimum number of present pages; other callers collapse
         * unconditionally.
         */
        if (result == SCAN_SUCCEED) {
                if (cc->is_khugepaged &&
                    present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
                        result = SCAN_EXCEED_NONE_PTE;
                        count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
                } else {
                        result = collapse_file(mm, addr, file, start, cc);
                }
        }

        trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
        return result;
}
#else
/*
 * Stub used when the real implementation in the preceding preprocessor
 * branch is compiled out.  The callers in this file guard their calls with
 * IS_ENABLED(CONFIG_SHMEM); BUILD_BUG() is expected to fail the build if a
 * call to this stub is nevertheless emitted.
 */
static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
                                    struct file *file, pgoff_t start,
                                    struct collapse_control *cc)
{
        BUILD_BUG();
}
#endif

/*
 * Scan part of one mm for collapse candidates.
 *
 * @pages:  scan budget for this call; must be non-zero.
 * @result: out parameter, preset to SCAN_FAIL and then overwritten with the
 *          SCAN_* code of each range that gets scanned.
 * @cc:     collapse control forwarded to the per-range scan helpers.
 *
 * The mm scanned is the one recorded in khugepaged_scan.mm_slot, or the
 * first entry of khugepaged_scan.mm_head when no slot is recorded; scanning
 * resumes at khugepaged_scan.address.
 *
 * Must be called with khugepaged_mm_lock held.  The lock is dropped while
 * the mm is scanned and taken again before returning.
 *
 * Returns the progress made: 1 for taking mmap_lock, 1 for every VMA that is
 * skipped and HPAGE_PMD_NR for every huge-page-sized range that is scanned.
 */
static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
                                            struct collapse_control *cc)
        __releases(&khugepaged_mm_lock)
        __acquires(&khugepaged_mm_lock)
{
        struct vma_iterator vmi;
        struct khugepaged_mm_slot *mm_slot;
        struct mm_slot *slot;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        int progress = 0;

        VM_BUG_ON(!pages);
        lockdep_assert_held(&khugepaged_mm_lock);
        *result = SCAN_FAIL;

        /* Resume the recorded slot, or start over at the list head. */
        if (khugepaged_scan.mm_slot) {
                mm_slot = khugepaged_scan.mm_slot;
                slot = &mm_slot->slot;
        } else {
                slot = list_entry(khugepaged_scan.mm_head.next,
                                     struct mm_slot, mm_node);
                mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
                khugepaged_scan.address = 0;
                khugepaged_scan.mm_slot = mm_slot;
        }
        spin_unlock(&khugepaged_mm_lock);

        mm = slot->mm;
        /*
         * Don't wait for semaphore (to avoid long wait times).  Just move to
         * the next mm on the list.
         */
        vma = NULL;
        if (unlikely(!mmap_read_trylock(mm)))
                goto breakouterloop_mmap_lock;

        progress++;
        if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
                goto breakouterloop;

        vma_iter_init(&vmi, mm, khugepaged_scan.address);
        for_each_vma(vmi, vma) {
                unsigned long hstart, hend;

                cond_resched();
                if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
                        progress++;
                        break;
                }
                if (!thp_vma_allowable_order(vma, vma->vm_flags,
                                        TVA_ENFORCE_SYSFS, PMD_ORDER)) {
skip:
                        progress++;
                        continue;
                }
                /* Huge-page-aligned window inside this VMA. */
                hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
                hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
                if (khugepaged_scan.address > hend)
                        goto skip;
                if (khugepaged_scan.address < hstart)
                        khugepaged_scan.address = hstart;
                VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);

                while (khugepaged_scan.address < hend) {
                        bool mmap_locked = true;

                        cond_resched();
                        if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
                                goto breakouterloop;

                        VM_BUG_ON(khugepaged_scan.address < hstart ||
                                  khugepaged_scan.address + HPAGE_PMD_SIZE >
                                  hend);
                        if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
                                /*
                                 * File-backed range: pin the file, drop
                                 * mmap_lock, then scan the page cache.
                                 */
                                struct file *file = get_file(vma->vm_file);
                                pgoff_t pgoff = linear_page_index(vma,
                                                khugepaged_scan.address);

                                mmap_read_unlock(mm);
                                mmap_locked = false;
                                *result = hpage_collapse_scan_file(mm,
                                        khugepaged_scan.address, file, pgoff, cc);
                                fput(file);
                                /*
                                 * This result asks for a follow-up
                                 * collapse_pte_mapped_thp() call, which is
                                 * made with mmap_lock re-taken for read.
                                 */
                                if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
                                        mmap_read_lock(mm);
                                        if (hpage_collapse_test_exit_or_disable(mm))
                                                goto breakouterloop;
                                        *result = collapse_pte_mapped_thp(mm,
                                                khugepaged_scan.address, false);
                                        if (*result == SCAN_PMD_MAPPED)
                                                *result = SCAN_SUCCEED;
                                        mmap_read_unlock(mm);
                                }
                        } else {
                                *result = hpage_collapse_scan_pmd(mm, vma,
                                        khugepaged_scan.address, &mmap_locked, cc);
                        }

                        if (*result == SCAN_SUCCEED)
                                ++khugepaged_pages_collapsed;

                        /* move to next address */
                        khugepaged_scan.address += HPAGE_PMD_SIZE;
                        progress += HPAGE_PMD_NR;
                        if (!mmap_locked)
                                /*
                                 * We released mmap_lock so break loop.  Note
                                 * that we drop mmap_lock before all hugepage
                                 * allocations, so if allocation fails, we are
                                 * guaranteed to break here and report the
                                 * correct result back to caller.
                                 */
                                goto breakouterloop_mmap_lock;
                        if (progress >= pages)
                                goto breakouterloop;
                }
        }
breakouterloop:
        mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
breakouterloop_mmap_lock:

        spin_lock(&khugepaged_mm_lock);
        VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
        /*
         * Release the current mm_slot if this mm is about to die, or
         * if we scanned all vmas of this mm.
         */
        if (hpage_collapse_test_exit(mm) || !vma) {
                /*
                 * Make sure that if mm_users is reaching zero while
                 * khugepaged runs here, khugepaged_exit will find
                 * mm_slot not pointing to the exiting mm.
                 */
                if (slot->mm_node.next != &khugepaged_scan.mm_head) {
                        slot = list_entry(slot->mm_node.next,
                                          struct mm_slot, mm_node);
                        khugepaged_scan.mm_slot =
                                mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
                        khugepaged_scan.address = 0;
                } else {
                        /* End of the list: one full pass completed. */
                        khugepaged_scan.mm_slot = NULL;
                        khugepaged_full_scans++;
                }

                collect_mm_slot(mm_slot);
        }

        return progress;
}

/*
 * Return 1 when at least one mm is queued on khugepaged_scan.mm_head and
 * hugepage_flags_enabled() reports true, 0 otherwise.
 */
static int khugepaged_has_work(void)
{
        /* Nothing queued: the enable flags are not even consulted. */
        if (list_empty(&khugepaged_scan.mm_head))
                return 0;

        return hugepage_flags_enabled() ? 1 : 0;
}

/*
 * Wake-up condition used by khugepaged_wait_work(): return 1 when an mm is
 * queued on khugepaged_scan.mm_head or the thread has been asked to stop,
 * 0 otherwise.
 */
static int khugepaged_wait_event(void)
{
        if (!list_empty(&khugepaged_scan.mm_head))
                return 1;

        return kthread_should_stop() ? 1 : 0;
}

/*
 * Run one scan round for khugepaged.
 *
 * Repeatedly calls khugepaged_scan_mm_slot() until the budget read from
 * khugepaged_pages_to_scan is used up, there is no more work, the mm list
 * head has been passed twice, the thread is asked to stop, or a huge page
 * allocation fails for the second time in this round.
 */
static void khugepaged_do_scan(struct collapse_control *cc)
{
        const unsigned int budget = READ_ONCE(khugepaged_pages_to_scan);
        unsigned int scanned = 0;
        unsigned int head_passes = 0;
        bool first_alloc_failure = true;
        int result = SCAN_SUCCEED;

        lru_add_drain_all();

        for (;;) {
                cond_resched();

                if (unlikely(kthread_should_stop()))
                        return;

                spin_lock(&khugepaged_mm_lock);
                /* No current slot means the next scan starts at the head. */
                if (!khugepaged_scan.mm_slot)
                        head_passes++;
                if (khugepaged_has_work() && head_passes < 2)
                        scanned += khugepaged_scan_mm_slot(budget - scanned,
                                                           &result, cc);
                else
                        scanned = budget;
                spin_unlock(&khugepaged_mm_lock);

                if (scanned >= budget)
                        return;

                if (result != SCAN_ALLOC_HUGE_PAGE_FAIL)
                        continue;

                /*
                 * First allocation failure of the round: sleep for a while
                 * and retry.  A second failure ends the round.
                 */
                if (!first_alloc_failure)
                        return;
                first_alloc_failure = false;
                khugepaged_alloc_sleep();
        }
}

static bool khugepaged_should_wakeup(void)
{
        return kthread_should_stop() ||
               time_after_eq(jiffies, khugepaged_sleep_expire);
}

/*
 * Sleep between scan rounds.
 *
 * With work pending, sleep for khugepaged_scan_sleep_millisecs (no sleep at
 * all if that converts to zero jiffies).  Without pending work, and only
 * while hugepage_flags_enabled() is true, sleep until
 * khugepaged_wait_event() becomes true.
 */
static void khugepaged_wait_work(void)
{
        unsigned long sleep_jiffies;

        if (!khugepaged_has_work()) {
                if (hugepage_flags_enabled())
                        wait_event_freezable(khugepaged_wait,
                                             khugepaged_wait_event());
                return;
        }

        sleep_jiffies = msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
        if (sleep_jiffies == 0)
                return;

        /* Published for khugepaged_should_wakeup(). */
        khugepaged_sleep_expire = jiffies + sleep_jiffies;
        wait_event_freezable_timeout(khugepaged_wait,
                                     khugepaged_should_wakeup(),
                                     sleep_jiffies);
}

/*
 * Thread function of the khugepaged kernel thread (started by
 * start_stop_khugepaged()).  @none is unused.
 *
 * Alternates scan rounds and sleeps until asked to stop, then releases the
 * mm slot that was being scanned, if any.  Always returns 0.
 */
static int khugepaged(void *none)
{
        struct khugepaged_mm_slot *leftover;

        set_freezable();
        set_user_nice(current, MAX_NICE);

        while (!kthread_should_stop()) {
                khugepaged_do_scan(&khugepaged_collapse_control);
                khugepaged_wait_work();
        }

        /* Detach the in-progress slot under the lock and collect it. */
        spin_lock(&khugepaged_mm_lock);
        leftover = khugepaged_scan.mm_slot;
        khugepaged_scan.mm_slot = NULL;
        if (leftover != NULL)
                collect_mm_slot(leftover);
        spin_unlock(&khugepaged_mm_lock);

        return 0;
}

/*
 * Raise min_free_kbytes to a value that helps transparent hugepage
 * allocations, then refresh the per-zone watermarks.
 *
 * When hugepage_flags_enabled() is false, calculate_min_free_kbytes() is
 * called instead.  min_free_kbytes is only ever raised here, never lowered.
 */
static void set_recommended_min_free_kbytes(void)
{
        unsigned long recommended_min;
        struct zone *zone;
        int nr_zones = 0;

        if (!hugepage_flags_enabled()) {
                calculate_min_free_kbytes();
                setup_per_zone_wmarks();
                return;
        }

        /*
         * Count the populated zones up to and including the GFP_USER zone.
         * ZONE_MOVABLE is left out: it only has movable pages, so its
         * fragmentation is not a concern.
         */
        for_each_populated_zone(zone) {
                if (zone_idx(zone) <= gfp_zone(GFP_USER))
                        nr_zones++;
        }

        /* Keep 2 pageblocks per zone free to assist fragmentation avoidance */
        recommended_min = pageblock_nr_pages * nr_zones * 2;

        /*
         * On top of that, keep on average at least two pageblocks almost
         * free of another type: one for a migratetype to fall back to and
         * one to avoid subsequent fallbacks of other types.  There are 3
         * MIGRATE_TYPES we care about.
         */
        recommended_min += pageblock_nr_pages * nr_zones *
                           MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;

        /* Cap the reservation at 5% of lowmem. */
        recommended_min = min(recommended_min,
                              (unsigned long) nr_free_buffer_pages() / 20);
        /* Convert from pages to kilobytes. */
        recommended_min <<= (PAGE_SHIFT-10);

        if (recommended_min > min_free_kbytes) {
                if (user_min_free_kbytes >= 0)
                        pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
                                min_free_kbytes, recommended_min);

                min_free_kbytes = recommended_min;
        }

        setup_per_zone_wmarks();
}

/*
 * Bring the khugepaged thread in line with the current hugepage flags.
 *
 * With hugepages enabled the thread is created if missing and woken when
 * there are mms queued; with hugepages disabled a running thread is stopped.
 * Unless thread creation failed, min_free_kbytes is re-evaluated afterwards.
 * Everything runs under khugepaged_mutex.
 *
 * Returns 0 on success or the error from kthread_run().
 */
int start_stop_khugepaged(void)
{
        int err = 0;

        mutex_lock(&khugepaged_mutex);

        if (!hugepage_flags_enabled()) {
                if (khugepaged_thread) {
                        kthread_stop(khugepaged_thread);
                        khugepaged_thread = NULL;
                }
        } else {
                if (!khugepaged_thread)
                        khugepaged_thread = kthread_run(khugepaged, NULL,
                                                        "khugepaged");
                if (IS_ERR(khugepaged_thread)) {
                        pr_err("khugepaged: kthread_run(khugepaged) failed\n");
                        err = PTR_ERR(khugepaged_thread);
                        khugepaged_thread = NULL;
                        goto out_unlock;
                }

                if (!list_empty(&khugepaged_scan.mm_head))
                        wake_up_interruptible(&khugepaged_wait);
        }

        set_recommended_min_free_kbytes();

out_unlock:
        mutex_unlock(&khugepaged_mutex);
        return err;
}

void khugepaged_min_free_kbytes_update(void)
{
        mutex_lock(&khugepaged_mutex);
        if (hugepage_flags_enabled() && khugepaged_thread)
                set_recommended_min_free_kbytes();
        mutex_unlock(&khugepaged_mutex);
}

bool current_is_khugepaged(void)
{
        return kthread_func(current) == khugepaged;
}

/*
 * Translate a scan result into the errno reported by MADV_COLLAPSE.
 *
 * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
 * actionable feedback to the caller, so that it may take an appropriate
 * fallback measure depending on the nature of the failure.
 */
static int madvise_collapse_errno(enum scan_result r)
{
        int err;

        switch (r) {
        case SCAN_ALLOC_HUGE_PAGE_FAIL:
                err = -ENOMEM;
                break;
        case SCAN_CGROUP_CHARGE_FAIL:
        case SCAN_EXCEED_NONE_PTE:
                err = -EBUSY;
                break;
        /* Resource temporarily unavailable - trying again might succeed */
        case SCAN_PAGE_COUNT:
        case SCAN_PAGE_LOCK:
        case SCAN_PAGE_LRU:
        case SCAN_DEL_PAGE_LRU:
        case SCAN_PAGE_FILLED:
                err = -EAGAIN;
                break;
        /*
         * Everything else: retrying is unlikely to succeed, or the error is
         * intrinsic to the specified memory range.  khugepaged likely won't
         * be able to collapse it either.
         */
        default:
                err = -EINVAL;
                break;
        }

        return err;
}

/*
 * Try to collapse every huge-page-aligned range inside [@start, @end) of
 * @vma.
 *
 * @vma:   VMA covering [@start, @end); BUG_ON() fires otherwise.
 * @prev:  out parameter; set to @vma on entry and to NULL once mmap_lock
 *         has been dropped at any point.
 * @start: start of the requested range.
 * @end:   end of the requested range (exclusive).
 *
 * mmap_lock is held on entry and on return, but may be dropped and re-taken
 * in between; after re-taking it the VMA is revalidated.
 *
 * Returns 0 when the number of ranges that are (or already were) collapsed
 * equals the number of huge-page-sized ranges in the window, -EINVAL when
 * the VMA does not allow PMD-order THP, -ENOMEM when the collapse_control
 * cannot be allocated, and otherwise the errno that
 * madvise_collapse_errno() derives from the last failure.
 */
int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
                     unsigned long start, unsigned long end)
{
        struct collapse_control *cc;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long hstart, hend, addr;
        int thps = 0, last_fail = SCAN_FAIL;
        bool mmap_locked = true;

        BUG_ON(vma->vm_start > start);
        BUG_ON(vma->vm_end < end);

        *prev = vma;

        if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
                return -EINVAL;

        cc = kmalloc(sizeof(*cc), GFP_KERNEL);
        if (!cc)
                return -ENOMEM;
        cc->is_khugepaged = false;

        /* Reference on the mm, released by mmdrop() at the end. */
        mmgrab(mm);
        lru_add_drain_all();

        /* Round start up and end down to huge page boundaries. */
        hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = end & HPAGE_PMD_MASK;

        for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
                int result = SCAN_FAIL;

                /*
                 * A previous iteration dropped mmap_lock: take it again and
                 * look the VMA up afresh before using it.
                 */
                if (!mmap_locked) {
                        cond_resched();
                        mmap_read_lock(mm);
                        mmap_locked = true;
                        result = hugepage_vma_revalidate(mm, addr, false, &vma,
                                                         cc);
                        if (result  != SCAN_SUCCEED) {
                                last_fail = result;
                                goto out_nolock;
                        }

                        hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
                }
                mmap_assert_locked(mm);
                memset(cc->node_load, 0, sizeof(cc->node_load));
                nodes_clear(cc->alloc_nmask);
                if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
                        /* Pin the file so it stays valid without mmap_lock. */
                        struct file *file = get_file(vma->vm_file);
                        pgoff_t pgoff = linear_page_index(vma, addr);

                        mmap_read_unlock(mm);
                        mmap_locked = false;
                        result = hpage_collapse_scan_file(mm, addr, file, pgoff,
                                                          cc);
                        fput(file);
                } else {
                        result = hpage_collapse_scan_pmd(mm, vma, addr,
                                                         &mmap_locked, cc);
                }
                if (!mmap_locked)
                        *prev = NULL;  /* Tell caller we dropped mmap_lock */

handle_result:
                switch (result) {
                case SCAN_SUCCEED:
                case SCAN_PMD_MAPPED:
                        ++thps;
                        break;
                case SCAN_PTE_MAPPED_HUGEPAGE:
                        /*
                         * Run collapse_pte_mapped_thp() under mmap_lock and
                         * classify its result through this switch again.
                         */
                        BUG_ON(mmap_locked);
                        BUG_ON(*prev);
                        mmap_read_lock(mm);
                        result = collapse_pte_mapped_thp(mm, addr, true);
                        mmap_read_unlock(mm);
                        goto handle_result;
                /* Whitelisted set of results where continuing OK */
                case SCAN_PMD_NULL:
                case SCAN_PTE_NON_PRESENT:
                case SCAN_PTE_UFFD_WP:
                case SCAN_PAGE_RO:
                case SCAN_LACK_REFERENCED_PAGE:
                case SCAN_PAGE_NULL:
                case SCAN_PAGE_COUNT:
                case SCAN_PAGE_LOCK:
                case SCAN_PAGE_COMPOUND:
                case SCAN_PAGE_LRU:
                case SCAN_DEL_PAGE_LRU:
                        last_fail = result;
                        break;
                default:
                        last_fail = result;
                        /* Other error, exit */
                        goto out_maybelock;
                }
        }

out_maybelock:
        /* Caller expects us to hold mmap_lock on return */
        if (!mmap_locked)
                mmap_read_lock(mm);
out_nolock:
        mmap_assert_locked(mm);
        mmdrop(mm);
        kfree(cc);

        /* Success only if every range in the window counted towards thps. */
        return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
                        : madvise_collapse_errno(last_fail);
}



























   28 





   10 





















    7 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_JUMP_LABEL_H
#define _ASM_X86_JUMP_LABEL_H

#define HAVE_JUMP_LABEL_BATCH

#include <asm/asm.h>
#include <asm/nops.h>

#ifndef __ASSEMBLY__

#include <linux/stringify.h>
#include <linux/types.h>

/*
 * Assembler fragment that emits one record into the __jump_table section.
 *
 * The record holds three values, each stored relative to its own position:
 * the location of the local label "1" that the user of this macro places in
 * front of the patched instruction, the location of the l_yes label, and the
 * sum of asm operands 0 and 1 (the users in this file pass the static key
 * and the branch flag).
 */
#define JUMP_TABLE_ENTRY                                \
        ".pushsection __jump_table,  \"aw\" \n\t"        \
        _ASM_ALIGN "\n\t"                                \
        ".long 1b - . \n\t"                                \
        ".long %l[l_yes] - . \n\t"                        \
        _ASM_PTR "%c0 + %c1 - .\n\t"                        \
        ".popsection \n\t"

#ifdef CONFIG_HAVE_JUMP_LABEL_HACK

/*
 * Static branch for CONFIG_HAVE_JUMP_LABEL_HACK builds.
 *
 * Emits a jmp to l_yes (the inline comment says objtool turns it into a
 * NOP) together with a jump table record.  The record's flag operand is
 * "2 | branch", i.e. bit 1 is always set in this variant.
 *
 * Returns false when execution falls through the asm and true when it
 * arrives at l_yes.
 */
static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
        asm goto("1:"
                "jmp %l[l_yes] # objtool NOPs this \n\t"
                JUMP_TABLE_ENTRY
                : :  "i" (key), "i" (2 | branch) : : l_yes);

        return false;
l_yes:
        return true;
}

#else /* !CONFIG_HAVE_JUMP_LABEL_HACK */

/*
 * Static branch for builds without CONFIG_HAVE_JUMP_LABEL_HACK.
 *
 * Emits the bytes of BYTES_NOP5 at label "1" together with a jump table
 * record for @key and @branch.
 *
 * Returns false when execution falls through the asm and true when it
 * arrives at l_yes.
 */
static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch)
{
        asm goto("1:"
                ".byte " __stringify(BYTES_NOP5) "\n\t"
                JUMP_TABLE_ENTRY
                : :  "i" (key), "i" (branch) : : l_yes);

        return false;
l_yes:
        return true;
}

#endif /* CONFIG_HAVE_JUMP_LABEL_HACK */

/*
 * Static branch whose initial instruction is a jmp to l_yes.
 *
 * Emits the jmp at label "1" together with a jump table record for @key and
 * @branch.
 *
 * Returns true when execution arrives at l_yes and false when it falls
 * through the asm.
 */
static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch)
{
        asm goto("1:"
                "jmp %l[l_yes]\n\t"
                JUMP_TABLE_ENTRY
                : :  "i" (key), "i" (branch) : : l_yes);

        return false;
l_yes:
        return true;
}

extern int arch_jump_entry_size(struct jump_entry *entry);

#endif        /* __ASSEMBLY__ */

#endif














































































































































































































































































































    4 



    4 
    4 


    3 



















    2 
    2 




































































































































































    4 










    4 

















    4 














    4 































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_NEIGHBOUR_H
#define _NET_NEIGHBOUR_H

#include <linux/neighbour.h>

/*
 *        Generic neighbour manipulation
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 *        Alexey Kuznetsov        <kuznet@ms2.inr.ac.ru>
 *
 *         Changes:
 *
 *        Harald Welte:                <laforge@gnumonks.org>
 *                - Add neighbour cache statistics like rtstat
 */

#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rcupdate.h>
#include <linux/seq_file.h>
#include <linux/bitmap.h>

#include <linux/err.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/rtnetlink.h>

/*
 * NUD stands for "neighbor unreachability detection"
 */

/* Composite masks, each the bitwise OR of several NUD_* state bits.  Helpers
 * in this header test them against neighbour->nud_state with a bitwise AND.
 */
#define NUD_IN_TIMER        (NUD_INCOMPLETE|NUD_REACHABLE|NUD_DELAY|NUD_PROBE)
#define NUD_VALID        (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY)
#define NUD_CONNECTED        (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE)

struct neighbour;

/* Indices of the neighbour tunables.
 *
 * Values below NEIGH_VAR_DATA_MAX size and index neigh_parms::data[] and the
 * neigh_parms::data_state bitmap (see NEIGH_VAR() and neigh_var_set()).
 * Values from NEIGH_VAR_DATA_MAX upwards have no slot of their own in data[].
 */
enum {
        NEIGH_VAR_MCAST_PROBES,
        NEIGH_VAR_UCAST_PROBES,
        NEIGH_VAR_APP_PROBES,
        NEIGH_VAR_MCAST_REPROBES,
        NEIGH_VAR_RETRANS_TIME,
        NEIGH_VAR_BASE_REACHABLE_TIME,
        NEIGH_VAR_DELAY_PROBE_TIME,
        NEIGH_VAR_INTERVAL_PROBE_TIME_MS,
        NEIGH_VAR_GC_STALETIME,
        NEIGH_VAR_QUEUE_LEN_BYTES,
        NEIGH_VAR_PROXY_QLEN,
        NEIGH_VAR_ANYCAST_DELAY,
        NEIGH_VAR_PROXY_DELAY,
        NEIGH_VAR_LOCKTIME,
/* Number of real storage slots: one past the last index above. */
#define NEIGH_VAR_DATA_MAX (NEIGH_VAR_LOCKTIME + 1)
        /* Following are used as a second way to access one of the above */
        NEIGH_VAR_QUEUE_LEN, /* same data as NEIGH_VAR_QUEUE_LEN_BYTES */
        NEIGH_VAR_RETRANS_TIME_MS, /* same data as NEIGH_VAR_RETRANS_TIME */
        NEIGH_VAR_BASE_REACHABLE_TIME_MS, /* same data as NEIGH_VAR_BASE_REACHABLE_TIME */
        /* Following are used by "default" only */
        NEIGH_VAR_GC_INTERVAL,
        NEIGH_VAR_GC_THRESH1,
        NEIGH_VAR_GC_THRESH2,
        NEIGH_VAR_GC_THRESH3,
        NEIGH_VAR_MAX
};

/* A set of neighbour tunables plus the bookkeeping needed to share it.
 *
 * The tunable values live in data[], indexed by the NEIGH_VAR_* constants
 * below NEIGH_VAR_DATA_MAX; data_state carries one bit per data[] slot.
 */
struct neigh_parms {
        possible_net_t net;        /* read through neigh_parms_net() */
        struct net_device *dev;
        netdevice_tracker dev_tracker;
        struct list_head list;
        int        (*neigh_setup)(struct neighbour *);
        struct neigh_table *tbl;        /* neigh_parms_family() reports tbl->family */

        void        *sysctl_table;

        int dead;
        /* raised by neigh_parms_clone(), lowered by __neigh_parms_put() */
        refcount_t refcnt;
        struct rcu_head rcu_head;

        int        reachable_time;
        u32        qlen;
        int        data[NEIGH_VAR_DATA_MAX];
        /* bit N is set by neigh_var_set() when data[N] is written through it */
        DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX);
};

/* Store @val in slot @index of @p->data, after first flagging that slot in
 * the @p->data_state bitmap.
 */
static inline void neigh_var_set(struct neigh_parms *p, int index, int val)
{
        unsigned long *state = p->data_state;
        int *slot = &p->data[index];

        set_bit(index, state);
        *slot = val;
}

/* Lvalue for the data[] slot of @p selected by the NEIGH_VAR_<attr> index. */
#define NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr])

/* In ndo_neigh_setup, NEIGH_VAR_INIT should be used.
 * In other cases, NEIGH_VAR_SET should be used.
 *
 * NEIGH_VAR_INIT is a plain assignment to the slot; NEIGH_VAR_SET goes
 * through neigh_var_set(), which additionally sets the slot's bit in
 * data_state.  @val is parenthesised so that any expression passed as the
 * value is assigned as a single unit.
 */
#define NEIGH_VAR_INIT(p, attr, val) (NEIGH_VAR(p, attr) = (val))
#define NEIGH_VAR_SET(p, attr, val) neigh_var_set(p, NEIGH_VAR_ ## attr, val)

/* Set every one of the NEIGH_VAR_DATA_MAX bits in @p->data_state. */
static inline void neigh_parms_data_state_setall(struct neigh_parms *p)
{
        unsigned long *state = p->data_state;

        bitmap_fill(state, NEIGH_VAR_DATA_MAX);
}

/* Clear every one of the NEIGH_VAR_DATA_MAX bits in @p->data_state. */
static inline void neigh_parms_data_state_cleanall(struct neigh_parms *p)
{
        unsigned long *state = p->data_state;

        bitmap_zero(state, NEIGH_VAR_DATA_MAX);
}

/* Event counters for a neighbour table.  neigh_table::stats points at a
 * per-CPU instance of this structure.
 */
struct neigh_statistics {
        unsigned long allocs;                /* number of allocated neighs */
        unsigned long destroys;                /* number of destroyed neighs */
        unsigned long hash_grows;        /* number of hash resizes */

        unsigned long res_failed;        /* number of failed resolutions */

        unsigned long lookups;                /* number of lookups */
        unsigned long hits;                /* number of hits (among lookups) */

        unsigned long rcv_probes_mcast;        /* number of received mcast ipv6 */
        unsigned long rcv_probes_ucast; /* number of received ucast ipv6 */

        unsigned long periodic_gc_runs;        /* number of periodic GC runs */
        unsigned long forced_gc_runs;        /* number of forced GC runs */

        unsigned long unres_discards;        /* number of unresolved drops */
        unsigned long table_fulls;      /* times even gc couldn't help */
};

/* Increment counter @field in the current CPU's copy of @tbl's statistics. */
#define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field)

/* One neighbour cache entry.
 *
 * Usage visible in this header:
 *  - next:        RCU-protected link followed by ___neigh_lookup_noref()
 *  - confirmed:   refreshed with jiffies by neigh_confirm()
 *  - used:        refreshed with jiffies by neigh_event_send_probe()
 *  - refcnt:      taken by neigh_hold()/neigh_clone(), dropped by
 *                 neigh_release(), which calls neigh_destroy() at zero
 *  - nud_state:   tested against the NUD_* masks
 *  - ha/ha_lock:  address bytes, read under the seqlock by
 *                 neigh_ha_snapshot()
 *  - hh, output:  the two transmit paths chosen between by neigh_output()
 *  - primary_key: trailing key bytes compared by the neigh_key_eq*() helpers
 */
struct neighbour {
        struct neighbour __rcu        *next;
        struct neigh_table        *tbl;
        struct neigh_parms        *parms;
        unsigned long                confirmed;
        unsigned long                updated;
        rwlock_t                lock;
        refcount_t                refcnt;
        unsigned int                arp_queue_len_bytes;
        struct sk_buff_head        arp_queue;
        struct timer_list        timer;
        unsigned long                used;
        atomic_t                probes;
        u8                        nud_state;
        u8                        type;
        u8                        dead;
        u8                        protocol;
        u32                        flags;
        seqlock_t                ha_lock;
        unsigned char                ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))] __aligned(8);
        struct hh_cache                hh;
        int                        (*output)(struct neighbour *, struct sk_buff *);
        const struct neigh_ops        *ops;
        struct list_head        gc_list;
        struct list_head        managed_list;
        struct rcu_head                rcu;
        struct net_device        *dev;
        netdevice_tracker        dev_tracker;
        u8                        primary_key[];
} __randomize_layout;

/* Table of callbacks referenced from neighbour::ops.  The two output hooks
 * share the signature of neighbour::output.
 */
struct neigh_ops {
        int                        family;
        void                        (*solicit)(struct neighbour *, struct sk_buff *);
        void                        (*error_report)(struct neighbour *, struct sk_buff *);
        int                        (*output)(struct neighbour *, struct sk_buff *);
        int                        (*connected_output)(struct neighbour *, struct sk_buff *);
};

/* Entry of the table reached through neigh_table::phash_buckets.  Entries
 * are singly linked through ->next and end in a variable-length key.
 */
struct pneigh_entry {
        struct pneigh_entry        *next;
        possible_net_t                net;        /* read through pneigh_net() */
        struct net_device        *dev;
        netdevice_tracker        dev_tracker;
        u32                        flags;
        u8                        protocol;
        u32                        key[];
};

/*
 *        neighbour table manipulation
 */

#define NEIGH_NUM_HASH_RND        4

/* Hash table of neighbour entries, published through neigh_table::nht.
 *
 * ___neigh_lookup_noref() picks a bucket with
 * hash(pkey, dev, hash_rnd) >> (32 - hash_shift) and then walks the
 * RCU-protected chain hanging off hash_buckets[bucket].
 */
struct neigh_hash_table {
        struct neighbour __rcu        **hash_buckets;
        unsigned int                hash_shift;
        __u32                        hash_rnd[NEIGH_NUM_HASH_RND];
        struct rcu_head                rcu;
};


/* Descriptor of one neighbour table.
 *
 * Usage visible in this header:
 *  - family:       returned by neigh_parms_family() via neigh_parms::tbl
 *  - entry_size:   byte offset from a neighbour to its private area, see
 *                  neighbour_priv()
 *  - hash, key_eq: passed to ___neigh_lookup_noref() by
 *                  __neigh_lookup_noref()
 *  - stats:        per-CPU counters bumped with NEIGH_CACHE_STAT_INC()
 *  - nht:          RCU-published hash table of neighbour entries
 *  - phash_buckets: buckets of struct pneigh_entry chains
 */
struct neigh_table {
        int                        family;
        unsigned int                entry_size;
        unsigned int                key_len;
        __be16                        protocol;
        __u32                        (*hash)(const void *pkey,
                                        const struct net_device *dev,
                                        __u32 *hash_rnd);
        bool                        (*key_eq)(const struct neighbour *, const void *pkey);
        int                        (*constructor)(struct neighbour *);
        int                        (*pconstructor)(struct pneigh_entry *);
        void                        (*pdestructor)(struct pneigh_entry *);
        void                        (*proxy_redo)(struct sk_buff *skb);
        int                        (*is_multicast)(const void *pkey);
        bool                        (*allow_add)(const struct net_device *dev,
                                             struct netlink_ext_ack *extack);
        char                        *id;
        struct neigh_parms        parms;
        struct list_head        parms_list;
        int                        gc_interval;
        int                        gc_thresh1;
        int                        gc_thresh2;
        int                        gc_thresh3;
        unsigned long                last_flush;
        struct delayed_work        gc_work;
        struct delayed_work        managed_work;
        struct timer_list         proxy_timer;
        struct sk_buff_head        proxy_queue;
        atomic_t                entries;
        atomic_t                gc_entries;
        struct list_head        gc_list;
        struct list_head        managed_list;
        rwlock_t                lock;
        unsigned long                last_rand;
        struct neigh_statistics        __percpu *stats;
        struct neigh_hash_table __rcu *nht;
        struct pneigh_entry        **phash_buckets;
};

/* Neighbour table indices.  NEIGH_NR_TABLES counts the real tables listed
 * before it; NEIGH_LINK_TABLE reuses that same value.
 * NOTE(review): presumably these are the values passed as the "index"
 * argument of neigh_table_init()/neigh_table_clear() - confirm at callers.
 */
enum {
        NEIGH_ARP_TABLE = 0,
        NEIGH_ND_TABLE = 1,
        NEIGH_DN_TABLE = 2,
        NEIGH_NR_TABLES,
        NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */
};

static inline int neigh_parms_family(struct neigh_parms *p)
{
        return p->tbl->family;
}

#define NEIGH_PRIV_ALIGN        sizeof(long long)
#define NEIGH_ENTRY_SIZE(size)        ALIGN((size), NEIGH_PRIV_ALIGN)

/* Return the address n->tbl->entry_size bytes past the start of @n.
 * The const qualifier of @n is dropped in the returned pointer.
 */
static inline void *neighbour_priv(const struct neighbour *n)
{
        char *base = (char *)n;

        return base + n->tbl->entry_size;
}

/* flags for neigh_update() */
#define NEIGH_UPDATE_F_OVERRIDE                        BIT(0)
#define NEIGH_UPDATE_F_WEAK_OVERRIDE                BIT(1)
#define NEIGH_UPDATE_F_OVERRIDE_ISROUTER        BIT(2)
#define NEIGH_UPDATE_F_USE                        BIT(3)
#define NEIGH_UPDATE_F_MANAGED                        BIT(4)
#define NEIGH_UPDATE_F_EXT_LEARNED                BIT(5)
#define NEIGH_UPDATE_F_ISROUTER                        BIT(6)
#define NEIGH_UPDATE_F_ADMIN                        BIT(7)

/* In-kernel representation for NDA_FLAGS_EXT flags:
 * an extended flag is shifted left by NTF_EXT_SHIFT, which places it above
 * the low eight bits covered by NTF_OLD_MASK.
 */
#define NTF_OLD_MASK                0xff
#define NTF_EXT_SHIFT                8
#define NTF_EXT_MASK                (NTF_EXT_MANAGED)

#define NTF_MANAGED                (NTF_EXT_MANAGED << NTF_EXT_SHIFT)

extern const struct nla_policy nda_policy[];

/* Compare the first 32 bits of @n's primary key with the 32 bits at @pkey. */
static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey)
{
        const u32 *stored = (const u32 *)n->primary_key;
        const u32 *wanted = (const u32 *)pkey;

        return *stored == *wanted;
}

/* Compare the first 128 bits of @n's primary key with the 128 bits at @pkey.
 *
 * All four 32-bit words are always examined: their differences are OR-ed
 * together and tested once, without an early exit.
 */
static inline bool neigh_key_eq128(const struct neighbour *n, const void *pkey)
{
        const u32 *stored = (const u32 *)n->primary_key;
        const u32 *wanted = pkey;
        u32 diff = 0;
        int word;

        for (word = 0; word < 4; word++)
                diff |= stored[word] ^ wanted[word];

        return diff == 0;
}

/* Find the entry of @tbl whose device is @dev and whose key matches @pkey.
 *
 * @key_eq and @hash are taken as explicit arguments instead of being read
 * from @tbl.  The bucket is chosen from the top hash_shift bits of the hash
 * value, and its chain is walked with rcu_dereference().
 *
 * Returns the matching entry, or NULL if there is none.  No reference is
 * taken on the returned entry.
 * NOTE(review): the rcu_dereference() calls imply the caller must be inside
 * an RCU read-side critical section - confirm at call sites.
 */
static inline struct neighbour *___neigh_lookup_noref(
        struct neigh_table *tbl,
        bool (*key_eq)(const struct neighbour *n, const void *pkey),
        __u32 (*hash)(const void *pkey,
                      const struct net_device *dev,
                      __u32 *hash_rnd),
        const void *pkey,
        struct net_device *dev)
{
        struct neigh_hash_table *nht = rcu_dereference(tbl->nht);
        struct neighbour *cur;
        u32 bucket;

        bucket = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);

        cur = rcu_dereference(nht->hash_buckets[bucket]);
        while (cur != NULL) {
                if (cur->dev == dev && key_eq(cur, pkey))
                        return cur;
                cur = rcu_dereference(cur->next);
        }

        return NULL;
}

/* Look up (@pkey, @dev) in @tbl using the table's own key_eq and hash
 * callbacks.  Returns the entry or NULL; see ___neigh_lookup_noref().
 */
static inline struct neighbour *__neigh_lookup_noref(struct neigh_table *tbl,
                                                     const void *pkey,
                                                     struct net_device *dev)
{
        return ___neigh_lookup_noref(tbl, tbl->key_eq, tbl->hash, pkey, dev);
}

/* Stamp @n->confirmed with the current jiffies value.  A NULL @n is accepted
 * and ignored.
 */
static inline void neigh_confirm(struct neighbour *n)
{
        unsigned long stamp;

        if (!n)
                return;

        stamp = jiffies;

        /* avoid dirtying neighbour: store only when the value changes */
        if (READ_ONCE(n->confirmed) == stamp)
                return;

        WRITE_ONCE(n->confirmed, stamp);
}

void neigh_table_init(int index, struct neigh_table *tbl);
int neigh_table_clear(int index, struct neigh_table *tbl);
struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
                               struct net_device *dev);
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
                                 struct net_device *dev, bool want_ref);
/* Convenience wrapper: __neigh_create() with want_ref set to true. */
static inline struct neighbour *neigh_create(struct neigh_table *tbl,
                                             const void *pkey,
                                             struct net_device *dev)
{
        return __neigh_create(tbl, pkey, dev, true);
}
void neigh_destroy(struct neighbour *neigh);
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
                       const bool immediate_ok);
int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags,
                 u32 nlmsg_pid);
void __neigh_set_probe_once(struct neighbour *neigh);
bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl);
void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev);
int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev);
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb);
int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb);
int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb);
struct neighbour *neigh_event_ns(struct neigh_table *tbl,
                                                u8 *lladdr, void *saddr,
                                                struct net_device *dev);

struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
                                      struct neigh_table *tbl);
void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms);

/* Return the struct net stored in @parms->net, as read by read_pnet(). */
static inline
struct net *neigh_parms_net(const struct neigh_parms *parms)
{
        return read_pnet(&parms->net);
}

unsigned long neigh_rand_reach_time(unsigned long base);

void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
                    struct sk_buff *skb);
struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net,
                                   const void *key, struct net_device *dev,
                                   int creat);
struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net,
                                     const void *key, struct net_device *dev);
int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key,
                  struct net_device *dev);

/* Return the struct net stored in @pneigh->net, as read by read_pnet(). */
static inline struct net *pneigh_net(const struct pneigh_entry *pneigh)
{
        return read_pnet(&pneigh->net);
}

void neigh_app_ns(struct neighbour *n);
void neigh_for_each(struct neigh_table *tbl,
                    void (*cb)(struct neighbour *, void *), void *cookie);
void __neigh_for_each_release(struct neigh_table *tbl,
                              int (*cb)(struct neighbour *));
int neigh_xmit(int fam, struct net_device *, const void *, struct sk_buff *);

/* Iteration state that embeds a struct seq_net_private.
 * NOTE(review): presumably the private data of the neigh_seq_start()/
 * neigh_seq_next()/neigh_seq_stop() helpers declared after this structure,
 * with ->flags holding the NEIGH_SEQ_* bits defined inline - confirm in the
 * implementation.
 */
struct neigh_seq_state {
        struct seq_net_private p;
        struct neigh_table *tbl;
        struct neigh_hash_table *nht;
        void *(*neigh_sub_iter)(struct neigh_seq_state *state,
                                struct neighbour *n, loff_t *pos);
        unsigned int bucket;
        unsigned int flags;
#define NEIGH_SEQ_NEIGH_ONLY        0x00000001
#define NEIGH_SEQ_IS_PNEIGH        0x00000002
#define NEIGH_SEQ_SKIP_NOARP        0x00000004
};
void *neigh_seq_start(struct seq_file *, loff_t *, struct neigh_table *,
                      unsigned int);
void *neigh_seq_next(struct seq_file *, void *, loff_t *);
void neigh_seq_stop(struct seq_file *, void *);

int neigh_proc_dointvec(struct ctl_table *ctl, int write,
                        void *buffer, size_t *lenp, loff_t *ppos);
int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write,
                                void *buffer,
                                size_t *lenp, loff_t *ppos);
int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
                                   void *buffer, size_t *lenp, loff_t *ppos);

int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
                          proc_handler *proc_handler);
void neigh_sysctl_unregister(struct neigh_parms *p);

/* Drop one reference on @parms.  This is a bare decrement: the result is not
 * tested and nothing is freed here.
 */
static inline void __neigh_parms_put(struct neigh_parms *parms)
{
        refcount_dec(&parms->refcnt);
}

/* Take an additional reference on @parms and return the same pointer.
 * @parms is dereferenced unconditionally, so it must not be NULL.
 */
static inline struct neigh_parms *neigh_parms_clone(struct neigh_parms *parms)
{
        refcount_inc(&parms->refcnt);
        return parms;
}

/*
 *        Neighbour references
 */

/* Drop one reference on @neigh; when that was the last one, hand the entry
 * to neigh_destroy().  @neigh must not be NULL.
 */
static inline void neigh_release(struct neighbour *neigh)
{
        if (!refcount_dec_and_test(&neigh->refcnt))
                return;

        neigh_destroy(neigh);
}

/* Take an additional reference on @neigh, if it is non-NULL, and return the
 * pointer that was passed in.
 */
static inline struct neighbour *neigh_clone(struct neighbour *neigh)
{
        if (!neigh)
                return neigh;

        refcount_inc(&neigh->refcnt);
        return neigh;
}

#define neigh_hold(n)        refcount_inc(&(n)->refcnt)

/* Record that @neigh is being used now and, unless its state is one of
 * NUD_CONNECTED, NUD_DELAY or NUD_PROBE, pass the request on to
 * __neigh_event_send().
 *
 * Returns 0 when the state test short-circuits, otherwise whatever
 * __neigh_event_send() returns.
 */
static __always_inline int neigh_event_send_probe(struct neighbour *neigh,
                                                  struct sk_buff *skb,
                                                  const bool immediate_ok)
{
        const unsigned long stamp = jiffies;

        /* Skip the store when the timestamp is already current. */
        if (READ_ONCE(neigh->used) != stamp)
                WRITE_ONCE(neigh->used, stamp);

        if (READ_ONCE(neigh->nud_state) & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
                return 0;

        return __neigh_event_send(neigh, skb, immediate_ok);
}

/* neigh_event_send_probe() with immediate_ok fixed to true. */
static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
        return neigh_event_send_probe(neigh, skb, true);
}

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/* Copy part of the cached header in @hh into the headroom of @skb.
 *
 * The destination starts HH_DATA_ALIGN(ETH_HLEN) bytes before skb->data and
 * ETH_ALEN + HH_DATA_ALIGN(ETH_HLEN) - ETH_HLEN bytes are copied.  The copy
 * is repeated for as long as read_seqretry() reports that hh->hh_lock
 * changed meanwhile.  Always returns 0.
 */
static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
{
        const unsigned int hh_alen = HH_DATA_ALIGN(ETH_HLEN);
        const unsigned int copy_len = ETH_ALEN + hh_alen - ETH_HLEN;
        unsigned int seq;

        do {
                seq = read_seqbegin(&hh->hh_lock);
                memcpy(skb->data - hh_alen, hh->hh_data, copy_len);
        } while (read_seqretry(&hh->hh_lock, seq));

        return 0;
}
#endif

/* Prepend the cached header in @hh to @skb and queue it for transmission.
 *
 * Inside a read_seqbegin()/read_seqretry() loop on hh->hh_lock, hh_len is
 * sampled and the cached bytes are copied into the headroom just in front
 * of skb->data: HH_DATA_MOD bytes when hh_len <= HH_DATA_MOD, otherwise
 * HH_DATA_ALIGN(hh_len) bytes.  The copy is skipped when the headroom is
 * too small for it.
 *
 * Returns NET_XMIT_DROP, after a one-time warning and freeing @skb, when
 * the headroom is smaller than the aligned length; otherwise returns the
 * result of dev_queue_xmit().
 */
static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
{
        unsigned int hh_alen = 0;
        unsigned int seq;
        unsigned int hh_len;

        do {
                seq = read_seqbegin(&hh->hh_lock);
                hh_len = READ_ONCE(hh->hh_len);
                if (likely(hh_len <= HH_DATA_MOD)) {
                        hh_alen = HH_DATA_MOD;

                        /* skb_push() would proceed silently if we have room for
                         * the unaligned size but not for the aligned size:
                         * check headroom explicitly.
                         */
                        if (likely(skb_headroom(skb) >= HH_DATA_MOD)) {
                                /* this is inlined by gcc */
                                memcpy(skb->data - HH_DATA_MOD, hh->hh_data,
                                       HH_DATA_MOD);
                        }
                } else {
                        hh_alen = HH_DATA_ALIGN(hh_len);

                        if (likely(skb_headroom(skb) >= hh_alen)) {
                                memcpy(skb->data - hh_alen, hh->hh_data,
                                       hh_alen);
                        }
                }
        } while (read_seqretry(&hh->hh_lock, seq));

        /* The copy above was skipped in exactly this case: drop the packet. */
        if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) {
                kfree_skb(skb);
                return NET_XMIT_DROP;
        }

        /* Only the unaligned hh_len bytes become part of the packet. */
        __skb_push(skb, hh_len);
        return dev_queue_xmit(skb);
}

/* Transmit @skb to neighbour @n.
 *
 * The cached-header path, neigh_hh_output(), is taken only when
 * @skip_cache is false, n->nud_state has a NUD_CONNECTED bit set and
 * hh->hh_len is non-zero.  In every other case the neighbour's ->output
 * callback is invoked.
 */
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb,
                               bool skip_cache)
{
        const struct hh_cache *hh = &n->hh;
        bool use_cache;

        /* n->nud_state and hh->hh_len could be changed under us.
         * neigh_hh_output() is taking care of the race later.
         */
        use_cache = !skip_cache &&
                    (READ_ONCE(n->nud_state) & NUD_CONNECTED) &&
                    READ_ONCE(hh->hh_len);

        if (!use_cache)
                return READ_ONCE(n->output)(n, skb);

        return neigh_hh_output(hh, skb);
}

/* Look up (@pkey, @dev) in @tbl and, when @creat is non-zero, create the
 * entry if the lookup finds nothing.
 *
 * Returns the entry, or NULL when it was not found and not requested, or
 * when neigh_create() returned an error pointer.
 */
static inline struct neighbour *
__neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat)
{
        struct neighbour *found = neigh_lookup(tbl, pkey, dev);

        if (found)
                return found;
        if (!creat)
                return NULL;

        found = neigh_create(tbl, pkey, dev);
        if (IS_ERR(found))
                return NULL;

        return found;
}

/* Look up (@pkey, @dev) in @tbl, creating the entry when it is missing.
 *
 * Unlike __neigh_lookup(), the result of neigh_create() is returned
 * unchanged, so callers must check it with IS_ERR().
 */
static inline struct neighbour *
__neigh_lookup_errno(struct neigh_table *tbl, const void *pkey,
                     struct net_device *dev)
{
        struct neighbour *found = neigh_lookup(tbl, pkey, dev);

        return found ? found : neigh_create(tbl, pkey, dev);
}

/* Layout that NEIGH_CB() imposes on an skb's ->cb area. */
struct neighbour_cb {
        unsigned long sched_next;
        unsigned int flags;
};

/* NOTE(review): presumably a bit for neighbour_cb::flags - confirm at users. */
#define LOCALLY_ENQUEUED 0x1

/* View the control buffer of @skb as a struct neighbour_cb. */
#define NEIGH_CB(skb)        ((struct neighbour_cb *)(skb)->cb)

/* Copy dev->addr_len bytes of @n->ha into @dst.
 *
 * The copy runs inside a read_seqbegin()/read_seqretry() loop on
 * n->ha_lock and is repeated whenever the sequence changed meanwhile.
 * @dst must have room for dev->addr_len bytes.
 */
static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
                                     const struct net_device *dev)
{
        unsigned int seq;

        do {
                seq = read_seqbegin(&n->ha_lock);
                memcpy(dst, n->ha, dev->addr_len);
        } while (read_seqretry(&n->ha_lock, seq));
}

/* Make the NTF_ROUTER bit of @neigh->flags agree with whether
 * NEIGH_UPDATE_F_ISROUTER is set in @flags.
 *
 * *@notify is set to 1 only when the bit actually changed; otherwise it is
 * left untouched.
 */
static inline void neigh_update_is_router(struct neighbour *neigh, u32 flags,
                                          int *notify)
{
        const u8 wanted = (flags & NEIGH_UPDATE_F_ISROUTER) ? NTF_ROUTER : 0;

        /* Nothing to do when the router bit already has the wanted value. */
        if (!((neigh->flags ^ wanted) & NTF_ROUTER))
                return;

        if (wanted & NTF_ROUTER)
                neigh->flags |= NTF_ROUTER;
        else
                neigh->flags &= ~NTF_ROUTER;

        *notify = 1;
}
#endif
























    7 
































    6 


















    3 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM skb

#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SKB_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/tracepoint.h>

#undef FN
#define FN(reason)        TRACE_DEFINE_ENUM(SKB_DROP_REASON_##reason);
DEFINE_DROP_REASON(FN, FN)

#undef FN
#undef FNe
#define FN(reason)        { SKB_DROP_REASON_##reason, #reason },
#define FNe(reason)        { SKB_DROP_REASON_##reason, #reason }

/*
 * Tracepoint for freeing an sk_buff.
 *
 * Records the skb address, the caller-supplied location and rx_sk pointers,
 * skb->protocol converted with ntohs(), and the drop reason.  The reason is
 * printed by name through __print_symbolic().
 */
TRACE_EVENT(kfree_skb,

        TP_PROTO(struct sk_buff *skb, void *location,
                 enum skb_drop_reason reason, struct sock *rx_sk),

        TP_ARGS(skb, location, reason, rx_sk),

        TP_STRUCT__entry(
                __field(void *,                skbaddr)
                __field(void *,                location)
                __field(void *,                rx_sk)
                __field(unsigned short,        protocol)
                __field(enum skb_drop_reason,        reason)
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->location = location;
                __entry->rx_sk = rx_sk;
                __entry->protocol = ntohs(skb->protocol);
                __entry->reason = reason;
        ),

        TP_printk("skbaddr=%p rx_sk=%p protocol=%u location=%pS reason: %s",
                  __entry->skbaddr, __entry->rx_sk, __entry->protocol,
                  __entry->location,
                  __print_symbolic(__entry->reason,
                                   DEFINE_DROP_REASON(FN, FNe)))
);

#undef FN
#undef FNe

/*
 * Tracepoint consume_skb: records the skb address and the caller-supplied
 * location pointer, which is printed as a symbol (%pS).
 */
TRACE_EVENT(consume_skb,

        TP_PROTO(struct sk_buff *skb, void *location),

        TP_ARGS(skb, location),

        TP_STRUCT__entry(
                __field(        void *,        skbaddr)
                __field(        void *,        location)
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->location = location;
        ),

        TP_printk("skbaddr=%p location=%pS", __entry->skbaddr, __entry->location)
);

/*
 * Tracepoint skb_copy_datagram_iovec: records the skb address and the
 * length value passed in by the caller.
 */
TRACE_EVENT(skb_copy_datagram_iovec,

        TP_PROTO(const struct sk_buff *skb, int len),

        TP_ARGS(skb, len),

        TP_STRUCT__entry(
                __field(        const void *,                skbaddr                )
                __field(        int,                        len                )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->len = len;
        ),

        TP_printk("skbaddr=%p len=%d", __entry->skbaddr, __entry->len)
);

#endif /* _TRACE_SKB_H */

/* This part must be outside protection */
#include <trace/define_trace.h>













































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_MROUTE6_H
#define __LINUX_MROUTE6_H


#include <linux/pim.h>
#include <linux/skbuff.h>        /* for struct sk_buff_head */
#include <net/net_namespace.h>
#include <uapi/linux/mroute6.h>
#include <linux/mroute_base.h>
#include <linux/sockptr.h>
#include <net/fib_rules.h>

#ifdef CONFIG_IPV6_MROUTE
/* Return 1 when @opt lies in the inclusive range MRT6_BASE..MRT6_MAX,
 * 0 otherwise.
 */
static inline int ip6_mroute_opt(int opt)
{
        if (opt < MRT6_BASE)
                return 0;

        return opt <= MRT6_MAX;
}
#else
/* Without CONFIG_IPV6_MROUTE no option value is recognised: always 0. */
static inline int ip6_mroute_opt(int opt)
{
        (void)opt;
        return 0;
}
#endif

struct sock;

#ifdef CONFIG_IPV6_MROUTE
/* With CONFIG_IPV6_MROUTE these are only declared here and defined
 * elsewhere.
 */
int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval,
                          unsigned int optlen);
int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
                          sockptr_t optlen);
int ip6_mr_input(struct sk_buff *skb);
int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
int ip6_mr_init(void);
void ip6_mr_cleanup(void);
int ip6mr_ioctl(struct sock *sk, int cmd, void *arg);
#else
/* Stubs used when CONFIG_IPV6_MROUTE is not set: the socket-option calls
 * return -ENOPROTOOPT, the ioctl returns -ENOIOCTLCMD, init returns 0 and
 * cleanup does nothing.
 */
static inline int ip6_mroute_setsockopt(struct sock *sock, int optname,
                                        sockptr_t optval, unsigned int optlen)
{
        return -ENOPROTOOPT;
}

static inline int ip6_mroute_getsockopt(struct sock *sock, int optname,
                                        sockptr_t optval, sockptr_t optlen)
{
        return -ENOPROTOOPT;
}

static inline int ip6mr_ioctl(struct sock *sk, int cmd, void *arg)
{
        return -ENOIOCTLCMD;
}

static inline int ip6_mr_init(void)
{
        return 0;
}

static inline void ip6_mr_cleanup(void)
{
}
#endif

/* ip6mr_rule_default() is defined elsewhere when
 * CONFIG_IPV6_MROUTE_MULTIPLE_TABLES is set; without that option the inline
 * stub below returns true for every rule.
 */
#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
bool ip6mr_rule_default(const struct fib_rule *rule);
#else
static inline bool ip6mr_rule_default(const struct fib_rule *rule)
{
        return true;
}
#endif

#define VIFF_STATIC 0x8000

/* Pair of IPv6 addresses (multicast group, origin).  struct mfc6_cache
 * overlays its own two address fields with one of these.
 */
struct mfc6_cache_cmp_arg {
        struct in6_addr mf6c_mcastgrp;
        struct in6_addr mf6c_origin;
};

/* IPv6 specialisation of struct mr_mfc, embedded as the first member _c.
 *
 * The anonymous union lets the group and origin addresses be accessed
 * either as separate fields or together as ->cmparg; both views declare the
 * same two members in the same order.
 */
struct mfc6_cache {
        struct mr_mfc _c;
        union {
                struct {
                        struct in6_addr mf6c_mcastgrp;
                        struct in6_addr mf6c_origin;
                };
                struct mfc6_cache_cmp_arg cmparg;
        };
};

#define MFC_ASSERT_THRESH (3*HZ)                /* Maximal freq. of asserts */

struct rtmsg;
extern int ip6mr_get_route(struct net *net, struct sk_buff *skb,
                           struct rtmsg *rtm, u32 portid);

#ifdef CONFIG_IPV6_MROUTE
bool mroute6_is_socket(struct net *net, struct sk_buff *skb);
extern int ip6mr_sk_done(struct sock *sk);
/* Handle the two multicast-routing counter ioctls by passing a stack
 * request buffer of the command's own type and size to sock_ioctl_inout().
 *
 * Returns the result of sock_ioctl_inout() for SIOCGETMIFCNT_IN6 and
 * SIOCGETSGCNT_IN6, and 1 for any other @cmd.
 */
static inline int ip6mr_sk_ioctl(struct sock *sk, unsigned int cmd,
                                 void __user *arg)
{
        /* These userspace buffers will be consumed by ip6mr_ioctl() */
        if (cmd == SIOCGETMIFCNT_IN6) {
                struct sioc_mif_req6 mif_req;

                return sock_ioctl_inout(sk, cmd, arg, &mif_req,
                                        sizeof(mif_req));
        }

        if (cmd == SIOCGETSGCNT_IN6) {
                struct sioc_sg_req6 sg_req;

                return sock_ioctl_inout(sk, cmd, arg, &sg_req,
                                        sizeof(sg_req));
        }

        return 1;
}
#else
/* Stubs used when CONFIG_IPV6_MROUTE is not set. */

/* Always false in this configuration. */
static inline bool mroute6_is_socket(struct net *net, struct sk_buff *skb)
{
        return false;
}
/* Nothing to do in this configuration; always returns 0. */
static inline int ip6mr_sk_done(struct sock *sk)
{
        return 0;
}

/* Returns 1 for every @cmd, the same value the CONFIG_IPV6_MROUTE variant
 * returns for commands it does not handle.
 */
static inline int ip6mr_sk_ioctl(struct sock *sk, unsigned int cmd,
                                 void __user *arg)
{
        return 1;
}
#endif
#endif

























































































































































































































































































































































































































































































































































































































































































































































































    1 
































































































































































































































































































































    1 





    1 










    1 





    1 



    1 




























































    1 




    1 




























    1 




















    1 
















































    1 


    1 



























































    1 

























    1 





































    1 



































































































































































































































    1 




































    1 









    1 
















    1 












    1 





    1 



    1 




















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
 * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content
 *
 * Copyright (c) 2002-2017 Volkswagen Group Electronic Research
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of Volkswagen nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * Alternatively, provided that this notice is retained in full, this
 * software may be distributed under the terms of the GNU General
 * Public License ("GPL") version 2, in which case the provisions of the
 * GPL apply INSTEAD OF those given above.
 *
 * The provided data structures and external interfaces from this code
 * are not restricted to be used by modules with a GPL compatible license.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/hrtimer.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/uio.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <linux/can.h>
#include <linux/can/core.h>
#include <linux/can/skb.h>
#include <linux/can/bcm.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/net_namespace.h>

/*
 * To send multiple CAN frame content within TX_SETUP or to filter
 * CAN messages with multiplex index within RX_SETUP, the number of
 * different filters is limited to 256 due to the one byte index value.
 */
#define MAX_NFRAMES 256

/*
 * Limit timers to 400 days (expressed in seconds) for sending/timeouts.
 * bcm_is_invalid_tv() rejects tv_sec values above this bound.
 */
#define BCM_TIMER_SEC_MAX (400 * 24 * 60 * 60)

/*
 * Private use of last_frames[index].flags.
 *
 * These bits live in the upper nibble of the frame's flags byte.
 * bcm_send_to_user() translates RX_LOCAL / RX_OWN into message flags and
 * strips all private bits with BCM_CAN_FLAGS_MASK before a single-frame
 * update is handed to userspace.
 */
#define RX_LOCAL   0x10 /* frame was created on the local host */
#define RX_OWN     0x20 /* frame was sent via the socket it was received on */
#define RX_RECV    0x40 /* data has been received for this element */
#define RX_THR     0x80 /* element has not been sent yet due to throttling */
#define BCM_CAN_FLAGS_MASK 0x0F /* keeps the CAN flags, clears private flags */

/*
 * REGMASK - get the best masking value for can_rx_register() for a given
 *           single can_id
 *
 * Yields the extended-frame mask when @id has CAN_EFF_FLAG set and the
 * standard-frame mask otherwise; both variants also match on the EFF and
 * RTR flag bits.
 *
 * The argument is parenthesized so that expressions with operators of
 * lower precedence than '&' (e.g. "a | b") are evaluated as a whole.
 */
#define REGMASK(id) (((id) & CAN_EFF_FLAG) ? \
                     (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \
                     (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG))

/* module metadata */
MODULE_DESCRIPTION("PF_CAN broadcast manager protocol");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");
/* NOTE(review): presumably the alias used to autoload this module for
 * CAN protocol number 2 - confirm against the CAN core.
 */
MODULE_ALIAS("can-proto-2");

/* NOTE(review): presumably the minimum address length that still covers
 * the can_ifindex member of struct sockaddr_can - confirm against the
 * definition of CAN_REQUIRED_SIZE().
 */
#define BCM_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_ifindex)

/*
 * get_u64 - read one 64 bit word from the payload of a can(fd)_frame
 * @cp:     frame to read from
 * @offset: byte offset into cp->data
 *
 * cp->data is 64 bit aligned, so @offset has to be a multiple of 8. This
 * is ensured by the only callers, bcm_rx_cmp_to_index() and
 * bcm_rx_handler().
 */
static inline u64 get_u64(const struct canfd_frame *cp, int offset)
{
        const u64 *word = (const u64 *)&cp->data[offset];

        return *word;
}

/*
 * struct bcm_op - one broadcast manager operation (rx filter or tx job)
 *
 * An op is linked into either the rx_ops or the tx_ops list of its
 * owning struct bcm_sock.
 */
struct bcm_op {
        /* linkage in the owning socket's rx_ops / tx_ops list */
        struct list_head list;
        /* used by bcm_remove_op() to free the op via call_rcu() */
        struct rcu_head rcu;
        /* interface index of the op; 0 means no specific device
         * (bcm_can_tx() does not send, procfs prints "any")
         */
        int ifindex;
        /* CAN identifier this op sends / filters for */
        canid_t can_id;
        /* BCM flags of the op (e.g. TX_COUNTEVT, RX_FILTER_ID, CAN_FD_FRAME) */
        u32 flags;
        /* statistics: frames_abs counts frames received by the rx handler
         * or successfully sent by bcm_can_tx(); frames_filtered counts the
         * RX_CHANGED notifications created by bcm_rx_changed()
         */
        unsigned long frames_abs, frames_filtered;
        /* intervals as bcm_timeval, echoed back in message heads */
        struct bcm_timeval ival1, ival2;
        /* timer: tx cycle timer resp. rx timeout timer;
         * thrtimer: rx throttle timer
         */
        struct hrtimer timer, thrtimer;
        /* rx_stamp: timestamp of the last received skb;
         * kt_ival1 / kt_ival2: intervals as ktime (0 = not set);
         * kt_lastmsg: time of the last RX_CHANGED sent in throttle mode,
         * reset to 0 by bcm_rx_thr_handler() to re-arm throttling
         */
        ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
        /* ifindex of the device the last frame was received on */
        int rx_ifindex;
        /* size in bytes of one element in frames[] / last_frames[] */
        int cfsiz;
        /* remaining number of transmissions with interval kt_ival1 */
        u32 count;
        /* number of elements in frames[] / last_frames[] */
        u32 nframes;
        /* index of the next element of frames[] sent by bcm_can_tx() */
        u32 currframe;
        /* void pointers to arrays of struct can[fd]_frame */
        void *frames;
        void *last_frames;
        /* embedded single-frame storage: bcm_free_op_rcu() does not kfree()
         * frames / last_frames when they point here
         */
        struct canfd_frame sframe;
        struct canfd_frame last_sframe;
        /* socket this op belongs to */
        struct sock *sk;
        /* device the rx handler is registered on; set to NULL by
         * bcm_rx_unreg() once the subscription is removed
         */
        struct net_device *rx_reg_dev;
};

/*
 * struct bcm_sock - per socket state of the broadcast manager
 */
struct bcm_sock {
        /* has to be the first member: bcm_sk() casts a struct sock pointer
         * directly to a struct bcm_sock pointer
         */
        struct sock sk;
        /* NOTE(review): presumably non-zero once the socket is bound/connected
         * - the users of this member are not part of this section
         */
        int bound;
        /* interface index shown as "bound" in the procfs output */
        int ifindex;
        /* NOTE(review): presumably the linkage in bcm_notifier_list - confirm */
        struct list_head notifier;
        /* lists of struct bcm_op (linked via bcm_op.list) */
        struct list_head rx_ops;
        struct list_head tx_ops;
        /* messages that could not be queued to the socket receive queue */
        unsigned long dropped_usr_msgs;
        /* NOTE(review): presumably the procfs entry of this socket - confirm */
        struct proc_dir_entry *bcm_proc_read;
        char procname [32]; /* inode number in decimal with \0 */
};

/*
 * Global notifier bookkeeping.
 * NOTE(review): the users of these objects are outside this section of the
 * file; presumably bcm_notifier_lock protects bcm_notifier_list and
 * bcm_busy_notifier - confirm against the notifier code.
 */
static LIST_HEAD(bcm_notifier_list);
static DEFINE_SPINLOCK(bcm_notifier_lock);
static struct bcm_sock *bcm_busy_notifier;

/* Return pointer to store the extra msg flags for bcm_recvmsg().
 * We use the space of one unsigned int beyond the 'struct sockaddr_can'
 * in skb->cb.
 */
static inline unsigned int *bcm_flags(struct sk_buff *skb)
{
        /* return pointer after struct sockaddr_can */
        return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]);
}

/* Convert a struct sock pointer to the struct bcm_sock that starts with it. */
static inline struct bcm_sock *bcm_sk(const struct sock *sk)
{
        struct bcm_sock *bo = (struct bcm_sock *)sk;

        return bo;
}

static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv)
{
        return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC);
}

/* check limitations for timeval provided by user */
static bool bcm_is_invalid_tv(struct bcm_msg_head *msg_head)
{
        if ((msg_head->ival1.tv_sec < 0) ||
            (msg_head->ival1.tv_sec > BCM_TIMER_SEC_MAX) ||
            (msg_head->ival1.tv_usec < 0) ||
            (msg_head->ival1.tv_usec >= USEC_PER_SEC) ||
            (msg_head->ival2.tv_sec < 0) ||
            (msg_head->ival2.tv_sec > BCM_TIMER_SEC_MAX) ||
            (msg_head->ival2.tv_usec < 0) ||
            (msg_head->ival2.tv_usec >= USEC_PER_SEC))
                return true;

        return false;
}

/*
 * CFSIZ - size of one frame element for the given BCM message flags:
 *         CANFD_MTU when CAN_FD_FRAME is set, CAN_MTU otherwise.
 *         The argument is parenthesized so that expressions like "a | b"
 *         are tested as a whole.
 * OPSIZ - size of a struct bcm_op
 * MHSIZ - size of a struct bcm_msg_head
 */
#define CFSIZ(flags) (((flags) & CAN_FD_FRAME) ? CANFD_MTU : CAN_MTU)
#define OPSIZ sizeof(struct bcm_op)
#define MHSIZ sizeof(struct bcm_msg_head)

/*
 * procfs functions
 */
#if IS_ENABLED(CONFIG_PROC_FS)
/*
 * bcm_proc_getifname - get a printable interface name for the procfs output
 * @net:     network namespace to look up @ifindex in
 * @result:  caller provided buffer that receives the name
 * @ifindex: interface index to resolve
 *
 * Returns the constant string "any" for ifindex 0 (@result is not written
 * in that case). Otherwise @result is returned, holding the device name or
 * "???" when no device with that index exists.
 */
static char *bcm_proc_getifname(struct net *net, char *result, int ifindex)
{
        struct net_device *dev;

        if (ifindex == 0)
                return "any";

        /* the name has to be copied while still under rcu_read_lock() */
        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        strcpy(result, dev ? dev->name : "???");
        rcu_read_unlock();

        return result;
}

/*
 * bcm_proc_show - print the state of one BCM socket into its procfs file
 * @m: seq_file to print to; m->private holds the network namespace and the
 *     proc entry data of the file's inode holds the socket
 * @v: unused
 *
 * Prints one header line for the socket followed by one line per rx op
 * (only ops that have already seen frames) and one line per tx op.
 *
 * The ops are released via call_rcu() in bcm_remove_op(), therefore the
 * lists are walked inside an RCU read side critical section so that an op
 * cannot be freed while it is printed.
 * NOTE(review): the list writers are outside this function; they have to
 * use the _rcu list primitives for this traversal to be fully safe -
 * confirm.
 *
 * The statistics counters are unsigned long and printed with %lu.
 *
 * Always returns 0.
 */
static int bcm_proc_show(struct seq_file *m, void *v)
{
        char ifname[IFNAMSIZ];
        struct net *net = m->private;
        struct sock *sk = (struct sock *)pde_data(m->file->f_inode);
        struct bcm_sock *bo = bcm_sk(sk);
        struct bcm_op *op;

        seq_printf(m, ">>> socket %pK", sk->sk_socket);
        seq_printf(m, " / sk %pK", sk);
        seq_printf(m, " / bo %pK", bo);
        seq_printf(m, " / dropped %lu", bo->dropped_usr_msgs);
        seq_printf(m, " / bound %s", bcm_proc_getifname(net, ifname, bo->ifindex));
        seq_puts(m, " <<<\n");

        rcu_read_lock();

        list_for_each_entry_rcu(op, &bo->rx_ops, list) {

                unsigned long reduction;

                /* print only active entries & prevent division by zero */
                if (!op->frames_abs)
                        continue;

                seq_printf(m, "rx_op: %03X %-5s ", op->can_id,
                           bcm_proc_getifname(net, ifname, op->ifindex));

                /* "(n)" marks CAN FD ops, "[n]" classic CAN ops */
                if (op->flags & CAN_FD_FRAME)
                        seq_printf(m, "(%u)", op->nframes);
                else
                        seq_printf(m, "[%u]", op->nframes);

                seq_printf(m, "%c ", (op->flags & RX_CHECK_DLC) ? 'd' : ' ');

                if (op->kt_ival1)
                        seq_printf(m, "timeo=%lld ",
                                   (long long)ktime_to_us(op->kt_ival1));

                if (op->kt_ival2)
                        seq_printf(m, "thr=%lld ",
                                   (long long)ktime_to_us(op->kt_ival2));

                seq_printf(m, "# recv %lu (%lu) => reduction: ",
                           op->frames_filtered, op->frames_abs);

                /* percentage of received frames not passed on to the user */
                reduction = 100 - (op->frames_filtered * 100) / op->frames_abs;

                seq_printf(m, "%s%lu%%\n",
                           (reduction == 100) ? "near " : "", reduction);
        }

        list_for_each_entry_rcu(op, &bo->tx_ops, list) {

                seq_printf(m, "tx_op: %03X %s ", op->can_id,
                           bcm_proc_getifname(net, ifname, op->ifindex));

                if (op->flags & CAN_FD_FRAME)
                        seq_printf(m, "(%u) ", op->nframes);
                else
                        seq_printf(m, "[%u] ", op->nframes);

                if (op->kt_ival1)
                        seq_printf(m, "t1=%lld ",
                                   (long long)ktime_to_us(op->kt_ival1));

                if (op->kt_ival2)
                        seq_printf(m, "t2=%lld ",
                                   (long long)ktime_to_us(op->kt_ival2));

                seq_printf(m, "# sent %lu\n", op->frames_abs);
        }

        rcu_read_unlock();

        seq_putc(m, '\n');
        return 0;
}
#endif /* CONFIG_PROC_FS */

/*
 * bcm_can_tx - send the (next) CAN frame of the given bcm tx op to the
 *              CAN interface the op refers to
 *
 * Nothing is sent when the op has no interface index or the device can
 * not be found. When no skb can be allocated the current frame stays the
 * next one to be sent. Otherwise op->currframe advances (wrapping to 0
 * behind the last frame) and op->frames_abs is increased if can_send()
 * reported success.
 */
static void bcm_can_tx(struct bcm_op *op)
{
        struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe;
        struct net_device *dev;
        struct sk_buff *skb;

        /* no target device? => exit */
        if (op->ifindex == 0)
                return;

        dev = dev_get_by_index(sock_net(op->sk), op->ifindex);
        if (!dev) {
                /* RFC: should this bcm_op remove itself here? */
                return;
        }

        skb = alloc_skb(op->cfsiz + sizeof(struct can_skb_priv), gfp_any());
        if (skb) {
                can_skb_reserve(skb);
                can_skb_prv(skb)->ifindex = dev->ifindex;
                can_skb_prv(skb)->skbcnt = 0;

                skb_put_data(skb, cf, op->cfsiz);

                /* send with loopback */
                skb->dev = dev;
                can_skb_set_owner(skb, op->sk);
                if (can_send(skb, 1) == 0)
                        op->frames_abs++;

                /* select the next frame, start over behind the last one */
                op->currframe++;
                if (op->currframe >= op->nframes)
                        op->currframe = 0;
        }

        dev_put(dev);
}

/*
 * bcm_send_to_user - send a BCM message to the userspace
 *                    (consisting of bcm_msg_head + x CAN frames)
 */
static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
                             struct canfd_frame *frames, int has_timestamp)
{
        struct sk_buff *skb;
        struct canfd_frame *firstframe;
        struct sockaddr_can *addr;
        struct sock *sk = op->sk;
        unsigned int datalen = head->nframes * op->cfsiz;
        int err;
        unsigned int *pflags;

        skb = alloc_skb(sizeof(*head) + datalen, gfp_any());
        if (!skb)
                return;

        skb_put_data(skb, head, sizeof(*head));

        /* ensure space for sockaddr_can and msg flags */
        sock_skb_cb_check_size(sizeof(struct sockaddr_can) +
                               sizeof(unsigned int));

        /* initialize msg flags */
        pflags = bcm_flags(skb);
        *pflags = 0;

        if (head->nframes) {
                /* CAN frames starting here */
                firstframe = (struct canfd_frame *)skb_tail_pointer(skb);

                skb_put_data(skb, frames, datalen);

                /*
                 * the BCM uses the flags-element of the canfd_frame
                 * structure for internal purposes. This is only
                 * relevant for updates that are generated by the
                 * BCM, where nframes is 1
                 */
                if (head->nframes == 1) {
                        if (firstframe->flags & RX_LOCAL)
                                *pflags |= MSG_DONTROUTE;
                        if (firstframe->flags & RX_OWN)
                                *pflags |= MSG_CONFIRM;

                        firstframe->flags &= BCM_CAN_FLAGS_MASK;
                }
        }

        if (has_timestamp) {
                /* restore rx timestamp */
                skb->tstamp = op->rx_stamp;
        }

        /*
         *  Put the datagram to the queue so that bcm_recvmsg() can
         *  get it from there.  We need to pass the interface index to
         *  bcm_recvmsg().  We pass a whole struct sockaddr_can in skb->cb
         *  containing the interface index.
         */

        addr = (struct sockaddr_can *)skb->cb;
        memset(addr, 0, sizeof(*addr));
        addr->can_family  = AF_CAN;
        addr->can_ifindex = op->rx_ifindex;

        err = sock_queue_rcv_skb(sk, skb);
        if (err < 0) {
                struct bcm_sock *bo = bcm_sk(sk);

                kfree_skb(skb);
                /* don't care about overflows in this statistic */
                bo->dropped_usr_msgs++;
        }
}

/*
 * bcm_tx_set_expiry - set the next expiry time of a tx timer
 * @op:  tx op providing the intervals and the remaining count
 * @hrt: timer that gets the new expiry time
 *
 * kt_ival1 is used as long as it is set and op->count is not zero,
 * otherwise kt_ival2. The expiry is set relative to the current time.
 *
 * Returns false (leaving @hrt untouched) when no interval applies.
 */
static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt)
{
        const bool use_ival1 = op->kt_ival1 && op->count;
        const ktime_t ival = use_ival1 ? op->kt_ival1 : op->kt_ival2;

        if (!ival)
                return false;

        hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival));
        return true;
}

/* Arm the tx timer of @op if bcm_tx_set_expiry() found an interval to use. */
static void bcm_tx_start_timer(struct bcm_op *op)
{
        if (!bcm_tx_set_expiry(op, &op->timer))
                return;

        hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT);
}

/* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */
static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
{
        struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
        struct bcm_msg_head msg_head;

        if (op->kt_ival1 && (op->count > 0)) {
                op->count--;
                if (!op->count && (op->flags & TX_COUNTEVT)) {

                        /* create notification to user */
                        memset(&msg_head, 0, sizeof(msg_head));
                        msg_head.opcode  = TX_EXPIRED;
                        msg_head.flags   = op->flags;
                        msg_head.count   = op->count;
                        msg_head.ival1   = op->ival1;
                        msg_head.ival2   = op->ival2;
                        msg_head.can_id  = op->can_id;
                        msg_head.nframes = 0;

                        bcm_send_to_user(op, &msg_head, NULL, 0);
                }
                bcm_can_tx(op);

        } else if (op->kt_ival2) {
                bcm_can_tx(op);
        }

        return bcm_tx_set_expiry(op, &op->timer) ?
                HRTIMER_RESTART : HRTIMER_NORESTART;
}

/*
 * bcm_rx_changed - create a RX_CHANGED notification due to changed content
 * @op:   rx op the notification is created for
 * @data: frame element that is sent to the user; its RX_THR flag is cleared
 */
static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data)
{
        struct bcm_msg_head head;

        /* update statistics; reset both counters to prevent an overflow */
        if (++op->frames_filtered > ULONG_MAX/100) {
                op->frames_abs = 0;
                op->frames_filtered = 0;
        }

        /* this element is not throttled anymore */
        data->flags &= ~RX_THR;

        /* the head is zeroed as a whole before the fields are filled in */
        memset(&head, 0, sizeof(head));
        head.opcode  = RX_CHANGED;
        head.can_id  = op->can_id;
        head.flags   = op->flags;
        head.count   = op->count;
        head.ival1   = op->ival1;
        head.ival2   = op->ival2;
        head.nframes = 1;

        bcm_send_to_user(op, &head, data, 1);
}

/*
 * bcm_rx_update_and_send - process a detected relevant receive content change
 *                          1. update the last received data
 *                          2. send a notification to the user (if possible)
 * @op:            rx op the frame was received for
 * @lastdata:      element of the last received data to be updated
 * @rxdata:        the received frame
 * @traffic_flags: RX_LOCAL / RX_OWN flags stored with the element
 *
 * Without throttling (kt_ival2 not set) the update is sent immediately.
 * With throttling the update stays marked as RX_THR while the throttle
 * timer is running or is started here; it is only sent directly on the
 * first reception or when the gap to the last sent message is at least
 * kt_ival2.
 */
static void bcm_rx_update_and_send(struct bcm_op *op,
                                   struct canfd_frame *lastdata,
                                   const struct canfd_frame *rxdata,
                                   unsigned char traffic_flags)
{
        memcpy(lastdata, rxdata, op->cfsiz);

        /* mark as used and throttled by default and
         * add own/local/remote traffic flags
         */
        lastdata->flags |= RX_RECV | RX_THR | traffic_flags;

        /* throttling mode inactive ? */
        if (!op->kt_ival2) {
                /* send RX_CHANGED to the user immediately */
                bcm_rx_changed(op, lastdata);
                return;
        }

        /* with active throttling timer we are just done here */
        if (hrtimer_active(&op->thrtimer))
                return;

        /* not the first reception with enabled throttling mode and
         * got a second frame inside a potential throttle period?
         */
        if (op->kt_lastmsg &&
            ktime_us_delta(ktime_get(), op->kt_lastmsg) <
            ktime_to_us(op->kt_ival2)) {
                /* do not send the saved data - only start throttle timer */
                hrtimer_start(&op->thrtimer,
                              ktime_add(op->kt_lastmsg, op->kt_ival2),
                              HRTIMER_MODE_ABS_SOFT);
                return;
        }

        /* first reception, or the gap was that big that throttling was
         * not needed here
         */
        bcm_rx_changed(op, lastdata);
        op->kt_lastmsg = ktime_get();
}

/*
 * bcm_rx_cmp_to_index - (bit)compares the currently received data to formerly
 *                       received data stored in op->last_frames[]
 */
static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index,
                                const struct canfd_frame *rxdata,
                                unsigned char traffic_flags)
{
        struct canfd_frame *cf = op->frames + op->cfsiz * index;
        struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
        int i;

        /*
         * no one uses the MSBs of flags for comparison,
         * so we use it here to detect the first time of reception
         */

        if (!(lcf->flags & RX_RECV)) {
                /* received data for the first time => send update to user */
                bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags);
                return;
        }

        /* do a real check in CAN frame data section */
        for (i = 0; i < rxdata->len; i += 8) {
                if ((get_u64(cf, i) & get_u64(rxdata, i)) !=
                    (get_u64(cf, i) & get_u64(lcf, i))) {
                        bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags);
                        return;
                }
        }

        if (op->flags & RX_CHECK_DLC) {
                /* do a real check in CAN frame length */
                if (rxdata->len != lcf->len) {
                        bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags);
                        return;
                }
        }
}

/*
 * bcm_rx_starttimer - enable timeout monitoring for CAN frame reception
 *
 * The timeout timer is started with kt_ival1 unless RX_NO_AUTOTIMER is set
 * or no timeout interval is configured.
 */
static void bcm_rx_starttimer(struct bcm_op *op)
{
        if ((op->flags & RX_NO_AUTOTIMER) || !op->kt_ival1)
                return;

        hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT);
}

/*
 * bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out
 *
 * Queues a RX_TIMEOUT message without frames for the user. With
 * RX_ANNOUNCE_RESUME the stored last frames are cleared first, so that the
 * next reception is handled like a first reception.
 *
 * Always returns HRTIMER_NORESTART.
 */
static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
{
        struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
        struct bcm_msg_head msg_head;

        /* if user wants to be informed, when cyclic CAN-Messages come back:
         * clear received CAN frames to indicate 'nothing received'
         */
        if (op->last_frames && (op->flags & RX_ANNOUNCE_RESUME))
                memset(op->last_frames, 0, op->nframes * op->cfsiz);

        /* create notification to user */
        memset(&msg_head, 0, sizeof(msg_head));
        msg_head.opcode  = RX_TIMEOUT;
        msg_head.can_id  = op->can_id;
        msg_head.flags   = op->flags;
        msg_head.count   = op->count;
        msg_head.ival1   = op->ival1;
        msg_head.ival2   = op->ival2;
        msg_head.nframes = 0;

        bcm_send_to_user(op, &msg_head, NULL, 0);

        return HRTIMER_NORESTART;
}

/*
 * bcm_rx_do_flush - helper for bcm_rx_thr_flush
 *
 * Sends the element @index of op->last_frames[] to the user if it is
 * still marked as throttled (RX_THR).
 *
 * Returns 1 if an update was sent, 0 otherwise.
 */
static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index)
{
        struct canfd_frame *lcf;

        if (!op->last_frames)
                return 0;

        lcf = op->last_frames + op->cfsiz * index;
        if (!(lcf->flags & RX_THR))
                return 0;

        bcm_rx_changed(op, lcf);
        return 1;
}

/*
 * bcm_rx_thr_flush - Check for throttled data and send it to the userspace
 *
 * Returns the number of updates that have been sent.
 */
static int bcm_rx_thr_flush(struct bcm_op *op)
{
        unsigned int idx;
        int updated = 0;

        /* for RX_FILTER_ID and simple filter */
        if (op->nframes <= 1)
                return bcm_rx_do_flush(op, 0);

        /* for MUX filter we start at index 1 */
        for (idx = 1; idx < op->nframes; idx++)
                updated += bcm_rx_do_flush(op, idx);

        return updated;
}

/*
 * bcm_rx_thr_handler - the time for blocked content updates is over now:
 *                      Check for throttled data and send it to the userspace
 *
 * As long as throttled data had to be flushed the timer is forwarded by
 * kt_ival2 and restarted. Otherwise kt_lastmsg is cleared, which re-arms
 * the throttle handling, and the timer stops.
 */
static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer)
{
        struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer);

        if (!bcm_rx_thr_flush(op)) {
                /* rearm throttle handling */
                op->kt_lastmsg = 0;
                return HRTIMER_NORESTART;
        }

        hrtimer_forward_now(hrtimer, op->kt_ival2);
        return HRTIMER_RESTART;
}

/*
 * bcm_rx_handler - handle a CAN frame reception
 */
static void bcm_rx_handler(struct sk_buff *skb, void *data)
{
        struct bcm_op *op = (struct bcm_op *)data;
        const struct canfd_frame *rxframe = (struct canfd_frame *)skb->data;
        unsigned int i;
        unsigned char traffic_flags;

        if (op->can_id != rxframe->can_id)
                return;

        /* make sure to handle the correct frame type (CAN / CAN FD) */
        if (op->flags & CAN_FD_FRAME) {
                if (!can_is_canfd_skb(skb))
                        return;
        } else {
                if (!can_is_can_skb(skb))
                        return;
        }

        /* disable timeout */
        hrtimer_cancel(&op->timer);

        /* save rx timestamp */
        op->rx_stamp = skb->tstamp;
        /* save originator for recvfrom() */
        op->rx_ifindex = skb->dev->ifindex;
        /* update statistics */
        op->frames_abs++;

        if (op->flags & RX_RTR_FRAME) {
                /* send reply for RTR-request (placed in op->frames[0]) */
                bcm_can_tx(op);
                return;
        }

        /* compute flags to distinguish between own/local/remote CAN traffic */
        traffic_flags = 0;
        if (skb->sk) {
                traffic_flags |= RX_LOCAL;
                if (skb->sk == op->sk)
                        traffic_flags |= RX_OWN;
        }

        if (op->flags & RX_FILTER_ID) {
                /* the easiest case */
                bcm_rx_update_and_send(op, op->last_frames, rxframe,
                                       traffic_flags);
                goto rx_starttimer;
        }

        if (op->nframes == 1) {
                /* simple compare with index 0 */
                bcm_rx_cmp_to_index(op, 0, rxframe, traffic_flags);
                goto rx_starttimer;
        }

        if (op->nframes > 1) {
                /*
                 * multiplex compare
                 *
                 * find the first multiplex mask that fits.
                 * Remark: The MUX-mask is stored in index 0 - but only the
                 * first 64 bits of the frame data[] are relevant (CAN FD)
                 */

                for (i = 1; i < op->nframes; i++) {
                        if ((get_u64(op->frames, 0) & get_u64(rxframe, 0)) ==
                            (get_u64(op->frames, 0) &
                             get_u64(op->frames + op->cfsiz * i, 0))) {
                                bcm_rx_cmp_to_index(op, i, rxframe,
                                                    traffic_flags);
                                break;
                        }
                }
        }

rx_starttimer:
        bcm_rx_starttimer(op);
}

/*
 * helpers for bcm_op handling: find & delete bcm [rx|tx] op elements
 */
/*
 * bcm_find_op - look up a bcm_op in the list @ops
 *
 * An entry matches when its can_id equals mh->can_id, its ifindex equals
 * @ifindex and its CAN_FD_FRAME flag bit agrees with the one in mh->flags.
 *
 * Returns the first matching entry or NULL if the list holds none.
 */
static struct bcm_op *bcm_find_op(struct list_head *ops,
                                  struct bcm_msg_head *mh, int ifindex)
{
        struct bcm_op *pos;

        list_for_each_entry(pos, ops, list) {
                if (pos->can_id != mh->can_id)
                        continue;

                if (pos->ifindex != ifindex)
                        continue;

                /* a set bit here means the CAN FD settings differ */
                if ((pos->flags ^ mh->flags) & CAN_FD_FRAME)
                        continue;

                return pos;
        }

        return NULL;
}

/*
 * bcm_free_op_rcu - RCU callback that releases a bcm_op
 *
 * The frame buffers are only freed when they do not point at the storage
 * embedded in the op itself (sframe / last_sframe); afterwards the op is
 * freed.
 */
static void bcm_free_op_rcu(struct rcu_head *rcu_head)
{
        struct bcm_op *op = container_of(rcu_head, struct bcm_op, rcu);

        /* kfree() ignores NULL, so only the embedded buffers need a check */
        if (op->frames != &op->sframe)
                kfree(op->frames);

        if (op->last_frames != &op->last_sframe)
                kfree(op->last_frames);

        kfree(op);
}

/*
 * bcm_remove_op - stop the timers of a bcm_op and schedule its release
 *
 * Cancels op->timer and op->thrtimer and then queues bcm_free_op_rcu() via
 * call_rcu() to free the op. This function does not unlink the op from any
 * list.
 */
static void bcm_remove_op(struct bcm_op *op)
{
        /* stop both timers before the op is queued for freeing */
        hrtimer_cancel(&op->timer);
        hrtimer_cancel(&op->thrtimer);

        call_rcu(&op->rcu, bcm_free_op_rcu);
}

/*
 * bcm_rx_unreg - drop the receive subscription of an rx op on @dev
 *
 * The subscription is only removed when @dev is the device recorded in
 * op->rx_reg_dev; otherwise an error is logged and nothing is changed.
 */
static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op)
{
        if (op->rx_reg_dev != dev) {
                printk(KERN_ERR "can-bcm: bcm_rx_unreg: registered device "
                       "mismatch %p %p\n", op->rx_reg_dev, dev);
                return;
        }

        can_rx_unregister(dev_net(dev), dev, op->can_id,
                          REGMASK(op->can_id), bcm_rx_handler, op);

        /* mark as removed subscription */
        op->rx_reg_dev = NULL;
}

/*
 * bcm_delete_rx_op - find and remove a rx op
 *
 * The op is selected by can_id, ifindex and the CAN_FD_FRAME flag bit.
 * Its receive subscription is dropped, it is unlinked from @ops and handed
 * to bcm_remove_op().
 *
 * Returns the number of removed ops: 1 when a matching op was found,
 * 0 otherwise.
 */
static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh,
                            int ifindex)
{
        struct bcm_op *op, *tmp;

        list_for_each_entry_safe(op, tmp, ops, list) {
                struct net_device *dev;

                if (op->can_id != mh->can_id || op->ifindex != ifindex ||
                    ((op->flags ^ mh->flags) & CAN_FD_FRAME))
                        continue;

                /* disable automatic timer on frame reception */
                op->flags |= RX_NO_AUTOTIMER;

                /*
                 * Don't care if we're bound or not (due to netdev
                 * problems) can_rx_unregister() is always a safe
                 * thing to do here.
                 */
                if (!op->ifindex) {
                        /* subscription without a specific device */
                        can_rx_unregister(sock_net(op->sk), NULL,
                                          op->can_id,
                                          REGMASK(op->can_id),
                                          bcm_rx_handler, op);
                } else if (op->rx_reg_dev) {
                        /*
                         * Only remove subscriptions that had not
                         * been removed due to NETDEV_UNREGISTER
                         * in bcm_notifier()
                         */
                        dev = dev_get_by_index(sock_net(op->sk),
                                               op->ifindex);
                        if (dev) {
                                bcm_rx_unreg(dev, op);
                                dev_put(dev);
                        }
                }

                list_del(&op->list);
                bcm_remove_op(op);

                return 1; /* done */
        }

        return 0; /* not found */
}

/*
 * bcm_delete_tx_op - find and remove a tx op (returns number of removed ops)
 */
static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh,
                            int ifindex)
{
        struct bcm_op *op, *n;

        list_for_each_entry_safe(op, n, ops, list) {
                if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) &&
                    (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) {
                        list_del(&op->list);
                        bcm_remove_op(op);
                        return 1; /* done */
                }
        }

        return 0; /* not found */
}

/*
 * bcm_read_op - report the current state of a bcm_op to the user
 * (for bcm_sendmsg)
 *
 * Looks up the op described by @msg_head / @ifindex in @ops, copies its
 * flags, count, intervals and nframes into @msg_head and passes the head
 * together with the op's frames to bcm_send_to_user().
 *
 * Returns MHSIZ on success or -EINVAL when no matching op exists.
 */
static int bcm_read_op(struct list_head *ops, struct bcm_msg_head *msg_head,
                       int ifindex)
{
        struct bcm_op *found;

        found = bcm_find_op(ops, msg_head, ifindex);
        if (!found)
                return -EINVAL;

        /* mirror the op's current settings into the reply head */
        msg_head->nframes = found->nframes;
        msg_head->ival2   = found->ival2;
        msg_head->ival1   = found->ival1;
        msg_head->count   = found->count;
        msg_head->flags   = found->flags;

        bcm_send_to_user(found, msg_head, found->frames, 0);

        return MHSIZ;
}

/*
 * bcm_tx_setup - create or update a bcm tx op (for bcm_sendmsg)
 *
 * @msg_head: already copied-in message head describing the op
 * @msg:      user message the CAN frames are still to be read from
 * @ifindex:  interface the frames shall be sent on (must be non-zero)
 * @sk:       owning socket
 *
 * An existing op (matched by bcm_find_op()) gets its frames and flags
 * updated in place; otherwise a new op is allocated, filled and added to
 * the socket's tx_ops list. Afterwards the timer related flags from the
 * message head are evaluated.
 *
 * Returns the number of consumed bytes (nframes * cfsiz + MHSIZ) on success
 * or a negative error code: -ENODEV without ifindex, -EINVAL for a bad
 * nframes / timeval / frame length, -E2BIG when an update needs more frames
 * than the existing op holds, -ENOMEM on allocation failure, or the error
 * reported by memcpy_from_msg().
 */
static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
                        int ifindex, struct sock *sk)
{
        struct bcm_sock *bo = bcm_sk(sk);
        struct bcm_op *op;
        struct canfd_frame *cf;
        unsigned int i;
        int err;

        /* we need a real device to send frames */
        if (!ifindex)
                return -ENODEV;

        /* check nframes boundaries - we need at least one CAN frame */
        if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES)
                return -EINVAL;

        /* check timeval limitations */
        if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head))
                return -EINVAL;

        /* check the given can_id */
        op = bcm_find_op(&bo->tx_ops, msg_head, ifindex);
        if (op) {
                /* update existing BCM operation */

                /*
                 * Do we need more space for the CAN frames than currently
                 * allocated? -> This is a _really_ unusual use-case and
                 * therefore (complexity / locking) it is not supported.
                 */
                if (msg_head->nframes > op->nframes)
                        return -E2BIG;

                /* update CAN frames content */
                for (i = 0; i < msg_head->nframes; i++) {

                        cf = op->frames + op->cfsiz * i;
                        err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz);

                        /* reject payload lengths beyond the frame type's limit */
                        if (op->flags & CAN_FD_FRAME) {
                                if (cf->len > 64)
                                        err = -EINVAL;
                        } else {
                                if (cf->len > 8)
                                        err = -EINVAL;
                        }

                        /*
                         * covers both the copy error and the length check;
                         * the existing op stays in the list on failure
                         */
                        if (err < 0)
                                return err;

                        if (msg_head->flags & TX_CP_CAN_ID) {
                                /* copy can_id into frame */
                                cf->can_id = msg_head->can_id;
                        }
                }
                op->flags = msg_head->flags;

        } else {
                /* insert new BCM operation for the given can_id */

                op = kzalloc(OPSIZ, GFP_KERNEL);
                if (!op)
                        return -ENOMEM;

                op->can_id = msg_head->can_id;
                op->cfsiz = CFSIZ(msg_head->flags);
                op->flags = msg_head->flags;

                /* create array for CAN frames and copy the data */
                if (msg_head->nframes > 1) {
                        op->frames = kmalloc_array(msg_head->nframes,
                                                   op->cfsiz,
                                                   GFP_KERNEL);
                        if (!op->frames) {
                                kfree(op);
                                return -ENOMEM;
                        }
                } else
                        /* a single frame fits into the op's embedded storage */
                        op->frames = &op->sframe;

                for (i = 0; i < msg_head->nframes; i++) {

                        cf = op->frames + op->cfsiz * i;
                        err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz);
                        if (err < 0)
                                goto free_op;

                        /* reject payload lengths beyond the frame type's limit */
                        if (op->flags & CAN_FD_FRAME) {
                                if (cf->len > 64)
                                        err = -EINVAL;
                        } else {
                                if (cf->len > 8)
                                        err = -EINVAL;
                        }

                        if (err < 0)
                                goto free_op;

                        if (msg_head->flags & TX_CP_CAN_ID) {
                                /* copy can_id into frame */
                                cf->can_id = msg_head->can_id;
                        }
                }

                /* tx_ops never compare with previous received messages */
                op->last_frames = NULL;

                /* bcm_can_tx / bcm_tx_timeout_handler needs this */
                op->sk = sk;
                op->ifindex = ifindex;

                /* initialize uninitialized (kzalloc) structure */
                hrtimer_init(&op->timer, CLOCK_MONOTONIC,
                             HRTIMER_MODE_REL_SOFT);
                op->timer.function = bcm_tx_timeout_handler;

                /* currently unused in tx_ops */
                hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC,
                             HRTIMER_MODE_REL_SOFT);

                /* add this bcm_op to the list of the tx_ops */
                list_add(&op->list, &bo->tx_ops);

        } /* if (op) ... else ... - existing vs. new tx op */

        /*
         * A new op still has nframes == 0 (kzalloc) here, so this also
         * performs the initial assignment.
         */
        if (op->nframes != msg_head->nframes) {
                op->nframes   = msg_head->nframes;
                /* start multiple frame transmission with index 0 */
                op->currframe = 0;
        }

        /* check flags */

        if (op->flags & TX_RESET_MULTI_IDX) {
                /* start multiple frame transmission with index 0 */
                op->currframe = 0;
        }

        if (op->flags & SETTIMER) {
                /* set timer values */
                op->count = msg_head->count;
                op->ival1 = msg_head->ival1;
                op->ival2 = msg_head->ival2;
                op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1);
                op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2);

                /* disable an active timer due to zero values? */
                if (!op->kt_ival1 && !op->kt_ival2)
                        hrtimer_cancel(&op->timer);
        }

        if (op->flags & STARTTIMER) {
                hrtimer_cancel(&op->timer);
                /* spec: send CAN frame when starting timer */
                op->flags |= TX_ANNOUNCE;
        }

        if (op->flags & TX_ANNOUNCE) {
                bcm_can_tx(op);
                /* the frame sent right now counts against op->count */
                if (op->count)
                        op->count--;
        }

        if (op->flags & STARTTIMER)
                bcm_tx_start_timer(op);

        return msg_head->nframes * op->cfsiz + MHSIZ;

free_op:
        /* only reached for a new op that has not been added to tx_ops yet */
        if (op->frames != &op->sframe)
                kfree(op->frames);
        kfree(op);
        return err;
}

/*
 * bcm_rx_setup - create or update a bcm rx op (for bcm_sendmsg)
 *
 * @msg_head: already copied-in message head describing the op
 * @msg:      user message the filter frames are still to be read from
 * @ifindex:  interface to receive from, 0 registers without a device
 * @sk:       owning socket
 *
 * An existing op (matched by bcm_find_op()) gets its frames and flags
 * updated; otherwise a new op is allocated, added to the socket's rx_ops
 * list and registered with can_rx_register(). The timer related flags from
 * the message head are evaluated in both cases.
 *
 * Returns the number of consumed bytes (nframes * cfsiz + MHSIZ) on success
 * or a negative error code: -EINVAL for bad nframes / RTR setup / timeval,
 * -E2BIG when an update needs more frames than the existing op holds,
 * -ENOMEM on allocation failure, or the error reported by
 * memcpy_from_msg() / can_rx_register().
 */
static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
                        int ifindex, struct sock *sk)
{
        struct bcm_sock *bo = bcm_sk(sk);
        struct bcm_op *op;
        int do_rx_register;
        int err = 0;

        if ((msg_head->flags & RX_FILTER_ID) || (!(msg_head->nframes))) {
                /* be robust against wrong usage ... */
                msg_head->flags |= RX_FILTER_ID;
                /* ignore trailing garbage */
                msg_head->nframes = 0;
        }

        /* the first element contains the mux-mask => MAX_NFRAMES + 1  */
        if (msg_head->nframes > MAX_NFRAMES + 1)
                return -EINVAL;

        /* RTR mode needs exactly one frame and a can_id with the RTR flag */
        if ((msg_head->flags & RX_RTR_FRAME) &&
            ((msg_head->nframes != 1) ||
             (!(msg_head->can_id & CAN_RTR_FLAG))))
                return -EINVAL;

        /* check timeval limitations */
        if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head))
                return -EINVAL;

        /* check the given can_id */
        op = bcm_find_op(&bo->rx_ops, msg_head, ifindex);
        if (op) {
                /* update existing BCM operation */

                /*
                 * Do we need more space for the CAN frames than currently
                 * allocated? -> This is a _really_ unusual use-case and
                 * therefore (complexity / locking) it is not supported.
                 */
                if (msg_head->nframes > op->nframes)
                        return -E2BIG;

                if (msg_head->nframes) {
                        /* update CAN frames content */
                        err = memcpy_from_msg(op->frames, msg,
                                              msg_head->nframes * op->cfsiz);
                        if (err < 0)
                                return err;

                        /* clear last_frames to indicate 'nothing received' */
                        memset(op->last_frames, 0, msg_head->nframes * op->cfsiz);
                }

                op->nframes = msg_head->nframes;
                op->flags = msg_head->flags;

                /* Only an update -> do not call can_rx_register() */
                do_rx_register = 0;

        } else {
                /* insert new BCM operation for the given can_id */
                op = kzalloc(OPSIZ, GFP_KERNEL);
                if (!op)
                        return -ENOMEM;

                op->can_id = msg_head->can_id;
                op->nframes = msg_head->nframes;
                op->cfsiz = CFSIZ(msg_head->flags);
                op->flags = msg_head->flags;

                if (msg_head->nframes > 1) {
                        /* create array for CAN frames and copy the data */
                        op->frames = kmalloc_array(msg_head->nframes,
                                                   op->cfsiz,
                                                   GFP_KERNEL);
                        if (!op->frames) {
                                kfree(op);
                                return -ENOMEM;
                        }

                        /* create and init array for received CAN frames */
                        op->last_frames = kcalloc(msg_head->nframes,
                                                  op->cfsiz,
                                                  GFP_KERNEL);
                        if (!op->last_frames) {
                                kfree(op->frames);
                                kfree(op);
                                return -ENOMEM;
                        }

                } else {
                        /* zero or one frame: use the op's embedded storage */
                        op->frames = &op->sframe;
                        op->last_frames = &op->last_sframe;
                }

                if (msg_head->nframes) {
                        err = memcpy_from_msg(op->frames, msg,
                                              msg_head->nframes * op->cfsiz);
                        if (err < 0) {
                                /* op is not in any list yet: free it directly */
                                if (op->frames != &op->sframe)
                                        kfree(op->frames);
                                if (op->last_frames != &op->last_sframe)
                                        kfree(op->last_frames);
                                kfree(op);
                                return err;
                        }
                }

                /* bcm_can_tx / bcm_tx_timeout_handler needs this */
                op->sk = sk;
                op->ifindex = ifindex;

                /* ifindex for timeout events w/o previous frame reception */
                op->rx_ifindex = ifindex;

                /* initialize uninitialized (kzalloc) structure */
                hrtimer_init(&op->timer, CLOCK_MONOTONIC,
                             HRTIMER_MODE_REL_SOFT);
                op->timer.function = bcm_rx_timeout_handler;

                hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC,
                             HRTIMER_MODE_REL_SOFT);
                op->thrtimer.function = bcm_rx_thr_handler;

                /* add this bcm_op to the list of the rx_ops */
                list_add(&op->list, &bo->rx_ops);

                /* call can_rx_register() */
                do_rx_register = 1;

        } /* if (op) ... else ... - existing vs. new rx op */

        /* check flags */

        if (op->flags & RX_RTR_FRAME) {
                struct canfd_frame *frame0 = op->frames;

                /* no timers in RTR-mode */
                hrtimer_cancel(&op->thrtimer);
                hrtimer_cancel(&op->timer);

                /*
                 * funny feature in RX(!)_SETUP only for RTR-mode:
                 * copy can_id into frame BUT without RTR-flag to
                 * prevent a full-load-loopback-test ... ;-]
                 */
                if ((op->flags & TX_CP_CAN_ID) ||
                    (frame0->can_id == op->can_id))
                        frame0->can_id = op->can_id & ~CAN_RTR_FLAG;

        } else {
                if (op->flags & SETTIMER) {

                        /* set timer value */
                        op->ival1 = msg_head->ival1;
                        op->ival2 = msg_head->ival2;
                        op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1);
                        op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2);

                        /* disable an active timer due to zero value? */
                        if (!op->kt_ival1)
                                hrtimer_cancel(&op->timer);

                        /*
                         * In any case cancel the throttle timer, flush
                         * potentially blocked msgs and reset throttle handling
                         */
                        op->kt_lastmsg = 0;
                        hrtimer_cancel(&op->thrtimer);
                        bcm_rx_thr_flush(op);
                }

                if ((op->flags & STARTTIMER) && op->kt_ival1)
                        hrtimer_start(&op->timer, op->kt_ival1,
                                      HRTIMER_MODE_REL_SOFT);
        }

        /* now we can register for can_ids, if we added a new bcm_op */
        if (do_rx_register) {
                if (ifindex) {
                        struct net_device *dev;

                        /*
                         * If the device lookup fails, err stays 0 and the op
                         * remains in the list without a subscription.
                         */
                        dev = dev_get_by_index(sock_net(sk), ifindex);
                        if (dev) {
                                err = can_rx_register(sock_net(sk), dev,
                                                      op->can_id,
                                                      REGMASK(op->can_id),
                                                      bcm_rx_handler, op,
                                                      "bcm", sk);

                                /* remembered for the later bcm_rx_unreg() */
                                op->rx_reg_dev = dev;
                                dev_put(dev);
                        }

                } else
                        err = can_rx_register(sock_net(sk), NULL, op->can_id,
                                              REGMASK(op->can_id),
                                              bcm_rx_handler, op, "bcm", sk);
                if (err) {
                        /* this bcm rx op is broken -> remove it */
                        list_del(&op->list);
                        bcm_remove_op(op);
                        return err;
                }
        }

        return msg_head->nframes * op->cfsiz + MHSIZ;
}

/*
 * bcm_tx_send - send a single CAN frame to the CAN interface
 * (for bcm_sendmsg)
 *
 * Copies @cfsiz bytes from @msg into a freshly allocated skb and sends it
 * with loopback on the device identified by @ifindex.
 *
 * Returns cfsiz + MHSIZ on success or a negative error code: -ENODEV when
 * no (valid) interface is given, -ENOMEM when no skb could be allocated, or
 * the error reported by memcpy_from_msg() / can_send().
 */
static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk,
                       int cfsiz)
{
        struct net_device *dev;
        struct sk_buff *skb;
        int err;

        /* we need a real device to send frames */
        if (!ifindex)
                return -ENODEV;

        skb = alloc_skb(cfsiz + sizeof(struct can_skb_priv), GFP_KERNEL);
        if (!skb)
                return -ENOMEM;

        can_skb_reserve(skb);

        err = memcpy_from_msg(skb_put(skb, cfsiz), msg, cfsiz);
        if (err < 0)
                goto drop_skb;

        dev = dev_get_by_index(sock_net(sk), ifindex);
        if (!dev) {
                err = -ENODEV;
                goto drop_skb;
        }

        can_skb_prv(skb)->ifindex = dev->ifindex;
        can_skb_prv(skb)->skbcnt = 0;
        skb->dev = dev;
        can_skb_set_owner(skb, sk);

        /* send with loopback; the skb is not freed here after this call */
        err = can_send(skb, 1);
        dev_put(dev);

        if (err)
                return err;

        return cfsiz + MHSIZ;

drop_skb:
        kfree_skb(skb);
        return err;
}

/*
 * bcm_sendmsg - process BCM commands (opcodes) from the userspace
 *
 * @sock: BCM socket the command was written to
 * @msg:  user message: a message head of MHSIZ bytes followed by CAN frames
 * @size: total length of the user message
 *
 * The message head is copied in and validated, the interface index is taken
 * from the bound socket or - for a socket bound without device - from
 * msg_name, and the opcode is dispatched with the socket lock held.
 *
 * Returns the number of processed bytes or a negative error code
 * (-ENOTCONN for an unbound socket, -EINVAL for malformed input or an
 * unknown opcode, -ENODEV for an unusable interface, or whatever the
 * opcode handler reports).
 */
static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
        struct sock *sk = sock->sk;
        struct bcm_sock *bo = bcm_sk(sk);
        int ifindex = bo->ifindex; /* default ifindex for this bcm_op */
        struct bcm_msg_head msg_head;
        int cfsiz;
        int ret; /* read bytes or error codes as return value */

        if (!bo->bound)
                return -ENOTCONN;

        /* check for valid message length from userspace */
        if (size < MHSIZ)
                return -EINVAL;

        /* read message head information */
        ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ);
        if (ret < 0)
                return ret;

        /* the data behind the head must be a whole number of frames */
        cfsiz = CFSIZ(msg_head.flags);
        if ((size - MHSIZ) % cfsiz)
                return -EINVAL;

        /* check for alternative ifindex for this bcm_op */

        if (!ifindex && msg->msg_name) {
                /* no bound device as default => check msg_name */
                DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name);

                if (msg->msg_namelen < BCM_MIN_NAMELEN)
                        return -EINVAL;

                if (addr->can_family != AF_CAN)
                        return -EINVAL;

                /* ifindex from sendto() */
                ifindex = addr->can_ifindex;

                if (ifindex) {
                        struct net_device *dev;

                        /* the device must exist and be a CAN device */
                        dev = dev_get_by_index(sock_net(sk), ifindex);
                        if (!dev)
                                return -ENODEV;

                        if (dev->type != ARPHRD_CAN) {
                                dev_put(dev);
                                return -ENODEV;
                        }

                        /* only the index is kept, not the device reference */
                        dev_put(dev);
                }
        }

        /* the opcode handlers below run with the socket lock held */
        lock_sock(sk);

        switch (msg_head.opcode) {

        case TX_SETUP:
                ret = bcm_tx_setup(&msg_head, msg, ifindex, sk);
                break;

        case RX_SETUP:
                ret = bcm_rx_setup(&msg_head, msg, ifindex, sk);
                break;

        case TX_DELETE:
                if (bcm_delete_tx_op(&bo->tx_ops, &msg_head, ifindex))
                        ret = MHSIZ;
                else
                        ret = -EINVAL;
                break;

        case RX_DELETE:
                if (bcm_delete_rx_op(&bo->rx_ops, &msg_head, ifindex))
                        ret = MHSIZ;
                else
                        ret = -EINVAL;
                break;

        case TX_READ:
                /* reuse msg_head for the reply to TX_READ */
                msg_head.opcode  = TX_STATUS;
                ret = bcm_read_op(&bo->tx_ops, &msg_head, ifindex);
                break;

        case RX_READ:
                /* reuse msg_head for the reply to RX_READ */
                msg_head.opcode  = RX_STATUS;
                ret = bcm_read_op(&bo->rx_ops, &msg_head, ifindex);
                break;

        case TX_SEND:
                /* we need exactly one CAN frame behind the msg head */
                if ((msg_head.nframes != 1) || (size != cfsiz + MHSIZ))
                        ret = -EINVAL;
                else
                        ret = bcm_tx_send(msg, ifindex, sk, cfsiz);
                break;

        default:
                /* unknown opcode */
                ret = -EINVAL;
                break;
        }

        release_sock(sk);

        return ret;
}

/*
 * notification handler for netdevice status changes
 */
static void bcm_notify(struct bcm_sock *bo, unsigned long msg,
                       struct net_device *dev)
{
        struct sock *sk = &bo->sk;
        struct bcm_op *op;
        int notify_enodev = 0;

        if (!net_eq(dev_net(dev), sock_net(sk)))
                return;

        switch (msg) {

        case NETDEV_UNREGISTER:
                lock_sock(sk);

                /* remove device specific receive entries */
                list_for_each_entry(op, &bo->rx_ops, list)
                        if (op->rx_reg_dev == dev)
                                bcm_rx_unreg(dev, op);

                /* remove device reference, if this is our bound device */
                if (bo->bound && bo->ifindex == dev->ifindex) {
                        bo->bound   = 0;
                        bo->ifindex = 0;
                        notify_enodev = 1;
                }

                release_sock(sk);

                if (notify_enodev) {
                        sk->sk_err = ENODEV;
                        if (!sock_flag(sk, SOCK_DEAD))
                                sk_error_report(sk);
                }
                break;

        case NETDEV_DOWN:
                if (bo->bound && bo->ifindex == dev->ifindex) {
                        sk->sk_err = ENETDOWN;
                        if (!sock_flag(sk, SOCK_DEAD))
                                sk_error_report(sk);
                }
        }
}

/*
 * bcm_notifier - netdevice notifier callback of the BCM protocol
 *
 * Forwards NETDEV_UNREGISTER and NETDEV_DOWN events of CAN devices to every
 * socket on bcm_notifier_list via bcm_notify(). Always returns NOTIFY_DONE.
 *
 * The global bcm_busy_notifier is used as the list cursor, so it names the
 * socket that is currently being notified while bcm_notifier_lock is
 * dropped around the bcm_notify() call. bcm_release() waits until it is no
 * longer the busy socket before removing itself from the list.
 */
static int bcm_notifier(struct notifier_block *nb, unsigned long msg,
                        void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        /* only CAN devices and only unregister/down events are handled */
        if (dev->type != ARPHRD_CAN)
                return NOTIFY_DONE;
        if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN)
                return NOTIFY_DONE;
        if (unlikely(bcm_busy_notifier)) /* Check for reentrant bug. */
                return NOTIFY_DONE;

        spin_lock(&bcm_notifier_lock);
        list_for_each_entry(bcm_busy_notifier, &bcm_notifier_list, notifier) {
                /* bcm_notify() is called without the spinlock held */
                spin_unlock(&bcm_notifier_lock);
                bcm_notify(bcm_busy_notifier, msg, dev);
                spin_lock(&bcm_notifier_lock);
        }
        /* walk finished: no socket is being notified anymore */
        bcm_busy_notifier = NULL;
        spin_unlock(&bcm_notifier_lock);
        return NOTIFY_DONE;
}

/*
 * bcm_init - initial settings for a BCM socket, applied at socket
 * creation time
 *
 * Puts the socket into the unbound state with empty tx/rx op lists and
 * links it into bcm_notifier_list (under bcm_notifier_lock), the list
 * bcm_notifier() walks for netdevice events. Always returns 0.
 */
static int bcm_init(struct sock *sk)
{
        struct bcm_sock *bo = bcm_sk(sk);

        /* no ops configured yet */
        INIT_LIST_HEAD(&bo->rx_ops);
        INIT_LIST_HEAD(&bo->tx_ops);

        /* unbound, no procfs entry, nothing dropped so far */
        bo->bcm_proc_read    = NULL;
        bo->dropped_usr_msgs = 0;
        bo->ifindex          = 0;
        bo->bound            = 0;

        /* set notifier */
        spin_lock(&bcm_notifier_lock);
        list_add_tail(&bo->notifier, &bcm_notifier_list);
        spin_unlock(&bcm_notifier_lock);

        return 0;
}

/*
 * standard socket functions
 */
/*
 * bcm_release - release a BCM socket
 *
 * Takes the socket off the notifier list, removes its procfs entry, drops
 * all tx and rx ops including the rx subscriptions, and finally orphans and
 * puts the sock. Always returns 0.
 */
static int bcm_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct net *net;
        struct bcm_sock *bo;
        struct bcm_op *op, *next;

        if (!sk)
                return 0;

        net = sock_net(sk);
        bo = bcm_sk(sk);

        /* remove bcm_ops, timer, rx_unregister(), etc. */

        /*
         * Wait until bcm_notifier() is no longer working on this socket
         * before taking it off the notifier list.
         */
        spin_lock(&bcm_notifier_lock);
        while (bcm_busy_notifier == bo) {
                spin_unlock(&bcm_notifier_lock);
                schedule_timeout_uninterruptible(1);
                spin_lock(&bcm_notifier_lock);
        }
        list_del(&bo->notifier);
        spin_unlock(&bcm_notifier_lock);

        lock_sock(sk);

#if IS_ENABLED(CONFIG_PROC_FS)
        /* remove procfs entry */
        if (net->can.bcmproc_dir && bo->bcm_proc_read)
                remove_proc_entry(bo->procname, net->can.bcmproc_dir);
#endif /* CONFIG_PROC_FS */

        /* tx ops have no subscriptions: stop timers and queue them for freeing */
        list_for_each_entry_safe(op, next, &bo->tx_ops, list)
                bcm_remove_op(op);

        /* first pass over the rx ops: only drop the receive subscriptions */
        list_for_each_entry_safe(op, next, &bo->rx_ops, list) {
                /*
                 * Don't care if we're bound or not (due to netdev problems)
                 * can_rx_unregister() is always a safe thing to do here.
                 */
                if (op->ifindex) {
                        /*
                         * Only remove subscriptions that had not
                         * been removed due to NETDEV_UNREGISTER
                         * in bcm_notifier()
                         */
                        if (op->rx_reg_dev) {
                                struct net_device *dev;

                                dev = dev_get_by_index(net, op->ifindex);
                                if (dev) {
                                        bcm_rx_unreg(dev, op);
                                        dev_put(dev);
                                }
                        }
                } else
                        can_rx_unregister(net, NULL, op->can_id,
                                          REGMASK(op->can_id),
                                          bcm_rx_handler, op);

        }

        /*
         * NOTE(review): presumably waits for bcm_rx_handler() invocations
         * that are still running before the timers are cancelled below -
         * confirm against the receive path.
         */
        synchronize_rcu();

        /* second pass over the rx ops: stop timers and queue them for freeing */
        list_for_each_entry_safe(op, next, &bo->rx_ops, list)
                bcm_remove_op(op);

        /* remove device reference */
        if (bo->bound) {
                bo->bound   = 0;
                bo->ifindex = 0;
        }

        sock_orphan(sk);
        sock->sk = NULL;

        release_sock(sk);
        sock_put(sk);

        return 0;
}

static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len,
                       int flags)
{
        struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
        struct sock *sk = sock->sk;
        struct bcm_sock *bo = bcm_sk(sk);
        struct net *net = sock_net(sk);
        int ret = 0;

        if (len < BCM_MIN_NAMELEN)
                return -EINVAL;

        lock_sock(sk);

        if (bo->bound) {
                ret = -EISCONN;
                goto fail;
        }

        /* bind a device to this socket */
        if (addr->can_ifindex) {
                struct net_device *dev;

                dev = dev_get_by_index(net, addr->can_ifindex);
                if (!dev) {
                        ret = -ENODEV;
                        goto fail;
                }
                if (dev->type != ARPHRD_CAN) {
                        dev_put(dev);
                        ret = -ENODEV;
                        goto fail;
                }

                bo->ifindex = dev->ifindex;
                dev_put(dev);

        } else {
                /* no interface reference for ifindex = 0 ('any' CAN device) */
                bo->ifindex = 0;
        }

#if IS_ENABLED(CONFIG_PROC_FS)
        if (net->can.bcmproc_dir) {
                /* unique socket address as filename */
                sprintf(bo->procname, "%lu", sock_i_ino(sk));
                bo->bcm_proc_read = proc_create_net_single(bo->procname, 0644,
                                                     net->can.bcmproc_dir,
                                                     bcm_proc_show, sk);
                if (!bo->bcm_proc_read) {
                        ret = -ENOMEM;
                        goto fail;
                }
        }
#endif /* CONFIG_PROC_FS */

        bo->bound = 1;

fail:
        release_sock(sk);

        return ret;
}

/*
 * Receive one queued datagram from a BCM socket.
 *
 * Copies at most @size bytes of the datagram into @msg, fills in the sender
 * address (taken from skb->cb) when the caller asked for it, and merges the
 * flags recorded for the skb into msg->msg_flags.
 *
 * Returns the number of bytes copied, or a negative error code.
 */
static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                       int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int error = 0;
        int ret;

        skb = skb_recv_datagram(sk, flags, &error);
        if (!skb)
                return error;

        /* never copy more than the datagram actually holds */
        if (size > skb->len)
                size = skb->len;

        ret = memcpy_to_msg(msg, skb->data, size);
        if (ret < 0)
                goto out_free;

        sock_recv_cmsgs(msg, sk, skb);

        if (msg->msg_name) {
                __sockaddr_check_size(BCM_MIN_NAMELEN);
                msg->msg_namelen = BCM_MIN_NAMELEN;
                memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
        }

        /* hand over the flags that bcm_send_to_user() stored for this skb */
        msg->msg_flags |= *(bcm_flags(skb));

        ret = size;

out_free:
        skb_free_datagram(sk, skb);

        return ret;
}

/*
 * ioctl handler of the BCM socket: implements no command itself and
 * unconditionally returns -ENOIOCTLCMD, ignoring all of its arguments.
 */
static int bcm_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd,
                                unsigned long arg)
{
        /* no ioctls for socket layer -> hand it down to NIC layer */
        return -ENOIOCTLCMD;
}

/*
 * Socket operations of the PF_CAN broadcast manager.
 *
 * Only release, connect, ioctl, sendmsg and recvmsg are BCM specific.
 * poll and gettstamp use the generic datagram_poll()/sock_gettstamp()
 * helpers; every other operation is wired to its sock_no_*() stub.
 */
static const struct proto_ops bcm_ops = {
        .family        = PF_CAN,
        .release       = bcm_release,
        .bind          = sock_no_bind,
        .connect       = bcm_connect,
        .socketpair    = sock_no_socketpair,
        .accept        = sock_no_accept,
        .getname       = sock_no_getname,
        .poll          = datagram_poll,
        .ioctl         = bcm_sock_no_ioctlcmd,
        .gettstamp     = sock_gettstamp,
        .listen        = sock_no_listen,
        .shutdown      = sock_no_shutdown,
        .sendmsg       = bcm_sendmsg,
        .recvmsg       = bcm_recvmsg,
        .mmap          = sock_no_mmap,
};

/*
 * Protocol descriptor for "CAN_BCM": each socket object has the size of
 * struct bcm_sock and is set up by bcm_init().
 */
static struct proto bcm_proto __read_mostly = {
        .name       = "CAN_BCM",
        .owner      = THIS_MODULE,
        .obj_size   = sizeof(struct bcm_sock),
        .init       = bcm_init,
};

/*
 * CAN protocol registration record: ties the SOCK_DGRAM / CAN_BCM pair to
 * bcm_ops and bcm_proto. Passed to can_proto_register() and
 * can_proto_unregister() by the module init/exit code.
 */
static const struct can_proto bcm_can_proto = {
        .type       = SOCK_DGRAM,
        .protocol   = CAN_BCM,
        .ops        = &bcm_ops,
        .prot       = &bcm_proto,
};

/*
 * Per-network-namespace setup. With CONFIG_PROC_FS enabled this creates the
 * "can-bcm" directory below net->proc_net and stores the result in
 * net->can.bcmproc_dir. The result is not checked here; users of
 * bcmproc_dir test it for NULL. Always returns 0.
 */
static int canbcm_pernet_init(struct net *net)
{
#if IS_ENABLED(CONFIG_PROC_FS)
        /* create /proc/net/can-bcm directory */
        net->can.bcmproc_dir = proc_net_mkdir(net, "can-bcm", net->proc_net);
#endif /* CONFIG_PROC_FS */

        return 0;
}

/*
 * Per-network-namespace teardown. With CONFIG_PROC_FS enabled, removes the
 * "can-bcm" proc directory, but only if net->can.bcmproc_dir is non-NULL.
 * Does nothing when CONFIG_PROC_FS is disabled.
 */
static void canbcm_pernet_exit(struct net *net)
{
#if IS_ENABLED(CONFIG_PROC_FS)
        /* remove /proc/net/can-bcm directory */
        if (net->can.bcmproc_dir)
                remove_proc_entry("can-bcm", net->proc_net);
#endif /* CONFIG_PROC_FS */
}

/* Per-namespace init/exit hooks, (un)registered by the module init/exit code */
static struct pernet_operations canbcm_pernet_ops __read_mostly = {
        .init = canbcm_pernet_init,
        .exit = canbcm_pernet_exit,
};

/* Netdevice notifier block; events are delivered to bcm_notifier() */
static struct notifier_block canbcm_notifier = {
        .notifier_call = bcm_notifier
};

/*
 * Module initialisation.
 *
 * Registers, in this order, the per-namespace operations, the netdevice
 * notifier and the CAN_BCM protocol. If a later step fails, the steps that
 * already succeeded are undone in reverse order.
 *
 * Returns 0 on success or the error code of the failing registration.
 */
static int __init bcm_module_init(void)
{
        int err;

        pr_info("can: broadcast manager protocol\n");

        err = register_pernet_subsys(&canbcm_pernet_ops);
        if (err)
                return err;

        err = register_netdevice_notifier(&canbcm_notifier);
        if (err)
                goto register_notifier_failed;

        err = can_proto_register(&bcm_can_proto);
        if (err < 0) {
                /* use the pr_*() helper, consistent with pr_info() above */
                pr_err("can: registration of bcm protocol failed\n");
                goto register_proto_failed;
        }

        return 0;

register_proto_failed:
        unregister_netdevice_notifier(&canbcm_notifier);
register_notifier_failed:
        unregister_pernet_subsys(&canbcm_pernet_ops);
        return err;
}

/*
 * Module teardown: unregisters the protocol, the netdevice notifier and the
 * per-namespace operations, i.e. the reverse of the order in which
 * bcm_module_init() registered them.
 */
static void __exit bcm_module_exit(void)
{
        can_proto_unregister(&bcm_can_proto);
        unregister_netdevice_notifier(&canbcm_notifier);
        unregister_pernet_subsys(&canbcm_pernet_ops);
}

module_init(bcm_module_init);
module_exit(bcm_module_exit);





























































































































   14 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_X86_XSAVE_H
#define __ASM_X86_XSAVE_H

#include <linux/uaccess.h>
#include <linux/types.h>

#include <asm/processor.h>
#include <asm/fpu/api.h>
#include <asm/user.h>

/*
 * Bit 63 of XCR0 is reserved for future expansion.
 * The mask selects every bit except those in XFEATURE_MASK_FPSSE and bit 63.
 */
#define XFEATURE_MASK_EXTEND        (~(XFEATURE_MASK_FPSSE | (1ULL << 63)))

/* NOTE(review): presumably the CPUID leaf enumerating XSAVE state - confirm */
#define XSTATE_CPUID                0x0000000d

/* NOTE(review): presumably the CPUID leaf describing tile state - confirm */
#define TILE_CPUID                0x0000001d

/* Size in bytes of the FXSAVE area */
#define FXSAVE_SIZE        512

/* The 64-byte XSAVE header starts directly behind the FXSAVE area */
#define XSAVE_HDR_SIZE            64
#define XSAVE_HDR_OFFSET    FXSAVE_SIZE

/* The YMM area starts directly behind the XSAVE header (512 + 64 = 576) */
#define XSAVE_YMM_SIZE            256
#define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)

/* NOTE(review): presumably the required alignment of an XSAVE buffer - confirm */
#define XSAVE_ALIGNMENT     64

/* All currently supported user features */
#define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \
                                      XFEATURE_MASK_SSE | \
                                      XFEATURE_MASK_YMM | \
                                      XFEATURE_MASK_OPMASK | \
                                      XFEATURE_MASK_ZMM_Hi256 | \
                                      XFEATURE_MASK_Hi16_ZMM         | \
                                      XFEATURE_MASK_PKRU | \
                                      XFEATURE_MASK_BNDREGS | \
                                      XFEATURE_MASK_BNDCSR | \
                                      XFEATURE_MASK_XTILE)

/*
 * Features which are restored when returning to user space.
 * PKRU is not restored on return to user space because PKRU
 * is switched eagerly in switch_to() and flush_thread()
 */
#define XFEATURE_MASK_USER_RESTORE        \
        (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU)

/* Features which are dynamically enabled for a process on request */
#define XFEATURE_MASK_USER_DYNAMIC        XFEATURE_MASK_XTILE_DATA

/* All currently supported supervisor features */
#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \
                                            XFEATURE_MASK_CET_USER)

/*
 * A supervisor state component may not always contain valuable information,
 * and its size may be huge. Saving/restoring such supervisor state components
 * at each context switch can cause high CPU and space overhead, which should
 * be avoided. Such supervisor state components should only be saved/restored
 * on demand. The on-demand supervisor features are set in this mask.
 *
 * Unlike the existing supported supervisor features, an independent supervisor
 * feature does not allocate a buffer in task->fpu, and the corresponding
 * supervisor state component cannot be saved/restored at each context switch.
 *
 * To support an independent supervisor feature, a developer should follow the
 * dos and don'ts as below:
 * - Do dynamically allocate a buffer for the supervisor state component.
 * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the
 *   state component to/from the buffer.
 * - Don't set the bit corresponding to the independent supervisor feature in
 *   IA32_XSS at run time, since it has been set at boot time.
 */
#define XFEATURE_MASK_INDEPENDENT (XFEATURE_MASK_LBR)

/*
 * Unsupported supervisor features. When a supervisor feature in this mask is
 * supported in the future, move it to the supported supervisor feature mask.
 */
#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \
                                              XFEATURE_MASK_CET_KERNEL)

/* All supervisor states including supported and unsupported states. */
#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
                                      XFEATURE_MASK_INDEPENDENT | \
                                      XFEATURE_MASK_SUPERVISOR_UNSUPPORTED)

/*
 * The feature mask required to restore FPU state:
 * - All user states which are not eagerly switched in switch_to()/exec()
 * - The supervisor states
 */
#define XFEATURE_MASK_FPSTATE        (XFEATURE_MASK_USER_RESTORE | \
                                 XFEATURE_MASK_SUPERVISOR_SUPPORTED)

/*
 * Features in this mask have space allocated in the signal frame, but may not
 * have that space initialized when the feature is in its init state.
 */
#define XFEATURE_MASK_SIGFRAME_INITOPT        (XFEATURE_MASK_XTILE | \
                                         XFEATURE_MASK_USER_DYNAMIC)

extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];

extern void __init update_regset_xstate_info(unsigned int size,
                                             u64 xstate_mask);

int xfeature_size(int xfeature_nr);

void xsaves(struct xregs_state *xsave, u64 mask);
void xrstors(struct xregs_state *xsave, u64 mask);

int xfd_enable_feature(u64 xfd_err);

#ifdef CONFIG_X86_64
DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);

/*
 * Returns the state of the __fpu_state_size_dynamic static key. The key is
 * declared default-false and the enabled case is laid out as the unlikely
 * branch.
 */
static __always_inline __pure bool fpu_state_size_dynamic(void)
{
        return static_branch_unlikely(&__fpu_state_size_dynamic);
}
#else
/* Without CONFIG_X86_64 there is no static key: constant false */
static __always_inline __pure bool fpu_state_size_dynamic(void)
{
        return false;
}
#endif

#endif







































































































































































































    1 

    2 










    2 
    2 
    2 

    2 


































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
        struct anon_vma *root;                /* Root of this anon_vma tree */
        /*
         * W: modification, R: walking the list.
         * The anon_vma_lock_*() helpers always operate on root->rwsem.
         */
        struct rw_semaphore rwsem;
        /*
         * The refcount is taken on an anon_vma when there is no
         * guarantee that the vma of page tables will exist for
         * the duration of the operation. A caller that takes
         * the reference is responsible for cleaning up the
         * anon_vma if they are the last user on release.
         */
        atomic_t refcount;

        /*
         * Count of child anon_vmas. Equal to the count of all anon_vmas that
         * have ->parent pointing to this one, including itself.
         *
         * This counter is used to decide whether to reuse an anon_vma
         * instead of forking a new one. See comments in function
         * anon_vma_clone.
         */
        unsigned long num_children;
        /* Count of VMAs whose ->anon_vma pointer points to this object. */
        unsigned long num_active_vmas;

        struct anon_vma *parent;        /* Parent of this anon_vma */

        /*
         * NOTE: the LSB of the rb_root.rb_node is set by
         * mm_take_all_locks() _after_ taking the above lock. So the
         * rb_root must only be read/written after taking the above lock
         * to be sure to see a valid next pointer. The LSB bit itself
         * is serialized by a system wide lock only visible to
         * mm_take_all_locks() (mm_all_locks_mutex).
         */

        /* Interval tree of private "related" vmas */
        struct rb_root_cached rb_root;
};

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
        struct vm_area_struct *vma;        /* the VMA side of this link */
        struct anon_vma *anon_vma;        /* the anon_vma side of this link */
        struct list_head same_vma;   /* locked by mmap_lock & page_table_lock */
        struct rb_node rb;                        /* locked by anon_vma->rwsem */
        /* NOTE(review): presumably interval tree bookkeeping for "rb" - confirm */
        unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
        /* only present in CONFIG_DEBUG_VM_RB builds */
        unsigned long cached_vma_start, cached_vma_last;
#endif
};

/*
 * Flag bits; every value is a distinct single bit (0x4 ... 0x80), so the
 * flags can be OR-ed together.
 * NOTE(review): "ttu" presumably stands for try_to_unmap - confirm.
 */
enum ttu_flags {
        TTU_SPLIT_HUGE_PMD        = 0x4,        /* split huge PMD if any */
        TTU_IGNORE_MLOCK        = 0x8,        /* ignore mlock */
        TTU_SYNC                = 0x10,        /* avoid racy checks with PVMW_SYNC */
        TTU_HWPOISON                = 0x20,        /* do convert pte to hwpoison entry */
        TTU_BATCH_FLUSH                = 0x40,        /* Batch TLB flushes where possible
                                         * and caller guarantees they will
                                         * do a final flush if necessary */
        TTU_RMAP_LOCKED                = 0x80,        /* do not grab rmap lock:
                                         * caller holds it */
};

#ifdef CONFIG_MMU
/* Take a reference on @anon_vma by atomically incrementing its refcount. */
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
        atomic_inc(&anon_vma->refcount);
}

void __put_anon_vma(struct anon_vma *anon_vma);

/*
 * Drop a reference on @anon_vma. When the refcount reaches zero,
 * __put_anon_vma() is called for it.
 */
static inline void put_anon_vma(struct anon_vma *anon_vma)
{
        if (atomic_dec_and_test(&anon_vma->refcount))
                __put_anon_vma(anon_vma);
}

/* Write-lock the rwsem of @anon_vma's root via down_write(). */
static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
        down_write(&anon_vma->root->rwsem);
}

/*
 * Try to write-lock the rwsem of @anon_vma's root.
 * Returns the result of down_write_trylock().
 */
static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
{
        return down_write_trylock(&anon_vma->root->rwsem);
}

/* Release the write lock on the rwsem of @anon_vma's root via up_write(). */
static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
        up_write(&anon_vma->root->rwsem);
}

/* Read-lock the rwsem of @anon_vma's root via down_read(). */
static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
        down_read(&anon_vma->root->rwsem);
}

/*
 * Try to read-lock the rwsem of @anon_vma's root.
 * Returns the result of down_read_trylock().
 */
static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
{
        return down_read_trylock(&anon_vma->root->rwsem);
}

/* Release the read lock on the rwsem of @anon_vma's root via up_read(). */
static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
        up_read(&anon_vma->root->rwsem);
}


/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);        /* create anon_vma_cachep */
int  __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);

/*
 * Make sure @vma has an anon_vma attached.
 *
 * Returns 0 right away when vma->anon_vma is already set (the expected
 * case); otherwise returns the result of __anon_vma_prepare().
 */
static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
        /* slow path: nothing attached yet */
        if (unlikely(!vma->anon_vma))
                return __anon_vma_prepare(vma);

        return 0;
}

/*
 * Called with two VMAs that are expected to share the same anon_vma (this
 * is asserted with VM_BUG_ON_VMA); unlinks the anon_vmas of @next.
 */
static inline void anon_vma_merge(struct vm_area_struct *vma,
                                  struct vm_area_struct *next)
{
        VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
        unlink_anon_vmas(next);
}

struct anon_vma *folio_get_anon_vma(struct folio *folio);

/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;

/*
 * No special request: A mapped anonymous (sub)page is possibly shared between
 * processes.
 */
#define RMAP_NONE                ((__force rmap_t)0)

/* The anonymous (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE                ((__force rmap_t)BIT(0))

/*
 * Internally, we're using an enum to specify the granularity. We make the
 * compiler emit specialized code for each granularity.
 */
enum rmap_level {
        RMAP_LEVEL_PTE = 0,
        RMAP_LEVEL_PMD,
};

/*
 * Debug-only consistency checks shared by the rmap helpers: warns about
 * hugetlb folios, about page ranges that are empty or not fully inside
 * @folio, and about PMD-level requests that do not cover exactly
 * HPAGE_PMD_NR pages of a HPAGE_PMD_NR-sized folio.
 */
static inline void __folio_rmap_sanity_checks(struct folio *folio,
                struct page *page, int nr_pages, enum rmap_level level)
{
        /* hugetlb folios are handled separately. */
        VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);

        /*
         * TODO: we get driver-allocated folios that have nothing to do with
         * the rmap using vm_insert_page(); therefore, we cannot assume that
         * folio_test_large_rmappable() holds for large folios. We should
         * handle any desired mapcount+stats accounting for these folios in
         * VM_MIXEDMAP VMAs separately, and then sanity-check here that
         * we really only get rmappable folios.
         */

        /* [page, page + nr_pages) must be non-empty and lie within the folio */
        VM_WARN_ON_ONCE(nr_pages <= 0);
        VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
        VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);

        /* nothing more to verify for PTE-level operations */
        if (level == RMAP_LEVEL_PTE)
                return;

        /* unknown granularity */
        if (level != RMAP_LEVEL_PMD) {
                VM_WARN_ON_ONCE(true);
                return;
        }

        /*
         * Folios larger than a single PMD are not supported yet, so a
         * PMD-level operation is assumed to create one "entire" mapping of
         * the folio.
         */
        VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
        VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
}

/*
 * rmap interfaces called when adding or removing pte of page
 */
void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
                struct vm_area_struct *, unsigned long address, rmap_t flags);
#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
        folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
void folio_add_anon_rmap_pmd(struct folio *, struct page *,
                struct vm_area_struct *, unsigned long address, rmap_t flags);
void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
                unsigned long address);
void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
                struct vm_area_struct *);
#define folio_add_file_rmap_pte(folio, page, vma) \
        folio_add_file_rmap_ptes(folio, page, 1, vma)
void folio_add_file_rmap_pmd(struct folio *, struct page *,
                struct vm_area_struct *);
void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
                struct vm_area_struct *);
#define folio_remove_rmap_pte(folio, page, vma) \
        folio_remove_rmap_ptes(folio, page, 1, vma)
void folio_remove_rmap_pmd(struct folio *, struct page *,
                struct vm_area_struct *);

void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
                unsigned long address, rmap_t flags);
void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
                unsigned long address);

/*
 * hugetlb counterpart of folio_try_dup_anon_rmap_*().
 *
 * Returns -EBUSY, leaving the folio untouched, if its head page is
 * PageAnonExclusive and folio_needs_cow_for_dma() reports that it needs
 * copying. Otherwise clears PageAnonExclusive (if set), increments
 * _entire_mapcount and _large_mapcount, and returns 0.
 */
static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
                struct vm_area_struct *vma)
{
        struct page *head = &folio->page;

        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

        if (PageAnonExclusive(head)) {
                if (unlikely(folio_needs_cow_for_dma(vma, folio)))
                        return -EBUSY;
                /* from now on the folio counts as possibly shared */
                ClearPageAnonExclusive(head);
        }

        atomic_inc(&folio->_entire_mapcount);
        atomic_inc(&folio->_large_mapcount);

        return 0;
}

/*
 * See folio_try_share_anon_rmap_*()
 *
 * Clears PageAnonExclusive on the head page of an anonymous hugetlb folio.
 * Returns -EBUSY, leaving the flag set, if folio_maybe_dma_pinned() reports
 * a possible pin; returns 0 after clearing the flag otherwise. The memory
 * barriers are only issued when CONFIG_HAVE_GUP_FAST is enabled.
 */
static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
        VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);

        /* Paired with the memory barrier in try_grab_folio(). */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb();

        /* the pin check must come before the flag is cleared */
        if (unlikely(folio_maybe_dma_pinned(folio)))
                return -EBUSY;
        ClearPageAnonExclusive(&folio->page);

        /*
         * This is conceptually a smp_wmb() paired with the smp_rmb() in
         * gup_must_unshare().
         */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb__after_atomic();
        return 0;
}

/*
 * Account one more mapping of a non-anonymous hugetlb folio by incrementing
 * both _entire_mapcount and _large_mapcount.
 */
static inline void hugetlb_add_file_rmap(struct folio *folio)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

        atomic_inc(&folio->_entire_mapcount);
        atomic_inc(&folio->_large_mapcount);
}

/*
 * Account the removal of one mapping of a hugetlb folio by decrementing
 * both _entire_mapcount and _large_mapcount.
 */
static inline void hugetlb_remove_rmap(struct folio *folio)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);

        atomic_dec(&folio->_entire_mapcount);
        atomic_dec(&folio->_large_mapcount);
}

/*
 * Common implementation of the folio_dup_file_rmap_*() helpers.
 *
 * RMAP_LEVEL_PMD: increments _entire_mapcount and _large_mapcount once.
 * RMAP_LEVEL_PTE: increments the _mapcount of every page in the range; for
 * large folios, _large_mapcount is additionally raised by @nr_pages.
 */
static __always_inline void __folio_dup_file_rmap(struct folio *folio,
                struct page *page, int nr_pages, enum rmap_level level)
{
        struct page *cur = page;
        int remaining = nr_pages;

        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        if (level == RMAP_LEVEL_PMD) {
                atomic_inc(&folio->_entire_mapcount);
                atomic_inc(&folio->_large_mapcount);
                return;
        }

        if (level != RMAP_LEVEL_PTE)
                return;

        /* small folio: a single page, no folio-wide counter to update */
        if (!folio_test_large(folio)) {
                atomic_inc(&page->_mapcount);
                return;
        }

        /* per-page counters first, then the folio-wide total */
        do {
                atomic_inc(&cur->_mapcount);
                cur++;
        } while (--remaining > 0);
        atomic_add(nr_pages, &folio->_large_mapcount);
}

/**
 * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
 * @folio:        The folio to duplicate the mappings of
 * @page:        The first page to duplicate the mappings of
 * @nr_pages:        The number of pages of which the mapping will be duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
                struct page *page, int nr_pages)
{
        /* thin wrapper: PTE-level duplication of the whole page range */
        __folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE);
}

/*
 * Single-page variant of folio_dup_file_rmap_ptes(): duplicates the PTE
 * mapping of exactly one page of @folio.
 */
static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
                struct page *page)
{
        __folio_dup_file_rmap(folio, page, 1, RMAP_LEVEL_PTE);
}

/**
 * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
 * @folio:        The folio to duplicate the mapping of
 * @page:        The first page to duplicate the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock.
 *
 * Without CONFIG_TRANSPARENT_HUGEPAGE this only triggers a one-time warning.
 */
static inline void folio_dup_file_rmap_pmd(struct folio *folio,
                struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /*
         * This has to be RMAP_LEVEL_PMD: a PMD mapping is accounted as one
         * "entire" mapping (_entire_mapcount), not as HPAGE_PMD_NR individual
         * PTE mappings. This matches folio_try_dup_anon_rmap_pmd().
         */
        __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PMD);
#else
        WARN_ON_ONCE(true);
#endif
}

/*
 * Common implementation of the folio_try_dup_anon_rmap_*() helpers.
 *
 * Returns -EBUSY, before anything has been modified, if the folio may be
 * pinned and an affected page is PageAnonExclusive. Otherwise clears
 * PageAnonExclusive on the affected page(s), increments the mapcounts for
 * the requested level and returns 0.
 */
static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *src_vma,
                enum rmap_level level)
{
        const int orig_nr_pages = nr_pages;
        bool maybe_pinned;
        int i;

        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        /*
         * If this folio may have been pinned by the parent process,
         * don't allow to duplicate the mappings but instead require to e.g.,
         * copy the subpage immediately for the child so that we'll always
         * guarantee the pinned folio won't be randomly replaced in the
         * future on write faults.
         */
        maybe_pinned = likely(!folio_is_device_private(folio)) &&
                       unlikely(folio_needs_cow_for_dma(src_vma, folio));

        /*
         * No need to check+clear for already shared PTEs/PMDs of the
         * folio. But if any page is PageAnonExclusive, we must fallback to
         * copying if the folio maybe pinned.
         */
        switch (level) {
        case RMAP_LEVEL_PTE:
                /* scan the whole range first so that failure changes nothing */
                if (unlikely(maybe_pinned)) {
                        for (i = 0; i < nr_pages; i++)
                                if (PageAnonExclusive(page + i))
                                        return -EBUSY;
                }

                /* small folio: single page, no folio-wide counter to update */
                if (!folio_test_large(folio)) {
                        if (PageAnonExclusive(page))
                                ClearPageAnonExclusive(page);
                        atomic_inc(&page->_mapcount);
                        break;
                }

                /* nr_pages is consumed by this loop; orig_nr_pages keeps the total */
                do {
                        if (PageAnonExclusive(page))
                                ClearPageAnonExclusive(page);
                        atomic_inc(&page->_mapcount);
                } while (page++, --nr_pages > 0);
                atomic_add(orig_nr_pages, &folio->_large_mapcount);
                break;
        case RMAP_LEVEL_PMD:
                /* only the flag of @page itself is examined at PMD level */
                if (PageAnonExclusive(page)) {
                        if (unlikely(maybe_pinned))
                                return -EBUSY;
                        ClearPageAnonExclusive(page);
                }
                atomic_inc(&folio->_entire_mapcount);
                atomic_inc(&folio->_large_mapcount);
                break;
        }
        return 0;
}

/**
 * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
 *                                  of a folio
 * @folio:        The folio to duplicate the mappings of
 * @page:        The first page to duplicate the mappings of
 * @nr_pages:        The number of pages of which the mapping will be duplicated
 * @src_vma:        The vm area from which the mappings are duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock and the
 * vma->vma_mm->write_protect_seq.
 *
 * Duplicating the mappings can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *src_vma)
{
        /* thin wrapper: PTE-level duplication of the whole page range */
        return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma,
                                         RMAP_LEVEL_PTE);
}

/*
 * Single-page variant of folio_try_dup_anon_rmap_ptes(): tries to duplicate
 * the PTE mapping of exactly one page. Returns 0 on success, -EBUSY
 * otherwise.
 */
static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
                struct page *page, struct vm_area_struct *src_vma)
{
        return __folio_try_dup_anon_rmap(folio, page, 1, src_vma,
                                         RMAP_LEVEL_PTE);
}

/**
 * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
 *                                 of a folio
 * @folio:        The folio to duplicate the mapping of
 * @page:        The first page to duplicate the mapping of
 * @src_vma:        The vm area from which the mapping is duplicated
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and the
 * vma->vma_mm->write_protect_seq.
 *
 * Duplicating the mapping can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
                struct page *page, struct vm_area_struct *src_vma)
{
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
        /* Not expected to be reached without THP: warn once and refuse. */
        WARN_ON_ONCE(true);
        return -EBUSY;
#else
        /* A PMD mapping always spans HPAGE_PMD_NR pages starting at @page. */
        return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, src_vma,
                                         RMAP_LEVEL_PMD);
#endif
}

/*
 * Common implementation behind folio_try_share_anon_rmap_pte() and
 * folio_try_share_anon_rmap_pmd(): clear PageAnonExclusive on @page unless
 * the folio may be DMA-pinned.
 *
 * @nr_pages and @level are only consumed by __folio_rmap_sanity_checks(); the
 * flag itself is cleared on @page alone.
 *
 * Returns 0 once PageAnonExclusive has been cleared, or -EBUSY (flag left
 * untouched) when folio_maybe_dma_pinned() reports a possible pin. Device
 * private folios skip the pin check and always return 0.
 */
static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
                struct page *page, int nr_pages, enum rmap_level level)
{
        /* Only exclusive anonymous pages may be marked possibly shared. */
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
        VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        /* device private folios cannot get pinned via GUP. */
        if (unlikely(folio_is_device_private(folio))) {
                ClearPageAnonExclusive(page);
                return 0;
        }

        /*
         * We have to make sure that when we clear PageAnonExclusive, that
         * the page is not pinned and that concurrent GUP-fast won't succeed in
         * concurrently pinning the page.
         *
         * Conceptually, PageAnonExclusive clearing consists of:
         * (A1) Clear PTE
         * (A2) Check if the page is pinned; back off if so.
         * (A3) Clear PageAnonExclusive
         * (A4) Restore PTE (optional, but certainly not writable)
         *
         * When clearing PageAnonExclusive, we cannot possibly map the page
         * writable again, because anon pages that may be shared must never
         * be writable. So in any case, if the PTE was writable it cannot
         * be writable anymore afterwards and there would be a PTE change. Only
         * if the PTE wasn't writable, there might not be a PTE change.
         *
         * Conceptually, GUP-fast pinning of an anon page consists of:
         * (B1) Read the PTE
         * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
         * (B3) Pin the mapped page
         * (B4) Check if the PTE changed by re-reading it; back off if so.
         * (B5) If the original PTE is not writable, check if
         *        PageAnonExclusive is not set; back off if so.
         *
         * If the PTE was writable, we only have to make sure that GUP-fast
         * observes a PTE change and properly backs off.
         *
         * If the PTE was not writable, we have to make sure that GUP-fast either
         * detects a (temporary) PTE change or that PageAnonExclusive is cleared
         * and properly backs off.
         *
         * Consequently, when clearing PageAnonExclusive(), we have to make
         * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
         * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
         * and (B5) happen in the right memory order.
         *
         * We assume that there might not be a memory barrier after
         * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
         * so we use explicit ones here.
         */

        /* Paired with the memory barrier in try_grab_folio(). */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb();

        /* (A2): back off without touching the flag if a pin is possible. */
        if (unlikely(folio_maybe_dma_pinned(folio)))
                return -EBUSY;
        /* (A3) */
        ClearPageAnonExclusive(page);

        /*
         * This is conceptually a smp_wmb() paired with the smp_rmb() in
         * gup_must_unshare().
         */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb__after_atomic();
        return 0;
}

/**
 * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
 *                                   mapped by a PTE possibly shared to prepare
 *                                   for KSM or temporary unmapping
 * @folio:        The folio to share a mapping of
 * @page:        The mapped exclusive page
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
 * fork() to duplicate mappings, but instead to prepare for KSM or temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte().
 *
 * Marking the mapped page shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function cannot
 * fail for them.
 *
 * Returns 0 if marking the mapped page possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
                struct page *page)
{
        /* A PTE maps a single page, so the range is just @page itself. */
        const int nr_pages = 1;

        return __folio_try_share_anon_rmap(folio, page, nr_pages,
                                           RMAP_LEVEL_PTE);
}

/**
 * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
 *                                   range mapped by a PMD possibly shared to
 *                                   prepare for temporary unmapping
 * @folio:        The folio to share the mapping of
 * @page:        The first page to share the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
 * fork() to duplicate a mapping, but instead to prepare for temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
 *
 * Marking the mapped pages shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function cannot
 * fail for them.
 *
 * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
                struct page *page)
{
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
        /* Not expected to be reached without THP: warn once and refuse. */
        WARN_ON_ONCE(true);
        return -EBUSY;
#else
        /* A PMD mapping always spans HPAGE_PMD_NR pages starting at @page. */
        return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
                                           RMAP_LEVEL_PMD);
#endif
}

/*
 * Called from mm/vmscan.c to handle paging out
 */
int folio_referenced(struct folio *folio, int is_locked,
                        struct mem_cgroup *memcg, unsigned long *vm_flags);

void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *folio, enum ttu_flags flags);

int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
                                unsigned long end, struct page **pages,
                                void *arg);

/* Avoid racy checks */
#define PVMW_SYNC                (1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION                (1 << 1)

/*
 * State passed to page_vma_mapped_walk() and page_vma_mapped_walk_done().
 *
 * DEFINE_PAGE_VMA_WALK() and DEFINE_FOLIO_VMA_WALK() fill in pfn, nr_pages,
 * pgoff, vma, address and flags; the members they do not name (pmd, pte, ptl)
 * start out zero-initialized.
 */
struct page_vma_mapped_walk {
        /* Initialized from page_to_pfn()/folio_pfn() of the target. */
        unsigned long pfn;
        /* Initialized from compound_nr()/folio_nr_pages() of the target. */
        unsigned long nr_pages;
        /* Initialized from page_to_pgoff()/folio_pgoff() of the target. */
        pgoff_t pgoff;
        /* VMA supplied by the caller; checked for HugeTLB when finishing. */
        struct vm_area_struct *vma;
        /* Caller-supplied address (presumably within @vma -- TODO confirm). */
        unsigned long address;
        /* NOTE(review): presumably the PMD located by the walk -- confirm. */
        pmd_t *pmd;
        /* If set, unmapped by page_vma_mapped_walk_done() (not for HugeTLB). */
        pte_t *pte;
        /* If set, unlocked by page_vma_mapped_walk_done(). */
        spinlock_t *ptl;
        /* Caller-supplied flags (presumably PVMW_* bits -- TODO confirm). */
        unsigned int flags;
};

/*
 * Declare a struct page_vma_mapped_walk called @name for @_page: pfn, nr_pages
 * and pgoff are derived from @_page, while vma, address and flags are taken
 * from the remaining arguments. Members not named here (pmd, pte, ptl) are
 * zero-initialized. Note that @_page is expanded three times.
 */
#define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags)        \
        struct page_vma_mapped_walk name = {                                \
                .pfn = page_to_pfn(_page),                                \
                .nr_pages = compound_nr(_page),                                \
                .pgoff = page_to_pgoff(_page),                                \
                .vma = _vma,                                                \
                .address = _address,                                        \
                .flags = _flags,                                        \
        }

/*
 * Declare a struct page_vma_mapped_walk called @name for @_folio: pfn,
 * nr_pages and pgoff are derived from @_folio, while vma, address and flags
 * are taken from the remaining arguments. Members not named here (pmd, pte,
 * ptl) are zero-initialized. Note that @_folio is expanded three times.
 */
#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)        \
        struct page_vma_mapped_walk name = {                                \
                .pfn = folio_pfn(_folio),                                \
                .nr_pages = folio_nr_pages(_folio),                        \
                .pgoff = folio_pgoff(_folio),                                \
                .vma = _vma,                                                \
                .address = _address,                                        \
                .flags = _flags,                                        \
        }

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
        /*
         * For HugeTLB the pte pointer refers directly to the relevant page
         * table entry and was never pte-mapped, so it must not be unmapped.
         */
        if (pvmw->pte) {
                if (!is_vm_hugetlb_page(pvmw->vma))
                        pte_unmap(pvmw->pte);
        }

        /* Drop the page table lock only if one is recorded. */
        if (!pvmw->ptl)
                return;
        spin_unlock(pvmw->ptl);
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);

/*
 * Used by swapoff to help locate where page is expected in vma.
 */
unsigned long page_address_in_vma(struct page *page,
                                  struct vm_area_struct *vma);

/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be readonly, write protects them too)
 *
 * returns the number of cleaned PTEs.
 */
int folio_mkclean(struct folio *folio);

int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
                      struct vm_area_struct *vma);

void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);

unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);

/*
 * rmap_walk_control: controls rmap traversal for the caller's specific needs
 *
 * arg: passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicates that the rmap traversal bailed out due to lock
 *            contention
 * rmap_one: executed on each vma where the page is mapped
 * done: checks the termination condition of the traversal
 * anon_lock: obtains the anon_vma lock in an optimized way rather than the
 *            default one
 * invalid_vma: used to skip vmas that are of no interest
 */
struct rmap_walk_control {
        void *arg;
        bool try_lock;
        bool contended;
        /*
         * Return false if page table scanning in rmap_walk should be stopped.
         * Otherwise, return true.
         */
        bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
                                        unsigned long addr, void *arg);
        int (*done)(struct folio *folio);
        struct anon_vma *(*anon_lock)(struct folio *folio,
                                      struct rmap_walk_control *rwc);
        bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};

void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
                                          struct rmap_walk_control *rwc);

#else        /* !CONFIG_MMU */

#define anon_vma_init()                do {} while (0)
#define anon_vma_prepare(vma)        (0)

static inline int folio_referenced(struct folio *folio, int is_locked,
                                  struct mem_cgroup *memcg,
                                  unsigned long *vm_flags)
{
        /* !CONFIG_MMU stub: clear the caller's flags and report zero. */
        const int nr_referenced = 0;

        *vm_flags = 0UL;
        return nr_referenced;
}

static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
        /* !CONFIG_MMU stub: intentionally does nothing. */
}

static inline int folio_mkclean(struct folio *folio)
{
        /* !CONFIG_MMU stub: always reports zero. */
        const int nr_cleaned = 0;

        return nr_cleaned;
}
#endif        /* CONFIG_MMU */

static inline int page_mkclean(struct page *page)
{
        /* Page-based wrapper: resolve the folio, then defer to folio_mkclean(). */
        struct folio *folio = page_folio(page);

        return folio_mkclean(folio);
}
#endif        /* _LINUX_RMAP_H */


































































































































































































































































































































































































































































































































































































    1 












    1 


    1 


    1 



    1 

    1 


    1 

    1 








    1 

    1 








    1 



    1 



    1 














































    1 




























































    1 





    1 

    1 
    1 
    1 




    1 






    1 















    1 


    1 


    1 


    1 







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2008 IBM Corporation
 * Author: Mimi Zohar <zohar@us.ibm.com>
 *
 * ima_policy.c
 *        - initialize default measure policy rules
 */

#include <linux/init.h>
#include <linux/list.h>
#include <linux/kernel_read_file.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/magic.h>
#include <linux/parser.h>
#include <linux/slab.h>
#include <linux/rculist.h>
#include <linux/seq_file.h>
#include <linux/ima.h>

#include "ima.h"

/* flags definitions */
#define IMA_FUNC        0x0001
#define IMA_MASK        0x0002
#define IMA_FSMAGIC        0x0004
#define IMA_UID                0x0008
#define IMA_FOWNER        0x0010
#define IMA_FSUUID        0x0020
#define IMA_INMASK        0x0040
#define IMA_EUID        0x0080
#define IMA_PCR                0x0100
#define IMA_FSNAME        0x0200
#define IMA_KEYRINGS        0x0400
#define IMA_LABEL        0x0800
#define IMA_VALIDATE_ALGOS        0x1000
#define IMA_GID                0x2000
#define IMA_EGID        0x4000
#define IMA_FGROUP        0x8000

#define UNKNOWN                0
#define MEASURE                0x0001        /* same as IMA_MEASURE */
#define DONT_MEASURE        0x0002
#define APPRAISE        0x0004        /* same as IMA_APPRAISE */
#define DONT_APPRAISE        0x0008
#define AUDIT                0x0040
#define HASH                0x0100
#define DONT_HASH        0x0200

#define INVALID_PCR(a) (((a) < 0) || \
        (a) >= (sizeof_field(struct ima_iint_cache, measured_pcrs) * 8))

int ima_policy_flag;
static int temp_ima_appraise;
static int build_ima_appraise __ro_after_init;

atomic_t ima_setxattr_allowed_hash_algorithms;

#define MAX_LSM_RULES 6
enum lsm_rule_types { LSM_OBJ_USER, LSM_OBJ_ROLE, LSM_OBJ_TYPE,
        LSM_SUBJ_USER, LSM_SUBJ_ROLE, LSM_SUBJ_TYPE
};

enum policy_types { ORIGINAL_TCB = 1, DEFAULT_TCB };

enum policy_rule_list { IMA_DEFAULT_POLICY = 1, IMA_CUSTOM_POLICY };

/*
 * Counted list of strings, used by struct ima_rule_entry for its keyrings and
 * label options.
 */
struct ima_rule_opt_list {
        size_t count;                        /* number of entries in items[] */
        char *items[] __counted_by(count);        /* flexible array of strings */
};

/*
 * These comparators are needed nowhere outside of ima so just define them here.
 * This pattern should hopefully never be needed outside of ima.
 */
static inline bool vfsuid_gt_kuid(vfsuid_t vfsuid, kuid_t kuid)
{
        /* Strictly greater: compares the raw values of @vfsuid and @kuid. */
        return __kuid_val(kuid) < __vfsuid_val(vfsuid);
}

static inline bool vfsgid_gt_kgid(vfsgid_t vfsgid, kgid_t kgid)
{
        /* Strictly greater: compares the raw values of @vfsgid and @kgid. */
        return __kgid_val(kgid) < __vfsgid_val(vfsgid);
}

static inline bool vfsuid_lt_kuid(vfsuid_t vfsuid, kuid_t kuid)
{
        /* Strictly less: compares the raw values of @vfsuid and @kuid. */
        return __kuid_val(kuid) > __vfsuid_val(vfsuid);
}

static inline bool vfsgid_lt_kgid(vfsgid_t vfsgid, kgid_t kgid)
{
        /* Strictly less: compares the raw values of @vfsgid and @kgid. */
        return __kgid_val(kgid) > __vfsgid_val(vfsgid);
}

/*
 * One IMA policy rule. The built-in rule tables in this file set .action to
 * one of the MEASURE/DONT_MEASURE/APPRAISE/DONT_APPRAISE values and .flags to
 * a mask of IMA_* bits naming the match fields they fill in.
 */
struct ima_rule_entry {
        struct list_head list;
        int action;
        unsigned int flags;
        /*
         * Match criteria. NOTE(review): presumably each is only consulted when
         * the corresponding IMA_* bit is set in flags -- confirm against the
         * matching code.
         */
        enum ima_hooks func;
        int mask;
        unsigned long fsmagic;
        uuid_t fsuuid;
        kuid_t uid;
        kgid_t gid;
        kuid_t fowner;
        kgid_t fgroup;
        bool (*uid_op)(kuid_t cred_uid, kuid_t rule_uid);    /* Handlers for operators       */
        bool (*gid_op)(kgid_t cred_gid, kgid_t rule_gid);
        bool (*fowner_op)(vfsuid_t vfsuid, kuid_t rule_uid); /* vfsuid_eq_kuid(), vfsuid_gt_kuid(), vfsuid_lt_kuid() */
        bool (*fgroup_op)(vfsgid_t vfsgid, kgid_t rule_gid); /* vfsgid_eq_kgid(), vfsgid_gt_kgid(), vfsgid_lt_kgid() */
        int pcr;
        unsigned int allowed_algos; /* bitfield of allowed hash algorithms */
        /* One slot per enum lsm_rule_types value. */
        struct {
                void *rule;        /* LSM file metadata specific */
                char *args_p;        /* audit value */
                int type;        /* audit type */
        } lsm[MAX_LSM_RULES];
        char *fsname;
        struct ima_rule_opt_list *keyrings; /* Measure keys added to these keyrings */
        struct ima_rule_opt_list *label; /* Measure data grouped under this label */
        struct ima_template_desc *template;
};

/*
 * Sanity check in case the kernel gains more hash algorithms than can
 * fit in an unsigned int: allowed_algos needs one bit per algorithm.
 */
static_assert(
        8 * sizeof(unsigned int) >= HASH_ALGO__LAST,
        "The bitfield allowed_algos in ima_rule_entry is too small to contain all the supported hash algorithms, consider using a bigger type");

/*
 * Without LSM specific knowledge, the default policy can only be
 * written in terms of .action, .func, .mask, .fsmagic, .uid, .gid,
 * .fowner, and .fgroup
 */

/*
 * The minimum rule set to allow for full TCB coverage.  Measures all files
 * opened or mmap for exec and everything read by root.  Dangerous because
 * normal users can easily run the machine out of memory simply building
 * and running executables.
 */
/*
 * DONT_MEASURE rules, each matching on a filesystem magic number only
 * (IMA_FSMAGIC).
 */
static struct ima_rule_entry dont_measure_rules[] __ro_after_init = {
        {.action = DONT_MEASURE, .fsmagic = PROC_SUPER_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = SYSFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = DEBUGFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = TMPFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = DEVPTS_SUPER_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = BINFMTFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = SECURITYFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = SELINUX_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = SMACK_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = CGROUP_SUPER_MAGIC,
         .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = CGROUP2_SUPER_MAGIC,
         .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = NSFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = EFIVARFS_MAGIC, .flags = IMA_FSMAGIC}
};

/*
 * MEASURE rules: MMAP_CHECK and BPRM_CHECK with MAY_EXEC, FILE_CHECK with
 * MAY_READ when the uid equals GLOBAL_ROOT_UID, plus MODULE_CHECK and
 * FIRMWARE_CHECK unconditionally.
 * NOTE(review): presumably the set selected by ORIGINAL_TCB -- confirm.
 */
static struct ima_rule_entry original_measurement_rules[] __ro_after_init = {
        {.action = MEASURE, .func = MMAP_CHECK, .mask = MAY_EXEC,
         .flags = IMA_FUNC | IMA_MASK},
        {.action = MEASURE, .func = BPRM_CHECK, .mask = MAY_EXEC,
         .flags = IMA_FUNC | IMA_MASK},
        {.action = MEASURE, .func = FILE_CHECK, .mask = MAY_READ,
         .uid = GLOBAL_ROOT_UID, .uid_op = &uid_eq,
         .flags = IMA_FUNC | IMA_MASK | IMA_UID},
        {.action = MEASURE, .func = MODULE_CHECK, .flags = IMA_FUNC},
        {.action = MEASURE, .func = FIRMWARE_CHECK, .flags = IMA_FUNC},
};

/*
 * MEASURE rules. Compared with original_measurement_rules, the FILE_CHECK
 * entries use IMA_INMASK instead of IMA_MASK and come in two variants
 * (IMA_EUID and IMA_UID, both against GLOBAL_ROOT_UID), and POLICY_CHECK is
 * measured as well.
 * NOTE(review): presumably the set selected by DEFAULT_TCB -- confirm.
 */
static struct ima_rule_entry default_measurement_rules[] __ro_after_init = {
        {.action = MEASURE, .func = MMAP_CHECK, .mask = MAY_EXEC,
         .flags = IMA_FUNC | IMA_MASK},
        {.action = MEASURE, .func = BPRM_CHECK, .mask = MAY_EXEC,
         .flags = IMA_FUNC | IMA_MASK},
        {.action = MEASURE, .func = FILE_CHECK, .mask = MAY_READ,
         .uid = GLOBAL_ROOT_UID, .uid_op = &uid_eq,
         .flags = IMA_FUNC | IMA_INMASK | IMA_EUID},
        {.action = MEASURE, .func = FILE_CHECK, .mask = MAY_READ,
         .uid = GLOBAL_ROOT_UID, .uid_op = &uid_eq,
         .flags = IMA_FUNC | IMA_INMASK | IMA_UID},
        {.action = MEASURE, .func = MODULE_CHECK, .flags = IMA_FUNC},
        {.action = MEASURE, .func = FIRMWARE_CHECK, .flags = IMA_FUNC},
        {.action = MEASURE, .func = POLICY_CHECK, .flags = IMA_FUNC},
};

/*
 * Built-in appraise rules that ima_init_policy() appends to the default rule
 * list when ima_use_appraise_tcb is set ("ima_policy=appraise_tcb" or the
 * "ima_appraise_tcb" boot parameter).
 *
 * The leading DONT_APPRAISE entries exclude filesystems by superblock magic
 * (IMA_FSMAGIC); the trailing entries are selected at build time.
 */
static struct ima_rule_entry default_appraise_rules[] __ro_after_init = {
        {.action = DONT_APPRAISE, .fsmagic = PROC_SUPER_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = SYSFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = DEBUGFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = TMPFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = RAMFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = DEVPTS_SUPER_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = BINFMTFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = SECURITYFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = SELINUX_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = SMACK_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = NSFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = EFIVARFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = CGROUP_SUPER_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = CGROUP2_SUPER_MAGIC, .flags = IMA_FSMAGIC},
#ifdef CONFIG_IMA_WRITE_POLICY
        /* POLICY_CHECK hook is appraised with IMA_DIGSIG_REQUIRED */
        {.action = APPRAISE, .func = POLICY_CHECK,
        .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
#endif
#ifndef CONFIG_IMA_APPRAISE_SIGNED_INIT
        /* Appraise files whose owner compares equal to GLOBAL_ROOT_UID */
        {.action = APPRAISE, .fowner = GLOBAL_ROOT_UID, .fowner_op = &vfsuid_eq_kuid,
         .flags = IMA_FOWNER},
#else
        /* force signature */
        {.action = APPRAISE, .fowner = GLOBAL_ROOT_UID, .fowner_op = &vfsuid_eq_kuid,
         .flags = IMA_FOWNER | IMA_DIGSIG_REQUIRED},
#endif
};

/*
 * Appraise rules selected at build time; each entry exists only when the
 * corresponding CONFIG_IMA_APPRAISE_REQUIRE_*_SIGS option is enabled, so the
 * array may be empty.
 *
 * ima_init_policy() always adds these to the custom policy list and, unless
 * the "secure_boot" policy is in use, to the default list as well.
 * add_rules() records the appraise flags of this particular array in
 * build_ima_appraise rather than temp_ima_appraise.
 */
static struct ima_rule_entry build_appraise_rules[] __ro_after_init = {
#ifdef CONFIG_IMA_APPRAISE_REQUIRE_MODULE_SIGS
        {.action = APPRAISE, .func = MODULE_CHECK,
         .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
#endif
#ifdef CONFIG_IMA_APPRAISE_REQUIRE_FIRMWARE_SIGS
        {.action = APPRAISE, .func = FIRMWARE_CHECK,
         .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
#endif
#ifdef CONFIG_IMA_APPRAISE_REQUIRE_KEXEC_SIGS
        {.action = APPRAISE, .func = KEXEC_KERNEL_CHECK,
         .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
#endif
#ifdef CONFIG_IMA_APPRAISE_REQUIRE_POLICY_SIGS
        {.action = APPRAISE, .func = POLICY_CHECK,
         .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
#endif
};

/*
 * Built-in "secure_boot" policy: APPRAISE rules with IMA_DIGSIG_REQUIRED for
 * the MODULE_CHECK, FIRMWARE_CHECK, KEXEC_KERNEL_CHECK and POLICY_CHECK
 * hooks.  ima_init_policy() adds them to the default rule list when
 * ima_use_secure_boot is set ("ima_policy=secure_boot").
 */
static struct ima_rule_entry secure_boot_rules[] __ro_after_init = {
        {.action = APPRAISE, .func = MODULE_CHECK,
         .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
        {.action = APPRAISE, .func = FIRMWARE_CHECK,
         .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
        {.action = APPRAISE, .func = KEXEC_KERNEL_CHECK,
         .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
        {.action = APPRAISE, .func = POLICY_CHECK,
         .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
};

/*
 * Built-in "critical_data" policy: a single MEASURE rule for the
 * CRITICAL_DATA hook with no label restriction.  ima_init_policy() adds it
 * to the default rule list when ima_use_critical_data is set
 * ("ima_policy=critical_data").
 */
static struct ima_rule_entry critical_data_rules[] __ro_after_init = {
        {.action = MEASURE, .func = CRITICAL_DATA, .flags = IMA_FUNC},
};

/*
 * An array of architecture specific rules, allocated and filled in by
 * ima_init_arch_policy() and freed by ima_update_policy() once a custom
 * policy replaces the default one.
 */
static struct ima_rule_entry *arch_policy_entry __ro_after_init;

/* Rules installed at boot by add_rules() with IMA_DEFAULT_POLICY */
static LIST_HEAD(ima_default_rules);
/*
 * Custom policy: rules duplicated by add_rules() with IMA_CUSTOM_POLICY plus
 * everything ima_update_policy() splices in from ima_temp_rules
 */
static LIST_HEAD(ima_policy_rules);
/* Staging list that ima_update_policy() moves onto ima_policy_rules */
static LIST_HEAD(ima_temp_rules);
/*
 * The list currently consulted by RCU readers.  Starts out as
 * ima_default_rules and is switched to ima_policy_rules by
 * ima_update_policy().
 */
static struct list_head __rcu *ima_rules = (struct list_head __rcu *)(&ima_default_rules);

/*
 * Built-in measurement policy chosen on the kernel command line
 * (ORIGINAL_TCB or DEFAULT_TCB); zero means no built-in measurement rules.
 */
static int ima_policy __initdata;

/*
 * Handler for the bare "ima_tcb" boot parameter.
 *
 * Selects the ORIGINAL_TCB built-in measurement policy unless a policy has
 * already been chosen; the first selection wins.  @str is not used.
 * Always returns 1.
 */
static int __init default_measure_policy_setup(char *str)
{
        if (!ima_policy)
                ima_policy = ORIGINAL_TCB;

        return 1;
}
__setup("ima_tcb", default_measure_policy_setup);

/* Built-in policies requested on the command line, see policy_setup() */
static bool ima_use_appraise_tcb __initdata;
static bool ima_use_secure_boot __initdata;
static bool ima_use_critical_data __initdata;
/* When set, ima_match_policy() adds IMA_FAIL_UNVERIFIABLE_SIGS for appraisal */
static bool ima_fail_unverifiable_sigs __ro_after_init;

/*
 * policy_setup - parse the "ima_policy=" boot parameter
 * @str: parameter value; policy names separated by ' ', '|' or '\n'
 *
 * Recognised names are "tcb", "appraise_tcb", "secure_boot", "critical_data"
 * and "fail_securely".  "tcb" only takes effect if no measurement policy has
 * been selected yet.  Unknown names are reported with pr_err() and otherwise
 * ignored.
 *
 * Always returns 1.
 */
static int __init policy_setup(char *str)
{
        char *p;

        while ((p = strsep(&str, " |\n")) != NULL) {
                /*
                 * strsep() overwrites every separator with NUL, so adjacent
                 * or trailing separators produce empty tokens; skip them
                 * instead of reporting an unknown policy "".
                 */
                if (!*p)
                        continue;

                if (strcmp(p, "tcb") == 0) {
                        /* The first measurement policy selected wins */
                        if (!ima_policy)
                                ima_policy = DEFAULT_TCB;
                } else if (strcmp(p, "appraise_tcb") == 0) {
                        ima_use_appraise_tcb = true;
                } else if (strcmp(p, "secure_boot") == 0) {
                        ima_use_secure_boot = true;
                } else if (strcmp(p, "critical_data") == 0) {
                        ima_use_critical_data = true;
                } else if (strcmp(p, "fail_securely") == 0) {
                        ima_fail_unverifiable_sigs = true;
                } else {
                        pr_err("policy \"%s\" not found\n", p);
                }
        }

        return 1;
}
__setup("ima_policy=", policy_setup);

/*
 * Handler for the bare "ima_appraise_tcb" boot parameter: requests the
 * built-in appraise rules by setting ima_use_appraise_tcb, the same flag
 * that "ima_policy=appraise_tcb" sets.  @str is not used.  Always returns 1.
 */
static int __init default_appraise_policy_setup(char *str)
{
        ima_use_appraise_tcb = true;
        return 1;
}
__setup("ima_appraise_tcb", default_appraise_policy_setup);

static struct ima_rule_opt_list *ima_alloc_rule_opt_list(const substring_t *src)
{
        struct ima_rule_opt_list *opt_list;
        size_t count = 0;
        char *src_copy;
        char *cur, *next;
        size_t i;

        src_copy = match_strdup(src);
        if (!src_copy)
                return ERR_PTR(-ENOMEM);

        next = src_copy;
        while ((cur = strsep(&next, "|"))) {
                /* Don't accept an empty list item */
                if (!(*cur)) {
                        kfree(src_copy);
                        return ERR_PTR(-EINVAL);
                }
                count++;
        }

        /* Don't accept an empty list */
        if (!count) {
                kfree(src_copy);
                return ERR_PTR(-EINVAL);
        }

        opt_list = kzalloc(struct_size(opt_list, items, count), GFP_KERNEL);
        if (!opt_list) {
                kfree(src_copy);
                return ERR_PTR(-ENOMEM);
        }
        opt_list->count = count;

        /*
         * strsep() has already replaced all instances of '|' with '\0',
         * leaving a byte sequence of NUL-terminated strings. Reference each
         * string with the array of items.
         *
         * IMPORTANT: Ownership of the allocated buffer is transferred from
         * src_copy to the first element in the items array. To free the
         * buffer, kfree() must only be called on the first element of the
         * array.
         */
        for (i = 0, cur = src_copy; i < count; i++) {
                opt_list->items[i] = cur;
                cur = strchr(cur, '\0') + 1;
        }

        return opt_list;
}

/*
 * ima_free_rule_opt_list - free a list built by ima_alloc_rule_opt_list()
 * @opt_list: list to free, may be NULL
 *
 * All items share one buffer whose start is items[0], so only that element
 * is passed to kfree() before the list itself is released.
 */
static void ima_free_rule_opt_list(struct ima_rule_opt_list *opt_list)
{
        if (opt_list) {
                if (opt_list->count != 0) {
                        kfree(opt_list->items[0]);
                        opt_list->count = 0;
                }

                kfree(opt_list);
        }
}

/*
 * ima_lsm_free_rule - release the LSM state attached to a rule
 * @entry: rule whose lsm[] slots are to be released
 *
 * For every slot, frees the LSM filter rule and the args_p string.  The
 * entry itself is left allocated.
 */
static void ima_lsm_free_rule(struct ima_rule_entry *entry)
{
        int slot = 0;

        while (slot < MAX_LSM_RULES) {
                ima_filter_rule_free(entry->lsm[slot].rule);
                kfree(entry->lsm[slot].args_p);
                slot++;
        }
}

/*
 * ima_free_rule - release a rule entry together with the memory it owns
 * @entry: rule to free, may be NULL
 *
 * Frees the fsname string, the keyrings and label option lists, the LSM
 * state of every lsm[] slot and finally the entry itself.
 */
static void ima_free_rule(struct ima_rule_entry *entry)
{
        if (!entry)
                return;

        /*
         * entry->template->fields may be allocated in ima_parse_rule() but that
         * reference is owned by the corresponding ima_template_desc element in
         * the defined_templates list and cannot be freed here
         */
        kfree(entry->fsname);
        ima_free_rule_opt_list(entry->keyrings);
        /*
         * label is an ima_rule_opt_list exactly like keyrings (see
         * ima_match_rule_data()); release it too so it is not leaked along
         * with the entry.  ima_free_rule_opt_list() accepts NULL.
         */
        ima_free_rule_opt_list(entry->label);
        ima_lsm_free_rule(entry);
        kfree(entry);
}

/*
 * ima_lsm_copy_rule - duplicate a rule with freshly initialised LSM rules
 * @entry: rule to copy
 * @gfp: allocation flags for the copy and for the LSM rule initialisation
 *
 * The copy is shallow: every pointer, including each lsm[].args_p string, is
 * shared with @entry.  Only the lsm[].rule objects are created anew, for the
 * slots in which @entry has an args_p string.  A slot whose LSM rule cannot
 * be initialised is left with a NULL rule and a warning is logged.
 *
 * Returns the copy, or NULL if it could not be allocated.
 */
static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry,
                                                gfp_t gfp)
{
        struct ima_rule_entry *copy;
        int slot;

        copy = kmemdup(entry, sizeof(*copy), gfp);
        if (!copy)
                return NULL;

        /* Start from empty LSM slots; only populated ones are rebuilt */
        memset(copy->lsm, 0, sizeof_field(struct ima_rule_entry, lsm));

        for (slot = 0; slot < MAX_LSM_RULES; slot++) {
                if (entry->lsm[slot].args_p) {
                        copy->lsm[slot].type = entry->lsm[slot].type;
                        copy->lsm[slot].args_p = entry->lsm[slot].args_p;

                        ima_filter_rule_init(copy->lsm[slot].type, Audit_equal,
                                             copy->lsm[slot].args_p,
                                             &copy->lsm[slot].rule,
                                             gfp);
                        if (!copy->lsm[slot].rule)
                                pr_warn("rule for LSM \'%s\' is undefined\n",
                                        copy->lsm[slot].args_p);
                }
        }

        return copy;
}

static int ima_lsm_update_rule(struct ima_rule_entry *entry)
{
        int i;
        struct ima_rule_entry *nentry;

        nentry = ima_lsm_copy_rule(entry, GFP_KERNEL);
        if (!nentry)
                return -ENOMEM;

        list_replace_rcu(&entry->list, &nentry->list);
        synchronize_rcu();
        /*
         * ima_lsm_copy_rule() shallow copied all references, except for the
         * LSM references, from entry to nentry so we only want to free the LSM
         * references and the entry itself. All other memory references will now
         * be owned by nentry.
         */
        for (i = 0; i < MAX_LSM_RULES; i++)
                ima_filter_rule_free(entry->lsm[i].rule);
        kfree(entry);

        return 0;
}

static bool ima_rule_contains_lsm_cond(struct ima_rule_entry *entry)
{
        int i;

        for (i = 0; i < MAX_LSM_RULES; i++)
                if (entry->lsm[i].args_p)
                        return true;

        return false;
}

/*
 * The LSM policy can be reloaded, leaving the IMA LSM based rules referring
 * to the old, stale LSM policy.  Walk ima_policy_rules and replace every rule
 * that has an LSM condition so that it reflects the reloaded LSM policy.
 * The walk stops at the first rule that cannot be replaced.
 */
static void ima_lsm_update_rules(void)
{
        struct ima_rule_entry *rule, *next;
        int err;

        list_for_each_entry_safe(rule, next, &ima_policy_rules, list) {
                if (ima_rule_contains_lsm_cond(rule)) {
                        err = ima_lsm_update_rule(rule);
                        if (err) {
                                pr_err("lsm rule update error %d\n", err);
                                return;
                        }
                }
        }
}

/*
 * ima_lsm_policy_change - notifier callback for LSM policy events
 * @nb: notifier block (unused)
 * @event: event code
 * @lsm_data: event payload (unused)
 *
 * On LSM_POLICY_CHANGE the LSM based IMA rules are refreshed and NOTIFY_OK
 * is returned; every other event yields NOTIFY_DONE.
 */
int ima_lsm_policy_change(struct notifier_block *nb, unsigned long event,
                          void *lsm_data)
{
        if (event == LSM_POLICY_CHANGE) {
                ima_lsm_update_rules();
                return NOTIFY_OK;
        }

        return NOTIFY_DONE;
}

/**
 * ima_match_rule_data - determine whether func_data matches the policy rule
 * @rule: a pointer to a rule
 * @func_data: data to match against the measure rule data
 * @cred: a pointer to a credentials structure for user validation
 *
 * The list consulted depends on the rule's func: the keyrings list for
 * KEY_CHECK and the label list for CRITICAL_DATA.  A rule without such a
 * list matches any @func_data, including NULL.
 *
 * Returns true if func_data matches one in the rule, false otherwise.
 */
static bool ima_match_rule_data(struct ima_rule_entry *rule,
                                const char *func_data,
                                const struct cred *cred)
{
        const struct ima_rule_opt_list *candidates;
        size_t idx;

        if ((rule->flags & IMA_UID) && !rule->uid_op(cred->uid, rule->uid))
                return false;

        if (rule->func == KEY_CHECK)
                candidates = rule->keyrings;
        else if (rule->func == CRITICAL_DATA)
                candidates = rule->label;
        else
                return false;

        /* No list on the rule: everything matches */
        if (!candidates)
                return true;

        if (!func_data)
                return false;

        for (idx = 0; idx < candidates->count; idx++) {
                if (strcmp(candidates->items[idx], func_data) == 0)
                        return true;
        }

        return false;
}

/**
 * ima_match_rules - determine whether an inode matches the policy rule.
 * @rule: a pointer to a rule
 * @idmap: idmap of the mount the inode was found from
 * @inode: a pointer to an inode
 * @cred: a pointer to a credentials structure for user validation
 * @secid: the secid of the task to be validated
 * @func: LIM hook identifier
 * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC)
 * @func_data: func specific data, may be NULL
 *
 * KEY_CHECK and CRITICAL_DATA are decided by ima_match_rule_data() alone.
 * For every other hook each condition flagged on the rule must hold, and
 * each populated LSM slot must match.  If an LSM rule turns out to be stale
 * (-ESTALE), a temporary copy of the rule with freshly initialised LSM
 * rules is made once and used for the remaining LSM checks; it is freed
 * before returning.
 *
 * Returns true on rule match, false on failure.
 */
static bool ima_match_rules(struct ima_rule_entry *rule,
                            struct mnt_idmap *idmap,
                            struct inode *inode, const struct cred *cred,
                            u32 secid, enum ima_hooks func, int mask,
                            const char *func_data)
{
        int i;
        bool result = false;
        struct ima_rule_entry *lsm_rule = rule;
        bool rule_reinitialized = false;

        if ((rule->flags & IMA_FUNC) &&
            (rule->func != func && func != POST_SETATTR))
                return false;

        switch (func) {
        case KEY_CHECK:
        case CRITICAL_DATA:
                return ((rule->func == func) &&
                        ima_match_rule_data(rule, func_data, cred));
        default:
                break;
        }

        /* IMA_MASK demands an exact mask, IMA_INMASK only an overlap */
        if ((rule->flags & IMA_MASK) &&
            (rule->mask != mask && func != POST_SETATTR))
                return false;
        if ((rule->flags & IMA_INMASK) &&
            (!(rule->mask & mask) && func != POST_SETATTR))
                return false;
        if ((rule->flags & IMA_FSMAGIC)
            && rule->fsmagic != inode->i_sb->s_magic)
                return false;
        if ((rule->flags & IMA_FSNAME)
            && strcmp(rule->fsname, inode->i_sb->s_type->name))
                return false;
        if ((rule->flags & IMA_FSUUID) &&
            !uuid_equal(&rule->fsuuid, &inode->i_sb->s_uuid))
                return false;
        if ((rule->flags & IMA_UID) && !rule->uid_op(cred->uid, rule->uid))
                return false;
        if (rule->flags & IMA_EUID) {
                /* With CAP_SETUID any of euid, suid or uid may satisfy the rule */
                if (has_capability_noaudit(current, CAP_SETUID)) {
                        if (!rule->uid_op(cred->euid, rule->uid)
                            && !rule->uid_op(cred->suid, rule->uid)
                            && !rule->uid_op(cred->uid, rule->uid))
                                return false;
                } else if (!rule->uid_op(cred->euid, rule->uid))
                        return false;
        }
        if ((rule->flags & IMA_GID) && !rule->gid_op(cred->gid, rule->gid))
                return false;
        if (rule->flags & IMA_EGID) {
                /* With CAP_SETGID any of egid, sgid or gid may satisfy the rule */
                if (has_capability_noaudit(current, CAP_SETGID)) {
                        if (!rule->gid_op(cred->egid, rule->gid)
                            && !rule->gid_op(cred->sgid, rule->gid)
                            && !rule->gid_op(cred->gid, rule->gid))
                                return false;
                } else if (!rule->gid_op(cred->egid, rule->gid))
                        return false;
        }
        if ((rule->flags & IMA_FOWNER) &&
            !rule->fowner_op(i_uid_into_vfsuid(idmap, inode),
                             rule->fowner))
                return false;
        if ((rule->flags & IMA_FGROUP) &&
            !rule->fgroup_op(i_gid_into_vfsgid(idmap, inode),
                             rule->fgroup))
                return false;
        for (i = 0; i < MAX_LSM_RULES; i++) {
                int rc = 0;
                u32 osid;

                if (!lsm_rule->lsm[i].rule) {
                        /*
                         * An empty slot is skipped; a slot with a condition
                         * string but no usable LSM rule can never match.
                         */
                        if (!lsm_rule->lsm[i].args_p)
                                continue;
                        else
                                return false;
                }

retry:
                switch (i) {
                case LSM_OBJ_USER:
                case LSM_OBJ_ROLE:
                case LSM_OBJ_TYPE:
                        security_inode_getsecid(inode, &osid);
                        rc = ima_filter_rule_match(osid, lsm_rule->lsm[i].type,
                                                   Audit_equal,
                                                   lsm_rule->lsm[i].rule);
                        break;
                case LSM_SUBJ_USER:
                case LSM_SUBJ_ROLE:
                case LSM_SUBJ_TYPE:
                        rc = ima_filter_rule_match(secid, lsm_rule->lsm[i].type,
                                                   Audit_equal,
                                                   lsm_rule->lsm[i].rule);
                        break;
                default:
                        break;
                }

                if (rc == -ESTALE && !rule_reinitialized) {
                        struct ima_rule_entry *fresh;

                        /*
                         * Keep lsm_rule pointing at a valid rule if the copy
                         * cannot be allocated: overwriting it with NULL would
                         * be dereferenced by the next loop iteration.
                         */
                        fresh = ima_lsm_copy_rule(rule, GFP_ATOMIC);
                        if (fresh) {
                                lsm_rule = fresh;
                                rule_reinitialized = true;
                                goto retry;
                        }
                }
                if (!rc) {
                        result = false;
                        goto out;
                }
        }
        result = true;

out:
        if (rule_reinitialized) {
                /* args_p strings are shared with @rule, free only the LSM rules */
                for (i = 0; i < MAX_LSM_RULES; i++)
                        ima_filter_rule_free(lsm_rule->lsm[i].rule);
                kfree(lsm_rule);
        }
        return result;
}

/*
 * In addition to knowing that we need to appraise the file in general,
 * we need to differentiate between calling hooks, for hook specific rules.
 *
 * A rule without IMA_FUNC always yields IMA_FILE_APPRAISE; otherwise the
 * sub-action is derived from the calling hook @func.
 */
static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func)
{
        int subaction;

        if (!(rule->flags & IMA_FUNC))
                return IMA_FILE_APPRAISE;

        switch (func) {
        case MMAP_CHECK:
        case MMAP_CHECK_REQPROT:
                subaction = IMA_MMAP_APPRAISE;
                break;
        case BPRM_CHECK:
                subaction = IMA_BPRM_APPRAISE;
                break;
        case CREDS_CHECK:
                subaction = IMA_CREDS_APPRAISE;
                break;
        case FILE_CHECK:
        case POST_SETATTR:
                subaction = IMA_FILE_APPRAISE;
                break;
        case MODULE_CHECK ... MAX_CHECK - 1:
        default:
                subaction = IMA_READ_APPRAISE;
                break;
        }

        return subaction;
}

/**
 * ima_match_policy - decision based on LSM and other conditions
 * @idmap: idmap of the mount the inode was found from
 * @inode: pointer to an inode for which the policy decision is being made
 * @cred: pointer to a credentials structure for which the policy decision is
 *        being made
 * @secid: LSM secid of the task to be validated
 * @func: IMA hook identifier
 * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC)
 * @flags: IMA actions to consider (e.g. IMA_MEASURE | IMA_APPRAISE)
 * @pcr: set the pcr to extend
 * @template_desc: the template that should be used for this rule
 * @func_data: func specific data, may be NULL
 * @allowed_algos: allowlist of hash algorithms for the IMA xattr
 *
 * Measure decision based on func/mask/fsmagic and LSM(subj/obj/type)
 * conditions.
 *
 * Since the IMA policy may be updated multiple times we need to lock the
 * list when walking it.  Reads are many orders of magnitude more numerous
 * than writes so ima_match_policy() is classical RCU candidate.
 *
 * Returns the accumulated action flags of the matching rules.
 */
int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode,
                     const struct cred *cred, u32 secid, enum ima_hooks func,
                     int mask, int flags, int *pcr,
                     struct ima_template_desc **template_desc,
                     const char *func_data, unsigned int *allowed_algos)
{
        struct list_head *rule_list;
        struct ima_rule_entry *rule;
        int actmask = flags | (flags << 1);
        int action = 0;

        /* Fall back to the current template when the caller supplied none */
        if (template_desc && !*template_desc)
                *template_desc = ima_template_desc_current();

        rcu_read_lock();
        rule_list = rcu_dereference(ima_rules);
        list_for_each_entry_rcu(rule, rule_list, list) {
                /* Skip rules whose action has already been decided */
                if (!(rule->action & actmask))
                        continue;

                if (!ima_match_rules(rule, idmap, inode, cred, secid,
                                     func, mask, func_data))
                        continue;

                action |= rule->flags & IMA_NONACTION_FLAGS;
                action |= rule->action & IMA_DO_MASK;

                if (rule->action & IMA_APPRAISE) {
                        action |= get_subaction(rule, func);
                        action &= ~IMA_HASH;
                        if (ima_fail_unverifiable_sigs)
                                action |= IMA_FAIL_UNVERIFIABLE_SIGS;

                        if (allowed_algos &&
                            (rule->flags & IMA_VALIDATE_ALGOS))
                                *allowed_algos = rule->allowed_algos;
                }

                /*
                 * Clear both the matched action bit and its neighbouring
                 * counterpart bit from the set still to be decided.
                 */
                if (rule->action & IMA_DO_MASK)
                        actmask &= ~(rule->action | rule->action << 1);
                else
                        actmask &= ~(rule->action | rule->action >> 1);

                if (pcr && (rule->flags & IMA_PCR))
                        *pcr = rule->pcr;

                if (template_desc && rule->template)
                        *template_desc = rule->template;

                /* Every requested action has been decided */
                if (!actmask)
                        break;
        }
        rcu_read_unlock();

        return action;
}

/**
 * ima_update_policy_flags() - Update global IMA variables
 *
 * Update ima_policy_flag and ima_setxattr_allowed_hash_algorithms
 * based on the currently loaded policy.
 *
 * With ima_policy_flag, the decision to short circuit out of a function
 * or not call the function in the first place can be made earlier.
 *
 * With ima_setxattr_allowed_hash_algorithms, the policy can restrict the
 * set of hash algorithms accepted when updating the security.ima xattr of
 * a file.
 *
 * Context: called after a policy update and at system initialization.
 */
void ima_update_policy_flags(void)
{
        struct ima_rule_entry *entry;
        int new_policy_flag = 0;
        struct list_head *ima_rules_tmp;

        rcu_read_lock();
        ima_rules_tmp = rcu_dereference(ima_rules);
        list_for_each_entry_rcu(entry, ima_rules_tmp, list) {
                /*
                 * SETXATTR_CHECK rules do not implement a full policy check
                 * because rule checking would probably have an important
                 * performance impact on setxattr(). As a consequence, only one
                 * SETXATTR_CHECK can be active at a given time.
                 * Because we want to preserve that property, we set out to use
                 * atomic_cmpxchg. Either:
                 * - the atomic was non-zero: a setxattr hash policy is
                 *   already enforced, we do nothing
                 * - the atomic was zero: no setxattr policy was set, enable
                 *   the setxattr hash policy
                 */
                if (entry->func == SETXATTR_CHECK) {
                        atomic_cmpxchg(&ima_setxattr_allowed_hash_algorithms,
                                       0, entry->allowed_algos);
                        /* SETXATTR_CHECK doesn't impact ima_policy_flag */
                        continue;
                }

                if (entry->action & IMA_DO_MASK)
                        new_policy_flag |= entry->action;
        }
        rcu_read_unlock();

        ima_appraise |= (build_ima_appraise | temp_ima_appraise);
        if (!ima_appraise)
                new_policy_flag &= ~IMA_APPRAISE;

        ima_policy_flag = new_policy_flag;
}

/*
 * ima_appraise_flag - map a hook to its IMA_APPRAISE_* flag
 * @func: IMA hook identifier
 *
 * Returns the flag for MODULE_CHECK, FIRMWARE_CHECK, POLICY_CHECK and
 * KEXEC_KERNEL_CHECK, and 0 for every other hook.
 */
static int ima_appraise_flag(enum ima_hooks func)
{
        switch (func) {
        case MODULE_CHECK:
                return IMA_APPRAISE_MODULES;
        case FIRMWARE_CHECK:
                return IMA_APPRAISE_FIRMWARE;
        case POLICY_CHECK:
                return IMA_APPRAISE_POLICY;
        case KEXEC_KERNEL_CHECK:
                return IMA_APPRAISE_KEXEC;
        default:
                return 0;
        }
}

/*
 * add_rules - install an array of rules on the default and/or custom list
 * @entries: array of rules
 * @count: number of rules in @entries
 * @policy_rule: IMA_DEFAULT_POLICY, IMA_CUSTOM_POLICY or both
 *
 * For the default list the array elements themselves are linked in; for the
 * custom list a duplicate of each element is linked in.  If a duplicate
 * cannot be allocated the rest of the processing for that rule is skipped.
 *
 * For APPRAISE rules the matching appraise flag is recorded in
 * build_ima_appraise when @entries is build_appraise_rules and in
 * temp_ima_appraise otherwise.
 */
static void add_rules(struct ima_rule_entry *entries, int count,
                      enum policy_rule_list policy_rule)
{
        int idx;

        for (idx = 0; idx < count; idx++) {
                struct ima_rule_entry *rule = &entries[idx];
                struct ima_rule_entry *dup;

                if (policy_rule & IMA_DEFAULT_POLICY)
                        list_add_tail(&rule->list, &ima_default_rules);

                if (policy_rule & IMA_CUSTOM_POLICY) {
                        dup = kmemdup(rule, sizeof(*dup), GFP_KERNEL);
                        if (!dup)
                                continue;

                        list_add_tail(&dup->list, &ima_policy_rules);
                }

                if (rule->action != APPRAISE)
                        continue;

                if (entries == build_appraise_rules)
                        build_ima_appraise |= ima_appraise_flag(rule->func);
                else
                        temp_ima_appraise |= ima_appraise_flag(rule->func);
        }
}

static int ima_parse_rule(char *rule, struct ima_rule_entry *entry);

/*
 * ima_init_arch_policy - convert the architecture policy strings to rules
 *
 * Fetches the NULL-terminated array of rule strings from
 * arch_get_ima_policy(), allocates arch_policy_entry and parses each string
 * into the next free slot.  Strings that do not fit the local buffer or that
 * ima_parse_rule() rejects are skipped with a warning.
 *
 * Returns the number of rules stored in arch_policy_entry; 0 if the
 * architecture supplies none or the array cannot be allocated.
 */
static int __init ima_init_arch_policy(void)
{
        const char * const *arch_rules;
        const char * const *rules;
        int arch_entries = 0;
        int i = 0;

        arch_rules = arch_get_ima_policy();
        if (!arch_rules)
                return arch_entries;

        /* Get number of rules */
        for (rules = arch_rules; *rules != NULL; rules++)
                arch_entries++;

        arch_policy_entry = kcalloc(arch_entries + 1,
                                    sizeof(*arch_policy_entry), GFP_KERNEL);
        if (!arch_policy_entry)
                return 0;

        /* Convert each policy string rules to struct ima_rule_entry format */
        for (rules = arch_rules, i = 0; *rules != NULL; rules++) {
                char rule[255];
                int result;

                /*
                 * ima_parse_rule() modifies its input, so work on a copy.
                 * strscpy() reports truncation with a negative value; never
                 * hand a silently shortened rule to the parser.
                 */
                result = strscpy(rule, *rules, sizeof(rule));
                if (result < 0) {
                        pr_warn("Skipping over-long architecture policy rule: %s\n",
                                rule);
                        continue;
                }

                INIT_LIST_HEAD(&arch_policy_entry[i].list);
                result = ima_parse_rule(rule, &arch_policy_entry[i]);
                if (result) {
                        pr_warn("Skipping unknown architecture policy rule: %s\n",
                                rule);
                        /* Reset the slot so the next rule starts clean */
                        memset(&arch_policy_entry[i], 0,
                               sizeof(*arch_policy_entry));
                        continue;
                }
                i++;
        }
        return i;
}

/**
 * ima_init_policy - initialize the default measure rules.
 *
 * Builds the boot-time rule lists from the built-in rule arrays according to
 * the policies selected on the kernel command line, then refreshes the
 * global policy flags.
 *
 * ima_rules points to either the ima_default_rules or the new ima_policy_rules.
 */
void __init ima_init_policy(void)
{
        int nr_build_rules, nr_arch_rules;

        /* if !ima_policy, we load NO default rules */
        if (ima_policy)
                add_rules(dont_measure_rules, ARRAY_SIZE(dont_measure_rules),
                          IMA_DEFAULT_POLICY);

        if (ima_policy == ORIGINAL_TCB)
                add_rules(original_measurement_rules,
                          ARRAY_SIZE(original_measurement_rules),
                          IMA_DEFAULT_POLICY);
        else if (ima_policy == DEFAULT_TCB)
                add_rules(default_measurement_rules,
                          ARRAY_SIZE(default_measurement_rules),
                          IMA_DEFAULT_POLICY);

        /*
         * Based on runtime secure boot flags, insert arch specific measurement
         * and appraise rules requiring file signatures for both the initial
         * and custom policies, prior to other appraise rules.
         * (Highest priority)
         */
        nr_arch_rules = ima_init_arch_policy();
        if (nr_arch_rules)
                add_rules(arch_policy_entry, nr_arch_rules,
                          IMA_DEFAULT_POLICY | IMA_CUSTOM_POLICY);
        else
                pr_info("No architecture policies found\n");

        /*
         * Insert the builtin "secure_boot" policy rules requiring file
         * signatures, prior to other appraise rules.
         */
        if (ima_use_secure_boot)
                add_rules(secure_boot_rules, ARRAY_SIZE(secure_boot_rules),
                          IMA_DEFAULT_POLICY);

        /*
         * Insert the build time appraise rules requiring file signatures
         * for both the initial and custom policies, prior to other appraise
         * rules. As the secure boot rules includes all of the build time
         * rules, include either one or the other set of rules, but not both.
         */
        nr_build_rules = ARRAY_SIZE(build_appraise_rules);
        if (nr_build_rules) {
                int lists = IMA_CUSTOM_POLICY;

                if (!ima_use_secure_boot)
                        lists |= IMA_DEFAULT_POLICY;

                add_rules(build_appraise_rules, nr_build_rules, lists);
        }

        if (ima_use_appraise_tcb)
                add_rules(default_appraise_rules,
                          ARRAY_SIZE(default_appraise_rules),
                          IMA_DEFAULT_POLICY);

        if (ima_use_critical_data)
                add_rules(critical_data_rules,
                          ARRAY_SIZE(critical_data_rules),
                          IMA_DEFAULT_POLICY);

        atomic_set(&ima_setxattr_allowed_hash_algorithms, 0);

        ima_update_policy_flags();
}

/*
 * Make sure we have a valid policy, at least containing some rules.
 *
 * Returns 0 if the staging list ima_temp_rules holds at least one rule,
 * -EINVAL if it is empty.
 */
int ima_check_policy(void)
{
        return list_empty(&ima_temp_rules) ? -EINVAL : 0;
}

/**
 * ima_update_policy - update default_rules with new measure rules
 *
 * Called on file .release to update the default rules with a complete new
 * policy.  What we do here is to splice ima_policy_rules and ima_temp_rules so
 * they make a queue.  The policy may be updated multiple times and this is the
 * RCU updater.
 *
 * Policy rules are never deleted so ima_policy_flag gets zeroed only once when
 * we switch from the default policy to user defined.
 */
void ima_update_policy(void)
{
        struct list_head *policy = &ima_policy_rules;

        list_splice_tail_init_rcu(&ima_temp_rules, policy, synchronize_rcu);

        if (ima_rules != (struct list_head __rcu *)policy) {
                ima_policy_flag = 0;

                rcu_assign_pointer(ima_rules, policy);
                /*
                 * IMA architecture specific policy rules are specified
                 * as strings and converted to an array of ima_entry_rules
                 * on boot.  After loading a custom policy, free the
                 * architecture specific rules stored as an array.
                 *
                 * The array elements are linked into ima_default_rules, and
                 * readers that picked up the old ima_rules pointer may still
                 * be walking that list, so wait for them before freeing.
                 */
                synchronize_rcu();
                kfree(arch_policy_entry);
        }
        ima_update_policy_flags();

        /* Custom IMA policy has been loaded */
        ima_process_queued_keys();
}

/*
 * Identifiers for the options recognised in a policy rule.
 *
 * Keep the enumeration in sync with the policy_tokens!  Every value except
 * Opt_err has one pattern in that table; Opt_err is the table's terminating
 * entry.
 */
enum policy_opt {
        Opt_measure, Opt_dont_measure,
        Opt_appraise, Opt_dont_appraise,
        Opt_audit, Opt_hash, Opt_dont_hash,
        Opt_obj_user, Opt_obj_role, Opt_obj_type,
        Opt_subj_user, Opt_subj_role, Opt_subj_type,
        Opt_func, Opt_mask, Opt_fsmagic, Opt_fsname, Opt_fsuuid,
        Opt_uid_eq, Opt_euid_eq, Opt_gid_eq, Opt_egid_eq,
        Opt_fowner_eq, Opt_fgroup_eq,
        Opt_uid_gt, Opt_euid_gt, Opt_gid_gt, Opt_egid_gt,
        Opt_fowner_gt, Opt_fgroup_gt,
        Opt_uid_lt, Opt_euid_lt, Opt_gid_lt, Opt_egid_lt,
        Opt_fowner_lt, Opt_fgroup_lt,
        Opt_digest_type,
        Opt_appraise_type, Opt_appraise_flag, Opt_appraise_algos,
        Opt_permit_directio, Opt_pcr, Opt_template, Opt_keyrings,
        Opt_label, Opt_err
};

/*
 * Pattern table handed to match_token() by ima_parse_rule().  Each entry
 * pairs a policy_opt value with the text that selects it; for patterns
 * ending in "%s" the parser reads the matched argument from args[0].
 * The entries must stay in step with enum policy_opt.
 */
static const match_table_t policy_tokens = {
        /* rule actions (no argument) */
        {Opt_measure, "measure"},
        {Opt_dont_measure, "dont_measure"},
        {Opt_appraise, "appraise"},
        {Opt_dont_appraise, "dont_appraise"},
        {Opt_audit, "audit"},
        {Opt_hash, "hash"},
        {Opt_dont_hash, "dont_hash"},
        /* LSM object / subject label conditions */
        {Opt_obj_user, "obj_user=%s"},
        {Opt_obj_role, "obj_role=%s"},
        {Opt_obj_type, "obj_type=%s"},
        {Opt_subj_user, "subj_user=%s"},
        {Opt_subj_role, "subj_role=%s"},
        {Opt_subj_type, "subj_type=%s"},
        /* hook, access mask and filesystem conditions */
        {Opt_func, "func=%s"},
        {Opt_mask, "mask=%s"},
        {Opt_fsmagic, "fsmagic=%s"},
        {Opt_fsname, "fsname=%s"},
        {Opt_fsuuid, "fsuuid=%s"},
        /* id conditions: equality */
        {Opt_uid_eq, "uid=%s"},
        {Opt_euid_eq, "euid=%s"},
        {Opt_gid_eq, "gid=%s"},
        {Opt_egid_eq, "egid=%s"},
        {Opt_fowner_eq, "fowner=%s"},
        {Opt_fgroup_eq, "fgroup=%s"},
        /* id conditions: greater than */
        {Opt_uid_gt, "uid>%s"},
        {Opt_euid_gt, "euid>%s"},
        {Opt_gid_gt, "gid>%s"},
        {Opt_egid_gt, "egid>%s"},
        {Opt_fowner_gt, "fowner>%s"},
        {Opt_fgroup_gt, "fgroup>%s"},
        /* id conditions: less than */
        {Opt_uid_lt, "uid<%s"},
        {Opt_euid_lt, "euid<%s"},
        {Opt_gid_lt, "gid<%s"},
        {Opt_egid_lt, "egid<%s"},
        {Opt_fowner_lt, "fowner<%s"},
        {Opt_fgroup_lt, "fgroup<%s"},
        /* digest / appraisal options */
        {Opt_digest_type, "digest_type=%s"},
        {Opt_appraise_type, "appraise_type=%s"},
        {Opt_appraise_flag, "appraise_flag=%s"},
        {Opt_appraise_algos, "appraise_algos=%s"},
        /* remaining options */
        {Opt_permit_directio, "permit_directio"},
        {Opt_pcr, "pcr=%s"},
        {Opt_template, "template=%s"},
        {Opt_keyrings, "keyrings=%s"},
        {Opt_label, "label=%s"},
        /*
         * Terminator.  NOTE(review): presumably also what match_token()
         * yields for unrecognised input; ima_parse_rule() logs Opt_err as
         * "UNKNOWN" and fails the rule with -EINVAL.
         */
        {Opt_err, NULL}
};

/*
 * ima_lsm_rule_init - attach one LSM label condition to a rule being parsed
 * @entry: rule under construction
 * @args: parser substring holding the label text
 * @lsm_rule: index of the slot in entry->lsm[] to fill
 * @audit_type: audit field type recorded in the slot and handed to
 *              ima_filter_rule_init()
 *
 * Duplicates the label string into the slot and asks the LSM to build the
 * matching rule.  When the LSM produces no rule, a warning is printed; the
 * condition is then rejected if the default rules are still active, and
 * otherwise kept (with a NULL rule) and reported as success.
 *
 * Return: 0 on success, -EINVAL if the slot is already in use or the label
 * is undefined while the default rules are active, -ENOMEM if the label
 * cannot be duplicated, otherwise the value of ima_filter_rule_init().
 */
static int ima_lsm_rule_init(struct ima_rule_entry *entry,
                             substring_t *args, int lsm_rule, int audit_type)
{
        int result;

        /*
         * Reject a repeated condition.  Checking ->rule alone is not enough:
         * an unresolved label leaves ->rule NULL but keeps ->args_p, and
         * overwriting that pointer below would leak the earlier string.
         */
        if (entry->lsm[lsm_rule].rule || entry->lsm[lsm_rule].args_p)
                return -EINVAL;

        entry->lsm[lsm_rule].args_p = match_strdup(args);
        if (!entry->lsm[lsm_rule].args_p)
                return -ENOMEM;

        entry->lsm[lsm_rule].type = audit_type;
        result = ima_filter_rule_init(entry->lsm[lsm_rule].type, Audit_equal,
                                      entry->lsm[lsm_rule].args_p,
                                      &entry->lsm[lsm_rule].rule,
                                      GFP_KERNEL);
        if (!entry->lsm[lsm_rule].rule) {
                pr_warn("rule for LSM \'%s\' is undefined\n",
                        entry->lsm[lsm_rule].args_p);

                if (ima_rules == (struct list_head __rcu *)(&ima_default_rules)) {
                        /* Default rules still active: drop the label and fail. */
                        kfree(entry->lsm[lsm_rule].args_p);
                        entry->lsm[lsm_rule].args_p = NULL;
                        result = -EINVAL;
                } else {
                        /* Custom policy active: keep the label, accept the rule. */
                        result = 0;
                }
        }

        return result;
}

/*
 * Append "<key><op><value> " to the audit record, where <op> is '>', '<' or
 * '=' depending on which policy token produced the condition.  Nothing is
 * written when no audit buffer is available.
 */
static void ima_log_string_op(struct audit_buffer *ab, char *key, char *value,
                              enum policy_opt rule_operator)
{
        const bool greater_than = rule_operator == Opt_uid_gt ||
                                  rule_operator == Opt_euid_gt ||
                                  rule_operator == Opt_gid_gt ||
                                  rule_operator == Opt_egid_gt ||
                                  rule_operator == Opt_fowner_gt ||
                                  rule_operator == Opt_fgroup_gt;
        const bool less_than = rule_operator == Opt_uid_lt ||
                               rule_operator == Opt_euid_lt ||
                               rule_operator == Opt_gid_lt ||
                               rule_operator == Opt_egid_lt ||
                               rule_operator == Opt_fowner_lt ||
                               rule_operator == Opt_fgroup_lt;

        if (!ab)
                return;

        /* Every token that is not a '>' or '<' comparison is logged with '='. */
        if (greater_than)
                audit_log_format(ab, "%s>", key);
        else if (less_than)
                audit_log_format(ab, "%s<", key);
        else
                audit_log_format(ab, "%s=", key);

        audit_log_format(ab, "%s ", value);
}
/*
 * Log a plain "key=value " pair.  Opt_err is not one of the '>' or '<'
 * tokens, so ima_log_string_op() falls through to its '=' default.
 */
static void ima_log_string(struct audit_buffer *ab, char *key, char *value)
{
        ima_log_string_op(ab, key, value, Opt_err);
}

/*
 * An appended signature stored in the measurement list can only be validated
 * against the file hash computed without that signature, which is what the
 * 'd-modsig' field carries.  Tell the user when a template has 'modsig' but
 * no 'd-modsig'.
 */
static void check_template_modsig(const struct ima_template_desc *template)
{
        static bool already_checked;
        bool modsig_seen = false;
        bool dmodsig_seen = false;
        int idx;

        /* The check, and therefore the notice, happens at most once. */
        if (already_checked)
                return;

        for (idx = 0; idx < template->num_fields; idx++) {
                const char *id = template->fields[idx]->field_id;

                if (strcmp(id, "modsig") == 0)
                        modsig_seen = true;
                else if (strcmp(id, "d-modsig") == 0)
                        dmodsig_seen = true;
        }

        if (modsig_seen && !dmodsig_seen)
                pr_notice("template with 'modsig' field also needs 'd-modsig' field\n");

        already_checked = true;
}

/*
 * Print @msg (once per boot, via pr_notice_once) when @template has no field
 * whose id equals @field.
 */
static void check_template_field(const struct ima_template_desc *template,
                                 const char *field, const char *msg)
{
        bool present = false;
        int idx;

        /* Stop scanning as soon as the field has been found. */
        for (idx = 0; idx < template->num_fields && !present; idx++)
                present = strcmp(template->fields[idx]->field_id, field) == 0;

        if (!present)
                pr_notice_once("%s", msg);
}

/*
 * ima_validate_rule - consistency check for a parsed policy rule
 * @entry: rule to check
 *
 * Verifies, in order: that an action was given, that action-specific flags
 * are only used with their action, that IMA_FUNC and entry->func agree, that
 * the hook function only carries flags it allows, and that the remaining
 * flag combinations are coherent.
 *
 * Return: true if every check passes, false on the first one that fails.
 */
static bool ima_validate_rule(struct ima_rule_entry *entry)
{
        /* Ensure that the action is set and is compatible with the flags */
        if (entry->action == UNKNOWN)
                return false;

        /* IMA_PCR is only accepted on MEASURE rules. */
        if (entry->action != MEASURE && entry->flags & IMA_PCR)
                return false;

        /* Signature / blacklist / algorithm flags are only accepted on APPRAISE rules. */
        if (entry->action != APPRAISE &&
            entry->flags & (IMA_DIGSIG_REQUIRED | IMA_MODSIG_ALLOWED |
                            IMA_CHECK_BLACKLIST | IMA_VALIDATE_ALGOS))
                return false;

        /*
         * The IMA_FUNC bit must be set if and only if there's a valid hook
         * function specified, and vice versa. Enforcing this property allows
         * for the NONE case below to validate a rule without an explicit hook
         * function.
         */
        if (((entry->flags & IMA_FUNC) && entry->func == NONE) ||
            (!(entry->flags & IMA_FUNC) && entry->func != NONE))
                return false;

        /*
         * Ensure that the hook function is compatible with the other
         * components of the rule
         */
        switch (entry->func) {
        case NONE:
        case FILE_CHECK:
        case MMAP_CHECK:
        case MMAP_CHECK_REQPROT:
        case BPRM_CHECK:
        case CREDS_CHECK:
        case POST_SETATTR:
        case FIRMWARE_CHECK:
        case POLICY_CHECK:
                /* Any flag outside this set rejects the rule. */
                if (entry->flags & ~(IMA_FUNC | IMA_MASK | IMA_FSMAGIC |
                                     IMA_UID | IMA_FOWNER | IMA_FSUUID |
                                     IMA_INMASK | IMA_EUID | IMA_PCR |
                                     IMA_FSNAME | IMA_GID | IMA_EGID |
                                     IMA_FGROUP | IMA_DIGSIG_REQUIRED |
                                     IMA_PERMIT_DIRECTIO | IMA_VALIDATE_ALGOS |
                                     IMA_CHECK_BLACKLIST | IMA_VERITY_REQUIRED))
                        return false;

                break;
        case MODULE_CHECK:
        case KEXEC_KERNEL_CHECK:
        case KEXEC_INITRAMFS_CHECK:
                /*
                 * Compared with the set above, IMA_MODSIG_ALLOWED is permitted
                 * here and IMA_VERITY_REQUIRED is not.
                 */
                if (entry->flags & ~(IMA_FUNC | IMA_MASK | IMA_FSMAGIC |
                                     IMA_UID | IMA_FOWNER | IMA_FSUUID |
                                     IMA_INMASK | IMA_EUID | IMA_PCR |
                                     IMA_FSNAME | IMA_GID | IMA_EGID |
                                     IMA_FGROUP | IMA_DIGSIG_REQUIRED |
                                     IMA_PERMIT_DIRECTIO | IMA_MODSIG_ALLOWED |
                                     IMA_CHECK_BLACKLIST | IMA_VALIDATE_ALGOS))
                        return false;

                break;
        case KEXEC_CMDLINE:
                /* Reject any action bit other than MEASURE / DONT_MEASURE. */
                if (entry->action & ~(MEASURE | DONT_MEASURE))
                        return false;

                if (entry->flags & ~(IMA_FUNC | IMA_FSMAGIC | IMA_UID |
                                     IMA_FOWNER | IMA_FSUUID | IMA_EUID |
                                     IMA_PCR | IMA_FSNAME | IMA_GID | IMA_EGID |
                                     IMA_FGROUP))
                        return false;

                break;
        case KEY_CHECK:
                /* Reject any action bit other than MEASURE / DONT_MEASURE. */
                if (entry->action & ~(MEASURE | DONT_MEASURE))
                        return false;

                if (entry->flags & ~(IMA_FUNC | IMA_UID | IMA_GID | IMA_PCR |
                                     IMA_KEYRINGS))
                        return false;

                /* LSM conditions are not accepted for this hook. */
                if (ima_rule_contains_lsm_cond(entry))
                        return false;

                break;
        case CRITICAL_DATA:
                /* Reject any action bit other than MEASURE / DONT_MEASURE. */
                if (entry->action & ~(MEASURE | DONT_MEASURE))
                        return false;

                if (entry->flags & ~(IMA_FUNC | IMA_UID | IMA_GID | IMA_PCR |
                                     IMA_LABEL))
                        return false;

                /* LSM conditions are not accepted for this hook. */
                if (ima_rule_contains_lsm_cond(entry))
                        return false;

                break;
        case SETXATTR_CHECK:
                /* any action other than APPRAISE is unsupported */
                if (entry->action != APPRAISE)
                        return false;

                /* SETXATTR_CHECK requires an appraise_algos parameter */
                if (!(entry->flags & IMA_VALIDATE_ALGOS))
                        return false;

                /*
                 * full policies are not supported, they would have too
                 * much of a performance impact
                 */
                if (entry->flags & ~(IMA_FUNC | IMA_VALIDATE_ALGOS))
                        return false;

                break;
        default:
                /* Any hook function not listed above is rejected. */
                return false;
        }

        /* Ensure that combinations of flags are compatible with each other */
        if (entry->flags & IMA_CHECK_BLACKLIST &&
            !(entry->flags & IMA_DIGSIG_REQUIRED))
                return false;

        /*
         * Unlike for regular IMA 'appraise' policy rules where security.ima
         * xattr may contain either a file hash or signature, the security.ima
         * xattr for fsverity must contain a file signature (sigv3).  Ensure
         * that 'appraise' rules for fsverity require file signatures by
         * checking the IMA_DIGSIG_REQUIRED flag is set.
         */
        if (entry->action == APPRAISE &&
            (entry->flags & IMA_VERITY_REQUIRED) &&
            !(entry->flags & IMA_DIGSIG_REQUIRED))
                return false;

        return true;
}

/*
 * ima_parse_appraise_algos - convert a list of hash algorithm names to a mask
 * @arg: comma-separated algorithm names; split in place by strsep()
 *
 * Each name is looked up in hash_algo_name[] and must also be provided by the
 * crypto layer.
 *
 * Return: a bitmask with bit idx set for every hash_algo_name[idx] listed, or
 * 0 if any name is unknown or unavailable (an error is logged in that case).
 */
static unsigned int ima_parse_appraise_algos(char *arg)
{
        unsigned int res = 0;
        int idx;
        char *token;

        while ((token = strsep(&arg, ",")) != NULL) {
                idx = match_string(hash_algo_name, HASH_ALGO__LAST, token);

                if (idx < 0) {
                        /* Terminate the message so it is logged as its own line. */
                        pr_err("unknown hash algorithm \"%s\"\n",
                               token);
                        return 0;
                }

                if (!crypto_has_alg(hash_algo_name[idx], 0, 0)) {
                        pr_err("unavailable hash algorithm \"%s\", check your kernel configuration\n",
                               token);
                        return 0;
                }

                /* Add the hash algorithm to the 'allowed' bitfield */
                res |= (1U << idx);
        }

        return res;
}

static int ima_parse_rule(char *rule, struct ima_rule_entry *entry)
{
        struct audit_buffer *ab;
        char *from;
        char *p;
        bool eid_token; /* either euid or egid */
        struct ima_template_desc *template_desc;
        int result = 0;

        ab = integrity_audit_log_start(audit_context(), GFP_KERNEL,
                                       AUDIT_INTEGRITY_POLICY_RULE);

        entry->uid = INVALID_UID;
        entry->gid = INVALID_GID;
        entry->fowner = INVALID_UID;
        entry->fgroup = INVALID_GID;
        entry->uid_op = &uid_eq;
        entry->gid_op = &gid_eq;
        entry->fowner_op = &vfsuid_eq_kuid;
        entry->fgroup_op = &vfsgid_eq_kgid;
        entry->action = UNKNOWN;
        while ((p = strsep(&rule, " \t")) != NULL) {
                substring_t args[MAX_OPT_ARGS];
                int token;
                unsigned long lnum;

                if (result < 0)
                        break;
                if ((*p == '\0') || (*p == ' ') || (*p == '\t'))
                        continue;
                token = match_token(p, policy_tokens, args);
                switch (token) {
                case Opt_measure:
                        ima_log_string(ab, "action", "measure");

                        if (entry->action != UNKNOWN)
                                result = -EINVAL;

                        entry->action = MEASURE;
                        break;
                case Opt_dont_measure:
                        ima_log_string(ab, "action", "dont_measure");

                        if (entry->action != UNKNOWN)
                                result = -EINVAL;

                        entry->action = DONT_MEASURE;
                        break;
                case Opt_appraise:
                        ima_log_string(ab, "action", "appraise");

                        if (entry->action != UNKNOWN)
                                result = -EINVAL;

                        entry->action = APPRAISE;
                        break;
                case Opt_dont_appraise:
                        ima_log_string(ab, "action", "dont_appraise");

                        if (entry->action != UNKNOWN)
                                result = -EINVAL;

                        entry->action = DONT_APPRAISE;
                        break;
                case Opt_audit:
                        ima_log_string(ab, "action", "audit");

                        if (entry->action != UNKNOWN)
                                result = -EINVAL;

                        entry->action = AUDIT;
                        break;
                case Opt_hash:
                        ima_log_string(ab, "action", "hash");

                        if (entry->action != UNKNOWN)
                                result = -EINVAL;

                        entry->action = HASH;
                        break;
                case Opt_dont_hash:
                        ima_log_string(ab, "action", "dont_hash");

                        if (entry->action != UNKNOWN)
                                result = -EINVAL;

                        entry->action = DONT_HASH;
                        break;
                case Opt_func:
                        ima_log_string(ab, "func", args[0].from);

                        if (entry->func)
                                result = -EINVAL;

                        if (strcmp(args[0].from, "FILE_CHECK") == 0)
                                entry->func = FILE_CHECK;
                        /* PATH_CHECK is for backwards compat */
                        else if (strcmp(args[0].from, "PATH_CHECK") == 0)
                                entry->func = FILE_CHECK;
                        else if (strcmp(args[0].from, "MODULE_CHECK") == 0)
                                entry->func = MODULE_CHECK;
                        else if (strcmp(args[0].from, "FIRMWARE_CHECK") == 0)
                                entry->func = FIRMWARE_CHECK;
                        else if ((strcmp(args[0].from, "FILE_MMAP") == 0)
                                || (strcmp(args[0].from, "MMAP_CHECK") == 0))
                                entry->func = MMAP_CHECK;
                        else if ((strcmp(args[0].from, "MMAP_CHECK_REQPROT") == 0))
                                entry->func = MMAP_CHECK_REQPROT;
                        else if (strcmp(args[0].from, "BPRM_CHECK") == 0)
                                entry->func = BPRM_CHECK;
                        else if (strcmp(args[0].from, "CREDS_CHECK") == 0)
                                entry->func = CREDS_CHECK;
                        else if (strcmp(args[0].from, "KEXEC_KERNEL_CHECK") ==
                                 0)
                                entry->func = KEXEC_KERNEL_CHECK;
                        else if (strcmp(args[0].from, "KEXEC_INITRAMFS_CHECK")
                                 == 0)
                                entry->func = KEXEC_INITRAMFS_CHECK;
                        else if (strcmp(args[0].from, "POLICY_CHECK") == 0)
                                entry->func = POLICY_CHECK;
                        else if (strcmp(args[0].from, "KEXEC_CMDLINE") == 0)
                                entry->func = KEXEC_CMDLINE;
                        else if (IS_ENABLED(CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS) &&
                                 strcmp(args[0].from, "KEY_CHECK") == 0)
                                entry->func = KEY_CHECK;
                        else if (strcmp(args[0].from, "CRITICAL_DATA") == 0)
                                entry->func = CRITICAL_DATA;
                        else if (strcmp(args[0].from, "SETXATTR_CHECK") == 0)
                                entry->func = SETXATTR_CHECK;
                        else
                                result = -EINVAL;
                        if (!result)
                                entry->flags |= IMA_FUNC;
                        break;
                case Opt_mask:
                        ima_log_string(ab, "mask", args[0].from);

                        if (entry->mask)
                                result = -EINVAL;

                        from = args[0].from;
                        if (*from == '^')
                                from++;

                        if ((strcmp(from, "MAY_EXEC")) == 0)
                                entry->mask = MAY_EXEC;
                        else if (strcmp(from, "MAY_WRITE") == 0)
                                entry->mask = MAY_WRITE;
                        else if (strcmp(from, "MAY_READ") == 0)
                                entry->mask = MAY_READ;
                        else if (strcmp(from, "MAY_APPEND") == 0)
                                entry->mask = MAY_APPEND;
                        else
                                result = -EINVAL;
                        if (!result)
                                entry->flags |= (*args[0].from == '^')
                                     ? IMA_INMASK : IMA_MASK;
                        break;
                case Opt_fsmagic:
                        ima_log_string(ab, "fsmagic", args[0].from);

                        if (entry->fsmagic) {
                                result = -EINVAL;
                                break;
                        }

                        result = kstrtoul(args[0].from, 16, &entry->fsmagic);
                        if (!result)
                                entry->flags |= IMA_FSMAGIC;
                        break;
                case Opt_fsname:
                        ima_log_string(ab, "fsname", args[0].from);

                        entry->fsname = kstrdup(args[0].from, GFP_KERNEL);
                        if (!entry->fsname) {
                                result = -ENOMEM;
                                break;
                        }
                        result = 0;
                        entry->flags |= IMA_FSNAME;
                        break;
                case Opt_keyrings:
                        ima_log_string(ab, "keyrings", args[0].from);

                        if (!IS_ENABLED(CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS) ||
                            entry->keyrings) {
                                result = -EINVAL;
                                break;
                        }

                        entry->keyrings = ima_alloc_rule_opt_list(args);
                        if (IS_ERR(entry->keyrings)) {
                                result = PTR_ERR(entry->keyrings);
                                entry->keyrings = NULL;
                                break;
                        }

                        entry->flags |= IMA_KEYRINGS;
                        break;
                case Opt_label:
                        ima_log_string(ab, "label", args[0].from);

                        if (entry->label) {
                                result = -EINVAL;
                                break;
                        }

                        entry->label = ima_alloc_rule_opt_list(args);
                        if (IS_ERR(entry->label)) {
                                result = PTR_ERR(entry->label);
                                entry->label = NULL;
                                break;
                        }

                        entry->flags |= IMA_LABEL;
                        break;
                case Opt_fsuuid:
                        ima_log_string(ab, "fsuuid", args[0].from);

                        if (!uuid_is_null(&entry->fsuuid)) {
                                result = -EINVAL;
                                break;
                        }

                        result = uuid_parse(args[0].from, &entry->fsuuid);
                        if (!result)
                                entry->flags |= IMA_FSUUID;
                        break;
                case Opt_uid_gt:
                case Opt_euid_gt:
                        entry->uid_op = &uid_gt;
                        fallthrough;
                case Opt_uid_lt:
                case Opt_euid_lt:
                        if ((token == Opt_uid_lt) || (token == Opt_euid_lt))
                                entry->uid_op = &uid_lt;
                        fallthrough;
                case Opt_uid_eq:
                case Opt_euid_eq:
                        eid_token = (token == Opt_euid_eq) ||
                                    (token == Opt_euid_gt) ||
                                    (token == Opt_euid_lt);

                        ima_log_string_op(ab, eid_token ? "euid" : "uid",
                                          args[0].from, token);

                        if (uid_valid(entry->uid)) {
                                result = -EINVAL;
                                break;
                        }

                        result = kstrtoul(args[0].from, 10, &lnum);
                        if (!result) {
                                entry->uid = make_kuid(current_user_ns(),
                                                       (uid_t) lnum);
                                if (!uid_valid(entry->uid) ||
                                    (uid_t)lnum != lnum)
                                        result = -EINVAL;
                                else
                                        entry->flags |= eid_token
                                            ? IMA_EUID : IMA_UID;
                        }
                        break;
                case Opt_gid_gt:
                case Opt_egid_gt:
                        entry->gid_op = &gid_gt;
                        fallthrough;
                case Opt_gid_lt:
                case Opt_egid_lt:
                        if ((token == Opt_gid_lt) || (token == Opt_egid_lt))
                                entry->gid_op = &gid_lt;
                        fallthrough;
                case Opt_gid_eq:
                case Opt_egid_eq:
                        eid_token = (token == Opt_egid_eq) ||
                                    (token == Opt_egid_gt) ||
                                    (token == Opt_egid_lt);

                        ima_log_string_op(ab, eid_token ? "egid" : "gid",
                                          args[0].from, token);

                        if (gid_valid(entry->gid)) {
                                result = -EINVAL;
                                break;
                        }

                        result = kstrtoul(args[0].from, 10, &lnum);
                        if (!result) {
                                entry->gid = make_kgid(current_user_ns(),
                                                       (gid_t)lnum);
                                if (!gid_valid(entry->gid) ||
                                    (((gid_t)lnum) != lnum))
                                        result = -EINVAL;
                                else
                                        entry->flags |= eid_token
                                            ? IMA_EGID : IMA_GID;
                        }
                        break;
                case Opt_fowner_gt:
                        entry->fowner_op = &vfsuid_gt_kuid;
                        fallthrough;
                case Opt_fowner_lt:
                        if (token == Opt_fowner_lt)
                                entry->fowner_op = &vfsuid_lt_kuid;
                        fallthrough;
                case Opt_fowner_eq:
                        ima_log_string_op(ab, "fowner", args[0].from, token);

                        if (uid_valid(entry->fowner)) {
                                result = -EINVAL;
                                break;
                        }

                        result = kstrtoul(args[0].from, 10, &lnum);
                        if (!result) {
                                entry->fowner = make_kuid(current_user_ns(),
                                                          (uid_t)lnum);
                                if (!uid_valid(entry->fowner) ||
                                    (((uid_t)lnum) != lnum))
                                        result = -EINVAL;
                                else
                                        entry->flags |= IMA_FOWNER;
                        }
                        break;
                case Opt_fgroup_gt:
                        entry->fgroup_op = &vfsgid_gt_kgid;
                        fallthrough;
                case Opt_fgroup_lt:
                        if (token == Opt_fgroup_lt)
                                entry->fgroup_op = &vfsgid_lt_kgid;
                        fallthrough;
                case Opt_fgroup_eq:
                        ima_log_string_op(ab, "fgroup", args[0].from, token);

                        if (gid_valid(entry->fgroup)) {
                                result = -EINVAL;
                                break;
                        }

                        result = kstrtoul(args[0].from, 10, &lnum);
                        if (!result) {
                                entry->fgroup = make_kgid(current_user_ns(),
                                                          (gid_t)lnum);
                                if (!gid_valid(entry->fgroup) ||
                                    (((gid_t)lnum) != lnum))
                                        result = -EINVAL;
                                else
                                        entry->flags |= IMA_FGROUP;
                        }
                        break;
                case Opt_obj_user:
                        ima_log_string(ab, "obj_user", args[0].from);
                        result = ima_lsm_rule_init(entry, args,
                                                   LSM_OBJ_USER,
                                                   AUDIT_OBJ_USER);
                        break;
                case Opt_obj_role:
                        ima_log_string(ab, "obj_role", args[0].from);
                        result = ima_lsm_rule_init(entry, args,
                                                   LSM_OBJ_ROLE,
                                                   AUDIT_OBJ_ROLE);
                        break;
                case Opt_obj_type:
                        ima_log_string(ab, "obj_type", args[0].from);
                        result = ima_lsm_rule_init(entry, args,
                                                   LSM_OBJ_TYPE,
                                                   AUDIT_OBJ_TYPE);
                        break;
                case Opt_subj_user:
                        ima_log_string(ab, "subj_user", args[0].from);
                        result = ima_lsm_rule_init(entry, args,
                                                   LSM_SUBJ_USER,
                                                   AUDIT_SUBJ_USER);
                        break;
                case Opt_subj_role:
                        ima_log_string(ab, "subj_role", args[0].from);
                        result = ima_lsm_rule_init(entry, args,
                                                   LSM_SUBJ_ROLE,
                                                   AUDIT_SUBJ_ROLE);
                        break;
                case Opt_subj_type:
                        ima_log_string(ab, "subj_type", args[0].from);
                        result = ima_lsm_rule_init(entry, args,
                                                   LSM_SUBJ_TYPE,
                                                   AUDIT_SUBJ_TYPE);
                        break;
                case Opt_digest_type:
                        ima_log_string(ab, "digest_type", args[0].from);
                        if (entry->flags & IMA_DIGSIG_REQUIRED)
                                result = -EINVAL;
                        else if ((strcmp(args[0].from, "verity")) == 0)
                                entry->flags |= IMA_VERITY_REQUIRED;
                        else
                                result = -EINVAL;
                        break;
                case Opt_appraise_type:
                        ima_log_string(ab, "appraise_type", args[0].from);

                        if ((strcmp(args[0].from, "imasig")) == 0) {
                                if (entry->flags & IMA_VERITY_REQUIRED)
                                        result = -EINVAL;
                                else
                                        entry->flags |= IMA_DIGSIG_REQUIRED | IMA_CHECK_BLACKLIST;
                        } else if (strcmp(args[0].from, "sigv3") == 0) {
                                /* Only fsverity supports sigv3 for now */
                                if (entry->flags & IMA_VERITY_REQUIRED)
                                        entry->flags |= IMA_DIGSIG_REQUIRED | IMA_CHECK_BLACKLIST;
                                else
                                        result = -EINVAL;
                        } else if (IS_ENABLED(CONFIG_IMA_APPRAISE_MODSIG) &&
                                 strcmp(args[0].from, "imasig|modsig") == 0) {
                                if (entry->flags & IMA_VERITY_REQUIRED)
                                        result = -EINVAL;
                                else
                                        entry->flags |= IMA_DIGSIG_REQUIRED |
                                                IMA_MODSIG_ALLOWED | IMA_CHECK_BLACKLIST;
                        } else {
                                result = -EINVAL;
                        }
                        break;
                case Opt_appraise_flag:
                        ima_log_string(ab, "appraise_flag", args[0].from);
                        break;
                case Opt_appraise_algos:
                        ima_log_string(ab, "appraise_algos", args[0].from);

                        if (entry->allowed_algos) {
                                result = -EINVAL;
                                break;
                        }

                        entry->allowed_algos =
                                ima_parse_appraise_algos(args[0].from);
                        /* invalid or empty list of algorithms */
                        if (!entry->allowed_algos) {
                                result = -EINVAL;
                                break;
                        }

                        entry->flags |= IMA_VALIDATE_ALGOS;

                        break;
                case Opt_permit_directio:
                        entry->flags |= IMA_PERMIT_DIRECTIO;
                        break;
                case Opt_pcr:
                        ima_log_string(ab, "pcr", args[0].from);

                        result = kstrtoint(args[0].from, 10, &entry->pcr);
                        if (result || INVALID_PCR(entry->pcr))
                                result = -EINVAL;
                        else
                                entry->flags |= IMA_PCR;

                        break;
                case Opt_template:
                        ima_log_string(ab, "template", args[0].from);
                        if (entry->action != MEASURE) {
                                result = -EINVAL;
                                break;
                        }
                        template_desc = lookup_template_desc(args[0].from);
                        if (!template_desc || entry->template) {
                                result = -EINVAL;
                                break;
                        }

                        /*
                         * template_desc_init_fields() does nothing if
                         * the template is already initialised, so
                         * it's safe to do this unconditionally
                         */
                        template_desc_init_fields(template_desc->fmt,
                                                 &(template_desc->fields),
                                                 &(template_desc->num_fields));
                        entry->template = template_desc;
                        break;
                case Opt_err:
                        ima_log_string(ab, "UNKNOWN", p);
                        result = -EINVAL;
                        break;
                }
        }
        if (!result && !ima_validate_rule(entry))
                result = -EINVAL;
        else if (entry->action == APPRAISE)
                temp_ima_appraise |= ima_appraise_flag(entry->func);

        if (!result && entry->flags & IMA_MODSIG_ALLOWED) {
                template_desc = entry->template ? entry->template :
                                                  ima_template_desc_current();
                check_template_modsig(template_desc);
        }

        /* d-ngv2 template field recommended for unsigned fs-verity digests */
        if (!result && entry->action == MEASURE &&
            entry->flags & IMA_VERITY_REQUIRED) {
                template_desc = entry->template ? entry->template :
                                                  ima_template_desc_current();
                check_template_field(template_desc, "d-ngv2",
                                     "verity rules should include d-ngv2");
        }

        audit_log_format(ab, "res=%d", !result);
        audit_log_end(ab);
        return result;
}

/**
 * ima_parse_add_rule - parse one policy line and queue the resulting rule
 * @rule: buffer holding the policy text; the first line is cut off in place
 *
 * The first line of @rule is split off at the newline and leading blanks
 * and tabs are skipped.  Empty lines and lines starting with '#' are
 * consumed without creating a rule.  Anything else is handed to
 * ima_parse_rule() and, when that succeeds, appended to ima_temp_rules.
 * Allocation and parse failures are reported via integrity_audit_msg().
 *
 * Return: number of bytes consumed (line length plus one for the
 * separator) on success, a negative error code on failure.
 */
ssize_t ima_parse_add_rule(char *rule)
{
        static const char op[] = "update_policy";
        struct ima_rule_entry *new_rule;
        const char *cause;
        char *line;
        ssize_t consumed, err;
        int audit_info = 0;

        line = strsep(&rule, "\n");
        consumed = strlen(line) + 1;
        line += strspn(line, " \t");

        /* Blank lines and comments are accepted but produce no rule. */
        if (*line == '\0' || *line == '#')
                return consumed;

        new_rule = kzalloc(sizeof(*new_rule), GFP_KERNEL);
        if (!new_rule) {
                err = -ENOMEM;
                cause = "-ENOMEM";
                goto audit_failure;
        }

        INIT_LIST_HEAD(&new_rule->list);

        err = ima_parse_rule(line, new_rule);
        if (err) {
                ima_free_rule(new_rule);
                cause = "invalid-policy";
                goto audit_failure;
        }

        list_add_tail(&new_rule->list, &ima_temp_rules);
        return consumed;

audit_failure:
        integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL, op, cause,
                            err, audit_info);
        return err;
}

/**
 * ima_delete_rules() - throw away a policy that was being loaded
 *
 * Resets the appraise flags accumulated for the in-flight policy and
 * frees every rule still queued on ima_temp_rules.  The temp list is
 * distinct from the active rule list, so no locking is taken here;
 * there is only ever one caller at a time.
 */
void ima_delete_rules(void)
{
        struct ima_rule_entry *rule, *next;

        temp_ima_appraise = 0;

        /* _safe variant: each rule is unlinked and freed while iterating */
        list_for_each_entry_safe(rule, next, &ima_temp_rules, list) {
                list_del(&rule->list);
                ima_free_rule(rule);
        }
}

/*
 * Expander handed to __ima_hooks(): turns the first argument of each entry
 * into a string literal followed by a comma; the second argument is unused.
 */
#define __ima_hook_stringify(func, str)        (#func),

/*
 * Hook names generated from the __ima_hooks() list.  policy_func_show()
 * indexes this table with an enum ima_hooks value.
 */
const char *const func_tokens[] = {
        __ima_hooks(__ima_hook_stringify)
};

#ifdef        CONFIG_IMA_READ_POLICY
/* Indices into mask_tokens[] below; the order of the two must match. */
enum {
        mask_exec = 0, mask_write, mask_read, mask_append
};

/*
 * Printable names of the access-mask bits.  Each string starts with '^';
 * ima_policy_show() skips that first character (offset 1) when the rule
 * has IMA_MASK set and prints it otherwise.
 */
static const char *const mask_tokens[] = {
        "^MAY_EXEC",
        "^MAY_WRITE",
        "^MAY_READ",
        "^MAY_APPEND"
};

/*
 * seq_file start callback: return the rule at position *pos of the list
 * currently published through ima_rules, or NULL when *pos is past the end.
 * The RCU read lock is only held for the duration of the walk.
 */
void *ima_policy_start(struct seq_file *m, loff_t *pos)
{
        struct ima_rule_entry *rule, *found = NULL;
        struct list_head *rules;
        loff_t remaining = *pos;

        rcu_read_lock();
        rules = rcu_dereference(ima_rules);
        list_for_each_entry_rcu(rule, rules, list) {
                if (remaining-- == 0) {
                        found = rule;
                        break;
                }
        }
        rcu_read_unlock();

        return found;
}

/*
 * seq_file next callback: step from rule @v to its successor and bump *pos.
 * Returns NULL once the successor would be one of the list heads
 * (ima_default_rules or ima_policy_rules), i.e. the walk wrapped around.
 */
void *ima_policy_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct ima_rule_entry *cur = v;
        struct ima_rule_entry *succ;

        rcu_read_lock();
        succ = list_entry_rcu(cur->list.next, struct ima_rule_entry, list);
        rcu_read_unlock();
        (*pos)++;

        if (&succ->list == &ima_default_rules ||
            &succ->list == &ima_policy_rules)
                return NULL;

        return succ;
}

/*
 * seq_file stop callback.  Intentionally empty: ima_policy_start() and
 * ima_policy_next() release the RCU read lock themselves before returning,
 * so there is nothing left to undo here.
 */
void ima_policy_stop(struct seq_file *m, void *v)
{
}

/* Shorthands used when printing a rule: */
/* pt(): pattern string of a policy token, used as a seq_printf() format */
#define pt(token)        policy_tokens[token].pattern
/* mt(): printable name of an access-mask bit (see mask_tokens[]) */
#define mt(token)        mask_tokens[token]

/*
 * policy_func_show - print the "func=" part of a policy rule
 *
 * Hooks strictly between 0 and MAX_CHECK are printed by name using
 * func_tokens[]; any other value is printed as a plain number.
 */
static void policy_func_show(struct seq_file *m, enum ima_hooks func)
{
        if (func <= 0 || func >= MAX_CHECK) {
                seq_printf(m, "func=%d ", func);
                return;
        }

        seq_printf(m, "func=%s ", func_tokens[func]);
}

/* Print the items of @opt_list separated by '|', with no trailing separator. */
static void ima_show_rule_opt_list(struct seq_file *m,
                                   const struct ima_rule_opt_list *opt_list)
{
        const char *sep = "";
        size_t idx;

        for (idx = 0; idx < opt_list->count; idx++) {
                seq_printf(m, "%s%s", sep, opt_list->items[idx]);
                sep = "|";
        }
}

/*
 * Print the names of all hash algorithms whose bit is set in
 * @allowed_hashes as a comma-separated list (bit N corresponds to
 * hash_algo_name[N]).
 */
static void ima_policy_show_appraise_algos(struct seq_file *m,
                                           unsigned int allowed_hashes)
{
        int printed = 0;
        int algo;

        for (algo = 0; algo < HASH_ALGO__LAST; algo++) {
                if (allowed_hashes & (1U << algo)) {
                        /* a comma goes in front of every name but the first */
                        if (printed)
                                seq_puts(m, ",");
                        printed++;

                        seq_puts(m, hash_algo_name[algo]);
                }
        }
}

/**
 * ima_policy_show - seq_file show callback: print one policy rule
 * @m: seq_file the rule text is written to
 * @v: rule to print (a struct ima_rule_entry pointer from the iterator)
 *
 * The rule is written as one line: the action keyword, a blank, then one
 * "key=value " token for every condition or option flagged in
 * entry->flags, and finally a newline.  A rule that carries an LSM label
 * string without an attached LSM rule is skipped entirely.
 *
 * uid, gid, fowner and fgroup values are unsigned and are printed with
 * "%u", so ids of 2^31 and above are not rendered as negative numbers.
 * fsname is passed to seq_printf() directly instead of through the
 * 64-byte scratch buffer, so long names are not truncated.
 *
 * Return: always 0.
 */
int ima_policy_show(struct seq_file *m, void *v)
{
        struct ima_rule_entry *entry = v;
        int i;
        char tbuf[64] = {0,};
        int offset = 0;

        rcu_read_lock();

        /* Do not print rules with inactive LSM labels */
        for (i = 0; i < MAX_LSM_RULES; i++) {
                if (entry->lsm[i].args_p && !entry->lsm[i].rule) {
                        rcu_read_unlock();
                        return 0;
                }
        }

        if (entry->action & MEASURE)
                seq_puts(m, pt(Opt_measure));
        if (entry->action & DONT_MEASURE)
                seq_puts(m, pt(Opt_dont_measure));
        if (entry->action & APPRAISE)
                seq_puts(m, pt(Opt_appraise));
        if (entry->action & DONT_APPRAISE)
                seq_puts(m, pt(Opt_dont_appraise));
        if (entry->action & AUDIT)
                seq_puts(m, pt(Opt_audit));
        if (entry->action & HASH)
                seq_puts(m, pt(Opt_hash));
        if (entry->action & DONT_HASH)
                seq_puts(m, pt(Opt_dont_hash));

        seq_puts(m, " ");

        if (entry->flags & IMA_FUNC)
                policy_func_show(m, entry->func);

        if ((entry->flags & IMA_MASK) || (entry->flags & IMA_INMASK)) {
                /*
                 * The mask names start with '^'; with IMA_MASK set the
                 * name is printed from its second character onwards.
                 */
                if (entry->flags & IMA_MASK)
                        offset = 1;
                if (entry->mask & MAY_EXEC)
                        seq_printf(m, pt(Opt_mask), mt(mask_exec) + offset);
                if (entry->mask & MAY_WRITE)
                        seq_printf(m, pt(Opt_mask), mt(mask_write) + offset);
                if (entry->mask & MAY_READ)
                        seq_printf(m, pt(Opt_mask), mt(mask_read) + offset);
                if (entry->mask & MAY_APPEND)
                        seq_printf(m, pt(Opt_mask), mt(mask_append) + offset);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_FSMAGIC) {
                snprintf(tbuf, sizeof(tbuf), "0x%lx", entry->fsmagic);
                seq_printf(m, pt(Opt_fsmagic), tbuf);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_FSNAME) {
                /* printed straight from the rule: no 63-byte truncation */
                seq_printf(m, pt(Opt_fsname), entry->fsname);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_KEYRINGS) {
                seq_puts(m, "keyrings=");
                ima_show_rule_opt_list(m, entry->keyrings);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_LABEL) {
                seq_puts(m, "label=");
                ima_show_rule_opt_list(m, entry->label);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_PCR) {
                snprintf(tbuf, sizeof(tbuf), "%d", entry->pcr);
                seq_printf(m, pt(Opt_pcr), tbuf);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_FSUUID) {
                seq_printf(m, "fsuuid=%pU", &entry->fsuuid);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_UID) {
                snprintf(tbuf, sizeof(tbuf), "%u", __kuid_val(entry->uid));
                if (entry->uid_op == &uid_gt)
                        seq_printf(m, pt(Opt_uid_gt), tbuf);
                else if (entry->uid_op == &uid_lt)
                        seq_printf(m, pt(Opt_uid_lt), tbuf);
                else
                        seq_printf(m, pt(Opt_uid_eq), tbuf);
                seq_puts(m, " ");
        }

        /* euid rules read the same uid/uid_op fields as uid rules */
        if (entry->flags & IMA_EUID) {
                snprintf(tbuf, sizeof(tbuf), "%u", __kuid_val(entry->uid));
                if (entry->uid_op == &uid_gt)
                        seq_printf(m, pt(Opt_euid_gt), tbuf);
                else if (entry->uid_op == &uid_lt)
                        seq_printf(m, pt(Opt_euid_lt), tbuf);
                else
                        seq_printf(m, pt(Opt_euid_eq), tbuf);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_GID) {
                snprintf(tbuf, sizeof(tbuf), "%u", __kgid_val(entry->gid));
                if (entry->gid_op == &gid_gt)
                        seq_printf(m, pt(Opt_gid_gt), tbuf);
                else if (entry->gid_op == &gid_lt)
                        seq_printf(m, pt(Opt_gid_lt), tbuf);
                else
                        seq_printf(m, pt(Opt_gid_eq), tbuf);
                seq_puts(m, " ");
        }

        /* egid rules read the same gid/gid_op fields as gid rules */
        if (entry->flags & IMA_EGID) {
                snprintf(tbuf, sizeof(tbuf), "%u", __kgid_val(entry->gid));
                if (entry->gid_op == &gid_gt)
                        seq_printf(m, pt(Opt_egid_gt), tbuf);
                else if (entry->gid_op == &gid_lt)
                        seq_printf(m, pt(Opt_egid_lt), tbuf);
                else
                        seq_printf(m, pt(Opt_egid_eq), tbuf);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_FOWNER) {
                snprintf(tbuf, sizeof(tbuf), "%u", __kuid_val(entry->fowner));
                if (entry->fowner_op == &vfsuid_gt_kuid)
                        seq_printf(m, pt(Opt_fowner_gt), tbuf);
                else if (entry->fowner_op == &vfsuid_lt_kuid)
                        seq_printf(m, pt(Opt_fowner_lt), tbuf);
                else
                        seq_printf(m, pt(Opt_fowner_eq), tbuf);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_FGROUP) {
                snprintf(tbuf, sizeof(tbuf), "%u", __kgid_val(entry->fgroup));
                if (entry->fgroup_op == &vfsgid_gt_kgid)
                        seq_printf(m, pt(Opt_fgroup_gt), tbuf);
                else if (entry->fgroup_op == &vfsgid_lt_kgid)
                        seq_printf(m, pt(Opt_fgroup_lt), tbuf);
                else
                        seq_printf(m, pt(Opt_fgroup_eq), tbuf);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_VALIDATE_ALGOS) {
                seq_puts(m, "appraise_algos=");
                ima_policy_show_appraise_algos(m, entry->allowed_algos);
                seq_puts(m, " ");
        }

        /* Only LSM conditions that have a rule attached are printed. */
        for (i = 0; i < MAX_LSM_RULES; i++) {
                if (entry->lsm[i].rule) {
                        switch (i) {
                        case LSM_OBJ_USER:
                                seq_printf(m, pt(Opt_obj_user),
                                           entry->lsm[i].args_p);
                                break;
                        case LSM_OBJ_ROLE:
                                seq_printf(m, pt(Opt_obj_role),
                                           entry->lsm[i].args_p);
                                break;
                        case LSM_OBJ_TYPE:
                                seq_printf(m, pt(Opt_obj_type),
                                           entry->lsm[i].args_p);
                                break;
                        case LSM_SUBJ_USER:
                                seq_printf(m, pt(Opt_subj_user),
                                           entry->lsm[i].args_p);
                                break;
                        case LSM_SUBJ_ROLE:
                                seq_printf(m, pt(Opt_subj_role),
                                           entry->lsm[i].args_p);
                                break;
                        case LSM_SUBJ_TYPE:
                                seq_printf(m, pt(Opt_subj_type),
                                           entry->lsm[i].args_p);
                                break;
                        }
                        seq_puts(m, " ");
                }
        }
        if (entry->template)
                seq_printf(m, "template=%s ", entry->template->name);
        if (entry->flags & IMA_DIGSIG_REQUIRED) {
                if (entry->flags & IMA_VERITY_REQUIRED)
                        seq_puts(m, "appraise_type=sigv3 ");
                else if (entry->flags & IMA_MODSIG_ALLOWED)
                        seq_puts(m, "appraise_type=imasig|modsig ");
                else
                        seq_puts(m, "appraise_type=imasig ");
        }
        if (entry->flags & IMA_VERITY_REQUIRED)
                seq_puts(m, "digest_type=verity ");
        if (entry->flags & IMA_PERMIT_DIRECTIO)
                seq_puts(m, "permit_directio ");
        rcu_read_unlock();
        seq_puts(m, "\n");
        return 0;
}
#endif        /* CONFIG_IMA_READ_POLICY */

#if defined(CONFIG_IMA_APPRAISE) && defined(CONFIG_INTEGRITY_TRUSTED_KEYRING)
/*
 * ima_appraise_signature: whether IMA will appraise a given function using
 * an IMA digital signature. This is restricted to cases where the kernel
 * has a set of built-in trusted keys in order to avoid an attacker simply
 * loading additional keys.
 */
bool ima_appraise_signature(enum kernel_read_file_id id)
{
        struct ima_rule_entry *entry;
        bool found = false;
        enum ima_hooks func;
        struct list_head *ima_rules_tmp;

        if (id >= READING_MAX_ID)
                return false;

        if (id == READING_KEXEC_IMAGE && !(ima_appraise & IMA_APPRAISE_ENFORCE)
            && security_locked_down(LOCKDOWN_KEXEC))
                return false;

        func = read_idmap[id] ?: FILE_CHECK;

        rcu_read_lock();
        ima_rules_tmp = rcu_dereference(ima_rules);
        list_for_each_entry_rcu(entry, ima_rules_tmp, list) {
                if (entry->action != APPRAISE)
                        continue;

                /*
                 * A generic entry will match, but otherwise require that it
                 * match the func we're looking for
                 */
                if (entry->func && entry->func != func)
                        continue;

                /*
                 * We require this to be a digital signature, not a raw IMA
                 * hash.
                 */
                if (entry->flags & IMA_DIGSIG_REQUIRED)
                        found = true;

                /*
                 * We've found a rule that matches, so break now even if it
                 * didn't require a digital signature - a later rule that does
                 * won't override it, so would be a false positive.
                 */
                break;
        }

        rcu_read_unlock();
        return found;
}
#endif /* CONFIG_IMA_APPRAISE && CONFIG_INTEGRITY_TRUSTED_KEYRING */





































































































    1 






























    1 

















    1 


















































    1 











    1 




















    2 



    2 























































    2 






    1 










    1 










    1 





    1 

    1 









    1 

    1 







    1 



    1 























    1 




























    1 










    1 







































    1 





    1 
    1 





    1 







    1 


    1 












































    1 


































    1 


    1 
    1 
    1 












    1 



























































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
// SPDX-License-Identifier: GPL-2.0
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/tcp.h>
#include <linux/hash.h>
#include <linux/tcp_metrics.h>
#include <linux/vmalloc.h>

#include <net/inet_connection_sock.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/genetlink.h>

/* Forward declaration: tcpm_new() calls this before its definition below. */
static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
                                                   const struct inetpeer_addr *daddr,
                                                   struct net *net, unsigned int hash);

/*
 * TCP Fast Open state kept alongside the cached metrics of a peer.
 * tcpm_suck_dst() resets mss, syn_loss, try_exp and the cookie's exp/len
 * while holding fastopen_seqlock for writing.
 */
struct tcp_fastopen_metrics {
        u16        mss;
        u16        syn_loss:10,                /* Recurring Fast Open SYN losses */
                try_exp:2;                /* Request w/ exp. option (once) */
        unsigned long        last_syn_loss;        /* Last Fast Open SYN loss */
        struct        tcp_fastopen_cookie        cookie;
};

/* TCP_METRIC_MAX includes 2 extra fields that exist only for userspace
 * compatibility.  The kernel itself stores RTT and RTTVAR in usec
 * resolution only, so its value array stops two indices earlier.
 */
#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2)

/*
 * One cached set of TCP metrics, keyed by (source address, destination
 * address, network namespace) and chained into a hash bucket.
 *
 * tcpm_next  - next entry in the same bucket, walked under RCU
 * tcpm_net   - owning namespace; accessed with READ_ONCE()/WRITE_ONCE()
 * tcpm_stamp - jiffies of the last refresh from a dst entry
 * tcpm_lock  - bitmask, bit (1 << TCP_METRIC_x) set when metric x is locked
 * tcpm_vals  - metric values, indexed by enum tcp_metric_index
 */
struct tcp_metrics_block {
        struct tcp_metrics_block __rcu        *tcpm_next;
        struct net                        *tcpm_net;
        struct inetpeer_addr                tcpm_saddr;
        struct inetpeer_addr                tcpm_daddr;
        unsigned long                        tcpm_stamp;
        u32                                tcpm_lock;
        u32                                tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];
        struct tcp_fastopen_metrics        tcpm_fastopen;

        struct rcu_head                        rcu_head;
};

/* Return the network namespace a metrics entry belongs to. */
static inline struct net *tm_net(const struct tcp_metrics_block *tm)
{
        /* Lockless read; the matching WRITE_ONCE() lives in tcpm_new() */
        struct net *owner = READ_ONCE(tm->tcpm_net);

        return owner;
}

static bool tcp_metric_locked(struct tcp_metrics_block *tm,
                              enum tcp_metric_index idx)
{
        /* Paired with WRITE_ONCE() in tcpm_suck_dst() */
        return READ_ONCE(tm->tcpm_lock) & (1 << idx);
}

/* Fetch the stored value of metric @idx from @tm. */
static u32 tcp_metric_get(const struct tcp_metrics_block *tm,
                          enum tcp_metric_index idx)
{
        /* Lockless read; the matching WRITE_ONCE() is in tcp_metric_set() */
        u32 stored = READ_ONCE(tm->tcpm_vals[idx]);

        return stored;
}

/* Store @val as the new value of metric @idx in @tm. */
static void tcp_metric_set(struct tcp_metrics_block *tm,
                           enum tcp_metric_index idx,
                           u32 val)
{
        u32 *slot = &tm->tcpm_vals[idx];

        /* Lockless write; the matching READ_ONCE() is in tcp_metric_get() */
        WRITE_ONCE(*slot, val);
}

static bool addr_same(const struct inetpeer_addr *a,
                      const struct inetpeer_addr *b)
{
        return (a->family == b->family) && !inetpeer_addr_cmp(a, b);
}

/* One hash bucket: head of a singly linked, RCU-traversed entry chain. */
struct tcpm_hash_bucket {
        struct tcp_metrics_block __rcu        *chain;
};

/* Bucket array of the metrics cache. */
static struct tcpm_hash_bucket        *tcp_metrics_hash __read_mostly;
/* Bit count passed to hash_32() to reduce a hash to a bucket index. */
static unsigned int                tcp_metrics_hash_log __read_mostly;

/* Taken (BH disabled) by tcpm_new() while it inserts or recycles entries. */
static DEFINE_SPINLOCK(tcp_metrics_lock);
/* Write side is taken by tcpm_suck_dst() when it resets tcpm_fastopen. */
static DEFINE_SEQLOCK(fastopen_seqlock);

/*
 * (Re)initialise a metrics entry from the route metrics of @dst.
 *
 * The timestamp is refreshed, the lock bitmask is rebuilt from the locked
 * dst metrics, and RTT, RTTVAR, SSTHRESH, CWND and REORDERING are copied
 * over (RTT and RTTVAR are multiplied by USEC_PER_MSEC on the way).
 * With @fastopen_clear set, the Fast Open state is reset as well, under
 * the fastopen seqlock.
 */
static void tcpm_suck_dst(struct tcp_metrics_block *tm,
                          const struct dst_entry *dst,
                          bool fastopen_clear)
{
        u32 locked = 0;
        u32 rtt_ms;
        u32 rttvar_ms;

        WRITE_ONCE(tm->tcpm_stamp, jiffies);

        if (dst_metric_locked(dst, RTAX_RTT))
                locked |= 1 << TCP_METRIC_RTT;
        if (dst_metric_locked(dst, RTAX_RTTVAR))
                locked |= 1 << TCP_METRIC_RTTVAR;
        if (dst_metric_locked(dst, RTAX_SSTHRESH))
                locked |= 1 << TCP_METRIC_SSTHRESH;
        if (dst_metric_locked(dst, RTAX_CWND))
                locked |= 1 << TCP_METRIC_CWND;
        if (dst_metric_locked(dst, RTAX_REORDERING))
                locked |= 1 << TCP_METRIC_REORDERING;
        /* Lockless write; the matching READ_ONCE() is in tcp_metric_locked() */
        WRITE_ONCE(tm->tcpm_lock, locked);

        rtt_ms = dst_metric_raw(dst, RTAX_RTT);
        tcp_metric_set(tm, TCP_METRIC_RTT, rtt_ms * USEC_PER_MSEC);

        rttvar_ms = dst_metric_raw(dst, RTAX_RTTVAR);
        tcp_metric_set(tm, TCP_METRIC_RTTVAR, rttvar_ms * USEC_PER_MSEC);

        tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
                       dst_metric_raw(dst, RTAX_SSTHRESH));
        tcp_metric_set(tm, TCP_METRIC_CWND,
                       dst_metric_raw(dst, RTAX_CWND));
        tcp_metric_set(tm, TCP_METRIC_REORDERING,
                       dst_metric_raw(dst, RTAX_REORDERING));

        if (!fastopen_clear)
                return;

        write_seqlock(&fastopen_seqlock);
        tm->tcpm_fastopen.mss = 0;
        tm->tcpm_fastopen.syn_loss = 0;
        tm->tcpm_fastopen.try_exp = 0;
        tm->tcpm_fastopen.cookie.exp = false;
        tm->tcpm_fastopen.cookie.len = 0;
        write_sequnlock(&fastopen_seqlock);
}

/* Age, in jiffies (one hour), after which tcpm_check_stamp() refreshes an entry. */
#define TCP_METRICS_TIMEOUT                (60 * 60 * HZ)

/*
 * Refresh @tm from @dst when its timestamp is older than
 * TCP_METRICS_TIMEOUT.  A NULL @tm is accepted and ignored.  The Fast
 * Open state is left alone by the refresh.
 */
static void tcpm_check_stamp(struct tcp_metrics_block *tm,
                             const struct dst_entry *dst)
{
        unsigned long expires;

        if (tm) {
                expires = READ_ONCE(tm->tcpm_stamp) + TCP_METRICS_TIMEOUT;
                if (unlikely(time_after(jiffies, expires)))
                        tcpm_suck_dst(tm, dst, false);
        }
}

/*
 * A failed lookup that walked more than TCP_METRICS_RECLAIM_DEPTH chain
 * entries is reported as TCP_METRICS_RECLAIM_PTR instead of NULL (see
 * tcp_get_encode()).  The sentinel is never a valid entry and must not
 * be dereferenced.
 */
#define TCP_METRICS_RECLAIM_DEPTH        5
#define TCP_METRICS_RECLAIM_PTR                (struct tcp_metrics_block *) 0x1UL

/* Dereference an __rcu pointer from code that holds tcp_metrics_lock. */
#define deref_locked(p)        \
        rcu_dereference_protected(p, lockdep_is_held(&tcp_metrics_lock))

/*
 * Return the metrics entry for (@saddr, @daddr, namespace of @dst->dev) in
 * bucket @hash, creating it if necessary.  Runs under tcp_metrics_lock with
 * bottom halves disabled.
 *
 * If the entry already exists it is returned (after a staleness check).
 * Otherwise a new entry is allocated with GFP_ATOMIC and linked at the head
 * of the bucket, unless the lookup signalled that the chain is too long; in
 * that case the entry with the oldest timestamp in the bucket is overwritten
 * in place instead.
 *
 * Returns the entry, or NULL when the allocation fails.
 */
static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
                                          struct inetpeer_addr *saddr,
                                          struct inetpeer_addr *daddr,
                                          unsigned int hash)
{
        struct tcp_metrics_block *tm;
        struct net *net;
        bool reclaim = false;

        spin_lock_bh(&tcp_metrics_lock);
        net = dev_net(dst->dev);

        /* While waiting for the spin-lock the cache might have been populated
         * with this entry and so we have to check again.
         */
        tm = __tcp_get_metrics(saddr, daddr, net, hash);
        /* The sentinel means "not found, and the chain is long": recycle. */
        if (tm == TCP_METRICS_RECLAIM_PTR) {
                reclaim = true;
                tm = NULL;
        }
        if (tm) {
                tcpm_check_stamp(tm, dst);
                goto out_unlock;
        }

        if (unlikely(reclaim)) {
                struct tcp_metrics_block *oldest;

                /*
                 * Pick the entry with the earliest tcpm_stamp in this bucket.
                 * The head is dereferenced without a NULL check; reclaim is
                 * only set after the lookup above walked more than
                 * TCP_METRICS_RECLAIM_DEPTH entries of this chain.
                 */
                oldest = deref_locked(tcp_metrics_hash[hash].chain);
                for (tm = deref_locked(oldest->tcpm_next); tm;
                     tm = deref_locked(tm->tcpm_next)) {
                        if (time_before(READ_ONCE(tm->tcpm_stamp),
                                        READ_ONCE(oldest->tcpm_stamp)))
                                oldest = tm;
                }
                tm = oldest;
        } else {
                /* GFP_ATOMIC: we are holding a spinlock with BHs disabled */
                tm = kzalloc(sizeof(*tm), GFP_ATOMIC);
                if (!tm)
                        goto out_unlock;
        }
        /* Paired with the READ_ONCE() in tm_net() */
        WRITE_ONCE(tm->tcpm_net, net);

        tm->tcpm_saddr = *saddr;
        tm->tcpm_daddr = *daddr;

        /* A recycled entry also gets its Fast Open state reset. */
        tcpm_suck_dst(tm, dst, reclaim);

        /*
         * Only a freshly allocated entry needs linking; it is fully
         * initialised before rcu_assign_pointer() publishes it at the head
         * of the chain.  A recycled entry stays where it already is.
         */
        if (likely(!reclaim)) {
                tm->tcpm_next = tcp_metrics_hash[hash].chain;
                rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm);
        }

out_unlock:
        spin_unlock_bh(&tcp_metrics_lock);
        return tm;
}

/*
 * Turn a lookup result into its return value: a found entry is passed
 * through, a miss after walking more than TCP_METRICS_RECLAIM_DEPTH
 * entries becomes TCP_METRICS_RECLAIM_PTR, any other miss stays NULL.
 */
static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
{
        if (!tm && depth > TCP_METRICS_RECLAIM_DEPTH)
                return TCP_METRICS_RECLAIM_PTR;

        return tm;
}

/*
 * Search bucket @hash for the entry matching source address, destination
 * address and network namespace.  The number of non-matching entries
 * stepped over is fed to tcp_get_encode(), so a miss in a long chain is
 * reported as TCP_METRICS_RECLAIM_PTR rather than NULL.
 */
static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
                                                   const struct inetpeer_addr *daddr,
                                                   struct net *net, unsigned int hash)
{
        struct tcp_metrics_block *tm;
        int skipped = 0;

        tm = rcu_dereference(tcp_metrics_hash[hash].chain);
        while (tm) {
                if (addr_same(&tm->tcpm_saddr, saddr) &&
                    addr_same(&tm->tcpm_daddr, daddr) &&
                    net_eq(tm_net(tm), net))
                        break;

                skipped++;
                tm = rcu_dereference(tm->tcpm_next);
        }

        return tcp_get_encode(tm, skipped);
}

static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
                                                       struct dst_entry *dst)
{
        struct tcp_metrics_block *tm;
        struct inetpeer_addr saddr, daddr;
        unsigned int hash;
        struct net *net;

        saddr.family = req->rsk_ops->family;
        daddr.family = req->rsk_ops->family;
        switch (daddr.family) {
        case AF_INET:
                inetpeer_set_addr_v4(&saddr, inet_rsk(req)->ir_loc_addr);
                inetpeer_set_addr_v4(&daddr, inet_rsk(req)->ir_rmt_addr);
                hash = ipv4_addr_hash(inet_rsk(req)->ir_rmt_addr);
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                inetpeer_set_addr_v6(&saddr, &inet_rsk(req)->ir_v6_loc_addr);
                inetpeer_set_addr_v6(&daddr, &inet_rsk(req)->ir_v6_rmt_addr);
                hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
                break;
#endif
        default:
                return NULL;
        }

        net = dev_net(dst->dev);
        hash ^= net_hash_mix(net);
        hash = hash_32(hash, tcp_metrics_hash_log);

        for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
             tm = rcu_dereference(tm->tcpm_next)) {
                if (addr_same(&tm->tcpm_saddr, &saddr) &&
                    addr_same(&tm->tcpm_daddr, &daddr) &&
                    net_eq(tm_net(tm), net))
                        break;
        }
        tcpm_check_stamp(tm, dst);
        return tm;
}

/* Find the cached metrics entry for the peer of @sk, optionally creating it.
 *
 * The key is the socket's source/destination address pair; an AF_INET6
 * socket whose peer is a v4-mapped address is keyed by its IPv4 addresses.
 * Any other family yields NULL. When nothing usable is cached and @create is
 * set, the result of tcpm_new() is returned; otherwise tcpm_check_stamp() is
 * run on the lookup result (which may be NULL) before it is returned.
 */
static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
                                                 struct dst_entry *dst,
                                                 bool create)
{
        struct inetpeer_addr saddr, daddr;
        struct tcp_metrics_block *tm;
        unsigned int hash;
        struct net *net;

        switch (sk->sk_family) {
        case AF_INET:
                inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr);
                inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr);
                hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr);
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                if (!ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
                        inetpeer_set_addr_v6(&saddr, &sk->sk_v6_rcv_saddr);
                        inetpeer_set_addr_v6(&daddr, &sk->sk_v6_daddr);
                        hash = ipv6_addr_hash(&sk->sk_v6_daddr);
                        break;
                }
                /* v4-mapped peer: key the entry by the IPv4 addresses. */
                inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr);
                inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr);
                hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr);
                break;
#endif
        default:
                return NULL;
        }

        net = dev_net(dst->dev);
        hash = hash_32(hash ^ net_hash_mix(net), tcp_metrics_hash_log);

        tm = __tcp_get_metrics(&saddr, &daddr, net, hash);
        /* The reclaim marker is a hint for tcpm_new(), not a usable entry. */
        if (tm == TCP_METRICS_RECLAIM_PTR)
                tm = NULL;

        if (create && !tm)
                return tcpm_new(dst, &saddr, &daddr, hash);

        tcpm_check_stamp(tm, dst);
        return tm;
}

/* Save metrics learned by this TCP session into the per-destination cache.
 *
 * This function is called only when TCP finishes successfully, i.e. when
 * it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
 *
 * Nothing is saved when the tcp_nometrics_save sysctl is set or the socket
 * has no cached route. Each metric is only touched when it is not locked.
 * The whole lookup/update runs inside an RCU read-side critical section.
 */
void tcp_update_metrics(struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        struct tcp_metrics_block *tm;
        unsigned long rtt;
        u32 val;
        int m;

        sk_dst_confirm(sk);
        if (READ_ONCE(net->ipv4.sysctl_tcp_nometrics_save) || !dst)
                return;

        rcu_read_lock();
        if (icsk->icsk_backoff || !tp->srtt_us) {
                /* This session failed to estimate rtt. Why?
                 * Probably, no packets returned in time.  Reset our
                 * results.
                 */
                /* Look up only (create == false): an existing entry gets its
                 * RTT cleared, but no new entry is made for a failed session.
                 */
                tm = tcp_get_metrics(sk, dst, false);
                if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
                        tcp_metric_set(tm, TCP_METRIC_RTT, 0);
                goto out_unlock;
        } else
                tm = tcp_get_metrics(sk, dst, true);

        if (!tm)
                goto out_unlock;

        /* m is the signed difference between the cached RTT and this
         * connection's smoothed RTT; it is reused below for RTTVAR.
         */
        rtt = tcp_metric_get(tm, TCP_METRIC_RTT);
        m = rtt - tp->srtt_us;

        /* If newly calculated rtt larger than stored one, store new
         * one. Otherwise, use EWMA. Remember, rtt overestimation is
         * always better than underestimation.
         */
        if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
                if (m <= 0)
                        rtt = tp->srtt_us;
                else
                        rtt -= (m >> 3);
                tcp_metric_set(tm, TCP_METRIC_RTT, rtt);
        }

        if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
                unsigned long var;

                /* Work with the magnitude of the difference from here on. */
                if (m < 0)
                        m = -m;

                /* Scale deviation to rttvar fixed point */
                m >>= 1;
                if (m < tp->mdev_us)
                        m = tp->mdev_us;

                /* Raise the cached variance immediately, but let it decay
                 * only by a quarter of the gap per update.
                 */
                var = tcp_metric_get(tm, TCP_METRIC_RTTVAR);
                if (m >= var)
                        var = m;
                else
                        var -= (var - m) >> 2;

                tcp_metric_set(tm, TCP_METRIC_RTTVAR, var);
        }

        if (tcp_in_initial_slowstart(tp)) {
                /* Slow start still did not finish. */
                /* Only ever raise an already-set ssthresh / cwnd here. */
                if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
                    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
                        val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
                        if (val && (tcp_snd_cwnd(tp) >> 1) > val)
                                tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
                                               tcp_snd_cwnd(tp) >> 1);
                }
                if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
                        val = tcp_metric_get(tm, TCP_METRIC_CWND);
                        if (tcp_snd_cwnd(tp) > val)
                                tcp_metric_set(tm, TCP_METRIC_CWND,
                                               tcp_snd_cwnd(tp));
                }
        } else if (!tcp_in_slow_start(tp) &&
                   icsk->icsk_ca_state == TCP_CA_Open) {
                /* Cong. avoidance phase, cwnd is reliable. */
                if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
                    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
                        tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
                                       max(tcp_snd_cwnd(tp) >> 1, tp->snd_ssthresh));
                /* Cached cwnd becomes the average of old and current. */
                if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
                        val = tcp_metric_get(tm, TCP_METRIC_CWND);
                        tcp_metric_set(tm, TCP_METRIC_CWND, (val + tcp_snd_cwnd(tp)) >> 1);
                }
        } else {
                /* Else slow start did not finish, cwnd is non-sense,
                 * ssthresh may be also invalid.
                 */
                /* Average the cached cwnd with the current ssthresh instead. */
                if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
                        val = tcp_metric_get(tm, TCP_METRIC_CWND);
                        tcp_metric_set(tm, TCP_METRIC_CWND,
                                       (val + tp->snd_ssthresh) >> 1);
                }
                if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
                    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
                        val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
                        if (val && tp->snd_ssthresh > val)
                                tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
                                               tp->snd_ssthresh);
                }
                /* Store a larger reordering value only when it differs from
                 * the sysctl_tcp_reordering setting.
                 */
                if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
                        val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
                        if (val < tp->reordering &&
                            tp->reordering !=
                            READ_ONCE(net->ipv4.sysctl_tcp_reordering))
                                tcp_metric_set(tm, TCP_METRIC_REORDERING,
                                               tp->reordering);
                }
        }
        /* Record when this entry was last refreshed. */
        WRITE_ONCE(tm->tcpm_stamp, jiffies);
out_unlock:
        rcu_read_unlock();
}

/* Initialize metrics on socket.
 *
 * Resets snd_ssthresh to TCP_INFINITE_SSTHRESH, then, if the socket has a
 * cached route and a metrics entry exists for it, seeds the cwnd clamp,
 * ssthresh and reordering from that entry. Finally the initial RTO is
 * derived from the cached RTT, or set to the fallback timeout when no RTT
 * sample is available at all.
 */

void tcp_init_metrics(struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        struct tcp_metrics_block *tm;
        u32 val, crtt = 0; /* cached RTT scaled by 8 */

        sk_dst_confirm(sk);
        /* ssthresh may have been reduced unnecessarily during
         * 3WHS. Restore it back to its initial default.
         */
        tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
        if (!dst)
                goto reset;

        /* Lookup only: initialisation never creates a cache entry. */
        rcu_read_lock();
        tm = tcp_get_metrics(sk, dst, false);
        if (!tm) {
                rcu_read_unlock();
                goto reset;
        }

        /* A locked CWND metric is applied as the cwnd clamp. */
        if (tcp_metric_locked(tm, TCP_METRIC_CWND))
                tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);

        /* The cached ssthresh is ignored when the sysctl disables it, and is
         * never allowed to exceed the cwnd clamp.
         */
        val = READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) ?
              0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
        if (val) {
                tp->snd_ssthresh = val;
                if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
                        tp->snd_ssthresh = tp->snd_cwnd_clamp;
        }
        val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
        if (val && tp->reordering != val)
                tp->reordering = val;

        crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
        rcu_read_unlock();
reset:
        /* The initial RTT measurement from the SYN/SYN-ACK is not ideal
         * to seed the RTO for later data packets because SYN packets are
         * small. Use the per-dst cached values to seed the RTO but keep
         * the RTT estimator variables intact (e.g., srtt, mdev, rttvar).
         * Later the RTO will be updated immediately upon obtaining the first
         * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only
         * influences the first RTO but not later RTT estimation.
         *
         * But if RTT is not available from the SYN (due to retransmits or
         * syn cookies) or the cache, force a conservative 3secs timeout.
         *
         * A bit of theory. RTT is time passed after "normal" sized packet
         * is sent until it is ACKed. In normal circumstances sending small
         * packets force peer to delay ACKs and calculation is correct too.
         * The algorithm is adaptive and, provided we follow specs, it
         * NEVER underestimate RTT. BUT! If peer tries to make some clever
         * tricks sort of "quick acks" for time long enough to decrease RTT
         * to low value, and then abruptly stops to do it and starts to delay
         * ACKs, wait for troubles.
         */
        if (crtt > tp->srtt_us) {
                /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
                crtt /= 8 * USEC_PER_SEC / HZ;
                inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
        } else if (tp->srtt_us == 0) {
                /* RFC6298: 5.7 We've failed to get a valid RTT sample from
                 * 3WHS. This is most likely due to retransmission,
                 * including spurious one. Reset the RTO back to 3secs
                 * from the more aggressive 1sec to avoid more spurious
                 * retransmission.
                 */
                tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK);
                tp->mdev_us = tp->mdev_max_us = tp->rttvar_us;

                inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
        }
}

/* Report whether the peer of connection request @req already has a cached,
 * non-zero RTT metric. Returns false when @dst is NULL or no entry exists.
 */
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
{
        bool proven = false;

        if (dst) {
                struct tcp_metrics_block *tm;

                rcu_read_lock();
                tm = __tcp_get_metrics_req(req, dst);
                proven = tm && tcp_metric_get(tm, TCP_METRIC_RTT);
                rcu_read_unlock();
        }

        return proven;
}

/* Fetch cached TCP Fast Open state for the peer of @sk.
 *
 * @mss:    overwritten with the cached MSS only when one is cached (non-zero)
 * @cookie: overwritten with the cached cookie; additionally cookie->exp is
 *          set when no cookie is cached (len <= 0) and the entry's try_exp
 *          field equals 1
 *
 * The outputs are left untouched when the socket has no cached route or no
 * metrics entry exists. The copy is retried under fastopen_seqlock until a
 * consistent snapshot has been read.
 */
void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
                            struct tcp_fastopen_cookie *cookie)
{
        struct tcp_metrics_block *tm = NULL;
        struct dst_entry *dst;

        rcu_read_lock();
        /* tcp_get_metrics() dereferences dst->dev unconditionally, so a
         * socket without a cached route must not reach it; this mirrors the
         * !dst check done by tcp_fastopen_cache_set().
         */
        dst = __sk_dst_get(sk);
        if (dst)
                tm = tcp_get_metrics(sk, dst, false);
        if (tm) {
                struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
                unsigned int seq;

                do {
                        seq = read_seqbegin(&fastopen_seqlock);
                        if (tfom->mss)
                                *mss = tfom->mss;
                        *cookie = tfom->cookie;
                        if (cookie->len <= 0 && tfom->try_exp == 1)
                                cookie->exp = true;
                } while (read_seqretry(&fastopen_seqlock, seq));
        }
        rcu_read_unlock();
}

/* Record TCP Fast Open state in the metrics entry for the peer of @sk,
 * creating the entry if needed. Does nothing when the socket has no cached
 * route or no entry could be obtained.
 *
 * @mss:      stored when non-zero
 * @cookie:   stored when non-NULL with a positive length
 * @syn_lost: true bumps the SYN loss counter and timestamps it, false
 *            clears the counter
 * @try_exp:  stored only when no cookie was supplied, it is larger than the
 *            stored value, and the cached cookie is empty and not marked exp
 *
 * All updates happen under fastopen_seqlock.
 */
void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
                            struct tcp_fastopen_cookie *cookie, bool syn_lost,
                            u16 try_exp)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_fastopen_metrics *tfom;
        struct tcp_metrics_block *tm;

        if (!dst)
                return;

        rcu_read_lock();
        tm = tcp_get_metrics(sk, dst, true);
        if (!tm)
                goto out;

        tfom = &tm->tcpm_fastopen;

        write_seqlock_bh(&fastopen_seqlock);
        if (mss)
                tfom->mss = mss;

        if (cookie && cookie->len > 0) {
                tfom->cookie = *cookie;
        } else if (try_exp > tfom->try_exp &&
                   tfom->cookie.len <= 0 && !tfom->cookie.exp) {
                tfom->try_exp = try_exp;
        }

        if (!syn_lost) {
                tfom->syn_loss = 0;
        } else {
                ++tfom->syn_loss;
                tfom->last_syn_loss = jiffies;
        }
        write_sequnlock_bh(&fastopen_seqlock);
out:
        rcu_read_unlock();
}

/* Forward declaration: the family is defined after its ops table further
 * down, but the netlink handlers in between need to reference it.
 */
static struct genl_family tcp_metrics_nl_family;

/* Netlink attribute policy for the tcp_metrics generic netlink family.
 * Only the destination address attributes have an active entry; the
 * source address attributes have no entry in this table, and everything
 * inside the #if 0 block is compiled out.
 */
static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
        [TCP_METRICS_ATTR_ADDR_IPV4]        = { .type = NLA_U32, },
        [TCP_METRICS_ATTR_ADDR_IPV6]        = { .type = NLA_BINARY,
                                            .len = sizeof(struct in6_addr), },
        /* Following attributes are not received for GET/DEL,
         * we keep them for reference
         */
#if 0
        [TCP_METRICS_ATTR_AGE]                = { .type = NLA_MSECS, },
        [TCP_METRICS_ATTR_TW_TSVAL]        = { .type = NLA_U32, },
        [TCP_METRICS_ATTR_TW_TS_STAMP]        = { .type = NLA_S32, },
        [TCP_METRICS_ATTR_VALS]                = { .type = NLA_NESTED, },
        [TCP_METRICS_ATTR_FOPEN_MSS]        = { .type = NLA_U16, },
        [TCP_METRICS_ATTR_FOPEN_SYN_DROPS]        = { .type = NLA_U16, },
        [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS]        = { .type = NLA_MSECS, },
        [TCP_METRICS_ATTR_FOPEN_COOKIE]        = { .type = NLA_BINARY,
                                            .len = TCP_FASTOPEN_COOKIE_MAX, },
#endif
};

/* Add attributes, caller cancels its header on failure.
 *
 * Serialises one metrics entry @tm into @msg: destination and source
 * address, age, a nested block of non-zero metric values, and the Fast Open
 * fields that are set.
 *
 * Returns 0 on success, -EAFNOSUPPORT when the entry's destination family is
 * neither AF_INET nor AF_INET6, and -EMSGSIZE when any attribute could not
 * be added.
 */
static int tcp_metrics_fill_info(struct sk_buff *msg,
                                 struct tcp_metrics_block *tm)
{
        struct nlattr *nest;
        int i;

        /* Destination first, then source, using the family of the
         * destination address to pick the attribute types.
         */
        switch (tm->tcpm_daddr.family) {
        case AF_INET:
                if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4,
                                    inetpeer_get_addr_v4(&tm->tcpm_daddr)) < 0)
                        goto nla_put_failure;
                if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4,
                                    inetpeer_get_addr_v4(&tm->tcpm_saddr)) < 0)
                        goto nla_put_failure;
                break;
        case AF_INET6:
                if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6,
                                     inetpeer_get_addr_v6(&tm->tcpm_daddr)) < 0)
                        goto nla_put_failure;
                if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6,
                                     inetpeer_get_addr_v6(&tm->tcpm_saddr)) < 0)
                        goto nla_put_failure;
                break;
        default:
                return -EAFNOSUPPORT;
        }

        /* Age is the time elapsed since the entry's stamp was last written. */
        if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
                          jiffies - READ_ONCE(tm->tcpm_stamp),
                          TCP_METRICS_ATTR_PAD) < 0)
                goto nla_put_failure;

        {
                /* n counts the attributes placed in the nest so an empty
                 * nest can be cancelled instead of emitted.
                 */
                int n = 0;

                nest = nla_nest_start_noflag(msg, TCP_METRICS_ATTR_VALS);
                if (!nest)
                        goto nla_put_failure;
                /* Nested attribute types are the metric index plus one;
                 * metrics whose value is zero are left out.
                 */
                for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
                        u32 val = tcp_metric_get(tm, i);

                        if (!val)
                                continue;
                        /* RTT and RTTVAR are emitted twice: the stored value
                         * under the *_US attribute, then that value divided
                         * by 1000 (but at least 1) under the metric's own
                         * attribute. Presumably usec -> msec; confirm
                         * against the uapi header.
                         */
                        if (i == TCP_METRIC_RTT) {
                                if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1,
                                                val) < 0)
                                        goto nla_put_failure;
                                n++;
                                val = max(val / 1000, 1U);
                        }
                        if (i == TCP_METRIC_RTTVAR) {
                                if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1,
                                                val) < 0)
                                        goto nla_put_failure;
                                n++;
                                val = max(val / 1000, 1U);
                        }
                        if (nla_put_u32(msg, i + 1, val) < 0)
                                goto nla_put_failure;
                        n++;
                }
                if (n)
                        nla_nest_end(msg, nest);
                else
                        nla_nest_cancel(msg, nest);
        }

        {
                struct tcp_fastopen_metrics tfom_copy[1], *tfom;
                unsigned int seq;

                /* Take a consistent snapshot of the Fast Open fields; the
                 * copy is retried while a writer holds fastopen_seqlock.
                 */
                do {
                        seq = read_seqbegin(&fastopen_seqlock);
                        tfom_copy[0] = tm->tcpm_fastopen;
                } while (read_seqretry(&fastopen_seqlock, seq));

                /* Each Fast Open attribute is emitted only when set. */
                tfom = tfom_copy;
                if (tfom->mss &&
                    nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
                                tfom->mss) < 0)
                        goto nla_put_failure;
                if (tfom->syn_loss &&
                    (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
                                tfom->syn_loss) < 0 ||
                     nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
                                jiffies - tfom->last_syn_loss,
                                TCP_METRICS_ATTR_PAD) < 0))
                        goto nla_put_failure;
                if (tfom->cookie.len > 0 &&
                    nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
                            tfom->cookie.len, tfom->cookie.val) < 0)
                        goto nla_put_failure;
        }

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

/* Append one multipart TCP_METRICS_CMD_GET message describing @tm to the
 * dump buffer @skb. Returns 0 on success; on any failure the partially
 * built message is cancelled and -EMSGSIZE is returned.
 */
static int tcp_metrics_dump_info(struct sk_buff *skb,
                                 struct netlink_callback *cb,
                                 struct tcp_metrics_block *tm)
{
        void *hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
                                cb->nlh->nlmsg_seq, &tcp_metrics_nl_family,
                                NLM_F_MULTI, TCP_METRICS_CMD_GET);

        if (!hdr)
                return -EMSGSIZE;

        if (tcp_metrics_fill_info(skb, tm) < 0) {
                genlmsg_cancel(skb, hdr);
                return -EMSGSIZE;
        }

        genlmsg_end(skb, hdr);
        return 0;
}

/* Netlink dump callback: emit every metrics entry that belongs to the
 * requesting socket's network namespace.
 *
 * The position is kept in cb->args[]: args[0] is the hash row and args[1]
 * the column (index within the row's chain) to continue from. Returns the
 * last tcp_metrics_dump_info() result, which is negative when the walk
 * stopped early, or 0 when nothing failed.
 */
static int tcp_metrics_nl_dump(struct sk_buff *skb,
                               struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        unsigned int max_rows = 1U << tcp_metrics_hash_log;
        unsigned int row, s_row = cb->args[0];
        int s_col = cb->args[1], col = s_col;
        int res = 0;

        /* s_col only applies to the first row visited; it is reset to 0 for
         * every following row.
         */
        for (row = s_row; row < max_rows; row++, s_col = 0) {
                struct tcp_metrics_block *tm;
                struct tcpm_hash_bucket *hb = tcp_metrics_hash + row;

                rcu_read_lock();
                /* col counts every entry in the chain, including those of
                 * other namespaces that are skipped below.
                 */
                for (col = 0, tm = rcu_dereference(hb->chain); tm;
                     tm = rcu_dereference(tm->tcpm_next), col++) {
                        if (!net_eq(tm_net(tm), net))
                                continue;
                        if (col < s_col)
                                continue;
                        res = tcp_metrics_dump_info(skb, cb, tm);
                        if (res < 0) {
                                /* Stop here; row/col are saved below so the
                                 * next call starts at this entry.
                                 */
                                rcu_read_unlock();
                                goto done;
                        }
                }
                rcu_read_unlock();
        }

done:
        cb->args[0] = row;
        cb->args[1] = col;
        return res;
}

/* Extract an address from the netlink attributes of @info.
 *
 * @v4 and @v6 are the attribute indices to inspect; the IPv4 attribute takes
 * precedence when both are present. On success @addr is filled in and, when
 * @hash is non-NULL, *@hash receives the hash of the address.
 *
 * Returns 0 on success and -EINVAL when the IPv6 attribute has the wrong
 * length. When neither attribute is present the result is 1 if @optional is
 * set and -EAFNOSUPPORT otherwise.
 */
static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
                           unsigned int *hash, int optional, int v4, int v6)
{
        struct nlattr *attr = info->attrs[v4];
        struct in6_addr in6;

        if (attr) {
                inetpeer_set_addr_v4(addr, nla_get_in_addr(attr));
                if (hash)
                        *hash = ipv4_addr_hash(inetpeer_get_addr_v4(addr));
                return 0;
        }

        attr = info->attrs[v6];
        if (!attr)
                return optional ? 1 : -EAFNOSUPPORT;

        if (nla_len(attr) != sizeof(struct in6_addr))
                return -EINVAL;

        in6 = nla_get_in6_addr(attr);
        inetpeer_set_addr_v6(addr, &in6);
        if (hash)
                *hash = ipv6_addr_hash(inetpeer_get_addr_v6(addr));
        return 0;
}

/* Parse the destination address (TCP_METRICS_ATTR_ADDR_IPV4 / _IPV6) from
 * @info into @addr and its hash into *@hash. @optional is passed through to
 * __parse_nl_addr() and selects what a missing address returns.
 */
static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
                         unsigned int *hash, int optional)
{
        return __parse_nl_addr(info, addr, hash, optional,
                               TCP_METRICS_ATTR_ADDR_IPV4,
                               TCP_METRICS_ATTR_ADDR_IPV6);
}

/* Parse the source address (TCP_METRICS_ATTR_SADDR_IPV4 / _IPV6) from @info
 * into @addr. No hash is computed, and the address is not optional here, so
 * a missing attribute yields a negative error from __parse_nl_addr().
 */
static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr)
{
        return __parse_nl_addr(info, addr, NULL, 0,
                               TCP_METRICS_ATTR_SADDR_IPV4,
                               TCP_METRICS_ATTR_SADDR_IPV6);
}

/* Netlink "get" handler: reply with the first metrics entry that matches
 * the requested destination (and source, when one could be parsed) in the
 * caller's network namespace.
 *
 * Returns the result of genlmsg_reply() on success, the parse error for a
 * bad or missing destination, -ENOMEM / -EMSGSIZE when the reply cannot be
 * built, and -ESRCH when no entry matches.
 */
static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
        struct net *net = genl_info_net(info);
        struct inetpeer_addr saddr, daddr;
        struct tcp_metrics_block *tm;
        struct sk_buff *msg;
        unsigned int hash;
        void *reply;
        bool src;
        int ret;

        ret = parse_nl_addr(info, &daddr, &hash, 0);
        if (ret < 0)
                return ret;

        /* Without a usable source address, match on destination alone. */
        src = parse_nl_saddr(info, &saddr) >= 0;

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg)
                return -ENOMEM;

        reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
                                  info->genlhdr->cmd);
        if (!reply) {
                ret = -EMSGSIZE;
                goto out_free;
        }

        hash = hash_32(hash ^ net_hash_mix(net), tcp_metrics_hash_log);

        rcu_read_lock();
        tm = rcu_dereference(tcp_metrics_hash[hash].chain);
        while (tm) {
                if (addr_same(&tm->tcpm_daddr, &daddr) &&
                    (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
                    net_eq(tm_net(tm), net))
                        break;
                tm = rcu_dereference(tm->tcpm_next);
        }
        ret = tm ? tcp_metrics_fill_info(msg, tm) : -ESRCH;
        rcu_read_unlock();

        if (ret < 0)
                goto out_free;

        genlmsg_end(msg, reply);
        return genlmsg_reply(msg, info);

out_free:
        nlmsg_free(msg);
        return ret;
}

/* Remove metrics entries from every hash bucket.
 *
 * With a non-NULL @net, entries belonging to that namespace are removed.
 * With @net == NULL, entries whose namespace reference count (ns.count)
 * reads as zero are removed instead.
 *
 * Removed entries are unlinked under tcp_metrics_lock and freed through
 * kfree_rcu().
 */
static void tcp_metrics_flush_all(struct net *net)
{
        unsigned int max_rows = 1U << tcp_metrics_hash_log;
        struct tcpm_hash_bucket *hb = tcp_metrics_hash;
        struct tcp_metrics_block *tm;
        unsigned int row;

        for (row = 0; row < max_rows; row++, hb++) {
                /* pp always points at the link that references the entry
                 * under inspection (bucket head or a predecessor's next).
                 */
                struct tcp_metrics_block __rcu **pp = &hb->chain;
                bool match;

                /* Empty bucket: skip it without taking the lock. */
                if (!rcu_access_pointer(*pp))
                        continue;

                spin_lock_bh(&tcp_metrics_lock);
                for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
                        match = net ? net_eq(tm_net(tm), net) :
                                !refcount_read(&tm_net(tm)->ns.count);
                        if (match) {
                                /* Unlink; pp stays put so the next pass
                                 * re-reads the successor from the same link.
                                 */
                                rcu_assign_pointer(*pp, tm->tcpm_next);
                                kfree_rcu(tm, rcu_head);
                        } else {
                                pp = &tm->tcpm_next;
                        }
                }
                spin_unlock_bh(&tcp_metrics_lock);
                /* Offer to reschedule between buckets, outside the lock. */
                cond_resched();
        }
}

static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
        struct tcpm_hash_bucket *hb;
        struct tcp_metrics_block *tm;
        struct tcp_metrics_block __rcu **pp;
        struct inetpeer_addr saddr, daddr;
        unsigned int hash;
        struct net *net = genl_info_net(info);
        int ret;
        bool src = true, found = false;

        ret = parse_nl_addr(info, &daddr, &hash, 1);
        if (ret < 0)
                return ret;
        if (ret > 0) {
                tcp_metrics_flush_all(net);
                return 0;
        }
        ret = parse_nl_saddr(info, &saddr);
        if (ret < 0)
                src = false;

        hash ^= net_hash_mix(net);
        hash = hash_32(hash, tcp_metrics_hash_log);
        hb = tcp_metrics_hash + hash;
        pp = &hb->chain;
        spin_lock_bh(&tcp_metrics_lock);
        for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
                if (addr_same(&tm->tcpm_daddr, &daddr) &&
                    (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
                    net_eq(tm_net(tm), net)) {
                        rcu_assign_pointer(*pp, tm->tcpm_next);
                        kfree_rcu(tm, rcu_head);
                        found = true;
                } else {
                        pp = &tm->tcpm_next;
                }
        }
        spin_unlock_bh(&tcp_metrics_lock);
        if (!found)
                return -ESRCH;
        return 0;
}

/* Operations exposed by the tcp_metrics generic netlink family: GET supports
 * both a single lookup (doit) and a dump (dumpit); DEL carries the
 * GENL_ADMIN_PERM flag. Both opt out of strict and dump validation.
 */
static const struct genl_small_ops tcp_metrics_nl_ops[] = {
        {
                .cmd = TCP_METRICS_CMD_GET,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = tcp_metrics_nl_cmd_get,
                .dumpit = tcp_metrics_nl_dump,
        },
        {
                .cmd = TCP_METRICS_CMD_DEL,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = tcp_metrics_nl_cmd_del,
                .flags = GENL_ADMIN_PERM,
        },
};

/* Generic netlink family definition for tcp_metrics. It ties together the
 * attribute policy and the ops table defined earlier in this file and is
 * registered from tcp_metrics_init().
 */
static struct genl_family tcp_metrics_nl_family __ro_after_init = {
        .hdrsize        = 0,
        .name                = TCP_METRICS_GENL_NAME,
        .version        = TCP_METRICS_GENL_VERSION,
        .maxattr        = TCP_METRICS_ATTR_MAX,
        .policy = tcp_metrics_nl_policy,
        .netnsok        = true,
        .parallel_ops        = true,
        .module                = THIS_MODULE,
        .small_ops        = tcp_metrics_nl_ops,
        .n_small_ops        = ARRAY_SIZE(tcp_metrics_nl_ops),
        .resv_start_op        = TCP_METRICS_CMD_DEL + 1,
};

/* Bucket count requested via the "tcpmhash_entries=" boot parameter; a value
 * of 0 makes tcp_metrics_hash_alloc() pick a default.
 */
static unsigned int tcpmhash_entries __initdata;

/* Boot parameter handler: parse @str into tcpmhash_entries.
 * Returns 1 when the value was parsed, 0 when @str is NULL or malformed.
 */
static int __init set_tcpmhash_entries(char *str)
{
        if (!str)
                return 0;

        return kstrtouint(str, 0, &tcpmhash_entries) ? 0 : 1;
}
__setup("tcpmhash_entries=", set_tcpmhash_entries);

/* Allocate the zeroed tcp_metrics hash table at boot.
 *
 * The bucket count comes from tcpmhash_entries when set; otherwise it is
 * 16K buckets when totalram_pages() is at least 128K and 8K buckets below
 * that. The count is rounded up to a power of two via order_base_2().
 * Panics if the allocation fails.
 */
static void __init tcp_metrics_hash_alloc(void)
{
        unsigned int slots = tcpmhash_entries;

        if (!slots)
                slots = (totalram_pages() >= 128 * 1024) ? 16 * 1024 : 8 * 1024;

        tcp_metrics_hash_log = order_base_2(slots);

        tcp_metrics_hash = kvzalloc(sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log,
                                    GFP_KERNEL);
        if (!tcp_metrics_hash)
                panic("Could not allocate the tcp_metrics hash table\n");
}

/* Per-netns batch exit hook. @net_exit_list is not consulted: passing NULL
 * to tcp_metrics_flush_all() makes it remove every entry whose namespace
 * reference count reads as zero.
 */
static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list)
{
        tcp_metrics_flush_all(NULL);
}

/* Pernet operations for tcp_metrics: only a batch exit hook is provided. */
static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
        .exit_batch        =        tcp_net_metrics_exit_batch,
};

/* Boot-time setup of the tcp_metrics cache: allocate the hash table, hook
 * into network namespace teardown, and register the generic netlink family.
 * Any failure is fatal and panics.
 */
void __init tcp_metrics_init(void)
{
        tcp_metrics_hash_alloc();

        if (register_pernet_subsys(&tcp_net_metrics_ops) < 0)
                panic("Could not register tcp_net_metrics_ops\n");

        if (genl_register_family(&tcp_metrics_nl_family) < 0)
                panic("Could not register tcp_metrics generic netlink\n");
}



























    6 
    5 

    6 




    5 



    6 















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * netprio_cgroup.h                        Control Group Priority set
 *
 * Authors:        Neil Horman <nhorman@tuxdriver.com>
 */

#ifndef _NETPRIO_CGROUP_H
#define _NETPRIO_CGROUP_H

#include <linux/cgroup.h>
#include <linux/hardirq.h>
#include <linux/rcupdate.h>

#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
/* Priority map with a trailing flexible array.
 * NOTE(review): the users of this struct are not visible in this header;
 * the field descriptions below are inferred from names and types only.
 */
struct netprio_map {
        struct rcu_head rcu;    /* presumably used for RCU-deferred freeing - confirm */
        u32 priomap_len;        /* presumably the number of entries in priomap[] - confirm */
        u32 priomap[];          /* flexible array member holding the map entries */
};

/* Return the id of the net_prio cgroup subsystem state that task @p belongs
 * to. The css is only dereferenced inside an RCU read-side section.
 */
static inline u32 task_netprioidx(struct task_struct *p)
{
        u32 prioidx;

        rcu_read_lock();
        prioidx = task_css(p, net_prio_cgrp_id)->id;
        rcu_read_unlock();

        return prioidx;
}

/* Stamp @skcd with the net_prio index of the current task. Nothing is done
 * when called in interrupt context.
 */
static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd)
{
        if (!in_interrupt())
                sock_cgroup_set_prioidx(skcd, task_netprioidx(current));
}

#else /* !CONFIG_CGROUP_NET_PRIO */

/* Stub for builds without CONFIG_CGROUP_NET_PRIO: every task reports
 * priority index 0.
 */
static inline u32 task_netprioidx(struct task_struct *p)
{
        return 0;
}

/* Stub for builds without CONFIG_CGROUP_NET_PRIO: intentionally a no-op. */
static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd)
{
}

#endif /* CONFIG_CGROUP_NET_PRIO */
#endif  /* _NETPRIO_CGROUP_H */































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



































    1 



    1 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 











    1 
    1 

    1 






    1 

    1 


    1 





    1 

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2009  Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>
#include <linux/pgalloc_tag.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
        /* This bit is what enabled_show() reports as "[always]". */
        (1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
        /* This bit is what enabled_show() reports as "[madvise]". */
        (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
        /*
         * The remaining bits are set regardless of Kconfig. With only
         * DEFRAG_REQ_MADV set among the defrag mode bits, defrag_show()
         * reports "[madvise]".
         */
        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

/*
 * Deferred-split shrinker handle and forward declarations of its callbacks.
 * NOTE(review): the definitions and the registration are not in this part
 * of the file.
 */
static struct shrinker *deferred_split_shrinker;
static unsigned long deferred_split_count(struct shrinker *shrink,
                                          struct shrink_control *sc);
static unsigned long deferred_split_scan(struct shrinker *shrink,
                                         struct shrink_control *sc);

/*
 * Reference count for huge_zero_folio. get_huge_zero_page() sets it to 2
 * when it installs a folio; shrink_huge_zero_page_scan() frees the folio
 * when it can move the count from 1 to 0.
 */
static atomic_t huge_zero_refcount;
/* The installed huge zero folio, or NULL while none is installed. */
struct folio *huge_zero_folio __read_mostly;
/* PFN of huge_zero_folio; ~0UL while no folio is installed. */
unsigned long huge_zero_pfn __read_mostly = ~0UL;
/*
 * NOTE(review): presumably per-mode bitmasks of anonymous THP orders
 * ("always" / "madvise" / "inherit"); their users are outside this part of
 * the file -- confirm there.
 */
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;

/*
 * __thp_vma_allowable_orders - filter candidate THP orders against a VMA
 * @vma: the VMA being examined
 * @vm_flags: VMA flags to test (VM_NOHUGEPAGE, VM_NO_KHUGEPAGED, VM_HUGEPAGE)
 * @tva_flags: combination of TVA_SMAPS, TVA_IN_PF and TVA_ENFORCE_SYSFS
 * @orders: bitmask of candidate orders
 *
 * @orders is first masked with the orders supported for the VMA type
 * (anonymous or file). Each later check either rejects the VMA outright by
 * returning 0 or lets the remaining orders through; outside the page fault
 * path, the suitability loop may additionally drop orders via next_order().
 *
 * Return: the subset of @orders that is still allowed, or 0 if none is.
 */
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
                                         unsigned long vm_flags,
                                         unsigned long tva_flags,
                                         unsigned long orders)
{
        bool smaps = tva_flags & TVA_SMAPS;
        bool in_pf = tva_flags & TVA_IN_PF;
        bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
        /* Check the intersection of requested and supported orders. */
        orders &= vma_is_anonymous(vma) ?
                        THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
        if (!orders)
                return 0;

        if (!vma->vm_mm)                /* vdso */
                return 0;

        /*
         * Explicitly disabled through madvise or prctl, or some
         * architectures may disable THP for some mappings, for
         * example, s390 kvm.
         */
        if ((vm_flags & VM_NOHUGEPAGE) ||
            test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
                return 0;
        /*
         * If the hardware/firmware marked hugepage support disabled.
         */
        if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
                return 0;

        /* khugepaged doesn't collapse DAX vma, but page fault is fine. */
        if (vma_is_dax(vma))
                return in_pf ? orders : 0;

        /*
         * khugepaged special VMA and hugetlb VMA.
         * Must be checked after dax since some dax mappings may have
         * VM_MIXEDMAP set.
         */
        if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
                return 0;

        /*
         * Check alignment for file vma and size for both file and anon vma by
         * filtering out the unsuitable orders.
         *
         * Skip the check for page fault. Huge fault does the check in fault
         * handlers.
         */
        if (!in_pf) {
                int order = highest_order(orders);
                unsigned long addr;

                /*
                 * Walk from the highest candidate order downwards and stop
                 * at the first one that is suitable for the last
                 * (PAGE_SIZE << order) bytes of the VMA.
                 */
                while (orders) {
                        addr = vma->vm_end - (PAGE_SIZE << order);
                        if (thp_vma_suitable_order(vma, addr, order))
                                break;
                        order = next_order(&orders, order);
                }

                if (!orders)
                        return 0;
        }

        /*
         * Enabled via shmem mount options or sysfs settings.
         * Must be done before hugepage flags check since shmem has its
         * own flags.
         */
        if (!in_pf && shmem_file(vma->vm_file))
                return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
                                     !enforce_sysfs, vma->vm_mm, vm_flags)
                        ? orders : 0;

        if (!vma_is_anonymous(vma)) {
                /*
                 * Enforce sysfs THP requirements as necessary. Anonymous vmas
                 * were already handled in thp_vma_allowable_orders().
                 */
                if (enforce_sysfs &&
                    (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
                                                    !hugepage_global_always())))
                        return 0;

                /*
                 * Trust that ->huge_fault() handlers know what they are doing
                 * in fault path.
                 */
                if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
                        return orders;
                /* Only regular file is valid in collapse path */
                if (((!in_pf || smaps)) && file_thp_enabled(vma))
                        return orders;
                return 0;
        }

        /* Only anonymous VMAs reach this point. */
        if (vma_is_temporary_stack(vma))
                return 0;

        /*
         * THPeligible bit of smaps should show 1 for proper VMAs even
         * though anon_vma is not initialized yet.
         *
         * Allow page fault since anon_vma may be not initialized until
         * the first page fault.
         */
        if (!vma->anon_vma)
                return (smaps || in_pf) ? orders : 0;

        return orders;
}

/*
 * Take a reference on the huge zero folio, allocating and installing one
 * first if the reference count is currently zero.
 *
 * Return: true once a reference has been taken; false if a folio had to be
 * allocated and folio_alloc() failed.
 */
static bool get_huge_zero_page(void)
{
        struct folio *zero_folio;
retry:
        /* Fast path: the count is already non-zero, so just bump it. */
        if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
                return true;

        zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
                        HPAGE_PMD_ORDER);
        if (!zero_folio) {
                count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
                return false;
        }
        preempt_disable();
        /*
         * Publish our folio only if the slot is still NULL. A non-NULL
         * result means a folio is already installed: free ours and go back
         * to the fast path.
         */
        if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
                preempt_enable();
                folio_put(zero_folio);
                goto retry;
        }
        WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));

        /* We take additional reference here. It will be put back by shrinker */
        atomic_set(&huge_zero_refcount, 2);
        preempt_enable();
        count_vm_event(THP_ZERO_PAGE_ALLOC);
        return true;
}

static void put_huge_zero_page(void)
{
        /*
         * Counter should never go to zero here. Only shrinker can put
         * last reference.
         */
        BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

/*
 * Return the huge zero folio for use by @mm, taking a reference on behalf
 * of @mm the first time (tracked by MMF_HUGE_ZERO_PAGE in mm->flags).
 *
 * Return: the current huge_zero_folio, or NULL if get_huge_zero_page()
 * failed.
 */
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
        if (!test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) {
                if (!get_huge_zero_page())
                        return NULL;

                /*
                 * The bit became set between the check above and here, so
                 * the reference just taken is surplus: give it back.
                 */
                if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
                        put_huge_zero_page();
        }

        return READ_ONCE(huge_zero_folio);
}

/*
 * Drop the huge zero folio reference held on behalf of @mm, if
 * MMF_HUGE_ZERO_PAGE says one was taken. The flag itself is not cleared.
 */
void mm_put_huge_zero_folio(struct mm_struct *mm)
{
        if (!test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
                return;

        put_huge_zero_page();
}

/*
 * Shrinker "count" callback for the huge zero folio.
 *
 * The folio is only freeable when exactly one reference is left; in that
 * case HPAGE_PMD_NR is reported, otherwise 0.
 */
static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
                                        struct shrink_control *sc)
{
        if (atomic_read(&huge_zero_refcount) != 1)
                return 0;

        return HPAGE_PMD_NR;
}

/*
 * Shrinker "scan" callback for the huge zero folio.
 *
 * If the reference count can be moved atomically from 1 to 0, the folio is
 * unpublished, huge_zero_pfn is reset to ~0UL and the folio is released.
 *
 * Return: HPAGE_PMD_NR when the folio was freed, 0 otherwise.
 */
static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
                                       struct shrink_control *sc)
{
        struct folio *zero_folio;

        /* Somebody else still holds a reference (or none is installed). */
        if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) != 1)
                return 0;

        zero_folio = xchg(&huge_zero_folio, NULL);
        BUG_ON(!zero_folio);
        WRITE_ONCE(huge_zero_pfn, ~0UL);
        folio_put(zero_folio);

        return HPAGE_PMD_NR;
}

/*
 * Shrinker handle for the huge zero folio.
 * NOTE(review): presumably wired to shrink_huge_zero_page_count()/_scan();
 * the allocation and registration are outside this part of the file.
 */
static struct shrinker *huge_zero_page_shrinker;

#ifdef CONFIG_SYSFS
/*
 * sysfs show handler for "enabled": prints the three modes with the active
 * one in brackets. TRANSPARENT_HUGEPAGE_FLAG takes precedence over
 * TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG; "never" is shown when neither is set.
 */
static ssize_t enabled_show(struct kobject *kobj,
                            struct kobj_attribute *attr, char *buf)
{
        if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
                return sysfs_emit(buf, "%s\n", "[always] madvise never");

        if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
                     &transparent_hugepage_flags))
                return sysfs_emit(buf, "%s\n", "always [madvise] never");

        return sysfs_emit(buf, "%s\n", "always madvise [never]");
}

/*
 * sysfs store handler for "enabled". Accepts "always", "madvise" or
 * "never", updates the two enable bits accordingly and then calls
 * start_stop_khugepaged().
 *
 * Return: @count on success, -EINVAL for any other input, or the error
 * returned by start_stop_khugepaged().
 */
static ssize_t enabled_store(struct kobject *kobj,
                             struct kobj_attribute *attr,
                             const char *buf, size_t count)
{
        ssize_t ret = count;
        int err;

        if (sysfs_streq(buf, "always")) {
                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
                set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
        } else if (sysfs_streq(buf, "madvise")) {
                clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
                set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
        } else if (sysfs_streq(buf, "never")) {
                clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
        } else {
                return -EINVAL;
        }

        /* khugepaged is only poked when a positive byte count is reported. */
        if (ret <= 0)
                return ret;

        err = start_stop_khugepaged();
        return err ? err : ret;
}

/* "enabled" attribute; __ATTR_RW(enabled) binds enabled_show() and enabled_store(). */
static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);

/*
 * Generic show helper for a single bit of transparent_hugepage_flags:
 * emits "1\n" if @flag is set and "0\n" otherwise.
 */
ssize_t single_hugepage_flag_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf,
                                  enum transparent_hugepage_flag flag)
{
        int is_set = !!test_bit(flag, &transparent_hugepage_flags);

        return sysfs_emit(buf, "%d\n", is_set);
}

/*
 * Generic store helper for a single bit of transparent_hugepage_flags.
 * @buf must parse as decimal 0 or 1; 1 sets @flag and 0 clears it.
 *
 * Return: @count on success, the kstrtoul() error if parsing fails, or
 * -EINVAL for any value above 1.
 */
ssize_t single_hugepage_flag_store(struct kobject *kobj,
                                 struct kobj_attribute *attr,
                                 const char *buf, size_t count,
                                 enum transparent_hugepage_flag flag)
{
        unsigned long value;
        int ret = kstrtoul(buf, 10, &value);

        if (ret < 0)
                return ret;

        switch (value) {
        case 0:
                clear_bit(flag, &transparent_hugepage_flags);
                break;
        case 1:
                set_bit(flag, &transparent_hugepage_flags);
                break;
        default:
                return -EINVAL;
        }

        return count;
}

/*
 * Show the current defrag mode: the list of all modes with the active one in
 * square brackets. The flags are tested in priority order (direct, kswapd,
 * kswapd-or-madvise, madvise); when none is set the mode is "never".
 */
static ssize_t defrag_show(struct kobject *kobj,
                           struct kobj_attribute *attr, char *buf)
{
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
                     &transparent_hugepage_flags))
                return sysfs_emit(buf, "%s\n",
                                  "[always] defer defer+madvise madvise never");

        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
                     &transparent_hugepage_flags))
                return sysfs_emit(buf, "%s\n",
                                  "always [defer] defer+madvise madvise never");

        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
                     &transparent_hugepage_flags))
                return sysfs_emit(buf, "%s\n",
                                  "always defer [defer+madvise] madvise never");

        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
                     &transparent_hugepage_flags))
                return sysfs_emit(buf, "%s\n",
                                  "always defer defer+madvise [madvise] never");

        return sysfs_emit(buf, "%s\n",
                          "always defer defer+madvise madvise [never]");
}

/*
 * Select the defrag mode from a sysfs write.
 *
 * Accepts "always", "defer+madvise", "defer", "madvise" or "never" (compared
 * with sysfs_streq()). The three defrag flags that do not belong to the
 * chosen mode are cleared first, then the chosen one is set; "never" clears
 * all four. Returns @count on success or -EINVAL for an unknown keyword.
 */
static ssize_t defrag_store(struct kobject *kobj,
                            struct kobj_attribute *attr,
                            const char *buf, size_t count)
{
        int wanted;     /* flag to leave set, or -1 for "never" */

        if (sysfs_streq(buf, "always"))
                wanted = TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG;
        else if (sysfs_streq(buf, "defer+madvise"))
                wanted = TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG;
        else if (sysfs_streq(buf, "defer"))
                wanted = TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG;
        else if (sysfs_streq(buf, "madvise"))
                wanted = TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG;
        else if (sysfs_streq(buf, "never"))
                wanted = -1;
        else
                return -EINVAL;

        /* Drop every mode except the requested one... */
        if (wanted != TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG)
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
                          &transparent_hugepage_flags);
        if (wanted != TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG)
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
                          &transparent_hugepage_flags);
        if (wanted != TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG)
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
                          &transparent_hugepage_flags);
        if (wanted != TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
                          &transparent_hugepage_flags);

        /* ...and only then turn the requested one on. */
        if (wanted >= 0)
                set_bit(wanted, &transparent_hugepage_flags);

        return count;
}
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);

/* sysfs show handler: report TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG as 0 or 1. */
static ssize_t use_zero_page_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf)
{
        return single_hugepage_flag_show(kobj, attr, buf,
                                         TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
/*
 * sysfs store handler: set or clear TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG.
 * Input validation ("0" or "1" only) is done by single_hugepage_flag_store().
 */
static ssize_t use_zero_page_store(struct kobject *kobj,
                struct kobj_attribute *attr, const char *buf, size_t count)
{
        return single_hugepage_flag_store(kobj, attr, buf, count,
                                 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);

/* sysfs show handler: print HPAGE_PMD_SIZE as an unsigned decimal number. */
static ssize_t hpage_pmd_size_show(struct kobject *kobj,
                                   struct kobj_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
}
/* Read-only: there is no store handler for this attribute. */
static struct kobj_attribute hpage_pmd_size_attr =
        __ATTR_RO(hpage_pmd_size);

/*
 * Attributes registered directly on the "transparent_hugepage" kobject by
 * hugepage_init_sysfs(). The list is NULL-terminated.
 */
static struct attribute *hugepage_attr[] = {
        &enabled_attr.attr,
        &defrag_attr.attr,
        &use_zero_page_attr.attr,
        &hpage_pmd_size_attr.attr,
#ifdef CONFIG_SHMEM
        /* Only present when shmem support is built in. */
        &shmem_enabled_attr.attr,
#endif
        NULL,
};

/* Unnamed group: the files above appear in the kobject's own directory. */
static const struct attribute_group hugepage_attr_group = {
        .attrs = hugepage_attr,
};

/* Forward declarations: both are referenced before their definitions below. */
static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
static void thpsize_release(struct kobject *kobj);
/* Held by thpsize_enabled_store() while it updates the huge_anon_orders_* bitmasks. */
static DEFINE_SPINLOCK(huge_anon_orders_lock);
/* Every thpsize added by hugepage_init_sysfs(); emptied by hugepage_exit_sysfs(). */
static LIST_HEAD(thpsize_list);

/* Per-size sysfs directory object ("hugepages-<N>kB"), one per THP order. */
struct thpsize {
        struct kobject kobj;            /* embedded kobject; see to_thpsize() */
        struct list_head node;          /* link on thpsize_list */
        int order;                      /* page order this directory describes */
};

/* Recover the enclosing struct thpsize from a pointer to its embedded kobject. */
#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)

/*
 * Show the per-size "enabled" mode for the order behind @kobj: the list of
 * modes with the active one in square brackets. The order's bit is looked up
 * in the always, inherit and madvise masks in that sequence; if it is in none
 * of them the mode is "never".
 */
static ssize_t thpsize_enabled_show(struct kobject *kobj,
                                    struct kobj_attribute *attr, char *buf)
{
        int order = to_thpsize(kobj)->order;

        if (test_bit(order, &huge_anon_orders_always))
                return sysfs_emit(buf, "%s\n",
                                  "[always] inherit madvise never");

        if (test_bit(order, &huge_anon_orders_inherit))
                return sysfs_emit(buf, "%s\n",
                                  "always [inherit] madvise never");

        if (test_bit(order, &huge_anon_orders_madvise))
                return sysfs_emit(buf, "%s\n",
                                  "always inherit [madvise] never");

        return sysfs_emit(buf, "%s\n", "always inherit madvise [never]");
}

/*
 * Select the per-size "enabled" mode for the order behind @kobj.
 *
 * Accepts "always", "inherit", "madvise" or "never" (compared with
 * sysfs_streq()). The order's bit is moved between the three
 * huge_anon_orders_* masks under huge_anon_orders_lock so that it is set in
 * at most one of them; "never" clears it from all three.
 *
 * After a successful change khugepaged is re-evaluated with
 * start_stop_khugepaged(), exactly as the global "enabled" store handler
 * does; without this a per-size change would leave khugepaged in whatever
 * state the previous configuration required.
 *
 * Returns @count on success, -EINVAL for an unknown keyword, or the error
 * returned by start_stop_khugepaged().
 */
static ssize_t thpsize_enabled_store(struct kobject *kobj,
                                     struct kobj_attribute *attr,
                                     const char *buf, size_t count)
{
        int order = to_thpsize(kobj)->order;
        ssize_t ret = count;

        if (sysfs_streq(buf, "always")) {
                spin_lock(&huge_anon_orders_lock);
                clear_bit(order, &huge_anon_orders_inherit);
                clear_bit(order, &huge_anon_orders_madvise);
                set_bit(order, &huge_anon_orders_always);
                spin_unlock(&huge_anon_orders_lock);
        } else if (sysfs_streq(buf, "inherit")) {
                spin_lock(&huge_anon_orders_lock);
                clear_bit(order, &huge_anon_orders_always);
                clear_bit(order, &huge_anon_orders_madvise);
                set_bit(order, &huge_anon_orders_inherit);
                spin_unlock(&huge_anon_orders_lock);
        } else if (sysfs_streq(buf, "madvise")) {
                spin_lock(&huge_anon_orders_lock);
                clear_bit(order, &huge_anon_orders_always);
                clear_bit(order, &huge_anon_orders_inherit);
                set_bit(order, &huge_anon_orders_madvise);
                spin_unlock(&huge_anon_orders_lock);
        } else if (sysfs_streq(buf, "never")) {
                spin_lock(&huge_anon_orders_lock);
                clear_bit(order, &huge_anon_orders_always);
                clear_bit(order, &huge_anon_orders_inherit);
                clear_bit(order, &huge_anon_orders_madvise);
                spin_unlock(&huge_anon_orders_lock);
        } else {
                ret = -EINVAL;
        }

        /* Called with the spinlock dropped; mirrors enabled_store(). */
        if (ret > 0) {
                int err = start_stop_khugepaged();

                if (err)
                        ret = err;
        }

        return ret;
}

/*
 * Per-size "enabled" file (mode 0644). Spelled out with __ATTR() because the
 * handlers carry a thpsize_ prefix while the file itself is named "enabled".
 */
static struct kobj_attribute thpsize_enabled_attr =
        __ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);

/* NULL-terminated list of control files in each per-size directory. */
static struct attribute *thpsize_attrs[] = {
        &thpsize_enabled_attr.attr,
        NULL,
};

static const struct attribute_group thpsize_attr_group = {
        .attrs = thpsize_attrs,
};

/* kobject type for struct thpsize: thpsize_release() frees the container. */
static const struct kobj_type thpsize_ktype = {
        .release = &thpsize_release,
        .sysfs_ops = &kobj_sysfs_ops,
};

/* Per-CPU counters, zero-initialised; summed across CPUs by sum_mthp_stat(). */
DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};

static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
{
        unsigned long sum = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct mthp_stat *this = &per_cpu(mthp_stats, cpu);

                sum += this->stats[order][item];
        }

        return sum;
}

/*
 * DEFINE_MTHP_STAT_ATTR(_name, _index) generates, for one counter:
 *   - _name##_show(): prints sum_mthp_stat(order, _index) for the order of the
 *     thpsize that owns the kobject, as "%lu\n";
 *   - _name##_attr: a read-only kobj_attribute named _name bound to it.
 * The macro ends without a semicolon so each use supplies its own.
 */
#define DEFINE_MTHP_STAT_ATTR(_name, _index)                                \
static ssize_t _name##_show(struct kobject *kobj,                        \
                        struct kobj_attribute *attr, char *buf)                \
{                                                                        \
        int order = to_thpsize(kobj)->order;                                \
                                                                        \
        return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));        \
}                                                                        \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);

/* NULL-terminated list of the counters generated above. */
static struct attribute *stats_attrs[] = {
        &anon_fault_alloc_attr.attr,
        &anon_fault_fallback_attr.attr,
        &anon_fault_fallback_charge_attr.attr,
        &swpout_attr.attr,
        &swpout_fallback_attr.attr,
        NULL,
};

/* Named group: the files are placed in a "stats" subdirectory. */
static struct attribute_group stats_attr_group = {
        .name = "stats",
        .attrs = stats_attrs,
};

/*
 * Allocate and register the sysfs directory for one THP order.
 *
 * Creates a "hugepages-<size>kB" kobject under @parent (size derived from
 * PAGE_SIZE << @order) and attaches the control and "stats" attribute groups.
 *
 * Returns the new thpsize on success or an ERR_PTR() on failure. The caller
 * owns the returned object and releases it with kobject_put().
 */
static struct thpsize *thpsize_create(int order, struct kobject *parent)
{
        unsigned long size = (PAGE_SIZE << order) / SZ_1K;
        struct thpsize *thpsize;
        int ret;

        thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
        if (!thpsize)
                return ERR_PTR(-ENOMEM);

        /*
         * Record the order before anything becomes visible in sysfs: the
         * show/store handlers read it via to_thpsize(kobj)->order and could
         * otherwise observe the zeroed value from kzalloc().
         */
        thpsize->order = order;

        ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
                                   "hugepages-%lukB", size);
        if (ret)
                goto err_put;

        ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group);
        if (ret)
                goto err_put;

        ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
        if (ret)
                goto err_put;

        return thpsize;

err_put:
        /*
         * Once kobject_init_and_add() has been called the object must be
         * dropped with kobject_put(), even if that call itself failed: the
         * put frees the kobject's name and then invokes thpsize_release(),
         * which frees the structure. A bare kfree() would leak the name.
         */
        kobject_put(&thpsize->kobj);
        return ERR_PTR(ret);
}

/*
 * kobject release callback (installed through thpsize_ktype): frees the
 * struct thpsize that embeds @kobj.
 */
static void thpsize_release(struct kobject *kobj)
{
        kfree(to_thpsize(kobj));
}

/*
 * Build the transparent hugepage sysfs hierarchy.
 *
 * Creates the "transparent_hugepage" kobject under mm_kobj, attaches the
 * global and khugepaged attribute groups, then creates one per-size child
 * (see thpsize_create()) for each order in THP_ORDERS_ALL_ANON and links it
 * on thpsize_list.
 *
 * On success returns 0 with the new kobject stored in *@hugepage_kobj. On
 * failure returns a negative errno after undoing the steps already taken.
 */
static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
        int err;
        struct thpsize *thpsize;
        unsigned long orders;
        int order;

        /*
         * Default to setting PMD-sized THP to inherit the global setting and
         * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
         * constant so we have to do this here.
         */
        huge_anon_orders_inherit = BIT(PMD_ORDER);

        *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
        if (unlikely(!*hugepage_kobj)) {
                pr_err("failed to create transparent hugepage kobject\n");
                return -ENOMEM;
        }

        err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
        if (err) {
                pr_err("failed to register transparent hugepage group\n");
                goto delete_obj;
        }

        err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
        if (err) {
                pr_err("failed to register transparent hugepage group\n");
                goto remove_hp_group;
        }

        /*
         * Walk the supported orders starting from the highest one.
         * NOTE(review): the loop relies on next_order() removing the current
         * order from 'orders' so the mask eventually becomes empty - confirm
         * against its definition.
         */
        orders = THP_ORDERS_ALL_ANON;
        order = highest_order(orders);
        while (orders) {
                thpsize = thpsize_create(order, *hugepage_kobj);
                if (IS_ERR(thpsize)) {
                        pr_err("failed to create thpsize for order %d\n", order);
                        err = PTR_ERR(thpsize);
                        goto remove_all;
                }
                list_add(&thpsize->node, &thpsize_list);
                order = next_order(&orders, order);
        }

        return 0;

remove_all:
        /*
         * Full teardown: drops every thpsize already on thpsize_list, removes
         * both groups and puts the top-level kobject.
         */
        hugepage_exit_sysfs(*hugepage_kobj);
        return err;
remove_hp_group:
        sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
        kobject_put(*hugepage_kobj);
        return err;
}

/*
 * Tear down everything hugepage_init_sysfs() set up: drop each per-size
 * object on thpsize_list, remove the khugepaged and global attribute groups,
 * and finally put the top-level kobject.
 */
static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
        struct thpsize *entry, *next;

        /* The _safe iterator is required: entries are unlinked as we go. */
        list_for_each_entry_safe(entry, next, &thpsize_list, node) {
                list_del(&entry->node);
                /* Assuming this is the last reference, thpsize_release() frees it. */
                kobject_put(&entry->kobj);
        }

        /* Groups are removed in the reverse order of their creation. */
        sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
        sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);

        kobject_put(hugepage_kobj);
}
#else
/*
 * No-op variant for builds without CONFIG_SYSFS: reports success and leaves
 * *@hugepage_kobj untouched.
 */
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
        return 0;
}

/* No-op variant for builds without CONFIG_SYSFS: there is nothing to undo. */
static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

/*
 * Allocate and register the two THP shrinkers: "thp-zero" for the huge zero
 * page and "thp-deferred_split" for the deferred split queues.
 *
 * Both are allocated before either is registered, so an allocation failure
 * never leaves a live shrinker behind. Returns 0 on success or -ENOMEM.
 */
static int __init thp_shrinker_init(void)
{
        huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
        if (!huge_zero_page_shrinker)
                goto err_nomem;

        deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
                                                 SHRINKER_MEMCG_AWARE |
                                                 SHRINKER_NONSLAB,
                                                 "thp-deferred_split");
        if (!deferred_split_shrinker)
                goto err_free_zero;

        /* Callbacks must be in place before the shrinker is registered. */
        huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
        huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
        shrinker_register(huge_zero_page_shrinker);

        deferred_split_shrinker->count_objects = deferred_split_count;
        deferred_split_shrinker->scan_objects = deferred_split_scan;
        shrinker_register(deferred_split_shrinker);

        return 0;

err_free_zero:
        shrinker_free(huge_zero_page_shrinker);
err_nomem:
        return -ENOMEM;
}

/* Undo thp_shrinker_init(): release both THP shrinkers. */
static void __init thp_shrinker_exit(void)
{
        shrinker_free(huge_zero_page_shrinker);
        shrinker_free(deferred_split_shrinker);
}

/*
 * Subsystem initcall for transparent hugepages.
 *
 * Order of setup: sysfs hierarchy, khugepaged_init(), shrinkers, then
 * start_stop_khugepaged(). Each failure unwinds the earlier steps through the
 * labels at the bottom, in reverse order.
 *
 * Returns 0 on success, -EINVAL when has_transparent_hugepage() reports no
 * support, or the error from the step that failed.
 */
static int __init hugepage_init(void)
{
        int err;
        struct kobject *hugepage_kobj;

        if (!has_transparent_hugepage()) {
                /* Overwrite all flags so that only the UNSUPPORTED bit is set. */
                transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
                return -EINVAL;
        }

        /*
         * hugepages can't be allocated by the buddy allocator
         */
        MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);

        err = hugepage_init_sysfs(&hugepage_kobj);
        if (err)
                goto err_sysfs;

        err = khugepaged_init();
        if (err)
                goto err_slab;

        err = thp_shrinker_init();
        if (err)
                goto err_shrinker;

        /*
         * By default disable transparent hugepages on smaller systems,
         * where the extra memory used could hurt more than TLB overhead
         * is likely to save.  The admin can still enable it through /sys.
         */
        /* 512 << (20 - PAGE_SHIFT) is the number of pages in 512 MiB. */
        if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
                /* Clears every flag; khugepaged is not started on this path. */
                transparent_hugepage_flags = 0;
                return 0;
        }

        err = start_stop_khugepaged();
        if (err)
                goto err_khugepaged;

        return 0;
err_khugepaged:
        thp_shrinker_exit();
err_shrinker:
        khugepaged_destroy();
err_slab:
        /* Reached when khugepaged_init() failed: only sysfs needs undoing. */
        hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
        return err;
}
subsys_initcall(hugepage_init);

/*
 * Handler for the "transparent_hugepage=" boot parameter.
 *
 * Recognises exactly "always", "madvise" and "never" and programs
 * TRANSPARENT_HUGEPAGE_FLAG / TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG to match.
 * Returns 1 when the value was understood; otherwise logs a warning, leaves
 * the flags untouched and returns 0.
 */
static int __init setup_transparent_hugepage(char *str)
{
        if (!str)
                goto unparsed;

        if (!strcmp(str, "always")) {
                set_bit(TRANSPARENT_HUGEPAGE_FLAG,
                        &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
                          &transparent_hugepage_flags);
                return 1;
        }

        if (!strcmp(str, "madvise")) {
                clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
                          &transparent_hugepage_flags);
                set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
                        &transparent_hugepage_flags);
                return 1;
        }

        if (!strcmp(str, "never")) {
                clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
                          &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
                          &transparent_hugepage_flags);
                return 1;
        }

unparsed:
        pr_warn("transparent_hugepage= cannot parse, ignored\n");
        return 0;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);

/*
 * Return @pmd made writable via pmd_mkwrite() when @vma has VM_WRITE set,
 * and @pmd unchanged otherwise.
 */
pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
        if (likely(vma->vm_flags & VM_WRITE))
                return pmd_mkwrite(pmd, vma);

        return pmd;
}

#ifdef CONFIG_MEMCG
/*
 * Pick the deferred split queue for @folio: the one embedded in the folio's
 * memory cgroup when folio_memcg() returns one, otherwise the one in the
 * pglist_data of the folio's node.
 */
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
        struct mem_cgroup *memcg = folio_memcg(folio);
        struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

        return memcg ? &memcg->deferred_split_queue :
                       &pgdat->deferred_split_queue;
}
#else
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
        struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

        return &pgdat->deferred_split_queue;
}
#endif

/*
 * True when @folio is a large folio that is either the huge zero folio or
 * marked large-rmappable; false for anything else, including all small
 * folios.
 */
static inline bool is_transparent_hugepage(const struct folio *folio)
{
        return folio_test_large(folio) &&
               (is_huge_zero_folio(folio) ||
                folio_test_large_rmappable(folio));
}

/*
 * Try to find an unmapped area whose start address is congruent to the file
 * offset @off modulo @size, so that @size-aligned chunks of the file land on
 * @size-aligned virtual addresses.
 *
 * Returns the chosen address, or 0 when no aligned placement is attempted or
 * possible; the caller is expected to fall back to an unpadded search. Note
 * that 0 is used as the "give up" value here, not an errno.
 */
static unsigned long __thp_get_unmapped_area(struct file *filp,
                unsigned long addr, unsigned long len,
                loff_t off, unsigned long flags, unsigned long size,
                vm_flags_t vm_flags)
{
        loff_t off_end = off + len;
        loff_t off_align = round_up(off, size);
        unsigned long len_pad, ret, off_sub;

        /* Never attempted on 32-bit kernels or for compat syscalls. */
        if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall())
                return 0;

        /*
         * Nothing to gain unless the range holds at least one full chunk of
         * @size bytes starting at a @size-aligned file offset.
         */
        if (off_end <= off_align || (off_end - off_align) < size)
                return 0;

        /* Over-allocate by @size so the start can be slid forward; bail on overflow. */
        len_pad = len + size;
        if (len_pad < len || (off + len_pad) < off)
                return 0;

        ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
                                           off >> PAGE_SHIFT, flags, vm_flags);

        /*
         * The failure might be due to length padding. The caller will retry
         * without the padding.
         */
        if (IS_ERR_VALUE(ret))
                return 0;

        /*
         * Do not try to align to THP boundary if allocation at the address
         * hint succeeds.
         */
        if (ret == addr)
                return addr;

        /* Distance to advance so that (ret - off) becomes a multiple of @size. */
        off_sub = (off - ret) & (size - 1);

        /*
         * Already aligned in a top-down layout: return the upper end of the
         * padded area (ret + size) rather than its start.
         */
        if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
                return ret + size;

        ret += off_sub;
        return ret;
}

/*
 * Find an unmapped area, preferring a PMD_SIZE-aligned placement.
 *
 * First asks __thp_get_unmapped_area() for an aligned address; if that
 * returns 0, falls back to a plain mm_get_unmapped_area_vmflags() search
 * with the caller's original arguments.
 */
unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags,
                vm_flags_t vm_flags)
{
        loff_t off = (loff_t)pgoff << PAGE_SHIFT;
        unsigned long aligned;

        aligned = __thp_get_unmapped_area(filp, addr, len, off, flags,
                                          PMD_SIZE, vm_flags);
        if (!aligned)
                return mm_get_unmapped_area_vmflags(current->mm, filp, addr,
                                                    len, pgoff, flags,
                                                    vm_flags);

        return aligned;
}

/*
 * Convenience wrapper around thp_get_unmapped_area_vmflags() that passes 0
 * for vm_flags. Exported (GPL-only).
 */
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
{
        return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

/*
 * Install a freshly allocated huge page at the faulting PMD.
 *
 * @page must belong to a large folio; this function consumes the caller's
 * reference to it on every path (it is either mapped or put).
 *
 * Steps: charge the folio to the memcg, preallocate a page table to deposit,
 * zero the huge page, then, under the PMD lock, map it if the PMD is still
 * empty.
 *
 * Returns 0 when the page was mapped, and also when the PMD turned out to be
 * populated already (the new folio is simply dropped). Returns
 * VM_FAULT_FALLBACK if the memcg charge fails, VM_FAULT_OOM if the page table
 * cannot be allocated, the check_stable_address_space() result if non-zero,
 * or the handle_userfault() result when the VMA has userfaultfd missing-mode
 * armed.
 */
static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
                        struct page *page, gfp_t gfp)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = page_folio(page);
        pgtable_t pgtable;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        vm_fault_t ret = 0;

        VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

        if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
                folio_put(folio);
                count_vm_event(THP_FAULT_FALLBACK);
                count_vm_event(THP_FAULT_FALLBACK_CHARGE);
                count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
                count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
                return VM_FAULT_FALLBACK;
        }
        folio_throttle_swaprate(folio, gfp);

        /* Allocated up front, outside the PMD lock; deposited on success. */
        pgtable = pte_alloc_one(vma->vm_mm);
        if (unlikely(!pgtable)) {
                ret = VM_FAULT_OOM;
                goto release;
        }

        clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
        /*
         * The memory barrier inside __folio_mark_uptodate makes sure that
         * clear_huge_page writes become visible before the set_pmd_at()
         * write.
         */
        __folio_mark_uptodate(folio);

        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        if (unlikely(!pmd_none(*vmf->pmd))) {
                /* PMD already populated: discard our page; ret is still 0. */
                goto unlock_release;
        } else {
                pmd_t entry;

                ret = check_stable_address_space(vma->vm_mm);
                if (ret)
                        goto unlock_release;

                /* Deliver the page fault to userland */
                if (userfaultfd_missing(vma)) {
                        /* Everything is released before calling into userfaultfd. */
                        spin_unlock(vmf->ptl);
                        folio_put(folio);
                        pte_free(vma->vm_mm, pgtable);
                        ret = handle_userfault(vmf, VM_UFFD_MISSING);
                        VM_BUG_ON(ret & VM_FAULT_FALLBACK);
                        return ret;
                }

                entry = mk_huge_pmd(page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                folio_add_new_anon_rmap(folio, vma, haddr);
                folio_add_lru_vma(folio, vma);
                pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
                set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
                update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
                mm_inc_nr_ptes(vma->vm_mm);
                spin_unlock(vmf->ptl);
                /* Statistics are updated after the PMD lock is dropped. */
                count_vm_event(THP_FAULT_ALLOC);
                count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
                count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
        }

        return 0;
unlock_release:
        spin_unlock(vmf->ptl);
release:
        /* pgtable is NULL when we arrive here from the allocation failure. */
        if (pgtable)
                pte_free(vma->vm_mm, pgtable);
        folio_put(folio);
        return ret;

}

/*
 * Build the gfp mask for a THP allocation from the current defrag mode:
 *
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *                  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *            available
 * never: never stall for any thp allocation
 *
 * @vma may be NULL, in which case the allocation is treated as not madvised.
 */
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
        const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
        gfp_t gfp = GFP_TRANSHUGE_LIGHT;

        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
                     &transparent_hugepage_flags)) {
                /* Always do synchronous compaction */
                gfp = GFP_TRANSHUGE;
                if (!vma_madvised)
                        gfp |= __GFP_NORETRY;
        } else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
                            &transparent_hugepage_flags)) {
                /* Kick kcompactd and fail quickly */
                gfp |= __GFP_KSWAPD_RECLAIM;
        } else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
                            &transparent_hugepage_flags)) {
                /* Synchronous compaction if madvised, otherwise kick kcompactd */
                if (vma_madvised)
                        gfp |= __GFP_DIRECT_RECLAIM;
                else
                        gfp |= __GFP_KSWAPD_RECLAIM;
        } else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
                            &transparent_hugepage_flags)) {
                /* Only do synchronous compaction if madvised */
                if (vma_madvised)
                        gfp |= __GFP_DIRECT_RECLAIM;
        }

        return gfp;
}

/*
 * Map the huge zero folio at @pmd and deposit @pgtable alongside it.
 *
 * Caller must hold page table lock. Does nothing when @pmd is already
 * populated; in that case @pgtable is not consumed.
 */
static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
                struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
                struct folio *zero_folio)
{
        pmd_t zero_pmd;

        if (!pmd_none(*pmd))
                return;

        zero_pmd = pmd_mkhuge(mk_pmd(&zero_folio->page, vma->vm_page_prot));

        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, haddr, pmd, zero_pmd);
        mm_inc_nr_ptes(mm);
}

/*
 * Handle an anonymous page fault by trying to map a PMD-sized huge page.
 *
 * Read faults may be satisfied with the shared huge zero folio when the mm
 * allows zero pages and transparent_hugepage_use_zero_page() is true. All
 * other faults allocate a new huge folio and hand it to
 * __do_huge_pmd_anonymous_page().
 *
 * Returns VM_FAULT_FALLBACK when a PMD mapping is not suitable or a huge
 * folio cannot be obtained, VM_FAULT_OOM when the page table allocation
 * fails, or the result of the path that handled the fault.
 */
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        gfp_t gfp;
        struct folio *folio;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        vm_fault_t ret;

        if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
                return VM_FAULT_FALLBACK;
        ret = vmf_anon_prepare(vmf);
        if (ret)
                return ret;
        khugepaged_enter_vma(vma, vma->vm_flags);

        if (!(vmf->flags & FAULT_FLAG_WRITE) &&
                        !mm_forbids_zeropage(vma->vm_mm) &&
                        transparent_hugepage_use_zero_page()) {
                pgtable_t pgtable;
                struct folio *zero_folio;
                /* Shadows the function-scope 'ret'; only this one is returned below. */
                vm_fault_t ret;

                pgtable = pte_alloc_one(vma->vm_mm);
                if (unlikely(!pgtable))
                        return VM_FAULT_OOM;
                zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
                if (unlikely(!zero_folio)) {
                        pte_free(vma->vm_mm, pgtable);
                        count_vm_event(THP_FAULT_FALLBACK);
                        return VM_FAULT_FALLBACK;
                }
                vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
                ret = 0;
                if (pmd_none(*vmf->pmd)) {
                        ret = check_stable_address_space(vma->vm_mm);
                        if (ret) {
                                spin_unlock(vmf->ptl);
                                pte_free(vma->vm_mm, pgtable);
                        } else if (userfaultfd_missing(vma)) {
                                /* Lock dropped and pgtable freed before calling into userfaultfd. */
                                spin_unlock(vmf->ptl);
                                pte_free(vma->vm_mm, pgtable);
                                ret = handle_userfault(vmf, VM_UFFD_MISSING);
                                VM_BUG_ON(ret & VM_FAULT_FALLBACK);
                        } else {
                                /* pgtable is deposited by set_huge_zero_folio(). */
                                set_huge_zero_folio(pgtable, vma->vm_mm, vma,
                                                   haddr, vmf->pmd, zero_folio);
                                update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                                spin_unlock(vmf->ptl);
                        }
                } else {
                        /* PMD was populated meanwhile: nothing to do, return 0. */
                        spin_unlock(vmf->ptl);
                        pte_free(vma->vm_mm, pgtable);
                }
                return ret;
        }
        gfp = vma_thp_gfp_mask(vma);
        folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
        if (unlikely(!folio)) {
                count_vm_event(THP_FAULT_FALLBACK);
                count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
        }
        return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
}

/*
 * Install a huge PMD mapping for @pfn at @addr under the PMD lock.
 *
 * If the PMD is already populated, nothing new is installed: for a write
 * fault on the same pfn the existing entry is upgraded (young, dirty and,
 * if the VMA permits, writable); a different pfn is left alone, with a
 * one-time warning unless the existing entry is the huge zero PMD.
 *
 * @pgtable may be NULL. When non-NULL it is deposited together with a newly
 * installed entry; if it is not consumed it is freed before returning.
 */
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
                pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
                pgtable_t pgtable)
{
        struct mm_struct *mm = vma->vm_mm;
        pmd_t entry;
        spinlock_t *ptl;

        ptl = pmd_lock(mm, pmd);
        if (!pmd_none(*pmd)) {
                if (write) {
                        if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
                                WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
                                goto out_unlock;
                        }
                        entry = pmd_mkyoung(*pmd);
                        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                        if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
                                update_mmu_cache_pmd(vma, addr, pmd);
                }

                goto out_unlock;
        }

        entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
        if (pfn_t_devmap(pfn))
                entry = pmd_mkdevmap(entry);
        if (write) {
                entry = pmd_mkyoung(pmd_mkdirty(entry));
                entry = maybe_pmd_mkwrite(entry, vma);
        }

        if (pgtable) {
                pgtable_trans_huge_deposit(mm, pmd, pgtable);
                mm_inc_nr_ptes(mm);
                /* Ownership passed on; prevents the pte_free() below. */
                pgtable = NULL;
        }

        set_pmd_at(mm, addr, pmd, entry);
        update_mmu_cache_pmd(vma, addr, pmd);

out_unlock:
        spin_unlock(ptl);
        /* Still non-NULL only if it was never deposited. */
        if (pgtable)
                pte_free(mm, pgtable);
}

/**
 * vmf_insert_pfn_pmd - insert a pmd size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: VM_FAULT_SIGBUS if the PMD-aligned fault address lies outside the
 * VMA, VM_FAULT_OOM if a required page table cannot be allocated, otherwise
 * VM_FAULT_NOPAGE.
 */
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
        unsigned long addr = vmf->address & PMD_MASK;
        struct vm_area_struct *vma = vmf->vma;
        pgprot_t pgprot = vma->vm_page_prot;
        pgtable_t pgtable = NULL;

        /*
         * If we had pmd_special, we could avoid all these restrictions,
         * but we need to be consistent with PTEs and architectures that
         * can't support a 'special' bit.
         */
        BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
                        !pfn_t_devmap(pfn));
        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
                                                (VM_PFNMAP|VM_MIXEDMAP));
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        /* Only some architectures need a page table deposited with the PMD. */
        if (arch_needs_pgtable_deposit()) {
                pgtable = pte_alloc_one(vma->vm_mm);
                if (!pgtable)
                        return VM_FAULT_OOM;
        }

        /* May adjust pgprot, which is passed by pointer. */
        track_pfn_insert(vma, &pgprot, pfn);

        /* insert_pfn_pmd() deposits pgtable or frees it if unused. */
        insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
        return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * Return @pud made writable via pud_mkwrite() when @vma has VM_WRITE set,
 * and @pud unchanged otherwise.
 */
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
        if (likely(vma->vm_flags & VM_WRITE))
                return pud_mkwrite(pud);

        return pud;
}

static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
                pud_t *pud, pfn_t pfn, bool write)
{
        struct mm_struct *mm = vma->vm_mm;
        pgprot_t prot = vma->vm_page_prot;
        pud_t entry;
        spinlock_t *ptl;

        ptl = pud_lock(mm, pud);
        if (!pud_none(*pud)) {
                if (write) {
                        if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
                                WARN_ON_ONCE(!is_huge_zero_pud(*pud));
                                goto out_unlock;
                        }
                        entry = pud_mkyoung(*pud);
                        entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
                        if (pudp_set_access_flags(vma, addr, pud, entry, 1))
                                update_mmu_cache_pud(vma, addr, pud);
                }
                goto out_unlock;
        }

        entry = pud_mkhuge(pfn_t_pud(pfn, prot));
        if (pfn_t_devmap(pfn))
                entry = pud_mkdevmap(entry);
        if (write) {
                entry = pud_mkyoung(pud_mkdirty(entry));
                entry = maybe_pud_mkwrite(entry, vma);
        }
        set_pud_at(mm, addr, pud, entry);
        update_mmu_cache_pud(vma, addr, pud);

out_unlock:
        spin_unlock(ptl);
}

/**
 * vmf_insert_pfn_pud - insert a pud size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{
        unsigned long addr = vmf->address & PUD_MASK;
        struct vm_area_struct *vma = vmf->vma;
        pgprot_t pgprot = vma->vm_page_prot;

        /*
         * If we had pud_special, we could avoid all these restrictions,
         * but we need to be consistent with PTEs and architectures that
         * can't support a 'special' bit.
         */
        BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
                        !pfn_t_devmap(pfn));
        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
                                                (VM_PFNMAP|VM_MIXEDMAP));
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        track_pfn_insert(vma, &pgprot, pfn);

        insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
        return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

/*
 * Mark the huge PMD as accessed (and dirty too when @write is set), and
 * refresh the MMU cache if the access flags actually changed.
 */
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
               pmd_t *pmd, bool write)
{
        pmd_t touched = pmd_mkyoung(*pmd);

        if (write)
                touched = pmd_mkdirty(touched);

        if (!pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
                                   pmd, touched, write))
                return;

        update_mmu_cache_pmd(vma, addr, pmd);
}

/*
 * Look up the page backing @addr inside a devmap huge PMD.
 *
 * Must be called with the PMD lock held.  Returns NULL when the PMD is not
 * a present devmap entry or when FOLL_WRITE is requested on a non-writable
 * PMD, ERR_PTR(-EEXIST) when the caller does not take a reference
 * (neither FOLL_GET nor FOLL_PIN), ERR_PTR(-EFAULT) when no dev_pagemap
 * covers the pfn, an ERR_PTR from try_grab_page() on failure, and the page
 * otherwise.  *@pgmap is updated with the result of get_dev_pagemap().
 */
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
                pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long pfn = pmd_pfn(*pmd);
        struct page *page;
        int err;

        assert_spin_locked(pmd_lockptr(mm, pmd));

        if ((flags & FOLL_WRITE) && !pmd_write(*pmd))
                return NULL;

        if (!pmd_present(*pmd) || !pmd_devmap(*pmd))
                return NULL;

        if (flags & FOLL_TOUCH)
                touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);

        /*
         * Device mapped pages may only be handed out to callers that
         * manage the page reference count themselves.
         */
        if (!(flags & (FOLL_GET | FOLL_PIN)))
                return ERR_PTR(-EEXIST);

        /* Step from the head pfn to the subpage that @addr refers to. */
        pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
        *pgmap = get_dev_pagemap(pfn, *pgmap);
        if (!*pgmap)
                return ERR_PTR(-EFAULT);

        page = pfn_to_page(pfn);
        err = try_grab_page(page, flags);
        if (err)
                return ERR_PTR(err);

        return page;
}

/*
 * Copy the huge PMD at @src_pmd (in @src_mm) to @dst_pmd (in @dst_mm) for
 * address @addr.
 *
 * Destination VMAs that are not anonymous are skipped entirely.  For the
 * rest a page table is preallocated and deposited alongside the new PMD,
 * and both source and destination end up write-protected.
 *
 * Return: 0 on success or when nothing has to be copied, -ENOMEM when the
 * page table to deposit cannot be allocated, -EAGAIN when the source is no
 * longer a huge PMD or when it had to be split because the anon rmap could
 * not be duplicated.
 */
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
        spinlock_t *dst_ptl, *src_ptl;
        struct page *src_page;
        struct folio *src_folio;
        pmd_t pmd;
        pgtable_t pgtable = NULL;
        int ret = -ENOMEM;

        /* Skip if can be re-fill on fault */
        if (!vma_is_anonymous(dst_vma))
                return 0;

        /* Allocate before taking any lock; ret is still -ENOMEM here. */
        pgtable = pte_alloc_one(dst_mm);
        if (unlikely(!pgtable))
                goto out;

        /* Destination lock first, then the source lock nested inside it. */
        dst_ptl = pmd_lock(dst_mm, dst_pmd);
        src_ptl = pmd_lockptr(src_mm, src_pmd);
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

        ret = -EAGAIN;
        pmd = *src_pmd;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
        if (unlikely(is_swap_pmd(pmd))) {
                swp_entry_t entry = pmd_to_swp_entry(pmd);

                VM_BUG_ON(!is_pmd_migration_entry(pmd));
                /*
                 * Turn a non-readable migration entry in the source into
                 * a readable one, carrying over the soft-dirty and
                 * uffd-wp bits of the original source entry.
                 */
                if (!is_readable_migration_entry(entry)) {
                        entry = make_readable_migration_entry(
                                                        swp_offset(entry));
                        pmd = swp_entry_to_pmd(entry);
                        if (pmd_swp_soft_dirty(*src_pmd))
                                pmd = pmd_swp_mksoft_dirty(pmd);
                        if (pmd_swp_uffd_wp(*src_pmd))
                                pmd = pmd_swp_mkuffd_wp(pmd);
                        set_pmd_at(src_mm, addr, src_pmd, pmd);
                }
                add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
                mm_inc_nr_ptes(dst_mm);
                pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
                /* The uffd-wp bit is only kept if dst_vma has uffd-wp. */
                if (!userfaultfd_wp(dst_vma))
                        pmd = pmd_swp_clear_uffd_wp(pmd);
                set_pmd_at(dst_mm, addr, dst_pmd, pmd);
                ret = 0;
                goto out_unlock;
        }
#endif

        /* Not a huge PMD: release the unused table, return -EAGAIN. */
        if (unlikely(!pmd_trans_huge(pmd))) {
                pte_free(dst_mm, pgtable);
                goto out_unlock;
        }
        /*
         * When page table lock is held, the huge zero pmd should not be
         * under splitting since we don't split the page itself, only pmd to
         * a page table.
         */
        if (is_huge_zero_pmd(pmd)) {
                /*
                 * mm_get_huge_zero_folio() will never allocate a new
                 * folio here, since we already have a zero page to
                 * copy. It just takes a reference.
                 */
                mm_get_huge_zero_folio(dst_mm);
                goto out_zero_page;
        }

        src_page = pmd_page(pmd);
        VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
        src_folio = page_folio(src_page);

        folio_get(src_folio);
        if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) {
                /* Page maybe pinned: split and retry the fault on PTEs. */
                folio_put(src_folio);
                pte_free(dst_mm, pgtable);
                /* Both locks are dropped before the source PMD is split. */
                spin_unlock(src_ptl);
                spin_unlock(dst_ptl);
                __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
                return -EAGAIN;
        }
        add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
out_zero_page:
        /* Shared tail for the huge zero PMD and the regular folio case. */
        mm_inc_nr_ptes(dst_mm);
        pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
        /* Write-protect the source, then install a wrprotected, old copy. */
        pmdp_set_wrprotect(src_mm, addr, src_pmd);
        if (!userfaultfd_wp(dst_vma))
                pmd = pmd_clear_uffd_wp(pmd);
        pmd = pmd_mkold(pmd_wrprotect(pmd));
        set_pmd_at(dst_mm, addr, dst_pmd, pmd);

        ret = 0;
out_unlock:
        spin_unlock(src_ptl);
        spin_unlock(dst_ptl);
out:
        return ret;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * Mark the huge PUD as accessed (and dirty too when @write is set), and
 * refresh the MMU cache if the access flags actually changed.
 */
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
               pud_t *pud, bool write)
{
        pud_t touched = pud_mkyoung(*pud);

        if (write)
                touched = pud_mkdirty(touched);

        if (!pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
                                   pud, touched, write))
                return;

        update_mmu_cache_pud(vma, addr, pud);
}

/*
 * Copy the huge PUD at @src_pud (in @src_mm) to @dst_pud (in @dst_mm).
 *
 * The source is write-protected and the destination receives a
 * write-protected, old copy.  Returns 0 on success, or -EAGAIN when the
 * source is neither a trans-huge nor a devmap PUD once the locks are held.
 */
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
                  struct vm_area_struct *vma)
{
        spinlock_t *dst_ptl, *src_ptl;
        int ret = -EAGAIN;
        pud_t pud;

        /* Destination lock first, then the source lock nested inside it. */
        dst_ptl = pud_lock(dst_mm, dst_pud);
        src_ptl = pud_lockptr(src_mm, src_pud);
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

        pud = *src_pud;
        if (likely(pud_trans_huge(pud) || pud_devmap(pud))) {
                /*
                 * With the page table lock held, a huge zero pud cannot be
                 * under splitting: only the pud is ever split into a page
                 * table, never the page itself.
                 */
                if (is_huge_zero_pud(pud)) {
                        /* No huge zero pud yet */
                }

                /*
                 * TODO: once we support anonymous pages, use
                 * folio_try_dup_anon_rmap_*() and split if duplicating
                 * fails.
                 */
                pudp_set_wrprotect(src_mm, addr, src_pud);
                pud = pud_mkold(pud_wrprotect(pud));
                set_pud_at(dst_mm, addr, dst_pud, pud);

                ret = 0;
        }

        spin_unlock(src_ptl);
        spin_unlock(dst_ptl);
        return ret;
}

/*
 * Update the accessed (and, for a write fault, dirty) state of the huge
 * PUD, provided it still equals @orig_pud once the PUD lock is held.
 */
void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
        const bool is_write = vmf->flags & FAULT_FLAG_WRITE;

        vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);

        /* Leave the entry alone if it changed before we got the lock. */
        if (likely(pud_same(*vmf->pud, orig_pud)))
                touch_pud(vmf->vma, vmf->address, vmf->pud, is_write);

        spin_unlock(vmf->ptl);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

/*
 * Update the accessed (and, for a write fault, dirty) state of the huge
 * PMD, provided it still equals vmf->orig_pmd once the PMD lock is held.
 */
void huge_pmd_set_accessed(struct vm_fault *vmf)
{
        const bool is_write = vmf->flags & FAULT_FLAG_WRITE;

        vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);

        /* Leave the entry alone if it changed before we got the lock. */
        if (likely(pmd_same(*vmf->pmd, vmf->orig_pmd)))
                touch_pmd(vmf->vma, vmf->address, vmf->pmd, is_write);

        spin_unlock(vmf->ptl);
}

/*
 * Resolve a write-protect (or FAULT_FLAG_UNSHARE) fault on a huge PMD.
 *
 * If the folio is, or can be made, exclusive to this mapping, the PMD is
 * reused in place: for a write fault it is made young, dirty and (if the
 * VMA allows) writable; for an unshare fault nothing further is changed.
 * Otherwise the PMD is split so the caller can fall back.
 *
 * Return: 0 when the fault was handled or the PMD changed underneath us,
 * VM_FAULT_FALLBACK after the huge PMD has been split.
 */
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio;
        struct page *page;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        pmd_t orig_pmd = vmf->orig_pmd;

        /* Only looks up the lock; it is taken further down. */
        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
        VM_BUG_ON_VMA(!vma->anon_vma, vma);

        /* The huge zero PMD is never reused: split it (lock not held). */
        if (is_huge_zero_pmd(orig_pmd))
                goto fallback;

        spin_lock(vmf->ptl);

        /* The PMD changed since the fault was taken: nothing to do. */
        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
                spin_unlock(vmf->ptl);
                return 0;
        }

        page = pmd_page(orig_pmd);
        folio = page_folio(page);
        VM_BUG_ON_PAGE(!PageHead(page), page);

        /* Early check when only holding the PT lock. */
        if (PageAnonExclusive(page))
                goto reuse;

        /*
         * The folio lock cannot be slept on under the PT lock: pin the
         * folio, drop the PT lock, take the folio lock, then retake the
         * PT lock and revalidate the PMD.
         */
        if (!folio_trylock(folio)) {
                folio_get(folio);
                spin_unlock(vmf->ptl);
                folio_lock(folio);
                spin_lock(vmf->ptl);
                if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
                        spin_unlock(vmf->ptl);
                        folio_unlock(folio);
                        folio_put(folio);
                        return 0;
                }
                folio_put(folio);
        }

        /* Recheck after temporarily dropping the PT lock. */
        if (PageAnonExclusive(page)) {
                folio_unlock(folio);
                goto reuse;
        }

        /*
         * See do_wp_page(): we can only reuse the folio exclusively if
         * there are no additional references. Note that we always drain
         * the LRU cache immediately after adding a THP.
         */
        if (folio_ref_count(folio) >
                        1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
                goto unlock_fallback;
        if (folio_test_swapcache(folio))
                folio_free_swap(folio);
        if (folio_ref_count(folio) == 1) {
                pmd_t entry;

                /* Sole reference: claim the folio as exclusive. */
                folio_move_anon_rmap(folio, vma);
                SetPageAnonExclusive(page);
                folio_unlock(folio);
reuse:
                /*
                 * Reached with the PT lock held and the folio lock not
                 * held (also via the gotos above).
                 */
                if (unlikely(unshare)) {
                        spin_unlock(vmf->ptl);
                        return 0;
                }
                entry = pmd_mkyoung(orig_pmd);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
                        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                spin_unlock(vmf->ptl);
                return 0;
        }

unlock_fallback:
        /* Entered with both the folio lock and the PT lock held. */
        folio_unlock(folio);
        spin_unlock(vmf->ptl);
fallback:
        /* Entered with no locks held. */
        __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
        return VM_FAULT_FALLBACK;
}

/*
 * Decide whether a huge PMD in a VM_WRITE VMA may be mapped writable right
 * away instead of waiting for a write fault.
 *
 * Returns false (with a one-time warning) if the VMA lacks VM_WRITE, and
 * false whenever a write fault is still needed for NUMA hinting,
 * soft-dirty tracking or uffd-wp tracking.  For private mappings the page
 * must be an exclusive anonymous page; for shared mappings the PMD must
 * already be dirty.
 */
static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
                                           unsigned long addr, pmd_t pmd)
{
        struct page *page;

        if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
                return false;

        /*
         * Keep write faults for: entries that are not even readable (NUMA
         * hinting), soft-dirty tracking, and uffd-wp tracking.
         */
        if (pmd_protnone(pmd) ||
            (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) ||
            userfaultfd_huge_pmd_wp(vma, pmd))
                return false;

        /* See can_change_pte_writable(). */
        if (vma->vm_flags & VM_SHARED)
                return pmd_dirty(pmd);

        /* See can_change_pte_writable(). */
        page = vm_normal_page_pmd(vma, addr, pmd);
        if (!page || !PageAnon(page))
                return false;

        return PageAnonExclusive(page);
}

/*
 * NUMA hinting page fault entry point for trans huge pmds.
 *
 * Under the PMD lock the entry is revalidated against vmf->orig_pmd.  The
 * folio is then either migrated to the node chosen by numa_migrate_prep(),
 * or the PMD is restored with the VMA's protection (young, and writable
 * again where that was determined to be allowed).  The fault is reported
 * through task_numa_fault() whenever a node id was determined.
 *
 * Always returns 0.
 */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        pmd_t oldpmd = vmf->orig_pmd;
        pmd_t pmd;
        struct folio *folio;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        int nid = NUMA_NO_NODE;
        int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
        bool migrated = false, writable = false;
        int flags = 0;

        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        /* The PMD changed since the fault: bail out (nid is still unset). */
        if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
                spin_unlock(vmf->ptl);
                goto out;
        }

        pmd = pmd_modify(oldpmd, vma->vm_page_prot);

        /*
         * Detect now whether the PMD could be writable; this information
         * is only valid while holding the PT lock.
         */
        writable = pmd_write(pmd);
        if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
            can_change_pmd_writable(vma, vmf->address, pmd))
                writable = true;

        folio = vm_normal_folio_pmd(vma, haddr, pmd);
        if (!folio)
                goto out_map;

        /* See similar comment in do_numa_page for explanation */
        if (!writable)
                flags |= TNF_NO_GROUP;

        nid = folio_nid(folio);
        /*
         * For memory tiering mode, cpupid of slow memory page is used
         * to record page access time.  So use default value.
         */
        if (node_is_toptier(nid))
                last_cpupid = folio_last_cpupid(folio);
        target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
        if (target_nid == NUMA_NO_NODE) {
                /*
                 * No migration wanted.  NOTE(review): the folio_put()
                 * presumably pairs with a reference taken inside
                 * numa_migrate_prep() - confirm.
                 */
                folio_put(folio);
                goto out_map;
        }

        /*
         * Migration runs without the PT lock, so the writability detected
         * above can no longer be trusted.
         */
        spin_unlock(vmf->ptl);
        writable = false;

        migrated = migrate_misplaced_folio(folio, vma, target_nid);
        if (migrated) {
                flags |= TNF_MIGRATED;
                nid = target_nid;
        } else {
                flags |= TNF_MIGRATE_FAIL;
                /* Retake the lock and restore the PMD if it is unchanged. */
                vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
                if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
                        spin_unlock(vmf->ptl);
                        goto out;
                }
                goto out_map;
        }

out:
        /* Reached with the PT lock released. */
        if (nid != NUMA_NO_NODE)
                task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);

        return 0;

out_map:
        /* Reached with the PT lock held; it is dropped before 'goto out'. */
        /* Restore the PMD */
        pmd = pmd_modify(oldpmd, vma->vm_page_prot);
        pmd = pmd_mkyoung(pmd);
        if (writable)
                pmd = pmd_mkwrite(pmd, vma);
        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
        spin_unlock(vmf->ptl);
        goto out;
}

/*
 * Apply MADV_FREE to the huge PMD covering [@addr, @next).
 *
 * Return true if we do MADV_FREE successfully on entire pmd page.
 * Otherwise, return false: the PMD could not be locked, it maps the huge
 * zero page or is not present, the folio is likely mapped by others or
 * could not be locked, or only part of the huge page was requested (in
 * which case the folio is split instead).
 */
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                pmd_t *pmd, unsigned long addr, unsigned long next)
{
        spinlock_t *ptl;
        pmd_t orig_pmd;
        struct folio *folio;
        struct mm_struct *mm = tlb->mm;
        bool ret = false;

        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

        /* A NULL lock means there is no huge PMD to operate on here. */
        ptl = pmd_trans_huge_lock(pmd, vma);
        if (!ptl)
                goto out_unlocked;

        orig_pmd = *pmd;
        if (is_huge_zero_pmd(orig_pmd))
                goto out;

        if (unlikely(!pmd_present(orig_pmd))) {
                VM_BUG_ON(thp_migration_supported() &&
                                  !is_pmd_migration_entry(orig_pmd));
                goto out;
        }

        folio = pmd_folio(orig_pmd);
        /*
         * If other processes are mapping this folio, we couldn't discard
         * the folio unless they all do MADV_FREE so let's skip the folio.
         */
        if (folio_likely_mapped_shared(folio))
                goto out;

        /* Never wait for the folio lock while holding the PMD lock. */
        if (!folio_trylock(folio))
                goto out;

        /*
         * If user want to discard part-pages of THP, split it so MADV_FREE
         * will deactivate only them.
         */
        if (next - addr != HPAGE_PMD_SIZE) {
                /* Hold a reference across the split; the PMD lock is dropped. */
                folio_get(folio);
                spin_unlock(ptl);
                split_folio(folio);
                folio_unlock(folio);
                folio_put(folio);
                goto out_unlocked;
        }

        if (folio_test_dirty(folio))
                folio_clear_dirty(folio);
        folio_unlock(folio);

        /* Clear young/dirty in the PMD so later accesses are noticed. */
        if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
                pmdp_invalidate(vma, addr, pmd);
                orig_pmd = pmd_mkold(orig_pmd);
                orig_pmd = pmd_mkclean(orig_pmd);

                set_pmd_at(mm, addr, pmd, orig_pmd);
                tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
        }

        folio_mark_lazyfree(folio);
        ret = true;
out:
        spin_unlock(ptl);
out_unlocked:
        return ret;
}

/*
 * Withdraw the page table deposited for @pmd, free it, and decrement the
 * mm's page table count accordingly.
 */
static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
{
        pgtable_t deposited = pgtable_trans_huge_withdraw(mm, pmd);

        pte_free(mm, deposited);
        mm_dec_nr_ptes(mm);
}

/*
 * Tear down the huge PMD mapping at @addr.
 *
 * The PMD is cleared under the PMD lock, the deposited page table (if
 * any) is released, rmap and mm counters are updated, and for present
 * entries the folio is queued on @tlb for freeing.
 *
 * Return: 0 if the huge PMD could not be locked, 1 once it has been zapped.
 */
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pmd_t *pmd, unsigned long addr)
{
        pmd_t orig_pmd;
        spinlock_t *ptl;

        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

        ptl = __pmd_trans_huge_lock(pmd, vma);
        if (!ptl)
                return 0;
        /*
         * For architectures like ppc64 we look at deposited pgtable
         * when calling pmdp_huge_get_and_clear. So do the
         * pgtable_trans_huge_withdraw after finishing pmdp related
         * operations.
         */
        orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
                                                tlb->fullmm);
        arch_check_zapped_pmd(vma, orig_pmd);
        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
        if (vma_is_special_huge(vma)) {
                /* Special mappings deposit a table only if the arch needs it. */
                if (arch_needs_pgtable_deposit())
                        zap_deposited_table(tlb->mm, pmd);
                spin_unlock(ptl);
        } else if (is_huge_zero_pmd(orig_pmd)) {
                zap_deposited_table(tlb->mm, pmd);
                spin_unlock(ptl);
        } else {
                struct folio *folio = NULL;
                int flush_needed = 1;

                if (pmd_present(orig_pmd)) {
                        struct page *page = pmd_page(orig_pmd);

                        folio = page_folio(page);
                        folio_remove_rmap_pmd(folio, page, vma);
                        WARN_ON_ONCE(folio_mapcount(folio) < 0);
                        VM_BUG_ON_PAGE(!PageHead(page), page);
                } else if (thp_migration_supported()) {
                        swp_entry_t entry;

                        /* Migration entry: no page is handed to the tlb. */
                        VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
                        entry = pmd_to_swp_entry(orig_pmd);
                        folio = pfn_swap_entry_folio(entry);
                        flush_needed = 0;
                } else
                        WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");

                /*
                 * NOTE(review): if the WARN_ONCE() branch above was taken,
                 * folio is still NULL here and the accesses below would
                 * presumably dereference it - confirm whether that path is
                 * reachable.
                 */
                if (folio_test_anon(folio)) {
                        zap_deposited_table(tlb->mm, pmd);
                        add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
                } else {
                        if (arch_needs_pgtable_deposit())
                                zap_deposited_table(tlb->mm, pmd);
                        add_mm_counter(tlb->mm, mm_counter_file(folio),
                                       -HPAGE_PMD_NR);
                }

                /* The page is queued on the tlb after the lock is dropped. */
                spin_unlock(ptl);
                if (flush_needed)
                        tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
        }
        return 1;
}

#ifndef pmd_move_must_withdraw
/*
 * Tell whether moving a huge PMD also requires moving its deposited page
 * table.  Returns non-zero only when the old and new PMD use different
 * locks and the VMA is anonymous.
 */
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
                                         spinlock_t *old_pmd_ptl,
                                         struct vm_area_struct *vma)
{
        /*
         * With split pmd locks, a different lock means new_pmd lives on a
         * different PMD page table, so the preallocated PTE page table has
         * to follow it.
         */
        if (new_pmd_ptl == old_pmd_ptl)
                return 0;

        /* Tables are neither deposited nor withdrawn for file pages. */
        return vma_is_anonymous(vma);
}
#endif

/*
 * Return @pmd with its soft-dirty bit set, using the swap variant for
 * migration entries and the regular one for present entries.  Without
 * CONFIG_MEM_SOFT_DIRTY, and for any other kind of entry, @pmd is returned
 * unchanged.
 */
static pmd_t move_soft_dirty_pmd(pmd_t pmd)
{
#ifdef CONFIG_MEM_SOFT_DIRTY
        if (unlikely(is_pmd_migration_entry(pmd)))
                return pmd_swp_mksoft_dirty(pmd);
        if (pmd_present(pmd))
                return pmd_mksoft_dirty(pmd);
#endif
        return pmd;
}

/*
 * Move the huge PMD at @old_pmd/@old_addr to @new_pmd/@new_addr within the
 * same VMA's mm.
 *
 * Returns true if the entry was moved, false if the destination is already
 * populated or the source could not be locked as a huge PMD.
 */
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *old_ptl, *new_ptl;
        bool force_flush = false;
        pmd_t pmd;

        /*
         * The destination pmd shouldn't be established, free_pgtables()
         * should have released it; but move_page_tables() might have already
         * inserted a page table, if racing against shmem/file collapse.
         */
        if (!pmd_none(*new_pmd)) {
                VM_BUG_ON(pmd_trans_huge(*new_pmd));
                return false;
        }

        /*
         * We don't have to worry about the ordering of src and dst
         * ptlocks because exclusive mmap_lock prevents deadlock.
         */
        old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
        if (!old_ptl)
                return false;

        new_ptl = pmd_lockptr(mm, new_pmd);
        if (new_ptl != old_ptl)
                spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

        pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
        /* Only a present entry needs its old range flushed afterwards. */
        if (pmd_present(pmd))
                force_flush = true;
        VM_BUG_ON(!pmd_none(*new_pmd));

        /* Carry the deposited page table over when that is required. */
        if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
                pgtable_t deposited;

                deposited = pgtable_trans_huge_withdraw(mm, old_pmd);
                pgtable_trans_huge_deposit(mm, new_pmd, deposited);
        }

        pmd = move_soft_dirty_pmd(pmd);
        set_pmd_at(mm, new_addr, new_pmd, pmd);
        if (force_flush)
                flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);

        if (new_ptl != old_ptl)
                spin_unlock(new_ptl);
        spin_unlock(old_ptl);

        return true;
}

/*
 * Change the protection of the huge PMD at @addr to @newprot, honouring
 * the MM_CP_* bits in @cp_flags (NUMA hinting, uffd-wp set/resolve, and
 * the opportunistic write upgrade).
 *
 * Returns
 *  - 0 if PMD could not be locked
 *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
 *      or if prot_numa but THP migration is not supported
 *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
 */
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
                    unsigned long cp_flags)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
        pmd_t oldpmd, entry;
        bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
        int ret = 1;

        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

        /* NUMA hinting on a huge PMD is pointless if it cannot migrate. */
        if (prot_numa && !thp_migration_supported())
                return 1;

        ptl = __pmd_trans_huge_lock(pmd, vma);
        if (!ptl)
                return 0;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
        /* Migration entries are rewritten in place; ret stays 1. */
        if (is_swap_pmd(*pmd)) {
                swp_entry_t entry = pmd_to_swp_entry(*pmd);
                struct folio *folio = pfn_swap_entry_folio(entry);
                pmd_t newpmd;

                VM_BUG_ON(!is_pmd_migration_entry(*pmd));
                if (is_writable_migration_entry(entry)) {
                        /*
                         * A protection check is difficult so
                         * just be safe and disable write
                         */
                        if (folio_test_anon(folio))
                                entry = make_readable_exclusive_migration_entry(swp_offset(entry));
                        else
                                entry = make_readable_migration_entry(swp_offset(entry));
                        newpmd = swp_entry_to_pmd(entry);
                        /* Carry the soft-dirty bit over to the new entry. */
                        if (pmd_swp_soft_dirty(*pmd))
                                newpmd = pmd_swp_mksoft_dirty(newpmd);
                } else {
                        newpmd = *pmd;
                }

                if (uffd_wp)
                        newpmd = pmd_swp_mkuffd_wp(newpmd);
                else if (uffd_wp_resolve)
                        newpmd = pmd_swp_clear_uffd_wp(newpmd);
                /* Only write the entry back if something changed. */
                if (!pmd_same(*pmd, newpmd))
                        set_pmd_at(mm, addr, pmd, newpmd);
                goto unlock;
        }
#endif

        if (prot_numa) {
                struct folio *folio;
                bool toptier;
                /*
                 * Avoid trapping faults against the zero page. The read-only
                 * data is likely to be read-cached on the local CPU and
                 * local/remote hits to the zero page are not interesting.
                 */
                if (is_huge_zero_pmd(*pmd))
                        goto unlock;

                /* Already a NUMA hinting entry: nothing to change. */
                if (pmd_protnone(*pmd))
                        goto unlock;

                folio = pmd_folio(*pmd);
                toptier = node_is_toptier(folio_nid(folio));
                /*
                 * Skip scanning top tier node if normal numa
                 * balancing is disabled
                 */
                if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
                    toptier)
                        goto unlock;

                /* In tiering mode, stamp the access time on slow-tier folios. */
                if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
                    !toptier)
                        folio_xchg_access_time(folio,
                                               jiffies_to_msecs(jiffies));
        }
        /*
         * In case prot_numa, we are under mmap_read_lock(mm). It's critical
         * to not clear pmd intermittently to avoid race with MADV_DONTNEED
         * which is also under mmap_read_lock(mm):
         *
         *        CPU0:                                CPU1:
         *                                change_huge_pmd(prot_numa=1)
         *                                 pmdp_huge_get_and_clear_notify()
         * madvise_dontneed()
         *  zap_pmd_range()
         *   pmd_trans_huge(*pmd) == 0 (without ptl)
         *   // skip the pmd
         *                                 set_pmd_at();
         *                                 // pmd is re-established
         *
         * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
         * which may break userspace.
         *
         * pmdp_invalidate_ad() is required to make sure we don't miss
         * dirty/young flags set by hardware.
         */
        oldpmd = pmdp_invalidate_ad(vma, addr, pmd);

        entry = pmd_modify(oldpmd, newprot);
        if (uffd_wp)
                entry = pmd_mkuffd_wp(entry);
        else if (uffd_wp_resolve)
                /*
                 * Leave the write bit to be handled by PF interrupt
                 * handler, then things like COW could be properly
                 * handled.
                 */
                entry = pmd_clear_uffd_wp(entry);

        /* See change_pte_range(). */
        if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
            can_change_pmd_writable(vma, addr, entry))
                entry = pmd_mkwrite(entry, vma);

        ret = HPAGE_PMD_NR;
        set_pmd_at(mm, addr, pmd, entry);

        /* Queue a TLB flush only when old and new entries require one. */
        if (huge_pmd_needs_flush(oldpmd, entry))
                tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
unlock:
        spin_unlock(ptl);
        return ret;
}

#ifdef CONFIG_USERFAULTFD
/*
 * move_pages_huge_pmd - move a PMD-mapped huge page from src_pmd to dst_pmd.
 *
 * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by
 * the caller, but this function must return after releasing the
 * page_table_lock: every return path below drops src_ptl.
 * Just move the page from src_pmd to dst_pmd if possible.
 *
 * Return: zero if the page was moved; -EAGAIN if the operation needs to be
 * repeated by the caller (source is a migration entry, the anon_vma could
 * not be obtained, or either PMD changed while unlocked); -EINVAL if dst_pmd
 * is not empty or an address is not huge-page aligned; -ENOENT if src_pmd
 * does not map a huge page; -EBUSY if the page is not exclusively owned or
 * may be DMA-pinned.
 */
int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
                        struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                        unsigned long dst_addr, unsigned long src_addr)
{
        pmd_t _dst_pmd, src_pmdval;
        struct page *src_page;
        struct folio *src_folio;
        struct anon_vma *src_anon_vma;
        spinlock_t *src_ptl, *dst_ptl;
        pgtable_t src_pgtable;
        struct mmu_notifier_range range;
        int err = 0;

        /* Snapshot the source entry; it is re-validated under both locks below. */
        src_pmdval = *src_pmd;
        src_ptl = pmd_lockptr(mm, src_pmd);

        lockdep_assert_held(src_ptl);
        vma_assert_locked(src_vma);
        vma_assert_locked(dst_vma);

        /* Sanity checks before the operation */
        if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
            WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
                spin_unlock(src_ptl);
                return -EINVAL;
        }

        if (!pmd_trans_huge(src_pmdval)) {
                spin_unlock(src_ptl);
                if (is_pmd_migration_entry(src_pmdval)) {
                        /*
                         * Wait on the real page-table entry, not on the local
                         * snapshot: the PMD lock is looked up from the address
                         * of the entry (see pmd_lockptr() above), so the
                         * address of a stack variable must not be passed here.
                         */
                        pmd_migration_entry_wait(mm, src_pmd);
                        return -EAGAIN;
                }
                return -ENOENT;
        }

        src_page = pmd_page(src_pmdval);

        if (!is_huge_zero_pmd(src_pmdval)) {
                if (unlikely(!PageAnonExclusive(src_page))) {
                        spin_unlock(src_ptl);
                        return -EBUSY;
                }

                /* Hold a reference so the folio survives dropping src_ptl. */
                src_folio = page_folio(src_page);
                folio_get(src_folio);
        } else
                src_folio = NULL;

        spin_unlock(src_ptl);

        flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
                                src_addr + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        if (src_folio) {
                folio_lock(src_folio);

                /*
                 * split_huge_page walks the anon_vma chain without the page
                 * lock. Serialize against it with the anon_vma lock, the page
                 * lock is not enough.
                 */
                src_anon_vma = folio_get_anon_vma(src_folio);
                if (!src_anon_vma) {
                        err = -EAGAIN;
                        goto unlock_folio;
                }
                anon_vma_lock_write(src_anon_vma);
        } else
                src_anon_vma = NULL;

        dst_ptl = pmd_lockptr(mm, dst_pmd);
        double_pt_lock(src_ptl, dst_ptl);
        /* Both entries were read without dst_ptl/after dropping src_ptl: recheck. */
        if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
                     !pmd_same(*dst_pmd, dst_pmdval))) {
                err = -EAGAIN;
                goto unlock_ptls;
        }
        if (src_folio) {
                if (folio_maybe_dma_pinned(src_folio) ||
                    !PageAnonExclusive(&src_folio->page)) {
                        err = -EBUSY;
                        goto unlock_ptls;
                }

                if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
                    WARN_ON_ONCE(!folio_test_anon(src_folio))) {
                        err = -EBUSY;
                        goto unlock_ptls;
                }

                src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
                /* Folio got pinned from under us. Put it back and fail the move. */
                if (folio_maybe_dma_pinned(src_folio)) {
                        set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
                        err = -EBUSY;
                        goto unlock_ptls;
                }

                folio_move_anon_rmap(src_folio, dst_vma);
                src_folio->index = linear_page_index(dst_vma, dst_addr);

                _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
                /* Follow mremap() behavior and treat the entry dirty after the move */
                _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
        } else {
                /* Huge zero page: no folio state to move, just remap it. */
                src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
                _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
        }
        set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);

        /* The deposited page table travels with the huge mapping. */
        src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
        pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
unlock_ptls:
        double_pt_unlock(src_ptl, dst_ptl);
        if (src_anon_vma) {
                anon_vma_unlock_write(src_anon_vma);
                put_anon_vma(src_anon_vma);
        }
unlock_folio:
        /* unblock rmap walks */
        if (src_folio)
                folio_unlock(src_folio);
        mmu_notifier_invalidate_range_end(&range);
        if (src_folio)
                folio_put(src_folio);
        return err;
}
#endif /* CONFIG_USERFAULTFD */

/*
 * Take the page table lock for @pmd and keep it only if the entry is a swap
 * pmd, a transparent huge pmd or a devmap pmd.
 *
 * Returns the (still held) page table lock pointer in that case, so the
 * caller is responsible for unlocking it. Otherwise the lock is released
 * here and NULL is returned.
 */
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
        spinlock_t *pmd_ptl = pmd_lock(vma->vm_mm, pmd);

        if (unlikely(!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) &&
                     !pmd_devmap(*pmd))) {
                spin_unlock(pmd_ptl);
                return NULL;
        }

        return pmd_ptl;
}

/*
 * Take the page table lock for @pud and keep it only if the entry is a
 * transparent huge pud or a devmap pud.
 *
 * Returns the (still held) page table lock pointer in that case, so the
 * caller is responsible for unlocking it. Otherwise the lock is released
 * here and NULL is returned.
 */
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
        spinlock_t *pud_ptl = pud_lock(vma->vm_mm, pud);

        if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) {
                spin_unlock(pud_ptl);
                return NULL;
        }

        return pud_ptl;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * Clear the huge PUD entry at @addr and queue it on @tlb.
 *
 * Returns 0 if __pud_trans_huge_lock() did not hand back a lock (nothing was
 * zapped), 1 once the entry has been cleared.
 */
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pud_t *pud, unsigned long addr)
{
        spinlock_t *pud_ptl = __pud_trans_huge_lock(pud, vma);

        if (pud_ptl == NULL)
                return 0;

        pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
        tlb_remove_pud_tlb_entry(tlb, pud, addr);

        /* No support for anonymous PUD pages yet */
        if (!vma_is_special_huge(vma))
                BUG();

        /* No zero page support yet */
        spin_unlock(pud_ptl);
        return 1;
}

/*
 * Split the huge PUD at @haddr: account the event and clear + flush the
 * entry. @haddr must be PUD-aligned and the whole PUD range must lie inside
 * @vma. __split_huge_pud() calls this with the pud lock held.
 */
static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
                unsigned long haddr)
{
        VM_BUG_ON(!IS_ALIGNED(haddr, HPAGE_PUD_SIZE));
        VM_BUG_ON_VMA(haddr < vma->vm_start, vma);
        VM_BUG_ON_VMA(haddr + HPAGE_PUD_SIZE > vma->vm_end, vma);
        VM_BUG_ON(!(pud_trans_huge(*pud) || pud_devmap(*pud)));

        count_vm_event(THP_SPLIT_PUD);
        pudp_huge_clear_flush(vma, haddr, pud);
}

/*
 * Split the huge PUD covering @address, if @pud still is a transparent huge
 * or devmap pud once the pud lock is taken. The whole PUD-sized range is
 * bracketed by an MMU notifier invalidation.
 */
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
                unsigned long address)
{
        const unsigned long haddr = address & HPAGE_PUD_MASK;
        struct mmu_notifier_range range;
        spinlock_t *pud_ptl;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                haddr, haddr + HPAGE_PUD_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        pud_ptl = pud_lock(vma->vm_mm, pud);
        /* Recheck under the lock: the entry may no longer be huge. */
        if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
                __split_huge_pud_locked(vma, pud, range.start);
        spin_unlock(pud_ptl);

        mmu_notifier_invalidate_range_end(&range);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

/*
 * Replace a huge zero page PMD at @haddr by a page table whose HPAGE_PMD_NR
 * entries each map the (small) zero page.
 *
 * The PTE table is the one previously deposited for @pmd; it is filled in
 * through a temporary on-stack pmd (_pmd) and only installed into @pmd once
 * every PTE has been written. The uffd-wp bit of the old huge entry is
 * carried over to each PTE.
 */
static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
                unsigned long haddr, pmd_t *pmd)
{
        struct mm_struct *mm = vma->vm_mm;
        pgtable_t pgtable;
        pmd_t _pmd, old_pmd;
        unsigned long addr;
        pte_t *pte;
        int i;

        /*
         * Leave pmd empty until pte is filled. Note that it is fine to delay
         * notification until mmu_notifier_invalidate_range_end() as we are
         * replacing a zero pmd write protected page with a zero pte write
         * protected page.
         *
         * See Documentation/mm/mmu_notifier.rst
         */
        old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);

        /* Build the PTE table behind a private pmd that nobody else can see. */
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);

        pte = pte_offset_map(&_pmd, haddr);
        VM_BUG_ON(!pte);
        for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
                pte_t entry;

                entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
                entry = pte_mkspecial(entry);
                if (pmd_uffd_wp(old_pmd))
                        entry = pte_mkuffd_wp(entry);
                VM_BUG_ON(!pte_none(ptep_get(pte)));
                set_pte_at(mm, addr, pte, entry);
                pte++;
        }
        /* pte now points one past the last entry written; unmap via the last one. */
        pte_unmap(pte - 1);
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
}

/*
 * Split the huge PMD mapping at @haddr in place.
 *
 * __split_huge_pmd() calls this with the pmd lock held, after checking that
 * @pmd is a transparent huge, devmap or migration-entry pmd.
 *
 * - For a non-anonymous VMA the huge entry is simply cleared (and the rmap,
 *   reference and mm counter dropped); no PTE table is installed.
 * - A huge zero page pmd is handed to __split_huge_zero_page_pmd().
 * - Otherwise the deposited page table is filled with HPAGE_PMD_NR PTEs that
 *   carry over the write/young/dirty/soft-dirty/uffd-wp state of the old
 *   entry, and is then installed into @pmd.
 *
 * @freeze: write migration entries instead of present PTEs. It is dropped
 *          (set to false) below if the page's exclusive marker cannot be
 *          cleared via folio_try_share_anon_rmap_pmd().
 */
static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long haddr, bool freeze)
{
        struct mm_struct *mm = vma->vm_mm;
        struct folio *folio;
        struct page *page;
        pgtable_t pgtable;
        pmd_t old_pmd, _pmd;
        bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
        bool anon_exclusive = false, dirty = false;
        unsigned long addr;
        pte_t *pte;
        int i;

        VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
        VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
        VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
        VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
                                && !pmd_devmap(*pmd));

        count_vm_event(THP_SPLIT_PMD);

        if (!vma_is_anonymous(vma)) {
                old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
                /*
                 * We are going to unmap this huge page. So
                 * just go ahead and zap it
                 */
                if (arch_needs_pgtable_deposit())
                        zap_deposited_table(mm, pmd);
                if (vma_is_special_huge(vma))
                        return;
                if (unlikely(is_pmd_migration_entry(old_pmd))) {
                        swp_entry_t entry;

                        /* Not present: only the folio for the counter below is needed. */
                        entry = pmd_to_swp_entry(old_pmd);
                        folio = pfn_swap_entry_folio(entry);
                } else {
                        page = pmd_page(old_pmd);
                        folio = page_folio(page);
                        /* Transfer dirty/young from the entry before dropping the mapping. */
                        if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
                                folio_mark_dirty(folio);
                        if (!folio_test_referenced(folio) && pmd_young(old_pmd))
                                folio_set_referenced(folio);
                        folio_remove_rmap_pmd(folio, page, vma);
                        folio_put(folio);
                }
                add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
                return;
        }

        if (is_huge_zero_pmd(*pmd)) {
                /*
                 * FIXME: Do we want to invalidate secondary mmu by calling
                 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
                 * inside __split_huge_pmd() ?
                 *
                 * We are going from a zero huge page write protected to zero
                 * small page also write protected so it does not seem useful
                 * to invalidate secondary mmu at this time.
                 */
                return __split_huge_zero_page_pmd(vma, haddr, pmd);
        }

        pmd_migration = is_pmd_migration_entry(*pmd);
        if (unlikely(pmd_migration)) {
                swp_entry_t entry;

                /*
                 * The entry is a migration entry: decode all state from the
                 * swap entry. Note that folio is left unset on this path; it
                 * is only used below when !pmd_migration.
                 */
                old_pmd = *pmd;
                entry = pmd_to_swp_entry(old_pmd);
                page = pfn_swap_entry_to_page(entry);
                write = is_writable_migration_entry(entry);
                if (PageAnon(page))
                        anon_exclusive = is_readable_exclusive_migration_entry(entry);
                young = is_migration_entry_young(entry);
                dirty = is_migration_entry_dirty(entry);
                soft_dirty = pmd_swp_soft_dirty(old_pmd);
                uffd_wp = pmd_swp_uffd_wp(old_pmd);
        } else {
                /*
                 * Up to this point the pmd is present and huge and userland has
                 * the whole access to the hugepage during the split (which
                 * happens in place). If we overwrite the pmd with the not-huge
                 * version pointing to the pte here (which of course we could if
                 * all CPUs were bug free), userland could trigger a small page
                 * size TLB miss on the small sized TLB while the hugepage TLB
                 * entry is still established in the huge TLB. Some CPU doesn't
                 * like that. See
                 * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
                 * 383 on page 105. Intel should be safe but also warns that
                 * it's only safe if the permission and cache attributes of the
                 * two entries loaded in the two TLB is identical (which should
                 * be the case here). But it is generally safer to never allow
                 * small and huge TLB entries for the same virtual address to be
                 * loaded simultaneously. So instead of doing "pmd_populate();
                 * flush_pmd_tlb_range();" we first mark the current pmd
                 * notpresent (atomically because here the pmd_trans_huge must
                 * remain set at all times on the pmd until the split is
                 * complete for this pmd), then we flush the SMP TLB and finally
                 * we write the non-huge version of the pmd entry with
                 * pmd_populate.
                 */
                old_pmd = pmdp_invalidate(vma, haddr, pmd);
                page = pmd_page(old_pmd);
                folio = page_folio(page);
                if (pmd_dirty(old_pmd)) {
                        dirty = true;
                        folio_set_dirty(folio);
                }
                write = pmd_write(old_pmd);
                young = pmd_young(old_pmd);
                soft_dirty = pmd_soft_dirty(old_pmd);
                uffd_wp = pmd_uffd_wp(old_pmd);

                VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
                VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

                /*
                 * Without "freeze", we'll simply split the PMD, propagating the
                 * PageAnonExclusive() flag for each PTE by setting it for
                 * each subpage -- no need to (temporarily) clear.
                 *
                 * With "freeze" we want to replace mapped pages by
                 * migration entries right away. This is only possible if we
                 * managed to clear PageAnonExclusive() -- see
                 * set_pmd_migration_entry().
                 *
                 * In case we cannot clear PageAnonExclusive(), split the PMD
                 * only and let try_to_migrate_one() fail later.
                 *
                 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
                 */
                anon_exclusive = PageAnonExclusive(page);
                if (freeze && anon_exclusive &&
                    folio_try_share_anon_rmap_pmd(folio, page))
                        freeze = false;
                if (!freeze) {
                        rmap_t rmap_flags = RMAP_NONE;

                        /* One reference per PTE; the PMD mapping's own is reused. */
                        folio_ref_add(folio, HPAGE_PMD_NR - 1);
                        if (anon_exclusive)
                                rmap_flags |= RMAP_EXCLUSIVE;
                        folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
                                                 vma, haddr, rmap_flags);
                }
        }

        /*
         * Withdraw the table only after we mark the pmd entry invalid.
         * This is critical for some architectures (Power).
         */
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);

        pte = pte_offset_map(&_pmd, haddr);
        VM_BUG_ON(!pte);

        /*
         * Note that NUMA hinting access restrictions are not transferred to
         * avoid any possibility of altering permissions across VMAs.
         */
        if (freeze || pmd_migration) {
                /* Non-present result: one migration entry per subpage. */
                for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
                        pte_t entry;
                        swp_entry_t swp_entry;

                        if (write)
                                swp_entry = make_writable_migration_entry(
                                                        page_to_pfn(page + i));
                        else if (anon_exclusive)
                                swp_entry = make_readable_exclusive_migration_entry(
                                                        page_to_pfn(page + i));
                        else
                                swp_entry = make_readable_migration_entry(
                                                        page_to_pfn(page + i));
                        if (young)
                                swp_entry = make_migration_entry_young(swp_entry);
                        if (dirty)
                                swp_entry = make_migration_entry_dirty(swp_entry);
                        entry = swp_entry_to_pte(swp_entry);
                        if (soft_dirty)
                                entry = pte_swp_mksoft_dirty(entry);
                        if (uffd_wp)
                                entry = pte_swp_mkuffd_wp(entry);

                        VM_WARN_ON(!pte_none(ptep_get(pte + i)));
                        set_pte_at(mm, addr, pte + i, entry);
                }
        } else {
                pte_t entry;

                /* Present result: build one template PTE and batch-install it. */
                entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
                if (write)
                        entry = pte_mkwrite(entry, vma);
                if (!young)
                        entry = pte_mkold(entry);
                /* NOTE: this may set soft-dirty too on some archs */
                if (dirty)
                        entry = pte_mkdirty(entry);
                if (soft_dirty)
                        entry = pte_mksoft_dirty(entry);
                if (uffd_wp)
                        entry = pte_mkuffd_wp(entry);

                for (i = 0; i < HPAGE_PMD_NR; i++)
                        VM_WARN_ON(!pte_none(ptep_get(pte + i)));

                set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
        }
        pte_unmap(pte);

        /* A migration-entry pmd had no PMD rmap to remove. */
        if (!pmd_migration)
                folio_remove_rmap_pmd(folio, page, vma);
        if (freeze)
                put_page(page);

        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
}

/*
 * Split the huge PMD covering @address, under the pmd lock and bracketed by
 * an MMU notifier invalidation of the whole PMD-sized range.
 *
 * @freeze: install migration entries instead of present PTEs; requires
 *          @folio.
 * @folio:  if non-NULL, only split when @pmd currently maps this (locked)
 *          folio; otherwise the pmd is left untouched.
 *
 * Nothing is done if @pmd is no longer a transparent huge, devmap or
 * migration-entry pmd once the lock is held.
 */
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long address, bool freeze, struct folio *folio)
{
        spinlock_t *ptl;
        struct mmu_notifier_range range;
        bool pmd_migration;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                address & HPAGE_PMD_MASK,
                                (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);
        ptl = pmd_lock(vma->vm_mm, pmd);

        /*
         * If caller asks to setup a migration entry, we need a folio to check
         * pmd against. Otherwise we can end up replacing wrong folio.
         */
        VM_BUG_ON(freeze && !folio);
        VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));

        pmd_migration = is_pmd_migration_entry(*pmd);
        if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || pmd_migration) {
                /*
                 * Do not apply pmd_folio() to a migration entry: the pmd is
                 * not present there, so the lookup would be bogus. Treat a
                 * migration entry as "not the caller's folio" instead
                 * (NOTE(review): this relies on the caller's folio lock
                 * excluding a concurrent migration of that same folio).
                 */
                if (folio && (pmd_migration || folio != pmd_folio(*pmd)))
                        goto out;
                __split_huge_pmd_locked(vma, pmd, range.start, freeze);
        }

out:
        spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(&range);
}

/*
 * Look up the pmd for @address in @vma's mm and, if one is found, split it
 * via __split_huge_pmd(). @freeze and @folio are passed through unchanged.
 */
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
                bool freeze, struct folio *folio)
{
        pmd_t *found = mm_find_pmd(vma->vm_mm, address);

        if (found)
                __split_huge_pmd(vma, found, address, freeze, folio);
}

/*
 * Split the huge pmd around @address when that address is not huge-page
 * aligned and the surrounding huge-page-sized range lies entirely within
 * @vma, i.e. when the range could previously have contained a hugepage.
 */
static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
{
        unsigned long hstart, hend;

        /* An aligned address cannot fall inside a huge pmd mapping. */
        if (IS_ALIGNED(address, HPAGE_PMD_SIZE))
                return;

        hstart = ALIGN_DOWN(address, HPAGE_PMD_SIZE);
        hend = ALIGN(address, HPAGE_PMD_SIZE);
        if (range_in_vma(vma, hstart, hend))
                split_huge_pmd_address(vma, address, false, NULL);
}

/*
 * Split any huge pmd that would be cut by the new boundaries of @vma:
 * first at @start, then at @end, and - when @adjust_next is positive - at
 * the new start of the vma found at @vma->vm_end.
 */
void vma_adjust_trans_huge(struct vm_area_struct *vma,
                             unsigned long start,
                             unsigned long end,
                             long adjust_next)
{
        struct vm_area_struct *following;

        split_huge_pmd_if_needed(vma, start);
        split_huge_pmd_if_needed(vma, end);

        if (adjust_next <= 0)
                return;

        /* The next vma's vm_start is being moved too: check its new start. */
        following = find_vma(vma->vm_mm, vma->vm_end);
        split_huge_pmd_if_needed(following, following->vm_start + adjust_next);
}

static void unmap_folio(struct folio *folio)
{
        enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
                TTU_BATCH_FLUSH;

        VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

        if (folio_test_pmd_mappable(folio))
                ttu_flags |= TTU_SPLIT_HUGE_PMD;

        /*
         * Anon pages need migration entries to preserve them, but file
         * pages can simply be left unmapped, then faulted back on demand.
         * If that is ever changed (perhaps for mlock), update remap_page().
         */
        if (folio_test_anon(folio))
                try_to_migrate(folio, ttu_flags);
        else
                try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);

        try_to_unmap_flush();
}

/*
 * Remove the migration entries of @folio and of the folios following it,
 * until at least @nr pages have been covered. Does nothing for non-anon
 * folios.
 */
static void remap_page(struct folio *folio, unsigned long nr)
{
        int covered;

        /* If unmap_folio() uses try_to_migrate() on file, remove this check */
        if (!folio_test_anon(folio))
                return;

        remove_migration_ptes(folio, folio, true);
        covered = folio_nr_pages(folio);

        while (covered < nr) {
                folio = folio_next(folio);
                remove_migration_ptes(folio, folio, true);
                covered += folio_nr_pages(folio);
        }
}

/*
 * Queue a freshly split-off @tail page: on the caller-supplied @list if
 * there is one (taking a reference on @tail), otherwise on the LRU right
 * next to @head. The lruvec lru_lock must be held.
 */
static void lru_add_page_tail(struct page *head, struct page *tail,
                struct lruvec *lruvec, struct list_head *list)
{
        VM_BUG_ON_PAGE(!PageHead(head), head);
        VM_BUG_ON_PAGE(PageLRU(tail), head);
        lockdep_assert_held(&lruvec->lru_lock);

        if (list) {
                /* page reclaim is reclaiming a huge page */
                VM_WARN_ON(PageLRU(head));
                get_page(tail);
                list_add_tail(&tail->lru, list);
                return;
        }

        /* head is still on lru (and we have it frozen) */
        VM_WARN_ON(!PageLRU(head));
        if (!PageUnevictable(tail))
                list_add_tail(&tail->lru, &head->lru);
        else
                tail->mlock_count = 0;
        SetPageLRU(tail);
}

/*
 * Turn the page at index @tail of @folio into the first page of a new,
 * independent folio of order @new_order.
 *
 * Copies the relevant page flags, mapping, index and (for swapcache folios)
 * swap entry from the head, clears the tail marker, optionally re-forms a
 * compound page of @new_order, unfreezes the refcount and finally queues the
 * new page via lru_add_page_tail() (on @list, or on the LRU if @list is
 * NULL). The order of these steps matters; see the comments below.
 */
static void __split_huge_page_tail(struct folio *folio, int tail,
                struct lruvec *lruvec, struct list_head *list,
                unsigned int new_order)
{
        struct page *head = &folio->page;
        struct page *page_tail = head + tail;
        /*
         * Careful: new_folio is not a "real" folio before we cleared PageTail.
         * Don't pass it around before clear_compound_head().
         */
        struct folio *new_folio = (struct folio *)page_tail;

        VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);

        /*
         * Clone page flags before unfreezing refcount.
         *
         * After successful get_page_unless_zero() might follow flags change,
         * for example lock_page() which set PG_waiters.
         *
         * Note that for mapped sub-pages of an anonymous THP,
         * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
         * the migration entry instead from where remap_page() will restore it.
         * We can still have PG_anon_exclusive set on effectively unmapped and
         * unreferenced sub-pages of an anonymous THP: we can simply drop
         * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
         */
        page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
        page_tail->flags |= (head->flags &
                        ((1L << PG_referenced) |
                         (1L << PG_swapbacked) |
                         (1L << PG_swapcache) |
                         (1L << PG_mlocked) |
                         (1L << PG_uptodate) |
                         (1L << PG_active) |
                         (1L << PG_workingset) |
                         (1L << PG_locked) |
                         (1L << PG_unevictable) |
#ifdef CONFIG_ARCH_USES_PG_ARCH_X
                         (1L << PG_arch_2) |
                         (1L << PG_arch_3) |
#endif
                         (1L << PG_dirty) |
                         LRU_GEN_MASK | LRU_REFS_MASK));

        /* ->mapping in first and second tail page is replaced by other uses */
        VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
                        page_tail);
        page_tail->mapping = head->mapping;
        page_tail->index = head->index + tail;

        /*
         * page->private should not be set in tail pages. Fix up and warn once
         * if private is unexpectedly set.
         */
        if (unlikely(page_tail->private)) {
                VM_WARN_ON_ONCE_PAGE(true, page_tail);
                page_tail->private = 0;
        }
        /* Each new folio gets the swap entry offset by its position in the old one. */
        if (folio_test_swapcache(folio))
                new_folio->swap.val = folio->swap.val + tail;

        /* Page flags must be visible before we make the page non-compound. */
        smp_wmb();

        /*
         * Clear PageTail before unfreezing page refcount.
         *
         * After successful get_page_unless_zero() might follow put_page()
         * which needs correct compound_head().
         */
        clear_compound_head(page_tail);
        if (new_order) {
                prep_compound_page(page_tail, new_order);
                folio_set_large_rmappable(new_folio);
        }

        /* Finally unfreeze refcount. Additional reference from page cache. */
        page_ref_unfreeze(page_tail,
                1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ?
                             folio_nr_pages(new_folio) : 0));

        if (folio_test_young(folio))
                folio_set_young(new_folio);
        if (folio_test_idle(folio))
                folio_set_idle(new_folio);

        folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));

        /*
         * always add to the tail because some iterators expect new
         * pages to show after the currently processed elements - e.g.
         * migrate_pages
         */
        lru_add_page_tail(head, page_tail, lruvec, list);
}

/*
 * Do the actual split of the folio containing @page into folios of order
 * @new_order.
 *
 * @list: passed through to __split_huge_page_tail() for queueing the new
 *        folios.
 * @end:  new folios whose index is >= @end are beyond EOF and are dropped
 *        from the page cache instead of being stored back.
 *
 * Per the comments below, the folio's refcount is expected to be frozen and
 * interrupts disabled on entry; interrupts are re-enabled here. For
 * non-anonymous folios the mapping's i_pages lock is released here without
 * being taken in this function (NOTE(review): presumably taken by the
 * caller - confirm). Every resulting folio except the one containing @page
 * is unlocked and released before returning.
 */
static void __split_huge_page(struct page *page, struct list_head *list,
                pgoff_t end, unsigned int new_order)
{
        struct folio *folio = page_folio(page);
        struct page *head = &folio->page;
        struct lruvec *lruvec;
        struct address_space *swap_cache = NULL;
        unsigned long offset = 0;
        int i, nr_dropped = 0;
        unsigned int new_nr = 1 << new_order;
        int order = folio_order(folio);
        unsigned int nr = 1 << order;

        /* complete memcg works before add pages to LRU */
        split_page_memcg(head, order, new_order);

        if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
                offset = swp_offset(folio->swap);
                swap_cache = swap_address_space(folio->swap);
                xa_lock(&swap_cache->i_pages);
        }

        /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
        lruvec = folio_lruvec_lock(folio);

        ClearPageHasHWPoisoned(head);

        /* Walk backwards over every new folio except the first (index 0). */
        for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
                __split_huge_page_tail(folio, i, lruvec, list, new_order);
                /* Some pages can be beyond EOF: drop them from page cache */
                if (head[i].index >= end) {
                        struct folio *tail = page_folio(head + i);

                        if (shmem_mapping(folio->mapping))
                                nr_dropped++;
                        else if (folio_test_clear_dirty(tail))
                                folio_account_cleaned(tail,
                                        inode_to_wb(folio->mapping->host));
                        __filemap_remove_folio(tail, NULL);
                        folio_put(tail);
                } else if (!PageAnon(page)) {
                        __xa_store(&folio->mapping->i_pages, head[i].index,
                                        head + i, 0);
                } else if (swap_cache) {
                        __xa_store(&swap_cache->i_pages, offset + i,
                                        head + i, 0);
                }
        }

        /* The head either becomes an order-0 page or a smaller compound page. */
        if (!new_order)
                ClearPageCompound(head);
        else {
                struct folio *new_folio = (struct folio *)head;

                folio_set_order(new_folio, new_order);
        }
        unlock_page_lruvec(lruvec);
        /* Caller disabled irqs, so they are still disabled here */

        split_page_owner(head, order, new_order);
        pgalloc_tag_split(head, 1 << order);

        /* See comment in __split_huge_page_tail() */
        if (folio_test_anon(folio)) {
                /* Additional pin to swap cache */
                if (folio_test_swapcache(folio)) {
                        folio_ref_add(folio, 1 + new_nr);
                        xa_unlock(&swap_cache->i_pages);
                } else {
                        folio_ref_inc(folio);
                }
        } else {
                /* Additional pin to page cache */
                folio_ref_add(folio, 1 + new_nr);
                xa_unlock(&folio->mapping->i_pages);
        }
        local_irq_enable();

        if (nr_dropped)
                shmem_uncharge(folio->mapping->host, nr_dropped);
        remap_page(folio, nr);

        /*
         * set page to its compound_head when split to non order-0 pages, so
         * we can skip unlocking it below, since PG_locked is transferred to
         * the compound_head of the page and the caller will unlock it.
         */
        if (new_order)
                page = compound_head(page);

        for (i = 0; i < nr; i += new_nr) {
                struct page *subpage = head + i;
                struct folio *new_folio = page_folio(subpage);
                if (subpage == page)
                        continue;
                folio_unlock(new_folio);

                /*
                 * Subpages may be freed if there wasn't any mapping
                 * like if add_to_swap() is running on a lru page that
                 * had its mapping zapped. And freeing these pages
                 * requires taking the lru_lock so we do the put_page
                 * of the tail pages after the split is complete.
                 */
                free_page_and_swap_cache(subpage);
        }
}

/*
 * Racy check whether the huge page can be split.
 *
 * The folio is considered splittable when its mapcount accounts for every
 * reference except the caller's own and the "extra pins": one per page for
 * non-anonymous folios and for anonymous folios in the swap cache, none
 * otherwise. If @pextra_pins is non-NULL the number of extra pins is stored
 * there.
 */
bool can_split_folio(struct folio *folio, int *pextra_pins)
{
        int pins = 0;

        /* Additional pins from page cache */
        if (!folio_test_anon(folio) || folio_test_swapcache(folio))
                pins = folio_nr_pages(folio);

        if (pextra_pins)
                *pextra_pins = pins;

        return folio_mapcount(folio) == folio_ref_count(folio) - pins - 1;
}

/*
 * This function splits a large folio into smaller folios of order @new_order.
 * @page can point to any page of the large folio to split. The split operation
 * does not change the position of @page.
 *
 * Prerequisites:
 *
 * 1) The caller must hold a reference on the @page's owning folio, also known
 *    as the large folio.
 *
 * 2) The large folio must be locked.
 *
 * 3) The folio must not be pinned. Any unexpected folio references, including
 *    GUP pins, will result in the folio not getting split; instead, the caller
 *    will receive an -EAGAIN.
 *
 * 4) @new_order > 1, usually. Splitting to order-1 is not supported for
 *    anonymous (non-file-backed) folios, because folio->_deferred_list, which
 *    is used by partially mapped folios, is stored in subpage 2, but an order-1
 *    folio only has subpages 0 and 1. File-backed order-1 folios are supported,
 *    since they do not use _deferred_list.
 *
 * After splitting, the caller's folio reference will be transferred to @page,
 * resulting in a raised refcount of @page after this call. The other pages may
 * be freed if they are not mapped.
 *
 * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
 *
 * Pages in @new_order will inherit the mapping, flags, and so on from the
 * huge page.
 *
 * Returns 0 if the huge page was split successfully.
 *
 * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
 * the folio was concurrently removed from the page cache.
 *
 * Returns -EBUSY when trying to split the huge zeropage, if the folio is
 * under writeback, if fs-specific folio metadata cannot currently be
 * released, or if some unexpected race happened (e.g., anon VMA disappeared,
 * truncation).
 *
 * Returns -EINVAL when trying to split to an order that is incompatible
 * with the folio. Splitting to order 0 is compatible with all folios.
 */
int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
                                     unsigned int new_order)
{
        struct folio *folio = page_folio(page);
        struct deferred_split *ds_queue = get_deferred_split_queue(folio);
        /* reset xarray order to new order after split */
        XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
        struct anon_vma *anon_vma = NULL;
        struct address_space *mapping = NULL;
        /* Sampled up front: the THP_SPLIT_PAGE* events are counted at "out". */
        bool is_thp = folio_test_pmd_mappable(folio);
        int extra_pins, ret;
        pgoff_t end;
        bool is_hzp;

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

        /* The target order must be strictly smaller than the current one. */
        if (new_order >= folio_order(folio))
                return -EINVAL;

        if (folio_test_anon(folio)) {
                /* order-1 is not supported for anonymous THP. */
                if (new_order == 1) {
                        VM_WARN_ONCE(1, "Cannot split to order-1 folio");
                        return -EINVAL;
                }
        } else if (new_order) {
                /*
                 * A truncated file folio has no mapping any more. The
                 * checks below are handed folio->mapping, so bail out
                 * before they are given NULL, failing the same way as the
                 * truncation check further down.
                 */
                if (!folio->mapping) {
                        ret = -EBUSY;
                        goto out;
                }
                /* Split shmem folio to non-zero order not supported */
                if (shmem_mapping(folio->mapping)) {
                        VM_WARN_ONCE(1,
                                "Cannot split shmem folio to non-0 order");
                        return -EINVAL;
                }
                /*
                 * No split if the file system does not support large folio.
                 * Note that we might still have THPs in such mappings due to
                 * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
                 * does not actually support large folios properly.
                 */
                if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
                    !mapping_large_folio_support(folio->mapping)) {
                        VM_WARN_ONCE(1,
                                "Cannot split file folio to non-0 order");
                        return -EINVAL;
                }
        }

        /* Only swapping a whole PMD-mapped folio is supported */
        if (folio_test_swapcache(folio) && new_order)
                return -EINVAL;

        is_hzp = is_huge_zero_folio(folio);
        if (is_hzp) {
                pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
                return -EBUSY;
        }

        if (folio_test_writeback(folio))
                return -EBUSY;

        if (folio_test_anon(folio)) {
                /*
                 * The caller does not necessarily hold an mmap_lock that would
                 * prevent the anon_vma disappearing so we first take a
                 * reference to it and then lock the anon_vma for write. This
                 * is similar to folio_lock_anon_vma_read except the write lock
                 * is taken to serialise against parallel split or collapse
                 * operations.
                 */
                anon_vma = folio_get_anon_vma(folio);
                if (!anon_vma) {
                        ret = -EBUSY;
                        goto out;
                }
                end = -1;
                mapping = NULL;
                anon_vma_lock_write(anon_vma);
        } else {
                gfp_t gfp;

                mapping = folio->mapping;

                /* Truncated ? */
                if (!mapping) {
                        ret = -EBUSY;
                        goto out;
                }

                gfp = current_gfp_context(mapping_gfp_mask(mapping) &
                                                        GFP_RECLAIM_MASK);

                if (!filemap_release_folio(folio, gfp)) {
                        ret = -EBUSY;
                        goto out;
                }

                /* Allocate for the xarray split before any lock is taken. */
                xas_split_alloc(&xas, folio, folio_order(folio), gfp);
                if (xas_error(&xas)) {
                        ret = xas_error(&xas);
                        goto out;
                }

                anon_vma = NULL;
                i_mmap_lock_read(mapping);

                /*
                 * __split_huge_page() may need to trim off pages beyond EOF:
                 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
                 * which cannot be nested inside the page tree lock. So note
                 * end now: i_size itself may be changed at any moment, but
                 * folio lock is good enough to serialize the trimming.
                 */
                end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
                if (shmem_mapping(mapping))
                        end = shmem_fallocend(mapping->host, end);
        }

        /*
         * Racy check if we can split the page, before unmap_folio() will
         * split PMDs
         */
        if (!can_split_folio(folio, &extra_pins)) {
                ret = -EAGAIN;
                goto out_unlock;
        }

        unmap_folio(folio);

        /* block interrupt reentry in xa_lock and spinlock */
        local_irq_disable();
        if (mapping) {
                /*
                 * Check if the folio is present in page cache.
                 * We assume all tail are present too, if folio is there.
                 */
                xas_lock(&xas);
                xas_reset(&xas);
                if (xas_load(&xas) != folio)
                        goto fail;
        }

        /* Prevent deferred_split_scan() touching ->_refcount */
        spin_lock(&ds_queue->split_queue_lock);
        if (folio_ref_freeze(folio, 1 + extra_pins)) {
                if (folio_order(folio) > 1 &&
                    !list_empty(&folio->_deferred_list)) {
                        ds_queue->split_queue_len--;
                        /*
                         * Reinitialize page_deferred_list after removing the
                         * page from the split_queue, otherwise a subsequent
                         * split will see list corruption when checking the
                         * page_deferred_list.
                         */
                        list_del_init(&folio->_deferred_list);
                }
                spin_unlock(&ds_queue->split_queue_lock);
                if (mapping) {
                        int nr = folio_nr_pages(folio);

                        xas_split(&xas, folio, folio_order(folio));
                        /*
                         * A PMD-mappable folio split below PMD order no
                         * longer counts as a shmem/file THP.
                         */
                        if (folio_test_pmd_mappable(folio) &&
                            new_order < HPAGE_PMD_ORDER) {
                                if (folio_test_swapbacked(folio)) {
                                        __lruvec_stat_mod_folio(folio,
                                                        NR_SHMEM_THPS, -nr);
                                } else {
                                        __lruvec_stat_mod_folio(folio,
                                                        NR_FILE_THPS, -nr);
                                        filemap_nr_thps_dec(mapping);
                                }
                        }
                }

                __split_huge_page(page, list, end, new_order);
                ret = 0;
        } else {
                spin_unlock(&ds_queue->split_queue_lock);
fail:
                /* Also reached via "goto fail" with only the xa lock held. */
                if (mapping)
                        xas_unlock(&xas);
                local_irq_enable();
                remap_page(folio, folio_nr_pages(folio));
                ret = -EAGAIN;
        }

out_unlock:
        if (anon_vma) {
                anon_vma_unlock_write(anon_vma);
                put_anon_vma(anon_vma);
        }
        if (mapping)
                i_mmap_unlock_read(mapping);
out:
        xas_destroy(&xas);
        if (is_thp)
                count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
        return ret;
}

/*
 * Take @folio off its deferred split queue, if it is on one.
 *
 * Folios of order <= 1 are ignored. For larger folios the queue lock is only
 * taken when the lockless peek finds the folio's _deferred_list non-empty.
 */
void folio_undo_large_rmappable(struct folio *folio)
{
        struct deferred_split *queue;
        unsigned long irq_flags;

        /*
         * At this point, there is no one trying to add the folio to
         * deferred_list. If folio is not in deferred_list, it's safe
         * to check without acquiring the split_queue_lock.
         */
        if (folio_order(folio) <= 1 ||
            data_race(list_empty(&folio->_deferred_list)))
                return;

        queue = get_deferred_split_queue(folio);
        spin_lock_irqsave(&queue->split_queue_lock, irq_flags);
        /* Re-check under the lock before touching the queue. */
        if (list_empty(&folio->_deferred_list))
                goto unlock;

        queue->split_queue_len--;
        list_del_init(&folio->_deferred_list);
unlock:
        spin_unlock_irqrestore(&queue->split_queue_lock, irq_flags);
}

/*
 * Queue @folio on its deferred split queue unless it is too small, is in the
 * swap cache, or is already queued.
 */
void deferred_split_folio(struct folio *folio)
{
        struct deferred_split *queue = get_deferred_split_queue(folio);
#ifdef CONFIG_MEMCG
        struct mem_cgroup *memcg = folio_memcg(folio);
#endif
        unsigned long irq_flags;

        /*
         * Order 1 folios have no space for a deferred list, but we also
         * won't waste much memory by not adding them to the deferred list.
         */
        if (folio_order(folio) <= 1)
                return;

        /*
         * The try_to_unmap() in page reclaim path might reach here too,
         * this may cause a race condition to corrupt deferred split queue.
         * And, if page reclaim is already handling the same folio, it is
         * unnecessary to handle it again in shrinker.
         *
         * Check the swapcache flag to determine if the folio is being
         * handled by page reclaim since THP swap would add the folio into
         * swap cache before calling try_to_unmap().
         *
         * A folio whose _deferred_list is non-empty is already queued.
         */
        if (folio_test_swapcache(folio) ||
            !list_empty(&folio->_deferred_list))
                return;

        spin_lock_irqsave(&queue->split_queue_lock, irq_flags);
        /* Re-check under the lock before queueing. */
        if (!list_empty(&folio->_deferred_list))
                goto unlock;

        if (folio_test_pmd_mappable(folio))
                count_vm_event(THP_DEFERRED_SPLIT_PAGE);
        list_add_tail(&folio->_deferred_list, &queue->split_queue);
        queue->split_queue_len++;
#ifdef CONFIG_MEMCG
        if (memcg)
                set_shrinker_bit(memcg, folio_nid(folio),
                                 deferred_split_shrinker->id);
#endif
unlock:
        spin_unlock_irqrestore(&queue->split_queue_lock, irq_flags);
}

/*
 * Shrinker count callback: report the length of the deferred split queue
 * selected by @sc (the memcg's queue when sc->memcg is set and CONFIG_MEMCG
 * is enabled, otherwise the queue of node sc->nid).
 */
static unsigned long deferred_split_count(struct shrinker *shrink,
                struct shrink_control *sc)
{
        struct pglist_data *node = NODE_DATA(sc->nid);
        struct deferred_split *queue = &node->deferred_split_queue;

#ifdef CONFIG_MEMCG
        if (sc->memcg)
                queue = &sc->memcg->deferred_split_queue;
#endif
        /* Lockless read of the queue length. */
        return READ_ONCE(queue->split_queue_len);
}

static unsigned long deferred_split_scan(struct shrinker *shrink,
                struct shrink_control *sc)
{
        struct pglist_data *pgdata = NODE_DATA(sc->nid);
        struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
        unsigned long flags;
        LIST_HEAD(list);
        struct folio *folio, *next;
        int split = 0;

#ifdef CONFIG_MEMCG
        if (sc->memcg)
                ds_queue = &sc->memcg->deferred_split_queue;
#endif

        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
        /* Take pin on all head pages to avoid freeing them under us */
        list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
                                                        _deferred_list) {
                if (folio_try_get(folio)) {
                        list_move(&folio->_deferred_list, &list);
                } else {
                        /* We lost race with folio_put() */
                        list_del_init(&folio->_deferred_list);
                        ds_queue->split_queue_len--;
                }
                if (!--sc->nr_to_scan)
                        break;
        }
        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

        list_for_each_entry_safe(folio, next, &list, _deferred_list) {
                if (!folio_trylock(folio))
                        goto next;
                /* split_huge_page() removes page from list on success */
                if (!split_folio(folio))
                        split++;
                folio_unlock(folio);
next:
                folio_put(folio);
        }

        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
        list_splice_tail(&list, &ds_queue->split_queue);
        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

        /*
         * Stop shrinker if we didn't split any page, but the queue is empty.
         * This can happen if pages were freed under us.
         */
        if (!split && list_empty(&ds_queue->split_queue))
                return SHRINK_STOP;
        return split;
}

#ifdef CONFIG_DEBUG_FS
/*
 * Walk every pfn of every managed zone and try to split each large,
 * non-hugetlb folio that is on the LRU. The number of candidates and of
 * successful splits is reported with pr_debug().
 */
static void split_huge_pages_all(void)
{
        unsigned long nr_seen = 0, nr_split = 0;
        unsigned long pfn, zone_end;
        struct folio *folio;
        struct page *page;
        struct zone *zone;

        pr_debug("Split all THPs\n");
        for_each_zone(zone) {
                if (!managed_zone(zone))
                        continue;

                zone_end = zone_end_pfn(zone);
                for (pfn = zone->zone_start_pfn; pfn < zone_end; pfn++) {
                        int nr_pages;

                        page = pfn_to_online_page(pfn);
                        if (!page || PageTail(page))
                                continue;

                        folio = page_folio(page);
                        if (!folio_try_get(folio))
                                continue;

                        /*
                         * With the reference held, skip the folio if the
                         * page no longer belongs to it, if it lives in
                         * another zone, or if it is not a large non-hugetlb
                         * folio on the LRU.
                         */
                        if (unlikely(page_folio(page) != folio) ||
                            zone != folio_zone(folio) ||
                            !folio_test_large(folio) ||
                            folio_test_hugetlb(folio) ||
                            !folio_test_lru(folio))
                                goto next;

                        nr_seen++;
                        folio_lock(folio);
                        /* Read the size before the split can change it. */
                        nr_pages = folio_nr_pages(folio);
                        if (split_folio(folio) == 0)
                                nr_split++;
                        /* Jump past the rest of what was one large folio. */
                        pfn += nr_pages - 1;
                        folio_unlock(folio);
next:
                        folio_put(folio);
                        cond_resched();
                }
        }

        pr_debug("%lu of %lu THP split\n", nr_split, nr_seen);
}

static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
{
        return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
                    is_vm_hugetlb_page(vma);
}

/*
 * Try to split the transparent huge pages mapped by task @pid in the virtual
 * address range [@vaddr_start, @vaddr_end) down to order @new_order.
 *
 * Both addresses are rounded down to a page boundary. The walk stops at the
 * first address that is not covered by any VMA. VMAs rejected by
 * vma_not_suitable_for_thp_split() are skipped as a whole.
 *
 * Returns 0 once the walk has run (whether or not anything was split),
 * -ESRCH if no task with @pid is found, or -EINVAL if the task has no mm.
 */
static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
                                unsigned long vaddr_end, unsigned int new_order)
{
        int ret = 0;
        struct task_struct *task;
        struct mm_struct *mm;
        unsigned long total = 0, split = 0;
        unsigned long addr;

        vaddr_start &= PAGE_MASK;
        vaddr_end &= PAGE_MASK;

        /* Find the task_struct from pid */
        rcu_read_lock();
        task = find_task_by_vpid(pid);
        if (!task) {
                rcu_read_unlock();
                ret = -ESRCH;
                goto out;
        }
        get_task_struct(task);
        rcu_read_unlock();

        /* Find the mm_struct */
        mm = get_task_mm(task);
        put_task_struct(task);

        if (!mm) {
                ret = -EINVAL;
                goto out;
        }

        pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
                 pid, vaddr_start, vaddr_end);

        mmap_read_lock(mm);
        /*
         * always increase addr by PAGE_SIZE, since we could have a PTE page
         * table filled with PTE-mapped THPs, each of which is distinct.
         */
        for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
                struct vm_area_struct *vma = vma_lookup(mm, addr);
                struct page *page;
                struct folio *folio;

                if (!vma)
                        break;

                /* skip special VMA and hugetlb VMA */
                if (vma_not_suitable_for_thp_split(vma)) {
                        /*
                         * The loop increment adds PAGE_SIZE after this
                         * "continue", so step to the last page of the VMA;
                         * the next iteration then starts exactly at vm_end
                         * instead of skipping the first page behind it.
                         */
                        addr = vma->vm_end - PAGE_SIZE;
                        continue;
                }

                /* FOLL_DUMP to ignore special (like zero) pages */
                page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);

                if (IS_ERR_OR_NULL(page))
                        continue;

                folio = page_folio(page);
                if (!is_transparent_hugepage(folio))
                        goto next;

                if (new_order >= folio_order(folio))
                        goto next;

                total++;
                /*
                 * For folios with private, split_huge_page_to_list_to_order()
                 * will try to drop it before split and then check if the folio
                 * can be split or not. So skip the check here.
                 */
                if (!folio_test_private(folio) &&
                    !can_split_folio(folio, NULL))
                        goto next;

                if (!folio_trylock(folio))
                        goto next;

                if (!split_folio_to_order(folio, new_order))
                        split++;

                folio_unlock(folio);
next:
                /* Drop the reference taken by follow_page(FOLL_GET). */
                folio_put(folio);
                cond_resched();
        }
        mmap_read_unlock(mm);
        mmput(mm);

        pr_debug("%lu of %lu THP split\n", split, total);

out:
        return ret;
}

/*
 * Try to split the large page-cache folios of the file at @file_path whose
 * index lies in [@off_start, @off_end) down to order @new_order.
 *
 * Returns 0 once the range has been walked, or -EINVAL if the path name
 * cannot be set up or the file cannot be opened.
 */
static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
                                pgoff_t off_end, unsigned int new_order)
{
        unsigned long nr_seen = 0, nr_split = 0;
        struct address_space *mapping;
        struct filename *name;
        struct file *filp;
        int step = 1;
        int ret = -EINVAL;
        pgoff_t index;

        name = getname_kernel(file_path);
        if (IS_ERR(name))
                return ret;

        filp = file_open_name(name, O_RDONLY, 0);
        if (IS_ERR(filp))
                goto out;

        pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
                 file_path, off_start, off_end);

        mapping = filp->f_mapping;

        for (index = off_start; index < off_end; index += step) {
                struct folio *folio = filemap_get_folio(mapping, index);

                /* Advance one page unless a large folio is found below. */
                step = 1;
                if (IS_ERR(folio))
                        continue;

                if (folio_test_large(folio)) {
                        nr_seen++;
                        step = folio_nr_pages(folio);

                        /* Only lockable folios above the target order. */
                        if (new_order < folio_order(folio) &&
                            folio_trylock(folio)) {
                                if (split_folio_to_order(folio, new_order) == 0)
                                        nr_split++;
                                folio_unlock(folio);
                        }
                }

                /* Drop the reference from filemap_get_folio(). */
                folio_put(folio);
                cond_resched();
        }

        filp_close(filp, NULL);
        ret = 0;

        pr_debug("%lu of %lu file-backed THP split\n", nr_split, nr_seen);
out:
        putname(name);
        return ret;
}

#define MAX_INPUT_BUF_SZ 255

/*
 * debugfs write handler for "split_huge_pages".
 *
 * At most MAX_INPUT_BUF_SZ - 1 bytes of the user buffer are interpreted, in
 * one of three forms:
 *
 *   "1"                                     split all THPs in the system
 *   "<pid>,0x<start>,0x<end>[,<order>]"     split THPs of a task in a virtual
 *                                           address range
 *   "/<path>,0x<start>,0x<end>[,<order>]"   split THPs of a file in a page
 *                                           offset range
 *
 * <order> defaults to 0 when omitted. Calls are serialised by a local mutex.
 *
 * Returns the length of the interpreted input on success, -EFAULT if the
 * user buffer cannot be copied, -EINVAL for malformed input, or the error
 * returned by the mutex acquisition or the split helpers.
 */
static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppops)
{
        static DEFINE_MUTEX(split_debug_mutex);
        ssize_t ret;
        /*
         * hold pid, start_vaddr, end_vaddr, new_order or
         * file_path, off_start, off_end, new_order
         */
        char input_buf[MAX_INPUT_BUF_SZ];
        int pid;
        unsigned long vaddr_start, vaddr_end;
        unsigned int new_order = 0;

        ret = mutex_lock_interruptible(&split_debug_mutex);
        if (ret)
                return ret;

        ret = -EFAULT;

        memset(input_buf, 0, MAX_INPUT_BUF_SZ);
        if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
                goto out;

        /* Guarantee termination even if the user filled the whole buffer. */
        input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';

        if (input_buf[0] == '/') {
                char *tok;
                char *args = input_buf;
                char file_path[MAX_INPUT_BUF_SZ];
                pgoff_t off_start = 0, off_end = 0;
                /* Taken before strsep() cuts the string at the first ','. */
                size_t input_len = strlen(input_buf);

                /*
                 * strsep() leaves @args NULL when the input holds no ',',
                 * i.e. a bare path without offsets. Reject that instead of
                 * handing a NULL string to sscanf().
                 */
                tok = strsep(&args, ",");
                if (!tok || !args) {
                        ret = -EINVAL;
                        goto out;
                }
                /* tok lies within input_buf, which is as large as file_path. */
                strcpy(file_path, tok);

                ret = sscanf(args, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order);
                if (ret != 2 && ret != 3) {
                        ret = -EINVAL;
                        goto out;
                }
                ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order);
                if (!ret)
                        ret = input_len;

                goto out;
        }

        ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order);
        if (ret == 1 && pid == 1) {
                split_huge_pages_all();
                ret = strlen(input_buf);
                goto out;
        } else if (ret != 3 && ret != 4) {
                ret = -EINVAL;
                goto out;
        }

        ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order);
        if (!ret)
                ret = strlen(input_buf);
out:
        mutex_unlock(&split_debug_mutex);
        return ret;

}

/* File operations of the write-only, non-seekable split_huge_pages file. */
static const struct file_operations split_huge_pages_fops = {
        .owner  = THIS_MODULE,
        .llseek = no_llseek,
        .write  = split_huge_pages_write,
};

/*
 * Register the "split_huge_pages" debugfs file with mode 0200, backed by
 * split_huge_pages_fops. Runs as a late initcall and always returns 0; the
 * result of debugfs_create_file() is not checked.
 */
static int __init split_huge_pages_debugfs(void)
{
        debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
                            &split_huge_pages_fops);
        return 0;
}
late_initcall(split_huge_pages_debugfs);
#endif

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/*
 * Replace the PMD mapping described by @pvmw with a migration entry for
 * @page.
 *
 * Nothing is done (and 0 is returned) unless @pvmw has a pmd and no pte.
 * Otherwise the PMD is invalidated, its write/young/dirty/soft-dirty/uffd-wp
 * state is carried over into a migration swap entry, the entry is installed,
 * and the rmap and one folio reference are dropped.
 *
 * Returns 0 on success, or -EBUSY if folio_try_share_anon_rmap_pmd() fails
 * for an anon-exclusive page; in that case the original PMD value is
 * restored.
 */
int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
                struct page *page)
{
        struct folio *folio = page_folio(page);
        struct vm_area_struct *vma = pvmw->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address = pvmw->address;
        bool anon_exclusive;
        pmd_t pmdval;
        swp_entry_t entry;
        pmd_t pmdswp;

        /* Only act when the walk stopped at a PMD (pmd set, pte not set). */
        if (!(pvmw->pmd && !pvmw->pte))
                return 0;

        flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
        /* pmdval keeps the old PMD value for the flag checks below. */
        pmdval = pmdp_invalidate(vma, address, pvmw->pmd);

        /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
        anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
        if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
                /* Put the original PMD back before reporting failure. */
                set_pmd_at(mm, address, pvmw->pmd, pmdval);
                return -EBUSY;
        }

        if (pmd_dirty(pmdval))
                folio_mark_dirty(folio);
        /* Pick the entry type: writable, readable-exclusive or readable. */
        if (pmd_write(pmdval))
                entry = make_writable_migration_entry(page_to_pfn(page));
        else if (anon_exclusive)
                entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
        else
                entry = make_readable_migration_entry(page_to_pfn(page));
        /* Carry the young and dirty state over into the entry. */
        if (pmd_young(pmdval))
                entry = make_migration_entry_young(entry);
        if (pmd_dirty(pmdval))
                entry = make_migration_entry_dirty(entry);
        pmdswp = swp_entry_to_pmd(entry);
        /* Carry soft-dirty and uffd-wp over into the swap PMD. */
        if (pmd_soft_dirty(pmdval))
                pmdswp = pmd_swp_mksoft_dirty(pmdswp);
        if (pmd_uffd_wp(pmdval))
                pmdswp = pmd_swp_mkuffd_wp(pmdswp);
        set_pmd_at(mm, address, pvmw->pmd, pmdswp);
        /* The mapping is gone: drop its rmap and a folio reference. */
        folio_remove_rmap_pmd(folio, page, vma);
        folio_put(folio);
        trace_set_migration_pmd(address, pmd_val(pmdswp));

        return 0;
}

/*
 * Replace the PMD migration entry described by @pvmw with a huge PMD mapping
 * of @new.
 *
 * Nothing is done unless @pvmw has a pmd and no pte. Otherwise a folio
 * reference is taken, a PMD is built from vma->vm_page_prot, the soft-dirty,
 * write, uffd-wp, young and dirty state recorded in the migration entry is
 * applied to it, the rmap is added and the PMD is installed.
 */
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
{
        struct folio *folio = page_folio(new);
        struct vm_area_struct *vma = pvmw->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address = pvmw->address;
        unsigned long haddr = address & HPAGE_PMD_MASK;
        pmd_t pmde;
        swp_entry_t entry;

        /* Only act when the walk stopped at a PMD (pmd set, pte not set). */
        if (!(pvmw->pmd && !pvmw->pte))
                return;

        entry = pmd_to_swp_entry(*pvmw->pmd);
        /* Take a folio reference for the mapping installed below. */
        folio_get(folio);
        pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
        if (pmd_swp_soft_dirty(*pvmw->pmd))
                pmde = pmd_mksoft_dirty(pmde);
        if (is_writable_migration_entry(entry))
                pmde = pmd_mkwrite(pmde, vma);
        if (pmd_swp_uffd_wp(*pvmw->pmd))
                pmde = pmd_mkuffd_wp(pmde);
        /* Mark the PMD old unless the entry recorded it as young. */
        if (!is_migration_entry_young(entry))
                pmde = pmd_mkold(pmde);
        /* NOTE: this may contain setting soft-dirty on some archs */
        if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
                pmde = pmd_mkdirty(pmde);

        if (folio_test_anon(folio)) {
                rmap_t rmap_flags = RMAP_NONE;

                /* Every entry type except plain readable is exclusive. */
                if (!is_readable_migration_entry(entry))
                        rmap_flags |= RMAP_EXCLUSIVE;

                folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
        } else {
                folio_add_file_rmap_pmd(folio, new, vma);
        }
        /* A writable anonymous PMD must map an anon-exclusive page. */
        VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
        set_pmd_at(mm, haddr, pvmw->pmd, pmde);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache_pmd(vma, address, pvmw->pmd);
        trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif










































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
/* SPDX-License-Identifier: GPL-2.0 */
/* interrupt.h */
#ifndef _LINUX_INTERRUPT_H
#define _LINUX_INTERRUPT_H

#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/cpumask.h>
#include <linux/irqreturn.h>
#include <linux/irqnr.h>
#include <linux/hardirq.h>
#include <linux/irqflags.h>
#include <linux/hrtimer.h>
#include <linux/kref.h>
#include <linux/workqueue.h>
#include <linux/jump_label.h>

#include <linux/atomic.h>
#include <asm/ptrace.h>
#include <asm/irq.h>
#include <asm/sections.h>

/*
 * These correspond to the IORESOURCE_IRQ_* defines in
 * linux/ioport.h to select the interrupt line behaviour.  When
 * requesting an interrupt without specifying an IRQF_TRIGGER, the
 * setting should be assumed to be "as already configured", which
 * may be as per machine or firmware initialisation.
 */
/* No trigger bit set. */
#define IRQF_TRIGGER_NONE        0x00000000
#define IRQF_TRIGGER_RISING        0x00000001
#define IRQF_TRIGGER_FALLING        0x00000002
#define IRQF_TRIGGER_HIGH        0x00000004
#define IRQF_TRIGGER_LOW        0x00000008
/* Union of the four trigger-type bits above (0x0000000f). */
#define IRQF_TRIGGER_MASK        (IRQF_TRIGGER_HIGH | IRQF_TRIGGER_LOW | \
                                 IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING)
/* Separate bit, not part of IRQF_TRIGGER_MASK. */
#define IRQF_TRIGGER_PROBE        0x00000010

/*
 * These flags used only by the kernel as part of the
 * irq handling routines.
 *
 * IRQF_SHARED - allow sharing the irq among several devices
 * IRQF_PROBE_SHARED - set by callers when they expect sharing mismatches to occur
 * IRQF_TIMER - Flag to mark this interrupt as timer interrupt
 * IRQF_PERCPU - Interrupt is per cpu
 * IRQF_NOBALANCING - Flag to exclude this interrupt from irq balancing
 * IRQF_IRQPOLL - Interrupt is used for polling (only the interrupt that is
 *                registered first in a shared interrupt is considered for
 *                performance reasons)
 * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished.
 *                Used by threaded interrupts which need to keep the
 *                irq line disabled until the threaded handler has been run.
 * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend.  Does not guarantee
 *                   that this interrupt will wake the system from a suspended
 *                   state.  See Documentation/power/suspend-and-interrupts.rst
 * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set
 * IRQF_NO_THREAD - Interrupt cannot be threaded
 * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device
 *                resume time.
 * IRQF_COND_SUSPEND - If the IRQ is shared with a NO_SUSPEND user, execute this
 *                interrupt handler after suspending interrupts. For system
 *                wakeup devices users need to implement wakeup detection in
 *                their interrupt handlers.
 * IRQF_NO_AUTOEN - Don't enable IRQ or NMI automatically when users request it.
 *                Users will enable it explicitly by enable_irq() or enable_nmi()
 *                later.
 * IRQF_NO_DEBUG - Exclude from runaway detection for IPI and similar handlers,
 *                   depends on IRQF_PERCPU.
 * IRQF_COND_ONESHOT - Agree to do IRQF_ONESHOT if already set for a shared
 *                 interrupt.
 */
/* Flag bits start at 0x80; the values below 0x20 are the IRQF_TRIGGER_* bits. */
#define IRQF_SHARED                0x00000080
#define IRQF_PROBE_SHARED        0x00000100
#define __IRQF_TIMER                0x00000200
#define IRQF_PERCPU                0x00000400
#define IRQF_NOBALANCING        0x00000800
#define IRQF_IRQPOLL                0x00001000
#define IRQF_ONESHOT                0x00002000
#define IRQF_NO_SUSPEND                0x00004000
#define IRQF_FORCE_RESUME        0x00008000
#define IRQF_NO_THREAD                0x00010000
#define IRQF_EARLY_RESUME        0x00020000
#define IRQF_COND_SUSPEND        0x00040000
#define IRQF_NO_AUTOEN                0x00080000
#define IRQF_NO_DEBUG                0x00100000
#define IRQF_COND_ONESHOT        0x00200000

/*
 * IRQF_TIMER is a composite: the raw __IRQF_TIMER marker bit combined with
 * IRQF_NO_SUSPEND and IRQF_NO_THREAD.
 */
#define IRQF_TIMER                (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)

/*
 * These values can be returned by request_any_context_irq() and
 * describe the context the interrupt will be run in.
 *
 * IRQC_IS_HARDIRQ - interrupt runs in hardirq context
 * IRQC_IS_NESTED - interrupt runs in a nested threaded context
 */
enum {
        IRQC_IS_HARDIRQ        = 0,        /* explicitly 0 */
        IRQC_IS_NESTED,                /* implicitly 1 */
};

/*
 * Function type used for both the primary (@handler) and the threaded
 * (@thread_fn) interrupt handlers: takes an int and an opaque pointer and
 * returns an irqreturn_t.
 * NOTE(review): the arguments are presumably the irq number and the dev_id
 * cookie supplied at request time - confirm against the genirq core.
 */
typedef irqreturn_t (*irq_handler_t)(int, void *);

/**
 * struct irqaction - per interrupt action descriptor
 * @handler:        interrupt handler function
 * @name:        name of the device
 * @dev_id:        cookie to identify the device
 * @percpu_dev_id:        per-CPU (__percpu) cookie to identify the device
 * @next:        pointer to the next irqaction for shared interrupts
 * @irq:        interrupt number
 * @flags:        flags (see IRQF_* above)
 * @thread_fn:        interrupt handler function for threaded interrupts
 * @thread:        thread pointer for threaded interrupts
 * @secondary:        pointer to secondary irqaction (force threading)
 * @thread_flags:        flags related to @thread
 * @thread_mask:        bitmask for keeping track of @thread activity
 * @dir:        pointer to the proc/irq/NN/name entry
 *
 * The structure carries the ____cacheline_internodealigned_in_smp
 * alignment attribute.
 */
struct irqaction {
        irq_handler_t                handler;
        void                        *dev_id;
        void __percpu                *percpu_dev_id;
        struct irqaction        *next;
        irq_handler_t                thread_fn;
        struct task_struct        *thread;
        struct irqaction        *secondary;
        unsigned int                irq;
        unsigned int                flags;
        unsigned long                thread_flags;
        unsigned long                thread_mask;
        const char                *name;
        struct proc_dir_entry        *dir;
} ____cacheline_internodealigned_in_smp;

extern irqreturn_t no_action(int cpl, void *dev_id);

/*
 * If a (PCI) device interrupt is not connected we set dev->irq to
 * IRQ_NOTCONNECTED. This causes request_irq() to fail with -ENOTCONN, so we
 * can distinguish that case from other error returns.
 *
 * 0x80000000 is guaranteed to be outside the available range of interrupts
 * and easy to distinguish from other possible incorrect values.
 */
#define IRQ_NOTCONNECTED        (1U << 31)

extern int __must_check
request_threaded_irq(unsigned int irq, irq_handler_t handler,
                     irq_handler_t thread_fn,
                     unsigned long flags, const char *name, void *dev);

/**
 * request_irq - Install a handler on an interrupt line
 * @irq:        Interrupt line being requested
 * @handler:        Called when the IRQ occurs; acts as the primary handler
 *                for threaded interrupts. Passing NULL installs the default
 *                primary handler
 * @flags:        Handling flags
 * @name:        Name of the device that generates this interrupt
 * @dev:        Cookie handed to the handler function
 *
 * Thin wrapper around request_threaded_irq() that supplies NULL as the
 * threaded handler; refer to that function's documentation for details.
 */
static inline int __must_check
request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,
            const char *name, void *dev)
{
        /* Plain request: there is no threaded part. */
        irq_handler_t no_thread_fn = NULL;

        return request_threaded_irq(irq, handler, no_thread_fn, flags,
                                    name, dev);
}

extern int __must_check
request_any_context_irq(unsigned int irq, irq_handler_t handler,
                        unsigned long flags, const char *name, void *dev_id);

extern int __must_check
__request_percpu_irq(unsigned int irq, irq_handler_t handler,
                     unsigned long flags, const char *devname,
                     void __percpu *percpu_dev_id);

extern int __must_check
request_nmi(unsigned int irq, irq_handler_t handler, unsigned long flags,
            const char *name, void *dev);

/*
 * Convenience wrapper: forwards to __request_percpu_irq() with a flags
 * argument of 0.
 */
static inline int __must_check
request_percpu_irq(unsigned int irq, irq_handler_t handler,
                   const char *devname, void __percpu *percpu_dev_id)
{
        const unsigned long no_flags = 0;

        return __request_percpu_irq(irq, handler, no_flags, devname,
                                    percpu_dev_id);
}

extern int __must_check
request_percpu_nmi(unsigned int irq, irq_handler_t handler,
                   const char *devname, void __percpu *dev);

extern const void *free_irq(unsigned int, void *);
extern void free_percpu_irq(unsigned int, void __percpu *);

extern const void *free_nmi(unsigned int irq, void *dev_id);
extern void free_percpu_nmi(unsigned int irq, void __percpu *percpu_dev_id);

struct device;

extern int __must_check
devm_request_threaded_irq(struct device *dev, unsigned int irq,
                          irq_handler_t handler, irq_handler_t thread_fn,
                          unsigned long irqflags, const char *devname,
                          void *dev_id);

/*
 * Convenience wrapper: forwards to devm_request_threaded_irq() with NULL
 * as the threaded handler.
 */
static inline int __must_check
devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler,
                 unsigned long irqflags, const char *devname, void *dev_id)
{
        irq_handler_t no_thread_fn = NULL;

        return devm_request_threaded_irq(dev, irq, handler, no_thread_fn,
                                         irqflags, devname, dev_id);
}

extern int __must_check
devm_request_any_context_irq(struct device *dev, unsigned int irq,
                 irq_handler_t handler, unsigned long irqflags,
                 const char *devname, void *dev_id);

extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);

bool irq_has_action(unsigned int irq);
extern void disable_irq_nosync(unsigned int irq);
extern bool disable_hardirq(unsigned int irq);
extern void disable_irq(unsigned int irq);
extern void disable_percpu_irq(unsigned int irq);
extern void enable_irq(unsigned int irq);
extern void enable_percpu_irq(unsigned int irq, unsigned int type);
extern bool irq_percpu_is_enabled(unsigned int irq);
extern void irq_wake_thread(unsigned int irq, void *dev_id);

extern void disable_nmi_nosync(unsigned int irq);
extern void disable_percpu_nmi(unsigned int irq);
extern void enable_nmi(unsigned int irq);
extern void enable_percpu_nmi(unsigned int irq, unsigned int type);
extern int prepare_percpu_nmi(unsigned int irq);
extern void teardown_percpu_nmi(unsigned int irq);

extern int irq_inject_interrupt(unsigned int irq);

/* The following three functions are for the core kernel use only. */
extern void suspend_device_irqs(void);
extern void resume_device_irqs(void);
extern void rearm_wake_irq(unsigned int irq);

/**
 * struct irq_affinity_notify - context for notification of IRQ affinity changes
 * @irq:                Interrupt to which notification applies
 * @kref:                Reference count, for internal use
 * @work:                Work item, for internal use
 * @notify:                Function to be called on change.  This will be
 *                        called in process context.  It receives this
 *                        structure and a cpumask pointer.
 * @release:                Function to be called on release.  This will be
 *                        called in process context.  Once registered, the
 *                        structure must only be freed when this function is
 *                        called or later.  It receives a struct kref
 *                        pointer (presumably @kref - confirm in the core).
 */
struct irq_affinity_notify {
        unsigned int irq;
        struct kref kref;
        struct work_struct work;
        void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
        void (*release)(struct kref *ref);
};

#define        IRQ_AFFINITY_MAX_SETS  4

/**
 * struct irq_affinity - Description for automatic irq affinity assignments
 * @pre_vectors:        Don't apply affinity to @pre_vectors at beginning of
 *                        the MSI(-X) vector space
 * @post_vectors:        Don't apply affinity to @post_vectors at end of
 *                        the MSI(-X) vector space
 * @nr_sets:                The number of interrupt sets for which affinity
 *                        spreading is required
 * @set_size:                Array holding the size of each interrupt set
 *                        (IRQ_AFFINITY_MAX_SETS entries)
 * @calc_sets:                Callback for calculating the number and size
 *                        of interrupt sets; receives this structure and
 *                        a vector count
 * @priv:                Private data for usage by @calc_sets, usually a
 *                        pointer to driver/device specific data.
 */
struct irq_affinity {
        unsigned int        pre_vectors;
        unsigned int        post_vectors;
        unsigned int        nr_sets;
        unsigned int        set_size[IRQ_AFFINITY_MAX_SETS];
        void                (*calc_sets)(struct irq_affinity *, unsigned int nvecs);
        void                *priv;
};

/**
 * struct irq_affinity_desc - Interrupt affinity descriptor
 * @mask:        cpumask to hold the affinity assignment (embedded by value)
 * @is_managed: 1 if the interrupt is managed internally (single-bit field)
 */
struct irq_affinity_desc {
        struct cpumask        mask;
        unsigned int        is_managed : 1;
};

#if defined(CONFIG_SMP)

extern cpumask_var_t irq_default_affinity;

extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
extern int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask);

extern int irq_can_set_affinity(unsigned int irq);
extern int irq_select_affinity(unsigned int irq);

extern int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m,
                                     bool setaffinity);

/**
 * irq_update_affinity_hint - Change only the affinity hint
 * @irq:        Interrupt whose hint is updated
 * @m:                cpumask pointer (NULL clears the hint)
 *
 * The hint is updated; the affinity of the interrupt itself is left alone.
 */
static inline int
irq_update_affinity_hint(unsigned int irq, const struct cpumask *m)
{
        int ret;

        /* false: do not apply @m as the affinity. */
        ret = __irq_apply_affinity_hint(irq, m, false);

        return ret;
}

/**
 * irq_set_affinity_and_hint - Change the affinity hint and apply the mask
 *                             to the interrupt as well
 * @irq:        Interrupt to update
 * @m:                cpumask pointer (NULL clears the hint)
 *
 * The hint is updated and, when @m is not NULL, @m is also applied as the
 * affinity of that interrupt.
 */
static inline int
irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m)
{
        int ret;

        /* true: also apply @m as the affinity. */
        ret = __irq_apply_affinity_hint(irq, m, true);

        return ret;
}

/*
 * Deprecated alias with the behaviour of irq_set_affinity_and_hint().
 * New code should call irq_update_affinity_hint() or
 * irq_set_affinity_and_hint() directly.
 */
static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
{
        /* Same call irq_set_affinity_and_hint() makes: update hint and affinity. */
        return __irq_apply_affinity_hint(irq, m, true);
}

extern int irq_update_affinity_desc(unsigned int irq,
                                    struct irq_affinity_desc *affinity);

extern int
irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);

struct irq_affinity_desc *
irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd);

unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
                                       const struct irq_affinity *affd);

#else /* CONFIG_SMP */

/* !CONFIG_SMP stub: always fails with -EINVAL. */
static inline int
irq_set_affinity(unsigned int irq, const struct cpumask *m)
{
        return -EINVAL;
}

/* !CONFIG_SMP stub: reports success (0) without doing anything. */
static inline int
irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
{
        return 0;
}

/* !CONFIG_SMP stub: always returns 0. */
static inline int
irq_can_set_affinity(unsigned int irq)
{
        return 0;
}

/* !CONFIG_SMP stub: always returns 0. */
static inline int
irq_select_affinity(unsigned int irq)
{
        return 0;
}

/* !CONFIG_SMP stub: always fails with -EINVAL. */
static inline int
irq_update_affinity_hint(unsigned int irq, const struct cpumask *m)
{
        return -EINVAL;
}

/* !CONFIG_SMP stub: always fails with -EINVAL. */
static inline int
irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m)
{
        return -EINVAL;
}

/* !CONFIG_SMP stub: always fails with -EINVAL. */
static inline int
irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
{
        return -EINVAL;
}

/* !CONFIG_SMP stub: always fails with -EINVAL. */
static inline int
irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity)
{
        return -EINVAL;
}

/* !CONFIG_SMP stub: nothing is registered, 0 is returned. */
static inline int irq_set_affinity_notifier(unsigned int irq,
                                            struct irq_affinity_notify *notify)
{
        return 0;
}

/* !CONFIG_SMP stub: no masks are created, NULL is returned. */
static inline struct irq_affinity_desc *irq_create_affinity_masks(unsigned int nvec,
                                                                  struct irq_affinity *affd)
{
        return NULL;
}

/* !CONFIG_SMP stub: the answer is simply @maxvec. */
static inline unsigned int irq_calc_affinity_vectors(unsigned int minvec,
                                                     unsigned int maxvec,
                                                     const struct irq_affinity *affd)
{
        return maxvec;
}

#endif /* CONFIG_SMP */

/*
 * Special lockdep variants of irq disabling/enabling.
 * These should be used for locking constructs that
 * know that a particular irq context which is disabled,
 * and which is the only irq-context user of a lock,
 * that it's safe to take the lock in the irq-disabled
 * section without disabling hardirqs.
 *
 * On !CONFIG_LOCKDEP they are equivalent to the normal
 * irq disable/enable methods.
 */
/*
 * Calls disable_irq_nosync(@irq) and then, on CONFIG_LOCKDEP builds only,
 * also disables local interrupts.
 */
static inline void disable_irq_nosync_lockdep(unsigned int irq)
{
        disable_irq_nosync(irq);
#ifdef CONFIG_LOCKDEP
        local_irq_disable();
#endif
}

/*
 * Calls disable_irq_nosync(@irq) and then, on CONFIG_LOCKDEP builds only,
 * does local_irq_save() into *@flags. Without CONFIG_LOCKDEP, *@flags is
 * not written.
 */
static inline void disable_irq_nosync_lockdep_irqsave(unsigned int irq, unsigned long *flags)
{
        disable_irq_nosync(irq);
#ifdef CONFIG_LOCKDEP
        local_irq_save(*flags);
#endif
}

/*
 * Calls disable_irq(@irq) and then, on CONFIG_LOCKDEP builds only, also
 * disables local interrupts.
 */
static inline void disable_irq_lockdep(unsigned int irq)
{
        disable_irq(irq);
#ifdef CONFIG_LOCKDEP
        local_irq_disable();
#endif
}

/*
 * Reverse of the *_lockdep disable helpers: on CONFIG_LOCKDEP builds local
 * interrupts are re-enabled first, then enable_irq(@irq) is called.
 */
static inline void enable_irq_lockdep(unsigned int irq)
{
#ifdef CONFIG_LOCKDEP
        local_irq_enable();
#endif
        enable_irq(irq);
}

/*
 * On CONFIG_LOCKDEP builds the local interrupt state is first restored from
 * *@flags; enable_irq(@irq) is then called in all configurations.
 */
static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long *flags)
{
#ifdef CONFIG_LOCKDEP
        local_irq_restore(*flags);
#endif
        enable_irq(irq);
}

/* IRQ wakeup (PM) control: */
extern int irq_set_irq_wake(unsigned int irq, unsigned int on);

/* Shorthand for irq_set_irq_wake(@irq, 1). */
static inline int enable_irq_wake(unsigned int irq)
{
        const unsigned int wake_on = 1;

        return irq_set_irq_wake(irq, wake_on);
}

/* Shorthand for irq_set_irq_wake(@irq, 0). */
static inline int disable_irq_wake(unsigned int irq)
{
        const unsigned int wake_off = 0;

        return irq_set_irq_wake(irq, wake_off);
}

/*
 * irq_get_irqchip_state/irq_set_irqchip_state specific flags.
 * A value of this type is passed as the 'which' argument of those two
 * functions to select the piece of state being read or written.
 */
enum irqchip_irq_state {
        IRQCHIP_STATE_PENDING,                /* Is interrupt pending? */
        IRQCHIP_STATE_ACTIVE,                /* Is interrupt in progress? */
        IRQCHIP_STATE_MASKED,                /* Is interrupt masked? */
        IRQCHIP_STATE_LINE_LEVEL,        /* Is IRQ line high? */
};

extern int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
                                 bool *state);
extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
                                 bool state);

/*
 * force_irqthreads() evaluates to:
 *  - true when both CONFIG_IRQ_FORCED_THREADING and CONFIG_PREEMPT_RT are set,
 *  - a static-branch test of force_irqthreads_key when only
 *    CONFIG_IRQ_FORCED_THREADING is set,
 *  - false otherwise.
 */
#ifdef CONFIG_IRQ_FORCED_THREADING
# ifdef CONFIG_PREEMPT_RT
#  define force_irqthreads()        (true)
# else
DECLARE_STATIC_KEY_FALSE(force_irqthreads_key);
#  define force_irqthreads()        (static_branch_unlikely(&force_irqthreads_key))
# endif
#else
#define force_irqthreads()        (false)
#endif

/*
 * Default accessors for the per-CPU softirq pending word. If
 * local_softirq_pending is already defined, all three accessors are skipped.
 * If only local_softirq_pending_ref is already defined, the accessors below
 * operate on that per-CPU variable instead of irq_stat.__softirq_pending.
 */
#ifndef local_softirq_pending

#ifndef local_softirq_pending_ref
#define local_softirq_pending_ref irq_stat.__softirq_pending
#endif

/* Read, overwrite with (x), and OR (x) into the pending word, respectively. */
#define local_softirq_pending()        (__this_cpu_read(local_softirq_pending_ref))
#define set_softirq_pending(x)        (__this_cpu_write(local_softirq_pending_ref, (x)))
#define or_softirq_pending(x)        (__this_cpu_or(local_softirq_pending_ref, (x)))

#endif /* local_softirq_pending */

/* Some architectures might implement lazy enabling/disabling of
 * interrupts. In some cases, such as stop_machine, we might want
 * to ensure that after a local_irq_disable(), interrupts have
 * really been disabled in hardware. Such architectures need to
 * implement the following hook.
 */
#ifndef hard_irq_disable
#define hard_irq_disable()        do { } while(0)
#endif

/* PLEASE avoid allocating new softirqs unless you _really_ need high
   frequency threaded job scheduling. For almost all purposes
   tasklets are more than enough. E.g. all serial device BHs et
   al. should be converted to tasklets, not to softirqs.
 */

/* Softirq vector numbers; also used as bit indexes (see SOFTIRQ_HOTPLUG_SAFE_MASK). */
enum
{
        HI_SOFTIRQ=0,
        TIMER_SOFTIRQ,
        NET_TX_SOFTIRQ,
        NET_RX_SOFTIRQ,
        BLOCK_SOFTIRQ,
        IRQ_POLL_SOFTIRQ,
        TASKLET_SOFTIRQ,
        SCHED_SOFTIRQ,
        HRTIMER_SOFTIRQ,
        RCU_SOFTIRQ,    /* Preferably, RCU should always be the last softirq */

        NR_SOFTIRQS        /* count of the entries above; sizes softirq_to_name[] */
};

/*
 * The following vectors can be safely ignored after ksoftirqd is parked:
 *
 * _ RCU:
 *         1) rcutree_migrate_callbacks() migrates the queue.
 *         2) rcutree_report_cpu_dead() reports the final quiescent states.
 *
 * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue
 *
 * _ (HR)TIMER_SOFTIRQ: (hr)timers_dead_cpu() migrates the queue
 */
#define SOFTIRQ_HOTPLUG_SAFE_MASK (BIT(TIMER_SOFTIRQ) | BIT(IRQ_POLL_SOFTIRQ) |\
                                   BIT(HRTIMER_SOFTIRQ) | BIT(RCU_SOFTIRQ))


/* map softirq index to softirq name. update 'softirq_to_name' in
 * kernel/softirq.c when adding a new softirq.
 */
extern const char * const softirq_to_name[NR_SOFTIRQS];

/* softirq mask and active fields moved to irq_cpustat_t in
 * asm/hardirq.h to get better cache usage.  KAO
 */

/*
 * Wraps a softirq handler callback. @action has the same function type that
 * open_softirq() accepts and is called with a struct softirq_action pointer.
 */
struct softirq_action
{
        void        (*action)(struct softirq_action *);
};

asmlinkage void do_softirq(void);
asmlinkage void __do_softirq(void);

#ifdef CONFIG_PREEMPT_RT
extern void do_softirq_post_smp_call_flush(unsigned int was_pending);
#else
/*
 * !CONFIG_PREEMPT_RT variant: the argument is ignored and do_softirq() is
 * called unconditionally.
 */
static inline void do_softirq_post_smp_call_flush(unsigned int unused)
{
        do_softirq();
}
#endif

extern void open_softirq(int nr, void (*action)(struct softirq_action *));
extern void softirq_init(void);
extern void __raise_softirq_irqoff(unsigned int nr);

extern void raise_softirq_irqoff(unsigned int nr);
extern void raise_softirq(unsigned int nr);

DECLARE_PER_CPU(struct task_struct *, ksoftirqd);

/* Return the value of the per-CPU 'ksoftirqd' pointer as read on this CPU. */
static inline struct task_struct *this_cpu_ksoftirqd(void)
{
        struct task_struct *tsk = this_cpu_read(ksoftirqd);

        return tsk;
}

/* Tasklets --- multithreaded analogue of BHs.

   This API is deprecated. Please consider using threaded IRQs instead:
   https://lore.kernel.org/lkml/20200716081538.2sivhkj4hcyrusem@linutronix.de

   Main difference from generic softirqs: a given tasklet
   runs on only one CPU at a time.

   Main difference from BHs: different tasklets
   may run simultaneously on different CPUs.

   Properties:
   * If tasklet_schedule() is called, then tasklet is guaranteed
     to be executed on some cpu at least once after this.
   * If the tasklet is already scheduled, but its execution is still not
     started, it will be executed only once.
   * If this tasklet is already running on another CPU (or schedule is called
     from tasklet itself), it is rescheduled for later.
   * A tasklet is strictly serialized with respect to itself, but not
     with respect to other tasklets. If a client needs inter-tasklet
     synchronization, it must provide it with spinlocks.
 */

/*
 * Tasklet descriptor.
 *
 * @next:         link to another tasklet_struct (NOTE(review): presumably the
 *                pending list - confirm in kernel/softirq.c)
 * @state:        bit field indexed by the TASKLET_STATE_* bit numbers
 * @count:        DECLARE_TASKLET() starts it at 0 and the _DISABLED variants
 *                at 1; tasklet_disable*() increments it and tasklet_enable()
 *                decrements it
 * @use_callback: set to true by DECLARE_TASKLET*(), which fill in @callback;
 *                left zero by the *_OLD initializers, which fill in @func
 * @func:         legacy handler taking an unsigned long argument
 * @callback:     handler taking the tasklet itself; shares storage with @func
 * @data:         NOTE(review): presumably the argument passed to @func - confirm
 */
struct tasklet_struct
{
        struct tasklet_struct *next;
        unsigned long state;
        atomic_t count;
        bool use_callback;
        union {
                void (*func)(unsigned long data);
                void (*callback)(struct tasklet_struct *t);
        };
        unsigned long data;
};

/*
 * Define a struct tasklet_struct called @name with count 0, handler
 * @_callback (stored in .callback) and use_callback set to true.
 */
#define DECLARE_TASKLET(name, _callback)                \
struct tasklet_struct name = {                                \
        .count = ATOMIC_INIT(0),                        \
        .callback = _callback,                                \
        .use_callback = true,                                \
}

/* Same as DECLARE_TASKLET(), except that count starts at 1. */
#define DECLARE_TASKLET_DISABLED(name, _callback)        \
struct tasklet_struct name = {                                \
        .count = ATOMIC_INIT(1),                        \
        .callback = _callback,                                \
        .use_callback = true,                                \
}

/*
 * Given @callback_tasklet, a pointer to the member @tasklet_fieldname of a
 * structure of type typeof(*var), yield a pointer to that enclosing
 * structure. @var is used only for its type.
 */
#define from_tasklet(var, callback_tasklet, tasklet_fieldname)        \
        container_of(callback_tasklet, typeof(*var), tasklet_fieldname)

/*
 * Legacy initializer: define a struct tasklet_struct called @name with
 * count 0 and handler @_func stored in .func; use_callback and data are
 * left zero-initialized.
 */
#define DECLARE_TASKLET_OLD(name, _func)                \
struct tasklet_struct name = {                                \
        .count = ATOMIC_INIT(0),                        \
        .func = _func,                                        \
}

/* Same as DECLARE_TASKLET_OLD(), except that count starts at 1. */
#define DECLARE_TASKLET_DISABLED_OLD(name, _func)        \
struct tasklet_struct name = {                                \
        .count = ATOMIC_INIT(1),                        \
        .func = _func,                                        \
}

/* Bit numbers within tasklet_struct.state (used with test_and_set_bit()). */
enum
{
        TASKLET_STATE_SCHED,        /* Tasklet is scheduled for execution */
        TASKLET_STATE_RUN        /* Tasklet is running (SMP only) */
};

#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
/*
 * Atomically set TASKLET_STATE_RUN in t->state.
 * Returns 1 if the bit was previously clear, 0 if it was already set.
 */
static inline int tasklet_trylock(struct tasklet_struct *t)
{
        if (test_and_set_bit(TASKLET_STATE_RUN, &t->state))
                return 0;

        return 1;
}

void tasklet_unlock(struct tasklet_struct *t);
void tasklet_unlock_wait(struct tasklet_struct *t);
void tasklet_unlock_spin_wait(struct tasklet_struct *t);

#else
/* Neither SMP nor PREEMPT_RT: trylock always succeeds. */
static inline int tasklet_trylock(struct tasklet_struct *t)
{
        return 1;
}

/* Neither SMP nor PREEMPT_RT: no-op. */
static inline void tasklet_unlock(struct tasklet_struct *t)
{
}

/* Neither SMP nor PREEMPT_RT: no-op. */
static inline void tasklet_unlock_wait(struct tasklet_struct *t)
{
}

/* Neither SMP nor PREEMPT_RT: no-op. */
static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t)
{
}
#endif

extern void __tasklet_schedule(struct tasklet_struct *t);

/*
 * Set TASKLET_STATE_SCHED on @t; only the caller that finds the bit clear
 * goes on to call __tasklet_schedule().
 */
static inline void tasklet_schedule(struct tasklet_struct *t)
{
        /* Bit already set: nothing more to do. */
        if (test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
                return;

        __tasklet_schedule(t);
}

extern void __tasklet_hi_schedule(struct tasklet_struct *t);

/*
 * Set TASKLET_STATE_SCHED on @t; only the caller that finds the bit clear
 * goes on to call __tasklet_hi_schedule().
 */
static inline void tasklet_hi_schedule(struct tasklet_struct *t)
{
        /* Bit already set: nothing more to do. */
        if (test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
                return;

        __tasklet_hi_schedule(t);
}

/*
 * Increment t->count and issue smp_mb__after_atomic(). Unlike
 * tasklet_disable(), no tasklet_unlock_wait() follows.
 */
static inline void tasklet_disable_nosync(struct tasklet_struct *t)
{
        atomic_inc(&t->count);
        smp_mb__after_atomic();
}

/*
 * Do not use in new code. Disabling tasklets from atomic contexts is
 * error prone and should be avoided.
 */
static inline void tasklet_disable_in_atomic(struct tasklet_struct *t)
{
        /* Bump the count first, then wait using the spin-wait variant. */
        tasklet_disable_nosync(t);
        tasklet_unlock_spin_wait(t);
        smp_mb();
}

/*
 * Increment t->count via tasklet_disable_nosync(), then call
 * tasklet_unlock_wait() and finish with a full memory barrier.
 */
static inline void tasklet_disable(struct tasklet_struct *t)
{
        tasklet_disable_nosync(t);
        tasklet_unlock_wait(t);
        smp_mb();
}

/*
 * Counterpart of tasklet_disable*(): issue smp_mb__before_atomic() and then
 * decrement t->count.
 */
static inline void tasklet_enable(struct tasklet_struct *t)
{
        smp_mb__before_atomic();
        atomic_dec(&t->count);
}

extern void tasklet_kill(struct tasklet_struct *t);
extern void tasklet_init(struct tasklet_struct *t,
                         void (*func)(unsigned long), unsigned long data);
extern void tasklet_setup(struct tasklet_struct *t,
                          void (*callback)(struct tasklet_struct *));

/*
 * Autoprobing for irqs:
 *
 * probe_irq_on() and probe_irq_off() provide robust primitives
 * for accurate IRQ probing during kernel initialization.  They are
 * reasonably simple to use, are not "fooled" by spurious interrupts,
 * and, unlike other attempts at IRQ probing, they do not get hung on
 * stuck interrupts (such as unused PS2 mouse interfaces on ASUS boards).
 *
 * For reasonably foolproof probing, use them as follows:
 *
 * 1. clear and/or mask the device's internal interrupt.
 * 2. sti();
 * 3. irqs = probe_irq_on();      // "take over" all unassigned idle IRQs
 * 4. enable the device and cause it to trigger an interrupt.
 * 5. wait for the device to interrupt, using non-intrusive polling or a delay.
 * 6. irq = probe_irq_off(irqs);  // get IRQ number, 0=none, negative=multiple
 * 7. service the device to clear its pending interrupt.
 * 8. loop again if paranoia is required.
 *
 * probe_irq_on() returns a mask of allocated irq's.
 *
 * probe_irq_off() takes the mask as a parameter,
 * and returns the irq number which occurred,
 * or zero if none occurred, or a negative irq number
 * if more than one irq occurred.
 */

#if !defined(CONFIG_GENERIC_IRQ_PROBE) 
/* !CONFIG_GENERIC_IRQ_PROBE stub: always returns 0. */
static inline unsigned long probe_irq_on(void)
{
        return 0UL;
}

/* !CONFIG_GENERIC_IRQ_PROBE stub: @val is ignored, always returns 0. */
static inline int probe_irq_off(unsigned long val)
{
        return 0;
}

/* !CONFIG_GENERIC_IRQ_PROBE stub: @val is ignored, always returns 0. */
static inline unsigned int probe_irq_mask(unsigned long val)
{
        return 0U;
}
#else
extern unsigned long probe_irq_on(void);        /* returns 0 on failure */
extern int probe_irq_off(unsigned long);        /* returns 0 or negative on failure */
extern unsigned int probe_irq_mask(unsigned long);        /* returns mask of ISA interrupts */
#endif

#ifdef CONFIG_PROC_FS
/* Initialize /proc/irq/ */
extern void init_irq_proc(void);
#else
/* No-op stub used when CONFIG_PROC_FS is not set. */
static inline void init_irq_proc(void)
{
}
#endif

#ifdef CONFIG_IRQ_TIMINGS
void irq_timings_enable(void);
void irq_timings_disable(void);
u64 irq_timings_next_event(u64 now);
#endif

struct seq_file;
int show_interrupts(struct seq_file *p, void *v);
int arch_show_interrupts(struct seq_file *p, int prec);

extern int early_irq_init(void);
extern int arch_probe_nr_irqs(void);
extern int arch_early_irq_init(void);

/*
 * We want to know which function is an entrypoint of a hardirq or a softirq.
 */
#ifndef __irq_entry
# define __irq_entry         __section(".irqentry.text")
#endif

#define __softirq_entry  __section(".softirqentry.text")

#endif

























































   16 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM x86_fpu

#if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FPU_H

#include <linux/tracepoint.h>

/*
 * x86_fpu - event class shared by all x86 FPU tracepoints
 *
 * Takes a struct fpu pointer and records:
 *   fpu        - the pointer itself
 *   load_fpu   - current state of the TIF_NEED_FPU_LOAD thread flag
 *   xfeatures  - xsave header xfeatures of fpu->fpstate (0 without OSXSAVE)
 *   xcomp_bv   - xsave header xcomp_bv of fpu->fpstate (0 without OSXSAVE)
 */
DECLARE_EVENT_CLASS(x86_fpu,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu),

        TP_STRUCT__entry(
                __field(struct fpu *, fpu)
                __field(bool, load_fpu)
                __field(u64, xfeatures)
                __field(u64, xcomp_bv)
                ),

        TP_fast_assign(
                __entry->fpu                = fpu;
                __entry->load_fpu        = test_thread_flag(TIF_NEED_FPU_LOAD);
                if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
                        __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures;
                        __entry->xcomp_bv  = fpu->fpstate->regs.xsave.header.xcomp_bv;
                } else {
                        /*
                         * TP_printk() prints both fields unconditionally, so
                         * give them a defined value instead of leaving
                         * whatever the event buffer happened to contain.
                         */
                        __entry->xfeatures = 0;
                        __entry->xcomp_bv  = 0;
                }
        ),
        TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx",
                        __entry->fpu,
                        __entry->load_fpu,
                        __entry->xfeatures,
                        __entry->xcomp_bv
        )
);

/*
 * Individual tracepoints of the x86_fpu event class. Every one of them has
 * the same prototype (a single struct fpu pointer) and uses the class's
 * record layout and print format; they differ only in name.
 */
DEFINE_EVENT(x86_fpu, x86_fpu_before_save,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_after_save,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_before_restore,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_after_restore,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_init_state,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_dropped,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_copy_src,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH asm/trace/
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE fpu
#endif /* _TRACE_FPU_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
























































































    9 



    9 


    2 






   11 




























































































































































    1 





    1 












































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
// SPDX-License-Identifier: GPL-2.0
/*
 * Fast batching percpu counters.
 */

#include <linux/percpu_counter.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/debugobjects.h>

#ifdef CONFIG_HOTPLUG_CPU
static LIST_HEAD(percpu_counters);
static DEFINE_SPINLOCK(percpu_counters_lock);
#endif

#ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER

static const struct debug_obj_descr percpu_counter_debug_descr;

/*
 * fixup_free callback for the percpu_counter debug object descriptor.
 *
 * @addr:  the struct percpu_counter being freed.
 * @state: debug object state recorded for @addr.
 *
 * If the counter is still marked active, destroy it and free its debug
 * object, returning true. Any other state is left untouched and false is
 * returned.
 */
static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
{
        struct percpu_counter *counter = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        percpu_counter_destroy(counter);
        debug_object_free(counter, &percpu_counter_debug_descr);
        return true;
}

/*
 * Debug object descriptor for percpu counters. Only a name and a
 * free-time fixup handler are supplied.
 */
static const struct debug_obj_descr percpu_counter_debug_descr = {
        .name                = "percpu_counter",
        .fixup_free        = percpu_counter_fixup_free,
};

/* Register @fbc with the debug object tracker and mark it active. */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{
        debug_object_init(fbc, &percpu_counter_debug_descr);
        debug_object_activate(fbc, &percpu_counter_debug_descr);
}

/* Mark @fbc inactive in the debug object tracker, then drop its tracking entry. */
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{
        debug_object_deactivate(fbc, &percpu_counter_debug_descr);
        debug_object_free(fbc, &percpu_counter_debug_descr);
}

#else        /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */
/* No-op stub used when CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER is not set. */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{ }
/* No-op stub used when CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER is not set. */
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{ }
#endif        /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */

/*
 * Set the counter to @amount.
 *
 * With the counter lock held and interrupts saved, the local counter of
 * every possible CPU is cleared and the shared count is overwritten with
 * @amount.
 */
void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
        unsigned long irq_flags;
        int cpu;

        raw_spin_lock_irqsave(&fbc->lock, irq_flags);

        /* Discard whatever each CPU had accumulated locally. */
        for_each_possible_cpu(cpu)
                *per_cpu_ptr(fbc->counters, cpu) = 0;

        fbc->count = amount;

        raw_spin_unlock_irqrestore(&fbc->lock, irq_flags);
}
EXPORT_SYMBOL(percpu_counter_set);

/*
 * local_irq_save() is needed to make the function irq safe:
 * - The slow path would be ok as protected by an irq-safe spinlock.
 * - this_cpu_add would be ok as it is irq-safe by definition.
 * But:
 * The decision slow path/fast path and the actual update must be atomic, too.
 * Otherwise a call in process context could check the current values and
 * decide that the fast path can be used. If now an interrupt occurs before
 * the this_cpu_add(), and the interrupt updates this_cpu(*fbc->counters),
 * then the this_cpu_add() that is executed after the interrupt has completed
 * can produce values larger than "batch" or even overflows.
 */
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
        s64 count;
        unsigned long flags;

        local_irq_save(flags);
        /* Value this CPU's local counter would hold after adding @amount. */
        count = __this_cpu_read(*fbc->counters) + amount;
        if (abs(count) >= batch) {
                /*
                 * Slow path: fold the whole would-be local value into the
                 * shared count under the lock, then subtract the previous
                 * local value (count - amount) so the local counter ends
                 * up at zero.
                 */
                raw_spin_lock(&fbc->lock);
                fbc->count += count;
                __this_cpu_sub(*fbc->counters, count - amount);
                raw_spin_unlock(&fbc->lock);
        } else {
                /* Fast path: keep the delta in this CPU's local counter. */
                this_cpu_add(*fbc->counters, amount);
        }
        local_irq_restore(flags);
}
EXPORT_SYMBOL(percpu_counter_add_batch);

/*
 * For percpu_counter with a big batch, the deviation of its count could
 * be big, and there is requirement to reduce the deviation, like when the
 * counter's batch could be runtime decreased to get a better accuracy,
 * which can be achieved by running this sync function on each CPU.
 */
void percpu_counter_sync(struct percpu_counter *fbc)
{
        unsigned long irq_flags;
        s64 local_delta;

        raw_spin_lock_irqsave(&fbc->lock, irq_flags);

        /*
         * Transfer the calling CPU's local value into the shared count,
         * leaving the local counter at zero.
         */
        local_delta = __this_cpu_read(*fbc->counters);
        fbc->count += local_delta;
        __this_cpu_sub(*fbc->counters, local_delta);

        raw_spin_unlock_irqrestore(&fbc->lock, irq_flags);
}
EXPORT_SYMBOL(percpu_counter_sync);

/*
 * Add up all the per-cpu counts, return the result.  This is a more accurate
 * but much slower version of percpu_counter_read_positive().
 *
 * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums
 * from CPUs that are in the process of being taken offline. Dying cpus have
 * been removed from the online mask, but may not have had the hotplug dead
 * notifier called to fold the percpu count back into the global counter sum.
 * By including dying CPUs in the iteration mask, we avoid this race condition
 * so __percpu_counter_sum() just does the right thing when CPUs are being taken
 * offline.
 */
s64 __percpu_counter_sum(struct percpu_counter *fbc)
{
        s64 ret;
        int cpu;
        unsigned long flags;

        raw_spin_lock_irqsave(&fbc->lock, flags);
        ret = fbc->count;
        for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
                s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
                ret += *pcount;
        }
        raw_spin_unlock_irqrestore(&fbc->lock, flags);
        return ret;
}
EXPORT_SYMBOL(__percpu_counter_sum);

/*
 * Initialize an array of @nr_counters percpu counters starting at @fbc.
 *
 * @fbc:         array of counters to initialize.
 * @amount:      initial value stored in every counter's shared count.
 * @gfp:         allocation flags passed to the percpu allocator.
 * @nr_counters: number of entries in @fbc.
 * @key:         lock class key applied to every counter's lock.
 *
 * A single percpu allocation backs all counters; counter i uses the slice at
 * offset i * counter_size. Returns 0 on success. On allocation failure,
 * fbc[0].counters is set to NULL and -ENOMEM is returned.
 */
int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
                               gfp_t gfp, u32 nr_counters,
                               struct lock_class_key *key)
{
        unsigned long flags __maybe_unused;
        size_t counter_size;
        s32 __percpu *counters;
        u32 i;

        /* Per-counter stride inside the shared percpu allocation. */
        counter_size = ALIGN(sizeof(*counters), __alignof__(*counters));
        counters = __alloc_percpu_gfp(nr_counters * counter_size,
                                      __alignof__(*counters), gfp);
        if (!counters) {
                fbc[0].counters = NULL;
                return -ENOMEM;
        }

        for (i = 0; i < nr_counters; i++) {
                raw_spin_lock_init(&fbc[i].lock);
                lockdep_set_class(&fbc[i].lock, key);
#ifdef CONFIG_HOTPLUG_CPU
                INIT_LIST_HEAD(&fbc[i].list);
#endif
                fbc[i].count = amount;
                /* Point counter i at its slice of the shared allocation. */
                fbc[i].counters = (void *)counters + (i * counter_size);

                debug_percpu_counter_activate(&fbc[i]);
        }

#ifdef CONFIG_HOTPLUG_CPU
        /* Put every counter on the global list walked by the CPU-dead callback. */
        spin_lock_irqsave(&percpu_counters_lock, flags);
        for (i = 0; i < nr_counters; i++)
                list_add(&fbc[i].list, &percpu_counters);
        spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif
        return 0;
}
EXPORT_SYMBOL(__percpu_counter_init_many);

/*
 * Tear down an array of @nr_counters percpu counters starting at @fbc.
 *
 * Warns once and returns if @fbc is NULL; returns silently if the first
 * counter has no percpu storage. Otherwise every counter is deactivated
 * for debug tracking, removed from the global hotplug list (when CPU
 * hotplug is configured), the percpu storage reached through fbc[0] is
 * freed, and every counter's storage pointer is cleared.
 */
void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters)
{
        unsigned long flags __maybe_unused;
        struct percpu_counter *cur;
        struct percpu_counter *end;

        if (WARN_ON_ONCE(!fbc))
                return;

        if (!fbc[0].counters)
                return;

        end = fbc + nr_counters;

        for (cur = fbc; cur < end; cur++)
                debug_percpu_counter_deactivate(cur);

#ifdef CONFIG_HOTPLUG_CPU
        spin_lock_irqsave(&percpu_counters_lock, flags);
        for (cur = fbc; cur < end; cur++)
                list_del(&cur->list);
        spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif

        /* A single free_percpu() call, made on fbc[0]'s pointer. */
        free_percpu(fbc[0].counters);

        for (cur = fbc; cur < end; cur++)
                cur->counters = NULL;
}
EXPORT_SYMBOL(percpu_counter_destroy_many);

/*
 * Global batch size, initially 32. compute_batch_value() in this file
 * rewrites it as max(32, 2 * number of online CPUs).
 */
int percpu_counter_batch __read_mostly = 32;
EXPORT_SYMBOL(percpu_counter_batch);

/*
 * Recompute percpu_counter_batch as twice the number of online CPUs, with
 * a floor of 32. @cpu is not used. Always returns 0.
 */
static int compute_batch_value(unsigned int cpu)
{
        int online = num_online_cpus();
        int scaled = online * 2;

        percpu_counter_batch = max(32, scaled);
        return 0;
}

/*
 * CPU hotplug "dead" callback for @cpu.
 *
 * When CPU hotplug is configured, recompute the batch size and then, for
 * every counter on the global list, move @cpu's local value into the shared
 * count and zero the local slot. Always returns 0.
 */
static int percpu_counter_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
        struct percpu_counter *counter;

        compute_batch_value(cpu);

        spin_lock_irq(&percpu_counters_lock);
        list_for_each_entry(counter, &percpu_counters, list) {
                s32 *dead_slot;

                raw_spin_lock(&counter->lock);
                dead_slot = per_cpu_ptr(counter->counters, cpu);
                counter->count += *dead_slot;
                *dead_slot = 0;
                raw_spin_unlock(&counter->lock);
        }
        spin_unlock_irq(&percpu_counters_lock);
#endif
        return 0;
}

/*
 * Compare the counter's value against @rhs.
 * Returns 1 if the counter is greater, -1 if it is less, and 0 if equal.
 */
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
        s64 approx = percpu_counter_read(fbc);
        s64 exact;

        /*
         * If the approximate value is further from @rhs than
         * batch * online CPUs, decide from it without summing.
         */
        if (abs(approx - rhs) > (batch * num_online_cpus()))
                return approx > rhs ? 1 : -1;

        /* Too close to call: fall back to the precise sum. */
        exact = percpu_counter_sum(fbc);
        if (exact > rhs)
                return 1;
        if (exact < rhs)
                return -1;
        return 0;
}
EXPORT_SYMBOL(__percpu_counter_compare);

/*
 * Compare counter, and add amount if total is: less than or equal to limit if
 * amount is positive, or greater than or equal to limit if amount is negative.
 * Return true if amount is added, or false if total would be beyond the limit.
 *
 * Negative limit is allowed, but unusual.
 * When negative amounts (subs) are given to percpu_counter_limited_add(),
 * the limit would most naturally be 0 - but other limits are also allowed.
 *
 * Overflow beyond S64_MAX is not allowed for: counter, limit and amount
 * are all assumed to be sane (far from S64_MIN and S64_MAX).
 */
bool __percpu_counter_limited_add(struct percpu_counter *fbc,
                                  s64 limit, s64 amount, s32 batch)
{
        s64 count;
        s64 unknown;
        unsigned long flags;
        bool good = false;

        /* Adding nothing always succeeds. */
        if (amount == 0)
                return true;

        local_irq_save(flags);
        /* Margin applied around fbc->count in the limit checks below. */
        unknown = batch * num_online_cpus();
        count = __this_cpu_read(*fbc->counters);

        /* Skip taking the lock when safe */
        if (abs(count + amount) <= batch &&
            ((amount > 0 && fbc->count + unknown <= limit) ||
             (amount < 0 && fbc->count - unknown >= limit))) {
                this_cpu_add(*fbc->counters, amount);
                local_irq_restore(flags);
                return true;
        }

        raw_spin_lock(&fbc->lock);
        /* From here on, count is the candidate total including @amount. */
        count = fbc->count + amount;

        /* Skip percpu_counter_sum() when safe */
        if (amount > 0) {
                /* Beyond the limit even at the low end of the margin: reject. */
                if (count - unknown > limit)
                        goto out;
                /* Within the limit even at the high end of the margin: accept. */
                if (count + unknown <= limit)
                        good = true;
        } else {
                /* Mirror image of the checks above for a negative @amount. */
                if (count + unknown < limit)
                        goto out;
                if (count - unknown >= limit)
                        good = true;
        }

        if (!good) {
                s32 *pcount;
                int cpu;

                /* Undecided: add every online or dying CPU's local value. */
                for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
                        pcount = per_cpu_ptr(fbc->counters, cpu);
                        count += *pcount;
                }
                if (amount > 0) {
                        if (count > limit)
                                goto out;
                } else {
                        if (count < limit)
                                goto out;
                }
                good = true;
        }

        /*
         * Accepted: fold this CPU's local value together with @amount into
         * the shared count and leave the local counter at zero.
         */
        count = __this_cpu_read(*fbc->counters);
        fbc->count += count + amount;
        __this_cpu_sub(*fbc->counters, count);
out:
        raw_spin_unlock(&fbc->lock);
        local_irq_restore(flags);
        return good;
}

/*
 * Register the CPU hotplug callbacks used by percpu counters: an online
 * callback that recomputes the batch size, and a dead callback (installed
 * without being invoked) that folds a dead CPU's local values.
 * A registration failure only triggers a warning; 0 is always returned.
 */
static int __init percpu_counter_startup(void)
{
        int ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
                                    "lib/percpu_cnt:online",
                                    compute_batch_value, NULL);

        WARN_ON(ret < 0);

        ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD,
                                        "lib/percpu_cnt:dead",
                                        NULL, percpu_counter_cpu_dead);
        WARN_ON(ret < 0);

        return 0;
}
module_init(percpu_counter_startup);






































































    1 





































































































































































    1 






    1 


















    1 


    1 






















    1 


    1 



    1 




    1 













    1 








    1 




























    1 











    1 










    1 

    1 





























    1 

    1 





    1 

    1 






    1 




















    1 

















    1 


























































































































    1 









    1 



    1 
    1 


    1 






    1 







































    1 





















































    1 










    1 


    1 






    1 





    1 



























    1 




    1 

    1 



























































































    1 
    1 






    1 
    1 
    1 
























    1 






    1 







    1 
    1 




































































    1 
















    1 
    1 






    1 



























    1 










    1 




    1 












    1 
































































































































































































































































































































    1 





    1 









    1 
    1 












    1 
    1 

    1 


    1 







    1 







    1 
    1 




    1 




































































































































    1 
























    1 

























    1 




    1 



    1 








    1 



    1 









    1 











    1 

















































    1 

    1 


    1 


    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 *
 * This file is part of the SCTP kernel implementation
 *
 * This module provides the abstraction for an SCTP association.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson          <karl@athena.chicago.il.us>
 *    Jon Grimm             <jgrimm@us.ibm.com>
 *    Xingang Guo           <xingang.guo@intel.com>
 *    Hui Huang             <hui.huang@nokia.com>
 *    Sridhar Samudrala            <sri@us.ibm.com>
 *    Daisy Chang            <daisyc@us.ibm.com>
 *    Ryan Layer            <rmlayer@us.ibm.com>
 *    Kevin Gao             <kevin.gao@intel.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>

#include <linux/slab.h>
#include <linux/in.h>
#include <net/ipv6.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

/* Forward declarations for internal functions. */
static void sctp_select_active_and_retran_path(struct sctp_association *asoc);
static void sctp_assoc_bh_rcv(struct work_struct *work);
static void sctp_assoc_free_asconf_acks(struct sctp_association *asoc);
static void sctp_assoc_free_asconf_queue(struct sctp_association *asoc);

/* 1st Level Abstractions. */

/* Initialize a new association from provided memory.
 *
 * @asoc:  association storage to fill in.
 * @ep:    endpoint the association belongs to; a reference is taken on it.
 * @sk:    socket the association belongs to; a reference is taken on it.
 * @scope: not referenced in this function body.
 * @gfp:   allocation flags for the stream and shared-key setup.
 *
 * Most defaults are copied from the per-socket SCTP area of @sk.
 * Returns @asoc on success.  If stream initialization or the shared-key copy
 * fails, the stream state is freed, both references are dropped and NULL is
 * returned.
 */
static struct sctp_association *sctp_association_init(
                                        struct sctp_association *asoc,
                                        const struct sctp_endpoint *ep,
                                        const struct sock *sk,
                                        enum sctp_scope scope, gfp_t gfp)
{
        struct sctp_sock *sp;
        struct sctp_paramhdr *p;
        int i;

        /* Retrieve the SCTP per socket area.  */
        sp = sctp_sk((struct sock *)sk);

        /* Discarding const is appropriate here.  */
        asoc->ep = (struct sctp_endpoint *)ep;
        asoc->base.sk = (struct sock *)sk;
        asoc->base.net = sock_net(sk);

        /* These references are dropped again on the error path below. */
        sctp_endpoint_hold(asoc->ep);
        sock_hold(asoc->base.sk);

        /* Initialize the common base substructure.  */
        asoc->base.type = SCTP_EP_TYPE_ASSOCIATION;

        /* Initialize the object handling fields.  */
        refcount_set(&asoc->base.refcnt, 1);

        /* Initialize the bind addr area.  */
        sctp_bind_addr_init(&asoc->base.bind_addr, ep->base.bind_addr.port);

        asoc->state = SCTP_STATE_CLOSED;
        asoc->cookie_life = ms_to_ktime(sp->assocparams.sasoc_cookie_life);
        asoc->user_frag = sp->user_frag;

        /* Set the association max_retrans and RTO values from the
         * socket values.
         */
        asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
        asoc->pf_retrans  = sp->pf_retrans;
        asoc->ps_retrans  = sp->ps_retrans;
        asoc->pf_expose   = sp->pf_expose;

        asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
        asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
        asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);

        /* Initialize the association's heartbeat interval based on the
         * sock configured value.
         */
        asoc->hbinterval = msecs_to_jiffies(sp->hbinterval);
        asoc->probe_interval = msecs_to_jiffies(sp->probe_interval);

        asoc->encap_port = sp->encap_port;

        /* Initialize path max retrans value. */
        asoc->pathmaxrxt = sp->pathmaxrxt;

        asoc->flowlabel = sp->flowlabel;
        asoc->dscp = sp->dscp;

        /* Set association default SACK delay */
        asoc->sackdelay = msecs_to_jiffies(sp->sackdelay);
        asoc->sackfreq = sp->sackfreq;

        /* Set the association default flags controlling
         * Heartbeat, SACK delay, and Path MTU Discovery.
         */
        asoc->param_flags = sp->param_flags;

        /* Initialize the maximum number of new data packets that can be sent
         * in a burst.
         */
        asoc->max_burst = sp->max_burst;

        asoc->subscribe = sp->subscribe;

        /* initialize association timers */
        asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial;
        asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial;
        asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = asoc->rto_initial;

        /* sctpimpguide Section 2.12.2
         * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the
         * recommended value of 5 times 'RTO.Max'.
         */
        asoc->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
                = 5 * asoc->rto_max;

        asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay;
        /* sp->autoclose is multiplied by HZ to obtain the timeout in jiffies. */
        asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ;

        /* Initializes the timers */
        for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i)
                timer_setup(&asoc->timers[i], sctp_timer_events[i], 0);

        /* Pull default initialization values from the sock options.
         * Note: This assumes that the values have already been
         * validated in the sock.
         */
        asoc->c.sinit_max_instreams = sp->initmsg.sinit_max_instreams;
        asoc->c.sinit_num_ostreams  = sp->initmsg.sinit_num_ostreams;
        asoc->max_init_attempts        = sp->initmsg.sinit_max_attempts;

        asoc->max_init_timeo =
                 msecs_to_jiffies(sp->initmsg.sinit_max_init_timeo);

        /* Set the local window size for receive.
         * This is also the rcvbuf space per association.
         * RFC 6 - A SCTP receiver MUST be able to receive a minimum of
         * 1500 bytes in one SCTP packet.
         */
        if ((sk->sk_rcvbuf/2) < SCTP_DEFAULT_MINWINDOW)
                asoc->rwnd = SCTP_DEFAULT_MINWINDOW;
        else
                asoc->rwnd = sk->sk_rcvbuf/2;

        asoc->a_rwnd = asoc->rwnd;

        /* Use my own max window until I learn something better.  */
        asoc->peer.rwnd = SCTP_DEFAULT_MAXWINDOW;

        /* Initialize the receive memory counter */
        atomic_set(&asoc->rmem_alloc, 0);

        init_waitqueue_head(&asoc->wait);

        asoc->c.my_vtag = sctp_generate_tag(ep);
        asoc->c.my_port = ep->base.bind_addr.port;

        asoc->c.initial_tsn = sctp_generate_tsn(ep);

        asoc->next_tsn = asoc->c.initial_tsn;

        /* All of the following TSN markers start one below the first TSN. */
        asoc->ctsn_ack_point = asoc->next_tsn - 1;
        asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
        asoc->highest_sacked = asoc->ctsn_ack_point;
        asoc->last_cwr_tsn = asoc->ctsn_ack_point;

        /* ADDIP Section 4.1 Asconf Chunk Procedures
         *
         * When an endpoint has an ASCONF signaled change to be sent to the
         * remote endpoint it should do the following:
         * ...
         * A2) a serial number should be assigned to the chunk. The serial
         * number SHOULD be a monotonically increasing number. The serial
         * numbers SHOULD be initialized at the start of the
         * association to the same value as the initial TSN.
         */
        asoc->addip_serial = asoc->c.initial_tsn;
        asoc->strreset_outseq = asoc->c.initial_tsn;

        INIT_LIST_HEAD(&asoc->addip_chunk_list);
        INIT_LIST_HEAD(&asoc->asconf_ack_list);

        /* Make an empty list of remote transport addresses.  */
        INIT_LIST_HEAD(&asoc->peer.transport_addr_list);

        /* RFC 2960 5.1 Normal Establishment of an Association
         *
         * After the reception of the first data chunk in an
         * association the endpoint must immediately respond with a
         * sack to acknowledge the data chunk.  Subsequent
         * acknowledgements should be done as described in Section
         * 6.2.
         *
         * [We implement this by telling a new association that it
         * already received one packet.]
         */
        asoc->peer.sack_needed = 1;
        asoc->peer.sack_generation = 1;

        /* Create an input queue.  */
        sctp_inq_init(&asoc->base.inqueue);
        sctp_inq_set_th_handler(&asoc->base.inqueue, sctp_assoc_bh_rcv);

        /* Create an output queue.  */
        sctp_outq_init(asoc, &asoc->outqueue);

        sctp_ulpq_init(&asoc->ulpq, asoc);

        /* First point of failure; everything before this cannot fail. */
        if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, 0, gfp))
                goto stream_free;

        /* Initialize default path MTU. */
        asoc->pathmtu = sp->pathmtu;
        sctp_assoc_update_frag_point(asoc);

        /* Assume that peer would support both address types unless we are
         * told otherwise.
         */
        asoc->peer.ipv4_address = 1;
        if (asoc->base.sk->sk_family == PF_INET6)
                asoc->peer.ipv6_address = 1;
        INIT_LIST_HEAD(&asoc->asocs);

        asoc->default_stream = sp->default_stream;
        asoc->default_ppid = sp->default_ppid;
        asoc->default_flags = sp->default_flags;
        asoc->default_context = sp->default_context;
        asoc->default_timetolive = sp->default_timetolive;
        asoc->default_rcv_context = sp->default_rcv_context;

        /* AUTH related initializations */
        INIT_LIST_HEAD(&asoc->endpoint_shared_keys);
        if (sctp_auth_asoc_copy_shkeys(ep, asoc, gfp))
                goto stream_free;

        asoc->active_key_id = ep->active_key_id;
        asoc->strreset_enable = ep->strreset_enable;

        /* Save the hmacs and chunks list into this association */
        if (ep->auth_hmacs_list)
                memcpy(asoc->c.auth_hmacs, ep->auth_hmacs_list,
                        ntohs(ep->auth_hmacs_list->param_hdr.length));
        if (ep->auth_chunk_list)
                memcpy(asoc->c.auth_chunks, ep->auth_chunk_list,
                        ntohs(ep->auth_chunk_list->param_hdr.length));

        /* Get the AUTH random number for this association */
        p = (struct sctp_paramhdr *)asoc->c.auth_random;
        p->type = SCTP_PARAM_RANDOM;
        p->length = htons(sizeof(*p) + SCTP_AUTH_RANDOM_LENGTH);
        /* The random bytes are written directly after the parameter header. */
        get_random_bytes(p+1, SCTP_AUTH_RANDOM_LENGTH);

        return asoc;

stream_free:
        /* Error unwind: free stream state and drop the references taken above. */
        sctp_stream_free(&asoc->stream);
        sock_put(asoc->base.sk);
        sctp_endpoint_put(asoc->ep);
        return NULL;
}

/* Allocate a zeroed association and initialize it for @ep and @sk.
 *
 * Returns the new association, or NULL if either the allocation or the
 * initialization fails (in the latter case the memory is freed again).
 */
struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
                                              const struct sock *sk,
                                              enum sctp_scope scope, gfp_t gfp)
{
        struct sctp_association *asoc = kzalloc(sizeof(*asoc), gfp);

        if (!asoc)
                return NULL;

        if (!sctp_association_init(asoc, ep, sk, scope, gfp)) {
                kfree(asoc);
                return NULL;
        }

        SCTP_DBG_OBJCNT_INC(assoc);

        pr_debug("Created asoc %p\n", asoc);

        return asoc;
}

/* Free this association if possible.  There may still be users, so
 * the actual deallocation may be delayed.
 *
 * The association is unlinked from the endpoint list (when it is on one),
 * marked dead, and the resources it owns are released: queues, TSN map,
 * streams, bound addresses, timers, cached peer data, transports, ASCONF
 * state and AUTH keys.  The function ends by dropping one reference with
 * sctp_association_put().
 */
void sctp_association_free(struct sctp_association *asoc)
{
        struct sock *sk = asoc->base.sk;
        struct sctp_transport *transport;
        struct list_head *pos, *temp;
        int i;

        /* Only real associations count against the endpoint, so
         * don't bother if this is a temporary association.
         */
        if (!list_empty(&asoc->asocs)) {
                list_del(&asoc->asocs);

                /* Decrement the backlog value for a TCP-style listening
                 * socket.
                 */
                if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
                        sk_acceptq_removed(sk);
        }

        /* Mark as dead, so other users can know this structure is
         * going away.
         */
        asoc->base.dead = true;

        /* Dispose of any data lying around in the outqueue. */
        sctp_outq_free(&asoc->outqueue);

        /* Dispose of any pending messages for the upper layer. */
        sctp_ulpq_free(&asoc->ulpq);

        /* Dispose of any pending chunks on the inqueue. */
        sctp_inq_free(&asoc->base.inqueue);

        /* Release the peer TSN map. */
        sctp_tsnmap_free(&asoc->peer.tsn_map);

        /* Free stream information. */
        sctp_stream_free(&asoc->stream);

        /* Free the stream reset chunk, if one is attached. */
        if (asoc->strreset_chunk)
                sctp_chunk_free(asoc->strreset_chunk);

        /* Clean up the bound address list. */
        sctp_bind_addr_free(&asoc->base.bind_addr);

        /* Do we need to go through all of our timers and
         * delete them?   To be safe we will try to delete all, but we
         * should be able to go through and make a guess based
         * on our state.
         *
         * Every timer that del_timer() reports as deleted gives back one
         * association reference (presumably the one taken when the timer
         * was armed -- confirm against the code that arms the timers).
         */
        for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) {
                if (del_timer(&asoc->timers[i]))
                        sctp_association_put(asoc);
        }

        /* Free the peer's cached cookie and the cached peer random,
         * chunks and hmacs data.
         */
        kfree(asoc->peer.cookie);
        kfree(asoc->peer.peer_random);
        kfree(asoc->peer.peer_chunks);
        kfree(asoc->peer.peer_hmacs);

        /* Release the transport structures: unlink each one from the
         * association list, remove it from the transport hashtable and
         * free it.
         */
        list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
                transport = list_entry(pos, struct sctp_transport, transports);
                list_del_rcu(pos);
                sctp_unhash_transport(transport);
                sctp_transport_free(transport);
        }

        asoc->peer.transport_count = 0;

        /* Tear down the ASCONF queue state. */
        sctp_asconf_queue_teardown(asoc);

        /* Free pending address space being deleted */
        kfree(asoc->asconf_addr_del_pending);

        /* AUTH - Free the endpoint shared keys */
        sctp_auth_destroy_keys(&asoc->endpoint_shared_keys);

        /* AUTH - Free the association shared key */
        sctp_auth_key_put(asoc->asoc_shared_key);

        /* Drop a reference; sctp_association_put() destroys the
         * association once the reference count reaches zero.
         */
        sctp_association_put(asoc);
}

/* Final teardown of an association.
 *
 * Refuses (with a warning) to destroy an association that has not been
 * marked dead.  Otherwise it drops the endpoint and socket references,
 * removes the association id from the global idr when one was assigned,
 * warns if receive memory is still accounted, and hands the structure to
 * kfree_rcu().
 */
static void sctp_association_destroy(struct sctp_association *asoc)
{
        if (WARN(!asoc->base.dead,
                 "Attempt to destroy undead association %p!\n", asoc))
                return;

        sctp_endpoint_put(asoc->ep);
        sock_put(asoc->base.sk);

        /* An id of zero means no idr entry was ever allocated. */
        if (asoc->assoc_id) {
                spin_lock_bh(&sctp_assocs_id_lock);
                idr_remove(&sctp_assocs_id, asoc->assoc_id);
                spin_unlock_bh(&sctp_assocs_id_lock);
        }

        WARN_ON(atomic_read(&asoc->rmem_alloc));

        kfree_rcu(asoc, rcu);
        SCTP_DBG_OBJCNT_DEC(assoc);
}

/* Make @transport the primary destination of @asoc.
 *
 * Records the new primary path and address, emits an SCTP_ADDR_MADE_PRIM
 * notification, promotes the transport to active path when its state is
 * SCTP_ACTIVE or SCTP_UNKNOWN, and updates the SFR-CACC changeover
 * bookkeeping when data is queued or outstanding.
 */
void sctp_assoc_set_primary(struct sctp_association *asoc,
                            struct sctp_transport *transport)
{
        struct sctp_transport *prev_primary = asoc->peer.primary_path;
        /* It only counts as a changeover when an existing primary path
         * is being replaced by a different transport.
         */
        int changeover = prev_primary != NULL && prev_primary != transport;

        asoc->peer.primary_path = transport;
        sctp_ulpevent_notify_peer_addr_change(transport,
                                              SCTP_ADDR_MADE_PRIM, 0);

        /* Keep a copy of the address as the default msg_name for events. */
        memcpy(&asoc->peer.primary_addr, &transport->ipaddr,
               sizeof(union sctp_addr));

        /* A usable new primary is assumed to be the path the user wants
         * traffic on, so make it the active path as well.
         */
        switch (transport->state) {
        case SCTP_ACTIVE:
        case SCTP_UNKNOWN:
                asoc->peer.active_path = transport;
                break;
        default:
                break;
        }

        /* SFR-CACC algorithm:
         * Upon the receipt of a request to change the primary
         * destination address, on the data structure for the new
         * primary destination, the sender MUST do the following:
         *
         * 1) If CHANGEOVER_ACTIVE is set, then there was a switch
         * to this destination address earlier. The sender MUST set
         * CYCLING_CHANGEOVER to indicate that this switch is a
         * double switch to the same destination address.
         *
         * None of this matters unless data is queued or outstanding
         * on the association.
         */
        if (!(asoc->outqueue.outstanding_bytes || asoc->outqueue.out_qlen))
                return;

        if (transport->cacc.changeover_active)
                transport->cacc.cycling_changeover = changeover;

        /* 2) The sender MUST set CHANGEOVER_ACTIVE to indicate that
         * a changeover has occurred.
         */
        transport->cacc.changeover_active = changeover;

        /* 3) The sender MUST store the next TSN to be sent in
         * next_tsn_at_change.
         */
        transport->cacc.next_tsn_at_change = asoc->next_tsn;
}

/* Remove a transport from an association.
 *
 * Unlinks @peer from the association's transport list and from the
 * transport hashtable, redirects every association pointer that still
 * refers to it, moves its transmitted chunks to the active path, sends an
 * SCTP_ADDR_REMOVED notification and finally frees the transport.
 */
void sctp_assoc_rm_peer(struct sctp_association *asoc,
                        struct sctp_transport *peer)
{
        struct sctp_transport *transport;
        struct list_head *pos;
        struct sctp_chunk *ch;

        pr_debug("%s: association:%p addr:%pISpc\n",
                 __func__, asoc, &peer->ipaddr.sa);

        /* If we are to remove the current retran_path, update it
         * to the next peer before removing this peer from the list.
         */
        if (asoc->peer.retran_path == peer)
                sctp_assoc_update_retran_path(asoc);

        /* Remove this peer from the list. */
        list_del_rcu(&peer->transports);
        /* Remove this peer from the transport hashtable */
        sctp_unhash_transport(peer);

        /* Get the first transport of asoc.
         * NOTE(review): if the list is empty at this point, list_entry()
         * is applied to the list head itself and does not yield a real
         * transport -- presumably callers never remove the last transport
         * this way; confirm.
         */
        pos = asoc->peer.transport_addr_list.next;
        transport = list_entry(pos, struct sctp_transport, transports);

        /* Update any entries that match the peer to be deleted. */
        if (asoc->peer.primary_path == peer)
                sctp_assoc_set_primary(asoc, transport);
        if (asoc->peer.active_path == peer)
                asoc->peer.active_path = transport;
        if (asoc->peer.retran_path == peer)
                asoc->peer.retran_path = transport;
        if (asoc->peer.last_data_from == peer)
                asoc->peer.last_data_from = transport;

        /* A pending stream reset chunk bound to the removed peer is
         * re-bound to the replacement transport and that transport's
         * reconf timer is reset.
         */
        if (asoc->strreset_chunk &&
            asoc->strreset_chunk->transport == peer) {
                asoc->strreset_chunk->transport = transport;
                sctp_transport_reset_reconf_timer(transport);
        }

        /* If we remove the transport an INIT was last sent to, set it to
         * NULL. Combined with the update of the retran path above, this
         * will cause the next INIT to be sent to the next available
         * transport, maintaining the cycle.
         */
        if (asoc->init_last_sent_to == peer)
                asoc->init_last_sent_to = NULL;

        /* If we remove the transport an SHUTDOWN was last sent to, set it
         * to NULL. Combined with the update of the retran path above, this
         * will cause the next SHUTDOWN to be sent to the next available
         * transport, maintaining the cycle.
         */
        if (asoc->shutdown_last_sent_to == peer)
                asoc->shutdown_last_sent_to = NULL;

        /* If we remove the transport an ASCONF was last sent to, set it to
         * NULL.
         */
        if (asoc->addip_last_asconf &&
            asoc->addip_last_asconf->transport == peer)
                asoc->addip_last_asconf->transport = NULL;

        /* If we have something on the transmitted list, we have to
         * save it off.  The best place is the active path.
         */
        if (!list_empty(&peer->transmitted)) {
                struct sctp_transport *active = asoc->peer.active_path;

                /* Reset the transport of each chunk on this list */
                list_for_each_entry(ch, &peer->transmitted,
                                        transmitted_list) {
                        ch->transport = NULL;
                        ch->rtt_in_progress = 0;
                }

                /* Append the whole list to the active path's transmitted
                 * list, leaving the peer's list empty.
                 */
                list_splice_tail_init(&peer->transmitted,
                                        &active->transmitted);

                /* Start a T3 timer here in case it wasn't running so
                 * that these migrated packets have a chance to get
                 * retransmitted.  A transport reference is taken when
                 * mod_timer() reports that the timer was not already
                 * pending (presumably the reference owned by a running
                 * timer -- confirm against the timer handler).
                 */
                if (!timer_pending(&active->T3_rtx_timer))
                        if (!mod_timer(&active->T3_rtx_timer,
                                        jiffies + active->rto))
                                sctp_transport_hold(active);
        }

        /* Chunks still waiting on the out queue must not keep pointing at
         * the transport being removed.
         */
        list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list)
                if (ch->transport == peer)
                        ch->transport = NULL;

        asoc->peer.transport_count--;

        /* Tell the user, then release the transport itself. */
        sctp_ulpevent_notify_peer_addr_change(peer, SCTP_ADDR_REMOVED, 0);
        sctp_transport_free(peer);
}

/* Add a transport address to an association.
 *
 * @asoc:       association receiving the new peer address
 * @addr:       peer address to add
 * @gfp:        allocation flags passed to sctp_transport_new()
 * @peer_state: initial state stored in the new transport
 *
 * If @addr is already known, the existing transport is returned (after
 * promoting it from SCTP_UNKNOWN to SCTP_ACTIVE if needed) and nothing
 * else changes.  Otherwise a new transport is created, initialized from
 * the association's settings, hashed and linked into the association.
 *
 * Returns the transport, or NULL if the transport could not be allocated
 * or could not be added to the transport hashtable.
 */
struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
                                           const union sctp_addr *addr,
                                           const gfp_t gfp,
                                           const int peer_state)
{
        struct sctp_transport *peer;
        struct sctp_sock *sp;
        unsigned short port;

        sp = sctp_sk(asoc->base.sk);

        /* AF_INET and AF_INET6 share common port field. */
        port = ntohs(addr->v4.sin_port);

        pr_debug("%s: association:%p addr:%pISpc state:%d\n", __func__,
                 asoc, &addr->sa, peer_state);

        /* Set the port if it has not been set yet.  */
        if (0 == asoc->peer.port)
                asoc->peer.port = port;

        /* Check to see if this is a duplicate. */
        peer = sctp_assoc_lookup_paddr(asoc, addr);
        if (peer) {
                /* An UNKNOWN state is only set on transports added by
                 * user in sctp_connectx() call.  Such transports should be
                 * considered CONFIRMED per RFC 4960, Section 5.4.
                 */
                if (peer->state == SCTP_UNKNOWN) {
                        peer->state = SCTP_ACTIVE;
                }
                return peer;
        }

        peer = sctp_transport_new(asoc->base.net, addr, gfp);
        if (!peer)
                return NULL;

        sctp_transport_set_owner(peer, asoc);

        /* Initialize the peer's heartbeat interval based on the
         * association configured value.
         */
        peer->hbinterval = asoc->hbinterval;
        peer->probe_interval = asoc->probe_interval;

        /* Inherit the association's encapsulation port. */
        peer->encap_port = asoc->encap_port;

        /* Set the path max_retrans.  */
        peer->pathmaxrxt = asoc->pathmaxrxt;

        /* And the partial failure retrans threshold */
        peer->pf_retrans = asoc->pf_retrans;
        /* And the primary path switchover retrans threshold */
        peer->ps_retrans = asoc->ps_retrans;

        /* Initialize the peer's SACK delay timeout based on the
         * association configured value.
         */
        peer->sackdelay = asoc->sackdelay;
        peer->sackfreq = asoc->sackfreq;

        /* For IPv6 peers, a non-zero flowinfo in the address supplies the
         * flow label (flagged with SCTP_FLOWLABEL_SET_MASK); otherwise the
         * association's flow label is inherited.
         */
        if (addr->sa.sa_family == AF_INET6) {
                __be32 info = addr->v6.sin6_flowinfo;

                if (info) {
                        peer->flowlabel = ntohl(info & IPV6_FLOWLABEL_MASK);
                        peer->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
                } else {
                        peer->flowlabel = asoc->flowlabel;
                }
        }
        peer->dscp = asoc->dscp;

        /* Enable/disable heartbeat, SACK delay, and path MTU discovery
         * based on association setting.
         */
        peer->param_flags = asoc->param_flags;

        /* Initialize the pmtu of the transport. */
        sctp_transport_route(peer, NULL, sp);

        /* If this is the first transport addr on this association,
         * initialize the association PMTU to the peer's PMTU.
         * If not and the current association PMTU is higher than the new
         * peer's PMTU, reset the association PMTU to the new peer's PMTU.
         */
        sctp_assoc_set_pmtu(asoc, asoc->pathmtu ?
                                  min_t(int, peer->pathmtu, asoc->pathmtu) :
                                  peer->pathmtu);

        peer->pmtu_pending = 0;

        /* The asoc->peer.port might not be meaningful yet, but
         * initialize the packet structure anyway.
         */
        sctp_packet_init(&peer->packet, peer, asoc->base.bind_addr.port,
                         asoc->peer.port);

        /* 7.2.1 Slow-Start
         *
         * o The initial cwnd before DATA transmission or after a sufficiently
         *   long idle period MUST be set to
         *      min(4*MTU, max(2*MTU, 4380 bytes))
         *
         * o The initial value of ssthresh MAY be arbitrarily high
         *   (for example, implementations MAY use the size of the
         *   receiver advertised window).
         */
        peer->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380));

        /* At this point, we may not have the receiver's advertised window,
         * so initialize ssthresh to the default value and it will be set
         * later when we process the INIT.
         */
        peer->ssthresh = SCTP_DEFAULT_MAXWINDOW;

        peer->partial_bytes_acked = 0;
        peer->flight_size = 0;
        peer->burst_limited = 0;

        /* Set the transport's RTO.initial value */
        peer->rto = asoc->rto_initial;
        sctp_max_rto(asoc, peer);

        /* Set the peer's active state. */
        peer->state = peer_state;

        /* Add this peer into the transport hashtable; on failure the
         * transport is freed and never becomes visible on the association.
         */
        if (sctp_hash_transport(peer)) {
                sctp_transport_free(peer);
                return NULL;
        }

        sctp_transport_pl_reset(peer);

        /* Attach the remote transport to our asoc.  */
        list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list);
        asoc->peer.transport_count++;

        sctp_ulpevent_notify_peer_addr_change(peer, SCTP_ADDR_ADDED, 0);

        /* If we do not yet have a primary path, set one.  */
        if (!asoc->peer.primary_path) {
                sctp_assoc_set_primary(asoc, peer);
                asoc->peer.retran_path = peer;
        }

        /* While the retransmission path still coincides with the active
         * path, prefer the new peer for retransmissions unless it is
         * still unconfirmed.
         */
        if (asoc->peer.active_path == asoc->peer.retran_path &&
            peer->state != SCTP_UNCONFIRMED) {
                asoc->peer.retran_path = peer;
        }

        return peer;
}

/* Delete the transport whose address exactly matches @addr, if any.
 * At most one transport is removed; the search stops at the first match.
 */
void sctp_assoc_del_peer(struct sctp_association *asoc,
                         const union sctp_addr *addr)
{
        struct list_head *cur, *next;

        list_for_each_safe(cur, next, &asoc->peer.transport_addr_list) {
                struct sctp_transport *candidate =
                        list_entry(cur, struct sctp_transport, transports);

                if (!sctp_cmp_addr_exact(addr, &candidate->ipaddr))
                        continue;

                /* Found it: do the bookkeeping for the removal and free
                 * the transport.
                 */
                sctp_assoc_rm_peer(asoc, candidate);
                return;
        }
}

/* Lookup a transport by address. */
struct sctp_transport *sctp_assoc_lookup_paddr(
                                        const struct sctp_association *asoc,
                                        const union sctp_addr *address)
{
        struct sctp_transport *t;

        /* Cycle through all transports searching for a peer address. */

        list_for_each_entry(t, &asoc->peer.transport_addr_list,
                        transports) {
                if (sctp_cmp_addr_exact(address, &t->ipaddr))
                        return t;
        }

        return NULL;
}

/* Remove all transports except the given one. */
void sctp_assoc_del_nonprimary_peers(struct sctp_association *asoc,
                                     struct sctp_transport *primary)
{
        struct sctp_transport *trans, *next;

        list_for_each_entry_safe(trans, next, &asoc->peer.transport_addr_list,
                                 transports) {
                /* The transport to keep is skipped; every other one goes. */
                if (trans == primary)
                        continue;

                sctp_assoc_rm_peer(asoc, trans);
        }
}

/* Engage in transport control operations.
 *
 * Applies @command to @transport (up, down or potentially failed), sends
 * an SCTP_PEER_ADDR_CHANGE notification carrying the resulting address
 * state and @error when one is due, and then re-selects the active and
 * retransmission paths.  Any other command is ignored.
 */
void sctp_assoc_control_transport(struct sctp_association *asoc,
                                  struct sctp_transport *transport,
                                  enum sctp_transport_cmd command,
                                  sctp_sn_error_t error)
{
        bool notify = true;
        int addr_state = SCTP_ADDR_AVAILABLE;

        /* Record the transition on the transport.  */
        switch (command) {
        case SCTP_TRANSPORT_UP:
                /* Leaving PF is only reported when PF exposure is enabled.
                 * Leaving UNCONFIRMED because a heartbeat succeeded is
                 * reported as SCTP_ADDR_CONFIRMED; everything else is
                 * reported as SCTP_ADDR_AVAILABLE.
                 */
                if (transport->state == SCTP_PF) {
                        if (asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE)
                                notify = false;
                } else if (transport->state == SCTP_UNCONFIRMED &&
                           error == SCTP_HEARTBEAT_SUCCESS) {
                        addr_state = SCTP_ADDR_CONFIRMED;
                }

                transport->state = SCTP_ACTIVE;
                sctp_transport_pl_reset(transport);
                break;

        case SCTP_TRANSPORT_DOWN:
                /* A transport that was never confirmed is not moved to
                 * the inactive state; only its cached route is released,
                 * since there may be a better route next time.
                 */
                if (transport->state == SCTP_UNCONFIRMED) {
                        sctp_transport_dst_release(transport);
                        notify = false;
                } else {
                        transport->state = SCTP_INACTIVE;
                        sctp_transport_pl_reset(transport);
                        addr_state = SCTP_ADDR_UNREACHABLE;
                }
                break;

        case SCTP_TRANSPORT_PF:
                transport->state = SCTP_PF;
                if (asoc->pf_expose == SCTP_PF_EXPOSE_ENABLE)
                        addr_state = SCTP_ADDR_POTENTIALLY_FAILED;
                else
                        notify = false;
                break;

        default:
                return;
        }

        /* Generate and send a SCTP_PEER_ADDR_CHANGE notification
         * to the user.
         */
        if (notify)
                sctp_ulpevent_notify_peer_addr_change(transport,
                                                      addr_state, error);

        /* Select new active and retran paths. */
        sctp_select_active_and_retran_path(asoc);
}

/* Hold a reference to an association.
 * Increments asoc->base.refcnt; the reference is given back with
 * sctp_association_put().
 */
void sctp_association_hold(struct sctp_association *asoc)
{
        refcount_inc(&asoc->base.refcnt);
}

/* Drop one reference to an association.  Whoever drops the last
 * reference triggers sctp_association_destroy().
 */
void sctp_association_put(struct sctp_association *asoc)
{
        /* Other references remain: nothing more to do. */
        if (!refcount_dec_and_test(&asoc->base.refcnt))
                return;

        sctp_association_destroy(asoc);
}

/* Hand out the next Transmission Sequence Number of the association.
 *
 * Returns the current next_tsn, then advances next_tsn and bumps the
 * count of unacknowledged data.
 */
__u32 sctp_association_get_next_tsn(struct sctp_association *asoc)
{
        /* Serial number arithmetic: TSNs wrap around, so the TSN that
         * follows 2**32 - 1 is 0.  No explicit handling is done here;
         * this relies on next_tsn being a 32-bit unsigned field
         * (presumably __u32 -- confirm in the structure definition).
         */
        __u32 tsn = asoc->next_tsn++;

        asoc->unack_data++;

        return tsn;
}

/* Compare two addresses to see if they match.  Wildcard addresses
 * only match themselves.
 *
 * The comparison is delegated to the address-family operations selected
 * by @ss1's family.  Returns 0 when that family is not supported,
 * otherwise whatever the family's cmp_addr() returns.
 */
int sctp_cmp_addr_exact(const union sctp_addr *ss1,
                        const union sctp_addr *ss2)
{
        struct sctp_af *family_ops = sctp_get_af_specific(ss1->sa.sa_family);

        return unlikely(!family_ops) ? 0 : family_ops->cmp_addr(ss1, ss2);
}

/* Return an ECNE chunk to be prepended to a packet, or NULL when the
 * association does not currently need one.
 *
 * FIXME: a shared, preallocated chunk could/should be returned here;
 * for now a chunk is built with sctp_make_ecne() on every call.
 */
struct sctp_chunk *sctp_get_ecne_prepend(struct sctp_association *asoc)
{
        /* Failing to allocate the chunk is not deadly, so the result of
         * sctp_make_ecne() is passed through unchecked.
         */
        return asoc->need_ecne ? sctp_make_ecne(asoc, asoc->last_ecne_tsn)
                               : NULL;
}

/*
 * Find which transport this TSN was sent on.
 *
 * Returns the transport whose transmitted list holds a chunk carrying
 * @tsn, or NULL when no transport has it.
 */
struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *asoc,
                                             __u32 tsn)
{
        /* TSNs in the chunk headers are compared in network byte order. */
        const __be32 wire_tsn = htonl(tsn);
        struct sctp_transport *preferred = asoc->peer.active_path;
        struct sctp_transport *other;
        struct sctp_chunk *sent;

        /*
         * FIXME: In general, find a more efficient data structure for
         * searching.
         *
         * The strategy is to scan each transport's transmitted list.
         * The active path is tried first in the hope of an early hit.
         * Another optimization would be to know that there is only one
         * outbound path and skip the search entirely.
         */
        list_for_each_entry(sent, &preferred->transmitted,
                        transmitted_list) {
                if (sent->subh.data_hdr->tsn == wire_tsn)
                        return preferred;
        }

        /* Not on the active path: try every remaining transport. */
        list_for_each_entry(other, &asoc->peer.transport_addr_list,
                        transports) {
                if (other == preferred)
                        continue;

                list_for_each_entry(sent, &other->transmitted,
                                transmitted_list) {
                        if (sent->subh.data_hdr->tsn == wire_tsn)
                                return other;
                }
        }

        return NULL;
}

/* Do delayed input processing.  This is scheduled by sctp_rcv().
 *
 * Work handler for the association's inqueue: pops chunks one at a time
 * and feeds each of them to the state machine via sctp_do_sm().  The loop
 * stops when the queue is empty or the association has been marked dead.
 * A reference on the association is held for the duration of the loop.
 */
static void sctp_assoc_bh_rcv(struct work_struct *work)
{
        struct sctp_association *asoc =
                container_of(work, struct sctp_association,
                             base.inqueue.immediate);
        struct net *net = asoc->base.net;
        union sctp_subtype subtype;
        struct sctp_endpoint *ep;
        struct sctp_chunk *chunk;
        struct sctp_inq *inqueue;
        int first_time = 1;        /* is this the first time through the loop */
        int error = 0;
        int state;

        /* The association should be held so we should be safe. */
        ep = asoc->ep;

        inqueue = &asoc->base.inqueue;
        sctp_association_hold(asoc);
        while (NULL != (chunk = sctp_inq_pop(inqueue))) {
                /* Sample the state before the chunk is processed; this is
                 * the state handed to the state machine below.
                 */
                state = asoc->state;
                subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type);

                /* If the first chunk in the packet is AUTH, do special
                 * processing specified in Section 6.3 of SCTP-AUTH spec
                 */
                if (first_time && subtype.chunk == SCTP_CID_AUTH) {
                        struct sctp_chunkhdr *next_hdr;

                        next_hdr = sctp_inq_peek(inqueue);
                        if (!next_hdr)
                                goto normal;

                        /* If the next chunk is COOKIE-ECHO, skip the AUTH
                         * chunk while saving a pointer to it so we can do
                         * Authentication later (during cookie-echo
                         * processing).
                         *
                         * NOTE(review): the result of skb_clone() is not
                         * checked here -- confirm that the consumers of
                         * chunk->auth_chunk cope with NULL.
                         */
                        if (next_hdr->type == SCTP_CID_COOKIE_ECHO) {
                                chunk->auth_chunk = skb_clone(chunk->skb,
                                                              GFP_ATOMIC);
                                chunk->auth = 1;
                                continue;
                        }
                }

normal:
                /* SCTP-AUTH, Section 6.3:
                 *    The receiver has a list of chunk types which it expects
                 *    to be received only after an AUTH-chunk.  This list has
                 *    been sent to the peer during the association setup.  It
                 *    MUST silently discard these chunks if they are not placed
                 *    after an AUTH chunk in the packet.
                 */
                if (sctp_auth_recv_cid(subtype.chunk, asoc) && !chunk->auth)
                        continue;

                /* Remember where the last DATA chunk came from so we
                 * know where to send the SACK.  Anything that is not DATA
                 * is accounted as a control chunk, with SACKs counted
                 * separately as well.
                 */
                if (sctp_chunk_is_data(chunk))
                        asoc->peer.last_data_from = chunk->transport;
                else {
                        SCTP_INC_STATS(net, SCTP_MIB_INCTRLCHUNKS);
                        asoc->stats.ictrlchunks++;
                        if (chunk->chunk_hdr->type == SCTP_CID_SACK)
                                asoc->stats.isacks++;
                }

                /* Timestamp the transport the chunk arrived on, if known. */
                if (chunk->transport)
                        chunk->transport->last_time_heard = ktime_get();

                /* Run through the state machine. */
                error = sctp_do_sm(net, SCTP_EVENT_T_CHUNK, subtype,
                                   state, ep, asoc, chunk, GFP_ATOMIC);

                /* Check to see if the association is freed in response to
                 * the incoming chunk.  If so, get out of the while loop.
                 */
                if (asoc->base.dead)
                        break;

                /* If there is an error on chunk, discard this packet. */
                if (error && chunk)
                        chunk->pdiscard = 1;

                /* Only the first processed chunk of the run is eligible
                 * for the special AUTH handling above.
                 */
                if (first_time)
                        first_time = 0;
        }
        /* Give back the reference taken before the loop. */
        sctp_association_put(asoc);
}

/* Move an association from the socket it currently belongs to over to
 * @newsk.
 *
 * The association leaves the old endpoint's list, gives up its references
 * on the old endpoint and socket, takes references on the new endpoint
 * and socket, and is added to the new endpoint's list of associations.
 */
void sctp_assoc_migrate(struct sctp_association *assoc, struct sock *newsk)
{
        struct sock *oldsk = assoc->base.sk;
        struct sctp_sock *newsp = sctp_sk(newsk);
        struct sctp_endpoint *new_ep;

        /* Unlink from the old endpoint's association list. */
        list_del_init(&assoc->asocs);

        /* A TCP-style old socket gets its backlog count decremented. */
        if (sctp_style(oldsk, TCP))
                sk_acceptq_removed(oldsk);

        /* Let go of the old endpoint and the old sock. */
        sctp_endpoint_put(assoc->ep);
        sock_put(oldsk);

        /* Switch to the new endpoint and pin it. */
        new_ep = newsp->ep;
        assoc->ep = new_ep;
        sctp_endpoint_hold(new_ep);

        /* Switch to the new sock and pin it. */
        assoc->base.sk = newsk;
        sock_hold(newsk);

        /* Finally, link into the new endpoint's association list. */
        sctp_endpoint_add_asoc(new_ep, assoc);
}

/* Update an association (possibly from unexpected COOKIE-ECHO processing).
 *
 * Copies the peer-related parameters of @new into @asoc, reconciles the
 * transport lists, and takes over @new's cached peer AUTH data.
 *
 * Returns -ENOMEM if the TSN map, a new transport or an association id
 * cannot be set up; otherwise returns the result of
 * sctp_auth_asoc_init_active_key().
 *
 * NOTE(review): on the -ENOMEM paths @asoc has already been partially
 * overwritten from @new -- presumably the caller disposes of the
 * association in that case; confirm.
 */
int sctp_assoc_update(struct sctp_association *asoc,
                      struct sctp_association *new)
{
        struct sctp_transport *trans;
        struct list_head *pos, *temp;

        /* Copy in new parameters of peer. */
        asoc->c = new->c;
        asoc->peer.rwnd = new->peer.rwnd;
        asoc->peer.sack_needed = new->peer.sack_needed;
        asoc->peer.auth_capable = new->peer.auth_capable;
        asoc->peer.i = new->peer.i;

        /* Re-initialize the TSN map from the peer's (new) initial TSN. */
        if (!sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL,
                              asoc->peer.i.initial_tsn, GFP_ATOMIC))
                return -ENOMEM;

        /* Remove any peer addresses not present in the new association.
         * Transports that are kept are reset when the association is
         * already established (or beyond).
         */
        list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
                trans = list_entry(pos, struct sctp_transport, transports);
                if (!sctp_assoc_lookup_paddr(new, &trans->ipaddr)) {
                        sctp_assoc_rm_peer(asoc, trans);
                        continue;
                }

                if (asoc->state >= SCTP_STATE_ESTABLISHED)
                        sctp_transport_reset(trans);
        }

        /* If the case is A (association restart), use
         * initial_tsn as next_tsn. If the case is B, use
         * current next_tsn in case data sent to peer
         * has been discarded and needs retransmission.
         */
        if (asoc->state >= SCTP_STATE_ESTABLISHED) {
                asoc->next_tsn = new->next_tsn;
                asoc->ctsn_ack_point = new->ctsn_ack_point;
                asoc->adv_peer_ack_point = new->adv_peer_ack_point;

                /* Reinitialize SSN for both local streams
                 * and peer's streams.
                 */
                sctp_stream_clear(&asoc->stream);

                /* Flush the ULP reassembly and ordered queue.
                 * Any data there will now be stale and will
                 * cause problems.
                 */
                sctp_ulpq_flush(&asoc->ulpq);

                /* reset the overall association error count so
                 * that the restarted association doesn't get torn
                 * down on the next retransmission timer.
                 */
                asoc->overall_error_count = 0;

        } else {
                /* Add any peer addresses from the new association. */
                list_for_each_entry(trans, &new->peer.transport_addr_list,
                                    transports)
                        if (!sctp_assoc_add_peer(asoc, &trans->ipaddr,
                                                 GFP_ATOMIC, trans->state))
                                return -ENOMEM;

                /* Derive both ack points from the current next_tsn. */
                asoc->ctsn_ack_point = asoc->next_tsn - 1;
                asoc->adv_peer_ack_point = asoc->ctsn_ack_point;

                /* Only in COOKIE_WAIT is the stream state taken over
                 * from the new association.
                 */
                if (sctp_state(asoc, COOKIE_WAIT))
                        sctp_stream_update(&asoc->stream, &new->stream);

                /* get a new assoc id if we don't have one yet. */
                if (sctp_assoc_set_id(asoc, GFP_ATOMIC))
                        return -ENOMEM;
        }

        /* SCTP-AUTH: Save the peer parameters from the new associations
         * and also move the association shared keys over.
         *
         * Each buffer previously cached on asoc is freed and replaced by
         * the one from new; new's pointer is cleared so that the buffer
         * has a single owner afterwards.
         */
        kfree(asoc->peer.peer_random);
        asoc->peer.peer_random = new->peer.peer_random;
        new->peer.peer_random = NULL;

        kfree(asoc->peer.peer_chunks);
        asoc->peer.peer_chunks = new->peer.peer_chunks;
        new->peer.peer_chunks = NULL;

        kfree(asoc->peer.peer_hmacs);
        asoc->peer.peer_hmacs = new->peer.peer_hmacs;
        new->peer.peer_hmacs = NULL;

        return sctp_auth_asoc_init_active_key(asoc, GFP_ATOMIC);
}

/* Update the retran path for sending a retransmitted packet.
 * See also RFC4960, 6.4. Multi-Homed SCTP Endpoints:
 *
 *   When there is outbound data to send and the primary path
 *   becomes inactive (e.g., due to failures), or where the
 *   SCTP user explicitly requests to send data to an
 *   inactive destination transport address, before reporting
 *   an error to its ULP, the SCTP endpoint should try to send
 *   the data to an alternate active destination transport
 *   address if one exists.
 *
 *   When retransmitting data that timed out, if the endpoint
 *   is multihomed, it should consider each source-destination
 *   address pair in its retransmission selection policy.
 *   When retransmitting timed-out data, the endpoint should
 *   attempt to pick the most divergent source-destination
 *   pair from the original source-destination pair to which
 *   the packet was transmitted.
 *
 *   Note: Rules for picking the most divergent source-destination
 *   pair are an implementation decision and are not specified
 *   within this document.
 *
 * Our basic strategy is to round-robin transports in priorities
 * according to sctp_trans_score() e.g., if no such
 * transport with state SCTP_ACTIVE exists, round-robin through
 * SCTP_UNKNOWN, etc. You get the picture.
 */
static u8 sctp_trans_score(const struct sctp_transport *trans)
{
        /* Rank the transport by state; a higher score is a better pick. */
        if (trans->state == SCTP_ACTIVE)
                return 3;
        if (trans->state == SCTP_UNKNOWN)
                return 2;
        if (trans->state == SCTP_PF)
                return 1;

        /* Any other state (e.g. SCTP_INACTIVE) ranks last. */
        return 0;
}

static struct sctp_transport *sctp_trans_elect_tie(struct sctp_transport *trans1,
                                                   struct sctp_transport *trans2)
{
        /* The transport with strictly fewer errors wins outright. */
        if (trans2->error_count < trans1->error_count)
                return trans2;

        /* With equal error counts, prefer trans2 only if it was heard
         * from more recently than trans1.
         */
        if (trans2->error_count == trans1->error_count &&
            ktime_after(trans2->last_time_heard, trans1->last_time_heard))
                return trans2;

        /* In every other case trans1 is kept. */
        return trans1;
}

static struct sctp_transport *sctp_trans_elect_best(struct sctp_transport *curr,
                                                    struct sctp_transport *best)
{
        u8 curr_rank, best_rank;

        /* Nothing to compare against: curr is the answer. */
        if (!best || best == curr)
                return curr;

        curr_rank = sctp_trans_score(curr);
        best_rank = sctp_trans_score(best);

        /* Different state scores decide directly. */
        if (curr_rank < best_rank)
                return best;
        if (curr_rank > best_rank)
                return curr;

        /* Same score: break the tie on error count and last time heard,
         * with the existing best passed as the first (preferred) argument.
         */
        return sctp_trans_elect_tie(best, curr);
}

void sctp_assoc_update_retran_path(struct sctp_association *asoc)
{
        struct sctp_transport *cursor = asoc->peer.retran_path;
        struct sctp_transport *candidate = NULL;

        /* A single transport leaves nothing to choose from. */
        if (asoc->peer.transport_count == 1)
                return;

        /* When the active path doubles as the retran path and is in
         * state SCTP_ACTIVE, it is the only active path; keep using it.
         */
        if (asoc->peer.retran_path == asoc->peer.active_path &&
            asoc->peer.active_path->state == SCTP_ACTIVE)
                return;

        /* Walk the transport list starting at the successor of the
         * current retran path and wrapping around back to it.
         */
        for (;;) {
                cursor = list_next_entry(cursor, transports);

                /* The list head is not a transport; step over it. */
                if (&cursor->transports == &asoc->peer.transport_addr_list)
                        continue;
                /* Unconfirmed transports are never candidates. */
                if (cursor->state == SCTP_UNCONFIRMED)
                        continue;

                candidate = sctp_trans_elect_best(cursor, candidate);

                /* Stop as soon as the candidate is active, or once the
                 * walk has come back around to the current retran path.
                 */
                if (candidate->state == SCTP_ACTIVE ||
                    cursor == asoc->peer.retran_path)
                        break;
        }

        asoc->peer.retran_path = candidate;

        pr_debug("%s: association:%p updated new path to addr:%pISpc\n",
                 __func__, asoc, &asoc->peer.retran_path->ipaddr.sa);
}

static void sctp_select_active_and_retran_path(struct sctp_association *asoc)
{
        struct sctp_transport *trans, *trans_pri = NULL, *trans_sec = NULL;
        struct sctp_transport *trans_pf = NULL;

        /* Look for the two most recently used active transports. */
        list_for_each_entry(trans, &asoc->peer.transport_addr_list,
                            transports) {
                /* Skip uninteresting transports. */
                if (trans->state == SCTP_INACTIVE ||
                    trans->state == SCTP_UNCONFIRMED)
                        continue;
                /* Keep track of the best PF transport from our
                 * list in case we don't find an active one.
                 */
                if (trans->state == SCTP_PF) {
                        trans_pf = sctp_trans_elect_best(trans, trans_pf);
                        continue;
                }
                /* For active transports, pick the most recent ones. */
                if (trans_pri == NULL ||
                    ktime_after(trans->last_time_heard,
                                trans_pri->last_time_heard)) {
                        trans_sec = trans_pri;
                        trans_pri = trans;
                } else if (trans_sec == NULL ||
                           ktime_after(trans->last_time_heard,
                                       trans_sec->last_time_heard)) {
                        trans_sec = trans;
                }
        }

        /* RFC 2960 6.4 Multi-Homed SCTP Endpoints
         *
         * By default, an endpoint should always transmit to the primary
         * path, unless the SCTP user explicitly specifies the
         * destination transport address (and possibly source transport
         * address) to use. [If the primary is active but not most recent,
         * bump the most recently used transport.]
         */
        if ((asoc->peer.primary_path->state == SCTP_ACTIVE ||
             asoc->peer.primary_path->state == SCTP_UNKNOWN) &&
             asoc->peer.primary_path != trans_pri) {
                trans_sec = trans_pri;
                trans_pri = asoc->peer.primary_path;
        }

        /* We did not find anything useful for a possible retransmission
         * path; either primary path that we found is the same as
         * the current one, or we didn't generally find an active one.
         */
        if (trans_sec == NULL)
                trans_sec = trans_pri;

        /* If we failed to find a usable transport, just camp on the
         * active or pick a PF iff it's the better choice.
         */
        if (trans_pri == NULL) {
                trans_pri = sctp_trans_elect_best(asoc->peer.active_path, trans_pf);
                trans_sec = trans_pri;
        }

        /* Set the active and retran transports. */
        asoc->peer.active_path = trans_pri;
        asoc->peer.retran_path = trans_sec;
}

struct sctp_transport *
sctp_assoc_choose_alter_transport(struct sctp_association *asoc,
                                  struct sctp_transport *last_sent_to)
{
        /* A packet that has never been sent goes out on the active path. */
        if (!last_sent_to)
                return asoc->peer.active_path;

        /* The previous attempt already used the retran path, so move the
         * retran path on before handing it out again.
         */
        if (asoc->peer.retran_path == last_sent_to)
                sctp_assoc_update_retran_path(asoc);

        return asoc->peer.retran_path;
}

void sctp_assoc_update_frag_point(struct sctp_association *asoc)
{
        int frag = sctp_mtu_payload(sctp_sk(asoc->base.sk), asoc->pathmtu,
                                    sctp_datachk_len(&asoc->stream));

        if (asoc->user_frag)
                frag = min_t(int, frag, asoc->user_frag);

        frag = min_t(int, frag, SCTP_MAX_CHUNK_LEN -
                                sctp_datachk_len(&asoc->stream));

        asoc->frag_point = SCTP_TRUNC4(frag);
}

/* Store a new path MTU on the association and, when it actually
 * changed, refresh the fragmentation point derived from it.
 */
void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
{
        /* An unchanged value needs no frag_point recomputation. */
        if (pmtu != asoc->pathmtu) {
                asoc->pathmtu = pmtu;
                sctp_assoc_update_frag_point(asoc);
        }

        pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
                 asoc->pathmtu, asoc->frag_point);
}

/* Bring the association's pmtu (and with it the frag_point) in line with
 * its transports: apply any pending per-transport PMTU update, then set
 * the association PMTU to the smallest transport pathmtu.  A NULL asoc
 * is ignored.
 */
void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
{
        struct sctp_transport *trans;
        __u32 lowest = 0;

        if (!asoc)
                return;

        list_for_each_entry(trans, &asoc->peer.transport_addr_list,
                            transports) {
                /* Apply a deferred PMTU change if the transport has a dst. */
                if (trans->pmtu_pending && trans->dst) {
                        sctp_transport_update_pmtu(trans,
                                                   atomic_read(&trans->mtu_info));
                        trans->pmtu_pending = 0;
                }

                /* Keep the running minimum; 0 means "nothing seen yet". */
                if (lowest && lowest <= trans->pathmtu)
                        continue;
                lowest = trans->pathmtu;
        }

        sctp_assoc_set_pmtu(asoc, lowest);
}

/* Should we send a SACK to update our peer? */
static inline bool sctp_peer_needs_update(struct sctp_association *asoc)
{
        struct net *net = asoc->base.net;

        switch (asoc->state) {
        case SCTP_STATE_ESTABLISHED:
        case SCTP_STATE_SHUTDOWN_PENDING:
        case SCTP_STATE_SHUTDOWN_RECEIVED:
        case SCTP_STATE_SHUTDOWN_SENT:
                if ((asoc->rwnd > asoc->a_rwnd) &&
                    ((asoc->rwnd - asoc->a_rwnd) >= max_t(__u32,
                           (asoc->base.sk->sk_rcvbuf >> net->sctp.rwnd_upd_shift),
                           asoc->pathmtu)))
                        return true;
                break;
        default:
                break;
        }
        return false;
}

/* Increase asoc's rwnd by len and send a window update SACK if needed.
 *
 * @asoc: association whose receive window is being reopened
 * @len:  number of bytes to give back to the window
 *
 * Any outstanding overdraft (rwnd_over) is paid off first; only what is
 * left of len after that is added to rwnd.  Accumulated window pressure
 * (rwnd_press) is then released in steps of at most one path MTU per call.
 */
void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len)
{
        struct sctp_chunk *sack;
        struct timer_list *timer;

        if (asoc->rwnd_over) {
                if (asoc->rwnd_over >= len) {
                        /* Still overdrawn (or exactly even) after this. */
                        asoc->rwnd_over -= len;
                } else {
                        /* Overdraft cleared; the rest reopens the window. */
                        asoc->rwnd += (len - asoc->rwnd_over);
                        asoc->rwnd_over = 0;
                }
        } else {
                asoc->rwnd += len;
        }

        /* If we had window pressure, start recovering it
         * once our rwnd had reached the accumulated pressure
         * threshold.  The idea is to recover slowly, but up
         * to the initial advertised window.
         */
        if (asoc->rwnd_press) {
                int change = min(asoc->pathmtu, asoc->rwnd_press);
                asoc->rwnd += change;
                asoc->rwnd_press -= change;
        }

        /* len is unsigned, so it is printed with %u. */
        pr_debug("%s: asoc:%p rwnd increased by %u to (%u, %u) - %u\n",
                 __func__, asoc, len, asoc->rwnd, asoc->rwnd_over,
                 asoc->a_rwnd);

        /* Send a window update SACK once sctp_peer_needs_update() reports
         * that rwnd has grown past the last advertised window (a_rwnd) by
         * at least the larger of the association's PMTU and
         * sk_rcvbuf >> rwnd_upd_shift.
         * The algorithm used is similar to the one described in
         * Section 4.2.3.3 of RFC 1122.
         */
        if (sctp_peer_needs_update(asoc)) {
                asoc->a_rwnd = asoc->rwnd;

                pr_debug("%s: sending window update SACK- asoc:%p rwnd:%u "
                         "a_rwnd:%u\n", __func__, asoc, asoc->rwnd,
                         asoc->a_rwnd);

                sack = sctp_make_sack(asoc);
                if (!sack)
                        return;

                asoc->peer.sack_needed = 0;

                sctp_outq_tail(&asoc->outqueue, sack, GFP_ATOMIC);

                /* Stop the SACK timer; drop the association reference
                 * when the timer was actually pending.
                 */
                timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK];
                if (del_timer(timer))
                        sctp_association_put(asoc);
        }
}

/* Decrease asoc's rwnd by len.
 *
 * @asoc: association whose receive window is being consumed
 * @len:  number of bytes to take out of the window
 *
 * If len exceeds the remaining window, the excess is recorded in
 * rwnd_over and rwnd drops to 0.  If the receive buffer accounting has
 * reached sk_rcvbuf, whatever window would remain is moved into
 * rwnd_press and rwnd is forced to 0.
 */
void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len)
{
        int rx_count;
        int over = 0;

        if (unlikely(!asoc->rwnd || asoc->rwnd_over))
                pr_debug("%s: association:%p has asoc->rwnd:%u, "
                         "asoc->rwnd_over:%u!\n", __func__, asoc,
                         asoc->rwnd, asoc->rwnd_over);

        /* The endpoint's rcvbuf_policy selects per-association or
         * socket-wide receive memory accounting.
         */
        if (asoc->ep->rcvbuf_policy)
                rx_count = atomic_read(&asoc->rmem_alloc);
        else
                rx_count = atomic_read(&asoc->base.sk->sk_rmem_alloc);

        /* If we've reached or overflowed our receive buffer, announce
         * a 0 rwnd if rwnd would still be positive.  Store the
         * potential pressure overflow so that the window can be restored
         * back to original value.
         */
        if (rx_count >= asoc->base.sk->sk_rcvbuf)
                over = 1;

        if (asoc->rwnd >= len) {
                asoc->rwnd -= len;
                if (over) {
                        asoc->rwnd_press += asoc->rwnd;
                        asoc->rwnd = 0;
                }
        } else {
                /* Window overdrawn: remember by how much. */
                asoc->rwnd_over += len - asoc->rwnd;
                asoc->rwnd = 0;
        }

        /* len is unsigned, so it is printed with %u. */
        pr_debug("%s: asoc:%p rwnd decreased by %u to (%u, %u, %u)\n",
                 __func__, asoc, len, asoc->rwnd, asoc->rwnd_over,
                 asoc->rwnd_press);
}

/* Populate the association's bind address list by copying the endpoint's
 * addresses that pass the given scope and the address-family flags derived
 * from the socket and from what the peer supports.  Returns the result of
 * sctp_bind_addr_copy().
 */
int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc,
                                     enum sctp_scope scope, gfp_t gfp)
{
        struct sock *sk = asoc->base.sk;
        int flags = 0;

        /* What the local socket permits. */
        if (sk->sk_family == PF_INET6)
                flags |= SCTP_ADDR6_ALLOWED;
        if (!inet_v6_ipv6only(sk))
                flags |= SCTP_ADDR4_ALLOWED;

        /* What the peer announced support for. */
        if (asoc->peer.ipv4_address)
                flags |= SCTP_ADDR4_PEERSUPP;
        if (asoc->peer.ipv6_address)
                flags |= SCTP_ADDR6_PEERSUPP;

        return sctp_bind_addr_copy(asoc->base.net,
                                   &asoc->base.bind_addr,
                                   &asoc->ep->base.bind_addr,
                                   scope, gfp, flags);
}

/* Build the association's bind address list from the raw address list
 * stored in the cookie, which sits right after the peer's INIT chunk
 * that follows the cookie structure.
 */
int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc,
                                         struct sctp_cookie *cookie,
                                         gfp_t gfp)
{
        /* The peer's INIT chunk starts immediately after the cookie. */
        struct sctp_init_chunk *init = (struct sctp_init_chunk *)(cookie + 1);
        int init_len = ntohs(init->chunk_hdr.length);
        int addrs_len = cookie->raw_addr_list_len;
        /* The raw addresses start where the INIT chunk ends. */
        __u8 *raw_addrs = (__u8 *)init + init_len;

        return sctp_raw_to_bind_addrs(&asoc->base.bind_addr, raw_addrs,
                                      addrs_len,
                                      asoc->ep->base.bind_addr.port, gfp);
}

/* Check whether laddr belongs to the association's bind address list.
 * Returns 1 when both the port and the address match, 0 otherwise.
 */
int sctp_assoc_lookup_laddr(struct sctp_association *asoc,
                            const union sctp_addr *laddr)
{
        /* A different port can never match. */
        if (asoc->base.bind_addr.port != ntohs(laddr->v4.sin_port))
                return 0;

        return sctp_bind_addr_match(&asoc->base.bind_addr, laddr,
                                    sctp_sk(asoc->base.sk)) ? 1 : 0;
}

/* Assign an association id to asoc unless it already has one.
 * Returns 0 on success (or when an id was already set) and the negative
 * error from idr_alloc_cyclic() otherwise.
 */
int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp)
{
        bool can_block;
        int id;

        /* An id that is already assigned is kept. */
        if (asoc->assoc_id)
                return 0;

        /* Preloading is only done when the gfp mask allows blocking. */
        can_block = gfpflags_allow_blocking(gfp);
        if (can_block)
                idr_preload(gfp);

        spin_lock_bh(&sctp_assocs_id_lock);
        /* 0, 1, 2 are used as SCTP_FUTURE_ASSOC, SCTP_CURRENT_ASSOC and
         * SCTP_ALL_ASSOC, so an available id must be > SCTP_ALL_ASSOC.
         */
        id = idr_alloc_cyclic(&sctp_assocs_id, asoc, SCTP_ALL_ASSOC + 1, 0,
                              GFP_NOWAIT);
        spin_unlock_bh(&sctp_assocs_id_lock);

        if (can_block)
                idr_preload_end();

        if (id < 0)
                return id;

        asoc->assoc_id = (sctp_assoc_t)id;
        return 0;
}

/* Free the ASCONF queue */
static void sctp_assoc_free_asconf_queue(struct sctp_association *asoc)
{
        struct sctp_chunk *asconf;
        struct sctp_chunk *tmp;

        list_for_each_entry_safe(asconf, tmp, &asoc->addip_chunk_list, list) {
                list_del_init(&asconf->list);
                sctp_chunk_free(asconf);
        }
}

/* Free asconf_ack cache */
static void sctp_assoc_free_asconf_acks(struct sctp_association *asoc)
{
        struct sctp_chunk *ack;
        struct sctp_chunk *tmp;

        list_for_each_entry_safe(ack, tmp, &asoc->asconf_ack_list,
                                transmitted_list) {
                list_del_init(&ack->transmitted_list);
                sctp_chunk_free(ack);
        }
}

/* Trim the cached ASCONF_ACK list: free entries from the front until one
 * carries the peer's current ADDIP serial number, which is kept together
 * with everything after it.
 */
void sctp_assoc_clean_asconf_ack_cache(const struct sctp_association *asoc)
{
        struct sctp_chunk *cur;
        struct sctp_chunk *next;

        list_for_each_entry_safe(cur, next, &asoc->asconf_ack_list,
                                transmitted_list) {
                /* Everything before the "Peer-Sequence-Number" entry
                 * can go; stop once that entry is reached.
                 */
                if (htonl(asoc->peer.addip_serial) ==
                                cur->subh.addip_hdr->serial)
                        break;

                list_del_init(&cur->transmitted_list);
                sctp_chunk_free(cur);
        }
}

/* Look up the cached ASCONF_ACK whose serial number equals @serial.
 * Chunks for which sctp_chunk_pending() is true are skipped.  On a hit
 * the chunk is returned with an extra hold taken; NULL is returned when
 * nothing matches.
 */
struct sctp_chunk *sctp_assoc_lookup_asconf_ack(
                                        const struct sctp_association *asoc,
                                        __be32 serial)
{
        struct sctp_chunk *cached;

        list_for_each_entry(cached, &asoc->asconf_ack_list, transmitted_list) {
                if (!sctp_chunk_pending(cached) &&
                    serial == cached->subh.addip_hdr->serial) {
                        sctp_chunk_hold(cached);
                        return cached;
                }
        }

        return NULL;
}

/* Release all ASCONF state held by the association: the cached
 * ASCONF_ACKs, the queued ASCONF chunks, and the last ASCONF sent.
 */
void sctp_asconf_queue_teardown(struct sctp_association *asoc)
{
        struct sctp_chunk *last;

        /* Cached ASCONF_ACK chunks first, then the ASCONF queue itself. */
        sctp_assoc_free_asconf_acks(asoc);
        sctp_assoc_free_asconf_queue(asoc);

        /* Finally the cached ASCONF chunk, if there is one. */
        last = asoc->addip_last_asconf;
        if (last)
                sctp_chunk_free(last);
}

















   22 






   23 


   23 







































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/*
 * Stack trace management functions
 *
 *  Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 */
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
#include <linux/stacktrace.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <asm/stacktrace.h>
#include <asm/unwind.h>

/* Walk a kernel stack and feed each return address to consume_entry()
 * together with cookie.  When regs is given, regs->ip is reported first.
 * The walk ends when the unwinder is done, when a return address is 0,
 * or when consume_entry() returns false.
 */
void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
                     struct task_struct *task, struct pt_regs *regs)
{
        struct unwind_state state;
        unsigned long ret_addr;

        if (regs && !consume_entry(cookie, regs->ip))
                return;

        unwind_start(&state, task, regs, NULL);
        while (!unwind_done(&state)) {
                ret_addr = unwind_get_return_address(&state);
                if (!ret_addr)
                        break;
                if (!consume_entry(cookie, ret_addr))
                        break;
                unwind_next_frame(&state);
        }
}

/* Walk the stack of task and report each return address through
 * consume_entry(), failing with -EINVAL as soon as anything makes the
 * trace untrustworthy.  Returns 0 when user-mode entry regs are reached
 * or the unwinder finishes without error.
 */
int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
                             void *cookie, struct task_struct *task)
{
        struct unwind_state state;
        struct pt_regs *entry_regs;
        unsigned long ret_addr;

        unwind_start(&state, task, NULL, NULL);
        while (!unwind_done(&state) && !unwind_error(&state)) {
                entry_regs = unwind_get_entry_regs(&state, NULL);
                if (entry_regs) {
                        /* Success path for user tasks */
                        if (user_mode(entry_regs))
                                return 0;

                        /*
                         * Kernel mode registers on the stack indicate an
                         * in-kernel interrupt or exception (e.g., preemption
                         * or a page fault), which can make frame pointers
                         * unreliable.
                         */
                        if (IS_ENABLED(CONFIG_FRAME_POINTER))
                                return -EINVAL;
                }

                ret_addr = unwind_get_return_address(&state);

                /*
                 * A NULL or invalid return address probably means there's
                 * some generated code which __kernel_text_address() doesn't
                 * know about.  A consumer refusing the entry is treated as
                 * a failure as well.
                 */
                if (!ret_addr || !consume_entry(cookie, ret_addr))
                        return -EINVAL;

                unwind_next_frame(&state);
        }

        /* Check for stack corruption */
        return unwind_error(&state) ? -EINVAL : 0;
}

/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */

/* One frame record as read from the user stack by copy_stack_frame(). */
struct stack_frame_user {
        /* User-space pointer to the next frame record to visit. */
        const void __user        *next_fp;
        /* Return address reported to the consumer for this frame. */
        unsigned long                ret_addr;
};

/* Read one frame record from user address fp into *frame with page faults
 * disabled.  Returns 1 on success, 0 if the range fails __access_ok() or
 * either field cannot be read.
 */
static int
copy_stack_frame(const struct stack_frame_user __user *fp,
                 struct stack_frame_user *frame)
{
        int ok = 1;

        if (!__access_ok(fp, sizeof(*frame)))
                return 0;

        pagefault_disable();
        /* The second field is only read when the first read succeeded. */
        if (__get_user(frame->next_fp, &fp->next_fp))
                ok = 0;
        else if (__get_user(frame->ret_addr, &fp->ret_addr))
                ok = 0;
        pagefault_enable();

        return ok;
}

/* Walk a user-space frame-pointer chain starting at regs->bp.  regs->ip is
 * reported first, then the return address of each frame, until a frame
 * cannot be copied, the frame pointer lies below regs->sp, a return
 * address is 0, or consume_entry() returns false.
 */
void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
                          const struct pt_regs *regs)
{
        const void __user *fp = (const void __user *)regs->bp;

        if (!consume_entry(cookie, regs->ip))
                return;

        for (;;) {
                struct stack_frame_user frame = {
                        .next_fp = NULL,
                        .ret_addr = 0,
                };

                /* The checks run in this exact order and stop at the
                 * first one that fails.
                 */
                if (!copy_stack_frame(fp, &frame) ||
                    (unsigned long)fp < regs->sp ||
                    !frame.ret_addr ||
                    !consume_entry(cookie, frame.ret_addr))
                        break;

                fp = frame.next_fp;
        }
}












    2 
    2 







































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <net/net_namespace.h>
#include <linux/if_arp.h>
#include <net/rtnetlink.h>

/* ndo_start_xmit handler: pass skb->len to dev_lstats_add() for the
 * device, free the skb, and always report NETDEV_TX_OK.
 */
static netdev_tx_t nlmon_xmit(struct sk_buff *skb, struct net_device *dev)
{
        dev_lstats_add(dev, skb->len);

        /* The packet is not forwarded anywhere; it is simply dropped. */
        dev_kfree_skb(skb);

        return NETDEV_TX_OK;
}

/* Per-device private data (sized via nlmon_link_ops.priv_size). */
struct nlmon {
        /* Tap added in nlmon_open() and removed in nlmon_close(). */
        struct netlink_tap nt;
};

static int nlmon_open(struct net_device *dev)
{
        struct nlmon *nlmon = netdev_priv(dev);

        nlmon->nt.dev = dev;
        nlmon->nt.module = THIS_MODULE;
        return netlink_add_tap(&nlmon->nt);
}

/* ndo_stop handler: remove the device's netlink tap.
 * Returns the result of netlink_remove_tap().
 */
static int nlmon_close(struct net_device *dev)
{
        struct nlmon *priv = netdev_priv(dev);
        struct netlink_tap *tap = &priv->nt;

        return netlink_remove_tap(tap);
}

/* ndo_get_stats64 handler: have dev_lstats_read() fill in the rx_packets
 * and rx_bytes fields of stats; no other field is written here.
 */
static void
nlmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
        dev_lstats_read(dev, &stats->rx_packets, &stats->rx_bytes);
}

/* ethtool get_link callback: unconditionally returns 1; dev is unused. */
static u32 always_on(struct net_device *dev)
{
        return 1;
}

/* ethtool operations for nlmon devices; only get_link is provided. */
static const struct ethtool_ops nlmon_ethtool_ops = {
        .get_link = always_on,
};

/* net_device operations for nlmon devices, installed by nlmon_setup(). */
static const struct net_device_ops nlmon_ops = {
        .ndo_open = nlmon_open,
        .ndo_stop = nlmon_close,
        .ndo_start_xmit = nlmon_xmit,
        .ndo_get_stats64 = nlmon_get_stats64,
};

/* rtnl_link_ops setup callback: initialise a freshly allocated nlmon
 * net_device with its operation tables, type, flags, features and MTU.
 */
static void nlmon_setup(struct net_device *dev)
{
        /* Operation tables and lifetime handling. */
        dev->netdev_ops = &nlmon_ops;
        dev->ethtool_ops = &nlmon_ethtool_ops;
        dev->needs_free_netdev = true;

        /* Link type, flags, features and statistics type. */
        dev->type = ARPHRD_NETLINK;
        dev->flags = IFF_NOARP;
        dev->priv_flags |= IFF_NO_QUEUE;
        dev->features = NETIF_F_HIGHDMA | NETIF_F_LLTX |
                        NETIF_F_SG | NETIF_F_FRAGLIST;
        dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS;

        /* The MTU is only a soft limit that can be altered: it is not a
         * real MTU, merely the size to expect in most cases.  The lower
         * bound is one netlink message header.
         */
        dev->min_mtu = sizeof(struct nlmsghdr);
        dev->mtu = NLMSG_GOODSIZE;
}

/* rtnl_link_ops validate callback: reject any request that carries an
 * IFLA_ADDRESS attribute with -EINVAL; accept everything else.
 */
static int nlmon_validate(struct nlattr *tb[], struct nlattr *data[],
                          struct netlink_ext_ack *extack)
{
        return tb[IFLA_ADDRESS] ? -EINVAL : 0;
}

/* rtnetlink link type "nlmon": private area is one struct nlmon, with
 * nlmon_setup() and nlmon_validate() as callbacks.
 */
static struct rtnl_link_ops nlmon_link_ops __read_mostly = {
        .kind                        = "nlmon",
        .priv_size                = sizeof(struct nlmon),
        .setup                        = nlmon_setup,
        .validate                = nlmon_validate,
};

/* Module init: register the nlmon link type; returns the result of
 * rtnl_link_register().
 */
static __init int nlmon_register(void)
{
        return rtnl_link_register(&nlmon_link_ops);
}

/* Module exit: unregister the nlmon link type. */
static __exit void nlmon_unregister(void)
{
        rtnl_link_unregister(&nlmon_link_ops);
}

/* Wire the register/unregister routines to module load and unload. */
module_init(nlmon_register);
module_exit(nlmon_unregister);

/* Module metadata; the alias matches the "nlmon" rtnl link kind. */
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
MODULE_AUTHOR("Mathieu Geli <geli@enseirb.fr>");
MODULE_DESCRIPTION("Netlink monitoring device");
MODULE_ALIAS_RTNL_LINK("nlmon");








































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright (C) 2020 Google LLC.
 */

#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/binfmts.h>
#include <linux/lsm_hooks.h>
#include <linux/bpf_lsm.h>
#include <linux/kallsyms.h>
#include <linux/bpf_verifier.h>
#include <net/bpf_sk_storage.h>
#include <linux/bpf_local_storage.h>
#include <linux/btf_ids.h>
#include <linux/ima.h>
#include <linux/bpf-cgroup.h>

/* For every LSM hook that allows attachment of BPF programs, declare a nop
 * function where a BPF program can be attached.
 *
 * Each LSM_HOOK() entry in <linux/lsm_hook_defs.h> expands to a noinline
 * function named bpf_lsm_<NAME> that takes the hook's arguments and simply
 * returns the hook's DEFAULT value.
 */
#define LSM_HOOK(RET, DEFAULT, NAME, ...)        \
noinline RET bpf_lsm_##NAME(__VA_ARGS__)        \
{                                                \
        return DEFAULT;                                \
}

#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK

/* Build the bpf_lsm_hooks BTF id set by expanding the hook list a second
 * time, this time mapping every LSM_HOOK() entry to the BTF id of its
 * bpf_lsm_<NAME> function.  bpf_lsm_verify_prog() checks a program's
 * attach_btf_id against this set.
 */
#define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
BTF_SET_START(bpf_lsm_hooks)
#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK
BTF_SET_END(bpf_lsm_hooks)

/* List of LSM hooks that should operate on 'current' cgroup regardless
 * of function signature.  bpf_lsm_find_cgroup_shim() selects
 * __cgroup_bpf_run_lsm_current for any hook in this set.  The entries
 * are only present when CONFIG_SECURITY_NETWORK is set.
 */
BTF_SET_START(bpf_lsm_current_hooks)
/* operate on freshly allocated sk without any cgroup association */
#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_sk_alloc_security)
BTF_ID(func, bpf_lsm_sk_free_security)
#endif
BTF_SET_END(bpf_lsm_current_hooks)

/* List of LSM hooks that trigger while the socket is properly locked.
 * The entries are only present when CONFIG_SECURITY_NETWORK is set.
 */
BTF_SET_START(bpf_lsm_locked_sockopt_hooks)
#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_sock_graft)
BTF_ID(func, bpf_lsm_inet_csk_clone)
BTF_ID(func, bpf_lsm_inet_conn_established)
#endif
BTF_SET_END(bpf_lsm_locked_sockopt_hooks)

/* List of LSM hooks that trigger while the socket is _not_ locked,
 * but it's ok to call bpf_{g,s}etsockopt because the socket is still
 * in the early init phase.
 * The entries are only present when CONFIG_SECURITY_NETWORK is set.
 */
BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks)
#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_socket_post_create)
BTF_ID(func, bpf_lsm_socket_socketpair)
#endif
BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks)

#ifdef CONFIG_CGROUP_BPF
/* Choose the cgroup shim function for an LSM program and store it in
 * *bpf_func.
 *
 * @prog:     program whose attach_func_proto / attach_btf_id are inspected
 * @bpf_func: output; always written before returning
 *
 * Hooks with no parameters, and hooks listed in bpf_lsm_current_hooks, get
 * __cgroup_bpf_run_lsm_current.  Otherwise, with CONFIG_NET, the type of
 * the hook's first parameter selects the socket or sock variant; anything
 * else (or a build without CONFIG_NET) falls back to the 'current' variant.
 */
void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
                             bpf_func_t *bpf_func)
{
        /* Only used under CONFIG_NET, hence __maybe_unused. */
        const struct btf_param *args __maybe_unused;

        if (btf_type_vlen(prog->aux->attach_func_proto) < 1 ||
            btf_id_set_contains(&bpf_lsm_current_hooks,
                                prog->aux->attach_btf_id)) {
                *bpf_func = __cgroup_bpf_run_lsm_current;
                return;
        }

#ifdef CONFIG_NET
        args = btf_params(prog->aux->attach_func_proto);

        if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET])
                *bpf_func = __cgroup_bpf_run_lsm_socket;
        else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK])
                *bpf_func = __cgroup_bpf_run_lsm_sock;
        else
#endif
                /* With CONFIG_NET this is the final else branch of the
                 * chain above; without it, it is an unconditional
                 * statement.
                 */
                *bpf_func = __cgroup_bpf_run_lsm_current;
}
#endif

/* Validate an LSM program at load time.
 *
 * Returns 0 when the program is GPL compatible and its attach_btf_id is
 * one of the bpf_lsm_hooks; otherwise logs the reason to vlog and returns
 * -EINVAL.  The licence check is done first.
 */
int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
                        const struct bpf_prog *prog)
{
        int err = 0;

        if (!prog->gpl_compatible) {
                bpf_log(vlog,
                        "LSM programs must have a GPL compatible license\n");
                err = -EINVAL;
        } else if (!btf_id_set_contains(&bpf_lsm_hooks,
                                        prog->aux->attach_btf_id)) {
                bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n",
                        prog->aux->attach_btf_id, prog->aux->attach_func_name);
                err = -EINVAL;
        }

        return err;
}

/* Mask for all the currently supported BPRM option flags */
#define BPF_F_BRPM_OPTS_MASK        BPF_F_BPRM_SECUREEXEC

/* Helper: set bprm options from flags.  Any bit outside
 * BPF_F_BRPM_OPTS_MASK is rejected with -EINVAL; otherwise
 * bprm->secureexec is assigned from the BPF_F_BPRM_SECUREEXEC bit and 0
 * is returned.
 */
BPF_CALL_2(bpf_bprm_opts_set, struct linux_binprm *, bprm, u64, flags)
{
        if (!(flags & ~BPF_F_BRPM_OPTS_MASK)) {
                bprm->secureexec = (flags & BPF_F_BPRM_SECUREEXEC);
                return 0;
        }

        /* Unknown flag bits were supplied. */
        return -EINVAL;
}

/* BTF id of struct linux_binprm, referenced by arg1 of the proto below. */
BTF_ID_LIST_SINGLE(bpf_bprm_opts_set_btf_ids, struct, linux_binprm)

/*
 * Helper prototype for bpf_bprm_opts_set(): arg1 is a BTF pointer to
 * struct linux_binprm, arg2 is the flags word (ARG_ANYTHING); the helper
 * returns an integer and is not restricted to GPL programs.
 */
static const struct bpf_func_proto bpf_bprm_opts_set_proto = {
        .func                = bpf_bprm_opts_set,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_bprm_opts_set_btf_ids[0],
        .arg2_type        = ARG_ANYTHING,
};

/*
 * Helper: forward @inode, @dst and @size unchanged to ima_inode_hash() and
 * return whatever it returns.
 */
BPF_CALL_3(bpf_ima_inode_hash, struct inode *, inode, void *, dst, u32, size)
{
        return ima_inode_hash(inode, dst, size);
}

static bool bpf_ima_inode_hash_allowed(const struct bpf_prog *prog)
{
        return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
}

/* BTF id of struct inode, referenced by arg1 of the proto below. */
BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode)

/*
 * Helper prototype for bpf_ima_inode_hash(): arg1 is a BTF pointer to
 * struct inode, arg2/arg3 describe an uninitialized output buffer and its
 * constant size. The helper is marked might_sleep and is gated by
 * bpf_ima_inode_hash_allowed().
 */
static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
        .func                = bpf_ima_inode_hash,
        .gpl_only        = false,
        .might_sleep        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_ima_inode_hash_btf_ids[0],
        .arg2_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .allowed        = bpf_ima_inode_hash_allowed,
};

/*
 * Helper: forward @file, @dst and @size unchanged to ima_file_hash() and
 * return whatever it returns.
 */
BPF_CALL_3(bpf_ima_file_hash, struct file *, file, void *, dst, u32, size)
{
        return ima_file_hash(file, dst, size);
}

/* BTF id of struct file, referenced by arg1 of the proto below. */
BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file)

/*
 * Helper prototype for bpf_ima_file_hash(): arg1 is a BTF pointer to
 * struct file, arg2/arg3 describe an uninitialized output buffer and its
 * constant size. The helper is marked might_sleep.
 */
static const struct bpf_func_proto bpf_ima_file_hash_proto = {
        .func                = bpf_ima_file_hash,
        .gpl_only        = false,
        .might_sleep        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_ima_file_hash_btf_ids[0],
        .arg2_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        /* Shares the sleepable-hook gate with the inode hash helper. */
        .allowed        = bpf_ima_inode_hash_allowed,
};

/*
 * Helper: return the bpf_cookie stored in the bpf_trace_run_ctx that embeds
 * current->bpf_ctx. @ctx itself is not used.
 */
BPF_CALL_1(bpf_get_attach_cookie, void *, ctx)
{
        struct bpf_trace_run_ctx *trace_ctx =
                container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);

        return trace_ctx->bpf_cookie;
}

/*
 * Helper prototype for bpf_get_attach_cookie(): takes the program context
 * as its only argument and returns an integer. bpf_lsm_func_proto() only
 * hands it out to programs for which bpf_prog_has_trampoline() is true.
 */
static const struct bpf_func_proto bpf_get_attach_cookie_proto = {
        .func                = bpf_get_attach_cookie,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

static const struct bpf_func_proto *
bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *func_proto;

        if (prog->expected_attach_type == BPF_LSM_CGROUP) {
                func_proto = cgroup_common_func_proto(func_id, prog);
                if (func_proto)
                        return func_proto;
        }

        switch (func_id) {
        case BPF_FUNC_inode_storage_get:
                return &bpf_inode_storage_get_proto;
        case BPF_FUNC_inode_storage_delete:
                return &bpf_inode_storage_delete_proto;
#ifdef CONFIG_NET
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
#endif /* CONFIG_NET */
        case BPF_FUNC_spin_lock:
                return &bpf_spin_lock_proto;
        case BPF_FUNC_spin_unlock:
                return &bpf_spin_unlock_proto;
        case BPF_FUNC_bprm_opts_set:
                return &bpf_bprm_opts_set_proto;
        case BPF_FUNC_ima_inode_hash:
                return &bpf_ima_inode_hash_proto;
        case BPF_FUNC_ima_file_hash:
                return &bpf_ima_file_hash_proto;
        case BPF_FUNC_get_attach_cookie:
                return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
#ifdef CONFIG_NET
        case BPF_FUNC_setsockopt:
                if (prog->expected_attach_type != BPF_LSM_CGROUP)
                        return NULL;
                if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
                                        prog->aux->attach_btf_id))
                        return &bpf_sk_setsockopt_proto;
                if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
                                        prog->aux->attach_btf_id))
                        return &bpf_unlocked_sk_setsockopt_proto;
                return NULL;
        case BPF_FUNC_getsockopt:
                if (prog->expected_attach_type != BPF_LSM_CGROUP)
                        return NULL;
                if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
                                        prog->aux->attach_btf_id))
                        return &bpf_sk_getsockopt_proto;
                if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
                                        prog->aux->attach_btf_id))
                        return &bpf_unlocked_sk_getsockopt_proto;
                return NULL;
#endif
        default:
                return tracing_prog_func_proto(func_id, prog);
        }
}

/* The set of hooks which are called without pagefaults disabled and are allowed
 * to "sleep" and thus can be used for sleepable BPF programs.
 *
 * Membership is queried through bpf_lsm_is_sleepable_hook(). Some entries
 * are only present when the matching CONFIG_* option is enabled.
 */
BTF_SET_START(sleepable_lsm_hooks)
BTF_ID(func, bpf_lsm_bpf)
BTF_ID(func, bpf_lsm_bpf_map)
BTF_ID(func, bpf_lsm_bpf_map_create)
BTF_ID(func, bpf_lsm_bpf_map_free)
BTF_ID(func, bpf_lsm_bpf_prog)
BTF_ID(func, bpf_lsm_bpf_prog_load)
BTF_ID(func, bpf_lsm_bpf_prog_free)
BTF_ID(func, bpf_lsm_bpf_token_create)
BTF_ID(func, bpf_lsm_bpf_token_free)
BTF_ID(func, bpf_lsm_bpf_token_cmd)
BTF_ID(func, bpf_lsm_bpf_token_capable)
BTF_ID(func, bpf_lsm_bprm_check_security)
BTF_ID(func, bpf_lsm_bprm_committed_creds)
BTF_ID(func, bpf_lsm_bprm_committing_creds)
BTF_ID(func, bpf_lsm_bprm_creds_for_exec)
BTF_ID(func, bpf_lsm_bprm_creds_from_file)
BTF_ID(func, bpf_lsm_capget)
BTF_ID(func, bpf_lsm_capset)
BTF_ID(func, bpf_lsm_cred_prepare)
BTF_ID(func, bpf_lsm_file_ioctl)
BTF_ID(func, bpf_lsm_file_lock)
BTF_ID(func, bpf_lsm_file_open)
BTF_ID(func, bpf_lsm_file_receive)

BTF_ID(func, bpf_lsm_inode_create)
BTF_ID(func, bpf_lsm_inode_free_security)
BTF_ID(func, bpf_lsm_inode_getattr)
BTF_ID(func, bpf_lsm_inode_getxattr)
BTF_ID(func, bpf_lsm_inode_mknod)
BTF_ID(func, bpf_lsm_inode_need_killpriv)
BTF_ID(func, bpf_lsm_inode_post_setxattr)
BTF_ID(func, bpf_lsm_inode_readlink)
BTF_ID(func, bpf_lsm_inode_rename)
BTF_ID(func, bpf_lsm_inode_rmdir)
BTF_ID(func, bpf_lsm_inode_setattr)
BTF_ID(func, bpf_lsm_inode_setxattr)
BTF_ID(func, bpf_lsm_inode_symlink)
BTF_ID(func, bpf_lsm_inode_unlink)
BTF_ID(func, bpf_lsm_kernel_module_request)
BTF_ID(func, bpf_lsm_kernel_read_file)
BTF_ID(func, bpf_lsm_kernfs_init_security)

#ifdef CONFIG_SECURITY_PATH
BTF_ID(func, bpf_lsm_path_unlink)
BTF_ID(func, bpf_lsm_path_mkdir)
BTF_ID(func, bpf_lsm_path_rmdir)
BTF_ID(func, bpf_lsm_path_truncate)
BTF_ID(func, bpf_lsm_path_symlink)
BTF_ID(func, bpf_lsm_path_link)
BTF_ID(func, bpf_lsm_path_rename)
BTF_ID(func, bpf_lsm_path_chmod)
BTF_ID(func, bpf_lsm_path_chown)
#endif /* CONFIG_SECURITY_PATH */

#ifdef CONFIG_KEYS
BTF_ID(func, bpf_lsm_key_free)
#endif /* CONFIG_KEYS */

BTF_ID(func, bpf_lsm_mmap_file)
BTF_ID(func, bpf_lsm_netlink_send)
BTF_ID(func, bpf_lsm_path_notify)
BTF_ID(func, bpf_lsm_release_secctx)
BTF_ID(func, bpf_lsm_sb_alloc_security)
BTF_ID(func, bpf_lsm_sb_eat_lsm_opts)
BTF_ID(func, bpf_lsm_sb_kern_mount)
BTF_ID(func, bpf_lsm_sb_mount)
BTF_ID(func, bpf_lsm_sb_remount)
BTF_ID(func, bpf_lsm_sb_set_mnt_opts)
BTF_ID(func, bpf_lsm_sb_show_options)
BTF_ID(func, bpf_lsm_sb_statfs)
BTF_ID(func, bpf_lsm_sb_umount)
BTF_ID(func, bpf_lsm_settime)

#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_inet_conn_established)

BTF_ID(func, bpf_lsm_socket_accept)
BTF_ID(func, bpf_lsm_socket_bind)
BTF_ID(func, bpf_lsm_socket_connect)
BTF_ID(func, bpf_lsm_socket_create)
BTF_ID(func, bpf_lsm_socket_getpeername)
BTF_ID(func, bpf_lsm_socket_getpeersec_dgram)
BTF_ID(func, bpf_lsm_socket_getsockname)
BTF_ID(func, bpf_lsm_socket_getsockopt)
BTF_ID(func, bpf_lsm_socket_listen)
BTF_ID(func, bpf_lsm_socket_post_create)
BTF_ID(func, bpf_lsm_socket_recvmsg)
BTF_ID(func, bpf_lsm_socket_sendmsg)
BTF_ID(func, bpf_lsm_socket_shutdown)
BTF_ID(func, bpf_lsm_socket_socketpair)
#endif /* CONFIG_SECURITY_NETWORK */

BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
BTF_ID(func, bpf_lsm_current_getsecid_subj)
BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
BTF_ID(func, bpf_lsm_task_to_inode)
BTF_ID(func, bpf_lsm_userns_create)
BTF_SET_END(sleepable_lsm_hooks)

/*
 * Hooks for which bpf_lsm_is_trusted() reports false. The sk_* entries are
 * only present with CONFIG_SECURITY_NETWORK.
 */
BTF_SET_START(untrusted_lsm_hooks)
BTF_ID(func, bpf_lsm_bpf_map_free)
BTF_ID(func, bpf_lsm_bpf_prog_free)
BTF_ID(func, bpf_lsm_file_alloc_security)
BTF_ID(func, bpf_lsm_file_free_security)
#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_sk_alloc_security)
BTF_ID(func, bpf_lsm_sk_free_security)
#endif /* CONFIG_SECURITY_NETWORK */
BTF_ID(func, bpf_lsm_task_free)
BTF_SET_END(untrusted_lsm_hooks)

/* Return true if @btf_id is a member of the sleepable_lsm_hooks set. */
bool bpf_lsm_is_sleepable_hook(u32 btf_id)
{
        return btf_id_set_contains(&sleepable_lsm_hooks, btf_id);
}

bool bpf_lsm_is_trusted(const struct bpf_prog *prog)
{
        return !btf_id_set_contains(&untrusted_lsm_hooks, prog->aux->attach_btf_id);
}

/* Program ops for LSM programs; no callbacks are set. */
const struct bpf_prog_ops lsm_prog_ops = {
};

/*
 * Verifier ops for LSM programs: helpers are resolved by
 * bpf_lsm_func_proto() and context accesses are validated by
 * btf_ctx_access().
 */
const struct bpf_verifier_ops lsm_verifier_ops = {
        .get_func_proto = bpf_lsm_func_proto,
        .is_valid_access = btf_ctx_access,
};




























































































































































































    9 






    6 


    3 




    6 









    5 







    2 






    4 


    2 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
// SPDX-License-Identifier: GPL-2.0-or-later

#define pr_fmt(fmt) "ref_tracker: " fmt

#include <linux/export.h>
#include <linux/list_sort.h>
#include <linux/ref_tracker.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>

/* Size of the on-stack entries[] array used when saving a stack trace */
#define REF_TRACKER_STACK_ENTRIES 16
/* Size in bytes of the scratch buffer a stack is rendered into for printing */
#define STACK_BUF_SIZE 1024

/* One tracked reference, linked on its directory's list or quarantine. */
struct ref_tracker {
        struct list_head        head;   /* anchor into dir->list or dir->quarantine */
        /* set by ref_tracker_free(); a second free of the same tracker is reported */
        bool                        dead;
        /* stack saved by ref_tracker_alloc() */
        depot_stack_handle_t        alloc_stack_handle;
        /* stack saved by ref_tracker_free() */
        depot_stack_handle_t        free_stack_handle;
};

/*
 * Per-directory summary built by ref_tracker_get_stats(): live trackers
 * grouped by the stack that allocated them.
 */
struct ref_tracker_dir_stats {
        /* number of trackers found on dir->list */
        int total;
        /* number of stacks[] entries in use */
        int count;
        /* one entry per distinct allocation stack, with its tracker count */
        struct {
                depot_stack_handle_t stack_handle;
                unsigned int count;
        } stacks[];
};

/*
 * ref_tracker_get_stats - group the live trackers of @dir by allocation stack
 * @dir: directory whose ->list is walked
 * @limit: maximum number of distinct stacks to record
 *
 * Every tracker on dir->list is counted in ->total. A tracker is also
 * counted in the stacks[] entry of its allocation stack; once @limit
 * distinct stacks have been recorded, trackers with further new stacks
 * only contribute to ->total.
 *
 * Return: a kmalloc()ed summary the caller must kfree(), or
 * ERR_PTR(-ENOMEM) if the allocation fails.
 */
static struct ref_tracker_dir_stats *
ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit)
{
        struct ref_tracker_dir_stats *stats;
        struct ref_tracker *tracker;

        stats = kmalloc(struct_size(stats, stacks, limit),
                        GFP_NOWAIT | __GFP_NOWARN);
        if (!stats)
                return ERR_PTR(-ENOMEM);

        stats->count = 0;
        stats->total = 0;

        list_for_each_entry(tracker, &dir->list, head) {
                depot_stack_handle_t handle = tracker->alloc_stack_handle;
                int slot = 0;

                stats->total++;

                /* Find the entry already holding this stack, if any. */
                while (slot < stats->count &&
                       stats->stacks[slot].stack_handle != handle)
                        slot++;

                /* Unseen stack and no room left: only ->total accounts for it. */
                if (slot >= limit)
                        continue;

                /* Unseen stack: open a new entry at the end. */
                if (slot >= stats->count) {
                        stats->stacks[slot].stack_handle = handle;
                        stats->stacks[slot].count = 0;
                        stats->count++;
                }
                stats->stacks[slot].count++;
        }

        return stats;
}

/*
 * Output sink for pr_ostream(): with a NULL @buf messages go to pr_err(),
 * otherwise they are appended to @buf.
 */
struct ostream {
        /* destination buffer, or NULL to print to the kernel log */
        char *buf;
        /* capacity of @buf and number of bytes already consumed */
        int size, used;
};

/*
 * pr_ostream - emit one formatted message through an ostream
 * @stream: struct ostream * selecting the destination
 * @fmt: printf-style format string literal
 * @args: format arguments
 *
 * With a NULL stream->buf the message is sent to pr_err(). Otherwise it is
 * appended (with the pr_fmt() prefix applied explicitly) at buf + used, and
 * ->used advances by the snprintf() result capped at the space that was
 * left, so ->used never exceeds ->size.
 *
 * The macro's own locals carry a leading underscore so that they cannot
 * capture a caller variable named "ret" or "len" that is passed in @args.
 */
#define pr_ostream(stream, fmt, args...) \
({ \
        struct ostream *_s = (stream); \
\
        if (!_s->buf) { \
                pr_err(fmt, ##args); \
        } else { \
                int _ret, _len = _s->size - _s->used; \
                _ret = snprintf(_s->buf + _s->used, _len, pr_fmt(fmt), ##args); \
                _s->used += min(_ret, _len); \
        } \
})

/*
 * __ref_tracker_dir_pr_ostream - report the live references of @dir
 * @dir: directory to report on; dir->lock must be held
 * @display_limit: maximum number of distinct allocation stacks to show
 * @s: output sink
 *
 * Emits one message per distinct allocation stack (user count plus the
 * rendered stack) and, if some trackers did not fit within @display_limit,
 * a final message with the number of unreported users. Nothing is emitted
 * for a directory with an empty list.
 */
static void
__ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir,
                             unsigned int display_limit, struct ostream *s)
{
        struct ref_tracker_dir_stats *stats;
        unsigned int idx, unreported;
        char *sbuf;

        lockdep_assert_held(&dir->lock);

        if (list_empty(&dir->list))
                return;

        stats = ref_tracker_get_stats(dir, display_limit);
        if (IS_ERR(stats)) {
                pr_ostream(s, "%s@%pK: couldn't get stats, error %pe\n",
                           dir->name, dir, stats);
                return;
        }

        /* Scratch buffer for the rendered stack; the allocation may fail. */
        sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT | __GFP_NOWARN);

        unreported = stats->total;
        for (idx = 0; idx < stats->count; idx++) {
                depot_stack_handle_t handle = stats->stacks[idx].stack_handle;

                /* Nothing rendered: make sure a stale stack is not printed. */
                if (sbuf && !stack_depot_snprint(handle, sbuf, STACK_BUF_SIZE, 4))
                        sbuf[0] = 0;

                pr_ostream(s, "%s@%pK has %d/%d users at\n%s\n", dir->name, dir,
                           stats->stacks[idx].count, stats->total, sbuf);

                unreported -= stats->stacks[idx].count;
        }

        if (unreported)
                pr_ostream(s, "%s@%pK skipped reports about %d/%d users.\n",
                           dir->name, dir, unreported, stats->total);

        kfree(sbuf);
        kfree(stats);
}

void ref_tracker_dir_print_locked(struct ref_tracker_dir *dir,
                                  unsigned int display_limit)
{
        struct ostream os = {};

        __ref_tracker_dir_pr_ostream(dir, display_limit, &os);
}
EXPORT_SYMBOL(ref_tracker_dir_print_locked);

/*
 * ref_tracker_dir_print - log the live references of @dir
 * @dir: directory to report on
 * @display_limit: maximum number of distinct allocation stacks to show
 *
 * Takes dir->lock (IRQ-safe) around ref_tracker_dir_print_locked().
 */
void ref_tracker_dir_print(struct ref_tracker_dir *dir,
                           unsigned int display_limit)
{
        unsigned long irqflags;

        spin_lock_irqsave(&dir->lock, irqflags);
        ref_tracker_dir_print_locked(dir, display_limit);
        spin_unlock_irqrestore(&dir->lock, irqflags);
}
EXPORT_SYMBOL(ref_tracker_dir_print);

/*
 * ref_tracker_dir_snprint - write the live-reference report into a buffer
 * @dir: directory to report on
 * @buf: destination buffer
 * @size: capacity of @buf
 *
 * At most 16 distinct allocation stacks are reported. dir->lock is taken
 * (IRQ-safe) while the report is produced.
 *
 * Return: the number of bytes accounted as used in @buf.
 */
int ref_tracker_dir_snprint(struct ref_tracker_dir *dir, char *buf, size_t size)
{
        struct ostream sink = { .buf = buf, .size = size, .used = 0 };
        unsigned long irqflags;

        spin_lock_irqsave(&dir->lock, irqflags);
        __ref_tracker_dir_pr_ostream(dir, 16, &sink);
        spin_unlock_irqrestore(&dir->lock, irqflags);

        return sink.used;
}
EXPORT_SYMBOL(ref_tracker_dir_snprint);

/*
 * ref_tracker_dir_exit - tear down a tracking directory
 * @dir: directory being destroyed
 *
 * Marks @dir dead, frees every quarantined tracker (returning its
 * quarantine slot) and, if live trackers remain, reports up to 16 of their
 * allocation stacks before freeing them too. Warns once about such a leak
 * and about unbalanced ->untracked / ->no_tracker counts.
 */
void ref_tracker_dir_exit(struct ref_tracker_dir *dir)
{
        struct ref_tracker *tracker, *next;
        unsigned long irqflags;
        bool leak;

        dir->dead = true;

        spin_lock_irqsave(&dir->lock, irqflags);

        list_for_each_entry_safe(tracker, next, &dir->quarantine, head) {
                list_del(&tracker->head);
                kfree(tracker);
                dir->quarantine_avail++;
        }

        /* Anything still on ->list is a reference that was never released. */
        leak = !list_empty(&dir->list);
        if (leak)
                ref_tracker_dir_print_locked(dir, 16);

        list_for_each_entry_safe(tracker, next, &dir->list, head) {
                list_del(&tracker->head);
                kfree(tracker);
        }

        spin_unlock_irqrestore(&dir->lock, irqflags);

        WARN_ON_ONCE(leak);
        WARN_ON_ONCE(refcount_read(&dir->untracked) != 1);
        WARN_ON_ONCE(refcount_read(&dir->no_tracker) != 1);
}
EXPORT_SYMBOL(ref_tracker_dir_exit);

/*
 * ref_tracker_alloc - start tracking one reference
 * @dir: directory the reference belongs to
 * @trackerp: where to store the new tracker; NULL means the caller keeps
 *            no tracker and only dir->no_tracker is incremented
 * @gfp: allocation flags
 *
 * The tracker allocation never warns and, when @gfp allows direct reclaim,
 * never fails. On allocation failure *@trackerp is set to NULL and the
 * reference is accounted in dir->untracked instead. Otherwise the caller's
 * stack is recorded and the tracker is added to dir->list.
 *
 * Return: 0 on success, -ENOMEM if the tracker could not be allocated.
 */
int ref_tracker_alloc(struct ref_tracker_dir *dir,
                      struct ref_tracker **trackerp,
                      gfp_t gfp)
{
        unsigned long stack[REF_TRACKER_STACK_ENTRIES];
        struct ref_tracker *tracker;
        unsigned long irqflags;
        unsigned int depth;
        gfp_t alloc_gfp;

        WARN_ON_ONCE(dir->dead);

        if (!trackerp) {
                refcount_inc(&dir->no_tracker);
                return 0;
        }

        alloc_gfp = gfp | __GFP_NOWARN;
        if (gfp & __GFP_DIRECT_RECLAIM)
                alloc_gfp |= __GFP_NOFAIL;

        tracker = kzalloc(sizeof(*tracker), alloc_gfp);
        *trackerp = tracker;
        if (unlikely(!tracker)) {
                pr_err_once("memory allocation failure, unreliable refcount tracker.\n");
                refcount_inc(&dir->untracked);
                return -ENOMEM;
        }

        depth = stack_trace_save(stack, ARRAY_SIZE(stack), 1);
        tracker->alloc_stack_handle = stack_depot_save(stack, depth, gfp);

        spin_lock_irqsave(&dir->lock, irqflags);
        list_add(&tracker->head, &dir->list);
        spin_unlock_irqrestore(&dir->lock, irqflags);

        return 0;
}
EXPORT_SYMBOL_GPL(ref_tracker_alloc);

/*
 * ref_tracker_free - stop tracking one reference
 * @dir: directory the reference belongs to
 * @trackerp: location of the tracker filled in by ref_tracker_alloc(), or
 *            NULL if the reference was taken without a tracker
 *
 * A released tracker is not freed right away: it is marked dead, gets the
 * releasing stack recorded and is moved to dir->quarantine, so that a
 * second release of the same tracker can be reported together with both
 * stacks.
 *
 * Return: 0 on success, -EEXIST if *@trackerp is NULL, -EINVAL if the
 * tracker had already been released.
 */
int ref_tracker_free(struct ref_tracker_dir *dir,
                     struct ref_tracker **trackerp)
{
        unsigned long entries[REF_TRACKER_STACK_ENTRIES];
        depot_stack_handle_t stack_handle;
        struct ref_tracker *tracker;
        unsigned int nr_entries;
        unsigned long flags;

        WARN_ON_ONCE(dir->dead);

        /* Reference taken without a tracker: only the counter is balanced. */
        if (!trackerp) {
                refcount_dec(&dir->no_tracker);
                return 0;
        }
        tracker = *trackerp;
        /* ref_tracker_alloc() stores NULL here when its allocation failed. */
        if (!tracker) {
                refcount_dec(&dir->untracked);
                return -EEXIST;
        }
        /* Save the releasing stack before dir->lock is taken. */
        nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1);
        stack_handle = stack_depot_save(entries, nr_entries,
                                        GFP_NOWAIT | __GFP_NOWARN);

        spin_lock_irqsave(&dir->lock, flags);
        /* Second release of the same tracker: report both recorded stacks. */
        if (tracker->dead) {
                pr_err("reference already released.\n");
                if (tracker->alloc_stack_handle) {
                        pr_err("allocated in:\n");
                        stack_depot_print(tracker->alloc_stack_handle);
                }
                if (tracker->free_stack_handle) {
                        pr_err("freed in:\n");
                        stack_depot_print(tracker->free_stack_handle);
                }
                spin_unlock_irqrestore(&dir->lock, flags);
                WARN_ON_ONCE(1);
                return -EINVAL;
        }
        tracker->dead = true;

        tracker->free_stack_handle = stack_handle;

        /*
         * Queue the tracker at the quarantine tail. With no quarantine slot
         * left, the entry at the head of the quarantine is unlinked and
         * freed below; otherwise one slot is consumed and nothing is freed
         * (kfree(NULL) is a no-op).
         */
        list_move_tail(&tracker->head, &dir->quarantine);
        if (!dir->quarantine_avail) {
                tracker = list_first_entry(&dir->quarantine, struct ref_tracker, head);
                list_del(&tracker->head);
        } else {
                dir->quarantine_avail--;
                tracker = NULL;
        }
        spin_unlock_irqrestore(&dir->lock, flags);

        kfree(tracker);
        return 0;
}
EXPORT_SYMBOL_GPL(ref_tracker_free);
























    1 











    1 




























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
// SPDX-License-Identifier: GPL-2.0
/*
 * This file contains functions which manage high resolution tick
 * related events.
 *
 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
 */
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>

#include "tick-internal.h"

/**
 * tick_program_event - program the CPU local timer device for the next event
 * @expires: expiry time of the next event; KTIME_MAX stops the device
 * @force: passed through to clockevents_program_event()
 *
 * Return: 0 when the device was stopped, otherwise the result of
 * clockevents_program_event().
 */
int tick_program_event(ktime_t expires, int force)
{
        struct clock_event_device *evt;

        evt = __this_cpu_read(tick_cpu_device.evtdev);

        if (likely(expires != KTIME_MAX)) {
                /*
                 * A previously stopped device has to be put back into
                 * ONESHOT state before it can be programmed again.
                 */
                if (unlikely(clockevent_state_oneshot_stopped(evt)))
                        clockevents_switch_state(evt, CLOCK_EVT_STATE_ONESHOT);

                return clockevents_program_event(evt, expires, force);
        }

        /* No event pending: the clock event device is not needed, stop it. */
        clockevents_switch_state(evt, CLOCK_EVT_STATE_ONESHOT_STOPPED);
        evt->next_event = KTIME_MAX;

        return 0;
}

/**
 * tick_resume_oneshot - resume oneshot mode
 */
void tick_resume_oneshot(void)
{
        struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);

        clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
        clockevents_program_event(dev, ktime_get(), true);
}

/**
 * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
 * @newdev: clock event device to set up
 * @handler: event handler to install on @newdev
 * @next_event: expiry time of the first event to program
 *
 * Installs @handler, switches @newdev to ONESHOT state and force-programs
 * it for @next_event.
 */
void tick_setup_oneshot(struct clock_event_device *newdev,
                        void (*handler)(struct clock_event_device *),
                        ktime_t next_event)
{
        newdev->event_handler = handler;
        clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT);
        clockevents_program_event(newdev, next_event, true);
}

/**
 * tick_switch_to_oneshot - switch to oneshot mode
 * @handler: event handler to install on this CPU's tick device
 *
 * The switch is refused, with the reason logged, when this CPU has no tick
 * device, or the device lacks CLOCK_EVT_FEAT_ONESHOT or is not functional.
 *
 * Return: 0 on success, -EINVAL if the switch is not possible.
 */
int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
{
        struct tick_device *tick_dev = this_cpu_ptr(&tick_cpu_device);
        struct clock_event_device *evt = tick_dev->evtdev;

        if (!evt || !(evt->features & CLOCK_EVT_FEAT_ONESHOT) ||
            !tick_device_is_functional(evt)) {

                pr_info("Clockevents: could not switch to one-shot mode:");
                if (!evt)
                        pr_cont(" no tick device\n");
                else if (!tick_device_is_functional(evt))
                        pr_cont(" %s is not functional.\n", evt->name);
                else
                        pr_cont(" %s does not support one-shot mode.\n",
                                evt->name);

                return -EINVAL;
        }

        tick_dev->mode = TICKDEV_MODE_ONESHOT;
        evt->event_handler = handler;
        clockevents_switch_state(evt, CLOCK_EVT_STATE_ONESHOT);
        tick_broadcast_switch_to_oneshot();

        return 0;
}

/**
 * tick_oneshot_mode_active - check whether the system is in oneshot mode
 *
 * The per-CPU tick device mode is sampled with interrupts disabled.
 *
 * returns 1 when either nohz or highres are enabled. otherwise 0.
 */
int tick_oneshot_mode_active(void)
{
        unsigned long irqflags;
        int active;

        local_irq_save(irqflags);
        active = (__this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT);
        local_irq_restore(irqflags);

        return active;
}

#ifdef CONFIG_HIGH_RES_TIMERS
/**
 * tick_init_highres - switch to high resolution mode
 *
 * Called with interrupts disabled.
 *
 * Return: the result of tick_switch_to_oneshot() with hrtimer_interrupt
 * installed as the event handler.
 */
int tick_init_highres(void)
{
        return tick_switch_to_oneshot(hrtimer_interrupt);
}
#endif












































































































































































    1 




    1 


    1 



























    1 






    1 
































    1 




    1 






















    1 











    1 





































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/kernfs/inode.c - kernfs inode implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 */

#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include <linux/security.h>

#include "kernfs-internal.h"

/*
 * Default inode operations. kernfs_init_inode() installs these first and
 * then overrides ->i_op for directories and symlinks.
 */
static const struct inode_operations kernfs_iops = {
        .permission        = kernfs_iop_permission,
        .setattr        = kernfs_iop_setattr,
        .getattr        = kernfs_iop_getattr,
        .listxattr        = kernfs_iop_listxattr,
};

/*
 * __kernfs_iattrs - look up, and optionally create, the iattrs of a node
 * @kn: node of interest
 * @alloc: non-zero to allocate kn->iattr if it does not exist yet
 *
 * Freshly allocated attributes default to root uid/gid, the current real
 * time for all three timestamps, and empty xattr state. Lookup and
 * creation are serialised by a function-local mutex.
 *
 * Return: kn->iattr, which is NULL if it was never allocated and either
 * @alloc is zero or the allocation failed.
 */
static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc)
{
        static DEFINE_MUTEX(iattr_mutex);
        struct kernfs_iattrs *attrs;

        mutex_lock(&iattr_mutex);

        if (!kn->iattr && alloc) {
                kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
                if (kn->iattr) {
                        /* assign default attributes */
                        kn->iattr->ia_uid = GLOBAL_ROOT_UID;
                        kn->iattr->ia_gid = GLOBAL_ROOT_GID;

                        ktime_get_real_ts64(&kn->iattr->ia_atime);
                        kn->iattr->ia_mtime = kn->iattr->ia_atime;
                        kn->iattr->ia_ctime = kn->iattr->ia_atime;

                        simple_xattrs_init(&kn->iattr->xattrs);
                        atomic_set(&kn->iattr->nr_user_xattrs, 0);
                        atomic_set(&kn->iattr->user_xattr_size, 0);
                }
        }

        attrs = kn->iattr;
        mutex_unlock(&iattr_mutex);

        return attrs;
}

/*
 * Return the iattrs of @kn, allocating them if they do not exist yet.
 * NULL is returned only if that allocation fails.
 */
static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
{
        return __kernfs_iattrs(kn, 1);
}

/* Return the iattrs of @kn if they already exist, NULL otherwise. */
static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn)
{
        return __kernfs_iattrs(kn, 0);
}

/*
 * __kernfs_setattr - copy the requested attributes into a node
 * @kn: target node
 * @iattr: attributes to apply; only fields flagged in ->ia_valid are used
 *
 * uid, gid and the timestamps are stored in the node's iattrs (allocated
 * on demand); the mode is stored in kn->mode. No locking is done here.
 *
 * Return: 0 on success, -ENOMEM if the iattrs could not be allocated.
 */
int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
{
        const unsigned int valid = iattr->ia_valid;
        struct kernfs_iattrs *ka = kernfs_iattrs(kn);

        if (!ka)
                return -ENOMEM;

        if (valid & ATTR_UID)
                ka->ia_uid = iattr->ia_uid;

        if (valid & ATTR_GID)
                ka->ia_gid = iattr->ia_gid;

        if (valid & ATTR_ATIME)
                ka->ia_atime = iattr->ia_atime;

        if (valid & ATTR_MTIME)
                ka->ia_mtime = iattr->ia_mtime;

        if (valid & ATTR_CTIME)
                ka->ia_ctime = iattr->ia_ctime;

        if (valid & ATTR_MODE)
                kn->mode = iattr->ia_mode;

        return 0;
}

/**
 * kernfs_setattr - set iattr on a node
 * @kn: target node
 * @iattr: iattr to set
 *
 * Applies @iattr while holding the root's kernfs_iattr_rwsem for writing.
 *
 * Return: %0 on success, -errno on failure.
 */
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
{
        struct kernfs_root *root = kernfs_root(kn);
        int err;

        down_write(&root->kernfs_iattr_rwsem);
        err = __kernfs_setattr(kn, iattr);
        up_write(&root->kernfs_iattr_rwsem);

        return err;
}

/*
 * kernfs_iop_setattr - ->setattr() inode operation
 * @idmap: idmap of the mount; not used, nop_mnt_idmap is used instead
 * @dentry: dentry whose inode is being changed
 * @iattr: requested attribute changes
 *
 * Under the root's kernfs_iattr_rwsem (held for writing) the request is
 * validated, stored in the kernfs node and finally copied to the inode.
 *
 * Return: 0 on success, -EINVAL if the inode has no kernfs node, or the
 * error from setattr_prepare() / __kernfs_setattr().
 */
int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                       struct iattr *iattr)
{
        struct inode *inode = d_inode(dentry);
        struct kernfs_node *kn = inode->i_private;
        struct kernfs_root *root;
        int error;

        if (!kn)
                return -EINVAL;

        root = kernfs_root(kn);
        down_write(&root->kernfs_iattr_rwsem);

        error = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
        if (!error)
                error = __kernfs_setattr(kn, iattr);

        /* this ignores size changes */
        if (!error)
                setattr_copy(&nop_mnt_idmap, inode, iattr);

        up_write(&root->kernfs_iattr_rwsem);

        return error;
}

/*
 * kernfs_iop_listxattr - ->listxattr() inode operation
 * @dentry: dentry whose extended attributes are listed
 * @buf: destination buffer
 * @size: capacity of @buf
 *
 * The node's iattrs are allocated on demand before listing.
 *
 * Return: the result of simple_xattr_list(), or -ENOMEM if the iattrs
 * could not be allocated.
 */
ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
{
        struct kernfs_iattrs *attrs = kernfs_iattrs(kernfs_dentry_node(dentry));

        if (attrs)
                return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);

        return -ENOMEM;
}

/* Give @inode the mode @mode and initialise its timestamps. */
static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
{
        inode->i_mode = mode;
        simple_inode_init_ts(inode);
}

/* Copy uid, gid and the three timestamps from @attrs into @inode. */
static inline void set_inode_attr(struct inode *inode,
                                  struct kernfs_iattrs *attrs)
{
        inode->i_uid = attrs->ia_uid;
        inode->i_gid = attrs->ia_gid;
        inode_set_atime_to_ts(inode, attrs->ia_atime);
        inode_set_mtime_to_ts(inode, attrs->ia_mtime);
        inode_set_ctime_to_ts(inode, attrs->ia_ctime);
}

/*
 * kernfs_refresh_inode - bring @inode up to date with its kernfs node
 * @kn: source node
 * @inode: inode to update
 *
 * Copies the mode, the persistent attributes (if the node has any) and,
 * for directories, a link count of subdirs + 2.
 */
static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
{
        struct kernfs_iattrs *saved = kn->iattr;

        inode->i_mode = kn->mode;

        /*
         * kernfs_node has non-default attributes get them from
         * persistent copy in kernfs_node.
         */
        if (saved) {
                set_inode_attr(inode, saved);
        }

        if (kernfs_type(kn) != KERNFS_DIR)
                return;

        set_nlink(inode, kn->dir.subdirs + 2);
}

/*
 * getattr inode operation for kernfs.
 *
 * Refreshes the inode from its kernfs_node while holding the root's
 * kernfs_iattr_rwsem for reading, then fills @stat with
 * generic_fillattr().  @idmap and @query_flags are not consulted here;
 * nop_mnt_idmap is always passed on.  Always returns 0.
 */
int kernfs_iop_getattr(struct mnt_idmap *idmap,
                       const struct path *path, struct kstat *stat,
                       u32 request_mask, unsigned int query_flags)
{
        struct inode *inode;
        struct kernfs_node *kn;
        struct kernfs_root *root;

        inode = d_inode(path->dentry);
        kn = inode->i_private;
        root = kernfs_root(kn);

        down_read(&root->kernfs_iattr_rwsem);
        kernfs_refresh_inode(kn, inode);
        generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
        up_read(&root->kernfs_iattr_rwsem);

        return 0;
}

static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
{
        kernfs_get(kn);
        inode->i_private = kn;
        inode->i_mapping->a_ops = &ram_aops;
        inode->i_op = &kernfs_iops;
        inode->i_generation = kernfs_gen(kn);

        set_default_inode_attr(inode, kn->mode);
        kernfs_refresh_inode(kn, inode);

        /* initialize inode according to type */
        switch (kernfs_type(kn)) {
        case KERNFS_DIR:
                inode->i_op = &kernfs_dir_iops;
                inode->i_fop = &kernfs_dir_fops;
                if (kn->flags & KERNFS_EMPTY_DIR)
                        make_empty_dir_inode(inode);
                break;
        case KERNFS_FILE:
                inode->i_size = kn->attr.size;
                inode->i_fop = &kernfs_file_fops;
                break;
        case KERNFS_LINK:
                inode->i_op = &kernfs_symlink_iops;
                break;
        default:
                BUG();
        }

        unlock_new_inode(inode);
}

/**
 *        kernfs_get_inode - get inode for kernfs_node
 *        @sb: super block
 *        @kn: kernfs_node to allocate inode for
 *
 *        Get inode for @kn.  If such inode doesn't exist, a new inode is
 *        allocated and basics are initialized.  New inode is returned
 *        locked.
 *
 *        Locking:
 *        Kernel thread context (may sleep).
 *
 *        Return:
 *        Pointer to allocated inode on success, %NULL on failure.
 */
struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
{
        struct inode *inode;

        inode = iget_locked(sb, kernfs_ino(kn));
        if (inode && (inode->i_state & I_NEW))
                kernfs_init_inode(kn, inode);

        return inode;
}

/*
 * A kernfs_node acts as both the inode and the directory entry of a
 * kernfs object.  Each kernfs inode holds a reference on its kernfs_node
 * so that the kernfs inode number cannot be freed too early; this
 * super_operations.evict_inode() implementation drops that reference
 * once the inode is being destroyed.
 */
void kernfs_evict_inode(struct inode *inode)
{
        /* Fetch the node first; its reference is dropped last. */
        struct kernfs_node *node = inode->i_private;

        truncate_inode_pages_final(&inode->i_data);
        clear_inode(inode);

        kernfs_put(node);
}

/*
 * permission inode operation for kernfs.
 *
 * Returns -ECHILD when @mask contains MAY_NOT_BLOCK.  Otherwise the inode
 * is refreshed from its kernfs_node under the root's kernfs_iattr_rwsem
 * (held for reading) and the verdict of generic_permission() is
 * returned.  @idmap is not consulted; nop_mnt_idmap is passed on.
 */
int kernfs_iop_permission(struct mnt_idmap *idmap,
                          struct inode *inode, int mask)
{
        struct kernfs_node *kn;
        struct kernfs_root *root;
        int rc;

        if ((mask & MAY_NOT_BLOCK) != 0)
                return -ECHILD;

        kn = inode->i_private;
        root = kernfs_root(kn);

        down_read(&root->kernfs_iattr_rwsem);

        kernfs_refresh_inode(kn, inode);
        rc = generic_permission(&nop_mnt_idmap, inode, mask);

        up_read(&root->kernfs_iattr_rwsem);

        return rc;
}

/*
 * Read extended attribute @name of @kn into @value (up to @size bytes).
 *
 * Returns -ENODATA when kernfs_iattrs_noalloc() reports no attribute
 * container for the node, otherwise the result of simple_xattr_get().
 */
int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
                     void *value, size_t size)
{
        struct kernfs_iattrs *iattrs = kernfs_iattrs_noalloc(kn);

        if (iattrs)
                return simple_xattr_get(&iattrs->xattrs, name, value, size);

        return -ENODATA;
}

/*
 * Set extended attribute @name of @kn to @value/@size, honouring @flags.
 *
 * Returns -ENOMEM when kernfs_iattrs() yields no attribute container,
 * the error encoded by simple_xattr_set() when it fails, and 0 on
 * success.  Any previous attribute handed back by simple_xattr_set() is
 * released with simple_xattr_free().
 */
int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
                     const void *value, size_t size, int flags)
{
        struct kernfs_iattrs *iattrs = kernfs_iattrs(kn);
        struct simple_xattr *replaced;

        if (iattrs == NULL)
                return -ENOMEM;

        replaced = simple_xattr_set(&iattrs->xattrs, name, value, size, flags);
        if (!IS_ERR(replaced)) {
                simple_xattr_free(replaced);
                return 0;
        }

        return PTR_ERR(replaced);
}

/*
 * xattr_handler ->get callback: rebuild the full attribute name from
 * @handler and @suffix and read it from the kernfs_node stored in
 * inode->i_private.  The dentry argument is not used.
 */
static int kernfs_vfs_xattr_get(const struct xattr_handler *handler,
                                struct dentry *unused, struct inode *inode,
                                const char *suffix, void *value, size_t size)
{
        return kernfs_xattr_get(inode->i_private,
                                xattr_full_name(handler, suffix),
                                value, size);
}

/*
 * xattr_handler ->set callback: rebuild the full attribute name from
 * @handler and @suffix and store it on the kernfs_node kept in
 * inode->i_private.  The idmap and dentry arguments are not used.
 */
static int kernfs_vfs_xattr_set(const struct xattr_handler *handler,
                                struct mnt_idmap *idmap,
                                struct dentry *unused, struct inode *inode,
                                const char *suffix, const void *value,
                                size_t size, int flags)
{
        return kernfs_xattr_set(inode->i_private,
                                xattr_full_name(handler, suffix),
                                value, size, flags);
}

/*
 * Add or replace the user xattr @full_name on @kn while keeping the
 * per-node accounting (number of user xattrs and their total size)
 * within KERNFS_MAX_USER_XATTRS and KERNFS_USER_XATTR_SIZE_LIMIT.
 *
 * The counters are charged optimistically before the attribute is set
 * and rolled back through the labels at the bottom when the charge turns
 * out not to be needed or allowed.
 *
 * Returns 0 on success, -ENOSPC when either limit would be exceeded, or
 * the error encoded by simple_xattr_set().
 */
static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
                                     const char *full_name,
                                     struct simple_xattrs *xattrs,
                                     const void *value, size_t size, int flags)
{
        atomic_t *sz = &kn->iattr->user_xattr_size;
        atomic_t *nr = &kn->iattr->nr_user_xattrs;
        struct simple_xattr *old_xattr;
        int ret;

        /* Charge one more attribute; undo the charge if over the limit. */
        if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) {
                ret = -ENOSPC;
                goto dec_count_out;
        }

        /* Charge the new value's size; undo both charges if over the limit. */
        if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) {
                ret = -ENOSPC;
                goto dec_size_out;
        }

        old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags);
        /* No previous attribute was handed back: both charges stand. */
        if (!old_xattr)
                return 0;

        /* Setting failed: release both charges taken above. */
        if (IS_ERR(old_xattr)) {
                ret = PTR_ERR(old_xattr);
                goto dec_size_out;
        }

        /*
         * An existing attribute was replaced: release the old value's size
         * and the extra count, since the number of attributes is unchanged.
         */
        ret = 0;
        size = old_xattr->size;
        simple_xattr_free(old_xattr);
dec_size_out:
        atomic_sub(size, sz);
dec_count_out:
        atomic_dec(nr);
        return ret;
}

/*
 * Remove the user xattr @full_name from @kn via simple_xattr_set() and,
 * when an attribute was actually handed back, subtract its size and
 * count from the per-node user xattr accounting before freeing it.
 *
 * Returns 0 on success (including when nothing was handed back) or the
 * error encoded by simple_xattr_set().
 */
static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
                                    const char *full_name,
                                    struct simple_xattrs *xattrs,
                                    const void *value, size_t size, int flags)
{
        atomic_t *sz = &kn->iattr->user_xattr_size;
        atomic_t *nr = &kn->iattr->nr_user_xattrs;
        struct simple_xattr *removed;

        removed = simple_xattr_set(xattrs, full_name, value, size, flags);
        if (IS_ERR(removed))
                return PTR_ERR(removed);

        if (removed) {
                atomic_sub(removed->size, sz);
                atomic_dec(nr);
                simple_xattr_free(removed);
        }

        return 0;
}

/*
 * xattr_handler ->set callback for the user.* namespace.
 *
 * Refuses with -EOPNOTSUPP unless the kernfs root has
 * KERNFS_ROOT_SUPPORT_USER_XATTR set, and with -ENOMEM when
 * kernfs_iattrs() yields no attribute container.  A NULL @value is
 * treated as a removal, anything else as an add/replace.
 */
static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
                                     struct mnt_idmap *idmap,
                                     struct dentry *unused, struct inode *inode,
                                     const char *suffix, const void *value,
                                     size_t size, int flags)
{
        struct kernfs_node *kn = inode->i_private;
        struct kernfs_iattrs *iattrs;
        const char *full_name;

        if ((kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR) == 0)
                return -EOPNOTSUPP;

        iattrs = kernfs_iattrs(kn);
        if (iattrs == NULL)
                return -ENOMEM;

        full_name = xattr_full_name(handler, suffix);

        if (value == NULL)
                return kernfs_vfs_user_xattr_rm(kn, full_name, &iattrs->xattrs,
                                                value, size, flags);

        return kernfs_vfs_user_xattr_add(kn, full_name, &iattrs->xattrs,
                                         value, size, flags);
}

/* Handler for the trusted.* namespace: plain get/set on the node's xattrs. */
static const struct xattr_handler kernfs_trusted_xattr_handler = {
        .prefix = XATTR_TRUSTED_PREFIX,
        .get = kernfs_vfs_xattr_get,
        .set = kernfs_vfs_xattr_set,
};

/* Handler for the security.* namespace: plain get/set on the node's xattrs. */
static const struct xattr_handler kernfs_security_xattr_handler = {
        .prefix = XATTR_SECURITY_PREFIX,
        .get = kernfs_vfs_xattr_get,
        .set = kernfs_vfs_xattr_set,
};

/*
 * Handler for the user.* namespace: reads use the common getter, writes
 * go through kernfs_vfs_user_xattr_set(), which requires
 * KERNFS_ROOT_SUPPORT_USER_XATTR and enforces the per-node count and
 * size limits.
 */
static const struct xattr_handler kernfs_user_xattr_handler = {
        .prefix = XATTR_USER_PREFIX,
        .get = kernfs_vfs_xattr_get,
        .set = kernfs_vfs_user_xattr_set,
};

/* NULL-terminated table of the xattr handlers kernfs provides. */
const struct xattr_handler * const kernfs_xattr_handlers[] = {
        &kernfs_trusted_xattr_handler,
        &kernfs_security_xattr_handler,
        &kernfs_user_xattr_handler,
        NULL
};





















































































































































































































































































    1 
















































    1 











    1 









































































































































































































































































































































































































































































































    1 
















































































    1 





    1 


    1 





















    1 





    1 






    1 



    1 




    1 

    1 

    1 






    1 
    1 






















    1 












    1 


















    1 











    1 


















    1 



















    1 
    1 
    1 









    1 


    1 








































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001-2003 International Business Machines, Corp.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 Nokia, Inc.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions handle all input from the IP layer into SCTP.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson <karl@athena.chicago.il.us>
 *    Xingang Guo <xingang.guo@intel.com>
 *    Jon Grimm <jgrimm@us.ibm.com>
 *    Hui Huang <hui.huang@nokia.com>
 *    Daisy Chang <daisyc@us.ibm.com>
 *    Sridhar Samudrala <sri@us.ibm.com>
 *    Ardelle Fan <ardelle.fan@intel.com>
 */

#include <linux/types.h>
#include <linux/list.h> /* For struct list_head */
#include <linux/socket.h>
#include <linux/ip.h>
#include <linux/time.h> /* For struct timeval */
#include <linux/slab.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/snmp.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <net/sctp/checksum.h>
#include <net/net_namespace.h>
#include <linux/rhashtable.h>
#include <net/sock_reuseport.h>

/* Forward declarations for internal helpers. */
static int sctp_rcv_ootb(struct sk_buff *);
static struct sctp_association *__sctp_rcv_lookup(struct net *net,
                                      struct sk_buff *skb,
                                      const union sctp_addr *paddr,
                                      const union sctp_addr *laddr,
                                      struct sctp_transport **transportp,
                                      int dif, int sdif);
static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
                                        struct net *net, struct sk_buff *skb,
                                        const union sctp_addr *laddr,
                                        const union sctp_addr *daddr,
                                        int dif, int sdif);
static struct sctp_association *__sctp_lookup_association(
                                        struct net *net,
                                        const union sctp_addr *local,
                                        const union sctp_addr *peer,
                                        struct sctp_transport **pt,
                                        int dif, int sdif);

static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb);


/* Calculate the SCTP checksum of an SCTP packet.  */
static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb)
{
        struct sctphdr *sh = sctp_hdr(skb);
        __le32 cmp = sh->checksum;
        __le32 val = sctp_compute_cksum(skb, 0);

        if (val != cmp) {
                /* CRC failure, dump it. */
                __SCTP_INC_STATS(net, SCTP_MIB_CHECKSUMERRORS);
                return -1;
        }
        return 0;
}

/*
 * This is the routine which IP calls when receiving an SCTP packet.
 *
 * Validates the packet (destination type, minimum length, checksum,
 * address family, unicast addresses), looks up the association or
 * endpoint it belongs to, wraps it in an sctp_chunk and either pushes it
 * onto the receiver's inqueue or, when the socket is owned by a user
 * context or not yet ready, onto the socket backlog.
 *
 * Every exit path returns 0; packets that are rejected are counted in
 * SCTP_MIB_IN_PKT_DISCARDS and freed here.
 */
int sctp_rcv(struct sk_buff *skb)
{
        struct sock *sk;
        struct sctp_association *asoc;
        struct sctp_endpoint *ep = NULL;
        struct sctp_ep_common *rcvr;
        struct sctp_transport *transport = NULL;
        struct sctp_chunk *chunk;
        union sctp_addr src;
        union sctp_addr dest;
        int family;
        struct sctp_af *af;
        struct net *net = dev_net(skb->dev);
        bool is_gso = skb_is_gso(skb) && skb_is_gso_sctp(skb);
        int dif, sdif;

        /* Only packets addressed to this host are processed. */
        if (skb->pkt_type != PACKET_HOST)
                goto discard_it;

        __SCTP_INC_STATS(net, SCTP_MIB_INSCTPPACKS);

        /* If packet is too small to contain a single chunk, let's not
         * waste time on it anymore.
         */
        if (skb->len < sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) +
                       skb_transport_offset(skb))
                goto discard_it;

        /* If the packet is fragmented and we need to do crc checking,
         * it's better to just linearize it otherwise crc computing
         * takes longer.  GSO packets are not linearized here.
         */
        if ((!is_gso && skb_linearize(skb)) ||
            !pskb_may_pull(skb, sizeof(struct sctphdr)))
                goto discard_it;

        /* Pull up the IP header. */
        __skb_pull(skb, skb_transport_offset(skb));

        /* The checksum is verified in software unless the skb is already
         * marked as not needing it, checksumming is disabled via
         * sctp_checksum_disable, or this is a GSO packet.
         */
        skb->csum_valid = 0; /* Previous value not applicable */
        if (skb_csum_unnecessary(skb))
                __skb_decr_checksum_unnecessary(skb);
        else if (!sctp_checksum_disable &&
                 !is_gso &&
                 sctp_rcv_checksum(net, skb) < 0)
                goto discard_it;
        skb->csum_valid = 1;

        /* Step past the SCTP common header. */
        __skb_pull(skb, sizeof(struct sctphdr));

        family = ipver2af(ip_hdr(skb)->version);
        af = sctp_get_af_specific(family);
        if (unlikely(!af))
                goto discard_it;
        SCTP_INPUT_CB(skb)->af = af;

        /* Initialize local addresses for lookups. */
        af->from_skb(&src, skb, 1);
        af->from_skb(&dest, skb, 0);
        dif = af->skb_iif(skb);
        sdif = af->skb_sdif(skb);

        /* If the packet is to or from a non-unicast address,
         * silently discard the packet.
         *
         * This is not clearly defined in the RFC except in section
         * 8.4 - OOTB handling.  However, based on the book "Stream Control
         * Transmission Protocol" 2.1, "It is important to note that the
         * IP address of an SCTP transport address must be a routable
         * unicast address.  In other words, IP multicast addresses and
         * IP broadcast addresses cannot be used in an SCTP transport
         * address."
         */
        if (!af->addr_valid(&src, NULL, skb) ||
            !af->addr_valid(&dest, NULL, skb))
                goto discard_it;

        asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport, dif, sdif);

        /* No association matched: fall back to an endpoint lookup.
         * NOTE(review): ep is dereferenced below without a NULL check;
         * presumably __sctp_rcv_lookup_endpoint() always returns an
         * endpoint - confirm.
         */
        if (!asoc)
                ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src, dif, sdif);

        /* Retrieve the common input handling substructure. */
        rcvr = asoc ? &asoc->base : &ep->base;
        sk = rcvr->sk;

        /*
         * RFC 2960, 8.4 - Handle "Out of the blue" Packets.
         * An SCTP packet is called an "out of the blue" (OOTB)
         * packet if it is correctly formed, i.e., passed the
         * receiver's checksum check, but the receiver is not
         * able to identify the association to which this
         * packet belongs.
         */
        if (!asoc) {
                if (sctp_rcv_ootb(skb)) {
                        __SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES);
                        goto discard_release;
                }
        }

        if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family))
                goto discard_release;
        nf_reset_ct(skb);

        if (sk_filter(sk, skb))
                goto discard_release;

        /* Create an SCTP packet structure. */
        chunk = sctp_chunkify(skb, asoc, sk, GFP_ATOMIC);
        if (!chunk)
                goto discard_release;
        SCTP_INPUT_CB(skb)->chunk = chunk;

        /* Remember what endpoint is to handle this packet. */
        chunk->rcvr = rcvr;

        /* Remember the SCTP header. */
        chunk->sctp_hdr = sctp_hdr(skb);

        /* Set the source and destination addresses of the incoming chunk.  */
        sctp_init_addrs(chunk, &src, &dest);

        /* Remember where we came from.  */
        chunk->transport = transport;

        /* Acquire access to the sock lock. Note: We are safe from other
         * bottom halves on this lock, but a user may be in the lock too,
         * so check if it is busy.
         */
        bh_lock_sock(sk);

        if (sk != rcvr->sk) {
                /* Our cached sk is different from the rcvr->sk.  This is
                 * because migrate()/accept() may have moved the association
                 * to a new socket and released all the sockets.  So now we
                 * are holding a lock on the old socket while the user may
                 * be doing something with the new socket.  Switch our view
                 * of the current sk.
                 */
                bh_unlock_sock(sk);
                sk = rcvr->sk;
                bh_lock_sock(sk);
        }

        /* Defer to the backlog when a user context owns the socket or the
         * socket is not ready yet; otherwise process in softirq context.
         */
        if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) {
                if (sctp_add_backlog(sk, skb)) {
                        bh_unlock_sock(sk);
                        sctp_chunk_free(chunk);
                        skb = NULL; /* sctp_chunk_free already freed the skb */
                        goto discard_release;
                }
                __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_BACKLOG);
        } else {
                __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_SOFTIRQ);
                sctp_inq_push(&chunk->rcvr->inqueue, chunk);
        }

        bh_unlock_sock(sk);

        /* Release the asoc/ep ref we took in the lookup calls. */
        if (transport)
                sctp_transport_put(transport);
        else
                sctp_endpoint_put(ep);

        return 0;

discard_it:
        /* Count the drop and free the skb (skb may be NULL here when the
         * chunk free above already released it).
         */
        __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_DISCARDS);
        kfree_skb(skb);
        return 0;

discard_release:
        /* Release the asoc/ep ref we took in the lookup calls. */
        if (transport)
                sctp_transport_put(transport);
        else
                sctp_endpoint_put(ep);

        goto discard_it;
}

/* Process the backlog queue of the socket.  Every skb on
 * the backlog holds a ref on an association or endpoint.
 * We hold this ref throughout the state machine to make
 * sure that the structure we need is still around.
 *
 * @sk is the socket whose backlog is being drained and @skb the queued
 * packet; the chunk built for it is taken from SCTP_INPUT_CB(skb).
 * Always returns 0.
 */
int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
        struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
        struct sctp_inq *inqueue = &chunk->rcvr->inqueue;
        struct sctp_transport *t = chunk->transport;
        struct sctp_ep_common *rcvr = NULL;
        /* Set when the skb was queued on a backlog again. */
        int backloged = 0;

        rcvr = chunk->rcvr;

        /* If the rcvr is dead then the association or endpoint
         * has been deleted and we can safely drop the chunk
         * and refs that we are holding.
         */
        if (rcvr->dead) {
                sctp_chunk_free(chunk);
                goto done;
        }

        if (unlikely(rcvr->sk != sk)) {
                /* In this case, the association moved from one socket to
                 * another.  We are currently sitting on the backlog of the
                 * old socket, so we need to move.
                 * However, since we are here in the process context we
                 * need to make sure that the user doesn't own
                 * the new socket when we process the packet.
                 * If the new socket is user-owned, queue the chunk to the
                 * backlog of the new socket without dropping any refs.
                 * Otherwise, we can safely push the chunk on the inqueue.
                 */

                sk = rcvr->sk;
                local_bh_disable();
                bh_lock_sock(sk);

                if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) {
                        /* A failed re-queue drops the chunk. */
                        if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))
                                sctp_chunk_free(chunk);
                        else
                                backloged = 1;
                } else
                        sctp_inq_push(inqueue, chunk);

                bh_unlock_sock(sk);
                local_bh_enable();

                /* If the chunk was backlogged again, don't drop refs */
                if (backloged)
                        return 0;
        } else {
                /* Same socket: if it is not ready yet, try to put the skb
                 * back on the backlog and keep the refs on success;
                 * otherwise hand the chunk to the inqueue.
                 */
                if (!sctp_newsk_ready(sk)) {
                        if (!sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))
                                return 0;
                        sctp_chunk_free(chunk);
                } else {
                        sctp_inq_push(inqueue, chunk);
                }
        }

done:
        /* Release the refs we took in sctp_add_backlog */
        if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type)
                sctp_transport_put(t);
        else if (SCTP_EP_TYPE_SOCKET == rcvr->type)
                sctp_endpoint_put(sctp_ep(rcvr));
        else
                BUG();

        return 0;
}

/* Queue @skb on the backlog of @sk.
 *
 * On success an extra reference is taken on the transport (association
 * receiver) or on the endpoint (socket receiver) of the chunk attached
 * to @skb, so that the structure stays around while the packet waits.
 * Returns the result of sk_add_backlog(): 0 on success, non-zero when
 * the packet could not be queued (no reference is taken then).
 */
static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
        struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
        struct sctp_transport *t = chunk->transport;
        struct sctp_ep_common *rcvr = chunk->rcvr;
        int err;

        err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf));
        if (err)
                return err;

        /* Hold the assoc/ep while the packet hangs on the backlog queue. */
        switch (rcvr->type) {
        case SCTP_EP_TYPE_ASSOCIATION:
                sctp_transport_hold(t);
                break;
        case SCTP_EP_TYPE_SOCKET:
                sctp_endpoint_hold(sctp_ep(rcvr));
                break;
        default:
                BUG();
        }

        return 0;
}

/* Handle an ICMP "fragmentation needed" error reporting @pmtu for
 * transport @t of association @asoc.
 *
 * Nothing happens without a transport, or when neither the transport's
 * path MTU nor its probe size (plus probe header length) exceeds @pmtu.
 * When the socket is owned by a user context the value is only recorded
 * and flagged as pending.
 */
void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
                           struct sctp_transport *t, __u32 pmtu)
{
        if (!t)
                return;

        if (t->pathmtu <= pmtu &&
            t->pl.probe_size + sctp_transport_pl_hlen(t) <= pmtu)
                return;

        if (sock_owned_by_user(sk)) {
                /* Remember the new value and mark it pending on both the
                 * association and the transport.
                 */
                atomic_set(&t->mtu_info, pmtu);
                asoc->pmtu_pending = 1;
                t->pmtu_pending = 1;
                return;
        }

        /* With PMTU discovery disabled a retransmission would be sized
         * exactly as before, trigger another ICMP and loop, so do not
         * retransmit at all.
         */
        if ((t->param_flags & SPP_PMTUD_ENABLE) == 0)
                return;

        /* Update the transport's view of the MTU.  Only when that
         * actually changed something does it make sense to resync the
         * association pmtu and retransmit with the new setting.
         */
        if (sctp_transport_update_pmtu(t, pmtu)) {
                sctp_assoc_sync_pmtu(asoc);
                sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD);
        }
}

/* Handle an ICMP redirect for transport @t.
 *
 * Ignored when the socket is owned by a user context, when there is no
 * transport, or when the transport has no valid cached route; otherwise
 * the route's redirect operation is invoked with @sk and @skb.
 */
void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t,
                        struct sk_buff *skb)
{
        struct dst_entry *route;

        if (sock_owned_by_user(sk))
                return;
        if (!t)
                return;

        route = sctp_transport_dst_check(t);
        if (!route)
                return;

        route->ops->redirect(route, sk, skb);
}

/*
 * SCTP Implementer's Guide, 2.37 ICMP handling procedures
 *
 * ICMP8) If the ICMP code is a "Unrecognized next header type encountered"
 *        or a "Protocol Unreachable" treat this message as an abort
 *        with the T bit set.
 *
 * When the socket is owned by a user context the work is deferred by
 * arming the transport's proto_unreach_timer (unless it is already
 * pending).  Otherwise the timer is cancelled and an
 * SCTP_EVENT_ICMP_PROTO_UNREACH event is sent to the state machine,
 * which will abort the association.
 */
void sctp_icmp_proto_unreachable(struct sock *sk,
                           struct sctp_association *asoc,
                           struct sctp_transport *t)
{
        struct net *net;

        if (sock_owned_by_user(sk)) {
                if (timer_pending(&t->proto_unreach_timer))
                        return;

                /* Hold the transport only when the timer was newly armed. */
                if (!mod_timer(&t->proto_unreach_timer,
                               jiffies + (HZ/20)))
                        sctp_transport_hold(t);
                return;
        }

        net = sock_net(sk);

        pr_debug("%s: unrecognized next header type "
                 "encountered!\n", __func__);

        /* Drop the reference tied to a timer that was still queued. */
        if (del_timer(&t->proto_unreach_timer))
                sctp_transport_put(t);

        sctp_do_sm(net, SCTP_EVENT_T_OTHER,
                   SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
                   asoc->state, asoc->ep, asoc, t,
                   GFP_ATOMIC);
}

/* Common lookup code for icmp/icmpv6 error handler. */
struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb,
                             struct sctphdr *sctphdr,
                             struct sctp_association **app,
                             struct sctp_transport **tpp)
{
        struct sctp_init_chunk *chunkhdr, _chunkhdr;
        union sctp_addr saddr;
        union sctp_addr daddr;
        struct sctp_af *af;
        struct sock *sk = NULL;
        struct sctp_association *asoc;
        struct sctp_transport *transport = NULL;
        __u32 vtag = ntohl(sctphdr->vtag);
        int sdif = inet_sdif(skb);
        int dif = inet_iif(skb);

        *app = NULL; *tpp = NULL;

        af = sctp_get_af_specific(family);
        if (unlikely(!af)) {
                return NULL;
        }

        /* Initialize local addresses for lookups. */
        af->from_skb(&saddr, skb, 1);
        af->from_skb(&daddr, skb, 0);

        /* Look for an association that matches the incoming ICMP error
         * packet.
         */
        asoc = __sctp_lookup_association(net, &saddr, &daddr, &transport, dif, sdif);
        if (!asoc)
                return NULL;

        sk = asoc->base.sk;

        /* RFC 4960, Appendix C. ICMP Handling
         *
         * ICMP6) An implementation MUST validate that the Verification Tag
         * contained in the ICMP message matches the Verification Tag of
         * the peer.  If the Verification Tag is not 0 and does NOT
         * match, discard the ICMP message.  If it is 0 and the ICMP
         * message contains enough bytes to verify that the chunk type is
         * an INIT chunk and that the Initiate Tag matches the tag of the
         * peer, continue with ICMP7.  If the ICMP message is too short
         * or the chunk type or the Initiate Tag does not match, silently
         * discard the packet.
         */
        if (vtag == 0) {
                /* chunk header + first 4 octects of init header */
                chunkhdr = skb_header_pointer(skb, skb_transport_offset(skb) +
                                              sizeof(struct sctphdr),
                                              sizeof(struct sctp_chunkhdr) +
                                              sizeof(__be32), &_chunkhdr);
                if (!chunkhdr ||
                    chunkhdr->chunk_hdr.type != SCTP_CID_INIT ||
                    ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag)
                        goto out;

        } else if (vtag != asoc->c.peer_vtag) {
                goto out;
        }

        bh_lock_sock(sk);

        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         */
        if (sock_owned_by_user(sk))
                __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);

        *app = asoc;
        *tpp = transport;
        return sk;

out:
        sctp_transport_put(transport);
        return NULL;
}

/* Common cleanup code for icmp/icmpv6 error handler.
 *
 * Releases the bottom-half socket lock on @sk and then drops one
 * reference on transport @t, i.e. it undoes what a successful
 * sctp_err_lookup() leaves held.
 */
void sctp_err_finish(struct sock *sk, struct sctp_transport *t)
        __releases(&((__sk)->sk_lock.slock))
{
        bh_unlock_sock(sk);
        sctp_transport_put(t);
}

/* Act on an ICMPv4 error that was matched to transport @t.
 *
 * Redirect, "fragmentation needed" and "protocol unreachable" are passed
 * to their dedicated handlers.  The remaining handled types are mapped to
 * an errno that is reported on the socket: as a hard error when the
 * socket is not owned by a user context and has RECVERR set, otherwise
 * as a soft error.  Unhandled types and codes are ignored.
 */
static void sctp_v4_err_handle(struct sctp_transport *t, struct sk_buff *skb,
                               __u8 type, __u8 code, __u32 info)
{
        struct sctp_association *asoc = t->asoc;
        struct sock *sk = asoc->base.sk;
        int err = 0;

        if (type == ICMP_REDIRECT) {
                sctp_icmp_redirect(sk, t, skb);
                return;
        }

        if (type == ICMP_PARAMETERPROB) {
                err = EPROTO;
        } else if (type == ICMP_TIME_EXCEEDED) {
                if (code == ICMP_EXC_FRAGTIME)
                        return;
                err = EHOSTUNREACH;
        } else if (type == ICMP_DEST_UNREACH) {
                /* Codes beyond the conversion table are not handled. */
                if (code > NR_ICMP_UNREACH)
                        return;

                switch (code) {
                case ICMP_FRAG_NEEDED:
                        sctp_icmp_frag_needed(sk, asoc, t, SCTP_TRUNC4(info));
                        return;
                case ICMP_PROT_UNREACH:
                        sctp_icmp_proto_unreachable(sk, asoc, t);
                        return;
                default:
                        err = icmp_err_convert[code].errno;
                        break;
                }
        } else {
                return;
        }

        if (sock_owned_by_user(sk) || !inet_test_bit(RECVERR, sk)) {
                /* Only an error on timeout */
                WRITE_ONCE(sk->sk_err_soft, err);
                return;
        }

        sk->sk_err = err;
        sk_error_report(sk);
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the sctp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */
int sctp_v4_err(struct sk_buff *skb, __u32 info)
{
        const struct iphdr *inner_iph = (const struct iphdr *)skb->data;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct net *net = dev_net(skb->dev);
        struct sctp_transport *transport;
        struct sctp_association *asoc;
        __u16 saved_nh = skb->network_header;
        __u16 saved_th = skb->transport_header;
        struct sock *sk;

        /* Point the skb headers at the IP/SCTP headers embedded in the
         * ICMP payload for the duration of the lookup.
         */
        skb_reset_network_header(skb);
        skb_set_transport_header(skb, inner_iph->ihl * 4);
        sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &transport);

        /* Restore the header offsets saved above. */
        skb->network_header = saved_nh;
        skb->transport_header = saved_th;

        if (sk) {
                sctp_v4_err_handle(transport, skb, type, code, info);
                sctp_err_finish(sk, transport);
                return 0;
        }

        /* No matching association: count it as an ICMP input error. */
        __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
        return -ENOENT;
}

/* ICMPv4 error handling for SCTP carried over UDP.
 *
 * The transport header is advanced past the UDP header for the
 * association lookup and moved back once the lookup succeeded.
 *
 * Returns -ENOENT when no association matches, 0 for an ICMP redirect
 * (left unhandled here) and 1 once the error has been handled.
 */
int sctp_udp_v4_err(struct sock *sk, struct sk_buff *skb)
{
        struct net *net = dev_net(skb->dev);
        struct sctp_association *asoc;
        struct sctp_transport *t;
        struct icmphdr *hdr;
        __u32 info = 0;
        int handled = 0;

        skb->transport_header += sizeof(struct udphdr);
        sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &t);
        if (!sk) {
                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
                return -ENOENT;
        }
        skb->transport_header -= sizeof(struct udphdr);

        /* The ICMP header is read from directly in front of the network
         * header.
         */
        hdr = (struct icmphdr *)(skb_network_header(skb) - sizeof(struct icmphdr));

        /* A redirect can't be handled without the outer iphdr known, so
         * it is left to udp_err; everything else is processed here.
         */
        if (hdr->type != ICMP_REDIRECT) {
                if (hdr->type == ICMP_DEST_UNREACH &&
                    hdr->code == ICMP_FRAG_NEEDED)
                        info = ntohs(hdr->un.frag.mtu);
                sctp_v4_err_handle(t, skb, hdr->type, hdr->code, info);
                handled = 1;
        }

        sctp_err_finish(sk, t);
        return handled;
}

/*
 * RFC 2960, 8.4 - Handle "Out of the blue" Packets.
 *
 * This function scans all the chunks in the OOTB packet to determine if
 * the packet should be discarded right away.  If a response might be needed
 * for this packet, or, if further processing is possible, the packet will
 * be queued to a proper inqueue for the next phase of handling.
 *
 * Output:
 * Return 0 - If further processing is needed.
 * Return 1 - If the packet can be discarded right away.
 */
static int sctp_rcv_ootb(struct sk_buff *skb)
{
        struct sctp_chunkhdr *ch, _ch;
        int ch_end, offset = 0;

        /* Scan through all the chunks in the packet.  */
        do {
                /* Make sure we have at least the header there */
                if (offset + sizeof(_ch) > skb->len)
                        break;

                ch = skb_header_pointer(skb, offset, sizeof(*ch), &_ch);

                /* Break out if chunk length is less then minimal. */
                if (!ch || ntohs(ch->length) < sizeof(_ch))
                        break;

                ch_end = offset + SCTP_PAD4(ntohs(ch->length));
                if (ch_end > skb->len)
                        break;

                /* RFC 8.4, 2) If the OOTB packet contains an ABORT chunk, the
                 * receiver MUST silently discard the OOTB packet and take no
                 * further action.
                 */
                if (SCTP_CID_ABORT == ch->type)
                        goto discard;

                /* RFC 8.4, 6) If the packet contains a SHUTDOWN COMPLETE
                 * chunk, the receiver should silently discard the packet
                 * and take no further action.
                 */
                if (SCTP_CID_SHUTDOWN_COMPLETE == ch->type)
                        goto discard;

                /* RFC 4460, 2.11.2
                 * This will discard packets with INIT chunk bundled as
                 * subsequent chunks in the packet.  When INIT is first,
                 * the normal INIT processing will discard the chunk.
                 */
                if (SCTP_CID_INIT == ch->type && (void *)ch != skb->data)
                        goto discard;

                offset = ch_end;
        } while (ch_end < skb->len);

        return 0;

discard:
        return 1;
}

/* Insert endpoint into the hash table.
 *
 * The bucket write lock is taken before the SO_REUSEPORT handling: the
 * walk over head->chain and the reuseport_add_sock() call on a socket
 * found there must not race with that socket being unhashed and detached
 * from its reuseport group (see __sctp_unhash_endpoint()).
 *
 * Returns 0 on success.  A failure reported by sctp_bind_addrs_check(),
 * reuseport_add_sock() or reuseport_alloc() is returned as is and the
 * endpoint is then not linked into the chain.
 */
static int __sctp_hash_endpoint(struct sctp_endpoint *ep)
{
        struct sock *sk = ep->base.sk;
        struct net *net = sock_net(sk);
        struct sctp_hashbucket *head;
        int err = 0;

        ep->hashent = sctp_ep_hashfn(net, ep->base.bind_addr.port);
        head = &sctp_ep_hashtable[ep->hashent];

        write_lock(&head->lock);
        if (sk->sk_reuseport) {
                bool any = sctp_is_ep_boundall(sk);
                struct sctp_endpoint *ep2;
                struct list_head *list;
                int cnt = 0;

                /* Non-zero means "no group joined yet". */
                err = 1;

                /* Number of addresses this endpoint is bound to. */
                list_for_each(list, &ep->base.bind_addr.address_list)
                        cnt++;

                /* Look for a reuseport socket of the same netns and uid
                 * whose bound addresses pass sctp_bind_addrs_check(), and
                 * join its group.
                 */
                sctp_for_each_hentry(ep2, &head->chain) {
                        struct sock *sk2 = ep2->base.sk;

                        if (!net_eq(sock_net(sk2), net) || sk2 == sk ||
                            !uid_eq(sock_i_uid(sk2), sock_i_uid(sk)) ||
                            !sk2->sk_reuseport)
                                continue;

                        err = sctp_bind_addrs_check(sctp_sk(sk2),
                                                    sctp_sk(sk), cnt);
                        if (!err) {
                                err = reuseport_add_sock(sk, sk2, any);
                                if (err)
                                        goto out;
                                break;
                        } else if (err < 0) {
                                goto out;
                        }
                }

                /* No group was joined: start a new one for this socket. */
                if (err) {
                        err = reuseport_alloc(sk, any);
                        if (err)
                                goto out;
                }
        }

        hlist_add_head(&ep->node, &head->chain);
out:
        write_unlock(&head->lock);
        return err;
}

/* Add an endpoint to the hash. Local BH-safe.
 *
 * Runs __sctp_hash_endpoint() with bottom halves disabled and returns its
 * result.
 */
int sctp_hash_endpoint(struct sctp_endpoint *ep)
{
        int err;

        local_bh_disable();
        err = __sctp_hash_endpoint(ep);
        local_bh_enable();

        return err;
}

/* Remove endpoint from the hash table.
 *
 * Detaching from the reuseport group happens under the bucket write lock
 * so that it is serialized against __sctp_hash_endpoint() walking the
 * same chain and adding a socket to this socket's group.
 */
static void __sctp_unhash_endpoint(struct sctp_endpoint *ep)
{
        struct sock *sk = ep->base.sk;
        struct sctp_hashbucket *head;

        ep->hashent = sctp_ep_hashfn(sock_net(sk), ep->base.bind_addr.port);

        head = &sctp_ep_hashtable[ep->hashent];

        write_lock(&head->lock);
        if (rcu_access_pointer(sk->sk_reuseport_cb))
                reuseport_detach_sock(sk);
        hlist_del_init(&ep->node);
        write_unlock(&head->lock);
}

/* Remove endpoint from the hash.  Local BH-safe.
 *
 * Runs __sctp_unhash_endpoint() with bottom halves disabled.
 */
void sctp_unhash_endpoint(struct sctp_endpoint *ep)
{
        local_bh_disable();
        __sctp_unhash_endpoint(ep);
        local_bh_enable();
}

/* Hash a (netns, local port, peer address) triple.
 *
 * An IPv6 peer address is first folded to 32 bits with jhash(); an IPv4
 * address is used as is.  The peer port is always read through the v4
 * view of the union.
 * NOTE(review): presumably sin_port and sin6_port share an offset - confirm.
 */
static inline __u32 sctp_hashfn(const struct net *net, __be16 lport,
                                const union sctp_addr *paddr, __u32 seed)
{
        __u32 addr_word;
        __u32 port_word;

        addr_word = paddr->sa.sa_family == AF_INET6 ?
                    jhash(&paddr->v6.sin6_addr, 16, seed) :
                    (__force __u32)paddr->v4.sin_addr.s_addr;

        /* Peer port in the upper half, local port in the lower half. */
        port_word = ((__force __u32)paddr->v4.sin_port) << 16 |
                    (__force __u32)lport;

        return jhash_3words(addr_word, port_word, net_hash_mix(net), seed);
}

/* Look up an endpoint. */
static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
                                        struct net *net, struct sk_buff *skb,
                                        const union sctp_addr *laddr,
                                        const union sctp_addr *paddr,
                                        int dif, int sdif)
{
        struct sctp_hashbucket *head;
        struct sctp_endpoint *ep;
        struct sock *sk;
        __be16 lport;
        int hash;

        lport = laddr->v4.sin_port;
        hash = sctp_ep_hashfn(net, ntohs(lport));
        head = &sctp_ep_hashtable[hash];
        read_lock(&head->lock);
        sctp_for_each_hentry(ep, &head->chain) {
                if (sctp_endpoint_is_match(ep, net, laddr, dif, sdif))
                        goto hit;
        }

        ep = sctp_sk(net->sctp.ctl_sock)->ep;

hit:
        sk = ep->base.sk;
        if (sk->sk_reuseport) {
                __u32 phash = sctp_hashfn(net, lport, paddr, 0);

                sk = reuseport_select_sock(sk, phash, skb,
                                           sizeof(struct sctphdr));
                if (sk)
                        ep = sctp_sk(sk)->ep;
        }
        sctp_endpoint_hold(ep);
        read_unlock(&head->lock);
        return ep;
}

/* rhashtable for transport
 *
 * Lookup key of the transport hashtable; sctp_hash_key() hashes it and
 * sctp_hash_cmp() compares it against a hashed transport.
 */
struct sctp_hash_cmp_arg {
        /* peer address, compared against the transport's ipaddr */
        const union sctp_addr        *paddr;
        /* network namespace, compared against the association's net */
        const struct net        *net;
        /* local port in network byte order */
        __be16                        lport;
};

/* rhashtable compare callback for transports.
 *
 * Returns 0 when the transport at @ptr has the key's peer address, netns
 * and local port, and 1 otherwise.  A transport that can no longer be
 * held is treated as a mismatch; the hold only covers the accesses to
 * t->asoc made here and is dropped before returning.
 */
static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
                                const void *ptr)
{
        struct sctp_transport *t = (struct sctp_transport *)ptr;
        const struct sctp_hash_cmp_arg *key = arg->key;
        int mismatch;

        if (!sctp_cmp_addr_exact(&t->ipaddr, key->paddr))
                return 1;
        if (!sctp_transport_hold(t))
                return 1;

        mismatch = !net_eq(t->asoc->base.net, key->net) ||
                   key->lport != htons(t->asoc->base.bind_addr.port);

        sctp_transport_put(t);
        return mismatch;
}

/* rhashtable object hash: hashes a transport from its association's
 * netns and bound port (converted to network byte order) together with
 * the transport's peer address.  @len is unused.
 */
static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed)
{
        const struct sctp_transport *transport = data;
        __be16 lport = htons(transport->asoc->base.bind_addr.port);

        return sctp_hashfn(transport->asoc->base.net, lport,
                           &transport->ipaddr, seed);
}

/* rhashtable key hash: hashes a struct sctp_hash_cmp_arg lookup key with
 * sctp_hashfn().  @len is unused.
 */
static inline __u32 sctp_hash_key(const void *data, u32 len, u32 seed)
{
        const struct sctp_hash_cmp_arg *key = data;

        return sctp_hashfn(key->net, key->lport, key->paddr, seed);
}

/* Parameters of the transport hashtable: entries are linked through
 * sctp_transport.node, lookup keys are hashed by sctp_hash_key(), stored
 * transports by sctp_hash_obj(), and candidates are compared with
 * sctp_hash_cmp().
 */
static const struct rhashtable_params sctp_hash_params = {
        .head_offset                = offsetof(struct sctp_transport, node),
        .hashfn                        = sctp_hash_key,
        .obj_hashfn                = sctp_hash_obj,
        .obj_cmpfn                = sctp_hash_cmp,
        .automatic_shrinking        = true,
};

/* Initialize the global transport hashtable with sctp_hash_params.
 * Returns the result of rhltable_init().
 */
int sctp_transport_hashtable_init(void)
{
        return rhltable_init(&sctp_transport_hashtable, &sctp_hash_params);
}

/* Tear down the global transport hashtable. */
void sctp_transport_hashtable_destroy(void)
{
        rhltable_destroy(&sctp_transport_hashtable);
}

int sctp_hash_transport(struct sctp_transport *t)
{
        struct sctp_transport *transport;
        struct rhlist_head *tmp, *list;
        struct sctp_hash_cmp_arg arg;
        int err;

        if (t->asoc->temp)
                return 0;

        arg.net   = t->asoc->base.net;
        arg.paddr = &t->ipaddr;
        arg.lport = htons(t->asoc->base.bind_addr.port);

        rcu_read_lock();
        list = rhltable_lookup(&sctp_transport_hashtable, &arg,
                               sctp_hash_params);

        rhl_for_each_entry_rcu(transport, tmp, list, node)
                if (transport->asoc->ep == t->asoc->ep) {
                        rcu_read_unlock();
                        return -EEXIST;
                }
        rcu_read_unlock();

        err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
                                  &t->node, sctp_hash_params);
        if (err)
                pr_err_once("insert transport fail, errno %d\n", err);

        return err;
}

/* Remove transport @t from the transport hashtable.  Transports of
 * temporary associations are skipped, mirroring sctp_hash_transport().
 */
void sctp_unhash_transport(struct sctp_transport *t)
{
        if (!t->asoc->temp)
                rhltable_remove(&sctp_transport_hashtable, &t->node,
                                sctp_hash_params);
}

/* Check a socket's bound device against the incoming interface pair via
 * inet_bound_dev_eq().  The l3mdev_accept flag comes from the netns SCTP
 * settings when CONFIG_NET_L3_MASTER_DEV is enabled and is true otherwise.
 */
bool sctp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        bool l3mdev_accept = !!READ_ONCE(net->sctp.l3mdev_accept);
#else
        bool l3mdev_accept = true;
#endif

        return inet_bound_dev_eq(l3mdev_accept, bound_dev_if, dif, sdif);
}

/* Find the transport for a (local address, peer address) pair.
 *
 * The transport returned carries a reference taken here, which the caller
 * must drop with sctp_transport_put().  NULL is returned when no
 * candidate passes the bound-device and bind-address checks.
 */
struct sctp_transport *sctp_addrs_lookup_transport(
                                struct net *net,
                                const union sctp_addr *laddr,
                                const union sctp_addr *paddr,
                                int dif, int sdif)
{
        struct sctp_hash_cmp_arg arg = {
                .paddr = paddr,
                .net   = net,
                .lport = laddr->v4.sin_port,
        };
        struct rhlist_head *pos, *list;
        struct sctp_transport *t;
        int bound_dev_if;

        list = rhltable_lookup(&sctp_transport_hashtable, &arg,
                               sctp_hash_params);

        rhl_for_each_entry_rcu(t, pos, list, node) {
                /* Skip transports that can no longer be held. */
                if (!sctp_transport_hold(t))
                        continue;

                bound_dev_if = READ_ONCE(t->asoc->base.sk->sk_bound_dev_if);
                if (!sctp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif) ||
                    !sctp_bind_addr_match(&t->asoc->base.bind_addr,
                                          laddr, sctp_sk(t->asoc->base.sk))) {
                        /* Not ours: give the reference back, keep looking. */
                        sctp_transport_put(t);
                        continue;
                }

                return t;
        }

        return NULL;
}

/* return a transport without holding it, as it's only used under sock lock */
struct sctp_transport *sctp_epaddr_lookup_transport(
                                const struct sctp_endpoint *ep,
                                const union sctp_addr *paddr)
{
        struct rhlist_head *tmp, *list;
        struct sctp_transport *t;
        struct sctp_hash_cmp_arg arg = {
                .paddr = paddr,
                .net   = ep->base.net,
                .lport = htons(ep->base.bind_addr.port),
        };

        list = rhltable_lookup(&sctp_transport_hashtable, &arg,
                               sctp_hash_params);

        rhl_for_each_entry_rcu(t, tmp, list, node)
                if (ep == t->asoc->ep)
                        return t;

        return NULL;
}

/* Look up an association.
 *
 * Resolves the (local, peer) address pair to a transport and returns the
 * association it belongs to.  On success *pt is set to that transport,
 * which is returned held by sctp_addrs_lookup_transport(); on failure
 * NULL is returned and *pt is left untouched.
 */
static struct sctp_association *__sctp_lookup_association(
                                        struct net *net,
                                        const union sctp_addr *local,
                                        const union sctp_addr *peer,
                                        struct sctp_transport **pt,
                                        int dif, int sdif)
{
        struct sctp_association *asoc;
        struct sctp_transport *t;

        t = sctp_addrs_lookup_transport(net, local, peer, dif, sdif);
        if (!t)
                return NULL;

        asoc = t->asoc;
        *pt = t;

        return asoc;
}

/* Look up an association. protected by RCU read lock
 *
 * Wraps __sctp_lookup_association() in rcu_read_lock() /
 * rcu_read_unlock() and returns its result; *transportp is only written
 * when an association was found.
 */
static
struct sctp_association *sctp_lookup_association(struct net *net,
                                                 const union sctp_addr *laddr,
                                                 const union sctp_addr *paddr,
                                                 struct sctp_transport **transportp,
                                                 int dif, int sdif)
{
        struct sctp_association *asoc;

        rcu_read_lock();
        asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif);
        rcu_read_unlock();

        return asoc;
}

/* Is there an association matching the given local and peer addresses?
 *
 * The transport reference obtained by a successful lookup is dropped
 * again before returning.
 */
bool sctp_has_association(struct net *net,
                          const union sctp_addr *laddr,
                          const union sctp_addr *paddr,
                          int dif, int sdif)
{
        struct sctp_transport *transport;

        if (!sctp_lookup_association(net, laddr, paddr, &transport, dif, sdif))
                return false;

        sctp_transport_put(transport);
        return true;
}

/*
 * SCTP Implementors Guide, 2.18 Handling of address
 * parameters within the INIT or INIT-ACK.
 *
 * D) When searching for a matching TCB upon reception of an INIT
 *    or INIT-ACK chunk the receiver SHOULD use not only the
 *    source address of the packet (containing the INIT or
 *    INIT-ACK) but the receiver SHOULD also use all valid
 *    address parameters contained within the chunk.
 *
 * 2.18.3 Solution description
 *
 * This new text clearly specifies to an implementor the need
 * to look within the INIT or INIT-ACK. Any implementation that
 * does not do this, may not be able to establish associations
 * in certain circumstances.
 *
 */
static struct sctp_association *__sctp_rcv_init_lookup(struct net *net,
        struct sk_buff *skb,
        const union sctp_addr *laddr, struct sctp_transport **transportp,
        int dif, int sdif)
{
        struct sctp_association *asoc;
        union sctp_addr addr;
        union sctp_addr *paddr = &addr;
        struct sctphdr *sh = sctp_hdr(skb);
        union sctp_params params;
        struct sctp_init_chunk *init;
        struct sctp_af *af;

        /*
         * This code will NOT touch anything inside the chunk--it is
         * strictly READ-ONLY.
         *
         * RFC 2960 3  SCTP packet Format
         *
         * Multiple chunks can be bundled into one SCTP packet up to
         * the MTU size, except for the INIT, INIT ACK, and SHUTDOWN
         * COMPLETE chunks.  These chunks MUST NOT be bundled with any
         * other chunk in a packet.  See Section 6.10 for more details
         * on chunk bundling.
         */

        /* Find the start of the TLVs and the end of the chunk.  This is
         * the region we search for address parameters.
         */
        init = (struct sctp_init_chunk *)skb->data;

        /* Walk the parameters looking for embedded addresses. */
        sctp_walk_params(params, init) {

                /* Note: Ignoring hostname addresses. */
                af = sctp_get_af_specific(param_type2af(params.p->type));
                if (!af)
                        continue;

                if (!af->from_addr_param(paddr, params.addr, sh->source, 0))
                        continue;

                asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif);
                if (asoc)
                        return asoc;
        }

        return NULL;
}

/* ADD-IP, Section 5.2
 * When an endpoint receives an ASCONF Chunk from the remote peer
 * special procedures may be needed to identify the association the
 * ASCONF Chunk is associated with. To properly find the association
 * the following procedures SHOULD be followed:
 *
 * D2) If the association is not found, use the address found in the
 * Address Parameter TLV combined with the port number found in the
 * SCTP common header. If found proceed to rule D4.
 *
 * D2-ext) If more than one ASCONF Chunks are packed together, use the
 * address found in the ASCONF Address Parameter TLV of each of the
 * subsequent ASCONF Chunks. If found, proceed to rule D4.
 */
static struct sctp_association *__sctp_rcv_asconf_lookup(
                                        struct net *net,
                                        struct sctp_chunkhdr *ch,
                                        const union sctp_addr *laddr,
                                        __be16 peer_port,
                                        struct sctp_transport **transportp,
                                        int dif, int sdif)
{
        struct sctp_addip_chunk *asconf = (struct sctp_addip_chunk *)ch;
        size_t min_len = sizeof(*asconf) + sizeof(struct sctp_paramhdr);
        union sctp_addr_param *addr_param;
        union sctp_addr peer;
        struct sctp_af *af;

        /* The chunk must hold the ADDIP header plus a parameter header. */
        if (ntohs(ch->length) < min_len)
                return NULL;

        /* Skip over the ADDIP header and find the Address parameter */
        addr_param = (union sctp_addr_param *)(asconf + 1);

        af = sctp_get_af_specific(param_type2af(addr_param->p.type));
        if (unlikely(!af))
                return NULL;

        /* Combine the parameter's address with the given peer port. */
        if (!af->from_addr_param(&peer, addr_param, peer_port, 0))
                return NULL;

        return __sctp_lookup_association(net, laddr, &peer, transportp, dif, sdif);
}


/* SCTP-AUTH, Section 6.3:
*    If the receiver does not find a STCB for a packet containing an AUTH
*    chunk as the first chunk and not a COOKIE-ECHO chunk as the second
*    chunk, it MUST use the chunks after the AUTH chunk to look up an existing
*    association.
*
* This means that any chunks that can help us identify the association need
* to be looked at to find this association.
*/
static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net,
                                      struct sk_buff *skb,
                                      const union sctp_addr *laddr,
                                      struct sctp_transport **transportp,
                                      int dif, int sdif)
{
        struct sctp_chunkhdr *ch = (struct sctp_chunkhdr *)skb->data;
        struct sctp_association *asoc = NULL;
        unsigned int chunk_num = 1;
        int have_auth = 0;
        __u8 *ch_end;

        /* Walk through the chunks looking for AUTH or ASCONF chunks
         * to help us find the association.
         */
        for (;;) {
                /* Stop if the chunk length is less than the minimum. */
                if (ntohs(ch->length) < sizeof(*ch))
                        break;

                ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length));
                if (ch_end > skb_tail_pointer(skb))
                        break;

                if (ch->type == SCTP_CID_AUTH) {
                        /* Remember the position of the AUTH chunk. */
                        have_auth = chunk_num;
                } else if (ch->type == SCTP_CID_COOKIE_ECHO) {
                        /* If a packet arrives containing an AUTH chunk as
                         * a first chunk, a COOKIE-ECHO chunk as the second
                         * chunk, and possibly more chunks after them, and
                         * the receiver does not have an STCB for that
                         * packet, then authentication is based on
                         * the contents of the COOKIE- ECHO chunk.
                         */
                        if (have_auth == 1 && chunk_num == 2)
                                return NULL;
                } else if (ch->type == SCTP_CID_ASCONF) {
                        /* ASCONF is only used for the lookup after an
                         * AUTH chunk or when addip_noauth is set.
                         */
                        if (have_auth || net->sctp.addip_noauth)
                                asoc = __sctp_rcv_asconf_lookup(
                                                net, ch, laddr,
                                                sctp_hdr(skb)->source,
                                                transportp, dif, sdif);
                }

                if (asoc)
                        break;

                ch = (struct sctp_chunkhdr *)ch_end;
                chunk_num++;

                /* Continue only while the remaining data is larger than
                 * a chunk header.
                 */
                if (ch_end + sizeof(*ch) >= skb_tail_pointer(skb))
                        break;
        }

        return asoc;
}

/*
 * There are circumstances when we need to look inside the SCTP packet
 * for information to help us find the association.   Examples
 * include looking inside of INIT/INIT-ACK chunks or after the AUTH
 * chunks.
 */
static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net,
                                      struct sk_buff *skb,
                                      const union sctp_addr *laddr,
                                      struct sctp_transport **transportp,
                                      int dif, int sdif)
{
        struct sctp_chunkhdr *first;

        /* We do not allow GSO frames here as we need to linearize and
         * then cannot guarantee frame boundaries. This shouldn't be an
         * issue as packets hitting this are mostly INIT or INIT-ACK and
         * those cannot be on GSO-style anyway.
         */
        if (skb_is_gso(skb) && skb_is_gso_sctp(skb))
                return NULL;

        first = (struct sctp_chunkhdr *)skb->data;

        /* The lookups below walk the chunk and extract parameter
         * information, so first make sure the padded chunk length stays
         * within the skb.  Otherwise, we'll walk off the end.
         */
        if (SCTP_PAD4(ntohs(first->length)) > skb->len)
                return NULL;

        switch (first->type) {
        case SCTP_CID_INIT:
        case SCTP_CID_INIT_ACK:
                /* If this is INIT/INIT-ACK look inside the chunk too. */
                return __sctp_rcv_init_lookup(net, skb, laddr, transportp, dif, sdif);
        default:
                return __sctp_rcv_walk_lookup(net, skb, laddr, transportp, dif, sdif);
        }
}

/* Find the association an inbound skb belongs to.
 *
 * Two attempts are made, in order:
 *   1. __sctp_lookup_association() on the (laddr, paddr) pair;
 *   2. __sctp_rcv_lookup_harder(), which inspects the packet contents.
 *
 * Returns the first non-NULL association found.  If both attempts fail a
 * debug message with the two addresses is emitted and NULL is returned.
 */
static struct sctp_association *__sctp_rcv_lookup(struct net *net,
                                      struct sk_buff *skb,
                                      const union sctp_addr *paddr,
                                      const union sctp_addr *laddr,
                                      struct sctp_transport **transportp,
                                      int dif, int sdif)
{
        struct sctp_association *found;

        found = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif);
        if (found)
                return found;

        /* Further lookup for INIT/INIT-ACK packets.
         * SCTP Implementors Guide, 2.18 Handling of address
         * parameters within the INIT or INIT-ACK.
         */
        found = __sctp_rcv_lookup_harder(net, skb, laddr, transportp, dif, sdif);
        if (found)
                return found;

        /* Nothing matched: log both endpoints, formatted per address family.
         * NOTE(review): the message prints laddr under "src" and paddr
         * under "dst"; confirm that labelling is intended for an inbound
         * skb before relying on it.
         */
        if (paddr->sa.sa_family == AF_INET) {
                pr_debug("sctp: asoc not found for src:%pI4:%d dst:%pI4:%d\n",
                         &laddr->v4.sin_addr, ntohs(laddr->v4.sin_port),
                         &paddr->v4.sin_addr, ntohs(paddr->v4.sin_port));
        } else {
                pr_debug("sctp: asoc not found for src:%pI6:%d dst:%pI6:%d\n",
                         &laddr->v6.sin6_addr, ntohs(laddr->v6.sin6_port),
                         &paddr->v6.sin6_addr, ntohs(paddr->v6.sin6_port));
        }

        return found;
}
















































































































































































    2 

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Routing netlink socket interface: protocol independent part.
 *
 * Authors:        Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *        Fixes:
 *        Vitaly E. Lavrov                RTA_OK arithmetic was wrong.
 */

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/security.h>
#include <linux/mutex.h>
#include <linux/if_addr.h>
#include <linux/if_bridge.h>
#include <linux/if_vlan.h>
#include <linux/pci.h>
#include <linux/etherdevice.h>
#include <linux/bpf.h>

#include <linux/uaccess.h>

#include <linux/inet.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/arp.h>
#include <net/route.h>
#include <net/udp.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/fib_rules.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/devlink.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/addrconf.h>
#endif
#include <linux/dpll.h>

#include "dev.h"

/* Upper bounds that rtnl_link_register() enforces on ops->maxtype and
 * ops->slave_maxtype respectively; larger values are rejected there.
 */
#define RTNL_MAX_TYPE                50
#define RTNL_SLAVE_MAX_TYPE        44

/* Handler record stored in one slot of a per-family message table.
 *
 * doit:   handler called for request messages
 * dumpit: handler called for dump requests
 * owner:  module that registered the handlers (NULL when registered
 *         through rtnl_register())
 * flags:  rtnl_link_flags OR-ed together across registrations
 * rcu:    lets a replaced or removed record be released with kfree_rcu()
 */
struct rtnl_link {
        rtnl_doit_func                doit;
        rtnl_dumpit_func        dumpit;
        struct module                *owner;
        unsigned int                flags;
        struct rcu_head                rcu;
};

/* The one mutex that all rtnl lock/unlock helpers in this file operate on. */
static DEFINE_MUTEX(rtnl_mutex);

/* Acquire rtnl_mutex with mutex_lock(). */
void rtnl_lock(void)
{
        mutex_lock(&rtnl_mutex);
}
EXPORT_SYMBOL(rtnl_lock);

/* Acquire rtnl_mutex with mutex_lock_killable() and hand its return value
 * straight back to the caller (presumably 0 once the lock is held and a
 * negative errno if the wait was interrupted - see the mutex API).
 */
int rtnl_lock_killable(void)
{
        return mutex_lock_killable(&rtnl_mutex);
}
EXPORT_SYMBOL(rtnl_lock_killable);

/* Chain of skbs (linked through ->next) that __rtnl_unlock() frees after
 * it has dropped rtnl_mutex.
 */
static struct sk_buff *defer_kfree_skb_list;

/* Queue the skb chain running from @head to @tail for deferred freeing.
 * Nothing is queued unless both ends of the chain are supplied.
 */
void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
{
        if (!head || !tail)
                return;

        /* Splice the new chain in front of whatever is already pending. */
        tail->next = defer_kfree_skb_list;
        defer_kfree_skb_list = head;
}
EXPORT_SYMBOL(rtnl_kfree_skbs);

/* Release rtnl_mutex without processing net_todo_list, then free every skb
 * that was queued through rtnl_kfree_skbs() while the lock was held.
 */
void __rtnl_unlock(void)
{
        /* Detach the deferred-free chain before the mutex is released. */
        struct sk_buff *head = defer_kfree_skb_list;

        defer_kfree_skb_list = NULL;

        /* Ensure that we didn't actually add any TODO item when __rtnl_unlock()
         * is used. In some places, e.g. in cfg80211, we have code that will do
         * something like
         *   rtnl_lock()
         *   wiphy_lock()
         *   ...
         *   rtnl_unlock()
         *
         * and because netdev_run_todo() acquires the RTNL for items on the list
         * we could cause a situation such as this:
         * Thread 1                        Thread 2
         *                                  rtnl_lock()
         *                                  unregister_netdevice()
         *                                  __rtnl_unlock()
         * rtnl_lock()
         * wiphy_lock()
         * rtnl_unlock()
         *   netdev_run_todo()
         *     __rtnl_unlock()
         *
         *     // list not empty now
         *     // because of thread 2
         *                                  rtnl_lock()
         *     while (!list_empty(...))
         *       rtnl_lock()
         *                                  wiphy_lock()
         * **** DEADLOCK ****
         *
         * However, usage of __rtnl_unlock() is rare, and so we can ensure that
         * it's not used in cases where something is added to the todo list.
         */
        WARN_ON(!list_empty(&net_todo_list));

        mutex_unlock(&rtnl_mutex);

        /* Free the detached chain after unlocking, offering to reschedule
         * after each skb.
         */
        while (head) {
                struct sk_buff *next = head->next;

                kfree_skb(head);
                cond_resched();
                head = next;
        }
}

/* Release the RTNL. The actual unlock is delegated to netdev_run_todo(),
 * which (per the comment in __rtnl_unlock()) ends up calling
 * __rtnl_unlock() and also processes the pending todo items.
 */
void rtnl_unlock(void)
{
        /* This fellow will unlock it for us. */
        netdev_run_todo();
}
EXPORT_SYMBOL(rtnl_unlock);

/* Attempt to take rtnl_mutex via mutex_trylock() and return that call's
 * result unchanged.
 */
int rtnl_trylock(void)
{
        return mutex_trylock(&rtnl_mutex);
}
EXPORT_SYMBOL(rtnl_trylock);

/* Return mutex_is_locked() for rtnl_mutex.
 * NOTE(review): presumably this reports that *someone* holds the lock, not
 * necessarily the caller - confirm against the mutex API.
 */
int rtnl_is_locked(void)
{
        return mutex_is_locked(&rtnl_mutex);
}
EXPORT_SYMBOL(rtnl_is_locked);

/* Apply refcount_dec_and_mutex_lock() to @r with rtnl_mutex as the mutex
 * and return its result (presumably true only when the count dropped to
 * zero and the RTNL is now held - see the refcount API).
 */
bool refcount_dec_and_rtnl_lock(refcount_t *r)
{
        return refcount_dec_and_mutex_lock(r, &rtnl_mutex);
}
EXPORT_SYMBOL(refcount_dec_and_rtnl_lock);

#ifdef CONFIG_PROVE_LOCKING
/* Lockdep query for rtnl_mutex; only built when CONFIG_PROVE_LOCKING is
 * enabled. Returns the result of lockdep_is_held().
 */
bool lockdep_rtnl_is_held(void)
{
        return lockdep_is_held(&rtnl_mutex);
}
EXPORT_SYMBOL(lockdep_rtnl_is_held);
#endif /* #ifdef CONFIG_PROVE_LOCKING */

/* Handler tables, one slot per protocol family. Each non-NULL slot points
 * to an array that is indexed by message index and holds rtnl_link
 * pointers; both levels are accessed through RCU/RTNL accessors.
 */
static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];

/* Convert an rtnetlink message type into its zero-based table index.
 *
 * A negative index means a netlink control code was passed in; an index of
 * RTM_NR_MSGTYPES or more may mean the message type is missing from
 * linux/rtnetlink.h. Either case trips BUG_ON().
 */
static inline int rtm_msgindex(int msgtype)
{
        const int idx = msgtype - RTM_BASE;

        BUG_ON(idx < 0 || idx >= RTM_NR_MSGTYPES);
        return idx;
}

static struct rtnl_link *rtnl_get_link(int protocol, int msgtype)
{
        struct rtnl_link __rcu **tab;

        if (protocol >= ARRAY_SIZE(rtnl_msg_handlers))
                protocol = PF_UNSPEC;

        tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]);
        if (!tab)
                tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]);

        return rcu_dereference_rtnl(tab[msgtype]);
}

/* Install @doit and/or @dumpit as handlers for (@protocol, @msgtype).
 *
 * Runs under the RTNL. The family's table is allocated on first use. An
 * existing record is never modified in place: a copy is updated, published
 * with rcu_assign_pointer() and the old record is released via kfree_rcu().
 *
 * Returns 0 on success or -ENOBUFS if an allocation fails.
 */
static int rtnl_register_internal(struct module *owner,
                                  int protocol, int msgtype,
                                  rtnl_doit_func doit, rtnl_dumpit_func dumpit,
                                  unsigned int flags)
{
        struct rtnl_link __rcu **handlers;
        struct rtnl_link *entry, *prev;
        int err = -ENOBUFS;
        int idx;

        BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
        idx = rtm_msgindex(msgtype);

        rtnl_lock();

        handlers = rtnl_dereference(rtnl_msg_handlers[protocol]);
        if (!handlers) {
                handlers = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL);
                if (!handlers)
                        goto unlock;

                /* ensures we see the 0 stores */
                rcu_assign_pointer(rtnl_msg_handlers[protocol], handlers);
        }

        /* Start from a copy of the current record, or a zeroed one. */
        prev = rtnl_dereference(handlers[idx]);
        if (prev)
                entry = kmemdup(prev, sizeof(*prev), GFP_KERNEL);
        else
                entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                goto unlock;

        WARN_ON(entry->owner && entry->owner != owner);
        entry->owner = owner;

        /* A NULL callback leaves the previously registered one untouched. */
        WARN_ON(doit && entry->doit && entry->doit != doit);
        if (doit)
                entry->doit = doit;

        WARN_ON(dumpit && entry->dumpit && entry->dumpit != dumpit);
        if (dumpit)
                entry->dumpit = dumpit;

        WARN_ON(rtnl_msgtype_kind(msgtype) != RTNL_KIND_DEL &&
                (flags & RTNL_FLAG_BULK_DEL_SUPPORTED));
        entry->flags |= flags;

        /* publish protocol:msgtype */
        rcu_assign_pointer(handlers[idx], entry);
        err = 0;
        if (prev)
                kfree_rcu(prev, rcu);

unlock:
        rtnl_unlock();
        return err;
}

/**
 * rtnl_register_module - Register a rtnetlink message type
 *
 * @owner: module registering the hook (THIS_MODULE)
 * @protocol: Protocol family or PF_UNSPEC
 * @msgtype: rtnetlink message type
 * @doit: Function pointer called for each request message
 * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
 * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions
 *
 * Like rtnl_register, but for use by removable modules. Unlike
 * rtnl_register, the result of the registration is passed back to the
 * caller instead of being logged.
 *
 * Returns 0 on success or -ENOBUFS if memory allocation fails.
 */
int rtnl_register_module(struct module *owner,
                         int protocol, int msgtype,
                         rtnl_doit_func doit, rtnl_dumpit_func dumpit,
                         unsigned int flags)
{
        return rtnl_register_internal(owner, protocol, msgtype,
                                      doit, dumpit, flags);
}
EXPORT_SYMBOL_GPL(rtnl_register_module);

/**
 * rtnl_register - Register a rtnetlink message type
 * @protocol: Protocol family or PF_UNSPEC
 * @msgtype: rtnetlink message type
 * @doit: Function pointer called for each request message
 * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
 * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions
 *
 * Installs the given callbacks (at least one of them must be non-NULL) so
 * that they are invoked for request messages of the given protocol family
 * and message type. The handlers are registered without an owning module.
 *
 * Handlers registered for the special family PF_UNSPEC act as the fallback
 * for families that have no entry of their own.
 *
 * A registration failure is only logged; nothing is returned.
 */
void rtnl_register(int protocol, int msgtype,
                   rtnl_doit_func doit, rtnl_dumpit_func dumpit,
                   unsigned int flags)
{
        if (!rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit,
                                    flags))
                return;

        pr_err("Unable to register rtnetlink message handler, "
               "protocol = %d, message type = %d\n", protocol, msgtype);
}

/**
 * rtnl_unregister - Unregister a rtnetlink message type
 * @protocol: Protocol family or PF_UNSPEC
 * @msgtype: rtnetlink message type
 *
 * Returns 0 on success or a negative error code.
 */
int rtnl_unregister(int protocol, int msgtype)
{
        struct rtnl_link __rcu **tab;
        struct rtnl_link *link;
        int msgindex;

        BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
        msgindex = rtm_msgindex(msgtype);

        rtnl_lock();
        tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
        if (!tab) {
                rtnl_unlock();
                return -ENOENT;
        }

        link = rcu_replace_pointer_rtnl(tab[msgindex], NULL);
        rtnl_unlock();

        kfree_rcu(link, rcu);

        return 0;
}
EXPORT_SYMBOL_GPL(rtnl_unregister);

/**
 * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol
 * @protocol : Protocol family or PF_UNSPEC
 *
 * Identical to calling rtnl_unregster() for all registered message types
 * of a certain protocol family.
 */
void rtnl_unregister_all(int protocol)
{
        struct rtnl_link __rcu **tab;
        struct rtnl_link *link;
        int msgindex;

        BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);

        rtnl_lock();
        tab = rcu_replace_pointer_rtnl(rtnl_msg_handlers[protocol], NULL);
        if (!tab) {
                rtnl_unlock();
                return;
        }
        for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) {
                link = rcu_replace_pointer_rtnl(tab[msgindex], NULL);
                kfree_rcu(link, rcu);
        }
        rtnl_unlock();

        synchronize_net();

        kfree(tab);
}
EXPORT_SYMBOL_GPL(rtnl_unregister_all);

static LIST_HEAD(link_ops);

static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
{
        const struct rtnl_link_ops *ops;

        list_for_each_entry(ops, &link_ops, list) {
                if (!strcmp(ops->kind, kind))
                        return ops;
        }
        return NULL;
}

/**
 * __rtnl_link_register - Register rtnl_link_ops with rtnetlink.
 * @ops: struct rtnl_link_ops * to register
 *
 * The caller must hold the rtnl_mutex. Drivers that create devices during
 * module initialization should use this function, and must call it before
 * registering those devices.
 *
 * Returns 0 on success or -EEXIST if ops of the same kind are already
 * registered.
 */
int __rtnl_link_register(struct rtnl_link_ops *ops)
{
        if (rtnl_link_ops_get(ops->kind))
                return -EEXIST;

        /* Only ops that can create a device (alloc or setup is set) get
         * the default dellink. Without either of them the ops cannot be
         * used for creating a device, so dellink stays empty as well,
         * which disables rtnl_dellink.
         */
        if (!ops->dellink && (ops->alloc || ops->setup))
                ops->dellink = unregister_netdevice_queue;

        list_add_tail(&ops->list, &link_ops);

        return 0;
}
EXPORT_SYMBOL_GPL(__rtnl_link_register);

/**
 * rtnl_link_register - Register rtnl_link_ops with rtnetlink.
 * @ops: struct rtnl_link_ops * to register
 *
 * Returns 0 on success or a negative error code.
 */
int rtnl_link_register(struct rtnl_link_ops *ops)
{
        int err;

        /* Sanity-check max sizes to avoid stack buffer overflow. */
        if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE ||
                    ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE))
                return -EINVAL;

        rtnl_lock();
        err = __rtnl_link_register(ops);
        rtnl_unlock();
        return err;
}
EXPORT_SYMBOL_GPL(rtnl_link_register);

static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
{
        struct net_device *dev;
        LIST_HEAD(list_kill);

        for_each_netdev(net, dev) {
                if (dev->rtnl_link_ops == ops)
                        ops->dellink(dev, &list_kill);
        }
        unregister_netdevice_many(&list_kill);
}

/**
 * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
 * @ops: struct rtnl_link_ops * to unregister
 *
 * The caller must hold the rtnl_mutex and guarantee net_namespace_list
 * integrity (hold pernet_ops_rwsem for writing to close the race
 * with setup_net() and cleanup_net()).
 */
void __rtnl_link_unregister(struct rtnl_link_ops *ops)
{
        struct net *net;

        for_each_net(net) {
                __rtnl_kill_links(net, ops);
        }
        list_del(&ops->list);
}
EXPORT_SYMBOL_GPL(__rtnl_link_unregister);

/* Return with the rtnl_lock held when there are no network
 * devices unregistering in any network namespace.
 *
 * Loops: take the RTNL, check dev_unreg_count, and if it is non-zero drop
 * the RTNL again and sleep on netdev_unregistering_wq until woken.
 */
static void rtnl_lock_unregistering_all(void)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);

        /* The waiter is queued before the counter is first tested
         * (presumably so that a wakeup arriving between the test and
         * wait_woken() is not missed).
         */
        add_wait_queue(&netdev_unregistering_wq, &wait);
        for (;;) {
                rtnl_lock();
                /* We held write locked pernet_ops_rwsem, and parallel
                 * setup_net() and cleanup_net() are not possible.
                 */
                if (!atomic_read(&dev_unreg_count))
                        break;
                /* Still unregistering: release the RTNL before sleeping. */
                __rtnl_unlock();

                wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
        }
        /* Reached via break, i.e. with the RTNL still held. */
        remove_wait_queue(&netdev_unregistering_wq, &wait);
}

/**
 * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
 * @ops: struct rtnl_link_ops * to unregister
 *
 * Locking order: pernet_ops_rwsem is taken for writing first, then the
 * RTNL is acquired once no device is unregistering. Both are released in
 * the reverse order after __rtnl_link_unregister() has run.
 */
void rtnl_link_unregister(struct rtnl_link_ops *ops)
{
        /* Close the race with setup_net() and cleanup_net() */
        down_write(&pernet_ops_rwsem);
        rtnl_lock_unregistering_all();
        __rtnl_link_unregister(ops);
        rtnl_unlock();
        up_write(&pernet_ops_rwsem);
}
EXPORT_SYMBOL_GPL(rtnl_link_unregister);

/* Space needed for the IFLA_INFO_SLAVE_DATA attribute of @dev.
 *
 * Returns 0 when @dev has no master, or when the master's link ops are
 * missing or do not implement get_slave_size(). The master is looked up
 * inside an RCU read-side critical section.
 */
static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev)
{
        const struct rtnl_link_ops *master_ops;
        struct net_device *master;
        size_t bytes = 0;

        rcu_read_lock();

        master = netdev_master_upper_dev_get_rcu((struct net_device *)dev);
        if (master) {
                master_ops = master->rtnl_link_ops;
                if (master_ops && master_ops->get_slave_size) {
                        /* IFLA_INFO_SLAVE_DATA + nested data */
                        bytes = nla_total_size(sizeof(struct nlattr)) +
                                master_ops->get_slave_size(master, dev);
                }
        }

        rcu_read_unlock();

        return bytes;
}

/* Space needed for the IFLA_LINKINFO attribute of @dev, including the
 * slave-data part. Returns 0 when @dev has no rtnl_link_ops.
 */
static size_t rtnl_link_get_size(const struct net_device *dev)
{
        const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
        size_t total;

        if (!ops)
                return 0;

        total = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */
                nla_total_size(strlen(ops->kind) + 1);  /* IFLA_INFO_KIND */

        if (ops->get_size) {
                /* IFLA_INFO_DATA + nested data */
                total += nla_total_size(sizeof(struct nlattr)) +
                         ops->get_size(dev);
        }

        if (ops->get_xstats_size) {
                /* IFLA_INFO_XSTATS */
                total += nla_total_size(ops->get_xstats_size(dev));
        }

        return total + rtnl_link_get_slave_info_data_size(dev);
}

/* All address-family ops registered through rtnl_af_register(). */
static LIST_HEAD(rtnl_af_ops);

/* Return the registered rtnl_af_ops for @family, or NULL if there is none.
 * Asserts that the RTNL is held.
 */
static const struct rtnl_af_ops *rtnl_af_lookup(const int family)
{
        const struct rtnl_af_ops *cur;

        ASSERT_RTNL();

        list_for_each_entry(cur, &rtnl_af_ops, list) {
                if (cur->family != family)
                        continue;

                return cur;
        }

        return NULL;
}

/**
 * rtnl_af_register - Register rtnl_af_ops with rtnetlink.
 * @ops: struct rtnl_af_ops * to register
 *
 * Appends @ops to the rtnl_af_ops list with list_add_tail_rcu() while
 * holding the RTNL. The function returns nothing and has no failure path.
 */
void rtnl_af_register(struct rtnl_af_ops *ops)
{
        rtnl_lock();
        list_add_tail_rcu(&ops->list, &rtnl_af_ops);
        rtnl_unlock();
}
EXPORT_SYMBOL_GPL(rtnl_af_register);

/**
 * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink.
 * @ops: struct rtnl_af_ops * to unregister
 *
 * Removes @ops from the rtnl_af_ops list under the RTNL and then waits in
 * synchronize_rcu() before returning. The list is also walked under
 * rcu_read_lock() (see rtnl_link_get_af_size()), so the wait presumably
 * lets such readers finish before the caller reuses or frees @ops.
 */
void rtnl_af_unregister(struct rtnl_af_ops *ops)
{
        rtnl_lock();
        list_del_rcu(&ops->list);
        rtnl_unlock();

        synchronize_rcu();
}
EXPORT_SYMBOL_GPL(rtnl_af_unregister);

/* Space needed for the IFLA_AF_SPEC attribute of @dev: the outer nest
 * header plus, for every registered address family that implements
 * get_link_af_size(), one nested header and that family's payload size.
 */
static size_t rtnl_link_get_af_size(const struct net_device *dev,
                                    u32 ext_filter_mask)
{
        struct rtnl_af_ops *cur;
        size_t total;

        /* IFLA_AF_SPEC */
        total = nla_total_size(sizeof(struct nlattr));

        rcu_read_lock();
        list_for_each_entry_rcu(cur, &rtnl_af_ops, list) {
                if (!cur->get_link_af_size)
                        continue;

                /* AF_* + nested data */
                total += nla_total_size(sizeof(struct nlattr)) +
                         cur->get_link_af_size(dev, ext_filter_mask);
        }
        rcu_read_unlock();

        return total;
}

/* True when @dev has a master device and that master carries
 * rtnl_link_ops. The master is looked up under rcu_read_lock().
 */
static bool rtnl_have_link_slave_info(const struct net_device *dev)
{
        struct net_device *master;
        bool found;

        rcu_read_lock();
        master = netdev_master_upper_dev_get_rcu((struct net_device *)dev);
        found = master && master->rtnl_link_ops;
        rcu_read_unlock();

        return found;
}

/* Emit IFLA_INFO_SLAVE_KIND and, when the master's ops provide
 * fill_slave_info(), a nested IFLA_INFO_SLAVE_DATA attribute for @dev.
 *
 * Returns 0 on success (also when @dev has no master or the master has no
 * link ops), -EMSGSIZE when an attribute does not fit, or the negative
 * error returned by fill_slave_info(); in the latter case the partially
 * written nest is cancelled.
 */
static int rtnl_link_slave_info_fill(struct sk_buff *skb,
                                     const struct net_device *dev)
{
        const struct rtnl_link_ops *master_ops;
        struct net_device *master;
        struct nlattr *nest;
        int err;

        master = netdev_master_upper_dev_get((struct net_device *) dev);
        if (!master)
                return 0;

        master_ops = master->rtnl_link_ops;
        if (!master_ops)
                return 0;

        if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, master_ops->kind) < 0)
                return -EMSGSIZE;

        if (!master_ops->fill_slave_info)
                return 0;

        nest = nla_nest_start_noflag(skb, IFLA_INFO_SLAVE_DATA);
        if (!nest)
                return -EMSGSIZE;

        err = master_ops->fill_slave_info(skb, master, dev);
        if (err < 0) {
                nla_nest_cancel(skb, nest);
                return err;
        }

        nla_nest_end(skb, nest);
        return 0;
}

/* Emit IFLA_INFO_KIND for @dev, followed by whatever fill_xstats() writes
 * and a nested IFLA_INFO_DATA attribute written by fill_info(), for each
 * of those callbacks that the device's link ops provide.
 *
 * Returns 0 on success (also when @dev has no link ops), -EMSGSIZE when an
 * attribute does not fit, or the negative error from a callback. A failed
 * fill_info() has its nest cancelled.
 */
static int rtnl_link_info_fill(struct sk_buff *skb,
                               const struct net_device *dev)
{
        const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
        struct nlattr *nest;
        int err;

        if (!ops)
                return 0;

        if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0)
                return -EMSGSIZE;

        if (ops->fill_xstats) {
                err = ops->fill_xstats(skb, dev);
                if (err < 0)
                        return err;
        }

        if (!ops->fill_info)
                return 0;

        nest = nla_nest_start_noflag(skb, IFLA_INFO_DATA);
        if (!nest)
                return -EMSGSIZE;

        err = ops->fill_info(skb, dev);
        if (err < 0) {
                nla_nest_cancel(skb, nest);
                return err;
        }

        nla_nest_end(skb, nest);
        return 0;
}

/* Write the IFLA_LINKINFO nest for @dev: the device's own link info
 * followed by its slave info.
 *
 * Returns 0 on success, -EMSGSIZE when the nest cannot be started, or the
 * negative error from either fill helper; on such an error the whole nest
 * is cancelled.
 */
static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev)
{
        struct nlattr *nest;
        int err;

        nest = nla_nest_start_noflag(skb, IFLA_LINKINFO);
        if (!nest)
                return -EMSGSIZE;

        err = rtnl_link_info_fill(skb, dev);
        if (err >= 0)
                err = rtnl_link_slave_info_fill(skb, dev);

        if (err < 0) {
                nla_nest_cancel(skb, nest);
                return err;
        }

        nla_nest_end(skb, nest);
        return 0;
}

/* Pass @skb to nlmsg_notify() on @net's rtnetlink socket with GFP_KERNEL
 * and return that call's result.
 */
int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo)
{
        return nlmsg_notify(net->rtnl, skb, pid, group, echo, GFP_KERNEL);
}

/* Pass @skb and @pid to nlmsg_unicast() on @net's rtnetlink socket and
 * return that call's result.
 */
int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
{
        return nlmsg_unicast(net->rtnl, skb, pid);
}
EXPORT_SYMBOL(rtnl_unicast);

/* Pass @skb to nlmsg_notify() on @net's rtnetlink socket, using
 * nlmsg_report(@nlh) as the report argument and @flags as the allocation
 * flags. The result of nlmsg_notify() is discarded.
 */
void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
                 const struct nlmsghdr *nlh, gfp_t flags)
{
        nlmsg_notify(net->rtnl, skb, pid, group, nlmsg_report(nlh), flags);
}
EXPORT_SYMBOL(rtnl_notify);

/* Call netlink_set_err() on @net's rtnetlink socket for @group with
 * @error, passing 0 as the portid argument.
 */
void rtnl_set_sk_err(struct net *net, u32 group, int error)
{
        netlink_set_err(net->rtnl, 0, group, error);
}
EXPORT_SYMBOL(rtnl_set_sk_err);

/*
 * Dump the non-zero entries of @metrics into an RTA_METRICS nest in @skb.
 * Entry i is emitted with attribute type i + 1.
 *
 * Two entries get special treatment:
 *  - RTAX_CC_ALGO is emitted as the string returned by
 *    tcp_ca_get_name_by_key(); it is skipped when no name is found.
 *  - RTAX_FEATURES is masked with RTAX_FEATURE_MASK and skipped when nothing
 *    remains.
 *
 * Returns 0 when nothing was emitted (default metrics, or no entry
 * qualified), the value of nla_nest_end() when at least one entry was
 * written, -ENOBUFS if the nest could not be opened and -EMSGSIZE if an
 * attribute did not fit (the nest is cancelled in that case).
 */
int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
{
        struct nlattr *nest;
        int emitted = 0;
        int idx;

        /* nothing is dumped for dst_default_metrics, so just skip the loop */
        if (metrics == dst_default_metrics.metrics)
                return 0;

        nest = nla_nest_start_noflag(skb, RTA_METRICS);
        if (!nest)
                return -ENOBUFS;

        for (idx = 0; idx < RTAX_MAX; idx++) {
                const int attrtype = idx + 1;

                if (!metrics[idx])
                        continue;

                if (attrtype == RTAX_CC_ALGO) {
                        char buf[TCP_CA_NAME_MAX], *ca_name;

                        ca_name = tcp_ca_get_name_by_key(metrics[idx], buf);
                        if (!ca_name)
                                continue;
                        if (nla_put_string(skb, attrtype, ca_name))
                                goto nla_put_failure;
                } else if (attrtype == RTAX_FEATURES) {
                        u32 user_features = metrics[idx] & RTAX_FEATURE_MASK;

                        if (!user_features)
                                continue;
                        BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK);
                        if (nla_put_u32(skb, attrtype, user_features))
                                goto nla_put_failure;
                } else if (nla_put_u32(skb, attrtype, metrics[idx])) {
                        goto nla_put_failure;
                }

                emitted++;
        }

        if (emitted)
                return nla_nest_end(skb, nest);

        /* An empty nest is not worth sending. */
        nla_nest_cancel(skb, nest);
        return 0;

nla_put_failure:
        nla_nest_cancel(skb, nest);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(rtnetlink_put_metrics);

/*
 * Append an RTA_CACHEINFO attribute to @skb.
 *
 * @id and @error are stored as given.  When @dst is non-NULL its last-use
 * age, use count and reference count are included.  A non-zero @expires
 * (in jiffies, sign preserved) is converted to clock_t units and its
 * magnitude is capped at INT_MAX.
 *
 * Returns the result of nla_put().
 */
int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
                       long expires, u32 error)
{
        struct rta_cacheinfo ci = {
                .rta_id =  id,
                .rta_error = error,
        };

        if (expires) {
                unsigned long clock = jiffies_to_clock_t(abs(expires));

                clock = min_t(unsigned long, clock, INT_MAX);
                if (expires > 0)
                        ci.rta_expires = clock;
                else
                        ci.rta_expires = -clock;
        }

        if (dst) {
                ci.rta_clntref = rcuref_read(&dst->__rcuref);
                ci.rta_used = dst->__use;
                ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
        }

        return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci);
}
EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);

/*
 * Switch dev->operstate to @newstate with a compare-and-exchange loop and
 * call netdev_state_change() once the exchange has succeeded.
 *
 * Nothing is done, and no state change is signalled, if the state observed
 * at any point already equals @newstate.
 */
void netdev_set_operstate(struct net_device *dev, int newstate)
{
        unsigned int seen = READ_ONCE(dev->operstate);

        while (seen != newstate) {
                /* On failure try_cmpxchg() leaves the current value in 'seen'. */
                if (try_cmpxchg(&dev->operstate, &seen, newstate)) {
                        netdev_state_change(dev);
                        return;
                }
        }
}
EXPORT_SYMBOL(netdev_set_operstate);

static void set_operstate(struct net_device *dev, unsigned char transition)
{
        unsigned char operstate = READ_ONCE(dev->operstate);

        switch (transition) {
        case IF_OPER_UP:
                if ((operstate == IF_OPER_DORMANT ||
                     operstate == IF_OPER_TESTING ||
                     operstate == IF_OPER_UNKNOWN) &&
                    !netif_dormant(dev) && !netif_testing(dev))
                        operstate = IF_OPER_UP;
                break;

        case IF_OPER_TESTING:
                if (netif_oper_up(dev))
                        operstate = IF_OPER_TESTING;
                break;

        case IF_OPER_DORMANT:
                if (netif_oper_up(dev))
                        operstate = IF_OPER_DORMANT;
                break;
        }

        netdev_set_operstate(dev, operstate);
}

/*
 * Build the flag word for @dev: IFF_PROMISC and IFF_ALLMULTI are taken from
 * dev->gflags, every other bit from dev->flags.
 */
static unsigned int rtnl_dev_get_flags(const struct net_device *dev)
{
        const unsigned int mask = IFF_PROMISC | IFF_ALLMULTI;
        unsigned int from_flags = dev->flags & ~mask;
        unsigned int from_gflags = dev->gflags & mask;

        return from_flags | from_gflags;
}

/*
 * Merge the flags requested in @ifm with the current flags of @dev.
 *
 * Bits selected by ifm->ifi_change come from ifm->ifi_flags, all others from
 * rtnl_dev_get_flags().  For "bugwards compatibility" an ifi_change of 0 is
 * treated as ~0, i.e. ifi_flags is returned unchanged.
 */
static unsigned int rtnl_dev_combine_flags(const struct net_device *dev,
                                           const struct ifinfomsg *ifm)
{
        unsigned int change = ifm->ifi_change;
        unsigned int wanted = ifm->ifi_flags;

        if (!change)
                return wanted;

        return (wanted & change) | (rtnl_dev_get_flags(dev) & ~change);
}

static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
                                 const struct rtnl_link_stats64 *b)
{
        a->rx_packets = b->rx_packets;
        a->tx_packets = b->tx_packets;
        a->rx_bytes = b->rx_bytes;
        a->tx_bytes = b->tx_bytes;
        a->rx_errors = b->rx_errors;
        a->tx_errors = b->tx_errors;
        a->rx_dropped = b->rx_dropped;
        a->tx_dropped = b->tx_dropped;

        a->multicast = b->multicast;
        a->collisions = b->collisions;

        a->rx_length_errors = b->rx_length_errors;
        a->rx_over_errors = b->rx_over_errors;
        a->rx_crc_errors = b->rx_crc_errors;
        a->rx_frame_errors = b->rx_frame_errors;
        a->rx_fifo_errors = b->rx_fifo_errors;
        a->rx_missed_errors = b->rx_missed_errors;

        a->tx_aborted_errors = b->tx_aborted_errors;
        a->tx_carrier_errors = b->tx_carrier_errors;
        a->tx_fifo_errors = b->tx_fifo_errors;
        a->tx_heartbeat_errors = b->tx_heartbeat_errors;
        a->tx_window_errors = b->tx_window_errors;

        a->rx_compressed = b->rx_compressed;
        a->tx_compressed = b->tx_compressed;

        a->rx_nohandler = b->rx_nohandler;
}

/*
 * All VF info: space to reserve for the IFLA_VFINFO_LIST nest of @dev.
 *
 * Returns 0 unless @dev has a parent device and RTEXT_FILTER_VF is set in
 * @ext_filter_mask.  Otherwise the result covers the list nest plus, for
 * each VF reported by dev_num_vf(), one per-VF nest with the attributes
 * listed below.  The per-VF statistics are included unless
 * RTEXT_FILTER_SKIP_STATS is set.
 */
static inline int rtnl_vfinfo_size(const struct net_device *dev,
                                   u32 ext_filter_mask)
{
        size_t total;
        int vf_count;

        if (!dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF))
                return 0;

        vf_count = dev_num_vf(dev->dev.parent);

        total = nla_total_size(0); /* outer list nest */
        total += vf_count *
                 (nla_total_size(0) + /* per-VF nest */
                  nla_total_size(sizeof(struct ifla_vf_mac)) +
                  nla_total_size(sizeof(struct ifla_vf_broadcast)) +
                  nla_total_size(sizeof(struct ifla_vf_vlan)) +
                  nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */
                  nla_total_size(MAX_VLAN_LIST_LEN *
                                 sizeof(struct ifla_vf_vlan_info)) +
                  nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
                  nla_total_size(sizeof(struct ifla_vf_tx_rate)) +
                  nla_total_size(sizeof(struct ifla_vf_rate)) +
                  nla_total_size(sizeof(struct ifla_vf_link_state)) +
                  nla_total_size(sizeof(struct ifla_vf_rss_query_en)) +
                  nla_total_size(sizeof(struct ifla_vf_trust)));

        if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) {
                total += vf_count *
                         (nla_total_size(0) + /* nest IFLA_VF_STATS */
                          /* IFLA_VF_STATS_RX_PACKETS */
                          nla_total_size_64bit(sizeof(__u64)) +
                          /* IFLA_VF_STATS_TX_PACKETS */
                          nla_total_size_64bit(sizeof(__u64)) +
                          /* IFLA_VF_STATS_RX_BYTES */
                          nla_total_size_64bit(sizeof(__u64)) +
                          /* IFLA_VF_STATS_TX_BYTES */
                          nla_total_size_64bit(sizeof(__u64)) +
                          /* IFLA_VF_STATS_BROADCAST */
                          nla_total_size_64bit(sizeof(__u64)) +
                          /* IFLA_VF_STATS_MULTICAST */
                          nla_total_size_64bit(sizeof(__u64)) +
                          /* IFLA_VF_STATS_RX_DROPPED */
                          nla_total_size_64bit(sizeof(__u64)) +
                          /* IFLA_VF_STATS_TX_DROPPED */
                          nla_total_size_64bit(sizeof(__u64)));
        }

        return total;
}

/*
 * Space to reserve for the port attributes (IFLA_PORT_SELF and, when VFs
 * exist, IFLA_VF_PORTS) of @dev.
 *
 * Returns 0 if the driver has no ndo_get_vf_port, @dev has no parent device,
 * or RTEXT_FILTER_VF is not requested in @ext_filter_mask.
 */
static size_t rtnl_port_size(const struct net_device *dev,
                             u32 ext_filter_mask)
{
        size_t one_port, self_size, ports_nest, per_vf;

        if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
            !(ext_filter_mask & RTEXT_FILTER_VF))
                return 0;

        /* Attributes that may appear inside a single port description. */
        one_port = nla_total_size(4)                    /* PORT_VF */
                   + nla_total_size(PORT_PROFILE_MAX)   /* PORT_PROFILE */
                   + nla_total_size(PORT_UUID_MAX)      /* PORT_INSTANCE_UUID */
                   + nla_total_size(PORT_UUID_MAX)      /* PORT_HOST_UUID */
                   + nla_total_size(1)                  /* PORT_VDP_REQUEST */
                   + nla_total_size(2);                 /* PORT_VDP_RESPONSE */

        self_size = nla_total_size(sizeof(struct nlattr)) + one_port;
        ports_nest = nla_total_size(sizeof(struct nlattr));
        per_vf = nla_total_size(sizeof(struct nlattr)) + one_port;

        if (!dev_num_vf(dev->dev.parent))
                return self_size;

        return self_size + ports_nest +
                per_vf * dev_num_vf(dev->dev.parent);
}

/* Space to reserve for the IFLA_XDP nest and the attributes inside it. */
static size_t rtnl_xdp_size(void)
{
        return nla_total_size(0) +      /* nest IFLA_XDP */
               nla_total_size(1) +      /* XDP_ATTACHED */
               nla_total_size(4) +      /* XDP_PROG_ID (or 1st mode) */
               nla_total_size(4);       /* XDP_<mode>_PROG_ID */
}

static size_t rtnl_prop_list_size(const struct net_device *dev)
{
        struct netdev_name_node *name_node;
        unsigned int cnt = 0;

        rcu_read_lock();
        list_for_each_entry_rcu(name_node, &dev->name_node->list, list)
                cnt++;
        rcu_read_unlock();

        if (!cnt)
                return 0;

        return nla_total_size(0) + cnt * nla_total_size(ALTIFNAMSIZ);
}

/*
 * Space to reserve for the proto-down attributes: a 1-byte attribute plus a
 * nest holding one 4-byte attribute.  @dev is not consulted.
 */
static size_t rtnl_proto_down_size(const struct net_device *dev)
{
        size_t flag_size = nla_total_size(1);
        /* Assume dev->proto_down_reason is not zero. */
        size_t reason_size = nla_total_size(0) + nla_total_size(4);

        return flag_size + reason_size;
}

/*
 * Space to reserve for the IFLA_DEVLINK_PORT nest of @dev.  The nest header
 * is always counted; the port handle size is added only when
 * dev->devlink_port is set.
 */
static size_t rtnl_devlink_port_size(const struct net_device *dev)
{
        size_t nest_size = nla_total_size(0); /* nest IFLA_DEVLINK_PORT */

        if (!dev->devlink_port)
                return nest_size;

        return nest_size + devlink_nl_port_handle_size(dev->devlink_port);
}

/*
 * Space to reserve for the IFLA_DPLL_PIN nest of @dev: the nest header plus
 * whatever dpll_netdev_pin_handle_size() reports.
 */
static size_t rtnl_dpll_pin_size(const struct net_device *dev)
{
        size_t nest_size = nla_total_size(0); /* nest IFLA_DPLL_PIN */

        return nest_size + dpll_netdev_pin_handle_size(dev);
}

/*
 * Compute the space to reserve for a link message describing @dev: the
 * aligned ifinfomsg header plus one term per attribute listed below.
 * @ext_filter_mask is passed on to the VF, port and address-family size
 * helpers, so their contribution depends on what was requested.
 */
static noinline size_t if_nlmsg_size(const struct net_device *dev,
                                     u32 ext_filter_mask)
{
        size_t size = NLMSG_ALIGN(sizeof(struct ifinfomsg));

        size += nla_total_size(IFNAMSIZ); /* IFLA_IFNAME */
        size += nla_total_size(IFALIASZ); /* IFLA_IFALIAS */
        size += nla_total_size(IFNAMSIZ); /* IFLA_QDISC */
        size += nla_total_size_64bit(sizeof(struct rtnl_link_ifmap));
        size += nla_total_size(sizeof(struct rtnl_link_stats));
        size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64));
        size += nla_total_size(MAX_ADDR_LEN); /* IFLA_ADDRESS */
        size += nla_total_size(MAX_ADDR_LEN); /* IFLA_BROADCAST */
        size += nla_total_size(4); /* IFLA_TXQLEN */
        size += nla_total_size(4); /* IFLA_WEIGHT */
        size += nla_total_size(4); /* IFLA_MTU */
        size += nla_total_size(4); /* IFLA_LINK */
        size += nla_total_size(4); /* IFLA_MASTER */
        size += nla_total_size(1); /* IFLA_CARRIER */
        size += nla_total_size(4); /* IFLA_PROMISCUITY */
        size += nla_total_size(4); /* IFLA_ALLMULTI */
        size += nla_total_size(4); /* IFLA_NUM_TX_QUEUES */
        size += nla_total_size(4); /* IFLA_NUM_RX_QUEUES */
        size += nla_total_size(4); /* IFLA_GSO_MAX_SEGS */
        size += nla_total_size(4); /* IFLA_GSO_MAX_SIZE */
        size += nla_total_size(4); /* IFLA_GRO_MAX_SIZE */
        size += nla_total_size(4); /* IFLA_GSO_IPV4_MAX_SIZE */
        size += nla_total_size(4); /* IFLA_GRO_IPV4_MAX_SIZE */
        size += nla_total_size(4); /* IFLA_TSO_MAX_SIZE */
        size += nla_total_size(4); /* IFLA_TSO_MAX_SEGS */
        size += nla_total_size(1); /* IFLA_OPERSTATE */
        size += nla_total_size(1); /* IFLA_LINKMODE */
        size += nla_total_size(4); /* IFLA_CARRIER_CHANGES */
        size += nla_total_size(4); /* IFLA_LINK_NETNSID */
        size += nla_total_size(4); /* IFLA_GROUP */
        size += nla_total_size(ext_filter_mask
                               & RTEXT_FILTER_VF ? 4 : 0); /* IFLA_NUM_VF */
        size += rtnl_vfinfo_size(dev, ext_filter_mask); /* IFLA_VFINFO_LIST */
        size += rtnl_port_size(dev, ext_filter_mask); /* IFLA_VF_PORTS + IFLA_PORT_SELF */
        size += rtnl_link_get_size(dev); /* IFLA_LINKINFO */
        size += rtnl_link_get_af_size(dev, ext_filter_mask); /* IFLA_AF_SPEC */
        size += nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_PORT_ID */
        size += nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */
        size += nla_total_size(IFNAMSIZ); /* IFLA_PHYS_PORT_NAME */
        size += rtnl_xdp_size(); /* IFLA_XDP */
        size += nla_total_size(4);  /* IFLA_EVENT */
        size += nla_total_size(4);  /* IFLA_NEW_NETNSID */
        size += nla_total_size(4);  /* IFLA_NEW_IFINDEX */
        size += rtnl_proto_down_size(dev);  /* proto down */
        size += nla_total_size(4);  /* IFLA_TARGET_NETNSID */
        size += nla_total_size(4);  /* IFLA_CARRIER_UP_COUNT */
        size += nla_total_size(4);  /* IFLA_CARRIER_DOWN_COUNT */
        size += nla_total_size(4);  /* IFLA_MIN_MTU */
        size += nla_total_size(4);  /* IFLA_MAX_MTU */
        size += rtnl_prop_list_size(dev);
        size += nla_total_size(MAX_ADDR_LEN); /* IFLA_PERM_ADDRESS */
        size += rtnl_devlink_port_size(dev);
        size += rtnl_dpll_pin_size(dev);

        return size;
}

/*
 * Emit the IFLA_VF_PORTS nest for @dev: one IFLA_VF_PORT nest per VF, each
 * starting with IFLA_PORT_VF followed by whatever ndo_get_vf_port() adds.
 *
 * A VF whose ndo_get_vf_port() fails with anything other than -EMSGSIZE is
 * left out and the loop carries on.  Running out of room cancels the whole
 * IFLA_VF_PORTS nest and returns -EMSGSIZE.  Returns 0 otherwise.
 */
static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
{
        struct nlattr *ports_nest;
        struct nlattr *port_nest;
        int ret;
        int vf;

        ports_nest = nla_nest_start_noflag(skb, IFLA_VF_PORTS);
        if (!ports_nest)
                return -EMSGSIZE;

        for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) {
                port_nest = nla_nest_start_noflag(skb, IFLA_VF_PORT);
                if (!port_nest || nla_put_u32(skb, IFLA_PORT_VF, vf))
                        goto nla_put_failure;

                ret = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb);
                if (ret == -EMSGSIZE)
                        goto nla_put_failure;

                if (ret)
                        nla_nest_cancel(skb, port_nest);
                else
                        nla_nest_end(skb, port_nest);
        }

        nla_nest_end(skb, ports_nest);

        return 0;

nla_put_failure:
        nla_nest_cancel(skb, ports_nest);
        return -EMSGSIZE;
}

/*
 * Emit the IFLA_PORT_SELF nest for @dev, filled by ndo_get_vf_port() called
 * with PORT_SELF_VF.
 *
 * If the driver callback fails the nest is cancelled; only -EMSGSIZE is
 * reported to the caller, any other driver error is turned into 0.
 * Returns -EMSGSIZE as well when the nest cannot be opened.
 */
static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev)
{
        struct nlattr *nest;
        int ret;

        nest = nla_nest_start_noflag(skb, IFLA_PORT_SELF);
        if (!nest)
                return -EMSGSIZE;

        ret = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb);
        if (!ret) {
                nla_nest_end(skb, nest);
                return 0;
        }

        nla_nest_cancel(skb, nest);

        return (ret == -EMSGSIZE) ? ret : 0;
}

/*
 * Emit the port attributes of @dev: IFLA_PORT_SELF first, then
 * IFLA_VF_PORTS when dev_num_vf() reports at least one VF.
 *
 * Nothing is emitted (and 0 returned) if the driver lacks ndo_get_vf_port,
 * @dev has no parent device, or RTEXT_FILTER_VF is not set in
 * @ext_filter_mask.  Otherwise returns 0 or the first helper error.
 */
static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev,
                          u32 ext_filter_mask)
{
        int ret;

        if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
            !(ext_filter_mask & RTEXT_FILTER_VF))
                return 0;

        ret = rtnl_port_self_fill(skb, dev);
        if (ret)
                return ret;

        if (!dev_num_vf(dev->dev.parent))
                return 0;

        return rtnl_vf_ports_fill(skb, dev);
}

/*
 * Append IFLA_PHYS_PORT_ID for @dev to @skb.
 *
 * -EOPNOTSUPP from dev_get_phys_port_id() means "nothing to report" and
 * yields 0; any other lookup error is returned as is.  Returns -EMSGSIZE
 * if the attribute does not fit.
 */
static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
{
        struct netdev_phys_item_id ppid;
        int ret;

        ret = dev_get_phys_port_id(dev, &ppid);
        if (ret == -EOPNOTSUPP)
                return 0;
        if (ret)
                return ret;

        return nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id) ?
                -EMSGSIZE : 0;
}

/*
 * Append IFLA_PHYS_PORT_NAME for @dev to @skb.
 *
 * -EOPNOTSUPP from dev_get_phys_port_name() means "nothing to report" and
 * yields 0; any other lookup error is returned as is.  Returns -EMSGSIZE
 * if the attribute does not fit.
 */
static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev)
{
        char port_name[IFNAMSIZ];
        int ret;

        ret = dev_get_phys_port_name(dev, port_name, sizeof(port_name));
        if (ret == -EOPNOTSUPP)
                return 0;
        if (ret)
                return ret;

        return nla_put_string(skb, IFLA_PHYS_PORT_NAME, port_name) ?
                -EMSGSIZE : 0;
}

/*
 * Append IFLA_PHYS_SWITCH_ID for @dev to @skb, using the id obtained from
 * dev_get_port_parent_id() (called with its third argument false).
 *
 * -EOPNOTSUPP from the lookup means "nothing to report" and yields 0; any
 * other lookup error is returned as is.  Returns -EMSGSIZE if the attribute
 * does not fit.
 */
static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
{
        struct netdev_phys_item_id ppid = { };
        int ret;

        ret = dev_get_port_parent_id(dev, &ppid, false);
        if (ret == -EOPNOTSUPP)
                return 0;
        if (ret)
                return ret;

        return nla_put(skb, IFLA_PHYS_SWITCH_ID, ppid.id_len, ppid.id) ?
                -EMSGSIZE : 0;
}

/*
 * Append the device statistics of @dev to @skb twice: as IFLA_STATS64
 * (written in place by dev_get_stats()) and as IFLA_STATS (copied from the
 * 64-bit block by copy_rtnl_link_stats()).
 *
 * Returns 0 on success or -EMSGSIZE if either attribute cannot be reserved.
 */
static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb,
                                              struct net_device *dev)
{
        struct rtnl_link_stats64 *stats64;
        struct nlattr *nla;

        nla = nla_reserve_64bit(skb, IFLA_STATS64,
                                sizeof(struct rtnl_link_stats64), IFLA_PAD);
        if (!nla)
                return -EMSGSIZE;

        /* Let the device write straight into the reserved payload. */
        stats64 = nla_data(nla);
        dev_get_stats(dev, stats64);

        nla = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats));
        if (!nla)
                return -EMSGSIZE;

        copy_rtnl_link_stats(nla_data(nla), stats64);

        return 0;
}

/*
 * Emit one IFLA_VF_INFO nest describing VF number @vfs_num of @dev.
 *
 * The VF configuration is obtained from the driver's ndo_get_vf_config();
 * if that call fails the VF is silently skipped and 0 is returned.  GUIDs
 * are added only when the driver provides ndo_get_vf_guid() and it succeeds.
 * Per-VF statistics are added unless RTEXT_FILTER_SKIP_STATS is set in
 * @ext_filter_mask.
 *
 * Returns 0 on success (or skipped VF) and -EMSGSIZE when @skb has no room,
 * in which case the IFLA_VF_INFO nest is cancelled.
 */
static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
                                               struct net_device *dev,
                                               int vfs_num,
                                               u32 ext_filter_mask)
{
        struct ifla_vf_rss_query_en vf_rss_query_en;
        struct nlattr *vf, *vfstats, *vfvlanlist;
        struct ifla_vf_link_state vf_linkstate;
        struct ifla_vf_vlan_info vf_vlan_info;
        struct ifla_vf_spoofchk vf_spoofchk;
        struct ifla_vf_tx_rate vf_tx_rate;
        struct ifla_vf_stats vf_stats;
        struct ifla_vf_trust vf_trust;
        struct ifla_vf_vlan vf_vlan;
        struct ifla_vf_rate vf_rate;
        struct ifla_vf_mac vf_mac;
        struct ifla_vf_broadcast vf_broadcast;
        struct ifla_vf_info ivi;
        struct ifla_vf_guid node_guid;
        struct ifla_vf_guid port_guid;

        memset(&ivi, 0, sizeof(ivi));

        /* Not all SR-IOV capable drivers support the
         * spoofcheck and "RSS query enable" query.  Preset to
         * -1 so the user space tool can detect that the driver
         * didn't report anything.
         */
        ivi.spoofchk = -1;
        ivi.rss_query_en = -1;
        ivi.trusted = -1;
        /* The default value for VF link state is "auto"
         * IFLA_VF_LINK_STATE_AUTO which equals zero
         */
        ivi.linkstate = 0;
        /* VLAN Protocol by default is 802.1Q */
        ivi.vlan_proto = htons(ETH_P_8021Q);
        if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi))
                return 0;

        memset(&vf_vlan_info, 0, sizeof(vf_vlan_info));
        memset(&node_guid, 0, sizeof(node_guid));
        memset(&port_guid, 0, sizeof(port_guid));
        /* Only dev->addr_len bytes of vf_broadcast are written below, but
         * the whole structure is copied into the message.  Clear it first
         * so no uninitialized stack bytes are sent along.
         */
        memset(&vf_broadcast, 0, sizeof(vf_broadcast));

        /* Every per-VF structure carries the VF index reported by the driver. */
        vf_mac.vf =
                vf_vlan.vf =
                vf_vlan_info.vf =
                vf_rate.vf =
                vf_tx_rate.vf =
                vf_spoofchk.vf =
                vf_linkstate.vf =
                vf_rss_query_en.vf =
                vf_trust.vf =
                node_guid.vf =
                port_guid.vf = ivi.vf;

        memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
        memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len);
        vf_vlan.vlan = ivi.vlan;
        vf_vlan.qos = ivi.qos;
        vf_vlan_info.vlan = ivi.vlan;
        vf_vlan_info.qos = ivi.qos;
        vf_vlan_info.vlan_proto = ivi.vlan_proto;
        vf_tx_rate.rate = ivi.max_tx_rate;
        vf_rate.min_tx_rate = ivi.min_tx_rate;
        vf_rate.max_tx_rate = ivi.max_tx_rate;
        vf_spoofchk.setting = ivi.spoofchk;
        vf_linkstate.link_state = ivi.linkstate;
        vf_rss_query_en.setting = ivi.rss_query_en;
        vf_trust.setting = ivi.trusted;
        vf = nla_nest_start_noflag(skb, IFLA_VF_INFO);
        if (!vf)
                return -EMSGSIZE;
        if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
            nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) ||
            nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
            nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
                    &vf_rate) ||
            nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),
                    &vf_tx_rate) ||
            nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
                    &vf_spoofchk) ||
            nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate),
                    &vf_linkstate) ||
            nla_put(skb, IFLA_VF_RSS_QUERY_EN,
                    sizeof(vf_rss_query_en),
                    &vf_rss_query_en) ||
            nla_put(skb, IFLA_VF_TRUST,
                    sizeof(vf_trust), &vf_trust))
                goto nla_put_vf_failure;

        /* GUIDs are optional: emitted only if the driver can report them. */
        if (dev->netdev_ops->ndo_get_vf_guid &&
            !dev->netdev_ops->ndo_get_vf_guid(dev, vfs_num, &node_guid,
                                              &port_guid)) {
                if (nla_put(skb, IFLA_VF_IB_NODE_GUID, sizeof(node_guid),
                            &node_guid) ||
                    nla_put(skb, IFLA_VF_IB_PORT_GUID, sizeof(port_guid),
                            &port_guid))
                        goto nla_put_vf_failure;
        }
        vfvlanlist = nla_nest_start_noflag(skb, IFLA_VF_VLAN_LIST);
        if (!vfvlanlist)
                goto nla_put_vf_failure;
        if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info),
                    &vf_vlan_info)) {
                nla_nest_cancel(skb, vfvlanlist);
                goto nla_put_vf_failure;
        }
        nla_nest_end(skb, vfvlanlist);
        if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) {
                /* Counters stay zero when the driver has no ndo_get_vf_stats. */
                memset(&vf_stats, 0, sizeof(vf_stats));
                if (dev->netdev_ops->ndo_get_vf_stats)
                        dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
                                                          &vf_stats);
                vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS);
                if (!vfstats)
                        goto nla_put_vf_failure;
                if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
                                      vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
                    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
                                      vf_stats.tx_packets, IFLA_VF_STATS_PAD) ||
                    nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES,
                                      vf_stats.rx_bytes, IFLA_VF_STATS_PAD) ||
                    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES,
                                      vf_stats.tx_bytes, IFLA_VF_STATS_PAD) ||
                    nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
                                      vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
                    nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
                                      vf_stats.multicast, IFLA_VF_STATS_PAD) ||
                    nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED,
                                      vf_stats.rx_dropped, IFLA_VF_STATS_PAD) ||
                    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED,
                                      vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) {
                        nla_nest_cancel(skb, vfstats);
                        goto nla_put_vf_failure;
                }
                nla_nest_end(skb, vfstats);
        }
        nla_nest_end(skb, vf);
        return 0;

nla_put_vf_failure:
        nla_nest_cancel(skb, vf);
        return -EMSGSIZE;
}

/*
 * Emit the VF related attributes of @dev: IFLA_NUM_VF and, if the driver
 * implements ndo_get_vf_config, an IFLA_VFINFO_LIST nest with one entry per
 * VF produced by rtnl_fill_vfinfo().
 *
 * Nothing is emitted (and 0 returned) when @dev has no parent device or
 * RTEXT_FILTER_VF is not set in @ext_filter_mask.  Any failure while filling
 * a VF cancels the list nest and returns -EMSGSIZE.
 */
static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb,
                                           struct net_device *dev,
                                           u32 ext_filter_mask)
{
        struct nlattr *list;
        int vf_count;
        int vf;

        if (!dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF))
                return 0;

        vf_count = dev_num_vf(dev->dev.parent);
        if (nla_put_u32(skb, IFLA_NUM_VF, vf_count))
                return -EMSGSIZE;

        if (!dev->netdev_ops->ndo_get_vf_config)
                return 0;

        list = nla_nest_start_noflag(skb, IFLA_VFINFO_LIST);
        if (!list)
                return -EMSGSIZE;

        for (vf = 0; vf < vf_count; vf++) {
                if (!rtnl_fill_vfinfo(skb, dev, vf, ext_filter_mask))
                        continue;

                nla_nest_cancel(skb, list);
                return -EMSGSIZE;
        }

        nla_nest_end(skb, list);
        return 0;
}

static int rtnl_fill_link_ifmap(struct sk_buff *skb,
                                const struct net_device *dev)
{
        struct rtnl_link_ifmap map;

        memset(&map, 0, sizeof(map));
        map.mem_start = READ_ONCE(dev->mem_start);
        map.mem_end   = READ_ONCE(dev->mem_end);
        map.base_addr = READ_ONCE(dev->base_addr);
        map.irq       = READ_ONCE(dev->irq);
        map.dma       = READ_ONCE(dev->dma);
        map.port      = READ_ONCE(dev->if_port);

        if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD))
                return -EMSGSIZE;

        return 0;
}

/*
 * Return the id of the program referenced by dev->xdp_prog, or 0 if none is
 * set.  The pointer is dereferenced under rcu_read_lock().
 */
static u32 rtnl_xdp_prog_skb(struct net_device *dev)
{
        const struct bpf_prog *prog;
        u32 id;

        rcu_read_lock();
        prog = rcu_dereference(dev->xdp_prog);
        id = prog ? prog->aux->id : 0;
        rcu_read_unlock();

        return id;
}

/* Return what dev_xdp_prog_id() reports for @dev in XDP_MODE_DRV. */
static u32 rtnl_xdp_prog_drv(struct net_device *dev)
{
        u32 id = dev_xdp_prog_id(dev, XDP_MODE_DRV);

        return id;
}

/* Return what dev_xdp_prog_id() reports for @dev in XDP_MODE_HW. */
static u32 rtnl_xdp_prog_hw(struct net_device *dev)
{
        u32 id = dev_xdp_prog_id(dev, XDP_MODE_HW);

        return id;
}

/*
 * Report one XDP attachment point of @dev.
 *
 * @get_prog_id is asked for the program id; an id of 0 means nothing is
 * attached there and the function returns 0 without touching anything.
 * Otherwise the id is stored in *@prog_id and emitted as attribute @attr,
 * and *@mode is updated: it becomes @tgt_mode if it was XDP_ATTACHED_NONE,
 * or XDP_ATTACHED_MULTI if some mode had already been recorded.
 *
 * Returns 0 on success or the nla_put_u32() error; in the error case
 * *@prog_id has already been updated but *@mode has not.
 */
static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev,
                               u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr,
                               u32 (*get_prog_id)(struct net_device *dev))
{
        u32 id = get_prog_id(dev);
        int ret;

        if (!id)
                return 0;

        *prog_id = id;
        ret = nla_put_u32(skb, attr, id);
        if (ret)
                return ret;

        *mode = (*mode == XDP_ATTACHED_NONE) ? tgt_mode : XDP_ATTACHED_MULTI;

        return 0;
}

/*
 * Emit the IFLA_XDP nest for @dev.
 *
 * The SKB, DRV and HW attachment points are reported in that order through
 * rtnl_xdp_report_one(), followed by IFLA_XDP_ATTACHED with the resulting
 * mode.  IFLA_XDP_PROG_ID is added only when a program id was recorded and
 * the mode is not XDP_ATTACHED_MULTI.
 *
 * Returns 0 on success.  -EMSGSIZE is returned if the nest cannot be
 * opened; any later error cancels the nest and is returned unchanged.
 */
static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
{
        u8 mode = XDP_ATTACHED_NONE;
        struct nlattr *nest;
        u32 prog_id = 0;
        int ret;

        nest = nla_nest_start_noflag(skb, IFLA_XDP);
        if (!nest)
                return -EMSGSIZE;

        ret = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB,
                                  IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb);
        if (!ret)
                ret = rtnl_xdp_report_one(skb, dev, &prog_id, &mode,
                                          XDP_ATTACHED_DRV,
                                          IFLA_XDP_DRV_PROG_ID,
                                          rtnl_xdp_prog_drv);
        if (!ret)
                ret = rtnl_xdp_report_one(skb, dev, &prog_id, &mode,
                                          XDP_ATTACHED_HW,
                                          IFLA_XDP_HW_PROG_ID,
                                          rtnl_xdp_prog_hw);
        if (!ret)
                ret = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode);
        if (!ret && prog_id && mode != XDP_ATTACHED_MULTI)
                ret = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id);

        if (ret) {
                nla_nest_cancel(skb, nest);
                return ret;
        }

        nla_nest_end(skb, nest);
        return 0;
}

/*
 * Map a NETDEV_* notifier @event to the matching IFLA_EVENT_* value.
 * Events without a mapping yield IFLA_EVENT_NONE.
 */
static u32 rtnl_get_event(unsigned long event)
{
        switch (event) {
        case NETDEV_REBOOT:
                return IFLA_EVENT_REBOOT;
        case NETDEV_FEAT_CHANGE:
                return IFLA_EVENT_FEATURES;
        case NETDEV_BONDING_FAILOVER:
                return IFLA_EVENT_BONDING_FAILOVER;
        case NETDEV_NOTIFY_PEERS:
                return IFLA_EVENT_NOTIFY_PEERS;
        case NETDEV_RESEND_IGMP:
                return IFLA_EVENT_IGMP_RESEND;
        case NETDEV_CHANGEINFODATA:
                return IFLA_EVENT_BONDING_OPTIONS;
        default:
                return IFLA_EVENT_NONE;
        }
}

/*
 * Append IFLA_MASTER with the ifindex of the master upper device of @dev,
 * if there is one.  The lookup and the attribute write both happen under
 * rcu_read_lock().
 *
 * Returns 0 when there is no master, otherwise the nla_put_u32() result.
 */
static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev)
{
        const struct net_device *master;
        int err = 0;

        rcu_read_lock();

        master = netdev_master_upper_dev_get_rcu(dev);
        if (master) {
                int master_ifindex = READ_ONCE(master->ifindex);

                err = nla_put_u32(skb, IFLA_MASTER, master_ifindex);
        }

        rcu_read_unlock();
        return err;
}

/*
 * Append IFLA_LINK with dev_get_iflink(@dev).  The attribute is left out
 * when the value equals the device's own ifindex, unless @force is set.
 *
 * Returns 0 if nothing was written, otherwise the nla_put_u32() result.
 */
static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev,
                          bool force)
{
        int iflink = dev_get_iflink(dev);

        if (!force && READ_ONCE(dev->ifindex) == iflink)
                return 0;

        return nla_put_u32(skb, IFLA_LINK, iflink);
}

/*
 * Append IFLA_IFALIAS with the alias of @dev as copied by dev_get_alias().
 * The attribute is left out (and 0 returned) when dev_get_alias() does not
 * return a positive value.
 */
static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb,
                                              struct net_device *dev)
{
        char alias[IFALIASZ];

        if (dev_get_alias(dev, alias, sizeof(alias)) <= 0)
                return 0;

        return nla_put_string(skb, IFLA_IFALIAS, alias);
}

/*
 * Append IFLA_LINK_NETNSID and IFLA_LINK for @dev as needed.
 *
 * When the link ops of @dev provide get_link_net() and the namespace it
 * returns differs from dev_net(@dev), the id from peernet2id_alloc()
 * (called with @src_net, the link namespace and @gfp) is emitted as
 * IFLA_LINK_NETNSID, and IFLA_LINK is then written unconditionally.
 * Otherwise IFLA_LINK follows the usual nla_put_iflink() rule.
 *
 * Returns -EMSGSIZE if IFLA_LINK_NETNSID does not fit, else the result of
 * nla_put_iflink().
 */
static int rtnl_fill_link_netnsid(struct sk_buff *skb,
                                  const struct net_device *dev,
                                  struct net *src_net, gfp_t gfp)
{
        const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
        bool force_iflink = false;

        if (ops && ops->get_link_net) {
                struct net *link_net = ops->get_link_net(dev);

                if (!net_eq(dev_net(dev), link_net)) {
                        int nsid = peernet2id_alloc(src_net, link_net, gfp);

                        if (nla_put_s32(skb, IFLA_LINK_NETNSID, nsid))
                                return -EMSGSIZE;

                        force_iflink = true;
                }
        }

        return nla_put_iflink(skb, dev, force_iflink);
}

/* Fill the IFLA_AF_SPEC nest with one sub-nest per registered address
 * family that provides a fill_link_af() callback.
 *
 * The rtnl_af_ops list is walked with list_for_each_entry_rcu().
 *
 * Returns 0 on success or -EMSGSIZE when a nest cannot be started or a
 * callback fails with anything other than -ENODATA. Open nests are not
 * cancelled here on failure.
 */
static int rtnl_fill_link_af(struct sk_buff *skb,
                             const struct net_device *dev,
                             u32 ext_filter_mask)
{
        const struct rtnl_af_ops *af_ops;
        struct nlattr *af_spec;

        af_spec = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
        if (!af_spec)
                return -EMSGSIZE;

        list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
                struct nlattr *af;
                int err;

                if (!af_ops->fill_link_af)
                        continue;

                af = nla_nest_start_noflag(skb, af_ops->family);
                if (!af)
                        return -EMSGSIZE;

                err = af_ops->fill_link_af(skb, dev, ext_filter_mask);
                /*
                 * Caller may return ENODATA to indicate that there
                 * was no data to be dumped. This is not an error, it
                 * means we should trim the attribute header and
                 * continue.
                 */
                if (err == -ENODATA) {
                        nla_nest_cancel(skb, af);
                        /* The nest header is gone; do not finalize it. */
                        continue;
                }
                if (err < 0)
                        return -EMSGSIZE;

                nla_nest_end(skb, af);
        }

        nla_nest_end(skb, af_spec);
        return 0;
}

/* Emit one IFLA_ALT_IFNAME attribute per entry on the device's name-node
 * list. Returns the number of names written, or -EMSGSIZE if one does not
 * fit.
 */
static int rtnl_fill_alt_ifnames(struct sk_buff *skb,
                                 const struct net_device *dev)
{
        struct netdev_name_node *alt;
        int nr_names = 0;

        list_for_each_entry_rcu(alt, &dev->name_node->list, list) {
                if (nla_put_string(skb, IFLA_ALT_IFNAME, alt->name))
                        return -EMSGSIZE;

                nr_names++;
        }

        return nr_names;
}

/* RCU protected.
 *
 * Wrap the alternative interface names in an IFLA_PROP_LIST nest. The nest
 * is trimmed again when no name was written or when writing failed.
 * Returns 0 on success (including "no names") or a negative error.
 */
static int rtnl_fill_prop_list(struct sk_buff *skb,
                               const struct net_device *dev)
{
        struct nlattr *nest = nla_nest_start(skb, IFLA_PROP_LIST);
        int nr_names;

        if (!nest)
                return -EMSGSIZE;

        nr_names = rtnl_fill_alt_ifnames(skb, dev);
        if (nr_names > 0) {
                nla_nest_end(skb, nest);
                return 0;
        }

        /* Zero names or an error: drop the empty/partial nest. */
        nla_nest_cancel(skb, nest);
        return nr_names;
}

/* Emit IFLA_PROTO_DOWN and, when the reason mask is non-zero, an
 * IFLA_PROTO_DOWN_REASON nest holding IFLA_PROTO_DOWN_REASON_VALUE.
 * Returns 0 on success or -EMSGSIZE when the skb runs out of room.
 */
static int rtnl_fill_proto_down(struct sk_buff *skb,
                                const struct net_device *dev)
{
        struct nlattr *reason_nest;
        u32 reason;

        if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down)))
                return -EMSGSIZE;

        reason = READ_ONCE(dev->proto_down_reason);
        if (reason == 0)
                return 0;

        reason_nest = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON);
        if (!reason_nest)
                return -EMSGSIZE;

        if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, reason)) {
                nla_nest_cancel(skb, reason_nest);
                return -EMSGSIZE;
        }

        nla_nest_end(skb, reason_nest);
        return 0;
}

/* Emit an IFLA_DEVLINK_PORT nest. When the device has a devlink port, the
 * nest is filled by devlink_nl_port_handle_fill(); otherwise it is closed
 * empty. On a fill error the nest is cancelled and the error returned.
 */
static int rtnl_fill_devlink_port(struct sk_buff *skb,
                                  const struct net_device *dev)
{
        struct nlattr *nest = nla_nest_start(skb, IFLA_DEVLINK_PORT);
        int err = 0;

        if (!nest)
                return -EMSGSIZE;

        if (dev->devlink_port)
                err = devlink_nl_port_handle_fill(skb, dev->devlink_port);

        if (err < 0) {
                nla_nest_cancel(skb, nest);
                return err;
        }

        nla_nest_end(skb, nest);
        return 0;
}

/* Emit an IFLA_DPLL_PIN nest filled by dpll_netdev_add_pin_handle().
 * On a fill error the nest is cancelled and the error is returned.
 */
static int rtnl_fill_dpll_pin(struct sk_buff *skb,
                              const struct net_device *dev)
{
        struct nlattr *nest = nla_nest_start(skb, IFLA_DPLL_PIN);
        int err;

        if (!nest)
                return -EMSGSIZE;

        err = dpll_netdev_add_pin_handle(skb, dev);
        if (err < 0) {
                nla_nest_cancel(skb, nest);
                return err;
        }

        nla_nest_end(skb, nest);
        return 0;
}

/* Build one complete link message (ifinfomsg header plus IFLA_* attributes)
 * describing @dev and append it to @skb.
 *
 * @skb:             message buffer being filled
 * @dev:             device to describe
 * @src_net:         namespace passed to rtnl_fill_link_netnsid() for the
 *                   IFLA_LINK_NETNSID lookup
 * @type:            netlink message type stored in the header
 * @pid, @seq:       port id and sequence number stored in the header
 * @change:          value stored in ifi_change
 * @flags:           netlink message flags
 * @ext_filter_mask: passed through to the VF, port and AF_SPEC fillers
 * @event:           emitted as IFLA_EVENT unless it is IFLA_EVENT_NONE
 * @new_nsid:        if non-NULL, *new_nsid is emitted as IFLA_NEW_NETNSID
 * @new_ifindex:     if non-zero, emitted as IFLA_NEW_IFINDEX
 * @tgt_netnsid:     if >= 0, emitted as IFLA_TARGET_NETNSID
 * @gfp:             not referenced in this body; the netnsid fill below is
 *                   called with GFP_ATOMIC inside the RCU read-side section
 *
 * Must be called with RTNL held (ASSERT_RTNL()).
 *
 * Returns 0 on success. On any failure the partially built message is
 * removed with nlmsg_cancel() and -EMSGSIZE is returned.
 */
static int rtnl_fill_ifinfo(struct sk_buff *skb,
                            struct net_device *dev, struct net *src_net,
                            int type, u32 pid, u32 seq, u32 change,
                            unsigned int flags, u32 ext_filter_mask,
                            u32 event, int *new_nsid, int new_ifindex,
                            int tgt_netnsid, gfp_t gfp)
{
        char devname[IFNAMSIZ];
        struct ifinfomsg *ifm;
        struct nlmsghdr *nlh;
        struct Qdisc *qdisc;

        ASSERT_RTNL();
        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
        if (nlh == NULL)
                return -EMSGSIZE;

        /* Fixed-size header. */
        ifm = nlmsg_data(nlh);
        ifm->ifi_family = AF_UNSPEC;
        ifm->__ifi_pad = 0;
        ifm->ifi_type = READ_ONCE(dev->type);
        ifm->ifi_index = READ_ONCE(dev->ifindex);
        ifm->ifi_flags = dev_get_flags(dev);
        ifm->ifi_change = change;

        if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid))
                goto nla_put_failure;

        /* The name is copied into a local buffer before being emitted. */
        netdev_copy_name(dev, devname);
        if (nla_put_string(skb, IFLA_IFNAME, devname))
                goto nla_put_failure;

        /* Scalar device properties; any single failure aborts the message. */
        if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) ||
            nla_put_u8(skb, IFLA_OPERSTATE,
                       netif_running(dev) ? READ_ONCE(dev->operstate) :
                                            IF_OPER_DOWN) ||
            nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) ||
            nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) ||
            nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) ||
            nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) ||
            nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) ||
            nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) ||
            nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) ||
            nla_put_u32(skb, IFLA_NUM_TX_QUEUES,
                        READ_ONCE(dev->num_tx_queues)) ||
            nla_put_u32(skb, IFLA_GSO_MAX_SEGS,
                        READ_ONCE(dev->gso_max_segs)) ||
            nla_put_u32(skb, IFLA_GSO_MAX_SIZE,
                        READ_ONCE(dev->gso_max_size)) ||
            nla_put_u32(skb, IFLA_GRO_MAX_SIZE,
                        READ_ONCE(dev->gro_max_size)) ||
            nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE,
                        READ_ONCE(dev->gso_ipv4_max_size)) ||
            nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE,
                        READ_ONCE(dev->gro_ipv4_max_size)) ||
            nla_put_u32(skb, IFLA_TSO_MAX_SIZE,
                        READ_ONCE(dev->tso_max_size)) ||
            nla_put_u32(skb, IFLA_TSO_MAX_SEGS,
                        READ_ONCE(dev->tso_max_segs)) ||
#ifdef CONFIG_RPS
            nla_put_u32(skb, IFLA_NUM_RX_QUEUES,
                        READ_ONCE(dev->num_rx_queues)) ||
#endif
            put_master_ifindex(skb, dev) ||
            nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
            nla_put_ifalias(skb, dev) ||
            nla_put_u32(skb, IFLA_CARRIER_CHANGES,
                        atomic_read(&dev->carrier_up_count) +
                        atomic_read(&dev->carrier_down_count)) ||
            nla_put_u32(skb, IFLA_CARRIER_UP_COUNT,
                        atomic_read(&dev->carrier_up_count)) ||
            nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT,
                        atomic_read(&dev->carrier_down_count)))
                goto nla_put_failure;

        if (rtnl_fill_proto_down(skb, dev))
                goto nla_put_failure;

        if (event != IFLA_EVENT_NONE) {
                if (nla_put_u32(skb, IFLA_EVENT, event))
                        goto nla_put_failure;
        }

        /* Hardware and broadcast addresses, only for devices that have one. */
        if (dev->addr_len) {
                if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) ||
                    nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast))
                        goto nla_put_failure;
        }

        if (rtnl_phys_port_id_fill(skb, dev))
                goto nla_put_failure;

        if (rtnl_phys_port_name_fill(skb, dev))
                goto nla_put_failure;

        if (rtnl_phys_switch_id_fill(skb, dev))
                goto nla_put_failure;

        if (rtnl_fill_stats(skb, dev))
                goto nla_put_failure;

        if (rtnl_fill_vf(skb, dev, ext_filter_mask))
                goto nla_put_failure;

        if (rtnl_port_fill(skb, dev, ext_filter_mask))
                goto nla_put_failure;

        if (rtnl_xdp_fill(skb, dev))
                goto nla_put_failure;

        if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) {
                if (rtnl_link_fill(skb, dev) < 0)
                        goto nla_put_failure;
        }

        if (new_nsid &&
            nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0)
                goto nla_put_failure;
        if (new_ifindex &&
            nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0)
                goto nla_put_failure;

        /* Permanent address is emitted only if it is not all zeroes. */
        if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) &&
            nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr))
                goto nla_put_failure;

        /* The following fillers run inside an RCU read-side section; their
         * failure path must leave it via nla_put_failure_rcu.
         */
        rcu_read_lock();
        if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC))
                goto nla_put_failure_rcu;
        qdisc = rcu_dereference(dev->qdisc);
        if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id))
                goto nla_put_failure_rcu;
        if (rtnl_fill_link_af(skb, dev, ext_filter_mask))
                goto nla_put_failure_rcu;
        if (rtnl_fill_link_ifmap(skb, dev))
                goto nla_put_failure_rcu;
        if (rtnl_fill_prop_list(skb, dev))
                goto nla_put_failure_rcu;
        rcu_read_unlock();

        /* Parent device name and bus name, when a parent exists. */
        if (dev->dev.parent &&
            nla_put_string(skb, IFLA_PARENT_DEV_NAME,
                           dev_name(dev->dev.parent)))
                goto nla_put_failure;

        if (dev->dev.parent && dev->dev.parent->bus &&
            nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME,
                           dev->dev.parent->bus->name))
                goto nla_put_failure;

        if (rtnl_fill_devlink_port(skb, dev))
                goto nla_put_failure;

        if (rtnl_fill_dpll_pin(skb, dev))
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure_rcu:
        rcu_read_unlock();
nla_put_failure:
        /* Remove everything added since nlmsg_put(). */
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/* Validation policy for the top-level IFLA_* link attributes. It is handed
 * to the nlmsg_parse_*()/nla_parse_*() calls in this file that parse link
 * requests. Entries of type NLA_REJECT mark attributes that are rejected
 * when supplied in a request.
 */
static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
        [IFLA_IFNAME]                = { .type = NLA_STRING, .len = IFNAMSIZ-1 },
        [IFLA_ADDRESS]                = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
        [IFLA_BROADCAST]        = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
        [IFLA_MAP]                = { .len = sizeof(struct rtnl_link_ifmap) },
        [IFLA_MTU]                = { .type = NLA_U32 },
        [IFLA_LINK]                = { .type = NLA_U32 },
        [IFLA_MASTER]                = { .type = NLA_U32 },
        [IFLA_CARRIER]                = { .type = NLA_U8 },
        [IFLA_TXQLEN]                = { .type = NLA_U32 },
        [IFLA_WEIGHT]                = { .type = NLA_U32 },
        [IFLA_OPERSTATE]        = { .type = NLA_U8 },
        [IFLA_LINKMODE]                = { .type = NLA_U8 },
        [IFLA_LINKINFO]                = { .type = NLA_NESTED },
        [IFLA_NET_NS_PID]        = { .type = NLA_U32 },
        [IFLA_NET_NS_FD]        = { .type = NLA_U32 },
        /* IFLA_IFALIAS is a string, but policy is set to NLA_BINARY to
         * allow 0-length string (needed to remove an alias).
         */
        [IFLA_IFALIAS]                = { .type = NLA_BINARY, .len = IFALIASZ - 1 },
        [IFLA_VFINFO_LIST]        = {. type = NLA_NESTED },
        [IFLA_VF_PORTS]                = { .type = NLA_NESTED },
        [IFLA_PORT_SELF]        = { .type = NLA_NESTED },
        [IFLA_AF_SPEC]                = { .type = NLA_NESTED },
        [IFLA_EXT_MASK]                = { .type = NLA_U32 },
        [IFLA_PROMISCUITY]        = { .type = NLA_U32 },
        [IFLA_NUM_TX_QUEUES]        = { .type = NLA_U32 },
        [IFLA_NUM_RX_QUEUES]        = { .type = NLA_U32 },
        [IFLA_GSO_MAX_SEGS]        = { .type = NLA_U32 },
        [IFLA_GSO_MAX_SIZE]        = { .type = NLA_U32 },
        [IFLA_PHYS_PORT_ID]        = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
        [IFLA_CARRIER_CHANGES]        = { .type = NLA_U32 },  /* ignored */
        [IFLA_PHYS_SWITCH_ID]        = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
        [IFLA_LINK_NETNSID]        = { .type = NLA_S32 },
        [IFLA_PROTO_DOWN]        = { .type = NLA_U8 },
        [IFLA_XDP]                = { .type = NLA_NESTED },
        [IFLA_EVENT]                = { .type = NLA_U32 },
        [IFLA_GROUP]                = { .type = NLA_U32 },
        [IFLA_TARGET_NETNSID]        = { .type = NLA_S32 },
        [IFLA_CARRIER_UP_COUNT]        = { .type = NLA_U32 },
        [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
        [IFLA_MIN_MTU]                = { .type = NLA_U32 },
        [IFLA_MAX_MTU]                = { .type = NLA_U32 },
        [IFLA_PROP_LIST]        = { .type = NLA_NESTED },
        [IFLA_ALT_IFNAME]        = { .type = NLA_STRING,
                                    .len = ALTIFNAMSIZ - 1 },
        [IFLA_PERM_ADDRESS]        = { .type = NLA_REJECT },
        [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED },
        [IFLA_NEW_IFINDEX]        = NLA_POLICY_MIN(NLA_S32, 1),
        [IFLA_PARENT_DEV_NAME]        = { .type = NLA_NUL_STRING },
        [IFLA_GRO_MAX_SIZE]        = { .type = NLA_U32 },
        [IFLA_TSO_MAX_SIZE]        = { .type = NLA_REJECT },
        [IFLA_TSO_MAX_SEGS]        = { .type = NLA_REJECT },
        [IFLA_ALLMULTI]                = { .type = NLA_REJECT },
        [IFLA_GSO_IPV4_MAX_SIZE]        = { .type = NLA_U32 },
        [IFLA_GRO_IPV4_MAX_SIZE]        = { .type = NLA_U32 },
};

/* Policy for the IFLA_INFO_* attributes nested inside IFLA_LINKINFO;
 * used by linkinfo_to_kind_ops() when parsing that nest.
 */
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
        [IFLA_INFO_KIND]        = { .type = NLA_STRING },
        [IFLA_INFO_DATA]        = { .type = NLA_NESTED },
        [IFLA_INFO_SLAVE_KIND]        = { .type = NLA_STRING },
        [IFLA_INFO_SLAVE_DATA]        = { .type = NLA_NESTED },
};

/* Policy for the per-VF IFLA_VF_* attributes. Most entries only carry a
 * .len equal to the size of the matching ifla_vf_* structure;
 * IFLA_VF_BROADCAST is NLA_REJECT.
 * NOTE(review): presumably applied to each IFLA_VF_INFO nest before
 * do_setvfinfo() runs - the parse site is not visible here; confirm.
 */
static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
        [IFLA_VF_MAC]                = { .len = sizeof(struct ifla_vf_mac) },
        [IFLA_VF_BROADCAST]        = { .type = NLA_REJECT },
        [IFLA_VF_VLAN]                = { .len = sizeof(struct ifla_vf_vlan) },
        [IFLA_VF_VLAN_LIST]     = { .type = NLA_NESTED },
        [IFLA_VF_TX_RATE]        = { .len = sizeof(struct ifla_vf_tx_rate) },
        [IFLA_VF_SPOOFCHK]        = { .len = sizeof(struct ifla_vf_spoofchk) },
        [IFLA_VF_RATE]                = { .len = sizeof(struct ifla_vf_rate) },
        [IFLA_VF_LINK_STATE]        = { .len = sizeof(struct ifla_vf_link_state) },
        [IFLA_VF_RSS_QUERY_EN]        = { .len = sizeof(struct ifla_vf_rss_query_en) },
        [IFLA_VF_STATS]                = { .type = NLA_NESTED },
        [IFLA_VF_TRUST]                = { .len = sizeof(struct ifla_vf_trust) },
        [IFLA_VF_IB_NODE_GUID]        = { .len = sizeof(struct ifla_vf_guid) },
        [IFLA_VF_IB_PORT_GUID]        = { .len = sizeof(struct ifla_vf_guid) },
};

/* Policy for the IFLA_PORT_* attributes.
 * NOTE(review): presumably used for the IFLA_VF_PORTS / IFLA_PORT_SELF
 * nests - the parse sites are not visible here; confirm.
 */
static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
        [IFLA_PORT_VF]                = { .type = NLA_U32 },
        [IFLA_PORT_PROFILE]        = { .type = NLA_STRING,
                                    .len = PORT_PROFILE_MAX },
        [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY,
                                      .len = PORT_UUID_MAX },
        [IFLA_PORT_HOST_UUID]        = { .type = NLA_STRING,
                                    .len = PORT_UUID_MAX },
        [IFLA_PORT_REQUEST]        = { .type = NLA_U8, },
        [IFLA_PORT_RESPONSE]        = { .type = NLA_U16, },

        /* Unused, but we need to keep it here since user space could
         * fill it. It's also broken with regard to NLA_BINARY use in
         * combination with structs.
         */
        [IFLA_PORT_VSI_TYPE]        = { .type = NLA_BINARY,
                                    .len = sizeof(struct ifla_port_vsi) },
};

/* Policy for the IFLA_XDP_* attributes. The IFLA_XDP_UNSPEC entry only
 * carries .strict_start_type, set to IFLA_XDP_EXPECTED_FD.
 * NOTE(review): presumably used for the IFLA_XDP nest - the parse site is
 * not visible here; confirm.
 */
static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
        [IFLA_XDP_UNSPEC]        = { .strict_start_type = IFLA_XDP_EXPECTED_FD },
        [IFLA_XDP_FD]                = { .type = NLA_S32 },
        [IFLA_XDP_EXPECTED_FD]        = { .type = NLA_S32 },
        [IFLA_XDP_ATTACHED]        = { .type = NLA_U8 },
        [IFLA_XDP_FLAGS]        = { .type = NLA_U32 },
        [IFLA_XDP_PROG_ID]        = { .type = NLA_U32 },
};

/* Resolve the link ops named by IFLA_INFO_KIND inside an IFLA_LINKINFO
 * nest. Returns NULL when the nest fails to parse, when no kind is given,
 * or when rtnl_link_ops_get() finds nothing for that kind.
 */
static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
{
        struct nlattr *info[IFLA_INFO_MAX + 1];
        char kind[MODULE_NAME_LEN];

        if (nla_parse_nested_deprecated(info, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0)
                return NULL;

        if (!info[IFLA_INFO_KIND])
                return NULL;

        nla_strscpy(kind, info[IFLA_INFO_KIND], sizeof(kind));
        return rtnl_link_ops_get(kind);
}

static bool link_master_filtered(struct net_device *dev, int master_idx)
{
        struct net_device *master;

        if (!master_idx)
                return false;

        master = netdev_master_upper_dev_get(dev);

        /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need
         * another invalid value for ifindex to denote "no master".
         */
        if (master_idx == -1)
                return !!master;

        if (!master || master->ifindex != master_idx)
                return true;

        return false;
}

/* True when a kind filter is set and @dev's link ops differ from it. */
static bool link_kind_filtered(const struct net_device *dev,
                               const struct rtnl_link_ops *kind_ops)
{
        return kind_ops && dev->rtnl_link_ops != kind_ops;
}

/* True when @dev is excluded by either the master or the kind filter.
 * The master filter is evaluated first.
 */
static bool link_dump_filtered(struct net_device *dev,
                               int master_idx,
                               const struct rtnl_link_ops *kind_ops)
{
        return link_master_filtered(dev, master_idx) ||
               link_kind_filtered(dev, kind_ops);
}

/**
 * rtnl_get_net_ns_capable - Get netns if sufficiently privileged.
 * @sk: netlink socket
 * @netnsid: network namespace identifier
 *
 * Looks up @netnsid relative to the socket's own namespace. The reference
 * obtained by the lookup is dropped again if the capability check fails.
 *
 * Returns the network namespace identified by netnsid on success or an error
 * pointer on failure (-EINVAL for an unknown id, -EACCES when not capable).
 */
struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid)
{
        struct net *target = get_net_ns_by_id(sock_net(sk), netnsid);

        if (!target)
                return ERR_PTR(-EINVAL);

        /* For now, the caller is required to have CAP_NET_ADMIN in
         * the user namespace owning the target net ns.
         */
        if (sk_ns_capable(sk, target->user_ns, CAP_NET_ADMIN))
                return target;

        put_net(target);
        return ERR_PTR(-EACCES);
}
EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable);

/* Validate and parse the request header and attributes of a link dump.
 *
 * With @strict_check the header must be a full, zeroed ifinfomsg and the
 * attributes are parsed strictly. Without it, a legacy parse is done whose
 * header length depends on the payload size (see below).
 *
 * Returns 0 on success with @tb filled in, or a negative error.
 */
static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh,
                                      bool strict_check, struct nlattr **tb,
                                      struct netlink_ext_ack *extack)
{
        struct ifinfomsg *ifm;
        int hdrlen;

        if (!strict_check) {
                /* A hack to preserve kernel<->userspace interface.
                 * The correct header is ifinfomsg. It is consistent with
                 * rtnl_getlink. However, before Linux v3.9 the code here
                 * assumed rtgenmsg and that's what iproute2 < v3.9.0 used.
                 * We can detect the old iproute2. Even including the
                 * IFLA_EXT_MASK attribute, its netlink message is shorter
                 * than struct ifinfomsg.
                 */
                if (nlmsg_len(nlh) < sizeof(struct ifinfomsg))
                        hdrlen = sizeof(struct rtgenmsg);
                else
                        hdrlen = sizeof(struct ifinfomsg);

                return nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX,
                                              ifla_policy, extack);
        }

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
                NL_SET_ERR_MSG(extack, "Invalid header for link dump");
                return -EINVAL;
        }

        ifm = nlmsg_data(nlh);
        if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
            ifm->ifi_change) {
                NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request");
                return -EINVAL;
        }

        if (ifm->ifi_index) {
                NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps");
                return -EINVAL;
        }

        return nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb,
                                             IFLA_MAX, ifla_policy,
                                             extack);
}

/* Netlink dump callback that emits one RTM_NEWLINK message per device.
 *
 * The request may carry IFLA_TARGET_NETNSID (dump another namespace),
 * IFLA_EXT_MASK, IFLA_MASTER and IFLA_LINKINFO (filters). Any other
 * attribute is an error under strict checking and ignored otherwise.
 * A request that fails to parse is an error under strict checking;
 * otherwise the dump proceeds unfiltered.
 *
 * The resume position is kept in cb->ctx as the next ifindex.
 *
 * A reference on the target namespace is held only when
 * IFLA_TARGET_NETNSID was resolved successfully (netnsid >= 0); every exit
 * taken after that point goes through the "out" label so it is dropped.
 *
 * Returns 0, or the negative error of the request validation or of the
 * first rtnl_fill_ifinfo() call that failed.
 */
static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
        const struct rtnl_link_ops *kind_ops = NULL;
        struct netlink_ext_ack *extack = cb->extack;
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
        unsigned int flags = NLM_F_MULTI;
        struct nlattr *tb[IFLA_MAX+1];
        struct {
                unsigned long ifindex;
        } *ctx = (void *)cb->ctx;
        struct net *tgt_net = net;
        u32 ext_filter_mask = 0;
        struct net_device *dev;
        int master_idx = 0;
        int netnsid = -1;
        int err, i;

        err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack);
        if (err < 0) {
                if (cb->strict_check)
                        return err;

                /* tb[] is not usable after a failed parse: skip it. */
                goto walk_entries;
        }

        for (i = 0; i <= IFLA_MAX; ++i) {
                if (!tb[i])
                        continue;

                /* new attributes should only be added with strict checking */
                switch (i) {
                case IFLA_TARGET_NETNSID:
                        netnsid = nla_get_s32(tb[i]);
                        tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid);
                        if (IS_ERR(tgt_net)) {
                                NL_SET_ERR_MSG(extack, "Invalid target network namespace id");
                                err = PTR_ERR(tgt_net);
                                /* No reference was taken. */
                                netnsid = -1;
                                goto out;
                        }
                        break;
                case IFLA_EXT_MASK:
                        ext_filter_mask = nla_get_u32(tb[i]);
                        break;
                case IFLA_MASTER:
                        master_idx = nla_get_u32(tb[i]);
                        break;
                case IFLA_LINKINFO:
                        kind_ops = linkinfo_to_kind_ops(tb[i]);
                        break;
                default:
                        if (cb->strict_check) {
                                NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request");
                                /* tgt_net may already hold a reference
                                 * from an earlier IFLA_TARGET_NETNSID.
                                 */
                                err = -EINVAL;
                                goto out;
                        }
                }
        }

        if (master_idx || kind_ops)
                flags |= NLM_F_DUMP_FILTERED;

walk_entries:
        err = 0;
        for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
                if (link_dump_filtered(dev, master_idx, kind_ops))
                        continue;
                err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK,
                                       NETLINK_CB(cb->skb).portid,
                                       nlh->nlmsg_seq, 0, flags,
                                       ext_filter_mask, 0, NULL, 0,
                                       netnsid, GFP_KERNEL);
                if (err < 0)
                        break;
        }
        cb->seq = tgt_net->dev_base_seq;
        nl_dump_check_consistent(cb, nlmsg_hdr(skb));

out:
        if (netnsid >= 0)
                put_net(tgt_net);

        return err;
}

/* Parse the attributes that follow a struct ifinfomsg embedded in the
 * payload of @nla_peer, validating them against ifla_policy.
 *
 * The ifinfomsg's ifi_index must not be negative.
 * NOTE(review): the payload is assumed to be at least
 * sizeof(struct ifinfomsg) long; nothing here checks it - confirm callers
 * guarantee this.
 *
 * Returns 0 on success or a negative error.
 */
int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer,
                             struct netlink_ext_ack *exterr)
{
        const struct ifinfomsg *ifmp = nla_data(nla_peer);
        const struct nlattr *attrs;
        size_t len;

        if (ifmp->ifi_index < 0) {
                NL_SET_ERR_MSG_ATTR(exterr, nla_peer,
                                    "ifindex can't be negative");
                return -EINVAL;
        }

        /* The attribute stream starts right after the embedded header. */
        attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg);
        len = nla_len(nla_peer) - sizeof(struct ifinfomsg);

        return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy,
                                    exterr);
}
EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg);

/* Examine the link attributes and figure out which network namespace we
 * are talking about: IFLA_NET_NS_PID takes precedence over IFLA_NET_NS_FD;
 * with neither present a reference on @src_net is taken via get_net().
 *
 * Returns whatever the selected lookup helper returns.
 */
struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
{
        if (tb[IFLA_NET_NS_PID])
                return get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));

        if (tb[IFLA_NET_NS_FD])
                return get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));

        return get_net(src_net);
}
EXPORT_SYMBOL(rtnl_link_get_net);

/* Figure out which network namespace we are talking about by
 * examining the link attributes in the following order:
 *
 * 1. IFLA_NET_NS_PID
 * 2. IFLA_NET_NS_FD
 * 3. IFLA_TARGET_NETNSID
 *
 * With none of them present, a reference on @src_net is returned.
 * Returns ERR_PTR(-EINVAL) when IFLA_TARGET_NETNSID names no known
 * namespace; for the PID/FD cases the result of rtnl_link_get_net() is
 * passed through unchanged.
 */
static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net,
                                               struct nlattr *tb[])
{
        struct net *net;

        if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])
                return rtnl_link_get_net(src_net, tb);

        if (!tb[IFLA_TARGET_NETNSID])
                return get_net(src_net);

        /* IFLA_TARGET_NETNSID is declared NLA_S32 in ifla_policy, so read
         * it with the signed accessor.
         */
        net = get_net_ns_by_id(src_net, nla_get_s32(tb[IFLA_TARGET_NETNSID]));
        if (!net)
                return ERR_PTR(-EINVAL);

        return net;
}

/* Resolve the namespace selected by the link attributes and require that
 * the sender of @skb holds @cap in the user namespace owning it.
 *
 * Returns the namespace (with the reference taken by the lookup), the
 * lookup's error pointer, or ERR_PTR(-EPERM) after dropping the reference
 * when the capability check fails.
 */
static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb,
                                             struct net *src_net,
                                             struct nlattr *tb[], int cap)
{
        struct net *net = rtnl_link_get_net_by_nlattr(src_net, tb);

        if (IS_ERR(net))
                return net;

        if (netlink_ns_capable(skb, net->user_ns, cap))
                return net;

        put_net(net);
        return ERR_PTR(-EPERM);
}

/* Verify that rtnetlink requests do not pass additional properties
 * potentially referring to different network namespaces.
 *
 * With @netns_id_only, IFLA_NET_NS_PID and IFLA_NET_NS_FD are not accepted
 * at all (-EOPNOTSUPP). Otherwise at most one of IFLA_NET_NS_PID,
 * IFLA_NET_NS_FD and IFLA_TARGET_NETNSID may be present (-EINVAL).
 */
static int rtnl_ensure_unique_netns(struct nlattr *tb[],
                                    struct netlink_ext_ack *extack,
                                    bool netns_id_only)
{
        bool by_pid = tb[IFLA_NET_NS_PID] != NULL;
        bool by_fd = tb[IFLA_NET_NS_FD] != NULL;
        bool by_id;

        if (netns_id_only) {
                if (!by_pid && !by_fd)
                        return 0;

                NL_SET_ERR_MSG(extack, "specified netns attribute not supported");
                return -EOPNOTSUPP;
        }

        by_id = tb[IFLA_TARGET_NETNSID] != NULL;

        /* More than one selector present means they could conflict. */
        if (by_pid + by_fd + by_id > 1) {
                NL_SET_ERR_MSG(extack, "multiple netns identifying attributes specified");
                return -EINVAL;
        }

        return 0;
}

/* Forward a VF rate request to the driver's ndo_set_vf_rate() callback.
 *
 * Returns -EOPNOTSUPP when the driver has no such callback and -EINVAL
 * when a non-zero @max_tx_rate is below @min_tx_rate; otherwise the
 * driver's result.
 */
static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate,
                            int max_tx_rate)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (!ops->ndo_set_vf_rate)
                return -EOPNOTSUPP;

        /* A max of 0 is exempt from the ordering check. */
        if (max_tx_rate != 0 && min_tx_rate > max_tx_rate)
                return -EINVAL;

        return ops->ndo_set_vf_rate(dev, vf, min_tx_rate, max_tx_rate);
}

static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[],
                            struct netlink_ext_ack *extack)
{
        if (tb[IFLA_ADDRESS] &&
            nla_len(tb[IFLA_ADDRESS]) < dev->addr_len)
                return -EINVAL;

        if (tb[IFLA_BROADCAST] &&
            nla_len(tb[IFLA_BROADCAST]) < dev->addr_len)
                return -EINVAL;

        if (tb[IFLA_GSO_MAX_SIZE] &&
            nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) {
                NL_SET_ERR_MSG(extack, "too big gso_max_size");
                return -EINVAL;
        }

        if (tb[IFLA_GSO_MAX_SEGS] &&
            (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS ||
             nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) {
                NL_SET_ERR_MSG(extack, "too big gso_max_segs");
                return -EINVAL;
        }

        if (tb[IFLA_GRO_MAX_SIZE] &&
            nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) {
                NL_SET_ERR_MSG(extack, "too big gro_max_size");
                return -EINVAL;
        }

        if (tb[IFLA_GSO_IPV4_MAX_SIZE] &&
            nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) {
                NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size");
                return -EINVAL;
        }

        if (tb[IFLA_GRO_IPV4_MAX_SIZE] &&
            nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) {
                NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size");
                return -EINVAL;
        }

        if (tb[IFLA_AF_SPEC]) {
                struct nlattr *af;
                int rem, err;

                nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
                        const struct rtnl_af_ops *af_ops;

                        af_ops = rtnl_af_lookup(nla_type(af));
                        if (!af_ops)
                                return -EAFNOSUPPORT;

                        if (!af_ops->set_link_af)
                                return -EOPNOTSUPP;

                        if (af_ops->validate_link_af) {
                                err = af_ops->validate_link_af(dev, af, extack);
                                if (err < 0)
                                        return err;
                        }
                }
        }

        return 0;
}

/* Pass a VF GUID change to the driver's ndo_set_vf_guid() callback.
 * The callback pointer is not checked here.
 * NOTE(review): presumably the caller verifies it is non-NULL - confirm.
 */
static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt,
                                  int guid_type)
{
        return dev->netdev_ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid,
                                                guid_type);
}

/* Set a VF GUID; only supported on ARPHRD_INFINIBAND devices, any other
 * device type yields -EOPNOTSUPP.
 */
static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type)
{
        if (dev->type == ARPHRD_INFINIBAND)
                return handle_infiniband_guid(dev, ivt, guid_type);

        return -EOPNOTSUPP;
}

static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int err = -EINVAL;

        if (tb[IFLA_VF_MAC]) {
                struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]);

                if (ivm->vf >= INT_MAX)
                        return -EINVAL;
                err = -EOPNOTSUPP;
                if (ops->ndo_set_vf_mac)
                        err = ops->ndo_set_vf_mac(dev, ivm->vf,
                                                  ivm->mac);
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_VLAN]) {
                struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]);

                if (ivv->vf >= INT_MAX)
                        return -EINVAL;
                err = -EOPNOTSUPP;
                if (ops->ndo_set_vf_vlan)
                        err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan,
                                                   ivv->qos,
                                                   htons(ETH_P_8021Q));
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_VLAN_LIST]) {
                struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN];
                struct nlattr *attr;
                int rem, len = 0;

                err = -EOPNOTSUPP;
                if (!ops->ndo_set_vf_vlan)
                        return err;

                nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) {
                        if (nla_type(attr) != IFLA_VF_VLAN_INFO ||
                            nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) {
                                return -EINVAL;
                        }
                        if (len >= MAX_VLAN_LIST_LEN)
                                return -EOPNOTSUPP;
                        ivvl[len] = nla_data(attr);

                        len++;
                }
                if (len == 0)
                        return -EINVAL;

                if (ivvl[0]->vf >= INT_MAX)
                        return -EINVAL;
                err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan,
                                           ivvl[0]->qos, ivvl[0]->vlan_proto);
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_TX_RATE]) {
                struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]);
                struct ifla_vf_info ivf;

                if (ivt->vf >= INT_MAX)
                        return -EINVAL;
                err = -EOPNOTSUPP;
                if (ops->ndo_get_vf_config)
                        err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf);
                if (err < 0)
                        return err;

                err = rtnl_set_vf_rate(dev, ivt->vf,
                                       ivf.min_tx_rate, ivt->rate);
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_RATE]) {
                struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]);

                if (ivt->vf >= INT_MAX)
                        return -EINVAL;

                err = rtnl_set_vf_rate(dev, ivt->vf,
                                       ivt->min_tx_rate, ivt->max_tx_rate);
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_SPOOFCHK]) {
                struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]);

                if (ivs->vf >= INT_MAX)
                        return -EINVAL;
                err = -EOPNOTSUPP;
                if (ops->ndo_set_vf_spoofchk)
                        err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
                                                       ivs->setting);
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_LINK_STATE]) {
                struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]);

                if (ivl->vf >= INT_MAX)
                        return -EINVAL;
                err = -EOPNOTSUPP;
                if (ops->ndo_set_vf_link_state)
                        err = ops->ndo_set_vf_link_state(dev, ivl->vf,
                                                         ivl->link_state);
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_RSS_QUERY_EN]) {
                struct ifla_vf_rss_query_en *ivrssq_en;

                err = -EOPNOTSUPP;
                ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]);
                if (ivrssq_en->vf >= INT_MAX)
                        return -EINVAL;
                if (ops->ndo_set_vf_rss_query_en)
                        err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf,
                                                           ivrssq_en->setting);
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_TRUST]) {
                struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]);

                if (ivt->vf >= INT_MAX)
                        return -EINVAL;
                err = -EOPNOTSUPP;
                if (ops->ndo_set_vf_trust)
                        err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting);
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_IB_NODE_GUID]) {
                struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]);

                if (ivt->vf >= INT_MAX)
                        return -EINVAL;
                if (!ops->ndo_set_vf_guid)
                        return -EOPNOTSUPP;
                return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID);
        }

        if (tb[IFLA_VF_IB_PORT_GUID]) {
                struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]);

                if (ivt->vf >= INT_MAX)
                        return -EINVAL;
                if (!ops->ndo_set_vf_guid)
                        return -EOPNOTSUPP;

                return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID);
        }

        return err;
}

/*
 * Change the master (upper) device of @dev.
 *
 * If @dev currently has a master other than @ifindex, it is first released
 * through that master's ndo_del_slave(). A non-zero @ifindex is then looked
 * up in @dev's namespace and @dev is attached through ndo_add_slave();
 * @ifindex == 0 only detaches.
 *
 * Returns 0 on success (including "already enslaved to @ifindex"),
 * -EOPNOTSUPP when the needed callback is missing, -EINVAL when @ifindex
 * does not name a device, or the callback's own error.
 */
static int do_set_master(struct net_device *dev, int ifindex,
                         struct netlink_ext_ack *extack)
{
        struct net_device *master = netdev_master_upper_dev_get(dev);
        int ret;

        if (master) {
                if (master->ifindex == ifindex)
                        return 0;
                if (!master->netdev_ops->ndo_del_slave)
                        return -EOPNOTSUPP;

                ret = master->netdev_ops->ndo_del_slave(master, dev);
                if (ret)
                        return ret;
        }

        if (!ifindex)
                return 0;

        master = __dev_get_by_index(dev_net(dev), ifindex);
        if (!master)
                return -EINVAL;
        if (!master->netdev_ops->ndo_add_slave)
                return -EOPNOTSUPP;

        return master->netdev_ops->ndo_add_slave(master, dev, extack);
}

/*
 * Netlink policy for the attributes nested inside a protodown-reason
 * attribute: both the reason mask and the reason value are u32.
 */
static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = {
        [IFLA_PROTO_DOWN_REASON_MASK]        = { .type = NLA_U32 },
        [IFLA_PROTO_DOWN_REASON_VALUE]        = { .type = NLA_U32 },
};

/*
 * Update the protodown reason bits and/or the protodown state of @dev.
 *
 * @nl_proto_down:        optional u8 attribute with the requested state
 * @nl_proto_down_reason: optional nest holding a mandatory reason value and
 *                        an optional mask (mask defaults to 0)
 *
 * The reason nest, when present, is applied before the state change.
 * Clearing protodown is refused with -EBUSY while dev->proto_down_reason
 * is non-zero. Devices without IFF_CHANGE_PROTO_DOWN get -EOPNOTSUPP.
 */
static int do_set_proto_down(struct net_device *dev,
                             struct nlattr *nl_proto_down,
                             struct nlattr *nl_proto_down_reason,
                             struct netlink_ext_ack *extack)
{
        struct nlattr *reason_tb[IFLA_PROTO_DOWN_REASON_MAX + 1];
        bool want_down;
        int ret;

        if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) {
                NL_SET_ERR_MSG(extack, "Protodown not supported by device");
                return -EOPNOTSUPP;
        }

        if (nl_proto_down_reason) {
                unsigned long reason_mask = 0;
                u32 reason_value;

                ret = nla_parse_nested_deprecated(reason_tb,
                                                  IFLA_PROTO_DOWN_REASON_MAX,
                                                  nl_proto_down_reason,
                                                  ifla_proto_down_reason_policy,
                                                  NULL);
                if (ret < 0)
                        return ret;

                if (!reason_tb[IFLA_PROTO_DOWN_REASON_VALUE]) {
                        NL_SET_ERR_MSG(extack, "Invalid protodown reason value");
                        return -EINVAL;
                }
                reason_value = nla_get_u32(reason_tb[IFLA_PROTO_DOWN_REASON_VALUE]);

                if (reason_tb[IFLA_PROTO_DOWN_REASON_MASK])
                        reason_mask = nla_get_u32(reason_tb[IFLA_PROTO_DOWN_REASON_MASK]);

                dev_change_proto_down_reason(dev, reason_mask, reason_value);
        }

        if (!nl_proto_down)
                return 0;

        want_down = nla_get_u8(nl_proto_down);

        /* Protodown must stay on for as long as any reason is active. */
        if (!want_down && dev->proto_down_reason) {
                NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons");
                return -EBUSY;
        }

        return dev_change_proto_down(dev, want_down);
}

/*
 * Status bits accumulated by do_setlink():
 *  DO_SETLINK_MODIFIED - at least one change was committed to the device.
 *  DO_SETLINK_NOTIFY   - a change was committed and netdev_state_change()
 *                        should be called on exit.
 */
#define DO_SETLINK_MODIFIED        0x01
/* notify flag means notify + modified. */
#define DO_SETLINK_NOTIFY        0x03
/*
 * do_setlink - apply the attributes of a link message to an existing device.
 * @skb:    request buffer; passed to rtnl_link_get_net_capable() for the
 *          namespace move
 * @dev:    device to modify
 * @ifm:    ifinfomsg header of the request (ifi_index, ifi_flags and
 *          ifi_change are read here)
 * @extack: extended ack used for error reporting
 * @tb:     parsed IFLA_* attribute table
 * @status: initial DO_SETLINK_* bits; further bits are OR-ed in as changes
 *          are committed
 *
 * Attributes are applied one after another in the order coded below. The
 * first failing step jumps to errout; changes committed before that point
 * are not undone here, which is why errout may print a warning.
 *
 * Returns 0 on success, otherwise the error of the failing step.
 */
static int do_setlink(const struct sk_buff *skb,
                      struct net_device *dev, struct ifinfomsg *ifm,
                      struct netlink_ext_ack *extack,
                      struct nlattr **tb, int status)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        char ifname[IFNAMSIZ];
        int err;

        /* An empty ifname means "no name supplied" for the steps below. */
        if (tb[IFLA_IFNAME])
                nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
        else
                ifname[0] = '\0';

        /* Namespace move; a supplied name is passed along as the name pattern. */
        if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) {
                const char *pat = ifname[0] ? ifname : NULL;
                struct net *net;
                int new_ifindex;

                net = rtnl_link_get_net_capable(skb, dev_net(dev),
                                                tb, CAP_NET_ADMIN);
                if (IS_ERR(net)) {
                        err = PTR_ERR(net);
                        goto errout;
                }

                if (tb[IFLA_NEW_IFINDEX])
                        new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]);
                else
                        new_ifindex = 0;

                err = __dev_change_net_namespace(dev, net, pat, new_ifindex);
                /* The net reference is dropped whether or not the move succeeded. */
                put_net(net);
                if (err)
                        goto errout;
                status |= DO_SETLINK_MODIFIED;
        }

        /* Hardware mapping: copied field by field into a struct ifmap for the driver. */
        if (tb[IFLA_MAP]) {
                struct rtnl_link_ifmap *u_map;
                struct ifmap k_map;

                if (!ops->ndo_set_config) {
                        err = -EOPNOTSUPP;
                        goto errout;
                }

                if (!netif_device_present(dev)) {
                        err = -ENODEV;
                        goto errout;
                }

                u_map = nla_data(tb[IFLA_MAP]);
                k_map.mem_start = (unsigned long) u_map->mem_start;
                k_map.mem_end = (unsigned long) u_map->mem_end;
                k_map.base_addr = (unsigned short) u_map->base_addr;
                k_map.irq = (unsigned char) u_map->irq;
                k_map.dma = (unsigned char) u_map->dma;
                k_map.port = (unsigned char) u_map->port;

                err = ops->ndo_set_config(dev, &k_map);
                if (err < 0)
                        goto errout;

                status |= DO_SETLINK_NOTIFY;
        }

        /* Hardware address: wrapped in a temporary sockaddr for dev_set_mac_address_user(). */
        if (tb[IFLA_ADDRESS]) {
                struct sockaddr *sa;
                int len;

                /* Room for the family field plus the larger of addr_len and a full sockaddr. */
                len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len,
                                                  sizeof(*sa));
                sa = kmalloc(len, GFP_KERNEL);
                if (!sa) {
                        err = -ENOMEM;
                        goto errout;
                }
                sa->sa_family = dev->type;
                /*
                 * NOTE(review): copies dev->addr_len bytes without checking the
                 * attribute length here; presumably validated by the caller
                 * beforehand - confirm.
                 */
                memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
                       dev->addr_len);
                err = dev_set_mac_address_user(dev, sa, extack);
                kfree(sa);
                if (err)
                        goto errout;
                status |= DO_SETLINK_MODIFIED;
        }

        if (tb[IFLA_MTU]) {
                err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack);
                if (err < 0)
                        goto errout;
                status |= DO_SETLINK_MODIFIED;
        }

        if (tb[IFLA_GROUP]) {
                dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
                status |= DO_SETLINK_NOTIFY;
        }

        /*
         * Interface selected by interface index but interface
         * name provided implies that a name change has been
         * requested.
         */
        if (ifm->ifi_index > 0 && ifname[0]) {
                err = dev_change_name(dev, ifname);
                if (err < 0)
                        goto errout;
                status |= DO_SETLINK_MODIFIED;
        }

        if (tb[IFLA_IFALIAS]) {
                err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]),
                                    nla_len(tb[IFLA_IFALIAS]));
                if (err < 0)
                        goto errout;
                status |= DO_SETLINK_NOTIFY;
        }

        /* Broadcast address: written in place; status is not updated for this change. */
        if (tb[IFLA_BROADCAST]) {
                nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len);
                call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
        }

        /* Interface flags: status is not updated for this change either. */
        if (ifm->ifi_flags || ifm->ifi_change) {
                err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
                                       extack);
                if (err < 0)
                        goto errout;
        }

        if (tb[IFLA_MASTER]) {
                err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
                if (err)
                        goto errout;
                status |= DO_SETLINK_MODIFIED;
        }

        if (tb[IFLA_CARRIER]) {
                err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
                if (err)
                        goto errout;
                status |= DO_SETLINK_MODIFIED;
        }

        if (tb[IFLA_TXQLEN]) {
                unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]);

                err = dev_change_tx_queue_len(dev, value);
                if (err)
                        goto errout;
                status |= DO_SETLINK_MODIFIED;
        }

        /*
         * GSO/GRO limits: each one is written, and the device marked
         * modified, only when the requested value differs from the current one.
         */
        if (tb[IFLA_GSO_MAX_SIZE]) {
                u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]);

                if (dev->gso_max_size ^ max_size) {
                        netif_set_gso_max_size(dev, max_size);
                        status |= DO_SETLINK_MODIFIED;
                }
        }

        if (tb[IFLA_GSO_MAX_SEGS]) {
                u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);

                if (dev->gso_max_segs ^ max_segs) {
                        netif_set_gso_max_segs(dev, max_segs);
                        status |= DO_SETLINK_MODIFIED;
                }
        }

        if (tb[IFLA_GRO_MAX_SIZE]) {
                u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]);

                if (dev->gro_max_size ^ gro_max_size) {
                        netif_set_gro_max_size(dev, gro_max_size);
                        status |= DO_SETLINK_MODIFIED;
                }
        }

        if (tb[IFLA_GSO_IPV4_MAX_SIZE]) {
                u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]);

                if (dev->gso_ipv4_max_size ^ max_size) {
                        netif_set_gso_ipv4_max_size(dev, max_size);
                        status |= DO_SETLINK_MODIFIED;
                }
        }

        if (tb[IFLA_GRO_IPV4_MAX_SIZE]) {
                u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]);

                if (dev->gro_ipv4_max_size ^ gro_max_size) {
                        netif_set_gro_ipv4_max_size(dev, gro_max_size);
                        status |= DO_SETLINK_MODIFIED;
                }
        }

        if (tb[IFLA_OPERSTATE])
                set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));

        /* Link mode: always stored; a notification is requested only on change. */
        if (tb[IFLA_LINKMODE]) {
                unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]);

                if (dev->link_mode ^ value)
                        status |= DO_SETLINK_NOTIFY;
                WRITE_ONCE(dev->link_mode, value);
        }

        /* Per-VF settings: every nested IFLA_VF_INFO is parsed and applied in turn. */
        if (tb[IFLA_VFINFO_LIST]) {
                struct nlattr *vfinfo[IFLA_VF_MAX + 1];
                struct nlattr *attr;
                int rem;

                nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
                        if (nla_type(attr) != IFLA_VF_INFO ||
                            nla_len(attr) < NLA_HDRLEN) {
                                err = -EINVAL;
                                goto errout;
                        }
                        err = nla_parse_nested_deprecated(vfinfo, IFLA_VF_MAX,
                                                          attr,
                                                          ifla_vf_policy,
                                                          NULL);
                        if (err < 0)
                                goto errout;
                        err = do_setvfinfo(dev, vfinfo);
                        if (err < 0)
                                goto errout;
                        status |= DO_SETLINK_NOTIFY;
                }
        }
        /*
         * err has no initializer, so it may never have been assigned if none
         * of the attributes above were present; give it a defined value.
         */
        err = 0;

        /* Per-VF port profiles: each nested IFLA_VF_PORT must name its VF. */
        if (tb[IFLA_VF_PORTS]) {
                struct nlattr *port[IFLA_PORT_MAX+1];
                struct nlattr *attr;
                int vf;
                int rem;

                err = -EOPNOTSUPP;
                if (!ops->ndo_set_vf_port)
                        goto errout;

                nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) {
                        if (nla_type(attr) != IFLA_VF_PORT ||
                            nla_len(attr) < NLA_HDRLEN) {
                                err = -EINVAL;
                                goto errout;
                        }
                        err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX,
                                                          attr,
                                                          ifla_port_policy,
                                                          NULL);
                        if (err < 0)
                                goto errout;
                        if (!port[IFLA_PORT_VF]) {
                                err = -EOPNOTSUPP;
                                goto errout;
                        }
                        vf = nla_get_u32(port[IFLA_PORT_VF]);
                        err = ops->ndo_set_vf_port(dev, vf, port);
                        if (err < 0)
                                goto errout;
                        status |= DO_SETLINK_NOTIFY;
                }
        }
        /* Clears the -EOPNOTSUPP preset above if the nest turned out to be empty. */
        err = 0;

        /* Port profile of the device itself, addressed as PORT_SELF_VF. */
        if (tb[IFLA_PORT_SELF]) {
                struct nlattr *port[IFLA_PORT_MAX+1];

                err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX,
                                                  tb[IFLA_PORT_SELF],
                                                  ifla_port_policy, NULL);
                if (err < 0)
                        goto errout;

                err = -EOPNOTSUPP;
                if (ops->ndo_set_vf_port)
                        err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port);
                if (err < 0)
                        goto errout;
                status |= DO_SETLINK_NOTIFY;
        }

        /* Address-family specific settings, one nested attribute per family. */
        if (tb[IFLA_AF_SPEC]) {
                struct nlattr *af;
                int rem;

                nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
                        const struct rtnl_af_ops *af_ops;

                        /*
                         * NOTE(review): a failed lookup is treated as a bug here;
                         * presumably the family was already checked during
                         * validation - confirm.
                         */
                        BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af))));

                        err = af_ops->set_link_af(dev, af, extack);
                        if (err < 0)
                                goto errout;

                        status |= DO_SETLINK_NOTIFY;
                }
        }
        /* Drop any non-negative value left in err by the sections above. */
        err = 0;

        if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) {
                err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN],
                                        tb[IFLA_PROTO_DOWN_REASON], extack);
                if (err)
                        goto errout;
                status |= DO_SETLINK_NOTIFY;
        }

        /* XDP program attach/replace. */
        if (tb[IFLA_XDP]) {
                struct nlattr *xdp[IFLA_XDP_MAX + 1];
                u32 xdp_flags = 0;

                err = nla_parse_nested_deprecated(xdp, IFLA_XDP_MAX,
                                                  tb[IFLA_XDP],
                                                  ifla_xdp_policy, NULL);
                if (err < 0)
                        goto errout;

                /* These two attributes are rejected in a set request. */
                if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) {
                        err = -EINVAL;
                        goto errout;
                }

                if (xdp[IFLA_XDP_FLAGS]) {
                        xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]);
                        if (xdp_flags & ~XDP_FLAGS_MASK) {
                                err = -EINVAL;
                                goto errout;
                        }
                        /* At most one of the mode bits may be set. */
                        if (hweight32(xdp_flags & XDP_FLAGS_MODES) > 1) {
                                err = -EINVAL;
                                goto errout;
                        }
                }

                if (xdp[IFLA_XDP_FD]) {
                        int expected_fd = -1;

                        /* XDP_FLAGS_REPLACE requires the expected fd to be supplied. */
                        if (xdp_flags & XDP_FLAGS_REPLACE) {
                                if (!xdp[IFLA_XDP_EXPECTED_FD]) {
                                        err = -EINVAL;
                                        goto errout;
                                }
                                expected_fd =
                                        nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]);
                        }

                        err = dev_change_xdp_fd(dev, extack,
                                                nla_get_s32(xdp[IFLA_XDP_FD]),
                                                expected_fd,
                                                xdp_flags);
                        if (err)
                                goto errout;
                        status |= DO_SETLINK_NOTIFY;
                }
        }

errout:
        /*
         * Reached on success as well as on failure. If anything was
         * committed, send the state-change notification when both NOTIFY
         * bits are set, and warn when the request failed part-way.
         */
        if (status & DO_SETLINK_MODIFIED) {
                if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY)
                        netdev_state_change(dev);

                if (err < 0)
                        net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n",
                                             dev->name);
        }

        return err;
}

static struct net_device *rtnl_dev_get(struct net *net,
                                       struct nlattr *tb[])
{
        char ifname[ALTIFNAMSIZ];

        if (tb[IFLA_IFNAME])
                nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
        else if (tb[IFLA_ALT_IFNAME])
                nla_strscpy(ifname, tb[IFLA_ALT_IFNAME], ALTIFNAMSIZ);
        else
                return NULL;

        return __dev_get_by_name(net, ifname);
}

/*
 * Handle a set-link request: parse the message, find the device by index
 * or by name in the sender's namespace, validate the attributes against it
 * and apply them with do_setlink().
 *
 * Returns -EINVAL when the message identifies no device at all, -ENODEV
 * when the lookup fails, or the error of the failing step.
 */
static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *tb[IFLA_MAX+1];
        struct net_device *dev;
        struct ifinfomsg *ifm;
        int err;

        err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
                                     ifla_policy, extack);
        if (err < 0)
                return err;

        err = rtnl_ensure_unique_netns(tb, extack, false);
        if (err < 0)
                return err;

        ifm = nlmsg_data(nlh);

        /* A positive index wins over any name attribute. */
        if (ifm->ifi_index > 0)
                dev = __dev_get_by_index(net, ifm->ifi_index);
        else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
                dev = rtnl_dev_get(net, tb);
        else
                return -EINVAL;

        if (!dev)
                return -ENODEV;

        err = validate_linkmsg(dev, tb, extack);
        if (err < 0)
                return err;

        return do_setlink(skb, dev, ifm, extack, tb, 0);
}

static int rtnl_group_dellink(const struct net *net, int group)
{
        struct net_device *dev, *aux;
        LIST_HEAD(list_kill);
        bool found = false;

        if (!group)
                return -EPERM;

        for_each_netdev(net, dev) {
                if (dev->group == group) {
                        const struct rtnl_link_ops *ops;

                        found = true;
                        ops = dev->rtnl_link_ops;
                        if (!ops || !ops->dellink)
                                return -EOPNOTSUPP;
                }
        }

        if (!found)
                return -ENODEV;

        for_each_netdev_safe(net, dev, aux) {
                if (dev->group == group) {
                        const struct rtnl_link_ops *ops;

                        ops = dev->rtnl_link_ops;
                        ops->dellink(dev, &list_kill);
                }
        }
        unregister_netdevice_many(&list_kill);

        return 0;
}

/*
 * Delete @dev through its rtnl_link_ops->dellink() callback and unregister
 * whatever that callback queued, passing @portid and @nlh on to the
 * unregister notification.
 *
 * Returns -EOPNOTSUPP when the device has no link ops or no dellink
 * callback, 0 otherwise.
 */
int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh)
{
        const struct rtnl_link_ops *link_ops = dev->rtnl_link_ops;
        LIST_HEAD(doomed);

        if (link_ops && link_ops->dellink) {
                link_ops->dellink(dev, &doomed);
                unregister_netdevice_many_notify(&doomed, portid, nlh);
                return 0;
        }

        return -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(rtnl_delete_link);

/*
 * Handle a delete-link request.
 *
 * The target is selected, in order of precedence, by a positive ifi_index,
 * by IFLA_IFNAME / IFLA_ALT_IFNAME, or by IFLA_GROUP (which deletes every
 * device in that group). When IFLA_TARGET_NETNSID is present, every one of
 * these lookups is done in that namespace rather than in the sender's.
 *
 * Returns 0 on success, -EINVAL when the message selects nothing, -ENODEV
 * when the selected device does not exist, or the error of the failing
 * step.
 */
static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        u32 portid = NETLINK_CB(skb).portid;
        struct net *tgt_net = net;
        struct net_device *dev = NULL;
        struct ifinfomsg *ifm;
        struct nlattr *tb[IFLA_MAX+1];
        int err;
        int netnsid = -1;

        err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
                                     ifla_policy, extack);
        if (err < 0)
                return err;

        err = rtnl_ensure_unique_netns(tb, extack, true);
        if (err < 0)
                return err;

        if (tb[IFLA_TARGET_NETNSID]) {
                netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]);
                tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
                if (IS_ERR(tgt_net))
                        return PTR_ERR(tgt_net);
        }

        err = -EINVAL;
        ifm = nlmsg_data(nlh);
        if (ifm->ifi_index > 0)
                dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
        else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
                /*
                 * Resolve the name in the target namespace, like the index
                 * and group paths do. Searching the sender's namespace here
                 * could pick a same-named device in the wrong namespace.
                 */
                dev = rtnl_dev_get(tgt_net, tb);
        else if (tb[IFLA_GROUP])
                err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP]));
        else
                goto out;

        if (!dev) {
                /* Group deletion also ends up here and keeps its own result. */
                if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME] || ifm->ifi_index > 0)
                        err = -ENODEV;

                goto out;
        }

        err = rtnl_delete_link(dev, portid, nlh);

out:
        /* tgt_net holds a reference only when it came from a netnsid. */
        if (netnsid >= 0)
                put_net(tgt_net);

        return err;
}

/*
 * Apply the flags requested in @ifm (if any) to @dev and send the flag
 * change notification.
 *
 * A device that is not yet in state RTNL_LINK_INITIALIZED is moved to that
 * state and notified with every flag bit reported as changed; otherwise
 * only the bits that actually differ are reported.
 *
 * Returns 0, or the negative error from __dev_change_flags().
 */
int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm,
                        u32 portid, const struct nlmsghdr *nlh)
{
        unsigned int prev_flags = dev->flags;
        unsigned int changed;

        if (ifm && (ifm->ifi_flags || ifm->ifi_change)) {
                int ret;

                ret = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
                                         NULL);
                if (ret < 0)
                        return ret;
        }

        if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) {
                changed = prev_flags ^ dev->flags;
        } else {
                dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
                changed = ~0U;
        }
        __dev_notify_flags(dev, prev_flags, changed, portid, nlh);

        return 0;
}
EXPORT_SYMBOL(rtnl_configure_link);

struct net_device *rtnl_create_link(struct net *net, const char *ifname,
                                    unsigned char name_assign_type,
                                    const struct rtnl_link_ops *ops,
                                    struct nlattr *tb[],
                                    struct netlink_ext_ack *extack)
{
        struct net_device *dev;
        unsigned int num_tx_queues = 1;
        unsigned int num_rx_queues = 1;
        int err;

        if (tb[IFLA_NUM_TX_QUEUES])
                num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]);
        else if (ops->get_num_tx_queues)
                num_tx_queues = ops->get_num_tx_queues();

        if (tb[IFLA_NUM_RX_QUEUES])
                num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]);
        else if (ops->get_num_rx_queues)
                num_rx_queues = ops->get_num_rx_queues();

        if (num_tx_queues < 1 || num_tx_queues > 4096) {
                NL_SET_ERR_MSG(extack, "Invalid number of transmit queues");
                return ERR_PTR(-EINVAL);
        }

        if (num_rx_queues < 1 || num_rx_queues > 4096) {
                NL_SET_ERR_MSG(extack, "Invalid number of receive queues");
                return ERR_PTR(-EINVAL);
        }

        if (ops->alloc) {
                dev = ops->alloc(tb, ifname, name_assign_type,
                                 num_tx_queues, num_rx_queues);
                if (IS_ERR(dev))
                        return dev;
        } else {
                dev = alloc_netdev_mqs(ops->priv_size, ifname,
                                       name_assign_type, ops->setup,
                                       num_tx_queues, num_rx_queues);
        }

        if (!dev)
                return ERR_PTR(-ENOMEM);

        err = validate_linkmsg(dev, tb, extack);
        if (err < 0) {
                free_netdev(dev);
                return ERR_PTR(err);
        }

        dev_net_set(dev, net);
        dev->rtnl_link_ops = ops;
        dev->rtnl_link_state = RTNL_LINK_INITIALIZING;

        if (tb[IFLA_MTU]) {
                u32 mtu = nla_get_u32(tb[IFLA_MTU]);

                err = dev_validate_mtu(dev, mtu, extack);
                if (err) {
                        free_netdev(dev);
                        return ERR_PTR(err);
                }
                dev->mtu = mtu;
        }
        if (tb[IFLA_ADDRESS]) {
                __dev_addr_set(dev, nla_data(tb[IFLA_ADDRESS]),
                               nla_len(tb[IFLA_ADDRESS]));
                dev->addr_assign_type = NET_ADDR_SET;
        }
        if (tb[IFLA_BROADCAST])
                memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]),
                                nla_len(tb[IFLA_BROADCAST]));
        if (tb[IFLA_TXQLEN])
                dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
        if (tb[IFLA_OPERSTATE])
                set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
        if (tb[IFLA_LINKMODE])
                dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
        if (tb[IFLA_GROUP])
                dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
        if (tb[IFLA_GSO_MAX_SIZE])
                netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE]));
        if (tb[IFLA_GSO_MAX_SEGS])
                netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS]));
        if (tb[IFLA_GRO_MAX_SIZE])
                netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE]));
        if (tb[IFLA_GSO_IPV4_MAX_SIZE])
                netif_set_gso_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]));
        if (tb[IFLA_GRO_IPV4_MAX_SIZE])
                netif_set_gro_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]));

        return dev;
}
EXPORT_SYMBOL(rtnl_create_link);

/*
 * Apply a link change request to every device in @net whose group equals
 * @group.  Each matching device is first checked with validate_linkmsg()
 * and then updated through do_setlink().  The walk stops at the first
 * negative result, which is returned to the caller; devices handled before
 * that point keep their changes.  Returns 0 once the whole list was walked.
 */
static int rtnl_group_changelink(const struct sk_buff *skb,
                struct net *net, int group,
                struct ifinfomsg *ifm,
                struct netlink_ext_ack *extack,
                struct nlattr **tb)
{
        struct net_device *dev, *next;
        int rc;

        for_each_netdev_safe(net, dev, next) {
                if (dev->group != group)
                        continue;

                rc = validate_linkmsg(dev, tb, extack);
                if (rc >= 0)
                        rc = do_setlink(skb, dev, ifm, extack, tb, 0);
                if (rc < 0)
                        return rc;
        }

        return 0;
}

/*
 * Create and register a new link of the kind described by @ops.
 *
 * @skb:    request skb; supplies the socket's netns and the sender portid
 * @ifm:    ifinfomsg header of the request (ifi_index is copied to the device)
 * @ops:    link ops of the requested kind; must provide ->alloc or ->setup
 * @nlh:    request header, forwarded to rtnl_configure_link()
 * @tb:     parsed top-level IFLA_* attributes
 * @data:   parsed kind-specific attributes, or NULL
 * @extack: extended ack for error reporting
 *
 * Returns 0 on success or a negative errno.  Whatever the outcome past the
 * dest_net lookup, the function leaves through "out" so that the netns
 * references held in dest_net and link_net are dropped with put_net().
 */
static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
                               const struct rtnl_link_ops *ops,
                               const struct nlmsghdr *nlh,
                               struct nlattr **tb, struct nlattr **data,
                               struct netlink_ext_ack *extack)
{
        unsigned char name_assign_type = NET_NAME_USER;
        struct net *net = sock_net(skb->sk);
        u32 portid = NETLINK_CB(skb).portid;
        struct net *dest_net, *link_net;
        struct net_device *dev;
        char ifname[IFNAMSIZ];
        int err;

        if (!ops->alloc && !ops->setup)
                return -EOPNOTSUPP;

        /* Use the requested name, or fall back to a "<kind>%d" template. */
        if (tb[IFLA_IFNAME]) {
                nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
        } else {
                snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
                name_assign_type = NET_NAME_ENUM;
        }

        /* Namespace the device should finally live in. */
        dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN);
        if (IS_ERR(dest_net))
                return PTR_ERR(dest_net);

        /* Optional separate namespace for the link, looked up by nsid
         * relative to dest_net; the sender needs CAP_NET_ADMIN there too.
         */
        if (tb[IFLA_LINK_NETNSID]) {
                int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);

                link_net = get_net_ns_by_id(dest_net, id);
                if (!link_net) {
                        NL_SET_ERR_MSG(extack, "Unknown network namespace id");
                        err =  -EINVAL;
                        goto out;
                }
                err = -EPERM;
                if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN))
                        goto out;
        } else {
                link_net = NULL;
        }

        /* The device is created in link_net when one was given and is moved
         * to dest_net further down, after it has been configured.
         */
        dev = rtnl_create_link(link_net ? : dest_net, ifname,
                               name_assign_type, ops, tb, extack);
        if (IS_ERR(dev)) {
                err = PTR_ERR(dev);
                goto out;
        }

        dev->ifindex = ifm->ifi_index;

        if (ops->newlink)
                err = ops->newlink(link_net ? : net, dev, tb, data, extack);
        else
                err = register_netdevice(dev);
        if (err < 0) {
                /* Registration failed: only the allocation has to be undone. */
                free_netdev(dev);
                goto out;
        }

        /* From here on a failure has to go through out_unregister. */
        err = rtnl_configure_link(dev, ifm, portid, nlh);
        if (err < 0)
                goto out_unregister;
        if (link_net) {
                err = dev_change_net_namespace(dev, dest_net, ifname);
                if (err < 0)
                        goto out_unregister;
        }
        if (tb[IFLA_MASTER]) {
                err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
                if (err)
                        goto out_unregister;
        }
out:
        if (link_net)
                put_net(link_net);
        put_net(dest_net);
        return err;
out_unregister:
        /* Tear the device down the same way it was brought up, then jump
         * back to "out" to release the namespace references.
         */
        if (ops->newlink) {
                LIST_HEAD(list_kill);

                ops->dellink(dev, &list_kill);
                unregister_netdevice_many(&list_kill);
        } else {
                unregister_netdevice(dev);
        }
        goto out;
}

/*
 * Attribute tables used while handling a new-link request.  rtnl_newlink()
 * allocates one instance with kmalloc() and hands it to __rtnl_newlink().
 */
struct rtnl_newlink_tbs {
        /* top-level IFLA_* attributes of the message */
        struct nlattr *tb[IFLA_MAX + 1];
        /* kind-specific attributes parsed from IFLA_INFO_DATA */
        struct nlattr *attr[RTNL_MAX_TYPE + 1];
        /* master-specific attributes parsed from IFLA_INFO_SLAVE_DATA */
        struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
};

/*
 * Core of new-link handling: modify an existing device or create a new one.
 *
 * The message is parsed into tbs->tb.  IFLA_LINKINFO, when present, is
 * parsed into a local table; kind-specific and slave-specific nested
 * attributes go into tbs->attr and tbs->slave_attr.
 *
 * - If the device selected by ifi_index or by name exists, it is changed in
 *   place (->changelink, ->slave_changelink, then do_setlink()).
 * - Otherwise, without NLM_F_CREATE, the request fails with -ENODEV unless
 *   no link was specified and IFLA_GROUP is present, in which case the
 *   change is applied to the whole group.
 * - Otherwise the device is created through rtnl_newlink_create().
 *
 * With CONFIG_MODULES, an unknown kind triggers request_module(); if the
 * kind is registered afterwards, processing restarts at "replay" and the
 * whole message is parsed again.
 *
 * Returns 0 on success or a negative errno.
 */
static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct rtnl_newlink_tbs *tbs,
                          struct netlink_ext_ack *extack)
{
        struct nlattr *linkinfo[IFLA_INFO_MAX + 1];
        struct nlattr ** const tb = tbs->tb;
        const struct rtnl_link_ops *m_ops;
        struct net_device *master_dev;
        struct net *net = sock_net(skb->sk);
        const struct rtnl_link_ops *ops;
        struct nlattr **slave_data;
        char kind[MODULE_NAME_LEN];
        struct net_device *dev;
        struct ifinfomsg *ifm;
        struct nlattr **data;
        bool link_specified;
        int err;

#ifdef CONFIG_MODULES
replay:
#endif
        err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
                                     ifla_policy, extack);
        if (err < 0)
                return err;

        err = rtnl_ensure_unique_netns(tb, extack, false);
        if (err < 0)
                return err;

        /* Locate the target device: a positive ifindex takes precedence
         * over a name; link_specified records whether either was given.
         */
        ifm = nlmsg_data(nlh);
        if (ifm->ifi_index > 0) {
                link_specified = true;
                dev = __dev_get_by_index(net, ifm->ifi_index);
        } else if (ifm->ifi_index < 0) {
                NL_SET_ERR_MSG(extack, "ifindex can't be negative");
                return -EINVAL;
        } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) {
                link_specified = true;
                dev = rtnl_dev_get(net, tb);
        } else {
                link_specified = false;
                dev = NULL;
        }

        /* For an existing device, pick up the link ops of its master. */
        master_dev = NULL;
        m_ops = NULL;
        if (dev) {
                master_dev = netdev_master_upper_dev_get(dev);
                if (master_dev)
                        m_ops = master_dev->rtnl_link_ops;
        }

        if (tb[IFLA_LINKINFO]) {
                err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX,
                                                  tb[IFLA_LINKINFO],
                                                  ifla_info_policy, NULL);
                if (err < 0)
                        return err;
        } else
                memset(linkinfo, 0, sizeof(linkinfo));

        /* Resolve the requested kind to its link ops; kind stays an empty
         * string when IFLA_INFO_KIND is absent.
         */
        if (linkinfo[IFLA_INFO_KIND]) {
                nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
                ops = rtnl_link_ops_get(kind);
        } else {
                kind[0] = '\0';
                ops = NULL;
        }

        data = NULL;
        if (ops) {
                /* tbs->attr only has room for RTNL_MAX_TYPE + 1 entries */
                if (ops->maxtype > RTNL_MAX_TYPE)
                        return -EINVAL;

                if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
                        err = nla_parse_nested_deprecated(tbs->attr, ops->maxtype,
                                                          linkinfo[IFLA_INFO_DATA],
                                                          ops->policy, extack);
                        if (err < 0)
                                return err;
                        data = tbs->attr;
                }
                if (ops->validate) {
                        err = ops->validate(tb, data, extack);
                        if (err < 0)
                                return err;
                }
        }

        slave_data = NULL;
        if (m_ops) {
                /* tbs->slave_attr only has room for RTNL_SLAVE_MAX_TYPE + 1 entries */
                if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
                        return -EINVAL;

                if (m_ops->slave_maxtype &&
                    linkinfo[IFLA_INFO_SLAVE_DATA]) {
                        err = nla_parse_nested_deprecated(tbs->slave_attr,
                                                          m_ops->slave_maxtype,
                                                          linkinfo[IFLA_INFO_SLAVE_DATA],
                                                          m_ops->slave_policy,
                                                          extack);
                        if (err < 0)
                                return err;
                        slave_data = tbs->slave_attr;
                }
        }

        /* The device already exists: treat the request as a modification. */
        if (dev) {
                int status = 0;

                if (nlh->nlmsg_flags & NLM_F_EXCL)
                        return -EEXIST;
                if (nlh->nlmsg_flags & NLM_F_REPLACE)
                        return -EOPNOTSUPP;

                err = validate_linkmsg(dev, tb, extack);
                if (err < 0)
                        return err;

                if (linkinfo[IFLA_INFO_DATA]) {
                        /* The kind in the message must match the device's
                         * own link ops and support in-place changes.
                         */
                        if (!ops || ops != dev->rtnl_link_ops ||
                            !ops->changelink)
                                return -EOPNOTSUPP;

                        err = ops->changelink(dev, tb, data, extack);
                        if (err < 0)
                                return err;
                        status |= DO_SETLINK_NOTIFY;
                }

                if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
                        if (!m_ops || !m_ops->slave_changelink)
                                return -EOPNOTSUPP;

                        err = m_ops->slave_changelink(master_dev, dev, tb,
                                                      slave_data, extack);
                        if (err < 0)
                                return err;
                        status |= DO_SETLINK_NOTIFY;
                }

                return do_setlink(skb, dev, ifm, extack, tb, status);
        }

        if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
                /* No device was found and NLM_F_CREATE is not set: either
                 * the requested device does not exist, or the request
                 * targets a whole group.
                 */
                if (link_specified)
                        return -ENODEV;
                if (tb[IFLA_GROUP])
                        return rtnl_group_changelink(skb, net,
                                                nla_get_u32(tb[IFLA_GROUP]),
                                                ifm, extack, tb);
                return -ENODEV;
        }

        /* Creation path: these attributes are rejected for new devices. */
        if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
                return -EOPNOTSUPP;

        if (!ops) {
#ifdef CONFIG_MODULES
                if (kind[0]) {
                        /* request_module() is called between __rtnl_unlock()
                         * and rtnl_lock(); since the lock was dropped, the
                         * message is re-processed from the top if the kind
                         * became available.
                         */
                        __rtnl_unlock();
                        request_module("rtnl-link-%s", kind);
                        rtnl_lock();
                        ops = rtnl_link_ops_get(kind);
                        if (ops)
                                goto replay;
                }
#endif
                NL_SET_ERR_MSG(extack, "Unknown device type");
                return -EOPNOTSUPP;
        }

        return rtnl_newlink_create(skb, ifm, ops, nlh, tb, data, extack);
}

/*
 * Entry point for new-link requests.  The attribute tables are kept in a
 * heap-allocated struct rtnl_newlink_tbs rather than on the stack; the
 * actual work is done by __rtnl_newlink().  Returns -ENOMEM when the tables
 * cannot be allocated, otherwise the result of __rtnl_newlink().
 */
static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct rtnl_newlink_tbs *tables = kmalloc(sizeof(*tables), GFP_KERNEL);
        int rc = -ENOMEM;

        if (tables) {
                rc = __rtnl_newlink(skb, nlh, tables, extack);
                kfree(tables);
        }

        return rc;
}

/*
 * Validate a get-link request and parse its attributes into @tb.
 *
 * The message must be large enough to hold a struct ifinfomsg.  When
 * netlink_strict_get_check() is false for @skb the attributes are simply
 * parsed with the deprecated (lenient) parser.  Otherwise the unused header
 * fields must be zero, parsing is strict, and only IFLA_IFNAME,
 * IFLA_ALT_IFNAME, IFLA_EXT_MASK and IFLA_TARGET_NETNSID are accepted.
 *
 * Returns 0 on success or a negative errno, with a message set in @extack
 * for the checks performed here.
 */
static int rtnl_valid_getlink_req(struct sk_buff *skb,
                                  const struct nlmsghdr *nlh,
                                  struct nlattr **tb,
                                  struct netlink_ext_ack *extack)
{
        struct ifinfomsg *hdr;
        int attr, rc;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*hdr))) {
                NL_SET_ERR_MSG(extack, "Invalid header for get link");
                return -EINVAL;
        }

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(*hdr), tb, IFLA_MAX,
                                              ifla_policy, extack);

        /* Strict mode: every header field other than the index must be 0. */
        hdr = nlmsg_data(nlh);
        if (hdr->__ifi_pad || hdr->ifi_type || hdr->ifi_flags ||
            hdr->ifi_change) {
                NL_SET_ERR_MSG(extack, "Invalid values in header for get link request");
                return -EINVAL;
        }

        rc = nlmsg_parse_deprecated_strict(nlh, sizeof(*hdr), tb, IFLA_MAX,
                                           ifla_policy, extack);
        if (rc)
                return rc;

        /* Reject any attribute outside the small supported set. */
        for (attr = 0; attr <= IFLA_MAX; attr++) {
                if (!tb[attr])
                        continue;

                if (attr == IFLA_IFNAME || attr == IFLA_ALT_IFNAME ||
                    attr == IFLA_EXT_MASK || attr == IFLA_TARGET_NETNSID)
                        continue;

                NL_SET_ERR_MSG(extack, "Unsupported attribute in get link request");
                return -EINVAL;
        }

        return 0;
}

static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct net *tgt_net = net;
        struct ifinfomsg *ifm;
        struct nlattr *tb[IFLA_MAX+1];
        struct net_device *dev = NULL;
        struct sk_buff *nskb;
        int netnsid = -1;
        int err;
        u32 ext_filter_mask = 0;

        err = rtnl_valid_getlink_req(skb, nlh, tb, extack);
        if (err < 0)
                return err;

        err = rtnl_ensure_unique_netns(tb, extack, true);
        if (err < 0)
                return err;

        if (tb[IFLA_TARGET_NETNSID]) {
                netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]);
                tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
                if (IS_ERR(tgt_net))
                        return PTR_ERR(tgt_net);
        }

        if (tb[IFLA_EXT_MASK])
                ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);

        err = -EINVAL;
        ifm = nlmsg_data(nlh);
        if (ifm->ifi_index > 0)
                dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
        else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
                dev = rtnl_dev_get(tgt_net, tb);
        else
                goto out;

        err = -ENODEV;
        if (dev == NULL)
                goto out;

        err = -ENOBUFS;
        nskb = nlmsg_new_large(if_nlmsg_size(dev, ext_filter_mask));
        if (nskb == NULL)
                goto out;

        /* Synchronize the carrier state so we don't report a state
         * that we're not actually going to honour immediately; if
         * the driver just did a carrier off->on transition, we can
         * only TX if link watch work has run, but without this we'd
         * already report carrier on, even if it doesn't work yet.
         */
        linkwatch_sync_dev(dev);

        err = rtnl_fill_ifinfo(nskb, dev, net,
                               RTM_NEWLINK, NETLINK_CB(skb).portid,
                               nlh->nlmsg_seq, 0, 0, ext_filter_mask,
                               0, NULL, 0, netnsid, GFP_KERNEL);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in if_nlmsg_size */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(nskb);
        } else
                err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
out:
        if (netnsid >= 0)
                put_net(tgt_net);

        return err;
}

/*
 * Add or remove one alternative interface name on @dev.
 *
 * @cmd:     RTM_NEWLINKPROP to add the name, RTM_DELLINKPROP to remove it
 * @dev:     device to operate on
 * @attr:    attribute carrying the name
 * @changed: set to true when the operation succeeded; untouched otherwise
 * @extack:  extended ack for error reporting
 *
 * When adding, the request is refused with -EINVAL if the property list
 * plus one more ALTIFNAMSIZ-sized attribute would reach U16_MAX.
 *
 * Returns 0 on success or a negative errno.
 */
static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr,
                           bool *changed, struct netlink_ext_ack *extack)
{
        size_t list_size;
        char *name;
        int rc;

        rc = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack);
        if (rc)
                return rc;

        if (cmd == RTM_NEWLINKPROP) {
                list_size = rtnl_prop_list_size(dev);
                list_size += nla_total_size(ALTIFNAMSIZ);
                if (list_size >= U16_MAX) {
                        NL_SET_ERR_MSG(extack,
                                       "effective property list too long");
                        return -EINVAL;
                }
        }

        name = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
        if (!name)
                return -ENOMEM;

        switch (cmd) {
        case RTM_NEWLINKPROP:
                rc = netdev_name_node_alt_create(dev, name);
                /* On success the copy must not be freed below; presumably
                 * the name node now owns it.
                 */
                if (!rc)
                        name = NULL;
                break;
        case RTM_DELLINKPROP:
                rc = netdev_name_node_alt_destroy(dev, name);
                break;
        default:
                WARN_ON_ONCE(1);
                rc = -EINVAL;
                break;
        }

        kfree(name);
        if (rc)
                return rc;

        *changed = true;
        return 0;
}

/*
 * Common handler for the link property add/delete requests.
 *
 * The device is selected by positive ifi_index or by name.  Every
 * IFLA_ALT_IFNAME nested in IFLA_PROP_LIST is passed to rtnl_alt_ifname()
 * with @cmd; other nested attribute types are ignored.  Processing stops at
 * the first error, which is returned as is.  When at least one entry was
 * applied and no error occurred, netdev_state_change() is called once.
 *
 * Returns 0 on success (including when IFLA_PROP_LIST is absent), -EINVAL
 * when no device selector was given, -ENODEV when the device is unknown, or
 * another negative errno from the helpers.
 */
static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh,
                         struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *tb[IFLA_MAX + 1];
        struct nlattr *prop;
        struct net_device *dev;
        struct ifinfomsg *ifm;
        bool changed = false;
        int rc, remaining;

        rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack);
        if (rc)
                return rc;

        rc = rtnl_ensure_unique_netns(tb, extack, true);
        if (rc)
                return rc;

        ifm = nlmsg_data(nlh);
        if (ifm->ifi_index > 0) {
                dev = __dev_get_by_index(net, ifm->ifi_index);
        } else {
                if (!tb[IFLA_IFNAME] && !tb[IFLA_ALT_IFNAME])
                        return -EINVAL;
                dev = rtnl_dev_get(net, tb);
        }

        if (!dev)
                return -ENODEV;

        if (!tb[IFLA_PROP_LIST])
                return 0;

        nla_for_each_nested(prop, tb[IFLA_PROP_LIST], remaining) {
                if (nla_type(prop) != IFLA_ALT_IFNAME)
                        continue;

                rc = rtnl_alt_ifname(cmd, dev, prop, &changed, extack);
                if (rc)
                        return rc;
        }

        if (changed)
                netdev_state_change(dev);
        return 0;
}

/* Link property "add" handler: runs rtnl_linkprop() with RTM_NEWLINKPROP. */
static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct netlink_ext_ack *extack)
{
        return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack);
}

/* Link property "delete" handler: runs rtnl_linkprop() with RTM_DELLINKPROP. */
static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct netlink_ext_ack *extack)
{
        return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack);
}

/*
 * Compute a buffer size for a link dump request.
 *
 * Without an IFLA_EXT_MASK filter (or when the message fails to parse) the
 * result is NLMSG_GOODSIZE.  With a filter mask, every device of the
 * requester's namespace is visited under RCU and the result is
 * nlmsg_total_size() of the largest if_nlmsg_size() seen.
 */
static u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *tb[IFLA_MAX+1];
        size_t largest = 0;
        u32 ext_filter_mask = 0;
        struct net_device *dev;
        int hdrlen;

        /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */
        if (nlmsg_len(nlh) < sizeof(struct ifinfomsg))
                hdrlen = sizeof(struct rtgenmsg);
        else
                hdrlen = sizeof(struct ifinfomsg);

        if (nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0 &&
            tb[IFLA_EXT_MASK])
                ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);

        if (!ext_filter_mask)
                return NLMSG_GOODSIZE;

        /*
         * Walk all net devices and keep the largest per-device message
         * size for this filter mask.
         */
        rcu_read_lock();
        for_each_netdev_rcu(net, dev) {
                largest = max(largest, if_nlmsg_size(dev, ext_filter_mask));
        }
        rcu_read_unlock();

        return nlmsg_total_size(largest);
}

static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
{
        int idx;
        int s_idx = cb->family;
        int type = cb->nlh->nlmsg_type - RTM_BASE;
        int ret = 0;

        if (s_idx == 0)
                s_idx = 1;

        for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
                struct rtnl_link __rcu **tab;
                struct rtnl_link *link;
                rtnl_dumpit_func dumpit;

                if (idx < s_idx || idx == PF_PACKET)
                        continue;

                if (type < 0 || type >= RTM_NR_MSGTYPES)
                        continue;

                tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]);
                if (!tab)
                        continue;

                link = rcu_dereference_rtnl(tab[type]);
                if (!link)
                        continue;

                dumpit = link->dumpit;
                if (!dumpit)
                        continue;

                if (idx > s_idx) {
                        memset(&cb->args[0], 0, sizeof(cb->args));
                        cb->prev_seq = 0;
                        cb->seq = 0;
                }
                ret = dumpit(skb, cb);
                if (ret)
                        break;
        }
        cb->family = idx;

        return skb->len ? : ret;
}

/*
 * Allocate and fill a link notification skb for @dev.
 *
 * When nlmsg_report(@nlh) is true the sequence number of @nlh is used;
 * otherwise the sequence is 0 and @portid is forced to 0.  On failure the
 * error is recorded with rtnl_set_sk_err() on RTNLGRP_LINK and NULL is
 * returned; on success the filled skb is returned.
 */
struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
                                       unsigned int change,
                                       u32 event, gfp_t flags, int *new_nsid,
                                       int new_ifindex, u32 portid,
                                       const struct nlmsghdr *nlh)
{
        struct net *net = dev_net(dev);
        struct sk_buff *skb;
        u32 seq = 0;
        int err;

        skb = nlmsg_new(if_nlmsg_size(dev, 0), flags);
        if (!skb) {
                err = -ENOBUFS;
                goto fail;
        }

        if (nlmsg_report(nlh))
                seq = nlmsg_seq(nlh);
        else
                portid = 0;

        err = rtnl_fill_ifinfo(skb, dev, net,
                               type, portid, seq, change, 0, 0, event,
                               new_nsid, new_ifindex, -1, flags);
        if (err >= 0)
                return skb;

        /* -EMSGSIZE here means if_nlmsg_size() is buggy */
        WARN_ON(err == -EMSGSIZE);
        kfree_skb(skb);
fail:
        if (err < 0)
                rtnl_set_sk_err(net, RTNLGRP_LINK, err);
        return NULL;
}

/*
 * Send a previously built link notification @skb to the RTNLGRP_LINK
 * group of the namespace @dev belongs to.
 */
void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags,
                       u32 portid, const struct nlmsghdr *nlh)
{
        rtnl_notify(skb, dev_net(dev), portid, RTNLGRP_LINK, nlh, flags);
}

/*
 * Build and send a link notification for @dev.  Nothing is sent unless the
 * device is in the NETREG_REGISTERED state, or when the skb could not be
 * built.
 */
static void rtmsg_ifinfo_event(int type, struct net_device *dev,
                               unsigned int change, u32 event,
                               gfp_t flags, int *new_nsid, int new_ifindex,
                               u32 portid, const struct nlmsghdr *nlh)
{
        struct sk_buff *msg;

        if (dev->reg_state != NETREG_REGISTERED)
                return;

        msg = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid,
                                     new_ifindex, portid, nlh);
        if (!msg)
                return;

        rtmsg_ifinfo_send(msg, dev, flags, portid, nlh);
}

/*
 * Send a link notification for @dev using the event value returned by
 * rtnl_get_event(0), with no new-namespace id and a new ifindex of 0.
 */
void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
                  gfp_t flags, u32 portid, const struct nlmsghdr *nlh)
{
        rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
                           NULL, 0, portid, nlh);
}

/*
 * Send a link notification for @dev that carries @new_nsid and
 * @new_ifindex.  It uses the event value returned by rtnl_get_event(0),
 * a portid of 0 and no originating request header.
 */
void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
                         gfp_t flags, int *new_nsid, int new_ifindex)
{
        rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
                           new_nsid, new_ifindex, 0, NULL);
}

/*
 * Append one AF_BRIDGE neighbour message describing an FDB entry to @skb.
 *
 * The message carries @addr as NDA_LLADDR (dev->addr_len bytes) and, when
 * @vid is non-zero, NDA_VLAN.  Returns 0 on success or -EMSGSIZE when the
 * header or an attribute does not fit; in the attribute case the partially
 * built message is cancelled.
 */
static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
                                   struct net_device *dev,
                                   u8 *addr, u16 vid, u32 pid, u32 seq,
                                   int type, unsigned int flags,
                                   int nlflags, u16 ndm_state)
{
        struct nlmsghdr *hdr;
        struct ndmsg *msg;

        hdr = nlmsg_put(skb, pid, seq, type, sizeof(*msg), nlflags);
        if (!hdr)
                return -EMSGSIZE;

        msg = nlmsg_data(hdr);
        msg->ndm_family = AF_BRIDGE;
        msg->ndm_pad1 = 0;
        msg->ndm_pad2 = 0;
        msg->ndm_flags = flags;
        msg->ndm_type = 0;
        msg->ndm_ifindex = dev->ifindex;
        msg->ndm_state = ndm_state;

        if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr) ||
            (vid && nla_put(skb, NDA_VLAN, sizeof(u16), &vid))) {
                nlmsg_cancel(skb, hdr);
                return -EMSGSIZE;
        }

        nlmsg_end(skb, hdr);
        return 0;
}

/*
 * Size of an FDB notification for @dev: the aligned ndmsg header plus room
 * for the NDA_LLADDR and NDA_VLAN attributes.
 */
static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev)
{
        size_t size = NLMSG_ALIGN(sizeof(struct ndmsg));

        size += nla_total_size(dev->addr_len);  /* NDA_LLADDR */
        size += nla_total_size(sizeof(u16));    /* NDA_VLAN */

        return size;
}

/*
 * Notify RTNLGRP_NEIGH listeners about an FDB change on @dev.  The message
 * is built with GFP_ATOMIC and flagged NTF_SELF.  If allocation or filling
 * fails, the error is recorded on the group with rtnl_set_sk_err() instead.
 */
static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type,
                            u16 ndm_state)
{
        struct net *net = dev_net(dev);
        struct sk_buff *msg;
        int rc = -ENOBUFS;

        msg = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC);
        if (msg) {
                rc = nlmsg_populate_fdb_fill(msg, dev, addr, vid,
                                             0, 0, type, NTF_SELF, 0, ndm_state);
                if (rc >= 0) {
                        rtnl_notify(msg, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
                        return;
                }
                kfree_skb(msg);
        }

        rtnl_set_sk_err(net, RTNLGRP_NEIGH, rc);
}

/*
 * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry
 *
 * Adds @addr to the unicast or multicast address list of @dev.  Requests
 * with a non-zero state lacking NUD_PERMANENT, with NDA_FLAGS_EXT, or with
 * a non-zero @vid are rejected with -EINVAL.  A duplicate address is only
 * reported as -EEXIST when @flags contains NLM_F_EXCL.
 *
 * Returns 0 on success or a negative errno.
 */
int ndo_dflt_fdb_add(struct ndmsg *ndm,
                     struct nlattr *tb[],
                     struct net_device *dev,
                     const unsigned char *addr, u16 vid,
                     u16 flags)
{
        int rc;

        /* A device that supports aging addresses has to implement its own
         * handler for this.
         */
        if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) {
                netdev_info(dev, "default FDB implementation only supports local addresses\n");
                return -EINVAL;
        }

        if (tb[NDA_FLAGS_EXT]) {
                netdev_info(dev, "invalid flags given to default FDB implementation\n");
                return -EINVAL;
        }

        if (vid) {
                netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n");
                return -EINVAL;
        }

        if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
                rc = dev_uc_add_excl(dev, addr);
        else if (is_multicast_ether_addr(addr))
                rc = dev_mc_add_excl(dev, addr);
        else
                rc = -EINVAL;

        /* Duplicates are only an error when NLM_F_EXCL was requested */
        if (rc == -EEXIST && !(flags & NLM_F_EXCL))
                return 0;

        return rc;
}
EXPORT_SYMBOL(ndo_dflt_fdb_add);

/*
 * Extract the VLAN id from an optional NDA_VLAN attribute.
 *
 * A missing attribute yields a vid of 0.  A present attribute must be
 * exactly sizeof(u16) long and hold a value that is non-zero and below
 * VLAN_VID_MASK; otherwise -EINVAL is returned with a message in @extack
 * and *@p_vid is left untouched.  Returns 0 on success.
 */
static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid,
                         struct netlink_ext_ack *extack)
{
        u16 vid;

        if (!vlan_attr) {
                *p_vid = 0;
                return 0;
        }

        if (nla_len(vlan_attr) != sizeof(u16)) {
                NL_SET_ERR_MSG(extack, "invalid vlan attribute size");
                return -EINVAL;
        }

        vid = nla_get_u16(vlan_attr);
        if (vid == 0 || vid >= VLAN_VID_MASK) {
                NL_SET_ERR_MSG(extack, "invalid vlan id");
                return -EINVAL;
        }

        *p_vid = vid;
        return 0;
}

/*
 * Handle an FDB add request.
 *
 * The request must name an existing Ethernet device by ifindex and carry
 * an ETH_ALEN-sized NDA_LLADDR; NDA_VLAN is optional.  Two targets can be
 * addressed, in this order:
 *
 * - the master of a bridge port, when ndm_flags is 0 or has NTF_MASTER;
 * - the device itself, when ndm_flags has NTF_SELF, through its own
 *   ndo_fdb_add or the default implementation, followed by a notification.
 *
 * The flag of each target that was handled successfully is cleared in the
 * request header.  Returns 0 on success, -EOPNOTSUPP when neither target
 * applied, or another negative errno.
 */
static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *tb[NDA_MAX+1];
        struct net_device *dev;
        struct ndmsg *ndm;
        u8 *addr;
        u16 vid;
        int err;

        err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL,
                                     extack);
        if (err < 0)
                return err;

        ndm = nlmsg_data(nlh);
        if (!ndm->ndm_ifindex) {
                NL_SET_ERR_MSG(extack, "invalid ifindex");
                return -EINVAL;
        }

        dev = __dev_get_by_index(net, ndm->ndm_ifindex);
        if (!dev) {
                NL_SET_ERR_MSG(extack, "unknown ifindex");
                return -ENODEV;
        }

        if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
                NL_SET_ERR_MSG(extack, "invalid address");
                return -EINVAL;
        }

        if (dev->type != ARPHRD_ETHER) {
                NL_SET_ERR_MSG(extack, "FDB add only supported for Ethernet devices");
                return -EINVAL;
        }

        addr = nla_data(tb[NDA_LLADDR]);

        err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
        if (err)
                return err;

        /* Result when neither target below applies. */
        err = -EOPNOTSUPP;

        /* FDB on the master device: the net/bridge default case */
        if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
            netif_is_bridge_port(dev)) {
                struct net_device *br_dev = netdev_master_upper_dev_get(dev);
                const struct net_device_ops *ops = br_dev->netdev_ops;

                err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid,
                                       nlh->nlmsg_flags, extack);
                if (err)
                        return err;

                ndm->ndm_flags &= ~NTF_MASTER;
        }

        /* Embedded bridge, macvlan, and any other device support */
        if (!(ndm->ndm_flags & NTF_SELF))
                return err;

        if (dev->netdev_ops->ndo_fdb_add)
                err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
                                                   vid,
                                                   nlh->nlmsg_flags,
                                                   extack);
        else
                err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid,
                                       nlh->nlmsg_flags);

        if (!err) {
                rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH,
                                ndm->ndm_state);
                ndm->ndm_flags &= ~NTF_SELF;
        }

        return err;
}

/*
 * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry
 *
 * Removes @addr from the unicast or multicast address list of @dev.  Only
 * entries whose state includes NUD_PERMANENT are handled; anything else is
 * rejected with -EINVAL.
 *
 * Returns 0 on success or a negative errno.
 */
int ndo_dflt_fdb_del(struct ndmsg *ndm,
                     struct nlattr *tb[],
                     struct net_device *dev,
                     const unsigned char *addr, u16 vid)
{
        /* A device that supports aging addresses has to implement its own
         * handler for this.
         */
        if (!(ndm->ndm_state & NUD_PERMANENT)) {
                netdev_info(dev, "default FDB implementation only supports local addresses\n");
                return -EINVAL;
        }

        if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
                return dev_uc_del(dev, addr);

        if (is_multicast_ether_addr(addr))
                return dev_mc_del(dev, addr);

        return -EINVAL;
}
EXPORT_SYMBOL(ndo_dflt_fdb_del);

/*
 * rtnl_fdb_del - handle an FDB entry delete request
 *
 * The sender must have CAP_NET_ADMIN. When NLM_F_BULK is set in the message
 * flags the request is forwarded to the ndo_fdb_del_bulk callbacks; otherwise
 * a single entry, identified by NDA_LLADDR and the VLAN parsed from NDA_VLAN,
 * is removed through ndo_fdb_del (or ndo_dflt_fdb_del() when the device has
 * no callback of its own).
 *
 * Returns a negative errno from validation, -EOPNOTSUPP when no applicable
 * callback exists, or otherwise the result of the last callback invoked.
 */
static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK);
        struct net *net = sock_net(skb->sk);
        const struct net_device_ops *ops;
        struct ndmsg *ndm;
        struct nlattr *tb[NDA_MAX+1];
        struct net_device *dev;
        __u8 *addr = NULL;
        int err;
        /* Handed to fdb_vid_parse() on the non-bulk path and only read there. */
        u16 vid;

        if (!netlink_capable(skb, CAP_NET_ADMIN))
                return -EPERM;

        if (!del_bulk) {
                err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX,
                                             NULL, extack);
        } else {
                /* For bulk delete, the drivers will parse the message with
                 * policy.
                 */
                err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack);
        }
        if (err < 0)
                return err;

        ndm = nlmsg_data(nlh);
        if (ndm->ndm_ifindex == 0) {
                NL_SET_ERR_MSG(extack, "invalid ifindex");
                return -EINVAL;
        }

        dev = __dev_get_by_index(net, ndm->ndm_ifindex);
        if (dev == NULL) {
                NL_SET_ERR_MSG(extack, "unknown ifindex");
                return -ENODEV;
        }

        /* A single-entry delete needs an Ethernet-sized address and a VLAN. */
        if (!del_bulk) {
                if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
                        NL_SET_ERR_MSG(extack, "invalid address");
                        return -EINVAL;
                }
                addr = nla_data(tb[NDA_LLADDR]);

                err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
                if (err)
                        return err;
        }

        if (dev->type != ARPHRD_ETHER) {
                NL_SET_ERR_MSG(extack, "FDB delete only supported for Ethernet devices");
                return -EINVAL;
        }

        /* Stays -EOPNOTSUPP unless one of the callbacks below is invoked. */
        err = -EOPNOTSUPP;

        /* Support fdb on master device the net/bridge default case */
        if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
            netif_is_bridge_port(dev)) {
                struct net_device *br_dev = netdev_master_upper_dev_get(dev);

                ops = br_dev->netdev_ops;
                if (!del_bulk) {
                        if (ops->ndo_fdb_del)
                                err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack);
                } else {
                        if (ops->ndo_fdb_del_bulk)
                                err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
                }

                /* A failed or unsupported master delete ends the request;
                 * on success the NTF_MASTER bit is cleared in the header.
                 */
                if (err)
                        goto out;
                else
                        ndm->ndm_flags &= ~NTF_MASTER;
        }

        /* Embedded bridge, macvlan, and any other device support */
        if (ndm->ndm_flags & NTF_SELF) {
                ops = dev->netdev_ops;
                if (!del_bulk) {
                        if (ops->ndo_fdb_del)
                                err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack);
                        else
                                err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
                } else {
                        /* in case err was cleared by NTF_MASTER call */
                        err = -EOPNOTSUPP;
                        if (ops->ndo_fdb_del_bulk)
                                err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
                }

                /* Only a successful single-entry delete is notified from
                 * here; the bulk path sends no notification in this function.
                 */
                if (!err) {
                        if (!del_bulk)
                                rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH,
                                                ndm->ndm_state);
                        ndm->ndm_flags &= ~NTF_SELF;
                }
        }
out:
        return err;
}

/*
 * nlmsg_populate_fdb - emit one FDB message per address on a hardware list
 *
 * Walks @list and fills an RTM_NEWNEIGH message (NTF_SELF, NUD_PERMANENT,
 * NLM_F_MULTI, vid 0) into @skb for every address whose running position
 * *@idx has reached the resume offset stored in cb->args[2]. *@idx is
 * advanced for every address walked, whether it was emitted or skipped.
 *
 * Returns 0 when the whole list was walked, or the negative value returned
 * by nlmsg_populate_fdb_fill() for the first address that failed.
 */
static int nlmsg_populate_fdb(struct sk_buff *skb,
                              struct netlink_callback *cb,
                              struct net_device *dev,
                              int *idx,
                              struct netdev_hw_addr_list *list)
{
        const u32 portid = NETLINK_CB(cb->skb).portid;
        const u32 seq = cb->nlh->nlmsg_seq;
        struct netdev_hw_addr *entry;

        list_for_each_entry(entry, &list->list, list) {
                /* Entries before the resume offset are counted, not sent. */
                if (*idx >= cb->args[2]) {
                        int ret;

                        ret = nlmsg_populate_fdb_fill(skb, dev, entry->addr, 0,
                                                      portid, seq,
                                                      RTM_NEWNEIGH, NTF_SELF,
                                                      NLM_F_MULTI,
                                                      NUD_PERMANENT);
                        if (ret < 0)
                                return ret;
                }

                *idx += 1;
        }

        return 0;
}

/**
 * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table.
 * @skb: socket buffer to store message in
 * @cb: netlink callback
 * @dev: netdevice
 * @filter_dev: ignored
 * @idx: running entry position; advanced once for every address walked,
 *       including those skipped because they precede the resume offset
 *
 * Default netdevice operation to dump the device's unicast address list
 * followed by its multicast address list, both walked under the device
 * address lock.
 *
 * Return: 0 on success, -EINVAL if @dev is not an Ethernet device, or the
 * negative error that stopped the walk of either list.
 */
int ndo_dflt_fdb_dump(struct sk_buff *skb,
                      struct netlink_callback *cb,
                      struct net_device *dev,
                      struct net_device *filter_dev,
                      int *idx)
{
        int ret = -EINVAL;

        if (dev->type == ARPHRD_ETHER) {
                netif_addr_lock_bh(dev);

                ret = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc);
                /* The multicast list is only walked if unicast succeeded. */
                if (!ret)
                        ret = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc);

                netif_addr_unlock_bh(dev);
        }

        return ret;
}
EXPORT_SYMBOL(ndo_dflt_fdb_dump);

/*
 * valid_fdb_dump_strict - validate an FDB dump request under strict checking
 *
 * The header must be a full struct ndmsg with every field other than the
 * family and ifindex zeroed. NDA_IFINDEX and NDA_MASTER (each exactly a u32)
 * are the only attributes accepted.
 *
 * On success *@brport_idx holds the NDA_IFINDEX value if present, otherwise
 * the header's ndm_ifindex, and *@br_idx is set from NDA_MASTER if present.
 * Returns 0 on success or a negative errno, with a message set in @extack
 * for the checks performed here.
 */
static int valid_fdb_dump_strict(const struct nlmsghdr *nlh,
                                 int *br_idx, int *brport_idx,
                                 struct netlink_ext_ack *extack)
{
        struct nlattr *tb[NDA_MAX + 1];
        struct ndmsg *ndm;
        int type, ret;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
                NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request");
                return -EINVAL;
        }

        ndm = nlmsg_data(nlh);
        if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
            ndm->ndm_flags || ndm->ndm_type) {
                NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request");
                return -EINVAL;
        }

        ret = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb,
                                            NDA_MAX, NULL, extack);
        if (ret < 0)
                return ret;

        /* Header ifindex is the default; NDA_IFINDEX may override it below. */
        *brport_idx = ndm->ndm_ifindex;

        for (type = 0; type <= NDA_MAX; type++) {
                const struct nlattr *attr = tb[type];

                if (!attr)
                        continue;

                if (type == NDA_IFINDEX) {
                        if (nla_len(attr) != sizeof(u32)) {
                                NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request");
                                return -EINVAL;
                        }
                        *brport_idx = nla_get_u32(attr);
                } else if (type == NDA_MASTER) {
                        if (nla_len(attr) != sizeof(u32)) {
                                NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request");
                                return -EINVAL;
                        }
                        *br_idx = nla_get_u32(attr);
                } else {
                        NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request");
                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * valid_fdb_dump_legacy - validate an FDB dump request without strict checks
 *
 * A payload that is exactly a struct ndmsg, optionally followed by one u32
 * attribute, is accepted as-is and leaves both output indexes untouched.
 * Any other payload is parsed as a struct ifinfomsg request: *@br_idx is
 * taken from IFLA_MASTER when present and *@brport_idx from ifi_index.
 *
 * Returns 0 on success or -EINVAL when the ifinfomsg parse fails.
 */
static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh,
                                 int *br_idx, int *brport_idx,
                                 struct netlink_ext_ack *extack)
{
        struct nlattr *tb[IFLA_MAX+1];
        struct ifinfomsg *ifm;
        int payload = nlmsg_len(nlh);
        int ret;

        /* A hack to preserve kernel<->userspace interface.
         * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0.
         * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails.
         * So, check for ndmsg with an optional u32 attribute (not used here).
         * Fortunately these sizes don't conflict with the size of ifinfomsg
         * with an optional attribute.
         */
        if (payload == sizeof(struct ndmsg) ||
            payload == sizeof(struct ndmsg) + nla_attr_size(sizeof(u32)))
                return 0;

        ret = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg),
                                     tb, IFLA_MAX, ifla_policy,
                                     extack);
        if (ret < 0)
                return -EINVAL;

        if (ret == 0 && tb[IFLA_MASTER])
                *br_idx = nla_get_u32(tb[IFLA_MASTER]);

        ifm = nlmsg_data(nlh);
        *brport_idx = ifm->ifi_index;

        return 0;
}

/*
 * rtnl_fdb_dump - netlink dump callback for FDB entries
 *
 * Validates the request (strict or legacy, depending on cb->strict_check),
 * then walks every device in the namespace's ifindex hash table and asks
 * the bridge master (for bridge ports) and the device itself to dump their
 * FDB entries into @skb.
 *
 * Resume state is kept in @cb->args:
 *   args[0] - hash bucket to resume from
 *   args[1] - position of the device within that bucket
 *   args[2] - FDB entry offset within the device being dumped
 *
 * Returns a negative errno if validation or the bridge lookup fails,
 * otherwise skb->len.
 */
static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net_device *dev;
        struct net_device *br_dev = NULL;
        const struct net_device_ops *ops = NULL;
        const struct net_device_ops *cops = NULL;
        struct net *net = sock_net(skb->sk);
        struct hlist_head *head;
        int brport_idx = 0;
        int br_idx = 0;
        int h, s_h;
        int idx = 0, s_idx;
        int err = 0;
        int fidx = 0;

        if (cb->strict_check)
                err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx,
                                            cb->extack);
        else
                err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx,
                                            cb->extack);
        if (err < 0)
                return err;

        /* A specific bridge was requested: resolve it once up front. */
        if (br_idx) {
                br_dev = __dev_get_by_index(net, br_idx);
                if (!br_dev)
                        return -ENODEV;

                ops = br_dev->netdev_ops;
        }

        s_h = cb->args[0];
        s_idx = cb->args[1];

        /* s_idx only applies to the first bucket visited; it is reset to 0
         * when moving on to the next one.
         */
        for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
                idx = 0;
                head = &net->dev_index_head[h];
                hlist_for_each_entry(dev, head, index_hlist) {

                        /* Filtered-out devices are not counted in idx. */
                        if (brport_idx && (dev->ifindex != brport_idx))
                                continue;

                        if (!br_idx) { /* user did not specify a specific bridge */
                                if (netif_is_bridge_port(dev)) {
                                        br_dev = netdev_master_upper_dev_get(dev);
                                        cops = br_dev->netdev_ops;
                                }
                        } else {
                                if (dev != br_dev &&
                                    !netif_is_bridge_port(dev))
                                        continue;

                                if (br_dev != netdev_master_upper_dev_get(dev) &&
                                    !netif_is_bridge_master(dev))
                                        continue;
                                cops = ops;
                        }

                        /* Already dumped in a previous pass of this dump. */
                        if (idx < s_idx)
                                goto cont;

                        /* Bridge port: first the entries held by the master.
                         * Only -EMSGSIZE stops the dump; any other return
                         * value is overwritten by the next call.
                         */
                        if (netif_is_bridge_port(dev)) {
                                if (cops && cops->ndo_fdb_dump) {
                                        err = cops->ndo_fdb_dump(skb, cb,
                                                                br_dev, dev,
                                                                &fidx);
                                        if (err == -EMSGSIZE)
                                                goto out;
                                }
                        }

                        /* Then the device's own entries, sharing the same
                         * running fidx counter.
                         */
                        if (dev->netdev_ops->ndo_fdb_dump)
                                err = dev->netdev_ops->ndo_fdb_dump(skb, cb,
                                                                    dev, NULL,
                                                                    &fidx);
                        else
                                err = ndo_dflt_fdb_dump(skb, cb, dev, NULL,
                                                        &fidx);
                        if (err == -EMSGSIZE)
                                goto out;

                        cops = NULL;

                        /* reset fdb offset to 0 for rest of the interfaces */
                        cb->args[2] = 0;
                        fidx = 0;
cont:
                        idx++;
                }
        }

out:
        /* Record where to resume on the next invocation. */
        cb->args[0] = h;
        cb->args[1] = idx;
        cb->args[2] = fidx;

        return skb->len;
}

/*
 * valid_fdb_get_strict - validate and unpack an FDB get request
 *
 * The header must be a full struct ndmsg with zeroed padding, state and
 * type, and may carry only the NTF_MASTER and NTF_SELF flags. Attributes are
 * parsed into @tb against nda_policy; NDA_MASTER, NDA_LLADDR, NDA_VLAN and
 * NDA_VNI are accepted, anything else is rejected.
 *
 * Outputs: *@ndm_flags and *@brport_idx from the header, *@br_idx from
 * NDA_MASTER, *@addr pointing at the NDA_LLADDR payload (which must be
 * ETH_ALEN bytes) and *@vid via fdb_vid_parse() of NDA_VLAN. NDA_VNI is left
 * in @tb without further processing here.
 *
 * Returns 0 on success or a negative errno.
 */
static int valid_fdb_get_strict(const struct nlmsghdr *nlh,
                                struct nlattr **tb, u8 *ndm_flags,
                                int *br_idx, int *brport_idx, u8 **addr,
                                u16 *vid, struct netlink_ext_ack *extack)
{
        struct ndmsg *ndm;
        int type, ret;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
                NL_SET_ERR_MSG(extack, "Invalid header for fdb get request");
                return -EINVAL;
        }

        ndm = nlmsg_data(nlh);
        if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
            ndm->ndm_type) {
                NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request");
                return -EINVAL;
        }

        if ((ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) != 0) {
                NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request");
                return -EINVAL;
        }

        ret = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb,
                                            NDA_MAX, nda_policy, extack);
        if (ret < 0)
                return ret;

        *ndm_flags = ndm->ndm_flags;
        *brport_idx = ndm->ndm_ifindex;

        for (type = 0; type <= NDA_MAX; type++) {
                struct nlattr *attr = tb[type];

                if (!attr)
                        continue;

                if (type == NDA_MASTER) {
                        *br_idx = nla_get_u32(attr);
                } else if (type == NDA_LLADDR) {
                        if (nla_len(attr) != ETH_ALEN) {
                                NL_SET_ERR_MSG(extack, "Invalid address in fdb get request");
                                return -EINVAL;
                        }
                        *addr = nla_data(attr);
                } else if (type == NDA_VLAN) {
                        ret = fdb_vid_parse(attr, vid, extack);
                        if (ret)
                                return ret;
                } else if (type != NDA_VNI) {
                        /* NDA_VNI is allowed through untouched. */
                        NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request");
                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * rtnl_fdb_get - handle an FDB entry lookup request
 *
 * The target is selected either by the header ifindex (a device) or by
 * NDA_MASTER (a bridge); giving both is rejected. For a device, empty flags
 * or NTF_MASTER route the lookup to the ops of its bridge master, while
 * NTF_SELF routes it to the device's own ops. The chosen ndo_fdb_get
 * callback fills a freshly allocated reply, which is unicast back to the
 * requesting port.
 *
 * Returns the result of rtnl_unicast() on success, or a negative errno from
 * validation, allocation or the callback.
 */
static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        const struct net_device_ops *ops = NULL;
        struct net_device *br_dev = NULL;
        struct net_device *dev = NULL;
        struct nlattr *tb[NDA_MAX + 1];
        struct sk_buff *reply;
        int brport_idx = 0;
        int br_idx = 0;
        u8 ndm_flags = 0;
        u8 *addr = NULL;
        u16 vid = 0;
        int err;

        err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx,
                                   &brport_idx, &addr, &vid, extack);
        if (err < 0)
                return err;

        if (!addr) {
                NL_SET_ERR_MSG(extack, "Missing lookup address for fdb get request");
                return -EINVAL;
        }

        if (brport_idx) {
                dev = __dev_get_by_index(net, brport_idx);
                if (!dev) {
                        NL_SET_ERR_MSG(extack, "Unknown device ifindex");
                        return -ENODEV;
                }
        }

        if (br_idx) {
                if (dev) {
                        NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive");
                        return -EINVAL;
                }

                br_dev = __dev_get_by_index(net, br_idx);
                if (!br_dev) {
                        NL_SET_ERR_MSG(extack, "Invalid master ifindex");
                        return -EINVAL;
                }
                ops = br_dev->netdev_ops;
        }

        if (dev && (!ndm_flags || (ndm_flags & NTF_MASTER))) {
                /* Lookup goes through the bridge this port belongs to. */
                if (!netif_is_bridge_port(dev)) {
                        NL_SET_ERR_MSG(extack, "Device is not a bridge port");
                        return -EINVAL;
                }

                br_dev = netdev_master_upper_dev_get(dev);
                if (!br_dev) {
                        NL_SET_ERR_MSG(extack, "Master of device not found");
                        return -EINVAL;
                }
                ops = br_dev->netdev_ops;
        } else if (dev) {
                /* Lookup goes to the device itself. */
                if (!(ndm_flags & NTF_SELF)) {
                        NL_SET_ERR_MSG(extack, "Missing NTF_SELF");
                        return -EINVAL;
                }
                ops = dev->netdev_ops;
        }

        if (!(br_dev || dev)) {
                NL_SET_ERR_MSG(extack, "No device specified");
                return -ENODEV;
        }

        if (!ops || !ops->ndo_fdb_get) {
                NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device");
                return -EOPNOTSUPP;
        }

        reply = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!reply)
                return -ENOBUFS;

        /* When a bridge was resolved, it is the device handed to the op. */
        err = ops->ndo_fdb_get(reply, tb, br_dev ? br_dev : dev, addr, vid,
                               NETLINK_CB(in_skb).portid,
                               nlh->nlmsg_seq, extack);
        if (err) {
                kfree_skb(reply);
                return err;
        }

        return rtnl_unicast(reply, net, NETLINK_CB(in_skb).portid);
}

/*
 * brport_nla_put_flag - conditionally emit one bridge-port flag attribute
 *
 * If @flag is selected by @mask, appends a u8 attribute @attrnum to @skb
 * holding 1 when @flag is set in @flags and 0 otherwise. Flags outside
 * @mask produce no attribute.
 *
 * Returns 0 when nothing had to be written, else the nla_put_u8() result.
 */
static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
                               unsigned int attrnum, unsigned int flag)
{
        if (!(mask & flag))
                return 0;

        return nla_put_u8(skb, attrnum, (flags & flag) ? 1 : 0);
}

/**
 * ndo_dflt_bridge_getlink - build a default AF_BRIDGE RTM_NEWLINK message
 * @skb: buffer the message is appended to
 * @pid: netlink port id placed in the message header
 * @seq: sequence number placed in the message header
 * @dev: device being described
 * @mode: bridge mode; IFLA_BRIDGE_MODE is omitted for BRIDGE_MODE_UNDEF
 * @flags: bridge port flag values (BR_* bits)
 * @mask: selects which BR_* bits of @flags are reported
 * @nlflags: netlink message flags for the header
 * @filter_mask: passed through to @vlan_fill
 * @vlan_fill: optional callback adding attributes inside IFLA_AF_SPEC
 *
 * The message carries the ifinfomsg header, basic link attributes, an
 * IFLA_AF_SPEC nest (IFLA_BRIDGE_FLAGS = BRIDGE_FLAGS_SELF, optional mode,
 * optional @vlan_fill output) and an IFLA_PROTINFO nest with one u8 per
 * masked bridge port flag.
 *
 * Return: 0 on success. On failure the partial message is cancelled and the
 * error from @vlan_fill is returned if it failed, otherwise -EMSGSIZE.
 */
int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
                            struct net_device *dev, u16 mode,
                            u32 flags, u32 mask, int nlflags,
                            u32 filter_mask,
                            int (*vlan_fill)(struct sk_buff *skb,
                                             struct net_device *dev,
                                             u32 filter_mask))
{
        struct nlmsghdr *nlh;
        struct ifinfomsg *ifm;
        struct nlattr *br_afspec;
        struct nlattr *protinfo;
        /* A device that is not running is always reported as IF_OPER_DOWN. */
        u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
        struct net_device *br_dev = netdev_master_upper_dev_get(dev);
        int err = 0;

        nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags);
        if (nlh == NULL)
                return -EMSGSIZE;

        ifm = nlmsg_data(nlh);
        ifm->ifi_family = AF_BRIDGE;
        ifm->__ifi_pad = 0;
        ifm->ifi_type = dev->type;
        ifm->ifi_index = dev->ifindex;
        ifm->ifi_flags = dev_get_flags(dev);
        ifm->ifi_change = 0;


        /* IFLA_MASTER, IFLA_ADDRESS and IFLA_LINK are conditional: they are
         * only emitted when there is a master, a non-zero address length,
         * and an iflink that differs from the device's own ifindex.
         */
        if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
            nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
            nla_put_u8(skb, IFLA_OPERSTATE, operstate) ||
            (br_dev &&
             nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) ||
            (dev->addr_len &&
             nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
            (dev->ifindex != dev_get_iflink(dev) &&
             nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))))
                goto nla_put_failure;

        br_afspec = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
        if (!br_afspec)
                goto nla_put_failure;

        if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) {
                nla_nest_cancel(skb, br_afspec);
                goto nla_put_failure;
        }

        if (mode != BRIDGE_MODE_UNDEF) {
                if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) {
                        nla_nest_cancel(skb, br_afspec);
                        goto nla_put_failure;
                }
        }
        if (vlan_fill) {
                /* err is kept so the callback's own error code is what the
                 * failure path returns.
                 */
                err = vlan_fill(skb, dev, filter_mask);
                if (err) {
                        nla_nest_cancel(skb, br_afspec);
                        goto nla_put_failure;
                }
        }
        nla_nest_end(skb, br_afspec);

        protinfo = nla_nest_start(skb, IFLA_PROTINFO);
        if (!protinfo)
                goto nla_put_failure;

        /* One u8 attribute per flag bit selected by mask. */
        if (brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_GUARD, BR_BPDU_GUARD) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_FAST_LEAVE,
                                BR_MULTICAST_FAST_LEAVE) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_LEARNING, BR_LEARNING) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_PROXYARP, BR_PROXYARP) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) {
                nla_nest_cancel(skb, protinfo);
                goto nla_put_failure;
        }

        nla_nest_end(skb, protinfo);

        nlmsg_end(skb, nlh);
        return 0;
nla_put_failure:
        nlmsg_cancel(skb, nlh);
        /* err is only non-zero here when vlan_fill failed. */
        return err ? err : -EMSGSIZE;
}
EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink);

/*
 * valid_bridge_getlink_req - validate a bridge link dump request
 *
 * With @strict_check the header must be a full struct ifinfomsg whose
 * fields other than the family are all zero, the attributes are parsed
 * strictly, and any attribute other than IFLA_EXT_MASK is rejected. Without
 * it the message is parsed leniently and unknown attributes are ignored.
 *
 * *@filter_mask is set from IFLA_EXT_MASK when that attribute is present.
 * Returns 0 on success or a negative errno.
 */
static int valid_bridge_getlink_req(const struct nlmsghdr *nlh,
                                    bool strict_check, u32 *filter_mask,
                                    struct netlink_ext_ack *extack)
{
        struct nlattr *tb[IFLA_MAX+1];
        int type, ret;

        if (!strict_check) {
                ret = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg),
                                             tb, IFLA_MAX, ifla_policy,
                                             extack);
        } else {
                struct ifinfomsg *ifm;

                if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
                        NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump");
                        return -EINVAL;
                }

                ifm = nlmsg_data(nlh);
                if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
                    ifm->ifi_change || ifm->ifi_index) {
                        NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request");
                        return -EINVAL;
                }

                ret = nlmsg_parse_deprecated_strict(nlh,
                                                    sizeof(struct ifinfomsg),
                                                    tb, IFLA_MAX, ifla_policy,
                                                    extack);
        }
        if (ret < 0)
                return ret;

        /* new attributes should only be added with strict checking */
        for (type = 0; type <= IFLA_MAX; type++) {
                if (!tb[type])
                        continue;

                if (type == IFLA_EXT_MASK) {
                        *filter_mask = nla_get_u32(tb[type]);
                        continue;
                }

                if (strict_check) {
                        NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request");
                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * rtnl_bridge_getlink - netlink dump callback for bridge link information
 *
 * For every device in the namespace, calls ndo_bridge_getlink of the
 * device's master (if it has one providing the op) and then of the device
 * itself (if it provides the op). Each such call site counts as one
 * position in idx, and cb->args[0] holds the position to resume from.
 *
 * Returns skb->len, or the callback's negative error when it failed before
 * anything was put into @skb. A validation error is only returned when
 * cb->strict_check is set.
 */
static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
{
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
        struct net_device *dev;
        int idx = 0;
        u32 portid = NETLINK_CB(cb->skb).portid;
        u32 seq = nlh->nlmsg_seq;
        u32 filter_mask = 0;
        int err;

        /* Without strict checking a validation failure is ignored and the
         * dump proceeds with whatever filter_mask holds at that point.
         */
        err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask,
                                       cb->extack);
        if (err < 0 && cb->strict_check)
                return err;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev) {
                const struct net_device_ops *ops = dev->netdev_ops;
                struct net_device *br_dev = netdev_master_upper_dev_get(dev);

                /* First the master's view of this device. */
                if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) {
                        if (idx >= cb->args[0]) {
                                err = br_dev->netdev_ops->ndo_bridge_getlink(
                                                skb, portid, seq, dev,
                                                filter_mask, NLM_F_MULTI);
                                /* -EOPNOTSUPP is skipped over. Any other
                                 * error stops the walk: with data already in
                                 * skb the dump returns what it has and
                                 * resumes at idx later, otherwise the error
                                 * itself is returned.
                                 */
                                if (err < 0 && err != -EOPNOTSUPP) {
                                        if (likely(skb->len))
                                                break;

                                        goto out_err;
                                }
                        }
                        idx++;
                }

                /* Then the device's own op, with the same error handling. */
                if (ops->ndo_bridge_getlink) {
                        if (idx >= cb->args[0]) {
                                err = ops->ndo_bridge_getlink(skb, portid,
                                                              seq, dev,
                                                              filter_mask,
                                                              NLM_F_MULTI);
                                if (err < 0 && err != -EOPNOTSUPP) {
                                        if (likely(skb->len))
                                                break;

                                        goto out_err;
                                }
                        }
                        idx++;
                }
        }
        err = skb->len;
out_err:
        rcu_read_unlock();
        cb->args[0] = idx;

        return err;
}

/*
 * bridge_nlmsg_size - payload size used when allocating a bridge link
 * notification: an aligned ifinfomsg header plus one term per attribute
 * listed below.
 */
static inline size_t bridge_nlmsg_size(void)
{
        size_t size = NLMSG_ALIGN(sizeof(struct ifinfomsg));

        size += nla_total_size(IFNAMSIZ);               /* IFLA_IFNAME */
        size += nla_total_size(MAX_ADDR_LEN);           /* IFLA_ADDRESS */
        size += nla_total_size(sizeof(u32));            /* IFLA_MASTER */
        size += nla_total_size(sizeof(u32));            /* IFLA_MTU */
        size += nla_total_size(sizeof(u32));            /* IFLA_LINK */
        size += nla_total_size(sizeof(u32));            /* IFLA_OPERSTATE */
        size += nla_total_size(sizeof(u8));             /* IFLA_PROTINFO */
        size += nla_total_size(sizeof(struct nlattr));  /* IFLA_AF_SPEC */
        size += nla_total_size(sizeof(u16));            /* IFLA_BRIDGE_FLAGS */
        size += nla_total_size(sizeof(u16));            /* IFLA_BRIDGE_MODE */

        return size;
}

/*
 * rtnl_bridge_notify - send a bridge link notification for @dev
 *
 * Builds a message with the device's ndo_bridge_getlink op and sends it to
 * the RTNLGRP_LINK group. A device without that op is silently skipped.
 *
 * Returns 0 when the notification was sent or the device has no
 * ndo_bridge_getlink; otherwise the buffer is freed and the value held in
 * err is returned, after being reported on the group's sockets if non-zero.
 */
static int rtnl_bridge_notify(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct sk_buff *msg;
        int err;

        if (!dev->netdev_ops->ndo_bridge_getlink)
                return 0;

        msg = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC);
        if (msg) {
                err = dev->netdev_ops->ndo_bridge_getlink(msg, 0, 0, dev, 0, 0);

                /* Notification info is only filled for bridge ports, not the
                 * bridge device itself. Therefore, a zero notification length
                 * is valid and should not result in an error.
                 */
                if (err >= 0 && msg->len) {
                        rtnl_notify(msg, net, 0, RTNLGRP_LINK, NULL,
                                    GFP_ATOMIC);
                        return 0;
                }
        } else {
                err = -ENOMEM;
        }

        /* Shared tail for allocation failure, fill failure and empty fill. */
        WARN_ON(err == -EMSGSIZE);
        kfree_skb(msg);
        if (err)
                rtnl_set_sk_err(net, RTNLGRP_LINK, err);

        return err;
}

/*
 * rtnl_bridge_setlink - handle an AF_BRIDGE set-link request
 *
 * Reads the first IFLA_BRIDGE_FLAGS attribute nested in IFLA_AF_SPEC to
 * decide where the request goes: with no flags or BRIDGE_FLAGS_MASTER it is
 * passed to the master device's ndo_bridge_setlink, and with
 * BRIDGE_FLAGS_SELF to the device's own. Each flag is cleared once its stage
 * succeeds, and unless a master-stage failure jumps to "out", the remaining
 * flags are written back into the message's IFLA_BRIDGE_FLAGS attribute.
 *
 * Returns 0 on success or a negative errno; -EOPNOTSUPP when the required
 * ndo_bridge_setlink op (or the master itself) is missing.
 */
static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
                               struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct ifinfomsg *ifm;
        struct net_device *dev;
        struct nlattr *br_spec, *attr, *br_flags_attr = NULL;
        int rem, err = -EOPNOTSUPP;
        u16 flags = 0;

        if (nlmsg_len(nlh) < sizeof(*ifm))
                return -EINVAL;

        ifm = nlmsg_data(nlh);
        if (ifm->ifi_family != AF_BRIDGE)
                return -EPFNOSUPPORT;

        dev = __dev_get_by_index(net, ifm->ifi_index);
        if (!dev) {
                NL_SET_ERR_MSG(extack, "unknown ifindex");
                return -ENODEV;
        }

        br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
        if (br_spec) {
                nla_for_each_nested(attr, br_spec, rem) {
                        /* Only the first IFLA_BRIDGE_FLAGS is honoured and
                         * remembered for the write-back at the end.
                         */
                        if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !br_flags_attr) {
                                if (nla_len(attr) < sizeof(flags))
                                        return -EINVAL;

                                br_flags_attr = attr;
                                flags = nla_get_u16(attr);
                        }

                        /* IFLA_BRIDGE_MODE is only length-checked here; its
                         * value is not read in this function.
                         */
                        if (nla_type(attr) == IFLA_BRIDGE_MODE) {
                                if (nla_len(attr) < sizeof(u16))
                                        return -EINVAL;
                        }
                }
        }

        /* Master stage: taken when no flags were given or MASTER is set. */
        if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
                struct net_device *br_dev = netdev_master_upper_dev_get(dev);

                if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) {
                        err = -EOPNOTSUPP;
                        goto out;
                }

                err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags,
                                                             extack);
                if (err)
                        goto out;

                flags &= ~BRIDGE_FLAGS_MASTER;
        }

        /* Self stage: the device's own op, followed by a notification. */
        if ((flags & BRIDGE_FLAGS_SELF)) {
                if (!dev->netdev_ops->ndo_bridge_setlink)
                        err = -EOPNOTSUPP;
                else
                        err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh,
                                                                  flags,
                                                                  extack);
                if (!err) {
                        flags &= ~BRIDGE_FLAGS_SELF;

                        /* Generate event to notify upper layer of bridge
                         * change
                         */
                        err = rtnl_bridge_notify(dev);
                }
        }

        /* Store the flags that remain after the stages above back into the
         * request's attribute.
         */
        if (br_flags_attr)
                memcpy(nla_data(br_flags_attr), &flags, sizeof(flags));
out:
        return err;
}

/* Handle an AF_BRIDGE "delete link" request.
 *
 * The target device is looked up from ifinfomsg::ifi_index.  An optional
 * IFLA_BRIDGE_FLAGS attribute nested in IFLA_AF_SPEC selects whether the
 * request goes to the master device's ndo_bridge_dellink
 * (BRIDGE_FLAGS_MASTER, or no flags at all), to the device's own
 * ndo_bridge_dellink (BRIDGE_FLAGS_SELF), or to both.  Each flag that was
 * handled successfully is cleared, and the remaining flags are written back
 * into the attribute payload.
 *
 * Returns 0 on success or a negative errno.
 */
static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
                               struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *flags_attr = NULL;
        struct nlattr *af_spec, *cur;
        struct net_device *dev;
        struct ifinfomsg *ifm;
        int err = -EOPNOTSUPP;
        int remaining;
        u16 flags = 0;

        if (nlmsg_len(nlh) < sizeof(*ifm))
                return -EINVAL;

        ifm = nlmsg_data(nlh);
        if (ifm->ifi_family != AF_BRIDGE)
                return -EPFNOSUPPORT;

        dev = __dev_get_by_index(net, ifm->ifi_index);
        if (!dev) {
                NL_SET_ERR_MSG(extack, "unknown ifindex");
                return -ENODEV;
        }

        /* Only the first IFLA_BRIDGE_FLAGS attribute is taken into account. */
        af_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
        if (af_spec) {
                nla_for_each_nested_type(cur, IFLA_BRIDGE_FLAGS, af_spec,
                                         remaining) {
                        if (nla_len(cur) < sizeof(flags))
                                return -EINVAL;

                        flags_attr = cur;
                        flags = nla_get_u16(cur);
                        break;
                }
        }

        /* No flags at all is treated like BRIDGE_FLAGS_MASTER. */
        if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
                struct net_device *master = netdev_master_upper_dev_get(dev);

                if (!master || !master->netdev_ops->ndo_bridge_dellink)
                        return -EOPNOTSUPP;

                err = master->netdev_ops->ndo_bridge_dellink(dev, nlh, flags);
                if (err)
                        return err;

                flags &= ~BRIDGE_FLAGS_MASTER;
        }

        if (flags & BRIDGE_FLAGS_SELF) {
                if (dev->netdev_ops->ndo_bridge_dellink)
                        err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh,
                                                                  flags);
                else
                        err = -EOPNOTSUPP;

                if (!err) {
                        flags &= ~BRIDGE_FLAGS_SELF;

                        /* Let upper layers know the bridge configuration
                         * of this device changed.
                         */
                        err = rtnl_bridge_notify(dev);
                }
        }

        /* Report back which flags were left unprocessed. */
        if (flags_attr)
                memcpy(nla_data(flags_attr), &flags, sizeof(flags));

        return err;
}

static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
{
        return (mask & IFLA_STATS_FILTER_BIT(attrid)) &&
               (!idxattr || idxattr == attrid);
}

static bool
rtnl_offload_xstats_have_ndo(const struct net_device *dev, int attr_id)
{
        return dev->netdev_ops &&
               dev->netdev_ops->ndo_has_offload_stats &&
               dev->netdev_ops->ndo_get_offload_stats &&
               dev->netdev_ops->ndo_has_offload_stats(dev, attr_id);
}

/* Payload size of the ndo-backed offload stats attribute @attr_id for @dev:
 * one struct rtnl_link_stats64 when the device supports it, otherwise 0.
 */
static unsigned int
rtnl_offload_xstats_get_size_ndo(const struct net_device *dev, int attr_id)
{
        if (!rtnl_offload_xstats_have_ndo(dev, attr_id))
                return 0;

        return sizeof(struct rtnl_link_stats64);
}

/* Append attribute @attr_id to @skb and let the driver fill it through
 * ndo_get_offload_stats().
 *
 * Returns -ENODATA when the device has nothing to report for @attr_id,
 * -EMSGSIZE when the attribute does not fit into @skb, otherwise the value
 * returned by the driver callback.
 */
static int
rtnl_offload_xstats_fill_ndo(struct net_device *dev, int attr_id,
                             struct sk_buff *skb)
{
        unsigned int len = rtnl_offload_xstats_get_size_ndo(dev, attr_id);
        struct nlattr *nla;
        void *payload;

        if (len == 0)
                return -ENODATA;

        nla = nla_reserve_64bit(skb, attr_id, len,
                                IFLA_OFFLOAD_XSTATS_UNSPEC);
        if (!nla)
                return -EMSGSIZE;

        /* Zero the payload before handing it to the driver. */
        payload = nla_data(nla);
        memset(payload, 0, len);

        return dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, payload);
}

/* Payload size of the HW stats attribute for @type on @dev: one struct
 * rtnl_hw_stats64 when that stats type is enabled, otherwise 0.
 */
static unsigned int
rtnl_offload_xstats_get_size_stats(const struct net_device *dev,
                                   enum netdev_offload_xstats_type type)
{
        if (netdev_offload_xstats_enabled(dev, type))
                return sizeof(struct rtnl_hw_stats64);

        return 0;
}

/* Per-type status pair reported inside IFLA_OFFLOAD_XSTATS_HW_S_INFO. */
struct rtnl_offload_xstats_request_used {
        /* Emitted as IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST. */
        bool request;
        /* Emitted as IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED. */
        bool used;
};

/* Query offload xstats of @type on @dev.
 *
 * When the type is not enabled on the device nothing is queried and, if
 * @ru is given, it reports request == false and used == false.  Otherwise
 * netdev_offload_xstats_get() is called with @stats and the outcome is
 * reported through @ru (when non-NULL).
 *
 * Returns 0 on success or the error from netdev_offload_xstats_get().
 */
static int
rtnl_offload_xstats_get_stats(struct net_device *dev,
                              enum netdev_offload_xstats_type type,
                              struct rtnl_offload_xstats_request_used *ru,
                              struct rtnl_hw_stats64 *stats,
                              struct netlink_ext_ack *extack)
{
        bool enabled = netdev_offload_xstats_enabled(dev, type);
        bool in_use;

        if (!enabled) {
                in_use = false;
        } else {
                int err = netdev_offload_xstats_get(dev, type, stats, &in_use,
                                                    extack);

                if (err)
                        return err;
        }

        if (ru) {
                ru->request = enabled;
                ru->used = in_use;
        }

        return 0;
}

/* Emit one nest @attr_id carrying the request/used pair from @ru.
 *
 * Returns 0 on success.  When @skb runs out of room the partially built
 * nest is cancelled and -EMSGSIZE is returned.
 */
static int
rtnl_offload_xstats_fill_hw_s_info_one(struct sk_buff *skb, int attr_id,
                                       struct rtnl_offload_xstats_request_used *ru)
{
        struct nlattr *container = nla_nest_start(skb, attr_id);

        if (!container)
                return -EMSGSIZE;

        if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST,
                       ru->request) ||
            nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, ru->used)) {
                nla_nest_cancel(skb, container);
                return -EMSGSIZE;
        }

        nla_nest_end(skb, container);
        return 0;
}

static int
rtnl_offload_xstats_fill_hw_s_info(struct sk_buff *skb, struct net_device *dev,
                                   struct netlink_ext_ack *extack)
{
        enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
        struct rtnl_offload_xstats_request_used ru_l3;
        struct nlattr *nest;
        int err;

        err = rtnl_offload_xstats_get_stats(dev, t_l3, &ru_l3, NULL, extack);
        if (err)
                return err;

        nest = nla_nest_start(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO);
        if (!nest)
                return -EMSGSIZE;

        if (rtnl_offload_xstats_fill_hw_s_info_one(skb,
                                                   IFLA_OFFLOAD_XSTATS_L3_STATS,
                                                   &ru_l3))
                goto nla_put_failure;

        nla_nest_end(skb, nest);
        return 0;

nla_put_failure:
        nla_nest_cancel(skb, nest);
        return -EMSGSIZE;
}

/* Fill the contents of an IFLA_STATS_LINK_OFFLOAD_XSTATS nest for @dev.
 *
 * @prividx is an in/out resume marker: a sub-attribute is only attempted
 * when *prividx is not greater than its attribute id and its bit is set in
 * @off_filter_mask.  When a sub-attribute fails, *prividx is left at that
 * attribute's id; on overall completion it is reset to 0.
 *
 * Returns 0 when at least one sub-attribute was emitted, -ENODATA when
 * none was, or another negative errno on failure.
 */
static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev,
                                    int *prividx, u32 off_filter_mask,
                                    struct netlink_ext_ack *extack)
{
        enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
        int attr_id_hw_s_info = IFLA_OFFLOAD_XSTATS_HW_S_INFO;
        int attr_id_l3_stats = IFLA_OFFLOAD_XSTATS_L3_STATS;
        int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT;
        bool have_data = false;
        int err;

        /* IFLA_OFFLOAD_XSTATS_CPU_HIT: -ENODATA from the ndo helper simply
         * means there is nothing to report and is not treated as a failure.
         */
        if (*prividx <= attr_id_cpu_hit &&
            (off_filter_mask &
             IFLA_STATS_FILTER_BIT(attr_id_cpu_hit))) {
                err = rtnl_offload_xstats_fill_ndo(dev, attr_id_cpu_hit, skb);
                if (!err) {
                        have_data = true;
                } else if (err != -ENODATA) {
                        *prividx = attr_id_cpu_hit;
                        return err;
                }
        }

        /* IFLA_OFFLOAD_XSTATS_HW_S_INFO: the marker is set before the
         * attempt so that it is already in place if the fill fails.
         */
        if (*prividx <= attr_id_hw_s_info &&
            (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_hw_s_info))) {
                *prividx = attr_id_hw_s_info;

                err = rtnl_offload_xstats_fill_hw_s_info(skb, dev, extack);
                if (err)
                        return err;

                have_data = true;
                *prividx = 0;
        }

        /* IFLA_OFFLOAD_XSTATS_L3_STATS: skipped (without counting as data)
         * when the reported size for the L3 type is 0.
         */
        if (*prividx <= attr_id_l3_stats &&
            (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_l3_stats))) {
                unsigned int size_l3;
                struct nlattr *attr;

                *prividx = attr_id_l3_stats;

                size_l3 = rtnl_offload_xstats_get_size_stats(dev, t_l3);
                if (!size_l3)
                        goto skip_l3_stats;
                attr = nla_reserve_64bit(skb, attr_id_l3_stats, size_l3,
                                         IFLA_OFFLOAD_XSTATS_UNSPEC);
                if (!attr)
                        return -EMSGSIZE;

                /* The stats are written straight into the reserved payload. */
                err = rtnl_offload_xstats_get_stats(dev, t_l3, NULL,
                                                    nla_data(attr), extack);
                if (err)
                        return err;

                have_data = true;
skip_l3_stats:
                *prividx = 0;
        }

        /* Lets the caller know that nothing was emitted. */
        if (!have_data)
                return -ENODATA;

        *prividx = 0;
        return 0;
}

/* Space needed for one per-type entry inside IFLA_OFFLOAD_XSTATS_HW_S_INFO:
 * the nest header plus the two u8 attributes it contains.
 */
static unsigned int
rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev,
                                           enum netdev_offload_xstats_type type)
{
        unsigned int sz;

        /* nest header */
        sz = nla_total_size(0);
        /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */
        sz += nla_total_size(sizeof(u8));
        /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */
        sz += nla_total_size(sizeof(u8));

        return sz;
}

/* Space needed for the whole IFLA_OFFLOAD_XSTATS_HW_S_INFO nest of @dev:
 * the nest header plus the entry for the L3 stats type.
 */
static unsigned int
rtnl_offload_xstats_get_size_hw_s_info(const struct net_device *dev)
{
        unsigned int sz;

        /* nest header */
        sz = nla_total_size(0);
        /* IFLA_OFFLOAD_XSTATS_L3_STATS */
        sz += rtnl_offload_xstats_get_size_hw_s_info_one(dev,
                                                         NETDEV_OFFLOAD_XSTATS_TYPE_L3);

        return sz;
}

/* Space needed for the IFLA_STATS_LINK_OFFLOAD_XSTATS nest of @dev given
 * the sub-attribute filter @off_filter_mask.  The size of the enclosing
 * nest header is only added when at least one sub-attribute contributed.
 */
static int rtnl_offload_xstats_get_size(const struct net_device *dev,
                                        u32 off_filter_mask)
{
        int total = 0;
        int payload;

        if (off_filter_mask &
            IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_CPU_HIT)) {
                payload = rtnl_offload_xstats_get_size_ndo(dev,
                                                           IFLA_OFFLOAD_XSTATS_CPU_HIT);
                total += nla_total_size_64bit(payload);
        }

        if (off_filter_mask &
            IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO))
                total += rtnl_offload_xstats_get_size_hw_s_info(dev);

        if (off_filter_mask &
            IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_L3_STATS)) {
                payload = rtnl_offload_xstats_get_size_stats(dev,
                                                             NETDEV_OFFLOAD_XSTATS_TYPE_L3);
                total += nla_total_size_64bit(payload);
        }

        /* for the IFLA_STATS_LINK_OFFLOAD_XSTATS nest itself */
        if (total)
                total += nla_total_size(0);

        return total;
}

/* Set of filter masks controlling which stats attributes get emitted for a
 * stats request or notification.
 */
struct rtnl_stats_dump_filters {
        /* mask[0] filters outer attributes. Then individual nests have their
         * filtering mask at the index of the nested attribute.
         */
        u32 mask[IFLA_STATS_MAX + 1];
};

/* Build one stats message for @dev in @skb.
 *
 * @type, @pid, @seq and @flags go into the netlink message header; @change
 * is not used here.  @filters selects which top-level attributes
 * (mask[0]) and which nested sub-attributes are emitted.
 *
 * @idxattr and @prividx are in/out resume markers.  A non-zero *idxattr
 * restricts processing to that attribute id (see stats_attr_valid()); each
 * nested section stores its id there before filling and clears it once the
 * section completed.  *prividx is handed down to the nested fillers.
 *
 * Must be called with RTNL held.
 *
 * Returns 0 on success or a negative errno.  On failure the message is
 * cancelled unless this is a multi-part message (NLM_F_MULTI) and *prividx
 * changed, in which case the partially filled message is finalized.
 */
static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
                               int type, u32 pid, u32 seq, u32 change,
                               unsigned int flags,
                               const struct rtnl_stats_dump_filters *filters,
                               int *idxattr, int *prividx,
                               struct netlink_ext_ack *extack)
{
        unsigned int filter_mask = filters->mask[0];
        struct if_stats_msg *ifsm;
        struct nlmsghdr *nlh;
        struct nlattr *attr;
        int s_prividx = *prividx;
        int err;

        ASSERT_RTNL();

        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifsm), flags);
        if (!nlh)
                return -EMSGSIZE;

        ifsm = nlmsg_data(nlh);
        ifsm->family = PF_UNSPEC;
        ifsm->pad1 = 0;
        ifsm->pad2 = 0;
        ifsm->ifindex = dev->ifindex;
        ifsm->filter_mask = filter_mask;

        /* IFLA_STATS_LINK_64: filled in place by dev_get_stats(). */
        if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) {
                struct rtnl_link_stats64 *sp;

                attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64,
                                         sizeof(struct rtnl_link_stats64),
                                         IFLA_STATS_UNSPEC);
                if (!attr) {
                        err = -EMSGSIZE;
                        goto nla_put_failure;
                }

                sp = nla_data(attr);
                dev_get_stats(dev, sp);
        }

        /* IFLA_STATS_LINK_XSTATS: provided by the device's own link ops. */
        if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) {
                const struct rtnl_link_ops *ops = dev->rtnl_link_ops;

                if (ops && ops->fill_linkxstats) {
                        *idxattr = IFLA_STATS_LINK_XSTATS;
                        attr = nla_nest_start_noflag(skb,
                                                     IFLA_STATS_LINK_XSTATS);
                        if (!attr) {
                                err = -EMSGSIZE;
                                goto nla_put_failure;
                        }

                        /* The nest is closed even on error so that whatever
                         * was written stays well-formed.
                         */
                        err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
                        nla_nest_end(skb, attr);
                        if (err)
                                goto nla_put_failure;
                        *idxattr = 0;
                }
        }

        /* IFLA_STATS_LINK_XSTATS_SLAVE: provided by the master's link ops. */
        if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE,
                             *idxattr)) {
                const struct rtnl_link_ops *ops = NULL;
                const struct net_device *master;

                master = netdev_master_upper_dev_get(dev);
                if (master)
                        ops = master->rtnl_link_ops;
                if (ops && ops->fill_linkxstats) {
                        *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
                        attr = nla_nest_start_noflag(skb,
                                                     IFLA_STATS_LINK_XSTATS_SLAVE);
                        if (!attr) {
                                err = -EMSGSIZE;
                                goto nla_put_failure;
                        }

                        err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
                        nla_nest_end(skb, attr);
                        if (err)
                                goto nla_put_failure;
                        *idxattr = 0;
                }
        }

        /* IFLA_STATS_LINK_OFFLOAD_XSTATS: has its own nested filter mask. */
        if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS,
                             *idxattr)) {
                u32 off_filter_mask;

                off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS];
                *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS;
                attr = nla_nest_start_noflag(skb,
                                             IFLA_STATS_LINK_OFFLOAD_XSTATS);
                if (!attr) {
                        err = -EMSGSIZE;
                        goto nla_put_failure;
                }

                /* -ENODATA means the nest would be empty: drop it and carry
                 * on rather than failing the whole message.
                 */
                err = rtnl_offload_xstats_fill(skb, dev, prividx,
                                               off_filter_mask, extack);
                if (err == -ENODATA)
                        nla_nest_cancel(skb, attr);
                else
                        nla_nest_end(skb, attr);

                if (err && err != -ENODATA)
                        goto nla_put_failure;
                *idxattr = 0;
        }

        /* IFLA_STATS_AF_SPEC: one nest per address family that provides
         * fill_stats_af().
         */
        if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) {
                struct rtnl_af_ops *af_ops;

                *idxattr = IFLA_STATS_AF_SPEC;
                attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC);
                if (!attr) {
                        err = -EMSGSIZE;
                        goto nla_put_failure;
                }

                rcu_read_lock();
                list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
                        if (af_ops->fill_stats_af) {
                                struct nlattr *af;

                                af = nla_nest_start_noflag(skb,
                                                           af_ops->family);
                                if (!af) {
                                        rcu_read_unlock();
                                        err = -EMSGSIZE;
                                        goto nla_put_failure;
                                }
                                err = af_ops->fill_stats_af(skb, dev);

                                if (err == -ENODATA) {
                                        /* Nothing to report for this family:
                                         * drop the nest.  It must not be
                                         * ended afterwards, its header is no
                                         * longer part of the message.
                                         */
                                        nla_nest_cancel(skb, af);
                                } else if (err < 0) {
                                        rcu_read_unlock();
                                        goto nla_put_failure;
                                } else {
                                        nla_nest_end(skb, af);
                                }
                        }
                }
                rcu_read_unlock();

                nla_nest_end(skb, attr);

                *idxattr = 0;
        }

        nlmsg_end(skb, nlh);

        return 0;

nla_put_failure:
        /* not a multi message or no progress mean a real error */
        if (!(flags & NLM_F_MULTI) || s_prividx == *prividx)
                nlmsg_cancel(skb, nlh);
        else
                nlmsg_end(skb, nlh);

        return err;
}

/* Estimate the buffer space needed for a stats message about @dev that
 * carries the attributes selected by @filters.  Walks the same attribute
 * sections as rtnl_fill_statsinfo().
 */
static size_t if_nlmsg_stats_size(const struct net_device *dev,
                                  const struct rtnl_stats_dump_filters *filters)
{
        const unsigned int outer_mask = filters->mask[0];
        size_t total = NLMSG_ALIGN(sizeof(struct if_stats_msg));

        if (stats_attr_valid(outer_mask, IFLA_STATS_LINK_64, 0))
                total += nla_total_size_64bit(sizeof(struct rtnl_link_stats64));

        if (stats_attr_valid(outer_mask, IFLA_STATS_LINK_XSTATS, 0)) {
                const struct rtnl_link_ops *link_ops = dev->rtnl_link_ops;

                if (link_ops && link_ops->get_linkxstats_size) {
                        total += nla_total_size(
                                link_ops->get_linkxstats_size(dev,
                                                              IFLA_STATS_LINK_XSTATS));
                        /* for IFLA_STATS_LINK_XSTATS */
                        total += nla_total_size(0);
                }
        }

        if (stats_attr_valid(outer_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) {
                const struct rtnl_link_ops *master_ops;
                const struct net_device *upper;

                /* netdev_master_upper_dev_get can't take const */
                upper = netdev_master_upper_dev_get((struct net_device *)dev);
                master_ops = upper ? upper->rtnl_link_ops : NULL;

                if (master_ops && master_ops->get_linkxstats_size) {
                        total += nla_total_size(
                                master_ops->get_linkxstats_size(dev,
                                                                IFLA_STATS_LINK_XSTATS_SLAVE));
                        /* for IFLA_STATS_LINK_XSTATS_SLAVE */
                        total += nla_total_size(0);
                }
        }

        if (stats_attr_valid(outer_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
                total += rtnl_offload_xstats_get_size(dev,
                                filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]);

        if (stats_attr_valid(outer_mask, IFLA_STATS_AF_SPEC, 0)) {
                struct rtnl_af_ops *af_ops;

                /* for IFLA_STATS_AF_SPEC */
                total += nla_total_size(0);

                rcu_read_lock();
                list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
                        if (!af_ops->get_stats_af_size)
                                continue;

                        total += nla_total_size(af_ops->get_stats_af_size(dev));
                        /* for AF_* */
                        total += nla_total_size(0);
                }
                rcu_read_unlock();
        }

        return total;
}

/* Bitmask with one bit set for every value below __IFLA_OFFLOAD_XSTATS_MAX;
 * used as the set of bits allowed in an offload xstats filter.
 */
#define RTNL_STATS_OFFLOAD_XSTATS_VALID ((1 << __IFLA_OFFLOAD_XSTATS_MAX) - 1)

/* Policy for the attributes nested inside IFLA_STATS_GET_FILTERS.  The
 * table is indexed by IFLA_STATS_* attribute id; the offload xstats entry
 * is a u32 restricted to RTNL_STATS_OFFLOAD_XSTATS_VALID.
 */
static const struct nla_policy
rtnl_stats_get_policy_filters[IFLA_STATS_MAX + 1] = {
        [IFLA_STATS_LINK_OFFLOAD_XSTATS] =
                    NLA_POLICY_MASK(NLA_U32, RTNL_STATS_OFFLOAD_XSTATS_VALID),
};

/* Top-level attribute policy used when parsing stats get/dump requests. */
static const struct nla_policy
rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = {
        [IFLA_STATS_GET_FILTERS] =
                    NLA_POLICY_NESTED(rtnl_stats_get_policy_filters),
};

/* Top-level attribute policy used by rtnl_stats_set(): the L3 stats toggle
 * is a u8 limited to the values 0 and 1.
 */
static const struct nla_policy
ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = {
        [IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS] = NLA_POLICY_MAX(NLA_U8, 1),
};

/* Parse the nested IFLA_STATS_GET_FILTERS attribute into @filters.
 *
 * Every nested attribute supplies the sub-filter mask for the stats
 * attribute with the same id.  It is rejected with -EINVAL when that
 * attribute is not enabled in the outer mask (filters->mask[0]).
 */
static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters,
                                        struct rtnl_stats_dump_filters *filters,
                                        struct netlink_ext_ack *extack)
{
        struct nlattr *attrs[IFLA_STATS_MAX + 1];
        int id;
        int rc;

        rc = nla_parse_nested(attrs, IFLA_STATS_MAX, ifla_filters,
                              rtnl_stats_get_policy_filters, extack);
        if (rc < 0)
                return rc;

        for (id = 1; id <= IFLA_STATS_MAX; id++) {
                if (!attrs[id])
                        continue;

                if (!(filters->mask[0] & IFLA_STATS_FILTER_BIT(id))) {
                        NL_SET_ERR_MSG(extack, "Filtered attribute not enabled in filter_mask");
                        return -EINVAL;
                }

                filters->mask[id] = nla_get_u32(attrs[id]);
        }

        return 0;
}

/* Initialize @filters from a stats get/dump request.
 *
 * The outer mask is taken from @filter_mask and every nested mask starts
 * out with all bits set; an optional IFLA_STATS_GET_FILTERS attribute in
 * @nlh may then narrow the nested masks.
 *
 * Returns 0 on success or a negative errno from attribute parsing.
 */
static int rtnl_stats_get_parse(const struct nlmsghdr *nlh,
                                u32 filter_mask,
                                struct rtnl_stats_dump_filters *filters,
                                struct netlink_ext_ack *extack)
{
        struct nlattr *attrs[IFLA_STATS_GETSET_MAX + 1];
        unsigned int slot;
        int rc;

        filters->mask[0] = filter_mask;
        for (slot = 1; slot < ARRAY_SIZE(filters->mask); slot++)
                filters->mask[slot] = -1U;

        rc = nlmsg_parse(nlh, sizeof(struct if_stats_msg), attrs,
                         IFLA_STATS_GETSET_MAX, rtnl_stats_get_policy, extack);
        if (rc < 0)
                return rc;

        if (!attrs[IFLA_STATS_GET_FILTERS])
                return 0;

        return rtnl_stats_get_parse_filters(attrs[IFLA_STATS_GET_FILTERS],
                                            filters, extack);
}

/* Validate the header of a stats request.
 *
 * The message must be large enough to hold a struct if_stats_msg.  With
 * @strict_check the padding fields must be zero, a dump (@is_dump) must
 * not name an ifindex, and the filter mask must not contain bits beyond
 * IFLA_STATS_MAX.
 *
 * Returns 0 when the request is acceptable, -EINVAL otherwise.
 */
static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
                                bool is_dump, struct netlink_ext_ack *extack)
{
        const struct if_stats_msg *hdr;
        bool bad_header;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*hdr))) {
                NL_SET_ERR_MSG(extack, "Invalid header for stats dump");
                return -EINVAL;
        }

        if (!strict_check)
                return 0;

        hdr = nlmsg_data(nlh);

        /* only requests using strict checks can pass data to influence
         * the dump. The legacy exception is filter_mask.
         */
        bad_header = hdr->pad1 || hdr->pad2 || (is_dump && hdr->ifindex);
        if (bad_header) {
                NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request");
                return -EINVAL;
        }

        if (hdr->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) {
                NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask");
                return -EINVAL;
        }

        return 0;
}

/* Handle a stats get request for a single device.
 *
 * The request must name a positive ifindex of an existing device and a
 * non-zero filter mask.  A reply message is built in a freshly allocated
 * skb and unicast back to the sender.
 *
 * Returns 0 or the result of the unicast on success, a negative errno
 * otherwise.
 */
static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct rtnl_stats_dump_filters filters;
        int attr_cursor = 0, priv_cursor = 0;
        struct if_stats_msg *ifsm;
        struct net_device *dev;
        struct sk_buff *reply;
        int err;

        err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb),
                                   false, extack);
        if (err)
                return err;

        ifsm = nlmsg_data(nlh);
        if (ifsm->ifindex <= 0)
                return -EINVAL;

        dev = __dev_get_by_index(net, ifsm->ifindex);
        if (!dev)
                return -ENODEV;

        if (!ifsm->filter_mask) {
                NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get");
                return -EINVAL;
        }

        err = rtnl_stats_get_parse(nlh, ifsm->filter_mask, &filters, extack);
        if (err)
                return err;

        reply = nlmsg_new(if_nlmsg_stats_size(dev, &filters), GFP_KERNEL);
        if (!reply)
                return -ENOBUFS;

        err = rtnl_fill_statsinfo(reply, dev, RTM_NEWSTATS,
                                  NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
                                  0, &filters, &attr_cursor, &priv_cursor,
                                  extack);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(reply);
                return err;
        }

        return rtnl_unicast(reply, net, NETLINK_CB(skb).portid);
}

/* Dump callback emitting one stats message per network device.
 *
 * Progress (current ifindex plus the attribute/private resume markers of
 * rtnl_fill_statsinfo()) is kept in cb->ctx.  The two markers are cleared
 * after every device that was filled completely.
 *
 * Returns 0 when the walk finished, or the negative errno that stopped it.
 */
static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct {
                unsigned long ifindex;
                int idxattr;
                int prividx;
        } *state = (void *)cb->ctx;
        struct rtnl_stats_dump_filters filters;
        struct net *net = sock_net(skb->sk);
        struct if_stats_msg *hdr;
        struct net_device *dev;
        int err;

        cb->seq = net->dev_base_seq;

        err = rtnl_valid_stats_req(cb->nlh, cb->strict_check, true,
                                   cb->extack);
        if (err)
                return err;

        hdr = nlmsg_data(cb->nlh);
        if (!hdr->filter_mask) {
                NL_SET_ERR_MSG(cb->extack, "Filter mask must be set for stats dump");
                return -EINVAL;
        }

        err = rtnl_stats_get_parse(cb->nlh, hdr->filter_mask, &filters,
                                   cb->extack);
        if (err)
                return err;

        for_each_netdev_dump(net, dev, state->ifindex) {
                err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS,
                                          NETLINK_CB(cb->skb).portid,
                                          cb->nlh->nlmsg_seq, 0,
                                          NLM_F_MULTI, &filters,
                                          &state->idxattr, &state->prividx,
                                          cb->extack);
                /* Running out of room before anything was written means
                 * even a single message does not fit.
                 */
                WARN_ON((err == -EMSGSIZE) && (skb->len == 0));

                if (err < 0)
                        break;

                state->prividx = 0;
                state->idxattr = 0;
                nl_dump_check_consistent(cb, nlmsg_hdr(skb));
        }

        return err;
}

/* Send an RTM_NEWSTATS notification for @dev to RTNLGRP_STATS that carries
 * only the IFLA_OFFLOAD_XSTATS_HW_S_INFO part of the offload xstats.
 *
 * Must be called with RTNL held.  When the message cannot be allocated or
 * filled, the error is reported to the group via rtnl_set_sk_err().
 */
void rtnl_offload_xstats_notify(struct net_device *dev)
{
        struct rtnl_stats_dump_filters response_filters = {};
        int attr_cursor = 0, priv_cursor = 0;
        struct net *net = dev_net(dev);
        struct sk_buff *skb;
        int err;

        ASSERT_RTNL();

        response_filters.mask[0] =
                IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS);
        response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] =
                IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO);

        skb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters),
                        GFP_KERNEL);
        if (!skb) {
                rtnl_set_sk_err(net, RTNLGRP_STATS, -ENOBUFS);
                return;
        }

        err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, 0, 0, 0, 0,
                                  &response_filters, &attr_cursor,
                                  &priv_cursor, NULL);
        if (err < 0) {
                kfree_skb(skb);
                rtnl_set_sk_err(net, RTNLGRP_STATS, err);
                return;
        }

        rtnl_notify(skb, net, 0, RTNLGRP_STATS, NULL, GFP_KERNEL);
}
EXPORT_SYMBOL(rtnl_offload_xstats_notify);

static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
{
        enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
        struct rtnl_stats_dump_filters response_filters = {};
        struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1];
        struct net *net = sock_net(skb->sk);
        struct net_device *dev = NULL;
        struct if_stats_msg *ifsm;
        bool notify = false;
        int err;

        err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb),
                                   false, extack);
        if (err)
                return err;

        ifsm = nlmsg_data(nlh);
        if (ifsm->family != AF_UNSPEC) {
                NL_SET_ERR_MSG(extack, "Address family should be AF_UNSPEC");
                return -EINVAL;
        }

        if (ifsm->ifindex > 0)
                dev = __dev_get_by_index(net, ifsm->ifindex);
        else
                return -EINVAL;

        if (!dev)
                return -ENODEV;

        if (ifsm->filter_mask) {
                NL_SET_ERR_MSG(extack, "Filter mask must be 0 for stats set");
                return -EINVAL;
        }

        err = nlmsg_parse(nlh, sizeof(*ifsm), tb, IFLA_STATS_GETSET_MAX,
                          ifla_stats_set_policy, extack);
        if (err < 0)
                return err;

        if (tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]) {
                u8 req = nla_get_u8(tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]);

                if (req)
                        err = netdev_offload_xstats_enable(dev, t_l3, extack);
                else
                        err = netdev_offload_xstats_disable(dev, t_l3);

                if (!err)
                        notify = true;
                else if (err != -EALREADY)
                        return err;

                response_filters.mask[0] |=
                        IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS);
                response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |=
                        IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO);
        }

        if (notify)
                rtnl_offload_xstats_notify(dev);

        return 0;
}

static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh,
                                   struct netlink_ext_ack *extack)
{
        struct br_port_msg *bpm;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) {
                NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request");
                return -EINVAL;
        }

        bpm = nlmsg_data(nlh);
        if (bpm->ifindex) {
                NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request");
                return -EINVAL;
        }
        if (nlmsg_attrlen(nlh, sizeof(*bpm))) {
                NL_SET_ERR_MSG(extack, "Invalid data after header in mdb dump request");
                return -EINVAL;
        }

        return 0;
}

/* Dump state that rtnl_mdb_dump() overlays on netlink_callback::ctx. */
struct rtnl_mdb_dump_ctx {
        /* Position in the device walk at which the dump resumes. */
        long idx;
};

/* RTM_GETMDB dump handler: walk every netdev of the namespace and let each
 * device that implements ndo_mdb_dump() append its entries to @skb.
 *
 * The walk position is kept in struct rtnl_mdb_dump_ctx (overlaid on
 * cb->ctx) so that a dump interrupted by -EMSGSIZE resumes at the same
 * device on the next call. Errors other than -EMSGSIZE returned by a
 * device callback do not stop the walk.
 *
 * Returns skb->len, or the validation error when strict checking is
 * enabled and the request is malformed.
 */
static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct rtnl_mdb_dump_ctx *ctx = (void *)cb->ctx;
        struct net *net = sock_net(skb->sk);
        struct net_device *dev;
        int resume_at;
        int pos = 0;
        int err;

        /* Compile-time check (per the macro name) that the context struct
         * fits into cb->ctx.
         */
        NL_ASSERT_DUMP_CTX_FITS(struct rtnl_mdb_dump_ctx);

        if (cb->strict_check) {
                err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack);
                if (err)
                        return err;
        }

        resume_at = ctx->idx;

        for_each_netdev(net, dev) {
                /* Devices before the resume point were already dumped;
                 * devices without the callback have nothing to report.
                 */
                if (pos >= resume_at && dev->netdev_ops->ndo_mdb_dump) {
                        err = dev->netdev_ops->ndo_mdb_dump(dev, skb, cb);
                        if (err == -EMSGSIZE)
                                break;

                        /* This device is done. Markers and sequence
                         * counters are maintained per-device, so wipe them
                         * before moving on. This also clears ctx->idx,
                         * which is rewritten below.
                         */
                        memset(cb->ctx, 0, sizeof(cb->ctx));
                        cb->prev_seq = 0;
                        cb->seq = 0;
                }
                pos++;
        }

        ctx->idx = pos;
        return skb->len;
}

static int rtnl_validate_mdb_entry_get(const struct nlattr *attr,
                                       struct netlink_ext_ack *extack)
{
        struct br_mdb_entry *entry = nla_data(attr);

        if (nla_len(attr) != sizeof(struct br_mdb_entry)) {
                NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length");
                return -EINVAL;
        }

        if (entry->ifindex) {
                NL_SET_ERR_MSG(extack, "Entry ifindex cannot be specified");
                return -EINVAL;
        }

        if (entry->state) {
                NL_SET_ERR_MSG(extack, "Entry state cannot be specified");
                return -EINVAL;
        }

        if (entry->flags) {
                NL_SET_ERR_MSG(extack, "Entry flags cannot be specified");
                return -EINVAL;
        }

        if (entry->vid >= VLAN_VID_MASK) {
                NL_SET_ERR_MSG(extack, "Invalid entry VLAN id");
                return -EINVAL;
        }

        if (entry->addr.proto != htons(ETH_P_IP) &&
            entry->addr.proto != htons(ETH_P_IPV6) &&
            entry->addr.proto != 0) {
                NL_SET_ERR_MSG(extack, "Unknown entry protocol");
                return -EINVAL;
        }

        return 0;
}

/* Attribute policy applied by rtnl_mdb_get() when parsing a request. */
static const struct nla_policy mdba_get_policy[MDBA_GET_ENTRY_MAX + 1] = {
        /* Binary struct br_mdb_entry, checked by rtnl_validate_mdb_entry_get();
         * the policy's length argument is sizeof(struct br_mdb_entry) and the
         * callback additionally insists on that exact size.
         */
        [MDBA_GET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
                                                  rtnl_validate_mdb_entry_get,
                                                  sizeof(struct br_mdb_entry)),
        /* Nested attributes; not interpreted here, passed on in tb[] */
        [MDBA_GET_ENTRY_ATTRS] = { .type = NLA_NESTED },
};

/* Doit handler registered for PF_BRIDGE RTM_GETMDB: query one MDB entry.
 *
 * Parses the request against mdba_get_policy, resolves the device named in
 * the br_port_msg header and forwards the parsed attributes, the sender's
 * port ID and the request sequence number to the device's ndo_mdb_get().
 *
 * Returns the callback's result, or a negative errno: the parse error,
 * -EINVAL (zero ifindex or missing MDBA_GET_ENTRY), -ENODEV (unknown
 * device) or -EOPNOTSUPP (device lacks ndo_mdb_get).
 */
static int rtnl_mdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct nlattr *attrs[MDBA_GET_ENTRY_MAX + 1];
        struct net *net = sock_net(in_skb->sk);
        struct net_device *netdev;
        struct br_port_msg *hdr;
        int ret;

        ret = nlmsg_parse(nlh, sizeof(*hdr), attrs, MDBA_GET_ENTRY_MAX,
                          mdba_get_policy, extack);
        if (ret != 0)
                return ret;

        hdr = nlmsg_data(nlh);
        if (hdr->ifindex == 0) {
                NL_SET_ERR_MSG(extack, "Invalid ifindex");
                return -EINVAL;
        }

        netdev = __dev_get_by_index(net, hdr->ifindex);
        if (netdev == NULL) {
                NL_SET_ERR_MSG(extack, "Device doesn't exist");
                return -ENODEV;
        }

        /* The entry attribute is mandatory; the macro also records the
         * missing attribute in extack before the message is set.
         */
        if (NL_REQ_ATTR_CHECK(extack, NULL, attrs, MDBA_GET_ENTRY)) {
                NL_SET_ERR_MSG(extack, "Missing MDBA_GET_ENTRY attribute");
                return -EINVAL;
        }

        if (netdev->netdev_ops->ndo_mdb_get == NULL) {
                NL_SET_ERR_MSG(extack, "Device does not support MDB operations");
                return -EOPNOTSUPP;
        }

        return netdev->netdev_ops->ndo_mdb_get(netdev, attrs,
                                               NETLINK_CB(in_skb).portid,
                                               nlh->nlmsg_seq, extack);
}

static int rtnl_validate_mdb_entry(const struct nlattr *attr,
                                   struct netlink_ext_ack *extack)
{
        struct br_mdb_entry *entry = nla_data(attr);

        if (nla_len(attr) != sizeof(struct br_mdb_entry)) {
                NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length");
                return -EINVAL;
        }

        if (entry->ifindex == 0) {
                NL_SET_ERR_MSG(extack, "Zero entry ifindex is not allowed");
                return -EINVAL;
        }

        if (entry->addr.proto == htons(ETH_P_IP)) {
                if (!ipv4_is_multicast(entry->addr.u.ip4) &&
                    !ipv4_is_zeronet(entry->addr.u.ip4)) {
                        NL_SET_ERR_MSG(extack, "IPv4 entry group address is not multicast or 0.0.0.0");
                        return -EINVAL;
                }
                if (ipv4_is_local_multicast(entry->addr.u.ip4)) {
                        NL_SET_ERR_MSG(extack, "IPv4 entry group address is local multicast");
                        return -EINVAL;
                }
#if IS_ENABLED(CONFIG_IPV6)
        } else if (entry->addr.proto == htons(ETH_P_IPV6)) {
                if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6)) {
                        NL_SET_ERR_MSG(extack, "IPv6 entry group address is link-local all nodes");
                        return -EINVAL;
                }
#endif
        } else if (entry->addr.proto == 0) {
                /* L2 mdb */
                if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) {
                        NL_SET_ERR_MSG(extack, "L2 entry group is not multicast");
                        return -EINVAL;
                }
        } else {
                NL_SET_ERR_MSG(extack, "Unknown entry protocol");
                return -EINVAL;
        }

        if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) {
                NL_SET_ERR_MSG(extack, "Unknown entry state");
                return -EINVAL;
        }
        if (entry->vid >= VLAN_VID_MASK) {
                NL_SET_ERR_MSG(extack, "Invalid entry VLAN id");
                return -EINVAL;
        }

        return 0;
}

/* Attribute policy used by rtnl_mdb_add() and by the non-bulk path of
 * rtnl_mdb_del().
 */
static const struct nla_policy mdba_policy[MDBA_SET_ENTRY_MAX + 1] = {
        /* Marks MDBA_SET_ENTRY_ATTRS + 1 as the first attribute type subject
         * to strict validation.
         */
        [MDBA_SET_ENTRY_UNSPEC] = { .strict_start_type = MDBA_SET_ENTRY_ATTRS + 1 },
        /* Binary struct br_mdb_entry, checked by rtnl_validate_mdb_entry() */
        [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
                                                  rtnl_validate_mdb_entry,
                                                  sizeof(struct br_mdb_entry)),
        /* Nested attributes; not interpreted here, passed on in tb[] */
        [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED },
};

/* Doit handler registered for PF_BRIDGE RTM_NEWMDB: add an MDB entry.
 *
 * Parses the request against mdba_policy, resolves the device named in the
 * br_port_msg header and forwards the parsed attributes and the netlink
 * message flags to the device's ndo_mdb_add().
 *
 * Returns the callback's result, or a negative errno: the parse error,
 * -EINVAL (zero ifindex or missing MDBA_SET_ENTRY), -ENODEV (unknown
 * device) or -EOPNOTSUPP (device lacks ndo_mdb_add).
 */
static int rtnl_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct nlattr *attrs[MDBA_SET_ENTRY_MAX + 1];
        struct net *net = sock_net(skb->sk);
        struct net_device *netdev;
        struct br_port_msg *hdr;
        int ret;

        ret = nlmsg_parse_deprecated(nlh, sizeof(struct br_port_msg), attrs,
                                     MDBA_SET_ENTRY_MAX, mdba_policy, extack);
        if (ret != 0)
                return ret;

        hdr = nlmsg_data(nlh);
        if (hdr->ifindex == 0) {
                NL_SET_ERR_MSG(extack, "Invalid ifindex");
                return -EINVAL;
        }

        netdev = __dev_get_by_index(net, hdr->ifindex);
        if (netdev == NULL) {
                NL_SET_ERR_MSG(extack, "Device doesn't exist");
                return -ENODEV;
        }

        /* The entry attribute is mandatory; the macro also records the
         * missing attribute in extack before the message is set.
         */
        if (NL_REQ_ATTR_CHECK(extack, NULL, attrs, MDBA_SET_ENTRY)) {
                NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute");
                return -EINVAL;
        }

        if (netdev->netdev_ops->ndo_mdb_add == NULL) {
                NL_SET_ERR_MSG(extack, "Device does not support MDB operations");
                return -EOPNOTSUPP;
        }

        return netdev->netdev_ops->ndo_mdb_add(netdev, attrs, nlh->nlmsg_flags,
                                               extack);
}

static int rtnl_validate_mdb_entry_del_bulk(const struct nlattr *attr,
                                            struct netlink_ext_ack *extack)
{
        struct br_mdb_entry *entry = nla_data(attr);
        struct br_mdb_entry zero_entry = {};

        if (nla_len(attr) != sizeof(struct br_mdb_entry)) {
                NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length");
                return -EINVAL;
        }

        if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) {
                NL_SET_ERR_MSG(extack, "Unknown entry state");
                return -EINVAL;
        }

        if (entry->flags) {
                NL_SET_ERR_MSG(extack, "Entry flags cannot be set");
                return -EINVAL;
        }

        if (entry->vid >= VLAN_N_VID - 1) {
                NL_SET_ERR_MSG(extack, "Invalid entry VLAN id");
                return -EINVAL;
        }

        if (memcmp(&entry->addr, &zero_entry.addr, sizeof(entry->addr))) {
                NL_SET_ERR_MSG(extack, "Entry address cannot be set");
                return -EINVAL;
        }

        return 0;
}

/* Attribute policy used by rtnl_mdb_del() when the request carries
 * NLM_F_BULK.
 */
static const struct nla_policy mdba_del_bulk_policy[MDBA_SET_ENTRY_MAX + 1] = {
        /* Binary struct br_mdb_entry, checked by
         * rtnl_validate_mdb_entry_del_bulk()
         */
        [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
                                                  rtnl_validate_mdb_entry_del_bulk,
                                                  sizeof(struct br_mdb_entry)),
        /* Nested attributes; not interpreted here, passed on in tb[] */
        [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED },
};

/* Doit handler registered for PF_BRIDGE RTM_DELMDB: delete one MDB entry
 * or, when NLM_F_BULK is set, a set of entries.
 *
 * A bulk request is parsed with nlmsg_parse() against mdba_del_bulk_policy
 * and dispatched to ndo_mdb_del_bulk(); any other request is parsed with
 * nlmsg_parse_deprecated() against mdba_policy and dispatched to
 * ndo_mdb_del().
 *
 * Returns the callback's result, or a negative errno: the parse error,
 * -EINVAL (zero ifindex or missing MDBA_SET_ENTRY), -ENODEV (unknown
 * device) or -EOPNOTSUPP (device lacks the required callback).
 */
static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        const bool bulk = (nlh->nlmsg_flags & NLM_F_BULK) != 0;
        struct nlattr *attrs[MDBA_SET_ENTRY_MAX + 1];
        struct net *net = sock_net(skb->sk);
        struct net_device *netdev;
        struct br_port_msg *hdr;
        int ret;

        if (bulk)
                ret = nlmsg_parse(nlh, sizeof(struct br_port_msg), attrs,
                                  MDBA_SET_ENTRY_MAX, mdba_del_bulk_policy,
                                  extack);
        else
                ret = nlmsg_parse_deprecated(nlh, sizeof(struct br_port_msg),
                                             attrs, MDBA_SET_ENTRY_MAX,
                                             mdba_policy, extack);
        if (ret != 0)
                return ret;

        hdr = nlmsg_data(nlh);
        if (hdr->ifindex == 0) {
                NL_SET_ERR_MSG(extack, "Invalid ifindex");
                return -EINVAL;
        }

        netdev = __dev_get_by_index(net, hdr->ifindex);
        if (netdev == NULL) {
                NL_SET_ERR_MSG(extack, "Device doesn't exist");
                return -ENODEV;
        }

        /* The entry attribute is mandatory in both modes. */
        if (NL_REQ_ATTR_CHECK(extack, NULL, attrs, MDBA_SET_ENTRY)) {
                NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute");
                return -EINVAL;
        }

        if (!bulk) {
                if (netdev->netdev_ops->ndo_mdb_del == NULL) {
                        NL_SET_ERR_MSG(extack, "Device does not support MDB operations");
                        return -EOPNOTSUPP;
                }
                return netdev->netdev_ops->ndo_mdb_del(netdev, attrs, extack);
        }

        if (netdev->netdev_ops->ndo_mdb_del_bulk == NULL) {
                NL_SET_ERR_MSG(extack, "Device does not support MDB bulk deletion");
                return -EOPNOTSUPP;
        }

        return netdev->netdev_ops->ndo_mdb_del_bulk(netdev, attrs, extack);
}

/* Dump wrappers and processing of incoming rtnetlink messages. */

static int rtnl_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
        const bool needs_lock = !(cb->flags & RTNL_FLAG_DUMP_UNLOCKED);
        rtnl_dumpit_func dumpit = cb->data;
        int err;

        /* Previous iteration have already finished, avoid calling->dumpit()
         * again, it may not expect to be called after it reached the end.
         */
        if (!dumpit)
                return 0;

        if (needs_lock)
                rtnl_lock();
        err = dumpit(skb, cb);
        if (needs_lock)
                rtnl_unlock();

        /* Old dump handlers used to send NLM_DONE as in a separate recvmsg().
         * Some applications which parse netlink manually depend on this.
         */
        if (cb->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) {
                if (err < 0 && err != -EMSGSIZE)
                        return err;
                if (!err)
                        cb->data = NULL;

                return skb->len;
        }
        return err;
}

static int rtnetlink_dump_start(struct sock *ssk, struct sk_buff *skb,
                                const struct nlmsghdr *nlh,
                                struct netlink_dump_control *control)
{
        if (control->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE ||
            !(control->flags & RTNL_FLAG_DUMP_UNLOCKED)) {
                WARN_ON(control->data);
                control->data = control->dump;
                control->dump = rtnl_dumpit;
        }

        return netlink_dump_start(ssk, skb, nlh, control);
}

/* Process one rtnetlink message.
 *
 * Looks up the handler registered for the message's family and type
 * (falling back to PF_UNSPEC) and either starts a dump, for GET messages
 * carrying NLM_F_DUMP, or calls the doit handler.
 *
 * Returns the handler's or dump start's result, or:
 *   -EOPNOTSUPP     type above RTM_MAX, no suitable handler, or a bulk
 *                   delete sent to a handler without
 *                   RTNL_FLAG_BULK_DEL_SUPPORTED
 *   -EPERM          non-GET message from a sender lacking CAP_NET_ADMIN
 *   -EPROTONOSUPPORT the handler's module reference could not be taken
 *   0               message payload shorter than struct rtgenmsg
 */
static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct rtnl_link *link;
        enum rtnl_kinds kind;
        struct module *owner;
        int err = -EOPNOTSUPP;
        rtnl_doit_func doit;
        unsigned int flags;
        int family;
        int type;

        type = nlh->nlmsg_type;
        if (type > RTM_MAX)
                return -EOPNOTSUPP;

        /* From here on, type is an offset relative to RTM_BASE */
        type -= RTM_BASE;

        /* All the messages must have at least 1 byte length */
        if (nlmsg_len(nlh) < sizeof(struct rtgenmsg))
                return 0;

        family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
        kind = rtnl_msgtype_kind(type);

        /* Everything except GET requires CAP_NET_ADMIN */
        if (kind != RTNL_KIND_GET && !netlink_net_capable(skb, CAP_NET_ADMIN))
                return -EPERM;

        rcu_read_lock();
        if (kind == RTNL_KIND_GET && (nlh->nlmsg_flags & NLM_F_DUMP)) {
                struct sock *rtnl;
                rtnl_dumpit_func dumpit;
                u32 min_dump_alloc = 0;

                /* Prefer the family's own dump handler, else PF_UNSPEC's */
                link = rtnl_get_link(family, type);
                if (!link || !link->dumpit) {
                        family = PF_UNSPEC;
                        link = rtnl_get_link(family, type);
                        if (!link || !link->dumpit)
                                goto err_unlock;
                }
                /* Copy what is needed out of link while still inside the
                 * RCU read-side section.
                 */
                owner = link->owner;
                dumpit = link->dumpit;
                flags = link->flags;

                /* Only RTM_GETLINK dumps request a minimum allocation size */
                if (type == RTM_GETLINK - RTM_BASE)
                        min_dump_alloc = rtnl_calcit(skb, nlh);

                err = 0;
                /* need to do this before rcu_read_unlock() */
                if (!try_module_get(owner))
                        err = -EPROTONOSUPPORT;

                rcu_read_unlock();

                rtnl = net->rtnl;
                if (err == 0) {
                        struct netlink_dump_control c = {
                                .dump                = dumpit,
                                .min_dump_alloc        = min_dump_alloc,
                                .module                = owner,
                                .flags                = flags,
                        };
                        err = rtnetlink_dump_start(rtnl, skb, nlh, &c);
                        /* netlink_dump_start() will keep a reference on
                         * module if dump is still in progress.
                         */
                        module_put(owner);
                }
                return err;
        }

        /* Non-dump path: prefer the family's own doit handler, else
         * PF_UNSPEC's. err still holds -EOPNOTSUPP if neither exists.
         */
        link = rtnl_get_link(family, type);
        if (!link || !link->doit) {
                family = PF_UNSPEC;
                link = rtnl_get_link(PF_UNSPEC, type);
                if (!link || !link->doit)
                        goto out_unlock;
        }

        owner = link->owner;
        if (!try_module_get(owner)) {
                err = -EPROTONOSUPPORT;
                goto out_unlock;
        }

        flags = link->flags;
        if (kind == RTNL_KIND_DEL && (nlh->nlmsg_flags & NLM_F_BULK) &&
            !(flags & RTNL_FLAG_BULK_DEL_SUPPORTED)) {
                NL_SET_ERR_MSG(extack, "Bulk delete is not supported");
                module_put(owner);
                goto err_unlock;
        }

        /* Handlers flagged RTNL_FLAG_DOIT_UNLOCKED run without rtnl_lock();
         * the function pointer is fetched before leaving the RCU section.
         */
        if (flags & RTNL_FLAG_DOIT_UNLOCKED) {
                doit = link->doit;
                rcu_read_unlock();
                if (doit)
                        err = doit(skb, nlh, extack);
                module_put(owner);
                return err;
        }
        rcu_read_unlock();

        /* Locked handlers: the link entry is looked up again under
         * rtnl_lock() rather than reusing the pointer obtained under RCU.
         * If it is gone by now, err keeps its initial -EOPNOTSUPP.
         */
        rtnl_lock();
        link = rtnl_get_link(family, type);
        if (link && link->doit)
                err = link->doit(skb, nlh, extack);
        rtnl_unlock();

        module_put(owner);

        return err;

/* Leave the RCU section and report whatever err holds */
out_unlock:
        rcu_read_unlock();
        return err;

/* Leave the RCU section and report -EOPNOTSUPP regardless of err */
err_unlock:
        rcu_read_unlock();
        return -EOPNOTSUPP;
}

/* Input callback of the rtnetlink kernel socket (set as cfg.input in
 * rtnetlink_net_init()): hands @skb to netlink_rcv_skb() with
 * rtnetlink_rcv_msg() as the per-message handler.
 */
static void rtnetlink_rcv(struct sk_buff *skb)
{
        netlink_rcv_skb(skb, rtnetlink_rcv_msg);
}

static int rtnetlink_bind(struct net *net, int group)
{
        switch (group) {
        case RTNLGRP_IPV4_MROUTE_R:
        case RTNLGRP_IPV6_MROUTE_R:
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        return -EPERM;
                break;
        }
        return 0;
}

/* Netdevice notifier callback: for the events listed below, emit an
 * RTM_NEWLINK message for the affected device via rtmsg_ifinfo_event().
 * All other events are ignored.
 *
 * Always returns NOTIFY_DONE.
 */
static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        switch (event) {
        case NETDEV_REBOOT:
        case NETDEV_CHANGEMTU:
        case NETDEV_CHANGEADDR:
        case NETDEV_CHANGENAME:
        case NETDEV_FEAT_CHANGE:
        case NETDEV_BONDING_FAILOVER:
        case NETDEV_POST_TYPE_CHANGE:
        case NETDEV_NOTIFY_PEERS:
        case NETDEV_CHANGEUPPER:
        case NETDEV_RESEND_IGMP:
        case NETDEV_CHANGEINFODATA:
        case NETDEV_CHANGELOWERSTATE:
        case NETDEV_CHANGE_TX_QUEUE_LEN:
                /* Reported below */
                break;
        default:
                /* Not an event this notifier reports */
                return NOTIFY_DONE;
        }

        rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event),
                           GFP_KERNEL, NULL, 0, 0, NULL);

        return NOTIFY_DONE;
}

/* Notifier block that routes netdevice events to rtnetlink_event();
 * registered in rtnetlink_init().
 */
static struct notifier_block rtnetlink_dev_notifier = {
        .notifier_call        = rtnetlink_event,
};


/* Per-namespace init: create the NETLINK_ROUTE kernel socket for @net and
 * store it in net->rtnl.
 *
 * Returns 0 on success, -ENOMEM if the socket could not be created (in
 * which case net->rtnl is left untouched).
 */
static int __net_init rtnetlink_net_init(struct net *net)
{
        struct netlink_kernel_cfg cfg = {
                .input  = rtnetlink_rcv,
                .bind   = rtnetlink_bind,
                .groups = RTNLGRP_MAX,
                .flags  = NL_CFG_F_NONROOT_RECV,
        };
        struct sock *nlsk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg);

        if (nlsk) {
                net->rtnl = nlsk;
                return 0;
        }

        return -ENOMEM;
}

/* Per-namespace exit: release the rtnetlink kernel socket of @net and
 * clear the namespace's pointer to it.
 */
static void __net_exit rtnetlink_net_exit(struct net *net)
{
        struct sock *nlsk = net->rtnl;

        netlink_kernel_release(nlsk);
        net->rtnl = NULL;
}

/* Per-network-namespace hooks that create and release the rtnetlink
 * socket; registered in rtnetlink_init().
 */
static struct pernet_operations rtnetlink_net_ops = {
        .init = rtnetlink_net_init,
        .exit = rtnetlink_net_exit,
};

/* Boot-time initialisation of rtnetlink: registers the per-namespace
 * socket hooks (panicking if that fails), the netdevice notifier and the
 * message handlers defined in this file.
 *
 * Each rtnl_register() call names, in order: protocol family, message
 * type, doit handler, dumpit handler and flags.
 */
void __init rtnetlink_init(void)
{
        if (register_pernet_subsys(&rtnetlink_net_ops))
                panic("rtnetlink_init: cannot initialize rtnetlink\n");

        register_netdevice_notifier(&rtnetlink_dev_notifier);

        /* PF_UNSPEC link messages; the GETLINK dump uses the split
         * NLM_DONE behaviour implemented in rtnl_dumpit().
         */
        rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
                      rtnl_dump_ifinfo, RTNL_FLAG_DUMP_SPLIT_NLM_DONE);
        rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0);
        rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0);
        rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0);

        /* PF_UNSPEC dumps of addresses, routes and netconf all share
         * rtnl_dump_all; no doit handler is registered for them here.
         */
        rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0);
        rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0);
        rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0);

        /* Link property messages */
        rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0);
        rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0);

        /* PF_BRIDGE neighbour messages, served by the rtnl_fdb_* handlers;
         * deletion accepts NLM_F_BULK.
         */
        rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0);
        rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL,
                      RTNL_FLAG_BULK_DEL_SUPPORTED);
        rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0);

        /* PF_BRIDGE link messages; GETLINK is dump-only */
        rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0);
        rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0);
        rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0);

        /* Statistics messages */
        rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump,
                      0);
        rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0);

        /* PF_BRIDGE MDB messages; deletion accepts NLM_F_BULK */
        rtnl_register(PF_BRIDGE, RTM_GETMDB, rtnl_mdb_get, rtnl_mdb_dump, 0);
        rtnl_register(PF_BRIDGE, RTM_NEWMDB, rtnl_mdb_add, NULL, 0);
        rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL,
                      RTNL_FLAG_BULK_DEL_SUPPORTED);
}

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





















































































































































































































































































































































































































































































































































































































































































































































































































    1 


















    1 









































    1 














































































































































































































































































    1 












































































































































































































































    1 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
// SPDX-License-Identifier: GPL-2.0
/*
 *  ext4.h
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/include/linux/minix_fs.h
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#ifndef _EXT4_H
#define _EXT4_H

#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/magic.h>
#include <linux/jbd2.h>
#include <linux/quota.h>
#include <linux/rwsem.h>
#include <linux/rbtree.h>
#include <linux/seqlock.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
#include <linux/ratelimit.h>
#include <crypto/hash.h>
#include <linux/falloc.h>
#include <linux/percpu-rwsem.h>
#include <linux/fiemap.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif
#include <uapi/linux/ext4.h>

#include <linux/fscrypt.h>
#include <linux/fsverity.h>

#include <linux/compiler.h>

/*
 * The fourth extended filesystem constants/structures
 */

/*
 * with AGGRESSIVE_CHECK allocator runs consistency checks over
 * structures. these checks slow things down a lot
 */
#define AGGRESSIVE_CHECK__

/*
 * with DOUBLE_CHECK defined mballoc creates persistent in-core
 * bitmaps, maintains and uses them to check for double allocations
 */
#define DOUBLE_CHECK__

/*
 * Define EXT4FS_DEBUG to produce debug messages
 */
#undef EXT4FS_DEBUG

/*
 * Debug code
 */
#ifdef EXT4FS_DEBUG
/*
 * ext4_debug() - print a debug message at KERN_DEBUG level.
 *
 * Two printk() calls are made: the first emits an "EXT4-fs DEBUG" prefix
 * naming the file, line and function of the call site, the second emits
 * the caller's format string @f with its arguments.
 */
#define ext4_debug(f, a...)                                                \
        do {                                                                \
                printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",        \
                        __FILE__, __LINE__, __func__);                        \
                printk(KERN_DEBUG f, ## a);                                \
        } while (0)
#else
/* EXT4FS_DEBUG is not defined: the arguments are passed to no_printk(). */
#define ext4_debug(fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#endif

 /*
  * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c
  */
#define EXT_DEBUG__

/*
 * Dynamic printk for controlled extents debugging.
 */
#ifdef CONFIG_EXT4_DEBUG
/*
 * ext_debug() - pr_debug() wrapper for extent debugging.
 * @ino: inode pointer the message is about.  It is expanded twice (for
 *       ->i_sb->s_id and for ->i_ino), so it must have no side effects.
 * @fmt: printf-style format string appended to the standard prefix.
 *
 * The prefix carries the current task's comm and pid, the superblock id,
 * the inode number and the call site (file, line, function).
 *
 * @ino is parenthesized so that any pointer-valued expression, not only a
 * plain identifier, can be passed.
 */
#define ext_debug(ino, fmt, ...)                                        \
        pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt,        \
                 current->comm, task_pid_nr(current),                        \
                 (ino)->i_sb->s_id, (ino)->i_ino, __FILE__, __LINE__,        \
                 __func__, ##__VA_ARGS__)
#else
/* Debugging compiled out: @fmt and its arguments go to no_printk(); @ino is unused. */
#define ext_debug(ino, fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#endif

/*
 * ASSERT() - check a condition that must always hold.
 *
 * When the condition evaluates to false, the text of the failing
 * expression and the function, file and line of the call site are printed
 * at KERN_EMERG level and BUG() is invoked.  When it is true nothing else
 * happens.
 */
#define ASSERT(assert)                                                \
do {                                                                        \
        if (unlikely(!(assert))) {                                        \
                printk(KERN_EMERG                                        \
                       "Assertion failure in %s() at %s:%d: '%s'\n",        \
                       __func__, __FILE__, __LINE__, #assert);                \
                BUG();                                                        \
        }                                                                \
} while (0)

/* Block offset within a block group (a signed int). */
typedef int ext4_grpblk_t;

/* Filesystem-wide block number (unsigned long long). */
typedef unsigned long long ext4_fsblk_t;

/* Logical block number within a file (a __u32). */
typedef __u32 ext4_lblk_t;

/* Block group number (unsigned int). */
typedef unsigned int ext4_group_t;

/* Shift direction selector: SHIFT_LEFT is 0, SHIFT_RIGHT is 1. */
enum SHIFT_DIRECTION {
        SHIFT_LEFT  = 0,
        SHIFT_RIGHT = 1,
};

/*
 * For each criteria, mballoc has slightly different way of finding
 * the required blocks and usually, higher the criteria the slower the
 * allocation.  We start at lower criterias and keep falling back to
 * higher ones if we are not able to find any blocks.  Lower (earlier)
 * criteria are faster.
 */
enum criteria {
        /*
         * Chosen when the number of blocks needed is a power of 2.
         * Triggers no disk IO apart from prefetch, which makes it the
         * fastest criteria.
         */
        CR_POWER2_ALIGNED = 0,

        /*
         * Consults in-memory data structures to pick the group that
         * best satisfies the goal request.  No disk IO except block
         * prefetch.
         */
        CR_GOAL_LEN_FAST,

        /*
         * Like CR_GOAL_LEN_FAST, but may shrink the goal length to the
         * best available length so that allocation is faster.
         */
        CR_BEST_AVAIL_LEN,

        /*
         * Walks the block groups sequentially, doing disk IO where
         * necessary, to find a suitable block group.  Aims for the goal
         * length but might trim the request if nothing is found after
         * enough tries.
         */
        CR_GOAL_LEN_SLOW,

        /*
         * Takes the first free set of blocks it finds.  Only used in
         * the rare cases where CR_GOAL_LEN_SLOW also fails to allocate
         * anything.
         */
        CR_ANY_FREE,

        /*
         * Number of criterias defined above; must stay last.
         */
        EXT4_MB_NUM_CRS
};

/*
 * Flags used in mballoc's allocation_context flags field.
 *
 * Also used to show what's going on for debugging purposes when the
 * flag field is exported via the tracepoint interface
 */

/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE                      0x00000001
/* blocks are already reserved */
#define EXT4_MB_HINT_RESERVED                   0x00000002
/* the allocation is for metadata */
#define EXT4_MB_HINT_METADATA                   0x00000004
/* these are the first blocks in the file */
#define EXT4_MB_HINT_FIRST                      0x00000008
/* search for the best chunk */
#define EXT4_MB_HINT_BEST                       0x00000010
/* the allocation is for data */
#define EXT4_MB_HINT_DATA                       0x00000020
/* do not preallocate (used for tails) */
#define EXT4_MB_HINT_NOPREALLOC                 0x00000040
/* allocate for a locality group */
#define EXT4_MB_HINT_GROUP_ALLOC                0x00000080
/* allocate the goal blocks or nothing */
#define EXT4_MB_HINT_GOAL_ONLY                  0x00000100
/* the goal is meaningful */
#define EXT4_MB_HINT_TRY_GOAL                   0x00000200
/* blocks were already pre-reserved by delayed allocation */
#define EXT4_MB_DELALLOC_RESERVED               0x00000400
/* stream allocation is being done */
#define EXT4_MB_STREAM_ALLOC                    0x00000800
/* use the reserved root blocks if needed */
#define EXT4_MB_USE_ROOT_BLOCKS                 0x00001000
/* use blocks from the reserved pool */
#define EXT4_MB_USE_RESERVED                    0x00002000
/* do a strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK                    0x00004000
/*
 * Large fragment size list lookup succeeded at least once for
 * CR_POWER2_ALIGNED
 */
#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED     0x00008000
/*
 * Avg fragment size rb tree lookup succeeded at least once for
 * CR_GOAL_LEN_FAST
 */
#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED      0x00010000
/*
 * Avg fragment size rb tree lookup succeeded at least once for
 * CR_BEST_AVAIL_LEN
 */
#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED     0x00020000

/*
 * Describes one block allocation request: the inode and logical position
 * being allocated for, the nearest already-allocated neighbours on either
 * side (logical and physical), a physical goal hint and EXT4_MB_HINT_*
 * flags.
 */
struct ext4_allocation_request {
        /* target inode for block we're allocating */
        struct inode *inode;
        /* how many blocks we want to allocate */
        unsigned int len;
        /* logical block in target inode */
        ext4_lblk_t logical;
        /* the closest logical allocated block to the left */
        ext4_lblk_t lleft;
        /* the closest logical allocated block to the right */
        ext4_lblk_t lright;
        /* phys. target (a hint) */
        ext4_fsblk_t goal;
        /* phys. block for the closest logical allocated block to the left */
        ext4_fsblk_t pleft;
        /* phys. block for the closest logical allocated block to the right */
        ext4_fsblk_t pright;
        /* flags. see above EXT4_MB_HINT_* */
        unsigned int flags;
};

/*
 * Logical to physical block mapping, used by ext4_map_blocks()
 *
 * This structure is used to pass requests into ext4_map_blocks() as
 * well as to store the information returned by ext4_map_blocks().  It
 * takes less room on the stack than a struct buffer_head.
 */
/* Each EXT4_MAP_* flag is BIT() of the correspondingly named BH_* bit number. */
#define EXT4_MAP_NEW                BIT(BH_New)
#define EXT4_MAP_MAPPED                BIT(BH_Mapped)
#define EXT4_MAP_UNWRITTEN        BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY        BIT(BH_Boundary)
#define EXT4_MAP_DELAYED        BIT(BH_Delay)
/* Union of all five EXT4_MAP_* flags defined above. */
#define EXT4_MAP_FLAGS                (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
                                 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
                                 EXT4_MAP_DELAYED)

struct ext4_map_blocks {
        /* physical side of the mapping (filesystem-wide block number type) */
        ext4_fsblk_t m_pblk;
        /* logical side of the mapping (file logical block number type) */
        ext4_lblk_t m_lblk;
        /* NOTE(review): presumably the length of the mapping in blocks -- confirm */
        unsigned int m_len;
        /* NOTE(review): presumably a combination of EXT4_MAP_* bits -- confirm */
        unsigned int m_flags;
};

/*
 * Block validity checking, system zone rbtree.
 */
struct ext4_system_blocks {
        /* root of the system zone rbtree */
        struct rb_root root;
        /* NOTE(review): presumably used to free this structure via RCU -- confirm */
        struct rcu_head rcu;
};

/*
 * Flags for ext4_io_end->flag
 */
#define        EXT4_IO_END_UNWRITTEN        0x0001

/*
 * One extent of an I/O, described by its file offset and size.  Entries
 * are chained through ->list.
 */
struct ext4_io_end_vec {
        struct list_head list;                /* list of io_end_vec */
        loff_t offset;                        /* offset in the file */
        ssize_t size;                        /* size of the extent */
};

/*
 * For converting unwritten extents on a work queue. 'handle' is used for
 * buffered writeback.
 */
/* Also available under the typedef name ext4_io_end_t. */
typedef struct ext4_io_end {
        struct list_head        list;                /* per-file finished IO list */
        handle_t                *handle;        /* handle reserved for extent
                                                 * conversion */
        struct inode                *inode;                /* file being written to */
        struct bio                *bio;                /* Linked list of completed
                                                 * bios covering the extent */
        unsigned int                flag;                /* EXT4_IO_END_* bits
                                                 * (unwritten or not) */
        refcount_t                count;                /* reference counter */
        struct list_head        list_vec;        /* list of ext4_io_end_vec */
} ext4_io_end_t;

/*
 * NOTE(review): appears to bundle the state of an I/O submission in
 * progress.  The field roles below are inferred from names and types only
 * -- confirm against the users of this structure.
 */
struct ext4_io_submit {
        /* writeback control associated with the submission */
        struct writeback_control *io_wbc;
        /* presumably the bio currently being assembled -- confirm */
        struct bio                *io_bio;
        /* io_end (ext4_io_end_t) associated with the submission */
        ext4_io_end_t                *io_end;
        /* presumably the next block expected to be added -- confirm */
        sector_t                io_next_block;
};

/*
 * Special inodes numbers
 */
#define EXT4_BAD_INO            1       /* Bad blocks inode */
#define EXT4_ROOT_INO           2       /* Root inode */
#define EXT4_USR_QUOTA_INO      3       /* User quota inode */
#define EXT4_GRP_QUOTA_INO      4       /* Group quota inode */
#define EXT4_BOOT_LOADER_INO    5       /* Boot loader inode */
#define EXT4_UNDEL_DIR_INO      6       /* Undelete directory inode */
#define EXT4_RESIZE_INO         7       /* Reserved group descriptors inode */
#define EXT4_JOURNAL_INO        8       /* Journal inode */

/* First non-reserved inode number on old-revision ext4 filesystems */
#define EXT4_GOOD_OLD_FIRST_INO 11

/*
 * Maximal count of links to a file
 */
#define EXT4_LINK_MAX                65000

/*
 * Macro-instructions used to manage several block sizes
 */
/* Block size limits in bytes and as log2: 1024 == 1 << 10, 65536 == 1 << 16. */
#define EXT4_MIN_BLOCK_SIZE                1024
#define        EXT4_MAX_BLOCK_SIZE                65536
#define EXT4_MIN_BLOCK_LOG_SIZE                10
#define EXT4_MAX_BLOCK_LOG_SIZE                16
#define EXT4_MAX_CLUSTER_LOG_SIZE        30
#ifdef __KERNEL__
/* In-kernel: the block size is read directly from (s)->s_blocksize. */
# define EXT4_BLOCK_SIZE(s)                ((s)->s_blocksize)
#else
/* Otherwise: the block size is derived from (s)->s_log_block_size. */
# define EXT4_BLOCK_SIZE(s)                (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
#endif
/* Number of __u32-sized entries that fit in one block. */
#define        EXT4_ADDR_PER_BLOCK(s)                (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
/* Block size shifted left by the s_cluster_bits value of EXT4_SB(s). */
#define EXT4_CLUSTER_SIZE(s)                (EXT4_BLOCK_SIZE(s) << \
                                         EXT4_SB(s)->s_cluster_bits)
/* Note: EXT4_CLUSTER_BITS() is only defined in the __KERNEL__ branch. */
#ifdef __KERNEL__
# define EXT4_BLOCK_SIZE_BITS(s)        ((s)->s_blocksize_bits)
# define EXT4_CLUSTER_BITS(s)                (EXT4_SB(s)->s_cluster_bits)
#else
# define EXT4_BLOCK_SIZE_BITS(s)        ((s)->s_log_block_size + 10)
#endif
/* Note: EXT4_ADDR_PER_BLOCK_BITS() is only defined in the __KERNEL__ branch. */
#ifdef __KERNEL__
#define        EXT4_ADDR_PER_BLOCK_BITS(s)        (EXT4_SB(s)->s_addr_per_block_bits)
#define EXT4_INODE_SIZE(s)                (EXT4_SB(s)->s_inode_size)
#define EXT4_FIRST_INO(s)                (EXT4_SB(s)->s_first_ino)
#else
/*
 * Outside the kernel: when (s)->s_rev_level is EXT4_GOOD_OLD_REV the fixed
 * EXT4_GOOD_OLD_* values are used, otherwise the values stored in (s).
 */
#define EXT4_INODE_SIZE(s)        (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \
                                 EXT4_GOOD_OLD_INODE_SIZE : \
                                 (s)->s_inode_size)
#define EXT4_FIRST_INO(s)        (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \
                                 EXT4_GOOD_OLD_FIRST_INO : \
                                 (s)->s_first_ino)
#endif
/* Align @size to a multiple of the block size (1 << @blkbits) using ALIGN(). */
#define EXT4_BLOCK_ALIGN(size, blkbits)                ALIGN((size), (1 << (blkbits)))
/*
 * EXT4_MAX_BLOCKS() - block count for the byte range starting at @offset
 * and spanning @size bytes, with blocks of 1 << @blkbits bytes.
 *
 * Computed as the block-aligned value of (@size + @offset), expressed in
 * blocks, minus the index of the block containing @offset.
 *
 * Every argument is parenthesized so that expressions built from
 * lower-precedence operators (e.g. "a | b" or "c ? x : y") expand
 * correctly.  @offset and @blkbits are expanded more than once and must
 * therefore be free of side effects.
 */
#define EXT4_MAX_BLOCKS(size, offset, blkbits) \
        ((EXT4_BLOCK_ALIGN((size) + (offset), (blkbits)) >> (blkbits)) - \
         ((offset) >> (blkbits)))

/*
 * Block <-> cluster helpers.  Each macro reads s_cluster_bits or
 * s_cluster_ratio from its first argument.
 * NOTE(review): the mask/offset forms only make sense if s_cluster_ratio
 * is a power of two equal to 1 << s_cluster_bits -- confirm.
 */
/* Translate a block number to a cluster number */
#define EXT4_B2C(sbi, blk)        ((blk) >> (sbi)->s_cluster_bits)
/* Translate a cluster number to a block number */
#define EXT4_C2B(sbi, cluster)        ((cluster) << (sbi)->s_cluster_bits)
/*
 * Translate # of blks to # of clusters; s_cluster_ratio - 1 is added
 * before shifting so that a partial cluster counts as a whole one.
 */
#define EXT4_NUM_B2C(sbi, blks)        (((blks) + (sbi)->s_cluster_ratio - 1) >> \
                                 (sbi)->s_cluster_bits)
/* Mask out the low bits to get the starting block of the cluster */
#define EXT4_PBLK_CMASK(s, pblk) ((pblk) &                                \
                                  ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
#define EXT4_LBLK_CMASK(s, lblk) ((lblk) &                                \
                                  ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
/* Fill in the low bits to get the last block of the cluster */
#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) |                                \
                                    ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1))
/* Get the cluster offset, i.e. the low bits that the CMASK forms clear */
#define EXT4_PBLK_COFF(s, pblk) ((pblk) &                                \
                                 ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
#define EXT4_LBLK_COFF(s, lblk) ((lblk) &                                \
                                 ((ext4_lblk_t) (s)->s_cluster_ratio - 1))

/*
 * Structure of a blocks group descriptor
 */
struct ext4_group_desc
{
        /*
         * Layout note: the members up to and including bg_checksum add up
         * to 32 bytes.  The members after it add up to another 32 bytes
         * and mostly carry the most significant parts (*_hi) of the *_lo
         * values in the first half.
         */
        __le32        bg_block_bitmap_lo;        /* Blocks bitmap block */
        __le32        bg_inode_bitmap_lo;        /* Inodes bitmap block */
        __le32        bg_inode_table_lo;        /* Inodes table block */
        __le16        bg_free_blocks_count_lo;/* Free blocks count */
        __le16        bg_free_inodes_count_lo;/* Free inodes count */
        __le16        bg_used_dirs_count_lo;        /* Directories count */
        __le16        bg_flags;                /* EXT4_BG_flags (INODE_UNINIT, etc) */
        __le32  bg_exclude_bitmap_lo;   /* Exclude bitmap for snapshots */
        __le16  bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */
        __le16  bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */
        __le16  bg_itable_unused_lo;        /* Unused inodes count */
        __le16  bg_checksum;                /* crc16(sb_uuid+group+desc) */
        __le32        bg_block_bitmap_hi;        /* Blocks bitmap block MSB */
        __le32        bg_inode_bitmap_hi;        /* Inodes bitmap block MSB */
        __le32        bg_inode_table_hi;        /* Inodes table block MSB */
        __le16        bg_free_blocks_count_hi;/* Free blocks count MSB */
        __le16        bg_free_inodes_count_hi;/* Free inodes count MSB */
        __le16        bg_used_dirs_count_hi;        /* Directories count MSB */
        __le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
        __le32  bg_exclude_bitmap_hi;   /* Exclude bitmap block MSB */
        __le16  bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */
        __le16  bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */
        __u32   bg_reserved;                /* NOTE(review): unused, going by its name -- confirm */
};

/*
 * Byte offset, within struct ext4_group_desc, of the first byte after
 * bg_inode_bitmap_csum_hi (member offset plus its __le16 size).
 */
#define EXT4_BG_INODE_BITMAP_CSUM_HI_END        \
        (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \
         sizeof(__le16))
/*
 * Byte offset, within struct ext4_group_desc, of the first byte after
 * bg_block_bitmap_csum_hi (member offset plus its __le16 size).
 */
#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END        \
        (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \
         sizeof(__le16))

/*
 * Structure of a flex block group info
 */

/*
 * Per flex block group counters, all held in atomic types.
 * NOTE(review): the field meanings below are inferred from the names --
 * confirm against the code that updates them.
 */
struct flex_groups {
        atomic64_t        free_clusters;        /* presumably free clusters in the flex group */
        atomic_t        free_inodes;        /* presumably free inodes in the flex group */
        atomic_t        used_dirs;        /* presumably directories in the flex group */
};

#define EXT4_BG_INODE_UNINIT    0x0001  /* Inode table/bitmap not in use */
#define EXT4_BG_BLOCK_UNINIT    0x0002  /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED    0x0004  /* On-disk itable initialized to zero */

/*
 * Macro-instructions used to manage group descriptors
 */
/* Group descriptor size limits in bytes; the maximum equals EXT4_MIN_BLOCK_SIZE. */
#define EXT4_MIN_DESC_SIZE                32
#define EXT4_MIN_DESC_SIZE_64BIT        64
#define        EXT4_MAX_DESC_SIZE                EXT4_MIN_BLOCK_SIZE
#define EXT4_DESC_SIZE(s)                (EXT4_SB(s)->s_desc_size)
#ifdef __KERNEL__
/* In-kernel: every value is read from the structure returned by EXT4_SB(s). */
# define EXT4_BLOCKS_PER_GROUP(s)        (EXT4_SB(s)->s_blocks_per_group)
# define EXT4_CLUSTERS_PER_GROUP(s)        (EXT4_SB(s)->s_clusters_per_group)
# define EXT4_DESC_PER_BLOCK(s)                (EXT4_SB(s)->s_desc_per_block)
# define EXT4_INODES_PER_GROUP(s)        (EXT4_SB(s)->s_inodes_per_group)
# define EXT4_DESC_PER_BLOCK_BITS(s)        (EXT4_SB(s)->s_desc_per_block_bits)
#else
/*
 * Otherwise: values are read from (s) itself and descriptors-per-block is
 * computed.  EXT4_CLUSTERS_PER_GROUP() and EXT4_DESC_PER_BLOCK_BITS() are
 * not defined in this branch.
 */
# define EXT4_BLOCKS_PER_GROUP(s)        ((s)->s_blocks_per_group)
# define EXT4_DESC_PER_BLOCK(s)                (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s))
# define EXT4_INODES_PER_GROUP(s)        ((s)->s_inodes_per_group)
#endif

/*
 * Constants relative to the data blocks
 */
#define EXT4_NDIR_BLOCKS        12
#define EXT4_IND_BLOCK          EXT4_NDIR_BLOCKS        /* == 12 */
#define EXT4_DIND_BLOCK         (EXT4_IND_BLOCK + 1)    /* == 13 */
#define EXT4_TIND_BLOCK         (EXT4_DIND_BLOCK + 1)   /* == 14 */
#define EXT4_N_BLOCKS           (EXT4_TIND_BLOCK + 1)   /* == 15 */

/*
 * Inode flags
 */
#define EXT4_SECRM_FL           0x00000001 /* Secure deletion */
#define EXT4_UNRM_FL            0x00000002 /* Undelete */
#define EXT4_COMPR_FL           0x00000004 /* Compress file */
#define EXT4_SYNC_FL            0x00000008 /* Synchronous updates */
#define EXT4_IMMUTABLE_FL       0x00000010 /* Immutable file */
#define EXT4_APPEND_FL          0x00000020 /* Writes to file may only append */
#define EXT4_NODUMP_FL          0x00000040 /* Do not dump file */
#define EXT4_NOATIME_FL         0x00000080 /* Do not update atime */
/* Reserved for compression usage... */
#define EXT4_DIRTY_FL           0x00000100
#define EXT4_COMPRBLK_FL        0x00000200 /* One or more compressed clusters */
#define EXT4_NOCOMPR_FL         0x00000400 /* Don't compress */
/* nb: was previously EXT2_ECOMPR_FL */
#define EXT4_ENCRYPT_FL         0x00000800 /* Encrypted file */
/* End compression flags --- maybe not all used */
#define EXT4_INDEX_FL           0x00001000 /* Hash-indexed directory */
#define EXT4_IMAGIC_FL          0x00002000 /* AFS directory */
#define EXT4_JOURNAL_DATA_FL    0x00004000 /* File data should be journaled */
#define EXT4_NOTAIL_FL          0x00008000 /* File tail should not be merged */
#define EXT4_DIRSYNC_FL         0x00010000 /* Dirsync behaviour (directories only) */
#define EXT4_TOPDIR_FL          0x00020000 /* Top of directory hierarchies */
#define EXT4_HUGE_FILE_FL       0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL         0x00080000 /* Inode uses extents */
#define EXT4_VERITY_FL          0x00100000 /* Verity protected inode */
#define EXT4_EA_INODE_FL        0x00200000 /* Inode used for large EA */
/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */

#define EXT4_DAX_FL             0x02000000 /* Inode is DAX */

#define EXT4_INLINE_DATA_FL     0x10000000 /* Inode has inline data. */
#define EXT4_PROJINHERIT_FL     0x20000000 /* Create with parents projid */
#define EXT4_CASEFOLD_FL        0x40000000 /* Casefolded directory */
#define EXT4_RESERVED_FL        0x80000000 /* Reserved for ext4 lib */

/*
 * User modifiable flags.
 *
 * The literal 0x00400000 keeps the bit of the retired EXT4_EOFBLOCKS_FL in
 * the mask; the named constant itself is no longer defined.
 */
#define EXT4_FL_USER_MODIFIABLE                (EXT4_SECRM_FL | \
                                         EXT4_UNRM_FL | \
                                         EXT4_COMPR_FL | \
                                         EXT4_SYNC_FL | \
                                         EXT4_IMMUTABLE_FL | \
                                         EXT4_APPEND_FL | \
                                         EXT4_NODUMP_FL | \
                                         EXT4_NOATIME_FL | \
                                         EXT4_JOURNAL_DATA_FL | \
                                         EXT4_NOTAIL_FL | \
                                         EXT4_DIRSYNC_FL | \
                                         EXT4_TOPDIR_FL | \
                                         EXT4_EXTENTS_FL | \
                                         0x00400000 /* EXT4_EOFBLOCKS_FL */ | \
                                         EXT4_DAX_FL | \
                                         EXT4_PROJINHERIT_FL | \
                                         EXT4_CASEFOLD_FL)

/*
 * User visible flags: every user-modifiable flag plus the additional flags
 * listed here, which are in this mask but not in EXT4_FL_USER_MODIFIABLE.
 */
#define EXT4_FL_USER_VISIBLE                (EXT4_FL_USER_MODIFIABLE | \
                                         EXT4_DIRTY_FL | \
                                         EXT4_COMPRBLK_FL | \
                                         EXT4_NOCOMPR_FL | \
                                         EXT4_ENCRYPT_FL | \
                                         EXT4_INDEX_FL | \
                                         EXT4_VERITY_FL | \
                                         EXT4_INLINE_DATA_FL)

/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
                           EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
                           EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
                           EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
                           EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\
                           EXT4_DAX_FL)

/*
 * Flags that are appropriate for regular files (all but dir-specific ones).
 * This is a complement mask: AND-ing with it clears only the four flags
 * named below and keeps every other bit.
 */
#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\
                           EXT4_PROJINHERIT_FL))

/*
 * Flags that are appropriate for non-directories/regular files.
 * Unlike EXT4_REG_FLMASK this is an allow-list: only these two bits survive.
 */
#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)

/* The only flags that should be swapped */
#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)

/* Flags which are mutually exclusive to DAX */
#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\
                           EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL)

/*
 * Strip the flags that make no sense for an inode of type @mode.
 *
 * Directories keep every flag.  Regular files keep what EXT4_REG_FLMASK
 * allows, and every other inode type keeps only EXT4_OTHER_FLMASK.
 */
static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
{
        __u32 allowed;

        if (S_ISDIR(mode))
                return flags;

        allowed = S_ISREG(mode) ? EXT4_REG_FLMASK : EXT4_OTHER_FLMASK;
        return flags & allowed;
}

/*
 * Inode flags used for atomic set/get
 *
 * Each enumerator is a bit number, not a mask: the matching EXT4_xxx_FL
 * constant is expected to equal (1U << EXT4_INODE_xxx).  That relation is
 * verified at build time by ext4_check_flag_values() for the flags it lists.
 */
enum {
        EXT4_INODE_SECRM        = 0,        /* Secure deletion */
        EXT4_INODE_UNRM                = 1,        /* Undelete */
        EXT4_INODE_COMPR        = 2,        /* Compress file */
        EXT4_INODE_SYNC                = 3,        /* Synchronous updates */
        EXT4_INODE_IMMUTABLE        = 4,        /* Immutable file */
        EXT4_INODE_APPEND        = 5,        /* writes to file may only append */
        EXT4_INODE_NODUMP        = 6,        /* do not dump file */
        EXT4_INODE_NOATIME        = 7,        /* do not update atime */
/* Reserved for compression usage... */
        EXT4_INODE_DIRTY        = 8,
        EXT4_INODE_COMPRBLK        = 9,        /* One or more compressed clusters */
        EXT4_INODE_NOCOMPR        = 10,        /* Don't compress */
        EXT4_INODE_ENCRYPT        = 11,        /* Encrypted file */
/* End compression flags --- maybe not all used */
        EXT4_INODE_INDEX        = 12,        /* hash-indexed directory */
        EXT4_INODE_IMAGIC        = 13,        /* AFS directory */
        EXT4_INODE_JOURNAL_DATA        = 14,        /* file data should be journaled */
        EXT4_INODE_NOTAIL        = 15,        /* file tail should not be merged */
        EXT4_INODE_DIRSYNC        = 16,        /* dirsync behaviour (directories only) */
        EXT4_INODE_TOPDIR        = 17,        /* Top of directory hierarchies */
        EXT4_INODE_HUGE_FILE        = 18,        /* Set to each huge file */
        EXT4_INODE_EXTENTS        = 19,        /* Inode uses extents */
        EXT4_INODE_VERITY        = 20,        /* Verity protected inode */
        EXT4_INODE_EA_INODE        = 21,        /* Inode used for large EA */
/* 22 was formerly EXT4_INODE_EOFBLOCKS */
/* 23, 24, 26 and 27 have no enumerator here */
        EXT4_INODE_DAX                = 25,        /* Inode is DAX */
        EXT4_INODE_INLINE_DATA        = 28,        /* Data in inode. */
        EXT4_INODE_PROJINHERIT        = 29,        /* Create with parents projid */
        EXT4_INODE_CASEFOLD        = 30,        /* Casefolded directory */
        EXT4_INODE_RESERVED        = 31,        /* reserved for ext4 lib */
};

/*
 * Since it's pretty easy to mix up bit numbers and hex values, we use a
 * build-time check to make sure that EXT4_XXX_FL is consistent with respect to
 * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost
 * any extra space in the compiled kernel image, otherwise, the build will fail.
 * It's important that these values are the same, since we are using
 * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent
 * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk
 * values found in ext2, ext3 and ext4 filesystems, and of course the values
 * defined in e2fsprogs.
 *
 * It's not paranoia if the Murphy's Law really *is* out to get you.  :-)
 */
/* True when the EXT4_<FLAG>_FL mask equals bit number EXT4_INODE_<FLAG>. */
#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG))
/* Break the build when TEST_FLAG_VALUE(FLAG) does not hold. */
#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))

/*
 * Build-time cross-check of every EXT4_xxx_FL mask against its
 * EXT4_INODE_xxx bit number.  The body consists solely of BUILD_BUG_ON()
 * checks, one per flag, listed in bit order.
 *
 * DAX is included: both EXT4_DAX_FL and EXT4_INODE_DAX are defined, so the
 * pair must be kept in sync like every other flag.
 */
static inline void ext4_check_flag_values(void)
{
        CHECK_FLAG_VALUE(SECRM);
        CHECK_FLAG_VALUE(UNRM);
        CHECK_FLAG_VALUE(COMPR);
        CHECK_FLAG_VALUE(SYNC);
        CHECK_FLAG_VALUE(IMMUTABLE);
        CHECK_FLAG_VALUE(APPEND);
        CHECK_FLAG_VALUE(NODUMP);
        CHECK_FLAG_VALUE(NOATIME);
        CHECK_FLAG_VALUE(DIRTY);
        CHECK_FLAG_VALUE(COMPRBLK);
        CHECK_FLAG_VALUE(NOCOMPR);
        CHECK_FLAG_VALUE(ENCRYPT);
        CHECK_FLAG_VALUE(INDEX);
        CHECK_FLAG_VALUE(IMAGIC);
        CHECK_FLAG_VALUE(JOURNAL_DATA);
        CHECK_FLAG_VALUE(NOTAIL);
        CHECK_FLAG_VALUE(DIRSYNC);
        CHECK_FLAG_VALUE(TOPDIR);
        CHECK_FLAG_VALUE(HUGE_FILE);
        CHECK_FLAG_VALUE(EXTENTS);
        CHECK_FLAG_VALUE(VERITY);
        CHECK_FLAG_VALUE(EA_INODE);
        CHECK_FLAG_VALUE(DAX);
        CHECK_FLAG_VALUE(INLINE_DATA);
        CHECK_FLAG_VALUE(PROJINHERIT);
        CHECK_FLAG_VALUE(CASEFOLD);
        CHECK_FLAG_VALUE(RESERVED);
}

#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
 * Argument layout of the EXT4_IOC32_GROUP_ADD ioctl (see its definition
 * further down in this header).  Only built for kernels with CONFIG_COMPAT.
 * NOTE(review): the 64-bit members use compat_u64, presumably to match the
 * 32-bit userspace layout -- confirm.
 */
struct compat_ext4_new_group_input {
        u32 group;
        compat_u64 block_bitmap;
        compat_u64 inode_bitmap;
        compat_u64 inode_table;
        u32 blocks_count;
        u16 reserved_blocks;
        u16 unused;
};
#endif

/*
 * The struct ext4_new_group_input in kernel space, with free_clusters_count
 * appended as an extra trailing member.
 */
struct ext4_new_group_data {
        __u32 group;
        __u64 block_bitmap;
        __u64 inode_bitmap;
        __u64 inode_table;
        __u32 blocks_count;
        __u16 reserved_blocks;
        __u16 mdata_blocks;
        __u32 free_clusters_count;
};

/*
 * Indexes used to index group tables in ext4_new_group_data.
 * GROUP_TABLE_COUNT is the number of table kinds, not a table itself.
 */
enum {
        BLOCK_BITMAP      = 0,        /* block bitmap */
        INODE_BITMAP      = 1,        /* inode bitmap */
        INODE_TABLE       = 2,        /* inode tables */
        GROUP_TABLE_COUNT = 3,
};

/*
 * Flags used by ext4_map_blocks()
 *
 * Single-bit flags may be OR-ed together; the composite macros below are
 * such combinations.  0x0080 has no name in this list.
 */
        /* Allocate any needed blocks and/or convert an unwritten
           extent to be an initialized extent */
#define EXT4_GET_BLOCKS_CREATE                        0x0001
        /* Request the creation of an unwritten extent */
#define EXT4_GET_BLOCKS_UNWRIT_EXT                0x0002
#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT        (EXT4_GET_BLOCKS_UNWRIT_EXT|\
                                                 EXT4_GET_BLOCKS_CREATE)
        /* Caller is from the delayed allocation writeout path
         * finally doing the actual allocation of delayed blocks */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE        0x0004
        /* Caller is from the direct IO path: request creation of an
        unwritten extent if not allocated; split the unwritten
        extent if blocks have already been preallocated */
#define EXT4_GET_BLOCKS_PRE_IO                        0x0008
#define EXT4_GET_BLOCKS_CONVERT                        0x0010
#define EXT4_GET_BLOCKS_IO_CREATE_EXT                (EXT4_GET_BLOCKS_PRE_IO|\
                                         EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
        /* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT                (EXT4_GET_BLOCKS_CONVERT|\
                                         EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
        /* Eventual metadata allocation (due to growing extent tree)
         * should not fail, so try to use reserved blocks for that. */
#define EXT4_GET_BLOCKS_METADATA_NOFAIL                0x0020
        /* Don't normalize allocation size (used for fallocate) */
#define EXT4_GET_BLOCKS_NO_NORMALIZE                0x0040
        /* Convert written extents to unwritten */
#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN        0x0100
        /* Write zeros to newly created written extents */
#define EXT4_GET_BLOCKS_ZERO                        0x0200
#define EXT4_GET_BLOCKS_CREATE_ZERO                (EXT4_GET_BLOCKS_CREATE |\
                                        EXT4_GET_BLOCKS_ZERO)
        /* Caller will submit data before dropping transaction handle. This
         * allows jbd2 to avoid submitting data before commit. */
#define EXT4_GET_BLOCKS_IO_SUBMIT                0x0400
        /* Caller is in atomic context; find the extent only if it is cached */
#define EXT4_GET_BLOCKS_CACHED_NOWAIT                0x0800

/*
 * The bit position of these flags must not overlap with any of the
 * EXT4_GET_BLOCKS_*.  They are used by ext4_find_extent(),
 * read_extent_tree_block(), ext4_split_extent_at(),
 * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
 * EXT4_EX_NOCACHE is used to indicate that we shouldn't be
 * caching the extents when reading from the extent tree while a
 * truncate or punch hole operation is in progress.
 *
 * The values below sit in bits 28-30, well above the EXT4_GET_BLOCKS_*
 * values defined in this header (which stay at or below 0x0800).
 */
#define EXT4_EX_NOCACHE                                0x40000000
#define EXT4_EX_FORCE_CACHE                        0x20000000
#define EXT4_EX_NOFAIL                                0x10000000

/*
 * Flags used by ext4_free_blocks
 *
 * Each constant occupies its own bit (0x0001 .. 0x0040), so the values can
 * be OR-ed together.
 */
#define EXT4_FREE_BLOCKS_METADATA                0x0001
#define EXT4_FREE_BLOCKS_FORGET                        0x0002
#define EXT4_FREE_BLOCKS_VALIDATED                0x0004
#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE                0x0008
#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER        0x0010
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER        0x0020
#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER      0x0040

#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
 * ioctl commands in 32 bit emulation
 *
 * All commands use the 'f' ioctl type.  EXT4_IOC32_GROUP_ADD carries a
 * struct compat_ext4_new_group_input; the two *_OLD names are aliases for
 * the generic FS_IOC32_GETVERSION / FS_IOC32_SETVERSION commands.
 */
#define EXT4_IOC32_GETVERSION                _IOR('f', 3, int)
#define EXT4_IOC32_SETVERSION                _IOW('f', 4, int)
#define EXT4_IOC32_GETRSVSZ                _IOR('f', 5, int)
#define EXT4_IOC32_SETRSVSZ                _IOW('f', 6, int)
#define EXT4_IOC32_GROUP_EXTEND                _IOW('f', 7, unsigned int)
#define EXT4_IOC32_GROUP_ADD                _IOW('f', 8, struct compat_ext4_new_group_input)
#define EXT4_IOC32_GETVERSION_OLD        FS_IOC32_GETVERSION
#define EXT4_IOC32_SETVERSION_OLD        FS_IOC32_SETVERSION
#endif

/* Max physical block we can address w/o extents (largest 32-bit value) */
#define EXT4_MAX_BLOCK_FILE_PHYS        0xFFFFFFFF

/* Max logical block we can support (one below the largest 32-bit value) */
#define EXT4_MAX_LOGICAL_BLOCK                0xFFFFFFFE

/*
 * Structure of an inode on the disk
 *
 * Members declared with __le16/__le32 are little-endian on disk.  The
 * members from i_extra_isize onward are the "extra" fields; whether a given
 * one is present in a particular inode is tested with EXT4_FITS_IN_INODE().
 */
struct ext4_inode {
        __le16        i_mode;                /* File mode */
        __le16        i_uid;                /* Low 16 bits of Owner Uid */
        __le32        i_size_lo;        /* Size in bytes */
        __le32        i_atime;        /* Access time */
        __le32        i_ctime;        /* Inode Change time */
        __le32        i_mtime;        /* Modification time */
        __le32        i_dtime;        /* Deletion Time */
        __le16        i_gid;                /* Low 16 bits of Group Id */
        __le16        i_links_count;        /* Links count */
        __le32        i_blocks_lo;        /* Blocks count */
        __le32        i_flags;        /* File flags */
        union {
                struct {
                        __le32  l_i_version;
                } linux1;
                struct {
                        __u32  h_i_translator;
                } hurd1;
                struct {
                        __u32  m_i_reserved1;
                } masix1;
        } osd1;                                /* OS dependent 1 */
        __le32        i_block[EXT4_N_BLOCKS];/* Pointers to blocks */
        __le32        i_generation;        /* File version (for NFS) */
        __le32        i_file_acl_lo;        /* File ACL */
        __le32        i_size_high;        /* presumably the high half paired with i_size_lo -- confirm */
        __le32        i_obso_faddr;        /* Obsoleted fragment address */
        union {
                struct {
                        __le16        l_i_blocks_high; /* were l_i_reserved1 */
                        __le16        l_i_file_acl_high;
                        __le16        l_i_uid_high;        /* these 2 fields */
                        __le16        l_i_gid_high;        /* were reserved2[0] */
                        __le16        l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */
                        __le16        l_i_reserved;
                } linux2;
                struct {
                        __le16        h_i_reserved1;        /* Obsoleted fragment number/size which are removed in ext4 */
                        __u16        h_i_mode_high;
                        __u16        h_i_uid_high;
                        __u16        h_i_gid_high;
                        __u32        h_i_author;
                } hurd2;
                struct {
                        /* NOTE(review): carries an h_ prefix although it sits in masix2 */
                        __le16        h_i_reserved1;        /* Obsoleted fragment number/size which are removed in ext4 */
                        __le16        m_i_file_acl_high;
                        __u32        m_i_reserved2[2];
                } masix2;
        } osd2;                                /* OS dependent 2 */
        __le16        i_extra_isize;
        /* NOTE(review): declared __le16 while the comment says BE -- confirm */
        __le16        i_checksum_hi;        /* crc32c(uuid+inum+inode) BE */
        __le32  i_ctime_extra;  /* extra Change time      (nsec << 2 | epoch) */
        __le32  i_mtime_extra;  /* extra Modification time(nsec << 2 | epoch) */
        __le32  i_atime_extra;  /* extra Access time      (nsec << 2 | epoch) */
        __le32  i_crtime;       /* File Creation time */
        __le32  i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
        __le32  i_version_hi;        /* high 32 bits for 64-bit version */
        __le32        i_projid;        /* Project ID */
};

/*
 * Layout of the on-disk *_extra timestamp words (nsec << 2 | epoch):
 * the low EXT4_EPOCH_BITS bits hold the epoch, the remaining bits hold the
 * nanoseconds.
 */
#define EXT4_EPOCH_BITS 2
/* Mask selecting the epoch bits (the low two bits, i.e. 0x3). */
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
/* Mask selecting everything above the epoch bits; unsigned long wide. */
#define EXT4_NSEC_MASK  (~0UL << EXT4_EPOCH_BITS)

/*
 * Extended fields will fit into an inode if the filesystem was formatted
 * with large inodes (-I 256 or larger) and there are not currently any EAs
 * consuming all of the available space. For new inodes we always reserve
 * enough space for the kernel's known extended fields, but for inodes
 * created with an old kernel this might not have been the case. None of
 * the extended inode fields is critical for correct filesystem operation.
 * This macro checks if a certain field fits in the inode. Note that
 * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize
 *
 * @ext4_inode: pointer expression to the raw inode; only used for its type
 *              and for sizeof, so it is never evaluated at run time.
 * @einode:     pointer to an object providing i_extra_isize.
 * @field:      member name inside the raw inode.
 *
 * Evaluates to non-zero when the end offset of @field lies within
 * EXT4_GOOD_OLD_INODE_SIZE + (einode)->i_extra_isize.
 *
 * Every use of @ext4_inode is parenthesized so that arbitrary pointer
 * expressions (e.g. "p + 1") expand correctly.
 */
#define EXT4_FITS_IN_INODE(ext4_inode, einode, field)        \
        ((offsetof(typeof(*(ext4_inode)), field) +        \
          sizeof((ext4_inode)->field))                        \
        <= (EXT4_GOOD_OLD_INODE_SIZE +                        \
            (einode)->i_extra_isize))

/*
 * We use an encoding that preserves the times for extra epoch "00":
 *
 * extra  msb of                         adjust for signed
 * epoch  32-bit                         32-bit tv_sec to
 * bits   time    decoded 64-bit tv_sec  64-bit tv_sec      valid time range
 * 0 0    1    -0x80000000..-0x00000001  0x000000000 1901-12-13..1969-12-31
 * 0 0    0    0x000000000..0x07fffffff  0x000000000 1970-01-01..2038-01-19
 * 0 1    1    0x080000000..0x0ffffffff  0x100000000 2038-01-19..2106-02-07
 * 0 1    0    0x100000000..0x17fffffff  0x100000000 2106-02-07..2174-02-25
 * 1 0    1    0x180000000..0x1ffffffff  0x200000000 2174-02-25..2242-03-16
 * 1 0    0    0x200000000..0x27fffffff  0x200000000 2242-03-16..2310-04-04
 * 1 1    1    0x280000000..0x2ffffffff  0x300000000 2310-04-04..2378-04-22
 * 1 1    0    0x300000000..0x37fffffff  0x300000000 2378-04-22..2446-05-10
 *
 * Note that previous versions of the kernel on 64-bit systems would
 * incorrectly use extra epoch bits 1,1 for dates between 1901 and
 * 1970.  e2fsck will correct this, assuming that it is run on the
 * affected filesystem before 2242.
 */

/*
 * Build the on-disk "extra" timestamp word for @ts: the epoch in the low
 * EXT4_EPOCH_BITS bits and tv_nsec shifted above them, returned as a
 * little-endian 32-bit value.
 */
static inline __le32 ext4_encode_extra_time(struct timespec64 ts)
{
        u32 epoch_bits;
        u32 packed;

        /* what remains of tv_sec once its signed 32-bit part is removed */
        epoch_bits = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK;
        packed = epoch_bits | (ts.tv_nsec << EXT4_EPOCH_BITS);

        return cpu_to_le32(packed);
}

/*
 * Rebuild a timespec64 from the on-disk pair (@base, @extra).
 *
 * @base is taken as a signed 32-bit seconds value.  When the epoch bits of
 * @extra are non-zero they are added to tv_sec, shifted up by 32.  The bits
 * of @extra above the epoch become tv_nsec.
 */
static inline struct timespec64 ext4_decode_extra_time(__le32 base,
                                                       __le32 extra)
{
        const u32 extra_cpu = le32_to_cpu(extra);
        const u32 epoch = extra_cpu & EXT4_EPOCH_MASK;
        struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) };

        if (unlikely(epoch))
                ts.tv_sec += (u64)epoch << 32;
        ts.tv_nsec = (extra_cpu & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;

        return ts;
}

/*
 * Store timestamp @ts into the raw inode field @xtime.
 *
 * If the companion field <xtime>_extra fits in the inode, @xtime receives
 * cpu_to_le32(tv_sec) and <xtime>_extra receives ext4_encode_extra_time(ts).
 * Otherwise only @xtime is written, with tv_sec clamped to
 * [S32_MIN, S32_MAX].
 *
 * Note: @ts appears more than once in the expansion, so an argument with
 * side effects would be evaluated repeatedly.
 */
#define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts)                        \
do {                                                                                \
        if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {        \
                (raw_inode)->xtime = cpu_to_le32((ts).tv_sec);                        \
                (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts);        \
        } else                                                                        \
                (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX));        \
} while (0)

/*
 * Copy the VFS inode's atime / mtime / ctime (read through the
 * inode_get_*time() accessors) into the matching raw inode fields via
 * EXT4_INODE_SET_XTIME_VAL().
 */
#define EXT4_INODE_SET_ATIME(inode, raw_inode)                                                \
        EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode))

#define EXT4_INODE_SET_MTIME(inode, raw_inode)                                                \
        EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode))

#define EXT4_INODE_SET_CTIME(inode, raw_inode)                                                \
        EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode))

/*
 * Store the ext4-private timestamp (einode)->xtime into @raw_inode, but only
 * when the base field @xtime itself fits in the on-disk inode; otherwise do
 * nothing.
 *
 * The body is wrapped in do { } while (0) so that the expansion is exactly
 * one statement.  A bare "if" here would silently capture an "else" that
 * follows the macro at the call site.
 */
#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode)                                \
do {                                                                                \
        if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))                        \
                EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode),                \
                                         raw_inode, (einode)->xtime);                \
} while (0)

/*
 * Expression yielding a struct timespec64 read from raw inode field @xtime.
 *
 * If <xtime>_extra fits in the inode, both words are decoded with
 * ext4_decode_extra_time().  Otherwise tv_sec is the base field taken as a
 * signed 32-bit value and the remaining members of the compound literal are
 * left zero-initialized.
 */
#define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode)                        \
        (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ?        \
                ext4_decode_extra_time((raw_inode)->xtime,                                \
                                       (raw_inode)->xtime ## _extra) :                \
                (struct timespec64) {                                                \
                        .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime)        \
                })

/*
 * Load atime / mtime / ctime from the raw inode into the VFS inode.  Each
 * macro decodes the value with EXT4_INODE_GET_XTIME_VAL() and hands it to
 * the matching inode_set_*time_to_ts() setter.
 */
#define EXT4_INODE_GET_ATIME(inode, raw_inode)                                        \
do {                                                                                \
        inode_set_atime_to_ts(inode,                                                \
                EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode));                \
} while (0)

#define EXT4_INODE_GET_MTIME(inode, raw_inode)                                        \
do {                                                                                \
        inode_set_mtime_to_ts(inode,                                                \
                EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode));                \
} while (0)

#define EXT4_INODE_GET_CTIME(inode, raw_inode)                                        \
do {                                                                                \
        inode_set_ctime_to_ts(inode,                                                \
                EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode));                \
} while (0)

/*
 * Load the ext4-private timestamp (einode)->xtime from @raw_inode.
 *
 * When the base field @xtime fits in the on-disk inode the value is decoded
 * with EXT4_INODE_GET_XTIME_VAL(); otherwise (einode)->xtime is reset to
 * {0, 0}.
 *
 * @einode is parenthesized at every use, matching EXT4_EINODE_SET_XTIME(),
 * so pointer expressions such as "base + 1" expand correctly.
 */
#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)                                \
do {                                                                                \
        if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))                         \
                (einode)->xtime =                                                \
                        EXT4_INODE_GET_XTIME_VAL(xtime, &((einode)->vfs_inode),        \
                                                 raw_inode);                        \
        else                                                                        \
                (einode)->xtime = (struct timespec64){0, 0};                        \
} while (0)

/*
 * Short aliases for members nested inside the osd1/osd2 unions of
 * struct ext4_inode.  i_disk_version is defined unconditionally; the rest
 * depend on the target OS selected by the preprocessor.
 */
#define i_disk_version osd1.linux1.l_i_version

#if defined(__KERNEL__) || defined(__linux__)
/*
 * NOTE(review): the linux1 struct declared in struct ext4_inode only has
 * l_i_version, so this alias names a member that does not exist there --
 * confirm whether it is unused.
 */
#define i_reserved1        osd1.linux1.l_i_reserved1
#define i_file_acl_high        osd2.linux2.l_i_file_acl_high
#define i_blocks_high        osd2.linux2.l_i_blocks_high
#define i_uid_low        i_uid
#define i_gid_low        i_gid
#define i_uid_high        osd2.linux2.l_i_uid_high
#define i_gid_high        osd2.linux2.l_i_gid_high
#define i_checksum_lo        osd2.linux2.l_i_checksum_lo

#elif defined(__GNU__)

#define i_translator        osd1.hurd1.h_i_translator
#define i_uid_high        osd2.hurd2.h_i_uid_high
#define i_gid_high        osd2.hurd2.h_i_gid_high
#define i_author        osd2.hurd2.h_i_author

#elif defined(__masix__)

#define i_reserved1        osd1.masix1.m_i_reserved1
#define i_file_acl_high        osd2.masix2.m_i_file_acl_high
#define i_reserved2        osd2.masix2.m_i_reserved2

#endif /* defined(__KERNEL__) || defined(__linux__) */

#include "extents_status.h"
#include "fast_commit.h"

/*
 * Lock subclasses for i_data_sem in the ext4_inode_info structure.
 *
 * These are needed to avoid lockdep false positives when we need to
 * allocate blocks to the quota inode during ext4_map_blocks(), while
 * holding i_data_sem for a normal (non-quota) inode.  Since we don't
 * do quota tracking for the quota inode, this avoids deadlock (as
 * well as infinite recursion, since it isn't turtles all the way
 * down...)
 *
 *  I_DATA_SEM_NORMAL - Used for most inodes
 *  I_DATA_SEM_OTHER  - Used by move_inode.c for the second normal inode
 *                          where the second inode has larger inode number
 *                          than the first
 *  I_DATA_SEM_QUOTA  - Used for quota inodes only
 *  I_DATA_SEM_EA     - Used for ea_inodes only
 */
/* Lock subclass numbers for i_data_sem, numbered consecutively from zero. */
enum {
        I_DATA_SEM_NORMAL = 0,
        I_DATA_SEM_OTHER  = 1,
        I_DATA_SEM_QUOTA  = 2,
        I_DATA_SEM_EA     = 3
};


/*
 * fourth extended file system inode data in memory
 *
 * The generic VFS inode is embedded as the vfs_inode member, so this
 * structure and the VFS inode share one allocation.
 */
struct ext4_inode_info {
        __le32        i_data[15];        /* unconverted */
        __u32        i_dtime;
        ext4_fsblk_t        i_file_acl;

        /*
         * i_block_group is the number of the block group which contains
         * this file's inode.  Constant across the lifetime of the inode,
         * it is used for making block allocation decisions - we try to
         * place a file's data blocks near its inode block, and new inodes
         * near to their parent directory's inode.
         */
        ext4_group_t        i_block_group;
        ext4_lblk_t        i_dir_start_lookup;
        /* Separate state word exists only when unsigned long is narrower than 64 bits */
#if (BITS_PER_LONG < 64)
        unsigned long        i_state_flags;                /* Dynamic state flags */
#endif
        unsigned long        i_flags;

        /*
         * Extended attributes can be read independently of the main file
         * data. Taking i_rwsem even when reading would cause contention
         * between readers of EAs and writers of regular file data, so
         * instead we synchronize on xattr_sem when reading or changing
         * EAs.
         */
        struct rw_semaphore xattr_sem;

        /*
         * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise
         * i_orphan is used.
         */
        union {
                struct list_head i_orphan;        /* unlinked but open inodes */
                unsigned int i_orphan_idx;        /* Index in orphan file */
        };

        /* Fast commit related info */

        /* For tracking dentry create updates */
        struct list_head i_fc_dilist;
        struct list_head i_fc_list;        /*
                                         * inodes that need fast commit
                                         * protected by sbi->s_fc_lock.
                                         */

        /* Start of lblk range that needs to be committed in this fast commit */
        ext4_lblk_t i_fc_lblk_start;

        /*
         * Length of lblk range that needs to be committed in this fast commit
         * (NOTE(review): read as a length from the field name -- confirm)
         */
        ext4_lblk_t i_fc_lblk_len;

        /* Number of ongoing updates on this inode */
        atomic_t  i_fc_updates;

        /* Fast commit wait queue for this inode */
        wait_queue_head_t i_fc_wait;

        /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */
        struct mutex i_fc_lock;

        /*
         * i_disksize keeps track of what the inode size is ON DISK, not
         * in memory.  During truncate, i_size is set to the new size by
         * the VFS prior to calling ext4_truncate(), but the filesystem won't
         * set i_disksize to 0 until the truncate is actually under way.
         *
         * The intent is that i_disksize always represents the blocks which
         * are used by this file.  This allows recovery to restart truncate
         * on orphans if we crash during truncate.  We actually write i_disksize
         * into the on-disk inode when writing inodes out, instead of i_size.
         *
         * The only time when i_disksize and i_size may be different is when
         * a truncate is in progress.  The only things which change i_disksize
         * are ext4_get_block (growth) and ext4_truncate (shrinkth).
         */
        loff_t        i_disksize;

        /*
         * i_data_sem is for serialising ext4_truncate() against
         * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
         * data tree are chopped off during truncate. We can't do that in
         * ext4 because whenever we perform intermediate commits during
         * truncate, the inode and all the metadata blocks *must* be in a
         * consistent state which allows truncation of the orphans to restart
         * during recovery.  Hence we must fix the get_block-vs-truncate race
         * by other means, so we have i_data_sem.
         */
        struct rw_semaphore i_data_sem;
        struct inode vfs_inode;                /* embedded generic VFS inode */
        struct jbd2_inode *jinode;

        spinlock_t i_raw_lock;        /* protects updates to the raw inode */

        /*
         * File creation time. Its function is same as that of
         * struct timespec64 i_{a,c,m}time in the generic inode.
         */
        struct timespec64 i_crtime;

        /* mballoc */
        atomic_t i_prealloc_active;
        struct rb_root i_prealloc_node;
        rwlock_t i_prealloc_lock;

        /* extents status tree */
        struct ext4_es_tree i_es_tree;
        rwlock_t i_es_lock;
        struct list_head i_es_list;
        unsigned int i_es_all_nr;        /* protected by i_es_lock */
        unsigned int i_es_shk_nr;        /* protected by i_es_lock */
        ext4_lblk_t i_es_shrink_lblk;        /* Offset where we start searching for
                                           extents to shrink. Protected by
                                           i_es_lock  */

        /* ialloc */
        ext4_group_t        i_last_alloc_group;

        /* allocation reservation info for delalloc */
        /* In case of bigalloc, this refers to clusters rather than blocks */
        unsigned int i_reserved_data_blocks;

        /* pending cluster reservations for bigalloc file systems */
        struct ext4_pending_tree i_pending_tree;

        /* on-disk additional length */
        __u16 i_extra_isize;

        /* Indicate the inline data space. */
        u16 i_inline_off;
        u16 i_inline_size;

#ifdef CONFIG_QUOTA
        /* quota space reservation, managed internally by quota code */
        qsize_t i_reserved_quota;
#endif

        /* Lock protecting lists below */
        spinlock_t i_completed_io_lock;
        /*
         * Completed IOs that need unwritten extents handling and have
         * transaction reserved
         */
        struct list_head i_rsv_conversion_list;
        struct work_struct i_rsv_conversion_work;
        atomic_t i_unwritten; /* Nr. of inflight conversions pending */

        spinlock_t i_block_reservation_lock;

        /*
         * Transactions that contain inode's metadata needed to complete
         * fsync and fdatasync, respectively.
         */
        tid_t i_sync_tid;
        tid_t i_datasync_tid;

#ifdef CONFIG_QUOTA
        struct dquot __rcu *i_dquot[MAXQUOTAS];
#endif

        /* Precomputed uuid+inum+igen checksum for seeding inode checksums */
        __u32 i_csum_seed;

        kprojid_t i_projid;
};

/*
 * File system states
 *
 * Bit flags; 0x0008 and 0x0010 have no name in this list.
 */
#define        EXT4_VALID_FS                        0x0001        /* Unmounted cleanly */
#define        EXT4_ERROR_FS                        0x0002        /* Errors detected */
#define        EXT4_ORPHAN_FS                        0x0004        /* Orphans being recovered */
#define EXT4_FC_REPLAY                        0x0020        /* Fast commit replay ongoing */

/*
 * Misc. filesystem flags
 *
 * These constants keep the EXT2_ prefix even though they live in the ext4
 * header.
 */
#define EXT2_FLAGS_SIGNED_HASH                0x0001  /* Signed dirhash in use */
#define EXT2_FLAGS_UNSIGNED_HASH        0x0002  /* Unsigned dirhash in use */
#define EXT2_FLAGS_TEST_FILESYS                0x0004        /* to test development code */

/*
 * Mount flags set via mount options or defaults
 *
 * These are bits of sbi->s_mount_opt.  The set_opt(), clear_opt() and
 * test_opt() macros paste their "opt" argument after the EXT4_MOUNT_
 * prefix, so callers name a flag without that prefix.
 * Value 0x00002 is not assigned in this list.
 */
#define EXT4_MOUNT_NO_MBCACHE                0x00001 /* Do not use mbcache */
#define EXT4_MOUNT_GRPID                0x00004        /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG                0x00008        /* Some debugging messages */
#define EXT4_MOUNT_ERRORS_CONT                0x00010        /* Continue on errors */
#define EXT4_MOUNT_ERRORS_RO                0x00020        /* Remount fs ro on errors */
#define EXT4_MOUNT_ERRORS_PANIC                0x00040        /* Panic on errors */
#define EXT4_MOUNT_ERRORS_MASK                0x00070        /* All three ERRORS_* bits above */
#define EXT4_MOUNT_MINIX_DF                0x00080        /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD                0x00100        /* Don't use existing journal*/
#ifdef CONFIG_FS_DAX
#define EXT4_MOUNT_DAX_ALWAYS                0x00200        /* Direct Access */
#else
/* Without CONFIG_FS_DAX the flag is 0, so test_opt(sb, DAX_ALWAYS) yields 0. */
#define EXT4_MOUNT_DAX_ALWAYS                0
#endif
/*
 * Two-bit field selecting the data write mode; the three modes below are
 * values of this field, and WRITEBACK_DATA has both bits set.
 */
#define EXT4_MOUNT_DATA_FLAGS                0x00C00        /* Mode for data writes: */
#define EXT4_MOUNT_JOURNAL_DATA                0x00400        /* Write data to journal */
#define EXT4_MOUNT_ORDERED_DATA                0x00800        /* Flush data before commit */
#define EXT4_MOUNT_WRITEBACK_DATA        0x00C00        /* No data ordering */
#define EXT4_MOUNT_UPDATE_JOURNAL        0x01000        /* Update the journal format */
#define EXT4_MOUNT_NO_UID32                0x02000  /* Disable 32-bit UIDs */
#define EXT4_MOUNT_XATTR_USER                0x04000        /* Extended user attributes */
#define EXT4_MOUNT_POSIX_ACL                0x08000        /* POSIX Access Control Lists */
#define EXT4_MOUNT_NO_AUTO_DA_ALLOC        0x10000        /* No auto delalloc mapping */
#define EXT4_MOUNT_BARRIER                0x20000 /* Use block barriers */
#define EXT4_MOUNT_QUOTA                0x40000 /* Some quota option set */
#define EXT4_MOUNT_USRQUOTA                0x80000 /* "old" user quota,
                                                 * enable enforcement for hidden
                                                 * quota files */
#define EXT4_MOUNT_GRPQUOTA                0x100000 /* "old" group quota, enable
                                                  * enforcement for hidden quota
                                                  * files */
#define EXT4_MOUNT_PRJQUOTA                0x200000 /* Enable project quota
                                                  * enforcement */
#define EXT4_MOUNT_DIOREAD_NOLOCK        0x400000 /* Enable support for dio read nolocking */
#define EXT4_MOUNT_JOURNAL_CHECKSUM        0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT        0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_WARN_ON_ERROR        0x2000000 /* Trigger WARN_ON on error */
#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000
#define EXT4_MOUNT_DELALLOC                0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT        0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY        0x20000000 /* Block validity checking */
#define EXT4_MOUNT_DISCARD                0x40000000 /* Issue DISCARD requests */
/* Highest bit of a 32-bit word: this list uses bit 31, the last one. */
#define EXT4_MOUNT_INIT_INODE_TABLE        0x80000000 /* Initialize uninitialized itables */

/*
 * Mount flags set either automatically (could not be set by mount option)
 * based on per file system feature or property or in special cases such as
 * distinguishing between explicit mount option definition and default.
 *
 * These are bits of sbi->s_mount_opt2.  The set_opt2(), clear_opt2() and
 * test_opt2() macros paste their "opt" argument after the EXT4_MOUNT2_
 * prefix.
 */
#define EXT4_MOUNT2_EXPLICIT_DELALLOC        0x00000001 /* User explicitly
                                                      specified delalloc */
#define EXT4_MOUNT2_STD_GROUP_SIZE        0x00000002 /* We have standard group
                                                      size of blocksize * 8
                                                      blocks */
#define EXT4_MOUNT2_HURD_COMPAT                0x00000004 /* Support HURD-castrated
                                                      file systems */
#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM        0x00000008 /* User explicitly
                                                specified journal checksum */

#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT        0x00000010 /* Journal fast commit */
#define EXT4_MOUNT2_DAX_NEVER                0x00000020 /* Do not allow Direct Access */
#define EXT4_MOUNT2_DAX_INODE                0x00000040 /* For printing options only */
#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN        0x00000080 /* Optimize group
                                                    * scanning in mballoc
                                                    */
#define EXT4_MOUNT2_ABORT                0x00000100 /* Abort filesystem */

/*
 * Accessors for the two mount option words of a mounted filesystem.
 *
 * @sb:  the super_block; EXT4_SB(sb) supplies the ext4_sb_info.
 * @opt: flag name WITHOUT its prefix; it is token-pasted after
 *       EXT4_MOUNT_ (s_mount_opt) or EXT4_MOUNT2_ (s_mount_opt2).
 *
 * clear_opt*() and set_opt*() update the option word in place and evaluate
 * to its new value; test_opt*() evaluates to the masked word (non-zero when
 * the flag is set).
 *
 * Every expansion is wrapped in parentheses so that the macro is a single
 * primary expression: an operator next to the call site can never bind to
 * the flag constant instead of to the result of the assignment.
 */
#define clear_opt(sb, opt)                (EXT4_SB(sb)->s_mount_opt &= \
                                                ~EXT4_MOUNT_##opt)
#define set_opt(sb, opt)                (EXT4_SB(sb)->s_mount_opt |= \
                                                EXT4_MOUNT_##opt)
#define test_opt(sb, opt)                (EXT4_SB(sb)->s_mount_opt & \
                                         EXT4_MOUNT_##opt)

#define clear_opt2(sb, opt)                (EXT4_SB(sb)->s_mount_opt2 &= \
                                                ~EXT4_MOUNT2_##opt)
#define set_opt2(sb, opt)                (EXT4_SB(sb)->s_mount_opt2 |= \
                                                EXT4_MOUNT2_##opt)
#define test_opt2(sb, opt)                (EXT4_SB(sb)->s_mount_opt2 & \
                                         EXT4_MOUNT2_##opt)

/*
 * ext4 names for bitmap operations.  Each one is a plain alias for the
 * corresponding *_le bit operation; the set/clear/test-and-* variants map
 * to the double-underscore (__*_le) forms.
 */
#define ext4_test_and_set_bit                __test_and_set_bit_le
#define ext4_set_bit                        __set_bit_le
#define ext4_test_and_clear_bit                __test_and_clear_bit_le
#define ext4_clear_bit                        __clear_bit_le
#define ext4_test_bit                        test_bit_le
#define ext4_find_next_zero_bit                find_next_zero_bit_le
#define ext4_find_next_bit                find_next_bit_le

/*
 * Defined outside this header.
 * NOTE(review): presumably sets "len" bits starting at bit "cur" in the
 * bitmap "bm" -- confirm against the definition.
 */
extern void mb_set_bits(void *bm, int cur, int len);

/*
 * Maximal mount counts between two filesystem checks
 */
#define EXT4_DFL_MAX_MNT_COUNT                20        /* Allow 20 mounts */
#define EXT4_DFL_CHECKINTERVAL                0        /* Don't use interval check */

/*
 * Behaviour when detecting errors
 *
 * Unlike the EXT4_MOUNT_ERRORS_* bits these are small consecutive codes,
 * not bit flags.
 * NOTE(review): presumably the values of the superblock's s_errors field
 * ("Behaviour when detecting errors") -- confirm.
 */
#define EXT4_ERRORS_CONTINUE                1        /* Continue execution */
#define EXT4_ERRORS_RO                        2        /* Remount fs read-only */
#define EXT4_ERRORS_PANIC                3        /* Panic */
#define EXT4_ERRORS_DEFAULT                EXT4_ERRORS_CONTINUE

/*
 * Metadata checksum algorithm codes
 * NOTE(review): presumably the values of s_checksum_type -- confirm.
 */
#define EXT4_CRC32C_CHKSUM                1

/* Size in bytes of s_volume_name[] in struct ext4_super_block */
#define EXT4_LABEL_MAX                        16

/*
 * Structure of the super block
 *
 * Multi-byte integer fields are declared with the __le16/__le32/__le64
 * types.  The hexadecimal markers in the left margin are the byte offset of
 * the field that follows them.  Field order and sizes define the layout, so
 * new fields may only be carved out of s_reserved[].
 */
struct ext4_super_block {
/*00*/        __le32        s_inodes_count;                /* Inodes count */
        __le32        s_blocks_count_lo;        /* Blocks count */
        __le32        s_r_blocks_count_lo;        /* Reserved blocks count */
        __le32        s_free_blocks_count_lo;        /* Free blocks count */
/*10*/        __le32        s_free_inodes_count;        /* Free inodes count */
        __le32        s_first_data_block;        /* First Data Block */
        __le32        s_log_block_size;        /* Block size */
        __le32        s_log_cluster_size;        /* Allocation cluster size */
/*20*/        __le32        s_blocks_per_group;        /* # Blocks per group */
        __le32        s_clusters_per_group;        /* # Clusters per group */
        __le32        s_inodes_per_group;        /* # Inodes per group */
        __le32        s_mtime;                /* Mount time */
/*30*/        __le32        s_wtime;                /* Write time */
        __le16        s_mnt_count;                /* Mount count */
        __le16        s_max_mnt_count;        /* Maximal mount count */
        __le16        s_magic;                /* Magic signature */
        __le16        s_state;                /* File system state */
        __le16        s_errors;                /* Behaviour when detecting errors */
        __le16        s_minor_rev_level;        /* minor revision level */
/*40*/        __le32        s_lastcheck;                /* time of last check */
        __le32        s_checkinterval;        /* max. time between checks */
        __le32        s_creator_os;                /* OS */
        __le32        s_rev_level;                /* Revision level */
/*50*/        __le16        s_def_resuid;                /* Default uid for reserved blocks */
        __le16        s_def_resgid;                /* Default gid for reserved blocks */
        /*
         * These fields are for EXT4_DYNAMIC_REV superblocks only.
         *
         * Note: the difference between the compatible feature set and
         * the incompatible feature set is that if there is a bit set
         * in the incompatible feature set that the kernel doesn't
         * know about, it should refuse to mount the filesystem.
         *
         * e2fsck's requirements are more strict; if it doesn't know
         * about a feature in either the compatible or incompatible
         * feature set, it must abort and not try to meddle with
         * things it doesn't understand...
         */
        __le32        s_first_ino;                /* First non-reserved inode */
        __le16  s_inode_size;                /* size of inode structure */
        __le16        s_block_group_nr;        /* block group # of this superblock */
        __le32        s_feature_compat;        /* compatible feature set */
/*60*/        __le32        s_feature_incompat;        /* incompatible feature set */
        __le32        s_feature_ro_compat;        /* readonly-compatible feature set */
/*68*/        __u8        s_uuid[16];                /* 128-bit uuid for volume */
/*78*/        char        s_volume_name[EXT4_LABEL_MAX];        /* volume name */
/*88*/        char        s_last_mounted[64] __nonstring;        /* directory where last mounted */
/*C8*/        __le32        s_algorithm_usage_bitmap; /* For compression */
        /*
         * Performance hints.  Directory preallocation should only
         * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on.
         */
        __u8        s_prealloc_blocks;        /* Nr of blocks to try to preallocate*/
        __u8        s_prealloc_dir_blocks;        /* Nr to preallocate for dirs */
        __le16        s_reserved_gdt_blocks;        /* Per group desc for online growth */
        /*
         * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set.
         */
/*D0*/        __u8        s_journal_uuid[16];        /* uuid of journal superblock */
/*E0*/        __le32        s_journal_inum;                /* inode number of journal file */
        __le32        s_journal_dev;                /* device number of journal file */
        __le32        s_last_orphan;                /* start of list of inodes to delete */
        __le32        s_hash_seed[4];                /* HTREE hash seed */
        __u8        s_def_hash_version;        /* Default hash version to use */
        /* NOTE(review): presumably describes the s_jnl_blocks[] backup -- confirm */
        __u8        s_jnl_backup_type;
        __le16  s_desc_size;                /* size of group descriptor */
        /* NOTE(review): presumably default mount option bits -- confirm */
/*100*/        __le32        s_default_mount_opts;
        __le32        s_first_meta_bg;        /* First metablock block group */
        __le32        s_mkfs_time;                /* When the filesystem was created */
        __le32        s_jnl_blocks[17];        /* Backup of the journal inode */
        /* 64bit support valid if EXT4_FEATURE_INCOMPAT_64BIT */
/*150*/        __le32        s_blocks_count_hi;        /* Blocks count */
        __le32        s_r_blocks_count_hi;        /* Reserved blocks count */
        __le32        s_free_blocks_count_hi;        /* Free blocks count */
        __le16        s_min_extra_isize;        /* All inodes have at least # bytes */
        __le16        s_want_extra_isize;         /* New inodes should reserve # bytes */
        __le32        s_flags;                /* Miscellaneous flags */
        __le16  s_raid_stride;                /* RAID stride */
        __le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
        __le64  s_mmp_block;            /* Block for multi-mount protection */
        __le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
        __u8        s_log_groups_per_flex;  /* FLEX_BG group size */
        __u8        s_checksum_type;        /* metadata checksum algorithm used */
        __u8        s_encryption_level;        /* versioning level for encryption */
        __u8        s_reserved_pad;                /* Padding to next 32bits */
        __le64        s_kbytes_written;        /* nr of lifetime kilobytes written */
        __le32        s_snapshot_inum;        /* Inode number of active snapshot */
        __le32        s_snapshot_id;                /* sequential ID of active snapshot */
        __le64        s_snapshot_r_blocks_count; /* reserved blocks for active
                                              snapshot's future use */
        __le32        s_snapshot_list;        /* inode number of the head of the
                                           on-disk snapshot list */
        /*
         * EXT4_S_ERR_START and EXT4_S_ERR_END are the byte offsets of the
         * first error-information field and of the first field after them;
         * everything from s_error_count up to (not including) s_mount_opts
         * lies in that range.
         */
#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
        __le32        s_error_count;                /* number of fs errors */
        __le32        s_first_error_time;        /* first time an error happened */
        __le32        s_first_error_ino;        /* inode involved in first error */
        __le64        s_first_error_block;        /* block involved in first error */
        __u8        s_first_error_func[32] __nonstring;        /* function where the error happened */
        __le32        s_first_error_line;        /* line number where error happened */
        __le32        s_last_error_time;        /* most recent time of an error */
        __le32        s_last_error_ino;        /* inode involved in last error */
        __le32        s_last_error_line;        /* line number where error happened */
        __le64        s_last_error_block;        /* block involved in last error */
        __u8        s_last_error_func[32] __nonstring;        /* function where the error happened */
#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
        /* NOTE(review): presumably a mount option string -- confirm */
        __u8        s_mount_opts[64];
        __le32        s_usr_quota_inum;        /* inode for tracking user quota */
        __le32        s_grp_quota_inum;        /* inode for tracking group quota */
        __le32        s_overhead_clusters;        /* overhead blocks/clusters in fs */
        __le32        s_backup_bgs[2];        /* groups with sparse_super2 SBs */
        __u8        s_encrypt_algos[4];        /* Encryption algorithms in use  */
        __u8        s_encrypt_pw_salt[16];        /* Salt used for string2key algorithm */
        __le32        s_lpf_ino;                /* Location of the lost+found inode */
        __le32        s_prj_quota_inum;        /* inode for tracking project quota */
        __le32        s_checksum_seed;        /* crc32c(uuid) if csum_seed set */
        /*
         * NOTE(review): the *_hi bytes below presumably extend the 32-bit
         * timestamps of the same name declared earlier -- confirm.
         */
        __u8        s_wtime_hi;
        __u8        s_mtime_hi;
        __u8        s_mkfs_time_hi;
        __u8        s_lastcheck_hi;
        __u8        s_first_error_time_hi;
        __u8        s_last_error_time_hi;
        /* NOTE(review): presumably hold EXT4_ERR_* codes -- confirm */
        __u8        s_first_error_errcode;
        __u8    s_last_error_errcode;
        __le16  s_encoding;                /* Filename charset encoding */
        __le16  s_encoding_flags;        /* Filename charset encoding flags */
        __le32  s_orphan_file_inum;        /* Inode for tracking orphan inodes */
        __le32        s_reserved[94];                /* Padding to the end of the block */
        __le32        s_checksum;                /* crc32c(superblock) */
};

/*
 * Size in bytes of the superblock region running from s_error_count up to,
 * but not including, s_mount_opts.
 */
#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)

#ifdef __KERNEL__

/*
 * Number of quota types we support.
 * Sizes sbi->s_qf_names[] when CONFIG_QUOTA is enabled.
 */
#define EXT4_MAXQUOTAS 3

/* NOTE(review): presumably a value of the superblock's s_encoding -- confirm */
#define EXT4_ENC_UTF8_12_1        1

/*
 * Kinds of journal trigger used by ext4.
 *
 * The enumerators count up from zero and EXT4_JTR_NONE closes the list, so
 * its value equals the number of real trigger types.  Add new types in
 * front of EXT4_JTR_NONE.
 */
enum ext4_journal_trigger_type {
        EXT4_JTR_ORPHAN_FILE = 0,
        /* Sentinel - keep last so that indexing by trigger type works. */
        EXT4_JTR_NONE
};

/* Number of real trigger types (sizes sbi->s_journal_triggers[]). */
#define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE

/*
 * A jbd2 trigger descriptor bundled with a super_block pointer.
 * EXT4_TRIGGER() recovers this structure from a pointer to its
 * tr_triggers member.
 */
struct ext4_journal_trigger {
        struct jbd2_buffer_trigger_type tr_triggers;        /* embedded jbd2 descriptor */
        struct super_block *sb;        /* NOTE(review): presumably the owning fs -- confirm */
};

/*
 * Map a jbd2 trigger descriptor back to the ext4_journal_trigger that
 * embeds it as its tr_triggers member.
 */
static inline struct ext4_journal_trigger *EXT4_TRIGGER(
                                struct jbd2_buffer_trigger_type *trigger)
{
        struct ext4_journal_trigger *owner;

        owner = container_of(trigger, struct ext4_journal_trigger,
                             tr_triggers);
        return owner;
}

/* NOTE(review): presumably the expected value of ob_magic below -- confirm */
#define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04

/*
 * Structure at the tail of orphan block
 *
 * Its size is subtracted from the block size by
 * ext4_inodes_per_orphan_block() to find the room left for entries.
 */
struct ext4_orphan_block_tail {
        __le32 ob_magic;
        __le32 ob_checksum;
};

/*
 * Number of u32-sized entries that fit in one block of size
 * sb->s_blocksize once the ext4_orphan_block_tail has been set aside.
 */
static inline int ext4_inodes_per_orphan_block(struct super_block *sb)
{
        unsigned long payload;

        payload = sb->s_blocksize - sizeof(struct ext4_orphan_block_tail);

        return payload / sizeof(u32);
}

/*
 * In-memory bookkeeping for a single orphan block; struct ext4_orphan_info
 * points at an array of these through of_binfo.
 */
struct ext4_orphan_block {
        atomic_t ob_free_entries;        /* Number of free orphan entries in block */
        struct buffer_head *ob_bh;        /* Buffer for orphan block */
};

/*
 * Info about orphan file.
 *
 * Embedded in struct ext4_sb_info as s_orphan_info.
 */
struct ext4_orphan_info {
        int of_blocks;                        /* Number of orphan blocks in a file */
        __u32 of_csum_seed;                /* Checksum seed for orphan file */
        struct ext4_orphan_block *of_binfo;        /* Array with info about orphan
                                                 * file blocks */
};

/*
 * fourth extended-fs super-block data in memory
 *
 * Reached from a struct super_block through EXT4_SB(), which returns
 * sb->s_fs_info.
 */
struct ext4_sb_info {
        unsigned long s_desc_size;        /* Size of a group descriptor in bytes */
        unsigned long s_inodes_per_block;/* Number of inodes per block */
        unsigned long s_blocks_per_group;/* Number of blocks in a group */
        unsigned long s_clusters_per_group; /* Number of clusters in a group */
        unsigned long s_inodes_per_group;/* Number of inodes in a group */
        unsigned long s_itb_per_group;        /* Number of inode table blocks per group */
        unsigned long s_gdb_count;        /* Number of group descriptor blocks */
        unsigned long s_desc_per_block;        /* Number of group descriptors per block */
        ext4_group_t s_groups_count;        /* Number of groups in the fs */
        ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
        unsigned long s_overhead;  /* # of fs overhead clusters */
        unsigned int s_cluster_ratio;        /* Number of blocks per cluster */
        unsigned int s_cluster_bits;        /* log2 of s_cluster_ratio */
        loff_t s_bitmap_maxbytes;        /* max bytes for bitmap files */
        struct buffer_head * s_sbh;        /* Buffer containing the super block */
        struct ext4_super_block *s_es;        /* Pointer to the super block in the buffer */
        /* Array of bh's for the block group descriptors */
        struct buffer_head * __rcu *s_group_desc;
        unsigned int s_mount_opt;        /* EXT4_MOUNT_* bits, see set_opt()/test_opt() */
        unsigned int s_mount_opt2;        /* EXT4_MOUNT2_* bits, see set_opt2()/test_opt2() */
        /* Bit array driven by ext4_{set,clear,test}_mount_flag() */
        unsigned long s_mount_flags;
        unsigned int s_def_mount_opt;
        unsigned int s_def_mount_opt2;
        ext4_fsblk_t s_sb_block;
        atomic64_t s_resv_clusters;
        kuid_t s_resuid;
        kgid_t s_resgid;
        unsigned short s_mount_state;
        unsigned short s_pad;
        int s_addr_per_block_bits;
        int s_desc_per_block_bits;
        int s_inode_size;
        int s_first_ino;
        unsigned int s_inode_readahead_blks;
        unsigned int s_inode_goal;
        u32 s_hash_seed[4];
        int s_def_hash_version;
        int s_hash_unsigned;        /* 3 if hash should be unsigned, 0 if not */
        struct percpu_counter s_freeclusters_counter;
        struct percpu_counter s_freeinodes_counter;
        struct percpu_counter s_dirs_counter;
        struct percpu_counter s_dirtyclusters_counter;
        struct percpu_counter s_sra_exceeded_retry_limit;
        struct blockgroup_lock *s_blockgroup_lock;
        struct proc_dir_entry *s_proc;
        struct kobject s_kobj;
        struct completion s_kobj_unregister;
        struct super_block *s_sb;
        struct buffer_head *s_mmp_bh;

        /* Journaling */
        struct journal_s *s_journal;
        unsigned long s_ext4_flags;                /* Ext4 superblock flags */
        struct mutex s_orphan_lock;        /* Protects on disk list changes */
        struct list_head s_orphan;        /* List of orphaned inodes in on disk
                                           list */
        struct ext4_orphan_info s_orphan_info;
        unsigned long s_commit_interval;
        u32 s_max_batch_time;
        u32 s_min_batch_time;
        struct file *s_journal_bdev_file;
#ifdef CONFIG_QUOTA
        /* Names of quota files with journalled quota */
        char __rcu *s_qf_names[EXT4_MAXQUOTAS];
        int s_jquota_fmt;                        /* Format of quota to use */
#endif
        unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
        struct ext4_system_blocks __rcu *s_system_blks;

#ifdef EXTENTS_STATS
        /* ext4 extents stats */
        unsigned long s_ext_min;
        unsigned long s_ext_max;
        unsigned long s_depth_max;
        spinlock_t s_ext_stats_lock;
        unsigned long s_ext_blocks;
        unsigned long s_ext_extents;
#endif

        /* for buddy allocator */
        struct ext4_group_info ** __rcu *s_group_info;
        struct inode *s_buddy_cache;
        spinlock_t s_md_lock;
        unsigned short *s_mb_offsets;
        unsigned int *s_mb_maxs;
        unsigned int s_group_info_size;
        unsigned int s_mb_free_pending;
        struct list_head s_freed_data_list[2];        /* List of blocks to be freed
                                                   after commit completed */
        struct list_head s_discard_list;
        struct work_struct s_discard_work;
        atomic_t s_retry_alloc_pending;
        struct list_head *s_mb_avg_fragment_size;
        rwlock_t *s_mb_avg_fragment_size_locks;
        struct list_head *s_mb_largest_free_orders;
        rwlock_t *s_mb_largest_free_orders_locks;

        /* tunables */
        unsigned long s_stripe;
        unsigned int s_mb_max_linear_groups;
        unsigned int s_mb_stream_request;
        unsigned int s_mb_max_to_scan;
        unsigned int s_mb_min_to_scan;
        unsigned int s_mb_stats;
        unsigned int s_mb_order2_reqs;
        unsigned int s_mb_group_prealloc;
        unsigned int s_max_dir_size_kb;
        /* where last allocation was done - for stream allocation */
        unsigned long s_mb_last_group;
        unsigned long s_mb_last_start;
        unsigned int s_mb_prefetch;
        unsigned int s_mb_prefetch_limit;
        unsigned int s_mb_best_avail_max_trim_order;

        /* stats for buddy allocator */
        atomic_t s_bal_reqs;        /* number of reqs with len > 1 */
        atomic_t s_bal_success;        /* we found long enough chunks */
        atomic_t s_bal_allocated;        /* in blocks */
        atomic_t s_bal_ex_scanned;        /* total extents scanned */
        atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS];        /* total extents scanned */
        atomic_t s_bal_groups_scanned;        /* number of groups scanned */
        atomic_t s_bal_goals;        /* goal hits */
        atomic_t s_bal_len_goals;        /* len goal hits */
        atomic_t s_bal_breaks;        /* too long searches */
        atomic_t s_bal_2orders;        /* 2^order hits */
        atomic_t s_bal_p2_aligned_bad_suggestions;
        atomic_t s_bal_goal_fast_bad_suggestions;
        atomic_t s_bal_best_avail_bad_suggestions;
        atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS];
        atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS];
        atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS];                /* cX loop didn't find blocks */
        atomic_t s_mb_buddies_generated;        /* number of buddies generated */
        atomic64_t s_mb_generation_time;
        atomic_t s_mb_lost_chunks;
        atomic_t s_mb_preallocated;
        atomic_t s_mb_discarded;
        atomic_t s_lock_busy;

        /* locality groups */
        struct ext4_locality_group __percpu *s_locality_groups;

        /* for write statistics */
        unsigned long s_sectors_written_start;
        u64 s_kbytes_written;

        /* the size of zero-out chunk */
        unsigned int s_extent_max_zeroout_kb;

        unsigned int s_log_groups_per_flex;
        struct flex_groups * __rcu *s_flex_groups;
        ext4_group_t s_flex_groups_allocated;

        /* workqueue for reserved extent conversions (buffered io) */
        struct workqueue_struct *rsv_conversion_wq;

        /* timer for periodic error stats printing */
        struct timer_list s_err_report;

        /* Lazy inode table initialization info */
        struct ext4_li_request *s_li_request;
        /* Wait multiplier for lazy initialization thread */
        unsigned int s_li_wait_mult;

        /* Kernel thread for multiple mount protection */
        struct task_struct *s_mmp_tsk;

        /* record the last minlen when FITRIM is called. */
        unsigned long s_last_trim_minblks;

        /* Reference to checksum algorithm driver via cryptoapi */
        struct crypto_shash *s_chksum_driver;

        /* Precomputed FS UUID checksum for seeding other checksums */
        __u32 s_csum_seed;

        /* Reclaim extents from extent status tree */
        struct shrinker *s_es_shrinker;
        struct list_head s_es_list;        /* List of inodes with reclaimable extents */
        long s_es_nr_inode;
        struct ext4_es_stats s_es_stats;
        struct mb_cache *s_ea_block_cache;
        struct mb_cache *s_ea_inode_cache;
        spinlock_t s_es_lock ____cacheline_aligned_in_smp;

        /*
         * Journal triggers for checksum computation; one slot per
         * enum ext4_journal_trigger_type value below EXT4_JTR_NONE.
         */
        struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT];

        /* Ratelimit ext4 messages. */
        struct ratelimit_state s_err_ratelimit_state;
        struct ratelimit_state s_warning_ratelimit_state;
        struct ratelimit_state s_msg_ratelimit_state;
        atomic_t s_warning_count;
        atomic_t s_msg_count;

        /* Encryption policy for '-o test_dummy_encryption' */
        struct fscrypt_dummy_policy s_dummy_enc_policy;

        /*
         * Barrier between writepages ops and changing any inode's JOURNAL_DATA
         * or EXTENTS flag or between writepages ops and changing DELALLOC or
         * DIOREAD_NOLOCK mount options on remount.
         *
         * Taken and released through the ext4_writepages_{down,up}_{read,write}()
         * helpers.
         */
        struct percpu_rw_semaphore s_writepages_rwsem;
        struct dax_device *s_daxdev;
        u64 s_dax_part_off;
#ifdef CONFIG_EXT4_DEBUG
        /* EXT4_SIM_* code armed for ext4_simulate_fail(); 0 once consumed */
        unsigned long s_simulate_fail;
#endif
        /* Record the errseq of the backing block device */
        errseq_t s_bdev_wb_err;
        spinlock_t s_bdev_wb_lock;

        /* Information about errors that happened during this mount */
        spinlock_t s_error_lock;
        int s_add_error_count;
        int s_first_error_code;
        __u32 s_first_error_line;
        __u32 s_first_error_ino;
        __u64 s_first_error_block;
        const char *s_first_error_func;
        time64_t s_first_error_time;
        int s_last_error_code;
        __u32 s_last_error_line;
        __u32 s_last_error_ino;
        __u64 s_last_error_block;
        const char *s_last_error_func;
        time64_t s_last_error_time;
        /*
         * If we are in a context where we cannot update the on-disk
         * superblock, we queue the work here.  This is used to update
         * the error information in the superblock, and for periodic
         * updates of the superblock called from the commit callback
         * function.
         */
        struct work_struct s_sb_upd_work;

        /* Ext4 fast commit sub transaction ID */
        atomic_t s_fc_subtid;

        /*
         * After commit starts, the main queue gets locked, and the further
         * updates get added in the staging queue.
         *
         * FC_Q_MAIN and FC_Q_STAGING index the two-element queue arrays
         * declared right below.
         */
#define FC_Q_MAIN        0
#define FC_Q_STAGING        1
        struct list_head s_fc_q[2];        /* Inodes staged for fast commit
                                         * that have data changes in them.
                                         */
        struct list_head s_fc_dentry_q[2];        /* directory entry updates */
        unsigned int s_fc_bytes;
        /*
         * Main fast commit lock. This lock protects accesses to the
         * following fields:
         * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
         */
        spinlock_t s_fc_lock;
        struct buffer_head *s_fc_bh;
        struct ext4_fc_stats s_fc_stats;
        tid_t s_fc_ineligible_tid;
#ifdef CONFIG_EXT4_DEBUG
        int s_fc_debug_max_replay;
#endif
        struct ext4_fc_replay_state s_fc_replay_state;
};

/* Fetch the ext4 private data that hangs off sb->s_fs_info. */
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
{
        struct ext4_sb_info *sbi = sb->s_fs_info;

        return sbi;
}
/*
 * Convert a VFS inode into the ext4_inode_info that embeds it as its
 * vfs_inode member.
 */
static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
{
        struct ext4_inode_info *ei;

        ei = container_of(inode, struct ext4_inode_info, vfs_inode);
        return ei;
}

static inline int ext4_writepages_down_read(struct super_block *sb)
{
        percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem);
        return memalloc_nofs_save();
}

static inline void ext4_writepages_up_read(struct super_block *sb, int ctx)
{
        memalloc_nofs_restore(ctx);
        percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem);
}

static inline int ext4_writepages_down_write(struct super_block *sb)
{
        percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem);
        return memalloc_nofs_save();
}

static inline void ext4_writepages_up_write(struct super_block *sb, int ctx)
{
        memalloc_nofs_restore(ctx);
        percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem);
}

/*
 * Check whether @ino is an inode number this filesystem accepts.
 *
 * Returns 1 for EXT4_ROOT_INO and for any number in the inclusive range
 * EXT4_FIRST_INO(sb) .. s_inodes_count (read from the superblock),
 * 0 otherwise.
 */
static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
        if (ino == EXT4_ROOT_INO)
                return 1;

        if (ino < EXT4_FIRST_INO(sb))
                return 0;

        return ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
}

/*
 * Returns: sbi->field[index]
 * Used to access an array element from the following sbi fields which require
 * rcu protection to avoid dereferencing an invalid pointer due to reassignment
 * - s_group_desc
 * - s_group_info
 * - s_flex_groups
 *
 * The element is copied into a local while rcu_read_lock() is held and the
 * copy is what the statement expression evaluates to; the lock is dropped
 * before the value is handed back.
 */
#define sbi_array_rcu_deref(sbi, field, index)                                   \
({                                                                           \
        typeof(*((sbi)->field)) _v;                                           \
        rcu_read_lock();                                                   \
        _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index];           \
        rcu_read_unlock();                                                   \
        _v;                                                                   \
})

/*
 * Run-time mount flags.
 *
 * Consecutive values starting at zero, meant to be passed as the "bit"
 * argument of ext4_set_mount_flag(), ext4_clear_mount_flag() and
 * ext4_test_mount_flag().
 */
enum {
        EXT4_MF_MNTDIR_SAMPLED = 0,
        EXT4_MF_FC_INELIGIBLE  = 1        /* Fast commit ineligible */
};

static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
{
        set_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}

static inline void ext4_clear_mount_flag(struct super_block *sb, int bit)
{
        clear_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}

static inline int ext4_test_mount_flag(struct super_block *sb, int bit)
{
        return test_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}


/*
 * Simulate_fail codes
 *
 * Passed as the "code" argument of ext4_simulate_fail(), which compares it
 * with sbi->s_simulate_fail.  That function resets s_simulate_fail to 0
 * after a match, and 0 is not used as a code here.
 */
#define EXT4_SIM_BBITMAP_EIO        1
#define EXT4_SIM_BBITMAP_CRC        2
#define EXT4_SIM_IBITMAP_EIO        3
#define EXT4_SIM_IBITMAP_CRC        4
#define EXT4_SIM_INODE_EIO        5
#define EXT4_SIM_INODE_CRC        6
#define EXT4_SIM_DIRBLOCK_EIO        7
#define EXT4_SIM_DIRBLOCK_CRC        8

/*
 * Tell the caller whether failure @code is currently armed for @sb.
 *
 * With CONFIG_EXT4_DEBUG: returns true when sbi->s_simulate_fail equals
 * @code and resets s_simulate_fail to 0, so one arming produces one hit.
 * Without CONFIG_EXT4_DEBUG: always returns false.
 */
static inline bool ext4_simulate_fail(struct super_block *sb,
                                     unsigned long code)
{
#ifdef CONFIG_EXT4_DEBUG
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        if (likely(sbi->s_simulate_fail != code))
                return false;

        /* One-shot: disarm before reporting the hit. */
        sbi->s_simulate_fail = 0;
        return true;
#else
        return false;
#endif
}

/*
 * If @bh is not an error pointer and ext4_simulate_fail(@sb, @code) reports
 * a hit, clear the buffer's uptodate flag.  Error pointers are left alone
 * and do not consume the armed failure.
 */
static inline void ext4_simulate_fail_bh(struct super_block *sb,
                                         struct buffer_head *bh,
                                         unsigned long code)
{
        if (IS_ERR(bh))
                return;

        if (!ext4_simulate_fail(sb, code))
                return;

        clear_buffer_uptodate(bh);
}

/*
 * Error number codes for s_{first,last}_error_errcode
 *
 * Linux errno numbers are architecture specific, so we need to translate
 * them into something which is architecture independent.   We don't define
 * codes for all errno's; just the ones which are most likely to be the cause
 * of an ext4_error() call.
 *
 * Every code defined here fits in the one-byte (__u8) superblock fields
 * named above.
 */
#define EXT4_ERR_UNKNOWN         1
#define EXT4_ERR_EIO                 2
#define EXT4_ERR_ENOMEM                 3
#define EXT4_ERR_EFSBADCRC         4
#define EXT4_ERR_EFSCORRUPTED         5
#define EXT4_ERR_ENOSPC                 6
#define EXT4_ERR_ENOKEY                 7
#define EXT4_ERR_EROFS                 8
#define EXT4_ERR_EFBIG                 9
#define EXT4_ERR_EEXIST                10
#define EXT4_ERR_ERANGE                11
#define EXT4_ERR_EOVERFLOW        12
#define EXT4_ERR_EBUSY                13
#define EXT4_ERR_ENOTDIR        14
#define EXT4_ERR_ENOTEMPTY        15
#define EXT4_ERR_ESHUTDOWN        16
#define EXT4_ERR_EFAULT                17

/*
 * Inode dynamic state flags
 */
/*
 * Values are bit indices handed to ext4_{test,set,clear}_inode_state().
 * On 64-bit builds those helpers add 32 to the index and operate on i_flags;
 * on 32-bit builds they operate on i_state_flags directly.
 */
enum {
        EXT4_STATE_NEW                        = 0,        /* inode is newly created */
        EXT4_STATE_XATTR                = 1,        /* has in-inode xattrs */
        EXT4_STATE_NO_EXPAND                = 2,        /* No space for expansion */
        EXT4_STATE_DA_ALLOC_CLOSE        = 3,        /* Alloc DA blks on close */
        EXT4_STATE_EXT_MIGRATE                = 4,        /* Inode is migrating */
        EXT4_STATE_NEWENTRY                = 5,        /* File just added to dir */
        EXT4_STATE_MAY_INLINE_DATA        = 6,        /* may have in-inode data */
        EXT4_STATE_EXT_PRECACHED        = 7,        /* extents have been precached */
        EXT4_STATE_LUSTRE_EA_INODE        = 8,        /* Lustre-style ea_inode */
        EXT4_STATE_VERITY_IN_PROGRESS        = 9,        /* building fs-verity Merkle tree */
        EXT4_STATE_FC_COMMITTING        = 10,        /* Fast commit ongoing */
        EXT4_STATE_ORPHAN_FILE                = 11,        /* Inode orphaned in orphan file */
};

/*
 * EXT4_INODE_BIT_FNS(name, field, offset) - generate per-inode bit helpers.
 *
 * Expands to three static inline functions:
 *   ext4_test_inode_<name>(inode, bit)
 *   ext4_set_inode_<name>(inode, bit)
 *   ext4_clear_inode_<name>(inode, bit)
 * Each applies test_bit()/set_bit()/clear_bit() to bit number
 * (bit + offset) of EXT4_I(inode)->i_<field>.
 */
#define EXT4_INODE_BIT_FNS(name, field, offset)                                \
static inline int ext4_test_inode_##name(struct inode *inode, int bit)        \
{                                                                        \
        return test_bit(bit + (offset), &EXT4_I(inode)->i_##field);        \
}                                                                        \
static inline void ext4_set_inode_##name(struct inode *inode, int bit)        \
{                                                                        \
        set_bit(bit + (offset), &EXT4_I(inode)->i_##field);                \
}                                                                        \
static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
{                                                                        \
        clear_bit(bit + (offset), &EXT4_I(inode)->i_##field);                \
}

/* Add these declarations here only so that these functions can be
 * found by name.  Otherwise, they are very hard to locate. */
static inline int ext4_test_inode_flag(struct inode *inode, int bit);
static inline void ext4_set_inode_flag(struct inode *inode, int bit);
static inline void ext4_clear_inode_flag(struct inode *inode, int bit);
/* Defines the three helpers declared above: bit @bit of i_flags, offset 0. */
EXT4_INODE_BIT_FNS(flag, flags, 0)

/* Add these declarations here only so that these functions can be
 * found by name.  Otherwise, they are very hard to locate. */
static inline int ext4_test_inode_state(struct inode *inode, int bit);
static inline void ext4_set_inode_state(struct inode *inode, int bit);
static inline void ext4_clear_inode_state(struct inode *inode, int bit);
/*
 * Where the dynamic state bits live depends on the word size:
 *  - BITS_PER_LONG < 64: in their own field, i_state_flags, at offset 0;
 *  - otherwise: in i_flags, shifted up by 32 bits so they share one long
 *    with the inode flags handled by ext4_*_inode_flag().
 */
#if (BITS_PER_LONG < 64)
EXT4_INODE_BIT_FNS(state, state_flags, 0)

/* Reset all dynamic state bits by zeroing the dedicated field. */
static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
{
        (ei)->i_state_flags = 0;
}
#else
EXT4_INODE_BIT_FNS(state, flags, 32)

/* Intentionally empty on 64-bit: the state bits share i_flags (see below). */
static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
{
        /* We depend on the fact that callers will set i_flags */
}
#endif
#else
/* Assume that user mode programs are passing in an ext4fs superblock, not
 * a kernel struct super_block.  This will allow us to call the feature-test
 * macros from user land. */
#define EXT4_SB(sb)        (sb)
#endif

/*
 * True only when CONFIG_FS_VERITY is enabled and @inode has the
 * EXT4_STATE_VERITY_IN_PROGRESS state bit set.  The state bit is not
 * consulted at all when the config option is off.
 */
static inline bool ext4_verity_in_progress(struct inode *inode)
{
        if (!IS_ENABLED(CONFIG_FS_VERITY))
                return false;

        return ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
}

/*
 * Expands to the i_dtime member of @inode's ext4_inode_info (an lvalue).
 * NOTE(review): the name suggests i_dtime doubles as the "next" link of an
 * orphan list -- confirm against the orphan handling code.
 */
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime

/*
 * Codes for operating systems
 */
#define EXT4_OS_LINUX                0
#define EXT4_OS_HURD                1
#define EXT4_OS_MASIX                2
#define EXT4_OS_FREEBSD                3
#define EXT4_OS_LITES                4

/*
 * Revision levels
 */
#define EXT4_GOOD_OLD_REV        0        /* The good old (original) format */
#define EXT4_DYNAMIC_REV        1        /* V2 format w/ dynamic inode sizes */

/* Highest revision level this code supports. */
#define EXT4_MAX_SUPP_REV        EXT4_DYNAMIC_REV

#define EXT4_GOOD_OLD_INODE_SIZE 128

/*
 * Timestamp range limits.
 * EXT4_EXTRA_TIMESTAMP_MAX is (2^34 - 1) + S32_MIN, i.e. a 34-bit wide
 * window of values whose lowest value is S32_MIN.
 * NOTE(review): presumably the two extra bits come from the inode's extra
 * timestamp fields -- confirm against the timestamp encode/decode helpers.
 */
#define EXT4_EXTRA_TIMESTAMP_MAX        (((s64)1 << 34) - 1  + S32_MIN)
#define EXT4_NON_EXTRA_TIMESTAMP_MAX        S32_MAX
#define EXT4_TIMESTAMP_MIN                S32_MIN

/*
 * Feature set definitions
 */

/*
 * Bits of the superblock's s_feature_compat word (tested, set and cleared
 * by the helpers that EXT4_FEATURE_COMPAT_FUNCS generates).
 */
#define EXT4_FEATURE_COMPAT_DIR_PREALLOC        0x0001
#define EXT4_FEATURE_COMPAT_IMAGIC_INODES        0x0002
#define EXT4_FEATURE_COMPAT_HAS_JOURNAL                0x0004
#define EXT4_FEATURE_COMPAT_EXT_ATTR                0x0008
#define EXT4_FEATURE_COMPAT_RESIZE_INODE        0x0010
#define EXT4_FEATURE_COMPAT_DIR_INDEX                0x0020
#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2        0x0200
/*
 * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes
 * incompatible only if fast commit blocks are present in the FS. Since we
 * clear the journal (and thus the fast commit blocks), we don't mark FS as
 * incompatible. We also have a JBD2 incompat feature, which gets set when
 * there are fast commit blocks present in the journal.
 */
#define EXT4_FEATURE_COMPAT_FAST_COMMIT                0x0400
#define EXT4_FEATURE_COMPAT_STABLE_INODES        0x0800
#define EXT4_FEATURE_COMPAT_ORPHAN_FILE                0x1000        /* Orphan file exists */

/* Bits of the superblock's s_feature_ro_compat word. */
#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER        0x0001
#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE        0x0002
#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR        0x0004
#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE        0x0008
#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM                0x0010
#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK        0x0020
#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE        0x0040
#define EXT4_FEATURE_RO_COMPAT_QUOTA                0x0100
#define EXT4_FEATURE_RO_COMPAT_BIGALLOC                0x0200
/*
 * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM).  When
 * METADATA_CSUM is set, group descriptor checksums use the same algorithm as
 * all other data structures' checksums.  However, the METADATA_CSUM and
 * GDT_CSUM bits are mutually exclusive.
 */
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM        0x0400
#define EXT4_FEATURE_RO_COMPAT_READONLY                0x1000
#define EXT4_FEATURE_RO_COMPAT_PROJECT                0x2000
#define EXT4_FEATURE_RO_COMPAT_VERITY                0x8000
#define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT        0x10000 /* Orphan file may be
                                                           non-empty */

/* Bits of the superblock's s_feature_incompat word. */
#define EXT4_FEATURE_INCOMPAT_COMPRESSION        0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE                0x0002
#define EXT4_FEATURE_INCOMPAT_RECOVER                0x0004 /* Needs recovery */
#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV        0x0008 /* Journal device */
#define EXT4_FEATURE_INCOMPAT_META_BG                0x0010
#define EXT4_FEATURE_INCOMPAT_EXTENTS                0x0040 /* extents support */
#define EXT4_FEATURE_INCOMPAT_64BIT                0x0080
#define EXT4_FEATURE_INCOMPAT_MMP               0x0100
#define EXT4_FEATURE_INCOMPAT_FLEX_BG                0x0200
#define EXT4_FEATURE_INCOMPAT_EA_INODE                0x0400 /* EA in inode */
#define EXT4_FEATURE_INCOMPAT_DIRDATA                0x1000 /* data in dirent */
#define EXT4_FEATURE_INCOMPAT_CSUM_SEED                0x2000
#define EXT4_FEATURE_INCOMPAT_LARGEDIR                0x4000 /* >2GB or 3-lvl htree */
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA        0x8000 /* data in inode */
#define EXT4_FEATURE_INCOMPAT_ENCRYPT                0x10000
#define EXT4_FEATURE_INCOMPAT_CASEFOLD                0x20000

/* Called by every generated ext4_set_feature_*() before it sets its bit. */
extern void ext4_update_dynamic_rev(struct super_block *sb);

/*
 * EXT4_FEATURE_COMPAT_FUNCS(name, flagname) - generate helpers for one
 * compat feature bit.
 *
 * Expands to ext4_has_feature_<name>(), ext4_set_feature_<name>() and
 * ext4_clear_feature_<name>(), each operating on the little-endian
 * s_feature_compat word reached via EXT4_SB(sb)->s_es, using the mask
 * EXT4_FEATURE_COMPAT_<flagname>.  Only the "set" variant calls
 * ext4_update_dynamic_rev().
 */
#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_compat & \
                cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
        ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_compat |= \
                cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
} \
static inline void ext4_clear_feature_##name(struct super_block *sb) \
{ \
        EXT4_SB(sb)->s_es->s_feature_compat &= \
                ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
}

/*
 * EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) - generate helpers for one
 * ro_compat feature bit.
 *
 * Expands to ext4_has_feature_<name>(), ext4_set_feature_<name>() and
 * ext4_clear_feature_<name>(), each operating on the little-endian
 * s_feature_ro_compat word reached via EXT4_SB(sb)->s_es, using the mask
 * EXT4_FEATURE_RO_COMPAT_<flagname>.  Only the "set" variant calls
 * ext4_update_dynamic_rev().
 */
#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
                cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
        ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_ro_compat |= \
                cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
} \
static inline void ext4_clear_feature_##name(struct super_block *sb) \
{ \
        EXT4_SB(sb)->s_es->s_feature_ro_compat &= \
                ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
}

/*
 * EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) - generate helpers for one
 * incompat feature bit.
 *
 * Expands to ext4_has_feature_<name>(), ext4_set_feature_<name>() and
 * ext4_clear_feature_<name>(), each operating on the little-endian
 * s_feature_incompat word reached via EXT4_SB(sb)->s_es, using the mask
 * EXT4_FEATURE_INCOMPAT_<flagname>.  Only the "set" variant calls
 * ext4_update_dynamic_rev().
 */
#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
                cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
        ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_incompat |= \
                cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
} \
static inline void ext4_clear_feature_##name(struct super_block *sb) \
{ \
        EXT4_SB(sb)->s_es->s_feature_incompat &= \
                ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
}

/*
 * Instantiate the feature helpers.  Every line below defines
 * ext4_has_feature_<first arg>(), ext4_set_feature_<first arg>() and
 * ext4_clear_feature_<first arg>() for the flag named by the second
 * argument.  Note the helper name does not always match the flag name
 * (e.g. journal/HAS_JOURNAL, xattr/EXT_ATTR, journal_needs_recovery/RECOVER),
 * so grep for the first argument when looking for a definition.
 */
EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc,                DIR_PREALLOC)
EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes,        IMAGIC_INODES)
EXT4_FEATURE_COMPAT_FUNCS(journal,                HAS_JOURNAL)
EXT4_FEATURE_COMPAT_FUNCS(xattr,                EXT_ATTR)
EXT4_FEATURE_COMPAT_FUNCS(resize_inode,                RESIZE_INODE)
EXT4_FEATURE_COMPAT_FUNCS(dir_index,                DIR_INDEX)
EXT4_FEATURE_COMPAT_FUNCS(sparse_super2,        SPARSE_SUPER2)
EXT4_FEATURE_COMPAT_FUNCS(fast_commit,                FAST_COMMIT)
EXT4_FEATURE_COMPAT_FUNCS(stable_inodes,        STABLE_INODES)
EXT4_FEATURE_COMPAT_FUNCS(orphan_file,                ORPHAN_FILE)

EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super,        SPARSE_SUPER)
EXT4_FEATURE_RO_COMPAT_FUNCS(large_file,        LARGE_FILE)
EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir,                BTREE_DIR)
EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file,                HUGE_FILE)
EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum,                GDT_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink,                DIR_NLINK)
EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize,        EXTRA_ISIZE)
EXT4_FEATURE_RO_COMPAT_FUNCS(quota,                QUOTA)
EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc,                BIGALLOC)
EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum,        METADATA_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(readonly,                READONLY)
EXT4_FEATURE_RO_COMPAT_FUNCS(project,                PROJECT)
EXT4_FEATURE_RO_COMPAT_FUNCS(verity,                VERITY)
EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present,        ORPHAN_PRESENT)

EXT4_FEATURE_INCOMPAT_FUNCS(compression,        COMPRESSION)
EXT4_FEATURE_INCOMPAT_FUNCS(filetype,                FILETYPE)
EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery,        RECOVER)
EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev,        JOURNAL_DEV)
EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg,                META_BG)
EXT4_FEATURE_INCOMPAT_FUNCS(extents,                EXTENTS)
EXT4_FEATURE_INCOMPAT_FUNCS(64bit,                64BIT)
EXT4_FEATURE_INCOMPAT_FUNCS(mmp,                MMP)
EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg,                FLEX_BG)
EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode,                EA_INODE)
EXT4_FEATURE_INCOMPAT_FUNCS(dirdata,                DIRDATA)
EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed,                CSUM_SEED)
EXT4_FEATURE_INCOMPAT_FUNCS(largedir,                LARGEDIR)
EXT4_FEATURE_INCOMPAT_FUNCS(inline_data,        INLINE_DATA)
EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,                ENCRYPT)
EXT4_FEATURE_INCOMPAT_FUNCS(casefold,                CASEFOLD)

/*
 * Supported-feature masks, one triple (COMPAT / INCOMPAT / RO_COMPAT) per
 * filesystem flavour.  EXTN_FEATURE_FUNCS(ver) complements these masks to
 * detect feature bits that the given flavour does not know about.
 */

/* Features accepted when the filesystem is handled as ext2. */
#define EXT2_FEATURE_COMPAT_SUPP        EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP        (EXT4_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT4_FEATURE_INCOMPAT_META_BG)
#define EXT2_FEATURE_RO_COMPAT_SUPP        (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR)

/* Features accepted when handled as ext3: the ext2 set plus INCOMPAT_RECOVER. */
#define EXT3_FEATURE_COMPAT_SUPP        EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT3_FEATURE_INCOMPAT_SUPP        (EXT4_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT4_FEATURE_INCOMPAT_RECOVER| \
                                         EXT4_FEATURE_INCOMPAT_META_BG)
#define EXT3_FEATURE_RO_COMPAT_SUPP        (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR)

/* Features accepted when handled as ext4. */
#define EXT4_FEATURE_COMPAT_SUPP        (EXT4_FEATURE_COMPAT_EXT_ATTR| \
                                         EXT4_FEATURE_COMPAT_ORPHAN_FILE)
#define EXT4_FEATURE_INCOMPAT_SUPP        (EXT4_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT4_FEATURE_INCOMPAT_RECOVER| \
                                         EXT4_FEATURE_INCOMPAT_META_BG| \
                                         EXT4_FEATURE_INCOMPAT_EXTENTS| \
                                         EXT4_FEATURE_INCOMPAT_64BIT| \
                                         EXT4_FEATURE_INCOMPAT_FLEX_BG| \
                                         EXT4_FEATURE_INCOMPAT_EA_INODE| \
                                         EXT4_FEATURE_INCOMPAT_MMP | \
                                         EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
                                         EXT4_FEATURE_INCOMPAT_ENCRYPT | \
                                         EXT4_FEATURE_INCOMPAT_CASEFOLD | \
                                         EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
                                         EXT4_FEATURE_INCOMPAT_LARGEDIR)
#define EXT4_FEATURE_RO_COMPAT_SUPP        (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
                                         EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
                                         EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
                                         EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
                                         EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
                                         EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
                                         EXT4_FEATURE_RO_COMPAT_QUOTA |\
                                         EXT4_FEATURE_RO_COMPAT_PROJECT |\
                                         EXT4_FEATURE_RO_COMPAT_VERITY |\
                                         EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT)

/*
 * EXTN_FEATURE_FUNCS(ver) - generate "unknown feature" predicates for one
 * filesystem flavour (ver = 2, 3 or 4).
 *
 * Expands to:
 *   ext4_has_unknown_ext<ver>_compat_features(sb)
 *   ext4_has_unknown_ext<ver>_ro_compat_features(sb)
 *   ext4_has_unknown_ext<ver>_incompat_features(sb)
 * Each returns true when the corresponding superblock feature word has at
 * least one bit set that is outside EXT<ver>_FEATURE_*_SUPP.
 */
#define EXTN_FEATURE_FUNCS(ver) \
static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_compat & \
                cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \
} \
static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
                cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \
} \
static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
                cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \
}

EXTN_FEATURE_FUNCS(2)
EXTN_FEATURE_FUNCS(3)
EXTN_FEATURE_FUNCS(4)

/* True if any bit at all is set in the superblock's s_feature_compat word. */
static inline bool ext4_has_compat_features(struct super_block *sb)
{
        return EXT4_SB(sb)->s_es->s_feature_compat ? true : false;
}
/* True if any bit at all is set in the superblock's s_feature_ro_compat word. */
static inline bool ext4_has_ro_compat_features(struct super_block *sb)
{
        return EXT4_SB(sb)->s_es->s_feature_ro_compat ? true : false;
}
/* True if any bit at all is set in the superblock's s_feature_incompat word. */
static inline bool ext4_has_incompat_features(struct super_block *sb)
{
        return EXT4_SB(sb)->s_es->s_feature_incompat ? true : false;
}

extern int ext4_feature_set_ok(struct super_block *sb, int readonly);

/*
 * Superblock flags
 */
/*
 * These are bit numbers (not masks) within ext4_sb_info::s_ext4_flags;
 * see ext4_forced_shutdown(), which passes one to test_bit().
 */
#define EXT4_FLAGS_RESIZING        0
#define EXT4_FLAGS_SHUTDOWN        1
#define EXT4_FLAGS_BDEV_IS_DAX        2

static inline int ext4_forced_shutdown(struct super_block *sb)
{
        return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
}

/*
 * Default values for user and/or group using reserved blocks
 */
#define        EXT4_DEF_RESUID                0
#define        EXT4_DEF_RESGID                0

/*
 * Default project ID
 */
#define        EXT4_DEF_PROJID                0

#define EXT4_DEF_INODE_READAHEAD_BLKS        32

/*
 * Default mount options
 */
/*
 * Single-bit flags, except for the journalling mode: EXT4_DEFM_JMODE is a
 * two-bit mask (0x0060) and the three JMODE_* values below are the possible
 * settings of that field (note JMODE_WBACK equals the full mask).
 */
#define EXT4_DEFM_DEBUG                0x0001
#define EXT4_DEFM_BSDGROUPS        0x0002
#define EXT4_DEFM_XATTR_USER        0x0004
#define EXT4_DEFM_ACL                0x0008
#define EXT4_DEFM_UID16                0x0010
#define EXT4_DEFM_JMODE                0x0060
#define EXT4_DEFM_JMODE_DATA        0x0020
#define EXT4_DEFM_JMODE_ORDERED        0x0040
#define EXT4_DEFM_JMODE_WBACK        0x0060
#define EXT4_DEFM_NOBARRIER        0x0100
#define EXT4_DEFM_BLOCK_VALIDITY 0x0200
#define EXT4_DEFM_DISCARD        0x0400
#define EXT4_DEFM_NODELALLOC        0x0800

/*
 * Default journal batch times
 */
/* Units are microseconds: 15000 is annotated as 15ms below. */
#define EXT4_DEF_MIN_BATCH_TIME        0
#define EXT4_DEF_MAX_BATCH_TIME        15000 /* 15ms */

/*
 * Minimum number of groups in a flexgroup before we separate out
 * directories into the first block group of a flexgroup
 */
#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME        4

/*
 * Structure of a directory entry
 */
/* Capacity, in bytes, of the name[] array in the directory entry structs. */
#define EXT4_NAME_LEN 255
/*
 * Base length of the ext4 directory entry excluding the name length
 */
/*
 * Computed as sizeof(struct ext4_dir_entry_2) minus the name[] capacity, so
 * any trailing padding the compiler adds to the struct is included.
 */
#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN)

/*
 * Original directory entry layout, with a 16-bit name_len.  Compare with
 * struct ext4_dir_entry_2, which splits those 16 bits into an 8-bit
 * name_len and an 8-bit file_type.
 */
struct ext4_dir_entry {
        __le32        inode;                        /* Inode number */
        __le16        rec_len;                /* Directory entry length */
        __le16        name_len;                /* Name length */
        char        name[EXT4_NAME_LEN];        /* File name */
};


/*
 * Encrypted Casefolded entries require saving the hash on disk. This structure
 * follows ext4_dir_entry_2's name[name_len] at the next 4 byte aligned
 * boundary.
 */
/* Both fields are little-endian; located via EXT4_DIRENT_HASHES(). */
struct ext4_dir_entry_hash {
        __le32 hash;
        __le32 minor_hash;
};

/*
 * The new version of the directory entry.  Since EXT4 structures are
 * stored in intel byte order, and the name_len field could never be
 * bigger than 255 chars, it's safe to reclaim the extra byte for the
 * file_type field.
 */
/*
 * The fixed header before name[] is 8 bytes (4 + 2 + 1 + 1); that is the
 * literal 8 used by EXT4_DIRENT_HASHES() and ext4_dir_rec_len().
 */
struct ext4_dir_entry_2 {
        __le32        inode;                        /* Inode number */
        __le16        rec_len;                /* Directory entry length */
        __u8        name_len;                /* Name length */
        __u8        file_type;                /* See file type macros EXT4_FT_* below */
        char        name[EXT4_NAME_LEN];        /* File name */
};

/*
 * Access the hashes at the end of ext4_dir_entry_2
 *
 * EXT4_DIRENT_HASHES(entry) yields a struct ext4_dir_entry_hash pointer
 * located after the 8-byte entry header and the name, rounded up to the
 * next EXT4_DIR_PAD boundary.  @entry is evaluated more than once, so it
 * must not have side effects.
 *
 * EXT4_DIRENT_HASH(entry) and EXT4_DIRENT_MINOR_HASH(entry) return the two
 * stored values converted to CPU byte order.  They operate on their own
 * @entry argument and do not rely on any particular variable name being in
 * scope at the call site.
 */
#define EXT4_DIRENT_HASHES(entry) \
        ((struct ext4_dir_entry_hash *) \
                (((void *)(entry)) + \
                ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND)))
#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash)
#define EXT4_DIRENT_MINOR_HASH(entry) \
                le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash)

/*
 * True when @inode is both casefolded and encrypted.  IS_ENCRYPTED() is
 * only evaluated if IS_CASEFOLDED() holds.
 */
static inline bool ext4_hash_in_dirent(const struct inode *inode)
{
        if (!IS_CASEFOLDED(inode))
                return false;

        return IS_ENCRYPTED(inode);
}

/*
 * This is a bogus directory entry at the end of each leaf block that
 * records checksums.
 */
/*
 * Field sizes mirror the 8-byte header of struct ext4_dir_entry_2
 * (4 + 2 + 1 + 1) followed by a 4-byte checksum, 12 bytes in total --
 * matching the "12" noted for det_rec_len.
 */
struct ext4_dir_entry_tail {
        __le32        det_reserved_zero1;        /* Pretend to be unused */
        __le16        det_rec_len;                /* 12 */
        __u8        det_reserved_zero2;        /* Zero name length */
        __u8        det_reserved_ft;        /* 0xDE, fake file type */
        __le32        det_checksum;                /* crc32c(uuid+inum+dirblock) */
};

/*
 * Pointer to the tail entry occupying the last
 * sizeof(struct ext4_dir_entry_tail) bytes of a @blocksize-byte buffer
 * starting at @block.
 */
#define EXT4_DIRENT_TAIL(block, blocksize) \
        ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
                                        ((blocksize) - \
                                         sizeof(struct ext4_dir_entry_tail))))

/*
 * Ext4 directory file types.  Only the low 3 bits are used.  The
 * other bits are reserved for now.
 */
#define EXT4_FT_UNKNOWN                0
#define EXT4_FT_REG_FILE        1
#define EXT4_FT_DIR                2
#define EXT4_FT_CHRDEV                3
#define EXT4_FT_BLKDEV                4
#define EXT4_FT_FIFO                5
#define EXT4_FT_SOCK                6
#define EXT4_FT_SYMLINK                7

/* One past the highest real file type above (types are 0..7). */
#define EXT4_FT_MAX                8

/* Fake file type stored in ext4_dir_entry_tail::det_reserved_ft. */
#define EXT4_FT_DIR_CSUM        0xDE

/*
 * EXT4_DIR_PAD defines the directory entries boundaries
 *
 * NOTE: It must be a multiple of 4
 */
#define EXT4_DIR_PAD                        4
/* Round-up mask: (x + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND aligns x to EXT4_DIR_PAD. */
#define EXT4_DIR_ROUND                        (EXT4_DIR_PAD - 1)
/* Largest value a 16-bit rec_len field can hold (65535). */
#define EXT4_MAX_REC_LEN                ((1<<16)-1)

/*
 * The rec_len is dependent on the type of directory. Directories that are
 * casefolded and encrypted need to store the hash as well, so we add room for
 * struct ext4_dir_entry_hash. For all entries related to '.' or '..' you should
 * pass NULL for dir, as those entries do not use the extra fields.
 */
/*
 * Compute the on-disk record length needed for a name of @name_len bytes.
 *
 * The result is 8 header bytes plus the name, plus
 * sizeof(struct ext4_dir_entry_hash) when @dir is non-NULL and
 * ext4_hash_in_dirent(@dir) holds, rounded up to a multiple of EXT4_DIR_PAD.
 */
static inline unsigned int ext4_dir_rec_len(__u8 name_len,
                                                const struct inode *dir)
{
        int rec_len = 8 + name_len;

        if (dir && ext4_hash_in_dirent(dir))
                rec_len += sizeof(struct ext4_dir_entry_hash);

        /* Round up to the directory entry alignment. */
        rec_len += EXT4_DIR_ROUND;
        return rec_len & ~EXT4_DIR_ROUND;
}

/*
 * If we ever get support for fs block sizes > page_size, we'll need
 * to remove the #if statements in the next two functions...
 */
/*
 * Decode an on-disk rec_len field into a byte count.
 *
 * With PAGE_SIZE < 65536 the stored value is returned as is (@blocksize is
 * unused).  With larger pages, 0 and EXT4_MAX_REC_LEN both mean "the whole
 * block", and otherwise the two low bits of the stored value supply bits
 * 16-17 of the length.
 */
static inline unsigned int
ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
{
        unsigned len = le16_to_cpu(dlen);

#if (PAGE_SIZE >= 65536)
        if (len == 0 || len == EXT4_MAX_REC_LEN)
                return blocksize;
        /* Low two bits are really bits 16-17 of the length. */
        len = ((len & 3) << 16) | (len & 65532);
#endif
        return len;
}

/*
 * Encode a record length of @len bytes for the 16-bit on-disk rec_len field.
 *
 * BUGs if @len exceeds @blocksize, if @blocksize exceeds 1 << 18, or if
 * @len is not a multiple of 4.  With PAGE_SIZE < 65536 the length is stored
 * directly.  With larger pages, lengths of 65536 and above are encoded:
 * a record covering the whole block is stored as EXT4_MAX_REC_LEN (64KiB
 * blocks) or 0 (larger blocks); anything else has bits 16-17 folded into
 * the two low bits.
 */
static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
{
        BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3));
#if (PAGE_SIZE >= 65536)
        if (len >= 65536) {
                if (len != blocksize)
                        return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
                if (blocksize == 65536)
                        return cpu_to_le16(EXT4_MAX_REC_LEN);
                return cpu_to_le16(0);
        }
#endif
        return cpu_to_le16(len);
}

/*
 * Hash Tree Directory indexing
 * (c) Daniel Phillips, 2001
 */

/*
 * is_dx(dir): the filesystem has the dir_index feature and @dir has the
 * EXT4_INODE_INDEX inode flag.  @dir is evaluated twice.
 */
#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \
                    ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
/*
 * EXT4_DIR_LINK_MAX(dir): @dir has reached EXT4_LINK_MAX links and is not
 * exempt; a directory is exempt when the dir_nlink feature is on and it is
 * an indexed (is_dx) directory.  Marked unlikely().
 */
#define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \
                    !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir)))
/* EXT4_DIR_LINK_EMPTY(dir): link count is exactly 2 or exactly 1. */
#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)

/* Legal values for the dx_root hash_version field: */

/* Values 3-5 are the *_UNSIGNED counterparts of values 0-2 (base + 3). */
#define DX_HASH_LEGACY                        0
#define DX_HASH_HALF_MD4                1
#define DX_HASH_TEA                        2
#define DX_HASH_LEGACY_UNSIGNED                3
#define DX_HASH_HALF_MD4_UNSIGNED        4
#define DX_HASH_TEA_UNSIGNED                5
#define DX_HASH_SIPHASH                        6

/*
 * Feed @length bytes at @address into the checksum driver of @sbi, starting
 * from the running value @crc, and return the updated value.
 *
 * The descriptor is built on the stack with a 4-byte context that holds the
 * running checksum: @crc is written into it before the update and the
 * result is read back from it afterwards.  BUGs if the driver's descriptor
 * size is not exactly those 4 bytes, or if the update call returns non-zero.
 */
static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
                              const void *address, unsigned int length)
{
        struct {
                struct shash_desc shash;
                char ctx[4];
        } desc;
        int err;

        BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx));

        /* Seed the context with the caller's running checksum. */
        *(u32 *)desc.ctx = crc;
        desc.shash.tfm = sbi->s_chksum_driver;

        err = crypto_shash_update(&desc.shash, address, length);
        BUG_ON(err);

        return *(u32 *)desc.ctx;
}

#ifdef __KERNEL__

/* hash info structure used by the directory hash */
struct dx_hash_info
{
        u32                hash;
        u32                minor_hash;
        /* NOTE(review): presumably one of the DX_HASH_* values -- confirm. */
        int                hash_version;
        u32                *seed;
};


/* 32 and 64 bit signed EOF for dx directories */
/* 0x7fffffff and 0x7fffffffffffffff: the largest positive signed values. */
#define EXT4_HTREE_EOF_32BIT   ((1UL  << (32 - 1)) - 1)
#define EXT4_HTREE_EOF_64BIT   ((1ULL << (64 - 1)) - 1)


/*
 * Control parameters used by ext4_htree_next_block
 */
#define HASH_NB_ALWAYS                1

/*
 * A file name together with its directory hash state.  Use the fname_*()
 * accessor macros below to reach the name bytes and length.
 */
struct ext4_filename {
        const struct qstr *usr_fname;        /* read via fname_usr_name() */
        struct fscrypt_str disk_name;        /* read via fname_name()/fname_len() */
        struct dx_hash_info hinfo;
#ifdef CONFIG_FS_ENCRYPTION
        /* Only present when filesystem encryption support is built in. */
        struct fscrypt_str crypto_buf;
#endif
#if IS_ENABLED(CONFIG_UNICODE)
        /* Only present when CONFIG_UNICODE is enabled. */
        struct fscrypt_str cf_name;
#endif
};

/* Accessors for struct ext4_filename; @p is a pointer to one. */
#define fname_name(p) ((p)->disk_name.name)
#define fname_usr_name(p) ((p)->usr_fname->name)
#define fname_len(p)  ((p)->disk_name.len)

/*
 * Describe an inode's exact location on disk and in memory
 */
struct ext4_iloc
{
        struct buffer_head *bh;                /* buffer holding the inode */
        unsigned long offset;                /* byte offset of the inode within bh->b_data */
        ext4_group_t block_group;
};

static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
{
        return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
}

/*
 * True when @inode is marked IS_NOQUOTA() and does not carry the
 * EXT4_EA_INODE_FL flag in its ext4 i_flags.
 */
static inline bool ext4_is_quota_file(struct inode *inode)
{
        if (!IS_NOQUOTA(inode))
                return false;

        return !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
}

/*
 * This structure is stuffed into the struct file's private_data field
 * for directories.  It is where we put information so that we can do
 * readdir operations in hash tree order.
 */
/* NOTE(review): field roles below are inferred from names -- confirm in dir.c. */
struct dir_private_info {
        struct rb_root        root;                /* red-black tree root */
        struct rb_node        *curr_node;
        struct fname        *extra_fname;
        loff_t                last_pos;
        __u32                curr_hash;
        __u32                curr_minor_hash;
        __u32                next_hash;
};

/* calculate the first block number of the group */
/*
 * Return the number of the first block of group @group_no:
 * group_no * blocks-per-group + the superblock's s_first_data_block.
 * The multiplication is carried out in ext4_fsblk_t width.
 */
static inline ext4_fsblk_t
ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
{
        ext4_fsblk_t blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb);
        ext4_fsblk_t group_start = group_no * blocks_per_group;

        return group_start +
                le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
}

/*
 * Special error return code only used by dx_probe() and its callers.
 */
/* A negative value whose magnitude is one below MAX_ERRNO. */
#define ERR_BAD_DX_DIR        (-(MAX_ERRNO - 1))

/* htree levels for ext4 */
/* Selected by ext4_dir_htree_level(): 3 with the largedir feature, else 2. */
#define        EXT4_HTREE_LEVEL_COMPAT        2
#define        EXT4_HTREE_LEVEL        3

/*
 * Number of htree levels for @sb: EXT4_HTREE_LEVEL when the largedir
 * feature is set, EXT4_HTREE_LEVEL_COMPAT otherwise.
 */
static inline int ext4_dir_htree_level(struct super_block *sb)
{
        if (ext4_has_feature_largedir(sb))
                return EXT4_HTREE_LEVEL;

        return EXT4_HTREE_LEVEL_COMPAT;
}

/*
 * Timeout and state flag for lazy initialization inode thread.
 */
#define EXT4_DEF_LI_WAIT_MULT                        10
#define EXT4_DEF_LI_MAX_START_DELAY                5
/*
 * Bit masks.
 * NOTE(review): presumably stored in ext4_lazy_init::li_state -- confirm.
 */
#define EXT4_LAZYINIT_QUIT                        0x0001
#define EXT4_LAZYINIT_RUNNING                        0x0002

/*
 * Lazy inode table initialization info
 */
struct ext4_lazy_init {
        unsigned long                li_state;
        /* NOTE(review): presumably links ext4_li_request::lr_request -- confirm. */
        struct list_head        li_request_list;
        /* NOTE(review): by its name, guards li_request_list -- confirm. */
        struct mutex                li_list_mtx;
};

/* Mode of a lazy-init request; stored in ext4_li_request::lr_mode. */
enum ext4_li_mode {
        EXT4_LI_MODE_PREFETCH_BBITMAP        = 0,
        EXT4_LI_MODE_ITABLE                = 1,
};

/* NOTE(review): field roles are inferred from names -- confirm in super.c. */
struct ext4_li_request {
        struct super_block        *lr_super;        /* filesystem the request is for */
        enum ext4_li_mode        lr_mode;
        ext4_group_t                lr_first_not_zeroed;
        ext4_group_t                lr_next_group;
        struct list_head        lr_request;        /* list linkage */
        unsigned long                lr_next_sched;
        unsigned long                lr_timeout;
};

/* A kobject paired with a completion named for its unregistration. */
struct ext4_features {
        struct kobject f_kobj;
        struct completion f_kobj_unregister;
};

/*
 * This structure will be used for multiple mount protection. It will be
 * written into the block number saved in the s_mmp_block field in the
 * superblock. Programs that check MMP should assume that if
 * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
 * to use the filesystem, regardless of how old the timestamp is.
 */
/*
 * The low three bytes of every value are 0x4D 0x4D 0x50 ("MMP"), except
 * SEQ_MAX, which is SEQ_FSCK minus one.
 */
#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */

/*
 * The field sizes add up to 1024 bytes:
 * 4 + 4 + 8 + 64 + 32 + 2 + 2 + 226 * 4 + 4.
 */
struct mmp_struct {
        __le32        mmp_magic;                /* Magic number for MMP */
        __le32        mmp_seq;                /* Sequence no. updated periodically */

        /*
         * mmp_time, mmp_nodename & mmp_bdevname are only used for information
         * purposes and do not affect the correctness of the algorithm
         */
        __le64        mmp_time;                /* Time last updated */
        char        mmp_nodename[64];        /* Node which last updated MMP block */
        char        mmp_bdevname[32];        /* Bdev which last updated MMP block */

        /*
         * mmp_check_interval is used to verify if the MMP block has been
         * updated on the block device. The value is updated based on the
         * maximum time to write the MMP block during an update cycle.
         */
        __le16        mmp_check_interval;

        __le16        mmp_pad1;                /* padding */
        __le32        mmp_pad2[226];                /* padding up to the checksum */
        __le32        mmp_checksum;                /* crc32c(uuid+mmp_block) */
};

/* arguments passed to the mmp thread */
/* Bundles the two pointers handed to the MMP thread. */
struct mmpd_data {
        struct buffer_head *bh; /* bh from initial read_mmp_block() */
        struct super_block *sb;  /* super block of the fs */
};

/*
 * Check interval multiplier
 * The MMP block is written every update interval and initially checked every
 * update interval x the multiplier (the value is then adapted based on the
 * write latency). The reason is that writes can be delayed under load and we
 * don't want readers to incorrectly assume that the filesystem is no longer
 * in use.
 */
#define EXT4_MMP_CHECK_MULT                2UL

/*
 * Minimum interval for MMP checking in seconds.
 */
#define EXT4_MMP_MIN_CHECK_INTERVAL        5UL

/*
 * Maximum interval for MMP checking in seconds.
 */
#define EXT4_MMP_MAX_CHECK_INTERVAL        300UL

/*
 * Function prototypes
 */

/*
 * Ok, these declarations are also in <linux/kernel.h> but none of the
 * ext4 source programs needs to include it so they are duplicated here.
 */
/*
 * NORET_TYPE expands to nothing, ATTRIB_NORET to the GCC noreturn
 * attribute, and NORET_AND to "noreturn," (for use inside an attribute list).
 */
# define NORET_TYPE        /**/
# define ATTRIB_NORET        __attribute__((noreturn))
# define NORET_AND        noreturn,

/* bitmap.c */
/*
 * Prototypes only.  The *_csum_set()/*_csum_verify() pairs take the group
 * descriptor and the bitmap buffer; the inode-bitmap variants additionally
 * take a size argument @sz.
 */
extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
void ext4_inode_bitmap_csum_set(struct super_block *sb,
                                struct ext4_group_desc *gdp,
                                struct buffer_head *bh, int sz);
int ext4_inode_bitmap_csum_verify(struct super_block *sb,
                                  struct ext4_group_desc *gdp,
                                  struct buffer_head *bh, int sz);
void ext4_block_bitmap_csum_set(struct super_block *sb,
                                struct ext4_group_desc *gdp,
                                struct buffer_head *bh);
int ext4_block_bitmap_csum_verify(struct super_block *sb,
                                  struct ext4_group_desc *gdp,
                                  struct buffer_head *bh);

/* balloc.c */
extern void ext4_get_group_no_and_offset(struct super_block *sb,
                                         ext4_fsblk_t blocknr,
                                         ext4_group_t *blockgrpp,
                                         ext4_grpblk_t *offsetp);
extern ext4_group_t ext4_get_group_number(struct super_block *sb,
                                          ext4_fsblk_t block);

extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
                        ext4_group_t group);
extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                                         ext4_fsblk_t goal,
                                         unsigned int flags,
                                         unsigned long *count,
                                         int *errp);
extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
                                    s64 nclusters, unsigned int flags);
extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
                                                    ext4_group_t block_group,
                                                    struct buffer_head ** bh);
extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
                                                   ext4_group_t group);
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);

extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
                                                ext4_group_t block_group,
                                                bool ignore_locked);
extern int ext4_wait_block_bitmap(struct super_block *sb,
                                  ext4_group_t block_group,
                                  struct buffer_head *bh);
extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
                                                  ext4_group_t block_group);
extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
                                              ext4_group_t block_group,
                                              struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);

#if IS_ENABLED(CONFIG_UNICODE)
extern int ext4_fname_setup_ci_filename(struct inode *dir,
                                         const struct qstr *iname,
                                         struct ext4_filename *fname);
#endif

/* crypto.c: ext4 encryption-related declarations */
#ifdef CONFIG_FS_ENCRYPTION
extern const struct fscrypt_operations ext4_cryptops;

int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
                              int lookup, struct ext4_filename *fname);

int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
                              struct ext4_filename *fname);

void ext4_fname_free_filename(struct ext4_filename *fname);

int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg);

#else /* !CONFIG_FS_ENCRYPTION */
/*
 * Fallback used when CONFIG_FS_ENCRYPTION is disabled: describe @iname in
 * @fname without any transformation.
 *
 * The user-supplied name is recorded as-is and the on-disk name simply
 * aliases the same bytes and length. When CONFIG_UNICODE is enabled the
 * result of ext4_fname_setup_ci_filename() is returned; otherwise the
 * function always returns 0. @lookup is not consulted here.
 */
static inline int ext4_fname_setup_filename(struct inode *dir,
                                            const struct qstr *iname,
                                            int lookup,
                                            struct ext4_filename *fname)
{
        fname->usr_fname = iname;
        fname->disk_name.name = (unsigned char *) iname->name;
        fname->disk_name.len = iname->len;

#if IS_ENABLED(CONFIG_UNICODE)
        return ext4_fname_setup_ci_filename(dir, iname, fname);
#else
        return 0;
#endif
}

/*
 * Fallback used when CONFIG_FS_ENCRYPTION is disabled: set up @fname for a
 * lookup of @dentry's name in @dir.
 *
 * Delegates to ext4_fname_setup_filename() with the lookup argument set to 1
 * and returns whatever that call returns.
 */
static inline int ext4_fname_prepare_lookup(struct inode *dir,
                                            struct dentry *dentry,
                                            struct ext4_filename *fname)
{
        int err;

        err = ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname);
        return err;
}

/*
 * Fallback used when CONFIG_FS_ENCRYPTION is disabled: release what
 * ext4_fname_setup_filename() may have attached to @fname.
 *
 * Without CONFIG_UNICODE this is a no-op. With it, the case-folded name
 * buffer is freed and the pointer is reset to NULL so that no stale pointer
 * is left behind in @fname.
 */
static inline void ext4_fname_free_filename(struct ext4_filename *fname)
{
#if IS_ENABLED(CONFIG_UNICODE)
        kfree(fname->cf_name.name);
        fname->cf_name.name = NULL;
#endif
}

/*
 * Stub used when CONFIG_FS_ENCRYPTION is disabled: the request is rejected
 * unconditionally with -EOPNOTSUPP; neither @filp nor @arg is touched.
 */
static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp,
                                                   void __user *arg)
{
        return -EOPNOTSUPP;
}
#endif /* !CONFIG_FS_ENCRYPTION */

/* dir.c */
/*
 * Worker behind ext4_check_dir_entry(). The first two parameters receive the
 * caller's function name and line number (see the macro below).
 */
extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
                                  struct file *,
                                  struct ext4_dir_entry_2 *,
                                  struct buffer_head *, char *, int,
                                  unsigned int);
/*
 * Convenience wrapper: forwards its arguments to __ext4_check_dir_entry(),
 * supplying __func__ and __LINE__ of the call site, and wraps the result in
 * unlikely() so that a non-zero return is treated as the cold path.
 */
#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \
        unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
                                (de), (bh), (buf), (size), (offset)))
extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
                                __u32 minor_hash,
                                struct ext4_dir_entry_2 *dirent,
                                struct fscrypt_str *ent_name);
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
                             struct buffer_head *bh,
                             void *buf, int buf_size,
                             struct ext4_filename *fname,
                             struct ext4_dir_entry_2 **dest_de);
void ext4_insert_dentry(struct inode *dir, struct inode *inode,
                        struct ext4_dir_entry_2 *de,
                        int buf_size,
                        struct ext4_filename *fname);
/*
 * Drop a stale EXT4_INODE_INDEX flag from @inode.
 *
 * Nothing happens when the filesystem has the dir_index feature, or when the
 * inode does not carry the flag. Otherwise the flag is cleared; if the
 * filesystem also has metadata_csum a one-time warning is emitted first.
 */
static inline void ext4_update_dx_flag(struct inode *inode)
{
        if (ext4_has_feature_dir_index(inode->i_sb))
                return;

        if (!ext4_test_inode_flag(inode, EXT4_INODE_INDEX))
                return;

        /* ext4_iget() should have caught this... */
        WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
        ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}
/*
 * Lookup table used by get_dtype(): indexed by the directory entry's file
 * type value, yields the corresponding DT_* constant.
 * NOTE(review): assumes EXT4_FT_MAX does not exceed the number of entries
 * here (8) - confirm against the EXT4_FT_* definitions.
 */
static const unsigned char ext4_filetype_table[] = {
        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};

/*
 * Translate @filetype into a DT_* value.
 *
 * Returns DT_UNKNOWN when the filesystem lacks the filetype feature or when
 * @filetype is outside [0, EXT4_FT_MAX). @filetype is a signed int, so the
 * lower bound is checked explicitly: a negative value must never be used as
 * an index into ext4_filetype_table.
 */
static inline  unsigned char get_dtype(struct super_block *sb, int filetype)
{
        if (!ext4_has_feature_filetype(sb) ||
            filetype < 0 || filetype >= EXT4_FT_MAX)
                return DT_UNKNOWN;

        return ext4_filetype_table[filetype];
}
extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
                             void *buf, int buf_size);

/* fsync.c */
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);

/* hash.c */
extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
                          struct dx_hash_info *hinfo);

/* ialloc.c */
extern int ext4_mark_inode_used(struct super_block *sb, int ino);
extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *,
                                      struct inode *, umode_t,
                                      const struct qstr *qstr, __u32 goal,
                                      uid_t *owner, __u32 i_flags,
                                      int handle_type, unsigned int line_no,
                                      int nblocks);

/*
 * ext4_new_inode(): call __ext4_new_inode() with &nop_mnt_idmap as the idmap
 * and a caller-supplied handle; handle_type, line_no and nblocks are all 0.
 * Every macro argument is parenthesized before being forwarded.
 */
#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags)          \
        __ext4_new_inode(&nop_mnt_idmap, (handle), (dir), (mode), (qstr),      \
                         (goal), (owner), (i_flags), 0, 0, 0)
/*
 * ext4_new_inode_start_handle(): call __ext4_new_inode() with a NULL handle
 * and i_flags of 0, passing @type as handle_type, the call site's __LINE__
 * as line_no, and @nblocks.
 */
#define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \
                                    type, nblocks)                    \
        __ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \
                         0, (type), __LINE__, (nblocks))


extern void ext4_free_inode(handle_t *, struct inode *);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
extern unsigned long ext4_count_dirs(struct super_block *);
extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
extern int ext4_init_inode_table(struct super_block *sb,
                                 ext4_group_t group, int barrier);
extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);

/* fast_commit.c */
int ext4_fc_info_show(struct seq_file *seq, void *v);
void ext4_fc_init(struct super_block *sb, journal_t *journal);
void ext4_fc_init_inode(struct inode *inode);
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
                         ext4_lblk_t end);
void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode,
        struct dentry *dentry);
void __ext4_fc_track_link(handle_t *handle, struct inode *inode,
        struct dentry *dentry);
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry);
void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
                            struct dentry *dentry);
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle);
void ext4_fc_start_update(struct inode *inode);
void ext4_fc_stop_update(struct inode *inode);
void ext4_fc_del(struct inode *inode);
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
void ext4_fc_replay_cleanup(struct super_block *sb);
int ext4_fc_commit(journal_t *journal, tid_t commit_tid);
int __init ext4_fc_init_dentry_cache(void);
void ext4_fc_destroy_dentry_cache(void);
int ext4_fc_record_regions(struct super_block *sb, int ino,
                           ext4_lblk_t lblk, ext4_fsblk_t pblk,
                           int len, int replay);

/* mballoc.c */
extern const struct seq_operations ext4_mb_seq_groups_ops;
extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
extern int ext4_mb_init(struct super_block *);
extern void ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
                                struct ext4_allocation_request *, int *);
extern void ext4_discard_preallocations(struct inode *);
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
                                     ext4_group_t group,
                                     unsigned int nr, int *cnt);
extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
                                  unsigned int nr);

extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
                             struct buffer_head *bh, ext4_fsblk_t block,
                             unsigned long count, int flags);
extern int ext4_mb_alloc_groupinfo(struct super_block *sb,
                                   ext4_group_t ngroups);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
                ext4_group_t i, struct ext4_group_desc *desc);
extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
                                ext4_fsblk_t block, unsigned long count);
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
                            int len, bool state);
/*
 * Tell whether allocation criterion @cr counts as expensive, i.e. whether it
 * is CR_GOAL_LEN_SLOW or any criterion ordered after it.
 */
static inline bool ext4_mb_cr_expensive(enum criteria cr)
{
        return !(cr < CR_GOAL_LEN_SLOW);
}

/* inode.c */
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
                         struct ext4_inode_info *ei);
int ext4_inode_is_fast_symlink(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
                     bool wait, struct buffer_head **bhs);
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int create);
int ext4_walk_page_buffers(handle_t *handle,
                           struct inode *inode,
                           struct buffer_head *head,
                           unsigned from,
                           unsigned to,
                           int *partial,
                           int (*fn)(handle_t *handle, struct inode *inode,
                                     struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
                                struct buffer_head *bh);
/*
 * Two small integer codes with distinct values.
 * NOTE(review): their users are not visible in this part of the header;
 * presumably they are result codes of the delayed-allocation / inline-data
 * write paths - confirm against the callers.
 */
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA         2

/*
 * Flags describing how an inode may be looked up. Every non-zero value is a
 * distinct single bit (presumably so that flags can be OR-ed together -
 * confirm against the users of this type).
 */
typedef enum {
        EXT4_IGET_NORMAL =        0,
        EXT4_IGET_SPECIAL =        0x0001, /* OK to iget a system inode */
        EXT4_IGET_HANDLE =         0x0002,        /* Inode # is from a handle */
        EXT4_IGET_BAD =                0x0004, /* Allow to iget a bad inode */
        EXT4_IGET_EA_INODE =        0x0008        /* Inode should contain an EA value */
} ext4_iget_flags;

/*
 * Worker behind ext4_iget(); @function and @line identify the call site.
 */
extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
                                 ext4_iget_flags flags, const char *function,
                                 unsigned int line);

/*
 * Call __ext4_iget() with the caller's __func__ and __LINE__ filled in
 * automatically.
 */
#define ext4_iget(sb, ino, flags) \
        __ext4_iget((sb), (ino), (flags), __func__, __LINE__)

extern int  ext4_write_inode(struct inode *, struct writeback_control *);
extern int  ext4_setattr(struct mnt_idmap *, struct dentry *,
                         struct iattr *);
extern u32  ext4_dio_alignment(struct inode *inode);
extern int  ext4_getattr(struct mnt_idmap *, const struct path *,
                         struct kstat *, u32, unsigned int);
extern void ext4_evict_inode(struct inode *);
extern void ext4_clear_inode(struct inode *);
extern int  ext4_file_getattr(struct mnt_idmap *, const struct path *,
                              struct kstat *, u32, unsigned int);
extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
                          struct ext4_iloc *iloc);
extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                             loff_t lstart, loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
extern void ext4_da_release_space(struct inode *inode, int to_free);
extern void ext4_da_update_reserve_space(struct inode *inode,
                                        int used, int quota_claim);
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
                              ext4_fsblk_t pblk, ext4_lblk_t len);

/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
                                struct ext4_map_blocks *map, int flags);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
                                 ext4_lblk_t start, ext4_lblk_t end);

/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
int ext4_fileattr_set(struct mnt_idmap *idmap,
                      struct dentry *dentry, struct fileattr *fa);
int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
extern void ext4_reset_inode_seed(struct inode *inode);
int ext4_update_overhead(struct super_block *sb, bool force);
int ext4_force_shutdown(struct super_block *sb, u32 flags);

/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
extern int ext4_ind_migrate(struct inode *inode);

/* namei.c */
extern int ext4_init_new_dir(handle_t *handle, struct inode *dir,
                             struct inode *inode);
extern int ext4_dirblock_csum_verify(struct inode *inode,
                                     struct buffer_head *bh);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
                                __u32 start_minor_hash, __u32 *next_hash);
extern int ext4_search_dir(struct buffer_head *bh,
                           char *search_buf,
                           int buf_size,
                           struct inode *dir,
                           struct ext4_filename *fname,
                           unsigned int offset,
                           struct ext4_dir_entry_2 **res_dir);
extern int ext4_generic_delete_entry(struct inode *dir,
                                     struct ext4_dir_entry_2 *de_del,
                                     struct buffer_head *bh,
                                     void *entry_buf,
                                     int buf_size,
                                     int csum_size);
extern bool ext4_empty_dir(struct inode *inode);

/* resize.c */
extern void ext4_kvfree_array_rcu(void *to_free);
extern int ext4_group_add(struct super_block *sb,
                                struct ext4_new_group_data *input);
extern int ext4_group_extend(struct super_block *sb,
                                struct ext4_super_block *es,
                                ext4_fsblk_t n_blocks_count);
extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
extern unsigned int ext4_list_backups(struct super_block *sb,
                                      unsigned int *three, unsigned int *five,
                                      unsigned int *seven);

/* super.c */
extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
                                         sector_t block, blk_opf_t op_flags);
extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
                                                   sector_t block);
extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
                                bh_end_io_t *end_io);
extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
                        bh_end_io_t *end_io);
extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
extern __le32 ext4_superblock_csum(struct super_block *sb,
                                   struct ext4_super_block *es);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
                                    ext4_group_t ngroup);
extern const char *ext4_decode_error(struct super_block *sb, int errno,
                                     char nbuf[16]);
extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
                                             ext4_group_t block_group,
                                             unsigned int flags);
extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
                                              ext4_group_t block_group);

extern __printf(7, 8)
void __ext4_error(struct super_block *, const char *, unsigned int, bool,
                  int, __u64, const char *, ...);
extern __printf(6, 7)
void __ext4_error_inode(struct inode *, const char *, unsigned int,
                        ext4_fsblk_t, int, const char *, ...);
extern __printf(5, 6)
void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
                     const char *, ...);
extern void __ext4_std_error(struct super_block *, const char *,
                             unsigned int, int);
extern __printf(4, 5)
void __ext4_warning(struct super_block *, const char *, unsigned int,
                    const char *, ...);
extern __printf(4, 5)
void __ext4_warning_inode(const struct inode *inode, const char *function,
                          unsigned int line, const char *fmt, ...);
extern __printf(3, 4)
void __ext4_msg(struct super_block *, const char *, const char *, ...);
extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
                           const char *, unsigned int, const char *);
extern __printf(7, 8)
void __ext4_grp_locked_error(const char *, unsigned int,
                             struct super_block *, ext4_group_t,
                             unsigned long, ext4_fsblk_t,
                             const char *, ...);

/*
 * Call-site helpers for error reporting. Each one fills in __func__ and
 * __LINE__ so that callers only pass the object, optional block / error
 * code, and a printf-style format with its arguments.
 */

/* Inode error with block number 0 (and no explicit error code). */
#define EXT4_ERROR_INODE(inode, fmt, a...) \
        ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)

/* Inode error with block number 0 and an explicit error code @err. */
#define EXT4_ERROR_INODE_ERR(inode, err, fmt, a...)                        \
        __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt), ## a)

/* Inode error carrying both a block number and an error code. */
#define ext4_error_inode_block(inode, block, err, fmt, a...)                \
        __ext4_error_inode((inode), __func__, __LINE__, (block), (err),        \
                           (fmt), ## a)

/* Error reported against an open file, with a block number. */
#define EXT4_ERROR_FILE(file, block, fmt, a...)                                \
        ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)

/*
 * Like ext4_error_err(), but passes true (instead of false) as the fourth
 * argument of __ext4_error().
 */
#define ext4_abort(sb, err, fmt, a...)                                        \
        __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a)

/*
 * Message/error/warning wrappers, in two flavours.
 *
 * With CONFIG_PRINTK each wrapper forwards its format and arguments to the
 * matching __ext4_*() helper, adding __func__ and __LINE__ where the wrapper
 * itself does not take them.
 *
 * Without CONFIG_PRINTK the format and arguments are handed to no_printk()
 * and the helper is still called, but with an empty function name, line 0
 * and a fixed " " format (dump_mmp_msg passes an empty message instead), so
 * the caller's format strings are not passed on to the helpers.
 */
#ifdef CONFIG_PRINTK

#define ext4_error_inode(inode, func, line, block, fmt, ...)                \
        __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__)
#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...)        \
        __ext4_error_inode((inode), (func), (line), (block),                 \
                           (err), (fmt), ##__VA_ARGS__)
#define ext4_error_file(file, func, line, block, fmt, ...)                \
        __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
#define ext4_error(sb, fmt, ...)                                        \
        __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt),        \
                ##__VA_ARGS__)
#define ext4_error_err(sb, err, fmt, ...)                                \
        __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt),        \
                ##__VA_ARGS__)
#define ext4_warning(sb, fmt, ...)                                        \
        __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_warning_inode(inode, fmt, ...)                                \
        __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_msg(sb, level, fmt, ...)                                \
        __ext4_msg(sb, level, fmt, ##__VA_ARGS__)
#define dump_mmp_msg(sb, mmp, msg)                                        \
        __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)                \
        __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
                                fmt, ##__VA_ARGS__)

#else /* !CONFIG_PRINTK */

#define ext4_error_inode(inode, func, line, block, fmt, ...)                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error_inode(inode, "", 0, block, 0, " ");                \
} while (0)
#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...)        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error_inode(inode, "", 0, block, err, " ");                \
} while (0)
#define ext4_error_file(file, func, line, block, fmt, ...)                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error_file(file, "", 0, block, " ");                        \
} while (0)
#define ext4_error(sb, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error(sb, "", 0, false, 0, 0, " ");                        \
} while (0)
#define ext4_error_err(sb, err, fmt, ...)                                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error(sb, "", 0, false, err, 0, " ");                        \
} while (0)
#define ext4_warning(sb, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_warning(sb, "", 0, " ");                                        \
} while (0)
#define ext4_warning_inode(inode, fmt, ...)                                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_warning_inode(inode, "", 0, " ");                        \
} while (0)
#define ext4_msg(sb, level, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_msg(sb, "", " ");                                        \
} while (0)
#define dump_mmp_msg(sb, mmp, msg)                                        \
        __dump_mmp_msg(sb, mmp, "", 0, "")
#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                \
        __ext4_grp_locked_error("", 0, sb, grp, ino, block, " ");        \
} while (0)

#endif /* CONFIG_PRINTK */

extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
                                     struct ext4_group_desc *bg);
extern __u32 ext4_free_group_clusters(struct super_block *sb,
                                      struct ext4_group_desc *bg);
extern __u32 ext4_free_inodes_count(struct super_block *sb,
                                 struct ext4_group_desc *bg);
extern __u32 ext4_used_dirs_count(struct super_block *sb,
                                struct ext4_group_desc *bg);
extern __u32 ext4_itable_unused_count(struct super_block *sb,
                                   struct ext4_group_desc *bg);
extern void ext4_block_bitmap_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_bitmap_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_table_set(struct super_block *sb,
                                 struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_free_group_clusters_set(struct super_block *sb,
                                         struct ext4_group_desc *bg,
                                         __u32 count);
extern void ext4_free_inodes_set(struct super_block *sb,
                                struct ext4_group_desc *bg, __u32 count);
extern void ext4_used_dirs_set(struct super_block *sb,
                                struct ext4_group_desc *bg, __u32 count);
extern void ext4_itable_unused_set(struct super_block *sb,
                                   struct ext4_group_desc *bg, __u32 count);
extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,
                                       struct ext4_group_desc *gdp);
extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
                                     struct ext4_group_desc *gdp);
extern int ext4_register_li_request(struct super_block *sb,
                                    ext4_group_t first_not_zeroed);

/*
 * Report whether metadata checksumming is usable on @sb.
 *
 * Returns 1 only when the metadata_csum feature is set and a checksum
 * driver is attached to the superblock info; returns 0 otherwise. Having
 * the feature without a driver triggers a one-time warning.
 */
static inline int ext4_has_metadata_csum(struct super_block *sb)
{
        WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
                     !EXT4_SB(sb)->s_chksum_driver);

        if (!ext4_has_feature_metadata_csum(sb))
                return 0;

        return EXT4_SB(sb)->s_chksum_driver != NULL;
}

/*
 * Report whether group descriptors carry a checksum on @sb: true (1) when
 * the gdt_csum feature is set, or failing that when
 * ext4_has_metadata_csum() reports true; 0 otherwise.
 */
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
        if (ext4_has_feature_gdt_csum(sb))
                return 1;

        return ext4_has_metadata_csum(sb) != 0;
}

/*
 * Read a superblock counter stored as two little-endian 32-bit halves,
 * <name>_lo and <name>_hi, from the superblock pointed to by @es.
 *
 * The high half contributes (shifted up by 32 bits) only when the 64BIT
 * incompat feature bit is set in s_feature_incompat; otherwise just the low
 * half is returned. The feature test is done on the raw little-endian field
 * against a little-endian constant, so no byte swap of the field is needed.
 *
 * @es is parenthesized at every use so that any pointer expression (for
 * example "p + 1") expands correctly. Note that @es is expanded three times,
 * so it should be free of side effects.
 */
#define ext4_read_incompat_64bit_val(es, name) \
        (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \
                ? (ext4_fsblk_t)le32_to_cpu((es)->name##_hi) << 32 : 0) | \
                le32_to_cpu((es)->name##_lo))

/*
 * Return the s_blocks_count value of superblock @es, combining its _lo/_hi
 * halves via ext4_read_incompat_64bit_val().
 */
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
        ext4_fsblk_t total = ext4_read_incompat_64bit_val(es, s_blocks_count);

        return total;
}

/*
 * Return the s_r_blocks_count value of superblock @es, combining its _lo/_hi
 * halves via ext4_read_incompat_64bit_val().
 */
static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
{
        ext4_fsblk_t reserved = ext4_read_incompat_64bit_val(es, s_r_blocks_count);

        return reserved;
}

/*
 * Return the s_free_blocks_count value of superblock @es, combining its
 * _lo/_hi halves via ext4_read_incompat_64bit_val().
 */
static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es)
{
        ext4_fsblk_t nfree = ext4_read_incompat_64bit_val(es, s_free_blocks_count);

        return nfree;
}

/*
 * Store @blk into superblock @es as s_blocks_count: the low 32 bits go to
 * the _lo field and the bits above them to the _hi field, both converted to
 * little-endian.
 */
static inline void ext4_blocks_count_set(struct ext4_super_block *es,
                                         ext4_fsblk_t blk)
{
        ext4_fsblk_t high_bits = blk >> 32;

        es->s_blocks_count_lo = cpu_to_le32((u32)blk);
        es->s_blocks_count_hi = cpu_to_le32(high_bits);
}

/*
 * Store @blk into superblock @es as s_free_blocks_count: the low 32 bits go
 * to the _lo field and the bits above them to the _hi field, both converted
 * to little-endian.
 */
static inline void ext4_free_blocks_count_set(struct ext4_super_block *es,
                                              ext4_fsblk_t blk)
{
        ext4_fsblk_t high_bits = blk >> 32;

        es->s_free_blocks_count_lo = cpu_to_le32((u32)blk);
        es->s_free_blocks_count_hi = cpu_to_le32(high_bits);
}

/*
 * Store @blk into superblock @es as s_r_blocks_count: the low 32 bits go to
 * the _lo field and the bits above them to the _hi field, both converted to
 * little-endian.
 */
static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
                                           ext4_fsblk_t blk)
{
        ext4_fsblk_t high_bits = blk >> 32;

        es->s_r_blocks_count_lo = cpu_to_le32((u32)blk);
        es->s_r_blocks_count_hi = cpu_to_le32(high_bits);
}

/*
 * Decode the size stored in the on-disk inode @raw_inode.
 *
 * i_size_high is only folded in when the filesystem has the largedir
 * feature or the inode is a regular file; for everything else the size
 * is just i_size_lo.
 */
static inline loff_t ext4_isize(struct super_block *sb,
                                struct ext4_inode *raw_inode)
{
        loff_t size = le32_to_cpu(raw_inode->i_size_lo);

        if (!ext4_has_feature_largedir(sb) &&
            !S_ISREG(le16_to_cpu(raw_inode->i_mode)))
                return size;

        return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | size;
}

/*
 * Encode @i_size into the on-disk inode @raw_inode: low 32 bits into
 * i_size_lo, bits 32 and up into i_size_high, both little-endian.
 */
static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
{
        __le32 size_high = cpu_to_le32(i_size >> 32);
        __le32 size_low = cpu_to_le32(i_size);

        raw_inode->i_size_high = size_high;
        raw_inode->i_size_lo = size_low;
}

/*
 * Return the s_groups_count value of @sb's ext4_sb_info.
 *
 * Reading s_groups_count requires using smp_rmb() afterwards.  See
 * the locking protocol documented in the comments of ext4_group_add()
 * in resize.c
 */
static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
{
        ext4_group_t        ngroups = EXT4_SB(sb)->s_groups_count;

        /* Read barrier issued after the load, as the protocol above requires. */
        smp_rmb();
        return ngroups;
}

/*
 * Map @block_group to its flex group index by shifting right by
 * s_log_groups_per_flex.
 */
static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
                                           ext4_group_t block_group)
{
        ext4_group_t flex_index;

        flex_index = block_group >> sbi->s_log_groups_per_flex;
        return flex_index;
}

/*
 * Number of block groups in one flex group: 2^s_log_groups_per_flex.
 *
 * The shift is performed on an unsigned constant to match the unsigned
 * return type; with a signed 1, a log value of 31 would shift into the
 * sign bit, which is undefined behaviour in C.
 */
static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
{
        return 1U << sbi->s_log_groups_per_flex;
}

/*
 * Forward @err to __ext4_std_error(), tagged with the calling function
 * name and line number, but only when @err is non-zero.  @err is
 * evaluated twice when it is non-zero.
 */
#define ext4_std_error(sb, err)                                         \
do {                                                                    \
        if ((err))                                                      \
                __ext4_std_error((sb), __func__, __LINE__, (err));      \
} while (0)

#ifdef CONFIG_SMP
/*
 * Each CPU can accumulate up to percpu_counter_batch clusters in its local
 * counter, so we need to make sure we have more free clusters than
 * percpu_counter_batch * nr_cpu_ids.  The watermark additionally applies a
 * 4x window on top of that product.
 */
#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
#else
/* Without CONFIG_SMP the watermark is 0. */
#define EXT4_FREECLUSTERS_WATERMARK 0
#endif

/*
 * Raise i_disksize to @newsize if @newsize is larger; never shrinks it.
 * The update is done under i_data_sem held for writing.
 *
 * Requires i_rwsem to avoid races with truncate; a regular file whose
 * inode is not locked triggers a one-time warning.
 */
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        WARN_ON_ONCE(S_ISREG(inode->i_mode) && !inode_is_locked(inode));

        down_write(&ei->i_data_sem);
        if (ei->i_disksize < newsize)
                WRITE_ONCE(ei->i_disksize, newsize);
        up_write(&ei->i_data_sem);
}

/*
 * Grow i_size and/or i_disksize to @newsize when @newsize is larger than
 * the current value.  Requires i_rwsem to avoid races with truncate.
 *
 * Returns a bitmask: bit 0 (value 1) is set when i_size was updated,
 * bit 1 (value 2) when i_disksize was updated; 0 when nothing changed.
 */
static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
{
        int updated = 0;

        if (inode->i_size < newsize) {
                i_size_write(inode, newsize);
                updated |= 1;
        }

        if (EXT4_I(inode)->i_disksize < newsize) {
                ext4_update_i_disksize(inode, newsize);
                updated |= 2;
        }

        return updated;
}

int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
                                      loff_t len);

/*
 * Per-block-group bookkeeping of free space.  The structure ends in a
 * flexible array (bb_counters), so its allocation size depends on how
 * many orders are tracked.
 */
struct ext4_group_info {
        /* Bit flags, indexed by the EXT4_GROUP_INFO_*_BIT constants. */
        unsigned long   bb_state;
#ifdef AGGRESSIVE_CHECK
        unsigned long        bb_check_counter;
#endif
        /* rb-tree root; NOTE(review): tree contents are not visible here. */
        struct rb_root  bb_free_root;
        ext4_grpblk_t        bb_first_free;        /* first free block */
        ext4_grpblk_t        bb_free;        /* total free blocks */
        ext4_grpblk_t        bb_fragments;        /* nr of freespace fragments */
        int                bb_avg_fragment_size_order;        /* order of average
                                                           fragment in BG */
        ext4_grpblk_t        bb_largest_free_order;/* order of largest frag in BG */
        ext4_group_t        bb_group;        /* Group number */
        struct          list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
        void            *bb_bitmap;
#endif
        struct rw_semaphore alloc_sem;
        /* List linkage; presumably keyed by the two *_order fields above. */
        struct list_head bb_avg_fragment_size_node;
        struct list_head bb_largest_free_order_node;
        ext4_grpblk_t        bb_counters[];        /* Nr of free power-of-two-block
                                         * regions, index is order.
                                         * bb_counters[3] = 5 means
                                         * 5 free 8-block regions. */
};

/*
 * Bit numbers within ext4_group_info::bb_state, plus mask forms
 * (1 << bit) of the two bitmap-corruption bits.
 */
#define EXT4_GROUP_INFO_NEED_INIT_BIT                0
#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT                1
#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT        2
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT        3
#define EXT4_GROUP_INFO_BBITMAP_CORRUPT                \
        (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT                \
        (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_BBITMAP_READ_BIT        4

/*
 * Accessors for the bb_state bits of a struct ext4_group_info pointer
 * @grp.  All of them go through the test_bit()/set_bit()/clear_bit()
 * family of bit operations.
 */

/* Test the NEED_INIT, BBITMAP_CORRUPT and IBITMAP_CORRUPT bits. */
#define EXT4_MB_GRP_NEED_INIT(grp)        \
        (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp)        \
        (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)        \
        (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))

/* Test, set and clear the WAS_TRIMMED bit. */
#define EXT4_MB_GRP_WAS_TRIMMED(grp)        \
        (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_SET_TRIMMED(grp)        \
        (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)        \
        (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
/* Set the BBITMAP_READ bit and yield its previous value. */
#define EXT4_MB_GRP_TEST_AND_SET_READ(grp)        \
        (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))

/*
 * Tuning for the s_lock_busy contention counter: ext4_lock_group() stops
 * incrementing it at EXT4_MAX_CONTENTION, and ext4_fs_is_busy() reports
 * "busy" once it exceeds EXT4_CONTENTION_THRESHOLD.
 */
#define EXT4_MAX_CONTENTION                8
#define EXT4_CONTENTION_THRESHOLD        2

/*
 * Look up the spinlock that bgl_lock_ptr() associates with @group in
 * the filesystem's s_blockgroup_lock.
 */
static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
                                              ext4_group_t group)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        return bgl_lock_ptr(sbi->s_blockgroup_lock, group);
}

/*
 * Returns true if the filesystem is busy enough that attempts to
 * access the block group locks have run into contention, i.e. the
 * s_lock_busy counter is above EXT4_CONTENTION_THRESHOLD.
 */
static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
{
        int contention = atomic_read(&sbi->s_lock_busy);

        return contention > EXT4_CONTENTION_THRESHOLD;
}

/*
 * Take the spinlock of block group @group, maintaining the s_lock_busy
 * contention counter along the way: an uncontended acquisition decays
 * the counter (never below 0), a contended one bumps it (never above
 * EXT4_MAX_CONTENTION) before blocking on the lock.
 */
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        spinlock_t *lock = ext4_group_lock_ptr(sb, group);

        if (spin_trylock(lock)) {
                /*
                 * Got the lock on the first attempt, so lower the
                 * contention counter.
                 */
                atomic_add_unless(&sbi->s_lock_busy, -1, 0);
                return;
        }

        /*
         * Someone else holds the lock: raise the contention counter
         * and then wait for the spin lock.
         */
        atomic_add_unless(&sbi->s_lock_busy, 1, EXT4_MAX_CONTENTION);
        spin_lock(lock);
}

/* Release the spinlock of block group @group taken by ext4_lock_group(). */
static inline void ext4_unlock_group(struct super_block *sb,
                                     ext4_group_t group)
{
        spinlock_t *lock = ext4_group_lock_ptr(sb, group);

        spin_unlock(lock);
}

#ifdef CONFIG_QUOTA
static inline bool ext4_quota_capable(struct super_block *sb)
{
        return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb));
}

static inline bool ext4_is_quota_journalled(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        return (ext4_has_feature_quota(sb) ||
                sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]);
}
int ext4_enable_quotas(struct super_block *sb);
#endif

/*
 * Block validity checking
 *
 * Both macros wrap ext4_check_blockref(), passing the caller's function
 * name and line number along with an array of __le32 block references
 * and its length.
 */

/* Check the block references held in buffer @bh (one block's worth). */
#define ext4_check_indirect_blockref(inode, bh)                                \
        ext4_check_blockref(__func__, __LINE__, inode,                        \
                            (__le32 *)(bh)->b_data,                        \
                            EXT4_ADDR_PER_BLOCK((inode)->i_sb))

/* Check the first EXT4_NDIR_BLOCKS entries of the inode's own i_data. */
#define ext4_ind_check_inode(inode)                                        \
        ext4_check_blockref(__func__, __LINE__, inode,                        \
                            EXT4_I(inode)->i_data,                        \
                            EXT4_NDIR_BLOCKS)

/*
 * Inodes and files operations
 */

/* dir.c */
extern const struct file_operations ext4_dir_operations;

/* file.c */
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);

/* inline.c */
extern int ext4_get_max_inline_size(struct inode *inode);
extern int ext4_find_inline_data_nolock(struct inode *inode);
extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);

int ext4_readpage_inline(struct inode *inode, struct folio *folio);
extern int ext4_try_to_write_inline_data(struct address_space *mapping,
                                         struct inode *inode,
                                         loff_t pos, unsigned len,
                                         struct page **pagep);
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
                               unsigned copied, struct folio *folio);
extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
                                           struct inode *inode,
                                           loff_t pos, unsigned len,
                                           struct page **pagep,
                                           void **fsdata);
extern int ext4_try_add_inline_entry(handle_t *handle,
                                     struct ext4_filename *fname,
                                     struct inode *dir, struct inode *inode);
extern int ext4_try_create_inline_dir(handle_t *handle,
                                      struct inode *parent,
                                      struct inode *inode);
extern int ext4_read_inline_dir(struct file *filp,
                                struct dir_context *ctx,
                                int *has_inline_data);
extern int ext4_inlinedir_to_tree(struct file *dir_file,
                                  struct inode *dir, ext4_lblk_t block,
                                  struct dx_hash_info *hinfo,
                                  __u32 start_hash, __u32 start_minor_hash,
                                  int *has_inline_data);
extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
                                        struct ext4_filename *fname,
                                        struct ext4_dir_entry_2 **res_dir,
                                        int *has_inline_data);
extern int ext4_delete_inline_entry(handle_t *handle,
                                    struct inode *dir,
                                    struct ext4_dir_entry_2 *de_del,
                                    struct buffer_head *bh,
                                    int *has_inline_data);
extern bool empty_inline_dir(struct inode *dir, int *has_inline_data);
extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
                                        struct ext4_dir_entry_2 **parent_de,
                                        int *retval);
extern void *ext4_read_inline_link(struct inode *inode);

struct iomap;
extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap);

extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline);

extern int ext4_convert_inline_data(struct inode *inode);

/*
 * Non-zero when @inode carries the EXT4_INODE_INLINE_DATA flag and its
 * i_inline_off is non-zero.
 */
static inline int ext4_has_inline_data(struct inode *inode)
{
        if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
                return 0;

        return EXT4_I(inode)->i_inline_off != 0;
}

/* namei.c */
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
extern struct dentry *ext4_get_parent(struct dentry *child);
extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
                                 struct ext4_dir_entry_2 *de,
                                 int blocksize, int csum_size,
                                 unsigned int parent_ino, int dotdot_real_len);
extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
                                        unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
                                      struct buffer_head *bh);
extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
                         struct inode *inode, struct dentry *dentry);
extern int __ext4_link(struct inode *dir, struct inode *inode,
                       struct dentry *dentry);

/*
 * Lookup table from the file-type bits of an inode mode to the
 * EXT4_FT_* directory-entry type.  The index is (mode & S_IFMT) >> S_SHIFT;
 * slots without an explicit initializer are zero.
 */
#define S_SHIFT 12
static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
        [S_IFREG >> S_SHIFT]        = EXT4_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]        = EXT4_FT_DIR,
        [S_IFCHR >> S_SHIFT]        = EXT4_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]        = EXT4_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]        = EXT4_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]        = EXT4_FT_SOCK,
        [S_IFLNK >> S_SHIFT]        = EXT4_FT_SYMLINK,
};

/*
 * Fill in de->file_type from @mode via ext4_type_by_mode[].  Does
 * nothing when the filesystem lacks the filetype feature.
 */
static inline void ext4_set_de_type(struct super_block *sb,
                                    struct ext4_dir_entry_2 *de,
                                    umode_t mode)
{
        if (!ext4_has_feature_filetype(sb))
                return;

        de->file_type = ext4_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
}

/* readpages.c */
extern int ext4_mpage_readpages(struct inode *inode,
                struct readahead_control *rac, struct folio *folio);
extern int __init ext4_init_post_read_processing(void);
extern void ext4_exit_post_read_processing(void);

/* symlink.c */
extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
extern const struct inode_operations ext4_symlink_inode_operations;
extern const struct inode_operations ext4_fast_symlink_inode_operations;

/* sysfs.c */
extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi);
extern int ext4_register_sysfs(struct super_block *sb);
extern void ext4_unregister_sysfs(struct super_block *sb);
extern int __init ext4_init_sysfs(void);
extern void ext4_exit_sysfs(void);

/* block_validity */
extern void ext4_release_system_zone(struct super_block *sb);
extern int ext4_setup_system_zone(struct super_block *sb);
extern int __init ext4_init_system_zone(void);
extern void ext4_exit_system_zone(void);
extern int ext4_inode_block_valid(struct inode *inode,
                                  ext4_fsblk_t start_blk,
                                  unsigned int count);
extern int ext4_check_blockref(const char *, unsigned int,
                               struct inode *, __le32 *, unsigned int);
extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
                                ext4_fsblk_t start_blk, unsigned int count);


/* extents.c */
struct ext4_ext_path;
struct ext4_extent;

/*
 * Maximum number of logical blocks in a file; ext4_extent's ee_block is
 * __le32.
 */
#define EXT_MAX_BLOCKS        0xffffffff

extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                               struct ext4_map_blocks *map, int flags);
extern int ext4_ext_truncate(handle_t *, struct inode *);
extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
                                 ext4_lblk_t end);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
                          loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
                                          loff_t offset, ssize_t len);
extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
                                             ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
                           struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
                                                   int num,
                                                   struct ext4_ext_path *path);
extern int ext4_ext_insert_extent(handle_t *, struct inode *,
                                  struct ext4_ext_path **,
                                  struct ext4_extent *, int);
extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
                                              struct ext4_ext_path **,
                                              int flags);
extern void ext4_free_ext_path(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        __u64 start, __u64 len);
extern int ext4_get_es_cache(struct inode *inode,
                             struct fiemap_extent_info *fieinfo,
                             __u64 start, __u64 len);
extern int ext4_ext_precache(struct inode *inode);
extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
                                struct inode *inode2, ext4_lblk_t lblk1,
                             ext4_lblk_t lblk2,  ext4_lblk_t count,
                             int mark_unwritten,int *err);
extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
                                       int check_cred, int restart_cred,
                                       int revoke_cred);
extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end);
extern int ext4_ext_replay_set_iblocks(struct inode *inode);
extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
                int len, int unwritten, ext4_fsblk_t pblk);
extern int ext4_ext_clear_bb(struct inode *inode);


/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
                                            struct inode *second);
extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
                                          struct inode *donor_inode);
extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
                             __u64 start_orig, __u64 start_donor,
                             __u64 len, __u64 *moved_len);

/* page-io.c */
extern int __init ext4_init_pageio(void);
extern void ext4_exit_pageio(void);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
extern int ext4_put_io_end(ext4_io_end_t *io_end);
extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
extern void ext4_io_submit_init(struct ext4_io_submit *io,
                                struct writeback_control *wbc);
extern void ext4_end_io_rsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
                size_t len);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);

/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);

/* mmp.c */
extern void ext4_stop_mmpd(struct ext4_sb_info *sbi);

/* verity.c */
extern const struct fsverity_operations ext4_verityops;

/* orphan.c */
extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
extern void ext4_orphan_cleanup(struct super_block *sb,
                                struct ext4_super_block *es);
extern void ext4_release_orphan_info(struct super_block *sb);
extern int ext4_init_orphan_info(struct super_block *sb);
extern int ext4_orphan_file_empty(struct super_block *sb);
extern void ext4_orphan_file_block_trigger(
                                struct jbd2_buffer_trigger_type *triggers,
                                struct buffer_head *bh,
                                void *data, size_t size);

/*
 * Helpers to test whether block and inode bitmaps are properly
 * initialized. With uninit_bg, reading the block from disk is not enough
 * to mark the bitmap uptodate; we also need to zero out the bitmap.
 */
#define BH_BITMAP_UPTODATE BH_JBDPrivateStart

/*
 * Non-zero when @bh is uptodate as a buffer and additionally has the
 * BH_BITMAP_UPTODATE bit set in b_state.
 */
static inline int bitmap_uptodate(struct buffer_head *bh)
{
        if (!buffer_uptodate(bh))
                return 0;

        return test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state) != 0;
}
/* Set the BH_BITMAP_UPTODATE bit in @bh's b_state. */
static inline void set_bitmap_uptodate(struct buffer_head *bh)
{
        set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}

/*
 * For ioend & aio unwritten conversion wait queues.
 *
 * ext4_ioend_wq(v) picks one of EXT4_WQ_HASH_SZ wait queue heads by
 * taking the value of @v (cast to unsigned long) modulo the table size.
 */
#define EXT4_WQ_HASH_SZ                37
#define ext4_ioend_wq(v)   (&ext4__ioend_wq[((unsigned long)(v)) %\
                                            EXT4_WQ_HASH_SZ])
extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];

extern int ext4_resize_begin(struct super_block *sb);
extern int ext4_resize_end(struct super_block *sb, bool update_backups);

/*
 * Mark @io_end with EXT4_IO_END_UNWRITTEN and count it in the inode's
 * i_unwritten counter.  Does nothing if the flag is already set, so the
 * counter is bumped at most once per io_end.
 */
static inline void ext4_set_io_unwritten_flag(struct inode *inode,
                                              struct ext4_io_end *io_end)
{
        if (io_end->flag & EXT4_IO_END_UNWRITTEN)
                return;

        io_end->flag |= EXT4_IO_END_UNWRITTEN;
        atomic_inc(&EXT4_I(inode)->i_unwritten);
}

/*
 * Clear EXT4_IO_END_UNWRITTEN on @io_end and drop the inode's
 * i_unwritten counter.  When the counter reaches zero, everyone sleeping
 * on the inode's ioend wait queue is woken.  Does nothing if the flag is
 * not set.
 */
static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
        struct inode *inode;

        if (!(io_end->flag & EXT4_IO_END_UNWRITTEN))
                return;

        io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
        inode = io_end->inode;

        /* Wake up anyone waiting on unwritten extent conversion */
        if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
                wake_up_all(ext4_ioend_wq(inode));
}

extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_overwrite_ops;
extern const struct iomap_ops ext4_iomap_report_ops;

/*
 * Report whether @bh is uptodate, first forcing the uptodate flag on if
 * the buffer carries a write I/O error.
 */
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
        int write_failed = buffer_write_io_error(bh);

        /*
         * A write error flag means we failed to write out the data in
         * this block.  In that case there is no need to read the block
         * back in, because doing so could successfully return the old
         * on-disk data.
         */
        if (write_failed)
                set_buffer_uptodate(bh);

        return buffer_uptodate(bh);
}

#endif        /* __KERNEL__ */

#define EFSBADCRC        EBADMSG                /* Bad CRC detected */
#define EFSCORRUPTED        EUCLEAN                /* Filesystem is corrupted */

#endif        /* _EXT4_H */




















































    1 

















    3 

    1 
    1 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
// SPDX-License-Identifier: GPL-2.0-only
/*
 * IPv6 packet mangling table, a port of the IPv4 mangle table to IPv6
 *
 * Copyright (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
 * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
 */
#include <linux/module.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/slab.h>
#include <net/ipv6.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("ip6tables mangle table");

/*
 * Bitmask of the netfilter hooks used as the mangle table's valid_hooks:
 * PRE_ROUTING, LOCAL_IN, FORWARD, LOCAL_OUT and POST_ROUTING.
 */
#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
                            (1 << NF_INET_LOCAL_IN) | \
                            (1 << NF_INET_FORWARD) | \
                            (1 << NF_INET_LOCAL_OUT) | \
                            (1 << NF_INET_POST_ROUTING))

/*
 * Description of the IPv6 "mangle" table: its name, the hooks it is
 * valid on, the owning module, the address family and its priority.
 */
static const struct xt_table packet_mangler = {
        .name                = "mangle",
        .valid_hooks        = MANGLE_VALID_HOOKS,
        .me                = THIS_MODULE,
        .af                = NFPROTO_IPV6,
        .priority        = NF_IP6_PRI_MANGLE,
};

/*
 * LOCAL_OUT handler for the mangle table.
 *
 * Snapshots the source/destination address, skb mark, hop limit and the
 * first 32-bit word of the IPv6 header, runs the table, and - unless the
 * verdict is NF_DROP or NF_STOLEN - calls ip6_route_me_harder() if any of
 * those values changed.  A re-routing failure turns the verdict into
 * NF_DROP_ERR(err).
 */
static unsigned int
ip6t_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
        struct in6_addr saddr, daddr;
        unsigned int ret, verdict;
        u32 flowlabel, mark;
        u8 hop_limit;
        int err;

        /* Save source/dest address, mark, hop limit, flow label and priority. */
        memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
        memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
        mark = skb->mark;
        hop_limit = ipv6_hdr(skb)->hop_limit;

        /*
         * Flow label and priority; the word also includes the version,
         * which shouldn't change either.
         */
        flowlabel = *((u_int32_t *)ipv6_hdr(skb));

        ret = ip6t_do_table(priv, skb, state);
        verdict = ret & NF_VERDICT_MASK;

        /* Dropped or stolen packets are not examined or re-routed. */
        if (verdict == NF_DROP || verdict == NF_STOLEN)
                return ret;

        /* Nothing that was saved above has changed: keep the verdict. */
        if (ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) &&
            ipv6_addr_equal(&ipv6_hdr(skb)->daddr, &daddr) &&
            skb->mark == mark &&
            ipv6_hdr(skb)->hop_limit == hop_limit &&
            flowlabel == *((u_int32_t *)ipv6_hdr(skb)))
                return ret;

        err = ip6_route_me_harder(state->net, state->sk, skb);
        if (err < 0)
                ret = NF_DROP_ERR(err);

        return ret;
}

/*
 * The work comes in here from netfilter.c.  Packets on the LOCAL_OUT
 * hook take the ip6t_mangle_out() path; every other hook runs the table
 * directly.
 */
static unsigned int
ip6table_mangle_hook(void *priv, struct sk_buff *skb,
                     const struct nf_hook_state *state)
{
        if (state->hook != NF_INET_LOCAL_OUT)
                return ip6t_do_table(priv, skb, state);

        return ip6t_mangle_out(priv, skb, state);
}

/* Hook ops for the mangle table; allocated in ip6table_mangle_init(). */
static struct nf_hook_ops *mangle_ops __read_mostly;

/*
 * Register the mangle table in @net.  An initial table is allocated from
 * the packet_mangler description, handed to ip6t_register_table(), and
 * freed again whether or not registration succeeded.
 *
 * Returns -ENOMEM if the initial table cannot be allocated, otherwise
 * the result of ip6t_register_table().
 */
static int ip6table_mangle_table_init(struct net *net)
{
        struct ip6t_replace *initial;
        int err;

        initial = ip6t_alloc_initial_table(&packet_mangler);
        if (!initial)
                return -ENOMEM;

        err = ip6t_register_table(net, &packet_mangler, initial, mangle_ops);
        kfree(initial);

        return err;
}

/* pernet .pre_exit callback: pre-exit unregistration of "mangle" in @net. */
static void __net_exit ip6table_mangle_net_pre_exit(struct net *net)
{
        ip6t_unregister_table_pre_exit(net, "mangle");
}

/* pernet .exit callback: final unregistration of "mangle" in @net. */
static void __net_exit ip6table_mangle_net_exit(struct net *net)
{
        ip6t_unregister_table_exit(net, "mangle");
}

/* Per-network-namespace teardown callbacks for the mangle table. */
static struct pernet_operations ip6table_mangle_net_ops = {
        .pre_exit = ip6table_mangle_net_pre_exit,
        .exit = ip6table_mangle_net_exit,
};

/*
 * Module init: register the table template, allocate the hook ops and
 * register the pernet operations, in that order.  On failure, whatever
 * was already set up is undone and the error code is returned.
 */
static int __init ip6table_mangle_init(void)
{
        int err;

        err = xt_register_template(&packet_mangler,
                                   ip6table_mangle_table_init);
        if (err < 0)
                return err;

        mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook);
        if (IS_ERR(mangle_ops)) {
                err = PTR_ERR(mangle_ops);
                xt_unregister_template(&packet_mangler);
                return err;
        }

        err = register_pernet_subsys(&ip6table_mangle_net_ops);
        if (err >= 0)
                return err;

        /* pernet registration failed: drop the template, then the ops. */
        xt_unregister_template(&packet_mangler);
        kfree(mangle_ops);
        return err;
}

/*
 * Module exit: unregister the pernet operations and the table template,
 * then free the hook ops allocated in ip6table_mangle_init().
 */
static void __exit ip6table_mangle_fini(void)
{
        unregister_pernet_subsys(&ip6table_mangle_net_ops);
        xt_unregister_template(&packet_mangler);
        kfree(mangle_ops);
}

module_init(ip6table_mangle_init);
module_exit(ip6table_mangle_fini);


















































    1 

    1 





















    1 






    1 


    1 








    1 





































































    3 
    3 














































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/core/dst.c        Protocol independent destination cache.
 *
 * Authors:                Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>
#include <net/net_namespace.h>
#include <linux/sched.h>
#include <linux/prefetch.h>
#include <net/lwtunnel.h>
#include <net/xfrm.h>

#include <net/dst.h>
#include <net/dst_metadata.h>

/*
 * Output handler that discards the packet: frees @skb and returns 0.
 * @net and @sk are unused.  dst_init() installs this as the default
 * dst->output.
 */
int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL(dst_discard_out);

/* Shared default metrics; dst_init() points every new dst at these. */
const struct dst_metrics dst_default_metrics = {
        /* This initializer is needed to force the linker to place this
         * variable into a const section. Otherwise it might end up in the
         * bss section. We really want to avoid false sharing on this
         * variable, and catch any writes to it.
         */
        .refcnt = REFCOUNT_INIT(1),
};
EXPORT_SYMBOL(dst_default_metrics);

/*
 * Initialize a freshly allocated dst entry.
 *
 * @dst:              entry to initialize
 * @ops:              dst operations the entry belongs to
 * @dev:              device the entry refers to; a tracked reference is taken
 * @initial_obsolete: initial value of dst->obsolete
 * @flags:            DST_* flags; unless DST_NOCOUNT is set the entry is
 *                    added to the ops' entry counter
 *
 * The entry starts with the default metrics, the discard input/output
 * handlers, no error and a reference count of 1.
 */
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
              struct net_device *dev, int initial_obsolete,
              unsigned short flags)
{
        /* Owner and device, holding a tracked device reference. */
        dst->ops = ops;
        dst->dev = dev;
        netdev_hold(dev, &dst->dev_tracker, GFP_ATOMIC);

        /* Caller-supplied state. */
        dst->obsolete = initial_obsolete;
        dst->flags = flags;

        /* Until set up otherwise, packets on this dst are discarded. */
        dst->input = dst_discard;
        dst->output = dst_discard_out;

        /* Everything else starts from neutral defaults. */
        dst_init_metrics(dst, dst_default_metrics.metrics, true);
        dst->expires = 0UL;
        dst->error = 0;
        dst->header_len = 0;
        dst->trailer_len = 0;
        dst->lwtstate = NULL;
#ifdef CONFIG_XFRM
        dst->xfrm = NULL;
#endif
#ifdef CONFIG_IP_ROUTE_CLASSID
        dst->tclassid = 0;
#endif
        rcuref_init(&dst->__rcuref, 1);
        INIT_LIST_HEAD(&dst->rt_uncached);
        dst->__use = 0;
        dst->lastuse = jiffies;

        if (!(flags & DST_NOCOUNT))
                dst_entries_add(ops, 1);
}
EXPORT_SYMBOL(dst_init);

/*
 * Allocate a dst entry from @ops->kmem_cachep and initialise it with
 * dst_init().
 *
 * For counted entries (no DST_NOCOUNT) the ops->gc() hook, when present, is
 * invoked first if the fast entry count exceeds ops->gc_thresh.
 *
 * Returns the new entry, or NULL when the allocation fails.
 */
void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
                int initial_obsolete, unsigned short flags)
{
        struct dst_entry *entry;

        if (ops->gc && !(flags & DST_NOCOUNT)) {
                if (dst_entries_get_fast(ops) > ops->gc_thresh)
                        ops->gc(ops);
        }

        entry = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
        if (entry)
                dst_init(entry, ops, dev, initial_obsolete, flags);

        return entry;
}
EXPORT_SYMBOL(dst_alloc);

/*
 * Tear down @dst: drop it from the per-ops entry count (unless DST_NOCOUNT),
 * run the ops->destroy() hook if there is one, release the device and
 * lwtunnel state, free the memory, and finally release the xfrm child
 * (if any) with dst_release_immediate().
 */
static void dst_destroy(struct dst_entry *dst)
{
        struct dst_entry *child = NULL;

        smp_rmb();

#ifdef CONFIG_XFRM
        if (dst->xfrm) {
                struct xfrm_dst *xdst = (struct xfrm_dst *) dst;

                /* Saved now because @dst is freed before the child is released. */
                child = xdst->child;
        }
#endif
        if (!(dst->flags & DST_NOCOUNT))
                dst_entries_add(dst->ops, -1);

        if (dst->ops->destroy)
                dst->ops->destroy(dst);
        netdev_put(dst->dev, &dst->dev_tracker);

        lwtstate_put(dst->lwtstate);

        /* Metadata dsts have their own free routine; others go back to the cache. */
        if (dst->flags & DST_METADATA)
                metadata_dst_free((struct metadata_dst *)dst);
        else
                kmem_cache_free(dst->ops->kmem_cachep, dst);

        /* @dst is gone; from here on the variable refers to the child only. */
        dst = child;
        if (dst)
                dst_release_immediate(dst);
}

/* RCU callback: map @head back to its enclosing dst entry and destroy it. */
static void dst_destroy_rcu(struct rcu_head *head)
{
        dst_destroy(container_of(head, struct dst_entry, rcu_head));
}

/* Operations to mark dst as DEAD and clean up the net device referenced
 * by dst:
 * 1. put the dst under blackhole interface and discard all tx/rx packets
 *    on this route.
 * 2. release the net_device
 * This function should be called when removing routes from the fib tree
 * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to
 * make the next dst_ops->check() fail.
 */
void dst_dev_put(struct dst_entry *dst)
{
        /* Keep the original device; dst->dev is overwritten below. */
        struct net_device *dev = dst->dev;

        dst->obsolete = DST_OBSOLETE_DEAD;
        /* Let the owning ops react to the device going away, if it cares. */
        if (dst->ops->ifdown)
                dst->ops->ifdown(dst, dev);
        /* From now on every packet using this dst is dropped. */
        dst->input = dst_discard;
        dst->output = dst_discard_out;
        dst->dev = blackhole_netdev;
        /* Move the tracked device reference from @dev to blackhole_netdev. */
        netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker,
                           GFP_ATOMIC);
}
EXPORT_SYMBOL(dst_dev_put);

/*
 * Drop one reference on @dst (NULL is accepted and ignored).  When
 * rcuref_put() reports that the entry can be released, its destruction is
 * queued through call_rcu_hurry().
 */
void dst_release(struct dst_entry *dst)
{
        if (!dst)
                return;

        if (rcuref_put(&dst->__rcuref))
                call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
}
EXPORT_SYMBOL(dst_release);

/*
 * Drop one reference on @dst (NULL is accepted and ignored).  Unlike
 * dst_release(), the entry is destroyed synchronously when rcuref_put()
 * reports that it can be released.
 */
void dst_release_immediate(struct dst_entry *dst)
{
        if (!dst)
                return;

        if (rcuref_put(&dst->__rcuref))
                dst_destroy(dst);
}
EXPORT_SYMBOL(dst_release_immediate);

/*
 * Give @dst a private copy of its metrics.
 *
 * @dst: entry whose metrics are to be replaced
 * @old: the dst->_metrics value the caller observed
 *
 * A new dst_metrics block is allocated, filled from the block @old points
 * to, and installed with cmpxchg().
 *
 * Returns a pointer to the metrics array now in effect, or NULL if the
 * allocation failed or a concurrently installed value is marked read-only.
 */
u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
{
        struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);

        if (p) {
                struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old);
                unsigned long prev, new;

                refcount_set(&p->refcnt, 1);
                memcpy(p->metrics, old_p->metrics, sizeof(p->metrics));

                new = (unsigned long) p;
                prev = cmpxchg(&dst->_metrics, old, new);

                if (prev != old) {
                        /* Lost the race: discard our copy and use what is there now. */
                        kfree(p);
                        p = (struct dst_metrics *)__DST_METRICS_PTR(prev);
                        if (prev & DST_METRICS_READ_ONLY)
                                p = NULL;
                } else if (prev & DST_METRICS_REFCOUNTED) {
                        /* We replaced a refcounted block: drop the reference on it. */
                        if (refcount_dec_and_test(&old_p->refcnt))
                                kfree(old_p);
                }
        }
        /* The cast below relies on metrics[] being the first member. */
        BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0);
        return (u32 *)p;
}
EXPORT_SYMBOL(dst_cow_metrics_generic);

/*
 * Switch @dst back to the shared default metrics (tagged read-only) and,
 * if the swap from @old succeeded, free the block @old pointed to.
 *
 * Caller asserts that dst_metrics_read_only(dst) is false.
 */
void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
{
        unsigned long defaults, prev;

        defaults = (unsigned long) &dst_default_metrics;
        defaults |= DST_METRICS_READ_ONLY;

        prev = cmpxchg(&dst->_metrics, old, defaults);
        if (prev != old)
                return;

        kfree(__DST_METRICS_PTR(old));
}
EXPORT_SYMBOL(__dst_destroy_metrics_generic);

/* Blackhole ->check() handler: always returns NULL; @dst and @cookie are ignored. */
struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie)
{
        return NULL;
}

/* Blackhole ->cow_metrics() handler: never provides writable metrics (returns NULL). */
u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        return NULL;
}

/* Blackhole ->neigh_lookup() handler: always returns NULL; all arguments are ignored. */
struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst,
                                             struct sk_buff *skb,
                                             const void *daddr)
{
        return NULL;
}

/* Blackhole ->update_pmtu() handler: intentionally does nothing. */
void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
                               struct sk_buff *skb, u32 mtu,
                               bool confirm_neigh)
{
}
EXPORT_SYMBOL_GPL(dst_blackhole_update_pmtu);

/* Blackhole ->redirect() handler: intentionally does nothing. */
void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                            struct sk_buff *skb)
{
}
EXPORT_SYMBOL_GPL(dst_blackhole_redirect);

/*
 * Blackhole ->mtu() handler: return the raw RTAX_MTU metric of @dst when it
 * is non-zero, otherwise the MTU of the attached device.
 */
unsigned int dst_blackhole_mtu(const struct dst_entry *dst)
{
        unsigned int metric_mtu = dst_metric_raw(dst, RTAX_MTU);

        if (metric_mtu)
                return metric_mtu;

        return dst->dev->mtu;
}
EXPORT_SYMBOL_GPL(dst_blackhole_mtu);

/* Operations table wiring up the dst_blackhole_*() handlers defined in this file. */
static struct dst_ops dst_blackhole_ops = {
        .family = AF_UNSPEC,

        /* Handlers that always return NULL. */
        .check = dst_blackhole_check,
        .cow_metrics = dst_blackhole_cow_metrics,
        .neigh_lookup = dst_blackhole_neigh_lookup,

        /* Handlers that do nothing. */
        .update_pmtu = dst_blackhole_update_pmtu,
        .redirect = dst_blackhole_redirect,

        .mtu = dst_blackhole_mtu,
};

/*
 * Initialise @md_dst: set up the embedded dst entry as an uncounted
 * DST_METADATA blackhole entry with no device, zero everything that follows
 * the embedded dst (including @optslen trailing bytes), and record @type.
 */
static void __metadata_dst_init(struct metadata_dst *md_dst,
                                enum metadata_type type, u8 optslen)
{
        struct dst_entry *dst = &md_dst->dst;
        /* Bytes of the object that lie after the embedded dst entry. */
        size_t tail_len = sizeof(*md_dst) + optslen - sizeof(*dst);

        dst_init(dst, &dst_blackhole_ops, NULL, DST_OBSOLETE_NONE,
                 DST_METADATA | DST_NOCOUNT);
        memset(dst + 1, 0, tail_len);

        md_dst->type = type;
}

/*
 * Allocate a metadata dst with @optslen extra trailing bytes using the
 * allocation @flags, and initialise it for @type.
 *
 * Returns the new object, or NULL if the allocation fails.
 */
struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
                                        gfp_t flags)
{
        struct metadata_dst *md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);

        if (md_dst)
                __metadata_dst_init(md_dst, type, optslen);

        return md_dst;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc);

/*
 * Release the type-specific resources of @md_dst and free it:
 * IP-tunnel metadata destroys its dst cache (CONFIG_DST_CACHE only) and
 * XFRM metadata drops its reference on dst_orig.
 */
void metadata_dst_free(struct metadata_dst *md_dst)
{
        switch (md_dst->type) {
#ifdef CONFIG_DST_CACHE
        case METADATA_IP_TUNNEL:
                dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
                break;
#endif
        case METADATA_XFRM:
                dst_release(md_dst->u.xfrm_info.dst_orig);
                break;
        default:
                break;
        }

        kfree(md_dst);
}
EXPORT_SYMBOL_GPL(metadata_dst_free);

/*
 * Allocate one metadata dst per possible CPU, each with @optslen extra
 * trailing bytes, and initialise every copy for @type.
 *
 * Returns the per-CPU pointer, or NULL if the allocation fails.
 */
struct metadata_dst __percpu *
metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
{
        struct metadata_dst __percpu *pcpu_md;
        int cpu;

        pcpu_md = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen,
                                     __alignof__(struct metadata_dst), flags);
        if (pcpu_md) {
                for_each_possible_cpu(cpu) {
                        struct metadata_dst *one = per_cpu_ptr(pcpu_md, cpu);

                        __metadata_dst_init(one, type, optslen);
                }
        }

        return pcpu_md;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);

/*
 * Release the type-specific resources of every per-CPU copy in @md_dst
 * (dst cache for IP-tunnel metadata when CONFIG_DST_CACHE is set, dst_orig
 * reference for XFRM metadata), then free the per-CPU allocation.
 */
void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct metadata_dst *cur = per_cpu_ptr(md_dst, cpu);

                switch (cur->type) {
#ifdef CONFIG_DST_CACHE
                case METADATA_IP_TUNNEL:
                        dst_cache_destroy(&cur->u.tun_info.dst_cache);
                        break;
#endif
                case METADATA_XFRM:
                        dst_release(cur->u.xfrm_info.dst_orig);
                        break;
                default:
                        break;
                }
        }

        free_percpu(md_dst);
}
EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);




















































































    3 
































































    6 

























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PID_H
#define _LINUX_PID_H

#include <linux/pid_types.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/wait.h>

/*
 * What is struct pid?
 *
 * A struct pid is the kernel's internal notion of a process identifier.
 * It refers to individual tasks, process groups, and sessions.  While
 * there are processes attached to it the struct pid lives in a hash
 * table, so it and then the processes that it refers to can be found
 * quickly from the numeric pid value.  The attached processes may be
 * quickly accessed by following pointers from struct pid.
 *
 * Storing pid_t values in the kernel and referring to them later has a
 * problem.  The process originally with that pid may have exited and the
 * pid allocator wrapped, and another process could have come along
 * and been assigned that pid.
 *
 * Referring to user space processes by holding a reference to struct
 * task_struct has a problem.  When the user space process exits
 * the now useless task_struct is still kept.  A task_struct plus a
 * stack consumes around 10K of low kernel memory.  More precisely
 * this is THREAD_SIZE + sizeof(struct task_struct).  By comparison
 * a struct pid is about 64 bytes.
 *
 * Holding a reference to struct pid solves both of these problems.
 * It is small so holding a reference does not consume a lot of
 * resources, and since a new struct pid is allocated when the numeric pid
 * value is reused (when pids wrap around) we don't mistakenly refer to new
 * processes.
 */


/*
 * struct upid is used to get the id of the struct pid, as it is
 * seen in particular namespace. Later the struct pid is found with
 * find_pid_ns() using the int nr and struct pid_namespace *ns.
 */

#define RESERVED_PIDS 300

/* One (number, namespace) pair: the id of a struct pid as seen in @ns. */
struct upid {
        int nr;                         /* numeric id within @ns */
        struct pid_namespace *ns;       /* namespace @nr is valid in */
};

/* Kernel-internal process identifier; see the "What is struct pid?" comment in this header. */
struct pid
{
        /* reference count; incremented by get_pid() */
        refcount_t count;
        /* index into numbers[] of the entry for the namespace this pid was allocated in */
        unsigned int level;
        spinlock_t lock;
        struct dentry *stashed;
        u64 ino;
        /* lists of tasks that use this pid */
        struct hlist_head tasks[PIDTYPE_MAX];
        struct hlist_head inodes;
        /* wait queue for pidfd notifications */
        wait_queue_head_t wait_pidfd;
        /* NOTE(review): presumably used for RCU-deferred freeing - confirm in kernel/pid.c */
        struct rcu_head rcu;
        /* one entry per namespace level; numbers[0].nr is the id returned by pid_nr() */
        struct upid numbers[];
};

extern struct pid init_struct_pid;

struct file;

struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
void do_notify_pidfd(struct task_struct *task);

/* Take a reference on @pid (NULL is allowed) and return @pid unchanged. */
static inline struct pid *get_pid(struct pid *pid)
{
        if (!pid)
                return pid;

        refcount_inc(&pid->count);

        return pid;
}

extern void put_pid(struct pid *pid);
extern struct task_struct *pid_task(struct pid *pid, enum pid_type);
/* True when at least one task is attached to @pid under the given @type. */
static inline bool pid_has_task(struct pid *pid, enum pid_type type)
{
        struct hlist_head *attached = &pid->tasks[type];

        return !hlist_empty(attached);
}
extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);

extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);

/*
 * these helpers must be called with the tasklist_lock write-held.
 */
extern void attach_pid(struct task_struct *task, enum pid_type);
extern void detach_pid(struct task_struct *task, enum pid_type);
extern void change_pid(struct task_struct *task, enum pid_type,
                        struct pid *pid);
extern void exchange_tids(struct task_struct *task, struct task_struct *old);
extern void transfer_pid(struct task_struct *old, struct task_struct *new,
                         enum pid_type);

extern int pid_max;
extern int pid_max_min, pid_max_max;

/*
 * look up a PID in the hash table. Must be called with the tasklist_lock
 * or rcu_read_lock() held.
 *
 * find_pid_ns() finds the pid in the namespace specified
 * find_vpid() finds the pid by its virtual id, i.e. in the current namespace
 *
 * see also find_task_by_vpid() set in include/linux/sched.h
 */
extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns);
extern struct pid *find_vpid(int nr);

/*
 * Lookup a PID in the hash table, and return with it's count elevated.
 */
extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);

extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
                             size_t set_tid_size);
extern void free_pid(struct pid *pid);
extern void disable_pid_allocation(struct pid_namespace *ns);

/*
 * ns_of_pid() returns the pid namespace in which the specified pid was
 * allocated.
 *
 * NOTE:
 *         ns_of_pid() is expected to be called for a process (task) that has
 *         an attached 'struct pid' (see attach_pid(), detach_pid()) i.e @pid
 *         is expected to be non-NULL. If @pid is NULL, caller should handle
 *         the resulting NULL pid-ns.
 */
static inline struct pid_namespace *ns_of_pid(struct pid *pid)
{
        /* A NULL pid has no namespace; the caller must cope with NULL. */
        if (!pid)
                return NULL;

        return pid->numbers[pid->level].ns;
}

/*
 * is_child_reaper returns true if the pid is the init process
 * of the current namespace. As this one could be checked before
 * pid_ns->child_reaper is assigned in copy_process, we check
 * with the pid number.
 */
static inline bool is_child_reaper(struct pid *pid)
{
        return pid->numbers[pid->level].nr == 1;
}

/*
 * the helpers to get the pid's id seen from different namespaces
 *
 * pid_nr()    : global id, i.e. the id seen from the init namespace;
 * pid_vnr()   : virtual id, i.e. the id seen from the pid namespace of
 *               current.
 * pid_nr_ns() : id seen from the ns specified.
 *
 * see also task_xid_nr() etc in include/linux/sched.h
 */

/* Number of @pid in numbers[0], or 0 when @pid is NULL. */
static inline pid_t pid_nr(struct pid *pid)
{
        return pid ? pid->numbers[0].nr : 0;
}

pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
pid_t pid_vnr(struct pid *pid);

/*
 * do_each_pid_task() / while_each_pid_task() bracket a loop body and must be
 * used as a pair: the first opens a do { ... } block and, when @pid is not
 * NULL, walks the tasks on pid->tasks[type] with hlist_for_each_entry_rcu();
 * the second closes the braces.  For PIDTYPE_PID the walk stops after the
 * first task.
 */
#define do_each_pid_task(pid, type, task)                                \
        do {                                                                \
                if ((pid) != NULL)                                        \
                        hlist_for_each_entry_rcu((task),                \
                                &(pid)->tasks[type], pid_links[type]) {

                        /*
                         * Both old and new leaders may be attached to
                         * the same pid in the middle of de_thread().
                         */
#define while_each_pid_task(pid, type, task)                                \
                                if (type == PIDTYPE_PID)                \
                                        break;                                \
                        }                                                \
        } while (0)

/*
 * do_each_pid_thread() / while_each_pid_thread() are the same kind of pair,
 * but additionally run for_each_thread() for every task found.  The task
 * found by the outer walk is kept in tg___ and restored into @task after
 * the inner thread loop.
 */
#define do_each_pid_thread(pid, type, task)                                \
        do_each_pid_task(pid, type, task) {                                \
                struct task_struct *tg___ = task;                        \
                for_each_thread(tg___, task) {

#define while_each_pid_thread(pid, type, task)                                \
                }                                                        \
                task = tg___;                                                \
        } while_each_pid_task(pid, type, task)

/* Return the struct pid stored in @task->thread_pid. */
static inline struct pid *task_pid(struct task_struct *task)
{
        struct pid *thread_pid = task->thread_pid;

        return thread_pid;
}

/*
 * the helpers to get the task's different pids as they are seen
 * from various namespaces
 *
 * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
 * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
 *                     current.
 * task_xid_nr_ns()  : id seen from the ns specified;
 *
 * see also pid_nr() etc in include/linux/pid.h
 */
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);

/* Return the pid number stored directly in @tsk->pid. */
static inline pid_t task_pid_nr(struct task_struct *tsk)
{
        pid_t nr = tsk->pid;

        return nr;
}

/* PIDTYPE_PID id of @tsk as seen from @ns. */
static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t nr = __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);

        return nr;
}

/* PIDTYPE_PID id of @tsk; a NULL namespace is passed to __task_pid_nr_ns(). */
static inline pid_t task_pid_vnr(struct task_struct *tsk)
{
        pid_t nr = __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);

        return nr;
}


/* Return the thread-group id stored directly in @tsk->tgid. */
static inline pid_t task_tgid_nr(struct task_struct *tsk)
{
        pid_t tgid = tsk->tgid;

        return tgid;
}

/**
 * pid_alive - check that a task structure is not stale
 * @p: Task structure to be checked.
 *
 * Test if a process is not yet dead (at most zombie state)
 * If pid_alive fails, then pointers within the task structure
 * can be stale and must not be dereferenced.
 *
 * Return: 1 if the process is alive. 0 otherwise.
 */
static inline int pid_alive(const struct task_struct *p)
{
        /* The task counts as alive while it still has a thread_pid. */
        if (p->thread_pid)
                return 1;

        return 0;
}

/* PIDTYPE_PGID id of @tsk as seen from @ns. */
static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t pgrp = __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);

        return pgrp;
}

/* PIDTYPE_PGID id of @tsk; a NULL namespace is passed to __task_pid_nr_ns(). */
static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
{
        pid_t pgrp = __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);

        return pgrp;
}


/* PIDTYPE_SID id of @tsk as seen from @ns. */
static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t sid = __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);

        return sid;
}

/* PIDTYPE_SID id of @tsk; a NULL namespace is passed to __task_pid_nr_ns(). */
static inline pid_t task_session_vnr(struct task_struct *tsk)
{
        pid_t sid = __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);

        return sid;
}

/* PIDTYPE_TGID id of @tsk as seen from @ns. */
static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t tgid = __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);

        return tgid;
}

/* PIDTYPE_TGID id of @tsk; a NULL namespace is passed to __task_pid_nr_ns(). */
static inline pid_t task_tgid_vnr(struct task_struct *tsk)
{
        pid_t tgid = __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);

        return tgid;
}

/*
 * Thread-group id of @tsk's real parent as seen from @ns, or 0 when
 * pid_alive(@tsk) is false.  The parent pointer is read under
 * rcu_read_lock().
 */
static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t ppid;

        rcu_read_lock();
        ppid = pid_alive(tsk) ?
                task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns) : 0;
        rcu_read_unlock();

        return ppid;
}

/* Parent thread-group id of @tsk as seen from init_pid_ns. */
static inline pid_t task_ppid_nr(const struct task_struct *tsk)
{
        pid_t ppid = task_ppid_nr_ns(tsk, &init_pid_ns);

        return ppid;
}

/* Obsolete, do not use: process-group id of @tsk as seen from init_pid_ns. */
static inline pid_t task_pgrp_nr(struct task_struct *tsk)
{
        pid_t pgrp = task_pgrp_nr_ns(tsk, &init_pid_ns);

        return pgrp;
}

/**
 * is_global_init - check if a task structure is init. Since init
 * is free to have sub-threads we need to check tgid.
 * @tsk: Task structure to be checked.
 *
 * Check if a task structure is the first user space task the kernel created.
 *
 * Return: 1 if the task structure is init. 0 otherwise.
 */
static inline int is_global_init(struct task_struct *tsk)
{
        return task_tgid_nr(tsk) == 1;
}

#endif /* _LINUX_PID_H */






























































































































































































































































































































































































































    2 







    1 









































































































































































































































































































































































































































































































































    1 










































































































































































































































































    3 




    2 













































































































































































































    1 
    1 



































































    9 




















    1 




    1 













































































    2 








    2 

























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock LSM - Filesystem management and hooks
 *
 * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
 * Copyright © 2018-2020 ANSSI
 * Copyright © 2021-2022 Microsoft Corporation
 * Copyright © 2022 Günther Noack <gnoack3000@gmail.com>
 * Copyright © 2023-2024 Google LLC
 */

#include <asm/ioctls.h>
#include <kunit/test.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/bits.h>
#include <linux/compiler_types.h>
#include <linux/dcache.h>
#include <linux/err.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/limits.h>
#include <linux/list.h>
#include <linux/lsm_hooks.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/path.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/types.h>
#include <linux/wait_bit.h>
#include <linux/workqueue.h>
#include <uapi/linux/fiemap.h>
#include <uapi/linux/landlock.h>

#include "common.h"
#include "cred.h"
#include "fs.h"
#include "limits.h"
#include "object.h"
#include "ruleset.h"
#include "setup.h"

/* Underlying object management */

/*
 * Release callback for a Landlock object whose underlying object is an
 * inode.  Entered with object->lock held (see the __releases() annotation)
 * and unlocks it on every path.  If an inode is still attached, it is
 * detached from the object, the inode's Landlock object pointer is cleared
 * and the inode reference is dropped with iput().
 */
static void release_inode(struct landlock_object *const object)
        __releases(object->lock)
{
        struct inode *const inode = object->underobj;
        struct super_block *sb;

        /* Nothing attached any more: only the lock has to be released. */
        if (!inode) {
                spin_unlock(&object->lock);
                return;
        }

        /*
         * Protects against concurrent use by hook_sb_delete() of the reference
         * to the underlying inode.
         */
        object->underobj = NULL;
        /*
         * Makes sure that if the filesystem is concurrently unmounted,
         * hook_sb_delete() will wait for us to finish iput().
         */
        sb = inode->i_sb;
        atomic_long_inc(&landlock_superblock(sb)->inode_refs);
        spin_unlock(&object->lock);
        /*
         * Because object->underobj was not NULL, hook_sb_delete() and
         * get_inode_object() guarantee that it is safe to reset
         * landlock_inode(inode)->object while it is not NULL.  It is therefore
         * not necessary to lock inode->i_lock.
         */
        rcu_assign_pointer(landlock_inode(inode)->object, NULL);
        /*
         * Now, new rules can safely be tied to @inode with get_inode_object().
         */

        iput(inode);
        /* Drop the count taken above and wake any waiter once it reaches zero. */
        if (atomic_long_dec_and_test(&landlock_superblock(sb)->inode_refs))
                wake_up_var(&landlock_superblock(sb)->inode_refs);
}

/* Underlying-object operations for Landlock objects that wrap an inode. */
static const struct landlock_object_underops landlock_fs_underops = {
        .release = release_inode,
};

/* IOCTL helpers */

/**
 * is_masked_device_ioctl - Tell whether an IOCTL command on a device file is
 * always permitted by Landlock, i.e. can not be restricted on device files by
 * enforcing a Landlock policy.
 *
 * @cmd: The IOCTL command that is about to be run.
 *
 * IOCTLs on device files normally require the LANDLOCK_ACCESS_FS_IOCTL_DEV
 * right.  A command is nevertheless blanket-permitted when both of the
 * following hold:
 *
 * 1. It is implemented by do_vfs_ioctl() in fs/ioctl.c, not by
 *    f_ops->unlocked_ioctl() or f_ops->compat_ioctl().
 *
 * 2. Invoking it on a device is harmless.
 *
 * Commands that do not make sense for devices are let through as well when
 * the do_vfs_ioctl() implementation returns a more conventional error code.
 *
 * Whenever a new IOCTL command gets implemented in do_vfs_ioctl(), it should
 * be considered for inclusion here.
 *
 * Returns: true if the IOCTL @cmd can not be restricted with Landlock for
 * device files.
 */
static __attribute_const__ bool is_masked_device_ioctl(const unsigned int cmd)
{
        bool masked;

        switch (cmd) {
        /*
         * Close-on-exec flag of the FD (FIOCLEX, FIONCLEX) and buffered-IO
         * and async flags of the file (FIONBIO, FIOASYNC).  fcntl(2) offers
         * the same operations, which Landlock permits unconditionally.
         */
        case FIOCLEX:
        case FIONCLEX:
        case FIONBIO:
        case FIOASYNC:
        /*
         * Size query for a regular file, directory, or link.  Still
         * permitted because other file types always get -ENOTTY.
         */
        case FIOQSIZE:
        /*
         * Superblock-level commands: freezing and thawing the file system
         * (FIFREEZE and FITHAW, which require CAP_SYS_ADMIN) and querying its
         * block size (FIGETBSZ).  They act on the file system the file
         * belongs to rather than on the file itself, and are equally
         * reachable through any other file or directory of that file system,
         * so permitting them is safe.
         */
        case FIFREEZE:
        case FITHAW:
        case FIGETBSZ:
        /*
         * Block allocation query.  Only meaningful for regular files and not
         * implemented by devices, hence harmless to permit.
         */
        case FS_IOC_FIEMAP:
        /*
         * Storage sharing ("reflink") between a source and a destination FD,
         * on file systems which support it.  These commands only apply to
         * regular files and are harmless to permit for device files.
         */
        case FICLONE:
        case FICLONERANGE:
        case FIDEDUPERANGE:
        /*
         * Both commands operate on the file system superblock, not on the
         * specific file, so they are available through any other file of the
         * same file system as well.
         */
        case FS_IOC_GETFSUUID:
        case FS_IOC_GETFSSYSFSPATH:
                masked = true;
                break;

        /*
         * Deliberately absent from the list above, because they are forwarded
         * to device implementations:
         *
         * - FIONREAD, FS_IOC_GETFLAGS, FS_IOC_SETFLAGS, FS_IOC_FSGETXATTR and
         *   FS_IOC_FSSETXATTR;
         *
         * - the file_ioctl() commands (FIBMAP, FS_IOC_RESVSP,
         *   FS_IOC_RESVSP64, FS_IOC_UNRESVSP, FS_IOC_UNRESVSP64 and
         *   FS_IOC_ZERO_RANGE).
         */

        /* Every other command is guarded by the access right. */
        default:
                masked = false;
                break;
        }
        return masked;
}

/*
 * is_masked_device_ioctl_compat - "compat" counterpart of
 * is_masked_device_ioctl().
 *
 * IOCTL commands which get special handling in compat mode must be treated
 * like their non-compat counterparts; every other command is delegated to
 * is_masked_device_ioctl().
 */
static __attribute_const__ bool
is_masked_device_ioctl_compat(const unsigned int cmd)
{
        switch (cmd) {
#ifdef CONFIG_X86_64
        /*
         * Not blanket-permitted, to stay consistent with the non-compat
         * variants of these space reservation and zero-range commands.
         */
        case FS_IOC_RESVSP_32:
        case FS_IOC_RESVSP64_32:
        case FS_IOC_UNRESVSP_32:
        case FS_IOC_UNRESVSP64_32:
        case FS_IOC_ZERO_RANGE_32:
#endif
        /* Forwarded to their device implementations. */
        case FS_IOC32_GETFLAGS:
        case FS_IOC32_SETFLAGS:
                return false;

        /* Permitted, same as in the non-compat variant. */
        case FICLONE:
                return true;

        default:
                break;
        }

        /* No compat-specific handling: defer to the common helper. */
        return is_masked_device_ioctl(cmd);
}

/* Ruleset management */

/*
 * get_inode_object - Get the Landlock object tied to @inode, creating it if
 * there is none yet
 *
 * @inode: Inode to look up; its Landlock security blob holds the object
 *         pointer.
 *
 * Returns the already attached object after incrementing its usage counter,
 * or the object freshly created by landlock_create_object() and now attached
 * to @inode, or the error pointer returned by landlock_create_object().
 */
static struct landlock_object *get_inode_object(struct inode *const inode)
{
        struct landlock_object *object, *new_object;
        struct landlock_inode_security *inode_sec = landlock_inode(inode);

        rcu_read_lock();
retry:
        /* Every jump to this label is made with the RCU read lock held. */
        object = rcu_dereference(inode_sec->object);
        if (object) {
                if (likely(refcount_inc_not_zero(&object->usage))) {
                        rcu_read_unlock();
                        return object;
                }
                /*
                 * We are racing with release_inode(), the object is going
                 * away.  Wait for release_inode(), then retry.
                 */
                spin_lock(&object->lock);
                spin_unlock(&object->lock);
                goto retry;
        }
        rcu_read_unlock();

        /*
         * If there is no object tied to @inode, then create a new one (without
         * holding any locks).
         */
        new_object = landlock_create_object(&landlock_fs_underops, inode);
        if (IS_ERR(new_object))
                return new_object;

        /*
         * Protects against concurrent calls to get_inode_object() or
         * hook_sb_delete().
         */
        spin_lock(&inode->i_lock);
        if (unlikely(rcu_access_pointer(inode_sec->object))) {
                /* Someone else just created the object, bail out and retry. */
                spin_unlock(&inode->i_lock);
                /* @new_object was never stored in @inode_sec->object. */
                kfree(new_object);

                rcu_read_lock();
                goto retry;
        }

        /*
         * @inode will be released by hook_sb_delete() on its superblock
         * shutdown, or by release_inode() when no more ruleset references the
         * related object.
         */
        ihold(inode);
        rcu_assign_pointer(inode_sec->object, new_object);
        spin_unlock(&inode->i_lock);
        return new_object;
}

/*
 * All access rights that can be tied to files.  Rights outside of this mask
 * only make sense for directories: landlock_append_fs_rule() rejects them for
 * non-directory paths, and no_more_access() ignores them for non-directory
 * children.
 */
/* clang-format off */
#define ACCESS_FILE ( \
        LANDLOCK_ACCESS_FS_EXECUTE | \
        LANDLOCK_ACCESS_FS_WRITE_FILE | \
        LANDLOCK_ACCESS_FS_READ_FILE | \
        LANDLOCK_ACCESS_FS_TRUNCATE | \
        LANDLOCK_ACCESS_FS_IOCTL_DEV)
/* clang-format on */

/*
 * Ties @access_rights to the inode behind @path in @ruleset.
 *
 * @ruleset: Ruleset to extend; it must have exactly one layer.
 * @path: Should have been checked by get_path_from_fd().
 * @access_rights: Rights to allow; only file-related rights are accepted when
 *                 @path is not a directory.
 *
 * Returns -EINVAL for rights that do not fit a non-directory or for an
 * unexpected number of layers, the error reported by get_inode_object(), or
 * otherwise the result of landlock_insert_rule().
 */
int landlock_append_fs_rule(struct landlock_ruleset *const ruleset,
                            const struct path *const path,
                            access_mask_t access_rights)
{
        struct landlock_id id = {
                .type = LANDLOCK_KEY_INODE,
        };
        int ret;

        /* A non-directory may only carry rights that make sense for files. */
        if (!d_is_dir(path->dentry) && (access_rights & ~ACCESS_FILE))
                return -EINVAL;
        if (WARN_ON_ONCE(ruleset->num_layers != 1))
                return -EINVAL;

        /*
         * Turns relative access rights into absolute ones: every filesystem
         * right which is not handled by the ruleset gets added.
         */
        access_rights |= ~landlock_get_fs_access_mask(ruleset, 0) &
                         LANDLOCK_MASK_ACCESS_FS;

        id.key.object = get_inode_object(d_backing_inode(path->dentry));
        if (IS_ERR(id.key.object))
                return PTR_ERR(id.key.object);

        mutex_lock(&ruleset->lock);
        ret = landlock_insert_rule(ruleset, id, access_rights);
        mutex_unlock(&ruleset->lock);

        /*
         * The reference is dropped whatever @ret is: landlock_insert_rule()
         * increments the refcount for the new object if needed.
         */
        landlock_put_object(id.key.object);
        return ret;
}

/* Access-control management */

/*
 * Looks up in @domain the rule tied to the inode backing @dentry.
 *
 * The lifetime of the returned rule is tied to @domain.
 *
 * Returns NULL if no rule is found or if @dentry is negative.
 */
static const struct landlock_rule *
find_rule(const struct landlock_ruleset *const domain,
          const struct dentry *const dentry)
{
        struct landlock_id id = {
                .type = LANDLOCK_KEY_INODE,
        };
        const struct landlock_rule *match = NULL;
        const struct inode *backing;

        /* Nonexistent leaves are ignored: there is no inode to look up. */
        if (!d_is_negative(dentry)) {
                backing = d_backing_inode(dentry);
                rcu_read_lock();
                id.key.object =
                        rcu_dereference(landlock_inode(backing)->object);
                match = landlock_find_rule(domain, id);
                rcu_read_unlock();
        }
        return match;
}

/*
 * Tells whether @dentry sits on a superblock flagged SB_NOUSER or is backed
 * by a private inode.
 *
 * This allows access to pseudo filesystems that will never be mountable (e.g.
 * sockfs, pipefs), but can still be reachable through
 * /proc/<pid>/fd/<file-descriptor>
 */
static bool is_nouser_or_private(const struct dentry *dentry)
{
        if (dentry->d_sb->s_flags & SB_NOUSER)
                return true;

        /* Only a positive dentry has a backing inode to inspect. */
        if (!d_is_positive(dentry))
                return false;

        return unlikely(IS_PRIVATE(d_backing_inode(dentry)));
}

/* Returns the union of the raw filesystem access masks of all @domain layers. */
static access_mask_t
get_raw_handled_fs_accesses(const struct landlock_ruleset *const domain)
{
        access_mask_t handled = 0;
        size_t layer = 0;

        while (layer < domain->num_layers) {
                handled |= landlock_get_raw_fs_access_mask(domain, layer);
                layer++;
        }
        return handled;
}

/*
 * Returns the accesses handled by any layer of @domain, plus the access
 * rights that are initially denied by default.
 */
static access_mask_t
get_handled_fs_accesses(const struct landlock_ruleset *const domain)
{
        const access_mask_t raw = get_raw_handled_fs_accesses(domain);

        return raw | LANDLOCK_ACCESS_FS_INITIALLY_DENIED;
}

/*
 * Returns @domain if it is set and at least one of its layers handles a
 * filesystem access, NULL otherwise.
 */
static const struct landlock_ruleset *
get_fs_domain(const struct landlock_ruleset *const domain)
{
        if (domain && get_raw_handled_fs_accesses(domain))
                return domain;

        return NULL;
}

/* Same as get_fs_domain(), applied to the domain of the current task. */
static const struct landlock_ruleset *get_current_fs_domain(void)
{
        const struct landlock_ruleset *const current_dom =
                landlock_get_current_domain();

        return get_fs_domain(current_dom);
}

/*
 * Checks that a destination file hierarchy is at least as restricted as a
 * source file hierarchy.  This is only used for link and rename actions.
 *
 * @layer_masks_child2: Optional child masks; when set, the inverted check
 *                      needed by RENAME_EXCHANGE is performed as well.
 *
 * Returns true if referring cannot grant more access, false otherwise.
 */
static bool no_more_access(
        const layer_mask_t (*const layer_masks_parent1)[LANDLOCK_NUM_ACCESS_FS],
        const layer_mask_t (*const layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS],
        const bool child1_is_directory,
        const layer_mask_t (*const layer_masks_parent2)[LANDLOCK_NUM_ACCESS_FS],
        const layer_mask_t (*const layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS],
        const bool child2_is_directory)
{
        unsigned long bit;

        for (bit = 0; bit < ARRAY_SIZE(*layer_masks_parent2); bit++) {
                /*
                 * Accesses that only make sense for directories are ignored
                 * for children that are not directories.
                 */
                const bool file_access = (BIT_ULL(bit) & ACCESS_FILE) != 0;

                /*
                 * The destination restrictions must be a superset of the
                 * source ones (i.e. inherited access rights without child
                 * exceptions):
                 * restrictions(parent2) >= restrictions(child1)
                 *
                 * In other words, no layer may deny the access on both
                 * parent1 and child1 without also denying it on parent2.
                 */
                if ((child1_is_directory || file_access) &&
                    ((*layer_masks_parent1)[bit] &
                     (*layer_masks_child1)[bit] &
                     ~(*layer_masks_parent2)[bit]))
                        return false;

                /*
                 * Inverted restrictions for RENAME_EXCHANGE:
                 * restrictions(parent1) >= restrictions(child2)
                 */
                if (layer_masks_child2 &&
                    (child2_is_directory || file_access) &&
                    ((*layer_masks_parent2)[bit] &
                     (*layer_masks_child2)[bit] &
                     ~(*layer_masks_parent1)[bit]))
                        return false;
        }
        return true;
}

/*
 * KUnit shorthands expecting no_more_access() to return true (NMA_TRUE) or
 * false (NMA_FALSE).  They expand to a use of a variable named "test", which
 * must be in scope at the call site.
 */
#define NMA_TRUE(...) KUNIT_EXPECT_TRUE(test, no_more_access(__VA_ARGS__))
#define NMA_FALSE(...) KUNIT_EXPECT_FALSE(test, no_more_access(__VA_ARGS__))

#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST

/*
 * KUnit case for no_more_access(), driven by hand-built layer masks.
 *
 * Mask naming: the letters list the accesses with layer bits set (x for
 * EXECUTE, r for READ_FILE, m for MAKE_REG) and the digits list which layer
 * bits are set (0, 1, or both).  allows_all has no bit set at all.
 */
static void test_no_more_access(struct kunit *const test)
{
        const layer_mask_t rx0[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
                [BIT_INDEX(LANDLOCK_ACCESS_FS_READ_FILE)] = BIT_ULL(0),
        };
        const layer_mask_t mx0[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
                [BIT_INDEX(LANDLOCK_ACCESS_FS_MAKE_REG)] = BIT_ULL(0),
        };
        const layer_mask_t x0[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
        };
        const layer_mask_t x1[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(1),
        };
        const layer_mask_t x01[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0) |
                                                          BIT_ULL(1),
        };
        const layer_mask_t allows_all[LANDLOCK_NUM_ACCESS_FS] = {};

        /* Checks without restriction. */
        NMA_TRUE(&x0, &allows_all, false, &allows_all, NULL, false);
        NMA_TRUE(&allows_all, &x0, false, &allows_all, NULL, false);
        NMA_FALSE(&x0, &x0, false, &allows_all, NULL, false);

        /*
         * Checks that we can only refer a file if no more access could be
         * inherited.
         */
        NMA_TRUE(&x0, &x0, false, &rx0, NULL, false);
        NMA_TRUE(&rx0, &rx0, false, &rx0, NULL, false);
        NMA_FALSE(&rx0, &rx0, false, &x0, NULL, false);
        NMA_FALSE(&rx0, &rx0, false, &x1, NULL, false);

        /* Checks allowed referring with different nested domains. */
        NMA_TRUE(&x0, &x1, false, &x0, NULL, false);
        NMA_TRUE(&x1, &x0, false, &x0, NULL, false);
        NMA_TRUE(&x0, &x01, false, &x0, NULL, false);
        NMA_TRUE(&x0, &x01, false, &rx0, NULL, false);
        NMA_TRUE(&x01, &x0, false, &x0, NULL, false);
        NMA_TRUE(&x01, &x0, false, &rx0, NULL, false);
        NMA_FALSE(&x01, &x01, false, &x0, NULL, false);

        /* Checks that file access rights are also enforced for a directory. */
        NMA_FALSE(&rx0, &rx0, true, &x0, NULL, false);

        /* Checks that directory access rights don't impact file referring... */
        NMA_TRUE(&mx0, &mx0, false, &x0, NULL, false);
        /* ...but only directory referring. */
        NMA_FALSE(&mx0, &mx0, true, &x0, NULL, false);

        /* Checks directory exchange. */
        NMA_TRUE(&mx0, &mx0, true, &mx0, &mx0, true);
        NMA_TRUE(&mx0, &mx0, true, &mx0, &x0, true);
        NMA_FALSE(&mx0, &mx0, true, &x0, &mx0, true);
        NMA_FALSE(&mx0, &mx0, true, &x0, &x0, true);
        NMA_FALSE(&mx0, &mx0, true, &x1, &x1, true);

        /* Checks file exchange with directory access rights... */
        NMA_TRUE(&mx0, &mx0, false, &mx0, &mx0, false);
        NMA_TRUE(&mx0, &mx0, false, &mx0, &x0, false);
        NMA_TRUE(&mx0, &mx0, false, &x0, &mx0, false);
        NMA_TRUE(&mx0, &mx0, false, &x0, &x0, false);
        /* ...and with file access rights. */
        NMA_TRUE(&rx0, &rx0, false, &rx0, &rx0, false);
        NMA_TRUE(&rx0, &rx0, false, &rx0, &x0, false);
        NMA_FALSE(&rx0, &rx0, false, &x0, &rx0, false);
        NMA_FALSE(&rx0, &rx0, false, &x0, &x0, false);
        NMA_FALSE(&rx0, &rx0, false, &x1, &x1, false);

        /*
         * Allowing the following requests should not be a security risk
         * because domain 0 denies execute access, and domain 1 is always
         * nested with domain 0.  However, adding an exception for this case
         * would mean to check all nested domains to make sure none can get
         * more privileges (e.g. processes only sandboxed by domain 0).
         * Moreover, this behavior (i.e. composition of N domains) could then
         * be inconsistent compared to domain 1's ruleset alone (e.g. it might
         * be denied to link/rename with domain 1's ruleset, whereas it would
         * be allowed if nested on top of domain 0).  Another drawback would be
         * to create a covert channel that could enable sandboxed processes to
         * infer most of the filesystem restrictions from their domain.  To
         * make it simple, efficient, safe, and more consistent, this case is
         * always denied.
         */
        NMA_FALSE(&x1, &x1, false, &x0, NULL, false);
        NMA_FALSE(&x1, &x1, false, &rx0, NULL, false);
        NMA_FALSE(&x1, &x1, true, &x0, NULL, false);
        NMA_FALSE(&x1, &x1, true, &rx0, NULL, false);

        /* Checks the same case of exclusive domains with a file... */
        NMA_TRUE(&x1, &x1, false, &x01, NULL, false);
        NMA_FALSE(&x1, &x1, false, &x01, &x0, false);
        NMA_FALSE(&x1, &x1, false, &x01, &x01, false);
        NMA_FALSE(&x1, &x1, false, &x0, &x0, false);
        /* ...and with a directory. */
        NMA_FALSE(&x1, &x1, false, &x0, &x0, true);
        NMA_FALSE(&x1, &x1, true, &x0, &x0, false);
        NMA_FALSE(&x1, &x1, true, &x0, &x0, true);
}

#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */

#undef NMA_TRUE
#undef NMA_FALSE

/*
 * Clears from @layer_masks every access that is not part of @access_request.
 *
 * Returns true if the request is allowed, i.e. if the matrix is all zeroes
 * once scoped, false otherwise.  A NULL @layer_masks triggers a one-time
 * warning and is reported as allowed.
 */
static bool
scope_to_request(const access_mask_t access_request,
                 layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS])
{
        /* The bit iterator below works on an unsigned long bitmap. */
        const unsigned long requested = access_request;
        unsigned long bit;

        if (WARN_ON_ONCE(!layer_masks))
                return true;

        for_each_clear_bit(bit, &requested, ARRAY_SIZE(*layer_masks)) {
                (*layer_masks)[bit] = 0;
        }

        /* Any byte left set means that some layer still denies the request. */
        return memchr_inv(layer_masks, 0, sizeof(*layer_masks)) == NULL;
}

#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST

/*
 * KUnit case: scoping an empty matrix to an execute request is allowed and
 * leaves the execute and write entries at zero.
 */
static void test_scope_to_request_with_exec_none(struct kunit *const test)
{
        /* Allows everything. */
        layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};

        /* Checks and scopes with execute. */
        KUNIT_EXPECT_TRUE(test, scope_to_request(LANDLOCK_ACCESS_FS_EXECUTE,
                                                 &layer_masks));
        KUNIT_EXPECT_EQ(test, 0,
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]);
        KUNIT_EXPECT_EQ(test, 0,
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]);
}

/*
 * KUnit case: with execute denied by layer 0 and write denied by layer 1,
 * scoping to an execute request is refused, keeps the execute entry, and
 * clears the write entry, which was not requested.
 */
static void test_scope_to_request_with_exec_some(struct kunit *const test)
{
        /* Denies execute and write. */
        layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
                [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(1),
        };

        /* Checks and scopes with execute. */
        KUNIT_EXPECT_FALSE(test, scope_to_request(LANDLOCK_ACCESS_FS_EXECUTE,
                                                  &layer_masks));
        KUNIT_EXPECT_EQ(test, BIT_ULL(0),
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]);
        KUNIT_EXPECT_EQ(test, 0,
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]);
}

/*
 * KUnit case: scoping to an empty request clears every entry of the matrix,
 * so the (empty) request is reported as allowed.
 */
static void test_scope_to_request_without_access(struct kunit *const test)
{
        /* Denies execute and write. */
        layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
                [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(1),
        };

        /* Checks and scopes without access request. */
        KUNIT_EXPECT_TRUE(test, scope_to_request(0, &layer_masks));
        KUNIT_EXPECT_EQ(test, 0,
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]);
        KUNIT_EXPECT_EQ(test, 0,
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]);
}

#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */

/*
 * Returns true if @layer_masks still denies at least one access of
 * @access_request other than LANDLOCK_ACCESS_FS_REFER, false otherwise
 * (including when @layer_masks is NULL).
 */
static bool
is_eacces(const layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS],
          const access_mask_t access_request)
{
        /* LANDLOCK_ACCESS_FS_REFER alone must return -EXDEV. */
        const unsigned long non_refer = access_request &
                                        ~LANDLOCK_ACCESS_FS_REFER;
        unsigned long bit;
        bool denied = false;

        if (layer_masks) {
                for_each_set_bit(bit, &non_refer, ARRAY_SIZE(*layer_masks)) {
                        if ((*layer_masks)[bit]) {
                                denied = true;
                                break;
                        }
                }
        }
        return denied;
}

/*
 * KUnit shorthands expecting is_eacces() to return true (IE_TRUE) or false
 * (IE_FALSE).  They expand to a use of a variable named "test", which must be
 * in scope at the call site.
 */
#define IE_TRUE(...) KUNIT_EXPECT_TRUE(test, is_eacces(__VA_ARGS__))
#define IE_FALSE(...) KUNIT_EXPECT_FALSE(test, is_eacces(__VA_ARGS__))

#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST

/* KUnit case: an empty matrix never reports EACCES, whatever the request. */
static void test_is_eacces_with_none(struct kunit *const test)
{
        const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};

        IE_FALSE(&layer_masks, 0);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE);
}

/*
 * KUnit case: a matrix that only denies LANDLOCK_ACCESS_FS_REFER never
 * reports EACCES, not even for a refer request.
 */
static void test_is_eacces_with_refer(struct kunit *const test)
{
        const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_REFER)] = BIT_ULL(0),
        };

        IE_FALSE(&layer_masks, 0);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE);
}

/*
 * KUnit case: a matrix that only denies LANDLOCK_ACCESS_FS_WRITE_FILE reports
 * EACCES for a write request and for nothing else.
 */
static void test_is_eacces_with_write(struct kunit *const test)
{
        const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(0),
        };

        IE_FALSE(&layer_masks, 0);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE);

        IE_TRUE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE);
}

#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */

#undef IE_TRUE
#undef IE_FALSE

/**
 * is_access_to_paths_allowed - Check accesses for requests with a common path
 *
 * @domain: Domain to check against.
 * @path: File hierarchy to walk through.
 * @access_request_parent1: Accesses to check, once @layer_masks_parent1 is
 *     equal to @layer_masks_parent2 (if any).  This is tied to the unique
 *     requested path for most actions, or the source in case of a refer action
 *     (i.e. rename or link), or the source and destination in case of
 *     RENAME_EXCHANGE.
 * @layer_masks_parent1: Pointer to a matrix of layer masks per access
 *     masks, identifying the layers that forbid a specific access.  Bits from
 *     this matrix can be unset according to the @path walk.  An empty matrix
 *     means that @domain allows all possible Landlock accesses (i.e. not only
 *     those identified by @access_request_parent1).  This matrix can
 *     initially refer to domain layer masks and, when the accesses for the
 *     destination and source are the same, to requested layer masks.
 * @dentry_child1: Dentry to the initial child of the parent1 path.  This
 *     pointer must be NULL for non-refer actions (i.e. not link nor rename).
 * @access_request_parent2: Similar to @access_request_parent1 but for a
 *     request involving a source and a destination.  This refers to the
 *     destination, except in case of RENAME_EXCHANGE where it also refers to
 *     the source.  Must be set to 0 when using a simple path request.
 * @layer_masks_parent2: Similar to @layer_masks_parent1 but for a refer
 *     action.  This must be NULL otherwise.
 * @dentry_child2: Dentry to the initial child of the parent2 path.  This
 *     pointer is only set for RENAME_EXCHANGE actions and must be NULL
 *     otherwise.
 *
 * This helper first checks that the destination has a superset of restrictions
 * compared to the source (if any) for a common path.  Because of
 * RENAME_EXCHANGE actions, source and destinations may be swapped.  It then
 * checks that the collected accesses and the remaining ones are enough to
 * allow the request.
 *
 * Returns:
 * - true if the access request is granted;
 * - false otherwise.
 */
static bool is_access_to_paths_allowed(
        const struct landlock_ruleset *const domain,
        const struct path *const path,
        const access_mask_t access_request_parent1,
        layer_mask_t (*const layer_masks_parent1)[LANDLOCK_NUM_ACCESS_FS],
        const struct dentry *const dentry_child1,
        const access_mask_t access_request_parent2,
        layer_mask_t (*const layer_masks_parent2)[LANDLOCK_NUM_ACCESS_FS],
        const struct dentry *const dentry_child2)
{
        bool allowed_parent1 = false, allowed_parent2 = false, is_dom_check,
             child1_is_directory = true, child2_is_directory = true;
        struct path walker_path;
        access_mask_t access_masked_parent1, access_masked_parent2;
        layer_mask_t _layer_masks_child1[LANDLOCK_NUM_ACCESS_FS],
                _layer_masks_child2[LANDLOCK_NUM_ACCESS_FS];
        layer_mask_t(*layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS] = NULL,
        (*layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS] = NULL;

        /* Nothing is requested: trivially allowed. */
        if (!access_request_parent1 && !access_request_parent2)
                return true;
        /* Unexpected arguments: warns once and allows the access. */
        if (WARN_ON_ONCE(!domain || !path))
                return true;
        /* See is_nouser_or_private(): such dentries are always allowed. */
        if (is_nouser_or_private(path->dentry))
                return true;
        /* Unexpected arguments: warns once and denies the access. */
        if (WARN_ON_ONCE(domain->num_layers < 1 || !layer_masks_parent1))
                return false;

        if (unlikely(layer_masks_parent2)) {
                if (WARN_ON_ONCE(!dentry_child1))
                        return false;
                /*
                 * For a double request, first check for potential privilege
                 * escalation by looking at domain handled accesses (which are
                 * a superset of the meaningful requested accesses).
                 */
                access_masked_parent1 = access_masked_parent2 =
                        get_handled_fs_accesses(domain);
                is_dom_check = true;
        } else {
                if (WARN_ON_ONCE(dentry_child1 || dentry_child2))
                        return false;
                /* For a simple request, only check for requested accesses. */
                access_masked_parent1 = access_request_parent1;
                access_masked_parent2 = access_request_parent2;
                is_dom_check = false;
        }

        /*
         * Builds the layer masks of each provided child from the rule (if
         * any) tied to its own dentry, and records whether it is a directory.
         * Both are inputs of no_more_access() below.
         */
        if (unlikely(dentry_child1)) {
                landlock_unmask_layers(
                        find_rule(domain, dentry_child1),
                        landlock_init_layer_masks(
                                domain, LANDLOCK_MASK_ACCESS_FS,
                                &_layer_masks_child1, LANDLOCK_KEY_INODE),
                        &_layer_masks_child1, ARRAY_SIZE(_layer_masks_child1));
                layer_masks_child1 = &_layer_masks_child1;
                child1_is_directory = d_is_dir(dentry_child1);
        }
        if (unlikely(dentry_child2)) {
                landlock_unmask_layers(
                        find_rule(domain, dentry_child2),
                        landlock_init_layer_masks(
                                domain, LANDLOCK_MASK_ACCESS_FS,
                                &_layer_masks_child2, LANDLOCK_KEY_INODE),
                        &_layer_masks_child2, ARRAY_SIZE(_layer_masks_child2));
                layer_masks_child2 = &_layer_masks_child2;
                child2_is_directory = d_is_dir(dentry_child2);
        }

        /*
         * Walks on a private copy of @path holding its own reference, which
         * is put back once the walk is over.
         */
        walker_path = *path;
        path_get(&walker_path);
        /*
         * We need to walk through all the hierarchy to not miss any relevant
         * restriction.
         */
        while (true) {
                struct dentry *parent_dentry;
                const struct landlock_rule *rule;

                /*
                 * If at least all accesses allowed on the destination are
                 * already allowed on the source, i.e. if there are at least
                 * as many restrictions on the destination as on the source,
                 * then we can safely refer files from the source to the
                 * destination without risking a privilege escalation.  This
                 * also applies in the case of RENAME_EXCHANGE, which implies
                 * checks in both directions.  This is crucial for standalone
                 * multilayered security policies.  Furthermore, this helps
                 * prevent policy writers from shooting themselves in the
                 * foot.
                 */
                if (unlikely(is_dom_check &&
                             no_more_access(
                                     layer_masks_parent1, layer_masks_child1,
                                     child1_is_directory, layer_masks_parent2,
                                     layer_masks_child2,
                                     child2_is_directory))) {
                        allowed_parent1 = scope_to_request(
                                access_request_parent1, layer_masks_parent1);
                        allowed_parent2 = scope_to_request(
                                access_request_parent2, layer_masks_parent2);

                        /* Stops when all accesses are granted. */
                        if (allowed_parent1 && allowed_parent2)
                                break;

                        /*
                         * Now, downgrades the remaining checks from domain
                         * handled accesses to requested accesses.
                         */
                        is_dom_check = false;
                        access_masked_parent1 = access_request_parent1;
                        access_masked_parent2 = access_request_parent2;
                }

                /* Applies the rule (if any) tied to the current dentry. */
                rule = find_rule(domain, walker_path.dentry);
                allowed_parent1 = landlock_unmask_layers(
                        rule, access_masked_parent1, layer_masks_parent1,
                        ARRAY_SIZE(*layer_masks_parent1));
                allowed_parent2 = landlock_unmask_layers(
                        rule, access_masked_parent2, layer_masks_parent2,
                        ARRAY_SIZE(*layer_masks_parent2));

                /* Stops when a rule from each layer grants access. */
                if (allowed_parent1 && allowed_parent2)
                        break;
jump_up:
                if (walker_path.dentry == walker_path.mnt->mnt_root) {
                        if (follow_up(&walker_path)) {
                                /* Ignores hidden mount points. */
                                goto jump_up;
                        } else {
                                /*
                                 * Stops at the real root.  Denies access
                                 * because not all layers have granted access.
                                 */
                                break;
                        }
                }
                if (unlikely(IS_ROOT(walker_path.dentry))) {
                        /*
                         * Stops at disconnected root directories.  Only allows
                         * access to internal filesystems (e.g. nsfs, which is
                         * reachable through /proc/<pid>/ns/<namespace>).
                         */
                        allowed_parent1 = allowed_parent2 =
                                !!(walker_path.mnt->mnt_flags & MNT_INTERNAL);
                        break;
                }
                /*
                 * Moves one level up: gets the parent dentry first, then puts
                 * the dentry that was held so far.
                 */
                parent_dentry = dget_parent(walker_path.dentry);
                dput(walker_path.dentry);
                walker_path.dentry = parent_dentry;
        }
        path_put(&walker_path);

        return allowed_parent1 && allowed_parent2;
}

/*
 * Checks @access_request on @path against @domain.
 *
 * Returns 0 when is_access_to_paths_allowed() grants the request, -EACCES
 * otherwise.
 */
static int check_access_path(const struct landlock_ruleset *const domain,
                             const struct path *const path,
                             access_mask_t access_request)
{
        layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};
        bool granted;

        /* The request is replaced by what landlock_init_layer_masks() returns. */
        access_request = landlock_init_layer_masks(domain, access_request,
                                                   &layer_masks,
                                                   LANDLOCK_KEY_INODE);

        /* Only the first set of arguments is used; the rest is NULL/0. */
        granted = is_access_to_paths_allowed(domain, path, access_request,
                                             &layer_masks, NULL, 0, NULL,
                                             NULL);
        return granted ? 0 : -EACCES;
}

/*
 * Checks @access_request on @path against the domain returned by
 * get_current_fs_domain().  Returns 0 when there is no such domain.
 */
static int current_check_access_path(const struct path *const path,
                                     const access_mask_t access_request)
{
        const struct landlock_ruleset *const dom = get_current_fs_domain();

        return dom ? check_access_path(dom, path, access_request) : 0;
}

/*
 * Translates the file type bits of @mode into the matching
 * LANDLOCK_ACCESS_FS_MAKE_* access right.
 *
 * An unknown file type triggers a one-time warning and yields 0.
 */
static access_mask_t get_mode_access(const umode_t mode)
{
        const umode_t type = mode & S_IFMT;

        /* A zero mode translates to S_IFREG. */
        if (type == 0 || type == S_IFREG)
                return LANDLOCK_ACCESS_FS_MAKE_REG;
        if (type == S_IFLNK)
                return LANDLOCK_ACCESS_FS_MAKE_SYM;
        if (type == S_IFDIR)
                return LANDLOCK_ACCESS_FS_MAKE_DIR;
        if (type == S_IFCHR)
                return LANDLOCK_ACCESS_FS_MAKE_CHAR;
        if (type == S_IFBLK)
                return LANDLOCK_ACCESS_FS_MAKE_BLOCK;
        if (type == S_IFIFO)
                return LANDLOCK_ACCESS_FS_MAKE_FIFO;
        if (type == S_IFSOCK)
                return LANDLOCK_ACCESS_FS_MAKE_SOCK;

        WARN_ON_ONCE(1);
        return 0;
}

/*
 * Returns the removal access right matching @dentry: 0 for a negative dentry,
 * LANDLOCK_ACCESS_FS_REMOVE_DIR for a directory, and
 * LANDLOCK_ACCESS_FS_REMOVE_FILE for anything else.
 */
static access_mask_t maybe_remove(const struct dentry *const dentry)
{
        if (d_is_negative(dentry))
                return 0;
        if (d_is_dir(dentry))
                return LANDLOCK_ACCESS_FS_REMOVE_DIR;
        return LANDLOCK_ACCESS_FS_REMOVE_FILE;
}

/**
 * collect_domain_accesses - Walk through a file path and collect accesses
 *
 * @domain: Domain to check against.
 * @mnt_root: Last directory to check.
 * @dir: Directory to start the walk from.
 * @layer_masks_dom: Where to store the collected accesses.
 *
 * This helper is useful to begin a path walk from the @dir directory to a
 * @mnt_root directory used as a mount point.  This mount point is the common
 * ancestor between the source and the destination of a renamed and linked
 * file.  While walking from @dir to @mnt_root, we record all the domain's
 * allowed accesses in @layer_masks_dom.
 *
 * This is similar to is_access_to_paths_allowed() but much simpler because it
 * only handles walking on the same mount point and only checks one set of
 * accesses.
 *
 * Returns:
 * - true if all the domain access rights are allowed for @dir, if
 *   is_nouser_or_private() is true for @dir, or (after a one-time warning) if
 *   any argument is NULL;
 * - false if the walk stopped at @mnt_root (or, after a one-time warning, at
 *   another root) before all access rights were allowed.
 */
static bool collect_domain_accesses(
        const struct landlock_ruleset *const domain,
        const struct dentry *const mnt_root, struct dentry *dir,
        layer_mask_t (*const layer_masks_dom)[LANDLOCK_NUM_ACCESS_FS])
{
        unsigned long access_dom;
        bool ret = false;

        if (WARN_ON_ONCE(!domain || !mnt_root || !dir || !layer_masks_dom))
                return true;
        if (is_nouser_or_private(dir))
                return true;

        /* Starts from the full set of filesystem access rights. */
        access_dom = landlock_init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS,
                                               layer_masks_dom,
                                               LANDLOCK_KEY_INODE);

        /*
         * Holds a reference on the dentry currently visited; it is handed
         * over to the parent at each step and released after the loop.
         */
        dget(dir);
        while (true) {
                struct dentry *parent_dentry;

                /* Gets all layers allowing all domain accesses. */
                if (landlock_unmask_layers(find_rule(domain, dir), access_dom,
                                           layer_masks_dom,
                                           ARRAY_SIZE(*layer_masks_dom))) {
                        /*
                         * Stops when all handled accesses are allowed by at
                         * least one rule in each layer.
                         */
                        ret = true;
                        break;
                }

                /* We should not reach a root other than @mnt_root. */
                if (dir == mnt_root || WARN_ON_ONCE(IS_ROOT(dir)))
                        break;

                /* Takes the parent reference before dropping the child one. */
                parent_dentry = dget_parent(dir);
                dput(dir);
                dir = parent_dentry;
        }
        dput(dir);
        return ret;
}

/**
 * current_check_refer_path - Check if a rename or link action is allowed
 *
 * @old_dentry: File or directory requested to be moved or linked.
 * @new_dir: Destination parent directory.
 * @new_dentry: Destination file or directory.
 * @removable: Sets to true if it is a rename operation.
 * @exchange: Sets to true if it is a rename operation with RENAME_EXCHANGE.
 *
 * Because of its unprivileged constraints, Landlock relies on file hierarchies
 * (and not only inodes) to tie access rights to files.  Being able to link or
 * rename a file hierarchy brings some challenges.  Indeed, moving or linking a
 * file (i.e. creating a new reference to an inode) can have an impact on the
 * actions allowed for a set of files if it would change its parent directory
 * (i.e. reparenting).
 *
 * To avoid trivial access right bypasses, Landlock first checks if the file or
 * directory requested to be moved would gain new access rights inherited from
 * its new hierarchy.  Before returning any error, Landlock then checks that
 * the parent source hierarchy and the destination hierarchy would allow the
 * link or rename action.  If it is not the case, an error with EACCES is
 * returned to inform user space that there is no way to remove or create the
 * requested source file type.  If it should be allowed but the new inherited
 * access rights would be greater than the source access rights, then the
 * kernel returns an error with EXDEV.  Prioritizing EACCES over EXDEV enables
 * user space to abort the whole operation if there is no way to do it, or to
 * manually copy the source to the destination if this remains allowed, e.g.
 * because file creation is allowed on the destination directory but not direct
 * linking.
 *
 * To achieve this goal, the kernel needs to compare two file hierarchies: the
 * one identifying the source file or directory (including itself), and the
 * destination one.  This can be seen as a multilayer partial ordering problem.
 * The kernel walks through these paths and collects in a matrix the access
 * rights that are denied per layer.  These matrices are then compared to see
 * if the destination one has more (or the same) restrictions as the source
 * one.  If this is the case, the requested action will not return EXDEV, which
 * doesn't mean the action is allowed.  The parent hierarchy of the source
 * (i.e. parent directory), and the destination hierarchy must also be checked
 * to verify that they explicitly allow such action (i.e.  referencing,
 * creation and potentially removal rights).  The kernel implementation is then
 * required to rely on potentially four matrices of access rights: one for the
 * source file or directory (i.e. the child), a potentially other one for the
 * other source/destination (in case of RENAME_EXCHANGE), one for the source
 * parent hierarchy and a last one for the destination hierarchy.  These
 * ephemeral matrices take some space on the stack, which limits the number of
 * layers to a deemed reasonable number: 16.
 *
 * Returns:
 * - 0 if access is allowed;
 * - -EXDEV if @old_dentry would inherit new access rights from @new_dir;
 * - -EACCES if file removal or creation is denied.
 */
static int current_check_refer_path(struct dentry *const old_dentry,
                                    const struct path *const new_dir,
                                    struct dentry *const new_dentry,
                                    const bool removable, const bool exchange)
{
        const struct landlock_ruleset *const dom = get_current_fs_domain();
        bool allow_parent1, allow_parent2;
        access_mask_t access_request_parent1, access_request_parent2;
        struct path mnt_dir;
        struct dentry *old_parent;
        layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS] = {},
                     layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS] = {};

        /* Nothing to enforce without a domain. */
        if (!dom)
                return 0;
        if (WARN_ON_ONCE(dom->num_layers < 1))
                return -EACCES;
        /* The source must exist. */
        if (unlikely(d_is_negative(old_dentry)))
                return -ENOENT;
        if (exchange) {
                /*
                 * With an exchange, @new_dentry must exist too: its file type
                 * gives the creation right requested for parent1.
                 */
                if (unlikely(d_is_negative(new_dentry)))
                        return -ENOENT;
                access_request_parent1 =
                        get_mode_access(d_backing_inode(new_dentry)->i_mode);
        } else {
                access_request_parent1 = 0;
        }
        /* The file type of @old_dentry gives the creation right for parent2. */
        access_request_parent2 =
                get_mode_access(d_backing_inode(old_dentry)->i_mode);
        if (removable) {
                /*
                 * Adds the removal rights: for @old_dentry on parent1, and
                 * for @new_dentry on parent2 (none if it is negative).
                 */
                access_request_parent1 |= maybe_remove(old_dentry);
                access_request_parent2 |= maybe_remove(new_dentry);
        }

        /* The mount points are the same for old and new paths, cf. EXDEV. */
        if (old_dentry->d_parent == new_dir->dentry) {
                /*
                 * The LANDLOCK_ACCESS_FS_REFER access right is not required
                 * for same-directory referer (i.e. no reparenting).
                 */
                access_request_parent1 = landlock_init_layer_masks(
                        dom, access_request_parent1 | access_request_parent2,
                        &layer_masks_parent1, LANDLOCK_KEY_INODE);
                if (is_access_to_paths_allowed(
                            dom, new_dir, access_request_parent1,
                            &layer_masks_parent1, NULL, 0, NULL, NULL))
                        return 0;
                return -EACCES;
        }

        /* Reparenting: both hierarchies must also allow referring. */
        access_request_parent1 |= LANDLOCK_ACCESS_FS_REFER;
        access_request_parent2 |= LANDLOCK_ACCESS_FS_REFER;

        /* Saves the common mount point. */
        mnt_dir.mnt = new_dir->mnt;
        mnt_dir.dentry = new_dir->mnt->mnt_root;

        /*
         * old_dentry may be the root of the common mount point and
         * !IS_ROOT(old_dentry) at the same time (e.g. with open_tree() and
         * OPEN_TREE_CLONE).  We do not need to call dget(old_parent) because
         * we keep a reference to old_dentry.
         */
        old_parent = (old_dentry == mnt_dir.dentry) ? old_dentry :
                                                      old_dentry->d_parent;

        /* new_dir->dentry is equal to new_dentry->d_parent */
        allow_parent1 = collect_domain_accesses(dom, mnt_dir.dentry, old_parent,
                                                &layer_masks_parent1);
        allow_parent2 = collect_domain_accesses(
                dom, mnt_dir.dentry, new_dir->dentry, &layer_masks_parent2);

        /* Both walks were fully allowed: nothing more to compare. */
        if (allow_parent1 && allow_parent2)
                return 0;

        /*
         * To be able to compare source and destination domain access rights,
         * take into account the @old_dentry access rights aggregated with its
         * parent access rights.  This will be useful to compare with the
         * destination parent access rights.
         */
        if (is_access_to_paths_allowed(
                    dom, &mnt_dir, access_request_parent1, &layer_masks_parent1,
                    old_dentry, access_request_parent2, &layer_masks_parent2,
                    exchange ? new_dentry : NULL))
                return 0;

        /*
         * This prioritizes EACCES over EXDEV for all actions, including
         * renames with RENAME_EXCHANGE.
         */
        if (likely(is_eacces(&layer_masks_parent1, access_request_parent1) ||
                   is_eacces(&layer_masks_parent2, access_request_parent2)))
                return -EACCES;

        /*
         * Gracefully forbids reparenting if the destination directory
         * hierarchy is not a superset of restrictions of the source directory
         * hierarchy, or if LANDLOCK_ACCESS_FS_REFER is not allowed by the
         * source or the destination.
         */
        return -EXDEV;
}

/* Inode hooks */

/*
 * inode_free_security hook: only sanity-checks that @inode has no Landlock
 * object attached anymore; it releases nothing itself.
 */
static void hook_inode_free_security(struct inode *const inode)
{
        /*
         * All inodes must already have been untied from their object by
         * release_inode() or hook_sb_delete().
         */
        WARN_ON_ONCE(landlock_inode(inode)->object);
}

/* Super-block hooks */

/*
 * Release the inodes used in a security policy.
 *
 * Walks the inode list of @sb and, for each referenced inode that is tied to
 * a Landlock object, unties it and drops the reference held through that
 * object.  It then waits until the superblock's Landlock inode_refs counter
 * reaches zero.
 *
 * Cf. fsnotify_unmount_inodes() and invalidate_inodes()
 */
static void hook_sb_delete(struct super_block *const sb)
{
        struct inode *inode, *prev_inode = NULL;

        if (!landlock_initialized)
                return;

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                struct landlock_object *object;

                /* Only handles referenced inodes. */
                if (!atomic_read(&inode->i_count))
                        continue;

                /*
                 * Protects against concurrent modification of inode (e.g.
                 * from get_inode_object()).
                 */
                spin_lock(&inode->i_lock);
                /*
                 * Checks I_FREEING and I_WILL_FREE to protect against a race
                 * condition when release_inode() just called iput(), which
                 * could lead to a NULL dereference of inode->security or a
                 * second call to iput() for the same Landlock object.  Also
                 * checks I_NEW because such an inode cannot be tied to an
                 * object.
                 */
                if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                rcu_read_lock();
                object = rcu_dereference(landlock_inode(inode)->object);
                if (!object) {
                        /* This inode is not tied to any Landlock object. */
                        rcu_read_unlock();
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                /* Keeps a reference to this inode until the next loop walk. */
                __iget(inode);
                spin_unlock(&inode->i_lock);

                /*
                 * If there is no concurrent release_inode() ongoing, then we
                 * are in charge of calling iput() on this inode, otherwise we
                 * will just wait for it to finish.
                 */
                spin_lock(&object->lock);
                if (object->underobj == inode) {
                        object->underobj = NULL;
                        spin_unlock(&object->lock);
                        rcu_read_unlock();

                        /*
                         * Because object->underobj was not NULL,
                         * release_inode() and get_inode_object() guarantee
                         * that it is safe to reset
                         * landlock_inode(inode)->object while it is not NULL.
                         * It is therefore not necessary to lock inode->i_lock.
                         */
                        rcu_assign_pointer(landlock_inode(inode)->object, NULL);
                        /*
                         * At this point, we own the ihold() reference that was
                         * originally set up by get_inode_object() and the
                         * __iget() reference that we just set in this loop
                         * walk.  Therefore the following call to iput() will
                         * not sleep nor drop the inode because there are now
                         * at least two references to it.
                         */
                        iput(inode);
                } else {
                        spin_unlock(&object->lock);
                        rcu_read_unlock();
                }

                if (prev_inode) {
                        /*
                         * At this point, we still own the __iget() reference
                         * that we just set in this loop walk.  Therefore we
                         * can drop the list lock and know that the inode won't
                         * disappear from under us until the next loop walk.
                         */
                        spin_unlock(&sb->s_inode_list_lock);
                        /*
                         * We can now actually put the inode reference from the
                         * previous loop walk, which is not needed anymore.
                         */
                        iput(prev_inode);
                        cond_resched();
                        spin_lock(&sb->s_inode_list_lock);
                }
                prev_inode = inode;
        }
        spin_unlock(&sb->s_inode_list_lock);

        /* Puts the inode reference from the last loop walk, if any. */
        if (prev_inode)
                iput(prev_inode);
        /* Waits for pending iput() in release_inode(). */
        wait_var_event(&landlock_superblock(sb)->inode_refs,
                       !atomic_long_read(&landlock_superblock(sb)->inode_refs));
}

/*
 * Because a Landlock security policy is defined according to the filesystem
 * topology (i.e. the mount namespace), changing it may grant access to files
 * not previously allowed.
 *
 * To make it simple, deny any filesystem topology modification by landlocked
 * processes.  Non-landlocked processes may still change the namespace of a
 * landlocked process, but this kind of threat must be handled by a system-wide
 * access-control security policy.
 *
 * This could be lifted in the future if Landlock can safely handle mount
 * namespace updates requested by a landlocked process.  Indeed, we could
 * update the current domain (which is currently read-only) by taking into
 * account the accesses of the source and the destination of a new mount point.
 * However, this would also require making all the child domains dynamically
 * inherit these new constraints.  Anyway, for backward compatibility reasons,
 * a dedicated user space option would be required (e.g. as a ruleset flag).
 */
static int hook_sb_mount(const char *const dev_name,
                         const struct path *const path, const char *const type,
                         const unsigned long flags, void *const data)
{
        /* Denied with -EPERM as soon as get_current_fs_domain() is set. */
        return get_current_fs_domain() ? -EPERM : 0;
}

static int hook_move_mount(const struct path *const from_path,
                           const struct path *const to_path)
{
        /* Denied with -EPERM as soon as get_current_fs_domain() is set. */
        return get_current_fs_domain() ? -EPERM : 0;
}

/*
 * Removing a mount point may reveal a previously hidden file hierarchy, which
 * may then grant access to files, which may have previously been forbidden.
 */
static int hook_sb_umount(struct vfsmount *const mnt, const int flags)
{
        /* Denied with -EPERM as soon as get_current_fs_domain() is set. */
        return get_current_fs_domain() ? -EPERM : 0;
}

static int hook_sb_remount(struct super_block *const sb, void *const mnt_opts)
{
        /* Denied with -EPERM as soon as get_current_fs_domain() is set. */
        return get_current_fs_domain() ? -EPERM : 0;
}

/*
 * pivot_root(2), like mount(2), changes the current mount namespace.  It must
 * then be forbidden for a landlocked process.
 *
 * However, chroot(2) may be allowed because it only changes the relative root
 * directory of the current process.  Moreover, it can be used to restrict the
 * view of the filesystem.
 */
static int hook_sb_pivotroot(const struct path *const old_path,
                             const struct path *const new_path)
{
        /* Denied with -EPERM as soon as get_current_fs_domain() is set. */
        return get_current_fs_domain() ? -EPERM : 0;
}

/* Path hooks */

static int hook_path_link(struct dentry *const old_dentry,
                          const struct path *const new_dir,
                          struct dentry *const new_dentry)
{
        return current_check_refer_path(old_dentry, new_dir, new_dentry, false,
                                        false);
}

static int hook_path_rename(const struct path *const old_dir,
                            struct dentry *const old_dentry,
                            const struct path *const new_dir,
                            struct dentry *const new_dentry,
                            const unsigned int flags)
{
        /* old_dir refers to old_dentry->d_parent and new_dir->mnt */
        return current_check_refer_path(old_dentry, new_dir, new_dentry, true,
                                        !!(flags & RENAME_EXCHANGE));
}

static int hook_path_mkdir(const struct path *const dir,
                           struct dentry *const dentry, const umode_t mode)
{
        return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_DIR);
}

static int hook_path_mknod(const struct path *const dir,
                           struct dentry *const dentry, const umode_t mode,
                           const unsigned int dev)
{
        const struct landlock_ruleset *const dom = get_current_fs_domain();

        /*
         * The requested right depends on the file type encoded in @mode; it
         * is only computed when there is a domain to check against.
         */
        return dom ? check_access_path(dom, dir, get_mode_access(mode)) : 0;
}

static int hook_path_symlink(const struct path *const dir,
                             struct dentry *const dentry,
                             const char *const old_name)
{
        return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_SYM);
}

static int hook_path_unlink(const struct path *const dir,
                            struct dentry *const dentry)
{
        return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_FILE);
}

static int hook_path_rmdir(const struct path *const dir,
                           struct dentry *const dentry)
{
        return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_DIR);
}

static int hook_path_truncate(const struct path *const path)
{
        return current_check_access_path(path, LANDLOCK_ACCESS_FS_TRUNCATE);
}

/* File hooks */

/**
 * get_required_file_open_access - Get access needed to open a file
 *
 * @file: File being opened.
 *
 * Returns the access rights required to open @file, derived from its type
 * (directory or not), its f_mode read/write bits and the __FMODE_EXEC flag.
 * The result may be 0 when none of these are set.
 */
static access_mask_t
get_required_file_open_access(const struct file *const file)
{
        access_mask_t required = 0;

        /* A directory can only be opened in read mode. */
        if ((file->f_mode & FMODE_READ) && S_ISDIR(file_inode(file)->i_mode))
                return LANDLOCK_ACCESS_FS_READ_DIR;

        if (file->f_mode & FMODE_READ)
                required |= LANDLOCK_ACCESS_FS_READ_FILE;
        if (file->f_mode & FMODE_WRITE)
                required |= LANDLOCK_ACCESS_FS_WRITE_FILE;
        /* __FMODE_EXEC is indeed part of f_flags, not f_mode. */
        if (file->f_flags & __FMODE_EXEC)
                required |= LANDLOCK_ACCESS_FS_EXECUTE;

        return required;
}

/*
 * file_alloc_security hook: initializes the Landlock state of a newly
 * allocated @file with every filesystem access right.  Always returns 0.
 * hook_file_open() later overwrites this value when it runs with a domain.
 */
static int hook_file_alloc_security(struct file *const file)
{
        /*
         * Grants all access rights, even if most of them are not checked later
         * on. It is more consistent.
         *
         * Notably, file descriptors for regular files can also be acquired
         * without going through the file_open hook, for example when using
         * memfd_create(2).
         */
        landlock_file(file)->allowed_access = LANDLOCK_MASK_ACCESS_FS;
        return 0;
}

static bool is_device(const struct file *const file)
{
        const struct inode *inode = file_inode(file);

        return S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode);
}

/*
 * file_open hook: checks the rights needed to open @file against the domain
 * of the opener's credentials and records, in the file's Landlock state, the
 * subset of requested rights that is allowed.
 *
 * Returns 0 if there is no domain or if every right required by the open
 * itself is allowed, -EACCES otherwise.
 */
static int hook_file_open(struct file *const file)
{
        layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};
        access_mask_t open_access_request, full_access_request, allowed_access,
                optional_access, masked_request;
        const struct landlock_ruleset *const dom =
                get_fs_domain(landlock_cred(file->f_cred)->domain);

        if (!dom)
                return 0;

        /*
         * Because a file may be opened with O_PATH, get_required_file_open_access()
         * may return 0.  This case will be handled with a future Landlock
         * evolution.
         */
        open_access_request = get_required_file_open_access(file);

        /*
         * More rights than strictly needed for open() are looked up, so that
         * later operations on the opened file can be authorized from them.
         */
        optional_access = LANDLOCK_ACCESS_FS_TRUNCATE;
        if (is_device(file))
                optional_access |= LANDLOCK_ACCESS_FS_IOCTL_DEV;
        full_access_request = open_access_request | optional_access;

        masked_request = landlock_init_layer_masks(dom, full_access_request,
                                                   &layer_masks,
                                                   LANDLOCK_KEY_INODE);
        if (is_access_to_paths_allowed(dom, &file->f_path, masked_request,
                                       &layer_masks, NULL, 0, NULL, NULL)) {
                allowed_access = full_access_request;
        } else {
                unsigned long access_bit;
                const unsigned long access_req = full_access_request;

                /*
                 * Rebuilds the allowed rights from layer_masks: a requested
                 * right is kept only if no layer still vetoes it.
                 */
                allowed_access = 0;
                for_each_set_bit(access_bit, &access_req,
                                 ARRAY_SIZE(layer_masks)) {
                        if (layer_masks[access_bit])
                                continue;
                        allowed_access |= BIT_ULL(access_bit);
                }
        }

        /*
         * For operations on already opened files (i.e. ftruncate()), it is the
         * access rights at the time of open() which decide whether the
         * operation is permitted.  The relevant subset of file access rights
         * is therefore recorded in the opened struct file.
         */
        landlock_file(file)->allowed_access = allowed_access;

        /* The optional rights never cause the open itself to fail. */
        return ((open_access_request & allowed_access) ==
                open_access_request) ?
                       0 :
                       -EACCES;
}

static int hook_file_truncate(struct file *const file)
{
        /*
         * Allows truncation if the truncate right was available at the time of
         * opening the file, to get a consistent access check as for read, write
         * and execute operations.
         *
         * Note: For checks done based on the file's Landlock allowed access, we
         * enforce them independently of whether the current thread is in a
         * Landlock domain, so that open files passed between independent
         * processes retain their behaviour.
         */
        if (landlock_file(file)->allowed_access & LANDLOCK_ACCESS_FS_TRUNCATE)
                return 0;
        return -EACCES;
}

static int hook_file_ioctl(struct file *file, unsigned int cmd,
                           unsigned long arg)
{
        const access_mask_t allowed_access =
                landlock_file(file)->allowed_access;

        /*
         * The access rights attached to the opened file (see
         * hook_file_open()) determine whether IOCTL can be used on it later.
         * Without that right, the command is still allowed on non-device
         * files and when is_masked_device_ioctl() accepts it.
         */
        if ((allowed_access & LANDLOCK_ACCESS_FS_IOCTL_DEV) ||
            !is_device(file) || is_masked_device_ioctl(cmd))
                return 0;

        return -EACCES;
}

static int hook_file_ioctl_compat(struct file *file, unsigned int cmd,
                                  unsigned long arg)
{
        const access_mask_t allowed_access =
                landlock_file(file)->allowed_access;

        /*
         * The access rights attached to the opened file (see
         * hook_file_open()) determine whether IOCTL can be used on it later.
         * Without that right, the command is still allowed on non-device
         * files and when is_masked_device_ioctl_compat() accepts it.
         */
        if ((allowed_access & LANDLOCK_ACCESS_FS_IOCTL_DEV) ||
            !is_device(file) || is_masked_device_ioctl_compat(cmd))
                return 0;

        return -EACCES;
}

/*
 * Table binding each LSM hook name to its handler defined in this file.  It
 * is registered as a whole by landlock_add_fs_hooks().  Entries are grouped
 * as: inode, super-block, path and file hooks.
 */
static struct security_hook_list landlock_hooks[] __ro_after_init = {
        /* Inode hooks */
        LSM_HOOK_INIT(inode_free_security, hook_inode_free_security),

        /* Super-block hooks */
        LSM_HOOK_INIT(sb_delete, hook_sb_delete),
        LSM_HOOK_INIT(sb_mount, hook_sb_mount),
        LSM_HOOK_INIT(move_mount, hook_move_mount),
        LSM_HOOK_INIT(sb_umount, hook_sb_umount),
        LSM_HOOK_INIT(sb_remount, hook_sb_remount),
        LSM_HOOK_INIT(sb_pivotroot, hook_sb_pivotroot),

        /* Path hooks */
        LSM_HOOK_INIT(path_link, hook_path_link),
        LSM_HOOK_INIT(path_rename, hook_path_rename),
        LSM_HOOK_INIT(path_mkdir, hook_path_mkdir),
        LSM_HOOK_INIT(path_mknod, hook_path_mknod),
        LSM_HOOK_INIT(path_symlink, hook_path_symlink),
        LSM_HOOK_INIT(path_unlink, hook_path_unlink),
        LSM_HOOK_INIT(path_rmdir, hook_path_rmdir),
        LSM_HOOK_INIT(path_truncate, hook_path_truncate),

        /* File hooks */
        LSM_HOOK_INIT(file_alloc_security, hook_file_alloc_security),
        LSM_HOOK_INIT(file_open, hook_file_open),
        LSM_HOOK_INIT(file_truncate, hook_file_truncate),
        LSM_HOOK_INIT(file_ioctl, hook_file_ioctl),
        LSM_HOOK_INIT(file_ioctl_compat, hook_file_ioctl_compat),
};

/*
 * Registers every entry of landlock_hooks with the LSM framework under
 * landlock_lsmid.  Marked __init: meant to be called during initialization.
 */
__init void landlock_add_fs_hooks(void)
{
        security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
                           &landlock_lsmid);
}

#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST

/*
 * KUnit test cases of this file, only built with
 * CONFIG_SECURITY_LANDLOCK_KUNIT_TEST.  The list ends with an empty entry.
 */
/* clang-format off */
static struct kunit_case test_cases[] = {
        KUNIT_CASE(test_no_more_access),
        KUNIT_CASE(test_scope_to_request_with_exec_none),
        KUNIT_CASE(test_scope_to_request_with_exec_some),
        KUNIT_CASE(test_scope_to_request_without_access),
        KUNIT_CASE(test_is_eacces_with_none),
        KUNIT_CASE(test_is_eacces_with_refer),
        KUNIT_CASE(test_is_eacces_with_write),
        {}
};
/* clang-format on */

/* Suite grouping the cases above under the "landlock_fs" name. */
static struct kunit_suite test_suite = {
        .name = "landlock_fs",
        .test_cases = test_cases,
};

kunit_test_suite(test_suite);

#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
/* SPDX-License-Identifier: GPL-2.0-or-later */

#ifndef _LINUX_RSTREASON_H
#define _LINUX_RSTREASON_H
#include <net/dropreason-core.h>
#include <uapi/linux/mptcp.h>

/*
 * X-macro style list of socket reset reason names.
 *
 * Every entry except the last is expanded through FN(); the last entry (MAX)
 * is expanded through FNe(), so a user can emit a different separator or
 * terminator for the final element.  The names are those of the
 * enum sk_rst_reason values below without the SK_RST_REASON_ prefix, listed
 * in the same order as the enum.
 *
 * NOTE(review): no user of this macro is visible in this header; presumably
 * it generates reason <-> string tables elsewhere (e.g. for tracing) and has
 * to be updated together with the enum -- confirm against the callers.
 */
#define DEFINE_RST_REASON(FN, FNe)        \
        FN(NOT_SPECIFIED)                \
        FN(NO_SOCKET)                        \
        FN(TCP_INVALID_ACK_SEQUENCE)        \
        FN(TCP_RFC7323_PAWS)                \
        FN(TCP_TOO_OLD_ACK)                \
        FN(TCP_ACK_UNSENT_DATA)                \
        FN(TCP_FLAGS)                        \
        FN(TCP_OLD_ACK)                        \
        FN(TCP_ABORT_ON_DATA)                \
        FN(TCP_TIMEWAIT_SOCKET)                \
        FN(INVALID_SYN)                        \
        FN(MPTCP_RST_EUNSPEC)                \
        FN(MPTCP_RST_EMPTCP)                \
        FN(MPTCP_RST_ERESOURCE)                \
        FN(MPTCP_RST_EPROHIBIT)                \
        FN(MPTCP_RST_EWQ2BIG)                \
        FN(MPTCP_RST_EBADPERF)                \
        FN(MPTCP_RST_EMIDDLEBOX)        \
        FN(ERROR)                        \
        FNe(MAX)

/**
 * enum sk_rst_reason - the reasons for a socket reset
 *
 * The reasons why a socket reset is sent, as used by the DCCP/TCP/MPTCP
 * protocols.
 *
 * The values are grouped into three parts, in this order:
 * 1) skb drop reasons: reasons that mirror skb drop reasons, e.g. for
 *    passive resets
 * 2) independent reset reasons: e.g. active reset reasons
 * 3) reset reasons in MPTCP: only for MPTCP use
 *
 * They are followed by a generic error value and the MAX marker.
 */
enum sk_rst_reason {
        /* Part 1: refer to include/net/dropreason-core.h.
         * These rely on skb drop reasons because a drop reason indicates
         * exactly why the RST could happen.
         */
        /** @SK_RST_REASON_NOT_SPECIFIED: reset reason is not specified */
        SK_RST_REASON_NOT_SPECIFIED,
        /** @SK_RST_REASON_NO_SOCKET: no valid socket that can be used */
        SK_RST_REASON_NO_SOCKET,
        /**
         * @SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE: unacceptable ACK SEQ
         * field, because the ack sequence is not in the window between
         * snd_una and snd_nxt
         */
        SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE,
        /**
         * @SK_RST_REASON_TCP_RFC7323_PAWS: PAWS check, corresponding to
         * LINUX_MIB_PAWSESTABREJECTED, LINUX_MIB_PAWSACTIVEREJECTED
         */
        SK_RST_REASON_TCP_RFC7323_PAWS,
        /** @SK_RST_REASON_TCP_TOO_OLD_ACK: TCP ACK is too old */
        SK_RST_REASON_TCP_TOO_OLD_ACK,
        /**
         * @SK_RST_REASON_TCP_ACK_UNSENT_DATA: TCP ACK for data we haven't
         * sent yet
         */
        SK_RST_REASON_TCP_ACK_UNSENT_DATA,
        /** @SK_RST_REASON_TCP_FLAGS: TCP flags are invalid */
        SK_RST_REASON_TCP_FLAGS,
        /** @SK_RST_REASON_TCP_OLD_ACK: TCP ACK is old, but in window */
        SK_RST_REASON_TCP_OLD_ACK,
        /**
         * @SK_RST_REASON_TCP_ABORT_ON_DATA: abort on data,
         * corresponding to LINUX_MIB_TCPABORTONDATA
         */
        SK_RST_REASON_TCP_ABORT_ON_DATA,

        /* Part 2: the independent reasons start here */
        /**
         * @SK_RST_REASON_TCP_TIMEWAIT_SOCKET: the reset happens on a
         * timewait socket
         */
        SK_RST_REASON_TCP_TIMEWAIT_SOCKET,
        /**
         * @SK_RST_REASON_INVALID_SYN: received a bad SYN packet.
         * RFC 793 says if the state is not CLOSED/LISTEN/SYN-SENT then
         * "fourth, check the SYN bit,...If the SYN is in the window it is
         * an error, send a reset"
         */
        SK_RST_REASON_INVALID_SYN,

        /* Part 3: copied from include/uapi/linux/mptcp.h.
         * These reset reasons adhere to RFC 8684 and therefore will not
         * change, so do not touch them. Each definition is listed
         * individually below.
         */
        /**
         * @SK_RST_REASON_MPTCP_RST_EUNSPEC: Unspecified error.
         * This is the default error; it implies that the subflow is no
         * longer available. The presence of this option shows that the
         * RST was generated by an MPTCP-aware device.
         */
        SK_RST_REASON_MPTCP_RST_EUNSPEC,
        /**
         * @SK_RST_REASON_MPTCP_RST_EMPTCP: MPTCP-specific error.
         * An error has been detected in the processing of MPTCP options.
         * This is the usual reason code to return in the cases where a RST
         * is being sent to close a subflow because of an invalid response.
         */
        SK_RST_REASON_MPTCP_RST_EMPTCP,
        /**
         * @SK_RST_REASON_MPTCP_RST_ERESOURCE: Lack of resources.
         * This code indicates that the sending host does not have enough
         * resources to support the terminated subflow.
         */
        SK_RST_REASON_MPTCP_RST_ERESOURCE,
        /**
         * @SK_RST_REASON_MPTCP_RST_EPROHIBIT: Administratively prohibited.
         * This code indicates that the requested subflow is prohibited by
         * the policies of the sending host.
         */
        SK_RST_REASON_MPTCP_RST_EPROHIBIT,
        /**
         * @SK_RST_REASON_MPTCP_RST_EWQ2BIG: Too much outstanding data.
         * This code indicates that there is an excessive amount of data
         * that needs to be transmitted over the terminated subflow while
         * having already been acknowledged over one or more other subflows.
         * This may occur if a path has been unavailable for a short period
         * and it is more efficient to reset and start again than it is to
         * retransmit the queued data.
         */
        SK_RST_REASON_MPTCP_RST_EWQ2BIG,
        /**
         * @SK_RST_REASON_MPTCP_RST_EBADPERF: Unacceptable performance.
         * This code indicates that the performance of this subflow was
         * too low compared to the other subflows of this Multipath TCP
         * connection.
         */
        SK_RST_REASON_MPTCP_RST_EBADPERF,
        /**
         * @SK_RST_REASON_MPTCP_RST_EMIDDLEBOX: Middlebox interference.
         * Middlebox interference has been detected over this subflow,
         * making MPTCP signaling invalid. For example, this may be sent
         * if the checksum does not validate.
         */
        SK_RST_REASON_MPTCP_RST_EMIDDLEBOX,

        /** @SK_RST_REASON_ERROR: an unexpected error happened */
        SK_RST_REASON_ERROR,

        /**
         * @SK_RST_REASON_MAX: Maximum of socket reset reasons.
         * It shouldn't be used as a real 'reason'.
         */
        SK_RST_REASON_MAX,
};

/* Convert skb drop reasons to enum sk_rst_reason type */
static inline enum sk_rst_reason
sk_rst_convert_drop_reason(enum skb_drop_reason reason)
{
        switch (reason) {
        case SKB_DROP_REASON_NOT_SPECIFIED:
                return SK_RST_REASON_NOT_SPECIFIED;
        case SKB_DROP_REASON_NO_SOCKET:
                return SK_RST_REASON_NO_SOCKET;
        case SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE:
                return SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE;
        case SKB_DROP_REASON_TCP_RFC7323_PAWS:
                return SK_RST_REASON_TCP_RFC7323_PAWS;
        case SKB_DROP_REASON_TCP_TOO_OLD_ACK:
                return SK_RST_REASON_TCP_TOO_OLD_ACK;
        case SKB_DROP_REASON_TCP_ACK_UNSENT_DATA:
                return SK_RST_REASON_TCP_ACK_UNSENT_DATA;
        case SKB_DROP_REASON_TCP_FLAGS:
                return SK_RST_REASON_TCP_FLAGS;
        case SKB_DROP_REASON_TCP_OLD_ACK:
                return SK_RST_REASON_TCP_OLD_ACK;
        case SKB_DROP_REASON_TCP_ABORT_ON_DATA:
                return SK_RST_REASON_TCP_ABORT_ON_DATA;
        default:
                /* If we don't have our own corresponding reason */
                return SK_RST_REASON_NOT_SPECIFIED;
        }
}
#endif

























































































































































    3 





























    1 







    1 

    1 







































    1 





    1 








    1 












    1 
    1 




























































    1 

    1 
    1 










    3 

















































    1 
    1 











    1 





    3 























    1 









    1 
































































    1 







    1 





    1 



    1 
    1 






    1 


























    1 










    1 







    1 








































    1 




























































    1 

    1 


















    1 












    1 



    1 

    1 











    1 




    1 






































    1 


















    1 















    1 






    1 

    1 














    1 












    1 




    1 


















































    2 


    2 




    1 
    2 




    2 









    1 





    1 


































































    1 






















































































































































    2 





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 















































































































































































































































































































































    3 

































































































































































































































































































































































































































































































































































































































































































    3 







    3 





    3 
















    3 
    2 










    3 
    3 
































    2 










    3 



    2 







    2 







    1 
    3 





























    3 



































    3 






    3 
    2 

















    3 












    2 
    3 



    3 














    2 

    1 






    3 


    3 

    3 

    3 






    1 
    3 

    3 
    2 




    2 



    1 
    1 
    2 








    3 








    3 



















    3 





    2 









    1 
    2 








    2 
    3 
    2 
    2 

    3 




    3 




    3 









































































    2 

    2 

    2 









    3 
    1 

























    2 



























    2 















    3 

    3 
    3 

    3 



    2 






    2 

    1 
    1 

    1 
    1 





    2 










    1 





































    1 





    1 





    1 




























    1 







    3 









































    3 











    3 





















    3 







    3 













































    2 




    2 





    3 






    3 


















    2 



    3 

    3 



    3 




    3 






    3 









    3 











    3 




    2 


    3 












    2 
    3 

    2 
    2 

    3 



    2 
    3 














































    1 











































































    1 

    1 






    1 







    1 











    1 
    1 









    1 



    1 










    1 
    1 







    1 

    1 



















    1 
























    1 



















    2 




















    1 

    2 





    1 




























































































































    2 























































































    1 






























    1 





















    1 





    1 

























































































































































































































































    2 








    2 


    1 



    2 






    2 






















































































    2 

    2 

































































































































































    2 



    1 
    2 

    2 
    2 

    1 

























































    2 
    2 




    3 








    3 





    2 


    2 
    2 









    2 

















    2 















    1 
    1 
    1 
    1 

    2 









    2 


    2 

    1 
    2 

    2 






































































    2 




    2 














































































































































































































































































































































    3 


    3 








    1 
    3 







    1 




    1 






    1 

    1 







    1 







    1 





































    3 



    1 













    1 

    1 























    1 




















    1 


















    2 



    2 
    1 


    2 




    1 
































    2 




    3 

    2 




















































































    1 

    1 







    3 





































    1 






    1 


    1 



























    1 









































































































    1 











    1 








    1 







    1 
    1 














    1 













    1 


    1 


    1 


    1 
    1 





















































































































    1 















    1 

























































































































































    1 






















    1 


















    1 



















































































    2 


    1 








































    1 












    2 

    2 















    1 







    2 

















    1 





    1 















    1 






    1 


    1 





    1 
    1 






    1 


    1 
















































    1 

















    2 


















    1 

















    2 


    2 















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
// SPDX-License-Identifier: GPL-2.0
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Mark Evans, <evansmp@uhura.aston.ac.uk>
 *                Corey Minyard <wf-rch!minyard@relay.EU.net>
 *                Florian La Roche, <flla@stud.uni-sb.de>
 *                Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *                Linus Torvalds, <torvalds@cs.helsinki.fi>
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *                Matthew Dillon, <dillon@apollo.west.oic.com>
 *                Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *                Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:
 *                Pedro Roque        :        Fast Retransmit/Recovery.
 *                                        Two receive queues.
 *                                        Retransmit queue handled by TCP.
 *                                        Better retransmit timer handling.
 *                                        New congestion avoidance.
 *                                        Header prediction.
 *                                        Variable renaming.
 *
 *                Eric                :        Fast Retransmit.
 *                Randy Scott        :        MSS option defines.
 *                Eric Schenk        :        Fixes to slow start algorithm.
 *                Eric Schenk        :        Yet another double ACK bug.
 *                Eric Schenk        :        Delayed ACK bug fixes.
 *                Eric Schenk        :        Floyd style fast retrans war avoidance.
 *                David S. Miller        :        Don't allow zero congestion window.
 *                Eric Schenk        :        Fix retransmitter so that it sends
 *                                        next packet on ack of previous packet.
 *                Andi Kleen        :        Moved open_request checking here
 *                                        and process RSTs for open_requests.
 *                Andi Kleen        :        Better prune_queue, and other fixes.
 *                Andrey Savochkin:        Fix RTT measurements in the presence of
 *                                        timestamps.
 *                Andrey Savochkin:        Check sequence numbers correctly when
 *                                        removing SACKs due to in sequence incoming
 *                                        data segments.
 *                Andi Kleen:                Make sure we never ack data there is not
 *                                        enough room for. Also make this condition
 *                                        a fatal error if it might still happen.
 *                Andi Kleen:                Add tcp_measure_rcv_mss to make
 *                                        connections with MSS<min(MTU,ann. MSS)
 *                                        work without delayed acks.
 *                Andi Kleen:                Process packets with PSH set in the
 *                                        fast path.
 *                J Hadi Salim:                ECN support
 *                 Andrei Gurtov,
 *                Pasi Sarolahti,
 *                Panu Kuhlberg:                Experimental audit of TCP (re)transmission
 *                                        engine. Lots of bugs were found.
 *                Pasi Sarolahti:                F-RTO for dealing with spurious RTOs
 */

/* Prefix that the pr_*() logging helpers prepend to every message emitted
 * from this file (for example the pr_warn() in tcp_gro_dev_warn()).
 */
#define pr_fmt(fmt) "TCP: " fmt

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/kernel.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/proto_memory.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <linux/errqueue.h>
#include <trace/events/tcp.h>
#include <linux/jump_label_ratelimit.h>
#include <net/busy_poll.h>
#include <net/mptcp.h>

/* Global tunable, initialised to NR_FILE and marked __read_mostly.
 * NOTE(review): presumably backs the tcp_max_orphans sysctl - its users are
 * outside this chunk; confirm before relying on that.
 */
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;

#define FLAG_DATA                0x01 /* Incoming frame contained data.                */
#define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a window update.        */
#define FLAG_DATA_ACKED                0x04 /* This ACK acknowledged new data.                */
#define FLAG_RETRANS_DATA_ACKED        0x08 /* "" "" some of which was retransmitted.        */
#define FLAG_SYN_ACKED                0x10 /* This ACK acknowledged SYN.                */
#define FLAG_DATA_SACKED        0x20 /* New SACK.                                */
#define FLAG_ECE                0x40 /* ECE in this ACK                                */
#define FLAG_LOST_RETRANS        0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH                0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED        0x200 /* Never retransmitted data are (s)acked        */
#define FLAG_SND_UNA_ADVANCED        0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK        0x800 /* SACK blocks contained D-SACK info */
#define FLAG_SET_XMIT_TIMER        0x1000 /* Set TLP or RTO timer */
#define FLAG_SACK_RENEGING        0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT        0x4000 /* tcp_replace_ts_recent() */
#define FLAG_NO_CHALLENGE_ACK        0x8000 /* do not call tcp_send_challenge_ack()        */
#define FLAG_ACK_MAYBE_DELAYED        0x10000 /* Likely a delayed ACK */
#define FLAG_DSACK_TLP                0x20000 /* DSACK for tail loss probe */

#define FLAG_ACKED                (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP                (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT                (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
#define FLAG_FORWARD_PROGRESS        (FLAG_ACKED|FLAG_DATA_SACKED)

#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))

#define REXMIT_NONE        0 /* no loss recovery to do */
#define REXMIT_LOST        1 /* retransmit packets marked lost */
#define REXMIT_NEW        2 /* FRTO-style transmit of unsent/new packets */

#if IS_ENABLED(CONFIG_TLS_DEVICE)
static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);

/* Install @cad as @icsk's icsk_clean_acked callback and increment the
 * clean_acked_data_enabled deferred static key.
 *
 * The callback pointer is stored before the key is incremented.
 * NOTE(review): the code that tests the key and invokes the callback is
 * outside this chunk - presumably the ACK processing path; confirm there.
 */
void clean_acked_data_enable(struct inet_connection_sock *icsk,
                             void (*cad)(struct sock *sk, u32 ack_seq))
{
        icsk->icsk_clean_acked = cad;
        static_branch_deferred_inc(&clean_acked_data_enabled);
}
EXPORT_SYMBOL_GPL(clean_acked_data_enable);

/* Counterpart of clean_acked_data_enable(): decrement the
 * clean_acked_data_enabled deferred static key, then clear @icsk's
 * icsk_clean_acked callback.
 */
void clean_acked_data_disable(struct inet_connection_sock *icsk)
{
        static_branch_slow_dec_deferred(&clean_acked_data_enabled);
        icsk->icsk_clean_acked = NULL;
}
EXPORT_SYMBOL_GPL(clean_acked_data_disable);

/* Call static_key_deferred_flush() on the clean_acked_data_enabled key.
 * NOTE(review): presumably this forces any rate-limited decrement that is
 * still pending (the key is declared with an HZ timeout) to take effect -
 * confirm against linux/jump_label_ratelimit.h.
 */
void clean_acked_data_flush(void)
{
        static_key_deferred_flush(&clean_acked_data_enabled);
}
EXPORT_SYMBOL_GPL(clean_acked_data_flush);
#endif

#ifdef CONFIG_CGROUP_BPF
/* Run the BPF_SOCK_OPS_PARSE_HDR_OPT_CB sock_ops program for @skb when the
 * socket asked for it: either the "parse all header options" callback flag
 * is set, or the "parse unknown header options" flag is set and
 * rx_opt.saw_unknown is set.
 */
static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
{
        struct bpf_sock_ops_kern sock_ops;
        bool want_unknown, want_all;
        int state;

        want_unknown = tcp_sk(sk)->rx_opt.saw_unknown &&
                       BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
                                              BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
        want_all = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
                                          BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);

        /* Expected case: neither condition holds, nothing to do. */
        if (likely(!(want_unknown || want_all)))
                return;

        /* In these states the skb is handed to BPF from
         * bpf_skops_established() or bpf_skops_write_hdr_opt() instead.
         */
        state = sk->sk_state;
        if (state == TCP_SYN_RECV || state == TCP_SYN_SENT ||
            state == TCP_LISTEN)
                return;

        sock_owned_by_me(sk);

        /* Only the part of sock_ops that precedes 'temp' is zeroed. */
        memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
        sock_ops.sk = sk;
        sock_ops.is_fullsock = 1;
        sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
        bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));

        BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
}

/* Run the sock_ops BPF program for operation @bpf_op on the full socket @sk.
 * @skb may be NULL, in which case no skb is attached to the sock_ops context.
 */
static void bpf_skops_established(struct sock *sk, int bpf_op,
                                  struct sk_buff *skb)
{
        struct bpf_sock_ops_kern sock_ops;

        sock_owned_by_me(sk);

        /* Only the part of sock_ops that precedes 'temp' is zeroed. */
        memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
        sock_ops.sk = sk;
        sock_ops.is_fullsock = 1;
        sock_ops.op = bpf_op;

        /* A socket with TCP_REPAIR_ON has no skb in tcp_finish_connect. */
        if (skb)
                bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));

        BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
}
#else
/* No-op stand-in used when CONFIG_CGROUP_BPF is not defined. */
static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
{
}

/* No-op stand-in used when CONFIG_CGROUP_BPF is not defined. */
static void bpf_skops_established(struct sock *sk, int bpf_op,
                                  struct sk_buff *skb)
{
}
#endif

/* Warn that the receiving device's driver appears to have a suspect GRO
 * implementation. The message is printed when the ingress device (looked up
 * from skb->skb_iif) cannot be found, or when @len is at least its MTU.
 */
static __cold void tcp_gro_dev_warn(const struct sock *sk, const struct sk_buff *skb,
                                    unsigned int len)
{
        struct net_device *dev;

        /* The lookup and every use of dev stay inside one RCU read section. */
        rcu_read_lock();
        dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);

        /* A known device whose MTU exceeds len gives no reason to complain. */
        if (dev && len < READ_ONCE(dev->mtu))
                goto unlock;

        pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
                dev ? dev->name : "Unknown driver");
unlock:
        rcu_read_unlock();
}

/* Adapt the MSS value used to make delayed ack decision to the
 * real world.
 *
 * Updates icsk_ack.rcv_mss and icsk_ack.last_seg_size from the received
 * @skb, and may set the ICSK_ACK_PUSHED / ICSK_ACK_PUSHED2 bits in
 * icsk_ack.pending.
 *
 * A segment at least as large as the current rcv_mss updates it directly
 * (capped at the socket's advmss); if its length differs from rcv_mss the
 * socket's scaling_ratio is also recomputed from skb->len and skb->truesize.
 * A smaller segment only replaces rcv_mss when its option-free length equals
 * the length recorded by the previous call.
 */
static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        /* Segment size recorded by the previous call (0 if none was). */
        const unsigned int lss = icsk->icsk_ack.last_seg_size;
        unsigned int len;

        /* Cleared on every call; only the small-segment path sets it again. */
        icsk->icsk_ack.last_seg_size = 0;

        /* skb->len may jitter because of SACKs, even if peer
         * sends good full-sized frames.
         */
        len = skb_shinfo(skb)->gso_size ? : skb->len;
        if (len >= icsk->icsk_ack.rcv_mss) {
                /* Note: divides are still a bit expensive.
                 * For the moment, only adjust scaling_ratio
                 * when we update icsk_ack.rcv_mss.
                 */
                if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
                        u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;

                        do_div(val, skb->truesize);
                        /* Never store a zero ratio. */
                        tcp_sk(sk)->scaling_ratio = val ? val : 1;
                }
                icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
                                               tcp_sk(sk)->advmss);
                /* Account for possibly-removed options */
                DO_ONCE_LITE_IF(len > icsk->icsk_ack.rcv_mss + MAX_TCP_OPTION_SPACE,
                                tcp_gro_dev_warn, sk, skb, len);
                /* If the skb has a len of exactly 1*MSS and has the PSH bit
                 * set then it is likely the end of an application write. So
                 * more data may not be arriving soon, and yet the data sender
                 * may be waiting for an ACK if cwnd-bound or using TX zero
                 * copy. So we set ICSK_ACK_PUSHED here so that
                 * tcp_cleanup_rbuf() will send an ACK immediately if the app
                 * reads all of the data and is not ping-pong. If len > MSS
                 * then this logic does not matter (and does not hurt) because
                 * tcp_cleanup_rbuf() will always ACK immediately if the app
                 * reads data and there is more than an MSS of unACKed data.
                 */
                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH)
                        icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
        } else {
                /* Otherwise, we make more careful check taking into account,
                 * that SACKs block is variable.
                 *
                 * "len" is invariant segment length, including TCP header.
                 */
                len += skb->data - skb_transport_header(skb);
                if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
                    /* If PSH is not set, packet should be
                     * full sized, provided peer TCP is not badly broken.
                     * This observation (if it is correct 8)) allows
                     * to handle super-low mtu links fairly.
                     */
                    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
                     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
                        /* Subtract also invariant (if peer is RFC compliant),
                         * tcp header plus fixed timestamp option length.
                         * Resulting "len" is MSS free of SACK jitter.
                         */
                        len -= tcp_sk(sk)->tcp_header_len;
                        icsk->icsk_ack.last_seg_size = len;
                        /* Same size as recorded by the previous call:
                         * accept it as the new rcv_mss and leave the
                         * pending bits alone.
                         */
                        if (len == lss) {
                                icsk->icsk_ack.rcv_mss = len;
                                return;
                        }
                }
                /* A short segment while PUSHED is already pending escalates
                 * to PUSHED2; PUSHED itself is set in every case.
                 */
                if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
                        icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
                icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
        }
}

/* Raise the quick-ACK budget to the number of rcv_mss-sized segments that
 * fit in half of the receive window, capped at max_quickacks. A computed
 * budget of zero is replaced by 2. The stored budget is never lowered here.
 */
static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        unsigned int budget;

        budget = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
        if (!budget)
                budget = 2;
        if (budget > max_quickacks)
                budget = max_quickacks;

        if (icsk->icsk_ack.quick < budget)
                icsk->icsk_ack.quick = budget;
}

static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        tcp_incr_quickack(sk, max_quickacks);
        inet_csk_exit_pingpong_mode(sk);
        icsk->icsk_ack.ato = TCP_ATO_MIN;
}

/* Send ACKs quickly, if "quick" count is not exhausted
 * and the session is not interactive.
 */

static bool tcp_in_quickack_mode(struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        const struct dst_entry *dst = __sk_dst_get(sk);

        return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
                (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
}

/* Set TCP_ECN_QUEUE_CWR in tp->ecn_flags, but only when TCP_ECN_OK is
 * already set for this connection.
 */
static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
{
        if (!(tp->ecn_flags & TCP_ECN_OK))
                return;

        tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}

/* Handle the CWR bit of an incoming segment: clear TCP_ECN_DEMAND_CWR and,
 * when the segment occupies sequence space, request an immediate ACK.
 */
static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
{
        if (!tcp_hdr(skb)->cwr)
                return;

        tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;

        /* A sender announcing CWR may be running with a very small cwnd
         * (possibly a single packet), so do not delay the ACK for a
         * segment whose seq and end_seq differ.
         */
        if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
                inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}

/* Clear TCP_ECN_QUEUE_CWR in tp->ecn_flags; all other flags are kept. */
static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
{
        tp->ecn_flags = tp->ecn_flags & ~TCP_ECN_QUEUE_CWR;
}

/* React to the ECN field (ip_dsfield & INET_ECN_MASK) of a received
 * segment. The caller is expected to have checked that ECN is in use.
 */
static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned int ecn = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;

        if (ecn == INET_ECN_NOT_ECT) {
                /* A segment without ECT, after ECT has already been seen
                 * on an earlier segment, is probably a retransmit: ACK
                 * it quickly. TCP_ECN_SEEN is not touched in this case.
                 */
                if (tp->ecn_flags & TCP_ECN_SEEN)
                        tcp_enter_quickack_mode(sk, 2);
                return;
        }

        if (ecn == INET_ECN_CE) {
                if (tcp_ca_needs_ecn(sk))
                        tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);

                if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
                        /* The sender may have a very low cwnd, so avoid
                         * delaying ACKs.
                         */
                        tcp_enter_quickack_mode(sk, 2);
                        tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
                }
        } else {
                /* Any remaining codepoint: ECT set, no congestion mark. */
                if (tcp_ca_needs_ecn(sk))
                        tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
        }

        tp->ecn_flags |= TCP_ECN_SEEN;
}

/* Run the ECN codepoint handling for skb only when TCP_ECN_OK is set. */
static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
{
        if (!(tcp_sk(sk)->ecn_flags & TCP_ECN_OK))
                return;

        __tcp_ecn_check_ce(sk, skb);
}

/* Re-evaluate TCP_ECN_OK from the flags of a received SYN-ACK. */
static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
{
        if (!(tp->ecn_flags & TCP_ECN_OK))
                return;

        /* ECN stays enabled only when ECE is set and CWR is clear. */
        if (th->ece && !th->cwr)
                return;

        tp->ecn_flags &= ~TCP_ECN_OK;
}

/* Re-evaluate TCP_ECN_OK from the flags of a received SYN. */
static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
{
        if (!(tp->ecn_flags & TCP_ECN_OK))
                return;

        /* ECN stays enabled only when both ECE and CWR are set. */
        if (th->ece && th->cwr)
                return;

        tp->ecn_flags &= ~TCP_ECN_OK;
}

/* Return true when th carries ECE on a non-SYN segment and TCP_ECN_OK is
 * set in tp->ecn_flags.
 */
static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
        return th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK);
}

/* Buffer size and advertised window tuning.
 *
 * 1. Tuning sk->sk_sndbuf, when connection enters established state.
 */

static void tcp_sndbuf_expand(struct sock *sk)
{
        const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
        const struct tcp_sock *tp = tcp_sk(sk);
        int skb_cost, wanted;
        u32 segs;

        /* Memory cost of one segment in the worst case (no GSO/TSO): every
         * frame uses its own skb, and skb->head comes from a kmalloc area
         * whose size is a power of two.
         */
        skb_cost = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
                   MAX_TCP_HEADER +
                   SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        skb_cost = roundup_pow_of_two(skb_cost) +
                   SKB_DATA_ALIGN(sizeof(struct sk_buff));

        /* Segments to budget for: the largest of the initial cwnd, the
         * current cwnd and reordering + 1.
         */
        segs = max_t(u32, TCP_INIT_CWND, tcp_snd_cwnd(tp));
        segs = max_t(u32, segs, tp->reordering + 1);

        /* Fast Recovery (RFC 5681 3.2): Cubic needs a factor of 1.7, which
         * is rounded up to 2 for extra cushion (the application might
         * react slowly to EPOLLOUT). A congestion control module may
         * provide its own factor through ->sndbuf_expand().
         */
        if (ca_ops->sndbuf_expand)
                wanted = ca_ops->sndbuf_expand(sk);
        else
                wanted = 2;
        wanted *= segs * skb_cost;

        /* Only ever grow sk_sndbuf, and never past sysctl_tcp_wmem[2]. */
        if (sk->sk_sndbuf >= wanted)
                return;

        WRITE_ONCE(sk->sk_sndbuf,
                   min(wanted, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2])));
}

/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
 *
 * All tcp_full_space() is split to two parts: "network" buffer, allocated
 * forward and advertised in receiver window (tp->rcv_wnd) and
 * "application buffer", required to isolate scheduling/application
 * latencies from network.
 * window_clamp is maximal advertised window. It can be less than
 * tcp_full_space(), in this case tcp_full_space() - window_clamp
 * is reserved for "application" buffer. The less window_clamp is
 * the smoother our behaviour from viewpoint of network, but the lower
 * throughput and the higher sensitivity of the connection to losses. 8)
 *
 * rcv_ssthresh is more strict window_clamp used at "slow start"
 * phase to predict further behaviour of this connection.
 * It is used for two goals:
 * - to enforce header prediction at sender, even when application
 *   requires some significant "application buffer". It is check #1.
 * - to prevent pruning of receive queue because of misprediction
 *   of receiver window. Check #2.
 *
 * The scheme does not work when sender sends good segments opening
 * window and then starts to feed us spaghetti. But it should work
 * in common situations. Otherwise, we have to rely on queue collapsing.
 */

/* Slow part of check#2. */
static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
                             unsigned int skbtruesize)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        /* Optimize this! */
        int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
        int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1;

        /* Halve the scaled truesize and the window bound together. Stop
         * with a grant of two rcv_mss once the scaled truesize no longer
         * exceeds the payload length, or with no grant once rcv_ssthresh
         * is above the window bound.
         */
        for (; tp->rcv_ssthresh <= window; truesize >>= 1, window >>= 1) {
                if (truesize <= skb->len)
                        return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
        }

        return 0;
}

/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing
 * can play nice with us, as sk_buff and skb->head might be either
 * freed or shared with up to MAX_SKB_FRAGS segments.
 * Only give a boost to drivers using page frag(s) to hold the frame(s),
 * and if no payload was pulled in skb->head before reaching us.
 */
static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
{
        u32 trimmed;

        /* No discount unless asked for, and only when nothing sits in the
         * linear part of the skb.
         */
        if (!adjust || skb_headlen(skb))
                return skb->truesize;

        trimmed = skb->truesize - SKB_TRUESIZE(skb_end_offset(skb));

        /* paranoid check, some drivers might be buggy */
        if (unlikely((int)trimmed < (int)skb->len))
                return skb->truesize;

        return trimmed;
}

/* Possibly raise tp->rcv_ssthresh after receiving skb. "adjust" is passed
 * through to truesize_adjust().
 */
static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
                            bool adjust)
{
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned int truesize;
        int room, incr;

        /* Headroom left below min(window_clamp, tcp_space()). */
        room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
        if (room <= 0)
                return;

        /* Check #1 */
        if (tcp_under_memory_pressure(sk)) {
                /* Under pressure:
                 * Adjust rcv_ssthresh according to reserved mem
                 */
                tcp_adjust_rcv_ssthresh(sk);
                return;
        }

        truesize = truesize_adjust(adjust, skb);

        /* Check #2. Increase window, if skb with such overhead
         * will fit to rcvbuf in future.
         */
        if (tcp_win_from_space(sk, truesize) <= skb->len)
                incr = 2 * tp->advmss;
        else
                incr = __tcp_grow_window(sk, skb, truesize);

        if (!incr)
                return;

        /* Grow by at least twice the payload, bounded by the headroom. */
        incr = max_t(int, incr, 2 * skb->len);
        tp->rcv_ssthresh += min(room, incr);
        inet_csk(sk)->icsk_ack.quick |= 1;
}

/* 3. Try to fixup all. It is made immediately after connection enters
 *    established state.
 */
static void tcp_init_buffer_space(struct sock *sk)
{
        int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win);
        struct tcp_sock *tp = tcp_sk(sk);
        int maxwin;

        /* Size the send buffer unless SOCK_SNDBUF_LOCK is set. */
        if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
                tcp_sndbuf_expand(sk);

        /* Start the receive-queue space measurement at the current time
         * and at the current copied_seq.
         */
        tcp_mstamp_refresh(tp);
        tp->rcvq_space.time = tp->tcp_mstamp;
        tp->rcvq_space.seq = tp->copied_seq;

        maxwin = tcp_full_space(sk);

        /* window_clamp may not exceed the full space. With a non-zero
         * tcp_app_win and more than four advmss of space, hold back
         * maxwin >> tcp_app_win of it, but keep at least four advmss.
         */
        if (tp->window_clamp >= maxwin) {
                WRITE_ONCE(tp->window_clamp, maxwin);

                if (tcp_app_win && maxwin > 4 * tp->advmss)
                        WRITE_ONCE(tp->window_clamp,
                                   max(maxwin - (maxwin >> tcp_app_win),
                                       4 * tp->advmss));
        }

        /* Force reservation of one segment. */
        if (tcp_app_win &&
            tp->window_clamp > 2 * tp->advmss &&
            tp->window_clamp + tp->advmss > maxwin)
                WRITE_ONCE(tp->window_clamp,
                           max(2 * tp->advmss, maxwin - tp->advmss));

        /* rcv_ssthresh must stay within the (possibly reduced) clamp. */
        tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
        tp->snd_cwnd_stamp = tcp_jiffies32;
        /* Initial value for the per-RTT "copied" threshold used by
         * tcp_rcv_space_adjust(): the smallest of rcv_ssthresh, rcv_wnd
         * and TCP_INIT_CWND segments of advmss bytes.
         */
        tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd,
                                    (u32)TCP_INIT_CWND * tp->advmss);
}

/* 4. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct net *net = sock_net(sk);
        int rmem2;

        icsk->icsk_ack.quick = 0;
        rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);

        if (sk->sk_rcvbuf < rmem2 &&
            !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
            !tcp_under_memory_pressure(sk) &&
            sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
                WRITE_ONCE(sk->sk_rcvbuf,
                           min(atomic_read(&sk->sk_rmem_alloc), rmem2));
        }
        if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
                tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
}

/* Initialize RCV_MSS value.
 * RCV_MSS is our guess about the MSS used by the peer.
 * We do not have any direct information about that MSS.
 * It is better to underestimate RCV_MSS than to overestimate it.
 * Overestimation makes us ACK less frequently than needed.
 * Underestimation is easier to detect and fix in tcp_measure_rcv_mss().
 */
void tcp_initialize_rcv_mss(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        unsigned int hint = tp->advmss;

        /* Start from the smaller of advmss and mss_cache ... */
        if (hint > tp->mss_cache)
                hint = tp->mss_cache;

        /* ... keep it at or below half of the receive window ... */
        if (hint > tp->rcv_wnd / 2)
                hint = tp->rcv_wnd / 2;

        /* ... and finally force it into [TCP_MIN_MSS, TCP_MSS_DEFAULT],
         * the lower bound being applied last.
         */
        if (hint > TCP_MSS_DEFAULT)
                hint = TCP_MSS_DEFAULT;
        if (hint < TCP_MIN_MSS)
                hint = TCP_MIN_MSS;

        inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
EXPORT_SYMBOL(tcp_initialize_rcv_mss);

/* Receiver "autotuning" code.
 *
 * The algorithm for RTT estimation w/o timestamps is based on
 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
 * <https://public.lanl.gov/radiant/pubs.html#DRS>
 *
 * More detail on this code can be found at
 * <http://staff.psc.edu/jheffner/>,
 * though this reference is out of date.  A new paper
 * is pending.
 */
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
        u32 est = tp->rcv_rtt_est.rtt_us;
        long m = sample;

        if (est == 0) {
                /* No previous measure: store the sample scaled by 8. */
                est = m << 3;
        } else if (win_dep) {
                /* Without timestamps, larger samples could grossly
                 * overestimate the RTT, especially with chatty
                 * applications or bulk transfers stalled on filesystem
                 * I/O. Only a minimum is tracked in that case, with no
                 * smoothing, because smoothing would make convergence
                 * take too long.
                 */
                m <<= 3;
                if (m < est)
                        est = m;
        } else {
                /* Smoothed update: est = 7/8 * est + sample (est is kept
                 * scaled by 8).
                 */
                m -= (est >> 3);
                est += m;
        }

        tp->rcv_rtt_est.rtt_us = est;
}

/* Receiver-side RTT sampling: one sample is taken each time rcv_nxt
 * reaches the sequence recorded when the previous measurement started
 * (rcv_nxt + rcv_wnd at that time).
 */
static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
        if (tp->rcv_rtt_est.time != 0) {
                u32 delta_us;

                /* The current measurement round is not finished yet. */
                if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
                        return;

                delta_us = tcp_stamp_us_delta(tp->tcp_mstamp,
                                              tp->rcv_rtt_est.time);
                if (!delta_us)
                        delta_us = 1;
                tcp_rcv_rtt_update(tp, delta_us, 1);
        }

        /* Start a new measurement round. */
        tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
        tp->rcv_rtt_est.time = tp->tcp_mstamp;
}

/* RTT derived from the echoed timestamp (rx_opt.rcv_tsecr), in usec.
 * Returns -1 when the tick delta is too large to convert.
 */
static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
{
        u32 delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr;

        /* With tcp_usec_ts set the delta is returned unconverted. */
        if (tp->tcp_usec_ts)
                return delta;

        /* Otherwise convert ticks to usec, treating a zero delta as one
         * tick, provided the product stays below INT_MAX.
         */
        if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ)))
                return (delta ? delta : 1) * (USEC_PER_SEC / TCP_TS_HZ);

        return -1;
}

/* Timestamp-based receiver RTT sampling: at most one sample per distinct
 * echoed timestamp value, and only from segments carrying at least
 * rcv_mss bytes of sequence space.
 */
static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
                                          const struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        s32 rtt_us;

        if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
                return;
        tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;

        if (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq <
            inet_csk(sk)->icsk_ack.rcv_mss)
                return;

        rtt_us = tcp_rtt_tsopt_us(tp);
        if (rtt_us >= 0)
                tcp_rcv_rtt_update(tp, rtt_us, 0);
}

/*
 * This function should be called every time data is copied to user space.
 * It calculates the appropriate TCP receive buffer space.
 */
void tcp_rcv_space_adjust(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 copied;
        int time;

        trace_tcp_rcv_space_adjust(sk);

        /* Act at most once per receiver RTT estimate (rtt_us is stored
         * scaled by 8, hence the >> 3), and not at all while no estimate
         * exists yet.
         */
        tcp_mstamp_refresh(tp);
        time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
        if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
                return;

        /* Number of bytes copied to user in last RTT */
        copied = tp->copied_seq - tp->rcvq_space.seq;
        /* The reader did not exceed the previous threshold: leave the
         * buffer alone and just start a new measurement period.
         */
        if (copied <= tp->rcvq_space.space)
                goto new_measure;

        /* A bit of theory :
         * copied = bytes received in previous RTT, our base window
         * To cope with packet losses, we need a 2x factor
         * To cope with slow start, and sender growing its cwin by 100 %
         * every RTT, we need a 4x factor, because the ACK we are sending
         * now is for the next RTT, not the current one :
         * <prev RTT . ><current RTT .. ><next RTT .... >
         */

        /* Resize only when sysctl_tcp_moderate_rcvbuf is enabled and
         * SOCK_RCVBUF_LOCK is not set.
         */
        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
            !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
                u64 rcvwin, grow;
                int rcvbuf;

                /* minimal window to cope with packet losses, assuming
                 * steady state. Add some cushion because of small variations.
                 */
                rcvwin = ((u64)copied << 1) + 16 * tp->advmss;

                /* Accommodate for sender rate increase (eg. slow start) */
                grow = rcvwin * (copied - tp->rcvq_space.space);
                /* NOTE(review): divides by rcvq_space.space; this assumes
                 * it is non-zero at this point - confirm against callers.
                 */
                do_div(grow, tp->rcvq_space.space);
                rcvwin += (grow << 1);

                /* Convert the window to buffer space, capped at
                 * sysctl_tcp_rmem[2]; the buffer is only ever grown here.
                 */
                rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
                               READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
                if (rcvbuf > sk->sk_rcvbuf) {
                        WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);

                        /* Make the window clamp follow along.  */
                        WRITE_ONCE(tp->window_clamp,
                                   tcp_win_from_space(sk, rcvbuf));
                }
        }
        /* The new threshold is what was copied during this period. */
        tp->rcvq_space.space = copied;

new_measure:
        /* Start the next measurement period from here. */
        tp->rcvq_space.seq = tp->copied_seq;
        tp->rcvq_space.time = tp->tcp_mstamp;
}

/* For IPv6 packets, remember the flow label of skb (converted to host
 * byte order) in icsk_ack.lrcv_flowlabel. Compiles to nothing when
 * CONFIG_IPV6 is disabled.
 */
static void tcp_save_lrcv_flowlabel(struct sock *sk, const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_IPV6)
        if (skb->protocol != htons(ETH_P_IPV6))
                return;

        inet_csk(sk)->icsk_ack.lrcv_flowlabel =
                ntohl(ip6_flowlabel(ipv6_hdr(skb)));
#endif
}

/* There is something which you must keep in mind when you analyze the
 * behavior of the tp->ato delayed ack timeout interval.  When a
 * connection starts up, we want to ack as quickly as possible.  The
 * problem is that "good" TCP's do slow start at the beginning of data
 * transmission.  This means that until we send the first few ACK's the
 * sender will sit on his end and only queue most of his data, because
 * he can only send snd_cwnd unacked packets at any given time.  For
 * each ACK we send, he increments snd_cwnd and transmits more of his
 * queue.  -DaveM
 */
static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        u32 now;

        /* An ACK is now owed for this data. */
        inet_csk_schedule_ack(sk);

        /* Refresh the peer-MSS guess and the receiver-side RTT sample. */
        tcp_measure_rcv_mss(sk, skb);

        tcp_rcv_rtt_measure(tp);

        now = tcp_jiffies32;

        if (!icsk->icsk_ack.ato) {
                /* The _first_ data packet received, initialize
                 * delayed ACK engine.
                 */
                tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
                icsk->icsk_ack.ato = TCP_ATO_MIN;
        } else {
                /* m: time elapsed since the previous data segment. */
                int m = now - icsk->icsk_ack.lrcvtime;

                if (m <= TCP_ATO_MIN / 2) {
                        /* The fastest case is the first. */
                        /* Halve ato and add TCP_ATO_MIN / 2. */
                        icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
                } else if (m < icsk->icsk_ack.ato) {
                        /* Halve ato and add the gap, bounded by the RTO. */
                        icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
                        if (icsk->icsk_ack.ato > icsk->icsk_rto)
                                icsk->icsk_ack.ato = icsk->icsk_rto;
                } else if (m > icsk->icsk_rto) {
                        /* Too long gap. Apparently sender failed to
                         * restart window, so that we send ACKs quickly.
                         */
                        tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
                }
        }
        icsk->icsk_ack.lrcvtime = now;
        tcp_save_lrcv_flowlabel(sk, skb);

        tcp_ecn_check_ce(sk, skb);

        /* Only segments of at least 128 bytes may grow the window. */
        if (skb->len >= 128)
                tcp_grow_window(sk, skb, true);
}

/* Called to compute a smoothed rtt estimate. The data fed to this
 * routine either comes from timestamps, or from segments that were
 * known _not_ to have been retransmitted [see Karn/Partridge
 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 * piece by Van Jacobson.
 * NOTE: the next three routines used to be one big routine.
 * To save cycles in the RFC 1323 implementation it was better to break
 * it up into three procedures. -- erics
 */
static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
{
        struct tcp_sock *tp = tcp_sk(sk);
        long m = mrtt_us; /* RTT */
        /* Working copy of the smoothed RTT; it is kept scaled by 8. */
        u32 srtt = tp->srtt_us;

        /*        The following amusing code comes from Jacobson's
         *        article in SIGCOMM '88.  Note that rtt and mdev
         *        are scaled versions of rtt and mean deviation.
         *        This is designed to be as fast as possible
         *        m stands for "measurement".
         *
         *        On a 1990 paper the rto value is changed to:
         *        RTO = rtt + 4 * mdev
         *
         * Funny. This algorithm seems to be very broken.
         * These formulae increase RTO, when it should be decreased, increase
         * too slowly, when it should be increased quickly, decrease too quickly
         * etc. I guess in BSD RTO takes ONE value, so that it is absolutely
         * does not matter how to _calculate_ it. Seems, it was trap
         * that VJ failed to avoid. 8)
         */
        if (srtt != 0) {
                m -= (srtt >> 3);        /* m is now error in rtt est */
                srtt += m;                /* rtt = 7/8 rtt + 1/8 new */
                if (m < 0) {
                        m = -m;                /* m is now abs(error) */
                        m -= (tp->mdev_us >> 2);   /* similar update on mdev */
                        /* This is similar to one of Eifel findings.
                         * Eifel blocks mdev updates when rtt decreases.
                         * This solution is a bit different: we use finer gain
                         * for mdev in this case (alpha*beta).
                         * Like Eifel it also prevents growth of rto,
                         * but also it limits too fast rto decreases,
                         * happening in pure Eifel.
                         */
                        if (m > 0)
                                m >>= 3;
                } else {
                        m -= (tp->mdev_us >> 2);   /* similar update on mdev */
                }
                tp->mdev_us += m;                /* mdev = 3/4 mdev + 1/4 new */
                /* Track the largest mdev of the current window and let
                 * rttvar follow it upwards immediately.
                 */
                if (tp->mdev_us > tp->mdev_max_us) {
                        tp->mdev_max_us = tp->mdev_us;
                        if (tp->mdev_max_us > tp->rttvar_us)
                                tp->rttvar_us = tp->mdev_max_us;
                }
                /* Once snd_una has passed rtt_seq, close the window: move
                 * rttvar a quarter of the way down towards mdev_max,
                 * restart the window at snd_nxt and reset mdev_max to
                 * tcp_rto_min_us().
                 */
                if (after(tp->snd_una, tp->rtt_seq)) {
                        if (tp->mdev_max_us < tp->rttvar_us)
                                tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
                        tp->rtt_seq = tp->snd_nxt;
                        tp->mdev_max_us = tcp_rto_min_us(sk);

                        tcp_bpf_rtt(sk, mrtt_us, srtt);
                }
        } else {
                /* no previous measure. */
                srtt = m << 3;                /* take the measured time to be rtt */
                tp->mdev_us = m << 1;        /* make sure rto = 3*rtt */
                tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
                tp->mdev_max_us = tp->rttvar_us;
                tp->rtt_seq = tp->snd_nxt;

                tcp_bpf_rtt(sk, mrtt_us, srtt);
        }
        /* Never store 0: a zero srtt is what selects the "no previous
         * measure" branch above.
         */
        tp->srtt_us = max(1U, srtt);
}

/* Recompute sk->sk_pacing_rate from mss_cache, the larger of cwnd and
 * packets_out, srtt_us and the slow-start or congestion-avoidance pacing
 * ratio sysctl, capped at sk_max_pacing_rate.
 */
static void tcp_update_pacing_rate(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        u64 rate;

        /* Base factor: mss * (USEC_PER_SEC / 100) << 3. The division by
         * 100 matches the percentage ratios applied below, and the << 3
         * matches srtt_us being stored scaled by 8.
         */
        rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);

        /* current rate is (cwnd * mss) / srtt
         * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
         * In Congestion Avoidance phase, set it to 120 % the current rate.
         *
         * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
         *         If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
         *         end of slow start and should slow down.
         */
        rate *= (tcp_snd_cwnd(tp) < tp->snd_ssthresh / 2) ?
                READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio) :
                READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio);

        rate *= max(tcp_snd_cwnd(tp), tp->packets_out);

        /* Without an RTT sample the division is simply skipped. */
        if (likely(tp->srtt_us))
                do_div(rate, tp->srtt_us);

        /* sch_fq reads sk_pacing_rate without holding any lock, so
         * WRITE_ONCE() is used to keep the compiler from storing
         * intermediate values in this location.
         */
        WRITE_ONCE(sk->sk_pacing_rate,
                   min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)));
}

/* Calculate rto without backoff.  This is the second half of Van Jacobson's
 * routine referred to above.
 */
static void tcp_set_rto(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        /* Old crap is replaced with new one. 8)
         *
         * More seriously:
         * 1. If rtt variance happened to be less 50msec, it is hallucination.
         *    It cannot be less due to utterly erratic ACK generation made
         *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_
         *    to do with delayed acks, because at cwnd>2 true delack timeout
         *    is invisible. Actually, Linux-2.4 also generates erratic
         *    ACKs in some circumstances.
         */
        inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);

        /* 2. Fixups made earlier cannot be right.
         *    If we do not estimate RTO correctly without them,
         *    all the algo is pure shit and should be replaced
         *    with correct one. It is exactly, which we pretend to do.
         */

        /* NOTE: clamping at TCP_RTO_MIN is not required, current algo
         * guarantees that rto is higher.
         */
        tcp_bound_rto(sk);
}

/* Initial congestion window: the RTAX_INITCWND metric of dst when dst is
 * given and the metric is non-zero, TCP_INIT_CWND otherwise; in both
 * cases bounded by tp->snd_cwnd_clamp.
 */
__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
{
        __u32 cwnd = 0;

        if (dst)
                cwnd = dst_metric(dst, RTAX_INITCWND);
        if (cwnd == 0)
                cwnd = TCP_INIT_CWND;

        return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}

/* Scratch state passed to the SACK/DSACK processing helpers. */
struct tcp_sacktag_state {
        /* Timestamps for earliest and latest never-retransmitted segment
         * that was SACKed. RTO needs the earliest RTT to stay conservative,
         * but congestion control should still get an accurate delay signal.
         */
        u64        first_sackt;
        u64        last_sackt;
        /* NOTE(review): presumably a sequence number used for reordering
         * detection - confirm against the SACK tagging code.
         */
        u32        reord;
        /* Count of delivered segments; tcp_dsack_seen() adds the number of
         * duplicate segments reported by an accepted DSACK.
         */
        u32        sack_delivered;
        /* FLAG_* bits; tcp_dsack_seen() sets FLAG_DSACK_TLP and
         * FLAG_DSACKING_ACK here.
         */
        int        flag;
        /* NOTE(review): presumably the current sending MSS - confirm. */
        unsigned int mss_now;
        /* NOTE(review): presumably the rate sample being filled in for
         * this ACK - confirm against users of this struct.
         */
        struct rate_sample *rate;
};

/* Take a notice that peer is sending D-SACKs. Skip update of data delivery
 * and spurious retransmission information if this DSACK is unlikely caused by
 * sender's action:
 * - DSACKed sequence range is larger than maximum receiver's window.
 * - Total no. of DSACKed segments exceed the total no. of retransmitted segs.
 */
static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
                          u32 end_seq, struct tcp_sacktag_state *state)
{
        /* A DSACK covering at most one mss_cache counts as one segment. */
        u32 seq_len, dup_segs = 1;

        /* Ignore an empty or inverted range. Returns 0 here and whenever
         * the DSACK is skipped below; otherwise the number of duplicate
         * segments it reports.
         */
        if (!before(start_seq, end_seq))
                return 0;

        seq_len = end_seq - start_seq;
        /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */
        if (seq_len > tp->max_window)
                return 0;
        if (seq_len > tp->mss_cache)
                dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
        else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq)
                state->flag |= FLAG_DSACK_TLP;

        /* dsack_dups is updated before the plausibility check below, so
         * a skipped DSACK still counts towards the running total.
         */
        tp->dsack_dups += dup_segs;
        /* Skip the DSACK if dup segs weren't retransmitted by sender */
        if (tp->dsack_dups > tp->total_retrans)
                return 0;

        tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
        /* We increase the RACK ordering window in rounds where we receive
         * DSACKs that may have been due to reordering causing RACK to trigger
         * a spurious fast recovery. Thus RACK ignores DSACKs that happen
         * without having seen reordering, or that match TLP probes (TLP
         * is timer-driven, not triggered by RACK).
         */
        if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP))
                tp->rack.dsack_seen = 1;

        state->flag |= FLAG_DSACKING_ACK;
        /* A spurious retransmission is delivered */
        state->sack_delivered += dup_segs;

        return dup_segs;
}

/* It's reordering when higher sequence was delivered (i.e. sacked) before
 * some lower never-retransmitted sequence ("low_seq"). The maximum reordering
 * distance is approximated in full-mss packet distance ("reordering").
 */
static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
                                      const int ts)
{
        struct tcp_sock *tp = tcp_sk(sk);
        const u32 mss = tp->mss_cache;
        const u32 fack = tcp_highest_sack_seq(tp);
        u32 distance;

        /* Nothing was sacked above low_seq: no reordering. */
        if (!before(low_seq, fack))
                return;

        /* Displacement in bytes; raise tp->reordering (counted in
         * full-mss packets, rounded up) when it exceeds the current value,
         * bounded by sysctl_tcp_max_reordering.
         */
        distance = fack - low_seq;
        if ((distance > tp->reordering * mss) && mss) {
#if FASTRETRANS_DEBUG > 1
                pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
                         tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
                         tp->reordering,
                         0,
                         tp->sacked_out,
                         tp->undo_marker ? tp->undo_retrans : 0);
#endif
                tp->reordering = min_t(u32, (distance + mss - 1) / mss,
                                       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
        }

        /* Count the event, in the socket and in the per-netns MIB. */
        tp->reord_seen++;
        NET_INC_STATS(sock_net(sk),
                      ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
}

 /* This must be called before lost_out or retrans_out are updated
  * on a new loss, because we want to know if all skbs previously
  * known to be lost have already been retransmitted, indicating
  * that this newly lost skb is our next skb to retransmit.
  */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
        const struct sk_buff *hint = tp->retransmit_skb_hint;

        if (hint) {
                /* Keep the existing hint unless skb starts before it. */
                if (!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(hint)->seq))
                        return;
        } else if (tp->retrans_out < tp->lost_out) {
                /* No hint, and some lost skbs are not yet retransmitted. */
                return;
        }

        tp->retransmit_skb_hint = skb;
}

/* Sum the number of packets on the wire we have marked as lost, and
 * notify the congestion control module that the given skb was marked lost.
 */
static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
{
        /* Only the tp->lost counter is updated here: it grows by the
         * packet count of skb.
         */
        tp->lost = tp->lost + tcp_skb_pcount(skb);
}

/* Mark skb as lost and keep the loss accounting in sync. An skb that is
 * already SACKed is left untouched.
 */
void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        __u8 sacked = TCP_SKB_CB(skb)->sacked;

        if (sacked & TCPCB_SACKED_ACKED)
                return;

        tcp_verify_retransmit_hint(tp, skb);

        if (!(sacked & TCPCB_LOST)) {
                /* First time this skb is declared lost. */
                tp->lost_out += tcp_skb_pcount(skb);
                TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                tcp_notify_skb_loss_event(tp, skb);
                return;
        }

        /* Already marked lost: only a pending retransmit needs handling. */
        if (!(sacked & TCPCB_SACKED_RETRANS))
                return;

        /* Account for retransmits that are lost again */
        TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
        tp->retrans_out -= tcp_skb_pcount(skb);
        NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
                      tcp_skb_pcount(skb));
        tcp_notify_skb_loss_event(tp, skb);
}

/* Add @delivered to tp->delivered and, when @ece_ack is set, to
 * tp->delivered_ce as well.
 */
static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
                                bool ece_ack)
{
        if (ece_ack)
                tp->delivered_ce += delivered;

        tp->delivered += delivered;
}

/* This procedure tags the retransmission queue when SACKs arrive.
 *
 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
 * Packets in queue with these bits set are counted in variables
 * sacked_out, retrans_out and lost_out, correspondingly.
 *
 * Valid combinations are:
 * Tag  InFlight        Description
 * 0        1                - orig segment is in flight.
 * S        0                - nothing flies, orig reached receiver.
 * L        0                - nothing flies, orig lost by net.
 * R        2                - both orig and retransmit are in flight.
 * L|R        1                - orig is lost, retransmit is in flight.
 * S|R  1                - orig reached receiver, retrans is still in flight.
 * (L|S|R is logically valid, it could occur when L|R is sacked,
 *  but it is equivalent to plain S and code short-circuits it to S.
 *  L|S is logically invalid, it would mean -1 packet in flight 8))
 *
 * These 6 states form finite state machine, controlled by the following events:
 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
 * 3. Loss detection event of two flavors:
 *        A. Scoreboard estimator decided the packet is lost.
 *           A'. Reno "three dupacks" marks head of queue lost.
 *        B. SACK arrives sacking SND.NXT at the moment, when the
 *           segment was retransmitted.
 * 4. D-SACK added new rule: D-SACK changes any tag to S.
 *
 * It is pleasant to note, that state diagram turns out to be commutative,
 * so that we are allowed not to be bothered by order of our actions,
 * when multiple events arrive simultaneously. (see the function below).
 *
 * Reordering detection.
 * --------------------
 * Reordering metric is maximal distance, which a packet can be displaced
 * in packet stream. With SACKs we can estimate it:
 *
 * 1. SACK fills old hole and the corresponding segment was not
 *    ever retransmitted -> reordering. Alas, we cannot use it
 *    when segment was retransmitted.
 * 2. The last flaw is solved with D-SACK. D-SACK arrives
 *    for a retransmitted and already SACKed segment -> reordering.
 * Both of these heuristics are not used in Loss state, when we cannot
 * account for retransmits accurately.
 *
 * SACK block validation.
 * ----------------------
 *
 * SACK block range validation checks that the received SACK block fits
 * within the expected sequence limits, i.e., it is between SND.UNA and
 * SND.NXT. Note that SND.UNA is not included in the range, though it is
 * valid, because it means that the receiver is rather inconsistent with
 * itself, reporting SACK reneging when it should advance SND.UNA. Such a
 * SACK block is perfectly valid, however, in light of RFC2018 which
 * explicitly states that "SACK block MUST reflect the newest segment.  Even
 * if the newest segment is going to be discarded ...", not that it looks
 * very clever in case of head skb. Due to potential receiver driven
 * attacks, we choose to avoid immediate execution of a walk in write queue
 * due to reneging and defer head skb's loss recovery to standard loss
 * recovery procedure that will eventually trigger (nothing forbids us
 * doing this).
 *
 * Implements also blockage to start_seq wrap-around. Problem lies in the
 * fact that though start_seq (s) is before end_seq (i.e., not reversed),
 * there's no guarantee that it will be before snd_nxt (n). The problem
 * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
 * wrap (s_w):
 *
 *         <- outs wnd ->                          <- wrapzone ->
 *         u     e      n                         u_w   e_w  s n_w
 *         |     |      |                          |     |   |  |
 * |<------------+------+----- TCP seqno space --------------+---------->|
 * ...-- <2^31 ->|                                           |<--------...
 * ...---- >2^31 ------>|                                    |<--------...
 *
 * Current code wouldn't be vulnerable but it's better still to discard such
 * crazy SACK blocks. Doing this check for start_seq alone closes somewhat
 * similar case (end_seq after snd_nxt wrap) as earlier reversed check in
 * snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
 * equal to the ideal case (infinite seqno space without wrap caused issues).
 *
 * With D-SACK the lower bound is extended to cover sequence space below
 * SND.UNA down to undo_marker, which is the last point of interest. Yet
 * again, a D-SACK block must not go across snd_una (for the same reason as
 * for the normal SACK blocks, explained above). But there all simplicity
 * ends: TCP might receive valid D-SACKs below that. As long as they reside
 * fully below undo_marker they do not affect behavior in any way and can
 * therefore be safely ignored. In rare cases (which are more or less
 * theoretical ones), the D-SACK will nicely cross that boundary due to skb
 * fragmentation and packet reordering past skb's retransmission. To consider
 * them correctly, the acceptable range must be extended even more, though
 * the exact amount is rather hard to quantify. However, tp->max_window can
 * be used as an exaggerated estimate.
 */
static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
                                   u32 start_seq, u32 end_seq)
{
        u32 marker;

        /* Reversed or empty block (ambiguous), or reaching past SND.NXT */
        if (!before(start_seq, end_seq) || after(end_seq, tp->snd_nxt))
                return false;

        /* start_seq wrap-around guard (see comments above) */
        if (!before(start_seq, tp->snd_nxt))
                return false;

        /* Starts strictly inside the outstanding window: accept, D-SACK
         * or not. start_seq == snd_una is deliberately not accepted here
         * (see comments above).
         */
        if (after(start_seq, tp->snd_una))
                return true;

        /* At or below SND.UNA only a D-SACK can be of interest, and only
         * while an undo_marker is set.
         */
        marker = tp->undo_marker;
        if (!is_dsack || !marker)
                return false;

        /* Such a D-SACK must lie completely at or below snd_una */
        if (after(end_seq, tp->snd_una))
                return false;

        /* Entirely at or above undo_marker: accept */
        if (!before(start_seq, marker))
                return true;

        /* Here start_seq < undo_marker. A block ending at or before
         * undo_marker is too old; one crossing it is accepted only if
         * it is no longer than max_window (a generous overestimate).
         */
        return after(end_seq, marker) &&
               !before(start_seq, end_seq - tp->max_window);
}

/* Decide whether the first SACK block carried by @ack_skb is a D-SACK and,
 * if so, account for it.
 *
 * The first block is treated as a D-SACK candidate when it starts below
 * the ACK's cumulative ack_seq, or - when at least two blocks are present -
 * when it lies within the second block. The candidate is then handed to
 * tcp_dsack_seen(); a zero result marks it as dubious and it is ignored.
 *
 * Returns true when a D-SACK was accepted, false otherwise.
 */
static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
                            struct tcp_sack_block_wire *sp, int num_sacks,
                            u32 prior_snd_una, struct tcp_sacktag_state *state)
{
        struct tcp_sock *tp = tcp_sk(sk);
        const u32 first_start = get_unaligned_be32(&sp[0].start_seq);
        const u32 first_end = get_unaligned_be32(&sp[0].end_seq);
        u32 dup_segs;

        if (before(first_start, TCP_SKB_CB(ack_skb)->ack_seq)) {
                /* Block starts below the cumulative ACK point */
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
        } else {
                u32 second_start, second_end;

                /* Otherwise a second block is needed to compare against */
                if (num_sacks <= 1)
                        return false;

                second_end = get_unaligned_be32(&sp[1].end_seq);
                second_start = get_unaligned_be32(&sp[1].start_seq);

                /* First block must be contained in the second one */
                if (after(first_end, second_end) ||
                    before(first_start, second_start))
                        return false;

                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
        }

        dup_segs = tcp_dsack_seen(tp, first_start, first_end, state);
        if (!dup_segs) {
                /* Dubious D-SACK: count it and pretend it was not there */
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS);
                return false;
        }

        NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);

        /* D-SACK for data at or below prior_snd_una but above undo_marker:
         * no skb is left to tag, so just decrement undo_retrans (clamped
         * at zero).
         */
        if (tp->undo_marker && tp->undo_retrans > 0 &&
            !after(first_end, prior_snd_una) &&
            after(first_end, tp->undo_marker))
                tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);

        return true;
}

/* Tell whether @skb is covered by the SACK block [start_seq, end_seq).
 *
 * A multi-segment (GSO) skb that only partially overlaps the block is
 * split with tcp_fragment() at an MSS-aligned offset so that one part
 * matches the block.
 *
 * Returns 1 if the skb's start is covered by the block (for an skb that
 * was split here, this refers to the part kept in @skb), 0 if not, and a
 * negative errno when the block covers less than one MSS of the head or
 * when fragmenting fails. Callers must handle the error case.
 *
 * FIXME: this could be merged to shift decision code
 */
static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
                                  u32 start_seq, u32 end_seq)
{
        const u32 skb_seq = TCP_SKB_CB(skb)->seq;
        const u32 skb_end = TCP_SKB_CB(skb)->end_seq;
        const bool head_covered = !after(start_seq, skb_seq);
        unsigned int split_at;
        unsigned int mss;
        int err;

        /* Whole skb inside the block: nothing to split */
        if (head_covered && !before(end_seq, skb_end))
                return 1;

        /* Only multi-segment skbs that actually overlap the block are
         * candidates for splitting.
         */
        if (tcp_skb_pcount(skb) <= 1 || !after(skb_end, start_seq))
                return 0;

        mss = tcp_skb_mss(skb);

        if (head_covered) {
                /* Block covers the head: split where the block ends */
                split_at = end_seq - skb_seq;
                if (split_at < mss)
                        return -EINVAL;
        } else {
                /* Block starts inside the skb: split where it begins,
                 * but never before the first full segment.
                 */
                split_at = start_seq - skb_seq;
                if (split_at < mss)
                        split_at = mss;
        }

        /* Align the split point to an MSS boundary: round down when the
         * head is covered, round up otherwise.
         */
        if (split_at > mss) {
                unsigned int aligned = (split_at / mss) * mss;

                if (!head_covered && aligned < split_at)
                        aligned += mss;
                split_at = aligned;
        }

        /* Rounded split point is at or past the end of the skb */
        if (!head_covered && split_at >= skb->len)
                return 0;

        err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
                           split_at, mss, GFP_ATOMIC);
        if (err < 0)
                return err;

        return head_covered;
}

/* Mark the given newly-SACKed range as such, adjusting counters and hints.
 *
 * @sk:         socket owning the range
 * @state:      per-ACK SACK tagging state; reord, flag, first_sackt,
 *              last_sackt and sack_delivered may be updated
 * @sacked:     current tag bits (TCPCB_*) of the range
 * @start_seq:  first sequence number of the range
 * @end_seq:    sequence number just past the range
 * @dup_sack:   non-zero when the range is covered by a D-SACK
 * @pcount:     number of packets the range accounts for
 * @xmit_time:  transmit timestamp handed to tcp_rack_advance() and
 *              recorded in @state for never-retransmitted ranges
 *
 * Returns the updated tag bits. The function does not store them; the
 * caller decides whether to write them back.
 */
static u8 tcp_sacktag_one(struct sock *sk,
                          struct tcp_sacktag_state *state, u8 sacked,
                          u32 start_seq, u32 end_seq,
                          int dup_sack, int pcount,
                          u64 xmit_time)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Account D-SACK for retransmitted packet. */
        if (dup_sack && (sacked & TCPCB_RETRANS)) {
                /* D-SACK above undo_marker: one less retransmission to be
                 * confirmed as spurious (never going below zero).
                 */
                if (tp->undo_marker && tp->undo_retrans > 0 &&
                    after(end_seq, tp->undo_marker))
                        tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount);
                /* D-SACK for an already SACKed range: lower the reordering
                 * point to the start of this range if it is earlier.
                 */
                if ((sacked & TCPCB_SACKED_ACKED) &&
                    before(start_seq, state->reord))
                                state->reord = start_seq;
        }

        /* Nothing to do; acked frame is about to be dropped (was ACKed). */
        if (!after(end_seq, tp->snd_una))
                return sacked;

        /* Only ranges not yet tagged SACKED_ACKED are newly SACKed. */
        if (!(sacked & TCPCB_SACKED_ACKED)) {
                tcp_rack_advance(tp, sacked, end_seq, xmit_time);

                if (sacked & TCPCB_SACKED_RETRANS) {
                        /* If the segment is not tagged as lost,
                         * we do not clear RETRANS, believing
                         * that retransmission is still in flight.
                         */
                        if (sacked & TCPCB_LOST) {
                                sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
                                tp->lost_out -= pcount;
                                tp->retrans_out -= pcount;
                        }
                } else {
                        if (!(sacked & TCPCB_RETRANS)) {
                                /* New sack for not retransmitted frame,
                                 * which was in hole. It is reordering.
                                 */
                                if (before(start_seq,
                                           tcp_highest_sack_seq(tp)) &&
                                    before(start_seq, state->reord))
                                        state->reord = start_seq;

                                /* Range ends at or before high_seq */
                                if (!after(end_seq, tp->high_seq))
                                        state->flag |= FLAG_ORIG_SACK_ACKED;
                                /* first_sackt is set once (while still 0),
                                 * last_sackt is overwritten on every such
                                 * range.
                                 */
                                if (state->first_sackt == 0)
                                        state->first_sackt = xmit_time;
                                state->last_sackt = xmit_time;
                        }

                        /* SACKed, hence no longer lost */
                        if (sacked & TCPCB_LOST) {
                                sacked &= ~TCPCB_LOST;
                                tp->lost_out -= pcount;
                        }
                }

                sacked |= TCPCB_SACKED_ACKED;
                state->flag |= FLAG_DATA_SACKED;
                tp->sacked_out += pcount;
                /* Out-of-order packets delivered */
                state->sack_delivered += pcount;

                /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
                if (tp->lost_skb_hint &&
                    before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
                        tp->lost_cnt_hint += pcount;
        }

        /* D-SACK. We can detect redundant retransmission in S|R and plain R
         * frames and clear it. undo_retrans is decreased above, L|R frames
         * are accounted above as well.
         */
        if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
                sacked &= ~TCPCB_SACKED_RETRANS;
                tp->retrans_out -= pcount;
        }

        return sacked;
}

/* Shift newly-SACKed bytes from this skb to the immediately previous
 * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
 *
 * This does the bookkeeping only: sequence numbers, packet counts, tag
 * bits and hints are adjusted here, the caller has already moved the
 * payload (see the tcp_skb_shift() calls in tcp_shift_skb_data()).
 *
 * @prev:     skb receiving the data; its end_seq and pcount grow
 * @skb:      skb giving up its first @shifted bytes
 * @state:    per-ACK SACK tagging state, passed on to tcp_sacktag_one()
 * @pcount:   number of packets moved (must be non-zero)
 * @shifted:  number of sequence-space bytes moved
 * @mss:      value used for prev's tcp_gso_size if that is still zero
 * @dup_sack: whether the range is covered by a D-SACK
 *
 * Returns true when @skb was emptied, unlinked from the rtx queue and
 * freed; false when @skb still holds data.
 */
static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
                            struct sk_buff *skb,
                            struct tcp_sacktag_state *state,
                            unsigned int pcount, int shifted, int mss,
                            bool dup_sack)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 start_seq = TCP_SKB_CB(skb)->seq;        /* start of newly-SACKed */
        u32 end_seq = start_seq + shifted;        /* end of newly-SACKed */

        BUG_ON(!pcount);

        /* Adjust counters and hints for the newly sacked sequence
         * range but discard the return value since prev is already
         * marked. We must tag the range first because the seq
         * advancement below implicitly advances
         * tcp_highest_sack_seq() when skb is highest_sack.
         */
        tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
                        start_seq, end_seq, dup_sack, pcount,
                        tcp_skb_timestamp_us(skb));
        tcp_rate_skb_delivered(sk, skb, state->rate);

        /* The packets moved out of the lost hint skb are added to
         * lost_cnt_hint.
         */
        if (skb == tp->lost_skb_hint)
                tp->lost_cnt_hint += pcount;

        /* Move the boundary between prev and skb forward by @shifted */
        TCP_SKB_CB(prev)->end_seq += shifted;
        TCP_SKB_CB(skb)->seq += shifted;

        tcp_skb_pcount_add(prev, pcount);
        WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
        tcp_skb_pcount_add(skb, -pcount);

        /* When we're adding to gso_segs == 1, gso_size will be zero,
         * in theory this shouldn't be necessary but as long as DSACK
         * code can come after this skb later on it's better to keep
         * setting gso_size to something.
         */
        if (!TCP_SKB_CB(prev)->tcp_gso_size)
                TCP_SKB_CB(prev)->tcp_gso_size = mss;

        /* CHECKME: To clear or not to clear? Mimics normal skb currently */
        if (tcp_skb_pcount(skb) <= 1)
                TCP_SKB_CB(skb)->tcp_gso_size = 0;

        /* Difference in this won't matter, both ACKed by the same cumul. ACK */
        TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);

        /* Partial shift: skb stays in the queue with its remaining data */
        if (skb->len > 0) {
                BUG_ON(!tcp_skb_pcount(skb));
                NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
                return false;
        }

        /* Whole SKB was eaten :-) */

        /* skb is about to be freed: repoint any hint that references it
         * to prev. NOTE(review): lost_cnt_hint is reduced by prev's packet
         * count here, presumably because the hint count is expected to
         * exclude the hinted skb itself - confirm against the users of
         * lost_cnt_hint.
         */
        if (skb == tp->retransmit_skb_hint)
                tp->retransmit_skb_hint = prev;
        if (skb == tp->lost_skb_hint) {
                tp->lost_skb_hint = prev;
                tp->lost_cnt_hint -= tcp_skb_pcount(prev);
        }

        /* prev inherits skb's TCP flags and EOR marker; a FIN carried by
         * skb extends prev's end_seq by one.
         */
        TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
        TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                TCP_SKB_CB(prev)->end_seq++;

        /* Do not leave highest_sack pointing at the skb being freed */
        if (skb == tcp_highest_sack(sk))
                tcp_advance_highest_sack(sk, skb);

        tcp_skb_collapse_tstamp(prev, skb);
        /* Reset prev's delivered_mstamp if it was set */
        if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
                TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;

        tcp_rtx_queue_unlink_and_free(skb, sk);

        NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);

        return true;
}

/* Per-segment length of @skb: the skb's own length when it holds exactly
 * one packet, otherwise its MSS. The special case exists because gso_size
 * is initialized as something-or-zero, which complicates things.
 */
static int tcp_skb_seglen(const struct sk_buff *skb)
{
        if (tcp_skb_pcount(skb) != 1)
                return tcp_skb_mss(skb);

        return skb->len;
}

/* Returns 1 when @skb may take part in shifting, i.e. it has no data in
 * the head area and is nonlinear; 0 otherwise. Shifting pages past the
 * head area doesn't work.
 */
static int skb_can_shift(const struct sk_buff *skb)
{
        if (skb_headlen(skb))
                return 0;

        if (!skb_is_nonlinear(skb))
                return 0;

        return 1;
}

/* Move @shiftlen bytes (@pcount packets) from @from to @to via skb_shift(),
 * unless the result would no longer be representable.
 *
 * Returns 0 when the shift is refused, otherwise whatever skb_shift()
 * returns.
 */
int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from,
                  int pcount, int shiftlen)
{
        /* TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits wide and the minimum
         * TCP gso_size is TCP_MIN_GSO_SIZE (8 bytes), so an skb must hold
         * fewer than 65535 * 8 bytes and no more than 65535 segments,
         * even if the current MSS is bigger.
         */
        if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE) ||
            unlikely(tcp_skb_pcount(to) + pcount > 65535))
                return 0;

        return skb_shift(to, from, shiftlen);
}

/* Try collapsing SACK blocks spanning across multiple skbs to a single
 * skb.
 *
 * The part of @skb covered by the SACK block [start_seq, end_seq) is
 * shifted into the previous skb in the rtx queue, provided that skb is
 * already tagged as plain SACKED_ACKED. If @skb is consumed completely, a
 * merge of the following skb into the same previous skb is attempted too.
 *
 * Return value (three-way, see the labels at the end):
 *   prev - data was shifted; @skb may have been freed
 *   skb  - nothing to do for this skb (no overlap, or less than one MSS)
 *   NULL - shifting is not possible here; LINUX_MIB_SACKSHIFTFALLBACK is
 *          incremented and the caller has to process @skb another way
 */
static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
                                          struct tcp_sacktag_state *state,
                                          u32 start_seq, u32 end_seq,
                                          bool dup_sack)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *prev;
        int mss;
        int pcount = 0;
        int len;
        int in_sack;

        /* Normally R but no L won't result in plain S */
        if (!dup_sack &&
            (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
                goto fallback;
        if (!skb_can_shift(skb))
                goto fallback;
        /* This frame is about to be dropped (was ACKed). */
        if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
                goto fallback;

        /* Can only happen with delayed DSACK + discard craziness */
        prev = skb_rb_prev(skb);
        if (!prev)
                goto fallback;

        /* The receiving skb must carry exactly the SACKED_ACKED tag */
        if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
                goto fallback;

        if (!tcp_skb_can_collapse(prev, skb))
                goto fallback;

        in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
                  !before(end_seq, TCP_SKB_CB(skb)->end_seq);

        if (in_sack) {
                /* Whole skb is covered: shift all of it */
                len = skb->len;
                pcount = tcp_skb_pcount(skb);
                mss = tcp_skb_seglen(skb);

                /* TODO: Fix DSACKs to not fragment already SACKed and we can
                 * drop this restriction as unnecessary
                 */
                if (mss != tcp_skb_seglen(prev))
                        goto fallback;
        } else {
                /* skb ends at or before the start of the block: no overlap */
                if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
                        goto noop;
                /* CHECKME: This is non-MSS split case only?, this will
                 * cause skipped skbs due to advancing loop btw, original
                 * has that feature too
                 */
                if (tcp_skb_pcount(skb) <= 1)
                        goto noop;

                /* From here on in_sack means "the head of skb is covered" */
                in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
                if (!in_sack) {
                        /* TODO: head merge to next could be attempted here
                         * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
                         * though it might not be worth of the additional hassle
                         *
                         * ...we can probably just fallback to what was done
                         * previously. We could try merging non-SACKed ones
                         * as well but it probably isn't going to buy off
                         * because later SACKs might again split them, and
                         * it would make skb timestamp tracking considerably
                         * harder problem.
                         */
                        goto fallback;
                }

                /* Number of bytes at the head of skb covered by the block */
                len = end_seq - TCP_SKB_CB(skb)->seq;
                BUG_ON(len < 0);
                BUG_ON(len > skb->len);

                /* MSS boundaries should be honoured or else pcount will
                 * severely break even though it makes things bit trickier.
                 * Optimize common case to avoid most of the divides
                 */
                mss = tcp_skb_mss(skb);

                /* TODO: Fix DSACKs to not fragment already SACKed and we can
                 * drop this restriction as unnecessary
                 */
                if (mss != tcp_skb_seglen(prev))
                        goto fallback;

                /* Shift whole segments only: round len down to a multiple
                 * of mss, and do nothing if less than one segment is covered.
                 */
                if (len == mss) {
                        pcount = 1;
                } else if (len < mss) {
                        goto noop;
                } else {
                        pcount = len / mss;
                        len = pcount * mss;
                }
        }

        /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
        if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
                goto fallback;

        if (!tcp_skb_shift(prev, skb, pcount, len))
                goto fallback;
        /* false: skb still holds data, so there is nothing more to merge */
        if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
                goto out;

        /* Hole filled allows collapsing with the next as well, this is very
         * useful when hole on every nth skb pattern happens
         */
        skb = skb_rb_next(prev);
        if (!skb)
                goto out;

        /* The next skb must be shiftable, plain SACKED_ACKED and use the
         * same segment length.
         */
        if (!skb_can_shift(skb) ||
            ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
            (mss != tcp_skb_seglen(skb)))
                goto out;

        if (!tcp_skb_can_collapse(prev, skb))
                goto out;
        len = skb->len;
        pcount = tcp_skb_pcount(skb);
        if (tcp_skb_shift(prev, skb, pcount, len))
                tcp_shifted_skb(sk, prev, skb, state, pcount,
                                len, mss, 0);

out:
        return prev;

noop:
        return skb;

fallback:
        NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
        return NULL;
}

/* Walk the retransmit queue from @skb and tag every skb covered by the
 * SACK block [start_seq, end_seq).
 *
 * @next_dup:    optional D-SACK block; skbs matching it are tagged with
 *               dup_sack set. May be NULL.
 * @state:       per-ACK SACK tagging state
 * @dup_sack_in: initial dup_sack value for every skb, i.e. whether the
 *               block being walked is itself a D-SACK
 *
 * Returns the skb at which the walk stopped: the first one starting at or
 * after @end_seq, the one on which matching failed, or whatever the loop
 * macro leaves in skb when the queue is exhausted.
 */
static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
                                        struct tcp_sack_block *next_dup,
                                        struct tcp_sacktag_state *state,
                                        u32 start_seq, u32 end_seq,
                                        bool dup_sack_in)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *tmp;

        skb_rbtree_walk_from(skb) {
                int in_sack = 0;
                bool dup_sack = dup_sack_in;

                /* queue is in-order => we can short-circuit the walk early */
                if (!before(TCP_SKB_CB(skb)->seq, end_seq))
                        break;

                /* Check the pending D-SACK block first; a match makes
                 * this skb a dup_sack one.
                 */
                if (next_dup  &&
                    before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
                        in_sack = tcp_match_skb_to_sack(sk, skb,
                                                        next_dup->start_seq,
                                                        next_dup->end_seq);
                        if (in_sack > 0)
                                dup_sack = true;
                }

                /* skb reference here is a bit tricky to get right, since
                 * shifting can eat and free both this skb and the next,
                 * so not even _safe variant of the loop is enough.
                 */
                if (in_sack <= 0) {
                        tmp = tcp_shift_skb_data(sk, skb, state,
                                                 start_seq, end_seq, dup_sack);
                        if (tmp) {
                                /* Data was shifted into another skb:
                                 * carry on from that one, skb itself may
                                 * be gone.
                                 */
                                if (tmp != skb) {
                                        skb = tmp;
                                        continue;
                                }

                                /* Nothing to do for this skb */
                                in_sack = 0;
                        } else {
                                /* Shifting not possible: match (and split
                                 * if needed) the regular way.
                                 */
                                in_sack = tcp_match_skb_to_sack(sk, skb,
                                                                start_seq,
                                                                end_seq);
                        }
                }

                /* Negative value: matching failed, give up on this walk */
                if (unlikely(in_sack < 0))
                        break;

                if (in_sack) {
                        TCP_SKB_CB(skb)->sacked =
                                tcp_sacktag_one(sk,
                                                state,
                                                TCP_SKB_CB(skb)->sacked,
                                                TCP_SKB_CB(skb)->seq,
                                                TCP_SKB_CB(skb)->end_seq,
                                                dup_sack,
                                                tcp_skb_pcount(skb),
                                                tcp_skb_timestamp_us(skb));
                        tcp_rate_skb_delivered(sk, skb, state->rate);
                        /* SACKed skbs are taken off the tcp_tsorted_anchor
                         * list.
                         */
                        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
                                list_del_init(&skb->tcp_tsorted_anchor);

                        /* skb starts at or beyond the highest SACKed
                         * sequence so far: move highest_sack forward.
                         */
                        if (!before(TCP_SKB_CB(skb)->seq,
                                    tcp_highest_sack_seq(tp)))
                                tcp_advance_highest_sack(sk, skb);
                }
        }
        return skb;
}

/* Look up, in the socket's retransmit rb-tree, the skb whose range
 * [seq, end_seq) contains @seq. Returns NULL when no skb covers it.
 */
static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq)
{
        struct rb_node *node = sk->tcp_rtx_queue.rb_node;

        while (node) {
                struct sk_buff *skb = rb_to_skb(node);

                if (before(seq, TCP_SKB_CB(skb)->seq))
                        node = node->rb_left;
                else if (before(seq, TCP_SKB_CB(skb)->end_seq))
                        return skb;
                else
                        node = node->rb_right;
        }

        return NULL;
}

/* Position the walk at @skip_to_seq. If @skb is set and already starts
 * after @skip_to_seq it is returned as is; otherwise the skb containing
 * @skip_to_seq is looked up in the rtx queue (NULL if there is none).
 */
static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
                                        u32 skip_to_seq)
{
        if (!skb || !after(TCP_SKB_CB(skb)->seq, skip_to_seq))
                return tcp_sacktag_bsearch(sk, skip_to_seq);

        return skb;
}

/* If a pending D-SACK block @next_dup starts before @skip_to_seq, process
 * it now: skip to its start and walk it as a D-SACK. Without a pending
 * D-SACK, or when it starts at or after @skip_to_seq, @skb is returned
 * unchanged.
 *
 * Returns the skb at which processing should continue.
 */
static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
                                                struct sock *sk,
                                                struct tcp_sack_block *next_dup,
                                                struct tcp_sacktag_state *state,
                                                u32 skip_to_seq)
{
        if (!next_dup || !before(next_dup->start_seq, skip_to_seq))
                return skb;

        skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);

        return tcp_sacktag_walk(skb, sk, NULL, state,
                                next_dup->start_seq, next_dup->end_seq,
                                true);
}

/* Returns non-zero while @cache still points below the end of
 * tp->recv_sack_cache.
 */
static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
{
        const struct tcp_sack_block *cache_end =
                tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);

        return cache < cache_end;
}

/* Process the SACK option carried by @ack_skb and tag the retransmit
 * queue accordingly.
 *
 * @sk:            socket owning the retransmit queue
 * @ack_skb:       incoming ACK; TCP_SKB_CB(ack_skb)->sacked is used as the
 *                 offset of the SACK option from the transport header
 * @prior_snd_una: snd_una value supplied by the caller; SACK blocks that
 *                 end at or before it are ignored
 * @state:         walk state; ->flag and ->reord are reset here and
 *                 ->mss_now is filled in before the queue is walked
 *
 * The wire blocks are validated and copied into a local array, sorted by
 * start_seq, walked against the retransmit queue (using
 * tp->recv_sack_cache to skip ranges covered by the cached blocks) and
 * finally stored back into tp->recv_sack_cache.
 *
 * Returns state->flag, or 0 when the ACK is older than
 * prior_snd_una - tp->max_window.
 */
static int
tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
                        u32 prior_snd_una, struct tcp_sacktag_state *state)
{
        struct tcp_sock *tp = tcp_sk(sk);
        const unsigned char *ptr = (skb_transport_header(ack_skb) +
                                    TCP_SKB_CB(ack_skb)->sacked);
        struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
        struct tcp_sack_block sp[TCP_NUM_SACKS];
        struct tcp_sack_block *cache;
        struct sk_buff *skb;
        /* Block count derived from the byte at ptr[1], capped at TCP_NUM_SACKS */
        int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
        int used_sacks;
        bool found_dup_sack = false;
        int i, j;
        int first_sack_index;

        state->flag = 0;
        state->reord = tp->snd_nxt;

        if (!tp->sacked_out)
                tcp_highest_sack_reset(sk);

        found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
                                         num_sacks, prior_snd_una, state);

        /* Eliminate too old ACKs, but take into
         * account more or less fresh ones, they can
         * contain valid SACK info.
         */
        if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
                return 0;

        if (!tp->packets_out)
                goto out;

        /* Copy the wire blocks into sp[], keeping only valid and fresh ones.
         * first_sack_index tracks where wire block 0 ends up, or -1 if it
         * was dropped.
         */
        used_sacks = 0;
        first_sack_index = 0;
        for (i = 0; i < num_sacks; i++) {
                bool dup_sack = !i && found_dup_sack;

                sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
                sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);

                if (!tcp_is_sackblock_valid(tp, dup_sack,
                                            sp[used_sacks].start_seq,
                                            sp[used_sacks].end_seq)) {
                        int mib_idx;

                        if (dup_sack) {
                                if (!tp->undo_marker)
                                        mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
                                else
                                        mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
                        } else {
                                /* Don't count olds caused by ACK reordering */
                                if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
                                    !after(sp[used_sacks].end_seq, tp->snd_una))
                                        continue;
                                mib_idx = LINUX_MIB_TCPSACKDISCARD;
                        }

                        NET_INC_STATS(sock_net(sk), mib_idx);
                        if (i == 0)
                                first_sack_index = -1;
                        continue;
                }

                /* Ignore very old stuff early */
                if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
                        if (i == 0)
                                first_sack_index = -1;
                        continue;
                }

                used_sacks++;
        }

        /* order SACK blocks to allow in order walk of the retrans queue */
        for (i = used_sacks - 1; i > 0; i--) {
                for (j = 0; j < i; j++) {
                        if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
                                swap(sp[j], sp[j + 1]);

                                /* Track where the first SACK block goes to */
                                if (j == first_sack_index)
                                        first_sack_index = j + 1;
                        }
                }
        }

        state->mss_now = tcp_current_mss(sk);
        skb = NULL;
        i = 0;

        if (!tp->sacked_out) {
                /* It's already past, so skip checking against it */
                cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
        } else {
                cache = tp->recv_sack_cache;
                /* Skip empty blocks at the head of the cache */
                while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
                       !cache->end_seq)
                        cache++;
        }

        /* Walk each kept block; note that the cache-overlap path below uses
         * "continue" without advancing i, so the same block is re-examined
         * against the next cache entry.
         */
        while (i < used_sacks) {
                u32 start_seq = sp[i].start_seq;
                u32 end_seq = sp[i].end_seq;
                bool dup_sack = (found_dup_sack && (i == first_sack_index));
                struct tcp_sack_block *next_dup = NULL;

                if (found_dup_sack && ((i + 1) == first_sack_index))
                        next_dup = &sp[i + 1];

                /* Skip too early cached blocks */
                while (tcp_sack_cache_ok(tp, cache) &&
                       !before(start_seq, cache->end_seq))
                        cache++;

                /* Can skip some work by looking recv_sack_cache? */
                if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
                    after(end_seq, cache->start_seq)) {

                        /* Head todo? */
                        if (before(start_seq, cache->start_seq)) {
                                skb = tcp_sacktag_skip(skb, sk, start_seq);
                                skb = tcp_sacktag_walk(skb, sk, next_dup,
                                                       state,
                                                       start_seq,
                                                       cache->start_seq,
                                                       dup_sack);
                        }

                        /* Rest of the block already fully processed? */
                        if (!after(end_seq, cache->end_seq))
                                goto advance_sp;

                        skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
                                                       state,
                                                       cache->end_seq);

                        /* ...tail remains todo... */
                        if (tcp_highest_sack_seq(tp) == cache->end_seq) {
                                /* ...but better entrypoint exists! */
                                skb = tcp_highest_sack(sk);
                                if (!skb)
                                        break;
                                cache++;
                                goto walk;
                        }

                        skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
                        /* Check overlap against next cached too (past this one already) */
                        cache++;
                        continue;
                }

                /* Block starts at or beyond the highest SACKed sequence:
                 * start from the highest-sack skb instead of searching.
                 */
                if (!before(start_seq, tcp_highest_sack_seq(tp))) {
                        skb = tcp_highest_sack(sk);
                        if (!skb)
                                break;
                }
                skb = tcp_sacktag_skip(skb, sk, start_seq);

walk:
                skb = tcp_sacktag_walk(skb, sk, next_dup, state,
                                       start_seq, end_seq, dup_sack);

advance_sp:
                i++;
        }

        /* Clear the head of the cache sack blocks so we can skip it next time */
        for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
                tp->recv_sack_cache[i].start_seq = 0;
                tp->recv_sack_cache[i].end_seq = 0;
        }
        /* Store the blocks used this time in the remaining tail slots */
        for (j = 0; j < used_sacks; j++)
                tp->recv_sack_cache[i++] = sp[j];

        if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker)
                tcp_check_sack_reordering(sk, state->reord, 0);

        tcp_verify_left_out(tp);
out:

#if FASTRETRANS_DEBUG > 0
        WARN_ON((int)tp->sacked_out < 0);
        WARN_ON((int)tp->lost_out < 0);
        WARN_ON((int)tp->retrans_out < 0);
        WARN_ON((int)tcp_packets_in_flight(tp) < 0);
#endif
        return state->flag;
}

/* Limits sacked_out so that its sum with the assumed number of holes is
 * never larger than packets_out.
 * Returns false if no sacked_out adjustment was necessary.
 */
static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
{
        /* Assume at least one hole, but never more than packets_out. */
        u32 holes = max(tp->lost_out, 1U);

        if (holes > tp->packets_out)
                holes = tp->packets_out;

        if ((tp->sacked_out + holes) <= tp->packets_out)
                return false;

        tp->sacked_out = tp->packets_out - holes;
        return true;
}

/* If we receive more dupacks than we expected when counting segments
 * under the assumption of no reordering, interpret this as reordering.
 * The only other reason could be a bug in the receiver's TCP.
 *
 * @addend is added to packets_out when computing the new reordering
 * estimate, which is capped by the tcp_max_reordering sysctl.
 */
static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Only act when sacked_out actually had to be clamped. */
        if (tcp_limit_reno_sacked(tp)) {
                tp->reordering = min_t(u32, tp->packets_out + addend,
                                       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
                tp->reord_seen++;
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
        }
}

/* Emulate SACKs for SACKless connection: account for @num_dupack new
 * dupacks. Whatever part of them survives the reordering clamp is
 * reported as delivered.
 */
static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
{
        struct tcp_sock *tp;
        u32 sacked_before;
        s32 newly_delivered;

        if (!num_dupack)
                return;

        tp = tcp_sk(sk);
        sacked_before = tp->sacked_out;

        tp->sacked_out += num_dupack;
        /* May clamp sacked_out back down. */
        tcp_check_reno_reordering(sk, 0);

        newly_delivered = tp->sacked_out - sacked_before;
        if (newly_delivered > 0)
                tcp_count_delivered(tp, newly_delivered, ece_ack);

        tcp_verify_left_out(tp);
}

/* Account for an ACK that acks @acked segments in the Reno Recovery phase:
 * report the delivery and shrink the emulated sacked_out accordingly.
 */
static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (acked > 0) {
                /* One ACK acked hole. The rest eat duplicate ACKs. */
                int delivered = max_t(int, acked - tp->sacked_out, 1);

                tcp_count_delivered(tp, delivered, ece_ack);

                if (acked - 1 < tp->sacked_out)
                        tp->sacked_out -= acked - 1;
                else
                        tp->sacked_out = 0;
        }

        tcp_check_reno_reordering(sk, acked);
        tcp_verify_left_out(tp);
}

/* Forget all emulated (Reno) SACK accounting by zeroing sacked_out. */
static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
{
        tp->sacked_out = 0;
}

/* Reset the retransmission bookkeeping of @tp: scoreboard counters,
 * undo state and RTO statistics.
 */
void tcp_clear_retrans(struct tcp_sock *tp)
{
        /* Scoreboard counters */
        tp->sacked_out = 0;
        tp->lost_out = 0;
        tp->retrans_out = 0;

        /* Undo state */
        tp->undo_marker = 0;
        tp->undo_retrans = -1;

        /* RTO statistics */
        tp->rto_stamp = 0;
        tp->total_rto = 0;
        tp->total_rto_recoveries = 0;
        tp->total_rto_time = 0;
}

/* Arm the undo machinery: remember snd_una as the undo marker and seed
 * undo_retrans with the number of retransmissions currently out, or -1
 * when there are none.
 */
static inline void tcp_init_undo(struct tcp_sock *tp)
{
        tp->undo_marker = tp->snd_una;

        /* Retransmission still in flight may cause DSACKs later. */
        if (tp->retrans_out)
                tp->undo_retrans = tp->retrans_out;
        else
                tp->undo_retrans = -1;
}

/* True when the TCP_RACK_LOSS_DETECTION bit is set in the per-netns
 * tcp_recovery sysctl of @sk's network namespace.
 */
static bool tcp_is_rack(const struct sock *sk)
{
        return !!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
                  TCP_RACK_LOSS_DETECTION);
}

/* If we detect SACK reneging, forget all SACK information
 * and reset tags completely, otherwise preserve SACKs. If receiver
 * dropped its ofo queue, we will know this due to reneging detection.
 *
 * Reneging is detected by the head of the retransmit queue carrying
 * TCPCB_SACKED_ACKED. Every skb in the queue is then marked lost, except
 * that with RACK enabled (and no reneging) non-head skbs that
 * tcp_rack_skb_timeout() still reports a positive value for are spared.
 */
static void tcp_timeout_mark_lost(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *head = tcp_rtx_queue_head(sk);
        struct sk_buff *skb;
        bool is_reneg;                        /* is receiver reneging on SACKs? */

        is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);

        if (is_reneg) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
                tp->sacked_out = 0;
                /* Mark SACK reneging until we recover from this loss event. */
                tp->is_sack_reneg = 1;
        } else if (tcp_is_reno(tp)) {
                tcp_reset_reno_sack(tp);
        }

        skb = head;
        skb_rbtree_walk_from(skb) {
                if (!is_reneg) {
                        bool recently_sent = tcp_is_rack(sk) && skb != head &&
                                             tcp_rack_skb_timeout(tp, skb, 0) > 0;

                        /* Don't mark recently sent ones lost yet */
                        if (recently_sent)
                                continue;
                } else {
                        TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
                }
                tcp_mark_skb_lost(sk, skb);
        }

        tcp_verify_left_out(tp);
        tcp_clear_all_retrans_hints(tp);
}

/* Enter Loss state.
 *
 * Marks the retransmit queue lost via tcp_timeout_mark_lost(), saves the
 * current ssthresh/cwnd and arms undo when ssthresh has not yet been
 * reduced in this window, collapses cwnd to packets-in-flight + 1,
 * switches the socket to TCP_CA_Loss and decides whether F-RTO is used.
 */
void tcp_enter_loss(struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        /* Sampled before the state is changed to TCP_CA_Loss below */
        bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
        u8 reordering;

        tcp_timeout_mark_lost(sk);

        /* Reduce ssthresh if it has not yet been made inside this window. */
        if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
            !after(tp->high_seq, tp->snd_una) ||
            (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
                /* Remember the pre-loss values so the reduction can be undone */
                tp->prior_ssthresh = tcp_current_ssthresh(sk);
                tp->prior_cwnd = tcp_snd_cwnd(tp);
                tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
                tcp_ca_event(sk, CA_EVENT_LOSS);
                tcp_init_undo(tp);
        }
        tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + 1);
        tp->snd_cwnd_cnt   = 0;
        tp->snd_cwnd_stamp = tcp_jiffies32;

        /* Timeout in disordered state after receiving substantial DUPACKs
         * suggests that the degree of reordering is over-estimated.
         */
        reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering);
        if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
            tp->sacked_out >= reordering)
                tp->reordering = min_t(unsigned int, tp->reordering,
                                       reordering);

        tcp_set_ca_state(sk, TCP_CA_Loss);
        tp->high_seq = tp->snd_nxt;
        tcp_ecn_queue_cwr(tp);

        /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
         * loss recovery is underway except recurring timeout(s) on
         * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
         */
        tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) &&
                   (new_recovery || icsk->icsk_retransmits) &&
                   !inet_csk(sk)->icsk_mtup.probe_size;
}

/* If an ACK arrives pointing to a remembered SACK, it means that our
 * remembered SACKs do not reflect the real state of the receiver, i.e.
 * the receiver _host_ is heavily congested (or buggy).
 *
 * To avoid big spurious retransmission bursts due to transient SACK
 * scoreboard oddities that look like reneging, we give the receiver a
 * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
 * restore sanity to the SACK scoreboard. If the apparent reneging
 * persists until this RTO then we'll clear the SACK scoreboard.
 *
 * Returns true (after re-arming the retransmit timer with that delay and
 * clearing FLAG_SET_XMIT_TIMER in *@ack_flag) when both FLAG_SACK_RENEGING
 * and FLAG_SND_UNA_ADVANCED are set; false otherwise.
 */
static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag)
{
        struct tcp_sock *tp;
        unsigned long delay;

        if (!(*ack_flag & FLAG_SACK_RENEGING) ||
            !(*ack_flag & FLAG_SND_UNA_ADVANCED))
                return false;

        tp = tcp_sk(sk);
        delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
                    msecs_to_jiffies(10));

        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                  delay, TCP_RTO_MAX);
        *ack_flag &= ~FLAG_SET_XMIT_TIMER;
        return true;
}

/* Heuristics to calculate the number of duplicate ACKs. There's no dupACKs
 * counter when SACK is enabled (without SACK, sacked_out is used for
 * that purpose).
 *
 * With reordering, holes may still be in flight, so RFC3517 recovery
 * uses pure sacked_out (total number of SACKed segments) even though
 * it violates the RFC that uses duplicate ACKs. Often these are equal,
 * but when e.g. out-of-window ACKs or packet duplication occurs,
 * they differ. Since neither occurs due to loss, TCP should really
 * ignore them.
 *
 * Returns sacked_out + 1.
 */
static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
{
        return tp->sacked_out + 1;
}

/* Linux NewReno/SACK/ECN state machine.
 * --------------------------------------
 *
 * "Open"        Normal state, no dubious events, fast path.
 * "Disorder"   In all the respects it is "Open",
 *                but requires a bit more attention. It is entered when
 *                we see some SACKs or dupacks. It is split off from "Open"
 *                mainly to move some processing from fast path to slow one.
 * "CWR"        CWND was reduced due to some Congestion Notification event.
 *                It can be ECN, ICMP source quench, local device congestion.
 * "Recovery"        CWND was reduced, we are fast-retransmitting.
 * "Loss"        CWND was reduced due to RTO timeout or SACK reneging.
 *
 * tcp_fastretrans_alert() is entered:
 * - each incoming ACK, if state is not "Open"
 * - when arrived ACK is unusual, namely:
 *        * SACK
 *        * Duplicate ACK.
 *        * ECN ECE.
 *
 * Counting packets in flight is pretty simple.
 *
 *        in_flight = packets_out - left_out + retrans_out
 *
 *        packets_out is SND.NXT-SND.UNA counted in packets.
 *
 *        retrans_out is number of retransmitted segments.
 *
 *        left_out is number of segments left network, but not ACKed yet.
 *
 *                left_out = sacked_out + lost_out
 *
 *     sacked_out: Packets, which arrived to receiver out of order
 *                   and hence not ACKed. With SACKs this number is simply
 *                   amount of SACKed data. Even without SACKs
 *                   it is easy to give pretty reliable estimate of this number,
 *                   counting duplicate ACKs.
 *
 *       lost_out: Packets lost by network. TCP has no explicit
 *                   "loss notification" feedback from network (for now).
 *                   It means that this number can be only _guessed_.
 *                   Actually, it is the heuristics to predict lossage that
 *                   distinguishes different algorithms.
 *
 *        F.e. after RTO, when all the queue is considered as lost,
 *        lost_out = packets_out and in_flight = retrans_out.
 *
 *                Essentially, we have now a few algorithms detecting
 *                lost packets.
 *
 *                If the receiver supports SACK:
 *
 *                RFC6675/3517: It is the conventional algorithm. A packet is
 *                considered lost if the number of higher sequence packets
 *                SACKed is greater than or equal to the DUPACK threshold
 *                (reordering). This is implemented in tcp_mark_head_lost and
 *                tcp_update_scoreboard.
 *
 *                RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
 *                (2017-) that checks timing instead of counting DUPACKs.
 *                Essentially a packet is considered lost if it's not S/ACKed
 *                after RTT + reordering_window, where both metrics are
 *                dynamically measured and adjusted. This is implemented in
 *                tcp_rack_mark_lost.
 *
 *                If the receiver does not support SACK:
 *
 *                NewReno (RFC6582): in Recovery we assume that one segment
 *                is lost (classic Reno). While we are in Recovery and
 *                a partial ACK arrives, we assume that one more packet
 *                is lost (NewReno). These heuristics are the same in NewReno
 *                and SACK.
 *
 * Really tricky (and requiring careful tuning) part of algorithm
 * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
 * The first determines the moment _when_ we should reduce CWND and,
 * hence, slow down forward transmission. In fact, it determines the moment
 * when we decide that hole is caused by loss, rather than by a reorder.
 *
 * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
 * holes, caused by lost packets.
 *
 * And the most logically complicated part of algorithm is undo
 * heuristics. We detect false retransmits due to both too early
 * fast retransmit (reordering) and underestimated RTO, analyzing
 * timestamps and D-SACKs. When we detect that some segments were
 * retransmitted by mistake and CWND reduction was wrong, we undo
 * window reduction and abort recovery phase. This logic is hidden
 * inside several functions named tcp_try_undo_<something>.
 */

/* This function decides when we should leave the Disorder state
 * and enter the Recovery phase, reducing the congestion window.
 *
 * Main question: may we further continue forward transmission
 * with the same cwnd?
 *
 * Returns true when loss is already proven (lost_out != 0) or, with RACK
 * disabled, when the dupack heuristic exceeds tp->reordering.
 * @flag is not used here.
 */
static bool tcp_time_to_recover(struct sock *sk, int flag)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Trick#1: The loss is proven.
         * Not-A-Trick#2 : Classic rule...
         */
        return tp->lost_out ||
               (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering);
}

/* Detect loss in event "A" above by marking head of queue up as lost.
 * For RFC3517 SACK, a segment is considered lost if it
 * has at least tp->reordering SACKed segments above it; "packets" refers to
 * the maximum SACKed segments to pass before reaching this limit.
 *
 * @mark_head: when non-zero only the head is handled: the function
 *             returns early if the cached hint already lies after snd_una,
 *             and the walk stops after the first skb it processes.
 *
 * The walk resumes from tp->lost_skb_hint / tp->lost_cnt_hint when a hint
 * is cached, and refreshes both on every iteration.
 */
static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        int cnt;
        /* Use SACK to deduce losses of new sequences sent during recovery */
        const u32 loss_high = tp->snd_nxt;

        WARN_ON(packets > tp->packets_out);
        skb = tp->lost_skb_hint;
        if (skb) {
                /* Head already handled? */
                if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
                        return;
                cnt = tp->lost_cnt_hint;
        } else {
                skb = tcp_rtx_queue_head(sk);
                cnt = 0;
        }

        skb_rbtree_walk_from(skb) {
                /* TODO: do this better */
                /* this is not the most efficient way to do this... */
                tp->lost_skb_hint = skb;
                tp->lost_cnt_hint = cnt;

                /* Never look beyond what had been sent when we were called */
                if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
                        break;

                /* Count SACKed segments passed so far */
                if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
                        cnt += tcp_skb_pcount(skb);

                if (cnt > packets)
                        break;

                if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
                        tcp_mark_skb_lost(sk, skb);

                if (mark_head)
                        break;
        }
        tcp_verify_left_out(tp);
}

/* Account newly detected lost packet(s).
 *
 * Only acts on SACK connections: when sacked_out has reached
 * tp->reordering, the head of the queue is marked lost up to the excess;
 * otherwise, on a fast retransmit, only the head segment is marked.
 */
static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int sacked_upto;

        if (!tcp_is_sack(tp))
                return;

        sacked_upto = tp->sacked_out - tp->reordering;
        if (sacked_upto >= 0) {
                tcp_mark_head_lost(sk, sacked_upto, 0);
                return;
        }

        if (fast_rexmit)
                tcp_mark_head_lost(sk, 1, 1);
}

/* True when a timestamp option was seen, its echo reply (rcv_tsecr) is
 * non-zero, and that echo is before @when.
 */
static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
{
        if (!tp->rx_opt.saw_tstamp || !tp->rx_opt.rcv_tsecr)
                return false;

        return before(tp->rx_opt.rcv_tsecr, when);
}

/* skb is spurious retransmitted if the returned timestamp echo
 * reply is prior to the skb transmission time
 */
static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
                                     const struct sk_buff *skb)
{
        return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
               tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
}

/* Nothing was retransmitted or returned timestamp is less
 * than timestamp of the first retransmission.
 */
static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
{
        return tp->retrans_stamp &&
               tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
}

/* Undo procedures. */

/* We can clear retrans_stamp when there are no retransmissions in the
 * window. It would seem that it is trivially available for us in
 * tp->retrans_out; however, that kind of assumption doesn't consider
 * what will happen if errors occur when sending a retransmission for the
 * second time. ...It could be that such a segment has only
 * TCPCB_EVER_RETRANS set at the present time. It seems that checking
 * the head skb is enough except for some reneging corner cases that
 * are not worth the effort.
 *
 * Main reason for all this complexity is the fact that connection dying
 * time now depends on the validity of the retrans_stamp, in particular,
 * that successive retransmissions of a segment must not advance
 * retrans_stamp under any conditions.
 */
static bool tcp_any_retrans_done(const struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *head;

        if (tp->retrans_out)
                return true;

        head = tcp_rtx_queue_head(sk);
        return unlikely(head &&
                        (TCP_SKB_CB(head)->sacked & TCPCB_EVER_RETRANS));
}

/* Debug helper for the undo paths.
 *
 * When FASTRETRANS_DEBUG > 1, logs @msg via pr_debug() together with the
 * peer address and port, cwnd, left_out, ssthresh/prior_ssthresh and
 * packets_out. The AF_INET6 branch is only built when CONFIG_IPV6 is
 * enabled. With FASTRETRANS_DEBUG <= 1 the body is empty.
 */
static void DBGUNDO(struct sock *sk, const char *msg)
{
#if FASTRETRANS_DEBUG > 1
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_sock *inet = inet_sk(sk);

        if (sk->sk_family == AF_INET) {
                pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
                         msg,
                         &inet->inet_daddr, ntohs(inet->inet_dport),
                         tcp_snd_cwnd(tp), tcp_left_out(tp),
                         tp->snd_ssthresh, tp->prior_ssthresh,
                         tp->packets_out);
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (sk->sk_family == AF_INET6) {
                pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
                         msg,
                         &sk->sk_v6_daddr, ntohs(inet->inet_dport),
                         tcp_snd_cwnd(tp), tcp_left_out(tp),
                         tp->snd_ssthresh, tp->prior_ssthresh,
                         tp->packets_out);
        }
#endif
#endif
}

/* Revert a congestion window reduction.
 *
 * @unmark_loss: also clear TCPCB_LOST on every skb in the retransmit
 *               queue, zero lost_out and drop the retransmit hints.
 *
 * When prior_ssthresh is set, cwnd is restored from the congestion
 * control module's undo_cwnd() and ssthresh is raised back to
 * prior_ssthresh if it is currently lower. The undo marker is cleared.
 */
static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;

        if (unmark_loss) {
                skb_rbtree_walk(skb, &sk->tcp_rtx_queue)
                        TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;

                tp->lost_out = 0;
                tcp_clear_all_retrans_hints(tp);
        }

        if (tp->prior_ssthresh) {
                tcp_snd_cwnd_set(tp, inet_csk(sk)->icsk_ca_ops->undo_cwnd(sk));

                if (tp->snd_ssthresh < tp->prior_ssthresh) {
                        tp->snd_ssthresh = tp->prior_ssthresh;
                        tcp_ecn_withdraw_cwr(tp);
                }
        }

        tp->snd_cwnd_stamp = tcp_jiffies32;
        tp->undo_marker = 0;
        tp->rack.advanced = 1; /* Force RACK to re-exam losses */
}

static inline bool tcp_may_undo(const struct tcp_sock *tp)
{
        return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
}

/* Returns true when a Reno (non-SACK) connection has snd_una exactly at
 * high_seq and therefore must not return to Open yet. In that case
 * retrans_stamp is cleared if no retransmission is outstanding.
 */
static bool tcp_is_non_sack_preventing_reopen(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (tp->snd_una != tp->high_seq || !tcp_is_reno(tp))
                return false;

        /* Hold old state until something *above* high_seq
         * is ACKed. For Reno it is MUST to prevent false
         * fast retransmits (RFC2582). SACK TCP is safe.
         */
        if (!tcp_any_retrans_done(sk))
                tp->retrans_stamp = 0;

        return true;
}

/* People celebrate: "We love our President!"
 *
 * Called to try undoing the cwnd reduction when leaving recovery.
 * Returns true when a Reno connection must hold its current state
 * (see tcp_is_non_sack_preventing_reopen()); otherwise moves the socket
 * to TCP_CA_Open, clears is_sack_reneg and returns false.
 */
static bool tcp_try_undo_recovery(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (!tcp_may_undo(tp)) {
                if (tp->rack.reo_wnd_persist)
                        tp->rack.reo_wnd_persist--;
        } else {
                int mib_idx;

                /* Happy end! We did not retransmit anything
                 * or our original transmission succeeded.
                 */
                DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
                tcp_undo_cwnd_reduction(sk, false);

                mib_idx = inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ?
                          LINUX_MIB_TCPLOSSUNDO : LINUX_MIB_TCPFULLUNDO;
                NET_INC_STATS(sock_net(sk), mib_idx);
        }

        if (tcp_is_non_sack_preventing_reopen(sk))
                return true;

        tcp_set_ca_state(sk, TCP_CA_Open);
        tp->is_sack_reneg = 0;
        return false;
}

/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data.
 *
 * Succeeds only when an undo marker is set and undo_retrans has reached
 * zero; reo_wnd_persist is then bumped (capped at
 * TCP_RACK_RECOVERY_THRESH). Returns true if the reduction was undone.
 */
static bool tcp_try_undo_dsack(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (!tp->undo_marker || tp->undo_retrans)
                return false;

        tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
                                       tp->rack.reo_wnd_persist + 1);
        DBGUNDO(sk, "D-SACK");
        tcp_undo_cwnd_reduction(sk, false);
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
        return true;
}

/* Undo during loss recovery after partial ACK or using F-RTO.
 *
 * @frto_undo: force the undo (spurious RTO detected by F-RTO).
 *
 * Returns true when the cwnd reduction was undone. The socket goes back
 * to TCP_CA_Open unless Reno must hold its state, and only for F-RTO
 * undo or SACK connections.
 */
static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (!frto_undo && !tcp_may_undo(tp))
                return false;

        tcp_undo_cwnd_reduction(sk, true);

        DBGUNDO(sk, "partial loss");
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
        if (frto_undo)
                NET_INC_STATS(sock_net(sk),
                                LINUX_MIB_TCPSPURIOUSRTOS);
        inet_csk(sk)->icsk_retransmits = 0;

        if (!tcp_is_non_sack_preventing_reopen(sk) &&
            (frto_undo || tcp_is_sack(tp))) {
                tcp_set_ca_state(sk, TCP_CA_Open);
                tp->is_sack_reneg = 0;
        }
        return true;
}

/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.
 * It computes the number of packets to send (sndcnt) based on packets newly
 * delivered:
 *   1) If the packets in flight is larger than ssthresh, PRR spreads the
 *        cwnd reductions across a full RTT.
 *   2) Otherwise PRR uses packet conservation to send as much as delivered.
 *      But when SND_UNA is acked without further losses,
 *      slow starts cwnd up to ssthresh to speed up the recovery.
 */
static void tcp_init_cwnd_reduction(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* The reduction covers everything sent so far */
        tp->high_seq = tp->snd_nxt;
        tp->tlp_high_seq = 0;
        tp->snd_cwnd_cnt = 0;
        /* Remember the cwnd in effect before the reduction starts */
        tp->prior_cwnd = tcp_snd_cwnd(tp);
        /* Restart the PRR delivered/sent counters */
        tp->prr_delivered = 0;
        tp->prr_out = 0;
        /* Ask the congestion control module for the new ssthresh */
        tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
        tcp_ecn_queue_cwr(tp);
}

/* Update cwnd during a cwnd reduction, based on newly delivered packets.
 *
 * @newly_acked_sacked: packets newly delivered; nothing is done when this
 *                      is <= 0
 * @newly_lost:         when non-zero, suppresses the extra +1 granted on
 *                      an ACK that advanced snd_una
 * @flag:               ACK flags; only FLAG_SND_UNA_ADVANCED is examined
 *
 * Sets cwnd to packets-in-flight plus the computed send quota. Also a
 * no-op (with a one-time warning) when tp->prior_cwnd is zero.
 */
void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int sndcnt = 0;
        /* Headroom below ssthresh; negative while in flight exceeds it */
        int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);

        if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
                return;

        tp->prr_delivered += newly_acked_sacked;
        if (delta < 0) {
                /* ceil(ssthresh * prr_delivered / prior_cwnd) - prr_out */
                u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
                               tp->prior_cwnd - 1;
                sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
        } else {
                sndcnt = max_t(int, tp->prr_delivered - tp->prr_out,
                               newly_acked_sacked);
                if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost)
                        sndcnt++;
                /* Never grow in flight beyond ssthresh */
                sndcnt = min(delta, sndcnt);
        }
        /* Force a fast retransmit upon entering fast recovery */
        sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
        tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + sndcnt);
}

static inline void tcp_end_cwnd_reduction(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (inet_csk(sk)->icsk_ca_ops->cong_control)
                return;

        /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
        if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
            (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) {
                tcp_snd_cwnd_set(tp, tp->snd_ssthresh);
                tp->snd_cwnd_stamp = tcp_jiffies32;
        }
        tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
}

/* Enter CWR state. Congestion is proven by ECN, so cwnd undo is disabled. */
void tcp_enter_cwr(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        tp->prior_ssthresh = 0;

        /* Only states numerically below CWR start a new reduction. */
        if (inet_csk(sk)->icsk_ca_state >= TCP_CA_CWR)
                return;

        tp->undo_marker = 0;
        tcp_init_cwnd_reduction(sk);
        tcp_set_ca_state(sk, TCP_CA_CWR);
}
EXPORT_SYMBOL(tcp_enter_cwr);

/* Pick Disorder when packets are left out or any retransmission was done,
 * Open otherwise. On a state change, also record snd_nxt as high_seq.
 */
static void tcp_try_keep_open(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int wanted;

        if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
                wanted = TCP_CA_Disorder;
        else
                wanted = TCP_CA_Open;

        if (inet_csk(sk)->icsk_ca_state == wanted)
                return;

        tcp_set_ca_state(sk, wanted);
        tp->high_seq = tp->snd_nxt;
}

static void tcp_try_to_open(struct sock *sk, int flag)
{
        struct tcp_sock *tp = tcp_sk(sk);

        tcp_verify_left_out(tp);

        if (!tcp_any_retrans_done(sk))
                tp->retrans_stamp = 0;

        if (flag & FLAG_ECE)
                tcp_enter_cwr(sk);

        if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
                tcp_try_keep_open(sk);
        }
}

/* MTU probe failed: lower the upper search bound to just under the probed
 * size, clear the in-progress probe and count the failure.
 */
static void tcp_mtup_probe_failed(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct net *net = sock_net(sk);

        icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
        icsk->icsk_mtup.probe_size = 0;

        NET_INC_STATS(net, LINUX_MIB_TCPMTUPFAIL);
}

static void tcp_mtup_probe_success(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        u64 val;

        tp->prior_ssthresh = tcp_current_ssthresh(sk);

        val = (u64)tcp_snd_cwnd(tp) * tcp_mss_to_mtu(sk, tp->mss_cache);
        do_div(val, icsk->icsk_mtup.probe_size);
        DEBUG_NET_WARN_ON_ONCE((u32)val != val);
        tcp_snd_cwnd_set(tp, max_t(u32, 1U, val));

        tp->snd_cwnd_cnt = 0;
        tp->snd_cwnd_stamp = tcp_jiffies32;
        tp->snd_ssthresh = tcp_current_ssthresh(sk);

        icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
        icsk->icsk_mtup.probe_size = 0;
        tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
}

/* Retransmit without going through the backoff mechanisms in tcp_timer.
 * Used for path MTU discovery. The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        int mss;

        /* tcp_send_syn_data() stores a fastopen SYN request as two separate
         * packets in the retransmit queue, so comparing the size of queued
         * frames against the MSS does not work for the SYN packet.
         *
         * Being here indicates a path MTU issue, so assume the fastopen
         * SYN was lost and mark every frame in the retransmit queue as
         * lost by using an MSS of -1. In every other case compare against
         * the current MSS.
         */
        if (!tp->syn_data || sk->sk_state != TCP_SYN_SENT)
                mss = tcp_current_mss(sk);
        else
                mss = -1;

        skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
                if (tcp_skb_seglen(skb) <= mss)
                        continue;
                tcp_mark_skb_lost(sk, skb);
        }

        tcp_clear_retrans_hints_partial(tp);

        /* Nothing was marked lost, so there is nothing to retransmit. */
        if (!tp->lost_out)
                return;

        if (tcp_is_reno(tp))
                tcp_limit_reno_sacked(tp);

        tcp_verify_left_out(tp);

        /* Don't muck with the congestion window here.
         * Reason is that we do not increase amount of _data_
         * in network, but units changed and effective
         * cwnd/ssthresh really reduced now.
         */
        if (icsk->icsk_ca_state != TCP_CA_Loss) {
                tp->high_seq = tp->snd_nxt;
                tp->snd_ssthresh = tcp_current_ssthresh(sk);
                tp->prior_ssthresh = 0;
                tp->undo_marker = 0;
                tcp_set_ca_state(sk, TCP_CA_Loss);
        }

        tcp_xmit_retransmit_queue(sk);
}
EXPORT_SYMBOL(tcp_simple_retransmit);

/* Enter the Recovery state. Counts the event (Reno vs SACK flavour),
 * initialises undo state and, unless a cwnd reduction is already in
 * progress, starts one. prior_ssthresh is only saved when @ece_ack is
 * false.
 */
void tcp_enter_recovery(struct sock *sk, bool ece_ack)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int mib_idx = tcp_is_reno(tp) ? LINUX_MIB_TCPRENORECOVERY :
                                        LINUX_MIB_TCPSACKRECOVERY;

        NET_INC_STATS(sock_net(sk), mib_idx);

        tp->prior_ssthresh = 0;
        tcp_init_undo(tp);

        if (!tcp_in_cwnd_reduction(sk)) {
                if (!ece_ack)
                        tp->prior_ssthresh = tcp_current_ssthresh(sk);
                tcp_init_cwnd_reduction(sk);
        }

        tcp_set_ca_state(sk, TCP_CA_Recovery);
}

/* If an RTO timestamp is pending, add the time elapsed since it to
 * total_rto_time and clear the timestamp.
 */
static void tcp_update_rto_time(struct tcp_sock *tp)
{
        if (!tp->rto_stamp)
                return;

        tp->total_rto_time += tcp_time_stamp_ms(tp) - tp->rto_stamp;
        tp->rto_stamp = 0;
}

/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
 * recovered or spurious. Otherwise retransmits more on partial ACKs.
 *
 * @sk:         socket being processed
 * @flag:       FLAG_* bits describing the ACK being processed
 * @num_dupack: duplicate-ACK count for this ACK; tested for non-zero and
 *              forwarded to tcp_add_reno_sack()
 * @rexmit:     output; set to REXMIT_NEW when F-RTO wants new data sent,
 *              or to REXMIT_LOST when falling back to conventional
 *              recovery. Left untouched on the early-return paths
 *              (successful undo, or all data up to high_seq acked).
 */
static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
                             int *rexmit)
{
        struct tcp_sock *tp = tcp_sk(sk);
        /* True once snd_una has reached (or passed) high_seq. */
        bool recovered = !before(tp->snd_una, tp->high_seq);

        /* Try a (non-F-RTO) loss undo when the ACK advanced snd_una or a
         * fastopen request socket is attached.
         */
        if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) &&
            tcp_try_undo_loss(sk, false))
                return;

        if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
                /* Step 3.b. A timeout is spurious if not all data are
                 * lost, i.e., never-retransmitted data are (s)acked.
                 */
                if ((flag & FLAG_ORIG_SACK_ACKED) &&
                    tcp_try_undo_loss(sk, true))
                        return;

                if (after(tp->snd_nxt, tp->high_seq)) {
                        /* New data was already sent beyond high_seq. */
                        if (flag & FLAG_DATA_SACKED || num_dupack)
                                tp->frto = 0; /* Step 3.a. loss was real */
                } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
                        tp->high_seq = tp->snd_nxt;
                        /* Step 2.b. Try send new data (but deferred until cwnd
                         * is updated in tcp_ack()). Otherwise fall back to
                         * the conventional recovery.
                         */
                        if (!tcp_write_queue_empty(sk) &&
                            after(tcp_wnd_end(tp), tp->snd_nxt)) {
                                *rexmit = REXMIT_NEW;
                                return;
                        }
                        /* No new data to send or no window for it. */
                        tp->frto = 0;
                }
        }

        if (recovered) {
                /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
                tcp_try_undo_recovery(sk);
                return;
        }
        if (tcp_is_reno(tp)) {
                /* A Reno DUPACK means new data in F-RTO step 2.b above are
                 * delivered. Lower inflight to clock out (re)transmissions.
                 */
                if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
                        tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
                else if (flag & FLAG_SND_UNA_ADVANCED)
                        tcp_reset_reno_sack(tp);
        }
        /* Conventional recovery: ask the caller to retransmit lost data. */
        *rexmit = REXMIT_LOST;
}

static bool tcp_force_fast_retransmit(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        return after(tcp_highest_sack_seq(tp),
                     tp->snd_una + tp->reordering * tp->mss_cache);
}

/* Undo during fast recovery after a partial ACK.
 *
 * Returns true only when the hole was filled by a delayed original packet
 * while retransmissions are still outstanding. In every other case it
 * returns false, after either undoing the cwnd reduction or updating
 * *do_lost.
 */
static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
                                 bool *do_lost)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (!tp->undo_marker || !tcp_packet_delayed(tp)) {
                /* Partial ACK arrived. Force fast retransmit. */
                *do_lost = tcp_force_fast_retransmit(sk);
                return false;
        }

        /* Plain luck! Hole is filled with a delayed packet, rather than
         * with a retransmit. Check reordering.
         */
        tcp_check_sack_reordering(sk, prior_snd_una, 1);

        /* We are getting evidence that the reordering degree is higher
         * than we realized. If there are no retransmits out then we
         * can undo. Otherwise we clock out new packets but do not
         * mark more packets lost or retransmit more.
         */
        if (tp->retrans_out)
                return true;

        if (!tcp_any_retrans_done(sk))
                tp->retrans_stamp = 0;

        DBGUNDO(sk, "partial recovery");
        tcp_undo_cwnd_reduction(sk, true);
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
        tcp_try_keep_open(sk);

        return false;
}

/* Run loss detection on a non-empty retransmit queue: NewReno marking for
 * Reno connections, RACK marking when RACK is in use. *ack_flag is updated
 * with the outcome of the RACK pass.
 */
static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 retrans_before;

        if (tcp_rtx_queue_empty(sk))
                return;

        if (unlikely(tcp_is_reno(tp))) {
                tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
                return;
        }

        if (!tcp_is_rack(sk))
                return;

        retrans_before = tp->retrans_out;

        if (tcp_rack_mark_lost(sk))
                *ack_flag &= ~FLAG_SET_XMIT_TIMER;

        /* retrans_out dropped during marking: retransmits were lost. */
        if (tp->retrans_out < retrans_before)
                *ack_flag |= FLAG_LOST_RETRANS;
}

/* Process an event, which can update packets-in-flight not trivially.
 * Main goal of this function is to calculate new estimate for left_out,
 * taking into account both packets sitting in receiver's buffer and
 * packets lost by network.
 *
 * Besides that it updates the congestion state when packet loss or ECN
 * is detected. But it does not reduce the cwnd, it is done by the
 * congestion control later.
 *
 * It does _not_ decide what to send, it is made in function
 * tcp_xmit_retransmit_queue().
 *
 * @sk:            socket being processed
 * @prior_snd_una: snd_una value handed to tcp_try_undo_partial()
 * @num_dupack:    duplicate-ACK count; non-zero requests loss marking
 * @ack_flag:      in/out FLAG_* bits. A snapshot is taken into the local
 *                 'flag' on entry; callees that receive @ack_flag may
 *                 modify it, and those changes are only visible through
 *                 *ack_flag, not through 'flag'.
 * @rexmit:        output; set to REXMIT_LOST when the function runs to
 *                 the end, or as decided by tcp_process_loss(). Untouched
 *                 on the other early-return paths.
 */
static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
                                  int num_dupack, int *ack_flag, int *rexmit)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int fast_rexmit = 0, flag = *ack_flag;
        bool ece_ack = flag & FLAG_ECE;
        /* Mark losses on a dupack, or on new SACKs that are far enough
         * ahead of snd_una (see tcp_force_fast_retransmit()).
         */
        bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
                                      tcp_force_fast_retransmit(sk));

        /* Nothing outstanding: a non-zero sacked_out is stale, clear it. */
        if (!tp->packets_out && tp->sacked_out)
                tp->sacked_out = 0;

        /* Now state machine starts.
         * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
        if (ece_ack)
                tp->prior_ssthresh = 0;

        /* B. In all the states check for reneging SACKs. */
        if (tcp_check_sack_reneging(sk, ack_flag))
                return;

        /* C. Check consistency of the current state. */
        tcp_verify_left_out(tp);

        /* D. Check state exit conditions. State can be terminated
         *    when high_seq is ACKed. */
        if (icsk->icsk_ca_state == TCP_CA_Open) {
                WARN_ON(tp->retrans_out != 0 && !tp->syn_data);
                tp->retrans_stamp = 0;
        } else if (!before(tp->snd_una, tp->high_seq)) {
                switch (icsk->icsk_ca_state) {
                case TCP_CA_CWR:
                        /* CWR is to be held something *above* high_seq
                         * is ACKed for CWR bit to reach receiver. */
                        if (tp->snd_una != tp->high_seq) {
                                tcp_end_cwnd_reduction(sk);
                                tcp_set_ca_state(sk, TCP_CA_Open);
                        }
                        break;

                case TCP_CA_Recovery:
                        if (tcp_is_reno(tp))
                                tcp_reset_reno_sack(tp);
                        if (tcp_try_undo_recovery(sk))
                                return;
                        tcp_end_cwnd_reduction(sk);
                        break;
                }
        }

        /* E. Process state. */
        switch (icsk->icsk_ca_state) {
        case TCP_CA_Recovery:
                if (!(flag & FLAG_SND_UNA_ADVANCED)) {
                        /* Not a cumulative advance: count it as a Reno
                         * dupack when SACK is not in use.
                         */
                        if (tcp_is_reno(tp))
                                tcp_add_reno_sack(sk, num_dupack, ece_ack);
                } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
                        return;

                if (tcp_try_undo_dsack(sk))
                        tcp_try_keep_open(sk);

                tcp_identify_packet_loss(sk, ack_flag);
                /* The undo attempts above may have left Recovery. */
                if (icsk->icsk_ca_state != TCP_CA_Recovery) {
                        if (!tcp_time_to_recover(sk, flag))
                                return;
                        /* Undo reverts the recovery state. If loss is evident,
                         * starts a new recovery (e.g. reordering then loss);
                         */
                        tcp_enter_recovery(sk, ece_ack);
                }
                break;
        case TCP_CA_Loss:
                tcp_process_loss(sk, flag, num_dupack, rexmit);
                /* Loss state was left: account the time spent in RTO. */
                if (icsk->icsk_ca_state != TCP_CA_Loss)
                        tcp_update_rto_time(tp);
                tcp_identify_packet_loss(sk, ack_flag);
                /* Note: tests *ack_flag, not the entry snapshot 'flag',
                 * so FLAG_LOST_RETRANS set just above is seen here.
                 */
                if (!(icsk->icsk_ca_state == TCP_CA_Open ||
                      (*ack_flag & FLAG_LOST_RETRANS)))
                        return;
                /* Change state if cwnd is undone or retransmits are lost */
                fallthrough;
        default:
                if (tcp_is_reno(tp)) {
                        if (flag & FLAG_SND_UNA_ADVANCED)
                                tcp_reset_reno_sack(tp);
                        tcp_add_reno_sack(sk, num_dupack, ece_ack);
                }

                if (icsk->icsk_ca_state <= TCP_CA_Disorder)
                        tcp_try_undo_dsack(sk);

                tcp_identify_packet_loss(sk, ack_flag);
                if (!tcp_time_to_recover(sk, flag)) {
                        tcp_try_to_open(sk, flag);
                        return;
                }

                /* MTU probe failure: don't reduce cwnd */
                if (icsk->icsk_ca_state < TCP_CA_CWR &&
                    icsk->icsk_mtup.probe_size &&
                    tp->snd_una == tp->mtu_probe.probe_seq_start) {
                        tcp_mtup_probe_failed(sk);
                        /* Restores the reduction we did in tcp_mtup_probe() */
                        tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
                        tcp_simple_retransmit(sk);
                        return;
                }

                /* Otherwise enter Recovery state */
                tcp_enter_recovery(sk, ece_ack);
                fast_rexmit = 1;
        }

        /* Without RACK, loss marking is done by the scoreboard update. */
        if (!tcp_is_rack(sk) && do_lost)
                tcp_update_scoreboard(sk, fast_rexmit);
        *rexmit = REXMIT_LOST;
}

/* Feed an RTT sample into the windowed min-RTT filter. The window length is
 * sysctl_tcp_min_rtt_wlen multiplied by HZ; a zero sample is replaced by
 * jiffies_to_usecs(1).
 */
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
{
        u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ;
        struct tcp_sock *tp = tcp_sk(sk);
        u32 sample;

        /* If the remote keeps returning delayed ACKs, eventually the min
         * filter would pick it up and overestimate the prop. delay when
         * it expires. Skip suspected delayed ACKs.
         */
        if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp))
                return;

        sample = rtt_us ? rtt_us : jiffies_to_usecs(1);
        minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, sample);
}

/* Choose an RTT sample for this ACK and, if one is available, update the
 * min-RTT filter, the RTT estimator and the RTO.
 *
 * rs->rtt_us is always written (possibly with a negative value).
 * Returns true when an RTT sample was taken, false otherwise.
 */
static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
                               long seq_rtt_us, long sack_rtt_us,
                               long ca_rtt_us, struct rate_sample *rs)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        /* Prefer RTT measured from ACK's timing to TS-ECR. This is because
         * broken middle-boxes or peers may corrupt TS-ECR fields. But
         * Karn's algorithm forbids taking RTT if some retransmitted data
         * is acked (RFC6298).
         */
        if (seq_rtt_us < 0) {
                seq_rtt_us = sack_rtt_us;

                /* RTTM Rule: A TSecr value received in a segment is used
                 * to update the averaged RTT measurement only if the
                 * segment acknowledges some new data, i.e., only if it
                 * advances the left edge of the send window.
                 * See draft-ietf-tcplw-high-performance-00, section 3.3.
                 */
                if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp &&
                    tp->rx_opt.rcv_tsecr && (flag & FLAG_ACKED)) {
                        ca_rtt_us = tcp_rtt_tsopt_us(tp);
                        seq_rtt_us = ca_rtt_us;
                }
        }

        rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
        if (seq_rtt_us < 0)
                return false;

        /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
         * always taken together with ACK, SACK, or TS-opts. Any negative
         * values will be skipped with the seq_rtt_us < 0 check above.
         */
        tcp_update_rtt_min(sk, ca_rtt_us, flag);
        tcp_rtt_estimator(sk, seq_rtt_us);
        tcp_set_rto(sk);

        /* RFC6298: only reset backoff on valid RTT measurement. */
        inet_csk(sk)->icsk_backoff = 0;

        return true;
}

/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
{
        struct rate_sample rs;
        long rtt_us = -1L;

        if (req && !req->num_retrans && tcp_rsk(req)->snt_synack)
                rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack);

        tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs);
}


/* Invoke the congestion control module's cong_avoid() hook and stamp the
 * time of this cwnd update.
 */
static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
        struct tcp_sock *tp = tcp_sk(sk);

        inet_csk(sk)->icsk_ca_ops->cong_avoid(sk, ack, acked);
        tp->snd_cwnd_stamp = tcp_jiffies32;
}

/* Restart timer after forward progress on connection.
 * RFC2988 recommends to restart timer to now+rto.
 */
void tcp_rearm_rto(struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);

        /* If the retrans timer is currently being used by Fast Open
         * for SYN-ACK retrans purpose, stay put.
         */
        if (rcu_access_pointer(tp->fastopen_rsk))
                return;

        if (!tp->packets_out) {
                inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
        } else {
                u32 rto = inet_csk(sk)->icsk_rto;
                /* Offset the time elapsed after installing regular RTO */
                if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
                    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
                        s64 delta_us = tcp_rto_delta_us(sk);
                        /* delta_us may not be positive if the socket is locked
                         * when the retrans timer fires and is rescheduled.
                         */
                        rto = usecs_to_jiffies(max_t(int, delta_us, 1));
                }
                tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
                                     TCP_RTO_MAX);
        }
}

/* Prefer a loss probe timer; fall back to re-arming the RTO when a probe
 * could not be scheduled.
 */
static void tcp_set_xmit_timer(struct sock *sk)
{
        if (tcp_schedule_loss_probe(sk, true))
                return;

        tcp_rearm_rto(sk);
}

/* Called when a TSO packet has been only partially acked. Trims the acked
 * head off @skb and returns the number of packets that were removed, or 0
 * when trimming failed or removed no whole packet.
 */
static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 pcount_before;
        u32 acked;

        BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));

        pcount_before = tcp_skb_pcount(skb);
        if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
                return 0;

        acked = pcount_before - tcp_skb_pcount(skb);
        if (!acked)
                return 0;

        /* Something was trimmed: the remainder must still be a valid,
         * non-empty segment.
         */
        BUG_ON(tcp_skb_pcount(skb) == 0);
        BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));

        return acked;
}

/* Emit an SCM_TSTAMP_ACK timestamp for @skb when it asked for one and its
 * tskey lies in [prior_snd_una, snd_una), i.e. was covered by this ACK.
 */
static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
                           const struct sk_buff *ack_skb, u32 prior_snd_una)
{
        const struct skb_shared_info *shinfo;

        /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
        if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
                return;

        shinfo = skb_shinfo(skb);

        /* tskey outside the newly acked range: nothing to report. */
        if (before(shinfo->tskey, prior_snd_una) ||
            !before(shinfo->tskey, tcp_sk(sk)->snd_una))
                return;

        tcp_skb_tsorted_save(skb) {
                __skb_tstamp_tx(skb, ack_skb, NULL, sk, SCM_TSTAMP_ACK);
        } tcp_skb_tsorted_restore(skb);
}

/* Remove acknowledged frames from the retransmission queue. If our packet
 * is before the ack sequence we can discard it as it's confirmed to have
 * arrived at the other end.
 *
 * @sk:            socket being processed
 * @ack_skb:       the incoming ACK; only forwarded to tcp_ack_tstamp()
 * @prior_fack:    compared against the lowest acked, never-retransmitted,
 *                 un-SACKed sequence to detect reordering
 * @prior_snd_una: snd_una before this ACK was applied
 * @sack:          SACK tagging state; its rate sample and first/last SACK
 *                 timestamps are consumed here
 * @ece_ack:       forwarded to tcp_count_delivered() and
 *                 tcp_remove_reno_sacks()
 *
 * Returns the FLAG_* bits accumulated while walking the queue.
 */
static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
                               u32 prior_fack, u32 prior_snd_una,
                               struct tcp_sacktag_state *sack, bool ece_ack)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        /* Send timestamps of the first and last acked skb that was neither
         * retransmitted nor previously SACKed. last_ackt is only assigned
         * together with first_ackt, so it is only read when first_ackt != 0.
         */
        u64 first_ackt, last_ackt;
        struct tcp_sock *tp = tcp_sk(sk);
        u32 prior_sacked = tp->sacked_out;
        u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */
        struct sk_buff *skb, *next;
        bool fully_acked = true;
        long sack_rtt_us = -1L;
        long seq_rtt_us = -1L;
        long ca_rtt_us = -1L;
        u32 pkts_acked = 0;
        bool rtt_update;
        int flag = 0;

        first_ackt = 0;

        for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
                struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
                const u32 start_seq = scb->seq;
                u8 sacked = scb->sacked;
                u32 acked_pcount;

                /* Determine how many packets and what bytes were acked, tso and else */
                if (after(scb->end_seq, tp->snd_una)) {
                        /* skb not fully covered by snd_una: stop unless it
                         * is a multi-packet skb whose head was acked.
                         */
                        if (tcp_skb_pcount(skb) == 1 ||
                            !after(tp->snd_una, scb->seq))
                                break;

                        acked_pcount = tcp_tso_acked(sk, skb);
                        if (!acked_pcount)
                                break;
                        fully_acked = false;
                } else {
                        acked_pcount = tcp_skb_pcount(skb);
                }

                if (unlikely(sacked & TCPCB_RETRANS)) {
                        if (sacked & TCPCB_SACKED_RETRANS)
                                tp->retrans_out -= acked_pcount;
                        flag |= FLAG_RETRANS_DATA_ACKED;
                } else if (!(sacked & TCPCB_SACKED_ACKED)) {
                        last_ackt = tcp_skb_timestamp_us(skb);
                        WARN_ON_ONCE(last_ackt == 0);
                        if (!first_ackt)
                                first_ackt = last_ackt;

                        if (before(start_seq, reord))
                                reord = start_seq;
                        if (!after(scb->end_seq, tp->high_seq))
                                flag |= FLAG_ORIG_SACK_ACKED;
                }

                if (sacked & TCPCB_SACKED_ACKED) {
                        tp->sacked_out -= acked_pcount;
                } else if (tcp_is_sack(tp)) {
                        tcp_count_delivered(tp, acked_pcount, ece_ack);
                        if (!tcp_skb_spurious_retrans(tp, skb))
                                tcp_rack_advance(tp, sacked, scb->end_seq,
                                                 tcp_skb_timestamp_us(skb));
                }
                if (sacked & TCPCB_LOST)
                        tp->lost_out -= acked_pcount;

                tp->packets_out -= acked_pcount;
                pkts_acked += acked_pcount;
                tcp_rate_skb_delivered(sk, skb, sack->rate);

                /* Initial outgoing SYN's get put onto the write_queue
                 * just like anything else we transmit.  It is not
                 * true data, and if we misinform our callers that
                 * this ACK acks real data, we will erroneously exit
                 * connection startup slow start one packet too
                 * quickly.  This is severely frowned upon behavior.
                 */
                if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
                        flag |= FLAG_DATA_ACKED;
                } else {
                        flag |= FLAG_SYN_ACKED;
                        tp->retrans_stamp = 0;
                }

                /* Partially acked skb stays in the queue; stop here. */
                if (!fully_acked)
                        break;

                tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);

                /* Fetch the successor before the skb is unlinked and freed,
                 * and drop any cached hints that point at it.
                 */
                next = skb_rb_next(skb);
                if (unlikely(skb == tp->retransmit_skb_hint))
                        tp->retransmit_skb_hint = NULL;
                if (unlikely(skb == tp->lost_skb_hint))
                        tp->lost_skb_hint = NULL;
                tcp_highest_sack_replace(sk, skb, next);
                tcp_rtx_queue_unlink_and_free(skb, sk);
        }

        /* skb == NULL here means the walk consumed the whole queue. */
        if (!skb)
                tcp_chrono_stop(sk, TCP_CHRONO_BUSY);

        if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
                tp->snd_up = tp->snd_una;

        if (skb) {
                /* skb is the first segment left in the queue. If it is
                 * already marked SACKed, flag possible SACK reneging.
                 */
                tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
                if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
                        flag |= FLAG_SACK_RENEGING;
        }

        /* RTT from cumulatively acked data is only usable when no
         * retransmitted data was acked.
         */
        if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
                seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
                ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);

                if (pkts_acked == 1 && fully_acked && !prior_sacked &&
                    (tp->snd_una - prior_snd_una) < tp->mss_cache &&
                    sack->rate->prior_delivered + 1 == tp->delivered &&
                    !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) {
                        /* Conservatively mark a delayed ACK. It's typically
                         * from a lone runt packet over the round trip to
                         * a receiver w/o out-of-order or CE events.
                         */
                        flag |= FLAG_ACK_MAYBE_DELAYED;
                }
        }
        /* SACK timestamps, when present, override ca_rtt_us. */
        if (sack->first_sackt) {
                sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
                ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt);
        }
        rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
                                        ca_rtt_us, sack->rate);

        if (flag & FLAG_ACKED) {
                flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
                /* An in-progress MTU probe whose end is now acked succeeded. */
                if (unlikely(icsk->icsk_mtup.probe_size &&
                             !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
                        tcp_mtup_probe_success(sk);
                }

                if (tcp_is_reno(tp)) {
                        tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);

                        /* If any of the cumulatively ACKed segments was
                         * retransmitted, non-SACK case cannot confirm that
                         * progress was due to original transmission due to
                         * lack of TCPCB_SACKED_ACKED bits even if some of
                         * the packets may have been never retransmitted.
                         */
                        if (flag & FLAG_RETRANS_DATA_ACKED)
                                flag &= ~FLAG_ORIG_SACK_ACKED;
                } else {
                        int delta;

                        /* Non-retransmitted hole got filled? That's reordering */
                        if (before(reord, prior_fack))
                                tcp_check_sack_reordering(sk, reord, 0);

                        /* Shrink lost_cnt_hint by the number of SACKed
                         * packets that were just removed, without letting
                         * it go below zero.
                         */
                        delta = prior_sacked - tp->sacked_out;
                        tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
                }
        } else if (skb && rtt_update && sack_rtt_us >= 0 &&
                   sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
                                                    tcp_skb_timestamp_us(skb))) {
                /* Do not re-arm RTO if the sack RTT is measured from data sent
                 * after when the head was last (re)transmitted. Otherwise the
                 * timeout may continue to extend in loss recovery.
                 */
                flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
        }

        /* Give the congestion control module a per-ACK sample if it asks. */
        if (icsk->icsk_ca_ops->pkts_acked) {
                struct ack_sample sample = { .pkts_acked = pkts_acked,
                                             .rtt_us = sack->rate->rtt_us };

                sample.in_flight = tp->mss_cache *
                        (tp->delivered - sack->rate->prior_delivered);
                icsk->icsk_ca_ops->pkts_acked(sk, &sample);
        }

#if FASTRETRANS_DEBUG > 0
        /* Debug-only consistency checks: counters must not go negative, and
         * with nothing outstanding (SACK case) they must all be zero; any
         * leftover is logged and reset.
         */
        WARN_ON((int)tp->sacked_out < 0);
        WARN_ON((int)tp->lost_out < 0);
        WARN_ON((int)tp->retrans_out < 0);
        if (!tp->packets_out && tcp_is_sack(tp)) {
                icsk = inet_csk(sk);
                if (tp->lost_out) {
                        pr_debug("Leak l=%u %d\n",
                                 tp->lost_out, icsk->icsk_ca_state);
                        tp->lost_out = 0;
                }
                if (tp->sacked_out) {
                        pr_debug("Leak s=%u %d\n",
                                 tp->sacked_out, icsk->icsk_ca_state);
                        tp->sacked_out = 0;
                }
                if (tp->retrans_out) {
                        pr_debug("Leak r=%u %d\n",
                                 tp->retrans_out, icsk->icsk_ca_state);
                        tp->retrans_out = 0;
                }
        }
#endif
        return flag;
}

/* Handle the zero-window probe timer after an ACK. With nothing at the send
 * head this is a no-op. If the head segment fits in the send window, the
 * probe timer and backoff are cleared; otherwise the probe timer is
 * re-armed.
 */
static void tcp_ack_probe(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct sk_buff *head = tcp_send_head(sk);
        const struct tcp_sock *tp = tcp_sk(sk);
        unsigned long when;

        /* Was it a usable window open? */
        if (!head)
                return;

        if (after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
                /* Head still does not fit: keep probing. */
                when = tcp_probe0_when(sk, TCP_RTO_MAX);
                when = tcp_clamp_probe0_to_user_timeout(sk, when);
                tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
                return;
        }

        icsk->icsk_backoff = 0;
        icsk->icsk_probes_tstamp = 0;
        inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
        /* Socket must be waked up by subsequent tcp_data_snd_check().
         * This function is not for random using!
         */
}

static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
{
        return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
                inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
}

/* Decide whether to run the increase function of congestion control. */
static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
        /* With low reordering stay conservative and only grow cwnd on
         * in-order delivery (RFC5681). Once the measured reordering exceeds
         * the sysctl threshold, grow cwnd whenever data is delivered,
         * regardless of its ordering. A stretched ACK w/ new SACK or ECE
         * mark may first advance cwnd here and later reduce cwnd in
         * tcp_fastretrans_alert() based on more states.
         */
        if (tcp_sk(sk)->reordering <=
            READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering))
                return flag & FLAG_DATA_ACKED;

        return flag & FLAG_FORWARD_PROGRESS;
}

/* The "ultimate" congestion control function that aims to replace the rigid
 * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
 * It's called toward the end of processing an ACK with precise rate
 * information. All transmission or retransmission are delayed afterwards.
 *
 * When the congestion control module supplies a cong_control hook, that
 * hook takes over completely; otherwise the built-in reduction/avoidance
 * logic runs and the pacing rate is refreshed.
 */
static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
                             int flag, const struct rate_sample *rs)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);

        if (icsk->icsk_ca_ops->cong_control) {
                icsk->icsk_ca_ops->cong_control(sk, ack, flag, rs);
        } else {
                if (tcp_in_cwnd_reduction(sk)) {
                        /* State mandates shrinking cwnd */
                        tcp_cwnd_reduction(sk, acked_sacked, rs->losses, flag);
                } else if (tcp_may_raise_cwnd(sk, flag)) {
                        /* State allows growing cwnd */
                        tcp_cong_avoid(sk, ack, acked_sacked);
                }
                tcp_update_pacing_rate(sk);
        }
}

/* Check that window update is acceptable.
 * The function assumes that snd_una<=ack<=snd_next.
 */
static inline bool tcp_may_update_window(const struct tcp_sock *tp,
                                        const u32 ack, const u32 ack_seq,
                                        const u32 nwin)
{
        return        after(ack, tp->snd_una) ||
                after(ack_seq, tp->snd_wl1) ||
                (ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd || !nwin));
}

/* TCP-AO: bump the send-side SNE counter when the new ACK value compares
 * numerically below the current snd_una (plain unsigned comparison).
 * Compiles to nothing without CONFIG_TCP_AO.
 */
static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack)
{
#ifdef CONFIG_TCP_AO
        struct tcp_ao_info *ao;

        if (!static_branch_unlikely(&tcp_ao_needed.key))
                return;

        ao = rcu_dereference_protected(tp->ao_info,
                                       lockdep_sock_is_held((struct sock *)tp));
        if (!ao || ack >= tp->snd_una)
                return;

        ao->snd_sne++;
        trace_tcp_ao_snd_sne_update((struct sock *)tp, ao->snd_sne);
#endif
}

/* Advance tp->snd_una to @ack, keeping tp->bytes_acked in step.
 * The SNE helper must see the old snd_una, so it runs before the store.
 */
static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
{
        sock_owned_by_me((struct sock *)tp);

        /* 32-bit modular distance between the new and the old snd_una */
        tp->bytes_acked += (u32)(ack - tp->snd_una);

        tcp_snd_sne_update(tp, ack);
        tp->snd_una = ack;
}

/* TCP-AO: bump the receive-side SNE counter when the new sequence value
 * compares numerically below the current rcv_nxt (plain unsigned
 * comparison). Compiles to nothing without CONFIG_TCP_AO.
 */
static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq)
{
#ifdef CONFIG_TCP_AO
        struct tcp_ao_info *ao;

        if (!static_branch_unlikely(&tcp_ao_needed.key))
                return;

        ao = rcu_dereference_protected(tp->ao_info,
                                       lockdep_sock_is_held((struct sock *)tp));
        if (!ao || seq >= tp->rcv_nxt)
                return;

        ao->rcv_sne++;
        trace_tcp_ao_rcv_sne_update((struct sock *)tp, ao->rcv_sne);
#endif
}

/* Advance tp->rcv_nxt to @seq, keeping tp->bytes_received in step.
 * The SNE helper must see the old rcv_nxt, so it runs before the store.
 */
static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
{
        sock_owned_by_me((struct sock *)tp);

        /* 32-bit modular distance between the new and the old rcv_nxt */
        tp->bytes_received += (u32)(seq - tp->rcv_nxt);

        tcp_rcv_sne_update(tp, seq);
        WRITE_ONCE(tp->rcv_nxt, seq);
}

/* Update our send window.
 *
 * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
 * and in FreeBSD. NetBSD's one is even worse.) is wrong.
 *
 * Returns FLAG_WIN_UPDATE when the window update was accepted, 0 otherwise.
 * snd_una is advanced to @ack in either case.
 */
static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
                                 u32 ack_seq)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 nwin = ntohs(tcp_hdr(skb)->window);
        int flag = 0;

        /* Only non-SYN segments have the advertised window scaled. */
        if (likely(!tcp_hdr(skb)->syn))
                nwin <<= tp->rx_opt.snd_wscale;

        if (!tcp_may_update_window(tp, ack, ack_seq, nwin))
                goto update_una;

        flag = FLAG_WIN_UPDATE;
        tcp_update_wl(tp, ack_seq);

        /* Window size unchanged: nothing else to refresh. */
        if (tp->snd_wnd == nwin)
                goto update_una;

        tp->snd_wnd = nwin;

        /* Note, it is the only place, where
         * fast path is recovered for sending TCP.
         */
        tp->pred_flags = 0;
        tcp_fast_path_check(sk);

        if (!tcp_write_queue_empty(sk))
                tcp_slow_start_after_idle_check(sk);

        if (nwin > tp->max_window) {
                tp->max_window = nwin;
                tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
        }

update_una:
        tcp_snd_una_update(tp, ack);

        return flag;
}

/* Core of the out-of-window ACK rate limiter.
 *
 * Returns true (and bumps @mib_idx) when the previous stamp stored in
 * @last_oow_ack_time is younger than sysctl_tcp_invalid_ratelimit.
 * Otherwise records the current time and returns false.
 */
static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
                                   u32 *last_oow_ack_time)
{
        /* Paired with the WRITE_ONCE() in this function. */
        u32 stamp = READ_ONCE(*last_oow_ack_time);

        /* A zero stamp means no previous time was recorded. */
        if (stamp) {
                s32 age = (s32)(tcp_jiffies32 - stamp);

                if (age >= 0 &&
                    age < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) {
                        NET_INC_STATS(net, mib_idx);
                        /* rate-limited: don't send yet! */
                        return true;
                }
        }

        /* Paired with the prior READ_ONCE() and with itself,
         * as we might be lockless.
         */
        WRITE_ONCE(*last_oow_ack_time, tcp_jiffies32);

        /* not rate-limited: go ahead, send dupack now! */
        return false;
}

/* Return true if we're currently rate-limiting out-of-window ACKs and
 * thus shouldn't send a dupack right now. We rate-limit dupacks in
 * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
 * attacks that send repeated SYNs or ACKs for the same connection. To
 * do this, we do not send a duplicate SYNACK or ACK if the remote
 * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
 */
bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
                          int mib_idx, u32 *last_oow_ack_time)
{
        bool carries_payload = TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq;

        /* Data packets without SYNs are not likely part of an ACK loop. */
        if (carries_payload && !tcp_hdr(skb)->syn)
                return false;

        return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
}

/* RFC 5961 7 [ACK Throttling]
 *
 * Send a challenge ACK unless either the per-socket dupack limiter or the
 * per-netns challenge budget says to hold back. A limit of INT_MAX skips
 * the per-netns budget altogether.
 */
static void tcp_send_challenge_ack(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        u32 count, now, ack_limit;

        /* First check our per-socket dupack rate limit. */
        if (__tcp_oow_rate_limited(net,
                                   LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
                                   &tp->last_oow_ack_time))
                return;

        ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
        if (ack_limit != INT_MAX) {
                /* Then check host-wide RFC 5961 rate limit. */
                now = jiffies / HZ;
                if (now != READ_ONCE(net->ipv4.tcp_challenge_timestamp)) {
                        u32 half = (ack_limit + 1) >> 1;

                        /* New one-second slot: refill with a randomized budget. */
                        WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now);
                        WRITE_ONCE(net->ipv4.tcp_challenge_count,
                                   get_random_u32_inclusive(half, ack_limit + half - 1));
                }

                count = READ_ONCE(net->ipv4.tcp_challenge_count);
                if (!count)
                        return;

                WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1);
        }

        NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
        tcp_send_ack(sk);
}

/* Remember the most recently received TSval as ts_recent, together with
 * the time (in seconds) at which it was recorded.
 */
static void tcp_store_ts_recent(struct tcp_sock *tp)
{
        tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
        tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
}

/* Update ts_recent from the current segment when it carried a timestamp,
 * its sequence number @seq is not after rcv_wup, and the PAWS check passes.
 */
static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
{
        if (!tp->rx_opt.saw_tstamp || after(seq, tp->rcv_wup))
                return;

        /* PAWS bug workaround wrt. ACK frames, the PAWS discard
         * extra check below makes sure this can only happen
         * for pure ACK frames.  -DaveM
         *
         * Not only, also it occurs for expired timestamps.
         */
        if (tcp_paws_check(&tp->rx_opt, 0))
                tcp_store_ts_recent(tp);
}

/* This routine deals with acks during a TLP episode and ends an episode by
 * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
 */
static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* ACK does not reach the probe yet: the episode continues. */
        if (before(ack, tp->tlp_high_seq))
                return;

        /* Either the TLP carried new data that is now acknowledged, or a
         * DSACK shows that both the original and the TLP probe arrived:
         * no loss in both cases.
         */
        if (!tp->tlp_retrans || (flag & FLAG_DSACK_TLP)) {
                tp->tlp_high_seq = 0;
                return;
        }

        if (after(ack, tp->tlp_high_seq)) {
                /* ACK advances: there was a loss, so reduce cwnd. Reset
                 * tlp_high_seq in tcp_init_cwnd_reduction()
                 */
                tcp_init_cwnd_reduction(sk);
                tcp_set_ca_state(sk, TCP_CA_CWR);
                tcp_end_cwnd_reduction(sk);
                tcp_try_keep_open(sk);
                NET_INC_STATS(sock_net(sk),
                                LINUX_MIB_TCPLOSSPROBERECOVERY);
                return;
        }

        /* Pure dupack: original and TLP probe arrived; no loss */
        if (!(flag & (FLAG_SND_UNA_ADVANCED |
                      FLAG_NOT_DUP | FLAG_DATA_SACKED)))
                tp->tlp_high_seq = 0;
}

/* Forward an incoming-ACK event to the congestion control module, if it
 * registered an in_ack_event hook.
 */
static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);

        if (!icsk->icsk_ca_ops->in_ack_event)
                return;

        icsk->icsk_ca_ops->in_ack_event(sk, flags);
}

/* Congestion control has updated the cwnd already. So if we're in
 * loss recovery then now we do any new sends (for FRTO) or
 * retransmits (for CA_Loss or CA_recovery) that make sense.
 */
static void tcp_xmit_recovery(struct sock *sk, int rexmit)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (rexmit == REXMIT_NONE)
                return;

        if (sk->sk_state == TCP_SYN_SENT)
                return;

        if (unlikely(rexmit == REXMIT_NEW)) {
                /* Try to push new data first. */
                __tcp_push_pending_frames(sk, tcp_current_mss(sk),
                                          TCP_NAGLE_OFF);

                /* snd_nxt moved past high_seq: skip retransmitting. */
                if (after(tp->snd_nxt, tp->high_seq))
                        return;

                tp->frto = 0;
        }

        tcp_xmit_retransmit_queue(sk);
}

/* Returns the number of packets newly acked or sacked by the current ACK,
 * i.e. the growth of tp->delivered since @prior_delivered was sampled.
 * The TCPDELIVERED MIB counter (and TCPDELIVEREDCE when FLAG_ECE is set)
 * is increased by the same amount.
 */
static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
{
        const struct net *net = sock_net(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        u32 newly = tp->delivered - prior_delivered;

        NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, newly);

        if (flag & FLAG_ECE)
                NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, newly);

        return newly;
}

/* This routine deals with incoming acks, but not outgoing ones.
 *
 * @sk:   socket the segment was received on
 * @skb:  received segment carrying the ACK
 * @flag: FLAG_* bits supplied by the caller; more bits are accumulated
 *        while the ACK is processed
 *
 * Return values, as visible in this function:
 *   1  - the ACK was processed (normal path and the no_queue path)
 *   0  - the ACK was older than snd_una but still inside the tolerated
 *        range (old_ack path)
 *   <0 - negated SKB_DROP_REASON_* code: the ACK is too old or
 *        acknowledges data that was never sent
 */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_sacktag_state sack_state;
        struct rate_sample rs = { .prior_delivered = 0 };
        /* Snapshots of connection state taken before this ACK modifies it */
        u32 prior_snd_una = tp->snd_una;
        bool is_sack_reneg = tp->is_sack_reneg;
        u32 ack_seq = TCP_SKB_CB(skb)->seq;
        u32 ack = TCP_SKB_CB(skb)->ack_seq;
        int num_dupack = 0;
        int prior_packets = tp->packets_out;
        u32 delivered = tp->delivered;
        u32 lost = tp->lost;
        int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
        u32 prior_fack;

        sack_state.first_sackt = 0;
        sack_state.rate = &rs;
        sack_state.sack_delivered = 0;

        /* We very likely will need to access rtx queue. */
        prefetch(sk->tcp_rtx_queue.rb_node);

        /* If the ack is older than previous acks
         * then we can probably ignore it.
         */
        if (before(ack, prior_snd_una)) {
                u32 max_window;

                /* do not accept ACK for bytes we never sent. */
                max_window = min_t(u64, tp->max_window, tp->bytes_acked);
                /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
                if (before(ack, prior_snd_una - max_window)) {
                        if (!(flag & FLAG_NO_CHALLENGE_ACK))
                                tcp_send_challenge_ack(sk);
                        return -SKB_DROP_REASON_TCP_TOO_OLD_ACK;
                }
                goto old_ack;
        }

        /* If the ack includes data we haven't sent yet, discard
         * this segment (RFC793 Section 3.9).
         */
        if (after(ack, tp->snd_nxt))
                return -SKB_DROP_REASON_TCP_ACK_UNSENT_DATA;

        if (after(ack, prior_snd_una)) {
                flag |= FLAG_SND_UNA_ADVANCED;
                icsk->icsk_retransmits = 0;

#if IS_ENABLED(CONFIG_TLS_DEVICE)
                if (static_branch_unlikely(&clean_acked_data_enabled.key))
                        if (icsk->icsk_clean_acked)
                                icsk->icsk_clean_acked(sk, ack);
#endif
        }

        /* Sample the highest SACKed sequence (or snd_una without SACK) and
         * the packets in flight before SACK processing changes them.
         */
        prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
        rs.prior_in_flight = tcp_packets_in_flight(tp);

        /* ts_recent update must be made after we are sure that the packet
         * is in window.
         */
        if (flag & FLAG_UPDATE_TS_RECENT)
                tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);

        /* Taken only when snd_una advanced and FLAG_SLOWPATH is not set */
        if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
            FLAG_SND_UNA_ADVANCED) {
                /* Window is constant, pure forward advance.
                 * No more checks are required.
                 * Note, we use the fact that SND.UNA>=SND.WL2.
                 */
                tcp_update_wl(tp, ack_seq);
                tcp_snd_una_update(tp, ack);
                flag |= FLAG_WIN_UPDATE;

                tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);

                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
        } else {
                u32 ack_ev_flags = CA_ACK_SLOWPATH;

                /* seq != end_seq means the segment is not a pure ACK */
                if (ack_seq != TCP_SKB_CB(skb)->end_seq)
                        flag |= FLAG_DATA;
                else
                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);

                flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);

                if (TCP_SKB_CB(skb)->sacked)
                        flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
                                                        &sack_state);

                if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
                        flag |= FLAG_ECE;
                        ack_ev_flags |= CA_ACK_ECE;
                }

                if (sack_state.sack_delivered)
                        tcp_count_delivered(tp, sack_state.sack_delivered,
                                            flag & FLAG_ECE);

                if (flag & FLAG_WIN_UPDATE)
                        ack_ev_flags |= CA_ACK_WIN_UPDATE;

                tcp_in_ack_event(sk, ack_ev_flags);
        }

        /* This is a deviation from RFC3168 since it states that:
         * "When the TCP data sender is ready to set the CWR bit after reducing
         * the congestion window, it SHOULD set the CWR bit only on the first
         * new data packet that it transmits."
         * We accept CWR on pure ACKs to be more robust
         * with widely-deployed TCP implementations that do this.
         */
        tcp_ecn_accept_cwr(sk, skb);

        /* We passed data and got it acked, remove any soft error
         * log. Something worked...
         */
        WRITE_ONCE(sk->sk_err_soft, 0);
        icsk->icsk_probes_out = 0;
        tp->rcv_tstamp = tcp_jiffies32;
        /* Nothing was outstanding when this ACK arrived */
        if (!prior_packets)
                goto no_queue;

        /* See if we can take anything off of the retransmit queue. */
        flag |= tcp_clean_rtx_queue(sk, skb, prior_fack, prior_snd_una,
                                    &sack_state, flag & FLAG_ECE);

        tcp_rack_update_reo_wnd(sk, &rs);

        if (tp->tlp_high_seq)
                tcp_process_tlp_ack(sk, ack, flag);

        if (tcp_ack_is_dubious(sk, flag)) {
                /* Count a dupack only if snd_una did not advance and the ACK
                 * is neither flagged FLAG_NOT_DUP nor a DSACK.
                 */
                if (!(flag & (FLAG_SND_UNA_ADVANCED |
                              FLAG_NOT_DUP | FLAG_DSACKING_ACK))) {
                        num_dupack = 1;
                        /* Consider if pure acks were aggregated in tcp_add_backlog() */
                        if (!(flag & FLAG_DATA))
                                num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
                }
                tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
                                      &rexmit);
        }

        /* If needed, reset TLP/RTO timer when RACK doesn't set. */
        if (flag & FLAG_SET_XMIT_TIMER)
                tcp_set_xmit_timer(sk);

        if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
                sk_dst_confirm(sk);

        /* From here on 'delivered' and 'lost' hold per-ACK deltas rather
         * than the snapshots taken at function entry.
         */
        delivered = tcp_newly_delivered(sk, delivered, flag);
        lost = tp->lost - lost;                        /* freshly marked lost */
        rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
        tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
        tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
        tcp_xmit_recovery(sk, rexmit);
        return 1;

no_queue:
        /* If data was DSACKed, see if we can undo a cwnd reduction. */
        if (flag & FLAG_DSACKING_ACK) {
                tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
                                      &rexmit);
                tcp_newly_delivered(sk, delivered, flag);
        }
        /* If this ack opens up a zero window, clear backoff.  It was
         * being used to time the probes, and is probably far higher than
         * it needs to be for normal retransmission.
         */
        tcp_ack_probe(sk);

        if (tp->tlp_high_seq)
                tcp_process_tlp_ack(sk, ack, flag);
        return 1;

old_ack:
        /* If data was SACKed, tag it and see if we should send more data.
         * If data was DSACKed, see if we can undo a cwnd reduction.
         */
        if (TCP_SKB_CB(skb)->sacked) {
                flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
                                                &sack_state);
                tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
                                      &rexmit);
                tcp_newly_delivered(sk, delivered, flag);
                tcp_xmit_recovery(sk, rexmit);
        }

        return 0;
}

/* Record a Fast Open cookie option in @foc.
 *
 * @len is the cookie length. foc->len ends up as the cookie length when it
 * is within [TCP_FASTOPEN_COOKIE_MIN, TCP_FASTOPEN_COOKIE_MAX] (the cookie
 * bytes are copied), 0 for an empty cookie, and -1 for any other length.
 * Nothing is stored when @foc is NULL, the segment is not a SYN, or @len is
 * negative or odd.
 */
static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
                                      bool syn, struct tcp_fastopen_cookie *foc,
                                      bool exp_opt)
{
        /* Valid only in SYN or SYN-ACK with an even length.  */
        if (!foc || !syn || len < 0 || (len & 1))
                return;

        if (len < TCP_FASTOPEN_COOKIE_MIN || len > TCP_FASTOPEN_COOKIE_MAX) {
                /* Out-of-range length: only zero is kept as-is. */
                if (len != 0)
                        len = -1;
        } else {
                memcpy(foc->val, cookie, len);
        }

        foc->len = len;
        foc->exp = exp_opt;
}

/* Recognize the SMC experimental option on a SYN segment.
 *
 * Returns true and sets opt_rx->smc_ok when the option has an even size of
 * at least TCPOLEN_EXP_SMC_BASE and starts with TCPOPT_SMC_MAGIC. Always
 * returns false when CONFIG_SMC is disabled.
 */
static bool smc_parse_options(const struct tcphdr *th,
                              struct tcp_options_received *opt_rx,
                              const unsigned char *ptr,
                              int opsize)
{
#if IS_ENABLED(CONFIG_SMC)
        if (!static_branch_unlikely(&tcp_have_smc))
                return false;

        if (!th->syn || (opsize & 1) || opsize < TCPOLEN_EXP_SMC_BASE)
                return false;

        /* The magic is read only after the size check above passed. */
        if (get_unaligned_be32(ptr) != TCPOPT_SMC_MAGIC)
                return false;

        opt_rx->smc_ok = 1;
        return true;
#else
        return false;
#endif
}

/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
 * value on success.
 *
 * A non-zero @user_mss acts as an upper bound on the value taken from the
 * option. Parsing stops at EOL or at the first malformed option; whatever
 * MSS was found up to that point is returned.
 */
u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
{
        const unsigned char *ptr = (const unsigned char *)(th + 1);
        int length = (th->doff * 4) - sizeof(struct tcphdr);
        u16 mss = 0;

        while (length > 0) {
                int opcode = *ptr++;
                int opsize;

                if (opcode == TCPOPT_EOL)
                        return mss;

                if (opcode == TCPOPT_NOP) {        /* Ref: RFC 793 section 3.1 */
                        length--;
                        continue;
                }

                /* Every other option needs at least kind + length bytes. */
                if (length < 2)
                        return mss;

                opsize = *ptr++;
                /* Reject "silly options" and options running past the header */
                if (opsize < 2 || opsize > length)
                        return mss;

                if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
                        u16 in_mss = get_unaligned_be16(ptr);

                        if (in_mss) {
                                if (user_mss && user_mss < in_mss)
                                        in_mss = user_mss;
                                mss = in_mss;
                        }
                }

                ptr += opsize - 2;
                length -= opsize;
        }

        return mss;
}
EXPORT_SYMBOL_GPL(tcp_parse_mss_option);

/* Look for tcp options. Normally only called on SYN and SYNACK packets.
 * But, this can also be called on packets in the established flow when
 * the fast version below fails.
 *
 * @net:    network namespace whose sysctls gate window scaling,
 *          timestamps and SACK
 * @skb:    segment whose TCP header is parsed
 * @opt_rx: receives the parsed option values; saw_tstamp and saw_unknown
 *          are cleared on entry
 * @estab:  non-zero when called for an established flow; the MSS, window
 *          scale and SACK-permitted options are then ignored
 * @foc:    optional Fast Open cookie storage (may be NULL)
 *
 * Parsing stops silently at EOL or at the first malformed option.
 */
void tcp_parse_options(const struct net *net,
                       const struct sk_buff *skb,
                       struct tcp_options_received *opt_rx, int estab,
                       struct tcp_fastopen_cookie *foc)
{
        const unsigned char *ptr;
        const struct tcphdr *th = tcp_hdr(skb);
        /* Number of option bytes following the fixed TCP header */
        int length = (th->doff * 4) - sizeof(struct tcphdr);

        ptr = (const unsigned char *)(th + 1);
        opt_rx->saw_tstamp = 0;
        opt_rx->saw_unknown = 0;

        while (length > 0) {
                int opcode = *ptr++;
                int opsize;

                switch (opcode) {
                case TCPOPT_EOL:
                        return;
                case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
                        length--;
                        continue;
                default:
                        if (length < 2)
                                return;
                        opsize = *ptr++;
                        if (opsize < 2) /* "silly options" */
                                return;
                        if (opsize > length)
                                return;        /* don't parse partial options */
                        /* ptr now points at the option payload */
                        switch (opcode) {
                        case TCPOPT_MSS:
                                if (opsize == TCPOLEN_MSS && th->syn && !estab) {
                                        u16 in_mss = get_unaligned_be16(ptr);
                                        if (in_mss) {
                                                /* A non-zero user_mss caps the peer's value */
                                                if (opt_rx->user_mss &&
                                                    opt_rx->user_mss < in_mss)
                                                        in_mss = opt_rx->user_mss;
                                                opt_rx->mss_clamp = in_mss;
                                        }
                                }
                                break;
                        case TCPOPT_WINDOW:
                                if (opsize == TCPOLEN_WINDOW && th->syn &&
                                    !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) {
                                        __u8 snd_wscale = *(__u8 *)ptr;
                                        opt_rx->wscale_ok = 1;
                                        /* Clamp out-of-range shift counts instead of rejecting */
                                        if (snd_wscale > TCP_MAX_WSCALE) {
                                                net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
                                                                     __func__,
                                                                     snd_wscale,
                                                                     TCP_MAX_WSCALE);
                                                snd_wscale = TCP_MAX_WSCALE;
                                        }
                                        opt_rx->snd_wscale = snd_wscale;
                                }
                                break;
                        case TCPOPT_TIMESTAMP:
                                /* Accepted on established flows only if timestamps
                                 * were negotiated, otherwise only if the sysctl
                                 * enables them.
                                 */
                                if ((opsize == TCPOLEN_TIMESTAMP) &&
                                    ((estab && opt_rx->tstamp_ok) ||
                                     (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) {
                                        opt_rx->saw_tstamp = 1;
                                        opt_rx->rcv_tsval = get_unaligned_be32(ptr);
                                        opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
                                }
                                break;
                        case TCPOPT_SACK_PERM:
                                if (opsize == TCPOLEN_SACK_PERM && th->syn &&
                                    !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) {
                                        opt_rx->sack_ok = TCP_SACK_SEEN;
                                        tcp_sack_reset(opt_rx);
                                }
                                break;

                        case TCPOPT_SACK:
                                /* Needs at least one whole block, a whole number of
                                 * blocks, and SACK enabled. Stores the offset of the
                                 * option's kind byte relative to the TCP header.
                                 */
                                if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
                                   !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
                                   opt_rx->sack_ok) {
                                        TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
                                }
                                break;
#ifdef CONFIG_TCP_MD5SIG
                        case TCPOPT_MD5SIG:
                                /* The MD5 Hash has already been
                                 * checked (see tcp_v{4,6}_rcv()).
                                 */
                                break;
#endif
                        case TCPOPT_FASTOPEN:
                                tcp_parse_fastopen_option(
                                        opsize - TCPOLEN_FASTOPEN_BASE,
                                        ptr, th->syn, foc, false);
                                break;

                        case TCPOPT_EXP:
                                /* Fast Open option shares code 254 using a
                                 * 16 bits magic number.
                                 */
                                if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
                                    get_unaligned_be16(ptr) ==
                                    TCPOPT_FASTOPEN_MAGIC) {
                                        tcp_parse_fastopen_option(opsize -
                                                TCPOLEN_EXP_FASTOPEN_BASE,
                                                ptr + 2, th->syn, foc, true);
                                        break;
                                }

                                if (smc_parse_options(th, opt_rx, ptr, opsize))
                                        break;

                                /* Experimental option that nobody recognized */
                                opt_rx->saw_unknown = 1;
                                break;

                        default:
                                opt_rx->saw_unknown = 1;
                        }
                        /* Skip the payload; kind and length bytes were consumed above */
                        ptr += opsize-2;
                        length -= opsize;
                }
        }
}
EXPORT_SYMBOL(tcp_parse_options);

/* Parse the option area when it starts with the aligned timestamp layout
 * NOP, NOP, TIMESTAMP, length.
 *
 * On a match the TSval/TSecr pair is stored in tp->rx_opt (a non-zero
 * TSecr has tp->tsoffset subtracted) and true is returned; otherwise
 * nothing is modified and false is returned.
 */
static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
{
        const __be32 *opt = (const __be32 *)(th + 1);

        if (opt[0] != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
                            | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
                return false;

        tp->rx_opt.saw_tstamp = 1;
        tp->rx_opt.rcv_tsval = ntohl(opt[1]);

        /* A zero echo reply is kept as zero, without the offset applied. */
        if (opt[2])
                tp->rx_opt.rcv_tsecr = ntohl(opt[2]) - tp->tsoffset;
        else
                tp->rx_opt.rcv_tsecr = 0;

        return true;
}

/* Fast parse options. This hopes to only see timestamps.
 * If it is wrong it falls back on tcp_parse_options().
 *
 * Returns false when the header carries no options at all, true otherwise.
 */
static bool tcp_fast_parse_options(const struct net *net,
                                   const struct sk_buff *skb,
                                   const struct tcphdr *th, struct tcp_sock *tp)
{
        /* In the spirit of fast parsing, compare doff directly to constant
         * values.  Because equality is used, short doff can be ignored here.
         */
        if (th->doff == (sizeof(*th) / 4)) {
                tp->rx_opt.saw_tstamp = 0;
                return false;
        }

        /* Header sized exactly for one aligned timestamp option. */
        if (tp->rx_opt.tstamp_ok &&
            th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4) &&
            tcp_parse_aligned_timestamp(tp, th))
                return true;

        /* Slow path: full option walk, then apply our timestamp offset. */
        tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
        if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
                tp->rx_opt.rcv_tsecr -= tp->tsoffset;

        return true;
}

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/*
 * Parse Signature options
 *
 * Walks the TCP option area looking for the MD5 signature and TCP-AO
 * options. On return *md5_hash / *ao_hash point at the payload of the
 * matching option, or are NULL when that option is absent.
 *
 * Returns 0 on success, -EINVAL for a malformed option and -EEXIST when
 * more than one authentication option is present.
 */
int tcp_do_parse_auth_options(const struct tcphdr *th,
                              const u8 **md5_hash, const u8 **ao_hash)
{
        const u8 *ptr = (const u8 *)(th + 1);
        int length = (th->doff << 2) - sizeof(*th);
        unsigned int minlen = TCPOLEN_MD5SIG;

        if (IS_ENABLED(CONFIG_TCP_AO))
                minlen = sizeof(struct tcp_ao_hdr) + 1;

        *md5_hash = NULL;
        *ao_hash = NULL;

        /* If not enough data remaining, we can short cut */
        while (length >= minlen) {
                int opcode = *ptr++;
                int opsize;

                if (opcode == TCPOPT_EOL)
                        return 0;

                if (opcode == TCPOPT_NOP) {
                        length--;
                        continue;
                }

                opsize = *ptr++;
                if (opsize < 2 || opsize > length)
                        return -EINVAL;

                if (opcode == TCPOPT_MD5SIG) {
                        if (opsize != TCPOLEN_MD5SIG)
                                return -EINVAL;
                        /* Only one authentication option is tolerated */
                        if (unlikely(*md5_hash || *ao_hash))
                                return -EEXIST;
                        *md5_hash = ptr;
                } else if (opcode == TCPOPT_AO) {
                        if (opsize <= sizeof(struct tcp_ao_hdr))
                                return -EINVAL;
                        /* Only one authentication option is tolerated */
                        if (unlikely(*md5_hash || *ao_hash))
                                return -EEXIST;
                        *ao_hash = ptr;
                }

                /* Step over the payload of whatever option this was. */
                ptr += opsize - 2;
                length -= opsize;
        }

        return 0;
}
EXPORT_SYMBOL(tcp_do_parse_auth_options);
#endif

/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
 *
 * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
 * it can pass through stack. So, the following predicate verifies that
 * this segment is not used for anything but congestion avoidance or
 * fast retransmit. Moreover, we even are able to eliminate most of such
 * second order effects, if we apply some small "replay" window (~RTO)
 * to timestamp space.
 *
 * All these measures still do not guarantee that we reject wrapped ACKs
 * on networks with high bandwidth, where sequence space is recycled quickly,
 * but they do guarantee that such events will be very rare and will not
 * seriously affect the connection. This doesn't look nice, but alas, PAWS
 * is a really buggy extension.
 *
 * [ Later note. Even worse! It is buggy for segments _with_ data. RFC
 * states that events when retransmit arrives after original data are rare.
 * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
 * the biggest problem on large power networks even with minor reordering.
 * OK, let's give it small replay window. If peer clock is even 1hz, it is safe
 * up to bandwidth of 18Gigabit/sec. 8) ]
 */

/* Upper bound on how many times the remote peer's TSval may tick during
 * one replay window, the window being derived from our current RTO.
 */
static u32 tcp_tsval_replay(const struct sock *sk)
{
        u32 ticks;

        if (tcp_sk(sk)->tcp_usec_ts) {
                /* We run usec timestamps, so assume the peer does as well. */
                ticks = inet_csk(sk)->icsk_rto * (USEC_PER_SEC / HZ);
        } else {
                /* RFC 7323 recommends a TSval clock between 1ms and 1sec.
                 * Some systems (old linux included) are known to use 1200 Hz.
                 */
                ticks = inet_csk(sk)->icsk_rto * 1200 / HZ;
        }

        return ticks;
}

/* Returns non-zero when @skb is a duplicate pure ACK that neither changes
 * the window nor falls outside the timestamp replay window.
 */
static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        const struct tcphdr *th = tcp_hdr(skb);
        u32 seq = TCP_SKB_CB(skb)->seq;
        u32 ack = TCP_SKB_CB(skb)->ack_seq;

        /* 1. Must be a pure ACK carrying the expected sequence number. */
        if (!th->ack || seq != TCP_SKB_CB(skb)->end_seq || seq != tp->rcv_nxt)
                return 0;

        /* 2. ... which is a duplicate ACK. */
        if (ack != tp->snd_una)
                return 0;

        /* 3. ... which leaves the send window untouched. */
        if (tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale))
                return 0;

        /* 4. ... and whose timestamp lies inside the replay window. */
        return (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <=
               tcp_tsval_replay(sk);
}

static inline bool tcp_paws_discard(const struct sock *sk,
                                   const struct sk_buff *skb)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
               !tcp_disordered_ack(sk, skb);
}

/* Validate a segment's sequence numbers.
 *
 * Segment controls are considered valid if the segment fits into the
 * window once truncated to it. Acceptability of the data itself (and of
 * SYN/FIN) is verified separately; see tcp_data_queue(), for example.
 *
 * Controls (RST being the main one) are accepted against RCV.WUP rather
 * than RCV.NXT: while we are delaying an ACK the peer has not advanced its
 * SND.UNA yet, so that his SND.UNA <= our RCV.WUP.
 * (borrowed from freebsd)
 */

static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp,
                                         u32 seq, u32 end_seq)
{
        enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;

        if (before(end_seq, tp->rcv_wup))
                reason = SKB_DROP_REASON_TCP_OLD_SEQUENCE;
        else if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
                reason = SKB_DROP_REASON_TCP_INVALID_SEQUENCE;

        return reason;
}


/* Record @err on the socket, purge its write queue, finish the connection
 * via tcp_done() and, unless the socket is already dead, report the error.
 */
void tcp_done_with_error(struct sock *sk, int err)
{
        WRITE_ONCE(sk->sk_err, err);
        /* Pairs with the smp_rmb() in tcp_poll(). */
        smp_wmb();

        tcp_write_queue_purge(sk);
        tcp_done(sk);

        if (sock_flag(sk, SOCK_DEAD))
                return;

        sk_error_report(sk);
}
EXPORT_SYMBOL(tcp_done_with_error);

/* Handle a received reset: pick the errno matching the current state and
 * tear the connection down. Nothing is torn down in TCP_CLOSE.
 */
void tcp_reset(struct sock *sk, struct sk_buff *skb)
{
        const int state = sk->sk_state;
        int err = ECONNRESET;

        trace_tcp_receive_reset(sk);

        /* mptcp cannot ask us to ignore a reset packet, therefore the
         * return value of mptcp_incoming_options() is deliberately unused.
         */
        if (sk_is_mptcp(sk))
                mptcp_incoming_options(sk, skb);

        if (state == TCP_CLOSE)
                return;

        /* We want the right error as BSD sees it (and indeed as we do). */
        if (state == TCP_SYN_SENT)
                err = ECONNREFUSED;
        else if (state == TCP_CLOSE_WAIT)
                err = EPIPE;

        tcp_done_with_error(sk, err);
}

/*
 *        Process the FIN bit. The FIN takes effect only once it is validly
 *        part of the sequence space, never earlier while holes remain.
 *
 *        State transitions performed here:
 *          SYN-RECV / ESTABLISHED -> CLOSE-WAIT
 *          FIN-WAIT-1             -> CLOSING   (simultaneous close)
 *          FIN-WAIT-2             -> TIME-WAIT
 *        CLOSE-WAIT, CLOSING and LAST-ACK are left as they are.
 */
void tcp_fin(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int band;

        inet_csk_schedule_ack(sk);

        WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN);
        sock_set_flag(sk, SOCK_DONE);

        switch (sk->sk_state) {
        case TCP_SYN_RECV:
        case TCP_ESTABLISHED:
                tcp_set_state(sk, TCP_CLOSE_WAIT);
                inet_csk_enter_pingpong_mode(sk);
                break;

        case TCP_FIN_WAIT1:
                /* Simultaneous close: ack the received FIN and move on
                 * to the CLOSING state.
                 */
                tcp_send_ack(sk);
                tcp_set_state(sk, TCP_CLOSING);
                break;

        case TCP_FIN_WAIT2:
                /* Our FIN was already acked: ack theirs, go to TIME_WAIT. */
                tcp_send_ack(sk);
                tcp_time_wait(sk, TCP_TIME_WAIT, 0);
                break;

        case TCP_CLOSE_WAIT:
        case TCP_CLOSING:
                /* Retransmitted FIN, nothing to do. */
        case TCP_LAST_ACK:
                /* RFC793: Remain in the LAST-ACK state. */
                break;

        default:
                /* Only TCP_LISTEN and TCP_CLOSE remain; we are never
                 * supposed to get here in either of them.
                 */
                pr_err("%s: Impossible, sk->sk_state=%d\n",
                       __func__, sk->sk_state);
                break;
        }

        /* Out-of-order data queued _after_ the FIN is possible. Resetting
         * would probably be the right reaction; for now it is just dropped.
         */
        skb_rbtree_purge(&tp->out_of_order_queue);
        if (tcp_is_sack(tp))
                tcp_sack_reset(&tp->rx_opt);

        if (sock_flag(sk, SOCK_DEAD))
                return;

        sk->sk_state_change(sk);

        /* Do not send POLL_HUP for half duplex close. */
        if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
                band = POLL_HUP;
        else
                band = POLL_IN;

        sk_wake_async(sk, SOCK_WAKE_WAITD, band);
}

/* Grow SACK block @sp so that it also covers [@seq, @end_seq), provided the
 * two ranges overlap or touch. Returns true when @sp absorbed the range,
 * false (leaving @sp untouched) when the ranges are disjoint.
 */
static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
                                  u32 end_seq)
{
        /* Disjoint: the new range starts past @sp or ends before it. */
        if (after(seq, sp->end_seq) || after(sp->start_seq, end_seq))
                return false;

        if (before(seq, sp->start_seq))
                sp->start_seq = seq;

        if (after(end_seq, sp->end_seq))
                sp->end_seq = end_seq;

        return true;
}

/* Record [@seq, @end_seq) as the D-SACK block to report, when SACK is in
 * use and the tcp_dsack sysctl is enabled; bumps the matching MIB counter.
 */
static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int mib_idx;

        if (!tcp_is_sack(tp) ||
            !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack))
                return;

        mib_idx = before(seq, tp->rcv_nxt) ? LINUX_MIB_TCPDSACKOLDSENT
                                           : LINUX_MIB_TCPDSACKOFOSENT;
        NET_INC_STATS(sock_net(sk), mib_idx);

        tp->rx_opt.dsack = 1;
        tp->duplicate_sack[0].start_seq = seq;
        tp->duplicate_sack[0].end_seq = end_seq;
}

/* Widen the pending D-SACK block with [@seq, @end_seq), or create it when
 * no D-SACK is pending yet.
 */
static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (tp->rx_opt.dsack) {
                tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
                return;
        }

        tcp_dsack_set(sk, seq, end_seq);
}

static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
{
        /* If the ACK path fails or loses most ACKs, the sender keeps timing
         * out and spuriously retransmitting the same segment. A duplicate
         * data segment arriving with a new flowlabel suggests the sender hit
         * an RTO, i.e. our ACKs are probably not getting through. Unless we
         * are already repathing because of our own RTO, rehash the socket
         * so that our packets take another path.
         */
#if IS_ENABLED(CONFIG_IPV6)
        if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss &&
            skb->protocol == htons(ETH_P_IPV6)) {
                u32 flowlabel = ntohl(ip6_flowlabel(ipv6_hdr(skb)));

                if (tcp_sk(sk)->inet_conn.icsk_ack.lrcv_flowlabel != flowlabel &&
                    sk_rethink_txhash(sk))
                        NET_INC_STATS(sock_net(sk),
                                      LINUX_MIB_TCPDUPLICATEDATAREHASH);
        }

        /* Remember the flowlabel seen on this spurious retransmit. */
        tcp_save_lrcv_flowlabel(sk, skb);
#endif
}

/* Send an ACK in response to @skb. When @skb carries sequence space that
 * starts before RCV.NXT, also enter quickack mode and, if SACK and the
 * tcp_dsack sysctl allow it, set up a D-SACK for the duplicated part.
 */
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        const u32 seq = TCP_SKB_CB(skb)->seq;
        const u32 end_seq = TCP_SKB_CB(skb)->end_seq;

        if (end_seq != seq && before(seq, tp->rcv_nxt)) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
                tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);

                if (tcp_is_sack(tp) &&
                    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
                        u32 dsack_end;

                        tcp_rcv_spurious_retrans(sk, skb);

                        /* Report at most up to RCV.NXT as duplicated. */
                        dsack_end = after(end_seq, tp->rcv_nxt) ? tp->rcv_nxt
                                                                : end_seq;
                        tcp_dsack_set(sk, seq, dsack_end);
                }
        }

        tcp_send_ack(sk);
}

/* These routines update the SACK block as out-of-order packets arrive or
 * in-order packets close up the sequence space.
 */
/* After the first SACK block changed, fold into it every other block that
 * it now overlaps or touches, compacting the array as blocks disappear.
 */
static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
{
        struct tcp_sack_block *head = &tp->selective_acks[0];
        int idx = 1;

        while (idx < tp->rx_opt.num_sacks) {
                struct tcp_sack_block *cand = &head[idx];
                int i;

                if (!tcp_sack_extend(head, cand->start_seq, cand->end_seq)) {
                        /* Disjoint from the head, keep it and move on. */
                        idx++;
                        continue;
                }

                /* The head swallowed this block: close the gap by moving
                 * every later block up one slot, then re-examine the same
                 * index, which now holds the next candidate.
                 */
                tp->rx_opt.num_sacks--;
                for (i = idx; i < tp->rx_opt.num_sacks; i++)
                        head[i] = head[i + 1];
        }
}

/* Flush pending compressed ACKs: cancel the compression timer, account the
 * ACKs that were saved and send a single ACK now. No-op when nothing is
 * pending.
 */
void tcp_sack_compress_send_ack(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (!tp->compressed_ack)
                return;

        /* A successfully cancelled timer no longer needs its socket ref. */
        if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
                __sock_put(sk);

        /* One ACK is sent below after all, so subtract one from
         * tp->compressed_ack to keep LINUX_MIB_TCPACKCOMPRESSED accurate.
         */
        NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
                      tp->compressed_ack - 1);

        tp->compressed_ack = 0;
        tcp_send_ack(sk);
}

/* Reasonable number of SACK blocks included in a TCP SACK option.
 * The max is 4, but this becomes 3 if TCP timestamps are there.
 * Given that SACK packets might be lost, be conservative and use 2.
 */
#define TCP_SACK_BLOCKS_EXPECTED 2

/* Account a newly queued out-of-order range [@seq, @end_seq) in the SACK
 * block array of @sk.
 *
 * If the range overlaps or touches an existing block, that block is extended
 * and rotated to the front of the array. Otherwise a new block is built at
 * the front and the existing ones are shifted down, forgetting the last one
 * when the array is already full.
 */
static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_sack_block *sp = &tp->selective_acks[0];
        int cur_sacks = tp->rx_opt.num_sacks;
        int this_sack;

        /* No block yet: sp already points at slot 0, just fill it. */
        if (!cur_sacks)
                goto new_sack;

        for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
                if (tcp_sack_extend(sp, seq, end_seq)) {
                        /* The block that changed sits at or beyond the
                         * expected number of blocks: flush compressed ACKs.
                         */
                        if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
                                tcp_sack_compress_send_ack(sk);
                        /* Rotate this_sack to the first one. */
                        for (; this_sack > 0; this_sack--, sp--)
                                swap(*sp, *(sp - 1));
                        /* The grown head may now reach other blocks. */
                        if (cur_sacks > 1)
                                tcp_sack_maybe_coalesce(tp);
                        return;
                }
        }

        /* No block matched: this_sack == cur_sacks and sp points one slot
         * past the last block in use.
         */
        if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
                tcp_sack_compress_send_ack(sk);

        /* Could not find an adjacent existing SACK, build a new one,
         * put it at the front, and shift everyone else down.  We
         * always know there is at least one SACK present already here.
         *
         * If the sack array is full, forget about the last one.
         */
        if (this_sack >= TCP_NUM_SACKS) {
                this_sack--;
                tp->rx_opt.num_sacks--;
                sp--;
        }
        /* Shift each block down one slot, walking back towards slot 0;
         * sp ends up pointing at slot 0.
         */
        for (; this_sack > 0; this_sack--, sp--)
                *sp = *(sp - 1);

new_sack:
        /* Build the new head SACK, and we're done. */
        sp->start_seq = seq;
        sp->end_seq = end_seq;
        tp->rx_opt.num_sacks++;
}

/* RCV.NXT moved forward: discard every SACK block it now covers. */

static void tcp_sack_remove(struct tcp_sock *tp)
{
        int remaining = tp->rx_opt.num_sacks;
        int idx = 0;

        /* With an empty ofo queue there is nothing left to SACK at all. */
        if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
                tp->rx_opt.num_sacks = 0;
                return;
        }

        while (idx < remaining) {
                struct tcp_sack_block *blk = &tp->selective_acks[idx];
                int i;

                /* Block starts beyond RCV.NXT: still needed, keep it. */
                if (before(tp->rcv_nxt, blk->start_seq)) {
                        idx++;
                        continue;
                }

                /* RCV.NXT must cover all the block! */
                WARN_ON(before(tp->rcv_nxt, blk->end_seq));

                /* Drop the block by pulling the following ones forward;
                 * the same slot is then examined again.
                 */
                for (i = idx + 1; i < remaining; i++)
                        tp->selective_acks[i - 1] = tp->selective_acks[i];
                remaining--;
        }

        tp->rx_opt.num_sacks = remaining;
}

/**
 * tcp_try_coalesce - try to merge skb to prior one
 * @sk: socket
 * @to: prior buffer
 * @from: buffer to add in queue
 * @fragstolen: pointer to boolean
 *
 * Called before @from would be queued right after @to. Merging the two
 * keeps memory use and queue lengths down when it can be done cheaply;
 * skbs may sit in the ofo or receive queues for a long time, so doing it
 * now avoids collapsing them later.
 * Returns true if caller should free @from instead of queueing it
 */
static bool tcp_try_coalesce(struct sock *sk,
                             struct sk_buff *to,
                             struct sk_buff *from,
                             bool *fragstolen)
{
        struct tcp_skb_cb *to_cb = TCP_SKB_CB(to);
        const struct tcp_skb_cb *from_cb = TCP_SKB_CB(from);
        int delta;

        *fragstolen = false;

        /* @from must start exactly where @to ends (it may instead overlap
         * the prior segment), both skbs must be collapsible, and the
         * low-level merge itself has to succeed.
         */
        if (from_cb->seq != to_cb->end_seq ||
            !tcp_skb_can_collapse_rx(to, from) ||
            !skb_try_coalesce(to, from, fragstolen, &delta))
                return false;

        /* Charge the socket for the memory @to gained. */
        atomic_add(delta, &sk->sk_rmem_alloc);
        sk_mem_charge(sk, delta);
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);

        /* @to now also stands for @from's sequence range and flags. */
        to_cb->end_seq = from_cb->end_seq;
        to_cb->ack_seq = from_cb->ack_seq;
        to_cb->tcp_flags |= from_cb->tcp_flags;

        if (from_cb->has_rxtstamp) {
                to_cb->has_rxtstamp = true;
                to->tstamp = from->tstamp;
                skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp;
        }

        return true;
}

/* tcp_try_coalesce() variant for the out-of-order queue: on success it also
 * adds @from's segment count to @to->gso_segs, saturating at 0xFFFF.
 */
static bool tcp_ooo_try_coalesce(struct sock *sk,
                             struct sk_buff *to,
                             struct sk_buff *from,
                             bool *fragstolen)
{
        u32 segs;

        if (!tcp_try_coalesce(sk, to, from, fragstolen))
                return false;

        /* Keep to->gso_segs meaningful in case tcp_drop_reason() is called
         * on @to later; each skb counts as at least one segment.
         */
        segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
               max_t(u16, 1, skb_shinfo(from)->gso_segs);
        skb_shinfo(to)->gso_segs = min_t(u32, segs, 0xFFFF);

        return true;
}

/* Drop @skb on behalf of @sk: account it via sk_drops_add(), then hand it
 * to sk_skb_reason_drop() together with @reason.
 */
static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb,
                            enum skb_drop_reason reason)
{
        sk_drops_add(sk, skb);
        sk_skb_reason_drop(sk, skb, reason);
}

/* This one checks to see if we can put data from the
 * out_of_order queue into the receive_queue.
 *
 * Walks the out-of-order rbtree from its first node and moves every skb
 * that starts at or before RCV.NXT to the receive queue, advancing RCV.NXT
 * as it goes. Stops at the first skb that starts after RCV.NXT, or after
 * processing a FIN.
 */
static void tcp_ofo_queue(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        __u32 dsack_high = tp->rcv_nxt;
        bool fin, fragstolen, eaten;
        struct sk_buff *skb, *tail;
        struct rb_node *p;

        p = rb_first(&tp->out_of_order_queue);
        while (p) {
                skb = rb_to_skb(p);
                /* Still a hole in front of this skb: nothing more to move. */
                if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
                        break;

                /* skb starts before dsack_high: extend the D-SACK with
                 * [seq, dsack_high), using the value dsack_high had before
                 * it is possibly lowered to this skb's end_seq.
                 */
                if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
                        __u32 dsack = dsack_high;
                        if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
                                dsack_high = TCP_SKB_CB(skb)->end_seq;
                        tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
                }
                /* Fetch the successor before unlinking skb from the tree. */
                p = rb_next(p);
                rb_erase(&skb->rbnode, &tp->out_of_order_queue);

                /* skb ends at or before RCV.NXT, i.e. brings nothing new. */
                if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
                        tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_DROP);
                        continue;
                }

                tail = skb_peek_tail(&sk->sk_receive_queue);
                eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
                tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
                /* Read the FIN flag before skb is possibly freed below. */
                fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
                if (!eaten)
                        __skb_queue_tail(&sk->sk_receive_queue, skb);
                else
                        kfree_skb_partial(skb, fragstolen);

                if (unlikely(fin)) {
                        tcp_fin(sk);
                        /* tcp_fin() purges tp->out_of_order_queue,
                         * so we must end this loop right now.
                         */
                        break;
                }
        }
}

static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);

/* Try to reserve @size bytes of receive memory for @skb, pruning the
 * socket's queues when the reservation cannot be granted right away.
 * Returns 0 once the memory is scheduled, -1 when pruning did not help.
 */
static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
                                 unsigned int size)
{
        /* Fast path: within sk_rcvbuf and the reservation succeeds. */
        if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
            sk_rmem_schedule(sk, skb, size))
                return 0;

        if (tcp_prune_queue(sk, skb) < 0)
                return -1;

        /* Keep pruning the ofo queue until the reservation succeeds or
         * tcp_prune_ofo_queue() reports that it could not prune.
         */
        while (!sk_rmem_schedule(sk, skb, size)) {
                if (!tcp_prune_ofo_queue(sk, skb))
                        return -1;
        }

        return 0;
}

/* Queue @skb, which arrived out of order, into tp->out_of_order_queue.
 *
 * The skb is dropped when receive memory cannot be scheduled. Otherwise it
 * is coalesced into an existing skb, or inserted into the rbtree at its
 * sequence position, with fully covered neighbours dropped on the way.
 * SACK and D-SACK state is updated to reflect the result.
 */
static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct rb_node **p, *parent;
        struct sk_buff *skb1;
        u32 seq, end_seq;
        bool fragstolen;

        tcp_save_lrcv_flowlabel(sk, skb);
        tcp_ecn_check_ce(sk, skb);

        if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
                sk->sk_data_ready(sk);
                tcp_drop_reason(sk, skb, SKB_DROP_REASON_PROTO_MEM);
                return;
        }

        /* Disable header prediction. */
        tp->pred_flags = 0;
        inet_csk_schedule_ack(sk);

        /* Count at least one packet per skb. */
        tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
        seq = TCP_SKB_CB(skb)->seq;
        end_seq = TCP_SKB_CB(skb)->end_seq;

        p = &tp->out_of_order_queue.rb_node;
        if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
                /* Initial out of order segment, build 1 SACK. */
                if (tcp_is_sack(tp)) {
                        tp->rx_opt.num_sacks = 1;
                        tp->selective_acks[0].start_seq = seq;
                        tp->selective_acks[0].end_seq = end_seq;
                }
                rb_link_node(&skb->rbnode, NULL, p);
                rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
                tp->ooo_last_skb = skb;
                goto end;
        }

        /* In the typical case, we are adding an skb to the end of the list.
         * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
         */
        if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
                                 skb, &fragstolen)) {
coalesce_done:
                /* For non sack flows, do not grow window to force DUPACK
                 * and trigger fast retransmit.
                 */
                if (tcp_is_sack(tp))
                        tcp_grow_window(sk, skb, true);
                kfree_skb_partial(skb, fragstolen);
                /* skb == NULL tells the "end" label there is nothing left
                 * to take ownership of.
                 */
                skb = NULL;
                goto add_sack;
        }
        /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
        if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
                parent = &tp->ooo_last_skb->rbnode;
                p = &parent->rb_right;
                goto insert;
        }

        /* Find place to insert this segment. Handle overlaps on the way. */
        parent = NULL;
        while (*p) {
                parent = *p;
                skb1 = rb_to_skb(parent);
                if (before(seq, TCP_SKB_CB(skb1)->seq)) {
                        p = &parent->rb_left;
                        continue;
                }
                /* From here on seq is at or after skb1's seq. */
                if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
                        if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
                                /* All the bits are present. Drop. */
                                NET_INC_STATS(sock_net(sk),
                                              LINUX_MIB_TCPOFOMERGE);
                                tcp_drop_reason(sk, skb,
                                                SKB_DROP_REASON_TCP_OFOMERGE);
                                skb = NULL;
                                tcp_dsack_set(sk, seq, end_seq);
                                goto add_sack;
                        }
                        if (after(seq, TCP_SKB_CB(skb1)->seq)) {
                                /* Partial overlap. */
                                tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
                        } else {
                                /* skb's seq == skb1's seq and skb covers skb1.
                                 * Replace skb1 with skb.
                                 */
                                rb_replace_node(&skb1->rbnode, &skb->rbnode,
                                                &tp->out_of_order_queue);
                                tcp_dsack_extend(sk,
                                                 TCP_SKB_CB(skb1)->seq,
                                                 TCP_SKB_CB(skb1)->end_seq);
                                NET_INC_STATS(sock_net(sk),
                                              LINUX_MIB_TCPOFOMERGE);
                                tcp_drop_reason(sk, skb1,
                                                SKB_DROP_REASON_TCP_OFOMERGE);
                                goto merge_right;
                        }
                } else if (tcp_ooo_try_coalesce(sk, skb1,
                                                skb, &fragstolen)) {
                        goto coalesce_done;
                }
                p = &parent->rb_right;
        }
insert:
        /* Insert segment into RB tree. */
        rb_link_node(&skb->rbnode, parent, p);
        rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);

merge_right:
        /* Remove other segments covered by skb. */
        while ((skb1 = skb_rb_next(skb)) != NULL) {
                /* Next skb starts at or after our end: no overlap left. */
                if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
                        break;
                /* Next skb is only partially covered: report the
                 * overlapping part as D-SACK and keep it queued.
                 */
                if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
                        tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
                                         end_seq);
                        break;
                }
                rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
                tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
                                 TCP_SKB_CB(skb1)->end_seq);
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
                tcp_drop_reason(sk, skb1, SKB_DROP_REASON_TCP_OFOMERGE);
        }
        /* If there is no skb after us, we are the last_skb ! */
        if (!skb1)
                tp->ooo_last_skb = skb;

add_sack:
        if (tcp_is_sack(tp))
                tcp_sack_new_ofo_skb(sk, seq, end_seq);
end:
        /* skb is non-NULL only when it was linked into the rbtree. */
        if (skb) {
                /* For non sack flows, do not grow window to force DUPACK
                 * and trigger fast retransmit.
                 */
                if (tcp_is_sack(tp))
                        tcp_grow_window(sk, skb, false);
                skb_condense(skb);
                skb_set_owner_r(skb, sk);
        }
}

/* Append @skb to the receive queue of @sk, first trying to merge it into
 * the current tail, and advance RCV.NXT to the end of @skb.
 * Returns 1 when @skb was merged into the tail (the caller then has to
 * free it), 0 when it was queued and is now owned by the socket.
 */
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
                                      bool *fragstolen)
{
        struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
        int eaten = 0;

        if (tail && tcp_try_coalesce(sk, tail, skb, fragstolen))
                eaten = 1;

        tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);

        if (eaten)
                return eaten;

        __skb_queue_tail(&sk->sk_receive_queue, skb);
        skb_set_owner_r(skb, sk);
        return 0;
}

/* Copy up to @size bytes from @msg into a freshly allocated skb and place
 * it on the receive queue of @sk as if it had arrived at RCV.NXT.
 * Returns the number of bytes queued (possibly fewer than requested),
 * 0 for an empty request, or a negative errno on failure.
 */
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
{
        struct sk_buff *skb;
        size_t linear_len;
        bool fragstolen;
        int data_len = 0;
        int err = -ENOMEM;

        if (!size)
                return 0;

        /* Larger requests put whole pages into frags (capped at
         * MAX_SKB_FRAGS pages) and only the sub-page remainder into the
         * linear area; @size shrinks accordingly.
         */
        if (size > PAGE_SIZE) {
                int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);

                data_len = npages << PAGE_SHIFT;
                size = data_len + (size & ~PAGE_MASK);
        }
        linear_len = size - data_len;

        skb = alloc_skb_with_frags(linear_len, data_len,
                                   PAGE_ALLOC_COSTLY_ORDER,
                                   &err, sk->sk_allocation);
        if (!skb)
                goto err;

        skb_put(skb, linear_len);
        skb->data_len = data_len;
        skb->len = size;

        if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
                goto err_free;
        }

        err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
        if (err)
                goto err_free;

        TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
        TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;

        /* A non-zero return means the data was merged into the queue tail
         * and the skb itself is ours to free.
         */
        if (tcp_queue_rcv(sk, skb, &fragstolen)) {
                WARN_ON_ONCE(fragstolen); /* should not happen */
                __kfree_skb(skb);
        }
        return size;

err_free:
        kfree_skb(skb);
err:
        return err;

}

/* Invoke the socket's sk_data_ready callback when tcp_epollin_ready()
 * holds for sk_rcvlowat or the socket is flagged SOCK_DONE.
 */
void tcp_data_ready(struct sock *sk)
{
        if (!tcp_epollin_ready(sk, sk->sk_rcvlowat) &&
            !sock_flag(sk, SOCK_DONE))
                return;

        sk->sk_data_ready(sk);
}

/* Queue the payload of an incoming segment for @sk.
 *
 * Dispatch is driven by where the segment's sequence range falls relative
 * to tp->rcv_nxt and the current receive window:
 *  - seq == rcv_nxt: queued in order through tcp_queue_rcv(), after which
 *    FIN handling, out-of-order queue draining and SACK cleanup run;
 *  - end_seq not after rcv_nxt: old data, D-SACK recorded and skb dropped;
 *  - seq at or beyond rcv_nxt + window: dropped as over-window;
 *  - seq < rcv_nxt < end_seq: partial overlap, D-SACK recorded for the head
 *    and the skb is queued through the in-order path;
 *  - otherwise: handed to tcp_data_queue_ofo().
 *
 * The skb is always consumed: queued, merged, dropped or freed.
 */
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        enum skb_drop_reason reason;
        bool fragstolen;
        int eaten;

        /* If a subflow has been reset, the packet should not continue
         * to be processed, drop the packet.
         */
        if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) {
                __kfree_skb(skb);
                return;
        }

        /* Segment occupies no sequence space: nothing to queue. */
        if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
                __kfree_skb(skb);
                return;
        }
        skb_dst_drop(skb);
        /* Strip the TCP header; doff counts 32-bit words. */
        __skb_pull(skb, tcp_hdr(skb)->doff * 4);

        reason = SKB_DROP_REASON_NOT_SPECIFIED;
        tp->rx_opt.dsack = 0;

        /*  Queue data for delivery to the user.
         *  Packets in sequence go to the receive queue.
         *  Out of sequence packets to the out_of_order_queue.
         */
        if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
                if (tcp_receive_window(tp) == 0) {
                        /* Some stacks are known to send bare FIN packets
                         * in a loop even if we send RWIN 0 in our ACK.
                         * Accepting this FIN does not hurt memory pressure
                         * because the FIN flag will simply be merged to the
                         * receive queue tail skb in most cases.
                         */
                        if (!skb->len &&
                            (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
                                goto queue_and_out;

                        reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
                        goto out_of_window;
                }

                /* Ok. In sequence. In window. */
queue_and_out:
                /* Also reached by goto for a bare FIN at zero window and for
                 * a partially overlapping segment (see below).
                 */
                if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
                        /* TODO: maybe ratelimit these WIN 0 ACK ? */
                        inet_csk(sk)->icsk_ack.pending |=
                                        (ICSK_ACK_NOMEM | ICSK_ACK_NOW);
                        inet_csk_schedule_ack(sk);
                        sk->sk_data_ready(sk);

                        /* Drop only when the receive queue already holds
                         * something and this skb carries payload; otherwise
                         * the memory is charged unconditionally below.
                         */
                        if (skb_queue_len(&sk->sk_receive_queue) && skb->len) {
                                reason = SKB_DROP_REASON_PROTO_MEM;
                                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
                                goto drop;
                        }
                        sk_forced_mem_schedule(sk, skb->truesize);
                }

                eaten = tcp_queue_rcv(sk, skb, &fragstolen);
                if (skb->len)
                        tcp_event_data_recv(sk, skb);
                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                        tcp_fin(sk);

                if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
                        tcp_ofo_queue(sk);

                        /* RFC5681. 4.2. SHOULD send immediate ACK, when
                         * gap in queue is filled.
                         */
                        if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
                                inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
                }

                if (tp->rx_opt.num_sacks)
                        tcp_sack_remove(tp);

                tcp_fast_path_check(sk);

                /* A positive return from tcp_queue_rcv() means the skb was
                 * not linked into the receive queue itself, so release it
                 * here (honouring fragstolen).
                 */
                if (eaten > 0)
                        kfree_skb_partial(skb, fragstolen);
                if (!sock_flag(sk, SOCK_DEAD))
                        tcp_data_ready(sk);
                return;
        }

        /* Whole segment lies at or before rcv_nxt: nothing new in it. */
        if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
                tcp_rcv_spurious_retrans(sk, skb);
                /* A retransmit, 2nd most common case.  Force an immediate ack. */
                reason = SKB_DROP_REASON_TCP_OLD_DATA;
                NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
                tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);

out_of_window:
                /* Shared exit for every "not acceptable" case: ACK quickly,
                 * then drop with the reason chosen by the jumping branch.
                 */
                tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
                inet_csk_schedule_ack(sk);
drop:
                tcp_drop_reason(sk, skb, reason);
                return;
        }

        /* Out of window. F.e. zero window probe. */
        if (!before(TCP_SKB_CB(skb)->seq,
                    tp->rcv_nxt + tcp_receive_window(tp))) {
                reason = SKB_DROP_REASON_TCP_OVERWINDOW;
                goto out_of_window;
        }

        if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
                /* Partial packet, seq < rcv_next < end_seq */
                tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);

                /* If window is closed, drop tail of packet. But after
                 * remembering D-SACK for its head made in previous line.
                 */
                if (!tcp_receive_window(tp)) {
                        reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
                        goto out_of_window;
                }
                goto queue_and_out;
        }

        /* Starts after rcv_nxt but inside the window: out of order. */
        tcp_data_queue_ofo(sk, skb);
}

/* Return the skb following @skb, or NULL when there is none.
 * With a non-NULL @list the skb is treated as a member of that list;
 * with a NULL @list it is treated as an rb-tree node.
 */
static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
{
        if (!list)
                return skb_rb_next(skb);

        if (skb_queue_is_last(list, skb))
                return NULL;

        return skb->next;
}

/* Remove @skb from its container (@list when non-NULL, otherwise the
 * rb-tree at @root), free it and bump LINUX_MIB_TCPRCVCOLLAPSED.
 *
 * Returns the skb that followed @skb before removal, or NULL.
 */
static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
                                        struct sk_buff_head *list,
                                        struct rb_root *root)
{
        struct sk_buff *successor;

        /* The successor must be looked up while @skb is still linked. */
        successor = tcp_skb_next(skb, list);

        if (!list)
                rb_erase(&skb->rbnode, root);
        else
                __skb_unlink(skb, list);

        __kfree_skb(skb);
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);

        return successor;
}

/* Link @skb into the rb-tree rooted at @root, keyed by TCP_SKB_CB(skb)->seq.
 * Descend left when the new seq is before the visited node's seq and right
 * otherwise, so an equal seq ends up to the right of the existing node.
 */
void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
{
        struct rb_node **link = &root->rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct sk_buff *cur;

                parent = *link;
                cur = rb_to_skb(parent);
                link = before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(cur)->seq) ?
                        &parent->rb_left : &parent->rb_right;
        }

        rb_link_node(&skb->rbnode, parent, link);
        rb_insert_color(&skb->rbnode, root);
}

/* Collapse contiguous sequence of skbs head..tail with
 * sequence numbers start..end.
 *
 * @sk:    owning socket
 * @list:  linked queue holding the skbs, or NULL when they live in @root
 * @root:  rb-tree holding the skbs (used when @list is NULL)
 * @head:  first skb to examine
 * @tail:  first skb NOT to touch; NULL means until the end of the queue
 * @start: first sequence number that still has to be kept
 * @end:   sequence number at which copying stops
 *
 * Phase one walks the skbs, freeing those with no bytes at or after @start
 * and skipping those not worth copying, until it finds a starting point.
 * Phase two copies the payload into freshly allocated skbs and frees the
 * originals as they are consumed.
 *
 * Segments with FIN/SYN are not collapsed (only because this
 * simplifies code)
 */
static void
tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
             struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
{
        struct sk_buff *skb = head, *n;
        struct sk_buff_head tmp;
        bool end_of_skbs;

        /* First, check that queue is collapsible and find
         * the point where collapsing can be useful.
         */
restart:
        for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
                n = tcp_skb_next(skb, list);

                /* No new bits? It is possible on ofo queue. */
                if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
                        /* Free it and rescan from its successor. */
                        skb = tcp_collapse_one(sk, skb, list, root);
                        if (!skb)
                                break;
                        goto restart;
                }

                /* The first skb to collapse is:
                 * - not SYN/FIN and
                 * - bloated or contains data before "start" or
                 *   overlaps to the next one and mptcp allow collapsing.
                 */
                if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
                    (tcp_win_from_space(sk, skb->truesize) > skb->len ||
                     before(TCP_SKB_CB(skb)->seq, start))) {
                        end_of_skbs = false;
                        break;
                }

                /* Next skb does not start exactly where this one ends. */
                if (n && n != tail && tcp_skb_can_collapse_rx(skb, n) &&
                    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
                        end_of_skbs = false;
                        break;
                }

                /* Decided to skip this, advance start seq. */
                start = TCP_SKB_CB(skb)->end_seq;
        }
        /* Nothing to collapse, or the candidate carries SYN/FIN. */
        if (end_of_skbs ||
            (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
                return;

        __skb_queue_head_init(&tmp);

        while (before(start, end)) {
                /* Each replacement skb holds at most SKB_MAX_ORDER(0, 0) bytes. */
                int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
                struct sk_buff *nskb;

                nskb = alloc_skb(copy, GFP_ATOMIC);
                if (!nskb)
                        break;

                /* Inherit the control block, then restart it as an empty
                 * range beginning at "start".
                 */
                memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
                skb_copy_decrypted(nskb, skb);
                TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
                if (list)
                        __skb_queue_before(list, skb, nskb);
                else
                        __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
                skb_set_owner_r(nskb, sk);
                mptcp_skb_ext_move(nskb, skb);

                /* Copy data, releasing collapsed skbs. */
                while (copy > 0) {
                        int offset = start - TCP_SKB_CB(skb)->seq;
                        int size = TCP_SKB_CB(skb)->end_seq - start;

                        BUG_ON(offset < 0);
                        if (size > 0) {
                                size = min(copy, size);
                                if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
                                        BUG();
                                TCP_SKB_CB(nskb)->end_seq += size;
                                copy -= size;
                                start += size;
                        }
                        /* Source skb fully consumed: free it and stop when the
                         * successor is missing, is the tail, may not be
                         * merged with nskb, or carries SYN/FIN.
                         */
                        if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
                                skb = tcp_collapse_one(sk, skb, list, root);
                                if (!skb ||
                                    skb == tail ||
                                    !tcp_skb_can_collapse_rx(nskb, skb) ||
                                    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
                                        goto end;
                        }
                }
        }
end:
        /* Insert the deferred skbs; tmp is only populated when @list is NULL. */
        skb_queue_walk_safe(&tmp, skb, n)
                tcp_rbtree_insert(root, skb);
}

/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
 * and tcp_collapse() them until all the queue is collapsed.
 *
 * A range consisting of a single skb that spans less than
 * SKB_WITH_OVERHEAD(PAGE_SIZE) bytes is not collapsed; the truesize of such
 * ranges is accumulated, and once the sum exceeds sk_rcvbuf >> 3 the walk
 * stops early.
 *
 * When the walk reaches the end of the tree, tp->ooo_last_skb is refreshed.
 */
static void tcp_collapse_ofo_queue(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 range_truesize, sum_tiny = 0;
        struct sk_buff *skb, *head;
        u32 start, end;

        skb = skb_rb_first(&tp->out_of_order_queue);
new_range:
        /* skb is the first skb of the next range, or NULL at the end. */
        if (!skb) {
                tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
                return;
        }
        start = TCP_SKB_CB(skb)->seq;
        end = TCP_SKB_CB(skb)->end_seq;
        range_truesize = skb->truesize;

        for (head = skb;;) {
                skb = skb_rb_next(skb);

                /* Range is terminated when we see a gap or when
                 * we are at the queue end.
                 */
                if (!skb ||
                    after(TCP_SKB_CB(skb)->seq, end) ||
                    before(TCP_SKB_CB(skb)->end_seq, start)) {
                        /* Do not attempt collapsing tiny skbs */
                        if (range_truesize != head->truesize ||
                            end - start >= SKB_WITH_OVERHEAD(PAGE_SIZE)) {
                                tcp_collapse(sk, NULL, &tp->out_of_order_queue,
                                             head, skb, start, end);
                        } else {
                                sum_tiny += range_truesize;
                                if (sum_tiny > sk->sk_rcvbuf >> 3)
                                        return;
                        }
                        goto new_range;
                }

                /* skb extends the current range: grow its bounds. */
                range_truesize += skb->truesize;
                if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
                        start = TCP_SKB_CB(skb)->seq;
                if (after(TCP_SKB_CB(skb)->end_seq, end))
                        end = TCP_SKB_CB(skb)->end_seq;
        }
}

/*
 * Clean the out-of-order queue to make room.
 * We drop the packets with the highest sequence numbers in order to:
 * 1) Give holes a chance to be filled.
 *    This means we do not drop packets from the ooo queue if their sequence
 *    is before the incoming packet's sequence.
 * 2) Not add too much latency if thousands of packets sit there.
 *    (But if the application shrinks SO_RCVBUF, we could still end up
 *     freeing the whole queue here.)
 * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
 *
 * @sk:     socket whose out-of-order queue is pruned
 * @in_skb: incoming skb; pruning stops at the first queued skb whose seq
 *          is before @in_skb's seq
 *
 * Return true if the queue has shrunk.
 */
static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct rb_node *node, *prev;
        bool pruned = false;
        int goal;

        if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
                return false;

        /* Free in batches worth one eighth of the receive buffer. */
        goal = sk->sk_rcvbuf >> 3;
        node = &tp->ooo_last_skb->rbnode;

        /* Walk backwards from the last (highest-sequence) skb. */
        do {
                struct sk_buff *skb = rb_to_skb(node);

                /* If incoming skb would land last in ofo queue, stop pruning. */
                if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq))
                        break;
                pruned = true;
                /* Fetch the predecessor before the node is erased. */
                prev = rb_prev(node);
                rb_erase(node, &tp->out_of_order_queue);
                goal -= skb->truesize;
                tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
                tp->ooo_last_skb = rb_to_skb(prev);
                /* Batch finished (or queue emptied): stop if the socket is
                 * back within sk_rcvbuf and not under memory pressure,
                 * otherwise start another batch.
                 */
                if (!prev || goal <= 0) {
                        if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
                            !tcp_under_memory_pressure(sk))
                                break;
                        goal = sk->sk_rcvbuf >> 3;
                }
                node = prev;
        } while (node);

        if (pruned) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
                /* Reset SACK state.  A conforming SACK implementation will
                 * do the same at a timeout based retransmit.  When a connection
                 * is in a sad state like this, we care only about integrity
                 * of the connection not performance.
                 */
                if (tp->rx_opt.sack_ok)
                        tcp_sack_reset(&tp->rx_opt);
        }
        return pruned;
}

/* Reduce allocated memory if we can, trying to get
 * the socket within its memory limits again.
 *
 * Return less than zero if we should start dropping frames
 * until the socket owning process reads some of the data
 * to stabilize the situation.
 */
static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
{
        struct tcp_sock *tp = tcp_sk(sk);

        NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);

        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
                tcp_clamp_window(sk);
        else if (tcp_under_memory_pressure(sk))
                tcp_adjust_rcv_ssthresh(sk);

        if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
                return 0;

        tcp_collapse_ofo_queue(sk);
        if (!skb_queue_empty(&sk->sk_receive_queue))
                tcp_collapse(sk, &sk->sk_receive_queue, NULL,
                             skb_peek(&sk->sk_receive_queue),
                             NULL,
                             tp->copied_seq, tp->rcv_nxt);

        if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
                return 0;

        /* Collapsing did not help, destructive actions follow.
         * This must not ever occur. */

        tcp_prune_ofo_queue(sk, in_skb);

        if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
                return 0;

        /* If we are really being abused, tell the caller to silently
         * drop receive data on the floor.  It will get retransmitted
         * and hopefully then we'll have sufficient space.
         */
        NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);

        /* Massive buffer overcommit. */
        tp->pred_flags = 0;
        return -1;
}

/* Decide whether the send buffer of @sk may be grown.
 *
 * Returns false when the user locked the send buffer size, when the socket
 * is under memory pressure, when allocated protocol memory has reached the
 * first sk_prot_mem_limits() threshold, or when the congestion window is
 * already full. Under memory pressure sk_sndbuf may additionally be set to
 * the unused reserved memory.
 */
static bool tcp_should_expand_sndbuf(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        int reserved;

        /* A user-specified send buffer setting is never modified. */
        if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
                return false;

        if (!tcp_under_memory_pressure(sk)) {
                /* Expand only below the soft global memory limit and while
                 * the congestion window is not yet filled.
                 */
                return sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0) &&
                       tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp);
        }

        /* Under global TCP memory pressure: never expand. Adjust sndbuf
         * according to reserved mem, but make sure it never goes below
         * SOCK_MIN_SNDBUF. See sk_stream_moderate_sndbuf() for more details.
         */
        reserved = sk_unused_reserved_mem(sk);
        if (reserved > SOCK_MIN_SNDBUF)
                WRITE_ONCE(sk->sk_sndbuf, reserved);

        return false;
}

static void tcp_new_space(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (tcp_should_expand_sndbuf(sk)) {
                tcp_sndbuf_expand(sk);
                tp->snd_cwnd_stamp = tcp_jiffies32;
        }

        INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk);
}

/* Called after the caller freed up send space, either by
 * 1) freeing skbs in the rtx queues (tp->snd_una advanced), or
 * 2) sending skbs from the output queue (tp->snd_nxt advanced).
 *
 * EPOLLOUT may then be generated for the application if
 * 1) space consumed in the output/rtx queues is below sk->sk_sndbuf/2, or
 * 2) the notsent amount (tp->write_seq - tp->snd_nxt) became small enough
 *    that tcp_stream_memory_free() decides it is time to generate EPOLLOUT.
 *
 * Nothing happens unless the socket has a struct socket attached with
 * SOCK_NOSPACE set.
 */
void tcp_check_space(struct sock *sk)
{
        /* pairs with tcp_poll() */
        smp_mb();

        if (!sk->sk_socket)
                return;
        if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
                return;

        tcp_new_space(sk);

        /* SOCK_NOSPACE no longer set: stop the sndbuf-limited chrono. */
        if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
                tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}

/* Send-side follow-up: push pending frames with tcp_push_pending_frames(),
 * then let tcp_check_space() re-evaluate send-buffer space.
 */
static inline void tcp_data_snd_check(struct sock *sk)
{
        tcp_push_pending_frames(sk);
        tcp_check_space(sk);
}

/*
 * Check if sending an ack is needed.
 *
 * @sk:           socket to examine
 * @ofo_possible: when zero, the out-of-order queue is not consulted and a
 *                delayed ACK is scheduled if no immediate ACK is due
 *
 * Outcomes, in order of evaluation:
 *  - immediate ACK (or deferral to tcp_release_cb() when the socket is owned
 *    by user context and sysctl_tcp_backlog_ack_defer is set);
 *  - delayed ACK when the out-of-order queue is empty or not considered;
 *  - immediate ACK when SACK is off, the compressed-ACK limit is reached,
 *    or fewer than TCP_FASTRETRANS_THRESH dup ACKs were sent for this rcv_nxt;
 *  - otherwise the ACK is counted in tp->compressed_ack and the compressed
 *    ACK timer is armed if it is not already queued.
 */
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned long rtt, delay;

            /* More than one full frame received... */
        if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
             /* ... and right edge of window advances far enough.
              * (tcp_recvmsg() will send ACK otherwise).
              * If application uses SO_RCVLOWAT, we want send ack now if
              * we have not received enough bytes to satisfy the condition.
              */
            (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
             __tcp_select_window(sk) >= tp->rcv_wnd)) ||
            /* We ACK each frame or... */
            tcp_in_quickack_mode(sk) ||
            /* Protocol state mandates a one-time immediate ACK */
            inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
                /* If we are running from __release_sock() in user context,
                 * Defer the ack until tcp_release_cb().
                 */
                if (sock_owned_by_user_nocheck(sk) &&
                    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
                        set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
                        return;
                }
send_now:
                /* Also reached by goto from the compression logic below,
                 * which bypasses the deferral check above.
                 */
                tcp_send_ack(sk);
                return;
        }

        if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
                tcp_send_delayed_ack(sk);
                return;
        }

        /* Out-of-order data is queued: decide whether to compress ACKs. */
        if (!tcp_is_sack(tp) ||
            tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr))
                goto send_now;

        /* rcv_nxt moved since the last check: restart dup ACK counting. */
        if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
                tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
                tp->dup_ack_counter = 0;
        }
        /* The first TCP_FASTRETRANS_THRESH dup ACKs are sent uncompressed. */
        if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
                tp->dup_ack_counter++;
                goto send_now;
        }
        tp->compressed_ack++;
        if (hrtimer_is_queued(&tp->compressed_ack_timer))
                return;

        /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */

        /* Use the smaller of the receiver RTT estimate and a non-zero srtt. */
        rtt = tp->rcv_rtt_est.rtt_us;
        if (tp->srtt_us && tp->srtt_us < rtt)
                rtt = tp->srtt_us;

        /* NOTE(review): the ">> 3" presumably undoes an 8x fixed-point
         * scaling of the rtt fields; confirm against their definitions.
         */
        delay = min_t(unsigned long,
                      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
                      rtt * (NSEC_PER_USEC >> 3)/20);
        /* Reference taken on behalf of the armed timer; presumably released
         * by the timer handler or on cancel (not visible here).
         */
        sock_hold(sk);
        hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
                               READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns),
                               HRTIMER_MODE_REL_PINNED_SOFT);
}

/* Run the ACK decision logic, with out-of-order handling enabled, only when
 * an ACK is still scheduled. If none is scheduled, we sent a data segment
 * already and there is nothing to do.
 */
static inline void tcp_ack_snd_check(struct sock *sk)
{
        if (inet_csk_ack_scheduled(sk))
                __tcp_ack_snd_check(sk, 1);
}

/*
 *        This routine is only called when we have urgent data
 *        signaled. It's the 'slow' part of tcp_urg. It could be
 *        moved inline now as tcp_urg is only called from one
 *        place. We handle URGent data wrong. We have to - as
 *        BSD still doesn't use the correction from RFC961.
 *        For 1003.1g we should support a new option TCP_STDURG to permit
 *        either form (or just set the sysctl tcp_stdurg).
 *
 *        @sk: socket receiving the segment
 *        @th: TCP header of the segment carrying the urgent pointer
 *
 *        On acceptance of a new urgent pointer this signals SIGURG via
 *        sk_send_sigurg(), sets tp->urg_data to TCP_URG_NOTYET, records the
 *        absolute urgent sequence in tp->urg_seq and clears tp->pred_flags.
 */

static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 ptr = ntohs(th->urg_ptr);

        /* Unless sysctl_tcp_stdurg is set, a non-zero pointer is taken to
         * point one past the urgent byte, so step back by one; then turn
         * the offset into an absolute sequence number.
         */
        if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg))
                ptr--;
        ptr += ntohl(th->seq);

        /* Ignore urgent data that we've already seen and read. */
        if (after(tp->copied_seq, ptr))
                return;

        /* Do not replay urg ptr.
         *
         * NOTE: interesting situation not covered by specs.
         * Misbehaving sender may send urg ptr, pointing to segment,
         * which we already have in ofo queue. We are not able to fetch
         * such data and will stay in TCP_URG_NOTYET until will be eaten
         * by recvmsg(). Seems, we are not obliged to handle such wicked
         * situations. But it is worth to think about possibility of some
         * DoSes using some hypothetical application level deadlock.
         */
        if (before(ptr, tp->rcv_nxt))
                return;

        /* Do we already have a newer (or duplicate) urgent pointer? */
        if (tp->urg_data && !after(ptr, tp->urg_seq))
                return;

        /* Tell the world about our new urgent pointer. */
        sk_send_sigurg(sk);

        /* We may be adding urgent data when the last byte read was
         * urgent. To do this requires some care. We cannot just ignore
         * tp->copied_seq since we would read the last urgent byte again
         * as data, nor can we alter copied_seq until this data arrives
         * or we break the semantics of SIOCATMARK (and thus sockatmark())
         *
         * NOTE. Double Dutch. Rendering to plain English: author of comment
         * above did something sort of         send("A", MSG_OOB); send("B", MSG_OOB);
         * and expect that both A and B disappear from stream. This is _wrong_.
         * Though this happens in BSD with high probability, this is occasional.
         * Any application relying on this is buggy. Note also, that fix "works"
         * only in this artificial test. Insert some normal data between A and B and we will
         * decline of BSD again. Verdict: it is better to remove to trap
         * buggy users.
         */
        if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
            !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
                struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
                /* Step over the previous, still unread urgent byte. */
                tp->copied_seq++;
                /* Head skb fully consumed by that step: release it. */
                if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
                        __skb_unlink(skb, &sk->sk_receive_queue);
                        __kfree_skb(skb);
                }
        }

        WRITE_ONCE(tp->urg_data, TCP_URG_NOTYET);
        WRITE_ONCE(tp->urg_seq, ptr);

        /* Disable header prediction. */
        tp->pred_flags = 0;
}

/* Fast part of urgent data handling.
 *
 * Process a newly signalled urgent pointer, if any, and when urgent data is
 * still awaited (TCP_URG_NOTYET) check whether the awaited byte lies inside
 * @skb. If it does, latch the byte into tp->urg_data together with
 * TCP_URG_VALID and notify the socket unless it is dead.
 */
static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 urg_off;
        u8 oob;

        /* A new urgent pointer is normally not present. */
        if (unlikely(th->urg))
                tcp_check_urg(sk, th);

        /* Normally we are not waiting for any urgent data. */
        if (likely(tp->urg_data != TCP_URG_NOTYET))
                return;

        /* Offset of the urgent byte from the start of the skb data, which
         * at this point still includes the TCP header (hence the doff term).
         */
        urg_off = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) - th->syn;

        /* The urgent pointer does not point into this packet. */
        if (urg_off >= skb->len)
                return;

        if (skb_copy_bits(skb, urg_off, &oob, 1))
                BUG();
        WRITE_ONCE(tp->urg_data, TCP_URG_VALID | oob);
        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk);
}

/* Accept RST for rcv_nxt - 1 after a FIN.
 * When tcp connections are abruptly terminated from Mac OSX (via ^C), a
 * FIN is sent followed by a RST packet. The RST is sent with the same
 * sequence number as the FIN, and thus according to RFC 5961 a challenge
 * ACK should be sent. However, Mac OSX rate limits replies to challenge
 * ACKs on the closed socket. In addition middleboxes can drop either the
 * challenge ACK or a subsequent RST.
 */
static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
                        (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
                                               TCPF_CLOSING));
}

/* Does PAWS and seqno based validation of an incoming segment, flags will
 * play significant role here.
 *
 * @sk:        receiving socket
 * @skb:       incoming segment
 * @th:        TCP header of @skb
 * @syn_inerr: when non-zero, an unexpected SYN also bumps TCP_MIB_INERRS
 *
 * Returns true when the segment passed all checks and the caller may keep
 * processing @skb. Returns false when @skb has been consumed here: either
 * dropped through tcp_drop_reason(), or freed after tcp_reset() was invoked
 * for an acceptable RST.
 */
static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
                                  const struct tcphdr *th, int syn_inerr)
{
        struct tcp_sock *tp = tcp_sk(sk);
        SKB_DR(reason);

        /* RFC1323: H1. Apply PAWS check first. */
        if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
            tp->rx_opt.saw_tstamp &&
            tcp_paws_discard(sk, skb)) {
                if (!th->rst) {
                        if (unlikely(th->syn))
                                goto syn_challenge;
                        NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
                        /* Answer with a rate-limited duplicate ACK. */
                        if (!tcp_oow_rate_limited(sock_net(sk), skb,
                                                  LINUX_MIB_TCPACKSKIPPEDPAWS,
                                                  &tp->last_oow_ack_time))
                                tcp_send_dupack(sk, skb);
                        SKB_DR_SET(reason, TCP_RFC7323_PAWS);
                        goto discard;
                }
                /* Reset is accepted even if it did not pass PAWS. */
        }

        /* Step 1: check sequence number */
        reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
        if (reason) {
                /* RFC793, page 37: "In all states except SYN-SENT, all reset
                 * (RST) segments are validated by checking their SEQ-fields."
                 * And page 69: "If an incoming segment is not acceptable,
                 * an acknowledgment should be sent in reply (unless the RST
                 * bit is set, if so drop the segment and return)".
                 */
                if (!th->rst) {
                        if (th->syn)
                                goto syn_challenge;
                        if (!tcp_oow_rate_limited(sock_net(sk), skb,
                                                  LINUX_MIB_TCPACKSKIPPEDSEQ,
                                                  &tp->last_oow_ack_time))
                                tcp_send_dupack(sk, skb);
                } else if (tcp_reset_check(sk, skb)) {
                        /* RST at rcv_nxt - 1 after a FIN is still honoured. */
                        goto reset;
                }
                /* Dropped with the reason reported by tcp_sequence(). */
                goto discard;
        }

        /* Step 2: check RST bit */
        if (th->rst) {
                /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a
                 * FIN and SACK too if available):
                 * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
                 * the right-most SACK block,
                 * then
                 *     RESET the connection
                 * else
                 *     Send a challenge ACK
                 */
                if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
                    tcp_reset_check(sk, skb))
                        goto reset;

                if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
                        struct tcp_sack_block *sp = &tp->selective_acks[0];
                        int max_sack = sp[0].end_seq;
                        int this_sack;

                        /* Find the highest end_seq among the SACK blocks. */
                        for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;
                             ++this_sack) {
                                max_sack = after(sp[this_sack].end_seq,
                                                 max_sack) ?
                                        sp[this_sack].end_seq : max_sack;
                        }

                        if (TCP_SKB_CB(skb)->seq == max_sack)
                                goto reset;
                }

                /* Disable TFO if RST is out-of-order
                 * and no data has been received
                 * for current active TFO socket
                 */
                if (tp->syn_fastopen && !tp->data_segs_in &&
                    sk->sk_state == TCP_ESTABLISHED)
                        tcp_fastopen_active_disable(sk);
                tcp_send_challenge_ack(sk);
                SKB_DR_SET(reason, TCP_RESET);
                goto discard;
        }

        /* step 3: check security and precedence [ignored] */

        /* step 4: Check for a SYN
         * RFC 5961 4.2 : Send a challenge ack
         */
        if (th->syn) {
syn_challenge:
                /* Also reached by goto from the PAWS and sequence checks. */
                if (syn_inerr)
                        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
                tcp_send_challenge_ack(sk);
                SKB_DR_SET(reason, TCP_INVALID_SYN);
                goto discard;
        }

        bpf_skops_parse_hdr(sk, skb);

        return true;

discard:
        tcp_drop_reason(sk, skb, reason);
        return false;

reset:
        tcp_reset(sk, skb);
        __kfree_skb(skb);
        return false;
}

/*
 *        TCP receive function for the ESTABLISHED state.
 *
 *        @sk:  socket the segment was received for
 *        @skb: received segment; the TCP header is read from skb->data.
 *              On every path through this function the skb is freed,
 *              dropped or handed to a queueing helper; the one early return
 *              (tcp_validate_incoming() returning false) leaves the skb to
 *              that helper.
 *
 *        It is split into a fast path and a slow path. The fast path is
 *        disabled when:
 *        - A zero window was announced from us - zero window probing
 *          is only handled properly in the slow path.
 *        - Out of order segments arrived.
 *        - Urgent data is expected.
 *        - There is no buffer space left
 *        - Unexpected TCP flags/window values/header lengths are received
 *          (detected by checking the TCP header against pred_flags)
 *        - Data is sent in both directions. Fast path only supports pure senders
 *          or pure receivers (this means either the sequence number or the ack
 *          value must stay constant)
 *        - Unexpected TCP option.
 *
 *        When these conditions are not satisfied it drops into a standard
 *        receive procedure patterned after RFC793 to handle all cases.
 *        The first three cases are guaranteed by proper pred_flags setting,
 *        the rest is checked inline. Fast processing is turned on in
 *        tcp_data_queue when everything is OK.
 */
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
{
        enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
        const struct tcphdr *th = (const struct tcphdr *)skb->data;
        struct tcp_sock *tp = tcp_sk(sk);
        /* Segment length including the TCP header; compared against the
         * header length in both the fast and the slow path.
         */
        unsigned int len = skb->len;

        /* TCP congestion window tracking */
        trace_tcp_probe(sk, skb);

        tcp_mstamp_refresh(tp);
        /* No rx dst cached on the socket yet: let the address-family ops
         * set one up from this skb.
         */
        if (unlikely(!rcu_access_pointer(sk->sk_rx_dst)))
                inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
        /*
         *        Header prediction.
         *        The code loosely follows the one in the famous
         *        "30 instruction TCP receive" Van Jacobson mail.
         *
         *        Van's trick is to deposit buffers into socket queue
         *        on a device interrupt, to call tcp_recv function
         *        on the receive process context and checksum and copy
         *        the buffer to user space. smart...
         *
         *        Our current scheme is not silly either but we take the
         *        extra cost of the net_bh soft interrupt processing...
         *        We do checksum and copy also but from device to kernel.
         */

        tp->rx_opt.saw_tstamp = 0;

        /*        pred_flags is 0xS?10 << 16 + snd_wnd
         *        if header_prediction is to be made
         *        'S' will always be tp->tcp_header_len >> 2
         *        '?' will be 0 for the fast path, otherwise pred_flags is 0 to
         *        turn it off (when there are holes in the receive
         *        space for instance)
         *        PSH flag is ignored.
         */

        /* Fast path entry: the header word matches the prediction, the
         * segment starts exactly at rcv_nxt and it does not acknowledge
         * anything beyond snd_nxt.
         */
        if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
            TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
            !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
                int tcp_header_len = tp->tcp_header_len;

                /* Timestamp header prediction: tcp_header_len
                 * is automatically equal to th->doff*4 due to pred_flags
                 * match.
                 */

                /* Check timestamp */
                if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
                        /* No? Slow path! */
                        if (!tcp_parse_aligned_timestamp(tp, th))
                                goto slow_path;

                        /* If PAWS failed, check it more carefully in slow path */
                        if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
                                goto slow_path;

                        /* DO NOT update ts_recent here, if checksum fails
                         * and timestamp was corrupted part, it will result
                         * in a hung connection since we will drop all
                         * future packets due to the PAWS test.
                         */
                }

                if (len <= tcp_header_len) {
                        /* Bulk data transfer: sender */
                        if (len == tcp_header_len) {
                                /* Predicted packet is in window by definition.
                                 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                                 * Hence, check seq<=rcv_wup reduces to:
                                 */
                                if (tcp_header_len ==
                                    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
                                    tp->rcv_nxt == tp->rcv_wup)
                                        tcp_store_ts_recent(tp);

                                /* We know that such packets are checksummed
                                 * on entry.
                                 */
                                tcp_ack(sk, skb, 0);
                                __kfree_skb(skb);
                                tcp_data_snd_check(sk);
                                /* When receiving pure ack in fast path, update
                                 * last ts ecr directly instead of calling
                                 * tcp_rcv_rtt_measure_ts()
                                 */
                                tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
                                return;
                        } else { /* Header too small */
                                reason = SKB_DROP_REASON_PKT_TOO_SMALL;
                                TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
                                goto discard;
                        }
                } else {
                        int eaten = 0;
                        bool fragstolen = false;

                        if (tcp_checksum_complete(skb))
                                goto csum_error;

                        /* skb->truesize exceeds sk_forward_alloc: leave the
                         * fast path. The jump lands at step5, i.e. after the
                         * slow path's length/checksum/flag checks and
                         * tcp_validate_incoming(); the checksum has already
                         * been verified just above.
                         */
                        if ((int)skb->truesize > sk->sk_forward_alloc)
                                goto step5;

                        /* Predicted packet is in window by definition.
                         * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                         * Hence, check seq<=rcv_wup reduces to:
                         */
                        if (tcp_header_len ==
                            (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
                            tp->rcv_nxt == tp->rcv_wup)
                                tcp_store_ts_recent(tp);

                        tcp_rcv_rtt_measure_ts(sk, skb);

                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);

                        /* Bulk data transfer: receiver */
                        skb_dst_drop(skb);
                        __skb_pull(skb, tcp_header_len);
                        eaten = tcp_queue_rcv(sk, skb, &fragstolen);

                        tcp_event_data_recv(sk, skb);

                        if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
                                /* Well, only one small jumplet in fast path... */
                                tcp_ack(sk, skb, FLAG_DATA);
                                tcp_data_snd_check(sk);
                                if (!inet_csk_ack_scheduled(sk))
                                        goto no_ack;
                        } else {
                                tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
                        }

                        __tcp_ack_snd_check(sk, 0);
no_ack:
                        /* eaten is tcp_queue_rcv()'s result: when non-zero
                         * the skb itself is released here (presumably its
                         * payload was merged into an already queued skb -
                         * TODO confirm in tcp_queue_rcv()).
                         */
                        if (eaten)
                                kfree_skb_partial(skb, fragstolen);
                        tcp_data_ready(sk);
                        return;
                }
        }

slow_path:
        /* A segment shorter than its own data offset is accounted the same
         * way as a checksum failure.
         */
        if (len < (th->doff << 2) || tcp_checksum_complete(skb))
                goto csum_error;

        /* Segments carrying none of ACK, RST or SYN are dropped. */
        if (!th->ack && !th->rst && !th->syn) {
                reason = SKB_DROP_REASON_TCP_FLAGS;
                goto discard;
        }

        /*
         *        Standard slow path.
         */

        /* On failure return without touching the skb again; it is left to
         * tcp_validate_incoming().
         */
        if (!tcp_validate_incoming(sk, skb, th, 1))
                return;

step5:
        /* A negative tcp_ack() result is a negated drop reason: flip the
         * sign and drop the segment.
         */
        reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT);
        if ((int)reason < 0) {
                reason = -reason;
                goto discard;
        }
        tcp_rcv_rtt_measure_ts(sk, skb);

        /* Process urgent data. */
        tcp_urg(sk, skb, th);

        /* step 7: process the segment text */
        tcp_data_queue(sk, skb);

        tcp_data_snd_check(sk);
        tcp_ack_snd_check(sk);
        return;

csum_error:
        reason = SKB_DROP_REASON_TCP_CSUM;
        trace_tcp_bad_csum(skb);
        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
        /* Falls through to discard. */

discard:
        tcp_drop_reason(sk, skb, reason);
}
EXPORT_SYMBOL(tcp_rcv_established);

void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);

        tcp_mtup_init(sk);
        icsk->icsk_af_ops->rebuild_header(sk);
        tcp_init_metrics(sk);

        /* Initialize the congestion window to start the transfer.
         * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
         * retransmitted. In light of RFC6298 more aggressive 1sec
         * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
         * retransmission has occurred.
         */
        if (tp->total_retrans > 1 && tp->undo_marker)
                tcp_snd_cwnd_set(tp, 1);
        else
                tcp_snd_cwnd_set(tp, tcp_init_cwnd(tp, __sk_dst_get(sk)));
        tp->snd_cwnd_stamp = tcp_jiffies32;

        bpf_skops_established(sk, bpf_op, skb);
        /* Initialize congestion control unless BPF initialized it already: */
        if (!icsk->icsk_ca_initialized)
                tcp_init_congestion_control(sk);
        tcp_init_buffer_space(sk);
}

void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);

        tcp_ao_finish_connect(sk, skb);
        tcp_set_state(sk, TCP_ESTABLISHED);
        icsk->icsk_ack.lrcvtime = tcp_jiffies32;

        if (skb) {
                icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
                security_inet_conn_established(sk, skb);
                sk_mark_napi_id(sk, skb);
        }

        tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);

        /* Prevent spurious tcp_cwnd_restart() on first data
         * packet.
         */
        tp->lsndtime = tcp_jiffies32;

        if (sock_flag(sk, SOCK_KEEPOPEN))
                inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));

        if (!tp->rx_opt.snd_wscale)
                __tcp_fast_path_on(tp, tp->snd_wnd);
        else
                tp->pred_flags = 0;
}

static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
                                    struct tcp_fastopen_cookie *cookie)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
        u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
        bool syn_drop = false;

        if (mss == tp->rx_opt.user_mss) {
                struct tcp_options_received opt;

                /* Get original SYNACK MSS value if user MSS sets mss_clamp */
                tcp_clear_options(&opt);
                opt.user_mss = opt.mss_clamp = 0;
                tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
                mss = opt.mss_clamp;
        }

        if (!tp->syn_fastopen) {
                /* Ignore an unsolicited cookie */
                cookie->len = -1;
        } else if (tp->total_retrans) {
                /* SYN timed out and the SYN-ACK neither has a cookie nor
                 * acknowledges data. Presumably the remote received only
                 * the retransmitted (regular) SYNs: either the original
                 * SYN-data or the corresponding SYN-ACK was dropped.
                 */
                syn_drop = (cookie->len < 0 && data);
        } else if (cookie->len < 0 && !tp->syn_data) {
                /* We requested a cookie but didn't get it. If we did not use
                 * the (old) exp opt format then try so next time (try_exp=1).
                 * Otherwise we go back to use the RFC7413 opt (try_exp=2).
                 */
                try_exp = tp->syn_fastopen_exp ? 2 : 1;
        }

        tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);

        if (data) { /* Retransmit unacked data in SYN */
                if (tp->total_retrans)
                        tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
                else
                        tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
                skb_rbtree_walk_from(data)
                         tcp_mark_skb_lost(sk, data);
                tcp_xmit_retransmit_queue(sk);
                tp->retrans_stamp = 0;
                NET_INC_STATS(sock_net(sk),
                                LINUX_MIB_TCPFASTOPENACTIVEFAIL);
                return true;
        }
        tp->syn_data_acked = tp->syn_data;
        if (tp->syn_data_acked) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
                /* SYN-data is counted as two separate packets in tcp_ack() */
                if (tp->delivered > 1)
                        --tp->delivered;
        }

        tcp_fastopen_add_skb(sk, synack);

        return false;
}

/* With CONFIG_SMC enabled and the tcp_have_smc static key on, clear
 * tp->syn_smc when the received options did not set smc_ok. Compiles to an
 * empty function when CONFIG_SMC is disabled.
 */
static void smc_check_reset_syn(struct tcp_sock *tp)
{
#if IS_ENABLED(CONFIG_SMC)
        if (!static_branch_unlikely(&tcp_have_smc))
                return;

        if (tp->syn_smc && !tp->rx_opt.smc_ok)
                tp->syn_smc = 0;
#endif
}

/* Clear undo_marker when a SYN/SYNACK timeout turns out to be spurious.
 *
 * undo_marker is set when a SYN or SYNACK times out. The timeout counts as
 * spurious when the timestamp echoed by the ACK equals the timestamp of the
 * original SYN (kept in retrans_stamp).
 */
static void tcp_try_undo_spurious_syn(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 orig_syn_ts = tp->retrans_stamp;

        /* Nothing to undo, or no original timestamp to compare against. */
        if (!tp->undo_marker || !orig_syn_ts)
                return;

        /* No timestamp option seen on this segment: cannot decide. */
        if (!tp->rx_opt.saw_tstamp)
                return;

        if (orig_syn_ts == tp->rx_opt.rcv_tsecr)
                tp->undo_marker = 0;
}

/*
 *        Process a segment received while the socket is in SYN-SENT.
 *
 *        Return value, as consumed by tcp_rcv_state_process():
 *          0      - the skb has been freed or dropped here.
 *          -1     - the skb has NOT been freed; the caller carries on with
 *                   it (urgent data, free, tcp_data_snd_check()).
 *          other  - the drop reason set before jumping to reset_and_undo;
 *                   the skb has not been freed and the reason is handed
 *                   back to the caller.
 */
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                                         const struct tcphdr *th)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        /* A negative len is treated as "no cookie received" by
         * tcp_rcv_fastopen_synack().
         */
        struct tcp_fastopen_cookie foc = { .len = -1 };
        /* Restored on the *_and_undo exits, after the parsed options have
         * been cleared.
         */
        int saved_clamp = tp->rx_opt.mss_clamp;
        bool fastopen_fail;
        SKB_DR(reason);

        tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
        /* Take tp->tsoffset back out of a non-zero echoed timestamp. */
        if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
                tp->rx_opt.rcv_tsecr -= tp->tsoffset;

        if (th->ack) {
                /* rfc793:
                 * "If the state is SYN-SENT then
                 *    first check the ACK bit
                 *      If the ACK bit is set
                 *          If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
                 *        a reset (unless the RST bit is set, if so drop
                 *        the segment and return)"
                 */
                if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
                    after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
                        /* Previous FIN/ACK or RST/ACK might be ignored. */
                        if (icsk->icsk_retransmits == 0)
                                inet_csk_reset_xmit_timer(sk,
                                                ICSK_TIME_RETRANS,
                                                TCP_TIMEOUT_MIN, TCP_RTO_MAX);
                        SKB_DR_SET(reason, TCP_INVALID_ACK_SEQUENCE);
                        goto reset_and_undo;
                }

                /* The echoed timestamp must lie between retrans_stamp and
                 * the current timestamp clock, otherwise reject.
                 */
                if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
                    !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
                             tcp_time_stamp_ts(tp))) {
                        NET_INC_STATS(sock_net(sk),
                                        LINUX_MIB_PAWSACTIVEREJECTED);
                        SKB_DR_SET(reason, TCP_RFC7323_PAWS);
                        goto reset_and_undo;
                }

                /* Now ACK is acceptable.
                 *
                 * "If the RST bit is set
                 *    If the ACK was acceptable then signal the user "error:
                 *    connection reset", drop the segment, enter CLOSED state,
                 *    delete TCB, and return."
                 */

                if (th->rst) {
                        tcp_reset(sk, skb);
                        /* Shared exit: also reached by goto from the
                         * delayed-ACK branch and the crossed-SYN branch below.
                         */
consume:
                        __kfree_skb(skb);
                        return 0;
                }

                /* rfc793:
                 *   "fifth, if neither of the SYN or RST bits is set then
                 *    drop the segment and return."
                 *
                 *    See note below!
                 *                                        --ANK(990513)
                 */
                if (!th->syn) {
                        SKB_DR_SET(reason, TCP_FLAGS);
                        goto discard_and_undo;
                }
                /* rfc793:
                 *   "If the SYN bit is on ...
                 *    are acceptable then ...
                 *    (our SYN has been ACKed), change the connection
                 *    state to ESTABLISHED..."
                 */

                tcp_ecn_rcv_synack(tp, th);

                tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
                tcp_try_undo_spurious_syn(sk);
                tcp_ack(sk, skb, FLAG_SLOWPATH);

                /* Ok.. it's good. Set up sequence numbers and
                 * move to established.
                 */
                WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
                tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

                /* RFC1323: The window in SYN & SYN/ACK segments is
                 * never scaled.
                 */
                tp->snd_wnd = ntohs(th->window);

                /* Window scaling not negotiated: zero both scale factors
                 * and cap window_clamp at 65535.
                 */
                if (!tp->rx_opt.wscale_ok) {
                        tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
                        WRITE_ONCE(tp->window_clamp,
                                   min(tp->window_clamp, 65535U));
                }

                /* With timestamps in use every header carries the aligned
                 * timestamp option, so advmss shrinks by the same amount.
                 */
                if (tp->rx_opt.saw_tstamp) {
                        tp->rx_opt.tstamp_ok           = 1;
                        tp->tcp_header_len =
                                sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
                        tp->advmss            -= TCPOLEN_TSTAMP_ALIGNED;
                        tcp_store_ts_recent(tp);
                } else {
                        tp->tcp_header_len = sizeof(struct tcphdr);
                }

                tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
                tcp_initialize_rcv_mss(sk);

                /* Remember, tcp_poll() does not lock socket!
                 * Change state from SYN-SENT only after copied_seq
                 * is initialized. */
                WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);

                smc_check_reset_syn(tp);

                smp_mb();

                tcp_finish_connect(sk, skb);

                /* Only consult the Fast Open handler when Fast Open or SYN
                 * data was actually used on this connection.
                 */
                fastopen_fail = (tp->syn_fastopen || tp->syn_data) &&
                                tcp_rcv_fastopen_synack(sk, skb, &foc);

                if (!sock_flag(sk, SOCK_DEAD)) {
                        sk->sk_state_change(sk);
                        sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
                }
                /* -1: the skb is not freed here, the caller continues. */
                if (fastopen_fail)
                        return -1;
                if (sk->sk_write_pending ||
                    READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) ||
                    inet_csk_in_pingpong_mode(sk)) {
                        /* Save one ACK. Data will be ready after
                         * several ticks, if write_pending is set.
                         *
                         * It may be deleted, but with this feature tcpdumps
                         * look so _wonderfully_ clever, that I was not able
                         * to stand against the temptation 8)     --ANK
                         */
                        inet_csk_schedule_ack(sk);
                        tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
                                                  TCP_DELACK_MAX, TCP_RTO_MAX);
                        goto consume;
                }
                /* ACK immediately; -1 leaves the skb to the caller. */
                tcp_send_ack(sk);
                return -1;
        }

        /* No ACK in the segment */

        if (th->rst) {
                /* rfc793:
                 * "If the RST bit is set
                 *
                 *      Otherwise (no ACK) drop the segment and return."
                 */
                SKB_DR_SET(reason, TCP_RESET);
                goto discard_and_undo;
        }

        /* PAWS check. */
        if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
            tcp_paws_reject(&tp->rx_opt, 0)) {
                SKB_DR_SET(reason, TCP_RFC7323_PAWS);
                goto discard_and_undo;
        }
        if (th->syn) {
                /* We see SYN without ACK. It is attempt of
                 * simultaneous connect with crossed SYNs.
                 * Particularly, it can be connect to self.
                 */
#ifdef CONFIG_TCP_AO
                struct tcp_ao_info *ao;

                ao = rcu_dereference_protected(tp->ao_info,
                                               lockdep_sock_is_held(sk));
                if (ao) {
                        WRITE_ONCE(ao->risn, th->seq);
                        ao->rcv_sne = 0;
                }
#endif
                tcp_set_state(sk, TCP_SYN_RECV);

                if (tp->rx_opt.saw_tstamp) {
                        tp->rx_opt.tstamp_ok = 1;
                        tcp_store_ts_recent(tp);
                        tp->tcp_header_len =
                                sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
                } else {
                        tp->tcp_header_len = sizeof(struct tcphdr);
                }

                WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
                WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
                tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;

                /* RFC1323: The window in SYN & SYN/ACK segments is
                 * never scaled.
                 */
                tp->snd_wnd    = ntohs(th->window);
                tp->snd_wl1    = TCP_SKB_CB(skb)->seq;
                tp->max_window = tp->snd_wnd;

                tcp_ecn_rcv_syn(tp, th);

                tcp_mtup_init(sk);
                tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
                tcp_initialize_rcv_mss(sk);

                tcp_send_synack(sk);
#if 0
                /* Note, we could accept data and URG from this segment.
                 * There are no obstacles to make this (except that we must
                 * either change tcp_recvmsg() to prevent it from returning data
                 * before 3WHS completes per RFC793, or employ TCP Fast Open).
                 *
                 * However, if we ignore data in ACKless segments sometimes,
                 * we have no reasons to accept it sometimes.
                 * Also, seems the code doing it in step6 of tcp_rcv_state_process
                 * is not flawless. So, discard packet for sanity.
                 * Uncomment this return to process the data.
                 */
                return -1;
#else
                goto consume;
#endif
        }
        /* "fifth, if neither of the SYN or RST bits is set then
         * drop the segment and return."
         */

        /* Undo the option parsing done at the top (clear rx_opt, restore
         * the saved mss_clamp) and drop the skb with the recorded reason.
         */
discard_and_undo:
        tcp_clear_options(&tp->rx_opt);
        tp->rx_opt.mss_clamp = saved_clamp;
        tcp_drop_reason(sk, skb, reason);
        return 0;

        /* Same undo as above, but the skb is NOT dropped here. */
reset_and_undo:
        tcp_clear_options(&tp->rx_opt);
        tp->rx_opt.mss_clamp = saved_clamp;
        /* we can reuse/return @reason to its caller to handle the exception */
        return reason;
}

static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct request_sock *req;

        /* If we are still handling the SYNACK RTO, see if timestamp ECR allows
         * undo. If peer SACKs triggered fast recovery, we can't undo here.
         */
        if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss && !tp->packets_out)
                tcp_try_undo_recovery(sk);

        /* Reset rtx states to prevent spurious retransmits_timed_out() */
        tcp_update_rto_time(tp);
        tp->retrans_stamp = 0;
        inet_csk(sk)->icsk_retransmits = 0;

        /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
         * we no longer need req so release it.
         */
        req = rcu_dereference_protected(tp->fastopen_rsk,
                                        lockdep_sock_is_held(sk));
        reqsk_fastopen_remove(sk, req, false);

        /* Re-arm the timer because data may have been sent out.
         * This is similar to the regular data transmission case
         * when new data has just been ack'ed.
         *
         * (TFO) - we could try to be more aggressive and
         * retransmitting any data sooner based on when they
         * are sent out.
         */
        tcp_rearm_rto(sk);
}

/*
 *        This function implements the receiving procedure of RFC 793 for
 *        all states except ESTABLISHED and TIME_WAIT.
 *        It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
 *        address independent.
 *
 *        Returns 0 when the skb has been queued, consumed or dropped here.
 *        A non-zero drop reason is returned without freeing the skb on
 *        these paths: ACK received in LISTEN, a non-zero result from
 *        tcp_rcv_synsent_state_process(), an unacceptable ACK in SYN_RECV,
 *        and the abort-on-data cases.
 *        NOTE(review): the caller is presumably expected to dispose of the
 *        skb (and possibly send a reset) in that case - confirm at the
 *        call sites.
 */

enum skb_drop_reason
tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        const struct tcphdr *th = tcp_hdr(skb);
        struct request_sock *req;
        /* Holds the SYN-SENT helper's result first; later set to 1 once
         * tcp_data_queue() has been given the skb, which suppresses the
         * final drop.
         */
        int queued = 0;
        SKB_DR(reason);

        switch (sk->sk_state) {
        case TCP_CLOSE:
                SKB_DR_SET(reason, TCP_CLOSE);
                goto discard;

        case TCP_LISTEN:
                /* Returned directly: the skb is not dropped here. */
                if (th->ack)
                        return SKB_DROP_REASON_TCP_FLAGS;

                if (th->rst) {
                        SKB_DR_SET(reason, TCP_RESET);
                        goto discard;
                }
                if (th->syn) {
                        if (th->fin) {
                                SKB_DR_SET(reason, TCP_FLAGS);
                                goto discard;
                        }
                        /* It is possible that we process SYN packets from backlog,
                         * so we need to make sure to disable BH and RCU right there.
                         */
                        rcu_read_lock();
                        local_bh_disable();
                        icsk->icsk_af_ops->conn_request(sk, skb);
                        local_bh_enable();
                        rcu_read_unlock();

                        consume_skb(skb);
                        return 0;
                }
                SKB_DR_SET(reason, TCP_FLAGS);
                goto discard;

        case TCP_SYN_SENT:
                tp->rx_opt.saw_tstamp = 0;
                tcp_mstamp_refresh(tp);
                queued = tcp_rcv_synsent_state_process(sk, skb, th);
                /* Non-negative: either fully handled (0) or a drop reason
                 * to pass on. Negative: finish the processing below.
                 */
                if (queued >= 0)
                        return queued;

                /* Do step6 onward by hand. */
                tcp_urg(sk, skb, th);
                __kfree_skb(skb);
                tcp_data_snd_check(sk);
                return 0;
        }

        tcp_mstamp_refresh(tp);
        tp->rx_opt.saw_tstamp = 0;
        req = rcu_dereference_protected(tp->fastopen_rsk,
                                        lockdep_sock_is_held(sk));
        /* A fastopen request sock is still attached; this is only expected
         * in SYN_RECV or FIN_WAIT1 (see the warning below).
         */
        if (req) {
                bool req_stolen;

                WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
                    sk->sk_state != TCP_FIN_WAIT1);

                if (!tcp_check_req(sk, skb, req, true, &req_stolen)) {
                        SKB_DR_SET(reason, TCP_FASTOPEN);
                        goto discard;
                }
        }

        /* Segments carrying none of ACK, RST or SYN are dropped. */
        if (!th->ack && !th->rst && !th->syn) {
                SKB_DR_SET(reason, TCP_FLAGS);
                goto discard;
        }
        /* On failure the skb is left to tcp_validate_incoming(). */
        if (!tcp_validate_incoming(sk, skb, th, 0))
                return 0;

        /* step 5: check the ACK field */
        reason = tcp_ack(sk, skb, FLAG_SLOWPATH |
                                  FLAG_UPDATE_TS_RECENT |
                                  FLAG_NO_CHALLENGE_ACK);

        /* Zero or negative tcp_ack() result: a negative value is a negated
         * drop reason, zero is reported as an old ACK in SYN_RECV.
         */
        if ((int)reason <= 0) {
                if (sk->sk_state == TCP_SYN_RECV) {
                        /* send one RST */
                        if (!reason)
                                return SKB_DROP_REASON_TCP_OLD_ACK;
                        return -reason;
                }
                /* accept old ack during closing */
                if ((int)reason < 0) {
                        tcp_send_challenge_ack(sk);
                        reason = -reason;
                        goto discard;
                }
        }
        /* From here on reason no longer carries tcp_ack()'s result. */
        SKB_DR_SET(reason, NOT_SPECIFIED);
        switch (sk->sk_state) {
        case TCP_SYN_RECV:
                tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
                if (!tp->srtt_us)
                        tcp_synack_rtt_meas(sk, req);

                if (req) {
                        tcp_rcv_synrecv_state_fastopen(sk);
                } else {
                        tcp_try_undo_spurious_syn(sk);
                        tp->retrans_stamp = 0;
                        tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
                                          skb);
                        WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
                }
                tcp_ao_established(sk);
                smp_mb();
                tcp_set_state(sk, TCP_ESTABLISHED);
                sk->sk_state_change(sk);

                /* Note, that this wakeup is only for marginal crossed SYN case.
                 * Passively open sockets are not waked up, because
                 * sk->sk_sleep == NULL and sk->sk_socket == NULL.
                 */
                if (sk->sk_socket)
                        sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);

                tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
                /* Unlike in SYN/SYN-ACK segments, the window of this
                 * segment is scaled by snd_wscale.
                 */
                tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
                tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);

                if (tp->rx_opt.tstamp_ok)
                        tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

                if (!inet_csk(sk)->icsk_ca_ops->cong_control)
                        tcp_update_pacing_rate(sk);

                /* Prevent spurious tcp_cwnd_restart() on first data packet */
                tp->lsndtime = tcp_jiffies32;

                tcp_initialize_rcv_mss(sk);
                tcp_fast_path_on(tp);
                if (sk->sk_shutdown & SEND_SHUTDOWN)
                        tcp_shutdown(sk, SEND_SHUTDOWN);
                break;

        case TCP_FIN_WAIT1: {
                int tmo;

                if (req)
                        tcp_rcv_synrecv_state_fastopen(sk);

                /* Not everything written has been acknowledged yet: stay
                 * in FIN_WAIT1.
                 */
                if (tp->snd_una != tp->write_seq)
                        break;

                tcp_set_state(sk, TCP_FIN_WAIT2);
                WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | SEND_SHUTDOWN);

                sk_dst_confirm(sk);

                if (!sock_flag(sk, SOCK_DEAD)) {
                        /* Wake up lingering close() */
                        sk->sk_state_change(sk);
                        break;
                }

                /* The socket is SOCK_DEAD from here on. */
                if (READ_ONCE(tp->linger2) < 0) {
                        tcp_done(sk);
                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
                        return SKB_DROP_REASON_TCP_ABORT_ON_DATA;
                }
                if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
                    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
                        /* Receive out of order FIN after close() */
                        if (tp->syn_fastopen && th->fin)
                                tcp_fastopen_active_disable(sk);
                        tcp_done(sk);
                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
                        return SKB_DROP_REASON_TCP_ABORT_ON_DATA;
                }

                /* A FIN timeout longer than TCP_TIMEWAIT_LEN is partly
                 * spent on the keepalive timer; otherwise the socket goes
                 * to time-wait right away unless the cases below apply.
                 */
                tmo = tcp_fin_time(sk);
                if (tmo > TCP_TIMEWAIT_LEN) {
                        inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
                } else if (th->fin || sock_owned_by_user(sk)) {
                        /* Bad case. We could lose such FIN otherwise.
                         * It is not a big problem, but it looks confusing
                         * and not so rare event. We still can lose it now,
                         * if it spins in bh_lock_sock(), but it is really
                         * marginal case.
                         */
                        inet_csk_reset_keepalive_timer(sk, tmo);
                } else {
                        tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
                        goto consume;
                }
                break;
        }

        case TCP_CLOSING:
                if (tp->snd_una == tp->write_seq) {
                        tcp_time_wait(sk, TCP_TIME_WAIT, 0);
                        goto consume;
                }
                break;

        case TCP_LAST_ACK:
                if (tp->snd_una == tp->write_seq) {
                        tcp_update_metrics(sk);
                        tcp_done(sk);
                        goto consume;
                }
                break;
        }

        /* step 6: check the URG bit */
        tcp_urg(sk, skb, th);

        /* step 7: process the segment text */
        switch (sk->sk_state) {
        case TCP_CLOSE_WAIT:
        case TCP_CLOSING:
        case TCP_LAST_ACK:
                /* Segments starting at or after rcv_nxt are not queued in
                 * these states; only segments starting before rcv_nxt fall
                 * through to the checks below.
                 */
                if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
                        /* If a subflow has been reset, the packet should not
                         * continue to be processed, drop the packet.
                         */
                        if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb))
                                goto discard;
                        break;
                }
                fallthrough;
        case TCP_FIN_WAIT1:
        case TCP_FIN_WAIT2:
                /* RFC 793 says to queue data in these states,
                 * RFC 1122 says we MUST send a reset.
                 * BSD 4.4 also does reset.
                 */
                if (sk->sk_shutdown & RCV_SHUTDOWN) {
                        if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
                            after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
                                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
                                tcp_reset(sk, skb);
                                return SKB_DROP_REASON_TCP_ABORT_ON_DATA;
                        }
                }
                fallthrough;
        case TCP_ESTABLISHED:
                tcp_data_queue(sk, skb);
                queued = 1;
                break;
        }

        /* tcp_data could move socket to TIME-WAIT */
        if (sk->sk_state != TCP_CLOSE) {
                tcp_data_snd_check(sk);
                tcp_ack_snd_check(sk);
        }

        /* The discard label sits inside this block so that the earlier
         * "goto discard" jumps share the drop with the not-queued case.
         */
        if (!queued) {
discard:
                tcp_drop_reason(sk, skb, reason);
        }
        return 0;

consume:
        __kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL(tcp_rcv_state_process);

/* Emit a rate-limited debug line naming the remote address and port of an
 * open request that is being dropped. Families other than AF_INET and
 * (when IPv6 is built) AF_INET6 produce no output.
 */
static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
{
        struct inet_request_sock *rsk = inet_rsk(req);

        switch (family) {
        case AF_INET:
                net_dbg_ratelimited("drop open request from %pI4/%u\n",
                                    &rsk->ir_rmt_addr, port);
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                net_dbg_ratelimited("drop open request from %pI6/%u\n",
                                    &rsk->ir_v6_rmt_addr, port);
                break;
#endif
        default:
                break;
        }
}

/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
 *
 * If we receive a SYN packet with these bits set, it means a
 * network is playing bad games with TOS bits. In order to
 * avoid possible false congestion notifications, we disable
 * TCP ECN negotiation.
 *
 * Exception: tcp_ca wants ECN. This is required for DCTCP
 * congestion control: Linux DCTCP asserts ECT on all packets,
 * including SYN, which is most optimal solution; however,
 * others, such as FreeBSD do not.
 *
 * Exception: At least one of the reserved bits of the TCP header (th->res1) is
 * set, indicating the use of a future TCP extension (such as AccECN). See
 * RFC8311 §4.3 which updates RFC3168 to allow the development of such
 * extensions.
 */
static void tcp_ecn_create_request(struct request_sock *req,
                                   const struct sk_buff *skb,
                                   const struct sock *listen_sk,
                                   const struct dst_entry *dst)
{
        const struct net *net = sock_net(listen_sk);
        const struct tcphdr *th = tcp_hdr(skb);
        bool ect, ecn_allowed;
        u32 dst_ecn;

        /* Nothing to do unless the SYN carries both ECE and CWR. */
        if (!(th->ece && th->cwr))
                return;

        ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
        dst_ecn = dst_feature(dst, DST_FEATURE_ECN_MASK);
        ecn_allowed = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || dst_ecn;

        /* The checks below are tried in this exact order and stop at the
         * first one that succeeds, so the BPF query is only reached when
         * every earlier condition was false.
         */
        if ((!ect || th->res1) && ecn_allowed)
                goto enable;
        if (tcp_ca_needs_ecn(listen_sk))
                goto enable;
        if (dst_ecn & DST_FEATURE_ECN_CA)
                goto enable;
        if (!tcp_bpf_ca_needs_ecn((struct sock *)req))
                return;

enable:
        inet_rsk(req)->ecn_ok = 1;
}

static void tcp_openreq_init(struct request_sock *req,
                             const struct tcp_options_received *rx_opt,
                             struct sk_buff *skb, const struct sock *sk)
{
        struct inet_request_sock *ireq = inet_rsk(req);

        req->rsk_rcv_wnd = 0;                /* So that tcp_send_synack() knows! */
        tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
        tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
        tcp_rsk(req)->snt_synack = 0;
        tcp_rsk(req)->last_oow_ack_time = 0;
        req->mss = rx_opt->mss_clamp;
        req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
        ireq->tstamp_ok = rx_opt->tstamp_ok;
        ireq->sack_ok = rx_opt->sack_ok;
        ireq->snd_wscale = rx_opt->snd_wscale;
        ireq->wscale_ok = rx_opt->wscale_ok;
        ireq->acked = 0;
        ireq->ecn_ok = 0;
        ireq->ir_rmt_port = tcp_hdr(skb)->source;
        ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
        ireq->ir_mark = inet_request_mark(sk, skb);
#if IS_ENABLED(CONFIG_SMC)
        ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested &&
                        tcp_sk(sk)->smc_hs_congested(sk));
#endif
}

/*
 * Return true if a syncookie should be sent
 */
static bool tcp_syn_flood_action(struct sock *sk, const char *proto)
{
        struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
        const char *msg = "Dropping request";
        struct net *net = sock_net(sk);
        bool want_cookie = false;
        u8 syncookies;

        syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);

#ifdef CONFIG_SYN_COOKIES
        if (syncookies) {
                msg = "Sending cookies";
                want_cookie = true;
                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
        } else
#endif
                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

        if (!READ_ONCE(queue->synflood_warned) && syncookies != 2 &&
            xchg(&queue->synflood_warned, 1) == 0) {
                if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) {
                        net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n",
                                        proto, inet6_rcv_saddr(sk),
                                        sk->sk_num, msg);
                } else {
                        net_info_ratelimited("%s: Possible SYN flooding on port %pI4:%u. %s.\n",
                                        proto, &sk->sk_rcv_saddr,
                                        sk->sk_num, msg);
                }
        }

        return want_cookie;
}

/* When the listener has save_syn set, attach a copy of the SYN's headers to
 * @req. save_syn == 2 copies from the MAC header onwards, any other non-zero
 * value copies from the network header. An allocation failure is silent:
 * req->saved_syn is simply not set.
 */
static void tcp_reqsk_record_syn(const struct sock *sk,
                                 struct request_sock *req,
                                 const struct sk_buff *skb)
{
        struct saved_syn *copy;
        u32 mac_hdrlen;
        void *start;
        u32 len;

        if (!tcp_sk(sk)->save_syn)
                return;

        len = skb_network_header_len(skb) + tcp_hdrlen(skb);

        if (tcp_sk(sk)->save_syn != 2) {
                start = skb_network_header(skb);
                mac_hdrlen = 0;
        } else {
                /* Save full header. */
                start = skb_mac_header(skb);
                mac_hdrlen = skb_mac_header_len(skb);
                len += mac_hdrlen;
        }

        copy = kmalloc(struct_size(copy, data, len), GFP_ATOMIC);
        if (!copy)
                return;

        copy->mac_hdrlen = mac_hdrlen;
        copy->network_hdrlen = skb_network_header_len(skb);
        copy->tcp_hdrlen = tcp_hdrlen(skb);
        memcpy(copy->data, start, len);
        req->saved_syn = copy;
}

/* If a SYN cookie is required and supported, returns a clamped MSS value to be
 * used for SYN cookie generation.
 */
u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
                          const struct tcp_request_sock_ops *af_ops,
                          struct sock *sk, struct tcphdr *th)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u16 mss;

        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 &&
            !inet_csk_reqsk_queue_is_full(sk))
                return 0;

        if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
                return 0;

        if (sk_acceptq_is_full(sk)) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
                return 0;
        }

        mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
        if (!mss)
                mss = af_ops->mss_clamp;

        return mss;
}
EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);

/* Process a connection request (SYN) received on listener @sk.
 *
 * @rsk_ops: request sock operations; used to allocate the request and for
 *           the protocol name/family in log messages.
 * @af_ops:  address-family specific callbacks (routing, ISN, timestamp
 *           offset, SYN-ACK transmission) and the MSS clamp.
 * @sk:      the listening socket.
 * @skb:     the received SYN segment.
 *
 * Allocates a request sock, parses the SYN's options, routes the reply,
 * chooses the initial sequence number (normal, syncookie, or inherited from
 * tcp_tw_isn) and sends the SYN-ACK, either directly, as a cookie, or through
 * a fastopen child socket.
 *
 * Every path returns 0; failures are accounted through tcp_listendrop().
 */
int tcp_conn_request(struct request_sock_ops *rsk_ops,
                     const struct tcp_request_sock_ops *af_ops,
                     struct sock *sk, struct sk_buff *skb)
{
        struct tcp_fastopen_cookie foc = { .len = -1 };
        struct tcp_options_received tmp_opt;
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        struct sock *fastopen_sk = NULL;
        struct request_sock *req;
        bool want_cookie = false;
        struct dst_entry *dst;
        struct flowi fl;
        u8 syncookies;
        u32 isn;

#ifdef CONFIG_TCP_AO
        const struct tcp_ao_hdr *aoh;
#endif

        isn = __this_cpu_read(tcp_tw_isn);
        if (isn) {
                /* TW buckets are converted to open requests without
                 * limitations, they conserve resources and peer is
                 * evidently real one.
                 */
                __this_cpu_write(tcp_tw_isn, 0);
        } else {
                /* Note: syncookies is only assigned on this (isn == 0) path;
                 * its later read is guarded by !isn.
                 */
                syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);

                if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) {
                        want_cookie = tcp_syn_flood_action(sk,
                                                           rsk_ops->slab_name);
                        if (!want_cookie)
                                goto drop;
                }
        }

        if (sk_acceptq_is_full(sk)) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
                goto drop;
        }

        req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
        if (!req)
                goto drop;

        req->syncookie = want_cookie;
        tcp_rsk(req)->af_specific = af_ops;
        tcp_rsk(req)->ts_off = 0;
        tcp_rsk(req)->req_usec_ts = false;
#if IS_ENABLED(CONFIG_MPTCP)
        tcp_rsk(req)->is_mptcp = 0;
#endif

        /* Parse the SYN's options; the fastopen cookie is not collected
         * when a syncookie is going to be sent.
         */
        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = af_ops->mss_clamp;
        tmp_opt.user_mss  = tp->rx_opt.user_mss;
        tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
                          want_cookie ? NULL : &foc);

        /* In cookie mode, parsed options are discarded unless the SYN
         * carried a timestamp option.
         */
        if (want_cookie && !tmp_opt.saw_tstamp)
                tcp_clear_options(&tmp_opt);

        if (IS_ENABLED(CONFIG_SMC) && want_cookie)
                tmp_opt.smc_ok = 0;

        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
        tcp_openreq_init(req, &tmp_opt, skb, sk);
        inet_rsk(req)->no_srccheck = inet_test_bit(TRANSPARENT, sk);

        /* Note: tcp_v6_init_req() might override ir_iif for link locals */
        inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);

        dst = af_ops->route_req(sk, skb, &fl, req, isn);
        if (!dst)
                goto drop_and_free;

        if (tmp_opt.tstamp_ok) {
                tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
                tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
        }
        /* Normal path only: no cookie and no ISN inherited from tcp_tw_isn. */
        if (!want_cookie && !isn) {
                int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog);

                /* Kill the following clause, if you dislike this way. */
                if (!syncookies &&
                    (max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                     (max_syn_backlog >> 2)) &&
                    !tcp_peer_is_proven(req, dst)) {
                        /* Without syncookies last quarter of
                         * backlog is filled with destinations,
                         * proven to be alive.
                         * It means that we continue to communicate
                         * to destinations, already remembered
                         * to the moment of synflood.
                         */
                        pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
                                    rsk_ops->family);
                        goto drop_and_release;
                }

                isn = af_ops->init_seq(skb);
        }

        tcp_ecn_create_request(req, skb, sk, dst);

        if (want_cookie) {
                /* The cookie becomes the ISN; cookie_init_sequence() may
                 * also update req->mss through the pointer passed in.
                 */
                isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
                if (!tmp_opt.tstamp_ok)
                        inet_rsk(req)->ecn_ok = 0;
        }

#ifdef CONFIG_TCP_AO
        if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
                goto drop_and_release; /* Invalid TCP options */
        if (aoh) {
                tcp_rsk(req)->used_tcp_ao = true;
                tcp_rsk(req)->ao_rcv_next = aoh->keyid;
                tcp_rsk(req)->ao_keyid = aoh->rnext_keyid;

        } else {
                tcp_rsk(req)->used_tcp_ao = false;
        }
#endif
        tcp_rsk(req)->snt_isn = isn;
        tcp_rsk(req)->txhash = net_tx_rndhash();
        tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
        tcp_openreq_init_rwin(req, sk, dst);
        sk_rx_queue_set(req_to_sk(req), skb);
        if (!want_cookie) {
                tcp_reqsk_record_syn(sk, req, skb);
                fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
        }
        if (fastopen_sk) {
                af_ops->send_synack(fastopen_sk, dst, &fl, req,
                                    &foc, TCP_SYNACK_FASTOPEN, skb);
                /* Add the child socket directly into the accept queue */
                if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
                        reqsk_fastopen_remove(fastopen_sk, req, false);
                        bh_unlock_sock(fastopen_sk);
                        sock_put(fastopen_sk);
                        goto drop_and_free;
                }
                sk->sk_data_ready(sk);
                bh_unlock_sock(fastopen_sk);
                sock_put(fastopen_sk);
        } else {
                tcp_rsk(req)->tfo_listener = false;
                /* Only non-cookie requests are hashed, with a timeout. */
                if (!want_cookie) {
                        req->timeout = tcp_timeout_init((struct sock *)req);
                        inet_csk_reqsk_queue_hash_add(sk, req, req->timeout);
                }
                af_ops->send_synack(sk, dst, &fl, req, &foc,
                                    !want_cookie ? TCP_SYNACK_NORMAL :
                                                   TCP_SYNACK_COOKIE,
                                    skb);
                /* A cookie request was never hashed above, so it is freed
                 * outright instead of dropping a reference.
                 */
                if (want_cookie) {
                        reqsk_free(req);
                        return 0;
                }
        }
        reqsk_put(req);
        return 0;

        /* Error unwinding: each label falls through to the ones below it. */
drop_and_release:
        dst_release(dst);
drop_and_free:
        __reqsk_free(req);
drop:
        tcp_listendrop(sk);
        return 0;
}
EXPORT_SYMBOL(tcp_conn_request);




































































    2 





























































































































































    2 






























    2 














    2 


    2 






    2 
    2 






    2 






    2 



    2 



    2 











    2 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Packet matching code.
 *
 * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
 * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
 * Copyright (C) 2006-2010 Patrick McHardy <kaber@trash.net>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/cache.h>
#include <linux/capability.h>
#include <linux/skbuff.h>
#include <linux/kmod.h>
#include <linux/vmalloc.h>
#include <linux/netdevice.h>
#include <linux/module.h>
#include <net/ip.h>
#include <net/compat.h>
#include <linux/uaccess.h>
#include <linux/mutex.h>
#include <linux/proc_fs.h>
#include <linux/err.h>
#include <linux/cpumask.h>

#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <net/netfilter/nf_log.h>
#include "../../netfilter/xt_repldata.h"

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("IPv4 packet filter");

/* Build the initial table image by expanding xt_alloc_initial_table() with
 * the ipt/IPT name prefixes, and return whatever that expansion yields.
 *
 * NOTE(review): @info is not referenced directly here; presumably the macro
 * picks it up by name, so the parameter name must not change (confirm against
 * xt_repldata.h).
 */
void *ipt_alloc_initial_table(const struct xt_table *info)
{
        return xt_alloc_initial_table(ipt, IPT);
}
EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);

/* Decide whether a packet matches the IP part of a rule.
 *
 * Checks, in order: source address, destination address, input interface,
 * output interface, protocol (only if the rule names one) and the fragment
 * flag. Each test result is passed through NF_INVF() together with its
 * IPT_INV_* flag. Returns true only if every test passes.
 *
 * Performance critical - called for every packet.
 */
static inline bool
ip_packet_match(const struct iphdr *ip,
                const char *indev,
                const char *outdev,
                const struct ipt_ip *ipinfo,
                int isfrag)
{
        unsigned long in_diff, out_diff;

        if (NF_INVF(ipinfo, IPT_INV_SRCIP,
                    (ip->saddr & ipinfo->smsk.s_addr) != ipinfo->src.s_addr))
                return false;

        if (NF_INVF(ipinfo, IPT_INV_DSTIP,
                    (ip->daddr & ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr))
                return false;

        in_diff = ifname_compare_aligned(indev, ipinfo->iniface,
                                         ipinfo->iniface_mask);
        if (NF_INVF(ipinfo, IPT_INV_VIA_IN, in_diff != 0))
                return false;

        out_diff = ifname_compare_aligned(outdev, ipinfo->outiface,
                                          ipinfo->outiface_mask);
        if (NF_INVF(ipinfo, IPT_INV_VIA_OUT, out_diff != 0))
                return false;

        /* A protocol of 0 in the rule means "any protocol". */
        if (ipinfo->proto &&
            NF_INVF(ipinfo, IPT_INV_PROTO, ip->protocol != ipinfo->proto))
                return false;

        /* A fragment rule applied to a non-fragment is a mismatch. */
        return !NF_INVF(ipinfo, IPT_INV_FRAG,
                        (ipinfo->flags & IPT_F_FRAG) && !isfrag);
}

/* Validate the IP part of a rule: true only when neither flags nor invflags
 * has a bit set outside IPT_F_MASK / IPT_INV_MASK respectively.
 */
static bool
ip_checkentry(const struct ipt_ip *ip)
{
        return !(ip->flags & ~IPT_F_MASK) &&
               !(ip->invflags & ~IPT_INV_MASK);
}

static unsigned int
ipt_error(struct sk_buff *skb, const struct xt_action_param *par)
{
        net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo);

        return NF_DROP;
}

/* Return the rule entry located @offset bytes past @base.
 * Performance critical.
 */
static inline struct ipt_entry *
get_entry(const void *base, unsigned int offset)
{
        const char *bytes = base;

        return (struct ipt_entry *)(bytes + offset);
}

/* All zeroes == unconditional rule. */
/* Mildly perf critical (only if packet tracing is on) */
static inline bool unconditional(const struct ipt_entry *e)
{
        static const struct ipt_ip uncond;

        return e->target_offset == sizeof(struct ipt_entry) &&
               memcmp(&e->ip, &uncond, sizeof(uncond)) == 0;
}

/* Const-preserving wrapper around ipt_get_target(): takes a const entry and
 * hands back a const target pointer.
 */
static inline const struct xt_entry_target *
ipt_get_target_c(const struct ipt_entry *e)
{
        struct ipt_entry *entry = (struct ipt_entry *)e;

        return ipt_get_target(entry);
}

#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
/* Chain name for each netfilter hook, indexed by NF_INET_* hook number;
 * trace_packet() uses it as both the hook name and the initial chain name.
 */
static const char *const hooknames[] = {
        [NF_INET_PRE_ROUTING]                = "PREROUTING",
        [NF_INET_LOCAL_IN]                = "INPUT",
        [NF_INET_FORWARD]                = "FORWARD",
        [NF_INET_LOCAL_OUT]                = "OUTPUT",
        [NF_INET_POST_ROUTING]                = "POSTROUTING",
};

/* Indices into the comments[] table used when formatting TRACE log lines. */
enum nf_ip_trace_comments {
        NF_IP_TRACE_COMMENT_RULE,
        NF_IP_TRACE_COMMENT_RETURN,
        NF_IP_TRACE_COMMENT_POLICY,
};

/* Text printed in the third field of a TRACE log line, indexed by
 * enum nf_ip_trace_comments.
 */
static const char *const comments[] = {
        [NF_IP_TRACE_COMMENT_RULE]        = "rule",
        [NF_IP_TRACE_COMMENT_RETURN]        = "return",
        [NF_IP_TRACE_COMMENT_POLICY]        = "policy",
};

/* Logging parameters handed to nf_log_trace() by trace_packet(). */
static const struct nf_loginfo trace_loginfo = {
        .type = NF_LOG_TYPE_LOG,
        .u = {
                .log = {
                        /* NOTE(review): presumably a syslog level
                         * (4 == warning) - confirm.
                         */
                        .level = 4,
                        .logflags = NF_LOG_DEFAULT_MASK,
                },
        },
};

/* Per-entry step of the trace walk: update the running chain name, rule
 * number and comment for entry @s, and return 1 once @s is the entry @e being
 * traced (0 otherwise).
 *
 * An ERROR target restarts numbering and supplies the chain name from its
 * data; every other entry increments the rule number.
 * Mildly perf critical (only if packet tracing is on).
 */
static inline int
get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
                      const char *hookname, const char **chainname,
                      const char **comment, unsigned int *rulenum)
{
        const struct xt_standard_target *t = (void *)ipt_get_target_c(s);
        const char *tname = t->target.u.kernel.target->name;

        if (strcmp(tname, XT_ERROR_TARGET) == 0) {
                /* Head of user chain: ERROR target with chainname */
                *chainname = t->target.data;
                *rulenum = 0;
                return 0;
        }

        (*rulenum)++;

        if (s != e)
                return 0;

        if (unconditional(s) &&
            strcmp(tname, XT_STANDARD_TARGET) == 0 &&
            t->verdict < 0) {
                /* Tail of chains: STANDARD target (return/policy) */
                if (*chainname == hookname)
                        *comment = comments[NF_IP_TRACE_COMMENT_POLICY];
                else
                        *comment = comments[NF_IP_TRACE_COMMENT_RETURN];
        }

        return 1;
}

static void trace_packet(struct net *net,
                         const struct sk_buff *skb,
                         unsigned int hook,
                         const struct net_device *in,
                         const struct net_device *out,
                         const char *tablename,
                         const struct xt_table_info *private,
                         const struct ipt_entry *e)
{
        const struct ipt_entry *root;
        const char *hookname, *chainname, *comment;
        const struct ipt_entry *iter;
        unsigned int rulenum = 0;

        root = get_entry(private->entries, private->hook_entry[hook]);

        hookname = chainname = hooknames[hook];
        comment = comments[NF_IP_TRACE_COMMENT_RULE];

        xt_entry_foreach(iter, root, private->size - private->hook_entry[hook])
                if (get_chainname_rulenum(iter, e, hookname,
                    &chainname, &comment, &rulenum) != 0)
                        break;

        nf_log_trace(net, AF_INET, hook, skb, in, out, &trace_loginfo,
                     "TRACE: %s:%s:%s:%u ",
                     tablename, chainname, comment, rulenum);
}
#endif

/* Return the entry that follows @entry, i.e. the one next_offset bytes
 * further on.
 */
static inline
struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
{
        const char *pos = (const char *)entry;

        return (struct ipt_entry *)(pos + entry->next_offset);
}

/*
 * Run a packet through the rules of an iptables table.
 *
 * @priv:  the struct xt_table to traverse, passed as an opaque pointer
 * @skb:   the packet being evaluated
 * @state: hook state; supplies the hook number and the in/out devices
 *
 * Returns one of the generic firewall policies, like NF_ACCEPT.
 * NF_DROP is returned when acpar.hotdrop has been set during the
 * traversal, and when a jump would exceed private->stacksize.
 */
unsigned int
ipt_do_table(void *priv,
             struct sk_buff *skb,
             const struct nf_hook_state *state)
{
        const struct xt_table *table = priv;
        unsigned int hook = state->hook;
        static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
        const struct iphdr *ip;
        /* Initializing verdict to NF_DROP keeps gcc happy. */
        unsigned int verdict = NF_DROP;
        const char *indev, *outdev;
        const void *table_base;
        struct ipt_entry *e, **jumpstack;
        unsigned int stackidx, cpu;
        const struct xt_table_info *private;
        struct xt_action_param acpar;
        unsigned int addend;

        /* Initialization */
        stackidx = 0;
        ip = ip_hdr(skb);
        /* A missing device is represented by the all-zero nulldevname. */
        indev = state->in ? state->in->name : nulldevname;
        outdev = state->out ? state->out->name : nulldevname;
        /* We handle fragments by dealing with the first fragment as
         * if it was a normal packet.  All other fragments are treated
         * normally, except that they will NEVER match rules that ask
         * things we don't know, ie. tcp syn flag or ports).  If the
         * rule is also a fragment-specific rule, non-fragments won't
         * match it. */
        acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
        acpar.thoff   = ip_hdrlen(skb);
        acpar.hotdrop = false;
        acpar.state   = state;

        WARN_ON(!(table->valid_hooks & (1 << hook)));
        /* The traversal runs with bottom halves disabled, bracketed by
         * xt_write_recseq_begin()/xt_write_recseq_end(). */
        local_bh_disable();
        addend = xt_write_recseq_begin();
        private = READ_ONCE(table->private); /* Address dependency. */
        cpu        = smp_processor_id();
        table_base = private->entries;
        jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];

        /* Switch to alternate jumpstack if we're being invoked via TEE.
         * TEE issues XT_CONTINUE verdict on original skb so we must not
         * clobber the jumpstack.
         *
         * For recursion via REJECT or SYNPROXY the stack will be clobbered
         * but it is no problem since absolute verdict is issued by these.
         */
        if (static_key_false(&xt_tee_enabled))
                jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);

        /* Start at the first rule of the chain attached to this hook. */
        e = get_entry(table_base, private->hook_entry[hook]);

        do {
                const struct xt_entry_target *t;
                const struct xt_entry_match *ematch;
                struct xt_counters *counter;

                WARN_ON(!e);
                if (!ip_packet_match(ip, indev, outdev,
                    &e->ip, acpar.fragoff)) {
                        /* Also reached by goto from the match loop below:
                         * skip to the next rule. */
 no_match:
                        e = ipt_next_entry(e);
                        continue;
                }

                /* Every match extension of the rule must accept the packet. */
                xt_ematch_foreach(ematch, e) {
                        acpar.match     = ematch->u.kernel.match;
                        acpar.matchinfo = ematch->data;
                        if (!acpar.match->match(skb, &acpar))
                                goto no_match;
                }

                /* Rule matched: account the packet on this cpu's counter. */
                counter = xt_get_this_cpu_counter(&e->counters);
                ADD_COUNTER(*counter, skb->len, 1);

                t = ipt_get_target_c(e);
                WARN_ON(!t->u.kernel.target);

#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
                /* The packet is traced: log it */
                if (unlikely(skb->nf_trace))
                        trace_packet(state->net, skb, hook, state->in,
                                     state->out, table->name, private, e);
#endif
                /* Standard target? */
                if (!t->u.kernel.target->target) {
                        int v;

                        v = ((struct xt_standard_target *)t)->verdict;
                        if (v < 0) {
                                /* Pop from stack? */
                                if (v != XT_RETURN) {
                                        /* Absolute verdict, stored as
                                         * -verdict - 1. */
                                        verdict = (unsigned int)(-v) - 1;
                                        break;
                                }
                                if (stackidx == 0) {
                                        /* Nothing to return to: continue at
                                         * the hook's underflow entry. */
                                        e = get_entry(table_base,
                                            private->underflow[hook]);
                                } else {
                                        /* Resume after the rule that jumped. */
                                        e = jumpstack[--stackidx];
                                        e = ipt_next_entry(e);
                                }
                                continue;
                        }
                        /* Non-negative v is a table offset to jump to.  The
                         * calling rule is pushed unless the destination is
                         * simply the next entry or the rule is a GOTO. */
                        if (table_base + v != ipt_next_entry(e) &&
                            !(e->ip.flags & IPT_F_GOTO)) {
                                if (unlikely(stackidx >= private->stacksize)) {
                                        /* Jump stack is full: drop. */
                                        verdict = NF_DROP;
                                        break;
                                }
                                jumpstack[stackidx++] = e;
                        }

                        e = get_entry(table_base, v);
                        continue;
                }

                /* Extension target: hand the packet to its handler. */
                acpar.target   = t->u.kernel.target;
                acpar.targinfo = t->data;

                verdict = t->u.kernel.target->target(skb, &acpar);
                if (verdict == XT_CONTINUE) {
                        /* Target might have changed stuff. */
                        ip = ip_hdr(skb);
                        e = ipt_next_entry(e);
                } else {
                        /* Verdict */
                        break;
                }
        } while (!acpar.hotdrop);

        xt_write_recseq_end(addend);
        local_bh_enable();

        /* hotdrop overrides whatever verdict was computed. */
        if (acpar.hotdrop)
                return NF_DROP;
        else return verdict;
}

/* Figures out from what hook each rule can be called: returns 0 if
   there are loops.  Puts hook bitmask in comefrom.

   @newinfo:     table being validated (hook_entry[], size and number are read)
   @valid_hooks: bitmask of hooks to walk
   @entry0:      base address of the rule blob
   @offsets:     entry offsets passed to xt_find_jump_offset() to validate
                 jump destinations

   Also returns 0 when a walk would step past newinfo->size or when
   xt_find_jump_offset() rejects a jump destination.  Returns 1 on
   success. */
static int
mark_source_chains(const struct xt_table_info *newinfo,
                   unsigned int valid_hooks, void *entry0,
                   unsigned int *offsets)
{
        unsigned int hook;

        /* No recursion; use packet counter to save back ptrs (reset
           to 0 as we leave), and comefrom to save source hook bitmask */
        for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
                unsigned int pos = newinfo->hook_entry[hook];
                struct ipt_entry *e = entry0 + pos;

                if (!(valid_hooks & (1 << hook)))
                        continue;

                /* Set initial back pointer. */
                e->counters.pcnt = pos;

                for (;;) {
                        const struct xt_standard_target *t
                                = (void *)ipt_get_target_c(e);
                        /* Non-zero if this entry was already reached from
                           this hook on an earlier path. */
                        int visited = e->comefrom & (1 << hook);

                        /* Bit NF_INET_NUMHOOKS marks entries on the path
                           currently being walked; meeting one again means
                           the rules loop. */
                        if (e->comefrom & (1 << NF_INET_NUMHOOKS))
                                return 0;

                        e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));

                        /* Unconditional return/END. */
                        if ((unconditional(e) &&
                             (strcmp(t->target.u.user.name,
                                     XT_STANDARD_TARGET) == 0) &&
                             t->verdict < 0) || visited) {
                                unsigned int oldpos, size;

                                /* Return: backtrack through the last
                                   big jump. */
                                do {
                                        /* Leaving this entry: clear the
                                           on-path marker and the borrowed
                                           back pointer. */
                                        e->comefrom ^= (1<<NF_INET_NUMHOOKS);
                                        oldpos = pos;
                                        pos = e->counters.pcnt;
                                        e->counters.pcnt = 0;

                                        /* We're at the start. */
                                        if (pos == oldpos)
                                                goto next;

                                        e = entry0 + pos;
                                } while (oldpos == pos + e->next_offset);

                                /* Move along one */
                                size = e->next_offset;
                                e = entry0 + pos + size;
                                /* Stepping past the end of the table. */
                                if (pos + size >= newinfo->size)
                                        return 0;
                                e->counters.pcnt = pos;
                                pos += size;
                        } else {
                                int newpos = t->verdict;

                                if (strcmp(t->target.u.user.name,
                                           XT_STANDARD_TARGET) == 0 &&
                                    newpos >= 0) {
                                        /* This a jump; chase it. */
                                        if (!xt_find_jump_offset(offsets, newpos,
                                                                 newinfo->number))
                                                return 0;
                                } else {
                                        /* ... this is a fallthru */
                                        newpos = pos + e->next_offset;
                                        if (newpos >= newinfo->size)
                                                return 0;
                                }
                                /* Record where we came from in the
                                   destination's back pointer. */
                                e = entry0 + newpos;
                                e->counters.pcnt = pos;
                                pos = newpos;
                        }
                }
next:                ;
        }
        return 1;
}

static void cleanup_match(struct xt_entry_match *m, struct net *net)
{
        struct xt_mtdtor_param par;

        par.net       = net;
        par.match     = m->u.kernel.match;
        par.matchinfo = m->data;
        par.family    = NFPROTO_IPV4;
        if (par.match->destroy != NULL)
                par.match->destroy(&par);
        module_put(par.match->me);
}

/*
 * Fill in the match-specific fields of @par for @m and validate the match
 * with xt_check_match() against the protocol of the owning rule.
 * Returns the result of xt_check_match().
 */
static int
check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
        const struct ipt_ip *rule_ip = par->entryinfo;
        const unsigned int payload_len = m->u.match_size - sizeof(*m);

        par->matchinfo = m->data;
        par->match     = m->u.kernel.match;

        return xt_check_match(par, payload_len, rule_ip->proto,
                              rule_ip->invflags & IPT_INV_PROTO);
}

/*
 * Look up the match named in @m (loading it on demand through
 * xt_request_find_match()), bind it to @m and validate it.
 *
 * Returns 0 on success.  On a validation failure the module reference
 * taken by the lookup is dropped again and the error is returned.
 */
static int
find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
        struct xt_match *found;
        int err;

        found = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
                                      m->u.user.revision);
        if (IS_ERR(found))
                return PTR_ERR(found);

        m->u.kernel.match = found;

        err = check_match(m, par);
        if (err)
                module_put(m->u.kernel.match->me);

        return err;
}

/*
 * Validate the target of rule @e for table @name via xt_check_target().
 * All fields of the check parameter block not set here are zero.
 */
static int check_target(struct ipt_entry *e, struct net *net, const char *name)
{
        struct xt_entry_target *t = ipt_get_target(e);
        struct xt_tgchk_param par;

        memset(&par, 0, sizeof(par));
        par.net       = net;
        par.table     = name;
        par.entryinfo = e;
        par.target    = t->u.kernel.target;
        par.targinfo  = t->data;
        par.hook_mask = e->comefrom;
        par.family    = NFPROTO_IPV4;

        return xt_check_target(&par, t->u.target_size - sizeof(*t),
                               e->ip.proto, e->ip.invflags & IPT_INV_PROTO);
}

static int
find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
                 unsigned int size,
                 struct xt_percpu_counter_alloc_state *alloc_state)
{
        struct xt_entry_target *t;
        struct xt_target *target;
        int ret;
        unsigned int j;
        struct xt_mtchk_param mtpar;
        struct xt_entry_match *ematch;

        if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
                return -ENOMEM;

        j = 0;
        memset(&mtpar, 0, sizeof(mtpar));
        mtpar.net        = net;
        mtpar.table     = name;
        mtpar.entryinfo = &e->ip;
        mtpar.hook_mask = e->comefrom;
        mtpar.family    = NFPROTO_IPV4;
        xt_ematch_foreach(ematch, e) {
                ret = find_check_match(ematch, &mtpar);
                if (ret != 0)
                        goto cleanup_matches;
                ++j;
        }

        t = ipt_get_target(e);
        target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
                                        t->u.user.revision);
        if (IS_ERR(target)) {
                ret = PTR_ERR(target);
                goto cleanup_matches;
        }
        t->u.kernel.target = target;

        ret = check_target(e, net, name);
        if (ret)
                goto err;

        return 0;
 err:
        module_put(t->u.kernel.target->me);
 cleanup_matches:
        xt_ematch_foreach(ematch, e) {
                if (j-- == 0)
                        break;
                cleanup_match(ematch, net);
        }

        xt_percpu_counter_free(&e->counters);

        return ret;
}

static bool check_underflow(const struct ipt_entry *e)
{
        const struct xt_entry_target *t;
        unsigned int verdict;

        if (!unconditional(e))
                return false;
        t = ipt_get_target_c(e);
        if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
                return false;
        verdict = ((struct xt_standard_target *)t)->verdict;
        verdict = -verdict - 1;
        return verdict == NF_DROP || verdict == NF_ACCEPT;
}

/*
 * Sanity-check one user-supplied entry and record hook positions.
 *
 * Verifies alignment, that the entry lies within [@base, @limit), that
 * next_offset leaves room for at least an entry header plus a target, and
 * that the IP part and the internal offsets are valid.  If the entry sits
 * at a hook entry point or underflow offset of a valid hook, that offset
 * is stored in @newinfo; an underflow entry must also pass
 * check_underflow().  Counters and comefrom are cleared on success.
 *
 * Returns 0 on success or a negative errno.
 */
static int
check_entry_size_and_hooks(struct ipt_entry *e,
                           struct xt_table_info *newinfo,
                           const unsigned char *base,
                           const unsigned char *limit,
                           const unsigned int *hook_entries,
                           const unsigned int *underflows,
                           unsigned int valid_hooks)
{
        const unsigned char *pos = (const unsigned char *)e;
        unsigned int hook;
        int ret;

        /* The header must be aligned and in bounds before next_offset
         * is read from it. */
        if ((unsigned long)e % __alignof__(struct ipt_entry) != 0)
                return -EINVAL;
        if (pos + sizeof(struct ipt_entry) >= limit)
                return -EINVAL;
        if (pos + e->next_offset > limit)
                return -EINVAL;

        if (e->next_offset <
            sizeof(struct ipt_entry) + sizeof(struct xt_entry_target))
                return -EINVAL;

        if (!ip_checkentry(&e->ip))
                return -EINVAL;

        ret = xt_check_entry_offsets(e, e->elems, e->target_offset,
                                     e->next_offset);
        if (ret)
                return ret;

        /* Check hooks & underflows */
        for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
                if (!(valid_hooks & (1 << hook)))
                        continue;

                if (pos - base == hook_entries[hook])
                        newinfo->hook_entry[hook] = hook_entries[hook];

                if (pos - base != underflows[hook])
                        continue;

                if (!check_underflow(e))
                        return -EINVAL;

                newinfo->underflow[hook] = underflows[hook];
        }

        /* Clear counters and comefrom */
        e->counters.pcnt = 0;
        e->counters.bcnt = 0;
        e->comefrom = 0;
        return 0;
}

static void
cleanup_entry(struct ipt_entry *e, struct net *net)
{
        struct xt_tgdtor_param par;
        struct xt_entry_target *t;
        struct xt_entry_match *ematch;

        /* Cleanup all matches */
        xt_ematch_foreach(ematch, e)
                cleanup_match(ematch, net);
        t = ipt_get_target(e);

        par.net      = net;
        par.target   = t->u.kernel.target;
        par.targinfo = t->data;
        par.family   = NFPROTO_IPV4;
        if (par.target->destroy != NULL)
                par.target->destroy(&par);
        module_put(par.target->me);
        xt_percpu_counter_free(&e->counters);
}

/*
 * Checks and translates the user-supplied table segment (held in
 * newinfo).
 *
 * Pass 1 validates each entry's size and hook placement, records entry
 * offsets and counts ERROR targets into newinfo->stacksize.  The entry
 * count must equal repl->num_entries, the hooks must check out and
 * mark_source_chains() must find no loop (-ELOOP otherwise).
 * Pass 2 resolves matches and targets; if an entry fails, the entries
 * already set up are cleaned up again.
 *
 * Returns 0 on success or a negative errno.
 */
static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
                const struct ipt_replace *repl)
{
        struct xt_percpu_counter_alloc_state alloc_state = { 0 };
        struct ipt_entry *rule;
        unsigned int *offsets;
        unsigned int hook, seen, checked;
        int ret = 0;

        newinfo->size = repl->size;
        newinfo->number = repl->num_entries;

        /* Init all hooks to impossible value. */
        for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
                newinfo->underflow[hook] = 0xFFFFFFFF;
                newinfo->hook_entry[hook] = 0xFFFFFFFF;
        }

        offsets = xt_alloc_entry_offsets(newinfo->number);
        if (offsets == NULL)
                return -ENOMEM;

        /* Walk through entries, checking offsets. */
        seen = 0;
        xt_entry_foreach(rule, entry0, newinfo->size) {
                ret = check_entry_size_and_hooks(rule, newinfo, entry0,
                                                 entry0 + repl->size,
                                                 repl->hook_entry,
                                                 repl->underflow,
                                                 repl->valid_hooks);
                if (ret)
                        goto free_offsets;

                /* Never write past the offsets array. */
                if (seen < repl->num_entries)
                        offsets[seen] = (void *)rule - entry0;
                seen++;

                if (!strcmp(ipt_get_target(rule)->u.user.name,
                            XT_ERROR_TARGET))
                        newinfo->stacksize++;
        }

        if (seen != repl->num_entries) {
                ret = -EINVAL;
                goto free_offsets;
        }

        ret = xt_check_table_hooks(newinfo, repl->valid_hooks);
        if (ret)
                goto free_offsets;

        if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) {
                ret = -ELOOP;
                goto free_offsets;
        }
        kvfree(offsets);

        /* Finally, each sanity check must pass */
        checked = 0;
        xt_entry_foreach(rule, entry0, newinfo->size) {
                ret = find_check_entry(rule, net, repl->name, repl->size,
                                       &alloc_state);
                if (ret)
                        break;
                checked++;
        }

        if (ret == 0)
                return 0;

        /* Undo the entries that were fully set up before the failure. */
        xt_entry_foreach(rule, entry0, newinfo->size) {
                if (checked == 0)
                        break;
                checked--;
                cleanup_entry(rule, net);
        }
        return ret;

 free_offsets:
        kvfree(offsets);
        return ret;
}

/*
 * Sum the per-cpu counters of every rule of @t into @counters[], one slot
 * per rule in table order.  Each per-cpu value is read under that cpu's
 * xt_recseq sequence counter and re-read if the sequence changed.
 */
static void
get_counters(const struct xt_table_info *t,
             struct xt_counters counters[])
{
        unsigned int cpu;

        for_each_possible_cpu(cpu) {
                seqcount_t *seq = &per_cpu(xt_recseq, cpu);
                struct xt_counters *total = counters;
                struct ipt_entry *rule;

                xt_entry_foreach(rule, t->entries, t->size) {
                        const struct xt_counters *src;
                        unsigned int seq_start;
                        u64 bytes, packets;

                        src = xt_get_per_cpu_counter(&rule->counters, cpu);
                        do {
                                seq_start = read_seqcount_begin(seq);
                                bytes = src->bcnt;
                                packets = src->pcnt;
                        } while (read_seqcount_retry(seq, seq_start));

                        ADD_COUNTER(*total, bytes, packets);
                        total++;
                        cond_resched();
                }
        }
}

/*
 * Sum the per-cpu counters of every rule of @t into @counters[] without
 * consulting the sequence counter; yields the cpu once per possible cpu.
 */
static void get_old_counters(const struct xt_table_info *t,
                             struct xt_counters counters[])
{
        unsigned int cpu;

        for_each_possible_cpu(cpu) {
                struct xt_counters *total = counters;
                struct ipt_entry *rule;

                xt_entry_foreach(rule, t->entries, t->size) {
                        const struct xt_counters *src;

                        src = xt_get_per_cpu_counter(&rule->counters, cpu);
                        ADD_COUNTER(*total, src->bcnt, src->pcnt);
                        total++;
                }

                cond_resched();
        }
}

/*
 * Allocate a zeroed array with one xt_counters per rule of @table and fill
 * it via get_counters().  Returns the array, or ERR_PTR(-ENOMEM) when the
 * allocation fails.
 */
static struct xt_counters *alloc_counters(const struct xt_table *table)
{
        const struct xt_table_info *info = table->private;
        unsigned int bytes;
        struct xt_counters *snapshot;

        /* We need atomic snapshot of counters: rest doesn't change
           (other than comefrom, which userspace doesn't care
           about). */
        bytes = sizeof(struct xt_counters) * info->number;
        snapshot = vzalloc(bytes);
        if (!snapshot)
                return ERR_PTR(-ENOMEM);

        get_counters(info, snapshot);
        return snapshot;
}

/*
 * Copy the first @total_size bytes of @table's rules to @userptr.
 *
 * Each entry header is copied as is, then its counters field is
 * overwritten with the summed counters from alloc_counters(), and its
 * matches and target are exported through xt_match_to_user() and
 * xt_target_to_user().
 *
 * Returns 0 on success, -EFAULT when a copy to userspace fails, or the
 * error from alloc_counters().
 */
static int
copy_entries_to_user(unsigned int total_size,
                     const struct xt_table *table,
                     void __user *userptr)
{
        const struct xt_table_info *info = table->private;
        const void *entries;
        struct xt_counters *counters;
        unsigned int off = 0;
        unsigned int idx = 0;

        counters = alloc_counters(table);
        if (IS_ERR(counters))
                return PTR_ERR(counters);

        entries = info->entries;

        /* FIXME: use iterator macros --RR */
        /* ... then go back and fix counters and names */
        while (off < total_size) {
                const struct ipt_entry *e = entries + off;
                void __user *dst = userptr + off;
                const struct xt_entry_target *t;
                unsigned int moff;

                if (copy_to_user(dst, e, sizeof(*e)))
                        goto fault;

                if (copy_to_user(dst + offsetof(struct ipt_entry, counters),
                                 &counters[idx], sizeof(counters[idx])))
                        goto fault;

                moff = sizeof(struct ipt_entry);
                while (moff < e->target_offset) {
                        const struct xt_entry_match *m = (const void *)e + moff;

                        if (xt_match_to_user(m, dst + moff))
                                goto fault;
                        moff += m->u.match_size;
                }

                t = ipt_get_target_c(e);
                if (xt_target_to_user(t, dst + e->target_offset))
                        goto fault;

                off += e->next_offset;
                idx++;
        }

        vfree(counters);
        return 0;

 fault:
        vfree(counters);
        return -EFAULT;
}

#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
static void compat_standard_from_user(void *dst, const void *src)
{
        int v = *(compat_int_t *)src;

        if (v > 0)
                v += xt_compat_calc_jump(AF_INET, v);
        memcpy(dst, &v, sizeof(v));
}

/*
 * Convert a native standard-target verdict at @src to its compat form and
 * copy it to userspace at @dst.  Positive values are adjusted by
 * xt_compat_calc_jump().  Returns 0, or -EFAULT if the copy fails.
 */
static int compat_standard_to_user(void __user *dst, const void *src)
{
        const int *verdict = src;
        compat_int_t cv = *verdict;

        if (cv > 0)
                cv -= xt_compat_calc_jump(AF_INET, cv);

        if (copy_to_user(dst, &cv, sizeof(cv)))
                return -EFAULT;

        return 0;
}

static int compat_calc_entry(const struct ipt_entry *e,
                             const struct xt_table_info *info,
                             const void *base, struct xt_table_info *newinfo)
{
        const struct xt_entry_match *ematch;
        const struct xt_entry_target *t;
        unsigned int entry_offset;
        int off, i, ret;

        off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
        entry_offset = (void *)e - base;
        xt_ematch_foreach(ematch, e)
                off += xt_compat_match_offset(ematch->u.kernel.match);
        t = ipt_get_target_c(e);
        off += xt_compat_target_offset(t->u.kernel.target);
        newinfo->size -= off;
        ret = xt_compat_add_offset(AF_INET, entry_offset, off);
        if (ret)
                return ret;

        for (i = 0; i < NF_INET_NUMHOOKS; i++) {
                if (info->hook_entry[i] &&
                    (e < (struct ipt_entry *)(base + info->hook_entry[i])))
                        newinfo->hook_entry[i] -= off;
                if (info->underflow[i] &&
                    (e < (struct ipt_entry *)(base + info->underflow[i])))
                        newinfo->underflow[i] -= off;
        }
        return 0;
}

/*
 * Build in @newinfo a copy of @info's header fields (everything before
 * the entries member) with size and hook offsets adjusted for the compat
 * layout, by running compat_calc_entry() over every entry.
 *
 * Returns 0 on success, -EINVAL if either argument is NULL, or the error
 * from xt_compat_init_offsets() / compat_calc_entry().
 */
static int compat_table_info(const struct xt_table_info *info,
                             struct xt_table_info *newinfo)
{
        struct ipt_entry *rule;
        const void *entries;
        int err;

        if (info == NULL || newinfo == NULL)
                return -EINVAL;

        /* we dont care about newinfo->entries */
        memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
        newinfo->initial_entries = 0;
        entries = info->entries;

        err = xt_compat_init_offsets(AF_INET, info->number);
        if (err)
                return err;

        xt_entry_foreach(rule, entries, info->size) {
                err = compat_calc_entry(rule, info, entries, newinfo);
                if (err)
                        break;
        }
        return err;
}
#endif

/*
 * Report size, entry count, valid hooks and hook offsets of a table.
 *
 * @net:  network namespace to look the table up in
 * @user: userspace buffer; holds the table name on input and receives a
 *        struct ipt_getinfo on output
 * @len:  length supplied by the caller; must equal
 *        sizeof(struct ipt_getinfo)
 *
 * For compat syscalls the reported size and offsets are first translated
 * by compat_table_info().  If that translation fails its error is
 * returned and nothing is copied to userspace, so a caller never sees
 * untranslated values reported as success.
 *
 * Returns 0 on success, -EINVAL on a length mismatch, -EFAULT when a
 * user copy fails, the table lookup error, or the compat translation
 * error.
 */
static int get_info(struct net *net, void __user *user, const int *len)
{
        char name[XT_TABLE_MAXNAMELEN];
        struct xt_table *t;
        int ret;

        if (*len != sizeof(struct ipt_getinfo))
                return -EINVAL;

        if (copy_from_user(name, user, sizeof(name)) != 0)
                return -EFAULT;

        /* The name comes from userspace: force termination. */
        name[XT_TABLE_MAXNAMELEN-1] = '\0';
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
        if (in_compat_syscall())
                xt_compat_lock(AF_INET);
#endif
        t = xt_request_find_table_lock(net, AF_INET, name);
        if (!IS_ERR(t)) {
                struct ipt_getinfo info;
                const struct xt_table_info *private = t->private;
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
                struct xt_table_info tmp;
#endif

                ret = 0;
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
                if (in_compat_syscall()) {
                        ret = compat_table_info(private, &tmp);
                        /* Offsets are only needed during the translation. */
                        xt_compat_flush_offsets(AF_INET);
                        private = &tmp;
                }
#endif
                /* Skip the report if the compat translation failed. */
                if (ret == 0) {
                        memset(&info, 0, sizeof(info));
                        info.valid_hooks = t->valid_hooks;
                        memcpy(info.hook_entry, private->hook_entry,
                               sizeof(info.hook_entry));
                        memcpy(info.underflow, private->underflow,
                               sizeof(info.underflow));
                        info.num_entries = private->number;
                        info.size = private->size;
                        strcpy(info.name, name);

                        if (copy_to_user(user, &info, *len) != 0)
                                ret = -EFAULT;
                }

                xt_table_unlock(t);
                module_put(t->me);
        } else
                ret = PTR_ERR(t);
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
        if (in_compat_syscall())
                xt_compat_unlock(AF_INET);
#endif
        return ret;
}

/*
 * Copy the rules of the table named in the request at @uptr back to
 * userspace.
 *
 * *@len must be exactly the request header plus the size the caller
 * claims.  If that size differs from the table's current size, -EAGAIN is
 * returned.  Otherwise the result of copy_entries_to_user() is returned.
 */
static int
get_entries(struct net *net, struct ipt_get_entries __user *uptr,
            const int *len)
{
        struct ipt_get_entries get;
        const struct xt_table_info *info;
        struct xt_table *t;
        int ret;

        if (*len < sizeof(get))
                return -EINVAL;
        if (copy_from_user(&get, uptr, sizeof(get)) != 0)
                return -EFAULT;
        if (*len != sizeof(struct ipt_get_entries) + get.size)
                return -EINVAL;
        get.name[sizeof(get.name) - 1] = '\0';

        t = xt_find_table_lock(net, AF_INET, get.name);
        if (IS_ERR(t))
                return PTR_ERR(t);

        info = t->private;
        if (get.size != info->size)
                ret = -EAGAIN;
        else
                ret = copy_entries_to_user(info->size, t, uptr->entrytable);

        module_put(t->me);
        xt_table_unlock(t);

        return ret;
}

/*
 * Swap @newinfo in as the rule set of table @name.
 *
 * After the swap the old rules' counters are summed and copied to
 * @counters_ptr, the old entries are cleaned up and the old table info is
 * freed.  A failure to copy the counters to userspace is only logged,
 * because the new table is already in place at that point.
 *
 * Returns 0 on success; -ENOMEM, the table lookup error, -EINVAL on a
 * valid_hooks mismatch, or the error set by xt_replace_table().
 */
static int
__do_replace(struct net *net, const char *name, unsigned int valid_hooks,
             struct xt_table_info *newinfo, unsigned int num_counters,
             void __user *counters_ptr)
{
        struct xt_table_info *oldinfo;
        struct xt_counters *counters;
        struct ipt_entry *rule;
        struct xt_table *t;
        bool had_extra_rules, only_initial_now;
        int ret;

        counters = xt_counters_alloc(num_counters);
        if (!counters)
                return -ENOMEM;

        t = xt_request_find_table_lock(net, AF_INET, name);
        if (IS_ERR(t)) {
                ret = PTR_ERR(t);
                goto free_counters;
        }

        /* You lied! */
        if (valid_hooks != t->valid_hooks) {
                ret = -EINVAL;
                goto release_table;
        }

        oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
        if (!oldinfo)
                goto release_table;

        /* Update module usage count based on number of rules */
        had_extra_rules = oldinfo->number > oldinfo->initial_entries;
        only_initial_now = newinfo->number <= oldinfo->initial_entries;
        if (had_extra_rules || only_initial_now)
                module_put(t->me);
        if (had_extra_rules && only_initial_now)
                module_put(t->me);

        xt_table_unlock(t);

        get_old_counters(oldinfo, counters);

        /* Decrease module usage counts and free resource */
        xt_entry_foreach(rule, oldinfo->entries, oldinfo->size) {
                cleanup_entry(rule, net);
        }

        xt_free_table_info(oldinfo);
        if (copy_to_user(counters_ptr, counters,
                         sizeof(struct xt_counters) * num_counters)) {
                /* Silent error, can't fail, new table is already in place */
                net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n");
        }
        vfree(counters);
        return 0;

 release_table:
        module_put(t->me);
        xt_table_unlock(t);
 free_counters:
        vfree(counters);
        return ret;
}

/*
 * Handle a table replacement request.
 *
 * Reads the struct ipt_replace header from @arg, validates the counter
 * count and that @len covers header plus rule blob, copies the blob into
 * a freshly allocated table info, translates it and hands it to
 * __do_replace().  The new table info is freed on every failure path;
 * its entries are also cleaned up if translation had succeeded.
 *
 * Returns 0 on success or a negative errno.
 */
static int
do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
        struct ipt_replace repl;
        struct xt_table_info *newinfo;
        struct ipt_entry *rule;
        void *entries;
        int ret;

        if (len < sizeof(repl))
                return -EINVAL;
        if (copy_from_sockptr(&repl, arg, sizeof(repl)))
                return -EFAULT;

        /* overflow check */
        if (repl.num_counters >= INT_MAX / sizeof(struct xt_counters))
                return -ENOMEM;
        if (!repl.num_counters)
                return -EINVAL;
        if ((u64)len < (u64)repl.size + sizeof(repl))
                return -EINVAL;

        repl.name[sizeof(repl.name)-1] = 0;

        newinfo = xt_alloc_table_info(repl.size);
        if (newinfo == NULL)
                return -ENOMEM;

        entries = newinfo->entries;
        if (copy_from_sockptr_offset(entries, arg, sizeof(repl),
                                     repl.size)) {
                ret = -EFAULT;
                goto free_info;
        }

        ret = translate_table(net, newinfo, entries, &repl);
        if (ret)
                goto free_info;

        ret = __do_replace(net, repl.name, repl.valid_hooks, newinfo,
                           repl.num_counters, repl.counters);
        if (!ret)
                return 0;

        /* Translation succeeded, so the entries hold references to undo. */
        xt_entry_foreach(rule, entries, newinfo->size) {
                cleanup_entry(rule, net);
        }
 free_info:
        xt_free_table_info(newinfo);
        return ret;
}

/*
 * Handler for IPT_SO_SET_ADD_COUNTERS: add the caller-supplied byte and
 * packet counts to the counters of each entry in the named table.
 *
 * Returns 0 on success, -EINVAL when the supplied number of counters does
 * not match the table's entry count, or the error reported by
 * xt_copy_counters() / xt_find_table_lock().
 */
static int
do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
{
        struct xt_counters_info tmp;
        const struct xt_table_info *private;
        struct xt_counters *paddc;
        struct ipt_entry *iter;
        struct xt_table *t;
        unsigned int addend;
        unsigned int idx = 0;
        int ret = 0;

        paddc = xt_copy_counters(arg, len, &tmp);
        if (IS_ERR(paddc))
                return PTR_ERR(paddc);

        t = xt_find_table_lock(net, AF_INET, tmp.name);
        if (IS_ERR(t)) {
                ret = PTR_ERR(t);
                goto free;
        }

        local_bh_disable();
        private = t->private;
        if (private->number != tmp.num_counters) {
                ret = -EINVAL;
                goto unlock_up_free;
        }

        /* paddc[] is walked in step with the table's entries */
        addend = xt_write_recseq_begin();
        xt_entry_foreach(iter, private->entries, private->size) {
                struct xt_counters *cnt;

                cnt = xt_get_this_cpu_counter(&iter->counters);
                ADD_COUNTER(*cnt, paddc[idx].bcnt, paddc[idx].pcnt);
                idx++;
        }
        xt_write_recseq_end(addend);
 unlock_up_free:
        local_bh_enable();
        xt_table_unlock(t);
        module_put(t->me);
 free:
        vfree(paddc);

        return ret;
}

#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
/*
 * Header layout read by compat_do_replace() for IPT_SO_SET_REPLACE when the
 * caller is a compat task: all scalars are u32 and the counters pointer is
 * carried as a compat_uptr_t.  The compat-format entries follow the header.
 */
struct compat_ipt_replace {
        char                        name[XT_TABLE_MAXNAMELEN];
        u32                        valid_hooks;
        u32                        num_entries;
        u32                        size;
        u32                        hook_entry[NF_INET_NUMHOOKS];
        u32                        underflow[NF_INET_NUMHOOKS];
        u32                        num_counters;
        compat_uptr_t                counters;        /* struct xt_counters * */
        struct compat_ipt_entry        entries[];
};

/*
 * Write one table entry to userspace in compat layout.
 *
 * @e:        entry to convert
 * @dstptr:   in/out user-space write cursor; advanced past what was written
 * @size:     in/out size, reduced by the native/compat size difference
 * @counters: counter snapshot array
 * @i:        index of this entry's counters in @counters
 *
 * Returns 0 on success, -EFAULT when a user-space write fails, or the
 * error from the match/target conversion helpers.
 */
static int
compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
                          unsigned int *size, struct xt_counters *counters,
                          unsigned int i)
{
        struct compat_ipt_entry __user *ce = *dstptr;
        const compat_uint_t origsize = *size;
        const struct xt_entry_match *ematch;
        struct xt_entry_target *t;
        u_int16_t target_offset, next_offset;
        int ret = 0;

        if (copy_to_user(ce, e, sizeof(struct ipt_entry)))
                return -EFAULT;
        if (copy_to_user(&ce->counters, &counters[i], sizeof(counters[i])))
                return -EFAULT;

        *dstptr += sizeof(struct compat_ipt_entry);
        *size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);

        xt_ematch_foreach(ematch, e) {
                ret = xt_compat_match_to_user(ematch, dstptr, size);
                if (ret)
                        return ret;
        }
        /* (origsize - *size) is the shrinkage accumulated so far */
        target_offset = e->target_offset - (origsize - *size);

        t = ipt_get_target(e);
        ret = xt_compat_target_to_user(t, dstptr, size);
        if (ret)
                return ret;
        next_offset = e->next_offset - (origsize - *size);

        /* patch the adjusted offsets into the already-copied header */
        if (put_user(target_offset, &ce->target_offset))
                return -EFAULT;
        if (put_user(next_offset, &ce->next_offset))
                return -EFAULT;
        return 0;
}

/*
 * Look up the match extension named in @m, store it in m->u.kernel.match
 * and add its compat size offset to *@size.
 *
 * Returns 0 on success or the error encoded by xt_request_find_match().
 * @ip is not used here.
 */
static int
compat_find_calc_match(struct xt_entry_match *m,
                       const struct ipt_ip *ip,
                       int *size)
{
        struct xt_match *found;

        found = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
                                      m->u.user.revision);
        if (IS_ERR(found))
                return PTR_ERR(found);

        *size += xt_compat_match_offset(found);
        m->u.kernel.match = found;
        return 0;
}

static void compat_release_entry(struct compat_ipt_entry *e)
{
        struct xt_entry_target *t;
        struct xt_entry_match *ematch;

        /* Cleanup all matches */
        xt_ematch_foreach(ematch, e)
                module_put(ematch->u.kernel.match->me);
        t = compat_ipt_get_target(e);
        module_put(t->u.kernel.target->me);
}

/*
 * Validate one compat entry and work out how much larger it becomes in
 * native layout.
 *
 * @e:       compat entry to check
 * @newinfo: not referenced in this function
 * @size:    running total; increased by this entry's size delta
 * @base:    start of the compat entry blob (used to compute the entry offset)
 * @limit:   one past the end of the compat entry blob
 *
 * On success the matches and target of @e have their u.kernel pointers
 * filled in and the (entry offset, delta) pair has been passed to
 * xt_compat_add_offset().  On failure, module_put() is called for whatever
 * was looked up so far.
 *
 * Returns 0 on success or a negative errno.
 */
static int
check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
                                  struct xt_table_info *newinfo,
                                  unsigned int *size,
                                  const unsigned char *base,
                                  const unsigned char *limit)
{
        struct xt_entry_match *ematch;
        struct xt_entry_target *t;
        struct xt_target *target;
        unsigned int entry_offset;
        unsigned int j;
        int ret, off;

        /* entry must be aligned and lie entirely inside [base, limit) */
        if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 ||
            (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit ||
            (unsigned char *)e + e->next_offset > limit)
                return -EINVAL;

        /* an entry holds at least its header plus a target header */
        if (e->next_offset < sizeof(struct compat_ipt_entry) +
                             sizeof(struct compat_xt_entry_target))
                return -EINVAL;

        if (!ip_checkentry(&e->ip))
                return -EINVAL;

        ret = xt_compat_check_entry_offsets(e, e->elems,
                                            e->target_offset, e->next_offset);
        if (ret)
                return ret;

        /* start with the header size difference, then add match/target deltas */
        off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
        entry_offset = (void *)e - (void *)base;
        /* j counts matches successfully looked up, for the unwind below */
        j = 0;
        xt_ematch_foreach(ematch, e) {
                ret = compat_find_calc_match(ematch, &e->ip, &off);
                if (ret != 0)
                        goto release_matches;
                ++j;
        }

        t = compat_ipt_get_target(e);
        target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
                                        t->u.user.revision);
        if (IS_ERR(target)) {
                ret = PTR_ERR(target);
                goto release_matches;
        }
        t->u.kernel.target = target;

        off += xt_compat_target_offset(target);
        *size += off;
        ret = xt_compat_add_offset(AF_INET, entry_offset, off);
        if (ret)
                goto out;

        return 0;

out:
        module_put(t->u.kernel.target->me);
release_matches:
        /* only the first j matches had their lookup succeed */
        xt_ematch_foreach(ematch, e) {
                if (j-- == 0)
                        break;
                module_put(ematch->u.kernel.match->me);
        }
        return ret;
}

static void
compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
                            unsigned int *size,
                            struct xt_table_info *newinfo, unsigned char *base)
{
        struct xt_entry_target *t;
        struct ipt_entry *de;
        unsigned int origsize;
        int h;
        struct xt_entry_match *ematch;

        origsize = *size;
        de = *dstptr;
        memcpy(de, e, sizeof(struct ipt_entry));
        memcpy(&de->counters, &e->counters, sizeof(e->counters));

        *dstptr += sizeof(struct ipt_entry);
        *size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);

        xt_ematch_foreach(ematch, e)
                xt_compat_match_from_user(ematch, dstptr, size);

        de->target_offset = e->target_offset - (origsize - *size);
        t = compat_ipt_get_target(e);
        xt_compat_target_from_user(t, dstptr, size);

        de->next_offset = e->next_offset - (origsize - *size);

        for (h = 0; h < NF_INET_NUMHOOKS; h++) {
                if ((unsigned char *)de - base < newinfo->hook_entry[h])
                        newinfo->hook_entry[h] -= origsize - *size;
                if ((unsigned char *)de - base < newinfo->underflow[h])
                        newinfo->underflow[h] -= origsize - *size;
        }
}

/*
 * Convert a compat-layout ruleset into a native-layout one.
 *
 * @net:     network namespace, passed on to translate_table()
 * @pinfo:   in: table info holding the compat blob; out: new native info
 * @pentry0: in: start of the compat blob; out: start of the native blob
 * @compatr: compat replace header describing the blob
 *
 * The first pass runs check_compat_entry_size_and_hooks() on every compat
 * entry, accumulating the native size in 'size'.  The second pass copies
 * each entry into a newly allocated xt_table_info.  The result is then run
 * through translate_table().
 *
 * On success the old *pinfo is freed and *pinfo / *pentry0 point at the new
 * table.  On failure *pinfo and *pentry0 are left untouched (the old info
 * is not freed here).
 *
 * Returns 0 on success or a negative errno.
 */
static int
translate_compat_table(struct net *net,
                       struct xt_table_info **pinfo,
                       void **pentry0,
                       const struct compat_ipt_replace *compatr)
{
        unsigned int i, j;
        struct xt_table_info *newinfo, *info;
        void *pos, *entry0, *entry1;
        struct compat_ipt_entry *iter0;
        struct ipt_replace repl;
        unsigned int size;
        int ret;

        info = *pinfo;
        entry0 = *pentry0;
        size = compatr->size;
        info->number = compatr->num_entries;

        /* j counts entries that passed the check, for the unwind at out_unlock */
        j = 0;
        xt_compat_lock(AF_INET);
        ret = xt_compat_init_offsets(AF_INET, compatr->num_entries);
        if (ret)
                goto out_unlock;
        /* Walk through entries, checking offsets. */
        xt_entry_foreach(iter0, entry0, compatr->size) {
                ret = check_compat_entry_size_and_hooks(iter0, info, &size,
                                                        entry0,
                                                        entry0 + compatr->size);
                if (ret != 0)
                        goto out_unlock;
                ++j;
        }

        ret = -EINVAL;
        if (j != compatr->num_entries)
                goto out_unlock;

        /* 'size' now holds the native size accumulated by the first pass */
        ret = -ENOMEM;
        newinfo = xt_alloc_table_info(size);
        if (!newinfo)
                goto out_unlock;

        memset(newinfo->entries, 0, size);

        newinfo->number = compatr->num_entries;
        for (i = 0; i < NF_INET_NUMHOOKS; i++) {
                newinfo->hook_entry[i] = compatr->hook_entry[i];
                newinfo->underflow[i] = compatr->underflow[i];
        }
        entry1 = newinfo->entries;
        pos = entry1;
        /* restart from the compat size; the copy pass grows it again */
        size = compatr->size;
        xt_entry_foreach(iter0, entry0, compatr->size)
                compat_copy_entry_from_user(iter0, &pos, &size,
                                            newinfo, entry1);

        /* all module references in entry0 are now gone.
         * entry1/newinfo contains a 64bit ruleset that looks exactly as
         * generated by 64bit userspace.
         *
         * Call standard translate_table() to validate all hook_entrys,
         * underflows, check for loops, etc.
         */
        xt_compat_flush_offsets(AF_INET);
        xt_compat_unlock(AF_INET);

        /* seed the native header from the compat one; fields below are then overridden */
        memcpy(&repl, compatr, sizeof(*compatr));

        for (i = 0; i < NF_INET_NUMHOOKS; i++) {
                repl.hook_entry[i] = newinfo->hook_entry[i];
                repl.underflow[i] = newinfo->underflow[i];
        }

        repl.num_counters = 0;
        repl.counters = NULL;
        repl.size = newinfo->size;
        ret = translate_table(net, newinfo, entry1, &repl);
        if (ret)
                goto free_newinfo;

        *pinfo = newinfo;
        *pentry0 = entry1;
        xt_free_table_info(info);
        return 0;

free_newinfo:
        xt_free_table_info(newinfo);
        return ret;
out_unlock:
        xt_compat_flush_offsets(AF_INET);
        xt_compat_unlock(AF_INET);
        /* release only the first j entries, i.e. those that passed the check */
        xt_entry_foreach(iter0, entry0, compatr->size) {
                if (j-- == 0)
                        break;
                compat_release_entry(iter0);
        }
        return ret;
}

/*
 * Compat-layout handler for IPT_SO_SET_REPLACE.
 *
 * Copies a struct compat_ipt_replace header followed by the compat rule
 * blob from @arg, converts it with translate_compat_table() and passes the
 * result to __do_replace().
 *
 * Returns 0 on success or a negative errno.
 */
static int
compat_do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
        struct compat_ipt_replace repl;
        struct xt_table_info *newinfo;
        struct ipt_entry *e;
        void *entries;
        int err;

        if (len < sizeof(repl))
                return -EINVAL;
        if (copy_from_sockptr(&repl, arg, sizeof(repl)))
                return -EFAULT;

        /* overflow check: the counter array size must stay below INT_MAX */
        if (repl.num_counters >= INT_MAX / sizeof(struct xt_counters))
                return -ENOMEM;
        if (!repl.num_counters)
                return -EINVAL;
        /* the caller's buffer must cover the header plus repl.size bytes */
        if ((u64)len < (u64)repl.size + sizeof(repl))
                return -EINVAL;

        /* force NUL termination of the caller-supplied table name */
        repl.name[sizeof(repl.name) - 1] = 0;

        newinfo = xt_alloc_table_info(repl.size);
        if (!newinfo)
                return -ENOMEM;

        entries = newinfo->entries;
        if (copy_from_sockptr_offset(entries, arg, sizeof(repl), repl.size)) {
                err = -EFAULT;
                goto free_newinfo;
        }

        /* on success this swaps newinfo/entries for the native-layout table */
        err = translate_compat_table(net, &newinfo, &entries, &repl);
        if (err)
                goto free_newinfo;

        err = __do_replace(net, repl.name, repl.valid_hooks, newinfo,
                           repl.num_counters, compat_ptr(repl.counters));
        if (!err)
                return 0;

        /* translation succeeded but the swap failed: clean up every entry */
        xt_entry_foreach(e, entries, newinfo->size) {
                cleanup_entry(e, net);
        }
 free_newinfo:
        xt_free_table_info(newinfo);
        return err;
}

/*
 * Request/reply layout used by compat_get_entries(): the caller fills in
 * name and size; the compat-format entries are written to entrytable[].
 */
struct compat_ipt_get_entries {
        char name[XT_TABLE_MAXNAMELEN];
        compat_uint_t size;
        struct compat_ipt_entry entrytable[];
};

/*
 * Copy every entry of @table to @userptr in compat layout, together with a
 * counter snapshot obtained from alloc_counters().
 *
 * Returns 0 on success, the error from alloc_counters(), or the first
 * error reported by compat_copy_entry_to_user().
 */
static int
compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
                            void __user *userptr)
{
        const struct xt_table_info *private = table->private;
        struct xt_counters *counters;
        struct ipt_entry *e;
        void __user *pos = userptr;
        unsigned int sz = total_size;
        unsigned int idx = 0;
        int ret = 0;

        counters = alloc_counters(table);
        if (IS_ERR(counters))
                return PTR_ERR(counters);

        xt_entry_foreach(e, private->entries, total_size) {
                ret = compat_copy_entry_to_user(e, &pos, &sz, counters, idx);
                idx++;
                if (ret)
                        break;
        }

        vfree(counters);
        return ret;
}

/*
 * Compat-layout handler for IPT_SO_GET_ENTRIES.
 *
 * Returns 0 on success; -EINVAL when *@len does not match the request;
 * -EFAULT when the request cannot be read; -EAGAIN when the size given by
 * the caller differs from the size computed by compat_table_info(); or the
 * error from the table lookup / copy helpers.
 */
static int
compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
                   int *len)
{
        struct compat_ipt_get_entries get;
        const struct xt_table_info *private;
        struct xt_table_info info;
        struct xt_table *t;
        int ret;

        if (*len < sizeof(get))
                return -EINVAL;
        if (copy_from_user(&get, uptr, sizeof(get)))
                return -EFAULT;
        if (*len != sizeof(struct compat_ipt_get_entries) + get.size)
                return -EINVAL;

        /* force NUL termination of the caller-supplied table name */
        get.name[sizeof(get.name) - 1] = '\0';

        xt_compat_lock(AF_INET);
        t = xt_find_table_lock(net, AF_INET, get.name);
        if (IS_ERR(t)) {
                ret = PTR_ERR(t);
                goto unlock_compat;
        }

        private = t->private;
        ret = compat_table_info(private, &info);
        if (!ret) {
                if (get.size == info.size)
                        ret = compat_copy_entries_to_user(private->size, t,
                                                          uptr->entrytable);
                else
                        ret = -EAGAIN;
        }

        xt_compat_flush_offsets(AF_INET);
        module_put(t->me);
        xt_table_unlock(t);
unlock_compat:
        xt_compat_unlock(AF_INET);
        return ret;
}
#endif

/*
 * setsockopt() dispatcher for the ip_tables options.
 *
 * Requires CAP_NET_ADMIN in the user namespace of the socket's netns
 * (-EPERM otherwise).  Unknown commands yield -EINVAL.
 */
static int
do_ipt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len)
{
        struct net *net;

        if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        net = sock_net(sk);
        switch (cmd) {
        case IPT_SO_SET_REPLACE:
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
                if (in_compat_syscall())
                        return compat_do_replace(net, arg, len);
#endif
                return do_replace(net, arg, len);

        case IPT_SO_SET_ADD_COUNTERS:
                return do_add_counters(net, arg, len);

        default:
                return -EINVAL;
        }
}

/*
 * getsockopt() dispatcher for the ip_tables options.
 *
 * Requires CAP_NET_ADMIN in the user namespace of the socket's netns
 * (-EPERM otherwise).  Unknown commands yield -EINVAL.
 */
static int
do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
        struct net *net;
        int ret;

        if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        net = sock_net(sk);
        switch (cmd) {
        case IPT_SO_GET_INFO:
                return get_info(net, user, len);

        case IPT_SO_GET_ENTRIES:
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
                if (in_compat_syscall())
                        return compat_get_entries(net, user, len);
#endif
                return get_entries(net, user, len);

        case IPT_SO_GET_REVISION_MATCH:
        case IPT_SO_GET_REVISION_TARGET: {
                struct xt_get_revision rev;
                int target;

                if (*len != sizeof(rev))
                        return -EINVAL;
                if (copy_from_user(&rev, user, sizeof(rev)))
                        return -EFAULT;
                /* force NUL termination of the caller-supplied name */
                rev.name[sizeof(rev.name) - 1] = 0;

                /* 1 selects a target lookup, 0 a match lookup */
                target = (cmd == IPT_SO_GET_REVISION_TARGET);

                /* xt_find_revision() reports its result through &ret */
                try_then_request_module(xt_find_revision(AF_INET, rev.name,
                                                         rev.revision,
                                                         target, &ret),
                                        "ipt_%s", rev.name);
                return ret;
        }

        default:
                return -EINVAL;
        }
}

static void __ipt_unregister_table(struct net *net, struct xt_table *table)
{
        struct xt_table_info *private;
        void *loc_cpu_entry;
        struct module *table_owner = table->me;
        struct ipt_entry *iter;

        private = xt_unregister_table(table);

        /* Decrease module usage counts and free resources */
        loc_cpu_entry = private->entries;
        xt_entry_foreach(iter, loc_cpu_entry, private->size)
                cleanup_entry(iter, net);
        if (private->number > private->initial_entries)
                module_put(table_owner);
        xt_free_table_info(private);
}

int ipt_register_table(struct net *net, const struct xt_table *table,
                       const struct ipt_replace *repl,
                       const struct nf_hook_ops *template_ops)
{
        struct nf_hook_ops *ops;
        unsigned int num_ops;
        int ret, i;
        struct xt_table_info *newinfo;
        struct xt_table_info bootstrap = {0};
        void *loc_cpu_entry;
        struct xt_table *new_table;

        newinfo = xt_alloc_table_info(repl->size);
        if (!newinfo)
                return -ENOMEM;

        loc_cpu_entry = newinfo->entries;
        memcpy(loc_cpu_entry, repl->entries, repl->size);

        ret = translate_table(net, newinfo, loc_cpu_entry, repl);
        if (ret != 0) {
                xt_free_table_info(newinfo);
                return ret;
        }

        new_table = xt_register_table(net, table, &bootstrap, newinfo);
        if (IS_ERR(new_table)) {
                struct ipt_entry *iter;

                xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
                        cleanup_entry(iter, net);
                xt_free_table_info(newinfo);
                return PTR_ERR(new_table);
        }

        /* No template? No need to do anything. This is used by 'nat' table, it registers
         * with the nat core instead of the netfilter core.
         */
        if (!template_ops)
                return 0;

        num_ops = hweight32(table->valid_hooks);
        if (num_ops == 0) {
                ret = -EINVAL;
                goto out_free;
        }

        ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL);
        if (!ops) {
                ret = -ENOMEM;
                goto out_free;
        }

        for (i = 0; i < num_ops; i++)
                ops[i].priv = new_table;

        new_table->ops = ops;

        ret = nf_register_net_hooks(net, ops, num_ops);
        if (ret != 0)
                goto out_free;

        return ret;

out_free:
        __ipt_unregister_table(net, new_table);
        return ret;
}

/*
 * Unregister the netfilter hooks of the IPv4 table called @name in @net.
 * Does nothing when no such table is found.
 */
void ipt_unregister_table_pre_exit(struct net *net, const char *name)
{
        struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name);

        if (!table)
                return;

        nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
}

/*
 * Unregister and free the IPv4 table called @name in @net.
 * Does nothing when no such table is found.
 */
void ipt_unregister_table_exit(struct net *net, const char *name)
{
        struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name);

        if (!table)
                return;

        __ipt_unregister_table(net, table);
}

/*
 * Built-in IPv4 targets registered by ip_tables_init(): the standard
 * target (with compat conversion hooks when compat support is built in)
 * and the error target, handled by ipt_error().
 */
static struct xt_target ipt_builtin_tg[] __read_mostly = {
        {
                .name             = XT_STANDARD_TARGET,
                .targetsize       = sizeof(int),
                .family           = NFPROTO_IPV4,
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
                .compatsize       = sizeof(compat_int_t),
                .compat_from_user = compat_standard_from_user,
                .compat_to_user   = compat_standard_to_user,
#endif
        },
        {
                .name             = XT_ERROR_TARGET,
                .target           = ipt_error,
                .targetsize       = XT_FUNCTION_MAXNAMELEN,
                .family           = NFPROTO_IPV4,
        },
};

/*
 * Socket option registration for PF_INET: set options are routed to
 * do_ipt_set_ctl() and get options to do_ipt_get_ctl(), for option numbers
 * starting at IPT_BASE_CTL.
 */
static struct nf_sockopt_ops ipt_sockopts = {
        .pf                = PF_INET,
        .set_optmin        = IPT_BASE_CTL,
        .set_optmax        = IPT_SO_SET_MAX+1,
        .set                = do_ipt_set_ctl,
        .get_optmin        = IPT_BASE_CTL,
        .get_optmax        = IPT_SO_GET_MAX+1,
        .get                = do_ipt_get_ctl,
        .owner                = THIS_MODULE,
};

/* Per-netns init: run xt_proto_init() for IPv4 and pass on its result. */
static int __net_init ip_tables_net_init(struct net *net)
{
        int err = xt_proto_init(net, NFPROTO_IPV4);

        return err;
}

/* Per-netns exit: counterpart of ip_tables_net_init(). */
static void __net_exit ip_tables_net_exit(struct net *net)
{
        xt_proto_fini(net, NFPROTO_IPV4);
}

/* Per-netns callbacks registered by ip_tables_init(). */
static struct pernet_operations ip_tables_net_ops = {
        .init = ip_tables_net_init,
        .exit = ip_tables_net_exit,
};

/*
 * Module init: register the per-netns operations, the built-in targets and
 * the socket option handlers, in that order.  A failure undoes the steps
 * already completed, in reverse order, and returns the error.
 */
static int __init ip_tables_init(void)
{
        int err;

        err = register_pernet_subsys(&ip_tables_net_ops);
        if (err < 0)
                return err;

        /* No one else will be downing sem now, so we won't sleep */
        err = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
        if (err < 0)
                goto unreg_pernet;

        /* Register setsockopt */
        err = nf_register_sockopt(&ipt_sockopts);
        if (err < 0)
                goto unreg_targets;

        return 0;

unreg_targets:
        xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
unreg_pernet:
        unregister_pernet_subsys(&ip_tables_net_ops);
        return err;
}

/*
 * Module exit: undo ip_tables_init() in reverse order - socket options,
 * then built-in targets, then the per-netns operations.
 */
static void __exit ip_tables_fini(void)
{
        nf_unregister_sockopt(&ipt_sockopts);

        xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
        unregister_pernet_subsys(&ip_tables_net_ops);
}

/* Symbols exported to other modules, followed by the module init/exit hooks. */
EXPORT_SYMBOL(ipt_register_table);
EXPORT_SYMBOL(ipt_unregister_table_pre_exit);
EXPORT_SYMBOL(ipt_unregister_table_exit);
EXPORT_SYMBOL(ipt_do_table);
module_init(ip_tables_init);
module_exit(ip_tables_fini);




























































































































































































































































































































































    3 




    3 


































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
// SPDX-License-Identifier: GPL-2.0
/*
 * shstk.c - Intel shadow stack support
 *
 * Copyright (c) 2021, Intel Corporation.
 * Yu-cheng Yu <yu-cheng.yu@intel.com>
 */

#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/sched/signal.h>
#include <linux/compat.h>
#include <linux/sizes.h>
#include <linux/user.h>
#include <linux/syscalls.h>
#include <asm/msr.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/types.h>
#include <asm/shstk.h>
#include <asm/special_insns.h>
#include <asm/fpu/api.h>
#include <asm/prctl.h>

/* Distance in bytes below the SSP at which create_rstor_token() writes a token. */
#define SS_FRAME_SIZE 8

/* Return true when any bit of @features is set in current's thread features. */
static bool features_enabled(unsigned long features)
{
        return (current->thread.features & features) != 0;
}

/* Set the bits of @features in current's thread features. */
static void features_set(unsigned long features)
{
        current->thread.features = current->thread.features | features;
}

/* Clear the bits of @features in current's thread features. */
static void features_clr(unsigned long features)
{
        current->thread.features = current->thread.features & ~features;
}

/*
 * Create a restore token on the shadow stack.  A token is always 8-byte
 * and aligned to 8.
 */
static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
{
        unsigned long addr;

        /* Token must be aligned */
        if (!IS_ALIGNED(ssp, 8))
                return -EINVAL;

        addr = ssp - SS_FRAME_SIZE;

        /*
         * SSP is aligned, so reserved bits and mode bit are a zero, just mark
         * the token 64-bit.
         */
        ssp |= BIT(0);

        if (write_user_shstk_64((u64 __user *)addr, (u64)ssp))
                return -EFAULT;

        if (token_addr)
                *token_addr = addr;

        return 0;
}

/*
 * VM_SHADOW_STACK will have a guard page. This helps userspace protect
 * itself from attacks. The reasoning is as follows:
 *
 * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The
 * INCSSP instruction can increment the shadow stack pointer. It is the
 * shadow stack analog of an instruction like:
 *
 *   addq $0x80, %rsp
 *
 * However, there is one important difference between an ADD on %rsp
 * and INCSSP. In addition to modifying SSP, INCSSP also reads from the
 * memory of the first and last elements that were "popped". It can be
 * thought of as acting like this:
 *
 * READ_ONCE(ssp);       // read+discard top element on stack
 * ssp += nr_to_pop * 8; // move the shadow stack
 * READ_ONCE(ssp-8);     // read+discard last popped stack element
 *
 * The maximum distance INCSSP can move the SSP is 2040 bytes, before
 * it would read the memory. Therefore a single page gap will be enough
 * to prevent any operation from shifting the SSP to an adjacent stack,
 * since it would have to land in the gap at least once, causing a
 * fault.
 */
/*
 * Map a shadow stack of @size bytes and optionally place a restore token
 * in it.
 *
 * @addr:         requested address, or 0 to let do_mmap() choose; a
 *                non-zero value adds MAP_FIXED_NOREPLACE
 * @size:         size of the mapping
 * @token_offset: offset from the mapping start passed to
 *                create_rstor_token() as the SSP
 * @set_res_tok:  whether to write a restore token
 *
 * Returns the mapped address, the error value from do_mmap(), or -EINVAL
 * when the token could not be written (the mapping is removed first).
 */
static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
                                 unsigned long token_offset, bool set_res_tok)
{
        struct mm_struct *mm = current->mm;
        unsigned long mapped_addr, unused;
        int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G;

        if (addr)
                flags |= MAP_FIXED_NOREPLACE;

        mmap_write_lock(mm);
        mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
                              VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
        mmap_write_unlock(mm);

        /* nothing more to do on mapping failure or when no token is wanted */
        if (IS_ERR_VALUE(mapped_addr) || !set_res_tok)
                return mapped_addr;

        if (create_rstor_token(mapped_addr + token_offset, NULL)) {
                vm_munmap(mapped_addr, size);
                return -EINVAL;
        }

        return mapped_addr;
}

/*
 * Return the page-aligned shadow stack size to use.  A zero @size selects
 * the smaller of RLIMIT_STACK and 4 GiB.
 */
static unsigned long adjust_shstk_size(unsigned long size)
{
        if (!size)
                size = min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G);

        return PAGE_ALIGN(size);
}

/*
 * Unmap the shadow stack at [@base, @base + @size).  Any failure other
 * than -EINTR triggers a one-time warning.
 */
static void unmap_shadow_stack(u64 base, u64 size)
{
        int err = vm_munmap(base, size);

        /*
         * mmap_write_lock_killable() failed with -EINTR. This means
         * the process is about to die and have its MM cleaned up.
         * This task shouldn't ever make it back to userspace. In this
         * case it is ok to leak a shadow stack, so just exit out.
         */
        if (err == -EINTR)
                return;

        /*
         * For all other types of vm_munmap() failure, either the
         * system is out of memory or there is a bug.
         */
        WARN_ON_ONCE(err);
}

/*
 * Enable user shadow stack for the current task: allocate a stack, point
 * MSR_IA32_PL3_SSP at its end, set CET_SHSTK_EN in MSR_IA32_U_CET and
 * record the stack in current's thread_shstk.
 *
 * Returns 0 on success or when already enabled, -EOPNOTSUPP when the CPU
 * feature is off or the caller is in an ia32 syscall, or the allocation
 * error.
 */
static int shstk_setup(void)
{
        struct thread_shstk *shstk = &current->thread.shstk;
        unsigned long base, len;

        /* Already enabled */
        if (features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        /* Also not supported for 32 bit */
        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall())
                return -EOPNOTSUPP;

        len = adjust_shstk_size(0);
        base = alloc_shstk(0, len, 0, false);
        if (IS_ERR_VALUE(base))
                return PTR_ERR((void *)base);

        /* the initial SSP is the end of the new mapping */
        fpregs_lock_and_load();
        wrmsrl(MSR_IA32_PL3_SSP, base + len);
        wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
        fpregs_unlock();

        shstk->base = base;
        shstk->size = len;
        features_set(ARCH_SHSTK_SHSTK);

        return 0;
}

/*
 * Zero current's shadow stack bookkeeping and clear both its feature and
 * locked-feature masks.
 */
void reset_thread_features(void)
{
        struct thread_shstk *shstk = &current->thread.shstk;

        memset(shstk, 0, sizeof(*shstk));
        current->thread.features = 0;
        current->thread.features_locked = 0;
}

/*
 * Set up the shadow stack for a task being created by clone.
 *
 * Returns 0 when no new shadow stack is needed (feature disabled,
 * CLONE_VFORK, or !CLONE_VM), the address just past the end of the new
 * mapping when one was allocated, or the error value returned by
 * alloc_shstk() (test it with IS_ERR_VALUE()).
 */
unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
                                       unsigned long stack_size)
{
        struct thread_shstk *child = &tsk->thread.shstk;
        unsigned long base, len;

        /*
         * If shadow stack is not enabled on the new thread, skip any
         * switch to a new shadow stack.
         */
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        /*
         * For CLONE_VFORK the child will share the parent's shadow stack.
         * Clear the internal tracking of the thread shadow stack so the
         * freeing logic run for the child knows to leave it alone.
         */
        if (clone_flags & CLONE_VFORK) {
                child->base = 0;
                child->size = 0;
                return 0;
        }

        /*
         * For !CLONE_VM the child will use a copy of the parent's shadow
         * stack.
         */
        if (!(clone_flags & CLONE_VM))
                return 0;

        len = adjust_shstk_size(stack_size);
        base = alloc_shstk(0, len, 0, false);
        if (!IS_ERR_VALUE(base)) {
                child->base = base;
                child->size = len;
                /* Hand back the top of the new mapping. */
                base += len;
        }

        return base;
}

/*
 * Read MSR_IA32_PL3_SSP and return its value. The MSR is read between
 * fpregs_lock_and_load() and fpregs_unlock().
 */
static unsigned long get_user_shstk_addr(void)
{
        unsigned long long msr_ssp;

        fpregs_lock_and_load();
        rdmsrl(MSR_IA32_PL3_SSP, msr_ssp);
        fpregs_unlock();

        return msr_ssp;
}

#define SHSTK_DATA_BIT BIT(63)

/*
 * Write @data to the user shadow stack slot at @addr with SHSTK_DATA_BIT
 * set, so that the sigframe can't be processed as a return address.
 *
 * Returns 0 on success, -EINVAL (with a one-time warning) if @data already
 * has SHSTK_DATA_BIT set, or -EFAULT if the shadow stack write fails.
 */
static int put_shstk_data(u64 __user *addr, u64 data)
{
        if (WARN_ON_ONCE(data & SHSTK_DATA_BIT))
                return -EINVAL;

        return write_user_shstk_64(addr, data | SHSTK_DATA_BIT) ? -EFAULT : 0;
}

/*
 * Read one word from user address @addr and, if it carries
 * SHSTK_DATA_BIT, store it in *@data with that bit stripped.
 *
 * Returns 0 on success, -EFAULT if the user read fails, or -EINVAL if the
 * word does not have SHSTK_DATA_BIT set (*@data is left untouched).
 */
static int get_shstk_data(unsigned long *data, unsigned long __user *addr)
{
        unsigned long raw;

        if (unlikely(get_user(raw, addr)))
                return -EFAULT;

        if (raw & SHSTK_DATA_BIT) {
                *data = raw & ~SHSTK_DATA_BIT;
                return 0;
        }

        return -EINVAL;
}

/*
 * Push a sigframe word onto the user shadow stack.
 *
 * *@ssp is lowered by SS_FRAME_SIZE and the previous *@ssp value is
 * written at the new location through put_shstk_data().
 *
 * Returns 0 on success, -EINVAL if the incoming *@ssp is not 8-byte
 * aligned (in which case *@ssp is unchanged), or -EFAULT if the write
 * fails (in which case *@ssp has already been lowered).
 */
static int shstk_push_sigframe(unsigned long *ssp)
{
        unsigned long prev_ssp = *ssp;

        /* Token must be aligned */
        if (!IS_ALIGNED(prev_ssp, 8))
                return -EINVAL;

        *ssp = prev_ssp - SS_FRAME_SIZE;

        return put_shstk_data((void __user *)*ssp, prev_ssp) ? -EFAULT : 0;
}

/*
 * Pop a sigframe word (as written by put_shstk_data()) from the user
 * shadow stack at *@ssp and, if it is valid, store the SSP value it
 * carries back into *@ssp.
 *
 * Returns 0 on success. Returns -EINVAL if *@ssp or the recovered value is
 * not 8-byte aligned, if the word read lacks SHSTK_DATA_BIT, or if the
 * recovered value is not below TASK_SIZE_MAX. Returns -EFAULT if the word
 * cannot be read or, for a page-aligned *@ssp, no VM_SHADOW_STACK mapping
 * is found for it. Returns the error from mmap_read_lock_killable() if the
 * mmap lock could not be taken. *@ssp is only modified on success.
 */
static int shstk_pop_sigframe(unsigned long *ssp)
{
        struct vm_area_struct *vma;
        unsigned long token_addr;
        bool need_to_check_vma;
        int err;

        if (!IS_ALIGNED(*ssp, 8))
                return -EINVAL;

        /*
         * It is possible for the SSP to be off the end of a shadow stack by 4
         * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes
         * before it, it might be this case, so check that the address being
         * read is actually shadow stack.
         */
        need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp;

        if (need_to_check_vma) {
                /*
                 * The killable lock can fail. Bail out before touching the
                 * VMA tree or unlocking a lock that was never acquired.
                 */
                err = mmap_read_lock_killable(current->mm);
                if (err)
                        return err;
        }

        err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);
        if (unlikely(err))
                goto out_err;

        if (need_to_check_vma) {
                vma = find_vma(current->mm, *ssp);
                if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) {
                        err = -EFAULT;
                        goto out_err;
                }

                mmap_read_unlock(current->mm);
        }

        /* The mmap lock is no longer held from here on. */

        /* Restore SSP aligned? */
        if (unlikely(!IS_ALIGNED(token_addr, 8)))
                return -EINVAL;

        /* SSP in userspace? */
        if (unlikely(token_addr >= TASK_SIZE_MAX))
                return -EINVAL;

        *ssp = token_addr;

        return 0;
out_err:
        if (need_to_check_vma)
                mmap_read_unlock(current->mm);
        return err;
}

int setup_signal_shadow_stack(struct ksignal *ksig)
{
        void __user *restorer = ksig->ka.sa.sa_restorer;
        unsigned long ssp;
        int err;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
            !features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        if (!restorer)
                return -EINVAL;

        ssp = get_user_shstk_addr();
        if (unlikely(!ssp))
                return -EINVAL;

        err = shstk_push_sigframe(&ssp);
        if (unlikely(err))
                return err;

        /* Push restorer address */
        ssp -= SS_FRAME_SIZE;
        err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer);
        if (unlikely(err))
                return -EFAULT;

        fpregs_lock_and_load();
        wrmsrl(MSR_IA32_PL3_SSP, ssp);
        fpregs_unlock();

        return 0;
}

/*
 * Undo the shadow stack changes made for signal delivery.
 *
 * Pops the sigframe word at the current SSP and loads the SSP it carries
 * into MSR_IA32_PL3_SSP.
 *
 * Returns 0 on success or when shadow stack is not in use, -EINVAL if the
 * current SSP is zero, or the error from shstk_pop_sigframe().
 */
int restore_signal_shadow_stack(void)
{
        unsigned long cur_ssp;
        int ret;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return 0;
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        cur_ssp = get_user_shstk_addr();
        if (unlikely(!cur_ssp))
                return -EINVAL;

        ret = shstk_pop_sigframe(&cur_ssp);
        if (unlikely(ret))
                return ret;

        fpregs_lock_and_load();
        wrmsrl(MSR_IA32_PL3_SSP, cur_ssp);
        fpregs_unlock();

        return 0;
}

/*
 * Unmap the thread shadow stack tracked in @tsk, if @tsk owns one.
 *
 * Does nothing when shadow stack is unsupported or not enabled, when
 * @tsk has no mm or an mm different from current's, or when @tsk tracks
 * no shadow stack base. After unmapping, the tracked size is zeroed.
 */
void shstk_free(struct task_struct *tsk)
{
        struct thread_shstk *ts = &tsk->thread.shstk;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return;
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return;

        /*
         * When fork() with CLONE_VM fails, the child (tsk) already has a
         * shadow stack allocated, and exit_thread() calls this function to
         * free it.  In this case the parent (current) and the child share
         * the same mm struct.
         */
        if (!tsk->mm)
                return;
        if (tsk->mm != current->mm)
                return;

        /*
         * A zero base means this task is not managing its own shadow
         * stack (CLONE_VFORK), so there is nothing to free.
         */
        if (!ts->base)
                return;

        /*
         * A zero base is normal for CLONE_VFORK children, but a zero size
         * with a non-zero base is not: it indicates an attempt to free
         * the thread shadow stack twice. Warn about it.
         */
        if (WARN_ON(!ts->size))
                return;

        unmap_shadow_stack(ts->base, ts->size);

        ts->size = 0;
}

/*
 * Enable or disable WRSS according to @enable.
 *
 * Returns 0 on success or if WRSS is already in the requested state,
 * -EOPNOTSUPP without CPU shadow stack support, or -EPERM if shadow
 * stack is not enabled.
 */
static int wrss_control(bool enable)
{
        u64 cet;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return -EOPNOTSUPP;

        /*
         * Only enable WRSS if shadow stack is enabled. If shadow stack is not
         * enabled, WRSS will already be disabled, so don't bother clearing it
         * when disabling.
         */
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return -EPERM;

        /* Already enabled/disabled? */
        if (features_enabled(ARCH_SHSTK_WRSS) == enable)
                return 0;

        fpregs_lock_and_load();
        rdmsrl(MSR_IA32_U_CET, cet);

        if (enable) {
                features_set(ARCH_SHSTK_WRSS);
                wrmsrl(MSR_IA32_U_CET, cet | CET_WRSS_EN);
        } else {
                features_clr(ARCH_SHSTK_WRSS);
                /* Skip the MSR write when the bit is already clear. */
                if (cet & CET_WRSS_EN)
                        wrmsrl(MSR_IA32_U_CET, cet & ~CET_WRSS_EN);
        }

        fpregs_unlock();

        return 0;
}

/*
 * Turn user shadow stack (and WRSS) off for the current task.
 *
 * Returns -EOPNOTSUPP when the CPU feature is not enabled, and 0 both when
 * shadow stack was already off and after it has been disabled.
 */
static int shstk_disable(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return -EOPNOTSUPP;

        /* Already disabled? */
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        fpregs_lock_and_load();
        /* Disable WRSS too when disabling shadow stack */
        wrmsrl(MSR_IA32_U_CET, 0);
        wrmsrl(MSR_IA32_PL3_SSP, 0);
        fpregs_unlock();

        /*
         * Free before clearing the feature bits: shstk_free() returns
         * early when ARCH_SHSTK_SHSTK is not enabled.
         */
        shstk_free(current);
        features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);

        return 0;
}

/*
 * map_shadow_stack() system call: validate the arguments and hand the
 * request to alloc_shstk().
 *
 * Errors detected here, in order: -EOPNOTSUPP (no CPU support), -EINVAL
 * (unknown flag bits), -ENOSPC (token requested but @size < 8), -ERANGE
 * (non-zero @addr below 4 GiB), -EOVERFLOW (page-aligning @size wraps).
 * Otherwise the result of alloc_shstk() is returned.
 */
SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
{
        unsigned long mapping_len;
        bool want_token;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return -EOPNOTSUPP;

        if (flags & ~SHADOW_STACK_SET_TOKEN)
                return -EINVAL;

        want_token = flags & SHADOW_STACK_SET_TOKEN;

        /* If there isn't space for a token */
        if (want_token && size < 8)
                return -ENOSPC;

        if (addr && addr < SZ_4G)
                return -ERANGE;

        /*
         * An overflow would result in attempting to write the restore token
         * to the wrong location. Not catastrophic, but just return the right
         * error code and block it.
         */
        mapping_len = PAGE_ALIGN(size);
        if (mapping_len < size)
                return -EOVERFLOW;

        return alloc_shstk(addr, mapping_len, size, want_token);
}

/*
 * Handle the ARCH_SHSTK_* prctl options for @task.
 *
 * @arg2 is a user pointer for ARCH_SHSTK_STATUS and a feature mask for
 * every other option. STATUS and LOCK are accepted for any @task; when
 * @task is not current, only UNLOCK is accepted, and only with
 * CONFIG_CHECKPOINT_RESTORE. For current, DISABLE is handled explicitly
 * and any remaining option falls through to the enable path.
 *
 * Returns 0 or a negative errno.
 */
long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
{
        unsigned long features = arg2;

        if (option == ARCH_SHSTK_STATUS)
                return put_user(task->thread.features, (unsigned long __user *)arg2);

        if (option == ARCH_SHSTK_LOCK) {
                task->thread.features_locked |= features;
                return 0;
        }

        /* Only allow via ptrace */
        if (task != current) {
                if (option != ARCH_SHSTK_UNLOCK || !IS_ENABLED(CONFIG_CHECKPOINT_RESTORE))
                        return -EINVAL;

                task->thread.features_locked &= ~features;
                return 0;
        }

        /* Do not allow to change locked features */
        if (features & task->thread.features_locked)
                return -EPERM;

        /* Only support enabling/disabling one feature at a time. */
        if (hweight_long(features) > 1)
                return -EINVAL;

        if (option == ARCH_SHSTK_DISABLE) {
                if (features & ARCH_SHSTK_WRSS)
                        return wrss_control(false);

                return (features & ARCH_SHSTK_SHSTK) ? shstk_disable() : -EINVAL;
        }

        /* Handle ARCH_SHSTK_ENABLE */
        if (features & ARCH_SHSTK_SHSTK)
                return shstk_setup();

        return (features & ARCH_SHSTK_WRSS) ? wrss_control(true) : -EINVAL;
}























    1 












    1 















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PGALLOC_TRACK_H
#define _LINUX_PGALLOC_TRACK_H

#if defined(CONFIG_MMU)
/*
 * Return the p4d entry for @address under @pgd, allocating the p4d level
 * first if @pgd is empty. When an allocation happens, PGTBL_PGD_MODIFIED
 * is OR-ed into *@mod_mask. Returns NULL if the allocation fails.
 */
static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
                                     unsigned long address,
                                     pgtbl_mod_mask *mod_mask)
{
        if (likely(!pgd_none(*pgd)))
                return p4d_offset(pgd, address);

        if (__p4d_alloc(mm, pgd, address))
                return NULL;

        *mod_mask |= PGTBL_PGD_MODIFIED;
        return p4d_offset(pgd, address);
}

/*
 * Return the pud entry for @address under @p4d, allocating the pud level
 * first if @p4d is empty. When an allocation happens, PGTBL_P4D_MODIFIED
 * is OR-ed into *@mod_mask. Returns NULL if the allocation fails.
 */
static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d,
                                     unsigned long address,
                                     pgtbl_mod_mask *mod_mask)
{
        if (likely(!p4d_none(*p4d)))
                return pud_offset(p4d, address);

        if (__pud_alloc(mm, p4d, address))
                return NULL;

        *mod_mask |= PGTBL_P4D_MODIFIED;
        return pud_offset(p4d, address);
}

/*
 * Return the pmd entry for @address under @pud, allocating the pmd level
 * first if @pud is empty. When an allocation happens, PGTBL_PUD_MODIFIED
 * is OR-ed into *@mod_mask. Returns NULL if the allocation fails.
 */
static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
                                     unsigned long address,
                                     pgtbl_mod_mask *mod_mask)
{
        if (likely(!pud_none(*pud)))
                return pmd_offset(pud, address);

        if (__pmd_alloc(mm, pud, address))
                return NULL;

        *mod_mask |= PGTBL_PUD_MODIFIED;
        return pmd_offset(pud, address);
}
#endif /* CONFIG_MMU */

/*
 * pte_alloc_kernel_track - kernel pte lookup with allocation tracking
 *
 * If *@pmd is empty, __pte_alloc_kernel() is called. A non-zero result
 * makes the whole expression NULL. A zero result ORs PGTBL_PMD_MODIFIED
 * into *@mask. In every non-failing case the expression evaluates to
 * pte_offset_kernel(@pmd, @address).
 *
 * Note that @pmd is evaluated more than once.
 */
#define pte_alloc_kernel_track(pmd, address, mask)                        \
        ((unlikely(pmd_none(*(pmd))) &&                                        \
          (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
                NULL: pte_offset_kernel(pmd, address))

#endif /* _LINUX_PGALLOC_TRACK_H */











































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2023 Isovalent */
#ifndef __NET_TCX_H
#define __NET_TCX_H

#include <linux/bpf.h>
#include <linux/bpf_mprog.h>

#include <net/sch_generic.h>

struct mini_Qdisc;

/*
 * Container for a bpf_mprog_bundle plus mini-qdisc state. tcx_entry()
 * recovers this structure from a bpf_mprog_entry via the bundle member.
 */
struct tcx_entry {
        /* RCU-protected pointer to a mini_Qdisc */
        struct mini_Qdisc __rcu *miniq;
        /* embedded bundle; its 'a' entry is what tcx_entry_create_noprof() returns */
        struct bpf_mprog_bundle bundle;
        /* written by tcx_miniq_set_active(), read by tcx_entry_is_active() */
        bool miniq_active;
        /* used by tcx_entry_free() to free the structure via kfree_rcu() */
        struct rcu_head rcu;
};

/*
 * A bpf_link with tcx-specific state. tcx_link() recovers this structure
 * from the embedded link member.
 */
struct tcx_link {
        struct bpf_link link;
        /* presumably the device the link is attached to -- TODO confirm */
        struct net_device *dev;
        /* presumably the attach location (ingress/egress) -- TODO confirm */
        u32 location;
};

/*
 * Record @ingress in skb->tc_at_ingress. Compiles to nothing when
 * CONFIG_NET_XGRESS is not set.
 */
static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress)
{
#ifdef CONFIG_NET_XGRESS
        skb->tc_at_ingress = ingress;
#endif
}

#ifdef CONFIG_NET_XGRESS
/*
 * Map a bpf_mprog_entry to the tcx_entry whose embedded bundle is the
 * entry's parent.
 */
static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry)
{
        return container_of(entry->parent, struct tcx_entry, bundle);
}

/* Map an embedded bpf_link back to its containing tcx_link. */
static inline struct tcx_link *tcx_link(const struct bpf_link *link)
{
        return container_of(link, struct tcx_link, link);
}

void tcx_inc(void);
void tcx_dec(void);

/* Wait for an RCU grace period after a bpf_mprog_entry a/b swap. */
static inline void tcx_entry_sync(void)
{
        /* bpf_mprog_entry got a/b swapped, therefore ensure that
         * there are no inflight users on the old one anymore.
         */
        synchronize_rcu();
}

/*
 * Publish @entry as @dev's ingress or egress tcx entry, selected by
 * @ingress. Asserts that RTNL is held.
 */
static inline void
tcx_entry_update(struct net_device *dev, struct bpf_mprog_entry *entry,
                 bool ingress)
{
        ASSERT_RTNL();

        if (!ingress) {
                rcu_assign_pointer(dev->tcx_egress, entry);
                return;
        }

        rcu_assign_pointer(dev->tcx_ingress, entry);
}

/*
 * Return @dev's ingress or egress tcx entry, selected by @ingress.
 * Asserts that RTNL is held.
 */
static inline struct bpf_mprog_entry *
tcx_entry_fetch(struct net_device *dev, bool ingress)
{
        ASSERT_RTNL();

        if (!ingress)
                return rcu_dereference_rtnl(dev->tcx_egress);

        return rcu_dereference_rtnl(dev->tcx_ingress);
}

/*
 * Allocate a zeroed tcx_entry with GFP_KERNEL, initialise its bundle and
 * return the bundle's 'a' entry. Returns NULL if the allocation fails.
 */
static inline struct bpf_mprog_entry *tcx_entry_create_noprof(void)
{
        struct tcx_entry *tcx = kzalloc_noprof(sizeof(*tcx), GFP_KERNEL);

        if (!tcx)
                return NULL;

        bpf_mprog_bundle_init(&tcx->bundle);
        return &tcx->bundle.a;
}
#define tcx_entry_create(...)        alloc_hooks(tcx_entry_create_noprof(__VA_ARGS__))

/* Free the tcx_entry containing @entry through kfree_rcu(). */
static inline void tcx_entry_free(struct bpf_mprog_entry *entry)
{
        struct tcx_entry *tcx = tcx_entry(entry);

        kfree_rcu(tcx, rcu);
}

/*
 * Return @dev's existing tcx entry for the given direction, or create a
 * new one if there is none.
 *
 * *@created is true only when a new entry was successfully allocated.
 * Returns NULL (with *@created false) if the allocation fails.
 */
static inline struct bpf_mprog_entry *
tcx_entry_fetch_or_create(struct net_device *dev, bool ingress, bool *created)
{
        struct bpf_mprog_entry *entry = tcx_entry_fetch(dev, ingress);

        *created = false;
        if (entry)
                return entry;

        entry = tcx_entry_create();
        if (entry)
                *created = true;

        return entry;
}

/*
 * Call tcx_inc(), then bump the ingress or egress queue counter,
 * selected by @ingress.
 */
static inline void tcx_skeys_inc(bool ingress)
{
        tcx_inc();

        if (!ingress) {
                net_inc_egress_queue();
                return;
        }

        net_inc_ingress_queue();
}

/*
 * Drop the ingress or egress queue counter, selected by @ingress, then
 * call tcx_dec(). This is the reverse order of tcx_skeys_inc().
 */
static inline void tcx_skeys_dec(bool ingress)
{
        if (!ingress)
                net_dec_egress_queue();
        else
                net_dec_ingress_queue();

        tcx_dec();
}

static inline void tcx_miniq_set_active(struct bpf_mprog_entry *entry,
                                        const bool active)
{
        ASSERT_RTNL();
        tcx_entry(entry)->miniq_active = active;
}

static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry)
{
        ASSERT_RTNL();
        return bpf_mprog_total(entry) || tcx_entry(entry)->miniq_active;
}

/*
 * Normalise a program return @code to a tcx_action_base value.
 *
 * TCX_PASS, TCX_DROP and TCX_REDIRECT are returned unchanged. For
 * TCX_PASS, skb->tc_index is first loaded from the qdisc cb's tc_classid.
 * Every other value, including TCX_NEXT, yields TCX_NEXT.
 */
static inline enum tcx_action_base tcx_action_code(struct sk_buff *skb,
                                                   int code)
{
        if (code == TCX_PASS) {
                skb->tc_index = qdisc_skb_cb(skb)->tc_classid;
                return code;
        }

        if (code == TCX_DROP || code == TCX_REDIRECT)
                return code;

        return TCX_NEXT;
}
#endif /* CONFIG_NET_XGRESS */

#if defined(CONFIG_NET_XGRESS) && defined(CONFIG_BPF_SYSCALL)
int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog);
void tcx_uninstall(struct net_device *dev, bool ingress);

int tcx_prog_query(const union bpf_attr *attr,
                   union bpf_attr __user *uattr);

/*
 * Run tcx_uninstall() on @dev for ingress and then for egress. Asserts
 * that RTNL is held.
 */
static inline void dev_tcx_uninstall(struct net_device *dev)
{
        ASSERT_RTNL();
        tcx_uninstall(dev, true);
        tcx_uninstall(dev, false);
}
#else
/* Stub for builds without CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL: always -EINVAL. */
static inline int tcx_prog_attach(const union bpf_attr *attr,
                                  struct bpf_prog *prog)
{
        return -EINVAL;
}

/* Stub for builds without CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL: always -EINVAL. */
static inline int tcx_link_attach(const union bpf_attr *attr,
                                  struct bpf_prog *prog)
{
        return -EINVAL;
}

/* Stub for builds without CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL: always -EINVAL. */
static inline int tcx_prog_detach(const union bpf_attr *attr,
                                  struct bpf_prog *prog)
{
        return -EINVAL;
}

/* Stub for builds without CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL: always -EINVAL. */
static inline int tcx_prog_query(const union bpf_attr *attr,
                                 union bpf_attr __user *uattr)
{
        return -EINVAL;
}

/* Stub for builds without CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL: does nothing. */
static inline void dev_tcx_uninstall(struct net_device *dev)
{
}
#endif /* CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL */
#endif /* __NET_TCX_H */





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_WAIT_H
#define _LINUX_WAIT_H
/*
 * Linux wait queue related types and methods
 */
#include <linux/list.h>
#include <linux/stddef.h>
#include <linux/spinlock.h>

#include <asm/current.h>

typedef struct wait_queue_entry wait_queue_entry_t;

typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);
int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);

/*
 * wait_queue_entry::flags
 *
 * Single-bit values. They are OR-ed into the flags field (see
 * __add_wait_queue_exclusive()) and tested with bitwise AND (see
 * __add_wait_queue()).
 */
#define WQ_FLAG_EXCLUSIVE        0x01
#define WQ_FLAG_WOKEN                0x02
#define WQ_FLAG_CUSTOM                0x04
#define WQ_FLAG_DONE                0x08
#define WQ_FLAG_PRIORITY        0x10

/*
 * A single wait-queue entry structure:
 *
 * @flags:   WQ_FLAG_* bits
 * @private: opaque pointer; init_waitqueue_entry() stores the task here,
 *           init_waitqueue_func_entry() stores NULL
 * @func:    callback of type wait_queue_func_t
 * @entry:   list node linked into wait_queue_head::head
 */
struct wait_queue_entry {
        unsigned int                flags;
        void                        *private;
        wait_queue_func_t        func;
        struct list_head        entry;
};

/*
 * Head of a wait queue: a spinlock plus the list that wait_queue_entry
 * nodes are linked into.
 */
struct wait_queue_head {
        spinlock_t                lock;
        struct list_head        head;
};
typedef struct wait_queue_head wait_queue_head_t;

struct task_struct;

/*
 * Macros for declaration and initialisation of the datatypes
 */

/*
 * Static initializer for a wait_queue_entry: @tsk goes into .private,
 * default_wake_function into .func, and both .entry pointers are NULL.
 * @name is not used by the expansion.
 */
#define __WAITQUEUE_INITIALIZER(name, tsk) {                                        \
        .private        = tsk,                                                        \
        .func                = default_wake_function,                                \
        .entry                = { NULL, NULL } }

/* Define a wait_queue_entry called @name, initialised for @tsk. */
#define DECLARE_WAITQUEUE(name, tsk)                                                \
        struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk)

/*
 * Static initializer for a wait_queue_head called @name: an unlocked
 * spinlock and an empty list head.
 */
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) {                                        \
        .lock                = __SPIN_LOCK_UNLOCKED(name.lock),                        \
        .head                = LIST_HEAD_INIT(name.head) }

/* Define a statically initialised wait_queue_head called @name. */
#define DECLARE_WAIT_QUEUE_HEAD(name) \
        struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)

extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *);

/*
 * Initialise @wq_head at run time through __init_waitqueue_head(). Each
 * expansion site gets its own static lock_class_key, and the stringified
 * argument is passed as the name.
 */
#define init_waitqueue_head(wq_head)                                                \
        do {                                                                        \
                static struct lock_class_key __key;                                \
                                                                                \
                __init_waitqueue_head((wq_head), #wq_head, &__key);                \
        } while (0)

/*
 * Define a wait_queue_head called @name for on-stack use. With
 * CONFIG_LOCKDEP the head is initialised through init_waitqueue_head();
 * otherwise this is the same as DECLARE_WAIT_QUEUE_HEAD().
 */
#ifdef CONFIG_LOCKDEP
# define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \
        ({ init_waitqueue_head(&name); name; })
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \
        struct wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name)
#else
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name)
#endif

/*
 * Initialise @wq_entry for task @p: no flags, @p as the private pointer
 * and default_wake_function as the callback. The list node is not touched.
 */
static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p)
{
        wq_entry->func                = default_wake_function;
        wq_entry->private        = p;
        wq_entry->flags                = 0;
}

/*
 * Initialise @wq_entry with a custom callback @func: no flags and a NULL
 * private pointer. The list node is not touched.
 */
static inline void
init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func)
{
        wq_entry->func                = func;
        wq_entry->private        = NULL;
        wq_entry->flags                = 0;
}

/**
 * waitqueue_active -- locklessly test for waiters on the queue
 * @wq_head: the waitqueue to test for waiters
 *
 * Return: non-zero if the wait list is not empty, zero if it is empty.
 *
 * NOTE: this function is lockless and requires care, incorrect usage _will_
 * lead to sporadic and non-obvious failure.
 *
 * Use either while holding wait_queue_head::lock or when used for wakeups
 * with an extra smp_mb() like::
 *
 *      CPU0 - waker                    CPU1 - waiter
 *
 *                                      for (;;) {
 *      @cond = true;                     prepare_to_wait(&wq_head, &wait, state);
 *      smp_mb();                         // smp_mb() from set_current_state()
 *      if (waitqueue_active(wq_head))         if (@cond)
 *        wake_up(wq_head);                      break;
 *                                        schedule();
 *                                      }
 *                                      finish_wait(&wq_head, &wait);
 *
 * Because without the explicit smp_mb() it's possible for the
 * waitqueue_active() load to get hoisted over the @cond store such that we'll
 * observe an empty wait list while the waiter might not observe @cond.
 *
 * Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
 * which (when the lock is uncontended) are of roughly equal cost.
 */
static inline int waitqueue_active(struct wait_queue_head *wq_head)
{
        return !list_empty(&wq_head->head);
}

/**
 * wq_has_single_sleeper - check if there is only one sleeper
 * @wq_head: wait queue head
 *
 * Return: true if wq_head has exactly one entry on its list.
 *
 * Please refer to the comment for waitqueue_active.
 */
static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head)
{
        return list_is_singular(&wq_head->head);
}

/**
 * wq_has_sleeper - check if there are any waiting processes
 * @wq_head: wait queue head
 *
 * Issues a full memory barrier and then performs the lockless
 * waitqueue_active() test.
 *
 * Return: true if wq_head has waiting processes.
 *
 * Please refer to the comment for waitqueue_active.
 */
static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
{
        /*
         * We need to be sure we are in sync with the
         * add_wait_queue modifications to the wait queue.
         *
         * This memory barrier should be paired with one on the
         * waiting side.
         */
        smp_mb();
        return waitqueue_active(wq_head);
}

extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);

/*
 * Link @wq_entry into @wq_head's list directly after the leading run of
 * WQ_FLAG_PRIORITY entries, or at the very front if the first entry does
 * not have that flag or the list is empty.
 */
static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        struct list_head *insert_after = &wq_head->head;
        struct wait_queue_entry *cur;

        /* Walk past the priority entries at the head of the list. */
        list_for_each_entry(cur, &wq_head->head, entry) {
                if (cur->flags & WQ_FLAG_PRIORITY)
                        insert_after = &cur->entry;
                else
                        break;
        }

        list_add(&wq_entry->entry, insert_after);
}

/*
 * Used for wake-one threads:
 *
 * Sets WQ_FLAG_EXCLUSIVE on @wq_entry and then inserts it with
 * __add_wait_queue().
 */
static inline void
__add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
        __add_wait_queue(wq_head, wq_entry);
}

/*
 * Append @wq_entry at the tail of @wq_head's list.  Unlike
 * __add_wait_queue(), no WQ_FLAG_PRIORITY ordering is applied.
 */
static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        struct list_head *queue = &wq_head->head;
        struct list_head *node = &wq_entry->entry;

        list_add_tail(node, queue);
}

/*
 * Mark @wq_entry WQ_FLAG_EXCLUSIVE and append it at the tail of @wq_head.
 */
static inline void
__add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        wq_entry->flags = wq_entry->flags | WQ_FLAG_EXCLUSIVE;
        __add_wait_queue_entry_tail(wq_head, wq_entry);
}

/*
 * Unlink @wq_entry from whichever list it is on.  @wq_head is accepted for
 * symmetry with the add helpers but is not used by this function.
 */
static inline void
__remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        struct list_head *node = &wq_entry->entry;

        list_del(node);
}

/*
 * Prototypes for the out-of-line wakeup workers.  The wake_up*() macros and
 * wake_up_pollfree() in this header expand to calls to these; their bodies
 * are not in this part of the file.
 */
int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
void __wake_up_pollfree(struct wait_queue_head *wq_head);

/*
 * Convenience wrappers around the __wake_up*() workers.
 *
 * The plain variants pass TASK_NORMAL as the mode, the _interruptible
 * variants pass TASK_INTERRUPTIBLE.  wake_up() and wake_up_interruptible()
 * pass nr = 1, the _nr variants pass the caller's @nr, and the _all variants
 * pass nr = 0.  Every __wake_up() based wrapper passes a NULL key.
 *
 * The _locked variants go through __wake_up_locked().
 * NOTE(review): presumably these require the caller to hold the queue lock
 * already -- confirm against the definition of __wake_up_locked().
 */
#define wake_up(x)                        __wake_up(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr)                __wake_up(x, TASK_NORMAL, nr, NULL)
#define wake_up_all(x)                        __wake_up(x, TASK_NORMAL, 0, NULL)
#define wake_up_locked(x)                __wake_up_locked((x), TASK_NORMAL, 1)
#define wake_up_all_locked(x)                __wake_up_locked((x), TASK_NORMAL, 0)

#define wake_up_interruptible(x)        __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
#define wake_up_interruptible_nr(x, nr)        __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
#define wake_up_interruptible_all(x)        __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
#define wake_up_interruptible_sync(x)        __wake_up_sync((x), TASK_INTERRUPTIBLE)

/*
 * Wakeup macros to be used to report events to the targets.
 *
 * poll_to_key() packs a __poll_t event mask @m into the opaque 'void *key'
 * argument of the wakeup workers; key_to_poll() is the inverse conversion.
 * Each wake_up_*poll() macro below is the corresponding wakeup call with
 * poll_to_key(@m) passed as the key (and nr = 1 for the __wake_up() based
 * ones).
 */
#define poll_to_key(m) ((void *)(__force uintptr_t)(__poll_t)(m))
#define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m))
#define wake_up_poll(x, m)                                                        \
        __wake_up(x, TASK_NORMAL, 1, poll_to_key(m))
#define wake_up_poll_on_current_cpu(x, m)                                        \
        __wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m))
#define wake_up_locked_poll(x, m)                                                \
        __wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m))
#define wake_up_interruptible_poll(x, m)                                        \
        __wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m))
#define wake_up_interruptible_sync_poll(x, m)                                        \
        __wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))
#define wake_up_interruptible_sync_poll_locked(x, m)                                \
        __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))

/**
 * wake_up_pollfree - signal that a polled waitqueue is going away
 * @wq_head: the wait queue head
 *
 * A few ->poll() implementations use a waitqueue whose lifetime follows a
 * task instead of the 'struct file' being polled.  In those very rare cases
 * this function must be called before the waitqueue is freed, so that
 * non-blocking polls (e.g. epoll) learn that the queue is going away.
 *
 * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via
 * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU.
 */
static inline void wake_up_pollfree(struct wait_queue_head *wq_head)
{
        /*
         * The queue lock is deliberately not always taken here, for
         * performance.  That means we can race with whoever removes the last
         * entry and carry on while they still hold the queue lock.  Such
         * removers are required to hold rcu_read_lock(), so proceeding to an
         * RCU-delayed free is still safe.
         */
        if (!waitqueue_active(wq_head))
                return;

        __wake_up_pollfree(wq_head);
}

/*
 * Wrap @condition for the timeout flavours of wait_event_*().
 *
 * Relies on a variable named __ret being in scope at the expansion site; the
 * timeout macros use it to hold the remaining timeout.  @condition is
 * evaluated exactly once.  If it is true while __ret is 0, __ret is bumped
 * to 1 so that success is never reported as 0.  The whole expression is true
 * when the wait should stop: either @condition holds or __ret has reached 0.
 */
#define ___wait_cond_timeout(condition)                                                \
({                                                                                \
        bool __cond = (condition);                                                \
        if (__cond && !__ret)                                                        \
                __ret = 1;                                                        \
        __cond || !__ret;                                                        \
})

/*
 * Decide whether a wait in task state @state can be cut short.
 *
 * True when @state is not a compile-time constant (the conservative answer)
 * or when it has TASK_INTERRUPTIBLE or TASK_WAKEKILL set.  ___wait_event()
 * uses this to decide whether a non-zero prepare_to_wait_event() result
 * should abort the wait.
 *
 * @state is parenthesized before masking so that an unparenthesized argument
 * such as 'A | B' is tested as a whole instead of being parsed as
 * 'A | (B & mask)'.
 */
#define ___wait_is_interruptible(state)                                                \
        (!__builtin_constant_p(state) ||                                        \
         ((state) & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))

/*
 * Out-of-line initialiser for a wait queue entry.  ___wait_event() calls it
 * on its on-stack entry, passing either WQ_FLAG_EXCLUSIVE or 0 as @flags.
 */
extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags);

/*
 * The below macro ___wait_event() has an explicit shadow of the __ret
 * variable when used from the wait_event_*() macros.
 *
 * This is so that both can use the ___wait_cond_timeout() construct
 * to wrap the condition.
 *
 * The type inconsistency of the wait_event_*() __ret variable is also
 * on purpose; we use long where we can return timeout values and int
 * otherwise.
 *
 * Arguments:
 *   wq_head   - the wait queue head itself (the macro takes its address)
 *   condition - expression re-evaluated after every prepare_to_wait_event()
 *   state     - task state handed to prepare_to_wait_event()
 *   exclusive - non-zero to initialise the entry with WQ_FLAG_EXCLUSIVE
 *   ret       - initial value of the inner __ret (0, or the timeout)
 *   cmd       - statement(s) run each iteration to actually sleep; it is
 *               expanded inside the loop, so it may assign the inner __ret
 *               or 'break' out of the loop
 *
 * The value of the statement expression is the inner __ret.  When the loop
 * ends via 'break' (condition true, or cmd breaking out) finish_wait() is
 * called.  When prepare_to_wait_event() returns non-zero and @state counts
 * as interruptible, that value becomes __ret and control jumps to __out,
 * bypassing finish_wait().
 * NOTE(review): presumably prepare_to_wait_event() has already dequeued the
 * entry on that path -- confirm against its definition.
 */

#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd)                \
({                                                                                \
        __label__ __out;                                                        \
        struct wait_queue_entry __wq_entry;                                        \
        long __ret = ret;        /* explicit shadow */                                \
                                                                                \
        init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);        \
        for (;;) {                                                                \
                long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\
                                                                                \
                if (condition)                                                        \
                        break;                                                        \
                                                                                \
                if (___wait_is_interruptible(state) && __int) {                        \
                        __ret = __int;                                                \
                        goto __out;                                                \
                }                                                                \
                                                                                \
                cmd;                                                                \
        }                                                                        \
        finish_wait(&wq_head, &__wq_entry);                                        \
__out:        __ret;                                                                        \
})

/*
 * Slow path of wait_event(): a non-exclusive TASK_UNINTERRUPTIBLE wait that
 * sleeps with schedule().  The result of ___wait_event() is discarded.
 */
#define __wait_event(wq_head, condition)                                \
        (void)___wait_event(wq_head, condition,                         \
                            TASK_UNINTERRUPTIBLE, 0, 0, schedule())

/**
 * wait_event - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process sleeps in TASK_UNINTERRUPTIBLE until @condition evaluates
 * to true.  @condition is tested once up front and then again every time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 */
#define wait_event(wq_head, condition)                                  \
do {                                                                    \
        might_sleep();                                                  \
        if (!(condition))                                               \
                __wait_event(wq_head, condition);                       \
} while (0)

/*
 * Slow path of io_wait_event(): same as __wait_event() except that the
 * sleep is done with io_schedule() instead of schedule().
 */
#define __io_wait_event(wq_head, condition)                             \
        (void)___wait_event(wq_head, condition,                         \
                            TASK_UNINTERRUPTIBLE, 0, 0, io_schedule())

/*
 * io_wait_event() -- like wait_event() but with io_schedule()
 */
#define io_wait_event(wq_head, condition)                               \
do {                                                                    \
        might_sleep();                                                  \
        if (!(condition))                                               \
                __io_wait_event(wq_head, condition);                    \
} while (0)

/*
 * Slow path of wait_event_freezable(): non-exclusive wait in
 * TASK_INTERRUPTIBLE|TASK_FREEZABLE that sleeps with schedule().
 */
#define __wait_event_freezable(wq_head, condition)                                \
        ___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE),        \
                        0, 0, schedule())

/**
 * wait_event_freezable - sleep (or freeze) until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute
 * to system load) until the @condition evaluates to true. The
 * @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Evaluates to 0 once @condition is true, or to the non-zero value handed
 * back by prepare_to_wait_event() if the sleep was interrupted.
 * NOTE(review): presumably that value is -ERESTARTSYS, as documented for
 * wait_event_interruptible() -- confirm.
 */
#define wait_event_freezable(wq_head, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_freezable(wq_head, condition);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_timeout().
 *
 * The inner __ret of ___wait_event() starts at @timeout.  Each iteration
 * sleeps with schedule_timeout(__ret) and stores what is left back into that
 * same __ret, which is the variable ___wait_cond_timeout() inspects when it
 * re-tests @condition.
 */
#define __wait_event_timeout(wq_head, condition, timeout)                        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_UNINTERRUPTIBLE, 0, timeout,                                \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 */
#define wait_event_timeout(wq_head, condition, timeout)                                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_timeout(wq_head, condition, timeout);        \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_freezable_timeout(): same shape as
 * __wait_event_timeout() but the task state is
 * TASK_INTERRUPTIBLE|TASK_FREEZABLE, so a non-zero result from
 * prepare_to_wait_event() ends the wait early and becomes the result.
 */
#define __wait_event_freezable_timeout(wq_head, condition, timeout)                \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout,                \
                      __ret = schedule_timeout(__ret))

/*
 * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid
 * increasing load and is freezable.
 *
 * The fast path tests @condition through ___wait_cond_timeout() against the
 * local __ret, which is initialised from @timeout.
 */
#define wait_event_freezable_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_freezable_timeout(wq_head, condition, timeout); \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_exclusive_cmd(): an exclusive
 * TASK_UNINTERRUPTIBLE wait whose per-iteration sleep step is
 * "cmd1; schedule(); cmd2".  The result of ___wait_event() is discarded.
 */
#define __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2)      \
        (void)___wait_event(wq_head, condition,                         \
                            TASK_UNINTERRUPTIBLE, 1, 0,                 \
                            cmd1; schedule(); cmd2)

/*
 * Just like wait_event_cmd(), except it sets exclusive flag
 */
#define wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2)        \
do {                                                                    \
        if (!(condition))                                               \
                __wait_event_exclusive_cmd(wq_head, condition,          \
                                           cmd1, cmd2);                 \
} while (0)

/*
 * Slow path of wait_event_cmd(): a non-exclusive TASK_UNINTERRUPTIBLE wait
 * whose per-iteration sleep step is "cmd1; schedule(); cmd2".  The result of
 * ___wait_event() is discarded.
 */
#define __wait_event_cmd(wq_head, condition, cmd1, cmd2)                \
        (void)___wait_event(wq_head, condition,                         \
                            TASK_UNINTERRUPTIBLE, 0, 0,                 \
                            cmd1; schedule(); cmd2)

/**
 * wait_event_cmd - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @cmd1: the command will be executed before sleep
 * @cmd2: the command will be executed after sleep
 *
 * The process sleeps in TASK_UNINTERRUPTIBLE until @condition evaluates
 * to true.  @condition is tested once up front and then again every time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 */
#define wait_event_cmd(wq_head, condition, cmd1, cmd2)                  \
do {                                                                    \
        if (!(condition))                                               \
                __wait_event_cmd(wq_head, condition, cmd1, cmd2);       \
} while (0)

/*
 * Slow path of wait_event_interruptible(): non-exclusive TASK_INTERRUPTIBLE
 * wait that sleeps with schedule().  Evaluates to the ___wait_event()
 * result: 0, or the non-zero prepare_to_wait_event() value on interruption.
 */
#define __wait_event_interruptible(wq_head, condition)                                \
        ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,                \
                      schedule())

/**
 * wait_event_interruptible - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible(wq_head, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible(wq_head, condition);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_interruptible_timeout(): like
 * __wait_event_timeout() but in TASK_INTERRUPTIBLE.  The remaining timeout
 * lives in the inner __ret of ___wait_event(), which schedule_timeout()
 * updates and ___wait_cond_timeout() reads.
 */
#define __wait_event_interruptible_timeout(wq_head, condition, timeout)                \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_INTERRUPTIBLE, 0, timeout,                                \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
 * interrupted by a signal.
 */
#define wait_event_interruptible_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_interruptible_timeout(wq_head,                \
                                                condition, timeout);                \
        __ret;                                                                        \
})

/*
 * Slow path shared by wait_event_hrtimeout() and
 * wait_event_interruptible_hrtimeout().
 *
 * An on-stack hrtimer sleeper (CLOCK_MONOTONIC, relative) is set up; it is
 * only armed when @timeout is not KTIME_MAX, using the current task's
 * timer_slack_ns as the expiry range.
 *
 * The statements passed as the 'cmd' argument of ___wait_event() run inside
 * its loop: when __t.task is NULL they set ___wait_event()'s own (inner)
 * __ret to -ETIME and 'break' out of that loop, otherwise they schedule().
 * NOTE(review): presumably __t.task is cleared by the sleeper's timer
 * callback when the timer fires -- confirm against the hrtimer code.
 *
 * Afterwards the timer is cancelled and the on-stack object destroyed.  The
 * expression evaluates to an int: whatever ___wait_event() produced.
 */
#define __wait_event_hrtimeout(wq_head, condition, timeout, state)                \
({                                                                                \
        int __ret = 0;                                                                \
        struct hrtimer_sleeper __t;                                                \
                                                                                \
        hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC,                        \
                                      HRTIMER_MODE_REL);                        \
        if ((timeout) != KTIME_MAX) {                                                \
                hrtimer_set_expires_range_ns(&__t.timer, timeout,                \
                                        current->timer_slack_ns);                \
                hrtimer_sleeper_start_expires(&__t, HRTIMER_MODE_REL);                \
        }                                                                        \
                                                                                \
        __ret = ___wait_event(wq_head, condition, state, 0, 0,                        \
                if (!__t.task) {                                                \
                        __ret = -ETIME;                                                \
                        break;                                                        \
                }                                                                \
                schedule());                                                        \
                                                                                \
        hrtimer_cancel(&__t.timer);                                                \
        destroy_hrtimer_on_stack(&__t.timer);                                        \
        __ret;                                                                        \
})

/**
 * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, as a ktime_t
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true or the @timeout elapses.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function returns 0 if @condition became true, or -ETIME if the timeout
 * elapsed.
 */
#define wait_event_hrtimeout(wq_head, condition, timeout)                        \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_hrtimeout(wq_head, condition, timeout,        \
                                               TASK_UNINTERRUPTIBLE);                \
        __ret;                                                                        \
})

/**
 * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, as a ktime_t
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true, a signal is received or the @timeout
 * elapses.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function returns 0 if @condition became true, -ERESTARTSYS if it was
 * interrupted by a signal, or -ETIME if the timeout elapsed.
 *
 * Note that the result is carried in a long here, whereas
 * __wait_event_hrtimeout() itself evaluates to an int.
 */
#define wait_event_interruptible_hrtimeout(wq, condition, timeout)                \
({                                                                                \
        long __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_hrtimeout(wq, condition, timeout,                \
                                               TASK_INTERRUPTIBLE);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_interruptible_exclusive(): TASK_INTERRUPTIBLE wait
 * with the exclusive argument of ___wait_event() set, so the on-stack entry
 * is initialised with WQ_FLAG_EXCLUSIVE.
 */
#define __wait_event_interruptible_exclusive(wq, condition)                        \
        ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0,                        \
                      schedule())

/*
 * Exclusive counterpart of wait_event_interruptible(): checks @condition
 * first and only queues an exclusive waiter on @wq if it is false.
 * Evaluates to 0 once @condition is true, or to the non-zero value from
 * prepare_to_wait_event() if interrupted.
 * NOTE(review): presumably -ERESTARTSYS, as for wait_event_interruptible()
 * -- confirm.
 */
#define wait_event_interruptible_exclusive(wq, condition)                        \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible_exclusive(wq, condition);        \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_killable_exclusive(): TASK_KILLABLE wait with the
 * exclusive argument of ___wait_event() set (WQ_FLAG_EXCLUSIVE entry).
 */
#define __wait_event_killable_exclusive(wq, condition)                                \
        ___wait_event(wq, condition, TASK_KILLABLE, 1, 0,                        \
                      schedule())

/*
 * Exclusive wait on @wq in TASK_KILLABLE: checks @condition first and only
 * queues a waiter if it is false.  Evaluates to 0 once @condition is true,
 * or to the non-zero value from prepare_to_wait_event() if the wait was cut
 * short.
 * NOTE(review): this early exit depends on TASK_KILLABLE including
 * TASK_WAKEKILL, which is what ___wait_is_interruptible() tests -- confirm
 * against the TASK_KILLABLE definition.
 */
#define wait_event_killable_exclusive(wq, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_killable_exclusive(wq, condition);                \
        __ret;                                                                        \
})


/*
 * Slow path of wait_event_freezable_exclusive(): wait in
 * TASK_INTERRUPTIBLE|TASK_FREEZABLE with the exclusive argument of
 * ___wait_event() set (WQ_FLAG_EXCLUSIVE entry).
 */
#define __wait_event_freezable_exclusive(wq, condition)                                \
        ___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\
                        schedule())

/*
 * Exclusive counterpart of wait_event_freezable(): checks @condition first
 * and only queues an exclusive waiter on @wq if it is false.  Evaluates to 0
 * once @condition is true, or to the non-zero value from
 * prepare_to_wait_event() if interrupted.
 */
#define wait_event_freezable_exclusive(wq, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_freezable_exclusive(wq, condition);        \
        __ret;                                                                        \
})

/**
 * wait_event_idle - wait for a condition without contributing to system load
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process sleeps in TASK_IDLE until @condition evaluates to true.
 * @condition is tested once up front and then again every time the
 * waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 */
#define wait_event_idle(wq_head, condition)                             \
do {                                                                    \
        might_sleep();                                                  \
        if (condition)                                                  \
                break;                                                  \
        ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, schedule()); \
} while (0)

/**
 * wait_event_idle_exclusive - wait for a condition without contributing to system load
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process sleeps in TASK_IDLE until @condition evaluates to true.
 * @condition is tested once up front and then again every time the
 * waitqueue @wq_head is woken up.
 *
 * The process is put on the wait queue with the WQ_FLAG_EXCLUSIVE flag
 * set, thus if other processes wait on the same list, further processes
 * are not considered once this process has been woken.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 */
#define wait_event_idle_exclusive(wq_head, condition)                   \
do {                                                                    \
        might_sleep();                                                  \
        if (condition)                                                  \
                break;                                                  \
        ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, schedule()); \
} while (0)

/*
 * Slow path of wait_event_idle_timeout(): like __wait_event_timeout() but in
 * TASK_IDLE.  The remaining timeout lives in the inner __ret of
 * ___wait_event(), which schedule_timeout() updates and
 * ___wait_cond_timeout() reads.
 */
#define __wait_event_idle_timeout(wq_head, condition, timeout)                        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_IDLE, 0, timeout,                                        \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_idle_timeout - sleep without load until a condition becomes true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_IDLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 */
#define wait_event_idle_timeout(wq_head, condition, timeout)                        \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_idle_timeout(wq_head, condition, timeout);        \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_idle_exclusive_timeout(): identical to
 * __wait_event_idle_timeout() except that the exclusive argument of
 * ___wait_event() is 1, so the entry is initialised with WQ_FLAG_EXCLUSIVE.
 */
#define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout)        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_IDLE, 1, timeout,                                        \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_idle_exclusive_timeout - sleep without load until a condition becomes true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_IDLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag
 * set thus if other processes wait on the same list, when this
 * process is woken further processes are not considered.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 */
#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_idle_exclusive_timeout(wq_head, condition, timeout);\
        __ret;                                                                        \
})

/*
 * Out-of-line wait helpers with the (head, entry) -> int signature that
 * __wait_event_interruptible_locked() expects for its @fn argument.
 * NOTE(review): presumably these are the functions passed as @fn by the
 * wait_event_interruptible_locked*() wrappers -- confirm at the call sites.
 */
extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *);
extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *);

/*
 * Common body of the wait_event_interruptible*_locked*() macros.
 *
 * Declares a local wait entry with DEFINE_WAIT(), marks it
 * WQ_FLAG_EXCLUSIVE when @exclusive is non-zero, then keeps calling
 * @fn(&wq, &entry) until either @fn returns non-zero or @condition
 * becomes true.  Afterwards the entry is removed with
 * __remove_wait_queue(), the task state is set back to TASK_RUNNING and
 * the last value returned by @fn is the value of the expression (0 when
 * the loop ended because @condition became true).
 */
#define __wait_event_interruptible_locked(wq, condition, exclusive, fn)                \
({                                                                                \
        int __ret;                                                                \
        DEFINE_WAIT(__wait);                                                        \
        if (exclusive)                                                                \
                __wait.flags |= WQ_FLAG_EXCLUSIVE;                                \
        do {                                                                        \
                __ret = fn(&(wq), &__wait);                                        \
                if (__ret)                                                        \
                        break;                                                        \
        } while (!(condition));                                                        \
        __remove_wait_queue(&(wq), &__wait);                                        \
        __set_current_state(TASK_RUNNING);                                        \
        __ret;                                                                        \
})


/**
 * wait_event_interruptible_locked - sleep until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * It must be called with wq.lock being held.  This spinlock is
 * unlocked while sleeping but @condition testing is done while lock
 * is held and when this macro exits the lock is held.
 *
 * The lock is locked/unlocked using spin_lock()/spin_unlock()
 * functions which must match the way they are locked/unlocked outside
 * of this macro.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * @condition is tested once up front; if it already holds, the macro
 * evaluates to 0 and the wait helper is not called.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_locked(wq, condition)                                \
        ((condition)                                                                \
         ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr))

/**
 * wait_event_interruptible_locked_irq - sleep until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * It must be called with wq.lock being held.  This spinlock is
 * unlocked while sleeping but @condition testing is done while lock
 * is held and when this macro exits the lock is held.
 *
 * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq()
 * functions which must match the way they are locked/unlocked outside
 * of this macro.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * @condition is tested once up front; if it already holds, the macro
 * evaluates to 0 and the wait helper is not called.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_locked_irq(wq, condition)                        \
        ((condition)                                                                \
         ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq))

/**
 * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * It must be called with wq.lock being held.  This spinlock is
 * unlocked while sleeping but @condition testing is done while lock
 * is held and when this macro exits the lock is held.
 *
 * The lock is locked/unlocked using spin_lock()/spin_unlock()
 * functions which must match the way they are locked/unlocked outside
 * of this macro.
 *
 * The process is put on the wait queue with a WQ_FLAG_EXCLUSIVE flag
 * set, thus if other processes wait on the same list, when this
 * process is woken further processes are not considered.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_exclusive_locked(wq, condition)                \
        ((condition)                                                                \
         ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr))

/**
 * wait_event_interruptible_exclusive_locked_irq - sleep exclusively until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * It must be called with wq.lock being held.  This spinlock is
 * unlocked while sleeping but @condition testing is done while lock
 * is held and when this macro exits the lock is held.
 *
 * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq()
 * functions which must match the way they are locked/unlocked outside
 * of this macro.
 *
 * The process is put on the wait queue with a WQ_FLAG_EXCLUSIVE flag
 * set, thus if other processes wait on the same list, when this
 * process is woken further processes are not considered.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_exclusive_locked_irq(wq, condition)                \
        ((condition)                                                                \
         ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq))


/*
 * Slow path of wait_event_killable(): delegates to ___wait_event() with
 * TASK_KILLABLE and a plain schedule() as the sleep step.
 */
#define __wait_event_killable(wq, condition)                                        \
        ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule())

/**
 * wait_event_killable - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_KILLABLE) until the
 * @condition evaluates to true or a kill signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * might_sleep() is called first; if @condition is already true the macro
 * evaluates to 0 without using the waitqueue.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_killable(wq_head, condition)                                        \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_killable(wq_head, condition);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_state(): delegates to ___wait_event() with the
 * caller-supplied @state and a plain schedule() as the sleep step.
 */
#define __wait_event_state(wq, condition, state)                                \
        ___wait_event(wq, condition, state, 0, 0, schedule())

/**
 * wait_event_state - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @state: state to sleep in
 *
 * The process is put to sleep (@state) until the @condition evaluates to true
 * or a signal is received (when allowed by @state).  The @condition is checked
 * each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * might_sleep() is called first; if @condition is already true the macro
 * evaluates to 0 without using the waitqueue.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a signal
 * (when allowed by @state) and 0 if @condition evaluated to true.
 */
#define wait_event_state(wq_head, condition, state)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_state(wq_head, condition, state);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_killable_timeout(): delegates to ___wait_event()
 * with TASK_KILLABLE, passes @timeout through, and uses a sleep step that
 * stores the result of schedule_timeout(__ret) back into __ret.
 */
#define __wait_event_killable_timeout(wq_head, condition, timeout)                \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_KILLABLE, 0, timeout,                                \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_KILLABLE) until the
 * @condition evaluates to true or a kill signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * might_sleep() is called first; the waitqueue is only used when the
 * initial ___wait_cond_timeout(@condition) check fails.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
 * interrupted by a kill signal.
 *
 * Only kill signals interrupt this process.
 */
#define wait_event_killable_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_killable_timeout(wq_head,                        \
                                                condition, timeout);                \
        __ret;                                                                        \
})


/*
 * Slow path of wait_event_lock_irq() and wait_event_lock_irq_cmd():
 * delegates to ___wait_event() with TASK_UNINTERRUPTIBLE.  The sleep step
 * drops @lock with spin_unlock_irq(), runs @cmd (which may be empty),
 * calls schedule() and then retakes @lock with spin_lock_irq().  The
 * result of ___wait_event() is discarded via the (void) cast.
 */
#define __wait_event_lock_irq(wq_head, condition, lock, cmd)                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            spin_unlock_irq(&lock);                                \
                            cmd;                                                \
                            schedule();                                                \
                            spin_lock_irq(&lock))

/**
 * wait_event_lock_irq_cmd - sleep until a condition gets true. The
 *                             condition is checked under the lock. This
 *                             is expected to be called with the lock
 *                             taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before cmd
 *          and schedule() and reacquired afterwards.
 * @cmd: a command which is invoked outside the critical section before
 *         sleep
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before invoking the cmd and going to sleep and is reacquired
 * afterwards.
 *
 * If @condition is already true on entry, neither @cmd nor the lock
 * drop/reacquire happens. The macro is a statement and has no value.
 */
#define wait_event_lock_irq_cmd(wq_head, condition, lock, cmd)                        \
do {                                                                                \
        if (condition)                                                                \
                break;                                                                \
        __wait_event_lock_irq(wq_head, condition, lock, cmd);                        \
} while (0)

/**
 * wait_event_lock_irq - sleep until a condition gets true. The
 *                         condition is checked under the lock. This
 *                         is expected to be called with the lock
 *                         taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards.
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 *
 * Same as wait_event_lock_irq_cmd() with an empty command. The macro is
 * a statement and has no value.
 */
#define wait_event_lock_irq(wq_head, condition, lock)                                \
do {                                                                                \
        if (condition)                                                                \
                break;                                                                \
        __wait_event_lock_irq(wq_head, condition, lock, );                        \
} while (0)


/*
 * Slow path of wait_event_interruptible_lock_irq() and
 * wait_event_interruptible_lock_irq_cmd(): delegates to ___wait_event()
 * with TASK_INTERRUPTIBLE.  The sleep step drops @lock with
 * spin_unlock_irq(), runs @cmd (which may be empty), calls schedule() and
 * then retakes @lock with spin_lock_irq().  Unlike __wait_event_lock_irq()
 * the result of ___wait_event() is kept as the value of the expression.
 */
#define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd)        \
        ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,                \
                      spin_unlock_irq(&lock);                                        \
                      cmd;                                                        \
                      schedule();                                                \
                      spin_lock_irq(&lock))

/**
 * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true.
 *                The condition is checked under the lock. This is expected to
 *                be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before cmd and
 *          schedule() and reacquired afterwards.
 * @cmd: a command which is invoked outside the critical section before
 *         sleep
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received. The @condition is
 * checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before invoking the cmd and going to sleep and is reacquired
 * afterwards.
 *
 * If @condition is already true on entry, the macro evaluates to 0 and
 * neither @cmd nor the lock drop/reacquire happens.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a signal
 * and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_lock_irq_cmd(wq_head, condition, lock, cmd)        \
({                                                                                \
        int __ret = 0;                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible_lock_irq(wq_head,                \
                                                condition, lock, cmd);                \
        __ret;                                                                        \
})

/**
 * wait_event_interruptible_lock_irq - sleep until a condition gets true.
 *                The condition is checked under the lock. This is expected
 *                to be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards.
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received. The @condition is
 * checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a signal
 * and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_lock_irq(wq_head, condition, lock)                \
({                                                                                \
        int __ret = 0;                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible_lock_irq(wq_head,                \
                                                condition, lock,);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_lock_irq_timeout() and
 * wait_event_interruptible_lock_irq_timeout(): delegates to
 * ___wait_event() with the caller-supplied @state and passes @timeout
 * through.  The sleep step drops @lock with spin_unlock_irq(), stores
 * the result of schedule_timeout(__ret) back into __ret and then retakes
 * @lock with spin_lock_irq().
 *
 * The expansion deliberately carries no trailing semicolon, so that, like
 * the other __wait_event_*() helpers, it is a single expression usable on
 * the right-hand side of an assignment or inside a larger expression.
 */
#define __wait_event_lock_irq_timeout(wq_head, condition, lock, timeout, state)        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      state, 0, timeout,                                        \
                      spin_unlock_irq(&lock);                                        \
                      __ret = schedule_timeout(__ret);                                \
                      spin_lock_irq(&lock))

/**
 * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets
 *                true or a timeout elapses. The condition is checked under
 *                the lock. This is expected to be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards.
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received. The @condition is
 * checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 *
 * The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it
 * was interrupted by a signal, and the remaining jiffies otherwise
 * if the condition evaluated to true before the timeout elapsed.
 */
#define wait_event_interruptible_lock_irq_timeout(wq_head, condition, lock,        \
                                                  timeout)                        \
({                                                                                \
        long __ret = timeout;                                                        \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_lock_irq_timeout(                                \
                                        wq_head, condition, lock, timeout,        \
                                        TASK_INTERRUPTIBLE);                        \
        __ret;                                                                        \
})

/*
 * wait_event_lock_irq_timeout - TASK_UNINTERRUPTIBLE counterpart of
 * wait_event_interruptible_lock_irq_timeout().
 *
 * Takes the same @wq_head, @condition, @lock and @timeout arguments and
 * uses the same __wait_event_lock_irq_timeout() helper, but sleeps in
 * TASK_UNINTERRUPTIBLE.  The result is a long that starts out as @timeout;
 * the helper is only entered when the initial
 * ___wait_cond_timeout(@condition) check fails.
 *
 * NOTE(review): presumably evaluates to 0 when @timeout elapsed and to the
 * remaining jiffies otherwise, as documented for the interruptible
 * variant (without the signal case) - confirm against ___wait_event().
 */
#define wait_event_lock_irq_timeout(wq_head, condition, lock, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_lock_irq_timeout(                                \
                                        wq_head, condition, lock, timeout,        \
                                        TASK_UNINTERRUPTIBLE);                        \
        __ret;                                                                        \
})

/*
 * Waitqueues which are removed from the waitqueue_head at wakeup time
 */
void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);

/*
 * DEFINE_WAIT_FUNC - define a struct wait_queue_entry variable called @name
 * with .private set to current, .func set to @function and .entry
 * initialised with LIST_HEAD_INIT().  Members not named in the initialiser
 * are left to C's default (zero) initialisation.
 */
#define DEFINE_WAIT_FUNC(name, function)                                        \
        struct wait_queue_entry name = {                                        \
                .private        = current,                                        \
                .func                = function,                                        \
                .entry                = LIST_HEAD_INIT((name).entry),                        \
        }

/* DEFINE_WAIT - DEFINE_WAIT_FUNC() with autoremove_wake_function as .func. */
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)

/*
 * init_wait - initialise an existing wait queue entry at run time.
 * @wait: pointer to the entry to set up
 *
 * Sets the same fields as DEFINE_WAIT(): ->private becomes current,
 * ->func becomes autoremove_wake_function and ->entry is initialised as an
 * empty list head.  ->flags is explicitly reset to 0.
 */
#define init_wait(wait)                                                                \
        do {                                                                        \
                (wait)->private = current;                                        \
                (wait)->func = autoremove_wake_function;                        \
                INIT_LIST_HEAD(&(wait)->entry);                                        \
                (wait)->flags = 0;                                                \
        } while (0)

typedef int (*task_call_f)(struct task_struct *p, void *arg);
extern int task_call_func(struct task_struct *p, task_call_f func, void *arg);

#endif /* _LINUX_WAIT_H */













































































    2 





    2 











































    3 









    4 










    5 


    3 

    2 

    3 

























































    2 















    2 























































































































































    1 






    2 


























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
// SPDX-License-Identifier: GPL-2.0

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_bridge.h>
#include <net/netfilter/nf_log.h>

#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/sysctl.h>
#include <net/route.h>
#include <net/ip.h>

#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>

#include <linux/ipv6.h>
#include <linux/in6.h>
#include <net/ipv6.h>
#include <net/inet_frag.h>

static DEFINE_MUTEX(nf_ct_proto_mutex);

#ifdef CONFIG_SYSCTL
/* Log an "invalid packet" message for layer-4 protocol @protonum.
 *
 * Nothing is logged unless the per-netns sysctl_log_invalid value equals
 * either @protonum or IPPROTO_RAW.  The printf-style @fmt and its
 * arguments are forwarded to nf_log_packet() through a struct va_format
 * and the %pV specifier, prefixed with "nf_ct_proto_<protonum>: ".
 */
__printf(4, 5)
void nf_l4proto_log_invalid(const struct sk_buff *skb,
                            const struct nf_hook_state *state,
                            u8 protonum,
                            const char *fmt, ...)
{
        struct net *ns = state->net;
        struct va_format msg;
        va_list ap;

        /* Neither this protocol nor the IPPROTO_RAW setting is selected. */
        if (!(ns->ct.sysctl_log_invalid == protonum ||
              ns->ct.sysctl_log_invalid == IPPROTO_RAW))
                return;

        va_start(ap, fmt);
        msg.fmt = fmt;
        msg.va = &ap;
        nf_log_packet(ns, state->pf, 0, skb, state->in, state->out,
                      NULL, "nf_ct_proto_%d: %pV ", protonum, &msg);
        va_end(ap);
}
EXPORT_SYMBOL_GPL(nf_l4proto_log_invalid);

/* Conntrack-entry flavour of nf_l4proto_log_invalid().
 *
 * Returns immediately when sysctl_log_invalid of @ct's netns is 0.
 * Otherwise the printf-style @fmt and its arguments are wrapped in a
 * struct va_format and handed to nf_l4proto_log_invalid() together with
 * the protocol number taken from @ct.
 */
__printf(4, 5)
void nf_ct_l4proto_log_invalid(const struct sk_buff *skb,
                               const struct nf_conn *ct,
                               const struct nf_hook_state *state,
                               const char *fmt, ...)
{
        struct va_format msg;
        va_list ap;

        /* Expected common case: invalid-packet logging is not enabled. */
        if (likely(nf_ct_net(ct)->ct.sysctl_log_invalid == 0))
                return;

        va_start(ap, fmt);
        msg.fmt = fmt;
        msg.va = &ap;
        nf_l4proto_log_invalid(skb, state, nf_ct_protonum(ct), "%pV", &msg);
        va_end(ap);
}
EXPORT_SYMBOL_GPL(nf_ct_l4proto_log_invalid);
#endif

/* Map an IP protocol number to its conntrack layer-4 tracker.
 *
 * @l4proto: IP protocol number (IPPROTO_*)
 *
 * Returns the matching built-in tracker.  DCCP, SCTP, UDP-Lite, GRE and
 * ICMPv6 are only recognised when the corresponding config option is
 * enabled.  Every other value gets the generic tracker, so the result is
 * never NULL.
 */
const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto)
{
        switch (l4proto) {
        case IPPROTO_UDP: return &nf_conntrack_l4proto_udp;
        case IPPROTO_TCP: return &nf_conntrack_l4proto_tcp;
        case IPPROTO_ICMP: return &nf_conntrack_l4proto_icmp;
#ifdef CONFIG_NF_CT_PROTO_DCCP
        case IPPROTO_DCCP: return &nf_conntrack_l4proto_dccp;
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
        case IPPROTO_SCTP: return &nf_conntrack_l4proto_sctp;
#endif
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
        case IPPROTO_UDPLITE: return &nf_conntrack_l4proto_udplite;
#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
        case IPPROTO_GRE: return &nf_conntrack_l4proto_gre;
#endif
#if IS_ENABLED(CONFIG_IPV6)
        case IPPROTO_ICMPV6: return &nf_conntrack_l4proto_icmpv6;
#endif /* CONFIG_IPV6 */
        default:
                /* no dedicated tracker: use the generic one below */
                break;
        }

        return &nf_conntrack_l4proto_generic;
}
EXPORT_SYMBOL_GPL(nf_ct_l4proto_find);

/* Tell whether this hook invocation is POST_ROUTING on an L3 master
 * output device.  Always false when CONFIG_NET_L3_MASTER_DEV is not
 * enabled.
 */
static bool in_vrf_postrouting(const struct nf_hook_state *state)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        return state->hook == NF_INET_POST_ROUTING &&
               netif_is_l3_master(state->out);
#else
        return false;
#endif
}

/* Netfilter hook that runs the conntrack helper and sequence adjustment
 * for a packet (when either applies) and then confirms its conntrack
 * entry.
 *
 * Packets without a conntrack entry, and packets for which
 * in_vrf_postrouting() is true, are accepted untouched.  When there is
 * neither a helper extension nor a pending sequence adjustment, for
 * IP_CT_RELATED_REPLY packets, for unknown L3 protocols and for IPv6
 * packets whose transport header cannot be located (or that are non-first
 * fragments), the packet goes straight to nf_conntrack_confirm().
 *
 * Returns the helper's verdict if it is not NF_ACCEPT, NF_DROP if
 * nf_ct_seq_adjust() fails, and otherwise whatever nf_conntrack_confirm()
 * returns.  @priv is unused.
 */
unsigned int nf_confirm(void *priv,
                        struct sk_buff *skb,
                        const struct nf_hook_state *state)
{
        const struct nf_conn_help *help;
        enum ip_conntrack_info ctinfo;
        unsigned int protoff;
        struct nf_conn *ct;
        bool seqadj_needed;

        ct = nf_ct_get(skb, &ctinfo);
        if (!ct)
                return NF_ACCEPT;
        if (in_vrf_postrouting(state))
                return NF_ACCEPT;

        help = nfct_help(ct);
        seqadj_needed = test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
                        !nf_is_loopback_packet(skb);

        /* No helper and no sequence adjustment: only confirmation is left */
        if (!(help || seqadj_needed))
                goto confirm;

        /* helper->help() do not expect ICMP packets */
        if (ctinfo == IP_CT_RELATED_REPLY)
                goto confirm;

        /* Locate the transport header for the helper / seqadj code */
        switch (nf_ct_l3num(ct)) {
        case NFPROTO_IPV4:
                protoff = skb_network_offset(skb) + ip_hdrlen(skb);
                break;
        case NFPROTO_IPV6: {
                u8 nexthdr = ipv6_hdr(skb)->nexthdr;
                __be16 frag_off;
                int l4_start;

                l4_start = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
                                            &nexthdr, &frag_off);
                if (l4_start < 0 || (frag_off & htons(~0x7)) != 0)
                        goto confirm;

                protoff = l4_start;
                break;
        }
        default:
                goto confirm;
        }

        if (help) {
                /* rcu_read_lock()ed by nf_hook */
                const struct nf_conntrack_helper *helper =
                        rcu_dereference(help->helper);

                if (helper) {
                        int verdict = helper->help(skb, protoff, ct, ctinfo);

                        if (verdict != NF_ACCEPT)
                                return verdict;
                }
        }

        if (seqadj_needed && !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
                NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
                return NF_DROP;
        }

confirm:
        /* We've seen it coming out the other side: confirm it */
        return nf_conntrack_confirm(skb);
}
EXPORT_SYMBOL_GPL(nf_confirm);

/* IPv4 hook function registered for NF_INET_PRE_ROUTING in
 * ipv4_conntrack_ops: passes the packet straight to nf_conntrack_in()
 * and returns its verdict. @priv is not used.
 */
static unsigned int ipv4_conntrack_in(void *priv,
                                      struct sk_buff *skb,
                                      const struct nf_hook_state *state)
{
        return nf_conntrack_in(skb, state);
}

/* IPv4 hook function registered for NF_INET_LOCAL_OUT.
 *
 * Unfragmented packets are handed to nf_conntrack_in(). A packet that is
 * itself a fragment (IP_NODEFRAG setsockopt set) is accepted without being
 * tracked; if a conntrack template is attached to it, the template is
 * detached and released first. @priv is not used.
 */
static unsigned int ipv4_conntrack_local(void *priv,
                                         struct sk_buff *skb,
                                         const struct nf_hook_state *state)
{
        enum ip_conntrack_info ctinfo;
        struct nf_conn *ct;

        /* Whole datagrams take the regular tracking path. */
        if (!ip_is_fragment(ip_hdr(skb)))
                return nf_conntrack_in(skb, state);

        /* Tracking is skipped for this fragment. A template left on the
         * skb would fool later targets/matches, so clear it.
         */
        ct = nf_ct_get(skb, &ctinfo);
        if (ct && nf_ct_is_template(ct)) {
                skb->_nfct = 0;
                nf_ct_put(ct);
        }

        return NF_ACCEPT;
}

/* IPv4 conntrack hook table, registered as a unit by nf_ct_netns_do_get()
 * and unregistered by nf_ct_netns_do_put().
 *
 * Connection tracking may drop packets, but never alters them, so
 * make it the first hook.
 */
static const struct nf_hook_ops ipv4_conntrack_ops[] = {
        {
                /* track packets entering at PRE_ROUTING */
                .hook                = ipv4_conntrack_in,
                .pf                = NFPROTO_IPV4,
                .hooknum        = NF_INET_PRE_ROUTING,
                .priority        = NF_IP_PRI_CONNTRACK,
        },
        {
                /* track locally generated packets */
                .hook                = ipv4_conntrack_local,
                .pf                = NFPROTO_IPV4,
                .hooknum        = NF_INET_LOCAL_OUT,
                .priority        = NF_IP_PRI_CONNTRACK,
        },
        {
                /* confirm on the way out */
                .hook                = nf_confirm,
                .pf                = NFPROTO_IPV4,
                .hooknum        = NF_INET_POST_ROUTING,
                .priority        = NF_IP_PRI_CONNTRACK_CONFIRM,
        },
        {
                /* confirm on local delivery */
                .hook                = nf_confirm,
                .pf                = NFPROTO_IPV4,
                .hooknum        = NF_INET_LOCAL_IN,
                .priority        = NF_IP_PRI_CONNTRACK_CONFIRM,
        },
};

/* getsockopt handler behind so_getorigdst (SO_ORIGINAL_DST, PF_INET).
 *
 * Fast function for those who don't want to parse /proc (and I don't
 * blame them). Reversing the socket's dst/src point of view gives us
 * the reply mapping: the lookup tuple is built from the socket's local
 * address/port as source and its peer address/port as destination, and
 * the destination of the matching entry's ORIGINAL direction is returned.
 *
 * @sk:     socket being queried; only TCP and SCTP sockets are supported
 * @optval: not used
 * @user:   userspace buffer that receives a struct sockaddr_in
 * @len:    size of @user as supplied by the caller
 *
 * Returns 0 on success, -ENOPROTOOPT for other protocols, -EINVAL if the
 * buffer is too small (or @len is negative), -ENOENT if no conntrack entry
 * matches in the default zone, -EFAULT if the copy to userspace fails.
 */
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct nf_conntrack_tuple_hash *h;
        struct nf_conntrack_tuple tuple;
        struct sockaddr_in sin;
        struct nf_conn *ct;

        memset(&tuple, 0, sizeof(tuple));

        /* Snapshot the socket's addressing while it is locked. */
        lock_sock(sk);
        tuple.src.u3.ip = inet->inet_rcv_saddr;
        tuple.src.u.tcp.port = inet->inet_sport;
        tuple.dst.u3.ip = inet->inet_daddr;
        tuple.dst.u.tcp.port = inet->inet_dport;
        tuple.src.l3num = PF_INET;
        tuple.dst.protonum = sk->sk_protocol;
        release_sock(sk);

        /* We only do TCP and SCTP at the moment: is there a better way? */
        if (tuple.dst.protonum != IPPROTO_TCP &&
            tuple.dst.protonum != IPPROTO_SCTP)
                return -ENOPROTOOPT;

        /* A negative length turns into a huge value once cast to unsigned
         * and would pass the size comparison, so reject it explicitly,
         * exactly as ipv6_getorigdst() does.
         */
        if (*len < 0 || (unsigned int)*len < sizeof(struct sockaddr_in))
                return -EINVAL;

        h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
        if (!h)
                return -ENOENT;

        ct = nf_ct_tuplehash_to_ctrack(h);

        sin.sin_family = AF_INET;
        sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                .tuple.dst.u.tcp.port;
        sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                .tuple.dst.u3.ip;
        memset(sin.sin_zero, 0, sizeof(sin.sin_zero));

        /* Everything needed has been copied into sin; drop the reference
         * obtained from nf_conntrack_find_get().
         */
        nf_ct_put(ct);

        if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                return -EFAULT;

        return 0;
}

/* Socket-option descriptor that routes SO_ORIGINAL_DST getsockopt calls on
 * PF_INET sockets to getorigdst(). Registered in nf_conntrack_proto_init()
 * and removed in nf_conntrack_proto_fini().
 * NOTE(review): the +1 on get_optmax suggests an exclusive upper bound --
 * confirm against the nf_sockopt implementation.
 */
static struct nf_sockopt_ops so_getorigdst = {
        .pf                = PF_INET,
        .get_optmin        = SO_ORIGINAL_DST,
        .get_optmax        = SO_ORIGINAL_DST + 1,
        .get                = getorigdst,
        .owner                = THIS_MODULE,
};

#if IS_ENABLED(CONFIG_IPV6)
/* getsockopt handler behind so_getorigdst6 (IP6T_SO_ORIGINAL_DST).
 *
 * Builds a lookup tuple from the socket's own address/port (source) and
 * its peer address/port (destination), finds the conntrack entry in the
 * default zone and reports the destination of that entry's ORIGINAL
 * direction as a struct sockaddr_in6.
 *
 * @sk:     socket being queried; only TCP and SCTP sockets are supported
 * @optval: not used
 * @user:   userspace buffer that receives the struct sockaddr_in6
 * @len:    size of @user as supplied by the caller
 *
 * Returns 0 on success, -ENOPROTOOPT for other protocols, -EINVAL for a
 * negative or too small @len, -ENOENT if no entry matches, -EFAULT if the
 * copy to userspace fails.
 */
static int
ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
        struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 };
        const struct ipv6_pinfo *np = inet6_sk(sk);
        const struct inet_sock *isk = inet_sk(sk);
        const struct nf_conntrack_tuple_hash *hash;
        const struct nf_conntrack_tuple *orig;
        struct sockaddr_in6 sin6;
        struct nf_conn *ct;
        __be32 flow_label;
        int bound_dev_if;

        /* Snapshot everything needed from the socket while it is locked. */
        lock_sock(sk);
        tuple.src.u3.in6 = sk->sk_v6_rcv_saddr;
        tuple.src.u.tcp.port = isk->inet_sport;
        tuple.dst.u3.in6 = sk->sk_v6_daddr;
        tuple.dst.u.tcp.port = isk->inet_dport;
        tuple.dst.protonum = sk->sk_protocol;
        bound_dev_if = sk->sk_bound_dev_if;
        flow_label = np->flow_label;
        release_sock(sk);

        switch (tuple.dst.protonum) {
        case IPPROTO_TCP:
        case IPPROTO_SCTP:
                break;
        default:
                return -ENOPROTOOPT;
        }

        if (*len < 0 || (unsigned int)*len < sizeof(sin6))
                return -EINVAL;

        hash = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
        if (!hash)
                return -ENOENT;

        ct = nf_ct_tuplehash_to_ctrack(hash);
        orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;

        sin6.sin6_family = AF_INET6;
        sin6.sin6_port = orig->dst.u.tcp.port;
        sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK;
        sin6.sin6_addr = orig->dst.u3.in6;

        /* sin6 holds its own copy now; release the looked-up entry. */
        nf_ct_put(ct);
        sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if);

        if (copy_to_user(user, &sin6, sizeof(sin6)))
                return -EFAULT;

        return 0;
}

/* Socket-option descriptor that routes IP6T_SO_ORIGINAL_DST getsockopt
 * calls for NFPROTO_IPV6 to ipv6_getorigdst(). Registered in
 * nf_conntrack_proto_init() and removed in nf_conntrack_proto_fini().
 */
static struct nf_sockopt_ops so_getorigdst6 = {
        .pf                = NFPROTO_IPV6,
        .get_optmin        = IP6T_SO_ORIGINAL_DST,
        .get_optmax        = IP6T_SO_ORIGINAL_DST + 1,
        .get                = ipv6_getorigdst,
        .owner                = THIS_MODULE,
};

/* IPv6 hook function registered for NF_INET_PRE_ROUTING in
 * ipv6_conntrack_ops: passes the packet to nf_conntrack_in() and returns
 * its verdict. @priv is not used.
 */
static unsigned int ipv6_conntrack_in(void *priv,
                                      struct sk_buff *skb,
                                      const struct nf_hook_state *state)
{
        return nf_conntrack_in(skb, state);
}

/* IPv6 hook function registered for NF_INET_LOCAL_OUT in
 * ipv6_conntrack_ops. Unlike ipv4_conntrack_local() it has no fragment
 * special case: every packet goes to nf_conntrack_in(). @priv is not used.
 */
static unsigned int ipv6_conntrack_local(void *priv,
                                         struct sk_buff *skb,
                                         const struct nf_hook_state *state)
{
        return nf_conntrack_in(skb, state);
}

/* IPv6 conntrack hook table, registered as a unit by nf_ct_netns_do_get()
 * and unregistered by nf_ct_netns_do_put().
 */
static const struct nf_hook_ops ipv6_conntrack_ops[] = {
        {
                /* track packets entering at PRE_ROUTING */
                .hook                = ipv6_conntrack_in,
                .pf                = NFPROTO_IPV6,
                .hooknum        = NF_INET_PRE_ROUTING,
                .priority        = NF_IP6_PRI_CONNTRACK,
        },
        {
                /* track locally generated packets */
                .hook                = ipv6_conntrack_local,
                .pf                = NFPROTO_IPV6,
                .hooknum        = NF_INET_LOCAL_OUT,
                .priority        = NF_IP6_PRI_CONNTRACK,
        },
        {
                /* confirm on the way out */
                .hook                = nf_confirm,
                .pf                = NFPROTO_IPV6,
                .hooknum        = NF_INET_POST_ROUTING,
                .priority        = NF_IP6_PRI_LAST,
        },
        {
                /* confirm on local delivery */
                .hook                = nf_confirm,
                .pf                = NFPROTO_IPV6,
                .hooknum        = NF_INET_LOCAL_IN,
                .priority        = NF_IP6_PRI_LAST - 1,
        },
};
#endif

/* Iterator callback used by nf_ct_netns_do_get() after hooks for a family
 * have been newly registered.
 *
 * @ct:       conntrack entry being visited
 * @_nfproto: L3 protocol number smuggled through the void pointer
 *
 * For entries of that L3 protocol which are TCP and in the ESTABLISHED
 * state, td_maxwin is reset to 0 for both directions. All other entries
 * are left untouched. Always returns 0.
 */
static int nf_ct_tcp_fixup(struct nf_conn *ct, void *_nfproto)
{
        u8 l3proto = (unsigned long)_nfproto;

        if (nf_ct_l3num(ct) != l3proto)
                return 0;

        if (nf_ct_protonum(ct) != IPPROTO_TCP)
                return 0;

        if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
                return 0;

        ct->proto.tcp.seen[0].td_maxwin = 0;
        ct->proto.tcp.seen[1].td_maxwin = 0;

        return 0;
}

/* Bridge conntrack backend descriptor. Set by nf_ct_bridge_register() and
 * reset to NULL by nf_ct_bridge_unregister(), both under nf_ct_proto_mutex;
 * NULL means no bridge backend is currently registered.
 */
static struct nf_ct_bridge_info *nf_ct_bridge_info;

/* Take one conntrack user reference for a single family in @net.
 *
 * @net:     network namespace
 * @nfproto: NFPROTO_IPV4, NFPROTO_IPV6 (when CONFIG_IPV6 is enabled) or
 *           NFPROTO_BRIDGE
 *
 * The first user of a family registers that family's hooks; for IPv4 and
 * IPv6 defragmentation is enabled beforehand. Later users only bump the
 * per-netns counter. For NFPROTO_BRIDGE the nf_conntrack_bridge module is
 * requested once if no backend is registered yet, and one module reference
 * is taken per successful get (nf_ct_netns_do_put() drops it again).
 *
 * If hooks were newly registered, nf_ct_tcp_fixup() is run over the
 * conntrack entries of @net after the mutex has been released.
 *
 * On failure nothing stays acquired: the user counter is reset and any
 * defrag user or module reference taken during this call is released.
 *
 * Returns 0 on success, -EPROTO for an unsupported family or an
 * unavailable bridge backend, otherwise the error reported by the
 * defrag-enable or hook-registration helper.
 */
static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
{
        struct nf_conntrack_net *cnet = nf_ct_pernet(net);
        bool fixup_needed = false, retry = true;
        int err = 0;
retry:
        mutex_lock(&nf_ct_proto_mutex);

        switch (nfproto) {
        case NFPROTO_IPV4:
                cnet->users4++;
                if (cnet->users4 > 1)
                        goto out_unlock;
                err = nf_defrag_ipv4_enable(net);
                if (err) {
                        cnet->users4 = 0;
                        goto out_unlock;
                }

                err = nf_register_net_hooks(net, ipv4_conntrack_ops,
                                            ARRAY_SIZE(ipv4_conntrack_ops));
                if (err) {
                        /* Give back the defrag user taken just above;
                         * the caller will never call put for a failed get.
                         */
                        nf_defrag_ipv4_disable(net);
                        cnet->users4 = 0;
                } else {
                        fixup_needed = true;
                }
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case NFPROTO_IPV6:
                cnet->users6++;
                if (cnet->users6 > 1)
                        goto out_unlock;
                err = nf_defrag_ipv6_enable(net);
                if (err < 0) {
                        cnet->users6 = 0;
                        goto out_unlock;
                }

                err = nf_register_net_hooks(net, ipv6_conntrack_ops,
                                            ARRAY_SIZE(ipv6_conntrack_ops));
                if (err) {
                        /* Same as IPv4: undo the defrag enable. */
                        nf_defrag_ipv6_disable(net);
                        cnet->users6 = 0;
                } else {
                        fixup_needed = true;
                }
                break;
#endif
        case NFPROTO_BRIDGE:
                if (!nf_ct_bridge_info) {
                        /* Only one attempt at loading the backend. */
                        if (!retry) {
                                err = -EPROTO;
                                goto out_unlock;
                        }
                        /* request_module() must run without the mutex:
                         * the module registers itself through
                         * nf_ct_bridge_register(), which takes it.
                         */
                        mutex_unlock(&nf_ct_proto_mutex);
                        request_module("nf_conntrack_bridge");
                        retry = false;
                        goto retry;
                }
                if (!try_module_get(nf_ct_bridge_info->me)) {
                        err = -EPROTO;
                        goto out_unlock;
                }
                cnet->users_bridge++;
                if (cnet->users_bridge > 1)
                        goto out_unlock;

                err = nf_register_net_hooks(net, nf_ct_bridge_info->ops,
                                            nf_ct_bridge_info->ops_size);
                if (err) {
                        cnet->users_bridge = 0;
                        /* Drop the reference from try_module_get() above,
                         * otherwise the backend module stays pinned.
                         */
                        module_put(nf_ct_bridge_info->me);
                } else {
                        fixup_needed = true;
                }
                break;
        default:
                err = -EPROTO;
                break;
        }
 out_unlock:
        mutex_unlock(&nf_ct_proto_mutex);

        if (fixup_needed) {
                struct nf_ct_iter_data iter_data = {
                        .net        = net,
                        .data        = (void *)(unsigned long)nfproto,
                };
                nf_ct_iterate_cleanup_net(nf_ct_tcp_fixup, &iter_data);
        }

        return err;
}

/* Drop one conntrack user reference for a single family in @net.
 *
 * @net:     network namespace
 * @nfproto: NFPROTO_IPV4, NFPROTO_IPV6 (when CONFIG_IPV6 is enabled) or
 *           NFPROTO_BRIDGE; any other value is a no-op
 *
 * When the last user of a family goes away its hooks are unregistered;
 * for IPv4/IPv6 defragmentation is disabled afterwards. A counter that is
 * already zero is left alone. For NFPROTO_BRIDGE one module reference is
 * dropped on every call as long as a backend is registered.
 */
static void nf_ct_netns_do_put(struct net *net, u8 nfproto)
{
        struct nf_conntrack_net *cnet = nf_ct_pernet(net);

        mutex_lock(&nf_ct_proto_mutex);
        switch (nfproto) {
        case NFPROTO_IPV4:
                if (!cnet->users4)
                        break;
                cnet->users4--;
                if (cnet->users4 == 0) {
                        nf_unregister_net_hooks(net, ipv4_conntrack_ops,
                                                ARRAY_SIZE(ipv4_conntrack_ops));
                        nf_defrag_ipv4_disable(net);
                }
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case NFPROTO_IPV6:
                if (!cnet->users6)
                        break;
                cnet->users6--;
                if (cnet->users6 == 0) {
                        nf_unregister_net_hooks(net, ipv6_conntrack_ops,
                                                ARRAY_SIZE(ipv6_conntrack_ops));
                        nf_defrag_ipv6_disable(net);
                }
                break;
#endif
        case NFPROTO_BRIDGE:
                if (nf_ct_bridge_info) {
                        if (cnet->users_bridge) {
                                cnet->users_bridge--;
                                if (cnet->users_bridge == 0)
                                        nf_unregister_net_hooks(net,
                                                                nf_ct_bridge_info->ops,
                                                                nf_ct_bridge_info->ops_size);
                        }

                        /* Pairs with try_module_get() in the get path. */
                        module_put(nf_ct_bridge_info->me);
                }
                break;
        default:
                break;
        }
        mutex_unlock(&nf_ct_proto_mutex);
}

/* Take the IPv4 reference and, when CONFIG_IPV6 is enabled, the IPv6
 * reference as well. If the IPv6 step fails the IPv4 reference taken here
 * is released again, so on error nothing stays acquired.
 *
 * Returns the result of the last nf_ct_netns_do_get() call made.
 */
static int nf_ct_netns_inet_get(struct net *net)
{
        int ret = nf_ct_netns_do_get(net, NFPROTO_IPV4);

#if IS_ENABLED(CONFIG_IPV6)
        if (ret < 0)
                return ret;

        ret = nf_ct_netns_do_get(net, NFPROTO_IPV6);
        if (ret < 0)
                nf_ct_netns_put(net, NFPROTO_IPV4);
#endif
        return ret;
}

/* Take conntrack user references for @nfproto in @net.
 *
 * @net:     network namespace
 * @nfproto: NFPROTO_INET takes the IPv4 and (if enabled) IPv6 references;
 *           NFPROTO_BRIDGE takes the bridge reference plus the same inet
 *           references; any other value is passed to nf_ct_netns_do_get()
 *           unchanged.
 *
 * On failure no reference taken by this call is left behind.
 *
 * Returns 0 on success or a negative errno.
 */
int nf_ct_netns_get(struct net *net, u8 nfproto)
{
        int err;

        switch (nfproto) {
        case NFPROTO_INET:
                err = nf_ct_netns_inet_get(net);
                break;
        case NFPROTO_BRIDGE:
                err = nf_ct_netns_do_get(net, NFPROTO_BRIDGE);
                if (err < 0)
                        return err;

                err = nf_ct_netns_inet_get(net);
                if (err < 0) {
                        /* Only the bridge reference is held here:
                         * nf_ct_netns_inet_get() has already undone its
                         * own partial work. nf_ct_netns_put(NFPROTO_BRIDGE)
                         * would additionally drop IPv4/IPv6 references
                         * that belong to other users, so release just the
                         * bridge one.
                         */
                        nf_ct_netns_do_put(net, NFPROTO_BRIDGE);
                        return err;
                }
                break;
        default:
                err = nf_ct_netns_do_get(net, nfproto);
                break;
        }
        return err;
}
EXPORT_SYMBOL_GPL(nf_ct_netns_get);

/* Release the references taken by nf_ct_netns_get() for @nfproto.
 *
 * NFPROTO_BRIDGE drops the bridge reference and then, like NFPROTO_INET,
 * the IPv4 and IPv6 references. Any other value is passed to
 * nf_ct_netns_do_put() unchanged.
 */
void nf_ct_netns_put(struct net *net, uint8_t nfproto)
{
        bool is_bridge = nfproto == NFPROTO_BRIDGE;

        if (!is_bridge && nfproto != NFPROTO_INET) {
                nf_ct_netns_do_put(net, nfproto);
                return;
        }

        if (is_bridge)
                nf_ct_netns_do_put(net, NFPROTO_BRIDGE);

        nf_ct_netns_do_put(net, NFPROTO_IPV4);
        nf_ct_netns_do_put(net, NFPROTO_IPV6);
}
EXPORT_SYMBOL_GPL(nf_ct_netns_put);

/* Install @info as the bridge conntrack backend.
 *
 * Warns if a backend is already installed; that check runs before the
 * mutex is taken, and the pointer is overwritten regardless. The store
 * itself is done under nf_ct_proto_mutex.
 */
void nf_ct_bridge_register(struct nf_ct_bridge_info *info)
{
        WARN_ON(nf_ct_bridge_info);
        mutex_lock(&nf_ct_proto_mutex);
        nf_ct_bridge_info = info;
        mutex_unlock(&nf_ct_proto_mutex);
}
EXPORT_SYMBOL_GPL(nf_ct_bridge_register);

/* Remove the bridge conntrack backend.
 *
 * Warns if none is installed (checked before the mutex is taken), then
 * clears the global pointer under nf_ct_proto_mutex. @info is not used:
 * whatever backend is installed gets removed.
 */
void nf_ct_bridge_unregister(struct nf_ct_bridge_info *info)
{
        WARN_ON(!nf_ct_bridge_info);
        mutex_lock(&nf_ct_proto_mutex);
        nf_ct_bridge_info = NULL;
        mutex_unlock(&nf_ct_proto_mutex);
}
EXPORT_SYMBOL_GPL(nf_ct_bridge_unregister);

/* Register the original-destination socket options: so_getorigdst always,
 * so_getorigdst6 when CONFIG_IPV6 is enabled. If the IPv6 registration
 * fails, the IPv4 one is unregistered again.
 *
 * Returns the result of the last nf_register_sockopt() call made; a
 * negative value means failure.
 */
int nf_conntrack_proto_init(void)
{
        int err = nf_register_sockopt(&so_getorigdst);

        if (err < 0)
                return err;

#if IS_ENABLED(CONFIG_IPV6)
        err = nf_register_sockopt(&so_getorigdst6);
        if (err < 0)
                nf_unregister_sockopt(&so_getorigdst);
#endif

        return err;
}

/* Undo nf_conntrack_proto_init(): unregister so_getorigdst and, when
 * CONFIG_IPV6 is enabled, so_getorigdst6.
 */
void nf_conntrack_proto_fini(void)
{
        nf_unregister_sockopt(&so_getorigdst);
#if IS_ENABLED(CONFIG_IPV6)
        nf_unregister_sockopt(&so_getorigdst6);
#endif
}

/* Call the per-netns init routine of every protocol tracker for @net.
 * generic, UDP, TCP and ICMP are always initialised; ICMPv6, DCCP, SCTP
 * and GRE only when the corresponding config option is enabled.
 */
void nf_conntrack_proto_pernet_init(struct net *net)
{
        nf_conntrack_generic_init_net(net);
        nf_conntrack_udp_init_net(net);
        nf_conntrack_tcp_init_net(net);
        nf_conntrack_icmp_init_net(net);
#if IS_ENABLED(CONFIG_IPV6)
        nf_conntrack_icmpv6_init_net(net);
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
        nf_conntrack_dccp_init_net(net);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
        nf_conntrack_sctp_init_net(net);
#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
        nf_conntrack_gre_init_net(net);
#endif
}

/* "hashsize" module parameter backed by nf_conntrack_htable_size: writes
 * go through nf_conntrack_set_hashsize(), reads through param_get_uint();
 * permissions are 0600.
 */
module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
                  &nf_conntrack_htable_size, 0600);

/* Module aliases and metadata. */
MODULE_ALIAS("ip_conntrack");
MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6));
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("IPv4 and IPv6 connection tracking");































































































































































































    1 

































































































































    2 











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _IPV6_H
#define _IPV6_H

#include <uapi/linux/ipv6.h>
#include <linux/cache.h>

/* Lengths derived from the hdrlen field of the header @p points to:
 * ipv6_optlen() is (hdrlen + 1) * 8, ipv6_authlen() is (hdrlen + 2) * 4.
 * NOTE(review): presumably byte lengths of options/auth extension
 * headers -- confirm against the header definitions.
 */
#define ipv6_optlen(p)  (((p)->hdrlen+1) << 3)
#define ipv6_authlen(p) (((p)->hdrlen+2) << 2)
/*
 * Per-link IPv6 configuration options.
 *
 * The first six fields are wrapped in the ipv6_devconf_read_txrx
 * cacheline group; several later fields exist only when the matching
 * CONFIG_IPV6_* option is set.
 */
struct ipv6_devconf {
        /* RX & TX fastpath fields. */
        __cacheline_group_begin(ipv6_devconf_read_txrx);
        __s32                disable_ipv6;
        __s32                hop_limit;
        __s32                mtu6;
        __s32                forwarding;
        __s32                disable_policy;
        __s32                proxy_ndp;
        __cacheline_group_end(ipv6_devconf_read_txrx);

        __s32                accept_ra;
        __s32                accept_redirects;
        __s32                autoconf;
        __s32                dad_transmits;
        __s32                rtr_solicits;
        __s32                rtr_solicit_interval;
        __s32                rtr_solicit_max_interval;
        __s32                rtr_solicit_delay;
        __s32                force_mld_version;
        __s32                mldv1_unsolicited_report_interval;
        __s32                mldv2_unsolicited_report_interval;
        __s32                use_tempaddr;
        __s32                temp_valid_lft;
        __s32                temp_prefered_lft;
        __s32                regen_min_advance;
        __s32                regen_max_retry;
        __s32                max_desync_factor;
        __s32                max_addresses;
        __s32                accept_ra_defrtr;
        __u32                ra_defrtr_metric;
        __s32                accept_ra_min_hop_limit;
        __s32                accept_ra_min_lft;
        __s32                accept_ra_pinfo;
        __s32                ignore_routes_with_linkdown;
#ifdef CONFIG_IPV6_ROUTER_PREF
        __s32                accept_ra_rtr_pref;
        __s32                rtr_probe_interval;
#ifdef CONFIG_IPV6_ROUTE_INFO
        __s32                accept_ra_rt_info_min_plen;
        __s32                accept_ra_rt_info_max_plen;
#endif
#endif
        __s32                accept_source_route;
        __s32                accept_ra_from_local;
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        __s32                optimistic_dad;
        __s32                use_optimistic;
#endif
#ifdef CONFIG_IPV6_MROUTE
        atomic_t        mc_forwarding;
#endif
        __s32                drop_unicast_in_l2_multicast;
        __s32                accept_dad;
        __s32                force_tllao;
        __s32           ndisc_notify;
        __s32                suppress_frag_ndisc;
        __s32                accept_ra_mtu;
        __s32                drop_unsolicited_na;
        __s32                accept_untracked_na;
        /* secret plus a flag telling whether it has been set */
        struct ipv6_stable_secret {
                bool initialized;
                struct in6_addr secret;
        } stable_secret;
        __s32                use_oif_addrs_only;
        __s32                keep_addr_on_down;
        __s32                seg6_enabled;
#ifdef CONFIG_IPV6_SEG6_HMAC
        __s32                seg6_require_hmac;
#endif
        __u32                enhanced_dad;
        __u32                addr_gen_mode;
        __s32           ndisc_tclass;
        __s32                rpl_seg_enabled;
        __u32                ioam6_id;
        __u32                ioam6_id_wide;
        __u8                ioam6_enabled;
        __u8                ndisc_evict_nocarrier;
        __u8                ra_honor_pio_life;

        struct ctl_table_header *sysctl_header;
};

/* Pair of settings (disable_ipv6, autoconf) exported through the global
 * ipv6_defaults object declared below.
 * NOTE(review): presumably the boot-time defaults for the same-named
 * ipv6_devconf fields -- confirm where ipv6_defaults is defined.
 */
struct ipv6_params {
        __s32 disable_ipv6;
        __s32 autoconf;
};
extern struct ipv6_params ipv6_defaults;
#include <linux/tcp.h>
#include <linux/udp.h>

#include <net/inet_sock.h>

/* Return @skb's network header viewed as an IPv6 header. */
static inline struct ipv6hdr *ipv6_hdr(const struct sk_buff *skb)
{
        return (struct ipv6hdr *)skb_network_header(skb);
}

/* Return @skb's inner network header viewed as an IPv6 header. */
static inline struct ipv6hdr *inner_ipv6_hdr(const struct sk_buff *skb)
{
        return (struct ipv6hdr *)skb_inner_network_header(skb);
}

/* Return @skb's transport header viewed as an IPv6 header, i.e. for
 * packets whose transport payload starts with an IPv6 header.
 */
static inline struct ipv6hdr *ipipv6_hdr(const struct sk_buff *skb)
{
        return (struct ipv6hdr *)skb_transport_header(skb);
}

static inline unsigned int ipv6_transport_len(const struct sk_buff *skb)
{
        return ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr) -
               skb_network_header_len(skb);
}

/*
 * This structure contains the results of extension-header parsing,
 * stored as offsets from skb->nh. It is overlaid on skb->cb through
 * IP6CB(). The IP6SKB_* constants are the bit values kept in @flags.
 */

struct inet6_skb_parm {
        int                        iif;
        __be16                        ra;
        __u16                        dst0;
        __u16                        srcrt;
        __u16                        dst1;
        __u16                        lastopt;
        __u16                        nhoff;
        __u16                        flags;
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
        __u16                        dsthao;
#endif
        __u16                        frag_max_size;
        __u16                        srhoff;

#define IP6SKB_XFRM_TRANSFORMED        1
#define IP6SKB_FORWARDED        2
#define IP6SKB_REROUTED                4
#define IP6SKB_ROUTERALERT        8
#define IP6SKB_FRAGMENTED      16
#define IP6SKB_HOPBYHOP        32
#define IP6SKB_L3SLAVE         64
#define IP6SKB_JUMBOGRAM      128
#define IP6SKB_SEG6              256
#define IP6SKB_FAKEJUMBO      512
#define IP6SKB_MULTIPATH      1024
};

/* Test the IP6SKB_L3SLAVE bit in an inet6_skb_parm flags word. Without
 * CONFIG_NET_L3_MASTER_DEV the answer is always false.
 */
#if defined(CONFIG_NET_L3_MASTER_DEV)
static inline bool ipv6_l3mdev_skb(__u16 flags)
{
        return flags & IP6SKB_L3SLAVE;
}
#else
static inline bool ipv6_l3mdev_skb(__u16 flags)
{
        return false;
}
#endif

/* View skb->cb as the IPv6 control block (IP6CB) or as a
 * struct ip6_mtuinfo (IP6CBMTU).
 */
#define IP6CB(skb)        ((struct inet6_skb_parm*)((skb)->cb))
#define IP6CBMTU(skb)        ((struct ip6_mtuinfo *)((skb)->cb))

/* Incoming interface index for @skb: skb->skb_iif when the control block
 * carries the L3-slave flag, otherwise the iif stored in the control block.
 */
static inline int inet6_iif(const struct sk_buff *skb)
{
        if (ipv6_l3mdev_skb(IP6CB(skb)->flags))
                return skb->skb_iif;

        return IP6CB(skb)->iif;
}

static inline bool inet6_is_jumbogram(const struct sk_buff *skb)
{
        return !!(IP6CB(skb)->flags & IP6SKB_JUMBOGRAM);
}

/* Return the iif stored in @skb's IPv6 control block when the skb is
 * flagged as L3-slave and CONFIG_NET_L3_MASTER_DEV is enabled; return 0
 * otherwise, including for a NULL @skb.
 *
 * Cannot be used in the TCP layer after tcp_v6_fill_cb.
 */
static inline int inet6_sdif(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
        if (skb && ipv6_l3mdev_skb(IP6CB(skb)->flags))
                return IP6CB(skb)->iif;
#endif
        return 0;
}

/* IPv6 TCP request sock: a plain wrapper around struct tcp_request_sock
 * with no additional members.
 */
struct tcp6_request_sock {
        struct tcp_request_sock          tcp6rsk_tcp;
};

struct ipv6_mc_socklist;
struct ipv6_ac_socklist;
struct ipv6_fl_socklist;

/* Corking state embedded in struct ipv6_pinfo as its cork member:
 * tx options pointer, hop limit and traffic class.
 */
struct inet6_cork {
        struct ipv6_txoptions *opt;
        u8 hop_limit;
        u8 tclass;
};

/* struct ipv6_pinfo - IPv6 private area of a socket.
 *
 * Embedded as the inet6 member of raw6_sock, udp6_sock and tcp6_sock and
 * reached through inet6_sk().
 */
struct ipv6_pinfo {
        struct in6_addr         saddr;
        struct in6_pktinfo        sticky_pktinfo;
        const struct in6_addr                *daddr_cache;
#ifdef CONFIG_IPV6_SUBTREES
        const struct in6_addr                *saddr_cache;
#endif

        __be32                        flow_label;
        __u32                        frag_size;

        s16                        hop_limit;
        u8                        mcast_hops;

        int                        ucast_oif;
        int                        mcast_oif;

        /* pktoption flags; 'all' gives access to every bit at once */
        union {
                struct {
                        __u16        srcrt:1,
                                osrcrt:1,
                                rxinfo:1,
                                rxoinfo:1,
                                rxhlim:1,
                                rxohlim:1,
                                hopopts:1,
                                ohopopts:1,
                                dstopts:1,
                                odstopts:1,
                                rxflow:1,
                                rxtclass:1,
                                rxpmtu:1,
                                rxorigdstaddr:1,
                                recvfragsize:1;
                                /* 1 bit hole */
                } bits;
                __u16                all;
        } rxopt;

        /* sockopt flags */
        __u8                        srcprefs;        /* 001: prefer temporary address
                                                 * 010: prefer public address
                                                 * 100: prefer care-of address
                                                 */
        __u8                        pmtudisc;
        __u8                        min_hopcount;
        __u8                        tclass;
        __be32                        rcv_flowinfo;

        __u32                        dst_cookie;

        struct ipv6_mc_socklist        __rcu *ipv6_mc_list;
        struct ipv6_ac_socklist        *ipv6_ac_list;
        struct ipv6_fl_socklist __rcu *ipv6_fl_list;

        struct ipv6_txoptions __rcu        *opt;
        struct sk_buff                *pktoptions;
        struct sk_buff                *rxpmtu;
        struct inet6_cork        cork;
};

/* Bit helpers for IPv6 socket flags. Each one prefixes @nr with
 * INET_FLAGS_ and applies the matching bit operation (test, set, clear,
 * assign) to inet_sk(sk)->inet_flags.
 *
 * We currently use available bits from inet_sk(sk)->inet_flags,
 * this could change in the future.
 */
#define inet6_test_bit(nr, sk)                        \
        test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet6_set_bit(nr, sk)                        \
        set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet6_clear_bit(nr, sk)                        \
        clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)
#define inet6_assign_bit(nr, sk, val)                \
        assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val)

/* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock! */
/* Raw IPv6 socket: inet_sock, raw-socket specific fields, then the IPv6
 * private area.
 */
struct raw6_sock {
        /* inet_sock has to be the first member of raw6_sock */
        struct inet_sock        inet;
        __u32                        checksum;        /* perform checksum */
        __u32                        offset;                /* checksum offset  */
        struct icmp6_filter        filter;
        __u32                        ip6mr_table;

        struct ipv6_pinfo        inet6;
};

/* UDP over IPv6 socket: a udp_sock followed by the IPv6 private area.
 * Member layout must not change (see the warning above raw6_sock).
 */
struct udp6_sock {
        struct udp_sock          udp;

        struct ipv6_pinfo inet6;
};

/* TCP over IPv6 socket: a tcp_sock followed by the IPv6 private area.
 * Member layout must not change (see the warning above raw6_sock).
 */
struct tcp6_sock {
        struct tcp_sock          tcp;

        struct ipv6_pinfo inet6;
};

extern int inet6_sk_rebuild_header(struct sock *sk);

/* IPv6 TCP timewait sock: a plain wrapper around struct tcp_timewait_sock
 * with no additional members.
 */
struct tcp6_timewait_sock {
        struct tcp_timewait_sock   tcp6tw_tcp;
};

#if IS_ENABLED(CONFIG_IPV6)
bool ipv6_mod_enabled(void);

/* Return the IPv6 private area of @__sk (its pinet6 pointer), or NULL
 * when sk_fullsock() says @__sk is not a full socket.
 */
static inline struct ipv6_pinfo *inet6_sk(const struct sock *__sk)
{
        if (!sk_fullsock(__sk))
                return NULL;

        return inet_sk(__sk)->pinet6;
}

/* Convert a sock pointer to its enclosing raw6_sock (via inet.sk),
 * using container_of_const().
 */
#define raw6_sk(ptr) container_of_const(ptr, struct raw6_sock, inet.sk)

/* Value of the socket's sk_ipv6only flag. The argument is parenthesized
 * so that expressions such as "p + 1" expand correctly.
 */
#define ipv6_only_sock(sk)        ((sk)->sk_ipv6only)
/* Non-zero when @sk is a PF_INET6 socket whose rxopt.bits.rxinfo flag is
 * set; the family test comes first, so inet6_sk() is only dereferenced
 * for IPv6 sockets.
 */
#define ipv6_sk_rxinfo(sk)        ((sk)->sk_family == PF_INET6 && \
                                 inet6_sk(sk)->rxopt.bits.rxinfo)

/* Address of @sk's sk_v6_rcv_saddr for AF_INET6 sockets, NULL for any
 * other family.
 */
static inline const struct in6_addr *inet6_rcv_saddr(const struct sock *sk)
{
        return sk->sk_family == AF_INET6 ? &sk->sk_v6_rcv_saddr : NULL;
}

/* Function form of ipv6_only_sock(): returns @sk's sk_ipv6only flag. */
static inline int inet_v6_ipv6only(const struct sock *sk)
{
        /* ipv6only field is at same position for timewait and other sockets */
        return ipv6_only_sock(sk);
}
#else
/* CONFIG_IPV6 disabled: these checks are constant 0. */
#define ipv6_only_sock(sk)        0
#define ipv6_sk_rxinfo(sk)        0

/* CONFIG_IPV6 disabled: stub that always returns false. */
static inline bool ipv6_mod_enabled(void)
{
        return false;
}

/* CONFIG_IPV6 disabled: stub that always returns NULL. */
static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
{
        return NULL;
}

/* CONFIG_IPV6 disabled: stub that always returns NULL. */
static inline struct raw6_sock *raw6_sk(const struct sock *sk)
{
        return NULL;
}

/* CONFIG_IPV6 disabled: constant replacements for the helpers above. */
#define inet6_rcv_saddr(__sk)        NULL
#define inet_v6_ipv6only(__sk)                0
#endif /* IS_ENABLED(CONFIG_IPV6) */
#endif /* _IPV6_H */






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

















































































    1 
    1 

    1 

























    1 
    1 
    1 









    1 




    1 

    1 






    1 













    1 







    1 












    1 





    1 




    1 












































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 



































































































































































































































    4 



    4 









    5 













































    4 



















    4 

    2 
    3 




















    5 












    1 






























    5 



    5 
    1 





    5 












    5 

    5 









    5 



    4 
    5 
    4 






    5 



    6 



    6 




















    4 









    1 


    2 















    3 



    4 
    6 


    6 



    4 






    1 



    4 







    2 






    3 




















































    5 













    1 









    1 










































































































































    5 
    6 































































































































































































































































































































































































    4 









    4 











    5 
























    5 







    3 

    6 





    3 

    6 

    5 


    4 






















    2 

    4 


    3 

    4 

    5 































    5 






    5 























    4 





















































    2 


    3 

    3 


















    4 





    4 




































































































































































































































    4 
    3 



















    3 






























    3 


































    3 












    2 







    3 
    1 





    3 























































































































































































































































































    3 


    4 

    4 


































    4 


    3 


















































































































































































































































    1 








    1 

    1 















    1 













    1 

    1 









    1 





    1 







    1 

















    1 



    1 





    1 



    1 
    1 




















    1 






    1 

































    1 




    1 


    1 





    1 


































    1 






























































































    1 


    1 















    1 



































































    1 




    1 

    1 











    1 





















































    1 


    1 

    1 
    1 




























































































































































































































































































    3 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
11112
11113
11114
11115
11116
11117
11118
11119
11120
11121
11122
11123
11124
11125
11126
11127
11128
11129
11130
11131
11132
11133
11134
11135
11136
11137
11138
11139
11140
11141
11142
11143
11144
11145
11146
11147
11148
11149
11150
11151
11152
11153
11154
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165
11166
11167
11168
11169
11170
11171
11172
11173
11174
11175
11176
11177
11178
11179
11180
11181
11182
11183
11184
11185
11186
11187
11188
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
11202
11203
11204
11205
11206
11207
11208
11209
11210
11211
11212
11213
11214
11215
11216
11217
11218
11219
11220
11221
11222
11223
11224
11225
11226
11227
11228
11229
11230
11231
11232
11233
11234
11235
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245
11246
11247
11248
11249
11250
11251
11252
11253
11254
11255
11256
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343
11344
11345
11346
11347
11348
11349
11350
11351
11352
11353
11354
11355
11356
11357
11358
11359
11360
11361
11362
11363
11364
11365
11366
11367
11368
11369
11370
11371
11372
11373
11374
11375
11376
11377
11378
11379
11380
11381
11382
11383
11384
11385
11386
11387
11388
11389
11390
11391
11392
11393
11394
11395
11396
11397
11398
11399
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409
11410
11411
11412
11413
11414
11415
11416
11417
11418
11419
11420
11421
11422
11423
11424
11425
11426
11427
11428
11429
11430
11431
11432
11433
11434
11435
11436
11437
11438
11439
11440
11441
11442
11443
11444
11445
11446
11447
11448
11449
11450
11451
11452
11453
11454
11455
11456
11457
11458
11459
11460
11461
11462
11463
11464
11465
11466
11467
11468
11469
11470
11471
11472
11473
11474
11475
11476
11477
11478
11479
11480
11481
11482
11483
11484
11485
11486
11487
11488
11489
11490
11491
11492
11493
11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510
11511
11512
11513
11514
11515
11516
11517
11518
11519
11520
11521
11522
11523
11524
11525
11526
11527
11528
11529
11530
11531
11532
11533
11534
11535
11536
11537
11538
11539
11540
11541
11542
11543
11544
11545
11546
11547
11548
11549
11550
11551
11552
11553
11554
11555
11556
11557
11558
11559
11560
11561
11562
11563
11564
11565
11566
11567
11568
11569
11570
11571
11572
11573
11574
11575
11576
11577
11578
11579
11580
11581
11582
11583
11584
11585
11586
11587
11588
11589
11590
11591
11592
11593
11594
11595
11596
11597
11598
11599
11600
11601
11602
11603
11604
11605
11606
11607
11608
11609
11610
11611
11612
11613
11614
11615
11616
11617
11618
11619
11620
11621
11622
11623
11624
11625
11626
11627
11628
11629
11630
11631
11632
11633
11634
11635
11636
11637
11638
11639
11640
11641
11642
11643
11644
11645
11646
11647
11648
11649
11650
11651
11652
11653
11654
11655
11656
11657
11658
11659
11660
11661
11662
11663
11664
11665
11666
11667
11668
11669
11670
11671
11672
11673
11674
11675
11676
11677
11678
11679
11680
11681
11682
11683
11684
11685
11686
11687
11688
11689
11690
11691
11692
11693
11694
11695
11696
11697
11698
11699
11700
11701
11702
11703
11704
11705
11706
11707
11708
11709
11710
11711
11712
11713
11714
11715
11716
11717
11718
11719
11720
11721
11722
11723
11724
11725
11726
11727
11728
11729
11730
11731
11732
11733
11734
11735
11736
11737
11738
11739
11740
11741
11742
11743
11744
11745
11746
11747
11748
11749
11750
11751
11752
11753
11754
11755
11756
11757
11758
11759
11760
11761
11762
11763
11764
11765
11766
11767
11768
11769
11770
11771
11772
11773
11774
11775
11776
11777
11778
11779
11780
11781
11782
11783
11784
11785
11786
11787
11788
11789
11790
11791
11792
11793
11794
11795
11796
11797
11798
11799
11800
11801
11802
11803
11804
11805
11806
11807
11808
11809
11810
11811
11812
11813
11814
11815
11816
11817
11818
11819
11820
11821
11822
11823
11824
11825
11826
11827
11828
11829
11830
11831
11832
11833
11834
11835
11836
11837
11838
11839
11840
11841
11842
11843
11844
11845
11846
11847
11848
11849
11850
11851
11852
11853
11854
11855
11856
11857
11858
11859
11860
11861
11862
11863
11864
11865
11866
11867
11868
11869
11870
11871
11872
11873
11874
11875
11876
11877
11878
11879
11880
11881
11882
11883
11884
11885
11886
11887
11888
11889
11890
11891
11892
11893
11894
11895
11896
11897
11898
11899
11900
11901
11902
11903
11904
11905
11906
11907
11908
11909
11910
11911
11912
11913
11914
11915
11916
11917
11918
11919
11920
11921
11922
11923
11924
11925
11926
11927
11928
11929
11930
11931
11932
11933
11934
11935
11936
11937
11938
11939
11940
11941
11942
11943
11944
11945
11946
11947
11948
11949
11950
11951
11952
11953
11954
11955
11956
11957
11958
11959
11960
11961
11962
11963
11964
11965
11966
11967
11968
11969
11970
11971
11972
11973
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985
11986
11987
11988
11989
11990
11991
11992
11993
11994
11995
11996
11997
11998
11999
12000
12001
12002
12003
12004
12005
12006
12007
12008
12009
12010
12011
12012
12013
12014
12015
12016
12017
12018
12019
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031
12032
12033
12034
12035
12036
12037
12038
12039
12040
12041
12042
12043
12044
12045
12046
12047
12048
12049
12050
12051
12052
12053
12054
12055
12056
12057
12058
12059
12060
12061
12062
12063
12064
12065
12066
12067
12068
12069
12070
12071
12072
12073
12074
12075
12076
12077
12078
12079
12080
12081
12082
12083
12084
12085
12086
12087
12088
12089
12090
12091
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *      NET3    Protocol independent device support routines.
 *
 *        Derived from the non IP parts of dev.c 1.0.19
 *              Authors:        Ross Biro
 *                                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                                Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *        Additional Authors:
 *                Florian la Roche <rzsfl@rz.uni-sb.de>
 *                Alan Cox <gw4pts@gw4pts.ampr.org>
 *                David Hinds <dahinds@users.sourceforge.net>
 *                Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *                Adam Sulmicki <adam@cfar.umd.edu>
 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *        Changes:
 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
 *                                      to 2 if register_netdev gets called
 *                                      before net_dev_init & also removed a
 *                                      few lines of code in the process.
 *                Alan Cox        :        device private ioctl copies fields back.
 *                Alan Cox        :        Transmit queue code does relevant
 *                                        stunts to keep the queue safe.
 *                Alan Cox        :        Fixed double lock.
 *                Alan Cox        :        Fixed promisc NULL pointer trap
 *                ????????        :        Support the full private ioctl range
 *                Alan Cox        :        Moved ioctl permission check into
 *                                        drivers
 *                Tim Kordas        :        SIOCADDMULTI/SIOCDELMULTI
 *                Alan Cox        :        100 backlog just doesn't cut it when
 *                                        you start doing multicast video 8)
 *                Alan Cox        :        Rewrote net_bh and list manager.
 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
 *                Alan Cox        :        Took out transmit every packet pass
 *                                        Saved a few bytes in the ioctl handler
 *                Alan Cox        :        Network driver sets packet type before
 *                                        calling netif_rx. Saves a function
 *                                        call a packet.
 *                Alan Cox        :        Hashed net_bh()
 *                Richard Kooijman:        Timestamp fixes.
 *                Alan Cox        :        Wrong field in SIOCGIFDSTADDR
 *                Alan Cox        :        Device lock protection.
 *              Alan Cox        :       Fixed nasty side effect of device close
 *                                        changes.
 *                Rudi Cilibrasi        :        Pass the right thing to
 *                                        set_mac_address()
 *                Dave Miller        :        32bit quantity for the device lock to
 *                                        make it work out on a Sparc.
 *                Bjorn Ekwall        :        Added KERNELD hack.
 *                Alan Cox        :        Cleaned up the backlog initialise.
 *                Craig Metz        :        SIOCGIFCONF fix if space for under
 *                                        1 device.
 *            Thomas Bogendoerfer :        Return ENODEV for dev_open, if there
 *                                        is no device open function.
 *                Andi Kleen        :        Fix error reporting for SIOCGIFCONF
 *            Michael Chastain        :        Fix signed/unsigned for SIOCGIFCONF
 *                Cyrus Durgin        :        Cleaned for KMOD
 *                Adam Sulmicki   :        Bug Fix : Network Device Unload
 *                                        A network device unload needs to purge
 *                                        the backlog queue.
 *        Paul Rusty Russell        :        SIOCSIFNAME
 *              Pekka Riikonen  :        Netdev boot-time settings code
 *              Andrew Morton   :       Make unregister_netdevice wait
 *                                      indefinitely on dev->refcnt
 *              J Hadi Salim    :       - Backlog queue sampling
 *                                        - netif_rx() feedback
 */

#include <linux/uaccess.h>
#include <linux/bitmap.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/isolation.h>
#include <linux/sched/mm.h>
#include <linux/smpboot.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/skbuff.h>
#include <linux/kthread.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dsa.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/gro.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <net/tcx.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <trace/events/qdisc.h>
#include <trace/events/xdp.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_netdev.h>
#include <linux/crash_dump.h>
#include <linux/sctp.h>
#include <net/udp_tunnel.h>
#include <linux/net_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <net/devlink.h>
#include <linux/pm_runtime.h>
#include <linux/prandom.h>
#include <linux/once_lite.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/types.h>
#include <net/page_pool/helpers.h>
#include <net/rps.h>

#include "dev.h"
#include "net-sysfs.h"

/* Serializes modifications of the packet-type lists; taken by
 * dev_add_pack() and __dev_remove_pack().
 */
static DEFINE_SPINLOCK(ptype_lock);
/* Hash buckets for protocol handlers that are not bound to a device;
 * ptype_head() indexes it with ntohs(type) & PTYPE_HASH_MASK.
 */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;

/* Forward declarations of static helpers whose bodies are not in this
 * part of the file.
 */
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_extack(unsigned long val,
                                           struct net_device *dev,
                                           struct netlink_ext_ack *extack);

/* NOTE(review): presumably guards updates of a device's ifalias string;
 * no user is visible in this part of the file - confirm.
 */
static DEFINE_MUTEX(ifalias_mutex);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

/* Starts at NR_CPUS; how ids are handed out is not visible here. */
static unsigned int napi_gen_id = NR_CPUS;
/* Read-mostly hash table declared with 8 hash bits. */
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

/* NOTE(review): by its name this serializes device renames; no user is
 * visible in this part of the file - confirm.
 */
static DECLARE_RWSEM(devnet_rename_sem);

/* Bump net->dev_base_seq by one. If the increment wraps to 0 the counter
 * is set to 1 instead, so this helper never stores 0.
 */
static inline void dev_base_seq_inc(struct net *net)
{
        unsigned int next = net->dev_base_seq + 1;

        if (!next)
                next = 1;

        WRITE_ONCE(net->dev_base_seq, next);
}

/* Return the bucket of net->dev_name_head that @name hashes to. At most
 * IFNAMSIZ bytes of @name take part in the hash.
 */
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
        unsigned int hash;

        hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
        hash = hash_32(hash, NETDEV_HASHBITS);

        return &net->dev_name_head[hash];
}

/* Return the bucket of net->dev_index_head for @ifindex: the index is
 * simply masked with NETDEV_HASHENTRIES - 1.
 */
static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
        int slot = ifindex & (NETDEV_HASHENTRIES - 1);

        return &net->dev_index_head[slot];
}

#ifndef CONFIG_PREEMPT_RT

/* Static key, off by default; switched on by the "thread_backlog_napi"
 * early parameter registered below.
 */
static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);

/* Early-param handler: enables the key whenever the parameter is present.
 * @arg is not examined. Always returns 0.
 */
static int __init setup_backlog_napi_threads(char *arg)
{
        static_branch_enable(&use_backlog_threads_key);
        return 0;
}
early_param("thread_backlog_napi", setup_backlog_napi_threads);

/* Report whether use_backlog_threads_key has been enabled. */
static bool use_backlog_threads(void)
{
        return static_branch_unlikely(&use_backlog_threads_key);
}

#else

/* With CONFIG_PREEMPT_RT the answer is unconditionally true. */
static bool use_backlog_threads(void)
{
        return true;
}

#endif

/* Enter the backlog critical section, saving the interrupt state in @flags.
 * Without CONFIG_RPS and without backlog threads only local interrupts are
 * disabled; otherwise sd->input_pkt_queue.lock is taken as well.
 */
static inline void backlog_lock_irq_save(struct softnet_data *sd,
                                         unsigned long *flags)
{
        if (!IS_ENABLED(CONFIG_RPS) && !use_backlog_threads()) {
                local_irq_save(*flags);
                return;
        }

        spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
}

/* Enter the backlog critical section with interrupts disabled.
 * Without CONFIG_RPS and without backlog threads only local interrupts are
 * disabled; otherwise sd->input_pkt_queue.lock is taken as well.
 */
static inline void backlog_lock_irq_disable(struct softnet_data *sd)
{
        if (!IS_ENABLED(CONFIG_RPS) && !use_backlog_threads()) {
                local_irq_disable();
                return;
        }

        spin_lock_irq(&sd->input_pkt_queue.lock);
}

/* Leave the backlog critical section and restore the interrupt state from
 * @flags. Mirrors backlog_lock_irq_save(): the spinlock is only released
 * when CONFIG_RPS is enabled or backlog threads are in use.
 */
static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
                                              unsigned long *flags)
{
        if (!IS_ENABLED(CONFIG_RPS) && !use_backlog_threads()) {
                local_irq_restore(*flags);
                return;
        }

        spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
}

/* Leave the backlog critical section and re-enable interrupts.
 * Mirrors backlog_lock_irq_disable(): the spinlock is only released when
 * CONFIG_RPS is enabled or backlog threads are in use.
 */
static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
{
        if (!IS_ENABLED(CONFIG_RPS) && !use_backlog_threads()) {
                local_irq_enable();
                return;
        }

        spin_unlock_irq(&sd->input_pkt_queue.lock);
}

/* Allocate a name node for @dev that refers to @name.
 *
 * The string is not copied: the node only stores the @name pointer.
 * Only the hlist linkage, ->dev and ->name are initialised here.
 * Returns the new node, or NULL if the allocation fails.
 */
static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
                                                       const char *name)
{
        struct netdev_name_node *node = kmalloc(sizeof(*node), GFP_KERNEL);

        if (node) {
                INIT_HLIST_NODE(&node->hlist);
                node->dev = dev;
                node->name = name;
        }

        return node;
}

/* Allocate the name node for the primary name (dev->name) of @dev and
 * initialise its ->list as an empty list head.
 * Returns the new node, or NULL if the allocation fails.
 */
static struct netdev_name_node *
netdev_name_node_head_alloc(struct net_device *dev)
{
        struct netdev_name_node *head = netdev_name_node_alloc(dev, dev->name);

        if (head)
                INIT_LIST_HEAD(&head->list);

        return head;
}

/* Release the memory of @name_node itself. The ->name string is not
 * freed here.
 */
static void netdev_name_node_free(struct netdev_name_node *name_node)
{
        kfree(name_node);
}

/* Publish @name_node in the name hash of @net, in the bucket selected by
 * the node's own name.
 */
static void netdev_name_node_add(struct net *net,
                                 struct netdev_name_node *name_node)
{
        struct hlist_head *bucket = dev_name_hash(net, name_node->name);

        hlist_add_head_rcu(&name_node->hlist, bucket);
}

/* Unlink @name_node from the name hash using the RCU-aware hlist removal.
 * The node is neither taken off its per-device ->list nor freed here.
 */
static void netdev_name_node_del(struct netdev_name_node *name_node)
{
        hlist_del_rcu(&name_node->hlist);
}

static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
                                                        const char *name)
{
        struct hlist_head *head = dev_name_hash(net, name);
        struct netdev_name_node *name_node;

        hlist_for_each_entry(name_node, head, hlist)
                if (!strcmp(name_node->name, name))
                        return name_node;
        return NULL;
}

static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
                                                            const char *name)
{
        struct hlist_head *head = dev_name_hash(net, name);
        struct netdev_name_node *name_node;

        hlist_for_each_entry_rcu(name_node, head, hlist)
                if (!strcmp(name_node->name, name))
                        return name_node;
        return NULL;
}

bool netdev_name_in_use(struct net *net, const char *name)
{
        return netdev_name_node_lookup(net, name);
}
EXPORT_SYMBOL(netdev_name_in_use);

/* Register @name as an additional name of @dev.
 *
 * The new node keeps the @name pointer rather than a copy of the string.
 * Returns 0 on success, -EEXIST if the name is already present in the
 * device's namespace, or -ENOMEM if the node cannot be allocated.
 */
int netdev_name_node_alt_create(struct net_device *dev, const char *name)
{
        struct net *net = dev_net(dev);
        struct netdev_name_node *alt;

        if (netdev_name_node_lookup(net, name))
                return -EEXIST;

        alt = netdev_name_node_alloc(dev, name);
        if (!alt)
                return -ENOMEM;

        netdev_name_node_add(net, alt);
        /* dev->name_node (the primary name) doubles as the head of the
         * per-device list of alternative names.
         */
        list_add_tail_rcu(&alt->list, &dev->name_node->list);

        return 0;
}

/* Free an alternative-name node together with its name string.
 * @head is the rcu_head embedded in the node, so this can be used as a
 * call_rcu() callback.
 */
static void netdev_name_node_alt_free(struct rcu_head *head)
{
        struct netdev_name_node *alt;

        alt = container_of(head, struct netdev_name_node, rcu);

        kfree(alt->name);
        netdev_name_node_free(alt);
}

/* Tear down one alternative-name node: remove it from the name hash and
 * from the per-device list, then hand it to call_rcu() so that
 * netdev_name_node_alt_free() releases it later.
 */
static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
{
        netdev_name_node_del(name_node);
        list_del(&name_node->list);
        call_rcu(&name_node->rcu, netdev_name_node_alt_free);
}

/* Remove the alternative name @name from @dev.
 *
 * Returns 0 on success, -ENOENT if no such name exists in the device's
 * namespace, or -EINVAL if the name is the device's primary name or
 * belongs to a different device.
 */
int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
{
        struct netdev_name_node *alt;

        alt = netdev_name_node_lookup(dev_net(dev), name);
        if (!alt)
                return -ENOENT;

        /* The lookup is namespace wide: it may have hit our own primary
         * name or a name owned by some other device.
         */
        if (alt == dev->name_node || alt->dev != dev)
                return -EINVAL;

        __netdev_name_node_alt_destroy(alt);

        return 0;
}

/* Drop every alternative name still on @dev's list.
 *
 * Each node is taken off the per-device list and freed immediately (the
 * free routine is called directly, not through call_rcu()). The nodes are
 * not removed from the name hash here.
 */
static void netdev_name_node_alt_flush(struct net_device *dev)
{
        struct list_head *alt_names = &dev->name_node->list;
        struct netdev_name_node *alt, *next;

        list_for_each_entry_safe(alt, next, alt_names, list) {
                list_del(&alt->list);
                netdev_name_node_alt_free(&alt->rcu);
        }
}

/* Device list insertion
 *
 * Makes @dev reachable in its namespace: appends it to the device list,
 * hashes its primary and alternative names, hashes its ifindex and stores
 * it in the dev_by_index xarray. Must be called with RTNL held.
 */
static void list_netdevice(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct netdev_name_node *alt;
        void *prev;

        ASSERT_RTNL();

        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
        netdev_name_node_add(net, dev->name_node);
        hlist_add_head_rcu(&dev->index_hlist,
                           dev_index_hash(net, dev->ifindex));

        netdev_for_each_altname(dev, alt)
                netdev_name_node_add(net, alt);

        /* The ifindex slot was reserved earlier, so the store is expected
         * to return NULL; anything else triggers a warning.
         */
        prev = xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL);
        WARN_ON(prev);

        dev_base_seq_inc(net);
}

/* Device list removal
 *
 * Undoes list_netdevice(): erases @dev from the dev_by_index xarray, the
 * name hash (primary and alternative names), the device list and the
 * ifindex hash. Must be called with RTNL held.
 *
 * The caller must respect an RCU grace period before freeing or reusing
 * @dev.
 */
static void unlist_netdevice(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct netdev_name_node *alt;

        ASSERT_RTNL();

        xa_erase(&net->dev_by_index, dev->ifindex);

        netdev_for_each_altname(dev, alt)
                netdev_name_node_del(alt);

        /* Take the device itself off the chain and out of both hashes */
        list_del_rcu(&dev->dev_list);
        netdev_name_node_del(dev->name_node);
        hlist_del_rcu(&dev->index_hlist);

        dev_base_seq_inc(net);
}

/*
 *        Our notifier list: head of the raw notifier chain for network
 *        device events.
 *        NOTE(review): nothing in this part of the file shows how users of
 *        the chain are serialized - presumably via RTNL; confirm.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *        Device drivers call our routines to queue packets here. We empty the
 *        queue in the local softnet handler.
 *
 *        One aligned instance exists per CPU; only the
 *        process_queue_bh_lock member is given an explicit initializer
 *        here. The per-CPU symbol is exported.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = {
        .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock),
};
EXPORT_PER_CPU_SYMBOL(softnet_data);

/* Page_pool has a lockless array/stack to alloc/recycle pages.
 * PP consumers must pay attention to run APIs in the appropriate context
 * (e.g. NAPI context).
 *
 * One page_pool pointer per CPU; the code that creates and assigns the
 * pools is not in this part of the file.
 */
static DEFINE_PER_CPU(struct page_pool *, system_page_pool);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel tables: the index
 * that netdev_lock_pos() finds in the first one selects the lockdep class
 * name in the second one and the key in the two key arrays below, so both
 * tables must list the types in the same order. The last entry
 * (ARPHRD_NONE / "_xmit_NONE") is the fallback for unlisted types.
 */
static const unsigned short netdev_lock_type[] = {
         ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lockdep class names, one per entry of netdev_lock_type[] above. */
static const char *const netdev_lock_name[] = {
        "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
        "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
        "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
        "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
        "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
        "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
        "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
        "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
        "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
        "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
        "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
        "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
        "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
        "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
        "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lockdep key per device type, kept separately for the xmit locks and
 * for the address-list locks.
 */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

/* Map a device type to its index in netdev_lock_type[]. Types that are
 * not listed map to the last entry of the table.
 */
static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
        /* the last key is used by default */
        unsigned short pos = ARRAY_SIZE(netdev_lock_type) - 1;
        int idx;

        for (idx = 0; idx < ARRAY_SIZE(netdev_lock_type); idx++) {
                if (netdev_lock_type[idx] == dev_type) {
                        pos = idx;
                        break;
                }
        }

        return pos;
}

/* Give @lock the xmit lockdep class and name that belong to @dev_type. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
        int pos = netdev_lock_pos(dev_type);

        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[pos],
                                   netdev_lock_name[pos]);
}

/* Give dev->addr_list_lock the address-lock lockdep class that belongs to
 * dev->type. The class name is shared with the xmit lock table.
 */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
        int pos = netdev_lock_pos(dev->type);

        lockdep_set_class_and_name(&dev->addr_list_lock,
                                   &netdev_addr_lock_key[pos],
                                   netdev_lock_name[pos]);
}
#else
/* Without CONFIG_LOCKDEP there is no lock class to set: no-op. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}

/* Without CONFIG_LOCKDEP there is no lock class to set: no-op. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************
 *
 *                Protocol management and registration routines
 *
 *******************************************************************************/


/*
 *        Add a protocol ID to the list. Now that the input handler is
 *        smarter we can dispense with all the messy stuff that used to be
 *        here.
 *
 *        BEWARE!!! Protocol handlers, mangling input packets,
 *        MUST BE last in hash buckets and checking protocol handlers
 *        MUST start from promiscuous ptype_all chain in net_bh.
 *        It is true now, do not change it.
 *        Explanation follows: if protocol handler, mangling packet, will
 *        be the first on list, it is not able to sense, that packet
 *        is cloned and should be copied-on-write, so that it will
 *        change it and subsequent readers will get broken packet.
 *                                                        --ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
        if (pt->type == htons(ETH_P_ALL))
                return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
        else
                return pt->dev ? &pt->dev->ptype_specific :
                                 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *        dev_add_pack - add packet handler
 *        @pt: packet type declaration
 *
 *        Register a protocol handler with the networking stack. @pt itself
 *        is linked into the kernel lists, so it must stay allocated until
 *        it has been removed from them again.
 *
 *        This function does not sleep. As a consequence, a CPU that is in
 *        the middle of receiving a packet is not guaranteed to see the new
 *        packet type until the next packet it receives.
 */

void dev_add_pack(struct packet_type *pt)
{
        struct list_head *chain = ptype_head(pt);

        spin_lock(&ptype_lock);
        list_add_rcu(&pt->list, chain);
        spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *        __dev_remove_pack         - remove packet handler
 *        @pt: packet type declaration
 *
 *        Remove a protocol handler that was previously added to the kernel
 *        protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *        from the kernel lists and can be freed or reused once this function
 *        returns.
 *
 *      The packet type might still be in use by receivers
 *        and must not be freed until after all the CPU's have gone
 *        through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
        struct list_head *head = ptype_head(pt);
        struct packet_type *pt1;

        spin_lock(&ptype_lock);

        list_for_each_entry(pt1, head, list) {
                if (pt == pt1) {
                        list_del_rcu(&pt->list);
                        goto out;
                }
        }

        pr_warn("dev_remove_pack: %p not found\n", pt);
out:
        spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *        dev_remove_pack         - remove packet handler
 *        @pt: packet type declaration
 *
 *        Remove a protocol handler that was previously added to the kernel
 *        protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *        from the kernel lists and can be freed or reused once this function
 *        returns.
 *
 *        This call sleeps to guarantee that no CPU is looking at the packet
 *        type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
        /* Unlink first ... */
        __dev_remove_pack(pt);

        /* ... then wait, so no receiver can still be using @pt on return. */
        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);


/*******************************************************************************
 *
 *                            Device Interface Subroutines
 *
 *******************************************************************************/

/**
 *        dev_get_iflink        - get 'iflink' value of a interface
 *        @dev: targeted interface
 *
 *        Indicates the ifindex the interface is linked to.
 *        Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
        if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
                return dev->netdev_ops->ndo_get_iflink(dev);

        return READ_ONCE(dev->ifindex);
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 *        dev_fill_metadata_dst - Retrieve tunnel egress information.
 *        @dev: targeted interface
 *        @skb: The packet.
 *
 *        For better visibility of tunnel traffic OVS needs to retrieve
 *        egress tunnel information for a packet. Following API allows
 *        user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
        struct ip_tunnel_info *info;

        if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
                return -EINVAL;

        info = skb_tunnel_info_unclone(skb);
        if (!info)
                return -ENOMEM;
        if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
                return -EINVAL;

        return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

/* Claim the next free entry of @stack.
 *
 * The path counter is bumped unconditionally; when the claimed index lies
 * beyond NET_DEVICE_PATH_STACK_MAX a one-time warning is emitted and NULL
 * is returned.
 */
static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
{
        int slot = stack->num_paths;

        stack->num_paths++;

        if (WARN_ON_ONCE(slot >= NET_DEVICE_PATH_STACK_MAX))
                return NULL;

        return stack->path + slot;
}

int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
                          struct net_device_path_stack *stack)
{
        const struct net_device *last_dev;
        struct net_device_path_ctx ctx = {
                .dev        = dev,
        };
        struct net_device_path *path;
        int ret = 0;

        memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
        stack->num_paths = 0;
        while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
                last_dev = ctx.dev;
                path = dev_fwd_path(stack);
                if (!path)
                        return -1;

                memset(path, 0, sizeof(struct net_device_path));
                ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
                if (ret < 0)
                        return -1;

                if (WARN_ON_ONCE(last_dev == ctx.dev))
                        return -1;
        }

        if (!ctx.dev)
                return ret;

        path = dev_fwd_path(stack);
        if (!path)
                return -1;
        path->type = DEV_PATH_ETHERNET;
        path->dev = ctx.dev;

        return ret;
}
EXPORT_SYMBOL_GPL(dev_fill_forward_path);

/**
 *        __dev_get_by_name        - find a device by its name
 *        @net: the applicable net namespace
 *        @name: name to find
 *
 *        Look up an interface by name. Must be called under RTNL semaphore.
 *        Returns a pointer to the device when the name is known, %NULL
 *        otherwise. No reference is taken on the returned device, so the
 *        caller must be careful with locks.
 */
struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
        struct netdev_name_node *node = netdev_name_node_lookup(net, name);

        if (!node)
                return NULL;

        return node->dev;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 * dev_get_by_name_rcu        - find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 *
 * Look up an interface by name. The caller must hold RCU lock.
 * Returns a pointer to the device when the name is known, %NULL
 * otherwise. No reference is taken on the returned device, so the
 * caller must be careful with locks.
 */
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
        struct netdev_name_node *node = netdev_name_node_lookup_rcu(net, name);

        if (!node)
                return NULL;

        return node->dev;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/* Look up a device by name and take a reference on it.
 *
 * The lookup and the dev_hold() are done inside one RCU read-side critical
 * section. Returns whatever dev_get_by_name_rcu() returned, i.e. %NULL when
 * no device matched.
 *
 * Deprecated for new users, call netdev_get_by_name() instead
 */
struct net_device *dev_get_by_name(struct net *net, const char *name)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, name);
        /* NOTE(review): dev may be NULL here; this relies on dev_hold()
         * tolerating a NULL argument - confirm against its definition.
         */
        dev_hold(dev);
        rcu_read_unlock();
        return dev;
}
EXPORT_SYMBOL(dev_get_by_name);

/**
 *        netdev_get_by_name() - find a device by its name
 *        @net: the applicable net namespace
 *        @name: name to find
 *        @tracker: tracking object for the acquired reference
 *        @gfp: allocation flags for the tracker
 *
 *        Look up an interface by name. This can be called from any
 *        context and does its own locking. On success the device is
 *        returned with its usage count incremented and the reference
 *        registered with @tracker; the caller must use netdev_put() to
 *        release it when it is no longer needed. %NULL is returned if no
 *        matching device is found.
 */
struct net_device *netdev_get_by_name(struct net *net, const char *name,
                                      netdevice_tracker *tracker, gfp_t gfp)
{
        struct net_device *dev = dev_get_by_name(net, name);

        if (!dev)
                return NULL;

        netdev_tracker_alloc(dev, tracker, gfp);
        return dev;
}
EXPORT_SYMBOL(netdev_get_by_name);

/**
 *        __dev_get_by_index - find a device by its ifindex
 *        @net: the applicable net namespace
 *        @ifindex: index of device
 *
 *        Search for an interface by index. Returns %NULL if the device
 *        is not found or a pointer to the device. The device has not
 *        had its reference counter increased so the caller must be careful
 *        about locking. The caller must hold the RTNL semaphore.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry(dev, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *        dev_get_by_index_rcu - find a device by its ifindex
 *        @net: the applicable net namespace
 *        @ifindex: index of device
 *
 *        Search for an interface by index. Returns %NULL if the device
 *        is not found or a pointer to the device. The device has not
 *        had its reference counter increased so the caller must be careful
 *        about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry_rcu(dev, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);

/* Look up a device by ifindex and take a reference on it.
 *
 * The lookup and the dev_hold() are done inside one RCU read-side critical
 * section. Returns whatever dev_get_by_index_rcu() returned, i.e. %NULL when
 * no device matched.
 *
 * Deprecated for new users, call netdev_get_by_index() instead
 */
struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        /* NOTE(review): dev may be NULL here; this relies on dev_hold()
         * tolerating a NULL argument - confirm against its definition.
         */
        dev_hold(dev);
        rcu_read_unlock();
        return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *        netdev_get_by_index() - find a device by its ifindex
 *        @net: the applicable net namespace
 *        @ifindex: index of device
 *        @tracker: tracking object for the acquired reference
 *        @gfp: allocation flags for the tracker
 *
 *        Search for an interface by index. Returns NULL if the device
 *        is not found or a pointer to the device. The device returned has
 *        had a reference added (registered with @tracker) and the pointer
 *        is safe until the user calls netdev_put() to indicate they have
 *        finished with it.
 */
struct net_device *netdev_get_by_index(struct net *net, int ifindex,
                                       netdevice_tracker *tracker, gfp_t gfp)
{
        struct net_device *dev = dev_get_by_index(net, ifindex);

        if (!dev)
                return NULL;

        netdev_tracker_alloc(dev, tracker, gfp);
        return dev;
}
EXPORT_SYMBOL(netdev_get_by_index);

/**
 *        dev_get_by_napi_id - find a device by napi_id
 *        @napi_id: ID of the NAPI struct
 *
 *        Search for an interface by NAPI ID. Returns %NULL if @napi_id is
 *        below MIN_NAPI_ID or no NAPI instance has that ID, otherwise the
 *        device owning the instance. The device has not had its reference
 *        counter increased so the caller must be careful about locking.
 *        The caller must hold RCU lock (a one-time warning fires otherwise).
 */
struct net_device *dev_get_by_napi_id(unsigned int napi_id)
{
        struct napi_struct *napi;

        WARN_ON_ONCE(!rcu_read_lock_held());

        if (napi_id < MIN_NAPI_ID)
                return NULL;

        napi = napi_by_id(napi_id);
        if (!napi)
                return NULL;

        return napi->dev;
}
EXPORT_SYMBOL(dev_get_by_napi_id);

/* Sequence lock covering updates of dev->name: dev_change_name() holds it
 * for writing around name changes, netdev_copy_name() uses it on the read
 * side to obtain a consistent copy of the name.
 */
static DEFINE_SEQLOCK(netdev_rename_lock);

/* Copy the name of @dev into @name (at most IFNAMSIZ bytes, see strscpy()).
 *
 * The copy is repeated for as long as netdev_rename_lock reports that a
 * writer was active while it was taken.
 */
void netdev_copy_name(struct net_device *dev, char *name)
{
        for (;;) {
                unsigned int seq = read_seqbegin(&netdev_rename_lock);

                strscpy(name, dev->name, IFNAMSIZ);

                if (!read_seqretry(&netdev_rename_lock, seq))
                        break;
        }
}

/**
 *        netdev_get_name - get a netdevice name, knowing its ifindex.
 *        @net: network namespace
 *        @name: a pointer to the buffer where the name will be stored.
 *        @ifindex: the ifindex of the interface to get the name from.
 *
 *        The lookup and the copy are done under rcu_read_lock().
 *        Returns 0 on success or -ENODEV when no device in @net has
 *        @ifindex; @name is left untouched in that case.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
        struct net_device *dev;
        int ret = -ENODEV;

        rcu_read_lock();

        dev = dev_get_by_index_rcu(net, ifindex);
        if (dev) {
                netdev_copy_name(dev, name);
                ret = 0;
        }

        rcu_read_unlock();

        return ret;
}

/**
 *        dev_getbyhwaddr_rcu - find a device by its hardware address
 *        @net: the applicable net namespace
 *        @type: media type of device
 *        @ha: hardware address
 *
 *        Search for an interface by MAC address. Returns NULL if the device
 *        is not found or a pointer to the device.
 *        The caller must hold RCU or RTNL.
 *        The returned device has not had its ref count increased
 *        and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
                                       const char *ha)
{
        struct net_device *dev;

        for_each_netdev_rcu(net, dev)
                if (dev->type == type &&
                    !memcmp(dev->dev_addr, ha, dev->addr_len))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
        struct net_device *dev, *ret = NULL;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev)
                if (dev->type == type) {
                        dev_hold(dev);
                        ret = dev;
                        break;
                }
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *        __dev_get_by_flags - find any device with given flags
 *        @net: the applicable net namespace
 *        @if_flags: IFF_* values
 *        @mask: bitmask of bits in if_flags to check
 *
 *        Search for any interface whose flags agree with @if_flags on every
 *        bit set in @mask. Returns NULL if a device is not found or a
 *        pointer to the first such device. Must be called inside
 *        rtnl_lock(), and result refcount is unchanged.
 */
struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
                                      unsigned short mask)
{
        struct net_device *dev;

        ASSERT_RTNL();

        for_each_netdev(net, dev) {
                /* XOR leaves a bit set wherever the two flag words differ. */
                if (!((dev->flags ^ if_flags) & mask))
                        return dev;
        }

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 *        dev_valid_name - check if name is okay for network device
 *        @name: name string
 *
 *        Network device names need to be valid file names to
 *        allow sysfs to work.  We also disallow any kind of
 *        whitespace.
 *
 *        Rejected are: the empty string, names that do not terminate
 *        within IFNAMSIZ bytes, "." and "..", and names containing
 *        '/', ':' or a whitespace character.
 */
bool dev_valid_name(const char *name)
{
        const char *p;

        if (name[0] == '\0')
                return false;

        /* No NUL within IFNAMSIZ bytes means the name is too long. */
        if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
                return false;

        if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
                return false;

        for (p = name; *p; p++) {
                if (*p == '/' || *p == ':' || isspace(*p))
                        return false;
        }

        return true;
}
EXPORT_SYMBOL(dev_valid_name);

/* Mark the unit number used by @devname in @inuse.
 *
 * @devname only counts when it parses against the template @fmt, the parsed
 * unit lies in [0, @max_units) and printing that unit through @fmt gives
 * back exactly @devname.
 */
static void dev_alloc_name_mark_unit(const char *devname, const char *fmt,
                                     unsigned long *inuse, int max_units)
{
        char rendered[IFNAMSIZ];
        int unit = 0;

        if (!sscanf(devname, fmt, &unit))
                return;
        if (unit < 0 || unit >= max_units)
                return;

        /* avoid cases where sscanf is not exact inverse of printf */
        snprintf(rendered, IFNAMSIZ, fmt, unit);
        if (!strncmp(rendered, devname, IFNAMSIZ))
                __set_bit(unit, inuse);
}

/**
 *        __dev_alloc_name - allocate a name for a device
 *        @net: network namespace to allocate the device name in
 *        @name: name format string
 *        @res: result name string
 *
 *        Passed a format string - eg "lt%d" it will try and find a suitable
 *        id. It scans the names and alternative names of all devices in
 *        @net to build up a free map, then chooses the first empty slot.
 *        The caller must hold the dev_base or rtnl lock while allocating
 *        the name and adding the device in order to avoid duplicates.
 *        Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *
 *        Returns the number of the unit assigned or a negative errno code:
 *        -EINVAL if @name is not a template with exactly one "%d" and no
 *        other '%', -ENOMEM if the free map cannot be allocated, -ENFILE
 *        if every unit is taken.
 */
static int __dev_alloc_name(struct net *net, const char *name, char *res)
{
        const int max_netdevices = 8*PAGE_SIZE;
        struct net_device *d;
        unsigned long *inuse;
        char buf[IFNAMSIZ];
        const char *pct;
        int unit;

        /* Verify the string as this thing may have come from the user.
         * There must be one "%d" and no other "%" characters.
         */
        pct = strchr(name, '%');
        if (!pct)
                return -EINVAL;
        if (pct[1] != 'd' || strchr(pct + 2, '%'))
                return -EINVAL;

        /* Use one page as a bit array of possible slots */
        inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
        if (!inuse)
                return -ENOMEM;

        for_each_netdev(net, d) {
                struct netdev_name_node *alt;

                netdev_for_each_altname(d, alt) {
                        dev_alloc_name_mark_unit(alt->name, name, inuse,
                                                 max_netdevices);
                }
                dev_alloc_name_mark_unit(d->name, name, inuse, max_netdevices);
        }

        unit = find_first_zero_bit(inuse, max_netdevices);
        bitmap_free(inuse);
        if (unit == max_netdevices)
                return -ENFILE;

        /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
        strscpy(buf, name, IFNAMSIZ);
        snprintf(res, IFNAMSIZ, buf, unit);

        return unit;
}

/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
static int dev_prep_valid_name(struct net *net, struct net_device *dev,
                               const char *want_name, char *out_name,
                               int dup_errno)
{
        if (!dev_valid_name(want_name))
                return -EINVAL;

        if (strchr(want_name, '%'))
                return __dev_alloc_name(net, want_name, out_name);

        if (netdev_name_in_use(net, want_name))
                return -dup_errno;
        if (out_name != want_name)
                strscpy(out_name, want_name, IFNAMSIZ);
        return 0;
}

/**
 *        dev_alloc_name - allocate a name for a device
 *        @dev: device
 *        @name: name format string
 *
 *        Passed a format string - eg "lt%d" it will try and find a suitable
 *        id. It scans list of devices to build up a free map, then chooses
 *        the first empty slot and writes the result to @dev->name. The
 *        caller must hold the dev_base or rtnl lock while allocating the
 *        name and adding the device in order to avoid duplicates.
 *        Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *        Returns the number of the unit assigned or a negative errno code
 *        (-ENFILE when a plain @name is already in use).
 */
int dev_alloc_name(struct net_device *dev, const char *name)
{
        struct net *net = dev_net(dev);

        return dev_prep_valid_name(net, dev, name, dev->name, ENFILE);
}
EXPORT_SYMBOL(dev_alloc_name);

/* Validate @name (plain or template) and store the result in @dev->name.
 *
 * Same as dev_prep_valid_name() with -EEXIST for duplicates, except that
 * every success is reported as 0 instead of the allocated unit number.
 */
static int dev_get_valid_name(struct net *net, struct net_device *dev,
                              const char *name)
{
        int ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);

        if (ret < 0)
                return ret;

        return 0;
}

/**
 *        dev_change_name - change name of a device
 *        @dev: device
 *        @newname: name (or format string) must be at least IFNAMSIZ
 *
 *        Change the name of a device. @newname may be a format string such
 *        as "eth%d" for wildcarding.
 *
 *        Must be called with RTNL held. The rename itself is serialized by
 *        devnet_rename_sem; writes to @dev->name are additionally wrapped
 *        in netdev_rename_lock so that netdev_copy_name() readers get a
 *        consistent name.
 *
 *        If a NETDEV_CHANGENAME notifier rejects the new name, the old name
 *        is restored and the notifiers are called a second time; a failure
 *        of that second round is only logged.
 *
 *        Returns 0 on success (also when @newname equals the current name)
 *        or a negative errno.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        unsigned char old_assign_type;
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);

        down_write(&devnet_rename_sem);

        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
                up_write(&devnet_rename_sem);
                return 0;
        }

        memcpy(oldname, dev->name, IFNAMSIZ);

        /* Bottom halves are disabled while the seqlock is write-held: a
         * netdev_copy_name() reader running in softirq context on this CPU
         * would otherwise spin forever waiting for the writer it has
         * interrupted.
         */
        write_seqlock_bh(&netdev_rename_lock);
        err = dev_get_valid_name(net, dev, newname);
        write_sequnlock_bh(&netdev_rename_lock);

        if (err < 0) {
                up_write(&devnet_rename_sem);
                return err;
        }

        if (oldname[0] && !strchr(oldname, '%'))
                netdev_info(dev, "renamed from %s%s\n", oldname,
                            dev->flags & IFF_UP ? " (while UP)" : "");

        old_assign_type = dev->name_assign_type;
        WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);

rollback:
        /* Entered with devnet_rename_sem held, both on the forward path and
         * when undoing a rename that a notifier rejected.
         */
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                WRITE_ONCE(dev->name_assign_type, old_assign_type);
                up_write(&devnet_rename_sem);
                return ret;
        }

        up_write(&devnet_rename_sem);

        netdev_adjacent_rename_links(dev, oldname);

        netdev_name_node_del(dev->name_node);

        synchronize_net();

        netdev_name_node_add(net, dev->name_node);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        /* First failure: swap old and new name and redo the
                         * rename steps in the opposite direction.
                         */
                        err = ret;
                        down_write(&devnet_rename_sem);
                        write_seqlock_bh(&netdev_rename_lock);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        write_sequnlock_bh(&netdev_rename_lock);
                        memcpy(oldname, newname, IFNAMSIZ);
                        WRITE_ONCE(dev->name_assign_type, old_assign_type);
                        old_assign_type = NET_NAME_RENAMED;
                        goto rollback;
                } else {
                        netdev_err(dev, "name change rollback failed: %d\n",
                                   ret);
                }
        }

        return err;
}

/**
 *        dev_set_alias - change ifalias of a device
 *        @dev: device
 *        @alias: name up to IFALIASZ
 *        @len: limit of bytes to copy from info
 *
 *        Set ifalias for a device. A @len of zero removes the alias.
 *        The previous alias, if any, is released with kfree_rcu().
 *
 *        Returns @len on success, -EINVAL if @len is IFALIASZ or more,
 *        -ENOMEM if the new alias cannot be allocated.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
        struct dev_ifalias *replacement = NULL;
        struct dev_ifalias *stale;

        if (len >= IFALIASZ)
                return -EINVAL;

        if (len > 0) {
                /* Header plus @len bytes plus the terminating NUL. */
                replacement = kmalloc(sizeof(*replacement) + len + 1,
                                      GFP_KERNEL);
                if (!replacement)
                        return -ENOMEM;

                memcpy(replacement->ifalias, alias, len);
                replacement->ifalias[len] = 0;
        }

        mutex_lock(&ifalias_mutex);
        stale = rcu_replace_pointer(dev->ifalias, replacement,
                                    mutex_is_locked(&ifalias_mutex));
        mutex_unlock(&ifalias_mutex);

        if (stale)
                kfree_rcu(stale, rcuhead);

        return len;
}
EXPORT_SYMBOL(dev_set_alias);

/**
 *        dev_get_alias - get ifalias of a device
 *        @dev: device
 *        @name: buffer to store name of ifalias
 *        @len: size of buffer
 *
 *        get ifalias for a device.  Caller must make sure dev cannot go
 *        away,  e.g. rcu read lock or own a reference count to device.
 *
 *        Returns 0 when the device has no alias (@name is not written),
 *        otherwise the return value of snprintf() for the alias.
 */
int dev_get_alias(const struct net_device *dev, char *name, size_t len)
{
        const struct dev_ifalias *alias;
        int alias_len = 0;

        rcu_read_lock();

        alias = rcu_dereference(dev->ifalias);
        if (alias != NULL)
                alias_len = snprintf(name, len, "%s", alias->ifalias);

        rcu_read_unlock();

        return alias_len;
}

/**
 *        netdev_features_change - device changes features
 *        @dev: device to cause notification
 *
 *        Called to indicate a device has changed features. Raises
 *        NETDEV_FEAT_CHANGE on the netdevice notifier chain; the notifier
 *        result is not propagated to the caller.
 */
void netdev_features_change(struct net_device *dev)
{
        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *        netdev_state_change - device changes state
 *        @dev: device to cause notification
 *
 *        Called to indicate a device has changed state. This function calls
 *        the notifier chains for netdev_chain and sends a NEWLINK message
 *        to the routing socket. Nothing is done for a device that is not
 *        IFF_UP.
 */
void netdev_state_change(struct net_device *dev)
{
        struct netdev_notifier_change_info change_info = {
                .info.dev = dev,
        };

        if (!(dev->flags & IFF_UP))
                return;

        call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
        rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
}
EXPORT_SYMBOL(netdev_state_change);

/**
 * __netdev_notify_peers - notify network peers about existence of @dev,
 * to be called when rtnl lock is already held.
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 *
 * Raises NETDEV_NOTIFY_PEERS followed by NETDEV_RESEND_IGMP on the
 * netdevice notifier chain.
 */
void __netdev_notify_peers(struct net_device *dev)
{
        ASSERT_RTNL();
        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
        call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
}
EXPORT_SYMBOL(__netdev_notify_peers);

/**
 * netdev_notify_peers - notify network peers about existence of @dev
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 *
 * Takes the rtnl lock around __netdev_notify_peers(), so it must not be
 * called with that lock already held.
 */
void netdev_notify_peers(struct net_device *dev)
{
        rtnl_lock();
        __netdev_notify_peers(dev);
        rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

/* Forward declaration: napi_kthread_create() below passes this function to
 * kthread_run() as the thread function.
 */
static int napi_threaded_poll(void *data);

/* Start the polling kthread of NAPI instance @n.
 *
 * The thread runs napi_threaded_poll() with @n as argument and is named
 * "napi/<device name>-<napi id>". On success n->thread points to the new
 * task and 0 is returned; on failure n->thread is reset to NULL and the
 * error from kthread_run() is returned.
 */
static int napi_kthread_create(struct napi_struct *n)
{
        int err;

        /* Create and wake up the kthread once to put it in
         * TASK_INTERRUPTIBLE mode to avoid the blocked task
         * warning and work with loadavg.
         */
        n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
                                n->dev->name, n->napi_id);
        if (!IS_ERR(n->thread))
                return 0;

        err = PTR_ERR(n->thread);
        pr_err("kthread_run failed with err %d\n", err);
        n->thread = NULL;

        return err;
}

/* Bring @dev up without sending the RTM_NEWLINK / NETDEV_UP notifications
 * (dev_open() does that on top of this helper).
 *
 * Must be called with RTNL held. Steps, in order:
 *  - fail with -ENODEV if the device is not present, after giving a
 *    runtime-suspended parent one chance to resume;
 *  - block netpoll, run the NETDEV_PRE_UP notifiers (which may veto);
 *  - set __LINK_STATE_START, then call the driver's ndo_validate_addr()
 *    and ndo_open() when provided;
 *  - unblock netpoll;
 *  - on success set IFF_UP, load the rx mode and activate the device;
 *    on failure clear __LINK_STATE_START again.
 *
 * @extack is handed to the NETDEV_PRE_UP notifiers.
 *
 * Returns 0 on success or the negative errno from whichever step failed.
 */
static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int ret;

        ASSERT_RTNL();
        dev_addr_check(dev);

        if (!netif_device_present(dev)) {
                /* may be detached because parent is runtime-suspended */
                if (dev->dev.parent)
                        pm_runtime_resume(dev->dev.parent);
                if (!netif_device_present(dev))
                        return -ENODEV;
        }

        /* Block netpoll from trying to do any rx path servicing.
         * If we don't do this there is a chance ndo_poll_controller
         * or ndo_poll may be running while we open the device
         */
        netpoll_poll_disable(dev);

        ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
        ret = notifier_to_errno(ret);
        if (ret) {
                /* Undo netpoll_poll_disable() above; leaving netpoll
                 * blocked on this error path would unbalance the
                 * disable/enable pairing for later callers.
                 */
                netpoll_poll_enable(dev);
                return ret;
        }

        set_bit(__LINK_STATE_START, &dev->state);

        if (ops->ndo_validate_addr)
                ret = ops->ndo_validate_addr(dev);

        if (!ret && ops->ndo_open)
                ret = ops->ndo_open(dev);

        netpoll_poll_enable(dev);

        if (ret)
                clear_bit(__LINK_STATE_START, &dev->state);
        else {
                dev->flags |= IFF_UP;
                dev_set_rx_mode(dev);
                dev_activate(dev);
                add_device_randomness(dev->dev_addr, dev->addr_len);
        }

        return ret;
}

/**
 *        dev_open        - prepare an interface for use.
 *        @dev: device to open
 *        @extack: netlink extended ack
 *
 *        Takes a device from down to up state. The device's private open
 *        function is invoked and then the multicast lists are loaded. Finally
 *        the device is moved into the up state, an RTM_NEWLINK message is
 *        emitted and a %NETDEV_UP message is sent to the netdev notifier
 *        chain.
 *
 *        Calling this function on an active interface is a nop. On a failure
 *        a negative errno code is returned.
 */
int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
        int err = 0;

        if (!(dev->flags & IFF_UP)) {
                err = __dev_open(dev, extack);
                if (err >= 0) {
                        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING,
                                     GFP_KERNEL, 0, NULL);
                        call_netdevice_notifiers(NETDEV_UP, dev);
                }
        }

        return err;
}
EXPORT_SYMBOL(dev_open);

/* Take every device linked on @head (through close_list) down, without
 * sending RTM_NEWLINK or NETDEV_DOWN; dev_close_many() does that afterwards.
 *
 * Works in three stages so that all devices are handled together:
 *  1. per device: block netpoll, send NETDEV_GOING_DOWN and clear
 *     __LINK_STATE_START;
 *  2. dev_deactivate_many() on the whole list;
 *  3. per device: call ndo_stop() if provided, clear IFF_UP and unblock
 *     netpoll.
 *
 * Must be called with RTNL held; may sleep.
 */
static void __dev_close_many(struct list_head *head)
{
        struct net_device *dev;

        ASSERT_RTNL();
        might_sleep();

        list_for_each_entry(dev, head, close_list) {
                /* Temporarily disable netpoll until the interface is down */
                netpoll_poll_disable(dev);

                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

                clear_bit(__LINK_STATE_START, &dev->state);

                /* Synchronize to scheduled poll. We cannot touch poll list, it
                 * can be even on different cpu. So just clear netif_running().
                 *
                 * dev->stop() will invoke napi_disable() on all of it's
                 * napi_struct instances on this device.
                 */
                smp_mb__after_atomic(); /* Commit netif_running(). */
        }

        dev_deactivate_many(head);

        list_for_each_entry(dev, head, close_list) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /*
                 *        Call the device specific close. This cannot fail.
                 *        Only if device is UP
                 *
                 *        We allow it to be called even after a DETACH hot-plug
                 *        event.
                 */
                if (ops->ndo_stop)
                        ops->ndo_stop(dev);

                dev->flags &= ~IFF_UP;
                /* Pairs with netpoll_poll_disable() in the first loop. */
                netpoll_poll_enable(dev);
        }
}

/* Close a single device by running it through __dev_close_many() on a
 * temporary one-entry list. No IFF_UP check is made here and, unlike
 * dev_close(), no RTM_NEWLINK message or NETDEV_DOWN notification is sent.
 */
static void __dev_close(struct net_device *dev)
{
        LIST_HEAD(single);

        list_add(&dev->close_list, &single);
        __dev_close_many(&single);
        /* Detach the on-stack list head before it goes out of scope. */
        list_del(&single);
}

/* Close every device on @head (linked through close_list).
 *
 * Devices that are not IFF_UP are dropped from the list first. The remaining
 * ones are taken down with __dev_close_many(), after which an RTM_NEWLINK
 * message and a NETDEV_DOWN notification are sent for each of them. When
 * @unlink is true the devices are also removed from @head as they are
 * processed.
 */
void dev_close_many(struct list_head *head, bool unlink)
{
        struct net_device *dev, *next;

        /* Remove the devices that don't need to be closed */
        list_for_each_entry_safe(dev, next, head, close_list) {
                if (dev->flags & IFF_UP)
                        continue;

                list_del_init(&dev->close_list);
        }

        __dev_close_many(head);

        list_for_each_entry_safe(dev, next, head, close_list) {
                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING,
                             GFP_KERNEL, 0, NULL);
                call_netdevice_notifiers(NETDEV_DOWN, dev);

                if (unlink)
                        list_del_init(&dev->close_list);
        }
}
EXPORT_SYMBOL(dev_close_many);

/**
 *        dev_close - shutdown an interface.
 *        @dev: device to shutdown
 *
 *        This function moves an active device into down state. A
 *        %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *        is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *        chain. Calling it on a device that is not up does nothing.
 */
void dev_close(struct net_device *dev)
{
        LIST_HEAD(single);

        if (!(dev->flags & IFF_UP))
                return;

        /* Reuse the batch path with a temporary one-entry list. */
        list_add(&dev->close_list, &single);
        dev_close_many(&single, true);
        list_del(&single);
}
EXPORT_SYMBOL(dev_close);


/**
 *        dev_disable_lro - disable Large Receive Offload on a device
 *        @dev: device
 *
 *        Disable Large Receive Offload (LRO) on a net device.  Must be
 *        called under RTNL.  This is needed if received packets may be
 *        forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
        struct net_device *lower_dev;
        struct list_head *iter;

        dev->wanted_features &= ~NETIF_F_LRO;
        netdev_update_features(dev);

        if (unlikely(dev->features & NETIF_F_LRO))
                netdev_WARN(dev, "failed to disable LRO!\n");

        netdev_for_each_lower_dev(dev, lower_dev, iter)
                dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);

/**
 *        dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 *        @dev: device
 *
 *        Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 *        called under RTNL.  This is needed if Generic XDP is installed on
 *        the device.
 *
 *        Unlike dev_disable_lro(), lower devices are not touched.
 */
static void dev_disable_gro_hw(struct net_device *dev)
{
        /* Drop the feature from the wanted set, then recompute features. */
        dev->wanted_features &= ~NETIF_F_GRO_HW;
        netdev_update_features(dev);

        /* Warn if the feature is still set after the update. */
        if (unlikely(dev->features & NETIF_F_GRO_HW))
                netdev_WARN(dev, "failed to disable GRO_HW!\n");
}

/* Translate a netdev notifier command into its symbolic name.
 *
 * Returns the string "NETDEV_<name>" for every command listed in the switch
 * below and "UNKNOWN_NETDEV_EVENT" for anything else. The returned strings
 * are literals and must not be freed or modified.
 */
const char *netdev_cmd_to_name(enum netdev_cmd cmd)
{
/* N(FOO) expands to: case NETDEV_FOO: return "NETDEV_FOO"; */
#define N(val)                                                 \
        case NETDEV_##val:                                \
                return "NETDEV_" __stringify(val);
        switch (cmd) {
        N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
        N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
        N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
        N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
        N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
        N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
        N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
        N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
        N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
        N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
        N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
        N(XDP_FEAT_CHANGE)
        }
#undef N
        /* Reached only for commands without a case above. */
        return "UNKNOWN_NETDEV_EVENT";
}
EXPORT_SYMBOL_GPL(netdev_cmd_to_name);

/*
 * Invoke a single notifier block @nb for event @val on device @dev.
 *
 * A netdev_notifier_info carrying only the device pointer is built on the
 * stack (all other members are zero-initialised by the designated
 * initializer) and handed to the callback.  The callback's return value is
 * passed back unchanged.
 */
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
                                   struct net_device *dev)
{
        struct netdev_notifier_info info = {
                .dev = dev,
        };

        return nb->notifier_call(nb, val, &info);
}

/*
 * Replay the registration of @dev to one notifier block: NETDEV_REGISTER is
 * always sent, followed by NETDEV_UP when the device has IFF_UP set.
 *
 * Returns 0 on success, or the errno derived from the NETDEV_REGISTER
 * callback result.  The result of the NETDEV_UP callback is ignored.
 */
static int call_netdevice_register_notifiers(struct notifier_block *nb,
                                             struct net_device *dev)
{
        int rc;

        rc = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
        rc = notifier_to_errno(rc);
        if (rc)
                return rc;

        if (dev->flags & IFF_UP)
                call_netdevice_notifier(nb, NETDEV_UP, dev);

        return 0;
}

/*
 * Synthesize the teardown events of @dev for one notifier block.
 *
 * A device with IFF_UP set first gets NETDEV_GOING_DOWN and NETDEV_DOWN;
 * NETDEV_UNREGISTER is sent in every case.  Callback results are ignored.
 */
static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
                                                struct net_device *dev)
{
        if (dev->flags & IFF_UP) {
                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
                                        dev);
                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
        }
        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
}

/*
 * Replay register/up events for every device of @net to notifier @nb.
 *
 * On the first failure the devices that precede the failing one in the
 * iteration get the matching unregister events and the error is returned.
 * Returns 0 when all devices were replayed successfully.
 */
static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
                                                 struct net *net)
{
        struct net_device *ndev;
        int rc;

        for_each_netdev(net, ndev) {
                rc = call_netdevice_register_notifiers(nb, ndev);
                if (!rc)
                        continue;

                /* Walk back from the failing device and undo the replay. */
                for_each_netdev_continue_reverse(net, ndev)
                        call_netdevice_unregister_notifiers(nb, ndev);
                return rc;
        }

        return 0;
}

/*
 * Send the synthesized down/unregister events of every device in @net to
 * notifier @nb (see call_netdevice_unregister_notifiers()).
 */
static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
                                                    struct net *net)
{
        struct net_device *dev;

        for_each_netdev(net, dev)
                call_netdevice_unregister_notifiers(nb, dev);
}

/*
 * Starts out as 1.  While it is non-zero, the notifier registration helpers
 * in this file add the block to the chain but do not replay
 * NETDEV_REGISTER/NETDEV_UP for existing devices.
 * NOTE(review): presumably cleared once the networking core has finished
 * initialising - the writer is not visible in this part of the file.
 */
static int dev_boot_phase = 1;

/**
 * register_netdevice_notifier - register a network notifier block
 * @nb: notifier
 *
 * Register a notifier to be called when network device events occur.
 * The notifier passed is linked into the kernel structures and must
 * not be reused until it has been unregistered. A negative errno code
 * is returned on a failure.
 *
 * When registered all registration and up events are replayed
 * to the new notifier to allow device to have a race free
 * view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
        struct net *net;
        int err;

        /* Close race with setup_net() and cleanup_net() */
        down_write(&pernet_ops_rwsem);
        rtnl_lock();
        err = raw_notifier_chain_register(&netdev_chain, nb);
        if (err)
                goto unlock;
        if (dev_boot_phase)
                goto unlock;
        for_each_net(net) {
                err = call_netdevice_register_net_notifiers(nb, net);
                if (err)
                        goto rollback;
        }

unlock:
        rtnl_unlock();
        up_write(&pernet_ops_rwsem);
        return err;

rollback:
        for_each_net_continue_reverse(net)
                call_netdevice_unregister_net_notifiers(nb, net);

        raw_notifier_chain_unregister(&netdev_chain, nb);
        goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

/**
 * unregister_netdevice_notifier - unregister a network notifier block
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After a successful removal, unregister and down device events are
 * synthesized for all devices in all namespaces and delivered to the
 * removed notifier, so that it needs no special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
        struct net *net;
        int ret;

        /* Close race with setup_net() and cleanup_net() */
        down_write(&pernet_ops_rwsem);
        rtnl_lock();

        ret = raw_notifier_chain_unregister(&netdev_chain, nb);
        if (!ret) {
                for_each_net(net)
                        call_netdevice_unregister_net_notifiers(nb, net);
        }

        rtnl_unlock();
        up_write(&pernet_ops_rwsem);
        return ret;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/*
 * Add @nb to the per-namespace notifier chain of @net and, unless the boot
 * phase is still in progress, replay register/up events for the devices of
 * that namespace.
 *
 * When the replay fails and @ignore_call_fail is false, @nb is removed from
 * the chain again and the error is returned.  With @ignore_call_fail set, a
 * replay failure is tolerated and 0 is returned.  A failure to add @nb to
 * the chain is always returned.
 */
static int __register_netdevice_notifier_net(struct net *net,
                                             struct notifier_block *nb,
                                             bool ignore_call_fail)
{
        int rc = raw_notifier_chain_register(&net->netdev_chain, nb);

        /* rc is 0 here whenever only the boot phase check triggers. */
        if (rc || dev_boot_phase)
                return rc;

        rc = call_netdevice_register_net_notifiers(nb, net);
        if (!rc || ignore_call_fail)
                return 0;

        raw_notifier_chain_unregister(&net->netdev_chain, nb);
        return rc;
}

/*
 * Remove @nb from the per-namespace notifier chain of @net.  On success the
 * down/unregister events of every device in @net are synthesized for @nb.
 * Returns 0 on success or the error from the chain removal.
 */
static int __unregister_netdevice_notifier_net(struct net *net,
                                               struct notifier_block *nb)
{
        int rc = raw_notifier_chain_unregister(&net->netdev_chain, nb);

        if (!rc)
                call_netdevice_unregister_net_notifiers(nb, net);

        return rc;
}

/**
 * register_netdevice_notifier_net - register a per-netns network notifier block
 * @net: network namespace
 * @nb: notifier
 *
 * Register a notifier to be called when network device events occur.
 * The notifier passed is linked into the kernel structures and must
 * not be reused until it has been unregistered. A negative errno code
 * is returned on a failure.
 *
 * When registered all registration and up events are replayed
 * to the new notifier to allow it to have a race free
 * view of the network device list.
 *
 * The RTNL lock is taken around the registration.
 */

int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
{
        int err;

        rtnl_lock();
        /* false: a failed replay undoes the registration and is reported. */
        err = __register_netdevice_notifier_net(net, nb, false);
        rtnl_unlock();
        return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_net);

/**
 * unregister_netdevice_notifier_net - unregister a per-netns
 *                                     network notifier block
 * @net: network namespace
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier_net(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering unregister and down device events are synthesized
 * for all devices on the device list to the removed notifier to remove
 * the need for special case cleanup code.
 *
 * The RTNL lock is taken around the removal.
 */

int unregister_netdevice_notifier_net(struct net *net,
                                      struct notifier_block *nb)
{
        int err;

        rtnl_lock();
        err = __unregister_netdevice_notifier_net(net, nb);
        rtnl_unlock();
        return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);

/*
 * Move notifier @nb from the chain of @src_net to the chain of @dst_net.
 *
 * Both return values are dropped, and the registration is done with
 * ignore_call_fail set, so a failing event replay in @dst_net does not
 * undo the registration.
 */
static void __move_netdevice_notifier_net(struct net *src_net,
                                          struct net *dst_net,
                                          struct notifier_block *nb)
{
        __unregister_netdevice_notifier_net(src_net, nb);
        __register_netdevice_notifier_net(dst_net, nb, true);
}

/*
 * Register @nb on the notifier chain of the namespace @dev currently
 * belongs to and, on success, record it in @nn and link @nn into
 * @dev->net_notifier_list.
 *
 * Runs under the RTNL lock, which it takes itself.  Returns 0 on success
 * or the error from the registration, in which case @nn is left untouched.
 */
int register_netdevice_notifier_dev_net(struct net_device *dev,
                                        struct notifier_block *nb,
                                        struct netdev_net_notifier *nn)
{
        int ret;

        rtnl_lock();
        ret = __register_netdevice_notifier_net(dev_net(dev), nb, false);
        if (ret)
                goto unlock;

        nn->nb = nb;
        list_add(&nn->list, &dev->net_notifier_list);

unlock:
        rtnl_unlock();
        return ret;
}
EXPORT_SYMBOL(register_netdevice_notifier_dev_net);

/*
 * Counterpart of register_netdevice_notifier_dev_net(): unlink @nn from the
 * list it is on and remove @nb from the notifier chain of the namespace
 * @dev currently belongs to.
 *
 * Runs under the RTNL lock, which it takes itself.  @nn is unlinked before
 * the chain removal is attempted, so it stays unlinked even when an error
 * is returned.
 */
int unregister_netdevice_notifier_dev_net(struct net_device *dev,
                                          struct notifier_block *nb,
                                          struct netdev_net_notifier *nn)
{
        int err;

        rtnl_lock();
        list_del(&nn->list);
        err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
        rtnl_unlock();
        return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);

/*
 * Move every notifier recorded on @dev->net_notifier_list from the chain of
 * the namespace @dev currently belongs to over to the chain of @net.
 * NOTE(review): no locking is done here; presumably the caller holds RTNL -
 * confirm against the call site.
 */
static void move_netdevice_notifiers_dev_net(struct net_device *dev,
                                             struct net *net)
{
        struct netdev_net_notifier *nn;

        list_for_each_entry(nn, &dev->net_notifier_list, list)
                __move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
}

/**
 *        call_netdevice_notifiers_info - call all network notifier blocks
 *        @val: value passed unmodified to notifier function
 *        @info: notifier information data
 *
 *        Call the notifier blocks of the namespace @info->dev belongs to,
 *        followed by the global ones.  Parameters and return value are as
 *        for raw_notifier_call_chain().  Must be called under RTNL.
 */

int call_netdevice_notifiers_info(unsigned long val,
                                  struct netdev_notifier_info *info)
{
        struct net *net = dev_net(info->dev);
        int rc;

        ASSERT_RTNL();

        /* Run per-netns notifier block chain first, then run the global one.
         * Hopefully, one day, the global one is going to be removed after
         * all notifier block registrators get converted to be per-netns.
         *
         * A stop request from the per-netns chain skips the global chain.
         */
        rc = raw_notifier_call_chain(&net->netdev_chain, val, info);
        if (!(rc & NOTIFY_STOP_MASK))
                rc = raw_notifier_call_chain(&netdev_chain, val, info);

        return rc;
}

/**
 *        call_netdevice_notifiers_info_robust - call per-netns notifier blocks
 *                                               and roll back on error
 *        @val_up: value passed unmodified to notifier function
 *        @val_down: value passed unmodified to the notifier function when
 *                   recovering from an error on @val_up
 *        @info: notifier information data
 *
 *        Call all per-netns network notifier blocks, but not notifier blocks on
 *        the global notifier chain. Parameters and return value are as for
 *        raw_notifier_call_chain_robust().  Must be called under RTNL.
 */

static int
call_netdevice_notifiers_info_robust(unsigned long val_up,
                                     unsigned long val_down,
                                     struct netdev_notifier_info *info)
{
        /* Only the chain of the namespace owning info->dev is used. */
        struct net *net = dev_net(info->dev);

        ASSERT_RTNL();

        return raw_notifier_call_chain_robust(&net->netdev_chain,
                                              val_up, val_down, info);
}

/*
 * Call all network notifier blocks for event @val on @dev, making @extack
 * available to the callbacks through the notifier info.  The return value
 * is that of call_netdevice_notifiers_info().
 */
static int call_netdevice_notifiers_extack(unsigned long val,
                                           struct net_device *dev,
                                           struct netlink_ext_ack *extack)
{
        struct netdev_notifier_info info = {
                .dev = dev,
                .extack = extack,
        };

        return call_netdevice_notifiers_info(val, &info);
}

/**
 *        call_netdevice_notifiers - call all network notifier blocks
 *        @val: value passed unmodified to notifier function
 *        @dev: net_device pointer passed unmodified to notifier function
 *
 *        Call all network notifier blocks.  Parameters and return value
 *        are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
        /* No extended ack is available on this path. */
        return call_netdevice_notifiers_extack(val, dev, NULL);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

/**
 *        call_netdevice_notifiers_mtu - call all network notifier blocks
 *        @val: value passed unmodified to notifier function
 *        @dev: net_device pointer passed unmodified to notifier function
 *        @arg: additional u32 argument passed to the notifier function
 *
 *        Call all network notifier blocks.  Parameters and return value
 *        are as for raw_notifier_call_chain().  @arg is stored in the
 *        ext.mtu member of the extended notifier info.
 */
static int call_netdevice_notifiers_mtu(unsigned long val,
                                        struct net_device *dev, u32 arg)
{
        struct netdev_notifier_info_ext info = {
                .info.dev = dev,
                .ext.mtu = arg,
        };

        /* The base info must sit at offset 0 of the extended structure.
         * NOTE(review): presumably so that callbacks can convert the base
         * pointer they receive back to the extended type - confirm.
         */
        BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);

        return call_netdevice_notifiers_info(val, &info.info);
}

#ifdef CONFIG_NET_INGRESS
/* Static key, initially false, counted up and down by the helpers below. */
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);

/* Take one reference on ingress_needed_key via static_branch_inc(). */
void net_inc_ingress_queue(void)
{
        static_branch_inc(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one reference on ingress_needed_key via static_branch_dec(). */
void net_dec_ingress_queue(void)
{
        static_branch_dec(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

#ifdef CONFIG_NET_EGRESS
/* Static key, initially false, counted up and down by the helpers below. */
static DEFINE_STATIC_KEY_FALSE(egress_needed_key);

/* Take one reference on egress_needed_key via static_branch_inc(). */
void net_inc_egress_queue(void)
{
        static_branch_inc(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one reference on egress_needed_key via static_branch_dec(). */
void net_dec_egress_queue(void)
{
        static_branch_dec(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif

#ifdef CONFIG_NET_CLS_ACT
/* Static key, initially false, exported for users outside this file.
 * NOTE(review): going by the name it gates a TC bypass check; the users are
 * not visible here - confirm.
 */
DEFINE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key);
EXPORT_SYMBOL(tcf_bypass_check_needed_key);
#endif

/* Static key tested before packets are timestamped; initially false. */
DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
EXPORT_SYMBOL(netstamp_needed_key);
#ifdef CONFIG_JUMP_LABEL
/* Enable/disable requests not yet folded into netstamp_wanted. */
static atomic_t netstamp_needed_deferred;
/* Net number of timestamp users; drives the state of the static key. */
static atomic_t netstamp_wanted;

/*
 * Work handler: fold the accumulated deferred delta into netstamp_wanted
 * and switch netstamp_needed_key on when the resulting count is positive,
 * off otherwise.
 */
static void netstamp_clear(struct work_struct *work)
{
        int pending = atomic_xchg(&netstamp_needed_deferred, 0);

        if (atomic_add_return(pending, &netstamp_wanted) > 0) {
                static_branch_enable(&netstamp_needed_key);
                return;
        }

        static_branch_disable(&netstamp_needed_key);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif

/*
 * Register one more user of packet timestamps.
 *
 * With CONFIG_JUMP_LABEL: if the user count is already positive it is
 * incremented in place and nothing else happens.  Otherwise the request is
 * recorded in netstamp_needed_deferred and netstamp_work is scheduled, so
 * that netstamp_clear() updates the count and the static key later.
 * NOTE(review): the deferral is presumably needed because the static key
 * cannot be switched from every context this may be called from - confirm.
 *
 * Without CONFIG_JUMP_LABEL the key is incremented directly.
 */
void net_enable_timestamp(void)
{
#ifdef CONFIG_JUMP_LABEL
        int wanted = atomic_read(&netstamp_wanted);

        /* Retry while the count stays positive; a failed cmpxchg is
         * expected to refresh 'wanted', so the condition is re-tested.
         */
        while (wanted > 0) {
                if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
                        return;
        }
        atomic_inc(&netstamp_needed_deferred);
        schedule_work(&netstamp_work);
#else
        static_branch_inc(&netstamp_needed_key);
#endif
}
EXPORT_SYMBOL(net_enable_timestamp);

/*
 * Drop one user of packet timestamps.
 *
 * With CONFIG_JUMP_LABEL: if the user count is above 1, so that it stays
 * positive after the decrement, it is decremented in place.  Otherwise the
 * request is recorded in netstamp_needed_deferred and netstamp_work is
 * scheduled, so that netstamp_clear() updates the count and the static key
 * later.
 *
 * Without CONFIG_JUMP_LABEL the key is decremented directly.
 */
void net_disable_timestamp(void)
{
#ifdef CONFIG_JUMP_LABEL
        int wanted = atomic_read(&netstamp_wanted);

        /* Retry while the count stays above 1; a failed cmpxchg is
         * expected to refresh 'wanted', so the condition is re-tested.
         */
        while (wanted > 1) {
                if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
                        return;
        }
        atomic_dec(&netstamp_needed_deferred);
        schedule_work(&netstamp_work);
#else
        static_branch_dec(&netstamp_needed_key);
#endif
}
EXPORT_SYMBOL(net_disable_timestamp);

/*
 * Reset the timestamp of @skb: tstamp becomes 0 with clock type
 * SKB_CLOCK_REALTIME, and if netstamp_needed_key is enabled the skb is
 * then stamped with ktime_get_real().
 */
static inline void net_timestamp_set(struct sk_buff *skb)
{
        skb->tstamp = 0;
        skb->tstamp_type = SKB_CLOCK_REALTIME;
        if (static_branch_unlikely(&netstamp_needed_key))
                skb->tstamp = ktime_get_real();
}

/*
 * net_timestamp_check - stamp @SKB with the real time if it has no timestamp
 * @COND: extra condition; only evaluated while netstamp_needed_key is enabled
 * @SKB: skb pointer expression; evaluated more than once, so it must be free
 *       of side effects
 *
 * When netstamp_needed_key is enabled, @COND holds and @SKB has a zero
 * tstamp, the tstamp is set to ktime_get_real().  An already set tstamp is
 * left alone.
 *
 * Wrapped in do { } while (0) so that the macro behaves like a single
 * statement, including as the body of an if/else.
 */
#define net_timestamp_check(COND, SKB)                                \
        do {                                                        \
                if (static_branch_unlikely(&netstamp_needed_key)) {        \
                        if ((COND) && !(SKB)->tstamp)                \
                                (SKB)->tstamp = ktime_get_real();        \
                }                                                \
        } while (0)

/*
 * Exported wrapper around __is_skb_forwardable() with its third argument
 * fixed to true.
 * NOTE(review): that argument presumably selects the MTU/length check -
 * confirm against the helper's definition.
 */
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
        return __is_skb_forwardable(dev, skb, true);
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);

/*
 * Prepare @skb for reception on @dev without queueing it.
 *
 * Runs ____dev_forward_skb() with @check_mtu and, when that succeeds (0),
 * sets skb->protocol from eth_type_trans() and then calls
 * skb_postpull_rcsum() for the ETH_HLEN bytes of the Ethernet header.
 * The helper's return value is passed back unchanged.
 */
static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
                              bool check_mtu)
{
        int ret = ____dev_forward_skb(dev, skb, check_mtu);

        if (likely(!ret)) {
                skb->protocol = eth_type_trans(skb, dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        }

        return ret;
}

/*
 * Exported variant of __dev_forward_skb2() with check_mtu set to true.  The
 * skb is only prepared, not queued for reception.
 */
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
        return __dev_forward_skb2(dev, skb, true);
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *        NET_RX_SUCCESS        (no congestion)
 *        NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
        int ret = __dev_forward_skb(dev, skb);

        /* Only queue the skb when the preparation step succeeded. */
        if (ret)
                return ret;

        return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);

/*
 * Like dev_forward_skb(), but the preparation step is run with check_mtu
 * set to false.  Returns the non-zero result of the preparation step, or
 * otherwise the result of netif_rx_internal().
 */
int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
{
        int ret = __dev_forward_skb2(dev, skb, false);

        if (ret)
                return ret;

        return netif_rx_internal(skb);
}

/*
 * Hand @skb to the handler of @pt_prev while keeping it usable by the
 * caller.
 *
 * Returns -ENOMEM without calling the handler when skb_orphan_frags_rx()
 * fails.  Otherwise an extra reference is taken on @skb and the handler is
 * called with skb->dev as the receiving device; its result is returned.
 */
static inline int deliver_skb(struct sk_buff *skb,
                              struct packet_type *pt_prev,
                              struct net_device *orig_dev)
{
        if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
                return -ENOMEM;
        refcount_inc(&skb->users);
        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

/*
 * Walk @ptype_list and deliver @skb to every handler whose type equals
 * @type, always one step behind: the handler found previously is served
 * through deliver_skb() as soon as the next match is seen.
 *
 * @pt carries the pending handler in (may be NULL) and out: on return it
 * holds the last match, which has not been delivered to yet, or the
 * incoming value when nothing matched.
 */
static inline void deliver_ptype_list_skb(struct sk_buff *skb,
                                          struct packet_type **pt,
                                          struct net_device *orig_dev,
                                          __be16 type,
                                          struct list_head *ptype_list)
{
        struct packet_type *pending = *pt;
        struct packet_type *cur;

        list_for_each_entry_rcu(cur, ptype_list, list) {
                if (cur->type == type) {
                        if (pending)
                                deliver_skb(skb, pending, orig_dev);
                        pending = cur;
                }
        }

        *pt = pending;
}

/*
 * Tell whether @skb originates from the socket behind tap @ptype.
 *
 * Returns false when the tap has no af_packet_priv or the skb has no
 * socket.  Otherwise the tap's id_match() callback decides if it is set;
 * without one, af_packet_priv is compared against skb->sk.
 */
static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
        if (!ptype->af_packet_priv || !skb->sk)
                return false;

        if (ptype->id_match)
                return ptype->id_match(ptype, skb->sk);

        return (struct sock *)ptype->af_packet_priv == skb->sk;
}

/**
 * dev_nit_active - return true if any network interface taps are in use
 *
 * @dev: network device to check for the presence of taps
 *
 * Both the global tap list (net_hotdata.ptype_all) and the list of @dev
 * are checked.
 */
bool dev_nit_active(struct net_device *dev)
{
        /* Inactive only when the global and the per-device list are empty. */
        return !(list_empty(&net_hotdata.ptype_all) &&
                 list_empty(&dev->ptype_all));
}
EXPORT_SYMBOL_GPL(dev_nit_active);

/*
 *        Support routine. Sends outgoing frames to any network
 *        taps currently in use.
 *
 *        The global tap list (net_hotdata.ptype_all) is walked first, then
 *        the list of @dev.  Taps never receive @skb itself but a clone that
 *        is created once, when the first eligible tap is found.  Delivery
 *        runs one tap behind: every tap but the last is served through
 *        deliver_skb(), which takes an extra reference on the clone, and the
 *        last one is handed the clone directly at the end.
 */

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
        struct list_head *ptype_list = &net_hotdata.ptype_all;
        struct packet_type *ptype, *pt_prev = NULL;
        struct sk_buff *skb2 = NULL;

        rcu_read_lock();
again:
        list_for_each_entry_rcu(ptype, ptype_list, list) {
                /* Skip taps that asked not to see outgoing packets. */
                if (READ_ONCE(ptype->ignore_outgoing))
                        continue;

                /* Never send packets back to the socket
                 * they originated from - MvS (miquels@drinkel.ow.org)
                 */
                if (skb_loop_sk(ptype, skb))
                        continue;

                /* A tap is already pending, so the clone exists: serve the
                 * pending tap and remember this one instead.
                 */
                if (pt_prev) {
                        deliver_skb(skb2, pt_prev, skb->dev);
                        pt_prev = ptype;
                        continue;
                }

                /* need to clone skb, done only once */
                skb2 = skb_clone(skb, GFP_ATOMIC);
                /* pt_prev is still NULL here, so nothing gets delivered. */
                if (!skb2)
                        goto out_unlock;

                net_timestamp_set(skb2);

                /* skb->nh should be correctly
                 * set by sender, so that the second statement is
                 * just protection against buggy protocols.
                 */
                skb_reset_mac_header(skb2);

                if (skb_network_header(skb2) < skb2->data ||
                    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
                        net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
                                             ntohs(skb2->protocol),
                                             dev->name);
                        skb_reset_network_header(skb2);
                }

                skb2->transport_header = skb2->network_header;
                skb2->pkt_type = PACKET_OUTGOING;
                pt_prev = ptype;
        }

        /* Second pass: repeat the walk on the per-device tap list. */
        if (ptype_list == &net_hotdata.ptype_all) {
                ptype_list = &dev->ptype_all;
                goto again;
        }
out_unlock:
        /* The last pending tap gets the clone without an extra reference;
         * if its frags cannot be orphaned the clone is freed instead.
         */
        if (pt_prev) {
                if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
                        pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
                else
                        kfree_skb(skb2);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);

/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this, verify that each tc mapping remains valid and,
 * if it does not, reset the priority mapping that refers to it. With no
 * priorities mapping to this offset/count pair it will no longer be used.
 * In the worst case TC0 is invalid and nothing can be done, so priority
 * mappings are disabled. It is expected that drivers will fix this mapping
 * if they can before calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
        struct netdev_tc_txq *range = &dev->tc_to_txq[0];
        int prio;

        /* If TC0 is invalidated disable TC mapping */
        if (range->offset + range->count > txq) {
                netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
                dev->num_tc = 0;
                return;
        }

        /* Invalidated prio to tc mappings set to TC0 */
        for (prio = 1; prio <= TC_BITMASK; prio++) {
                int tc_idx = netdev_get_prio_tc_map(dev, prio);

                range = &dev->tc_to_txq[tc_idx];
                if (range->offset + range->count <= txq)
                        continue;

                netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
                            prio, tc_idx);
                netdev_set_prio_tc_map(dev, prio, 0);
        }
}

/*
 * Map tx queue index @txq of @dev to the traffic class it belongs to.
 *
 * Returns 0 when the device has no traffic classes configured, the index
 * of the first class whose offset/count range contains @txq otherwise, or
 * -1 when no class matches.
 */
int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
{
        struct netdev_tc_txq *range;
        int tc_idx;

        if (!dev->num_tc)
                return 0;

        /* walk through the TCs and see if it falls into any of them */
        range = &dev->tc_to_txq[0];
        for (tc_idx = 0; tc_idx < TC_MAX_QUEUE; tc_idx++, range++) {
                /* Unsigned subtraction: a txq below the offset wraps to a
                 * huge value and therefore fails the range test as well.
                 */
                if ((txq - range->offset) < range->count)
                        return tc_idx;
        }

        /* didn't find it, just return -1 to indicate no match */
        return -1;
}
EXPORT_SYMBOL(netdev_txq_to_tc);

#ifdef CONFIG_XPS
/* Static keys for XPS: in this file both are incremented when a device gets
 * its first map of a type (xps_rxqs_needed only for XPS_RXQS) and
 * decremented again in reset_xps_maps().
 */
static struct static_key xps_needed __read_mostly;
static struct static_key xps_rxqs_needed __read_mostly;
/* Taken around updates of the XPS maps in this file. */
static DEFINE_MUTEX(xps_map_mutex);
/* Dereference an RCU-protected XPS pointer on the update side; lockdep
 * checks that xps_map_mutex is held.
 */
#define xmap_dereference(P)                \
        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))

/*
 * Remove tx queue @index from the map stored at @dev_maps->attr_map[@tci].
 *
 * Returns true when a map is still installed at @tci afterwards: either the
 * queue was not listed, or it was removed from a map holding more than one
 * entry.  Returns false when there was no map, or when @index was the only
 * entry; in that case the slot is cleared in @dev_maps (and in @old_maps,
 * if given) and the map is released with kfree_rcu().
 */
static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
                             struct xps_dev_maps *old_maps, int tci, u16 index)
{
        struct xps_map *map = NULL;
        int pos;

        map = xmap_dereference(dev_maps->attr_map[tci]);
        if (!map)
                return false;

        /* Scan from the last entry down to the first. */
        for (pos = map->len; pos--;) {
                if (map->queues[pos] != index)
                        continue;

                /* Fill the hole with the last entry and shrink the map. */
                if (map->len > 1) {
                        map->queues[pos] = map->queues[--map->len];
                        break;
                }

                /* Last entry removed: drop the whole map. */
                if (old_maps)
                        RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
                RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
                kfree_rcu(map, rcu);
                return false;
        }

        return true;
}

/*
 * Remove the tx queues [@offset, @offset + @count) from all traffic class
 * maps of one CPU/rx-queue id (@cpu) in @dev_maps.
 *
 * Returns true if, for at least one traffic class, every removal reported
 * that a map is still installed (see remove_xps_queue()).  The removal for
 * a traffic class stops at the first queue for which no map is left.
 */
static bool remove_xps_queue_cpu(struct net_device *dev,
                                 struct xps_dev_maps *dev_maps,
                                 int cpu, u16 offset, u16 count)
{
        int num_tc = dev_maps->num_tc;
        bool active = false;
        int tci;

        /* The maps of one id occupy num_tc consecutive slots. */
        for (tci = cpu * num_tc; num_tc--; tci++) {
                int i, j;

                for (i = count, j = offset; i--; j++) {
                        if (!remove_xps_queue(dev_maps, NULL, tci, j))
                                break;
                }

                /* i only reaches -1 when the loop ran to completion. */
                active |= i < 0;
        }

        return active;
}

/*
 * Detach the XPS maps of type @type from @dev and release @dev_maps.
 *
 * Drops the static key reference(s) held for the maps (xps_needed always,
 * xps_rxqs_needed for XPS_RXQS), clears dev->xps_maps[@type] and frees
 * @dev_maps with kfree_rcu().  The *_cpuslocked key helpers are used.
 * NOTE(review): these presumably require the caller to hold
 * cpus_read_lock() - confirm.
 */
static void reset_xps_maps(struct net_device *dev,
                           struct xps_dev_maps *dev_maps,
                           enum xps_map_type type)
{
        static_key_slow_dec_cpuslocked(&xps_needed);
        if (type == XPS_RXQS)
                static_key_slow_dec_cpuslocked(&xps_rxqs_needed);

        RCU_INIT_POINTER(dev->xps_maps[type], NULL);

        kfree_rcu(dev_maps, rcu);
}

/*
 * Remove the tx queues [@offset, @offset + @count) from the XPS maps of
 * type @type of @dev.
 *
 * Does nothing when the device has no maps of that type.  When no id keeps
 * an active map afterwards, the whole map set is dropped through
 * reset_xps_maps().  For XPS_CPUS the NUMA node of every affected tx queue
 * is additionally reset to NUMA_NO_NODE.
 */
static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
                           u16 offset, u16 count)
{
        struct xps_dev_maps *dev_maps;
        bool active = false;
        int i, j;

        dev_maps = xmap_dereference(dev->xps_maps[type]);
        if (!dev_maps)
                return;

        for (j = 0; j < dev_maps->nr_ids; j++)
                active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
        if (!active)
                reset_xps_maps(dev, dev_maps, type);

        if (type == XPS_CPUS) {
                /* Walks the range from its last queue down to @offset. */
                for (i = offset + (count - 1); count--; i--)
                        netdev_queue_numa_node_write(
                                netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
        }
}

/*
 * Remove the tx queues [@offset, @offset + @count) of @dev from all XPS
 * maps.
 *
 * Returns immediately while the xps_needed key is off.  Otherwise the
 * cleanup runs under cpus_read_lock() and xps_map_mutex; the rx-queue maps
 * are only visited while xps_rxqs_needed is on, the CPU maps always.
 */
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
                                   u16 count)
{
        if (!static_key_false(&xps_needed))
                return;

        cpus_read_lock();
        mutex_lock(&xps_map_mutex);

        if (static_key_false(&xps_rxqs_needed))
                clean_xps_maps(dev, XPS_RXQS, offset, count);

        clean_xps_maps(dev, XPS_CPUS, offset, count);

        mutex_unlock(&xps_map_mutex);
        cpus_read_unlock();
}

/*
 * Remove all tx queues of @dev from @index up to dev->num_tx_queues - 1
 * from the XPS maps.
 */
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
        netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
}

/*
 * Make sure there is a map with room for tx queue @index.
 *
 * @map is returned unchanged when it already lists @index or still has a
 * free slot.  Otherwise a new zeroed map is allocated - XPS_MIN_MAP_ALLOC
 * entries when @map is NULL, twice the old capacity otherwise - and the
 * existing entries are copied over.  @index itself is not added here.
 *
 * For CPU maps (@is_rxqs_map false) the allocation is placed on the NUMA
 * node of CPU @attr_index.  Returns NULL when the allocation fails.
 */
static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
                                      u16 index, bool is_rxqs_map)
{
        int alloc_len = XPS_MIN_MAP_ALLOC;
        struct xps_map *new_map;
        int i, pos = 0;

        if (map) {
                /* Already listed: the map can be used as it is. */
                for (; pos < map->len; pos++) {
                        if (map->queues[pos] == index)
                                return map;
                }

                /* Need to add tx-queue to this CPU's/rx-queue's existing map */
                if (pos < map->alloc_len)
                        return map;

                alloc_len = map->alloc_len * 2;
        }

        /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
         *  map
         */
        if (is_rxqs_map)
                new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
        else
                new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
                                       cpu_to_node(attr_index));
        if (!new_map)
                return NULL;

        /* pos is 0 without an old map, so nothing is copied in that case. */
        for (i = 0; i < pos; i++)
                new_map->queues[i] = map->queues[i];
        new_map->alloc_len = alloc_len;
        new_map->len = pos;

        return new_map;
}

/*
 * Copy xps maps at a given index
 *
 * The map pointers of all traffic classes of id @index are carried over
 * from @dev_maps to @new_dev_maps.  With @skip_tc set, the slot of traffic
 * class @tc is left untouched in @new_dev_maps.
 */
static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
                              struct xps_dev_maps *new_dev_maps, int index,
                              int tc, bool skip_tc)
{
        int tci = index * dev_maps->num_tc;
        struct xps_map *shared;
        int i;

        /* copy maps belonging to foreign traffic classes */
        for (i = 0; i < dev_maps->num_tc; i++, tci++) {
                if (skip_tc && i == tc)
                        continue;

                /* fill in the new device map from the old device map */
                shared = xmap_dereference(dev_maps->attr_map[tci]);
                RCU_INIT_POINTER(new_dev_maps->attr_map[tci], shared);
        }
}

/* Must be called under cpus_read_lock */
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
                          u16 index, enum xps_map_type type)
{
        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
        const unsigned long *online_mask = NULL;
        bool active = false, copy = false;
        int i, j, tci, numa_node_id = -2;
        int maps_sz, num_tc = 1, tc = 0;
        struct xps_map *map, *new_map;
        unsigned int nr_ids;

        WARN_ON_ONCE(index >= dev->num_tx_queues);

        if (dev->num_tc) {
                /* Do not allow XPS on subordinate device directly */
                num_tc = dev->num_tc;
                if (num_tc < 0)
                        return -EINVAL;

                /* If queue belongs to subordinate dev use its map */
                dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;

                tc = netdev_txq_to_tc(dev, index);
                if (tc < 0)
                        return -EINVAL;
        }

        mutex_lock(&xps_map_mutex);

        dev_maps = xmap_dereference(dev->xps_maps[type]);
        if (type == XPS_RXQS) {
                maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
                nr_ids = dev->num_rx_queues;
        } else {
                maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
                if (num_possible_cpus() > 1)
                        online_mask = cpumask_bits(cpu_online_mask);
                nr_ids = nr_cpu_ids;
        }

        if (maps_sz < L1_CACHE_BYTES)
                maps_sz = L1_CACHE_BYTES;

        /* The old dev_maps could be larger or smaller than the one we're
         * setting up now, as dev->num_tc or nr_ids could have been updated in
         * between. We could try to be smart, but let's be safe instead and only
         * copy foreign traffic classes if the two map sizes match.
         */
        if (dev_maps &&
            dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
                copy = true;

        /* allocate memory for queue storage */
        for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
             j < nr_ids;) {
                if (!new_dev_maps) {
                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
                        if (!new_dev_maps) {
                                mutex_unlock(&xps_map_mutex);
                                return -ENOMEM;
                        }

                        new_dev_maps->nr_ids = nr_ids;
                        new_dev_maps->num_tc = num_tc;
                }

                tci = j * num_tc + tc;
                map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;

                map = expand_xps_map(map, j, index, type == XPS_RXQS);
                if (!map)
                        goto error;

                RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
        }

        if (!new_dev_maps)
                goto out_no_new_maps;

        if (!dev_maps) {
                /* Increment static keys at most once per type */
                static_key_slow_inc_cpuslocked(&xps_needed);
                if (type == XPS_RXQS)
                        static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
        }

        for (j = 0; j < nr_ids; j++) {
                bool skip_tc = false;

                tci = j * num_tc + tc;
                if (netif_attr_test_mask(j, mask, nr_ids) &&
                    netif_attr_test_online(j, online_mask, nr_ids)) {
                        /* add tx-queue to CPU/rx-queue maps */
                        int pos = 0;

                        skip_tc = true;

                        map = xmap_dereference(new_dev_maps->attr_map[tci]);
                        while ((pos < map->len) && (map->queues[pos] != index))
                                pos++;

                        if (pos == map->len)
                                map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
                        if (type == XPS_CPUS) {
                                if (numa_node_id == -2)
                                        numa_node_id = cpu_to_node(j);
                                else if (numa_node_id != cpu_to_node(j))
                                        numa_node_id = -1;
                        }
#endif
                }

                if (copy)
                        xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
                                          skip_tc);
        }

        rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);

        /* Cleanup old maps */
        if (!dev_maps)
                goto out_no_old_maps;

        for (j = 0; j < dev_maps->nr_ids; j++) {
                for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
                        map = xmap_dereference(dev_maps->attr_map[tci]);
                        if (!map)
                                continue;

                        if (copy) {
                                new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
                                if (map == new_map)
                                        continue;
                        }

                        RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
                        kfree_rcu(map, rcu);
                }
        }

        old_dev_maps = dev_maps;

out_no_old_maps:
        dev_maps = new_dev_maps;
        active = true;

out_no_new_maps:
        if (type == XPS_CPUS)
                /* update Tx queue numa node */
                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
                                             (numa_node_id >= 0) ?
                                             numa_node_id : NUMA_NO_NODE);

        if (!dev_maps)
                goto out_no_maps;

        /* removes tx-queue from unused CPUs/rx-queues */
        for (j = 0; j < dev_maps->nr_ids; j++) {
                tci = j * dev_maps->num_tc;

                for (i = 0; i < dev_maps->num_tc; i++, tci++) {
                        if (i == tc &&
                            netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
                            netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
                                continue;

                        active |= remove_xps_queue(dev_maps,
                                                   copy ? old_dev_maps : NULL,
                                                   tci, index);
                }
        }

        if (old_dev_maps)
                kfree_rcu(old_dev_maps, rcu);

        /* free map if not active */
        if (!active)
                reset_xps_maps(dev, dev_maps, type);

out_no_maps:
        mutex_unlock(&xps_map_mutex);

        return 0;
error:
        /* remove any maps that we added */
        for (j = 0; j < nr_ids; j++) {
                for (i = num_tc, tci = j * num_tc; i--; tci++) {
                        new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
                        map = copy ?
                              xmap_dereference(dev_maps->attr_map[tci]) :
                              NULL;
                        if (new_map && new_map != map)
                                kfree(new_map);
                }
        }

        mutex_unlock(&xps_map_mutex);

        kfree(new_dev_maps);
        return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__netif_set_xps_queue);

/* Install the CPU mask @mask as the XPS map of Tx queue @index on @dev.
 * The CPU hotplug read lock is held around the update; the return value
 * is whatever __netif_set_xps_queue() reports.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
                        u16 index)
{
        int rc;

        cpus_read_lock();
        rc = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
        cpus_read_unlock();

        return rc;
}
EXPORT_SYMBOL(netif_set_xps_queue);

#endif
/* Unbind every subordinate device that is still attached to one of @dev's
 * Tx queues, visiting the queues from the last one down to the first.
 */
static void netdev_unbind_all_sb_channels(struct net_device *dev)
{
        unsigned int qidx = dev->num_tx_queues;

        while (qidx--) {
                struct netdev_queue *txq = &dev->_tx[qidx];

                if (txq->sb_dev)
                        netdev_unbind_sb_channel(dev, txq->sb_dev);
        }
}

/* Drop all traffic-class state of @dev: reset XPS queues (when XPS is
 * built in), unbind subordinate channels, then clear the TC count and
 * both TC lookup tables.
 */
void netdev_reset_tc(struct net_device *dev)
{
#ifdef CONFIG_XPS
        netif_reset_xps_queues_gt(dev, 0);
#endif
        netdev_unbind_all_sb_channels(dev);

        /* Back to "no traffic classes": zero count, empty tables */
        dev->num_tc = 0;
        memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
        memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
}
EXPORT_SYMBOL(netdev_reset_tc);

/* Record the Tx queue range (@offset, @count) used by traffic class @tc
 * of @dev. Returns -EINVAL when @tc is not below dev->num_tc, else 0.
 */
int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
{
        if (tc >= dev->num_tc)
                return -EINVAL;

#ifdef CONFIG_XPS
        netif_reset_xps_queues(dev, offset, count);
#endif
        dev->tc_to_txq[tc].offset = offset;
        dev->tc_to_txq[tc].count = count;

        return 0;
}
EXPORT_SYMBOL(netdev_set_tc_queue);

/* Set the number of traffic classes of @dev to @num_tc.
 * Returns -EINVAL when @num_tc exceeds TC_MAX_QUEUE, else 0.
 */
int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
{
        if (num_tc > TC_MAX_QUEUE)
                return -EINVAL;

        /* Existing XPS state and subordinate channel bindings are reset
         * before the new TC count is stored.
         */
#ifdef CONFIG_XPS
        netif_reset_xps_queues_gt(dev, 0);
#endif
        netdev_unbind_all_sb_channels(dev);

        dev->num_tc = num_tc;

        return 0;
}
EXPORT_SYMBOL(netdev_set_num_tc);

/* Detach subordinate device @sb_dev from @dev: wipe @sb_dev's TC tables
 * and clear the sb_dev back-pointer of every Tx queue of @dev that still
 * refers to it.
 */
void netdev_unbind_sb_channel(struct net_device *dev,
                              struct net_device *sb_dev)
{
        unsigned int qidx = dev->num_tx_queues;

#ifdef CONFIG_XPS
        netif_reset_xps_queues_gt(sb_dev, 0);
#endif
        memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
        memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));

        /* Walk the queues from last to first */
        while (qidx--) {
                struct netdev_queue *txq = &dev->_tx[qidx];

                if (txq->sb_dev == sb_dev)
                        txq->sb_dev = NULL;
        }
}
EXPORT_SYMBOL(netdev_unbind_sb_channel);

/* Bind @count Tx queues of @dev, starting at @offset, to traffic class
 * @tc of the subordinate device @sb_dev.
 *
 * Returns 0 on success or -EINVAL when @sb_dev has a non-negative num_tc,
 * @tc is not below dev->num_tc, or the requested range exceeds
 * dev->real_num_tx_queues.
 */
int netdev_bind_sb_channel_queue(struct net_device *dev,
                                 struct net_device *sb_dev,
                                 u8 tc, u16 count, u16 offset)
{
        u16 left;

        /* Both devices have to be configured already */
        if (sb_dev->num_tc >= 0)
                return -EINVAL;
        if (tc >= dev->num_tc)
                return -EINVAL;

        /* Never hand out queues that do not exist */
        if ((offset + count) > dev->real_num_tx_queues)
                return -EINVAL;

        /* Remember the range on the subordinate device */
        sb_dev->tc_to_txq[tc].count = count;
        sb_dev->tc_to_txq[tc].offset = offset;

        /* Point each queue of the range (highest index first) back at
         * sb_dev so the Tx queue can find its tc_to_txq or XPS map.
         */
        for (left = count; left; left--)
                netdev_get_tx_queue(dev, (left - 1) + offset)->sb_dev = sb_dev;

        return 0;
}
EXPORT_SYMBOL(netdev_bind_sb_channel_queue);

/* Mark @dev as subordinate channel @channel by storing the negated
 * channel number in dev->num_tc.
 *
 * Returns -ENODEV for multiqueue devices, -EINVAL when @channel is above
 * S16_MAX, else 0.
 */
int netdev_set_sb_channel(struct net_device *dev, u16 channel)
{
        /* A multiqueue device must not represent a subordinate channel */
        if (netif_is_multiqueue(dev))
                return -ENODEV;

        /* Valid subordinate channels are 1 - 32767. Channel 0 stands for
         * "native" mode of the main root device; writing 0 is accepted so
         * a device can be switched back to normal mode after having been
         * used as a subordinate channel.
         */
        if (channel > S16_MAX)
                return -EINVAL;

        dev->num_tc = -channel;

        return 0;
}
EXPORT_SYMBOL(netdev_set_sb_channel);

/*
 * Helper for changing real_num_tx_queues. When the count shrinks, stale
 * skbs on the qdisc must be flushed so that no skb stays mapped to a queue
 * beyond real_num_tx_queues.
 *
 * Returns 0 on success, -EINVAL for an out-of-range @txq, or the error
 * reported by netdev_queue_update_kobjects().
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
        bool shrinking;
        int err;

        shrinking = txq < dev->real_num_tx_queues;

        if (txq < 1 || txq > dev->num_tx_queues)
                return -EINVAL;

        /* Device is neither registered nor unregistering: only the
         * counter itself needs updating.
         */
        if (dev->reg_state != NETREG_REGISTERED &&
            dev->reg_state != NETREG_UNREGISTERING) {
                dev->real_num_tx_queues = txq;
                return 0;
        }

        ASSERT_RTNL();

        err = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, txq);
        if (err)
                return err;

        if (dev->num_tc)
                netif_setup_tc(dev, txq);

        dev_qdisc_change_real_num_tx(dev, txq);

        dev->real_num_tx_queues = txq;

        if (!shrinking)
                return 0;

        synchronize_net();
        qdisc_reset_all_tx_gt(dev, txq);
#ifdef CONFIG_XPS
        netif_reset_xps_queues_gt(dev, txq);
#endif

        return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);

#ifdef CONFIG_SYSFS
/**
 * netif_set_real_num_rx_queues - set actual number of RX queues used
 * @dev: Network device
 * @rxq: Actual number of RX queues
 *
 * Call this either with the rtnl_lock held or before the net device is
 * registered.  Returns 0 on success or a negative error code: -EINVAL
 * when @rxq is zero or above dev->num_rx_queues, otherwise whatever
 * net_rx_queue_update_kobjects() reports for a registered device.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
        if (rxq < 1 || rxq > dev->num_rx_queues)
                return -EINVAL;

        if (dev->reg_state == NETREG_REGISTERED) {
                int err;

                ASSERT_RTNL();

                err = net_rx_queue_update_kobjects(dev,
                                                   dev->real_num_rx_queues,
                                                   rxq);
                if (err)
                        return err;
        }

        dev->real_num_rx_queues = rxq;

        return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif

/**
 * netif_set_real_num_queues - set actual number of RX and TX queues used
 * @dev: Network device
 * @txq: Actual number of TX queues
 * @rxq: Actual number of RX queues
 *
 * Update the real number of both TX and RX queues in one call. A count
 * that already matches is left untouched. Returns 0 on success, -EINVAL
 * when either count is out of range, or the error of a failed increase.
 */
int netif_set_real_num_queues(struct net_device *dev,
                              unsigned int txq, unsigned int rxq)
{
        unsigned int old_rxq = dev->real_num_rx_queues;
        int err;

        if (txq < 1 || txq > dev->num_tx_queues ||
            rxq < 1 || rxq > dev->num_rx_queues)
                return -EINVAL;

        /* Grow first. Decreases cannot fail, so the error path never has
         * to do anything but a decrease.
         */
        if (rxq > dev->real_num_rx_queues) {
                err = netif_set_real_num_rx_queues(dev, rxq);
                if (err)
                        return err;
        }
        if (txq > dev->real_num_tx_queues) {
                err = netif_set_real_num_tx_queues(dev, txq);
                if (err) {
                        /* Roll the RX count back to where it started */
                        WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
                        return err;
                }
        }

        /* Then shrink whichever side is still above its target */
        if (rxq < dev->real_num_rx_queues)
                WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
        if (txq < dev->real_num_tx_queues)
                WARN_ON(netif_set_real_num_tx_queues(dev, txq));

        return 0;
}
EXPORT_SYMBOL(netif_set_real_num_queues);

/**
 * netif_set_tso_max_size() - set the max size of TSO frames supported
 * @dev:        netdev to update
 * @size:        max skb->len of a TSO frame
 *
 * Set the upper bound on the size of TSO super-frames the device can
 * handle; the stored limit is capped at %GSO_MAX_SIZE. Current GSO limits
 * larger than @size are lowered to @size as well.
 * Unless explicitly set the stack will assume the value of
 * %GSO_LEGACY_MAX_SIZE.
 */
void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
{
        dev->tso_max_size = min(GSO_MAX_SIZE, size);

        if (READ_ONCE(dev->gso_max_size) > size)
                netif_set_gso_max_size(dev, size);

        if (READ_ONCE(dev->gso_ipv4_max_size) > size)
                netif_set_gso_ipv4_max_size(dev, size);
}
EXPORT_SYMBOL(netif_set_tso_max_size);

/**
 * netif_set_tso_max_segs() - set the max number of segs supported for TSO
 * @dev:        netdev to update
 * @segs:        max number of TCP segments
 *
 * Set the upper bound on the number of TCP segments the device can
 * generate from a single TSO super-frame. A current GSO segment limit
 * above @segs is lowered to @segs.
 * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
 */
void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
{
        dev->tso_max_segs = segs;

        if (READ_ONCE(dev->gso_max_segs) > segs)
                netif_set_gso_max_segs(dev, segs);
}
EXPORT_SYMBOL(netif_set_tso_max_segs);

/**
 * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
 * @to:                netdev to update
 * @from:        netdev from which to copy the limits
 *
 * Applies @from's tso_max_size and tso_max_segs to @to through
 * netif_set_tso_max_size() and netif_set_tso_max_segs(), so the GSO
 * limits of @to are lowered along the way where those helpers do so.
 */
void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
{
        netif_set_tso_max_size(to, from->tso_max_size);
        netif_set_tso_max_segs(to, from->tso_max_segs);
}
EXPORT_SYMBOL(netif_inherit_tso_max);

/**
 * netif_get_num_default_rss_queues - default number of RSS queues
 *
 * Counts one online CPU per group of topology siblings. The result is
 * that count when it is 1 or 2, and half of it (rounded up) when it is
 * larger. Returns 1 in a kdump kernel or when the scratch cpumask cannot
 * be allocated.
 */
int netif_get_num_default_rss_queues(void)
{
        cpumask_var_t cpus;
        int cpu, cores = 0;

        if (unlikely(is_kdump_kernel()))
                return 1;
        if (unlikely(!zalloc_cpumask_var(&cpus, GFP_KERNEL)))
                return 1;

        cpumask_copy(cpus, cpu_online_mask);
        for_each_cpu(cpu, cpus) {
                ++cores;
                /* Remove this CPU's siblings so they are not counted again */
                cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
        }
        free_cpumask_var(cpus);

        if (cores <= 2)
                return cores;

        return DIV_ROUND_UP(cores, 2);
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);

/* Queue @q on the current CPU's softnet_data output queue and raise
 * NET_TX_SOFTIRQ. Local interrupts are disabled while the list is updated.
 */
static void __netif_reschedule(struct Qdisc *q)
{
        struct softnet_data *sd;
        unsigned long flags;

        local_irq_save(flags);
        sd = this_cpu_ptr(&softnet_data);
        /* Append @q at the tail: terminate it, link it in through the
         * current tail pointer, then move the tail pointer to @q's own
         * next_sched slot.
         */
        q->next_sched = NULL;
        *sd->output_queue_tailp = q;
        sd->output_queue_tailp = &q->next_sched;
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_restore(flags);
}

/* Schedule @q for transmission unless __QDISC_STATE_SCHED was already set;
 * the bit is set atomically so @q is queued at most once per scheduling.
 */
void __netif_schedule(struct Qdisc *q)
{
        if (test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
                return;

        __netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

/* Layout overlaid on skb->cb (see get_kfree_skb_cb()) to carry the drop
 * reason recorded by dev_kfree_skb_irq_reason().
 */
struct dev_kfree_skb_cb {
        enum skb_drop_reason reason;
};

/* View the control buffer of @skb as a struct dev_kfree_skb_cb */
static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
        struct dev_kfree_skb_cb *cb = (struct dev_kfree_skb_cb *)skb->cb;

        return cb;
}

/* Schedule the qdisc attached to @txq, unless transmission on @txq is
 * stopped. The qdisc pointer is looked up under the RCU read lock.
 */
void netif_schedule_queue(struct netdev_queue *txq)
{
        rcu_read_lock();
        if (!netif_xmit_stopped(txq))
                __netif_schedule(rcu_dereference(txq->qdisc));
        rcu_read_unlock();
}
EXPORT_SYMBOL(netif_schedule_queue);

/* Clear __QUEUE_STATE_DRV_XOFF on @dev_queue and, only if the bit was
 * actually set, schedule the queue's qdisc.
 */
void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
        if (!test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state))
                return;

        rcu_read_lock();
        __netif_schedule(rcu_dereference(dev_queue->qdisc));
        rcu_read_unlock();
}
EXPORT_SYMBOL(netif_tx_wake_queue);

/* Drop one reference on @skb and, if it was the last one, defer the skb
 * to this CPU's softnet_data completion queue with @reason recorded in
 * its control buffer, then raise NET_TX_SOFTIRQ.
 * NOTE(review): the queued skb is presumably freed by the NET_TX softirq
 * handler; that code is outside this block.
 *
 * A NULL @skb is ignored.
 */
void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
        unsigned long flags;

        if (unlikely(!skb))
                return;

        /* Sole owner: set the count to zero directly instead of doing an
         * atomic decrement. Otherwise drop our reference and stop here
         * unless it turned out to be the last one.
         */
        if (likely(refcount_read(&skb->users) == 1)) {
                smp_rmb();
                refcount_set(&skb->users, 0);
        } else if (likely(!refcount_dec_and_test(&skb->users))) {
                return;
        }
        get_kfree_skb_cb(skb)->reason = reason;
        /* Push onto the head of the per-CPU completion list with local
         * interrupts disabled.
         */
        local_irq_save(flags);
        skb->next = __this_cpu_read(softnet_data.completion_queue);
        __this_cpu_write(softnet_data.completion_queue, skb);
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(dev_kfree_skb_irq_reason);

/* Free @skb from any context: take the deferred dev_kfree_skb_irq_reason()
 * path in hard-IRQ context or with interrupts disabled, and call
 * kfree_skb_reason() directly otherwise.
 */
void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
        if (in_hardirq() || irqs_disabled()) {
                dev_kfree_skb_irq_reason(skb, reason);
                return;
        }

        kfree_skb_reason(skb, reason);
}
EXPORT_SYMBOL(dev_kfree_skb_any_reason);


/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Clear the "present" link state bit of @dev. If the bit was set and the
 * device is running, all of its Tx queues are stopped.
 */
void netif_device_detach(struct net_device *dev)
{
        /* Already detached: nothing to do */
        if (!test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state))
                return;

        if (netif_running(dev))
                netif_tx_stop_all_queues(dev);
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Set the "present" link state bit of @dev. If the bit was clear and the
 * device is running, all Tx queues are woken and the watchdog is started.
 */
void netif_device_attach(struct net_device *dev)
{
        /* Already attached: nothing to do */
        if (test_and_set_bit(__LINK_STATE_PRESENT, &dev->state))
                return;

        if (netif_running(dev)) {
                netif_tx_wake_all_queues(dev);
                __netdev_watchdog_up(dev);
        }
}
EXPORT_SYMBOL(netif_device_attach);

/*
 * Returns a Tx queue index for the given packet. The index lies within
 * the queue range of the packet's traffic class on @sb_dev when @dev has
 * traffic classes configured, and within all real Tx queues of @dev
 * otherwise.
 */
static u16 skb_tx_hash(const struct net_device *dev,
                       const struct net_device *sb_dev,
                       struct sk_buff *skb)
{
        u16 qcount = dev->real_num_tx_queues;
        u16 qoffset = 0;
        u32 hash;

        if (dev->num_tc) {
                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);

                qoffset = sb_dev->tc_to_txq[tc].offset;
                qcount = sb_dev->tc_to_txq[tc].count;
                if (unlikely(!qcount)) {
                        /* Empty TC range: fall back to the full range */
                        net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
                                             sb_dev->name, qoffset, tc);
                        qoffset = 0;
                        qcount = dev->real_num_tx_queues;
                }
        }

        /* No recorded Rx queue: scale the flow hash into the range */
        if (!skb_rx_queue_recorded(skb))
                return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;

        /* Otherwise fold the recorded Rx queue number into the range */
        DEBUG_NET_WARN_ON_ONCE(qcount == 0);
        hash = skb_get_rx_queue(skb);
        if (hash >= qoffset)
                hash -= qoffset;
        while (unlikely(hash >= qcount))
                hash -= qcount;

        return hash + qoffset;
}

/* Emit a rate-limited warning about @skb: dump the skb and print the
 * feature sets of its device and of its socket. The name printed is the
 * parent's driver string when the device has a parent, the netdev name
 * otherwise, and empty when the skb has no device.
 */
void skb_warn_bad_offload(const struct sk_buff *skb)
{
        static const netdev_features_t null_features;
        const netdev_features_t *dev_caps = &null_features;
        const netdev_features_t *sk_caps = &null_features;
        struct net_device *dev = skb->dev;
        const char *name = "";

        if (!net_ratelimit())
                return;

        if (dev) {
                name = dev->dev.parent ? dev_driver_string(dev->dev.parent)
                                       : netdev_name(dev);
                dev_caps = &dev->features;
        }
        if (skb->sk)
                sk_caps = &skb->sk->sk_route_caps;

        skb_dump(KERN_WARNING, skb, false);
        WARN(1, "%s: caps=(%pNF, %pNF)\n", name, dev_caps, sk_caps);
}

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * The checksum is computed in software over the bytes from the checksum
 * start offset to the end of @skb, folded, and stored at checksum start +
 * skb->csum_offset; the skb is then marked CHECKSUM_NONE.
 *
 * Returns 0 on success (this includes the CHECKSUM_COMPLETE case, where
 * only ip_summed is reset), -EINVAL for GSO skbs or when either offset
 * does not fit in the linear head, or the error returned by
 * __skb_linearize() / skb_ensure_writable(). On every error path
 * ip_summed is left unchanged.
 */
int skb_checksum_help(struct sk_buff *skb)
{
        __wsum csum;
        int ret = 0, offset;

        if (skb->ip_summed == CHECKSUM_COMPLETE)
                goto out_set_summed;

        if (unlikely(skb_is_gso(skb))) {
                skb_warn_bad_offload(skb);
                return -EINVAL;
        }

        /* Before computing a checksum, we should make sure no frag could
         * be modified by an external entity : checksum could be wrong.
         */
        if (skb_has_shared_frag(skb)) {
                ret = __skb_linearize(skb);
                if (ret)
                        goto out;
        }

        offset = skb_checksum_start_offset(skb);
        /* Preset the result for the two bounds checks below */
        ret = -EINVAL;
        if (unlikely(offset >= skb_headlen(skb))) {
                DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
                WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
                          offset, skb_headlen(skb));
                goto out;
        }
        csum = skb_checksum(skb, offset, skb->len - offset, 0);

        /* From here on, offset is the position of the checksum field */
        offset += skb->csum_offset;
        if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
                DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
                WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
                          offset + sizeof(__sum16), skb_headlen(skb));
                goto out;
        }
        ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
        if (ret)
                goto out;

        /* A folded checksum of zero is stored as CSUM_MANGLED_0 instead */
        *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
        skb->ip_summed = CHECKSUM_NONE;
out:
        return ret;
}
EXPORT_SYMBOL(skb_checksum_help);

/* Compute the CRC32c checksum of a CHECKSUM_PARTIAL skb in software and
 * store it, little-endian, in the checksum field of the struct sctphdr
 * that starts at the checksum start offset.
 *
 * Returns 0 on success, and also (without touching the skb) when it is
 * not CHECKSUM_PARTIAL or is a GSO skb. Returns -EINVAL when the checksum
 * field does not start inside the linear head, or the error from
 * __skb_linearize() / skb_ensure_writable().
 */
int skb_crc32c_csum_help(struct sk_buff *skb)
{
        __le32 crc32c_csum;
        int ret = 0, offset, start;

        if (skb->ip_summed != CHECKSUM_PARTIAL)
                goto out;

        if (unlikely(skb_is_gso(skb)))
                goto out;

        /* Before computing a checksum, we should make sure no frag could
         * be modified by an external entity : checksum could be wrong.
         */
        if (unlikely(skb_has_shared_frag(skb))) {
                ret = __skb_linearize(skb);
                if (ret)
                        goto out;
        }
        start = skb_checksum_start_offset(skb);
        /* offset: where the checksum field lives; start: where the
         * checksummed data begins.
         */
        offset = start + offsetof(struct sctphdr, checksum);
        if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
                ret = -EINVAL;
                goto out;
        }

        ret = skb_ensure_writable(skb, offset + sizeof(__le32));
        if (ret)
                goto out;

        /* Seeded with all ones; the result is inverted before storing */
        crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
                                                  skb->len - start, ~(__u32)0,
                                                  crc32c_csum_stub));
        *(__le32 *)(skb->data + offset) = crc32c_csum;
        skb_reset_csum_not_inet(skb);
out:
        return ret;
}

__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
        __be16 type = skb->protocol;

        /* Tunnel gso handlers can set protocol to ethernet. */
        if (type == htons(ETH_P_TEB)) {
                struct ethhdr *eth;

                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
                        return 0;

                eth = (struct ethhdr *)skb->data;
                type = eth->h_proto;
        }

        return vlan_get_protocol_and_depth(skb, type, depth);
}


/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
/* Report a hardware Rx checksum failure on @dev: log an error, dump @skb
 * at KERN_ERR level and print a stack trace.
 */
static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
        netdev_err(dev, "hw csum failure\n");
        skb_dump(KERN_ERR, skb, true);
        dump_stack();
}

/* Exported entry point for reporting an Rx checksum fault on @dev. The
 * report is issued through DO_ONCE_LITE().
 * NOTE(review): presumably that means only the first call reports; confirm
 * against the macro definition.
 */
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
        DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif

/* XXX: check that highmem exists at all on the given machine.
 *
 * Returns 1 when @dev lacks NETIF_F_HIGHDMA and at least one page
 * fragment of @skb sits in high memory, 0 otherwise. Always 0 without
 * CONFIG_HIGHMEM.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        int i;

        /* The device can DMA from highmem: every fragment is fine */
        if (dev->features & NETIF_F_HIGHDMA)
                return 0;

        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                if (PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[i])))
                        return 1;
        }
#endif
        return 0;
}

/* For an MPLS offload request, test against the hardware MPLS features
 * of the netdev rather than its standard features. Without
 * CONFIG_NET_MPLS_GSO the features are returned unchanged.
 */
static netdev_features_t net_mpls_features(struct sk_buff *skb,
                                           netdev_features_t features,
                                           __be16 type)
{
#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
        if (eth_p_mpls(type))
                features &= skb->dev->mpls_features;
#endif
        return features;
}

/* Trim @features down to what is usable for @skb: apply the MPLS feature
 * mask, drop checksum and GSO offloads when the protocol cannot be
 * checksummed by the device, and drop scatter-gather when a fragment sits
 * in highmem the device cannot reach.
 */
static netdev_features_t harmonize_features(struct sk_buff *skb,
        netdev_features_t features)
{
        __be16 proto = skb_network_protocol(skb, NULL);

        features = net_mpls_features(skb, features, proto);

        if (skb->ip_summed != CHECKSUM_NONE &&
            !can_checksum_protocol(features, proto))
                features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);

        if (illegal_highdma(skb->dev, skb))
                features &= ~NETIF_F_SG;

        return features;
}

/* Feature-check callback that accepts @features unchanged; @skb and @dev
 * are not inspected.
 */
netdev_features_t passthru_features_check(struct sk_buff *skb,
                                          struct net_device *dev,
                                          netdev_features_t features)
{
        return features;
}
EXPORT_SYMBOL(passthru_features_check);

/* Default feature check, used by netif_skb_features() when the driver
 * provides no ndo_features_check callback: defers to
 * vlan_features_check(). @dev is not used.
 */
static netdev_features_t dflt_features_check(struct sk_buff *skb,
                                             struct net_device *dev,
                                             netdev_features_t features)
{
        return vlan_features_check(skb, features);
}

/* Restrict @features for a GSO @skb on @dev.
 *
 * All GSO features are stripped when the skb has more segments than
 * dev->gso_max_segs, is at least dev->gso_max_size long, or carries no
 * gso_type (which is also warned about). Otherwise partial-GSO features
 * and the IPv4 ID mangling feature are removed where they do not apply.
 */
static netdev_features_t gso_features_check(const struct sk_buff *skb,
                                            struct net_device *dev,
                                            netdev_features_t features)
{
        const netdev_features_t no_gso = features & ~NETIF_F_GSO_MASK;
        u16 gso_segs = skb_shinfo(skb)->gso_segs;

        if (gso_segs > READ_ONCE(dev->gso_max_segs))
                return no_gso;

        if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size)))
                return no_gso;

        if (!skb_shinfo(skb)->gso_type) {
                skb_warn_bad_offload(skb);
                return no_gso;
        }

        /* GSO partial needs software intervention before the packets can
         * be processed, so partial features are stripped here unless the
         * skb is already marked partial; they can be pulled back in once
         * the frame has been partially segmented.
         */
        if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
                features &= ~dev->gso_partial_features;

        /* An IPv4 header without DF set may get fragmented, so its ID
         * must not be mangled.
         */
        if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
                const struct iphdr *iph;

                if (skb->encapsulation)
                        iph = inner_ip_hdr(skb);
                else
                        iph = ip_hdr(skb);

                if (!(iph->frag_off & htons(IP_DF)))
                        features &= ~NETIF_F_TSO_MANGLEID;
        }

        return features;
}

/* Compute the set of device features that may be used to transmit @skb
 * on skb->dev, starting from dev->features and narrowing it according to
 * the skb's GSO, encapsulation and VLAN properties and the driver's (or
 * the default) feature check.
 */
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        netdev_features_t features = dev->features;

        if (skb_is_gso(skb))
                features = gso_features_check(skb, dev, features);

        /* Encapsulated traffic is checked against the hardware
         * encapsulation features, not the standard netdev features.
         */
        if (skb->encapsulation)
                features &= dev->hw_enc_features;

        if (skb_vlan_tagged(skb)) {
                netdev_features_t vlan_caps = dev->vlan_features |
                                              NETIF_F_HW_VLAN_CTAG_TX |
                                              NETIF_F_HW_VLAN_STAG_TX;

                features = netdev_intersect_features(features, vlan_caps);
        }

        if (!dev->netdev_ops->ndo_features_check)
                features &= dflt_features_check(skb, dev, features);
        else
                features &= dev->netdev_ops->ndo_features_check(skb, dev,
                                                                features);

        return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);

/* Hand a single @skb to the driver of @dev on queue @txq and return the
 * driver's result. Packet taps are fed first when any are active; @more
 * is passed through to netdev_start_xmit().
 */
static int xmit_one(struct sk_buff *skb, struct net_device *dev,
                    struct netdev_queue *txq, bool more)
{
        unsigned int tx_len;
        int status;

        if (dev_nit_active(dev))
                dev_queue_xmit_nit(skb, dev);

        /* Sample the length before the skb is handed to the driver */
        tx_len = skb->len;
        trace_net_dev_start_xmit(skb, dev);
        status = netdev_start_xmit(skb, dev, txq, more);
        trace_net_dev_xmit(skb, status, dev, tx_len);

        return status;
}

/* Transmit the skb list starting at @first on queue @txq of @dev, one skb
 * at a time.
 *
 * Stops early when the driver does not complete an skb or when the queue
 * gets stopped while skbs remain. The status of the last attempt (or
 * NETDEV_TX_BUSY for a stopped queue) is stored in *@ret; the return
 * value is the first skb that was not sent, with the rest of the list
 * still linked behind it, or NULL when everything went out.
 */
struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
                                    struct netdev_queue *txq, int *ret)
{
        struct sk_buff *cur = first;
        int status = NETDEV_TX_OK;

        while (cur) {
                struct sk_buff *rest = cur->next;

                skb_mark_not_on_list(cur);
                status = xmit_one(cur, dev, txq, rest != NULL);
                if (unlikely(!dev_xmit_complete(status))) {
                        /* Not sent: put it back in front of the rest */
                        cur->next = rest;
                        break;
                }

                cur = rest;
                if (netif_tx_queue_stopped(txq) && cur) {
                        status = NETDEV_TX_BUSY;
                        break;
                }
        }

        *ret = status;
        return cur;
}

/* If @skb carries a VLAN tag that @features cannot offload for its VLAN
 * protocol, push the tag into the packet via
 * __vlan_hwaccel_push_inside() and return that result; otherwise return
 * @skb untouched.
 */
static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
                                          netdev_features_t features)
{
        if (!skb_vlan_tag_present(skb))
                return skb;

        if (vlan_hw_offload_capable(features, skb->vlan_proto))
                return skb;

        return __vlan_hwaccel_push_inside(skb);
}

/* Decide whether the checksum of @skb can be left to hardware with the
 * given @features, and compute it in software when it cannot.
 *
 * Returns 0 when hardware will take care of it, otherwise the result of
 * skb_crc32c_csum_help() (SCTP) or skb_checksum_help().
 */
int skb_csum_hwoffload_help(struct sk_buff *skb,
                            const netdev_features_t features)
{
        if (unlikely(skb_csum_is_sctp(skb))) {
                if (features & NETIF_F_SCTP_CRC)
                        return 0;
                return skb_crc32c_csum_help(skb);
        }

        if (features & NETIF_F_HW_CSUM)
                return 0;

        /* Protocol-specific offload only covers TCP and UDP checksum
         * field positions.
         */
        if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
                if (skb->csum_offset == offsetof(struct tcphdr, check) ||
                    skb->csum_offset == offsetof(struct udphdr, check))
                        return 0;
        }

        return skb_checksum_help(skb);
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);

/* Prepare @skb for transmission on @dev, based on the features reported
 * by netif_skb_features():
 *  - run it through validate_xmit_vlan() and sk_validate_xmit_skb();
 *  - segment it in software when netif_needs_gso() says so;
 *  - otherwise linearize it if required and, for CHECKSUM_PARTIAL, set
 *    the (inner) transport header and resolve the checksum through
 *    skb_csum_hwoffload_help();
 *  - finally pass it through validate_xmit_xfrm().
 *
 * Returns the skb to transmit (the head of the segment list after
 * software segmentation) or NULL. On the failure paths handled here the
 * device's tx_dropped counter is incremented; @again is only forwarded to
 * validate_xmit_xfrm().
 */
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
{
        netdev_features_t features;

        features = netif_skb_features(skb);
        skb = validate_xmit_vlan(skb, features);
        /* A NULL result is not freed here.
         * NOTE(review): presumably the helper already released the skb.
         */
        if (unlikely(!skb))
                goto out_null;

        skb = sk_validate_xmit_skb(skb, dev);
        if (unlikely(!skb))
                goto out_null;

        if (netif_needs_gso(skb, features)) {
                struct sk_buff *segs;

                segs = skb_gso_segment(skb, features);
                if (IS_ERR(segs)) {
                        goto out_kfree_skb;
                } else if (segs) {
                        /* Segments replace the original, which is
                         * consumed; a NULL result keeps the skb as is.
                         */
                        consume_skb(skb);
                        skb = segs;
                }
        } else {
                if (skb_needs_linearize(skb, features) &&
                    __skb_linearize(skb))
                        goto out_kfree_skb;

                /* If packet is not checksummed and device does not
                 * support checksumming for this protocol, complete
                 * checksumming here.
                 */
                if (skb->ip_summed == CHECKSUM_PARTIAL) {
                        if (skb->encapsulation)
                                skb_set_inner_transport_header(skb,
                                                               skb_checksum_start_offset(skb));
                        else
                                skb_set_transport_header(skb,
                                                         skb_checksum_start_offset(skb));
                        if (skb_csum_hwoffload_help(skb, features))
                                goto out_kfree_skb;
                }
        }

        skb = validate_xmit_xfrm(skb, features, again);

        return skb;

out_kfree_skb:
        kfree_skb(skb);
out_null:
        dev_core_stats_tx_dropped_inc(dev);
        return NULL;
}

/*
 * Validate every skb of a ->next linked list with validate_xmit_skb()
 * and relink the survivors (and any segment lists that replaced them)
 * into a new list.
 *
 * Returns the head of the resulting list, or NULL if nothing survived.
 */
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
{
        struct sk_buff *first = NULL;
        struct sk_buff *last = NULL;

        while (skb) {
                struct sk_buff *rest = skb->next;
                struct sk_buff *checked;

                skb_mark_not_on_list(skb);

                /* An unsegmented skb must look like a one-element list,
                 * so let ->prev point back at the skb itself.
                 */
                skb->prev = skb;

                checked = validate_xmit_skb(skb, dev, again);
                if (checked) {
                        if (first)
                                last->next = checked;
                        else
                                first = checked;

                        /* ->prev is the last segment if the skb was
                         * segmented, otherwise it is still the skb.
                         */
                        last = checked->prev;
                }

                skb = rest;
        }

        return first;
}
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);

/*
 * Initialise qdisc_skb_cb(skb)->pkt_len.
 *
 * The base value is skb->len. For GSO packets whose transport header is
 * set, the size of the headers that every additional segment will carry
 * is added, so that pkt_len estimates the number of bytes on the wire.
 *
 * For SKB_GSO_DODGY packets gso_segs is recomputed from the payload
 * length. If the estimated header length is not smaller than skb->len
 * the packet is malformed; pkt_len is then left at skb->len instead of
 * letting the unsigned subtraction wrap around.
 */
static void qdisc_pkt_len_init(struct sk_buff *skb)
{
        const struct skb_shared_info *shinfo = skb_shinfo(skb);

        qdisc_skb_cb(skb)->pkt_len = skb->len;

        /* To get more precise estimation of bytes sent on wire,
         * we add to pkt_len the headers size of all segments
         */
        if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
                u16 gso_segs = shinfo->gso_segs;
                unsigned int hdr_len;

                /* mac layer + network layer */
                hdr_len = skb_transport_offset(skb);

                /* + transport layer */
                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
                        const struct tcphdr *th;
                        struct tcphdr _tcphdr;

                        th = skb_header_pointer(skb, hdr_len,
                                                sizeof(_tcphdr), &_tcphdr);
                        if (likely(th))
                                hdr_len += __tcp_hdrlen(th);
                } else {
                        struct udphdr _udphdr;

                        if (skb_header_pointer(skb, hdr_len,
                                               sizeof(_udphdr), &_udphdr))
                                hdr_len += sizeof(struct udphdr);
                }

                if (shinfo->gso_type & SKB_GSO_DODGY) {
                        /* skb->len and hdr_len are both unsigned: a header
                         * estimate that reaches or exceeds skb->len would
                         * wrap the subtraction below and yield a bogus
                         * segment count. Keep pkt_len == skb->len then.
                         */
                        if (skb->len <= hdr_len)
                                return;

                        gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
                                                shinfo->gso_size);
                }

                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
        }
}

/*
 * Call the ->enqueue() handler of @q for @skb and emit the
 * qdisc_enqueue tracepoint when it reports NET_XMIT_SUCCESS.
 *
 * Returns the handler's result masked with NET_XMIT_MASK.
 */
static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
                             struct sk_buff **to_free,
                             struct netdev_queue *txq)
{
        const int verdict = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;

        if (verdict == NET_XMIT_SUCCESS)
                trace_qdisc_enqueue(q, txq, skb);

        return verdict;
}

/*
 * Hand @skb to qdisc @q of @dev on TX queue @txq: either enqueue it or,
 * when the qdisc has TCQ_F_CAN_BYPASS set and is empty, transmit it
 * directly through sch_direct_xmit().
 *
 * Three paths are visible below:
 *  - TCQ_F_NOLOCK qdiscs: the qdisc root lock is not taken; the direct
 *    xmit path is guarded by qdisc_run_begin()/qdisc_run_end().
 *  - a qdisc with __QDISC_STATE_DEACTIVATED set: the skb is dropped.
 *  - all other qdiscs: enqueue or direct xmit under the qdisc root lock.
 *
 * skbs collected on the local to_free list are freed before returning.
 *
 * Returns NET_XMIT_SUCCESS, NET_XMIT_DROP, or the masked ->enqueue()
 * result produced by dev_qdisc_enqueue().
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                                 struct net_device *dev,
                                 struct netdev_queue *txq)
{
        spinlock_t *root_lock = qdisc_lock(q);
        struct sk_buff *to_free = NULL;
        bool contended;
        int rc;

        qdisc_calculate_pkt_len(skb, q);

        tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);

        if (q->flags & TCQ_F_NOLOCK) {
                if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
                    qdisc_run_begin(q)) {
                        /* Retest nolock_qdisc_is_empty() within the protection
                         * of q->seqlock to protect from racing with requeuing.
                         */
                        if (unlikely(!nolock_qdisc_is_empty(q))) {
                                rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
                                __qdisc_run(q);
                                qdisc_run_end(q);

                                goto no_lock_out;
                        }

                        /* Qdisc is empty and we own the run state: bypass
                         * the queue and transmit directly.
                         */
                        qdisc_bstats_cpu_update(q, skb);
                        if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
                            !nolock_qdisc_is_empty(q))
                                __qdisc_run(q);

                        qdisc_run_end(q);
                        return NET_XMIT_SUCCESS;
                }

                rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
                qdisc_run(q);

no_lock_out:
                if (unlikely(to_free))
                        kfree_skb_list_reason(to_free,
                                              tcf_get_drop_reason(to_free));
                return rc;
        }

        /* q->owner is set to the current CPU only around the
         * dev_qdisc_enqueue() call further down and reset to -1 right
         * after it. Seeing our own CPU id here therefore means we got
         * back into this function for the same qdisc from within that
         * enqueue; the skb is dropped before any lock is taken.
         */
        if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
                kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
                return NET_XMIT_DROP;
        }
        /*
         * Heuristic to force contended enqueues to serialize on a
         * separate lock before trying to get qdisc main lock.
         * This permits qdisc->running owner to get the lock more
         * often and dequeue packets faster.
         * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
         * and then other tasks will only enqueue packets. The packets will be
         * sent after the qdisc owner is scheduled again. To prevent this
         * scenario the task always serialize on the lock.
         */
        contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
        if (unlikely(contended))
                spin_lock(&q->busylock);

        spin_lock(root_lock);
        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
                __qdisc_drop(skb, &to_free);
                rc = NET_XMIT_DROP;
        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
                   qdisc_run_begin(q)) {
                /*
                 * This is a work-conserving queue; there are no old skbs
                 * waiting to be sent out; and the qdisc is not running -
                 * xmit the skb directly.
                 */

                qdisc_bstats_update(q, skb);

                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
                        /* busylock is released early here; clearing
                         * 'contended' makes the final unlock at the end
                         * of the function a no-op.
                         */
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                }

                qdisc_run_end(q);
                rc = NET_XMIT_SUCCESS;
        } else {
                /* Mark this CPU as the one inside ->enqueue() so that the
                 * q->owner check above can detect re-entry.
                 */
                WRITE_ONCE(q->owner, smp_processor_id());
                rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
                WRITE_ONCE(q->owner, -1);
                if (qdisc_run_begin(q)) {
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                        qdisc_run_end(q);
                }
        }
        spin_unlock(root_lock);
        /* Free dropped skbs only after the root lock has been released. */
        if (unlikely(to_free))
                kfree_skb_list_reason(to_free,
                                      tcf_get_drop_reason(to_free));
        if (unlikely(contended))
                spin_unlock(&q->busylock);
        return rc;
}

#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
/*
 * If @skb has no priority yet, look up the priority index of the owning
 * full socket's cgroup data in the priomap of skb->dev and, when the
 * index is inside the map, copy the mapped value into skb->priority.
 * Nothing is changed when the skb already has a priority, the device
 * has no priomap, or there is no full socket.
 */
static void skb_update_prio(struct sk_buff *skb)
{
        const struct netprio_map *prio_map;
        const struct sock *owner;
        unsigned int idx;

        if (skb->priority)
                return;

        prio_map = rcu_dereference_bh(skb->dev->priomap);
        if (!prio_map)
                return;

        owner = skb_to_full_sk(skb);
        if (!owner)
                return;

        idx = sock_cgroup_prioidx(&owner->sk_cgrp_data);
        if (idx >= prio_map->priomap_len)
                return;

        skb->priority = prio_map->priomap[idx];
}
#else
#define skb_update_prio(skb)
#endif

/**
 *        dev_loopback_xmit - loop back @skb
 *        @net: network namespace this loopback is happening in
 *        @sk:  sk needed to be a netfilter okfn
 *        @skb: buffer to transmit
 *
 *        Resets the MAC header, pulls the data pointer up to the network
 *        header, marks the packet PACKET_LOOPBACK, upgrades CHECKSUM_NONE
 *        to CHECKSUM_UNNECESSARY and passes the skb to netif_rx(). The
 *        result of netif_rx() is not propagated.
 *
 *        Return: always 0.
 */
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int net_off;

        skb_reset_mac_header(skb);
        net_off = skb_network_offset(skb);
        __skb_pull(skb, net_off);

        if (skb->ip_summed == CHECKSUM_NONE)
                skb->ip_summed = CHECKSUM_UNNECESSARY;
        skb->pkt_type = PACKET_LOOPBACK;

        DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
        skb_dst_force(skb);

        netif_rx(skb);

        return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);

#ifdef CONFIG_NET_EGRESS
/*
 * Return the TX queue of @dev selected by the queue mapping stored in
 * @skb, after passing that mapping through netdev_cap_txqueue().
 */
static struct netdev_queue *
netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
{
        unsigned int capped;

        capped = netdev_cap_txqueue(dev, skb_get_queue_mapping(skb));

        return netdev_get_tx_queue(dev, capped);
}

/*
 * The skip_txqueue flag is kept in current->net_xmit when
 * CONFIG_PREEMPT_RT is defined and in the per-CPU softnet_data.xmit
 * otherwise.
 */

/* Read the skip_txqueue flag. */
static bool netdev_xmit_txqueue_skipped(void)
{
#ifdef CONFIG_PREEMPT_RT
        return current->net_xmit.skip_txqueue;
#else
        return __this_cpu_read(softnet_data.xmit.skip_txqueue);
#endif
}

/* Set the skip_txqueue flag to @skip. */
void netdev_xmit_skip_txqueue(bool skip)
{
#ifdef CONFIG_PREEMPT_RT
        current->net_xmit.skip_txqueue = skip;
#else
        __this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
#endif
}
EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
#endif /* CONFIG_NET_EGRESS */

#ifdef CONFIG_NET_XGRESS
/*
 * Classify @skb with the mini qdisc attached to @entry.
 *
 * Returns TC_ACT_UNSPEC when CONFIG_NET_CLS_ACT is off, when no mini
 * qdisc is attached, or when the software path is bypassed for the
 * block; otherwise the result of tcf_classify(). On TC_ACT_SHOT,
 * *@drop_reason is replaced with the reason recorded on the skb.
 */
static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
                  enum skb_drop_reason *drop_reason)
{
        int verdict = TC_ACT_UNSPEC;
#ifdef CONFIG_NET_CLS_ACT
        struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
        struct tcf_result cls_res;

        if (!miniq)
                return verdict;

        if (static_branch_unlikely(&tcf_bypass_check_needed_key) &&
            tcf_block_bypass_sw(miniq->block))
                return verdict;

        /* Reset the per-skb tc state before classification. */
        tc_skb_cb(skb)->mru = 0;
        tc_skb_cb(skb)->post_ct = false;
        tcf_set_drop_reason(skb, *drop_reason);

        mini_qdisc_bstats_cpu_update(miniq, skb);
        verdict = tcf_classify(skb, miniq->block, miniq->filter_list,
                               &cls_res, false);

        /* Only tcf related quirks below. */
        if (verdict == TC_ACT_SHOT) {
                *drop_reason = tcf_get_drop_reason(skb);
                mini_qdisc_qstats_cpu_drop(miniq);
        } else if (verdict == TC_ACT_OK || verdict == TC_ACT_RECLASSIFY) {
                skb->tc_index = TC_H_MIN(cls_res.classid);
        }
#endif /* CONFIG_NET_CLS_ACT */
        return verdict;
}

/* Static key, initially false; sch_handle_ingress() and
 * sch_handle_egress() test it before calling tcx_run().
 */
static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);

/* Increment tcx_needed_key via static_branch_inc(). */
void tcx_inc(void)
{
        static_branch_inc(&tcx_needed_key);
}

/* Decrement tcx_needed_key via static_branch_dec(). */
void tcx_dec(void)
{
        static_branch_dec(&tcx_needed_key);
}

/*
 * Run the BPF programs of @entry on @skb one after another until one of
 * them returns something other than TCX_NEXT (or the list ends), and
 * translate the last return value with tcx_action_code().
 *
 * When @needs_mac is true, skb->mac_len bytes are pushed before the
 * programs run and pulled again afterwards.
 */
static __always_inline enum tcx_action_base
tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
        const bool needs_mac)
{
        const struct bpf_mprog_fp *slot;
        const struct bpf_prog *cur;
        int verdict = TCX_NEXT;

        if (needs_mac)
                __skb_push(skb, skb->mac_len);

        bpf_mprog_foreach_prog(entry, slot, cur) {
                /* Recomputed before every program run. */
                bpf_compute_data_pointers(skb);
                verdict = bpf_prog_run(cur, skb);
                if (verdict != TCX_NEXT)
                        break;
        }

        if (needs_mac)
                __skb_pull(skb, skb->mac_len);

        return tcx_action_code(skb, verdict);
}

/*
 * Run the ingress tcx BPF programs and/or the tc classifier attached to
 * skb->dev and act on the resulting verdict.
 *
 * If *@pt_prev is set, the skb is first delivered to it and *@pt_prev is
 * cleared.
 *
 * Returns @skb when the caller should keep processing it: nothing is
 * attached, the verdict is not one handled by the switch below, or a
 * redirect returned -EAGAIN (in which case *@another is set to true).
 * Returns NULL when the skb was redirected, dropped or consumed; *@ret
 * is then set to NET_RX_SUCCESS or NET_RX_DROP.
 */
static __always_inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
                   struct net_device *orig_dev, bool *another)
{
        struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
        enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        int sch_ret;

        if (!entry)
                return skb;

        /* Every return path below pairs this with bpf_net_ctx_clear(). */
        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        qdisc_skb_cb(skb)->pkt_len = skb->len;
        tcx_set_ingress(skb, true);

        /* tcx programs run first; tc_run() is only consulted when they
         * are not enabled or returned TC_ACT_UNSPEC.
         */
        if (static_branch_unlikely(&tcx_needed_key)) {
                sch_ret = tcx_run(entry, skb, true);
                if (sch_ret != TC_ACT_UNSPEC)
                        goto ingress_verdict;
        }
        sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
ingress_verdict:
        switch (sch_ret) {
        case TC_ACT_REDIRECT:
                /* skb_mac_header check was done by BPF, so we can safely
                 * push the L2 header back before redirecting to another
                 * netdev.
                 */
                __skb_push(skb, skb->mac_len);
                if (skb_do_redirect(skb) == -EAGAIN) {
                        /* Undo the push and let the caller continue with
                         * this skb.
                         */
                        __skb_pull(skb, skb->mac_len);
                        *another = true;
                        break;
                }
                *ret = NET_RX_SUCCESS;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        case TC_ACT_SHOT:
                kfree_skb_reason(skb, drop_reason);
                *ret = NET_RX_DROP;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        /* used by tc_run */
        case TC_ACT_STOLEN:
        case TC_ACT_QUEUED:
        case TC_ACT_TRAP:
                consume_skb(skb);
                fallthrough;
        case TC_ACT_CONSUMED:
                /* The skb is not freed here for TC_ACT_CONSUMED. */
                *ret = NET_RX_SUCCESS;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        }
        bpf_net_ctx_clear(bpf_net_ctx);

        return skb;
}

static __always_inline struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
        struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
        enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        int sch_ret;

        if (!entry)
                return skb;

        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);

        /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
         * already set by the caller.
         */
        if (static_branch_unlikely(&tcx_needed_key)) {
                sch_ret = tcx_run(entry, skb, false);
                if (sch_ret != TC_ACT_UNSPEC)
                        goto egress_verdict;
        }
        sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
egress_verdict:
        switch (sch_ret) {
        case TC_ACT_REDIRECT:
                /* No need to push/pop skb's mac_header here on egress! */
                skb_do_redirect(skb);
                *ret = NET_XMIT_SUCCESS;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        case TC_ACT_SHOT:
                kfree_skb_reason(skb, drop_reason);
                *ret = NET_XMIT_DROP;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        /* used by tc_run */
        case TC_ACT_STOLEN:
        case TC_ACT_QUEUED:
        case TC_ACT_TRAP:
                consume_skb(skb);
                fallthrough;
        case TC_ACT_CONSUMED:
                *ret = NET_XMIT_SUCCESS;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        }
        bpf_net_ctx_clear(bpf_net_ctx);

        return skb;
}
#else
/* Stub used when CONFIG_NET_XGRESS is not defined: returns @skb unchanged. */
static __always_inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
                   struct net_device *orig_dev, bool *another)
{
        return skb;
}

/* Stub used when CONFIG_NET_XGRESS is not defined: returns @skb unchanged. */
static __always_inline struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
        return skb;
}
#endif /* CONFIG_NET_XGRESS */

#ifdef CONFIG_XPS
/*
 * Look up a TX queue for @skb in @dev_maps.
 *
 * @tci is combined with the traffic class derived from skb->priority to
 * index dev_maps->attr_map. A map with a single queue yields that queue;
 * otherwise one entry is picked by scaling the skb hash onto the map
 * length.
 *
 * Returns the queue index, or -1 when the indices are out of range, no
 * map exists for the slot, or the chosen queue is not below
 * dev->real_num_tx_queues.
 */
static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
                               struct xps_dev_maps *dev_maps, unsigned int tci)
{
        int tc = netdev_get_prio_tc_map(dev, skb->priority);
        struct xps_map *map;
        unsigned int slot;
        int picked;

        if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
                return -1;

        slot = tci * dev_maps->num_tc + tc;

        map = rcu_dereference(dev_maps->attr_map[slot]);
        if (!map)
                return -1;

        if (map->len == 1)
                picked = map->queues[0];
        else
                picked = map->queues[reciprocal_scale(skb_get_hash(skb),
                                                      map->len)];

        if (unlikely(picked >= dev->real_num_tx_queues))
                return -1;

        return picked;
}
#endif

/*
 * Pick a TX queue for @skb from the XPS maps of @sb_dev.
 *
 * The XPS_RXQS map is tried first (only when xps_rxqs_needed is enabled
 * and the socket has a recorded rx queue), then the XPS_CPUS map indexed
 * by skb->sender_cpu - 1.
 *
 * Returns the queue index, or -1 when XPS is compiled out, not enabled,
 * or no map produced a queue.
 */
static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
                         struct sk_buff *skb)
{
#ifdef CONFIG_XPS
        struct xps_dev_maps *maps;
        struct sock *sk = skb->sk;
        int txq = -1;

        if (!static_key_false(&xps_needed))
                return -1;

        rcu_read_lock();

        if (static_key_false(&xps_rxqs_needed)) {
                maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
                if (maps) {
                        int rxq = sk_rx_queue_get(sk);

                        if (rxq >= 0)
                                txq = __get_xps_queue_idx(dev, skb, maps,
                                                          rxq);
                }
        }

        /* No rx-queue based match: fall back to the per-CPU map. */
        if (txq < 0) {
                maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
                if (maps)
                        txq = __get_xps_queue_idx(dev, skb, maps,
                                                  skb->sender_cpu - 1);
        }

        rcu_read_unlock();

        return txq;
#else
        return -1;
#endif
}

/*
 * Queue selection helper that ignores all of its arguments and always
 * returns queue index 0.
 */
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev)
{
        return 0;
}
EXPORT_SYMBOL(dev_pick_tx_zero);

u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
                       struct net_device *sb_dev)
{
        return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
}
EXPORT_SYMBOL(dev_pick_tx_cpu_id);

/*
 * Select a TX queue index for @skb on @dev.
 *
 * The queue cached on the socket is reused when it is valid, still
 * below dev->real_num_tx_queues, and the skb does not have ooo_okay
 * set. Otherwise a queue is taken from XPS or, failing that, from
 * skb_tx_hash(), and stored back on a full socket that has a dst cache
 * when it differs from the cached value.
 *
 * A NULL @sb_dev is treated as @dev.
 */
u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev)
{
        struct sock *sk = skb->sk;
        int cached = sk_tx_queue_get(sk);
        int chosen;

        if (!sb_dev)
                sb_dev = dev;

        if (cached >= 0 && !skb->ooo_okay &&
            cached < dev->real_num_tx_queues)
                return cached;

        chosen = get_xps_queue(dev, sb_dev, skb);
        if (chosen < 0)
                chosen = skb_tx_hash(dev, sb_dev, skb);

        if (cached != chosen && sk &&
            sk_fullsock(sk) &&
            rcu_access_pointer(sk->sk_dst_cache))
                sk_tx_queue_set(sk, chosen);

        return chosen;
}
EXPORT_SYMBOL(netdev_pick_tx);

/*
 * Choose the TX queue of @dev for @skb, record the choice in the skb's
 * queue mapping and return the queue.
 *
 * With a single real TX queue, index 0 is used. Otherwise the driver's
 * ndo_select_queue() is called when provided, else netdev_pick_tx(),
 * and the result is passed through netdev_cap_txqueue().
 */
struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
                                         struct sk_buff *skb,
                                         struct net_device *sb_dev)
{
        int queue_index = 0;

#ifdef CONFIG_XPS
        /* sender_cpu - 1 not being a value below NR_CPUS (this includes
         * sender_cpu == 0, which wraps around) means no usable CPU was
         * recorded yet; record the current one, offset by one.
         */
        if ((u32)(skb->sender_cpu - 1) >= (u32)NR_CPUS)
                skb->sender_cpu = raw_smp_processor_id() + 1;
#endif

        if (dev->real_num_tx_queues != 1) {
                const struct net_device_ops *ops = dev->netdev_ops;

                queue_index = ops->ndo_select_queue ?
                              ops->ndo_select_queue(dev, skb, sb_dev) :
                              netdev_pick_tx(dev, skb, sb_dev);

                queue_index = netdev_cap_txqueue(dev, queue_index);
        }

        skb_set_queue_mapping(skb, queue_index);

        return netdev_get_tx_queue(dev, queue_index);
}

/**
 * __dev_queue_xmit() - transmit a buffer
 * @skb:        buffer to transmit
 * @sb_dev:        subordinate device used for L2 forwarding offload
 *
 * Queue a buffer for transmission to a network device. The caller must
 * have set the device and priority and built the buffer before calling
 * this function. The function can be called from an interrupt.
 *
 * When calling this method, interrupts MUST be enabled. This is because
 * the BH enable code must have IRQs enabled so that it will not deadlock.
 *
 * Regardless of the return value, the skb is consumed, so it is currently
 * difficult to retry a send to this method. (You can bump the ref count
 * before sending to hold a reference for retry if you are careful.)
 *
 * Return:
 * * 0                                - buffer successfully transmitted
 * * positive qdisc return code        - NET_XMIT_DROP etc.
 * * negative errno                - other errors
 */
int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq = NULL;
        struct Qdisc *q;
        int rc = -ENOMEM;
        bool again = false;

        skb_reset_mac_header(skb);
        skb_assert_len(skb);

        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
                __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);

        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();

        skb_update_prio(skb);

        qdisc_pkt_len_init(skb);
        tcx_set_ingress(skb, false);
#ifdef CONFIG_NET_EGRESS
        if (static_branch_unlikely(&egress_needed_key)) {
                if (nf_hook_egress_active()) {
                        skb = nf_hook_egress(skb, &rc, dev);
                        if (!skb)
                                goto out;
                }

                /* Clear the skip flag before the egress hook runs; it is
                 * read back below via netdev_xmit_txqueue_skipped().
                 */
                netdev_xmit_skip_txqueue(false);

                nf_skip_egress(skb, true);
                skb = sch_handle_egress(skb, &rc, dev);
                if (!skb)
                        goto out;
                nf_skip_egress(skb, false);

                /* If the flag got set meanwhile, use the queue mapping
                 * already stored in the skb instead of picking a queue.
                 */
                if (netdev_xmit_txqueue_skipped())
                        txq = netdev_tx_queue_mapping(dev, skb);
        }
#endif
        /* If device/qdisc don't need skb->dst, release it right now while
         * it's hot in this cpu cache.
         */
        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
                skb_dst_drop(skb);
        else
                skb_dst_force(skb);

        if (!txq)
                txq = netdev_core_pick_tx(dev, skb, sb_dev);

        q = rcu_dereference_bh(txq->qdisc);

        trace_net_dev_queue(skb);
        /* A qdisc with an ->enqueue handler takes the packet; everything
         * below handles the case where there is no such handler.
         */
        if (q->enqueue) {
                rc = __dev_xmit_skb(skb, q, dev, txq);
                goto out;
        }

        /* The device has no queue. Common case for software devices:
         * loopback, all the sorts of tunnels...
         *
         * Really, it is unlikely that netif_tx_lock protection is necessary
         * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
         * counters.)
         * However, it is possible, that they rely on protection
         * made by us here.
         *
         * Check this and shot the lock. It is not prone from deadlocks.
         * Either shot noqueue qdisc, it is even simpler 8)
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                /* Other cpus might concurrently change txq->xmit_lock_owner
                 * to -1 or to their cpu id, but not to our id.
                 */
                if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
                        if (dev_xmit_recursion())
                                goto recursion_alert;

                        skb = validate_xmit_skb(skb, dev, &again);
                        if (!skb)
                                goto out;

                        HARD_TX_LOCK(dev, txq, cpu);

                        if (!netif_xmit_stopped(txq)) {
                                dev_xmit_recursion_inc();
                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
                                dev_xmit_recursion_dec();
                                if (dev_xmit_complete(rc)) {
                                        HARD_TX_UNLOCK(dev, txq);
                                        goto out;
                                }
                        }
                        HARD_TX_UNLOCK(dev, txq);
                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
                                             dev->name);
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately
                         */
recursion_alert:
                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
                                             dev->name);
                }
        }

        /* Reached when the device is not IFF_UP, the transmit above did
         * not complete, or recursion was detected: account the drop,
         * free the skb (list) and report -ENETDOWN.
         */
        rc = -ENETDOWN;
        rcu_read_unlock_bh();

        dev_core_stats_tx_dropped_inc(dev);
        kfree_skb_list(skb);
        return rc;
out:
        rcu_read_unlock_bh();
        return rc;
}
EXPORT_SYMBOL(__dev_queue_xmit);

/*
 * Transmit @skb on TX queue @queue_id of skb->dev, calling the driver
 * directly via netdev_start_xmit() without going through a qdisc.
 *
 * The skb is dropped (NET_XMIT_DROP) when the device is not running or
 * has no carrier, or when validation does not return the very same skb
 * that was passed in. Otherwise the result of netdev_start_xmit() is
 * returned, or NETDEV_TX_BUSY when the queue is frozen or stopped by
 * the driver and no transmit was attempted.
 */
int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
        struct net_device *dev = skb->dev;
        struct sk_buff *submitted = skb;
        struct netdev_queue *queue;
        int status = NETDEV_TX_BUSY;
        bool again = false;

        if (unlikely(!(netif_running(dev) && netif_carrier_ok(dev))))
                goto drop;

        skb = validate_xmit_skb_list(skb, dev, &again);
        if (skb != submitted)
                goto drop;

        skb_set_queue_mapping(skb, queue_id);
        queue = skb_get_tx_queue(dev, skb);

        local_bh_disable();
        dev_xmit_recursion_inc();

        HARD_TX_LOCK(dev, queue, smp_processor_id());
        if (!netif_xmit_frozen_or_drv_stopped(queue))
                status = netdev_start_xmit(skb, dev, queue, false);
        HARD_TX_UNLOCK(dev, queue);

        dev_xmit_recursion_dec();
        local_bh_enable();

        return status;

drop:
        dev_core_stats_tx_dropped_inc(dev);
        kfree_skb_list(skb);
        return NET_XMIT_DROP;
}
EXPORT_SYMBOL(__dev_direct_xmit);

/*************************************************************************
 *                        Receiver routines
 *************************************************************************/
/* Per-CPU task pointer; ____napi_schedule() compares it against
 * napi->thread when use_backlog_threads() is true.
 */
static DEFINE_PER_CPU(struct task_struct *, backlog_napi);

/* Tunables, all marked __read_mostly.
 * NOTE(review): their consumers are outside this part of the file;
 * the per-line descriptions below are the original ones.
 */
int weight_p __read_mostly = 64;           /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */

/*
 * Schedule @napi for polling. Called with irq disabled.
 *
 * A threaded NAPI with a live thread is woken up instead of being
 * queued, unless backlog threads are in use and that thread is this
 * CPU's backlog_napi. In every other case the NAPI is appended to
 * sd->poll_list and NET_RX_SOFTIRQ is raised when we are not already
 * inside net_rx_action().
 */
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
        lockdep_assert_irqs_disabled();

        if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
                /* Paired with smp_mb__before_atomic() in
                 * napi_enable()/dev_set_threaded().
                 * Use READ_ONCE() to guarantee a complete
                 * read on napi->thread. Only call
                 * wake_up_process() when it's not NULL.
                 */
                struct task_struct *kthread = READ_ONCE(napi->thread);

                if (kthread &&
                    !(use_backlog_threads() &&
                      kthread == raw_cpu_read(backlog_napi))) {
                        set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
                        wake_up_process(kthread);
                        return;
                }
        }

        /* Queue on the local poll list. */
        list_add_tail(&napi->poll_list, &sd->poll_list);
        WRITE_ONCE(napi->list_owner, smp_processor_id());

        /* If not called from net_rx_action()
         * we have to raise NET_RX_SOFTIRQ.
         */
        if (!sd->in_net_rx_action)
                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

#ifdef CONFIG_RPS

/* Exported static keys.
 * NOTE(review): presumably enabled while RPS / RFS is configured; the
 * code that flips them is not in this part of the file - confirm.
 */
struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);

/*
 * Record @next_cpu as the CPU of flow entry @rflow.
 *
 * When @next_cpu is a valid CPU id (< nr_cpu_ids), the input queue head
 * of that CPU's softnet_data is saved into the flow's last_qtail. With
 * CONFIG_RFS_ACCEL, the function may first ask the driver, through
 * ndo_rx_flow_steer(), to steer the flow to the rx queue that
 * cpu_rmap_lookup_index() returns for @next_cpu; on success the flow
 * entry in that queue's flow table replaces @rflow.
 *
 * Returns the flow entry that was updated, which is either @rflow or
 * the entry from the new rx queue's flow table.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
            struct rps_dev_flow *rflow, u16 next_cpu)
{
        if (next_cpu < nr_cpu_ids) {
                u32 head;
#ifdef CONFIG_RFS_ACCEL
                struct netdev_rx_queue *rxqueue;
                struct rps_dev_flow_table *flow_table;
                struct rps_dev_flow *old_rflow;
                u16 rxq_index;
                u32 flow_id;
                int rc;

                /* Should we steer this flow to a different hardware queue? */
                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
                    !(dev->features & NETIF_F_NTUPLE))
                        goto out;
                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
                /* Already arriving on the desired queue: nothing to steer. */
                if (rxq_index == skb_get_rx_queue(skb))
                        goto out;

                rxqueue = dev->_rx + rxq_index;
                flow_table = rcu_dereference(rxqueue->rps_flow_table);
                if (!flow_table)
                        goto out;
                flow_id = skb_get_hash(skb) & flow_table->mask;
                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
                                                        rxq_index, flow_id);
                if (rc < 0)
                        goto out;
                /* A non-negative rc is stored as the filter id of the new
                 * flow entry; if the old entry carried the same id, it is
                 * reset to RPS_NO_FILTER.
                 */
                old_rflow = rflow;
                rflow = &flow_table->flows[flow_id];
                WRITE_ONCE(rflow->filter, rc);
                if (old_rflow->filter == rc)
                        WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
        out:
#endif
                head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head);
                rps_input_queue_tail_save(&rflow->last_qtail, head);
        }

        /* The CPU is recorded even when next_cpu is not a valid CPU id. */
        WRITE_ONCE(rflow->cpu, next_cpu);
        return rflow;
}

/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * Returns the chosen CPU id, or -1 when no CPU was selected (recorded RX
 * queue out of range, neither a flow table nor an RPS map on the queue,
 * a zero hash, or no online candidate).  *rflowp is written only when the
 * CPU was selected through the per-queue flow table.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
                       struct rps_dev_flow **rflowp)
{
        const struct rps_sock_flow_table *sock_flow_table;
        struct netdev_rx_queue *rxqueue = dev->_rx;
        struct rps_dev_flow_table *flow_table;
        struct rps_map *map;
        int cpu = -1;
        u32 tcpu;
        u32 hash;

        /* Locate the RX queue the skb was recorded on; queue 0 otherwise */
        if (skb_rx_queue_recorded(skb)) {
                u16 index = skb_get_rx_queue(skb);

                if (unlikely(index >= dev->real_num_rx_queues)) {
                        WARN_ONCE(dev->real_num_rx_queues > 1,
                                  "%s received packet on queue %u, but number "
                                  "of RX queues is %u\n",
                                  dev->name, index, dev->real_num_rx_queues);
                        goto done;
                }
                rxqueue += index;
        }

        /* Avoid computing hash if RFS/RPS is not active for this rxqueue */

        flow_table = rcu_dereference(rxqueue->rps_flow_table);
        map = rcu_dereference(rxqueue->rps_map);
        if (!flow_table && !map)
                goto done;

        skb_reset_network_header(skb);
        hash = skb_get_hash(skb);
        /* A hash of 0 gives nothing to steer on */
        if (!hash)
                goto done;

        sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
        if (flow_table && sock_flow_table) {
                struct rps_dev_flow *rflow;
                u32 next_cpu;
                u32 ident;

                /* First check into global flow table if there is a match.
                 * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
                 */
                ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
                /* All bits outside rps_cpu_mask must equal those of the
                 * hash, otherwise this entry is not a match.
                 */
                if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
                        goto try_rps;

                /* The bits inside rps_cpu_mask carry the desired CPU */
                next_cpu = ident & net_hotdata.rps_cpu_mask;

                /* OK, now we know there is a match,
                 * we can look at the local (per receive queue) flow table
                 */
                rflow = &flow_table->flows[hash & flow_table->mask];
                tcpu = rflow->cpu;

                /*
                 * If the desired CPU (where last recvmsg was done) is
                 * different from current CPU (one in the rx-queue flow
                 * table entry), switch if one of the following holds:
                 *   - Current CPU is unset (>= nr_cpu_ids).
                 *   - Current CPU is offline.
                 *   - The current CPU's queue tail has advanced beyond the
                 *     last packet that was enqueued using this table entry.
                 *     This guarantees that all previous packets for the flow
                 *     have been dequeued, thus preserving in order delivery.
                 */
                if (unlikely(tcpu != next_cpu) &&
                    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
                     ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
                      rflow->last_qtail)) >= 0)) {
                        tcpu = next_cpu;
                        /* set_rps_cpu() may hand back a different entry */
                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
                }

                /* Use the flow table's CPU only if it is valid and online;
                 * otherwise fall through to the RPS map below.
                 */
                if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
                        *rflowp = rflow;
                        cpu = tcpu;
                        goto done;
                }
        }

try_rps:

        /* Plain RPS: index the map with reciprocal_scale(hash, map->len) */
        if (map) {
                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
                if (cpu_online(tcpu)) {
                        cpu = tcpu;
                        goto done;
                }
        }

done:
        return cpu;
}

#ifdef CONFIG_RFS_ACCEL

/**
 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 * @dev: Device on which the filter was set
 * @rxq_index: RX queue index
 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 *
 * Meant for drivers that implement ndo_rx_flow_steer(): they should call
 * this periodically for each installed filter and remove every filter for
 * which %true is returned.
 *
 * Return: %false only while the flow table entry for @flow_id still records
 * @filter_id, names a valid CPU, and that CPU's input_queue_head is less
 * than 10 * mask ahead of the entry's last_qtail; %true in all other cases.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
                         u32 flow_id, u16 filter_id)
{
        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
        struct rps_dev_flow_table *table;
        struct rps_dev_flow *entry;
        bool may_expire = true;
        unsigned int target;

        rcu_read_lock();

        /* No table, or a flow id that does not fit in it: let it expire */
        table = rcu_dereference(rxqueue->rps_flow_table);
        if (!table || flow_id > table->mask)
                goto unlock;

        entry = &table->flows[flow_id];
        target = READ_ONCE(entry->cpu);

        /* The entry no longer belongs to this filter or has no valid CPU */
        if (READ_ONCE(entry->filter) != filter_id || target >= nr_cpu_ids)
                goto unlock;

        /* Keep the filter while the queue head is still close to the
         * position recorded in the entry.
         */
        if ((int)(READ_ONCE(per_cpu(softnet_data, target).input_queue_head) -
                  READ_ONCE(entry->last_qtail)) <
            (int)(10 * table->mask))
                may_expire = false;

unlock:
        rcu_read_unlock();
        return may_expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);

#endif /* CONFIG_RFS_ACCEL */

/* IPI callback, runs in hardirq context.  @data is the softnet_data whose
 * backlog NAPI gets scheduled; the event is counted in received_rps.
 */
static void rps_trigger_softirq(void *data)
{
        struct softnet_data *target = data;

        ____napi_schedule(target, &target->backlog);
        target->received_rps++;
}

#endif /* CONFIG_RPS */

/* IPI callback, runs in hardirq context.  Raises NET_RX_SOFTIRQ on this CPU
 * and then clears defer_ipi_scheduled (with release semantics) in the
 * softnet_data passed through @data.
 */
static void trigger_rx_softirq(void *data)
{
        struct softnet_data *target = data;

        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        smp_store_release(&target->defer_ipi_scheduled, 0);
}

/*
 * Make sure @sd->input_pkt_queue, which just received a packet, gets
 * serviced soon.
 *
 * - @sd belongs to another CPU: chain it onto this CPU's rps_ipi_list so
 *   that net_rx_action() processes it (or, when backlog threads are in
 *   use, schedule its backlog NAPI directly).
 *
 * - @sd is this CPU's own data: NAPI schedule our backlog, which also
 *   raises NET_RX_SOFTIRQ.
 */
static void napi_schedule_rps(struct softnet_data *sd)
{
        struct softnet_data *local = this_cpu_ptr(&softnet_data);

#ifdef CONFIG_RPS
        if (sd != local && !use_backlog_threads()) {
                /* Push the remote softnet_data onto our IPI list */
                sd->rps_ipi_next = local->rps_ipi_list;
                local->rps_ipi_list = sd;

                /* Outside of net_rx_action() and napi_threaded_poll()
                 * nobody will look at the list unless we raise
                 * NET_RX_SOFTIRQ ourselves.
                 */
                if (!(local->in_net_rx_action || local->in_napi_threaded_poll))
                        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
                return;
        }

        /* Either sd is our own softnet_data, or backlog threads are used:
         * in both cases the backlog of sd is scheduled directly.
         */
        __napi_schedule_irqoff(&sd->backlog);
#else
        __napi_schedule_irqoff(&local->backlog);
#endif /* CONFIG_RPS */
}

/* Kick processing on @cpu for @sd: with backlog threads, schedule @sd's
 * backlog NAPI under the backlog lock; otherwise fire the defer_csd
 * cross-CPU call, at most once per 0 -> 1 transition of
 * defer_ipi_scheduled.
 */
void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
{
        unsigned long irq_flags;

        if (!use_backlog_threads()) {
                /* Only the caller that wins the 0 -> 1 exchange sends it */
                if (cmpxchg(&sd->defer_ipi_scheduled, 0, 1) == 0)
                        smp_call_function_single_async(cpu, &sd->defer_csd);
                return;
        }

        backlog_lock_irq_save(sd, &irq_flags);

        /* Schedule only if NAPI_STATE_SCHED was not already set */
        if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
                __napi_schedule_irqoff(&sd->backlog);

        backlog_unlock_irq_restore(sd, &irq_flags);
}

#ifdef CONFIG_NET_FLOW_LIMIT
/* Defaults to 1 << 12 (4096).
 * NOTE(review): presumably the number of buckets used when a per-CPU
 * sd_flow_limit table is set up; the consumer is not in this part of the
 * file, confirm there.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif

/* Decide whether @skb should be refused because its flow takes up too much
 * of the recent enqueue history of this CPU.
 *
 * Nothing is checked (and false is returned) while @qlen is below half of
 * net_hotdata.max_backlog, when this CPU has no flow_limit table, or when
 * CONFIG_NET_FLOW_LIMIT is off.  Otherwise the flow's bucket replaces the
 * oldest history slot and true is returned once that bucket holds more
 * than half of the FLOW_LIMIT_HISTORY slots.
 */
static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
{
#ifdef CONFIG_NET_FLOW_LIMIT
        unsigned int evicted, incoming;
        struct softnet_data *sd;
        struct sd_flow_limit *fl;
        bool over_limit = false;

        if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1))
                return false;

        sd = this_cpu_ptr(&softnet_data);

        rcu_read_lock();
        fl = rcu_dereference(sd->flow_limit);
        if (!fl)
                goto unlock;

        /* Overwrite the oldest history slot with this packet's bucket */
        incoming = skb_get_hash(skb) & (fl->num_buckets - 1);
        evicted = fl->history[fl->history_head];
        fl->history[fl->history_head] = incoming;

        /* Advance the ring position */
        fl->history_head++;
        fl->history_head &= FLOW_LIMIT_HISTORY - 1;

        /* The slot that fell out of the history no longer counts */
        if (likely(fl->buckets[evicted]))
                fl->buckets[evicted]--;

        if (++fl->buckets[incoming] > (FLOW_LIMIT_HISTORY >> 1)) {
                fl->count++;
                over_limit = true;
        }

unlock:
        rcu_read_unlock();
        return over_limit;
#else
        return false;
#endif
}

/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 *
 * Returns NET_RX_SUCCESS when the skb was appended to the input_pkt_queue
 * of @cpu; in that case the value returned by rps_input_queue_tail_incr()
 * is stored through @qtail with rps_input_queue_tail_save().
 * Returns NET_RX_DROP when the skb was freed instead: the device is not
 * running, the queue is longer than net_hotdata.max_backlog, or
 * skb_flow_limit() refused the packet.
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
                              unsigned int *qtail)
{
        enum skb_drop_reason reason;
        struct softnet_data *sd;
        unsigned long flags;
        unsigned int qlen;
        int max_backlog;
        u32 tail;

        /* reason is set ahead of each check so that the drop labels at the
         * bottom report why the skb was freed.
         */
        reason = SKB_DROP_REASON_DEV_READY;
        if (!netif_running(skb->dev))
                goto bad_dev;

        reason = SKB_DROP_REASON_CPU_BACKLOG;
        sd = &per_cpu(softnet_data, cpu);

        /* Lockless first look: skip taking the backlog lock when the queue
         * already appears to be over the limit.
         */
        qlen = skb_queue_len_lockless(&sd->input_pkt_queue);
        max_backlog = READ_ONCE(net_hotdata.max_backlog);
        if (unlikely(qlen > max_backlog))
                goto cpu_backlog_drop;
        backlog_lock_irq_save(sd, &flags);
        /* Re-read the length now that the lock is held */
        qlen = skb_queue_len(&sd->input_pkt_queue);
        if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) {
                /* The queue is empty, so the backlog may need scheduling */
                if (!qlen) {
                        /* Schedule NAPI for backlog device. We can use
                         * non atomic operation as we own the queue lock.
                         */
                        if (!__test_and_set_bit(NAPI_STATE_SCHED,
                                                &sd->backlog.state))
                                napi_schedule_rps(sd);
                }
                __skb_queue_tail(&sd->input_pkt_queue, skb);
                tail = rps_input_queue_tail_incr(sd);
                backlog_unlock_irq_restore(sd, &flags);

                /* save the tail outside of the critical section */
                rps_input_queue_tail_save(qtail, tail);
                return NET_RX_SUCCESS;
        }

        backlog_unlock_irq_restore(sd, &flags);

cpu_backlog_drop:
        /* Per-CPU drop counter; not touched on the bad_dev path, where sd
         * has not been looked up yet.
         */
        atomic_inc(&sd->dropped);
bad_dev:
        dev_core_stats_rx_dropped_inc(skb->dev);
        kfree_skb_reason(skb, reason);
        return NET_RX_DROP;
}

/* Return the RX queue of skb->dev that @skb was recorded on.  The first
 * queue is returned when no queue was recorded or when the recorded index
 * is not below real_num_rx_queues (the latter also triggers a one-time
 * warning if the device has more than one RX queue).
 */
static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        u16 index;

        if (!skb_rx_queue_recorded(skb))
                return dev->_rx;

        index = skb_get_rx_queue(skb);
        if (likely(index < dev->real_num_rx_queues))
                return dev->_rx + index;

        WARN_ONCE(dev->real_num_rx_queues > 1,
                  "%s received packet on queue %u, but number "
                  "of RX queues is %u\n",
                  dev->name, index, dev->real_num_rx_queues);

        return dev->_rx; /* Return first rxqueue */
}

/*
 * bpf_prog_run_generic_xdp - run @xdp_prog over the head area of @skb.
 *
 * Fills @xdp from @skb (data starting at the MAC header), runs the program
 * and then mirrors back into @skb whatever the program changed: the data
 * start, the tail, the fragment length and, if the Ethernet header was
 * modified, pkt_type/protocol.
 *
 * Returns the action code produced by the program.  The skb is not freed
 * here for any action; see the comment above the final switch.
 */
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
                             struct bpf_prog *xdp_prog)
{
        void *orig_data, *orig_data_end, *hard_start;
        struct netdev_rx_queue *rxqueue;
        bool orig_bcast, orig_host;
        u32 mac_len, frame_sz;
        __be16 orig_eth_type;
        struct ethhdr *eth;
        u32 metalen, act;
        int off;

        /* The XDP program wants to see the packet starting at the MAC
         * header.
         */
        mac_len = skb->data - skb_mac_header(skb);
        hard_start = skb->data - skb_headroom(skb);

        /* SKB "head" area always has tailroom for skb_shared_info */
        frame_sz = (void *)skb_end_pointer(skb) - hard_start;
        frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

        rxqueue = netif_get_rxqueue(skb);
        xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
        /* Headroom and length are shifted by mac_len so that the buffer
         * starts at the MAC header rather than at skb->data.
         */
        xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
                         skb_headlen(skb) + mac_len, true);
        if (skb_is_nonlinear(skb)) {
                skb_shinfo(skb)->xdp_frags_size = skb->data_len;
                xdp_buff_set_frags_flag(xdp);
        } else {
                xdp_buff_clear_frags_flag(xdp);
        }

        /* Snapshot everything that is compared against after the run */
        orig_data_end = xdp->data_end;
        orig_data = xdp->data;
        eth = (struct ethhdr *)xdp->data;
        orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
        orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
        orig_eth_type = eth->h_proto;

        act = bpf_prog_run_xdp(xdp_prog, xdp);

        /* check if bpf_xdp_adjust_head was used */
        off = xdp->data - orig_data;
        if (off) {
                if (off > 0)
                        __skb_pull(skb, off);
                else if (off < 0)
                        __skb_push(skb, -off);

                skb->mac_header += off;
                skb_reset_network_header(skb);
        }

        /* check if bpf_xdp_adjust_tail was used */
        off = xdp->data_end - orig_data_end;
        if (off != 0) {
                skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
                skb->len += off; /* positive on grow, negative on shrink */
        }

        /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
         * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
         */
        if (xdp_buff_has_frags(xdp))
                skb->data_len = skb_shinfo(skb)->xdp_frags_size;
        else
                skb->data_len = 0;

        /* check if XDP changed the eth hdr such that the SKB needs an update */
        eth = (struct ethhdr *)xdp->data;
        if ((orig_eth_type != eth->h_proto) ||
            (orig_host != ether_addr_equal_64bits(eth->h_dest,
                                                  skb->dev->dev_addr)) ||
            (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
                /* Expose the Ethernet header again and recompute
                 * pkt_type and protocol through eth_type_trans().
                 */
                __skb_push(skb, ETH_HLEN);
                skb->pkt_type = PACKET_HOST;
                skb->protocol = eth_type_trans(skb, skb->dev);
        }

        /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
         * before calling us again on redirect path. We do not call do_redirect
         * as we leave that up to the caller.
         *
         * Caller is responsible for managing lifetime of skb (i.e. calling
         * kfree_skb in response to actions it cannot handle/XDP_DROP).
         */
        switch (act) {
        case XDP_REDIRECT:
        case XDP_TX:
                __skb_push(skb, mac_len);
                break;
        case XDP_PASS:
                /* Record the size of any metadata placed before the data */
                metalen = xdp->data - xdp->data_meta;
                if (metalen)
                        skb_metadata_set(skb, metalen);
                break;
        }

        return act;
}

/* Make *pskb usable by generic XDP.  Returns 0 on success or the error of
 * the step that failed.
 *
 * skb_cow_data_for_xdp() is tried first; only if it reports failure is the
 * original skb expanded and linearized instead.
 */
static int
netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
{
        struct sk_buff *skb = *pskb;
        int head_short, tail_short;
        int extra_head = 0;
        int extra_tail = 0;
        int rc;

        if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
                return 0;

        /* Fallback: if we also have to linearize, do all of the
         * pskb_expand_head() work in a single call here.
         */
        head_short = XDP_PACKET_HEADROOM - skb_headroom(skb);
        if (head_short > 0)
                extra_head = ALIGN(head_short, NET_SKB_PAD);

        tail_short = skb->tail + skb->data_len - skb->end;
        if (tail_short > 0)
                extra_tail = tail_short + 128;

        rc = pskb_expand_head(skb, extra_head, extra_tail, GFP_ATOMIC);
        if (rc)
                return rc;

        return skb_linearize(skb);
}

/*
 * Prepare *pskb for generic XDP and run @xdp_prog on it.
 *
 * Returns the XDP action.  Redirected skbs get XDP_PASS without the program
 * being run.  For XDP_DROP, XDP_ABORTED and unknown actions, and when the
 * preparation step fails (reported as XDP_DROP), the skb has already been
 * freed here; for XDP_PASS, XDP_TX and XDP_REDIRECT it is left to the caller.
 */
static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
                                     struct xdp_buff *xdp,
                                     struct bpf_prog *xdp_prog)
{
        struct sk_buff *skb = *pskb;
        u32 mac_len, act = XDP_DROP;

        /* Reinjected packets coming from act_mirred or similar should
         * not get XDP generic processing.
         */
        if (skb_is_redirected(skb))
                return XDP_PASS;

        /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
         * bytes. This is the guarantee that also native XDP provides,
         * thus we need to do it here as well.
         */
        /* Temporarily expose the MAC header while the checks below run */
        mac_len = skb->data - skb_mac_header(skb);
        __skb_push(skb, mac_len);

        if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
            skb_headroom(skb) < XDP_PACKET_HEADROOM) {
                /* On failure act still holds XDP_DROP */
                if (netif_skb_check_for_xdp(pskb, xdp_prog))
                        goto do_drop;
        }

        /* pskb was handed to the helper by reference, so *pskb is used
         * instead of the local skb from here on.
         */
        __skb_pull(*pskb, mac_len);

        act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
        switch (act) {
        case XDP_REDIRECT:
        case XDP_TX:
        case XDP_PASS:
                break;
        default:
                /* Unknown action: warn, then treat like XDP_ABORTED */
                bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
                fallthrough;
        case XDP_ABORTED:
                trace_xdp_exception((*pskb)->dev, xdp_prog, act);
                fallthrough;
        case XDP_DROP:
        do_drop:
                kfree_skb(*pskb);
                break;
        }

        return act;
}

/* Transmit @skb straight through the driver on behalf of generic XDP.
 *
 * To match in-driver XDP behavior, generic XDP has to bypass the qdisc
 * layer and the network taps. As a consequence XDP packets can starve
 * other packets going through a qdisc, and DDOS attacks will be more
 * effective. In-driver XDP uses dedicated TX queues and therefore does
 * not have this starvation issue.
 *
 * If the packet could not be handed to the driver it is freed here, after
 * emitting an xdp_exception trace event and bumping the tx_dropped stat.
 */
void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
        bool sent = false;
        int cpu, rc;

        txq = netdev_core_pick_tx(dev, skb, NULL);
        cpu = smp_processor_id();

        HARD_TX_LOCK(dev, txq, cpu);
        if (!netif_xmit_frozen_or_drv_stopped(txq)) {
                rc = netdev_start_xmit(skb, dev, txq, 0);
                sent = dev_xmit_complete(rc);
        }
        HARD_TX_UNLOCK(dev, txq);

        if (sent)
                return;

        trace_xdp_exception(dev, xdp_prog, XDP_TX);
        dev_core_stats_tx_dropped_inc(dev);
        kfree_skb(skb);
}

/* Static key, initially false.  __netif_receive_skb_core() only calls
 * do_xdp_generic() while this key is enabled.
 */
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);

/* Run generic XDP on *pskb and carry out the resulting action.
 *
 * Returns XDP_PASS when @xdp_prog is NULL or the program passed the
 * packet, in which case the caller continues with *pskb.  Every other
 * outcome returns XDP_DROP: the packet has been redirected, transmitted
 * or freed and must not be touched by the caller any more.
 */
int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
{
        struct xdp_buff xdp;
        u32 verdict;

        if (!xdp_prog)
                return XDP_PASS;

        verdict = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
        switch (verdict) {
        case XDP_PASS:
                return XDP_PASS;
        case XDP_REDIRECT:
                /* A failed redirect leaves the skb with us: free it */
                if (xdp_do_generic_redirect((*pskb)->dev, *pskb,
                                            &xdp, xdp_prog))
                        kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
                break;
        case XDP_TX:
                generic_xdp_tx(*pskb, xdp_prog);
                break;
        default:
                /* Already dealt with by netif_receive_generic_xdp() */
                break;
        }

        return XDP_DROP;
}
EXPORT_SYMBOL_GPL(do_xdp_generic);

/* Common body of netif_rx() and __netif_rx(): timestamp the skb if
 * net_hotdata.tstamp_prequeue asks for it, then queue it on a per-CPU
 * backlog.  With RPS active the target CPU comes from get_rps_cpu(),
 * falling back to the current CPU; otherwise the current CPU is used.
 * Returns the result of enqueue_to_backlog().
 */
static int netif_rx_internal(struct sk_buff *skb)
{
        unsigned int unused_qtail;

        net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);

        trace_netif_rx(skb);

#ifdef CONFIG_RPS
        if (static_branch_unlikely(&rps_needed)) {
                /* Scratch entry that receives last_qtail when
                 * get_rps_cpu() does not supply a real flow entry.
                 */
                struct rps_dev_flow scratch, *rflow = &scratch;
                int cpu, ret;

                rcu_read_lock();

                cpu = get_rps_cpu(skb->dev, skb, &rflow);
                if (cpu < 0)
                        cpu = smp_processor_id();

                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

                rcu_read_unlock();
                return ret;
        }
#endif
        return enqueue_to_backlog(skb, smp_processor_id(), &unused_qtail);
}

/**
 * __netif_rx - Slightly optimized version of netif_rx
 * @skb: buffer to post
 *
 * Same behavior as netif_rx(), except that bottom halves are not disabled
 * here. Consequently this function may only be called from interrupt
 * context (hard or soft interrupt), which is asserted via lockdep.
 *
 * Return: the value returned by netif_rx_internal().
 */
int __netif_rx(struct sk_buff *skb)
{
        int status;

        lockdep_assert_once(hardirq_count() | softirq_count());

        trace_netif_rx_entry(skb);
        status = netif_rx_internal(skb);
        trace_netif_rx_exit(status);

        return status;
}
EXPORT_SYMBOL(__netif_rx);

/**
 * netif_rx - post buffer to the network code
 * @skb: buffer to post
 *
 * Receives a packet from a device driver and queues it for the upper
 * (protocol) levels to process via the backlog NAPI device. It always
 * succeeds. The buffer may be dropped during processing for congestion
 * control or by the protocol layers.
 * The network buffer is passed via the backlog NAPI device. Modern NIC
 * drivers should use NAPI and GRO.
 * This function can be used from interrupt and from process context. A
 * caller in process context must not disable interrupts before invoking
 * this function.
 *
 * Return:
 * NET_RX_SUCCESS (no congestion)
 * NET_RX_DROP    (packet was dropped)
 */
int netif_rx(struct sk_buff *skb)
{
        /* Bottom halves only need disabling when we are in neither
         * hardirq nor softirq context.
         */
        bool from_task = !(hardirq_count() | softirq_count());
        int status;

        if (from_task)
                local_bh_disable();

        trace_netif_rx_entry(skb);
        status = netif_rx_internal(skb);
        trace_netif_rx_exit(status);

        if (from_task)
                local_bh_enable();

        return status;
}
EXPORT_SYMBOL(netif_rx);

/*
 * Softirq action working on this CPU's softnet_data
 * (NOTE(review): presumably registered for NET_TX_SOFTIRQ - the
 * registration is not in this part of the file).
 *
 * It does three things in order:
 *  1. frees every skb chained on sd->completion_queue,
 *  2. runs every qdisc chained on sd->output_queue,
 *  3. calls xfrm_dev_backlog().
 */
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
        struct softnet_data *sd = this_cpu_ptr(&softnet_data);

        if (sd->completion_queue) {
                struct sk_buff *clist;

                /* Detach the whole list with IRQs off, then walk the
                 * private copy with IRQs enabled again.
                 */
                local_irq_disable();
                clist = sd->completion_queue;
                sd->completion_queue = NULL;
                local_irq_enable();

                while (clist) {
                        struct sk_buff *skb = clist;

                        clist = clist->next;

                        /* skbs on this list are expected to have no users left */
                        WARN_ON(refcount_read(&skb->users));
                        /* Emit the trace event matching the recorded reason */
                        if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
                                trace_consume_skb(skb, net_tx_action);
                        else
                                trace_kfree_skb(skb, net_tx_action,
                                                get_kfree_skb_cb(skb)->reason, NULL);

                        if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
                                __kfree_skb(skb);
                        else
                                __napi_kfree_skb(skb,
                                                 get_kfree_skb_cb(skb)->reason);
                }
        }

        if (sd->output_queue) {
                struct Qdisc *head;

                /* Same detach-then-process pattern; the tail pointer is
                 * reset to the now empty list head.
                 */
                local_irq_disable();
                head = sd->output_queue;
                sd->output_queue = NULL;
                sd->output_queue_tailp = &sd->output_queue;
                local_irq_enable();

                rcu_read_lock();

                while (head) {
                        struct Qdisc *q = head;
                        spinlock_t *root_lock = NULL;

                        head = head->next_sched;

                        /* We need to make sure head->next_sched is read
                         * before clearing __QDISC_STATE_SCHED
                         */
                        smp_mb__before_atomic();

                        /* Qdiscs without TCQ_F_NOLOCK run under their
                         * qdisc lock; root_lock stays NULL otherwise.
                         */
                        if (!(q->flags & TCQ_F_NOLOCK)) {
                                root_lock = qdisc_lock(q);
                                spin_lock(root_lock);
                        } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
                                                     &q->state))) {
                                /* There is a synchronize_net() between
                                 * STATE_DEACTIVATED flag being set and
                                 * qdisc_reset()/some_qdisc_is_busy() in
                                 * dev_deactivate(), so we can safely bail out
                                 * early here to avoid data race between
                                 * qdisc_deactivate() and some_qdisc_is_busy()
                                 * for lockless qdisc.
                                 */
                                clear_bit(__QDISC_STATE_SCHED, &q->state);
                                continue;
                        }

                        clear_bit(__QDISC_STATE_SCHED, &q->state);
                        qdisc_run(q);
                        if (root_lock)
                                spin_unlock(root_lock);
                }

                rcu_read_unlock();
        }

        xfrm_dev_backlog(sd);
}

#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE.
 * It only exists when both CONFIG_BRIDGE and CONFIG_ATM_LANE are enabled
 * and is exported to GPL modules.
 * NOTE(review): who installs and who calls the hook is not visible in this
 * part of the file.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif

/**
 *        netdev_is_rx_handler_busy - check if receive handler is registered
 *        @dev: device to check
 *
 *        Check if a receive handler is already registered for a given device.
 *        Return true if there one.
 *
 *        The caller must hold the rtnl_mutex.
 */
bool netdev_is_rx_handler_busy(struct net_device *dev)
{
        ASSERT_RTNL();
        return dev && rtnl_dereference(dev->rx_handler);
}
EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);

/**
 * netdev_rx_handler_register - register receive handler
 * @dev: device to register a handler for
 * @rx_handler: receive handler to register
 * @rx_handler_data: data pointer that is used by rx handler
 *
 * Register a receive handler for a device. This handler will then be
 * called from __netif_receive_skb.
 *
 * The caller must hold the rtnl_mutex.
 *
 * For a general description of rx_handler, see enum rx_handler_result.
 *
 * Return: 0 on success, -EBUSY if a handler is already registered,
 * -EINVAL if the device has IFF_NO_RX_HANDLER set.
 */
int netdev_rx_handler_register(struct net_device *dev,
                               rx_handler_func_t *rx_handler,
                               void *rx_handler_data)
{
        int err;

        if (netdev_is_rx_handler_busy(dev)) {
                err = -EBUSY;
        } else if (dev->priv_flags & IFF_NO_RX_HANDLER) {
                err = -EINVAL;
        } else {
                /* Publish the data first: rx_handler must never become
                 * visible before rx_handler_data is.
                 */
                rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
                rcu_assign_pointer(dev->rx_handler, rx_handler);
                err = 0;
        }

        return err;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);

/**
 *        netdev_rx_handler_unregister - unregister receive handler
 *        @dev: device to unregister a handler from
 *
 *        Unregister a receive handler from a device.
 *        The handler pointer is cleared first; the handler data is
 *        cleared only after synchronize_net() has returned.
 *
 *        The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

        ASSERT_RTNL();
        RCU_INIT_POINTER(dev->rx_handler, NULL);
        /* A reader that sees a non-NULL rx_handler inside an
         * rcu_read_lock() section is guaranteed to see a non-NULL
         * rx_handler_data as well, because the data is only cleared
         * after the synchronize_net() below.
         */
        synchronize_net();
        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);

/*
 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 * the special handling of PFMEMALLOC skbs.
 */
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
        switch (skb->protocol) {
        case htons(ETH_P_ARP):
        case htons(ETH_P_IP):
        case htons(ETH_P_IPV6):
        case htons(ETH_P_8021Q):
        case htons(ETH_P_8021AD):
                return true;
        default:
                return false;
        }
}

/* Run the netfilter ingress hook on @skb when it is active.
 *
 * A packet type still pending in *pt_prev is delivered first (its result
 * goes to *ret and *pt_prev is cleared).  Returns the value of
 * nf_hook_ingress(), or 0 when the hook is not active.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
                             int *ret, struct net_device *orig_dev)
{
        int verdict;

        if (!nf_hook_ingress_active(skb))
                return 0;

        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        rcu_read_lock();
        verdict = nf_hook_ingress(skb);
        rcu_read_unlock();

        return verdict;
}

static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
                                    struct packet_type **ppt_prev)
{
        struct packet_type *ptype, *pt_prev;
        rx_handler_func_t *rx_handler;
        struct sk_buff *skb = *pskb;
        struct net_device *orig_dev;
        bool deliver_exact = false;
        int ret = NET_RX_DROP;
        __be16 type;

        net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);

        trace_netif_receive_skb(skb);

        orig_dev = skb->dev;

        skb_reset_network_header(skb);
        if (!skb_transport_header_was_set(skb))
                skb_reset_transport_header(skb);
        skb_reset_mac_len(skb);

        pt_prev = NULL;

another_round:
        skb->skb_iif = skb->dev->ifindex;

        __this_cpu_inc(softnet_data.processed);

        if (static_branch_unlikely(&generic_xdp_needed_key)) {
                int ret2;

                migrate_disable();
                ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
                                      &skb);
                migrate_enable();

                if (ret2 != XDP_PASS) {
                        ret = NET_RX_DROP;
                        goto out;
                }
        }

        if (eth_type_vlan(skb->protocol)) {
                skb = skb_vlan_untag(skb);
                if (unlikely(!skb))
                        goto out;
        }

        if (skb_skip_tc_classify(skb))
                goto skip_classify;

        if (pfmemalloc)
                goto skip_taps;

        list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
                if (pt_prev)
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                pt_prev = ptype;
        }

        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
                if (pt_prev)
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                pt_prev = ptype;
        }

skip_taps:
#ifdef CONFIG_NET_INGRESS
        if (static_branch_unlikely(&ingress_needed_key)) {
                bool another = false;

                nf_skip_egress(skb, true);
                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
                                         &another);
                if (another)
                        goto another_round;
                if (!skb)
                        goto out;

                nf_skip_egress(skb, false);
                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
                        goto out;
        }
#endif
        skb_reset_redirect(skb);
skip_classify:
        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
                goto drop;

        if (skb_vlan_tag_present(skb)) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                if (vlan_do_receive(&skb))
                        goto another_round;
                else if (unlikely(!skb))
                        goto out;
        }

        rx_handler = rcu_dereference(skb->dev->rx_handler);
        if (rx_handler) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                switch (rx_handler(&skb)) {
                case RX_HANDLER_CONSUMED:
                        ret = NET_RX_SUCCESS;
                        goto out;
                case RX_HANDLER_ANOTHER:
                        goto another_round;
                case RX_HANDLER_EXACT:
                        deliver_exact = true;
                        break;
                case RX_HANDLER_PASS:
                        break;
                default:
                        BUG();
                }
        }

        if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
check_vlan_id:
                if (skb_vlan_tag_get_id(skb)) {
                        /* Vlan id is non 0 and vlan_do_receive() above couldn't
                         * find vlan device.
                         */
                        skb->pkt_type = PACKET_OTHERHOST;
                } else if (eth_type_vlan(skb->protocol)) {
                        /* Outer header is 802.1P with vlan 0, inner header is
                         * 802.1Q or 802.1AD and vlan_do_receive() above could
                         * not find vlan dev for vlan id 0.
                         */
                        __vlan_hwaccel_clear_tag(skb);
                        skb = skb_vlan_untag(skb);
                        if (unlikely(!skb))
                                goto out;
                        if (vlan_do_receive(&skb))
                                /* After stripping off 802.1P header with vlan 0
                                 * vlan dev is found for inner header.
                                 */
                                goto another_round;
                        else if (unlikely(!skb))
                                goto out;
                        else
                                /* We have stripped outer 802.1P vlan 0 header.
                                 * But could not find vlan dev.
                                 * check again for vlan id to set OTHERHOST.
                                 */
                                goto check_vlan_id;
                }
                /* Note: we might in the future use prio bits
                 * and set skb->priority like in vlan_do_receive()
                 * For the time being, just ignore Priority Code Point
                 */
                __vlan_hwaccel_clear_tag(skb);
        }

        type = skb->protocol;

        /* deliver only exact match when indicated */
        if (likely(!deliver_exact)) {
                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                                       &ptype_base[ntohs(type) &
                                                   PTYPE_HASH_MASK]);
        }

        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                               &orig_dev->ptype_specific);

        if (unlikely(skb->dev != orig_dev)) {
                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                                       &skb->dev->ptype_specific);
        }

        if (pt_prev) {
                if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
                        goto drop;
                *ppt_prev = pt_prev;
        } else {
drop:
                if (!deliver_exact)
                        dev_core_stats_rx_dropped_inc(skb->dev);
                else
                        dev_core_stats_rx_nohandler_inc(skb->dev);
                kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

out:
        /* The invariant here is that if *ppt_prev is not NULL
         * then skb should also be non-NULL.
         *
         * Apparently *ppt_prev assignment above holds this invariant due to
         * skb dereferencing near it.
         */
        *pskb = skb;
        return ret;
}

/*
 * Run a single skb through __netif_receive_skb_core() and, if a final
 * packet_type handler was handed back, invoke it directly.
 *
 * The device the skb arrived on is sampled before the core call because the
 * core takes &skb and may update it.
 */
static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
{
        struct packet_type *last_ptype = NULL;
        struct net_device *ingress_dev = skb->dev;
        int rc;

        rc = __netif_receive_skb_core(&skb, pfmemalloc, &last_ptype);
        if (!last_ptype)
                return rc;

        /* The handler's return value replaces the one from the core. */
        return INDIRECT_CALL_INET(last_ptype->func, ipv6_rcv, ip_rcv, skb,
                                  skb->dev, last_ptype, ingress_dev);
}

/**
 *        netif_receive_skb_core - special purpose version of netif_receive_skb
 *        @skb: buffer to process
 *
 *        More direct receive version of netif_receive_skb().  It should
 *        only be used by callers that have a need to skip RPS and Generic XDP.
 *        Caller must also take care of handling if ``(page_is_)pfmemalloc``.
 *
 *        This function may only be called from softirq context and interrupts
 *        should be enabled.
 *
 *        Return values (usually ignored):
 *        NET_RX_SUCCESS: no congestion
 *        NET_RX_DROP: packet was dropped
 */
int netif_receive_skb_core(struct sk_buff *skb)
{
        int rc;

        /* The single-skb receive path runs inside an RCU read-side section;
         * pfmemalloc handling is left to the caller, so pass false.
         */
        rcu_read_lock();
        rc = __netif_receive_skb_one_core(skb, false);
        rcu_read_unlock();

        return rc;
}
EXPORT_SYMBOL(netif_receive_skb_core);

/*
 * Deliver every skb on @head to @pt_prev.
 *
 * If the packet_type provides a list_func, the whole list is passed to it in
 * one call; otherwise each skb is unlinked and passed to ->func individually.
 * Does nothing when @pt_prev is NULL or @head is empty.
 */
static inline void __netif_receive_skb_list_ptype(struct list_head *head,
                                                  struct packet_type *pt_prev,
                                                  struct net_device *orig_dev)
{
        struct sk_buff *cur, *tmp;

        if (!pt_prev || list_empty(head))
                return;

        if (pt_prev->list_func) {
                INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
                                   ip_list_rcv, head, pt_prev, orig_dev);
                return;
        }

        /* No batched handler: fall back to per-packet delivery. */
        list_for_each_entry_safe(cur, tmp, head, list) {
                skb_list_del_init(cur);
                pt_prev->func(cur, cur->dev, pt_prev, orig_dev);
        }
}

/*
 * Run each skb on @head through __netif_receive_skb_core() and group
 * consecutive skbs that ended up with the same final packet_type and the same
 * original device into a batch, which is then delivered in one go.
 *
 * Fast-path assumptions:
 * - There is no RX handler.
 * - Only one packet_type matches.
 * When either does not hold, some per-packet processing happens in-line and
 * only the 'last ptype' is handled for the whole batch.  Ordering towards any
 * single ptype is preserved, because the 'last ptype' is constant across a
 * batch and all other ptypes are handled per-packet.
 */
static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
{
        /* Handler and original device shared by everything in 'batch' */
        struct packet_type *batch_ptype = NULL;
        struct net_device *batch_dev = NULL;
        struct sk_buff *pkt, *tmp;
        struct list_head batch;

        INIT_LIST_HEAD(&batch);
        list_for_each_entry_safe(pkt, tmp, head, list) {
                struct packet_type *last_ptype = NULL;
                struct net_device *ingress_dev = pkt->dev;
                bool same_batch;

                skb_list_del_init(pkt);
                __netif_receive_skb_core(&pkt, pfmemalloc, &last_ptype);

                /* No final handler was handed back: nothing to batch. */
                if (!last_ptype)
                        continue;

                same_batch = batch_ptype == last_ptype &&
                             batch_dev == ingress_dev;
                if (!same_batch) {
                        /* Flush what we have, then open a new batch. */
                        __netif_receive_skb_list_ptype(&batch, batch_ptype,
                                                       batch_dev);
                        INIT_LIST_HEAD(&batch);
                        batch_ptype = last_ptype;
                        batch_dev = ingress_dev;
                }
                list_add_tail(&pkt->list, &batch);
        }

        /* Flush whatever is left in the last batch. */
        __netif_receive_skb_list_ptype(&batch, batch_ptype, batch_dev);
}

/*
 * Receive one skb, taking the pfmemalloc path when memalloc sockets exist
 * and the skb itself is flagged pfmemalloc.
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
        unsigned int noreclaim_flag;
        int rc;

        /* Common case: ordinary skb, ordinary processing. */
        if (!sk_memalloc_socks() || !skb_pfmemalloc(skb))
                return __netif_receive_skb_one_core(skb, false);

        /*
         * PFMEMALLOC skbs are special, they should
         * - be delivered to SOCK_MEMALLOC sockets only
         * - stay away from userspace
         * - have bounded memory usage
         *
         * Use PF_MEMALLOC as this saves us from propagating the allocation
         * context down to all allocation sites.
         */
        noreclaim_flag = memalloc_noreclaim_save();
        rc = __netif_receive_skb_one_core(skb, true);
        memalloc_noreclaim_restore(noreclaim_flag);

        return rc;
}

/*
 * Process a list of skbs, splitting it into runs of consecutive packets that
 * share the same pfmemalloc classification and passing each run to
 * __netif_receive_skb_list_core().
 *
 * While a run classified as pfmemalloc is being processed, the state saved by
 * memalloc_noreclaim_save() is in effect; it is restored when the
 * classification flips back, or after the final run.
 */
static void __netif_receive_skb_list(struct list_head *head)
{
        unsigned long noreclaim_flag = 0;
        struct sk_buff *skb, *next;
        bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */

        list_for_each_entry_safe(skb, next, head, list) {
                /* This skb's classification differs from the current run's */
                if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
                        struct list_head sublist;

                        /* Handle the previous sublist */
                        list_cut_before(&sublist, head, &skb->list);
                        if (!list_empty(&sublist))
                                __netif_receive_skb_list_core(&sublist, pfmemalloc);
                        /* From here on 'pfmemalloc' describes the run that
                         * starts at this skb.
                         */
                        pfmemalloc = !pfmemalloc;
                        /* See comments in __netif_receive_skb */
                        if (pfmemalloc)
                                noreclaim_flag = memalloc_noreclaim_save();
                        else
                                memalloc_noreclaim_restore(noreclaim_flag);
                }
        }
        /* Handle the remaining sublist */
        if (!list_empty(head))
                __netif_receive_skb_list_core(head, pfmemalloc);
        /* Restore pflags */
        if (pfmemalloc)
                memalloc_noreclaim_restore(noreclaim_flag);
}

/*
 * Install, replace or remove the generic XDP program attached to @dev.
 *
 * Only XDP_SETUP_PROG is handled; any other command yields -EINVAL.
 * generic_xdp_needed_key is incremented when a program is attached to a
 * device that had none and decremented when the last one is removed.  On
 * first attach, LRO and hardware GRO are disabled on the device.
 *
 * Returns 0 on success or -EINVAL for an unsupported command.
 */
static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
{
        struct bpf_prog *prev_prog = rtnl_dereference(dev->xdp_prog);
        struct bpf_prog *next_prog = xdp->prog;

        if (xdp->command != XDP_SETUP_PROG)
                return -EINVAL;

        rcu_assign_pointer(dev->xdp_prog, next_prog);
        if (prev_prog)
                bpf_prog_put(prev_prog);

        if (prev_prog && !next_prog) {
                /* Program removed */
                static_branch_dec(&generic_xdp_needed_key);
        } else if (!prev_prog && next_prog) {
                /* First program attached */
                static_branch_inc(&generic_xdp_needed_key);
                dev_disable_lro(dev);
                dev_disable_gro_hw(dev);
        }

        return 0;
}

/*
 * Common body of netif_receive_skb(): timestamp the skb if required, let
 * skb_defer_rx_timestamp() claim it, then either steer it to another CPU's
 * backlog (RPS) or process it on this CPU.
 */
static int netif_receive_skb_internal(struct sk_buff *skb)
{
        int rc;

        net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);

        /* The skb was taken over for deferred RX timestamping. */
        if (skb_defer_rx_timestamp(skb))
                return NET_RX_SUCCESS;

        rcu_read_lock();
#ifdef CONFIG_RPS
        if (static_branch_unlikely(&rps_needed)) {
                struct rps_dev_flow dummy_flow, *rflow = &dummy_flow;
                int target_cpu = get_rps_cpu(skb->dev, skb, &rflow);

                if (target_cpu >= 0) {
                        rc = enqueue_to_backlog(skb, target_cpu,
                                                &rflow->last_qtail);
                        rcu_read_unlock();
                        return rc;
                }
        }
#endif
        /* No steering target: handle the skb right here. */
        rc = __netif_receive_skb(skb);
        rcu_read_unlock();

        return rc;
}

void netif_receive_skb_list_internal(struct list_head *head)
{
        struct sk_buff *skb, *next;
        struct list_head sublist;

        INIT_LIST_HEAD(&sublist);
        list_for_each_entry_safe(skb, next, head, list) {
                net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
                                    skb);
                skb_list_del_init(skb);
                if (!skb_defer_rx_timestamp(skb))
                        list_add_tail(&skb->list, &sublist);
        }
        list_splice_init(&sublist, head);

        rcu_read_lock();
#ifdef CONFIG_RPS
        if (static_branch_unlikely(&rps_needed)) {
                list_for_each_entry_safe(skb, next, head, list) {
                        struct rps_dev_flow voidflow, *rflow = &voidflow;
                        int cpu = get_rps_cpu(skb->dev, skb, &rflow);

                        if (cpu >= 0) {
                                /* Will be handled, remove from list */
                                skb_list_del_init(skb);
                                enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
                        }
                }
        }
#endif
        __netif_receive_skb_list(head);
        rcu_read_unlock();
}

/**
 *        netif_receive_skb - process receive buffer from network
 *        @skb: buffer to process
 *
 *        netif_receive_skb() is the main receive data processing function.
 *        It always succeeds. The buffer may be dropped during processing
 *        for congestion control or by the protocol layers.
 *
 *        This function may only be called from softirq context and interrupts
 *        should be enabled.
 *
 *        Return values (usually ignored):
 *        NET_RX_SUCCESS: no congestion
 *        NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
        int rc;

        /* Bracket the real work with the entry/exit tracepoints. */
        trace_netif_receive_skb_entry(skb);
        rc = netif_receive_skb_internal(skb);
        trace_netif_receive_skb_exit(rc);

        return rc;
}
EXPORT_SYMBOL(netif_receive_skb);

/**
 *        netif_receive_skb_list - process many receive buffers from network
 *        @head: list of skbs to process.
 *
 *        Since return value of netif_receive_skb() is normally ignored, and
 *        wouldn't be meaningful for a list, this function returns void.
 *
 *        This function may only be called from softirq context and interrupts
 *        should be enabled.
 */
void netif_receive_skb_list(struct list_head *head)
{
        struct sk_buff *skb;

        if (list_empty(head))
                return;
        if (trace_netif_receive_skb_list_entry_enabled()) {
                list_for_each_entry(skb, head, list)
                        trace_netif_receive_skb_list_entry(skb);
        }
        netif_receive_skb_list_internal(head);
        trace_netif_receive_skb_list_exit(0);
}
EXPORT_SYMBOL(netif_receive_skb_list);

/* One work item per CPU; flush_all_backlogs() queues and waits on these.
 * NOTE(review): the work handler is set up elsewhere - presumably
 * flush_backlog(); confirm at the INIT_WORK() site.
 */
static DEFINE_PER_CPU(struct work_struct, flush_works);

/* Network device is going away, flush any packets still pending.
 *
 * Runs on the local CPU's softnet_data: every skb whose device is in state
 * NETREG_UNREGISTERING is unlinked from input_pkt_queue and from
 * process_queue and freed, and the RPS input queue head is advanced once per
 * removed skb.  @work is not used.
 */
static void flush_backlog(struct work_struct *work)
{
        struct sk_buff *skb, *tmp;
        struct softnet_data *sd;

        local_bh_disable();
        sd = this_cpu_ptr(&softnet_data);

        /* input_pkt_queue is walked under the backlog lock; skbs are freed
         * with dev_kfree_skb_irq() while it is held.
         */
        backlog_lock_irq_disable(sd);
        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
                        __skb_unlink(skb, &sd->input_pkt_queue);
                        dev_kfree_skb_irq(skb);
                        rps_input_queue_head_incr(sd);
                }
        }
        backlog_unlock_irq_enable(sd);

        /* process_queue is protected by process_queue_bh_lock instead */
        local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
                        __skb_unlink(skb, &sd->process_queue);
                        kfree_skb(skb);
                        rps_input_queue_head_incr(sd);
                }
        }
        local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
        local_bh_enable();
}

/*
 * Tell whether the backlog of @cpu needs to be flushed.
 *
 * With CONFIG_RPS enabled, returns true only if that CPU's input_pkt_queue
 * or process_queue is non-empty, as observed under the backlog lock.
 * Without RPS the check cannot be done safely (see below), so the answer is
 * always true.
 */
static bool flush_required(int cpu)
{
#if IS_ENABLED(CONFIG_RPS)
        struct softnet_data *sd = &per_cpu(softnet_data, cpu);
        bool do_flush;

        backlog_lock_irq_disable(sd);

        /* as insertion into process_queue happens with the rps lock held,
         * process_queue access may race only with dequeue
         */
        do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
                   !skb_queue_empty_lockless(&sd->process_queue);
        backlog_unlock_irq_enable(sd);

        return do_flush;
#endif
        /* without RPS we can't safely check input_pkt_queue: during a
         * concurrent remote skb_queue_splice() we can detect as empty both
         * input_pkt_queue and process_queue even if the latter could end-up
         * containing a lot of packets.
         */
        return true;
}

/*
 * Queue the per-CPU flush work on every online CPU that reports a non-empty
 * backlog, then wait for all of the queued work items to finish.
 * Must be called with the RTNL lock held.
 */
static void flush_all_backlogs(void)
{
        /* RTNL serialises callers, so a single static mask is sufficient and
         * spares the stack a potentially large cpumask.
         */
        static cpumask_t flush_cpus;
        unsigned int cpu;

        ASSERT_RTNL();

        cpus_read_lock();

        cpumask_clear(&flush_cpus);
        for_each_online_cpu(cpu) {
                if (!flush_required(cpu))
                        continue;

                queue_work_on(cpu, system_highpri_wq,
                              per_cpu_ptr(&flush_works, cpu));
                cpumask_set_cpu(cpu, &flush_cpus);
        }

        /* we can have in flight packet[s] on the cpus we are not flushing,
         * synchronize_net() in unregister_netdevice_many() will take care of
         * them
         */
        for_each_cpu(cpu, &flush_cpus) {
                flush_work(per_cpu_ptr(&flush_works, cpu));
        }

        cpus_read_unlock();
}

/*
 * Walk the rps_ipi_next chain starting at @remsd and fire the async
 * cross-CPU call for each entry whose CPU is online.
 * Compiles to an empty function without CONFIG_RPS.
 */
static void net_rps_send_ipi(struct softnet_data *remsd)
{
#ifdef CONFIG_RPS
        struct softnet_data *following;

        for (; remsd; remsd = following) {
                /* Fetch the link before issuing the call for this entry. */
                following = remsd->rps_ipi_next;

                if (cpu_online(remsd->cpu))
                        smp_call_function_single_async(remsd->cpu, &remsd->csd);
        }
#endif
}

/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 *
 * With CONFIG_RPS, and when backlog threads are not in use, the pending
 * rps_ipi_list is detached from @sd before interrupts are re-enabled and the
 * IPIs are sent afterwards.  In every other case the function only
 * re-enables local interrupts.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *remsd = sd->rps_ipi_list;

        if (!use_backlog_threads() && remsd) {
                sd->rps_ipi_list = NULL;

                local_irq_enable();

                /* Send pending IPI's to kick RPS processing on remote cpus. */
                net_rps_send_ipi(remsd);
        } else
#endif
                /* Body of the 'else' above when CONFIG_RPS is set; the sole
                 * statement of the function otherwise.
                 */
                local_irq_enable();
}

/*
 * True when @sd has a non-empty rps_ipi_list and backlog threads are not in
 * use.  Always false without CONFIG_RPS.
 */
static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        if (use_backlog_threads())
                return false;

        return sd->rps_ipi_list != NULL;
#else
        return false;
#endif
}

/*
 * Poll function of the backlog NAPI embedded in struct softnet_data.
 *
 * Drains sd->process_queue, handing each skb to __netif_receive_skb(), and
 * refills it from sd->input_pkt_queue until either @quota skbs have been
 * processed or input_pkt_queue is found empty.
 *
 * Returns the number of skbs processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
        bool again = true;
        int work = 0;

        /* Check if we have pending IPIs; it's better to send them now
         * rather than waiting for the end of net_rx_action().
         */
        if (sd_has_rps_ipi_waiting(sd)) {
                local_irq_disable();
                net_rps_action_and_irq_enable(sd);
        }

        napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
        while (again) {
                struct sk_buff *skb;

                /* The process_queue lock is held only around the dequeue
                 * itself and dropped while the skb is being processed.
                 */
                local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
                while ((skb = __skb_dequeue(&sd->process_queue))) {
                        local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
                        rcu_read_lock();
                        __netif_receive_skb(skb);
                        rcu_read_unlock();
                        /* Quota exhausted: account the work and return with
                         * the lock already released.
                         */
                        if (++work >= quota) {
                                rps_input_queue_head_add(sd, work);
                                return work;
                        }

                        local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
                }
                local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);

                backlog_lock_irq_disable(sd);
                if (skb_queue_empty(&sd->input_pkt_queue)) {
                        /*
                         * Inline a custom version of __napi_complete().
                         * Only the current cpu owns and manipulates this napi,
                         * and NAPI_STATE_SCHED is the only possible flag set
                         * on backlog.
                         * We can use a plain write instead of clear_bit(),
                         * and we don't need an smp_mb() memory barrier.
                         */
                        napi->state &= NAPIF_STATE_THREADED;
                        again = false;
                } else {
                        /* Move everything that arrived meanwhile onto
                         * process_queue and go around again.
                         */
                        local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                                   &sd->process_queue);
                        local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
                }
                backlog_unlock_irq_enable(sd);
        }

        if (work)
                rps_input_queue_head_add(sd, work);
        return work;
}

/**
 * __napi_schedule - schedule for receive
 * @n: entry to schedule
 *
 * The entry's receive function will be scheduled to run.
 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 */
void __napi_schedule(struct napi_struct *n)
{
        unsigned long irq_flags;

        /* Queue @n on this CPU's softnet_data with local interrupts masked. */
        local_irq_save(irq_flags);
        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
        local_irq_restore(irq_flags);
}
EXPORT_SYMBOL(__napi_schedule);

/**
 *        napi_schedule_prep - check if napi can be scheduled
 *        @n: napi context
 *
 * Test if NAPI routine is already running, and if not mark
 * it as running.  This is used as a condition variable to
 * ensure only one NAPI poll instance runs.  We also make
 * sure there is no pending NAPI disable.
 *
 * Return: true if NAPIF_STATE_SCHED was clear and has now been set by this
 * call; false if NAPIF_STATE_DISABLE is set, or if NAPIF_STATE_SCHED was
 * already set (in which case NAPIF_STATE_MISSED has been set as well).
 */
bool napi_schedule_prep(struct napi_struct *n)
{
        unsigned long new, val = READ_ONCE(n->state);

        do {
                if (unlikely(val & NAPIF_STATE_DISABLE))
                        return false;
                new = val | NAPIF_STATE_SCHED;

                /* Sets STATE_MISSED bit if STATE_SCHED was already set
                 * This was suggested by Alexander Duyck, as compiler
                 * emits better code than :
                 * if (val & NAPIF_STATE_SCHED)
                 *     new |= NAPIF_STATE_MISSED;
                 */
                new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
                                                   NAPIF_STATE_MISSED;
        /* On failure try_cmpxchg() is handed &val, so the loop re-evaluates
         * with the updated value.
         */
        } while (!try_cmpxchg(&n->state, &val, new));

        return !(val & NAPIF_STATE_SCHED);
}
EXPORT_SYMBOL(napi_schedule_prep);

/**
 * __napi_schedule_irqoff - schedule for receive
 * @n: entry to schedule
 *
 * Variant of __napi_schedule() assuming hard irqs are masked.
 *
 * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
 * because the interrupt disabled assumption might not be true
 * due to force-threaded interrupts and spinlock substitution.
 */
void __napi_schedule_irqoff(struct napi_struct *n)
{
        /* On PREEMPT_RT take the variant that masks interrupts itself. */
        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                __napi_schedule(n);
                return;
        }

        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);

/*
 * napi_complete_done - finish a NAPI poll round
 * @n: NAPI context
 * @work_done: work reported for this round; only tested for non-zero here
 *
 * Flushes GRO state, removes @n from the poll list and clears the scheduling
 * bits in n->state.  If NAPIF_STATE_MISSED was set in the meantime, @n is
 * rescheduled instead.  When a non-zero timeout was computed, n->timer is
 * armed.
 *
 * Return: false if the completion was skipped (NPSVC or IN_BUSY_POLL set),
 * if @n was rescheduled because of MISSED, or if a deferral round was
 * consumed with a non-zero gro_flush_timeout; true otherwise.
 */
bool napi_complete_done(struct napi_struct *n, int work_done)
{
        unsigned long flags, val, new, timeout = 0;
        bool ret = true;

        /*
         * 1) Don't let napi dequeue from the cpu poll list
         *    just in case it's running on a different cpu.
         * 2) If we are busy polling, do nothing here, we have
         *    the guarantee we will be called later.
         */
        if (unlikely(n->state & (NAPIF_STATE_NPSVC |
                                 NAPIF_STATE_IN_BUSY_POLL)))
                return false;

        /* Work was done: reload the deferral counter from the device
         * setting.
         */
        if (work_done) {
                if (n->gro_bitmask)
                        timeout = READ_ONCE(n->dev->gro_flush_timeout);
                n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
        }
        /* Consume one deferral round; with a non-zero timeout the caller is
         * told (ret = false) that completion did not fully happen.
         */
        if (n->defer_hard_irqs_count > 0) {
                n->defer_hard_irqs_count--;
                timeout = READ_ONCE(n->dev->gro_flush_timeout);
                if (timeout)
                        ret = false;
        }
        if (n->gro_bitmask) {
                /* When the NAPI instance uses a timeout and keeps postponing
                 * it, we need to bound somehow the time packets are kept in
                 * the GRO layer
                 */
                napi_gro_flush(n, !!timeout);
        }

        gro_normal_list(n);

        if (unlikely(!list_empty(&n->poll_list))) {
                /* If n->poll_list is not empty, we need to mask irqs */
                local_irq_save(flags);
                list_del_init(&n->poll_list);
                local_irq_restore(flags);
        }
        WRITE_ONCE(n->list_owner, -1);

        val = READ_ONCE(n->state);
        do {
                WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));

                new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
                              NAPIF_STATE_SCHED_THREADED |
                              NAPIF_STATE_PREFER_BUSY_POLL);

                /* If STATE_MISSED was set, leave STATE_SCHED set,
                 * because we will call napi->poll() one more time.
                 * This C code was suggested by Alexander Duyck to help gcc.
                 */
                new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
                                                    NAPIF_STATE_SCHED;
        } while (!try_cmpxchg(&n->state, &val, new));

        /* 'val' is the state that was replaced: a missed schedule request
         * means we must run again.
         */
        if (unlikely(val & NAPIF_STATE_MISSED)) {
                __napi_schedule(n);
                return false;
        }

        if (timeout)
                hrtimer_start(&n->timer, ns_to_ktime(timeout),
                              HRTIMER_MODE_REL_PINNED);
        return ret;
}
EXPORT_SYMBOL(napi_complete_done);

/* must be called under rcu_read_lock(), as we dont take a reference */
struct napi_struct *napi_by_id(unsigned int napi_id)
{
        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
        struct napi_struct *napi;

        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
                if (napi->napi_id == napi_id)
                        return napi;

        return NULL;
}

/*
 * Release every skb queued on @sd's deferred-free list.
 *
 * The chain is detached as a whole under defer_lock (also resetting
 * defer_count) and the skbs are then consumed one by one with the lock
 * dropped.
 */
static void skb_defer_free_flush(struct softnet_data *sd)
{
        struct sk_buff *pending, *cur, *following;

        /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
        if (!READ_ONCE(sd->defer_list))
                return;

        spin_lock(&sd->defer_lock);
        pending = sd->defer_list;
        sd->defer_list = NULL;
        sd->defer_count = 0;
        spin_unlock(&sd->defer_lock);

        for (cur = pending; cur; cur = following) {
                /* Read the link first; cur is gone after the call below. */
                following = cur->next;
                napi_consume_skb(cur, 1);
        }
}

#if defined(CONFIG_NET_RX_BUSY_POLL)

/*
 * Final step of leaving busy polling.
 *
 * With @skip_schedule set, flush GRO state, push the pending normal list
 * and drop NAPI_STATE_SCHED.  Otherwise push the normal list and hand the
 * napi back to the scheduler.
 */
static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
{
        if (skip_schedule) {
                if (napi->gro_bitmask) {
                        /* flush too old packets
                         * If HZ < 1000, flush all packets.
                         */
                        napi_gro_flush(napi, HZ >= 1000);
                }

                gro_normal_list(napi);
                clear_bit(NAPI_STATE_SCHED, &napi->state);
        } else {
                gro_normal_list(napi);
                __napi_schedule(napi);
        }
}

/* Bit flags carried in the 'flags' argument of busy_poll_stop() and
 * __napi_busy_loop().
 * NOTE(review): NAPI_F_END_ON_RESCHED is not consumed in the code visible
 * here; its meaning should be confirmed at its users.
 */
enum {
        NAPI_F_PREFER_BUSY_POLL        = 1,
        NAPI_F_END_ON_RESCHED        = 2,
};

/*
 * Leave busy polling on @napi.
 *
 * @have_poll_lock: token handed to netpoll_poll_unlock() after the poll
 * @flags: NAPI_F_* bits; only NAPI_F_PREFER_BUSY_POLL is tested here
 * @budget: budget passed to the final napi->poll() call
 *
 * Clears NAPI_STATE_MISSED and NAPI_STATE_IN_BUSY_POLL, optionally arms
 * napi->timer, runs napi->poll() one more time and, if the poll consumed the
 * whole budget, finishes via __busy_poll_stop().
 */
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
                           unsigned flags, u16 budget)
{
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        bool skip_schedule = false;
        unsigned long timeout;
        int rc;

        /* Busy polling means there is a high chance device driver hard irq
         * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
         * set in napi_schedule_prep().
         * Since we are about to call napi->poll() once more, we can safely
         * clear NAPI_STATE_MISSED.
         *
         * Note: x86 could use a single "lock and ..." instruction
         * to perform these two clear_bit()
         */
        clear_bit(NAPI_STATE_MISSED, &napi->state);
        clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);

        local_bh_disable();
        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);

        /* With both a deferral count and a flush timeout configured, arm the
         * timer and skip rescheduling the napi afterwards.
         */
        if (flags & NAPI_F_PREFER_BUSY_POLL) {
                napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
                timeout = READ_ONCE(napi->dev->gro_flush_timeout);
                if (napi->defer_hard_irqs_count && timeout) {
                        hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
                        skip_schedule = true;
                }
        }

        /* All we really want here is to re-enable device interrupts.
         * Ideally, a new ndo_busy_poll_stop() could avoid another round.
         */
        rc = napi->poll(napi, budget);
        /* We can't gro_normal_list() here, because napi->poll() might have
         * rearmed the napi (napi_complete_done()) in which case it could
         * already be running on another CPU.
         */
        trace_napi_poll(napi, rc, budget);
        netpoll_poll_unlock(have_poll_lock);
        /* The whole budget was used, so finish in __busy_poll_stop() */
        if (rc == budget)
                __busy_poll_stop(napi, skip_schedule);
        bpf_net_ctx_clear(bpf_net_ctx);
        local_bh_enable();
}

/* Core busy-poll loop for the NAPI instance identified by @napi_id.
 *
 * @napi_id: id looked up through napi_by_id(); nothing is done if it is unknown
 * @loop_end: termination predicate, called with @loop_end_arg and the start
 *            time; when NULL the loop body runs exactly once
 * @loop_end_arg: opaque argument for @loop_end
 * @flags: NAPI_F_PREFER_BUSY_POLL and/or NAPI_F_END_ON_RESCHED
 * @budget: budget passed to the poll callback
 *
 * Must be called with the RCU read lock held (a warning is emitted
 * otherwise). Unless NAPI_F_END_ON_RESCHED is set, the lock is dropped and
 * re-taken around cond_resched(), after which the napi is looked up again.
 */
static void __napi_busy_loop(unsigned int napi_id,
                      bool (*loop_end)(void *, unsigned long),
                      void *loop_end_arg, unsigned flags, u16 budget)
{
        unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
        int (*napi_poll)(struct napi_struct *napi, int budget);
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        void *have_poll_lock = NULL;
        struct napi_struct *napi;

        WARN_ON_ONCE(!rcu_read_lock_held());

restart:
        /* napi_poll doubles as the "we own this napi" marker: it stays NULL
         * until the cmpxchg() below succeeds.
         */
        napi_poll = NULL;

        napi = napi_by_id(napi_id);
        if (!napi)
                return;

        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        for (;;) {
                int work = 0;

                local_bh_disable();
                bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
                if (!napi_poll) {
                        unsigned long val = READ_ONCE(napi->state);

                        /* If multiple threads are competing for this napi,
                         * we avoid dirtying napi->state as much as we can.
                         */
                        if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
                                   NAPIF_STATE_IN_BUSY_POLL)) {
                                if (flags & NAPI_F_PREFER_BUSY_POLL)
                                        set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
                                goto count;
                        }
                        /* Try to take ownership; losing the race is handled
                         * exactly like finding the napi busy above.
                         */
                        if (cmpxchg(&napi->state, val,
                                    val | NAPIF_STATE_IN_BUSY_POLL |
                                          NAPIF_STATE_SCHED) != val) {
                                if (flags & NAPI_F_PREFER_BUSY_POLL)
                                        set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
                                goto count;
                        }
                        have_poll_lock = netpoll_poll_lock(napi);
                        napi_poll = napi->poll;
                }
                work = napi_poll(napi, budget);
                trace_napi_poll(napi, work, budget);
                gro_normal_list(napi);
count:
                /* work is still 0 here when the napi could not be claimed. */
                if (work > 0)
                        __NET_ADD_STATS(dev_net(napi->dev),
                                        LINUX_MIB_BUSYPOLLRXPACKETS, work);
                skb_defer_free_flush(this_cpu_ptr(&softnet_data));
                bpf_net_ctx_clear(bpf_net_ctx);
                local_bh_enable();

                if (!loop_end || loop_end(loop_end_arg, start_time))
                        break;

                /* loop_end is known to be non-NULL from here on. */
                if (unlikely(need_resched())) {
                        if (flags & NAPI_F_END_ON_RESCHED)
                                break;
                        if (napi_poll)
                                busy_poll_stop(napi, have_poll_lock, flags, budget);
                        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                                preempt_enable();
                        rcu_read_unlock();
                        cond_resched();
                        rcu_read_lock();
                        if (loop_end(loop_end_arg, start_time))
                                return;
                        /* RCU protection was dropped, so look the napi up again. */
                        goto restart;
                }
                cpu_relax();
        }
        if (napi_poll)
                busy_poll_stop(napi, have_poll_lock, flags, budget);
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable();
}

/* Busy-poll @napi_id on behalf of a caller that is already inside an RCU
 * read-side critical section (this function does not take the RCU lock
 * itself). NAPI_F_END_ON_RESCHED is always passed down, so the loop exits
 * rather than rescheduling when need_resched() is observed.
 */
void napi_busy_loop_rcu(unsigned int napi_id,
                        bool (*loop_end)(void *, unsigned long),
                        void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{
        unsigned flags = NAPI_F_END_ON_RESCHED |
                         (prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0);

        __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
}

/* Busy-poll @napi_id, taking the RCU read lock around the whole loop.
 * @prefer_busy_poll selects NAPI_F_PREFER_BUSY_POLL; no other flag is set.
 */
void napi_busy_loop(unsigned int napi_id,
                    bool (*loop_end)(void *, unsigned long),
                    void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{
        unsigned flags = 0;

        if (prefer_busy_poll)
                flags |= NAPI_F_PREFER_BUSY_POLL;

        rcu_read_lock();
        __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
        rcu_read_unlock();
}
EXPORT_SYMBOL(napi_busy_loop);

#endif /* CONFIG_NET_RX_BUSY_POLL */

/* Assign a fresh napi_id to @napi and publish it in napi_hash.
 * Instances with NAPI_STATE_NO_BUSY_POLL set are left out of the table.
 */
static void napi_hash_add(struct napi_struct *napi)
{
        if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
                return;

        spin_lock(&napi_hash_lock);

        /* 0..NR_CPUS range is reserved for sender_cpu use, so ids below
         * MIN_NAPI_ID are skipped. Keep advancing the generator until it
         * lands on an id that is not in the table yet.
         */
        for (;;) {
                napi_gen_id++;
                if (unlikely(napi_gen_id < MIN_NAPI_ID))
                        napi_gen_id = MIN_NAPI_ID;
                if (!napi_by_id(napi_gen_id))
                        break;
        }
        napi->napi_id = napi_gen_id;

        hlist_add_head_rcu(&napi->napi_hash_node,
                           &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);

        spin_unlock(&napi_hash_lock);
}

/* Unlink @napi from napi_hash under napi_hash_lock.
 *
 * Warning: the caller is responsible for making sure an RCU grace period
 * has elapsed before the memory containing @napi is freed.
 */
static void napi_hash_del(struct napi_struct *napi)
{
        spin_lock(&napi_hash_lock);
        hlist_del_init_rcu(&napi->napi_hash_node);
        spin_unlock(&napi_hash_lock);
}

/* hrtimer callback for napi->timer: schedule the owning NAPI instance unless
 * a disable is pending or it is already scheduled. Never re-arms the timer.
 */
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
        struct napi_struct *napi = container_of(timer, struct napi_struct,
                                                timer);

        /* Note : we use a relaxed variant of napi_schedule_prep() not setting
         * NAPI_STATE_MISSED, since we do not react to a device IRQ.
         */
        if (napi_disable_pending(napi))
                return HRTIMER_NORESTART;
        if (test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
                return HRTIMER_NORESTART;

        clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
        __napi_schedule_irqoff(napi);

        return HRTIMER_NORESTART;
}

/* Reset @napi's GRO state: every bucket gets an empty list and a zero
 * count, and the bucket bitmask is cleared.
 */
static void init_gro_hash(struct napi_struct *napi)
{
        int bucket = 0;

        napi->gro_bitmask = 0;
        while (bucket < GRO_HASH_BUCKETS) {
                napi->gro_hash[bucket].count = 0;
                INIT_LIST_HEAD(&napi->gro_hash[bucket].list);
                bucket++;
        }
}

/* Switch every NAPI instance of @dev between threaded and softirq polling.
 *
 * When enabling, a kthread is created for each NAPI on dev->napi_list that
 * does not have one yet. If any creation fails, the request is turned into
 * "disable", so dev->threaded is written as false and NAPI_STATE_THREADED
 * is cleared on all instances.
 *
 * Returns 0 on success (or when dev->threaded already equals @threaded),
 * otherwise the error returned by napi_kthread_create().
 */
int dev_set_threaded(struct net_device *dev, bool threaded)
{
        struct napi_struct *napi;
        int err = 0;

        if (dev->threaded == threaded)
                return 0;

        if (threaded) {
                list_for_each_entry(napi, &dev->napi_list, dev_list) {
                        if (!napi->thread) {
                                err = napi_kthread_create(napi);
                                if (err) {
                                        /* Fall back to non-threaded mode for
                                         * the whole device.
                                         */
                                        threaded = false;
                                        break;
                                }
                        }
                }
        }

        WRITE_ONCE(dev->threaded, threaded);

        /* Make sure kthread is created before THREADED bit
         * is set.
         */
        smp_mb__before_atomic();

        /* Setting/unsetting threaded mode on a napi might not immediately
         * take effect, if the current napi instance is actively being
         * polled. In this case, the switch between threaded mode and
         * softirq mode will happen in the next round of napi_schedule().
         * This should not cause hiccups/stalls to the live traffic.
         */
        list_for_each_entry(napi, &dev->napi_list, dev_list)
                assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);

        return err;
}
EXPORT_SYMBOL(dev_set_threaded);

/**
 * netif_queue_set_napi - Associate queue with the napi
 * @dev: device to which NAPI and queue belong
 * @queue_index: Index of queue
 * @type: queue type as RX or TX
 * @napi: NAPI context, pass NULL to clear previously set NAPI
 *
 * Record @napi as the NAPI context of the RX or TX queue @queue_index. This
 * should be done after registering the NAPI handler for the queue-vector and
 * after the queues have been mapped to the corresponding interrupt vector.
 *
 * A non-NULL @napi without a device triggers a one-time warning and is
 * ignored. Any other @type value is silently ignored. RTNL is asserted once
 * the device has reached NETREG_REGISTERED.
 */
void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
                          enum netdev_queue_type type, struct napi_struct *napi)
{
        struct netdev_rx_queue *rxq;
        struct netdev_queue *txq;

        if (WARN_ON_ONCE(napi && !napi->dev))
                return;
        if (dev->reg_state >= NETREG_REGISTERED)
                ASSERT_RTNL();

        if (type == NETDEV_QUEUE_TYPE_RX) {
                rxq = __netif_get_rx_queue(dev, queue_index);
                rxq->napi = napi;
        } else if (type == NETDEV_QUEUE_TYPE_TX) {
                txq = netdev_get_tx_queue(dev, queue_index);
                txq->napi = napi;
        }
}
EXPORT_SYMBOL(netif_queue_set_napi);

/* Initialise @napi and attach it to @dev.
 *
 * @dev: device the NAPI instance belongs to
 * @napi: instance to initialise; if NAPI_STATE_LISTED is already set a
 *        warning is emitted and nothing else is done
 * @poll: poll callback stored in napi->poll
 * @weight: stored in napi->weight; values above NAPI_POLL_WEIGHT are still
 *          accepted but reported once through netdev_err_once()
 *
 * The instance is left with NAPI_STATE_SCHED and NAPI_STATE_NPSVC set, the
 * same two bits that napi_enable() clears.
 */
void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
                           int (*poll)(struct napi_struct *, int), int weight)
{
        if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
                return;

        INIT_LIST_HEAD(&napi->poll_list);
        INIT_HLIST_NODE(&napi->napi_hash_node);
        hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
        napi->timer.function = napi_watchdog;
        init_gro_hash(napi);
        napi->skb = NULL;
        INIT_LIST_HEAD(&napi->rx_list);
        napi->rx_count = 0;
        napi->poll = poll;
        if (weight > NAPI_POLL_WEIGHT)
                netdev_err_once(dev, "%s() called with weight %d\n", __func__,
                                weight);
        napi->weight = weight;
        napi->dev = dev;
#ifdef CONFIG_NETPOLL
        napi->poll_owner = -1;
#endif
        napi->list_owner = -1;
        set_bit(NAPI_STATE_SCHED, &napi->state);
        set_bit(NAPI_STATE_NPSVC, &napi->state);
        /* Publish only after the fields above have been set up. */
        list_add_rcu(&napi->dev_list, &dev->napi_list);
        napi_hash_add(napi);
        napi_get_frags_check(napi);
        /* Create kthread for this napi if dev->threaded is set.
         * Clear dev->threaded if kthread creation failed so that
         * threaded mode will not be enabled in napi_enable().
         */
        if (dev->threaded && napi_kthread_create(napi))
                dev->threaded = false;
        netif_napi_set_irq(napi, -1);
}
EXPORT_SYMBOL(netif_napi_add_weight);

/* Stop @n from being scheduled and wait until it is idle.
 *
 * Sets NAPI_STATE_DISABLE, sleeps in usleep_range() until neither
 * NAPIF_STATE_SCHED nor NAPIF_STATE_NPSVC is set, then atomically takes both
 * bits while dropping NAPIF_STATE_THREADED and NAPIF_STATE_PREFER_BUSY_POLL.
 * Finally cancels n->timer and clears NAPI_STATE_DISABLE again.
 *
 * May sleep.
 */
void napi_disable(struct napi_struct *n)
{
        unsigned long val, new;

        might_sleep();
        set_bit(NAPI_STATE_DISABLE, &n->state);

        val = READ_ONCE(n->state);
        do {
                /* Wait for the current owner (if any) to release the napi. */
                while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
                        usleep_range(20, 200);
                        val = READ_ONCE(n->state);
                }

                new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
                new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
                /* On failure try_cmpxchg() refreshes val, so the wait loop
                 * above re-checks the new state.
                 */
        } while (!try_cmpxchg(&n->state, &val, new));

        hrtimer_cancel(&n->timer);

        clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);

/**
 * napi_enable - enable NAPI scheduling
 * @n: NAPI context
 *
 * Allow @n to be scheduled again by clearing NAPIF_STATE_SCHED and
 * NAPIF_STATE_NPSVC. NAPIF_STATE_THREADED is set when the device is in
 * threaded mode and @n has a kthread.
 * Must be paired with napi_disable. It is a BUG to call this while
 * NAPI_STATE_SCHED is not set.
 */
void napi_enable(struct napi_struct *n)
{
        unsigned long new, val = READ_ONCE(n->state);

        do {
                BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));

                new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
                if (n->dev->threaded && n->thread)
                        new |= NAPIF_STATE_THREADED;
                /* try_cmpxchg() reloads val on failure; recompute and retry. */
        } while (!try_cmpxchg(&n->state, &val, new));
}
EXPORT_SYMBOL(napi_enable);

/* Free every skb still held in @napi's GRO hash buckets and zero each
 * bucket's count. napi->gro_bitmask is not touched here.
 */
static void flush_gro_hash(struct napi_struct *napi)
{
        struct sk_buff *skb, *tmp;
        int bucket;

        for (bucket = 0; bucket < GRO_HASH_BUCKETS; bucket++) {
                list_for_each_entry_safe(skb, tmp,
                                         &napi->gro_hash[bucket].list, list)
                        kfree_skb(skb);
                napi->gro_hash[bucket].count = 0;
        }
}

/* Must be called in process context.
 *
 * Detach @napi: remove it from napi_hash and from its device list, release
 * its frags and pending GRO packets, and stop its kthread if it has one.
 * Does nothing when NAPI_STATE_LISTED was not set.
 */
void __netif_napi_del(struct napi_struct *napi)
{
        if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
                return;

        napi_hash_del(napi);
        list_del_rcu(&napi->dev_list);
        napi_free_frags(napi);

        flush_gro_hash(napi);
        napi->gro_bitmask = 0;

        if (!napi->thread)
                return;

        kthread_stop(napi->thread);
        napi->thread = NULL;
}
EXPORT_SYMBOL(__netif_napi_del);

/* Run one poll round on @n.
 *
 * Calls n->poll() with n->weight as budget if the instance is scheduled and
 * returns the amount of work it reported (0 if it was not called).
 *
 * *@repoll is set to true only when the full weight was consumed and none of
 * the early-exit conditions below applied; it is never written otherwise,
 * so the caller must initialise it.
 */
static int __napi_poll(struct napi_struct *n, bool *repoll)
{
        int work, weight;

        weight = n->weight;

        /* This NAPI_STATE_SCHED test is for avoiding a race
         * with netpoll's poll_napi().  Only the entity which
         * obtains the lock and sees NAPI_STATE_SCHED set will
         * actually make the ->poll() call.  Therefore we avoid
         * accidentally calling ->poll() when NAPI is not scheduled.
         */
        work = 0;
        if (napi_is_scheduled(n)) {
                work = n->poll(n, weight);
                trace_napi_poll(n, work, weight);

                xdp_do_check_flushed(n);
        }

        /* A poll callback must never report more work than its budget. */
        if (unlikely(work > weight))
                netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
                                n->poll, work, weight);

        /* Budget not exhausted: nothing more to do for this round. */
        if (likely(work < weight))
                return work;

        /* Drivers must not modify the NAPI state if they
         * consume the entire weight.  In such cases this code
         * still "owns" the NAPI instance and therefore can
         * move the instance around on the list at-will.
         */
        if (unlikely(napi_disable_pending(n))) {
                napi_complete(n);
                return work;
        }

        /* The NAPI context has more processing work, but busy-polling
         * is preferred. Exit early.
         */
        if (napi_prefer_busy_poll(n)) {
                if (napi_complete_done(n, work)) {
                        /* If timeout is not set, we need to make sure
                         * that the NAPI is re-scheduled.
                         */
                        napi_schedule(n);
                }
                return work;
        }

        if (n->gro_bitmask) {
                /* flush too old packets
                 * If HZ < 1000, flush all packets.
                 */
                napi_gro_flush(n, HZ >= 1000);
        }

        gro_normal_list(n);

        /* Some drivers may have called napi_schedule
         * prior to exhausting their budget.
         */
        if (unlikely(!list_empty(&n->poll_list))) {
                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
                             n->dev ? n->dev->name : "backlog");
                return work;
        }

        *repoll = true;

        return work;
}

/* Poll @n once on behalf of net_rx_action().
 *
 * Unlinks @n from the poll list it is on, runs __napi_poll() under the
 * netpoll poll lock and, when another round is requested, queues @n at the
 * tail of @repoll. Returns the work count from __napi_poll().
 */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
        bool again = false;
        void *poll_lock;
        int done;

        list_del_init(&n->poll_list);

        poll_lock = netpoll_poll_lock(n);
        done = __napi_poll(n, &again);
        if (again)
                list_add_tail(&n->poll_list, repoll);
        netpoll_poll_unlock(poll_lock);

        return done;
}

/* Sleep until the NAPI kthread has work or is asked to stop.
 *
 * Returns 0 once NAPI_STATE_SCHED_THREADED is observed on @napi, or -1 when
 * kthread_should_stop() becomes true. The task state is TASK_RUNNING on
 * return in both cases.
 */
static int napi_thread_wait(struct napi_struct *napi)
{
        /* Set the sleeping state before testing the condition so the checks
         * below always run with TASK_INTERRUPTIBLE already in place.
         */
        set_current_state(TASK_INTERRUPTIBLE);

        while (!kthread_should_stop()) {
                /* Testing SCHED_THREADED bit here to make sure the current
                 * kthread owns this napi and could poll on this napi.
                 * Testing SCHED bit is not enough because SCHED bit might be
                 * set by some other busy poll thread or by napi_disable().
                 */
                if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
                        WARN_ON(!list_empty(&napi->poll_list));
                        __set_current_state(TASK_RUNNING);
                        return 0;
                }

                schedule();
                set_current_state(TASK_INTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);

        return -1;
}

/* Poll @napi from its kthread until __napi_poll() stops asking for a repoll.
 *
 * Each round runs with BHs disabled and with sd->in_napi_threaded_poll set
 * on the current CPU's softnet_data. Between rounds the thread reports an
 * RCU quiescent state and offers to reschedule.
 */
static void napi_threaded_poll_loop(struct napi_struct *napi)
{
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        struct softnet_data *sd;
        unsigned long last_qs = jiffies;

        for (;;) {
                bool repoll = false;
                void *have;

                local_bh_disable();
                bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);

                /* Re-read every round: this is taken after cond_resched(). */
                sd = this_cpu_ptr(&softnet_data);
                sd->in_napi_threaded_poll = true;

                have = netpoll_poll_lock(napi);
                __napi_poll(napi, &repoll);
                netpoll_poll_unlock(have);

                sd->in_napi_threaded_poll = false;
                barrier();

                if (sd_has_rps_ipi_waiting(sd)) {
                        local_irq_disable();
                        /* Re-enables interrupts, as the name says. */
                        net_rps_action_and_irq_enable(sd);
                }
                skb_defer_free_flush(sd);
                bpf_net_ctx_clear(bpf_net_ctx);
                local_bh_enable();

                if (!repoll)
                        break;

                rcu_softirq_qs_periodic(last_qs);
                cond_resched();
        }
}

/* Thread function taking a napi_struct as @data: alternate between waiting
 * for work and polling until napi_thread_wait() reports a non-zero result.
 * Always returns 0.
 */
static int napi_threaded_poll(void *data)
{
        struct napi_struct *napi = data;

        for (;;) {
                if (napi_thread_wait(napi))
                        break;
                napi_threaded_poll_loop(napi);
        }

        return 0;
}

/* NET_RX softirq handler: poll the NAPI instances queued on this CPU.
 *
 * The per-CPU poll_list is spliced onto a private list and its entries are
 * polled one by one until the list is empty, the packet budget
 * (netdev_budget) is used up, or the time limit derived from
 * netdev_budget_usecs is reached. Anything left over is put back on
 * sd->poll_list and the softirq is raised again.
 */
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
        unsigned long time_limit = jiffies +
                usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        int budget = READ_ONCE(net_hotdata.netdev_budget);
        LIST_HEAD(list);
        LIST_HEAD(repoll);

        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
start:
        sd->in_net_rx_action = true;
        local_irq_disable();
        list_splice_init(&sd->poll_list, &list);
        local_irq_enable();

        for (;;) {
                struct napi_struct *n;

                skb_defer_free_flush(sd);

                if (list_empty(&list)) {
                        if (list_empty(&repoll)) {
                                sd->in_net_rx_action = false;
                                barrier();
                                /* We need to check if ____napi_schedule()
                                 * had refilled poll_list while
                                 * sd->in_net_rx_action was true.
                                 */
                                if (!list_empty(&sd->poll_list))
                                        goto start;
                                /* Nothing left to poll and no RPS IPIs to
                                 * send: skip the requeue section entirely.
                                 */
                                if (!sd_has_rps_ipi_waiting(sd))
                                        goto end;
                        }
                        break;
                }

                n = list_first_entry(&list, struct napi_struct, poll_list);
                budget -= napi_poll(n, &repoll);

                /* If softirq window is exhausted then punt.
                 * Allow this to run for 2 jiffies, which will allow
                 * an average latency of 1.5/HZ.
                 * NOTE(review): the limit actually applied is time_limit,
                 * derived from netdev_budget_usecs above - confirm that the
                 * "2 jiffies" figure still matches its default.
                 */
                if (unlikely(budget <= 0 ||
                             time_after_eq(jiffies, time_limit))) {
                        sd->time_squeeze++;
                        break;
                }
        }

        local_irq_disable();

        /* Rebuild sd->poll_list as: unpolled entries, newly scheduled
         * entries, then the entries that asked for a repoll.
         */
        list_splice_tail_init(&sd->poll_list, &list);
        list_splice_tail(&repoll, &list);
        list_splice(&list, &sd->poll_list);
        if (!list_empty(&sd->poll_list))
                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        else
                sd->in_net_rx_action = false;

        net_rps_action_and_irq_enable(sd);
end:
        bpf_net_ctx_clear(bpf_net_ctx);
}

/* One link in a device's adjacency list (dev->adj_list.upper or
 * dev->adj_list.lower), describing a neighbouring upper or lower device.
 */
struct netdev_adjacent {
        /* the neighbouring device this entry refers to */
        struct net_device *dev;
        netdevice_tracker dev_tracker;

        /* upper master flag, there can only be one master device per list */
        bool master;

        /* lookup ignore flag: entries with this set are skipped by the
         * __netdev_*() walkers that take an ignore argument
         */
        bool ignore;

        /* counter for the number of times this device was added to us */
        u16 ref_nr;

        /* private field for the users */
        void *private;

        /* linkage into the owning device's adjacency list */
        struct list_head list;
        struct rcu_head rcu;
};

static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
                                                 struct list_head *adj_list)
{
        struct netdev_adjacent *adj;

        list_for_each_entry(adj, adj_list, list) {
                if (adj->dev == adj_dev)
                        return adj;
        }
        return NULL;
}

/* Walk callback: report a match (1) when the visited @upper_dev is the
 * device stored in priv->data, 0 otherwise.
 */
static int ____netdev_has_upper_dev(struct net_device *upper_dev,
                                    struct netdev_nested_priv *priv)
{
        struct net_device *wanted = (struct net_device *)priv->data;

        if (upper_dev == wanted)
                return 1;

        return 0;
}

/**
 * netdev_has_upper_dev - Check if device is linked to an upper device
 * @dev: device
 * @upper_dev: upper device to check
 *
 * Return true if @upper_dev is found by netdev_walk_all_upper_dev_rcu()
 * starting from @dev. That walk descends through nested upper devices, so
 * the match is not limited to the immediate upper neighbours of @dev.
 * The caller must hold the RTNL lock.
 */
bool netdev_has_upper_dev(struct net_device *dev,
                          struct net_device *upper_dev)
{
        struct netdev_nested_priv priv = {
                .data = (void *)upper_dev,
        };
        int found;

        ASSERT_RTNL();

        found = netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
                                              &priv);
        return found;
}
EXPORT_SYMBOL(netdev_has_upper_dev);

/**
 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
 * @dev: device
 * @upper_dev: upper device to check
 *
 * Return true if @upper_dev appears anywhere in the upper device chain of
 * @dev. The caller must hold the RCU read lock.
 */
bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
                                  struct net_device *upper_dev)
{
        struct netdev_nested_priv priv = {
                .data = (void *)upper_dev,
        };
        int found;

        found = netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
                                              &priv);
        return found != 0;
}
EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);

/**
 * netdev_has_any_upper_dev - Check if device is linked to some device
 * @dev: device
 *
 * Find out if a device is linked to an upper device and return true in case
 * it is. The caller must hold the RTNL lock.
 */
bool netdev_has_any_upper_dev(struct net_device *dev)
{
        ASSERT_RTNL();

        return !list_empty(&dev->adj_list.upper);
}
EXPORT_SYMBOL(netdev_has_any_upper_dev);

/**
 * netdev_master_upper_dev_get - Get master upper device
 * @dev: device
 *
 * Return the master upper device of @dev, or NULL if there is none. Only
 * the first entry of the upper adjacency list is examined.
 * The caller must hold the RTNL lock.
 */
struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
{
        struct netdev_adjacent *first;

        ASSERT_RTNL();

        if (list_empty(&dev->adj_list.upper))
                return NULL;

        first = list_first_entry(&dev->adj_list.upper,
                                 struct netdev_adjacent, list);

        return likely(first->master) ? first->dev : NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);

/* Like netdev_master_upper_dev_get(), but additionally returns NULL when
 * the first upper entry has its ignore flag set. Caller must hold RTNL.
 */
static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
{
        struct netdev_adjacent *first;

        ASSERT_RTNL();

        if (list_empty(&dev->adj_list.upper))
                return NULL;

        first = list_first_entry(&dev->adj_list.upper,
                                 struct netdev_adjacent, list);

        return (likely(first->master) && !first->ignore) ? first->dev : NULL;
}

/**
 * netdev_has_any_lower_dev - Check if device is linked to some device
 * @dev: device
 *
 * Find out if a device is linked to a lower device and return true in case
 * it is. The caller must hold the RTNL lock.
 */
static bool netdev_has_any_lower_dev(struct net_device *dev)
{
        ASSERT_RTNL();

        return !list_empty(&dev->adj_list.lower);
}

/* Return the ->private pointer of the netdev_adjacent entry in which
 * @adj_list is embedded as the list member.
 */
void *netdev_adjacent_get_private(struct list_head *adj_list)
{
        return list_entry(adj_list, struct netdev_adjacent, list)->private;
}
EXPORT_SYMBOL(netdev_adjacent_get_private);

/**
 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Return the device of the entry following *@iter in the upper list of
 * @dev and advance *@iter to that entry. Return NULL, leaving *@iter
 * unchanged, once the list head is reached.
 * The caller must hold RCU read lock.
 */
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
                                                 struct list_head **iter)
{
        struct netdev_adjacent *adj;

        WARN_ON_ONCE(!(rcu_read_lock_held() || lockdep_rtnl_is_held()));

        adj = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
        if (&adj->list != &dev->adj_list.upper) {
                *iter = &adj->list;
                return adj->dev;
        }

        /* Wrapped around to the list head: nothing left. */
        return NULL;
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);

/* Non-RCU step through the upper list of @dev.
 *
 * Returns the device of the entry after *@iter, advancing *@iter and storing
 * that entry's ignore flag in *@ignore. Returns NULL at the list head, in
 * which case neither *@iter nor *@ignore is written.
 */
static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
                                                  struct list_head **iter,
                                                  bool *ignore)
{
        struct netdev_adjacent *adj;

        adj = list_entry((*iter)->next, struct netdev_adjacent, list);
        if (&adj->list == &dev->adj_list.upper)
                return NULL;

        *ignore = adj->ignore;
        *iter = &adj->list;

        return adj->dev;
}

/* RCU step through the upper list of @dev: return the device of the entry
 * after *@iter and advance *@iter, or return NULL at the list head.
 * Warns once if neither the RCU read lock nor RTNL is held.
 */
static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
                                                    struct list_head **iter)
{
        struct netdev_adjacent *adj;

        WARN_ON_ONCE(!(rcu_read_lock_held() || lockdep_rtnl_is_held()));

        adj = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
        if (&adj->list != &dev->adj_list.upper) {
                *iter = &adj->list;
                return adj->dev;
        }

        return NULL;
}

/* Iterative depth-first walk over the upper devices of @dev, skipping
 * adjacency entries whose ignore flag is set.
 *
 * @dev: starting device; @fn is never called for @dev itself
 * @fn: callback invoked with each visited device and @priv
 * @priv: opaque context forwarded to @fn
 *
 * Returns the first non-zero value returned by @fn, or 0 once the walk is
 * complete.
 *
 * Note: when the walk pops back to a parent, the loop top runs again with
 * now == parent, so @fn can be invoked more than once for the same device.
 * NOTE(review): cur is not bounds-checked against MAX_NEST_DEV here;
 * presumably the nesting depth is limited where links are created - confirm.
 */
static int __netdev_walk_all_upper_dev(struct net_device *dev,
                                       int (*fn)(struct net_device *dev,
                                         struct netdev_nested_priv *priv),
                                       struct netdev_nested_priv *priv)
{
        struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
        int ret, cur = 0;
        bool ignore;

        now = dev;
        iter = &dev->adj_list.upper;

        while (1) {
                if (now != dev) {
                        ret = fn(now, priv);
                        if (ret)
                                return ret;
                }

                next = NULL;
                /* Find the next non-ignored upper of 'now'; if there is one,
                 * remember where we were and descend into it.
                 */
                while (1) {
                        udev = __netdev_next_upper_dev(now, &iter, &ignore);
                        if (!udev)
                                break;
                        if (ignore)
                                continue;

                        next = udev;
                        niter = &udev->adj_list.upper;
                        dev_stack[cur] = now;
                        iter_stack[cur++] = iter;
                        break;
                }

                /* No more uppers at this level: pop back to the parent, or
                 * finish when the stack is empty.
                 */
                if (!next) {
                        if (!cur)
                                return 0;
                        next = dev_stack[--cur];
                        niter = iter_stack[cur];
                }

                now = next;
                iter = niter;
        }

        return 0;
}

/* Iterative depth-first walk over all upper devices of @dev, stepping
 * through each list with netdev_next_upper_dev_rcu().
 *
 * @dev: starting device; @fn is never called for @dev itself
 * @fn: callback invoked with each visited device and @priv
 * @priv: opaque context forwarded to @fn
 *
 * Returns the first non-zero value returned by @fn, or 0 once the walk is
 * complete. Unlike __netdev_walk_all_upper_dev(), entries are not filtered
 * on their ignore flag.
 *
 * Note: when the walk pops back to a parent, the loop top runs again with
 * now == parent, so @fn can be invoked more than once for the same device.
 * NOTE(review): cur is not bounds-checked against MAX_NEST_DEV here;
 * presumably the nesting depth is limited where links are created - confirm.
 */
int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv)
{
        struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
        int ret, cur = 0;

        now = dev;
        iter = &dev->adj_list.upper;

        while (1) {
                if (now != dev) {
                        ret = fn(now, priv);
                        if (ret)
                                return ret;
                }

                next = NULL;
                /* Take the next upper of 'now', if any, saving the current
                 * position so the walk can resume here later.
                 */
                while (1) {
                        udev = netdev_next_upper_dev_rcu(now, &iter);
                        if (!udev)
                                break;

                        next = udev;
                        niter = &udev->adj_list.upper;
                        dev_stack[cur] = now;
                        iter_stack[cur++] = iter;
                        break;
                }

                /* Level exhausted: resume the parent or stop at the root. */
                if (!next) {
                        if (!cur)
                                return 0;
                        next = dev_stack[--cur];
                        niter = iter_stack[cur];
                }

                now = next;
                iter = niter;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);

/* Return true if @upper_dev is reached from @dev by
 * __netdev_walk_all_upper_dev(), i.e. through upper links that are not
 * flagged as ignored. The caller must hold the RTNL lock.
 */
static bool __netdev_has_upper_dev(struct net_device *dev,
                                   struct net_device *upper_dev)
{
        struct netdev_nested_priv priv = {
                .flags = 0,
                .data = (void *)upper_dev,
        };
        int found;

        ASSERT_RTNL();

        found = __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
                                            &priv);
        return found;
}

/**
 * netdev_lower_get_next_private - Get the next ->private from the
 *                                   lower neighbour list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Return the ->private of the netdev_adjacent entry *@iter points at in the
 * lower neighbour list of @dev, then advance *@iter to the following node.
 * Return NULL, leaving *@iter unchanged, when *@iter is the list head.
 * The caller must either hold the RTNL lock or use its own locking that
 * guarantees that the neighbour lower list will remain unchanged.
 */
void *netdev_lower_get_next_private(struct net_device *dev,
                                    struct list_head **iter)
{
        struct netdev_adjacent *adj;

        adj = list_entry(*iter, struct netdev_adjacent, list);
        if (&adj->list != &dev->adj_list.lower) {
                *iter = adj->list.next;
                return adj->private;
        }

        return NULL;
}
EXPORT_SYMBOL(netdev_lower_get_next_private);

/**
 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 *                                       lower neighbour list, RCU
 *                                       variant
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Return the ->private of the entry following *@iter in the lower neighbour
 * list of @dev and advance *@iter to that entry. Return NULL, leaving
 * *@iter unchanged, once the list head is reached.
 * The caller must hold RCU read lock.
 */
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
                                        struct list_head **iter)
{
        struct netdev_adjacent *adj;

        WARN_ON_ONCE(!(rcu_read_lock_held() || rcu_read_lock_bh_held()));

        adj = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
        if (&adj->list != &dev->adj_list.lower) {
                *iter = &adj->list;
                return adj->private;
        }

        return NULL;
}
EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);

/**
 * netdev_lower_get_next - Get the next device from the lower neighbour
 *                         list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Return the device of the netdev_adjacent entry *@iter points at in the
 * lower neighbour list of @dev, then advance *@iter to the following node.
 * Return NULL, leaving *@iter unchanged, when *@iter is the list head.
 * The caller must hold RTNL lock or its own locking that guarantees that
 * the neighbour lower list will remain unchanged.
 */
void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{
        struct netdev_adjacent *adj;

        adj = list_entry(*iter, struct netdev_adjacent, list);
        if (&adj->list != &dev->adj_list.lower) {
                *iter = adj->list.next;
                return adj->dev;
        }

        return NULL;
}
EXPORT_SYMBOL(netdev_lower_get_next);

/*
 * Step *iter to the entry after it in dev's lower neighbour list and return
 * that entry's device. Returns NULL (leaving *iter alone) when the next
 * entry is the list head, i.e. there are no more lower devices.
 */
static struct net_device *netdev_next_lower_dev(struct net_device *dev,
                                                struct list_head **iter)
{
        struct list_head *candidate = (*iter)->next;
        struct netdev_adjacent *adj;

        if (candidate == &dev->adj_list.lower)
                return NULL;

        adj = list_entry(candidate, struct netdev_adjacent, list);
        *iter = candidate;

        return adj->dev;
}

/*
 * Like netdev_next_lower_dev(), but additionally reports the adjacency's
 * ->ignore flag through *ignore. *ignore is only written when a device is
 * returned; on NULL it is left as it was.
 */
static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
                                                  struct list_head **iter,
                                                  bool *ignore)
{
        struct list_head *candidate = (*iter)->next;
        struct netdev_adjacent *adj;

        if (candidate == &dev->adj_list.lower)
                return NULL;

        adj = list_entry(candidate, struct netdev_adjacent, list);
        *iter = candidate;
        *ignore = adj->ignore;

        return adj->dev;
}

/*
 * Iterative depth-first walk over every device below @dev.
 *
 * @fn is invoked for each device reached (never for @dev itself); a
 * non-zero return from @fn stops the walk and is returned to the caller.
 * Returns 0 once the whole hierarchy has been traversed.
 *
 * The walk keeps an explicit stack of (device, list cursor) pairs instead
 * of recursing. Note that when the walk climbs back from a lower device to
 * an intermediate one, @fn is invoked on that intermediate device again
 * before its remaining lower devices are examined.
 */
int netdev_walk_all_lower_dev(struct net_device *dev,
                              int (*fn)(struct net_device *dev,
                                        struct netdev_nested_priv *priv),
                              struct netdev_nested_priv *priv)
{
        struct net_device *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *iter_stack[MAX_NEST_DEV + 1];
        struct list_head *pos = &dev->adj_list.lower;
        struct net_device *here = dev;
        struct net_device *child;
        int depth = 0;
        int err;

        for (;;) {
                if (here != dev) {
                        err = fn(here, priv);
                        if (err)
                                return err;
                }

                child = netdev_next_lower_dev(here, &pos);
                if (child) {
                        /* Remember where we were, then descend. */
                        dev_stack[depth] = here;
                        iter_stack[depth] = pos;
                        depth++;

                        here = child;
                        pos = &child->adj_list.lower;
                        continue;
                }

                /* Nothing further below; climb back up or finish. */
                if (!depth)
                        return 0;

                depth--;
                here = dev_stack[depth];
                pos = iter_stack[depth];
        }
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);

/*
 * Same traversal as netdev_walk_all_lower_dev(), except that adjacency
 * entries whose ->ignore flag is set are skipped: neither is @fn called on
 * the device behind such an entry nor is the walk continued below it via
 * that entry.
 *
 * Returns the first non-zero value produced by @fn, or 0 when the walk
 * completes.
 */
static int __netdev_walk_all_lower_dev(struct net_device *dev,
                                       int (*fn)(struct net_device *dev,
                                         struct netdev_nested_priv *priv),
                                       struct netdev_nested_priv *priv)
{
        struct net_device *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *iter_stack[MAX_NEST_DEV + 1];
        struct list_head *pos = &dev->adj_list.lower;
        struct net_device *here = dev;
        struct net_device *child;
        int depth = 0;
        bool skip;
        int err;

        for (;;) {
                if (here != dev) {
                        err = fn(here, priv);
                        if (err)
                                return err;
                }

                /* Find the next lower device that is not being ignored. */
                do {
                        child = __netdev_next_lower_dev(here, &pos, &skip);
                } while (child && skip);

                if (child) {
                        /* Remember where we were, then descend. */
                        dev_stack[depth] = here;
                        iter_stack[depth] = pos;
                        depth++;

                        here = child;
                        pos = &child->adj_list.lower;
                        continue;
                }

                /* Nothing further below; climb back up or finish. */
                if (!depth)
                        return 0;

                depth--;
                here = dev_stack[depth];
                pos = iter_stack[depth];
        }
}

/*
 * RCU flavour of the lower-list stepper: inspects the entry after *iter in
 * dev's lower neighbour list using list_entry_rcu(). Returns that entry's
 * device and moves *iter onto it, or returns NULL (leaving *iter alone)
 * when the list head has been reached.
 */
struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
                                             struct list_head **iter)
{
        struct netdev_adjacent *adj;

        adj = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
        if (&adj->list != &dev->adj_list.lower) {
                *iter = &adj->list;
                return adj->dev;
        }

        return NULL;
}
EXPORT_SYMBOL(netdev_next_lower_dev_rcu);

static u8 __netdev_upper_depth(struct net_device *dev)
{
        struct net_device *udev;
        struct list_head *iter;
        u8 max_depth = 0;
        bool ignore;

        for (iter = &dev->adj_list.upper,
             udev = __netdev_next_upper_dev(dev, &iter, &ignore);
             udev;
             udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
                if (ignore)
                        continue;
                if (max_depth < udev->upper_level)
                        max_depth = udev->upper_level;
        }

        return max_depth;
}

static u8 __netdev_lower_depth(struct net_device *dev)
{
        struct net_device *ldev;
        struct list_head *iter;
        u8 max_depth = 0;
        bool ignore;

        for (iter = &dev->adj_list.lower,
             ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
             ldev;
             ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
                if (ignore)
                        continue;
                if (max_depth < ldev->lower_level)
                        max_depth = ldev->lower_level;
        }

        return max_depth;
}

static int __netdev_update_upper_level(struct net_device *dev,
                                       struct netdev_nested_priv *__unused)
{
        dev->upper_level = __netdev_upper_depth(dev) + 1;
        return 0;
}

#ifdef CONFIG_LOCKDEP
/* Devices queued by net_unlink_todo(). */
static LIST_HEAD(net_unlink_list);

/*
 * Queue dev on net_unlink_list unless its unlink_list node is already
 * linked somewhere (list_empty() on the node is false in that case).
 */
static void net_unlink_todo(struct net_device *dev)
{
        if (!list_empty(&dev->unlink_list))
                return;

        list_add_tail(&dev->unlink_list, &net_unlink_list);
}
#endif

/*
 * Walker callback: recompute dev->lower_level as one more than the deepest
 * lower level among its direct lower devices.
 *
 * With CONFIG_LOCKDEP and a non-NULL priv, priv->flags additionally selects:
 *   NESTED_SYNC_IMM  - set dev->nested_level to lower_level - 1 right away
 *   NESTED_SYNC_TODO - queue dev via net_unlink_todo()
 *
 * Always returns 0 so that a walk is never aborted.
 */
static int __netdev_update_lower_level(struct net_device *dev,
                                       struct netdev_nested_priv *priv)
{
        dev->lower_level = __netdev_lower_depth(dev) + 1;

#ifdef CONFIG_LOCKDEP
        if (priv) {
                if (priv->flags & NESTED_SYNC_IMM)
                        dev->nested_level = dev->lower_level - 1;
                if (priv->flags & NESTED_SYNC_TODO)
                        net_unlink_todo(dev);
        }
#endif
        return 0;
}

/*
 * Iterative depth-first walk over every device below @dev, stepping through
 * the lower lists with netdev_next_lower_dev_rcu().
 *
 * @fn is invoked for each device reached (never for @dev itself); a
 * non-zero return from @fn stops the walk and is returned to the caller.
 * Returns 0 once the whole hierarchy has been traversed.
 *
 * As the walk climbs back from a lower device to an intermediate one, @fn
 * is invoked on that intermediate device again before its remaining lower
 * devices are examined.
 */
int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv)
{
        struct net_device *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *iter_stack[MAX_NEST_DEV + 1];
        struct list_head *pos = &dev->adj_list.lower;
        struct net_device *here = dev;
        struct net_device *child;
        int depth = 0;
        int err;

        for (;;) {
                if (here != dev) {
                        err = fn(here, priv);
                        if (err)
                                return err;
                }

                child = netdev_next_lower_dev_rcu(here, &pos);
                if (child) {
                        /* Remember where we were, then descend. */
                        dev_stack[depth] = here;
                        iter_stack[depth] = pos;
                        depth++;

                        here = child;
                        pos = &child->adj_list.lower;
                        continue;
                }

                /* Nothing further below; climb back up or finish. */
                if (!depth)
                        return 0;

                depth--;
                here = dev_stack[depth];
                pos = iter_stack[depth];
        }
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);

/**
 * netdev_lower_get_first_private_rcu - Fetch the ->private of the first
 *                                       lower neighbour, RCU variant
 * @dev: device
 *
 * Returns the ->private pointer of the first entry in @dev's lower
 * neighbour list, or NULL when the list is empty. The caller must hold
 * the RCU read lock.
 */
void *netdev_lower_get_first_private_rcu(struct net_device *dev)
{
        struct netdev_adjacent *first;

        first = list_first_or_null_rcu(&dev->adj_list.lower,
                                       struct netdev_adjacent, list);

        return first ? first->private : NULL;
}
EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);

/**
 * netdev_master_upper_dev_get_rcu - Get master upper device
 * @dev: device
 *
 * Inspects only the first entry of @dev's upper neighbour list: if it
 * exists and is flagged as master, its device is returned, otherwise NULL.
 * The caller must hold the RCU read lock.
 */
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{
        struct netdev_adjacent *first;

        first = list_first_or_null_rcu(&dev->adj_list.upper,
                                       struct netdev_adjacent, list);
        if (!first)
                return NULL;

        return likely(first->master) ? first->dev : NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);

/*
 * Create a sysfs symlink from dev's kobject to adj_dev's kobject. The link
 * is named "upper_<adj_dev name>" when dev_list is dev's upper adjacency
 * list and "lower_<adj_dev name>" otherwise.
 *
 * Returns whatever sysfs_create_link() returns.
 */
static int netdev_adjacent_sysfs_add(struct net_device *dev,
                              struct net_device *adj_dev,
                              struct list_head *dev_list)
{
        char linkname[IFNAMSIZ+7];

        /* Bounded formatting: never write beyond the end of linkname. */
        snprintf(linkname, sizeof(linkname),
                 dev_list == &dev->adj_list.upper ?
                 "upper_%s" : "lower_%s", adj_dev->name);
        return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
                                 linkname);
}
/*
 * Remove the "upper_<name>" / "lower_<name>" sysfs symlink from dev's
 * kobject. The prefix is "upper_" when dev_list is dev's upper adjacency
 * list and "lower_" otherwise.
 */
static void netdev_adjacent_sysfs_del(struct net_device *dev,
                               char *name,
                               struct list_head *dev_list)
{
        char linkname[IFNAMSIZ+7];

        /* Bounded formatting: never write beyond the end of linkname. */
        snprintf(linkname, sizeof(linkname),
                 dev_list == &dev->adj_list.upper ?
                 "upper_%s" : "lower_%s", name);
        sysfs_remove_link(&(dev->dev.kobj), linkname);
}

static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
                                                 struct net_device *adj_dev,
                                                 struct list_head *dev_list)
{
        return (dev_list == &dev->adj_list.upper ||
                dev_list == &dev->adj_list.lower) &&
                net_eq(dev_net(dev), dev_net(adj_dev));
}

/*
 * Record adj_dev as an adjacency of dev on dev_list.
 *
 * If adj_dev is already on dev_list, only its reference count (ref_nr) is
 * bumped. Otherwise a new netdev_adjacent entry is allocated, a tracked
 * reference on adj_dev is taken, the sysfs links are created and the entry
 * is added to the list.
 *
 * Returns 0 on success, -ENOMEM if the entry cannot be allocated, or the
 * error returned by sysfs link creation. On failure nothing is left behind:
 * links created so far are removed, the device reference is dropped and the
 * entry is freed.
 */
static int __netdev_adjacent_dev_insert(struct net_device *dev,
                                        struct net_device *adj_dev,
                                        struct list_head *dev_list,
                                        void *private, bool master)
{
        struct netdev_adjacent *adj;
        int ret;

        adj = __netdev_find_adj(adj_dev, dev_list);

        /* Already adjacent: just account for one more user of the link. */
        if (adj) {
                adj->ref_nr += 1;
                pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
                         dev->name, adj_dev->name, adj->ref_nr);

                return 0;
        }

        adj = kmalloc(sizeof(*adj), GFP_KERNEL);
        if (!adj)
                return -ENOMEM;

        adj->dev = adj_dev;
        adj->master = master;
        adj->ref_nr = 1;
        adj->private = private;
        adj->ignore = false;
        /* The entry holds a reference on adj_dev for as long as it exists. */
        netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);

        pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
                 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);

        /* upper_<name>/lower_<name> link, see netdev_adjacent_is_neigh_list() */
        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
                ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
                if (ret)
                        goto free_adj;
        }

        /* Ensure that master link is always the first item in list. */
        if (master) {
                ret = sysfs_create_link(&(dev->dev.kobj),
                                        &(adj_dev->dev.kobj), "master");
                if (ret)
                        goto remove_symlinks;

                list_add_rcu(&adj->list, dev_list);
        } else {
                list_add_tail_rcu(&adj->list, dev_list);
        }

        return 0;

        /* Error unwinding, in reverse order of setup. */
remove_symlinks:
        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
free_adj:
        netdev_put(adj_dev, &adj->dev_tracker);
        kfree(adj);

        return ret;
}

/*
 * Drop ref_nr references from the adjacency of adj_dev on dev_list.
 *
 * If the entry still has more than ref_nr references, only the count is
 * reduced. Otherwise the entry is torn down: sysfs links are removed, the
 * entry is unlinked from the list, the reference on adj_dev is released and
 * the entry is freed through kfree_rcu().
 *
 * A missing entry is reported with pr_err() and WARN_ON(), and nothing else
 * is done.
 */
static void __netdev_adjacent_dev_remove(struct net_device *dev,
                                         struct net_device *adj_dev,
                                         u16 ref_nr,
                                         struct list_head *dev_list)
{
        struct netdev_adjacent *adj;

        pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
                 dev->name, adj_dev->name, ref_nr);

        adj = __netdev_find_adj(adj_dev, dev_list);

        if (!adj) {
                pr_err("Adjacency does not exist for device %s from %s\n",
                       dev->name, adj_dev->name);
                WARN_ON(1);
                return;
        }

        /* Other users remain: only lower the count and keep the entry. */
        if (adj->ref_nr > ref_nr) {
                pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
                         dev->name, adj_dev->name, ref_nr,
                         adj->ref_nr - ref_nr);
                adj->ref_nr -= ref_nr;
                return;
        }

        if (adj->master)
                sysfs_remove_link(&(dev->dev.kobj), "master");

        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);

        list_del_rcu(&adj->list);
        pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
                 adj_dev->name, dev->name, adj_dev->name);
        netdev_put(adj_dev, &adj->dev_tracker);
        /* Freed via kfree_rcu() since the entry was on an RCU-managed list. */
        kfree_rcu(adj, rcu);
}

/*
 * Link dev and upper_dev in both directions: upper_dev is inserted into
 * up_list (honouring @master) and dev into down_list (never as master).
 * If the second insertion fails, the first one is undone.
 *
 * Returns 0 on success or the error from the failing insertion.
 */
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
                                            struct net_device *upper_dev,
                                            struct list_head *up_list,
                                            struct list_head *down_list,
                                            void *private, bool master)
{
        int err;

        err = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
                                           private, master);
        if (err)
                return err;

        err = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
                                           private, false);
        if (err)
                __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);

        return err;
}

/*
 * Drop ref_nr references from both directions of the dev <-> upper_dev
 * adjacency: upper_dev's entry on up_list and dev's entry on down_list.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
                                               struct net_device *upper_dev,
                                               u16 ref_nr,
                                               struct list_head *up_list,
                                               struct list_head *down_list)
{
        __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
        __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}

/*
 * Link dev and upper_dev as direct neighbours, using dev's upper list and
 * upper_dev's lower list. Returns 0 or a negative errno.
 */
static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
                                                struct net_device *upper_dev,
                                                void *private, bool master)
{
        struct list_head *up_list = &dev->adj_list.upper;
        struct list_head *down_list = &upper_dev->adj_list.lower;

        return __netdev_adjacent_dev_link_lists(dev, upper_dev, up_list,
                                                down_list, private, master);
}

/*
 * Drop one reference from the direct-neighbour adjacency between dev and
 * upper_dev (dev's upper list and upper_dev's lower list).
 */
static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
                                                   struct net_device *upper_dev)
{
        struct list_head *up_list = &dev->adj_list.upper;
        struct list_head *down_list = &upper_dev->adj_list.lower;

        __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, up_list,
                                           down_list);
}

/*
 * Common implementation for linking upper_dev above dev.
 *
 * @master selects a master link; @upper_priv is stored as the adjacency's
 * private pointer; @upper_info and @extack are handed to the notifiers;
 * @priv is forwarded to __netdev_update_lower_level().
 *
 * Returns 0 on success, or:
 *   -EBUSY  dev == upper_dev, dev is already an upper device of upper_dev,
 *           or (master link) dev already has a different master
 *   -EMLINK the combined nesting would exceed MAX_NEST_DEV
 *   -EEXIST the requested link already exists
 *   another negative errno from a notifier or from adjacency insertion
 *
 * Asserts that the RTNL lock is held.
 */
static int __netdev_upper_dev_link(struct net_device *dev,
                                   struct net_device *upper_dev, bool master,
                                   void *upper_priv, void *upper_info,
                                   struct netdev_nested_priv *priv,
                                   struct netlink_ext_ack *extack)
{
        struct netdev_notifier_changeupper_info changeupper_info = {
                .info = {
                        .dev = dev,
                        .extack = extack,
                },
                .upper_dev = upper_dev,
                .master = master,
                .linking = true,
                .upper_info = upper_info,
        };
        struct net_device *master_dev;
        int ret = 0;

        ASSERT_RTNL();

        if (dev == upper_dev)
                return -EBUSY;

        /* To prevent loops, check if dev is not upper device to upper_dev. */
        if (__netdev_has_upper_dev(upper_dev, dev))
                return -EBUSY;

        /* Refuse links that would make the stack deeper than MAX_NEST_DEV. */
        if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
                return -EMLINK;

        if (!master) {
                if (__netdev_has_upper_dev(dev, upper_dev))
                        return -EEXIST;
        } else {
                /* A device may only have a single master. */
                master_dev = __netdev_master_upper_dev_get(dev);
                if (master_dev)
                        return master_dev == upper_dev ? -EEXIST : -EBUSY;
        }

        /* Give listeners the chance to veto before anything is changed. */
        ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
                                            &changeupper_info.info);
        ret = notifier_to_errno(ret);
        if (ret)
                return ret;

        ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
                                                   master);
        if (ret)
                return ret;

        /* The link exists now; a failing listener forces it to be undone. */
        ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
                                            &changeupper_info.info);
        ret = notifier_to_errno(ret);
        if (ret)
                goto rollback;

        /* Refresh upper_level of dev and of everything below it ... */
        __netdev_update_upper_level(dev, NULL);
        __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);

        /* ... and lower_level of upper_dev and of everything above it. */
        __netdev_update_lower_level(upper_dev, priv);
        __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
                                    priv);

        return 0;

rollback:
        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

        return ret;
}

/**
 * netdev_upper_dev_link - Add a link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 * @extack: netlink extended ack
 *
 * Adds a link to device which is upper to this one. The caller must hold
 * the RTNL lock. On a failure a negative errno code is returned.
 * On success the reference counts are adjusted and the function
 * returns zero.
 */
int netdev_upper_dev_link(struct net_device *dev,
                          struct net_device *upper_dev,
                          struct netlink_ext_ack *extack)
{
        struct netdev_nested_priv priv = {
                .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
                .data = NULL,
        };

        return __netdev_upper_dev_link(dev, upper_dev, false,
                                       NULL, NULL, &priv, extack);
}
EXPORT_SYMBOL(netdev_upper_dev_link);

/**
 * netdev_master_upper_dev_link - Add a master link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 * @upper_priv: upper device private
 * @upper_info: upper info to be passed down via notifier
 * @extack: netlink extended ack
 *
 * Adds a link to device which is upper to this one. In this case, only
 * one master upper device can be linked, although other non-master devices
 * might be linked as well. The caller must hold the RTNL lock.
 * On a failure a negative errno code is returned. On success the reference
 * counts are adjusted and the function returns zero.
 */
int netdev_master_upper_dev_link(struct net_device *dev,
                                 struct net_device *upper_dev,
                                 void *upper_priv, void *upper_info,
                                 struct netlink_ext_ack *extack)
{
        struct netdev_nested_priv priv = {
                .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
                .data = NULL,
        };

        return __netdev_upper_dev_link(dev, upper_dev, true,
                                       upper_priv, upper_info, &priv, extack);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);

/*
 * Common implementation for removing the link between dev and upper_dev.
 *
 * Sends NETDEV_PRECHANGEUPPER, drops the adjacency, sends
 * NETDEV_CHANGEUPPER and then refreshes the cached nesting levels. The
 * notifier return values are not checked, so the unlink cannot be vetoed
 * here. @priv is forwarded to __netdev_update_lower_level().
 *
 * Asserts that the RTNL lock is held.
 */
static void __netdev_upper_dev_unlink(struct net_device *dev,
                                      struct net_device *upper_dev,
                                      struct netdev_nested_priv *priv)
{
        struct netdev_notifier_changeupper_info changeupper_info = {
                .info = {
                        .dev = dev,
                },
                .upper_dev = upper_dev,
                .linking = false,
        };

        ASSERT_RTNL();

        /* Determine master-ness while the link is still in place. */
        changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;

        call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
                                      &changeupper_info.info);

        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

        call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
                                      &changeupper_info.info);

        /* Refresh upper_level of dev and of everything below it ... */
        __netdev_update_upper_level(dev, NULL);
        __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);

        /* ... and lower_level of upper_dev and of everything above it. */
        __netdev_update_lower_level(upper_dev, priv);
        __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
                                    priv);
}

/**
 * netdev_upper_dev_unlink - Removes a link to upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Removes a link to device which is upper to this one. The caller must hold
 * the RTNL lock.
 */
void netdev_upper_dev_unlink(struct net_device *dev,
                             struct net_device *upper_dev)
{
        struct netdev_nested_priv priv = {
                .flags = NESTED_SYNC_TODO,
                .data = NULL,
        };

        __netdev_upper_dev_unlink(dev, upper_dev, &priv);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);

/*
 * Set the ->ignore flag to val on both halves of the upper_dev <-> lower_dev
 * adjacency: lower_dev's entry in upper_dev's lower list and upper_dev's
 * entry in lower_dev's upper list. A half that does not exist is silently
 * skipped.
 */
static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
                                      struct net_device *lower_dev,
                                      bool val)
{
        struct netdev_adjacent *down;
        struct netdev_adjacent *up;

        down = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
        if (down)
                down->ignore = val;

        up = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
        if (up)
                up->ignore = val;
}

/*
 * Flag the upper_dev <-> lower_dev adjacency as ignored (->ignore = true on
 * both list entries).
 */
static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
                                        struct net_device *lower_dev)
{
        __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
}

/*
 * Clear the ignored flag on the upper_dev <-> lower_dev adjacency
 * (->ignore = false on both list entries).
 */
static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
                                       struct net_device *lower_dev)
{
        __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
}

int netdev_adjacent_change_prepare(struct net_device *old_dev,
                                   struct net_device *new_dev,
                                   struct net_device *dev,
                                   struct netlink_ext_ack *extack)
{
        struct netdev_nested_priv priv = {
                .flags = 0,
                .data = NULL,
        };
        int err;

        if (!new_dev)
                return 0;

        if (old_dev && new_dev != old_dev)
                netdev_adjacent_dev_disable(dev, old_dev);
        err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
                                      extack);
        if (err) {
                if (old_dev && new_dev != old_dev)
                        netdev_adjacent_dev_enable(dev, old_dev);
                return err;
        }

        return 0;
}
EXPORT_SYMBOL(netdev_adjacent_change_prepare);

void netdev_adjacent_change_commit(struct net_device *old_dev,
                                   struct net_device *new_dev,
                                   struct net_device *dev)
{
        struct netdev_nested_priv priv = {
                .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
                .data = NULL,
        };

        if (!new_dev || !old_dev)
                return;

        if (new_dev == old_dev)
                return;

        netdev_adjacent_dev_enable(dev, old_dev);
        __netdev_upper_dev_unlink(old_dev, dev, &priv);
}
EXPORT_SYMBOL(netdev_adjacent_change_commit);

void netdev_adjacent_change_abort(struct net_device *old_dev,
                                  struct net_device *new_dev,
                                  struct net_device *dev)
{
        struct netdev_nested_priv priv = {
                .flags = 0,
                .data = NULL,
        };

        if (!new_dev)
                return;

        if (old_dev && new_dev != old_dev)
                netdev_adjacent_dev_enable(dev, old_dev);

        __netdev_upper_dev_unlink(new_dev, dev, &priv);
}
EXPORT_SYMBOL(netdev_adjacent_change_abort);

/**
 * netdev_bonding_info_change - Dispatch event about slave change
 * @dev: device
 * @bonding_info: info to dispatch
 *
 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 * The caller must hold the RTNL lock.
 */
void netdev_bonding_info_change(struct net_device *dev,
                                struct netdev_bonding_info *bonding_info)
{
        struct netdev_notifier_bonding_info info = {
                .info.dev = dev,
        };

        memcpy(&info.bonding_info, bonding_info,
               sizeof(struct netdev_bonding_info));
        call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
                                      &info.info);
}
EXPORT_SYMBOL(netdev_bonding_info_change);

static int netdev_offload_xstats_enable_l3(struct net_device *dev,
                                           struct netlink_ext_ack *extack)
{
        struct netdev_notifier_offload_xstats_info info = {
                .info.dev = dev,
                .info.extack = extack,
                .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
        };
        int err;
        int rc;

        dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
                                         GFP_KERNEL);
        if (!dev->offload_xstats_l3)
                return -ENOMEM;

        rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
                                                  NETDEV_OFFLOAD_XSTATS_DISABLE,
                                                  &info.info);
        err = notifier_to_errno(rc);
        if (err)
                goto free_stats;

        return 0;

free_stats:
        kfree(dev->offload_xstats_l3);
        dev->offload_xstats_l3 = NULL;
        return err;
}

/*
 * Enable offload xstats of the given type on dev.
 *
 * Returns -EALREADY if that type is already enabled, otherwise the result
 * of the type-specific enable helper. A type not handled by the switch
 * triggers WARN_ON() and yields -EINVAL.
 *
 * Asserts that the RTNL lock is held.
 */
int netdev_offload_xstats_enable(struct net_device *dev,
                                 enum netdev_offload_xstats_type type,
                                 struct netlink_ext_ack *extack)
{
        ASSERT_RTNL();

        if (netdev_offload_xstats_enabled(dev, type))
                return -EALREADY;

        switch (type) {
        case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
                return netdev_offload_xstats_enable_l3(dev, extack);
        }

        /* Only reached for a type the switch above does not handle. */
        WARN_ON(1);
        return -EINVAL;
}
EXPORT_SYMBOL(netdev_offload_xstats_enable);

static void netdev_offload_xstats_disable_l3(struct net_device *dev)
{
        struct netdev_notifier_offload_xstats_info info = {
                .info.dev = dev,
                .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
        };

        call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
                                      &info.info);
        kfree(dev->offload_xstats_l3);
        dev->offload_xstats_l3 = NULL;
}

/*
 * Disable offload xstats of the given type on dev.
 *
 * Returns 0 on success and -EALREADY if that type is not enabled. A type
 * not handled by the switch triggers WARN_ON() and yields -EINVAL.
 *
 * Asserts that the RTNL lock is held.
 */
int netdev_offload_xstats_disable(struct net_device *dev,
                                  enum netdev_offload_xstats_type type)
{
        ASSERT_RTNL();

        if (!netdev_offload_xstats_enabled(dev, type))
                return -EALREADY;

        switch (type) {
        case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
                netdev_offload_xstats_disable_l3(dev);
                return 0;
        }

        /* Only reached for a type the switch above does not handle. */
        WARN_ON(1);
        return -EINVAL;
}
EXPORT_SYMBOL(netdev_offload_xstats_disable);

/*
 * Disable offload xstats on dev; currently this covers the L3 type only.
 * The return value (e.g. -EALREADY when L3 stats were not enabled) is
 * deliberately not propagated.
 */
static void netdev_offload_xstats_disable_all(struct net_device *dev)
{
        netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
}

/*
 * Map an offload xstats type to dev's storage pointer for that type.
 *
 * For NETDEV_OFFLOAD_XSTATS_TYPE_L3 this is dev->offload_xstats_l3, which
 * is NULL while L3 stats are disabled. A type not handled by the switch
 * triggers WARN_ON() and yields NULL.
 */
static struct rtnl_hw_stats64 *
netdev_offload_xstats_get_ptr(const struct net_device *dev,
                              enum netdev_offload_xstats_type type)
{
        switch (type) {
        case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
                return dev->offload_xstats_l3;
        }

        /* Only reached for a type the switch above does not handle. */
        WARN_ON(1);
        return NULL;
}

bool netdev_offload_xstats_enabled(const struct net_device *dev,
                                   enum netdev_offload_xstats_type type)
{
        ASSERT_RTNL();

        return netdev_offload_xstats_get_ptr(dev, type);
}
EXPORT_SYMBOL(netdev_offload_xstats_enabled);

/*
 * Result buffer for a NETDEV_OFFLOAD_XSTATS_REPORT_USED query; passed to
 * the notifiers via netdev_notifier_offload_xstats_info.report_used.
 */
struct netdev_notifier_offload_xstats_ru {
        /* Copied to the caller's p_used after the notifier chain has run. */
        bool used;
};

/*
 * Result buffer for a NETDEV_OFFLOAD_XSTATS_REPORT_DELTA query; passed to
 * the notifiers via netdev_notifier_offload_xstats_info.report_delta.
 */
struct netdev_notifier_offload_xstats_rd {
        /* Counter increments to be added to the device's cached stats. */
        struct rtnl_hw_stats64 stats;
        /* Copied to the caller's p_used after the notifier chain has run. */
        bool used;
};

/* Accumulate every counter of src into the matching counter of dest. */
static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
                                  const struct rtnl_hw_stats64 *src)
{
        /* receive side */
        dest->rx_packets += src->rx_packets;
        dest->rx_bytes += src->rx_bytes;
        dest->rx_errors += src->rx_errors;
        dest->rx_dropped += src->rx_dropped;

        /* transmit side */
        dest->tx_packets += src->tx_packets;
        dest->tx_bytes += src->tx_bytes;
        dest->tx_errors += src->tx_errors;
        dest->tx_dropped += src->tx_dropped;

        dest->multicast += src->multicast;
}

/*
 * Ask the notifier chain (NETDEV_OFFLOAD_XSTATS_REPORT_USED) whether the
 * offload xstats of the given type are in use on dev.
 *
 * The answer is stored in *p_used regardless of the notifier result.
 * Returns the errno derived from the notifier chain. Warns if stats of
 * that type are not enabled on dev.
 */
static int netdev_offload_xstats_get_used(struct net_device *dev,
                                          enum netdev_offload_xstats_type type,
                                          bool *p_used,
                                          struct netlink_ext_ack *extack)
{
        struct netdev_notifier_offload_xstats_ru answer = {};
        struct netdev_notifier_offload_xstats_info notif = {
                .info.dev = dev,
                .info.extack = extack,
                .type = type,
                .report_used = &answer,
        };
        int notifier_rc;

        WARN_ON(!netdev_offload_xstats_enabled(dev, type));

        notifier_rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
                                                    &notif.info);
        *p_used = answer.used;

        return notifier_to_errno(notifier_rc);
}

/* Collect the stats delta for @type from the notifier chain, fold it into the
 * counters cached on @dev, and report the running totals.
 *
 * @p_stats may be NULL, in which case only *@p_used is written. Returns
 * -EINVAL (with a warning) when no stats storage exists for @type, otherwise
 * the notifier result converted with notifier_to_errno().
 */
static int netdev_offload_xstats_get_stats(struct net_device *dev,
                                           enum netdev_offload_xstats_type type,
                                           struct rtnl_hw_stats64 *p_stats,
                                           bool *p_used,
                                           struct netlink_ext_ack *extack)
{
        struct netdev_notifier_offload_xstats_rd delta = {};
        struct netdev_notifier_offload_xstats_info xinfo = {
                .type = type,
                .report_delta = &delta,
                .info = {
                        .dev = dev,
                        .extack = extack,
                },
        };
        struct rtnl_hw_stats64 *cached;
        int notifier_rc;

        cached = netdev_offload_xstats_get_ptr(dev, type);
        if (WARN_ON(!cached))
                return -EINVAL;

        notifier_rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
                                                    &xinfo.info);

        /* Fold in whatever was reported even if the chain failed part-way;
         * dropping it would lose the deltas that were retrieved successfully.
         */
        netdev_hw_stats64_add(cached, &delta.stats);

        *p_used = delta.used;
        if (p_stats)
                *p_stats = *cached;

        return notifier_to_errno(notifier_rc);
}

/**
 * netdev_offload_xstats_get - Query offload xstats usage and, optionally, values
 * @dev: device
 * @type: offload xstats type
 * @p_stats: where to store the accumulated stats, or NULL to query usage only
 * @p_used: where to store whether the stats were reported as used
 * @extack: netlink extended ack
 *
 * Asserts that RTNL is held. Returns 0 or a negative errno.
 */
int netdev_offload_xstats_get(struct net_device *dev,
                              enum netdev_offload_xstats_type type,
                              struct rtnl_hw_stats64 *p_stats, bool *p_used,
                              struct netlink_ext_ack *extack)
{
        ASSERT_RTNL();

        /* Without a stats buffer the caller only wants the "used" answer. */
        if (!p_stats)
                return netdev_offload_xstats_get_used(dev, type, p_used,
                                                      extack);

        return netdev_offload_xstats_get_stats(dev, type, p_stats, p_used,
                                               extack);
}
EXPORT_SYMBOL(netdev_offload_xstats_get);

/**
 * netdev_offload_xstats_report_delta - Add a stats delta to a report buffer
 * @report_delta: report buffer to update
 * @stats: delta to accumulate into @report_delta
 *
 * Adds @stats to the counters held in @report_delta and marks the buffer
 * as used.
 */
void
netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
                                   const struct rtnl_hw_stats64 *stats)
{
        netdev_hw_stats64_add(&report_delta->stats, stats);
        report_delta->used = true;
}
EXPORT_SYMBOL(netdev_offload_xstats_report_delta);

/**
 * netdev_offload_xstats_report_used - Mark a "report used" buffer as used
 * @report_used: report buffer to update
 *
 * Sets the used flag of @report_used to true.
 */
void
netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
{
        report_used->used = true;
}
EXPORT_SYMBOL(netdev_offload_xstats_report_used);

/**
 * netdev_offload_xstats_push_delta - Add a delta to the cached offload xstats
 * @dev: device
 * @type: offload xstats type
 * @p_stats: delta to add to the counters cached on @dev
 *
 * Asserts that RTNL is held. Warns and does nothing when no stats storage
 * exists for @type.
 */
void netdev_offload_xstats_push_delta(struct net_device *dev,
                                      enum netdev_offload_xstats_type type,
                                      const struct rtnl_hw_stats64 *p_stats)
{
        struct rtnl_hw_stats64 *cached;

        ASSERT_RTNL();

        cached = netdev_offload_xstats_get_ptr(dev, type);
        if (!WARN_ON(!cached))
                netdev_hw_stats64_add(cached, p_stats);
}
EXPORT_SYMBOL(netdev_offload_xstats_push_delta);

/**
 * netdev_get_xmit_slave - Get the xmit slave of master device
 * @dev: device
 * @skb: The packet
 * @all_slaves: assume all the slaves are active
 *
 * Delegates to the device's ndo_get_xmit_slave callback. No reference is
 * taken on the returned device, so the caller must be careful with locks;
 * the caller must hold the RCU lock.
 * %NULL is returned if the callback is not implemented or no slave is found.
 */
struct net_device *netdev_get_xmit_slave(struct net_device *dev,
                                         struct sk_buff *skb,
                                         bool all_slaves)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (ops->ndo_get_xmit_slave)
                return ops->ndo_get_xmit_slave(dev, skb, all_slaves);

        return NULL;
}
EXPORT_SYMBOL(netdev_get_xmit_slave);

/* Ask @dev's ndo_sk_get_lower_dev callback for the lower device that serves
 * @sk. Returns NULL when the callback is not implemented.
 */
static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
                                                  struct sock *sk)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (ops->ndo_sk_get_lower_dev)
                return ops->ndo_sk_get_lower_dev(dev, sk);

        return NULL;
}

/**
 * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
 * @dev: device
 * @sk: the socket
 *
 * Follows netdev_sk_get_lower_dev() repeatedly, starting at @dev, until no
 * further lower device is reported, and returns the last device reached.
 * @dev itself is returned if it has no lower device for @sk.
 */
struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
                                            struct sock *sk)
{
        struct net_device *next;

        for (next = netdev_sk_get_lower_dev(dev, sk); next;
             next = netdev_sk_get_lower_dev(dev, sk))
                dev = next;

        return dev;
}
EXPORT_SYMBOL(netdev_sk_get_lowest_dev);

/* For every upper and lower neighbour of @dev that lives in the same network
 * namespace as @dev, call netdev_adjacent_sysfs_add() in both directions.
 */
static void netdev_adjacent_add_links(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct netdev_adjacent *adj;

        list_for_each_entry(adj, &dev->adj_list.upper, list) {
                struct net_device *upper = adj->dev;

                if (net_eq(net, dev_net(upper))) {
                        netdev_adjacent_sysfs_add(upper, dev,
                                                  &upper->adj_list.lower);
                        netdev_adjacent_sysfs_add(dev, upper,
                                                  &dev->adj_list.upper);
                }
        }

        list_for_each_entry(adj, &dev->adj_list.lower, list) {
                struct net_device *lower = adj->dev;

                if (net_eq(net, dev_net(lower))) {
                        netdev_adjacent_sysfs_add(lower, dev,
                                                  &lower->adj_list.upper);
                        netdev_adjacent_sysfs_add(dev, lower,
                                                  &dev->adj_list.lower);
                }
        }
}

/* For every upper and lower neighbour of @dev that lives in the same network
 * namespace as @dev, call netdev_adjacent_sysfs_del() in both directions,
 * using the devices' current names.
 */
static void netdev_adjacent_del_links(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct netdev_adjacent *adj;

        list_for_each_entry(adj, &dev->adj_list.upper, list) {
                struct net_device *upper = adj->dev;

                if (net_eq(net, dev_net(upper))) {
                        netdev_adjacent_sysfs_del(upper, dev->name,
                                                  &upper->adj_list.lower);
                        netdev_adjacent_sysfs_del(dev, upper->name,
                                                  &dev->adj_list.upper);
                }
        }

        list_for_each_entry(adj, &dev->adj_list.lower, list) {
                struct net_device *lower = adj->dev;

                if (net_eq(net, dev_net(lower))) {
                        netdev_adjacent_sysfs_del(lower, dev->name,
                                                  &lower->adj_list.upper);
                        netdev_adjacent_sysfs_del(dev, lower->name,
                                                  &dev->adj_list.lower);
                }
        }
}

/* Refresh the entries that @dev's neighbours hold for it after a rename.
 *
 * For each upper and lower neighbour in the same network namespace as @dev,
 * the entry registered under @oldname is removed with
 * netdev_adjacent_sysfs_del() and re-added for @dev with
 * netdev_adjacent_sysfs_add().
 */
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
{
        struct net *net = dev_net(dev);
        struct netdev_adjacent *adj;

        list_for_each_entry(adj, &dev->adj_list.upper, list) {
                struct net_device *upper = adj->dev;

                if (net_eq(net, dev_net(upper))) {
                        netdev_adjacent_sysfs_del(upper, oldname,
                                                  &upper->adj_list.lower);
                        netdev_adjacent_sysfs_add(upper, dev,
                                                  &upper->adj_list.lower);
                }
        }

        list_for_each_entry(adj, &dev->adj_list.lower, list) {
                struct net_device *lower = adj->dev;

                if (net_eq(net, dev_net(lower))) {
                        netdev_adjacent_sysfs_del(lower, oldname,
                                                  &lower->adj_list.upper);
                        netdev_adjacent_sysfs_add(lower, dev,
                                                  &lower->adj_list.upper);
                }
        }
}

/* Look up @lower_dev in @dev's lower adjacency list and return the private
 * pointer stored in that adjacency entry. Returns NULL when @lower_dev is
 * NULL or is not found in the list.
 */
void *netdev_lower_dev_get_private(struct net_device *dev,
                                   struct net_device *lower_dev)
{
        struct netdev_adjacent *adj;

        if (!lower_dev)
                return NULL;

        adj = __netdev_find_adj(lower_dev, &dev->adj_list.lower);

        return adj ? adj->private : NULL;
}
EXPORT_SYMBOL(netdev_lower_dev_get_private);


/**
 * netdev_lower_state_changed - Dispatch event about lower device state change
 * @lower_dev: device
 * @lower_state_info: state to dispatch
 *
 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 * The caller must hold the RTNL lock.
 */
void netdev_lower_state_changed(struct net_device *lower_dev,
                                void *lower_state_info)
{
        struct netdev_notifier_changelowerstate_info changelowerstate_info = {
                .info.dev = lower_dev,
        };

        ASSERT_RTNL();
        changelowerstate_info.lower_state_info = lower_state_info;
        call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
                                      &changelowerstate_info.info);
}
EXPORT_SYMBOL(netdev_lower_state_changed);

/* Forward an rx flag change (@flags) to the driver's ndo_change_rx_flags
 * callback, if it provides one.
 */
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (!ops->ndo_change_rx_flags)
                return;

        ops->ndo_change_rx_flags(dev, flags);
}

/* Adjust the promiscuity count of @dev by @inc and keep IFF_PROMISC in
 * dev->flags in sync with it (set while the count is non-zero).
 *
 * When the flag actually flips, the change is logged, optionally audited and
 * pushed to the driver through dev_change_rx_flags(). With @notify set,
 * __dev_notify_flags() is called with IFF_PROMISC as the changed mask.
 *
 * Asserts that RTNL is held. Returns 0, or -EOVERFLOW when a positive @inc
 * wraps the counter to zero, in which case nothing is modified.
 */
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
        unsigned int old_flags = dev->flags;
        unsigned int new_count, new_flags;
        kuid_t uid;
        kgid_t gid;

        ASSERT_RTNL();

        new_count = dev->promiscuity + inc;
        if (new_count) {
                new_flags = old_flags | IFF_PROMISC;
        } else {
                /* Reaching zero through an increment means the counter
                 * wrapped: leave the device untouched and report it.
                 */
                if (unlikely(inc > 0)) {
                        netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
                        return -EOVERFLOW;
                }
                new_flags = old_flags & ~IFF_PROMISC;
        }

        WRITE_ONCE(dev->promiscuity, new_count);

        if (new_flags != old_flags) {
                WRITE_ONCE(dev->flags, new_flags);
                netdev_info(dev, "%s promiscuous mode\n",
                            dev->flags & IFF_PROMISC ? "entered" : "left");

                if (audit_enabled) {
                        current_uid_gid(&uid, &gid);
                        audit_log(audit_context(), GFP_ATOMIC,
                                  AUDIT_ANOM_PROMISCUOUS,
                                  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
                                  dev->name, (dev->flags & IFF_PROMISC),
                                  (old_flags & IFF_PROMISC),
                                  from_kuid(&init_user_ns, audit_get_loginuid(current)),
                                  from_kuid(&init_user_ns, uid),
                                  from_kgid(&init_user_ns, gid),
                                  audit_get_sessionid(current));
                }

                dev_change_rx_flags(dev, IFF_PROMISC);
        }

        if (notify)
                __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);

        return 0;
}

/**
 * dev_set_promiscuity - update promiscuity count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove promiscuity from a device. While the count in the device
 * remains above zero the interface remains promiscuous. Once it hits zero
 * the device reverts back to normal filtering operation. A negative @inc
 * value is used to drop promiscuity on the device. The rx mode is
 * re-applied whenever the device flags changed as a result.
 *
 * Return: 0 if successful or a negative errno code on error.
 */
int dev_set_promiscuity(struct net_device *dev, int inc)
{
        unsigned int flags_before = dev->flags;
        int rc;

        rc = __dev_set_promiscuity(dev, inc, true);
        if (rc >= 0 && dev->flags != flags_before)
                dev_set_rx_mode(dev);

        return rc;
}
EXPORT_SYMBOL(dev_set_promiscuity);

/* Adjust the allmulti count of @dev by @inc and keep IFF_ALLMULTI in
 * dev->flags in sync with it (set while the count is non-zero).
 *
 * When the flag actually flips, the change is logged, pushed to the driver
 * through dev_change_rx_flags(), the rx mode is re-applied and, with @notify
 * set, __dev_notify_flags() is called with the gflags bits that changed.
 *
 * Asserts that RTNL is held. Returns 0, or -EOVERFLOW when a positive @inc
 * wraps the counter to zero, in which case nothing is modified.
 */
static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
{
        unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
        unsigned int new_count, new_flags;

        ASSERT_RTNL();

        new_count = dev->allmulti + inc;
        if (new_count) {
                new_flags = old_flags | IFF_ALLMULTI;
        } else {
                /* Reaching zero through an increment means the counter
                 * wrapped: leave the device untouched and report it.
                 */
                if (unlikely(inc > 0)) {
                        netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
                        return -EOVERFLOW;
                }
                new_flags = old_flags & ~IFF_ALLMULTI;
        }

        WRITE_ONCE(dev->allmulti, new_count);

        if (new_flags == old_flags)
                return 0;

        WRITE_ONCE(dev->flags, new_flags);
        netdev_info(dev, "%s allmulticast mode\n",
                    dev->flags & IFF_ALLMULTI ? "entered" : "left");
        dev_change_rx_flags(dev, IFF_ALLMULTI);
        dev_set_rx_mode(dev);
        if (notify)
                __dev_notify_flags(dev, old_flags,
                                   dev->gflags ^ old_gflags, 0, NULL);

        return 0;
}

/**
 *        dev_set_allmulti        - update allmulti count on a device
 *        @dev: device
 *        @inc: modifier
 *
 *        Add or remove reception of all multicast frames to a device. While the
 *        count in the device remains above zero the interface keeps receiving
 *        all multicast frames. Once it hits zero the device reverts back to
 *        normal filtering operation. A negative @inc value is used to drop the
 *        counter when releasing a resource needing all multicasts.
 *        Return 0 if successful or a negative errno code on error.
 */

int dev_set_allmulti(struct net_device *dev, int inc)
{
        /* Thin wrapper: same as the internal helper with notification on. */
        return __dev_set_allmulti(dev, inc, true);
}
EXPORT_SYMBOL(dev_set_allmulti);

/*
 *        Upload unicast and multicast address lists to device and
 *        configure RX filtering. When the device doesn't support unicast
 *        filtering it is put in promiscuous mode while unicast addresses
 *        are present.
 */
void __dev_set_rx_mode(struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        /* dev_open will call this function so the list will stay sane. */
        if (!(dev->flags&IFF_UP))
                return;

        if (!netif_device_present(dev))
                return;

        if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
                /* Unicast addresses changes may only happen under the rtnl,
                 * therefore calling __dev_set_promiscuity here is safe.
                 */
                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
                        __dev_set_promiscuity(dev, 1, false);
                        dev->uc_promisc = true;
                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
                        __dev_set_promiscuity(dev, -1, false);
                        dev->uc_promisc = false;
                }
        }

        if (ops->ndo_set_rx_mode)
                ops->ndo_set_rx_mode(dev);
}

/* Locked wrapper: run __dev_set_rx_mode() on @dev with the device address
 * lock taken via netif_addr_lock_bh() / netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);
        __dev_set_rx_mode(dev);
        netif_addr_unlock_bh(dev);
}

/**
 * dev_get_flags - get flags reported to userspace
 * @dev: device
 *
 * Get the combination of flag bits exported through APIs to userspace.
 * IFF_PROMISC and IFF_ALLMULTI are taken from dev->gflags rather than
 * dev->flags; IFF_RUNNING, IFF_LOWER_UP and IFF_DORMANT are derived from the
 * current operational state and are only set while the device is running.
 */
unsigned int dev_get_flags(const struct net_device *dev)
{
        const unsigned int from_gflags = IFF_PROMISC | IFF_ALLMULTI;
        const unsigned int computed = IFF_RUNNING | IFF_LOWER_UP | IFF_DORMANT;
        unsigned int flags;

        flags = READ_ONCE(dev->flags) & ~(from_gflags | computed);
        flags |= READ_ONCE(dev->gflags) & from_gflags;

        if (!netif_running(dev))
                return flags;

        if (netif_oper_up(dev))
                flags |= IFF_RUNNING;
        if (netif_carrier_ok(dev))
                flags |= IFF_LOWER_UP;
        if (netif_dormant(dev))
                flags |= IFF_DORMANT;

        return flags;
}
EXPORT_SYMBOL(dev_get_flags);

/* Apply the userspace-format @flags to @dev without sending notifications.
 *
 * Asserts that RTNL is held. The return value is 0 unless the interface had
 * to be brought up, in which case it is the result of __dev_open(). Note that
 * the promiscuity/allmulti handling below still runs when __dev_open() fails.
 */
int __dev_change_flags(struct net_device *dev, unsigned int flags,
                       struct netlink_ext_ack *extack)
{
        unsigned int old_flags = dev->flags;
        int ret;

        ASSERT_RTNL();

        /*
         * Set the flags on our device: the first group of bits is taken
         * from the request, the second group is preserved from the current
         * device flags.
         */

        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
                               IFF_AUTOMEDIA)) |
                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
                                    IFF_ALLMULTI));

        /*
         * Load in the correct multicast list now the flags have changed.
         */

        if ((old_flags ^ flags) & IFF_MULTICAST)
                dev_change_rx_flags(dev, IFF_MULTICAST);

        dev_set_rx_mode(dev);

        /*
         * Have we downed the interface. We handle IFF_UP ourselves
         * according to user attempts to set it, rather than blindly
         * setting it.
         */

        ret = 0;
        if ((old_flags ^ flags) & IFF_UP) {
                if (old_flags & IFF_UP)
                        __dev_close(dev);
                else
                        ret = __dev_open(dev, extack);
        }

        /* The user-requested promiscuous bit is tracked in dev->gflags. */
        if ((flags ^ dev->gflags) & IFF_PROMISC) {
                int inc = (flags & IFF_PROMISC) ? 1 : -1;
                /* NOTE: shadows the outer old_flags on purpose - this is a
                 * fresh snapshot taken after the updates above.
                 */
                unsigned int old_flags = dev->flags;

                dev->gflags ^= IFF_PROMISC;

                if (__dev_set_promiscuity(dev, inc, false) >= 0)
                        if (dev->flags != old_flags)
                                dev_set_rx_mode(dev);
        }

        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
         * is important. Some (broken) drivers set IFF_PROMISC, when
         * IFF_ALLMULTI is requested not asking us and not reporting.
         */
        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

                dev->gflags ^= IFF_ALLMULTI;
                __dev_set_allmulti(dev, inc, false);
        }

        return ret;
}

/* Send the notifications that follow a change of @dev's flags.
 *
 * @old_flags is the value of dev->flags before the change and @gchanges the
 * mask passed on to rtmsg_ifinfo(); @portid and @nlh are forwarded to it too.
 * In order: an RTM_NEWLINK message when @gchanges is non-zero, NETDEV_UP or
 * NETDEV_DOWN when IFF_UP toggled, and NETDEV_CHANGE when the device is up
 * and a flag other than IFF_UP, IFF_PROMISC, IFF_ALLMULTI or the
 * IFF_VOLATILE set changed.
 */
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
                        unsigned int gchanges, u32 portid,
                        const struct nlmsghdr *nlh)
{
        const unsigned int quiet = IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
                                   IFF_VOLATILE;
        unsigned int changed = dev->flags ^ old_flags;

        if (gchanges)
                rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);

        if (changed & IFF_UP)
                call_netdevice_notifiers(dev->flags & IFF_UP ? NETDEV_UP :
                                                               NETDEV_DOWN,
                                         dev);

        if ((dev->flags & IFF_UP) && (changed & ~quiet)) {
                struct netdev_notifier_change_info change_info = {
                        .info.dev = dev,
                        .flags_changed = changed,
                };

                call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
        }
}

/**
 * dev_change_flags - change device settings
 * @dev: device
 * @flags: device state flags
 * @extack: netlink extended ack
 *
 * Change settings on device based state flags. The flags are
 * in the userspace exported format. On success the resulting changes in
 * dev->flags and dev->gflags are announced through __dev_notify_flags().
 *
 * Return: the result of __dev_change_flags(); negative on error.
 */
int dev_change_flags(struct net_device *dev, unsigned int flags,
                     struct netlink_ext_ack *extack)
{
        unsigned int flags_before = dev->flags;
        unsigned int gflags_before = dev->gflags;
        unsigned int gchanges;
        int rc;

        rc = __dev_change_flags(dev, flags, extack);
        if (rc < 0)
                return rc;

        gchanges = (flags_before ^ dev->flags) | (gflags_before ^ dev->gflags);
        __dev_notify_flags(dev, flags_before, gchanges, 0, NULL);

        return rc;
}
EXPORT_SYMBOL(dev_change_flags);

/* Set the MTU of @dev to @new_mtu without validation or notification.
 *
 * Drivers that implement ndo_change_mtu get to handle the change and supply
 * the return value; otherwise dev->mtu is stored directly and 0 is returned.
 */
int __dev_set_mtu(struct net_device *dev, int new_mtu)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (!ops->ndo_change_mtu) {
                /* Pairs with all the lockless reads of dev->mtu in the stack */
                WRITE_ONCE(dev->mtu, new_mtu);
                return 0;
        }

        return ops->ndo_change_mtu(dev, new_mtu);
}
EXPORT_SYMBOL(__dev_set_mtu);

int dev_validate_mtu(struct net_device *dev, int new_mtu,
                     struct netlink_ext_ack *extack)
{
        /* MTU must be positive, and in range */
        if (new_mtu < 0 || new_mtu < dev->min_mtu) {
                NL_SET_ERR_MSG(extack, "mtu less than device minimum");
                return -EINVAL;
        }

        if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
                NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
                return -EINVAL;
        }
        return 0;
}

/**
 * dev_set_mtu_ext - Change maximum transfer unit
 * @dev: device
 * @new_mtu: new transfer unit
 * @extack: netlink extended ack
 *
 * Change the maximum transfer size of the network device. The new value is
 * validated, NETDEV_PRECHANGEMTU listeners may veto it, and after the change
 * NETDEV_CHANGEMTU is sent. If a NETDEV_CHANGEMTU listener fails, the old
 * MTU is restored and NETDEV_CHANGEMTU is sent once more.
 *
 * Return: 0 on success (including when the MTU is already @new_mtu) or a
 * negative errno.
 */
int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
                    struct netlink_ext_ack *extack)
{
        int prev_mtu;
        int rc;

        if (dev->mtu == new_mtu)
                return 0;

        rc = dev_validate_mtu(dev, new_mtu, extack);
        if (rc)
                return rc;

        if (!netif_device_present(dev))
                return -ENODEV;

        rc = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
        rc = notifier_to_errno(rc);
        if (rc)
                return rc;

        prev_mtu = dev->mtu;
        rc = __dev_set_mtu(dev, new_mtu);
        if (rc)
                return rc;

        rc = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, prev_mtu);
        rc = notifier_to_errno(rc);
        if (!rc)
                return 0;

        /* Put the old MTU back and notify everyone again, so that they
         * have a chance to revert their changes.
         */
        __dev_set_mtu(dev, prev_mtu);
        call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, new_mtu);

        return rc;
}

/* Change the MTU of @dev to @new_mtu via dev_set_mtu_ext(), using a local
 * extack. On failure any message left in the extack is logged
 * (rate-limited) together with the device name. Returns 0 or a negative errno.
 */
int dev_set_mtu(struct net_device *dev, int new_mtu)
{
        struct netlink_ext_ack extack;
        int rc;

        memset(&extack, 0, sizeof(extack));

        rc = dev_set_mtu_ext(dev, new_mtu, &extack);
        if (rc) {
                if (extack._msg)
                        net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
        }

        return rc;
}
EXPORT_SYMBOL(dev_set_mtu);

/**
 * dev_change_tx_queue_len - Change TX queue length of a netdevice
 * @dev: device
 * @new_len: new tx queue length
 *
 * Stores @new_len, then lets NETDEV_CHANGE_TX_QUEUE_LEN listeners and
 * dev_qdisc_change_tx_queue_len() react to it. If either step fails the
 * previous length is restored.
 *
 * Return: 0 on success or when the length is unchanged, -ERANGE when
 * @new_len does not fit in an unsigned int, otherwise the error of the
 * failing step.
 */
int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
        unsigned int orig_len = dev->tx_queue_len;
        int res;

        if ((unsigned int)new_len != new_len)
                return -ERANGE;

        if (new_len == orig_len)
                return 0;

        WRITE_ONCE(dev->tx_queue_len, new_len);

        res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
        res = notifier_to_errno(res);
        if (!res)
                res = dev_qdisc_change_tx_queue_len(dev);
        if (!res)
                return 0;

        /* Somebody objected: roll the length back. */
        netdev_err(dev, "refused to change device tx_queue_len\n");
        WRITE_ONCE(dev->tx_queue_len, orig_len);

        return res;
}

/**
 *        dev_set_group - Change group this device belongs to
 *        @dev: device
 *        @new_group: group this device should belong to
 *
 *        Plain store to dev->group; nothing else is updated or notified
 *        from here.
 */
void dev_set_group(struct net_device *dev, int new_group)
{
        dev->group = new_group;
}

/**
 *        dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
 *        @dev: device
 *        @addr: new address
 *        @extack: netlink extended ack
 */
int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
                              struct netlink_ext_ack *extack)
{
        struct netdev_notifier_pre_changeaddr_info info = {
                .info.dev = dev,
                .info.extack = extack,
                .dev_addr = addr,
        };
        int rc;

        rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
        return notifier_to_errno(rc);
}
EXPORT_SYMBOL(dev_pre_changeaddr_notify);

/**
 * dev_set_mac_address - Change Media Access Control Address
 * @dev: device
 * @sa: new address
 * @extack: netlink extended ack
 *
 * Change the hardware (MAC) address of the device. The driver callback is
 * only invoked when the new address differs from the current one, but the
 * address assignment type is set to NET_ADDR_SET and NETDEV_CHANGEADDR is
 * sent in either case.
 *
 * Return: 0 on success, -EOPNOTSUPP if the driver has no
 * ndo_set_mac_address, -EINVAL if @sa's family does not match the device
 * type, -ENODEV if the device is not present, or the error returned by the
 * pre-change notifiers or the driver.
 */
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
                        struct netlink_ext_ack *extack)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int rc;

        if (!ops->ndo_set_mac_address)
                return -EOPNOTSUPP;

        if (sa->sa_family != dev->type)
                return -EINVAL;

        if (!netif_device_present(dev))
                return -ENODEV;

        rc = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
        if (rc)
                return rc;

        /* Skip the driver when the address would not change. */
        if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len) != 0) {
                rc = ops->ndo_set_mac_address(dev, sa);
                if (rc)
                        return rc;
        }

        dev->addr_assign_type = NET_ADDR_SET;
        call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
        add_device_randomness(dev->dev_addr, dev->addr_len);

        return 0;
}
EXPORT_SYMBOL(dev_set_mac_address);

/* Taken for write by dev_set_mac_address_user() around the address change and
 * for read by dev_get_mac_address() while the address is copied out.
 */
DECLARE_RWSEM(dev_addr_sem);

/* Same as dev_set_mac_address(), but performed with dev_addr_sem held for
 * write. Returns whatever dev_set_mac_address() returns.
 */
int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
                             struct netlink_ext_ack *extack)
{
        int rc;

        down_write(&dev_addr_sem);
        rc = dev_set_mac_address(dev, sa, extack);
        up_write(&dev_addr_sem);

        return rc;
}
EXPORT_SYMBOL(dev_set_mac_address_user);

/* Copy the hardware address of the device named @dev_name in @net into @sa.
 *
 * The lookup and copy run under dev_addr_sem (read) and the RCU read lock.
 * At most sizeof(sa->sa_data_min) bytes are written to sa->sa_data: zeros
 * when the device has no address (addr_len == 0), otherwise the leading
 * bytes of dev_addr. sa->sa_family is set to the device type.
 *
 * Returns 0 on success or -ENODEV when no such device exists.
 */
int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
{
        size_t size = sizeof(sa->sa_data_min);
        struct net_device *dev;
        int ret = 0;

        down_read(&dev_addr_sem);
        rcu_read_lock();

        dev = dev_get_by_name_rcu(net, dev_name);
        if (dev) {
                if (dev->addr_len)
                        memcpy(sa->sa_data, dev->dev_addr,
                               min_t(size_t, size, dev->addr_len));
                else
                        memset(sa->sa_data, 0, size);
                sa->sa_family = dev->type;
        } else {
                ret = -ENODEV;
        }

        rcu_read_unlock();
        up_read(&dev_addr_sem);

        return ret;
}
EXPORT_SYMBOL(dev_get_mac_address);

/**
 * dev_change_carrier - Change device carrier
 * @dev: device
 * @new_carrier: new value
 *
 * Hand the carrier change to the driver's ndo_change_carrier callback.
 *
 * Return: -EOPNOTSUPP if the driver has no such callback, -ENODEV if the
 * device is not present, otherwise the callback's return value.
 */
int dev_change_carrier(struct net_device *dev, bool new_carrier)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (!ops->ndo_change_carrier)
                return -EOPNOTSUPP;

        if (netif_device_present(dev))
                return ops->ndo_change_carrier(dev, new_carrier);

        return -ENODEV;
}

/**
 * dev_get_phys_port_id - Get device physical port ID
 * @dev: device
 * @ppid: port ID
 *
 * Get device physical port ID from the driver's ndo_get_phys_port_id
 * callback.
 *
 * Return: -EOPNOTSUPP if the driver has no such callback, otherwise the
 * callback's return value.
 */
int dev_get_phys_port_id(struct net_device *dev,
                         struct netdev_phys_item_id *ppid)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (ops->ndo_get_phys_port_id)
                return ops->ndo_get_phys_port_id(dev, ppid);

        return -EOPNOTSUPP;
}

/**
 * dev_get_phys_port_name - Get device physical port name
 * @dev: device
 * @name: port name
 * @len: limit of bytes to copy to name
 *
 * Get device physical port name. The driver's ndo_get_phys_port_name is
 * tried first; if it is missing or answers -EOPNOTSUPP, the result of
 * devlink_compat_phys_port_name_get() is returned instead.
 */
int dev_get_phys_port_name(struct net_device *dev,
                           char *name, size_t len)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int err = -EOPNOTSUPP;

        if (ops->ndo_get_phys_port_name)
                err = ops->ndo_get_phys_port_name(dev, name, len);

        if (err == -EOPNOTSUPP)
                err = devlink_compat_phys_port_name_get(dev, name, len);

        return err;
}

/**
 * dev_get_port_parent_id - Get the device's port parent identifier
 * @dev: network device
 * @ppid: pointer to a storage for the port's parent identifier
 * @recurse: allow/disallow recursion to lower devices
 *
 * Get the device's port parent identifier. The driver's
 * ndo_get_port_parent_id is asked first, then
 * devlink_compat_switch_id_get(). If both answer -EOPNOTSUPP and @recurse is
 * set, the lower devices are queried recursively; they must all report the
 * same identifier, otherwise -EOPNOTSUPP is returned.
 */
int dev_get_port_parent_id(struct net_device *dev,
                           struct netdev_phys_item_id *ppid,
                           bool recurse)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        struct netdev_phys_item_id reference = { };
        struct net_device *lower_dev;
        struct list_head *iter;
        int err;

        if (ops->ndo_get_port_parent_id) {
                err = ops->ndo_get_port_parent_id(dev, ppid);
                if (err != -EOPNOTSUPP)
                        return err;
        }

        err = devlink_compat_switch_id_get(dev, ppid);
        if (!(recurse && err == -EOPNOTSUPP))
                return err;

        netdev_for_each_lower_dev(dev, lower_dev, iter) {
                err = dev_get_port_parent_id(lower_dev, ppid, true);
                if (err)
                        break;

                /* The first answer becomes the reference to compare with. */
                if (!reference.id_len) {
                        reference = *ppid;
                        continue;
                }

                if (memcmp(&reference, ppid, sizeof(*ppid)))
                        return -EOPNOTSUPP;
        }

        return err;
}
EXPORT_SYMBOL(dev_get_port_parent_id);

/**
 *        netdev_port_same_parent_id - Indicate if two network devices have
 *        the same port parent identifier
 *        @a: first network device
 *        @b: second network device
 */
bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
{
        struct netdev_phys_item_id a_id = { };
        struct netdev_phys_item_id b_id = { };

        if (dev_get_port_parent_id(a, &a_id, true) ||
            dev_get_port_parent_id(b, &b_id, true))
                return false;

        return netdev_phys_item_id_same(&a_id, &b_id);
}
EXPORT_SYMBOL(netdev_port_same_parent_id);

/**
 * dev_change_proto_down - set carrier according to proto_down.
 *
 * @dev: device
 * @proto_down: new value
 *
 * Turns the carrier off when @proto_down is true and on otherwise, then
 * records the new value in dev->proto_down.
 *
 * Return: 0 on success, -EOPNOTSUPP if the device lacks
 * IFF_CHANGE_PROTO_DOWN, -ENODEV if the device is not present.
 */
int dev_change_proto_down(struct net_device *dev, bool proto_down)
{
        if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
                return -EOPNOTSUPP;

        if (!netif_device_present(dev))
                return -ENODEV;

        if (!proto_down)
                netif_carrier_on(dev);
        else
                netif_carrier_off(dev);

        WRITE_ONCE(dev->proto_down, proto_down);

        return 0;
}

/**
 * dev_change_proto_down_reason - proto down reason
 *
 * @dev: device
 * @mask: proto down mask
 * @value: proto down value
 *
 * With a zero @mask the reason is replaced by @value as a whole. Otherwise
 * only the reason bits selected by the low 32 bits of @mask are replaced by
 * the corresponding bits of @value; all other bits keep their current state.
 */
void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
                                  u32 value)
{
        u32 proto_down_reason = value;

        if (mask) {
                /* The reason is 32 bits wide, so only the low 32 bits of
                 * @mask can select anything. Merging with plain mask
                 * arithmetic also avoids a signed "1 << 31" shift.
                 */
                u32 select = (u32)mask;

                proto_down_reason = (dev->proto_down_reason & ~select) |
                                    (value & select);
        }

        WRITE_ONCE(dev->proto_down_reason, proto_down_reason);
}

/* BPF link object used to attach an XDP program to a net_device. */
struct bpf_xdp_link {
        /* generic link; container_of() on this member recovers the wrapper */
        struct bpf_link link;
        struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
        /* XDP flags for this link; fed to dev_xdp_mode() to pick the mode */
        int flags;
};

/* Translate XDP mode flags into an attachment mode.
 *
 * Explicit flags are honoured in the order HW, DRV, SKB. With no mode flag
 * set, the mode is DRV when the device provides ndo_bpf and SKB otherwise.
 */
static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
{
        if (flags & XDP_FLAGS_HW_MODE)
                return XDP_MODE_HW;
        else if (flags & XDP_FLAGS_DRV_MODE)
                return XDP_MODE_DRV;
        else if (flags & XDP_FLAGS_SKB_MODE)
                return XDP_MODE_SKB;

        if (dev->netdev_ops->ndo_bpf)
                return XDP_MODE_DRV;

        return XDP_MODE_SKB;
}

/* Pick the install callback for @mode: generic_xdp_install for SKB mode, the
 * driver's ndo_bpf (which may be NULL) for DRV and HW modes, NULL for any
 * other value.
 */
static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
{
        if (mode == XDP_MODE_SKB)
                return generic_xdp_install;

        if (mode == XDP_MODE_DRV || mode == XDP_MODE_HW)
                return dev->netdev_ops->ndo_bpf;

        return NULL;
}

/* Return the BPF link recorded for @mode on @dev, or NULL if none is set. */
static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
                                         enum bpf_xdp_mode mode)
{
        return dev->xdp_state[mode].link;
}

/* Return the effective program for @mode: the link's program when a link is
 * recorded for that mode, otherwise the directly attached program (may be
 * NULL).
 */
static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
                                     enum bpf_xdp_mode mode)
{
        struct bpf_xdp_link *attached = dev_xdp_link(dev, mode);

        return attached ? attached->link.prog : dev->xdp_state[mode].prog;
}

/* Count the XDP modes of @dev that currently have a program or a link. */
u8 dev_xdp_prog_count(struct net_device *dev)
{
        u8 nr_used = 0;
        int mode;

        for (mode = 0; mode < __MAX_XDP_MODE; mode++) {
                if (!dev->xdp_state[mode].prog && !dev->xdp_state[mode].link)
                        continue;
                nr_used++;
        }

        return nr_used;
}
EXPORT_SYMBOL_GPL(dev_xdp_prog_count);

/* Return the id of the effective program for @mode, or 0 if there is none. */
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{
        struct bpf_prog *active = dev_xdp_prog(dev, mode);

        if (!active)
                return 0;

        return active->aux->id;
}

/* Record @link as the attachment for @mode and clear the plain program slot,
 * so the two slots are never populated together by this helper.
 */
static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
                             struct bpf_xdp_link *link)
{
        dev->xdp_state[mode].prog = NULL;
        dev->xdp_state[mode].link = link;
}

/* Record @prog as the attachment for @mode and clear the link slot, so the
 * two slots are never populated together by this helper.
 */
static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
                             struct bpf_prog *prog)
{
        dev->xdp_state[mode].prog = prog;
        dev->xdp_state[mode].link = NULL;
}

/* Hand @prog (or NULL to remove) to @bpf_op for the given @mode.
 *
 * Returns 0 on success or the error reported by @bpf_op. The xdp_state
 * bookkeeping of @dev is not touched here; callers update it afterwards.
 */
static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
                           bpf_op_t bpf_op, struct netlink_ext_ack *extack,
                           u32 flags, struct bpf_prog *prog)
{
        struct netdev_bpf req;
        int ret;

        memset(&req, 0, sizeof(req));
        if (mode == XDP_MODE_HW)
                req.command = XDP_SETUP_PROG_HW;
        else
                req.command = XDP_SETUP_PROG;
        req.extack = extack;
        req.flags = flags;
        req.prog = prog;

        /* The driver treats the prog pointer as "moved" into it: it never
         * takes its own reference but does drop one when the program is
         * detached or replaced. net_device keeps its own ownership of the
         * link/prog, so take an extra reference on the driver's behalf to
         * keep the refcount from underflowing.
         */
        if (prog)
                bpf_prog_inc(prog);

        ret = bpf_op(dev, &req);
        if (!ret) {
                if (mode != XDP_MODE_HW)
                        bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
                return 0;
        }

        /* The op failed: give back the reference taken above. */
        if (prog)
                bpf_prog_put(prog);

        return ret;
}

/* Remove every XDP program/link from @dev, one mode at a time.
 * Must be called with RTNL held.
 */
static void dev_xdp_uninstall(struct net_device *dev)
{
        enum bpf_xdp_mode mode;

        ASSERT_RTNL();

        for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
                struct bpf_prog *attached = dev_xdp_prog(dev, mode);
                struct bpf_xdp_link *link;
                bpf_op_t op;

                if (!attached)
                        continue;

                op = dev_xdp_bpf_op(dev, mode);
                if (!op)
                        continue;

                WARN_ON(dev_xdp_install(dev, mode, op, NULL, 0, NULL));

                /* A link is auto-detached by clearing its device pointer;
                 * a directly attached program has its reference dropped.
                 */
                link = dev_xdp_link(dev, mode);
                if (!link)
                        bpf_prog_put(attached);
                else
                        link->dev = NULL;

                dev_xdp_set_link(dev, mode, NULL);
        }
}

/* Attach, replace or remove the XDP program (or BPF link) of @dev.
 *
 * @link and @new_prog/@old_prog are mutually exclusive. With @link set, the
 * effective program is link->link.prog. Without a link, @new_prog == NULL
 * asks for the current program of the selected mode to be removed.
 * @old_prog is the program the caller expects to be attached and is only
 * accepted together with XDP_FLAGS_REPLACE.
 *
 * All validation happens before the driver is called, so a failed check
 * leaves the device state untouched. The order of the checks decides which
 * errno and extack message the caller sees.
 *
 * Must be called with RTNL held. Returns 0 on success or a negative errno.
 * On success the previously attached program, if any, is released with
 * bpf_prog_put().
 */
static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
                          struct bpf_xdp_link *link, struct bpf_prog *new_prog,
                          struct bpf_prog *old_prog, u32 flags)
{
        unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
        struct bpf_prog *cur_prog;
        struct net_device *upper;
        struct list_head *iter;
        enum bpf_xdp_mode mode;
        bpf_op_t bpf_op;
        int err;

        ASSERT_RTNL();

        /* either link or prog attachment, never both */
        if (link && (new_prog || old_prog))
                return -EINVAL;
        /* link supports only XDP mode flags */
        if (link && (flags & ~XDP_FLAGS_MODES)) {
                NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
                return -EINVAL;
        }
        /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
        if (num_modes > 1) {
                NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
                return -EINVAL;
        }
        /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
        if (!num_modes && dev_xdp_prog_count(dev) > 1) {
                NL_SET_ERR_MSG(extack,
                               "More than one program loaded, unset mode is ambiguous");
                return -EINVAL;
        }
        /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
        if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
                NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
                return -EINVAL;
        }

        mode = dev_xdp_mode(dev, flags);
        /* can't replace attached link */
        if (dev_xdp_link(dev, mode)) {
                NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
                return -EBUSY;
        }

        /* don't allow if an upper device already has a program */
        netdev_for_each_upper_dev_rcu(dev, upper, iter) {
                if (dev_xdp_prog_count(upper) > 0) {
                        NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
                        return -EEXIST;
                }
        }

        cur_prog = dev_xdp_prog(dev, mode);
        /* can't replace attached prog with link */
        if (link && cur_prog) {
                NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
                return -EBUSY;
        }
        if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
                NL_SET_ERR_MSG(extack, "Active program does not match expected");
                return -EEXIST;
        }

        /* put effective new program into new_prog */
        if (link)
                new_prog = link->link.prog;

        /* checks that only make sense when a program is being installed */
        if (new_prog) {
                bool offload = mode == XDP_MODE_HW;
                /* the non-offload mode that must stay empty: DRV vs. SKB */
                enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
                                               ? XDP_MODE_DRV : XDP_MODE_SKB;

                if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
                        NL_SET_ERR_MSG(extack, "XDP program already attached");
                        return -EBUSY;
                }
                if (!offload && dev_xdp_prog(dev, other_mode)) {
                        NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
                        return -EEXIST;
                }
                if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
                        NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
                        return -EINVAL;
                }
                if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
                        NL_SET_ERR_MSG(extack, "Program bound to different device");
                        return -EINVAL;
                }
                if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
                        NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
                        return -EINVAL;
                }
                if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
                        NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
                        return -EINVAL;
                }
        }

        /* don't call drivers if the effective program didn't change */
        if (new_prog != cur_prog) {
                bpf_op = dev_xdp_bpf_op(dev, mode);
                if (!bpf_op) {
                        NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
                        return -EOPNOTSUPP;
                }

                err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
                if (err)
                        return err;
        }

        /* record the new attachment; each setter clears the other slot */
        if (link)
                dev_xdp_set_link(dev, mode, link);
        else
                dev_xdp_set_prog(dev, mode, new_prog);
        /* release the program that was attached before this call */
        if (cur_prog)
                bpf_prog_put(cur_prog);

        return 0;
}

/* Attach @link to @dev through dev_xdp_attach(), using the link's own flags
 * and no directly attached program.
 */
static int dev_xdp_attach_link(struct net_device *dev,
                               struct netlink_ext_ack *extack,
                               struct bpf_xdp_link *link)
{
        return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
}

/* Detach @link from @dev. Must be called with RTNL held.
 *
 * Returns -EINVAL if @link is not the link recorded for its mode on @dev,
 * 0 otherwise. A failing uninstall only triggers a warning. @extack is not
 * used.
 */
static int dev_xdp_detach_link(struct net_device *dev,
                               struct netlink_ext_ack *extack,
                               struct bpf_xdp_link *link)
{
        enum bpf_xdp_mode mode;

        ASSERT_RTNL();

        mode = dev_xdp_mode(dev, link->flags);
        if (link != dev_xdp_link(dev, mode))
                return -EINVAL;

        WARN_ON(dev_xdp_install(dev, mode, dev_xdp_bpf_op(dev, mode),
                                NULL, 0, NULL));
        dev_xdp_set_link(dev, mode, NULL);

        return 0;
}

/* bpf_link_ops.release: detach the link from its device, under rtnl_lock. */
static void bpf_xdp_link_release(struct bpf_link *link)
{
        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
        struct net_device *dev;

        rtnl_lock();

        /* When racing with the net_device's tear down the device pointer
         * may already be NULL, meaning the link was auto-detached and
         * there is nothing left to do.
         */
        dev = xdp_link->dev;
        if (dev) {
                WARN_ON(dev_xdp_detach_link(dev, NULL, xdp_link));
                xdp_link->dev = NULL;
        }

        rtnl_unlock();
}

/* bpf_link_ops.detach: same work as release; always reports success. */
static int bpf_xdp_link_detach(struct bpf_link *link)
{
        bpf_xdp_link_release(link);
        return 0;
}

/* bpf_link_ops.dealloc: free the bpf_xdp_link that embeds @link. */
static void bpf_xdp_link_dealloc(struct bpf_link *link)
{
        kfree(container_of(link, struct bpf_xdp_link, link));
}

static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
                                     struct seq_file *seq)
{
        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
        u32 ifindex = 0;

        rtnl_lock();
        if (xdp_link->dev)
                ifindex = xdp_link->dev->ifindex;
        rtnl_unlock();

        seq_printf(seq, "ifindex:\t%u\n", ifindex);
}

static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
                                       struct bpf_link_info *info)
{
        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
        u32 ifindex = 0;

        rtnl_lock();
        if (xdp_link->dev)
                ifindex = xdp_link->dev->ifindex;
        rtnl_unlock();

        info->xdp.ifindex = ifindex;
        return 0;
}

/* bpf_link_ops.update_prog: replace the program behind an attached XDP link.
 *
 * Runs entirely under rtnl_lock. Returns:
 *   -ENOLINK  the link is no longer attached to a device
 *   -EPERM    @old_prog was given and is not the link's current program
 *   -EINVAL   @new_prog differs in type or expected_attach_type
 *   0         on success, or when @new_prog already is the link's program
 *             (in that case one reference on @new_prog is dropped)
 * or the error returned by dev_xdp_install().
 *
 * On success link->prog is switched to @new_prog and the previous program is
 * released with bpf_prog_put().
 */
static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
                               struct bpf_prog *old_prog)
{
        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
        enum bpf_xdp_mode mode;
        bpf_op_t bpf_op;
        int err = 0;

        rtnl_lock();

        /* link might have been auto-released already, so fail */
        if (!xdp_link->dev) {
                err = -ENOLINK;
                goto out_unlock;
        }

        if (old_prog && link->prog != old_prog) {
                err = -EPERM;
                goto out_unlock;
        }
        /* from here on old_prog names the program currently behind the link */
        old_prog = link->prog;
        if (old_prog->type != new_prog->type ||
            old_prog->expected_attach_type != new_prog->expected_attach_type) {
                err = -EINVAL;
                goto out_unlock;
        }

        if (old_prog == new_prog) {
                /* no-op, don't disturb drivers */
                bpf_prog_put(new_prog);
                goto out_unlock;
        }

        mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
        bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
        err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
                              xdp_link->flags, new_prog);
        if (err)
                goto out_unlock;

        /* publish the new program, then release the one it replaced */
        old_prog = xchg(&link->prog, new_prog);
        bpf_prog_put(old_prog);

out_unlock:
        rtnl_unlock();
        return err;
}

/* Link operations for XDP links; passed to bpf_link_init() when a
 * BPF_LINK_TYPE_XDP link is created.
 */
static const struct bpf_link_ops bpf_xdp_link_lops = {
        .release = bpf_xdp_link_release,
        .dealloc = bpf_xdp_link_dealloc,
        .detach = bpf_xdp_link_detach,
        .show_fdinfo = bpf_xdp_link_show_fdinfo,
        .fill_link_info = bpf_xdp_link_fill_link_info,
        .update_prog = bpf_xdp_link_update,
};

/* Create an XDP BPF link for @prog and attach it to the device named by
 * attr->link_create.target_ifindex in the caller's network namespace.
 *
 * Returns the value of bpf_link_settle() (the new link fd) on success, or a
 * negative errno: -EINVAL if the device is not found, -ENOMEM if the link
 * cannot be allocated, or the error from bpf_link_prime() or
 * dev_xdp_attach_link().
 *
 * The device reference obtained from dev_get_by_index() is dropped with
 * dev_put() on every exit path.
 */
int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
        struct net *net = current->nsproxy->net_ns;
        struct bpf_link_primer link_primer;
        struct netlink_ext_ack extack = {};
        struct bpf_xdp_link *link;
        struct net_device *dev;
        int err, fd;

        rtnl_lock();
        dev = dev_get_by_index(net, attr->link_create.target_ifindex);
        if (!dev) {
                rtnl_unlock();
                return -EINVAL;
        }

        link = kzalloc(sizeof(*link), GFP_USER);
        if (!link) {
                err = -ENOMEM;
                goto unlock;
        }

        bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
        link->dev = dev;
        link->flags = attr->link_create.flags;

        err = bpf_link_prime(&link->link, &link_primer);
        if (err) {
                /* priming failed, so the link is still ours to free directly */
                kfree(link);
                goto unlock;
        }

        err = dev_xdp_attach_link(dev, &extack, link);
        rtnl_unlock();

        if (err) {
                /* clear dev first so the release callback sees a detached
                 * link; after a successful prime the link is torn down via
                 * bpf_link_cleanup() rather than kfree()
                 */
                link->dev = NULL;
                bpf_link_cleanup(&link_primer);
                trace_bpf_xdp_link_attach_failed(extack._msg);
                goto out_put_dev;
        }

        fd = bpf_link_settle(&link_primer);
        /* link itself doesn't hold dev's refcnt to not complicate shutdown */
        dev_put(dev);
        return fd;

unlock:
        rtnl_unlock();

out_put_dev:
        dev_put(dev);
        return err;
}

/**
 *        dev_change_xdp_fd - set or clear a bpf program for a device rx path
 *        @dev: device
 *        @extack: netlink extended ack
 *        @fd: new program fd or negative value to clear
 *        @expected_fd: old program fd that userspace expects to replace or clear
 *        @flags: xdp-related flags
 *
 *        Set or clear a bpf program for a device. Must be called with RTNL
 *        held.
 *
 *        Return: 0 on success or a negative errno. On failure the reference
 *        taken on the new program is dropped again; the reference on the
 *        expected program is always dropped before returning.
 */
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
                      int fd, int expected_fd, u32 flags)
{
        enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
        bool not_skb_mode = mode != XDP_MODE_SKB;
        struct bpf_prog *new_prog = NULL;
        struct bpf_prog *old_prog = NULL;
        int err;

        ASSERT_RTNL();

        if (fd >= 0) {
                new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
                                                 not_skb_mode);
                if (IS_ERR(new_prog))
                        return PTR_ERR(new_prog);
        }

        if (expected_fd >= 0) {
                old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
                                                 not_skb_mode);
                if (IS_ERR(old_prog)) {
                        /* nothing was attached: only new_prog needs undoing */
                        err = PTR_ERR(old_prog);
                        if (new_prog)
                                bpf_prog_put(new_prog);
                        return err;
                }
        }

        err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);

        if (err && new_prog)
                bpf_prog_put(new_prog);
        if (old_prog)
                bpf_prog_put(old_prog);

        return err;
}

/**
 * dev_index_reserve() - allocate an ifindex in a namespace
 * @net: the applicable net namespace
 * @ifindex: requested ifindex, pass %0 to get one allocated
 *
 * Allocate an ifindex for a new device. Caller must either use the ifindex
 * to store the device (via list_netdevice()) or call dev_index_release()
 * to give the index up.
 *
 * Return: a suitable unique value for a new device interface number or -errno.
 */
static int dev_index_reserve(struct net *net, u32 ifindex)
{
        int ret;

        if (ifindex > INT_MAX) {
                DEBUG_NET_WARN_ON_ONCE(1);
                return -EINVAL;
        }

        if (ifindex) {
                /* caller asked for a specific index: claim exactly that slot */
                ret = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
        } else {
                /* no preference: let the xarray pick the next index */
                ret = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
                                      xa_limit_31b, &net->ifindex, GFP_KERNEL);
        }

        if (ret < 0)
                return ret;

        return ifindex;
}

/* Give up an ifindex obtained from dev_index_reserve() that was never used
 * to store a device.
 */
static void dev_index_release(struct net *net, int ifindex)
{
        void *old_entry = xa_erase(&net->dev_by_index, ifindex);

        /* Expect only unused indexes, unlist_netdevice() removes the used */
        WARN_ON(old_entry);
}

/* Delayed registration/unregistration */
/* devices are appended to this list by net_set_todo() */
LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
atomic_t dev_unreg_count = ATOMIC_INIT(0);

/* Queue @dev at the tail of net_todo_list for delayed processing. */
static void net_set_todo(struct net_device *dev)
{
        list_add_tail(&dev->todo_list, &net_todo_list);
}

/* Clear from @features every NETIF_F_UPPER_DISABLES bit that @upper does not
 * have in its wanted_features, and return the result. @lower is only used
 * for the debug message.
 */
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
        struct net_device *upper, netdev_features_t features)
{
        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
        netdev_features_t feature;
        int feature_bit;

        for_each_netdev_feature(upper_disables, feature_bit) {
                feature = __NETIF_F_BIT(feature_bit);

                /* nothing to drop if upper wants it or it is already off */
                if ((upper->wanted_features & feature) || !(features & feature))
                        continue;

                netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
                           &feature, upper->name);
                features &= ~feature;
        }

        return features;
}

/* For every NETIF_F_UPPER_DISABLES bit that is off in @features but still on
 * in @lower, stop wanting it on @lower, recompute @lower's features, and
 * either notify about the change or warn if the bit could not be cleared.
 */
static void netdev_sync_lower_features(struct net_device *upper,
        struct net_device *lower, netdev_features_t features)
{
        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
        netdev_features_t feature;
        int feature_bit;

        for_each_netdev_feature(upper_disables, feature_bit) {
                feature = __NETIF_F_BIT(feature_bit);

                /* only act on bits that are off above but still on below */
                if ((features & feature) || !(lower->features & feature))
                        continue;

                netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
                           &feature, lower->name);
                lower->wanted_features &= ~feature;
                __netdev_update_features(lower);

                if (likely(!(lower->features & feature)))
                        netdev_features_change(lower);
                else
                        netdev_WARN(upper, "failed to disable %pNF on %s!\n",
                                    &feature, lower->name);
        }
}

/* Return @features with every bit removed whose prerequisite bits are not
 * also present, or which conflicts with another requested bit.
 *
 * @dev is used for log messages and for dev->gso_partial_features. The
 * checks run top to bottom on the progressively reduced mask, so a bit
 * dropped by an earlier check is seen as absent by the later ones.
 */
static netdev_features_t netdev_fix_features(struct net_device *dev,
        netdev_features_t features)
{
        /* Fix illegal checksum combinations */
        if ((features & NETIF_F_HW_CSUM) &&
            (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
                netdev_warn(dev, "mixed HW and IP checksum settings.\n");
                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
        }

        /* TSO requires that SG is present as well. */
        if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
                netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
                features &= ~NETIF_F_ALL_TSO;
        }

        /* IPv4 TSO needs either generic HW checksumming or IPv4 checksumming */
        if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
                                        !(features & NETIF_F_IP_CSUM)) {
                netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
                features &= ~NETIF_F_TSO;
                features &= ~NETIF_F_TSO_ECN;
        }

        /* IPv6 TSO needs either generic HW checksumming or IPv6 checksumming */
        if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
                                         !(features & NETIF_F_IPV6_CSUM)) {
                netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
                features &= ~NETIF_F_TSO6;
        }

        /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
        if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
                features &= ~NETIF_F_TSO_MANGLEID;

        /* TSO ECN requires that TSO is present as well. */
        if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
                features &= ~NETIF_F_TSO_ECN;

        /* Software GSO depends on SG. */
        if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
                netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
                features &= ~NETIF_F_GSO;
        }

        /* GSO partial features require GSO partial be set */
        if ((features & dev->gso_partial_features) &&
            !(features & NETIF_F_GSO_PARTIAL)) {
                netdev_dbg(dev,
                           "Dropping partially supported GSO features since no GSO partial.\n");
                features &= ~dev->gso_partial_features;
        }

        if (!(features & NETIF_F_RXCSUM)) {
                /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
                 * successfully merged by hardware must also have the
                 * checksum verified by hardware.  If the user does not
                 * want to enable RXCSUM, logically, we should disable GRO_HW.
                 */
                if (features & NETIF_F_GRO_HW) {
                        netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
                        features &= ~NETIF_F_GRO_HW;
                }
        }

        /* LRO/HW-GRO features cannot be combined with RX-FCS */
        if (features & NETIF_F_RXFCS) {
                if (features & NETIF_F_LRO) {
                        netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
                        features &= ~NETIF_F_LRO;
                }

                if (features & NETIF_F_GRO_HW) {
                        netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
                        features &= ~NETIF_F_GRO_HW;
                }
        }

        /* HW-GRO and LRO are mutually exclusive; HW-GRO wins */
        if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
                netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
                features &= ~NETIF_F_LRO;
        }

        /* TLS TX offload needs HW_CSUM, or both IP_CSUM and IPV6_CSUM */
        if (features & NETIF_F_HW_TLS_TX) {
                bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
                        (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
                bool hw_csum = features & NETIF_F_HW_CSUM;

                if (!ip_csum && !hw_csum) {
                        netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
                        features &= ~NETIF_F_HW_TLS_TX;
                }
        }

        /* TLS RX offload needs RX checksumming */
        if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
                netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
                features &= ~NETIF_F_HW_TLS_RX;
        }

        return features;
}

/* Recompute dev->features from the wanted features, the driver's
 * ndo_fix_features, the generic dependency rules and the upper devices,
 * apply the result through ndo_set_features, and propagate disabled bits to
 * lower devices.
 *
 * Must be called with RTNL held.
 *
 * Returns non-zero when the caller should send a features-changed
 * notification (see netdev_update_features()):
 *   -1  ndo_set_features failed
 *    1  the feature set changed and err did not end up negative
 *    0  the computed set equals dev->features, or a VLAN filter sync left
 *       err negative
 */
int __netdev_update_features(struct net_device *dev)
{
        struct net_device *upper, *lower;
        netdev_features_t features;
        struct list_head *iter;
        /* stays -1 when nothing changes, which yields a return value of 0 */
        int err = -1;

        ASSERT_RTNL();

        features = netdev_get_wanted_features(dev);

        if (dev->netdev_ops->ndo_fix_features)
                features = dev->netdev_ops->ndo_fix_features(dev, features);

        /* driver might be less strict about feature dependencies */
        features = netdev_fix_features(dev, features);

        /* some features can't be enabled if they're off on an upper device */
        netdev_for_each_upper_dev_rcu(dev, upper, iter)
                features = netdev_sync_upper_features(dev, upper, features);

        if (dev->features == features)
                goto sync_lower;

        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
                &dev->features, &features);

        if (dev->netdev_ops->ndo_set_features)
                err = dev->netdev_ops->ndo_set_features(dev, features);
        else
                err = 0;

        if (unlikely(err < 0)) {
                netdev_err(dev,
                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
                        err, &features, &dev->features);
                /* return non-0 since some features might have changed and
                 * it's better to fire a spurious notification than miss it
                 */
                return -1;
        }

sync_lower:
        /* some features must be disabled on lower devices when disabled
         * on an upper device (think: bonding master or bridge)
         */
        netdev_for_each_lower_dev(dev, lower, iter)
                netdev_sync_lower_features(dev, lower, features);

        /* only when ndo_set_features returned exactly 0 (or is absent) is
         * dev->features updated here
         */
        if (!err) {
                netdev_features_t diff = features ^ dev->features;

                if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
                        /* udp_tunnel_{get,drop}_rx_info both need
                         * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
                         * device, or they won't do anything.
                         * Thus we need to update dev->features
                         * *before* calling udp_tunnel_get_rx_info,
                         * but *after* calling udp_tunnel_drop_rx_info.
                         */
                        if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
                                dev->features = features;
                                udp_tunnel_get_rx_info(dev);
                        } else {
                                udp_tunnel_drop_rx_info(dev);
                        }
                }

                /* same before/after ordering as above for the VLAN filters */
                if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
                        if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
                                dev->features = features;
                                err |= vlan_get_rx_ctag_filter_info(dev);
                        } else {
                                vlan_drop_rx_ctag_filter_info(dev);
                        }
                }

                if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
                        if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
                                dev->features = features;
                                err |= vlan_get_rx_stag_filter_info(dev);
                        } else {
                                vlan_drop_rx_stag_filter_info(dev);
                        }
                }

                dev->features = features;
        }

        return err < 0 ? 0 : 1;
}

/**
 *        netdev_update_features - recalculate device features
 *        @dev: the device to check
 *
 *        Recalculate dev->features set and send notifications if it
 *        has changed. Should be called after driver or hardware dependent
 *        conditions might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
        int notify = __netdev_update_features(dev);

        /* any non-zero result asks for a notification */
        if (!notify)
                return;

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);

/**
 *        netdev_change_features - recalculate device features
 *        @dev: the device to check
 *
 *        Recalculate dev->features set and send notifications even
 *        if they have not changed. Should be called instead of
 *        netdev_update_features() if also dev->vlan_features might
 *        have changed to allow the changes to be propagated to stacked
 *        VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
        /* result deliberately ignored: the notification is unconditional */
        __netdev_update_features(dev);
        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);

/**
 *        netif_stacked_transfer_operstate -        transfer operstate
 *        @rootdev: the root or lower level device to transfer state from
 *        @dev: the device to transfer operstate to
 *
 *        Transfer operational state from root to device. This is normally
 *        called when a stacking relationship exists between the root
 *        device and the device(a leaf device).
 *
 *        The dormant, testing and carrier states of @dev are each set to
 *        match @rootdev, in that order.
 */
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
                                        struct net_device *dev)
{
        if (rootdev->operstate != IF_OPER_DORMANT)
                netif_dormant_off(dev);
        else
                netif_dormant_on(dev);

        if (rootdev->operstate != IF_OPER_TESTING)
                netif_testing_off(dev);
        else
                netif_testing_on(dev);

        if (!netif_carrier_ok(rootdev))
                netif_carrier_off(dev);
        else
                netif_carrier_on(dev);
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);

/* Allocate dev->_rx with dev->num_rx_queues zeroed entries and register the
 * XDP rxq info of each one.
 *
 * Returns 0 on success, -ENOMEM if the array cannot be allocated, or the
 * negative error from xdp_rxq_info_reg(). On failure dev->_rx is left NULL.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int i, count = dev->num_rx_queues;
        struct netdev_rx_queue *rx;
        size_t sz = count * sizeof(*rx);
        int err = 0;

        BUG_ON(count < 1);

        rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
        if (!rx)
                return -ENOMEM;

        dev->_rx = rx;

        for (i = 0; i < count; i++) {
                rx[i].dev = dev;

                /* XDP RX-queue setup */
                err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
                if (err >= 0)
                        continue;

                /* Roll back the queues registered so far (newest first)
                 * and release the array.
                 */
                while (i--)
                        xdp_rxq_info_unreg(&rx[i].xdp_rxq);
                kvfree(dev->_rx);
                dev->_rx = NULL;
                return err;
        }

        return 0;
}

static void netif_free_rx_queues(struct net_device *dev)
{
        unsigned int i, count = dev->num_rx_queues;

        /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
        if (!dev->_rx)
                return;

        for (i = 0; i < count; i++)
                xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);

        kvfree(dev->_rx);
}

/* Per-queue initialiser, used as the callback of netdev_for_each_tx_queue():
 * sets the back pointer, marks the xmit lock as unowned, clears the NUMA
 * node, sets up the xmit lock and, with CONFIG_BQL, the dql state.
 * @_unused is ignored.
 */
static void netdev_init_one_queue(struct net_device *dev,
                                  struct netdev_queue *queue, void *_unused)
{
        queue->dev = dev;
        queue->xmit_lock_owner = -1;
        netdev_queue_numa_node_write(queue, NUMA_NO_NODE);

        /* The lockdep class is assigned after the lock is initialised. */
        spin_lock_init(&queue->_xmit_lock);
        netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);

#ifdef CONFIG_BQL
        dql_init(&queue->dql, HZ);
#endif
}

/* Free the TX queue array allocated by netif_alloc_netdev_queues(). */
static void netif_free_tx_queues(struct net_device *dev)
{
        kvfree(dev->_tx);
}

/* Allocate and initialise the dev->num_tx_queues TX queues of @dev.
 *
 * Returns -EINVAL when the queue count is zero or above 0xffff, -ENOMEM when
 * the array cannot be allocated, and 0 once dev->_tx is installed, every
 * queue has been run through netdev_init_one_queue() and tx_global_lock is
 * initialised.
 */
static int netif_alloc_netdev_queues(struct net_device *dev)
{
        unsigned int nr_queues = dev->num_tx_queues;
        struct netdev_queue *queues;

        if (nr_queues == 0 || nr_queues > 0xffff)
                return -EINVAL;

        queues = kvzalloc(nr_queues * sizeof(*queues),
                          GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
        if (!queues)
                return -ENOMEM;

        dev->_tx = queues;

        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
        spin_lock_init(&dev->tx_global_lock);

        return 0;
}

/* Call netif_tx_stop_queue() on each of the dev->num_tx_queues TX queues. */
void netif_tx_stop_all_queues(struct net_device *dev)
{
        unsigned int idx = 0;

        while (idx < dev->num_tx_queues) {
                netif_tx_stop_queue(netdev_get_tx_queue(dev, idx));
                idx++;
        }
}
EXPORT_SYMBOL(netif_tx_stop_all_queues);

static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
{
        void __percpu *v;

        /* Drivers implementing ndo_get_peer_dev must support tstat
         * accounting, so that skb_do_redirect() can bump the dev's
         * RX stats upon network namespace switch.
         */
        if (dev->netdev_ops->ndo_get_peer_dev &&
            dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
                return -EOPNOTSUPP;

        switch (dev->pcpu_stat_type) {
        case NETDEV_PCPU_STAT_NONE:
                return 0;
        case NETDEV_PCPU_STAT_LSTATS:
                v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
                break;
        case NETDEV_PCPU_STAT_TSTATS:
                v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
                break;
        case NETDEV_PCPU_STAT_DSTATS:
                v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
                break;
        default:
                return -EINVAL;
        }

        return v ? 0 : -ENOMEM;
}

/* Free the per-cpu stats area that matches dev->pcpu_stat_type. Nothing is
 * freed for NETDEV_PCPU_STAT_NONE or for a type not listed here.
 */
static void netdev_do_free_pcpu_stats(struct net_device *dev)
{
        if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_LSTATS)
                free_percpu(dev->lstats);
        else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS)
                free_percpu(dev->tstats);
        else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS)
                free_percpu(dev->dstats);
}

/**
 * register_netdevice() - register a network device
 * @dev: device to register
 *
 * Take a prepared network device structure and make it externally accessible.
 * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
 * Callers must hold the rtnl lock - you may want register_netdev()
 * instead of this.
 *
 * Return: 0 on success, a negative errno otherwise. A positive return from
 * ndo_init() is reported as -EIO. If the %NETDEV_REGISTER notifier chain
 * reports an error, the device is queued for unregistration with
 * needs_free_netdev cleared before that error is returned.
 */
int register_netdevice(struct net_device *dev)
{
        int ret;
        struct net *net = dev_net(dev);

        BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
                     NETDEV_FEATURE_COUNT);
        BUG_ON(dev_boot_phase);
        ASSERT_RTNL();

        might_sleep();

        /* When net_device's are persistent, this will be fatal. */
        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
        BUG_ON(!net);

        ret = ethtool_check_ops(dev->ethtool_ops);
        if (ret)
                return ret;

        spin_lock_init(&dev->addr_list_lock);
        netdev_set_addr_lockdep_class(dev);

        ret = dev_get_valid_name(net, dev, dev->name);
        if (ret < 0)
                goto out;

        /* Pre-load the error code for the allocation that follows. */
        ret = -ENOMEM;
        dev->name_node = netdev_name_node_head_alloc(dev);
        if (!dev->name_node)
                goto out;

        /* Init, if this function is available */
        if (dev->netdev_ops->ndo_init) {
                ret = dev->netdev_ops->ndo_init(dev);
                if (ret) {
                        /* Only negative errnos are passed on to the caller. */
                        if (ret > 0)
                                ret = -EIO;
                        goto err_free_name;
                }
        }

        /* Advertising VLAN CTAG filtering requires both the add and the
         * kill VID callbacks.
         */
        if (((dev->hw_features | dev->features) &
             NETIF_F_HW_VLAN_CTAG_FILTER) &&
            (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
             !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
                netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
                ret = -EINVAL;
                goto err_uninit;
        }

        ret = netdev_do_alloc_pcpu_stats(dev);
        if (ret)
                goto err_uninit;

        /* On success the return value is the ifindex to use. */
        ret = dev_index_reserve(net, dev->ifindex);
        if (ret < 0)
                goto err_free_pcpu;
        dev->ifindex = ret;

        /* Transfer changeable features to wanted_features and enable
         * software offloads (GSO and GRO).
         */
        dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
        dev->features |= NETIF_F_SOFT_FEATURES;

        if (dev->udp_tunnel_nic_info) {
                dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
                dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
        }

        dev->wanted_features = dev->features & dev->hw_features;

        if (!(dev->flags & IFF_LOOPBACK))
                dev->hw_features |= NETIF_F_NOCACHE_COPY;

        /* If IPv4 TCP segmentation offload is supported we should also
         * allow the device to enable segmenting the frame with the option
         * of ignoring a static IP ID value.  This doesn't enable the
         * feature itself but allows the user to enable it later.
         */
        if (dev->hw_features & NETIF_F_TSO)
                dev->hw_features |= NETIF_F_TSO_MANGLEID;
        if (dev->vlan_features & NETIF_F_TSO)
                dev->vlan_features |= NETIF_F_TSO_MANGLEID;
        if (dev->mpls_features & NETIF_F_TSO)
                dev->mpls_features |= NETIF_F_TSO_MANGLEID;
        if (dev->hw_enc_features & NETIF_F_TSO)
                dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;

        /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
         */
        dev->vlan_features |= NETIF_F_HIGHDMA;

        /* Make NETIF_F_SG inheritable to tunnel devices.
         */
        dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;

        /* Make NETIF_F_SG inheritable to MPLS.
         */
        dev->mpls_features |= NETIF_F_SG;

        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
        ret = notifier_to_errno(ret);
        if (ret)
                goto err_ifindex_release;

        ret = netdev_register_kobject(dev);

        /* reg_state is written for both outcomes before the error check:
         * NETREG_UNREGISTERED on failure, NETREG_REGISTERED on success.
         */
        WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);

        if (ret)
                goto err_uninit_notify;

        __netdev_update_features(dev);

        /*
         * Default initial state at registry is that the
         * device is present.
         */

        set_bit(__LINK_STATE_PRESENT, &dev->state);

        linkwatch_init_dev(dev);

        dev_init_scheduler(dev);

        netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
        list_netdevice(dev);

        add_device_randomness(dev->dev_addr, dev->addr_len);

        /* If the device has permanent device address, driver should
         * set dev_addr and also addr_assign_type should be set to
         * NET_ADDR_PERM (default value).
         */
        if (dev->addr_assign_type == NET_ADDR_PERM)
                memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

        /* Notify protocols, that a new device appeared. */
        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
        ret = notifier_to_errno(ret);
        if (ret) {
                /* Expect explicit free_netdev() on failure */
                dev->needs_free_netdev = false;
                unregister_netdevice_queue(dev, NULL);
                goto out;
        }
        /*
         * Prevent userspace races by waiting until the network
         * device is fully setup before sending notifications.
         */
        if (!dev->rtnl_link_ops ||
            dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);

out:
        return ret;

        /* Unwind labels: each one undoes its own step and falls through
         * into the labels for the steps taken before it.
         */
err_uninit_notify:
        call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
err_ifindex_release:
        dev_index_release(net, dev->ifindex);
err_free_pcpu:
        netdev_do_free_pcpu_stats(dev);
err_uninit:
        if (dev->netdev_ops->ndo_uninit)
                dev->netdev_ops->ndo_uninit(dev);
        if (dev->priv_destructor)
                dev->priv_destructor(dev);
err_free_name:
        netdev_name_node_free(dev->name_node);
        goto out;
}
EXPORT_SYMBOL(register_netdevice);

/* Core initialisation of a dummy net device.
 *
 * Nothing here clears the structure, so this can be called on a device
 * obtained from alloc_netdev() without wiping the fields already set up.
 */
static void init_dummy_netdev_core(struct net_device *dev)
{
        /* A dummy interface is considered present and started by default. */
        set_bit(__LINK_STATE_PRESENT, &dev->state);
        set_bit(__LINK_STATE_START, &dev->state);

        /* NAPI needs a valid (empty) list head. */
        INIT_LIST_HEAD(&dev->napi_list);

        /* napi_busy_loop stats accounting needs a netns. */
        dev_net_set(dev, &init_net);

        /* NETREG_DUMMY makes sure we BUG if this device ever reaches the
         * standard register/unregister code path.
         */
        dev->reg_state = NETREG_DUMMY;

        /* Note: pcpu_refcnt is not allocated for dummy devices, because
         * users of this 'device' do not need to change its refcount.
         */
}

/**
 * init_dummy_netdev - init a dummy network device for NAPI
 * @dev: device to init
 *
 * Zero the whole network device structure and initialize the minimum set
 * of fields needed to schedule NAPI polls without registering a full blown
 * interface. Intended for drivers that have to tie several hardware
 * interfaces to a single NAPI poll scheduler due to HW limitations.
 */
void init_dummy_netdev(struct net_device *dev)
{
        /* Wipe everything first. Spinlocks are deliberately left
         * uninitialized: the NAPI code is not supposed to take any of
         * them, and NAPI polling is the only intended use of this dummy
         * netdev.
         */
        memset(dev, 0, sizeof(*dev));

        init_dummy_netdev_core(dev);
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);

/**
 * register_netdev - register a network device
 * @dev: device to register
 *
 * Take a completed network device structure and add it to the kernel
 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 * chain.
 *
 * This is a wrapper around register_netdevice() that takes the rtnl
 * semaphore and expands the device name if you passed a format string to
 * alloc_netdev.
 *
 * Return: 0 on success; -EINTR if taking the rtnl lock was interrupted;
 * otherwise the negative errno from register_netdevice(), e.g. on a failure
 * to set up the device or if the name is a duplicate.
 */
int register_netdev(struct net_device *dev)
{
        int ret = -EINTR;

        if (!rtnl_lock_killable()) {
                ret = register_netdevice(dev);
                rtnl_unlock();
        }

        return ret;
}
EXPORT_SYMBOL(register_netdev);

/* Return the current reference count of @dev: the sum of the per-cpu
 * counters with CONFIG_PCPU_DEV_REFCNT, the value of dev_refcnt otherwise.
 */
int netdev_refcnt_read(const struct net_device *dev)
{
#ifdef CONFIG_PCPU_DEV_REFCNT
        int cpu, total = 0;

        for_each_possible_cpu(cpu) {
                const int *slot = per_cpu_ptr(dev->pcpu_refcnt, cpu);

                total += *slot;
        }
        return total;
#else
        return refcount_read(&dev->dev_refcnt);
#endif
}
EXPORT_SYMBOL(netdev_refcnt_read);

/* Seconds between two "waiting for ... to become free" warnings emitted by
 * netdev_wait_allrefs_any().
 */
int netdev_unregister_timeout_secs __read_mostly = 10;

/* Lower and upper bound, in milliseconds, of the sleep between two
 * reference count polls in netdev_wait_allrefs_any().
 */
#define WAIT_REFS_MIN_MSECS 1
#define WAIT_REFS_MAX_MSECS 250
/**
 * netdev_wait_allrefs_any - wait until all references are gone.
 * @list: list of net_devices to wait on
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 *
 * Return: the first device found on @list whose reference count, as read
 * by netdev_refcnt_read(), is 1. The function loops until one exists.
 */
static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
{
        unsigned long rebroadcast_time, warning_time;
        struct net_device *dev;
        int wait = 0;

        rebroadcast_time = warning_time = jiffies;

        /* Fast path: a device may already be down to its last reference. */
        list_for_each_entry(dev, list, todo_list)
                if (netdev_refcnt_read(dev) == 1)
                        return dev;

        while (true) {
                /* At most once per second (HZ jiffies), remind everybody. */
                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
                        rtnl_lock();

                        /* Rebroadcast unregister notification */
                        list_for_each_entry(dev, list, todo_list)
                                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

                        /* rcu_barrier() is called with the rtnl lock dropped. */
                        __rtnl_unlock();
                        rcu_barrier();
                        rtnl_lock();

                        list_for_each_entry(dev, list, todo_list)
                                if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
                                             &dev->state)) {
                                        /* We must not have linkwatch events
                                         * pending on unregister. If this
                                         * happens, we simply run the queue
                                         * unscheduled, resulting in a noop
                                         * for this device.
                                         */
                                        linkwatch_run_queue();
                                        break;
                                }

                        __rtnl_unlock();

                        rebroadcast_time = jiffies;
                }

                rcu_barrier();

                /* No sleep on the first pass; afterwards sleep and double
                 * the delay, capped at WAIT_REFS_MAX_MSECS.
                 */
                if (!wait) {
                        wait = WAIT_REFS_MIN_MSECS;
                } else {
                        msleep(wait);
                        wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
                }

                list_for_each_entry(dev, list, todo_list)
                        if (netdev_refcnt_read(dev) == 1)
                                return dev;

                /* Still waiting: report every device on the list, then
                 * restart the warning timer.
                 */
                if (time_after(jiffies, warning_time +
                               READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
                        list_for_each_entry(dev, list, todo_list) {
                                pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
                                         dev->name, netdev_refcnt_read(dev));
                                ref_tracker_dir_print(&dev->refcnt_tracker, 10);
                        }

                        warning_time = jiffies;
                }
        }
}

/* The sequence is:
 *
 *        rtnl_lock();
 *        ...
 *        register_netdevice(x1);
 *        register_netdevice(x2);
 *        ...
 *        unregister_netdevice(y1);
 *        unregister_netdevice(y2);
 *        ...
 *        rtnl_unlock();
 *        free_netdev(y1);
 *        free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 */
void netdev_run_todo(void)
{
        struct net_device *dev, *tmp;
        struct list_head list;
        int cnt;
#ifdef CONFIG_LOCKDEP
        struct list_head unlink_list;

        /* Drain net_unlink_list, recomputing nested_level from lower_level
         * for every device that was queued on it.
         */
        list_replace_init(&net_unlink_list, &unlink_list);

        while (!list_empty(&unlink_list)) {
                struct net_device *dev = list_first_entry(&unlink_list,
                                                          struct net_device,
                                                          unlink_list);
                list_del_init(&dev->unlink_list);
                dev->nested_level = dev->lower_level - 1;
        }
#endif

        /* Snapshot list, allow later requests */
        list_replace_init(&net_todo_list, &list);

        __rtnl_unlock();

        /* Wait for rcu callbacks to finish before next phase */
        if (!list_empty(&list))
                rcu_barrier();

        /* First pass: move every device to NETREG_UNREGISTERED. Devices
         * that are not in NETREG_UNREGISTERING are warned about and
         * dropped from the list.
         */
        list_for_each_entry_safe(dev, tmp, &list, todo_list) {
                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
                        netdev_WARN(dev, "run_todo but not unregistering\n");
                        list_del(&dev->todo_list);
                        continue;
                }

                WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
                linkwatch_sync_dev(dev);
        }

        /* Second pass: release devices one by one, in whatever order they
         * drop to a single remaining reference.
         */
        cnt = 0;
        while (!list_empty(&list)) {
                dev = netdev_wait_allrefs_any(&list);
                list_del(&dev->todo_list);

                /* paranoia */
                BUG_ON(netdev_refcnt_read(dev) != 1);
                BUG_ON(!list_empty(&dev->ptype_all));
                BUG_ON(!list_empty(&dev->ptype_specific));
                WARN_ON(rcu_access_pointer(dev->ip_ptr));
                WARN_ON(rcu_access_pointer(dev->ip6_ptr));

                netdev_do_free_pcpu_stats(dev);
                if (dev->priv_destructor)
                        dev->priv_destructor(dev);
                if (dev->needs_free_netdev)
                        free_netdev(dev);

                cnt++;

                /* Free network device */
                kobject_put(&dev->dev.kobj);
        }
        /* Subtract the released devices from dev_unreg_count and wake the
         * waiters on netdev_unregistering_wq when it reaches zero.
         */
        if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
                wake_up(&netdev_unregistering_wq);
}

/* Collate per-cpu network dstats statistics
 *
 * Walk every possible CPU, take a consistent snapshot of that CPU's
 * counters in @dstats and add them to the matching fields of @s. @s is
 * accumulated into, not reset.
 */
static void dev_fetch_dstats(struct rtnl_link_stats64 *s,
                             const struct pcpu_dstats __percpu *dstats)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                const struct pcpu_dstats *pcpu = per_cpu_ptr(dstats, cpu);
                u64 rx_pkts, rx_octets, rx_dropped;
                u64 tx_pkts, tx_octets, tx_dropped;
                unsigned int seq;

                /* Re-read if a writer updated the counters meanwhile. */
                do {
                        seq = u64_stats_fetch_begin(&pcpu->syncp);
                        rx_pkts    = u64_stats_read(&pcpu->rx_packets);
                        rx_octets  = u64_stats_read(&pcpu->rx_bytes);
                        rx_dropped = u64_stats_read(&pcpu->rx_drops);
                        tx_pkts    = u64_stats_read(&pcpu->tx_packets);
                        tx_octets  = u64_stats_read(&pcpu->tx_bytes);
                        tx_dropped = u64_stats_read(&pcpu->tx_drops);
                } while (u64_stats_fetch_retry(&pcpu->syncp, seq));

                s->rx_packets += rx_pkts;
                s->rx_bytes   += rx_octets;
                s->rx_dropped += rx_dropped;
                s->tx_packets += tx_pkts;
                s->tx_bytes   += tx_octets;
                s->tx_dropped += tx_dropped;
        }
}

/* ndo_get_stats64 implementation for dtstats-based accounting.
 *
 * Populate @s from dev->stats and dev->dstats. This is used internally by the
 * core for NETDEV_PCPU_STAT_DSTAT-type stats collection.
 */
static void dev_get_dstats64(const struct net_device *dev,
                             struct rtnl_link_stats64 *s)
{
        netdev_stats_to_stats64(s, &dev->stats);
        dev_fetch_dstats(s, dev->dstats);
}

/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
 * all the same fields in the same order as net_device_stats, with only
 * the type differing, but rtnl_link_stats64 may have additional fields
 * at the end for newer counters.
 *
 * @stats64 is fully overwritten: the leading counters are copied from
 * @netdev_stats and everything after them is zeroed.
 */
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
                             const struct net_device_stats *netdev_stats)
{
        /* Both structures are walked as flat arrays of counters; n is the
         * number of atomic_long_t sized slots in net_device_stats.
         */
        size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
        const atomic_long_t *src = (atomic_long_t *)netdev_stats;
        u64 *dst = (u64 *)stats64;

        /* The destination must have room for every source counter. */
        BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
        /* The value is cast to unsigned long before being stored as u64. */
        for (i = 0; i < n; i++)
                dst[i] = (unsigned long)atomic_long_read(&src[i]);
        /* zero out counters that only exist in rtnl_link_stats64 */
        memset((char *)stats64 + n * sizeof(u64), 0,
               sizeof(*stats64) - n * sizeof(u64));
}
EXPORT_SYMBOL(netdev_stats_to_stats64);

/* Allocate the per-cpu core stats of @dev on first use.
 *
 * A fresh area is allocated and installed with cmpxchg() only if
 * dev->core_stats is still NULL; if another value is already there the
 * fresh area is freed again. Returns whatever dev->core_stats holds
 * afterwards, which is NULL if the allocation failed and nobody else
 * installed one.
 */
static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
                struct net_device *dev)
{
        struct net_device_core_stats __percpu *fresh;

        fresh = alloc_percpu_gfp(struct net_device_core_stats,
                                 GFP_ATOMIC | __GFP_NOWARN);
        if (fresh) {
                /* A non-NULL old value means the slot was already taken. */
                if (cmpxchg(&dev->core_stats, NULL, fresh))
                        free_percpu(fresh);
        }

        /* This READ_ONCE() pairs with the cmpxchg() above */
        return READ_ONCE(dev->core_stats);
}

/* Increment, on the current CPU, the unsigned long counter located @offset
 * bytes into the per-cpu core stats of @dev. The stats area is allocated on
 * demand; if that allocation fails the increment is silently dropped.
 */
noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
{
        struct net_device_core_stats __percpu *stats;
        unsigned long __percpu *counter;

        /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
        stats = READ_ONCE(dev->core_stats);
        if (unlikely(!stats)) {
                stats = netdev_core_stats_alloc(dev);
                if (!stats)
                        return;
        }

        counter = (__force unsigned long __percpu *)((__force void *)stats + offset);
        this_cpu_inc(*counter);
}
EXPORT_SYMBOL_GPL(netdev_core_stats_inc);

/**
 *        dev_get_stats        - get network device statistics
 *        @dev: device to get statistics from
 *        @storage: place to store stats
 *
 *        Get network statistics from device. Return @storage.
 *        The device driver may provide its own method by setting
 *        dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
 *        otherwise the internal statistics structure is used.
 */
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
                                        struct rtnl_link_stats64 *storage)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        const struct net_device_core_stats __percpu *p;

        if (ops->ndo_get_stats64) {
                memset(storage, 0, sizeof(*storage));
                ops->ndo_get_stats64(dev, storage);
        } else if (ops->ndo_get_stats) {
                netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
        } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
                dev_get_tstats64(dev, storage);
        } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) {
                dev_get_dstats64(dev, storage);
        } else {
                netdev_stats_to_stats64(storage, &dev->stats);
        }

        /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
        p = READ_ONCE(dev->core_stats);
        if (p) {
                const struct net_device_core_stats *core_stats;
                int i;

                for_each_possible_cpu(i) {
                        core_stats = per_cpu_ptr(p, i);
                        storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
                        storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
                        storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
                        storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
                }
        }
        return storage;
}
EXPORT_SYMBOL(dev_get_stats);

/**
 * dev_fetch_sw_netstats - get per-cpu network device statistics
 * @s: place to store stats
 * @netstats: per-cpu network stats to read from
 *
 * Walk every possible CPU, take a consistent snapshot of its packet and
 * byte counters in @netstats and add them to the related fields in @s.
 * @s is accumulated into, not reset.
 */
void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
                           const struct pcpu_sw_netstats __percpu *netstats)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                const struct pcpu_sw_netstats *pcpu = per_cpu_ptr(netstats, cpu);
                u64 rx_pkts, rx_octets, tx_pkts, tx_octets;
                unsigned int seq;

                /* Re-read if a writer updated the counters meanwhile. */
                do {
                        seq = u64_stats_fetch_begin(&pcpu->syncp);
                        rx_pkts   = u64_stats_read(&pcpu->rx_packets);
                        rx_octets = u64_stats_read(&pcpu->rx_bytes);
                        tx_pkts   = u64_stats_read(&pcpu->tx_packets);
                        tx_octets = u64_stats_read(&pcpu->tx_bytes);
                } while (u64_stats_fetch_retry(&pcpu->syncp, seq));

                s->rx_packets += rx_pkts;
                s->rx_bytes   += rx_octets;
                s->tx_packets += tx_pkts;
                s->tx_bytes   += tx_octets;
        }
}
EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);

/**
 *        dev_get_tstats64 - ndo_get_stats64 implementation
 *        @dev: device to get statistics from
 *        @s: place to store stats
 *
 *        Populate @s from dev->stats and dev->tstats. Can be used as
 *        ndo_get_stats64() callback.
 */
void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
{
        netdev_stats_to_stats64(s, &dev->stats);
        dev_fetch_sw_netstats(s, dev->tstats);
}
EXPORT_SYMBOL_GPL(dev_get_tstats64);

/* Return the ingress queue of @dev, creating it first if necessary.
 *
 * With CONFIG_NET_CLS_ACT a missing queue is allocated, initialised with
 * noop_qdisc as both its qdisc and qdisc_sleeping, and published through
 * dev->ingress_queue; NULL is returned if the allocation fails. Without
 * CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        if (!queue) {
                queue = kzalloc(sizeof(*queue), GFP_KERNEL);
                if (queue) {
                        netdev_init_one_queue(dev, queue, NULL);
                        RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
                        RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc);
                        /* Publish only once the queue is fully set up. */
                        rcu_assign_pointer(dev->ingress_queue, queue);
                }
        }
#endif
        return queue;
}

/* Empty ethtool_ops used as a placeholder for devices that set none. */
static const struct ethtool_ops default_ethtool_ops;

/* Install @ops as the ethtool operations of @dev, but only if the device
 * is still pointing at the default_ethtool_ops placeholder.
 */
void netdev_set_default_ethtool_ops(struct net_device *dev,
                                    const struct ethtool_ops *ops)
{
        if (dev->ethtool_ops != &default_ethtool_ops)
                return;

        dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);

/**
 * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
 * @dev: netdev to enable the IRQ coalescing on
 *
 * Sets a conservative default for SW IRQ coalescing. Users can use
 * sysfs attributes to override the default values. Warns if the device is
 * already registered. Nothing is changed when CONFIG_PREEMPT_RT is enabled.
 */
void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
{
        WARN_ON(dev->reg_state == NETREG_REGISTERED);

        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                return;

        dev->gro_flush_timeout = 20000;
        dev->napi_defer_hard_irqs = 1;
}
EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);

/* Free the memory backing @dev. The allocation starts dev->padded bytes
 * before @dev, so step back by that amount to recover the pointer to free.
 */
void netdev_freemem(struct net_device *dev)
{
        kvfree((char *)dev - dev->padded);
}

/**
 * alloc_netdev_mqs - allocate network device
 * @sizeof_priv: size of private data to allocate space for
 * @name: device name format string
 * @name_assign_type: origin of device name
 * @setup: callback to initialize device
 * @txqs: the number of TX subqueues to allocate
 * @rxqs: the number of RX subqueues to allocate
 *
 * Allocates a struct net_device with private data area for driver use
 * and performs basic initialization.  Also allocates subqueue structs
 * for each queue on the device.
 *
 * @name must be shorter than the device name buffer (this is enforced with
 * BUG_ON()), and both @txqs and @rxqs must be at least 1.
 *
 * Return: the new net_device, or NULL if a queue count is zero or any
 * allocation fails.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
                unsigned char name_assign_type,
                void (*setup)(struct net_device *),
                unsigned int txqs, unsigned int rxqs)
{
        struct net_device *dev;
        unsigned int alloc_size;
        struct net_device *p;

        BUG_ON(strlen(name) >= sizeof(dev->name));

        if (txqs < 1) {
                pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
                return NULL;
        }

        if (rxqs < 1) {
                pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
                return NULL;
        }

        alloc_size = sizeof(struct net_device);
        if (sizeof_priv) {
                /* ensure 32-byte alignment of private area */
                alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
                alloc_size += sizeof_priv;
        }
        /* ensure 32-byte alignment of whole construct */
        alloc_size += NETDEV_ALIGN - 1;

        p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
        if (!p)
                return NULL;

        /* Remember how far the aligned device sits from the start of the
         * allocation; netdev_freemem() uses it to recover the pointer.
         */
        dev = PTR_ALIGN(p, NETDEV_ALIGN);
        dev->padded = (char *)dev - (char *)p;

        ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
#ifdef CONFIG_PCPU_DEV_REFCNT
        dev->pcpu_refcnt = alloc_percpu(int);
        if (!dev->pcpu_refcnt)
                goto free_dev;
        __dev_hold(dev);
#else
        refcount_set(&dev->dev_refcnt, 1);
#endif

        if (dev_addr_init(dev))
                goto free_pcpu;

        dev_mc_init(dev);
        dev_uc_init(dev);

        dev_net_set(dev, &init_net);

        /* Defaults, set before setup() is called on the device. */
        dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
        dev->xdp_zc_max_segs = 1;
        dev->gso_max_segs = GSO_MAX_SEGS;
        dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
        dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
        dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
        dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
        dev->tso_max_segs = TSO_MAX_SEGS;
        dev->upper_level = 1;
        dev->lower_level = 1;
#ifdef CONFIG_LOCKDEP
        dev->nested_level = 0;
        INIT_LIST_HEAD(&dev->unlink_list);
#endif

        INIT_LIST_HEAD(&dev->napi_list);
        INIT_LIST_HEAD(&dev->unreg_list);
        INIT_LIST_HEAD(&dev->close_list);
        INIT_LIST_HEAD(&dev->link_watch_list);
        INIT_LIST_HEAD(&dev->adj_list.upper);
        INIT_LIST_HEAD(&dev->adj_list.lower);
        INIT_LIST_HEAD(&dev->ptype_all);
        INIT_LIST_HEAD(&dev->ptype_specific);
        INIT_LIST_HEAD(&dev->net_notifier_list);
#ifdef CONFIG_NET_SCHED
        hash_init(dev->qdisc_hash);
#endif
        dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
        setup(dev);

        /* A zero tx_queue_len after setup() marks the device IFF_NO_QUEUE;
         * the length itself is then set to the default.
         */
        if (!dev->tx_queue_len) {
                dev->priv_flags |= IFF_NO_QUEUE;
                dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
        }

        dev->num_tx_queues = txqs;
        dev->real_num_tx_queues = txqs;
        if (netif_alloc_netdev_queues(dev))
                goto free_all;

        dev->num_rx_queues = rxqs;
        dev->real_num_rx_queues = rxqs;
        if (netif_alloc_rx_queues(dev))
                goto free_all;

        /* The BUG_ON() at the top guarantees that @name fits. */
        strcpy(dev->name, name);
        dev->name_assign_type = name_assign_type;
        dev->group = INIT_NETDEV_GROUP;
        if (!dev->ethtool_ops)
                dev->ethtool_ops = &default_ethtool_ops;

        nf_hook_netdev_init(dev);

        return dev;

        /* From the queue allocations onwards, free_netdev() does the
         * teardown.
         */
free_all:
        free_netdev(dev);
        return NULL;

        /* Earlier failures are unwound by hand. */
free_pcpu:
#ifdef CONFIG_PCPU_DEV_REFCNT
        free_percpu(dev->pcpu_refcnt);
free_dev:
#endif
        netdev_freemem(dev);
        return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);

/**
 * free_netdev - free network device
 * @dev: device
 *
 * This function does the last stage of destroying an allocated device
 * interface. The reference to the device object is released. If this
 * is the last reference then it will be freed. Must be called in process
 * context.
 *
 * If the device is still in the %NETREG_UNREGISTERING state, nothing is
 * freed here; needs_free_netdev is set instead so that the free happens
 * later.
 */
void free_netdev(struct net_device *dev)
{
        struct napi_struct *p, *n;

        might_sleep();

        /* When called immediately after register_netdevice() failed the unwind
         * handling may still be dismantling the device. Handle that case by
         * deferring the free.
         */
        if (dev->reg_state == NETREG_UNREGISTERING) {
                ASSERT_RTNL();
                dev->needs_free_netdev = true;
                return;
        }

        netif_free_tx_queues(dev);
        netif_free_rx_queues(dev);

        kfree(rcu_dereference_protected(dev->ingress_queue, 1));

        /* Flush device addresses */
        dev_addr_flush(dev);

        /* Delete every NAPI instance still on the device's napi_list. */
        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
                netif_napi_del(p);

        ref_tracker_dir_exit(&dev->refcnt_tracker);
#ifdef CONFIG_PCPU_DEV_REFCNT
        free_percpu(dev->pcpu_refcnt);
        dev->pcpu_refcnt = NULL;
#endif
        free_percpu(dev->core_stats);
        dev->core_stats = NULL;
        free_percpu(dev->xdp_bulkq);
        dev->xdp_bulkq = NULL;

        /* Compatibility with error handling in drivers: a device that is
         * still NETREG_UNINITIALIZED, or a dummy device, has its memory
         * freed directly.
         */
        if (dev->reg_state == NETREG_UNINITIALIZED ||
            dev->reg_state == NETREG_DUMMY) {
                netdev_freemem(dev);
                return;
        }

        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
        WRITE_ONCE(dev->reg_state, NETREG_RELEASED);

        /* will free via device release */
        put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);

/**
 * alloc_netdev_dummy - Allocate and initialize a dummy net device.
 * @sizeof_priv: size of private data to allocate space for
 *
 * The device is allocated through alloc_netdev() with the "dummy#" name
 * template and init_dummy_netdev_core() as its setup callback.
 *
 * Return: the allocated net_device on success, NULL otherwise
 */
struct net_device *alloc_netdev_dummy(int sizeof_priv)
{
        struct net_device *dummy;

        dummy = alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
                             init_dummy_netdev_core);

        return dummy;
}
EXPORT_SYMBOL_GPL(alloc_netdev_dummy);

/**
 * synchronize_net - Synchronize with packet receive processing
 *
 * Wait for packets currently being received to be done.
 * Does not block later packets from starting.
 *
 * The expedited RCU grace period is used when the rtnl lock is held,
 * the regular one otherwise.
 */
void synchronize_net(void)
{
        might_sleep();

        if (!rtnl_is_locked()) {
                synchronize_rcu();
                return;
        }

        synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);

/**
 * unregister_netdevice_queue - remove device from the kernel
 * @dev: device
 * @head: list
 *
 * Shuts down a device interface and removes it from the kernel tables.
 * When @head is not NULL the device is only queued on it, to be
 * unregistered later; otherwise it is unregistered right away through
 * a one-entry list.
 *
 * Callers must hold the rtnl semaphore. You may want
 * unregister_netdev() instead of this.
 */
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
        LIST_HEAD(single);

        ASSERT_RTNL();

        if (head) {
                list_move_tail(&dev->unreg_list, head);
                return;
        }

        list_add(&dev->unreg_list, &single);
        unregister_netdevice_many(&single);
}
EXPORT_SYMBOL(unregister_netdevice_queue);

/*
 * unregister_netdevice_many_notify - unregister every device on a list
 * @head: list of devices, linked through their unreg_list member
 * @portid: handed to the RTM_DELLINK message build/send helpers
 * @nlh: handed to the RTM_DELLINK message build/send helpers
 *
 * Must be called with the rtnl lock held and after the boot phase.
 * Devices that were never registered are warned about and dropped from
 * the list; every other device must be in NETREG_REGISTERED state.
 * The remaining devices are closed, unlisted, dismantled and finally
 * handed to net_set_todo(). @head itself is list_del()'ed before return.
 */
void unregister_netdevice_many_notify(struct list_head *head,
                                      u32 portid, const struct nlmsghdr *nlh)
{
        struct net_device *dev, *tmp;
        LIST_HEAD(close_head);
        int cnt = 0;

        BUG_ON(dev_boot_phase);
        ASSERT_RTNL();

        if (list_empty(head))
                return;

        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
                /* Some devices call without registering
                 * for initialization unwind. Remove those
                 * devices and proceed with the remaining.
                 */
                if (dev->reg_state == NETREG_UNINITIALIZED) {
                        pr_debug("unregister_netdevice: device %s/%p never was registered\n",
                                 dev->name, dev);

                        WARN_ON(1);
                        list_del(&dev->unreg_list);
                        continue;
                }
                dev->dismantle = true;
                BUG_ON(dev->reg_state != NETREG_REGISTERED);
        }

        /* If device is running, close it first. */
        list_for_each_entry(dev, head, unreg_list)
                list_add_tail(&dev->close_list, &close_head);
        dev_close_many(&close_head, true);

        list_for_each_entry(dev, head, unreg_list) {
                /* And unlink it from device chain. */
                unlist_netdevice(dev);
                WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
        }
        flush_all_backlogs();

        synchronize_net();

        list_for_each_entry(dev, head, unreg_list) {
                /* RTM_DELLINK message; built before the address lists and
                 * name nodes are flushed, sent after ndo_uninit has run.
                 */
                struct sk_buff *skb = NULL;

                /* Shutdown queueing discipline. */
                dev_shutdown(dev);
                dev_tcx_uninstall(dev);
                dev_xdp_uninstall(dev);
                bpf_dev_bound_netdev_unregister(dev);

                netdev_offload_xstats_disable_all(dev);

                /* Notify protocols that we are about to destroy
                 * this device. They should clean up everything.
                 */
                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

                if (!dev->rtnl_link_ops ||
                    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
                        skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
                                                     GFP_KERNEL, NULL, 0,
                                                     portid, nlh);

                /*
                 *        Flush the unicast and multicast chains
                 */
                dev_uc_flush(dev);
                dev_mc_flush(dev);

                netdev_name_node_alt_flush(dev);
                netdev_name_node_free(dev->name_node);

                call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);

                if (dev->netdev_ops->ndo_uninit)
                        dev->netdev_ops->ndo_uninit(dev);

                if (skb)
                        rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);

                /* The notifier chain MUST have detached all upper and
                 * lower devices from us by now.
                 */
                WARN_ON(netdev_has_any_upper_dev(dev));
                WARN_ON(netdev_has_any_lower_dev(dev));

                /* Remove entries from kobject tree */
                netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
                /* Remove XPS queueing entries */
                netif_reset_xps_queues_gt(dev, 0);
#endif
        }

        synchronize_net();

        list_for_each_entry(dev, head, unreg_list) {
                /* Drop the dev_registered_tracker reference and queue the
                 * device through net_set_todo().
                 */
                netdev_put(dev, &dev->dev_registered_tracker);
                net_set_todo(dev);
                cnt++;
        }
        atomic_add(cnt, &dev_unreg_count);

        list_del(head);
}

/**
 *        unregister_netdevice_many - unregister many devices
 *        @head: list of devices
 *
 *  Calls unregister_netdevice_many_notify() with a zero portid and a
 *  NULL netlink header.
 *
 *  Note: As most callers use a stack allocated list_head,
 *  we force a list_del() to make sure the stack won't be corrupted later.
 */
void unregister_netdevice_many(struct list_head *head)
{
        unregister_netdevice_many_notify(head, 0, NULL);
}
EXPORT_SYMBOL(unregister_netdevice_many);

/**
 *        unregister_netdev - remove device from the kernel
 *        @dev: device
 *
 *        This function shuts down a device interface and removes it
 *        from the kernel tables.
 *
 *        This is just a wrapper for unregister_netdevice() that takes
 *        the rtnl semaphore around the call.  In general you want to use
 *        this and not unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
        rtnl_lock();
        unregister_netdevice(dev);
        rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);

/**
 *        __dev_change_net_namespace - move device to different network namespace
 *        @dev: device
 *        @net: network namespace
 *        @pat: If not NULL name pattern to try if the current device name
 *              is already taken in the destination network namespace.
 *        @new_ifindex: If not zero, specifies device index in the target
 *                      namespace.
 *
 *        This function shuts down a device interface and moves it
 *        to a new network namespace. On success 0 is returned, on
 *        a failure a negative errno code is returned.
 *
 *        All checks that can fail (device state, name and altname
 *        conflicts, ifindex reservation) are done before the device is
 *        touched; moving to the namespace the device is already in is a
 *        successful no-op.
 *
 *        Callers must hold the rtnl semaphore.
 */

int __dev_change_net_namespace(struct net_device *dev, struct net *net,
                               const char *pat, int new_ifindex)
{
        struct netdev_name_node *name_node;
        struct net *net_old = dev_net(dev);
        /* Stays empty unless a replacement name had to be prepared */
        char new_name[IFNAMSIZ] = {};
        int err, new_nsid;

        ASSERT_RTNL();

        /* Don't allow namespace local devices to be moved. */
        err = -EINVAL;
        if (dev->features & NETIF_F_NETNS_LOCAL)
                goto out;

        /* Ensure the device has been registered */
        if (dev->reg_state != NETREG_REGISTERED)
                goto out;

        /* Get out if there is nothing to do */
        err = 0;
        if (net_eq(net_old, net))
                goto out;

        /* Pick the destination device name, and ensure
         * we can use it in the destination network namespace.
         */
        err = -EEXIST;
        if (netdev_name_in_use(net, dev->name)) {
                /* We get here if we can't use the current device name */
                if (!pat)
                        goto out;
                err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
                if (err < 0)
                        goto out;
        }
        /* Check that none of the altnames conflicts. */
        err = -EEXIST;
        netdev_for_each_altname(dev, name_node)
                if (netdev_name_in_use(net, name_node->name))
                        goto out;

        /* Check that new_ifindex isn't used yet. */
        if (new_ifindex) {
                err = dev_index_reserve(net, new_ifindex);
                if (err < 0)
                        goto out;
        } else {
                /* If there is an ifindex conflict assign a new one */
                err = dev_index_reserve(net, dev->ifindex);
                if (err == -EBUSY)
                        err = dev_index_reserve(net, 0);
                if (err < 0)
                        goto out;
                /* A non-negative return value is used as the reserved index */
                new_ifindex = err;
        }

        /*
         * And now a mini version of register_netdevice unregister_netdevice.
         */

        /* If device is running close it first. */
        dev_close(dev);

        /* And unlink it from device chain */
        unlist_netdevice(dev);

        synchronize_net();

        /* Shutdown queueing discipline. */
        dev_shutdown(dev);

        /* Notify protocols that we are about to destroy
         * this device. They should clean up everything.
         *
         * Note that dev->reg_state stays at NETREG_REGISTERED.
         * This is wanted because this way 8021q and macvlan know
         * the device is just moving and can keep their slaves up.
         */
        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
        rcu_barrier();

        new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);

        /* Announce the removal, passing the new namespace id and ifindex */
        rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
                            new_ifindex);

        /*
         *        Flush the unicast and multicast chains
         */
        dev_uc_flush(dev);
        dev_mc_flush(dev);

        /* Send a netdev-removed uevent to the old namespace */
        kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
        netdev_adjacent_del_links(dev);

        /* Move per-net netdevice notifiers that are following the netdevice */
        move_netdevice_notifiers_dev_net(dev, net);

        /* Actually switch the network namespace */
        dev_net_set(dev, net);
        dev->ifindex = new_ifindex;

        if (new_name[0]) {
                /* Rename the netdev to prepared name */
                write_seqlock(&netdev_rename_lock);
                strscpy(dev->name, new_name, IFNAMSIZ);
                write_sequnlock(&netdev_rename_lock);
        }

        /* Fixup kobjects; uevents are suppressed around the rename */
        dev_set_uevent_suppress(&dev->dev, 1);
        err = device_rename(&dev->dev, dev->name);
        dev_set_uevent_suppress(&dev->dev, 0);
        WARN_ON(err);

        /* Send a netdev-add uevent to the new namespace */
        kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
        netdev_adjacent_add_links(dev);

        /* Adapt owner in case owning user namespace of target network
         * namespace is different from the original one.
         */
        err = netdev_change_owner(dev, net_old, net);
        WARN_ON(err);

        /* Add the device back in the hashes */
        list_netdevice(dev);

        /* Notify protocols, that a new device appeared. */
        call_netdevice_notifiers(NETDEV_REGISTER, dev);

        /*
         *        Prevent userspace races by waiting until the network
         *        device is fully setup before sending notifications.
         */
        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);

        synchronize_net();
        /* Failures past the point of no return are only warned about */
        err = 0;
out:
        return err;
}
EXPORT_SYMBOL_GPL(__dev_change_net_namespace);

/*
 * dev_cpu_dead - take over the networking work of a CPU that went offline
 * @oldcpu: the CPU whose per-cpu softnet_data is drained
 *
 * Registered by net_dev_init() through cpuhp_setup_state_nocalls() for
 * CPUHP_NET_DEV_DEAD. Moves the completion queue, output queue and NAPI
 * poll list of @oldcpu onto the CPU executing this function, then feeds
 * the packets left on @oldcpu's backlog queues back through netif_rx().
 *
 * Always returns 0.
 */
static int dev_cpu_dead(unsigned int oldcpu)
{
        struct sk_buff **list_skb;
        struct sk_buff *skb;
        unsigned int cpu;
        /* remsd stays NULL when CONFIG_RPS is not set */
        struct softnet_data *sd, *oldsd, *remsd = NULL;

        /* The queue splicing below runs with local interrupts disabled */
        local_irq_disable();
        cpu = smp_processor_id();
        sd = &per_cpu(softnet_data, cpu);
        oldsd = &per_cpu(softnet_data, oldcpu);

        /* Find end of our completion_queue. */
        list_skb = &sd->completion_queue;
        while (*list_skb)
                list_skb = &(*list_skb)->next;
        /* Append completion queue from offline CPU. */
        *list_skb = oldsd->completion_queue;
        oldsd->completion_queue = NULL;

        /* Append output queue from offline CPU. */
        if (oldsd->output_queue) {
                *sd->output_queue_tailp = oldsd->output_queue;
                sd->output_queue_tailp = oldsd->output_queue_tailp;
                oldsd->output_queue = NULL;
                oldsd->output_queue_tailp = &oldsd->output_queue;
        }
        /* Append NAPI poll list from offline CPU, with one exception :
         * process_backlog() must be called by cpu owning percpu backlog.
         * We properly handle process_queue & input_pkt_queue later.
         */
        while (!list_empty(&oldsd->poll_list)) {
                struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
                                                            struct napi_struct,
                                                            poll_list);

                list_del_init(&napi->poll_list);
                if (napi->poll == process_backlog)
                        /* Not rescheduled here: clear every state bit
                         * except NAPIF_STATE_THREADED.
                         */
                        napi->state &= NAPIF_STATE_THREADED;
                else
                        ____napi_schedule(sd, napi);
        }

        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_enable();

        if (!use_backlog_threads()) {
#ifdef CONFIG_RPS
                remsd = oldsd->rps_ipi_list;
                oldsd->rps_ipi_list = NULL;
#endif
                /* send out pending IPI's on offline CPU */
                net_rps_send_ipi(remsd);
        }

        /* Re-inject the packets left on the offline CPU's process_queue
         * and input_pkt_queue through netif_rx().
         */
        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
                netif_rx(skb);
                rps_input_queue_head_incr(oldsd);
        }
        while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
                netif_rx(skb);
                rps_input_queue_head_incr(oldsd);
        }

        return 0;
}

/**
 * netdev_increment_features - increment feature set by one
 * @all: current feature set
 * @one: new feature set
 * @mask: mask feature set
 *
 * Computes the feature set that results from adding a device with
 * feature set @one to the master device whose current feature set is
 * @all. Nothing that is off in @mask gets enabled.
 *
 * Return: the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
        netdev_features_t one, netdev_features_t mask)
{
        netdev_features_t additive;

        /* Allowing HW_CSUM allows the whole checksum group */
        if (mask & NETIF_F_HW_CSUM)
                mask |= NETIF_F_CSUM_MASK;
        mask |= NETIF_F_VLAN_CHALLENGED;

        /* One-for-all and checksum bits of @one are added when permitted */
        additive = one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
        all |= additive;

        /* All-for-all bits survive only if @one has them as well */
        all &= one | ~NETIF_F_ALL_FOR_ALL;

        if (!(all & NETIF_F_HW_CSUM))
                return all;

        /* With HW_CSUM set, drop the other NETIF_F_CSUM_MASK bits */
        return all & ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
}
EXPORT_SYMBOL(netdev_increment_features);

/* Allocate a table of NETDEV_HASHENTRIES hash list heads and initialize
 * each of them. Returns the table, or NULL if the allocation failed.
 */
static struct hlist_head * __net_init netdev_create_hash(void)
{
        struct hlist_head *table, *bucket;

        table = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*table), GFP_KERNEL);
        if (!table)
                return NULL;

        for (bucket = table; bucket < table + NETDEV_HASHENTRIES; bucket++)
                INIT_HLIST_HEAD(bucket);

        return table;
}

/* Set up the per network namespace device state: the device list, the
 * name and index hash tables, the by-index xarray and the notifier chain.
 * Returns 0 on success or -ENOMEM if a hash table cannot be allocated.
 */
static int __net_init netdev_init(struct net *net)
{
        BUILD_BUG_ON(GRO_HASH_BUCKETS >
                     8 * sizeof_field(struct napi_struct, gro_bitmask));

        INIT_LIST_HEAD(&net->dev_base_head);

        net->dev_name_head = netdev_create_hash();
        if (!net->dev_name_head)
                return -ENOMEM;

        net->dev_index_head = netdev_create_hash();
        if (!net->dev_index_head) {
                /* undo the name table allocated just above */
                kfree(net->dev_name_head);
                return -ENOMEM;
        }

        xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
        RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);

        return 0;
}

/**
 *        netdev_drivername - network driver for the device
 *        @dev: network device
 *
 *        Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
        const struct device_driver *driver;
        const struct device *parent;
        const char *empty = "";

        parent = dev->dev.parent;
        if (!parent)
                return empty;

        driver = parent->driver;
        if (driver && driver->name)
                return driver->name;
        return empty;
}

/* Common backend of the netdev_*() print helpers.
 *
 * A device with a parent is logged through dev_printk_emit() on that
 * parent, prefixed with the parent's driver string and name. Without a
 * parent, or without a device at all, plain printk() is used.
 */
static void __netdev_printk(const char *level, const struct net_device *dev,
                            struct va_format *vaf)
{
        const struct device *parent;

        if (!dev) {
                printk("%s(NULL net_device): %pV", level, vaf);
                return;
        }

        parent = dev->dev.parent;
        if (!parent) {
                printk("%s%s%s: %pV",
                       level, netdev_name(dev), netdev_reg_state(dev), vaf);
                return;
        }

        /* level[1] is the digit of the log level string */
        dev_printk_emit(level[1] - '0',
                        parent,
                        "%s %s %s%s: %pV",
                        dev_driver_string(parent),
                        dev_name(parent),
                        netdev_name(dev), netdev_reg_state(dev),
                        vaf);
}

void netdev_printk(const char *level, const struct net_device *dev,
                   const char *format, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, format);

        vaf.fmt = format;
        vaf.va = &args;

        __netdev_printk(level, dev, &vaf);

        va_end(args);
}
EXPORT_SYMBOL(netdev_printk);

/* Generate and export a print helper named @func that logs at the fixed
 * log level @level: it takes a device and a printf-style format, wraps
 * the variadic arguments in a struct va_format and passes them to
 * __netdev_printk(). One helper is instantiated below for each level
 * from KERN_EMERG to KERN_INFO.
 */
#define define_netdev_printk_level(func, level)                        \
void func(const struct net_device *dev, const char *fmt, ...)        \
{                                                                \
        struct va_format vaf;                                        \
        va_list args;                                                \
                                                                \
        va_start(args, fmt);                                        \
                                                                \
        vaf.fmt = fmt;                                                \
        vaf.va = &args;                                                \
                                                                \
        __netdev_printk(level, dev, &vaf);                        \
                                                                \
        va_end(args);                                                \
}                                                                \
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);

/* Release the per network namespace device state set up by netdev_init().
 * For every namespace other than init_net, warn once if its device list
 * is not empty at this point.
 */
static void __net_exit netdev_exit(struct net *net)
{
        kfree(net->dev_name_head);
        kfree(net->dev_index_head);
        xa_destroy(&net->dev_by_index);

        if (net == &init_net)
                return;

        WARN_ON_ONCE(!list_empty(&net->dev_base_head));
}

/* Per network namespace operations for the core device state:
 * netdev_init() is the .init hook and netdev_exit() the .exit hook.
 * Registered from net_dev_init() with register_pernet_subsys().
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
        .init = netdev_init,
        .exit = netdev_exit,
};

/* Push all migratable network devices of @net back to the initial
 * network namespace. Must be called with the rtnl lock held. A device
 * that cannot be moved is fatal (BUG()).
 */
static void __net_exit default_device_exit_net(struct net *net)
{
        struct netdev_name_node *alt, *alt_next;
        struct net_device *dev, *dev_next;

        ASSERT_RTNL();

        for_each_netdev_safe(net, dev, dev_next) {
                char fb_name[IFNAMSIZ];
                int err;

                /* Ignore unmoveable devices (i.e. loopback) */
                if (dev->features & NETIF_F_NETNS_LOCAL)
                        continue;

                /* Leave virtual devices for the generic cleanup */
                if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
                        continue;

                /* Fallback name: "dev<ifindex>", or the "dev%d" pattern
                 * when that name is already taken in init_net.
                 */
                snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
                if (netdev_name_in_use(&init_net, fb_name))
                        snprintf(fb_name, IFNAMSIZ, "dev%%d");

                /* Destroy the altnames that are already in use in init_net */
                netdev_for_each_altname_safe(dev, alt, alt_next) {
                        if (netdev_name_in_use(&init_net, alt->name))
                                __netdev_name_node_alt_destroy(alt);
                }

                /* Push remaining network devices to init_net */
                err = dev_change_net_namespace(dev, &init_net, fb_name);
                if (!err)
                        continue;

                pr_emerg("%s: failed to move %s to init_net: %d\n",
                         __func__, dev->name, err);
                BUG();
        }
}

/* At exit all network devices must be removed from a network
 * namespace.  Do this in the reverse order of registration.
 * Do this across as many network namespaces as possible to
 * improve batching efficiency.
 *
 * Migratable devices are first pushed back to init_net; whatever is left
 * is collected on one kill list and unregistered in a single call. The
 * rtnl lock is held for the whole operation.
 */
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
        LIST_HEAD(dev_kill_list);
        struct net_device *dev;
        struct net *net;

        rtnl_lock();

        list_for_each_entry(net, net_list, exit_list) {
                default_device_exit_net(net);
                cond_resched();
        }

        list_for_each_entry(net, net_list, exit_list) {
                for_each_netdev_reverse(net, dev) {
                        const struct rtnl_link_ops *ops = dev->rtnl_link_ops;

                        /* No dellink handler: queue the device directly */
                        if (!ops || !ops->dellink) {
                                unregister_netdevice_queue(dev, &dev_kill_list);
                                continue;
                        }
                        ops->dellink(dev, &dev_kill_list);
                }
        }

        unregister_netdevice_many(&dev_kill_list);
        rtnl_unlock();
}

/* Per network namespace operations that only provide a batched exit hook,
 * default_device_exit_batch(). Registered from net_dev_init() with
 * register_pernet_device(), after loopback_net_ops.
 */
static struct pernet_operations __net_initdata default_device_ops = {
        .exit_batch = default_device_exit_batch,
};

/* Layout checks for the hot-path field groups of struct net_device,
 * called from net_dev_init(). For each of the three groups
 * (net_device_read_tx, net_device_read_txrx, net_device_read_rx) the
 * listed fields are asserted to be members of the group and the group is
 * asserted against an expected size.
 * NOTE(review): the CACHELINE_ASSERT_* macros are defined elsewhere;
 * presumably they are build-time assertions and the sizes are in bytes -
 * confirm against their definition.
 */
static void __init net_dev_struct_check(void)
{
        /* TX read-mostly hotpath */
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
#ifdef CONFIG_XPS
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
#endif
#ifdef CONFIG_NETFILTER_EGRESS
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
#endif
#ifdef CONFIG_NET_XGRESS
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
#endif
        CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);

        /* TXRX read-mostly hotpath */
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
        CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);

        /* RX read-mostly hotpath */
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
#ifdef CONFIG_NETPOLL
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
#endif
#ifdef CONFIG_NET_XGRESS
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
#endif
        CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
}

/*
 *        Initialize the DEV module. At boot time this walks the device list and
 *        unhooks any devices that fail to initialise (normally hardware not
 *        present) and leaves us with a valid list of present and active devices.
 *
 */

/* Pool size of the per-CPU system page pool: 1 MiB worth of pages,
 * i.e. 256 pages for each CPU if PAGE_SHIFT is 12.
 */
#define SYSTEM_PERCPU_PAGE_POOL_SIZE        ((1 << 20) / PAGE_SIZE)

/* Create the system page pool of CPU @cpuid and publish it in the
 * per-cpu system_page_pool variable. Does nothing when CONFIG_PAGE_POOL
 * is not enabled.
 *
 * Returns 0 on success, -ENOMEM if the pool could not be created.
 */
static int net_page_pool_create(int cpuid)
{
#if IS_ENABLED(CONFIG_PAGE_POOL)
        struct page_pool_params params = {
                .nid = cpu_to_mem(cpuid),
                .flags = PP_FLAG_SYSTEM_POOL,
                .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
        };
        struct page_pool *pool;

        pool = page_pool_create_percpu(&params, cpuid);
        if (IS_ERR(pool))
                return -ENOMEM;

        per_cpu(system_page_pool, cpuid) = pool;
#endif
        return 0;
}

/* .thread_should_run hook of backlog_threads: report whether
 * NAPI_STATE_SCHED_THREADED is set on the backlog NAPI of @cpu.
 */
static int backlog_napi_should_run(unsigned int cpu)
{
        struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);

        return test_bit(NAPI_STATE_SCHED_THREADED, &sd->backlog.state);
}

/* .thread_fn hook of backlog_threads: run the threaded poll loop on the
 * backlog NAPI of @cpu.
 */
static void run_backlog_napi(unsigned int cpu)
{
        struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
        struct napi_struct *backlog = &sd->backlog;

        napi_threaded_poll_loop(backlog);
}

/* .setup hook of backlog_threads: record the per-cpu backlog_napi value
 * as the thread of @cpu's backlog NAPI and mark that NAPI as threaded.
 */
static void backlog_napi_setup(unsigned int cpu)
{
        struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);

        sd->backlog.thread = this_cpu_read(backlog_napi);
        set_bit(NAPI_STATE_THREADED, &sd->backlog.state);
}

/* Descriptor of the per-CPU "backlog_napi/%u" threads. net_dev_init()
 * registers it with smpboot_register_percpu_thread() only when
 * use_backlog_threads() is true. The thread pointer is stored in the
 * per-cpu backlog_napi variable.
 */
static struct smp_hotplug_thread backlog_threads = {
        .store                        = &backlog_napi,
        .thread_should_run        = backlog_napi_should_run,
        .thread_fn                = run_backlog_napi,
        .thread_comm                = "backlog_napi/%u",
        .setup                        = backlog_napi_setup,
};

/*
 *       Boot-time initialisation of the core networking device layer.
 *
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 *
 *       Returns 0 on success.  Every failure path jumps to "out" with
 *       rc still at its initial -ENOMEM, so that is the only error value
 *       reported, whatever the failing callee returned.  The "out" path
 *       only tears down the per-CPU system page pools; the earlier
 *       registrations are not unwound here.
 */
static int __init net_dev_init(void)
{
        int i, rc = -ENOMEM;

        /* Must run while dev_boot_phase is still set; it is cleared below. */
        BUG_ON(!dev_boot_phase);

        net_dev_struct_check();

        if (dev_proc_init())
                goto out;

        if (netdev_kobject_init())
                goto out;

        /* Start with every packet-type hash bucket empty. */
        for (i = 0; i < PTYPE_HASH_SIZE; i++)
                INIT_LIST_HEAD(&ptype_base[i]);

        if (register_pernet_subsys(&netdev_net_ops))
                goto out;

        /*
         *        Initialise the packet receive queues.
         */

        for_each_possible_cpu(i) {
                struct work_struct *flush = per_cpu_ptr(&flush_works, i);
                struct softnet_data *sd = &per_cpu(softnet_data, i);

                INIT_WORK(flush, flush_backlog);

                skb_queue_head_init(&sd->input_pkt_queue);
                skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
                skb_queue_head_init(&sd->xfrm_backlog);
#endif
                INIT_LIST_HEAD(&sd->poll_list);
                /* Empty output queue: the tail pointer refers to the head slot. */
                sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
                INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
                sd->cpu = i;
#endif
                INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
                spin_lock_init(&sd->defer_lock);

                /* Set up this CPU's backlog NAPI instance. */
                init_gro_hash(&sd->backlog);
                sd->backlog.poll = process_backlog;
                sd->backlog.weight = weight_p;
                INIT_LIST_HEAD(&sd->backlog.poll_list);

                /*
                 * NOTE(review): presumably this populates
                 * per_cpu(system_page_pool, i), which the "out" path
                 * destroys on failure -- confirm in net_page_pool_create().
                 */
                if (net_page_pool_create(i))
                        goto out;
        }
        /* The return value of the thread registration is not checked. */
        if (use_backlog_threads())
                smpboot_register_percpu_thread(&backlog_threads);

        dev_boot_phase = 0;

        /* The loopback device is special: if any other network device
         * is present in a network namespace, the loopback device must
         * be present too. Since we now dynamically allocate and free the
         * loopback device, ensure this invariant is maintained by
         * keeping the loopback device as the first device on the
         * list of network devices.  This ensures the loopback device
         * is the first device that appears and the last network device
         * that disappears.
         */
        if (register_pernet_device(&loopback_net_ops))
                goto out;

        if (register_pernet_device(&default_device_ops))
                goto out;

        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
        open_softirq(NET_RX_SOFTIRQ, net_rx_action);

        /*
         * A failure to install the CPU-dead callback only triggers a
         * warning; rc is then forced to 0 and init is reported as success.
         */
        rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
                                       NULL, dev_cpu_dead);
        WARN_ON(rc < 0);
        rc = 0;

        /* avoid static key IPIs to isolated CPUs */
        if (housekeeping_enabled(HK_TYPE_MISC))
                net_enable_timestamp();
out:
        /* On success rc is 0 here and the cleanup below is skipped. */
        if (rc < 0) {
                for_each_possible_cpu(i) {
                        struct page_pool *pp_ptr;

                        pp_ptr = per_cpu(system_page_pool, i);
                        if (!pp_ptr)
                                continue;

                        page_pool_destroy(pp_ptr);
                        per_cpu(system_page_pool, i) = NULL;
                }
        }

        return rc;
}

subsys_initcall(net_dev_init);










































































































































    2 











































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __IPC_NAMESPACE_H__
#define __IPC_NAMESPACE_H__

#include <linux/err.h>
#include <linux/idr.h>
#include <linux/rwsem.h>
#include <linux/notifier.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/refcount.h>
#include <linux/rhashtable-types.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>

struct user_namespace;

/*
 * Bookkeeping for one set of IPC identifiers.  struct ipc_namespace
 * embeds an array of three of these (ids[3]).
 *
 * NOTE(review): the per-field notes below are inferred from the field
 * names and types only -- confirm against the users in ipc/.
 */
struct ipc_ids {
        int in_use;                /* presumably the count of ids in use */
        unsigned short seq;        /* presumably a sequence counter for new ids */
        struct rw_semaphore rwsem;
        struct idr ipcs_idr;
        int max_idx;
        int last_idx;        /* For wrap around detection */
#ifdef CONFIG_CHECKPOINT_RESTORE
        /* Only present when checkpoint/restore support is configured. */
        int next_id;
#endif
        struct rhashtable key_ht;
};

/*
 * State of one IPC namespace: the identifier sets, the sem/msg/shm
 * settings and counters, the POSIX message queue settings, the sysctl
 * tables and the owning user namespace.
 *
 * The reference count is ns.count; see get_ipc_ns() and
 * get_ipc_ns_not_zero().  The layout is randomized (__randomize_layout),
 * so nothing may depend on field order.
 */
struct ipc_namespace {
        /* NOTE(review): presumably one ipc_ids per SysV IPC type -- confirm */
        struct ipc_ids        ids[3];

        int                sem_ctls[4];
        int                used_sems;

        unsigned int        msg_ctlmax;
        unsigned int        msg_ctlmnb;
        unsigned int        msg_ctlmni;
        struct percpu_counter percpu_msg_bytes;
        struct percpu_counter percpu_msg_hdrs;

        size_t                shm_ctlmax;
        size_t                shm_ctlall;
        unsigned long        shm_tot;
        int                shm_ctlmni;
        /*
         * Defines whether IPC_RMID is forced for _all_ shm segments regardless
         * of shmctl()
         */
        int                shm_rmid_forced;

        struct notifier_block ipcns_nb;

        /* The kern_mount of the mqueuefs sb.  We take a ref on it */
        struct vfsmount        *mq_mnt;

        /* # queues in this ns, protected by mq_lock */
        unsigned int    mq_queues_count;

        /* next fields are set through sysctl */
        unsigned int    mq_queues_max;   /* initialized to DFLT_QUEUESMAX */
        unsigned int    mq_msg_max;      /* initialized to DFLT_MSGMAX */
        unsigned int    mq_msgsize_max;  /* initialized to DFLT_MSGSIZEMAX */
        unsigned int    mq_msg_default;
        unsigned int    mq_msgsize_default;

        /* sysctl table set and registration header for the mqueue sysctls */
        struct ctl_table_set        mq_set;
        struct ctl_table_header        *mq_sysctls;

        /* sysctl table set and registration header for the ipc sysctls */
        struct ctl_table_set        ipc_set;
        struct ctl_table_header        *ipc_sysctls;

        /* user_ns which owns the ipc ns */
        struct user_namespace *user_ns;
        struct ucounts *ucounts;

        struct llist_node mnt_llist;

        /* Common namespace part; ns.count is the reference count. */
        struct ns_common ns;
} __randomize_layout;

extern struct ipc_namespace init_ipc_ns;
extern spinlock_t mq_lock;

#ifdef CONFIG_SYSVIPC
extern void shm_destroy_orphaned(struct ipc_namespace *ns);
#else /* CONFIG_SYSVIPC */
/* !CONFIG_SYSVIPC stub: does nothing. */
static inline void shm_destroy_orphaned(struct ipc_namespace *ns)
{
}
#endif /* CONFIG_SYSVIPC */

#ifdef CONFIG_POSIX_MQUEUE
extern int mq_init_ns(struct ipc_namespace *ns);
/*
 * POSIX Message Queue default values:
 *
 * MIN_*: Lowest value an admin can set the maximum unprivileged limit to
 * DFLT_*MAX: Default values for the maximum unprivileged limits
 * DFLT_{MSG,MSGSIZE}: Default values used when the user doesn't supply
 *   an attribute to the open call and the queue must be created
 * HARD_*: Highest value the maximums can be set to.  These are enforced
 *   on CAP_SYS_RESOURCE apps as well making them inviolate (so make them
 *   suitably high)
 *
 * POSIX Requirements:
 *   Per app minimum openable message queues - 8.  This does not map well
 *     to the fact that we limit the number of queues on a per namespace
 *     basis instead of a per app basis.  So, make the default high enough
 *     that no given app should have a hard time opening 8 queues.
 *   Minimum maximum for HARD_MSGMAX - 32767.  I bumped this to 65536.
 *   Minimum maximum for HARD_MSGSIZEMAX - POSIX is silent on this.  However,
 *     we have run into a situation where running applications in the wild
 *     require this to be at least 5MB, and preferably 10MB, so I set the
 *     value to 16MB in hopes that this user is the worst of the bunch and
 *     the new maximum will handle anyone else.  I may have to revisit this
 *     in the future.
 */
/* Number of queues (limited per namespace): default maximum. */
#define DFLT_QUEUESMAX                      256
/*
 * Messages per queue: lowest admin-settable maximum, default used when
 * no attribute is passed to open, default maximum, and hard ceiling.
 */
#define MIN_MSGMAX                        1
#define DFLT_MSG                       10U
#define DFLT_MSGMAX                       10
#define HARD_MSGMAX                    65536
/*
 * Message size in bytes: lowest admin-settable maximum, default used when
 * no attribute is passed to open, default maximum, and hard ceiling (16MB).
 */
#define MIN_MSGSIZEMAX                      128
#define DFLT_MSGSIZE                     8192U
#define DFLT_MSGSIZEMAX                     8192
#define HARD_MSGSIZEMAX            (16*1024*1024)
#else
/* !CONFIG_POSIX_MQUEUE stub: nothing to initialise, always returns 0. */
static inline int mq_init_ns(struct ipc_namespace *ns)
{
        return 0;
}
#endif

#if defined(CONFIG_IPC_NS)
extern struct ipc_namespace *copy_ipcs(unsigned long flags,
        struct user_namespace *user_ns, struct ipc_namespace *ns);

/*
 * Take a reference on @ns by incrementing ns->ns.count.  A NULL @ns is
 * tolerated and handed straight back.  Returns @ns.
 */
static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
{
        if (!ns)
                return NULL;

        refcount_inc(&ns->ns.count);
        return ns;
}

/*
 * Try to take a reference on @ns with refcount_inc_not_zero().
 *
 * Returns @ns when the reference was obtained; returns NULL when @ns is
 * NULL or when refcount_inc_not_zero() reports failure.
 */
static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
{
        if (ns && refcount_inc_not_zero(&ns->ns.count))
                return ns;

        return NULL;
}

extern void put_ipc_ns(struct ipc_namespace *ns);
#else
/*
 * !CONFIG_IPC_NS stub: a new IPC namespace cannot be created, so a
 * request carrying CLONE_NEWIPC yields ERR_PTR(-EINVAL); otherwise the
 * existing @ns is returned unchanged.  @user_ns is unused.
 */
static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
        struct user_namespace *user_ns, struct ipc_namespace *ns)
{
        if (!(flags & CLONE_NEWIPC))
                return ns;

        return ERR_PTR(-EINVAL);
}

/* !CONFIG_IPC_NS stub: no reference is taken; @ns is returned as is. */
static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
{
        return ns;
}

/*
 * !CONFIG_IPC_NS stub: no reference is taken; @ns is returned as is
 * (so the result is NULL only when @ns itself is NULL).
 */
static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
{
        return ns;
}

/* !CONFIG_IPC_NS stub: the matching get takes no reference, so this is a no-op. */
static inline void put_ipc_ns(struct ipc_namespace *ns)
{
}
#endif

#ifdef CONFIG_POSIX_MQUEUE_SYSCTL

void retire_mq_sysctls(struct ipc_namespace *ns);
bool setup_mq_sysctls(struct ipc_namespace *ns);

#else /* CONFIG_POSIX_MQUEUE_SYSCTL */

/* !CONFIG_POSIX_MQUEUE_SYSCTL stub: does nothing. */
static inline void retire_mq_sysctls(struct ipc_namespace *ns)
{
}

/*
 * !CONFIG_POSIX_MQUEUE_SYSCTL stub: does nothing and always returns true.
 * NOTE(review): callers presumably treat true as success -- confirm.
 */
static inline bool setup_mq_sysctls(struct ipc_namespace *ns)
{
        return true;
}

#endif /* CONFIG_POSIX_MQUEUE_SYSCTL */

#ifdef CONFIG_SYSVIPC_SYSCTL

bool setup_ipc_sysctls(struct ipc_namespace *ns);
void retire_ipc_sysctls(struct ipc_namespace *ns);

#else /* CONFIG_SYSVIPC_SYSCTL */

/* !CONFIG_SYSVIPC_SYSCTL stub: does nothing. */
static inline void retire_ipc_sysctls(struct ipc_namespace *ns)
{
}

/*
 * !CONFIG_SYSVIPC_SYSCTL stub: does nothing and always returns true.
 * NOTE(review): callers presumably treat true as success -- confirm.
 */
static inline bool setup_ipc_sysctls(struct ipc_namespace *ns)
{
        return true;
}

#endif /* CONFIG_SYSVIPC_SYSCTL */
#endif

















































































































































































































































































































































































































































































































































































































































































    2 


    2 





    2 


    2 



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 






    1 


    1 
    1 

    1 
    1 









    1 
















    1 


    1 



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 




    2 



























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Implementation of the Transmission Control Protocol(TCP).
 *
 *                IPv4 specific functions
 *
 *                code split from:
 *                linux/ipv4/tcp.c
 *                linux/ipv4/tcp_input.c
 *                linux/ipv4/tcp_output.c
 *
 *                See tcp.c for author information
 */

/*
 * Changes:
 *                David S. Miller        :        New socket lookup architecture.
 *                                        This code is dedicated to John Dyson.
 *                David S. Miller :        Change semantics of established hash,
 *                                        half is devoted to TIME_WAIT sockets
 *                                        and the rest go in the other half.
 *                Andi Kleen :                Add support for syncookies and fixed
 *                                        some bugs: ip options weren't passed to
 *                                        the TCP layer, missed a check for an
 *                                        ACK bit.
 *                Andi Kleen :                Implemented fast path mtu discovery.
 *                                             Fixed many serious bugs in the
 *                                        request_sock handling and moved
 *                                        most of it into the af independent code.
 *                                        Added tail drop and some other bugfixes.
 *                                        Added new listen semantics.
 *                Mike McLagan        :        Routing by source
 *        Juan Jose Ciarlante:                ip_dynaddr bits
 *                Andi Kleen:                various fixes.
 *        Vitaly E. Lavrov        :        Transparent proxy revived after year
 *                                        coma.
 *        Andi Kleen                :        Fix new listen.
 *        Andi Kleen                :        Fix accept error reporting.
 *        YOSHIFUJI Hideaki @USAGI and:        Support IPV6_V6ONLY socket option, which
 *        Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
 *                                        a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
/* Forward declaration: tcp_v4_send_reset() below uses this to write the MD5
 * signature of a reply header into its option area before the definition
 * of the function has been seen.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

/* Global struct inet_hashinfo instance for TCP, exported to modules. */
struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

/* Per-CPU control socket slot.  tcp_v4_send_reset() takes .bh_lock with
 * local_lock_nested_bh() and reads .sock to obtain the socket it sends its
 * reply from.
 */
static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
        .bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
        return secure_tcp_seq(ip_hdr(skb)->daddr,
                              ip_hdr(skb)->saddr,
                              tcp_hdr(skb)->dest,
                              tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
        return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

/* Decide whether the TIME-WAIT socket @sktw may be recycled for the
 * connection being set up on @sk.
 *
 * @twp: when NULL, a recorded tw_ts_recent_stamp is sufficient; when
 *       non-NULL, the tcp_tw_reuse sysctl must additionally permit reuse and
 *       the current time (in seconds) must be after tw_ts_recent_stamp.
 *
 * Returns 1 if @sktw can be reused.  In that case a reference on @sktw has
 * been taken and, unless @sk is in repair mode, @sk's write_seq and
 * timestamp state have been seeded from the TIME-WAIT socket.
 * Returns 0 otherwise.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
        int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
        const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
        const struct inet_timewait_sock *tw = inet_twsk(sktw);
        struct tcp_sock *tp = tcp_sk(sk);
        int ts_recent_stamp;

        if (reuse == 2) {
                /* Mode 2 only allows reuse for loopback traffic.  This
                 * still does not catch *everything* going through lo: we
                 * need a loopback source or destination address, or a
                 * socket bound directly to the 'lo' interface.
                 */
                bool loopback = tw->tw_bound_dev_if == LOOPBACK_IFINDEX;

#if IS_ENABLED(CONFIG_IPV6)
                if (tw->tw_family == AF_INET6)
                        loopback = loopback ||
                                   ipv6_addr_loopback(&tw->tw_v6_daddr) ||
                                   ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
                                   ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
                                   ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr);
                else
#endif
                        loopback = loopback ||
                                   ipv4_is_loopback(tw->tw_daddr) ||
                                   ipv4_is_loopback(tw->tw_rcv_saddr);

                if (!loopback)
                        reuse = 0;
        }

        /* With PAWS, reuse is safe from the viewpoint of data integrity.
         * Even without PAWS it is safe provided sequence spaces do not
         * overlap, i.e. at data rates <= 80Mbit/sec.
         *
         * The idea is close to VJ's one, only the timestamp cache is held
         * per port pair rather than per host, and the TW bucket is used as
         * the state holder.
         *
         * If the TW bucket has already been destroyed we fall back to VJ's
         * scheme and use the initial timestamp retrieved from the peer
         * table.
         */
        ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
        if (!ts_recent_stamp)
                return 0;

        if (twp && !(reuse && time_after32(ktime_get_seconds(),
                                           ts_recent_stamp)))
                return 0;

        /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
         * and releasing the bucket lock, so the count may still be zero.
         */
        if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
                return 0;

        /* With TCP repair, reuse must still be safe as checked above, but the
         * sequence numbers and timestamps set by the repair process have to
         * be honoured.
         *
         * Without this check, re-using a TIME-WAIT socket under repair would
         * accumulate a -1 on the repair-assigned sequence number: -1 on the
         * first reuse, -2 on the second, and so on.
         */
        if (likely(!tp->repair)) {
                u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

                /* A write_seq of zero is never stored; use 1 instead. */
                WRITE_ONCE(tp->write_seq, seq ? seq : 1);
                tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
                tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
        }

        return 1;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* Validate the caller-supplied address length and run the cgroup BPF INET4
 * connect program on @uaddr.
 *
 * Returns -EINVAL if @addr_len is shorter than a struct sockaddr_in,
 * otherwise the result of BPF_CGROUP_RUN_PROG_INET4_CONNECT().
 */
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
                              int addr_len)
{
        int ret;

        /* Same length check as in tcp_v4_connect(): it keeps the BPF program
         * run below from touching bytes beyond the bound the user gave in
         * addr_len.
         */
        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        sock_owned_by_me(sk);

        ret = BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
        return ret;
}

/* Initiate an outgoing IPv4 TCP connection on @sk.
 *
 * @sk:       socket to connect
 * @uaddr:    destination address, interpreted as a struct sockaddr_in
 * @addr_len: length of @uaddr as supplied by the caller
 *
 * Looks up a route to the destination (or to the first hop when a source
 * route option is set), settles the source address, moves the socket to
 * SYN-SENT, hashes it with inet_hash_connect(), chooses the initial sequence
 * number and timestamp offset (skipped in repair mode) and finally calls
 * tcp_connect().
 *
 * inet_opt is read under lockdep_sock_is_held(sk), so the socket lock is
 * expected to be held by the caller.
 *
 * Returns 0 on success, or the value stored in err by
 * tcp_fastopen_defer_connect() when that helper asks for the connect to be
 * deferred.  Errors: -EINVAL (short address, or zero destination with source
 * routing), -EAFNOSUPPORT (family is not AF_INET), -ENETUNREACH (route is
 * multicast/broadcast), or whatever the failing helper returned.  Errors
 * taken through the "failure" label put the socket back into TCP_CLOSE.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct inet_timewait_death_row *tcp_death_row;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct ip_options_rcu *inet_opt;
        struct net *net = sock_net(sk);
        __be16 orig_sport, orig_dport;
        __be32 daddr, nexthop;
        struct flowi4 *fl4;
        struct rtable *rt;
        int err;

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;
        inet_opt = rcu_dereference_protected(inet->inet_opt,
                                             lockdep_sock_is_held(sk));
        /* With a source route option, route towards the option's first hop
         * (faddr) instead of the final destination.
         */
        if (inet_opt && inet_opt->opt.srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet_opt->opt.faddr;
        }

        orig_sport = inet->inet_sport;
        orig_dport = usin->sin_port;
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                              sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
                              orig_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                if (err == -ENETUNREACH)
                        IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
                return err;
        }

        /* TCP cannot connect to a multicast or broadcast destination. */
        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);
                return -ENETUNREACH;
        }

        /* Without source routing, adopt the destination the route lookup
         * left in the flow.
         */
        if (!inet_opt || !inet_opt->opt.srr)
                daddr = fl4->daddr;

        tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

        /* No source address yet: take the one chosen by routing. */
        if (!inet->inet_saddr) {
                err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
                if (err) {
                        ip_rt_put(rt);
                        return err;
                }
        } else {
                sk_rcv_saddr_set(sk, inet->inet_saddr);
        }

        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent           = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                if (likely(!tp->repair))
                        WRITE_ONCE(tp->write_seq, 0);
        }

        inet->inet_dport = usin->sin_port;
        sk_daddr_set(sk, daddr);

        /* Account for the length of any IP options in the header length. */
        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and not releasing socket
         * lock select source port, enter ourselves into the hash tables and
         * complete initialization after this.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(tcp_death_row, sk);
        if (err)
                goto failure;

        sk_set_txhash(sk);

        /* Redo the route lookup with the ports that are now in use. */
        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                               inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt)) {
                err = PTR_ERR(rt);
                /* Clear rt so the failure path does not pass an error
                 * pointer to ip_rt_put().
                 */
                rt = NULL;
                goto failure;
        }
        tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->dst);
        /* Clear rt so the failure path no longer puts this route
         * (presumably sk_setup_caps() has taken over the reference).
         */
        rt = NULL;

        if (likely(!tp->repair)) {
                /* Keep a write_seq that is already non-zero, e.g. one seeded
                 * by tcp_twsk_unique().
                 */
                if (!tp->write_seq)
                        WRITE_ONCE(tp->write_seq,
                                   secure_tcp_seq(inet->inet_saddr,
                                                  inet->inet_daddr,
                                                  inet->inet_sport,
                                                  usin->sin_port));
                WRITE_ONCE(tp->tsoffset,
                           secure_tcp_ts_off(net, inet->inet_saddr,
                                             inet->inet_daddr));
        }

        atomic_set(&inet->inet_id, get_random_u16());

        /* When the helper returns true the connect is deferred and err, as
         * set by the helper, is returned without going through "failure".
         */
        if (tcp_fastopen_defer_connect(sk, &err))
                return err;
        if (err)
                goto failure;

        err = tcp_connect(sk);

        if (err)
                goto failure;

        return 0;

failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        inet_bhash2_reset_saddr(sk);
        /* rt may be NULL here (see the assignments above). */
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * React to an ICMP_FRAG_NEEDED MTU indication (RFC 1191), using the value
 * previously stored in tcp_sk(sk)->mtu_info.  tcp_v4_err() calls this
 * directly when the socket is not owned by the user; otherwise it can be
 * called through tcp_release_cb() once the socket is released.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct dst_entry *dst;
        u32 reported_mtu;
        u32 path_mtu;

        /* Nothing to do for listening or closed sockets. */
        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
                return;

        reported_mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
        dst = inet_csk_update_pmtu(sk, reported_mtu);
        if (!dst)
                return;

        /* Something is about to go wrong: remember a soft error in case
         * this connection turns out to be unable to recover.
         */
        if (reported_mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
                WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

        path_mtu = dst_mtu(dst);

        /* In each of the three cases below, leave it to the usual
         * retransmit timer.
         */
        if (inet->pmtudisc == IP_PMTUDISC_DONT)
                return;
        if (!ip_sk_accept_pmtu(sk))
                return;
        if (inet_csk(sk)->icsk_pmtu_cookie <= path_mtu)
                return;

        tcp_sync_mss(sk, path_mtu);

        /* Resend the TCP packet, since the old one has clearly been
         * dropped.  This is the "fast" path MTU discovery.
         */
        tcp_simple_retransmit(sk);
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);

/* Pass an ICMP redirect carried in @skb to the redirect handler of the
 * socket's cached route, if __sk_dst_check() still returns one.
 */
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
        struct dst_entry *route = __sk_dst_check(sk, 0);

        if (!route)
                return;

        route->ops->redirect(route, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
        struct request_sock *req = inet_reqsk(sk);
        struct net *net = sock_net(sk);

        /* ICMPs are not backlogged, hence we cannot get
         * an established socket here.
         */
        if (seq != tcp_rsk(req)->snt_isn) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
        } else if (abort) {
                /*
                 * Still in SYN_RECV, just remove it silently.
                 * There is no good way to pass the error to the newly
                 * created socket, and POSIX does not want network
                 * errors returned from accept().
                 */
                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
                tcp_listendrop(req->rsk_listener);
        }
        reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);

/* TCP-LD (RFC 6069) logic: undo one step of retransmission timer backoff.
 *
 * Does nothing when the socket is owned by the user, when @seq is not
 * snd_una, or when no retransmit/backoff is in progress.  Otherwise the
 * backoff counter is decremented, the RTO is recomputed, and the retransmit
 * timer is either re-armed with the time still remaining or, if that time
 * has already passed, fired immediately.
 */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *head;
        s32 remaining;
        u32 delta_us;

        if (sock_owned_by_user(sk))
                return;

        if (seq != tp->snd_una)
                return;
        if (!icsk->icsk_retransmits || !icsk->icsk_backoff)
                return;

        head = tcp_rtx_queue_head(sk);
        if (WARN_ON_ONCE(!head))
                return;

        icsk->icsk_backoff--;
        if (tp->srtt_us)
                icsk->icsk_rto = __tcp_set_rto(tp);
        else
                icsk->icsk_rto = TCP_TIMEOUT_INIT;
        icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

        /* Time already spent since the head of the rtx queue was stamped. */
        tcp_mstamp_refresh(tp);
        delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(head));
        remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

        if (remaining <= 0) {
                /* RTO revert clocked out retransmission.
                 * Will retransmit now.
                 */
                tcp_retransmit_timer(sk);
                return;
        }

        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                  remaining, TCP_RTO_MAX);
}
EXPORT_SYMBOL(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 * @skb:  ICMP error; skb->data is read as the IP header of the offending
 *        packet, followed by the start of its TCP header.
 * @info: passed through to tp->mtu_info (for ICMP_FRAG_NEEDED) and to
 *        ip_icmp_error().
 *
 * Returns -ENOENT when no socket matches the quoted packet, 0 otherwise.
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
        struct tcp_sock *tp;
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct sock *sk;
        struct request_sock *fastopen;
        u32 seq, snd_una;
        int err;
        struct net *net = dev_net(skb->dev);

        /* The quoted header is from a packet we sent, so its destination
         * address/port name the remote end and its source the local end.
         */
        sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
                                       iph->daddr, th->dest, iph->saddr,
                                       ntohs(th->source), inet_iif(skb), 0);
        if (!sk) {
                __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
                return -ENOENT;
        }
        /* Every exit below drops the looked-up socket: inet_twsk_put() for
         * TIME-WAIT, reqsk_put() inside tcp_req_err() for request sockets,
         * sock_put() otherwise.
         */
        if (sk->sk_state == TCP_TIME_WAIT) {
                /* To increase the counter of ignored icmps for TCP-AO */
                tcp_ao_ignore_icmp(sk, AF_INET, type, code);
                inet_twsk_put(inet_twsk(sk));
                return 0;
        }
        seq = ntohl(th->seq);
        if (sk->sk_state == TCP_NEW_SYN_RECV) {
                /* The third argument is "abort": true for the ICMP types and
                 * codes that should make the pending request be dropped.
                 */
                tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
                                     type == ICMP_TIME_EXCEEDED ||
                                     (type == ICMP_DEST_UNREACH &&
                                      (code == ICMP_NET_UNREACH ||
                                       code == ICMP_HOST_UNREACH)));
                return 0;
        }

        if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
                sock_put(sk);
                return 0;
        }

        bh_lock_sock(sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         * We do take care of PMTU discovery (RFC1191) special case :
         * we can receive locally generated ICMP messages while socket is held.
         */
        if (sock_owned_by_user(sk)) {
                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
        }
        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (static_branch_unlikely(&ip4_min_ttl)) {
                /* min_ttl can be changed concurrently from do_ip_setsockopt() */
                if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
                        __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                        goto out;
                }
        }

        tp = tcp_sk(sk);
        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
        fastopen = rcu_dereference(tp->fastopen_rsk);
        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
        /* Ignore errors that quote a sequence number outside what is
         * currently in flight (not checked for listening sockets).
         */
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, snd_una, tp->snd_nxt)) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        /* Translate the ICMP type/code into an errno in err, or handle the
         * message completely here and jump to out.
         */
        switch (type) {
        case ICMP_REDIRECT:
                if (!sock_owned_by_user(sk))
                        do_redirect(skb, sk);
                goto out;
        case ICMP_SOURCE_QUENCH:
                /* Just silently ignore these. */
                goto out;
        case ICMP_PARAMETERPROB:
                err = EPROTO;
                break;
        case ICMP_DEST_UNREACH:
                if (code > NR_ICMP_UNREACH)
                        goto out;

                if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
                        /* We are not interested in TCP_LISTEN and open_requests
                         * (SYN-ACKs send out by Linux are always <576bytes so
                         * they should go through unfragmented).
                         */
                        if (sk->sk_state == TCP_LISTEN)
                                goto out;

                        WRITE_ONCE(tp->mtu_info, info);
                        if (!sock_owned_by_user(sk)) {
                                tcp_v4_mtu_reduced(sk);
                        } else {
                                /* Defer the MTU update; take an extra socket
                                 * reference only if the deferred flag was not
                                 * already set.
                                 */
                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
                                        sock_hold(sk);
                        }
                        goto out;
                }

                err = icmp_err_convert[code].errno;
                /* check if this ICMP message allows revert of backoff.
                 * (see RFC 6069)
                 */
                if (!fastopen &&
                    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
                        tcp_ld_RTO_revert(sk, seq);
                break;
        case ICMP_TIME_EXCEEDED:
                err = EHOSTUNREACH;
                break;
        default:
                goto out;
        }

        switch (sk->sk_state) {
        case TCP_SYN_SENT:
        case TCP_SYN_RECV:
                /* Only in fast or simultaneous open. If a fast open socket is
                 * already accepted it is treated as a connected one below.
                 */
                if (fastopen && !fastopen->sk)
                        break;

                ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

                /* Fail the connection attempt right away unless the user
                 * owns the socket, in which case only a soft error is kept.
                 */
                if (!sock_owned_by_user(sk))
                        tcp_done_with_error(sk, err);
                else
                        WRITE_ONCE(sk->sk_err_soft, err);
                goto out;
        }

        /* If we've already connected we will keep trying
         * until we time out, or the user gives up.
         *
         * rfc1122 4.2.3.9 allows to consider as hard errors
         * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
         * but it is obsoleted by pmtu discovery).
         *
         * Note, that in modern internet, where routing is unreliable
         * and in each dark corner broken firewalls sit, sending random
         * errors ordered by their masters even this two messages finally lose
         * their original sense (even Linux sends invalid PORT_UNREACHs)
         *
         * Now we are in compliance with RFCs.
         *                                                        --ANK (980905)
         */

        if (!sock_owned_by_user(sk) &&
            inet_test_bit(RECVERR, sk)) {
                WRITE_ONCE(sk->sk_err, err);
                sk_error_report(sk);
        } else        { /* Only an error on timeout */
                WRITE_ONCE(sk->sk_err_soft, err);
        }

out:
        bh_unlock_sock(sk);
        sock_put(sk);
        return 0;
}

/* Prepare the TCP checksum fields of @skb for the address pair
 * @saddr/@daddr: th->check receives the complement of tcp_v4_check() over the
 * segment length and addresses (zero base sum), and csum_start/csum_offset
 * record where the checksum field sits within the buffer.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct tcphdr *th = tcp_hdr(skb);
        __sum16 partial = tcp_v4_check(skb->len, saddr, daddr, 0);

        skb->csum_start = skb_transport_header(skb) - skb->head;
        skb->csum_offset = offsetof(struct tcphdr, check);
        th->check = ~partial;
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
        const struct inet_sock *inet = inet_sk(sk);

        __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

/* Number of 32-bit words in the option area of a reply header. */
#define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))

/* Add a TCP-AO option to an outgoing RST and sign it.
 *
 * @sk:            socket the offending segment matched, as passed by
 *                 tcp_v4_send_reset() (which also handles a NULL sk)
 * @skb:           the received segment being answered
 * @aoh:           TCP-AO option found in @skb
 * @arg:           reply descriptor; iov[0].iov_len grows by the aligned
 *                 option length
 * @reply:         TCP header of the reply; doff is updated to the new length
 * @reply_options: option area of the reply; word 0 receives the option
 *                 header, the hash is written starting at word 1
 *
 * Returns true when the reset must be dropped (key preparation or hashing
 * failed, or CONFIG_TCP_AO is disabled) and false when the reply has been
 * signed.
 */
static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
                                 const struct tcp_ao_hdr *aoh,
                                 struct ip_reply_arg *arg, struct tcphdr *reply,
                                 __be32 reply_options[REPLY_OPTIONS_LEN])
{
#ifdef CONFIG_TCP_AO
        int sdif = tcp_v4_sdif(skb);
        int dif = inet_iif(skb);
        /* l3index is the incoming interface only when sdif is set. */
        int l3index = sdif ? dif : 0;
        bool allocated_traffic_key;
        struct tcp_ao_key *key;
        char *traffic_key;
        bool drop = true;
        u32 ao_sne = 0;
        u8 keyid;

        rcu_read_lock();
        /* NOTE(review): the cleanup at "out" reads allocated_traffic_key even
         * when this call fails, so it relies on tcp_ao_prepare_reset()
         * always setting that out-parameter - confirm against the helper.
         */
        if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
                                 &key, &traffic_key, &allocated_traffic_key,
                                 &keyid, &ao_sne))
                goto out;

        /* Option header: kind, length, keyid (the peer's rnext_keyid) and
         * our rnext keyid, packed into one word.
         */
        reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
                                 (aoh->rnext_keyid << 8) | keyid);
        arg->iov[0].iov_len += tcp_ao_len_aligned(key);
        reply->doff = arg->iov[0].iov_len / 4;

        if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
                            key, traffic_key,
                            (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
                            (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
                            reply, ao_sne))
                goto out;
        drop = false;
out:
        rcu_read_unlock();
        /* Free the traffic key only if the helper allocated one for us. */
        if (allocated_traffic_key)
                kfree(traffic_key);
        return drop;
#else
        return true;
#endif
}

/*
 *        This routine will send an RST to the other tcp.
 *
 *        Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *                      for reset.
 *        Answer: if a packet caused RST, it is not for a socket
 *                existing in our system, if it is matched to a socket,
 *                it is just duplicate segment or bug in other side's TCP.
 *                So that we build reply only basing on parameters
 *                arrived with segment.
 *        Exception: precedence violation. We do not implement it in any case.
 *
 *        @sk:     socket the segment matched, or NULL; may be a TIME-WAIT or
 *                 other non-full socket (see the sk_fullsock() and
 *                 TCP_TIME_WAIT checks below)
 *        @skb:    the received segment that triggers the reset
 *        @reason: reset reason, only passed on to trace_tcp_send_reset()
 *
 *        No reset is sent when the segment itself is an RST, when there is
 *        no socket and the route is not local, when the auth options are
 *        malformed, or when TCP-AO/MD5 signing of the reply is not possible.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
                              enum sk_rst_reason reason)
{
        const struct tcphdr *th = tcp_hdr(skb);
        /* Reply header followed by room for options; sent as one iovec. */
        struct {
                struct tcphdr th;
                __be32 opt[REPLY_OPTIONS_LEN];
        } rep;
        const __u8 *md5_hash_location = NULL;
        const struct tcp_ao_hdr *aoh;
        struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key = NULL;
        unsigned char newhash[16];
        struct sock *sk1 = NULL;
        int genhash;
#endif
        u64 transmit_time = 0;
        struct sock *ctl_sk;
        struct net *net;
        u32 txhash = 0;

        /* Never send a reset in response to a reset. */
        if (th->rst)
                return;

        /* If sk not NULL, it means we did a successful lookup and incoming
         * route had to be correct. prequeue might have dropped our dst.
         */
        if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
                return;

        /* Swap the send and the receive. */
        memset(&rep, 0, sizeof(rep));
        rep.th.dest   = th->source;
        rep.th.source = th->dest;
        rep.th.doff   = sizeof(struct tcphdr) / 4;
        rep.th.rst    = 1;

        /* If the segment carried an ACK, the RST takes its sequence number
         * from that ACK field.  Otherwise the RST acknowledges the sequence
         * space the segment occupied (SYN, FIN and payload bytes).
         */
        if (th->ack) {
                rep.th.seq = th->ack_seq;
        } else {
                rep.th.ack = 1;
                rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
                                       skb->len - (th->doff << 2));
        }

        memset(&arg, 0, sizeof(arg));
        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);

        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);

        /* Invalid TCP option size or twice included auth */
        if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
                return;

        /* tcp_v4_ao_sign_reset() returns true when the reset must be dropped. */
        if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
                return;

#ifdef CONFIG_TCP_MD5SIG
        /* Held until the "out" label at the end of the function. */
        rcu_read_lock();
        if (sk && sk_fullsock(sk)) {
                const union tcp_md5_addr *addr;
                int l3index;

                /* sdif set, means packet ingressed via a device
                 * in an L3 domain and inet_iif is set to it.
                 */
                l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
                addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
                key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
        } else if (md5_hash_location) {
                const union tcp_md5_addr *addr;
                int sdif = tcp_v4_sdif(skb);
                int dif = inet_iif(skb);
                int l3index;

                /*
                 * active side is lost. Try to find listening socket through
                 * source port, and then find md5 key through listening socket.
                 * we are not loose security here:
                 * Incoming packet is checked with md5 hash with finding key,
                 * no RST generated if md5 hash doesn't match.
                 */
                sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
                                             NULL, 0, ip_hdr(skb)->saddr,
                                             th->source, ip_hdr(skb)->daddr,
                                             ntohs(th->source), dif, sdif);
                /* don't send rst if it can't find key */
                if (!sk1)
                        goto out;

                /* sdif set, means packet ingressed via a device
                 * in an L3 domain and dif is set to it.
                 */
                l3index = sdif ? dif : 0;
                addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
                key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
                if (!key)
                        goto out;


                /* Only answer if the segment's own MD5 signature verifies. */
                genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
                if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
                        goto out;

        }

        if (key) {
                rep.opt[0] = htonl((TCPOPT_NOP << 24) |
                                   (TCPOPT_NOP << 16) |
                                   (TCPOPT_MD5SIG << 8) |
                                   TCPOLEN_MD5SIG);
                /* Update length and the length the header thinks exists */
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
                                     key, ip_hdr(skb)->saddr,
                                     ip_hdr(skb)->daddr, &rep.th);
        }
#endif
        /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
        if (rep.opt[0] == 0) {
                __be32 mrst = mptcp_reset_option(skb);

                if (mrst) {
                        rep.opt[0] = mrst;
                        arg.iov[0].iov_len += sizeof(mrst);
                        rep.th.doff = arg.iov[0].iov_len / 4;
                }
        }

        /* Seed value for the reply checksum, computed over the addresses,
         * final reply length and protocol.
         */
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

        /* When socket is gone, all binding information is lost.
         * routing might fail in this case. No choice here, if we choose to force
         * input interface, we will misroute in case of asymmetric route.
         */
        if (sk)
                arg.bound_dev_if = sk->sk_bound_dev_if;

        trace_tcp_send_reset(sk, skb, reason);

        /* sk may be a TIME-WAIT socket: the sk_bound_dev_if read above is
         * only valid because both structures keep the field at the same
         * offset.
         */
        BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
                     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

        arg.tos = ip_hdr(skb)->tos;
        arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
        /* Borrow this CPU's control socket for the transmission; it is
         * pointed at the right namespace for the duration and handed back
         * to init_net afterwards.
         */
        local_bh_disable();
        local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
        ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

        sock_net_set(ctl_sk, net);
        if (sk) {
                /* TIME-WAIT sockets keep mark, priority and txhash in their
                 * own tw_* fields.
                 */
                ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
                                   inet_twsk(sk)->tw_mark : sk->sk_mark;
                ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
                                   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
                transmit_time = tcp_transmit_time(sk);
                xfrm_sk_clone_policy(ctl_sk, sk);
                txhash = (sk->sk_state == TCP_TIME_WAIT) ?
                         inet_twsk(sk)->tw_txhash : sk->sk_txhash;
        } else {
                ctl_sk->sk_mark = 0;
                ctl_sk->sk_priority = 0;
        }
        ip_send_unicast_reply(ctl_sk,
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len,
                              transmit_time, txhash);

        xfrm_sk_free_policy(ctl_sk);
        sock_net_set(ctl_sk, &init_net);
        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
        local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
        local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
        /* Reached both after sending and from the MD5 bail-out paths above. */
out:
        rcu_read_unlock();
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

/*
 * Build a bare ACK segment on the stack and send it in reply to @skb using
 * the per-CPU control socket (ipv4_tcp_sk) instead of @sk itself.
 *
 * @sk:          socket the reply is sent on behalf of; it is dereferenced
 *               unconditionally and may be a TIME-WAIT socket (the code
 *               switches to inet_twsk() accessors when sk_state says so).
 * @skb:         received segment; its ports and addresses are swapped to
 *               form the reply.
 * @seq, @ack:   sequence / acknowledgment numbers placed in the header.
 * @win:         window value placed in the header (truncated by htons()).
 * @tsval/@tsecr: timestamp option values; the option is emitted only when
 *               @tsecr is non-zero.
 * @oif:         if non-zero, used as the bound device for the reply.
 * @key:         selects MD5 or TCP-AO signing of the reply, if any.
 * @reply_flags: copied to ip_reply_arg.flags.
 * @tos:         copied to ip_reply_arg.tos.
 * @txhash:      forwarded to ip_send_unicast_reply().
 */
static void tcp_v4_send_ack(const struct sock *sk,
                            struct sk_buff *skb, u32 seq, u32 ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_key *key,
                            int reply_flags, u8 tos, u32 txhash)
{
        const struct tcphdr *th = tcp_hdr(skb);
        /* Reply header immediately followed by room for the TCP options. */
        struct {
                struct tcphdr th;
                __be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
        } rep;
        struct net *net = sock_net(sk);
        struct ip_reply_arg arg;
        struct sock *ctl_sk;
        u64 transmit_time;

        /* Only the fixed header is cleared; option words are written below
         * as they are needed.
         */
        memset(&rep.th, 0, sizeof(struct tcphdr));
        memset(&arg, 0, sizeof(arg));

        arg.iov[0].iov_base = (unsigned char *)&rep;
        arg.iov[0].iov_len  = sizeof(rep.th);
        if (tsecr) {
                /* NOP, NOP, TIMESTAMP: three 32-bit option words. */
                rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);
                rep.opt[1] = htonl(tsval);
                rep.opt[2] = htonl(tsecr);
                arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* Swap the send and the receive. */
        rep.th.dest    = th->source;
        rep.th.source  = th->dest;
        /* doff is the header length in 32-bit words. */
        rep.th.doff    = arg.iov[0].iov_len / 4;
        rep.th.seq     = htonl(seq);
        rep.th.ack_seq = htonl(ack);
        rep.th.ack     = 1;
        rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
        if (tcp_key_is_md5(key)) {
                /* The MD5 option goes after the timestamp words, if present. */
                int offset = (tsecr) ? 3 : 0;

                rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
                                          (TCPOPT_NOP << 16) |
                                          (TCPOPT_MD5SIG << 8) |
                                          TCPOLEN_MD5SIG);
                arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
                /* doff must be final before the header is hashed. */
                rep.th.doff = arg.iov[0].iov_len/4;

                tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
                                    key->md5_key, ip_hdr(skb)->saddr,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
#endif
#ifdef CONFIG_TCP_AO
        if (tcp_key_is_ao(key)) {
                /* The AO option goes after the timestamp words, if present. */
                int offset = (tsecr) ? 3 : 0;

                /* First option word: kind, length, send id, rcv_next. */
                rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
                                          (tcp_ao_len(key->ao_key) << 16) |
                                          (key->ao_key->sndid << 8) |
                                          key->rcv_next);
                arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
                rep.th.doff = arg.iov[0].iov_len / 4;

                tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
                                key->ao_key, key->traffic_key,
                                (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
                                (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
                                &rep.th, key->sne);
        }
#endif
        arg.flags = reply_flags;
        /* Partial checksum over the addresses, length and protocol; the
         * addresses are those of @skb with source and destination swapped.
         */
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
                                      arg.iov[0].iov_len, IPPROTO_TCP, 0);
        /* Byte offset of the check field divided by two. */
        arg.csumoffset = offsetof(struct tcphdr, check) / 2;
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
        /* Only a full socket is passed on for the uid lookup. */
        arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
        /* The per-CPU control socket is borrowed with BHs disabled and its
         * nested-BH local lock held until the reply has been sent.
         */
        local_bh_disable();
        local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
        ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
        /* Temporarily point the control socket at @sk's netns. */
        sock_net_set(ctl_sk, net);
        /* TIME-WAIT sockets keep mark/priority in their own tw_* fields. */
        ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
                           inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
        ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
                           inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
        transmit_time = tcp_transmit_time(sk);
        ip_send_unicast_reply(ctl_sk,
                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len,
                              transmit_time, txhash);

        /* Hand the control socket back pointing at init_net. */
        sock_net_set(ctl_sk, &init_net);
        __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
        local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
        local_bh_enable();
}

/*
 * Send an ACK on behalf of the TIME-WAIT socket @sk in reply to @skb, using
 * the sequence numbers, window, timestamps, bound device, tos and txhash
 * recorded in the timewait structures.
 *
 * A timewait reference is dropped with inet_twsk_put() on every path,
 * including the early return taken when the auth options in @skb fail to
 * parse.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
        struct tcp_key key = {};
#ifdef CONFIG_TCP_AO
        struct tcp_ao_info *ao_info;

        if (static_branch_unlikely(&tcp_ao_needed.key)) {
                /* FIXME: the segment to-be-acked is not verified yet */
                ao_info = rcu_dereference(tcptw->ao_info);
                if (ao_info) {
                        const struct tcp_ao_hdr *aoh;

                        /* Non-zero means the options could not be parsed:
                         * give up without sending anything.
                         */
                        if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
                                inet_twsk_put(tw);
                                return;
                        }

                        /* Look the key up by the peer's rnext_keyid. */
                        if (aoh)
                                key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1);
                }
        }
        /* key.ao_key can only be set above after ao_info was loaded and
         * found non-NULL, so ao_info is valid inside this branch.
         */
        if (key.ao_key) {
                struct tcp_ao_key *rnext_key;

                key.traffic_key = snd_other_key(key.ao_key);
                key.sne = READ_ONCE(ao_info->snd_sne);
                rnext_key = READ_ONCE(ao_info->rnext_key);
                key.rcv_next = rnext_key->rcvid;
                key.type = TCP_KEY_AO;
#else
        /* Without TCP-AO this dummy branch keeps the "} else if" below
         * syntactically valid.
         */
        if (0) {
#endif
        } else if (static_branch_tcp_md5()) {
                /* MD5 is only considered when no AO key was selected. */
                key.md5_key = tcp_twsk_md5_key(tcptw);
                if (key.md5_key)
                        key.type = TCP_KEY_MD5;
        }

        tcp_v4_send_ack(sk, skb,
                        tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        /* advertised window is right-shifted by the window scale */
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcp_tw_tsval(tcptw),
                        READ_ONCE(tcptw->tw_ts_recent),
                        tw->tw_bound_dev_if, &key,
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        tw->tw_tos,
                        tw->tw_txhash);

        inet_twsk_put(tw);
}

/*
 * Send an ACK for the connection request @req in reply to @skb.
 *
 * @sk:  socket owning the request; per the comment below it is either in
 *       TCP_LISTEN (regular SYN_RECV) or TCP_SYN_RECV (Fast Open).
 * @skb: received segment being answered.
 * @req: request sock supplying rcv_nxt, window, timestamps and txhash.
 *
 * When TCP-AO is in use for @req, no ACK is sent if the auth options cannot
 * be parsed, no AO header is present, no key matches the peer, or the
 * temporary traffic key cannot be allocated.
 */
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        struct tcp_key key = {};

        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
         */
        u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
                                             tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
        if (static_branch_unlikely(&tcp_ao_needed.key) &&
            tcp_rsk_used_ao(req)) {
                const union tcp_md5_addr *addr;
                const struct tcp_ao_hdr *aoh;
                int l3index;

                /* Invalid TCP option size or twice included auth */
                if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
                        return;
                if (!aoh)
                        return;

                /* The peer is identified by the source address of @skb. */
                addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
                /* Use the incoming interface index only when tcp_v4_sdif()
                 * is non-zero.
                 */
                l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
                /* First try the key the peer asked for via rnext_keyid. */
                key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
                                              aoh->rnext_keyid, -1);
                if (unlikely(!key.ao_key)) {
                        /* Send ACK with any matching MKT for the peer */
                        key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
                        /* Matching key disappeared (user removed the key?)
                         * let the handshake timeout.
                         */
                        if (!key.ao_key) {
                                net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
                                                     addr,
                                                     ntohs(tcp_hdr(skb)->source),
                                                     &ip_hdr(skb)->daddr,
                                                     ntohs(tcp_hdr(skb)->dest));
                                return;
                        }
                }
                /* Temporary buffer for the traffic key; it is freed at the
                 * end of this function once the ACK has been sent.
                 */
                key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
                if (!key.traffic_key)
                        return;

                key.type = TCP_KEY_AO;
                key.rcv_next = aoh->keyid;
                tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
        /* Without TCP-AO this dummy branch keeps the "} else if" below
         * syntactically valid.
         */
        if (0) {
#endif
        } else if (static_branch_tcp_md5()) {
                const union tcp_md5_addr *addr;
                int l3index;

                addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
                l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
                key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
                if (key.md5_key)
                        key.type = TCP_KEY_MD5;
        }

        tcp_v4_send_ack(sk, skb, seq,
                        tcp_rsk(req)->rcv_nxt,
                        tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
                        tcp_rsk_tsval(tcp_rsk(req)),
                        READ_ONCE(req->ts_recent),
                        0, &key,
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
                        ip_hdr(skb)->tos,
                        READ_ONCE(tcp_rsk(req)->txhash));
        /* Only the AO path allocated key.traffic_key above. */
        if (tcp_key_is_ao(&key))
                kfree(key.traffic_key);
}

/*
 *        Transmit a SYN-ACK in answer to a received SYN.
 *        Only the request_sock is involved here; no full child socket
 *        is touched.
 *
 *        Returns -1 when neither a route nor a SYN-ACK skb could be
 *        obtained, otherwise the result of ip_build_and_send_pkt()
 *        filtered through net_xmit_eval().
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                              struct flowi *fl,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
                              enum tcp_synack_type synack_type,
                              struct sk_buff *syn_skb)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct sk_buff *synack;
        struct flowi4 fl4;
        int rc;
        u8 tos;

        /* Look up a route unless the caller already supplied one. */
        if (!dst) {
                dst = inet_csk_route_req(sk, &fl4, req);
                if (!dst)
                        return -1;
        }

        synack = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
        if (!synack)
                return -1;

        __tcp_v4_send_check(synack, ireq->ir_loc_addr, ireq->ir_rmt_addr);

        tos = READ_ONCE(inet_sk(sk)->tos);
        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) {
                /* Non-ECN bits come from the SYN, ECN bits from the
                 * socket's own tos.
                 */
                tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
                      (tos & INET_ECN_MASK);
        }

        if (!INET_ECN_is_capable(tos) &&
            tcp_bpf_ca_needs_ecn((struct sock *)req))
                tos |= INET_ECN_ECT_0;

        /* ireq_opt is RCU-managed, hence the read-side section. */
        rcu_read_lock();
        rc = ip_build_and_send_pkt(synack, sk, ireq->ir_loc_addr,
                                   ireq->ir_rmt_addr,
                                   rcu_dereference(ireq->ireq_opt),
                                   tos);
        rcu_read_unlock();

        return net_xmit_eval(rc);
}

/*
 *        IPv4 request_sock destructor: frees the IP options saved in
 *        the request.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
        struct ip_options_rcu *saved_opt;

        saved_opt = rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1);
        kfree(saved_opt);
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Deferred static key, initially false, that is incremented when a socket
 * first gets MD5 signature info (see tcp_md5_do_add() and
 * tcp_md5_key_copy()).  NOTE(review): HZ is presumably the deferral
 * timeout in jiffies - confirm against the jump_label API.
 */
DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_SYMBOL(tcp_md5_needed);

/* Decide whether @new should replace @old as the best match so far.
 * Any key beats no key; a key with an l3index beats one without; otherwise
 * the strictly longer prefix wins.
 */
static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
        bool old_scoped, new_scoped;

        if (!old)
                return true;

        old_scoped = old->l3index != 0;
        new_scoped = new->l3index != 0;

        /* l3index always overrides non-l3index */
        if (old_scoped != new_scoped)
                return new_scoped;

        return new->prefixlen > old->prefixlen;
}

/* Find the best-matching MD5 key for an address.
 *
 * Walks every key attached to @sk, keeps those of the requested @family
 * whose prefix covers @addr and, unless @any_l3index is set, whose
 * interface-bound l3index equals @l3index.  Among the candidates the
 * winner is chosen by better_md5_match().  Returns NULL when the socket
 * has no MD5 info or nothing matches.
 */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
                                           const union tcp_md5_addr *addr,
                                           int family, bool any_l3index)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        const struct tcp_md5sig_info *info;
        struct tcp_md5sig_key *winner = NULL;
        struct tcp_md5sig_key *cur;

        /* caller either holds rcu_read_lock() or socket lock */
        info = rcu_dereference_check(tp->md5sig_info,
                                     lockdep_sock_is_held(sk));
        if (!info)
                return NULL;

        hlist_for_each_entry_rcu(cur, &info->head, node,
                                 lockdep_sock_is_held(sk)) {
                bool hit = false;

                if (cur->family != family)
                        continue;

                /* Interface-bound keys only apply to their own l3index. */
                if (!any_l3index &&
                    (cur->flags & TCP_MD5SIG_FLAG_IFINDEX) &&
                    cur->l3index != l3index)
                        continue;

                switch (family) {
                case AF_INET: {
                        __be32 netmask = inet_make_mask(cur->prefixlen);

                        hit = (cur->addr.a4.s_addr & netmask) ==
                              (addr->a4.s_addr & netmask);
                        break;
                }
#if IS_ENABLED(CONFIG_IPV6)
                case AF_INET6:
                        hit = ipv6_prefix_equal(&cur->addr.a6, &addr->a6,
                                                cur->prefixlen);
                        break;
#endif
                default:
                        /* Unknown family: never a match. */
                        break;
                }

                if (hit && better_md5_match(winner, cur))
                        winner = cur;
        }

        return winner;
}
EXPORT_SYMBOL(__tcp_md5_do_lookup);

/* Find the key on @sk whose family, address, prefix length, l3index and
 * TCP_MD5SIG_FLAG_IFINDEX setting all equal the given values.
 * Returns NULL when there is no such key or the socket has no MD5 info.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
                                                      const union tcp_md5_addr *addr,
                                                      int family, u8 prefixlen,
                                                      int l3index, u8 flags)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        const struct tcp_md5sig_info *info;
        struct tcp_md5sig_key *cur;
        unsigned int addr_len;

        /* caller either holds rcu_read_lock() or socket lock */
        info = rcu_dereference_check(tp->md5sig_info,
                                     lockdep_sock_is_held(sk));
        if (!info)
                return NULL;

        /* Number of address bytes to compare depends on the family. */
        addr_len = sizeof(struct in_addr);
#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6)
                addr_len = sizeof(struct in6_addr);
#endif

        hlist_for_each_entry_rcu(cur, &info->head, node,
                                 lockdep_sock_is_held(sk)) {
                bool same_scope;

                same_scope = (cur->flags & TCP_MD5SIG_FLAG_IFINDEX) ==
                             (flags & TCP_MD5SIG_FLAG_IFINDEX) &&
                             cur->l3index == l3index;

                if (cur->family == family && same_scope &&
                    cur->prefixlen == prefixlen &&
                    !memcmp(&cur->addr, addr, addr_len))
                        return cur;
        }

        return NULL;
}

/* Look up on @sk the MD5 key that applies to the IPv4 peer of @addr_sk,
 * using @addr_sk's destination address and the L3 master ifindex derived
 * from its bound device.
 */
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
                                         const struct sock *addr_sk)
{
        const union tcp_md5_addr *peer =
                (const union tcp_md5_addr *)&addr_sk->sk_daddr;
        int l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
                                                     addr_sk->sk_bound_dev_if);

        return tcp_md5_do_lookup(sk, l3index, peer, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* Allocate an empty MD5 key list for @sk and publish it in
 * tp->md5sig_info; GSO is disabled on the socket as part of this.
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
        struct tcp_md5sig_info *info = kmalloc(sizeof(*info), gfp);

        if (!info)
                return -ENOMEM;

        /* The list head must be initialised before the pointer is
         * published.
         */
        INIT_HLIST_HEAD(&info->head);
        sk_gso_disable(sk);
        rcu_assign_pointer(tcp_sk(sk)->md5sig_info, info);

        return 0;
}

/* Install a new MD5 key on @sk, or refresh the key material of an entry
 * that already exists for the same (addr, family, prefixlen, l3index,
 * flags) tuple.
 *
 * tp->md5sig_info is dereferenced without a NULL check when a new entry is
 * added; the callers in this file (tcp_md5_do_add() and tcp_md5_key_copy())
 * create it beforehand.
 *
 * Returns 0 on success, -ENOMEM if a new entry cannot be allocated.
 */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                            int family, u8 prefixlen, int l3index, u8 flags,
                            const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_info *info;
        struct tcp_md5sig_key *entry;
        size_t addr_len;

        entry = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
        if (entry) {
                /* Pre-existing entry - just update that one.
                 * Note that the key might be used concurrently.
                 * data_race() is telling kcsan that we do not care of
                 * key mismatches, since changing MD5 key on live flows
                 * can lead to packet drops.
                 */
                data_race(memcpy(entry->key, newkey, newkeylen));

                /* Pairs with READ_ONCE() in tcp_md5_hash_key().
                 * Also note that a reader could catch new key->keylen value
                 * but old key->key[], this is the reason we use __GFP_ZERO
                 * at sock_kmalloc() time below these lines.
                 */
                WRITE_ONCE(entry->keylen, newkeylen);

                return 0;
        }

        info = rcu_dereference_protected(tp->md5sig_info,
                                         lockdep_sock_is_held(sk));

        entry = sock_kmalloc(sk, sizeof(*entry), gfp | __GFP_ZERO);
        if (!entry)
                return -ENOMEM;

        if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
                addr_len = sizeof(struct in6_addr);
        else
                addr_len = sizeof(struct in_addr);

        /* Fill the entry completely before it becomes visible to readers. */
        entry->family = family;
        entry->prefixlen = prefixlen;
        entry->l3index = l3index;
        entry->flags = flags;
        memcpy(&entry->addr, addr, addr_len);
        memcpy(entry->key, newkey, newkeylen);
        entry->keylen = newkeylen;

        hlist_add_head_rcu(&entry->node, &info->head);
        return 0;
}

/* Add (or update) an MD5 key on @sk, allocating with GFP_KERNEL.
 *
 * If the socket has no MD5 info yet, this first acquires the MD5 sigpool,
 * creates the per-socket key list and increments the tcp_md5_needed static
 * key; each step is undone in reverse order if a later one fails.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or -EUSERS when the
 * static key could not be incremented.
 */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
                   int family, u8 prefixlen, int l3index, u8 flags,
                   const u8 *newkey, u8 newkeylen)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* First key on this socket: set up the shared infrastructure. */
        if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
                if (tcp_md5_alloc_sigpool())
                        return -ENOMEM;

                if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
                        tcp_md5_release_sigpool();
                        return -ENOMEM;
                }

                if (!static_branch_inc(&tcp_md5_needed.key)) {
                        struct tcp_md5sig_info *md5sig;

                        /* Unpublish the list just created and free it after
                         * a grace period, then drop the sigpool reference.
                         */
                        md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
                        rcu_assign_pointer(tp->md5sig_info, NULL);
                        kfree_rcu(md5sig, rcu);
                        tcp_md5_release_sigpool();
                        return -EUSERS;
                }
        }

        return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
                                newkey, newkeylen, GFP_KERNEL);
}
EXPORT_SYMBOL(tcp_md5_do_add);

/* Copy the existing MD5 key @key onto @sk for peer @addr, allocating with
 * GFP_ATOMIC (filtered through sk_gfp_mask()).
 *
 * Mirrors tcp_md5_do_add(), except that the sigpool is taken with
 * tcp_md5_add_sigpool() (whose result is not checked here) and the static
 * key is bumped with static_key_fast_inc_not_disabled().
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or -EUSERS when the
 * static key could not be incremented.
 */
int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
                     int family, u8 prefixlen, int l3index,
                     struct tcp_md5sig_key *key)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* First key on this socket: set up the shared infrastructure. */
        if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
                tcp_md5_add_sigpool();

                if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
                        tcp_md5_release_sigpool();
                        return -ENOMEM;
                }

                if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
                        struct tcp_md5sig_info *md5sig;

                        /* Unpublish the list just created and free it after
                         * a grace period, then drop the sigpool reference.
                         */
                        md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
                        net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
                        rcu_assign_pointer(tp->md5sig_info, NULL);
                        kfree_rcu(md5sig, rcu);
                        tcp_md5_release_sigpool();
                        return -EUSERS;
                }
        }

        /* Flags, key bytes and key length are taken from the source key. */
        return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
                                key->flags, key->key, key->keylen,
                                sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_SYMBOL(tcp_md5_key_copy);

/* Remove the MD5 key on @sk that exactly matches the given tuple.
 * Returns 0 when a key was removed, -ENOENT when none matched.
 */
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
                   u8 prefixlen, int l3index, u8 flags)
{
        struct tcp_md5sig_key *victim =
                tcp_md5_do_lookup_exact(sk, addr, family, prefixlen,
                                        l3index, flags);

        if (victim) {
                /* Unlink, give the memory accounting back to the socket,
                 * and free the entry after a grace period.
                 */
                hlist_del_rcu(&victim->node);
                atomic_sub(sizeof(*victim), &sk->sk_omem_alloc);
                kfree_rcu(victim, rcu);
                return 0;
        }

        return -ENOENT;
}
EXPORT_SYMBOL(tcp_md5_do_del);

void tcp_clear_md5_list(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        struct hlist_node *n;
        struct tcp_md5sig_info *md5sig;

        md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

        hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
                hlist_del_rcu(&key->node);
                atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
                kfree_rcu(key, rcu);
        }
}

/* Handle the TCP_MD5SIG / TCP_MD5SIG_EXT socket options for IPv4.
 *
 * Copies a struct tcp_md5sig from @optval, validates it and then either
 * deletes the matching key (zero key length) or adds/updates one.
 *
 * Returns 0 on success or a negative errno: -EINVAL for a short option,
 * wrong address family, prefix longer than 32, unusable ifindex or
 * over-long key; -EFAULT if the copy fails; -EKEYREJECTED if a TCP-AO key
 * is required for the peer; otherwise whatever the add/delete helper
 * returns.
 */
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
                                 sockptr_t optval, int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
        const union tcp_md5_addr *addr;
        bool want_ifindex;
        u8 prefixlen = 32;
        int l3index = 0;
        u8 flags;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin->sin_family != AF_INET)
                return -EINVAL;

        /* Only the IFINDEX flag is stored with the key. */
        flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
        want_ifindex = (cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) != 0;

        /* Prefix and ifindex are only honoured for the extended option. */
        if (optname == TCP_MD5SIG_EXT) {
                if (cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
                        prefixlen = cmd.tcpm_prefixlen;
                        if (prefixlen > 32)
                                return -EINVAL;
                }

                if (cmd.tcpm_ifindex && want_ifindex) {
                        struct net_device *dev;

                        rcu_read_lock();
                        dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
                        if (dev && netif_is_l3_master(dev))
                                l3index = dev->ifindex;
                        rcu_read_unlock();

                        /* right now device MUST be an L3 master; l3index
                         * stays 0 both when the device is missing and when
                         * it is not an L3 master.
                         */
                        if (!l3index)
                                return -EINVAL;
                }
        }

        addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

        /* A zero key length is a request to delete the key. */
        if (!cmd.tcpm_keylen)
                return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        /* Don't allow keys for peers that have a matching TCP-AO key.
         * See the comment in tcp_ao_add_cmd()
         */
        if (tcp_ao_required(sk, addr, AF_INET, want_ifindex ? l3index : -1, false))
                return -EKEYREJECTED;

        return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
                              cmd.tcpm_key, cmd.tcpm_keylen);
}

/* Feed the IPv4 pseudo-header and a copy of the TCP header @th (with its
 * checksum field zeroed) into the hash request of @hp.  Both are built in
 * hp->scratch.  @nbytes is stored as the pseudo-header length.
 * Returns the result of crypto_ahash_update().
 */
static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
                                   __be32 daddr, __be32 saddr,
                                   const struct tcphdr *th, int nbytes)
{
        struct tcp4_pseudohdr *pseudo = hp->scratch;
        /* The TCP header copy sits directly behind the pseudo-header. */
        struct tcphdr *th_copy = (struct tcphdr *)(pseudo + 1);
        unsigned int hashed_len = sizeof(*pseudo) + sizeof(*th);
        struct scatterlist sg;

        pseudo->saddr = saddr;
        pseudo->daddr = daddr;
        pseudo->pad = 0;
        pseudo->protocol = IPPROTO_TCP;
        pseudo->len = cpu_to_be16(nbytes);

        memcpy(th_copy, th, sizeof(*th));
        th_copy->check = 0;

        sg_init_one(&sg, pseudo, hashed_len);
        ahash_request_set_crypt(hp->req, &sg, NULL, hashed_len);

        return crypto_ahash_update(hp->req);
}

/* Compute the MD5 signature over the pseudo-header, the TCP header @th and
 * @key, writing the digest to @md5_hash.
 * Returns 0 on success; on any failure the first 16 bytes of @md5_hash are
 * zeroed and 1 is returned.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
        struct tcp_sigpool hp;

        if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
                goto clear_hash_nostart;

        /* Steps run in order; the first failure aborts the chain. */
        if (crypto_ahash_init(hp.req) ||
            tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2) ||
            tcp_md5_hash_key(&hp, key))
                goto clear_hash;

        ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
        if (crypto_ahash_final(hp.req))
                goto clear_hash;

        tcp_sigpool_end(&hp);
        return 0;

clear_hash:
        tcp_sigpool_end(&hp);
clear_hash_nostart:
        memset(md5_hash, 0, 16);
        return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
                        const struct sock *sk,
                        const struct sk_buff *skb)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct tcp_sigpool hp;
        __be32 saddr, daddr;

        if (sk) { /* valid for establish/request sockets */
                saddr = sk->sk_rcv_saddr;
                daddr = sk->sk_daddr;
        } else {
                const struct iphdr *iph = ip_hdr(skb);
                saddr = iph->saddr;
                daddr = iph->daddr;
        }

        if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
                goto clear_hash_nostart;

        if (crypto_ahash_init(hp.req))
                goto clear_hash;

        if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
                goto clear_hash;
        if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
                goto clear_hash;
        if (tcp_md5_hash_key(&hp, key))
                goto clear_hash;
        ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
        if (crypto_ahash_final(hp.req))
                goto clear_hash;

        tcp_sigpool_end(&hp);
        return 0;

clear_hash:
        tcp_sigpool_end(&hp);
clear_hash_nostart:
        memset(md5_hash, 0, 16);
        return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif

static void tcp_v4_init_req(struct request_sock *req,
                            const struct sock *sk_listener,
                            struct sk_buff *skb)
{
        struct inet_request_sock *ireq = inet_rsk(req);
        struct net *net = sock_net(sk_listener);

        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

/* Initialise @req from @skb, run the security hook, and look up a route
 * for the request.  Returns NULL when the security hook rejects the
 * request, otherwise the result of inet_csk_route_req().  @tw_isn is not
 * used here.
 */
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
                                          struct sk_buff *skb,
                                          struct flowi *fl,
                                          struct request_sock *req,
                                          u32 tw_isn)
{
        tcp_v4_init_req(req, sk, skb);

        return security_inet_conn_request(sk, skb, req) ?
                NULL : inet_csk_route_req(sk, &fl->u.ip4, req);
}

/* Request-sock operations for IPv4 TCP; passed to tcp_conn_request() by
 * tcp_v4_conn_request().
 */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family                =        PF_INET,
        /* size of the per-request object */
        .obj_size        =        sizeof(struct tcp_request_sock),
        .rtx_syn_ack        =        tcp_rtx_synack,
        .send_ack        =        tcp_v4_reqsk_send_ack,
        /* frees the saved IP options */
        .destructor        =        tcp_v4_reqsk_destructor,
        .send_reset        =        tcp_v4_send_reset,
        .syn_ack_timeout =        tcp_syn_ack_timeout,
};

/* IPv4-specific TCP request-sock hooks; passed to tcp_conn_request()
 * together with tcp_request_sock_ops.  The MD5, TCP-AO and syncookie
 * hooks are only present when the matching config option is enabled.
 */
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .mss_clamp        =        TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
        .req_md5_lookup        =        tcp_v4_md5_lookup,
        .calc_md5_hash        =        tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
        .ao_lookup        =        tcp_v4_ao_lookup_rsk,
        .ao_calc_key        =        tcp_v4_ao_calc_key_rsk,
        .ao_synack_hash        =        tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
        .cookie_init_seq =        cookie_v4_init_sequence,
#endif
        /* initialises the request from the SYN and routes it */
        .route_req        =        tcp_v4_route_req,
        .init_seq        =        tcp_v4_init_seq,
        .init_ts_off        =        tcp_v4_init_ts_off,
        .send_synack        =        tcp_v4_send_synack,
};

/* Entry point for an incoming IPv4 connection request on listener @sk.
 * Requests whose route is flagged broadcast or multicast are counted as
 * listen drops and ignored (returning 0); everything else is handed to
 * tcp_conn_request() with the IPv4 operation tables.
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
        /* Never answer SYNs sent to broadcast or multicast */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                tcp_listendrop(sk);
                return 0;
        }

        return tcp_conn_request(&tcp_request_sock_ops,
                                &tcp_request_sock_ipv4_ops, sk, skb);
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * @sk:         listening socket the request belongs to.
 * @skb:        segment that triggered creation of the child.
 * @req:        request sock describing the connection.
 * @dst:        route to use, or NULL to look one up here.
 * @req_unhash: request to replace in the established hash (passed to
 *              inet_ehash_nolisten()); NULL in the syncookie case.
 * @own_req:    set to the result of inet_ehash_nolisten(); not written on
 *              the error paths taken before that call.
 *
 * Returns the new child socket, or NULL on failure (in which case a listen
 * drop is accounted) or when a duplicate socket was found in the syncookie
 * case.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst,
                                  struct request_sock *req_unhash,
                                  bool *own_req)
{
        struct inet_request_sock *ireq;
        bool found_dup_sk = false;
        struct inet_sock *newinet;
        struct tcp_sock *newtp;
        struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
        const union tcp_md5_addr *addr;
        struct tcp_md5sig_key *key;
        int l3index;
#endif
        struct ip_options_rcu *inet_opt;

        /* Refuse when the listener's accept queue is full. */
        if (sk_acceptq_is_full(sk))
                goto exit_overflow;

        newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk)
                goto exit_nonewsk;

        newsk->sk_gso_type = SKB_GSO_TCPV4;
        inet_sk_rx_dst_set(newsk, skb);

        newtp                      = tcp_sk(newsk);
        newinet                      = inet_sk(newsk);
        ireq                      = inet_rsk(req);
        /* Addressing of the child comes from the request sock. */
        sk_daddr_set(newsk, ireq->ir_rmt_addr);
        sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
        newsk->sk_bound_dev_if = ireq->ir_iif;
        newinet->inet_saddr   = ireq->ir_loc_addr;
        /* The child shares the request's saved IP options for now; which
         * of the two keeps the pointer is settled after the hash insert.
         */
        inet_opt              = rcu_dereference(ireq->ireq_opt);
        RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
        newinet->mc_index     = inet_iif(skb);
        newinet->mc_ttl              = ip_hdr(skb)->ttl;
        newinet->rcv_tos      = ip_hdr(skb)->tos;
        inet_csk(newsk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
        atomic_set(&newinet->inet_id, get_random_u16());

        /* Set ToS of the new socket based upon the value of incoming SYN.
         * ECT bits are set later in tcp_init_transfer().
         */
        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
                newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

        if (!dst) {
                dst = inet_csk_route_child_sock(sk, newsk, req);
                if (!dst)
                        goto put_and_exit;
        } else {
                /* syncookie case : see end of cookie_v4_check() */
        }
        sk_setup_caps(newsk, dst);

        tcp_ca_openreq_child(newsk, dst);

        tcp_sync_mss(newsk, dst_mtu(dst));
        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

        tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
        l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
        /* Copy over the MD5 key from the original socket */
        addr = (union tcp_md5_addr *)&newinet->inet_daddr;
        key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
        /* The MD5 key is not copied when the request used TCP-AO. */
        if (key && !tcp_rsk_used_ao(req)) {
                /* The key is installed on the child as a /32 entry. */
                if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
                        goto put_and_exit;
                sk_gso_disable(newsk);
        }
#endif
#ifdef CONFIG_TCP_AO
        if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
                goto put_and_exit; /* OOM, release back memory */
#endif

        if (__inet_inherit_port(sk, newsk) < 0)
                goto put_and_exit;
        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
                                       &found_dup_sk);
        if (likely(*own_req)) {
                tcp_move_syn(newtp, req);
                /* The child now owns the IP options. */
                ireq->ireq_opt = NULL;
        } else {
                /* The request keeps the IP options; detach them from the
                 * child.
                 */
                newinet->inet_opt = NULL;

                if (!req_unhash && found_dup_sk) {
                        /* This code path should only be executed in the
                         * syncookie case
                         */
                        bh_unlock_sock(newsk);
                        sock_put(newsk);
                        newsk = NULL;
                }
        }
        return newsk;

exit_overflow:
        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
        dst_release(dst);
exit:
        tcp_listendrop(sk);
        return NULL;
put_and_exit:
        /* Detach the shared IP options before tearing the child down; they
         * remain referenced by the request (presumably to avoid freeing
         * them twice - confirm against the child destructor).
         */
        newinet->inet_opt = NULL;
        inet_csk_prepare_forced_close(newsk);
        tcp_done(newsk);
        goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* Run SYN-cookie validation for a segment that reached a listener.
 *
 * Segments with SYN set, and all segments when CONFIG_SYN_COOKIES is off,
 * pass through unchanged and @sk is returned. Otherwise the result of
 * cookie_v4_check() is returned.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
        /* Only non-SYN segments are handed to the cookie check. */
        if (tcp_hdr(skb)->syn)
                return sk;

        return cookie_v4_check(sk, skb);
#else
        return sk;
#endif
}

/* Compute a SYN cookie for the given IPv4/TCP headers.
 *
 * Returns the MSS reported by tcp_get_syncookie_mss(). When that MSS is
 * non-zero, *@cookie is filled in and tcp_synq_overflow() is called on @sk.
 * Returns 0, leaving *@cookie untouched, when the MSS is zero or when
 * CONFIG_SYN_COOKIES is disabled.
 */
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
                         struct tcphdr *th, u32 *cookie)
{
#ifdef CONFIG_SYN_COOKIES
        u16 mss;

        mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
                                    &tcp_request_sock_ipv4_ops, sk, th);
        if (!mss)
                return 0;

        *cookie = __cookie_v4_init_sequence(iph, th, &mss);
        tcp_synq_overflow(sk);
        return mss;
#else
        return 0;
#endif
}

/* Declare ipv4_dst_check() so that the INDIRECT_CALL_1() in tcp_v4_do_rcv()
 * can name it as the expected target of dst->ops->check.
 */
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
                                                           u32));
/* Process a received IPv4 TCP segment for @sk, dispatching on sk->sk_state.
 *
 * The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Always returns 0.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
        enum skb_drop_reason reason;
        struct sock *rsk;

        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
                struct dst_entry *dst;

                dst = rcu_dereference_protected(sk->sk_rx_dst,
                                                lockdep_sock_is_held(sk));

                sock_rps_save_rxhash(sk, skb);
                sk_mark_napi_id(sk, skb);
                if (dst) {
                        /* Forget the cached input route when the segment
                         * arrived on another interface or the route check
                         * fails.
                         */
                        if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
                            !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
                                             dst, 0)) {
                                RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
                                dst_release(dst);
                        }
                }
                tcp_rcv_established(sk, skb);
                return 0;
        }

        /* Every other state: the checksum must be good before going on. */
        if (tcp_checksum_complete(skb))
                goto csum_err;

        if (sk->sk_state == TCP_LISTEN) {
                struct sock *nsk = tcp_v4_cookie_check(sk, skb);

                /* NULL from the cookie check: nothing more is done here.
                 * NOTE(review): presumably the skb was already disposed of
                 * by cookie_v4_check() - confirm there.
                 */
                if (!nsk)
                        return 0;
                if (nsk != sk) {
                        /* The cookie check returned a different socket:
                         * let it process the segment.
                         */
                        reason = tcp_child_process(sk, nsk, skb);
                        if (reason) {
                                rsk = nsk;
                                goto reset;
                        }
                        return 0;
                }
        } else
                sock_rps_save_rxhash(sk, skb);

        reason = tcp_rcv_state_process(sk, skb);
        if (reason) {
                rsk = sk;
                goto reset;
        }
        return 0;

reset:
        /* rsk is the socket the reset is sent for: nsk or sk itself. */
        tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
        sk_skb_reason_drop(sk, skb, reason);
        /* Be careful here. If this function gets more complicated and
         * gcc suffers from register pressure on the x86, sk (in %ebx)
         * might be destroyed here. This current version compiles correctly,
         * but you have been warned.
         */
        return 0;

csum_err:
        /* Bad checksum: trace it, bump both error counters, then drop. */
        reason = SKB_DROP_REASON_TCP_CSUM;
        trace_tcp_bad_csum(skb);
        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
        goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

/* Early demultiplexing: try to attach an established socket, and its cached
 * input route, to @skb.
 *
 * Does nothing for packets not addressed to this host, for packets whose
 * TCP header cannot be pulled, or when doff is smaller than the fixed
 * header. Always returns 0.
 */
int tcp_v4_early_demux(struct sk_buff *skb)
{
        struct net *net = dev_net(skb->dev);
        const struct iphdr *iph;
        const struct tcphdr *th;
        struct dst_entry *dst;
        struct sock *sk;

        if (skb->pkt_type != PACKET_HOST)
                return 0;

        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
                return 0;

        iph = ip_hdr(skb);
        th = tcp_hdr(skb);

        if (th->doff < sizeof(struct tcphdr) / 4)
                return 0;

        sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
                                       iph->saddr, th->source,
                                       iph->daddr, ntohs(th->dest),
                                       skb->skb_iif, inet_sdif(skb));
        if (!sk)
                return 0;

        /* Hand the socket to the skb; sock_edemux is its destructor. */
        skb->sk = sk;
        skb->destructor = sock_edemux;

        /* Only full sockets have a cached rx dst to consult. */
        if (!sk_fullsock(sk))
                return 0;

        dst = rcu_dereference(sk->sk_rx_dst);
        if (dst)
                dst = dst_check(dst, 0);

        /* Reuse the cached route only if it was learned on this interface. */
        if (dst && sk->sk_rx_dst_ifindex == skb->skb_iif)
                skb_dst_set_noref(skb, dst);

        return 0;
}

/* Queue @skb on the backlog of @sk, first trying to merge it into the last
 * skb already queued there.
 *
 * Returns false when the skb was queued or coalesced into the backlog tail.
 * Returns true when the skb was rejected (bad checksum or backlog limit
 * exceeded): *@reason is set and the socket has already been unlocked with
 * bh_unlock_sock(), so the caller must not unlock it again.
 */
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
                     enum skb_drop_reason *reason)
{
        u32 tail_gso_size, tail_gso_segs;
        struct skb_shared_info *shinfo;
        const struct tcphdr *th;
        struct tcphdr *thtail;
        struct sk_buff *tail;
        unsigned int hdrlen;
        bool fragstolen;
        u32 gso_segs;
        u32 gso_size;
        u64 limit;
        int delta;

        /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
         * we can fix skb->truesize to its real value to avoid future drops.
         * This is valid because skb is not yet charged to the socket.
         * It has been noticed pure SACK packets were sometimes dropped
         * (if cooked by drivers without copybreak feature).
         */
        skb_condense(skb);

        skb_dst_drop(skb);

        if (unlikely(tcp_checksum_complete(skb))) {
                bh_unlock_sock(sk);
                trace_tcp_bad_csum(skb);
                *reason = SKB_DROP_REASON_TCP_CSUM;
                __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
                __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
                return true;
        }

        /* Attempt coalescing to last skb in backlog, even if we are
         * above the limits.
         * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
         */
        th = (const struct tcphdr *)skb->data;
        hdrlen = th->doff * 4;

        tail = sk->sk_backlog.tail;
        if (!tail)
                goto no_coalesce;
        thtail = (struct tcphdr *)tail->data;

        /* Coalescing is attempted only when all of the following hold:
         * - skb starts exactly where tail ends in sequence space,
         * - both carry the same IP DS field,
         * - neither has SYN, RST or URG set,
         * - both have ACK set,
         * - their ECE/CWR bits are identical,
         * - tcp_skb_can_collapse_rx() accepts the pair,
         * - both TCP headers have the same length and identical bytes
         *   after the fixed header (the options area).
         */
        if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
            TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
            ((TCP_SKB_CB(tail)->tcp_flags |
              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
            !((TCP_SKB_CB(tail)->tcp_flags &
              TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
            ((TCP_SKB_CB(tail)->tcp_flags ^
              TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
            !tcp_skb_can_collapse_rx(tail, skb) ||
            thtail->doff != th->doff ||
            memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
                goto no_coalesce;

        /* Strip the TCP header so that only payload is appended to tail. */
        __skb_pull(skb, hdrlen);

        /* Non-GSO skbs count as one segment of their own payload size. */
        shinfo = skb_shinfo(skb);
        gso_size = shinfo->gso_size ?: skb->len;
        gso_segs = shinfo->gso_segs ?: 1;

        /* From here on shinfo refers to tail; tail still has its header. */
        shinfo = skb_shinfo(tail);
        tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
        tail_gso_segs = shinfo->gso_segs ?: 1;

        if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
                TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;

                /* Take the ACK and window from skb unless its ack_seq is
                 * older than the one already recorded in tail.
                 */
                if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
                        TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
                        thtail->window = th->window;
                }

                /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
                 * thtail->fin, so that the fast path in tcp_rcv_established()
                 * is not entered if we append a packet with a FIN.
                 * SYN, RST, URG are not present.
                 * ACK is set on both packets.
                 * PSH : we do not really care in TCP stack,
                 *       at least for 'GRO' packets.
                 */
                thtail->fin |= th->fin;
                TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;

                if (TCP_SKB_CB(skb)->has_rxtstamp) {
                        TCP_SKB_CB(tail)->has_rxtstamp = true;
                        tail->tstamp = skb->tstamp;
                        skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
                }

                /* Not as strict as GRO. We only need to carry mss max value */
                shinfo->gso_size = max(gso_size, tail_gso_size);
                /* Segment count saturates at 0xFFFF. */
                shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);

                sk->sk_backlog.len += delta;
                __NET_INC_STATS(sock_net(sk),
                                LINUX_MIB_TCPBACKLOGCOALESCE);
                kfree_skb_partial(skb, fragstolen);
                return false;
        }
        /* Coalescing failed: put the header back before queueing skb whole. */
        __skb_push(skb, hdrlen);

no_coalesce:
        /* sk->sk_backlog.len is reset only at the end of __release_sock().
         * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
         * sk_rcvbuf in normal conditions.
         */
        limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;

        limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;

        /* Only socket owner can try to collapse/prune rx queues
         * to reduce memory overhead, so add a little headroom here.
         * Few sockets backlog are possibly concurrently non empty.
         */
        limit += 64 * 1024;

        /* limit = 2 * sk_rcvbuf + sk_sndbuf / 2 + 64KB, capped at UINT_MAX. */
        limit = min_t(u64, limit, UINT_MAX);

        if (unlikely(sk_add_backlog(sk, skb, limit))) {
                bh_unlock_sock(sk);
                *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
                return true;
        }
        return false;
}
EXPORT_SYMBOL(tcp_add_backlog);

int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
        struct tcphdr *th = (struct tcphdr *)skb->data;

        return sk_filter_trim_cap(sk, skb, th->doff * 4);
}
EXPORT_SYMBOL(tcp_filter);

/* Undo tcp_v4_fill_cb(): copy the saved IPv4 control block from
 * TCP_SKB_CB(skb)->header.h4 back to its IPCB() location.
 */
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
        struct inet_skb_parm *ip_cb = IPCB(skb);

        /* The two areas may overlap inside skb->cb, hence memmove(). */
        memmove(ip_cb, &TCP_SKB_CB(skb)->header.h4, sizeof(*ip_cb));
}

static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
                           const struct tcphdr *th)
{
        /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
         * barrier() makes sure compiler wont play fool^Waliasing games.
         */
        memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
                sizeof(struct inet_skb_parm));
        barrier();

        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                    skb->len - th->doff * 4);
        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
        TCP_SKB_CB(skb)->sacked         = 0;
        TCP_SKB_CB(skb)->has_rxtstamp =
                        skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}

/*
 *        From tcp_input.c
 */

/* Entry point for an incoming IPv4 TCP segment.
 *
 * Validates the TCP header, looks up the socket the segment belongs to and
 * then either processes the segment through tcp_v4_do_rcv(), queues it on
 * the socket backlog, or drops it.
 *
 * Returns the value of tcp_v4_do_rcv() when that function is reached from
 * the "process" path, and 0 on every other path.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
        struct net *net = dev_net(skb->dev);
        enum skb_drop_reason drop_reason;
        int sdif = inet_sdif(skb);
        int dif = inet_iif(skb);
        const struct iphdr *iph;
        const struct tcphdr *th;
        struct sock *sk = NULL;
        bool refcounted;
        int ret;
        u32 isn;

        drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
        if (skb->pkt_type != PACKET_HOST)
                goto discard_it;

        /* Count it even if it's bad */
        __TCP_INC_STATS(net, TCP_MIB_INSEGS);

        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
                goto discard_it;

        th = (const struct tcphdr *)skb->data;

        /* doff is in 32-bit words; reject headers shorter than the fixed one. */
        if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
                drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
                goto bad_packet;
        }
        if (!pskb_may_pull(skb, th->doff * 4))
                goto discard_it;

        /* An explanation is required here, I think.
         * Packet length and doff are validated by header prediction,
         * provided case of th->doff==0 is eliminated.
         * So, we defer the checks. */

        if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
                goto csum_error;

        /* Reload the header pointers after the pulls above. */
        th = (const struct tcphdr *)skb->data;
        iph = ip_hdr(skb);
lookup:
        sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
                               skb, __tcp_hdrlen(th), th->source,
                               th->dest, sdif, &refcounted);
        if (!sk)
                goto no_tcp_socket;

        if (sk->sk_state == TCP_TIME_WAIT)
                goto do_time_wait;

        if (sk->sk_state == TCP_NEW_SYN_RECV) {
                /* The lookup returned a request sock; from here on sk is
                 * its listener.
                 */
                struct request_sock *req = inet_reqsk(sk);
                bool req_stolen = false;
                struct sock *nsk;

                sk = req->rsk_listener;
                if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
                        drop_reason = SKB_DROP_REASON_XFRM_POLICY;
                else
                        drop_reason = tcp_inbound_hash(sk, req, skb,
                                                       &iph->saddr, &iph->daddr,
                                                       AF_INET, dif, sdif);
                if (unlikely(drop_reason)) {
                        sk_drops_add(sk, skb);
                        reqsk_put(req);
                        goto discard_it;
                }
                if (tcp_checksum_complete(skb)) {
                        reqsk_put(req);
                        goto csum_error;
                }
                if (unlikely(sk->sk_state != TCP_LISTEN)) {
                        /* The listener is no longer listening: try to
                         * migrate the request, otherwise drop the request
                         * and redo the lookup.
                         */
                        nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
                        if (!nsk) {
                                inet_csk_reqsk_queue_drop_and_put(sk, req);
                                goto lookup;
                        }
                        sk = nsk;
                        /* reuseport_migrate_sock() has already held one sk_refcnt
                         * before returning.
                         */
                } else {
                        /* We own a reference on the listener, increase it again
                         * as we might lose it too soon.
                         */
                        sock_hold(sk);
                }
                /* Either branch above left us holding a reference on sk. */
                refcounted = true;
                nsk = NULL;
                if (!tcp_filter(sk, skb)) {
                        /* Reload header pointers after the filter ran. */
                        th = (const struct tcphdr *)skb->data;
                        iph = ip_hdr(skb);
                        tcp_v4_fill_cb(skb, iph, th);
                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
                } else {
                        drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
                }
                if (!nsk) {
                        reqsk_put(req);
                        if (req_stolen) {
                                /* Another cpu got exclusive access to req
                                 * and created a full blown socket.
                                 * Try to feed this packet to this socket
                                 * instead of discarding it.
                                 */
                                tcp_v4_restore_cb(skb);
                                sock_put(sk);
                                goto lookup;
                        }
                        goto discard_and_relse;
                }
                nf_reset_ct(skb);
                if (nsk == sk) {
                        /* tcp_check_req() returned the listener itself:
                         * continue with the normal "process" path below.
                         */
                        reqsk_put(req);
                        tcp_v4_restore_cb(skb);
                } else {
                        drop_reason = tcp_child_process(sk, nsk, skb);
                        if (drop_reason) {
                                enum sk_rst_reason rst_reason;

                                rst_reason = sk_rst_convert_drop_reason(drop_reason);
                                tcp_v4_send_reset(nsk, skb, rst_reason);
                                goto discard_and_relse;
                        }
                        sock_put(sk);
                        return 0;
                }
        }

process:
        if (static_branch_unlikely(&ip4_min_ttl)) {
                /* min_ttl can be changed concurrently from do_ip_setsockopt() */
                if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
                        __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                        drop_reason = SKB_DROP_REASON_TCP_MINTTL;
                        goto discard_and_relse;
                }
        }

        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
                drop_reason = SKB_DROP_REASON_XFRM_POLICY;
                goto discard_and_relse;
        }

        drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
                                       AF_INET, dif, sdif);
        if (drop_reason)
                goto discard_and_relse;

        nf_reset_ct(skb);

        if (tcp_filter(sk, skb)) {
                drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
                goto discard_and_relse;
        }
        /* Reload header pointers after the filter ran. */
        th = (const struct tcphdr *)skb->data;
        iph = ip_hdr(skb);
        tcp_v4_fill_cb(skb, iph, th);

        skb->dev = NULL;

        /* Listeners are processed without taking the socket lock here. */
        if (sk->sk_state == TCP_LISTEN) {
                ret = tcp_v4_do_rcv(sk, skb);
                goto put_and_return;
        }

        sk_incoming_cpu_update(sk);

        bh_lock_sock_nested(sk);
        tcp_segs_in(tcp_sk(sk), skb);
        ret = 0;
        if (!sock_owned_by_user(sk)) {
                ret = tcp_v4_do_rcv(sk, skb);
        } else {
                /* The socket is owned by a user context: defer to the
                 * backlog. On failure tcp_add_backlog() has already
                 * unlocked the socket.
                 */
                if (tcp_add_backlog(sk, skb, &drop_reason))
                        goto discard_and_relse;
        }
        bh_unlock_sock(sk);

put_and_return:
        if (refcounted)
                sock_put(sk);

        return ret;

no_tcp_socket:
        drop_reason = SKB_DROP_REASON_NO_SOCKET;
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto discard_it;

        tcp_v4_fill_cb(skb, iph, th);

        /* No socket: count bad checksums as errors, otherwise send a reset.
         * csum_error and bad_packet are also jumped to from other paths.
         */
        if (tcp_checksum_complete(skb)) {
csum_error:
                drop_reason = SKB_DROP_REASON_TCP_CSUM;
                trace_tcp_bad_csum(skb);
                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
                __TCP_INC_STATS(net, TCP_MIB_INERRS);
        } else {
                tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
        }

discard_it:
        SKB_DR_OR(drop_reason, NOT_SPECIFIED);
        /* Discard frame. sk may still be NULL when we get here. */
        sk_skb_reason_drop(sk, skb, drop_reason);
        return 0;

discard_and_relse:
        /* Account the drop on sk, release our reference if any, then drop. */
        sk_drops_add(sk, skb);
        if (refcounted)
                sock_put(sk);
        goto discard_it;

do_time_wait:
        /* sk is a timewait socket; every exit path below that does not
         * jump to "process" releases it with inet_twsk_put() or
         * inet_twsk_deschedule_put(), or passes it to tcp_v4_timewait_ack().
         */
        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                drop_reason = SKB_DROP_REASON_XFRM_POLICY;
                inet_twsk_put(inet_twsk(sk));
                goto discard_it;
        }

        tcp_v4_fill_cb(skb, iph, th);

        if (tcp_checksum_complete(skb)) {
                inet_twsk_put(inet_twsk(sk));
                goto csum_error;
        }
        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
        case TCP_TW_SYN: {
                struct sock *sk2 = inet_lookup_listener(net,
                                                        net->ipv4.tcp_death_row.hashinfo,
                                                        skb, __tcp_hdrlen(th),
                                                        iph->saddr, th->source,
                                                        iph->daddr, th->dest,
                                                        inet_iif(skb),
                                                        sdif);
                if (sk2) {
                        /* A listener exists: retire the timewait socket and
                         * process the segment against the listener instead.
                         * refcounted is cleared, so no sock_put() is done
                         * for sk2 later.
                         */
                        inet_twsk_deschedule_put(inet_twsk(sk));
                        sk = sk2;
                        tcp_v4_restore_cb(skb);
                        refcounted = false;
                        __this_cpu_write(tcp_tw_isn, isn);
                        goto process;
                }
        }
                /* to ACK */
                fallthrough;
        case TCP_TW_ACK:
                tcp_v4_timewait_ack(sk, skb);
                break;
        case TCP_TW_RST:
                tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
                inet_twsk_deschedule_put(inet_twsk(sk));
                goto discard_it;
        case TCP_TW_SUCCESS:;
        }
        goto discard_it;
}

/* Time-wait socket operations for TCP: object size and destructor. */
static struct timewait_sock_ops tcp_timewait_sock_ops = {
        .twsk_obj_size   = sizeof(struct tcp_timewait_sock),
        .twsk_destructor = tcp_twsk_destructor,
};

/* Cache the input route of @skb on @sk, together with the interface index
 * the skb arrived on.
 *
 * Nothing is cached when the skb has no dst or when dst_hold_safe() fails.
 */
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        if (!dst || !dst_hold_safe(dst))
                return;

        rcu_assign_pointer(sk->sk_rx_dst, dst);
        sk->sk_rx_dst_ifindex = skb->skb_iif;
}
EXPORT_SYMBOL(inet_sk_rx_dst_set);

/* AF_INET connection-socket operations; tcp_v4_init_sock() installs this
 * table as icsk_af_ops.
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
        .queue_xmit     = ip_queue_xmit,
        .send_check     = tcp_v4_send_check,
        .rebuild_header = inet_sk_rebuild_header,
        .sk_rx_dst_set  = inet_sk_rx_dst_set,
        .conn_request   = tcp_v4_conn_request,
        .syn_recv_sock  = tcp_v4_syn_recv_sock,
        .net_header_len = sizeof(struct iphdr),
        .setsockopt     = ip_setsockopt,
        .getsockopt     = ip_getsockopt,
        .addr2sockaddr  = inet_csk_addr2sockaddr,
        .sockaddr_len   = sizeof(struct sockaddr_in),
        .mtu_reduced    = tcp_v4_mtu_reduced,
};
EXPORT_SYMBOL(ipv4_specific);

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/* IPv4 hooks for TCP-MD5 and TCP-AO; tcp_v4_init_sock() installs this table
 * as tcp_sk(sk)->af_specific. Each group of hooks exists only when the
 * matching config option is enabled.
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
#ifdef CONFIG_TCP_MD5SIG
        .md5_lookup     = tcp_v4_md5_lookup,
        .calc_md5_hash  = tcp_v4_md5_hash_skb,
        .md5_parse      = tcp_v4_parse_md5_keys,
#endif
#ifdef CONFIG_TCP_AO
        .ao_lookup      = tcp_v4_ao_lookup,
        .calc_ao_hash   = tcp_v4_ao_hash_skb,
        .ao_parse       = tcp_v4_parse_ao,
        .ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
#endif
};
#endif

/* NOTE: Many fields are already set to zero by the call to
 *       sk_alloc(), so they need not be initialized here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        tcp_init_sock(sk);

        icsk->icsk_af_ops = &ipv4_specific;

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

        return 0;
}

#ifdef CONFIG_TCP_MD5SIG
/* RCU callback queued by tcp_v4_destroy_sock(): free the tcp_md5sig_info
 * that embeds @head, then drop the tcp_md5_needed static-branch count and
 * release the MD5 sigpool.
 */
static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct tcp_md5sig_info, rcu));
        static_branch_slow_dec_deferred(&tcp_md5_needed);
        tcp_md5_release_sigpool();
}
#endif

/* Tear down the TCP state attached to @sk.
 *
 * In order: emits the destroy tracepoint, clears the transmit timers,
 * cleans up congestion control and ULP, purges the write queue and the
 * out-of-order queue, releases MD5 and AO keys, releases the bound port,
 * frees fastopen and saved-SYN state, and finally decrements the
 * allocated-sockets counter.
 */
void tcp_v4_destroy_sock(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        trace_tcp_destroy_sock(sk);

        tcp_clear_xmit_timers(sk);

        tcp_cleanup_congestion_control(sk);

        tcp_cleanup_ulp(sk);

        /* Clean up the write buffer. */
        tcp_write_queue_purge(sk);

        /* Check if we want to disable active TFO */
        tcp_fastopen_active_disable_ofo_check(sk);

        /* Cleans up our, hopefully empty, out_of_order_queue. */
        skb_rbtree_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
        /* Clean up the MD5 key list, if any */
        if (tp->md5sig_info) {
                struct tcp_md5sig_info *md5sig;

                md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
                tcp_clear_md5_list(sk);
                /* The info struct itself is freed from an RCU callback;
                 * the socket's pointer to it is cleared right away.
                 */
                call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
                rcu_assign_pointer(tp->md5sig_info, NULL);
        }
#endif
        tcp_ao_destroy_sock(sk, false);

        /* Clean up a referenced TCP bind bucket. */
        if (inet_csk(sk)->icsk_bind_hash)
                inet_put_port(sk);

        /* A fastopen request sock must not still be attached here. */
        BUG_ON(rcu_access_pointer(tp->fastopen_rsk));

        /* If socket is aborted during connect operation */
        tcp_free_fastopen_req(tp);
        tcp_fastopen_destroy_cipher(sk);
        tcp_saved_syn_free(tp);

        sk_sockets_allocated_dec(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

/* Forward declaration: seq_sk_match() below calls this. */
static unsigned short seq_file_family(const struct seq_file *seq);

static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
{
        unsigned short family = seq_file_family(seq);

        /* AF_UNSPEC is used as a match all */
        return ((family == AF_UNSPEC || family == sk->sk_family) &&
                net_eq(sock_net(sk), seq_file_net(seq)));
}

/* Find a non empty bucket (starting from st->bucket)
 * and return the first sk from it.
 */
static void *listening_get_first(struct seq_file *seq)
{
        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
        struct tcp_iter_state *st = seq->private;

        /* A new bucket walk starts at offset 0. st->bucket is advanced in
         * place, so on return it names the bucket the socket was found in.
         */
        st->offset = 0;
        for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
                struct inet_listen_hashbucket *ilb2;
                struct hlist_nulls_node *node;
                struct sock *sk;

                ilb2 = &hinfo->lhash2[st->bucket];
                /* Skip empty buckets without taking their lock. */
                if (hlist_nulls_empty(&ilb2->nulls_head))
                        continue;

                spin_lock(&ilb2->lock);
                sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
                        /* Returns with ilb2->lock still held; the unlock is
                         * done in listening_get_next() once the chain is
                         * exhausted.
                         */
                        if (seq_sk_match(seq, sk))
                                return sk;
                }
                spin_unlock(&ilb2->lock);
        }

        /* No matching socket in any remaining bucket; no lock is held. */
        return NULL;
}

/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
 * If "cur" is the last one in the st->bucket,
 * call listening_get_first() to return the first sk of the next
 * non empty bucket.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
        struct tcp_iter_state *st = seq->private;
        struct hlist_nulls_node *node;
        struct inet_hashinfo *hinfo;
        struct sock *sk;

        ++st->num;
        ++st->offset;

        /* Resume the walk right after "cur" in the current chain. */
        sk = sk_nulls_next(cur);
        sk_nulls_for_each_from(sk, node) {
                if (seq_sk_match(seq, sk))
                        return sk;
        }

        /* Chain exhausted: drop the lock of the bucket we were walking and
         * continue with the following buckets.
         */
        hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
        spin_unlock(&hinfo->lhash2[st->bucket].lock);
        ++st->bucket;
        return listening_get_first(seq);
}

/* Return the listening socket at index *@pos, counted from the first
 * bucket, or NULL if there are fewer matching listeners.
 *
 * *@pos is decremented once per socket stepped over, so on a NULL return
 * it holds the index still to be skipped.
 */
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
        struct tcp_iter_state *st = seq->private;
        void *sk;

        st->bucket = 0;
        st->offset = 0;

        for (sk = listening_get_first(seq); sk && *pos; --*pos)
                sk = listening_get_next(seq, sk);

        return sk;
}

static inline bool empty_bucket(struct inet_hashinfo *hinfo,
                                const struct tcp_iter_state *st)
{
        return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
}

/*
 * Get first established socket starting from bucket given in st->bucket.
 * If st->bucket is zero, the very first socket in the hash is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
        struct tcp_iter_state *st = seq->private;

        /* A new bucket walk starts at offset 0. st->bucket is advanced in
         * place, so on return it names the bucket the socket was found in.
         */
        st->offset = 0;
        for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
                struct sock *sk;
                struct hlist_nulls_node *node;
                spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);

                /* No bucket lock is held at this point. */
                cond_resched();

                /* Lockless fast path for the common case of empty buckets */
                if (empty_bucket(hinfo, st))
                        continue;

                spin_lock_bh(lock);
                sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
                        /* Returns with the bucket lock still held; the
                         * unlock is done in established_get_next() once the
                         * chain is exhausted.
                         */
                        if (seq_sk_match(seq, sk))
                                return sk;
                }
                spin_unlock_bh(lock);
        }

        /* No matching socket in any remaining bucket; no lock is held. */
        return NULL;
}

/* Return the next matching socket after "cur" in the established hash.
 *
 * Walks the rest of the current chain first. When it is exhausted, the
 * lock of st->bucket is released and the search continues from the next
 * bucket through established_get_first().
 */
static void *established_get_next(struct seq_file *seq, void *cur)
{
        struct tcp_iter_state *st = seq->private;
        struct hlist_nulls_node *node;
        struct inet_hashinfo *hinfo;
        struct sock *sk;

        ++st->num;
        ++st->offset;

        sk = sk_nulls_next(cur);
        sk_nulls_for_each_from(sk, node) {
                if (seq_sk_match(seq, sk))
                        return sk;
        }

        hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
        spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
        ++st->bucket;
        return established_get_first(seq);
}

/* Return the established-hash socket at index @pos, counted from the first
 * bucket, or NULL if there are fewer matching sockets.
 */
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
        struct tcp_iter_state *st = seq->private;
        void *sk;

        st->bucket = 0;

        for (sk = established_get_first(seq); sk && pos; --pos)
                sk = established_get_next(seq, sk);

        return sk;
}

/* Return the socket at overall index @pos: listening sockets come first,
 * followed by the sockets of the established hash.
 *
 * st->state records which of the two tables the result came from.
 */
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
        struct tcp_iter_state *st = seq->private;
        void *sk;

        st->state = TCP_SEQ_STATE_LISTENING;
        sk = listening_get_idx(seq, &pos);
        if (sk)
                return sk;

        /* listening_get_idx() reduced pos by the listeners it stepped over. */
        st->state = TCP_SEQ_STATE_ESTABLISHED;
        return established_get_idx(seq, pos);
}

/* Try to resume the walk at the position remembered in the iterator state
 * (st->state, st->bucket, st->offset) instead of restarting from scratch.
 *
 * Returns the socket found there, or NULL when nothing could be found.
 * st->num is left at the value it had on entry.
 */
static void *tcp_seek_last_pos(struct seq_file *seq)
{
        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
        struct tcp_iter_state *st = seq->private;
        /* Snapshot the saved position: the get_first/get_next helpers
         * called below update the iterator state while stepping.
         */
        int bucket = st->bucket;
        int offset = st->offset;
        int orig_num = st->num;
        void *rc = NULL;

        switch (st->state) {
        case TCP_SEQ_STATE_LISTENING:
                if (st->bucket > hinfo->lhash2_mask)
                        break;
                rc = listening_get_first(seq);
                /* Skip the entries already consumed in the saved bucket;
                 * stop as soon as the walk leaves that bucket.
                 */
                while (offset-- && rc && bucket == st->bucket)
                        rc = listening_get_next(seq, rc);
                if (rc)
                        break;
                /* No listener left: continue with the established table. */
                st->bucket = 0;
                st->state = TCP_SEQ_STATE_ESTABLISHED;
                fallthrough;
        case TCP_SEQ_STATE_ESTABLISHED:
                if (st->bucket > hinfo->ehash_mask)
                        break;
                rc = established_get_first(seq);
                while (offset-- && rc && bucket == st->bucket)
                        rc = established_get_next(seq, rc);
        }

        /* Undo the st->num increments made while skipping forward. */
        st->num = orig_num;

        return rc;
}

/* seq_file ->start() handler.
 *
 * When *pos is non-zero and equals the position recorded by the previous
 * call, the walk first tries to resume from the saved state.  Failing
 * that, the iterator state is reset and the entry is looked up by index;
 * position 0 yields SEQ_START_TOKEN.
 */
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct tcp_iter_state *st = seq->private;
        void *rc = NULL;

        if (*pos && *pos == st->last_pos)
                rc = tcp_seek_last_pos(seq);

        if (!rc) {
                /* Resume not attempted or not possible: restart. */
                st->state = TCP_SEQ_STATE_LISTENING;
                st->num = 0;
                st->bucket = 0;
                st->offset = 0;
                if (*pos)
                        rc = tcp_get_idx(seq, *pos - 1);
                else
                        rc = SEQ_START_TOKEN;
        }

        st->last_pos = *pos;
        return rc;
}
EXPORT_SYMBOL(tcp_seq_start);

/* seq_file ->next() handler: return the entry following @v, or NULL at
 * the end of the walk.  *pos is always advanced and mirrored into
 * st->last_pos.
 */
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct tcp_iter_state *st = seq->private;
        void *rc = NULL;

        if (v == SEQ_START_TOKEN) {
                /* The header has been shown; fetch the first real entry. */
                rc = tcp_get_idx(seq, 0);
        } else if (st->state == TCP_SEQ_STATE_LISTENING) {
                rc = listening_get_next(seq, v);
                if (!rc) {
                        /* Listeners exhausted: switch to the ehash table. */
                        st->state = TCP_SEQ_STATE_ESTABLISHED;
                        st->bucket = 0;
                        st->offset = 0;
                        rc = established_get_first(seq);
                }
        } else if (st->state == TCP_SEQ_STATE_ESTABLISHED) {
                rc = established_get_next(seq, v);
        }

        ++*pos;
        st->last_pos = *pos;
        return rc;
}
EXPORT_SYMBOL(tcp_seq_next);

/* seq_file ->stop() handler: release the bucket lock that goes with the
 * current iterator state.
 *
 * In the listening state the lhash2 bucket lock is dropped unless @v is
 * SEQ_START_TOKEN; in the established state the ehash bucket lock is
 * dropped only when @v is non-NULL.
 */
void tcp_seq_stop(struct seq_file *seq, void *v)
{
        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
        struct tcp_iter_state *st = seq->private;

        if (st->state == TCP_SEQ_STATE_LISTENING && v != SEQ_START_TOKEN)
                spin_unlock(&hinfo->lhash2[st->bucket].lock);
        else if (st->state == TCP_SEQ_STATE_ESTABLISHED && v)
                spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
}
EXPORT_SYMBOL(tcp_seq_stop);

/* Emit one /proc line describing a request socket (printed with state
 * TCP_SYN_RECV).
 *
 * @req: request socket to describe
 * @f:   seq_file receiving the line
 * @i:   value printed in the leading "sl" column
 *
 * The uid shown is that of the listener the request belongs to.
 */
static void get_openreq4(const struct request_sock *req,
                         struct seq_file *f, int i)
{
        const struct inet_request_sock *ireq = inet_rsk(req);
        /* Jiffies left until the request timer expires. */
        long delta = req->rsk_timer.expires - jiffies;

        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
                i,
                ireq->ir_loc_addr,
                ireq->ir_num,
                ireq->ir_rmt_addr,
                ntohs(ireq->ir_rmt_port),
                TCP_SYN_RECV,
                0, 0, /* could print option size, but that is af dependent. */
                1,    /* timers active (only the expire timer) */
                jiffies_delta_to_clock_t(delta),
                req->num_timeout,
                from_kuid_munged(seq_user_ns(f),
                                 sock_i_uid(req->rsk_listener)),
                0,  /* non standard timer */
                0, /* open_requests have no inode */
                0,
                req);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
{
        int timer_active;
        unsigned long timer_expires;
        const struct tcp_sock *tp = tcp_sk(sk);
        const struct inet_connection_sock *icsk = inet_csk(sk);
        const struct inet_sock *inet = inet_sk(sk);
        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
        __be32 dest = inet->inet_daddr;
        __be32 src = inet->inet_rcv_saddr;
        __u16 destp = ntohs(inet->inet_dport);
        __u16 srcp = ntohs(inet->inet_sport);
        int rx_queue;
        int state;

        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
                timer_active        = 1;
                timer_expires        = icsk->icsk_timeout;
        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
                timer_active        = 4;
                timer_expires        = icsk->icsk_timeout;
        } else if (timer_pending(&sk->sk_timer)) {
                timer_active        = 2;
                timer_expires        = sk->sk_timer.expires;
        } else {
                timer_active        = 0;
                timer_expires = jiffies;
        }

        state = inet_sk_state_load(sk);
        if (state == TCP_LISTEN)
                rx_queue = READ_ONCE(sk->sk_ack_backlog);
        else
                /* Because we don't lock the socket,
                 * we might find a transient negative value.
                 */
                rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
                                      READ_ONCE(tp->copied_seq), 0);

        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
                        "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
                i, src, srcp, dest, destp, state,
                READ_ONCE(tp->write_seq) - tp->snd_una,
                rx_queue,
                timer_active,
                jiffies_delta_to_clock_t(timer_expires - jiffies),
                icsk->icsk_retransmits,
                from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
                icsk->icsk_probes_out,
                sock_i_ino(sk),
                refcount_read(&sk->sk_refcnt), sk,
                jiffies_to_clock_t(icsk->icsk_rto),
                jiffies_to_clock_t(icsk->icsk_ack.ato),
                (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
                tcp_snd_cwnd(tp),
                state == TCP_LISTEN ?
                    fastopenq->max_qlen :
                    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}

/* Emit one /proc line describing a TIME_WAIT socket.
 *
 * @tw: timewait socket to describe
 * @f:  seq_file receiving the line
 * @i:  value printed in the leading "sl" column
 *
 * Columns that have no meaning for a timewait socket are printed as 0.
 */
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
                               struct seq_file *f, int i)
{
        const long remaining = tw->tw_timer.expires - jiffies;
        const __be32 src = tw->tw_rcv_saddr;
        const __be32 dest = tw->tw_daddr;
        const __u16 srcp = ntohs(tw->tw_sport);
        const __u16 destp = ntohs(tw->tw_dport);

        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
                i,
                src, srcp,
                dest, destp,
                tw->tw_substate,
                0, 0,
                3, jiffies_delta_to_clock_t(remaining),
                0,
                0,
                0,
                0,
                refcount_read(&tw->tw_refcnt),
                tw);
}

#define TMPSZ 150

/* seq_file ->show() handler for the IPv4 TCP proc file.
 *
 * Prints the column header for SEQ_START_TOKEN; otherwise picks the
 * formatter that matches the socket flavour (timewait, request or full
 * socket).  Each record is padded to TMPSZ - 1 columns and terminated by
 * a newline.  Always returns 0.
 */
static int tcp4_seq_show(struct seq_file *seq, void *v)
{
        struct sock *sk = v;

        seq_setwidth(seq, TMPSZ - 1);

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
                           "rx_queue tr tm->when retrnsmt   uid  timeout "
                           "inode");
        } else {
                struct tcp_iter_state *st = seq->private;

                if (sk->sk_state == TCP_TIME_WAIT)
                        get_timewait4_sock(v, seq, st->num);
                else if (sk->sk_state == TCP_NEW_SYN_RECV)
                        get_openreq4(v, seq, st->num);
                else
                        get_tcp4_sock(v, seq, st->num);
        }

        seq_pad(seq, '\n');
        return 0;
}

#ifdef CONFIG_BPF_SYSCALL
/* Private seq_file state of the bpf tcp iterator: the regular proc
 * iterator state plus a batch of referenced sockets taken from one bucket.
 */
struct bpf_tcp_iter_state {
        struct tcp_iter_state state;    /* shared with the proc walker helpers */
        unsigned int cur_sk;            /* index in batch[] of the socket being shown */
        unsigned int end_sk;            /* number of sockets stored in batch[] */
        unsigned int max_sk;            /* capacity of batch[] */
        struct sock **batch;            /* sockets held with a reference */
        bool st_bucket_done;            /* batch[] captured every match of the bucket */
};

/* Context passed to the bpf program for each iterated socket; filled in
 * by tcp_prog_seq_show().
 */
struct bpf_iter__tcp {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        __bpf_md_ptr(struct sock_common *, sk_common);
        uid_t uid __aligned(8);
};

/* Build the iterator context for one socket and run @prog on it.
 * Returns whatever bpf_iter_run_prog() returns.
 */
static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
                             struct sock_common *sk_common, uid_t uid)
{
        struct bpf_iter__tcp ctx;

        ctx.uid = uid;
        ctx.sk_common = sk_common;
        ctx.meta = meta;

        /* skip SEQ_START_TOKEN */
        meta->seq_num--;

        return bpf_iter_run_prog(prog, &ctx);
}

/* Drop the reference on every socket of the batch that has not been
 * consumed yet, i.e. indexes [cur_sk, end_sk).  cur_sk ends up equal to
 * end_sk when anything was released.
 */
static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
        while (iter->cur_sk < iter->end_sk) {
                struct sock *sk = iter->batch[iter->cur_sk];

                iter->cur_sk++;
                sock_gen_put(sk);
        }
}

/* Replace the batch array with a freshly allocated one of @new_batch_sz
 * slots.
 *
 * On success any sockets still held in the old batch are released, the
 * old array is freed and max_sk is updated.  On allocation failure the
 * iterator is left untouched.
 *
 * Returns 0 on success, -ENOMEM otherwise.
 */
static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
                                      unsigned int new_batch_sz)
{
        struct sock **new_batch;

        /* kvmalloc_array() rejects a count * size product that would
         * overflow instead of silently allocating a short buffer.
         */
        new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
                                   GFP_USER | __GFP_NOWARN);
        if (!new_batch)
                return -ENOMEM;

        bpf_iter_tcp_put_batch(iter);
        kvfree(iter->batch);
        iter->batch = new_batch;
        iter->max_sk = new_batch_sz;

        return 0;
}

/* Collect into iter->batch the matching listening sockets of the current
 * lhash2 bucket, beginning with @start_sk.
 *
 * A reference is taken on every socket stored.  Sockets that do not fit
 * (end_sk reached max_sk) are still counted, so the caller can compare the
 * return value with iter->end_sk to detect a truncated batch.  The lhash2
 * bucket lock is released before returning.
 * NOTE(review): the lock is presumably taken by the listening_get_*()
 * helpers, which are not visible here - confirm.
 *
 * Returns the number of matching sockets seen from @start_sk onwards.
 */
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
                                                 struct sock *start_sk)
{
        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
        struct bpf_tcp_iter_state *iter = seq->private;
        struct tcp_iter_state *st = &iter->state;
        struct hlist_nulls_node *node;
        unsigned int expected = 1;
        struct sock *sk;

        sock_hold(start_sk);
        iter->batch[iter->end_sk++] = start_sk;

        sk = sk_nulls_next(start_sk);
        sk_nulls_for_each_from(sk, node) {
                if (seq_sk_match(seq, sk)) {
                        if (iter->end_sk < iter->max_sk) {
                                sock_hold(sk);
                                iter->batch[iter->end_sk++] = sk;
                        }
                        /* Counted even when the batch is already full. */
                        expected++;
                }
        }
        spin_unlock(&hinfo->lhash2[st->bucket].lock);

        return expected;
}

/* Collect into iter->batch the matching sockets of the current ehash
 * bucket, beginning with @start_sk.
 *
 * A reference is taken on every socket stored.  Sockets that do not fit
 * (end_sk reached max_sk) are still counted, so the caller can compare the
 * return value with iter->end_sk to detect a truncated batch.  The ehash
 * bucket lock is released before returning.
 *
 * Returns the number of matching sockets seen from @start_sk onwards.
 */
static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
                                                   struct sock *start_sk)
{
        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
        struct bpf_tcp_iter_state *iter = seq->private;
        struct tcp_iter_state *st = &iter->state;
        struct hlist_nulls_node *node;
        unsigned int expected = 1;
        struct sock *sk;

        sock_hold(start_sk);
        iter->batch[iter->end_sk++] = start_sk;

        sk = sk_nulls_next(start_sk);
        sk_nulls_for_each_from(sk, node) {
                if (seq_sk_match(seq, sk)) {
                        if (iter->end_sk < iter->max_sk) {
                                sock_hold(sk);
                                iter->batch[iter->end_sk++] = sk;
                        }
                        /* Counted even when the batch is already full. */
                        expected++;
                }
        }
        spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));

        return expected;
}

/* Fill iter->batch with the sockets of the next bucket to iterate.
 *
 * Returns the first socket of the new batch, or NULL when the walk is
 * over.  If the bucket holds more matching sockets than the batch can
 * store, the batch is reallocated (to 1.5 times the observed count) and
 * the bucket is read again, once.  When that is not enough either, or the
 * reallocation fails, a partial batch is returned with st_bucket_done
 * left false.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
        struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
        struct bpf_tcp_iter_state *iter = seq->private;
        struct tcp_iter_state *st = &iter->state;
        unsigned int expected;
        bool resized = false;
        struct sock *sk;

        /* The st->bucket is done.  Directly advance to the next
         * bucket instead of having the tcp_seek_last_pos() to skip
         * one by one in the current bucket and eventually find out
         * it has to advance to the next bucket.
         */
        if (iter->st_bucket_done) {
                st->offset = 0;
                st->bucket++;
                if (st->state == TCP_SEQ_STATE_LISTENING &&
                    st->bucket > hinfo->lhash2_mask) {
                        st->state = TCP_SEQ_STATE_ESTABLISHED;
                        st->bucket = 0;
                }
        }

again:
        /* Get a new batch */
        iter->cur_sk = 0;
        iter->end_sk = 0;
        iter->st_bucket_done = false;

        sk = tcp_seek_last_pos(seq);
        if (!sk)
                return NULL; /* Done */

        if (st->state == TCP_SEQ_STATE_LISTENING)
                expected = bpf_iter_tcp_listening_batch(seq, sk);
        else
                expected = bpf_iter_tcp_established_batch(seq, sk);

        /* Every matching socket of the bucket made it into the batch. */
        if (iter->end_sk == expected) {
                iter->st_bucket_done = true;
                return sk;
        }

        /* Batch too small: grow it and retry, but only once. */
        if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
                resized = true;
                goto again;
        }

        return sk;
}

/* seq_file ->start() handler of the bpf tcp iterator. */
static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (!*pos)
                return SEQ_START_TOKEN;

        /* bpf iter does not support lseek, so a non-zero position always
         * means: continue from where the walk was stop()-ped.
         */
        return bpf_iter_tcp_batch(seq);
}

/* seq_file ->next() handler of the bpf tcp iterator: release the socket
 * that was just shown and return the next one, fetching a new batch when
 * the current one is used up.
 */
static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct bpf_tcp_iter_state *iter = seq->private;
        struct tcp_iter_state *st = &iter->state;
        struct sock *next_sk;

        /* seq_show() is finished with batch[cur_sk] by the time next()
         * runs, so step past it.
         */
        if (iter->cur_sk < iter->end_sk) {
                struct sock *shown = iter->batch[iter->cur_sk];

                /* bpf_iter_tcp relies on meta.seq_num rather than
                 * st->num; it is bumped only to keep tcp_iter_state
                 * coherent.
                 */
                st->num++;
                /* st->offset is what a later start() uses to resume
                 * inside st->bucket.  See tcp_seek_last_pos().
                 */
                st->offset++;
                iter->cur_sk++;
                sock_gen_put(shown);
        }

        next_sk = iter->cur_sk < iter->end_sk ?
                        iter->batch[iter->cur_sk] :
                        bpf_iter_tcp_batch(seq);

        ++*pos;
        /* No lseek with bpf iter, hence st->last_pos always tracks *pos;
         * kept up to date for consistency of tcp_iter_state.
         */
        st->last_pos = *pos;
        return next_sk;
}

/* seq_file ->show() handler of the bpf tcp iterator.
 *
 * Full sockets are locked around the program run.  A socket that is no
 * longer hashed yields SEQ_SKIP; otherwise the uid is resolved (0 for
 * timewait, the listener's uid for request sockets) and the bpf program
 * is run, its result being returned.
 */
static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
{
        struct sock *sk = v;
        int ret;

        if (v == SEQ_START_TOKEN)
                return 0;

        if (sk_fullsock(sk))
                lock_sock(sk);

        if (unlikely(sk_unhashed(sk))) {
                ret = SEQ_SKIP;
        } else {
                struct bpf_iter_meta meta;
                struct bpf_prog *prog;
                uid_t uid;

                if (sk->sk_state == TCP_TIME_WAIT) {
                        uid = 0;
                } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
                        const struct request_sock *req = v;

                        uid = from_kuid_munged(seq_user_ns(seq),
                                               sock_i_uid(req->rsk_listener));
                } else {
                        uid = from_kuid_munged(seq_user_ns(seq),
                                               sock_i_uid(sk));
                }

                meta.seq = seq;
                prog = bpf_iter_get_info(&meta, false);
                ret = tcp_prog_seq_show(prog, &meta, v, uid);
        }

        if (sk_fullsock(sk))
                release_sock(sk);
        return ret;
}

/* seq_file ->stop() handler of the bpf tcp iterator.
 *
 * With a NULL @v the bpf program, if one is returned by
 * bpf_iter_get_info(), is run one more time with a NULL socket.  Any
 * sockets still referenced by the batch are then released and the bucket
 * is flagged as not done.
 */
static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
{
        struct bpf_tcp_iter_state *iter = seq->private;

        if (!v) {
                struct bpf_iter_meta meta;
                struct bpf_prog *prog;

                meta.seq = seq;
                prog = bpf_iter_get_info(&meta, true);
                if (prog)
                        (void)tcp_prog_seq_show(prog, &meta, v, 0);
        }

        if (iter->cur_sk >= iter->end_sk)
                return;

        bpf_iter_tcp_put_batch(iter);
        iter->st_bucket_done = false;
}

/* seq_file callbacks of the bpf tcp iterator. */
static const struct seq_operations bpf_iter_tcp_seq_ops = {
        .start          = bpf_iter_tcp_seq_start,
        .next           = bpf_iter_tcp_seq_next,
        .stop           = bpf_iter_tcp_seq_stop,
        .show           = bpf_iter_tcp_seq_show,
};
#endif
/* Address family that the walk behind @seq is restricted to.
 *
 * AF_UNSPEC is returned for the bpf iterator, where filtering is left to
 * the bpf program; for a proc file the family comes from the
 * tcp_seq_afinfo attached to the proc entry.
 */
static unsigned short seq_file_family(const struct seq_file *seq)
{
        const struct tcp_seq_afinfo *afinfo;
        struct inode *inode;

#ifdef CONFIG_BPF_SYSCALL
        /* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
        if (seq->op == &bpf_iter_tcp_seq_ops)
                return AF_UNSPEC;
#endif

        /* Iterated from proc fs */
        inode = file_inode(seq->file);
        afinfo = pde_data(inode);

        return afinfo->family;
}

/* seq_file callbacks backing the IPv4 "tcp" proc entry. */
static const struct seq_operations tcp4_seq_ops = {
        .start          = tcp_seq_start,
        .next           = tcp_seq_next,
        .stop           = tcp_seq_stop,
        .show           = tcp4_seq_show,
};

/* Per-entry data attached to the IPv4 "tcp" proc file; read back by
 * seq_file_family().
 */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
        .family                = AF_INET,
};

/* Create the read-only "tcp" entry in @net's proc_net directory.
 * Returns 0 on success or -ENOMEM when the entry cannot be created.
 */
static int __net_init tcp4_proc_init_net(struct net *net)
{
        struct proc_dir_entry *entry;

        entry = proc_create_net_data("tcp", 0444, net->proc_net,
                                     &tcp4_seq_ops,
                                     sizeof(struct tcp_iter_state),
                                     &tcp4_seq_afinfo);

        return entry ? 0 : -ENOMEM;
}

/* Remove the "tcp" entry from @net's proc_net directory. */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
        remove_proc_entry("tcp", net->proc_net);
}

/* Per-namespace hooks creating and removing the "tcp" proc entry. */
static struct pernet_operations tcp4_net_ops = {
        .init = tcp4_proc_init_net,
        .exit = tcp4_proc_exit_net,
};

/* Register tcp4_net_ops as a pernet subsystem.  Returns the result of
 * register_pernet_subsys().
 */
int __init tcp4_proc_init(void)
{
        return register_pernet_subsys(&tcp4_net_ops);
}

/* Unregister the pernet subsystem registered by tcp4_proc_init(). */
void tcp4_proc_exit(void)
{
        unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

/* Report whether the amount of written-but-not-yet-sent data is below the
 * socket's notsent low-water mark.
 *
 * @wake is one when sk_stream_write_space() calls us.  The unsent byte
 * count is then doubled before the comparison, so EPOLLOUT is only sent
 * once notsent_bytes is below half the limit.  This mimics the strategy
 * used in sock_def_write_space().
 */
bool tcp_stream_memory_free(const struct sock *sk, int wake)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        u32 queued_seq = READ_ONCE(tp->write_seq);
        u32 sent_seq = READ_ONCE(tp->snd_nxt);
        u32 notsent_bytes = queued_seq - sent_seq;

        return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
}
EXPORT_SYMBOL(tcp_stream_memory_free);

/* Protocol descriptor for TCP over IPv4: the socket-layer callbacks,
 * memory accounting hooks and object sizing used for TCP sockets.
 */
struct proto tcp_prot = {
        .name                        = "TCP",
        .owner                        = THIS_MODULE,
        .close                        = tcp_close,
        .pre_connect                = tcp_v4_pre_connect,
        .connect                = tcp_v4_connect,
        .disconnect                = tcp_disconnect,
        .accept                        = inet_csk_accept,
        .ioctl                        = tcp_ioctl,
        .init                        = tcp_v4_init_sock,
        .destroy                = tcp_v4_destroy_sock,
        .shutdown                = tcp_shutdown,
        .setsockopt                = tcp_setsockopt,
        .getsockopt                = tcp_getsockopt,
        .bpf_bypass_getsockopt        = tcp_bpf_bypass_getsockopt,
        .keepalive                = tcp_set_keepalive,
        .recvmsg                = tcp_recvmsg,
        .sendmsg                = tcp_sendmsg,
        .splice_eof                = tcp_splice_eof,
        .backlog_rcv                = tcp_v4_do_rcv,
        .release_cb                = tcp_release_cb,
        .hash                        = inet_hash,
        .unhash                        = inet_unhash,
        .get_port                = inet_csk_get_port,
        .put_port                = inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
        .psock_update_sk_prot        = tcp_bpf_update_proto,
#endif
        .enter_memory_pressure        = tcp_enter_memory_pressure,
        .leave_memory_pressure        = tcp_leave_memory_pressure,
        .stream_memory_free        = tcp_stream_memory_free,
        .sockets_allocated        = &tcp_sockets_allocated,
        .orphan_count                = &tcp_orphan_count,

        .memory_allocated        = &tcp_memory_allocated,
        .per_cpu_fw_alloc        = &tcp_memory_per_cpu_fw_alloc,

        .memory_pressure        = &tcp_memory_pressure,
        .sysctl_mem                = sysctl_tcp_mem,
        /* rmem/wmem limits live in struct net, hence offsets, not pointers. */
        .sysctl_wmem_offset        = offsetof(struct net, ipv4.sysctl_tcp_wmem),
        .sysctl_rmem_offset        = offsetof(struct net, ipv4.sysctl_tcp_rmem),
        .max_header                = MAX_TCP_HEADER,
        .obj_size                = sizeof(struct tcp_sock),
        .slab_flags                = SLAB_TYPESAFE_BY_RCU,
        .twsk_prot                = &tcp_timewait_sock_ops,
        .rsk_prot                = &tcp_request_sock_ops,
        .h.hashinfo                = NULL,
        .no_autobind                = true,
        .diag_destroy                = tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);

/* Per-netns exit hook: drop the module reference held on the namespace's
 * congestion control, if one is set.
 */
static void __net_exit tcp_sk_exit(struct net *net)
{
        if (!net->ipv4.tcp_congestion_control)
                return;

        bpf_module_put(net->ipv4.tcp_congestion_control,
                       net->ipv4.tcp_congestion_control->owner);
}

/* Choose the hash tables used by @net and derive the limits that scale
 * with the ehash size.
 *
 * init_net always uses the global tcp_hashinfo.  Any other namespace gets
 * a private ehash when the creating task's namespace has a non-zero
 * sysctl_tcp_child_ehash_entries and the allocation succeeds; otherwise
 * it falls back to the global table as well.
 */
static void __net_init tcp_set_hashinfo(struct net *net)
{
        struct inet_hashinfo *hinfo = NULL;
        unsigned int ehash_entries = 0;

        if (!net_eq(net, &init_net)) {
                struct net *old_net = current->nsproxy->net_ns;

                ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
        }

        if (ehash_entries) {
                ehash_entries = roundup_pow_of_two(ehash_entries);
                hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
                if (!hinfo)
                        pr_warn("Failed to allocate TCP ehash (entries: %u) "
                                "for a netns, fallback to the global one\n",
                                ehash_entries);
        }

        if (!hinfo) {
                /* No private table: share the global one. */
                hinfo = &tcp_hashinfo;
                ehash_entries = tcp_hashinfo.ehash_mask + 1;
        }

        net->ipv4.tcp_death_row.hashinfo = hinfo;
        net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
        net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
}

/* Per-netns init hook: give every TCP sysctl of @net its default value,
 * select the hash tables and pick the initial congestion control.
 * Always returns 0.
 */
static int __net_init tcp_sk_init(struct net *net)
{
        net->ipv4.sysctl_tcp_ecn = 2;
        net->ipv4.sysctl_tcp_ecn_fallback = 1;

        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
        net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
        net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
        net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
        net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

        net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
        net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
        net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

        net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
        net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
        net->ipv4.sysctl_tcp_syncookies = 1;
        net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
        net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
        net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
        net->ipv4.sysctl_tcp_orphan_retries = 0;
        net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
        net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
        net->ipv4.sysctl_tcp_tw_reuse = 2;
        net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;

        /* Hash tables, plus the timewait and SYN backlog limits derived
         * from their size.
         */
        refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
        tcp_set_hashinfo(net);

        net->ipv4.sysctl_tcp_sack = 1;
        net->ipv4.sysctl_tcp_window_scaling = 1;
        net->ipv4.sysctl_tcp_timestamps = 1;
        net->ipv4.sysctl_tcp_early_retrans = 3;
        net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
        net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
        net->ipv4.sysctl_tcp_retrans_collapse = 1;
        net->ipv4.sysctl_tcp_max_reordering = 300;
        net->ipv4.sysctl_tcp_dsack = 1;
        net->ipv4.sysctl_tcp_app_win = 31;
        net->ipv4.sysctl_tcp_adv_win_scale = 1;
        net->ipv4.sysctl_tcp_frto = 2;
        net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
        /* This limits the percentage of the congestion window which we
         * will allow a single TSO frame to consume.  Building TSO frames
         * which are too large can cause TCP streams to be bursty.
         */
        net->ipv4.sysctl_tcp_tso_win_divisor = 3;
        /* Default TSQ limit of 16 TSO segments */
        net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;

        /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
        net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;

        net->ipv4.sysctl_tcp_min_tso_segs = 2;
        net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
        net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
        net->ipv4.sysctl_tcp_autocorking = 1;
        net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
        net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
        net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
        /* Child namespaces start from init_net's current rmem/wmem limits. */
        if (net != &init_net) {
                memcpy(net->ipv4.sysctl_tcp_rmem,
                       init_net.ipv4.sysctl_tcp_rmem,
                       sizeof(init_net.ipv4.sysctl_tcp_rmem));
                memcpy(net->ipv4.sysctl_tcp_wmem,
                       init_net.ipv4.sysctl_tcp_wmem,
                       sizeof(init_net.ipv4.sysctl_tcp_wmem));
        }
        net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
        net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
        net->ipv4.sysctl_tcp_comp_sack_nr = 44;
        net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
        net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
        net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
        atomic_set(&net->ipv4.tfo_active_disable_times, 0);

        /* Set default values for PLB */
        net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
        net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
        net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
        net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
        /* Default congestion threshold for PLB to mark a round is 50% */
        net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;

        /* A child namespace inherits init_net's congestion control when a
         * module reference can be taken on it; init_net itself, and any
         * namespace for which that fails, uses Reno.
         */
        /* Reno is always built in */
        if (!net_eq(net, &init_net) &&
            bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
                               init_net.ipv4.tcp_congestion_control->owner))
                net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
        else
                net->ipv4.tcp_congestion_control = &tcp_reno;

        net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
        net->ipv4.sysctl_tcp_shrink_window = 0;

        net->ipv4.sysctl_tcp_pingpong_thresh = 1;
        net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);

        return 0;
}

/* Batched per-netns exit hook.
 *
 * Timewait sockets of all exiting namespaces are purged first; then, for
 * each namespace, its hashinfo is handed to inet_pernet_hashinfo_free(),
 * the tw_refcount reference set up in tcp_sk_init() is dropped (warning
 * once if it was not the last one) and the fastopen context is destroyed.
 */
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
        struct net *net;

        tcp_twsk_purge(net_exit_list);

        list_for_each_entry(net, net_exit_list, exit_list) {
                inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
                WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
                tcp_fastopen_ctx_destroy(net);
        }
}

/* Per-namespace lifecycle hooks for TCP; registered from tcp_v4_init(). */
static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init           = tcp_sk_init,
       .exit           = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declare the "tcp" bpf iterator entry point; the argument list mirrors
 * the fields of struct bpf_iter__tcp.
 * NOTE(review): what exactly the macro expands to is not visible here.
 */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
                     struct sock_common *sk_common, uid_t uid)

#define INIT_BATCH_SZ 16

/* Set up the private state of a new bpf tcp iterator: the netns part
 * first, then a batch array of INIT_BATCH_SZ slots.  The netns part is
 * torn down again when the batch cannot be allocated.
 *
 * Returns 0 on success or the error of the step that failed.
 */
static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
{
        struct bpf_tcp_iter_state *iter = priv_data;
        int err;

        err = bpf_iter_init_seq_net(priv_data, aux);
        if (err)
                return err;

        err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
        if (err)
                bpf_iter_fini_seq_net(priv_data);

        return err;
}

/* Tear down the private state set up by bpf_iter_init_tcp(): the netns
 * part and the batch array.
 */
static void bpf_iter_fini_tcp(void *priv_data)
{
        struct bpf_tcp_iter_state *iter = priv_data;

        bpf_iter_fini_seq_net(priv_data);
        kvfree(iter->batch);
}

/* seq_file description of the bpf tcp iterator: callbacks, private state
 * constructor/destructor and private state size.
 */
static const struct bpf_iter_seq_info tcp_seq_info = {
        .seq_ops                = &bpf_iter_tcp_seq_ops,
        .init_seq_private        = bpf_iter_init_tcp,
        .fini_seq_private        = bpf_iter_fini_tcp,
        .seq_priv_size                = sizeof(struct bpf_tcp_iter_state),
};

/* Helper lookup for bpf tcp iterator programs: only the socket option
 * getter and setter are provided here; NULL is returned for every other
 * @func_id.
 */
static const struct bpf_func_proto *
bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
                            const struct bpf_prog *prog)
{
        if (func_id == BPF_FUNC_setsockopt)
                return &bpf_sk_setsockopt_proto;

        if (func_id == BPF_FUNC_getsockopt)
                return &bpf_sk_getsockopt_proto;

        return NULL;
}

/* Registration record of the "tcp" bpf iterator target.  The btf_id of
 * the single context argument is filled in by bpf_iter_register().
 */
static struct bpf_iter_reg tcp_reg_info = {
        .target                        = "tcp",
        .ctx_arg_info_size        = 1,
        .ctx_arg_info                = {
                /* sk_common may be NULL (see bpf_iter_tcp_seq_stop()). */
                { offsetof(struct bpf_iter__tcp, sk_common),
                  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
        },
        .get_func_proto                = bpf_iter_tcp_get_func_proto,
        .seq_info                = &tcp_seq_info,
};

/* Register the "tcp" bpf iterator target.  A failure is only logged. */
static void __init bpf_iter_register(void)
{
        int err;

        /* The BTF id is not a compile-time constant, so patch it in now. */
        tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];

        err = bpf_iter_reg_target(&tcp_reg_info);
        if (err)
                pr_warn("Warning: could not register bpf iterator tcp\n");
}

#endif

/* Boot-time initialisation of TCP over IPv4.
 *
 * Creates one control socket per possible CPU in init_net, registers the
 * per-namespace operations and, when configured, the bpf tcp iterator.
 * Any failure to create a control socket or to register the pernet
 * operations is fatal (panic).
 */
void __init tcp_v4_init(void)
{
        int cpu, res;

        for_each_possible_cpu(cpu) {
                struct sock *sk;

                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
                                           IPPROTO_TCP, &init_net);
                if (res)
                        panic("Failed to create the TCP control socket.\n");
                sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

                /* Please enforce IP_DF and IPID==0 for RST and
                 * ACK sent in SYN-RECV and TIME-WAIT state.
                 */
                inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

                sk->sk_clockid = CLOCK_MONOTONIC;

                per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
        }

        /* Report the real cause: this is not a control socket failure. */
        if (register_pernet_subsys(&tcp_sk_ops))
                panic("Failed to register the TCP pernet operations.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
        bpf_iter_register();
#endif
}


















































































    3 













































    3 




























    1 


















































































































































































































































































































































































































































    6 





















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_H
#define _LINUX_RCULIST_H

#ifdef __KERNEL__

/*
 * RCU-protected list version
 */
#include <linux/list.h>
#include <linux/rcupdate.h>

/*
 * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers
 * @list: list to be initialized
 *
 * You should instead use INIT_LIST_HEAD() for normal initialization and
 * cleanup tasks, when readers have no access to the list being initialized.
 * However, if the list being initialized is visible to readers, you
 * need to keep the compiler from being too mischievous.
 */
static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
{
        /*
         * Point both links back at the head itself (an empty list).  Each
         * store goes through WRITE_ONCE() because readers may be looking
         * at this head while it is being initialized.
         */
        WRITE_ONCE(list->next, list);
        WRITE_ONCE(list->prev, list);
}

/*
 * Yield the ->next field of @list as an lvalue of type
 * "struct list_head __rcu *".  The field must not be accessed directly
 * in RCU-protected code; go through this macro together with the RCU
 * accessors instead, e.g. rcu_assign_pointer(list_next_rcu(prev), new).
 */
#define list_next_rcu(list)        (*((struct list_head __rcu **)(&(list)->next)))

/**
 * list_tail_rcu - returns the prev pointer of the head of the list
 * @head: the head of the list
 *
 * Expands to the ->prev field of @head as an lvalue of type
 * "struct list_head __rcu *".
 *
 * Note: This should only be used with the list header, and even then
 * only if list_del() and similar primitives are not also used on the
 * list header.
 */
#define list_tail_rcu(head)        (*((struct list_head __rcu **)(&(head)->prev)))

/*
 * Check during list traversal that we are within an RCU reader
 */

/*
 * Takes exactly one parameter and expands to nothing.  Invoking it with
 * more than one argument is a preprocessor error, which is how the
 * variadic callers below are limited to at most one optional condition.
 */
#define check_arg_count_one(dummy)

#ifdef CONFIG_PROVE_RCU_LIST
/*
 * @cond is the caller's lockdep expression (the traversal macros pass 0
 * when none was given); @extra is whatever follows it and is only fed to
 * check_arg_count_one().  RCU_LOCKDEP_WARN() is handed a condition that is
 * true when @cond is false and rcu_read_lock_any_held() is false as well.
 */
#define __list_check_rcu(dummy, cond, extra...)                                \
        ({                                                                \
        check_arg_count_one(extra);                                        \
        RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(),                \
                         "RCU-list traversed in non-reader section!");        \
        })

/*
 * SRCU flavour: @cond is mandatory and RCU_LOCKDEP_WARN() is handed its
 * negation; there is no fallback check for an RCU read-side section.
 */
#define __list_check_srcu(cond)                                         \
        ({                                                                 \
        RCU_LOCKDEP_WARN(!(cond),                                         \
                "RCU-list traversed without holding the required lock!");\
        })
#else
/*
 * Without CONFIG_PROVE_RCU_LIST only the argument-count check remains and
 * @cond is not evaluated.
 */
#define __list_check_rcu(dummy, cond, extra...)                                \
        ({ check_arg_count_one(extra); })

#define __list_check_srcu(cond) ({ })
#endif

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static inline void __list_add_rcu(struct list_head *new,
                struct list_head *prev, struct list_head *next)
{
        /* Leave the list untouched if __list_add_valid() rejects the insertion. */
        if (!__list_add_valid(new, prev, next))
                return;

        /* Fill in both links of @new while it is still unreachable ... */
        new->next = next;
        new->prev = prev;
        /* ... then publish it through @prev's forward pointer. */
        rcu_assign_pointer(list_next_rcu(prev), new);
        /* The backward link of @next is updated last, with a plain store. */
        next->prev = new;
}

/**
 * list_add_rcu - add a new entry to rcu-protected list
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_add_rcu()
 * or list_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 */
static inline void list_add_rcu(struct list_head *new, struct list_head *head)
{
        /* The new entry goes between @head and its current first entry. */
        struct list_head *first = head->next;

        __list_add_rcu(new, head, first);
}

/**
 * list_add_tail_rcu - add a new entry to rcu-protected list
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Insert a new entry before the specified head.
 * This is useful for implementing queues.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_add_tail_rcu()
 * or list_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 */
static inline void list_add_tail_rcu(struct list_head *new,
                                        struct list_head *head)
{
        /* The new entry goes between the current last entry and @head. */
        struct list_head *last = head->prev;

        __list_add_rcu(new, last, head);
}

/**
 * list_del_rcu - deletes entry from list without re-initialization
 * @entry: the element to delete from the list.
 *
 * Note: list_empty() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_del_rcu()
 * or list_add_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 *
 * Note that the caller is not permitted to immediately free
 * the newly deleted entry.  Instead, either synchronize_rcu()
 * or call_rcu() must be used to defer freeing until an RCU
 * grace period has elapsed.
 */
static inline void list_del_rcu(struct list_head *entry)
{
        /* Take @entry out of the list via the non-RCU helper. */
        __list_del_entry(entry);
        /*
         * Only ->prev is poisoned here; ->next is not written by this
         * function, since it may still be used for walking the list.
         */
        entry->prev = LIST_POISON2;
}

/**
 * hlist_del_init_rcu - deletes entry from hash list with re-initialization
 * @n: the element to delete from the hash list.
 *
 * Does nothing if hlist_unhashed() already reports @n as unhashed.
 * Otherwise @n is unlinked with __hlist_del() and its ->pprev is set to
 * NULL, so that hlist_unhashed() on the node returns true afterwards.
 * This is useful for RCU based read lockfree traversal when the writer
 * side must know whether the entry is still hashed or already unhashed.
 *
 * The forward pointer is deliberately left alone: it may still be used
 * by readers walking the hash list, so it can not be poisoned and only
 * ->pprev is zeroed.
 *
 * The caller must take whatever precautions are necessary (such as
 * holding appropriate locks) to avoid racing with another
 * list-mutation primitive, such as hlist_add_head_rcu() or
 * hlist_del_rcu(), running on this same list.  However, it is
 * perfectly legal to run concurrently with the _rcu list-traversal
 * primitives, such as hlist_for_each_entry_rcu().
 */
static inline void hlist_del_init_rcu(struct hlist_node *n)
{
        /* Already unhashed: there is nothing to unlink. */
        if (hlist_unhashed(n))
                return;

        __hlist_del(n);
        WRITE_ONCE(n->pprev, NULL);
}

/**
 * list_replace_rcu - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * The @old entry will be replaced with the @new entry atomically.
 * Note: @old should not be empty.
 */
static inline void list_replace_rcu(struct list_head *old,
                                struct list_head *new)
{
        /* Give @new the neighbours of @old while @new is still unreachable. */
        new->next = old->next;
        new->prev = old->prev;
        /* Publish: the predecessor's forward pointer now leads to @new. */
        rcu_assign_pointer(list_next_rcu(new->prev), new);
        /* Backward link of the successor, updated with a plain store. */
        new->next->prev = new;
        /* Only @old->prev is poisoned; @old->next is not modified. */
        old->prev = LIST_POISON2;
}

/**
 * __list_splice_init_rcu - join an RCU-protected list into an existing list.
 * @list:        the RCU-protected list to splice
 * @prev:        points to the last element of the existing list
 * @next:        points to the first element of the existing list
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 *
 * The list pointed to by @prev and @next can be RCU-read traversed
 * concurrently with this function.
 *
 * Note that this function blocks.
 *
 * Important note: the caller must take whatever action is necessary to prevent
 * any other updates to the existing list.  In principle, it is possible to
 * modify the list as soon as sync() begins execution. If this sort of thing
 * becomes necessary, an alternative version based on call_rcu() could be
 * created.  But only if -really- needed -- there is no shortage of RCU API
 * members.
 */
static inline void __list_splice_init_rcu(struct list_head *list,
                                          struct list_head *prev,
                                          struct list_head *next,
                                          void (*sync)(void))
{
        struct list_head *first = list->next;
        struct list_head *last = list->prev;

        /*
         * "first" and "last" now track the body of the source list, so the
         * source list head can be reinitialized.  RCU readers have access
         * to this list, so we must use INIT_LIST_HEAD_RCU() instead of
         * INIT_LIST_HEAD().
         */

        INIT_LIST_HEAD_RCU(list);

        /*
         * At this point, the list body still points to the source list.
         * Wait for any readers to finish using the list before splicing
         * the list body into the new list.  Any new readers will see
         * an empty list.
         */

        sync();
        ASSERT_EXCLUSIVE_ACCESS(*first);
        ASSERT_EXCLUSIVE_ACCESS(*last);

        /*
         * Readers are finished with the source list, so perform splice.
         * The order is important if the new list is global and accessible
         * to concurrent RCU readers.  Note that RCU readers are not
         * permitted to traverse the prev pointers without excluding
         * this function.
         */

        /* Tail of the spliced body is linked to @next before publication. */
        last->next = next;
        /* Publish the spliced body through @prev's forward pointer. */
        rcu_assign_pointer(list_next_rcu(prev), first);
        /* Backward links are fixed up last, with plain stores. */
        first->prev = prev;
        next->prev = last;
}

/**
 * list_splice_init_rcu - splice an RCU-protected list into an existing list,
 *                        designed for stacks.
 * @list:        the RCU-protected list to splice
 * @head:        the place in the existing list to splice the first list into
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 */
static inline void list_splice_init_rcu(struct list_head *list,
                                        struct list_head *head,
                                        void (*sync)(void))
{
        /* An empty source list is left alone and sync() is not called. */
        if (list_empty(list))
                return;

        /* Splice right after @head, i.e. in front of its current first entry. */
        __list_splice_init_rcu(list, head, head->next, sync);
}

/**
 * list_splice_tail_init_rcu - splice an RCU-protected list into an existing
 *                             list, designed for queues.
 * @list:        the RCU-protected list to splice
 * @head:        the place in the existing list to splice the first list into
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 */
static inline void list_splice_tail_init_rcu(struct list_head *list,
                                             struct list_head *head,
                                             void (*sync)(void))
{
        /* An empty source list is left alone and sync() is not called. */
        if (list_empty(list))
                return;

        /* Splice right before @head, i.e. behind its current last entry. */
        __list_splice_init_rcu(list, head->prev, head, sync);
}

/**
 * list_entry_rcu - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * @ptr is loaded with READ_ONCE() and the result is converted to the
 * enclosing @type with container_of().
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_entry_rcu(ptr, type, member) \
        container_of(READ_ONCE(ptr), type, member)

/*
 * Where are list_empty_rcu() and list_first_entry_rcu()?
 *
 * They do not exist because they would lead to subtle race conditions:
 *
 * if (!list_empty_rcu(mylist)) {
 *        struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member);
 *        do_something(bar);
 * }
 *
 * The list might be non-empty when list_empty_rcu() checks it, but it
 * might have become empty by the time that list_first_entry_rcu() rereads
 * the ->next pointer, which would result in a SEGV.
 *
 * When not using RCU, it is OK for list_first_entry() to re-read that
 * pointer because both functions should be protected by some lock that
 * blocks writers.
 *
 * When using RCU, list_empty() uses READ_ONCE() to fetch the
 * RCU-protected ->next pointer and then compares it to the address of the
 * list head.  However, it neither dereferences this pointer nor provides
 * this pointer to its caller.  Thus, READ_ONCE() suffices (that is,
 * rcu_dereference() is not needed), which means that list_empty() can be
 * used anywhere you would want to use list_empty_rcu().  Just don't
 * expect anything useful to happen if you do a subsequent lockless
 * call to list_first_entry_rcu()!!!
 *
 * See list_first_or_null_rcu for an alternative.
 */

/**
 * list_first_or_null_rcu - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * Note that if the list is empty, it returns NULL.
 *
 * @ptr is evaluated once and its ->next pointer is fetched with a single
 * READ_ONCE(); the fetched value is used both for the emptiness test
 * (comparison against the head) and for locating the returned entry.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_first_or_null_rcu(ptr, type, member) \
({ \
        struct list_head *__ptr = (ptr); \
        struct list_head *__next = READ_ONCE(__ptr->next); \
        likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \
})

/**
 * list_next_or_null_rcu - get the next element from a list
 * @head:        the head for the list.
 * @ptr:        the list head to take the next element from.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * Note that if the ptr is at the end of the list, NULL is returned.
 *
 * @head and @ptr are each evaluated once.  The ->next pointer of @ptr is
 * fetched with a single READ_ONCE(); the fetched value is used both for
 * the end-of-list test (comparison against @head) and for locating the
 * returned entry.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_next_or_null_rcu(head, ptr, type, member) \
({ \
        struct list_head *__head = (head); \
        struct list_head *__ptr = (ptr); \
        struct list_head *__next = READ_ONCE(__ptr->next); \
        likely(__next != __head) ? list_entry_rcu(__next, type, \
                                                  member) : NULL; \
})

/**
 * list_for_each_entry_rcu        -        iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 * @cond:        optional lockdep expression if called from non-RCU protection.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as list_add_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * At most one @cond expression may be given.  It is forwarded to
 * __list_check_rcu(), followed by a literal 0 that takes the place of
 * @cond when the caller supplied none.  The loop ends when the cursor's
 * @member is @head itself.  @pos and @head are evaluated more than once.
 */
#define list_for_each_entry_rcu(pos, head, member, cond...)                \
        for (__list_check_rcu(dummy, ## cond, 0),                        \
             pos = list_entry_rcu((head)->next, typeof(*pos), member);        \
                &pos->member != (head);                                        \
                pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_srcu        -        iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 * @cond:        lockdep expression for the lock required to traverse the list.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as list_add_rcu()
 * as long as the traversal is guarded by srcu_read_lock().
 * The lockdep expression srcu_read_lock_held() can be passed as the
 * cond argument from read side.
 *
 * Unlike list_for_each_entry_rcu(), @cond is a mandatory argument here; it
 * is passed to __list_check_srcu() once, before the first entry is fetched.
 * The loop ends when the cursor's @member is @head itself.
 */
#define list_for_each_entry_srcu(pos, head, member, cond)                \
        for (__list_check_srcu(cond),                                        \
             pos = list_entry_rcu((head)->next, typeof(*pos), member);        \
                &pos->member != (head);                                        \
                pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_entry_lockless - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * @ptr is loaded with READ_ONCE(), cast back to typeof(@ptr) and then
 * converted to the enclosing @type with container_of().
 *
 * This primitive may safely run concurrently with the _rcu
 * list-mutation primitives such as list_add_rcu(), but requires some
 * implicit RCU read-side guarding.  One example is running within a special
 * exception-time environment where preemption is disabled and where lockdep
 * cannot be invoked.  Another example is when items are added to the list,
 * but never deleted.
 */
#define list_entry_lockless(ptr, type, member) \
        container_of((typeof(ptr))READ_ONCE(ptr), type, member)

/**
 * list_for_each_entry_lockless - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * This primitive may safely run concurrently with the _rcu
 * list-mutation primitives such as list_add_rcu(), but requires some
 * implicit RCU read-side guarding.  One example is running within a special
 * exception-time environment where preemption is disabled and where lockdep
 * cannot be invoked.  Another example is when items are added to the list,
 * but never deleted.
 *
 * Every entry is fetched with list_entry_lockless() and no lockdep check
 * is performed.  The loop ends when the cursor's @member is @head itself.
 */
#define list_for_each_entry_lockless(pos, head, member) \
        for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \
             &pos->member != (head); \
             pos = list_entry_lockless(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_continue_rcu - continue iteration over list of given type
 * @pos:        the type * to use as a loop cursor; on entry it must point
 *                to the entry after which iteration resumes.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Continue to iterate over list of given type, continuing after
 * the current position which must have been in the list when the RCU read
 * lock was taken.
 * This would typically require either that you obtained the node from a
 * previous walk of the list in the same RCU read-side critical section, or
 * that you held some sort of non-RCU reference (such as a reference count)
 * to keep the node alive *and* in the list.
 *
 * This iterator is similar to list_for_each_entry_from_rcu() except
 * this starts after the given position and that one starts at the given
 * position.
 *
 * No lockdep check is performed by this macro.
 */
#define list_for_each_entry_continue_rcu(pos, head, member)                 \
        for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \
             &pos->member != (head);        \
             pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_from_rcu - iterate over a list from current point
 * @pos:        the type * to use as a loop cursor; on entry it must point
 *                to the first entry to visit.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over the tail of a list starting from a given position,
 * which must have been in the list when the RCU read lock was taken.
 * This would typically require either that you obtained the node from a
 * previous walk of the list in the same RCU read-side critical section, or
 * that you held some sort of non-RCU reference (such as a reference count)
 * to keep the node alive *and* in the list.
 *
 * This iterator is similar to list_for_each_entry_continue_rcu() except
 * this starts from the given position and that one starts from the position
 * after the given position.
 *
 * The loop has no initialisation clause: @pos is used as passed in, and
 * no lockdep check is performed by this macro.
 */
#define list_for_each_entry_from_rcu(pos, head, member)                        \
        for (; &(pos)->member != (head);                                        \
                pos = list_entry_rcu(pos->member.next, typeof(*(pos)), member))

/**
 * hlist_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_unhashed() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu().
 */
static inline void hlist_del_rcu(struct hlist_node *n)
{
        __hlist_del(n);
        /* Only ->pprev is poisoned; ->next is not written by this function. */
        WRITE_ONCE(n->pprev, LIST_POISON2);
}

/**
 * hlist_replace_rcu - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * The @old entry will be replaced with the @new entry atomically.
 */
static inline void hlist_replace_rcu(struct hlist_node *old,
                                        struct hlist_node *new)
{
        struct hlist_node *next = old->next;

        /* Give @new the links of @old while @new is still unreachable. */
        new->next = next;
        WRITE_ONCE(new->pprev, old->pprev);
        /* Publish: the pointer that referred to @old now refers to @new. */
        rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new);
        /* Fix the successor's back-pointer, if there is a successor. */
        if (next)
                WRITE_ONCE(new->next->pprev, &new->next);
        /* Only @old->pprev is poisoned; @old->next is not modified. */
        WRITE_ONCE(old->pprev, LIST_POISON2);
}

/**
 * hlists_swap_heads_rcu - swap the lists the hlist heads point to
 * @left:  The hlist head on the left
 * @right: The hlist head on the right
 *
 * The lists start out as [@left  ][node1 ... ] and
 *                        [@right ][node2 ... ]
 * The lists end up as    [@left  ][node2 ... ]
 *                        [@right ][node1 ... ]
 *
 * Either list may be empty: an empty list simply ends up on the other
 * head, and no back-pointer is written for a first node that does not
 * exist.
 */
static inline void hlists_swap_heads_rcu(struct hlist_head *left, struct hlist_head *right)
{
        struct hlist_node *node1 = left->first;
        struct hlist_node *node2 = right->first;

        /* Publish the exchanged first pointers ... */
        rcu_assign_pointer(left->first, node2);
        rcu_assign_pointer(right->first, node1);

        /*
         * ... then repoint each first node's ->pprev at its new head.  A
         * NULL first pointer means that list is empty and there is no node
         * to fix up; dereferencing it would be a NULL-pointer access.
         */
        if (node2)
                WRITE_ONCE(node2->pprev, &left->first);
        if (node1)
                WRITE_ONCE(node1->pprev, &right->first);
}

/*
 * return the first or the next element in an RCU protected hlist
 *
 * Each macro expands to an lvalue of type "struct hlist_node __rcu *":
 *   hlist_first_rcu(head) - the ->first field of @head
 *   hlist_next_rcu(node)  - the ->next field of @node
 *   hlist_pprev_rcu(node) - the pointer that @node->pprev points at
 */
#define hlist_first_rcu(head)        (*((struct hlist_node __rcu **)(&(head)->first)))
#define hlist_next_rcu(node)        (*((struct hlist_node __rcu **)(&(node)->next)))
#define hlist_pprev_rcu(node)        (*((struct hlist_node __rcu **)((node)->pprev)))

/**
 * hlist_add_head_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_add_head_rcu(struct hlist_node *n,
                                        struct hlist_head *h)
{
        struct hlist_node *first = h->first;

        /* Fill in both links of @n while it is still unreachable ... */
        n->next = first;
        WRITE_ONCE(n->pprev, &h->first);
        /* ... then publish it as the new first node of @h. */
        rcu_assign_pointer(hlist_first_rcu(h), n);
        /* The previous first node, if any, now hangs off @n->next. */
        if (first)
                WRITE_ONCE(first->pprev, &n->next);
}

/**
 * hlist_add_tail_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_add_tail_rcu(struct hlist_node *n,
                                      struct hlist_head *h)
{
        struct hlist_node *tail = NULL;
        struct hlist_node *cur = h->first;

        /* Note: write side code, so rcu accessors are not needed. */
        while (cur) {
                tail = cur;
                cur = cur->next;
        }

        /* No nodes at all: appending is the same as adding at the head. */
        if (!tail) {
                hlist_add_head_rcu(n, h);
                return;
        }

        /* Set up @n's links first, then publish it behind the last node. */
        n->next = tail->next;
        WRITE_ONCE(n->pprev, &tail->next);
        rcu_assign_pointer(hlist_next_rcu(tail), n);
}

/**
 * hlist_add_before_rcu
 * @n: the new element to add to the hash list.
 * @next: the existing element to add the new element before.
 *
 * Description:
 * Adds the specified element to the specified hlist
 * before the specified node while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.
 */
static inline void hlist_add_before_rcu(struct hlist_node *n,
                                        struct hlist_node *next)
{
        /* @n takes over @next's back-pointer and links forward to @next ... */
        WRITE_ONCE(n->pprev, next->pprev);
        n->next = next;
        /* ... then is published through the pointer that referred to @next. */
        rcu_assign_pointer(hlist_pprev_rcu(n), n);
        /* @next now hangs off @n->next. */
        WRITE_ONCE(next->pprev, &n->next);
}

/**
 * hlist_add_behind_rcu
 * @n: the new element to add to the hash list.
 * @prev: the existing element to add the new element after.
 *
 * Description:
 * Adds the specified element to the specified hlist
 * after the specified node while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.
 */
static inline void hlist_add_behind_rcu(struct hlist_node *n,
                                        struct hlist_node *prev)
{
        /* Fill in both links of @n while it is still unreachable ... */
        n->next = prev->next;
        WRITE_ONCE(n->pprev, &prev->next);
        /* ... then publish it through @prev's forward pointer. */
        rcu_assign_pointer(hlist_next_rcu(prev), n);
        /* The old successor of @prev, if any, now hangs off @n->next. */
        if (n->next)
                WRITE_ONCE(n->next->pprev, &n->next);
}

/*
 * Iterate over the raw nodes of an RCU-protected hlist.  @pos is the
 * struct hlist_node * cursor; the first pointer of @head and every
 * subsequent ->next pointer are loaded with rcu_dereference().  The loop
 * ends when @pos becomes NULL.
 */
#define __hlist_for_each_rcu(pos, head)                                \
        for (pos = rcu_dereference(hlist_first_rcu(head));        \
             pos;                                                \
             pos = rcu_dereference(hlist_next_rcu(pos)))

/**
 * hlist_for_each_entry_rcu - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 * @cond:        optional lockdep expression if called from non-RCU protection.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * At most one @cond expression may be given.  It is forwarded to
 * __list_check_rcu(), followed by a literal 0 that takes the place of
 * @cond when the caller supplied none.  Node pointers are loaded with
 * rcu_dereference_raw() and converted with hlist_entry_safe(); the loop
 * ends when @pos becomes NULL.
 */
#define hlist_for_each_entry_rcu(pos, head, member, cond...)                \
        for (__list_check_rcu(dummy, ## cond, 0),                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_srcu - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 * @cond:        lockdep expression for the lock required to traverse the list.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by srcu_read_lock().
 * The lockdep expression srcu_read_lock_held() can be passed as the
 * cond argument from read side.
 *
 * Unlike hlist_for_each_entry_rcu(), @cond is a mandatory argument here; it
 * is passed to __list_check_srcu() once, before the first entry is fetched.
 * The loop ends when @pos becomes NULL.
 */
#define hlist_for_each_entry_srcu(pos, head, member, cond)                \
        for (__list_check_srcu(cond),                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing)
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * This is the same as hlist_for_each_entry_rcu() except that it does
 * not do any RCU debugging or tracing: there is no __list_check_rcu()
 * call and no @cond argument, and the list pointers are loaded with
 * rcu_dereference_raw_check() instead of rcu_dereference_raw().
 */
#define hlist_for_each_entry_rcu_notrace(pos, head, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock_bh().
 *
 * The list pointers are loaded with rcu_dereference_bh(), which is why the
 * _bh read-side primitive is named above.
 * NOTE(review): confirm against the RCU documentation whether other
 * BH-disabled contexts are accepted as well.
 */
#define hlist_for_each_entry_rcu_bh(pos, head, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue_rcu - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * The first entry visited is the one following @pos; @pos itself is not
 * visited.  The initializer reads the next pointer of @pos's @member, so
 * @pos must point to a valid entry on entry to the loop.  The list pointers
 * are loaded with rcu_dereference_raw().
 */
#define hlist_for_each_entry_continue_rcu(pos, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
                        &(pos)->member)), typeof(*(pos)), member);        \
             pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * The first entry visited is the one following @pos; @pos itself is not
 * visited.  The initializer reads the next pointer of @pos's @member, so
 * @pos must point to a valid entry on entry to the loop.  The list pointers
 * are loaded with rcu_dereference_bh().
 */
#define hlist_for_each_entry_continue_rcu_bh(pos, member)                \
        for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(  \
                        &(pos)->member)), typeof(*(pos)), member);        \
             pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * There is no initializer: the first entry visited is @pos itself, and a
 * NULL @pos gives zero iterations.  Following entries are loaded with
 * rcu_dereference_raw().
 */
#define hlist_for_each_entry_from_rcu(pos, member)                        \
        for (; pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

#endif        /* __KERNEL__ */
#endif































































































































































































































































































































































































































































































































































































































































































































































































































    8 






    4 







































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/amd_nb.h>

#include "numa_internal.h"

/* Set to 1 by numa_setup() when the "numa=" option starts with "off". */
int numa_off;
/* Node mask cleared by numa_init() before each init method is run. */
nodemask_t numa_nodes_parsed __initdata;

/* Per-node data pointers, indexed by node id; set in alloc_node_data(). */
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

/*
 * numa_meminfo is the default block table used by numa_add_memblk().
 * numa_reserved_meminfo receives the blocks and ranges that
 * numa_cleanup_meminfo() moves out of the table it is cleaning.
 */
static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

/*
 * Distance table: numa_distance_cnt * numa_distance_cnt entries, with the
 * distance from node i to node j stored at [i * numa_distance_cnt + j].
 * numa_distance may hold the marker value 1 after a failed allocation, so
 * numa_distance_cnt is what tells whether a table exists.
 */
static int numa_distance_cnt;
static u8 *numa_distance;

/*
 * Handler for the "numa=" early parameter.
 *
 * Recognised prefixes of @opt:
 *   "off"    - set numa_off
 *   "fake="  - pass the text after '=' to numa_emu_cmdline() and return
 *              its result
 *   "noacpi" - call disable_srat()
 *   "nohmat" - call disable_hmat()
 *
 * Only the leading characters are compared, so trailing text after a
 * recognised keyword is accepted.  Returns -EINVAL when @opt is NULL,
 * otherwise 0 (or the numa_emu_cmdline() result for "fake=").
 */
static __init int numa_setup(char *opt)
{
        if (!opt)
                return -EINVAL;

        if (strncmp(opt, "off", 3) == 0)
                numa_off = 1;

        if (strncmp(opt, "fake=", 5) == 0)
                return numa_emu_cmdline(opt + 5);

        if (strncmp(opt, "noacpi", 6) == 0)
                disable_srat();

        if (strncmp(opt, "nohmat", 6) == 0)
                disable_hmat();

        return 0;
}
early_param("numa", numa_setup);

/*
 * apicid, cpu, node mappings
 *
 * __apicid_to_node[] is indexed by APIC id and holds the node id for that
 * APIC id.  Every one of the MAX_LOCAL_APIC entries starts out as
 * NUMA_NO_NODE.
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

/*
 * Look up the node of @cpu through its APIC id.
 *
 * Returns the __apicid_to_node[] entry for the CPU's APIC id, or
 * NUMA_NO_NODE when the CPU's APIC id is BAD_APICID.
 */
int numa_cpu_node(int cpu)
{
        const u32 id = early_per_cpu(x86_cpu_to_apicid, cpu);

        if (id == BAD_APICID)
                return NUMA_NO_NODE;

        return __apicid_to_node[id];
}

/*
 * One cpumask per node, indexed by node id.  The entries are allocated in
 * setup_node_to_cpumask_map().
 */
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 *
 * Every CPU's entry defaults to NUMA_NO_NODE; numa_set_node() updates it.
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

void numa_set_node(int cpu, int node)
{
        int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

        /* early setting, no percpu area yet */
        if (cpu_to_node_map) {
                cpu_to_node_map[cpu] = node;
                return;
        }

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
        if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
                printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
                dump_stack();
                return;
        }
#endif
        per_cpu(x86_cpu_to_node_map, cpu) = node;

        set_cpu_numa_node(cpu, node);
}

/* Drop @cpu's node assignment by setting it to NUMA_NO_NODE. */
void numa_clear_node(int cpu)
{
        numa_set_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate one node_to_cpumask_map[] entry for each of the nr_node_ids
 * nodes.  Requires node_possible_map to be valid.
 *
 * If nr_node_ids still equals MAX_NUMNODES it is treated as not yet set up
 * and setup_nr_node_ids() is called first.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
        unsigned int nid = 0;

        if (nr_node_ids == MAX_NUMNODES)
                setup_nr_node_ids();

        while (nid < nr_node_ids) {
                alloc_bootmem_cpumask_var(&node_to_cpumask_map[nid]);
                nid++;
        }

        /* from here on cpumask_of_node() is usable */
        pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

/*
 * Append the block [@start, @end) for node @nid to @mi.
 *
 * A zero-length block is silently ignored.  A block with @start > @end or
 * with @nid outside [0, MAX_NUMNODES) is reported with a warning and
 * ignored.  Both cases return 0.
 *
 * Returns 0 when the block was stored or ignored, -EINVAL when @mi already
 * holds NR_NODE_MEMBLKS blocks.
 */
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
                                     struct numa_meminfo *mi)
{
        int slot;

        if (start == end)
                return 0;

        if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
                pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
                        nid, start, end - 1);
                return 0;
        }

        slot = mi->nr_blks;
        if (slot >= NR_NODE_MEMBLKS) {
                pr_err("too many memblk ranges\n");
                return -EINVAL;
        }

        mi->blk[slot].start = start;
        mi->blk[slot].end = end;
        mi->blk[slot].nid = nid;
        mi->nr_blks = slot + 1;

        return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Decrement @mi->nr_blks and close the gap at @idx by moving every later
 * entry of @mi->blk[] down by one slot.  @idx is not range-checked.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
        int tail;

        mi->nr_blks--;

        /* number of entries that sat behind the removed one */
        tail = mi->nr_blks - idx;
        memmove(mi->blk + idx, mi->blk + idx + 1, tail * sizeof(mi->blk[0]));
}

/**
 * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
 * @dst: numa_meminfo to append block to
 * @idx: Index of memblk to remove
 * @src: numa_meminfo to remove memblk from
 *
 * The block is copied to the end of @dst and then removed from @src with
 * numa_remove_memblk_from().
 *
 * NOTE(review): @dst->nr_blks is not checked against the capacity of
 * @dst->blk[] here.
 */
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
                                         struct numa_meminfo *src)
{
        int slot = dst->nr_blks;

        dst->blk[slot] = src->blk[idx];
        dst->nr_blks = slot + 1;

        numa_remove_memblk_from(idx, src);
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.  This is a wrapper around
 * numa_add_memblk_to(), which ignores zero-length blocks and warns about
 * and ignores invalid ones.
 *
 * RETURNS:
 * 0 on success or when the block was ignored, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
        return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}

/*
 * Allocate and zero the pg_data_t of node @nid, then mark the node online.
 *
 * The allocation is sizeof(pg_data_t) rounded up to PAGE_SIZE and is
 * requested through memblock_phys_alloc_try_nid() with @nid as the wanted
 * node.  On failure an error is logged and the function returns without
 * touching node_data[] or the online state.  When the memory obtained
 * belongs to a different node than @nid, that is logged as well.
 */
static void __init alloc_node_data(int nid)
{
        const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
        u64 pa;
        int home;

        pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
        if (!pa) {
                pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
                       nd_size, nid);
                return;
        }

        /* report where the node data ended up */
        printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
               pa, pa + nd_size - 1);
        home = early_pfn_to_nid(pa >> PAGE_SHIFT);
        if (home != nid)
                printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, home);

        /* publish, clear and online */
        node_data[nid] = __va(pa);
        memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
        node_set_online(nid);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks.  Also check for
 * conflicts and clear unused memblks.
 *
 * The work is done in three passes:
 *  1. Trim: blocks that do not overlap memblock.memory are moved to
 *     numa_reserved_meminfo; the rest are clamped to [0, PFN_PHYS(max_pfn)),
 *     with the part above the limit recorded in numa_reserved_meminfo, and
 *     blocks left empty are dropped.
 *  2. Merge: blocks of the same node are joined unless the joined range
 *     would overlap a block of another node.
 *  3. Clear: the unused tail of @mi->blk[] is reset.
 *
 * RETURNS:
 * 0 on success, -EINVAL if blocks belonging to different nodes overlap.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
        const u64 low = 0;
        const u64 high = PFN_PHYS(max_pfn);
        int i, j, k;

        /* first, trim all entries */
        for (i = 0; i < mi->nr_blks; i++) {
                struct numa_memblk *bi = &mi->blk[i];

                /*
                 * move / save reserved memory ranges
                 *
                 * Removal shifts the following block into slot i, so i is
                 * stepped back to look at that slot again.
                 */
                if (!memblock_overlaps_region(&memblock.memory,
                                        bi->start, bi->end - bi->start)) {
                        numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
                        continue;
                }

                /* make sure all non-reserved blocks are inside the limits */
                bi->start = max(bi->start, low);

                /* preserve info for non-RAM areas above 'max_pfn': */
                if (bi->end > high) {
                        /* the return value of this call is not checked */
                        numa_add_memblk_to(bi->nid, high, bi->end,
                                           &numa_reserved_meminfo);
                        bi->end = high;
                }

                /* and there's no empty block */
                if (bi->start >= bi->end)
                        numa_remove_memblk_from(i--, mi);
        }

        /* merge neighboring / overlapping entries */
        for (i = 0; i < mi->nr_blks; i++) {
                struct numa_memblk *bi = &mi->blk[i];

                for (j = i + 1; j < mi->nr_blks; j++) {
                        struct numa_memblk *bj = &mi->blk[j];
                        u64 start, end;

                        /*
                         * See whether there are overlapping blocks.  Whine
                         * about but allow overlaps of the same nid.  They
                         * will be merged below.
                         */
                        if (bi->end > bj->start && bi->start < bj->end) {
                                if (bi->nid != bj->nid) {
                                        pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
                                               bi->nid, bi->start, bi->end - 1,
                                               bj->nid, bj->start, bj->end - 1);
                                        return -EINVAL;
                                }
                                pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
                                        bi->nid, bi->start, bi->end - 1,
                                        bj->start, bj->end - 1);
                        }

                        /*
                         * Join together blocks on the same node, holes
                         * between which don't overlap with memory on other
                         * nodes.
                         */
                        if (bi->nid != bj->nid)
                                continue;
                        /* candidate merged range covering both blocks */
                        start = min(bi->start, bj->start);
                        end = max(bi->end, bj->end);
                        /* look for a block of another node inside that range */
                        for (k = 0; k < mi->nr_blks; k++) {
                                struct numa_memblk *bk = &mi->blk[k];

                                if (bi->nid == bk->nid)
                                        continue;
                                if (start < bk->end && end > bk->start)
                                        break;
                        }
                        /* the scan stopped early: a foreign block is in the way */
                        if (k < mi->nr_blks)
                                continue;
                        printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
                               bi->nid, bi->start, bi->end - 1, bj->start,
                               bj->end - 1, start, end - 1);
                        bi->start = start;
                        bi->end = end;
                        /* bj is gone; step j back so the shifted-in block is checked */
                        numa_remove_memblk_from(j--, mi);
                }
        }

        /* clear unused ones */
        for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
                mi->blk[i].start = mi->blk[i].end = 0;
                mi->blk[i].nid = NUMA_NO_NODE;
        }

        return 0;
}

/*
 * Add to *@nodemask every node that owns a non-empty block in @mi.
 *
 * All slots of @mi->blk[] are scanned, not only the first nr_blks; slots
 * with start == end or with nid == NUMA_NO_NODE are skipped.  Bits already
 * set in *@nodemask are left alone.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
                                              const struct numa_meminfo *mi)
{
        int slot;

        for (slot = 0; slot < ARRAY_SIZE(mi->blk); slot++) {
                if (mi->blk[slot].start == mi->blk[slot].end)
                        continue;
                if (mi->blk[slot].nid == NUMA_NO_NODE)
                        continue;
                node_set(mi->blk[slot].nid, *nodemask);
        }
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * Free the current table, if one exists, and return to the "no table"
 * state so that the next numa_set_distance() call creates a new one.
 */
void __init numa_reset_distance(void)
{
        /*
         * Test the count, not the pointer: after a failed allocation the
         * pointer holds the marker value 1LU while the count stays 0.
         */
        if (numa_distance_cnt) {
                size_t size = numa_distance_cnt * numa_distance_cnt *
                              sizeof(numa_distance[0]);

                memblock_free(numa_distance, size);
        }

        numa_distance_cnt = 0;
        /* a NULL pointer re-enables table creation */
        numa_distance = NULL;
}

/*
 * Create the distance table and fill it with default values.
 *
 * The table is sized from the highest node id found in numa_nodes_parsed
 * together with the nodes that own memory in numa_meminfo.  It is
 * allocated below PFN_PHYS(max_pfn_mapped).  Every diagonal entry is set
 * to LOCAL_DISTANCE and every other entry to REMOTE_DISTANCE.
 *
 * Returns 0 on success.  On allocation failure returns -ENOMEM and leaves
 * the marker value 1LU in numa_distance so that no further attempt is made
 * until numa_reset_distance() is called.
 */
static int __init numa_alloc_distance(void)
{
        nodemask_t nodes_parsed;
        size_t size;
        int i, j, cnt = 0;
        u64 phys;

        /* the table must have room for the highest known node id */
        nodes_parsed = numa_nodes_parsed;
        numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

        for_each_node_mask(i, nodes_parsed)
                cnt = i;
        cnt++;
        size = cnt * cnt * sizeof(numa_distance[0]);

        phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0,
                                         PFN_PHYS(max_pfn_mapped));
        if (!phys) {
                pr_warn("Warning: can't allocate distance table!\n");
                /* don't retry until explicitly reset */
                numa_distance = (void *)1LU;
                return -ENOMEM;
        }

        numa_distance = __va(phys);
        numa_distance_cnt = cnt;

        /* defaults: local on the diagonal, remote everywhere else */
        for (i = 0; i < cnt; i++) {
                for (j = 0; j < cnt; j++) {
                        if (i == j)
                                numa_distance[i * cnt + j] = LOCAL_DISTANCE;
                        else
                                numa_distance[i * cnt + j] = REMOTE_DISTANCE;
                }
        }
        printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

        return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one NUMA to another
 * @from: the 'from' node to set distance
 * @to: the 'to'  node to set distance
 * @distance: NUMA distance
 *
 * Store @distance as the distance from node @from to node @to.  When no
 * distance table exists yet, one large enough for all currently known
 * nodes is created first.
 *
 * If the table cannot be allocated a warning is printed, and this and all
 * later calls do nothing until numa_reset_distance() is called.
 *
 * The call is also ignored, with a one-time warning, when @from or @to is
 * negative or not below the node count the table was created with, when
 * @distance does not fit in a u8, or when @from == @to and @distance is
 * not LOCAL_DISTANCE.
 * This is to allow simplification of specific NUMA config implementations.
 */
void __init numa_set_distance(int from, int to, int distance)
{
        int ids_ok, distance_ok;

        if (!numa_distance) {
                if (numa_alloc_distance() < 0)
                        return;
        }

        ids_ok = from >= 0 && to >= 0 &&
                 from < numa_distance_cnt && to < numa_distance_cnt;
        if (!ids_ok) {
                pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
                             from, to, distance);
                return;
        }

        distance_ok = (u8)distance == distance &&
                      (from != to || distance == LOCAL_DISTANCE);
        if (!distance_ok) {
                pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
                             from, to, distance);
                return;
        }

        numa_distance[from * numa_distance_cnt + to] = distance;
}

/**
 * __node_distance - Look up the NUMA distance between two nodes
 * @from: the 'from' node
 * @to: the 'to' node
 *
 * Node ids that the distance table does not cover get a default answer
 * instead of a table lookup: LOCAL_DISTANCE when @from == @to,
 * REMOTE_DISTANCE otherwise.  That applies to ids at or above
 * numa_distance_cnt (which includes the case of no table at all, where the
 * count is 0) and to negative ids such as NUMA_NO_NODE.  Negative ids are
 * rejected explicitly because they would otherwise pass the upper-bound
 * test and index in front of the table; numa_set_distance() refuses them
 * for the same reason.
 *
 * RETURNS:
 * The stored distance, or the default described above.
 */
int __node_distance(int from, int to)
{
        if (from < 0 || to < 0 ||
            from >= numa_distance_cnt || to >= numa_distance_cnt)
                return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
        return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

/*
 * Clear the MEMBLOCK_HOTPLUG flag on all memory of every node that holds
 * memblock-reserved memory (which covers the kernel's own memory ranges).
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
        nodemask_t kernel_nodes = NODE_MASK_NONE;
        struct memblock_region *rgn;
        struct numa_memblk *blk;
        int idx;

        /*
         * Step 1: give memblock.reserved its node ids.
         *
         * At this time, all memory regions reserved by memblock are used
         * by the kernel, but those regions are not split up along node
         * boundaries yet, and don't necessarily have their node ID set
         * yet either.  Applying every numa_meminfo range to
         * memblock.reserved splits the regions along node boundaries and
         * sets the node ids.
         */
        for (idx = 0; idx < numa_meminfo.nr_blks; idx++) {
                int err;

                blk = &numa_meminfo.blk[idx];
                err = memblock_set_node(blk->start, blk->end - blk->start,
                                        &memblock.reserved, blk->nid);
                WARN_ON_ONCE(err);
        }

        /*
         * Step 2: collect the nodes that own reserved memory.
         *
         * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
         *   numa_meminfo might not include all memblock.reserved
         *   memory ranges, because quirks such as trim_snb_memory()
         *   reserve specific pages for Sandy Bridge graphics.  Such
         *   regions have no node id and are skipped. ]
         */
        for_each_reserved_mem_region(rgn) {
                int nid = memblock_get_region_node(rgn);

                if (nid == NUMA_NO_NODE)
                        continue;
                node_set(nid, kernel_nodes);
        }

        /*
         * Step 3: clear MEMBLOCK_HOTPLUG for every block of those nodes.
         *
         * This works per node, not per reserved range: all memory of a
         * node that contains any reserved memory loses the flag.
         */
        for (idx = 0; idx < numa_meminfo.nr_blks; idx++) {
                blk = &numa_meminfo.blk[idx];

                if (node_isset(blk->nid, kernel_nodes))
                        memblock_clear_hotplug(blk->start,
                                               blk->end - blk->start);
        }
}

/*
 * Apply the cleaned-up block table @mi to memblock and bring the nodes up.
 *
 * Steps, in order: build node_possible_map from numa_nodes_parsed plus the
 * nodes owning memory in @mi; set the node id of every block in
 * memblock.memory; clear the hotplug flag on kernel nodes; validate the
 * node alignment (only with NODE_NOT_IN_PAGE_FLAGS) and the NUMA coverage;
 * finally allocate node data for every possible node that has memory.
 *
 * Returns 0 on success, -EINVAL when no node is possible or when one of
 * the validations fails.
 */
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
        struct numa_memblk *mb;
        int nid;

        /* Account for nodes with cpus and no memory */
        node_possible_map = numa_nodes_parsed;
        numa_nodemask_from_meminfo(&node_possible_map, mi);
        if (WARN_ON(nodes_empty(node_possible_map)))
                return -EINVAL;

        for (mb = mi->blk; mb < mi->blk + mi->nr_blks; mb++)
                memblock_set_node(mb->start, mb->end - mb->start,
                                  &memblock.memory, mb->nid);

        /*
         * The kernel already had to use some memory this early, for
         * example for its own image, and that cannot be undone.  Any
         * node the kernel resides in must therefore not be treated as
         * hotpluggable.
         */
        numa_clear_kernel_node_hotplug();

        /*
         * If sections array is gonna be used for pfn -> nid mapping, check
         * whether its granularity is fine enough.
         */
        if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
                unsigned long pfn_align = node_map_pfn_alignment();

                if (pfn_align && pfn_align < PAGES_PER_SECTION) {
                        pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
                                PFN_PHYS(pfn_align) >> 20,
                                PFN_PHYS(PAGES_PER_SECTION) >> 20);
                        return -EINVAL;
                }
        }

        if (!memblock_validate_numa_coverage(SZ_1M))
                return -EINVAL;

        /* Finally register nodes: only those spanning a non-empty range. */
        for_each_node_mask(nid, node_possible_map) {
                u64 lo = PFN_PHYS(max_pfn);
                u64 hi = 0;

                for (mb = mi->blk; mb < mi->blk + mi->nr_blks; mb++) {
                        if (mb->nid == nid) {
                                lo = min(mb->start, lo);
                                hi = max(mb->end, hi);
                        }
                }

                if (lo < hi)
                        alloc_node_data(nid);
        }

        /* Dump memblock with node info and return. */
        memblock_dump_all();
        return 0;
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To cope with that, every CPU index below nr_cpu_ids that has
 * no node yet is given one, since the number of CPUs is not known at
 * this point. The online nodes are handed out round robin.
 */
static void __init numa_init_array(void)
{
        int node = first_node(node_online_map);
        int cpu;

        for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
                if (early_cpu_to_node(cpu) == NUMA_NO_NODE) {
                        numa_set_node(cpu, node);
                        node = next_node_in(node, node_online_map);
                }
        }
}

/*
 * Run one NUMA initialization method from a clean state.
 *
 * All NUMA state is reset first (apicid->node map, parsed/possible/online
 * node masks, numa_meminfo, memblock node ids and hotplug flags, distance
 * table).  @init_func is then called, followed by cleanup of
 * numa_meminfo, NUMA emulation, registration of the memory blocks and the
 * fix-up of the cpu->node mapping.
 *
 * Returns 0 on success, or the first negative value returned by
 * @init_func, numa_cleanup_meminfo() or numa_register_memblks().
 */
static int __init numa_init(int (*init_func)(void))
{
        int apicid, cpu, err;

        for (apicid = 0; apicid < MAX_LOCAL_APIC; apicid++)
                set_apicid_to_node(apicid, NUMA_NO_NODE);

        nodes_clear(numa_nodes_parsed);
        nodes_clear(node_possible_map);
        nodes_clear(node_online_map);
        memset(&numa_meminfo, 0, sizeof(numa_meminfo));
        WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
                                  NUMA_NO_NODE));
        WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
                                  NUMA_NO_NODE));
        /* In case that parsing SRAT failed. */
        WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
        numa_reset_distance();

        err = init_func();
        if (err < 0)
                return err;

        /*
         * Put memblock back into top-down mode.  With ACPI_NUMA
         * configured, SRAT has been parsed in init_func() by now.  The
         * reset is harmless if ACPI_NUMA is not configured, or if ACPI
         * NUMA init failed and we fell back to the dummy NUMA init.
         */
        memblock_set_bottom_up(false);

        err = numa_cleanup_meminfo(&numa_meminfo);
        if (err < 0)
                return err;

        numa_emulation(&numa_meminfo, numa_distance_cnt);

        err = numa_register_memblks(&numa_meminfo);
        if (err < 0)
                return err;

        /* Forget mappings to nodes that did not come online. */
        for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
                int nid = early_cpu_to_node(cpu);

                if (nid != NUMA_NO_NODE && !node_online(nid))
                        numa_clear_node(cpu);
        }
        numa_init_array();

        return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Marks node 0 as parsed and adds a single memory block for it that spans
 * [0, PFN_PHYS(max_pfn)).  Must online at least one node and add memory
 * blocks that cover all allowed memory.  This function must not fail; it
 * always returns 0.
 */
static int __init dummy_numa_init(void)
{
        const char *reason;

        if (numa_off)
                reason = "NUMA turned off";
        else
                reason = "No NUMA configuration found";

        printk(KERN_INFO "%s\n", reason);
        printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
               0LLU, PFN_PHYS(max_pfn) - 1);

        node_set(0, numa_nodes_parsed);
        numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

        return 0;
}

/**
 * x86_numa_init - Initialize NUMA
 *
 * Walk the configured NUMA initialization methods in order and stop at
 * the first one for which numa_init() returns 0.  If NUMA is turned off,
 * or none of the methods succeeds, fall back to the dummy single node
 * config encompassing whole memory, which never fails.
 */
void __init x86_numa_init(void)
{
        if (numa_off)
                goto dummy;

#ifdef CONFIG_ACPI_NUMA
        if (numa_init(x86_acpi_numa_init) == 0)
                return;
#endif
#ifdef CONFIG_AMD_NUMA
        if (numa_init(amd_numa_init) == 0)
                return;
#endif
        /* The OF method is only attempted when ACPI is disabled */
        if (acpi_disabled && numa_init(of_numa_init) == 0)
                return;

dummy:
        numa_init(dummy_numa_init);
}


/*
 * Online every node that is in the N_GENERIC_INITIATOR state but not yet
 * online.  Such a node may have one or more Generic Initiators but no
 * CPUs and no memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that any
 * memoryless CPU nodes have already been brought online, and before the
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * Nodes containing memory and/or CPUs are already online by the time this
 * runs, so nothing extra is done for them even if they also contain one or
 * more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
        int node;

        /*
         * Exclude this node from
         * bringup_nonboot_cpus
         *  cpu_up
         *   __try_online_node
         *    register_one_node
         * because node_subsys is not initialized yet.
         * TODO remove dependency on node_online
         */
        for_each_node_state(node, N_GENERIC_INITIATOR) {
                if (node_online(node))
                        continue;
                node_set_online(node);
        }
}

/*
 * Setup early cpu_to_node.
 *
 * cpu_to_node[] is populated only for CPUs whose x86_cpu_to_apicid[] and
 * apicid_to_node[] entries are valid, i.e. for which numa_cpu_node()
 * returns something other than NUMA_NO_NODE.  Consequently the
 * initialisation is skipped for NUMA emulation and for the faking node
 * case (a kernel compiled for NUMA running on a non NUMA box).  That is
 * OK: cpu_to_node[] was already initialized in a round robin manner at
 * numa_init_array, prior to this call, which is good enough for the fake
 * NUMA cases.
 *
 * Called before the per_cpu areas are setup.
 */
void __init init_cpu_to_node(void)
{
        u32 *apicid_map = early_per_cpu_ptr(x86_cpu_to_apicid);
        int cpu;

        BUG_ON(!apicid_map);

        for_each_possible_cpu(cpu) {
                int nid = numa_cpu_node(cpu);

                if (nid != NUMA_NO_NODE) {
                        /*
                         * Exclude this node from
                         * bringup_nonboot_cpus
                         *  cpu_up
                         *   __try_online_node
                         *    register_one_node
                         * because node_subsys is not initialized yet.
                         * TODO remove dependency on node_online
                         */
                        if (!node_online(nid))
                                node_set_online(nid);

                        numa_set_node(cpu, nid);
                }
        }
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
/* Set @cpu in the cpumask of the node early_cpu_to_node() reports for it. */
void numa_add_cpu(int cpu)
{
        int nid = early_cpu_to_node(cpu);

        cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}

/* Clear @cpu in the cpumask of the node early_cpu_to_node() reports for it. */
void numa_remove_cpu(int cpu)
{
        int nid = early_cpu_to_node(cpu);

        cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]);
}
# endif        /* !CONFIG_NUMA_EMU */

#else        /* !CONFIG_DEBUG_PER_CPU_MAPS */

/*
 * Debug variant of the cpu -> node lookup.
 *
 * While the early x86_cpu_to_node_map pointer is still set, the lookup is
 * served from that early table, but a warning and a stack dump are emitted
 * because the caller is using cpu_to_node() too early.  Otherwise the
 * per-CPU copy of x86_cpu_to_node_map is returned.
 */
int __cpu_to_node(int cpu)
{
        if (!early_per_cpu_ptr(x86_cpu_to_node_map))
                return per_cpu(x86_cpu_to_node_map, cpu);

        printk(KERN_WARNING
                "cpu_to_node(%d): usage too early!\n", cpu);
        dump_stack();

        return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
}
EXPORT_SYMBOL(__cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are setup.
 *
 * The early table is used while its pointer is set.  Afterwards the
 * per-CPU value is returned for possible CPUs; for a CPU that is not
 * possible a warning and stack dump are emitted and NUMA_NO_NODE is
 * returned.
 */
int early_cpu_to_node(int cpu)
{
        if (early_per_cpu_ptr(x86_cpu_to_node_map))
                return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

        if (cpu_possible(cpu))
                return per_cpu(x86_cpu_to_node_map, cpu);

        printk(KERN_WARNING
                "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
        dump_stack();

        return NUMA_NO_NODE;
}

/*
 * Set (@enable true) or clear (@enable false) @cpu in the cpumask of
 * @node, then log the resulting mask at KERN_DEBUG level.
 *
 * Does nothing for NUMA_NO_NODE; reports an error and dumps the stack if
 * the node's cpumask is not available.
 */
void debug_cpumask_set_cpu(int cpu, int node, bool enable)
{
        struct cpumask *node_mask;

        /* early_cpu_to_node() already emits a warning and trace */
        if (node == NUMA_NO_NODE)
                return;

        node_mask = node_to_cpumask_map[node];
        if (!cpumask_available(node_mask)) {
                pr_err("node_to_cpumask_map[%i] NULL\n", node);
                dump_stack();
                return;
        }

        if (!enable)
                cpumask_clear_cpu(cpu, node_mask);
        else
                cpumask_set_cpu(cpu, node_mask);

        printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
                enable ? "numa_add_cpu" : "numa_remove_cpu",
                cpu, node, cpumask_pr_args(node_mask));
}

# ifndef CONFIG_NUMA_EMU
/* Resolve @cpu's node via early_cpu_to_node() and update that node's mask. */
static void numa_set_cpumask(int cpu, bool enable)
{
        int nid = early_cpu_to_node(cpu);

        debug_cpumask_set_cpu(cpu, nid, enable);
}

/* Debug build: add @cpu to its node's cpumask via numa_set_cpumask(). */
void numa_add_cpu(int cpu)
{
        numa_set_cpumask(cpu, true);
}

/* Debug build: remove @cpu from its node's cpumask via numa_set_cpumask(). */
void numa_remove_cpu(int cpu)
{
        numa_set_cpumask(cpu, false);
}
# endif        /* !CONFIG_NUMA_EMU */

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 *
 * Two error cases warn and dump the stack instead of indexing blindly:
 *  - node out of range (negative or >= nr_node_ids): cpu_none_mask
 *  - node's cpumask not available:                   cpu_online_mask
 */
const struct cpumask *cpumask_of_node(int node)
{
        if ((unsigned)node < nr_node_ids) {
                if (cpumask_available(node_to_cpumask_map[node]))
                        return node_to_cpumask_map[node];

                printk(KERN_WARNING
                        "cpumask_of_node(%d): no node_to_cpumask_map!\n",
                        node);
                dump_stack();
                return cpu_online_mask;
        }

        printk(KERN_WARNING
                "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
                node, nr_node_ids);
        dump_stack();
        return cpu_none_mask;
}
EXPORT_SYMBOL(cpumask_of_node);

#endif        /* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_KEEP_MEMINFO
/*
 * Return the nid of the first memblk in @mi whose [start, end) range
 * contains @start, or NUMA_NO_NODE if no memblk contains it.
 */
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
        int idx;

        for (idx = 0; idx < mi->nr_blks; idx++) {
                const struct numa_memblk *blk = &mi->blk[idx];

                if (start >= blk->start && start < blk->end)
                        return blk->nid;
        }

        return NUMA_NO_NODE;
}

/*
 * Map physical address @start to a node id by looking it up in
 * numa_meminfo first and numa_reserved_meminfo second.  Returns
 * NUMA_NO_NODE when neither table contains the address.
 */
int phys_to_target_node(phys_addr_t start)
{
        int nid = meminfo_to_nid(&numa_meminfo, start);

        /*
         * Prefer online nodes, but if reserved memory might be
         * hot-added continue the search with reserved ranges.
         */
        if (nid == NUMA_NO_NODE)
                nid = meminfo_to_nid(&numa_reserved_meminfo, start);

        return nid;
}
EXPORT_SYMBOL_GPL(phys_to_target_node);

/*
 * Map physical address @start to a node id using numa_meminfo.  When no
 * memblk contains the address, the nid of the first numa_meminfo memblk
 * is returned instead of NUMA_NO_NODE.
 */
int memory_add_physaddr_to_nid(u64 start)
{
        int nid = meminfo_to_nid(&numa_meminfo, start);

        return nid != NUMA_NO_NODE ? nid : numa_meminfo.blk[0].nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);

#endif

/*
 * sort() comparison callback for an array of struct numa_memblk pointers:
 * orders the pointed-to memblks by ascending ->start.
 * Returns -1, 0 or 1.
 */
static int __init cmp_memblk(const void *a, const void *b)
{
        const struct numa_memblk *lhs = *(const struct numa_memblk **)a;
        const struct numa_memblk *rhs = *(const struct numa_memblk **)b;

        if (lhs->start < rhs->start)
                return -1;
        if (lhs->start > rhs->start)
                return 1;
        return 0;
}

/* Scratch array of memblk pointers; numa_fill_memblks() fills and sorts it. */
static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find the numa_meminfo memblks that overlap the physical address range
 * @start-@end and extend them in place so that together they cover the
 * whole range without gaps.
 *
 * RETURNS:
 * 0                  : Success
 * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
 */

int __init numa_fill_memblks(u64 start, u64 end)
{
        struct numa_meminfo *mi = &numa_meminfo;
        struct numa_memblk **list = numa_memblk_list;
        u64 covered_end;
        int nr = 0;

        /*
         * Collect pointers to the numa_meminfo memblks that overlap
         * start, end. Working through pointers lets the changes
         * below land directly in numa_meminfo.
         */
        for (int i = 0; i < mi->nr_blks; i++) {
                struct numa_memblk *bi = &mi->blk[i];

                if (!memblock_addrs_overlap(start, end - start, bi->start,
                                            bi->end - bi->start))
                        continue;

                list[nr++] = bi;
        }
        if (nr == 0)
                return NUMA_NO_MEMBLK;

        /* Order the collected pointers by memblk->start */
        sort(list, nr, sizeof(list[0]), cmp_memblk, NULL);

        /* Stretch the first/last memblks so they include start/end */
        list[0]->start = min(list[0]->start, start);
        list[nr - 1]->end = max(list[nr - 1]->end, end);

        /*
         * Walk the sorted list remembering how far the range is
         * covered so far; a memblk that starts beyond that point
         * is pulled back to it, which closes the gap.
         */
        covered_end = list[0]->end;
        for (int i = 1; i < nr; i++) {
                struct numa_memblk *curr = list[i];

                if (curr->start > covered_end) {
                        curr->start = covered_end;
                        covered_end = curr->end;
                } else if (curr->end > covered_end) {
                        covered_end = curr->end;
                }
        }
        return 0;
}














































































    1 







































































































































































































    1 













    1 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM cgroup

#if !defined(_TRACE_CGROUP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_CGROUP_H

#include <linux/cgroup.h>
#include <linux/tracepoint.h>

/*
 * Event class for tracepoints that take a struct cgroup_root.
 *
 * Each record stores the root's hierarchy_id (as "root"), its subsys_mask
 * (as a u16 "ss_mask") and a copy of root->name, and is printed as
 * "root=<id> ss_mask=<hex> name=<name>".
 */
DECLARE_EVENT_CLASS(cgroup_root,

        TP_PROTO(struct cgroup_root *root),

        TP_ARGS(root),

        TP_STRUCT__entry(
                __field(        int,                root                        )
                __field(        u16,                ss_mask                        )
                __string(        name,                root->name                )
        ),

        TP_fast_assign(
                __entry->root = root->hierarchy_id;
                __entry->ss_mask = root->subsys_mask;
                __assign_str(name);
        ),

        TP_printk("root=%d ss_mask=%#x name=%s",
                  __entry->root, __entry->ss_mask, __get_str(name))
);

/* Tracepoint cgroup_setup_root: instance of the cgroup_root class (root). */
DEFINE_EVENT(cgroup_root, cgroup_setup_root,

        TP_PROTO(struct cgroup_root *root),

        TP_ARGS(root)
);

/* Tracepoint cgroup_destroy_root: instance of the cgroup_root class (root). */
DEFINE_EVENT(cgroup_root, cgroup_destroy_root,

        TP_PROTO(struct cgroup_root *root),

        TP_ARGS(root)
);

/* Tracepoint cgroup_remount: instance of the cgroup_root class (root). */
DEFINE_EVENT(cgroup_root, cgroup_remount,

        TP_PROTO(struct cgroup_root *root),

        TP_ARGS(root)
);

/*
 * Event class for tracepoints that take a cgroup and its path string.
 *
 * Each record stores the hierarchy_id of the cgroup's root, cgroup_id()
 * of the cgroup, its level and a copy of @path, and is printed as
 * "root=<id> id=<cgroup id> level=<level> path=<path>".
 */
DECLARE_EVENT_CLASS(cgroup,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path),

        TP_STRUCT__entry(
                __field(        int,                root                        )
                __field(        int,                level                        )
                __field(        u64,                id                        )
                __string(        path,                path                        )
        ),

        TP_fast_assign(
                __entry->root = cgrp->root->hierarchy_id;
                __entry->id = cgroup_id(cgrp);
                __entry->level = cgrp->level;
                __assign_str(path);
        ),

        TP_printk("root=%d id=%llu level=%d path=%s",
                  __entry->root, __entry->id, __entry->level, __get_str(path))
);

/* Tracepoint cgroup_mkdir: instance of the cgroup class (cgrp, path). */
DEFINE_EVENT(cgroup, cgroup_mkdir,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/* Tracepoint cgroup_rmdir: instance of the cgroup class (cgrp, path). */
DEFINE_EVENT(cgroup, cgroup_rmdir,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/* Tracepoint cgroup_release: instance of the cgroup class (cgrp, path). */
DEFINE_EVENT(cgroup, cgroup_release,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/* Tracepoint cgroup_rename: instance of the cgroup class (cgrp, path). */
DEFINE_EVENT(cgroup, cgroup_rename,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/* Tracepoint cgroup_freeze: instance of the cgroup class (cgrp, path). */
DEFINE_EVENT(cgroup, cgroup_freeze,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/* Tracepoint cgroup_unfreeze: instance of the cgroup class (cgrp, path). */
DEFINE_EVENT(cgroup, cgroup_unfreeze,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/*
 * Event class for tracepoints describing a task being moved to a
 * destination cgroup.
 *
 * Each record stores the destination cgroup's root hierarchy_id,
 * cgroup_id(), level and a copy of @path, plus the task's pid and a copy
 * of task->comm.  Note that @threadgroup is part of the prototype but is
 * not stored in the record.
 */
DECLARE_EVENT_CLASS(cgroup_migrate,

        TP_PROTO(struct cgroup *dst_cgrp, const char *path,
                 struct task_struct *task, bool threadgroup),

        TP_ARGS(dst_cgrp, path, task, threadgroup),

        TP_STRUCT__entry(
                __field(        int,                dst_root                )
                __field(        int,                dst_level                )
                __field(        u64,                dst_id                        )
                __field(        int,                pid                        )
                __string(        dst_path,        path                        )
                __string(        comm,                task->comm                )
        ),

        TP_fast_assign(
                __entry->dst_root = dst_cgrp->root->hierarchy_id;
                __entry->dst_id = cgroup_id(dst_cgrp);
                __entry->dst_level = dst_cgrp->level;
                __assign_str(dst_path);
                __entry->pid = task->pid;
                __assign_str(comm);
        ),

        TP_printk("dst_root=%d dst_id=%llu dst_level=%d dst_path=%s pid=%d comm=%s",
                  __entry->dst_root, __entry->dst_id, __entry->dst_level,
                  __get_str(dst_path), __entry->pid, __get_str(comm))
);

/*
 * Tracepoint cgroup_attach_task: instance of the cgroup_migrate class
 * (dst_cgrp, path, task, threadgroup).
 */
DEFINE_EVENT(cgroup_migrate, cgroup_attach_task,

        TP_PROTO(struct cgroup *dst_cgrp, const char *path,
                 struct task_struct *task, bool threadgroup),

        TP_ARGS(dst_cgrp, path, task, threadgroup)
);

/*
 * Tracepoint cgroup_transfer_tasks: instance of the cgroup_migrate class
 * (dst_cgrp, path, task, threadgroup).
 */
DEFINE_EVENT(cgroup_migrate, cgroup_transfer_tasks,

        TP_PROTO(struct cgroup *dst_cgrp, const char *path,
                 struct task_struct *task, bool threadgroup),

        TP_ARGS(dst_cgrp, path, task, threadgroup)
);

/*
 * Event class for tracepoints that report an integer value for a cgroup.
 *
 * Each record stores the same fields as the "cgroup" class (root
 * hierarchy_id, cgroup_id(), level, copy of @path) plus @val, and is
 * printed as "root=<id> id=<cgroup id> level=<level> path=<path> val=<val>".
 */
DECLARE_EVENT_CLASS(cgroup_event,

        TP_PROTO(struct cgroup *cgrp, const char *path, int val),

        TP_ARGS(cgrp, path, val),

        TP_STRUCT__entry(
                __field(        int,                root                        )
                __field(        int,                level                        )
                __field(        u64,                id                        )
                __string(        path,                path                        )
                __field(        int,                val                        )
        ),

        TP_fast_assign(
                __entry->root = cgrp->root->hierarchy_id;
                __entry->id = cgroup_id(cgrp);
                __entry->level = cgrp->level;
                __assign_str(path);
                __entry->val = val;
        ),

        TP_printk("root=%d id=%llu level=%d path=%s val=%d",
                  __entry->root, __entry->id, __entry->level, __get_str(path),
                  __entry->val)
);

/*
 * Tracepoint cgroup_notify_populated: instance of the cgroup_event class
 * (cgrp, path, val).
 */
DEFINE_EVENT(cgroup_event, cgroup_notify_populated,

        TP_PROTO(struct cgroup *cgrp, const char *path, int val),

        TP_ARGS(cgrp, path, val)
);

/*
 * Tracepoint cgroup_notify_frozen: instance of the cgroup_event class
 * (cgrp, path, val).
 */
DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,

        TP_PROTO(struct cgroup *cgrp, const char *path, int val),

        TP_ARGS(cgrp, path, val)
);

/*
 * Event class for the cgroup_rstat_* tracepoints.
 *
 * Each record stores the cgroup's root hierarchy_id, cgroup_id() and
 * level, together with the @cpu and @contended arguments, and is printed
 * as "root=<id> id=<cgroup id> level=<level> cpu=<cpu> lock contended:<0|1>".
 * Unlike the other cgroup classes no path string is recorded.
 */
DECLARE_EVENT_CLASS(cgroup_rstat,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended),

        TP_STRUCT__entry(
                __field(        int,                root                        )
                __field(        int,                level                        )
                __field(        u64,                id                        )
                __field(        int,                cpu                        )
                __field(        bool,                contended                )
        ),

        TP_fast_assign(
                __entry->root = cgrp->root->hierarchy_id;
                __entry->id = cgroup_id(cgrp);
                __entry->level = cgrp->level;
                __entry->cpu = cpu;
                __entry->contended = contended;
        ),

        TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
                  __entry->root, __entry->id, __entry->level,
                  __entry->cpu, __entry->contended)
);

/* Related to global: cgroup_rstat_lock */
/*
 * Tracepoint cgroup_rstat_lock_contended: instance of the cgroup_rstat
 * class (cgrp, cpu, contended).
 */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/*
 * Tracepoint cgroup_rstat_locked: instance of the cgroup_rstat class
 * (cgrp, cpu, contended).
 */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/*
 * Tracepoint cgroup_rstat_unlock: instance of the cgroup_rstat class
 * (cgrp, cpu, contended).
 */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/* Related to per CPU: cgroup_rstat_cpu_lock */
/*
 * Tracepoint cgroup_rstat_cpu_lock_contended: instance of the
 * cgroup_rstat class (cgrp, cpu, contended).
 */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/*
 * Tracepoint cgroup_rstat_cpu_lock_contended_fastpath: instance of the
 * cgroup_rstat class (cgrp, cpu, contended).
 */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/*
 * Tracepoint cgroup_rstat_cpu_locked: instance of the cgroup_rstat class
 * (cgrp, cpu, contended).
 */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/*
 * Tracepoint cgroup_rstat_cpu_locked_fastpath: instance of the
 * cgroup_rstat class (cgrp, cpu, contended).
 */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/*
 * Tracepoint cgroup_rstat_cpu_unlock: instance of the cgroup_rstat class
 * (cgrp, cpu, contended).
 */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/*
 * Tracepoint cgroup_rstat_cpu_unlock_fastpath: instance of the
 * cgroup_rstat class (cgrp, cpu, contended).
 */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

#endif /* _TRACE_CGROUP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>








































































































































    1 

































































    3 





















































    5 

























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HUGE_MM_H
#define _LINUX_HUGE_MM_H

#include <linux/sched/coredump.h>
#include <linux/mm_types.h>

#include <linux/fs.h> /* only for vma_is_dax() */

/*
 * Huge PMD/PUD fault and copy helpers.  Declarations only; the
 * definitions are not part of this header.
 */
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
void huge_pmd_set_accessed(struct vm_fault *vmf);
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
                  struct vm_area_struct *vma);

/*
 * huge_pud_set_accessed() is a real function only when the architecture
 * selects CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD; otherwise it is an
 * empty inline stub so callers need no #ifdef of their own.
 */
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud);
#else
static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
}
#endif

/*
 * Further huge PMD/PUD operations (write-protect fault, madvise-free, zap,
 * move, protection change) and PFN insertion helpers.  Declarations only;
 * the definitions are not part of this header.
 */
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf);
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                           pmd_t *pmd, unsigned long addr, unsigned long next);
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd,
                 unsigned long addr);
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud,
                 unsigned long addr);
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                   unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd);
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
                    unsigned long cp_flags);

vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);

/*
 * Bit numbers within transparent_hugepage_flags.  Users in this header
 * test a flag as (transparent_hugepage_flags & (1 << FLAG)), so the
 * enumerator order defines the bit layout.
 */
enum transparent_hugepage_flag {
        TRANSPARENT_HUGEPAGE_UNSUPPORTED,
        TRANSPARENT_HUGEPAGE_FLAG,
        TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
        TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
};

/* Forward declarations: only pointers to these types are used below. */
struct kobject;
struct kobj_attribute;

/*
 * Store/show helpers for a kobject attribute backed by a single
 * transparent_hugepage_flag; @flag selects which one.  Declarations only.
 */
ssize_t single_hugepage_flag_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count,
                                   enum transparent_hugepage_flag flag);
ssize_t single_hugepage_flag_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf,
                                  enum transparent_hugepage_flag flag);
extern struct kobj_attribute shmem_enabled_attr;

/*
 * Mask of all large folio orders supported for anonymous THP; all orders up to
 * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
 * (which is a limitation of the THP implementation).
 *
 * In the mask, bit N set means order N is included.
 */
#define THP_ORDERS_ALL_ANON        ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))

/*
 * Mask of all large folio orders supported for file THP: exactly the
 * PMD_ORDER and PUD_ORDER bits.
 */
#define THP_ORDERS_ALL_FILE        (BIT(PMD_ORDER) | BIT(PUD_ORDER))

/*
 * Mask of all large folio orders supported for THP (union of the
 * anonymous and file masks).
 */
#define THP_ORDERS_ALL                (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE)

/* Flag bits for the tva_flags argument of thp_vma_allowable_orders(). */
#define TVA_SMAPS                (1 << 0)        /* Will be used for procfs */
#define TVA_IN_PF                (1 << 1)        /* Page fault handler */
#define TVA_ENFORCE_SYSFS        (1 << 2)        /* Obey sysfs configuration */

/*
 * Single-order convenience wrapper: evaluates to 1 if @order is allowed
 * for @vma by thp_vma_allowable_orders(), 0 otherwise.
 */
#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
        (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))

/*
 * Huge page geometry at PMD and PUD level.
 *
 * Without CONFIG_PGTABLE_HAS_HUGE_LEAVES the shift macros expand to a
 * BUILD_BUG() statement expression, so any use of them (or of the
 * ORDER/NR/MASK/SIZE macros derived from them) that the compiler cannot
 * eliminate breaks the build.
 */
#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
#define HPAGE_PMD_SHIFT PMD_SHIFT
#define HPAGE_PUD_SHIFT PUD_SHIFT
#else
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; })
#endif

/* PMD-sized huge page: order, number of base pages, address mask, size */
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
#define HPAGE_PMD_MASK        (~(HPAGE_PMD_SIZE - 1))
#define HPAGE_PMD_SIZE        ((1UL) << HPAGE_PMD_SHIFT)

/* PUD-sized huge page: order, number of base pages, address mask, size */
#define HPAGE_PUD_ORDER (HPAGE_PUD_SHIFT-PAGE_SHIFT)
#define HPAGE_PUD_NR (1<<HPAGE_PUD_ORDER)
#define HPAGE_PUD_MASK        (~(HPAGE_PUD_SIZE - 1))
#define HPAGE_PUD_SIZE        ((1UL) << HPAGE_PUD_SHIFT)

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

/* Bitmask indexed by enum transparent_hugepage_flag. */
extern unsigned long transparent_hugepage_flags;
/*
 * Per-order bitmasks (bit N == order N) combined by
 * thp_vma_allowable_orders() to filter anonymous THP orders.
 */
extern unsigned long huge_anon_orders_always;
extern unsigned long huge_anon_orders_madvise;
extern unsigned long huge_anon_orders_inherit;

/*
 * True if either the TRANSPARENT_HUGEPAGE_FLAG bit or the
 * TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG bit is set in
 * transparent_hugepage_flags.
 */
static inline bool hugepage_global_enabled(void)
{
        const unsigned long enabled_bits =
                (1<<TRANSPARENT_HUGEPAGE_FLAG) |
                (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);

        return transparent_hugepage_flags & enabled_bits;
}

/* True if the TRANSPARENT_HUGEPAGE_FLAG bit is set in transparent_hugepage_flags. */
static inline bool hugepage_global_always(void)
{
        const unsigned long always_bit = 1<<TRANSPARENT_HUGEPAGE_FLAG;

        return transparent_hugepage_flags & always_bit;
}

static inline bool hugepage_flags_enabled(void)
{
        /*
         * We cover both the anon and the file-backed case here; we must return
         * true if globally enabled, even when all anon sizes are set to never.
         * So we don't need to look at huge_anon_orders_inherit.
         */
        return hugepage_global_enabled() ||
               huge_anon_orders_always ||
               huge_anon_orders_madvise;
}

/*
 * Return the highest order present in the @orders bitfield, computed as
 * fls_long(orders) - 1.
 * NOTE(review): presumably yields -1 for an empty mask (if fls_long(0)
 * is 0) - confirm against fls_long(); callers in this header stop on
 * orders == 0 rather than on the returned value.
 */
static inline int highest_order(unsigned long orders)
{
        return fls_long(orders) - 1;
}

/*
 * Drop order @prev from the *@orders bitfield (updating it in place) and
 * return the highest order that remains.
 */
static inline int next_order(unsigned long *orders, int prev)
{
        unsigned long remaining = *orders & ~BIT(prev);

        *orders = remaining;
        return highest_order(remaining);
}

/*
 * Decide whether a hugepage of the given order covering @addr fits @vma.
 * Two checks are made:
 *   - For a file vma, the linear page offset of the vma must be
 *     order-aligned within the file.  The hugepage is guaranteed to be
 *     order-aligned within the file, but the order-aligned addresses in
 *     the VMA must also map to order-aligned offsets within the file,
 *     else the hugepage will not be mappable.
 *   - For all vmas, the order-aligned range containing @addr (haddr to
 *     haddr + hugepage size) must lie entirely inside the vma.
 */
static inline bool thp_vma_suitable_order(struct vm_area_struct *vma,
                unsigned long addr, int order)
{
        unsigned long hpage_size = PAGE_SIZE << order;
        unsigned long haddr = ALIGN_DOWN(addr, hpage_size);

        /* Don't have to check pgoff for anonymous vma */
        if (!vma_is_anonymous(vma) &&
            !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
                        hpage_size >> PAGE_SHIFT))
                return false;

        return haddr >= vma->vm_start && haddr + hpage_size <= vma->vm_end;
}

/*
 * Reduce the input bitfield of orders to those suitable for use in the vma
 * at @addr; see thp_vma_suitable_order() for the per-order test.
 * The surviving orders are returned as a bitfield (0 if none).
 */
static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
                unsigned long addr, unsigned long orders)
{
        int order = highest_order(orders);

        /*
         * Go from the highest order downwards, dropping each order that
         * fails the alignment requirements. Stop at the first order that
         * passes, since all lower orders must then pass as well.
         */
        while (orders && !thp_vma_suitable_order(vma, addr, order))
                order = next_order(&orders, order);

        return orders;
}

/*
 * True only if @vma is file-backed, CONFIG_READ_ONLY_THP_FOR_FS is
 * enabled, the backing inode is not open for write and it is a regular
 * file.
 */
static inline bool file_thp_enabled(struct vm_area_struct *vma)
{
        struct inode *inode;

        if (!vma->vm_file)
                return false;
        if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
                return false;

        inode = vma->vm_file->f_inode;
        if (inode_is_open_for_write(inode))
                return false;

        return S_ISREG(inode->i_mode);
}

/*
 * Out-of-line part of thp_vma_allowable_orders(); takes the same
 * arguments.  Declaration only.
 */
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
                                         unsigned long vm_flags,
                                         unsigned long tva_flags,
                                         unsigned long orders);

/**
 * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
 * @vma:  the vm area to check
 * @vm_flags: use these vm_flags instead of vma->vm_flags
 * @tva_flags: Which TVA flags to honour
 * @orders: bitfield of all orders to consider
 *
 * Calculates the intersection of the requested hugepage orders and the allowed
 * hugepage orders for the provided vma. Permitted orders are encoded as a set
 * bit at the corresponding bit position (bit-2 corresponds to order-2, bit-3
 * corresponds to order-3, etc). Order-0 is never considered a hugepage order.
 *
 * For an anonymous vma with TVA_ENFORCE_SYSFS set, @orders is first
 * filtered against the huge_anon_orders_* masks so that the out-of-line
 * __thp_vma_allowable_orders() call is skipped when nothing is left.
 *
 * Return: bitfield of orders allowed for hugepage in the vma. 0 if no hugepage
 * orders are allowed.
 */
static inline
unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
                                       unsigned long vm_flags,
                                       unsigned long tva_flags,
                                       unsigned long orders)
{
        unsigned long enabled;

        /* The early filter only applies to sysfs-enforced anonymous vmas. */
        if (!(tva_flags & TVA_ENFORCE_SYSFS) || !vma_is_anonymous(vma))
                goto slow_path;

        /* Optimization to check if required orders are enabled early. */
        enabled = READ_ONCE(huge_anon_orders_always);

        if (vm_flags & VM_HUGEPAGE)
                enabled |= READ_ONCE(huge_anon_orders_madvise);
        if (hugepage_global_always() ||
            ((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled()))
                enabled |= READ_ONCE(huge_anon_orders_inherit);

        orders &= enabled;
        if (orders == 0)
                return 0;

slow_path:
        return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
}

/*
 * Per-order mTHP event counters. Each value is a column index into
 * struct mthp_stat::stats[order][item]; __MTHP_STAT_COUNT is not a counter
 * itself, it is the number of counters and sizes that array dimension.
 * NOTE(review): the meaning of each counter is inferred from its name only
 * (anonymous fault allocation / fallback / charge fallback, swap-out /
 * swap-out fallback) - confirm against the code that bumps them.
 */
enum mthp_stat_item {
        MTHP_STAT_ANON_FAULT_ALLOC,
        MTHP_STAT_ANON_FAULT_FALLBACK,
        MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
        MTHP_STAT_SWPOUT,
        MTHP_STAT_SWPOUT_FALLBACK,
        __MTHP_STAT_COUNT
};

/*
 * Table of mTHP counters, indexed as stats[order][item] where item is an
 * enum mthp_stat_item. The first dimension has
 * ilog2(MAX_PTRS_PER_PTE) + 1 rows.
 */
struct mthp_stat {
        unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
};

#ifdef CONFIG_SYSFS
DECLARE_PER_CPU(struct mthp_stat, mthp_stats);

/*
 * count_mthp_stat - bump this CPU's counter @item for folios of @order
 *
 * Only orders in the range 1..PMD_ORDER are counted; any other order is
 * silently ignored.
 */
static inline void count_mthp_stat(int order, enum mthp_stat_item item)
{
        if (order > 0 && order <= PMD_ORDER)
                this_cpu_inc(mthp_stats.stats[order][item]);
}
#else
/* !CONFIG_SYSFS: no per-CPU mthp_stats are declared, so counting is a no-op. */
static inline void count_mthp_stat(int order, enum mthp_stat_item item)
{
}
#endif

/*
 * Non-zero when the TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG bit is set in
 * transparent_hugepage_flags. The result is the masked bit itself, not a
 * value normalised to 0/1.
 */
#define transparent_hugepage_use_zero_page() \
        (transparent_hugepage_flags & \
         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags);
unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags,
                vm_flags_t vm_flags);

bool can_split_folio(struct folio *folio, int *pextra_pins);
int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
                unsigned int new_order);
/*
 * split_huge_page - convenience wrapper around
 * split_huge_page_to_list_to_order() with no list (NULL) and a new order
 * of 0. Returns whatever that call returns.
 */
static inline int split_huge_page(struct page *page)
{
        return split_huge_page_to_list_to_order(page, NULL, 0);
}
void deferred_split_folio(struct folio *folio);

void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long address, bool freeze, struct folio *folio);

/*
 * split_huge_pmd - split the mapping at @__pmd if it is a huge/special entry
 * @__vma:     vma passed through to __split_huge_pmd()
 * @__pmd:     pointer to the pmd entry to examine
 * @__address: address passed through to __split_huge_pmd()
 *
 * Calls __split_huge_pmd() (with freeze == false and no folio) only when
 * the entry is a swap pmd, a transparent huge pmd or a devmap pmd; any other
 * entry is left alone.
 *
 * @__pmd is evaluated exactly once: the cached ____pmd is used both for the
 * checks and for the call, so an argument with side effects (e.g. "p++") is
 * safe to pass.
 */
#define split_huge_pmd(__vma, __pmd, __address) \
        do { \
                pmd_t *____pmd = (__pmd); \
                if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \
                                        || pmd_devmap(*____pmd)) \
                        __split_huge_pmd(__vma, ____pmd, __address, \
                                                false, NULL); \
        }  while (0)


void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
                bool freeze, struct folio *folio);

void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
                unsigned long address);

/*
 * split_huge_pud - split the mapping at @__pud if it is a huge entry
 * @__vma:     vma passed through to __split_huge_pud()
 * @__pud:     pointer to the pud entry to examine
 * @__address: address passed through to __split_huge_pud()
 *
 * Calls __split_huge_pud() only when the entry is a transparent huge pud or
 * a devmap pud; any other entry is left alone.
 *
 * @__pud is evaluated exactly once: the cached ____pud is used both for the
 * checks and for the call, so an argument with side effects is safe to pass.
 */
#define split_huge_pud(__vma, __pud, __address) \
        do { \
                pud_t *____pud = (__pud); \
                if (pud_trans_huge(*____pud) \
                                        || pud_devmap(*____pud)) \
                        __split_huge_pud(__vma, ____pud, __address); \
        }  while (0)

int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
                     int advice);
int madvise_collapse(struct vm_area_struct *vma,
                     struct vm_area_struct **prev,
                     unsigned long start, unsigned long end);
void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
                           unsigned long end, long adjust_next);
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma);

/*
 * is_swap_pmd - returns 1 when @pmd is neither empty (pmd_none()) nor
 * present (pmd_present()), and 0 otherwise.
 */
static inline int is_swap_pmd(pmd_t pmd)
{
        if (pmd_none(pmd))
                return 0;

        return !pmd_present(pmd);
}

/*
 * mmap_lock must be held on entry.
 *
 * Hands off to __pmd_trans_huge_lock() when *pmd is a swap pmd, a
 * transparent huge pmd or a devmap pmd, and returns its result. For any
 * other entry NULL is returned without calling it.
 */
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
                struct vm_area_struct *vma)
{
        if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd))
                return NULL;

        return __pmd_trans_huge_lock(pmd, vma);
}
/*
 * PUD counterpart of pmd_trans_huge_lock(): hands off to
 * __pud_trans_huge_lock() when *pud is a transparent huge pud or a devmap
 * pud, otherwise returns NULL without calling it.
 */
static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
                struct vm_area_struct *vma)
{
        if (!pud_trans_huge(*pud) && !pud_devmap(*pud))
                return NULL;

        return __pud_trans_huge_lock(pud, vma);
}

/**
 * folio_test_pmd_mappable - Can we map this folio with a PMD?
 * @folio: The folio to test
 *
 * Return: true if the folio's order is at least HPAGE_PMD_ORDER, false
 * otherwise.
 */
static inline bool folio_test_pmd_mappable(struct folio *folio)
{
        return folio_order(folio) >= HPAGE_PMD_ORDER;
}

struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
                pmd_t *pmd, int flags, struct dev_pagemap **pgmap);

vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);

extern struct folio *huge_zero_folio;
extern unsigned long huge_zero_pfn;

/*
 * is_huge_zero_folio - true when @folio is the folio currently published in
 * huge_zero_folio. The global is sampled once with READ_ONCE().
 */
static inline bool is_huge_zero_folio(const struct folio *folio)
{
        const struct folio *zero = READ_ONCE(huge_zero_folio);

        return folio == zero;
}

/*
 * is_huge_zero_pmd - true when @pmd is present and its pfn equals the
 * value currently stored in huge_zero_pfn.
 */
static inline bool is_huge_zero_pmd(pmd_t pmd)
{
        if (!pmd_present(pmd))
                return false;

        return pmd_pfn(pmd) == READ_ONCE(huge_zero_pfn);
}

/* is_huge_zero_pud - always false here; @pud is not inspected. */
static inline bool is_huge_zero_pud(pud_t pud)
{
        return false;
}

struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
void mm_put_huge_zero_folio(struct mm_struct *mm);

/* Build a pmd for @page with protection @prot via mk_pmd(), then pass it through pmd_mkhuge(). */
#define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))

/* True exactly when the kernel is built with CONFIG_ARCH_ENABLE_THP_MIGRATION. */
static inline bool thp_migration_supported(void)
{
        return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
}

#else /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Stubs used when CONFIG_TRANSPARENT_HUGEPAGE is disabled: no folio is
 * PMD-mappable and no hugepage order is ever suitable or allowed.
 */

/* Never PMD-mappable without THP. */
static inline bool folio_test_pmd_mappable(struct folio *folio)
{
        return false;
}

/* No order is suitable without THP. */
static inline bool thp_vma_suitable_order(struct vm_area_struct *vma,
                unsigned long addr, int order)
{
        return false;
}

/* Empty bitfield: no suitable orders without THP. */
static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
                unsigned long addr, unsigned long orders)
{
        return 0;
}

/* Empty bitfield: no allowable orders without THP. */
static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
                                        unsigned long vm_flags,
                                        unsigned long tva_flags,
                                        unsigned long orders)
{
        return 0;
}

/* Without THP the flags word is the constant 0, so every flag test is false. */
#define transparent_hugepage_flags 0UL

/* Without THP there is no THP-aware get_unmapped_area hook; the name expands to NULL. */
#define thp_get_unmapped_area        NULL

/* Stub: always returns 0; none of the arguments are used. */
static inline unsigned long
thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
                              unsigned long len, unsigned long pgoff,
                              unsigned long flags, vm_flags_t vm_flags)
{
        return 0;
}

/*
 * Split-related stubs for !CONFIG_TRANSPARENT_HUGEPAGE. The functions do
 * nothing and return a constant; the macros expand to an empty statement
 * and do not evaluate their arguments.
 */

/* Stub: always false; @pextra_pins is not written. */
static inline bool
can_split_folio(struct folio *folio, int *pextra_pins)
{
        return false;
}
/* Stub: does nothing and returns 0. */
static inline int
split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
                unsigned int new_order)
{
        return 0;
}
/* Stub: does nothing and returns 0. */
static inline int split_huge_page(struct page *page)
{
        return 0;
}
/* Stub: no-op. */
static inline void deferred_split_folio(struct folio *folio) {}
/* Stub: empty statement. */
#define split_huge_pmd(__vma, __pmd, __address)        \
        do { } while (0)

/* Stubs: no-ops. */
static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long address, bool freeze, struct folio *folio) {}
static inline void split_huge_pmd_address(struct vm_area_struct *vma,
                unsigned long address, bool freeze, struct folio *folio) {}

/* Stub: empty statement (the second parameter is spelled __pmd here). */
#define split_huge_pud(__vma, __pmd, __address)        \
        do { } while (0)

/* Stub for !CONFIG_TRANSPARENT_HUGEPAGE: always fails with -EINVAL. */
static inline int hugepage_madvise(struct vm_area_struct *vma,
                                   unsigned long *vm_flags, int advice)
{
        return -EINVAL;
}

/* Stub for !CONFIG_TRANSPARENT_HUGEPAGE: always fails with -EINVAL. */
static inline int madvise_collapse(struct vm_area_struct *vma,
                                   struct vm_area_struct **prev,
                                   unsigned long start, unsigned long end)
{
        return -EINVAL;
}

/* Stub for !CONFIG_TRANSPARENT_HUGEPAGE: no-op. */
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                         unsigned long start,
                                         unsigned long end,
                                         long adjust_next)
{
}
/* Stub for !CONFIG_TRANSPARENT_HUGEPAGE: always 0. */
static inline int is_swap_pmd(pmd_t pmd)
{
        return 0;
}
/* Stub: always returns NULL; @pmd is not dereferenced. */
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
                struct vm_area_struct *vma)
{
        return NULL;
}
/* Stub: always returns NULL; @pud is not dereferenced. */
static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
                struct vm_area_struct *vma)
{
        return NULL;
}

/* Stub: does nothing and returns 0. */
static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
        return 0;
}

/*
 * Remaining !CONFIG_TRANSPARENT_HUGEPAGE stubs: the huge-zero tests are
 * always false, and the other helpers do nothing.
 */

/* Stub: always false. */
static inline bool is_huge_zero_folio(const struct folio *folio)
{
        return false;
}

/* Stub: always false. */
static inline bool is_huge_zero_pmd(pmd_t pmd)
{
        return false;
}

/* Stub: always false. */
static inline bool is_huge_zero_pud(pud_t pud)
{
        return false;
}

/* Stub: no-op. */
static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
{
        return;
}

/* Stub: always returns NULL; @pgmap is not written. */
static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
        unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
        return NULL;
}

/* Stub: always false. */
static inline bool thp_migration_supported(void)
{
        return false;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * split_folio_to_list_to_order - folio flavour of
 * split_huge_page_to_list_to_order(): passes &folio->page together with
 * @list and @new_order, and returns that call's result.
 */
static inline int split_folio_to_list_to_order(struct folio *folio,
                struct list_head *list, int new_order)
{
        return split_huge_page_to_list_to_order(&folio->page, list, new_order);
}

/* split_folio_to_order - split_folio_to_list_to_order() with a NULL list. */
static inline int split_folio_to_order(struct folio *folio, int new_order)
{
        return split_folio_to_list_to_order(folio, NULL, new_order);
}

/* Shorthands that split with a new order of 0, with and without a list. */
#define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0)
#define split_folio(f) split_folio_to_order(f, 0)

#endif /* _LINUX_HUGE_MM_H */


















































































































































































































































































































































































































































































































































































































































































































































    1 



















































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
// SPDX-License-Identifier: GPL-2.0-or-later
/** -*- linux-c -*- ***********************************************************
 * Linux PPP over Ethernet (PPPoX/PPPoE) Sockets
 *
 * PPPoX --- Generic PPP encapsulation socket family
 * PPPoE --- PPP over Ethernet (RFC 2516)
 *
 * Version:        0.7.0
 *
 * 070228 :        Fix to allow multiple sessions with same remote MAC and same
 *                session id by including the local device ifindex in the
 *                tuple identifying a session. This also ensures packets can't
 *                be injected into a session from interfaces other than the one
 *                specified by userspace. Florian Zumbiehl <florz@florz.de>
 *                (Oh, BTW, this one is YYMMDD, in case you were wondering ...)
 * 220102 :        Fix module use count on failure in pppoe_create, pppox_sk -acme
 * 030700 :        Fixed connect logic to allow for disconnect.
 * 270700 :        Fixed potential SMP problems; we must protect against
 *                simultaneous invocation of ppp_input
 *                and ppp_unregister_channel.
 * 040800 :        Respect reference count mechanisms on net-devices.
 * 200800 :        fix kfree(skb) in pppoe_rcv (acme)
 *                Module reference count is decremented in the right spot now,
 *                guards against sock_put not actually freeing the sk
 *                in pppoe_release.
 * 051000 :        Initialization cleanup.
 * 111100 :        Fix recvmsg.
 * 050101 :        Fix PADT processing.
 * 140501 :        Use pppoe_rcv_core to handle all backlog. (Alexey)
 * 170701 :        Do not lock_sock with rwlock held. (DaveM)
 *                Ignore discovery frames if user has socket
 *                locked. (DaveM)
 *                Ignore return value of dev_queue_xmit in __pppoe_xmit
 *                or else we may kfree an SKB twice. (DaveM)
 * 190701 :        When doing copies of skb's in __pppoe_xmit, always delete
 *                the original skb that was passed in on success, never on
 *                failure.  Delete the copy of the skb on failure to avoid
 *                a memory leak.
 * 081001 :        Misc. cleanup (licence string, non-blocking, prevent
 *                reference of device on close).
 * 121301 :        New ppp channels interface; cannot unregister a channel
 *                from interrupts.  Thus, we mark the socket as a ZOMBIE
 *                and do the unregistration later.
 * 081002 :        seq_file support for proc stuff -acme
 * 111602 :        Merge all 2.4 fixes into 2.5/2.6 tree.  Label 2.5/2.6
 *                as version 0.7.  Spacing cleanup.
 * Author:        Michal Ostrowski <mostrows@speakeasy.net>
 * Contributors:
 *                 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *                David S. Miller (davem@redhat.com)
 *
 * License:        GPL-2.0-or-later (see the SPDX tag on the first line of this file)
 */

#include <linux/string.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/net.h>
#include <linux/inetdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <linux/if_pppox.h>
#include <linux/ppp_channel.h>
#include <linux/ppp_defs.h>
#include <linux/ppp-ioctl.h>
#include <linux/notifier.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/nsproxy.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/sock.h>

#include <linux/uaccess.h>

/*
 * Session hash table geometry: the bit width comes from Kconfig
 * (CONFIG_PPPOE_HASH_BITS), the table has 2^bits buckets, and the mask
 * reduces a hash value to a bucket index.
 */
#define PPPOE_HASH_BITS CONFIG_PPPOE_HASH_BITS
#define PPPOE_HASH_SIZE (1 << PPPOE_HASH_BITS)
#define PPPOE_HASH_MASK        (PPPOE_HASH_SIZE - 1)

static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb);

static const struct proto_ops pppoe_ops;
static const struct ppp_channel_ops pppoe_chan_ops;

/* per-net private data for this module */
static unsigned int pppoe_net_id __read_mostly;
struct pppoe_net {
        /*
         * We could use a _single_ hash table for all nets by mixing the
         * net id into the hash, but that would lengthen the hash chains
         * and add a few extra comparisons, which is messy as well.
         * Moreover, on SMP a per-net table means less lock contention.
         */
        struct pppox_sock *hash_table[PPPOE_HASH_SIZE]; /* bucket heads, chained via ->next */
        rwlock_t hash_lock; /* taken around hash_table lookups and updates */
};

/*
 * PPPoE could be in the following stages:
 * 1) Discovery stage (to obtain remote MAC and Session ID)
 * 2) Session stage (MAC and SID are known)
 *
 * Ethernet frames have a special tag for this but
 * we use simpler approach based on session id
 */
static inline bool stage_session(__be16 sid)
{
        /* Session stage is recognised purely by a non-zero session id. */
        if (sid == 0)
                return false;

        return true;
}

/* Look up this module's per-net data for @net via net_generic() and pppoe_net_id. */
static inline struct pppoe_net *pppoe_pernet(struct net *net)
{
        return net_generic(net, pppoe_net_id);
}

/*
 * cmp_2_addr - non-zero when @a and @b carry the same session id and the
 * same remote Ethernet address, zero otherwise.
 */
static inline int cmp_2_addr(struct pppoe_addr *a, struct pppoe_addr *b)
{
        if (a->sid != b->sid)
                return 0;

        return ether_addr_equal(a->remote, b->remote);
}

/*
 * cmp_addr - non-zero when @a has session id @sid and remote Ethernet
 * address @addr, zero otherwise.
 */
static inline int cmp_addr(struct pppoe_addr *a, __be16 sid, char *addr)
{
        if (a->sid != sid)
                return 0;

        return ether_addr_equal(a->remote, addr);
}

#if 8 % PPPOE_HASH_BITS
#error 8 must be a multiple of PPPOE_HASH_BITS
#endif

/*
 * hash_item - map a (session id, remote MAC) pair to a hash bucket index
 *
 * XORs every byte of @addr and every byte of @sid into one 8-bit value,
 * then folds that value onto itself by halving shift widths (4, 2, ...)
 * for as long as the width is still >= PPPOE_HASH_BITS, and finally masks
 * the result down to the table size.
 */
static int hash_item(__be16 sid, unsigned char *addr)
{
        unsigned char folded = 0;
        unsigned int pos;
        unsigned int shift;

        /* Mix in the Ethernet address, one byte at a time. */
        pos = 0;
        while (pos < ETH_ALEN) {
                folded ^= addr[pos];
                pos++;
        }

        /* Mix in the session id, one byte at a time. */
        shift = 0;
        while (shift < sizeof(sid_t) * 8) {
                folded ^= (__force __u32)sid >> shift;
                shift += 8;
        }

        /* Fold the upper bits down onto the lower ones. */
        shift = 8;
        while ((shift >>= 1) >= PPPOE_HASH_BITS)
                folded ^= folded >> shift;

        return folded & PPPOE_HASH_MASK;
}

/**********************************************************************
 *
 *  Set/get/delete/rehash items  (internal versions)
 *
 **********************************************************************/
/*
 * __get_item - find the socket for (@sid, @addr, @ifindex) in @pn's table
 *
 * Walks the hash chain selected by hash_item() and returns the first entry
 * whose address and interface index both match, or NULL when none does.
 * No locking and no reference counting is done here.
 */
static struct pppox_sock *__get_item(struct pppoe_net *pn, __be16 sid,
                                unsigned char *addr, int ifindex)
{
        struct pppox_sock *cur;

        for (cur = pn->hash_table[hash_item(sid, addr)]; cur; cur = cur->next) {
                if (!cmp_addr(&cur->pppoe_pa, sid, addr))
                        continue;
                if (cur->pppoe_ifindex == ifindex)
                        return cur;
        }

        return NULL;
}

/*
 * __set_item - insert @po into @pn's hash table
 *
 * Returns -EALREADY when the chain already holds an entry with the same
 * address and interface index; otherwise links @po at the head of its
 * chain and returns 0. No locking is done here.
 */
static int __set_item(struct pppoe_net *pn, struct pppox_sock *po)
{
        struct pppox_sock **bucket;
        struct pppox_sock *cur;

        bucket = &pn->hash_table[hash_item(po->pppoe_pa.sid,
                                           po->pppoe_pa.remote)];

        for (cur = *bucket; cur; cur = cur->next) {
                if (cmp_2_addr(&cur->pppoe_pa, &po->pppoe_pa) &&
                    cur->pppoe_ifindex == po->pppoe_ifindex)
                        return -EALREADY;
        }

        po->next = *bucket;
        *bucket = po;

        return 0;
}

/*
 * __delete_item - unlink the entry for (@sid, @addr, @ifindex), if any
 *
 * Walks the chain through a pointer to the link that leads to the current
 * entry, so the first match can be unlinked without special-casing the
 * chain head. Nothing happens when no entry matches. The entry itself is
 * not freed, and no locking is done here.
 */
static void __delete_item(struct pppoe_net *pn, __be16 sid,
                                        char *addr, int ifindex)
{
        struct pppox_sock **link = &pn->hash_table[hash_item(sid, addr)];
        struct pppox_sock *cur;

        while ((cur = *link) != NULL) {
                if (cmp_addr(&cur->pppoe_pa, sid, addr) &&
                    cur->pppoe_ifindex == ifindex) {
                        *link = cur->next;
                        return;
                }

                link = &cur->next;
        }
}

/**********************************************************************
 *
 *  Set/get/delete/rehash items
 *
 **********************************************************************/
/*
 * get_item - locked lookup that also pins the result
 *
 * Runs __get_item() under the read side of pn->hash_lock. When a socket is
 * found, a reference is taken on it with sock_hold() before the lock is
 * dropped. Returns the socket, or NULL when there is no match.
 */
static inline struct pppox_sock *get_item(struct pppoe_net *pn, __be16 sid,
                                        unsigned char *addr, int ifindex)
{
        struct pppox_sock *found;

        read_lock_bh(&pn->hash_lock);
        found = __get_item(pn, sid, addr, ifindex);
        if (found != NULL)
                sock_hold(sk_pppox(found));
        read_unlock_bh(&pn->hash_lock);

        return found;
}

/*
 * get_item_by_addr - look up a socket from a sockaddr_pppox
 *
 * Resolves the device name in @sp to a net_device under rcu_read_lock()
 * and, when it exists, calls get_item() with the device's ifindex and the
 * sid/remote address from @sp. Returns get_item()'s result, or NULL when
 * the device name does not resolve.
 */
static inline struct pppox_sock *get_item_by_addr(struct net *net,
                                                struct sockaddr_pppox *sp)
{
        struct pppox_sock *found = NULL;
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, sp->sa_addr.pppoe.dev);
        if (dev)
                found = get_item(pppoe_pernet(net), sp->sa_addr.pppoe.sid,
                                 sp->sa_addr.pppoe.remote, dev->ifindex);
        rcu_read_unlock();

        return found;
}

/* delete_item - __delete_item() run under the write side of pn->hash_lock. */
static inline void delete_item(struct pppoe_net *pn, __be16 sid,
                                        char *addr, int ifindex)
{
        write_lock_bh(&pn->hash_lock);
        __delete_item(pn, sid, addr, ifindex);
        write_unlock_bh(&pn->hash_lock);
}

/***************************************************************************
 *
 *  Handler for device events.
 *  Certain device events require that sockets be unconnected.
 *
 **************************************************************************/

/*
 * pppoe_flush_dev - unbind every socket in the hash table that uses @dev
 *
 * Scans each hash chain of the device's per-net table for sockets whose
 * pppoe_dev is @dev. For each one found, the hash lock is dropped, the
 * socket lock is taken, the socket is unbound (if still on @dev and in
 * CONNECTED or BOUND state) and its device reference is released. The scan
 * of the current chain then restarts from the chain head.
 */
static void pppoe_flush_dev(struct net_device *dev)
{
        struct pppoe_net *pn;
        int i;

        pn = pppoe_pernet(dev_net(dev));
        write_lock_bh(&pn->hash_lock);
        for (i = 0; i < PPPOE_HASH_SIZE; i++) {
                struct pppox_sock *po = pn->hash_table[i];
                struct sock *sk;

                while (po) {
                        /* Skip entries that are bound to some other device. */
                        while (po && po->pppoe_dev != dev) {
                                po = po->next;
                        }

                        /* End of chain: nothing (more) to flush in this bucket. */
                        if (!po)
                                break;

                        sk = sk_pppox(po);

                        /* We always grab the socket lock, followed by the
                         * hash_lock, in that order.  Since we should hold the
                         * sock lock while doing any unbinding, we need to
                         * release the lock we're holding.  Hold a reference to
                         * the sock so it doesn't disappear as we're jumping
                         * between locks.
                         */

                        sock_hold(sk);
                        write_unlock_bh(&pn->hash_lock);
                        lock_sock(sk);

                        /* Re-check under the socket lock: the binding may have
                         * changed while no lock was held.
                         */
                        if (po->pppoe_dev == dev &&
                            sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND)) {
                                pppox_unbind_sock(sk);
                                sk->sk_state_change(sk);
                                po->pppoe_dev = NULL;
                                dev_put(dev);
                        }

                        release_sock(sk);
                        sock_put(sk);

                        /* Restart the process from the start of the current
                         * hash chain. We dropped locks so the world may have
                         * changed from underneath us.
                         */

                        BUG_ON(pppoe_pernet(dev_net(dev)) == NULL);
                        write_lock_bh(&pn->hash_lock);
                        po = pn->hash_table[i];
                }
        }
        write_unlock_bh(&pn->hash_lock);
}

/*
 * pppoe_device_event - netdevice notifier callback
 *
 * Flushes all sockets using the affected device when its address or MTU
 * changes (a bad thing, requiring LCP re-negotiation) or when it is going
 * down / is down. Every other event is ignored. Always returns NOTIFY_DONE.
 */
static int pppoe_device_event(struct notifier_block *this,
                              unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        /* Only look at sockets that are using this specific device. */
        if (event == NETDEV_CHANGEADDR || event == NETDEV_CHANGEMTU ||
            event == NETDEV_GOING_DOWN || event == NETDEV_DOWN) {
                /* Find every socket on this device and kill it. */
                pppoe_flush_dev(dev);
        }

        return NOTIFY_DONE;
}

/* Notifier block whose callback is pppoe_device_event(). */
static struct notifier_block pppoe_notifier = {
        .notifier_call = pppoe_device_event,
};

/************************************************************************
 *
 * Do the real work of receiving a PPPoE Session frame.
 *
 ***********************************************************************/
/*
 * Dispatches @skb according to the state of @sk:
 *  - PPPOX_BOUND: hand the skb to ppp_input() on the socket's channel;
 *  - PPPOX_RELAY: look up the relay target and transmit the skb on it;
 *  - otherwise:   queue the skb on the socket's receive queue.
 *
 * Returns NET_RX_SUCCESS when the skb was handed on, or NET_RX_DROP after
 * the skb has been freed here.
 */
static int pppoe_rcv_core(struct sock *sk, struct sk_buff *skb)
{
        struct pppox_sock *po = pppox_sk(sk);
        struct pppox_sock *relay_po;

        /* Backlog receive. Semantics of backlog rcv preclude any code from
         * executing in lock_sock()/release_sock() bounds; meaning sk->sk_state
         * can't change.
         */

        /* Frames addressed to another host are dropped. */
        if (skb->pkt_type == PACKET_OTHERHOST)
                goto abort_kfree;

        if (sk->sk_state & PPPOX_BOUND) {
                ppp_input(&po->chan, skb);
        } else if (sk->sk_state & PPPOX_RELAY) {
                /* get_item_by_addr() goes through get_item(), which holds a
                 * reference on the returned socket; every path below must
                 * drop it with sock_put().
                 */
                relay_po = get_item_by_addr(sock_net(sk),
                                            &po->pppoe_relay);
                if (relay_po == NULL)
                        goto abort_kfree;

                /* The relay target must be connected. */
                if ((sk_pppox(relay_po)->sk_state & PPPOX_CONNECTED) == 0)
                        goto abort_put;

                /* A zero return from __pppoe_xmit() is treated as failure. */
                if (!__pppoe_xmit(sk_pppox(relay_po), skb))
                        goto abort_put;

                sock_put(sk_pppox(relay_po));
        } else {
                /* A non-zero return means the skb was not queued. */
                if (sock_queue_rcv_skb(sk, skb))
                        goto abort_kfree;
        }

        return NET_RX_SUCCESS;

abort_put:
        sock_put(sk_pppox(relay_po));

abort_kfree:
        kfree_skb(skb);
        return NET_RX_DROP;
}

/************************************************************************
 *
 * Receive wrapper called in BH context.
 *
 ***********************************************************************/
/*
 * Packet handler for PPPoE session frames (installed as pppoes_ptype.func).
 * Validates the frame, strips the PPPoE header, trims the skb to the
 * payload length announced in that header, finds the matching socket and
 * passes the skb on with sk_receive_skb(). Returns NET_RX_DROP when the
 * frame is rejected, otherwise the result of sk_receive_skb().
 */
static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev,
                     struct packet_type *pt, struct net_device *orig_dev)
{
        struct pppoe_hdr *ph;
        struct pppox_sock *po;
        struct pppoe_net *pn;
        int len;

        /* On NULL there is no skb left for us to free, so skip the kfree. */
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (!skb)
                goto out;

        /* The source MAC is read from the Ethernet header further down. */
        if (skb_mac_header_len(skb) < ETH_HLEN)
                goto drop;

        if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr)))
                goto drop;

        ph = pppoe_hdr(skb);
        len = ntohs(ph->length);

        /* Drop frames shorter than the length the header announces. */
        skb_pull_rcsum(skb, sizeof(*ph));
        if (skb->len < len)
                goto drop;

        if (pskb_trim_rcsum(skb, len))
                goto drop;

        /* Re-read the header pointer after the trim above.
         * NOTE(review): presumably because trimming may reallocate the skb
         * data - confirm against pskb_trim_rcsum().
         */
        ph = pppoe_hdr(skb);
        pn = pppoe_pernet(dev_net(dev));

        /* Note that get_item does a sock_hold(), so sk_pppox(po)
         * is known to be safe.
         */
        po = get_item(pn, ph->sid, eth_hdr(skb)->h_source, dev->ifindex);
        if (!po)
                goto drop;

        return sk_receive_skb(sk_pppox(po), skb, 0);

drop:
        kfree_skb(skb);
out:
        return NET_RX_DROP;
}

/*
 * Work item run after a PADT was seen for this socket (scheduled from
 * pppoe_disc_rcv()).  Under the socket lock it releases the device
 * reference held in po->pppoe_dev and unbinds the socket; the trailing
 * sock_put() drops the socket reference that was kept for the work item.
 */
static void pppoe_unbind_sock_work(struct work_struct *work)
{
        struct pppox_sock *po;
        struct sock *sk;

        po = container_of(work, struct pppox_sock, proto.pppoe.padt_work);
        sk = sk_pppox(po);

        lock_sock(sk);

        if (po->pppoe_dev != NULL) {
                dev_put(po->pppoe_dev);
                po->pppoe_dev = NULL;
        }
        pppox_unbind_sock(sk);

        release_sock(sk);

        sock_put(sk);
}

/************************************************************************
 *
 * Receive a PPPoE Discovery frame.
 * This is solely for detection of PADT frames
 *
 ***********************************************************************/
static int pppoe_disc_rcv(struct sk_buff *skb, struct net_device *dev,
                          struct packet_type *pt, struct net_device *orig_dev)

{
        struct pppoe_hdr *ph;
        struct pppox_sock *po;
        struct pppoe_net *pn;

        skb = skb_share_check(skb, GFP_ATOMIC);
        if (!skb)
                goto out;

        if (skb->pkt_type != PACKET_HOST)
                goto abort;

        if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr)))
                goto abort;

        ph = pppoe_hdr(skb);
        if (ph->code != PADT_CODE)
                goto abort;

        pn = pppoe_pernet(dev_net(dev));
        po = get_item(pn, ph->sid, eth_hdr(skb)->h_source, dev->ifindex);
        if (po)
                if (!schedule_work(&po->proto.pppoe.padt_work))
                        sock_put(sk_pppox(po));

abort:
        kfree_skb(skb);
out:
        return NET_RX_SUCCESS; /* Lies... :-) */
}

/* Packet handler for PPPoE session frames (ETH_P_PPP_SES). */
static struct packet_type pppoes_ptype __read_mostly = {
        .func = pppoe_rcv,
        .type = cpu_to_be16(ETH_P_PPP_SES),
};

/* Packet handler for PPPoE discovery frames (ETH_P_PPP_DISC). */
static struct packet_type pppoed_ptype __read_mostly = {
        .func = pppoe_disc_rcv,
        .type = cpu_to_be16(ETH_P_PPP_DISC),
};

/* Protocol descriptor used by sk_alloc() for PPPoE sockets; each socket
 * is allocated with room for a struct pppox_sock.
 */
static struct proto pppoe_sk_proto __read_mostly = {
        .obj_size = sizeof(struct pppox_sock),
        .owner = THIS_MODULE,
        .name = "PPPOE",
};

/***********************************************************************
 *
 * Initialize a new struct sock.
 *
 **********************************************************************/
static int pppoe_create(struct net *net, struct socket *sock, int kern)
{
        struct sock *sk;

        sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto, kern);
        if (!sk)
                return -ENOMEM;

        sock_init_data(sock, sk);

        sock->state        = SS_UNCONNECTED;
        sock->ops        = &pppoe_ops;

        sk->sk_backlog_rcv        = pppoe_rcv_core;
        sk->sk_state                = PPPOX_NONE;
        sk->sk_type                = SOCK_STREAM;
        sk->sk_family                = PF_PPPOX;
        sk->sk_protocol                = PX_PROTO_OE;

        INIT_WORK(&pppox_sk(sk)->proto.pppoe.padt_work,
                  pppoe_unbind_sock_work);

        return 0;
}

/*
 * Release a PPPoE socket.
 *
 * Under the socket lock: drops the device reference, unbinds the socket,
 * marks it PPPOX_DEAD, removes it from the per-net hash table, orphans it
 * and purges its receive queue.  The final sock_put() drops a reference
 * on the sock.
 *
 * Returns 0 on success (also when @sock has no sk attached), or -EBADF
 * when the sock is already flagged SOCK_DEAD.
 */
static int pppoe_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct pppox_sock *po;
        struct pppoe_net *pn;
        struct net *net = NULL;

        /* Nothing attached: nothing to release. */
        if (!sk)
                return 0;

        lock_sock(sk);
        if (sock_flag(sk, SOCK_DEAD)) {
                release_sock(sk);
                return -EBADF;
        }

        po = pppox_sk(sk);

        /* Give back the reference on the underlying device, if any. */
        if (po->pppoe_dev) {
                dev_put(po->pppoe_dev);
                po->pppoe_dev = NULL;
        }

        pppox_unbind_sock(sk);

        /* Signal the death of the socket. */
        sk->sk_state = PPPOX_DEAD;

        net = sock_net(sk);
        pn = pppoe_pernet(net);

        /*
         * protect "po" from concurrent updates
         * on pppoe_flush_dev
         */
        delete_item(pn, po->pppoe_pa.sid, po->pppoe_pa.remote,
                    po->pppoe_ifindex);

        /* Detach the sock from the struct socket. */
        sock_orphan(sk);
        sock->sk = NULL;

        skb_queue_purge(&sk->sk_receive_queue);
        release_sock(sk);
        sock_put(sk);

        return 0;
}

/*
 * Bind a PPPoE socket to a session, or undo an existing binding.
 *
 * @uservaddr is interpreted as a struct sockaddr_pppox.  Any existing
 * session binding is torn down first; a new binding is created only when
 * stage_session() accepts the requested session id.  On every path that
 * does not fail, po->num ends up holding the requested sid.
 *
 * Returns 0 on success, or:
 *   -EINVAL    wrong sockaddr length or protocol
 *   -EBUSY     socket already connected and a session id was requested
 *   -EALREADY  socket is dead and no session id was requested
 *   -ENODEV    device not found, or found but not IFF_UP
 *   the error from __set_item() or ppp_register_net_channel()
 *
 * NOTE(review): stage_session() is defined outside this view; it is
 * presumably true for a non-zero session id - confirm.
 */
static int pppoe_connect(struct socket *sock, struct sockaddr *uservaddr,
                  int sockaddr_len, int flags)
{
        struct sock *sk = sock->sk;
        struct sockaddr_pppox *sp = (struct sockaddr_pppox *)uservaddr;
        struct pppox_sock *po = pppox_sk(sk);
        struct net_device *dev = NULL;
        struct pppoe_net *pn;
        struct net *net = NULL;
        int error;

        lock_sock(sk);

        /* Each check below presets the error it will report on failure. */
        error = -EINVAL;

        if (sockaddr_len != sizeof(struct sockaddr_pppox))
                goto end;

        if (sp->sa_protocol != PX_PROTO_OE)
                goto end;

        /* Check for already bound sockets */
        error = -EBUSY;
        if ((sk->sk_state & PPPOX_CONNECTED) &&
             stage_session(sp->sa_addr.pppoe.sid))
                goto end;

        /* Check for already disconnected sockets, on attempts to disconnect */
        error = -EALREADY;
        if ((sk->sk_state & PPPOX_DEAD) &&
             !stage_session(sp->sa_addr.pppoe.sid))
                goto end;

        error = 0;

        /* Delete the old binding */
        if (stage_session(po->pppoe_pa.sid)) {
                pppox_unbind_sock(sk);
                pn = pppoe_pernet(sock_net(sk));
                delete_item(pn, po->pppoe_pa.sid,
                            po->pppoe_pa.remote, po->pppoe_ifindex);
                if (po->pppoe_dev) {
                        dev_put(po->pppoe_dev);
                        po->pppoe_dev = NULL;
                }

                /* Wipe all per-session state before a possible re-bind. */
                po->pppoe_ifindex = 0;
                memset(&po->pppoe_pa, 0, sizeof(po->pppoe_pa));
                memset(&po->pppoe_relay, 0, sizeof(po->pppoe_relay));
                memset(&po->chan, 0, sizeof(po->chan));
                po->next = NULL;
                po->num = 0;

                sk->sk_state = PPPOX_NONE;
        }

        /* Re-bind in session stage only */
        if (stage_session(sp->sa_addr.pppoe.sid)) {
                error = -ENODEV;
                net = sock_net(sk);
                dev = dev_get_by_name(net, sp->sa_addr.pppoe.dev);
                if (!dev)
                        goto err_put;

                /* The device reference is stored in po right away so that
                 * err_put can release it on any later failure.
                 */
                po->pppoe_dev = dev;
                po->pppoe_ifindex = dev->ifindex;
                pn = pppoe_pernet(net);
                /* error is still -ENODEV here: a down device is refused. */
                if (!(dev->flags & IFF_UP)) {
                        goto err_put;
                }

                memcpy(&po->pppoe_pa,
                       &sp->sa_addr.pppoe,
                       sizeof(struct pppoe_addr));

                /* Insert into the per-net hash table under the write lock. */
                write_lock_bh(&pn->hash_lock);
                error = __set_item(pn, po);
                write_unlock_bh(&pn->hash_lock);
                if (error < 0)
                        goto err_put;

                po->chan.hdrlen = (sizeof(struct pppoe_hdr) +
                                   dev->hard_header_len);

                /* Channel MTU: device MTU minus the PPPoE header and 2 bytes. */
                po->chan.mtu = dev->mtu - sizeof(struct pppoe_hdr) - 2;
                po->chan.private = sk;
                po->chan.ops = &pppoe_chan_ops;

                error = ppp_register_net_channel(dev_net(dev), &po->chan);
                if (error) {
                        /* Undo the hash insertion done above. */
                        delete_item(pn, po->pppoe_pa.sid,
                                    po->pppoe_pa.remote, po->pppoe_ifindex);
                        goto err_put;
                }

                sk->sk_state = PPPOX_CONNECTED;
        }

        po->num = sp->sa_addr.pppoe.sid;

end:
        release_sock(sk);
        return error;
err_put:
        /* Failure after the device lookup: drop the device reference. */
        if (po->pppoe_dev) {
                dev_put(po->pppoe_dev);
                po->pppoe_dev = NULL;
        }
        goto end;
}

static int pppoe_getname(struct socket *sock, struct sockaddr *uaddr,
                  int peer)
{
        int len = sizeof(struct sockaddr_pppox);
        struct sockaddr_pppox sp;

        sp.sa_family        = AF_PPPOX;
        sp.sa_protocol        = PX_PROTO_OE;
        memcpy(&sp.sa_addr.pppoe, &pppox_sk(sock->sk)->pppoe_pa,
               sizeof(struct pppoe_addr));

        memcpy(uaddr, &sp, len);

        return len;
}

/*
 * PPPoE-specific ioctl handler.
 *
 *   PPPIOCGMRU    write (device MTU - PPPoE header - PPP_HDRLEN) to user
 *   PPPIOCSMRU    read a value and only validate it against that bound
 *   PPPIOCSFLAGS  read a value from user space and otherwise ignore it
 *   PPPOEIOCSFWD  store a relay address and set PPPOX_RELAY
 *   PPPOEIOCDFWD  clear PPPOX_RELAY
 *
 * Returns 0 on success or a negative errno; unknown commands yield
 * -ENOTTY.
 */
static int pppoe_ioctl(struct socket *sock, unsigned int cmd,
                unsigned long arg)
{
        struct sock *sk = sock->sk;
        struct pppox_sock *po = pppox_sk(sk);
        struct pppox_sock *relay_po;
        int val;

        switch (cmd) {
        case PPPIOCGMRU:
                if (!(sk->sk_state & PPPOX_CONNECTED))
                        return -ENXIO;

                if (put_user(po->pppoe_dev->mtu -
                             sizeof(struct pppoe_hdr) -
                             PPP_HDRLEN,
                             (int __user *)arg))
                        return -EFAULT;
                return 0;

        case PPPIOCSMRU:
                if (!(sk->sk_state & PPPOX_CONNECTED))
                        return -ENXIO;

                if (get_user(val, (int __user *)arg))
                        return -EFAULT;

                /* The value is only checked, never stored. */
                if (val < (po->pppoe_dev->mtu
                           - sizeof(struct pppoe_hdr)
                           - PPP_HDRLEN))
                        return 0;
                return -EINVAL;

        case PPPIOCSFLAGS:
                if (get_user(val, (int __user *)arg))
                        return -EFAULT;
                return 0;

        case PPPOEIOCSFWD:
                if (sk->sk_state & (PPPOX_BOUND | PPPOX_DEAD))
                        return -EBUSY;

                if (!(sk->sk_state & PPPOX_CONNECTED))
                        return -ENOTCONN;

                /* PPPoE address from the user specifies an outbound
                   PPPoE address which frames are forwarded to */
                if (copy_from_user(&po->pppoe_relay,
                                   (void __user *)arg,
                                   sizeof(struct sockaddr_pppox)))
                        return -EFAULT;

                if (po->pppoe_relay.sa_family != AF_PPPOX ||
                    po->pppoe_relay.sa_protocol != PX_PROTO_OE)
                        return -EINVAL;

                /* Check that the socket referenced by the address
                   actually exists. */
                relay_po = get_item_by_addr(sock_net(sk), &po->pppoe_relay);
                if (!relay_po)
                        return -EINVAL;

                /* Only existence matters; drop the lookup reference. */
                sock_put(sk_pppox(relay_po));
                sk->sk_state |= PPPOX_RELAY;
                return 0;

        case PPPOEIOCDFWD:
                if (!(sk->sk_state & PPPOX_RELAY))
                        return -EALREADY;

                sk->sk_state &= ~PPPOX_RELAY;
                return 0;

        default:
                return -ENOTTY;
        }
}

/*
 * Send one message from user space as a PPPoE session frame.
 *
 * Builds an skb holding the link-layer header, a PPPoE header (ver 1,
 * type 1, code 0, sid = po->num, length = @total_len) and the user
 * payload, then queues it on the bound device.
 *
 * Returns @total_len on success, or -ENOTCONN (socket dead or not
 * connected), -EMSGSIZE (message larger than mtu + hard_header_len),
 * -ENOMEM (skb allocation failed), or the error from memcpy_from_msg().
 * The result of dev_queue_xmit() is not examined.
 */
static int pppoe_sendmsg(struct socket *sock, struct msghdr *m,
                         size_t total_len)
{
        struct sk_buff *skb;
        struct sock *sk = sock->sk;
        struct pppox_sock *po = pppox_sk(sk);
        int error;
        struct pppoe_hdr hdr;
        struct pppoe_hdr *ph;
        struct net_device *dev;
        char *start;
        int hlen;

        lock_sock(sk);
        if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) {
                error = -ENOTCONN;
                goto end;
        }

        /* Header template; hdr.length is not set here, ph->length is
         * written after the template has been copied into the skb.
         */
        hdr.ver = 1;
        hdr.type = 1;
        hdr.code = 0;
        hdr.sid = po->num;

        dev = po->pppoe_dev;

        error = -EMSGSIZE;
        if (total_len > (dev->mtu + dev->hard_header_len))
                goto end;

        /* Room for link-layer headroom, PPPoE header, payload and tailroom. */
        hlen = LL_RESERVED_SPACE(dev);
        skb = sock_wmalloc(sk, hlen + sizeof(*ph) + total_len +
                           dev->needed_tailroom, 0, GFP_KERNEL);
        if (!skb) {
                error = -ENOMEM;
                goto end;
        }

        /* Reserve space for headers. */
        skb_reserve(skb, hlen);
        skb_reset_network_header(skb);

        skb->dev = dev;

        skb->priority = READ_ONCE(sk->sk_priority);
        skb->protocol = cpu_to_be16(ETH_P_PPP_SES);

        /* ph covers header + payload; the payload starts at ph->tag. */
        ph = skb_put(skb, total_len + sizeof(struct pppoe_hdr));
        start = (char *)&ph->tag[0];

        error = memcpy_from_msg(start, m, total_len);
        if (error < 0) {
                kfree_skb(skb);
                goto end;
        }

        /* From here on the call reports success: return the byte count. */
        error = total_len;
        dev_hard_header(skb, dev, ETH_P_PPP_SES,
                        po->pppoe_pa.remote, NULL, total_len);

        memcpy(ph, &hdr, sizeof(struct pppoe_hdr));

        ph->length = htons(total_len);

        dev_queue_xmit(skb);

end:
        release_sock(sk);
        return error;
}

/************************************************************************
 *
 * xmit function for internal use.
 *
 * Prepends a PPPoE session header (ver 1, type 1, code 0, sid = po->num,
 * length = original skb->len) plus the link-layer header and queues the
 * skb on the bound device.  The skb is consumed on every path, and the
 * function returns 1 whether the frame was queued or dropped.
 *
 ***********************************************************************/
static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb)
{
        struct pppox_sock *po = pppox_sk(sk);
        struct net_device *dev = po->pppoe_dev;
        const int payload_len = skb->len;
        struct pppoe_hdr *ph;

        /* The higher-level PPP code (ppp_unregister_channel()) ensures the PPP
         * xmit operations conclude prior to an unregistration call.  Thus
         * sk->sk_state cannot change, so we don't need to do lock_sock().
         * But, we also can't do a lock_sock since that introduces a potential
         * deadlock as we'd reverse the lock ordering used when calling
         * ppp_unregister_channel().
         */

        /* Socket must be alive, connected and bound to a device. */
        if (sock_flag(sk, SOCK_DEAD) ||
            !(sk->sk_state & PPPOX_CONNECTED) ||
            !dev)
                goto abort;

        /* Copy the data if there is no space for the header or if it's
         * read-only.
         */
        if (skb_cow_head(skb, LL_RESERVED_SPACE(dev) + sizeof(*ph)))
                goto abort;

        __skb_push(skb, sizeof(*ph));
        skb_reset_network_header(skb);

        skb->dev = dev;
        skb->protocol = cpu_to_be16(ETH_P_PPP_SES);

        ph = pppoe_hdr(skb);
        ph->ver = 1;
        ph->type = 1;
        ph->code = 0;
        ph->sid = po->num;
        ph->length = htons(payload_len);

        dev_hard_header(skb, dev, ETH_P_PPP_SES,
                        po->pppoe_pa.remote, NULL, payload_len);

        dev_queue_xmit(skb);
        return 1;

abort:
        kfree_skb(skb);
        return 1;
}

/************************************************************************
 *
 * xmit function called by generic PPP driver
 * sends PPP frame over PPPoE socket
 *
 * The socket is taken from chan->private; the return value is that of
 * __pppoe_xmit().
 *
 ***********************************************************************/
static int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb)
{
        return __pppoe_xmit(chan->private, skb);
}

/*
 * Describe the PPPoE encapsulation step of a forwarding path.
 *
 * Fills @path with the PPPoE encap data (protocol, session id, peer MAC),
 * records the current ctx->dev in path->dev, and then points @ctx at the
 * device the session is bound to, with the peer MAC as destination.
 *
 * Returns 0 on success, -1 when the socket is dead, not connected or has
 * no device.
 */
static int pppoe_fill_forward_path(struct net_device_path_ctx *ctx,
                                   struct net_device_path *path,
                                   const struct ppp_channel *chan)
{
        struct sock *sk = chan->private;
        struct pppox_sock *po = pppox_sk(sk);
        struct net_device *lower = po->pppoe_dev;

        if (sock_flag(sk, SOCK_DEAD))
                return -1;
        if (!(sk->sk_state & PPPOX_CONNECTED))
                return -1;
        if (!lower)
                return -1;

        /* Record this hop, remembering the device we came from. */
        path->type = DEV_PATH_PPPOE;
        path->dev = ctx->dev;
        path->encap.id = be16_to_cpu(po->num);
        path->encap.proto = htons(ETH_P_PPP_SES);
        memcpy(path->encap.h_dest, po->pppoe_pa.remote, ETH_ALEN);

        /* Continue the walk on the underlying device. */
        memcpy(ctx->daddr, po->pppoe_pa.remote, ETH_ALEN);
        ctx->dev = lower;

        return 0;
}

/* Channel callbacks installed in po->chan.ops by pppoe_connect(). */
static const struct ppp_channel_ops pppoe_chan_ops = {
        .fill_forward_path = pppoe_fill_forward_path,
        .start_xmit = pppoe_xmit,
};

/*
 * Receive one queued frame on a PPPoE socket.
 *
 * Copies at most @total_len bytes of the next datagram into @m.  Returns
 * the number of bytes copied, -EIO when the socket is PPPOX_BOUND, or the
 * error reported by skb_recv_datagram() / skb_copy_datagram_msg().
 */
static int pppoe_recvmsg(struct socket *sock, struct msghdr *m,
                         size_t total_len, int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int error = 0;

        if (sk->sk_state & PPPOX_BOUND)
                return -EIO;

        skb = skb_recv_datagram(sk, flags, &error);
        if (!skb)
                return error;

        /* Never copy more than the datagram holds. */
        if (skb->len < total_len)
                total_len = skb->len;

        error = skb_copy_datagram_msg(skb, 0, m, total_len);
        if (error) {
                kfree_skb(skb);
                return error;
        }

        consume_skb(skb);
        return total_len;
}

#ifdef CONFIG_PROC_FS
/*
 * seq_file ->show(): print the column header for the start token, or one
 * line (session id, peer MAC, device name) for a socket.  Always
 * returns 0.
 */
static int pppoe_seq_show(struct seq_file *seq, void *v)
{
        struct pppox_sock *po;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq, "Id       Address              Device\n");
                return 0;
        }

        po = v;
        seq_printf(seq, "%08X %pM %8s\n",
                   po->pppoe_pa.sid, po->pppoe_pa.remote, po->pppoe_pa.dev);

        return 0;
}

/*
 * Return the @pos-th socket (0-based) found when walking every hash
 * bucket in order and each bucket's chain via ->next, or NULL when the
 * table holds fewer than @pos + 1 sockets.
 */
static inline struct pppox_sock *pppoe_get_idx(struct pppoe_net *pn, loff_t pos)
{
        struct pppox_sock *po;
        int bucket;

        for (bucket = 0; bucket < PPPOE_HASH_SIZE; bucket++) {
                for (po = pn->hash_table[bucket]; po; po = po->next) {
                        if (!pos--)
                                return po;
                }
        }

        return NULL;
}

/*
 * seq_file ->start(): take the hash table read lock (released again in
 * pppoe_seq_stop()) and return the start token for position 0, otherwise
 * the socket at index *pos - 1.
 */
static void *pppoe_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(pn->hash_lock)
{
        struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq));
        loff_t idx = *pos;

        read_lock_bh(&pn->hash_lock);

        if (idx == 0)
                return SEQ_START_TOKEN;

        return pppoe_get_idx(pn, idx - 1);
}

/*
 * seq_file ->next(): advance *pos and return the socket following @v.
 * After the start token that is the first socket in the table; otherwise
 * the next entry of the current chain, or the head of the next non-empty
 * bucket.  Returns NULL when there are no more sockets.
 */
static void *pppoe_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq));
        struct pppox_sock *cur, *head;
        int bucket;

        ++*pos;

        if (v == SEQ_START_TOKEN)
                return pppoe_get_idx(pn, 0);

        cur = v;
        if (cur->next)
                return cur->next;

        /* Chain exhausted: scan the buckets after the current one. */
        bucket = hash_item(cur->pppoe_pa.sid, cur->pppoe_pa.remote);
        for (bucket++; bucket < PPPOE_HASH_SIZE; bucket++) {
                head = pn->hash_table[bucket];
                if (head)
                        return head;
        }

        return NULL;
}

/* seq_file ->stop(): drop the read lock taken in pppoe_seq_start(). */
static void pppoe_seq_stop(struct seq_file *seq, void *v)
        __releases(pn->hash_lock)
{
        read_unlock_bh(&pppoe_pernet(seq_file_net(seq))->hash_lock);
}

/* Iterator callbacks behind the per-net "pppoe" proc entry. */
static const struct seq_operations pppoe_seq_ops = {
        .start = pppoe_seq_start,
        .next = pppoe_seq_next,
        .show = pppoe_seq_show,
        .stop = pppoe_seq_stop,
};
#endif /* CONFIG_PROC_FS */

/* Socket operations for PPPoE sockets, installed by pppoe_create(). */
static const struct proto_ops pppoe_ops = {
        .family = AF_PPPOX,
        .owner = THIS_MODULE,

        /* Implemented by this driver. */
        .release = pppoe_release,
        .connect = pppoe_connect,
        .getname = pppoe_getname,
        .sendmsg = pppoe_sendmsg,
        .recvmsg = pppoe_recvmsg,

        /* Handled by the pppox and datagram helpers. */
        .poll = datagram_poll,
        .ioctl = pppox_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = pppox_compat_ioctl,
#endif

        /* Routed to the generic sock_no_* helpers. */
        .bind = sock_no_bind,
        .socketpair = sock_no_socketpair,
        .accept = sock_no_accept,
        .listen = sock_no_listen,
        .shutdown = sock_no_shutdown,
        .mmap = sock_no_mmap,
};

/* PPPoX protocol hooks registered under PX_PROTO_OE in pppoe_init(). */
static const struct pppox_proto pppoe_proto = {
        .owner = THIS_MODULE,
        .create = pppoe_create,
        .ioctl = pppoe_ioctl,
};

static __net_init int pppoe_init_net(struct net *net)
{
        struct pppoe_net *pn = pppoe_pernet(net);
        struct proc_dir_entry *pde;

        rwlock_init(&pn->hash_lock);

        pde = proc_create_net("pppoe", 0444, net->proc_net,
                        &pppoe_seq_ops, sizeof(struct seq_net_private));
#ifdef CONFIG_PROC_FS
        if (!pde)
                return -ENOMEM;
#endif

        return 0;
}

/* Per-namespace exit: remove the "pppoe" proc entry created by
 * pppoe_init_net().
 */
static __net_exit void pppoe_exit_net(struct net *net)
{
        remove_proc_entry("pppoe", net->proc_net);
}

static struct pernet_operations pppoe_net_ops = {
        .init = pppoe_init_net,
        .exit = pppoe_exit_net,
        .id   = &pppoe_net_id,
        .size = sizeof(struct pppoe_net),
};

/*
 * Module init: register the per-net state, the socket protocol, the
 * PPPoX protocol handler, both packet handlers and the netdevice
 * notifier.
 *
 * Returns 0 on success.  On failure everything registered so far is
 * undone in reverse order (mirroring pppoe_exit()) and the error code is
 * returned.
 */
static int __init pppoe_init(void)
{
        int err;

        err = register_pernet_device(&pppoe_net_ops);
        if (err)
                goto out;

        err = proto_register(&pppoe_sk_proto, 0);
        if (err)
                goto out_unregister_net_ops;

        err = register_pppox_proto(PX_PROTO_OE, &pppoe_proto);
        if (err)
                goto out_unregister_pppoe_proto;

        dev_add_pack(&pppoes_ptype);
        dev_add_pack(&pppoed_ptype);

        /* The notifier registration can fail too; do not report success
         * with the driver only partially set up.
         */
        err = register_netdevice_notifier(&pppoe_notifier);
        if (err)
                goto out_remove_packs;

        return 0;

out_remove_packs:
        dev_remove_pack(&pppoed_ptype);
        dev_remove_pack(&pppoes_ptype);
        unregister_pppox_proto(PX_PROTO_OE);
out_unregister_pppoe_proto:
        proto_unregister(&pppoe_sk_proto);
out_unregister_net_ops:
        unregister_pernet_device(&pppoe_net_ops);
out:
        return err;
}

/* Module exit: undo the registrations made by pppoe_init(), in the
 * reverse of the order in which pppoe_init() made them.
 */
static void __exit pppoe_exit(void)
{
        unregister_netdevice_notifier(&pppoe_notifier);
        dev_remove_pack(&pppoed_ptype);
        dev_remove_pack(&pppoes_ptype);
        unregister_pppox_proto(PX_PROTO_OE);
        proto_unregister(&pppoe_sk_proto);
        unregister_pernet_device(&pppoe_net_ops);
}

/* Module entry and exit points. */
module_init(pppoe_init);
module_exit(pppoe_exit);

/* Module metadata. */
MODULE_AUTHOR("Michal Ostrowski <mostrows@speakeasy.net>");
MODULE_DESCRIPTION("PPP over Ethernet driver");
MODULE_LICENSE("GPL");
/* Alias keyed on (PF_PPPOX, PX_PROTO_OE).
 * NOTE(review): presumably what lets the module be loaded on demand when
 * such a socket is requested - confirm.
 */
MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_OE);










    1 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __VDSO_MATH64_H
#define __VDSO_MATH64_H

/*
 * Divide @dividend by @divisor by repeated subtraction.
 *
 * Stores the remainder in *@remainder and returns the number of
 * subtractions performed (the quotient, as a u32).  The loop runs once
 * per unit of quotient and never terminates for a zero @divisor.
 */
static __always_inline u32
__iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
{
        u32 quotient;

        for (quotient = 0; dividend >= divisor; quotient++) {
                /* The following asm() prevents the compiler from
                   optimising this loop into a modulo operation.  */
                asm("" : "+rm"(dividend));

                dividend -= divisor;
        }

        *remainder = dividend;

        return quotient;
}

#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)

#ifndef mul_u64_u32_add_u64_shr
/* Compute ((a * mul) + b) >> shift using a 128-bit intermediate; the
 * result is truncated to 64 bits.
 */
static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift)
{
        unsigned __int128 wide = (unsigned __int128)a * mul;

        wide += b;

        return (u64)(wide >> shift);
}
#endif /* mul_u64_u32_add_u64_shr */

#else

#ifndef mul_u64_u32_add_u64_shr
#ifndef mul_u32_u32
/* Full 64-bit product of two 32-bit values. */
static inline u64 mul_u32_u32(u32 a, u32 b)
{
        u64 wide = a;

        return wide * b;
}
#define mul_u32_u32 mul_u32_u32
#endif
/*
 * Compute ((a * mul) + b) >> shift without a 128-bit type, by splitting
 * @a into 32-bit halves.  The result is truncated to 64 bits.
 *
 * NOTE(review): the (32 - shift) term assumes shift <= 32 - confirm
 * against callers.
 */
static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift)
{
        const u32 hi = a >> 32;
        const u32 lo = a;
        bool carry;
        u64 res;

        /* Low partial product plus addend, noting a carry out of 64 bits. */
        carry = __builtin_add_overflow(mul_u32_u32(lo, mul), b, &res);
        res >>= shift;

        /* Put the lost carry back at its shifted position.  With
         * shift == 0 it lies outside the 64-bit result (and the shift
         * count would be 64), so it is skipped.
         */
        if (carry && shift)
                res += 1ULL << (64 - shift);

        /* The high partial product carries a weight of 2^32. */
        if (hi)
                res += mul_u32_u32(hi, mul) << (32 - shift);

        return res;
}
#endif /* mul_u64_u32_add_u64_shr */

#endif

#endif /* __VDSO_MATH64_H */











































































    1 





    1 













    1 






    1 

    1 



    1 



    1 








    1 


    1 







    1 
    1 
    1 





    1 






    1 
    1 













    1 


    1 

    1 















    1 


    1 


    1 

































    1 





































    1 


























    1 









































    1 




    1 




    1 
















    1 















    1 











    1 


    1 





    1 






    1 




    1 







    1 




































































    1 
    1 








    1 


    1 





    1 










    1 





    1 



























    1 









    1 












    1 








    1 





    1 



































    1 
    1 



    1 


    1 














    1 
    1 


    1 




    1 




    1 







































































































































    1 










    1 




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions handle output processing.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson          <karl@athena.chicago.il.us>
 *    Jon Grimm             <jgrimm@austin.ibm.com>
 *    Sridhar Samudrala     <sri@us.ibm.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/time.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/net_namespace.h>

#include <linux/socket.h> /* for sa_family_t */
#include <net/sock.h>

#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <net/sctp/checksum.h>

/* Forward declarations for private helpers. */
static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet,
                                                 struct sctp_chunk *chunk);
static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet,
                                                  struct sctp_chunk *chunk);
static void sctp_packet_append_data(struct sctp_packet *packet,
                                    struct sctp_chunk *chunk);
static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet,
                                           struct sctp_chunk *chunk,
                                           u16 chunk_len);

/* Return a packet to its empty state: clear every per-packet marker and
 * set the size back to the bare overhead.
 */
static void sctp_packet_reset(struct sctp_packet *packet)
{
        packet->auth = NULL;
        packet->has_auth = 0;
        packet->has_data = 0;
        packet->has_sack = 0;
        packet->has_cookie_echo = 0;
        packet->ipfragok = 0;

        /* sctp_packet_transmit() relies on this to reset size to the
         * current overhead after sending packets.
         */
        packet->size = packet->overhead;
}

/* Config a packet.
 * This appears to be a followup set of initializations.
 *
 * Always records @vtag.  The remaining work is done only while the packet
 * is still empty: max_size and overhead are recomputed, the transport's
 * route / path MTU is refreshed where needed, an ECNE chunk is prepended
 * when @ecn_capable is set and one is available, and max_size is finally
 * raised to a GSO limit when the socket can do GSO.
 */
void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
                        int ecn_capable)
{
        struct sctp_transport *tp = packet->transport;
        struct sctp_association *asoc = tp->asoc;
        struct sctp_sock *sp = NULL;
        struct sock *sk;

        pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag);
        packet->vtag = vtag;

        /* do the following jobs only once for a flush schedule */
        if (!sctp_packet_empty(packet))
                return;

        /* set packet max_size with pathmtu, then calculate overhead */
        packet->max_size = tp->pathmtu;

        /* sk and sp are only set when the transport has an association;
         * sp stays NULL otherwise and sk is not used past the !asoc
         * return below.
         */
        if (asoc) {
                sk = asoc->base.sk;
                sp = sctp_sk(sk);
        }
        packet->overhead = sctp_mtu_payload(sp, 0, 0);
        packet->size = packet->overhead;

        if (!asoc)
                return;

        /* update dst or transport pathmtu if in need */
        if (!sctp_transport_dst_check(tp)) {
                sctp_transport_route(tp, NULL, sp);
                if (asoc->param_flags & SPP_PMTUD_ENABLE)
                        sctp_assoc_sync_pmtu(asoc);
        } else if (!sctp_transport_pl_enabled(tp) &&
                   asoc->param_flags & SPP_PMTUD_ENABLE) {
                if (!sctp_transport_pmtu_check(tp))
                        sctp_assoc_sync_pmtu(asoc);
        }

        /* A pending PMTU update is consumed here; the sync itself only
         * happens when PMTU discovery is enabled.
         */
        if (asoc->pmtu_pending) {
                if (asoc->param_flags & SPP_PMTUD_ENABLE)
                        sctp_assoc_sync_pmtu(asoc);
                asoc->pmtu_pending = 0;
        }

        /* If there a is a prepend chunk stick it on the list before
         * any other chunks get appended.
         */
        if (ecn_capable) {
                struct sctp_chunk *chunk = sctp_get_ecne_prepend(asoc);

                if (chunk)
                        sctp_packet_append_chunk(packet, chunk);
        }

        /* Without a route, max_size stays at the path MTU set above. */
        if (!tp->dst)
                return;

        /* set packet max_size with gso_max_size if gso is enabled*/
        rcu_read_lock();
        /* Refresh the socket's capabilities when its cached dst differs
         * from the transport's; a dst reference is taken before handing
         * tp->dst to sk_setup_caps().
         */
        if (__sk_dst_get(sk) != tp->dst) {
                dst_hold(tp->dst);
                sk_setup_caps(sk, tp->dst);
        }
        /* GSO: the device limit capped at GSO_LEGACY_MAX_SIZE; otherwise
         * the association's path MTU.
         */
        packet->max_size = sk_can_gso(sk) ? min(READ_ONCE(tp->dst->dev->gso_max_size),
                                                GSO_LEGACY_MAX_SIZE)
                                          : asoc->pathmtu;
        rcu_read_unlock();
}

/* Initialize the packet structure: bind it to @transport, record the
 * ports, start with an empty chunk list, zero overhead and a zero vtag.
 */
void sctp_packet_init(struct sctp_packet *packet,
                      struct sctp_transport *transport,
                      __u16 sport, __u16 dport)
{
        pr_debug("%s: packet:%p transport:%p\n", __func__, packet, transport);

        packet->transport = transport;
        packet->vtag = 0;
        packet->destination_port = dport;
        packet->source_port = sport;

        INIT_LIST_HEAD(&packet->chunk_list);

        /* The overhead will be calculated by sctp_packet_config() */
        packet->overhead = 0;
        /* Must follow the line above: the reset copies overhead into size. */
        sctp_packet_reset(packet);
}

/* Free a packet.  */
void sctp_packet_free(struct sctp_packet *packet)
{
        struct sctp_chunk *chunk, *tmp;

        pr_debug("%s: packet:%p\n", __func__, packet);

        list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
                list_del_init(&chunk->list);
                sctp_chunk_free(chunk);
        }
}

/* Try to add @chunk to @packet.  When the packet is reported as full and
 * it does not carry a COOKIE_ECHO, the packet is transmitted first and,
 * unless @one_packet is set, the append is retried on the now-empty packet.
 * A packet holding a COOKIE_ECHO is never flushed here: data may be bundled
 * with it only as long as it fits, and anything beyond that has to wait
 * for the COOKIE_ACK.
 *
 * Returns the status of the last append attempt.
 */
enum sctp_xmit sctp_packet_transmit_chunk(struct sctp_packet *packet,
                                          struct sctp_chunk *chunk,
                                          int one_packet, gfp_t gfp)
{
        enum sctp_xmit status;
        int err;

        pr_debug("%s: packet:%p size:%zu chunk:%p size:%d\n", __func__,
                 packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1);

        status = sctp_packet_append_chunk(packet, chunk);

        /* Anything other than "full, and no COOKIE_ECHO on board" is
         * handed straight back to the caller.
         */
        if (status != SCTP_XMIT_PMTU_FULL || packet->has_cookie_echo)
                return status;

        err = sctp_packet_transmit(packet, gfp);
        if (err < 0)
                chunk->skb->sk->sk_err = -err;

        /* An empty packet can NOT ever report PMTU_FULL, so the retry
         * below is bounded.
         */
        if (!one_packet)
                status = sctp_packet_append_chunk(packet, chunk);

        return status;
}

/* For a path MTU probe, bundle a pad chunk behind the heartbeat chunk so
 * the packet reaches the transport's probe size.  Chunks that are not
 * probes are left untouched.
 *
 * Returns SCTP_XMIT_DELAY if the pad chunk cannot be created, otherwise
 * SCTP_XMIT_OK.
 */
static enum sctp_xmit sctp_packet_bundle_pad(struct sctp_packet *pkt, struct sctp_chunk *chunk)
{
        struct sctp_transport *tp = pkt->transport;
        struct sctp_chunk *filler;
        int fixed;

        if (!chunk->pmtu_probe)
                return SCTP_XMIT_OK;

        /* Bytes already spoken for; the pad data gets what is left of
         * the probe size.
         */
        fixed = sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) +
                sizeof(struct sctp_sender_hb_info) + sizeof(struct sctp_pad_chunk);

        filler = sctp_make_pad(tp->asoc, tp->pl.probe_size - fixed);
        if (!filler)
                return SCTP_XMIT_DELAY;

        pkt->size += SCTP_PAD4(ntohs(filler->chunk_hdr->length));
        list_add_tail(&filler->list, &pkt->chunk_list);
        chunk->transport = tp;

        return SCTP_XMIT_OK;
}

/* Try to bundle an auth chunk into the packet. */
static enum sctp_xmit sctp_packet_bundle_auth(struct sctp_packet *pkt,
                                              struct sctp_chunk *chunk)
{
        struct sctp_association *asoc = pkt->transport->asoc;
        enum sctp_xmit retval = SCTP_XMIT_OK;
        struct sctp_chunk *auth;

        /* if we don't have an association, we can't do authentication */
        if (!asoc)
                return retval;

        /* See if this is an auth chunk we are bundling or if
         * auth is already bundled.
         */
        if (chunk->chunk_hdr->type == SCTP_CID_AUTH || pkt->has_auth)
                return retval;

        /* if the peer did not request this chunk to be authenticated,
         * don't do it
         */
        if (!chunk->auth)
                return retval;

        auth = sctp_make_auth(asoc, chunk->shkey->key_id);
        if (!auth)
                return retval;

        auth->shkey = chunk->shkey;
        sctp_auth_shkey_hold(auth->shkey);

        retval = __sctp_packet_append_chunk(pkt, auth);

        if (retval != SCTP_XMIT_OK)
                sctp_chunk_free(auth);

        return retval;
}

/* Try to bundle a SACK with the packet. */
static enum sctp_xmit sctp_packet_bundle_sack(struct sctp_packet *pkt,
                                              struct sctp_chunk *chunk)
{
        enum sctp_xmit retval = SCTP_XMIT_OK;

        /* If sending DATA and haven't aleady bundled a SACK, try to
         * bundle one in to the packet.
         */
        if (sctp_chunk_is_data(chunk) && !pkt->has_sack &&
            !pkt->has_cookie_echo) {
                struct sctp_association *asoc;
                struct timer_list *timer;
                asoc = pkt->transport->asoc;
                timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK];

                /* If the SACK timer is running, we have a pending SACK */
                if (timer_pending(timer)) {
                        struct sctp_chunk *sack;

                        if (pkt->transport->sack_generation !=
                            pkt->transport->asoc->peer.sack_generation)
                                return retval;

                        asoc->a_rwnd = asoc->rwnd;
                        sack = sctp_make_sack(asoc);
                        if (sack) {
                                retval = __sctp_packet_append_chunk(pkt, sack);
                                if (retval != SCTP_XMIT_OK) {
                                        sctp_chunk_free(sack);
                                        goto out;
                                }
                                SCTP_INC_STATS(asoc->base.net,
                                               SCTP_MIB_OUTCTRLCHUNKS);
                                asoc->stats.octrlchunks++;
                                asoc->peer.sack_needed = 0;
                                if (del_timer(timer))
                                        sctp_association_put(asoc);
                        }
                }
        }
out:
        return retval;
}


/* Append a chunk to the offered packet reporting back any inability to do
 * so.
 */
static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet,
                                                 struct sctp_chunk *chunk)
{
        __u16 chunk_len = SCTP_PAD4(ntohs(chunk->chunk_hdr->length));
        enum sctp_xmit retval = SCTP_XMIT_OK;

        /* Check to see if this chunk will fit into the packet */
        retval = sctp_packet_will_fit(packet, chunk, chunk_len);
        if (retval != SCTP_XMIT_OK)
                goto finish;

        /* We believe that this chunk is OK to add to the packet */
        switch (chunk->chunk_hdr->type) {
        case SCTP_CID_DATA:
        case SCTP_CID_I_DATA:
                /* Account for the data being in the packet */
                sctp_packet_append_data(packet, chunk);
                /* Disallow SACK bundling after DATA. */
                packet->has_sack = 1;
                /* Disallow AUTH bundling after DATA */
                packet->has_auth = 1;
                /* Let it be knows that packet has DATA in it */
                packet->has_data = 1;
                /* timestamp the chunk for rtx purposes */
                chunk->sent_at = jiffies;
                /* Mainly used for prsctp RTX policy */
                chunk->sent_count++;
                break;
        case SCTP_CID_COOKIE_ECHO:
                packet->has_cookie_echo = 1;
                break;

        case SCTP_CID_SACK:
                packet->has_sack = 1;
                if (chunk->asoc)
                        chunk->asoc->stats.osacks++;
                break;

        case SCTP_CID_AUTH:
                packet->has_auth = 1;
                packet->auth = chunk;
                break;
        }

        /* It is OK to send this chunk.  */
        list_add_tail(&chunk->list, &packet->chunk_list);
        packet->size += chunk_len;
        chunk->transport = packet->transport;
finish:
        return retval;
}

/* Append a chunk to the offered packet reporting back any inability to do
 * so.
 */
enum sctp_xmit sctp_packet_append_chunk(struct sctp_packet *packet,
                                        struct sctp_chunk *chunk)
{
        enum sctp_xmit retval = SCTP_XMIT_OK;

        pr_debug("%s: packet:%p chunk:%p\n", __func__, packet, chunk);

        /* Data chunks are special.  Before seeing what else we can
         * bundle into this packet, check to see if we are allowed to
         * send this DATA.
         */
        if (sctp_chunk_is_data(chunk)) {
                retval = sctp_packet_can_append_data(packet, chunk);
                if (retval != SCTP_XMIT_OK)
                        goto finish;
        }

        /* Try to bundle AUTH chunk */
        retval = sctp_packet_bundle_auth(packet, chunk);
        if (retval != SCTP_XMIT_OK)
                goto finish;

        /* Try to bundle SACK chunk */
        retval = sctp_packet_bundle_sack(packet, chunk);
        if (retval != SCTP_XMIT_OK)
                goto finish;

        retval = __sctp_packet_append_chunk(packet, chunk);
        if (retval != SCTP_XMIT_OK)
                goto finish;

        retval = sctp_packet_bundle_pad(packet, chunk);

finish:
        return retval;
}

/* Chain @skb onto the GSO @head.  The first appended skb becomes @head's
 * frag_list; later ones are linked behind the previously appended skb,
 * which is remembered in SCTP_OUTPUT_CB(head)->last.  @head's length and
 * truesize, and the owning socket's sk_wmem_alloc, grow accordingly.
 */
static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb)
{
        struct sk_buff *tail = SCTP_OUTPUT_CB(head)->last;

        /* "last" still pointing at head means nothing was appended yet. */
        if (tail == head)
                skb_shinfo(head)->frag_list = skb;
        else
                tail->next = skb;
        SCTP_OUTPUT_CB(head)->last = skb;

        head->len += skb->len;
        head->data_len += skb->len;
        head->truesize += skb->truesize;
        refcount_add(skb->truesize, &head->sk->sk_wmem_alloc);

        __skb_header_release(skb);
}

/* Copy the chunks queued on @packet into skb data and set up checksumming.
 *
 * Without @gso the chunks are merged directly into @head.  With @gso a
 * fresh skb is allocated for each group of chunks that fits within the
 * transport's pathmtu and is chained onto @head with
 * sctp_packet_gso_append().  When the packet carries an AUTH chunk, its
 * HMAC is calculated for every skb built and the AUTH chunk is put back
 * at the front of the list while chunks remain.
 *
 * Returns the number of skbs filled (pkt_count), 1 when
 * sctp_checksum_disable is set on the non-GSO path, or 0 on failure (skb
 * allocation failed, or a chunk cannot fit within pathmtu together with
 * the AUTH chunk and the overhead).
 */
static int sctp_packet_pack(struct sctp_packet *packet,
                            struct sk_buff *head, int gso, gfp_t gfp)
{
        struct sctp_transport *tp = packet->transport;
        struct sctp_auth_chunk *auth = NULL;
        struct sctp_chunk *chunk, *tmp;
        int pkt_count = 0, pkt_size;
        struct sock *sk = head->sk;
        struct sk_buff *nskb;
        int auth_len = 0;

        if (gso) {
                skb_shinfo(head)->gso_type = sk->sk_gso_type;
                /* "last == head" marks that nothing has been appended yet */
                SCTP_OUTPUT_CB(head)->last = head;
        } else {
                /* single skb: copy straight into head, skipping the
                 * sizing/allocation step below
                 */
                nskb = head;
                pkt_size = packet->size;
                goto merge;
        }

        do {
                /* calculate the pkt_size and alloc nskb */
                pkt_size = packet->overhead;
                list_for_each_entry_safe(chunk, tmp, &packet->chunk_list,
                                         list) {
                        int padded = SCTP_PAD4(chunk->skb->len);

                        if (chunk == packet->auth)
                                auth_len = padded;
                        else if (auth_len + padded + packet->overhead >
                                 tp->pathmtu)
                                /* this chunk can never fit, even alone
                                 * with the auth chunk: give up
                                 */
                                return 0;
                        else if (pkt_size + padded > tp->pathmtu)
                                /* this skb is full; the chunk goes into
                                 * the next one
                                 */
                                break;
                        pkt_size += padded;
                }
                nskb = alloc_skb(pkt_size + MAX_HEADER, gfp);
                if (!nskb)
                        return 0;
                skb_reserve(nskb, packet->overhead + MAX_HEADER);

merge:
                /* merge chunks into nskb and append nskb into head list */
                /* from here on pkt_size counts the chunk bytes still to copy */
                pkt_size -= packet->overhead;
                list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
                        int padding;

                        list_del_init(&chunk->list);
                        if (sctp_chunk_is_data(chunk)) {
                                /* mark at most one not-yet-retransmitted
                                 * DATA chunk per transport as having an
                                 * RTT measurement in progress
                                 */
                                if (!sctp_chunk_retransmitted(chunk) &&
                                    !tp->rto_pending) {
                                        chunk->rtt_in_progress = 1;
                                        tp->rto_pending = 1;
                                }
                        }

                        /* zero-pad the chunk to a 4-byte multiple */
                        padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len;
                        if (padding)
                                skb_put_zero(chunk->skb, padding);

                        /* remember where the AUTH chunk lands in nskb so
                         * the HMAC can be calculated once the skb is filled
                         */
                        if (chunk == packet->auth)
                                auth = (struct sctp_auth_chunk *)
                                                        skb_tail_pointer(nskb);

                        skb_put_data(nskb, chunk->skb->data, chunk->skb->len);

                        pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n",
                                 chunk,
                                 sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)),
                                 chunk->has_tsn ? "TSN" : "No TSN",
                                 chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0,
                                 ntohs(chunk->chunk_hdr->length), chunk->skb->len,
                                 chunk->rtt_in_progress);

                        pkt_size -= SCTP_PAD4(chunk->skb->len);

                        /* DATA chunks and the AUTH chunk are not freed here */
                        if (!sctp_chunk_is_data(chunk) && chunk != packet->auth)
                                sctp_chunk_free(chunk);

                        if (!pkt_size)
                                break;
                }

                if (auth) {
                        sctp_auth_calculate_hmac(tp->asoc, nskb, auth,
                                                 packet->auth->shkey, gfp);
                        /* free auth if no more chunks, or add it back */
                        if (list_empty(&packet->chunk_list))
                                sctp_chunk_free(packet->auth);
                        else
                                list_add(&packet->auth->list,
                                         &packet->chunk_list);
                }

                if (gso)
                        sctp_packet_gso_append(head, nskb);

                pkt_count++;
        } while (!list_empty(&packet->chunk_list));

        if (gso) {
                memset(head->cb, 0, max(sizeof(struct inet_skb_parm),
                                        sizeof(struct inet6_skb_parm)));
                skb_shinfo(head)->gso_segs = pkt_count;
                skb_shinfo(head)->gso_size = GSO_BY_FRAGS;
                /* the GSO head always takes the CHECKSUM_PARTIAL setup */
                goto chksum;
        }

        if (sctp_checksum_disable)
                return 1;

        /* compute the checksum here unless the device offers
         * NETIF_F_SCTP_CRC and none of xfrm, IP fragmentation or an
         * encap port is in play
         */
        if (!(tp->dst->dev->features & NETIF_F_SCTP_CRC) ||
            dst_xfrm(tp->dst) || packet->ipfragok || tp->encap_port) {
                struct sctphdr *sh =
                        (struct sctphdr *)skb_transport_header(head);

                sh->checksum = sctp_compute_cksum(head, 0);
        } else {
chksum:
                head->ip_summed = CHECKSUM_PARTIAL;
                head->csum_not_inet = 1;
                head->csum_start = skb_transport_header(head) - head->head;
                head->csum_offset = offsetof(struct sctphdr, checksum);
        }

        return pkt_count;
}

/* All packets are sent to the network through this function from
 * sctp_outq_tail().
 *
 * Builds the head skb with the SCTP common header, packs the queued
 * chunks into it via sctp_packet_pack() (as a GSO frame when the packet
 * is larger than the transport's pathmtu and the socket can do GSO), and
 * hands it to the address family's sctp_xmit() hook.
 *
 * On every exit path, including failures, the chunks still queued on
 * @packet are unlinked (non-DATA chunks are freed as well) and the packet
 * is reset.
 *
 * The return value is always 0 for now.
 */
int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
{
        struct sctp_transport *tp = packet->transport;
        struct sctp_association *asoc = tp->asoc;
        struct sctp_chunk *chunk, *tmp;
        int pkt_count, gso = 0;
        struct sk_buff *head;
        struct sctphdr *sh;
        struct sock *sk;

        pr_debug("%s: packet:%p\n", __func__, packet);
        if (list_empty(&packet->chunk_list))
                return 0;
        /* the first queued chunk supplies the owning socket */
        chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
        sk = chunk->skb->sk;

        if (packet->size > tp->pathmtu && !packet->ipfragok && !chunk->pmtu_probe) {
                if (tp->pl.state == SCTP_PL_ERROR) { /* do IP fragmentation if in Error state */
                        packet->ipfragok = 1;
                } else {
                        if (!sk_can_gso(sk)) { /* check gso */
                                /* terminate the line so that no later
                                 * KERN_CONT fragment gets appended to it
                                 */
                                pr_err_once("Trying to GSO but underlying device doesn't support it.\n");
                                goto out;
                        }
                        gso = 1;
                }
        }

        /* alloc head skb */
        /* a GSO head carries only the headers; the chunks live in the
         * skbs that sctp_packet_pack() chains onto it
         */
        head = alloc_skb((gso ? packet->overhead : packet->size) +
                         MAX_HEADER, gfp);
        if (!head)
                goto out;
        skb_reserve(head, packet->overhead + MAX_HEADER);
        skb_set_owner_w(head, sk);

        /* set sctp header */
        sh = skb_push(head, sizeof(struct sctphdr));
        skb_reset_transport_header(head);
        sh->source = htons(packet->source_port);
        sh->dest = htons(packet->destination_port);
        sh->vtag = htonl(packet->vtag);
        sh->checksum = 0;

        /* drop packet if no dst */
        if (!tp->dst) {
                IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
                kfree_skb(head);
                goto out;
        }

        /* pack up chunks */
        pkt_count = sctp_packet_pack(packet, head, gso, gfp);
        if (!pkt_count) {
                kfree_skb(head);
                goto out;
        }
        pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len);

        /* start autoclose timer */
        if (packet->has_data && sctp_state(asoc, ESTABLISHED) &&
            asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) {
                struct timer_list *timer =
                        &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE];
                unsigned long timeout =
                        asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE];

                /* take an association reference only when the timer was
                 * not already pending
                 */
                if (!mod_timer(timer, jiffies + timeout))
                        sctp_association_hold(asoc);
        }

        /* sctp xmit */
        tp->af_specific->ecn_capable(sk);
        if (asoc) {
                asoc->stats.opackets += pkt_count;
                if (asoc->peer.last_sent_to != tp)
                        asoc->peer.last_sent_to = tp;
        }
        head->ignore_df = packet->ipfragok;
        if (tp->dst_pending_confirm)
                skb_set_dst_pending_confirm(head, 1);
        /* neighbour should be confirmed on successful transmission or
         * positive error
         */
        if (tp->af_specific->sctp_xmit(head, tp) >= 0 &&
            tp->dst_pending_confirm)
                tp->dst_pending_confirm = 0;

out:
        /* empty the packet: DATA chunks are only unlinked, everything
         * else is freed
         */
        list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
                list_del_init(&chunk->list);
                if (!sctp_chunk_is_data(chunk))
                        sctp_chunk_free(chunk);
        }
        sctp_packet_reset(packet);
        return 0;
}

/********************************************************************
 * 2nd Level Abstractions
 ********************************************************************/

/* Decide whether the DATA @chunk may go into @packet right now.
 *
 * Returns SCTP_XMIT_RWND_FULL when the peer's rwnd or the transport's
 * cwnd forbids sending, SCTP_XMIT_DELAY when Nagle's algorithm says to
 * wait for more data, and SCTP_XMIT_OK otherwise.
 */
static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet,
                                                  struct sctp_chunk *chunk)
{
        struct sctp_transport *tp = packet->transport;
        struct sctp_association *asoc = tp->asoc;
        struct sctp_outq *outq = &asoc->outqueue;
        size_t peer_rwnd = asoc->peer.rwnd;
        size_t outstanding = outq->outstanding_bytes;
        size_t tp_flight = tp->flight_size;
        size_t chunk_bytes = sctp_data_size(chunk);

        /* RFC 2960 6.1 rule A: no new data may be sent when the peer's
         * rwnd has no room for it.  The exception is a single DATA chunk
         * in flight, which lets the sender probe for an rwnd change whose
         * SACK was lost.  With data already outstanding that exception
         * does not apply.
         */
        if (chunk_bytes > peer_rwnd && outstanding > 0)
                return SCTP_XMIT_RWND_FULL;

        /* RFC 2960 6.1 rule B: no new data to a transport address that
         * already has cwnd or more bytes outstanding.  Per RFC 7.2.4 and
         * the Implementers Guide 2.8, a Fast Retransmit ignores cwnd and
         * is not delayed.
         */
        if (chunk->fast_retransmit != SCTP_NEED_FRTX &&
            tp_flight >= tp->cwnd)
                return SCTP_XMIT_RWND_FULL;

        /* Nagle's algorithm: hold back new small chunks while earlier
         * data is still unacknowledged.  It does not apply with nodelay
         * set or nothing outstanding, unless a delay is being forced.
         */
        if ((sctp_sk(asoc->base.sk)->nodelay || outstanding == 0) &&
            !asoc->force_delay)
                return SCTP_XMIT_OK;

        /* Appending to a packet that already has content, or sending
         * outside the ESTABLISHED state, is never delayed.
         */
        if (!sctp_packet_empty(packet) || !sctp_state(asoc, ESTABLISHED))
                return SCTP_XMIT_OK;

        /* Enough data queued (this chunk plus the rest of the pending
         * data) to fill a packet: send now.
         */
        if (chunk->skb->len + outq->out_qlen > tp->pathmtu -
            packet->overhead - sctp_datachk_len(&chunk->asoc->stream) - 4)
                return SCTP_XMIT_OK;

        /* Large message writes that may have been fragmented are not
         * delayed; everything else waits until all data is acked or a
         * packet can be filled.
         */
        return chunk->msg->can_delay ? SCTP_XMIT_DELAY : SCTP_XMIT_OK;
}

/* Bookkeeping for a DATA chunk entering @packet: grow the in-flight
 * counters, shrink our view of the peer's rwnd (never below zero), and
 * assign the chunk its TSN and stream numbering.
 */
static void sctp_packet_append_data(struct sctp_packet *packet,
                                    struct sctp_chunk *chunk)
{
        struct sctp_transport *tp = packet->transport;
        struct sctp_association *asoc = tp->asoc;
        size_t payload = sctp_data_size(chunk);
        u32 peer_rwnd = asoc->peer.rwnd;

        /* Bytes in flight on this transport and to the receiver overall. */
        tp->flight_size += payload;
        asoc->outqueue.outstanding_bytes += payload;

        /* The receiver's window shrinks by what we send, floored at 0. */
        asoc->peer.rwnd = payload < peer_rwnd ? peer_rwnd - payload : 0;

        sctp_chunk_assign_tsn(chunk);
        asoc->stream.si->assign_number(chunk);
}

/* Check whether @chunk, whose padded length is @chunk_len, may be added
 * to @packet.
 *
 * Returns SCTP_XMIT_OK when the chunk may be appended; in that case
 * packet->ipfragok may have been set to allow fragmentation at the IP
 * layer.  Returns SCTP_XMIT_PMTU_FULL when the packet has to be flushed
 * first, either because of an auth key mismatch or because one of the
 * size limits below is exceeded.
 */
static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet,
                                           struct sctp_chunk *chunk,
                                           u16 chunk_len)
{
        enum sctp_xmit retval = SCTP_XMIT_OK;
        size_t psize, pmtu, maxsize;

        /* Don't bundle in this packet if this chunk's auth key doesn't
         * match other chunks already enqueued on this packet. Also,
         * don't bundle the chunk with auth key if other chunks in this
         * packet don't have auth key.
         */
        if ((packet->auth && chunk->shkey != packet->auth->shkey) ||
            (!packet->auth && chunk->shkey &&
             chunk->chunk_hdr->type != SCTP_CID_AUTH))
                return SCTP_XMIT_PMTU_FULL;

        psize = packet->size;
        /* prefer the association's pathmtu; fall back to the transport's
         * when there is no association
         */
        if (packet->transport->asoc)
                pmtu = packet->transport->asoc->pathmtu;
        else
                pmtu = packet->transport->pathmtu;

        /* Decide if we need to fragment or resubmit later. */
        if (psize + chunk_len > pmtu) {
                /* It's OK to fragment at IP level if any one of the following
                 * is true:
                 *        1. The packet is empty (meaning this chunk is greater
                 *           than the MTU)
                 *        2. The packet doesn't have any data in it yet and data
                 *           requires authentication.
                 */
                if (sctp_packet_empty(packet) ||
                    (!packet->has_data && chunk->auth)) {
                        /* We no longer do re-fragmentation.
                         * Just fragment at the IP layer, if we
                         * actually hit this condition
                         */
                        packet->ipfragok = 1;
                        goto out;
                }

                /* Similarly, if this chunk was built before a PMTU
                 * reduction, we have to fragment it at IP level now. So
                 * if the packet already contains something, we need to
                 * flush.
                 */
                maxsize = pmtu - packet->overhead;
                if (packet->auth)
                        maxsize -= SCTP_PAD4(packet->auth->skb->len);
                if (chunk_len > maxsize)
                        retval = SCTP_XMIT_PMTU_FULL;

                /* It is also okay to fragment if the chunk we are
                 * adding is a control chunk, but only if current packet
                 * is not a GSO one otherwise it causes fragmentation of
                 * a large frame. So in this case we allow the
                 * fragmentation by forcing it to be in a new packet.
                 */
                if (!sctp_chunk_is_data(chunk) && packet->has_data)
                        retval = SCTP_XMIT_PMTU_FULL;

                if (psize + chunk_len > packet->max_size)
                        /* Hit GSO/PMTU limit, gotta flush */
                        retval = SCTP_XMIT_PMTU_FULL;

                if (!packet->transport->burst_limited &&
                    psize + chunk_len > (packet->transport->cwnd >> 1))
                        /* Do not allow a single GSO packet to use more
                         * than half of cwnd.
                         */
                        retval = SCTP_XMIT_PMTU_FULL;

                if (packet->transport->burst_limited &&
                    psize + chunk_len > (packet->transport->burst_limited >> 1))
                        /* Do not allow a single GSO packet to use more
                         * than half of original cwnd.
                         */
                        retval = SCTP_XMIT_PMTU_FULL;
                /* Otherwise it will fit in the GSO packet */
        }

out:
        return retval;
}



















































































    1 

    1 













































































































    3 







    3 

















    3 
    3 































































































































    6 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Berkeley style UIO structures        -        Alan Cox 1994.
 */
#ifndef __LINUX_UIO_H
#define __LINUX_UIO_H

#include <linux/kernel.h>
#include <linux/thread_info.h>
#include <linux/mm_types.h>
#include <uapi/linux/uio.h>

struct page;

typedef unsigned int __bitwise iov_iter_extraction_t;

/* One memory segment described by a base pointer and a length. */
struct kvec {
        void *iov_base; /* and that should *never* hold a userland pointer */
        size_t iov_len; /* NOTE(review): presumably the segment size in bytes - confirm */
};

/*
 * Kinds of iov_iter.  The value is kept in iov_iter.iter_type and read
 * back with iov_iter_type(); user_backed_iter() treats ITER_UBUF and
 * ITER_IOVEC as the user-backed kinds.
 */
enum iter_type {
        /* iter types */
        ITER_UBUF,
        ITER_IOVEC,
        ITER_BVEC,
        ITER_KVEC,
        ITER_XARRAY,
        ITER_DISCARD,
};

/*
 * Data direction values.  iov_iter_rw() reports WRITE for an iterator
 * whose data_source is set and READ otherwise.
 */
#define ITER_SOURCE        1        // == WRITE
#define ITER_DEST        0        // == READ

/*
 * Snapshot of an iov_iter's iov_offset, count and nr_segs, as filled in
 * by iov_iter_save_state().
 */
struct iov_iter_state {
        size_t iov_offset;
        size_t count;
        unsigned long nr_segs;
};

/*
 * Iterator over one of several kinds of buffer description; iter_type
 * records which member of the pointer union below is in use.
 */
struct iov_iter {
        u8 iter_type;           /* an enum iter_type value, see iov_iter_type() */
        bool nofault;
        bool data_source;       /* set: iov_iter_rw() reports WRITE, else READ */
        size_t iov_offset;      /* offset applied to the current iovec by iter_iov_addr()/iter_iov_len() */
        /*
         * Hack alert: overlay ubuf_iovec with iovec + count, so
         * that the members resolve correctly regardless of the type
         * of iterator used. This means that you can use:
         *
         * &iter->__ubuf_iovec or iter->__iov
         *
         * interchangeably for the user_backed cases, hence simplifying
         * some of the cases that need to deal with both.
         */
        union {
                /*
                 * This really should be a const, but we cannot do that without
                 * also modifying any of the zero-filling iter init functions.
                 * Leave it non-const for now, but it should be treated as such.
                 */
                struct iovec __ubuf_iovec;
                struct {
                        union {
                                /* use iter_iov() to get the current vec */
                                const struct iovec *__iov;
                                const struct kvec *kvec;
                                const struct bio_vec *bvec;
                                struct xarray *xarray;
                                void __user *ubuf;
                        };
                        size_t count;
                };
        };
        /* NOTE(review): presumably xarray_start applies to ITER_XARRAY and nr_segs to the vector kinds - confirm */
        union {
                unsigned long nr_segs;
                loff_t xarray_start;
        };
};

/*
 * Return the iovec the iterator currently refers to: the embedded
 * __ubuf_iovec for an ITER_UBUF iterator, the __iov pointer otherwise.
 */
static inline const struct iovec *iter_iov(const struct iov_iter *iter)
{
        return iter->iter_type == ITER_UBUF ?
                (const struct iovec *) &iter->__ubuf_iovec : iter->__iov;
}

#define iter_iov_addr(iter)        (iter_iov(iter)->iov_base + (iter)->iov_offset)
#define iter_iov_len(iter)        (iter_iov(iter)->iov_len - (iter)->iov_offset)

/* Return the iterator's type tag as an enum iter_type. */
static inline enum iter_type iov_iter_type(const struct iov_iter *i)
{
        return (enum iter_type)i->iter_type;
}

static inline void iov_iter_save_state(struct iov_iter *iter,
                                       struct iov_iter_state *state)
{
        state->iov_offset = iter->iov_offset;
        state->count = iter->count;
        state->nr_segs = iter->nr_segs;
}

static inline bool iter_is_ubuf(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_UBUF;
}

static inline bool iter_is_iovec(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_IOVEC;
}

static inline bool iov_iter_is_kvec(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_KVEC;
}

static inline bool iov_iter_is_bvec(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_BVEC;
}

static inline bool iov_iter_is_discard(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_DISCARD;
}

static inline bool iov_iter_is_xarray(const struct iov_iter *i)
{
        return iov_iter_type(i) == ITER_XARRAY;
}

/*
 * Report the iterator's direction: WRITE when data_source is set,
 * READ otherwise.
 */
static inline unsigned char iov_iter_rw(const struct iov_iter *i)
{
        if (i->data_source)
                return WRITE;
        return READ;
}

static inline bool user_backed_iter(const struct iov_iter *i)
{
        return iter_is_ubuf(i) || iter_is_iovec(i);
}

/*
 * Sum the iov_len of the first @nr_segs entries of @iov.
 *
 * NOTE: the sum is accumulated in a size_t with no overflow check, so this
 * is only safe to call once every segment length has been validated; the
 * individual lengths can overflow a size_t when added together.
 */
static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
{
        size_t total = 0;

        /* @iov is only advanced while segments remain. */
        while (nr_segs--)
                total += (iov++)->iov_len;
        return total;
}

size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
                                  size_t bytes, struct iov_iter *i);
void iov_iter_advance(struct iov_iter *i, size_t bytes);
void iov_iter_revert(struct iov_iter *i, size_t bytes);
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes);
size_t iov_iter_single_seg_count(const struct iov_iter *i);
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i);
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i);

size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);

/*
 * Folio wrapper around copy_page_to_iter(): forwards the folio's embedded
 * page together with @offset, @bytes and @i, and returns its result.
 */
static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
                size_t bytes, struct iov_iter *i)
{
        struct page *page = &folio->page;

        return copy_page_to_iter(page, offset, bytes, i);
}

/*
 * Folio wrapper around copy_page_from_iter_atomic(): forwards the folio's
 * embedded page together with @offset, @bytes and @i, and returns its result.
 */
static inline size_t copy_folio_from_iter_atomic(struct folio *folio,
                size_t offset, size_t bytes, struct iov_iter *i)
{
        struct page *page = &folio->page;

        return copy_page_from_iter_atomic(page, offset, bytes, i);
}

size_t copy_page_to_iter_nofault(struct page *page, unsigned offset,
                                 size_t bytes, struct iov_iter *i);

/*
 * Checked front end for _copy_to_iter(): returns 0 without copying when
 * check_copy_size() rejects (@addr, @bytes), otherwise returns whatever
 * _copy_to_iter() returns.
 */
static __always_inline __must_check
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        if (!check_copy_size(addr, bytes, true))
                return 0;
        return _copy_to_iter(addr, bytes, i);
}

/*
 * Checked front end for _copy_from_iter(): returns 0 without copying when
 * check_copy_size() rejects (@addr, @bytes), otherwise returns whatever
 * _copy_from_iter() returns.
 */
static __always_inline __must_check
size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        if (!check_copy_size(addr, bytes, false))
                return 0;
        return _copy_from_iter(addr, bytes, i);
}

/*
 * All-or-nothing variant of copy_to_iter(): returns true only if the copy
 * reported exactly @bytes. On a short copy the iterator is reverted by the
 * amount that was reported and false is returned.
 */
static __always_inline __must_check
bool copy_to_iter_full(const void *addr, size_t bytes, struct iov_iter *i)
{
        const size_t done = copy_to_iter(addr, bytes, i);

        if (unlikely(done != bytes)) {
                iov_iter_revert(i, done);
                return false;
        }
        return true;
}

/*
 * All-or-nothing variant of copy_from_iter(): returns true only if the copy
 * reported exactly @bytes. On a short copy the iterator is reverted by the
 * amount that was reported and false is returned.
 */
static __always_inline __must_check
bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
{
        const size_t done = copy_from_iter(addr, bytes, i);

        if (unlikely(done != bytes)) {
                iov_iter_revert(i, done);
                return false;
        }
        return true;
}

/*
 * Checked front end for _copy_from_iter_nocache(): returns 0 without copying
 * when check_copy_size() rejects (@addr, @bytes), otherwise returns whatever
 * _copy_from_iter_nocache() returns.
 */
static __always_inline __must_check
size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        if (!check_copy_size(addr, bytes, false))
                return 0;
        return _copy_from_iter_nocache(addr, bytes, i);
}

/*
 * All-or-nothing variant of copy_from_iter_nocache(): returns true only if
 * the copy reported exactly @bytes. On a short copy the iterator is reverted
 * by the amount that was reported and false is returned.
 */
static __always_inline __must_check
bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        const size_t done = copy_from_iter_nocache(addr, bytes, i);

        if (unlikely(done != bytes)) {
                iov_iter_revert(i, done);
                return false;
        }
        return true;
}

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/*
 * Note, users like pmem that depend on the stricter semantics of
 * _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for
 * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the
 * destination is flushed from the cache on return.
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
#else
#define _copy_from_iter_flushcache _copy_from_iter_nocache
#endif

#ifdef CONFIG_ARCH_HAS_COPY_MC
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
#else
#define _copy_mc_to_iter _copy_to_iter
#endif

size_t iov_iter_zero(size_t bytes, struct iov_iter *);
bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
                        unsigned len_mask);
unsigned long iov_iter_alignment(const struct iov_iter *i);
unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov,
                        unsigned long nr_segs, size_t count);
void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec,
                        unsigned long nr_segs, size_t count);
void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
                        unsigned long nr_segs, size_t count);
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
                     loff_t start, size_t count);
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
                        size_t maxsize, unsigned maxpages, size_t *start);
ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
                        size_t maxsize, size_t *start);
int iov_iter_npages(const struct iov_iter *i, int maxpages);
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);

const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);

/* Return the iterator's current count field. */
static inline size_t iov_iter_count(const struct iov_iter *i)
{
        const size_t count = i->count;

        return count;
}

/*
 * Cap the iov_iter at the given limit. The second argument is an upper
 * bound, *not* the new size: a value greater than the amount of data
 * already in the iov_iter leaves it untouched.
 */
static inline void iov_iter_truncate(struct iov_iter *i, u64 count)
{
        /*
         * count doesn't have to fit in size_t - the comparison extends both
         * operands to u64, and any value that would be truncated by the
         * conversion in the assignment is by definition greater than all
         * values of size_t, including the old i->count.
         */
        if (count < i->count)
                i->count = count;
}

/*
 * Re-expand a previously truncated iterator by storing @count back into
 * i->count. No validation is done here: count must be no more than what the
 * iterator held before it was shrunk.
 */
static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
{
        i->count = count;
}

/*
 * Return iov_iter_npages(@i, @maxpages) as computed with the iterator
 * temporarily capped to @max_bytes. The bytes cut off by the cap are added
 * back to the count before returning.
 */
static inline int
iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes)
{
        const size_t total = iov_iter_count(i);
        size_t excess = 0;
        int npages;

        if (total > max_bytes) {
                excess = total - max_bytes;
                iov_iter_truncate(i, max_bytes);
        }

        npages = iov_iter_npages(i, maxpages);

        /* Undo the temporary cap, if one was applied. */
        if (excess)
                iov_iter_reexpand(i, iov_iter_count(i) + excess);

        return npages;
}

struct iovec *iovec_from_user(const struct iovec __user *uvector,
                unsigned long nr_segs, unsigned long fast_segs,
                struct iovec *fast_iov, bool compat);
ssize_t import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i);
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i, bool compat);
int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i);

/*
 * Initialise @i as an ITER_UBUF iterator over the user buffer @buf of
 * @count bytes. Warns if @direction has bits outside READ | WRITE. Every
 * field not named in the initialiser is zeroed.
 */
static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
                        void __user *buf, size_t count)
{
        const struct iov_iter fresh = {
                .iter_type = ITER_UBUF,
                .data_source = direction,
                .ubuf = buf,
                .count = count,
                .nr_segs = 1
        };

        WARN_ON(direction & ~(READ | WRITE));
        *i = fresh;
}
/* Flags for iov_iter_get/extract_pages*() */
/* Allow P2PDMA on the extracted pages */
#define ITER_ALLOW_P2PDMA        ((__force iov_iter_extraction_t)0x01)

ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
                               size_t maxsize, unsigned int maxpages,
                               iov_iter_extraction_t extraction_flags,
                               size_t *offset0);

/**
 * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained
 * @iter: The iterator
 *
 * Examine the iterator and indicate by returning true or false as to how, if
 * at all, pages extracted from the iterator will be retained by the extraction
 * function.
 *
 * %true indicates that the pages will have a pin placed in them that the
 * caller must unpin.  This must be done for DMA/async DIO to force fork()
 * to forcibly copy a page for the child (the parent must retain the original
 * page).
 *
 * %false indicates that no measures are taken and that it's up to the caller
 * to retain the pages.
 *
 * The answer is %true exactly for the user-backed iterator types
 * (ITER_UBUF and ITER_IOVEC).
 */
static inline bool iov_iter_extract_will_pin(const struct iov_iter *iter)
{
        return iter_is_ubuf(iter) || iter_is_iovec(iter);
}

struct sg_table;
ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t len,
                           struct sg_table *sgtable, unsigned int sg_max,
                           iov_iter_extraction_t extraction_flags);

#endif






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_TLB_H
#define _ASM_X86_TLB_H

#define tlb_flush tlb_flush
static inline void tlb_flush(struct mmu_gather *tlb);

#include <asm-generic/tlb.h>

static inline void tlb_flush(struct mmu_gather *tlb)
{
        unsigned long start = 0UL, end = TLB_FLUSH_ALL;
        unsigned int stride_shift = tlb_get_unmap_shift(tlb);

        if (!tlb->fullmm && !tlb->need_flush_all) {
                start = tlb->start;
                end = tlb->end;
        }

        flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
}

/*
 * While x86 architecture in general requires an IPI to perform TLB
 * shootdown, enablement code for several hypervisors overrides
 * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing
 * a hypercall. To keep software pagetable walkers safe in this case we
 * switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the comment
 * below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
 * for more details.
 *
 * The hook itself just hands @table to free_page_and_swap_cache().
 */
static inline void __tlb_remove_table(void *table)
{
        free_page_and_swap_cache(table);
}

#endif /* _ASM_X86_TLB_H */

































































































































































































































































































































































































































































































    2 





    2 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






    1 











    1 
    1 



    1 







    1 
    1 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/workqueue.h>
#include <linux/rtnetlink.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/idr.h>
#include <linux/rculist.h>
#include <linux/nsproxy.h>
#include <linux/fs.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
#include <linux/net_namespace.h>
#include <linux/sched/task.h>
#include <linux/uidgid.h>
#include <linux/cookie.h>
#include <linux/proc_fs.h>

#include <net/sock.h>
#include <net/netlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

/*
 *        Our network namespace constructor/destructor lists
 */

static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list;

LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list);

/* Protects net_namespace_list. Nests inside rtnl_lock() */
DECLARE_RWSEM(net_rwsem);
EXPORT_SYMBOL_GPL(net_rwsem);

#ifdef CONFIG_KEYS
static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) };
#endif

struct net init_net;
EXPORT_SYMBOL(init_net);

static bool init_net_initialized;
/*
 * pernet_ops_rwsem: protects: pernet_list, net_generic_ids,
 * init_net_initialized and first_device pointer.
 * This is internal net namespace object. Please, don't use it
 * outside.
 */
DECLARE_RWSEM(pernet_ops_rwsem);
EXPORT_SYMBOL_GPL(pernet_ops_rwsem);

#define MIN_PERNET_OPS_ID        \
        ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))

#define INITIAL_NET_GEN_PTRS        13 /* +1 for len +2 for rcu_head */

static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;

DEFINE_COOKIE(net_cookie);

static struct net_generic *net_alloc_generic(void)
{
        unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs);
        unsigned int generic_size;
        struct net_generic *ng;

        generic_size = offsetof(struct net_generic, ptr[gen_ptrs]);

        ng = kzalloc(generic_size, GFP_KERNEL);
        if (ng)
                ng->s.len = gen_ptrs;

        return ng;
}

/*
 * Store @data in slot @id of @net's generic pointer array.
 *
 * If the current array already has a slot for @id, it is written in place.
 * Otherwise a new array is allocated with net_alloc_generic(), the existing
 * pointers are copied over, the new array is published with
 * rcu_assign_pointer() and the old one is freed after a grace period.
 *
 * The rcu_dereference_protected() annotation expects pernet_ops_rwsem to be
 * held. @id below MIN_PERNET_OPS_ID is a BUG.
 *
 * Returns 0 on success, -ENOMEM if the larger array cannot be allocated.
 */
static int net_assign_generic(struct net *net, unsigned int id, void *data)
{
        struct net_generic *ng, *old_ng;

        BUG_ON(id < MIN_PERNET_OPS_ID);

        old_ng = rcu_dereference_protected(net->gen,
                                           lockdep_is_held(&pernet_ops_rwsem));
        /* Fast path: the slot already exists in the current array. */
        if (old_ng->s.len > id) {
                old_ng->ptr[id] = data;
                return 0;
        }

        /*
         * NOTE(review): the new array is sized from max_gen_ptrs; presumably
         * that already covers @id - confirm against the registration path.
         */
        ng = net_alloc_generic();
        if (!ng)
                return -ENOMEM;

        /*
         * Some synchronisation notes:
         *
         * The net_generic explores the net->gen array inside rcu
         * read section. Besides once set the net->gen->ptr[x]
         * pointer never changes (see rules in netns/generic.h).
         *
         * That said, we simply duplicate this array and schedule
         * the old copy for kfree after a grace period.
         */

        /* Copy only the pointer slots, starting at MIN_PERNET_OPS_ID. */
        memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID],
               (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *));
        ng->ptr[id] = data;

        rcu_assign_pointer(net->gen, ng);
        kfree_rcu(old_ng, s.rcu);
        return 0;
}

/*
 * Run the per-namespace initialisation of @ops for @net.
 *
 * When @ops has both an id and a size, a zeroed private area of ops->size
 * bytes is allocated and stored in net->gen through net_assign_generic()
 * before ops->init() is called. If ops->init() fails, the slot is cleared
 * again and the private area is freed.
 *
 * Returns 0 on success; -ENOMEM if the private area cannot be allocated,
 * otherwise the error from net_assign_generic() or ops->init().
 */
static int ops_init(const struct pernet_operations *ops, struct net *net)
{
        struct net_generic *ng;
        int err = -ENOMEM;
        void *data = NULL;

        if (ops->id && ops->size) {
                data = kzalloc(ops->size, GFP_KERNEL);
                if (!data)
                        goto out;

                err = net_assign_generic(net, *ops->id, data);
                if (err)
                        goto cleanup;
        }
        err = 0;
        if (ops->init)
                err = ops->init(net);
        if (!err)
                return 0;

        /* init failed: unpublish the private area before freeing it below. */
        if (ops->id && ops->size) {
                ng = rcu_dereference_protected(net->gen,
                                               lockdep_is_held(&pernet_ops_rwsem));
                ng->ptr[*ops->id] = NULL;
        }

cleanup:
        /* data is NULL when no private area was allocated; kfree(NULL) is fine. */
        kfree(data);

out:
        return err;
}

/*
 * Call ops->pre_exit(), if @ops provides one, on every namespace linked on
 * @net_exit_list through its exit_list member.
 */
static void ops_pre_exit_list(const struct pernet_operations *ops,
                              struct list_head *net_exit_list)
{
        struct net *dying;

        if (!ops->pre_exit)
                return;

        list_for_each_entry(dying, net_exit_list, exit_list)
                ops->pre_exit(dying);
}

/*
 * Run the exit hooks of @ops over @net_exit_list: first ops->exit() on each
 * namespace in turn (with a cond_resched() after every call), then
 * ops->exit_batch() once on the whole list. Either hook may be absent.
 */
static void ops_exit_list(const struct pernet_operations *ops,
                          struct list_head *net_exit_list)
{
        struct net *dying;

        if (ops->exit) {
                list_for_each_entry(dying, net_exit_list, exit_list) {
                        ops->exit(dying);
                        cond_resched();
                }
        }

        if (ops->exit_batch)
                ops->exit_batch(net_exit_list);
}

/*
 * Free the private area that @ops owns in every namespace on
 * @net_exit_list. Does nothing unless @ops has both a size and an id.
 */
static void ops_free_list(const struct pernet_operations *ops,
                          struct list_head *net_exit_list)
{
        struct net *dying;

        if (!ops->size || !ops->id)
                return;

        list_for_each_entry(dying, net_exit_list, exit_list)
                kfree(net_generic(dying, *ops->id));
}

/*
 * Allocate an id for @peer in @net's netns_ids idr and return whatever
 * idr_alloc() returns. A non-negative @reqid restricts the allocation to
 * that exact id (range [reqid, reqid + 1)); a negative @reqid passes the
 * range 0, 0 instead.
 *
 * Should be called with nsid_lock held.
 */
static int alloc_netid(struct net *net, struct net *peer, int reqid)
{
        if (reqid < 0)
                return idr_alloc(&net->netns_ids, peer, 0, 0, GFP_ATOMIC);

        return idr_alloc(&net->netns_ids, peer, reqid, reqid + 1, GFP_ATOMIC);
}

/* Callback for idr_for_each(). When net is equal to peer, the function
 * returns the id so that idr_for_each() stops. Because id 0 cannot be
 * returned as-is (idr_for_each() would not stop), the magic value
 * NET_ID_ZERO (-1) is returned in its place.
 */
#define NET_ID_ZERO -1
static int net_eq_idr(int id, void *net, void *peer)
{
        if (!net_eq(net, peer))
                return 0;

        return id ? id : NET_ID_ZERO;
}

/*
 * Look @peer up in @net's netns_ids and return its id, or
 * NETNSA_NSID_NOT_ASSIGNED when it has none.
 *
 * Must be called from RCU-critical section or with nsid_lock held.
 */
static int __peernet2id(const struct net *net, struct net *peer)
{
        const int found = idr_for_each(&net->netns_ids, net_eq_idr, peer);

        if (found > 0)
                return found;

        /* NET_ID_ZERO is the magic value net_eq_idr() uses for id 0. */
        return found == NET_ID_ZERO ? 0 : NETNSA_NSID_NOT_ASSIGNED;
}

static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
                              struct nlmsghdr *nlh, gfp_t gfp);
/* This function returns the id of a peer netns. If no id is assigned, one will
 * be allocated and returned.
 *
 * Returns NETNSA_NSID_NOT_ASSIGNED when @net's refcount is already zero, when
 * @peer can no longer be referenced, or when the id allocation fails. A
 * newly allocated id is announced with an RTM_NEWNSID notification using
 * @gfp.
 */
int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
{
        int id;

        /* Nothing is allocated in a namespace whose refcount is already zero. */
        if (refcount_read(&net->ns.count) == 0)
                return NETNSA_NSID_NOT_ASSIGNED;

        spin_lock_bh(&net->nsid_lock);
        id = __peernet2id(net, peer);
        if (id >= 0) {
                spin_unlock_bh(&net->nsid_lock);
                return id;
        }

        /* When peer is obtained from RCU lists, we may race with
         * its cleanup. Check whether it's alive, and this guarantees
         * we never hash a peer back to net->netns_ids, after it has
         * just been idr_remove()'d from there in cleanup_net().
         */
        if (!maybe_get_net(peer)) {
                spin_unlock_bh(&net->nsid_lock);
                return NETNSA_NSID_NOT_ASSIGNED;
        }

        id = alloc_netid(net, peer, -1);
        spin_unlock_bh(&net->nsid_lock);

        /* Drop the temporary reference taken by maybe_get_net() above. */
        put_net(peer);
        if (id < 0)
                return NETNSA_NSID_NOT_ASSIGNED;

        /* The notification is sent after nsid_lock has been released. */
        rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp);

        return id;
}
EXPORT_SYMBOL_GPL(peernet2id_alloc);

/*
 * Return the id that @peer has in @net, or NETNSA_NSID_NOT_ASSIGNED if it
 * has none. The lookup is done inside an RCU read-side critical section and
 * never allocates an id.
 */
int peernet2id(const struct net *net, struct net *peer)
{
        int nsid;

        rcu_read_lock();
        nsid = __peernet2id(net, peer);
        rcu_read_unlock();

        return nsid;
}
EXPORT_SYMBOL(peernet2id);

/* This function returns true if the peer netns has an id assigned in the
 * current netns, i.e. if peernet2id() yields a non-negative value.
 */
bool peernet_has_id(const struct net *net, struct net *peer)
{
        const int nsid = peernet2id(net, peer);

        return nsid >= 0;
}

/*
 * Look up the namespace registered under @id in @net's netns_ids.
 *
 * Returns the result of maybe_get_net() on the entry found, or NULL when
 * @id is negative or no entry exists for it.
 */
struct net *get_net_ns_by_id(const struct net *net, int id)
{
        struct net *found;

        if (id < 0)
                return NULL;

        rcu_read_lock();
        found = idr_find(&net->netns_ids, id);
        found = found ? maybe_get_net(found) : NULL;
        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL_GPL(get_net_ns_by_id);

/*
 * init code that must occur even if setup_net() is not called.
 *
 * Currently this only initialises net->notrefcnt_tracker, the tracker
 * directory that net_free() tears down with ref_tracker_dir_exit().
 */
static __net_init void preinit_net(struct net *net)
{
        ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt");
}

/*
 * setup_net runs the initializers for the network namespace object.
 *
 * It initialises the core fields of @net, then calls ops_init() for every
 * entry on pernet_list in order. On success the namespace is added to
 * net_namespace_list and 0 is returned. If any ops_init() fails, the entries
 * that precede the failing one are torn down in reverse order and the
 * negative error is returned.
 */
static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
{
        /* Must be called with pernet_ops_rwsem held */
        const struct pernet_operations *ops, *saved_ops;
        LIST_HEAD(net_exit_list);
        LIST_HEAD(dev_kill_list);
        int error = 0;

        refcount_set(&net->ns.count, 1);
        ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");

        refcount_set(&net->passive, 1);
        get_random_bytes(&net->hash_mix, sizeof(u32));
        /* gen_cookie_next() is called with preemption disabled. */
        preempt_disable();
        net->net_cookie = gen_cookie_next(&net_cookie);
        preempt_enable();
        net->dev_base_seq = 1;
        net->user_ns = user_ns;
        idr_init(&net->netns_ids);
        spin_lock_init(&net->nsid_lock);
        mutex_init(&net->ipv4.ra_mutex);

        list_for_each_entry(ops, &pernet_list, list) {
                error = ops_init(ops, net);
                if (error < 0)
                        goto out_undo;
        }
        /* Every pernet init succeeded: make the namespace visible. */
        down_write(&net_rwsem);
        list_add_tail_rcu(&net->list, &net_namespace_list);
        up_write(&net_rwsem);
out:
        return error;

out_undo:
        /* Walk through the list backwards calling the exit functions
         * for the pernet modules whose init functions did not fail.
         */
        list_add(&net->exit_list, &net_exit_list);
        /*
         * Remember where the forward walk stopped; each reverse pass below
         * restarts from this same position.
         */
        saved_ops = ops;
        /* Pass 1: pre_exit hooks. */
        list_for_each_entry_continue_reverse(ops, &pernet_list, list)
                ops_pre_exit_list(ops, &net_exit_list);

        synchronize_rcu();

        /* Pass 2: exit_batch_rtnl hooks, run under rtnl_lock(). */
        ops = saved_ops;
        rtnl_lock();
        list_for_each_entry_continue_reverse(ops, &pernet_list, list) {
                if (ops->exit_batch_rtnl)
                        ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list);
        }
        unregister_netdevice_many(&dev_kill_list);
        rtnl_unlock();

        /* Pass 3: exit / exit_batch hooks. */
        ops = saved_ops;
        list_for_each_entry_continue_reverse(ops, &pernet_list, list)
                ops_exit_list(ops, &net_exit_list);

        /* Pass 4: free the per-ops private areas. */
        ops = saved_ops;
        list_for_each_entry_continue_reverse(ops, &pernet_list, list)
                ops_free_list(ops, &net_exit_list);

        rcu_barrier();
        goto out;
}

/*
 * pernet init hook: give a namespace its default net->core sysctl values.
 * Always returns 0.
 */
static int __net_init net_defaults_init_net(struct net *net)
{
        net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
        net->core.sysctl_somaxconn = SOMAXCONN;

        /* Limits per socket sk_omem_alloc usage.
         * TCP zerocopy regular usage needs 128 KB.
         */
        net->core.sysctl_optmem_max = 128 * 1024;

        return 0;
}

/*
 * pernet operations for the defaults above: only an init hook is supplied,
 * and no id/size, so ops_init() allocates no private area for it.
 */
static struct pernet_operations net_defaults_ops = {
        .init = net_defaults_init_net,
};

/* Register net_defaults_ops as a pernet subsystem at core_initcall time.
 * A registration failure is fatal (panic); otherwise returns 0.
 */
static __init int net_defaults_init(void)
{
        int err = register_pernet_subsys(&net_defaults_ops);

        if (err)
                panic("Cannot initialize net default settings");

        return 0;
}

core_initcall(net_defaults_init);

#ifdef CONFIG_NET_NS
/* Account one more network namespace (UCOUNT_NET_NAMESPACES) against the
 * current effective uid in user namespace @ns.
 *
 * Returns whatever inc_ucount() hands back; callers in this file treat a
 * NULL result as "limit reached".
 */
static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
{
        struct ucounts *ucounts;

        ucounts = inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);

        return ucounts;
}

/* Undo inc_net_namespaces(): drop one UCOUNT_NET_NAMESPACES count from
 * @ucounts through dec_ucount().
 */
static void dec_net_namespaces(struct ucounts *ucounts)
{
        dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
}

/* Slab cache from which struct net is allocated; created in net_ns_init(). */
static struct kmem_cache *net_cachep __ro_after_init;
/* Workqueue on which __put_net() queues net_cleanup_work; created in
 * net_ns_init().
 */
static struct workqueue_struct *netns_wq;

/* Allocate a zeroed struct net together with its net_generic array and,
 * with CONFIG_KEYS, a key domain tag holding one reference.
 *
 * Returns the new namespace, or NULL if any allocation fails (everything
 * allocated so far is released again).
 */
static struct net *net_alloc(void)
{
        struct net_generic *ng = net_alloc_generic();
        struct net *net;

        if (!ng)
                return NULL;

        net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
        if (!net)
                goto free_generic;

#ifdef CONFIG_KEYS
        net->key_domain = kzalloc(sizeof(struct key_tag), GFP_KERNEL);
        if (!net->key_domain)
                goto free_net;
        refcount_set(&net->key_domain->usage, 1);
#endif

        /* Publish the generic array only once nothing can fail anymore. */
        rcu_assign_pointer(net->gen, ng);
        return net;

#ifdef CONFIG_KEYS
free_net:
        kmem_cache_free(net_cachep, net);
#endif
free_generic:
        kfree(ng);
        return NULL;
}

/* Drop one passive reference on @net.  When that was the last one, free
 * the net_generic array, tear down the notrefcnt tracker directory and
 * return the structure to net_cachep.
 */
static void net_free(struct net *net)
{
        if (!refcount_dec_and_test(&net->passive))
                return;

        kfree(rcu_access_pointer(net->gen));

        /* No trackers are expected to be left at this point. */
        ref_tracker_dir_exit(&net->notrefcnt_tracker);

        kmem_cache_free(net_cachep, net);
}

/* Type-erased wrapper around net_free(): @p is a struct net pointer or
 * NULL, and NULL is silently ignored.
 */
void net_drop_ns(void *p)
{
        struct net *net = p;

        if (!net)
                return;

        net_free(net);
}

/* Produce the network namespace selected by clone @flags.
 *
 * Without CLONE_NEWNET a new reference on @old_net is returned.  Otherwise
 * a fresh namespace owned by @user_ns is allocated, charged to the caller's
 * ucounts and initialised through setup_net() under pernet_ops_rwsem.
 *
 * Returns the namespace, or an ERR_PTR(): -ENOSPC when the ucount charge is
 * refused, -ENOMEM when allocation fails, or the error reported by
 * down_read_killable() or setup_net().
 */
struct net *copy_net_ns(unsigned long flags,
                        struct user_namespace *user_ns, struct net *old_net)
{
        struct ucounts *ucounts;
        struct net *net;
        int rv;

        if (!(flags & CLONE_NEWNET))
                return get_net(old_net);

        ucounts = inc_net_namespaces(user_ns);
        if (!ucounts)
                return ERR_PTR(-ENOSPC);

        net = net_alloc();
        if (!net) {
                rv = -ENOMEM;
                goto dec_ucounts;
        }

        preinit_net(net);
        refcount_set(&net->passive, 1);
        net->ucounts = ucounts;
        get_user_ns(user_ns);

        rv = down_read_killable(&pernet_ops_rwsem);
        if (rv < 0)
                goto put_userns;

        rv = setup_net(net, user_ns);
        up_read(&pernet_ops_rwsem);
        if (rv >= 0)
                return net;

        /* Failure: unwind in the reverse order of the setup above. */
put_userns:
#ifdef CONFIG_KEYS
        key_remove_domain(net->key_domain);
#endif
        put_user_ns(user_ns);
        net_free(net);
dec_ucounts:
        dec_net_namespaces(ucounts);
        return ERR_PTR(rv);
}

/**
 * net_ns_get_ownership - get sysfs ownership data for @net
 * @net: network namespace in question (can be NULL)
 * @uid: kernel user ID for sysfs objects
 * @gid: kernel group ID for sysfs objects
 *
 * Stores the uid/gid of root in the user namespace associated with @net.
 * An id that does not map to a valid kernel id leaves the corresponding
 * output untouched.  For a NULL @net the global root ids are stored.
 */
void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid)
{
        kuid_t ns_root_uid;
        kgid_t ns_root_gid;

        if (!net) {
                *uid = GLOBAL_ROOT_UID;
                *gid = GLOBAL_ROOT_GID;
                return;
        }

        ns_root_uid = make_kuid(net->user_ns, 0);
        ns_root_gid = make_kgid(net->user_ns, 0);

        if (uid_valid(ns_root_uid))
                *uid = ns_root_uid;

        if (gid_valid(ns_root_gid))
                *gid = ns_root_gid;
}
EXPORT_SYMBOL_GPL(net_ns_get_ownership);

/* Remove every nsid that other namespaces have assigned to @net, then
 * destroy @net's own id table.
 *
 * The walk covers net_namespace_list from its head up to and including
 * @last.  For each namespace visited, the id it holds for @net (if any) is
 * removed under that namespace's nsid_lock; the RTM_DELNSID notification is
 * sent only after the lock has been dropped.
 */
static void unhash_nsid(struct net *net, struct net *last)
{
        struct net *tmp;
        /* This function is only called from cleanup_net() work,
         * and this work is the only process, that may delete
         * a net from net_namespace_list. So, when the below
         * is executing, the list may only grow. Thus, we do not
         * use for_each_net_rcu() or net_rwsem.
         */
        for_each_net(tmp) {
                int id;

                spin_lock_bh(&tmp->nsid_lock);
                id = __peernet2id(tmp, net);
                if (id >= 0)
                        idr_remove(&tmp->netns_ids, id);
                spin_unlock_bh(&tmp->nsid_lock);
                /* Notify outside the spinlock; the allocation is GFP_KERNEL. */
                if (id >= 0)
                        rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL,
                                          GFP_KERNEL);
                /* Namespaces after @last are skipped by design, see the
                 * comment at the call site in cleanup_net().
                 */
                if (tmp == last)
                        break;
        }
        /* Finally drop all ids that @net itself had handed out. */
        spin_lock_bh(&net->nsid_lock);
        idr_destroy(&net->netns_ids);
        spin_unlock_bh(&net->nsid_lock);
}

/* Lock-less list of namespaces waiting for teardown: __put_net() adds to
 * it and cleanup_net() takes the whole list in one llist_del_all().
 */
static LLIST_HEAD(cleanup_list);

/* Work handler that tears down every namespace queued on cleanup_list.
 *
 * The batch is unlinked from net_namespace_list, its nsids are removed from
 * the remaining namespaces, and then the pernet pre_exit, exit_batch_rtnl,
 * exit and free stages are run over the whole batch, walking pernet_list in
 * reverse.  pernet_ops_rwsem is held for reading across those stages.  The
 * structures themselves are released only after a final rcu_barrier().
 *
 * @work is not used by the body.
 */
static void cleanup_net(struct work_struct *work)
{
        const struct pernet_operations *ops;
        struct net *net, *tmp, *last;
        struct llist_node *net_kill_list;
        LIST_HEAD(net_exit_list);
        LIST_HEAD(dev_kill_list);

        /* Atomically snapshot the list of namespaces to cleanup */
        net_kill_list = llist_del_all(&cleanup_list);

        down_read(&pernet_ops_rwsem);

        /* Don't let anyone else find us. */
        down_write(&net_rwsem);
        llist_for_each_entry(net, net_kill_list, cleanup_list)
                list_del_rcu(&net->list);
        /* Cache last net. After we unlock rtnl, no one new net
         * added to net_namespace_list can assign nsid pointer
         * to a net from net_kill_list (see peernet2id_alloc()).
         * So, we skip them in unhash_nsid().
         *
         * Note, that unhash_nsid() does not delete nsid links
         * between net_kill_list's nets, as they've already
         * deleted from net_namespace_list. But, this would be
         * useless anyway, as netns_ids are destroyed there.
         *
         * NOTE(review): the text above says "unlock rtnl", but the lock
         * released right below is net_rwsem -- confirm and reword.
         */
        last = list_last_entry(&net_namespace_list, struct net, list);
        up_write(&net_rwsem);

        /* Build the exit batch in the same order as the kill list. */
        llist_for_each_entry(net, net_kill_list, cleanup_list) {
                unhash_nsid(net, last);
                list_add_tail(&net->exit_list, &net_exit_list);
        }

        /* Run all of the network namespace pre_exit methods */
        list_for_each_entry_reverse(ops, &pernet_list, list)
                ops_pre_exit_list(ops, &net_exit_list);

        /*
         * Another CPU might be rcu-iterating the list, wait for it.
         * This needs to be before calling the exit() notifiers, so
         * the rcu_barrier() below isn't sufficient alone.
         * Also the pre_exit() and exit() methods need this barrier.
         */
        synchronize_rcu_expedited();

        /* Batched exit hooks run under RTNL; devices they collect on
         * dev_kill_list are unregistered in a single call.
         */
        rtnl_lock();
        list_for_each_entry_reverse(ops, &pernet_list, list) {
                if (ops->exit_batch_rtnl)
                        ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list);
        }
        unregister_netdevice_many(&dev_kill_list);
        rtnl_unlock();

        /* Run all of the network namespace exit methods */
        list_for_each_entry_reverse(ops, &pernet_list, list)
                ops_exit_list(ops, &net_exit_list);

        /* Free the net generic variables */
        list_for_each_entry_reverse(ops, &pernet_list, list)
                ops_free_list(ops, &net_exit_list);

        up_read(&pernet_ops_rwsem);

        /* Ensure there are no outstanding rcu callbacks using this
         * network namespace.
         */
        rcu_barrier();

        /* Finally it is safe to free my network namespace structure */
        list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
                list_del_init(&net->exit_list);
                dec_net_namespaces(net->ucounts);
#ifdef CONFIG_KEYS
                key_remove_domain(net->key_domain);
#endif
                put_user_ns(net->user_ns);
                net_free(net);
        }
}

/**
 * net_ns_barrier - wait until concurrent net_cleanup_work is done
 *
 * cleanup_net runs from work queue and will first remove namespaces
 * from the global list, then run net exit functions.
 *
 * Call this in module exit path to make sure that all netns
 * ->exit ops have been invoked before the function is removed.
 */
void net_ns_barrier(void)
{
        /* cleanup_net() holds pernet_ops_rwsem for reading while it runs the
         * pernet exit stages, so acquiring it for writing (and releasing it
         * straight away) waits for any such pass that is in progress.
         */
        down_write(&pernet_ops_rwsem);
        up_write(&pernet_ops_rwsem);
}
EXPORT_SYMBOL(net_ns_barrier);

/* Work item whose handler is cleanup_net(); queued on netns_wq below. */
static DECLARE_WORK(net_cleanup_work, cleanup_net);

/* Hand @net over to cleanup_net() for teardown.
 *
 * NOTE(review): presumably reached when the last counted reference on @net
 * is dropped -- the caller is not visible here, confirm against put_net().
 */
void __put_net(struct net *net)
{
        ref_tracker_dir_exit(&net->refcnt_tracker);
        /* Cleanup the network namespace in process context */
        /* The work is queued only when llist_add() returns true.
         * NOTE(review): presumably that means the list was empty before the
         * add, so one queued work covers a whole batch -- confirm in llist.h.
         */
        if (llist_add(&net->cleanup_list, &cleanup_list))
                queue_work(netns_wq, &net_cleanup_work);
}
EXPORT_SYMBOL_GPL(__put_net);

/**
 * get_net_ns - increment the refcount of the network namespace
 * @ns: common namespace (net)
 *
 * Returns the net's common namespace, or ERR_PTR(-EINVAL) when
 * maybe_get_net() could not take a reference (ref is zero).
 */
struct ns_common *get_net_ns(struct ns_common *ns)
{
        struct net *net = maybe_get_net(container_of(ns, struct net, ns));

        if (!net)
                return ERR_PTR(-EINVAL);

        return &net->ns;
}
EXPORT_SYMBOL_GPL(get_net_ns);

/* Resolve file descriptor @fd to a network namespace and take a reference
 * on it.
 *
 * Returns the namespace, ERR_PTR(-EBADF) when @fd has no file behind it,
 * or ERR_PTR(-EINVAL) when the file is not a proc namespace file whose ops
 * are netns_operations.
 */
struct net *get_net_ns_by_fd(int fd)
{
        struct net *net = ERR_PTR(-EINVAL);
        struct fd f = fdget(fd);
        struct ns_common *ns;

        if (!f.file)
                return ERR_PTR(-EBADF);

        if (!proc_ns_file(f.file))
                goto put_fd;

        ns = get_proc_ns(file_inode(f.file));
        if (ns->ops == &netns_operations)
                net = get_net(container_of(ns, struct net, ns));

put_fd:
        fdput(f);

        return net;
}
EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
#endif

/* Look up the task with virtual pid @pid and take a reference on its
 * network namespace.
 *
 * Returns the namespace, or ERR_PTR(-ESRCH) when no such task is found or
 * the task has no nsproxy.
 */
struct net *get_net_ns_by_pid(pid_t pid)
{
        struct net *net = ERR_PTR(-ESRCH);
        struct nsproxy *nsproxy;
        struct task_struct *tsk;

        rcu_read_lock();

        tsk = find_task_by_vpid(pid);
        if (!tsk)
                goto unlock;

        /* nsproxy is sampled and used with the task lock held. */
        task_lock(tsk);
        nsproxy = tsk->nsproxy;
        if (nsproxy)
                net = get_net(nsproxy->net_ns);
        task_unlock(tsk);

unlock:
        rcu_read_unlock();
        return net;
}
EXPORT_SYMBOL_GPL(get_net_ns_by_pid);

/* Pernet init hook for the namespace object itself: on CONFIG_NET_NS builds
 * point @net's ns_common at netns_operations, then allocate its inode
 * number.  Returns the result of ns_alloc_inum().
 */
static __net_init int net_ns_net_init(struct net *net)
{
#ifdef CONFIG_NET_NS
        net->ns.ops = &netns_operations;
#endif
        return ns_alloc_inum(&net->ns);
}

/* Pernet exit hook: give back the inode number that net_ns_net_init()
 * allocated for @net.
 */
static __net_exit void net_ns_net_exit(struct net *net)
{
        ns_free_inum(&net->ns);
}

/* Pernet operations managing the namespace inode number; registered from
 * net_ns_init().
 */
static struct pernet_operations __net_initdata net_ns_ops = {
        .init = net_ns_net_init,
        .exit = net_ns_net_exit,
};

/* Netlink attribute policy used when parsing RTM_NEWNSID / RTM_GETNSID
 * requests: nsids are signed 32-bit values, pid and fd unsigned 32-bit.
 */
static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
        [NETNSA_NONE]                = { .type = NLA_UNSPEC },
        [NETNSA_NSID]                = { .type = NLA_S32 },
        [NETNSA_PID]                = { .type = NLA_U32 },
        [NETNSA_FD]                = { .type = NLA_U32 },
        [NETNSA_TARGET_NSID]        = { .type = NLA_S32 },
};

/* RTM_NEWNSID handler: assign an nsid, within the requesting socket's
 * namespace, to a peer namespace named by NETNSA_PID or NETNSA_FD.
 *
 * NETNSA_NSID is mandatory and carries the requested id.  Fails with
 * -EEXIST when the peer already has an id, or when a specific non-negative
 * id was requested and alloc_netid() reported -ENOSPC.  On success an
 * RTM_NEWNSID notification is sent and 0 is returned.
 */
static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
{
        struct nlattr *tb[NETNSA_MAX + 1];
        struct net *net = sock_net(skb->sk);
        struct net *peer;
        struct nlattr *nla;
        int nsid, err;

        err = nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb,
                                     NETNSA_MAX, rtnl_net_policy, extack);
        if (err < 0)
                return err;

        if (!tb[NETNSA_NSID]) {
                NL_SET_ERR_MSG(extack, "nsid is missing");
                return -EINVAL;
        }
        nsid = nla_get_s32(tb[NETNSA_NSID]);

        /* A pid reference takes precedence over an fd reference. */
        nla = tb[NETNSA_PID];
        if (nla) {
                peer = get_net_ns_by_pid(nla_get_u32(nla));
        } else {
                nla = tb[NETNSA_FD];
                if (!nla) {
                        NL_SET_ERR_MSG(extack,
                                       "Peer netns reference is missing");
                        return -EINVAL;
                }
                peer = get_net_ns_by_fd(nla_get_u32(nla));
        }

        if (IS_ERR(peer)) {
                NL_SET_BAD_ATTR(extack, nla);
                NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
                return PTR_ERR(peer);
        }

        spin_lock_bh(&net->nsid_lock);
        if (__peernet2id(net, peer) >= 0) {
                spin_unlock_bh(&net->nsid_lock);
                NL_SET_BAD_ATTR(extack, nla);
                NL_SET_ERR_MSG(extack,
                               "Peer netns already has a nsid assigned");
                put_net(peer);
                return -EEXIST;
        }

        err = alloc_netid(net, peer, nsid);
        spin_unlock_bh(&net->nsid_lock);

        if (err >= 0) {
                /* alloc_netid() returned the id that was actually assigned. */
                rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid,
                                  nlh, GFP_KERNEL);
                err = 0;
        } else if (err == -ENOSPC && nsid >= 0) {
                err = -EEXIST;
                NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]);
                NL_SET_ERR_MSG(extack, "The specified nsid is already used");
        }

        put_net(peer);
        return err;
}

/* Payload size needed for one nsid message: the rtgenmsg header plus the
 * NETNSA_NSID and NETNSA_CURRENT_NSID attributes.
 */
static int rtnl_net_get_size(void)
{
        int size = NLMSG_ALIGN(sizeof(struct rtgenmsg));

        size += nla_total_size(sizeof(s32));        /* NETNSA_NSID */
        size += nla_total_size(sizeof(s32));        /* NETNSA_CURRENT_NSID */

        return size;
}

/* Parameters describing one nsid netlink message built by rtnl_net_fill(). */
struct net_fill_args {
        /* Port id, sequence number, header flags and message type handed
         * to nlmsg_put().
         */
        u32 portid;
        u32 seq;
        int flags;
        int cmd;
        /* Value emitted as NETNSA_NSID. */
        int nsid;
        /* When true, NETNSA_CURRENT_NSID is emitted as well.  The getid and
         * dump handlers also use it to remember that they hold a reference
         * on a target namespace.
         */
        bool add_ref;
        /* Value emitted as NETNSA_CURRENT_NSID (only if add_ref). */
        int ref_nsid;
};

/* Append one nsid message described by @args to @skb.
 *
 * Returns 0 on success or -EMSGSIZE when the header or an attribute does
 * not fit; a partially built message is cancelled.
 */
static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args)
{
        struct rtgenmsg *rth;
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth),
                        args->flags);
        if (!nlh)
                return -EMSGSIZE;

        rth = nlmsg_data(nlh);
        rth->rtgen_family = AF_UNSPEC;

        /* NETNSA_NSID is always present, NETNSA_CURRENT_NSID on request. */
        if (nla_put_s32(skb, NETNSA_NSID, args->nsid) ||
            (args->add_ref &&
             nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid))) {
                nlmsg_cancel(skb, nlh);
                return -EMSGSIZE;
        }

        nlmsg_end(skb, nlh);
        return 0;
}

/* Parse an RTM_GETNSID request into @tb.
 *
 * Sockets without strict checking get the legacy, lenient parse.  With
 * strict checking the message is parsed strictly and any attribute other
 * than NETNSA_PID, NETNSA_FD, NETNSA_NSID or NETNSA_TARGET_NSID is
 * rejected with -EINVAL.
 *
 * Returns 0 on success or a negative errno.
 */
static int rtnl_net_valid_getid_req(struct sk_buff *skb,
                                    const struct nlmsghdr *nlh,
                                    struct nlattr **tb,
                                    struct netlink_ext_ack *extack)
{
        int i, err;

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg),
                                              tb, NETNSA_MAX, rtnl_net_policy,
                                              extack);

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb,
                                            NETNSA_MAX, rtnl_net_policy,
                                            extack);
        if (err)
                return err;

        for (i = 0; i <= NETNSA_MAX; i++) {
                if (!tb[i])
                        continue;

                if (i == NETNSA_PID || i == NETNSA_FD ||
                    i == NETNSA_NSID || i == NETNSA_TARGET_NSID)
                        continue;

                NL_SET_ERR_MSG(extack, "Unsupported attribute in peer netns getid request");
                return -EINVAL;
        }

        return 0;
}

/* RTM_GETNSID doit handler: report the nsid that a namespace has assigned
 * to a peer namespace.
 *
 * The peer is selected by NETNSA_PID, NETNSA_FD or NETNSA_NSID, checked in
 * that order.  By default the id is looked up in the requesting socket's
 * namespace.  When NETNSA_TARGET_NSID is present the lookup is done in that
 * target namespace instead, and the reply additionally carries the peer's
 * id as seen from the requester's namespace.
 *
 * Returns the result of rtnl_unicast() once the reply has been built, or a
 * negative errno on failure.
 */
static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *tb[NETNSA_MAX + 1];
        struct net_fill_args fillargs = {
                .portid = NETLINK_CB(skb).portid,
                .seq = nlh->nlmsg_seq,
                .cmd = RTM_NEWNSID,
        };
        struct net *peer, *target = net;
        struct nlattr *nla;
        struct sk_buff *msg;
        int err;

        err = rtnl_net_valid_getid_req(skb, nlh, tb, extack);
        if (err < 0)
                return err;
        if (tb[NETNSA_PID]) {
                peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
                nla = tb[NETNSA_PID];
        } else if (tb[NETNSA_FD]) {
                peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
                nla = tb[NETNSA_FD];
        } else if (tb[NETNSA_NSID]) {
                peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID]));
                /* This lookup signals failure with NULL; normalise it to
                 * an ERR_PTR so the IS_ERR() check below covers all cases.
                 */
                if (!peer)
                        peer = ERR_PTR(-ENOENT);
                nla = tb[NETNSA_NSID];
        } else {
                NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
                return -EINVAL;
        }

        if (IS_ERR(peer)) {
                NL_SET_BAD_ATTR(extack, nla);
                NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
                return PTR_ERR(peer);
        }

        if (tb[NETNSA_TARGET_NSID]) {
                int id = nla_get_s32(tb[NETNSA_TARGET_NSID]);

                target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id);
                if (IS_ERR(target)) {
                        NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]);
                        NL_SET_ERR_MSG(extack,
                                       "Target netns reference is invalid");
                        err = PTR_ERR(target);
                        /* add_ref is still false here, so the exit path
                         * does not put the ERR_PTR target.
                         */
                        goto out;
                }
                fillargs.add_ref = true;
                fillargs.ref_nsid = peernet2id(net, peer);
        }

        msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
        if (!msg) {
                err = -ENOMEM;
                goto out;
        }

        fillargs.nsid = peernet2id(target, peer);
        err = rtnl_net_fill(msg, &fillargs);
        if (err < 0)
                goto err_out;

        /* msg is handed to rtnl_unicast() here and is not freed by this
         * function afterwards.
         */
        err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid);
        goto out;

err_out:
        nlmsg_free(msg);
out:
        /* add_ref is set only after the target lookup succeeded, so it also
         * tells whether a target reference has to be dropped.
         */
        if (fillargs.add_ref)
                put_net(target);
        put_net(peer);
        return err;
}

/* State shared between rtnl_net_dumpid() and its per-id callback. */
struct rtnl_net_dump_cb {
        /* Namespace whose netns_ids table is being dumped. */
        struct net *tgt_net;
        /* Namespace used to compute ref_nsid; set to the requester's
         * namespace when a NETNSA_TARGET_NSID was supplied.
         */
        struct net *ref_net;
        /* Dump buffer that messages are appended to. */
        struct sk_buff *skb;
        /* Template for each message; nsid/ref_nsid are filled per entry. */
        struct net_fill_args fillargs;
        /* Number of entries visited so far in this pass. */
        int idx;
        /* Entries with an index below this one are skipped (resume point
         * taken from cb->args[0]).
         */
        int s_idx;
};

/* idr_for_each() callback emitting one nsid entry of a dump.
 * Runs in RCU-critical section.
 *
 * Entries below the resume index are only counted.  Returns 0 to continue,
 * or the negative error from rtnl_net_fill(); in that case the entry is not
 * counted.
 */
static int rtnl_net_dumpid_one(int id, void *peer, void *data)
{
        struct rtnl_net_dump_cb *net_cb = data;
        int ret;

        if (net_cb->idx >= net_cb->s_idx) {
                net_cb->fillargs.nsid = id;
                if (net_cb->fillargs.add_ref)
                        net_cb->fillargs.ref_nsid =
                                __peernet2id(net_cb->ref_net, peer);

                ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs);
                if (ret < 0)
                        return ret;
        }

        net_cb->idx++;
        return 0;
}

/* Strictly validate an RTM_GETNSID dump request.
 *
 * NETNSA_TARGET_NSID is the only attribute accepted.  When it is present,
 * the namespace it names becomes net_cb->tgt_net (a reference is taken),
 * the previous tgt_net becomes ref_net, and fillargs.add_ref is set so the
 * caller knows to drop the reference.
 *
 * Returns 0 on success or a negative errno.
 */
static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk,
                                   struct rtnl_net_dump_cb *net_cb,
                                   struct netlink_callback *cb)
{
        struct netlink_ext_ack *extack = cb->extack;
        struct nlattr *tb[NETNSA_MAX + 1];
        int err, i;

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb,
                                            NETNSA_MAX, rtnl_net_policy,
                                            extack);
        if (err < 0)
                return err;

        for (i = 0; i <= NETNSA_MAX; i++) {
                struct net *net;

                if (!tb[i])
                        continue;

                if (i != NETNSA_TARGET_NSID) {
                        NL_SET_BAD_ATTR(extack, tb[i]);
                        NL_SET_ERR_MSG(extack,
                                       "Unsupported attribute in dump request");
                        return -EINVAL;
                }

                net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i]));
                if (IS_ERR(net)) {
                        NL_SET_BAD_ATTR(extack, tb[i]);
                        NL_SET_ERR_MSG(extack,
                                       "Invalid target network namespace id");
                        return PTR_ERR(net);
                }

                net_cb->fillargs.add_ref = true;
                net_cb->ref_net = net_cb->tgt_net;
                net_cb->tgt_net = net;
        }

        return 0;
}

/* RTM_GETNSID dump handler: emit one RTM_NEWNSID message per id in the
 * target namespace's netns_ids table, resuming at cb->args[0].
 *
 * The target is the requesting socket's namespace unless a strict-check
 * request selected another one.  Returns 0, or the negative errno from
 * request validation.
 */
static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct rtnl_net_dump_cb net_cb = {
                .tgt_net = sock_net(skb->sk),
                .skb = skb,
                .fillargs = {
                        .portid = NETLINK_CB(cb->skb).portid,
                        .seq = cb->nlh->nlmsg_seq,
                        .flags = NLM_F_MULTI,
                        .cmd = RTM_NEWNSID,
                },
                .idx = 0,
                .s_idx = cb->args[0],
        };
        int err = 0;

        if (cb->strict_check)
                err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb);

        if (err >= 0) {
                rcu_read_lock();
                idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one,
                             &net_cb);
                rcu_read_unlock();

                /* Remember how far we got for the next dump pass. */
                cb->args[0] = net_cb.idx;
        }

        /* add_ref implies validation took a reference on tgt_net. */
        if (net_cb.fillargs.add_ref)
                put_net(net_cb.tgt_net);
        return err;
}

/* Send a @cmd notification about nsid @id to the RTNLGRP_NSID group of
 * @net.  @nlh may be NULL, in which case sequence number 0 is used.
 *
 * If the message cannot be allocated or filled, the error is recorded on
 * the group through rtnl_set_sk_err() instead.
 */
static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
                              struct nlmsghdr *nlh, gfp_t gfp)
{
        struct net_fill_args fillargs = {
                .portid = portid,
                .seq = nlh ? nlh->nlmsg_seq : 0,
                .cmd = cmd,
                .nsid = id,
        };
        struct sk_buff *msg;
        int err = -ENOMEM;

        msg = nlmsg_new(rtnl_net_get_size(), gfp);
        if (msg) {
                err = rtnl_net_fill(msg, &fillargs);
                if (err >= 0) {
                        rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, gfp);
                        return;
                }
                nlmsg_free(msg);
        }

        rtnl_set_sk_err(net, RTNLGRP_NSID, err);
}

#ifdef CONFIG_NET_NS
/* Layout assertions for struct netns_ipv4: each listed sysctl field is
 * asserted to be a member of the named cacheline group, and each group is
 * checked against the byte count given.
 *
 * NOTE(review): the exact semantics (for example whether the size is an
 * upper bound) come from the CACHELINE_ASSERT_* macros, which are not
 * visible here.  The sizes presumably need updating whenever a field is
 * added to or removed from a group.
 */
static void __init netns_ipv4_struct_check(void)
{
        /* TX readonly hotpath cache lines */
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_early_retrans);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_tso_win_divisor);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_tso_rtt_log);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_autocorking);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_min_snd_mss);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_notsent_lowat);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_limit_output_bytes);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_min_rtt_wlen);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_wmem);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_ip_fwd_use_pmtu);
        CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33);

        /* TXRX readonly hotpath cache lines */
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx,
                                      sysctl_tcp_moderate_rcvbuf);
        CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1);

        /* RX readonly hotpath cache line */
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
                                      sysctl_ip_early_demux);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
                                      sysctl_tcp_early_demux);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
                                      sysctl_tcp_reordering);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
                                      sysctl_tcp_rmem);
        CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 18);
}
#endif

/* Boot-time initialisation of the network namespace core.
 *
 * With CONFIG_NET_NS, runs the netns_ipv4 layout checks and creates the
 * struct net slab cache and the cleanup workqueue.  In all configurations
 * it then gives init_net its net_generic array, sets init_net up through
 * setup_net(), registers net_ns_ops and installs the RTM_NEWNSID /
 * RTM_GETNSID rtnetlink handlers.  Every failure checked here panics.
 */
void __init net_ns_init(void)
{
        struct net_generic *ng;

#ifdef CONFIG_NET_NS
        netns_ipv4_struct_check();
        /* No NULL check: the cache is created with SLAB_PANIC. */
        net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
                                        SMP_CACHE_BYTES,
                                        SLAB_PANIC|SLAB_ACCOUNT, NULL);

        /* Create workqueue for cleanup */
        netns_wq = create_singlethread_workqueue("netns");
        if (!netns_wq)
                panic("Could not create netns workq");
#endif

        ng = net_alloc_generic();
        if (!ng)
                panic("Could not allocate generic netns");

        rcu_assign_pointer(init_net.gen, ng);

#ifdef CONFIG_KEYS
        /* init_net uses a statically defined key domain. */
        init_net.key_domain = &init_net_key_domain;
#endif
        /* setup_net() walks pernet_list, so keep registrations out while
         * init_net is being set up.
         */
        down_write(&pernet_ops_rwsem);
        preinit_net(&init_net);
        if (setup_net(&init_net, &init_user_ns))
                panic("Could not setup the initial network namespace");

        init_net_initialized = true;
        up_write(&pernet_ops_rwsem);

        if (register_pernet_subsys(&net_ns_ops))
                panic("Could not register network namespace subsystems");

        /* Both handlers are registered with the *_UNLOCKED flags. */
        rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL,
                      RTNL_FLAG_DOIT_UNLOCKED);
        rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
                      RTNL_FLAG_DOIT_UNLOCKED |
                      RTNL_FLAG_DUMP_UNLOCKED);
}

/* Run the complete exit sequence of a single @ops over the namespaces
 * linked on @net_exit_list: pre_exit, an RCU grace period, the optional
 * exit_batch_rtnl hook (under RTNL, followed by unregistering the devices
 * it collected), then exit, and finally free.
 */
static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list)
{
        ops_pre_exit_list(ops, net_exit_list);
        synchronize_rcu();

        if (ops->exit_batch_rtnl) {
                LIST_HEAD(dev_kill_list);

                rtnl_lock();
                ops->exit_batch_rtnl(net_exit_list, &dev_kill_list);
                unregister_netdevice_many(&dev_kill_list);
                rtnl_unlock();
        }
        ops_exit_list(ops, net_exit_list);

        ops_free_list(ops, net_exit_list);
}

#ifdef CONFIG_NET_NS
/* CONFIG_NET_NS variant: link @ops in front of @list and initialise it in
 * every existing namespace.
 *
 * If ops_init() fails for one namespace, @ops is unlinked again and the
 * exit sequence is run for the namespaces already initialised.  Returns 0
 * or the error from ops_init().
 */
static int __register_pernet_operations(struct list_head *list,
                                        struct pernet_operations *ops)
{
        LIST_HEAD(net_exit_list);
        struct net *net;
        int error;

        list_add_tail(&ops->list, list);

        /* Nothing to run per namespace without an init hook or storage. */
        if (!ops->init && !(ops->id && ops->size))
                return 0;

        /* We held write locked pernet_ops_rwsem, and parallel
         * setup_net() and cleanup_net() are not possible.
         */
        for_each_net(net) {
                error = ops_init(ops, net);
                if (error) {
                        /* Undo every namespace initialised so far. */
                        list_del(&ops->list);
                        free_exit_list(ops, &net_exit_list);
                        return error;
                }
                list_add_tail(&net->exit_list, &net_exit_list);
        }

        return 0;
}

/* CONFIG_NET_NS variant: unlink @ops from its list and run its exit
 * sequence over every namespace currently on net_namespace_list.
 */
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
        struct net *net;
        LIST_HEAD(net_exit_list);

        list_del(&ops->list);
        /* See comment in __register_pernet_operations() */
        for_each_net(net)
                list_add_tail(&net->exit_list, &net_exit_list);

        free_exit_list(ops, &net_exit_list);
}

#else

/* !CONFIG_NET_NS variant: init_net is the only namespace.
 *
 * Before init_net has been set up, @ops is merely linked in front of @list.
 * Afterwards @ops is initialised for init_net directly and is not linked.
 * Returns 0 or the error from ops_init().
 */
static int __register_pernet_operations(struct list_head *list,
                                        struct pernet_operations *ops)
{
        if (init_net_initialized)
                return ops_init(ops, &init_net);

        list_add_tail(&ops->list, list);
        return 0;
}

/* !CONFIG_NET_NS variant: mirror of the registration above.  Before
 * init_net is set up @ops only needs unlinking; afterwards its exit
 * sequence is run for init_net.
 */
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
        LIST_HEAD(net_exit_list);

        if (!init_net_initialized) {
                list_del(&ops->list);
                return;
        }

        list_add(&init_net.exit_list, &net_exit_list);
        free_exit_list(ops, &net_exit_list);
}

#endif /* CONFIG_NET_NS */

/* Allocator for the ids stored through pernet_operations::id. */
static DEFINE_IDA(net_generic_ids);

/* Common registration path: allocate a generic id for @ops when it asks
 * for one (ops->id set), then register it in front of @list.
 *
 * On failure the id is released again after an rcu_barrier().  Returns 0
 * or a negative errno.
 */
static int register_pernet_operations(struct list_head *list,
                                      struct pernet_operations *ops)
{
        int error;

        if (ops->id) {
                error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID,
                                GFP_KERNEL);
                if (error < 0)
                        return error;
                *ops->id = error;
                /* Writers already hold pernet_ops_rwsem, so the read of
                 * max_gen_ptrs needs no READ_ONCE.  The WRITE_ONCE is
                 * needed to protect net_alloc_generic.
                 */
                WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1));
        }

        error = __register_pernet_operations(list, ops);
        if (!error)
                return 0;

        rcu_barrier();
        if (ops->id)
                ida_free(&net_generic_ids, *ops->id);

        return error;
}

/* Common unregistration path: run the variant-specific teardown, wait for
 * outstanding RCU callbacks with rcu_barrier(), and only then release the
 * generic id of @ops, if it has one.
 */
static void unregister_pernet_operations(struct pernet_operations *ops)
{
        __unregister_pernet_operations(ops);
        rcu_barrier();
        if (ops->id)
                ida_free(&net_generic_ids, *ops->id);
}

/**
 *      register_pernet_subsys - register a network namespace subsystem
 *        @ops:  pernet operations structure for the subsystem
 *
 *        Register a subsystem which has init and exit functions
 *        that are called when network namespaces are created and
 *        destroyed respectively.
 *
 *        When registered, all network namespace init functions are
 *        called for every existing network namespace, allowing kernel
 *        modules to have a race-free view of the set of network namespaces.
 *
 *        When a new network namespace is created all of the init
 *        methods are called in the order in which they were registered.
 *
 *        When a network namespace is destroyed all of the exit methods
 *        are called in the reverse of the order with which they were
 *        registered.
 *
 *        Returns 0 on success or a negative errno.
 */
int register_pernet_subsys(struct pernet_operations *ops)
{
        int error;
        down_write(&pernet_ops_rwsem);
        /* Subsystems are linked in front of first_device. */
        error =  register_pernet_operations(first_device, ops);
        up_write(&pernet_ops_rwsem);
        return error;
}
EXPORT_SYMBOL_GPL(register_pernet_subsys);

/**
 *      unregister_pernet_subsys - unregister a network namespace subsystem
 *        @ops: pernet operations structure to manipulate
 *
 *        Remove the pernet operations structure from the list to be
 *        used when network namespaces are created or destroyed.  In
 *        addition run the exit method for all existing network
 *        namespaces.
 */
void unregister_pernet_subsys(struct pernet_operations *ops)
{
        /* Hold pernet_ops_rwsem for write across the whole unregistration. */
        down_write(&pernet_ops_rwsem);
        unregister_pernet_operations(ops);
        up_write(&pernet_ops_rwsem);
}
EXPORT_SYMBOL_GPL(unregister_pernet_subsys);

/**
 *      register_pernet_device - register a network namespace device
 *        @ops:  pernet operations structure for the subsystem
 *
 *        Register a device which has init and exit functions
 *        that are called when network namespaces are created and
 *        destroyed respectively.
 *
 *        When registered, all network namespace init functions are
 *        called for every existing network namespace, allowing kernel
 *        modules to have a race-free view of the set of network namespaces.
 *
 *        When a new network namespace is created all of the init
 *        methods are called in the order in which they were registered.
 *
 *        When a network namespace is destroyed all of the exit methods
 *        are called in the reverse of the order with which they were
 *        registered.
 */
int register_pernet_device(struct pernet_operations *ops)
{
        int ret;

        down_write(&pernet_ops_rwsem);

        ret = register_pernet_operations(&pernet_list, ops);
        /*
         * first_device still pointing at the list head means no device
         * has been recorded yet; on success this one becomes the first.
         */
        if (ret == 0 && first_device == &pernet_list)
                first_device = &ops->list;

        up_write(&pernet_ops_rwsem);

        return ret;
}
EXPORT_SYMBOL_GPL(register_pernet_device);

/**
 *      unregister_pernet_device - unregister a network namespace netdevice
 *        @ops: pernet operations structure to manipulate
 *
 *        Remove the pernet operations structure from the list to be
 *        used when network namespaces are created or destroyed.  In
 *        addition run the exit method for all existing network
 *        namespaces.
 */
void unregister_pernet_device(struct pernet_operations *ops)
{
        down_write(&pernet_ops_rwsem);

        /*
         * If @ops is the recorded first device, advance the marker to
         * the following list entry before @ops is unregistered.
         */
        if (first_device == &ops->list)
                first_device = ops->list.next;

        unregister_pernet_operations(ops);

        up_write(&pernet_ops_rwsem);
}
EXPORT_SYMBOL_GPL(unregister_pernet_device);

#ifdef CONFIG_NET_NS
/*
 * Take a reference on the network namespace of @task and return its
 * embedded ns_common, or NULL when @task has no nsproxy.
 */
static struct ns_common *netns_get(struct task_struct *task)
{
        struct nsproxy *proxy;
        struct net *found = NULL;

        /* task->nsproxy is sampled under task_lock(). */
        task_lock(task);
        proxy = task->nsproxy;
        if (proxy)
                found = get_net(proxy->net_ns);
        task_unlock(task);

        if (!found)
                return NULL;

        return &found->ns;
}

/* Map an ns_common embedded in a struct net (member "ns") back to that net. */
static inline struct net *to_net_ns(struct ns_common *ns)
{
        return container_of(ns, struct net, ns);
}

/* Drop one reference on the struct net that embeds @ns. */
static void netns_put(struct ns_common *ns)
{
        struct net *net = to_net_ns(ns);

        put_net(net);
}

/*
 * Switch the nsproxy in @nsset over to the network namespace embedding @ns.
 *
 * Returns 0 on success, or -EPERM unless CAP_SYS_ADMIN is held both in the
 * target namespace's user namespace and in nsset->cred->user_ns.
 */
static int netns_install(struct nsset *nsset, struct ns_common *ns)
{
        struct net *target = to_net_ns(ns);
        struct nsproxy *proxy = nsset->nsproxy;

        /* Checked in this order; the second check is skipped if the first fails. */
        if (!ns_capable(target->user_ns, CAP_SYS_ADMIN))
                return -EPERM;
        if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        /* Release the old namespace reference, then take one on the new. */
        put_net(proxy->net_ns);
        proxy->net_ns = get_net(target);

        return 0;
}

static struct user_namespace *netns_owner(struct ns_common *ns)
{
        return to_net_ns(ns)->user_ns;
}

/*
 * proc_ns_operations table for the "net" namespace type (CLONE_NEWNET);
 * each callback is wired to the matching netns_* helper in this file.
 */
const struct proc_ns_operations netns_operations = {
        .name                = "net",
        .type                = CLONE_NEWNET,
        .get                = netns_get,
        .put                = netns_put,
        .install        = netns_install,
        .owner                = netns_owner,
};
#endif




















































































































































































































































































































































































































































    5 





































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _X_TABLES_H
#define _X_TABLES_H


#include <linux/netdevice.h>
#include <linux/static_key.h>
#include <linux/netfilter.h>
#include <uapi/linux/netfilter/x_tables.h>

/*
 * Test a struct->invflags and a boolean for inequality.
 *
 * Evaluates to (boolean) XOR "is (flag) set in (ptr)->invflags", so the
 * result of a test is inverted whenever the inversion flag is set.
 * The flag test is normalised to 0/1 with !!, so (boolean) should itself
 * be 0 or 1.
 */
#define NF_INVF(ptr, flag, boolean)                                        \
        ((boolean) ^ !!((ptr)->invflags & (flag)))

/**
 * struct xt_action_param - parameters for matches/targets
 *
 * @match:        the match extension
 * @target:        the target extension
 * @matchinfo:        per-match data
 * @targinfo:        per-target data
 * @state:        pointer to hook state this packet came from
 * @fragoff:        packet is a fragment, this is the data offset
 * @thoff:        position of transport header relative to skb->data
 *
 * @match/@target and @matchinfo/@targinfo are members of anonymous unions,
 * so each pair shares storage.
 *
 * Fields written to by extensions:
 *
 * @hotdrop:        drop packet if we had inspection problems
 */
struct xt_action_param {
        union {
                const struct xt_match *match;
                const struct xt_target *target;
        };
        union {
                const void *matchinfo, *targinfo;
        };
        const struct nf_hook_state *state;
        unsigned int thoff;
        u16 fragoff;
        bool hotdrop;
};

static inline struct net *xt_net(const struct xt_action_param *par)
{
        return par->state->net;
}

static inline struct net_device *xt_in(const struct xt_action_param *par)
{
        return par->state->in;
}

static inline const char *xt_inname(const struct xt_action_param *par)
{
        return par->state->in->name;
}

static inline struct net_device *xt_out(const struct xt_action_param *par)
{
        return par->state->out;
}

static inline const char *xt_outname(const struct xt_action_param *par)
{
        return par->state->out->name;
}

static inline unsigned int xt_hooknum(const struct xt_action_param *par)
{
        return par->state->hook;
}

static inline u_int8_t xt_family(const struct xt_action_param *par)
{
        return par->state->pf;
}

/**
 * struct xt_mtchk_param - parameters for match extensions'
 * checkentry functions
 *
 * @net:        network namespace through which the check was invoked
 * @table:        name of the table the rule is to be inserted into
 * @entryinfo:        the family-specific rule data
 *                 (struct ipt_ip, ip6t_ip, arpt_arp or (note) ebt_entry)
 * @match:        struct xt_match through which this function was invoked
 * @matchinfo:        per-match data
 * @hook_mask:        via which hooks the new rule is reachable
 * @family:        address/protocol family
 *                 (NOTE(review): presumably an NFPROTO_* value - confirm)
 * @nft_compat:        NOTE(review): presumably set when the check is issued
 *                 through the nft compat layer - confirm against callers
 */
struct xt_mtchk_param {
        struct net *net;
        const char *table;
        const void *entryinfo;
        const struct xt_match *match;
        void *matchinfo;
        unsigned int hook_mask;
        u_int8_t family;
        bool nft_compat;
};

/**
 * struct xt_mtdtor_param - match destructor parameters
 *
 * @net:        network namespace
 * @match:        the match extension being destroyed
 * @matchinfo:        per-match data
 * @family:        address/protocol family
 *
 * Passed to the destroy callback of struct xt_match.
 */
struct xt_mtdtor_param {
        struct net *net;
        const struct xt_match *match;
        void *matchinfo;
        u_int8_t family;
};

/**
 * struct xt_tgchk_param - parameters for target extensions'
 * checkentry functions
 *
 * @net:        network namespace through which the check was invoked
 * @table:        name of the table the rule is to be inserted into
 * @entryinfo:        the family-specific rule data
 *                 (struct ipt_entry, ip6t_entry, arpt_entry, ebt_entry)
 * @target:        struct xt_target through which this function was invoked
 * @targinfo:        per-target data
 * @hook_mask:        via which hooks the new rule is reachable
 * @family:        address/protocol family
 *                 (NOTE(review): presumably an NFPROTO_* value - confirm)
 * @nft_compat:        NOTE(review): presumably set when the check is issued
 *                 through the nft compat layer - confirm against callers
 */
struct xt_tgchk_param {
        struct net *net;
        const char *table;
        const void *entryinfo;
        const struct xt_target *target;
        void *targinfo;
        unsigned int hook_mask;
        u_int8_t family;
        bool nft_compat;
};

/*
 * Target destructor parameters; passed to the destroy callback of
 * struct xt_target.
 */
struct xt_tgdtor_param {
        struct net *net;                /* network namespace */
        const struct xt_target *target; /* the target extension being destroyed */
        void *targinfo;                 /* per-target data */
        u_int8_t family;                /* address/protocol family */
};

/* Registration record for a match extension. */
struct xt_match {
        struct list_head list;

        const char name[XT_EXTENSION_MAXNAMELEN];
        u_int8_t revision;

        /* Return true or false: return false and set the hotdrop field
           of the struct xt_action_param to force immediate packet
           drop. */
        /* Arguments changed since 2.6.9, as this must now handle
           non-linear skb, using skb_header_pointer and
           skb_ip_make_writable. */
        bool (*match)(const struct sk_buff *skb,
                      struct xt_action_param *);

        /* Called when user tries to insert an entry of this type. */
        int (*checkentry)(const struct xt_mtchk_param *);

        /* Called when entry of this type deleted. */
        void (*destroy)(const struct xt_mtdtor_param *);
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
        /* Called when userspace align differs from kernel space one */
        void (*compat_from_user)(void *dst, const void *src);
        int (*compat_to_user)(void __user *dst, const void *src);
#endif
        /* Set this to THIS_MODULE if you are a module, otherwise NULL */
        struct module *me;

        /* NOTE(review): presumably restricts the match to one table name
           when set - confirm in xt_check_match(). */
        const char *table;
        unsigned int matchsize;
        unsigned int usersize;
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
        unsigned int compatsize;
#endif
        unsigned int hooks;
        unsigned short proto;

        unsigned short family;
};

/*
 * Registration hooks for targets.
 *
 * Mirrors struct xt_match field for field, except that the packet
 * callback is @target (returns a verdict, may modify the skb) and the
 * private-data size is @targetsize.
 */
struct xt_target {
        struct list_head list;

        const char name[XT_EXTENSION_MAXNAMELEN];
        u_int8_t revision;

        /* Returns verdict. Argument order changed since 2.6.9, as this
           must now handle non-linear skbs, using skb_copy_bits and
           skb_ip_make_writable. */
        unsigned int (*target)(struct sk_buff *skb,
                               const struct xt_action_param *);

        /* Called when user tries to insert an entry of this type:
           hook_mask is a bitmask of hooks from which it can be
           called. */
        /* Should return 0 on success or an error code otherwise (-Exxxx). */
        int (*checkentry)(const struct xt_tgchk_param *);

        /* Called when entry of this type deleted. */
        void (*destroy)(const struct xt_tgdtor_param *);
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
        /* Called when userspace align differs from kernel space one */
        void (*compat_from_user)(void *dst, const void *src);
        int (*compat_to_user)(void __user *dst, const void *src);
#endif
        /* Set this to THIS_MODULE if you are a module, otherwise NULL */
        struct module *me;

        const char *table;
        unsigned int targetsize;
        unsigned int usersize;
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
        unsigned int compatsize;
#endif
        unsigned int hooks;
        unsigned short proto;

        unsigned short family;
};

/*
 * Furniture shopping...
 *
 * Descriptor of one x_tables table: its list linkage, the hooks it is
 * valid on, the current rule blob (@private) and its identity
 * (@af, @priority, @name).
 */
struct xt_table {
        struct list_head list;

        /* What hooks you will enter on */
        unsigned int valid_hooks;

        /* Man behind the curtain... (the struct xt_table_info in use) */
        struct xt_table_info *private;

        /* hook ops that register the table with the netfilter core */
        struct nf_hook_ops *ops;

        /* Set this to THIS_MODULE if you are a module, otherwise NULL */
        struct module *me;

        u_int8_t af;                /* address/protocol family */
        int priority;                /* hook order */

        /* A unique name... */
        const char name[XT_TABLE_MAXNAMELEN];
};

#include <linux/netfilter_ipv4.h>

/*
 * The table itself: a fixed header followed by the rule entries in the
 * trailing flexible array @entries (8-byte aligned).
 */
struct xt_table_info {
        /* Size per table */
        unsigned int size;
        /* Number of entries: FIXME. --RR */
        unsigned int number;
        /* Initial number of entries. Needed for module usage count */
        unsigned int initial_entries;

        /* Entry points and underflows, one slot per NF_INET hook */
        unsigned int hook_entry[NF_INET_NUMHOOKS];
        unsigned int underflow[NF_INET_NUMHOOKS];

        /*
         * Number of user chains. Since tables cannot have loops, at most
         * @stacksize jumps (number of user chains) can possibly be made.
         */
        unsigned int stacksize;
        void ***jumpstack;

        unsigned char entries[] __aligned(8);
};

int xt_register_target(struct xt_target *target);
void xt_unregister_target(struct xt_target *target);
int xt_register_targets(struct xt_target *target, unsigned int n);
void xt_unregister_targets(struct xt_target *target, unsigned int n);

int xt_register_match(struct xt_match *target);
void xt_unregister_match(struct xt_match *target);
int xt_register_matches(struct xt_match *match, unsigned int n);
void xt_unregister_matches(struct xt_match *match, unsigned int n);

int xt_check_entry_offsets(const void *base, const char *elems,
                           unsigned int target_offset,
                           unsigned int next_offset);

int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks);

unsigned int *xt_alloc_entry_offsets(unsigned int size);
bool xt_find_jump_offset(const unsigned int *offsets,
                         unsigned int target, unsigned int size);

int xt_check_proc_name(const char *name, unsigned int size);

int xt_check_match(struct xt_mtchk_param *, unsigned int size, u16 proto,
                   bool inv_proto);
int xt_check_target(struct xt_tgchk_param *, unsigned int size, u16 proto,
                    bool inv_proto);

int xt_match_to_user(const struct xt_entry_match *m,
                     struct xt_entry_match __user *u);
int xt_target_to_user(const struct xt_entry_target *t,
                      struct xt_entry_target __user *u);
int xt_data_to_user(void __user *dst, const void *src,
                    int usersize, int size, int aligned_size);

void *xt_copy_counters(sockptr_t arg, unsigned int len,
                       struct xt_counters_info *info);
struct xt_counters *xt_counters_alloc(unsigned int counters);

struct xt_table *xt_register_table(struct net *net,
                                   const struct xt_table *table,
                                   struct xt_table_info *bootstrap,
                                   struct xt_table_info *newinfo);
void *xt_unregister_table(struct xt_table *table);

struct xt_table_info *xt_replace_table(struct xt_table *table,
                                       unsigned int num_counters,
                                       struct xt_table_info *newinfo,
                                       int *error);

struct xt_match *xt_find_match(u8 af, const char *name, u8 revision);
struct xt_match *xt_request_find_match(u8 af, const char *name, u8 revision);
struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision);
int xt_find_revision(u8 af, const char *name, u8 revision, int target,
                     int *err);

struct xt_table *xt_find_table(struct net *net, u8 af, const char *name);
struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
                                    const char *name);
struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af,
                                            const char *name);
void xt_table_unlock(struct xt_table *t);

int xt_proto_init(struct net *net, u_int8_t af);
void xt_proto_fini(struct net *net, u_int8_t af);

struct xt_table_info *xt_alloc_table_info(unsigned int size);
void xt_free_table_info(struct xt_table_info *info);

/**
 * xt_recseq - recursive seqcount for netfilter use
 *
 * Packet processing changes the seqcount only if no recursion happened.
 * get_counters() can use read_seqcount_begin()/read_seqcount_retry(),
 * because we use the normal seqcount convention :
 * Low order bit set to 1 if a writer is active.
 */
DECLARE_PER_CPU(seqcount_t, xt_recseq);

/* xt_tee_enabled - true if x_tables needs to handle reentrancy
 *
 * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
 */
extern struct static_key xt_tee_enabled;

/**
 * xt_write_recseq_begin - start of a write section
 *
 * Begin packet processing : all readers must wait the end
 * 1) Must be called with preemption disabled
 * 2) softirqs must be disabled too (or we should use this_cpu_add())
 * Returns :
 *  1 if no recursion on this cpu
 *  0 if recursion detected
 */
static inline unsigned int xt_write_recseq_begin(void)
{
        unsigned int addend;

        /*
         * Low order bit of sequence is set if we already
         * called xt_write_recseq_begin().
         * So addend is 1 when the sequence is even (no write section
         * open on this cpu) and 0 when it is odd (recursion).
         */
        addend = (__this_cpu_read(xt_recseq.sequence) + 1) & 1;

        /*
         * This is kind of a write_seqcount_begin(), but addend is 0 or 1.
         * We don't check the addend value, to avoid a test and conditional
         * jump, since addend is most likely 1.
         */
        __this_cpu_add(xt_recseq.sequence, addend);
        /* Full barrier after the sequence update, before the write section. */
        smp_mb();

        return addend;
}

/**
 * xt_write_recseq_end - end of a write section
 * @addend: return value from previous xt_write_recseq_begin()
 *
 * End packet processing : all readers can proceed
 * 1) Must be called with preemption disabled
 * 2) softirqs must be disabled too (or we should use this_cpu_add())
 */
static inline void xt_write_recseq_end(unsigned int addend)
{
        /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
        /* Write barrier first, then the sequence update that ends the section. */
        smp_wmb();
        /* Adding 0 (recursive call) leaves the sequence untouched. */
        __this_cpu_add(xt_recseq.sequence, addend);
}

/*
 * This helper is performance critical and must be inlined
 */
static inline unsigned long ifname_compare_aligned(const char *_a,
                                                   const char *_b,
                                                   const char *_mask)
{
        /*
         * View the three buffers as arrays of machine words
         * (presumably callers guarantee unsigned long alignment, as the
         * function name suggests - confirm at call sites).
         */
        const unsigned long *lhs = (const unsigned long *)_a;
        const unsigned long *rhs = (const unsigned long *)_b;
        const unsigned long *msk = (const unsigned long *)_mask;
        unsigned long diff = (lhs[0] ^ rhs[0]) & msk[0];

        /* At most four words per name are handled below. */
        BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long));

        /* Compile-time constant conditions: unused words are never read. */
        if (IFNAMSIZ > sizeof(unsigned long))
                diff |= (lhs[1] ^ rhs[1]) & msk[1];
        if (IFNAMSIZ > 2 * sizeof(unsigned long))
                diff |= (lhs[2] ^ rhs[2]) & msk[2];
        if (IFNAMSIZ > 3 * sizeof(unsigned long))
                diff |= (lhs[3] ^ rhs[3]) & msk[3];

        /* Zero iff both names agree on every bit selected by the mask. */
        return diff;
}

/*
 * Caller-held state passed to xt_percpu_counter_alloc().
 * NOTE(review): presumably @mem is the current percpu block and @off the
 * offset of the next free counter within it - confirm in x_tables.c.
 */
struct xt_percpu_counter_alloc_state {
        unsigned int off;
        const char __percpu *mem;
};

bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state,
                             struct xt_counters *counter);
void xt_percpu_counter_free(struct xt_counters *cnt);

/*
 * Return the counter to update on the current cpu.  With a single
 * possible cpu id @cnt itself is used; otherwise cnt->pcnt is treated
 * as a percpu address and resolved for this cpu.
 */
static inline struct xt_counters *
xt_get_this_cpu_counter(struct xt_counters *cnt)
{
        if (nr_cpu_ids <= 1)
                return cnt;

        return this_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt);
}

/*
 * Return the counter belonging to @cpu.  With a single possible cpu id
 * @cnt itself is used; otherwise cnt->pcnt is treated as a percpu
 * address and resolved for @cpu.
 */
static inline struct xt_counters *
xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu)
{
        if (nr_cpu_ids <= 1)
                return cnt;

        return per_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt, cpu);
}

struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *);

int xt_register_template(const struct xt_table *t, int(*table_init)(struct net *net));
void xt_unregister_template(const struct xt_table *t);

#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
#include <net/compat.h>

/*
 * Compat layout of a match entry header followed by its payload.
 * The union offers a userspace view (name + revision), a kernel view
 * (a compat_uptr_t in place of a pointer) and the bare size; match_size
 * is the first member of every view.
 * NOTE(review): presumably mirrors struct xt_entry_match for 32-bit
 * userspace - confirm against the uapi header.
 */
struct compat_xt_entry_match {
        union {
                struct {
                        u_int16_t match_size;
                        char name[XT_FUNCTION_MAXNAMELEN - 1];
                        u_int8_t revision;
                } user;
                struct {
                        u_int16_t match_size;
                        compat_uptr_t match;
                } kernel;
                u_int16_t match_size;
        } u;
        unsigned char data[];
};

/*
 * Compat layout of a target entry header followed by its payload.
 * The union offers a userspace view (name + revision), a kernel view
 * (a compat_uptr_t in place of a pointer) and the bare size; target_size
 * is the first member of every view.
 * NOTE(review): presumably mirrors struct xt_entry_target for 32-bit
 * userspace - confirm against the uapi header.
 */
struct compat_xt_entry_target {
        union {
                struct {
                        u_int16_t target_size;
                        char name[XT_FUNCTION_MAXNAMELEN - 1];
                        u_int8_t revision;
                } user;
                struct {
                        u_int16_t target_size;
                        compat_uptr_t target;
                } kernel;
                u_int16_t target_size;
        } u;
        unsigned char data[];
};

/* FIXME: this works only on 32 bit tasks
 * need to change whole approach in order to calculate align as function of
 * current task alignment */

/* Compat form of the per-rule counters, using compat_u64 members. */
struct compat_xt_counters {
        compat_u64 pcnt, bcnt;                        /* Packet and byte counters */
};

/*
 * Compat form of a counters request: table name, counter count, then
 * the counters themselves as a trailing flexible array.
 */
struct compat_xt_counters_info {
        char name[XT_TABLE_MAXNAMELEN];
        compat_uint_t num_counters;
        struct compat_xt_counters counters[];
};

/*
 * Holds one member of each basic width, up to compat_u64, so that the
 * struct's alignment is that of its strictest member.
 */
struct _compat_xt_align {
        __u8 u8;
        __u16 u16;
        __u32 u32;
        compat_u64 u64;
};

/* Round (s) up to a multiple of the alignment of struct _compat_xt_align. */
#define COMPAT_XT_ALIGN(s) __ALIGN_KERNEL((s), __alignof__(struct _compat_xt_align))

void xt_compat_lock(u_int8_t af);
void xt_compat_unlock(u_int8_t af);

int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta);
void xt_compat_flush_offsets(u_int8_t af);
int xt_compat_init_offsets(u8 af, unsigned int number);
int xt_compat_calc_jump(u_int8_t af, unsigned int offset);

int xt_compat_match_offset(const struct xt_match *match);
void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
                              unsigned int *size);
int xt_compat_match_to_user(const struct xt_entry_match *m,
                            void __user **dstptr, unsigned int *size);

int xt_compat_target_offset(const struct xt_target *target);
void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
                                unsigned int *size);
int xt_compat_target_to_user(const struct xt_entry_target *t,
                             void __user **dstptr, unsigned int *size);
int xt_compat_check_entry_offsets(const void *base, const char *elems,
                                  unsigned int target_offset,
                                  unsigned int next_offset);

#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */
#endif /* _X_TABLES_H */






















































    2 








    1 































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PID_NS_H
#define _LINUX_PID_NS_H

#include <linux/sched.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/threads.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/idr.h>

/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32

struct fs_pin;

#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
/* modes for vm.memfd_noexec sysctl */
#define MEMFD_NOEXEC_SCOPE_EXEC                        0 /* MFD_EXEC implied if unset */
#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL                1 /* MFD_NOEXEC_SEAL implied if unset */
#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED        2 /* same as 1, except MFD_EXEC rejected */
#endif

/*
 * Per-PID-namespace state.
 *
 * @ns.count is the reference count bumped by get_pid_ns(); @parent links
 * to the enclosing namespace and is followed by
 * pidns_memfd_noexec_scope().  The layout is randomized
 * (__randomize_layout), so code must not rely on member order.
 */
struct pid_namespace {
        struct idr idr;
        struct rcu_head rcu;
        unsigned int pid_allocated;
        struct task_struct *child_reaper;
        struct kmem_cache *pid_cachep;
        unsigned int level;
        struct pid_namespace *parent;
#ifdef CONFIG_BSD_PROCESS_ACCT
        struct fs_pin *bacct;
#endif
        struct user_namespace *user_ns;
        struct ucounts *ucounts;
        int reboot;        /* group exit code if this pidns was rebooted */
        struct ns_common ns;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
        /* one of the MEMFD_NOEXEC_SCOPE_* modes of the vm.memfd_noexec sysctl */
        int memfd_noexec_scope;
#endif
} __randomize_layout;

extern struct pid_namespace init_pid_ns;

#define PIDNS_ADDING (1U << 31)

#ifdef CONFIG_PID_NS
/*
 * Take a reference on @ns and return it.  init_pid_ns is not reference
 * counted here and is returned untouched.
 */
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
        if (ns == &init_pid_ns)
                return ns;

        refcount_inc(&ns->ns.count);
        return ns;
}

#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
/*
 * Return the effective memfd_noexec mode for @ns: the maximum of the
 * mode set on @ns and on every ancestor reached through ->parent,
 * starting from MEMFD_NOEXEC_SCOPE_EXEC.
 */
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
        int strictest = MEMFD_NOEXEC_SCOPE_EXEC;

        while (ns) {
                strictest = max(strictest, READ_ONCE(ns->memfd_noexec_scope));
                ns = ns->parent;
        }

        return strictest;
}
#else
/*
 * Stub used when CONFIG_SYSCTL or CONFIG_MEMFD_CREATE is off.  Always
 * returns 0, the value of MEMFD_NOEXEC_SCOPE_EXEC (that macro is not
 * defined in this configuration).
 */
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
        return 0;
}
#endif

extern struct pid_namespace *copy_pid_ns(unsigned long flags,
        struct user_namespace *user_ns, struct pid_namespace *ns);
extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd);
extern void put_pid_ns(struct pid_namespace *ns);

#else /* !CONFIG_PID_NS */
#include <linux/err.h>

/* !CONFIG_PID_NS stub: no reference counting, @ns is returned as-is. */
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
        return ns;
}

/* !CONFIG_PID_NS stub: always reports mode 0. */
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
        return 0;
}

/*
 * !CONFIG_PID_NS stub: a new PID namespace cannot be created, so a
 * CLONE_NEWPID request yields ERR_PTR(-EINVAL); otherwise @ns is
 * returned unchanged.
 */
static inline struct pid_namespace *copy_pid_ns(unsigned long flags,
        struct user_namespace *user_ns, struct pid_namespace *ns)
{
        if (flags & CLONE_NEWPID)
                return ERR_PTR(-EINVAL);

        return ns;
}

/* !CONFIG_PID_NS stub: nothing to release. */
static inline void put_pid_ns(struct pid_namespace *ns)
{
}

/* !CONFIG_PID_NS stub: calls BUG() unconditionally if it is ever reached. */
static inline void zap_pid_ns_processes(struct pid_namespace *ns)
{
        BUG();
}

/* !CONFIG_PID_NS stub: ignores both arguments and returns 0. */
static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
        return 0;
}
#endif /* CONFIG_PID_NS */

extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
void pid_idr_init(void);

/* True when the active PID namespace of @tsk is init_pid_ns. */
static inline bool task_is_in_init_pid_ns(struct task_struct *tsk)
{
        struct pid_namespace *active = task_active_pid_ns(tsk);

        return active == &init_pid_ns;
}

#endif /* _LINUX_PID_NS_H */





















































































































































































    1 













    1 




    1 
    1 
    1 





    1 

























































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
// SPDX-License-Identifier: GPL-2.0
#include <linux/memcontrol.h>
#include <linux/rwsem.h>
#include <linux/shrinker.h>
#include <linux/rculist.h>
#include <trace/events/vmscan.h>

#include "internal.h"

LIST_HEAD(shrinker_list);
DEFINE_MUTEX(shrinker_mutex);

#ifdef CONFIG_MEMCG
static int shrinker_nr_max;

/*
 * Size in bytes of the array of unit pointers needed to cover @nr_items
 * shrinker ids: one pointer per SHRINKER_UNIT_BITS ids, rounded up.
 */
static inline int shrinker_unit_size(int nr_items)
{
        int nr_units = DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS);

        return nr_units * sizeof(struct shrinker_info_unit *);
}

/*
 * Free the units of @info starting at index @start. The walk stops at the
 * first empty slot or at the end of the unit array, and every freed slot
 * is reset to NULL. A NULL @info is silently ignored.
 */
static inline void shrinker_unit_free(struct shrinker_info *info, int start)
{
        struct shrinker_info_unit **slots;
        int limit, idx;

        if (!info)
                return;

        slots = info->unit;
        limit = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);

        for (idx = start; idx < limit && slots[idx]; idx++) {
                kfree(slots[idx]);
                slots[idx] = NULL;
        }
}

/*
 * Allocate the units of @new that are not already covered by @old (all of
 * them when @old is NULL), zeroed and on node @nid.
 *
 * Returns 0 on success. On allocation failure the units allocated by this
 * call are freed again and -ENOMEM is returned.
 */
static inline int shrinker_unit_alloc(struct shrinker_info *new,
                                       struct shrinker_info *old, int nid)
{
        int end = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
        int first = 0;
        int idx;

        if (old)
                first = DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS);

        for (idx = first; idx < end; idx++) {
                struct shrinker_info_unit *fresh;

                fresh = kzalloc_node(sizeof(*fresh), GFP_KERNEL, nid);
                if (fresh) {
                        new->unit[idx] = fresh;
                        continue;
                }

                /* Undo the partial allocation before reporting failure. */
                shrinker_unit_free(new, first);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Tear down the shrinker_info of @memcg on every node: free its units,
 * free the info itself and reset the per-node pointer to NULL.
 */
void free_shrinker_info(struct mem_cgroup *memcg)
{
        int nid;

        for_each_node(nid) {
                struct mem_cgroup_per_node *node = memcg->nodeinfo[nid];
                struct shrinker_info *victim;

                victim = rcu_dereference_protected(node->shrinker_info, true);
                shrinker_unit_free(victim, 0);
                kvfree(victim);
                rcu_assign_pointer(node->shrinker_info, NULL);
        }
}

/*
 * Allocate a shrinker_info, sized for the current shrinker_nr_max, for
 * every node of @memcg and publish it in memcg->nodeinfo[nid].
 *
 * Runs under shrinker_mutex so that shrinker_nr_max cannot change while
 * the infos are being sized and installed.
 *
 * Returns 0 on success. On allocation failure everything already published
 * for @memcg is released again and -ENOMEM is returned.
 */
int alloc_shrinker_info(struct mem_cgroup *memcg)
{
        struct shrinker_info *info;
        int nid;
        int array_size;

        mutex_lock(&shrinker_mutex);
        array_size = shrinker_unit_size(shrinker_nr_max);
        for_each_node(nid) {
                info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid);
                if (!info)
                        goto err;
                info->map_nr_max = shrinker_nr_max;
                if (shrinker_unit_alloc(info, NULL, nid)) {
                        /*
                         * @info has not been published yet, so the
                         * free_shrinker_info() call below cannot find it.
                         * Free it here or it is leaked.
                         */
                        kvfree(info);
                        goto err;
                }
                rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
        }
        mutex_unlock(&shrinker_mutex);

        return 0;

err:
        mutex_unlock(&shrinker_mutex);
        free_shrinker_info(memcg);
        return -ENOMEM;
}

static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
                                                     int nid)
{
        return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
                                         lockdep_is_held(&shrinker_mutex));
}

/*
 * Grow the per-node shrinker_info of @memcg so that it covers @new_nr_max
 * shrinker ids.
 *
 * @new_size and @old_size are the byte sizes of the new and the current
 * unit pointer arrays. For each node a larger info is allocated, the
 * existing unit pointers are copied over, the missing units are allocated,
 * and the new info replaces the old one, which is freed after an RCU grace
 * period.
 *
 * Returns 0 on success (also when a node has no info yet, in which case the
 * walk stops), or -ENOMEM on allocation failure.
 */
static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
                                    int old_size, int new_nr_max)
{
        int nid;

        for_each_node(nid) {
                struct mem_cgroup_per_node *node = memcg->nodeinfo[nid];
                struct shrinker_info *cur = shrinker_info_protected(memcg, nid);
                struct shrinker_info *grown;

                /* Not yet online memcg */
                if (!cur)
                        return 0;

                /* Already expanded this shrinker_info */
                if (cur->map_nr_max >= new_nr_max)
                        continue;

                grown = kvzalloc_node(sizeof(*grown) + new_size, GFP_KERNEL, nid);
                if (!grown)
                        return -ENOMEM;

                grown->map_nr_max = new_nr_max;

                /* Reuse the existing units, then allocate only the extra ones. */
                memcpy(grown->unit, cur->unit, old_size);
                if (shrinker_unit_alloc(grown, cur, nid)) {
                        kvfree(grown);
                        return -ENOMEM;
                }

                rcu_assign_pointer(node->shrinker_info, grown);
                kvfree_rcu(cur, rcu);
        }

        return 0;
}

static int expand_shrinker_info(int new_id)
{
        int ret = 0;
        int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
        int new_size, old_size = 0;
        struct mem_cgroup *memcg;

        if (!root_mem_cgroup)
                goto out;

        lockdep_assert_held(&shrinker_mutex);

        new_size = shrinker_unit_size(new_nr_max);
        old_size = shrinker_unit_size(shrinker_nr_max);

        memcg = mem_cgroup_iter(NULL, NULL, NULL);
        do {
                ret = expand_one_shrinker_info(memcg, new_size, old_size,
                                               new_nr_max);
                if (ret) {
                        mem_cgroup_iter_break(NULL, memcg);
                        goto out;
                }
        } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
out:
        if (!ret)
                shrinker_nr_max = new_nr_max;

        return ret;
}

/* Index of the unit that holds @shrinker_id. */
static inline int shrinker_id_to_index(int shrinker_id)
{
        int index = shrinker_id / SHRINKER_UNIT_BITS;

        return index;
}

/* Position of @shrinker_id inside its unit. */
static inline int shrinker_id_to_offset(int shrinker_id)
{
        int offset = shrinker_id % SHRINKER_UNIT_BITS;

        return offset;
}

/* Rebuild a shrinker id from a unit @index and an @offset inside that unit. */
static inline int calc_shrinker_id(int index, int offset)
{
        int unit_base = index * SHRINKER_UNIT_BITS;

        return unit_base + offset;
}

/*
 * Mark shrinker @shrinker_id as having work for @memcg on node @nid by
 * setting its bit in the memcg's shrinker_info.
 *
 * Nothing is done for a negative id, a NULL @memcg or the root memcg. An id
 * beyond the current map size triggers a one-time warning and is ignored.
 */
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
        if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
                struct shrinker_info *info;
                struct shrinker_info_unit *unit;

                rcu_read_lock();
                info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
                if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
                        /*
                         * Only index the unit array once the id is known to
                         * be in range, so an out-of-range id never reads
                         * past the end of info->unit[].
                         */
                        unit = info->unit[shrinker_id_to_index(shrinker_id)];
                        /* Pairs with smp mb in shrink_slab() */
                        smp_mb__before_atomic();
                        set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
                }
                rcu_read_unlock();
        }
}

static DEFINE_IDR(shrinker_idr);

/*
 * Assign a memcg shrinker id to @shrinker.
 *
 * An id is taken from shrinker_idr under shrinker_mutex; when it does not
 * fit into the current maps, all shrinker_infos are expanded first.
 *
 * Returns 0 on success, -ENOSYS when memcg is disabled, or -ENOMEM when the
 * id allocation or the expansion fails (the id is released again in the
 * latter case).
 */
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
        int ret = -ENOMEM;
        int id;

        if (mem_cgroup_disabled())
                return -ENOSYS;

        mutex_lock(&shrinker_mutex);
        id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
        if (id >= 0) {
                if (id >= shrinker_nr_max && expand_shrinker_info(id)) {
                        idr_remove(&shrinker_idr, id);
                } else {
                        shrinker->id = id;
                        ret = 0;
                }
        }
        mutex_unlock(&shrinker_mutex);

        return ret;
}

/*
 * Give the memcg shrinker id of @shrinker back to shrinker_idr.
 * The caller must hold shrinker_mutex; a negative id is a bug.
 */
static void shrinker_memcg_remove(struct shrinker *shrinker)
{
        BUG_ON(shrinker->id < 0);

        lockdep_assert_held(&shrinker_mutex);

        idr_remove(&shrinker_idr, shrinker->id);
}

/*
 * Atomically fetch and zero the deferred count that @memcg keeps for
 * @shrinker on node @nid. The shrinker_info is looked up under RCU.
 */
static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
                                   struct mem_cgroup *memcg)
{
        const int index = shrinker_id_to_index(shrinker->id);
        const int offset = shrinker_id_to_offset(shrinker->id);
        struct shrinker_info *info;
        long old;

        rcu_read_lock();
        info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
        old = atomic_long_xchg(&info->unit[index]->nr_deferred[offset], 0);
        rcu_read_unlock();

        return old;
}

/*
 * Atomically add @nr to the deferred count that @memcg keeps for @shrinker
 * on node @nid and return the new total. The shrinker_info is looked up
 * under RCU.
 */
static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
                                  struct mem_cgroup *memcg)
{
        const int index = shrinker_id_to_index(shrinker->id);
        const int offset = shrinker_id_to_offset(shrinker->id);
        struct shrinker_info *info;
        long total;

        rcu_read_lock();
        info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
        total = atomic_long_add_return(nr, &info->unit[index]->nr_deferred[offset]);
        rcu_read_unlock();

        return total;
}

/*
 * Add every deferred count of @memcg to the matching counter of its parent
 * (or of root_mem_cgroup when there is no parent), for all nodes and all
 * shrinker slots.
 */
void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
        struct mem_cgroup *parent = parent_mem_cgroup(memcg);
        int nid;

        if (!parent)
                parent = root_mem_cgroup;

        /* Prevent from concurrent shrinker_info expand */
        mutex_lock(&shrinker_mutex);
        for_each_node(nid) {
                struct shrinker_info *src = shrinker_info_protected(memcg, nid);
                struct shrinker_info *dst = shrinker_info_protected(parent, nid);
                int nr_units = shrinker_id_to_index(src->map_nr_max);
                int index;

                for (index = 0; index < nr_units; index++) {
                        struct shrinker_info_unit *from = src->unit[index];
                        struct shrinker_info_unit *to = dst->unit[index];
                        int offset;

                        for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
                                long pending = atomic_long_read(&from->nr_deferred[offset]);

                                atomic_long_add(pending, &to->nr_deferred[offset]);
                        }
                }
        }
        mutex_unlock(&shrinker_mutex);
}
#else
/*
 * !CONFIG_MEMCG stub: no memcg shrinker id can be assigned, so always
 * report -ENOSYS.
 */
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
        return -ENOSYS;
}

/* !CONFIG_MEMCG stub: there is no memcg shrinker id to release. */
static void shrinker_memcg_remove(struct shrinker *shrinker)
{
}

/* !CONFIG_MEMCG stub: there is no per-memcg deferred count, always 0. */
static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
                                   struct mem_cgroup *memcg)
{
        return 0;
}

/*
 * !CONFIG_MEMCG stub: there is no per-memcg deferred count; @nr is dropped
 * and 0 is returned.
 */
static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
                                  struct mem_cgroup *memcg)
{
        return 0;
}
#endif /* CONFIG_MEMCG */

/*
 * Atomically fetch and zero the deferred scan count of @shrinker for the
 * context described by @sc.
 *
 * Shrinkers without SHRINKER_NUMA_AWARE always use node 0. When @sc targets
 * a memcg and the shrinker is SHRINKER_MEMCG_AWARE, the per-memcg counter
 * is used instead of the shrinker's own array.
 */
static long xchg_nr_deferred(struct shrinker *shrinker,
                             struct shrink_control *sc)
{
        const int nid = (shrinker->flags & SHRINKER_NUMA_AWARE) ? sc->nid : 0;

        if ((shrinker->flags & SHRINKER_MEMCG_AWARE) && sc->memcg)
                return xchg_nr_deferred_memcg(nid, shrinker, sc->memcg);

        return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
}


/*
 * Atomically add @nr to the deferred scan count of @shrinker for the
 * context described by @sc and return the new total.
 *
 * Shrinkers without SHRINKER_NUMA_AWARE always use node 0. When @sc targets
 * a memcg and the shrinker is SHRINKER_MEMCG_AWARE, the per-memcg counter
 * is used instead of the shrinker's own array.
 */
static long add_nr_deferred(long nr, struct shrinker *shrinker,
                            struct shrink_control *sc)
{
        const int nid = (shrinker->flags & SHRINKER_NUMA_AWARE) ? sc->nid : 0;

        if ((shrinker->flags & SHRINKER_MEMCG_AWARE) && sc->memcg)
                return add_nr_deferred_memcg(nr, nid, shrinker, sc->memcg);

        return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}

#define SHRINK_BATCH 128

/*
 * Run a single shrinker once for the context in @shrinkctl.
 *
 * The shrinker is first asked how many objects are freeable. From that
 * count, @priority, shrinker->seeks and the work deferred by earlier calls
 * a scan target is derived, and ->scan_objects() is then invoked in batches
 * of shrinker->batch (SHRINK_BATCH when that is 0) until the target is
 * used up or the shrinker returns SHRINK_STOP. Work that was not done is
 * stored back as deferred work, capped at twice the freeable count.
 *
 * Returns the sum of the values returned by ->scan_objects(), or the raw
 * ->count_objects() result when that is 0 or SHRINK_EMPTY.
 */
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
                                    struct shrinker *shrinker, int priority)
{
        unsigned long freed = 0;
        unsigned long long delta;
        long total_scan;
        long freeable;
        long nr;
        long new_nr;
        long batch_size = shrinker->batch ? shrinker->batch
                                          : SHRINK_BATCH;
        long scanned = 0, next_deferred;

        freeable = shrinker->count_objects(shrinker, shrinkctl);
        if (freeable == 0 || freeable == SHRINK_EMPTY)
                return freeable;

        /*
         * copy the current shrinker scan count into a local variable
         * and zero it so that other concurrent shrinker invocations
         * don't also do this scanning work.
         */
        nr = xchg_nr_deferred(shrinker, shrinkctl);

        if (shrinker->seeks) {
                /* New work: (freeable >> priority) * 4 / seeks. */
                delta = freeable >> priority;
                delta *= 4;
                do_div(delta, shrinker->seeks);
        } else {
                /*
                 * These objects don't require any IO to create. Trim
                 * them aggressively under memory pressure to keep
                 * them from causing refetches in the IO caches.
                 */
                delta = freeable / 2;
        }

        /* Scan target: scaled-down deferred work plus new work, capped. */
        total_scan = nr >> priority;
        total_scan += delta;
        total_scan = min(total_scan, (2 * freeable));

        trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
                                   freeable, delta, total_scan, priority);

        /*
         * Normally, we should not scan less than batch_size objects in one
         * pass to avoid too frequent shrinker calls, but if the slab has less
         * than batch_size objects in total and we are really tight on memory,
         * we will try to reclaim all available objects, otherwise we can end
         * up failing allocations although there are plenty of reclaimable
         * objects spread over several slabs with usage less than the
         * batch_size.
         *
         * We detect the "tight on memory" situations by looking at the total
         * number of objects we want to scan (total_scan). If it is greater
         * than the total number of objects on slab (freeable), we must be
         * scanning at high prio and therefore should try to reclaim as much as
         * possible.
         */
        while (total_scan >= batch_size ||
               total_scan >= freeable) {
                unsigned long ret;
                unsigned long nr_to_scan = min(batch_size, total_scan);

                /*
                 * nr_scanned is preset to the request; the accounting below
                 * reads it back after the call.
                 */
                shrinkctl->nr_to_scan = nr_to_scan;
                shrinkctl->nr_scanned = nr_to_scan;
                ret = shrinker->scan_objects(shrinker, shrinkctl);
                if (ret == SHRINK_STOP)
                        break;
                freed += ret;

                count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
                total_scan -= shrinkctl->nr_scanned;
                scanned += shrinkctl->nr_scanned;

                cond_resched();
        }

        /*
         * The deferred work is increased by any new work (delta) that wasn't
         * done, decreased by old deferred work that was done now.
         *
         * And it is capped to two times of the freeable items.
         */
        next_deferred = max_t(long, (nr + delta - scanned), 0);
        next_deferred = min(next_deferred, (2 * freeable));

        /*
         * move the unused scan count back into the shrinker in a
         * manner that handles concurrent updates.
         */
        new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);

        trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
        return freed;
}

#ifdef CONFIG_MEMCG
/*
 * Run the memcg-aware shrinkers that have their bit set in the shrinker map
 * of @memcg on node @nid.
 *
 * Returns the number of objects freed, or 0 when @memcg is not online.
 */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                        struct mem_cgroup *memcg, int priority)
{
        struct shrinker_info *info;
        unsigned long ret, freed = 0;
        int offset, index = 0;

        if (!mem_cgroup_online(memcg))
                return 0;

        /*
         * lockless algorithm of memcg shrink.
         *
         * The shrinker_info may be freed asynchronously via RCU in the
         * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used
         * to ensure the existence of the shrinker_info.
         *
         * The shrinker_info_unit is never freed unless its corresponding memcg
         * is destroyed. Here we already hold the refcount of memcg, so the
         * memcg will not be destroyed, and of course shrinker_info_unit will
         * not be freed.
         *
         * So in the memcg shrink:
         *  step 1: use rcu_read_lock() to guarantee existence of the
         *          shrinker_info.
         *  step 2: after getting shrinker_info_unit we can safely release the
         *          RCU lock.
         *  step 3: traverse the bitmap and calculate shrinker_id
         *  step 4: use rcu_read_lock() to guarantee existence of the shrinker.
         *  step 5: use shrinker_id to find the shrinker, then use
         *          shrinker_try_get() to guarantee existence of the shrinker,
         *          then we can release the RCU lock to do do_shrink_slab() that
         *          may sleep.
         *  step 6: do shrinker_put() paired with step 5 to put the refcount,
         *          if the refcount reaches 0, then wake up the waiter in
         *          shrinker_free() by calling complete().
         *          Note: here is different from the global shrink, we don't
         *                need to acquire the RCU lock to guarantee existence of
         *                the shrinker, because we don't need to use this
         *                shrinker to traverse the next shrinker in the bitmap.
         *  step 7: we have already exited the read-side of rcu critical section
         *          before calling do_shrink_slab(), the shrinker_info may be
         *          released in expand_one_shrinker_info(), so go back to step 1
         *          to reacquire the shrinker_info.
         */
again:
        rcu_read_lock();
        info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
        if (unlikely(!info))
                goto unlock;

        if (index < shrinker_id_to_index(info->map_nr_max)) {
                struct shrinker_info_unit *unit;

                unit = info->unit[index];

                rcu_read_unlock();

                for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
                        struct shrink_control sc = {
                                .gfp_mask = gfp_mask,
                                .nid = nid,
                                .memcg = memcg,
                        };
                        struct shrinker *shrinker;
                        int shrinker_id = calc_shrinker_id(index, offset);

                        rcu_read_lock();
                        shrinker = idr_find(&shrinker_idr, shrinker_id);
                        if (unlikely(!shrinker || !shrinker_try_get(shrinker))) {
                                clear_bit(offset, unit->map);
                                rcu_read_unlock();
                                continue;
                        }
                        rcu_read_unlock();

                        /* Call non-slab shrinkers even though kmem is disabled */
                        if (!memcg_kmem_online() &&
                            !(shrinker->flags & SHRINKER_NONSLAB)) {
                                /*
                                 * Drop the reference taken by
                                 * shrinker_try_get() above; skipping the
                                 * shrinker without it would leak the
                                 * refcount and leave shrinker_free() waiting
                                 * forever.
                                 */
                                shrinker_put(shrinker);
                                continue;
                        }

                        ret = do_shrink_slab(&sc, shrinker, priority);
                        if (ret == SHRINK_EMPTY) {
                                clear_bit(offset, unit->map);
                                /*
                                 * After the shrinker reported that it had no objects to
                                 * free, but before we cleared the corresponding bit in
                                 * the memcg shrinker map, a new object might have been
                                 * added. To make sure, we have the bit set in this
                                 * case, we invoke the shrinker one more time and reset
                                 * the bit if it reports that it is not empty anymore.
                                 * The memory barrier here pairs with the barrier in
                                 * set_shrinker_bit():
                                 *
                                 * list_lru_add()     shrink_slab_memcg()
                                 *   list_add_tail()    clear_bit()
                                 *   <MB>               <MB>
                                 *   set_bit()          do_shrink_slab()
                                 */
                                smp_mb__after_atomic();
                                ret = do_shrink_slab(&sc, shrinker, priority);
                                if (ret == SHRINK_EMPTY)
                                        ret = 0;
                                else
                                        set_shrinker_bit(memcg, nid, shrinker_id);
                        }
                        freed += ret;
                        shrinker_put(shrinker);
                }

                index++;
                goto again;
        }
unlock:
        rcu_read_unlock();
        return freed;
}
#else /* !CONFIG_MEMCG */
/* !CONFIG_MEMCG stub: there are no per-memcg shrinkers to run, always 0. */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                        struct mem_cgroup *memcg, int priority)
{
        return 0;
}
#endif /* CONFIG_MEMCG */

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority, we take the number of objects and >> by priority
 * in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
                          int priority)
{
        unsigned long ret, freed = 0;
        struct shrinker *shrinker;

        /*
         * The root memcg might be allocated even though memcg is disabled
         * via "cgroup_disable=memory" boot parameter.  This could make
         * mem_cgroup_is_root() return false, then just run memcg slab
         * shrink, but skip global shrink.  This may result in premature
         * oom.
         */
        if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
                return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

        /*
         * lockless algorithm of global shrink.
         *
         * In the unregistration step, the shrinker will be freed asynchronously
         * via RCU after its refcount reaches 0. So both rcu_read_lock() and
         * shrinker_try_get() can be used to ensure the existence of the shrinker.
         *
         * So in the global shrink:
         *  step 1: use rcu_read_lock() to guarantee existence of the shrinker
         *          and the validity of the shrinker_list walk.
         *  step 2: use shrinker_try_get() to try get the refcount, if successful,
         *          then the existence of the shrinker can also be guaranteed,
         *          so we can release the RCU lock to do do_shrink_slab() that
         *          may sleep.
         *  step 3: *MUST* reacquire the RCU lock before calling shrinker_put(),
         *          which ensures that neither this shrinker nor the next shrinker
         *          will be freed in the next traversal operation.
         *  step 4: do shrinker_put() paired with step 2 to put the refcount,
         *          if the refcount reaches 0, then wake up the waiter in
         *          shrinker_free() by calling complete().
         */
        rcu_read_lock();
        list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
                struct shrink_control sc = {
                        .gfp_mask = gfp_mask,
                        .nid = nid,
                        .memcg = memcg,
                };

                /* Skip shrinkers whose reference could not be taken. */
                if (!shrinker_try_get(shrinker))
                        continue;

                rcu_read_unlock();

                ret = do_shrink_slab(&sc, shrinker, priority);
                /* SHRINK_EMPTY is not a count; it contributes nothing. */
                if (ret == SHRINK_EMPTY)
                        ret = 0;
                freed += ret;

                rcu_read_lock();
                shrinker_put(shrinker);
        }

        rcu_read_unlock();
        cond_resched();
        return freed;
}

/*
 * Allocate a shrinker with the given @flags and a debugfs name built from
 * the printf-style @fmt and its arguments.
 *
 * The shrinker gets SHRINKER_ALLOCATED set and DEFAULT_SEEKS as its seeks
 * value. For SHRINKER_MEMCG_AWARE shrinkers a memcg id is allocated; when
 * that reports -ENOSYS the flag is cleared and the shrinker falls back to
 * the non-memcg setup with its own nr_deferred array.
 *
 * Returns the new shrinker, or NULL on failure.
 */
struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
{
        struct shrinker *shrinker;
        unsigned int size;
        va_list ap;
        int err;

        shrinker = kzalloc(sizeof(*shrinker), GFP_KERNEL);
        if (!shrinker)
                return NULL;

        va_start(ap, fmt);
        err = shrinker_debugfs_name_alloc(shrinker, fmt, ap);
        va_end(ap);
        if (err)
                goto err_name;

        shrinker->seeks = DEFAULT_SEEKS;
        shrinker->flags = flags | SHRINKER_ALLOCATED;

        if (flags & SHRINKER_MEMCG_AWARE) {
                err = shrinker_memcg_alloc(shrinker);
                if (!err)
                        return shrinker;
                if (err != -ENOSYS)
                        goto err_flags;

                /* Memcg is not supported, fallback to non-memcg-aware shrinker. */
                shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
        }

        /*
         * The nr_deferred is available on per memcg level for memcg aware
         * shrinkers, so only allocate nr_deferred in the following cases:
         *  - non-memcg-aware shrinkers
         *  - !CONFIG_MEMCG
         *  - memcg is disabled by kernel command line
         */
        size = sizeof(*shrinker->nr_deferred);
        if (flags & SHRINKER_NUMA_AWARE)
                size *= nr_node_ids;

        shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
        if (shrinker->nr_deferred)
                return shrinker;

err_flags:
        shrinker_debugfs_name_free(shrinker);
err_name:
        kfree(shrinker);
        return NULL;
}
EXPORT_SYMBOL_GPL(shrinker_alloc);

/*
 * Make @shrinker visible to reclaim.
 *
 * The shrinker must carry SHRINKER_ALLOCATED; otherwise a warning is logged
 * and nothing is registered. Under shrinker_mutex it is appended to
 * shrinker_list, marked SHRINKER_REGISTERED and added to debugfs. Its
 * completion and initial reference are set up afterwards.
 */
void shrinker_register(struct shrinker *shrinker)
{
        if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
                /* Terminate the message so it is emitted as a complete line. */
                pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker\n");
                return;
        }

        mutex_lock(&shrinker_mutex);
        list_add_tail_rcu(&shrinker->list, &shrinker_list);
        shrinker->flags |= SHRINKER_REGISTERED;
        shrinker_debugfs_add(shrinker);
        mutex_unlock(&shrinker_mutex);

        init_completion(&shrinker->done);
        /*
         * Now the shrinker is fully set up, take the first reference to it to
         * indicate that lookup operations are now allowed to use it via
         * shrinker_try_get().
         */
        refcount_set(&shrinker->refcount, 1);
}
EXPORT_SYMBOL_GPL(shrinker_register);

/*
 * RCU callback queued by shrinker_free(): releases the nr_deferred array
 * and the shrinker that embeds @head.
 */
static void shrinker_free_rcu_cb(struct rcu_head *head)
{
        struct shrinker *victim = container_of(head, struct shrinker, rcu);

        kfree(victim->nr_deferred);
        kfree(victim);
}

/*
 * Unregister (if registered) and free @shrinker. A NULL @shrinker is
 * ignored.
 *
 * For a registered shrinker the initial reference is dropped and the
 * function waits until all users are done before unlinking it. The debugfs
 * name and, for memcg-aware shrinkers, the memcg id are released under
 * shrinker_mutex. The structure itself is freed from an RCU callback.
 */
void shrinker_free(struct shrinker *shrinker)
{
        struct dentry *debugfs_entry = NULL;
        int debugfs_id;

        if (!shrinker)
                return;

        if (shrinker->flags & SHRINKER_REGISTERED) {
                /* drop the initial refcount */
                shrinker_put(shrinker);
                /*
                 * Wait for all lookups of the shrinker to complete, after that,
                 * no shrinker is running or will run again, then we can safely
                 * free it asynchronously via RCU and safely free the structure
                 * where the shrinker is located, such as super_block etc.
                 */
                wait_for_completion(&shrinker->done);
        }

        mutex_lock(&shrinker_mutex);
        if (shrinker->flags & SHRINKER_REGISTERED) {
                /*
                 * Now we can safely remove it from the shrinker_list and then
                 * free it.
                 */
                list_del_rcu(&shrinker->list);
                debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
                shrinker->flags &= ~SHRINKER_REGISTERED;
        }

        shrinker_debugfs_name_free(shrinker);

        if (shrinker->flags & SHRINKER_MEMCG_AWARE)
                shrinker_memcg_remove(shrinker);
        mutex_unlock(&shrinker_mutex);

        /* The detached debugfs entry is removed after the mutex is dropped. */
        if (debugfs_entry)
                shrinker_debugfs_remove(debugfs_entry, debugfs_id);

        call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
}
EXPORT_SYMBOL_GPL(shrinker_free);

































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_BITOPS_LE_H_
#define _ASM_GENERIC_BITOPS_LE_H_

#include <asm/types.h>
#include <asm/byteorder.h>

#if defined(__LITTLE_ENDIAN)

#define BITOP_LE_SWIZZLE        0

#elif defined(__BIG_ENDIAN)

#define BITOP_LE_SWIZZLE        ((BITS_PER_LONG-1) & ~0x7)

#endif


/*
 * Test bit @nr of the little-endian bitmap at @addr. The bit number is
 * XORed with BITOP_LE_SWIZZLE before being handed to test_bit().
 */
static inline int test_bit_le(int nr, const void *addr)
{
        const int le_nr = nr ^ BITOP_LE_SWIZZLE;

        return test_bit(le_nr, addr);
}

/*
 * Set bit @nr of the little-endian bitmap at @addr via set_bit(), after
 * XORing the bit number with BITOP_LE_SWIZZLE.
 */
static inline void set_bit_le(int nr, void *addr)
{
        const int le_nr = nr ^ BITOP_LE_SWIZZLE;

        set_bit(le_nr, addr);
}

/*
 * Clear bit @nr of the little-endian bitmap at @addr via clear_bit(), after
 * XORing the bit number with BITOP_LE_SWIZZLE.
 */
static inline void clear_bit_le(int nr, void *addr)
{
        const int le_nr = nr ^ BITOP_LE_SWIZZLE;

        clear_bit(le_nr, addr);
}

/*
 * Set bit @nr of the little-endian bitmap at @addr via __set_bit(), after
 * XORing the bit number with BITOP_LE_SWIZZLE.
 */
static inline void __set_bit_le(int nr, void *addr)
{
        const int le_nr = nr ^ BITOP_LE_SWIZZLE;

        __set_bit(le_nr, addr);
}

/*
 * Clear bit @nr of the little-endian bitmap at @addr via __clear_bit(),
 * after XORing the bit number with BITOP_LE_SWIZZLE.
 */
static inline void __clear_bit_le(int nr, void *addr)
{
        const int le_nr = nr ^ BITOP_LE_SWIZZLE;

        __clear_bit(le_nr, addr);
}

/*
 * Set bit @nr of the little-endian bitmap at @addr and return the result of
 * test_and_set_bit() for the bit number XORed with BITOP_LE_SWIZZLE.
 */
static inline int test_and_set_bit_le(int nr, void *addr)
{
        const int le_nr = nr ^ BITOP_LE_SWIZZLE;

        return test_and_set_bit(le_nr, addr);
}

/*
 * Clear bit @nr of the little-endian bitmap at @addr and return the result
 * of test_and_clear_bit() for the bit number XORed with BITOP_LE_SWIZZLE.
 */
static inline int test_and_clear_bit_le(int nr, void *addr)
{
        const int le_nr = nr ^ BITOP_LE_SWIZZLE;

        return test_and_clear_bit(le_nr, addr);
}

/*
 * Set bit @nr of the little-endian bitmap at @addr and return the result of
 * __test_and_set_bit() for the bit number XORed with BITOP_LE_SWIZZLE.
 */
static inline int __test_and_set_bit_le(int nr, void *addr)
{
        const int le_nr = nr ^ BITOP_LE_SWIZZLE;

        return __test_and_set_bit(le_nr, addr);
}

/*
 * Clear bit @nr of the little-endian bitmap at @addr and return the result
 * of __test_and_clear_bit() for the bit number XORed with BITOP_LE_SWIZZLE.
 */
static inline int __test_and_clear_bit_le(int nr, void *addr)
{
        const int le_nr = nr ^ BITOP_LE_SWIZZLE;

        return __test_and_clear_bit(le_nr, addr);
}

#endif /* _ASM_GENERIC_BITOPS_LE_H_ */










































































































    1 











    1 










    1 
















    1 






































































    9 




    9 


    8 






























    7 
    3 






    9 





















    1 























    1 











    1 
























    1 
    1 




    1 


    1 





















    1 
    1 







    1 


    1 







    1 



















































    1 
    1 





















































    8 







    9 





    8 





















    8 

    9 

    1 

    8 



    8 
















    9 





    2 
    2 


    2 





























    8 






    8 












    9 


    9 
















    2 
    2 


    2 



































































































































































    2 































































































































































    1 








































   24 














   21 


   22 
   25 
   23 












































































































    3 





    1 

   20 



   21 
   22 




















    5 




    4 
    5 


    5 














































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/file.c
 *
 *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
 *
 *  Manage the dynamic fd arrays in the process files_struct.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
#include <net/sock.h>

#include "internal.h"

/*
 * Limit on fd numbers: expand_files() fails with -EMFILE for any fd at or
 * above this value, and alloc_fdtable() caps the size of a new table to it
 * (rounded up to a multiple of BITS_PER_LONG).
 */
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
/* NOTE(review): presumably the lower bound enforced on sysctl_nr_open - confirm against the sysctl table */
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
#define __const_min(x, y) ((x) < (y) ? (x) : (y))
/*
 * The smaller of INT_MAX and the number of pointers that fit in a size_t,
 * rounded down to a multiple of BITS_PER_LONG (the "& -BITS_PER_LONG").
 * NOTE(review): presumably the upper bound enforced on sysctl_nr_open - confirm.
 */
unsigned int sysctl_nr_open_max =
        __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;

/*
 * Release a dynamically allocated fdtable: the fd pointer array, the bitmap
 * allocation and finally the descriptor itself.
 */
static void __free_fdtable(struct fdtable *fdt)
{
        void *fd_array = fdt->fd;
        /*
         * alloc_fdtable() carves close_on_exec and full_fds_bits out of the
         * open_fds allocation, so freeing open_fds releases all three bitmaps.
         */
        void *bitmaps = fdt->open_fds;

        kvfree(fd_array);
        kvfree(bitmaps);
        kfree(fdt);
}

/* RCU callback: free the fdtable that embeds @rcu. */
static void free_fdtable_rcu(struct rcu_head *rcu)
{
        struct fdtable *stale = container_of(rcu, struct fdtable, rcu);

        __free_fdtable(stale);
}

/*
 * full_fds_bits holds one bit per 'unsigned long' word of open_fds, so a
 * table of 'nr' fds needs BITBIT_NR(nr) longs, i.e. BITBIT_SIZE(nr) bytes,
 * for that second-level bitmap.
 */
#define BITBIT_NR(nr)        BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr)        (BITBIT_NR(nr) * sizeof(long))

/*
 * Copy 'count' fd bits from the old table to the new table and clear the extra
 * space if any.  This does not copy the file pointers.  Called with the files
 * spinlock held for write.
 */
static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
                            unsigned int count)
{
        unsigned int cpy, set;

        cpy = count / BITS_PER_BYTE;
        set = (nfdt->max_fds - count) / BITS_PER_BYTE;
        memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
        memset((char *)nfdt->open_fds + cpy, 0, set);
        memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
        memset((char *)nfdt->close_on_exec + cpy, 0, set);

        cpy = BITBIT_SIZE(count);
        set = BITBIT_SIZE(nfdt->max_fds) - cpy;
        memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
        memset((char *)nfdt->full_fds_bits + cpy, 0, set);
}

/*
 * Populate the freshly allocated, larger table @nfdt from @ofdt: every file
 * pointer slot and all bitmap bits of the old table are copied, and whatever
 * the new table has beyond the old size is zeroed.  Called with the files
 * spinlock held for write.
 */
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
        size_t old_bytes, extra_bytes;

        /* the new table must never be smaller than the one it replaces */
        BUG_ON(nfdt->max_fds < ofdt->max_fds);

        old_bytes = ofdt->max_fds * sizeof(struct file *);
        extra_bytes = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);

        /* file pointers first: copy the old slots, zero the new tail */
        memcpy(nfdt->fd, ofdt->fd, old_bytes);
        memset((char *)nfdt->fd + old_bytes, 0, extra_bytes);

        /* then the open/close-on-exec/full bitmaps */
        copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
}

/*
 * Note how the fdtable bitmap allocations very much have to be a multiple of
 * BITS_PER_LONG. This is not only because we walk those things in chunks of
 * 'unsigned long' in some places, but simply because that is how the Linux
 * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
 * they are very much "bits in an array of unsigned long".
 *
 * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
 * by that "1024/sizeof(ptr)" before, we already know there are sufficient
 * clear low bits. Clang seems to realize that, gcc ends up being confused.
 *
 * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
 * let's consider it documentation (and maybe a test-case for gcc to improve
 * its code generation ;)
 */
/*
 * Allocate an fdtable able to hold fd index 'nr' (the callers in this file
 * pass the highest index they need).  Returns the new table, or NULL if any
 * of the allocations fails.  Nothing is zeroed here: the callers fill the fd
 * array and the bitmaps themselves.  If sysctl_nr_open is smaller than the
 * computed size, the returned table may be too small for 'nr'; callers check
 * max_fds of the result.
 */
static struct fdtable * alloc_fdtable(unsigned int nr)
{
        struct fdtable *fdt;
        void *data;

        /*
         * Figure out how many fds we actually want to support in this fdtable.
         * Allocation steps are keyed to the size of the fdarray, since it
         * grows far faster than any of the other dynamic data. We try to fit
         * the fdarray into comfortable page-tuned chunks: starting at 1024B
         * and growing in powers of two from there on.
         */
        nr /= (1024 / sizeof(struct file *));
        nr = roundup_pow_of_two(nr + 1);
        nr *= (1024 / sizeof(struct file *));
        nr = ALIGN(nr, BITS_PER_LONG);
        /*
         * Note that this can drive nr *below* what we had passed if sysctl_nr_open
         * had been set lower between the check in expand_files() and here.  Deal
         * with that in caller, it's cheaper that way.
         *
         * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
         * bitmaps handling below becomes unpleasant, to put it mildly...
         */
        if (unlikely(nr > sysctl_nr_open))
                nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;

        fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
        if (!fdt)
                goto out;
        fdt->max_fds = nr;
        /* one struct file pointer per fd */
        data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
        if (!data)
                goto out_fdt;
        fdt->fd = data;

        /*
         * A single allocation backs all three bitmaps: open_fds and
         * close_on_exec take nr bits each, full_fds_bits takes BITBIT_SIZE(nr)
         * bytes; the whole thing is at least L1_CACHE_BYTES long.
         */
        data = kvmalloc(max_t(size_t,
                                 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
                                 GFP_KERNEL_ACCOUNT);
        if (!data)
                goto out_arr;
        /* carve the allocation up: open_fds, then close_on_exec, then full_fds_bits */
        fdt->open_fds = data;
        data += nr / BITS_PER_BYTE;
        fdt->close_on_exec = data;
        data += nr / BITS_PER_BYTE;
        fdt->full_fds_bits = data;

        return fdt;

        /* unwind in reverse order of allocation */
out_arr:
        kvfree(fdt->fd);
out_fdt:
        kfree(fdt);
out:
        return NULL;
}

/*
 * Expand the file descriptor table.
 * This function will allocate a new fdtable (fd array and bitmaps) large
 * enough to hold fd 'nr', copy the current table into it and publish it.
 * Return <0 error code on error (-ENOMEM if the allocation failed, -EMFILE if
 * the new table turned out too small for 'nr'); 1 on successful completion.
 * The files->file_lock should be held on entry, and will be held on exit;
 * it is dropped while the new table is being allocated.
 */
static int expand_fdtable(struct files_struct *files, unsigned int nr)
        __releases(files->file_lock)
        __acquires(files->file_lock)
{
        struct fdtable *new_fdt, *cur_fdt;

        /* the allocation is done with the spinlock dropped */
        spin_unlock(&files->file_lock);
        new_fdt = alloc_fdtable(nr);

        /* make sure all fd_install() have seen resize_in_progress
         * or have finished their rcu_read_lock_sched() section.
         */
        if (atomic_read(&files->count) > 1)
                synchronize_rcu();

        /* retake the lock before looking at the result: every return path holds it */
        spin_lock(&files->file_lock);
        if (!new_fdt)
                return -ENOMEM;
        /*
         * extremely unlikely race - sysctl_nr_open decreased between the check in
         * caller and alloc_fdtable().  Cheaper to catch it here...
         */
        if (unlikely(new_fdt->max_fds <= nr)) {
                __free_fdtable(new_fdt);
                return -EMFILE;
        }
        cur_fdt = files_fdtable(files);
        BUG_ON(nr < cur_fdt->max_fds);
        copy_fdtable(new_fdt, cur_fdt);
        /* publish the new table */
        rcu_assign_pointer(files->fdt, new_fdt);
        /* the table embedded in files_struct is never freed separately */
        if (cur_fdt != &files->fdtab)
                call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
        /* coupled with smp_rmb() in fd_install() */
        smp_wmb();
        return 1;
}

/*
 * Expand files.
 * This function will expand the file structures, if the requested size exceeds
 * the current capacity and there is room for expansion.
 * Return <0 error code on error (-EMFILE if 'nr' is at or above sysctl_nr_open,
 * or whatever expand_fdtable() failed with); 0 when nothing done; 1 when files
 * were expanded and execution may have blocked.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_files(struct files_struct *files, unsigned int nr)
        __releases(files->file_lock)
        __acquires(files->file_lock)
{
        struct fdtable *fdt;
        int expanded = 0;

repeat:
        /* re-read the table on every pass: it may have been replaced while we slept */
        fdt = files_fdtable(files);

        /* Do we need to expand? */
        if (nr < fdt->max_fds)
                return expanded;

        /* Can we expand? */
        if (nr >= sysctl_nr_open)
                return -EMFILE;

        /*
         * Somebody else is already resizing: drop the lock, wait for them to
         * finish and start over.  'expanded' is set so that the caller is told
         * we may have blocked even if the table then turns out big enough.
         */
        if (unlikely(files->resize_in_progress)) {
                spin_unlock(&files->file_lock);
                expanded = 1;
                wait_event(files->resize_wait, !files->resize_in_progress);
                spin_lock(&files->file_lock);
                goto repeat;
        }

        /* All good, so we try */
        files->resize_in_progress = true;
        expanded = expand_fdtable(files, nr);
        files->resize_in_progress = false;

        /* let anyone waiting in the branch above re-check the table */
        wake_up_all(&files->resize_wait);
        return expanded;
}

/* Mark @fd close-on-exec in @fdt (non-atomic bit operation). */
static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
        unsigned long *cloexec = fdt->close_on_exec;

        __set_bit(fd, cloexec);
}

/* Drop the close-on-exec mark of @fd in @fdt, writing only if it is set. */
static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
        unsigned long *cloexec = fdt->close_on_exec;

        /* already clear: skip the store */
        if (!test_bit(fd, cloexec))
                return;

        __clear_bit(fd, cloexec);
}

/*
 * Mark @fd as in use in @fdt.  If that fills up the open_fds word containing
 * @fd, also flag the word as full in full_fds_bits.
 */
static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
{
        unsigned int word = fd / BITS_PER_LONG;

        __set_bit(fd, fdt->open_fds);
        if (fdt->open_fds[word] == ~0UL)
                __set_bit(word, fdt->full_fds_bits);
}

/*
 * Mark @fd as free in @fdt.  The open_fds word containing it can no longer
 * be full, so its full_fds_bits entry is cleared as well.
 */
static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
        unsigned int word = fd / BITS_PER_LONG;

        __clear_bit(fd, fdt->open_fds);
        __clear_bit(word, fdt->full_fds_bits);
}

/* Report whether @fd is marked as in use in the open_fds bitmap of @fdt. */
static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
{
        const unsigned long *in_use = fdt->open_fds;

        return test_bit(fd, in_use);
}

/*
 * Return the number of fds covered by open_fds up to and including its
 * highest non-empty word.  The result is a multiple of BITS_PER_LONG and at
 * least BITS_PER_LONG, even when no fd is open at all.
 */
static unsigned int count_open_files(struct fdtable *fdt)
{
        unsigned int word = fdt->max_fds / BITS_PER_LONG;

        /* walk down from the top until a word with any bit set is found */
        while (word > 0) {
                word--;
                if (fdt->open_fds[word])
                        break;
        }

        return (word + 1) * BITS_PER_LONG;
}

/*
 * Note that a sane fdtable size always has to be a multiple of
 * BITS_PER_LONG, since we have bitmaps that are sized by this.
 *
 * 'max_fds' will normally already be properly aligned, but it
 * turns out that in the close_range() -> __close_range() ->
 * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
 * up having a 'max_fds' value that isn't already aligned.
 *
 * Rather than make close_range() have to worry about this,
 * just make that BITS_PER_LONG alignment be part of a sane
 * fdtable size. Because that's really what it is.
 */
static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
{
        unsigned int in_use = count_open_files(fdt);
        unsigned int limit = max_fds;

        /* never size a table below the embedded default */
        if (limit < NR_OPEN_DEFAULT)
                limit = NR_OPEN_DEFAULT;

        /* the smaller of "what is in use" and the cap, kept word-aligned */
        return ALIGN(min(in_use, limit), BITS_PER_LONG);
}

/*
 * Allocate a new files structure and copy contents from the
 * passed in files structure.
 *
 * Only the part of the old table given by sane_fdtable_size(old table,
 * max_fds) is copied.  Every file copied gets an extra reference via
 * get_file().
 *
 * Returns the new files_struct, or NULL on failure with *errorp set to
 * -ENOMEM or -EMFILE.
 * errorp will be valid only when the returned files_struct is NULL.
 */
struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
{
        struct files_struct *newf;
        struct file **old_fds, **new_fds;
        unsigned int open_files, i;
        struct fdtable *old_fdt, *new_fdt;

        *errorp = -ENOMEM;
        newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
        if (!newf)
                goto out;

        atomic_set(&newf->count, 1);

        spin_lock_init(&newf->file_lock);
        newf->resize_in_progress = false;
        init_waitqueue_head(&newf->resize_wait);
        newf->next_fd = 0;
        /* start out with the table embedded in the new files_struct */
        new_fdt = &newf->fdtab;
        new_fdt->max_fds = NR_OPEN_DEFAULT;
        new_fdt->close_on_exec = newf->close_on_exec_init;
        new_fdt->open_fds = newf->open_fds_init;
        new_fdt->full_fds_bits = newf->full_fds_bits_init;
        new_fdt->fd = &newf->fd_array[0];

        spin_lock(&oldf->file_lock);
        old_fdt = files_fdtable(oldf);
        open_files = sane_fdtable_size(old_fdt, max_fds);

        /*
         * Check whether we need to allocate a larger fd array and fd set.
         * The old table may grow while the lock is dropped for the
         * allocation, hence the loop.
         */
        while (unlikely(open_files > new_fdt->max_fds)) {
                spin_unlock(&oldf->file_lock);

                /* a table from a previous pass turned out too small: discard it */
                if (new_fdt != &newf->fdtab)
                        __free_fdtable(new_fdt);

                /* alloc_fdtable() takes the highest fd index, not a count */
                new_fdt = alloc_fdtable(open_files - 1);
                if (!new_fdt) {
                        *errorp = -ENOMEM;
                        goto out_release;
                }

                /* beyond sysctl_nr_open; nothing to do */
                if (unlikely(new_fdt->max_fds < open_files)) {
                        __free_fdtable(new_fdt);
                        *errorp = -EMFILE;
                        goto out_release;
                }

                /*
                 * Reacquire the oldf lock and a pointer to its fd table
                 * who knows it may have a new bigger fd table. We need
                 * the latest pointer.
                 */
                spin_lock(&oldf->file_lock);
                old_fdt = files_fdtable(oldf);
                open_files = sane_fdtable_size(old_fdt, max_fds);
        }

        /* oldf->file_lock is held from here until the copy is complete */
        copy_fd_bitmaps(new_fdt, old_fdt, open_files);

        old_fds = old_fdt->fd;
        new_fds = new_fdt->fd;

        /* copy the first open_files slots, taking a reference on each file */
        for (i = open_files; i != 0; i--) {
                struct file *f = *old_fds++;
                if (f) {
                        get_file(f);
                } else {
                        /*
                         * The fd may be claimed in the fd bitmap but not yet
                         * instantiated in the files array if a sibling thread
                         * is partway through open().  So make sure that this
                         * fd is available to the new process.
                         */
                        __clear_open_fd(open_files - i, new_fdt);
                }
                rcu_assign_pointer(*new_fds++, f);
        }
        spin_unlock(&oldf->file_lock);

        /* clear the remainder */
        memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));

        rcu_assign_pointer(newf->fdt, new_fdt);

        return newf;

out_release:
        kmem_cache_free(files_cachep, newf);
out:
        return NULL;
}

/*
 * Close every file still installed in @files and return its fdtable so the
 * caller can free it.  Each slot is emptied with xchg() before the file is
 * passed to filp_close().
 */
static struct fdtable *close_files(struct files_struct * files)
{
        /*
         * It is safe to dereference the fd table without RCU or
         * ->file_lock because this is the last reference to the
         * files structure.
         */
        struct fdtable *fdt = rcu_dereference_raw(files->fdt);
        unsigned int word, fd;

        /* one open_fds word at a time */
        for (word = 0; word * BITS_PER_LONG < fdt->max_fds; word++) {
                unsigned long pending = fdt->open_fds[word];

                /* shift the word down, stopping once no set bits remain */
                for (fd = word * BITS_PER_LONG; pending; fd++, pending >>= 1) {
                        struct file *file;

                        if (!(pending & 1))
                                continue;

                        file = xchg(&fdt->fd[fd], NULL);
                        if (!file)
                                continue;

                        filp_close(file, files);
                        cond_resched();
                }
        }

        return fdt;
}

/*
 * Drop a reference on @files.  When the last one goes away, close all files
 * and free the structure (and its fdtable, unless that is the embedded one).
 */
void put_files_struct(struct files_struct *files)
{
        struct fdtable *fdt;

        if (!atomic_dec_and_test(&files->count))
                return;

        fdt = close_files(files);

        /* free the arrays if they are not embedded */
        if (fdt != &files->fdtab)
                __free_fdtable(fdt);

        kmem_cache_free(files_cachep, files);
}

/*
 * Detach @tsk from its files_struct, if it has one, and drop the task's
 * reference on it.  tsk->files is cleared under task_lock().
 */
void exit_files(struct task_struct *tsk)
{
        struct files_struct *detached = tsk->files;

        if (!detached)
                return;

        task_lock(tsk);
        tsk->files = NULL;
        task_unlock(tsk);

        put_files_struct(detached);
}

/*
 * Statically initialized files_struct.  It starts with one reference and
 * uses its own embedded fdtable, fd array and bitmaps, sized NR_OPEN_DEFAULT.
 * NOTE(review): presumably the files_struct of the initial task - confirm
 * against its users.
 */
struct files_struct init_files = {
        .count                = ATOMIC_INIT(1),
        .fdt                = &init_files.fdtab,
        .fdtab                = {
                .max_fds        = NR_OPEN_DEFAULT,
                .fd                = &init_files.fd_array[0],
                .close_on_exec        = init_files.close_on_exec_init,
                .open_fds        = init_files.open_fds_init,
                .full_fds_bits        = init_files.full_fds_bits_init,
        },
        .file_lock        = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
        .resize_wait        = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};

/*
 * Find the lowest free fd in @fdt that is >= @start.  full_fds_bits is
 * consulted first so that completely full open_fds words are skipped.
 * Returns fdt->max_fds when no free fd is found.
 */
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
        unsigned int limit = fdt->max_fds; /* always multiple of BITS_PER_LONG */
        unsigned int nr_words = limit / BITS_PER_LONG;
        unsigned int first;

        /* first fd of the first word, at or after start's word, that is not full */
        first = find_next_zero_bit(fdt->full_fds_bits, nr_words,
                                   start / BITS_PER_LONG) * BITS_PER_LONG;
        if (first >= limit)
                return limit;

        /* never search below the requested start */
        if (first < start)
                first = start;

        return find_next_zero_bit(fdt->open_fds, limit, first);
}

/*
 * allocate a file descriptor, mark it busy.
 *
 * Picks the lowest free fd in the current task's table that is >= 'start'
 * (and >= files->next_fd) and below 'end', growing the table if needed.
 * close-on-exec is set if 'flags' contains O_CLOEXEC and cleared otherwise.
 *
 * Returns the new fd, -EMFILE if no fd below 'end' is available, or the
 * error returned by expand_files().
 */
static int alloc_fd(unsigned start, unsigned end, unsigned flags)
{
        struct files_struct *files = current->files;
        unsigned int fd;
        int error;
        struct fdtable *fdt;

        spin_lock(&files->file_lock);
repeat:
        fdt = files_fdtable(files);
        fd = start;
        if (fd < files->next_fd)
                fd = files->next_fd;

        /* a starting point beyond the table is "free" as is; expand_files() below makes room */
        if (fd < fdt->max_fds)
                fd = find_next_fd(fdt, fd);

        /*
         * N.B. For clone tasks sharing a files structure, this test
         * will limit the total number of files that can be opened.
         */
        error = -EMFILE;
        if (fd >= end)
                goto out;

        error = expand_files(files, fd);
        if (error < 0)
                goto out;

        /*
         * If we needed to expand the fd array we
         * might have blocked - try again.
         */
        if (error)
                goto repeat;

        /* only advance the search hint if this search began at or below it */
        if (start <= files->next_fd)
                files->next_fd = fd + 1;

        __set_open_fd(fd, fdt);
        if (flags & O_CLOEXEC)
                __set_close_on_exec(fd, fdt);
        else
                __clear_close_on_exec(fd, fdt);
        error = fd;
#if 1
        /* Sanity check */
        if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
                printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
                rcu_assign_pointer(fdt->fd[fd], NULL);
        }
#endif

out:
        spin_unlock(&files->file_lock);
        return error;
}

/*
 * Allocate the lowest available fd below @nofile for the current task.
 * @flags is handed to alloc_fd() unchanged.  Returns the fd or a negative
 * error code.
 */
int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
        const unsigned int lowest_fd = 0;

        return alloc_fd(lowest_fd, nofile, flags);
}

/*
 * Allocate the lowest available fd for the current task, bounded by its
 * RLIMIT_NOFILE.  Returns the fd or a negative error code.
 */
int get_unused_fd_flags(unsigned flags)
{
        unsigned long nofile = rlimit(RLIMIT_NOFILE);

        return __get_unused_fd_flags(flags, nofile);
}
EXPORT_SYMBOL(get_unused_fd_flags);

/*
 * Give @fd back to @files: clear it in the open bitmap and, if it lies below
 * the next_fd search hint, pull the hint down to it.
 */
static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
        __clear_open_fd(fd, files_fdtable(files));

        if (files->next_fd > fd)
                files->next_fd = fd;
}

/* Release @fd in the current task's table, taking file_lock around it. */
void put_unused_fd(unsigned int fd)
{
        struct files_struct *cur_files = current->files;

        spin_lock(&cur_files->file_lock);
        __put_unused_fd(cur_files, fd);
        spin_unlock(&cur_files->file_lock);
}

EXPORT_SYMBOL(put_unused_fd);

/*
 * Install a file pointer in the fd array.
 *
 * The VFS is full of places where we drop the files lock between
 * setting the open_fds bitmap and installing the file in the file
 * array.  At any such point, we are vulnerable to a dup2() race
 * installing a file in the array before us.  We need to detect this and
 * fput() the struct file we are about to overwrite in this case.
 *
 * It should never happen - if we allow dup2() to do it, _really_ bad things
 * will follow.
 *
 * This consumes the "file" refcount, so callers should treat it
 * as if they had called fput(file).
 */

/*
 * Store @file in slot @fd of the current task's fd table.  The slot must be
 * empty (BUG otherwise).  Files with FMODE_BACKING set are refused with a
 * one-time warning and not installed.
 */
void fd_install(unsigned int fd, struct file *file)
{
        struct files_struct *files = current->files;
        struct fdtable *fdt;

        if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING)))
                return;

        rcu_read_lock_sched();

        /*
         * Slow path: a resize is in flight, so leave the RCU-sched section
         * and do the store under file_lock instead.
         */
        if (unlikely(files->resize_in_progress)) {
                rcu_read_unlock_sched();
                spin_lock(&files->file_lock);
                fdt = files_fdtable(files);
                BUG_ON(fdt->fd[fd] != NULL);
                rcu_assign_pointer(fdt->fd[fd], file);
                spin_unlock(&files->file_lock);
                return;
        }
        /* coupled with smp_wmb() in expand_fdtable() */
        smp_rmb();
        /* fast path: no resize seen, store without taking file_lock */
        fdt = rcu_dereference_sched(files->fdt);
        BUG_ON(fdt->fd[fd] != NULL);
        rcu_assign_pointer(fdt->fd[fd], file);
        rcu_read_unlock_sched();
}

EXPORT_SYMBOL(fd_install);

/**
 * file_close_fd_locked - detach and return the file installed at fd
 * @files: file struct to retrieve file from
 * @fd: file descriptor to retrieve file for
 *
 * If a file is installed at @fd, its slot is cleared and @fd is marked
 * unused.  Doesn't take a separate reference count.
 *
 * Context: files_lock must be held.
 *
 * Returns: The file associated with @fd (NULL if @fd is not open)
 */
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd)
{
        struct fdtable *fdt = files_fdtable(files);
        struct file *victim;

        lockdep_assert_held(&files->file_lock);

        /* out of range: nothing can be open there */
        if (fd >= fdt->max_fds)
                return NULL;

        /* clamp the index under speculation before using it */
        fd = array_index_nospec(fd, fdt->max_fds);
        victim = fdt->fd[fd];
        if (!victim)
                return NULL;

        rcu_assign_pointer(fdt->fd[fd], NULL);
        __put_unused_fd(files, fd);
        return victim;
}

/*
 * Close @fd in the current task's fd table.  Returns -EBADF if nothing is
 * open at @fd, otherwise the result of filp_close().
 */
int close_fd(unsigned fd)
{
        struct files_struct *files = current->files;
        struct file *victim;

        /* detach under the lock, close outside of it */
        spin_lock(&files->file_lock);
        victim = file_close_fd_locked(files, fd);
        spin_unlock(&files->file_lock);

        return victim ? filp_close(victim, files) : -EBADF;
}
EXPORT_SYMBOL(close_fd); /* for ksys_close() */

/**
 * last_fd - return last valid index into fd table
 * @fdt: File descriptor table.
 *
 * Context: Either rcu read lock or files_lock must be held.
 *
 * Returns: Last valid index into fdtable, i.e. max_fds - 1.
 */
static inline unsigned last_fd(struct fdtable *fdt)
{
        unsigned int nr_slots = fdt->max_fds;

        return nr_slots - 1;
}

/*
 * Set the close-on-exec bit for every fd in [@fd, @max_fd], with @max_fd
 * clamped to the last index of the current table.  Takes file_lock.
 */
static inline void __range_cloexec(struct files_struct *cur_fds,
                                   unsigned int fd, unsigned int max_fd)
{
        struct fdtable *fdt;
        unsigned int last;

        spin_lock(&cur_fds->file_lock);
        fdt = files_fdtable(cur_fds);

        /* make sure we're using the correct maximum value */
        last = min(last_fd(fdt), max_fd);
        if (fd <= last)
                bitmap_set(fdt->close_on_exec, fd, last - fd + 1);

        spin_unlock(&cur_fds->file_lock);
}

/*
 * Close every open fd in [@fd, @max_fd], with @max_fd clamped to the last
 * index of the table.  file_lock is held while walking the range and dropped
 * around each filp_close() and whenever a reschedule is pending.
 */
static inline void __range_close(struct files_struct *files, unsigned int fd,
                                 unsigned int max_fd)
{
        struct file *file;
        unsigned last;

        spin_lock(&files->file_lock);
        last = last_fd(files_fdtable(files));
        max_fd = min(max_fd, last);

        for (; fd <= max_fd; fd++) {
                file = file_close_fd_locked(files, fd);

                /* nothing to close and no reschedule pending: keep the lock */
                if (!file && !need_resched())
                        continue;

                spin_unlock(&files->file_lock);
                if (file)
                        filp_close(file, files);
                cond_resched();
                spin_lock(&files->file_lock);
        }
        spin_unlock(&files->file_lock);
}

/**
 * __close_range() - Close all file descriptors in a given range.
 *
 * @fd:     starting file descriptor to close
 * @max_fd: last file descriptor to close
 * @flags:  CLOSE_RANGE flags.
 *
 * This closes a range of file descriptors. All file descriptors
 * from @fd up to and including @max_fd are closed.
 *
 * With CLOSE_RANGE_CLOEXEC the descriptors are marked close-on-exec instead
 * of being closed.  With CLOSE_RANGE_UNSHARE the operation is carried out on
 * the table obtained from unshare_fd(), which is then installed as the
 * task's fd table.
 *
 * Returns: 0 on success, -EINVAL for unknown flags or @fd > @max_fd, or the
 * error returned by unshare_fd().
 */
int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
{
        struct task_struct *me = current;
        struct files_struct *cur_fds = me->files, *fds = NULL;

        if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
                return -EINVAL;

        if (fd > max_fd)
                return -EINVAL;

        if (flags & CLOSE_RANGE_UNSHARE) {
                int ret;
                unsigned int max_unshare_fds = NR_OPEN_MAX;

                /*
                 * If the caller requested all fds to be made cloexec we always
                 * copy all of the file descriptors since they still want to
                 * use them.
                 */
                if (!(flags & CLOSE_RANGE_CLOEXEC)) {
                        /*
                         * If the requested range is greater than the current
                         * maximum, we're closing everything so only copy all
                         * file descriptors beneath the lowest file descriptor.
                         */
                        rcu_read_lock();
                        if (max_fd >= last_fd(files_fdtable(cur_fds)))
                                max_unshare_fds = fd;
                        rcu_read_unlock();
                }

                ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
                if (ret)
                        return ret;

                /*
                 * We used to share our file descriptor table, and have now
                 * created a private one, make sure we're using it below.
                 */
                /* after the swap cur_fds is the new table and fds the old one */
                if (fds)
                        swap(cur_fds, fds);
        }

        if (flags & CLOSE_RANGE_CLOEXEC)
                __range_cloexec(cur_fds, fd, max_fd);
        else
                __range_close(cur_fds, fd, max_fd);

        /* fds is only non-NULL if unshare_fd() handed back a new table above */
        if (fds) {
                /*
                 * We're done closing the files we were supposed to. Time to install
                 * the new file descriptor table and drop the old one.
                 */
                task_lock(me);
                me->files = cur_fds;
                task_unlock(me);
                put_files_struct(fds);
        }

        return 0;
}

/**
 * file_close_fd - detach and return the file installed at fd
 * @fd: file descriptor to retrieve file for
 *
 * Operates on the current task's fd table and takes file_lock around the
 * lookup.  Doesn't take a separate reference count.
 *
 * Returns: The file associated with @fd (NULL if @fd is not open)
 */
struct file *file_close_fd(unsigned int fd)
{
        struct files_struct *cur_files = current->files;
        struct file *detached;

        spin_lock(&cur_files->file_lock);
        detached = file_close_fd_locked(cur_files, fd);
        spin_unlock(&cur_files->file_lock);

        return detached;
}

/*
 * Close every descriptor of @files that is marked close-on-exec.
 *
 * The close_on_exec bitmap is walked one word at a time. Each non-zero word
 * is cleared, and for every set bit that has a file installed the fd slot is
 * emptied, __put_unused_fd() is called for the descriptor and the file is
 * closed with filp_close().
 */
void do_close_on_exec(struct files_struct *files)
{
        unsigned i;
        struct fdtable *fdt;

        /* exec unshares first */
        spin_lock(&files->file_lock);
        for (i = 0; ; i++) {
                unsigned long set;
                unsigned fd = i * BITS_PER_LONG;
                /* Re-read the table for every word: the lock is dropped below. */
                fdt = files_fdtable(files);
                if (fd >= fdt->max_fds)
                        break;
                set = fdt->close_on_exec[i];
                if (!set)
                        continue;
                fdt->close_on_exec[i] = 0;
                /* Shift through the word; fd tracks the bit under inspection. */
                for ( ; set ; fd++, set >>= 1) {
                        struct file *file;
                        if (!(set & 1))
                                continue;
                        file = fdt->fd[fd];
                        if (!file)
                                continue;
                        rcu_assign_pointer(fdt->fd[fd], NULL);
                        __put_unused_fd(files, fd);
                        /* filp_close() and cond_resched() run without file_lock. */
                        spin_unlock(&files->file_lock);
                        filp_close(file, files);
                        cond_resched();
                        spin_lock(&files->file_lock);
                }

        }
        spin_unlock(&files->file_lock);
}

/*
 * Single attempt at taking a reference on the file currently stored in *@f.
 *
 * Returns NULL if *@f is NULL, the file with f_count incremented if the
 * pointer still matched after the increment, and ERR_PTR(-EAGAIN) if either
 * the refcount had already dropped to zero or *@f changed in the meantime
 * (in which case the reference just taken is dropped again).
 */
static struct file *__get_file_rcu(struct file __rcu **f)
{
        struct file __rcu *file;
        struct file __rcu *file_reloaded;
        struct file __rcu *file_reloaded_cmp;

        file = rcu_dereference_raw(*f);
        if (!file)
                return NULL;

        if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
                return ERR_PTR(-EAGAIN);

        file_reloaded = rcu_dereference_raw(*f);

        /*
         * Ensure that all accesses have a dependency on the load from
         * rcu_dereference_raw() above so we get correct ordering
         * between reuse/allocation and the pointer check below.
         */
        file_reloaded_cmp = file_reloaded;
        OPTIMIZER_HIDE_VAR(file_reloaded_cmp);

        /*
         * atomic_long_inc_not_zero() above provided a full memory
         * barrier when we acquired a reference.
         *
         * This is paired with the write barrier from assigning to the
         * __rcu protected file pointer so that if that pointer still
         * matches the current file, we know we have successfully
         * acquired a reference to the right file.
         *
         * If the pointers don't match the file has been reallocated by
         * SLAB_TYPESAFE_BY_RCU.
         */
        if (file == file_reloaded_cmp)
                return file_reloaded;

        /* Wrong file: give back the reference and let the caller retry. */
        fput(file);
        return ERR_PTR(-EAGAIN);
}

/**
 * get_file_rcu - try to get a reference to a file under rcu
 * @f: location of the file pointer to get a reference on
 *
 * Repeats __get_file_rcu() until it no longer reports an error, i.e. until
 * a reference was taken on the file currently stored in @f or @f was seen
 * to be NULL, carefully verifying that the file hasn't been reused.
 *
 * This function should rarely have to be used and only by users who
 * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
 *
 * Return: The file stored in @f with the reference count increased or NULL.
 */
struct file *get_file_rcu(struct file __rcu **f)
{
        struct file __rcu *file;

        do {
                file = __get_file_rcu(f);
        } while (IS_ERR(file));

        return file;
}
EXPORT_SYMBOL_GPL(get_file_rcu);

/**
 * get_file_active - try go get a reference to a file
 * @f: the file to get a reference on
 *
 * In contast to get_file_rcu() the pointer itself isn't part of the
 * reference counting.
 *
 * This function should rarely have to be used and only by users who
 * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
 *
 * Return: Returns @f with the reference count increased or NULL.
 */
struct file *get_file_active(struct file **f)
{
        struct file __rcu *file;

        rcu_read_lock();
        file = __get_file_rcu(f);
        rcu_read_unlock();
        if (IS_ERR(file))
                file = NULL;
        return file;
}
EXPORT_SYMBOL_GPL(get_file_active);

/*
 * Look up @fd in @files and try to take a reference on the file found there.
 *
 * Returns NULL when the slot is empty or @fd is out of range, or when
 * file->f_mode has any bit of @mask set; otherwise returns the file with
 * f_count incremented. Does not take rcu_read_lock() itself - see
 * __fget_files(), which wraps the call in an RCU read-side section.
 */
static inline struct file *__fget_files_rcu(struct files_struct *files,
       unsigned int fd, fmode_t mask)
{
        for (;;) {
                struct file *file;
                struct fdtable *fdt = rcu_dereference_raw(files->fdt);
                struct file __rcu **fdentry;
                unsigned long nospec_mask;

                /* Mask is a 0 for invalid fd's, ~0 for valid ones */
                nospec_mask = array_index_mask_nospec(fd, fdt->max_fds);

                /*
                 * fdentry points to the 'fd' offset, or fdt->fd[0].
                 * Loading from fdt->fd[0] is always safe, because the
                 * array always exists.
                 */
                fdentry = fdt->fd + (fd & nospec_mask);

                /* Do the load, then mask any invalid result */
                file = rcu_dereference_raw(*fdentry);
                file = (void *)(nospec_mask & (unsigned long)file);
                if (unlikely(!file))
                        return NULL;

                /*
                 * Ok, we have a file pointer that was valid at
                 * some point, but it might have become stale since.
                 *
                 * We need to confirm it by incrementing the refcount
                 * and then check the lookup again.
                 *
                 * atomic_long_inc_not_zero() gives us a full memory
                 * barrier. We only really need an 'acquire' one to
                 * protect the loads below, but we don't have that.
                 */
                if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
                        continue;

                /*
                 * Such a race can take two forms:
                 *
                 *  (a) the file ref already went down to zero and the
                 *      file hasn't been reused yet or the file count
                 *      isn't zero but the file has already been reused.
                 *
                 *  (b) the file table entry has changed under us.
                 *       Note that we don't need to re-check the 'fdt->fd'
                 *       pointer having changed, because it always goes
                 *       hand-in-hand with 'fdt'.
                 *
                 * If so, we need to put our ref and try again.
                 */
                if (unlikely(file != rcu_dereference_raw(*fdentry)) ||
                    unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
                        fput(file);
                        continue;
                }

                /*
                 * This isn't the file we're looking for or we're not
                 * allowed to get a reference to it.
                 */
                if (unlikely(file->f_mode & mask)) {
                        fput(file);
                        return NULL;
                }

                /*
                 * Ok, we have a ref to the file, and checked that it
                 * still exists.
                 */
                return file;
        }
}

/*
 * Reference-taking lookup of @fd in @files: runs __fget_files_rcu() inside
 * its own RCU read-side section. Returns the file, or NULL if the lookup
 * failed or the file's f_mode intersects @mask.
 */
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
                                 fmode_t mask)
{
        struct file *file;

        rcu_read_lock();
        file = __fget_files_rcu(files, fd, mask);
        rcu_read_unlock();

        return file;
}

/* __fget_files() applied to the calling task's table (current->files). */
static inline struct file *__fget(unsigned int fd, fmode_t mask)
{
        return __fget_files(current->files, fd, mask);
}

/*
 * Get a reference to the file behind @fd in the current task's table.
 * Files whose f_mode has FMODE_PATH set are rejected (NULL is returned).
 */
struct file *fget(unsigned int fd)
{
        return __fget(fd, FMODE_PATH);
}
EXPORT_SYMBOL(fget);

/*
 * Like fget(), but with an empty mode mask: no file is rejected on the
 * basis of its f_mode.
 */
struct file *fget_raw(unsigned int fd)
{
        return __fget(fd, 0);
}
EXPORT_SYMBOL(fget_raw);

/*
 * Get a reference to the file behind @fd in @task's descriptor table.
 * task->files is sampled under task_lock(); NULL is returned when the task
 * has no table or the lookup fails. No f_mode filtering is applied.
 */
struct file *fget_task(struct task_struct *task, unsigned int fd)
{
        struct files_struct *files;
        struct file *file = NULL;

        task_lock(task);
        files = task->files;
        if (files)
                file = __fget_files(files, fd, 0);
        task_unlock(task);

        return file;
}

/*
 * Reference-taking lookup of @fd in current->files with an empty mode mask.
 * Calls __fget_files_rcu() directly without entering an RCU read-side
 * section here - presumably the caller must hold rcu_read_lock(), as the
 * task_lookup_*_rcu() siblings state; confirm against callers.
 */
struct file *lookup_fdget_rcu(unsigned int fd)
{
        return __fget_files_rcu(current->files, fd, 0);

}
EXPORT_SYMBOL_GPL(lookup_fdget_rcu);

/*
 * Reference-taking lookup of @fd in @task's descriptor table.
 * task->files is sampled under task_lock(); returns NULL if the task has no
 * table or the lookup fails. No f_mode filtering is applied.
 */
struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
{
        /* Must be called with rcu_read_lock held */
        struct files_struct *files;
        struct file *file = NULL;

        task_lock(task);
        files = task->files;
        if (files)
                file = __fget_files_rcu(files, fd, 0);
        task_unlock(task);

        return file;
}

/*
 * Starting at *@ret_fd, find the first descriptor in @task's table for which
 * a file reference can be taken. On return *@ret_fd holds the descriptor the
 * scan stopped at: the one belonging to the returned file, or the position
 * where the scan ended when NULL is returned.
 */
struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
{
        /* Must be called with rcu_read_lock held */
        struct file *found = NULL;
        unsigned int cur = *ret_fd;
        struct files_struct *files;

        task_lock(task);
        files = task->files;
        if (files) {
                /* max_fds is re-read through files_fdtable() on every pass. */
                while (cur < files_fdtable(files)->max_fds) {
                        found = __fget_files_rcu(files, cur, 0);
                        if (found)
                                break;
                        cur++;
                }
        }
        task_unlock(task);

        *ret_fd = cur;
        return found;
}
EXPORT_SYMBOL(task_lookup_next_fdget_rcu);

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
 * You can use this instead of fget if you satisfy all of the following
 * conditions:
 * 1) You must call fput_light before exiting the syscall and returning control
 *    to userspace (i.e. you cannot remember the returned struct file * after
 *    returning to userspace).
 * 2) You must not call filp_close on the returned struct file * in between
 *    calls to fget_light and fput_light.
 * 3) You must not clone the current task in between the calls to fget_light
 *    and fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
 */
/*
 * Returns the file pointer for @fd encoded as an unsigned long, with
 * FDPUT_FPUT or'ed in when a reference was taken, or 0 when there is no
 * usable file (slot empty, or f_mode intersects @mask).
 */
static unsigned long __fget_light(unsigned int fd, fmode_t mask)
{
        struct files_struct *files = current->files;
        struct file *file;

        /*
         * If another thread is concurrently calling close_fd() followed
         * by put_files_struct(), we must not observe the old table
         * entry combined with the new refcount - otherwise we could
         * return a file that is concurrently being freed.
         *
         * atomic_read_acquire() pairs with atomic_dec_and_test() in
         * put_files_struct().
         */
        if (unlikely(atomic_read_acquire(&files->count) != 1)) {
                /* Table is shared: take a real reference. */
                file = __fget_files(files, fd, mask);
                return file ? (FDPUT_FPUT | (unsigned long)file) : 0;
        }

        /* Sole user of the table: borrow the pointer without a reference. */
        file = files_lookup_fd_raw(files, fd);
        if (!file || unlikely(file->f_mode & mask))
                return 0;

        return (unsigned long)file;
}
/* __fget_light() for @fd that rejects files with FMODE_PATH set in f_mode. */
unsigned long __fdget(unsigned int fd)
{
        return __fget_light(fd, FMODE_PATH);
}
EXPORT_SYMBOL(__fdget);

/* __fget_light() for @fd with an empty mode mask (no f_mode filtering). */
unsigned long __fdget_raw(unsigned int fd)
{
        return __fget_light(fd, 0);
}

/*
 * Try to avoid f_pos locking. We only need it if the
 * file is marked for FMODE_ATOMIC_POS, and it can be
 * accessed multiple ways.
 *
 * Always do it for directories, because pidfd_getfd()
 * can make a file accessible even if it otherwise would
 * not be, and for directories this is a correctness
 * issue, not a "POSIX requirement".
 */
static inline bool file_needs_f_pos_lock(struct file *file)
{
        return (file->f_mode & FMODE_ATOMIC_POS) &&
                (file_count(file) > 1 || file->f_op->iterate_shared);
}

/*
 * __fdget() variant that additionally takes file->f_pos_lock when
 * file_needs_f_pos_lock() says so; in that case FDPUT_POS_UNLOCK is or'ed
 * into the returned word so the lock can be released later.
 */
unsigned long __fdget_pos(unsigned int fd)
{
        unsigned long v = __fdget(fd);
        /* Strip the two low bits to recover the pointer - presumably these are
         * the FDPUT_FPUT / FDPUT_POS_UNLOCK flag bits; confirm their values.
         */
        struct file *file = (struct file *)(v & ~3);

        if (file && file_needs_f_pos_lock(file)) {
                v |= FDPUT_POS_UNLOCK;
                mutex_lock(&file->f_pos_lock);
        }
        return v;
}

/* Release @f's f_pos_lock mutex (the lock taken in __fdget_pos()). */
void __f_unlock_pos(struct file *f)
{
        mutex_unlock(&f->f_pos_lock);
}

/*
 * We only lock f_pos if we have threads or if the file might be
 * shared with another process. In both cases we'll have an elevated
 * file count (done either by fdget() or by fork()).
 */

/*
 * Set (@flag non-zero) or clear (@flag zero) the close-on-exec bit of @fd
 * in the calling task's descriptor table, under files->file_lock.
 */
void set_close_on_exec(unsigned int fd, int flag)
{
        struct files_struct *files = current->files;
        struct fdtable *fdt;

        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        if (!flag)
                __clear_close_on_exec(fd, fdt);
        else
                __set_close_on_exec(fd, fdt);
        spin_unlock(&files->file_lock);
}

/*
 * Report the result of close_on_exec() for @fd in the calling task's
 * descriptor table, evaluated inside an RCU read-side section.
 */
bool get_close_on_exec(unsigned int fd)
{
        bool cloexec;

        rcu_read_lock();
        cloexec = close_on_exec(fd, current->files);
        rcu_read_unlock();

        return cloexec;
}

/*
 * Install @file at descriptor @fd of @files, closing whatever file was
 * installed there before. Entered with files->file_lock held; the lock is
 * released on every path. Returns @fd on success or -EBUSY.
 */
static int do_dup2(struct files_struct *files,
        struct file *file, unsigned fd, unsigned flags)
__releases(&files->file_lock)
{
        struct fdtable *fdt = files_fdtable(files);
        struct file *old = fdt->fd[fd];

        /*
         * A descriptor that is marked open but has no file installed yet is
         * allocated-but-unfinished; dup2() over it must be detected.
         *
         * Other systems: OpenBSD avoids this at the price of extra work in
         * their fget() equivalent - the struct file is inserted right after
         * the descriptor is grabbed, marked larval while more work (e.g. the
         * actual open) is pending, and fget() treats larval files as absent.
         * Potentially interesting, but while the extra fget() work is
         * trivial, the locking implications and the surgery needed on the
         * open()-related VFS paths are not. FreeBSD fails with -EBADF in the
         * same situation; the NetBSD "solution" deadlocks in rather amusing
         * ways, AFAICS. All of this is outside POSIX/SUS, since neither
         * considers shared descriptor tables and the condition cannot arise
         * without them.
         */
        if (!old && fd_is_open(fd, fdt)) {
                spin_unlock(&files->file_lock);
                return -EBUSY;
        }

        get_file(file);
        rcu_assign_pointer(fdt->fd[fd], file);
        __set_open_fd(fd, fdt);
        if (flags & O_CLOEXEC)
                __set_close_on_exec(fd, fdt);
        else
                __clear_close_on_exec(fd, fdt);
        spin_unlock(&files->file_lock);

        /* The displaced file is closed only after the lock is dropped. */
        if (old)
                filp_close(old, files);

        return fd;
}

/*
 * Make descriptor @fd of the calling task refer to @file, or close @fd when
 * @file is NULL. Returns close_fd()'s result in the NULL case, -EBADF when
 * @fd is at or above RLIMIT_NOFILE, a negative error from expand_files(),
 * and otherwise whatever do_dup2() returns.
 */
int replace_fd(unsigned fd, struct file *file, unsigned flags)
{
        struct files_struct *files = current->files;
        int err;

        if (!file)
                return close_fd(fd);

        if (fd >= rlimit(RLIMIT_NOFILE))
                return -EBADF;

        spin_lock(&files->file_lock);
        err = expand_files(files, fd);
        if (unlikely(err < 0)) {
                spin_unlock(&files->file_lock);
                return err;
        }

        /* do_dup2() drops files->file_lock for us. */
        return do_dup2(files, file, fd, flags);
}

/**
 * receive_fd() - Install received file into file descriptor table
 * @file: struct file that was received from another process
 * @ufd: __user pointer to write new fd number to
 * @o_flags: the O_* flags to apply to the new fd entry
 *
 * Runs the security_file_receive() check, allocates an unused descriptor,
 * optionally writes its number to userspace (when @ufd is non-NULL) and
 * only then installs the file, so a failed put_user() just gives the
 * unused descriptor back.
 *
 * This helper handles its own reference counting of the incoming
 * struct file.
 *
 * Returns newly installed fd or -ve on error.
 */
int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
{
        int fd;
        int err = security_file_receive(file);

        if (err)
                return err;

        fd = get_unused_fd_flags(o_flags);
        if (fd < 0)
                return fd;

        if (ufd) {
                err = put_user(fd, ufd);
                if (err) {
                        put_unused_fd(fd);
                        return err;
                }
        }

        fd_install(fd, get_file(file));
        __receive_sock(file);
        return fd;
}
EXPORT_SYMBOL_GPL(receive_fd);

/**
 * receive_fd_replace() - Install received file at a given descriptor
 * @new_fd: descriptor number the file should be installed at
 * @file: struct file that was received from another process
 * @o_flags: the O_* flags to apply to the fd entry
 *
 * Runs the security_file_receive() check, installs @file at @new_fd via
 * replace_fd() and then calls __receive_sock() on the file.
 *
 * Returns @new_fd on success or a negative error code.
 */
int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
{
        int error;

        error = security_file_receive(file);
        if (error)
                return error;

        /*
         * replace_fd() passes through do_dup2()'s result, which is the
         * descriptor number on success. Only negative values are errors;
         * testing for non-zero would bail out for every fd other than 0
         * and skip __receive_sock().
         */
        error = replace_fd(new_fd, file, o_flags);
        if (error < 0)
                return error;

        __receive_sock(file);
        return new_fd;
}

/*
 * Common implementation of dup2()/dup3(): make @newfd refer to the file
 * behind @oldfd.
 *
 * Returns -EINVAL for flags other than O_CLOEXEC or when both descriptors
 * are equal, -EBADF when @newfd is at or above RLIMIT_NOFILE, when @oldfd
 * has no file, or when expand_files() reports -EMFILE; any other
 * expand_files() error is passed through. Otherwise the result of
 * do_dup2() is returned.
 */
static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
{
        struct files_struct *files = current->files;
        struct file *file;
        int err;

        if ((flags & ~O_CLOEXEC) != 0)
                return -EINVAL;

        if (unlikely(oldfd == newfd))
                return -EINVAL;

        if (newfd >= rlimit(RLIMIT_NOFILE))
                return -EBADF;

        spin_lock(&files->file_lock);
        /* Expand first, look up afterwards: the lookup uses the current table. */
        err = expand_files(files, newfd);
        file = files_lookup_fd_locked(files, oldfd);

        if (unlikely(!file)) {
                err = -EBADF;
        } else if (likely(err >= 0)) {
                /* do_dup2() drops files->file_lock. */
                return do_dup2(files, file, newfd, flags);
        } else if (err == -EMFILE) {
                err = -EBADF;
        }

        spin_unlock(&files->file_lock);
        return err;
}

/* dup3() system call entry point: forwards all arguments to ksys_dup3(). */
SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
        return ksys_dup3(oldfd, newfd, flags);
}

/*
 * dup2() system call. Distinct descriptors are handled by ksys_dup3() with
 * no flags. When both descriptors are equal, nothing is duplicated: the
 * call only checks that @oldfd refers to an open file and returns it, or
 * -EBADF otherwise.
 */
SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
        struct file *f;
        int retval;

        if (likely(newfd != oldfd))
                return ksys_dup3(oldfd, newfd, 0);

        /* corner case: probe the descriptor by briefly taking a reference */
        rcu_read_lock();
        f = __fget_files_rcu(current->files, oldfd, 0);
        rcu_read_unlock();
        if (!f)
                return -EBADF;

        retval = oldfd;
        fput(f);
        return retval;
}

/*
 * dup() system call: install the file behind @fildes at a newly allocated
 * descriptor. Returns the new descriptor, -EBADF if @fildes has no file, or
 * the error from get_unused_fd_flags().
 */
SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
        struct file *file = fget_raw(fildes);
        int newfd;

        if (!file)
                return -EBADF;

        newfd = get_unused_fd_flags(0);
        if (newfd < 0) {
                /* No descriptor available: drop the reference from fget_raw(). */
                fput(file);
                return newfd;
        }

        /* The reference taken by fget_raw() is handed to the new descriptor. */
        fd_install(newfd, file);
        return newfd;
}

/*
 * Install @file at a descriptor allocated by alloc_fd(@from, RLIMIT_NOFILE,
 * @flags), taking an extra reference on @file for it. Returns the new
 * descriptor, -EINVAL when @from is at or above RLIMIT_NOFILE, or the
 * negative result of alloc_fd().
 */
int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
        unsigned long nofile = rlimit(RLIMIT_NOFILE);
        int newfd;

        if (from >= nofile)
                return -EINVAL;

        newfd = alloc_fd(from, nofile, flags);
        if (newfd < 0)
                return newfd;

        get_file(file);
        fd_install(newfd, file);
        return newfd;
}

/*
 * Call @f(@p, file, fd) for every installed file of @files, starting at
 * descriptor @n, while holding files->file_lock. Iteration stops at the
 * first non-zero return from @f, and that value is returned; otherwise 0
 * is returned (also when @files is NULL).
 */
int iterate_fd(struct files_struct *files, unsigned n,
                int (*f)(const void *, struct file *, unsigned),
                const void *p)
{
        struct fdtable *fdt;
        int res = 0;

        if (!files)
                return 0;

        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        while (n < fdt->max_fds) {
                struct file *file;

                file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
                if (file) {
                        res = f(p, file, n);
                        if (res)
                                break;
                }
                n++;
        }
        spin_unlock(&files->file_lock);

        return res;
}
EXPORT_SYMBOL(iterate_fd);















































































    2 




    2 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PAGE_H
#define _ASM_X86_PAGE_H

#include <linux/types.h>

#ifdef __KERNEL__

#include <asm/page_types.h>

#ifdef CONFIG_X86_64
#include <asm/page_64.h>
#else
#include <asm/page_32.h>
#endif        /* CONFIG_X86_64 */

#ifndef __ASSEMBLY__

struct page;

#include <linux/range.h>
extern struct range pfn_mapped[];
extern int nr_pfn_mapped;

/* Clear a user page: simply clear_page(@page); @vaddr and @pg are unused. */
static inline void clear_user_page(void *page, unsigned long vaddr,
                                   struct page *pg)
{
        clear_page(page);
}

/* Copy a user page: simply copy_page(@to, @from); @vaddr and @topage are unused. */
static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
                                  struct page *topage)
{
        copy_page(to, from);
}

#define vma_alloc_zeroed_movable_folio(vma, vaddr) \
        vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)

#ifndef __pa
#define __pa(x)                __phys_addr((unsigned long)(x))
#endif

#define __pa_nodebug(x)        __phys_addr_nodebug((unsigned long)(x))
/* __pa_symbol should be used for C visible symbols.
   This seems to be the official gcc blessed way to do such arithmetic. */
/*
 * We need __phys_reloc_hide() here because gcc may assume that there is no
 * overflow during __pa() calculation and can optimize it unexpectedly.
 * Newer versions of gcc provide -fno-strict-overflow switch to handle this
 * case properly. Once all supported versions of gcc understand it, we can
 * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated)
 */
#define __pa_symbol(x) \
        __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x)))

#ifndef __va
#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
#endif

#define __boot_va(x)                __va(x)
#define __boot_pa(x)                __pa(x)

/*
 * virt_to_page(kaddr) returns a valid pointer if and only if
 * virt_addr_valid(kaddr) returns true.
 */
#define virt_to_page(kaddr)        pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
extern bool __virt_addr_valid(unsigned long kaddr);
#define virt_addr_valid(kaddr)        __virt_addr_valid((unsigned long) (kaddr))

/*
 * Convert a page frame number to a kernel virtual address: the pfn is
 * shifted up by PAGE_SHIFT and passed through __va().
 */
static __always_inline void *pfn_to_kaddr(unsigned long pfn)
{
        return __va(pfn << PAGE_SHIFT);
}

/*
 * Sign-extend @vaddr from its low @vaddr_bits bits: the value is shifted
 * left by (64 - @vaddr_bits) as an s64 and shifted back right by the same
 * amount, so the upper bits become copies of bit (@vaddr_bits - 1).
 * NOTE(review): @vaddr_bits == 0 would shift by 64 - presumably callers
 * only pass the CPU's virtual address width; confirm.
 */
static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits)
{
        return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits);
}

/*
 * Returns non-zero (1) when @vaddr is unchanged by __canonical_address()
 * for the given @vaddr_bits, 0 otherwise. The result is a truth value even
 * though the return type is u64.
 */
static __always_inline u64 __is_canonical_address(u64 vaddr, u8 vaddr_bits)
{
        return __canonical_address(vaddr, vaddr_bits) == vaddr;
}

#endif        /* __ASSEMBLY__ */

#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>

#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA

#endif        /* __KERNEL__ */
#endif /* _ASM_X86_PAGE_H */





























































































































































    7 




























































    7 










































    6 





















    2 

    3 



    2 





    2 

    3 












    2 






















































    2 




    2 
















































































    3 
    3 











































    2 






    3 












    1 


























































































    2 





















    2 

















    2 





    2 


    3 










    1 

    1 


    3 


    3 






    2 





















































































































































































































































    1 



    1 







































    5 
    6 



    6 









    5 


    7 















    6 
    7 






















    1 




    1 











    1 





















    1 
    1 

    1 





























































































































































































    1 
    1 
















    1 










    1 

    1 






























































































































































































































































































































































































































    3 




    2 

    2 





















    3 
















    3 





    2 






    3 















































    2 





    1 























































    1 






















    1 



















    1 














    2 














    2 




    2 




    2 










    2 



    1 


    1 













    1 


    1 

























































































































































































































































































































































































































































































    2 



    2 






















    3 



    2 











    3 













































    1 
    1 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
// SPDX-License-Identifier: GPL-2.0-only
/*
 * (C) 1997 Linus Torvalds
 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
 */
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/hash.h>
#include <linux/swap.h>
#include <linux/security.h>
#include <linux/cdev.h>
#include <linux/memblock.h>
#include <linux/fsnotify.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <linux/iversion.h>
#include <linux/rw_hint.h>
#include <trace/events/writeback.h>
#include "internal.h"

/*
 * Inode locking rules:
 *
 * inode->i_lock protects:
 *   inode->i_state, inode->i_hash, __iget(), inode->i_io_list
 * Inode LRU list locks protect:
 *   inode->i_sb->s_inode_lru, inode->i_lru
 * inode->i_sb->s_inode_list_lock protects:
 *   inode->i_sb->s_inodes, inode->i_sb_list
 * bdi->wb.list_lock protects:
 *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
 * inode_hash_lock protects:
 *   inode_hashtable, inode->i_hash
 *
 * Lock ordering:
 *
 * inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *     Inode LRU list locks
 *
 * bdi->wb.list_lock
 *   inode->i_lock
 *
 * inode_hash_lock
 *   inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *
 * iunique_lock
 *   inode_hash_lock
 */

/*
 * Global inode hash table state. hash() reduces a (superblock, hashval)
 * pair to a bucket index with i_hash_shift and i_hash_mask;
 * __insert_inode_hash() and __remove_inode_hash() modify the chains under
 * inode_hash_lock.
 */
static unsigned int i_hash_mask __ro_after_init;
static unsigned int i_hash_shift __ro_after_init;
static struct hlist_head *inode_hashtable __ro_after_init;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);

/*
 * Empty aops. Can be used for the cases where the user does not
 * define any of the address_space operations. inode_init_always()
 * installs it as the initial a_ops of every inode's embedded mapping.
 */
const struct address_space_operations empty_aops = {
};
EXPORT_SYMBOL(empty_aops);

/*
 * Per-CPU inode accounting. nr_inodes is incremented in inode_init_always()
 * and decremented in __destroy_inode(); nr_unused is adjusted as inodes are
 * added to and removed from the superblock inode LRU.
 */
static DEFINE_PER_CPU(unsigned long, nr_inodes);
static DEFINE_PER_CPU(unsigned long, nr_unused);

/* Slab cache used by alloc_inode() when the superblock has no ->alloc_inode(). */
static struct kmem_cache *inode_cachep __ro_after_init;

static long get_nr_inodes(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_inodes, i);
        return sum < 0 ? 0 : sum;
}

static inline long get_nr_inodes_unused(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_unused, i);
        return sum < 0 ? 0 : sum;
}

/*
 * Estimate the number of dirty inodes as "all inodes minus unused inodes",
 * never returning less than 0.
 */
long get_nr_dirty_inodes(void)
{
        /* not actually dirty inodes, but a wild approximation */
        long in_use = get_nr_inodes() - get_nr_inodes_unused();

        if (in_use <= 0)
                return 0;
        return in_use;
}

/*
 * Handle nr_inode sysctl
 */
#ifdef CONFIG_SYSCTL
/*
 * Statistics gathering: backing store for the "inode-nr" and "inode-state"
 * sysctl entries below. Its nr_inodes and nr_unused fields are refreshed by
 * proc_nr_inodes(), the handler of both entries.
 */
static struct inodes_stat_t inodes_stat;

/*
 * sysctl handler for the inode statistics entries: refresh the nr_inodes and
 * nr_unused fields of inodes_stat from the per-CPU counters, then let
 * proc_doulongvec_minmax() do the actual transfer. Returns whatever
 * proc_doulongvec_minmax() returns.
 */
static int proc_nr_inodes(struct ctl_table *table, int write, void *buffer,
                          size_t *lenp, loff_t *ppos)
{
        inodes_stat.nr_inodes = get_nr_inodes();
        inodes_stat.nr_unused = get_nr_inodes_unused();
        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

/*
 * Read-only (0444) sysctl entries registered under "fs" by
 * init_fs_inode_sysctls(). Both are views of inodes_stat that differ only
 * in length.
 */
static struct ctl_table inodes_sysctls[] = {
        {
                /* the first two longs of inodes_stat */
                .procname        = "inode-nr",
                .data                = &inodes_stat,
                .maxlen                = 2*sizeof(long),
                .mode                = 0444,
                .proc_handler        = proc_nr_inodes,
        },
        {
                /* the first seven longs of inodes_stat */
                .procname        = "inode-state",
                .data                = &inodes_stat,
                .maxlen                = 7*sizeof(long),
                .mode                = 0444,
                .proc_handler        = proc_nr_inodes,
        },
};

/*
 * Register inodes_sysctls under the "fs" sysctl directory. Run as an early
 * initcall; always returns 0.
 */
static int __init init_fs_inode_sysctls(void)
{
        register_sysctl_init("fs", inodes_sysctls);
        return 0;
}
early_initcall(init_fs_inode_sysctls);
#endif

/*
 * Default ->open() installed by inode_init_always() (via no_open_fops):
 * ignores its arguments and always fails with -ENXIO.
 */
static int no_open(struct inode *inode, struct file *file)
{
        return -ENXIO;
}

/**
 * inode_init_always - perform inode structure initialisation
 * @sb: superblock inode belongs to
 * @inode: inode to initialise
 *
 * These are initializations that need to be done on every inode
 * allocation as the fields are not initialised by slab allocation.
 *
 * On success the inode has a reference count of 1, a link count of 1,
 * empty inode/address_space operations, a file_operations whose open
 * fails with -ENXIO, and the per-CPU nr_inodes counter has been
 * incremented.
 *
 * Return: 0 on success, -ENOMEM if security_inode_alloc() fails (in which
 * case nr_inodes is not incremented).
 */
int inode_init_always(struct super_block *sb, struct inode *inode)
{
        static const struct inode_operations empty_iops;
        static const struct file_operations no_open_fops = {.open = no_open};
        struct address_space *const mapping = &inode->i_data;

        inode->i_sb = sb;
        inode->i_blkbits = sb->s_blocksize_bits;
        inode->i_flags = 0;
        atomic64_set(&inode->i_sequence, 0);
        atomic_set(&inode->i_count, 1);
        /* placeholder operations until the filesystem installs its own */
        inode->i_op = &empty_iops;
        inode->i_fop = &no_open_fops;
        inode->i_ino = 0;
        inode->__i_nlink = 1;
        inode->i_opflags = 0;
        if (sb->s_xattr)
                inode->i_opflags |= IOP_XATTR;
        i_uid_write(inode, 0);
        i_gid_write(inode, 0);
        atomic_set(&inode->i_writecount, 0);
        inode->i_size = 0;
        inode->i_write_hint = WRITE_LIFE_NOT_SET;
        inode->i_blocks = 0;
        inode->i_bytes = 0;
        inode->i_generation = 0;
        inode->i_pipe = NULL;
        inode->i_cdev = NULL;
        inode->i_link = NULL;
        inode->i_dir_seq = 0;
        inode->i_rdev = 0;
        inode->dirtied_when = 0;

#ifdef CONFIG_CGROUP_WRITEBACK
        inode->i_wb_frn_winner = 0;
        inode->i_wb_frn_avg_time = 0;
        inode->i_wb_frn_history = 0;
#endif

        /* locks get per-filesystem-type lockdep classes */
        spin_lock_init(&inode->i_lock);
        lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);

        init_rwsem(&inode->i_rwsem);
        lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);

        atomic_set(&inode->i_dio_count, 0);

        /* reset the embedded address_space (inode->i_data) */
        mapping->a_ops = &empty_aops;
        mapping->host = inode;
        mapping->flags = 0;
        mapping->wb_err = 0;
        atomic_set(&mapping->i_mmap_writable, 0);
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        atomic_set(&mapping->nr_thps, 0);
#endif
        mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
        mapping->i_private_data = NULL;
        mapping->writeback_index = 0;
        init_rwsem(&mapping->invalidate_lock);
        lockdep_set_class_and_name(&mapping->invalidate_lock,
                                   &sb->s_type->invalidate_lock_key,
                                   "mapping.invalidate_lock");
        if (sb->s_iflags & SB_I_STABLE_WRITES)
                mapping_set_stable_writes(mapping);
        inode->i_private = NULL;
        /* i_mapping initially points at the inode's own embedded i_data */
        inode->i_mapping = mapping;
        INIT_HLIST_HEAD(&inode->i_dentry);        /* buggered by rcu freeing */
#ifdef CONFIG_FS_POSIX_ACL
        inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
#endif

#ifdef CONFIG_FSNOTIFY
        inode->i_fsnotify_mask = 0;
#endif
        inode->i_flctx = NULL;

        /* any failure of the security hook is reported as -ENOMEM */
        if (unlikely(security_inode_alloc(inode)))
                return -ENOMEM;
        /* counted only once initialisation can no longer fail */
        this_cpu_inc(nr_inodes);

        return 0;
}
EXPORT_SYMBOL(inode_init_always);

/*
 * Return @inode to inode_cachep immediately, without going through
 * call_rcu(). Also used by i_callback() as the default when the inode has
 * no ->free_inode() set.
 */
void free_inode_nonrcu(struct inode *inode)
{
        kmem_cache_free(inode_cachep, inode);
}
EXPORT_SYMBOL(free_inode_nonrcu);

/*
 * Free the inode that embeds @head as its i_rcu member: through the
 * inode's ->free_inode() when one is set, otherwise straight back to
 * inode_cachep. Used as the call_rcu() callback in destroy_inode() and
 * called directly from the alloc_inode() failure path.
 */
static void i_callback(struct rcu_head *head)
{
        struct inode *inode = container_of(head, struct inode, i_rcu);

        if (!inode->free_inode) {
                free_inode_nonrcu(inode);
                return;
        }
        inode->free_inode(inode);
}

/*
 * Allocate an inode for @sb and run inode_init_always() on it.
 *
 * The memory comes from the superblock's ->alloc_inode() when provided,
 * otherwise from inode_cachep. Returns the initialised inode, or NULL when
 * either the allocation or inode_init_always() fails; in the latter case
 * the inode is released again before returning.
 */
static struct inode *alloc_inode(struct super_block *sb)
{
        const struct super_operations *ops = sb->s_op;
        struct inode *inode;

        if (ops->alloc_inode)
                inode = ops->alloc_inode(sb);
        else
                inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL);
        if (!inode)
                return NULL;

        if (!unlikely(inode_init_always(sb, inode)))
                return inode;

        /*
         * Initialisation failed: undo the allocation the same way
         * destroy_inode() would, except that the inode is freed right
         * away instead of through call_rcu().
         */
        if (ops->destroy_inode) {
                ops->destroy_inode(inode);
                /* no ->free_inode(): nothing more to do here */
                if (!ops->free_inode)
                        return NULL;
        }
        inode->free_inode = ops->free_inode;
        i_callback(&inode->i_rcu);
        return NULL;
}

/*
 * Tear down the generic state attached to @inode without freeing the inode
 * itself: BUG if buffers are still attached, detach it from writeback,
 * release its security, fsnotify and file-lock state, drop cached POSIX
 * ACLs, and decrement the per-CPU nr_inodes counter.
 */
void __destroy_inode(struct inode *inode)
{
        BUG_ON(inode_has_buffers(inode));
        inode_detach_wb(inode);
        security_inode_free(inode);
        fsnotify_inode_delete(inode);
        locks_free_lock_context(inode);
        /*
         * A zero link count was accounted in s_remove_count by drop_nlink()
         * or clear_nlink(); undo that accounting now.
         */
        if (!inode->i_nlink) {
                WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
                atomic_long_dec(&inode->i_sb->s_remove_count);
        }

#ifdef CONFIG_FS_POSIX_ACL
        /* only real cached ACLs hold a reference that needs releasing */
        if (inode->i_acl && !is_uncached_acl(inode->i_acl))
                posix_acl_release(inode->i_acl);
        if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
                posix_acl_release(inode->i_default_acl);
#endif
        this_cpu_dec(nr_inodes);
}
EXPORT_SYMBOL(__destroy_inode);

/*
 * Final destruction of @inode, which must no longer be on an LRU list
 * (BUG otherwise): run the generic teardown, give the filesystem its
 * ->destroy_inode() callback, then free the inode after an RCU grace
 * period via i_callback().
 */
static void destroy_inode(struct inode *inode)
{
        const struct super_operations *ops = inode->i_sb->s_op;

        BUG_ON(!list_empty(&inode->i_lru));
        __destroy_inode(inode);
        if (ops->destroy_inode) {
                ops->destroy_inode(inode);
                /*
                 * Without ->free_inode() nothing further is done here;
                 * presumably ->destroy_inode() has freed the inode itself.
                 */
                if (!ops->free_inode)
                        return;
        }
        /* i_callback() uses ->free_inode if set, else free_inode_nonrcu() */
        inode->free_inode = ops->free_inode;
        call_rcu(&inode->i_rcu, i_callback);
}

/**
 * drop_nlink - directly drop an inode's link count
 * @inode: inode
 *
 * Low-level helper that filesystems use instead of decrementing i_nlink
 * by hand.  Where writes to the filesystem are being tracked, a link
 * count reaching zero announces an imminent write (the file is about to
 * be truncated and unlinked on disk), so that transition is recorded in
 * the superblock's s_remove_count.
 *
 * Warns if the link count is already zero on entry.
 */
void drop_nlink(struct inode *inode)
{
        WARN_ON(!inode->i_nlink);
        inode->__i_nlink -= 1;
        if (inode->i_nlink)
                return;

        /* last link gone: account the pending removal */
        atomic_long_inc(&inode->i_sb->s_remove_count);
}
EXPORT_SYMBOL(drop_nlink);

/**
 * clear_nlink - directly zero an inode's link count
 * @inode: inode
 *
 * Low-level helper that filesystems use instead of zeroing i_nlink by
 * hand.  See drop_nlink() for why the transition to zero is tracked.
 * Does nothing if the link count is already zero.
 */
void clear_nlink(struct inode *inode)
{
        if (!inode->i_nlink)
                return;

        inode->__i_nlink = 0;
        atomic_long_inc(&inode->i_sb->s_remove_count);
}
EXPORT_SYMBOL(clear_nlink);

/**
 * set_nlink - directly set an inode's link count
 * @inode: inode
 * @nlink: new nlink (should be non-zero)
 *
 * Low-level helper that filesystems use instead of assigning i_nlink by
 * hand.  A zero @nlink is handed to clear_nlink(); a non-zero @nlink on
 * an inode whose count is currently zero takes the inode back out of the
 * superblock's s_remove_count accounting.
 */
void set_nlink(struct inode *inode, unsigned int nlink)
{
        if (!nlink) {
                clear_nlink(inode);
                return;
        }

        /* Yes, some filesystems do change nlink from zero to one */
        if (!inode->i_nlink)
                atomic_long_dec(&inode->i_sb->s_remove_count);

        inode->__i_nlink = nlink;
}
EXPORT_SYMBOL(set_nlink);

/**
 * inc_nlink - directly increment an inode's link count
 * @inode: inode
 *
 * Low-level helper that filesystems use instead of incrementing i_nlink
 * by hand; the counterpart of drop_nlink().  Raising the count from zero
 * undoes the s_remove_count accounting and warns unless the inode is
 * marked I_LINKABLE.
 */
void inc_nlink(struct inode *inode)
{
        if (unlikely(!inode->i_nlink)) {
                WARN_ON(!(inode->i_state & I_LINKABLE));
                atomic_long_dec(&inode->i_sb->s_remove_count);
        }

        inode->__i_nlink += 1;
}
EXPORT_SYMBOL(inc_nlink);

/*
 * Initialise the parts of an address_space that are set up once and then
 * left alone: the i_pages xarray, the i_mmap tree and its rwsem, and the
 * private list with its lock. Does not zero @mapping; see
 * address_space_init_once() and inode_init_once() for that.
 */
static void __address_space_init_once(struct address_space *mapping)
{
        xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
        init_rwsem(&mapping->i_mmap_rwsem);
        INIT_LIST_HEAD(&mapping->i_private_list);
        spin_lock_init(&mapping->i_private_lock);
        mapping->i_mmap = RB_ROOT_CACHED;
}

/*
 * Zero @mapping completely and then perform the one-time initialisation
 * done by __address_space_init_once().
 */
void address_space_init_once(struct address_space *mapping)
{
        memset(mapping, 0, sizeof(*mapping));
        __address_space_init_once(mapping);
}
EXPORT_SYMBOL(address_space_init_once);

/*
 * These are initializations that only need to be done
 * once, because the fields are idempotent across use
 * of the inode, so let the slab allocator be aware of that.
 *
 * Zeroes the whole inode, then initialises its list heads/nodes and the
 * embedded i_data address_space.
 */
void inode_init_once(struct inode *inode)
{
        memset(inode, 0, sizeof(*inode));
        INIT_HLIST_NODE(&inode->i_hash);
        INIT_LIST_HEAD(&inode->i_devices);
        INIT_LIST_HEAD(&inode->i_io_list);
        INIT_LIST_HEAD(&inode->i_wb_list);
        INIT_LIST_HEAD(&inode->i_lru);
        INIT_LIST_HEAD(&inode->i_sb_list);
        /* i_data was already zeroed by the memset above */
        __address_space_init_once(&inode->i_data);
        i_size_ordered_init(inode);
}
EXPORT_SYMBOL(inode_init_once);

/*
 * void-pointer wrapper around inode_init_once(); @foo is the inode to
 * initialise. NOTE(review): presumably the constructor passed to the
 * inode slab cache - confirm at the kmem_cache_create() call site.
 */
static void init_once(void *foo)
{
        inode_init_once(foo);
}

/*
 * Take an additional reference on @inode by incrementing i_count.
 *
 * inode->i_lock must be held
 */
void __iget(struct inode *inode)
{
        atomic_inc(&inode->i_count);
}

/*
 * get additional reference to inode; caller must already hold one.
 *
 * Warns if the count after the increment is below 2, i.e. if the caller
 * did not actually hold a reference.
 */
void ihold(struct inode *inode)
{
        WARN_ON(atomic_inc_return(&inode->i_count) < 2);
}
EXPORT_SYMBOL(ihold);

/*
 * Put @inode on its superblock's inode LRU if it qualifies.
 *
 * Nothing is done when the inode is dirty, under sync, being (or about to
 * be) freed, still referenced, belongs to a superblock that is not
 * SB_ACTIVE, or has a mapping that is not shrinkable. If the inode was
 * already on the LRU and @rotate is true it is flagged I_REFERENCED
 * instead; a fresh insertion increments nr_unused.
 */
static void __inode_add_lru(struct inode *inode, bool rotate)
{
        if ((inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) ||
            atomic_read(&inode->i_count) ||
            !(inode->i_sb->s_flags & SB_ACTIVE) ||
            !mapping_shrinkable(&inode->i_data))
                return;

        if (!list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) {
                /* already on the LRU */
                if (rotate)
                        inode->i_state |= I_REFERENCED;
                return;
        }

        this_cpu_inc(nr_unused);
}

/*
 * Add inode to LRU if needed (inode is unused and clean).
 *
 * Calls __inode_add_lru() with rotate == false, so an inode that is
 * already on the LRU is left untouched.
 *
 * Needs inode->i_lock held.
 */
void inode_add_lru(struct inode *inode)
{
        __inode_add_lru(inode, false);
}

/*
 * Take @inode off its superblock's inode LRU; nr_unused is decremented
 * only when the inode was actually on the list.
 */
static void inode_lru_list_del(struct inode *inode)
{
        if (!list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
                return;

        this_cpu_dec(nr_unused);
}

/**
 * inode_sb_list_add - add inode to the superblock list of inodes
 * @inode: inode to add
 */
void inode_sb_list_add(struct inode *inode)
{
        spin_lock(&inode->i_sb->s_inode_list_lock);
        list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
        spin_unlock(&inode->i_sb->s_inode_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);

static inline void inode_sb_list_del(struct inode *inode)
{
        if (!list_empty(&inode->i_sb_list)) {
                spin_lock(&inode->i_sb->s_inode_list_lock);
                list_del_init(&inode->i_sb_list);
                spin_unlock(&inode->i_sb->s_inode_list_lock);
        }
}

/*
 * Map a (superblock, hashval) pair to an index into inode_hashtable.
 *
 * The superblock address is mixed in so that equal hash values from
 * different superblocks land in different buckets; the result is folded
 * with i_hash_shift and masked with i_hash_mask.
 */
static unsigned long hash(struct super_block *sb, unsigned long hashval)
{
        unsigned long mixed;

        /* '/' binds tighter than '^': only the second term is divided */
        mixed = (hashval * (unsigned long)sb) ^
                ((GOLDEN_RATIO_PRIME + hashval) / L1_CACHE_BYTES);
        mixed ^= (mixed ^ GOLDEN_RATIO_PRIME) >> i_hash_shift;

        return mixed & i_hash_mask;
}

/**
 * __insert_inode_hash - hash an inode
 * @inode: unhashed inode
 * @hashval: unsigned long value used to locate this object in the
 *           inode_hashtable.
 *
 * Add an inode to the inode hash for this superblock.  The bucket is
 * chosen from the inode's superblock and @hashval; the insertion is done
 * with inode_hash_lock and inode->i_lock held, in that order.
 */
void __insert_inode_hash(struct inode *inode, unsigned long hashval)
{
        struct hlist_head *bucket;

        bucket = &inode_hashtable[hash(inode->i_sb, hashval)];

        spin_lock(&inode_hash_lock);
        spin_lock(&inode->i_lock);
        hlist_add_head_rcu(&inode->i_hash, bucket);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__insert_inode_hash);

/**
 *        __remove_inode_hash - remove an inode from the hash
 *        @inode: inode to unhash
 *
 *        Remove an inode from the inode hash table and reinitialise its
 *        i_hash node.  Done with inode_hash_lock and inode->i_lock held,
 *        in that order.
 */
void __remove_inode_hash(struct inode *inode)
{
        spin_lock(&inode_hash_lock);
        spin_lock(&inode->i_lock);
        hlist_del_init_rcu(&inode->i_hash);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__remove_inode_hash);

/*
 * Print a one-line pr_warn() description of @mapping for debugging.
 *
 * @mapping may be a bad pointer: every dereference goes through
 * get_kernel_nofault(), and the function reports what it has gathered so
 * far and returns as soon as a read fails or a pointer is NULL. In the
 * best case it prints the a_ops symbol, the host inode number and the
 * name of the first dentry on the host inode's i_dentry list.
 */
void dump_mapping(const struct address_space *mapping)
{
        struct inode *host;
        const struct address_space_operations *a_ops;
        struct hlist_node *dentry_first;
        struct dentry *dentry_ptr;
        struct dentry dentry;
        unsigned long ino;

        /*
         * If mapping is an invalid pointer, we don't want to crash
         * accessing it, so probe everything depending on it carefully.
         */
        if (get_kernel_nofault(host, &mapping->host) ||
            get_kernel_nofault(a_ops, &mapping->a_ops)) {
                pr_warn("invalid mapping:%px\n", mapping);
                return;
        }

        /* no host inode: only the operations table can be reported */
        if (!host) {
                pr_warn("aops:%ps\n", a_ops);
                return;
        }

        if (get_kernel_nofault(dentry_first, &host->i_dentry.first) ||
            get_kernel_nofault(ino, &host->i_ino)) {
                pr_warn("aops:%ps invalid inode:%px\n", a_ops, host);
                return;
        }

        /* inode has no dentry attached */
        if (!dentry_first) {
                pr_warn("aops:%ps ino:%lx\n", a_ops, ino);
                return;
        }

        /* work on a local copy of the dentry, fetched without faulting */
        dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
        if (get_kernel_nofault(dentry, dentry_ptr) ||
            !dentry.d_parent || !dentry.d_name.name) {
                pr_warn("aops:%ps ino:%lx invalid dentry:%px\n",
                                a_ops, ino, dentry_ptr);
                return;
        }

        /*
         * if dentry is corrupted, the %pd handler may still crash,
         * but it's unlikely that we reach here with a corrupt mapping
         */
        pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", a_ops, ino, &dentry);
}

/*
 * Mark an inode that is being freed as cleared.
 *
 * Sanity-checks (with BUG_ON) that the inode has no pages, an empty
 * i_private_list, an empty i_wb_list, is in state I_FREEING and has not
 * been cleared before, then sets i_state to exactly I_FREEING | I_CLEAR.
 */
void clear_inode(struct inode *inode)
{
        /*
         * We have to cycle the i_pages lock here because reclaim can be in the
         * process of removing the last page (in __filemap_remove_folio())
         * and we must not free the mapping under it.
         */
        xa_lock_irq(&inode->i_data.i_pages);
        BUG_ON(inode->i_data.nrpages);
        /*
         * Almost always, mapping_empty(&inode->i_data) here; but there are
         * two known and long-standing ways in which nodes may get left behind
         * (when deep radix-tree node allocation failed partway; or when THP
         * collapse_file() failed). Until those two known cases are cleaned up,
         * or a cleanup function is called here, do not BUG_ON(!mapping_empty),
         * nor even WARN_ON(!mapping_empty).
         */
        xa_unlock_irq(&inode->i_data.i_pages);
        BUG_ON(!list_empty(&inode->i_data.i_private_list));
        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(inode->i_state & I_CLEAR);
        BUG_ON(!list_empty(&inode->i_wb_list));
        /* don't need i_lock here, no concurrent mods to i_state */
        inode->i_state = I_FREEING | I_CLEAR;
}
EXPORT_SYMBOL(clear_inode);

/*
 * Free the inode passed in, removing it from the lists it is still connected
 * to. We remove any pages still attached to the inode and wait for any IO that
 * is still in progress before finally destroying the inode.
 *
 * An inode must already be marked I_FREEING so that we avoid the inode being
 * moved back onto lists if we race with other code that manipulates the lists
 * (e.g. writeback_single_inode). The caller is responsible for setting this.
 *
 * An inode must already be removed from the LRU list before being evicted from
 * the cache. This should occur atomically with setting the I_FREEING state
 * flag, so no inodes here should ever be on the LRU when being evicted.
 */
static void evict(struct inode *inode)
{
        const struct super_operations *op = inode->i_sb->s_op;

        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(!list_empty(&inode->i_lru));

        if (!list_empty(&inode->i_io_list))
                inode_io_list_del(inode);

        inode_sb_list_del(inode);

        /*
         * Wait for flusher thread to be done with the inode so that filesystem
         * does not start destroying it while writeback is still running. Since
         * the inode has I_FREEING set, flusher thread won't start new work on
         * the inode.  We just have to wait for running writeback to finish.
         */
        inode_wait_for_writeback(inode);

        /*
         * Filesystem-specific eviction if provided; the default drops the
         * page cache and marks the inode cleared.
         */
        if (op->evict_inode) {
                op->evict_inode(inode);
        } else {
                truncate_inode_pages_final(&inode->i_data);
                clear_inode(inode);
        }
        if (S_ISCHR(inode->i_mode) && inode->i_cdev)
                cd_forget(inode);

        remove_inode_hash(inode);

        /*
         * Wake anyone sleeping on the __I_NEW bit; by now the state must be
         * exactly I_FREEING | I_CLEAR, which also holds ->evict_inode()
         * implementations to the clear_inode() contract.
         */
        spin_lock(&inode->i_lock);
        wake_up_bit(&inode->i_state, __I_NEW);
        BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
        spin_unlock(&inode->i_lock);

        destroy_inode(inode);
}

/*
 * dispose_list - dispose of the contents of a local list
 * @head: the head of the list to free
 *
 * The list is private to the caller and its inodes are linked through
 * i_lru, so no locking is needed to walk it. Each inode is unlinked and
 * evicted in turn, with a rescheduling point after every eviction.
 */
static void dispose_list(struct list_head *head)
{
        struct inode *victim;

        while (!list_empty(head)) {
                victim = list_first_entry(head, struct inode, i_lru);
                list_del_init(&victim->i_lru);

                evict(victim);
                cond_resched();
        }
}

/**
 * evict_inodes        - evict all evictable inodes for a superblock
 * @sb:                superblock to operate on
 *
 * Make sure that no inodes with zero refcount are retained.  This is
 * called by superblock shutdown after having SB_ACTIVE flag removed,
 * so any inode reaching zero refcount during or after that call will
 * be immediately evicted.
 *
 * Inodes that are referenced, new, or already being freed are skipped.
 * Selected inodes are marked I_FREEING, taken off the LRU and collected on
 * a private list that is handed to dispose_list().
 */
void evict_inodes(struct super_block *sb)
{
        struct inode *inode, *next;
        LIST_HEAD(dispose);

again:
        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
                /* Cheap lockless filter; the real decision is made below. */
                if (atomic_read(&inode->i_count))
                        continue;

                spin_lock(&inode->i_lock);
                /*
                 * References are taken under i_lock (see __iget()), so one
                 * may have appeared between the lockless check above and
                 * acquiring the lock. Re-check here, as invalidate_inodes()
                 * does, so that a referenced inode is never marked
                 * I_FREEING and evicted under its new user.
                 */
                if (atomic_read(&inode->i_count)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                inode->i_state |= I_FREEING;
                inode_lru_list_del(inode);
                spin_unlock(&inode->i_lock);
                list_add(&inode->i_lru, &dispose);

                /*
                 * We can have a ton of inodes to evict at unmount time given
                 * enough memory, check to see if we need to go to sleep for a
                 * bit so we don't livelock.
                 */
                if (need_resched()) {
                        spin_unlock(&sb->s_inode_list_lock);
                        cond_resched();
                        dispose_list(&dispose);
                        goto again;
                }
        }
        spin_unlock(&sb->s_inode_list_lock);

        dispose_list(&dispose);
}
EXPORT_SYMBOL_GPL(evict_inodes);

/**
 * invalidate_inodes - attempt to free all inodes on a superblock
 * @sb: superblock to operate on
 *
 * Attempts to free all inodes (including dirty inodes) for a given
 * superblock.  Inodes that are new, already being freed, or still
 * referenced are skipped; every other inode is marked I_FREEING, taken
 * off the LRU and disposed of.
 */
void invalidate_inodes(struct super_block *sb)
{
        struct inode *victim, *tmp;
        LIST_HEAD(dispose);

again:
        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry_safe(victim, tmp, &sb->s_inodes, i_sb_list) {
                spin_lock(&victim->i_lock);
                if ((victim->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) ||
                    atomic_read(&victim->i_count)) {
                        spin_unlock(&victim->i_lock);
                        continue;
                }

                victim->i_state |= I_FREEING;
                inode_lru_list_del(victim);
                spin_unlock(&victim->i_lock);
                list_add(&victim->i_lru, &dispose);

                if (!need_resched())
                        continue;

                /*
                 * Give up the CPU: drop the list lock, dispose of what has
                 * been collected so far and restart the scan from the top.
                 */
                spin_unlock(&sb->s_inode_list_lock);
                cond_resched();
                dispose_list(&dispose);
                goto again;
        }
        spin_unlock(&sb->s_inode_list_lock);

        dispose_list(&dispose);
}

/*
 * Isolate the inode from the LRU in preparation for freeing it.
 *
 * If the inode has the I_REFERENCED flag set, then it means that it has been
 * used recently - the flag is set in iput_final(). When we encounter such an
 * inode, clear the flag and move it to the back of the LRU so it gets another
 * pass through the LRU before it gets reclaimed. This is necessary because of
 * the fact we are doing lazy LRU updates to minimise lock contention so the
 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
 * with this flag set because they are the inodes that are out of order.
 *
 * @item:     LRU entry being examined; it is the i_lru member of an inode
 * @lru:      list the entry sits on, handed through to list_lru_isolate*()
 * @lru_lock: lock for @lru; it is dropped and re-taken on the LRU_RETRY
 *            path, so it is held on entry and again on every return
 * @arg:      list head that collects the inodes selected for freeing
 *
 * Return values:
 *   LRU_SKIP    - inode->i_lock could not be taken without spinning
 *   LRU_REMOVED - the inode was taken off the LRU, either because it is
 *                 currently not reclaimable or because it was marked
 *                 I_FREEING and moved onto @arg
 *   LRU_ROTATE  - the inode had I_REFERENCED set; the flag was cleared
 *   LRU_RETRY   - @lru_lock was dropped to strip buffers / page cache
 */
static enum lru_status inode_lru_isolate(struct list_head *item,
                struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
        struct list_head *freeable = arg;
        struct inode        *inode = container_of(item, struct inode, i_lru);

        /*
         * We are inverting the lru lock/inode->i_lock here, so use a
         * trylock. If we fail to get the lock, just skip it.
         */
        if (!spin_trylock(&inode->i_lock))
                return LRU_SKIP;

        /*
         * Inodes can get referenced, redirtied, or repopulated while
         * they're already on the LRU, and this can make them
         * unreclaimable for a while. Remove them lazily here; iput,
         * sync, or the last page cache deletion will requeue them.
         */
        if (atomic_read(&inode->i_count) ||
            (inode->i_state & ~I_REFERENCED) ||
            !mapping_shrinkable(&inode->i_data)) {
                list_lru_isolate(lru, &inode->i_lru);
                spin_unlock(&inode->i_lock);
                /* One inode fewer on the LRU: keep the per-cpu count in step. */
                this_cpu_dec(nr_unused);
                return LRU_REMOVED;
        }

        /* Recently referenced inodes get one more pass */
        if (inode->i_state & I_REFERENCED) {
                inode->i_state &= ~I_REFERENCED;
                spin_unlock(&inode->i_lock);
                return LRU_ROTATE;
        }

        /*
         * On highmem systems, mapping_shrinkable() permits dropping
         * page cache in order to free up struct inodes: lowmem might
         * be under pressure before the cache inside the highmem zone.
         */
        if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
                /*
                 * Take a reference before letting go of both locks
                 * (presumably so the inode cannot go away meanwhile);
                 * it is dropped again by the iput() below.
                 */
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(lru_lock);
                if (remove_inode_buffers(inode)) {
                        unsigned long reap;
                        reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
                        /* Account the pages to kswapd or to direct reclaim. */
                        if (current_is_kswapd())
                                __count_vm_events(KSWAPD_INODESTEAL, reap);
                        else
                                __count_vm_events(PGINODESTEAL, reap);
                        mm_account_reclaimed_pages(reap);
                }
                iput(inode);
                /* Re-take the LRU lock before returning to the walker. */
                spin_lock(lru_lock);
                return LRU_RETRY;
        }

        /* Nothing left attached: mark the inode and hand it to the caller. */
        WARN_ON(inode->i_state & I_NEW);
        inode->i_state |= I_FREEING;
        list_lru_isolate_move(lru, &inode->i_lru, freeable);
        spin_unlock(&inode->i_lock);

        this_cpu_dec(nr_unused);
        return LRU_REMOVED;
}

/*
 * Walk the superblock inode LRU for freeable inodes and attempt to free them.
 * This is called from the superblock shrinker function with a number of inodes
 * to trim from the LRU. Inodes to be freed are moved to a temporary list and
 * then are freed outside inode_lock by dispose_list().
 */
long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
{
        LIST_HEAD(freeable);
        long freed;

        freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
                                     inode_lru_isolate, &freeable);
        dispose_list(&freeable);
        return freed;
}

static void __wait_on_freeing_inode(struct inode *inode);
/*
 * Search hash chain @head for an inode of @sb that @test accepts.
 *
 * The callers in this file invoke this with inode_hash_lock held.
 *
 * A match that is being freed is waited for through
 * __wait_on_freeing_inode() (which may block) and the chain is then
 * searched again from the start.
 *
 * Returns the inode with an extra reference, ERR_PTR(-ESTALE) if the match
 * is still marked I_CREATING, or NULL if the chain holds no match.
 */
static struct inode *find_inode(struct super_block *sb,
                                struct hlist_head *head,
                                int (*test)(struct inode *, void *),
                                void *data)
{
        struct inode *cur = NULL;
        struct inode *found;

rescan:
        hlist_for_each_entry(cur, head, i_hash) {
                if (cur->i_sb != sb || !test(cur, data))
                        continue;

                spin_lock(&cur->i_lock);
                if (cur->i_state & (I_FREEING|I_WILL_FREE)) {
                        __wait_on_freeing_inode(cur);
                        goto rescan;
                }

                if (unlikely(cur->i_state & I_CREATING)) {
                        found = ERR_PTR(-ESTALE);
                } else {
                        __iget(cur);
                        found = cur;
                }
                spin_unlock(&cur->i_lock);
                return found;
        }
        return NULL;
}

/*
 * find_inode_fast is the fast path version of find_inode, see the comment at
 * iget_locked for details.
 *
 * Matching is done on the inode number and superblock alone, with no
 * callback.  The result follows find_inode(): the inode with an extra
 * reference, ERR_PTR(-ESTALE) for a match still marked I_CREATING, or NULL.
 */
static struct inode *find_inode_fast(struct super_block *sb,
                                struct hlist_head *head, unsigned long ino)
{
        struct inode *cur = NULL;
        struct inode *found;

rescan:
        hlist_for_each_entry(cur, head, i_hash) {
                if (cur->i_ino != ino || cur->i_sb != sb)
                        continue;

                spin_lock(&cur->i_lock);
                if (cur->i_state & (I_FREEING|I_WILL_FREE)) {
                        /* Wait for the dying inode, then search afresh. */
                        __wait_on_freeing_inode(cur);
                        goto rescan;
                }

                if (unlikely(cur->i_state & I_CREATING)) {
                        found = ERR_PTR(-ESTALE);
                } else {
                        __iget(cur);
                        found = cur;
                }
                spin_unlock(&cur->i_lock);
                return found;
        }
        return NULL;
}

/*
 * Each cpu owns a range of LAST_INO_BATCH numbers.
 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
 * to renew the exhausted range.
 *
 * This does not significantly increase overflow rate because every CPU can
 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
 * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
 * overflow rate by 2x, which does not seem too significant.
 *
 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
 * error if st_ino won't fit in target struct field. Use 32bit counter
 * here to attempt to avoid that.
 */
#define LAST_INO_BATCH 1024
static DEFINE_PER_CPU(unsigned int, last_ino);

/*
 * Hand out the next number from the calling CPU's range.  The value 0 is
 * never returned.
 */
unsigned int get_next_ino(void)
{
        unsigned int *slot = &get_cpu_var(last_ino);
        unsigned int ino = *slot;

#ifdef CONFIG_SMP
        /* A multiple of the batch size means this CPU's range is used up. */
        if (unlikely(!(ino & (LAST_INO_BATCH - 1)))) {
                static atomic_t shared_last_ino;
                int batch_end = atomic_add_return(LAST_INO_BATCH,
                                                  &shared_last_ino);

                ino = batch_end - LAST_INO_BATCH;
        }
#endif

        ino++;
        /* get_next_ino should not provide a 0 inode number */
        if (unlikely(ino == 0))
                ino = 1;

        *slot = ino;
        put_cpu_var(last_ino);
        return ino;
}
EXPORT_SYMBOL(get_next_ino);

/**
 *        new_inode_pseudo         - obtain an inode
 *        @sb: superblock
 *
 *        Allocates a new inode for the given superblock.
 *        The inode won't be chained in the superblock's s_inodes list.
 *        This means:
 *        - the fs can't be unmounted
 *        - quotas, fsnotify, writeback can't work
 *
 *        Returns the inode with i_state cleared, or whatever alloc_inode()
 *        returned if the allocation failed.
 */
struct inode *new_inode_pseudo(struct super_block *sb)
{
        struct inode *inode;

        inode = alloc_inode(sb);
        if (!inode)
                goto out;

        /* Reset the state word under the inode's own lock. */
        spin_lock(&inode->i_lock);
        inode->i_state = 0;
        spin_unlock(&inode->i_lock);
out:
        return inode;
}

/**
 *        new_inode         - obtain an inode
 *        @sb: superblock
 *
 *        Allocates a new inode for the given superblock through
 *        new_inode_pseudo() and, on success, passes it to
 *        inode_sb_list_add().
 *
 *        The default gfp_mask for allocations related to inode->i_mapping
 *        is GFP_HIGHUSER_MOVABLE.  If HIGHMEM pages are unsuitable or it is
 *        known that pages allocated for the page cache are not reclaimable
 *        or migratable, mapping_set_gfp_mask() must be called with suitable
 *        flags on the newly created inode's mapping.
 */
struct inode *new_inode(struct super_block *sb)
{
        struct inode *inode = new_inode_pseudo(sb);

        /* A failed allocation is returned to the caller untouched. */
        if (inode)
                inode_sb_list_add(inode);

        return inode;
}
EXPORT_SYMBOL(new_inode);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Switch a directory inode's i_rwsem to the directory lock class of its
 * filesystem type.  Non-directories, and directories whose i_rwsem no longer
 * carries the type's default i_mutex_key class, are left untouched.
 */
void lockdep_annotate_inode_mutex_key(struct inode *inode)
{
        struct file_system_type *type;

        if (!S_ISDIR(inode->i_mode))
                return;

        type = inode->i_sb->s_type;

        /* Set new key only if filesystem hasn't already changed it */
        if (!lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key))
                return;

        /*
         * Re-initialise the rwsem before giving it the new class.
         * NOTE(review): this presumably requires that nobody holds
         * i_rwsem at this point - confirm against callers.
         */
        init_rwsem(&inode->i_rwsem);
        lockdep_set_class(&inode->i_rwsem, &type->i_mutex_dir_key);
}
EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
#endif

/**
 * unlock_new_inode - clear the I_NEW state and wake up any waiters
 * @inode:        new inode to unlock
 *
 * Called when the inode is fully initialised.  Clears both I_NEW and
 * I_CREATING under i_lock and wakes anyone waiting on the __I_NEW bit.
 * Warns if I_NEW was not set on entry.
 */
void unlock_new_inode(struct inode *inode)
{
        lockdep_annotate_inode_mutex_key(inode);

        spin_lock(&inode->i_lock);
        WARN_ON(!(inode->i_state & I_NEW));
        inode->i_state &= ~(I_NEW | I_CREATING);
        /* Full barrier between the flag update and the wake-up. */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_NEW);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(unlock_new_inode);

/**
 * discard_new_inode - clear I_NEW, wake up any waiters and drop a reference
 * @inode:        new inode to discard
 *
 * Counterpart of unlock_new_inode() that additionally calls iput() on
 * @inode.  Unlike unlock_new_inode() it clears only I_NEW and leaves
 * I_CREATING as it is.  Warns if I_NEW was not set on entry.
 */
void discard_new_inode(struct inode *inode)
{
        lockdep_annotate_inode_mutex_key(inode);
        spin_lock(&inode->i_lock);
        WARN_ON(!(inode->i_state & I_NEW));
        inode->i_state &= ~I_NEW;
        /* Full barrier between the flag update and the wake-up. */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_NEW);
        spin_unlock(&inode->i_lock);
        /* Drop the reference only after i_lock has been released. */
        iput(inode);
}
EXPORT_SYMBOL(discard_new_inode);

/**
 * lock_two_nondirectories - take two i_mutexes on non-directory objects
 *
 * Lock any non-NULL argument. Passed objects must not be directories.
 * Zero, one or two objects may be locked by this function.  When both are
 * given, the inode at the lower address is locked first; the same inode
 * passed twice is locked only once.
 *
 * @inode1: first inode to lock
 * @inode2: second inode to lock
 */
void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
        struct inode *lo = inode1;
        struct inode *hi = inode2;

        if (lo)
                WARN_ON_ONCE(S_ISDIR(lo->i_mode));
        if (hi)
                WARN_ON_ONCE(S_ISDIR(hi->i_mode));

        /* Order the pair by address so the locking order is consistent. */
        if (lo > hi)
                swap(lo, hi);

        if (lo)
                inode_lock(lo);
        if (hi && hi != lo)
                inode_lock_nested(hi, I_MUTEX_NONDIR2);
}
EXPORT_SYMBOL(lock_two_nondirectories);

/**
 * unlock_two_nondirectories - release locks from lock_two_nondirectories()
 * @inode1: first inode to unlock
 * @inode2: second inode to unlock
 *
 * NULL arguments are skipped, and an inode passed as both arguments is
 * unlocked only once.
 */
void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
        if (inode1) {
                WARN_ON_ONCE(S_ISDIR(inode1->i_mode));
                inode_unlock(inode1);
        }

        /* Nothing more to do for a missing or duplicated second inode. */
        if (!inode2 || inode2 == inode1)
                return;

        WARN_ON_ONCE(S_ISDIR(inode2->i_mode));
        inode_unlock(inode2);
}
EXPORT_SYMBOL(unlock_two_nondirectories);

/**
 * inode_insert5 - obtain an inode from a mounted file system
 * @inode:        pre-allocated inode to use for insert to cache
 * @hashval:        hash value (usually inode number) to get
 * @test:        callback used for comparisons between inodes
 * @set:        callback used to initialize a new struct inode
 * @data:        opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present return it with an increased reference count. This is
 * a variant of iget5_locked() for callers that don't want to fail on memory
 * allocation of inode.
 *
 * If the inode is not in cache, insert the pre-allocated inode to cache and
 * return it locked, hashed, and with the I_NEW flag set. The file system gets
 * to fill it in before unlocking it via unlock_new_inode().
 *
 * Returns NULL if the lookup yields an error pointer or if @set reports
 * failure; in both cases @inode is not inserted.
 *
 * Note both @test and @set are called with the inode_hash_lock held, so can't
 * sleep.
 */
struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
                            int (*test)(struct inode *, void *),
                            int (*set)(struct inode *, void *), void *data)
{
        struct hlist_head *bucket = inode_hashtable + hash(inode->i_sb, hashval);
        struct inode *old;

        for (;;) {
                spin_lock(&inode_hash_lock);
                old = find_inode(inode->i_sb, bucket, test, data);
                if (!old)
                        break;  /* not cached; inode_hash_lock stays held */

                /*
                 * Uhhuh, somebody else created the same inode under us.
                 * Use the old inode instead of the preallocated one.
                 */
                spin_unlock(&inode_hash_lock);
                if (IS_ERR(old))
                        return NULL;
                wait_on_inode(old);
                if (unlikely(inode_unhashed(old))) {
                        /* It left the hash while we waited: look again. */
                        iput(old);
                        continue;
                }
                return old;
        }

        if (set && unlikely(set(inode, data))) {
                spin_unlock(&inode_hash_lock);
                return NULL;
        }

        /*
         * Return the locked inode with I_NEW set, the
         * caller is responsible for filling in the contents
         */
        spin_lock(&inode->i_lock);
        inode->i_state |= I_NEW;
        hlist_add_head_rcu(&inode->i_hash, bucket);
        spin_unlock(&inode->i_lock);

        /*
         * Add inode to the sb list if it's not already. It has I_NEW at this
         * point, so it should be safe to test i_sb_list locklessly.
         */
        if (list_empty(&inode->i_sb_list))
                inode_sb_list_add(inode);

        spin_unlock(&inode_hash_lock);
        return inode;
}
EXPORT_SYMBOL(inode_insert5);

/**
 * iget5_locked - obtain an inode from a mounted file system
 * @sb:                super block of file system
 * @hashval:        hash value (usually inode number) to get
 * @test:        callback used for comparisons between inodes
 * @set:        callback used to initialize a new struct inode
 * @data:        opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present return it with an increased reference count. This is
 * a generalized version of iget_locked() for file systems where the inode
 * number is not sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set. The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 *
 * Returns NULL if the inode is not cached and allocating a new one fails.
 *
 * Note both @test and @set are called with the inode_hash_lock held, so can't
 * sleep.
 */
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *),
                int (*set)(struct inode *, void *), void *data)
{
        struct inode *inode, *new;

        inode = ilookup5(sb, hashval, test, data);
        if (inode)
                goto out;

        new = alloc_inode(sb);
        if (!new)
                goto out;       /* inode is still NULL here */

        new->i_state = 0;
        inode = inode_insert5(new, hashval, test, set, data);
        /* Our allocation was not the one inserted: release it. */
        if (unlikely(inode != new))
                destroy_inode(new);
out:
        return inode;
}
EXPORT_SYMBOL(iget5_locked);

/**
 * iget_locked - obtain an inode from a mounted file system
 * @sb:                super block of file system
 * @ino:        inode number to get
 *
 * Search for the inode specified by @ino in the inode cache and if present
 * return it with an increased reference count. This is for file systems
 * where the inode number is sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set.  The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 *
 * Returns NULL if the lookup yields an error pointer or if a new inode
 * cannot be allocated.
 */
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *head = inode_hashtable + hash(sb, ino);
        struct inode *found, *fresh;

        for (;;) {
                spin_lock(&inode_hash_lock);
                found = find_inode_fast(sb, head, ino);
                spin_unlock(&inode_hash_lock);

                if (!found) {
                        fresh = alloc_inode(sb);
                        if (!fresh)
                                return NULL;

                        spin_lock(&inode_hash_lock);
                        /* We released the lock, so.. */
                        found = find_inode_fast(sb, head, ino);
                        if (!found) {
                                fresh->i_ino = ino;
                                spin_lock(&fresh->i_lock);
                                fresh->i_state = I_NEW;
                                hlist_add_head_rcu(&fresh->i_hash, head);
                                spin_unlock(&fresh->i_lock);
                                inode_sb_list_add(fresh);
                                spin_unlock(&inode_hash_lock);

                                /*
                                 * Return the locked inode with I_NEW set,
                                 * the caller is responsible for filling in
                                 * the contents
                                 */
                                return fresh;
                        }

                        /*
                         * Uhhuh, somebody else created the same inode under
                         * us. Use the old inode instead of the one we just
                         * allocated.
                         */
                        spin_unlock(&inode_hash_lock);
                        destroy_inode(fresh);
                }

                /* Common tail for an inode that was found in the cache. */
                if (IS_ERR(found))
                        return NULL;
                wait_on_inode(found);
                if (unlikely(inode_unhashed(found))) {
                        /* Unhashed while we waited: start over. */
                        iput(found);
                        continue;
                }
                return found;
        }
}
EXPORT_SYMBOL(iget_locked);

/*
 * Check the inode hash for an inode of @sb that already carries number @ino.
 * If there is one, the number we are trying to allocate is not unique and
 * must not be used.
 *
 * Returns 1 if the inode number is unique, 0 if it is not.
 */
static int test_inode_iunique(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, ino);
        struct inode *cur;

        hlist_for_each_entry_rcu(cur, bucket, i_hash) {
                if (cur->i_ino != ino || cur->i_sb != sb)
                        continue;
                /* Number already taken on this superblock. */
                return 0;
        }
        return 1;
}

/**
 *        iunique - get a unique inode number
 *        @sb: superblock
 *        @max_reserved: highest reserved inode number
 *
 *        Obtain an inode number that is unique on the system for a given
 *        superblock. This is used by file systems that have no natural
 *        permanent inode numbering system. An inode number is returned that
 *        is higher than the reserved limit but unique.
 *
 *        Candidates come from a single static counter that is advanced
 *        under a private spinlock until test_inode_iunique() accepts one.
 *
 *        BUGS:
 *        With a large number of inodes live on the file system this function
 *        currently becomes quite slow.
 */
ino_t iunique(struct super_block *sb, ino_t max_reserved)
{
        /*
         * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
         * error if st_ino won't fit in target struct field. Use 32bit counter
         * here to attempt to avoid that.
         */
        static DEFINE_SPINLOCK(iunique_lock);
        static unsigned int counter;
        ino_t res;

        rcu_read_lock();
        spin_lock(&iunique_lock);
        for (;;) {
                /* Never hand out a number inside the reserved range. */
                if (counter <= max_reserved)
                        counter = max_reserved + 1;
                res = counter++;
                if (test_inode_iunique(sb, res))
                        break;
        }
        spin_unlock(&iunique_lock);
        rcu_read_unlock();

        return res;
}
EXPORT_SYMBOL(iunique);

/**
 * igrab - take a reference on an inode unless it is being freed
 * @inode:        inode to grab
 *
 * Under i_lock, takes a reference through __iget() provided that neither
 * I_FREEING nor I_WILL_FREE is set.
 *
 * Returns @inode on success, or NULL if the inode is on its way out.
 */
struct inode *igrab(struct inode *inode)
{
        struct inode *ret = NULL;

        spin_lock(&inode->i_lock);
        /*
         * A NULL result handles the case where s_op->clear_inode has not
         * been called yet and somebody calls igrab while the inode is
         * getting freed.
         */
        if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
                __iget(inode);
                ret = inode;
        }
        spin_unlock(&inode->i_lock);

        return ret;
}
EXPORT_SYMBOL(igrab);

/**
 * ilookup5_nowait - search for an inode in the inode cache
 * @sb:                super block of file system to search
 * @hashval:        hash value (usually inode number) to search for
 * @test:        callback used for comparisons between inodes
 * @data:        opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache.
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count.  An error pointer from the lookup is reported as NULL.
 *
 * Note: I_NEW is not waited upon so you have to be very careful what you do
 * with the returned inode.  You probably should be using ilookup5() instead.
 *
 * Note2: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, hashval);
        struct inode *found;

        spin_lock(&inode_hash_lock);
        found = find_inode(sb, bucket, test, data);
        spin_unlock(&inode_hash_lock);

        if (IS_ERR(found))
                return NULL;
        return found;
}
EXPORT_SYMBOL(ilookup5_nowait);

/**
 * ilookup5 - search for an inode in the inode cache
 * @sb:                super block of file system to search
 * @hashval:        hash value (usually inode number) to search for
 * @test:        callback used for comparisons between inodes
 * @data:        opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if the inode is in the cache, return it with an incremented reference
 * count.  Waits on I_NEW before returning the inode; if the inode turns out
 * to be unhashed after the wait, the reference is dropped and the lookup is
 * repeated.
 *
 * This is a generalized version of ilookup() for file systems where the
 * inode number is not sufficient for unique identification of an inode.
 *
 * Note: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct inode *inode;

        for (;;) {
                inode = ilookup5_nowait(sb, hashval, test, data);
                if (!inode)
                        break;

                wait_on_inode(inode);
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        continue;
                }
                break;
        }
        return inode;
}
EXPORT_SYMBOL(ilookup5);

/**
 * ilookup - search for an inode in the inode cache
 * @sb:                super block of file system to search
 * @ino:        inode number to search for
 *
 * Search for the inode @ino in the inode cache, and if the inode is in the
 * cache, the inode is returned with an incremented reference count.
 *
 * Returns NULL if there is no such inode or the lookup yields an error
 * pointer.  A hit is waited on with wait_on_inode(); if it is unhashed
 * afterwards the reference is dropped and the lookup is repeated.
 */
struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, ino);
        struct inode *inode;

        for (;;) {
                spin_lock(&inode_hash_lock);
                inode = find_inode_fast(sb, bucket, ino);
                spin_unlock(&inode_hash_lock);

                if (!inode)
                        break;
                if (IS_ERR(inode))
                        return NULL;

                wait_on_inode(inode);
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        continue;
                }
                break;
        }
        return inode;
}
EXPORT_SYMBOL(ilookup);

/**
 * find_inode_nowait - find an inode in the inode cache
 * @sb:                super block of file system to search
 * @hashval:        hash value (usually inode number) to search for
 * @match:        callback used for comparisons between inodes
 * @data:        opaque data pointer to pass to @match
 *
 * Search for the inode specified by @hashval and @data in the inode
 * cache, where the helper function @match will return 0 if the inode
 * does not match, 1 if the inode does match, and -1 if the search
 * should be stopped.  The @match function must be responsible for
 * taking the i_lock spin_lock and checking i_state for an inode being
 * freed or being initialized, and incrementing the reference count
 * before returning 1.  It also must not sleep, since it is called with
 * the inode_hash_lock spinlock held.
 *
 * This is an even more generalized version of ilookup5() for when the
 * function must never block --- find_inode() can block in
 * __wait_on_freeing_inode() --- or when the caller can not increment
 * the reference count because the resulting iput() might cause an
 * inode eviction.  The tradeoff is that the @match function must be
 * very carefully implemented.
 *
 * Returns the inode for which @match returned 1, or NULL if the search
 * was stopped or the chain was exhausted.
 */
struct inode *find_inode_nowait(struct super_block *sb,
                                unsigned long hashval,
                                int (*match)(struct inode *, unsigned long,
                                             void *),
                                void *data)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, hashval);
        struct inode *cur, *found = NULL;

        spin_lock(&inode_hash_lock);
        hlist_for_each_entry(cur, bucket, i_hash) {
                int verdict;

                if (cur->i_sb != sb)
                        continue;

                verdict = match(cur, hashval, data);
                if (verdict == 0)
                        continue;

                /* Any non-zero verdict ends the walk; only 1 is a hit. */
                if (verdict == 1)
                        found = cur;
                break;
        }
        spin_unlock(&inode_hash_lock);

        return found;
}
EXPORT_SYMBOL(find_inode_nowait);

/**
 * find_inode_rcu - find an inode in the inode cache
 * @sb:                Super block of file system to search
 * @hashval:        Key to hash
 * @test:        Function to test match on an inode
 * @data:        Data for test function
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * where the helper function @test will return 0 if the inode does not match
 * and 1 if it does.  The @test function must be responsible for taking the
 * i_lock spin_lock and checking i_state for an inode being freed or being
 * initialized.
 *
 * Inodes of other superblocks, and inodes seen with I_FREEING or
 * I_WILL_FREE set, are skipped without calling @test.
 *
 * If successful, this will return the inode for which the @test function
 * returned 1 and NULL otherwise.
 *
 * The @test function is not permitted to take a ref on any inode presented.
 * It is also not permitted to sleep.
 *
 * The caller must hold the RCU read lock.
 */
struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
                             int (*test)(struct inode *, void *), void *data)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, hashval);
        struct inode *cur;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "suspicious find_inode_rcu() usage");

        hlist_for_each_entry_rcu(cur, bucket, i_hash) {
                if (cur->i_sb != sb)
                        continue;
                if (READ_ONCE(cur->i_state) & (I_FREEING | I_WILL_FREE))
                        continue;
                if (test(cur, data))
                        return cur;
        }
        return NULL;
}
EXPORT_SYMBOL(find_inode_rcu);

/**
 * find_inode_by_ino_rcu - Find an inode in the inode cache
 * @sb:                Super block of file system to search
 * @ino:        The inode number to match
 *
 * Search the inode cache for an inode of @sb whose inode number is @ino.
 * Inodes seen with I_FREEING or I_WILL_FREE set are skipped.  i_state is
 * sampled without taking i_lock.
 *
 * If successful, this will return the matching inode and NULL otherwise.
 * No reference is taken on the returned inode.
 *
 * The caller must hold the RCU read lock.
 */
struct inode *find_inode_by_ino_rcu(struct super_block *sb,
                                    unsigned long ino)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, ino);
        struct inode *cur;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "suspicious find_inode_by_ino_rcu() usage");

        hlist_for_each_entry_rcu(cur, bucket, i_hash) {
                if (cur->i_ino != ino)
                        continue;
                if (cur->i_sb != sb)
                        continue;
                if (READ_ONCE(cur->i_state) & (I_FREEING | I_WILL_FREE))
                        continue;
                return cur;
        }
        return NULL;
}
EXPORT_SYMBOL(find_inode_by_ino_rcu);

/**
 * insert_inode_locked - hash an inode unless its number is already in use
 * @inode:        inode to insert; its i_sb and i_ino select the hash chain
 *
 * Inserts @inode into the inode hash with I_NEW and I_CREATING set, provided
 * no other inode with the same number on the same superblock is in the way.
 * Inodes that are being freed (I_FREEING or I_WILL_FREE) do not count.
 *
 * Returns 0 once @inode has been hashed, or -EBUSY if a conflicting inode is
 * itself marked I_CREATING or is still hashed after it has been waited on.
 */
int insert_inode_locked(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        ino_t ino = inode->i_ino;
        struct hlist_head *head = inode_hashtable + hash(sb, ino);

        while (1) {
                struct inode *old = NULL;
                spin_lock(&inode_hash_lock);
                /* Look for a conflicting inode that is not on its way out. */
                hlist_for_each_entry(old, head, i_hash) {
                        if (old->i_ino != ino)
                                continue;
                        if (old->i_sb != sb)
                                continue;
                        spin_lock(&old->i_lock);
                        if (old->i_state & (I_FREEING|I_WILL_FREE)) {
                                spin_unlock(&old->i_lock);
                                continue;
                        }
                        /* Conflict found: leave the walk with old->i_lock held. */
                        break;
                }
                /*
                 * No conflict (this relies on hlist_for_each_entry() leaving
                 * its cursor NULL once the chain is exhausted): hash @inode.
                 */
                if (likely(!old)) {
                        spin_lock(&inode->i_lock);
                        inode->i_state |= I_NEW | I_CREATING;
                        hlist_add_head_rcu(&inode->i_hash, head);
                        spin_unlock(&inode->i_lock);
                        spin_unlock(&inode_hash_lock);
                        return 0;
                }
                /* The conflicting inode is itself still being created. */
                if (unlikely(old->i_state & I_CREATING)) {
                        spin_unlock(&old->i_lock);
                        spin_unlock(&inode_hash_lock);
                        return -EBUSY;
                }
                /* Take a reference, then drop both locks before waiting. */
                __iget(old);
                spin_unlock(&old->i_lock);
                spin_unlock(&inode_hash_lock);
                wait_on_inode(old);
                /* Still hashed after the wait: the number is really taken. */
                if (unlikely(!inode_unhashed(old))) {
                        iput(old);
                        return -EBUSY;
                }
                /* It was unhashed in the meantime: drop it and try again. */
                iput(old);
        }
}
EXPORT_SYMBOL(insert_inode_locked);

/**
 * insert_inode_locked4 - hash an inode unless a matching one already exists
 * @inode:        inode to insert
 * @hashval:        hash value (usually inode number)
 * @test:        callback used for comparisons between inodes
 * @data:        opaque data pointer to pass to @test
 *
 * Marks @inode I_CREATING and hands it to inode_insert5() without a @set
 * callback.
 *
 * Returns 0 if @inode itself was inserted, otherwise -EBUSY after passing
 * whatever inode_insert5() returned to iput().
 */
int insert_inode_locked4(struct inode *inode, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct inode *winner;

        inode->i_state |= I_CREATING;
        winner = inode_insert5(inode, hashval, test, NULL, data);
        if (winner == inode)
                return 0;

        /* iput() ignores a NULL argument, so no check is needed here. */
        iput(winner);
        return -EBUSY;
}
EXPORT_SYMBOL(insert_inode_locked4);


/*
 * Always returns 1, whatever @inode is.  The signature matches the
 * ->drop_inode() call made from iput_final(), where a non-zero result
 * means the inode is evicted rather than kept in the cache.
 */
int generic_delete_inode(struct inode *inode)
{
        return 1;
}
EXPORT_SYMBOL(generic_delete_inode);

/*
 * Called when we're dropping the last reference
 * to an inode.
 *
 * Call the FS "drop_inode()" function, defaulting to
 * the legacy UNIX filesystem behaviour.  If it tells
 * us to evict inode, do so.  Otherwise, retain inode
 * in cache if fs is alive, sync and evict if fs is
 * shutting down.
 *
 * Entered with inode->i_lock held: every path below releases it without
 * having taken it first.  Warns if the inode still has I_NEW set.
 */
static void iput_final(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        const struct super_operations *op = inode->i_sb->s_op;
        unsigned long state;
        int drop;

        WARN_ON(inode->i_state & I_NEW);

        /* Ask the filesystem, or the generic helper, whether to evict. */
        if (op->drop_inode)
                drop = op->drop_inode(inode);
        else
                drop = generic_drop_inode(inode);

        /*
         * Keep the inode cached: no eviction requested, I_DONTCACHE not
         * set and the superblock still active.
         */
        if (!drop &&
            !(inode->i_state & I_DONTCACHE) &&
            (sb->s_flags & SB_ACTIVE)) {
                __inode_add_lru(inode, true);
                spin_unlock(&inode->i_lock);
                return;
        }

        state = inode->i_state;
        if (!drop) {
                /*
                 * Eviction was not requested but the inode cannot stay
                 * cached.  Flag it I_WILL_FREE while i_lock is dropped for
                 * the write-out, then clear the flag again once the lock
                 * has been re-taken.
                 */
                WRITE_ONCE(inode->i_state, state | I_WILL_FREE);
                spin_unlock(&inode->i_lock);

                write_inode_now(inode, 1);

                spin_lock(&inode->i_lock);
                /* i_state may have changed while unlocked: read it again. */
                state = inode->i_state;
                WARN_ON(state & I_NEW);
                state &= ~I_WILL_FREE;
        }

        /* Point of no return: mark the inode as being freed. */
        WRITE_ONCE(inode->i_state, state | I_FREEING);
        if (!list_empty(&inode->i_lru))
                inode_lru_list_del(inode);
        spin_unlock(&inode->i_lock);

        evict(inode);
}

/**
 *        iput        - put an inode
 *        @inode: inode to put (may be NULL, in which case nothing happens)
 *
 *        Puts an inode, dropping its usage count. If the inode use count hits
 *        zero, the inode is then freed and may also be destroyed.
 *
 *        If the count would hit zero while the inode still has links
 *        (i_nlink != 0) and I_DIRTY_TIME is set, the reference is taken back,
 *        the inode is passed to mark_inode_dirty_sync() and the put is retried.
 *
 *        Consequently, iput() can sleep.
 */
void iput(struct inode *inode)
{
        if (!inode)
                return;
        /* Putting an inode that is already I_CLEAR is a fatal bug. */
        BUG_ON(inode->i_state & I_CLEAR);
retry:
        /* Taken only when this put drops i_count to zero; i_lock is then held. */
        if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
                if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
                        /* Re-take the reference and unlock before dirtying. */
                        atomic_inc(&inode->i_count);
                        spin_unlock(&inode->i_lock);
                        trace_writeback_lazytime_iput(inode);
                        mark_inode_dirty_sync(inode);
                        goto retry;
                }
                /* iput_final() releases i_lock on every path. */
                iput_final(inode);
        }
}
EXPORT_SYMBOL(iput);

#ifdef CONFIG_BLOCK
/**
 *        bmap        - find a block number in a file
 *        @inode:  inode owning the block number being requested
 *        @block: in: block index within the file; out: result of the lookup
 *
 *        Hands ``*block`` to the ->bmap() method of the inode's address space
 *        and stores the method's result back into ``*block``.  Asked for
 *        block 4 of inode 1, the 4 in ``*block`` is replaced by the block,
 *        relative to the start of the device, that holds that file block.
 *
 *        Returns -EINVAL when the mapping has no ->bmap() method (``*block`` is
 *        left untouched), 0 otherwise. If mapping falls into a hole, returns 0
 *        and ``*block`` is also set to 0.
 */
int bmap(struct inode *inode, sector_t *block)
{
        if (inode->i_mapping->a_ops->bmap) {
                *block = inode->i_mapping->a_ops->bmap(inode->i_mapping,
                                                       *block);
                return 0;
        }

        return -EINVAL;
}
EXPORT_SYMBOL(bmap);
#endif

/*
 * With relative atime, only update atime if the previous atime is
 * earlier than or equal to either the ctime or mtime,
 * or if at least a day has passed since the last atime update.
 */
static bool relatime_need_update(struct vfsmount *mnt, struct inode *inode,
                             struct timespec64 now)
{
        struct timespec64 atime, mtime, ctime;

        if (!(mnt->mnt_flags & MNT_RELATIME))
                return true;
        /*
         * Is mtime younger than or equal to atime? If yes, update atime:
         */
        atime = inode_get_atime(inode);
        mtime = inode_get_mtime(inode);
        if (timespec64_compare(&mtime, &atime) >= 0)
                return true;
        /*
         * Is ctime younger than or equal to atime? If yes, update atime:
         */
        ctime = inode_get_ctime(inode);
        if (timespec64_compare(&ctime, &atime) >= 0)
                return true;

        /*
         * Is the previous atime value older than a day? If yes,
         * update atime:
         */
        if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60)
                return true;
        /*
         * Good, we can skip the atime update:
         */
        return false;
}

/**
 * inode_update_timestamps - update the timestamps on the inode
 * @inode: inode to be updated
 * @flags: S_* flags that needed to be updated
 *
 * The update_time function is called when an inode's timestamps need to be
 * updated for a read or write operation. This function handles updating the
 * actual timestamps. It's up to the caller to ensure that the inode is marked
 * dirty appropriately.
 *
 * In the case where any of S_MTIME, S_CTIME, or S_VERSION need to be updated,
 * attempt to update all three of them. S_ATIME updates can be handled
 * independently of the rest.
 *
 * Returns a set of S_* flags indicating which values changed.
 */
int inode_update_timestamps(struct inode *inode, int flags)
{
        int updated = 0;
        struct timespec64 now;

        if (flags & (S_MTIME|S_CTIME|S_VERSION)) {
                struct timespec64 ctime = inode_get_ctime(inode);
                struct timespec64 mtime = inode_get_mtime(inode);

                now = inode_set_ctime_current(inode);
                if (!timespec64_equal(&now, &ctime))
                        updated |= S_CTIME;
                if (!timespec64_equal(&now, &mtime)) {
                        inode_set_mtime_to_ts(inode, now);
                        updated |= S_MTIME;
                }
                if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated))
                        updated |= S_VERSION;
        } else {
                now = current_time(inode);
        }

        if (flags & S_ATIME) {
                struct timespec64 atime = inode_get_atime(inode);

                if (!timespec64_equal(&now, &atime)) {
                        inode_set_atime_to_ts(inode, now);
                        updated |= S_ATIME;
                }
        }
        return updated;
}
EXPORT_SYMBOL(inode_update_timestamps);

/**
 * generic_update_time - update the timestamps on the inode
 * @inode: inode to be updated
 * @flags: S_* flags that needed to be updated
 *
 * Updates the timestamps through inode_update_timestamps() and then marks
 * the inode dirty accordingly: a changed atime/mtime/ctime dirties it with
 * I_DIRTY_TIME on SB_LAZYTIME superblocks and I_DIRTY_SYNC otherwise, and a
 * changed i_version always adds I_DIRTY_SYNC.
 *
 * Returns a S_* mask indicating which fields were updated.
 */
int generic_update_time(struct inode *inode, int flags)
{
        int dirty_flags = 0;
        int updated;

        updated = inode_update_timestamps(inode, flags);

        if (updated & (S_ATIME|S_MTIME|S_CTIME)) {
                if (inode->i_sb->s_flags & SB_LAZYTIME)
                        dirty_flags = I_DIRTY_TIME;
                else
                        dirty_flags = I_DIRTY_SYNC;
        }

        if (updated & S_VERSION)
                dirty_flags |= I_DIRTY_SYNC;

        __mark_inode_dirty(inode, dirty_flags);

        return updated;
}
EXPORT_SYMBOL(generic_update_time);

/*
 * Does the actual work of updating an inode's time or version.  The caller
 * must already have called mnt_want_write().
 *
 * A filesystem-supplied ->update_time() takes precedence and its return
 * value is passed through; otherwise generic_update_time() is used and 0 is
 * returned.
 */
int inode_update_time(struct inode *inode, int flags)
{
        if (!inode->i_op->update_time) {
                generic_update_time(inode, flags);
                return 0;
        }

        return inode->i_op->update_time(inode, flags);
}
EXPORT_SYMBOL(inode_update_time);

/**
 *        atime_needs_update        -        decide whether atime should be updated
 *        @path: the &struct path the access went through
 *        @inode: inode that was accessed
 *
 *        Only makes the decision; nothing is written to the inode here.
 *        Honours the inode's S_NOATIME and IS_NOATIME() markers, unmapped
 *        owner ids, the superblock and mount "nodiratime" flags, the mount
 *        "noatime" flag and the relatime policy.
 *
 *        Returns true if the access time should be updated, false otherwise.
 */
bool atime_needs_update(const struct path *path, struct inode *inode)
{
        struct vfsmount *mnt = path->mnt;
        struct timespec64 now, atime;

        if (inode->i_flags & S_NOATIME)
                return false;

        /*
         * Atime updates will likely cause i_uid and i_gid to be written
         * back improperly if their true value is unknown to the vfs.
         */
        if (HAS_UNMAPPED_ID(mnt_idmap(mnt), inode))
                return false;

        /* Inode / superblock level opt-outs. */
        if (IS_NOATIME(inode) ||
            ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)))
                return false;

        /* Mount level opt-outs. */
        if ((mnt->mnt_flags & MNT_NOATIME) ||
            ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
                return false;

        now = current_time(inode);
        if (!relatime_need_update(mnt, inode, now))
                return false;

        /* Nothing to do when the stored atime already equals "now". */
        atime = inode_get_atime(inode);
        return !timespec64_equal(&atime, &now);
}

/*
 * Update the access time of the inode behind @path, if
 * atime_needs_update() says so and write access can be obtained without
 * blocking on the superblock (sb_start_write_trylock()).  Failures along
 * the way are silently ignored.
 */
void touch_atime(const struct path *path)
{
        struct inode *inode = d_inode(path->dentry);
        struct vfsmount *mnt = path->mnt;

        if (!atime_needs_update(path, inode))
                return;

        if (!sb_start_write_trylock(inode->i_sb))
                return;

        if (mnt_get_write_access(mnt) == 0) {
                /*
                 * File systems can error out when updating inodes if they
                 * need to allocate new space to modify an inode (such is the
                 * case for Btrfs), but since we touch atime while walking
                 * down the path we really don't care if we failed to update
                 * the atime of the file, so just ignore the return value.
                 * We may also fail on filesystems that have the ability to
                 * make parts of the fs read only, e.g. subvolumes in Btrfs.
                 */
                inode_update_time(inode, S_ATIME);
                mnt_put_write_access(mnt);
        }

        sb_end_write(inode->i_sb);
}
EXPORT_SYMBOL(touch_atime);

/*
 * Work out which notify_change() attribute bits are needed to strip
 * privileges from @dentry in response to a write or truncate.
 *
 * Returns 0 if nothing has to be changed (always the case for IS_NOSEC()
 * inodes), a positive ATTR_* mask otherwise, or the negative value reported
 * by security_inode_need_killpriv() (the change should then be denied).
 */
int dentry_needs_remove_privs(struct mnt_idmap *idmap,
                              struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        int kill_mask;
        int rc;

        if (IS_NOSEC(inode))
                return 0;

        kill_mask = setattr_should_drop_suidgid(idmap, inode);

        rc = security_inode_need_killpriv(dentry);
        if (rc < 0)
                return rc;
        if (rc > 0)
                kill_mask |= ATTR_KILL_PRIV;

        return kill_mask;
}

static int __remove_privs(struct mnt_idmap *idmap,
                          struct dentry *dentry, int kill)
{
        struct iattr newattrs;

        newattrs.ia_valid = ATTR_FORCE | kill;
        /*
         * Note we call this on write, so notify_change will not
         * encounter any conflicting delegations:
         */
        return notify_change(idmap, dentry, &newattrs, NULL);
}

/*
 * Strip special privileges from @file if dentry_needs_remove_privs() says
 * any need stripping.  Nothing is done for IS_NOSEC() inodes or for
 * anything that is not a regular file.
 *
 * With IOCB_NOWAIT set in @flags, -EAGAIN is returned instead of performing
 * the removal.  When the function succeeds past the early exit,
 * inode_has_no_xattr() is called on the inode.
 *
 * Returns 0 on success or a negative errno.
 */
int file_remove_privs_flags(struct file *file, unsigned int flags)
{
        struct dentry *dentry = file_dentry(file);
        struct inode *inode = file_inode(file);
        int kill;

        if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
                return 0;

        kill = dentry_needs_remove_privs(file_mnt_idmap(file), dentry);
        if (kill < 0)
                return kill;

        if (kill > 0) {
                int error;

                if (flags & IOCB_NOWAIT)
                        return -EAGAIN;

                error = __remove_privs(file_mnt_idmap(file), dentry, kill);
                if (error)
                        return error;
        }

        inode_has_no_xattr(inode);
        return 0;
}
EXPORT_SYMBOL_GPL(file_remove_privs_flags);

/**
 * file_remove_privs - remove special file privileges (suid, capabilities)
 * @file: file to remove privileges from
 *
 * When file is modified by a write or truncation ensure that special
 * file privileges are removed.
 *
 * Equivalent to file_remove_privs_flags() with no flags set, so the
 * IOCB_NOWAIT early exit of that function never triggers here.
 *
 * Return: 0 on success, negative errno on failure.
 */
int file_remove_privs(struct file *file)
{
        return file_remove_privs_flags(file, 0);
}
EXPORT_SYMBOL(file_remove_privs);

static int inode_needs_update_time(struct inode *inode)
{
        int sync_it = 0;
        struct timespec64 now = current_time(inode);
        struct timespec64 ts;

        /* First try to exhaust all avenues to not sync */
        if (IS_NOCMTIME(inode))
                return 0;

        ts = inode_get_mtime(inode);
        if (!timespec64_equal(&ts, &now))
                sync_it = S_MTIME;

        ts = inode_get_ctime(inode);
        if (!timespec64_equal(&ts, &now))
                sync_it |= S_CTIME;

        if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
                sync_it |= S_VERSION;

        return sync_it;
}

/*
 * Apply the timestamp update described by @sync_mode (an S_* mask) to the
 * inode behind @file, bracketed by mnt_get_write_access_file() and
 * mnt_put_write_access_file().
 *
 * If write access cannot be obtained the update is skipped and 0 is
 * returned; otherwise the result of inode_update_time() is returned.
 */
static int __file_update_time(struct file *file, int sync_mode)
{
        struct inode *inode = file_inode(file);
        int ret;

        /* try to update time settings */
        if (mnt_get_write_access_file(file))
                return 0;

        ret = inode_update_time(inode, sync_mode);
        mnt_put_write_access_file(file);

        return ret;
}

/**
 * file_update_time - update mtime and ctime time
 * @file: file accessed
 *
 * Update the mtime and ctime members of an inode and mark the inode for
 * writeback. Note that this function is meant exclusively for usage in
 * the file write path of filesystems, and filesystems may choose to
 * explicitly ignore updates via this function with the _NOCMTIME inode
 * flag, e.g. for network filesystems where these timestamps are handled
 * by the server. This can return an error for file systems that need to
 * allocate space in order to update an inode.
 *
 * Return: 0 on success, negative errno on failure.
 */
int file_update_time(struct file *file)
{
        int sync_mode = inode_needs_update_time(file_inode(file));

        /* Nothing to update (or an error): hand the value straight back. */
        if (sync_mode <= 0)
                return sync_mode;

        return __file_update_time(file, sync_mode);
}
EXPORT_SYMBOL(file_update_time);

/**
 * file_modified_flags - handle mandated vfs changes when modifying a file
 * @file: file that was modified
 * @flags: kiocb flags
 *
 * When file has been modified ensure that special
 * file privileges are removed and time settings are updated.
 *
 * If IOCB_NOWAIT is set, neither step is carried out when it would be
 * needed: -EAGAIN is returned instead of removing privileges or updating
 * the time settings.
 *
 * Files opened with FMODE_NOCMTIME get their privileges handled but no
 * timestamp update.
 *
 * Context: Caller must hold the file's inode lock.
 *
 * Return: 0 on success, negative errno on failure.
 */
static int file_modified_flags(struct file *file, int flags)
{
        struct inode *inode = file_inode(file);
        int sync_mode;
        int err;

        /*
         * Clear the security bits if the process is not being run by root.
         * This keeps people from modifying setuid and setgid binaries.
         */
        err = file_remove_privs_flags(file, flags);
        if (err)
                return err;

        if (unlikely(file->f_mode & FMODE_NOCMTIME))
                return 0;

        sync_mode = inode_needs_update_time(inode);
        if (sync_mode <= 0)
                return sync_mode;

        /* An update is due but the caller asked us not to wait. */
        if (flags & IOCB_NOWAIT)
                return -EAGAIN;

        return __file_update_time(file, sync_mode);
}

/**
 * file_modified - handle mandated vfs changes when modifying a file
 * @file: file that was modified
 *
 * When file has been modified ensure that special
 * file privileges are removed and time settings are updated.
 *
 * Calls file_modified_flags() with no flags set, so the IOCB_NOWAIT
 * early exits of that function are never taken.
 *
 * Context: Caller must hold the file's inode lock.
 *
 * Return: 0 on success, negative errno on failure.
 */
int file_modified(struct file *file)
{
        return file_modified_flags(file, 0);
}
EXPORT_SYMBOL(file_modified);

/**
 * kiocb_modified - handle mandated vfs changes when modifying a file
 * @iocb: iocb describing the modification; its ki_filp is the file that was
 *        modified and its ki_flags are passed on
 *
 * When file has been modified ensure that special
 * file privileges are removed and time settings are updated.
 *
 * Because the iocb's ki_flags are forwarded to file_modified_flags(), an
 * iocb carrying IOCB_NOWAIT can get -EAGAIN back.
 *
 * Context: Caller must hold the file's inode lock.
 *
 * Return: 0 on success, negative errno on failure.
 */
int kiocb_modified(struct kiocb *iocb)
{
        return file_modified_flags(iocb->ki_filp, iocb->ki_flags);
}
EXPORT_SYMBOL_GPL(kiocb_modified);

/*
 * Returns 1 when @inode is IS_SYNC(), or when it is a directory that is
 * IS_DIRSYNC(); returns 0 otherwise.
 */
int inode_needs_sync(struct inode *inode)
{
        return IS_SYNC(inode) ||
               (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode));
}
EXPORT_SYMBOL(inode_needs_sync);

/*
 * If we try to find an inode in the inode hash while it is being
 * deleted, we have to wait until the filesystem completes its
 * deletion before reporting that it isn't found.  This function waits
 * until the deletion _might_ have completed.  Callers are responsible
 * to recheck inode state.
 *
 * It doesn't matter if I_NEW is not set initially, a call to
 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
 * will DTRT.
 *
 * Locking: releases both inode->i_lock and inode_hash_lock before
 * sleeping, and re-acquires only inode_hash_lock before returning.
 */
static void __wait_on_freeing_inode(struct inode *inode)
{
        wait_queue_head_t *wq;
        DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
        wq = bit_waitqueue(&inode->i_state, __I_NEW);
        /* Queue ourselves before dropping the locks, then sleep once. */
        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
        schedule();
        finish_wait(wq, &wait.wq_entry);
        /* i_lock is deliberately not retaken; only the hash lock is. */
        spin_lock(&inode_hash_lock);
}

/* Value of the "ihash_entries=" boot parameter; stays 0 when not given. */
static __initdata unsigned long ihash_entries;

/*
 * Parse the "ihash_entries=" boot parameter into ihash_entries.
 * Returns 0 when no string was supplied, 1 once the value has been stored.
 */
static int __init set_ihash_entries(char *str)
{
        if (str) {
                ihash_entries = simple_strtoul(str, &str, 0);
                return 1;
        }

        return 0;
}
__setup("ihash_entries=", set_ihash_entries);

/*
 * Early-boot allocation of the inode hash table.
 *
 * If hashes are distributed across NUMA nodes (hashdist set), nothing is
 * done here: the allocation is deferred until vmalloc space is available
 * and is carried out by inode_init() instead.
 */
void __init inode_init_early(void)
{
        if (!hashdist)
                inode_hashtable =
                        alloc_large_system_hash("Inode-cache",
                                                sizeof(struct hlist_head),
                                                ihash_entries, 14,
                                                HASH_EARLY | HASH_ZERO,
                                                &i_hash_shift, &i_hash_mask,
                                                0, 0);
}

/*
 * Create the "inode_cache" slab cache and, when hashdist is set, allocate
 * the inode hash table that inode_init_early() left for later.
 */
void __init inode_init(void)
{
        /* inode slab cache */
        inode_cachep = kmem_cache_create("inode_cache",
                                         sizeof(struct inode), 0,
                                         (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
                                         SLAB_ACCOUNT),
                                         init_once);

        /* Without hashdist the hash was already set up in inode_init_early */
        if (hashdist)
                inode_hashtable =
                        alloc_large_system_hash("Inode-cache",
                                                sizeof(struct hlist_head),
                                                ihash_entries, 14,
                                                HASH_ZERO,
                                                &i_hash_shift, &i_hash_mask,
                                                0, 0);
}

/*
 * Set up @inode as a special file of type @mode.
 *
 * i_mode is always set to @mode.  Character and block devices additionally
 * record @rdev and get def_chr_fops / def_blk_fops (the latter only with
 * CONFIG_BLOCK enabled); FIFOs get pipefifo_fops; sockets keep the file
 * operations they already have.  Any other mode is reported with a
 * KERN_DEBUG message.
 */
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
        inode->i_mode = mode;

        if (S_ISCHR(mode)) {
                inode->i_fop = &def_chr_fops;
                inode->i_rdev = rdev;
                return;
        }

        if (S_ISBLK(mode)) {
                if (IS_ENABLED(CONFIG_BLOCK))
                        inode->i_fop = &def_blk_fops;
                inode->i_rdev = rdev;
                return;
        }

        if (S_ISFIFO(mode)) {
                inode->i_fop = &pipefifo_fops;
                return;
        }

        /* leave it no_open_fops */
        if (S_ISSOCK(mode))
                return;

        printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
                          " inode %s:%lu\n", mode, inode->i_sb->s_id,
                          inode->i_ino);
}
EXPORT_SYMBOL(init_special_inode);

/**
 * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
 * @idmap: idmap of the mount the inode was created from
 * @inode: New inode
 * @dir: Directory inode (may be NULL)
 * @mode: mode of the new inode
 *
 * The uid always comes from inode_fsuid_set().  The gid is inherited from
 * @dir when @dir is setgid, and comes from inode_fsgid_set() otherwise.  A
 * new directory created inside a setgid @dir also gets S_ISGID added to its
 * mode.
 *
 * If the inode has been created through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions
 * and initializing i_uid and i_gid. On non-idmapped mounts or if permission
 * checking is to be performed on the raw inode simply pass @nop_mnt_idmap.
 */
void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode,
                      const struct inode *dir, umode_t mode)
{
        inode_fsuid_set(inode, idmap);

        if (!dir || !(dir->i_mode & S_ISGID)) {
                inode_fsgid_set(inode, idmap);
        } else {
                inode->i_gid = dir->i_gid;

                /* Directories are special, and always inherit S_ISGID */
                if (S_ISDIR(mode))
                        mode |= S_ISGID;
        }

        inode->i_mode = mode;
}
EXPORT_SYMBOL(inode_init_owner);

/**
 * inode_owner_or_capable - check current task permissions to inode
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode being checked
 *
 * Return true if current either owns the file (its fsuid equals the mapped
 * inode owner), or has CAP_FOWNER in its user namespace and the inode owner
 * uid has a mapping in that namespace.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
bool inode_owner_or_capable(struct mnt_idmap *idmap,
                            const struct inode *inode)
{
        vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode);
        struct user_namespace *ns;

        /* Owner of the file? */
        if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
                return true;

        /* Otherwise CAP_FOWNER is required, over a mapped owner. */
        ns = current_user_ns();
        return vfsuid_has_mapping(ns, vfsuid) && ns_capable(ns, CAP_FOWNER);
}
EXPORT_SYMBOL(inode_owner_or_capable);

/*
 * Direct i/o helper functions
 */

/*
 * Sleep, uninterruptibly, until inode->i_dio_count reads as zero.  The
 * task waits on the bit waitqueue associated with __I_DIO_WAKEUP in
 * inode->i_state.
 */
static void __inode_dio_wait(struct inode *inode)
{
        wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
        DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);

        do {
                /*
                 * Queue first, then re-check the count, so that the count is
                 * only examined once we are already on the waitqueue.
                 */
                prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
                if (atomic_read(&inode->i_dio_count))
                        schedule();
        } while (atomic_read(&inode->i_dio_count));
        finish_wait(wq, &q.wq_entry);
}

/**
 * inode_dio_wait - wait for outstanding DIO requests to finish
 * @inode: inode to wait for
 *
 * Waits for all pending direct I/O requests to finish so that we can
 * proceed with a truncate or equivalent operation.  Returns immediately
 * when i_dio_count is already zero.
 *
 * Must be called under a lock that serializes taking new references
 * to i_dio_count, usually by inode->i_mutex.
 */
void inode_dio_wait(struct inode *inode)
{
        /* Fast path: no direct I/O in flight. */
        if (!atomic_read(&inode->i_dio_count))
                return;

        __inode_dio_wait(inode);
}
EXPORT_SYMBOL(inode_dio_wait);

/*
 * inode_set_flags - atomically set some inode flags
 * @inode: inode whose i_flags are updated
 * @flags: new values for the bits selected by @mask
 * @mask:  which bits of i_flags may be changed
 *
 * Warns (once) if @flags contains a bit that is not in @mask.
 *
 * Note: the caller should be holding i_mutex, or else be sure that
 * they have exclusive access to the inode structure (i.e., while the
 * inode is being instantiated).  The reason for the cmpxchg() loop
 * --- which wouldn't be necessary if all code paths which modify
 * i_flags actually followed this rule, is that there is at least one
 * code path which doesn't today so we use cmpxchg() out of an abundance
 * of caution.
 * NOTE(review): no cmpxchg() is visible here; the loop is presumably
 * inside set_mask_bits() -- confirm against its definition.
 *
 * In the long run, i_mutex is overkill, and we should probably look
 * at using the i_lock spinlock to protect i_flags, and then make sure
 * it is so documented in include/linux/fs.h and that all code follows
 * the locking convention!!
 */
void inode_set_flags(struct inode *inode, unsigned int flags,
                     unsigned int mask)
{
        WARN_ON_ONCE(flags & ~mask);
        set_mask_bits(&inode->i_flags, mask, flags);
}
EXPORT_SYMBOL(inode_set_flags);

/*
 * Set the gfp mask of @inode's mapping to GFP_USER.
 * NOTE(review): going by the function name, the intent is to keep the
 * mapping's pages out of highmem -- confirm against the GFP_USER definition.
 */
void inode_nohighmem(struct inode *inode)
{
        mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
}
EXPORT_SYMBOL(inode_nohighmem);

/**
 * timestamp_truncate - Truncate timespec to a granularity
 * @t: Timespec
 * @inode: inode being updated
 *
 * Truncate a timespec to the granularity supported by the fs
 * containing the inode. Always rounds down. The seconds are first clamped
 * to the superblock's [s_time_min, s_time_max] range; a value that ends up
 * on either limit has its nanoseconds cleared.
 *
 * The granularity (sb->s_time_gran) must not be 0 nor greater than a second
 * (NSEC_PER_SEC, or 10^9 ns); otherwise a warning is emitted and the
 * nanoseconds are left as they are.
 *
 * Return: the truncated timespec.
 */
struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        unsigned int gran = sb->s_time_gran;

        t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max);
        if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min))
                t.tv_nsec = 0;

        /* Avoid division in the common cases 1 ns and 1 s. */
        if (gran == 1)
                return t;

        if (gran == NSEC_PER_SEC) {
                t.tv_nsec = 0;
                return t;
        }

        if (gran > 1 && gran < NSEC_PER_SEC) {
                t.tv_nsec -= t.tv_nsec % gran;
                return t;
        }

        WARN(1, "invalid file time granularity: %u", gran);
        return t;
}
EXPORT_SYMBOL(timestamp_truncate);

/**
 * current_time - Return FS time
 * @inode: inode.
 *
 * Return the current time truncated to the time granularity supported by
 * the fs.
 *
 * The time is read with ktime_get_coarse_real_ts64() and passed through
 * timestamp_truncate().
 *
 * Note that inode and inode->i_sb must not be NULL: no check is made here
 * and timestamp_truncate() dereferences both.
 */
struct timespec64 current_time(struct inode *inode)
{
        struct timespec64 now;

        ktime_get_coarse_real_ts64(&now);
        return timestamp_truncate(now, inode);
}
EXPORT_SYMBOL(current_time);

/**
 * inode_set_ctime_current - set the ctime to current_time
 * @inode: inode
 *
 * Set the inode's ctime to current_time() for the inode, i.e. the current
 * time truncated to the filesystem's granularity.
 *
 * Return: the timestamp that was assigned to the ctime.
 */
struct timespec64 inode_set_ctime_current(struct inode *inode)
{
        struct timespec64 now = current_time(inode);

        inode_set_ctime_to_ts(inode, now);
        return now;
}
EXPORT_SYMBOL(inode_set_ctime_current);

/**
 * in_group_or_capable - check whether caller is CAP_FSETID privileged
 * @idmap:        idmap of the mount @inode was found from
 * @inode:        inode to check
 * @vfsgid:        the new/current vfsgid of @inode
 *
 * Check whether @vfsgid is in the caller's group list or if the caller is
 * privileged with CAP_FSETID over @inode. This can be used to determine
 * whether the setgid bit can be kept or must be dropped.
 *
 * Return: true if the caller is sufficiently privileged, false if not.
 */
bool in_group_or_capable(struct mnt_idmap *idmap,
                         const struct inode *inode, vfsgid_t vfsgid)
{
        return vfsgid_in_group_p(vfsgid) ||
               capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID);
}

/**
 * mode_strip_sgid - handle the sgid bit for non-directories
 * @idmap: idmap of the mount the inode was created from
 * @dir: parent directory inode
 * @mode: mode of the file to be created in @dir
 *
 * The S_ISGID bit is stripped from @mode only when all of the following
 * hold: @mode has both S_ISGID and S_IXGRP raised, @mode does not describe
 * a directory, @dir is given and has S_ISGID raised, and the caller is
 * neither in the group of the parent directory nor privileged with
 * CAP_FSETID over it. In every other case @mode is returned unchanged.
 *
 * Return: the new mode to use for the file
 */
umode_t mode_strip_sgid(struct mnt_idmap *idmap,
                        const struct inode *dir, umode_t mode)
{
        /* Only a setgid *and* group-executable mode is of interest. */
        if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP))
                return mode;

        /* Directories keep the bit. */
        if (S_ISDIR(mode))
                return mode;

        /* Without a setgid parent directory there is nothing to strip. */
        if (!dir || !(dir->i_mode & S_ISGID))
                return mode;

        /* Sufficiently privileged callers may keep the bit. */
        if (in_group_or_capable(idmap, dir, i_gid_into_vfsgid(idmap, dir)))
                return mode;

        return mode & ~S_ISGID;
}
EXPORT_SYMBOL(mode_strip_sgid);

































    2 





































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Supervisor Mode Access Prevention support
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: H. Peter Anvin <hpa@linux.intel.com>
 */

#ifndef _ASM_X86_SMAP_H
#define _ASM_X86_SMAP_H

#include <asm/nops.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

/* "Raw" instruction opcodes */
#define __ASM_CLAC        ".byte 0x0f,0x01,0xca"
#define __ASM_STAC        ".byte 0x0f,0x01,0xcb"

#ifdef __ASSEMBLY__

/*
 * Assembly-source flavour (__ASSEMBLY__ defined): each macro expands to an
 * ALTERNATIVE whose default is empty and whose replacement, keyed on
 * X86_FEATURE_SMAP, is the raw CLAC or STAC opcode.
 */
#define ASM_CLAC \
        ALTERNATIVE "", __ASM_CLAC, X86_FEATURE_SMAP

#define ASM_STAC \
        ALTERNATIVE "", __ASM_STAC, X86_FEATURE_SMAP

#else /* __ASSEMBLY__ */

/*
 * Emit CLAC through alternative(): the default sequence is empty and the
 * CLAC opcode is the replacement keyed on X86_FEATURE_SMAP.
 */
static __always_inline void clac(void)
{
        /* Note: a barrier is implicit in alternative() */
        alternative("", __ASM_CLAC, X86_FEATURE_SMAP);
}

/*
 * Emit STAC through alternative(): the default sequence is empty and the
 * STAC opcode is the replacement keyed on X86_FEATURE_SMAP.
 */
static __always_inline void stac(void)
{
        /* Note: a barrier is implicit in alternative() */
        alternative("", __ASM_STAC, X86_FEATURE_SMAP);
}

/*
 * Save the flags register and then execute CLAC.
 *
 * The SMAP replacement sequence is "pushf; pop %0; CLAC": the flags are
 * pushed, popped into @flags, and CLAC follows.  The default sequence is
 * empty and writes nothing to %0, so the returned value is only meaningful
 * when the SMAP sequence is the one in effect.  It is intended to be handed
 * back to smap_restore(), which is keyed on the same feature.
 */
static __always_inline unsigned long smap_save(void)
{
        unsigned long flags;

        asm volatile ("# smap_save\n\t"
                      ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC "\n\t",
                                  X86_FEATURE_SMAP)
                      : "=rm" (flags) : : "memory", "cc");

        return flags;
}

/*
 * Counterpart of smap_save(): the SMAP replacement sequence pushes @flags
 * and pops it into the flags register ("push %0; popf").  The default
 * sequence is empty, so @flags is not used in that case.
 */
static __always_inline void smap_restore(unsigned long flags)
{
        asm volatile ("# smap_restore\n\t"
                      ALTERNATIVE("", "push %0; popf\n\t",
                                  X86_FEATURE_SMAP)
                      : : "g" (flags) : "memory", "cc");
}

/*
 * These macros can be used in asm() statements.  This is the C flavour:
 * each expands to an ALTERNATIVE() with an empty default and the raw CLAC
 * or STAC opcode as the X86_FEATURE_SMAP replacement.
 */
#define ASM_CLAC \
        ALTERNATIVE("", __ASM_CLAC, X86_FEATURE_SMAP)
#define ASM_STAC \
        ALTERNATIVE("", __ASM_STAC, X86_FEATURE_SMAP)

#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_SMAP_H */



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 


















    2 










    2 
    2 

    1 
    1 


    2 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/kernfs/file.c - kernfs file implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 */

#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/pagemap.h>
#include <linux/sched/mm.h>
#include <linux/fsnotify.h>
#include <linux/uio.h>

#include "kernfs-internal.h"

/*
 * Per-kernfs_node state shared by all currently open files of that node.
 * It is created by kernfs_get_open_node() on first open, published through
 * kn->attr.open and freed by kernfs_unlink_open_file() once the files list
 * becomes empty.
 */
struct kernfs_open_node {
        /* used by kfree_rcu() when the open node is released */
        struct rcu_head                rcu_head;
        /* starts at 1; compared with of->event in kernfs_generic_poll() */
        atomic_t                event;
        /* wait queue that kernfs_generic_poll() registers pollers on */
        wait_queue_head_t        poll;
        struct list_head        files; /* goes through kernfs_open_file.list */
        /* number of open files on the list that currently have of->mmapped set */
        unsigned int                nr_mmapped;
        /* open files of a KERNFS_HAS_RELEASE node still awaiting ->release() */
        unsigned int                nr_to_release;
};

/*
 * kernfs_notify() may be called from any context and bounces notifications
 * through a work item.  To minimize space overhead in kernfs_node, the
 * pending queue is implemented as a singly linked list of kernfs_nodes
 * chained through ->attr.notify_next.  The list is terminated with the
 * self pointer (the address of the list head itself) rather than %NULL so
 * that whether a kernfs_node is on the list or not can be determined by
 * testing its next pointer for %NULL.
 */
#define KERNFS_NOTIFY_EOL                        ((void *)&kernfs_notify_list)

/* taken (IRQ-disabling) by kernfs_notify_workfn() while popping the list */
static DEFINE_SPINLOCK(kernfs_notify_lock);
/* head of the pending list; equals KERNFS_NOTIFY_EOL when empty */
static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;

/*
 * Map @kn to one of the hashed open-file mutexes kept in kernfs_locks.  The
 * slot is chosen by hashing the node pointer down to NR_KERNFS_LOCK_BITS
 * bits, so the same node always resolves to the same mutex.
 */
static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn)
{
        int slot;

        slot = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
        return &kernfs_locks->open_file_mutex[slot];
}

/*
 * Acquire the hashed open-file mutex that covers @kn and hand it back so
 * that the caller can pass it to mutex_unlock() when done.
 */
static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn)
{
        struct mutex *mtx = kernfs_open_file_mutex_ptr(kn);

        mutex_lock(mtx);
        return mtx;
}

/**
 * of_on - look up the kernfs_open_node behind an open file
 * @of: target kernfs_open_file
 *
 * The RCU-protected pointer @of->kn->attr.open is read without an RCU
 * read-side section; the stated justification handed to
 * rcu_dereference_protected() is that @of is still linked on a files list.
 *
 * Return: the kernfs_open_node of @of's kernfs_node
 */
static struct kernfs_open_node *of_on(struct kernfs_open_file *of)
{
        struct kernfs_open_node *on;

        on = rcu_dereference_protected(of->kn->attr.open,
                                       !list_empty(&of->list));
        return on;
}

/**
 * kernfs_deref_open_node_locked - read @kn->attr.open under the open-file mutex
 * @kn: target kernfs_node
 *
 * ->attr.open is only updated while kernfs_open_file_mutex_ptr(@kn) is
 * held.  A caller that holds that mutex therefore excludes all updaters and
 * may dereference the pointer outside an RCU read-side critical section.
 * Lockdep is told about this requirement through lockdep_is_held().
 *
 * Return: @kn->attr.open (may be %NULL when no open node exists).
 */
static struct kernfs_open_node *
kernfs_deref_open_node_locked(struct kernfs_node *kn)
{
        struct kernfs_open_node *on;

        on = rcu_dereference_protected(kn->attr.open,
                        lockdep_is_held(kernfs_open_file_mutex_ptr(kn)));
        return on;
}

static struct kernfs_open_file *kernfs_of(struct file *file)
{
        return ((struct seq_file *)file->private_data)->private;
}

/*
 * Determine the kernfs_ops for the given kernfs_node.  This function must
 * be called while holding an active reference.
 */
static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn)
{
        if (kn->flags & KERNFS_LOCKDEP)
                lockdep_assert_held(kn);
        return kn->attr.ops;
}

/*
 * As kernfs_seq_stop() is also called after kernfs_seq_start() or
 * kernfs_seq_next() failure, it needs to distinguish whether it's stopping
 * a seq_file iteration which is fully initialized with an active reference
 * or an aborted kernfs_seq_start() due to get_active failure.  The
 * position pointer is the only context for each seq_file iteration and
 * thus the stop condition should be encoded in it.  As the return value is
 * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable
 * choice to indicate get_active failure.
 *
 * Unfortunately, this is complicated due to the optional custom seq_file
 * operations which may return ERR_PTR(-ENODEV) too.  kernfs_seq_stop()
 * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or
 * custom seq_file operations and thus can't decide whether put_active
 * should be performed or not only on ERR_PTR(-ENODEV).
 *
 * This is worked around by factoring out the custom seq_stop() and
 * put_active part into kernfs_seq_stop_active(), skipping it from
 * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after
 * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures
 * that kernfs_seq_stop_active() is skipped only after get_active failure.
 */
static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
{
        struct kernfs_open_file *of = sf->private;
        struct kernfs_node *kn = of->kn;
        const struct kernfs_ops *ops = kernfs_ops(kn);

        /* give the custom iterator a chance to tear down first */
        if (ops->seq_stop)
                ops->seq_stop(sf, v);

        /* drop the active reference taken in kernfs_seq_start() */
        kernfs_put_active(kn);
}

/*
 * seq_file ->start hook.  Takes @of->mutex and an active reference on the
 * node, then delegates to the custom ->seq_start() or to single_start().
 * @of->mutex is still held on return, including on the ERR_PTR(-ENODEV)
 * path; kernfs_seq_stop() is the one that unlocks it.
 */
static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
{
        struct kernfs_open_file *of = sf->private;
        const struct kernfs_ops *ops;
        void *pos;

        /*
         * @of->mutex nests outside the active ref and mainly guarantees
         * that the ops are never run concurrently for one open file.
         */
        mutex_lock(&of->mutex);
        if (!kernfs_get_active(of->kn))
                return ERR_PTR(-ENODEV);

        ops = kernfs_ops(of->kn);
        if (!ops->seq_start)
                return single_start(sf, ppos);

        pos = ops->seq_start(sf, ppos);
        /* see the comment above kernfs_seq_stop_active() */
        if (pos == ERR_PTR(-ENODEV))
                kernfs_seq_stop_active(sf, pos);

        return pos;
}

/*
 * seq_file ->next hook.  Delegates to the custom ->seq_next() when there is
 * one; otherwise the file is treated as a single record.
 */
static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
{
        struct kernfs_open_file *of = sf->private;
        const struct kernfs_ops *ops = kernfs_ops(of->kn);
        void *pos;

        if (!ops->seq_next) {
                /*
                 * Mirror single_open(): bump the position and terminate
                 * right after the initial read.
                 */
                (*ppos)++;
                return NULL;
        }

        pos = ops->seq_next(sf, v, ppos);
        /* see the comment above kernfs_seq_stop_active() */
        if (pos == ERR_PTR(-ENODEV))
                kernfs_seq_stop_active(sf, pos);

        return pos;
}

static void kernfs_seq_stop(struct seq_file *sf, void *v)
{
        struct kernfs_open_file *of = sf->private;

        if (v != ERR_PTR(-ENODEV))
                kernfs_seq_stop_active(sf, v);
        mutex_unlock(&of->mutex);
}

static int kernfs_seq_show(struct seq_file *sf, void *v)
{
        struct kernfs_open_file *of = sf->private;

        of->event = atomic_read(&of_on(of)->event);

        return of->kn->attr.ops->seq_show(sf, v);
}

/*
 * seq_file iterator used for nodes that implement ->seq_show(); it is
 * passed to seq_open() from kernfs_fop_open().
 */
static const struct seq_operations kernfs_seq_ops = {
        .start = kernfs_seq_start,
        .next = kernfs_seq_next,
        .stop = kernfs_seq_stop,
        .show = kernfs_seq_show,
};

/*
 * Reading a bin file can have side-effects, so the exact offset and byte
 * count given to read(2) must reach the ->read() callback unchanged.  That
 * rules out seq_file; bin files get this simple bounce-buffer read instead.
 *
 * At most PAGE_SIZE bytes are transferred per call.  The bounce buffer is
 * the preallocated one (serialized by @of->prealloc_mutex) when it exists,
 * otherwise a temporary allocation.
 *
 * Return: number of bytes copied to @iter, or -ENOMEM, -ENODEV, -EINVAL,
 * -EFAULT, or a negative value returned by ->read().
 */
static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
        ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE);
        const struct kernfs_ops *ops;
        char *buf = of->prealloc_buf;

        if (buf) {
                mutex_lock(&of->prealloc_mutex);
        } else {
                buf = kmalloc(len, GFP_KERNEL);
                if (!buf)
                        return -ENOMEM;
        }

        /*
         * @of->mutex nests outside the active ref and ensures that the ops
         * aren't called concurrently for the same open file.
         */
        mutex_lock(&of->mutex);
        if (kernfs_get_active(of->kn)) {
                /* remember which event generation this read observed */
                of->event = atomic_read(&of_on(of)->event);

                ops = kernfs_ops(of->kn);
                if (!ops->read)
                        len = -EINVAL;
                else
                        len = ops->read(of, buf, len, iocb->ki_pos);

                kernfs_put_active(of->kn);
        } else {
                len = -ENODEV;
        }
        mutex_unlock(&of->mutex);

        /* only touch the user iterator and file position on success */
        if (len >= 0) {
                if (copy_to_iter(buf, len, iter) != len)
                        len = -EFAULT;
                else
                        iocb->ki_pos += len;
        }

        if (buf == of->prealloc_buf)
                mutex_unlock(&of->prealloc_mutex);
        else
                kfree(buf);

        return len;
}

static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW)
                return seq_read_iter(iocb, iter);
        return kernfs_file_read_iter(iocb, iter);
}

/*
 * Copy data in from userland and hand it to the node's ->write() operation.
 *
 * There is no easy way to tell whether userspace is doing a partial write,
 * so partial writes are not supported: the whole buffer is expected in the
 * first write.  Hint: to change one value, read the file, modify just that
 * value, then write the entire buffer back.
 *
 * With a non-zero @of->atomic_write_len, longer writes fail with -E2BIG;
 * otherwise the write is clamped to PAGE_SIZE.  The data passed to
 * ->write() is always NUL-terminated.
 *
 * Return: whatever ->write() returns, or -E2BIG, -ENOMEM, -EFAULT, -ENODEV,
 * -EINVAL.
 */
static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
        ssize_t len = iov_iter_count(iter);
        const struct kernfs_ops *ops;
        char *buf;

        if (!of->atomic_write_len)
                len = min_t(size_t, len, PAGE_SIZE);
        else if (len > of->atomic_write_len)
                return -E2BIG;

        buf = of->prealloc_buf;
        if (buf) {
                mutex_lock(&of->prealloc_mutex);
        } else {
                /* one extra byte for the terminating NUL */
                buf = kmalloc(len + 1, GFP_KERNEL);
                if (!buf)
                        return -ENOMEM;
        }

        if (copy_from_iter(buf, len, iter) != len) {
                len = -EFAULT;
                goto out_free;
        }
        buf[len] = '\0';        /* guarantee string termination */

        /*
         * @of->mutex nests outside the active ref and ensures that the ops
         * aren't called concurrently for the same open file.
         */
        mutex_lock(&of->mutex);
        if (kernfs_get_active(of->kn)) {
                ops = kernfs_ops(of->kn);
                if (!ops->write)
                        len = -EINVAL;
                else
                        len = ops->write(of, buf, len, iocb->ki_pos);

                kernfs_put_active(of->kn);
        } else {
                len = -ENODEV;
        }
        mutex_unlock(&of->mutex);

        if (len > 0)
                iocb->ki_pos += len;

out_free:
        if (buf == of->prealloc_buf)
                mutex_unlock(&of->prealloc_mutex);
        else
                kfree(buf);

        return len;
}

/*
 * vm_ops ->open wrapper: forward to the wrapped vm_ops' ->open() while
 * holding an active reference.  Silently does nothing when there are no
 * wrapped vm_ops or the node can no longer be activated.
 */
static void kernfs_vma_open(struct vm_area_struct *vma)
{
        struct kernfs_open_file *of = kernfs_of(vma->vm_file);

        if (!of->vm_ops || !kernfs_get_active(of->kn))
                return;

        if (of->vm_ops->open)
                of->vm_ops->open(vma);

        kernfs_put_active(of->kn);
}

/*
 * vm_ops ->fault wrapper: forward to the wrapped vm_ops' ->fault() while
 * holding an active reference.  VM_FAULT_SIGBUS is returned when there are
 * no wrapped vm_ops, the node cannot be activated, or no ->fault() exists.
 */
static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
{
        struct kernfs_open_file *of = kernfs_of(vmf->vma->vm_file);
        vm_fault_t ret = VM_FAULT_SIGBUS;

        if (!of->vm_ops || !kernfs_get_active(of->kn))
                return ret;

        if (of->vm_ops->fault)
                ret = of->vm_ops->fault(vmf);

        kernfs_put_active(of->kn);

        return ret;
}

/*
 * vm_ops ->page_mkwrite wrapper.  With an active reference held, forward
 * to the wrapped ->page_mkwrite(); when the wrapped vm_ops have none, just
 * update the file times and report success (0).  VM_FAULT_SIGBUS is
 * returned when there are no wrapped vm_ops or the node cannot be
 * activated.
 */
static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
{
        struct file *file = vmf->vma->vm_file;
        struct kernfs_open_file *of = kernfs_of(file);
        vm_fault_t ret = 0;

        if (!of->vm_ops || !kernfs_get_active(of->kn))
                return VM_FAULT_SIGBUS;

        if (!of->vm_ops->page_mkwrite)
                file_update_time(file);
        else
                ret = of->vm_ops->page_mkwrite(vmf);

        kernfs_put_active(of->kn);

        return ret;
}

/*
 * vm_ops ->access wrapper: forward to the wrapped vm_ops' ->access() while
 * holding an active reference.  -EINVAL is returned when there are no
 * wrapped vm_ops, the node cannot be activated, or no ->access() exists.
 */
static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
                             void *buf, int len, int write)
{
        struct kernfs_open_file *of = kernfs_of(vma->vm_file);
        int ret = -EINVAL;

        if (!of->vm_ops || !kernfs_get_active(of->kn))
                return ret;

        if (of->vm_ops->access)
                ret = of->vm_ops->access(vma, addr, buf, len, write);

        kernfs_put_active(of->kn);

        return ret;
}

/*
 * vm_ops installed on every vma mapped through kernfs_fop_mmap().  Each
 * hook forwards to the vm_ops saved in of->vm_ops after taking an active
 * reference on the node.  There is deliberately no ->close entry:
 * kernfs_fop_mmap() rejects underlying vm_ops that implement close.
 */
static const struct vm_operations_struct kernfs_vm_ops = {
        .open                = kernfs_vma_open,
        .fault                = kernfs_vma_fault,
        .page_mkwrite        = kernfs_vma_page_mkwrite,
        .access                = kernfs_vma_access,
};

/*
 * ->mmap entry point.  Calls the node's ->mmap() under @of->mutex and an
 * active reference, then interposes kernfs_vm_ops on the vma, remembering
 * the vm_ops that ->mmap() installed in @of->vm_ops so the wrappers can
 * forward to them.
 *
 * Return: 0 on success; -ENODEV if the node has no mmap support or can no
 * longer be activated; the error from ->mmap(); -EINVAL if the file was
 * already mapped with different vm_ops or the vm_ops implement ->close().
 */
static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct kernfs_open_file *of = kernfs_of(file);
        const struct kernfs_ops *ops;
        int rc;

        /*
         * mmap path and of->mutex are prone to triggering spurious lockdep
         * warnings and we don't want to add spurious locking dependency
         * between the two.  Check whether mmap is actually implemented
         * without grabbing @of->mutex by testing HAS_MMAP flag.  See the
         * comment in kernfs_fop_open() for more details.
         */
        if (!(of->kn->flags & KERNFS_HAS_MMAP))
                return -ENODEV;

        mutex_lock(&of->mutex);

        rc = -ENODEV;
        if (!kernfs_get_active(of->kn))
                goto out_unlock;

        ops = kernfs_ops(of->kn);
        rc = ops->mmap(of, vma);
        if (rc)
                goto out_put;

        /*
         * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
         * to satisfy versions of X which crash if the mmap fails: that
         * substitutes a new vm_file, and we don't then want kernfs_vm_ops.
         * rc is still 0 here, so this path reports success.
         */
        if (vma->vm_file != file)
                goto out_put;

        /* a file that is already mapped must keep using the same vm_ops */
        rc = -EINVAL;
        if (of->mmapped && of->vm_ops != vma->vm_ops)
                goto out_put;

        /*
         * It is not possible to successfully wrap close.
         * So error if someone is trying to use close.
         */
        if (vma->vm_ops && vma->vm_ops->close)
                goto out_put;

        rc = 0;
        if (!of->mmapped) {
                /* first mapping of this open file: account it and save the vm_ops */
                of->mmapped = true;
                of_on(of)->nr_mmapped++;
                of->vm_ops = vma->vm_ops;
        }
        vma->vm_ops = &kernfs_vm_ops;
out_put:
        kernfs_put_active(of->kn);
out_unlock:
        mutex_unlock(&of->mutex);

        return rc;
}

/**
 *        kernfs_get_open_node - attach an open file to its kernfs_open_node
 *        @kn: target kernfs_node
 *        @of: kernfs_open_file for this instance of open
 *
 *        If @kn->attr.open does not exist yet, allocate and publish one.
 *        @of is then chained onto its files list and, for nodes flagged
 *        KERNFS_HAS_RELEASE, counted in nr_to_release.  All of this happens
 *        under the hashed open-file mutex of @kn.
 *
 *        Locking:
 *        Kernel thread context (may sleep).
 *
 *        Return:
 *        %0 on success, -ENOMEM if the open node cannot be allocated.
 */
static int kernfs_get_open_node(struct kernfs_node *kn,
                                struct kernfs_open_file *of)
{
        struct kernfs_open_node *on;
        struct mutex *mtx;
        int ret = 0;

        mtx = kernfs_open_file_mutex_lock(kn);

        on = kernfs_deref_open_node_locked(kn);
        if (!on) {
                /* first opener: set up a fresh open node and publish it */
                on = kzalloc(sizeof(*on), GFP_KERNEL);
                if (!on) {
                        ret = -ENOMEM;
                        goto out_unlock;
                }

                atomic_set(&on->event, 1);
                init_waitqueue_head(&on->poll);
                INIT_LIST_HEAD(&on->files);
                rcu_assign_pointer(kn->attr.open, on);
        }

        list_add_tail(&of->list, &on->files);
        if (kn->flags & KERNFS_HAS_RELEASE)
                on->nr_to_release++;

out_unlock:
        mutex_unlock(mtx);
        return ret;
}

/**
 *        kernfs_unlink_open_file - detach @of from @kn's open node
 *
 *        @kn: target kernfs_node
 *        @of: associated kernfs_open_file; when %NULL only the
 *             empty-list check below is performed
 *        @open_failed: ->open() failed, cancel ->release()
 *
 *        Remove @of from the list of @kn's open files and fix up the
 *        nr_to_release / nr_mmapped counters.  When the list ends up
 *        empty, clear @kn->attr.open and free the kernfs_open_node after
 *        an RCU grace period.
 *
 *        LOCKING:
 *        None on entry; takes the hashed open-file mutex of @kn.
 */
static void kernfs_unlink_open_file(struct kernfs_node *kn,
                                    struct kernfs_open_file *of,
                                    bool open_failed)
{
        struct kernfs_open_node *on;
        struct mutex *mtx;

        mtx = kernfs_open_file_mutex_lock(kn);

        on = kernfs_deref_open_node_locked(kn);
        if (!on)
                goto out_unlock;

        if (of) {
                if (kn->flags & KERNFS_HAS_RELEASE) {
                        /* released iff the open actually succeeded */
                        WARN_ON_ONCE(of->released == open_failed);
                        if (open_failed)
                                on->nr_to_release--;
                }

                if (of->mmapped)
                        on->nr_mmapped--;

                list_del(&of->list);
        }

        if (list_empty(&on->files)) {
                rcu_assign_pointer(kn->attr.open, NULL);
                kfree_rcu(on, rcu_head);
        }

out_unlock:
        mutex_unlock(mtx);
}

/*
 * ->open entry point for kernfs attribute files.
 *
 * Holds an active reference on the node for the duration of the call and:
 *  - optionally applies the extra permission check selected by
 *    KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK (-EACCES on failure),
 *  - allocates and initializes the kernfs_open_file (-ENOMEM),
 *  - rejects ops that combine ->prealloc with ->seq_show (-EINVAL) and
 *    allocates the prealloc buffer when requested,
 *  - instantiates the seq_file that carries the private data,
 *  - links the open file into the node's kernfs_open_node,
 *  - calls the node's ->open() if there is one.
 *
 * Return: 0 on success, -ENODEV if the node cannot be activated, otherwise
 * the negative errno of the step that failed.  Everything set up so far is
 * unwound on failure.
 */
static int kernfs_fop_open(struct inode *inode, struct file *file)
{
        struct kernfs_node *kn = inode->i_private;
        struct kernfs_root *root = kernfs_root(kn);
        const struct kernfs_ops *ops;
        struct kernfs_open_file *of;
        bool has_read, has_write, has_mmap;
        int error = -EACCES;

        if (!kernfs_get_active(kn))
                return -ENODEV;

        ops = kernfs_ops(kn);

        /* an mmap-capable node counts as both readable and writable */
        has_read = ops->seq_show || ops->read || ops->mmap;
        has_write = ops->write || ops->mmap;
        has_mmap = ops->mmap;

        /* see the flag definition for details */
        if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) {
                if ((file->f_mode & FMODE_WRITE) &&
                    (!(inode->i_mode & S_IWUGO) || !has_write))
                        goto err_out;

                if ((file->f_mode & FMODE_READ) &&
                    (!(inode->i_mode & S_IRUGO) || !has_read))
                        goto err_out;
        }

        /* allocate a kernfs_open_file for the file */
        error = -ENOMEM;
        of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL);
        if (!of)
                goto err_out;

        /*
         * The following is done to give a different lockdep key to
         * @of->mutex for files which implement mmap.  This is a rather
         * crude way to avoid false positive lockdep warning around
         * mm->mmap_lock - mmap nests @of->mutex under mm->mmap_lock and
         * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
         * which mm->mmap_lock nests, while holding @of->mutex.  As each
         * open file has a separate mutex, it's okay as long as those don't
         * happen on the same file.  At this point, we can't easily give
         * each file a separate locking class.  Let's differentiate on
         * whether the file has mmap or not for now.
         *
         * For similar reasons, writable and readonly files are given different
         * lockdep key, because the writable file /sys/power/resume may call vfs
         * lookup helpers for arbitrary paths and readonly files can be read by
         * overlayfs from vfs helpers when sysfs is a lower layer of overlayfs.
         *
         * All three cases look the same.  They're supposed to
         * look that way and give @of->mutex different static lockdep keys.
         * Do not merge the three mutex_init() call sites below.
         */
        if (has_mmap)
                mutex_init(&of->mutex);
        else if (file->f_mode & FMODE_WRITE)
                mutex_init(&of->mutex);
        else
                mutex_init(&of->mutex);

        of->kn = kn;
        of->file = file;

        /*
         * Write path needs to access atomic_write_len outside active
         * reference.  Cache it in open_file.  See kernfs_fop_write_iter()
         * for details.
         */
        of->atomic_write_len = ops->atomic_write_len;

        error = -EINVAL;
        /*
         * ->seq_show is incompatible with ->prealloc,
         * as seq_read does its own allocation.
         * ->read must be used instead.
         */
        if (ops->prealloc && ops->seq_show)
                goto err_free;
        if (ops->prealloc) {
                /* sized for the largest write plus the terminating NUL */
                int len = of->atomic_write_len ?: PAGE_SIZE;
                of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL);
                error = -ENOMEM;
                if (!of->prealloc_buf)
                        goto err_free;
                mutex_init(&of->prealloc_mutex);
        }

        /*
         * Always instantiate seq_file even if read access doesn't use
         * seq_file or is not requested.  This unifies private data access
         * and readable regular files are the vast majority anyway.
         */
        if (ops->seq_show)
                error = seq_open(file, &kernfs_seq_ops);
        else
                error = seq_open(file, NULL);
        if (error)
                goto err_free;

        of->seq_file = file->private_data;
        of->seq_file->private = of;

        /* seq_file clears PWRITE unconditionally, restore it if WRITE */
        if (file->f_mode & FMODE_WRITE)
                file->f_mode |= FMODE_PWRITE;

        /* make sure we have open node struct */
        error = kernfs_get_open_node(kn, of);
        if (error)
                goto err_seq_release;

        if (ops->open) {
                /* nobody has access to @of yet, skip @of->mutex */
                error = ops->open(of);
                if (error)
                        goto err_put_node;
        }

        /* open succeeded, put active references */
        kernfs_put_active(kn);
        return 0;

        /* error unwinding, in reverse order of setup */
err_put_node:
        kernfs_unlink_open_file(kn, of, true);
err_seq_release:
        seq_release(inode, file);
err_free:
        kfree(of->prealloc_buf);
        kfree(of);
err_out:
        kernfs_put_active(kn);
        return error;
}

/*
 * Used from the release and drain paths to make sure that ->release() is
 * called exactly once per open file.  The caller must hold
 * kernfs_open_file_mutex_ptr(@kn).
 */
static void kernfs_release_file(struct kernfs_node *kn,
                                struct kernfs_open_file *of)
{
        /*
         * No other file operation can be in flight on @of, so all that is
         * needed is mutual exclusion between the release and drain paths,
         * which @kernfs_open_file_mutex_ptr(kn) provides.  @of->mutex is
         * unsuitable because the drain path may run from places where
         * taking it would create a circular dependency.
         */
        lockdep_assert_held(kernfs_open_file_mutex_ptr(kn));

        if (of->released)
                return;

        /*
         * A file is never detached without being released, and files that
         * are deactivated and being drained must be releasable as well.
         * Hence ->attr.ops is used directly instead of kernfs_ops().
         */
        kn->attr.ops->release(of);
        of->released = true;
        of_on(of)->nr_to_release--;
}

/*
 * ->release entry point.  Runs the node's ->release() (if the node has one
 * and it has not been run yet), unlinks the open file from the node, and
 * frees the seq_file, the prealloc buffer and the kernfs_open_file.
 * Always returns 0.
 */
static int kernfs_fop_release(struct inode *inode, struct file *filp)
{
        struct kernfs_open_file *of = kernfs_of(filp);
        struct kernfs_node *kn = inode->i_private;
        struct mutex *mtx;

        if (kn->flags & KERNFS_HAS_RELEASE) {
                mtx = kernfs_open_file_mutex_lock(kn);
                kernfs_release_file(kn, of);
                mutex_unlock(mtx);
        }

        kernfs_unlink_open_file(kn, of, false);

        seq_release(inode, filp);
        kfree(of->prealloc_buf);
        kfree(of);

        return 0;
}

bool kernfs_should_drain_open_files(struct kernfs_node *kn)
{
        struct kernfs_open_node *on;
        bool ret;

        /*
         * @kn being deactivated guarantees that @kn->attr.open can't change
         * beneath us making the lockless test below safe.
         */
        WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);

        rcu_read_lock();
        on = rcu_dereference(kn->attr.open);
        ret = on && (on->nr_mmapped || on->nr_to_release);
        rcu_read_unlock();

        return ret;
}

/*
 * Drain every open file of @kn: tear down existing mappings and, for nodes
 * flagged KERNFS_HAS_RELEASE, run ->release() on files that have not been
 * released yet.  Runs under the hashed open-file mutex of @kn.
 */
void kernfs_drain_open_files(struct kernfs_node *kn)
{
        struct kernfs_open_node *on;
        struct kernfs_open_file *of;
        struct mutex *mtx;

        mtx = kernfs_open_file_mutex_lock(kn);

        on = kernfs_deref_open_node_locked(kn);
        if (!on)
                goto out_unlock;

        list_for_each_entry(of, &on->files, list) {
                if (of->mmapped) {
                        struct inode *inode = file_inode(of->file);

                        unmap_mapping_range(inode->i_mapping, 0, 0, 1);
                        of->mmapped = false;
                        on->nr_mmapped--;
                }

                if (kn->flags & KERNFS_HAS_RELEASE)
                        kernfs_release_file(kn, of);
        }

        /* after a full pass nothing may be left mapped or unreleased */
        WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release);

out_unlock:
        mutex_unlock(mtx);
}

/*
 * Kernfs attribute files are pollable.  The idea is that you read
 * the content and then you use 'poll' or 'select' to wait for
 * the content to change.  When the content changes (assuming the
 * manager for the kobject supports notification), poll will
 * return EPOLLERR|EPOLLPRI, and select will return the fd whether
 * it is waiting for read, write, or exceptions.
 * Once poll/select indicates that the value has changed, you
 * need to close and re-open the file, or seek to 0 and read again.
 * Reminder: this only works for attributes which actively support
 * it, and it is not possible to test an attribute from userspace
 * to see if it supports poll (Neither 'poll' nor 'select' return
 * an appropriate error code).  When in doubt, set a suitable timeout value.
 */
__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
{
        struct kernfs_open_node *on = of_on(of);
        __poll_t mask = DEFAULT_POLLMASK;

        poll_wait(of->file, &on->poll, wait);

        /* the event counter moved on since this file last sampled it */
        if (of->event != atomic_read(&on->event))
                mask |= EPOLLERR | EPOLLPRI;

        return mask;
}

/*
 * ->poll entry point.  With an active reference held, use the node's own
 * ->poll() when it has one and kernfs_generic_poll() otherwise.  A node
 * that can no longer be activated is reported with
 * DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI.
 */
static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
{
        struct kernfs_open_file *of = kernfs_of(filp);
        struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
        const struct kernfs_ops *ops;
        __poll_t mask;

        if (!kernfs_get_active(kn))
                return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;

        ops = kn->attr.ops;
        if (!ops->poll)
                mask = kernfs_generic_poll(of, wait);
        else
                mask = ops->poll(of, wait);

        kernfs_put_active(kn);

        return mask;
}

/*
 * ->llseek() file operation for kernfs files.
 *
 * Uses the node's ->llseek() op when provided and generic_file_llseek()
 * otherwise.  Returns the value produced by that callee, or -ENODEV when
 * no active reference on the node can be taken.
 */
static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
{
        struct kernfs_open_file *of = kernfs_of(file);
        const struct kernfs_ops *ops;
        loff_t pos = -ENODEV;

        /*
         * @of->mutex nests outside active ref and is primarily to ensure that
         * the ops aren't called concurrently for the same open file.
         */
        mutex_lock(&of->mutex);
        if (!kernfs_get_active(of->kn))
                goto out_unlock;

        ops = kernfs_ops(of->kn);
        pos = ops->llseek ? ops->llseek(of, offset, whence)
                          : generic_file_llseek(file, offset, whence);

        kernfs_put_active(of->kn);
out_unlock:
        mutex_unlock(&of->mutex);
        return pos;
}

/*
 * Work function that emits fsnotify events for nodes queued by
 * kernfs_notify().
 *
 * Nodes are popped from kernfs_notify_list one at a time until the
 * KERNFS_NOTIFY_EOL terminator is reached.  For each node, every super
 * block on the root's ->supers list is visited and an FS_MODIFY event is
 * generated for the matching inode.  The node reference taken when the
 * node was queued is dropped once it has been processed.
 */
static void kernfs_notify_workfn(struct work_struct *work)
{
        for (;;) {
                struct kernfs_super_info *sb_info;
                struct kernfs_root *root;
                struct kernfs_node *kn;

                /* pop one off the notify_list */
                spin_lock_irq(&kernfs_notify_lock);
                kn = kernfs_notify_list;
                if (kn == KERNFS_NOTIFY_EOL) {
                        spin_unlock_irq(&kernfs_notify_lock);
                        break;
                }
                kernfs_notify_list = kn->attr.notify_next;
                kn->attr.notify_next = NULL;
                spin_unlock_irq(&kernfs_notify_lock);

                root = kernfs_root(kn);

                /* kick fsnotify */
                down_read(&root->kernfs_supers_rwsem);
                list_for_each_entry(sb_info, &root->supers, node) {
                        struct inode *parent_inode = NULL;
                        struct kernfs_node *parent;
                        struct inode *inode;
                        struct qstr name;

                        /*
                         * We want fsnotify_modify() on @kn but as the
                         * modifications aren't originating from userland don't
                         * have the matching @file available.  Look up the inodes
                         * and generate the events manually.
                         */
                        inode = ilookup(sb_info->sb, kernfs_ino(kn));
                        if (!inode)
                                continue;

                        name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name));
                        parent = kernfs_get_parent(kn);
                        if (parent) {
                                parent_inode = ilookup(sb_info->sb,
                                                       kernfs_ino(parent));
                                if (parent_inode) {
                                        fsnotify(FS_MODIFY | FS_EVENT_ON_CHILD,
                                                 inode, FSNOTIFY_EVENT_INODE,
                                                 parent_inode, &name, inode, 0);
                                        iput(parent_inode);
                                }

                                kernfs_put(parent);
                        }

                        /* no parent inode was found: notify on the inode alone */
                        if (!parent_inode)
                                fsnotify_inode(inode, FS_MODIFY);

                        iput(inode);
                }
                up_read(&root->kernfs_supers_rwsem);

                kernfs_put(kn);
        }
}

/**
 * kernfs_notify - notify a kernfs file
 * @kn: file to notify
 *
 * Notify @kn such that poll(2) on @kn wakes up.  May be called from any
 * context.
 *
 * The open node's event counter is bumped and its poll waiters are woken
 * right away; the fsnotify side is deferred by queueing @kn on
 * kernfs_notify_list and scheduling kernfs_notify_workfn().
 */
void kernfs_notify(struct kernfs_node *kn)
{
        static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn);
        struct kernfs_open_node *open_node;
        unsigned long irqflags;

        if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
                return;

        /* kick poll immediately */
        rcu_read_lock();
        open_node = rcu_dereference(kn->attr.open);
        if (open_node) {
                atomic_inc(&open_node->event);
                wake_up_interruptible(&open_node->poll);
        }
        rcu_read_unlock();

        /* schedule work to kick fsnotify */
        spin_lock_irqsave(&kernfs_notify_lock, irqflags);
        /* a non-NULL notify_next means @kn is already on the list */
        if (kn->attr.notify_next)
                goto out_unlock;

        kernfs_get(kn);
        kn->attr.notify_next = kernfs_notify_list;
        kernfs_notify_list = kn;
        schedule_work(&kernfs_notify_work);
out_unlock:
        spin_unlock_irqrestore(&kernfs_notify_lock, irqflags);
}
EXPORT_SYMBOL_GPL(kernfs_notify);

/*
 * file_operations table for kernfs files.  Read, write, seek, mmap, open,
 * release and poll go through the kernfs_fop_*() handlers; fsync is wired
 * to noop_fsync and splice to copy_splice_read / iter_file_splice_write.
 */
const struct file_operations kernfs_file_fops = {
        .read_iter        = kernfs_fop_read_iter,
        .write_iter        = kernfs_fop_write_iter,
        .llseek                = kernfs_fop_llseek,
        .mmap                = kernfs_fop_mmap,
        .open                = kernfs_fop_open,
        .release        = kernfs_fop_release,
        .poll                = kernfs_fop_poll,
        .fsync                = noop_fsync,
        .splice_read        = copy_splice_read,
        .splice_write        = iter_file_splice_write,
};

/**
 * __kernfs_create_file - kernfs internal function to create a file
 * @parent: directory to create the file in
 * @name: name of the file
 * @mode: mode of the file
 * @uid: uid of the file
 * @gid: gid of the file
 * @size: size of the file
 * @ops: kernfs operations for the file
 * @priv: private data for the file
 * @ns: optional namespace tag of the file
 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
 *
 * Return: the created node on success, ERR_PTR() value on error.
 */
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
                                         const char *name,
                                         umode_t mode, kuid_t uid, kgid_t gid,
                                         loff_t size,
                                         const struct kernfs_ops *ops,
                                         void *priv, const void *ns,
                                         struct lock_class_key *key)
{
        struct kernfs_node *kn;
        int rc;

        /* only the permission bits of @mode are kept; the type is S_IFREG */
        kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG,
                             uid, gid, KERNFS_FILE);
        if (!kn)
                return ERR_PTR(-ENOMEM);

        kn->attr.ops = ops;
        kn->attr.size = size;
        kn->ns = ns;
        kn->priv = priv;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* a NULL @key leaves lockdep tracking of the active ref disabled */
        if (key) {
                lockdep_init_map(&kn->dep_map, "kn->active", key, 0);
                kn->flags |= KERNFS_LOCKDEP;
        }
#endif

        /*
         * kn->attr.ops is accessible only while holding active ref.  We
         * need to know whether some ops are implemented outside active
         * ref.  Cache their existence in flags.
         */
        if (ops->seq_show)
                kn->flags |= KERNFS_HAS_SEQ_SHOW;
        if (ops->mmap)
                kn->flags |= KERNFS_HAS_MMAP;
        if (ops->release)
                kn->flags |= KERNFS_HAS_RELEASE;

        rc = kernfs_add_one(kn);
        if (!rc)
                return kn;

        /* adding failed: drop the reference on the new node and report why */
        kernfs_put(kn);
        return ERR_PTR(rc);
}









































































































































    1 






















    1 















    1 












    1 
















    1 

























































    1 







    1 













    1 















    1 
















    1 














    1 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


































    1 

























    1 
















    1 




    1 



























































































































































































































































































































































































    1 
    1 












































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        TCP over IPv6
 *        Linux INET6 implementation
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 *
 *        Based on:
 *        linux/net/ipv4/tcp.c
 *        linux/net/ipv4/tcp_input.c
 *        linux/net/ipv4/tcp_output.c
 *
 *        Fixes:
 *        Hideaki YOSHIFUJI        :        sin6_scope_id support
 *        YOSHIFUJI Hideaki @USAGI and:        Support IPV6_V6ONLY socket option, which
 *        Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
 *                                        a single port at the same time.
 *        YOSHIFUJI Hideaki @USAGI:        convert /proc/net/tcp6 to seq_file.
 */

#include <linux/bottom_half.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/jiffies.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/init.h>
#include <linux/jhash.h>
#include <linux/ipsec.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
#include <linux/random.h>
#include <linux/indirect_call_wrapper.h>

#include <net/tcp.h>
#include <net/ndisc.h>
#include <net/inet6_hashtables.h>
#include <net/inet6_connection_sock.h>
#include <net/ipv6.h>
#include <net/transp_v6.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>
#include <net/ip6_checksum.h>
#include <net/inet_ecn.h>
#include <net/protocol.h>
#include <net/xfrm.h>
#include <net/snmp.h>
#include <net/dsfield.h>
#include <net/timewait_sock.h>
#include <net/inet_common.h>
#include <net/secure_seq.h>
#include <net/hotdata.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

/*
 * Forward declarations for functions and ops tables that are referenced
 * (for instance by tcp_v6_connect()) ahead of their definitions.
 */
static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
                              enum sk_rst_reason reason);
static void        tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
                                      struct request_sock *req);

INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);

static const struct inet_connection_sock_af_ops ipv6_mapped;
const struct inet_connection_sock_af_ops ipv6_specific;
/* The tcp_sock_af_ops tables only exist when TCP-MD5 or TCP-AO is built in. */
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
#endif

/* Helper returning the inet6 address from a given tcp socket.
 * It can be used in the TCP stack instead of inet6_sk(sk).
 * This avoids a dereference and allows compiler optimizations.
 * It is a specialized version of inet6_sk_generic().
 *
 * The result is the address of the inet6 member of the struct tcp6_sock
 * that contains tcp_sk(sk).
 */
#define tcp_inet6_sk(sk) (&container_of_const(tcp_sk(sk), \
                                              struct tcp6_sock, tcp)->inet6)

/*
 * Remember the dst attached to @skb as the receive dst of @sk.
 *
 * Nothing is stored when @skb carries no dst or when dst_hold_safe()
 * fails.  Otherwise the dst, the skb's incoming interface index and the
 * route cookie are recorded on the socket.
 */
static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        if (!dst || !dst_hold_safe(dst))
                return;

        rcu_assign_pointer(sk->sk_rx_dst, dst);
        sk->sk_rx_dst_ifindex = skb->skb_iif;
        sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst));
}

static u32 tcp_v6_init_seq(const struct sk_buff *skb)
{
        return secure_tcpv6_seq(ipv6_hdr(skb)->daddr.s6_addr32,
                                ipv6_hdr(skb)->saddr.s6_addr32,
                                tcp_hdr(skb)->dest,
                                tcp_hdr(skb)->source);
}

static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
        return secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32,
                                   ipv6_hdr(skb)->saddr.s6_addr32);
}

/*
 * Pre-connect hook: runs the cgroup BPF INET6 connect program on @uaddr.
 *
 * Returns -EINVAL when @addr_len is shorter than SIN6_LEN_RFC2133,
 * otherwise the result of BPF_CGROUP_RUN_PROG_INET6_CONNECT().
 */
static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
                              int addr_len)
{
        /* This check is replicated from tcp_v6_connect() and intended to
         * prevent BPF program called below from accessing bytes that are out
         * of the bound specified by user in addr_len.
         */
        if (addr_len >= SIN6_LEN_RFC2133) {
                sock_owned_by_me(sk);

                return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len);
        }

        return -EINVAL;
}

static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
                          int addr_len)
{
        struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct in6_addr *saddr = NULL, *final_p, final;
        struct inet_timewait_death_row *tcp_death_row;
        struct ipv6_pinfo *np = tcp_inet6_sk(sk);
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        struct ipv6_txoptions *opt;
        struct dst_entry *dst;
        struct flowi6 fl6;
        int addr_type;
        int err;

        if (addr_len < SIN6_LEN_RFC2133)
                return -EINVAL;

        if (usin->sin6_family != AF_INET6)
                return -EAFNOSUPPORT;

        memset(&fl6, 0, sizeof(fl6));

        if (inet6_test_bit(SNDFLOW, sk)) {
                fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
                IP6_ECN_flow_init(fl6.flowlabel);
                if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
                        struct ip6_flowlabel *flowlabel;
                        flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
                        if (IS_ERR(flowlabel))
                                return -EINVAL;
                        fl6_sock_release(flowlabel);
                }
        }

        /*
         *        connect() to INADDR_ANY means loopback (BSD'ism).
         */

        if (ipv6_addr_any(&usin->sin6_addr)) {
                if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
                        ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK),
                                               &usin->sin6_addr);
                else
                        usin->sin6_addr = in6addr_loopback;
        }

        addr_type = ipv6_addr_type(&usin->sin6_addr);

        if (addr_type & IPV6_ADDR_MULTICAST)
                return -ENETUNREACH;

        if (addr_type&IPV6_ADDR_LINKLOCAL) {
                if (addr_len >= sizeof(struct sockaddr_in6) &&
                    usin->sin6_scope_id) {
                        /* If interface is set while binding, indices
                         * must coincide.
                         */
                        if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id))
                                return -EINVAL;

                        sk->sk_bound_dev_if = usin->sin6_scope_id;
                }

                /* Connect to link-local address requires an interface */
                if (!sk->sk_bound_dev_if)
                        return -EINVAL;
        }

        if (tp->rx_opt.ts_recent_stamp &&
            !ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) {
                tp->rx_opt.ts_recent = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                WRITE_ONCE(tp->write_seq, 0);
        }

        sk->sk_v6_daddr = usin->sin6_addr;
        np->flow_label = fl6.flowlabel;

        /*
         *        TCP over IPv4
         */

        if (addr_type & IPV6_ADDR_MAPPED) {
                u32 exthdrlen = icsk->icsk_ext_hdr_len;
                struct sockaddr_in sin;

                if (ipv6_only_sock(sk))
                        return -ENETUNREACH;

                sin.sin_family = AF_INET;
                sin.sin_port = usin->sin6_port;
                sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];

                /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */
                WRITE_ONCE(icsk->icsk_af_ops, &ipv6_mapped);
                if (sk_is_mptcp(sk))
                        mptcpv6_handle_mapped(sk, true);
                sk->sk_backlog_rcv = tcp_v4_do_rcv;
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
                tp->af_specific = &tcp_sock_ipv6_mapped_specific;
#endif

                err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));

                if (err) {
                        icsk->icsk_ext_hdr_len = exthdrlen;
                        /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */
                        WRITE_ONCE(icsk->icsk_af_ops, &ipv6_specific);
                        if (sk_is_mptcp(sk))
                                mptcpv6_handle_mapped(sk, false);
                        sk->sk_backlog_rcv = tcp_v6_do_rcv;
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
                        tp->af_specific = &tcp_sock_ipv6_specific;
#endif
                        goto failure;
                }
                np->saddr = sk->sk_v6_rcv_saddr;

                return err;
        }

        if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr))
                saddr = &sk->sk_v6_rcv_saddr;

        fl6.flowi6_proto = IPPROTO_TCP;
        fl6.daddr = sk->sk_v6_daddr;
        fl6.saddr = saddr ? *saddr : np->saddr;
        fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label);
        fl6.flowi6_oif = sk->sk_bound_dev_if;
        fl6.flowi6_mark = sk->sk_mark;
        fl6.fl6_dport = usin->sin6_port;
        fl6.fl6_sport = inet->inet_sport;
        fl6.flowi6_uid = sk->sk_uid;

        opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
        final_p = fl6_update_dst(&fl6, opt, &final);

        security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));

        dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
        if (IS_ERR(dst)) {
                err = PTR_ERR(dst);
                goto failure;
        }

        tp->tcp_usec_ts = dst_tcp_usec_ts(dst);
        tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

        if (!saddr) {
                saddr = &fl6.saddr;

                err = inet_bhash2_update_saddr(sk, saddr, AF_INET6);
                if (err)
                        goto failure;
        }

        /* set the source address */
        np->saddr = *saddr;
        inet->inet_rcv_saddr = LOOPBACK4_IPV6;

        sk->sk_gso_type = SKB_GSO_TCPV6;
        ip6_dst_store(sk, dst, NULL, NULL);

        icsk->icsk_ext_hdr_len = 0;
        if (opt)
                icsk->icsk_ext_hdr_len = opt->opt_flen +
                                         opt->opt_nflen;

        tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);

        inet->inet_dport = usin->sin6_port;

        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet6_hash_connect(tcp_death_row, sk);
        if (err)
                goto late_failure;

        sk_set_txhash(sk);

        if (likely(!tp->repair)) {
                if (!tp->write_seq)
                        WRITE_ONCE(tp->write_seq,
                                   secure_tcpv6_seq(np->saddr.s6_addr32,
                                                    sk->sk_v6_daddr.s6_addr32,
                                                    inet->inet_sport,
                                                    inet->inet_dport));
                tp->tsoffset = secure_tcpv6_ts_off(net, np->saddr.s6_addr32,
                                                   sk->sk_v6_daddr.s6_addr32);
        }

        if (tcp_fastopen_defer_connect(sk, &err))
                return err;
        if (err)
                goto late_failure;

        err = tcp_connect(sk);
        if (err)
                goto late_failure;

        return 0;

late_failure:
        tcp_set_state(sk, TCP_CLOSE);
        inet_bhash2_reset_saddr(sk);
failure:
        inet->inet_dport = 0;
        sk->sk_route_caps = 0;
        return err;
}

/* Apply the path-MTU value recorded in tp->mtu_info to @sk.
 *
 * Nothing happens for sockets in LISTEN or CLOSE state, when the recorded
 * MTU would not shrink the current MSS, or when the route update yields no
 * dst.  Otherwise, if the socket's PMTU cookie is above the route MTU, the
 * MSS is resynchronised and a simple retransmit is performed.
 */
static void tcp_v6_mtu_reduced(struct sock *sk)
{
        const int state_bit = 1 << sk->sk_state;
        struct dst_entry *route;
        u32 new_mtu;

        if (state_bit & (TCPF_LISTEN | TCPF_CLOSE))
                return;

        new_mtu = READ_ONCE(tcp_sk(sk)->mtu_info);

        /* Requests that would raise our current MSS are dropped here;
         * the check in __ip6_rt_update_pmtu() comes too late.
         */
        if (tcp_mtu_to_mss(sk, new_mtu) >= tcp_sk(sk)->mss_cache)
                return;

        route = inet6_csk_update_pmtu(sk, new_mtu);
        if (!route)
                return;

        /* Cookie already at or below the route MTU: nothing to resync. */
        if (inet_csk(sk)->icsk_pmtu_cookie <= dst_mtu(route))
                return;

        tcp_sync_mss(sk, dst_mtu(route));
        tcp_simple_retransmit(sk);
}

/* Handle an ICMPv6 error of the given @type/@code that refers to a TCP
 * segment.
 *
 * skb->data is read as the IPv6 header of the referenced packet and the TCP
 * header is taken at skb->data + @offset; their addresses and ports are
 * used to look up the socket.  @info is interpreted as the MTU for
 * ICMPV6_PKT_TOOBIG and is otherwise handed to ipv6_icmp_error().
 * @opt is not used by this function.
 *
 * Returns -ENOENT when no socket matches, 0 in every other case.
 */
static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                u8 type, u8 code, int offset, __be32 info)
{
        const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
        const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
        struct net *net = dev_net(skb->dev);
        struct request_sock *fastopen;
        struct ipv6_pinfo *np;
        struct tcp_sock *tp;
        __u32 seq, snd_una;
        struct sock *sk;
        bool fatal;
        int err;

        /* The quoted header's destination address/port are passed first and
         * its source address/port (port in host order) second.
         */
        sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
                                        &hdr->daddr, th->dest,
                                        &hdr->saddr, ntohs(th->source),
                                        skb->dev->ifindex, inet6_sdif(skb));

        if (!sk) {
                __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
                                  ICMP6_MIB_INERRORS);
                return -ENOENT;
        }

        if (sk->sk_state == TCP_TIME_WAIT) {
                /* To increase the counter of ignored icmps for TCP-AO */
                tcp_ao_ignore_icmp(sk, AF_INET6, type, code);
                inet_twsk_put(inet_twsk(sk));
                return 0;
        }
        seq = ntohl(th->seq);
        fatal = icmpv6_err_convert(type, code, &err);
        if (sk->sk_state == TCP_NEW_SYN_RECV) {
                /* NOTE(review): no sock_put() on this path, so tcp_req_err()
                 * presumably releases the reference itself - confirm.
                 */
                tcp_req_err(sk, seq, fatal);
                return 0;
        }

        if (tcp_ao_ignore_icmp(sk, AF_INET6, type, code)) {
                sock_put(sk);
                return 0;
        }

        bh_lock_sock(sk);
        /* Only a statistic: processing continues below.  PKT_TOOBIG is not
         * counted because it is deferred, not dropped, when the socket is
         * owned by user (see TCP_MTU_REDUCED_DEFERRED further down).
         */
        if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
                __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);

        if (sk->sk_state == TCP_CLOSE)
                goto out;

        if (static_branch_unlikely(&ip6_min_hopcount)) {
                /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
                if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
                        __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                        goto out;
                }
        }

        tp = tcp_sk(sk);
        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
        fastopen = rcu_dereference(tp->fastopen_rsk);
        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
        /* Except for listeners, ignore errors whose quoted sequence number
         * is not between snd_una and snd_nxt.
         */
        if (sk->sk_state != TCP_LISTEN &&
            !between(seq, snd_una, tp->snd_nxt)) {
                __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
                goto out;
        }

        np = tcp_inet6_sk(sk);

        if (type == NDISC_REDIRECT) {
                /* Redirects are applied only when the socket is not owned
                 * by user; otherwise they are silently skipped.
                 */
                if (!sock_owned_by_user(sk)) {
                        struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);

                        if (dst)
                                dst->ops->redirect(dst, sk, skb);
                }
                goto out;
        }

        if (type == ICMPV6_PKT_TOOBIG) {
                u32 mtu = ntohl(info);

                /* We are not interested in TCP_LISTEN and open_requests
                 * (SYN-ACKs sent out by Linux are always <576bytes so
                 * they should go through unfragmented).
                 */
                if (sk->sk_state == TCP_LISTEN)
                        goto out;

                if (!ip6_sk_accept_pmtu(sk))
                        goto out;

                if (mtu < IPV6_MIN_MTU)
                        goto out;

                WRITE_ONCE(tp->mtu_info, mtu);

                /* Apply now if we may touch the socket; otherwise flag the
                 * work as deferred.  The extra reference is taken only when
                 * this call is the one that set the flag.
                 * NOTE(review): presumably dropped by whoever handles
                 * TCP_MTU_REDUCED_DEFERRED - confirm.
                 */
                if (!sock_owned_by_user(sk))
                        tcp_v6_mtu_reduced(sk);
                else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
                                           &sk->sk_tsq_flags))
                        sock_hold(sk);
                goto out;
        }


        /* Might be for a request_sock */
        switch (sk->sk_state) {
        case TCP_SYN_SENT:
        case TCP_SYN_RECV:
                /* Only in fast or simultaneous open. If a fast open socket is
                 * already accepted it is treated as a connected one below.
                 */
                if (fastopen && !fastopen->sk)
                        break;

                ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th);

                if (!sock_owned_by_user(sk))
                        tcp_done_with_error(sk, err);
                else
                        WRITE_ONCE(sk->sk_err_soft, err);
                goto out;
        case TCP_LISTEN:
                break;
        default:
                /* check if this ICMP message allows revert of backoff.
                 * (see RFC 6069)
                 */
                if (!fastopen && type == ICMPV6_DEST_UNREACH &&
                    code == ICMPV6_NOROUTE)
                        tcp_ld_RTO_revert(sk, seq);
        }

        /* A hard error is reported only when the socket is not owned by
         * user and has RECVERR6 set; otherwise only a soft error is stored.
         */
        if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) {
                WRITE_ONCE(sk->sk_err, err);
                sk_error_report(sk);
        } else {
                WRITE_ONCE(sk->sk_err_soft, err);
        }
out:
        bh_unlock_sock(sk);
        sock_put(sk);
        return 0;
}


static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
                              struct flowi *fl,
                              struct request_sock *req,
                              struct tcp_fastopen_cookie *foc,
                              enum tcp_synack_type synack_type,
                              struct sk_buff *syn_skb)
{
        struct inet_request_sock *ireq = inet_rsk(req);
        const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
        struct ipv6_txoptions *opt;
        struct flowi6 *fl6 = &fl->u.ip6;
        struct sk_buff *skb;
        int err = -ENOMEM;
        u8 tclass;

        /* First, grab a route. */
        if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
                                               IPPROTO_TCP)) == NULL)
                goto done;

        skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

        if (skb) {
                __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
                                    &ireq->ir_v6_rmt_addr);

                fl6->daddr = ireq->ir_v6_rmt_addr;
                if (inet6_test_bit(REPFLOW, sk) && ireq->pktopts)
                        fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts));

                tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
                                (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
                                (np->tclass & INET_ECN_MASK) :
                                np->tclass;

                if (!INET_ECN_is_capable(tclass) &&
                    tcp_bpf_ca_needs_ecn((struct sock *)req))
                        tclass |= INET_ECN_ECT_0;

                rcu_read_lock();
                opt = ireq->ipv6_opt;
                if (!opt)
                        opt = rcu_dereference(np->opt);
                err = ip6_xmit(sk, skb, fl6, skb->mark ? : READ_ONCE(sk->sk_mark),
                               opt, tclass, READ_ONCE(sk->sk_priority));
                rcu_read_unlock();
                err = net_xmit_eval(err);
        }

done:
        return err;
}


static void tcp_v6_reqsk_destructor(struct request_sock *req)
{
        kfree(inet_rsk(req)->ipv6_opt);
        consume_skb(inet_rsk(req)->pktopts);
}

#ifdef CONFIG_TCP_MD5SIG
/* Look up the MD5 key configured on @sk for IPv6 peer @addr within the
 * L3 domain @l3index.  Thin AF_INET6 wrapper around tcp_md5_do_lookup().
 */
static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
                                                   const struct in6_addr *addr,
                                                   int l3index)
{
        const union tcp_md5_addr *peer = (const union tcp_md5_addr *)addr;

        return tcp_md5_do_lookup(sk, l3index, peer, AF_INET6);
}

/* Find the MD5 key on @sk that matches the peer of @addr_sk.  The L3
 * domain index is derived from the device @addr_sk is bound to.
 */
static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk,
                                                const struct sock *addr_sk)
{
        const struct in6_addr *peer = &addr_sk->sk_v6_daddr;
        int l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
                                                     addr_sk->sk_bound_dev_if);

        return tcp_v6_md5_do_lookup(sk, peer, l3index);
}

/* Parse a struct tcp_md5sig passed via setsockopt (@optname is expected to
 * be the plain or the _EXT variant) and add or delete the matching key.
 *
 * A zero key length deletes the key; otherwise the key is added unless a
 * matching TCP-AO key is required for that peer.  V4-mapped addresses are
 * handled as AF_INET keys on the embedded IPv4 address.
 *
 * Returns 0 or the result of tcp_md5_do_add()/tcp_md5_do_del(), -EINVAL
 * for malformed input, -EFAULT if the copy fails, -EKEYREJECTED when
 * TCP-AO is required for the peer.
 */
static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
                                 sockptr_t optval, int optlen)
{
        struct tcp_md5sig cmd;
        struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr;
        union tcp_ao_addr *addr;
        bool is_ext, v4mapped;
        int l3index = 0;
        u8 prefixlen;
        bool l3flag;
        int family;
        u8 flags;

        if (optlen < sizeof(cmd))
                return -EINVAL;

        if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
                return -EFAULT;

        if (sin6->sin6_family != AF_INET6)
                return -EINVAL;

        is_ext = (optname == TCP_MD5SIG_EXT);
        v4mapped = ipv6_addr_v4mapped(&sin6->sin6_addr);
        flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
        l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

        /* Prefix length: user supplied for _EXT with the PREFIX flag,
         * otherwise the full address width of the effective family.
         */
        if (is_ext && (cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX)) {
                prefixlen = cmd.tcpm_prefixlen;
                if (prefixlen > 128 || (v4mapped && prefixlen > 32))
                        return -EINVAL;
        } else {
                prefixlen = v4mapped ? 32 : 128;
        }

        if (is_ext && cmd.tcpm_ifindex &&
            (cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX)) {
                struct net_device *dev;

                rcu_read_lock();
                dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
                if (dev && netif_is_l3_master(dev))
                        l3index = dev->ifindex;
                rcu_read_unlock();

                /* Only the set/not-set state of dev is examined outside
                 * of RCU; right now the device MUST be an L3 master.
                 */
                if (!dev || !l3index)
                        return -EINVAL;
        }

        /* Pick the key address/family once for both delete and add. */
        if (v4mapped) {
                family = AF_INET;
                addr = (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3];
        } else {
                family = AF_INET6;
                addr = (union tcp_md5_addr *)&sin6->sin6_addr;
        }

        if (!cmd.tcpm_keylen)
                return tcp_md5_do_del(sk, addr, family, prefixlen,
                                      l3index, flags);

        if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
                return -EINVAL;

        /* Don't allow keys for peers that have a matching TCP-AO key.
         * See the comment in tcp_ao_add_cmd()
         */
        if (tcp_ao_required(sk, addr, family, l3flag ? l3index : -1, false))
                return -EKEYREJECTED;

        return tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
                              cmd.tcpm_key, cmd.tcpm_keylen);
}

/* Feed the IPv6 pseudo-header and a copy of the TCP header @th (with its
 * checksum field zeroed) into the hash request of @hp.  Both are assembled
 * back to back in hp->scratch.  @nbytes is stored as the pseudo-header
 * length.  Returns the crypto_ahash_update() result.
 */
static int tcp_v6_md5_hash_headers(struct tcp_sigpool *hp,
                                   const struct in6_addr *daddr,
                                   const struct in6_addr *saddr,
                                   const struct tcphdr *th, int nbytes)
{
        struct tcp6_pseudohdr *pseudo = hp->scratch;
        struct tcphdr *th_copy = (struct tcphdr *)(pseudo + 1);
        const unsigned int hashed_len = sizeof(*pseudo) + sizeof(*th);
        struct scatterlist sg;

        /* 1. TCP pseudo-header (RFC2460) */
        pseudo->saddr = *saddr;
        pseudo->daddr = *daddr;
        pseudo->protocol = cpu_to_be32(IPPROTO_TCP);
        pseudo->len = cpu_to_be32(nbytes);

        /* 2. TCP header, checksum excluded from the digest */
        memcpy(th_copy, th, sizeof(*th));
        th_copy->check = 0;

        sg_init_one(&sg, pseudo, hashed_len);
        ahash_request_set_crypt(hp->req, &sg, NULL, hashed_len);
        return crypto_ahash_update(hp->req);
}

/* Compute the MD5 signature over the pseudo-header, the TCP header @th and
 * @key, writing the 16-byte digest to @md5_hash.
 *
 * Returns 0 on success.  On any failure the digest buffer is zeroed and 1
 * is returned.
 */
static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
                               const struct in6_addr *daddr, struct in6_addr *saddr,
                               const struct tcphdr *th)
{
        struct tcp_sigpool hp;

        if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
                goto wipe_digest;

        /* Steps run strictly in order; the first failure aborts. */
        if (crypto_ahash_init(hp.req) ||
            tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2) ||
            tcp_md5_hash_key(&hp, key))
                goto release_pool;

        ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
        if (crypto_ahash_final(hp.req))
                goto release_pool;

        tcp_sigpool_end(&hp);
        return 0;

release_pool:
        tcp_sigpool_end(&hp);
wipe_digest:
        memset(md5_hash, 0, 16);
        return 1;
}

/* Compute the MD5 signature of the TCP segment in @skb with @key and store
 * the 16-byte digest in @md5_hash.
 *
 * Addresses come from @sk when it is given, otherwise from the IPv6 header
 * of @skb.  Returns 0 on success; on failure the digest buffer is zeroed
 * and 1 is returned.
 */
static int tcp_v6_md5_hash_skb(char *md5_hash,
                               const struct tcp_md5sig_key *key,
                               const struct sock *sk,
                               const struct sk_buff *skb)
{
        const struct in6_addr *saddr, *daddr;
        const struct tcphdr *th = tcp_hdr(skb);
        struct tcp_sigpool hp;

        if (!sk) {
                const struct ipv6hdr *ip6h = ipv6_hdr(skb);

                saddr = &ip6h->saddr;
                daddr = &ip6h->daddr;
        } else {
                /* valid for establish/request sockets */
                saddr = &sk->sk_v6_rcv_saddr;
                daddr = &sk->sk_v6_daddr;
        }

        if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
                goto wipe_digest;

        /* Headers, then payload past the TCP header, then the key;
         * the first failing step aborts.
         */
        if (crypto_ahash_init(hp.req) ||
            tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, skb->len) ||
            tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2) ||
            tcp_md5_hash_key(&hp, key))
                goto release_pool;

        ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
        if (crypto_ahash_final(hp.req))
                goto release_pool;

        tcp_sigpool_end(&hp);
        return 0;

release_pool:
        tcp_sigpool_end(&hp);
wipe_digest:
        memset(md5_hash, 0, 16);
        return 1;
}
#endif

/* Fill the IPv6 parts of @req from the incoming @skb.
 *
 * Records remote/local addresses, pins the incoming interface for
 * link-local peers (when the listener is unbound or the packet came via an
 * L3 slave), and, when @tw_isn is zero and the listener wants any received
 * packet options, keeps a reference to @skb in ireq->pktopts.
 */
static void tcp_v6_init_req(struct request_sock *req,
                            const struct sock *sk_listener,
                            struct sk_buff *skb,
                            u32 tw_isn)
{
        const struct ipv6_pinfo *np = tcp_inet6_sk(sk_listener);
        struct inet_request_sock *ireq = inet_rsk(req);
        bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);

        ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
        ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;

        /* So that link locals have meaning */
        if (!sk_listener->sk_bound_dev_if || l3_slave) {
                if (ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
                        ireq->ir_iif = tcp_v6_iif(skb);
        }

        if (tw_isn)
                return;

        if (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) ||
            np->rxopt.bits.rxinfo ||
            np->rxopt.bits.rxoinfo ||
            np->rxopt.bits.rxhlim ||
            np->rxopt.bits.rxohlim ||
            inet6_test_bit(REPFLOW, sk_listener)) {
                /* Keep the SYN around; the request now co-owns it. */
                refcount_inc(&skb->users);
                ireq->pktopts = skb;
        }
}

/* Initialise @req from @skb, run the security hook and look up a route
 * for the request.  Returns NULL when the security hook rejects the
 * request, otherwise whatever inet6_csk_route_req() returns.
 */
static struct dst_entry *tcp_v6_route_req(const struct sock *sk,
                                          struct sk_buff *skb,
                                          struct flowi *fl,
                                          struct request_sock *req,
                                          u32 tw_isn)
{
        struct flowi6 *fl6 = &fl->u.ip6;

        tcp_v6_init_req(req, sk, skb, tw_isn);

        if (security_inet_conn_request(sk, skb, req) != 0)
                return NULL;

        return inet6_csk_route_req(sk, fl6, req, IPPROTO_TCP);
}

/* Request-sock operations for AF_INET6 TCP.  Objects are sized as
 * struct tcp6_request_sock; the IPv6-specific handlers are the ACK and
 * reset senders and the destructor, the rest are generic TCP helpers.
 */
struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
        .family                =        AF_INET6,
        .obj_size        =        sizeof(struct tcp6_request_sock),
        .rtx_syn_ack        =        tcp_rtx_synack,
        .send_ack        =        tcp_v6_reqsk_send_ack,
        .destructor        =        tcp_v6_reqsk_destructor,
        .send_reset        =        tcp_v6_send_reset,
        .syn_ack_timeout =        tcp_syn_ack_timeout,
};

/* IPv6 address-family hooks used while handling TCP connection requests.
 * The MD5, TCP-AO and SYN-cookie members exist only when the matching
 * config option is enabled.
 */
const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
        /* MSS that fits the minimum IPv6 MTU after both headers */
        .mss_clamp        =        IPV6_MIN_MTU - sizeof(struct tcphdr) -
                                sizeof(struct ipv6hdr),
#ifdef CONFIG_TCP_MD5SIG
        .req_md5_lookup        =        tcp_v6_md5_lookup,
        .calc_md5_hash        =        tcp_v6_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
        .ao_lookup        =        tcp_v6_ao_lookup_rsk,
        .ao_calc_key        =        tcp_v6_ao_calc_key_rsk,
        .ao_synack_hash =        tcp_v6_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
        .cookie_init_seq =        cookie_v6_init_sequence,
#endif
        .route_req        =        tcp_v6_route_req,
        .init_seq        =        tcp_v6_init_seq,
        .init_ts_off        =        tcp_v6_init_ts_off,
        .send_synack        =        tcp_v6_send_synack,
};

/* Build and transmit a data-less TCP reply (ACK, or RST when @rst is set)
 * to the segment in @skb.
 *
 * Ports and addresses are those of @skb swapped; @seq, @ack and @win fill
 * the TCP header.  Options are appended in this order: timestamps (when
 * @tsecr is non-zero), the MPTCP reset option (RST only, no MD5 key,
 * CONFIG_MPTCP), then either an MD5 signature or a TCP-AO MAC depending on
 * @key.
 *
 * @sk may be NULL, a full socket or a timewait socket (see the
 * TCP_TIME_WAIT checks below).  The packet is always sent through the
 * per-netns control socket net->ipv6.tcp_sk.  Nothing is returned: if the
 * skb allocation or the route lookup fails, the reply is dropped silently.
 */
static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
                                 u32 ack, u32 win, u32 tsval, u32 tsecr,
                                 int oif, int rst, u8 tclass, __be32 label,
                                 u32 priority, u32 txhash, struct tcp_key *key)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct tcphdr *t1;
        struct sk_buff *buff;
        struct flowi6 fl6;
        struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
        struct sock *ctl_sk = net->ipv6.tcp_sk;
        unsigned int tot_len = sizeof(struct tcphdr);
        __be32 mrst = 0, *topt;
        struct dst_entry *dst;
        __u32 mark = 0;

        /* Size the header first: base header plus every option written below. */
        if (tsecr)
                tot_len += TCPOLEN_TSTAMP_ALIGNED;
        if (tcp_key_is_md5(key))
                tot_len += TCPOLEN_MD5SIG_ALIGNED;
        if (tcp_key_is_ao(key))
                tot_len += tcp_ao_len_aligned(key->ao_key);

#ifdef CONFIG_MPTCP
        if (rst && !tcp_key_is_md5(key)) {
                mrst = mptcp_reset_option(skb);

                if (mrst)
                        tot_len += sizeof(__be32);
        }
#endif

        buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
        if (!buff)
                return;

        /* Reserve the whole buffer as headroom; headers are pushed in front. */
        skb_reserve(buff, MAX_TCP_HEADER);

        t1 = skb_push(buff, tot_len);
        skb_reset_transport_header(buff);

        /* Swap the send and the receive. */
        memset(t1, 0, sizeof(*t1));
        t1->dest = th->source;
        t1->source = th->dest;
        t1->doff = tot_len / 4;
        t1->seq = htonl(seq);
        t1->ack_seq = htonl(ack);
        /* ACK flag: always set for non-RST replies; for a RST only when the
         * incoming segment carried no ACK.
         */
        t1->ack = !rst || !th->ack;
        t1->rst = rst;
        t1->window = htons(win);

        topt = (__be32 *)(t1 + 1);

        if (tsecr) {
                *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
                *topt++ = htonl(tsval);
                *topt++ = htonl(tsecr);
        }

        if (mrst)
                *topt++ = mrst;

#ifdef CONFIG_TCP_MD5SIG
        if (tcp_key_is_md5(key)) {
                *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
                /* The incoming source address is passed in the daddr slot
                 * and the incoming destination in the saddr slot, i.e. the
                 * reply's view of the addresses.
                 */
                tcp_v6_md5_hash_hdr((__u8 *)topt, key->md5_key,
                                    &ipv6_hdr(skb)->saddr,
                                    &ipv6_hdr(skb)->daddr, t1);
        }
#endif
#ifdef CONFIG_TCP_AO
        if (tcp_key_is_ao(key)) {
                *topt++ = htonl((TCPOPT_AO << 24) |
                                (tcp_ao_len(key->ao_key) << 16) |
                                (key->ao_key->sndid << 8) |
                                (key->rcv_next));

                tcp_ao_hash_hdr(AF_INET6, (char *)topt, key->ao_key,
                                key->traffic_key,
                                (union tcp_ao_addr *)&ipv6_hdr(skb)->saddr,
                                (union tcp_ao_addr *)&ipv6_hdr(skb)->daddr,
                                t1, key->sne);
        }
#endif

        memset(&fl6, 0, sizeof(fl6));
        fl6.daddr = ipv6_hdr(skb)->saddr;
        fl6.saddr = ipv6_hdr(skb)->daddr;
        fl6.flowlabel = label;

        buff->ip_summed = CHECKSUM_PARTIAL;

        __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr);

        fl6.flowi6_proto = IPPROTO_TCP;
        /* Output interface: when the destination needs a strict route and
         * no @oif was given, use the interface the segment arrived on;
         * otherwise use @oif, falling back to skb_iif if that is an L3
         * master device.
         */
        if (rt6_need_strict(&fl6.daddr) && !oif)
                fl6.flowi6_oif = tcp_v6_iif(skb);
        else {
                if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
                        oif = skb->skb_iif;

                fl6.flowi6_oif = oif;
        }

        if (sk) {
                /* A timewait socket keeps its mark in tw_mark. */
                if (sk->sk_state == TCP_TIME_WAIT)
                        mark = inet_twsk(sk)->tw_mark;
                else
                        mark = READ_ONCE(sk->sk_mark);
                skb_set_delivery_time(buff, tcp_transmit_time(sk), SKB_CLOCK_MONOTONIC);
        }
        if (txhash) {
                /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */
                skb_set_hash(buff, txhash, PKT_HASH_TYPE_L4);
        }
        /* A reply mark derived from the incoming skb wins over the socket mark. */
        fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
        fl6.fl6_dport = t1->dest;
        fl6.fl6_sport = t1->source;
        fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
        security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));

        /* Always pass a socket to ip6_dst_lookup_flow(), even for a RST:
         * the underlying function uses it to retrieve the network
         * namespace.  Timewait sockets and the no-socket case use the
         * control socket instead.
         */
        if (sk && sk->sk_state != TCP_TIME_WAIT)
                dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL); /*sk's xfrm_policy can be referred*/
        else
                dst = ip6_dst_lookup_flow(net, ctl_sk, &fl6, NULL);
        if (!IS_ERR(dst)) {
                skb_dst_set(buff, dst);
                /* ECN bits of @tclass are cleared on these replies. */
                ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL,
                         tclass & ~INET_ECN_MASK, priority);
                TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
                if (rst)
                        TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
                return;
        }

        /* No route: the reply is dropped. */
        kfree_skb(buff);
}

/* Send a RST in reply to the segment in @skb.
 *
 * @sk may be NULL (no matching socket), a full socket, or a non-full
 * socket such as a timewait socket.  @reason is only passed to the
 * tracepoint.
 *
 * No RST is sent when the incoming segment is itself a RST, when there is
 * no socket and ipv6_unicast_destination() fails for @skb, when the
 * authentication options cannot be parsed, or when the MD5/TCP-AO
 * preparation below fails.
 */
static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
                              enum sk_rst_reason reason)
{
        const struct tcphdr *th = tcp_hdr(skb);
        struct ipv6hdr *ipv6h = ipv6_hdr(skb);
        const __u8 *md5_hash_location = NULL;
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
        bool allocated_traffic_key = false;
#endif
        const struct tcp_ao_hdr *aoh;
        struct tcp_key key = {};
        u32 seq = 0, ack_seq = 0;
        __be32 label = 0;
        u32 priority = 0;
        struct net *net;
        u32 txhash = 0;
        int oif = 0;
#ifdef CONFIG_TCP_MD5SIG
        unsigned char newhash[16];
        int genhash;
        struct sock *sk1 = NULL;
#endif

        /* Never answer a RST with a RST. */
        if (th->rst)
                return;

        /* If sk not NULL, it means we did a successful lookup and incoming
         * route had to be correct. prequeue might have dropped our dst.
         */
        if (!sk && !ipv6_unicast_destination(skb))
                return;

        net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
        /* Invalid TCP option size or twice included auth */
        if (tcp_parse_auth_options(th, &md5_hash_location, &aoh))
                return;
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
        /* Held until the "out" label. */
        rcu_read_lock();
#endif
#ifdef CONFIG_TCP_MD5SIG
        if (sk && sk_fullsock(sk)) {
                int l3index;

                /* sdif set, means packet ingressed via a device
                 * in an L3 domain and inet_iif is set to it.
                 */
                l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
                key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index);
                if (key.md5_key)
                        key.type = TCP_KEY_MD5;
        } else if (md5_hash_location) {
                int dif = tcp_v6_iif_l3_slave(skb);
                int sdif = tcp_v6_sdif(skb);
                int l3index;

                /*
                 * The active side is lost. Try to find a listening socket
                 * through the source port, and then find the md5 key
                 * through that listening socket.
                 * We do not lose security here:
                 * the incoming packet is checked against the md5 hash of
                 * the key found, and no RST is generated if the md5 hash
                 * doesn't match.
                 */
                sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
                                            NULL, 0, &ipv6h->saddr, th->source,
                                            &ipv6h->daddr, ntohs(th->source),
                                            dif, sdif);
                if (!sk1)
                        goto out;

                /* sdif set, means packet ingressed via a device
                 * in an L3 domain and dif is set to it.
                 */
                l3index = tcp_v6_sdif(skb) ? dif : 0;

                key.md5_key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index);
                if (!key.md5_key)
                        goto out;
                key.type = TCP_KEY_MD5;

                genhash = tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb);
                if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
                        goto out;
        }
#endif

        /* If the segment carried an ACK, the RST takes its sequence number
         * from that ACK; otherwise the RST acknowledges everything the
         * segment occupied in sequence space (SYN, FIN and payload bytes).
         */
        if (th->ack)
                seq = ntohl(th->ack_seq);
        else
                ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len -
                          (th->doff << 2);

#ifdef CONFIG_TCP_AO
        if (aoh) {
                int l3index;

                l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
                if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, seq,
                                         &key.ao_key, &key.traffic_key,
                                         &allocated_traffic_key,
                                         &key.rcv_next, &key.sne))
                        goto out;
                key.type = TCP_KEY_AO;
        }
#endif

        if (sk) {
                oif = sk->sk_bound_dev_if;
                if (sk_fullsock(sk)) {
                        if (inet6_test_bit(REPFLOW, sk))
                                label = ip6_flowlabel(ipv6h);
                        priority = READ_ONCE(sk->sk_priority);
                        txhash = sk->sk_txhash;
                }
                /* Timewait sockets carry their own copies of these values. */
                if (sk->sk_state == TCP_TIME_WAIT) {
                        label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel);
                        priority = inet_twsk(sk)->tw_priority;
                        txhash = inet_twsk(sk)->tw_txhash;
                }
        } else {
                /* No socket: reflect the flow label only if the sysctl asks for it. */
                if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET)
                        label = ip6_flowlabel(ipv6h);
        }

        trace_tcp_send_reset(sk, skb, reason);

        tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
                             ipv6_get_dsfield(ipv6h), label, priority, txhash,
                             &key);

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
out:
        /* Free the traffic key only if tcp_ao_prepare_reset() flagged it
         * as allocated.
         */
        if (allocated_traffic_key)
                kfree(key.traffic_key);
        rcu_read_unlock();
#endif
}

/* Send a bare ACK in reply to @skb: forwards every argument to
 * tcp_v6_send_response() with the reset flag cleared.
 */
static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
                            u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_key *key, u8 tclass,
                            __be32 label, u32 priority, u32 txhash)
{
        const int rst = 0;

        tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, rst,
                             tclass, label, priority, txhash, key);
}

/* Send an ACK on behalf of the timewait socket @sk in reply to @skb, then
 * drop a reference on the timewait socket with inet_twsk_put().
 *
 * The reply is signed with TCP-AO when the timewait socket has AO info and
 * a key matching the peer's rnext_keyid is found, else with MD5 when a key
 * is stored on the timewait socket, else it is sent unsigned.  If the
 * authentication options of @skb cannot be parsed (AO case only), no ACK
 * is sent but the reference is still dropped.
 *
 * Note the preprocessor structure: the "if (key.ao_key) {" / "if (0) {"
 * opener selected by CONFIG_TCP_AO is closed by the brace after the
 * CONFIG_TCP_MD5SIG section.
 */
static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
        struct tcp_key key = {};
#ifdef CONFIG_TCP_AO
        struct tcp_ao_info *ao_info;

        if (static_branch_unlikely(&tcp_ao_needed.key)) {

                /* FIXME: the segment to-be-acked is not verified yet */
                ao_info = rcu_dereference(tcptw->ao_info);
                if (ao_info) {
                        const struct tcp_ao_hdr *aoh;

                        /* Invalid TCP option size or twice included auth */
                        if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
                                goto out;
                        if (aoh)
                                key.ao_key = tcp_ao_established_key(ao_info,
                                                aoh->rnext_keyid, -1);
                }
        }
        /* key.ao_key can only be set above, after ao_info was loaded and
         * found non-NULL, so ao_info is valid inside this branch.
         */
        if (key.ao_key) {
                struct tcp_ao_key *rnext_key;

                key.traffic_key = snd_other_key(key.ao_key);
                /* rcv_next switches to our rcv_next */
                rnext_key = READ_ONCE(ao_info->rnext_key);
                key.rcv_next = rnext_key->rcvid;
                key.sne = READ_ONCE(ao_info->snd_sne);
                key.type = TCP_KEY_AO;
#else
        if (0) {
#endif
#ifdef CONFIG_TCP_MD5SIG
        } else if (static_branch_unlikely(&tcp_md5_needed.key)) {
                key.md5_key = tcp_twsk_md5_key(tcptw);
                if (key.md5_key)
                        key.type = TCP_KEY_MD5;
#endif
        }

        /* The advertised window is the stored receive window scaled down
         * by the timewait socket's receive window scale.
         */
        tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcp_tw_tsval(tcptw),
                        READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if,
                        &key, tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel),
                        tw->tw_priority, tw->tw_txhash);

#ifdef CONFIG_TCP_AO
out:
#endif
        inet_twsk_put(tw);
}

/* Send an ACK on behalf of a request sock.
 *
 * Fills a struct tcp_key describing how the reply must be signed (TCP-AO,
 * MD5 or not at all) and hands it to tcp_v6_send_ack().
 *
 * When the request used TCP-AO, no ACK is sent if the auth options of @skb
 * cannot be parsed, if @skb carries no AO header, if no matching key can be
 * found on @sk, or if the traffic key buffer cannot be allocated.
 */
static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req)
{
        struct tcp_key key = {};

#ifdef CONFIG_TCP_AO
        if (static_branch_unlikely(&tcp_ao_needed.key) &&
            tcp_rsk_used_ao(req)) {
                const struct in6_addr *addr = &ipv6_hdr(skb)->saddr;
                const struct tcp_ao_hdr *aoh;
                int l3index;

                l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
                /* Invalid TCP option size or twice included auth */
                if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
                        return;
                if (!aoh)
                        return;
                /* First try the key the peer asked for via rnext_keyid */
                key.ao_key = tcp_ao_do_lookup(sk, l3index,
                                              (union tcp_ao_addr *)addr,
                                              AF_INET6, aoh->rnext_keyid, -1);
                if (unlikely(!key.ao_key)) {
                        /* Send ACK with any matching MKT for the peer */
                        key.ao_key = tcp_ao_do_lookup(sk, l3index,
                                                      (union tcp_ao_addr *)addr,
                                                      AF_INET6, -1, -1);
                        /* Matching key disappeared (user removed the key?)
                         * let the handshake timeout.
                         */
                        if (!key.ao_key) {
                                net_info_ratelimited("TCP-AO key for (%pI6, %d)->(%pI6, %d) suddenly disappeared, won't ACK new connection\n",
                                                     addr,
                                                     ntohs(tcp_hdr(skb)->source),
                                                     &ipv6_hdr(skb)->daddr,
                                                     ntohs(tcp_hdr(skb)->dest));
                                return;
                        }
                }
                /* Temporary buffer for the traffic key; freed at the end of
                 * this function once the ACK has been handed off.
                 */
                key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
                if (!key.traffic_key)
                        return;

                key.type = TCP_KEY_AO;
                key.rcv_next = aoh->keyid;
                tcp_v6_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
        /* Dummy branch so the "} else if" / "}" below always has an opening
         * "if" to attach to when TCP-AO is compiled out.
         */
        if (0) {
#endif
#ifdef CONFIG_TCP_MD5SIG
        } else if (static_branch_unlikely(&tcp_md5_needed.key)) {
                int l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;

                key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr,
                                                   l3index);
                if (key.md5_key)
                        key.type = TCP_KEY_MD5;
#endif
        }

        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
         */
        tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
                        tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
                        tcp_rsk(req)->rcv_nxt,
                        tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
                        tcp_rsk_tsval(tcp_rsk(req)),
                        READ_ONCE(req->ts_recent), sk->sk_bound_dev_if,
                        &key, ipv6_get_dsfield(ipv6_hdr(skb)), 0,
                        READ_ONCE(sk->sk_priority),
                        READ_ONCE(tcp_rsk(req)->txhash));
        /* Only the AO branch allocated key.traffic_key above */
        if (tcp_key_is_ao(&key))
                kfree(key.traffic_key);
}


/* For a non-SYN segment, let cookie_v6_check() decide which socket should
 * handle @skb; SYN segments (and builds without CONFIG_SYN_COOKIES) get
 * @sk back unchanged.
 */
static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
        if (!tcp_hdr(skb)->syn)
                return cookie_v6_check(sk, skb);
#endif
        return sk;
}

/* Compute a syncookie for the given IPv6/TCP headers.
 *
 * Returns the MSS associated with the cookie and stores the cookie value
 * in *@cookie, or returns 0 (leaving *@cookie untouched) when
 * tcp_get_syncookie_mss() yields 0 or CONFIG_SYN_COOKIES is disabled.
 */
u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph,
                         struct tcphdr *th, u32 *cookie)
{
#ifdef CONFIG_SYN_COOKIES
        u16 cookie_mss = tcp_get_syncookie_mss(&tcp6_request_sock_ops,
                                               &tcp_request_sock_ipv6_ops,
                                               sk, th);

        if (!cookie_mss)
                return 0;

        /* __cookie_v6_init_sequence() may adjust cookie_mss in place */
        *cookie = __cookie_v6_init_sequence(iph, th, &cookie_mss);
        tcp_synq_overflow(sk);
        return cookie_mss;
#else
        return 0;
#endif
}

/* Handle an incoming connection request on an IPv6 listener.
 *
 * IPv4 packets are delegated to tcp_v4_conn_request(). Requests whose
 * destination is not unicast are counted as listen drops, and requests
 * with a v4-mapped source address are counted as header errors; both
 * return 0 so that no reset is generated.
 */
static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
        if (skb->protocol == htons(ETH_P_IP))
                return tcp_v4_conn_request(sk, skb);

        if (!ipv6_unicast_destination(skb)) {
                tcp_listendrop(sk);
                return 0; /* don't send reset */
        }

        if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) {
                __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS);
                return 0;
        }

        return tcp_conn_request(&tcp6_request_sock_ops,
                                &tcp_request_sock_ipv6_ops, sk, skb);
}

/* Copy the IPv6 control block saved inside TCP_SKB_CB() back to the
 * location IP6CB() expects.
 *
 * Needed whenever xfrm6_policy_check() and tcp_v6_fill_cb() are going to
 * run again on this skb; ip6_datagram_recv_specific_ctl() also expects
 * IP6CB to be in place.
 */
static void tcp_v6_restore_cb(struct sk_buff *skb)
{
        struct inet6_skb_parm *saved = &TCP_SKB_CB(skb)->header.h6;
        struct inet6_skb_parm *ip6cb = IP6CB(skb);

        /* The two areas may overlap, hence memmove() */
        memmove(ip6cb, saved, sizeof(*ip6cb));
}

/* Create the child socket for a connection request.
 *
 * @sk:         listening socket
 * @skb:        packet being processed for the request
 * @req:        request sock the child is created from
 * @dst:        route to use for the child; when NULL one is looked up with
 *              inet6_csk_route_req()
 * @req_unhash: request passed to inet_ehash_nolisten() when the child is
 *              inserted into the established hash
 * @own_req:    receives the result of inet_ehash_nolisten()
 *
 * IPv4 packets (v4-mapped connections) are delegated to
 * tcp_v4_syn_recv_sock() and the resulting socket is then switched over to
 * the ipv6_mapped operations.
 *
 * Returns the new socket, or NULL on failure. Every failure that reaches
 * the "out" label is accounted with tcp_listendrop().
 */
static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
                                         struct request_sock *req,
                                         struct dst_entry *dst,
                                         struct request_sock *req_unhash,
                                         bool *own_req)
{
        struct inet_request_sock *ireq;
        struct ipv6_pinfo *newnp;
        const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
        struct ipv6_txoptions *opt;
        struct inet_sock *newinet;
        bool found_dup_sk = false;
        struct tcp_sock *newtp;
        struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
        int l3index;
#endif
        struct flowi6 fl6;

        if (skb->protocol == htons(ETH_P_IP)) {
                /*
                 *        v6 mapped
                 */

                newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst,
                                             req_unhash, own_req);

                if (!newsk)
                        return NULL;

                inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk);

                newnp = tcp_inet6_sk(newsk);
                newtp = tcp_sk(newsk);

                memcpy(newnp, np, sizeof(struct ipv6_pinfo));

                newnp->saddr = newsk->sk_v6_rcv_saddr;

                inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
                if (sk_is_mptcp(newsk))
                        mptcpv6_handle_mapped(newsk, true);
                newsk->sk_backlog_rcv = tcp_v4_do_rcv;
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
                newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
#endif

                /* The memcpy() above duplicated the listener's pointers;
                 * reset them so the child does not share them.
                 */
                newnp->ipv6_mc_list = NULL;
                newnp->ipv6_ac_list = NULL;
                newnp->ipv6_fl_list = NULL;
                newnp->pktoptions  = NULL;
                newnp->opt           = NULL;
                newnp->mcast_oif   = inet_iif(skb);
                newnp->mcast_hops  = ip_hdr(skb)->ttl;
                newnp->rcv_flowinfo = 0;
                if (inet6_test_bit(REPFLOW, sk))
                        newnp->flow_label = 0;

                /*
                 * No need to charge this sock to the relevant IPv6 refcnt debug socks count
                 * here, tcp_create_openreq_child now does this for us, see the comment in
                 * that function for the gory details. -acme
                 */

                /* This is a tricky place. Until this moment the IPv4 TCP
                 * code worked with the IPv6 icsk.icsk_af_ops.
                 * Sync it now.
                 */
                tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);

                return newsk;
        }

        ireq = inet_rsk(req);

        if (sk_acceptq_is_full(sk))
                goto out_overflow;

        if (!dst) {
                dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP);
                if (!dst)
                        goto out;
        }

        newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk)
                goto out_nonewsk;

        /*
         * No need to charge this sock to the relevant IPv6 refcnt debug socks
         * count here, tcp_create_openreq_child now does this for us, see the
         * comment in that function for the gory details. -acme
         */

        newsk->sk_gso_type = SKB_GSO_TCPV6;
        inet6_sk_rx_dst_set(newsk, skb);

        inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk);

        newtp = tcp_sk(newsk);
        newinet = inet_sk(newsk);
        newnp = tcp_inet6_sk(newsk);

        memcpy(newnp, np, sizeof(struct ipv6_pinfo));

        /* From here on dst is attached to newsk, so the error paths below
         * jump to "out" rather than "out_nonewsk".
         */
        ip6_dst_store(newsk, dst, NULL, NULL);

        newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr;
        newnp->saddr = ireq->ir_v6_loc_addr;
        newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
        newsk->sk_bound_dev_if = ireq->ir_iif;

        /* Now IPv6 options...
         *
         * First: no IPv4 options.
         */
        newinet->inet_opt = NULL;
        newnp->ipv6_mc_list = NULL;
        newnp->ipv6_ac_list = NULL;
        newnp->ipv6_fl_list = NULL;

        /* Clone RX bits */
        newnp->rxopt.all = np->rxopt.all;

        newnp->pktoptions = NULL;
        newnp->opt          = NULL;
        newnp->mcast_oif  = tcp_v6_iif(skb);
        newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
        newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb));
        if (inet6_test_bit(REPFLOW, sk))
                newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));

        /* Set ToS of the new socket based upon the value of incoming SYN.
         * ECT bits are set later in tcp_init_transfer().
         */
        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
                newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

        /* Clone native IPv6 options from listening socket (if any)
         *
         * Yes, keeping a reference count would be much more clever,
         * but we do one more thing here: reattach optmem
         * to newsk.
         */
        opt = ireq->ipv6_opt;
        if (!opt)
                opt = rcu_dereference(np->opt);
        if (opt) {
                opt = ipv6_dup_options(newsk, opt);
                RCU_INIT_POINTER(newnp->opt, opt);
        }
        inet_csk(newsk)->icsk_ext_hdr_len = 0;
        if (opt)
                inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen +
                                                    opt->opt_flen;

        tcp_ca_openreq_child(newsk, dst);

        tcp_sync_mss(newsk, dst_mtu(dst));
        newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

        tcp_initialize_rcv_mss(newsk);

        newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
        newinet->inet_rcv_saddr = LOOPBACK4_IPV6;

#ifdef CONFIG_TCP_MD5SIG
        l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);

        if (!tcp_rsk_used_ao(req)) {
                /* Copy over the MD5 key from the original socket */
                key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index);
                if (key) {
                        const union tcp_md5_addr *addr;

                        addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr;
                        if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) {
                                inet_csk_prepare_forced_close(newsk);
                                tcp_done(newsk);
                                goto out;
                        }
                }
        }
#endif
#ifdef CONFIG_TCP_AO
        /* Copy over tcp_ao_info if any */
        if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6)) {
                /* OOM: tear the child down like the other failures that
                 * happen after it was created, instead of leaking it.
                 */
                inet_csk_prepare_forced_close(newsk);
                tcp_done(newsk);
                goto out;
        }
#endif

        if (__inet_inherit_port(sk, newsk) < 0) {
                inet_csk_prepare_forced_close(newsk);
                tcp_done(newsk);
                goto out;
        }
        *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
                                       &found_dup_sk);
        if (*own_req) {
                tcp_move_syn(newtp, req);

                /* Clone pktoptions received with SYN, if we own the req */
                if (ireq->pktopts) {
                        newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk);
                        consume_skb(ireq->pktopts);
                        ireq->pktopts = NULL;
                        if (newnp->pktoptions)
                                tcp_v6_restore_cb(newnp->pktoptions);
                }
        } else {
                if (!req_unhash && found_dup_sk) {
                        /* This code path should only be executed in the
                         * syncookie case
                         */
                        bh_unlock_sock(newsk);
                        sock_put(newsk);
                        newsk = NULL;
                }
        }

        return newsk;

out_overflow:
        __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
out_nonewsk:
        dst_release(dst);
out:
        tcp_listendrop(sk);
        return NULL;
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
                                                           u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
INDIRECT_CALLABLE_SCOPE
int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
{
        struct ipv6_pinfo *np = tcp_inet6_sk(sk);
        /* Clone of skb kept only to latch IPV6_PKTOPTIONS data; every exit
         * path below either stores it in np->pktoptions or frees it.
         */
        struct sk_buff *opt_skb = NULL;
        enum skb_drop_reason reason;
        struct tcp_sock *tp;

        /* Imagine: the socket is IPv6. An IPv4 packet arrives,
           goes to the IPv4 receive handler and is backlogged.
           From the backlog it always ends up here. Kerboom...
           Fortunately, tcp_rcv_established and rcv_established
           handle them correctly, but that is not the case with
           tcp_v6_hnd_req and tcp_v6_send_reset().   --ANK
         */

        if (skb->protocol == htons(ETH_P_IP))
                return tcp_v4_do_rcv(sk, skb);

        /*
         *        socket locking is here for SMP purposes as backlog rcv
         *        is currently called with bh processing disabled.
         */

        /* Do Stevens' IPV6_PKTOPTIONS.

           Yes, guys, this is the only place in our code where we
           can do it without affecting IPv4.
           The rest of the code is protocol independent,
           and I do not like the idea of uglifying IPv4.

           Actually, the whole idea behind IPV6_PKTOPTIONS
           does not look very well thought out. For now we latch
           the options received in the last packet enqueued
           by tcp. Feel free to propose a better solution.
                                               --ANK (980728)
         */
        if (np->rxopt.all)
                opt_skb = skb_clone_and_charge_r(skb, sk);

        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
                struct dst_entry *dst;

                dst = rcu_dereference_protected(sk->sk_rx_dst,
                                                lockdep_sock_is_held(sk));

                sock_rps_save_rxhash(sk, skb);
                sk_mark_napi_id(sk, skb);
                if (dst) {
                        /* Drop the cached rx dst if the packet arrived on a
                         * different interface or the dst fails its check.
                         */
                        if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
                            INDIRECT_CALL_1(dst->ops->check, ip6_dst_check,
                                            dst, sk->sk_rx_dst_cookie) == NULL) {
                                RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
                                dst_release(dst);
                        }
                }

                tcp_rcv_established(sk, skb);
                if (opt_skb)
                        goto ipv6_pktoptions;
                return 0;
        }

        if (tcp_checksum_complete(skb))
                goto csum_err;

        if (sk->sk_state == TCP_LISTEN) {
                struct sock *nsk = tcp_v6_cookie_check(sk, skb);

                /* nsk != sk: the cookie check either produced another
                 * socket to process the packet on, or returned NULL.
                 */
                if (nsk != sk) {
                        if (nsk) {
                                reason = tcp_child_process(sk, nsk, skb);
                                if (reason)
                                        goto reset;
                        }
                        if (opt_skb)
                                __kfree_skb(opt_skb);
                        return 0;
                }
        } else
                sock_rps_save_rxhash(sk, skb);

        reason = tcp_rcv_state_process(sk, skb);
        if (reason)
                goto reset;
        if (opt_skb)
                goto ipv6_pktoptions;
        return 0;

reset:
        tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(reason));
discard:
        /* Common drop path: release the options clone and the packet */
        if (opt_skb)
                __kfree_skb(opt_skb);
        sk_skb_reason_drop(sk, skb, reason);
        return 0;
csum_err:
        reason = SKB_DROP_REASON_TCP_CSUM;
        trace_tcp_bad_csum(skb);
        TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
        TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
        goto discard;


ipv6_pktoptions:
        /* What is this, you ask? The options are latched only if:

           1. skb was enqueued by tcp.
           2. skb is added to tail of read queue, rather than out of order.
           3. socket is not in passive state.
           4. Finally, it really contains options, which user wants to receive.
         */
        tp = tcp_sk(sk);
        if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt &&
            !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
                if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
                        WRITE_ONCE(np->mcast_oif, tcp_v6_iif(opt_skb));
                if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
                        WRITE_ONCE(np->mcast_hops,
                                   ipv6_hdr(opt_skb)->hop_limit);
                if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
                        np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
                if (inet6_test_bit(REPFLOW, sk))
                        np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
                /* Either way opt_skb ends up holding the previously latched
                 * skb (possibly NULL), which is released below.
                 */
                if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) {
                        tcp_v6_restore_cb(opt_skb);
                        opt_skb = xchg(&np->pktoptions, opt_skb);
                } else {
                        __kfree_skb(opt_skb);
                        opt_skb = xchg(&np->pktoptions, NULL);
                }
        }

        consume_skb(opt_skb);
        return 0;
}

static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
                           const struct tcphdr *th)
{
        /* This is tricky: we move IP6CB at its correct location into
         * TCP_SKB_CB(). It must be done after xfrm6_policy_check(), because
         * _decode_session6() uses IP6CB().
         * barrier() makes sure compiler won't play aliasing games.
         */
        memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb),
                sizeof(struct inet6_skb_parm));
        barrier();

        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                    skb->len - th->doff*4);
        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
        TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
        TCP_SKB_CB(skb)->sacked = 0;
        TCP_SKB_CB(skb)->has_rxtstamp =
                        skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}

/* Receive entry point for TCP segments carried over IPv6.
 *
 * Checks the TCP header length and checksum, looks up the owning socket
 * and dispatches on its state: request socks (TCP_NEW_SYN_RECV) go through
 * tcp_check_req(), time-wait socks through tcp_timewait_state_process(),
 * and everything else is handed to tcp_v6_do_rcv() directly or queued on
 * the socket backlog.
 *
 * Returns -1 when tcp_v6_do_rcv() reported a non-zero result on the
 * put_and_return path, 0 on every other path.
 */
INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
{
        enum skb_drop_reason drop_reason;
        int sdif = inet6_sdif(skb);
        int dif = inet6_iif(skb);
        const struct tcphdr *th;
        const struct ipv6hdr *hdr;
        struct sock *sk = NULL;
        /* Whether this function holds a reference on sk that it must put */
        bool refcounted;
        int ret;
        u32 isn;
        struct net *net = dev_net(skb->dev);

        drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
        if (skb->pkt_type != PACKET_HOST)
                goto discard_it;

        /*
         *        Count it even if it's bad.
         */
        __TCP_INC_STATS(net, TCP_MIB_INSEGS);

        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
                goto discard_it;

        th = (const struct tcphdr *)skb->data;

        /* doff counts 32-bit words and must cover at least the base header */
        if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
                drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
                goto bad_packet;
        }
        if (!pskb_may_pull(skb, th->doff*4))
                goto discard_it;

        if (skb_checksum_init(skb, IPPROTO_TCP, ip6_compute_pseudo))
                goto csum_error;

        /* Reload the header pointers after the pskb_may_pull() calls above */
        th = (const struct tcphdr *)skb->data;
        hdr = ipv6_hdr(skb);

lookup:
        sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th),
                                th->source, th->dest, inet6_iif(skb), sdif,
                                &refcounted);
        if (!sk)
                goto no_tcp_socket;

        if (sk->sk_state == TCP_TIME_WAIT)
                goto do_time_wait;

        if (sk->sk_state == TCP_NEW_SYN_RECV) {
                struct request_sock *req = inet_reqsk(sk);
                bool req_stolen = false;
                struct sock *nsk;

                /* From here on sk is the listener the request belongs to */
                sk = req->rsk_listener;
                if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
                        drop_reason = SKB_DROP_REASON_XFRM_POLICY;
                else
                        drop_reason = tcp_inbound_hash(sk, req, skb,
                                                       &hdr->saddr, &hdr->daddr,
                                                       AF_INET6, dif, sdif);
                if (drop_reason) {
                        sk_drops_add(sk, skb);
                        reqsk_put(req);
                        goto discard_it;
                }
                if (tcp_checksum_complete(skb)) {
                        reqsk_put(req);
                        goto csum_error;
                }
                if (unlikely(sk->sk_state != TCP_LISTEN)) {
                        /* The listener is no longer listening: try to move
                         * the request to another reuseport socket.
                         */
                        nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
                        if (!nsk) {
                                inet_csk_reqsk_queue_drop_and_put(sk, req);
                                goto lookup;
                        }
                        sk = nsk;
                        /* reuseport_migrate_sock() has already held one sk_refcnt
                         * before returning.
                         */
                } else {
                        sock_hold(sk);
                }
                refcounted = true;
                nsk = NULL;
                if (!tcp_filter(sk, skb)) {
                        th = (const struct tcphdr *)skb->data;
                        hdr = ipv6_hdr(skb);
                        tcp_v6_fill_cb(skb, hdr, th);
                        nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
                } else {
                        drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
                }
                if (!nsk) {
                        reqsk_put(req);
                        if (req_stolen) {
                                /* Another cpu got exclusive access to req
                                 * and created a full blown socket.
                                 * Try to feed this packet to this socket
                                 * instead of discarding it.
                                 */
                                tcp_v6_restore_cb(skb);
                                sock_put(sk);
                                goto lookup;
                        }
                        goto discard_and_relse;
                }
                nf_reset_ct(skb);
                if (nsk == sk) {
                        /* No child was created: continue processing the
                         * packet on the listener itself.
                         */
                        reqsk_put(req);
                        tcp_v6_restore_cb(skb);
                } else {
                        drop_reason = tcp_child_process(sk, nsk, skb);
                        if (drop_reason) {
                                enum sk_rst_reason rst_reason;

                                rst_reason = sk_rst_convert_drop_reason(drop_reason);
                                tcp_v6_send_reset(nsk, skb, rst_reason);
                                goto discard_and_relse;
                        }
                        sock_put(sk);
                        return 0;
                }
        }

process:
        if (static_branch_unlikely(&ip6_min_hopcount)) {
                /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
                if (unlikely(hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount))) {
                        __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                        drop_reason = SKB_DROP_REASON_TCP_MINTTL;
                        goto discard_and_relse;
                }
        }

        if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
                drop_reason = SKB_DROP_REASON_XFRM_POLICY;
                goto discard_and_relse;
        }

        drop_reason = tcp_inbound_hash(sk, NULL, skb, &hdr->saddr, &hdr->daddr,
                                       AF_INET6, dif, sdif);
        if (drop_reason)
                goto discard_and_relse;

        nf_reset_ct(skb);

        if (tcp_filter(sk, skb)) {
                drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
                goto discard_and_relse;
        }
        /* Header pointers are reloaded after tcp_filter() before being
         * used to fill the control block.
         */
        th = (const struct tcphdr *)skb->data;
        hdr = ipv6_hdr(skb);
        tcp_v6_fill_cb(skb, hdr, th);

        skb->dev = NULL;

        /* Listeners are processed without taking the socket lock here */
        if (sk->sk_state == TCP_LISTEN) {
                ret = tcp_v6_do_rcv(sk, skb);
                goto put_and_return;
        }

        sk_incoming_cpu_update(sk);

        bh_lock_sock_nested(sk);
        tcp_segs_in(tcp_sk(sk), skb);
        ret = 0;
        if (!sock_owned_by_user(sk)) {
                ret = tcp_v6_do_rcv(sk, skb);
        } else {
                /* NOTE(review): this failure path leaves without calling
                 * bh_unlock_sock() here; presumably tcp_add_backlog()
                 * releases the lock itself when it fails - confirm.
                 */
                if (tcp_add_backlog(sk, skb, &drop_reason))
                        goto discard_and_relse;
        }
        bh_unlock_sock(sk);
put_and_return:
        if (refcounted)
                sock_put(sk);
        return ret ? -1 : 0;

no_tcp_socket:
        drop_reason = SKB_DROP_REASON_NO_SOCKET;
        if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
                goto discard_it;

        tcp_v6_fill_cb(skb, hdr, th);

        /* csum_error and bad_packet are also jumped to from above; they sit
         * inside this branch so they share its statistics updates and then
         * fall through to discard_it without sending a reset.
         */
        if (tcp_checksum_complete(skb)) {
csum_error:
                drop_reason = SKB_DROP_REASON_TCP_CSUM;
                trace_tcp_bad_csum(skb);
                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
                __TCP_INC_STATS(net, TCP_MIB_INERRS);
        } else {
                tcp_v6_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
        }

discard_it:
        SKB_DR_OR(drop_reason, NOT_SPECIFIED);
        sk_skb_reason_drop(sk, skb, drop_reason);
        return 0;

discard_and_relse:
        sk_drops_add(sk, skb);
        if (refcounted)
                sock_put(sk);
        goto discard_it;

do_time_wait:
        if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                drop_reason = SKB_DROP_REASON_XFRM_POLICY;
                inet_twsk_put(inet_twsk(sk));
                goto discard_it;
        }

        tcp_v6_fill_cb(skb, hdr, th);

        if (tcp_checksum_complete(skb)) {
                inet_twsk_put(inet_twsk(sk));
                goto csum_error;
        }

        switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
        case TCP_TW_SYN:
        {
                struct sock *sk2;

                /* Look for a listener that can take this SYN; if one is
                 * found the time-wait sock is descheduled and processing
                 * restarts on the listener.
                 */
                sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
                                            skb, __tcp_hdrlen(th),
                                            &ipv6_hdr(skb)->saddr, th->source,
                                            &ipv6_hdr(skb)->daddr,
                                            ntohs(th->dest),
                                            tcp_v6_iif_l3_slave(skb),
                                            sdif);
                if (sk2) {
                        struct inet_timewait_sock *tw = inet_twsk(sk);
                        inet_twsk_deschedule_put(tw);
                        sk = sk2;
                        tcp_v6_restore_cb(skb);
                        refcounted = false;
                        __this_cpu_write(tcp_tw_isn, isn);
                        goto process;
                }
        }
                /* to ACK */
                fallthrough;
        case TCP_TW_ACK:
                tcp_v6_timewait_ack(sk, skb);
                break;
        case TCP_TW_RST:
                tcp_v6_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
                inet_twsk_deschedule_put(inet_twsk(sk));
                goto discard_it;
        case TCP_TW_SUCCESS:
                ;
        }
        goto discard_it;
}

/* Early demultiplexing for TCP over IPv6.
 *
 * Looks @skb up in the established hash and, on a hit, attaches the socket
 * to the skb (with sock_edemux as destructor). For full sockets the cached
 * rx dst is attached as well, provided it passes dst_check() and was
 * recorded for the interface the packet arrived on.
 */
void tcp_v6_early_demux(struct sk_buff *skb)
{
        struct net *net = dev_net(skb->dev);
        const struct ipv6hdr *ip6h;
        const struct tcphdr *tcph;
        struct dst_entry *rx_dst;
        struct sock *found;

        if (skb->pkt_type != PACKET_HOST)
                return;

        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
                return;

        ip6h = ipv6_hdr(skb);
        tcph = tcp_hdr(skb);

        if (tcph->doff < sizeof(struct tcphdr) / 4)
                return;

        /* Note : We use inet6_iif() here, not tcp_v6_iif() */
        found = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
                                           &ip6h->saddr, tcph->source,
                                           &ip6h->daddr, ntohs(tcph->dest),
                                           inet6_iif(skb), inet6_sdif(skb));
        if (!found)
                return;

        skb->sk = found;
        skb->destructor = sock_edemux;

        /* Only full sockets have a cached rx dst to reuse */
        if (!sk_fullsock(found))
                return;

        rx_dst = rcu_dereference(found->sk_rx_dst);
        if (rx_dst)
                rx_dst = dst_check(rx_dst, found->sk_rx_dst_cookie);
        if (rx_dst && found->sk_rx_dst_ifindex == skb->skb_iif)
                skb_dst_set_noref(skb, rx_dst);
}

/* Time-wait socket operations for TCPv6: the object size of a
 * struct tcp6_timewait_sock and the destructor to run on it.
 */
static struct timewait_sock_ops tcp6_timewait_sock_ops = {
        .twsk_obj_size        = sizeof(struct tcp6_timewait_sock),
        .twsk_destructor = tcp_twsk_destructor,
};

/* Run __tcp_v6_send_check() on @skb using the receive-source and
 * destination IPv6 addresses stored in @sk.
 */
INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)
{
        const struct in6_addr *saddr = &sk->sk_v6_rcv_saddr;
        const struct in6_addr *daddr = &sk->sk_v6_daddr;

        __tcp_v6_send_check(skb, saddr, daddr);
}

/* Address-family operations for TCP sockets speaking native IPv6.
 * Installed on new sockets by tcp_v6_init_sock().
 */
const struct inet_connection_sock_af_ops ipv6_specific = {
        .queue_xmit           = inet6_csk_xmit,
        .send_check           = tcp_v6_send_check,
        .rebuild_header           = inet6_sk_rebuild_header,
        .sk_rx_dst_set           = inet6_sk_rx_dst_set,
        .conn_request           = tcp_v6_conn_request,
        .syn_recv_sock           = tcp_v6_syn_recv_sock,
        .net_header_len           = sizeof(struct ipv6hdr),
        .setsockopt           = ipv6_setsockopt,
        .getsockopt           = ipv6_getsockopt,
        .addr2sockaddr           = inet6_csk_addr2sockaddr,
        .sockaddr_len           = sizeof(struct sockaddr_in6),
        .mtu_reduced           = tcp_v6_mtu_reduced,
};

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/* TCP-MD5 / TCP-AO callbacks for native IPv6 sockets; each group is only
 * present when the corresponding config option is enabled.
 * Installed on new sockets by tcp_v6_init_sock().
 */
static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
#ifdef CONFIG_TCP_MD5SIG
        .md5_lookup        =        tcp_v6_md5_lookup,
        .calc_md5_hash        =        tcp_v6_md5_hash_skb,
        .md5_parse        =        tcp_v6_parse_md5_keys,
#endif
#ifdef CONFIG_TCP_AO
        .ao_lookup        =        tcp_v6_ao_lookup,
        .calc_ao_hash        =        tcp_v6_ao_hash_skb,
        .ao_parse        =        tcp_v6_parse_ao,
        .ao_calc_key_sk        =        tcp_v6_ao_calc_key_sk,
#endif
};
#endif

/*
 * TCP over IPv4 via the INET6 API.
 *
 * Transmit, header-rebuild, rx-dst and MTU hooks are the IPv4 ones and the
 * network header length is that of struct iphdr, while socket options and
 * sockaddr handling stay on the IPv6 side.
 */
static const struct inet_connection_sock_af_ops ipv6_mapped = {
        /* transmit path */
        .queue_xmit     = ip_queue_xmit,
        .send_check     = tcp_v4_send_check,
        .rebuild_header = inet_sk_rebuild_header,
        .sk_rx_dst_set  = inet_sk_rx_dst_set,

        /* passive open */
        .conn_request   = tcp_v6_conn_request,
        .syn_recv_sock  = tcp_v6_syn_recv_sock,

        /* header size and socket options */
        .net_header_len = sizeof(struct iphdr),
        .setsockopt     = ipv6_setsockopt,
        .getsockopt     = ipv6_getsockopt,

        /* sockaddr conversion */
        .addr2sockaddr  = inet6_csk_addr2sockaddr,
        .sockaddr_len   = sizeof(struct sockaddr_in6),

        .mtu_reduced    = tcp_v4_mtu_reduced,
};

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/*
 * Segment-authentication callbacks for the IPv4-over-INET6 case: the
 * hashing and MD5 lookup helpers are the IPv4 ones, while option parsing
 * and the AO lookup remain the IPv6 variants.
 */
static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
#ifdef CONFIG_TCP_MD5SIG
        .md5_lookup     = tcp_v4_md5_lookup,
        .calc_md5_hash  = tcp_v4_md5_hash_skb,
        .md5_parse      = tcp_v6_parse_md5_keys,
#endif
#ifdef CONFIG_TCP_AO
        .ao_lookup      = tcp_v6_ao_lookup,
        .calc_ao_hash   = tcp_v4_ao_hash_skb,
        .ao_parse       = tcp_v6_parse_ao,
        .ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
#endif
};
#endif

/* NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 */
static int tcp_v6_init_sock(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        tcp_init_sock(sk);

        icsk->icsk_af_ops = &ipv6_specific;

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
        tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
#endif

        return 0;
}

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCPv6 sock list dumping. */
/*
 * Print one /proc "tcp6" row for a pending connection request.
 *
 * @seq: output seq_file
 * @req: request sock being dumped
 * @i:   row index printed in the first column
 *
 * The state column is always TCP_SYN_RECV; the queue columns and the
 * inode are printed as 0, and the timer column is fixed at 1.
 */
static void get_openreq6(struct seq_file *seq,
                         const struct request_sock *req, int i)
{
        const struct in6_addr *laddr = &inet_rsk(req)->ir_v6_loc_addr;
        const struct in6_addr *raddr = &inet_rsk(req)->ir_v6_rmt_addr;
        long remaining = req->rsk_timer.expires - jiffies;

        /* An already-expired timer is reported as zero time left. */
        if (remaining < 0)
                remaining = 0;

        seq_printf(seq,
                   "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
                   "%02X %08X:%08X %02X:%08lX %08X %5u %8d %d %d %pK\n",
                   i,
                   laddr->s6_addr32[0], laddr->s6_addr32[1],
                   laddr->s6_addr32[2], laddr->s6_addr32[3],
                   inet_rsk(req)->ir_num,
                   raddr->s6_addr32[0], raddr->s6_addr32[1],
                   raddr->s6_addr32[2], raddr->s6_addr32[3],
                   ntohs(inet_rsk(req)->ir_rmt_port),
                   TCP_SYN_RECV,
                   0, 0,        /* tx/rx queue: option size is af dependent, so print 0 */
                   1,           /* only the expire timer is active */
                   jiffies_to_clock_t(remaining),
                   req->num_timeout,
                   from_kuid_munged(seq_user_ns(seq),
                                    sock_i_uid(req->rsk_listener)),
                   0,           /* non standard timer */
                   0,           /* open requests have no inode */
                   0, req);
}

/*
 * Print one /proc "tcp6" row for a full TCP socket.
 *
 * @seq: output seq_file
 * @sp:  socket being dumped (read without taking the socket lock, see below)
 * @i:   row index printed in the first column
 *
 * The leading columns follow the header emitted by tcp6_seq_show(); the
 * values after the socket pointer (rto, ato, quick/pingpong, cwnd,
 * ssthresh or fastopen queue limit) have no header label.
 */
static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
{
        const struct in6_addr *dest, *src;
        __u16 destp, srcp;
        int timer_active;
        unsigned long timer_expires;
        const struct inet_sock *inet = inet_sk(sp);
        const struct tcp_sock *tp = tcp_sk(sp);
        const struct inet_connection_sock *icsk = inet_csk(sp);
        const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
        int rx_queue;
        int state;

        dest  = &sp->sk_v6_daddr;
        src   = &sp->sk_v6_rcv_saddr;
        destp = ntohs(inet->inet_dport);
        srcp  = ntohs(inet->inet_sport);

        /*
         * Encode which timer is pending in the "tr" column:
         *   1 - retransmit, reordering timeout or loss probe
         *   4 - zero window probe
         *   2 - sk_timer pending
         *   0 - none (expiry reported as "now")
         */
        if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
            icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
                timer_active        = 1;
                timer_expires        = icsk->icsk_timeout;
        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
                timer_active        = 4;
                timer_expires        = icsk->icsk_timeout;
        } else if (timer_pending(&sp->sk_timer)) {
                timer_active        = 2;
                timer_expires        = sp->sk_timer.expires;
        } else {
                timer_active        = 0;
                timer_expires = jiffies;
        }

        /* Listeners report the accept backlog in the rx_queue column. */
        state = inet_sk_state_load(sp);
        if (state == TCP_LISTEN)
                rx_queue = READ_ONCE(sp->sk_ack_backlog);
        else
                /* Because we don't lock the socket,
                 * we might find a transient negative value.
                 */
                rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
                                      READ_ONCE(tp->copied_seq), 0);

        seq_printf(seq,
                   "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
                   "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n",
                   i,
                   src->s6_addr32[0], src->s6_addr32[1],
                   src->s6_addr32[2], src->s6_addr32[3], srcp,
                   dest->s6_addr32[0], dest->s6_addr32[1],
                   dest->s6_addr32[2], dest->s6_addr32[3], destp,
                   state,
                   /* tx_queue: bytes written but not yet acknowledged */
                   READ_ONCE(tp->write_seq) - tp->snd_una,
                   rx_queue,
                   timer_active,
                   jiffies_delta_to_clock_t(timer_expires - jiffies),
                   icsk->icsk_retransmits,
                   from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
                   icsk->icsk_probes_out,
                   sock_i_ino(sp),
                   refcount_read(&sp->sk_refcnt), sp,
                   jiffies_to_clock_t(icsk->icsk_rto),
                   jiffies_to_clock_t(icsk->icsk_ack.ato),
                   (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sp),
                   tcp_snd_cwnd(tp),
                   /* last column: fastopen queue limit for listeners,
                    * otherwise ssthresh, or -1 while in initial slow start
                    */
                   state == TCP_LISTEN ?
                        fastopenq->max_qlen :
                        (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)
                   );
}

/*
 * Print one /proc "tcp6" row for a TIME_WAIT socket.
 *
 * @seq: output seq_file
 * @tw:  timewait sock being dumped
 * @i:   row index printed in the first column
 *
 * The state column carries tw_substate, the timer column is fixed at 3,
 * and the queue, retransmit, uid, timeout and inode columns are 0.
 */
static void get_timewait6_sock(struct seq_file *seq,
                               struct inet_timewait_sock *tw, int i)
{
        const struct in6_addr *laddr = &tw->tw_v6_rcv_saddr;
        const struct in6_addr *raddr = &tw->tw_v6_daddr;
        __u16 lport = ntohs(tw->tw_sport);
        __u16 rport = ntohs(tw->tw_dport);
        long remaining = tw->tw_timer.expires - jiffies;

        seq_printf(seq,
                   "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
                   "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n",
                   i,
                   laddr->s6_addr32[0], laddr->s6_addr32[1],
                   laddr->s6_addr32[2], laddr->s6_addr32[3], lport,
                   raddr->s6_addr32[0], raddr->s6_addr32[1],
                   raddr->s6_addr32[2], raddr->s6_addr32[3], rport,
                   tw->tw_substate,
                   0, 0,                                /* tx/rx queue */
                   3, jiffies_delta_to_clock_t(remaining),
                   0, 0, 0, 0,
                   refcount_read(&tw->tw_refcnt), tw);
}

/*
 * seq_file ->show callback for /proc "tcp6".
 *
 * Emits the column header for SEQ_START_TOKEN; otherwise picks the row
 * formatter from sk_state (TIME_WAIT, NEW_SYN_RECV, or anything else for
 * a full socket).  Always returns 0.
 */
static int tcp6_seq_show(struct seq_file *seq, void *v)
{
        struct tcp_iter_state *iter;
        struct sock *sk;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "  sl  "
                         "local_address                         "
                         "remote_address                        "
                         "st tx_queue rx_queue tr tm->when retrnsmt"
                         "   uid  timeout inode\n");
                return 0;
        }

        sk = v;
        iter = seq->private;

        if (sk->sk_state == TCP_TIME_WAIT)
                get_timewait6_sock(seq, v, iter->num);
        else if (sk->sk_state == TCP_NEW_SYN_RECV)
                get_openreq6(seq, v, iter->num);
        else
                get_tcp6_sock(seq, v, iter->num);

        return 0;
}

/*
 * seq_file operations for /proc "tcp6": iteration uses the shared
 * tcp_seq_* helpers, only ->show is IPv6 specific.
 */
static const struct seq_operations tcp6_seq_ops = {
        .start  = tcp_seq_start,
        .next   = tcp_seq_next,
        .stop   = tcp_seq_stop,
        .show   = tcp6_seq_show,
};

/*
 * Address-family descriptor passed as private data when the "tcp6" proc
 * entry is created (see tcp6_proc_init()).
 */
static struct tcp_seq_afinfo tcp6_seq_afinfo = {
        .family                = AF_INET6,
};

/*
 * Create the read-only (0444) "tcp6" entry under @net's proc directory.
 *
 * Returns 0 on success, -ENOMEM if the entry could not be created.
 */
int __net_init tcp6_proc_init(struct net *net)
{
        if (proc_create_net_data("tcp6", 0444, net->proc_net, &tcp6_seq_ops,
                                 sizeof(struct tcp_iter_state),
                                 &tcp6_seq_afinfo))
                return 0;

        return -ENOMEM;
}

/* Remove the "tcp6" entry created by tcp6_proc_init() from @net's proc directory. */
void tcp6_proc_exit(struct net *net)
{
        remove_proc_entry("tcp6", net->proc_net);
}
#endif

/*
 * Protocol descriptor for TCP over IPv6.
 *
 * Most callbacks are the address-family independent tcp_* / inet_csk_*
 * helpers; only pre_connect, connect, init, backlog_rcv and hash are IPv6
 * specific.  Memory accounting and sysctl limits point at the same
 * tcp_* / ipv4.sysctl_tcp_* objects rather than at IPv6-private copies.
 */
struct proto tcpv6_prot = {
        .name                   = "TCPv6",
        .owner                  = THIS_MODULE,

        /* connection lifecycle */
        .close                  = tcp_close,
        .pre_connect            = tcp_v6_pre_connect,
        .connect                = tcp_v6_connect,
        .disconnect             = tcp_disconnect,
        .accept                 = inet_csk_accept,
        .ioctl                  = tcp_ioctl,
        .init                   = tcp_v6_init_sock,
        .destroy                = tcp_v4_destroy_sock,
        .shutdown               = tcp_shutdown,

        /* socket options */
        .setsockopt             = tcp_setsockopt,
        .getsockopt             = tcp_getsockopt,
        .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
        .keepalive              = tcp_set_keepalive,

        /* data path */
        .recvmsg                = tcp_recvmsg,
        .sendmsg                = tcp_sendmsg,
        .splice_eof             = tcp_splice_eof,
        .backlog_rcv            = tcp_v6_do_rcv,
        .release_cb             = tcp_release_cb,

        /* hashing and port management */
        .hash                   = inet6_hash,
        .unhash                 = inet_unhash,
        .get_port               = inet_csk_get_port,
        .put_port               = inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
        .psock_update_sk_prot   = tcp_bpf_update_proto,
#endif

        /* memory accounting */
        .enter_memory_pressure  = tcp_enter_memory_pressure,
        .leave_memory_pressure  = tcp_leave_memory_pressure,
        .stream_memory_free     = tcp_stream_memory_free,
        .sockets_allocated      = &tcp_sockets_allocated,

        .memory_allocated       = &tcp_memory_allocated,
        .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,

        .memory_pressure        = &tcp_memory_pressure,
        .orphan_count           = &tcp_orphan_count,
        .sysctl_mem             = sysctl_tcp_mem,
        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),

        /* object layout */
        .max_header             = MAX_TCP_HEADER,
        .obj_size               = sizeof(struct tcp6_sock),
        .ipv6_pinfo_offset      = offsetof(struct tcp6_sock, inet6),
        .slab_flags             = SLAB_TYPESAFE_BY_RCU,
        .twsk_prot              = &tcp6_timewait_sock_ops,
        .rsk_prot               = &tcp6_request_sock_ops,

        .h.hashinfo             = NULL,
        .no_autobind            = true,
        .diag_destroy           = tcp_abort,
};
EXPORT_SYMBOL_GPL(tcpv6_prot);


/*
 * Protocol switch entry binding (SOCK_STREAM, IPPROTO_TCP) on the inet6
 * family to tcpv6_prot and the inet6 stream socket operations.
 */
static struct inet_protosw tcpv6_protosw = {
        .type           = SOCK_STREAM,
        .protocol       = IPPROTO_TCP,
        .prot           = &tcpv6_prot,
        .ops            = &inet6_stream_ops,
        .flags          = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK,
};

/*
 * Per-namespace init: create the raw PF_INET6/IPPROTO_TCP control socket
 * stored in net->ipv6.tcp_sk and set its clock id to CLOCK_MONOTONIC.
 *
 * Returns 0 on success or the error from inet_ctl_sock_create().
 */
static int __net_init tcpv6_net_init(struct net *net)
{
        int err = inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6,
                                       SOCK_RAW, IPPROTO_TCP, net);

        if (err)
                return err;

        net->ipv6.tcp_sk->sk_clockid = CLOCK_MONOTONIC;
        return 0;
}

/* Per-namespace exit: destroy the control socket created by tcpv6_net_init(). */
static void __net_exit tcpv6_net_exit(struct net *net)
{
        inet_ctl_sock_destroy(net->ipv6.tcp_sk);
}

/* Per-network-namespace hooks registered by tcpv6_init(). */
static struct pernet_operations tcpv6_net_ops = {
        .exit   = tcpv6_net_exit,
        .init   = tcpv6_net_init,
};

/*
 * Register TCPv6 with the IPv6 stack.
 *
 * Steps, in order: install the inet6 protocol handler, register the
 * protosw entry, register the per-namespace operations, then call
 * mptcpv6_init().  If a step fails, the steps already completed are
 * undone in reverse order and the error is returned.
 *
 * Returns 0 on success or a negative error code.
 */
int __init tcpv6_init(void)
{
        int err;

        net_hotdata.tcpv6_protocol = (struct inet6_protocol) {
                .handler     = tcp_v6_rcv,
                .err_handler = tcp_v6_err,
                .flags       = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
        };
        err = inet6_add_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP);
        if (err)
                return err;

        /* register inet6 protocol */
        err = inet6_register_protosw(&tcpv6_protosw);
        if (err)
                goto err_del_protocol;

        err = register_pernet_subsys(&tcpv6_net_ops);
        if (err)
                goto err_unregister_protosw;

        err = mptcpv6_init();
        if (err)
                goto err_unregister_pernet;

        return 0;

err_unregister_pernet:
        unregister_pernet_subsys(&tcpv6_net_ops);
err_unregister_protosw:
        inet6_unregister_protosw(&tcpv6_protosw);
err_del_protocol:
        inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP);
        return err;
}

/*
 * Undo the registrations made by tcpv6_init(), in reverse order:
 * per-namespace operations, protosw entry, then the inet6 protocol handler.
 *
 * NOTE(review): nothing here pairs with the mptcpv6_init() call in
 * tcpv6_init() - confirm no teardown is required for it.
 */
void tcpv6_exit(void)
{
        unregister_pernet_subsys(&tcpv6_net_ops);
        inet6_unregister_protosw(&tcpv6_protosw);
        inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP);
}












































































































































































    1 













    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 Nokia, Inc.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 *
 * These are the definitions needed for the sctp_ulpevent type.  The
 * sctp_ulpevent type is used to carry information from the state machine
 * upwards to the ULP.
 *
 * This file is part of the SCTP kernel implementation
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *   Jon Grimm             <jgrimm@us.ibm.com>
 *   La Monte H.P. Yarroll <piggy@acm.org>
 *   Karl Knutson          <karl@athena.chicago.il.us>
 *   Sridhar Samudrala     <sri@us.ibm.com>
 */

#ifndef __sctp_ulpevent_h__
#define __sctp_ulpevent_h__

/* A structure to carry information to the ULP (e.g. Sockets API) */
/* Warning: This sits inside an skb->cb[] area.  Be very careful of
 * growing this structure as it is at the maximum limit now.
 *
 * sctp_ulpevent is stored in skb->cb (48 bytes), whose last 4 bytes
 * are already taken by sock_skb_cb, so the structure has to be 'packed'
 * to fit into the remaining 44 bytes.
 */
struct sctp_ulpevent {
        struct sctp_association *asoc;
        struct sctp_chunk *chunk;
        unsigned int rmem_len;
        /* mid and ssn share storage; only one is meaningful for an event.
         * NOTE(review): presumably selected by whether message IDs or
         * stream sequence numbers are in use - confirm.
         */
        union {
                __u32 mid;
                __u16 ssn;
        };
        /* ppid and fsn share storage in the same way. */
        union {
                __u32 ppid;
                __u32 fsn;
        };
        __u32 tsn;
        __u32 cumtsn;
        __u16 stream;
        __u16 flags;
        __u16 msg_flags;
} __packed;

/* Retrieve the skb this event sits inside of.
 *
 * @ev must point at the cb[] area of a struct sk_buff (as returned by
 * sctp_skb2event()); container_of() walks back from that member to the
 * enclosing skb.  The const qualifier is dropped via the (void *) cast.
 */
static inline struct sk_buff *sctp_event2skb(const struct sctp_ulpevent *ev)
{
        return container_of((void *)ev, struct sk_buff, cb);
}

/* View the control buffer (cb[]) of @skb as the sctp_ulpevent stored in it. */
static inline struct sctp_ulpevent *sctp_skb2event(struct sk_buff *skb)
{
        void *cb = skb->cb;

        return cb;
}

/* Event release, classification and queue purge; defined outside this header. */
void sctp_ulpevent_free(struct sctp_ulpevent *);
int sctp_ulpevent_is_notification(const struct sctp_ulpevent *);
unsigned int sctp_queue_purge_ulpevents(struct sk_buff_head *list);

/*
 * Event builders.  Each sctp_ulpevent_make_*() below returns a
 * struct sctp_ulpevent pointer; those taking a gfp_t use it for their
 * allocations (presumably returning NULL on failure - confirm against
 * the definitions).
 */
struct sctp_ulpevent *sctp_ulpevent_make_assoc_change(
        const struct sctp_association *asoc,
        __u16 flags,
        __u16 state,
        __u16 error,
        __u16 outbound,
        __u16 inbound,
        struct sctp_chunk *chunk,
        gfp_t gfp);

void sctp_ulpevent_notify_peer_addr_change(struct sctp_transport *transport,
                                           int state, int error);

struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
        const struct sctp_association *asoc,
        struct sctp_chunk *chunk,
        __u16 flags,
        gfp_t gfp);
struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
        const struct sctp_association *asoc,
        struct sctp_chunk *chunk,
        __u16 flags,
        __u32 error,
        gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_send_failed_event(
        const struct sctp_association *asoc,
        struct sctp_chunk *chunk,
        __u16 flags,
        __u32 error,
        gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
        const struct sctp_association *asoc,
        __u16 flags,
        gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
        const struct sctp_association *asoc,
        __u32 indication, __u32 sid, __u32 seq,
        __u32 flags, gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_adaptation_indication(
        const struct sctp_association *asoc, gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
        struct sctp_chunk *chunk,
        gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_authkey(
        const struct sctp_association *asoc, __u16 key_id,
        __u32 indication, gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event(
        const struct sctp_association *asoc, gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event(
        const struct sctp_association *asoc, __u16 flags,
        __u16 stream_num, __be16 *stream_list, gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_assoc_reset_event(
        const struct sctp_association *asoc, __u16 flags,
         __u32 local_tsn, __u32 remote_tsn, gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_stream_change_event(
        const struct sctp_association *asoc, __u16 flags,
        __u32 strchange_instrms, __u32 strchange_outstrms, gfp_t gfp);

/* Builds one event from the fragments f_frag..l_frag taken from @queue. */
struct sctp_ulpevent *sctp_make_reassembled_event(
        struct net *net, struct sk_buff_head *queue,
        struct sk_buff *f_frag, struct sk_buff *l_frag);

/* Readers that take an event and a msghdr (presumably filling ancillary
 * data for the receiver - confirm against the definitions).
 */
void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event,
                                   struct msghdr *);
void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event,
                                struct msghdr *);
void sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event,
                                struct msghdr *, struct sock *sk);

/* Returns the notification type (an SCTP_SN_TYPE_* style value) of @event. */
__u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event);

/*
 * Enable or disable one notification type in a subscription bitmask.
 *
 * @subscribe: bitmask to update; bit (sn_type - SCTP_SN_TYPE_BASE) is used
 * @sn_type:   notification type, valid in [SCTP_SN_TYPE_BASE, SCTP_SN_TYPE_MAX]
 * @on:        non-zero sets the bit, zero clears it
 *
 * Out-of-range types are ignored.  The lower bound matters as much as the
 * upper one: a type below SCTP_SN_TYPE_BASE would otherwise produce a
 * negative shift count, which is undefined behaviour.
 */
static inline void sctp_ulpevent_type_set(__u16 *subscribe,
                                          __u16 sn_type, __u8 on)
{
        __u16 mask;

        if (sn_type < SCTP_SN_TYPE_BASE || sn_type > SCTP_SN_TYPE_MAX)
                return;

        mask = 1U << (sn_type - SCTP_SN_TYPE_BASE);

        if (on)
                *subscribe |= mask;
        else
                *subscribe &= ~mask;
}

/*
 * Is this event type enabled?
 *
 * @subscribe: subscription bitmask; bit (sn_type - SCTP_SN_TYPE_BASE) is tested
 * @sn_type:   notification type, valid in [SCTP_SN_TYPE_BASE, SCTP_SN_TYPE_MAX]
 *
 * Returns true when the bit for @sn_type is set.  Types outside the valid
 * range are reported as disabled; rejecting values below SCTP_SN_TYPE_BASE
 * avoids shifting by a negative count, which is undefined behaviour.
 */
static inline bool sctp_ulpevent_type_enabled(__u16 subscribe, __u16 sn_type)
{
        if (sn_type < SCTP_SN_TYPE_BASE || sn_type > SCTP_SN_TYPE_MAX)
                return false;

        return subscribe & (1U << (sn_type - SCTP_SN_TYPE_BASE));
}

/*
 * Should @event be delivered under the subscription mask @subscribe?
 *
 * Events that are not notifications are always enabled; notifications are
 * enabled only if their type's bit is set in @subscribe.
 */
static inline bool sctp_ulpevent_is_enabled(const struct sctp_ulpevent *event,
                                            __u16 subscribe)
{
        if (sctp_ulpevent_is_notification(event))
                return sctp_ulpevent_type_enabled(subscribe,
                                sctp_ulpevent_get_notification_type(event));

        return true;
}

#endif /* __sctp_ulpevent_h__ */












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 








    1 































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
11112
11113
11114
11115
11116
11117
11118
11119
11120
11121
11122
11123
11124
11125
11126
11127
11128
11129
11130
11131
11132
11133
11134
11135
11136
11137
11138
11139
11140
11141
11142
11143
11144
11145
11146
11147
11148
11149
11150
11151
11152
11153
11154
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165
11166
11167
11168
11169
11170
11171
11172
11173
11174
11175
11176
11177
11178
11179
11180
11181
11182
11183
11184
11185
11186
11187
11188
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
11202
11203
11204
11205
11206
11207
11208
11209
11210
11211
11212
11213
11214
11215
11216
11217
11218
11219
11220
11221
11222
11223
11224
11225
11226
11227
11228
11229
11230
11231
11232
11233
11234
11235
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245
11246
11247
11248
11249
11250
11251
11252
11253
11254
11255
11256
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343
11344
11345
11346
11347
11348
11349
11350
11351
11352
11353
11354
11355
11356
11357
11358
11359
11360
11361
11362
11363
11364
11365
11366
11367
11368
11369
11370
11371
11372
11373
11374
11375
11376
11377
11378
11379
11380
11381
11382
11383
11384
11385
11386
11387
11388
11389
11390
11391
11392
11393
11394
11395
11396
11397
11398
11399
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409
11410
11411
11412
11413
11414
11415
11416
11417
11418
11419
11420
11421
11422
11423
11424
11425
11426
11427
11428
11429
11430
11431
11432
11433
11434
11435
11436
11437
11438
11439
11440
11441
11442
11443
11444
11445
11446
11447
11448
11449
11450
11451
11452
11453
11454
11455
11456
11457
11458
11459
11460
11461
11462
11463
11464
11465
11466
11467
11468
11469
11470
11471
11472
11473
11474
11475
11476
11477
11478
11479
11480
11481
11482
11483
11484
11485
11486
11487
11488
11489
11490
11491
11492
11493
11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510
11511
11512
11513
11514
11515
11516
11517
11518
11519
11520
11521
11522
11523
11524
11525
11526
11527
11528
11529
11530
11531
11532
11533
11534
11535
11536
11537
11538
11539
11540
11541
11542
11543
11544
11545
11546
11547
11548
11549
11550
11551
11552
11553
11554
11555
11556
11557
11558
11559
11560
11561
11562
11563
11564
11565
11566
11567
11568
11569
11570
11571
11572
11573
11574
11575
11576
11577
11578
11579
11580
11581
11582
11583
11584
11585
11586
11587
11588
11589
11590
11591
11592
11593
11594
11595
11596
11597
11598
11599
11600
11601
11602
11603
11604
11605
11606
11607
11608
11609
11610
11611
11612
11613
11614
11615
11616
11617
11618
11619
11620
11621
11622
11623
11624
11625
11626
11627
11628
11629
11630
11631
11632
11633
11634
11635
11636
11637
11638
11639
11640
11641
11642
11643
11644
11645
11646
11647
11648
11649
11650
11651
11652
11653
11654
11655
11656
11657
11658
11659
11660
11661
11662
11663
11664
11665
11666
11667
11668
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2007-2009 Patrick McHardy <kaber@trash.net>
 *
 * Development of this code funded by Astaro AG (http://www.astaro.com/)
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/vmalloc.h>
#include <linux/rhashtable.h>
#include <linux/audit.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_offload.h>
#include <net/net_namespace.h>
#include <net/sock.h>

/*
 * Module-name space left over once the "nft-expr-255-" prefix is subtracted
 * from MODULE_NAME_LEN; sizeof() on the string literal also counts its
 * terminating NUL.
 */
#define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-"))
/* NOTE(review): presumably the maximum length of an anonymous set name -- confirm at the use sites. */
#define NFT_SET_MAX_ANONLEN 16

/*
 * NOTE(review): by name, the identifier for this subsystem's per-network-
 * namespace data -- confirm where it is registered.
 */
unsigned int nf_tables_net_id __read_mostly;

/* File-scope list heads; LIST_HEAD() defines each one as an empty list. */
static LIST_HEAD(nf_tables_expressions);
static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
static LIST_HEAD(nf_tables_destroy_list);
static LIST_HEAD(nf_tables_gc_list);
/*
 * NOTE(review): by name, each spinlock presumably protects the list it is
 * named after -- confirm at the lock/unlock sites.
 */
static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
static DEFINE_SPINLOCK(nf_tables_gc_list_lock);

/*
 * Values compared against and stored in table->validate_state by
 * nft_validate_state_update().
 */
enum {
        NFT_VALIDATE_SKIP        = 0,
        NFT_VALIDATE_NEED,
        NFT_VALIDATE_DO,
};

/* NOTE(review): presumably set up with nft_objname_ht_params -- confirm at its init site. */
static struct rhltable nft_objname_ht;

/* Forward declarations of the callbacks referenced by nft_chain_ht_params. */
static u32 nft_chain_hash(const void *data, u32 len, u32 seed);
static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed);
static int nft_chain_hash_cmp(struct rhashtable_compare_arg *, const void *);

/* Forward declarations of the callbacks referenced by nft_objname_ht_params. */
static u32 nft_objname_hash(const void *data, u32 len, u32 seed);
static u32 nft_objname_hash_obj(const void *data, u32 len, u32 seed);
static int nft_objname_hash_cmp(struct rhashtable_compare_arg *, const void *);

/*
 * rhashtable parameters for struct nft_chain entries: the hash-list node is
 * the rhlhead member and the key is located at the name member.  Hashing
 * and comparison go through the nft_chain_hash*() callbacks, and automatic
 * shrinking is enabled.
 */
static const struct rhashtable_params nft_chain_ht_params = {
        .head_offset                = offsetof(struct nft_chain, rhlhead),
        .key_offset                = offsetof(struct nft_chain, name),
        .hashfn                        = nft_chain_hash,
        .obj_hashfn                = nft_chain_hash_obj,
        .obj_cmpfn                = nft_chain_hash_cmp,
        .automatic_shrinking        = true,
};

/*
 * rhashtable parameters for struct nft_object entries: the hash-list node is
 * the rhlhead member and the key is located at the key member.  Hashing and
 * comparison go through the nft_objname_hash*() callbacks, and automatic
 * shrinking is enabled.
 */
static const struct rhashtable_params nft_objname_ht_params = {
        .head_offset                = offsetof(struct nft_object, rhlhead),
        .key_offset                = offsetof(struct nft_object, key),
        .hashfn                        = nft_objname_hash,
        .obj_hashfn                = nft_objname_hash_obj,
        .obj_cmpfn                = nft_objname_hash_cmp,
        .automatic_shrinking        = true,
};

/*
 * Record pairing a table with an entry count, an operation code and a list
 * linkage.
 * NOTE(review): the field roles noted below are inferred from names only --
 * confirm against the code that fills this structure in.
 */
struct nft_audit_data {
        /* table this record refers to */
        struct nft_table *table;
        /* presumably the number of entries accounted to this record */
        int entries;
        /* presumably an AUDIT_NFT_OP_* value (see nft2audit_op) -- confirm */
        int op;
        /* linkage so records can be chained on a list */
        struct list_head list;
};

/*
 * Lookup table from an NFT_MSG_* message type to the AUDIT_NFT_OP_* value
 * listed for it.  The GET* entries (other than the two *_RESET variants) and
 * NFT_MSG_TRACE are set to AUDIT_NFT_OP_INVALID.  Any index below NFT_MSG_MAX
 * without an explicit entry is zero-initialised, as for every partially
 * initialised array.
 */
static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types
        [NFT_MSG_NEWTABLE]        = AUDIT_NFT_OP_TABLE_REGISTER,
        [NFT_MSG_GETTABLE]        = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_DELTABLE]        = AUDIT_NFT_OP_TABLE_UNREGISTER,
        [NFT_MSG_NEWCHAIN]        = AUDIT_NFT_OP_CHAIN_REGISTER,
        [NFT_MSG_GETCHAIN]        = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_DELCHAIN]        = AUDIT_NFT_OP_CHAIN_UNREGISTER,
        [NFT_MSG_NEWRULE]        = AUDIT_NFT_OP_RULE_REGISTER,
        [NFT_MSG_GETRULE]        = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_DELRULE]        = AUDIT_NFT_OP_RULE_UNREGISTER,
        [NFT_MSG_NEWSET]        = AUDIT_NFT_OP_SET_REGISTER,
        [NFT_MSG_GETSET]        = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_DELSET]        = AUDIT_NFT_OP_SET_UNREGISTER,
        [NFT_MSG_NEWSETELEM]        = AUDIT_NFT_OP_SETELEM_REGISTER,
        [NFT_MSG_GETSETELEM]        = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_DELSETELEM]        = AUDIT_NFT_OP_SETELEM_UNREGISTER,
        [NFT_MSG_NEWGEN]        = AUDIT_NFT_OP_GEN_REGISTER,
        [NFT_MSG_GETGEN]        = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_TRACE]                = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_NEWOBJ]        = AUDIT_NFT_OP_OBJ_REGISTER,
        [NFT_MSG_GETOBJ]        = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_DELOBJ]        = AUDIT_NFT_OP_OBJ_UNREGISTER,
        [NFT_MSG_GETOBJ_RESET]        = AUDIT_NFT_OP_OBJ_RESET,
        [NFT_MSG_NEWFLOWTABLE]        = AUDIT_NFT_OP_FLOWTABLE_REGISTER,
        [NFT_MSG_GETFLOWTABLE]        = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_DELFLOWTABLE]        = AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
        [NFT_MSG_GETSETELEM_RESET] = AUDIT_NFT_OP_SETELEM_RESET,
};

/*
 * Move @table to @new_validate_state.
 *
 * A table that is already in NFT_VALIDATE_DO is never moved back to
 * NFT_VALIDATE_NEED; that request is silently ignored.  Going straight from
 * NFT_VALIDATE_SKIP to NFT_VALIDATE_DO is flagged with a one-time warning but
 * still applied.
 */
static void nft_validate_state_update(struct nft_table *table, u8 new_validate_state)
{
        const u8 cur_state = table->validate_state;

        if (cur_state == NFT_VALIDATE_DO &&
            new_validate_state == NFT_VALIDATE_NEED)
                return;

        if (cur_state == NFT_VALIDATE_SKIP)
                WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO);

        table->validate_state = new_validate_state;
}
/*
 * Work items declared up front.  Each handler is only forward-declared here
 * so that the work item bound to it can be declared before the handler body.
 */

/* Work item handled by nf_tables_trans_destroy_work(). */
static void nf_tables_trans_destroy_work(struct work_struct *w);
static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work);

/* Work item handled by nft_trans_gc_work(). */
static void nft_trans_gc_work(struct work_struct *work);
static DECLARE_WORK(trans_gc_work, nft_trans_gc_work);

/*
 * Fill in @ctx for the request carried by @skb and @nlh.
 *
 * @net, @family, @table, @chain and @nla are stored unchanged, ctx->level is
 * reset to zero, and the netlink identifiers (sender portid, report flag,
 * message flags and sequence number) are taken from the socket buffer's
 * control block and from the netlink header.  Only the fields listed here
 * are written.
 */
static void nft_ctx_init(struct nft_ctx *ctx,
                         struct net *net,
                         const struct sk_buff *skb,
                         const struct nlmsghdr *nlh,
                         u8 family,
                         struct nft_table *table,
                         struct nft_chain *chain,
                         const struct nlattr * const *nla)
{
        /* Netlink request metadata. */
        ctx->portid = NETLINK_CB(skb).portid;
        ctx->seq = nlh->nlmsg_seq;
        ctx->flags = nlh->nlmsg_flags;
        ctx->report = nlmsg_report(nlh);

        /* Objects the request operates on. */
        ctx->net = net;
        ctx->family = family;
        ctx->table = table;
        ctx->chain = chain;
        ctx->nla = nla;

        ctx->level = 0;
}

/*
 * Allocate a zeroed transaction object with @size extra bytes after
 * struct nft_trans, using the allocation flags in @gfp.
 *
 * The new object records @msg_type, holds a copy of *@ctx, and has both of
 * its list heads initialised (nft_trans_list_del() unlinks both of them
 * unconditionally).
 *
 * Returns the new transaction, or NULL if the allocation failed.
 */
static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
                                             int msg_type, u32 size, gfp_t gfp)
{
        struct nft_trans *trans = kzalloc(sizeof(*trans) + size, gfp);

        if (!trans)
                return NULL;

        trans->ctx = *ctx;
        trans->msg_type = msg_type;
        INIT_LIST_HEAD(&trans->list);
        INIT_LIST_HEAD(&trans->binding_list);

        return trans;
}

/*
 * Convenience wrapper around nft_trans_alloc_gfp() that always allocates
 * with GFP_KERNEL.  Returns NULL on allocation failure.
 */
static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
                                         int msg_type, u32 size)
{
        return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL);
}

/*
 * Unlink @trans from both lists it can sit on: the one linked through
 * ->list and the one linked through ->binding_list.  Both heads are
 * initialised at allocation time in nft_trans_alloc_gfp().
 */
static void nft_trans_list_del(struct nft_trans *trans)
{
        list_del(&trans->list);
        list_del(&trans->binding_list);
}

/*
 * Unlink @trans from its lists and free it.  Only the transaction object
 * itself is freed here; nothing it points to is released.
 */
static void nft_trans_destroy(struct nft_trans *trans)
{
        nft_trans_list_del(trans);
        kfree(trans);
}

/*
 * Set or clear the "bound" marker on every pending transaction that refers
 * to the anonymous set @set.
 *
 * The commit list is walked from newest to oldest.  NEWSET transactions for
 * @set and NEWSETELEM transactions targeting @set get their bound flag set
 * to @bind.  Sets that are not anonymous are left alone.
 */
static void __nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set,
                                 bool bind)
{
        struct nftables_pernet *nft_net;
        struct nft_trans *trans;

        if (!nft_set_is_anonymous(set))
                return;

        nft_net = nft_pernet(ctx->net);
        list_for_each_entry_reverse(trans, &nft_net->commit_list, list) {
                if (trans->msg_type == NFT_MSG_NEWSET) {
                        if (nft_trans_set(trans) == set)
                                nft_trans_set_bound(trans) = bind;
                } else if (trans->msg_type == NFT_MSG_NEWSETELEM) {
                        if (nft_trans_elem_set(trans) == set)
                                nft_trans_elem_set_bound(trans) = bind;
                }
        }
}

/* Mark the pending transactions that refer to anonymous set @set as bound. */
static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set)
{
        __nft_set_trans_bind(ctx, set, true);
}

/* Mark the pending transactions that refer to anonymous set @set as unbound. */
static void nft_set_trans_unbind(const struct nft_ctx *ctx, struct nft_set *set)
{
        __nft_set_trans_bind(ctx, set, false);
}

/*
 * Set or clear the "bound" marker on every pending transaction that refers
 * to the binding chain @chain.
 *
 * The commit list is walked from newest to oldest.  NEWCHAIN transactions
 * for @chain and NEWRULE transactions whose context chain is @chain get
 * their bound flag set to @bind.  Chains for which nft_chain_binding() is
 * false are left alone.
 */
static void __nft_chain_trans_bind(const struct nft_ctx *ctx,
                                   struct nft_chain *chain, bool bind)
{
        struct nftables_pernet *nft_net;
        struct nft_trans *trans;

        if (!nft_chain_binding(chain))
                return;

        nft_net = nft_pernet(ctx->net);
        list_for_each_entry_reverse(trans, &nft_net->commit_list, list) {
                if (trans->msg_type == NFT_MSG_NEWCHAIN) {
                        if (nft_trans_chain(trans) == chain)
                                nft_trans_chain_bound(trans) = bind;
                } else if (trans->msg_type == NFT_MSG_NEWRULE) {
                        if (trans->ctx.chain == chain)
                                nft_trans_rule_bound(trans) = bind;
                }
        }
}

/* Mark the pending transactions that refer to binding chain @chain as bound. */
static void nft_chain_trans_bind(const struct nft_ctx *ctx,
                                 struct nft_chain *chain)
{
        __nft_chain_trans_bind(ctx, chain, true);
}

/*
 * Bind @chain to the context in @ctx.
 *
 * Chains for which nft_chain_binding() is false need no binding and yield 0.
 * Otherwise:
 *   -EOPNOTSUPP  the context chain (ctx->chain) is itself a binding chain
 *   -EBUSY       @chain is already bound
 *   -EMFILE      the use counter of @chain could not be incremented
 *
 * On success the use counter has been incremented, chain->bound is set and
 * the pending transactions for @chain are marked as bound.
 */
int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain)
{
        if (nft_chain_binding(chain)) {
                if (nft_chain_binding(ctx->chain))
                        return -EOPNOTSUPP;

                if (chain->bound)
                        return -EBUSY;

                if (!nft_use_inc(&chain->use))
                        return -EMFILE;

                chain->bound = true;
                nft_chain_trans_bind(ctx, chain);
        }

        return 0;
}

/*
 * Mark the pending transactions that refer to binding chain @chain as
 * unbound.  Unlike nf_tables_bind_chain(), this touches neither
 * chain->bound nor the chain's use counter.
 */
void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain)
{
        __nft_chain_trans_bind(ctx, chain, false);
}

/*
 * Register every hook on @hook_list with @net.
 *
 * If one registration fails, the hooks registered so far (the leading
 * entries of the list) are unregistered again and the error is returned.
 * Returns 0 once all hooks are registered.
 */
static int nft_netdev_register_hooks(struct net *net,
                                     struct list_head *hook_list)
{
        struct nft_hook *hook;
        int registered = 0;
        int err;

        list_for_each_entry(hook, hook_list, list) {
                err = nf_register_net_hook(net, &hook->ops);
                if (err < 0)
                        goto err_register;

                registered++;
        }
        return 0;

err_register:
        /* Roll back only the entries that were registered successfully. */
        list_for_each_entry(hook, hook_list, list) {
                if (registered <= 0)
                        break;
                registered--;

                nf_unregister_net_hook(net, &hook->ops);
        }
        return err;
}

/*
 * Unregister every hook on @hook_list from @net.
 *
 * With @release_netdev set, each hook is also unlinked from the list and
 * freed via kfree_rcu(); otherwise the list is left intact.
 */
static void nft_netdev_unregister_hooks(struct net *net,
                                        struct list_head *hook_list,
                                        bool release_netdev)
{
        struct nft_hook *hook, *tmp;

        /* The _safe iterator is needed because entries may be unlinked. */
        list_for_each_entry_safe(hook, tmp, hook_list, list) {
                nf_unregister_net_hook(net, &hook->ops);
                if (!release_netdev)
                        continue;

                list_del(&hook->list);
                kfree_rcu(hook, rcu);
        }
}

/*
 * Register the netfilter hook(s) of @chain.
 *
 * Nothing is done (and 0 is returned) when @table is dormant or @chain is
 * not a base chain.  Otherwise registration goes through, in order of
 * preference: the chain type's own ops_register() callback, the per-device
 * hook list for netdev base chains, or a plain nf_register_net_hook().
 */
static int nf_tables_register_hook(struct net *net,
                                   const struct nft_table *table,
                                   struct nft_chain *chain)
{
        struct nft_base_chain *basechain;
        const struct nf_hook_ops *ops;

        if (table->flags & NFT_TABLE_F_DORMANT)
                return 0;
        if (!nft_is_base_chain(chain))
                return 0;

        basechain = nft_base_chain(chain);
        ops = &basechain->ops;

        if (basechain->type->ops_register)
                return basechain->type->ops_register(net, ops);

        if (!nft_base_chain_netdev(table->family, basechain->ops.hooknum))
                return nf_register_net_hook(net, &basechain->ops);

        return nft_netdev_register_hooks(net, &basechain->hook_list);
}

/*
 * Unregister the netfilter hook(s) of @chain; counterpart of
 * nf_tables_register_hook().
 *
 * Nothing is done when @table is dormant or @chain is not a base chain.
 * @release_netdev is only meaningful for netdev base chains, where it is
 * forwarded to nft_netdev_unregister_hooks().
 */
static void __nf_tables_unregister_hook(struct net *net,
                                        const struct nft_table *table,
                                        struct nft_chain *chain,
                                        bool release_netdev)
{
        struct nft_base_chain *basechain;
        const struct nf_hook_ops *ops;

        if (table->flags & NFT_TABLE_F_DORMANT)
                return;
        if (!nft_is_base_chain(chain))
                return;

        basechain = nft_base_chain(chain);
        ops = &basechain->ops;

        if (basechain->type->ops_unregister) {
                basechain->type->ops_unregister(net, ops);
                return;
        }

        if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
                nft_netdev_unregister_hooks(net, &basechain->hook_list,
                                            release_netdev);
                return;
        }

        nf_unregister_net_hook(net, &basechain->ops);
}

/* Unregister the hook(s) of @chain without releasing netdev hook entries. */
static void nf_tables_unregister_hook(struct net *net,
                                      const struct nft_table *table,
                                      struct nft_chain *chain)
{
        __nf_tables_unregister_hook(net, table, chain, false);
}

/*
 * Append @trans to the per-netns commit list.
 *
 * Transactions that create (not update) an anonymous set or a binding chain
 * are additionally appended to the per-netns binding list.
 */
static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans)
{
        struct nftables_pernet *nft_net = nft_pernet(net);

        if (trans->msg_type == NFT_MSG_NEWSET) {
                if (!nft_trans_set_update(trans) &&
                    nft_set_is_anonymous(nft_trans_set(trans)))
                        list_add_tail(&trans->binding_list,
                                      &nft_net->binding_list);
        } else if (trans->msg_type == NFT_MSG_NEWCHAIN) {
                if (!nft_trans_chain_update(trans) &&
                    nft_chain_binding(nft_trans_chain(trans)))
                        list_add_tail(&trans->binding_list,
                                      &nft_net->binding_list);
        }

        list_add_tail(&trans->list, &nft_net->commit_list);
}

/*
 * Queue a table transaction of type @msg_type for ctx->table.
 *
 * For NFT_MSG_NEWTABLE the table is also activated in the next generation.
 * Returns 0 on success or -ENOMEM if the transaction cannot be allocated.
 */
static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
{
        struct nft_trans *trans =
                nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table));

        if (!trans)
                return -ENOMEM;

        if (msg_type == NFT_MSG_NEWTABLE)
                nft_activate_next(ctx->net, ctx->table);

        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;
}

/*
 * Queue deletion of ctx->table and deactivate it in the next generation.
 * The table is only deactivated once the transaction has been queued.
 */
static int nft_deltable(struct nft_ctx *ctx)
{
        int err = nft_trans_table_add(ctx, NFT_MSG_DELTABLE);

        if (err >= 0)
                nft_deactivate_next(ctx->net, ctx->table);

        return err;
}

/*
 * Queue a chain transaction of type @msg_type for ctx->chain.
 *
 * For NFT_MSG_NEWCHAIN the chain is activated in the next generation and,
 * if the request carries NFTA_CHAIN_ID, that id is stored in the
 * transaction in host byte order.
 *
 * Returns the queued transaction, or ERR_PTR(-ENOMEM).
 */
static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
{
        struct nft_trans *trans =
                nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain));

        if (!trans)
                return ERR_PTR(-ENOMEM);

        nft_trans_chain(trans) = ctx->chain;

        if (msg_type == NFT_MSG_NEWCHAIN) {
                nft_activate_next(ctx->net, ctx->chain);

                if (ctx->nla[NFTA_CHAIN_ID])
                        nft_trans_chain_id(trans) =
                                ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID]));
        }

        nft_trans_commit_list_add_tail(ctx->net, trans);

        return trans;
}

/*
 * Queue deletion of ctx->chain.
 *
 * On success the table's use counter is dropped and the chain is
 * deactivated in the next generation.  Returns 0, or the error from
 * nft_trans_chain_add().
 */
static int nft_delchain(struct nft_ctx *ctx)
{
        struct nft_trans *trans = nft_trans_chain_add(ctx, NFT_MSG_DELCHAIN);

        if (IS_ERR(trans))
                return PTR_ERR(trans);

        nft_use_dec(&ctx->table->use);
        nft_deactivate_next(ctx->net, ctx->chain);

        return 0;
}

/*
 * Call the optional ->activate() callback of every expression in @rule,
 * from the first expression to the last.
 */
void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule)
{
        struct nft_expr *expr;

        for (expr = nft_expr_first(rule); nft_expr_more(rule, expr);
             expr = nft_expr_next(expr)) {
                if (expr->ops->activate)
                        expr->ops->activate(ctx, expr);
        }
}

/*
 * Call the optional ->deactivate() callback of every expression in @rule,
 * from the first expression to the last, passing along the transaction
 * @phase.
 */
void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule,
                              enum nft_trans_phase phase)
{
        struct nft_expr *expr;

        for (expr = nft_expr_first(rule); nft_expr_more(rule, expr);
             expr = nft_expr_next(expr)) {
                if (expr->ops->deactivate)
                        expr->ops->deactivate(ctx, expr, phase);
        }
}

/*
 * Deactivate @rule in the next generation and drop the use counter of
 * ctx->chain.
 *
 * Returns -ENOENT if the rule is already inactive in the next generation,
 * i.e. it cannot be deleted twice.
 */
static int
nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule)
{
        if (!nft_is_active_next(ctx->net, rule))
                return -ENOENT;

        nft_deactivate_next(ctx->net, rule);
        nft_use_dec(&ctx->chain->use);

        return 0;
}

/*
 * Queue a rule transaction of type @msg_type for @rule.
 *
 * For NFT_MSG_NEWRULE requests carrying NFTA_RULE_ID, the id is stored in
 * the transaction in host byte order.
 *
 * Returns the queued transaction, or NULL if allocation failed.
 */
static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type,
                                            struct nft_rule *rule)
{
        struct nft_trans *trans =
                nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule));

        if (!trans)
                return NULL;

        if (msg_type == NFT_MSG_NEWRULE) {
                if (ctx->nla[NFTA_RULE_ID] != NULL)
                        nft_trans_rule_id(trans) =
                                ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID]));
        }

        nft_trans_rule(trans) = rule;
        nft_trans_commit_list_add_tail(ctx->net, trans);

        return trans;
}

/*
 * Queue deletion of @rule from ctx->chain.
 *
 * For chains flagged NFT_CHAIN_HW_OFFLOAD a flow rule is created and
 * attached to the transaction.  The rule is then deactivated in the next
 * generation and its expressions are deactivated for the PREPARE phase.
 *
 * Returns 0 on success.  On failure the transaction is destroyed again and
 * a negative errno is returned (-ENOMEM, the flow rule creation error, or
 * -ENOENT if the rule was already deleted).
 */
static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule)
{
        struct nft_flow_rule *flow;
        struct nft_trans *trans;
        int err;

        trans = nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule);
        if (!trans)
                return -ENOMEM;

        if (ctx->chain->flags & NFT_CHAIN_HW_OFFLOAD) {
                flow = nft_flow_rule_create(ctx->net, rule);
                if (IS_ERR(flow)) {
                        err = PTR_ERR(flow);
                        goto err_destroy_trans;
                }

                nft_trans_flow_rule(trans) = flow;
        }

        /*
         * NOTE(review): if this fails after a flow rule was attached above,
         * only the transaction is freed here -- confirm the flow rule is
         * released elsewhere.
         */
        err = nf_tables_delrule_deactivate(ctx, rule);
        if (err < 0)
                goto err_destroy_trans;

        nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_PREPARE);

        return 0;

err_destroy_trans:
        nft_trans_destroy(trans);
        return err;
}

/*
 * Queue deletion of every rule of ctx->chain that is still active in the
 * next generation.  Stops at, and returns, the first error reported by
 * nft_delrule(); returns 0 otherwise.
 */
static int nft_delrule_by_chain(struct nft_ctx *ctx)
{
        struct nft_rule *rule;
        int err;

        list_for_each_entry(rule, &ctx->chain->rules, list) {
                if (nft_is_active_next(ctx->net, rule)) {
                        err = nft_delrule(ctx, rule);
                        if (err < 0)
                                return err;
                }
        }

        return 0;
}

/*
 * Queue a set transaction of type @msg_type for @set.
 *
 * With @desc the transaction is flagged as an update and carries the new
 * gc interval, timeout and size from @desc.  Without @desc, a NEWSET
 * request carrying NFTA_SET_ID stores that id (host byte order) and
 * activates the set in the next generation.
 *
 * Returns 0 on success or -ENOMEM.
 */
static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
                               struct nft_set *set,
                               const struct nft_set_desc *desc)
{
        struct nft_trans *trans =
                nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set));

        if (!trans)
                return -ENOMEM;

        nft_trans_set(trans) = set;

        if (desc) {
                nft_trans_set_update(trans) = true;
                nft_trans_set_gc_int(trans) = desc->gc_int;
                nft_trans_set_timeout(trans) = desc->timeout;
                nft_trans_set_size(trans) = desc->size;
        } else if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID]) {
                nft_trans_set_id(trans) =
                        ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID]));
                nft_activate_next(ctx->net, set);
        }

        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;
}

/*
 * Queue a non-update set transaction: __nft_trans_set_add() with no set
 * descriptor.  Returns 0 on success or -ENOMEM.
 */
static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
                             struct nft_set *set)
{
        return __nft_trans_set_add(ctx, msg_type, set, NULL);
}

/*
 * Set walk callback (see nft_map_deactivate()): flip the activity state of
 * one element and deactivate its data.
 *
 * Elements that are not active under iter->genmask are skipped.  Always
 * returns 0.
 */
static int nft_mapelem_deactivate(const struct nft_ctx *ctx,
                                  struct nft_set *set,
                                  const struct nft_set_iter *iter,
                                  struct nft_elem_priv *elem_priv)
{
        struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        if (nft_set_elem_active(ext, iter->genmask)) {
                nft_set_elem_change_active(ctx->net, set, ext);
                nft_setelem_data_deactivate(ctx->net, set, elem_priv);
        }

        return 0;
}

/*
 * Catch-all element of a set, linked on the set's catchall_list
 * (see nft_map_catchall_deactivate()).
 */
struct nft_set_elem_catchall {
        /* linkage on set->catchall_list */
        struct list_head        list;
        /* presumably used for RCU-deferred release -- TODO confirm */
        struct rcu_head                rcu;
        /* private data of the element itself */
        struct nft_elem_priv        *elem;
};

/*
 * Deactivate the catch-all element of @set, if it has an active one.
 *
 * Only the first catch-all element that is active in the next generation is
 * handled; the walk stops right after it.
 */
static void nft_map_catchall_deactivate(const struct nft_ctx *ctx,
                                        struct nft_set *set)
{
        u8 genmask = nft_genmask_next(ctx->net);
        struct nft_set_elem_catchall *catchall;
        struct nft_set_ext *ext;

        list_for_each_entry(catchall, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);
                if (nft_set_elem_active(ext, genmask)) {
                        nft_set_elem_change_active(ctx->net, set, ext);
                        nft_setelem_data_deactivate(ctx->net, set,
                                                    catchall->elem);
                        break;
                }
        }
}

/*
 * Deactivate every element of @set that is active in the next generation:
 * the regular elements through the set's ->walk() operation, then the
 * catch-all element.  A walk error is not expected and only triggers a
 * one-time warning.
 */
static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set)
{
        struct nft_set_iter iter = {
                .fn                = nft_mapelem_deactivate,
                .type                = NFT_ITER_UPDATE,
                .genmask        = nft_genmask_next(ctx->net),
        };

        set->ops->walk(ctx, set, &iter);
        WARN_ON_ONCE(iter.err);

        nft_map_catchall_deactivate(ctx, set);
}

/*
 * Queue deletion of @set.
 *
 * Once the transaction is queued, the elements of maps and object maps are
 * deactivated, the set is deactivated in the next generation and the
 * table's use counter is dropped.  Returns the result of
 * nft_trans_set_add().
 */
static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set)
{
        int err = nft_trans_set_add(ctx, NFT_MSG_DELSET, set);

        if (err < 0)
                return err;

        if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
                nft_map_deactivate(ctx, set);

        nft_deactivate_next(ctx->net, set);
        nft_use_dec(&ctx->table->use);

        return err;
}

/*
 * Queue an object transaction of type @msg_type for @obj.
 *
 * For NFT_MSG_NEWOBJ the object is also activated in the next generation.
 * Returns 0 on success or -ENOMEM.
 */
static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type,
                             struct nft_object *obj)
{
        struct nft_trans *trans =
                nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_obj));

        if (!trans)
                return -ENOMEM;

        if (msg_type == NFT_MSG_NEWOBJ)
                nft_activate_next(ctx->net, obj);

        nft_trans_obj(trans) = obj;
        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;
}

/*
 * Queue deletion of @obj.  Once the transaction is queued the object is
 * deactivated in the next generation and the table's use counter is
 * dropped.  Returns the result of nft_trans_obj_add().
 */
static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj)
{
        int err = nft_trans_obj_add(ctx, NFT_MSG_DELOBJ, obj);

        if (err < 0)
                return err;

        nft_deactivate_next(ctx->net, obj);
        nft_use_dec(&ctx->table->use);

        return err;
}

/*
 * Queue a flowtable transaction of type @msg_type for @flowtable.
 *
 * For NFT_MSG_NEWFLOWTABLE the flowtable is also activated in the next
 * generation.  The transaction's hook list starts out empty.
 *
 * Returns the queued transaction, or ERR_PTR(-ENOMEM).
 */
static struct nft_trans *
nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type,
                        struct nft_flowtable *flowtable)
{
        struct nft_trans *trans =
                nft_trans_alloc(ctx, msg_type,
                                sizeof(struct nft_trans_flowtable));

        if (!trans)
                return ERR_PTR(-ENOMEM);

        if (msg_type == NFT_MSG_NEWFLOWTABLE)
                nft_activate_next(ctx->net, flowtable);

        INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
        nft_trans_flowtable(trans) = flowtable;
        nft_trans_commit_list_add_tail(ctx->net, trans);

        return trans;
}

/*
 * Queue deletion of @flowtable.  Once the transaction is queued the
 * flowtable is deactivated in the next generation and the table's use
 * counter is dropped.  Returns 0, or the error from
 * nft_trans_flowtable_add().
 */
static int nft_delflowtable(struct nft_ctx *ctx,
                            struct nft_flowtable *flowtable)
{
        struct nft_trans *trans =
                nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable);

        if (IS_ERR(trans))
                return PTR_ERR(trans);

        nft_deactivate_next(ctx->net, flowtable);
        nft_use_dec(&ctx->table->use);

        return 0;
}

/*
 * Cancel tracking for the registers that precede @dreg within the same
 * multi-register selector.  The num_reg value recorded for @dreg says how
 * many registers before it are affected; they are cancelled from the
 * farthest one (dreg - num_reg) up to dreg - 1.
 */
static void __nft_reg_track_clobber(struct nft_regs_track *track, u8 dreg)
{
        int back = track->regs[dreg].num_reg;

        while (back > 0) {
                __nft_reg_track_cancel(track, dreg - back);
                back--;
        }
}

/*
 * Record @expr as the selector of register @dreg, with @num_reg as its
 * position within the selector's register run, and clear any recorded
 * bitwise expression.
 */
static void __nft_reg_track_update(struct nft_regs_track *track,
                                   const struct nft_expr *expr,
                                   u8 dreg, u8 num_reg)
{
        track->regs[dreg].num_reg = num_reg;
        track->regs[dreg].bitwise = NULL;
        track->regs[dreg].selector = expr;
}

/*
 * Record @expr as the selector for the @len bytes stored starting at
 * register @dreg.
 *
 * Any selector that overlaps @dreg from below is cancelled first.  Then
 * each of the DIV_ROUND_UP(len, NFT_REG32_SIZE) registers is tagged with
 * @expr and its index within the run.
 */
void nft_reg_track_update(struct nft_regs_track *track,
                          const struct nft_expr *expr, u8 dreg, u8 len)
{
        unsigned int regcount;
        int idx = 0;

        __nft_reg_track_clobber(track, dreg);

        regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE);
        while (idx < regcount) {
                __nft_reg_track_update(track, expr, dreg, idx);
                dreg++;
                idx++;
        }
}
EXPORT_SYMBOL_GPL(nft_reg_track_update);

/*
 * Drop register tracking for the @len bytes starting at register @dreg.
 *
 * Any selector that overlaps @dreg from below is cancelled first, then each
 * of the DIV_ROUND_UP(len, NFT_REG32_SIZE) registers is cancelled.
 */
void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len)
{
        unsigned int regcount;
        int idx = 0;

        __nft_reg_track_clobber(track, dreg);

        regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE);
        while (idx < regcount) {
                __nft_reg_track_cancel(track, dreg);
                dreg++;
                idx++;
        }
}
EXPORT_SYMBOL_GPL(nft_reg_track_cancel);

/* Reset the tracking state of the single register @dreg. */
void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg)
{
        track->regs[dreg].num_reg = 0;
        track->regs[dreg].bitwise = NULL;
        track->regs[dreg].selector = NULL;
}
EXPORT_SYMBOL_GPL(__nft_reg_track_cancel);

/*
 * Tables
 */

/*
 * Find a table by name.
 *
 * @nla holds the table name; the table must also match @family and be
 * active under @genmask.  If the matching table has an owner and @nlpid is
 * non-zero, the lookup only succeeds when @nlpid is that owner.
 *
 * Returns the table, or ERR_PTR(-EINVAL) when @nla is NULL,
 * ERR_PTR(-EPERM) on owner mismatch, ERR_PTR(-ENOENT) if nothing matches.
 */
static struct nft_table *nft_table_lookup(const struct net *net,
                                          const struct nlattr *nla,
                                          u8 family, u8 genmask, u32 nlpid)
{
        struct nftables_pernet *nft_net;
        struct nft_table *table;

        if (!nla)
                return ERR_PTR(-EINVAL);

        nft_net = nft_pernet(net);
        list_for_each_entry_rcu(table, &nft_net->tables, list,
                                lockdep_is_held(&nft_net->commit_mutex)) {
                if (nla_strcmp(nla, table->name) != 0)
                        continue;
                if (table->family != family)
                        continue;
                if (!nft_active_genmask(table, genmask))
                        continue;

                if (nft_table_has_owner(table) &&
                    nlpid && table->nlpid != nlpid)
                        return ERR_PTR(-EPERM);

                return table;
        }

        return ERR_PTR(-ENOENT);
}

/*
 * Find a table by handle.
 *
 * @nla holds the 64-bit big-endian handle; the table must also match
 * @family and be active under @genmask.  If the matching table has an owner
 * and @nlpid is non-zero, the lookup only succeeds when @nlpid is that
 * owner.  @nla is dereferenced without a NULL check, so callers must pass a
 * valid attribute.
 *
 * Returns the table, ERR_PTR(-EPERM) on owner mismatch, or
 * ERR_PTR(-ENOENT) if nothing matches.
 */
static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
                                                   const struct nlattr *nla,
                                                   int family, u8 genmask, u32 nlpid)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_table *table;

        list_for_each_entry(table, &nft_net->tables, list) {
                if (be64_to_cpu(nla_get_be64(nla)) != table->handle)
                        continue;
                if (table->family != family)
                        continue;
                if (!nft_active_genmask(table, genmask))
                        continue;

                if (nft_table_has_owner(table) &&
                    nlpid && table->nlpid != nlpid)
                        return ERR_PTR(-EPERM);

                return table;
        }

        return ERR_PTR(-ENOENT);
}

/*
 * Hand out the next handle for @table: the per-table generator is
 * incremented and its new value returned.
 */
static inline u64 nf_tables_alloc_handle(struct nft_table *table)
{
        table->hgenerator++;

        return table->hgenerator;
}

/*
 * Chain type table, indexed by protocol family and then by chain type.
 * Lookups treat a NULL slot as "no such type" (see
 * __nf_tables_chain_type_lookup()).
 */
static const struct nft_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX];

/*
 * Bounds-checked accessor for chain_type[][].  Returns NULL when @family or
 * @type is out of range; otherwise returns the slot content, which may
 * itself be NULL.
 */
static const struct nft_chain_type *
__nft_chain_type_get(u8 family, enum nft_chain_types type)
{
        if (family < NFPROTO_NUMPROTO && type < NFT_CHAIN_T_MAX)
                return chain_type[family][type];

        return NULL;
}

/*
 * Find the chain type of @family whose name equals the string in @nla.
 * Returns NULL if no chain type of that family carries that name.
 */
static const struct nft_chain_type *
__nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family)
{
        const struct nft_chain_type *type;
        int idx;

        for (idx = 0; idx < NFT_CHAIN_T_MAX; idx++) {
                type = __nft_chain_type_get(family, idx);
                if (type && !nla_strcmp(nla, type->name))
                        return type;
        }

        return NULL;
}

/*
 * One pending module request, queued on the per-netns module_list by
 * nft_request_module().
 */
struct nft_module_request {
        /* linkage on nft_net->module_list */
        struct list_head        list;
        /* name of the requested module */
        char                        module[MODULE_NAME_LEN];
        /* false when queued; a request marked done makes nft_request_module() return 0 */
        bool                        done;
};

#ifdef CONFIG_MODULES
/*
 * Record a request for the module named by the printf-style @fmt.
 *
 * This function does not load anything itself; it only queues the name on
 * the per-netns module list.
 *
 * Returns:
 *   0        the formatted name does not fit in MODULE_NAME_LEN, or a
 *            request for this module exists and is marked done
 *   -EAGAIN  a request for this module is pending, or was just queued
 *   -ENOMEM  the request could not be allocated
 */
__printf(2, 3) int nft_request_module(struct net *net, const char *fmt,
                                      ...)
{
        char module_name[MODULE_NAME_LEN];
        struct nftables_pernet *nft_net;
        struct nft_module_request *req;
        va_list args;
        int ret;

        va_start(args, fmt);
        ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
        va_end(args);
        if (ret >= MODULE_NAME_LEN)
                return 0;

        nft_net = nft_pernet(net);
        list_for_each_entry(req, &nft_net->module_list, list) {
                if (strcmp(req->module, module_name) != 0)
                        continue;

                /* A request to load this module already exists. */
                return req->done ? 0 : -EAGAIN;
        }

        req = kmalloc(sizeof(*req), GFP_KERNEL);
        if (!req)
                return -ENOMEM;

        strscpy(req->module, module_name, MODULE_NAME_LEN);
        req->done = false;
        list_add_tail(&req->list, &nft_net->module_list);

        return -EAGAIN;
}
EXPORT_SYMBOL_GPL(nft_request_module);
#endif

/*
 * Debug aid: with CONFIG_PROVE_LOCKING and lock debugging still enabled,
 * warn once if lockdep reports the nfnetlink mutex of the nftables
 * subsystem as held.  Compiles to an empty function otherwise.
 */
static void lockdep_nfnl_nft_mutex_not_held(void)
{
#ifdef CONFIG_PROVE_LOCKING
        if (debug_locks)
                WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
#endif
}

/*
 * Look up the chain type named by @nla for @family.
 *
 * If the type is unknown and @autoload is set (CONFIG_MODULES only), a
 * request for module "nft-chain-<family>-<name>" is recorded.
 *
 * Returns the chain type, ERR_PTR(-EAGAIN) when a module request is
 * pending, or ERR_PTR(-ENOENT) otherwise.
 */
static const struct nft_chain_type *
nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,
                            u8 family, bool autoload)
{
        const struct nft_chain_type *type =
                __nf_tables_chain_type_lookup(nla, family);

        if (type)
                return type;

        lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
        if (autoload &&
            nft_request_module(net, "nft-chain-%u-%.*s", family,
                               nla_len(nla),
                               (const char *)nla_data(nla)) == -EAGAIN)
                return ERR_PTR(-EAGAIN);
#endif
        return ERR_PTR(-ENOENT);
}

static __be16 nft_base_seq(const struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);

        return htons(nft_net->base_seq & 0xffff);
}

/*
 * Netlink attribute policy for table messages:
 *   NFTA_TABLE_NAME      string, at most NFT_TABLE_MAXNAMELEN - 1 bytes
 *   NFTA_TABLE_FLAGS     32-bit value
 *   NFTA_TABLE_HANDLE    64-bit value
 *   NFTA_TABLE_USERDATA  binary blob, at most NFT_USERDATA_MAXLEN bytes
 */
static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
        [NFTA_TABLE_NAME]        = { .type = NLA_STRING,
                                    .len = NFT_TABLE_MAXNAMELEN - 1 },
        [NFTA_TABLE_FLAGS]        = { .type = NLA_U32 },
        [NFTA_TABLE_HANDLE]        = { .type = NLA_U64 },
        [NFTA_TABLE_USERDATA]        = { .type = NLA_BINARY,
                                    .len = NFT_USERDATA_MAXLEN }
};

/*
 * Append one table message describing @table to @skb.
 *
 * @event is the bare NFT_MSG_* type.  The subsystem-qualified type produced
 * by nfnl_msg_type() goes into the netlink header only.
 *
 * Every message carries the table name, use counter and handle.  Delete
 * notifications (NFT_MSG_DELTABLE) stop there; all other events also carry
 * the table flags, the owner portid if the table has an owner, and the user
 * data if present.
 *
 * Returns 0 on success.  On failure the partially built message is trimmed
 * from @skb and -1 is returned.
 */
static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
                                     u32 portid, u32 seq, int event, u32 flags,
                                     int family, const struct nft_table *table)
{
        struct nlmsghdr *nlh;

        /*
         * Do not overwrite @event with the converted type: the converted
         * value also encodes the subsystem id, so comparing it with
         * NFT_MSG_DELTABLE below could never match.
         */
        nlh = nfnl_msg_put(skb, portid, seq,
                           nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
                           flags, family, NFNETLINK_V0, nft_base_seq(net));
        if (!nlh)
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) ||
            nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) ||
            nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle),
                         NFTA_TABLE_PAD))
                goto nla_put_failure;

        /* Delete notifications only need to identify the table. */
        if (event == NFT_MSG_DELTABLE) {
                nlmsg_end(skb, nlh);
                return 0;
        }

        if (nla_put_be32(skb, NFTA_TABLE_FLAGS,
                         htonl(table->flags & NFT_TABLE_F_MASK)))
                goto nla_put_failure;

        if (nft_table_has_owner(table) &&
            nla_put_be32(skb, NFTA_TABLE_OWNER, htonl(table->nlpid)))
                goto nla_put_failure;

        if (table->udata) {
                if (nla_put(skb, NFTA_TABLE_USERDATA, table->udlen, table->udata))
                        goto nla_put_failure;
        }

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, nlh);
        return -1;
}

/*
 * Per-notification state kept in the socket buffer's control block (skb->cb).
 * @report is set by nft_notify_enqueue() when a notification is queued.
 */
struct nftnl_skb_parms {
        bool report;
};
/* Access the control block of @skb as a struct nftnl_skb_parms. */
#define NFT_CB(skb)        (*(struct nftnl_skb_parms*)&((skb)->cb))

/*
 * Queue notification @skb at the tail of @notify_list, remembering in its
 * control block whether the requester asked for a report.
 */
static void nft_notify_enqueue(struct sk_buff *skb, bool report,
                               struct list_head *notify_list)
{
        NFT_CB(skb).report = report;
        list_add_tail(&skb->list, notify_list);
}

/*
 * Build a table notification for @event and queue it on the per-netns
 * notify list.
 *
 * Nothing is done when the requester did not ask for a report and nobody
 * listens on the nftables group.  Only the NLM_F_CREATE and NLM_F_EXCL bits
 * of the request flags are carried into the notification.  If the message
 * cannot be allocated or filled, -ENOBUFS is signalled to the nftables
 * group instead.
 */
static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
{
        struct nftables_pernet *nft_net;
        struct sk_buff *skb;
        u16 flags;
        int err;

        if (!ctx->report &&
            !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
                return;

        skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                goto err;

        flags = ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);

        err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq,
                                        event, flags, ctx->family, ctx->table);
        if (err >= 0) {
                nft_net = nft_pernet(ctx->net);
                nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
                return;
        }

        kfree_skb(skb);
err:
        nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
}

/*
 * Netlink dump callback: emit one NFT_MSG_NEWTABLE message per active table
 * into @skb.
 *
 * cb->args[0] is the resume position: the number of list entries already
 * handled by earlier invocations.  Tables are filtered by the family given
 * in the request header unless that is NFPROTO_UNSPEC.
 *
 * Returns skb->len.
 */
static int nf_tables_dump_tables(struct sk_buff *skb,
                                 struct netlink_callback *cb)
{
        const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
        struct nftables_pernet *nft_net;
        const struct nft_table *table;
        unsigned int idx = 0, s_idx = cb->args[0];
        struct net *net = sock_net(skb->sk);
        int family = nfmsg->nfgen_family;

        rcu_read_lock();
        nft_net = nft_pernet(net);
        /* Snapshot of the base sequence, used by nl_dump_check_consistent() below. */
        cb->seq = READ_ONCE(nft_net->base_seq);

        list_for_each_entry_rcu(table, &nft_net->tables, list) {
                if (family != NFPROTO_UNSPEC && family != table->family)
                        continue;

                /* Skip entries already emitted by a previous invocation. */
                if (idx < s_idx)
                        goto cont;
                /* Past the resume point: clear the remaining callback args. */
                if (idx > s_idx)
                        memset(&cb->args[1], 0,
                               sizeof(cb->args) - sizeof(cb->args[0]));
                /* Inactive tables are skipped without advancing idx. */
                if (!nft_is_active(net, table))
                        continue;
                /*
                 * Stop on fill failure, leaving idx at this entry so it is
                 * retried next time (presumably @skb is full -- confirm).
                 */
                if (nf_tables_fill_table_info(skb, net,
                                              NETLINK_CB(cb->skb).portid,
                                              cb->nlh->nlmsg_seq,
                                              NFT_MSG_NEWTABLE, NLM_F_MULTI,
                                              table->family, table) < 0)
                        goto done;

                nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
                idx++;
        }
done:
        rcu_read_unlock();
        /* Remember where to resume. */
        cb->args[0] = idx;
        return skb->len;
}

/*
 * Start a netlink dump from a context that holds the RCU read lock.
 *
 * The RCU read lock is dropped around netlink_dump_start() and re-taken
 * before returning, so the caller's locking state is unchanged on return.
 * A reference on this module is held across that window.
 *
 * Returns -EINVAL if the module reference cannot be taken, otherwise the
 * result of netlink_dump_start().
 */
static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
                                      const struct nlmsghdr *nlh,
                                      struct netlink_dump_control *c)
{
        int err;

        if (!try_module_get(THIS_MODULE))
                return -EINVAL;

        /* NOTE(review): presumably netlink_dump_start() may sleep -- confirm. */
        rcu_read_unlock();
        err = netlink_dump_start(nlsk, skb, nlh, c);
        rcu_read_lock();
        module_put(THIS_MODULE);

        return err;
}

/*
 * Handler for table GET requests; called with rcu_read_lock held.
 *
 * Dump requests (NLM_F_DUMP) are handed to nf_tables_dump_tables().
 * Otherwise the table named by NFTA_TABLE_NAME is looked up in the current
 * generation and a single NFT_MSG_NEWTABLE message is unicast back to the
 * requester.
 *
 * Returns the lookup error (with the offending attribute reported through
 * extack), -ENOMEM if the reply cannot be allocated, the fill error, or the
 * result of nfnetlink_unicast().
 */
static int nf_tables_gettable(struct sk_buff *skb, const struct nfnl_info *info,
                              const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_cur(info->net);
        u8 family = info->nfmsg->nfgen_family;
        const struct nft_table *table;
        struct net *net = info->net;
        struct sk_buff *reply;
        int err;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_tables,
                        .module = THIS_MODULE,
                };

                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask, 0);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_TABLE_NAME]);
                return PTR_ERR(table);
        }

        reply = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!reply)
                return -ENOMEM;

        err = nf_tables_fill_table_info(reply, net, NETLINK_CB(skb).portid,
                                        info->nlh->nlmsg_seq, NFT_MSG_NEWTABLE,
                                        0, family, table);
        if (err < 0) {
                kfree_skb(reply);
                return err;
        }

        return nfnetlink_unicast(reply, net, NETLINK_CB(skb).portid);
}

/*
 * Unregister the hooks of base chains in @table that pass
 * nft_is_active_next().
 *
 * @cnt: when non-zero, stop after @cnt chains have been unregistered; zero
 *       means no limit.
 */
static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt)
{
        struct nft_chain *chain;
        u32 done = 0;

        list_for_each_entry(chain, &table->chains, list) {
                if (!nft_is_active_next(net, chain) ||
                    !nft_is_base_chain(chain))
                        continue;

                if (cnt) {
                        if (done == cnt)
                                break;
                        done++;
                }

                nf_tables_unregister_hook(net, table, chain);
        }
}

/*
 * Register the hooks of every base chain in @table that passes
 * nft_is_active_next().
 *
 * If a registration fails, the hooks registered so far by this call are
 * unregistered again through nft_table_disable().
 *
 * Returns 0 on success or the error from nf_tables_register_hook().
 */
static int nf_tables_table_enable(struct net *net, struct nft_table *table)
{
        struct nft_chain *chain;
        int registered = 0;
        int err;

        list_for_each_entry(chain, &table->chains, list) {
                if (!nft_is_active_next(net, chain) ||
                    !nft_is_base_chain(chain))
                        continue;

                err = nf_tables_register_hook(net, table, chain);
                if (err < 0) {
                        /* nothing to undo if the very first hook failed */
                        if (registered)
                                nft_table_disable(net, table, registered);
                        return err;
                }

                registered++;
        }

        return 0;
}

/*
 * Unregister the hooks of all base chains of @table that pass
 * nft_is_active_next() and leave the table flagged dormant.
 *
 * NFT_TABLE_F_DORMANT is cleared for the duration of nft_table_disable() and
 * set again afterwards.
 * NOTE(review): presumably the unregister path skips dormant tables, which
 * would be why the flag is dropped first - confirm in
 * nf_tables_unregister_hook().
 */
static void nf_tables_table_disable(struct net *net, struct nft_table *table)
{
        table->flags &= ~NFT_TABLE_F_DORMANT;
        nft_table_disable(net, table, 0);
        table->flags |= NFT_TABLE_F_DORMANT;
}

/*
 * Kernel-internal table flag bits, starting at NFT_TABLE_F_MASK + 1 (i.e.
 * the first bit above the user-visible flags, assuming NFT_TABLE_F_MASK is a
 * contiguous low-bit mask).
 *
 * nf_tables_updtable() sets them to record what the table looked like before
 * a flags update: WAS_DORMANT when a dormant table is woken up, WAS_AWAKEN
 * when an awake table is made dormant, WAS_ORPHAN when a table without owner
 * gains one.  __NFT_TABLE_F_UPDATE is the union of the three and serves as
 * the "an update is pending" test.
 */
#define __NFT_TABLE_F_INTERNAL                (NFT_TABLE_F_MASK + 1)
#define __NFT_TABLE_F_WAS_DORMANT        (__NFT_TABLE_F_INTERNAL << 0)
#define __NFT_TABLE_F_WAS_AWAKEN        (__NFT_TABLE_F_INTERNAL << 1)
#define __NFT_TABLE_F_WAS_ORPHAN        (__NFT_TABLE_F_INTERNAL << 2)
#define __NFT_TABLE_F_UPDATE                (__NFT_TABLE_F_WAS_DORMANT | \
                                         __NFT_TABLE_F_WAS_AWAKEN | \
                                         __NFT_TABLE_F_WAS_ORPHAN)

/*
 * Tell whether the table in @ctx already has a change queued that a further
 * flags update must not be combined with.
 *
 * True when any __NFT_TABLE_F_UPDATE bit is set on the table, or when the
 * commit list holds, for this table, either a NFT_MSG_NEWCHAIN transaction
 * that is a chain update or a NFT_MSG_DELCHAIN transaction for a base chain.
 */
static bool nft_table_pending_update(const struct nft_ctx *ctx)
{
        struct nftables_pernet *nft_net = nft_pernet(ctx->net);
        struct nft_trans *trans;

        if (ctx->table->flags & __NFT_TABLE_F_UPDATE)
                return true;

        list_for_each_entry(trans, &nft_net->commit_list, list) {
                if (trans->ctx.table != ctx->table)
                        continue;

                switch (trans->msg_type) {
                case NFT_MSG_NEWCHAIN:
                        if (nft_trans_chain_update(trans))
                                return true;
                        break;
                case NFT_MSG_DELCHAIN:
                        if (nft_is_base_chain(trans->ctx.chain))
                                return true;
                        break;
                default:
                        break;
                }
        }

        return false;
}

/*
 * Apply a flags update to the existing table in @ctx->table.
 *
 * Only NFTA_TABLE_FLAGS is looked at.  Without that attribute, or when the
 * requested flags equal the table's current flags (masked with
 * NFT_TABLE_F_MASK), nothing is done.  On success a NFT_MSG_NEWTABLE
 * transaction marked as an update is appended to the commit list.
 *
 * Returns 0 on success or when there is nothing to do; -EOPNOTSUPP for flags
 * outside NFT_TABLE_F_MASK, for a rejected owner change or for any change of
 * NFT_TABLE_F_PERSIST; -EINVAL when nft_table_pending_update() reports a
 * pending change; -ENOMEM when the transaction cannot be allocated; or the
 * error returned by nf_tables_table_enable().
 */
static int nf_tables_updtable(struct nft_ctx *ctx)
{
        struct nft_trans *trans;
        u32 flags;
        int ret;

        if (!ctx->nla[NFTA_TABLE_FLAGS])
                return 0;

        flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS]));
        if (flags & ~NFT_TABLE_F_MASK)
                return -EOPNOTSUPP;

        if (flags == (ctx->table->flags & NFT_TABLE_F_MASK))
                return 0;

        /* Refuse to drop the owner flag of an owned table, and refuse to set
         * it unless nft_table_is_orphan() says the table is orphaned.
         */
        if ((nft_table_has_owner(ctx->table) &&
             !(flags & NFT_TABLE_F_OWNER)) ||
            (flags & NFT_TABLE_F_OWNER &&
             !nft_table_is_orphan(ctx->table)))
                return -EOPNOTSUPP;

        /* NFT_TABLE_F_PERSIST cannot be toggled by an update */
        if ((flags ^ ctx->table->flags) & NFT_TABLE_F_PERSIST)
                return -EOPNOTSUPP;

        /* No dormant off/on/off/on games in single transaction */
        if (nft_table_pending_update(ctx))
                return -EINVAL;

        trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE,
                                sizeof(struct nft_trans_table));
        if (trans == NULL)
                return -ENOMEM;

        /* NOTE(review): nft_table_pending_update() above already returned
         * true for any table with a __NFT_TABLE_F_UPDATE bit set, so the two
         * "!(flags & __NFT_TABLE_F_UPDATE)" tests below look always true at
         * this point - confirm before relying on them.
         */
        if ((flags & NFT_TABLE_F_DORMANT) &&
            !(ctx->table->flags & NFT_TABLE_F_DORMANT)) {
                /* awake -> dormant: only flags change here, no hook is
                 * touched in this function.
                 */
                ctx->table->flags |= NFT_TABLE_F_DORMANT;
                if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE))
                        ctx->table->flags |= __NFT_TABLE_F_WAS_AWAKEN;
        } else if (!(flags & NFT_TABLE_F_DORMANT) &&
                   ctx->table->flags & NFT_TABLE_F_DORMANT) {
                /* dormant -> awake: hooks are registered right away */
                ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
                if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE)) {
                        ret = nf_tables_table_enable(ctx->net, ctx->table);
                        if (ret < 0)
                                goto err_register_hooks;

                        ctx->table->flags |= __NFT_TABLE_F_WAS_DORMANT;
                }
        }

        /* table gains an owner: record the requesting portid */
        if ((flags & NFT_TABLE_F_OWNER) &&
            !nft_table_has_owner(ctx->table)) {
                ctx->table->nlpid = ctx->portid;
                ctx->table->flags |= NFT_TABLE_F_OWNER |
                                     __NFT_TABLE_F_WAS_ORPHAN;
        }

        nft_trans_table_update(trans) = true;
        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;

err_register_hooks:
        /* put back the dormant flag cleared in the wake-up branch */
        ctx->table->flags |= NFT_TABLE_F_DORMANT;
        nft_trans_destroy(trans);
        return ret;
}

/*
 * Hash a chain name.  @data points at a NUL-terminated string; @len is not
 * used, the string length is measured with strlen().
 */
static u32 nft_chain_hash(const void *data, u32 len, u32 seed)
{
        const char *chain_name = data;
        size_t name_len = strlen(chain_name);

        return jhash(chain_name, name_len, seed);
}

/*
 * Hash a chain object: @data is a struct nft_chain and the hash is computed
 * over its name by nft_chain_hash().  @len is not used.
 */
static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed)
{
        const struct nft_chain *obj = data;
        const char *chain_name = obj->name;

        return nft_chain_hash(chain_name, 0, seed);
}

/*
 * Compare the lookup key in @arg (a chain name string) with the name of the
 * chain @ptr.  Returns the strcmp() result, i.e. 0 on a match.
 */
static int nft_chain_hash_cmp(struct rhashtable_compare_arg *arg,
                              const void *ptr)
{
        const char *wanted = arg->key;
        const struct nft_chain *candidate = ptr;

        return strcmp(candidate->name, wanted);
}

/*
 * Hash an object lookup key: the seed is mixed with a hash of the owning
 * table pointer, then the NUL-terminated object name is hashed with it.
 * @len is not used.
 */
static u32 nft_objname_hash(const void *data, u32 len, u32 seed)
{
        const struct nft_object_hash_key *key = data;
        u32 table_seed = seed ^ hash_ptr(key->table, 32);

        return jhash(key->name, strlen(key->name), table_seed);
}

/*
 * Hash an object: @data is a struct nft_object and the hash is computed over
 * its embedded key by nft_objname_hash().  @len is not used.
 */
static u32 nft_objname_hash_obj(const void *data, u32 len, u32 seed)
{
        const struct nft_object *object = data;
        const struct nft_object_hash_key *key = &object->key;

        return nft_objname_hash(key, 0, seed);
}

/*
 * Compare the lookup key in @arg with the key of object @ptr.
 *
 * Returns -1 when the tables differ, otherwise the strcmp() result of the
 * two names (0 on a match).
 */
static int nft_objname_hash_cmp(struct rhashtable_compare_arg *arg,
                                const void *ptr)
{
        const struct nft_object_hash_key *wanted = arg->key;
        const struct nft_object *candidate = ptr;

        return candidate->key.table == wanted->table ?
               strcmp(candidate->key.name, wanted->name) : -1;
}

/*
 * Report whether @family is a protocol family for which nf_tables support
 * was compiled in.
 *
 * The expression is assembled by the preprocessor: each "|| family == ..."
 * term is only present when the matching CONFIG_NF_TABLES_* option is
 * enabled, so with none enabled it collapses to plain "false".
 */
static bool nft_supported_family(u8 family)
{
        return false
#ifdef CONFIG_NF_TABLES_INET
                || family == NFPROTO_INET
#endif
#ifdef CONFIG_NF_TABLES_IPV4
                || family == NFPROTO_IPV4
#endif
#ifdef CONFIG_NF_TABLES_ARP
                || family == NFPROTO_ARP
#endif
#ifdef CONFIG_NF_TABLES_NETDEV
                || family == NFPROTO_NETDEV
#endif
#if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE)
                || family == NFPROTO_BRIDGE
#endif
#ifdef CONFIG_NF_TABLES_IPV6
                || family == NFPROTO_IPV6
#endif
                ;
}

/*
 * Create a table, or update the flags of an existing one.
 *
 * If a table named by NFTA_TABLE_NAME is found, NLM_F_EXCL yields -EEXIST,
 * NLM_F_REPLACE yields -EOPNOTSUPP, and otherwise the request is handed to
 * nf_tables_updtable().  If the lookup fails with -ENOENT a new table is
 * allocated, queued as a NFT_MSG_NEWTABLE transaction and linked into the
 * per-netns table list.
 *
 * nft_net->commit_mutex is expected to be held (lockdep assertion below).
 *
 * Returns 0 on success, -EOPNOTSUPP for an unsupported family or flags
 * outside NFT_TABLE_F_MASK, -ENOMEM on allocation failure, or the error of
 * the failing helper.
 */
static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
                              const struct nlattr * const nla[])
{
        struct nftables_pernet *nft_net = nft_pernet(info->net);
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        const struct nlattr *attr;
        struct nft_table *table;
        struct nft_ctx ctx;
        u32 flags = 0;
        int err;

        if (!nft_supported_family(family))
                return -EOPNOTSUPP;

        lockdep_assert_held(&nft_net->commit_mutex);
        attr = nla[NFTA_TABLE_NAME];
        table = nft_table_lookup(net, attr, family, genmask,
                                 NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                /* only "not found" leads to creation; anything else is final */
                if (PTR_ERR(table) != -ENOENT)
                        return PTR_ERR(table);
        } else {
                /* table exists: this is at most a flags update */
                if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
                        NL_SET_BAD_ATTR(extack, attr);
                        return -EEXIST;
                }
                if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
                        return -EOPNOTSUPP;

                nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

                return nf_tables_updtable(&ctx);
        }

        if (nla[NFTA_TABLE_FLAGS]) {
                flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS]));
                if (flags & ~NFT_TABLE_F_MASK)
                        return -EOPNOTSUPP;
        }

        /* err stays -ENOMEM for the three allocation failures below */
        err = -ENOMEM;
        table = kzalloc(sizeof(*table), GFP_KERNEL_ACCOUNT);
        if (table == NULL)
                goto err_kzalloc;

        table->validate_state = nft_net->validate_state;
        table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
        if (table->name == NULL)
                goto err_strdup;

        if (nla[NFTA_TABLE_USERDATA]) {
                table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL_ACCOUNT);
                if (table->udata == NULL)
                        goto err_table_udata;

                table->udlen = nla_len(nla[NFTA_TABLE_USERDATA]);
        }

        err = rhltable_init(&table->chains_ht, &nft_chain_ht_params);
        if (err)
                goto err_chain_ht;

        INIT_LIST_HEAD(&table->chains);
        INIT_LIST_HEAD(&table->sets);
        INIT_LIST_HEAD(&table->objects);
        INIT_LIST_HEAD(&table->flowtables);
        table->family = family;
        table->flags = flags;
        table->handle = ++nft_net->table_handle;
        /* an owned table remembers the portid of the creating socket */
        if (table->flags & NFT_TABLE_F_OWNER)
                table->nlpid = NETLINK_CB(skb).portid;

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
        err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
        if (err < 0)
                goto err_trans;

        list_add_tail_rcu(&table->list, &nft_net->tables);
        return 0;
        /* unwind in reverse order of construction */
err_trans:
        rhltable_destroy(&table->chains_ht);
err_chain_ht:
        /* udata is NULL here when no NFTA_TABLE_USERDATA was given (kzalloc) */
        kfree(table->udata);
err_table_udata:
        kfree(table->name);
err_strdup:
        kfree(table);
err_kzalloc:
        return err;
}

/*
 * Queue the deletion of everything inside @ctx->table and of the table
 * itself.
 *
 * In order: rules of each chain, named sets, flowtables, objects, chains,
 * and finally the table.  Entries that fail nft_is_active_next() are
 * skipped, as are binding chains and anonymous sets.  @ctx->chain is
 * overwritten while walking the chains.
 *
 * Returns the first negative error met, otherwise the result of
 * nft_deltable().
 */
static int nft_flush_table(struct nft_ctx *ctx)
{
        struct nft_flowtable *flowtable, *next_ft;
        struct nft_chain *chain, *next_chain;
        struct nft_object *obj, *next_obj;
        struct nft_set *set, *next_set;
        int err;

        /* pass 1: drop the rules, chains stay for now */
        list_for_each_entry(chain, &ctx->table->chains, list) {
                if (!nft_is_active_next(ctx->net, chain) ||
                    nft_chain_binding(chain))
                        continue;

                ctx->chain = chain;
                err = nft_delrule_by_chain(ctx);
                if (err < 0)
                        return err;
        }

        list_for_each_entry_safe(set, next_set, &ctx->table->sets, list) {
                if (!nft_is_active_next(ctx->net, set) ||
                    nft_set_is_anonymous(set))
                        continue;

                err = nft_delset(ctx, set);
                if (err < 0)
                        return err;
        }

        list_for_each_entry_safe(flowtable, next_ft,
                                 &ctx->table->flowtables, list) {
                if (!nft_is_active_next(ctx->net, flowtable))
                        continue;

                err = nft_delflowtable(ctx, flowtable);
                if (err < 0)
                        return err;
        }

        list_for_each_entry_safe(obj, next_obj, &ctx->table->objects, list) {
                if (!nft_is_active_next(ctx->net, obj))
                        continue;

                err = nft_delobj(ctx, obj);
                if (err < 0)
                        return err;
        }

        /* pass 2: now the chains themselves */
        list_for_each_entry_safe(chain, next_chain, &ctx->table->chains, list) {
                if (!nft_is_active_next(ctx->net, chain) ||
                    nft_chain_binding(chain))
                        continue;

                ctx->chain = chain;
                err = nft_delchain(ctx);
                if (err < 0)
                        return err;
        }

        return nft_deltable(ctx);
}

/*
 * Flush every table that matches the request.
 *
 * A table is flushed when its family matches @family (or @family is
 * AF_UNSPEC), it passes nft_is_active_next(), it is not owned by another
 * portid, and - if NFTA_TABLE_NAME is present - its name matches.
 * @ctx->family and @ctx->table are updated as the walk proceeds.
 *
 * Returns the first negative error from nft_flush_table(), otherwise the
 * result of the last flush (0 if no table was flushed).
 */
static int nft_flush(struct nft_ctx *ctx, int family)
{
        struct nftables_pernet *nft_net = nft_pernet(ctx->net);
        const struct nlattr * const *nla = ctx->nla;
        struct nft_table *table, *next;
        int err = 0;

        list_for_each_entry_safe(table, next, &nft_net->tables, list) {
                if (family != AF_UNSPEC && table->family != family)
                        continue;

                /* set before the remaining filters, as the original does */
                ctx->family = table->family;

                if (!nft_is_active_next(ctx->net, table))
                        continue;

                if (nft_table_has_owner(table) && table->nlpid != ctx->portid)
                        continue;

                if (nla[NFTA_TABLE_NAME] &&
                    nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0)
                        continue;

                ctx->table = table;

                err = nft_flush_table(ctx);
                if (err < 0)
                        return err;
        }

        return err;
}

static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info,
                              const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        const struct nlattr *attr;
        struct nft_table *table;
        struct nft_ctx ctx;

        nft_ctx_init(&ctx, net, skb, info->nlh, 0, NULL, NULL, nla);
        if (family == AF_UNSPEC ||
            (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE]))
                return nft_flush(&ctx, family);

        if (nla[NFTA_TABLE_HANDLE]) {
                attr = nla[NFTA_TABLE_HANDLE];
                table = nft_table_lookup_byhandle(net, attr, family, genmask,
                                                  NETLINK_CB(skb).portid);
        } else {
                attr = nla[NFTA_TABLE_NAME];
                table = nft_table_lookup(net, attr, family, genmask,
                                         NETLINK_CB(skb).portid);
        }

        if (IS_ERR(table)) {
                if (PTR_ERR(table) == -ENOENT &&
                    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYTABLE)
                        return 0;

                NL_SET_BAD_ATTR(extack, attr);
                return PTR_ERR(table);
        }

        if (info->nlh->nlmsg_flags & NLM_F_NONREC &&
            table->use > 0)
                return -EBUSY;

        ctx.family = family;
        ctx.table = table;

        return nft_flush_table(&ctx);
}

static void nf_tables_table_destroy(struct nft_ctx *ctx)
{
        if (WARN_ON(ctx->table->use > 0))
                return;

        rhltable_destroy(&ctx->table->chains_ht);
        kfree(ctx->table->name);
        kfree(ctx->table->udata);
        kfree(ctx->table);
}

/*
 * Publish @ctype in the chain_type[family][type] table, under the nfnetlink
 * nftables subsystem lock.  Warns and leaves the table untouched if a chain
 * type is already registered in that slot.
 */
void nft_register_chain_type(const struct nft_chain_type *ctype)
{
        nfnl_lock(NFNL_SUBSYS_NFTABLES);
        if (!WARN_ON(__nft_chain_type_get(ctype->family, ctype->type)))
                chain_type[ctype->family][ctype->type] = ctype;
        nfnl_unlock(NFNL_SUBSYS_NFTABLES);
}
EXPORT_SYMBOL_GPL(nft_register_chain_type);

/*
 * Clear the chain_type[family][type] slot of @ctype, under the nfnetlink
 * nftables subsystem lock.  The slot is reset unconditionally; it is not
 * checked that it currently holds @ctype.
 */
void nft_unregister_chain_type(const struct nft_chain_type *ctype)
{
        nfnl_lock(NFNL_SUBSYS_NFTABLES);
        chain_type[ctype->family][ctype->type] = NULL;
        nfnl_unlock(NFNL_SUBSYS_NFTABLES);
}
EXPORT_SYMBOL_GPL(nft_unregister_chain_type);

/*
 * Chains
 */

/*
 * Walk the chain list of @table for a chain with the given @handle that is
 * active in @genmask.  Returns the chain or ERR_PTR(-ENOENT).
 */
static struct nft_chain *
nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask)
{
        struct nft_chain *pos;

        list_for_each_entry(pos, &table->chains, list) {
                if (pos->handle != handle)
                        continue;
                if (nft_active_genmask(pos, genmask))
                        return pos;
        }

        return ERR_PTR(-ENOENT);
}

/*
 * Lockdep helper for the per-netns commit mutex.
 *
 * With CONFIG_PROVE_LOCKING, returns what lockdep_is_held() reports for
 * nft_net->commit_mutex of @net.  Without it there is nothing to check and
 * the function always returns true.
 */
static bool lockdep_commit_lock_is_held(const struct net *net)
{
#ifdef CONFIG_PROVE_LOCKING
        struct nftables_pernet *nft_net = nft_pernet(net);

        return lockdep_is_held(&nft_net->commit_mutex);
#else
        return true;
#endif
}

/*
 * Find the chain named by netlink attribute @nla in @table that is active in
 * generation @genmask.
 *
 * The name is copied into a local buffer and looked up in the table's chain
 * hash table; of the entries sharing that name, the first one that passes
 * nft_active_genmask() is returned.
 *
 * Returns the chain, ERR_PTR(-EINVAL) when @nla is NULL, or ERR_PTR(-ENOENT)
 * when no matching chain exists.
 */
static struct nft_chain *nft_chain_lookup(struct net *net,
                                          struct nft_table *table,
                                          const struct nlattr *nla, u8 genmask)
{
        char search[NFT_CHAIN_MAXNAMELEN + 1];
        struct rhlist_head *tmp, *list;
        struct nft_chain *chain;

        if (nla == NULL)
                return ERR_PTR(-EINVAL);

        nla_strscpy(search, nla, sizeof(search));

        /* callers are expected to hold either the RCU read lock or the
         * commit mutex
         */
        WARN_ON(!rcu_read_lock_held() &&
                !lockdep_commit_lock_is_held(net));

        chain = ERR_PTR(-ENOENT);
        rcu_read_lock();
        list = rhltable_lookup(&table->chains_ht, search, nft_chain_ht_params);
        if (!list)
                goto out_unlock;

        rhl_for_each_entry_rcu(chain, tmp, list, rhlhead) {
                if (nft_active_genmask(chain, genmask))
                        goto out_unlock;
        }
        /* the loop ran to completion: the iterator variable no longer holds
         * a usable result, so report "not found"
         */
        chain = ERR_PTR(-ENOENT);
out_unlock:
        rcu_read_unlock();
        return chain;
}

/*
 * Netlink attribute policy for the NFTA_CHAIN_* attributes: expected type of
 * each attribute and, for string and binary attributes, the maximum payload
 * length.  Table and chain names are limited to one byte less than their
 * *_MAXNAMELEN constants.
 */
static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
        [NFTA_CHAIN_TABLE]        = { .type = NLA_STRING,
                                    .len = NFT_TABLE_MAXNAMELEN - 1 },
        [NFTA_CHAIN_HANDLE]        = { .type = NLA_U64 },
        [NFTA_CHAIN_NAME]        = { .type = NLA_STRING,
                                    .len = NFT_CHAIN_MAXNAMELEN - 1 },
        [NFTA_CHAIN_HOOK]        = { .type = NLA_NESTED },
        [NFTA_CHAIN_POLICY]        = { .type = NLA_U32 },
        [NFTA_CHAIN_TYPE]        = { .type = NLA_STRING,
                                    .len = NFT_MODULE_AUTOLOAD_LIMIT },
        [NFTA_CHAIN_COUNTERS]        = { .type = NLA_NESTED },
        [NFTA_CHAIN_FLAGS]        = { .type = NLA_U32 },
        [NFTA_CHAIN_ID]                = { .type = NLA_U32 },
        [NFTA_CHAIN_USERDATA]        = { .type = NLA_BINARY,
                                    .len = NFT_USERDATA_MAXLEN },
};

/*
 * Netlink attribute policy for the attributes nested in NFTA_CHAIN_HOOK:
 * hook number and priority are 32-bit values, the device name is a string of
 * at most IFNAMSIZ - 1 bytes.
 * NOTE(review): NFTA_HOOK_DEVS, which nft_dump_basechain_hook() emits, has
 * no entry here - confirm that its content is validated where it is parsed.
 */
static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
        [NFTA_HOOK_HOOKNUM]        = { .type = NLA_U32 },
        [NFTA_HOOK_PRIORITY]        = { .type = NLA_U32 },
        [NFTA_HOOK_DEV]                = { .type = NLA_STRING,
                                    .len = IFNAMSIZ - 1 },
};

/*
 * Emit a NFTA_CHAIN_COUNTERS nest holding the packet and byte totals of the
 * per-cpu counters @stats, summed over all possible CPUs.
 *
 * Each CPU's pair is read inside a u64_stats fetch/retry loop so that both
 * values come from the same update.
 *
 * Returns 0 on success or when @stats is NULL (nothing is emitted then), and
 * -ENOSPC when an attribute cannot be added to @skb.
 */
static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
{
        struct nft_stats *percpu, total;
        struct nlattr *counters;
        u64 cpu_pkts, cpu_bytes;
        unsigned int start;
        int cpu;

        if (!stats)
                return 0;

        memset(&total, 0, sizeof(total));
        for_each_possible_cpu(cpu) {
                percpu = per_cpu_ptr(stats, cpu);
                do {
                        start = u64_stats_fetch_begin(&percpu->syncp);
                        cpu_pkts = percpu->pkts;
                        cpu_bytes = percpu->bytes;
                } while (u64_stats_fetch_retry(&percpu->syncp, start));
                total.pkts += cpu_pkts;
                total.bytes += cpu_bytes;
        }

        counters = nla_nest_start_noflag(skb, NFTA_CHAIN_COUNTERS);
        if (!counters)
                return -ENOSPC;

        if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts),
                         NFTA_COUNTER_PAD) ||
            nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes),
                         NFTA_COUNTER_PAD))
                return -ENOSPC;

        nla_nest_end(skb, counters);
        return 0;
}

/*
 * Emit the NFTA_CHAIN_HOOK nest of a base chain: hook number, priority and,
 * for netdev-style hooks (per nft_base_chain_netdev()), the device names.
 *
 * @hook_list: hooks to report; NULL selects the base chain's own hook_list.
 *
 * All devices are listed in a nested NFTA_HOOK_DEVS attribute.  When exactly
 * one device was listed, its name is additionally emitted as NFTA_HOOK_DEV.
 *
 * Returns 0 on success, -1 when an attribute cannot be added to @skb.
 */
static int nft_dump_basechain_hook(struct sk_buff *skb, int family,
                                   const struct nft_base_chain *basechain,
                                   const struct list_head *hook_list)
{
        const struct nf_hook_ops *ops = &basechain->ops;
        struct nft_hook *hook, *first = NULL;
        struct nlattr *nest, *nest_devs;
        int n = 0;

        nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK);
        if (nest == NULL)
                goto nla_put_failure;
        if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum)))
                goto nla_put_failure;
        if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
                goto nla_put_failure;

        if (nft_base_chain_netdev(family, ops->hooknum)) {
                nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS);
                if (!nest_devs)
                        goto nla_put_failure;

                if (!hook_list)
                        hook_list = &basechain->hook_list;

                /* NOTE(review): plain list iterator, yet this function is
                 * reachable from nf_tables_dump_chains(), which only holds
                 * rcu_read_lock() - confirm whether the RCU list iterator
                 * is required here.
                 */
                list_for_each_entry(hook, hook_list, list) {
                        if (!first)
                                first = hook;

                        if (nla_put_string(skb, NFTA_DEVICE_NAME,
                                           hook->ops.dev->name))
                                goto nla_put_failure;
                        n++;
                }
                nla_nest_end(skb, nest_devs);

                /* n == 1 implies first was set by the loop above */
                if (n == 1 &&
                    nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name))
                        goto nla_put_failure;
        }
        nla_nest_end(skb, nest);

        return 0;
nla_put_failure:
        return -1;
}

/*
 * Build one chain message in @skb.
 *
 * @portid, @seq, @flags: netlink header fields of the message.
 * @event:     nf_tables message type (NFT_MSG_*); it is combined with the
 *             nftables subsystem id by nfnl_msg_type() before being put in
 *             the header.
 * @hook_list: forwarded to nft_dump_basechain_hook() for base chains.
 *
 * The message carries table name, chain name and handle; for base chains
 * also hook description, policy, chain type and counters; then the chain
 * flags (if any), use count and user data (if any).
 *
 * Returns 0 on success.  On failure the partially built message is trimmed
 * from @skb and -1 is returned.
 */
static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
                                     u32 portid, u32 seq, int event, u32 flags,
                                     int family, const struct nft_table *table,
                                     const struct nft_chain *chain,
                                     const struct list_head *hook_list)
{
        struct nlmsghdr *nlh;

        event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
        nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
                           NFNETLINK_V0, nft_base_seq(net));
        if (!nlh)
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name) ||
            nla_put_string(skb, NFTA_CHAIN_NAME, chain->name) ||
            nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle),
                         NFTA_CHAIN_PAD))
                goto nla_put_failure;

        /*
         * There is deliberately no shortened form for delete notifications.
         * A former "event == NFT_MSG_DELCHAIN && !hook_list" early exit sat
         * here, but 'event' has already been replaced above by the composed
         * nfnetlink message type (subsystem id combined with the message
         * number), which never equals the bare NFT_MSG_DELCHAIN value.  The
         * branch was therefore unreachable and has been removed; delete
         * notifications keep carrying the full chain description exactly as
         * before.
         */

        if (nft_is_base_chain(chain)) {
                const struct nft_base_chain *basechain = nft_base_chain(chain);
                struct nft_stats __percpu *stats;

                if (nft_dump_basechain_hook(skb, family, basechain, hook_list))
                        goto nla_put_failure;

                if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
                                 htonl(basechain->policy)))
                        goto nla_put_failure;

                if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name))
                        goto nla_put_failure;

                /* valid under RCU or with the commit mutex held */
                stats = rcu_dereference_check(basechain->stats,
                                              lockdep_commit_lock_is_held(net));
                if (nft_dump_stats(skb, stats))
                        goto nla_put_failure;
        }

        if (chain->flags &&
            nla_put_be32(skb, NFTA_CHAIN_FLAGS, htonl(chain->flags)))
                goto nla_put_failure;

        if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use)))
                goto nla_put_failure;

        if (chain->udata &&
            nla_put(skb, NFTA_CHAIN_USERDATA, chain->udlen, chain->udata))
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, nlh);
        return -1;
}

/*
 * Queue a chain notification for @event on the per-netns notify list.
 *
 * Nothing is done when the request did not ask for a report and nobody
 * listens on NFNLGRP_NFTABLES.  Of the request flags only NLM_F_CREATE and
 * NLM_F_EXCL are copied into the notification.  If the message cannot be
 * allocated or filled, -ENOBUFS is signalled through nfnetlink_set_err().
 */
static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event,
                                   const struct list_head *hook_list)
{
        struct sk_buff *msg;
        u16 nlflags;

        if (!(ctx->report ||
              nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)))
                return;

        msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!msg)
                goto notify_err;

        nlflags = ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);

        if (nf_tables_fill_chain_info(msg, ctx->net, ctx->portid, ctx->seq,
                                      event, nlflags, ctx->family, ctx->table,
                                      ctx->chain, hook_list) < 0) {
                kfree_skb(msg);
                goto notify_err;
        }

        nft_notify_enqueue(msg, ctx->report,
                           &nft_pernet(ctx->net)->notify_list);
        return;

notify_err:
        nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
}

/*
 * Netlink dump callback: emit one NFT_MSG_NEWCHAIN message per chain that
 * passes nft_is_active(), for all tables or only for those of the family in
 * the request header (NFPROTO_UNSPEC selects all).
 *
 * cb->args[0] carries the resume position between invocations: entries whose
 * running index is below it are skipped, and it is set to the index reached
 * before returning.  The whole walk runs under rcu_read_lock().
 *
 * Returns skb->len.
 */
static int nf_tables_dump_chains(struct sk_buff *skb,
                                 struct netlink_callback *cb)
{
        const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
        unsigned int idx = 0, s_idx = cb->args[0];
        struct net *net = sock_net(skb->sk);
        int family = nfmsg->nfgen_family;
        struct nftables_pernet *nft_net;
        const struct nft_table *table;
        const struct nft_chain *chain;

        rcu_read_lock();
        nft_net = nft_pernet(net);
        /* base sequence used by nl_dump_check_consistent() below */
        cb->seq = READ_ONCE(nft_net->base_seq);

        list_for_each_entry_rcu(table, &nft_net->tables, list) {
                if (family != NFPROTO_UNSPEC && family != table->family)
                        continue;

                list_for_each_entry_rcu(chain, &table->chains, list) {
                        /* already emitted by an earlier invocation */
                        if (idx < s_idx)
                                goto cont;
                        /* past the resume point: clear the remaining args */
                        if (idx > s_idx)
                                memset(&cb->args[1], 0,
                                       sizeof(cb->args) - sizeof(cb->args[0]));
                        /* note: skipping here does not advance idx */
                        if (!nft_is_active(net, chain))
                                continue;
                        /* fill failed (presumably skb is full): stop, idx is
                         * saved below so this chain is tried again next time
                         */
                        if (nf_tables_fill_chain_info(skb, net,
                                                      NETLINK_CB(cb->skb).portid,
                                                      cb->nlh->nlmsg_seq,
                                                      NFT_MSG_NEWCHAIN,
                                                      NLM_F_MULTI,
                                                      table->family, table,
                                                      chain, NULL) < 0)
                                goto done;

                        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
                        idx++;
                }
        }
done:
        rcu_read_unlock();
        cb->args[0] = idx;
        return skb->len;
}

/* called with rcu_read_lock held */
static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info,
                              const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_cur(info->net);
        u8 family = info->nfmsg->nfgen_family;
        const struct nft_chain *chain;
        struct net *net = info->net;
        struct nft_table *table;
        struct sk_buff *skb2;
        int err;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_chains,
                        .module = THIS_MODULE,
                };

                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, 0);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
                return PTR_ERR(table);
        }

        chain = nft_chain_lookup(net, table, nla[NFTA_CHAIN_NAME], genmask);
        if (IS_ERR(chain)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
                return PTR_ERR(chain);
        }

        skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!skb2)
                return -ENOMEM;

        err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid,
                                        info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN,
                                        0, family, table, chain, NULL);
        if (err < 0)
                goto err_fill_chain_info;

        return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);

err_fill_chain_info:
        kfree_skb(skb2);
        return err;
}

/*
 * Netlink attribute policy for the counter attributes parsed by
 * nft_stats_alloc(): packet and byte counts are both 64-bit values.
 */
static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
        [NFTA_COUNTER_PACKETS]        = { .type = NLA_U64 },
        [NFTA_COUNTER_BYTES]        = { .type = NLA_U64 },
};

/*
 * Allocate per-cpu chain counters initialised from the nested counter
 * attribute @attr.
 *
 * Both NFTA_COUNTER_BYTES and NFTA_COUNTER_PACKETS must be present.  The
 * supplied values are stored in the slot of the CPU this runs on.
 *
 * Returns the per-cpu pointer, or an ERR_PTR: the parse error, -EINVAL when
 * an attribute is missing, -ENOMEM when the allocation fails.
 */
static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
{
        struct nlattr *tb[NFTA_COUNTER_MAX + 1];
        struct nft_stats __percpu *pcpu;
        struct nft_stats *local;
        int ret;

        ret = nla_parse_nested_deprecated(tb, NFTA_COUNTER_MAX, attr,
                                          nft_counter_policy, NULL);
        if (ret < 0)
                return ERR_PTR(ret);

        if (!(tb[NFTA_COUNTER_BYTES] && tb[NFTA_COUNTER_PACKETS]))
                return ERR_PTR(-EINVAL);

        pcpu = netdev_alloc_pcpu_stats(struct nft_stats);
        if (!pcpu)
                return ERR_PTR(-ENOMEM);

        /* Restore old counters on this cpu, no problem. Per-cpu statistics
         * are not exposed to userspace.
         */
        preempt_disable();
        local = this_cpu_ptr(pcpu);
        local->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
        local->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
        preempt_enable();

        return pcpu;
}

/*
 * Swap the per-cpu counters carried by @trans into its base chain.
 *
 * Does nothing when the transaction carries no stats.  Otherwise the chain's
 * stats pointer is replaced and the previous pointer is stored back into the
 * transaction.  If there was no previous pointer, the nft_counters_enabled
 * static branch count is incremented.
 */
static void nft_chain_stats_replace(struct nft_trans *trans)
{
        struct nft_base_chain *chain = nft_base_chain(trans->ctx.chain);

        if (!nft_trans_chain_stats(trans))
                return;

        /* after this, the transaction holds the chain's old stats (or NULL) */
        nft_trans_chain_stats(trans) =
                rcu_replace_pointer(chain->stats, nft_trans_chain_stats(trans),
                                    lockdep_commit_lock_is_held(trans->ctx.net));

        /* the chain had no counters before: account for the new user */
        if (!nft_trans_chain_stats(trans))
                static_branch_inc(&nft_counters_enabled);
}

/*
 * Free the rule blobs of @chain: blob_gen_0 and blob_gen_1 (only once when
 * both point at the same blob) and blob_next.  A non-NULL blob_next triggers
 * a one-time warning but is freed regardless.
 */
static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
{
        struct nft_rule_blob *g0 = rcu_dereference_raw(chain->blob_gen_0);
        struct nft_rule_blob *g1 = rcu_dereference_raw(chain->blob_gen_1);

        /* avoid a double free when both generations share one blob */
        if (g0 != g1)
                kvfree(g1);
        kvfree(g0);

        /* should be NULL either via abort or via successful commit */
        WARN_ON_ONCE(chain->blob_next);
        kvfree(chain->blob_next);
}

/*
 * Release the chain in @ctx->chain.
 *
 * Warns and frees nothing if the chain's use counter is still positive.
 * Otherwise the rule blobs, name and user data are freed.  For a base chain
 * the netdev hooks (if the hook is a netdev one), the chain type module
 * reference and the per-cpu counters are released as well, and the
 * enclosing base chain structure is freed instead of the bare chain.
 */
void nf_tables_chain_destroy(struct nft_ctx *ctx)
{
        struct nft_chain *chain = ctx->chain;
        struct nft_base_chain *basechain;
        struct nft_hook *hook, *tmp;

        if (WARN_ON(chain->use > 0))
                return;

        /* no concurrent access possible anymore */
        nf_tables_chain_free_chain_rules(chain);

        if (!nft_is_base_chain(chain)) {
                kfree(chain->name);
                kfree(chain->udata);
                kfree(chain);
                return;
        }

        basechain = nft_base_chain(chain);

        if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) {
                list_for_each_entry_safe(hook, tmp,
                                         &basechain->hook_list, list) {
                        list_del_rcu(&hook->list);
                        kfree_rcu(hook, rcu);
                }
        }

        module_put(basechain->type->owner);

        if (rcu_access_pointer(basechain->stats)) {
                static_branch_dec(&nft_counters_enabled);
                free_percpu(rcu_dereference_raw(basechain->stats));
        }

        kfree(chain->name);
        kfree(chain->udata);
        kfree(basechain);
}

/* Allocate a netdev hook bound to the device named by @attr.
 *
 * Returns the new hook, ERR_PTR(-ENOMEM) if the allocation fails or
 * ERR_PTR(-ENOENT) if no device with that name exists in @net.
 */
static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
                                              const struct nlattr *attr)
{
        char ifname[IFNAMSIZ];
        struct net_device *dev;
        struct nft_hook *hook;

        hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT);
        if (!hook)
                return ERR_PTR(-ENOMEM);

        nla_strscpy(ifname, attr, IFNAMSIZ);
        /* nf_tables_netdev_event() is called under rtnl_mutex, this is
         * indirectly serializing all the other holders of the commit_mutex with
         * the rtnl_mutex.
         */
        dev = __dev_get_by_name(net, ifname);
        if (!dev) {
                kfree(hook);
                return ERR_PTR(-ENOENT);
        }

        hook->ops.dev = dev;
        return hook;
}

static struct nft_hook *nft_hook_list_find(struct list_head *hook_list,
                                           const struct nft_hook *this)
{
        struct nft_hook *hook;

        list_for_each_entry(hook, hook_list, list) {
                if (this->ops.dev == hook->ops.dev)
                        return hook;
        }

        return NULL;
}

/* Parse a nested list of NFTA_DEVICE_NAME attributes into @hook_list.
 *
 * Each device gets one newly allocated hook. Fails with -EINVAL on an
 * unexpected attribute type, -EEXIST on a device already present in
 * @hook_list, -EFBIG once NFT_NETDEVICE_MAX hooks have been collected, or
 * with the error reported by nft_netdev_hook_alloc(). On failure every hook
 * in @hook_list is freed.
 */
static int nf_tables_parse_netdev_hooks(struct net *net,
                                        const struct nlattr *attr,
                                        struct list_head *hook_list,
                                        struct netlink_ext_ack *extack)
{
        struct nft_hook *hook, *tmp_hook;
        const struct nlattr *nest;
        int remaining, count = 0, err = 0;

        nla_for_each_nested(nest, attr, remaining) {
                if (nla_type(nest) != NFTA_DEVICE_NAME) {
                        err = -EINVAL;
                        break;
                }

                hook = nft_netdev_hook_alloc(net, nest);
                if (IS_ERR(hook)) {
                        NL_SET_BAD_ATTR(extack, nest);
                        err = PTR_ERR(hook);
                        break;
                }

                if (nft_hook_list_find(hook_list, hook)) {
                        NL_SET_BAD_ATTR(extack, nest);
                        kfree(hook);
                        err = -EEXIST;
                        break;
                }

                list_add_tail(&hook->list, hook_list);
                if (++count == NFT_NETDEVICE_MAX) {
                        err = -EFBIG;
                        break;
                }
        }

        if (!err)
                return 0;

        /* undo: drop everything queued on the list so far */
        list_for_each_entry_safe(hook, tmp_hook, hook_list, list) {
                list_del(&hook->list);
                kfree(hook);
        }
        return err;
}

/* Result of parsing a NFTA_CHAIN_HOOK attribute, see nft_chain_parse_hook().
 *
 * num:      hook number taken from NFTA_HOOK_HOOKNUM
 * priority: hook priority taken from NFTA_HOOK_PRIORITY
 * type:     chain type; nft_chain_parse_hook() holds a reference on its
 *           owner module which nft_chain_release_hook() drops
 * list:     struct nft_hook entries parsed for netdev base chains
 */
struct nft_chain_hook {
        u32                                num;
        s32                                priority;
        const struct nft_chain_type        *type;
        struct list_head                list;
};

/* Collect the netdev hooks described by NFTA_HOOK_DEV (single device) or
 * NFTA_HOOK_DEVS (device list) into @hook_list; NFTA_HOOK_DEV wins when both
 * are present. A hardware offload chain without any device is rejected with
 * -EINVAL.
 */
static int nft_chain_parse_netdev(struct net *net, struct nlattr *tb[],
                                  struct list_head *hook_list,
                                  struct netlink_ext_ack *extack, u32 flags)
{
        const struct nlattr *dev = tb[NFTA_HOOK_DEV];
        const struct nlattr *devs = tb[NFTA_HOOK_DEVS];
        struct nft_hook *hook;
        int err;

        if (dev) {
                hook = nft_netdev_hook_alloc(net, dev);
                if (IS_ERR(hook)) {
                        NL_SET_BAD_ATTR(extack, dev);
                        return PTR_ERR(hook);
                }

                list_add_tail(&hook->list, hook_list);
        } else if (devs) {
                err = nf_tables_parse_netdev_hooks(net, devs, hook_list,
                                                   extack);
                if (err < 0)
                        return err;
        }

        if ((flags & NFT_CHAIN_HW_OFFLOAD) && list_empty(hook_list))
                return -EINVAL;

        return 0;
}

/* Parse the nested NFTA_CHAIN_HOOK attribute into @hook.
 *
 * @basechain is NULL when a new base chain is being created; hook number and
 * priority are then mandatory and validated against the chain type. When
 * @basechain is set, hook number and priority are optional but must match the
 * existing chain if given.
 *
 * On success a reference on the chain type's owner module is held and, for
 * netdev base chains, hook->list holds the parsed device hooks; both are
 * undone by nft_chain_release_hook(). Returns 0 or a negative errno.
 */
static int nft_chain_parse_hook(struct net *net,
                                struct nft_base_chain *basechain,
                                const struct nlattr * const nla[],
                                struct nft_chain_hook *hook, u8 family,
                                u32 flags, struct netlink_ext_ack *extack)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nlattr *ha[NFTA_HOOK_MAX + 1];
        const struct nft_chain_type *type;
        int err;

        lockdep_assert_held(&nft_net->commit_mutex);
        lockdep_nfnl_nft_mutex_not_held();

        err = nla_parse_nested_deprecated(ha, NFTA_HOOK_MAX,
                                          nla[NFTA_CHAIN_HOOK],
                                          nft_hook_policy, NULL);
        if (err < 0)
                return err;

        if (!basechain) {
                /* new base chain: hook number and priority are mandatory */
                if (!ha[NFTA_HOOK_HOOKNUM] ||
                    !ha[NFTA_HOOK_PRIORITY]) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
                        return -ENOENT;
                }

                hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
                hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));

                /* start from the family's default type, NFTA_CHAIN_TYPE
                 * overrides it below
                 */
                type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT);
                if (!type)
                        return -EOPNOTSUPP;

                if (nla[NFTA_CHAIN_TYPE]) {
                        type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
                                                           family, true);
                        if (IS_ERR(type)) {
                                NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
                                return PTR_ERR(type);
                        }
                }
                /* the hook number must be in range and enabled in the
                 * type's hook mask
                 */
                if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
                        return -EOPNOTSUPP;

                /* NAT chains need a priority above NF_IP_PRI_CONNTRACK */
                if (type->type == NFT_CHAIN_T_NAT &&
                    hook->priority <= NF_IP_PRI_CONNTRACK)
                        return -EOPNOTSUPP;
        } else {
                /* existing base chain: given values must match the chain */
                if (ha[NFTA_HOOK_HOOKNUM]) {
                        hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
                        if (hook->num != basechain->ops.hooknum)
                                return -EOPNOTSUPP;
                }
                if (ha[NFTA_HOOK_PRIORITY]) {
                        hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
                        if (hook->priority != basechain->ops.priority)
                                return -EOPNOTSUPP;
                }

                if (nla[NFTA_CHAIN_TYPE]) {
                        type = __nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE],
                                                             family);
                        if (!type) {
                                NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
                                return -ENOENT;
                        }
                } else {
                        type = basechain->type;
                }
        }

        /* from here on every error path has to drop this module reference */
        if (!try_module_get(type->owner)) {
                if (nla[NFTA_CHAIN_TYPE])
                        NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
                return -ENOENT;
        }

        hook->type = type;

        INIT_LIST_HEAD(&hook->list);
        if (nft_base_chain_netdev(family, hook->num)) {
                err = nft_chain_parse_netdev(net, ha, &hook->list, extack, flags);
                if (err < 0) {
                        module_put(type->owner);
                        return err;
                }
        } else if (ha[NFTA_HOOK_DEV] || ha[NFTA_HOOK_DEVS]) {
                /* device attributes are only accepted for netdev base chains */
                module_put(type->owner);
                return -EOPNOTSUPP;
        }

        return 0;
}

static void nft_chain_release_hook(struct nft_chain_hook *hook)
{
        struct nft_hook *h, *next;

        list_for_each_entry_safe(h, next, &hook->list, list) {
                list_del(&h->list);
                kfree(h);
        }
        module_put(hook->type->owner);
}

/* Write the trailer rule at @ptr: it is flagged as the last rule and records
 * the chain it belongs to.
 */
static void nft_last_rule(const struct nft_chain *chain, const void *ptr)
{
        struct nft_rule_dp_last *trailer = (struct nft_rule_dp_last *)ptr;

        /* the 'end' marker has to be the first member of the trailer */
        BUILD_BUG_ON(offsetof(struct nft_rule_dp_last, end) != 0);

        /* blob size does not include the trailer rule */
        trailer->chain = chain;
        trailer->end.is_last = 1;
}

/* Allocate a rule blob with room for @size bytes of rule data plus the blob
 * header and the trailer rule. The blob starts out empty (size 0) with the
 * trailer placed at the start of the data area. Returns NULL if @size exceeds
 * INT_MAX or the allocation fails.
 */
static struct nft_rule_blob *nf_tables_chain_alloc_rules(const struct nft_chain *chain,
                                                         unsigned int size)
{
        struct nft_rule_blob *blob = NULL;

        if (size <= INT_MAX) {
                size += sizeof(struct nft_rule_blob) + sizeof(struct nft_rule_dp_last);
                blob = kvmalloc(size, GFP_KERNEL_ACCOUNT);
        }

        if (blob) {
                blob->size = 0;
                nft_last_rule(chain, blob->data);
        }

        return blob;
}

/* Fill in @ops for a base chain hook: family, hook number and priority come
 * from the arguments, the hook function from the chain type's table for that
 * hook number, and @chain is stored as private data.
 */
static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family,
                                    const struct nft_chain_hook *hook,
                                    struct nft_chain *chain)
{
        ops->hook_ops_type = NF_HOOK_OP_NF_TABLES;
        ops->pf = family;
        ops->priv = chain;
        ops->priority = hook->priority;
        ops->hooknum = hook->num;
        ops->hook = hook->type->hooks[hook->num];
}

/* Initialize a freshly allocated base chain from the parsed @hook.
 *
 * For netdev chains the parsed device hooks are moved from hook->list onto
 * the base chain. Returns -EOPNOTSUPP when hardware offload is requested but
 * not supported; the device hooks are then moved back to hook->list.
 */
static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
                              struct nft_chain_hook *hook, u32 flags)
{
        struct nft_chain *chain = &basechain->chain;
        struct nft_hook *dev_hook;

        basechain->type = hook->type;
        INIT_LIST_HEAD(&basechain->hook_list);

        if (nft_base_chain_netdev(family, hook->num)) {
                list_splice_init(&hook->list, &basechain->hook_list);
                list_for_each_entry(dev_hook, &basechain->hook_list, list)
                        nft_basechain_hook_init(&dev_hook->ops, family, hook,
                                                chain);
        }
        nft_basechain_hook_init(&basechain->ops, family, hook, chain);

        basechain->policy = NF_ACCEPT;
        chain->flags |= NFT_CHAIN_BASE | flags;

        if ((chain->flags & NFT_CHAIN_HW_OFFLOAD) &&
            !nft_chain_offload_support(basechain)) {
                /* give the hooks back to the parsed hook description */
                list_splice_init(&basechain->hook_list, &hook->list);
                return -EOPNOTSUPP;
        }

        flow_block_init(&basechain->flow_block);

        return 0;
}

/* Insert @chain into @table: first into the name hash table, then, if that
 * succeeded, at the tail of the table's chain list. Returns 0 or the error
 * from the hash table insertion.
 */
int nft_chain_add(struct nft_table *table, struct nft_chain *chain)
{
        int err = rhltable_insert_key(&table->chains_ht, chain->name,
                                      &chain->rhlhead, nft_chain_ht_params);

        if (!err)
                list_add_tail_rcu(&chain->list, &table->chains);

        return err;
}

/* Counter used by nf_tables_addchain() to build "__chain%llu" names for
 * chains that are created without NFTA_CHAIN_NAME.
 */
static u64 chain_id;

/* Create a new chain in ctx->table and queue a NFT_MSG_NEWCHAIN transaction.
 *
 * With NFTA_CHAIN_HOOK a base chain is allocated and initialized from the
 * parsed hook, otherwise a regular chain. The chain gets a handle, a name
 * (generated for NFT_CHAIN_BINDING chains without NFTA_CHAIN_NAME), optional
 * userdata and an empty rule blob shared by both generations, then it is
 * added to the table and its hooks are registered.
 *
 * Returns 0 on success or a negative errno; on failure everything set up
 * here is undone again.
 */
static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
                              u8 policy, u32 flags,
                              struct netlink_ext_ack *extack)
{
        const struct nlattr * const *nla = ctx->nla;
        struct nft_table *table = ctx->table;
        struct nft_base_chain *basechain;
        struct net *net = ctx->net;
        char name[NFT_NAME_MAXLEN];
        struct nft_rule_blob *blob;
        struct nft_trans *trans;
        struct nft_chain *chain;
        int err;

        if (nla[NFTA_CHAIN_HOOK]) {
                struct nft_stats __percpu *stats = NULL;
                struct nft_chain_hook hook = {};

                if (table->flags & __NFT_TABLE_F_UPDATE)
                        return -EINVAL;

                if (flags & NFT_CHAIN_BINDING)
                        return -EOPNOTSUPP;

                /* on success this holds a module reference and possibly
                 * netdev hooks, both dropped by nft_chain_release_hook()
                 */
                err = nft_chain_parse_hook(net, NULL, nla, &hook, family, flags,
                                           extack);
                if (err < 0)
                        return err;

                basechain = kzalloc(sizeof(*basechain), GFP_KERNEL_ACCOUNT);
                if (basechain == NULL) {
                        nft_chain_release_hook(&hook);
                        return -ENOMEM;
                }
                chain = &basechain->chain;

                if (nla[NFTA_CHAIN_COUNTERS]) {
                        stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
                        if (IS_ERR(stats)) {
                                nft_chain_release_hook(&hook);
                                kfree(basechain);
                                return PTR_ERR(stats);
                        }
                        rcu_assign_pointer(basechain->stats, stats);
                }

                err = nft_basechain_init(basechain, family, &hook, flags);
                if (err < 0) {
                        nft_chain_release_hook(&hook);
                        kfree(basechain);
                        free_percpu(stats);
                        return err;
                }
                if (stats)
                        static_branch_inc(&nft_counters_enabled);
        } else {
                /* no hook: these flags only make sense for base chains */
                if (flags & NFT_CHAIN_BASE)
                        return -EINVAL;
                if (flags & NFT_CHAIN_HW_OFFLOAD)
                        return -EOPNOTSUPP;

                chain = kzalloc(sizeof(*chain), GFP_KERNEL_ACCOUNT);
                if (chain == NULL)
                        return -ENOMEM;

                chain->flags = flags;
        }
        /* nf_tables_chain_destroy() below operates on ctx->chain */
        ctx->chain = chain;

        INIT_LIST_HEAD(&chain->rules);
        chain->handle = nf_tables_alloc_handle(table);
        chain->table = table;

        if (nla[NFTA_CHAIN_NAME]) {
                chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
        } else {
                /* only binding chains may be created without a name */
                if (!(flags & NFT_CHAIN_BINDING)) {
                        err = -EINVAL;
                        goto err_destroy_chain;
                }

                snprintf(name, sizeof(name), "__chain%llu", ++chain_id);
                chain->name = kstrdup(name, GFP_KERNEL_ACCOUNT);
        }

        if (!chain->name) {
                err = -ENOMEM;
                goto err_destroy_chain;
        }

        if (nla[NFTA_CHAIN_USERDATA]) {
                chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL_ACCOUNT);
                if (chain->udata == NULL) {
                        err = -ENOMEM;
                        goto err_destroy_chain;
                }
                chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]);
        }

        blob = nf_tables_chain_alloc_rules(chain, 0);
        if (!blob) {
                err = -ENOMEM;
                goto err_destroy_chain;
        }

        /* both generations start out sharing the same empty blob */
        RCU_INIT_POINTER(chain->blob_gen_0, blob);
        RCU_INIT_POINTER(chain->blob_gen_1, blob);

        if (!nft_use_inc(&table->use)) {
                err = -EMFILE;
                goto err_destroy_chain;
        }

        trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto err_trans;
        }

        nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET;
        if (nft_is_base_chain(chain))
                nft_trans_chain_policy(trans) = policy;

        err = nft_chain_add(table, chain);
        if (err < 0)
                goto err_chain_add;

        /* This must be LAST to ensure no packets are walking over this chain. */
        err = nf_tables_register_hook(net, table, chain);
        if (err < 0)
                goto err_register_hook;

        return 0;

        /* unwind in reverse order of setup */
err_register_hook:
        nft_chain_del(chain);
err_chain_add:
        nft_trans_destroy(trans);
err_trans:
        nft_use_dec_restore(&table->use);
err_destroy_chain:
        nf_tables_chain_destroy(ctx);

        return err;
}

/* Update the existing chain ctx->chain as part of the current transaction.
 *
 * The chain flags must be unchanged. An optional NFTA_CHAIN_HOOK is parsed
 * and checked against the base chain; for netdev base chains, devices that
 * are not yet attached to the chain are collected as new hooks and, unless
 * the table is dormant, registered right away. A rename (NFTA_CHAIN_HANDLE
 * together with NFTA_CHAIN_NAME) is refused if the name is already taken by
 * an existing chain or by a pending rename in the commit list.
 *
 * On success a NFT_MSG_NEWCHAIN update transaction carrying the new policy,
 * counters, name and hooks is queued and 0 is returned. On failure a negative
 * errno is returned and the hooks parsed here are freed; they are unregistered
 * first only if this function actually registered them.
 */
static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
                              u32 flags, const struct nlattr *attr,
                              struct netlink_ext_ack *extack)
{
        const struct nlattr * const *nla = ctx->nla;
        struct nft_base_chain *basechain = NULL;
        struct nft_table *table = ctx->table;
        struct nft_chain *chain = ctx->chain;
        struct nft_chain_hook hook = {};
        struct nft_stats *stats = NULL;
        struct nft_hook *h, *next;
        struct nf_hook_ops *ops;
        struct nft_trans *trans;
        bool unregister = false;
        int err;

        /* chain flags cannot be changed by an update */
        if (chain->flags ^ flags)
                return -EOPNOTSUPP;

        INIT_LIST_HEAD(&hook.list);

        if (nla[NFTA_CHAIN_HOOK]) {
                if (!nft_is_base_chain(chain)) {
                        NL_SET_BAD_ATTR(extack, attr);
                        return -EEXIST;
                }

                basechain = nft_base_chain(chain);
                err = nft_chain_parse_hook(ctx->net, basechain, nla, &hook,
                                           ctx->family, flags, extack);
                if (err < 0)
                        return err;

                if (basechain->type != hook.type) {
                        nft_chain_release_hook(&hook);
                        NL_SET_BAD_ATTR(extack, attr);
                        return -EEXIST;
                }

                if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) {
                        list_for_each_entry_safe(h, next, &hook.list, list) {
                                h->ops.pf        = basechain->ops.pf;
                                h->ops.hooknum        = basechain->ops.hooknum;
                                h->ops.priority        = basechain->ops.priority;
                                h->ops.priv        = basechain->ops.priv;
                                h->ops.hook        = basechain->ops.hook;

                                /* device already attached to this chain:
                                 * nothing new to add for it
                                 */
                                if (nft_hook_list_find(&basechain->hook_list, h)) {
                                        list_del(&h->list);
                                        kfree(h);
                                }
                        }
                } else {
                        ops = &basechain->ops;
                        if (ops->hooknum != hook.num ||
                            ops->priority != hook.priority) {
                                nft_chain_release_hook(&hook);
                                NL_SET_BAD_ATTR(extack, attr);
                                return -EEXIST;
                        }
                }
        }

        /* rename request: the new name must not belong to another chain */
        if (nla[NFTA_CHAIN_HANDLE] &&
            nla[NFTA_CHAIN_NAME]) {
                struct nft_chain *chain2;

                chain2 = nft_chain_lookup(ctx->net, table,
                                          nla[NFTA_CHAIN_NAME], genmask);
                if (!IS_ERR(chain2)) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
                        err = -EEXIST;
                        goto err_hooks;
                }
        }

        if (table->flags & __NFT_TABLE_F_UPDATE &&
            !list_empty(&hook.list)) {
                NL_SET_BAD_ATTR(extack, attr);
                err = -EOPNOTSUPP;
                goto err_hooks;
        }

        if (!(table->flags & NFT_TABLE_F_DORMANT) &&
            nft_is_base_chain(chain) &&
            !list_empty(&hook.list)) {
                basechain = nft_base_chain(chain);
                ops = &basechain->ops;

                if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
                        err = nft_netdev_register_hooks(ctx->net, &hook.list);
                        if (err < 0)
                                goto err_hooks;

                        /* Only hooks registered above may be unregistered on
                         * the error path; for a dormant table nothing was
                         * registered and the flag must stay false.
                         */
                        unregister = true;
                }
        }

        if (nla[NFTA_CHAIN_COUNTERS]) {
                if (!nft_is_base_chain(chain)) {
                        err = -EOPNOTSUPP;
                        goto err_hooks;
                }

                stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
                if (IS_ERR(stats)) {
                        err = PTR_ERR(stats);
                        goto err_hooks;
                }
        }

        err = -ENOMEM;
        trans = nft_trans_alloc(ctx, NFT_MSG_NEWCHAIN,
                                sizeof(struct nft_trans_chain));
        if (trans == NULL)
                goto err_trans;

        nft_trans_chain_stats(trans) = stats;
        nft_trans_chain_update(trans) = true;

        if (nla[NFTA_CHAIN_POLICY])
                nft_trans_chain_policy(trans) = policy;
        else
                nft_trans_chain_policy(trans) = -1;

        if (nla[NFTA_CHAIN_HANDLE] &&
            nla[NFTA_CHAIN_NAME]) {
                struct nftables_pernet *nft_net = nft_pernet(ctx->net);
                struct nft_trans *tmp;
                char *name;

                err = -ENOMEM;
                name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
                if (!name)
                        goto err_trans;

                /* refuse a name already claimed by a pending rename in
                 * this table
                 */
                err = -EEXIST;
                list_for_each_entry(tmp, &nft_net->commit_list, list) {
                        if (tmp->msg_type == NFT_MSG_NEWCHAIN &&
                            tmp->ctx.table == table &&
                            nft_trans_chain_update(tmp) &&
                            nft_trans_chain_name(tmp) &&
                            strcmp(name, nft_trans_chain_name(tmp)) == 0) {
                                NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
                                kfree(name);
                                goto err_trans;
                        }
                }

                nft_trans_chain_name(trans) = name;
        }

        /* the transaction takes over the new hooks; the module reference
         * taken while parsing is no longer needed
         */
        nft_trans_basechain(trans) = basechain;
        INIT_LIST_HEAD(&nft_trans_chain_hooks(trans));
        list_splice(&hook.list, &nft_trans_chain_hooks(trans));
        if (nla[NFTA_CHAIN_HOOK])
                module_put(hook.type->owner);

        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;

err_trans:
        free_percpu(stats);
        kfree(trans);
err_hooks:
        if (nla[NFTA_CHAIN_HOOK]) {
                list_for_each_entry_safe(h, next, &hook.list, list) {
                        if (unregister)
                                nf_unregister_net_hook(ctx->net, &h->ops);
                        list_del(&h->list);
                        kfree_rcu(h, rcu);
                }
                module_put(hook.type->owner);
        }

        return err;
}

/* Look up a chain by the transaction id carried in @nla.
 *
 * Scans the pending commit list for a NFT_MSG_NEWCHAIN transaction with that
 * id whose chain belongs to @table and is active in @genmask. Returns the
 * chain or ERR_PTR(-ENOENT).
 */
static struct nft_chain *nft_chain_lookup_byid(const struct net *net,
                                               const struct nft_table *table,
                                               const struct nlattr *nla, u8 genmask)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        u32 id = ntohl(nla_get_be32(nla));
        struct nft_trans *trans;

        list_for_each_entry(trans, &nft_net->commit_list, list) {
                struct nft_chain *chain = trans->ctx.chain;

                if (trans->msg_type != NFT_MSG_NEWCHAIN)
                        continue;
                if (chain->table != table)
                        continue;
                if (nft_trans_chain_id(trans) != id)
                        continue;
                if (!nft_active_genmask(chain, genmask))
                        continue;

                return chain;
        }

        return ERR_PTR(-ENOENT);
}

/* Handle a request to create or update a chain.
 *
 * Looks up the table, resolves an existing chain by NFTA_CHAIN_HANDLE or
 * NFTA_CHAIN_NAME, validates the optional policy and flags, and then
 * dispatches to nf_tables_updchain() when the chain exists or to
 * nf_tables_addchain() when it does not. Returns 0 or a negative errno.
 */
static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info,
                              const struct nlattr * const nla[])
{
        struct nftables_pernet *nft_net = nft_pernet(info->net);
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct nft_chain *chain = NULL;
        struct net *net = info->net;
        const struct nlattr *attr;
        struct nft_table *table;
        u8 policy = NF_ACCEPT;
        struct nft_ctx ctx;
        u64 handle = 0;
        u32 flags = 0;

        lockdep_assert_held(&nft_net->commit_mutex);

        table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask,
                                 NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
                return PTR_ERR(table);
        }

        chain = NULL;
        attr = nla[NFTA_CHAIN_NAME];

        if (nla[NFTA_CHAIN_HANDLE]) {
                /* a handle must refer to an existing chain */
                handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
                chain = nft_chain_lookup_byhandle(table, handle, genmask);
                if (IS_ERR(chain)) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_HANDLE]);
                        return PTR_ERR(chain);
                }
                attr = nla[NFTA_CHAIN_HANDLE];
        } else if (nla[NFTA_CHAIN_NAME]) {
                /* by name, -ENOENT just means that a new chain is created */
                chain = nft_chain_lookup(net, table, attr, genmask);
                if (IS_ERR(chain)) {
                        if (PTR_ERR(chain) != -ENOENT) {
                                NL_SET_BAD_ATTR(extack, attr);
                                return PTR_ERR(chain);
                        }
                        chain = NULL;
                }
        } else if (!nla[NFTA_CHAIN_ID]) {
                return -EINVAL;
        }

        if (nla[NFTA_CHAIN_POLICY]) {
                /* a policy is only accepted for base chains, existing or
                 * about to be created with a hook
                 */
                if (chain != NULL &&
                    !nft_is_base_chain(chain)) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]);
                        return -EOPNOTSUPP;
                }

                if (chain == NULL &&
                    nla[NFTA_CHAIN_HOOK] == NULL) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]);
                        return -EOPNOTSUPP;
                }

                policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY]));
                switch (policy) {
                case NF_DROP:
                case NF_ACCEPT:
                        break;
                default:
                        return -EINVAL;
                }
        }

        /* without NFTA_CHAIN_FLAGS an existing chain keeps its flags */
        if (nla[NFTA_CHAIN_FLAGS])
                flags = ntohl(nla_get_be32(nla[NFTA_CHAIN_FLAGS]));
        else if (chain)
                flags = chain->flags;

        if (flags & ~NFT_CHAIN_FLAGS)
                return -EOPNOTSUPP;

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);

        if (chain != NULL) {
                /* binding chains cannot be updated */
                if (chain->flags & NFT_CHAIN_BINDING)
                        return -EINVAL;

                if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
                        NL_SET_BAD_ATTR(extack, attr);
                        return -EEXIST;
                }
                if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
                        return -EOPNOTSUPP;

                /* carry over NFT_CHAIN_BASE from the existing chain */
                flags |= chain->flags & NFT_CHAIN_BASE;
                return nf_tables_updchain(&ctx, genmask, policy, flags, attr,
                                          extack);
        }

        return nf_tables_addchain(&ctx, family, genmask, policy, flags, extack);
}

/* Queue the removal of individual hooks from a base chain.
 *
 * The hooks named in NFTA_CHAIN_HOOK are looked up on the base chain's hook
 * list and moved to a NFT_MSG_DELCHAIN update transaction. If any of them is
 * not attached to the chain, -ENOENT is returned and the hooks moved so far
 * are put back on the chain. Returns 0 or a negative errno.
 */
static int nft_delchain_hook(struct nft_ctx *ctx,
                             struct nft_base_chain *basechain,
                             struct netlink_ext_ack *extack)
{
        const struct nft_chain *chain = &basechain->chain;
        const struct nlattr * const *nla = ctx->nla;
        struct nft_chain_hook chain_hook = {};
        struct nft_hook *this, *hook;
        LIST_HEAD(chain_del_list);
        struct nft_trans *trans;
        int err;

        if (ctx->table->flags & __NFT_TABLE_F_UPDATE)
                return -EOPNOTSUPP;

        err = nft_chain_parse_hook(ctx->net, basechain, nla, &chain_hook,
                                   ctx->family, chain->flags, extack);
        if (err < 0)
                return err;

        /* move the chain's own hook for each requested device onto the
         * deletion list; the parsed entries only serve as lookup keys
         */
        list_for_each_entry(this, &chain_hook.list, list) {
                hook = nft_hook_list_find(&basechain->hook_list, this);
                if (!hook) {
                        err = -ENOENT;
                        goto err_chain_del_hook;
                }
                list_move(&hook->list, &chain_del_list);
        }

        trans = nft_trans_alloc(ctx, NFT_MSG_DELCHAIN,
                                sizeof(struct nft_trans_chain));
        if (!trans) {
                err = -ENOMEM;
                goto err_chain_del_hook;
        }

        nft_trans_basechain(trans) = basechain;
        nft_trans_chain_update(trans) = true;
        INIT_LIST_HEAD(&nft_trans_chain_hooks(trans));
        list_splice(&chain_del_list, &nft_trans_chain_hooks(trans));
        /* frees the parsed lookup entries and drops the module reference */
        nft_chain_release_hook(&chain_hook);

        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;

err_chain_del_hook:
        /* give the hooks moved so far back to the base chain */
        list_splice(&chain_del_list, &basechain->hook_list);
        nft_chain_release_hook(&chain_hook);

        return err;
}

/* Handle a request to delete a chain, looked up by handle or by name.
 *
 * A missing chain is not an error for NFT_MSG_DESTROYCHAIN. With
 * NFTA_CHAIN_HOOK on a netdev base chain only the given hooks are removed,
 * see nft_delchain_hook(). Otherwise all rules active in the next generation
 * are deleted and then the chain itself; -EBUSY is returned if the chain is
 * still referenced beyond its own rules. Returns 0 or a negative errno.
 */
static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info,
                              const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        const struct nlattr *attr;
        struct nft_table *table;
        struct nft_chain *chain;
        struct nft_rule *rule;
        struct nft_ctx ctx;
        u64 handle;
        u32 use;
        int err;

        table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask,
                                 NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
                return PTR_ERR(table);
        }

        if (nla[NFTA_CHAIN_HANDLE]) {
                attr = nla[NFTA_CHAIN_HANDLE];
                handle = be64_to_cpu(nla_get_be64(attr));
                chain = nft_chain_lookup_byhandle(table, handle, genmask);
        } else {
                attr = nla[NFTA_CHAIN_NAME];
                chain = nft_chain_lookup(net, table, attr, genmask);
        }
        if (IS_ERR(chain)) {
                /* destroying a chain that does not exist succeeds */
                if (PTR_ERR(chain) == -ENOENT &&
                    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN)
                        return 0;

                NL_SET_BAD_ATTR(extack, attr);
                return PTR_ERR(chain);
        }

        if (nft_chain_binding(chain))
                return -EOPNOTSUPP;

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);

        if (nla[NFTA_CHAIN_HOOK]) {
                if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN ||
                    chain->flags & NFT_CHAIN_HW_OFFLOAD)
                        return -EOPNOTSUPP;

                if (nft_is_base_chain(chain)) {
                        struct nft_base_chain *basechain = nft_base_chain(chain);

                        /* netdev base chain: remove only the given hooks */
                        if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
                                return nft_delchain_hook(&ctx, basechain, extack);
                }
        }

        if (info->nlh->nlmsg_flags & NLM_F_NONREC &&
            chain->use > 0)
                return -EBUSY;

        /* count down one reference per rule deleted below; whatever is
         * left afterwards is held by something other than these rules
         */
        use = chain->use;
        list_for_each_entry(rule, &chain->rules, list) {
                if (!nft_is_active_next(net, rule))
                        continue;
                use--;

                err = nft_delrule(&ctx, rule);
                if (err < 0)
                        return err;
        }

        /* There are rules and elements that are still holding references to us,
         * we cannot do a recursive removal in this case.
         */
        if (use > 0) {
                NL_SET_BAD_ATTR(extack, attr);
                return -EBUSY;
        }

        return nft_delchain(&ctx);
}

/*
 * Expressions
 */

/**
 *        nft_register_expr - add an expression type to the nf_tables registry
 *        @type: expression type to register
 *
 *        Family-specific types are inserted at the head of the expression
 *        list, NFPROTO_UNSPEC types at its tail. Returns zero on success or
 *        -ENOMEM if @type declares more attributes than NFT_EXPR_MAXATTR.
 */
int nft_register_expr(struct nft_expr_type *type)
{
        if (WARN_ON_ONCE(type->maxattr > NFT_EXPR_MAXATTR))
                return -ENOMEM;

        nfnl_lock(NFNL_SUBSYS_NFTABLES);
        if (type->family != NFPROTO_UNSPEC)
                list_add_rcu(&type->list, &nf_tables_expressions);
        else
                list_add_tail_rcu(&type->list, &nf_tables_expressions);
        nfnl_unlock(NFNL_SUBSYS_NFTABLES);

        return 0;
}
EXPORT_SYMBOL_GPL(nft_register_expr);

/**
 *        nft_unregister_expr - unregister nf_tables expr type
 *        @type: expr type
 *
 *        Unregisters the expr type for use with nf_tables. The type is
 *        unlinked with list_del_rcu() while the nfnetlink nftables
 *        subsystem lock is held.
 */
void nft_unregister_expr(struct nft_expr_type *type)
{
        nfnl_lock(NFNL_SUBSYS_NFTABLES);
        list_del_rcu(&type->list);
        nfnl_unlock(NFNL_SUBSYS_NFTABLES);
}
EXPORT_SYMBOL_GPL(nft_unregister_expr);

/*
 * Look up the registered expression type named by @nla for @family.
 *
 * The whole nf_tables_expressions list is scanned with the RCU list
 * iterator. A type registered for exactly @family always becomes the
 * result; a type with no family set is only taken while nothing else has
 * matched yet. Returns NULL when no registered type has that name.
 */
static const struct nft_expr_type *__nft_expr_type_get(u8 family,
                                                       struct nlattr *nla)
{
        const struct nft_expr_type *cur, *match = NULL;

        list_for_each_entry_rcu(cur, &nf_tables_expressions, list) {
                if (nla_strcmp(nla, cur->name) != 0)
                        continue;

                if ((!cur->family && !match) || cur->family == family)
                        match = cur;
        }

        return match;
}

#ifdef CONFIG_MODULES
/*
 * Request the family-specific expression module, using the module alias
 * format "nft-expr-<family>-<name>" with the name taken from @nla.
 *
 * Only -EAGAIN from nft_request_module() is propagated to the caller; every
 * other result is reported as 0.
 */
static int nft_expr_type_request_module(struct net *net, u8 family,
                                        struct nlattr *nla)
{
        int ret;

        ret = nft_request_module(net, "nft-expr-%u-%.*s", family,
                                 nla_len(nla), (char *)nla_data(nla));

        return ret == -EAGAIN ? -EAGAIN : 0;
}
#endif

/*
 * Resolve the expression type named by @nla and take a reference on the
 * module that owns it.
 *
 * Returns the type on success. Otherwise returns an ERR_PTR:
 *   -EINVAL  @nla is NULL,
 *   -EAGAIN  a module request (family-specific or generic alias) returned
 *            -EAGAIN (only attempted with CONFIG_MODULES and when no type
 *            was found at all),
 *   -ENOENT  in every other case, including a type whose module reference
 *            could not be taken.
 */
static const struct nft_expr_type *nft_expr_type_get(struct net *net,
                                                     u8 family,
                                                     struct nlattr *nla)
{
        const struct nft_expr_type *type;
        bool pinned;

        if (!nla)
                return ERR_PTR(-EINVAL);

        /* The lookup and the module pin both happen under RCU protection. */
        rcu_read_lock();
        type = __nft_expr_type_get(family, nla);
        pinned = type && try_module_get(type->owner);
        rcu_read_unlock();

        if (pinned)
                return type;

        lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
        if (!type) {
                /* Family-specific alias first, then the generic one. */
                if (nft_expr_type_request_module(net, family, nla) == -EAGAIN ||
                    nft_request_module(net, "nft-expr-%.*s",
                                       nla_len(nla),
                                       (char *)nla_data(nla)) == -EAGAIN)
                        return ERR_PTR(-EAGAIN);
        }
#endif
        return ERR_PTR(-ENOENT);
}

/*
 * Netlink attribute policy for a single expression nest: a name string whose
 * length is bounded by NFT_MODULE_AUTOLOAD_LIMIT, and a nested attribute
 * carrying the expression-specific data.
 */
static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
        [NFTA_EXPR_NAME]        = { .type = NLA_STRING,
                                    .len = NFT_MODULE_AUTOLOAD_LIMIT },
        [NFTA_EXPR_DATA]        = { .type = NLA_NESTED },
};

/*
 * Emit the attributes describing @expr into @skb: the expression type name
 * and, when the expression ops provide a ->dump callback, an NFTA_EXPR_DATA
 * nest filled in by that callback. @reset is forwarded to ->dump unchanged.
 *
 * Returns skb->len on success and -1 if any attribute could not be added.
 */
static int nf_tables_fill_expr_info(struct sk_buff *skb,
                                    const struct nft_expr *expr, bool reset)
{
        if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name))
                goto nla_put_failure;

        if (expr->ops->dump) {
                struct nlattr *data = nla_nest_start_noflag(skb,
                                                            NFTA_EXPR_DATA);
                if (data == NULL)
                        goto nla_put_failure;
                if (expr->ops->dump(skb, expr, reset) < 0)
                        goto nla_put_failure;
                nla_nest_end(skb, data);
        }

        return skb->len;

nla_put_failure:
        return -1;
}

/*
 * Dump @expr into @skb wrapped in a nest of attribute type @attr.
 *
 * @reset is passed through to nf_tables_fill_expr_info(). Returns 0 on
 * success and -1 if the nest could not be started or filled.
 */
int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
                  const struct nft_expr *expr, bool reset)
{
        struct nlattr *nest = nla_nest_start_noflag(skb, attr);

        if (nest == NULL ||
            nf_tables_fill_expr_info(skb, expr, reset) < 0)
                return -1;

        nla_nest_end(skb, nest);

        return 0;
}

/*
 * Result of parsing one expression attribute: the ops selected for the
 * expression, the netlink attribute it was parsed from, and the table of
 * expression-specific attributes (sized for the largest allowed maxattr).
 */
struct nft_expr_info {
        const struct nft_expr_ops        *ops;
        const struct nlattr                *attr;
        struct nlattr                        *tb[NFT_EXPR_MAXATTR + 1];
};

/*
 * Parse the expression attribute @nla into @info.
 *
 * Resolves the expression type (taking a module reference through
 * nft_expr_type_get()), parses the type-specific attributes into info->tb
 * (or zeroes the used part of that table when NFTA_EXPR_DATA is absent) and
 * picks the ops, either through the type's ->select_ops callback or from
 * type->ops.
 *
 * Returns 0 on success with info->attr and info->ops set. On failure after
 * the type was resolved, the module reference is dropped again and a
 * negative error is returned. With CONFIG_MODULES, an -EAGAIN from
 * ->select_ops triggers a family-specific module request; unless that
 * request itself returns -EAGAIN, the error becomes -ENOENT.
 */
static int nf_tables_expr_parse(const struct nft_ctx *ctx,
                                const struct nlattr *nla,
                                struct nft_expr_info *info)
{
        struct nlattr *tb[NFTA_EXPR_MAX + 1];
        const struct nft_expr_type *type;
        const struct nft_expr_ops *ops;
        int err;

        err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla,
                                          nft_expr_policy, NULL);
        if (err < 0)
                return err;

        type = nft_expr_type_get(ctx->net, ctx->family, tb[NFTA_EXPR_NAME]);
        if (IS_ERR(type))
                return PTR_ERR(type);

        if (!tb[NFTA_EXPR_DATA]) {
                memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1));
        } else {
                err = nla_parse_nested_deprecated(info->tb, type->maxattr,
                                                  tb[NFTA_EXPR_DATA],
                                                  type->policy, NULL);
                if (err < 0)
                        goto err_put_type;
        }

        if (!type->select_ops) {
                ops = type->ops;
        } else {
                ops = type->select_ops(ctx,
                                       (const struct nlattr * const *)info->tb);
                if (IS_ERR(ops)) {
                        err = PTR_ERR(ops);
#ifdef CONFIG_MODULES
                        if (err == -EAGAIN &&
                            nft_expr_type_request_module(ctx->net,
                                                         ctx->family,
                                                         tb[NFTA_EXPR_NAME]) != -EAGAIN)
                                err = -ENOENT;
#endif
                        goto err_put_type;
                }
        }

        info->attr = nla;
        info->ops = ops;

        return 0;

err_put_type:
        module_put(type->owner);
        return err;
}

/*
 * Parse an inner expression attribute @nla into @info.
 *
 * Both NFTA_EXPR_NAME and NFTA_EXPR_DATA are mandatory here. The type is
 * looked up by name and must provide ->inner_ops; its attributes are parsed
 * into info->tb and info->ops is set to the type's inner ops.
 *
 * Returns 0 on success, -EINVAL if a mandatory attribute is missing,
 * -ENOENT if no such type is registered, -EOPNOTSUPP if the type has no
 * inner ops, or the error from attribute parsing.
 */
int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla,
                         struct nft_expr_info *info)
{
        struct nlattr *tb[NFTA_EXPR_MAX + 1];
        const struct nft_expr_type *type;
        int err;

        err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla,
                                          nft_expr_policy, NULL);
        if (err < 0)
                return err;

        if (!tb[NFTA_EXPR_DATA] || !tb[NFTA_EXPR_NAME])
                return -EINVAL;

        /*
         * __nft_expr_type_get() walks the expression list with the RCU list
         * iterator, so the lookup and every dereference of the returned type
         * must sit inside an RCU read-side critical section.
         */
        rcu_read_lock();

        type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]);
        if (!type) {
                err = -ENOENT;
                goto out_unlock;
        }

        if (!type->inner_ops) {
                err = -EOPNOTSUPP;
                goto out_unlock;
        }

        err = nla_parse_nested_deprecated(info->tb, type->maxattr,
                                          tb[NFTA_EXPR_DATA],
                                          type->policy, NULL);
        if (err < 0)
                goto out_unlock;

        info->attr = nla;
        /*
         * NOTE(review): no module reference is taken on type->owner, yet
         * info->ops outlives the critical section. This presumably relies on
         * types providing inner_ops never being unloaded - confirm.
         */
        info->ops = type->inner_ops;
        err = 0;

out_unlock:
        rcu_read_unlock();
        return err;
}

/*
 * Bind @expr to the ops chosen in @expr_info and run the optional ->init
 * callback with the parsed attribute table.
 *
 * Returns 0 on success. If ->init fails, expr->ops is reset to NULL and the
 * negative error from ->init is returned.
 */
static int nf_tables_newexpr(const struct nft_ctx *ctx,
                             const struct nft_expr_info *expr_info,
                             struct nft_expr *expr)
{
        const struct nft_expr_ops *ops = expr_info->ops;
        int err;

        expr->ops = ops;
        if (!ops->init)
                return 0;

        err = ops->init(ctx, expr, (const struct nlattr **)expr_info->tb);
        if (err < 0) {
                expr->ops = NULL;
                return err;
        }

        return 0;
}

static void nf_tables_expr_destroy(const struct nft_ctx *ctx,
                                   struct nft_expr *expr)
{
        const struct nft_expr_type *type = expr->ops->type;

        if (expr->ops->destroy)
                expr->ops->destroy(ctx, expr);
        module_put(type->owner);
}

static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
                                      const struct nlattr *nla)
{
        struct nft_expr_info expr_info;
        struct nft_expr *expr;
        struct module *owner;
        int err;

        err = nf_tables_expr_parse(ctx, nla, &expr_info);
        if (err < 0)
                goto err_expr_parse;

        err = -EOPNOTSUPP;
        if (!(expr_info.ops->type->flags & NFT_EXPR_STATEFUL))
                goto err_expr_stateful;

        err = -ENOMEM;
        expr = kzalloc(expr_info.ops->size, GFP_KERNEL_ACCOUNT);
        if (expr == NULL)
                goto err_expr_stateful;

        err = nf_tables_newexpr(ctx, &expr_info, expr);
        if (err < 0)
                goto err_expr_new;

        return expr;
err_expr_new:
        kfree(expr);
err_expr_stateful:
        owner = expr_info.ops->type->owner;
        if (expr_info.ops->type->release_ops)
                expr_info.ops->type->release_ops(expr_info.ops);

        module_put(owner);
err_expr_parse:
        return ERR_PTR(err);
}

/*
 * Clone expression @src into the caller-provided @dst using the ops' ->clone
 * callback, allocating with @gfp where the callback needs to.
 *
 * Returns 0 on success, in which case an extra reference on the module
 * owning the expression type has been taken. Returns -EINVAL (with a
 * one-time warning) if the ops have no ->clone callback, or the negative
 * error from ->clone. dst->ops is already set when ->clone is called and is
 * left set if it fails.
 */
int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp)
{
        const struct nft_expr_ops *ops = src->ops;
        int err;

        if (WARN_ON_ONCE(!ops->clone))
                return -EINVAL;

        dst->ops = ops;
        err = ops->clone(dst, src, gfp);
        if (err < 0)
                return err;

        __module_get(ops->type->owner);

        return 0;
}

/*
 * Destroy a separately allocated expression: run the expression teardown
 * (->destroy callback and module reference drop) and then free @expr itself.
 */
void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
{
        nf_tables_expr_destroy(ctx, expr);
        kfree(expr);
}

/*
 * Rules
 */

/*
 * Find the rule with the given @handle in @chain.
 *
 * This is a linear scan over chain->rules using the RCU list iterator.
 * Returns the first rule whose handle matches, or ERR_PTR(-ENOENT).
 */
static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain,
                                          u64 handle)
{
        struct nft_rule *found = ERR_PTR(-ENOENT);
        struct nft_rule *cur;

        list_for_each_entry_rcu(cur, &chain->rules, list) {
                if (cur->handle != handle)
                        continue;

                found = cur;
                break;
        }

        return found;
}

static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain,
                                        const struct nlattr *nla)
{
        if (nla == NULL)
                return ERR_PTR(-EINVAL);

        return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla)));
}

/*
 * Netlink attribute policy for rule messages. Table and chain names are
 * strings bounded by their respective maximum name lengths, handle and
 * position are 64-bit values, the expression list is a nested array checked
 * against nft_expr_policy, user data is a binary blob of at most
 * NFT_USERDATA_MAXLEN bytes, and the rule/position/chain IDs are 32-bit.
 */
static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
        [NFTA_RULE_TABLE]        = { .type = NLA_STRING,
                                    .len = NFT_TABLE_MAXNAMELEN - 1 },
        [NFTA_RULE_CHAIN]        = { .type = NLA_STRING,
                                    .len = NFT_CHAIN_MAXNAMELEN - 1 },
        [NFTA_RULE_HANDLE]        = { .type = NLA_U64 },
        [NFTA_RULE_EXPRESSIONS]        = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
        [NFTA_RULE_COMPAT]        = { .type = NLA_NESTED },
        [NFTA_RULE_POSITION]        = { .type = NLA_U64 },
        [NFTA_RULE_USERDATA]        = { .type = NLA_BINARY,
                                    .len = NFT_USERDATA_MAXLEN },
        [NFTA_RULE_ID]                = { .type = NLA_U32 },
        [NFTA_RULE_POSITION_ID]        = { .type = NLA_U32 },
        [NFTA_RULE_CHAIN_ID]        = { .type = NLA_U32 },
};

/*
 * Build one rule message in @skb.
 *
 * The message carries the table name, chain name and rule handle, an
 * optional NFTA_RULE_POSITION (only when @handle is non-zero and @event is
 * not NFT_MSG_DELRULE), the nested list of expressions and, if the rule has
 * any, its user data. For chains flagged NFT_CHAIN_HW_OFFLOAD,
 * nft_flow_rule_stats() is called before the expressions are dumped.
 * @reset is forwarded to each expression dump.
 *
 * Returns 0 on success. On failure the message is trimmed back to its start
 * and -1 is returned.
 */
static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
                                    u32 portid, u32 seq, int event,
                                    u32 flags, int family,
                                    const struct nft_table *table,
                                    const struct nft_chain *chain,
                                    const struct nft_rule *rule, u64 handle,
                                    bool reset)
{
        u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
        const struct nft_expr *expr, *next;
        struct nlattr *exprs;
        struct nlmsghdr *nlh;

        nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0,
                           nft_base_seq(net));
        if (!nlh)
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_RULE_TABLE, table->name) ||
            nla_put_string(skb, NFTA_RULE_CHAIN, chain->name) ||
            nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle),
                         NFTA_RULE_PAD))
                goto nla_put_failure;

        if (event != NFT_MSG_DELRULE && handle &&
            nla_put_be64(skb, NFTA_RULE_POSITION, cpu_to_be64(handle),
                         NFTA_RULE_PAD))
                goto nla_put_failure;

        if (chain->flags & NFT_CHAIN_HW_OFFLOAD)
                nft_flow_rule_stats(chain, rule);

        exprs = nla_nest_start_noflag(skb, NFTA_RULE_EXPRESSIONS);
        if (!exprs)
                goto nla_put_failure;

        nft_rule_for_each_expr(expr, next, rule) {
                if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0)
                        goto nla_put_failure;
        }
        nla_nest_end(skb, exprs);

        if (rule->udata) {
                struct nft_userdata *udata = nft_userdata(rule);

                /* NOTE(review): payload length is len + 1; presumably
                 * udata->len stores the length minus one - confirm.
                 */
                if (nla_put(skb, NFTA_RULE_USERDATA, udata->len + 1,
                            udata->data) < 0)
                        goto nla_put_failure;
        }

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, nlh);
        return -1;
}

static void nf_tables_rule_notify(const struct nft_ctx *ctx,
                                  const struct nft_rule *rule, int event)
{
        struct nftables_pernet *nft_net = nft_pernet(ctx->net);
        const struct nft_rule *prule;
        struct sk_buff *skb;
        u64 handle = 0;
        u16 flags = 0;
        int err;

        if (!ctx->report &&
            !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
                return;

        skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
        if (skb == NULL)
                goto err;

        if (event == NFT_MSG_NEWRULE &&
            !list_is_first(&rule->list, &ctx->chain->rules) &&
            !list_is_last(&rule->list, &ctx->chain->rules)) {
                prule = list_prev_entry(rule, list);
                handle = prule->handle;
        }
        if (ctx->flags & (NLM_F_APPEND | NLM_F_REPLACE))
                flags |= NLM_F_APPEND;
        if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
                flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);

        err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq,
                                       event, flags, ctx->family, ctx->table,
                                       ctx->chain, rule, handle, false);
        if (err < 0) {
                kfree_skb(skb);
                goto err;
        }

        nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
        return;
err:
        nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
}

/*
 * Emit an AUDIT_NFT_OP_RULE_RESET audit record for @nentries rules of
 * @table. The record name is "<table name>:<base_seq>", formatted into a
 * temporary buffer allocated with GFP_ATOMIC and freed before returning.
 */
static void audit_log_rule_reset(const struct nft_table *table,
                                 unsigned int base_seq,
                                 unsigned int nentries)
{
        char *name;

        name = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq);

        audit_log_nfcfg(name, table->family, nentries,
                        AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);

        kfree(name);
}

/*
 * Per-dump state kept in the netlink callback's ctx area.
 *
 * s_idx: running rule index stored at the end of a dump pass; rules with a
 *        lower index are not emitted on the next pass.
 * table: optional table name filter, duplicated from the request.
 * chain: optional chain name filter, duplicated from the request.
 * reset: forwarded to the rule fill routine; set by the dump-and-reset start
 *        handler.
 */
struct nft_rule_dump_ctx {
        unsigned int s_idx;
        char *table;
        char *chain;
        bool reset;
};

/*
 * Dump the active rules of @chain into @skb.
 *
 * *@idx is the running rule index across the whole dump; it is advanced for
 * every rule visited (active or not) and rules whose index is below
 * ctx->s_idx are skipped. Each emitted rule reports the handle of the
 * previous active rule in the chain as its position (0 for the first one).
 *
 * Returns 1 when a rule did not fit into @skb (in which case *@idx is left
 * pointing at that rule) and 0 otherwise. When the dump runs in reset mode
 * and at least one rule was emitted, an audit record is logged.
 */
static int __nf_tables_dump_rules(struct sk_buff *skb,
                                  unsigned int *idx,
                                  struct netlink_callback *cb,
                                  const struct nft_table *table,
                                  const struct nft_chain *chain)
{
        struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
        struct net *net = sock_net(skb->sk);
        const struct nft_rule *rule, *prule = NULL;
        unsigned int entries = 0;
        int ret = 0;
        u64 handle;

        list_for_each_entry_rcu(rule, &chain->rules, list) {
                if (nft_is_active(net, rule)) {
                        if (*idx >= ctx->s_idx) {
                                handle = prule ? prule->handle : 0;

                                if (nf_tables_fill_rule_info(skb, net,
                                                NETLINK_CB(cb->skb).portid,
                                                cb->nlh->nlmsg_seq,
                                                NFT_MSG_NEWRULE,
                                                NLM_F_MULTI | NLM_F_APPEND,
                                                table->family,
                                                table, chain, rule, handle,
                                                ctx->reset) < 0) {
                                        ret = 1;
                                        break;
                                }
                                entries++;
                                nl_dump_check_consistent(cb, nlmsg_hdr(skb));
                        }
                        /* Only active rules serve as position reference. */
                        prule = rule;
                }
                (*idx)++;
        }

        if (ctx->reset && entries)
                audit_log_rule_reset(table, cb->seq, entries);

        return ret;
}

/*
 * Netlink dump callback for rules.
 *
 * Walks the tables of the socket's network namespace under the RCU read
 * lock, filtered by the request's family and by the optional table/chain
 * names stored in the dump context, and dumps the rules of each selected
 * chain. The running rule index is saved in ctx->s_idx before returning.
 * Returns skb->len.
 */
static int nf_tables_dump_rules(struct sk_buff *skb,
                                struct netlink_callback *cb)
{
        const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
        struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
        struct nft_table *table;
        const struct nft_chain *chain;
        unsigned int idx = 0;
        struct net *net = sock_net(skb->sk);
        int family = nfmsg->nfgen_family;
        struct nftables_pernet *nft_net;

        rcu_read_lock();
        nft_net = nft_pernet(net);
        /* Snapshot the base sequence for this dump pass. */
        cb->seq = READ_ONCE(nft_net->base_seq);

        list_for_each_entry_rcu(table, &nft_net->tables, list) {
                if (family != NFPROTO_UNSPEC && family != table->family)
                        continue;

                if (ctx->table && strcmp(ctx->table, table->name) != 0)
                        continue;

                /* Both filters given: look the chain up by name in the
                 * table's chain hash table instead of walking every chain.
                 */
                if (ctx->table && ctx->chain) {
                        struct rhlist_head *list, *tmp;

                        list = rhltable_lookup(&table->chains_ht, ctx->chain,
                                               nft_chain_ht_params);
                        if (!list)
                                goto done;

                        /* Dump only the first active chain with that name;
                         * the return value is not checked on this path.
                         */
                        rhl_for_each_entry_rcu(chain, tmp, list, rhlhead) {
                                if (!nft_is_active(net, chain))
                                        continue;
                                __nf_tables_dump_rules(skb, &idx,
                                                       cb, table, chain);
                                break;
                        }
                        goto done;
                }

                /* Stop as soon as a chain reports that the skb is full. */
                list_for_each_entry_rcu(chain, &table->chains, list) {
                        if (__nf_tables_dump_rules(skb, &idx,
                                                   cb, table, chain))
                                goto done;
                }

                /* A table name filter was given and this table matched it:
                 * no need to look at further tables.
                 */
                if (ctx->table)
                        break;
        }
done:
        rcu_read_unlock();

        ctx->s_idx = idx;
        return skb->len;
}

/*
 * Dump callback for the dump-and-reset variant: runs the regular rule dump
 * with the per-netns commit_mutex held and returns its result.
 */
static int nf_tables_dumpreset_rules(struct sk_buff *skb,
                                     struct netlink_callback *cb)
{
        struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
        int ret;

        /* The mutex is held to prevent two concurrent dump-and-reset calls
         * from underrunning counters and quotas. The commit_mutex is used
         * for lack of a better lock; this is not the transaction path.
         */
        mutex_lock(&nft_net->commit_mutex);
        ret = nf_tables_dump_rules(skb, cb);
        mutex_unlock(&nft_net->commit_mutex);

        return ret;
}

/*
 * Dump start callback: copy the optional table and chain name filters from
 * the request attributes (passed in cb->data) into the dump context.
 *
 * Returns 0 on success or -ENOMEM if a name could not be duplicated; in the
 * latter case a table name that was already duplicated is freed again.
 */
static int nf_tables_dump_rules_start(struct netlink_callback *cb)
{
        struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
        const struct nlattr * const *nla = cb->data;
        const struct nlattr *table_attr = nla[NFTA_RULE_TABLE];
        const struct nlattr *chain_attr = nla[NFTA_RULE_CHAIN];

        /* The context must fit into the callback's scratch area. */
        BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));

        if (table_attr) {
                ctx->table = nla_strdup(table_attr, GFP_ATOMIC);
                if (!ctx->table)
                        return -ENOMEM;
        }

        if (!chain_attr)
                return 0;

        ctx->chain = nla_strdup(chain_attr, GFP_ATOMIC);
        if (ctx->chain)
                return 0;

        kfree(ctx->table);
        return -ENOMEM;
}

/*
 * Dump start callback for the dump-and-reset variant: mark the dump context
 * as a reset dump, then perform the regular start processing.
 */
static int nf_tables_dumpreset_rules_start(struct netlink_callback *cb)
{
        struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;

        ctx->reset = true;

        return nf_tables_dump_rules_start(cb);
}

/*
 * Dump done callback: free the table and chain name filters held in the
 * dump context. Always returns 0.
 */
static int nf_tables_dump_rules_done(struct netlink_callback *cb)
{
        struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;

        kfree(ctx->table);
        kfree(ctx->chain);
        return 0;
}

/*
 * Build the reply message for a single-rule get request.
 *
 * Called with rcu_read_lock held.
 *
 * The table, chain and rule are looked up from the request attributes in the
 * current generation; a failed lookup marks the offending attribute in the
 * extended ack and returns the lookup error. Returns the filled reply skb,
 * ERR_PTR(-ENOMEM) if it cannot be allocated, or the fill error as ERR_PTR.
 * @reset is forwarded to the rule fill routine.
 */
static struct sk_buff *
nf_tables_getrule_single(u32 portid, const struct nfnl_info *info,
                         const struct nlattr * const nla[], bool reset)
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_cur(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        const struct nft_chain *chain;
        const struct nft_rule *rule;
        struct nft_table *table;
        struct sk_buff *reply;
        int err;

        table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
                return ERR_CAST(table);
        }

        chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
        if (IS_ERR(chain)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
                return ERR_CAST(chain);
        }

        rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
        if (IS_ERR(rule)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
                return ERR_CAST(rule);
        }

        reply = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (reply == NULL)
                return ERR_PTR(-ENOMEM);

        err = nf_tables_fill_rule_info(reply, net, portid,
                                       info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
                                       family, table, chain, rule, 0, reset);
        if (err >= 0)
                return reply;

        kfree_skb(reply);
        return ERR_PTR(err);
}

/*
 * Handler for rule get requests.
 *
 * With NLM_F_DUMP set, a netlink dump is started using the rule dump
 * callbacks, with the request attributes handed over as dump data.
 * Otherwise a single rule is looked up and the reply is unicast to the
 * requesting port. Returns the result of the dump start or unicast, or the
 * error from building the single-rule reply.
 */
static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
                             const struct nlattr * const nla[])
{
        u32 portid = NETLINK_CB(skb).portid;
        struct net *net = info->net;
        struct sk_buff *skb2;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .start= nf_tables_dump_rules_start,
                        .dump = nf_tables_dump_rules,
                        .done = nf_tables_dump_rules_done,
                        .module = THIS_MODULE,
                        .data = (void *)nla,
                };

                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        /* Non-dump request: no reset of stateful expressions. */
        skb2 = nf_tables_getrule_single(portid, info, nla, false);
        if (IS_ERR(skb2))
                return PTR_ERR(skb2);

        return nfnetlink_unicast(skb2, net, portid);
}

/*
 * Handler for rule get-and-reset requests.
 *
 * With NLM_F_DUMP set, a netlink dump is started using the dump-and-reset
 * callbacks. Otherwise a single rule is dumped with reset enabled while the
 * per-netns commit_mutex is held, an AUDIT_NFT_OP_RULE_RESET audit record
 * named "<table>:<base_seq>" is logged for one entry, and the reply is
 * unicast to the requesting port.
 *
 * Returns -EINVAL if no reference on this module can be taken, the error
 * from building the reply, or the result of the dump start / unicast.
 */
static int nf_tables_getrule_reset(struct sk_buff *skb,
                                   const struct nfnl_info *info,
                                   const struct nlattr * const nla[])
{
        struct nftables_pernet *nft_net = nft_pernet(info->net);
        u32 portid = NETLINK_CB(skb).portid;
        struct net *net = info->net;
        struct sk_buff *skb2;
        char *buf;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .start= nf_tables_dumpreset_rules_start,
                        .dump = nf_tables_dumpreset_rules,
                        .done = nf_tables_dump_rules_done,
                        .module = THIS_MODULE,
                        .data = (void *)nla,
                };

                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        /* The RCU read lock is dropped around the mutex-protected section
         * and re-taken afterwards; a module reference is held meanwhile.
         * NOTE(review): presumably the reference keeps this module from
         * being unloaded while outside the RCU section - confirm.
         */
        if (!try_module_get(THIS_MODULE))
                return -EINVAL;
        rcu_read_unlock();
        mutex_lock(&nft_net->commit_mutex);
        skb2 = nf_tables_getrule_single(portid, info, nla, true);
        mutex_unlock(&nft_net->commit_mutex);
        rcu_read_lock();
        module_put(THIS_MODULE);

        if (IS_ERR(skb2))
                return PTR_ERR(skb2);

        /* The table name is taken straight from the request attribute. */
        buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
                        nla_len(nla[NFTA_RULE_TABLE]),
                        (char *)nla_data(nla[NFTA_RULE_TABLE]),
                        nft_net->base_seq);
        audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
                        AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
        kfree(buf);

        return nfnetlink_unicast(skb2, net, portid);
}

/*
 * Destroy every expression of @rule and free the rule itself.
 */
void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule)
{
        struct nft_expr *expr, *next;

        /*
         * Careful: some expressions might not be initialized in case this
         * is called on error from nf_tables_newrule().
         */
        for (expr = nft_expr_first(rule); nft_expr_more(rule, expr);
             expr = next) {
                /* Fetch the successor before the expression is torn down. */
                next = nft_expr_next(expr);
                nf_tables_expr_destroy(ctx, expr);
        }

        kfree(rule);
}

/*
 * Release @rule: deactivate its expressions with the NFT_TRANS_RELEASE
 * phase, then destroy the expressions and free the rule.
 */
static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule)
{
        nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE);
        nf_tables_rule_destroy(ctx, rule);
}

/*
 * Validate every rule of @chain that is active in the next generation by
 * calling the ->validate callback of each expression that provides one.
 *
 * Returns 0 on success, -EMLINK if ctx->level has reached
 * NFT_JUMP_STACK_SIZE, -EINTR if a fatal signal is pending for the current
 * task, or the first negative error returned by a ->validate callback.
 */
int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
{
        struct nft_expr *expr, *last;
        const struct nft_data *data;
        struct nft_rule *rule;
        int err;

        if (ctx->level == NFT_JUMP_STACK_SIZE)
                return -EMLINK;

        list_for_each_entry(rule, &chain->rules, list) {
                if (fatal_signal_pending(current))
                        return -EINTR;

                if (!nft_is_active_next(ctx->net, rule))
                        continue;

                nft_rule_for_each_expr(expr, last, rule) {
                        if (expr->ops->validate) {
                                err = expr->ops->validate(ctx, expr, &data);
                                if (err < 0)
                                        return err;
                        }
                }
        }

        return 0;
}
EXPORT_SYMBOL_GPL(nft_chain_validate);

/*
 * Validate @table by validating each of its base chains with a context that
 * carries only the netns, the table family and the chain being checked.
 * Non-base chains are not used as starting points here.
 *
 * Returns 0 on success or the first error from nft_chain_validate().
 * cond_resched() is called after each validated base chain.
 */
static int nft_table_validate(struct net *net, const struct nft_table *table)
{
        struct nft_ctx ctx = {
                .net        = net,
                .family        = table->family,
        };
        struct nft_chain *chain;
        int err;

        list_for_each_entry(chain, &table->chains, list) {
                if (nft_is_base_chain(chain)) {
                        ctx.chain = chain;
                        err = nft_chain_validate(&ctx, chain);
                        if (err < 0)
                                return err;

                        cond_resched();
                }
        }

        return 0;
}

/*
 * Validate one set element.
 *
 * Elements that are inactive for iter->genmask, and interval-end elements,
 * are accepted without further checks. If the element's data is a jump or
 * goto verdict, the target chain is validated one nesting level deeper.
 *
 * Returns 0 on success or the error from nft_chain_validate(). Note that on
 * such an error ctx->level is left incremented.
 */
int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
                         const struct nft_set_iter *iter,
                         struct nft_elem_priv *elem_priv)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
        /* The nesting level is tracked in the (const-qualified) context. */
        struct nft_ctx *pctx = (struct nft_ctx *)ctx;
        const struct nft_data *data;
        int err;

        if (!nft_set_elem_active(ext, iter->genmask))
                return 0;

        if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
            (*nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END))
                return 0;

        data = nft_set_ext_data(ext);
        if (data->verdict.code != NFT_JUMP &&
            data->verdict.code != NFT_GOTO)
                return 0;

        pctx->level++;
        err = nft_chain_validate(ctx, data->verdict.chain);
        if (err < 0)
                return err;
        pctx->level--;

        return 0;
}

/*
 * Validate the catch-all elements of @set that are active in the next
 * generation, using a stand-in iterator that only carries that genmask.
 *
 * Returns the first negative error from nft_setelem_validate(); otherwise
 * the result of the last validation performed, or 0 if there was none.
 */
int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set)
{
        struct nft_set_iter dummy_iter = {
                .genmask        = nft_genmask_next(ctx->net),
        };
        struct nft_set_elem_catchall *catchall;
        struct nft_set_ext *ext;
        int ret = 0;

        list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);
                if (nft_set_elem_active(ext, dummy_iter.genmask)) {
                        ret = nft_setelem_validate(ctx, set, &dummy_iter,
                                                   catchall->elem);
                        if (ret < 0)
                                return ret;
                }
        }

        return ret;
}

static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
                                             const struct nft_chain *chain,
                                             const struct nlattr *nla);

#define NFT_RULE_MAXEXPRS        128

/*
 * Handle a new-rule request: create a rule in a chain, or replace an
 * existing one.
 *
 * The chain is resolved from NFTA_RULE_CHAIN or NFTA_RULE_CHAIN_ID (one of
 * them is mandatory); bound chains are rejected with -EOPNOTSUPP.
 *
 * With NFTA_RULE_HANDLE the request must carry NLM_F_REPLACE (NLM_F_EXCL
 * yields -EEXIST, neither flag yields -EOPNOTSUPP) and the rule with that
 * handle is replaced. Without a handle, NLM_F_CREATE is required and
 * NLM_F_REPLACE is refused; NFTA_RULE_POSITION / NFTA_RULE_POSITION_ID
 * optionally name the rule relative to which the new one is inserted
 * (after it with NLM_F_APPEND, before it otherwise).
 *
 * Expressions are parsed first (at most NFT_RULE_MAXEXPRS, total size below
 * 1 << 12 bytes so it fits rule->dlen), then the rule is allocated and the
 * expressions are instantiated in place.
 *
 * Must be called with the per-netns commit mutex held.
 *
 * Returns 0 on success (or the result of nft_table_validate() when the
 * table is in NFT_VALIDATE_DO state), a negative errno otherwise.
 */
static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
                             const struct nlattr * const nla[])
{
        struct nftables_pernet *nft_net = nft_pernet(info->net);
        struct netlink_ext_ack *extack = info->extack;
        unsigned int size, i, n, ulen = 0, usize = 0;
        u8 genmask = nft_genmask_next(info->net);
        struct nft_rule *rule, *old_rule = NULL;
        struct nft_expr_info *expr_info = NULL;
        u8 family = info->nfmsg->nfgen_family;
        struct nft_flow_rule *flow = NULL;
        struct net *net = info->net;
        struct nft_userdata *udata;
        struct nft_table *table;
        struct nft_chain *chain;
        struct nft_trans *trans;
        u64 handle, pos_handle;
        struct nft_expr *expr;
        struct nft_ctx ctx;
        struct nlattr *tmp;
        int err, rem;

        lockdep_assert_held(&nft_net->commit_mutex);

        table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask,
                                 NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
                return PTR_ERR(table);
        }

        if (nla[NFTA_RULE_CHAIN]) {
                chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN],
                                         genmask);
                if (IS_ERR(chain)) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
                        return PTR_ERR(chain);
                }

        } else if (nla[NFTA_RULE_CHAIN_ID]) {
                chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID],
                                              genmask);
                if (IS_ERR(chain)) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]);
                        return PTR_ERR(chain);
                }
        } else {
                return -EINVAL;
        }

        if (nft_chain_is_bound(chain))
                return -EOPNOTSUPP;

        if (nla[NFTA_RULE_HANDLE]) {
                /* A handle names an existing rule: only replacement is valid. */
                handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
                rule = __nft_rule_lookup(chain, handle);
                if (IS_ERR(rule)) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
                        return PTR_ERR(rule);
                }

                if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
                        return -EEXIST;
                }
                if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
                        old_rule = rule;
                else
                        return -EOPNOTSUPP;
        } else {
                if (!(info->nlh->nlmsg_flags & NLM_F_CREATE) ||
                    info->nlh->nlmsg_flags & NLM_F_REPLACE)
                        return -EINVAL;
                handle = nf_tables_alloc_handle(table);

                /* Optional anchor rule for positional insertion. */
                if (nla[NFTA_RULE_POSITION]) {
                        pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
                        old_rule = __nft_rule_lookup(chain, pos_handle);
                        if (IS_ERR(old_rule)) {
                                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]);
                                return PTR_ERR(old_rule);
                        }
                } else if (nla[NFTA_RULE_POSITION_ID]) {
                        old_rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_POSITION_ID]);
                        if (IS_ERR(old_rule)) {
                                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION_ID]);
                                return PTR_ERR(old_rule);
                        }
                }
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);

        /* Pass 1: parse every expression and add up the space they need. */
        n = 0;
        size = 0;
        if (nla[NFTA_RULE_EXPRESSIONS]) {
                expr_info = kvmalloc_array(NFT_RULE_MAXEXPRS,
                                           sizeof(struct nft_expr_info),
                                           GFP_KERNEL);
                if (!expr_info)
                        return -ENOMEM;

                nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
                        err = -EINVAL;
                        if (nla_type(tmp) != NFTA_LIST_ELEM)
                                goto err_release_expr;
                        if (n == NFT_RULE_MAXEXPRS)
                                goto err_release_expr;
                        err = nf_tables_expr_parse(&ctx, tmp, &expr_info[n]);
                        if (err < 0) {
                                NL_SET_BAD_ATTR(extack, tmp);
                                goto err_release_expr;
                        }
                        size += expr_info[n].ops->size;
                        n++;
                }
        }
        /* Check for overflow of dlen field */
        err = -EFBIG;
        if (size >= 1 << 12)
                goto err_release_expr;

        if (nla[NFTA_RULE_USERDATA]) {
                ulen = nla_len(nla[NFTA_RULE_USERDATA]);
                if (ulen > 0)
                        usize = sizeof(struct nft_userdata) + ulen;
        }

        err = -ENOMEM;
        rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL_ACCOUNT);
        if (rule == NULL)
                goto err_release_expr;

        nft_activate_next(net, rule);

        rule->handle = handle;
        rule->dlen   = size;
        rule->udata  = ulen ? 1 : 0;

        if (ulen) {
                udata = nft_userdata(rule);
                /* The stored length is the payload length minus one. */
                udata->len = ulen - 1;
                nla_memcpy(udata->data, nla[NFTA_RULE_USERDATA], ulen);
        }

        /* Pass 2: instantiate the parsed expressions inside the rule. */
        expr = nft_expr_first(rule);
        for (i = 0; i < n; i++) {
                err = nf_tables_newexpr(&ctx, &expr_info[i], expr);
                if (err < 0) {
                        NL_SET_BAD_ATTR(extack, expr_info[i].attr);
                        goto err_release_rule;
                }

                if (expr_info[i].ops->validate)
                        nft_validate_state_update(table, NFT_VALIDATE_NEED);

                /*
                 * Clearing ops makes the err_release_expr loop skip this
                 * entry from now on.
                 */
                expr_info[i].ops = NULL;
                expr = nft_expr_next(expr);
        }

        if (chain->flags & NFT_CHAIN_HW_OFFLOAD) {
                flow = nft_flow_rule_create(net, rule);
                if (IS_ERR(flow)) {
                        err = PTR_ERR(flow);
                        goto err_release_rule;
                }
        }

        if (!nft_use_inc(&chain->use)) {
                err = -EMFILE;
                /*
                 * The use counter was not taken, but a flow rule may
                 * already exist and must be destroyed, not leaked.
                 */
                goto err_destroy_flow;
        }

        if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
                if (nft_chain_binding(chain)) {
                        err = -EOPNOTSUPP;
                        goto err_destroy_flow_rule;
                }

                err = nft_delrule(&ctx, old_rule);
                if (err < 0)
                        goto err_destroy_flow_rule;

                trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
                if (trans == NULL) {
                        err = -ENOMEM;
                        goto err_destroy_flow_rule;
                }
                list_add_tail_rcu(&rule->list, &old_rule->list);
        } else {
                trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
                if (!trans) {
                        err = -ENOMEM;
                        goto err_destroy_flow_rule;
                }

                if (info->nlh->nlmsg_flags & NLM_F_APPEND) {
                        if (old_rule)
                                list_add_rcu(&rule->list, &old_rule->list);
                        else
                                list_add_tail_rcu(&rule->list, &chain->rules);
                } else {
                        if (old_rule)
                                list_add_tail_rcu(&rule->list, &old_rule->list);
                        else
                                list_add_rcu(&rule->list, &chain->rules);
                }
        }
        kvfree(expr_info);

        if (flow)
                nft_trans_flow_rule(trans) = flow;

        if (table->validate_state == NFT_VALIDATE_DO)
                return nft_table_validate(net, table);

        return 0;

err_destroy_flow_rule:
        nft_use_dec_restore(&chain->use);
err_destroy_flow:
        if (flow)
                nft_flow_rule_destroy(flow);
err_release_rule:
        nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR);
        nf_tables_rule_destroy(&ctx, rule);
err_release_expr:
        /* Release expressions that were parsed but never instantiated. */
        for (i = 0; i < n; i++) {
                if (expr_info[i].ops) {
                        module_put(expr_info[i].ops->type->owner);
                        if (expr_info[i].ops->type->release_ops)
                                expr_info[i].ops->type->release_ops(expr_info[i].ops);
                }
        }
        kvfree(expr_info);

        return err;
}

/*
 * Find a rule queued in the pending commit list by its transaction id.
 *
 * Only NFT_MSG_NEWRULE transactions targeting @chain are considered.
 * Returns the rule, or ERR_PTR(-ENOENT) if no transaction matches.
 */
static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
                                             const struct nft_chain *chain,
                                             const struct nlattr *nla)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        u32 wanted = ntohl(nla_get_be32(nla));
        struct nft_trans *trans;

        list_for_each_entry(trans, &nft_net->commit_list, list) {
                if (trans->msg_type != NFT_MSG_NEWRULE)
                        continue;
                if (trans->ctx.chain != chain)
                        continue;
                if (nft_trans_rule_id(trans) == wanted)
                        return nft_trans_rule(trans);
        }

        return ERR_PTR(-ENOENT);
}

static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info,
                             const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct nft_chain *chain = NULL;
        struct net *net = info->net;
        struct nft_table *table;
        struct nft_rule *rule;
        struct nft_ctx ctx;
        int err = 0;

        table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask,
                                 NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
                return PTR_ERR(table);
        }

        if (nla[NFTA_RULE_CHAIN]) {
                chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN],
                                         genmask);
                if (IS_ERR(chain)) {
                        if (PTR_ERR(chain) == -ENOENT &&
                            NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE)
                                return 0;

                        NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
                        return PTR_ERR(chain);
                }
                if (nft_chain_binding(chain))
                        return -EOPNOTSUPP;
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);

        if (chain) {
                if (nla[NFTA_RULE_HANDLE]) {
                        rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
                        if (IS_ERR(rule)) {
                                if (PTR_ERR(rule) == -ENOENT &&
                                    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE)
                                        return 0;

                                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
                                return PTR_ERR(rule);
                        }

                        err = nft_delrule(&ctx, rule);
                } else if (nla[NFTA_RULE_ID]) {
                        rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_ID]);
                        if (IS_ERR(rule)) {
                                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]);
                                return PTR_ERR(rule);
                        }

                        err = nft_delrule(&ctx, rule);
                } else {
                        err = nft_delrule_by_chain(&ctx);
                }
        } else {
                list_for_each_entry(chain, &table->chains, list) {
                        if (!nft_is_active_next(net, chain))
                                continue;
                        if (nft_chain_binding(chain))
                                continue;

                        ctx.chain = chain;
                        err = nft_delrule_by_chain(&ctx);
                        if (err < 0)
                                break;
                }
        }

        return err;
}

/*
 * Sets
 */
/*
 * Set backend types considered by nft_select_set_ops(), which walks this
 * array from first to last entry. The AVX2 pipapo variant is only listed
 * on x86_64 builds that are not UML.
 */
static const struct nft_set_type *nft_set_types[] = {
        &nft_set_hash_fast_type,
        &nft_set_hash_type,
        &nft_set_rhash_type,
        &nft_set_bitmap_type,
        &nft_set_rbtree_type,
#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
        &nft_set_pipapo_avx2_type,
#endif
        &nft_set_pipapo_type,
};

/*
 * Set flags that a backend has to advertise in its ->features in order to
 * be a candidate for a set requesting them (see nft_set_ops_candidate()).
 */
#define NFT_SET_FEATURES        (NFT_SET_INTERVAL | NFT_SET_MAP | \
                                 NFT_SET_TIMEOUT | NFT_SET_OBJECT | \
                                 NFT_SET_EVAL)

/*
 * Tell whether @type supports every feature flag requested in @flags.
 * Only the bits listed in NFT_SET_FEATURES are taken into account.
 */
static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags)
{
        u32 requested = flags & NFT_SET_FEATURES;
        u32 provided = flags & type->features;

        return provided == requested;
}

/*
 * Select a set implementation based on the data characteristics and the
 * given policy. The total memory use might not be known if no size is
 * given, in that case the amount of memory per element is used.
 *
 * Every entry of nft_set_types[] that supports the requested feature
 * flags and whose ->estimate() callback accepts the description is
 * compared against the best candidate found so far.
 *
 * Must be called with the commit mutex held. Returns the chosen ops, or
 * ERR_PTR(-EOPNOTSUPP) if no backend qualifies.
 */
static const struct nft_set_ops *
nft_select_set_ops(const struct nft_ctx *ctx, u32 flags,
                   const struct nft_set_desc *desc)
{
        struct nftables_pernet *nft_net = nft_pernet(ctx->net);
        const struct nft_set_ops *ops, *bops;
        struct nft_set_estimate est, best;
        const struct nft_set_type *type;
        int i;

        lockdep_assert_held(&nft_net->commit_mutex);
        lockdep_nfnl_nft_mutex_not_held();

        /* Start from the worst possible estimate so any candidate can win. */
        bops            = NULL;
        best.size   = ~0;
        best.lookup = ~0;
        best.space  = ~0;

        for (i = 0; i < ARRAY_SIZE(nft_set_types); i++) {
                type = nft_set_types[i];
                ops = &type->ops;

                if (!nft_set_ops_candidate(type, flags))
                        continue;
                if (!ops->estimate(desc, flags, &est))
                        continue;

                /*
                 * Inside the switch, "break" accepts the candidate (falls
                 * through to the assignment below) and "continue" rejects
                 * it and moves on to the next set type.
                 */
                switch (desc->policy) {
                case NFT_SET_POL_PERFORMANCE:
                        /* Lower lookup cost wins; space breaks ties. */
                        if (est.lookup < best.lookup)
                                break;
                        if (est.lookup == best.lookup &&
                            est.space < best.space)
                                break;
                        continue;
                case NFT_SET_POL_MEMORY:
                        if (!desc->size) {
                                /* Size unknown: compare space, lookup breaks ties. */
                                if (est.space < best.space)
                                        break;
                                if (est.space == best.space &&
                                    est.lookup < best.lookup)
                                        break;
                        } else if (est.size < best.size || !bops) {
                                /* Size known: smallest estimate, or first candidate. */
                                break;
                        }
                        continue;
                default:
                        /* Any other policy: every candidate replaces the previous one. */
                        break;
                }

                bops = ops;
                best = est;
        }

        if (bops != NULL)
                return bops;

        return ERR_PTR(-EOPNOTSUPP);
}

/*
 * Netlink attribute policy indexed by NFTA_SET_* attribute type.
 * String attributes are capped at the table/set name limits and user data
 * at NFT_USERDATA_MAXLEN.
 * NOTE(review): the users of this policy are outside this part of the
 * file - presumably the set message handlers; confirm.
 */
static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
        [NFTA_SET_TABLE]                = { .type = NLA_STRING,
                                            .len = NFT_TABLE_MAXNAMELEN - 1 },
        [NFTA_SET_NAME]                        = { .type = NLA_STRING,
                                            .len = NFT_SET_MAXNAMELEN - 1 },
        [NFTA_SET_FLAGS]                = { .type = NLA_U32 },
        [NFTA_SET_KEY_TYPE]                = { .type = NLA_U32 },
        [NFTA_SET_KEY_LEN]                = { .type = NLA_U32 },
        [NFTA_SET_DATA_TYPE]                = { .type = NLA_U32 },
        [NFTA_SET_DATA_LEN]                = { .type = NLA_U32 },
        [NFTA_SET_POLICY]                = { .type = NLA_U32 },
        [NFTA_SET_DESC]                        = { .type = NLA_NESTED },
        [NFTA_SET_ID]                        = { .type = NLA_U32 },
        [NFTA_SET_TIMEOUT]                = { .type = NLA_U64 },
        [NFTA_SET_GC_INTERVAL]                = { .type = NLA_U32 },
        [NFTA_SET_USERDATA]                = { .type = NLA_BINARY,
                                            .len  = NFT_USERDATA_MAXLEN },
        [NFTA_SET_OBJ_TYPE]                = { .type = NLA_U32 },
        [NFTA_SET_HANDLE]                = { .type = NLA_U64 },
        [NFTA_SET_EXPR]                        = { .type = NLA_NESTED },
        [NFTA_SET_EXPRESSIONS]                = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
};

/*
 * Policy for one concatenation field entry nested in NFTA_SET_DESC_CONCAT;
 * used by nft_set_desc_concat_parse().
 */
static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = {
        [NFTA_SET_FIELD_LEN]        = { .type = NLA_U32 },
};

/*
 * Policy for the attributes nested in NFTA_SET_DESC; used by
 * nf_tables_set_desc_parse().
 */
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
        [NFTA_SET_DESC_SIZE]                = { .type = NLA_U32 },
        [NFTA_SET_DESC_CONCAT]                = NLA_POLICY_NESTED_ARRAY(nft_concat_policy),
};

/*
 * Look up a set of @table by name.
 *
 * Returns the first set whose name equals the @nla payload and that is
 * active in @genmask, ERR_PTR(-EINVAL) when @nla is NULL, or
 * ERR_PTR(-ENOENT) when nothing matches.
 */
static struct nft_set *nft_set_lookup(const struct nft_table *table,
                                      const struct nlattr *nla, u8 genmask)
{
        struct nft_set *cur;

        if (!nla)
                return ERR_PTR(-EINVAL);

        list_for_each_entry_rcu(cur, &table->sets, list) {
                if (nla_strcmp(nla, cur->name) != 0)
                        continue;
                if (nft_active_genmask(cur, genmask))
                        return cur;
        }

        return ERR_PTR(-ENOENT);
}

/*
 * Look up a set of @table by the 64-bit handle carried (big endian) in
 * @nla. Returns the matching set active in @genmask, or ERR_PTR(-ENOENT).
 */
static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table,
                                               const struct nlattr *nla,
                                               u8 genmask)
{
        struct nft_set *cur;

        list_for_each_entry(cur, &table->sets, list) {
                if (cur->handle != be64_to_cpu(nla_get_be64(nla)))
                        continue;
                if (nft_active_genmask(cur, genmask))
                        return cur;
        }

        return ERR_PTR(-ENOENT);
}

/*
 * Look up a set created earlier in the pending commit list by its
 * transaction id.
 *
 * Only NFT_MSG_NEWSET transactions are inspected; the set must belong to
 * @table and be active in @genmask. Returns the set or ERR_PTR(-ENOENT).
 */
static struct nft_set *nft_set_lookup_byid(const struct net *net,
                                           const struct nft_table *table,
                                           const struct nlattr *nla, u8 genmask)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        u32 wanted = ntohl(nla_get_be32(nla));
        struct nft_trans *trans;

        list_for_each_entry(trans, &nft_net->commit_list, list) {
                struct nft_set *set;

                if (trans->msg_type != NFT_MSG_NEWSET)
                        continue;

                set = nft_trans_set(trans);
                if (wanted == nft_trans_set_id(trans) &&
                    set->table == table &&
                    nft_active_genmask(set, genmask))
                        return set;
        }

        return ERR_PTR(-ENOENT);
}

/*
 * Look up a set by name, falling back to a lookup by transaction id.
 *
 * The id lookup is only attempted when the name lookup failed and
 * @nla_set_id is present; otherwise the result of the name lookup
 * (set or error pointer) is returned as is.
 */
struct nft_set *nft_set_lookup_global(const struct net *net,
                                      const struct nft_table *table,
                                      const struct nlattr *nla_set_name,
                                      const struct nlattr *nla_set_id,
                                      u8 genmask)
{
        struct nft_set *set = nft_set_lookup(table, nla_set_name, genmask);

        if (!IS_ERR(set) || !nla_set_id)
                return set;

        return nft_set_lookup_byid(net, table, nla_set_id, genmask);
}
EXPORT_SYMBOL_GPL(nft_set_lookup_global);

/*
 * Allocate set->name from the template @name.
 *
 * If @name contains a '%', it must be exactly one "%d" conversion and the
 * template must be shorter than NFT_SET_MAX_ANONLEN; the lowest number not
 * yet used by a set of the table (active in the next generation) whose
 * name matches the template is substituted. Otherwise @name is used as is.
 *
 * Returns 0 on success, -EINVAL for a malformed template, -ENOMEM on
 * allocation failure, or -ENFILE if the resulting name is already taken
 * (set->name is freed and reset to NULL in that case).
 */
static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
                                    const char *name)
{
        const struct nft_set *i;
        const char *p;
        unsigned long *inuse;
        unsigned int n = 0, min = 0;

        p = strchr(name, '%');
        if (p != NULL) {
                if (p[1] != 'd' || strchr(p + 2, '%'))
                        return -EINVAL;

                if (strnlen(name, NFT_SET_MAX_ANONLEN) >= NFT_SET_MAX_ANONLEN)
                        return -EINVAL;

                /* One page used as a bitmap of the numbers already taken. */
                inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL);
                if (inuse == NULL)
                        return -ENOMEM;
cont:
                /*
                 * Mark every number in the current window
                 * [min, min + BITS_PER_BYTE * PAGE_SIZE) used by an
                 * existing set name that parses against the template.
                 */
                list_for_each_entry(i, &ctx->table->sets, list) {
                        int tmp;

                        if (!nft_is_active_next(ctx->net, i))
                                continue;
                        if (!sscanf(i->name, name, &tmp))
                                continue;
                        if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE)
                                continue;

                        set_bit(tmp - min, inuse);
                }

                n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE);
                if (n >= BITS_PER_BYTE * PAGE_SIZE) {
                        /* Window exhausted: slide to the next one and rescan. */
                        min += BITS_PER_BYTE * PAGE_SIZE;
                        memset(inuse, 0, PAGE_SIZE);
                        goto cont;
                }
                free_page((unsigned long)inuse);
        }

        /* Without "%d" in the template, min + n is 0 and goes unused. */
        set->name = kasprintf(GFP_KERNEL_ACCOUNT, name, min + n);
        if (!set->name)
                return -ENOMEM;

        /* Reject the name if a set active in the next generation has it. */
        list_for_each_entry(i, &ctx->table->sets, list) {
                if (!nft_is_active_next(ctx->net, i))
                        continue;
                if (!strcmp(set->name, i->name)) {
                        kfree(set->name);
                        set->name = NULL;
                        return -ENFILE;
                }
        }
        return 0;
}

/*
 * Convert the big-endian 64-bit millisecond value carried in @nla to
 * jiffies, stored in *@result.
 *
 * Returns 0 on success, or -ERANGE when the value is too large to be
 * scaled by NSEC_PER_MSEC without overflowing 64 bits.
 */
int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
{
        u64 msecs = be64_to_cpu(nla_get_be64(nla));
        u64 limit = div_u64(~(u64)0, NSEC_PER_MSEC);
        u64 nsecs;

        if (msecs >= limit)
                return -ERANGE;

        nsecs = msecs * NSEC_PER_MSEC;
        *result = nsecs_to_jiffies64(nsecs);

        return 0;
}

/* Convert a 64-bit jiffies value to milliseconds in big-endian byte order. */
__be64 nf_jiffies64_to_msecs(u64 input)
{
        u64 msecs = jiffies64_to_msecs(input);

        return cpu_to_be64(msecs);
}

static int nf_tables_fill_set_concat(struct sk_buff *skb,
                                     const struct nft_set *set)
{
        struct nlattr *concat, *field;
        int i;

        concat = nla_nest_start_noflag(skb, NFTA_SET_DESC_CONCAT);
        if (!concat)
                return -ENOMEM;

        for (i = 0; i < set->field_count; i++) {
                field = nla_nest_start_noflag(skb, NFTA_LIST_ELEM);
                if (!field)
                        return -ENOMEM;

                if (nla_put_be32(skb, NFTA_SET_FIELD_LEN,
                                 htonl(set->field_len[i])))
                        return -ENOMEM;

                nla_nest_end(skb, field);
        }

        nla_nest_end(skb, concat);

        return 0;
}

/*
 * Fill @skb with a netlink message describing @set.
 *
 * @event is the raw NFT_MSG_* type; it is combined with the nftables
 * subsystem id only for the message header, so that the comparisons below
 * still see the raw value. Deletion events (NFT_MSG_DELSET and
 * NFT_MSG_DESTROYSET) only carry table, name and handle; every other event
 * carries the full set description.
 *
 * Returns 0 on success. On failure the partially built message is trimmed
 * from @skb and -1 is returned.
 */
static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
                              const struct nft_set *set, u16 event, u16 flags)
{
        u64 timeout = READ_ONCE(set->timeout);
        u32 gc_int = READ_ONCE(set->gc_int);
        u32 portid = ctx->portid;
        struct nlmsghdr *nlh;
        struct nlattr *nest;
        u32 seq = ctx->seq;
        int i;

        /*
         * Do not overwrite 'event' with the subsystem-qualified type: the
         * deletion check below compares against raw NFT_MSG_* values.
         */
        nlh = nfnl_msg_put(skb, portid, seq,
                           nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
                           flags, ctx->family, NFNETLINK_V0,
                           nft_base_seq(ctx->net));
        if (!nlh)
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name))
                goto nla_put_failure;
        if (nla_put_string(skb, NFTA_SET_NAME, set->name))
                goto nla_put_failure;
        if (nla_put_be64(skb, NFTA_SET_HANDLE, cpu_to_be64(set->handle),
                         NFTA_SET_PAD))
                goto nla_put_failure;

        /* Deletions are identified by table, name and handle alone. */
        if (event == NFT_MSG_DELSET ||
            event == NFT_MSG_DESTROYSET) {
                nlmsg_end(skb, nlh);
                return 0;
        }

        if (set->flags != 0)
                if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags)))
                        goto nla_put_failure;

        if (nla_put_be32(skb, NFTA_SET_KEY_TYPE, htonl(set->ktype)))
                goto nla_put_failure;
        if (nla_put_be32(skb, NFTA_SET_KEY_LEN, htonl(set->klen)))
                goto nla_put_failure;
        if (set->flags & NFT_SET_MAP) {
                if (nla_put_be32(skb, NFTA_SET_DATA_TYPE, htonl(set->dtype)))
                        goto nla_put_failure;
                if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen)))
                        goto nla_put_failure;
        }
        if (set->flags & NFT_SET_OBJECT &&
            nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype)))
                goto nla_put_failure;

        /* Timeout, GC interval and policy are only dumped when non-default. */
        if (timeout &&
            nla_put_be64(skb, NFTA_SET_TIMEOUT,
                         nf_jiffies64_to_msecs(timeout),
                         NFTA_SET_PAD))
                goto nla_put_failure;
        if (gc_int &&
            nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(gc_int)))
                goto nla_put_failure;

        if (set->policy != NFT_SET_POL_PERFORMANCE) {
                if (nla_put_be32(skb, NFTA_SET_POLICY, htonl(set->policy)))
                        goto nla_put_failure;
        }

        if (set->udata &&
            nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata))
                goto nla_put_failure;

        nest = nla_nest_start_noflag(skb, NFTA_SET_DESC);
        if (!nest)
                goto nla_put_failure;
        if (set->size &&
            nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size)))
                goto nla_put_failure;

        if (set->field_count > 1 &&
            nf_tables_fill_set_concat(skb, set))
                goto nla_put_failure;

        nla_nest_end(skb, nest);

        /*
         * A single expression is dumped as NFTA_SET_EXPR, several as a
         * NFTA_SET_EXPRESSIONS list.
         */
        if (set->num_exprs == 1) {
                nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR);
                if (nest == NULL)
                        goto nla_put_failure;

                if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0)
                        goto nla_put_failure;

                nla_nest_end(skb, nest);
        } else if (set->num_exprs > 1) {
                nest = nla_nest_start_noflag(skb, NFTA_SET_EXPRESSIONS);
                if (nest == NULL)
                        goto nla_put_failure;

                for (i = 0; i < set->num_exprs; i++) {
                        if (nft_expr_dump(skb, NFTA_LIST_ELEM,
                                          set->exprs[i], false) < 0)
                                goto nla_put_failure;
                }
                nla_nest_end(skb, nest);
        }

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, nlh);
        return -1;
}

/*
 * Queue a notification about @set for the given @event.
 *
 * Nothing is done unless the request asked for a report or the nftables
 * multicast group has listeners. If the message cannot be allocated or
 * filled, -ENOBUFS is signalled via nfnetlink_set_err().
 */
static void nf_tables_set_notify(const struct nft_ctx *ctx,
                                 const struct nft_set *set, int event,
                                 gfp_t gfp_flags)
{
        struct nftables_pernet *nft_net = nft_pernet(ctx->net);
        u32 portid = ctx->portid;
        struct sk_buff *skb;
        u16 flags;

        if (!ctx->report &&
            !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
                return;

        skb = nlmsg_new(NLMSG_GOODSIZE, gfp_flags);
        if (!skb)
                goto err;

        /* Only the create/exclusive flags of the request are echoed. */
        flags = ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);

        if (nf_tables_fill_set(skb, ctx, set, event, flags) < 0) {
                kfree_skb(skb);
                goto err;
        }

        nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
        return;
err:
        nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
}

/*
 * Netlink dump callback emitting one NFT_MSG_NEWSET message per set that
 * is active in the current generation.
 *
 * Resume state kept in cb->args[]:
 *   [0] index of the set (within its table) at which to resume,
 *   [1] non-zero once the dump is complete,
 *   [2] pointer to the table at which to resume.
 *
 * cb->data holds a struct nft_ctx whose family/table act as filters
 * (NFPROTO_UNSPEC and a NULL table match everything).
 *
 * Returns skb->len.
 */
static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
{
        const struct nft_set *set;
        unsigned int idx, s_idx = cb->args[0];
        struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2];
        struct net *net = sock_net(skb->sk);
        struct nft_ctx *ctx = cb->data, ctx_set;
        struct nftables_pernet *nft_net;

        /* Dump already finished on an earlier invocation. */
        if (cb->args[1])
                return skb->len;

        rcu_read_lock();
        nft_net = nft_pernet(net);
        cb->seq = READ_ONCE(nft_net->base_seq);

        list_for_each_entry_rcu(table, &nft_net->tables, list) {
                if (ctx->family != NFPROTO_UNSPEC &&
                    ctx->family != table->family)
                        continue;

                if (ctx->table && ctx->table != table)
                        continue;

                /* When resuming, skip tables until the saved one is reached. */
                if (cur_table) {
                        if (cur_table != table)
                                continue;

                        cur_table = NULL;
                }
                idx = 0;
                list_for_each_entry_rcu(set, &table->sets, list) {
                        if (idx < s_idx)
                                goto cont;
                        if (!nft_is_active(net, set))
                                goto cont;

                        /* Per-set copy of the context bound to this table. */
                        ctx_set = *ctx;
                        ctx_set.table = table;
                        ctx_set.family = table->family;

                        if (nf_tables_fill_set(skb, &ctx_set, set,
                                               NFT_MSG_NEWSET,
                                               NLM_F_MULTI) < 0) {
                                /* Out of room: remember where to resume. */
                                cb->args[0] = idx;
                                cb->args[2] = (unsigned long) table;
                                goto done;
                        }
                        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
                        idx++;
                }
                /* The saved set index only applies to the resumed table. */
                if (s_idx)
                        s_idx = 0;
        }
        cb->args[1] = 1;
done:
        rcu_read_unlock();
        return skb->len;
}

/*
 * Dump start callback: replace cb->data with a heap copy of the
 * struct nft_ctx it points to, so the dump owns its own context.
 *
 * Returns 0 on success or -ENOMEM.
 */
static int nf_tables_dump_sets_start(struct netlink_callback *cb)
{
        struct nft_ctx *copy;

        copy = kmemdup(cb->data, sizeof(*copy), GFP_ATOMIC);
        if (!copy)
                return -ENOMEM;

        cb->data = copy;

        return 0;
}

/* Dump done callback: free the context held in cb->data. Always returns 0. */
static int nf_tables_dump_sets_done(struct netlink_callback *cb)
{
        struct nft_ctx *ctx_dump = cb->data;

        kfree(ctx_dump);

        return 0;
}

/* called with rcu_read_lock held */
static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info,
                            const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_cur(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct nft_table *table = NULL;
        struct net *net = info->net;
        const struct nft_set *set;
        struct sk_buff *skb2;
        struct nft_ctx ctx;
        int err;

        if (nla[NFTA_SET_TABLE]) {
                table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
                                         genmask, 0);
                if (IS_ERR(table)) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
                        return PTR_ERR(table);
                }
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .start = nf_tables_dump_sets_start,
                        .dump = nf_tables_dump_sets,
                        .done = nf_tables_dump_sets_done,
                        .data = &ctx,
                        .module = THIS_MODULE,
                };

                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        /* Only accept unspec with dump */
        if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC)
                return -EAFNOSUPPORT;
        if (!nla[NFTA_SET_TABLE])
                return -EINVAL;

        set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask);
        if (IS_ERR(set)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
                return PTR_ERR(set);
        }

        skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (skb2 == NULL)
                return -ENOMEM;

        err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0);
        if (err < 0)
                goto err_fill_set_info;

        return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);

err_fill_set_info:
        kfree_skb(skb2);
        return err;
}

/* Parse one concatenation field description and record its length.
 *
 * On success the length found in NFTA_SET_FIELD_LEN is appended to
 * @desc->field_len[] and @desc->field_count is incremented.
 *
 * Returns 0 on success, -E2BIG when @desc->field_len[] is already full,
 * -EINVAL when the length attribute is absent, zero or larger than U8_MAX,
 * or the negative value reported by the nested attribute parser.
 */
static int nft_set_desc_concat_parse(const struct nlattr *attr,
                                     struct nft_set_desc *desc)
{
        struct nlattr *fields[NFTA_SET_FIELD_MAX + 1];
        u32 field_len;
        int ret;

        /* No free slot left for another field. */
        if (desc->field_count >= ARRAY_SIZE(desc->field_len))
                return -E2BIG;

        ret = nla_parse_nested_deprecated(fields, NFTA_SET_FIELD_MAX, attr,
                                          nft_concat_policy, NULL);
        if (ret < 0)
                return ret;

        if (fields[NFTA_SET_FIELD_LEN] == NULL)
                return -EINVAL;

        field_len = ntohl(nla_get_be32(fields[NFTA_SET_FIELD_LEN]));
        if (field_len == 0 || field_len > U8_MAX)
                return -EINVAL;

        desc->field_len[desc->field_count] = field_len;
        desc->field_count++;

        return 0;
}

/* Parse the list of concatenation fields nested in @nla into @desc.
 *
 * Every nested attribute must be an NFTA_LIST_ELEM. Once all fields are
 * recorded, the number of 32-bit registers they occupy (each field rounded
 * up to a multiple of sizeof(u32)) must equal the number of registers needed
 * for @desc->klen and must not exceed NFT_REG32_COUNT.
 *
 * Returns 0 on success, -EINVAL on a malformed list or a register count
 * mismatch, -E2BIG when too many registers are needed, or the error of
 * nft_set_desc_concat_parse().
 */
static int nft_set_desc_concat(struct nft_set_desc *desc,
                               const struct nlattr *nla)
{
        u32 field_regs = 0, key_regs;
        int remaining, ret, idx;
        struct nlattr *cur;

        nla_for_each_nested(cur, nla, remaining) {
                if (nla_type(cur) != NFTA_LIST_ELEM)
                        return -EINVAL;

                ret = nft_set_desc_concat_parse(cur, desc);
                if (ret < 0)
                        return ret;
        }

        /* Registers used by the fields, each rounded up on its own. */
        for (idx = 0; idx < desc->field_count; idx++)
                field_regs += DIV_ROUND_UP(desc->field_len[idx], sizeof(u32));

        /* Registers used by the key as a whole. */
        key_regs = DIV_ROUND_UP(desc->klen, sizeof(u32));
        if (field_regs != key_regs)
                return -EINVAL;

        return field_regs > NFT_REG32_COUNT ? -E2BIG : 0;
}

/* Parse the nested set description attribute @nla into @desc.
 *
 * NFTA_SET_DESC_SIZE, when present, is stored in @desc->size;
 * NFTA_SET_DESC_CONCAT, when present, is handed to nft_set_desc_concat().
 *
 * Returns a negative errno if nested parsing fails, the result of
 * nft_set_desc_concat() if a concatenation is described, and otherwise the
 * (non-negative) result of the nested parse.
 */
static int nf_tables_set_desc_parse(struct nft_set_desc *desc,
                                    const struct nlattr *nla)
{
        struct nlattr *tb[NFTA_SET_DESC_MAX + 1];
        int ret;

        ret = nla_parse_nested_deprecated(tb, NFTA_SET_DESC_MAX, nla,
                                          nft_set_desc_policy, NULL);
        if (ret < 0)
                return ret;

        if (tb[NFTA_SET_DESC_SIZE])
                desc->size = ntohl(nla_get_be32(tb[NFTA_SET_DESC_SIZE]));

        if (!tb[NFTA_SET_DESC_CONCAT])
                return ret;

        return nft_set_desc_concat(desc, tb[NFTA_SET_DESC_CONCAT]);
}

/* Allocate the expressions requested for a set into @exprs.
 *
 * Either the single NFTA_SET_EXPR attribute or the NFTA_SET_EXPRESSIONS
 * list is honoured (the single attribute takes precedence). The list form
 * requires NFT_SET_EXPR in @flags, accepts only NFTA_LIST_ELEM entries and
 * at most NFT_SET_EXPR_MAX of them. *@num_exprs is incremented once per
 * expression stored.
 *
 * Returns 0 on success. On failure, the expressions counted in *@num_exprs
 * are destroyed and a negative errno is returned.
 */
static int nft_set_expr_alloc(struct nft_ctx *ctx, struct nft_set *set,
                              const struct nlattr * const *nla,
                              struct nft_expr **exprs, int *num_exprs,
                              u32 flags)
{
        struct nft_expr *new_expr;
        int ret, remaining, idx;
        struct nlattr *cur;

        if (nla[NFTA_SET_EXPR]) {
                new_expr = nft_set_elem_expr_alloc(ctx, set, nla[NFTA_SET_EXPR]);
                if (IS_ERR(new_expr)) {
                        ret = PTR_ERR(new_expr);
                        goto err_release;
                }
                exprs[0] = new_expr;
                *num_exprs += 1;

                return 0;
        }

        if (!nla[NFTA_SET_EXPRESSIONS])
                return 0;

        if (!(flags & NFT_SET_EXPR)) {
                ret = -EINVAL;
                goto err_release;
        }

        idx = 0;
        nla_for_each_nested(cur, nla[NFTA_SET_EXPRESSIONS], remaining) {
                if (idx == NFT_SET_EXPR_MAX) {
                        ret = -E2BIG;
                        goto err_release;
                }
                if (nla_type(cur) != NFTA_LIST_ELEM) {
                        ret = -EINVAL;
                        goto err_release;
                }
                new_expr = nft_set_elem_expr_alloc(ctx, set, cur);
                if (IS_ERR(new_expr)) {
                        ret = PTR_ERR(new_expr);
                        goto err_release;
                }
                exprs[idx] = new_expr;
                idx++;
                *num_exprs += 1;
        }

        return 0;

err_release:
        /* Drop whatever has been accounted in *num_exprs so far. */
        for (idx = 0; idx < *num_exprs; idx++)
                nft_expr_destroy(ctx, exprs[idx]);

        return ret;
}

/* Tell whether an existing @set matches a requested description.
 *
 * Compared are key/data type and length, @flags, the concatenation field
 * count and per-field lengths, and the number of expressions together with
 * the ops pointer of each expression.
 *
 * Returns true when everything matches, false otherwise.
 */
static bool nft_set_is_same(const struct nft_set *set,
                            const struct nft_set_desc *desc,
                            struct nft_expr *exprs[], u32 num_exprs, u32 flags)
{
        u32 idx;

        if (set->ktype != desc->ktype || set->dtype != desc->dtype)
                return false;
        if (set->flags != flags)
                return false;
        if (set->klen != desc->klen || set->dlen != desc->dlen)
                return false;
        if (set->field_count != desc->field_count)
                return false;
        if (set->num_exprs != num_exprs)
                return false;

        for (idx = 0; idx < desc->field_count; idx++)
                if (set->field_len[idx] != desc->field_len[idx])
                        return false;

        /* Expressions are considered equal when they share the same ops. */
        for (idx = 0; idx < num_exprs; idx++)
                if (set->exprs[idx]->ops != exprs[idx]->ops)
                        return false;

        return true;
}

/* Handle a "new set" request.
 *
 * The attributes in @nla are validated and collected into a local
 * struct nft_set_desc. If a set with the requested name is already found in
 * the table, the request is only checked against it and passed to
 * __nft_trans_set_add(); otherwise a new set is allocated, initialised via
 * the selected set ops, queued with nft_trans_set_add() and linked into
 * table->sets.
 *
 * NFTA_SET_TABLE, NFTA_SET_NAME, NFTA_SET_KEY_LEN and NFTA_SET_ID are
 * mandatory.
 *
 * Returns 0 once a new set has been queued and linked, the return value of
 * __nft_trans_set_add() for an already existing matching set, or a negative
 * errno.
 */
static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
                            const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        const struct nft_set_ops *ops;
        struct net *net = info->net;
        struct nft_set_desc desc;
        struct nft_table *table;
        unsigned char *udata;
        struct nft_set *set;
        struct nft_ctx ctx;
        size_t alloc_size;
        int num_exprs = 0;
        char *name;
        int err, i;
        u16 udlen;
        u32 flags;
        u64 size;

        /* Mandatory attributes. */
        if (nla[NFTA_SET_TABLE] == NULL ||
            nla[NFTA_SET_NAME] == NULL ||
            nla[NFTA_SET_KEY_LEN] == NULL ||
            nla[NFTA_SET_ID] == NULL)
                return -EINVAL;

        memset(&desc, 0, sizeof(desc));

        /* Key type defaults to NFT_DATA_VALUE; a requested type with all
         * bits of NFT_DATA_RESERVED_MASK set is refused.
         */
        desc.ktype = NFT_DATA_VALUE;
        if (nla[NFTA_SET_KEY_TYPE] != NULL) {
                desc.ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE]));
                if ((desc.ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK)
                        return -EINVAL;
        }

        /* Key length must be within 1..NFT_DATA_VALUE_MAXLEN. */
        desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
        if (desc.klen == 0 || desc.klen > NFT_DATA_VALUE_MAXLEN)
                return -EINVAL;

        flags = 0;
        if (nla[NFTA_SET_FLAGS] != NULL) {
                flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
                /* Unknown flag bits are not supported. */
                if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
                              NFT_SET_INTERVAL | NFT_SET_TIMEOUT |
                              NFT_SET_MAP | NFT_SET_EVAL |
                              NFT_SET_OBJECT | NFT_SET_CONCAT | NFT_SET_EXPR))
                        return -EOPNOTSUPP;
                /* Only one of these operations is supported */
                if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) ==
                             (NFT_SET_MAP | NFT_SET_OBJECT))
                        return -EOPNOTSUPP;
                if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) ==
                             (NFT_SET_EVAL | NFT_SET_OBJECT))
                        return -EOPNOTSUPP;
                /* Anonymous + timeout is refused unless NFT_SET_EVAL is
                 * set as well.
                 */
                if ((flags & (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT | NFT_SET_EVAL)) ==
                             (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT))
                        return -EOPNOTSUPP;
                /* Constant + timeout is refused. */
                if ((flags & (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) ==
                             (NFT_SET_CONSTANT | NFT_SET_TIMEOUT))
                        return -EOPNOTSUPP;
        }

        /* A data type is accepted only together with NFT_SET_MAP, and
         * NFT_SET_MAP requires one. Verdict data has a fixed length, any
         * other data type needs an explicit NFTA_SET_DATA_LEN within
         * 1..NFT_DATA_VALUE_MAXLEN.
         */
        desc.dtype = 0;
        if (nla[NFTA_SET_DATA_TYPE] != NULL) {
                if (!(flags & NFT_SET_MAP))
                        return -EINVAL;

                desc.dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE]));
                if ((desc.dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK &&
                    desc.dtype != NFT_DATA_VERDICT)
                        return -EINVAL;

                if (desc.dtype != NFT_DATA_VERDICT) {
                        if (nla[NFTA_SET_DATA_LEN] == NULL)
                                return -EINVAL;
                        desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN]));
                        if (desc.dlen == 0 || desc.dlen > NFT_DATA_VALUE_MAXLEN)
                                return -EINVAL;
                } else
                        desc.dlen = sizeof(struct nft_verdict);
        } else if (flags & NFT_SET_MAP)
                return -EINVAL;

        /* Likewise, an object type and NFT_SET_OBJECT must come together. */
        if (nla[NFTA_SET_OBJ_TYPE] != NULL) {
                if (!(flags & NFT_SET_OBJECT))
                        return -EINVAL;

                desc.objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE]));
                if (desc.objtype == NFT_OBJECT_UNSPEC ||
                    desc.objtype > NFT_OBJECT_MAX)
                        return -EOPNOTSUPP;
        } else if (flags & NFT_SET_OBJECT)
                return -EINVAL;
        else
                desc.objtype = NFT_OBJECT_UNSPEC;

        /* Timeout and GC interval need NFT_SET_TIMEOUT and are refused on
         * anonymous sets.
         */
        desc.timeout = 0;
        if (nla[NFTA_SET_TIMEOUT] != NULL) {
                if (!(flags & NFT_SET_TIMEOUT))
                        return -EINVAL;

                if (flags & NFT_SET_ANONYMOUS)
                        return -EOPNOTSUPP;

                err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout);
                if (err)
                        return err;
        }
        desc.gc_int = 0;
        if (nla[NFTA_SET_GC_INTERVAL] != NULL) {
                if (!(flags & NFT_SET_TIMEOUT))
                        return -EINVAL;

                if (flags & NFT_SET_ANONYMOUS)
                        return -EOPNOTSUPP;

                desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL]));
        }

        /* Policy defaults to NFT_SET_POL_PERFORMANCE. */
        desc.policy = NFT_SET_POL_PERFORMANCE;
        if (nla[NFTA_SET_POLICY] != NULL) {
                desc.policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY]));
                switch (desc.policy) {
                case NFT_SET_POL_PERFORMANCE:
                case NFT_SET_POL_MEMORY:
                        break;
                default:
                        return -EOPNOTSUPP;
                }
        }

        /* NFT_SET_CONCAT must be set if and only if the description
         * carries more than one field.
         */
        if (nla[NFTA_SET_DESC] != NULL) {
                err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]);
                if (err < 0)
                        return err;

                if (desc.field_count > 1) {
                        if (!(flags & NFT_SET_CONCAT))
                                return -EINVAL;
                } else if (flags & NFT_SET_CONCAT) {
                        return -EINVAL;
                }
        } else if (flags & NFT_SET_CONCAT) {
                return -EINVAL;
        }

        if (nla[NFTA_SET_EXPR] || nla[NFTA_SET_EXPRESSIONS])
                desc.expr = true;

        table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask,
                                 NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
                return PTR_ERR(table);
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask);
        if (IS_ERR(set)) {
                /* -ENOENT simply means the set has to be created below. */
                if (PTR_ERR(set) != -ENOENT) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
                        return PTR_ERR(set);
                }
        } else {
                /* A set with this name was found. */
                struct nft_expr *exprs[NFT_SET_EXPR_MAX] = {};

                if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
                        return -EEXIST;
                }
                if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
                        return -EOPNOTSUPP;

                if (nft_set_is_anonymous(set))
                        return -EOPNOTSUPP;

                /* The expressions are built only so that they can be
                 * compared with those of the existing set; they are
                 * destroyed again below whatever the outcome.
                 */
                err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags);
                if (err < 0)
                        return err;

                err = 0;
                if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
                        err = -EEXIST;
                }

                for (i = 0; i < num_exprs; i++)
                        nft_expr_destroy(&ctx, exprs[i]);

                if (err < 0)
                        return err;

                return __nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set, &desc);
        }

        /* No such set: creating one requires NLM_F_CREATE. */
        if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
                return -ENOENT;

        ops = nft_select_set_ops(&ctx, flags, &desc);
        if (IS_ERR(ops))
                return PTR_ERR(ops);

        udlen = 0;
        if (nla[NFTA_SET_USERDATA])
                udlen = nla_len(nla[NFTA_SET_USERDATA]);

        /* One allocation holds the set, the ops-private area of @size
         * bytes and the userdata. Refuse sizes that wrapped around or
         * exceed INT_MAX.
         */
        size = 0;
        if (ops->privsize != NULL)
                size = ops->privsize(nla, &desc);
        alloc_size = sizeof(*set) + size + udlen;
        if (alloc_size < size || alloc_size > INT_MAX)
                return -ENOMEM;

        /* Account the new set on the table; undone at err_alloc. */
        if (!nft_use_inc(&table->use))
                return -EMFILE;

        set = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT);
        if (!set) {
                err = -ENOMEM;
                goto err_alloc;
        }

        name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL_ACCOUNT);
        if (!name) {
                err = -ENOMEM;
                goto err_set_name;
        }

        /* The temporary copy of the name is released right away, whether
         * or not nf_tables_set_alloc_name() succeeded.
         * NOTE(review): set->name is presumably set by that helper, since
         * it is freed at err_set_init - confirm.
         */
        err = nf_tables_set_alloc_name(&ctx, set, name);
        kfree(name);
        if (err < 0)
                goto err_set_name;

        /* Userdata lives in set->data, after the ops-private area. */
        udata = NULL;
        if (udlen) {
                udata = set->data + size;
                nla_memcpy(udata, nla[NFTA_SET_USERDATA], udlen);
        }

        INIT_LIST_HEAD(&set->bindings);
        INIT_LIST_HEAD(&set->catchall_list);
        refcount_set(&set->refs, 1);
        set->table = table;
        write_pnet(&set->net, net);
        set->ops = ops;
        set->ktype = desc.ktype;
        set->klen = desc.klen;
        set->dtype = desc.dtype;
        set->objtype = desc.objtype;
        set->dlen = desc.dlen;
        set->flags = flags;
        set->size = desc.size;
        set->policy = desc.policy;
        set->udlen = udlen;
        set->udata = udata;
        set->timeout = desc.timeout;
        set->gc_int = desc.gc_int;

        set->field_count = desc.field_count;
        for (i = 0; i < desc.field_count; i++)
                set->field_len[i] = desc.field_len[i];

        err = ops->init(set, &desc, nla);
        if (err < 0)
                goto err_set_init;

        /* On failure nft_set_expr_alloc() has already destroyed what it
         * allocated, hence the jump past err_set_expr_alloc.
         */
        err = nft_set_expr_alloc(&ctx, set, nla, set->exprs, &num_exprs, flags);
        if (err < 0)
                goto err_set_destroy;

        set->num_exprs = num_exprs;
        set->handle = nf_tables_alloc_handle(table);
        INIT_LIST_HEAD(&set->pending_update);

        err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set);
        if (err < 0)
                goto err_set_expr_alloc;

        list_add_tail_rcu(&set->list, &table->sets);

        return 0;

        /* Unwind in reverse order of the setup above. */
err_set_expr_alloc:
        for (i = 0; i < set->num_exprs; i++)
                nft_expr_destroy(&ctx, set->exprs[i]);
err_set_destroy:
        ops->destroy(&ctx, set);
err_set_init:
        kfree(set->name);
err_set_name:
        kvfree(set);
err_alloc:
        nft_use_dec_restore(&table->use);

        return err;
}

/* Empty @set->catchall_list: each entry is unlinked, its element is
 * destroyed and the entry itself is handed to kfree_rcu().
 */
static void nft_set_catchall_destroy(const struct nft_ctx *ctx,
                                     struct nft_set *set)
{
        struct nft_set_elem_catchall *entry, *tmp;

        /* _safe variant: entries are removed while walking the list. */
        list_for_each_entry_safe(entry, tmp, &set->catchall_list, list) {
                list_del_rcu(&entry->list);
                nf_tables_set_elem_destroy(ctx, set, entry->elem);
                kfree_rcu(entry, rcu);
        }
}

/* Drop one reference on @set; the name and the set itself are freed when
 * the reference count reaches zero.
 */
static void nft_set_put(struct nft_set *set)
{
        if (!refcount_dec_and_test(&set->refs))
                return;

        kfree(set->name);
        kvfree(set);
}

/* Tear down @set: destroy its expressions, let the set ops destroy the
 * backend, release the catchall elements and drop a reference.
 *
 * Nothing is done (beyond a warning) if the set is still in use.
 */
static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
{
        int idx = 0;

        if (WARN_ON(set->use > 0))
                return;

        while (idx < set->num_exprs) {
                nft_expr_destroy(ctx, set->exprs[idx]);
                idx++;
        }

        set->ops->destroy(ctx, set);
        nft_set_catchall_destroy(ctx, set);
        nft_set_put(set);
}

static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info,
                            const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        const struct nlattr *attr;
        struct nft_table *table;
        struct nft_set *set;
        struct nft_ctx ctx;

        if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC)
                return -EAFNOSUPPORT;

        table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
                                 genmask, NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
                return PTR_ERR(table);
        }

        if (nla[NFTA_SET_HANDLE]) {
                attr = nla[NFTA_SET_HANDLE];
                set = nft_set_lookup_byhandle(table, attr, genmask);
        } else {
                attr = nla[NFTA_SET_NAME];
                set = nft_set_lookup(table, attr, genmask);
        }

        if (IS_ERR(set)) {
                if (PTR_ERR(set) == -ENOENT &&
                    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSET)
                        return 0;

                NL_SET_BAD_ATTR(extack, attr);
                return PTR_ERR(set);
        }
        if (set->use ||
            (info->nlh->nlmsg_flags & NLM_F_NONREC &&
             atomic_read(&set->nelems) > 0)) {
                NL_SET_BAD_ATTR(extack, attr);
                return -EBUSY;
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        return nft_delset(&ctx, set);
}

static int nft_validate_register_store(const struct nft_ctx *ctx,
                                       enum nft_registers reg,
                                       const struct nft_data *data,
                                       enum nft_data_types type,
                                       unsigned int len);

static int nft_setelem_data_validate(const struct nft_ctx *ctx,
                                     struct nft_set *set,
                                     struct nft_elem_priv *elem_priv)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
        enum nft_registers dreg;

        dreg = nft_type_to_reg(set->dtype);
        return nft_validate_register_store(ctx, dreg, nft_set_ext_data(ext),
                                           set->dtype == NFT_DATA_VERDICT ?
                                           NFT_DATA_VERDICT : NFT_DATA_VALUE,
                                           set->dlen);
}

/* Set walk callback: validate the data of every element that is active in
 * the generation given by @iter->genmask; other elements are ignored.
 *
 * Returns 0 for ignored elements, otherwise the result of
 * nft_setelem_data_validate().
 */
static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
                                        struct nft_set *set,
                                        const struct nft_set_iter *iter,
                                        struct nft_elem_priv *elem_priv)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        if (nft_set_elem_active(ext, iter->genmask))
                return nft_setelem_data_validate(ctx, set, elem_priv);

        return 0;
}

/* Validate the data of the catchall elements of @set that are active in
 * the next generation.
 *
 * Returns the first negative validation result, otherwise the result of
 * the last validation performed (0 if none was).
 */
static int nft_set_catchall_bind_check(const struct nft_ctx *ctx,
                                       struct nft_set *set)
{
        struct nft_set_elem_catchall *entry;
        u8 next_gen = nft_genmask_next(ctx->net);
        struct nft_set_ext *ext;
        int ret = 0;

        list_for_each_entry_rcu(entry, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, entry->elem);
                if (nft_set_elem_active(ext, next_gen)) {
                        ret = nft_setelem_data_validate(ctx, set, entry->elem);
                        if (ret < 0)
                                return ret;
                }
        }

        return ret;
}

/* Bind @set to the chain in @ctx through @binding.
 *
 * An anonymous set that already has a binding cannot be bound again
 * (-EBUSY). For map bindings, unless the set is already bound as a map to
 * the same chain, all elements active in the next generation (regular and
 * catchall) have their data validated first. On success the set's use
 * counter is incremented and @binding is appended to set->bindings.
 *
 * Returns 0 on success, -EBUSY, -EMFILE when the use counter cannot be
 * incremented, or the negative error produced by element validation.
 */
int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
                       struct nft_set_binding *binding)
{
        struct nft_set_binding *i;
        struct nft_set_iter iter;

        if (!list_empty(&set->bindings) && nft_set_is_anonymous(set))
                return -EBUSY;

        if (binding->flags & NFT_SET_MAP) {
                /* If the set is already bound to the same chain all
                 * jumps are already validated for that chain.
                 */
                list_for_each_entry(i, &set->bindings, list) {
                        if (i->flags & NFT_SET_MAP &&
                            i->chain == binding->chain)
                                goto bind;
                }

                /* Only these fields of the on-stack iterator are set up
                 * before the walk.
                 */
                iter.genmask        = nft_genmask_next(ctx->net);
                iter.type        = NFT_ITER_UPDATE;
                iter.skip         = 0;
                iter.count        = 0;
                iter.err        = 0;
                iter.fn                = nf_tables_bind_check_setelem;

                set->ops->walk(ctx, set, &iter);
                /* Catchall elements are checked separately, and only if
                 * the walk itself reported no error.
                 */
                if (!iter.err)
                        iter.err = nft_set_catchall_bind_check(ctx, set);

                if (iter.err < 0)
                        return iter.err;
        }
bind:
        if (!nft_use_inc(&set->use))
                return -EMFILE;

        binding->chain = ctx->chain;
        list_add_tail_rcu(&binding->list, &set->bindings);
        nft_set_trans_bind(ctx, set);

        return 0;
}
EXPORT_SYMBOL_GPL(nf_tables_bind_set);

/* Remove @binding from its set.
 *
 * If that leaves an anonymous set without bindings, the set is also
 * unlinked from its list and marked dead; when @event is true a
 * NFT_MSG_DELSET notification is sent.
 */
static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
                                 struct nft_set_binding *binding, bool event)
{
        list_del_rcu(&binding->list);

        /* Named sets and sets with remaining bindings stay around. */
        if (!list_empty(&set->bindings) || !nft_set_is_anonymous(set))
                return;

        list_del_rcu(&set->list);
        set->dead = 1;

        if (!event)
                return;

        nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_KERNEL);
}

static void nft_setelem_data_activate(const struct net *net,
                                      const struct nft_set *set,
                                      struct nft_elem_priv *elem_priv);

/* Set walk callback used when undoing changes (abort path): an element
 * that is not active in the generation given by @iter->genmask is cleared
 * with nft_clear() and has its data activated again. Elements that are
 * already active are left alone.
 *
 * Always returns 0.
 */
static int nft_mapelem_activate(const struct nft_ctx *ctx,
                                struct nft_set *set,
                                const struct nft_set_iter *iter,
                                struct nft_elem_priv *elem_priv)
{
        struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        /* Reversed check on purpose: only inactive elements need undoing. */
        if (!nft_set_elem_active(ext, iter->genmask)) {
                nft_clear(ctx->net, ext);
                nft_setelem_data_activate(ctx->net, set, elem_priv);
        }

        return 0;
}

/* Catchall counterpart of nft_mapelem_activate(): re-activate the catchall
 * element of @set when changes are undone.
 *
 * The first catchall element that is NOT active in the next generation is
 * cleared with nft_clear() and has its data activated again; the walk then
 * stops. Elements that are already active are skipped.
 */
static void nft_map_catchall_activate(const struct nft_ctx *ctx,
                                      struct nft_set *set)
{
        u8 genmask = nft_genmask_next(ctx->net);
        struct nft_set_elem_catchall *catchall;
        struct nft_set_ext *ext;

        list_for_each_entry(catchall, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);
                /* Same reversed check as nft_mapelem_activate(): only an
                 * element that is inactive in the next generation needs
                 * to be restored. Testing the opposite would skip the
                 * element that was deactivated and activate the data of
                 * an already active one a second time.
                 */
                if (nft_set_elem_active(ext, genmask))
                        continue;

                nft_clear(ctx->net, ext);
                nft_setelem_data_activate(ctx->net, set, catchall->elem);
                break;
        }
}

static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set)
{
        struct nft_set_iter iter = {
                .genmask        = nft_genmask_next(ctx->net),
                .type                = NFT_ITER_UPDATE,
                .fn                = nft_mapelem_activate,
        };

        set->ops->walk(ctx, set, &iter);
        WARN_ON_ONCE(iter.err);

        nft_map_catchall_activate(ctx, set);
}

void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set)
{
        if (nft_set_is_anonymous(set)) {
                if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
                        nft_map_activate(ctx, set);

                nft_clear(ctx->net, set);
        }

        nft_use_inc_restore(&set->use);
}
EXPORT_SYMBOL_GPL(nf_tables_activate_set);

/* Undo or release the binding of @set made through @binding, depending on
 * the transaction @phase.
 *
 * Every explicitly handled phase drops one use reference on the set. Only
 * NFT_TRANS_ABORT, NFT_TRANS_RELEASE and the phases landing in the default
 * case go on to nf_tables_unbind_set(); a NFT_MSG_DELSET event is requested
 * there only for NFT_TRANS_COMMIT.
 */
void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
                              struct nft_set_binding *binding,
                              enum nft_trans_phase phase)
{
        switch (phase) {
        case NFT_TRANS_PREPARE_ERROR:
                /* Anonymous sets are deactivated for the next generation;
                 * for named sets only the binding is unlinked.
                 */
                nft_set_trans_unbind(ctx, set);
                if (nft_set_is_anonymous(set))
                        nft_deactivate_next(ctx->net, set);
                else
                        list_del_rcu(&binding->list);

                nft_use_dec(&set->use);
                break;
        case NFT_TRANS_PREPARE:
                /* Anonymous map/object sets have their elements
                 * deactivated before the set itself; the binding is left
                 * in place.
                 */
                if (nft_set_is_anonymous(set)) {
                        if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
                                nft_map_deactivate(ctx, set);

                        nft_deactivate_next(ctx->net, set);
                }
                nft_use_dec(&set->use);
                return;
        case NFT_TRANS_ABORT:
        case NFT_TRANS_RELEASE:
                if (nft_set_is_anonymous(set) &&
                    set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
                        nft_map_deactivate(ctx, set);

                nft_use_dec(&set->use);
                fallthrough;
        default:
                /* Remaining phases: just remove the binding. */
                nf_tables_unbind_set(ctx, set, binding,
                                     phase == NFT_TRANS_COMMIT);
        }
}
EXPORT_SYMBOL_GPL(nf_tables_deactivate_set);

/* Destroy @set if it is anonymous and no binding refers to it any more;
 * otherwise do nothing.
 */
void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set)
{
        if (!list_empty(&set->bindings) || !nft_set_is_anonymous(set))
                return;

        nft_set_destroy(ctx, set);
}
EXPORT_SYMBOL_GPL(nf_tables_destroy_set);

/* Length and alignment of each set element extension, indexed by
 * NFT_SET_EXT_*.
 *
 * Entries that give no .len leave it zero-initialised.
 * NOTE(review): presumably the size of those extensions (key, data,
 * expressions, key end) is determined at run time - confirm against the
 * users of this table.
 */
const struct nft_set_ext_type nft_set_ext_types[] = {
        [NFT_SET_EXT_KEY]                = {
                .align        = __alignof__(u32),
        },
        [NFT_SET_EXT_DATA]                = {
                .align        = __alignof__(u32),
        },
        [NFT_SET_EXT_EXPRESSIONS]        = {
                .align        = __alignof__(struct nft_set_elem_expr),
        },
        /* Stored as a pointer to the object. */
        [NFT_SET_EXT_OBJREF]                = {
                .len        = sizeof(struct nft_object *),
                .align        = __alignof__(struct nft_object *),
        },
        [NFT_SET_EXT_FLAGS]                = {
                .len        = sizeof(u8),
                .align        = __alignof__(u8),
        },
        [NFT_SET_EXT_TIMEOUT]                = {
                .len        = sizeof(u64),
                .align        = __alignof__(u64),
        },
        [NFT_SET_EXT_EXPIRATION]        = {
                .len        = sizeof(u64),
                .align        = __alignof__(u64),
        },
        [NFT_SET_EXT_USERDATA]                = {
                .len        = sizeof(struct nft_userdata),
                .align        = __alignof__(struct nft_userdata),
        },
        [NFT_SET_EXT_KEY_END]                = {
                .align        = __alignof__(u32),
        },
};

/*
 * Set elements
 */

/* Netlink attribute policy for the attributes of a single set element
 * (NFTA_SET_ELEM_*): key, key end, data and expression are nested, flags
 * is a 32-bit value, timeout and expiration are 64-bit values, userdata is
 * binary of at most NFT_USERDATA_MAXLEN bytes, and the object reference is
 * a string of at most NFT_OBJ_MAXNAMELEN - 1 bytes. The expressions list is
 * a nested array checked against nft_expr_policy.
 */
static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
        [NFTA_SET_ELEM_KEY]                = { .type = NLA_NESTED },
        [NFTA_SET_ELEM_DATA]                = { .type = NLA_NESTED },
        [NFTA_SET_ELEM_FLAGS]                = { .type = NLA_U32 },
        [NFTA_SET_ELEM_TIMEOUT]                = { .type = NLA_U64 },
        [NFTA_SET_ELEM_EXPIRATION]        = { .type = NLA_U64 },
        [NFTA_SET_ELEM_USERDATA]        = { .type = NLA_BINARY,
                                            .len = NFT_USERDATA_MAXLEN },
        [NFTA_SET_ELEM_EXPR]                = { .type = NLA_NESTED },
        [NFTA_SET_ELEM_OBJREF]                = { .type = NLA_STRING,
                                            .len = NFT_OBJ_MAXNAMELEN - 1 },
        [NFTA_SET_ELEM_KEY_END]                = { .type = NLA_NESTED },
        [NFTA_SET_ELEM_EXPRESSIONS]        = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
};

/* Netlink attribute policy for a set element list message
 * (NFTA_SET_ELEM_LIST_*): table and set names are length-limited strings,
 * the set ID is a 32-bit value, and the elements are a nested array whose
 * entries are checked against nft_set_elem_policy.
 */
static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
        [NFTA_SET_ELEM_LIST_TABLE]        = { .type = NLA_STRING,
                                            .len = NFT_TABLE_MAXNAMELEN - 1 },
        [NFTA_SET_ELEM_LIST_SET]        = { .type = NLA_STRING,
                                            .len = NFT_SET_MAXNAMELEN - 1 },
        [NFTA_SET_ELEM_LIST_ELEMENTS]        = NLA_POLICY_NESTED_ARRAY(nft_set_elem_policy),
        [NFTA_SET_ELEM_LIST_SET_ID]        = { .type = NLA_U32 },
};

/* Dump the expressions attached to a set element into @skb.
 *
 * A single expression is emitted as NFTA_SET_ELEM_EXPR; several
 * expressions are emitted as NFTA_LIST_ELEM entries nested inside
 * NFTA_SET_ELEM_EXPRESSIONS; nothing is emitted when there is none.
 *
 * Returns 0 on success, -1 if an attribute could not be added.
 */
static int nft_set_elem_expr_dump(struct sk_buff *skb,
                                  const struct nft_set *set,
                                  const struct nft_set_ext *ext,
                                  bool reset)
{
        struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
        struct nft_expr *expr;
        struct nlattr *list;
        u32 off, count = 0;

        /* First pass: only count, to pick the attribute layout. */
        nft_setelem_expr_foreach(expr, elem_expr, off)
                count++;

        if (count == 0)
                return 0;

        if (count == 1) {
                expr = nft_setelem_expr_at(elem_expr, 0);
                if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, reset) < 0)
                        return -1;

                return 0;
        }

        list = nla_nest_start_noflag(skb, NFTA_SET_ELEM_EXPRESSIONS);
        if (!list)
                return -1;

        nft_setelem_expr_foreach(expr, elem_expr, off) {
                expr = nft_setelem_expr_at(elem_expr, off);
                if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0)
                        return -1;
        }
        nla_nest_end(skb, list);

        return 0;
}

/*
 * Serialize one set element as a nested NFTA_LIST_ELEM attribute.
 *
 * Each extension present on the element (key, key end, data, expressions,
 * object reference, flags, timeout, expiration, userdata) is emitted in that
 * order.  @reset is passed through to the expression dump.
 *
 * Returns 0 on success.  On failure @skb is trimmed back to the tail it had
 * on entry and -EMSGSIZE is returned.
 */
static int nf_tables_fill_setelem(struct sk_buff *skb,
                                  const struct nft_set *set,
                                  const struct nft_elem_priv *elem_priv,
                                  bool reset)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
        unsigned char *start = skb_tail_pointer(skb);
        struct nlattr *elem_nest;

        elem_nest = nla_nest_start_noflag(skb, NFTA_LIST_ELEM);
        if (!elem_nest)
                goto nla_put_failure;

        if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY)) {
                if (nft_data_dump(skb, NFTA_SET_ELEM_KEY,
                                  nft_set_ext_key(ext),
                                  NFT_DATA_VALUE, set->klen) < 0)
                        goto nla_put_failure;
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) {
                if (nft_data_dump(skb, NFTA_SET_ELEM_KEY_END,
                                  nft_set_ext_key_end(ext),
                                  NFT_DATA_VALUE, set->klen) < 0)
                        goto nla_put_failure;
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) {
                /* Only verdict maps use the verdict encoding. */
                if (nft_data_dump(skb, NFTA_SET_ELEM_DATA,
                                  nft_set_ext_data(ext),
                                  set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE,
                                  set->dlen) < 0)
                        goto nla_put_failure;
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) {
                if (nft_set_elem_expr_dump(skb, set, ext, reset))
                        goto nla_put_failure;
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) {
                if (nla_put_string(skb, NFTA_SET_ELEM_OBJREF,
                                   (*nft_set_ext_obj(ext))->key.name) < 0)
                        goto nla_put_failure;
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS)) {
                if (nla_put_be32(skb, NFTA_SET_ELEM_FLAGS,
                                 htonl(*nft_set_ext_flags(ext))))
                        goto nla_put_failure;
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) {
                if (nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT,
                                 nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)),
                                 NFTA_SET_ELEM_PAD))
                        goto nla_put_failure;
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
                u64 now = get_jiffies_64();
                u64 deadline = *nft_set_ext_expiration(ext);
                u64 remaining;

                /* Report time left; clamp to zero once the deadline passed. */
                if (time_before64(now, deadline))
                        remaining = deadline - now;
                else
                        remaining = 0;

                if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION,
                                 nf_jiffies64_to_msecs(remaining),
                                 NFTA_SET_ELEM_PAD))
                        goto nla_put_failure;
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) {
                struct nft_userdata *udata = nft_set_ext_userdata(ext);

                /* The stored length field is one less than the byte count. */
                if (nla_put(skb, NFTA_SET_ELEM_USERDATA,
                            udata->len + 1, udata->data))
                        goto nla_put_failure;
        }

        nla_nest_end(skb, elem_nest);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, start);
        return -EMSGSIZE;
}

/*
 * Per-invocation state for a set element dump.  The embedded @iter is what
 * gets handed to set->ops->walk(); nf_tables_dump_setelem() recovers this
 * structure from it with container_of().
 */
struct nft_set_dump_args {
        const struct netlink_callback        *cb;        /* netlink dump callback in progress */
        struct nft_set_iter                iter;        /* iterator passed to the set walk */
        struct sk_buff                        *skb;        /* message being filled */
        bool                                reset;        /* forwarded to nf_tables_fill_setelem() */
};

/*
 * Set walk callback used by the element dump: serialize @elem_priv into the
 * skb held by the nft_set_dump_args that embeds @iter.
 *
 * Elements that are inactive for iter->genmask, expired or dead are silently
 * skipped (return 0).  Otherwise returns nf_tables_fill_setelem()'s result.
 */
static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
                                  struct nft_set *set,
                                  const struct nft_set_iter *iter,
                                  struct nft_elem_priv *elem_priv)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
        struct nft_set_dump_args *dump_args;

        if (!nft_set_elem_active(ext, iter->genmask) ||
            nft_set_elem_expired(ext) ||
            nft_set_elem_is_dead(ext))
                return 0;

        dump_args = container_of(iter, struct nft_set_dump_args, iter);

        return nf_tables_fill_setelem(dump_args->skb, set, elem_priv,
                                      dump_args->reset);
}

/*
 * Emit an AUDIT_NFT_OP_SETELEM_RESET audit record for @nentries elements.
 * The record name is formatted as "<table name>:<base_seq>".
 *
 * NOTE(review): the kasprintf() result is passed on without a NULL check.
 */
static void audit_log_nft_set_reset(const struct nft_table *table,
                                    unsigned int base_seq,
                                    unsigned int nentries)
{
        char *name;

        name = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq);
        audit_log_nfcfg(name, table->family, nentries,
                        AUDIT_NFT_OP_SETELEM_RESET, GFP_ATOMIC);
        kfree(name);
}

/*
 * Context kept for a set element dump or get request; filled in by
 * nft_set_dump_ctx_init() and duplicated into cb->data by
 * nf_tables_dump_set_start().
 */
struct nft_set_dump_ctx {
        const struct nft_set        *set;        /* set whose elements are requested */
        struct nft_ctx                ctx;        /* request context (net, family, table, ...) */
        bool                        reset;        /* true for the reset variants of the request */
};

/*
 * Dump the first catchall element of @set that is active in the current
 * generation and not expired.  With @reset set, a successful dump is also
 * audit-logged as a single-entry reset.
 *
 * Returns 0 if there is no such element, otherwise the result of
 * nf_tables_fill_setelem().
 */
static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb,
                                 const struct nft_set *set, bool reset,
                                 unsigned int base_seq)
{
        u8 genmask = nft_genmask_cur(net);
        struct nft_set_elem_catchall *catchall;
        struct nft_set_ext *ext;
        int err;

        list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);
                if (nft_set_elem_active(ext, genmask) &&
                    !nft_set_elem_expired(ext)) {
                        err = nf_tables_fill_setelem(skb, set, catchall->elem,
                                                     reset);
                        if (!err && reset)
                                audit_log_nft_set_reset(set->table, base_seq, 1);

                        /* Only the first eligible catchall is dumped. */
                        return err;
                }
        }

        return 0;
}

/*
 * Netlink dump callback: emit one NFT_MSG_NEWSETELEM multipart message with
 * the elements of the set recorded in cb->data.
 *
 * cb->args[0] carries the iterator count reached by the previous invocation
 * and is used as the walk's skip value so the dump resumes where it left off.
 *
 * Returns -ENOENT if the table/set from the dump context can no longer be
 * found, -ENOSPC if the message header or leading attributes do not fit, a
 * walk error other than -EMSGSIZE, 0 when the iterator count did not advance
 * past cb->args[0], and skb->len otherwise.
 */
static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct nft_set_dump_ctx *dump_ctx = cb->data;
        struct net *net = sock_net(skb->sk);
        struct nftables_pernet *nft_net;
        struct nft_table *table;
        struct nft_set *set;
        struct nft_set_dump_args args;
        bool set_found = false;
        struct nlmsghdr *nlh;
        struct nlattr *nest;
        u32 portid, seq;
        int event;

        rcu_read_lock();
        nft_net = nft_pernet(net);
        cb->seq = READ_ONCE(nft_net->base_seq);

        /*
         * Re-validate the pointers stored in the dump context: the table and
         * set must still be linked into the per-netns lists.
         */
        list_for_each_entry_rcu(table, &nft_net->tables, list) {
                if (dump_ctx->ctx.family != NFPROTO_UNSPEC &&
                    dump_ctx->ctx.family != table->family)
                        continue;

                if (table != dump_ctx->ctx.table)
                        continue;

                list_for_each_entry_rcu(set, &table->sets, list) {
                        if (set == dump_ctx->set) {
                                set_found = true;
                                break;
                        }
                }
                break;
        }

        if (!set_found) {
                rcu_read_unlock();
                return -ENOENT;
        }

        /* From here on, table and set point at the matched list entries. */
        event  = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWSETELEM);
        portid = NETLINK_CB(cb->skb).portid;
        seq    = cb->nlh->nlmsg_seq;

        nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI,
                           table->family, NFNETLINK_V0, nft_base_seq(net));
        if (!nlh)
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name))
                goto nla_put_failure;
        if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name))
                goto nla_put_failure;

        nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_LIST_ELEMENTS);
        if (nest == NULL)
                goto nla_put_failure;

        /* Set up the iterator; nf_tables_dump_setelem() is called per element. */
        args.cb                        = cb;
        args.skb                = skb;
        args.reset                = dump_ctx->reset;
        args.iter.genmask        = nft_genmask_cur(net);
        args.iter.type                = NFT_ITER_READ;
        args.iter.skip                = cb->args[0];
        args.iter.count                = 0;
        args.iter.err                = 0;
        args.iter.fn                = nf_tables_dump_setelem;
        set->ops->walk(&dump_ctx->ctx, set, &args.iter);

        /*
         * Catchall elements are only dumped when the walk finished without
         * error and its count equals the resume point.
         */
        if (!args.iter.err && args.iter.count == cb->args[0])
                args.iter.err = nft_set_catchall_dump(net, skb, set,
                                                      dump_ctx->reset, cb->seq);
        nla_nest_end(skb, nest);
        nlmsg_end(skb, nlh);

        rcu_read_unlock();

        /* -EMSGSIZE is not reported as an error; the code falls through. */
        if (args.iter.err && args.iter.err != -EMSGSIZE)
                return args.iter.err;
        if (args.iter.count == cb->args[0])
                return 0;

        /* Remember how far we got for the next invocation. */
        cb->args[0] = args.iter.count;
        return skb->len;

nla_put_failure:
        rcu_read_unlock();
        return -ENOSPC;
}

/*
 * Dump callback for the reset variant: run nf_tables_dump_set() with the
 * commit mutex held, then audit-log how far cb->args[0] advanced during this
 * invocation (nothing is logged if it did not advance).
 *
 * Returns whatever nf_tables_dump_set() returned.
 */
static int nf_tables_dumpreset_set(struct sk_buff *skb,
                                   struct netlink_callback *cb)
{
        struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
        struct nft_set_dump_ctx *dump_ctx = cb->data;
        int resume_from = cb->args[0];
        int ret;

        mutex_lock(&nft_net->commit_mutex);

        ret = nf_tables_dump_set(skb, cb);
        if (cb->args[0] > resume_from)
                audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq,
                                        cb->args[0] - resume_from);

        mutex_unlock(&nft_net->commit_mutex);

        return ret;
}

/*
 * Dump start hook: replace cb->data with a heap copy of the dump context it
 * currently points to.  Returns 0 on success or -ENOMEM if the copy could
 * not be allocated (cb->data is NULL in that case).
 */
static int nf_tables_dump_set_start(struct netlink_callback *cb)
{
        struct nft_set_dump_ctx *orig = cb->data;
        struct nft_set_dump_ctx *copy;

        copy = kmemdup(orig, sizeof(*orig), GFP_ATOMIC);
        cb->data = copy;
        if (!copy)
                return -ENOMEM;

        return 0;
}

/* Dump done hook: free the dump context held in cb->data.  Always returns 0. */
static int nf_tables_dump_set_done(struct netlink_callback *cb)
{
        void *dump_ctx = cb->data;

        kfree(dump_ctx);

        return 0;
}

/*
 * Build a complete netlink message describing a single set element: header,
 * table name, set name and an element list nest holding @elem_priv.
 *
 * Returns 0 on success.  On any failure the message is trimmed back to @nlh
 * and -1 is returned.
 */
static int nf_tables_fill_setelem_info(struct sk_buff *skb,
                                       const struct nft_ctx *ctx, u32 seq,
                                       u32 portid, int event, u16 flags,
                                       const struct nft_set *set,
                                       const struct nft_elem_priv *elem_priv,
                                       bool reset)
{
        struct nlattr *elems;
        struct nlmsghdr *nlh;

        event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);

        nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family,
                           NFNETLINK_V0, nft_base_seq(ctx->net));
        if (!nlh)
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name) ||
            nla_put_string(skb, NFTA_SET_NAME, set->name))
                goto nla_put_failure;

        elems = nla_nest_start_noflag(skb, NFTA_SET_ELEM_LIST_ELEMENTS);
        if (!elems)
                goto nla_put_failure;

        if (nf_tables_fill_setelem(skb, set, elem_priv, reset) < 0)
                goto nla_put_failure;

        nla_nest_end(skb, elems);
        nlmsg_end(skb, nlh);

        return 0;

nla_put_failure:
        nlmsg_trim(skb, nlh);
        return -1;
}

/*
 * Decode and validate the element flags attribute.
 *
 * A missing @attr leaves *@flags untouched and returns 0.  Otherwise *@flags
 * receives the host-order value (even when validation then fails) and:
 *   -EOPNOTSUPP  a bit other than INTERVAL_END / CATCHALL is set
 *   -EINVAL      INTERVAL_END on a set without NFT_SET_INTERVAL
 *   -EINVAL      INTERVAL_END and CATCHALL both set
 */
static int nft_setelem_parse_flags(const struct nft_set *set,
                                   const struct nlattr *attr, u32 *flags)
{
        const u32 known = NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL;
        u32 val;

        if (!attr)
                return 0;

        val = ntohl(nla_get_be32(attr));
        *flags = val;

        if (val & ~known)
                return -EOPNOTSUPP;

        if ((val & NFT_SET_ELEM_INTERVAL_END) &&
            !(set->flags & NFT_SET_INTERVAL))
                return -EINVAL;

        if ((val & known) == known)
                return -EINVAL;

        return 0;
}

static int nft_setelem_parse_key(struct nft_ctx *ctx, const struct nft_set *set,
                                 struct nft_data *key, struct nlattr *attr)
{
        struct nft_data_desc desc = {
                .type        = NFT_DATA_VALUE,
                .size        = NFT_DATA_VALUE_MAXLEN,
                .len        = set->klen,
        };

        return nft_data_init(ctx, key, &desc, attr);
}

static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set,
                                  struct nft_data_desc *desc,
                                  struct nft_data *data,
                                  struct nlattr *attr)
{
        u32 dtype;

        if (set->dtype == NFT_DATA_VERDICT)
                dtype = NFT_DATA_VERDICT;
        else
                dtype = NFT_DATA_VALUE;

        desc->type = dtype;
        desc->size = NFT_DATA_VALUE_MAXLEN;
        desc->len = set->dlen;
        desc->flags = NFT_DATA_DESC_SETELEM;

        return nft_data_init(ctx, data, desc, attr);
}

/*
 * Return the private data of the first catchall element of @set that is
 * active in the current generation and not expired, or NULL if none.
 */
static void *nft_setelem_catchall_get(const struct net *net,
                                      const struct nft_set *set)
{
        u8 genmask = nft_genmask_cur(net);
        struct nft_set_elem_catchall *catchall;
        struct nft_set_ext *ext;

        list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);
                if (nft_set_elem_active(ext, genmask) &&
                    !nft_set_elem_expired(ext))
                        return catchall->elem;
        }

        return NULL;
}

/*
 * Look up an element and store its private pointer in elem->priv.
 *
 * Catchall requests go through the set's catchall list (-ENOENT if none);
 * everything else goes through the backend's ->get(), whose ERR_PTR is
 * converted to an errno.  Returns 0 on success.
 */
static int nft_setelem_get(struct nft_ctx *ctx, const struct nft_set *set,
                           struct nft_set_elem *elem, u32 flags)
{
        void *found;

        if (flags & NFT_SET_ELEM_CATCHALL) {
                found = nft_setelem_catchall_get(ctx->net, set);
                if (!found)
                        return -ENOENT;
        } else {
                found = set->ops->get(ctx->net, set, elem, flags);
                if (IS_ERR(found))
                        return PTR_ERR(found);
        }

        elem->priv = found;

        return 0;
}

/*
 * Handle one element of a get request: parse the nested attribute, look the
 * element up and unicast a NFT_MSG_NEWSETELEM reply to the requester.
 *
 * A key is mandatory unless the CATCHALL flag is given.  Returns a negative
 * errno on parse, lookup, allocation (-ENOMEM) or fill failure, otherwise
 * the result of nfnetlink_unicast().
 */
static int nft_get_set_elem(struct nft_ctx *ctx, const struct nft_set *set,
                            const struct nlattr *attr, bool reset)
{
        struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
        struct nft_set_elem elem;
        struct sk_buff *reply;
        uint32_t flags = 0;
        int err;

        err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
                                          nft_set_elem_policy, NULL);
        if (err < 0)
                return err;

        err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
        if (err < 0)
                return err;

        if (nla[NFTA_SET_ELEM_KEY]) {
                err = nft_setelem_parse_key(ctx, set, &elem.key.val,
                                            nla[NFTA_SET_ELEM_KEY]);
                if (err < 0)
                        return err;
        } else if (!(flags & NFT_SET_ELEM_CATCHALL)) {
                return -EINVAL;
        }

        if (nla[NFTA_SET_ELEM_KEY_END]) {
                err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
                                            nla[NFTA_SET_ELEM_KEY_END]);
                if (err < 0)
                        return err;
        }

        err = nft_setelem_get(ctx, set, &elem, flags);
        if (err < 0)
                return err;

        reply = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!reply)
                return -ENOMEM;

        err = nf_tables_fill_setelem_info(reply, ctx, ctx->seq, ctx->portid,
                                          NFT_MSG_NEWSETELEM, 0, set, elem.priv,
                                          reset);
        if (err < 0) {
                kfree_skb(reply);
                return err;
        }

        return nfnetlink_unicast(reply, ctx->net, ctx->portid);
}

/*
 * Resolve the table and set named in a set element request and initialise
 * @dump_ctx from them.  Lookups use the current generation mask.
 *
 * On lookup failure the offending attribute is recorded in the extended ack
 * and the lookup's errno is returned.  Returns 0 on success.
 */
static int nft_set_dump_ctx_init(struct nft_set_dump_ctx *dump_ctx,
                                 const struct sk_buff *skb,
                                 const struct nfnl_info *info,
                                 const struct nlattr * const nla[],
                                 bool reset)
{
        const struct nlattr *table_attr = nla[NFTA_SET_ELEM_LIST_TABLE];
        const struct nlattr *set_attr = nla[NFTA_SET_ELEM_LIST_SET];
        struct netlink_ext_ack *extack = info->extack;
        struct net *net = info->net;
        u8 family = info->nfmsg->nfgen_family;
        u8 genmask = nft_genmask_cur(net);
        struct nft_table *table;
        struct nft_set *set;

        table = nft_table_lookup(net, table_attr, family, genmask, 0);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, table_attr);
                return PTR_ERR(table);
        }

        set = nft_set_lookup(table, set_attr, genmask);
        if (IS_ERR(set)) {
                NL_SET_BAD_ATTR(extack, set_attr);
                return PTR_ERR(set);
        }

        nft_ctx_init(&dump_ctx->ctx, net, skb,
                     info->nlh, family, table, NULL, nla);
        dump_ctx->reset = reset;
        dump_ctx->set = set;

        return 0;
}

/*
 * Handler for set element get requests.  Called with rcu_read_lock held.
 *
 * With NLM_F_DUMP a netlink dump over the whole set is started.  Otherwise
 * every entry nested in NFTA_SET_ELEM_LIST_ELEMENTS is looked up and
 * answered individually; processing stops at the first failing entry, which
 * is reported through the extended ack.
 *
 * Returns -EINVAL if the element list is missing on a non-dump request, a
 * context-init or per-element error, or the result for the last element.
 */
static int nf_tables_getsetelem(struct sk_buff *skb,
                                const struct nfnl_info *info,
                                const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        struct nft_set_dump_ctx dump_ctx;
        struct nlattr *attr;
        int rem, err = 0;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .start = nf_tables_dump_set_start,
                        .dump = nf_tables_dump_set,
                        .done = nf_tables_dump_set_done,
                        .module = THIS_MODULE,
                };

                err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false);
                if (err)
                        return err;

                /* The start hook copies this stack context to the heap. */
                c.data = &dump_ctx;
                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
                return -EINVAL;

        err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false);
        if (err)
                return err;

        nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
                err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, false);
                if (err >= 0)
                        continue;

                NL_SET_BAD_ATTR(extack, attr);
                return err;
        }

        return err;
}

/*
 * Handler for the reset variant of the set element get request.
 *
 * With NLM_F_DUMP a dump is started using nf_tables_dumpreset_set() as the
 * dump callback.  Otherwise each entry nested in NFTA_SET_ELEM_LIST_ELEMENTS
 * is fetched with reset=true under the commit mutex and the number of
 * entries processed is audit-logged.
 *
 * The function is entered and left with the RCU read lock held; it is
 * dropped only around taking and releasing the commit mutex.
 *
 * Returns -EINVAL if the element list is missing or the module reference
 * cannot be taken, otherwise the context-init or per-element result.
 */
static int nf_tables_getsetelem_reset(struct sk_buff *skb,
                                      const struct nfnl_info *info,
                                      const struct nlattr * const nla[])
{
        struct nftables_pernet *nft_net = nft_pernet(info->net);
        struct netlink_ext_ack *extack = info->extack;
        struct nft_set_dump_ctx dump_ctx;
        int rem, err = 0, nelems = 0;
        struct nlattr *attr;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .start = nf_tables_dump_set_start,
                        .dump = nf_tables_dumpreset_set,
                        .done = nf_tables_dump_set_done,
                        .module = THIS_MODULE,
                };

                err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true);
                if (err)
                        return err;

                c.data = &dump_ctx;
                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
                return -EINVAL;

        /* Hold a module reference while the RCU read lock is dropped below. */
        if (!try_module_get(THIS_MODULE))
                return -EINVAL;
        /*
         * Leave the RCU read-side section before taking the commit mutex
         * (presumably because mutex_lock() may sleep), then re-enter it.
         */
        rcu_read_unlock();
        mutex_lock(&nft_net->commit_mutex);
        rcu_read_lock();

        /* Table and set are looked up only after the mutex is held. */
        err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true);
        if (err)
                goto out_unlock;

        nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
                err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, true);
                if (err < 0) {
                        NL_SET_BAD_ATTR(extack, attr);
                        break;
                }
                nelems++;
        }
        /* Logged unconditionally, including when the loop stopped on error. */
        audit_log_nft_set_reset(dump_ctx.ctx.table, nft_net->base_seq, nelems);

out_unlock:
        /* Mirror of the lock sequence above: return with RCU held again. */
        rcu_read_unlock();
        mutex_unlock(&nft_net->commit_mutex);
        rcu_read_lock();
        module_put(THIS_MODULE);

        return err;
}

static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
                                     const struct nft_set *set,
                                     const struct nft_elem_priv *elem_priv,
                                     int event)
{
        struct nftables_pernet *nft_net;
        struct net *net = ctx->net;
        u32 portid = ctx->portid;
        struct sk_buff *skb;
        u16 flags = 0;
        int err;

        if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
                return;

        skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
        if (skb == NULL)
                goto err;

        if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
                flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);

        err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags,
                                          set, elem_priv, false);
        if (err < 0) {
                kfree_skb(skb);
                goto err;
        }

        nft_net = nft_pernet(net);
        nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
        return;
err:
        nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
}

/*
 * Allocate a transaction sized for struct nft_trans_elem and record @set in
 * it.  Returns the transaction, or NULL if allocation failed.
 */
static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
                                              int msg_type,
                                              struct nft_set *set)
{
        struct nft_trans *trans;

        trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem));
        if (trans)
                nft_trans_elem_set(trans) = set;

        return trans;
}

/*
 * Instantiate an expression from @attr for use on elements of @set.
 *
 * Expressions whose type carries NFT_EXPR_GC are rejected with -EOPNOTSUPP
 * (and destroyed) when the set has NFT_SET_TIMEOUT or its backend has no
 * ->gc_init(); otherwise ->gc_init() is invoked on the set.
 *
 * Returns the expression or an ERR_PTR.
 */
struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
                                         const struct nft_set *set,
                                         const struct nlattr *attr)
{
        struct nft_expr *expr = nft_expr_init(ctx, attr);

        if (IS_ERR(expr))
                return expr;

        if (!(expr->ops->type->flags & NFT_EXPR_GC))
                return expr;

        if ((set->flags & NFT_SET_TIMEOUT) || !set->ops->gc_init) {
                nft_expr_destroy(ctx, expr);
                return ERR_PTR(-EOPNOTSUPP);
        }

        set->ops->gc_init(set);

        return expr;
}

/*
 * Check that @len plus the fixed length of extension type @id fits both the
 * space reserved for it in @tmpl and a u8.  Returns 0 if it fits, -1 if not.
 */
static int nft_set_ext_check(const struct nft_set_ext_tmpl *tmpl, u8 id, u32 len)
{
        u32 total = len + nft_set_ext_types[id].len;

        if (total > U8_MAX || total > tmpl->ext_len[id])
                return -1;

        return 0;
}

/*
 * Copy @len bytes from @from into extension storage @to, but only after
 * nft_set_ext_check() accepts the length for extension @id.  Returns 0 on
 * success and -1 (without copying) if the check fails.
 */
static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id,
                              void *to, const void *from, u32 len)
{
        if (nft_set_ext_check(tmpl, id, len) >= 0) {
                memcpy(to, from, len);
                return 0;
        }

        return -1;
}

/*
 * Allocate and initialise a set element laid out according to @tmpl.
 *
 * The key, key end and data extensions present in the template are copied
 * from @key, @key_end and @data.  The expiration extension is set to the
 * current jiffies plus @expiration, with @timeout added when @expiration is
 * zero; the timeout extension stores @timeout.
 *
 * Returns the element, ERR_PTR(-ENOMEM) on allocation failure, or
 * ERR_PTR(-EINVAL) if a copy does not fit its extension slot.
 */
struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set,
                                        const struct nft_set_ext_tmpl *tmpl,
                                        const u32 *key, const u32 *key_end,
                                        const u32 *data,
                                        u64 timeout, u64 expiration, gfp_t gfp)
{
        struct nft_set_ext *ext;
        void *elem;

        elem = kzalloc(set->ops->elemsize + tmpl->len, gfp);
        if (!elem)
                return ERR_PTR(-ENOMEM);

        ext = nft_set_elem_ext(set, elem);
        nft_set_ext_init(ext, tmpl);

        if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY)) {
                if (nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY,
                                       nft_set_ext_key(ext), key,
                                       set->klen) < 0)
                        goto err_ext_check;
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) {
                if (nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY_END,
                                       nft_set_ext_key_end(ext), key_end,
                                       set->klen) < 0)
                        goto err_ext_check;
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) {
                if (nft_set_ext_memcpy(tmpl, NFT_SET_EXT_DATA,
                                       nft_set_ext_data(ext), data,
                                       set->dlen) < 0)
                        goto err_ext_check;
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
                u64 expires = get_jiffies_64() + expiration;

                /* No explicit expiration given: fall back to the timeout. */
                if (expiration == 0)
                        expires += timeout;

                *nft_set_ext_expiration(ext) = expires;
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
                *nft_set_ext_timeout(ext) = timeout;

        return elem;

err_ext_check:
        kfree(elem);

        return ERR_PTR(-EINVAL);
}

/*
 * Destroy one element expression.  If the expression's ops provide
 * ->destroy_clone(), that is called and the owning module's reference is
 * dropped; otherwise nf_tables_expr_destroy() is used.
 */
static void __nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
                                        struct nft_expr *expr)
{
        if (!expr->ops->destroy_clone) {
                nf_tables_expr_destroy(ctx, expr);
                return;
        }

        expr->ops->destroy_clone(ctx, expr);
        module_put(expr->ops->type->owner);
}

static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
                                      struct nft_set_elem_expr *elem_expr)
{
        struct nft_expr *expr;
        u32 size;

        nft_setelem_expr_foreach(expr, elem_expr, size)
                __nft_set_elem_expr_destroy(ctx, expr);
}

/*
 * Release everything an element references and free it.  Called from gc,
 * dynset and abort path.
 *
 * Key and (if present) data are released, expressions are destroyed only
 * when @destroy_expr is set, and the use counter of a referenced object is
 * decremented before the element memory is freed.
 */
void nft_set_elem_destroy(const struct nft_set *set,
                          const struct nft_elem_priv *elem_priv,
                          bool destroy_expr)
{
        struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);

        if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
                nft_data_release(nft_set_ext_data(ext), set->dtype);

        if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) {
                /* Minimal context: only net and family are filled in. */
                struct nft_ctx ctx = {
                        .net        = read_pnet(&set->net),
                        .family        = set->table->family,
                };

                nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext));
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
                nft_use_dec(&(*nft_set_ext_obj(ext))->use);

        kfree(elem_priv);
}
EXPORT_SYMBOL_GPL(nft_set_elem_destroy);

/*
 * Destroy an element whose references were already dropped in the
 * preparation path via nft_setelem_data_deactivate(): only the attached
 * expressions (if any) are destroyed before the memory is freed.
 */
void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
                                const struct nft_set *set,
                                const struct nft_elem_priv *elem_priv)
{
        struct nft_set_ext *ext;

        ext = nft_set_elem_ext(set, elem_priv);
        if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) {
                nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext));
        }

        kfree(elem_priv);
}

/*
 * Clone each of the set's expressions into a freshly allocated copy and
 * store the copies in @expr_array[0 .. set->num_exprs - 1].
 *
 * On failure every clone created so far is destroyed (most recent first)
 * and -ENOMEM is returned, whatever the underlying error was.  Returns 0 on
 * success.
 */
int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
                            struct nft_expr *expr_array[])
{
        struct nft_expr *clone;
        int i;

        for (i = 0; i < set->num_exprs; i++) {
                clone = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL_ACCOUNT);
                if (!clone)
                        goto err_expr;

                if (nft_expr_clone(clone, set->exprs[i],
                                   GFP_KERNEL_ACCOUNT) < 0) {
                        kfree(clone);
                        goto err_expr;
                }

                expr_array[i] = clone;
        }

        return 0;

err_expr:
        /* Unwind the clones at indices i-1 .. 0. */
        while (i-- > 0)
                nft_expr_destroy(ctx, expr_array[i]);

        return -ENOMEM;
}

/*
 * Move the expressions in @expr_array into the element's expression
 * extension.
 *
 * Each entry is cloned into the extension at the current end offset
 * (elem_expr->size); the source expression is then destroyed and its array
 * slot cleared.  The combined size is validated against the template before
 * anything is cloned.
 *
 * Returns 0 on success (or when @num_exprs is 0), -EINVAL if the expressions
 * do not fit the extension, and -ENOMEM if a clone fails.  On clone failure
 * the failing entry and all later ones are destroyed and cleared; on the
 * -EINVAL path @expr_array is left untouched.
 */
static int nft_set_elem_expr_setup(struct nft_ctx *ctx,
                                   const struct nft_set_ext_tmpl *tmpl,
                                   const struct nft_set_ext *ext,
                                   struct nft_expr *expr_array[],
                                   u32 num_exprs)
{
        struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
        u32 len = sizeof(struct nft_set_elem_expr);
        struct nft_expr *expr;
        int i, err;

        if (num_exprs == 0)
                return 0;

        /* Total space needed: extension header plus every expression. */
        for (i = 0; i < num_exprs; i++)
                len += expr_array[i]->ops->size;

        if (nft_set_ext_check(tmpl, NFT_SET_EXT_EXPRESSIONS, len) < 0)
                return -EINVAL;

        for (i = 0; i < num_exprs; i++) {
                /* Next free slot sits right after what was stored so far. */
                expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
                err = nft_expr_clone(expr, expr_array[i], GFP_KERNEL_ACCOUNT);
                if (err < 0)
                        goto err_elem_expr_setup;

                elem_expr->size += expr_array[i]->ops->size;
                /* The extension now holds the clone; drop the source. */
                nft_expr_destroy(ctx, expr_array[i]);
                expr_array[i] = NULL;
        }

        return 0;

err_elem_expr_setup:
        /* Entries before i were already consumed above; release the rest. */
        for (; i < num_exprs; i++) {
                nft_expr_destroy(ctx, expr_array[i]);
                expr_array[i] = NULL;
        }

        return -ENOMEM;
}

/*
 * Return the extension area of the first catchall element of @set that is
 * active in the current generation, not expired and not dead, or NULL if
 * there is none.
 */
struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
                                            const struct nft_set *set)
{
        u8 genmask = nft_genmask_cur(net);
        struct nft_set_elem_catchall *catchall;
        struct nft_set_ext *ext;

        list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);

                if (!nft_set_elem_active(ext, genmask))
                        continue;
                if (nft_set_elem_expired(ext))
                        continue;
                if (nft_set_elem_is_dead(ext))
                        continue;

                return ext;
        }

        return NULL;
}
EXPORT_SYMBOL_GPL(nft_set_catchall_lookup);

/*
 * Add elem->priv to the set's catchall list.
 *
 * If a catchall element active in the next generation already exists, it is
 * handed back through @priv and -EEXIST is returned.  Returns -ENOMEM if the
 * list entry cannot be allocated and 0 on success.
 */
static int nft_setelem_catchall_insert(const struct net *net,
                                       struct nft_set *set,
                                       const struct nft_set_elem *elem,
                                       struct nft_elem_priv **priv)
{
        struct nft_set_elem_catchall *catchall, *entry;
        u8 genmask = nft_genmask_next(net);
        struct nft_set_ext *ext;

        list_for_each_entry(catchall, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);
                if (!nft_set_elem_active(ext, genmask))
                        continue;

                *priv = catchall->elem;
                return -EEXIST;
        }

        entry = kmalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        entry->elem = elem->priv;
        list_add_tail_rcu(&entry->list, &set->catchall_list);

        return 0;
}

/*
 * Insert an element: catchall elements go onto the set's catchall list,
 * everything else is handed to the backend's ->insert().  Returns the
 * result of whichever path was taken.
 */
static int nft_setelem_insert(const struct net *net,
                              struct nft_set *set,
                              const struct nft_set_elem *elem,
                              struct nft_elem_priv **elem_priv,
                              unsigned int flags)
{
        if (flags & NFT_SET_ELEM_CATCHALL)
                return nft_setelem_catchall_insert(net, set, elem, elem_priv);

        return set->ops->insert(net, set, elem, elem_priv);
}

/*
 * Report whether the element carries a flags extension with
 * NFT_SET_ELEM_CATCHALL set.
 */
static bool nft_setelem_is_catchall(const struct nft_set *set,
                                    const struct nft_elem_priv *elem_priv)
{
        struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        if (!nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS))
                return false;

        return (*nft_set_ext_flags(ext) & NFT_SET_ELEM_CATCHALL) != 0;
}

/* Activate @elem_priv: regular elements are activated by the set backend,
 * catch-all elements are handled here via nft_clear() on their extension.
 */
static void nft_setelem_activate(struct net *net, struct nft_set *set,
                                 struct nft_elem_priv *elem_priv)
{
        struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        if (!nft_setelem_is_catchall(set, elem_priv)) {
                set->ops->activate(net, set, elem_priv);
                return;
        }

        nft_clear(net, ext);
}

/* Deactivate the first catch-all element of @set that is active in the
 * next generation.
 *
 * On a match the element passed in via elem->priv is freed and replaced by
 * the element found on the list, and 0 is returned. Returns -ENOENT if no
 * such catch-all element exists.
 */
static int nft_setelem_catchall_deactivate(const struct net *net,
                                           struct nft_set *set,
                                           struct nft_set_elem *elem)
{
        struct nft_set_elem_catchall *entry;
        struct nft_set_ext *ext;

        list_for_each_entry(entry, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, entry->elem);
                if (nft_is_active_next(net, ext)) {
                        /* Swap the caller's element for the stored one. */
                        kfree(elem->priv);
                        elem->priv = entry->elem;
                        nft_set_elem_change_active(net, set, ext);
                        return 0;
                }
        }

        return -ENOENT;
}

static int __nft_setelem_deactivate(const struct net *net,
                                    struct nft_set *set,
                                    struct nft_set_elem *elem)
{
        void *priv;

        priv = set->ops->deactivate(net, set, elem);
        if (!priv)
                return -ENOENT;

        kfree(elem->priv);
        elem->priv = priv;
        set->ndeact++;

        return 0;
}

/* Deactivate @elem in @set, using the catch-all list when @flags has
 * NFT_SET_ELEM_CATCHALL set and the set backend otherwise. The result of
 * the chosen helper is returned unchanged.
 */
static int nft_setelem_deactivate(const struct net *net,
                                  struct nft_set *set,
                                  struct nft_set_elem *elem, u32 flags)
{
        if (flags & NFT_SET_ELEM_CATCHALL)
                return nft_setelem_catchall_deactivate(net, set, elem);

        return __nft_setelem_deactivate(net, set, elem);
}

/* Unlink @catchall from its list and free the list entry via kfree_rcu().
 * Only the nft_set_elem_catchall entry itself is released here; the
 * element it points to (catchall->elem) is not touched by this function.
 */
static void nft_setelem_catchall_destroy(struct nft_set_elem_catchall *catchall)
{
        list_del_rcu(&catchall->list);
        kfree_rcu(catchall, rcu);
}

/* Drop the catch-all list entry of @set that refers to @elem_priv, if any.
 * At most one entry is removed; the walk stops at the first match.
 */
static void nft_setelem_catchall_remove(const struct net *net,
                                        const struct nft_set *set,
                                        struct nft_elem_priv *elem_priv)
{
        struct nft_set_elem_catchall *entry, *tmp;

        list_for_each_entry_safe(entry, tmp, &set->catchall_list, list) {
                if (entry->elem != elem_priv)
                        continue;

                nft_setelem_catchall_destroy(entry);
                return;
        }
}

/* Remove @elem_priv from @set: regular elements are removed by the set
 * backend, catch-all elements are removed from the catch-all list.
 */
static void nft_setelem_remove(const struct net *net,
                               const struct nft_set *set,
                               struct nft_elem_priv *elem_priv)
{
        if (!nft_setelem_is_catchall(set, elem_priv)) {
                set->ops->remove(net, set, elem_priv);
                return;
        }

        nft_setelem_catchall_remove(net, set, elem_priv);
}

/* Check whether the combination of set type, element @flags and the
 * NFTA_SET_ELEM_KEY_END attribute is acceptable.
 *
 * Sets that are not both NFT_SET_CONCAT and NFT_SET_INTERVAL must not carry
 * a key end at all. Sets that are both reject interval-end elements, and
 * reject a key end on catch-all elements.
 */
static bool nft_setelem_valid_key_end(const struct nft_set *set,
                                      struct nlattr **nla, u32 flags)
{
        const u32 concat_interval = NFT_SET_CONCAT | NFT_SET_INTERVAL;

        if ((set->flags & concat_interval) != concat_interval)
                return !nla[NFTA_SET_ELEM_KEY_END];

        if (flags & NFT_SET_ELEM_INTERVAL_END)
                return false;

        if (nla[NFTA_SET_ELEM_KEY_END] && (flags & NFT_SET_ELEM_CATCHALL))
                return false;

        return true;
}

/* Parse one nested set element attribute and queue its addition to @set
 * as a NFT_MSG_NEWSETELEM transaction.
 *
 * @ctx:         transaction context (netns, table, ...)
 * @set:         set that receives the element
 * @attr:        nested netlink attribute describing the element
 * @nlmsg_flags: netlink message flags; with NLM_F_EXCL an equivalent
 *               pre-existing element is reported as -EEXIST
 *
 * Returns 0 on success, and also when an equivalent element already exists
 * and NLM_F_EXCL is not set (the freshly built element is discarded in that
 * case). Returns a negative errno otherwise. Every resource acquired along
 * the way is released through the label chain at the end of the function.
 */
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
                            const struct nlattr *attr, u32 nlmsg_flags)
{
        struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {};
        struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
        u8 genmask = nft_genmask_next(ctx->net);
        u32 flags = 0, size = 0, num_exprs = 0;
        struct nft_set_ext_tmpl tmpl;
        struct nft_set_ext *ext, *ext2;
        struct nft_set_elem elem;
        struct nft_set_binding *binding;
        struct nft_elem_priv *elem_priv;
        struct nft_object *obj = NULL;
        struct nft_userdata *udata;
        struct nft_data_desc desc;
        enum nft_registers dreg;
        struct nft_trans *trans;
        u64 expiration;
        u64 timeout;
        int err, i;
        u8 ulen;

        err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
                                          nft_set_elem_policy, NULL);
        if (err < 0)
                return err;

        nft_set_ext_prepare(&tmpl);

        err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
        if (err < 0)
                return err;

        /* A catch-all element must not carry a key; any other element
         * must have one.
         */
        if (((flags & NFT_SET_ELEM_CATCHALL) && nla[NFTA_SET_ELEM_KEY]) ||
            (!(flags & NFT_SET_ELEM_CATCHALL) && !nla[NFTA_SET_ELEM_KEY]))
                return -EINVAL;

        if (flags != 0) {
                err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
                if (err < 0)
                        return err;
        }

        /* Maps need data on every element except interval ends; plain
         * sets must not carry data.
         */
        if (set->flags & NFT_SET_MAP) {
                if (nla[NFTA_SET_ELEM_DATA] == NULL &&
                    !(flags & NFT_SET_ELEM_INTERVAL_END))
                        return -EINVAL;
        } else {
                if (nla[NFTA_SET_ELEM_DATA] != NULL)
                        return -EINVAL;
        }

        /* Same rule for object references on NFT_SET_OBJECT sets. */
        if (set->flags & NFT_SET_OBJECT) {
                if (!nla[NFTA_SET_ELEM_OBJREF] &&
                    !(flags & NFT_SET_ELEM_INTERVAL_END))
                        return -EINVAL;
        } else {
                if (nla[NFTA_SET_ELEM_OBJREF])
                        return -EINVAL;
        }

        if (!nft_setelem_valid_key_end(set, nla, flags))
                return -EINVAL;

        /* An interval end element may not carry any of these attributes. */
        if ((flags & NFT_SET_ELEM_INTERVAL_END) &&
             (nla[NFTA_SET_ELEM_DATA] ||
              nla[NFTA_SET_ELEM_OBJREF] ||
              nla[NFTA_SET_ELEM_TIMEOUT] ||
              nla[NFTA_SET_ELEM_EXPIRATION] ||
              nla[NFTA_SET_ELEM_USERDATA] ||
              nla[NFTA_SET_ELEM_EXPR] ||
              nla[NFTA_SET_ELEM_KEY_END] ||
              nla[NFTA_SET_ELEM_EXPRESSIONS]))
                return -EINVAL;

        /* Use the explicit per-element timeout if given; otherwise fall
         * back to the set's timeout, but never for interval end elements.
         */
        timeout = 0;
        if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) {
                if (!(set->flags & NFT_SET_TIMEOUT))
                        return -EINVAL;
                err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_TIMEOUT],
                                            &timeout);
                if (err)
                        return err;
        } else if (set->flags & NFT_SET_TIMEOUT &&
                   !(flags & NFT_SET_ELEM_INTERVAL_END)) {
                timeout = READ_ONCE(set->timeout);
        }

        expiration = 0;
        if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) {
                if (!(set->flags & NFT_SET_TIMEOUT))
                        return -EINVAL;
                err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION],
                                            &expiration);
                if (err)
                        return err;
        }

        /* Collect the element's expressions into expr_array[]. They come
         * from a single NFTA_SET_ELEM_EXPR, from an NFTA_SET_ELEM_EXPRESSIONS
         * list, or are cloned from the set. When the set defines expressions,
         * the ones supplied here must match them in number and in ops.
         */
        if (nla[NFTA_SET_ELEM_EXPR]) {
                struct nft_expr *expr;

                if (set->num_exprs && set->num_exprs != 1)
                        return -EOPNOTSUPP;

                expr = nft_set_elem_expr_alloc(ctx, set,
                                               nla[NFTA_SET_ELEM_EXPR]);
                if (IS_ERR(expr))
                        return PTR_ERR(expr);

                expr_array[0] = expr;
                num_exprs = 1;

                if (set->num_exprs && set->exprs[0]->ops != expr->ops) {
                        err = -EOPNOTSUPP;
                        goto err_set_elem_expr;
                }
        } else if (nla[NFTA_SET_ELEM_EXPRESSIONS]) {
                struct nft_expr *expr;
                struct nlattr *tmp;
                int left;

                i = 0;
                nla_for_each_nested(tmp, nla[NFTA_SET_ELEM_EXPRESSIONS], left) {
                        /* Reject more expressions than the array holds or
                         * than the set itself defines.
                         */
                        if (i == NFT_SET_EXPR_MAX ||
                            (set->num_exprs && set->num_exprs == i)) {
                                err = -E2BIG;
                                goto err_set_elem_expr;
                        }
                        if (nla_type(tmp) != NFTA_LIST_ELEM) {
                                err = -EINVAL;
                                goto err_set_elem_expr;
                        }
                        expr = nft_set_elem_expr_alloc(ctx, set, tmp);
                        if (IS_ERR(expr)) {
                                err = PTR_ERR(expr);
                                goto err_set_elem_expr;
                        }
                        /* Record the expression before the ops check so the
                         * error path below destroys it as well.
                         */
                        expr_array[i] = expr;
                        num_exprs++;

                        if (set->num_exprs && expr->ops != set->exprs[i]->ops) {
                                err = -EOPNOTSUPP;
                                goto err_set_elem_expr;
                        }
                        i++;
                }
                if (set->num_exprs && set->num_exprs != i) {
                        err = -EOPNOTSUPP;
                        goto err_set_elem_expr;
                }
        } else if (set->num_exprs > 0 &&
                   !(flags & NFT_SET_ELEM_INTERVAL_END)) {
                err = nft_set_elem_expr_clone(ctx, set, expr_array);
                if (err < 0)
                        goto err_set_elem_expr_clone;

                num_exprs = set->num_exprs;
        }

        if (nla[NFTA_SET_ELEM_KEY]) {
                err = nft_setelem_parse_key(ctx, set, &elem.key.val,
                                            nla[NFTA_SET_ELEM_KEY]);
                if (err < 0)
                        goto err_set_elem_expr;

                err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
                if (err < 0)
                        goto err_parse_key;
        }

        if (nla[NFTA_SET_ELEM_KEY_END]) {
                err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
                                            nla[NFTA_SET_ELEM_KEY_END]);
                if (err < 0)
                        goto err_parse_key;

                err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
                if (err < 0)
                        goto err_parse_key_end;
        }

        if (timeout > 0) {
                err = nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION);
                if (err < 0)
                        goto err_parse_key_end;

                /* A per-element timeout extension is only reserved when
                 * the value differs from the set's timeout.
                 */
                if (timeout != READ_ONCE(set->timeout)) {
                        err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
                        if (err < 0)
                                goto err_parse_key_end;
                }
        }

        if (num_exprs) {
                /* Reserve room for the header plus every expression. */
                for (i = 0; i < num_exprs; i++)
                        size += expr_array[i]->ops->size;

                err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS,
                                             sizeof(struct nft_set_elem_expr) + size);
                if (err < 0)
                        goto err_parse_key_end;
        }

        if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
                obj = nft_obj_lookup(ctx->net, ctx->table,
                                     nla[NFTA_SET_ELEM_OBJREF],
                                     set->objtype, genmask);
                /* obj is reset to NULL on failure so that the unwind code
                 * at err_parse_key_end does not drop a use count that was
                 * never taken.
                 */
                if (IS_ERR(obj)) {
                        err = PTR_ERR(obj);
                        obj = NULL;
                        goto err_parse_key_end;
                }

                if (!nft_use_inc(&obj->use)) {
                        err = -EMFILE;
                        obj = NULL;
                        goto err_parse_key_end;
                }

                err = nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
                if (err < 0)
                        goto err_parse_key_end;
        }

        if (nla[NFTA_SET_ELEM_DATA] != NULL) {
                err = nft_setelem_parse_data(ctx, set, &desc, &elem.data.val,
                                             nla[NFTA_SET_ELEM_DATA]);
                if (err < 0)
                        goto err_parse_key_end;

                /* Validate the data as a register store in the context of
                 * every map binding of this set.
                 */
                dreg = nft_type_to_reg(set->dtype);
                list_for_each_entry(binding, &set->bindings, list) {
                        struct nft_ctx bind_ctx = {
                                .net        = ctx->net,
                                .family        = ctx->family,
                                .table        = ctx->table,
                                .chain        = (struct nft_chain *)binding->chain,
                        };

                        if (!(binding->flags & NFT_SET_MAP))
                                continue;

                        err = nft_validate_register_store(&bind_ctx, dreg,
                                                          &elem.data.val,
                                                          desc.type, desc.len);
                        if (err < 0)
                                goto err_parse_data;

                        /* A jump/goto verdict in a bound map flags the
                         * table as needing validation.
                         */
                        if (desc.type == NFT_DATA_VERDICT &&
                            (elem.data.val.verdict.code == NFT_GOTO ||
                             elem.data.val.verdict.code == NFT_JUMP))
                                nft_validate_state_update(ctx->table,
                                                          NFT_VALIDATE_NEED);
                }

                err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len);
                if (err < 0)
                        goto err_parse_data;
        }

        /* The full maximum length of userdata can exceed the maximum
         * offset value (U8_MAX) for following extensions, therefore it
         * must be the last extension added.
         */
        /* NOTE(review): ulen is a u8, so an attribute length above 255
         * would be truncated by the assignment below; confirm what bound
         * the netlink policy places on NFTA_SET_ELEM_USERDATA.
         */
        ulen = 0;
        if (nla[NFTA_SET_ELEM_USERDATA] != NULL) {
                ulen = nla_len(nla[NFTA_SET_ELEM_USERDATA]);
                if (ulen > 0) {
                        err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA,
                                                     ulen);
                        if (err < 0)
                                goto err_parse_data;
                }
        }

        /* Allocate the element according to the template built above. */
        elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
                                      elem.key_end.val.data, elem.data.val.data,
                                      timeout, expiration, GFP_KERNEL_ACCOUNT);
        if (IS_ERR(elem.priv)) {
                err = PTR_ERR(elem.priv);
                goto err_parse_data;
        }

        ext = nft_set_elem_ext(set, elem.priv);
        if (flags)
                *nft_set_ext_flags(ext) = flags;

        if (obj)
                *nft_set_ext_obj(ext) = obj;

        if (ulen > 0) {
                if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) {
                        err = -EINVAL;
                        goto err_elem_free;
                }
                udata = nft_set_ext_userdata(ext);
                /* The stored length field holds the byte count minus one. */
                udata->len = ulen - 1;
                nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen);
        }
        err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs);
        if (err < 0)
                goto err_elem_free;

        trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
        if (trans == NULL) {
                err = -ENOMEM;
                goto err_elem_free;
        }

        /* NOTE(review): presumably marks the new element as not yet active
         * in the current generation; confirm against the genmask helpers.
         */
        ext->genmask = nft_genmask_cur(ctx->net);

        err = nft_setelem_insert(ctx->net, set, &elem, &elem_priv, flags);
        if (err) {
                if (err == -EEXIST) {
                        /* elem_priv is the element already in the set. It
                         * clashes with the new one if only one of them has
                         * data or an object reference, or if both have one
                         * and the values differ. Otherwise it is a
                         * duplicate, which is an error only with NLM_F_EXCL.
                         */
                        ext2 = nft_set_elem_ext(set, elem_priv);
                        if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^
                            nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) ||
                            nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^
                            nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF))
                                goto err_element_clash;
                        if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
                             nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
                             memcmp(nft_set_ext_data(ext),
                                    nft_set_ext_data(ext2), set->dlen) != 0) ||
                            (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
                             nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) &&
                             *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2)))
                                goto err_element_clash;
                        else if (!(nlmsg_flags & NLM_F_EXCL))
                                err = 0;
                } else if (err == -ENOTEMPTY) {
                        /* ENOTEMPTY reports overlapping between this element
                         * and an existing one.
                         */
                        err = -EEXIST;
                }
                /* In every case the element built here was not inserted
                 * and is released below, even when err was reset to 0.
                 */
                goto err_element_clash;
        }

        /* Catch-all elements are not counted in set->nelems. For sized
         * sets the cap is set->size plus the elements pending deactivation
         * (set->ndeact).
         */
        if (!(flags & NFT_SET_ELEM_CATCHALL)) {
                unsigned int max = set->size ? set->size + set->ndeact : UINT_MAX;

                if (!atomic_add_unless(&set->nelems, 1, max)) {
                        err = -ENFILE;
                        goto err_set_full;
                }
        }

        nft_trans_elem_priv(trans) = elem.priv;
        nft_trans_commit_list_add_tail(ctx->net, trans);
        return 0;

        /* Unwind in reverse order of acquisition; each label releases what
         * was set up before the corresponding failure point.
         */
err_set_full:
        nft_setelem_remove(ctx->net, set, elem.priv);
err_element_clash:
        kfree(trans);
err_elem_free:
        nf_tables_set_elem_destroy(ctx, set, elem.priv);
err_parse_data:
        if (nla[NFTA_SET_ELEM_DATA] != NULL)
                nft_data_release(&elem.data.val, desc.type);
err_parse_key_end:
        if (obj)
                nft_use_dec_restore(&obj->use);

        nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
err_parse_key:
        nft_data_release(&elem.key.val, NFT_DATA_VALUE);
err_set_elem_expr:
        /* Destroy the expressions still referenced from expr_array[]. */
        for (i = 0; i < num_exprs && expr_array[i]; i++)
                nft_expr_destroy(ctx, expr_array[i]);
err_set_elem_expr_clone:
        return err;
}

/* Netlink handler that adds the elements listed in
 * NFTA_SET_ELEM_LIST_ELEMENTS to a set.
 *
 * Looks up the table and the set, refuses bound constant or anonymous sets
 * with -EBUSY, then adds the elements one by one. The first failing element
 * is reported through the extended ack and aborts the request. Afterwards
 * the table is validated if its validate_state asks for it.
 */
static int nf_tables_newsetelem(struct sk_buff *skb,
                                const struct nfnl_info *info,
                                const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        const struct nlattr *elem_attr;
        struct nft_table *table;
        struct nft_set *set;
        struct nft_ctx ctx;
        int remaining, err;

        if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
                return -EINVAL;

        table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
                                 genmask, NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
                return PTR_ERR(table);
        }

        set = nft_set_lookup_global(net, table, nla[NFTA_SET_ELEM_LIST_SET],
                                    nla[NFTA_SET_ELEM_LIST_SET_ID], genmask);
        if (IS_ERR(set)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
                return PTR_ERR(set);
        }

        /* Bound sets that are constant or anonymous cannot be extended. */
        if (!list_empty(&set->bindings) &&
            (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS)))
                return -EBUSY;

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        nla_for_each_nested(elem_attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS],
                            remaining) {
                err = nft_add_set_elem(&ctx, set, elem_attr,
                                       info->nlh->nlmsg_flags);
                if (err >= 0)
                        continue;

                NL_SET_BAD_ATTR(extack, elem_attr);
                return err;
        }

        if (table->validate_state != NFT_VALIDATE_DO)
                return 0;

        return nft_table_validate(net, table);
}

/**
 *        nft_data_hold - hold a nft_data item
 *
 *        @data: struct nft_data to hold
 *        @type: type of data
 *
 *        Hold a nft_data item. NFT_DATA_VALUE types can be silently discarded,
 *        NFT_DATA_VERDICT bumps the reference to chains in case of NFT_JUMP and
 *        NFT_GOTO verdicts. This function must be called on active data objects
 *        from the second phase of the commit protocol.
 */
void nft_data_hold(const struct nft_data *data, enum nft_data_types type)
{
        /* Only verdicts can reference a chain. */
        if (type != NFT_DATA_VERDICT)
                return;

        if (data->verdict.code == NFT_JUMP ||
            data->verdict.code == NFT_GOTO)
                nft_use_inc_restore(&data->verdict.chain->use);
}

/* Report whether @elem_priv is active in the next generation, as decided
 * by nft_set_elem_active() for the next-generation mask.
 */
static int nft_setelem_active_next(const struct net *net,
                                   const struct nft_set *set,
                                   struct nft_elem_priv *elem_priv)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        return nft_set_elem_active(ext, nft_genmask_next(net));
}

static void nft_setelem_data_activate(const struct net *net,
                                      const struct nft_set *set,
                                      struct nft_elem_priv *elem_priv)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
                nft_data_hold(nft_set_ext_data(ext), set->dtype);
        if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
                nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use);
}

void nft_setelem_data_deactivate(const struct net *net,
                                 const struct nft_set *set,
                                 struct nft_elem_priv *elem_priv)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
                nft_data_release(nft_set_ext_data(ext), set->dtype);
        if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
                nft_use_dec(&(*nft_set_ext_obj(ext))->use);
}

/* Parse one nested set element attribute and queue the deletion of the
 * matching element of @set as a NFT_MSG_DELSETELEM transaction.
 *
 * A temporary element is built from the key (and key end) in @attr and
 * handed to nft_setelem_deactivate(). On success the deactivate helpers
 * have replaced elem.priv with the element stored in the set, and that
 * element is what gets attached to the transaction.
 *
 * Returns 0 on success or a negative errno.
 */
static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
                           const struct nlattr *attr)
{
        struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
        struct nft_set_ext_tmpl tmpl;
        struct nft_set_elem elem;
        struct nft_set_ext *ext;
        struct nft_trans *trans;
        u32 flags = 0;
        int err;

        err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
                                          nft_set_elem_policy, NULL);
        if (err < 0)
                return err;

        err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
        if (err < 0)
                return err;

        /* Only catch-all elements may be addressed without a key. */
        if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
                return -EINVAL;

        if (!nft_setelem_valid_key_end(set, nla, flags))
                return -EINVAL;

        nft_set_ext_prepare(&tmpl);

        if (flags != 0) {
                err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
                if (err < 0)
                        return err;
        }

        if (nla[NFTA_SET_ELEM_KEY]) {
                err = nft_setelem_parse_key(ctx, set, &elem.key.val,
                                            nla[NFTA_SET_ELEM_KEY]);
                if (err < 0)
                        return err;

                err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
                if (err < 0)
                        goto fail_elem;
        }

        if (nla[NFTA_SET_ELEM_KEY_END]) {
                err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
                                            nla[NFTA_SET_ELEM_KEY_END]);
                if (err < 0)
                        goto fail_elem;

                err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
                if (err < 0)
                        goto fail_elem_key_end;
        }

        /* Preset err for the nft_trans_elem_alloc() failure below, which
         * jumps to fail_trans without assigning an error code itself.
         */
        err = -ENOMEM;
        elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
                                      elem.key_end.val.data, NULL, 0, 0,
                                      GFP_KERNEL_ACCOUNT);
        if (IS_ERR(elem.priv)) {
                err = PTR_ERR(elem.priv);
                goto fail_elem_key_end;
        }

        ext = nft_set_elem_ext(set, elem.priv);
        if (flags)
                *nft_set_ext_flags(ext) = flags;

        trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
        if (trans == NULL)
                goto fail_trans;

        /* On success the temporary element has been freed and elem.priv
         * now points at the element that lives in the set.
         */
        err = nft_setelem_deactivate(ctx->net, set, &elem, flags);
        if (err < 0)
                goto fail_ops;

        nft_setelem_data_deactivate(ctx->net, set, elem.priv);

        nft_trans_elem_priv(trans) = elem.priv;
        nft_trans_commit_list_add_tail(ctx->net, trans);
        return 0;

        /* Unwind in reverse order of acquisition. */
fail_ops:
        kfree(trans);
fail_trans:
        kfree(elem.priv);
fail_elem_key_end:
        nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
fail_elem:
        nft_data_release(&elem.key.val, NFT_DATA_VALUE);
        return err;
}

static int nft_setelem_flush(const struct nft_ctx *ctx,
                             struct nft_set *set,
                             const struct nft_set_iter *iter,
                             struct nft_elem_priv *elem_priv)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
        struct nft_trans *trans;

        if (!nft_set_elem_active(ext, iter->genmask))
                return 0;

        trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
                                    sizeof(struct nft_trans_elem), GFP_ATOMIC);
        if (!trans)
                return -ENOMEM;

        set->ops->flush(ctx->net, set, elem_priv);
        set->ndeact++;

        nft_setelem_data_deactivate(ctx->net, set, elem_priv);
        nft_trans_elem_set(trans) = set;
        nft_trans_elem_priv(trans) = elem_priv;
        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;
}

static int __nft_set_catchall_flush(const struct nft_ctx *ctx,
                                    struct nft_set *set,
                                    struct nft_elem_priv *elem_priv)
{
        struct nft_trans *trans;

        trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
                                    sizeof(struct nft_trans_elem), GFP_KERNEL);
        if (!trans)
                return -ENOMEM;

        nft_setelem_data_deactivate(ctx->net, set, elem_priv);
        nft_trans_elem_set(trans) = set;
        nft_trans_elem_priv(trans) = elem_priv;
        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;
}

/* Queue the deletion of every catch-all element of @set that is active in
 * the next generation, and flip the active state of each queued element.
 *
 * Stops at the first element whose transaction cannot be queued and
 * returns that error; returns 0 otherwise.
 */
static int nft_set_catchall_flush(const struct nft_ctx *ctx,
                                  struct nft_set *set)
{
        u8 genmask = nft_genmask_next(ctx->net);
        struct nft_set_elem_catchall *catchall;
        struct nft_set_ext *ext;
        int ret = 0;

        /* __nft_set_catchall_flush() allocates with GFP_KERNEL, so this
         * walk cannot be inside an RCU read-side critical section. Pass
         * the commit mutex as the lockdep condition so RCU list debugging
         * checks the protection that is actually expected here instead of
         * reporting a false positive.
         */
        list_for_each_entry_rcu(catchall, &set->catchall_list, list,
                                lockdep_commit_lock_is_held(ctx->net)) {
                ext = nft_set_elem_ext(set, catchall->elem);
                if (!nft_set_elem_active(ext, genmask))
                        continue;

                ret = __nft_set_catchall_flush(ctx, set, catchall->elem);
                if (ret < 0)
                        break;
                nft_set_elem_change_active(ctx->net, set, ext);
        }

        return ret;
}

/* Flush @set: walk the backend with nft_setelem_flush() as callback and,
 * if the walk reported no error, flush the catch-all elements as well.
 * Returns the first error encountered, or 0.
 */
static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask)
{
        struct nft_set_iter iter = {
                .genmask = genmask,
                .type = NFT_ITER_UPDATE,
                .fn = nft_setelem_flush,
        };

        set->ops->walk(ctx, set, &iter);
        if (iter.err)
                return iter.err;

        return nft_set_catchall_flush(ctx, set);
}

/* Netlink handler that deletes elements from a set.
 *
 * Anonymous sets are rejected with -EOPNOTSUPP, bound constant sets with
 * -EBUSY. Without NFTA_SET_ELEM_LIST_ELEMENTS the whole set is flushed.
 * Otherwise the listed elements are deleted one by one; a missing element
 * (-ENOENT) is ignored when the message type is NFT_MSG_DESTROYSETELEM,
 * any other failure is reported through the extended ack and aborts the
 * request.
 */
static int nf_tables_delsetelem(struct sk_buff *skb,
                                const struct nfnl_info *info,
                                const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        const struct nlattr *elem_attr;
        struct nft_table *table;
        struct nft_set *set;
        struct nft_ctx ctx;
        int remaining, err = 0;

        table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
                                 genmask, NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
                return PTR_ERR(table);
        }

        set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
        if (IS_ERR(set)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
                return PTR_ERR(set);
        }

        if (nft_set_is_anonymous(set))
                return -EOPNOTSUPP;

        if (!list_empty(&set->bindings) && (set->flags & NFT_SET_CONSTANT))
                return -EBUSY;

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        /* No element list means "delete everything". */
        if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
                return nft_set_flush(&ctx, set, genmask);

        nla_for_each_nested(elem_attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS],
                            remaining) {
                err = nft_del_setelem(&ctx, set, elem_attr);
                if (err >= 0)
                        continue;

                /* Destroy requests tolerate elements that do not exist. */
                if (err == -ENOENT &&
                    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM)
                        continue;

                NL_SET_BAD_ATTR(extack, elem_attr);
                return err;
        }

        return 0;
}

/*
 * Stateful objects
 */

/**
 *        nft_register_obj - register nf_tables stateful object type
 *        @obj_type: object type
 *
 *        Registers the object type for use with nf_tables. Returns zero on
 *        success or a negative errno code otherwise (-EINVAL if the type is
 *        NFT_OBJECT_UNSPEC).
 */
int nft_register_obj(struct nft_object_type *obj_type)
{
        if (obj_type->type == NFT_OBJECT_UNSPEC)
                return -EINVAL;

        /* The list of object types is modified under the nftables
         * subsystem lock.
         */
        nfnl_lock(NFNL_SUBSYS_NFTABLES);
        list_add_rcu(&obj_type->list, &nf_tables_objects);
        nfnl_unlock(NFNL_SUBSYS_NFTABLES);
        return 0;
}
EXPORT_SYMBOL_GPL(nft_register_obj);

/**
 *        nft_unregister_obj - unregister nf_tables object type
 *        @obj_type: object type
 *
 *        Unregisters the object type for use with nf_tables by removing it
 *        from the list of object types under the nftables subsystem lock.
 */
void nft_unregister_obj(struct nft_object_type *obj_type)
{
        nfnl_lock(NFNL_SUBSYS_NFTABLES);
        list_del_rcu(&obj_type->list);
        nfnl_unlock(NFNL_SUBSYS_NFTABLES);
}
EXPORT_SYMBOL_GPL(nft_unregister_obj);

/* Look up a stateful object by name.
 *
 * The name is taken from @nla and looked up, together with @table, in the
 * object name hash table. The first entry whose type equals @objtype and
 * which is active for @genmask is returned; ERR_PTR(-ENOENT) is returned
 * if there is none. A warning is emitted once if the caller holds neither
 * the RCU read lock nor the commit lock.
 */
struct nft_object *nft_obj_lookup(const struct net *net,
                                  const struct nft_table *table,
                                  const struct nlattr *nla, u32 objtype,
                                  u8 genmask)
{
        struct nft_object_hash_key key = { .table = table };
        char name[NFT_OBJ_MAXNAMELEN];
        struct rhlist_head *pos, *head;
        struct nft_object *obj;

        nla_strscpy(name, nla, sizeof(name));
        key.name = name;

        WARN_ON_ONCE(!rcu_read_lock_held() &&
                     !lockdep_commit_lock_is_held(net));

        rcu_read_lock();
        head = rhltable_lookup(&nft_objname_ht, &key, nft_objname_ht_params);
        if (head) {
                rhl_for_each_entry_rcu(obj, pos, head, rhlhead) {
                        if (objtype != obj->ops->type->type)
                                continue;
                        if (!nft_active_genmask(obj, genmask))
                                continue;

                        rcu_read_unlock();
                        return obj;
                }
        }
        rcu_read_unlock();

        return ERR_PTR(-ENOENT);
}
EXPORT_SYMBOL_GPL(nft_obj_lookup);

/*
 * Look up a stateful object in @table by its handle.
 *
 * @nla carries the big-endian 64-bit handle. It is decoded once up front
 * rather than on every list iteration, since the value cannot change while
 * walking the list. An object matches when its handle and type agree and it
 * is active in @genmask.
 *
 * Returns the object, or ERR_PTR(-ENOENT) if nothing matches.
 */
static struct nft_object *nft_obj_lookup_byhandle(const struct nft_table *table,
                                                  const struct nlattr *nla,
                                                  u32 objtype, u8 genmask)
{
        const u64 handle = be64_to_cpu(nla_get_be64(nla));
        struct nft_object *obj;

        list_for_each_entry(obj, &table->objects, list) {
                if (handle == obj->handle &&
                    objtype == obj->ops->type->type &&
                    nft_active_genmask(obj, genmask))
                        return obj;
        }
        return ERR_PTR(-ENOENT);
}

/*
 * Netlink attribute policy for NFTA_OBJ_* attributes: table and object
 * names are length-limited strings, the type is a 32-bit value, the
 * type-specific data is a nested attribute, the handle is a 64-bit value
 * and userdata is a length-limited binary blob.
 */
static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
        [NFTA_OBJ_TABLE]        = { .type = NLA_STRING,
                                    .len = NFT_TABLE_MAXNAMELEN - 1 },
        [NFTA_OBJ_NAME]                = { .type = NLA_STRING,
                                    .len = NFT_OBJ_MAXNAMELEN - 1 },
        [NFTA_OBJ_TYPE]                = { .type = NLA_U32 },
        [NFTA_OBJ_DATA]                = { .type = NLA_NESTED },
        [NFTA_OBJ_HANDLE]        = { .type = NLA_U64},
        [NFTA_OBJ_USERDATA]        = { .type = NLA_BINARY,
                                    .len = NFT_USERDATA_MAXLEN },
};

/*
 * Allocate and initialise a stateful object of the given @type.
 *
 * @attr, when present, is parsed against the type's policy into a temporary
 * attribute table; without it the table is zeroed. The object ops come from
 * type->select_ops() if the type provides one, else from type->ops. The
 * object is allocated with room for ops->size bytes of private data and
 * handed to ops->init().
 *
 * Returns the new object or an ERR_PTR() on failure. The temporary
 * attribute table is always released before returning.
 */
static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
                                       const struct nft_object_type *type,
                                       const struct nlattr *attr)
{
        const struct nft_object_ops *ops;
        struct nft_object *obj;
        struct nlattr **tb;
        int err;

        tb = kmalloc_array(type->maxattr + 1, sizeof(*tb), GFP_KERNEL);
        if (!tb)
                return ERR_PTR(-ENOMEM);

        if (!attr) {
                memset(tb, 0, sizeof(tb[0]) * (type->maxattr + 1));
        } else {
                err = nla_parse_nested_deprecated(tb, type->maxattr, attr,
                                                  type->policy, NULL);
                if (err < 0)
                        goto free_tb;
        }

        if (!type->select_ops) {
                ops = type->ops;
        } else {
                ops = type->select_ops(ctx, (const struct nlattr * const *)tb);
                if (IS_ERR(ops)) {
                        err = PTR_ERR(ops);
                        goto free_tb;
                }
        }

        obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL_ACCOUNT);
        if (!obj) {
                err = -ENOMEM;
                goto free_tb;
        }

        err = ops->init(ctx, (const struct nlattr * const *)tb, obj);
        if (err < 0) {
                kfree(obj);
                goto free_tb;
        }

        /* ops is only attached once init has succeeded. */
        obj->ops = ops;

        kfree(tb);
        return obj;

free_tb:
        kfree(tb);
        return ERR_PTR(err);
}

/*
 * Emit the type-specific state of @obj into @skb as a nested attribute of
 * type @attr, delegating the contents to obj->ops->dump().
 *
 * Returns 0 on success, -1 if the nest could not be opened or the dump
 * callback failed.
 */
static int nft_object_dump(struct sk_buff *skb, unsigned int attr,
                           struct nft_object *obj, bool reset)
{
        struct nlattr *nest = nla_nest_start_noflag(skb, attr);

        if (!nest || obj->ops->dump(skb, obj, reset) < 0)
                return -1;

        nla_nest_end(skb, nest);
        return 0;
}

/*
 * Find a registered object type by numeric type and family. Types
 * registered for NFPROTO_UNSPEC match any family.
 *
 * Returns the type, or NULL if none is registered.
 */
static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family)
{
        const struct nft_object_type *type;

        list_for_each_entry_rcu(type, &nf_tables_objects, list) {
                if ((type->family == NFPROTO_UNSPEC ||
                     type->family == family) &&
                    objtype == type->type)
                        return type;
        }

        return NULL;
}

/*
 * Look up an object type and take a reference on its owning module.
 *
 * Returns the type with a module reference held on success. If the type is
 * not registered and CONFIG_MODULES is enabled, a module load is requested
 * and -EAGAIN is propagated when nft_request_module() returns it. In every
 * other failure case ERR_PTR(-ENOENT) is returned.
 */
static const struct nft_object_type *
nft_obj_type_get(struct net *net, u32 objtype, u8 family)
{
        const struct nft_object_type *type;
        bool pinned;

        rcu_read_lock();
        type = __nft_obj_type_get(objtype, family);
        pinned = type != NULL && try_module_get(type->owner);
        rcu_read_unlock();

        if (pinned)
                return type;

        lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
        if (type == NULL &&
            nft_request_module(net, "nft-obj-%u", objtype) == -EAGAIN)
                return ERR_PTR(-EAGAIN);
#endif
        return ERR_PTR(-ENOENT);
}

/*
 * Queue an update of the existing object @obj.
 *
 * A module reference on @type is taken, a NFT_MSG_NEWOBJ transaction is
 * allocated and a fresh object is built from @attr. The transaction records
 * both the existing and the new object, is flagged as an update and is
 * appended to the commit list.
 *
 * Returns 0 on success or a negative errno; on failure the module reference
 * and the transaction are released.
 */
static int nf_tables_updobj(const struct nft_ctx *ctx,
                            const struct nft_object_type *type,
                            const struct nlattr *attr,
                            struct nft_object *obj)
{
        struct nft_object *newobj;
        struct nft_trans *trans;
        int err;

        if (!try_module_get(type->owner))
                return -ENOENT;

        trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ,
                                sizeof(struct nft_trans_obj));
        if (!trans) {
                err = -ENOMEM;
                goto put_module;
        }

        newobj = nft_obj_init(ctx, type, attr);
        if (IS_ERR(newobj)) {
                err = PTR_ERR(newobj);
                kfree(trans);
                goto put_module;
        }

        nft_trans_obj(trans) = obj;
        nft_trans_obj_update(trans) = true;
        nft_trans_obj_newobj(trans) = newobj;
        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;

put_module:
        module_put(type->owner);
        return err;
}

/*
 * Netlink handler that creates a stateful object, or queues an update when
 * an object with the same name and type already exists in the table.
 *
 * NFTA_OBJ_TYPE, NFTA_OBJ_NAME and NFTA_OBJ_DATA are mandatory.
 *
 * Existing object: NLM_F_EXCL yields -EEXIST, NLM_F_REPLACE yields
 * -EOPNOTSUPP, an object type without an ->update callback is a silent
 * no-op, otherwise the update is queued via nf_tables_updobj().
 *
 * New object: the table use counter is bumped, the type is resolved (taking
 * a module reference), the object is initialised, named, optionally given
 * userdata, added to the transaction log, inserted into the name hash table
 * and finally linked into the table's object list.
 *
 * Returns 0 on success or a negative errno.
 */
static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
                            const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        const struct nft_object_type *type;
        struct net *net = info->net;
        struct nft_table *table;
        struct nft_object *obj;
        struct nft_ctx ctx;
        u32 objtype;
        int err;

        if (!nla[NFTA_OBJ_TYPE] ||
            !nla[NFTA_OBJ_NAME] ||
            !nla[NFTA_OBJ_DATA])
                return -EINVAL;

        table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask,
                                 NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
                return PTR_ERR(table);
        }

        objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
        obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask);
        if (IS_ERR(obj)) {
                /* -ENOENT simply means "create it"; anything else is fatal. */
                err = PTR_ERR(obj);
                if (err != -ENOENT) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
                        return err;
                }
        } else {
                if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
                        return -EEXIST;
                }
                if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
                        return -EOPNOTSUPP;

                type = __nft_obj_type_get(objtype, family);
                if (WARN_ON_ONCE(!type))
                        return -ENOENT;

                if (!obj->ops->update)
                        return 0;

                nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

                return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj);
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        if (!nft_use_inc(&table->use))
                return -EMFILE;

        type = nft_obj_type_get(net, objtype, family);
        if (IS_ERR(type)) {
                err = PTR_ERR(type);
                goto err_type;
        }

        obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]);
        if (IS_ERR(obj)) {
                err = PTR_ERR(obj);
                goto err_init;
        }
        obj->key.table = table;
        obj->handle = nf_tables_alloc_handle(table);

        obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL_ACCOUNT);
        if (!obj->key.name) {
                err = -ENOMEM;
                goto err_strdup;
        }

        if (nla[NFTA_OBJ_USERDATA]) {
                obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL_ACCOUNT);
                if (obj->udata == NULL) {
                        /*
                         * Without this, err would still hold the -ENOENT
                         * left over from the lookup above and the allocation
                         * failure would be misreported to userspace.
                         */
                        err = -ENOMEM;
                        goto err_userdata;
                }

                obj->udlen = nla_len(nla[NFTA_OBJ_USERDATA]);
        }

        err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj);
        if (err < 0)
                goto err_trans;

        err = rhltable_insert(&nft_objname_ht, &obj->rhlhead,
                              nft_objname_ht_params);
        if (err < 0)
                goto err_obj_ht;

        list_add_tail_rcu(&obj->list, &table->objects);

        return 0;
err_obj_ht:
        /* queued in transaction log */
        INIT_LIST_HEAD(&obj->list);
        return err;
err_trans:
        kfree(obj->udata);
err_userdata:
        kfree(obj->key.name);
err_strdup:
        if (obj->ops->destroy)
                obj->ops->destroy(&ctx, obj);
        kfree(obj);
err_init:
        module_put(type->owner);
err_type:
        nft_use_dec_restore(&table->use);

        return err;
}

/*
 * Build one netlink message describing @obj into @skb.
 *
 * @event is the plain NFT_MSG_* value. Every message carries the table
 * name, object name, handle and type. For NFT_MSG_DELOBJ the message ends
 * there. All other events additionally carry the use counter, the
 * type-specific data (optionally resetting it when @reset is true) and the
 * userdata blob if the object has one.
 *
 * Returns 0 on success; on failure the partial message is trimmed from
 * @skb and -1 is returned.
 */
static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
                                   u32 portid, u32 seq, int event, u32 flags,
                                   int family, const struct nft_table *table,
                                   struct nft_object *obj, bool reset)
{
        struct nlmsghdr *nlh;

        /*
         * Do not overwrite @event with the nfnl_msg_type() result: the
         * combined value also encodes the subsystem and would never compare
         * equal to NFT_MSG_DELOBJ below, leaving the short delete form dead.
         */
        nlh = nfnl_msg_put(skb, portid, seq,
                           nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
                           flags, family, NFNETLINK_V0, nft_base_seq(net));
        if (!nlh)
                goto nla_put_failure;

        /*
         * The type is part of the identifying attributes: name lookups in
         * this file match on name and type together, so it is kept in
         * delete notifications as well.
         */
        if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) ||
            nla_put_string(skb, NFTA_OBJ_NAME, obj->key.name) ||
            nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle),
                         NFTA_OBJ_PAD) ||
            nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)))
                goto nla_put_failure;

        if (event == NFT_MSG_DELOBJ) {
                nlmsg_end(skb, nlh);
                return 0;
        }

        if (nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
            nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset))
                goto nla_put_failure;

        if (obj->udata &&
            nla_put(skb, NFTA_OBJ_USERDATA, obj->udlen, obj->udata))
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, nlh);
        return -1;
}

/*
 * Emit an AUDIT_NFT_OP_OBJ_RESET audit record for @nentries objects of
 * @table. The record name is "<table name>:<base_seq>". Both the name
 * buffer and the audit record are allocated with GFP_ATOMIC.
 */
static void audit_log_obj_reset(const struct nft_table *table,
                                unsigned int base_seq, unsigned int nentries)
{
        char *name;

        name = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq);
        audit_log_nfcfg(name, table->family, nentries,
                        AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);
        kfree(name);
}

/*
 * Per-dump state for object dumps. It is overlaid on the netlink
 * callback's ctx scratch area (cb->ctx) by the dump callbacks.
 */
struct nft_obj_dump_ctx {
        /* index of the first object to emit when the dump resumes */
        unsigned int        s_idx;
        /* optional table name filter, allocated by the dump start callback */
        char                *table;
        /* optional object type filter; NFT_OBJECT_UNSPEC disables it */
        u32                type;
        /* true when the request is NFT_MSG_GETOBJ_RESET */
        bool                reset;
};

/*
 * Netlink dump callback for stateful objects.
 *
 * Walks every table of the requested family (all families for
 * NFPROTO_UNSPEC) and every object in it under RCU. Objects are emitted
 * when they are active, at or past the resume index, and match the
 * optional table-name and type filters held in the dump context. A running
 * index is kept across tables so that the dump can resume where the skb
 * filled up. For reset dumps, one audit record per table is logged with
 * the number of objects emitted.
 *
 * Returns skb->len.
 */
static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
        const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
        struct net *net = sock_net(skb->sk);
        int family = nfmsg->nfgen_family;
        struct nftables_pernet *nft_net;
        const struct nft_table *table;
        unsigned int entries = 0;
        unsigned int idx = 0;
        struct nft_object *obj;
        int rc = 0;

        rcu_read_lock();
        nft_net = nft_pernet(net);
        cb->seq = READ_ONCE(nft_net->base_seq);

        list_for_each_entry_rcu(table, &nft_net->tables, list) {
                if (family != NFPROTO_UNSPEC && family != table->family)
                        continue;

                entries = 0;
                list_for_each_entry_rcu(obj, &table->objects, list) {
                        /* Skipped objects still advance the index. */
                        if (nft_is_active(net, obj) &&
                            idx >= ctx->s_idx &&
                            (!ctx->table ||
                             !strcmp(ctx->table, table->name)) &&
                            (ctx->type == NFT_OBJECT_UNSPEC ||
                             obj->ops->type->type == ctx->type)) {
                                rc = nf_tables_fill_obj_info(skb, net,
                                                             NETLINK_CB(cb->skb).portid,
                                                             cb->nlh->nlmsg_seq,
                                                             NFT_MSG_NEWOBJ,
                                                             NLM_F_MULTI | NLM_F_APPEND,
                                                             table->family, table,
                                                             obj, ctx->reset);
                                /* Stop without advancing so this object is retried. */
                                if (rc < 0)
                                        break;

                                entries++;
                                nl_dump_check_consistent(cb, nlmsg_hdr(skb));
                        }
                        idx++;
                }

                if (ctx->reset && entries)
                        audit_log_obj_reset(table, nft_net->base_seq, entries);

                if (rc < 0)
                        break;
        }
        rcu_read_unlock();

        ctx->s_idx = idx;
        return skb->len;
}

/*
 * Dump start callback: fill the dump context from the request attributes
 * passed in cb->data. The table name filter is duplicated, the type filter
 * is converted to host byte order, and the reset flag is set for
 * NFT_MSG_GETOBJ_RESET requests.
 *
 * Returns 0, or -ENOMEM if the table name cannot be duplicated.
 */
static int nf_tables_dump_obj_start(struct netlink_callback *cb)
{
        const struct nlattr * const *nla = cb->data;
        struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
        const struct nlattr *attr;

        BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));

        attr = nla[NFTA_OBJ_TABLE];
        if (attr) {
                ctx->table = nla_strdup(attr, GFP_ATOMIC);
                if (!ctx->table)
                        return -ENOMEM;
        }

        attr = nla[NFTA_OBJ_TYPE];
        if (attr)
                ctx->type = ntohl(nla_get_be32(attr));

        if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
                ctx->reset = true;

        return 0;
}

/*
 * Dump done callback: free the table name filter held in the dump
 * context. Always returns 0.
 */
static int nf_tables_dump_obj_done(struct netlink_callback *cb)
{
        struct nft_obj_dump_ctx *dump_ctx = (void *)cb->ctx;
        char *table_name = dump_ctx->table;

        kfree(table_name);

        return 0;
}

/*
 * Netlink handler for object get requests; called with rcu_read_lock held.
 *
 * With NLM_F_DUMP a dump is started. Otherwise NFTA_OBJ_NAME and
 * NFTA_OBJ_TYPE are required: the object is looked up in the current
 * generation, described in a freshly allocated skb and unicast back to the
 * requester. For NFT_MSG_GETOBJ_RESET an audit record is logged and the
 * reset flag is passed on when the object is dumped.
 *
 * Returns the unicast result on success or a negative errno.
 */
static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
                            const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_cur(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        const struct nft_table *table;
        struct nft_object *obj;
        struct sk_buff *reply;
        bool reset = false;
        u32 objtype;
        int err;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .start = nf_tables_dump_obj_start,
                        .dump = nf_tables_dump_obj,
                        .done = nf_tables_dump_obj_done,
                        .module = THIS_MODULE,
                        .data = (void *)nla,
                };

                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        if (!nla[NFTA_OBJ_NAME] || !nla[NFTA_OBJ_TYPE])
                return -EINVAL;

        table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
                return PTR_ERR(table);
        }

        objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
        obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask);
        if (IS_ERR(obj)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
                return PTR_ERR(obj);
        }

        reply = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!reply)
                return -ENOMEM;

        if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
                reset = true;

        if (reset) {
                const struct nftables_pernet *nft_net = nft_pernet(net);
                char *name;

                /* Audit record name is "<table name>:<base_seq>". */
                name = kasprintf(GFP_ATOMIC, "%s:%u", table->name,
                                 nft_net->base_seq);
                audit_log_nfcfg(name, family, 1, AUDIT_NFT_OP_OBJ_RESET,
                                GFP_ATOMIC);
                kfree(name);
        }

        err = nf_tables_fill_obj_info(reply, net, NETLINK_CB(skb).portid,
                                      info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
                                      family, table, obj, reset);
        if (err < 0) {
                kfree_skb(reply);
                return err;
        }

        return nfnetlink_unicast(reply, net, NETLINK_CB(skb).portid);
}

/*
 * Release a stateful object: run the optional type-specific destroy
 * callback, drop the reference on the module owning the object type, then
 * free the name, the userdata and the object itself.
 */
static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
{
        /* ->destroy is optional */
        if (obj->ops->destroy)
                obj->ops->destroy(ctx, obj);

        /* obj->ops is still dereferenced here, so obj must be freed last */
        module_put(obj->ops->type->owner);
        kfree(obj->key.name);
        kfree(obj->udata);
        kfree(obj);
}

/*
 * Netlink handler that deletes a stateful object.
 *
 * NFTA_OBJ_TYPE is required, together with either NFTA_OBJ_HANDLE or
 * NFTA_OBJ_NAME; the handle takes precedence when both are present. A
 * missing object is not an error for NFT_MSG_DESTROYOBJ requests. Objects
 * with a non-zero use counter are refused with -EBUSY.
 *
 * Returns the result of nft_delobj() or a negative errno.
 */
static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info,
                            const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        const struct nlattr *attr;
        struct nft_table *table;
        struct nft_object *obj;
        struct nft_ctx ctx;
        u32 objtype;
        int err;

        if (!nla[NFTA_OBJ_TYPE])
                return -EINVAL;
        if (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE])
                return -EINVAL;

        table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask,
                                 NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
                return PTR_ERR(table);
        }

        objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));

        /* attr remembers which attribute to blame in extack on failure. */
        attr = nla[NFTA_OBJ_HANDLE];
        if (attr) {
                obj = nft_obj_lookup_byhandle(table, attr, objtype, genmask);
        } else {
                attr = nla[NFTA_OBJ_NAME];
                obj = nft_obj_lookup(net, table, attr, objtype, genmask);
        }

        if (IS_ERR(obj)) {
                err = PTR_ERR(obj);
                if (err == -ENOENT &&
                    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYOBJ)
                        return 0;

                NL_SET_BAD_ATTR(extack, attr);
                return err;
        }

        if (obj->use > 0) {
                NL_SET_BAD_ATTR(extack, attr);
                return -EBUSY;
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        return nft_delobj(&ctx, obj);
}

/*
 * Build an object notification and queue it on the per-netns notify list.
 *
 * Nothing is done when no report was requested and nobody listens on
 * NFNLGRP_NFTABLES. Only the NLM_F_CREATE and NLM_F_EXCL bits of @flags are
 * carried into the message. If the skb cannot be allocated or filled,
 * -ENOBUFS is signalled through nfnetlink_set_err().
 */
static void
__nft_obj_notify(struct net *net, const struct nft_table *table,
                 struct nft_object *obj, u32 portid, u32 seq, int event,
                 u16 flags, int family, int report, gfp_t gfp)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct sk_buff *skb;
        int err;

        if (!report &&
            !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
                return;

        skb = nlmsg_new(NLMSG_GOODSIZE, gfp);
        if (skb != NULL) {
                err = nf_tables_fill_obj_info(skb, net, portid, seq, event,
                                              flags & (NLM_F_CREATE | NLM_F_EXCL),
                                              family, table, obj, false);
                if (err >= 0) {
                        nft_notify_enqueue(skb, report, &nft_net->notify_list);
                        return;
                }

                kfree_skb(skb);
        }

        nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
}

/*
 * Log an audit record for an object event and then send the netlink
 * notification. The audit record is named "<table name>:<base_seq>" and
 * carries the object handle. NFT_MSG_NEWOBJ is logged as a register
 * operation, any other event as an unregister operation.
 */
void nft_obj_notify(struct net *net, const struct nft_table *table,
                    struct nft_object *obj, u32 portid, u32 seq, int event,
                    u16 flags, int family, int report, gfp_t gfp)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        char *name;

        name = kasprintf(gfp, "%s:%u", table->name, nft_net->base_seq);
        audit_log_nfcfg(name, family, obj->handle,
                        event == NFT_MSG_NEWOBJ ?
                                 AUDIT_NFT_OP_OBJ_REGISTER :
                                 AUDIT_NFT_OP_OBJ_UNREGISTER,
                        gfp);
        kfree(name);

        __nft_obj_notify(net, table, obj, portid, seq, event,
                         flags, family, report, gfp);
}
EXPORT_SYMBOL_GPL(nft_obj_notify);

/*
 * Send an object notification using the parameters stored in @ctx. Unlike
 * nft_obj_notify(), no audit record is emitted here, and the allocation
 * mode is always GFP_KERNEL.
 */
static void nf_tables_obj_notify(const struct nft_ctx *ctx,
                                 struct nft_object *obj, int event)
{
        struct net *net = ctx->net;

        __nft_obj_notify(net, ctx->table, obj,
                         ctx->portid, ctx->seq, event,
                         ctx->flags, ctx->family, ctx->report,
                         GFP_KERNEL);
}

/*
 * Flow tables
 */
/*
 * Register a flowtable type by appending it to nf_tables_flowtables while
 * holding the nfnetlink subsystem lock.
 */
void nft_register_flowtable_type(struct nf_flowtable_type *type)
{
        struct list_head *entry = &type->list;

        nfnl_lock(NFNL_SUBSYS_NFTABLES);
        list_add_tail_rcu(entry, &nf_tables_flowtables);
        nfnl_unlock(NFNL_SUBSYS_NFTABLES);
}
EXPORT_SYMBOL_GPL(nft_register_flowtable_type);

/*
 * Unregister a flowtable type by unlinking it from the list of flowtable
 * types while holding the nfnetlink subsystem lock.
 */
void nft_unregister_flowtable_type(struct nf_flowtable_type *type)
{
        struct list_head *entry = &type->list;

        nfnl_lock(NFNL_SUBSYS_NFTABLES);
        list_del_rcu(entry);
        nfnl_unlock(NFNL_SUBSYS_NFTABLES);
}
EXPORT_SYMBOL_GPL(nft_unregister_flowtable_type);

/*
 * Netlink attribute policy for NFTA_FLOWTABLE_* attributes: table and
 * flowtable names are length-limited strings, the hook description is a
 * nested attribute, the handle is a 64-bit value and the flags are a
 * 32-bit value.
 */
static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = {
        [NFTA_FLOWTABLE_TABLE]                = { .type = NLA_STRING,
                                            .len = NFT_NAME_MAXLEN - 1 },
        [NFTA_FLOWTABLE_NAME]                = { .type = NLA_STRING,
                                            .len = NFT_NAME_MAXLEN - 1 },
        [NFTA_FLOWTABLE_HOOK]                = { .type = NLA_NESTED },
        [NFTA_FLOWTABLE_HANDLE]                = { .type = NLA_U64 },
        [NFTA_FLOWTABLE_FLAGS]                = { .type = NLA_U32 },
};

/*
 * Find a flowtable in @table whose name equals the string in @nla and
 * which is active in @genmask.
 *
 * Returns the flowtable, or ERR_PTR(-ENOENT) if nothing matches.
 */
struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
                                           const struct nlattr *nla, u8 genmask)
{
        struct nft_flowtable *ft;

        list_for_each_entry_rcu(ft, &table->flowtables, list) {
                if (nla_strcmp(nla, ft->name))
                        continue;
                if (nft_active_genmask(ft, genmask))
                        return ft;
        }

        return ERR_PTR(-ENOENT);
}
EXPORT_SYMBOL_GPL(nft_flowtable_lookup);

/*
 * Drop one use reference on @flowtable when @phase is one of
 * NFT_TRANS_PREPARE_ERROR, NFT_TRANS_PREPARE, NFT_TRANS_ABORT or
 * NFT_TRANS_RELEASE. Any other phase is a no-op.
 */
void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx,
                                    struct nft_flowtable *flowtable,
                                    enum nft_trans_phase phase)
{
        if (phase == NFT_TRANS_PREPARE_ERROR ||
            phase == NFT_TRANS_PREPARE ||
            phase == NFT_TRANS_ABORT ||
            phase == NFT_TRANS_RELEASE)
                nft_use_dec(&flowtable->use);
}
EXPORT_SYMBOL_GPL(nf_tables_deactivate_flowtable);

/*
 * Look up a flowtable in @table by its handle.
 *
 * @nla carries the big-endian 64-bit handle. It is decoded once up front
 * rather than on every list iteration, since the value cannot change while
 * walking the list. A flowtable matches when its handle agrees and it is
 * active in @genmask.
 *
 * Returns the flowtable, or ERR_PTR(-ENOENT) if nothing matches.
 */
static struct nft_flowtable *
nft_flowtable_lookup_byhandle(const struct nft_table *table,
                              const struct nlattr *nla, u8 genmask)
{
        const u64 handle = be64_to_cpu(nla_get_be64(nla));
        struct nft_flowtable *flowtable;

        list_for_each_entry(flowtable, &table->flowtables, list) {
                if (handle == flowtable->handle &&
                    nft_active_genmask(flowtable, genmask))
                        return flowtable;
        }
        return ERR_PTR(-ENOENT);
}

/*
 * Parsed form of a NFTA_FLOWTABLE_HOOK attribute, filled in by
 * nft_flowtable_parse_hook().
 */
struct nft_flowtable_hook {
        /* hook number */
        u32                        num;
        /* hook priority */
        int                        priority;
        /* list of struct nft_hook entries parsed from the device list */
        struct list_head        list;
};

/*
 * Netlink attribute policy for the attributes nested inside
 * NFTA_FLOWTABLE_HOOK: hook number and priority are 32-bit values, the
 * device list is a nested attribute.
 */
static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = {
        [NFTA_FLOWTABLE_HOOK_NUM]        = { .type = NLA_U32 },
        [NFTA_FLOWTABLE_HOOK_PRIORITY]        = { .type = NLA_U32 },
        [NFTA_FLOWTABLE_HOOK_DEVS]        = { .type = NLA_NESTED },
};

/*
 * Parse the NFTA_FLOWTABLE_HOOK attribute into @flowtable_hook.
 *
 * When @add is true (new flowtable), hook number and priority are
 * mandatory and the hook number must be NF_NETDEV_INGRESS. When @add is
 * false (update), both are optional but, if given, must equal the values
 * already stored in @flowtable, which are then reused.
 *
 * If a device list is present it is parsed into flowtable_hook->list, and
 * every resulting hook gets its nf_hook_ops filled in from the parsed
 * hook number and priority and from @flowtable.
 *
 * Returns a negative errno on failure; otherwise the result of the last
 * parsing step.
 */
static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,
                                    const struct nlattr * const nla[],
                                    struct nft_flowtable_hook *flowtable_hook,
                                    struct nft_flowtable *flowtable,
                                    struct netlink_ext_ack *extack, bool add)
{
        struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1];
        struct nft_hook *hook;
        int hooknum, priority;
        int err;

        INIT_LIST_HEAD(&flowtable_hook->list);

        err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX,
                                          nla[NFTA_FLOWTABLE_HOOK],
                                          nft_flowtable_hook_policy, NULL);
        if (err < 0)
                return err;

        if (!add) {
                /* Update: hook number and priority cannot be changed. */
                if (tb[NFTA_FLOWTABLE_HOOK_NUM]) {
                        hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM]));
                        if (hooknum != flowtable->hooknum)
                                return -EOPNOTSUPP;
                }

                if (tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) {
                        priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY]));
                        if (priority != flowtable->data.priority)
                                return -EOPNOTSUPP;
                }

                flowtable_hook->priority        = flowtable->data.priority;
                flowtable_hook->num                = flowtable->hooknum;
        } else {
                /* Creation: both attributes must be supplied. */
                if (!tb[NFTA_FLOWTABLE_HOOK_NUM] ||
                    !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
                        return -ENOENT;
                }

                hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM]));
                if (hooknum != NF_NETDEV_INGRESS)
                        return -EOPNOTSUPP;

                priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY]));

                flowtable_hook->priority        = priority;
                flowtable_hook->num                = hooknum;
        }

        if (tb[NFTA_FLOWTABLE_HOOK_DEVS]) {
                err = nf_tables_parse_netdev_hooks(ctx->net,
                                                   tb[NFTA_FLOWTABLE_HOOK_DEVS],
                                                   &flowtable_hook->list,
                                                   extack);
                if (err < 0)
                        return err;
        }

        list_for_each_entry(hook, &flowtable_hook->list, list) {
                hook->ops.pf                = NFPROTO_NETDEV;
                hook->ops.hooknum        = flowtable_hook->num;
                hook->ops.priority        = flowtable_hook->priority;
                hook->ops.priv                = &flowtable->data;
                hook->ops.hook                = flowtable->data.type->hook;
        }

        return err;
}

/*
 * Find the registered flowtable type for @family, or NULL if there is
 * none. Call under rcu_read_lock.
 */
static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family)
{
        const struct nf_flowtable_type *type;

        list_for_each_entry_rcu(type, &nf_tables_flowtables, list) {
                if (type->family != family)
                        continue;

                return type;
        }

        return NULL;
}

/*
 * Look up the flowtable type for @family and take a reference on its
 * owning module.
 *
 * Returns the type with a module reference held on success. If the type is
 * not registered and CONFIG_MODULES is enabled, a module load is requested
 * and -EAGAIN is propagated when nft_request_module() returns it. In every
 * other failure case ERR_PTR(-ENOENT) is returned.
 */
static const struct nf_flowtable_type *
nft_flowtable_type_get(struct net *net, u8 family)
{
        const struct nf_flowtable_type *type;
        bool pinned;

        rcu_read_lock();
        type = __nft_flowtable_type_get(family);
        pinned = type != NULL && try_module_get(type->owner);
        rcu_read_unlock();

        if (pinned)
                return type;

        lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
        if (type == NULL &&
            nft_request_module(net, "nf-flowtable-%u", family) == -EAGAIN)
                return ERR_PTR(-EAGAIN);
#endif
        return ERR_PTR(-ENOENT);
}

/*
 * Tear down a single flowtable hook: unregister its netfilter hook and then
 * ask the flowtable type to unbind the flow block from the hook's device.
 *
 * Only called from error and netdev event paths.
 */
static void nft_unregister_flowtable_hook(struct net *net,
                                          struct nft_flowtable *flowtable,
                                          struct nft_hook *hook)
{
        nf_unregister_net_hook(net, &hook->ops);
        /* mirror of the FLOW_BLOCK_BIND setup call made at registration */
        flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
                                    FLOW_BLOCK_UNBIND);
}

/*
 * Unregister the netfilter hook of every entry on @hook_list. When
 * @release_netdev is true each entry is additionally unlinked from the
 * list and freed via kfree_rcu().
 */
static void __nft_unregister_flowtable_net_hooks(struct net *net,
                                                 struct list_head *hook_list,
                                                 bool release_netdev)
{
        struct nft_hook *hook, *tmp;

        list_for_each_entry_safe(hook, tmp, hook_list, list) {
                nf_unregister_net_hook(net, &hook->ops);
                if (!release_netdev)
                        continue;

                list_del(&hook->list);
                kfree_rcu(hook, rcu);
        }
}

static void nft_unregister_flowtable_net_hooks(struct net *net,
                                               struct list_head *hook_list)
{
        __nft_unregister_flowtable_net_hooks(net, hook_list, false);
}

/*
 * Bind and register every hook on @hook_list for @flowtable.
 *
 * Before a hook is registered, all flowtables of @table that are active in
 * the next generation are scanned; if one of them already has a hook on
 * the same device and protocol family, -EEXIST is returned. Each hook is
 * first bound through the flowtable type's setup() callback and then
 * registered as a netfilter hook.
 *
 * On failure, the hooks registered so far (the leading entries of the
 * list) are unregistered, unlinked and freed.
 *
 * Returns 0 on success or a negative errno.
 */
static int nft_register_flowtable_net_hooks(struct net *net,
                                            struct nft_table *table,
                                            struct list_head *hook_list,
                                            struct nft_flowtable *flowtable)
{
        struct nft_hook *hook, *other, *tmp;
        struct nft_flowtable *ft;
        int registered = 0;
        int err;

        list_for_each_entry(hook, hook_list, list) {
                list_for_each_entry(ft, &table->flowtables, list) {
                        if (!nft_is_active_next(net, ft))
                                continue;

                        list_for_each_entry(other, &ft->hook_list, list) {
                                if (hook->ops.dev != other->ops.dev ||
                                    hook->ops.pf != other->ops.pf)
                                        continue;

                                err = -EEXIST;
                                goto unwind;
                        }
                }

                err = flowtable->data.type->setup(&flowtable->data,
                                                  hook->ops.dev,
                                                  FLOW_BLOCK_BIND);
                if (err < 0)
                        goto unwind;

                err = nf_register_net_hook(net, &hook->ops);
                if (err < 0) {
                        /* Undo the bind of the hook that failed to register. */
                        flowtable->data.type->setup(&flowtable->data,
                                                    hook->ops.dev,
                                                    FLOW_BLOCK_UNBIND);
                        goto unwind;
                }

                registered++;
        }

        return 0;

unwind:
        /* Only the first 'registered' entries were fully set up. */
        list_for_each_entry_safe(hook, tmp, hook_list, list) {
                if (registered <= 0)
                        break;
                registered--;

                nft_unregister_flowtable_hook(net, flowtable, hook);
                list_del_rcu(&hook->list);
                kfree_rcu(hook, rcu);
        }

        return err;
}

/*
 * Unlink every hook on @hook_list and free it via kfree_rcu(). No
 * netfilter hook is unregistered here.
 */
static void nft_hooks_destroy(struct list_head *hook_list)
{
        struct nft_hook *entry, *tmp;

        list_for_each_entry_safe(entry, tmp, hook_list, list) {
                list_del_rcu(&entry->list);
                kfree_rcu(entry, rcu);
        }
}

/*
 * Update an already existing flowtable from a NFT_MSG_NEWFLOWTABLE request:
 * parse the hooks in the request, discard those the flowtable already has,
 * register the remaining ones and queue an update transaction carrying them.
 *
 * Returns 0 on success or a negative errno. On failure every hook that was
 * still on the local parse list is released (and unregistered first when
 * registration had already succeeded).
 */
static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
                                struct nft_flowtable *flowtable,
                                struct netlink_ext_ack *extack)
{
        const struct nlattr * const *nla = ctx->nla;
        struct nft_flowtable_hook flowtable_hook;
        struct nft_hook *hook, *next;
        struct nft_trans *trans;
        bool unregister = false;
        u32 flags;
        int err;

        err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable,
                                       extack, false);
        if (err < 0)
                return err;

        /*
         * Drop parsed hooks that nft_hook_list_find() already locates on the
         * flowtable. These entries only live on the local parse list, hence
         * the plain list_del()/kfree().
         */
        list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {
                if (nft_hook_list_find(&flowtable->hook_list, hook)) {
                        list_del(&hook->list);
                        kfree(hook);
                }
        }

        if (nla[NFTA_FLOWTABLE_FLAGS]) {
                flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
                if (flags & ~NFT_FLOWTABLE_MASK) {
                        err = -EOPNOTSUPP;
                        goto err_flowtable_update_hook;
                }
                /* Toggling NFT_FLOWTABLE_HW_OFFLOAD on an update is rejected. */
                if ((flowtable->data.flags & NFT_FLOWTABLE_HW_OFFLOAD) ^
                    (flags & NFT_FLOWTABLE_HW_OFFLOAD)) {
                        err = -EOPNOTSUPP;
                        goto err_flowtable_update_hook;
                }
        } else {
                /* No flags attribute: carry the current flags over. */
                flags = flowtable->data.flags;
        }

        err = nft_register_flowtable_net_hooks(ctx->net, ctx->table,
                                               &flowtable_hook.list, flowtable);
        if (err < 0)
                goto err_flowtable_update_hook;

        trans = nft_trans_alloc(ctx, NFT_MSG_NEWFLOWTABLE,
                                sizeof(struct nft_trans_flowtable));
        if (!trans) {
                /* Hooks are registered at this point; undo that on the way out. */
                unregister = true;
                err = -ENOMEM;
                goto err_flowtable_update_hook;
        }

        nft_trans_flowtable_flags(trans) = flags;
        nft_trans_flowtable(trans) = flowtable;
        nft_trans_flowtable_update(trans) = true;
        INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
        /* The transaction takes over the list of newly registered hooks. */
        list_splice(&flowtable_hook.list, &nft_trans_flowtable_hooks(trans));

        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;

err_flowtable_update_hook:
        /* Release whatever is still on the local parse list. */
        list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {
                if (unregister)
                        nft_unregister_flowtable_hook(ctx->net, flowtable, hook);
                list_del_rcu(&hook->list);
                kfree_rcu(hook, rcu);
        }

        return err;

}

/*
 * NFT_MSG_NEWFLOWTABLE handler.
 *
 * Requires the table, name and hook attributes. If a flowtable with that name
 * already exists the request is treated as an update (or rejected with
 * -EEXIST when NLM_F_EXCL is set). Otherwise a new flowtable is allocated,
 * initialised through its type, its hooks are parsed and registered, and it
 * is linked into the table's flowtable list.
 *
 * Returns 0 on success or a negative errno; on failure everything acquired so
 * far is unwound through the label ladder at the end of the function.
 */
static int nf_tables_newflowtable(struct sk_buff *skb,
                                  const struct nfnl_info *info,
                                  const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        struct nft_flowtable_hook flowtable_hook;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        const struct nf_flowtable_type *type;
        struct nft_flowtable *flowtable;
        struct net *net = info->net;
        struct nft_table *table;
        struct nft_trans *trans;
        struct nft_ctx ctx;
        int err;

        if (!nla[NFTA_FLOWTABLE_TABLE] ||
            !nla[NFTA_FLOWTABLE_NAME] ||
            !nla[NFTA_FLOWTABLE_HOOK])
                return -EINVAL;

        table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
                                 genmask, NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
                return PTR_ERR(table);
        }

        flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
                                         genmask);
        if (IS_ERR(flowtable)) {
                /* -ENOENT means "create it"; any other error is fatal. */
                err = PTR_ERR(flowtable);
                if (err != -ENOENT) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
                        return err;
                }
        } else {
                /* Flowtable exists: exclusive create fails, otherwise update. */
                if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
                        return -EEXIST;
                }

                nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

                return nft_flowtable_update(&ctx, info->nlh, flowtable, extack);
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        /* Account the new flowtable against the table's use counter. */
        if (!nft_use_inc(&table->use))
                return -EMFILE;

        flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL_ACCOUNT);
        if (!flowtable) {
                err = -ENOMEM;
                goto flowtable_alloc;
        }

        flowtable->table = table;
        flowtable->handle = nf_tables_alloc_handle(table);
        INIT_LIST_HEAD(&flowtable->hook_list);

        flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL_ACCOUNT);
        if (!flowtable->name) {
                err = -ENOMEM;
                goto err1;
        }

        /* Released with module_put(type->owner) on the error path below. */
        type = nft_flowtable_type_get(net, family);
        if (IS_ERR(type)) {
                err = PTR_ERR(type);
                goto err2;
        }

        if (nla[NFTA_FLOWTABLE_FLAGS]) {
                flowtable->data.flags =
                        ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
                if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK) {
                        err = -EOPNOTSUPP;
                        goto err3;
                }
        }

        write_pnet(&flowtable->data.net, net);
        flowtable->data.type = type;
        err = type->init(&flowtable->data);
        if (err < 0)
                goto err3;

        err = nft_flowtable_parse_hook(&ctx, nla, &flowtable_hook, flowtable,
                                       extack, true);
        if (err < 0)
                goto err_flowtable_parse_hooks;

        /* The flowtable now owns the parsed hooks. */
        list_splice(&flowtable_hook.list, &flowtable->hook_list);
        flowtable->data.priority = flowtable_hook.priority;
        flowtable->hooknum = flowtable_hook.num;

        trans = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto err_flowtable_trans;
        }

        /* This must be LAST to ensure no packets are walking over this flowtable. */
        err = nft_register_flowtable_net_hooks(ctx.net, table,
                                               &flowtable->hook_list,
                                               flowtable);
        if (err < 0)
                goto err_flowtable_hooks;

        list_add_tail_rcu(&flowtable->list, &table->flowtables);

        return 0;

        /* Unwind in reverse order of acquisition; each label falls through. */
err_flowtable_hooks:
        nft_trans_destroy(trans);
err_flowtable_trans:
        nft_hooks_destroy(&flowtable->hook_list);
err_flowtable_parse_hooks:
        flowtable->data.type->free(&flowtable->data);
err3:
        module_put(type->owner);
err2:
        kfree(flowtable->name);
err1:
        kfree(flowtable);
flowtable_alloc:
        nft_use_dec_restore(&table->use);

        return err;
}

/* Unlink and free every hook still linked on @flowtable_hook->list. */
static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook)
{
        struct nft_hook *cur, *tmp;

        list_for_each_entry_safe(cur, tmp, &flowtable_hook->list, list) {
                list_del(&cur->list);
                kfree(cur);
        }
}

/*
 * Remove individual hooks from an existing flowtable.
 *
 * The hooks named in the request are parsed into a temporary list; each one
 * must match a hook currently on the flowtable, otherwise -ENOENT is
 * returned. Matching hooks are moved onto a private deletion list that is
 * handed to a NFT_MSG_DELFLOWTABLE update transaction.
 *
 * Returns 0 on success or a negative errno. On failure the hooks moved so far
 * are spliced back onto the flowtable's hook list.
 */
static int nft_delflowtable_hook(struct nft_ctx *ctx,
                                 struct nft_flowtable *flowtable,
                                 struct netlink_ext_ack *extack)
{
        const struct nlattr * const *nla = ctx->nla;
        struct nft_flowtable_hook flowtable_hook;
        LIST_HEAD(flowtable_del_list);
        struct nft_hook *this, *hook;
        struct nft_trans *trans;
        int err;

        err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable,
                                       extack, false);
        if (err < 0)
                return err;

        /* Pull every requested hook off the flowtable onto the deletion list. */
        list_for_each_entry(this, &flowtable_hook.list, list) {
                hook = nft_hook_list_find(&flowtable->hook_list, this);
                if (!hook) {
                        err = -ENOENT;
                        goto err_flowtable_del_hook;
                }
                list_move(&hook->list, &flowtable_del_list);
        }

        trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE,
                                sizeof(struct nft_trans_flowtable));
        if (!trans) {
                err = -ENOMEM;
                goto err_flowtable_del_hook;
        }

        nft_trans_flowtable(trans) = flowtable;
        nft_trans_flowtable_update(trans) = true;
        INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
        list_splice(&flowtable_del_list, &nft_trans_flowtable_hooks(trans));
        /* The parsed copies were only needed for matching. */
        nft_flowtable_hook_release(&flowtable_hook);

        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;

err_flowtable_del_hook:
        /* Put the hooks moved so far back where they came from. */
        list_splice(&flowtable_del_list, &flowtable->hook_list);
        nft_flowtable_hook_release(&flowtable_hook);

        return err;
}

static int nf_tables_delflowtable(struct sk_buff *skb,
                                  const struct nfnl_info *info,
                                  const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct nft_flowtable *flowtable;
        struct net *net = info->net;
        const struct nlattr *attr;
        struct nft_table *table;
        struct nft_ctx ctx;

        if (!nla[NFTA_FLOWTABLE_TABLE] ||
            (!nla[NFTA_FLOWTABLE_NAME] &&
             !nla[NFTA_FLOWTABLE_HANDLE]))
                return -EINVAL;

        table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
                                 genmask, NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
                return PTR_ERR(table);
        }

        if (nla[NFTA_FLOWTABLE_HANDLE]) {
                attr = nla[NFTA_FLOWTABLE_HANDLE];
                flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask);
        } else {
                attr = nla[NFTA_FLOWTABLE_NAME];
                flowtable = nft_flowtable_lookup(table, attr, genmask);
        }

        if (IS_ERR(flowtable)) {
                if (PTR_ERR(flowtable) == -ENOENT &&
                    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYFLOWTABLE)
                        return 0;

                NL_SET_BAD_ATTR(extack, attr);
                return PTR_ERR(flowtable);
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        if (nla[NFTA_FLOWTABLE_HOOK])
                return nft_delflowtable_hook(&ctx, flowtable, extack);

        if (flowtable->use > 0) {
                NL_SET_BAD_ATTR(extack, attr);
                return -EBUSY;
        }

        return nft_delflowtable(&ctx, flowtable);
}

/*
 * Fill @skb with a netlink message describing @flowtable.
 *
 * @event is the bare NFT_MSG_* type; the nfnetlink subsystem id is folded in
 * only for the message header. @hook_list, when non-NULL, replaces the
 * flowtable's own hook list as the source of the device names that are dumped.
 *
 * For whole-flowtable deletion events (DEL/DESTROY with no @hook_list) only
 * the table name, flowtable name and handle are emitted.
 *
 * Returns 0 on success, -1 if the message did not fit (the partially built
 * message is trimmed from @skb).
 */
static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
                                         u32 portid, u32 seq, int event,
                                         u32 flags, int family,
                                         struct nft_flowtable *flowtable,
                                         struct list_head *hook_list)
{
        struct nlattr *nest, *nest_devs;
        struct nft_hook *hook;
        struct nlmsghdr *nlh;
        int msg_type;

        /*
         * Keep @event untouched: it is compared against NFT_MSG_* constants
         * below, which would never match once the subsystem id is mixed in.
         */
        msg_type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
        nlh = nfnl_msg_put(skb, portid, seq, msg_type, flags, family,
                           NFNETLINK_V0, nft_base_seq(net));
        if (!nlh)
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) ||
            nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
            nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle),
                         NFTA_FLOWTABLE_PAD))
                goto nla_put_failure;

        /* Deleting the whole flowtable: identification attributes suffice. */
        if (!hook_list &&
            (event == NFT_MSG_DELFLOWTABLE ||
             event == NFT_MSG_DESTROYFLOWTABLE)) {
                nlmsg_end(skb, nlh);
                return 0;
        }

        if (nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
            nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags)))
                goto nla_put_failure;

        nest = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK);
        if (!nest)
                goto nla_put_failure;
        if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) ||
            nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->data.priority)))
                goto nla_put_failure;

        nest_devs = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK_DEVS);
        if (!nest_devs)
                goto nla_put_failure;

        /* Default to the flowtable's own hooks when no subset was supplied. */
        if (!hook_list)
                hook_list = &flowtable->hook_list;

        list_for_each_entry_rcu(hook, hook_list, list) {
                if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name))
                        goto nla_put_failure;
        }
        nla_nest_end(skb, nest_devs);
        nla_nest_end(skb, nest);

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, nlh);
        return -1;
}

/*
 * Per-dump filter state for flowtable dumps; allocated in
 * nf_tables_dump_flowtable_start() and freed in
 * nf_tables_dump_flowtable_done().
 */
struct nft_flowtable_filter {
        /* Heap copy of the requested table name; only matching tables are dumped. */
        char                *table;
};

/*
 * Netlink dump callback: emit one NFT_MSG_NEWFLOWTABLE message per active
 * flowtable, across all tables of the requested family (or every family for
 * NFPROTO_UNSPEC), optionally restricted to the table named in the filter.
 *
 * cb->args[0] holds the index of the next flowtable to emit so the dump can
 * resume after @skb fills up. Returns skb->len.
 */
static int nf_tables_dump_flowtable(struct sk_buff *skb,
                                    struct netlink_callback *cb)
{
        const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
        struct nft_flowtable_filter *filter = cb->data;
        unsigned int idx = 0, s_idx = cb->args[0];
        struct net *net = sock_net(skb->sk);
        int family = nfmsg->nfgen_family;
        struct nft_flowtable *flowtable;
        struct nftables_pernet *nft_net;
        const struct nft_table *table;

        rcu_read_lock();
        nft_net = nft_pernet(net);
        cb->seq = READ_ONCE(nft_net->base_seq);

        list_for_each_entry_rcu(table, &nft_net->tables, list) {
                if (family != NFPROTO_UNSPEC && family != table->family)
                        continue;

                list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
                        if (!nft_is_active(net, flowtable))
                                goto cont;
                        /* Skip entries already emitted by an earlier call. */
                        if (idx < s_idx)
                                goto cont;
                        /* Past the resume point: clear the remaining resume state. */
                        if (idx > s_idx)
                                memset(&cb->args[1], 0,
                                       sizeof(cb->args) - sizeof(cb->args[0]));
                        if (filter && filter->table &&
                            strcmp(filter->table, table->name))
                                goto cont;

                        /* Message did not fit: stop and resume from idx next time. */
                        if (nf_tables_fill_flowtable_info(skb, net, NETLINK_CB(cb->skb).portid,
                                                          cb->nlh->nlmsg_seq,
                                                          NFT_MSG_NEWFLOWTABLE,
                                                          NLM_F_MULTI | NLM_F_APPEND,
                                                          table->family,
                                                          flowtable, NULL) < 0)
                                goto done;

                        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
                        idx++;
                }
        }
done:
        rcu_read_unlock();

        cb->args[0] = idx;
        return skb->len;
}

/*
 * Dump ->start callback. On entry cb->data points at the request's attribute
 * array; it is replaced by a freshly allocated filter holding a copy of the
 * table name, or by NULL when no NFTA_FLOWTABLE_TABLE attribute was sent.
 *
 * Returns 0 on success, -ENOMEM if either allocation fails.
 */
static int nf_tables_dump_flowtable_start(struct netlink_callback *cb)
{
        const struct nlattr * const *nla = cb->data;
        struct nft_flowtable_filter *filter;

        if (!nla[NFTA_FLOWTABLE_TABLE]) {
                cb->data = NULL;
                return 0;
        }

        filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
        if (!filter)
                return -ENOMEM;

        filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE], GFP_ATOMIC);
        if (!filter->table) {
                kfree(filter);
                return -ENOMEM;
        }

        cb->data = filter;
        return 0;
}

/* Dump ->done callback: free the filter (if any) stored in cb->data. */
static int nf_tables_dump_flowtable_done(struct netlink_callback *cb)
{
        struct nft_flowtable_filter *filter = cb->data;

        if (filter) {
                kfree(filter->table);
                kfree(filter);
        }

        return 0;
}

/* called with rcu_read_lock held */
static int nf_tables_getflowtable(struct sk_buff *skb,
                                  const struct nfnl_info *info,
                                  const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_cur(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct nft_flowtable *flowtable;
        const struct nft_table *table;
        struct net *net = info->net;
        struct sk_buff *skb2;
        int err;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .start = nf_tables_dump_flowtable_start,
                        .dump = nf_tables_dump_flowtable,
                        .done = nf_tables_dump_flowtable_done,
                        .module = THIS_MODULE,
                        .data = (void *)nla,
                };

                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        if (!nla[NFTA_FLOWTABLE_NAME])
                return -EINVAL;

        table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
                                 genmask, 0);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
                return PTR_ERR(table);
        }

        flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
                                         genmask);
        if (IS_ERR(flowtable)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
                return PTR_ERR(flowtable);
        }

        skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!skb2)
                return -ENOMEM;

        err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid,
                                            info->nlh->nlmsg_seq,
                                            NFT_MSG_NEWFLOWTABLE, 0, family,
                                            flowtable, NULL);
        if (err < 0)
                goto err_fill_flowtable_info;

        return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);

err_fill_flowtable_info:
        kfree_skb(skb2);
        return err;
}

/*
 * Queue a flowtable notification for @event on the per-netns notify list.
 *
 * Nothing is built when the sender did not ask for a report and nobody
 * listens on NFNLGRP_NFTABLES. If the message cannot be allocated or filled,
 * -ENOBUFS is signalled to the listeners instead.
 */
static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
                                       struct nft_flowtable *flowtable,
                                       struct list_head *hook_list, int event)
{
        struct nftables_pernet *nft_net = nft_pernet(ctx->net);
        /* Only NLM_F_CREATE and NLM_F_EXCL are propagated from the request. */
        u16 flags = ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
        struct sk_buff *skb;

        if (!ctx->report &&
            !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
                return;

        skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
        if (skb != NULL) {
                if (nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid,
                                                  ctx->seq, event, flags,
                                                  ctx->family, flowtable,
                                                  hook_list) >= 0) {
                        nft_notify_enqueue(skb, ctx->report,
                                           &nft_net->notify_list);
                        return;
                }
                kfree_skb(skb);
        }

        nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
}

/*
 * Release a flowtable and everything it owns: the type-private data, every
 * hook (each is unbound via ->setup(FLOW_BLOCK_UNBIND) first), the name, the
 * reference on the type's module, and finally the flowtable itself.
 */
static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
{
        struct nft_hook *hook, *next;

        flowtable->data.type->free(&flowtable->data);
        list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
                flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
                                            FLOW_BLOCK_UNBIND);
                list_del_rcu(&hook->list);
                /*
                 * The hook is unlinked with the RCU list primitive, so defer
                 * the free as well; the hook list is walked under RCU when
                 * flowtables are dumped.
                 */
                kfree_rcu(hook, rcu);
        }
        kfree(flowtable->name);
        module_put(flowtable->data.type->owner);
        kfree(flowtable);
}

/*
 * Build a NFT_MSG_NEWGEN message in @skb carrying the current base sequence
 * plus the pid and command name of the current task.
 *
 * Returns 0 on success, -EMSGSIZE if the message did not fit (the partial
 * message is trimmed from @skb).
 */
static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
                                   u32 portid, u32 seq)
{
        int msg_type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN);
        struct nftables_pernet *nft_net = nft_pernet(net);
        char comm[TASK_COMM_LEN];
        struct nlmsghdr *nlh;

        nlh = nfnl_msg_put(skb, portid, seq, msg_type, 0, AF_UNSPEC,
                           NFNETLINK_V0, nft_base_seq(net));
        if (!nlh)
                goto nla_put_failure;

        if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)))
                goto nla_put_failure;

        if (nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))))
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_GEN_PROC_NAME,
                           get_task_comm(comm, current)))
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Detach @dev from @flowtable: the first hook bound to @dev, if any, is
 * unregistered, unlinked and freed via kfree_rcu(). @event is not inspected
 * here.
 */
static void nft_flowtable_event(unsigned long event, struct net_device *dev,
                                struct nft_flowtable *flowtable)
{
        struct nft_hook *cur;

        list_for_each_entry(cur, &flowtable->hook_list, list) {
                if (cur->ops.dev == dev) {
                        /* flow_offload_netdev_event() cleans up entries for us. */
                        nft_unregister_flowtable_hook(dev_net(dev), flowtable,
                                                      cur);
                        list_del_rcu(&cur->list);
                        kfree_rcu(cur, rcu);
                        return;
                }
        }
}

static int nf_tables_flowtable_event(struct notifier_block *this,
                                     unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct nft_flowtable *flowtable;
        struct nftables_pernet *nft_net;
        struct nft_table *table;
        struct net *net;

        if (event != NETDEV_UNREGISTER)
                return 0;

        net = dev_net(dev);
        nft_net = nft_pernet(net);
        mutex_lock(&nft_net->commit_mutex);
        list_for_each_entry(table, &nft_net->tables, list) {
                list_for_each_entry(flowtable, &table->flowtables, list) {
                        nft_flowtable_event(event, dev, flowtable);
                }
        }
        mutex_unlock(&nft_net->commit_mutex);

        return NOTIFY_DONE;
}

/* Notifier block whose callback is nf_tables_flowtable_event(). */
static struct notifier_block nf_tables_flowtable_notifier = {
        .notifier_call        = nf_tables_flowtable_event,
};

/*
 * Send a NFT_MSG_NEWGEN notification to NFNLGRP_NFTABLES listeners (and to
 * the requester when it asked for a report). If the message cannot be
 * allocated or filled, -ENOBUFS is signalled to the listeners instead.
 * @event is not used by this function.
 */
static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb,
                                 int event)
{
        struct nlmsghdr *nlh = nlmsg_hdr(skb);
        struct sk_buff *msg;

        if (!nlmsg_report(nlh) &&
            !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
                return;

        msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
        if (msg != NULL) {
                if (nf_tables_fill_gen_info(msg, net, NETLINK_CB(skb).portid,
                                            nlh->nlmsg_seq) >= 0) {
                        nfnetlink_send(msg, net, NETLINK_CB(skb).portid,
                                       NFNLGRP_NFTABLES, nlmsg_report(nlh),
                                       GFP_KERNEL);
                        return;
                }
                kfree_skb(msg);
        }

        nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES,
                          -ENOBUFS);
}

/*
 * NFT_MSG_GETGEN handler: build a generation-info message and unicast it to
 * the requesting socket. Returns -ENOMEM if the reply cannot be allocated,
 * the fill error if it cannot be built, otherwise nfnetlink_unicast()'s
 * result.
 */
static int nf_tables_getgen(struct sk_buff *skb, const struct nfnl_info *info,
                            const struct nlattr * const nla[])
{
        struct sk_buff *reply;
        int ret;

        reply = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!reply)
                return -ENOMEM;

        ret = nf_tables_fill_gen_info(reply, info->net, NETLINK_CB(skb).portid,
                                      info->nlh->nlmsg_seq);
        if (ret < 0) {
                kfree_skb(reply);
                return ret;
        }

        return nfnetlink_unicast(reply, info->net, NETLINK_CB(skb).portid);
}

/*
 * nfnetlink callback table, indexed by NFT_MSG_* message type.
 *
 * Each entry names the handler (.call), the callback type (.type), the
 * highest attribute number (.attr_count) and the attribute policy (.policy).
 * In this table every NEW, DEL and DESTROY message uses NFNL_CB_BATCH and
 * every GET message uses NFNL_CB_RCU. Each DESTROY message shares the handler
 * of the corresponding DEL message.
 */
static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
        [NFT_MSG_NEWTABLE] = {
                .call                = nf_tables_newtable,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_TABLE_MAX,
                .policy                = nft_table_policy,
        },
        [NFT_MSG_GETTABLE] = {
                .call                = nf_tables_gettable,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_TABLE_MAX,
                .policy                = nft_table_policy,
        },
        [NFT_MSG_DELTABLE] = {
                .call                = nf_tables_deltable,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_TABLE_MAX,
                .policy                = nft_table_policy,
        },
        [NFT_MSG_DESTROYTABLE] = {
                .call                = nf_tables_deltable,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_TABLE_MAX,
                .policy                = nft_table_policy,
        },
        [NFT_MSG_NEWCHAIN] = {
                .call                = nf_tables_newchain,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_CHAIN_MAX,
                .policy                = nft_chain_policy,
        },
        [NFT_MSG_GETCHAIN] = {
                .call                = nf_tables_getchain,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_CHAIN_MAX,
                .policy                = nft_chain_policy,
        },
        [NFT_MSG_DELCHAIN] = {
                .call                = nf_tables_delchain,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_CHAIN_MAX,
                .policy                = nft_chain_policy,
        },
        [NFT_MSG_DESTROYCHAIN] = {
                .call                = nf_tables_delchain,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_CHAIN_MAX,
                .policy                = nft_chain_policy,
        },
        [NFT_MSG_NEWRULE] = {
                .call                = nf_tables_newrule,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_RULE_MAX,
                .policy                = nft_rule_policy,
        },
        [NFT_MSG_GETRULE] = {
                .call                = nf_tables_getrule,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_RULE_MAX,
                .policy                = nft_rule_policy,
        },
        [NFT_MSG_GETRULE_RESET] = {
                .call                = nf_tables_getrule_reset,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_RULE_MAX,
                .policy                = nft_rule_policy,
        },
        [NFT_MSG_DELRULE] = {
                .call                = nf_tables_delrule,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_RULE_MAX,
                .policy                = nft_rule_policy,
        },
        [NFT_MSG_DESTROYRULE] = {
                .call                = nf_tables_delrule,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_RULE_MAX,
                .policy                = nft_rule_policy,
        },
        [NFT_MSG_NEWSET] = {
                .call                = nf_tables_newset,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_SET_MAX,
                .policy                = nft_set_policy,
        },
        [NFT_MSG_GETSET] = {
                .call                = nf_tables_getset,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_SET_MAX,
                .policy                = nft_set_policy,
        },
        [NFT_MSG_DELSET] = {
                .call                = nf_tables_delset,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_SET_MAX,
                .policy                = nft_set_policy,
        },
        [NFT_MSG_DESTROYSET] = {
                .call                = nf_tables_delset,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_SET_MAX,
                .policy                = nft_set_policy,
        },
        [NFT_MSG_NEWSETELEM] = {
                .call                = nf_tables_newsetelem,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_SET_ELEM_LIST_MAX,
                .policy                = nft_set_elem_list_policy,
        },
        [NFT_MSG_GETSETELEM] = {
                .call                = nf_tables_getsetelem,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_SET_ELEM_LIST_MAX,
                .policy                = nft_set_elem_list_policy,
        },
        [NFT_MSG_GETSETELEM_RESET] = {
                .call                = nf_tables_getsetelem_reset,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_SET_ELEM_LIST_MAX,
                .policy                = nft_set_elem_list_policy,
        },
        [NFT_MSG_DELSETELEM] = {
                .call                = nf_tables_delsetelem,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_SET_ELEM_LIST_MAX,
                .policy                = nft_set_elem_list_policy,
        },
        [NFT_MSG_DESTROYSETELEM] = {
                .call                = nf_tables_delsetelem,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_SET_ELEM_LIST_MAX,
                .policy                = nft_set_elem_list_policy,
        },
        /* GETGEN takes no attributes: no attr_count or policy is set. */
        [NFT_MSG_GETGEN] = {
                .call                = nf_tables_getgen,
                .type                = NFNL_CB_RCU,
        },
        [NFT_MSG_NEWOBJ] = {
                .call                = nf_tables_newobj,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_OBJ_MAX,
                .policy                = nft_obj_policy,
        },
        [NFT_MSG_GETOBJ] = {
                .call                = nf_tables_getobj,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_OBJ_MAX,
                .policy                = nft_obj_policy,
        },
        [NFT_MSG_DELOBJ] = {
                .call                = nf_tables_delobj,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_OBJ_MAX,
                .policy                = nft_obj_policy,
        },
        [NFT_MSG_DESTROYOBJ] = {
                .call                = nf_tables_delobj,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_OBJ_MAX,
                .policy                = nft_obj_policy,
        },
        /* Shares the plain GETOBJ handler. */
        [NFT_MSG_GETOBJ_RESET] = {
                .call                = nf_tables_getobj,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_OBJ_MAX,
                .policy                = nft_obj_policy,
        },
        [NFT_MSG_NEWFLOWTABLE] = {
                .call                = nf_tables_newflowtable,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_FLOWTABLE_MAX,
                .policy                = nft_flowtable_policy,
        },
        [NFT_MSG_GETFLOWTABLE] = {
                .call                = nf_tables_getflowtable,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_FLOWTABLE_MAX,
                .policy                = nft_flowtable_policy,
        },
        [NFT_MSG_DELFLOWTABLE] = {
                .call                = nf_tables_delflowtable,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_FLOWTABLE_MAX,
                .policy                = nft_flowtable_policy,
        },
        [NFT_MSG_DESTROYFLOWTABLE] = {
                .call                = nf_tables_delflowtable,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_FLOWTABLE_MAX,
                .policy                = nft_flowtable_policy,
        },
};

/* Run pending validation on every table of @net.
 *
 * Tables in NFT_VALIDATE_SKIP are left untouched.  A table in
 * NFT_VALIDATE_NEED is first moved to NFT_VALIDATE_DO.  Tables in either
 * of those two states are then checked with nft_table_validate() and, on
 * success, set back to NFT_VALIDATE_SKIP.
 *
 * Returns 0 when every checked table passes, -EAGAIN as soon as one fails.
 */
static int nf_tables_validate(struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_table *table;

        list_for_each_entry(table, &nft_net->tables, list) {
                /* Snapshot: the state is rewritten below. */
                unsigned int state = table->validate_state;

                if (state == NFT_VALIDATE_SKIP)
                        continue;

                if (state == NFT_VALIDATE_NEED)
                        nft_validate_state_update(table, NFT_VALIDATE_DO);
                else if (state != NFT_VALIDATE_DO)
                        continue;

                if (nft_table_validate(net, table) < 0)
                        return -EAGAIN;

                nft_validate_state_update(table, NFT_VALIDATE_SKIP);
        }

        return 0;
}

/* a drop policy has to be deferred until all rules have been activated,
 * otherwise a large ruleset that contains a drop-policy base chain will
 * cause all packets to get dropped until the full transaction has been
 * processed.
 *
 * We defer the drop policy until the transaction has been finalized.
 */
static void nft_chain_commit_drop_policy(struct nft_trans *trans)
{
        struct nft_base_chain *basechain;

        if (nft_trans_chain_policy(trans) != NF_DROP)
                return;

        if (!nft_is_base_chain(trans->ctx.chain))
                return;

        basechain = nft_base_chain(trans->ctx.chain);
        basechain->policy = NF_DROP;
}

static void nft_chain_commit_update(struct nft_trans *trans)
{
        struct nft_base_chain *basechain;

        if (nft_trans_chain_name(trans)) {
                rhltable_remove(&trans->ctx.table->chains_ht,
                                &trans->ctx.chain->rhlhead,
                                nft_chain_ht_params);
                swap(trans->ctx.chain->name, nft_trans_chain_name(trans));
                rhltable_insert_key(&trans->ctx.table->chains_ht,
                                    trans->ctx.chain->name,
                                    &trans->ctx.chain->rhlhead,
                                    nft_chain_ht_params);
        }

        if (!nft_is_base_chain(trans->ctx.chain))
                return;

        nft_chain_stats_replace(trans);

        basechain = nft_base_chain(trans->ctx.chain);

        switch (nft_trans_chain_policy(trans)) {
        case NF_DROP:
        case NF_ACCEPT:
                basechain->policy = nft_trans_chain_policy(trans);
                break;
        }
}

/* Commit an object update: let the existing object absorb the new one
 * through its ->update() op, then destroy the new object.
 *
 * Warns once and does nothing (the new object is not destroyed) if the
 * object type has no ->update() op.
 */
static void nft_obj_commit_update(struct nft_trans *trans)
{
        struct nft_object *cur = nft_trans_obj(trans);
        struct nft_object *replacement = nft_trans_obj_newobj(trans);

        if (WARN_ON_ONCE(!cur->ops->update))
                return;

        cur->ops->update(cur, replacement);
        nft_obj_destroy(&trans->ctx, replacement);
}

static void nft_commit_release(struct nft_trans *trans)
{
        switch (trans->msg_type) {
        case NFT_MSG_DELTABLE:
        case NFT_MSG_DESTROYTABLE:
                nf_tables_table_destroy(&trans->ctx);
                break;
        case NFT_MSG_NEWCHAIN:
                free_percpu(nft_trans_chain_stats(trans));
                kfree(nft_trans_chain_name(trans));
                break;
        case NFT_MSG_DELCHAIN:
        case NFT_MSG_DESTROYCHAIN:
                if (nft_trans_chain_update(trans))
                        nft_hooks_destroy(&nft_trans_chain_hooks(trans));
                else
                        nf_tables_chain_destroy(&trans->ctx);
                break;
        case NFT_MSG_DELRULE:
        case NFT_MSG_DESTROYRULE:
                nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
                break;
        case NFT_MSG_DELSET:
        case NFT_MSG_DESTROYSET:
                nft_set_destroy(&trans->ctx, nft_trans_set(trans));
                break;
        case NFT_MSG_DELSETELEM:
        case NFT_MSG_DESTROYSETELEM:
                nf_tables_set_elem_destroy(&trans->ctx,
                                           nft_trans_elem_set(trans),
                                           nft_trans_elem_priv(trans));
                break;
        case NFT_MSG_DELOBJ:
        case NFT_MSG_DESTROYOBJ:
                nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
                break;
        case NFT_MSG_DELFLOWTABLE:
        case NFT_MSG_DESTROYFLOWTABLE:
                if (nft_trans_flowtable_update(trans))
                        nft_hooks_destroy(&nft_trans_flowtable_hooks(trans));
                else
                        nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
                break;
        }

        if (trans->put_net)
                put_net(trans->ctx.net);

        kfree(trans);
}

static void nf_tables_trans_destroy_work(struct work_struct *w)
{
        struct nft_trans *trans, *next;
        LIST_HEAD(head);

        spin_lock(&nf_tables_destroy_list_lock);
        list_splice_init(&nf_tables_destroy_list, &head);
        spin_unlock(&nf_tables_destroy_list_lock);

        if (list_empty(&head))
                return;

        synchronize_rcu();

        list_for_each_entry_safe(trans, next, &head, list) {
                nft_trans_list_del(trans);
                nft_commit_release(trans);
        }
}

/* Wait for trans_destroy_work -- the work item scheduled by
 * nf_tables_commit_release() -- to finish, by calling flush_work() on it.
 */
void nf_tables_trans_destroy_flush_work(void)
{
        flush_work(&trans_destroy_work);
}
EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work);

/* Expression reduction hook consulted by nf_tables_commit_chain_prepare().
 *
 * Always returns false, so the caller never skips an expression.  Neither
 * @track nor @expr is inspected.
 */
static bool nft_expr_reduce(struct nft_regs_track *track,
                            const struct nft_expr *expr)
{
        return false;
}

/* Build chain->blob_next: a contiguous copy of all rules of @chain that are
 * active in the next generation.
 *
 * Pass one sums the space needed (one struct nft_rule_dp header plus
 * rule->dlen per active rule).  Pass two copies each rule's expressions
 * behind its header.
 *
 * Returns 0 on success, or when there is nothing to do because the chain
 * already has a blob_next or is not active in the next generation.
 * Returns -ENOMEM if the total size exceeds INT_MAX, if allocation fails,
 * or if one of the internal bounds checks trips.  On the later error paths
 * chain->blob_next stays allocated; nf_tables_commit() calls
 * nf_tables_commit_chain_prepare_cancel() to free it.
 */
static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain)
{
        const struct nft_expr *expr, *last;
        struct nft_regs_track track = {};
        unsigned int size, data_size;
        void *data, *data_boundary;
        struct nft_rule_dp *prule;
        struct nft_rule *rule;

        /* already handled or inactive chain? */
        if (chain->blob_next || !nft_is_active_next(net, chain))
                return 0;

        /* Pass 1: compute the number of bytes needed for all active rules. */
        data_size = 0;
        list_for_each_entry(rule, &chain->rules, list) {
                if (nft_is_active_next(net, rule)) {
                        data_size += sizeof(*prule) + rule->dlen;
                        if (data_size > INT_MAX)
                                return -ENOMEM;
                }
        }

        chain->blob_next = nf_tables_chain_alloc_rules(chain, data_size);
        if (!chain->blob_next)
                return -ENOMEM;

        /* data is the write cursor; data_boundary is the end of the area
         * sized in pass 1.
         */
        data = (void *)chain->blob_next->data;
        data_boundary = data + data_size;
        size = 0;

        /* Pass 2: emit a header followed by the expressions of each rule. */
        list_for_each_entry(rule, &chain->rules, list) {
                if (!nft_is_active_next(net, rule))
                        continue;

                /* Header lives at the cursor; expressions start right after. */
                prule = (struct nft_rule_dp *)data;
                data += offsetof(struct nft_rule_dp, data);
                if (WARN_ON_ONCE(data > data_boundary))
                        return -ENOMEM;

                /* size counts the expression bytes copied for this rule. */
                size = 0;
                track.last = nft_expr_last(rule);
                nft_rule_for_each_expr(expr, last, rule) {
                        track.cur = expr;

                        /* A reduced expression is skipped; resume from
                         * wherever the reducer left track.cur.
                         */
                        if (nft_expr_reduce(&track, expr)) {
                                expr = track.cur;
                                continue;
                        }

                        if (WARN_ON_ONCE(data + size + expr->ops->size > data_boundary))
                                return -ENOMEM;

                        memcpy(data + size, expr, expr->ops->size);
                        size += expr->ops->size;
                }
                /* NOTE(review): the 1 << 12 limit presumably mirrors the
                 * width of nft_rule_dp::dlen -- confirm against the struct.
                 */
                if (WARN_ON_ONCE(size >= 1 << 12))
                        return -ENOMEM;

                prule->handle = rule->handle;
                prule->dlen = size;
                prule->is_last = 0;

                /* Advance past the expressions and account header plus
                 * expressions in the blob size.
                 */
                data += size;
                size = 0;
                chain->blob_next->size += (unsigned long)(data - (void *)prule);
        }

        if (WARN_ON_ONCE(data > data_boundary))
                return -ENOMEM;

        /* Let nft_last_rule() fill in the entry following the last rule. */
        prule = (struct nft_rule_dp *)data;
        nft_last_rule(chain, prule);

        return 0;
}

/* Undo nf_tables_commit_chain_prepare(): for every NFT_MSG_NEWRULE or
 * NFT_MSG_DELRULE transaction on the commit list, free the chain's pending
 * rule blob and clear chain->blob_next.
 */
static void nf_tables_commit_chain_prepare_cancel(struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_trans *trans, *next;

        list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
                struct nft_chain *chain;

                if (trans->msg_type != NFT_MSG_NEWRULE &&
                    trans->msg_type != NFT_MSG_DELRULE)
                        continue;

                chain = trans->ctx.chain;
                kvfree(chain->blob_next);
                chain->blob_next = NULL;
        }
}

/* RCU callback: free the rule blob recorded in the trailer that embeds @h. */
static void __nf_tables_commit_chain_free_rules(struct rcu_head *h)
{
        struct nft_rule_dp_last *trailer;

        trailer = container_of(h, struct nft_rule_dp_last, h);
        kvfree(trailer->blob);
}

/* Schedule @blob to be freed by an RCU callback.
 *
 * The trailer located sizeof(*blob) + blob->size bytes into the blob is
 * used to remember the blob pointer and to carry the rcu_head.
 */
static void nf_tables_commit_chain_free_rules_old(struct nft_rule_blob *blob)
{
        struct nft_rule_dp_last *trailer;

        /* last rule trailer is after end marker */
        trailer = (void *)blob + sizeof(*blob) + blob->size;
        trailer->blob = blob;

        call_rcu(&trailer->h, __nf_tables_commit_chain_free_rules);
}

/* Publish the rule blob of @chain for the next generation.
 *
 * The chain keeps two blob pointers, blob_gen_0 and blob_gen_1.  The slot
 * written here is blob_gen_1 when nft_gencursor_next() is true and
 * blob_gen_0 otherwise.  If chain->blob_next is set, it is installed in
 * that slot.  If it is not set, the slot is pointed at the other
 * generation's blob.  A blob displaced from the slot is handed to
 * nf_tables_commit_chain_free_rules_old(), unless both slots referred to
 * the same blob beforehand.
 */
static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
{
        struct nft_rule_blob *g0, *g1;
        bool next_genbit;

        next_genbit = nft_gencursor_next(net);

        /* Snapshot both slots before either one is overwritten. */
        g0 = rcu_dereference_protected(chain->blob_gen_0,
                                       lockdep_commit_lock_is_held(net));
        g1 = rcu_dereference_protected(chain->blob_gen_1,
                                       lockdep_commit_lock_is_held(net));

        /* No changes to this chain? */
        if (chain->blob_next == NULL) {
                /* chain had no change in last or next generation */
                if (g0 == g1)
                        return;
                /*
                 * chain had no change in this generation; make sure next
                 * one uses same rules as current generation.
                 */
                if (next_genbit) {
                        rcu_assign_pointer(chain->blob_gen_1, g0);
                        nf_tables_commit_chain_free_rules_old(g1);
                } else {
                        rcu_assign_pointer(chain->blob_gen_0, g1);
                        nf_tables_commit_chain_free_rules_old(g0);
                }

                return;
        }

        /* Install the prepared blob in the next generation's slot. */
        if (next_genbit)
                rcu_assign_pointer(chain->blob_gen_1, chain->blob_next);
        else
                rcu_assign_pointer(chain->blob_gen_0, chain->blob_next);

        /* The slot now holds the blob; blob_next no longer refers to it. */
        chain->blob_next = NULL;

        /* Both slots shared one blob: the other slot still references it,
         * so it must not be freed.
         */
        if (g0 == g1)
                return;

        /* Free the blob that was displaced from the overwritten slot. */
        if (next_genbit)
                nf_tables_commit_chain_free_rules_old(g1);
        else
                nf_tables_commit_chain_free_rules_old(g0);
}

/* Unlink @obj: remove it from the nft_objname_ht hash table and delete it
 * from the list it is on using list_del_rcu().  The object itself is not
 * freed here.
 */
static void nft_obj_del(struct nft_object *obj)
{
        rhltable_remove(&nft_objname_ht, &obj->rhlhead, nft_objname_ht_params);
        list_del_rcu(&obj->list);
}

/* Unlink @chain from its table: remove it from the table's chain hash
 * (warning once if the removal reports an error) and delete it from the
 * list it is on using list_del_rcu().  The chain itself is not freed here.
 */
void nft_chain_del(struct nft_chain *chain)
{
        int err;

        err = rhltable_remove(&chain->table->chains_ht, &chain->rhlhead,
                              nft_chain_ht_params);
        WARN_ON_ONCE(err);

        list_del_rcu(&chain->list);
}

/* Deactivate and remove from the set every element collected in @trans.
 * Only ctx->net is used from @ctx.
 */
static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx,
                                        struct nft_trans_gc *trans)
{
        struct nft_set *set = trans->set;
        struct net *net = ctx->net;
        unsigned int i;

        for (i = 0; i < trans->count; i++) {
                struct nft_elem_priv *elem_priv = trans->priv[i];

                nft_setelem_data_deactivate(net, set, elem_priv);
                nft_setelem_remove(net, set, elem_priv);
        }
}

/* Free a GC transaction without touching the elements it collected.
 *
 * Calls nft_set_put() on the set, drops the netns reference obtained by
 * nft_trans_gc_alloc() through maybe_get_net(), and frees @trans.
 * NOTE(review): nft_set_put() presumably pairs with the
 * refcount_inc(&set->refs) in nft_trans_gc_alloc() -- confirm.
 */
void nft_trans_gc_destroy(struct nft_trans_gc *trans)
{
        nft_set_put(trans->set);
        put_net(trans->net);
        kfree(trans);
}

static void nft_trans_gc_trans_free(struct rcu_head *rcu)
{
        struct nft_elem_priv *elem_priv;
        struct nft_trans_gc *trans;
        struct nft_ctx ctx = {};
        unsigned int i;

        trans = container_of(rcu, struct nft_trans_gc, rcu);
        ctx.net        = read_pnet(&trans->set->net);

        for (i = 0; i < trans->count; i++) {
                elem_priv = trans->priv[i];
                if (!nft_setelem_is_catchall(trans->set, elem_priv))
                        atomic_dec(&trans->set->nelems);

                nf_tables_set_elem_destroy(&ctx, trans->set, elem_priv);
        }

        nft_trans_gc_destroy(trans);
}

/* Under commit_mutex, remove the elements collected in @trans from the set.
 *
 * Returns true if the elements were removed.  Returns false, leaving the
 * set untouched, if the batch is stale.
 */
static bool nft_trans_gc_work_done(struct nft_trans_gc *trans)
{
        struct nftables_pernet *nft_net = nft_pernet(trans->net);
        struct nft_ctx ctx = {};
        bool usable;

        mutex_lock(&nft_net->commit_mutex);

        /* A batch is stale if a transaction raced with it (gc_seq moved on
         * since the batch was created) or if the control plane destroyed
         * the set.  Either way it may refer to objects that no longer
         * exist, so it must be skipped.
         */
        usable = READ_ONCE(nft_net->gc_seq) == trans->seq &&
                 !trans->set->dead;
        if (usable) {
                ctx.net = trans->net;
                ctx.table = trans->set->table;

                nft_trans_gc_setelem_remove(&ctx, trans);
        }

        mutex_unlock(&nft_net->commit_mutex);

        return usable;
}

static void nft_trans_gc_work(struct work_struct *work)
{
        struct nft_trans_gc *trans, *next;
        LIST_HEAD(trans_gc_list);

        spin_lock(&nf_tables_gc_list_lock);
        list_splice_init(&nf_tables_gc_list, &trans_gc_list);
        spin_unlock(&nf_tables_gc_list_lock);

        list_for_each_entry_safe(trans, next, &trans_gc_list, list) {
                list_del(&trans->list);
                if (!nft_trans_gc_work_done(trans)) {
                        nft_trans_gc_destroy(trans);
                        continue;
                }
                call_rcu(&trans->rcu, nft_trans_gc_trans_free);
        }
}

/* Allocate an empty GC transaction for @set.
 *
 * @gc_seq is stored in the transaction and @gfp is passed to the
 * allocator.  On success the transaction holds a netns reference (taken
 * with maybe_get_net()) and an extra reference on set->refs.
 *
 * Returns the new transaction, or NULL if allocation fails or the netns
 * reference cannot be obtained.
 */
struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
                                        unsigned int gc_seq, gfp_t gfp)
{
        struct nft_trans_gc *trans;

        trans = kzalloc(sizeof(*trans), gfp);
        if (!trans)
                return NULL;

        trans->net = maybe_get_net(read_pnet(&set->net));
        if (!trans->net)
                goto err_free;

        refcount_inc(&set->refs);
        trans->set = set;
        trans->seq = gc_seq;

        return trans;

err_free:
        kfree(trans);
        return NULL;
}

/* Append @priv to the batch and bump the element count.
 *
 * No capacity check is done here.  The callers in this file call one of
 * the nft_trans_gc_queue_*() helpers first, which return a batch with free
 * space.
 */
void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv)
{
        trans->priv[trans->count] = priv;
        trans->count++;
}

/* Append @trans to the global nf_tables_gc_list (under
 * nf_tables_gc_list_lock) and schedule trans_gc_work to process it.
 */
static void nft_trans_gc_queue_work(struct nft_trans_gc *trans)
{
        spin_lock(&nf_tables_gc_list_lock);
        list_add_tail(&trans->list, &nf_tables_gc_list);
        spin_unlock(&nf_tables_gc_list_lock);

        schedule_work(&trans_gc_work);
}

/* Return how many more elements fit in @trans: NFT_TRANS_GC_BATCHCOUNT
 * minus the number already collected.  Zero means the batch is full.
 */
static int nft_trans_gc_space(struct nft_trans_gc *trans)
{
        return NFT_TRANS_GC_BATCHCOUNT - trans->count;
}

/* Make sure the caller has a GC batch with room for one more element.
 *
 * If @gc still has space it is returned unchanged.  Otherwise it is queued
 * for the GC worker and a fresh batch for the same set is allocated with
 * @gc_seq and @gfp.  That allocation may fail, in which case NULL is
 * returned.
 */
struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
                                              unsigned int gc_seq, gfp_t gfp)
{
        /* Read the set up front: once queued, the worker may free @gc. */
        struct nft_set *set = gc->set;

        if (nft_trans_gc_space(gc) != 0)
                return gc;

        nft_trans_gc_queue_work(gc);

        return nft_trans_gc_alloc(set, gc_seq, gfp);
}

/* Finish an asynchronous GC run: queue the final batch for the GC worker,
 * or just destroy it if it collected nothing.
 */
void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans)
{
        if (trans->count != 0)
                nft_trans_gc_queue_work(trans);
        else
                nft_trans_gc_destroy(trans);
}

/* Synchronous counterpart of nft_trans_gc_queue_async().
 *
 * Warns once and returns NULL if lockdep reports that the commit lock is
 * not held.  If @gc still has space it is returned unchanged.  Otherwise
 * it is handed to call_rcu() for destruction and a fresh batch for the
 * same set is allocated with sequence 0 and @gfp; NULL is returned if that
 * allocation fails.
 */
struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp)
{
        struct nft_set *set;

        if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)))
                return NULL;

        if (nft_trans_gc_space(gc) != 0)
                return gc;

        /* Read the set first: the RCU callback frees @gc. */
        set = gc->set;
        call_rcu(&gc->rcu, nft_trans_gc_trans_free);

        return nft_trans_gc_alloc(set, 0, gfp);
}

/* Finish a synchronous GC run: hand the final batch to call_rcu() for
 * destruction, or just destroy it if it collected nothing.  Warns once if
 * lockdep reports that the commit lock is not held.
 */
void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans)
{
        WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net));

        if (trans->count != 0)
                call_rcu(&trans->rcu, nft_trans_gc_trans_free);
        else
                nft_trans_gc_destroy(trans);
}

/* Collect expired catchall elements of the batch's set (async GC path).
 *
 * Every expired catchall element is marked dead, unless it already is, and
 * added to the current batch.  A full batch is queued and replaced through
 * nft_trans_gc_queue_async().
 *
 * Returns the batch to keep using, or NULL if a replacement batch could
 * not be allocated.
 */
struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc,
                                                 unsigned int gc_seq)
{
        struct nft_set_elem_catchall *catchall;
        const struct nft_set *set = gc->set;
        struct nft_set_ext *ext;

        list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);
                if (!nft_set_elem_expired(ext))
                        continue;

                if (!nft_set_elem_is_dead(ext))
                        nft_set_elem_dead(ext);

                gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
                if (!gc)
                        return NULL;

                nft_trans_gc_elem_add(gc, catchall->elem);
        }

        return gc;
}

/* Collect expired catchall elements of the batch's set (sync GC path).
 *
 * Expiry is judged against the timestamp from nft_net_tstamp(), read once
 * on entry.  Each expired element is deactivated, its catchall entry is
 * destroyed, and the element is added to the current batch.  A full batch
 * is replaced through nft_trans_gc_queue_sync().  Warns once if lockdep
 * reports that the commit lock is not held.
 *
 * Returns the batch to keep using, or NULL if nft_trans_gc_queue_sync()
 * returned NULL.
 */
struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc)
{
        struct nft_set_elem_catchall *entry, *tmp;
        u64 tstamp = nft_net_tstamp(gc->net);
        const struct nft_set *set = gc->set;

        WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net));

        list_for_each_entry_safe(entry, tmp, &set->catchall_list, list) {
                struct nft_set_ext *ext = nft_set_elem_ext(set, entry->elem);
                struct nft_elem_priv *elem_priv;

                if (!__nft_set_elem_expired(ext, tstamp))
                        continue;

                gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
                if (!gc)
                        return NULL;

                /* Grab the element before its catchall entry goes away. */
                elem_priv = entry->elem;
                nft_setelem_data_deactivate(gc->net, gc->set, elem_priv);
                nft_setelem_catchall_destroy(entry);
                nft_trans_gc_elem_add(gc, elem_priv);
        }

        return gc;
}

/* Free every entry on the per-netns module request list.
 *
 * Warns once if the commit list is not empty, and once for any request
 * that is not marked done; such requests are freed regardless.
 */
static void nf_tables_module_autoload_cleanup(struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_module_request *request, *tmp;

        WARN_ON_ONCE(!list_empty(&nft_net->commit_list));

        list_for_each_entry_safe(request, tmp, &nft_net->module_list, list) {
                WARN_ON_ONCE(!request->done);

                list_del(&request->list);
                kfree(request);
        }
}

/* Hand the committed transactions over to the destroy worker and release
 * the commit mutex.
 *
 * nft_net->commit_mutex is unlocked on every path without being taken
 * here, so the caller must hold it on entry.
 */
static void nf_tables_commit_release(struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_trans *trans;

        /* all side effects have to be made visible.
         * For example, if a chain named 'foo' has been deleted, a
         * new transaction must not find it anymore.
         *
         * Memory reclaim happens asynchronously from work queue
         * to prevent expensive synchronize_rcu() in commit phase.
         */
        if (list_empty(&nft_net->commit_list)) {
                /* Nothing to hand over; only the module list is cleaned. */
                nf_tables_module_autoload_cleanup(net);
                mutex_unlock(&nft_net->commit_mutex);
                return;
        }

        /* Take one netns reference for the whole batch and mark the last
         * transaction so that nft_commit_release() drops it via put_net().
         */
        trans = list_last_entry(&nft_net->commit_list,
                                struct nft_trans, list);
        get_net(trans->ctx.net);
        WARN_ON_ONCE(trans->put_net);

        trans->put_net = true;
        /* Move the whole commit list onto the global destroy list. */
        spin_lock(&nf_tables_destroy_list_lock);
        list_splice_tail_init(&nft_net->commit_list, &nf_tables_destroy_list);
        spin_unlock(&nf_tables_destroy_list_lock);

        /* The commit list is empty now, as the cleanup helper expects. */
        nf_tables_module_autoload_cleanup(net);
        schedule_work(&trans_destroy_work);

        mutex_unlock(&nft_net->commit_mutex);
}

/* Send the notifications queued on nft_net->notify_list, coalescing
 * consecutive ones into larger messages.
 *
 * The first skb taken off the list becomes the current batch.  Each
 * following skb is copied to the tail of the batch and freed, as long as
 * the batch stays below NLMSG_GOODSIZE and the skb carries the same report
 * flag as the batch.  Otherwise the batch is sent with nfnetlink_send()
 * and that skb starts a new batch.  The final batch is sent after the
 * loop.
 */
static void nft_commit_notify(struct net *net, u32 portid)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct sk_buff *batch_skb = NULL, *nskb, *skb;
        unsigned char *data;
        int len;

        list_for_each_entry_safe(skb, nskb, &nft_net->notify_list, list) {
                if (!batch_skb) {
                        /* Also entered via goto below, to start a new
                         * batch with the skb that did not fit.
                         */
new_batch:
                        batch_skb = skb;
                        /* len tracks the room left in the batch. */
                        len = NLMSG_GOODSIZE - skb->len;
                        list_del(&skb->list);
                        continue;
                }
                len -= skb->len;
                if (len > 0 && NFT_CB(skb).report == NFT_CB(batch_skb).report) {
                        /* Fits: append the payload and drop the source. */
                        data = skb_put(batch_skb, skb->len);
                        memcpy(data, skb->data, skb->len);
                        list_del(&skb->list);
                        kfree_skb(skb);
                        continue;
                }
                /* Does not fit, or report flag differs: flush the batch. */
                nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES,
                               NFT_CB(batch_skb).report, GFP_KERNEL);
                goto new_batch;
        }

        /* Flush whatever batch is still pending. */
        if (batch_skb) {
                nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES,
                               NFT_CB(batch_skb).report, GFP_KERNEL);
        }

        WARN_ON_ONCE(!list_empty(&nft_net->notify_list));
}

static int nf_tables_commit_audit_alloc(struct list_head *adl,
                                        struct nft_table *table)
{
        struct nft_audit_data *adp;

        list_for_each_entry(adp, adl, list) {
                if (adp->table == table)
                        return 0;
        }
        adp = kzalloc(sizeof(*adp), GFP_KERNEL);
        if (!adp)
                return -ENOMEM;
        adp->table = table;
        list_add(&adp->list, adl);
        return 0;
}

/* Unlink and free every audit record on @adl without logging anything. */
static void nf_tables_commit_audit_free(struct list_head *adl)
{
        struct nft_audit_data *entry, *tmp;

        list_for_each_entry_safe(entry, tmp, adl, list) {
                list_del(&entry->list);
                kfree(entry);
        }
}

/* Account one transaction of type @op against @table's audit record.
 *
 * The record's entry count is incremented, and its op becomes @op if no op
 * was recorded yet or @op is numerically smaller than the recorded one.
 * Warns once if @adl has no record for the table.
 */
static void nf_tables_commit_audit_collect(struct list_head *adl,
                                           struct nft_table *table, u32 op)
{
        struct nft_audit_data *adp;

        list_for_each_entry(adp, adl, list) {
                if (adp->table != table)
                        continue;

                adp->entries++;
                if (!adp->op || adp->op > op)
                        adp->op = op;
                return;
        }

        WARN_ONCE(1, "table=%s not expected in commit list", table->name);
}

/* Size of the buffer that holds the "<table name>:<generation>" string. */
#define AUNFTABLENAMELEN (NFT_TABLE_MAXNAMELEN + 22)

/* Emit one audit record per entry on @adl, then unlink and free the entry.
 * Each record is tagged with the table name and @generation.
 */
static void nf_tables_commit_audit_log(struct list_head *adl, u32 generation)
{
        struct nft_audit_data *entry, *tmp;
        char aubuf[AUNFTABLENAMELEN];

        list_for_each_entry_safe(entry, tmp, adl, list) {
                const struct nft_table *table = entry->table;

                snprintf(aubuf, sizeof(aubuf), "%s:%u", table->name,
                         generation);
                audit_log_nfcfg(aubuf, table->family, entry->entries,
                                nft2audit_op[entry->op], GFP_KERNEL);

                list_del(&entry->list);
                kfree(entry);
        }
}

/* Drain @set_update_list: take each set off the list and call its
 * ->commit() op.  Sets without a ->commit() op and sets marked dead are
 * only unlinked.
 */
static void nft_set_commit_update(struct list_head *set_update_list)
{
        struct nft_set *set, *next;

        list_for_each_entry_safe(set, next, set_update_list, pending_update) {
                list_del_init(&set->pending_update);

                if (set->ops->commit && !set->dead)
                        set->ops->commit(set);
        }
}

/* Advance the per-netns GC sequence counter by one and return the new
 * value, which is meant to be passed to nft_gc_seq_end() later.
 */
static unsigned int nft_gc_seq_begin(struct nftables_pernet *nft_net)
{
        unsigned int gc_seq = READ_ONCE(nft_net->gc_seq) + 1;

        /* Bump gc counter, it becomes odd, this is the busy mark. */
        WRITE_ONCE(nft_net->gc_seq, gc_seq);

        return gc_seq;
}

/* Store @gc_seq + 1 as the per-netns GC sequence counter, where @gc_seq is
 * the value obtained from nft_gc_seq_begin().
 */
static void nft_gc_seq_end(struct nftables_pernet *nft_net, unsigned int gc_seq)
{
        WRITE_ONCE(nft_net->gc_seq, gc_seq + 1);
}

static int nf_tables_commit(struct net *net, struct sk_buff *skb)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_trans *trans, *next;
        unsigned int base_seq, gc_seq;
        LIST_HEAD(set_update_list);
        struct nft_trans_elem *te;
        struct nft_chain *chain;
        struct nft_table *table;
        LIST_HEAD(adl);
        int err;

        if (list_empty(&nft_net->commit_list)) {
                mutex_unlock(&nft_net->commit_mutex);
                return 0;
        }

        list_for_each_entry(trans, &nft_net->binding_list, binding_list) {
                switch (trans->msg_type) {
                case NFT_MSG_NEWSET:
                        if (!nft_trans_set_update(trans) &&
                            nft_set_is_anonymous(nft_trans_set(trans)) &&
                            !nft_trans_set_bound(trans)) {
                                pr_warn_once("nftables ruleset with unbound set\n");
                                return -EINVAL;
                        }
                        break;
                case NFT_MSG_NEWCHAIN:
                        if (!nft_trans_chain_update(trans) &&
                            nft_chain_binding(nft_trans_chain(trans)) &&
                            !nft_trans_chain_bound(trans)) {
                                pr_warn_once("nftables ruleset with unbound chain\n");
                                return -EINVAL;
                        }
                        break;
                }
        }

        /* 0. Validate ruleset, otherwise roll back for error reporting. */
        if (nf_tables_validate(net) < 0) {
                nft_net->validate_state = NFT_VALIDATE_DO;
                return -EAGAIN;
        }

        err = nft_flow_rule_offload_commit(net);
        if (err < 0)
                return err;

        /* 1.  Allocate space for next generation rules_gen_X[] */
        list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
                int ret;

                ret = nf_tables_commit_audit_alloc(&adl, trans->ctx.table);
                if (ret) {
                        nf_tables_commit_chain_prepare_cancel(net);
                        nf_tables_commit_audit_free(&adl);
                        return ret;
                }
                if (trans->msg_type == NFT_MSG_NEWRULE ||
                    trans->msg_type == NFT_MSG_DELRULE) {
                        chain = trans->ctx.chain;

                        ret = nf_tables_commit_chain_prepare(net, chain);
                        if (ret < 0) {
                                nf_tables_commit_chain_prepare_cancel(net);
                                nf_tables_commit_audit_free(&adl);
                                return ret;
                        }
                }
        }

        /* step 2.  Make rules_gen_X visible to packet path */
        list_for_each_entry(table, &nft_net->tables, list) {
                list_for_each_entry(chain, &table->chains, list)
                        nf_tables_commit_chain(net, chain);
        }

        /*
         * Bump generation counter, invalidate any dump in progress.
         * Cannot fail after this point.
         */
        base_seq = READ_ONCE(nft_net->base_seq);
        while (++base_seq == 0)
                ;

        WRITE_ONCE(nft_net->base_seq, base_seq);

        gc_seq = nft_gc_seq_begin(nft_net);

        /* step 3. Start new generation, rules_gen_X now in use. */
        net->nft.gencursor = nft_gencursor_next(net);

        list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
                nf_tables_commit_audit_collect(&adl, trans->ctx.table,
                                               trans->msg_type);
                switch (trans->msg_type) {
                case NFT_MSG_NEWTABLE:
                        if (nft_trans_table_update(trans)) {
                                if (!(trans->ctx.table->flags & __NFT_TABLE_F_UPDATE)) {
                                        nft_trans_destroy(trans);
                                        break;
                                }
                                if (trans->ctx.table->flags & NFT_TABLE_F_DORMANT)
                                        nf_tables_table_disable(net, trans->ctx.table);

                                trans->ctx.table->flags &= ~__NFT_TABLE_F_UPDATE;
                        } else {
                                nft_clear(net, trans->ctx.table);
                        }
                        nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE);
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_DELTABLE:
                case NFT_MSG_DESTROYTABLE:
                        list_del_rcu(&trans->ctx.table->list);
                        nf_tables_table_notify(&trans->ctx, trans->msg_type);
                        break;
                case NFT_MSG_NEWCHAIN:
                        if (nft_trans_chain_update(trans)) {
                                nft_chain_commit_update(trans);
                                nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN,
                                                       &nft_trans_chain_hooks(trans));
                                list_splice(&nft_trans_chain_hooks(trans),
                                            &nft_trans_basechain(trans)->hook_list);
                                /* trans destroyed after rcu grace period */
                        } else {
                                nft_chain_commit_drop_policy(trans);
                                nft_clear(net, trans->ctx.chain);
                                nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN, NULL);
                                nft_trans_destroy(trans);
                        }
                        break;
                case NFT_MSG_DELCHAIN:
                case NFT_MSG_DESTROYCHAIN:
                        if (nft_trans_chain_update(trans)) {
                                nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN,
                                                       &nft_trans_chain_hooks(trans));
                                if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT)) {
                                        nft_netdev_unregister_hooks(net,
                                                                    &nft_trans_chain_hooks(trans),
                                                                    true);
                                }
                        } else {
                                nft_chain_del(trans->ctx.chain);
                                nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN,
                                                       NULL);
                                nf_tables_unregister_hook(trans->ctx.net,
                                                          trans->ctx.table,
                                                          trans->ctx.chain);
                        }
                        break;
                case NFT_MSG_NEWRULE:
                        nft_clear(trans->ctx.net, nft_trans_rule(trans));
                        nf_tables_rule_notify(&trans->ctx,
                                              nft_trans_rule(trans),
                                              NFT_MSG_NEWRULE);
                        if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
                                nft_flow_rule_destroy(nft_trans_flow_rule(trans));

                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_DELRULE:
                case NFT_MSG_DESTROYRULE:
                        list_del_rcu(&nft_trans_rule(trans)->list);
                        nf_tables_rule_notify(&trans->ctx,
                                              nft_trans_rule(trans),
                                              trans->msg_type);
                        nft_rule_expr_deactivate(&trans->ctx,
                                                 nft_trans_rule(trans),
                                                 NFT_TRANS_COMMIT);

                        if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
                                nft_flow_rule_destroy(nft_trans_flow_rule(trans));
                        break;
                case NFT_MSG_NEWSET:
                        if (nft_trans_set_update(trans)) {
                                struct nft_set *set = nft_trans_set(trans);

                                WRITE_ONCE(set->timeout, nft_trans_set_timeout(trans));
                                WRITE_ONCE(set->gc_int, nft_trans_set_gc_int(trans));

                                if (nft_trans_set_size(trans))
                                        WRITE_ONCE(set->size, nft_trans_set_size(trans));
                        } else {
                                nft_clear(net, nft_trans_set(trans));
                                /* This avoids hitting -EBUSY when deleting the table
                                 * from the transaction.
                                 */
                                if (nft_set_is_anonymous(nft_trans_set(trans)) &&
                                    !list_empty(&nft_trans_set(trans)->bindings))
                                        nft_use_dec(&trans->ctx.table->use);
                        }
                        nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
                                             NFT_MSG_NEWSET, GFP_KERNEL);
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_DELSET:
                case NFT_MSG_DESTROYSET:
                        nft_trans_set(trans)->dead = 1;
                        list_del_rcu(&nft_trans_set(trans)->list);
                        nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
                                             trans->msg_type, GFP_KERNEL);
                        break;
                case NFT_MSG_NEWSETELEM:
                        te = (struct nft_trans_elem *)trans->data;

                        nft_setelem_activate(net, te->set, te->elem_priv);
                        nf_tables_setelem_notify(&trans->ctx, te->set,
                                                 te->elem_priv,
                                                 NFT_MSG_NEWSETELEM);
                        if (te->set->ops->commit &&
                            list_empty(&te->set->pending_update)) {
                                list_add_tail(&te->set->pending_update,
                                              &set_update_list);
                        }
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_DELSETELEM:
                case NFT_MSG_DESTROYSETELEM:
                        te = (struct nft_trans_elem *)trans->data;

                        nf_tables_setelem_notify(&trans->ctx, te->set,
                                                 te->elem_priv,
                                                 trans->msg_type);
                        nft_setelem_remove(net, te->set, te->elem_priv);
                        if (!nft_setelem_is_catchall(te->set, te->elem_priv)) {
                                atomic_dec(&te->set->nelems);
                                te->set->ndeact--;
                        }
                        if (te->set->ops->commit &&
                            list_empty(&te->set->pending_update)) {
                                list_add_tail(&te->set->pending_update,
                                              &set_update_list);
                        }
                        break;
                case NFT_MSG_NEWOBJ:
                        if (nft_trans_obj_update(trans)) {
                                nft_obj_commit_update(trans);
                                nf_tables_obj_notify(&trans->ctx,
                                                     nft_trans_obj(trans),
                                                     NFT_MSG_NEWOBJ);
                        } else {
                                nft_clear(net, nft_trans_obj(trans));
                                nf_tables_obj_notify(&trans->ctx,
                                                     nft_trans_obj(trans),
                                                     NFT_MSG_NEWOBJ);
                                nft_trans_destroy(trans);
                        }
                        break;
                case NFT_MSG_DELOBJ:
                case NFT_MSG_DESTROYOBJ:
                        nft_obj_del(nft_trans_obj(trans));
                        nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
                                             trans->msg_type);
                        break;
                case NFT_MSG_NEWFLOWTABLE:
                        if (nft_trans_flowtable_update(trans)) {
                                nft_trans_flowtable(trans)->data.flags =
                                        nft_trans_flowtable_flags(trans);
                                nf_tables_flowtable_notify(&trans->ctx,
                                                           nft_trans_flowtable(trans),
                                                           &nft_trans_flowtable_hooks(trans),
                                                           NFT_MSG_NEWFLOWTABLE);
                                list_splice(&nft_trans_flowtable_hooks(trans),
                                            &nft_trans_flowtable(trans)->hook_list);
                        } else {
                                nft_clear(net, nft_trans_flowtable(trans));
                                nf_tables_flowtable_notify(&trans->ctx,
                                                           nft_trans_flowtable(trans),
                                                           NULL,
                                                           NFT_MSG_NEWFLOWTABLE);
                        }
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_DELFLOWTABLE:
                case NFT_MSG_DESTROYFLOWTABLE:
                        if (nft_trans_flowtable_update(trans)) {
                                nf_tables_flowtable_notify(&trans->ctx,
                                                           nft_trans_flowtable(trans),
                                                           &nft_trans_flowtable_hooks(trans),
                                                           trans->msg_type);
                                nft_unregister_flowtable_net_hooks(net,
                                                                   &nft_trans_flowtable_hooks(trans));
                        } else {
                                list_del_rcu(&nft_trans_flowtable(trans)->list);
                                nf_tables_flowtable_notify(&trans->ctx,
                                                           nft_trans_flowtable(trans),
                                                           NULL,
                                                           trans->msg_type);
                                nft_unregister_flowtable_net_hooks(net,
                                                &nft_trans_flowtable(trans)->hook_list);
                        }
                        break;
                }
        }

        nft_set_commit_update(&set_update_list);

        nft_commit_notify(net, NETLINK_CB(skb).portid);
        nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
        nf_tables_commit_audit_log(&adl, nft_net->base_seq);

        nft_gc_seq_end(nft_net, gc_seq);
        nft_net->validate_state = NFT_VALIDATE_SKIP;
        nf_tables_commit_release(net);

        return 0;
}

static void nf_tables_module_autoload(struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_module_request *req, *next;
        LIST_HEAD(module_list);

        list_splice_init(&nft_net->module_list, &module_list);
        mutex_unlock(&nft_net->commit_mutex);
        list_for_each_entry_safe(req, next, &module_list, list) {
                request_module("%s", req->module);
                req->done = true;
        }
        mutex_lock(&nft_net->commit_mutex);
        list_splice(&module_list, &nft_net->module_list);
}

/*
 * Release the object carried by an aborted transaction, then free the
 * transaction itself.
 *
 * Only the NEW* message types own something that has to be destroyed here;
 * for update transactions on chains and flowtables only the pending hook list
 * is destroyed. Any other message type just has its transaction freed.
 */
static void nf_tables_abort_release(struct nft_trans *trans)
{
        struct nft_ctx *ctx = &trans->ctx;

        switch (trans->msg_type) {
        case NFT_MSG_NEWTABLE:
                nf_tables_table_destroy(ctx);
                break;
        case NFT_MSG_NEWCHAIN:
                if (!nft_trans_chain_update(trans))
                        nf_tables_chain_destroy(ctx);
                else
                        nft_hooks_destroy(&nft_trans_chain_hooks(trans));
                break;
        case NFT_MSG_NEWRULE:
                nf_tables_rule_destroy(ctx, nft_trans_rule(trans));
                break;
        case NFT_MSG_NEWSET:
                nft_set_destroy(ctx, nft_trans_set(trans));
                break;
        case NFT_MSG_NEWSETELEM:
                nft_set_elem_destroy(nft_trans_elem_set(trans),
                                     nft_trans_elem_priv(trans), true);
                break;
        case NFT_MSG_NEWOBJ:
                nft_obj_destroy(ctx, nft_trans_obj(trans));
                break;
        case NFT_MSG_NEWFLOWTABLE:
                if (!nft_trans_flowtable_update(trans))
                        nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
                else
                        nft_hooks_destroy(&nft_trans_flowtable_hooks(trans));
                break;
        default:
                /* Nothing besides the transaction itself to release. */
                break;
        }

        kfree(trans);
}

/*
 * Drain @set_update_list: unlink every set queued on it through its
 * pending_update node and invoke the set backend's ->abort() callback when
 * the backend provides one.
 */
static void nft_set_abort_update(struct list_head *set_update_list)
{
        struct nft_set *cur, *tmp;

        list_for_each_entry_safe(cur, tmp, set_update_list, pending_update) {
                /* Re-initialise the node so list_empty() is true again. */
                list_del_init(&cur->pending_update);

                if (cur->ops->abort)
                        cur->ops->abort(cur);
        }
}

/*
 * Roll back every pending transaction on the per-netns commit list.
 *
 * The commit list is walked in reverse order and each transaction is undone
 * according to its message type. Transactions that need no deferred release
 * are handed to nft_trans_destroy() right away; whatever is still on the list
 * after the walk is unlinked and passed to nf_tables_abort_release() once
 * synchronize_rcu() has returned.
 *
 * Sets whose backend implements ->abort() are collected on a local list while
 * set element transactions are undone, and processed by
 * nft_set_abort_update() after the walk.
 *
 * Returns -EAGAIN when @action is NFNL_ABORT_VALIDATE and
 * nf_tables_validate() fails, 0 otherwise. The rollback itself is performed
 * in both cases.
 */
static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_trans *trans, *next;
        LIST_HEAD(set_update_list);
        struct nft_trans_elem *te;
        int err = 0;

        /* A validation failure only changes the return value. */
        if (action == NFNL_ABORT_VALIDATE &&
            nf_tables_validate(net) < 0)
                err = -EAGAIN;

        list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list,
                                         list) {
                switch (trans->msg_type) {
                case NFT_MSG_NEWTABLE:
                        if (nft_trans_table_update(trans)) {
                                /* No update flag set: nothing to restore. */
                                if (!(trans->ctx.table->flags & __NFT_TABLE_F_UPDATE)) {
                                        nft_trans_destroy(trans);
                                        break;
                                }
                                /* Restore the dormant state recorded in the flags. */
                                if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_DORMANT) {
                                        nf_tables_table_disable(net, trans->ctx.table);
                                        trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
                                } else if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_AWAKEN) {
                                        trans->ctx.table->flags &= ~NFT_TABLE_F_DORMANT;
                                }
                                /* Drop the owner flag and clear the recorded nlpid. */
                                if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_ORPHAN) {
                                        trans->ctx.table->flags &= ~NFT_TABLE_F_OWNER;
                                        trans->ctx.table->nlpid = 0;
                                }
                                trans->ctx.table->flags &= ~__NFT_TABLE_F_UPDATE;
                                nft_trans_destroy(trans);
                        } else {
                                /* New table: unlink it; released after the grace period. */
                                list_del_rcu(&trans->ctx.table->list);
                        }
                        break;
                case NFT_MSG_DELTABLE:
                case NFT_MSG_DESTROYTABLE:
                        nft_clear(trans->ctx.net, trans->ctx.table);
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_NEWCHAIN:
                        if (nft_trans_chain_update(trans)) {
                                /* Hooks are only unregistered when the table is not dormant. */
                                if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT)) {
                                        nft_netdev_unregister_hooks(net,
                                                                    &nft_trans_chain_hooks(trans),
                                                                    true);
                                }
                                free_percpu(nft_trans_chain_stats(trans));
                                kfree(nft_trans_chain_name(trans));
                                nft_trans_destroy(trans);
                        } else {
                                /* Bound chains are not undone here. */
                                if (nft_trans_chain_bound(trans)) {
                                        nft_trans_destroy(trans);
                                        break;
                                }
                                nft_use_dec_restore(&trans->ctx.table->use);
                                nft_chain_del(trans->ctx.chain);
                                nf_tables_unregister_hook(trans->ctx.net,
                                                          trans->ctx.table,
                                                          trans->ctx.chain);
                        }
                        break;
                case NFT_MSG_DELCHAIN:
                case NFT_MSG_DESTROYCHAIN:
                        if (nft_trans_chain_update(trans)) {
                                /* Give the hooks back to the basechain. */
                                list_splice(&nft_trans_chain_hooks(trans),
                                            &nft_trans_basechain(trans)->hook_list);
                        } else {
                                nft_use_inc_restore(&trans->ctx.table->use);
                                nft_clear(trans->ctx.net, trans->ctx.chain);
                        }
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_NEWRULE:
                        if (nft_trans_rule_bound(trans)) {
                                nft_trans_destroy(trans);
                                break;
                        }
                        nft_use_dec_restore(&trans->ctx.chain->use);
                        list_del_rcu(&nft_trans_rule(trans)->list);
                        nft_rule_expr_deactivate(&trans->ctx,
                                                 nft_trans_rule(trans),
                                                 NFT_TRANS_ABORT);
                        if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
                                nft_flow_rule_destroy(nft_trans_flow_rule(trans));
                        break;
                case NFT_MSG_DELRULE:
                case NFT_MSG_DESTROYRULE:
                        nft_use_inc_restore(&trans->ctx.chain->use);
                        nft_clear(trans->ctx.net, nft_trans_rule(trans));
                        nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans));
                        if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)
                                nft_flow_rule_destroy(nft_trans_flow_rule(trans));

                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_NEWSET:
                        /* Set updates change nothing that has to be undone here. */
                        if (nft_trans_set_update(trans)) {
                                nft_trans_destroy(trans);
                                break;
                        }
                        nft_use_dec_restore(&trans->ctx.table->use);
                        if (nft_trans_set_bound(trans)) {
                                nft_trans_destroy(trans);
                                break;
                        }
                        nft_trans_set(trans)->dead = 1;
                        list_del_rcu(&nft_trans_set(trans)->list);
                        break;
                case NFT_MSG_DELSET:
                case NFT_MSG_DESTROYSET:
                        nft_use_inc_restore(&trans->ctx.table->use);
                        nft_clear(trans->ctx.net, nft_trans_set(trans));
                        if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
                                nft_map_activate(&trans->ctx, nft_trans_set(trans));

                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_NEWSETELEM:
                        if (nft_trans_elem_set_bound(trans)) {
                                nft_trans_destroy(trans);
                                break;
                        }
                        te = (struct nft_trans_elem *)trans->data;
                        nft_setelem_remove(net, te->set, te->elem_priv);
                        /* Catchall elements are not counted in nelems. */
                        if (!nft_setelem_is_catchall(te->set, te->elem_priv))
                                atomic_dec(&te->set->nelems);

                        /* Queue the set once for its backend ->abort() callback. */
                        if (te->set->ops->abort &&
                            list_empty(&te->set->pending_update)) {
                                list_add_tail(&te->set->pending_update,
                                              &set_update_list);
                        }
                        break;
                case NFT_MSG_DELSETELEM:
                case NFT_MSG_DESTROYSETELEM:
                        te = (struct nft_trans_elem *)trans->data;

                        /* Reactivate only if not already active in the next generation. */
                        if (!nft_setelem_active_next(net, te->set, te->elem_priv)) {
                                nft_setelem_data_activate(net, te->set, te->elem_priv);
                                nft_setelem_activate(net, te->set, te->elem_priv);
                        }
                        if (!nft_setelem_is_catchall(te->set, te->elem_priv))
                                te->set->ndeact--;

                        /* Queue the set once for its backend ->abort() callback. */
                        if (te->set->ops->abort &&
                            list_empty(&te->set->pending_update)) {
                                list_add_tail(&te->set->pending_update,
                                              &set_update_list);
                        }
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_NEWOBJ:
                        if (nft_trans_obj_update(trans)) {
                                /* Discard the replacement object of the update. */
                                nft_obj_destroy(&trans->ctx, nft_trans_obj_newobj(trans));
                                nft_trans_destroy(trans);
                        } else {
                                nft_use_dec_restore(&trans->ctx.table->use);
                                nft_obj_del(nft_trans_obj(trans));
                        }
                        break;
                case NFT_MSG_DELOBJ:
                case NFT_MSG_DESTROYOBJ:
                        nft_use_inc_restore(&trans->ctx.table->use);
                        nft_clear(trans->ctx.net, nft_trans_obj(trans));
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_NEWFLOWTABLE:
                        if (nft_trans_flowtable_update(trans)) {
                                nft_unregister_flowtable_net_hooks(net,
                                                &nft_trans_flowtable_hooks(trans));
                        } else {
                                nft_use_dec_restore(&trans->ctx.table->use);
                                list_del_rcu(&nft_trans_flowtable(trans)->list);
                                nft_unregister_flowtable_net_hooks(net,
                                                &nft_trans_flowtable(trans)->hook_list);
                        }
                        break;
                case NFT_MSG_DELFLOWTABLE:
                case NFT_MSG_DESTROYFLOWTABLE:
                        if (nft_trans_flowtable_update(trans)) {
                                /* Give the hooks back to the flowtable. */
                                list_splice(&nft_trans_flowtable_hooks(trans),
                                            &nft_trans_flowtable(trans)->hook_list);
                        } else {
                                nft_use_inc_restore(&trans->ctx.table->use);
                                nft_clear(trans->ctx.net, nft_trans_flowtable(trans));
                        }
                        nft_trans_destroy(trans);
                        break;
                }
        }

        /* Run the backend ->abort() callbacks for the sets queued above. */
        nft_set_abort_update(&set_update_list);

        /*
         * Wait for a grace period before releasing the objects that were
         * unlinked with list_del_rcu() above.
         */
        synchronize_rcu();

        /* Release whatever is still left on the commit list. */
        list_for_each_entry_safe_reverse(trans, next,
                                         &nft_net->commit_list, list) {
                nft_trans_list_del(trans);
                nf_tables_abort_release(trans);
        }

        return err;
}

/*
 * nfnetlink ->abort() callback: roll back the pending transaction batch.
 *
 * The rollback runs between nft_gc_seq_begin() and nft_gc_seq_end(). Pending
 * module requests are then either loaded (NFNL_ABORT_AUTOLOAD) or cleaned up,
 * and finally the commit mutex is released.
 *
 * Returns the result of __nf_tables_abort().
 */
static int nf_tables_abort(struct net *net, struct sk_buff *skb,
                           enum nfnl_abort_action action)
{
        struct nftables_pernet *pernet = nft_pernet(net);
        unsigned int seq;
        int err;

        seq = nft_gc_seq_begin(pernet);
        err = __nf_tables_abort(net, action);
        nft_gc_seq_end(pernet, seq);

        /* The rollback is expected to have emptied the commit list. */
        WARN_ON_ONCE(!list_empty(&pernet->commit_list));

        /* module autoload needs to happen after GC sequence update because it
         * temporarily releases and grabs mutex again.
         */
        if (action != NFNL_ABORT_AUTOLOAD)
                nf_tables_module_autoload_cleanup(net);
        else
                nf_tables_module_autoload(net);

        mutex_unlock(&pernet->commit_mutex);

        return err;
}

/*
 * nfnetlink ->valid_genid() callback.
 *
 * Takes the commit mutex, refreshes the per-netns timestamp and checks
 * @genid: zero always matches, any other value must equal the current
 * base_seq.
 *
 * Returns true with the commit mutex still held on a match, false with the
 * mutex released otherwise.
 */
static bool nf_tables_valid_genid(struct net *net, u32 genid)
{
        struct nftables_pernet *pernet = nft_pernet(net);

        mutex_lock(&pernet->commit_mutex);
        pernet->tstamp = get_jiffies_64();

        /* commit mutex has to be released by commit or abort function */
        if (genid == 0 || pernet->base_seq == genid)
                return true;

        mutex_unlock(&pernet->commit_mutex);
        return false;
}

/*
 * nfnetlink subsystem descriptor for nf_tables: message callback table plus
 * the batch commit, abort and generation-id validation handlers.
 * nf_tables_valid_genid() returns with the commit mutex held on success;
 * the abort handler releases it (and, per the comment in
 * nf_tables_valid_genid(), so does the commit handler).
 */
static const struct nfnetlink_subsystem nf_tables_subsys = {
        .name                = "nf_tables",
        .subsys_id        = NFNL_SUBSYS_NFTABLES,
        .cb_count        = NFT_MSG_MAX,
        .cb                = nf_tables_cb,
        .commit                = nf_tables_commit,
        .abort                = nf_tables_abort,
        .valid_genid        = nf_tables_valid_genid,
        .owner                = THIS_MODULE,
};

int nft_chain_validate_dependency(const struct nft_chain *chain,
                                  enum nft_chain_types type)
{
        const struct nft_base_chain *basechain;

        if (nft_is_base_chain(chain)) {
                basechain = nft_base_chain(chain);
                if (basechain->type->type != type)
                        return -EOPNOTSUPP;
        }
        return 0;
}
EXPORT_SYMBOL_GPL(nft_chain_validate_dependency);

/*
 * Check that a base chain is attached to one of the permitted hooks.
 *
 * @hook_flags is a bitmask indexed by hook number. Returns -EOPNOTSUPP if
 * @chain is a base chain whose hook number is not set in @hook_flags, and 0
 * for non-base chains or permitted hooks.
 */
int nft_chain_validate_hooks(const struct nft_chain *chain,
                             unsigned int hook_flags)
{
        struct nft_base_chain *base;

        /* Non-base chains are not attached to a hook themselves. */
        if (!nft_is_base_chain(chain))
                return 0;

        base = nft_base_chain(chain);
        if (!((1 << base->ops.hooknum) & hook_flags))
                return -EOPNOTSUPP;

        return 0;
}
EXPORT_SYMBOL_GPL(nft_chain_validate_hooks);

/*
 * Loop detection - walk through the ruleset beginning at the destination chain
 * of a new jump until either the source chain is reached (loop) or all
 * reachable chains have been traversed.
 *
 * The loop check is performed whenever a new jump verdict is added to an
 * expression or verdict map or a verdict map is bound to a new chain.
 */

static int nf_tables_check_loops(const struct nft_ctx *ctx,
                                 const struct nft_chain *chain);

/*
 * Run loop detection for the verdict stored in a set element extension.
 *
 * Jump and goto verdicts are followed into their destination chain via
 * nf_tables_check_loops(); every other verdict code yields 0.
 */
static int nft_check_loops(const struct nft_ctx *ctx,
                           const struct nft_set_ext *ext)
{
        const struct nft_data *verdict = nft_set_ext_data(ext);

        if (verdict->verdict.code == NFT_JUMP ||
            verdict->verdict.code == NFT_GOTO)
                return nf_tables_check_loops(ctx, verdict->verdict.chain);

        return 0;
}

static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
                                        struct nft_set *set,
                                        const struct nft_set_iter *iter,
                                        struct nft_elem_priv *elem_priv)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        if (!nft_set_elem_active(ext, iter->genmask))
                return 0;

        if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
            *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
                return 0;

        return nft_check_loops(ctx, ext);
}

/*
 * Run loop detection on the catchall elements of @set that are active in the
 * next generation.
 *
 * Stops at the first negative result from nft_check_loops() and returns it;
 * otherwise returns the result of the last check performed, or 0 when no
 * element was checked.
 */
static int nft_set_catchall_loops(const struct nft_ctx *ctx,
                                  struct nft_set *set)
{
        struct nft_set_elem_catchall *entry;
        u8 next_gen = nft_genmask_next(ctx->net);
        struct nft_set_ext *ext;
        int err = 0;

        list_for_each_entry_rcu(entry, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, entry->elem);
                if (nft_set_elem_active(ext, next_gen)) {
                        err = nft_check_loops(ctx, ext);
                        if (err < 0)
                                break;
                }
        }

        return err;
}

/*
 * Recursive loop detection starting at @chain.
 *
 * Returns -ELOOP if @chain is the chain the context refers to, -EINTR if a
 * fatal signal is pending, a negative error propagated from a nested check,
 * or 0 if no loop was found.
 *
 * Two sources of jumps are followed:
 *  - immediate expressions in the rules of @chain that store a jump/goto
 *    verdict in the verdict register;
 *  - verdict maps of the table that are bound to @chain, including their
 *    catchall elements.
 */
static int nf_tables_check_loops(const struct nft_ctx *ctx,
                                 const struct nft_chain *chain)
{
        const struct nft_expr *expr, *last;
        struct nft_set_binding *binding;
        const struct nft_rule *rule;
        struct nft_set_iter iter;
        struct nft_set *set;

        /* Arriving back at the originating chain means there is a loop. */
        if (chain == ctx->chain)
                return -ELOOP;

        if (fatal_signal_pending(current))
                return -EINTR;

        /* Follow jump/goto verdicts issued by immediate expressions. */
        list_for_each_entry(rule, &chain->rules, list) {
                nft_rule_for_each_expr(expr, last, rule) {
                        struct nft_immediate_expr *imm;
                        const struct nft_data *verdict;
                        int err;

                        if (strcmp(expr->ops->type->name, "immediate") != 0)
                                continue;

                        imm = nft_expr_priv(expr);
                        if (imm->dreg != NFT_REG_VERDICT)
                                continue;

                        verdict = &imm->data;
                        if (verdict->verdict.code != NFT_JUMP &&
                            verdict->verdict.code != NFT_GOTO)
                                continue;

                        err = nf_tables_check_loops(ctx,
                                                    verdict->verdict.chain);
                        if (err < 0)
                                return err;
                }
        }

        /* Follow verdicts stored in verdict maps bound to this chain. */
        list_for_each_entry(set, &ctx->table->sets, list) {
                if (!nft_is_active_next(ctx->net, set) ||
                    !(set->flags & NFT_SET_MAP) ||
                    set->dtype != NFT_DATA_VERDICT)
                        continue;

                list_for_each_entry(binding, &set->bindings, list) {
                        if (!(binding->flags & NFT_SET_MAP) ||
                            binding->chain != chain)
                                continue;

                        iter.fn = nf_tables_loop_check_setelem;
                        iter.type = NFT_ITER_UPDATE;
                        iter.genmask = nft_genmask_next(ctx->net);
                        iter.skip = 0;
                        iter.count = 0;
                        iter.err = 0;

                        set->ops->walk(ctx, set, &iter);

                        /* The walk went fine: check the catchall elements. */
                        if (iter.err == 0)
                                iter.err = nft_set_catchall_loops(ctx, set);

                        if (iter.err < 0)
                                return iter.err;
                }
        }

        return 0;
}

/**
 *        nft_parse_u32_check - fetch u32 attribute and check for maximum value
 *
 *        @attr: netlink attribute to fetch value from
 *        @max: maximum value to be stored in dest
 *        @dest: pointer to the variable
 *
 *        Read a big-endian u32 from the netlink attribute, convert it to host
 *        byte order and compare it against @max. Returns -ERANGE if the value
 *        exceeds @max, in which case @dest is left untouched. Otherwise the
 *        value is stored in @dest and 0 is returned.
 */
int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest)
{
        u32 host_val = ntohl(nla_get_be32(attr));

        if (host_val > max)
                return -ERANGE;

        *dest = host_val;
        return 0;
}
EXPORT_SYMBOL_GPL(nft_parse_u32_check);

/*
 * Translate a register number received over netlink into the internal
 * register index, in units of NFT_REG32_SIZE.
 *
 * Numbers in the NFT_REG_VERDICT..NFT_REG_4 range are scaled by
 * NFT_REG_SIZE / NFT_REG32_SIZE; numbers in the NFT_REG32_00..NFT_REG32_15
 * range are rebased so that they start right after the first
 * NFT_REG_SIZE-sized register. Anything else is rejected with -ERANGE and
 * @preg is left untouched.
 */
static int nft_parse_register(const struct nlattr *attr, u32 *preg)
{
        unsigned int nlreg = ntohl(nla_get_be32(attr));

        switch (nlreg) {
        case NFT_REG_VERDICT...NFT_REG_4:
                *preg = nlreg * NFT_REG_SIZE / NFT_REG32_SIZE;
                return 0;
        case NFT_REG32_00...NFT_REG32_15:
                *preg = nlreg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00;
                return 0;
        default:
                return -ERANGE;
        }
}

/**
 *        nft_dump_register - dump a register value to a netlink attribute
 *
 *        @skb: socket buffer
 *        @attr: attribute number
 *        @reg: register number
 *
 *        Construct a netlink attribute containing the register number. For
 *        compatibility reasons, register numbers being a multiple of 4 are
 *        translated to the corresponding 128 bit register numbers. All other
 *        register numbers are emitted relative to NFT_REG32_00.
 *
 *        Returns the result of nla_put_be32().
 */
int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg)
{
        /* Number of 32 bit registers per NFT_REG_SIZE-sized register. */
        const unsigned int ratio = NFT_REG_SIZE / NFT_REG32_SIZE;
        unsigned int nlreg;

        if (reg % ratio)
                nlreg = reg - ratio + NFT_REG32_00;
        else
                nlreg = reg / ratio;

        return nla_put_be32(skb, attr, htonl(nlreg));
}
EXPORT_SYMBOL_GPL(nft_dump_register);

/*
 * Validate a register used as a load source.
 *
 * Returns -EINVAL if @reg lies below the first data register or @len is
 * zero, -ERANGE if reading @len bytes starting at @reg would run past the
 * end of the register data area, and 0 otherwise.
 */
static int nft_validate_register_load(enum nft_registers reg, unsigned int len)
{
        if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE || len == 0)
                return -EINVAL;

        if (reg * NFT_REG32_SIZE + len > sizeof_field(struct nft_regs, data))
                return -ERANGE;

        return 0;
}

/*
 * Parse a source register from a netlink attribute and validate it for a
 * load of @len bytes.
 *
 * On success the internal register index is stored in @sreg and 0 is
 * returned. On failure the negative error from parsing or validation is
 * returned and @sreg is left untouched.
 */
int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len)
{
        u32 idx;
        int rc;

        rc = nft_parse_register(attr, &idx);
        if (rc >= 0)
                rc = nft_validate_register_load(idx, len);
        if (rc < 0)
                return rc;

        *sreg = idx;
        return 0;
}
EXPORT_SYMBOL_GPL(nft_parse_register_load);

/*
 * Validate a register used as a store destination.
 *
 * Verdict register: the data type must be NFT_DATA_VERDICT, and if @data
 * holds a jump or goto verdict the destination chain is checked for loops.
 *
 * Any other register: it must not lie below the first data register, @len
 * must be non-zero, the store must fit into the register data area, and
 * @data, if given, must be of type NFT_DATA_VALUE.
 *
 * Returns 0 on success, -EINVAL or -ERANGE on a failed check, or the negative
 * error reported by the loop detection.
 */
static int nft_validate_register_store(const struct nft_ctx *ctx,
                                       enum nft_registers reg,
                                       const struct nft_data *data,
                                       enum nft_data_types type,
                                       unsigned int len)
{
        int err;

        if (reg != NFT_REG_VERDICT) {
                if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE)
                        return -EINVAL;
                if (len == 0)
                        return -EINVAL;
                if (reg * NFT_REG32_SIZE + len >
                    sizeof_field(struct nft_regs, data))
                        return -ERANGE;
                if (data != NULL && type != NFT_DATA_VALUE)
                        return -EINVAL;

                return 0;
        }

        if (type != NFT_DATA_VERDICT)
                return -EINVAL;

        /* Only jump/goto verdicts can introduce a loop. */
        if (data == NULL)
                return 0;
        if (data->verdict.code != NFT_GOTO &&
            data->verdict.code != NFT_JUMP)
                return 0;

        err = nf_tables_check_loops(ctx, data->verdict.chain);
        if (err < 0)
                return err;

        return 0;
}

/*
 * Parse the destination register carried in @attr and validate a store of
 * @len bytes of @type data (optionally described by @data) into it. The
 * register number is written to @dreg only on success.
 *
 * Returns 0 on success, otherwise the negative error reported by
 * nft_parse_register() or nft_validate_register_store().
 */
int nft_parse_register_store(const struct nft_ctx *ctx,
                             const struct nlattr *attr, u8 *dreg,
                             const struct nft_data *data,
                             enum nft_data_types type, unsigned int len)
{
        u32 regno;
        int ret = nft_parse_register(attr, &regno);

        if (ret < 0)
                return ret;

        ret = nft_validate_register_store(ctx, regno, data, type, len);
        if (ret < 0)
                return ret;

        *dreg = regno;
        return 0;
}
EXPORT_SYMBOL_GPL(nft_parse_register_store);

/*
 * Netlink attribute policy for the nested verdict attributes parsed by
 * nft_verdict_init(): a 32-bit verdict code, an optional chain name of at
 * most NFT_CHAIN_MAXNAMELEN - 1 bytes, and an optional 32-bit chain id.
 */
static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = {
        [NFTA_VERDICT_CODE]        = { .type = NLA_U32 },
        [NFTA_VERDICT_CHAIN]        = { .type = NLA_STRING,
                                    .len = NFT_CHAIN_MAXNAMELEN - 1 },
        [NFTA_VERDICT_CHAIN_ID]        = { .type = NLA_U32 },
};

/*
 * Initialize @data as a verdict from the nested netlink attribute @nla.
 *
 * NFTA_VERDICT_CODE is mandatory. For NFT_JUMP/NFT_GOTO the target chain is
 * looked up by name (NFTA_VERDICT_CHAIN) or, failing that, by id
 * (NFTA_VERDICT_CHAIN_ID); base chains, bound chains and - for set elements -
 * chains flagged NFT_CHAIN_BINDING are rejected. On success the chain's use
 * counter has been incremented and @desc->len is set to the verdict size.
 *
 * Returns 0 on success or a negative errno.
 */
static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
                            struct nft_data_desc *desc, const struct nlattr *nla)
{
        u8 genmask = nft_genmask_next(ctx->net);
        struct nlattr *attrs[NFTA_VERDICT_MAX + 1];
        struct nft_chain *target;
        int ret;

        ret = nla_parse_nested_deprecated(attrs, NFTA_VERDICT_MAX, nla,
                                          nft_verdict_policy, NULL);
        if (ret < 0)
                return ret;

        if (!attrs[NFTA_VERDICT_CODE])
                return -EINVAL;

        /* Clear everything, padding included, so memcmp() on it is reliable. */
        memset(data, 0, sizeof(*data));
        data->verdict.code = ntohl(nla_get_be32(attrs[NFTA_VERDICT_CODE]));

        switch (data->verdict.code) {
        case NF_ACCEPT:
        case NF_DROP:
        case NF_QUEUE:
        case NFT_CONTINUE:
        case NFT_BREAK:
        case NFT_RETURN:
                /* These verdicts carry no chain reference. */
                break;
        case NFT_JUMP:
        case NFT_GOTO:
                if (attrs[NFTA_VERDICT_CHAIN])
                        target = nft_chain_lookup(ctx->net, ctx->table,
                                                  attrs[NFTA_VERDICT_CHAIN],
                                                  genmask);
                else if (attrs[NFTA_VERDICT_CHAIN_ID])
                        target = nft_chain_lookup_byid(ctx->net, ctx->table,
                                                       attrs[NFTA_VERDICT_CHAIN_ID],
                                                       genmask);
                else
                        return -EINVAL;

                if (IS_ERR(target))
                        return PTR_ERR(target);
                if (nft_is_base_chain(target))
                        return -EOPNOTSUPP;
                if (nft_chain_is_bound(target))
                        return -EINVAL;
                if ((desc->flags & NFT_DATA_DESC_SETELEM) &&
                    (target->flags & NFT_CHAIN_BINDING))
                        return -EINVAL;
                if (!nft_use_inc(&target->use))
                        return -EMFILE;

                data->verdict.chain = target;
                break;
        default:
                return -EINVAL;
        }

        desc->len = sizeof(data->verdict);
        return 0;
}

/*
 * Undo nft_verdict_init(): for NFT_JUMP/NFT_GOTO verdicts, drop the use
 * count taken on the target chain. Other verdict codes need no cleanup.
 */
static void nft_verdict_uninit(const struct nft_data *data)
{
        if (data->verdict.code != NFT_JUMP &&
            data->verdict.code != NFT_GOTO)
                return;

        nft_use_dec(&data->verdict.chain->use);
}

/*
 * Emit verdict @v into @skb as a nested attribute of type @type: the verdict
 * code in network byte order and, for NFT_JUMP/NFT_GOTO, the name of the
 * target chain.
 *
 * Returns 0 on success, -1 if any attribute could not be added.
 */
int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v)
{
        struct nlattr *nest = nla_nest_start_noflag(skb, type);

        if (!nest)
                return -1;

        if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(v->code)))
                return -1;

        if ((v->code == NFT_JUMP || v->code == NFT_GOTO) &&
            nla_put_string(skb, NFTA_VERDICT_CHAIN, v->chain->name))
                return -1;

        nla_nest_end(skb, nest);
        return 0;
}

/*
 * Copy the payload of @nla into @data->data.
 *
 * The payload must be non-empty (-EINVAL) and no larger than @desc->size
 * (-EOVERFLOW). If @desc->len is already set the payload length has to match
 * it (-EINVAL); otherwise @desc->len is set to the payload length.
 */
static int nft_value_init(const struct nft_ctx *ctx,
                          struct nft_data *data, struct nft_data_desc *desc,
                          const struct nlattr *nla)
{
        unsigned int payload = nla_len(nla);

        if (!payload)
                return -EINVAL;
        if (payload > desc->size)
                return -EOVERFLOW;

        if (!desc->len)
                desc->len = payload;
        else if (desc->len != payload)
                return -EINVAL;

        nla_memcpy(data->data, nla, payload);

        return 0;
}

/*
 * Add @len bytes of @data->data to @skb as an NFTA_DATA_VALUE attribute and
 * hand back whatever nla_put() returned.
 */
static int nft_value_dump(struct sk_buff *skb, const struct nft_data *data,
                          unsigned int len)
{
        int ret = nla_put(skb, NFTA_DATA_VALUE, len, data->data);

        return ret;
}

/*
 * Netlink attribute policy used by nft_data_init(): the data is either a
 * binary value (NFTA_DATA_VALUE) or a nested verdict (NFTA_DATA_VERDICT).
 */
static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
        [NFTA_DATA_VALUE]        = { .type = NLA_BINARY },
        [NFTA_DATA_VERDICT]        = { .type = NLA_NESTED },
};

/**
 *        nft_data_init - parse nf_tables data netlink attributes
 *
 *        @ctx: context of the expression using the data
 *        @data: destination struct nft_data
 *        @desc: data description
 *        @nla: netlink attribute containing data
 *
 *        Parse the nested data attribute and initialize a struct nft_data
 *        from either its NFTA_DATA_VALUE or its NFTA_DATA_VERDICT member.
 *        The attribute found must match the type requested in @desc->type,
 *        and @desc->size must be non-zero. The length of the data is
 *        returned in the data description.
 *
 *        Passing NULL for @ctx restricts the accepted data to
 *        NFT_DATA_VALUE: verdict attributes are then rejected with -EINVAL.
 *
 *        Returns 0 on success or a negative errno.
 */
int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
                  struct nft_data_desc *desc, const struct nlattr *nla)
{
        struct nlattr *attrs[NFTA_DATA_MAX + 1];
        int ret;

        if (WARN_ON_ONCE(!desc->size))
                return -EINVAL;

        ret = nla_parse_nested_deprecated(attrs, NFTA_DATA_MAX, nla,
                                          nft_data_policy, NULL);
        if (ret < 0)
                return ret;

        /* A plain value takes precedence over a verdict attribute. */
        if (attrs[NFTA_DATA_VALUE]) {
                if (desc->type != NFT_DATA_VALUE)
                        return -EINVAL;

                return nft_value_init(ctx, data, desc, attrs[NFTA_DATA_VALUE]);
        }

        if (!attrs[NFTA_DATA_VERDICT] || ctx == NULL)
                return -EINVAL;
        if (desc->type != NFT_DATA_VERDICT)
                return -EINVAL;

        return nft_verdict_init(ctx, data, desc, attrs[NFTA_DATA_VERDICT]);
}
EXPORT_SYMBOL_GPL(nft_data_init);

/**
 *        nft_data_release - release a nft_data item
 *
 *        @data: struct nft_data to release
 *        @type: type of data
 *
 *        Types below NFT_DATA_VERDICT need no cleanup and are ignored.
 *        NFT_DATA_VERDICT items are handed to nft_verdict_uninit(); any
 *        other type triggers a warning.
 */
void nft_data_release(const struct nft_data *data, enum nft_data_types type)
{
        if (type < NFT_DATA_VERDICT)
                return;

        if (type == NFT_DATA_VERDICT)
                nft_verdict_uninit(data);
        else
                WARN_ON(1);
}
EXPORT_SYMBOL_GPL(nft_data_release);

/*
 * Dump @data of the given @type into @skb inside a nested attribute @attr.
 *
 * Returns -1 if the nest could not be started; otherwise the result of the
 * value/verdict dump helper, or -EINVAL (with a warning) for an unknown
 * @type. The nest is closed in all of those latter cases.
 */
int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
                  enum nft_data_types type, unsigned int len)
{
        struct nlattr *nest = nla_nest_start_noflag(skb, attr);
        int ret;

        if (!nest)
                return -1;

        if (type == NFT_DATA_VALUE) {
                ret = nft_value_dump(skb, data, len);
        } else if (type == NFT_DATA_VERDICT) {
                ret = nft_verdict_dump(skb, NFTA_DATA_VERDICT, &data->verdict);
        } else {
                ret = -EINVAL;
                WARN_ON(1);
        }

        nla_nest_end(skb, nest);
        return ret;
}
EXPORT_SYMBOL_GPL(nft_data_dump);

/*
 * Tear down the base chain referenced by @ctx->chain.
 *
 * Warns and does nothing if the chain is not a base chain. Otherwise the
 * hook is unregistered, every rule is unlinked and released (dropping the
 * chain's use count once per rule), and finally the chain itself is removed,
 * the table's use count is dropped and the chain is destroyed.
 *
 * Always returns 0.
 */
int __nft_release_basechain(struct nft_ctx *ctx)
{
        struct nft_rule *rule, *nr;

        if (WARN_ON(!nft_is_base_chain(ctx->chain)))
                return 0;

        /* Unregister the hook before the rules are taken apart. */
        nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain);
        /* _safe iteration: each rule is unlinked while walking the list. */
        list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) {
                list_del(&rule->list);
                nft_use_dec(&ctx->chain->use);
                nf_tables_rule_release(ctx, rule);
        }
        nft_chain_del(ctx->chain);
        nft_use_dec(&ctx->table->use);
        nf_tables_chain_destroy(ctx);

        return 0;
}
EXPORT_SYMBOL_GPL(__nft_release_basechain);

static void __nft_release_hook(struct net *net, struct nft_table *table)
{
        struct nft_flowtable *flowtable;
        struct nft_chain *chain;

        list_for_each_entry(chain, &table->chains, list)
                __nf_tables_unregister_hook(net, table, chain, true);
        list_for_each_entry(flowtable, &table->flowtables, list)
                __nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list,
                                                     true);
}

/*
 * Release the hooks of every table in @net that has no owner; tables for
 * which nft_table_has_owner() is true are left untouched.
 */
static void __nft_release_hooks(struct net *net)
{
        struct nftables_pernet *pernet = nft_pernet(net);
        struct nft_table *tbl;

        list_for_each_entry(tbl, &pernet->tables, list) {
                if (!nft_table_has_owner(tbl))
                        __nft_release_hook(net, tbl);
        }
}

/*
 * Release everything contained in @table and then the table itself.
 *
 * The teardown runs in this order: rules of all chains (chains for which
 * nft_chain_binding() is true are skipped in this pass), flowtables, sets,
 * objects, chains, and finally the table. Each removed flowtable, set,
 * object and chain drops the table's use count; each removed rule drops its
 * chain's use count.
 */
static void __nft_release_table(struct net *net, struct nft_table *table)
{
        struct nft_flowtable *flowtable, *nf;
        struct nft_chain *chain, *nc;
        struct nft_object *obj, *ne;
        struct nft_rule *rule, *nr;
        struct nft_set *set, *ns;
        struct nft_ctx ctx = {
                .net        = net,
                .family        = NFPROTO_NETDEV,
        };

        /* The family set in the initializer is replaced by the table's own. */
        ctx.family = table->family;
        ctx.table = table;
        list_for_each_entry(chain, &table->chains, list) {
                if (nft_chain_binding(chain))
                        continue;

                ctx.chain = chain;
                list_for_each_entry_safe(rule, nr, &chain->rules, list) {
                        list_del(&rule->list);
                        nft_use_dec(&chain->use);
                        nf_tables_rule_release(&ctx, rule);
                }
        }
        list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) {
                list_del(&flowtable->list);
                nft_use_dec(&table->use);
                nf_tables_flowtable_destroy(flowtable);
        }
        list_for_each_entry_safe(set, ns, &table->sets, list) {
                list_del(&set->list);
                nft_use_dec(&table->use);
                /* Maps and object maps are deactivated before destruction. */
                if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
                        nft_map_deactivate(&ctx, set);

                nft_set_destroy(&ctx, set);
        }
        list_for_each_entry_safe(obj, ne, &table->objects, list) {
                nft_obj_del(obj);
                nft_use_dec(&table->use);
                nft_obj_destroy(&ctx, obj);
        }
        /* Chains go last among the table's contents, rules were released above. */
        list_for_each_entry_safe(chain, nc, &table->chains, list) {
                ctx.chain = chain;
                nft_chain_del(chain);
                nft_use_dec(&table->use);
                nf_tables_chain_destroy(&ctx);
        }
        nf_tables_table_destroy(&ctx);
}

/*
 * Unlink and release every table in @net that has no owner. Tables for
 * which nft_table_has_owner() is true stay on the list.
 */
static void __nft_release_tables(struct net *net)
{
        struct nftables_pernet *pernet = nft_pernet(net);
        struct nft_table *tbl, *next;

        list_for_each_entry_safe(tbl, next, &pernet->tables, list) {
                if (!nft_table_has_owner(tbl)) {
                        list_del(&tbl->list);
                        __nft_release_table(net, tbl);
                }
        }
}

/*
 * Netlink notifier callback: clean up tables owned by a netlink socket that
 * is being released.
 *
 * Only NETLINK_URELEASE events for NETLINK_NETFILTER are handled. Under the
 * per-net commit mutex, every table whose owner matches the released portid
 * is either disowned (NFT_TABLE_F_PERSIST set: only the owner flag is
 * cleared) or has its hooks released, is unlinked from the table list and
 * queued for destruction. Queued tables are destroyed in batches of at most
 * ARRAY_SIZE(to_delete) after synchronize_rcu(); when a batch fills up, the
 * table list is scanned again.
 *
 * Always returns NOTIFY_DONE.
 */
static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event,
                            void *ptr)
{
        struct nft_table *table, *to_delete[8];
        struct nftables_pernet *nft_net;
        struct netlink_notify *n = ptr;
        struct net *net = n->net;
        unsigned int deleted;
        bool restart = false;
        unsigned int gc_seq;

        if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER)
                return NOTIFY_DONE;

        nft_net = nft_pernet(net);
        deleted = 0;
        mutex_lock(&nft_net->commit_mutex);

        gc_seq = nft_gc_seq_begin(nft_net);

        /* Flush pending destroy work before tables are taken apart. */
        if (!list_empty(&nf_tables_destroy_list))
                nf_tables_trans_destroy_flush_work();
again:
        list_for_each_entry(table, &nft_net->tables, list) {
                if (nft_table_has_owner(table) &&
                    n->portid == table->nlpid) {
                        /* Persistent tables survive; they only lose their owner. */
                        if (table->flags & NFT_TABLE_F_PERSIST) {
                                table->flags &= ~NFT_TABLE_F_OWNER;
                                continue;
                        }
                        __nft_release_hook(net, table);
                        list_del_rcu(&table->list);
                        to_delete[deleted++] = table;
                        /* Batch is full: stop scanning and process it. */
                        if (deleted >= ARRAY_SIZE(to_delete))
                                break;
                }
        }
        if (deleted) {
                /* A full batch means the scan was cut short and must resume. */
                restart = deleted >= ARRAY_SIZE(to_delete);
                /*
                 * Tables were unlinked with list_del_rcu(); wait for a grace
                 * period before releasing them (presumably so that concurrent
                 * RCU readers of the table list are done with them).
                 */
                synchronize_rcu();
                while (deleted)
                        __nft_release_table(net, to_delete[--deleted]);

                if (restart)
                        goto again;
        }
        nft_gc_seq_end(nft_net, gc_seq);

        mutex_unlock(&nft_net->commit_mutex);

        return NOTIFY_DONE;
}

/*
 * Notifier block dispatching to nft_rcv_nl_event(); registered with
 * netlink_register_notifier() in nf_tables_module_init().
 */
static struct notifier_block nft_nl_notifier = {
        .notifier_call  = nft_rcv_nl_event,
};

/*
 * Per-netns init: set the sequence counters and validation state to their
 * starting values, initialize the commit mutex and make all per-net lists
 * empty. Always returns 0.
 */
static int __net_init nf_tables_init_net(struct net *net)
{
        struct nftables_pernet *pernet = nft_pernet(net);

        pernet->base_seq = 1;
        pernet->gc_seq = 0;
        pernet->validate_state = NFT_VALIDATE_SKIP;
        mutex_init(&pernet->commit_mutex);

        INIT_LIST_HEAD(&pernet->tables);
        INIT_LIST_HEAD(&pernet->commit_list);
        INIT_LIST_HEAD(&pernet->binding_list);
        INIT_LIST_HEAD(&pernet->module_list);
        INIT_LIST_HEAD(&pernet->notify_list);

        return 0;
}

/*
 * Per-netns pre-exit: release the hooks of all ownerless tables while
 * holding the commit mutex.
 */
static void __net_exit nf_tables_pre_exit_net(struct net *net)
{
        struct nftables_pernet *pernet = nft_pernet(net);

        mutex_lock(&pernet->commit_mutex);
        __nft_release_hooks(net);
        mutex_unlock(&pernet->commit_mutex);
}

/*
 * Per-netns exit: under the commit mutex and inside a GC sequence
 * begin/end pair, clean up any pending module autoload requests and release
 * all ownerless tables. Warns if the commit list is not empty on entry, or
 * if the table, module or notify lists are not empty afterwards.
 */
static void __net_exit nf_tables_exit_net(struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        unsigned int gc_seq;

        mutex_lock(&nft_net->commit_mutex);

        gc_seq = nft_gc_seq_begin(nft_net);

        /* No transaction is expected to be pending at this point. */
        WARN_ON_ONCE(!list_empty(&nft_net->commit_list));

        if (!list_empty(&nft_net->module_list))
                nf_tables_module_autoload_cleanup(net);

        __nft_release_tables(net);

        nft_gc_seq_end(nft_net, gc_seq);

        mutex_unlock(&nft_net->commit_mutex);
        /* Everything should be gone now; complain about leftovers. */
        WARN_ON_ONCE(!list_empty(&nft_net->tables));
        WARN_ON_ONCE(!list_empty(&nft_net->module_list));
        WARN_ON_ONCE(!list_empty(&nft_net->notify_list));
}

/*
 * Batched netns exit callback: wait for trans_gc_work to finish.
 * @net_exit_list is not used.
 */
static void nf_tables_exit_batch(struct list_head *net_exit_list)
{
        flush_work(&trans_gc_work);
}

/*
 * Per-network-namespace operations for nf_tables: init, pre-exit, exit and
 * batched-exit callbacks, plus the id and size of the per-net private area
 * (struct nftables_pernet).
 */
static struct pernet_operations nf_tables_net_ops = {
        .init                = nf_tables_init_net,
        .pre_exit        = nf_tables_pre_exit_net,
        .exit                = nf_tables_exit_net,
        .exit_batch        = nf_tables_exit_batch,
        .id                = &nf_tables_net_id,
        .size                = sizeof(struct nftables_pernet),
};

/*
 * Module init: bring up the nf_tables subsystems one after another.
 *
 * Each step that fails jumps to the label that undoes all previously
 * completed steps in reverse order and returns the error. The nfnetlink
 * subsystem is registered last; nft_chain_route_init() is called once
 * everything else succeeded and its result is not checked here.
 *
 * Returns 0 on success or the negative error of the failing step.
 */
static int __init nf_tables_module_init(void)
{
        int err;

        err = register_pernet_subsys(&nf_tables_net_ops);
        if (err < 0)
                return err;

        err = nft_chain_filter_init();
        if (err < 0)
                goto err_chain_filter;

        err = nf_tables_core_module_init();
        if (err < 0)
                goto err_core_module;

        err = register_netdevice_notifier(&nf_tables_flowtable_notifier);
        if (err < 0)
                goto err_netdev_notifier;

        err = rhltable_init(&nft_objname_ht, &nft_objname_ht_params);
        if (err < 0)
                goto err_rht_objname;

        err = nft_offload_init();
        if (err < 0)
                goto err_offload;

        err = netlink_register_notifier(&nft_nl_notifier);
        if (err < 0)
                goto err_netlink_notifier;

        /* must be last */
        err = nfnetlink_subsys_register(&nf_tables_subsys);
        if (err < 0)
                goto err_nfnl_subsys;

        nft_chain_route_init();

        return err;

        /* Error unwinding: each label undoes the steps completed before it. */
err_nfnl_subsys:
        netlink_unregister_notifier(&nft_nl_notifier);
err_netlink_notifier:
        nft_offload_exit();
err_offload:
        rhltable_destroy(&nft_objname_ht);
err_rht_objname:
        unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
err_netdev_notifier:
        nf_tables_core_module_exit();
err_core_module:
        nft_chain_filter_fini();
err_chain_filter:
        unregister_pernet_subsys(&nf_tables_net_ops);
        return err;
}

/*
 * Module exit: unregister the nfnetlink subsystem and the notifiers, shut
 * down offload and the chain types, flush pending destroy work, unregister
 * the per-net operations, cancel the GC and destroy work items, wait for
 * outstanding RCU callbacks, and finally destroy the object name hash table
 * and the core module.
 */
static void __exit nf_tables_module_exit(void)
{
        nfnetlink_subsys_unregister(&nf_tables_subsys);
        netlink_unregister_notifier(&nft_nl_notifier);
        nft_offload_exit();
        unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
        nft_chain_filter_fini();
        nft_chain_route_fini();
        nf_tables_trans_destroy_flush_work();
        unregister_pernet_subsys(&nf_tables_net_ops);
        cancel_work_sync(&trans_gc_work);
        cancel_work_sync(&trans_destroy_work);
        /* Wait for pending RCU callbacks before the final teardown below. */
        rcu_barrier();
        rhltable_destroy(&nft_objname_ht);
        nf_tables_core_module_exit();
}

module_init(nf_tables_module_init);
module_exit(nf_tables_module_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_DESCRIPTION("Framework for packet filtering and classification");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES);


















































































































    1 
    2 
















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* fs/ internal definitions
 *
 * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

struct super_block;
struct file_system_type;
struct iomap;
struct iomap_ops;
struct linux_binprm;
struct path;
struct mount;
struct shrink_control;
struct fs_context;
struct pipe_inode_info;
struct iov_iter;
struct mnt_idmap;

/*
 * block/bdev.c
 */
#ifdef CONFIG_BLOCK
extern void __init bdev_cache_init(void);
#else
/* No-op stub used when CONFIG_BLOCK is not enabled. */
static inline void bdev_cache_init(void)
{
}
#endif /* CONFIG_BLOCK */

/*
 * buffer.c
 */
int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
                get_block_t *get_block, const struct iomap *iomap);

/*
 * char_dev.c
 */
extern void __init chrdev_init(void);

/*
 * fs_context.c
 */
extern const struct fs_context_operations legacy_fs_context_ops;
extern int parse_monolithic_mount_data(struct fs_context *, void *);
extern void vfs_clean_context(struct fs_context *fc);
extern int finish_clean_context(struct fs_context *fc);

/*
 * namei.c
 */
extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
                           struct path *path, struct path *root);
int do_rmdir(int dfd, struct filename *name);
int do_unlinkat(int dfd, struct filename *name);
int may_linkat(struct mnt_idmap *idmap, const struct path *link);
int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
                 struct filename *newname, unsigned int flags);
int do_mkdirat(int dfd, struct filename *name, umode_t mode);
int do_symlinkat(struct filename *from, int newdfd, struct filename *to);
int do_linkat(int olddfd, struct filename *old, int newdfd,
                        struct filename *new, int flags);
int vfs_tmpfile(struct mnt_idmap *idmap,
                const struct path *parentpath,
                struct file *file, umode_t mode);

/*
 * namespace.c
 */
extern struct vfsmount *lookup_mnt(const struct path *);
extern int finish_automount(struct vfsmount *, const struct path *);

extern int sb_prepare_remount_readonly(struct super_block *);

extern void __init mnt_init(void);

int mnt_get_write_access_file(struct file *file);
void mnt_put_write_access_file(struct file *file);

extern void dissolve_on_fput(struct vfsmount *);
extern bool may_mount(void);

int path_mount(const char *dev_name, struct path *path,
                const char *type_page, unsigned long flags, void *data_page);
int path_umount(struct path *path, int flags);

int show_path(struct seq_file *m, struct dentry *root);

/*
 * fs_struct.c
 */
extern void chroot_fs_refs(const struct path *, const struct path *);

/*
 * file_table.c
 */
struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
struct file *alloc_empty_backing_file(int flags, const struct cred *cred);

/*
 * Drop the write access held through @file: on its inode, on the mount of
 * its path and, for files with FMODE_BACKING set, also on the mount of the
 * backing file's user path.
 */
static inline void file_put_write_access(struct file *file)
{
        put_write_access(file->f_inode);
        mnt_put_write_access(file->f_path.mnt);

        if (!unlikely(file->f_mode & FMODE_BACKING))
                return;

        mnt_put_write_access(backing_file_user_path(file)->mnt);
}

/*
 * Release the access accounted for @file: a file opened for reading only
 * decrements the inode's read count; otherwise, if FMODE_WRITER is set, its
 * write access is dropped via file_put_write_access().
 */
static inline void put_file_access(struct file *file)
{
        if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
                i_readcount_dec(file->f_inode);
                return;
        }

        if (file->f_mode & FMODE_WRITER)
                file_put_write_access(file);
}

/*
 * super.c
 */
extern int reconfigure_super(struct fs_context *);
extern bool super_trylock_shared(struct super_block *sb);
struct super_block *user_get_super(dev_t, bool excl);
void put_super(struct super_block *sb);
extern bool mount_capable(struct fs_context *);
int sb_init_dio_done_wq(struct super_block *sb);

/*
 * Prepare superblock for changing its read-only state (i.e., either remount
 * read-write superblock read-only or vice versa). After this function returns
 * mnt_is_readonly() will return true for any mount of the superblock if its
 * caller is able to observe any changes done by the remount. This holds until
 * sb_end_ro_state_change() is called.
 */
static inline void sb_start_ro_state_change(struct super_block *sb)
{
        /* Mark the state change as in progress; the barrier below orders it. */
        WRITE_ONCE(sb->s_readonly_remount, 1);
        /*
         * For RO->RW transition, the barrier pairs with the barrier in
         * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY
         * cleared, it will see s_readonly_remount set.
         * For RW->RO transition, the barrier pairs with the barrier in
         * mnt_get_write_access() before the mnt_is_readonly() check.
         * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD
         * already cleared, it will see s_readonly_remount set.
         */
        smp_wmb();
}

/*
 * Ends the section changing the read-only state of the superblock. After this
 * function returns, if mnt_is_readonly() returns false, the caller will be
 * able to observe all the changes the remount made to the superblock.
 */
static inline void sb_end_ro_state_change(struct super_block *sb)
{
        /*
         * This barrier provides release semantics that pair with
         * the smp_rmb() acquire semantics in mnt_is_readonly().
         * This barrier pair ensures that when mnt_is_readonly() sees
         * 0 for sb->s_readonly_remount, it will also see all the
         * preceding flag changes that were made during the RO state
         * change.
         */
        smp_wmb();
        WRITE_ONCE(sb->s_readonly_remount, 0);
}

/*
 * open.c
 */
/*
 * Bundle of open parameters handed to do_filp_open() and
 * do_file_open_root(); build_open_flags() takes a non-const pointer to it,
 * presumably to fill it in from a struct open_how.
 */
struct open_flags {
        int open_flag;
        umode_t mode;
        int acc_mode;
        int intent;
        int lookup_flags;
};
extern struct file *do_filp_open(int dfd, struct filename *pathname,
                const struct open_flags *op);
extern struct file *do_file_open_root(const struct path *,
                const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);

long do_ftruncate(struct file *file, loff_t length, int small);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
                int flag);
int chown_common(const struct path *path, uid_t user, gid_t group);
extern int vfs_open(const struct path *, struct file *);

/*
 * inode.c
 */
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry);
bool in_group_or_capable(struct mnt_idmap *idmap,
                         const struct inode *inode, vfsgid_t vfsgid);

/*
 * fs-writeback.c
 */
extern long get_nr_dirty_inodes(void);
void invalidate_inodes(struct super_block *sb);

/*
 * dcache.c
 */
extern int d_set_mounted(struct dentry *dentry);
extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
extern struct dentry *d_alloc_cursor(struct dentry *);
extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
extern char *simple_dname(struct dentry *, char *, int);
extern void dput_to_list(struct dentry *, struct list_head *);
extern void shrink_dentry_list(struct list_head *);
extern void shrink_dcache_for_umount(struct super_block *);
extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
                                const struct qstr *name, unsigned *seq);
extern void d_genocide(struct dentry *);

/*
 * pipe.c
 */
extern const struct file_operations pipefifo_fops;

/*
 * fs_pin.c
 */
extern void group_pin_kill(struct hlist_head *p);
extern void mnt_pin_kill(struct mount *m);

/*
 * fs/nsfs.c
 */
extern const struct dentry_operations ns_dentry_operations;

/*
 * fs/stat.c:
 */

int getname_statx_lookup_flags(int flags);
int do_statx(int dfd, struct filename *filename, unsigned int flags,
             unsigned int mask, struct statx __user *buffer);

/*
 * fs/splice.c:
 */
ssize_t splice_file_to_pipe(struct file *in,
                            struct pipe_inode_info *opipe,
                            loff_t *offset,
                            size_t len, unsigned int flags);

/*
 * fs/xattr.c:
 */
/*
 * Fixed-size buffer for an extended attribute name: XATTR_NAME_MAX bytes
 * plus one extra byte (presumably for the terminating NUL).
 */
struct xattr_name {
        char name[XATTR_NAME_MAX + 1];
};

/*
 * Context describing one extended attribute operation; passed to
 * do_getxattr(), setxattr_copy() and do_setxattr().
 */
struct xattr_ctx {
        /* Value of attribute */
        /* User-space pointer; const and non-const views share storage. */
        union {
                const void __user *cvalue;
                void __user *value;
        };
        /* NOTE(review): presumably the kernel-side copy of the value - confirm */
        void *kvalue;
        size_t size;
        /* Attribute name */
        struct xattr_name *kname;
        unsigned int flags;
};


ssize_t do_getxattr(struct mnt_idmap *idmap,
                    struct dentry *d,
                    struct xattr_ctx *ctx);

int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
                struct xattr_ctx *ctx);
int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode);

#ifdef CONFIG_FS_POSIX_ACL
int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
               const char *acl_name, const void *kvalue, size_t size);
ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                   const char *acl_name, void *kvalue, size_t size);
#else
/* Stub used when CONFIG_FS_POSIX_ACL is not set: always fails with -EOPNOTSUPP. */
static inline int do_set_acl(struct mnt_idmap *idmap,
                             struct dentry *dentry, const char *acl_name,
                             const void *kvalue, size_t size)
{
        return -EOPNOTSUPP;
}
/* Stub used when CONFIG_FS_POSIX_ACL is not set: always fails with -EOPNOTSUPP. */
static inline ssize_t do_get_acl(struct mnt_idmap *idmap,
                                 struct dentry *dentry, const char *acl_name,
                                 void *kvalue, size_t size)
{
        return -EOPNOTSUPP;
}
#endif

ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos);

/*
 * fs/attr.c
 */
struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
void mnt_idmap_put(struct mnt_idmap *idmap);
/*
 * Callback table with two operations on an opaque data pointer:
 * put_data() takes the data alone, init_inode() takes an inode and the data
 * and returns an int status.
 */
struct stashed_operations {
        void (*put_data)(void *data);
        int (*init_inode)(struct inode *inode, void *data);
};
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
                      struct path *path);
void stashed_dentry_prune(struct dentry *dentry);














































































    3 



    3 




















    1 



























    3 













    3 


































































    3 










    3 













    3 


























    3 


    3 











    3 



























    3 




    3 




































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
 *
 *  1997-11-28  Modified for POSIX.1b signals by Richard Henderson
 *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
 *  2000-2002   x86-64 support by Andi Kleen
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/kernel.h>
#include <linux/kstrtox.h>
#include <linux/errno.h>
#include <linux/wait.h>
#include <linux/unistd.h>
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/uaccess.h>
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>
#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/syscalls.h>
#include <linux/rseq.h>

#include <asm/processor.h>
#include <asm/ucontext.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/xstate.h>
#include <asm/vdso.h>
#include <asm/mce.h>
#include <asm/sighandling.h>
#include <asm/vm86.h>

#include <asm/syscall.h>
#include <asm/sigframe.h>
#include <asm/signal.h>
#include <asm/shstk.h>

/*
 * Returns 1 when CONFIG_IA32_EMULATION is enabled and the SA_IA32_ABI bit
 * is set in the sa_flags of @ksig's action, 0 otherwise.
 */
static inline int is_ia32_compat_frame(struct ksignal *ksig)
{
        if (!IS_ENABLED(CONFIG_IA32_EMULATION))
                return 0;

        return (ksig->ka.sa.sa_flags & SA_IA32_ABI) != 0;
}

/*
 * Returns 1 when CONFIG_X86_32 is enabled, or when is_ia32_compat_frame()
 * accepts @ksig; 0 otherwise.
 */
static inline int is_ia32_frame(struct ksignal *ksig)
{
        if (IS_ENABLED(CONFIG_X86_32))
                return 1;

        return is_ia32_compat_frame(ksig) != 0;
}

/*
 * Returns 1 when CONFIG_X86_X32_ABI is enabled and the SA_X32_ABI bit is
 * set in the sa_flags of @ksig's action, 0 otherwise.
 */
static inline int is_x32_frame(struct ksignal *ksig)
{
        if (!IS_ENABLED(CONFIG_X86_X32_ABI))
                return 0;

        return (ksig->ka.sa.sa_flags & SA_X32_ABI) != 0;
}

/*
 * Set up a signal frame.
 */

/* x86 ABI requires 16-byte alignment */
#define FRAME_ALIGNMENT        16UL

/* Largest number of bytes that aligning to FRAME_ALIGNMENT can consume. */
#define MAX_FRAME_PADDING        (FRAME_ALIGNMENT - 1)

/*
 * Determine which stack to use and reserve room for a signal frame on it.
 *
 * @ksig:       signal being delivered; its sa_flags drive stack switching
 * @regs:       user register state; regs->sp is the starting stack pointer
 * @frame_size: number of bytes to reserve below the FP state area
 * @fpstate:    out parameter, set to the address returned by
 *              fpu__alloc_mathframe(); it is written before any of the
 *              failure checks below
 *
 * Returns the aligned user address for the frame, or (void __user *)-1L
 * when the frame would leave the alternate signal stack or when
 * copy_fpstate_to_sigframe() reports failure.
 */
void __user *
get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
             void __user **fpstate)
{
        struct k_sigaction *ka = &ksig->ka;
        int ia32_frame = is_ia32_frame(ksig);
        /* Default to using normal stack */
        bool nested_altstack = on_sig_stack(regs->sp);
        bool entering_altstack = false;
        unsigned long math_size = 0;
        unsigned long sp = regs->sp;
        unsigned long buf_fx = 0;

        /* redzone: non-ia32 frames start 128 bytes below the current sp */
        if (!ia32_frame)
                sp -= 128;

        /* This is the X/Open sanctioned signal stack switching.  */
        if (ka->sa.sa_flags & SA_ONSTACK) {
                /*
                 * This checks nested_altstack via sas_ss_flags(). Sensible
                 * programs use SS_AUTODISARM, which disables that check, and
                 * programs that don't use SS_AUTODISARM get compatible.
                 */
                if (sas_ss_flags(sp) == 0) {
                        /* Start at the top (highest address) of the altstack. */
                        sp = current->sas_ss_sp + current->sas_ss_size;
                        entering_altstack = true;
                }
        } else if (ia32_frame &&
                   !nested_altstack &&
                   regs->ss != __USER_DS &&
                   !(ka->sa.sa_flags & SA_RESTORER) &&
                   ka->sa.sa_restorer) {
                /* This is the legacy signal stack switching. */
                sp = (unsigned long) ka->sa.sa_restorer;
                entering_altstack = true;
        }

        /* Reserve the FP state area first; it sits above the frame itself. */
        sp = fpu__alloc_mathframe(sp, ia32_frame, &buf_fx, &math_size);
        *fpstate = (void __user *)sp;

        sp -= frame_size;

        if (ia32_frame)
                /*
                 * Align the stack pointer according to the i386 ABI,
                 * i.e. so that on function entry ((sp + 4) & 15) == 0.
                 */
                sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
        else
                /* Otherwise place sp 8 bytes below a FRAME_ALIGNMENT boundary. */
                sp = round_down(sp, FRAME_ALIGNMENT) - 8;

        /*
         * If we are on the alternate signal stack and would overflow it, don't.
         * Return an always-bogus address instead so we will die with SIGSEGV.
         */
        if (unlikely((nested_altstack || entering_altstack) &&
                     !__on_sig_stack(sp))) {

                /* Rate-limited diagnostic, only when unhandled signals are shown. */
                if (show_unhandled_signals && printk_ratelimit())
                        pr_info("%s[%d] overflowed sigaltstack\n",
                                current->comm, task_pid_nr(current));

                return (void __user *)-1L;
        }

        /* save i387 and extended state */
        if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size))
                return (void __user *)-1L;

        return (void __user *)sp;
}

/*
 * There are four different struct types for signal frame: sigframe_ia32,
 * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case
 * -- the largest size. It means the size for 64-bit apps is a bit more
 * than needed, but this keeps the code simple.
 *
 * NOTE(review): sigframe_ia32 is chosen whenever 32-bit frames are possible,
 * presumably because it is the largest of the four -- confirm against the
 * struct definitions.
 */
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
# define MAX_FRAME_SIGINFO_UCTXT_SIZE        sizeof(struct sigframe_ia32)
#else
# define MAX_FRAME_SIGINFO_UCTXT_SIZE        sizeof(struct rt_sigframe)
#endif

/*
 * The FP state frame contains an XSAVE buffer which must be 64-byte aligned.
 * If a signal frame starts at an unaligned address, extra space is required.
 * This is the max alignment padding, conservatively (one byte less than the
 * 64-byte alignment).
 */
#define MAX_XSAVE_PADDING        63UL

/*
 * The frame data is composed of the following areas and laid out as:
 *
 * -------------------------
 * | alignment padding     |
 * -------------------------
 * | (f)xsave frame        |
 * -------------------------
 * | fsave header          |
 * -------------------------
 * | alignment padding     |
 * -------------------------
 * | siginfo + ucontext    |
 * -------------------------
 */

/*
 * max_frame_size tells userspace the worst case signal stack size.
 * Computed once by init_sigframe_size().
 */
static unsigned long __ro_after_init max_frame_size;
/* FP state size as reported by fpu__get_fpstate_size() in init_sigframe_size(). */
static unsigned int __ro_after_init fpu_default_state_size;

/*
 * Early initcall: record the default FP state size, derive the worst-case
 * signal frame size from it and log the result. Always returns 0.
 */
static int __init init_sigframe_size(void)
{
        unsigned long worst_case;

        fpu_default_state_size = fpu__get_fpstate_size();

        /* siginfo + ucontext area, then the FP state area, each with max padding */
        worst_case  = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING;
        worst_case += fpu_default_state_size + MAX_XSAVE_PADDING;

        /* Userspace expects an aligned size. */
        max_frame_size = round_up(worst_case, FRAME_ALIGNMENT);

        pr_info("max sigframe size: %lu\n", max_frame_size);
        return 0;
}
early_initcall(init_sigframe_size);

/*
 * Return the worst-case signal frame size (max_frame_size) that was
 * computed by init_sigframe_size().
 */
unsigned long get_sigframe_size(void)
{
        return max_frame_size;
}

/*
 * Build the user-space signal frame for @ksig by dispatching to the frame
 * writer that matches the handler's ABI. Returns whatever the selected
 * *_setup_*frame() helper returns.
 */
static int
setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
{
        /* Perform fixup for the pre-signal frame. */
        rseq_signal_deliver(ksig, regs);

        /* Non-ia32 handlers: x32 if flagged as such, native 64-bit otherwise. */
        if (!is_ia32_frame(ksig)) {
                if (is_x32_frame(ksig))
                        return x32_setup_rt_frame(ksig, regs);

                return x64_setup_rt_frame(ksig, regs);
        }

        /* ia32 handlers get the rt frame only when SA_SIGINFO was requested. */
        if (ksig->ka.sa.sa_flags & SA_SIGINFO)
                return ia32_setup_rt_frame(ksig, regs);

        return ia32_setup_frame(ksig, regs);
}

/*
 * Deliver one signal to the current task: adjust the result of an
 * interrupted system call, build the signal frame via setup_rt_frame()
 * and report the outcome through signal_setup_done().
 *
 * @ksig: the signal and handler to deliver
 * @regs: user register state, modified in place
 */
static void
handle_signal(struct ksignal *ksig, struct pt_regs *regs)
{
        bool stepping, failed;
        struct fpu *fpu = &current->thread.fpu;

        if (v8086_mode(regs))
                save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);

        /* Are we from a system call? */
        if (syscall_get_nr(current, regs) != -1) {
                /* If so, check system call restarting.. */
                switch (syscall_get_error(current, regs)) {
                case -ERESTART_RESTARTBLOCK:
                case -ERESTARTNOHAND:
                        /* With a handler to run, these are reported as -EINTR. */
                        regs->ax = -EINTR;
                        break;

                case -ERESTARTSYS:
                        /* Restart only if the handler has SA_RESTART set. */
                        if (!(ksig->ka.sa.sa_flags & SA_RESTART)) {
                                regs->ax = -EINTR;
                                break;
                        }
                        fallthrough;
                case -ERESTARTNOINTR:
                        /* Reload ax from orig_ax and move ip back by 2 bytes. */
                        regs->ax = regs->orig_ax;
                        regs->ip -= 2;
                        break;
                }
        }

        /*
         * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now
         * so that register information in the sigcontext is correct and
         * then notify the tracer before entering the signal handler.
         */
        stepping = test_thread_flag(TIF_SINGLESTEP);
        if (stepping)
                user_disable_single_step(current);

        /* A negative return from setup_rt_frame() counts as failure. */
        failed = (setup_rt_frame(ksig, regs) < 0);
        if (!failed) {
                /*
                 * Clear the direction flag as per the ABI for function entry.
                 *
                 * Clear RF when entering the signal handler, because
                 * it might disable possible debug exception from the
                 * signal handler.
                 *
                 * Clear TF for the case when it wasn't set by debugger to
                 * avoid the recursive send_sigtrap() in SIGTRAP handler.
                 */
                regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF);
                /*
                 * Ensure the signal handler starts with the new fpu state.
                 */
                fpu__clear_user_states(fpu);
        }
        /* Called on both success and failure; 'stepping' is passed through. */
        signal_setup_done(failed, ksig, stepping);
}

/*
 * Pick the restart_syscall number to load into ax for @regs.
 *
 * With CONFIG_IA32_EMULATION, a restart block whose arch_data has TS_COMPAT
 * set yields __NR_ia32_restart_syscall. Otherwise the result is
 * __NR_restart_syscall, with the __X32_SYSCALL_BIT of regs->orig_ax carried
 * over when CONFIG_X86_X32_ABI is defined.
 */
static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
{
        unsigned long nr = __NR_restart_syscall;

#ifdef CONFIG_IA32_EMULATION
        if (current->restart_block.arch_data & TS_COMPAT)
                return __NR_ia32_restart_syscall;
#endif
#ifdef CONFIG_X86_X32_ABI
        nr |= regs->orig_ax & __X32_SYSCALL_BIT;
#endif
        return nr;
}

/*
 * Note that 'init' is a special process: it doesn't get signals it doesn't
 * want to handle. Thus you cannot kill init even with a SIGKILL even by
 * mistake.
 *
 * If get_signal() hands back a signal, it is delivered via handle_signal()
 * and nothing else is done. Otherwise an interrupted system call is set up
 * for restart where applicable and the saved signal mask is restored.
 */
void arch_do_signal_or_restart(struct pt_regs *regs)
{
        struct ksignal ksig;

        if (get_signal(&ksig)) {
                /* Whee! Actually deliver the signal.  */
                handle_signal(&ksig, regs);
                return;
        }

        /* Did we come from a system call? */
        if (syscall_get_nr(current, regs) != -1) {
                /* Restart the system call - no handlers present */
                switch (syscall_get_error(current, regs)) {
                case -ERESTARTNOHAND:
                case -ERESTARTSYS:
                case -ERESTARTNOINTR:
                        /* Reload ax from orig_ax and move ip back by 2 bytes. */
                        regs->ax = regs->orig_ax;
                        regs->ip -= 2;
                        break;

                case -ERESTART_RESTARTBLOCK:
                        /* Same ip adjustment, but ax gets the restart_syscall number. */
                        regs->ax = get_nr_restart_syscall(regs);
                        regs->ip -= 2;
                        break;
                }
        }

        /*
         * If there's no signal to deliver, we just put the saved sigmask
         * back.
         */
        restore_saved_sigmask();
}

/*
 * Report a bad signal frame and force SIGSEGV on the current task.
 *
 * @regs:  register state whose ip, sp and orig_ax are logged
 * @frame: user address of the offending frame (logged only)
 * @where: short description of the frame, inserted into the log message
 *
 * The message is printed only when show_unhandled_signals is set and the
 * printk rate limit allows it. It uses KERN_INFO when task_pid_nr(current)
 * is greater than 1 and KERN_EMERG otherwise. SIGSEGV is forced regardless
 * of whether anything was logged.
 */
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
{
        struct task_struct *tsk = current;

        if (show_unhandled_signals && printk_ratelimit()) {
                const char *level;

                if (task_pid_nr(current) > 1)
                        level = KERN_INFO;
                else
                        level = KERN_EMERG;

                printk("%s%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
                       level, tsk->comm, tsk->pid, where, frame,
                       regs->ip, regs->sp, regs->orig_ax);
                print_vma_addr(KERN_CONT " in ", regs->ip);
                pr_cont("\n");
        }

        force_sig(SIGSEGV);
}

#ifdef CONFIG_DYNAMIC_SIGFRAME
/*
 * Whether sigaltstack sizes are strictly checked. The build-time default
 * follows CONFIG_STRICT_SIGALTSTACK_SIZE; the "strict_sas_size" boot
 * parameter handled by strict_sas_size() can overwrite it.
 */
#ifdef CONFIG_STRICT_SIGALTSTACK_SIZE
static bool strict_sigaltstack_size __ro_after_init = true;
#else
static bool strict_sigaltstack_size __ro_after_init = false;
#endif

/*
 * Boot parameter handler for "strict_sas_size": parse @arg as a boolean
 * into strict_sigaltstack_size. Returns 1 when kstrtobool() succeeds
 * (returns 0) and 0 when it fails.
 */
static int __init strict_sas_size(char *arg)
{
        int err = kstrtobool(arg, &strict_sigaltstack_size);

        return err ? 0 : 1;
}
__setup("strict_sas_size", strict_sas_size);

/*
 * Check whether @ss_size is acceptable as a sigaltstack size for the
 * current task. The caller must hold current->sighand->siglock.
 *
 * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512
 * exceeds that size already. Programs which never actually use their
 * sigaltstack therefore just continued to work. While always checking
 * against the real size would be correct, this might be considered a
 * regression.
 *
 * Therefore the sanity check is avoided, unless enforced by kernel
 * configuration or command line option.
 *
 * When dynamic FPU features are supported, the check is also enforced when
 * the task has permissions to use dynamic features. Tasks which have no
 * permission are checked against the size of the non-dynamic feature set
 * if strict checking is enabled. This avoids forcing all tasks on the
 * system to allocate large sigaltstacks even if they are never going
 * to use a dynamic feature. As this is serialized via sighand::siglock,
 * any permission request for a dynamic feature either happened already
 * or will see the newly installed sigaltstack size in the permission checks.
 *
 * Returns true when @ss_size is accepted, false when it is rejected as
 * too small.
 */
bool sigaltstack_size_valid(size_t ss_size)
{
        unsigned long needed;

        lockdep_assert_held(&current->sighand->siglock);

        /* Nothing is enforced without dynamic state sizes or strict mode. */
        if (!(fpu_state_size_dynamic() || strict_sigaltstack_size))
                return true;

        /*
         * Worst-case frame size with the default FP state size swapped for
         * the group leader's permitted user state size.
         */
        needed = max_frame_size - fpu_default_state_size;
        needed += current->group_leader->thread.fpu.perm.__user_state_size;
        if (likely(ss_size > needed))
                return true;

        /* Too small from here on: strict mode always rejects ... */
        if (strict_sigaltstack_size)
                return false;

        /* ... and so does a group leader permitted to use a dynamic feature. */
        return !(current->group_leader->thread.fpu.perm.__state_perm &
                 XFEATURE_MASK_USER_DYNAMIC);
}
#endif /* CONFIG_DYNAMIC_SIGFRAME */












































    2 















































    1 






















    6 
    3 














    1 







    2 

    2 







































    2 

    9 
    8 
    2 
    2 
    2 



    1 
    1 










    1 
    1 





























    1 
















    1 




























    2 
    7 
































    3 
    3 

    1 
    1 

    1 

    1 
    1 
    7 

    5 

    1 
    1 
    1 
    2 
    1 
    1 


    2 

    6 

    2 

    1 
    2 

















    1 

    1 






























    1 


    1 

























    1 

    1 



    1 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * Linux Security Module Hook declarations.
 *
 * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com>
 * Copyright (C) 2001 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com>
 * Copyright (C) 2001 James Morris <jmorris@intercode.com.au>
 * Copyright (C) 2001 Silicon Graphics, Inc. (Trust Technology Group)
 * Copyright (C) 2015 Intel Corporation.
 * Copyright (C) 2015 Casey Schaufler <casey@schaufler-ca.com>
 * Copyright (C) 2016 Mellanox Techonologies
 * Copyright (C) 2020 Google LLC.
 */

/*
 * The macro LSM_HOOK is used to define the data structures required by
 * the LSM framework using the pattern:
 *
 *        LSM_HOOK(<return_type>, <default_value>, <hook_name>, args...)
 *
 * struct security_hook_heads {
 *   #define LSM_HOOK(RET, DEFAULT, NAME, ...) struct hlist_head NAME;
 *   #include <linux/lsm_hook_defs.h>
 *   #undef LSM_HOOK
 * };
 */
LSM_HOOK(int, 0, binder_set_context_mgr, const struct cred *mgr)
LSM_HOOK(int, 0, binder_transaction, const struct cred *from,
         const struct cred *to)
LSM_HOOK(int, 0, binder_transfer_binder, const struct cred *from,
         const struct cred *to)
LSM_HOOK(int, 0, binder_transfer_file, const struct cred *from,
         const struct cred *to, const struct file *file)
LSM_HOOK(int, 0, ptrace_access_check, struct task_struct *child,
         unsigned int mode)
LSM_HOOK(int, 0, ptrace_traceme, struct task_struct *parent)
LSM_HOOK(int, 0, capget, const struct task_struct *target, kernel_cap_t *effective,
         kernel_cap_t *inheritable, kernel_cap_t *permitted)
LSM_HOOK(int, 0, capset, struct cred *new, const struct cred *old,
         const kernel_cap_t *effective, const kernel_cap_t *inheritable,
         const kernel_cap_t *permitted)
LSM_HOOK(int, 0, capable, const struct cred *cred, struct user_namespace *ns,
         int cap, unsigned int opts)
LSM_HOOK(int, 0, quotactl, int cmds, int type, int id, const struct super_block *sb)
LSM_HOOK(int, 0, quota_on, struct dentry *dentry)
LSM_HOOK(int, 0, syslog, int type)
LSM_HOOK(int, 0, settime, const struct timespec64 *ts,
         const struct timezone *tz)
LSM_HOOK(int, 1, vm_enough_memory, struct mm_struct *mm, long pages)
LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm)
LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, const struct file *file)
LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm)
LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, const struct linux_binprm *bprm)
LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, const struct linux_binprm *bprm)
LSM_HOOK(int, 0, fs_context_submount, struct fs_context *fc, struct super_block *reference)
LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc,
         struct fs_context *src_sc)
LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc,
         struct fs_parameter *param)
LSM_HOOK(int, 0, sb_alloc_security, struct super_block *sb)
LSM_HOOK(void, LSM_RET_VOID, sb_delete, struct super_block *sb)
LSM_HOOK(void, LSM_RET_VOID, sb_free_security, struct super_block *sb)
LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts)
LSM_HOOK(int, 0, sb_eat_lsm_opts, char *orig, void **mnt_opts)
LSM_HOOK(int, 0, sb_mnt_opts_compat, struct super_block *sb, void *mnt_opts)
LSM_HOOK(int, 0, sb_remount, struct super_block *sb, void *mnt_opts)
LSM_HOOK(int, 0, sb_kern_mount, const struct super_block *sb)
LSM_HOOK(int, 0, sb_show_options, struct seq_file *m, struct super_block *sb)
LSM_HOOK(int, 0, sb_statfs, struct dentry *dentry)
LSM_HOOK(int, 0, sb_mount, const char *dev_name, const struct path *path,
         const char *type, unsigned long flags, void *data)
LSM_HOOK(int, 0, sb_umount, struct vfsmount *mnt, int flags)
LSM_HOOK(int, 0, sb_pivotroot, const struct path *old_path,
         const struct path *new_path)
LSM_HOOK(int, 0, sb_set_mnt_opts, struct super_block *sb, void *mnt_opts,
         unsigned long kern_flags, unsigned long *set_kern_flags)
LSM_HOOK(int, 0, sb_clone_mnt_opts, const struct super_block *oldsb,
         struct super_block *newsb, unsigned long kern_flags,
         unsigned long *set_kern_flags)
LSM_HOOK(int, 0, move_mount, const struct path *from_path,
         const struct path *to_path)
LSM_HOOK(int, -EOPNOTSUPP, dentry_init_security, struct dentry *dentry,
         int mode, const struct qstr *name, const char **xattr_name,
         void **ctx, u32 *ctxlen)
LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode,
         struct qstr *name, const struct cred *old, struct cred *new)

/*
 * Path-based hooks: these entries exist only when CONFIG_SECURITY_PATH is
 * defined. All but path_post_mknod return int with a default value of 0.
 */
#ifdef CONFIG_SECURITY_PATH
LSM_HOOK(int, 0, path_unlink, const struct path *dir, struct dentry *dentry)
LSM_HOOK(int, 0, path_mkdir, const struct path *dir, struct dentry *dentry,
         umode_t mode)
LSM_HOOK(int, 0, path_rmdir, const struct path *dir, struct dentry *dentry)
LSM_HOOK(int, 0, path_mknod, const struct path *dir, struct dentry *dentry,
         umode_t mode, unsigned int dev)
LSM_HOOK(void, LSM_RET_VOID, path_post_mknod, struct mnt_idmap *idmap,
         struct dentry *dentry)
LSM_HOOK(int, 0, path_truncate, const struct path *path)
LSM_HOOK(int, 0, path_symlink, const struct path *dir, struct dentry *dentry,
         const char *old_name)
LSM_HOOK(int, 0, path_link, struct dentry *old_dentry,
         const struct path *new_dir, struct dentry *new_dentry)
LSM_HOOK(int, 0, path_rename, const struct path *old_dir,
         struct dentry *old_dentry, const struct path *new_dir,
         struct dentry *new_dentry, unsigned int flags)
LSM_HOOK(int, 0, path_chmod, const struct path *path, umode_t mode)
LSM_HOOK(int, 0, path_chown, const struct path *path, kuid_t uid, kgid_t gid)
LSM_HOOK(int, 0, path_chroot, const struct path *path)
#endif /* CONFIG_SECURITY_PATH */

/* Needed for inode based security check */
LSM_HOOK(int, 0, path_notify, const struct path *path, u64 mask,
         unsigned int obj_type)
LSM_HOOK(int, 0, inode_alloc_security, struct inode *inode)
LSM_HOOK(void, LSM_RET_VOID, inode_free_security, struct inode *inode)
LSM_HOOK(int, -EOPNOTSUPP, inode_init_security, struct inode *inode,
         struct inode *dir, const struct qstr *qstr, struct xattr *xattrs,
         int *xattr_count)
LSM_HOOK(int, 0, inode_init_security_anon, struct inode *inode,
         const struct qstr *name, const struct inode *context_inode)
LSM_HOOK(int, 0, inode_create, struct inode *dir, struct dentry *dentry,
         umode_t mode)
LSM_HOOK(void, LSM_RET_VOID, inode_post_create_tmpfile, struct mnt_idmap *idmap,
         struct inode *inode)
LSM_HOOK(int, 0, inode_link, struct dentry *old_dentry, struct inode *dir,
         struct dentry *new_dentry)
LSM_HOOK(int, 0, inode_unlink, struct inode *dir, struct dentry *dentry)
LSM_HOOK(int, 0, inode_symlink, struct inode *dir, struct dentry *dentry,
         const char *old_name)
LSM_HOOK(int, 0, inode_mkdir, struct inode *dir, struct dentry *dentry,
         umode_t mode)
LSM_HOOK(int, 0, inode_rmdir, struct inode *dir, struct dentry *dentry)
LSM_HOOK(int, 0, inode_mknod, struct inode *dir, struct dentry *dentry,
         umode_t mode, dev_t dev)
LSM_HOOK(int, 0, inode_rename, struct inode *old_dir, struct dentry *old_dentry,
         struct inode *new_dir, struct dentry *new_dentry)
LSM_HOOK(int, 0, inode_readlink, struct dentry *dentry)
LSM_HOOK(int, 0, inode_follow_link, struct dentry *dentry, struct inode *inode,
         bool rcu)
LSM_HOOK(int, 0, inode_permission, struct inode *inode, int mask)
LSM_HOOK(int, 0, inode_setattr, struct mnt_idmap *idmap, struct dentry *dentry,
         struct iattr *attr)
LSM_HOOK(void, LSM_RET_VOID, inode_post_setattr, struct mnt_idmap *idmap,
         struct dentry *dentry, int ia_valid)
LSM_HOOK(int, 0, inode_getattr, const struct path *path)
LSM_HOOK(int, 0, inode_setxattr, struct mnt_idmap *idmap,
         struct dentry *dentry, const char *name, const void *value,
         size_t size, int flags)
LSM_HOOK(void, LSM_RET_VOID, inode_post_setxattr, struct dentry *dentry,
         const char *name, const void *value, size_t size, int flags)
LSM_HOOK(int, 0, inode_getxattr, struct dentry *dentry, const char *name)
LSM_HOOK(int, 0, inode_listxattr, struct dentry *dentry)
LSM_HOOK(int, 0, inode_removexattr, struct mnt_idmap *idmap,
         struct dentry *dentry, const char *name)
LSM_HOOK(void, LSM_RET_VOID, inode_post_removexattr, struct dentry *dentry,
         const char *name)
LSM_HOOK(int, 0, inode_set_acl, struct mnt_idmap *idmap,
         struct dentry *dentry, const char *acl_name, struct posix_acl *kacl)
LSM_HOOK(void, LSM_RET_VOID, inode_post_set_acl, struct dentry *dentry,
         const char *acl_name, struct posix_acl *kacl)
LSM_HOOK(int, 0, inode_get_acl, struct mnt_idmap *idmap,
         struct dentry *dentry, const char *acl_name)
LSM_HOOK(int, 0, inode_remove_acl, struct mnt_idmap *idmap,
         struct dentry *dentry, const char *acl_name)
LSM_HOOK(void, LSM_RET_VOID, inode_post_remove_acl, struct mnt_idmap *idmap,
         struct dentry *dentry, const char *acl_name)
LSM_HOOK(int, 0, inode_need_killpriv, struct dentry *dentry)
LSM_HOOK(int, 0, inode_killpriv, struct mnt_idmap *idmap,
         struct dentry *dentry)
LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct mnt_idmap *idmap,
         struct inode *inode, const char *name, void **buffer, bool alloc)
LSM_HOOK(int, -EOPNOTSUPP, inode_setsecurity, struct inode *inode,
         const char *name, const void *value, size_t size, int flags)
LSM_HOOK(int, 0, inode_listsecurity, struct inode *inode, char *buffer,
         size_t buffer_size)
LSM_HOOK(void, LSM_RET_VOID, inode_getsecid, struct inode *inode, u32 *secid)
LSM_HOOK(int, 0, inode_copy_up, struct dentry *src, struct cred **new)
LSM_HOOK(int, -EOPNOTSUPP, inode_copy_up_xattr, struct dentry *src,
         const char *name)
LSM_HOOK(int, 0, kernfs_init_security, struct kernfs_node *kn_dir,
         struct kernfs_node *kn)
LSM_HOOK(int, 0, file_permission, struct file *file, int mask)
LSM_HOOK(int, 0, file_alloc_security, struct file *file)
LSM_HOOK(void, LSM_RET_VOID, file_release, struct file *file)
LSM_HOOK(void, LSM_RET_VOID, file_free_security, struct file *file)
LSM_HOOK(int, 0, file_ioctl, struct file *file, unsigned int cmd,
         unsigned long arg)
LSM_HOOK(int, 0, file_ioctl_compat, struct file *file, unsigned int cmd,
         unsigned long arg)
LSM_HOOK(int, 0, mmap_addr, unsigned long addr)
LSM_HOOK(int, 0, mmap_file, struct file *file, unsigned long reqprot,
         unsigned long prot, unsigned long flags)
LSM_HOOK(int, 0, file_mprotect, struct vm_area_struct *vma,
         unsigned long reqprot, unsigned long prot)
LSM_HOOK(int, 0, file_lock, struct file *file, unsigned int cmd)
LSM_HOOK(int, 0, file_fcntl, struct file *file, unsigned int cmd,
         unsigned long arg)
LSM_HOOK(void, LSM_RET_VOID, file_set_fowner, struct file *file)
LSM_HOOK(int, 0, file_send_sigiotask, struct task_struct *tsk,
         struct fown_struct *fown, int sig)
LSM_HOOK(int, 0, file_receive, struct file *file)
LSM_HOOK(int, 0, file_open, struct file *file)
LSM_HOOK(int, 0, file_post_open, struct file *file, int mask)
LSM_HOOK(int, 0, file_truncate, struct file *file)
LSM_HOOK(int, 0, task_alloc, struct task_struct *task,
         unsigned long clone_flags)
LSM_HOOK(void, LSM_RET_VOID, task_free, struct task_struct *task)
LSM_HOOK(int, 0, cred_alloc_blank, struct cred *cred, gfp_t gfp)
LSM_HOOK(void, LSM_RET_VOID, cred_free, struct cred *cred)
LSM_HOOK(int, 0, cred_prepare, struct cred *new, const struct cred *old,
         gfp_t gfp)
LSM_HOOK(void, LSM_RET_VOID, cred_transfer, struct cred *new,
         const struct cred *old)
LSM_HOOK(void, LSM_RET_VOID, cred_getsecid, const struct cred *c, u32 *secid)
LSM_HOOK(int, 0, kernel_act_as, struct cred *new, u32 secid)
LSM_HOOK(int, 0, kernel_create_files_as, struct cred *new, struct inode *inode)
LSM_HOOK(int, 0, kernel_module_request, char *kmod_name)
LSM_HOOK(int, 0, kernel_load_data, enum kernel_load_data_id id, bool contents)
LSM_HOOK(int, 0, kernel_post_load_data, char *buf, loff_t size,
         enum kernel_load_data_id id, char *description)
LSM_HOOK(int, 0, kernel_read_file, struct file *file,
         enum kernel_read_file_id id, bool contents)
LSM_HOOK(int, 0, kernel_post_read_file, struct file *file, char *buf,
         loff_t size, enum kernel_read_file_id id)
LSM_HOOK(int, 0, task_fix_setuid, struct cred *new, const struct cred *old,
         int flags)
LSM_HOOK(int, 0, task_fix_setgid, struct cred *new, const struct cred * old,
         int flags)
LSM_HOOK(int, 0, task_fix_setgroups, struct cred *new, const struct cred * old)
LSM_HOOK(int, 0, task_setpgid, struct task_struct *p, pid_t pgid)
LSM_HOOK(int, 0, task_getpgid, struct task_struct *p)
LSM_HOOK(int, 0, task_getsid, struct task_struct *p)
LSM_HOOK(void, LSM_RET_VOID, current_getsecid_subj, u32 *secid)
LSM_HOOK(void, LSM_RET_VOID, task_getsecid_obj,
         struct task_struct *p, u32 *secid)
LSM_HOOK(int, 0, task_setnice, struct task_struct *p, int nice)
LSM_HOOK(int, 0, task_setioprio, struct task_struct *p, int ioprio)
LSM_HOOK(int, 0, task_getioprio, struct task_struct *p)
LSM_HOOK(int, 0, task_prlimit, const struct cred *cred,
         const struct cred *tcred, unsigned int flags)
LSM_HOOK(int, 0, task_setrlimit, struct task_struct *p, unsigned int resource,
         struct rlimit *new_rlim)
LSM_HOOK(int, 0, task_setscheduler, struct task_struct *p)
LSM_HOOK(int, 0, task_getscheduler, struct task_struct *p)
LSM_HOOK(int, 0, task_movememory, struct task_struct *p)
LSM_HOOK(int, 0, task_kill, struct task_struct *p, struct kernel_siginfo *info,
         int sig, const struct cred *cred)
LSM_HOOK(int, -ENOSYS, task_prctl, int option, unsigned long arg2,
         unsigned long arg3, unsigned long arg4, unsigned long arg5)
LSM_HOOK(void, LSM_RET_VOID, task_to_inode, struct task_struct *p,
         struct inode *inode)
LSM_HOOK(int, 0, userns_create, const struct cred *cred)
LSM_HOOK(int, 0, ipc_permission, struct kern_ipc_perm *ipcp, short flag)
LSM_HOOK(void, LSM_RET_VOID, ipc_getsecid, struct kern_ipc_perm *ipcp,
         u32 *secid)
LSM_HOOK(int, 0, msg_msg_alloc_security, struct msg_msg *msg)
LSM_HOOK(void, LSM_RET_VOID, msg_msg_free_security, struct msg_msg *msg)
LSM_HOOK(int, 0, msg_queue_alloc_security, struct kern_ipc_perm *perm)
LSM_HOOK(void, LSM_RET_VOID, msg_queue_free_security,
         struct kern_ipc_perm *perm)
LSM_HOOK(int, 0, msg_queue_associate, struct kern_ipc_perm *perm, int msqflg)
LSM_HOOK(int, 0, msg_queue_msgctl, struct kern_ipc_perm *perm, int cmd)
LSM_HOOK(int, 0, msg_queue_msgsnd, struct kern_ipc_perm *perm,
         struct msg_msg *msg, int msqflg)
LSM_HOOK(int, 0, msg_queue_msgrcv, struct kern_ipc_perm *perm,
         struct msg_msg *msg, struct task_struct *target, long type, int mode)
LSM_HOOK(int, 0, shm_alloc_security, struct kern_ipc_perm *perm)
LSM_HOOK(void, LSM_RET_VOID, shm_free_security, struct kern_ipc_perm *perm)
LSM_HOOK(int, 0, shm_associate, struct kern_ipc_perm *perm, int shmflg)
LSM_HOOK(int, 0, shm_shmctl, struct kern_ipc_perm *perm, int cmd)
LSM_HOOK(int, 0, shm_shmat, struct kern_ipc_perm *perm, char __user *shmaddr,
         int shmflg)
LSM_HOOK(int, 0, sem_alloc_security, struct kern_ipc_perm *perm)
LSM_HOOK(void, LSM_RET_VOID, sem_free_security, struct kern_ipc_perm *perm)
LSM_HOOK(int, 0, sem_associate, struct kern_ipc_perm *perm, int semflg)
LSM_HOOK(int, 0, sem_semctl, struct kern_ipc_perm *perm, int cmd)
LSM_HOOK(int, 0, sem_semop, struct kern_ipc_perm *perm, struct sembuf *sops,
         unsigned nsops, int alter)
LSM_HOOK(int, 0, netlink_send, struct sock *sk, struct sk_buff *skb)
LSM_HOOK(void, LSM_RET_VOID, d_instantiate, struct dentry *dentry,
         struct inode *inode)
LSM_HOOK(int, -EOPNOTSUPP, getselfattr, unsigned int attr,
         struct lsm_ctx __user *ctx, u32 *size, u32 flags)
LSM_HOOK(int, -EOPNOTSUPP, setselfattr, unsigned int attr,
         struct lsm_ctx *ctx, u32 size, u32 flags)
LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, const char *name,
         char **value)
LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size)
LSM_HOOK(int, 0, ismaclabel, const char *name)
LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, char **secdata,
         u32 *seclen)
LSM_HOOK(int, 0, secctx_to_secid, const char *secdata, u32 seclen, u32 *secid)
LSM_HOOK(void, LSM_RET_VOID, release_secctx, char *secdata, u32 seclen)
LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode)
LSM_HOOK(int, 0, inode_notifysecctx, struct inode *inode, void *ctx, u32 ctxlen)
LSM_HOOK(int, 0, inode_setsecctx, struct dentry *dentry, void *ctx, u32 ctxlen)
LSM_HOOK(int, -EOPNOTSUPP, inode_getsecctx, struct inode *inode, void **ctx,
         u32 *ctxlen)

/* Emitted only when both CONFIG_SECURITY and CONFIG_WATCH_QUEUE are defined. */
#if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE)
LSM_HOOK(int, 0, post_notification, const struct cred *w_cred,
         const struct cred *cred, struct watch_notification *n)
#endif /* CONFIG_SECURITY && CONFIG_WATCH_QUEUE */

/* Emitted only when both CONFIG_SECURITY and CONFIG_KEY_NOTIFICATIONS are defined. */
#if defined(CONFIG_SECURITY) && defined(CONFIG_KEY_NOTIFICATIONS)
LSM_HOOK(int, 0, watch_key, struct key *key)
#endif /* CONFIG_SECURITY && CONFIG_KEY_NOTIFICATIONS */

/*
 * Socket, sock, tun, SCTP and MPTCP entries.  The whole group is emitted only
 * when CONFIG_SECURITY_NETWORK is defined.
 */
#ifdef CONFIG_SECURITY_NETWORK
LSM_HOOK(int, 0, unix_stream_connect, struct sock *sock, struct sock *other,
         struct sock *newsk)
LSM_HOOK(int, 0, unix_may_send, struct socket *sock, struct socket *other)
/* socket_* entries: all but socket_create take a struct socket * first. */
LSM_HOOK(int, 0, socket_create, int family, int type, int protocol, int kern)
LSM_HOOK(int, 0, socket_post_create, struct socket *sock, int family, int type,
         int protocol, int kern)
LSM_HOOK(int, 0, socket_socketpair, struct socket *socka, struct socket *sockb)
LSM_HOOK(int, 0, socket_bind, struct socket *sock, struct sockaddr *address,
         int addrlen)
LSM_HOOK(int, 0, socket_connect, struct socket *sock, struct sockaddr *address,
         int addrlen)
LSM_HOOK(int, 0, socket_listen, struct socket *sock, int backlog)
LSM_HOOK(int, 0, socket_accept, struct socket *sock, struct socket *newsock)
LSM_HOOK(int, 0, socket_sendmsg, struct socket *sock, struct msghdr *msg,
         int size)
LSM_HOOK(int, 0, socket_recvmsg, struct socket *sock, struct msghdr *msg,
         int size, int flags)
LSM_HOOK(int, 0, socket_getsockname, struct socket *sock)
LSM_HOOK(int, 0, socket_getpeername, struct socket *sock)
LSM_HOOK(int, 0, socket_getsockopt, struct socket *sock, int level, int optname)
LSM_HOOK(int, 0, socket_setsockopt, struct socket *sock, int level, int optname)
LSM_HOOK(int, 0, socket_shutdown, struct socket *sock, int how)
LSM_HOOK(int, 0, socket_sock_rcv_skb, struct sock *sk, struct sk_buff *skb)
/* The two getpeersec entries carry -ENOPROTOOPT rather than 0. */
LSM_HOOK(int, -ENOPROTOOPT, socket_getpeersec_stream, struct socket *sock,
         sockptr_t optval, sockptr_t optlen, unsigned int len)
LSM_HOOK(int, -ENOPROTOOPT, socket_getpeersec_dgram, struct socket *sock,
         struct sk_buff *skb, u32 *secid)
/* sk_* / sock_graft entries operate on struct sock rather than struct socket. */
LSM_HOOK(int, 0, sk_alloc_security, struct sock *sk, int family, gfp_t priority)
LSM_HOOK(void, LSM_RET_VOID, sk_free_security, struct sock *sk)
LSM_HOOK(void, LSM_RET_VOID, sk_clone_security, const struct sock *sk,
         struct sock *newsk)
LSM_HOOK(void, LSM_RET_VOID, sk_getsecid, const struct sock *sk, u32 *secid)
LSM_HOOK(void, LSM_RET_VOID, sock_graft, struct sock *sk, struct socket *parent)
LSM_HOOK(int, 0, inet_conn_request, const struct sock *sk, struct sk_buff *skb,
         struct request_sock *req)
LSM_HOOK(void, LSM_RET_VOID, inet_csk_clone, struct sock *newsk,
         const struct request_sock *req)
LSM_HOOK(void, LSM_RET_VOID, inet_conn_established, struct sock *sk,
         struct sk_buff *skb)
LSM_HOOK(int, 0, secmark_relabel_packet, u32 secid)
LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_inc, void)
LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_dec, void)
LSM_HOOK(void, LSM_RET_VOID, req_classify_flow, const struct request_sock *req,
         struct flowi_common *flic)
/* tun_dev_* entries pass their state as an untyped void *security. */
LSM_HOOK(int, 0, tun_dev_alloc_security, void **security)
LSM_HOOK(void, LSM_RET_VOID, tun_dev_free_security, void *security)
LSM_HOOK(int, 0, tun_dev_create, void)
LSM_HOOK(int, 0, tun_dev_attach_queue, void *security)
LSM_HOOK(int, 0, tun_dev_attach, struct sock *sk, void *security)
LSM_HOOK(int, 0, tun_dev_open, void *security)
/* sctp_* entries: all but sctp_bind_connect take a struct sctp_association *. */
LSM_HOOK(int, 0, sctp_assoc_request, struct sctp_association *asoc,
         struct sk_buff *skb)
LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname,
         struct sockaddr *address, int addrlen)
LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc,
         struct sock *sk, struct sock *newsk)
LSM_HOOK(int, 0, sctp_assoc_established, struct sctp_association *asoc,
         struct sk_buff *skb)
LSM_HOOK(int, 0, mptcp_add_subflow, struct sock *sk, struct sock *ssk)
#endif /* CONFIG_SECURITY_NETWORK */

/*
 * ib_* entries, emitted only when CONFIG_SECURITY_INFINIBAND is defined.
 * Their state is passed as an untyped void *sec (void **sec for the alloc entry).
 */
#ifdef CONFIG_SECURITY_INFINIBAND
LSM_HOOK(int, 0, ib_pkey_access, void *sec, u64 subnet_prefix, u16 pkey)
LSM_HOOK(int, 0, ib_endport_manage_subnet, void *sec, const char *dev_name,
         u8 port_num)
LSM_HOOK(int, 0, ib_alloc_security, void **sec)
LSM_HOOK(void, LSM_RET_VOID, ib_free_security, void *sec)
#endif /* CONFIG_SECURITY_INFINIBAND */

/* xfrm_* entries, emitted only when CONFIG_SECURITY_NETWORK_XFRM is defined. */
#ifdef CONFIG_SECURITY_NETWORK_XFRM
LSM_HOOK(int, 0, xfrm_policy_alloc_security, struct xfrm_sec_ctx **ctxp,
         struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp)
LSM_HOOK(int, 0, xfrm_policy_clone_security, struct xfrm_sec_ctx *old_ctx,
         struct xfrm_sec_ctx **new_ctx)
LSM_HOOK(void, LSM_RET_VOID, xfrm_policy_free_security,
         struct xfrm_sec_ctx *ctx)
LSM_HOOK(int, 0, xfrm_policy_delete_security, struct xfrm_sec_ctx *ctx)
LSM_HOOK(int, 0, xfrm_state_alloc, struct xfrm_state *x,
         struct xfrm_user_sec_ctx *sec_ctx)
LSM_HOOK(int, 0, xfrm_state_alloc_acquire, struct xfrm_state *x,
         struct xfrm_sec_ctx *polsec, u32 secid)
LSM_HOOK(void, LSM_RET_VOID, xfrm_state_free_security, struct xfrm_state *x)
LSM_HOOK(int, 0, xfrm_state_delete_security, struct xfrm_state *x)
LSM_HOOK(int, 0, xfrm_policy_lookup, struct xfrm_sec_ctx *ctx, u32 fl_secid)
/*
 * The only entry in this group whose second field is 1 instead of 0.
 * NOTE(review): presumably "match" is the default outcome -- confirm.
 */
LSM_HOOK(int, 1, xfrm_state_pol_flow_match, struct xfrm_state *x,
         struct xfrm_policy *xp, const struct flowi_common *flic)
LSM_HOOK(int, 0, xfrm_decode_session, struct sk_buff *skb, u32 *secid,
         int ckall)
#endif /* CONFIG_SECURITY_NETWORK_XFRM */

/* key management security hooks; emitted only when CONFIG_KEYS is defined */
#ifdef CONFIG_KEYS
LSM_HOOK(int, 0, key_alloc, struct key *key, const struct cred *cred,
         unsigned long flags)
LSM_HOOK(void, LSM_RET_VOID, key_free, struct key *key)
/* Takes a key_ref_t, unlike the other key_* entries which take struct key *. */
LSM_HOOK(int, 0, key_permission, key_ref_t key_ref, const struct cred *cred,
         enum key_need_perm need_perm)
LSM_HOOK(int, 0, key_getsecurity, struct key *key, char **buffer)
LSM_HOOK(void, LSM_RET_VOID, key_post_create_or_update, struct key *keyring,
         struct key *key, const void *payload, size_t payload_len,
         unsigned long flags, bool create)
#endif /* CONFIG_KEYS */

/*
 * audit_rule_* entries, emitted only when CONFIG_AUDIT is defined.  The rule
 * object travels as an untyped void *lsmrule (void **lsmrule for the init entry).
 */
#ifdef CONFIG_AUDIT
LSM_HOOK(int, 0, audit_rule_init, u32 field, u32 op, char *rulestr,
         void **lsmrule, gfp_t gfp)
LSM_HOOK(int, 0, audit_rule_known, struct audit_krule *krule)
LSM_HOOK(int, 0, audit_rule_match, u32 secid, u32 field, u32 op, void *lsmrule)
LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule)
#endif /* CONFIG_AUDIT */

/* bpf* entries, emitted only when CONFIG_BPF_SYSCALL is defined. */
#ifdef CONFIG_BPF_SYSCALL
LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size)
LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode)
LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog)
/* The create/load entries take a struct bpf_token * as their final argument. */
LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr,
         struct bpf_token *token)
LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map)
LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
         struct bpf_token *token)
LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
/* bpf_token_* entries; the cmd/capable pair take the token as const. */
LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr,
         struct path *path)
LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token)
LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd)
LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap)
#endif /* CONFIG_BPF_SYSCALL */

/* Not wrapped in any #ifdef visible in this part of the list. */
LSM_HOOK(int, 0, locked_down, enum lockdown_reason what)

/* perf_event_* entries, emitted only when CONFIG_PERF_EVENTS is defined. */
#ifdef CONFIG_PERF_EVENTS
LSM_HOOK(int, 0, perf_event_open, struct perf_event_attr *attr, int type)
LSM_HOOK(int, 0, perf_event_alloc, struct perf_event *event)
LSM_HOOK(void, LSM_RET_VOID, perf_event_free, struct perf_event *event)
LSM_HOOK(int, 0, perf_event_read, struct perf_event *event)
LSM_HOOK(int, 0, perf_event_write, struct perf_event *event)
#endif /* CONFIG_PERF_EVENTS */

/* uring_* entries, emitted only when CONFIG_IO_URING is defined. */
#ifdef CONFIG_IO_URING
LSM_HOOK(int, 0, uring_override_creds, const struct cred *new)
LSM_HOOK(int, 0, uring_sqpoll, void)
LSM_HOOK(int, 0, uring_cmd, struct io_uring_cmd *ioucmd)
#endif /* CONFIG_IO_URING */






















































































































































































































    1 





    1 









































































    1 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
// SPDX-License-Identifier: GPL-2.0
/*
 * A fast, small, non-recursive O(n log n) sort for the Linux kernel
 *
 * This performs n*log2(n) + 0.37*n + o(n) comparisons on average,
 * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case.
 *
 * Glibc qsort() manages n*log2(n) - 1.26*n for random inputs (1.63*n
 * better) at the expense of stack usage and much larger code to avoid
 * quicksort's O(n^2) worst case.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/export.h>
#include <linux/sort.h>

/**
 * is_aligned - is this pointer & size okay for word-wide copying?
 * @base: pointer to data
 * @size: size of each element
 * @align: required alignment (typically 4 or 8)
 *
 * Returns true if elements can be copied using word loads and stores.
 * The size must be a multiple of the alignment.  The base address must also
 * be a multiple of the alignment, unless the kernel is built with
 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, in which case @base is ignored.
 *
 * Only the low 8 bits of @size and @base take part in the test, which is
 * sufficient because @align itself is an unsigned char.
 *
 * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
 * to "if ((a | b) & mask)", so we do that by hand.
 */
__attribute_const__ __always_inline
static bool is_aligned(const void *base, size_t size, unsigned char align)
{
        const unsigned char mask = align - 1;
        unsigned char bits = (unsigned char)size;

        /* Keep @base "used" even when the block below is compiled out. */
        (void)base;
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
        /* Fold the address bits in so one mask test covers both values. */
        bits |= (unsigned char)(uintptr_t)base;
#endif
        return !(bits & mask);
}

/**
 * swap_words_32 - swap two elements in 32-bit chunks
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size (must be a multiple of 4)
 *
 * Exchange the two objects in memory.  This exploits base+index addressing,
 * which basically all CPUs have, to minimize loop overhead computations.
 *
 * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
 * bottom of the loop, even though the zero flag is still valid from the
 * subtract (since the intervening mov instructions don't alter the flags).
 * Gcc 8.1.0 doesn't have that problem.
 */
static void swap_words_32(void *a, void *b, size_t n)
{
        /*
         * Walk from the last 32-bit word down to offset 0.  @n is decremented
         * before it is tested, so the body always runs at least once.
         */
        do {
                u32 *pa, *pb;
                u32 tmp;

                n -= 4;
                pa = a + n;
                pb = b + n;

                tmp = *pa;
                *pa = *pb;
                *pb = tmp;
        } while (n);
}

/**
 * swap_words_64 - swap two elements in 64-bit chunks
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size (must be a multiple of 8)
 *
 * Exchange the two objects in memory.  This exploits base+index
 * addressing, which basically all CPUs have, to minimize loop overhead
 * computations.
 *
 * We'd like to use 64-bit loads if possible.  If they're not, emulating
 * one requires base+index+4 addressing which x86 has but most other
 * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
 * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
 * x32 ABI).  Are there any cases the kernel needs to worry about?
 */
static void swap_words_64(void *a, void *b, size_t n)
{
        /*
         * Walk from the end of the element down to offset 0, 8 bytes per
         * iteration.  @n is decremented before it is tested, so the body
         * always runs at least once.
         */
        do {
#ifdef CONFIG_64BIT
                u64 *pa, *pb;
                u64 tmp;

                n -= 8;
                pa = a + n;
                pb = b + n;

                tmp = *pa;
                *pa = *pb;
                *pb = tmp;
#else
                /* Use two 32-bit transfers to avoid base+index+4 addressing */
                u32 *pa, *pb;
                u32 tmp;

                /* Higher-addressed half of the 8-byte chunk first ... */
                n -= 4;
                pa = a + n;
                pb = b + n;

                tmp = *pa;
                *pa = *pb;
                *pb = tmp;

                /* ... then the lower-addressed half. */
                n -= 4;
                pa = a + n;
                pb = b + n;

                tmp = *pa;
                *pa = *pb;
                *pb = tmp;
#endif
        } while (n);
}

/**
 * swap_bytes - swap two elements a byte at a time
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size
 *
 * This is the fallback if alignment doesn't allow using larger chunks.
 */
static void swap_bytes(void *a, void *b, size_t n)
{
        char *x = a;
        char *y = b;

        /*
         * Exchange bytes from index n - 1 down to 0.  @n is decremented
         * before it is tested, so the body always runs at least once.
         */
        do {
                char tmp;

                n--;
                tmp = x[n];
                x[n] = y[n];
                y[n] = tmp;
        } while (n);
}

/*
 * Sentinel values that stand in for the built-in swap routines (and for the
 * struct wrapper callback) wherever a swap_r_func_t is expected.
 *
 * The values are arbitrary as long as they can't be confused with
 * a pointer, but small integers make for the smallest compare
 * instructions.  Note that SWAP_WORDS_64 has the same value as a null
 * function pointer.
 *
 * Each expansion is wrapped in parentheses so the cast stays one unit
 * whatever operator the macro is used next to (e.g. sizeof).
 */
#define SWAP_WORDS_64 ((swap_r_func_t)0)
#define SWAP_WORDS_32 ((swap_r_func_t)1)
#define SWAP_BYTES    ((swap_r_func_t)2)
#define SWAP_WRAPPER  ((swap_r_func_t)3)

/*
 * Pair of callbacks that take no private argument.  sort() fills one of
 * these in and passes its address to sort_r() as the private pointer;
 * do_cmp() and do_swap() then call the members through that pointer.
 */
struct wrapper {
        cmp_func_t cmp;         /* two-argument comparison callback */
        swap_func_t swap;       /* swap callback; sort_r() falls back to a built-in swap when NULL */
};

/*
 * Exchange two elements using whichever swap method @swap_func selects:
 * SWAP_WRAPPER calls the ->swap member of the struct wrapper that @priv
 * points to, the other SWAP_* values call the matching built-in routine,
 * and anything else is treated as a real function and called with @priv
 * as its last argument.
 *
 * The selector arguments follow the data arguments to make tail calls most
 * efficient if the compiler decides not to inline this function.
 */
static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
{
        if (swap_func == SWAP_WRAPPER) {
                const struct wrapper *w = priv;

                w->swap(a, b, (int)size);
        } else if (swap_func == SWAP_WORDS_64) {
                swap_words_64(a, b, size);
        } else if (swap_func == SWAP_WORDS_32) {
                swap_words_32(a, b, size);
        } else if (swap_func == SWAP_BYTES) {
                swap_bytes(a, b, size);
        } else {
                /* Genuine caller-supplied swap routine. */
                swap_func(a, b, (int)size, priv);
        }
}

/* Sentinel comparator: compare through the struct wrapper behind priv. */
#define _CMP_WRAPPER ((cmp_r_func_t)0L)

/*
 * Compare two elements.  When @cmp is _CMP_WRAPPER the two-argument ->cmp
 * member of the struct wrapper that @priv points to is called; otherwise
 * @cmp is called directly with @priv as its third argument.
 */
static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
{
        const struct wrapper *w = priv;

        /* w is only dereferenced on the _CMP_WRAPPER path. */
        return cmp == _CMP_WRAPPER ? w->cmp(a, b) : cmp(a, b, priv);
}

/**
 * parent - given the offset of the child, find the offset of the parent.
 * @i: the offset of the heap element whose parent is sought.  Non-zero.
 * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
 * @size: size of each element
 *
 * In terms of array indexes, the parent of element j = @i/@size is simply
 * (j-1)/2.  But when working in byte offsets, we can't use implicit
 * truncation of integer divides.
 *
 * Fortunately, we only need one bit of the quotient, not the full divide.
 * @size has a least significant bit.  That bit will be clear if @i is
 * an even multiple of @size, and set if it's an odd multiple.
 *
 * Logically, we're doing "if (i & lsbit) i -= size;", but since the
 * branch is unpredictable, it's done with a bit of clever branch-free
 * code instead.
 */
__attribute_const__ __always_inline
static size_t parent(size_t i, unsigned int lsbit, size_t size)
{
        size_t odd_mask;

        /* Step back one element: byte offset of (j - 1) for index j. */
        i -= size;

        /*
         * (i & lsbit) is either 0 or lsbit.  Negating it gives either 0 or
         * a mask with every bit from lsbit upwards set; since lsbit is the
         * lowest set bit of size, "size & odd_mask" is then 0 or size.
         */
        odd_mask = -(i & lsbit);
        i -= size & odd_mask;

        /* i is now an even multiple of size, so halving it is exact. */
        return i >> 1;
}

/**
 * sort_r - sort an array of elements
 * @base: pointer to data to sort
 * @num: number of elements
 * @size: size of each element
 * @cmp_func: pointer to comparison function
 * @swap_func: pointer to swap function or NULL
 * @priv: opaque pointer passed as the last argument to the comparison
 *        function and to a caller-supplied swap function
 *
 * This function does a heapsort on the given array.  You may provide
 * a swap_func function if you need to do something more than a memory
 * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
 * avoids a slow retpoline and so is significantly faster.
 *
 * When @swap_func is NULL a built-in swap is chosen from the alignment of
 * @base and @size: 64-bit words, 32-bit words, or single bytes.
 *
 * The function returns immediately, without calling @cmp_func, when
 * @num < 2 or @size == 0.
 *
 * Sorting time is O(n log n) both on average and worst-case. While
 * quicksort is slightly faster on average, it suffers from exploitable
 * O(n*n) worst-case behavior and extra memory requirements that make
 * it less suitable for kernel use.
 */
void sort_r(void *base, size_t num, size_t size,
            cmp_r_func_t cmp_func,
            swap_r_func_t swap_func,
            const void *priv)
{
        /*
         * pre-scale counters for performance: n, a (and b, c, d below) are
         * byte offsets from base, not element indexes.
         */
        size_t n = num * size, a = (num/2) * size;
        const unsigned int lsbit = size & -size;  /* Used to find parent */
        /*
         * 0 while building the heap, so "a" steps back one element at a
         * time; set to 0 or 1 when two elements are extracted at once.
         */
        size_t shift = 0;

        if (!a)                /* num < 2 || size == 0 */
                return;

        /* called from 'sort' without swap function, let's pick the default */
        if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
                swap_func = NULL;

        /* No swap routine supplied: use the widest built-in the alignment allows */
        if (!swap_func) {
                if (is_aligned(base, size, 8))
                        swap_func = SWAP_WORDS_64;
                else if (is_aligned(base, size, 4))
                        swap_func = SWAP_WORDS_32;
                else
                        swap_func = SWAP_BYTES;
        }

        /*
         * Loop invariants:
         * 1. elements [a,n) satisfy the heap property (compare greater than
         *    all of their children),
         * 2. elements [n,num*size) are sorted, and
         * 3. a <= b <= c <= d <= n (whenever they are valid).
         */
        for (;;) {
                size_t b, c, d;

                if (a)                        /* Building heap: sift down a */
                        a -= size << shift;
                else if (n > 3 * size) { /* Sorting: Extract two largest elements */
                        /* Move the root (largest) to the end of the heap */
                        n -= size;
                        do_swap(base, base + n, size, swap_func, priv);
                        /*
                         * shift selects the larger of the root's two children
                         * (element 1 if it compares greater, else element 2);
                         * that child is the second largest and is moved out too.
                         */
                        shift = do_cmp(base + size, base + 2 * size, cmp_func, priv) <= 0;
                        a = size << shift;
                        n -= size;
                        do_swap(base + a, base + n, size, swap_func, priv);
                } else if (n > size) {        /* Sorting: Extract root */
                        n -= size;
                        do_swap(base, base + n, size, swap_func, priv);
                } else        {                /* Sort complete */
                        break;
                }

                /*
                 * Sift element at "a" down into heap.  This is the
                 * "bottom-up" variant, which significantly reduces
                 * calls to cmp_func(): we find the sift-down path all
                 * the way to the leaves (one compare per level), then
                 * backtrack to find where to insert the target element.
                 *
                 * Because elements tend to sift down close to the leaves,
                 * this uses fewer compares than doing two per level
                 * on the way down.  (A bit more than half as many on
                 * average, 3/4 worst-case.)
                 */
                /* c and d are the offsets of b's two children; follow the larger */
                for (b = a; c = 2*b + size, (d = c + size) < n;)
                        b = do_cmp(base + c, base + d, cmp_func, priv) > 0 ? c : d;
                if (d == n)        /* Special case last leaf with no sibling */
                        b = c;

                /* Now backtrack from "b" to the correct location for "a" */
                while (b != a && do_cmp(base + a, base + b, cmp_func, priv) >= 0)
                        b = parent(b, lsbit, size);
                c = b;                        /* Where "a" belongs */
                while (b != a) {        /* Shift it into place */
                        b = parent(b, lsbit, size);
                        do_swap(base + b, base + c, size, swap_func, priv);
                }
        }
}
EXPORT_SYMBOL(sort_r);

/**
 * sort - sort an array of elements
 * @base: pointer to data to sort
 * @num: number of elements
 * @size: size of each element
 * @cmp_func: pointer to comparison function
 * @swap_func: pointer to swap function or NULL
 *
 * Front end to sort_r() for callbacks that take no private argument.
 * The two callbacks are bundled into a struct wrapper on the stack, and
 * sort_r() is invoked with the _CMP_WRAPPER / SWAP_WRAPPER sentinels so
 * that it calls them through that structure.  A NULL @swap_func makes
 * sort_r() pick one of its built-in swap routines.
 */
void sort(void *base, size_t num, size_t size,
          cmp_func_t cmp_func,
          swap_func_t swap_func)
{
        struct wrapper w = {
                .cmp  = cmp_func,
                .swap = swap_func,
        };

        /*
         * Plain call: ISO C does not allow "return <expression>;" in a
         * function returning void, even when the expression is itself void.
         */
        sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
}
EXPORT_SYMBOL(sort);






































































    1 




    1 





    1 


















































































    1 





































































































































































































































































































    1 



















    1 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 



























    1 



    1 










































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
// SPDX-License-Identifier: GPL-2.0
/*
 * Wireless utility functions
 *
 * Copyright 2007-2009        Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright 2017        Intel Deutschland GmbH
 * Copyright (C) 2018-2023 Intel Corporation
 */
#include <linux/export.h>
#include <linux/bitops.h>
#include <linux/etherdevice.h>
#include <linux/slab.h>
#include <linux/ieee80211.h>
#include <net/cfg80211.h>
#include <net/ip.h>
#include <net/dsfield.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/gcd.h>
#include <linux/bitfield.h>
#include <linux/nospec.h>
#include "core.h"
#include "rdev-ops.h"


/*
 * Choose the rate to use for a response frame.
 *
 * Walks the band's bitrate table and remembers the last entry that is both
 * flagged in @basic_rates (bit i stands for table entry i) and whose bitrate
 * does not exceed @bitrate.  If no entry qualifies, the first entry of the
 * table is returned.
 */
const struct ieee80211_rate *
ieee80211_get_response_rate(struct ieee80211_supported_band *sband,
                            u32 basic_rates, int bitrate)
{
        struct ieee80211_rate *best = sband->bitrates;
        int idx;

        for (idx = 0; idx < sband->n_bitrates; idx++) {
                struct ieee80211_rate *rate = &sband->bitrates[idx];

                if ((basic_rates & BIT(idx)) && rate->bitrate <= bitrate)
                        best = rate;
        }

        return best;
}
EXPORT_SYMBOL(ieee80211_get_response_rate);

u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband)
{
        struct ieee80211_rate *bitrates;
        u32 mandatory_rates = 0;
        enum ieee80211_rate_flags mandatory_flag;
        int i;

        if (WARN_ON(!sband))
                return 1;

        if (sband->band == NL80211_BAND_2GHZ)
                mandatory_flag = IEEE80211_RATE_MANDATORY_B;
        else
                mandatory_flag = IEEE80211_RATE_MANDATORY_A;

        bitrates = sband->bitrates;
        for (i = 0; i < sband->n_bitrates; i++)
                if (bitrates[i].flags & mandatory_flag)
                        mandatory_rates |= BIT(i);
        return mandatory_rates;
}
EXPORT_SYMBOL(ieee80211_mandatory_rates);

/*
 * Convert a channel number within @band to its center frequency in KHz.
 *
 * See 802.11 17.3.8.3.2 and Annex J.  Channel numbers overlap between the
 * 2 GHz and 5 GHz bands, which is why the band has to be supplied.
 *
 * Returns 0 when the channel/band combination is not supported.
 */
u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band)
{
        if (chan <= 0)
                return 0; /* not supported */

        switch (band) {
        case NL80211_BAND_2GHZ:
        case NL80211_BAND_LC:
                if (chan > 14)
                        break;
                /* channel 14 does not follow the 5 MHz spacing */
                return chan == 14 ? MHZ_TO_KHZ(2484) :
                                    MHZ_TO_KHZ(2407 + chan * 5);
        case NL80211_BAND_5GHZ:
                /* channels 182..196 are numbered from a 4000 MHz base */
                if (chan >= 182 && chan <= 196)
                        return MHZ_TO_KHZ(4000 + chan * 5);
                return MHZ_TO_KHZ(5000 + chan * 5);
        case NL80211_BAND_6GHZ:
                /* see 802.11ax D6.1 27.3.23.2 */
                if (chan == 2)
                        return MHZ_TO_KHZ(5935);
                if (chan > 233)
                        break;
                return MHZ_TO_KHZ(5950 + chan * 5);
        case NL80211_BAND_60GHZ:
                if (chan >= 7)
                        break;
                return MHZ_TO_KHZ(56160 + chan * 2160);
        case NL80211_BAND_S1GHZ:
                /* already expressed in KHz: 500 KHz steps from 902000 */
                return 902000 + chan * 500;
        default:
                break;
        }

        return 0; /* not supported */
}
EXPORT_SYMBOL(ieee80211_channel_to_freq_khz);

/*
 * Return the channel width of an S1G channel.
 *
 * S1G defines a single allowed width per channel, encoded in the channel
 * flags; the flags are tested from narrowest (1 MHz) to widest (16 MHz) and
 * the first one found wins.
 *
 * A NULL or non-S1G channel triggers a warning and yields
 * NL80211_CHAN_WIDTH_20_NOHT.  An S1G channel with none of the width flags
 * set is logged and treated as 1 MHz wide.
 */
enum nl80211_chan_width
ieee80211_s1g_channel_width(const struct ieee80211_channel *chan)
{
        if (WARN_ON(!chan || chan->band != NL80211_BAND_S1GHZ))
                return NL80211_CHAN_WIDTH_20_NOHT;

        if (chan->flags & IEEE80211_CHAN_1MHZ)
                return NL80211_CHAN_WIDTH_1;
        if (chan->flags & IEEE80211_CHAN_2MHZ)
                return NL80211_CHAN_WIDTH_2;
        if (chan->flags & IEEE80211_CHAN_4MHZ)
                return NL80211_CHAN_WIDTH_4;
        if (chan->flags & IEEE80211_CHAN_8MHZ)
                return NL80211_CHAN_WIDTH_8;
        if (chan->flags & IEEE80211_CHAN_16MHZ)
                return NL80211_CHAN_WIDTH_16;

        pr_err("unknown channel width for channel at %dKHz?\n",
               ieee80211_channel_to_khz(chan));

        return NL80211_CHAN_WIDTH_1;
}
EXPORT_SYMBOL(ieee80211_s1g_channel_width);

/*
 * Convert a center frequency in KHz to a channel number.
 *
 * The frequency is first reduced to MHz; the band is then inferred from the
 * frequency range (see 802.11 17.3.8.3.2 and Annex J).
 *
 * Returns the channel number, or 0 when the frequency cannot be mapped.
 *
 * @freq is unsigned, so every "freq - base" below is guarded against
 * frequencies under the base: without the guard the subtraction wraps
 * around and a large bogus channel number is returned instead of 0 (for
 * example for sub-GHz S1G frequencies, or for 5925..5949 MHz).
 */
int ieee80211_freq_khz_to_channel(u32 freq)
{
        /* TODO: just handle MHz for now */
        freq = KHZ_TO_MHZ(freq);

        /* 2.4 GHz: channel 14 is off the regular 5 MHz grid */
        if (freq == 2484)
                return 14;
        if (freq < 2484)
                return freq < 2407 ? 0 : (freq - 2407) / 5;

        /* 4.9 GHz channels are numbered from a 4000 MHz base */
        if (freq >= 4910 && freq <= 4980)
                return (freq - 4000) / 5;

        /* 5 GHz */
        if (freq < 5925)
                return freq < 5000 ? 0 : (freq - 5000) / 5;

        /* 6 GHz, see 802.11ax D6.1 27.3.22.2 */
        if (freq == 5935)
                return 2;
        if (freq <= 45000) /* DMG band lower limit */
                return freq < 5950 ? 0 : (freq - 5950) / 5;

        /* 60 GHz */
        if (freq >= 58320 && freq <= 70200)
                return (freq - 56160) / 2160;

        return 0;
}
EXPORT_SYMBOL(ieee80211_freq_khz_to_channel);

/*
 * Look up the channel of @wiphy whose frequency equals @freq (in KHz).
 *
 * Every registered band is searched in band order and the first matching
 * channel is returned; NULL means no band has a channel at that frequency.
 */
struct ieee80211_channel *ieee80211_get_channel_khz(struct wiphy *wiphy,
                                                    u32 freq)
{
        enum nl80211_band band;

        for (band = 0; band < NUM_NL80211_BANDS; band++) {
                struct ieee80211_supported_band *sband = wiphy->bands[band];
                int idx;

                /* band not registered on this wiphy */
                if (!sband)
                        continue;

                for (idx = 0; idx < sband->n_channels; idx++) {
                        struct ieee80211_channel *chan;

                        chan = &sband->channels[idx];
                        if (ieee80211_channel_to_khz(chan) != freq)
                                continue;

                        return chan;
                }
        }

        return NULL;
}
EXPORT_SYMBOL(ieee80211_get_channel_khz);

/*
 * Flag the mandatory rates in one band's bitrate table and warn when the
 * band lacks what is checked for.
 *
 * Per band:
 *  - 5 GHz / 6 GHz: entries with bitrate 60, 120 or 240 get
 *    IEEE80211_RATE_MANDATORY_A; warns unless exactly three entries matched.
 *  - 2.4 GHz / LC: entries 10, 20, 55 and 110 get MANDATORY_B | MANDATORY_G;
 *    entries 60, 120 and 240 get MANDATORY_G; those three and every entry
 *    not listed also get IEEE80211_RATE_ERP_G.  Warns unless seven or four
 *    of the listed entries matched.
 *  - 60 GHz: no flags are set; warns if HT is not supported or RX MCS 1..4
 *    are not all present.
 *  - S1G: no flags are set; warns if the low two bits of
 *    s1g_cap.nss_mcs[0] are both set.
 *  - any other band value: warns.
 */
static void set_mandatory_flags_band(struct ieee80211_supported_band *sband)
{
        int i, want;

        switch (sband->band) {
        case NL80211_BAND_5GHZ:
        case NL80211_BAND_6GHZ:
                /* count down once for each of the three rates found */
                want = 3;
                for (i = 0; i < sband->n_bitrates; i++) {
                        if (sband->bitrates[i].bitrate == 60 ||
                            sband->bitrates[i].bitrate == 120 ||
                            sband->bitrates[i].bitrate == 240) {
                                sband->bitrates[i].flags |=
                                        IEEE80211_RATE_MANDATORY_A;
                                want--;
                        }
                }
                WARN_ON(want);
                break;
        case NL80211_BAND_2GHZ:
        case NL80211_BAND_LC:
                /* four B/G rates plus three G-only rates */
                want = 7;
                for (i = 0; i < sband->n_bitrates; i++) {
                        switch (sband->bitrates[i].bitrate) {
                        case 10:
                        case 20:
                        case 55:
                        case 110:
                                sband->bitrates[i].flags |=
                                        IEEE80211_RATE_MANDATORY_B |
                                        IEEE80211_RATE_MANDATORY_G;
                                want--;
                                break;
                        case 60:
                        case 120:
                        case 240:
                                sband->bitrates[i].flags |=
                                        IEEE80211_RATE_MANDATORY_G;
                                want--;
                                /* these rates are ERP rates as well */
                                fallthrough;
                        default:
                                sband->bitrates[i].flags |=
                                        IEEE80211_RATE_ERP_G;
                                break;
                        }
                }
                /* accepted: all seven matched (0 left) or only four (3 left) */
                WARN_ON(want != 0 && want != 3);
                break;
        case NL80211_BAND_60GHZ:
                /* check for mandatory HT MCS 1..4 */
                WARN_ON(!sband->ht_cap.ht_supported);
                WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e);
                break;
        case NL80211_BAND_S1GHZ:
                /* Figure 9-589bd: 3 means unsupported, so != 3 means at least
                 * mandatory is ok.
                 */
                WARN_ON((sband->s1g_cap.nss_mcs[0] & 0x3) == 0x3);
                break;
        case NUM_NL80211_BANDS:
        default:
                /* not a real band */
                WARN_ON(1);
                break;
        }
}

/*
 * Run set_mandatory_flags_band() on every band that @wiphy has registered;
 * unregistered (NULL) band slots are skipped.
 */
void ieee80211_set_bitrate_flags(struct wiphy *wiphy)
{
        enum nl80211_band band;

        for (band = 0; band < NUM_NL80211_BANDS; band++) {
                struct ieee80211_supported_band *sband = wiphy->bands[band];

                if (!sband)
                        continue;

                set_mandatory_flags_band(sband);
        }
}

/*
 * Tell whether @cipher appears in the cipher suite list advertised by
 * @wiphy.
 */
bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher)
{
        int idx = 0;

        while (idx < wiphy->n_cipher_suites) {
                if (wiphy->cipher_suites[idx] == cipher)
                        return true;
                idx++;
        }

        return false;
}

/*
 * Tell whether the device advertises at least one of the BIP cipher suites
 * (AES-CMAC, BIP-CMAC-256, BIP-GMAC-128, BIP-GMAC-256).
 */
static bool
cfg80211_igtk_cipher_supported(struct cfg80211_registered_device *rdev)
{
        int idx;

        for (idx = 0; idx < rdev->wiphy.n_cipher_suites; idx++) {
                switch (rdev->wiphy.cipher_suites[idx]) {
                case WLAN_CIPHER_SUITE_AES_CMAC:
                case WLAN_CIPHER_SUITE_BIP_CMAC_256:
                case WLAN_CIPHER_SUITE_BIP_GMAC_128:
                case WLAN_CIPHER_SUITE_BIP_GMAC_256:
                        return true;
                default:
                        break;
                }
        }

        return false;
}

/*
 * Check that @key_idx lies within the range the device accepts.
 *
 * The lower bound is always 0.  The upper bound is:
 *   3 for pairwise keys,
 *   7 for group keys when the device has either beacon protection
 *     extended feature,
 *   5 for group keys when it advertises a BIP cipher,
 *   3 for group keys otherwise.
 */
bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev,
                            int key_idx, bool pairwise)
{
        int highest = 3;

        if (!pairwise) {
                if (wiphy_ext_feature_isset(&rdev->wiphy,
                                            NL80211_EXT_FEATURE_BEACON_PROTECTION) ||
                    wiphy_ext_feature_isset(&rdev->wiphy,
                                            NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT))
                        highest = 7;
                else if (cfg80211_igtk_cipher_supported(rdev))
                        highest = 5;
        }

        return key_idx >= 0 && key_idx <= highest;
}

/*
 * Sanity-check the parameters of a key.
 *
 * The checks run in this order: key index range for this device, the
 * pairwise/MAC-address combination, cipher-specific key index and mode
 * rules, key length for the ciphers known here, sequence counter length
 * when a sequence counter is supplied, and finally whether the device lists
 * the cipher as supported.
 *
 * Returns 0 when everything is acceptable and -EINVAL otherwise.
 */
int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
                                   struct key_params *params, int key_idx,
                                   bool pairwise, const u8 *mac_addr)
{
        if (!cfg80211_valid_key_idx(rdev, key_idx, pairwise))
                return -EINVAL;

        /* a group key with a MAC address needs WIPHY_FLAG_IBSS_RSN */
        if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
                return -EINVAL;

        /* a pairwise key always needs a MAC address */
        if (pairwise && !mac_addr)
                return -EINVAL;

        /* cipher-specific restrictions on key index and key mode */
        switch (params->cipher) {
        case WLAN_CIPHER_SUITE_TKIP:
                /* Extended Key ID can only be used with CCMP/GCMP ciphers */
                if ((pairwise && key_idx) ||
                    params->mode != NL80211_KEY_RX_TX)
                        return -EINVAL;
                break;
        case WLAN_CIPHER_SUITE_CCMP:
        case WLAN_CIPHER_SUITE_CCMP_256:
        case WLAN_CIPHER_SUITE_GCMP:
        case WLAN_CIPHER_SUITE_GCMP_256:
                /* IEEE802.11-2016 allows only 0 and - when supporting
                 * Extended Key ID - 1 as index for pairwise keys.
                 * @NL80211_KEY_NO_TX is only allowed for pairwise keys when
                 * the driver supports Extended Key ID.
                 * @NL80211_KEY_SET_TX can't be set when installing and
                 * validating a key.
                 */
                if ((params->mode == NL80211_KEY_NO_TX && !pairwise) ||
                    params->mode == NL80211_KEY_SET_TX)
                        return -EINVAL;
                if (wiphy_ext_feature_isset(&rdev->wiphy,
                                            NL80211_EXT_FEATURE_EXT_KEY_ID)) {
                        if (pairwise && (key_idx < 0 || key_idx > 1))
                                return -EINVAL;
                } else if (pairwise && key_idx) {
                        return -EINVAL;
                }
                break;
        case WLAN_CIPHER_SUITE_AES_CMAC:
        case WLAN_CIPHER_SUITE_BIP_CMAC_256:
        case WLAN_CIPHER_SUITE_BIP_GMAC_128:
        case WLAN_CIPHER_SUITE_BIP_GMAC_256:
                /* Disallow BIP (group-only) cipher as pairwise cipher */
                if (pairwise)
                        return -EINVAL;
                /* BIP keys may not use indices 0..3 */
                if (key_idx < 4)
                        return -EINVAL;
                break;
        case WLAN_CIPHER_SUITE_WEP40:
        case WLAN_CIPHER_SUITE_WEP104:
                /* WEP keys are limited to indices 0..3 */
                if (key_idx > 3)
                        return -EINVAL;
                break;
        default:
                break;
        }

        /* the key length must match the cipher exactly */
        switch (params->cipher) {
        case WLAN_CIPHER_SUITE_WEP40:
                if (params->key_len != WLAN_KEY_LEN_WEP40)
                        return -EINVAL;
                break;
        case WLAN_CIPHER_SUITE_TKIP:
                if (params->key_len != WLAN_KEY_LEN_TKIP)
                        return -EINVAL;
                break;
        case WLAN_CIPHER_SUITE_CCMP:
                if (params->key_len != WLAN_KEY_LEN_CCMP)
                        return -EINVAL;
                break;
        case WLAN_CIPHER_SUITE_CCMP_256:
                if (params->key_len != WLAN_KEY_LEN_CCMP_256)
                        return -EINVAL;
                break;
        case WLAN_CIPHER_SUITE_GCMP:
                if (params->key_len != WLAN_KEY_LEN_GCMP)
                        return -EINVAL;
                break;
        case WLAN_CIPHER_SUITE_GCMP_256:
                if (params->key_len != WLAN_KEY_LEN_GCMP_256)
                        return -EINVAL;
                break;
        case WLAN_CIPHER_SUITE_WEP104:
                if (params->key_len != WLAN_KEY_LEN_WEP104)
                        return -EINVAL;
                break;
        case WLAN_CIPHER_SUITE_AES_CMAC:
                if (params->key_len != WLAN_KEY_LEN_AES_CMAC)
                        return -EINVAL;
                break;
        case WLAN_CIPHER_SUITE_BIP_CMAC_256:
                if (params->key_len != WLAN_KEY_LEN_BIP_CMAC_256)
                        return -EINVAL;
                break;
        case WLAN_CIPHER_SUITE_BIP_GMAC_128:
                if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_128)
                        return -EINVAL;
                break;
        case WLAN_CIPHER_SUITE_BIP_GMAC_256:
                if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_256)
                        return -EINVAL;
                break;
        default:
                /*
                 * We don't know anything about this algorithm,
                 * allow using it -- but the driver must check
                 * all parameters! We still check below whether
                 * or not the driver supports this algorithm,
                 * of course.
                 */
                break;
        }

        /* a sequence counter is optional; validate it only when given */
        if (params->seq) {
                switch (params->cipher) {
                case WLAN_CIPHER_SUITE_WEP40:
                case WLAN_CIPHER_SUITE_WEP104:
                        /* These ciphers do not use key sequence */
                        return -EINVAL;
                case WLAN_CIPHER_SUITE_TKIP:
                case WLAN_CIPHER_SUITE_CCMP:
                case WLAN_CIPHER_SUITE_CCMP_256:
                case WLAN_CIPHER_SUITE_GCMP:
                case WLAN_CIPHER_SUITE_GCMP_256:
                case WLAN_CIPHER_SUITE_AES_CMAC:
                case WLAN_CIPHER_SUITE_BIP_CMAC_256:
                case WLAN_CIPHER_SUITE_BIP_GMAC_128:
                case WLAN_CIPHER_SUITE_BIP_GMAC_256:
                        /* these ciphers require a 6-byte sequence counter */
                        if (params->seq_len != 6)
                                return -EINVAL;
                        break;
                }
        }

        /* last: the device must list the cipher as supported */
        if (!cfg80211_supported_cipher_suite(&rdev->wiphy, params->cipher))
                return -EINVAL;

        return 0;
}

/*
 * Compute the 802.11 header length, in bytes, from the frame control field.
 *
 *   extension frames:  4
 *   data frames:       24, or 30 with a fourth address; QoS data adds
 *                      IEEE80211_QOS_CTL_LEN and, when the order bit is
 *                      set, IEEE80211_HT_CTL_LEN on top of that
 *   management frames: 24, plus IEEE80211_HT_CTL_LEN when the order bit
 *                      is set
 *   control frames:    10 for ACK and CTS, 16 for all others
 *   anything else:     24
 */
unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc)
{
        unsigned int hdrlen = 24;

        if (ieee80211_is_ext(fc))
                return 4;

        if (ieee80211_is_data(fc)) {
                if (ieee80211_has_a4(fc))
                        hdrlen = 30;
                if (ieee80211_is_data_qos(fc)) {
                        hdrlen += IEEE80211_QOS_CTL_LEN;
                        if (ieee80211_has_order(fc))
                                hdrlen += IEEE80211_HT_CTL_LEN;
                }
                return hdrlen;
        }

        if (ieee80211_is_mgmt(fc)) {
                if (ieee80211_has_order(fc))
                        hdrlen += IEEE80211_HT_CTL_LEN;
                return hdrlen;
        }

        if (ieee80211_is_ctl(fc)) {
                /*
                 * ACK and CTS differ only in the lowest subtype bit, so
                 * both are matched by masking that bit out:
                 *   subtype mask:   0b0000000011110000 (0x00F0)
                 *   ACK subtype:    0b0000000011010000 (0x00D0)
                 *   CTS subtype:    0b0000000011000000 (0x00C0)
                 *   bits that matter:         ^^^      (0x00E0)
                 *   value of those: 0b0000000011000000 (0x00C0)
                 */
                if ((fc & cpu_to_le16(0x00E0)) == cpu_to_le16(0x00C0))
                        return 10;
                return 16;
        }

        return hdrlen;
}
EXPORT_SYMBOL(ieee80211_hdrlen);

/*
 * Return the 802.11 header length of the frame at the start of @skb.
 *
 * Returns 0 when the skb holds fewer than 10 bytes, or when the header
 * length derived from the frame control field exceeds the skb length.
 */
unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb)
{
        const struct ieee80211_hdr *hdr;
        unsigned int len;

        if (unlikely(skb->len < 10))
                return 0;

        hdr = (const struct ieee80211_hdr *)skb->data;
        len = ieee80211_hdrlen(hdr->frame_control);
        if (unlikely(len > skb->len))
                return 0;

        return len;
}
EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb);

/*
 * Return the mesh header length selected by the address extension (AE)
 * bits of the mesh @flags, see 802.11-2012, 8.2.4.7.3:
 *   MESH_FLAGS_AE_A4    -> 12
 *   MESH_FLAGS_AE_A5_A6 -> 18
 *   anything else       -> 6
 */
static unsigned int __ieee80211_get_mesh_hdrlen(u8 flags)
{
        const int ae = flags & MESH_FLAGS_AE;

        if (ae == MESH_FLAGS_AE_A4)
                return 12;
        if (ae == MESH_FLAGS_AE_A5_A6)
                return 18;

        return 6;
}

/*
 * Return the length of the mesh header @meshhdr, as determined by its
 * flags field.
 */
unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr)
{
        const unsigned int len = __ieee80211_get_mesh_hdrlen(meshhdr->flags);

        return len;
}
EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen);

/*
 * Recognise an RFC1042 or Bridge-Tunnel encapsulation header at @hdr and
 * extract the 16-bit protocol value that follows its first ETH_ALEN bytes.
 *
 * The header is accepted when it
 *   - equals rfc1042_header and the protocol is neither ETH_P_AARP nor
 *     ETH_P_IPX, or
 *   - equals bridge_tunnel_header.
 *
 * On a match the protocol is stored in *@proto and true is returned;
 * otherwise *@proto is left untouched and false is returned.
 */
bool ieee80211_get_8023_tunnel_proto(const void *hdr, __be16 *proto)
{
        const __be16 *ethertype = hdr + ETH_ALEN;
        bool rfc1042_ok;

        rfc1042_ok = ether_addr_equal(hdr, rfc1042_header) &&
                     *ethertype != htons(ETH_P_AARP) &&
                     *ethertype != htons(ETH_P_IPX);

        if (rfc1042_ok || ether_addr_equal(hdr, bridge_tunnel_header)) {
                *proto = *ethertype;
                return true;
        }

        return false;
}
EXPORT_SYMBOL(ieee80211_get_8023_tunnel_proto);

/*
 * Remove the mesh header (and, if present, the RFC1042/Bridge-Tunnel
 * encapsulation) that follows the Ethernet header at the start of @skb,
 * leaving a plain Ethernet header in front of the payload.
 *
 * Returns 0 on success, the skb_copy_bits() error if the Ethernet header
 * plus the first mesh header byte cannot be read, or -EINVAL if the skb is
 * too short to hold the headers.
 */
int ieee80211_strip_8023_mesh_hdr(struct sk_buff *skb)
{
        const void *mesh_addr;
        struct {
                struct ethhdr eth;
                u8 flags;
        } payload;
        int hdrlen;
        int ret;

        /* grab the Ethernet header and the byte right behind it (mesh flags) */
        ret = skb_copy_bits(skb, 0, &payload, sizeof(payload));
        if (ret)
                return ret;

        hdrlen = sizeof(payload.eth) + __ieee80211_get_mesh_hdrlen(payload.flags);

        /*
         * If an accepted tunnel header follows the mesh header, take the
         * protocol from it and strip it too (ETH_ALEN + 2 bytes); otherwise
         * the protocol field is set to the remaining payload length.
         */
        if (likely(pskb_may_pull(skb, hdrlen + 8) &&
                   ieee80211_get_8023_tunnel_proto(skb->data + hdrlen,
                                                   &payload.eth.h_proto)))
                hdrlen += ETH_ALEN + 2;
        else if (!pskb_may_pull(skb, hdrlen))
                return -EINVAL;
        else
                payload.eth.h_proto = htons(skb->len - hdrlen);

        /*
         * Extended addresses are read ETH_ALEN bytes past the Ethernet
         * header.
         * NOTE(review): presumably this skips the fixed part of the mesh
         * header (flags, TTL, sequence number) - confirm against
         * struct ieee80211s_hdr.
         */
        mesh_addr = skb->data + sizeof(payload.eth) + ETH_ALEN;
        switch (payload.flags & MESH_FLAGS_AE) {
        case MESH_FLAGS_AE_A4:
                /* one extended address: it replaces the source address */
                memcpy(&payload.eth.h_source, mesh_addr, ETH_ALEN);
                break;
        case MESH_FLAGS_AE_A5_A6:
                /*
                 * two extended addresses: they overwrite the first
                 * 2 * ETH_ALEN bytes of the Ethernet header
                 */
                memcpy(&payload.eth, mesh_addr, 2 * ETH_ALEN);
                break;
        default:
                break;
        }

        /*
         * Drop everything but room for one Ethernet header, then write the
         * rebuilt header at the new start of the data.
         */
        pskb_pull(skb, hdrlen - sizeof(payload.eth));
        memcpy(skb->data, &payload.eth, sizeof(payload.eth));

        return 0;
}
EXPORT_SYMBOL(ieee80211_strip_8023_mesh_hdr);

/*
 * Convert the 802.11 data frame in @skb into an 802.3 frame.
 *
 * @skb:         frame to convert; its 802.11 header (plus @data_offset
 *               bytes and, when recognised, the tunnel encapsulation) is
 *               pulled off
 * @ehdr:        where to write the resulting Ethernet header; when NULL
 *               the header is pushed onto the front of @skb instead
 * @addr:        address compared against the frame's source address for
 *               FromDS frames with a multicast destination
 *               (NOTE(review): presumably the interface's own address -
 *               confirm against callers)
 * @iftype:      interface type, used to reject ToDS/FromDS combinations
 *               that are not accepted for that type
 * @data_offset: extra bytes to skip after the 802.11 header
 * @is_amsdu:    when true, the tunnel encapsulation is never stripped
 *
 * Returns 0 on success and -1 if the frame is rejected.
 */
int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
                                  const u8 *addr, enum nl80211_iftype iftype,
                                  u8 data_offset, bool is_amsdu)
{
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
        struct {
                u8 hdr[ETH_ALEN] __aligned(2);
                __be16 proto;
        } payload;
        struct ethhdr tmp;
        u16 hdrlen;

        if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
                return -1;

        hdrlen = ieee80211_hdrlen(hdr->frame_control) + data_offset;
        if (skb->len < hdrlen)
                return -1;

        /* convert IEEE 802.11 header + possible LLC headers into Ethernet
         * header
         * IEEE 802.11 address fields:
         * ToDS FromDS Addr1 Addr2 Addr3 Addr4
         *   0     0   DA    SA    BSSID n/a
         *   0     1   DA    BSSID SA    n/a
         *   1     0   BSSID SA    DA    n/a
         *   1     1   RA    TA    DA    SA
         */
        memcpy(tmp.h_dest, ieee80211_get_DA(hdr), ETH_ALEN);
        memcpy(tmp.h_source, ieee80211_get_SA(hdr), ETH_ALEN);

        /* each ToDS/FromDS combination is only accepted on some iftypes */
        switch (hdr->frame_control &
                cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) {
        case cpu_to_le16(IEEE80211_FCTL_TODS):
                if (unlikely(iftype != NL80211_IFTYPE_AP &&
                             iftype != NL80211_IFTYPE_AP_VLAN &&
                             iftype != NL80211_IFTYPE_P2P_GO))
                        return -1;
                break;
        case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS):
                if (unlikely(iftype != NL80211_IFTYPE_MESH_POINT &&
                             iftype != NL80211_IFTYPE_AP_VLAN &&
                             iftype != NL80211_IFTYPE_STATION))
                        return -1;
                break;
        case cpu_to_le16(IEEE80211_FCTL_FROMDS):
                /*
                 * besides the iftype check, reject multicast frames whose
                 * source address equals @addr
                 */
                if ((iftype != NL80211_IFTYPE_STATION &&
                     iftype != NL80211_IFTYPE_P2P_CLIENT &&
                     iftype != NL80211_IFTYPE_MESH_POINT) ||
                    (is_multicast_ether_addr(tmp.h_dest) &&
                     ether_addr_equal(tmp.h_source, addr)))
                        return -1;
                break;
        case cpu_to_le16(0):
                if (iftype != NL80211_IFTYPE_ADHOC &&
                    iftype != NL80211_IFTYPE_STATION &&
                    iftype != NL80211_IFTYPE_OCB)
                                return -1;
                break;
        }

        /*
         * Outside of A-MSDU and mesh, look for a tunnel header right after
         * the 802.11 header; if one is recognised its protocol becomes the
         * Ethernet protocol.  In every other case the protocol field is set
         * to the payload length.
         */
        if (likely(!is_amsdu && iftype != NL80211_IFTYPE_MESH_POINT &&
                   skb_copy_bits(skb, hdrlen, &payload, sizeof(payload)) == 0 &&
                   ieee80211_get_8023_tunnel_proto(&payload, &tmp.h_proto))) {
                /* remove RFC1042 or Bridge-Tunnel encapsulation */
                hdrlen += ETH_ALEN + 2;
                skb_postpull_rcsum(skb, &payload, ETH_ALEN + 2);
        } else {
                tmp.h_proto = htons(skb->len - hdrlen);
        }

        pskb_pull(skb, hdrlen);

        /* no caller-supplied buffer: prepend the header to the skb itself */
        if (!ehdr)
                ehdr = skb_push(skb, sizeof(struct ethhdr));
        memcpy(ehdr, &tmp, sizeof(tmp));

        return 0;
}
EXPORT_SYMBOL(ieee80211_data_to_8023_exthdr);

static void
__frame_add_frag(struct sk_buff *skb, struct page *page,
                 void *ptr, int len, int size)
{
        struct skb_shared_info *sh = skb_shinfo(skb);
        int page_offset;

        get_page(page);
        page_offset = ptr - page_address(page);
        skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size);
}

/*
 * Attach @len bytes of @skb, starting at byte @offset, to @frame as page
 * fragments instead of copying the data.
 *
 * The linear part of @skb is treated as the first buffer, followed by the
 * entries of its frags[] array.  The buffer containing @offset is located
 * first; from there on, pieces are added to @frame with __frame_add_frag()
 * until @len bytes are covered.
 *
 * NOTE(review): nothing here bounds the walk by the number of fragments of
 * @skb; this presumably relies on the caller having checked offset/len
 * against skb->len - confirm.
 */
static void
__ieee80211_amsdu_copy_frag(struct sk_buff *skb, struct sk_buff *frame,
                            int offset, int len)
{
        struct skb_shared_info *sh = skb_shinfo(skb);
        const skb_frag_t *frag = &sh->frags[0];
        struct page *frag_page;
        void *frag_ptr;
        int frag_len, frag_size;
        int head_size = skb->len - skb->data_len;
        int cur_len;

        /* start with the linear data as the current buffer */
        frag_page = virt_to_head_page(skb->head);
        frag_ptr = skb->data;
        frag_size = head_size;

        /* skip whole buffers until the one containing @offset is reached */
        while (offset >= frag_size) {
                offset -= frag_size;
                frag_page = skb_frag_page(frag);
                frag_ptr = skb_frag_address(frag);
                frag_size = skb_frag_size(frag);
                frag++;
        }

        /* first piece: from @offset to the end of that buffer, at most @len */
        frag_ptr += offset;
        frag_len = frag_size - offset;

        cur_len = min(len, frag_len);

        __frame_add_frag(frame, frag_page, frag_ptr, cur_len, frag_size);
        len -= cur_len;

        /* remaining pieces: following fragments, each from its start */
        while (len > 0) {
                frag_len = skb_frag_size(frag);
                cur_len = min(len, frag_len);
                __frame_add_frag(frame, skb_frag_page(frag),
                                 skb_frag_address(frag), cur_len, frag_len);
                len -= cur_len;
                frag++;
        }
}

/*
 * Build a new skb holding @len bytes of @skb starting at @offset.
 *
 * The new skb reserves @hlen bytes of headroom plus room for an Ethernet
 * header and two alignment bytes (sizeof(struct ethhdr) is 14).
 *
 * Without @reuse_frag all @len bytes are copied into the linear area.
 * With @reuse_frag only the first min(@len, @min_len) bytes are copied -
 * which keeps Ethernet header handling simple and speeds up protocol
 * header processing in the stack later - and the rest is attached as page
 * fragments that reference the pages of @skb.
 *
 * Returns the new skb, or NULL if @skb does not hold @len bytes past
 * @offset or the allocation fails.
 */
static struct sk_buff *
__ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen,
                       int offset, int len, bool reuse_frag,
                       int min_len)
{
        const unsigned int headroom = hlen + sizeof(struct ethhdr) + 2;
        struct sk_buff *frame;
        int head_len;

        if (skb->len - offset < len)
                return NULL;

        head_len = reuse_frag ? min_t(int, len, min_len) : len;

        frame = dev_alloc_skb(headroom + head_len);
        if (!frame)
                return NULL;

        frame->priority = skb->priority;
        skb_reserve(frame, headroom);
        skb_copy_bits(skb, offset, skb_put(frame, head_len), head_len);

        /* everything fitted in the linear area */
        if (len == head_len)
                return frame;

        __ieee80211_amsdu_copy_frag(skb, frame, offset + head_len,
                                    len - head_len);

        return frame;
}

/*
 * Decode the 16-bit length field of an A-MSDU subframe header.
 *
 * @field:      points at the length field
 * @mesh_flags: mesh flags byte, only used when @hdr_type is non-zero
 * @hdr_type:   0 - big-endian length, no mesh header
 *              1 - big-endian length, mesh header length added
 *              2 or more - little-endian length, mesh header length added
 *
 * Returns the resulting length truncated to 16 bits.
 */
static u16
ieee80211_amsdu_subframe_length(void *field, u8 mesh_flags, u8 hdr_type)
{
        u16 len;

        if (hdr_type >= 2)
                len = le16_to_cpu(*(__le16 *)field);
        else
                len = be16_to_cpu(*(__be16 *)field);

        if (!hdr_type)
                return len;

        return len + __ieee80211_get_mesh_hdrlen(mesh_flags);
}

/*
 * Walk @skb as a sequence of A-MSDU subframes and check that every
 * subframe header can be read and that every subframe fits in the data
 * that is left.  @mesh_hdr is passed through as the header type to
 * ieee80211_amsdu_subframe_length().
 *
 * Returns true if the whole buffer parses, false otherwise.
 */
bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr)
{
        int offset = 0;

        while (offset < skb->len) {
                int remaining = skb->len - offset;
                int subframe_len, padding;
                struct {
                    __be16 len;
                    u8 mesh_flags;
                } hdr;
                u16 len;

                if (sizeof(hdr) > remaining)
                        return false;

                /* the length field follows the two MAC addresses */
                if (skb_copy_bits(skb, offset + 2 * ETH_ALEN, &hdr, sizeof(hdr)) < 0)
                        return false;

                len = ieee80211_amsdu_subframe_length(&hdr.len, hdr.mesh_flags,
                                                      mesh_hdr);
                subframe_len = sizeof(struct ethhdr) + len;
                /* subframes are padded up to a multiple of four bytes */
                padding = (4 - subframe_len) & 0x3;

                if (subframe_len > remaining)
                        return false;

                offset += subframe_len + padding;
        }

        return true;
}
EXPORT_SYMBOL(ieee80211_is_valid_amsdu);

/*
 * Split the A-MSDU held in @skb into individual frames with an ethernet
 * header and append them to @list.
 *
 * @skb: buffer holding the A-MSDU; it is consumed on every path (either
 *      freed, or reused as the last queued frame)
 * @list: queue that receives the resulting frames; on a parse or
 *      allocation failure the whole queue is purged
 * @addr: not referenced in this function body
 * @iftype: interface type; for NL80211_IFTYPE_MESH_POINT one extra flags
 *      byte is read after each subframe header
 * @extra_headroom: headroom for newly allocated frames (rounded up to a
 *      multiple of four)
 * @check_da: if non-NULL, subframes with a non-multicast destination
 *      different from this address are skipped
 * @check_sa: if non-NULL, subframes with a different source address are
 *      skipped
 * @mesh_control: header type handed to ieee80211_amsdu_subframe_length()
 */
void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
                              const u8 *addr, enum nl80211_iftype iftype,
                              const unsigned int extra_headroom,
                              const u8 *check_da, const u8 *check_sa,
                              u8 mesh_control)
{
        unsigned int hlen = ALIGN(extra_headroom, 4);
        struct sk_buff *frame = NULL;
        int offset = 0;
        struct {
                struct ethhdr eth;
                uint8_t flags;
        } hdr;
        bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb);
        bool reuse_skb = false;
        bool last = false;
        int copy_len = sizeof(hdr.eth);

        /* mesh subframes: also fetch the flags byte behind the header */
        if (iftype == NL80211_IFTYPE_MESH_POINT)
                copy_len = sizeof(hdr);

        while (!last) {
                int remaining = skb->len - offset;
                unsigned int subframe_len;
                int len, mesh_len = 0;
                u8 padding;

                if (copy_len > remaining)
                        goto purge;

                skb_copy_bits(skb, offset, &hdr, copy_len);
                if (iftype == NL80211_IFTYPE_MESH_POINT)
                        mesh_len = __ieee80211_get_mesh_hdrlen(hdr.flags);
                /* the h_proto slot of the subframe header carries the length */
                len = ieee80211_amsdu_subframe_length(&hdr.eth.h_proto, hdr.flags,
                                                      mesh_control);
                subframe_len = sizeof(struct ethhdr) + len;
                padding = (4 - subframe_len) & 0x3;

                /* the last MSDU has no padding */
                if (subframe_len > remaining)
                        goto purge;
                /* mitigate A-MSDU aggregation injection attacks */
                if (ether_addr_equal(hdr.eth.h_dest, rfc1042_header))
                        goto purge;

                /* from here on offset points at the subframe payload */
                offset += sizeof(struct ethhdr);
                last = remaining <= subframe_len + padding;

                /* FIXME: should we really accept multicast DA? */
                if ((check_da && !is_multicast_ether_addr(hdr.eth.h_dest) &&
                     !ether_addr_equal(check_da, hdr.eth.h_dest)) ||
                    (check_sa && !ether_addr_equal(check_sa, hdr.eth.h_source))) {
                        /* address mismatch: skip this subframe entirely */
                        offset += len + padding;
                        continue;
                }

                /* reuse skb for the last subframe */
                if (!skb_is_nonlinear(skb) && !reuse_frag && last) {
                        skb_pull(skb, offset);
                        frame = skb;
                        reuse_skb = true;
                } else {
                        frame = __ieee80211_amsdu_copy(skb, hlen, offset, len,
                                                       reuse_frag, 32 + mesh_len);
                        if (!frame)
                                goto purge;

                        offset += len + padding;
                }

                skb_reset_network_header(frame);
                frame->dev = skb->dev;
                frame->priority = skb->priority;

                /*
                 * For non-mesh interfaces, drop ETH_ALEN + 2 payload bytes
                 * when ieee80211_get_8023_tunnel_proto() succeeds; it is
                 * given &hdr.eth.h_proto to fill in.
                 */
                if (likely(iftype != NL80211_IFTYPE_MESH_POINT &&
                           ieee80211_get_8023_tunnel_proto(frame->data, &hdr.eth.h_proto)))
                        skb_pull(frame, ETH_ALEN + 2);

                /* prepend the (possibly updated) ethernet header and queue */
                memcpy(skb_push(frame, sizeof(hdr.eth)), &hdr.eth, sizeof(hdr.eth));
                __skb_queue_tail(list, frame);
        }

        /* the original buffer is only kept if it became the last frame */
        if (!reuse_skb)
                dev_kfree_skb(skb);

        return;

 purge:
        /* malformed A-MSDU or allocation failure: drop everything queued */
        __skb_queue_purge(list);
        dev_kfree_skb(skb);
}
EXPORT_SYMBOL(ieee80211_amsdu_to_8023s);

/*
 * Given a data frame determine the 802.1p/1d tag to use.
 *
 * Sources are consulted in this order:
 *  1. skb->priority in the magic range 256..263,
 *  2. a non-zero VLAN priority,
 *  3. the MPLS traffic class, or a fixed value for 802.21 frames,
 *  4. for IPv4/IPv6, the DSCP value: first through @qos_map (if given),
 *     then through the RFC 8325 default mapping.
 *
 * All other protocols, and MPLS frames whose label cannot be read, yield 0.
 * The value returned through the "out" label is clamped with
 * array_index_nospec() against IEEE80211_NUM_TIDS.
 */
unsigned int cfg80211_classify8021d(struct sk_buff *skb,
                                    struct cfg80211_qos_map *qos_map)
{
        unsigned int dscp;
        unsigned char vlan_priority;
        unsigned int ret;

        /* skb->priority values from 256->263 are magic values to
         * directly indicate a specific 802.1d priority.  This is used
         * to allow 802.1d priority to be passed directly in from VLAN
         * tags, etc.
         */
        if (skb->priority >= 256 && skb->priority <= 263) {
                ret = skb->priority - 256;
                goto out;
        }

        if (skb_vlan_tag_present(skb)) {
                vlan_priority = (skb_vlan_tag_get(skb) & VLAN_PRIO_MASK)
                        >> VLAN_PRIO_SHIFT;
                if (vlan_priority > 0) {
                        ret = vlan_priority;
                        goto out;
                }
        }

        switch (skb->protocol) {
        case htons(ETH_P_IP):
                /* keep the six DSCP bits in place, mask off the low two bits */
                dscp = ipv4_get_dsfield(ip_hdr(skb)) & 0xfc;
                break;
        case htons(ETH_P_IPV6):
                dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & 0xfc;
                break;
        case htons(ETH_P_MPLS_UC):
        case htons(ETH_P_MPLS_MC): {
                struct mpls_label mpls_tmp, *mpls;

                mpls = skb_header_pointer(skb, sizeof(struct ethhdr),
                                          sizeof(*mpls), &mpls_tmp);
                if (!mpls)
                        return 0;

                ret = (ntohl(mpls->entry) & MPLS_LS_TC_MASK)
                        >> MPLS_LS_TC_SHIFT;
                goto out;
        }
        case htons(ETH_P_80221):
                /* 802.21 is always network control traffic */
                return 7;
        default:
                return 0;
        }

        if (qos_map) {
                unsigned int i, tmp_dscp = dscp >> 2;

                /* explicit per-DSCP exceptions win over the range table */
                for (i = 0; i < qos_map->num_des; i++) {
                        if (tmp_dscp == qos_map->dscp_exception[i].dscp) {
                                ret = qos_map->dscp_exception[i].up;
                                goto out;
                        }
                }

                for (i = 0; i < 8; i++) {
                        if (tmp_dscp >= qos_map->up[i].low &&
                            tmp_dscp <= qos_map->up[i].high) {
                                ret = i;
                                goto out;
                        }
                }
        }

        /* The default mapping as defined Section 2.3 in RFC8325: The three
         * Most Significant Bits (MSBs) of the DSCP are used as the
         * corresponding L2 markings.
         */
        ret = dscp >> 5;

        /* Handle specific DSCP values for which the default mapping (as
         * described above) doesn't adhere to the intended usage of the DSCP
         * value. See section 4 in RFC8325. Specifically, for the following
         * Diffserv Service Classes no update is needed:
         * - Standard: DF
         * - Low Priority Data: CS1
         * - Multimedia Conferencing: AF41, AF42, AF43
         * - Network Control Traffic: CS7
         * - Real-Time Interactive: CS4
         */
        switch (dscp >> 2) {
        case 10:
        case 12:
        case 14:
                /* High throughput data: AF11, AF12, AF13 */
                ret = 0;
                break;
        case 16:
                /* Operations, Administration, and Maintenance and Provisioning:
                 * CS2
                 */
                ret = 0;
                break;
        case 18:
        case 20:
        case 22:
                /* Low latency data: AF21, AF22, AF23 */
                ret = 3;
                break;
        case 24:
                /* Broadcasting video: CS3 */
                ret = 4;
                break;
        case 26:
        case 28:
        case 30:
                /* Multimedia Streaming: AF31, AF32, AF33. The three MSBs
                 * alone would yield 3, RFC8325 asks for UP 4.
                 */
                ret = 4;
                break;
        case 40:
                /* Signaling: CS5 */
                ret = 5;
                break;
        case 44:
                /* Voice Admit: VA */
                ret = 6;
                break;
        case 46:
                /* Telephony traffic: EF */
                ret = 6;
                break;
        case 48:
                /* Network Control Traffic: CS6 */
                ret = 7;
                break;
        }
out:
        return array_index_nospec(ret, IEEE80211_NUM_TIDS);
}
EXPORT_SYMBOL(cfg80211_classify8021d);

/*
 * Look up the element with the given @id in the IEs currently attached
 * to @bss.  The IE pointer is fetched with rcu_dereference().
 *
 * Returns the result of cfg80211_find_elem(), or NULL if @bss has no IEs.
 */
const struct element *ieee80211_bss_get_elem(struct cfg80211_bss *bss, u8 id)
{
        const struct cfg80211_bss_ies *bss_ies = rcu_dereference(bss->ies);

        if (bss_ies)
                return cfg80211_find_elem(id, bss_ies->data, bss_ies->len);

        return NULL;
}
EXPORT_SYMBOL(ieee80211_bss_get_elem);

/*
 * Hand the keys cached in wdev->connect_keys to the driver, then free
 * the cache.
 *
 * Each of the four slots with a non-zero cipher is added via
 * rdev_add_key(); the slot matching ->def is additionally made the
 * default key.  Failures are logged and do not stop the remaining slots
 * from being processed.
 */
void cfg80211_upload_connect_keys(struct wireless_dev *wdev)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct net_device *dev = wdev->netdev;
        int idx;

        if (!wdev->connect_keys)
                return;

        for (idx = 0; idx < 4; idx++) {
                /* unused slot */
                if (!wdev->connect_keys->params[idx].cipher)
                        continue;

                if (rdev_add_key(rdev, dev, -1, idx, false, NULL,
                                 &wdev->connect_keys->params[idx])) {
                        netdev_err(dev, "failed to set key %d\n", idx);
                        continue;
                }

                if (wdev->connect_keys->def != idx)
                        continue;

                if (rdev_set_default_key(rdev, dev, -1, idx, true, true))
                        netdev_err(dev, "failed to set defkey %d\n", idx);
        }

        /* key material must not linger in freed memory */
        kfree_sensitive(wdev->connect_keys);
        wdev->connect_keys = NULL;
}

/*
 * Drain wdev->event_list, dispatching each queued event to its handler.
 *
 * Events are unlinked one at a time under wdev->event_lock; the lock is
 * dropped while the handler runs and re-taken before the list is looked
 * at again.  Each event is freed after it has been handled.
 */
void cfg80211_process_wdev_events(struct wireless_dev *wdev)
{
        struct cfg80211_event *ev;
        unsigned long flags;

        spin_lock_irqsave(&wdev->event_lock, flags);
        while (!list_empty(&wdev->event_list)) {
                ev = list_first_entry(&wdev->event_list,
                                      struct cfg80211_event, list);
                list_del(&ev->list);
                /* handlers are called without the event lock held */
                spin_unlock_irqrestore(&wdev->event_lock, flags);

                switch (ev->type) {
                case EVENT_CONNECT_RESULT:
                        __cfg80211_connect_result(
                                wdev->netdev,
                                &ev->cr,
                                ev->cr.status == WLAN_STATUS_SUCCESS);
                        break;
                case EVENT_ROAMED:
                        __cfg80211_roamed(wdev, &ev->rm);
                        break;
                case EVENT_DISCONNECTED:
                        /* last argument is the negation of locally_generated */
                        __cfg80211_disconnected(wdev->netdev,
                                                ev->dc.ie, ev->dc.ie_len,
                                                ev->dc.reason,
                                                !ev->dc.locally_generated);
                        break;
                case EVENT_IBSS_JOINED:
                        __cfg80211_ibss_joined(wdev->netdev, ev->ij.bssid,
                                               ev->ij.channel);
                        break;
                case EVENT_STOPPED:
                        cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev);
                        break;
                case EVENT_PORT_AUTHORIZED:
                        __cfg80211_port_authorized(wdev, ev->pa.peer_addr,
                                                   ev->pa.td_bitmap,
                                                   ev->pa.td_bitmap_len);
                        break;
                }

                kfree(ev);

                /* re-take the lock before testing the list again */
                spin_lock_irqsave(&wdev->event_lock, flags);
        }
        spin_unlock_irqrestore(&wdev->event_lock, flags);
}

void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev)
{
        struct wireless_dev *wdev;

        lockdep_assert_held(&rdev->wiphy.mtx);

        list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list)
                cfg80211_process_wdev_events(wdev);
}

/*
 * Change the interface type of @dev to @ntype.
 *
 * The wiphy mutex must be held.  If the type actually changes, the
 * activity belonging to the old type is torn down, pending events are
 * processed and the per-type state in the wireless_dev is zeroed before
 * the driver is asked to switch.  Afterwards the 4-address setting, the
 * IFF_DONT_BRIDGE flag and the per-type interface counters are updated.
 *
 * Returns 0 on success, -EOPNOTSUPP for unsupported transitions, -EBUSY
 * when a bridge port would become a station/IBSS/P2P-client interface,
 * or the error returned by rdev_change_virtual_intf().
 */
int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
                          struct net_device *dev, enum nl80211_iftype ntype,
                          struct vif_params *params)
{
        int err;
        enum nl80211_iftype otype = dev->ieee80211_ptr->iftype;

        lockdep_assert_held(&rdev->wiphy.mtx);

        /* don't support changing VLANs, you just re-create them */
        if (otype == NL80211_IFTYPE_AP_VLAN)
                return -EOPNOTSUPP;

        /* cannot change into P2P device or NAN */
        if (ntype == NL80211_IFTYPE_P2P_DEVICE ||
            ntype == NL80211_IFTYPE_NAN)
                return -EOPNOTSUPP;

        /* the driver must implement the op and advertise the new type */
        if (!rdev->ops->change_virtual_intf ||
            !(rdev->wiphy.interface_modes & (1 << ntype)))
                return -EOPNOTSUPP;

        if (ntype != otype) {
                /* if it's part of a bridge, reject changing type to station/ibss */
                if (netif_is_bridge_port(dev) &&
                    (ntype == NL80211_IFTYPE_ADHOC ||
                     ntype == NL80211_IFTYPE_STATION ||
                     ntype == NL80211_IFTYPE_P2P_CLIENT))
                        return -EBUSY;

                /* reset 4-address mode and the QoS map before switching */
                dev->ieee80211_ptr->use_4addr = false;
                rdev_set_qos_map(rdev, dev, NULL);

                /* stop whatever the old interface type was doing */
                switch (otype) {
                case NL80211_IFTYPE_AP:
                case NL80211_IFTYPE_P2P_GO:
                        cfg80211_stop_ap(rdev, dev, -1, true);
                        break;
                case NL80211_IFTYPE_ADHOC:
                        cfg80211_leave_ibss(rdev, dev, false);
                        break;
                case NL80211_IFTYPE_STATION:
                case NL80211_IFTYPE_P2P_CLIENT:
                        cfg80211_disconnect(rdev, dev,
                                            WLAN_REASON_DEAUTH_LEAVING, true);
                        break;
                case NL80211_IFTYPE_MESH_POINT:
                        /* mesh should be handled? */
                        break;
                case NL80211_IFTYPE_OCB:
                        cfg80211_leave_ocb(rdev, dev);
                        break;
                default:
                        break;
                }

                /* process queued events, then drop MLME registrations */
                cfg80211_process_rdev_events(rdev);
                cfg80211_mlme_purge_registrations(dev->ieee80211_ptr);

                /* wipe the type-specific state and the per-link state */
                memset(&dev->ieee80211_ptr->u, 0,
                       sizeof(dev->ieee80211_ptr->u));
                memset(&dev->ieee80211_ptr->links, 0,
                       sizeof(dev->ieee80211_ptr->links));
        }

        err = rdev_change_virtual_intf(rdev, dev, ntype, params);

        /* on success the wireless_dev is expected to report the new type */
        WARN_ON(!err && dev->ieee80211_ptr->iftype != ntype);

        /* a use_4addr value of -1 leaves the setting untouched */
        if (!err && params && params->use_4addr != -1)
                dev->ieee80211_ptr->use_4addr = params->use_4addr;

        if (!err) {
                /* recompute IFF_DONT_BRIDGE from scratch for the new type */
                dev->priv_flags &= ~IFF_DONT_BRIDGE;
                switch (ntype) {
                case NL80211_IFTYPE_STATION:
                        /* a 4-address station may be bridged */
                        if (dev->ieee80211_ptr->use_4addr)
                                break;
                        fallthrough;
                case NL80211_IFTYPE_OCB:
                case NL80211_IFTYPE_P2P_CLIENT:
                case NL80211_IFTYPE_ADHOC:
                        dev->priv_flags |= IFF_DONT_BRIDGE;
                        break;
                case NL80211_IFTYPE_P2P_GO:
                case NL80211_IFTYPE_AP:
                case NL80211_IFTYPE_AP_VLAN:
                case NL80211_IFTYPE_MESH_POINT:
                        /* bridging OK */
                        break;
                case NL80211_IFTYPE_MONITOR:
                        /* monitor can't bridge anyway */
                        break;
                case NL80211_IFTYPE_UNSPECIFIED:
                case NUM_NL80211_IFTYPES:
                        /* not happening */
                        break;
                case NL80211_IFTYPE_P2P_DEVICE:
                case NL80211_IFTYPE_WDS:
                case NL80211_IFTYPE_NAN:
                        WARN_ON(1);
                        break;
                }
        }

        /* move the interface from the old type's counter to the new one's */
        if (!err && ntype != otype && netif_running(dev)) {
                cfg80211_update_iface_num(rdev, ntype, 1);
                cfg80211_update_iface_num(rdev, otype, -1);
        }

        return err;
}

/*
 * Compute the bitrate for an HT rate from its MCS index, bandwidth and
 * short-GI flag.  The result is the rate divided by 100000, rounded to
 * nearest.  Returns 0 (with a one-time warning) for MCS >= 32.
 */
static u32 cfg80211_calculate_bitrate_ht(struct rate_info *rate)
{
        /* multiplier for each value of (MCS & 7) */
        static const int mod_factor[8] = { 1, 2, 3, 4, 6, 8, 9, 10 };
        int nss, bps;

        /* the formula below does only work for MCS values smaller than 32 */
        if (WARN_ON_ONCE(rate->mcs >= 32))
                return 0;

        /* every group of eight MCS values adds one spatial stream */
        nss = (rate->mcs >> 3) + 1;

        if (rate->bw == RATE_INFO_BW_40)
                bps = 13500000;
        else
                bps = 6500000;

        bps *= mod_factor[rate->mcs & 7];
        bps *= nss;

        if (rate->flags & RATE_INFO_FLAGS_SHORT_GI)
                bps = (bps / 9) * 10;

        /* do NOT round down here */
        return (bps + 50000) / 100000;
}

/*
 * Look up the bitrate for a DMG rate by MCS index (0..31).
 * Returns 0 (with a one-time warning) if the MCS is outside the table.
 */
static u32 cfg80211_calculate_bitrate_dmg(struct rate_info *rate)
{
        /* indexed by MCS; entries are listed in index order */
        static const u32 dmg_rates[] = {
                /* MCS 0: control PHY */
                  275,
                /* MCS 1-12: SC PHY */
                 3850,
                 7700,
                 9625,
                11550,
                12512, /* 1251.25 mbps */
                15400,
                19250,
                23100,
                25025,
                30800,
                38500,
                46200,
                /* MCS 13-24: OFDM PHY */
                 6930,
                 8662, /* 866.25 mbps */
                13860,
                17325,
                20790,
                27720,
                34650,
                41580,
                45045,
                51975,
                62370,
                67568, /* 6756.75 mbps */
                /* MCS 25-31: LP-SC PHY */
                 6260,
                 8340,
                11120,
                12510,
                16680,
                22240,
                25030,
        };

        if (!WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(dmg_rates)))
                return dmg_rates[rate->mcs];

        return 0;
}

/*
 * Look up the bitrate for an extended SC DMG rate.  Only base MCS values
 * 6..12 are defined; anything else warns once and returns 0.
 */
static u32 cfg80211_calculate_bitrate_extended_sc_dmg(struct rate_info *rate)
{
        /* entry k belongs to base MCS (k + 6) */
        static const u32 ext_sc_rates[] = {
                26950, /* base MCS 6: MCS 9.1 : 2695.0 mbps */
                50050, /* base MCS 7: MCS 12.1 */
                53900, /* base MCS 8 */
                57750, /* base MCS 9 */
                63900, /* base MCS 10 */
                75075, /* base MCS 11 */
                80850, /* base MCS 12 */
        };

        /* Extended SC MCS not defined for base MCS below 6 or above 12 */
        if (!WARN_ON_ONCE(rate->mcs < 6 || rate->mcs > 12))
                return ext_sc_rates[rate->mcs - 6];

        return 0;
}

/*
 * Look up the per-channel bitrate for an EDMG rate by MCS index (0..20)
 * and multiply it by the number of bonded channels.
 * Returns 0 (with a one-time warning) if the MCS is outside the table.
 */
static u32 cfg80211_calculate_bitrate_edmg(struct rate_info *rate)
{
        /* indexed by MCS; entries are listed in index order */
        static const u32 edmg_rates[] = {
                /* MCS 0: control PHY */
                  275,
                /* MCS 1-20: SC PHY */
                 3850,
                 7700,
                 9625,
                11550,
                12512, /* 1251.25 mbps */
                13475,
                15400,
                19250,
                23100,
                25025,
                26950,
                30800,
                38500,
                46200,
                50050,
                53900,
                57750,
                69300,
                75075,
                80850,
        };
        u32 per_channel;

        if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(edmg_rates)))
                return 0;

        per_channel = edmg_rates[rate->mcs];

        return per_channel * rate->n_bonded_ch;
}

static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate)
{
        static const u32 base[4][12] = {
                {   6500000,
                   13000000,
                   19500000,
                   26000000,
                   39000000,
                   52000000,
                   58500000,
                   65000000,
                   78000000,
                /* not in the spec, but some devices use this: */
                   86700000,
                   97500000,
                  108300000,
                },
                {  13500000,
                   27000000,
                   40500000,
                   54000000,
                   81000000,
                  108000000,
                  121500000,
                  135000000,
                  162000000,
                  180000000,
                  202500000,
                  225000000,
                },
                {  29300000,
                   58500000,
                   87800000,
                  117000000,
                  175500000,
                  234000000,
                  263300000,
                  292500000,
                  351000000,
                  390000000,
                  438800000,
                  487500000,
                },
                {  58500000,
                  117000000,
                  175500000,
                  234000000,
                  351000000,
                  468000000,
                  526500000,
                  585000000,
                  702000000,
                  780000000,
                  877500000,
                  975000000,
                },
        };
        u32 bitrate;
        int idx;

        if (rate->mcs > 11)
                goto warn;

        switch (rate->bw) {
        case RATE_INFO_BW_160:
                idx = 3;
                break;
        case RATE_INFO_BW_80:
                idx = 2;
                break;
        case RATE_INFO_BW_40:
                idx = 1;
                break;
        case RATE_INFO_BW_5:
        case RATE_INFO_BW_10:
        default:
                goto warn;
        case RATE_INFO_BW_20:
                idx = 0;
        }

        bitrate = base[idx][rate->mcs];
        bitrate *= rate->nss;

        if (rate->flags & RATE_INFO_FLAGS_SHORT_GI)
                bitrate = (bitrate / 9) * 10;

        /* do NOT round down here */
        return (bitrate + 50000) / 100000;
 warn:
        WARN_ONCE(1, "invalid rate bw=%d, mcs=%d, nss=%d\n",
                  rate->bw, rate->mcs, rate->nss);
        return 0;
}

/*
 * Compute the bitrate for an HE rate from bandwidth or RU allocation,
 * guard interval, MCS, number of spatial streams and the DCM flag.
 *
 * The per-bandwidth tables hold the rate for eight spatial streams at
 * the MCS whose divisor equals SCALE (index 11), one entry per guard
 * interval.  That value is scaled to the requested MCS, then to the
 * requested stream count, halved for DCM, and finally divided by 10000.
 *
 * Returns 0 (with a warning) for out-of-range MCS, GI, RU allocation or
 * NSS, or for a bandwidth/RU combination that is not handled.
 */
static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate)
{
#define SCALE 6144
        static const u32 mcs_divisors[14] = {
                102399, /* 16.666666... */
                 51201, /*  8.333333... */
                 34134, /*  5.555555... */
                 25599, /*  4.166666... */
                 17067, /*  2.777777... */
                 12801, /*  2.083333... */
                 11377, /*  1.851725... */
                 10239, /*  1.666666... */
                  8532, /*  1.388888... */
                  7680, /*  1.250000... */
                  6828, /*  1.111111... */
                  6144, /*  1.000000... */
                  5690, /*  0.926106... */
                  5120, /*  0.833333... */
        };
        static const u32 rates_160M[3] = { 960777777, 907400000, 816666666 };
        static const u32 rates_996[3] =  { 480388888, 453700000, 408333333 };
        static const u32 rates_484[3] =  { 229411111, 216666666, 195000000 };
        static const u32 rates_242[3] =  { 114711111, 108333333,  97500000 };
        static const u32 rates_106[3] =  {  40000000,  37777777,  34000000 };
        static const u32 rates_52[3]  =  {  18820000,  17777777,  16000000 };
        static const u32 rates_26[3]  =  {   9411111,   8888888,   8000000 };
        u64 tmp;
        u32 result;

        if (WARN_ON_ONCE(rate->mcs > 13))
                return 0;

        if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2))
                return 0;
        if (WARN_ON_ONCE(rate->he_ru_alloc >
                         NL80211_RATE_INFO_HE_RU_ALLOC_2x996))
                return 0;
        if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8))
                return 0;

        if (rate->bw == RATE_INFO_BW_160)
                result = rates_160M[rate->he_gi];
        else if (rate->bw == RATE_INFO_BW_80 ||
                 (rate->bw == RATE_INFO_BW_HE_RU &&
                  rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996))
                result = rates_996[rate->he_gi];
        else if (rate->bw == RATE_INFO_BW_40 ||
                 (rate->bw == RATE_INFO_BW_HE_RU &&
                  rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484))
                result = rates_484[rate->he_gi];
        else if (rate->bw == RATE_INFO_BW_20 ||
                 (rate->bw == RATE_INFO_BW_HE_RU &&
                  rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_242))
                result = rates_242[rate->he_gi];
        else if (rate->bw == RATE_INFO_BW_HE_RU &&
                 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_106)
                result = rates_106[rate->he_gi];
        else if (rate->bw == RATE_INFO_BW_HE_RU &&
                 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_52)
                result = rates_52[rate->he_gi];
        else if (rate->bw == RATE_INFO_BW_HE_RU &&
                 rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26)
                result = rates_26[rate->he_gi];
        else {
                WARN(1, "invalid HE MCS: bw:%d, ru:%d\n",
                     rate->bw, rate->he_ru_alloc);
                return 0;
        }

        /* now scale to the appropriate MCS */
        tmp = result;
        tmp *= SCALE;
        do_div(tmp, mcs_divisors[rate->mcs]);

        /*
         * and take NSS into account; this has to happen in 64 bits since
         * the MCS-scaled value times nss can exceed U32_MAX (e.g. 160 MHz
         * at MCS 11 with five or more streams)
         */
        tmp *= rate->nss;
        do_div(tmp, 8);
        result = tmp;

        /* DCM halves the rate */
        if (rate->he_dcm)
                result /= 2;

        return result / 10000;
}

static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
{
#define SCALE 6144
        static const u32 mcs_divisors[16] = {
                102399, /* 16.666666... */
                 51201, /*  8.333333... */
                 34134, /*  5.555555... */
                 25599, /*  4.166666... */
                 17067, /*  2.777777... */
                 12801, /*  2.083333... */
                 11377, /*  1.851725... */
                 10239, /*  1.666666... */
                  8532, /*  1.388888... */
                  7680, /*  1.250000... */
                  6828, /*  1.111111... */
                  6144, /*  1.000000... */
                  5690, /*  0.926106... */
                  5120, /*  0.833333... */
                409600, /* 66.666666... */
                204800, /* 33.333333... */
        };
        static const u32 rates_996[3] =  { 480388888, 453700000, 408333333 };
        static const u32 rates_484[3] =  { 229411111, 216666666, 195000000 };
        static const u32 rates_242[3] =  { 114711111, 108333333,  97500000 };
        static const u32 rates_106[3] =  {  40000000,  37777777,  34000000 };
        static const u32 rates_52[3]  =  {  18820000,  17777777,  16000000 };
        static const u32 rates_26[3]  =  {   9411111,   8888888,   8000000 };
        u64 tmp;
        u32 result;

        if (WARN_ON_ONCE(rate->mcs > 15))
                return 0;
        if (WARN_ON_ONCE(rate->eht_gi > NL80211_RATE_INFO_EHT_GI_3_2))
                return 0;
        if (WARN_ON_ONCE(rate->eht_ru_alloc >
                         NL80211_RATE_INFO_EHT_RU_ALLOC_4x996))
                return 0;
        if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8))
                return 0;

        /* Bandwidth checks for MCS 14 */
        if (rate->mcs == 14) {
                if ((rate->bw != RATE_INFO_BW_EHT_RU &&
                     rate->bw != RATE_INFO_BW_80 &&
                     rate->bw != RATE_INFO_BW_160 &&
                     rate->bw != RATE_INFO_BW_320) ||
                    (rate->bw == RATE_INFO_BW_EHT_RU &&
                     rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_996 &&
                     rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_2x996 &&
                     rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) {
                        WARN(1, "invalid EHT BW for MCS 14: bw:%d, ru:%d\n",
                             rate->bw, rate->eht_ru_alloc);
                        return 0;
                }
        }

        if (rate->bw == RATE_INFO_BW_320 ||
            (rate->bw == RATE_INFO_BW_EHT_RU &&
             rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_4x996))
                result = 4 * rates_996[rate->eht_gi];
        else if (rate->bw == RATE_INFO_BW_EHT_RU &&
                 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484)
                result = 3 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi];
        else if (rate->bw == RATE_INFO_BW_EHT_RU &&
                 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996)
                result = 3 * rates_996[rate->eht_gi];
        else if (rate->bw == RATE_INFO_BW_EHT_RU &&
                 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484)
                result = 2 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi];
        else if (rate->bw == RATE_INFO_BW_160 ||
                 (rate->bw == RATE_INFO_BW_EHT_RU &&
                  rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996))
                result = 2 * rates_996[rate->eht_gi];
        else if (rate->bw == RATE_INFO_BW_EHT_RU &&
                 rate->eht_ru_alloc ==
                 NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242)
                result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi]
                         + rates_242[rate->eht_gi];
        else if (rate->bw == RATE_INFO_BW_EHT_RU &&
                 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996P484)
                result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi];
        else if (rate->bw == RATE_INFO_BW_80 ||
                 (rate->bw == RATE_INFO_BW_EHT_RU &&
                  rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996))
                result = rates_996[rate->eht_gi];
        else if (rate->bw == RATE_INFO_BW_EHT_RU &&
                 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484P242)
                result = rates_484[rate->eht_gi] + rates_242[rate->eht_gi];
        else if (rate->bw == RATE_INFO_BW_40 ||
                 (rate->bw == RATE_INFO_BW_EHT_RU &&
                  rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484))
                result = rates_484[rate->eht_gi];
        else if (rate->bw == RATE_INFO_BW_20 ||
                 (rate->bw == RATE_INFO_BW_EHT_RU &&
                  rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_242))
                result = rates_242[rate->eht_gi];
        else if (rate->bw == RATE_INFO_BW_EHT_RU &&
                 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106P26)
                result = rates_106[rate->eht_gi] + rates_26[rate->eht_gi];
        else if (rate->bw == RATE_INFO_BW_EHT_RU &&
                 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106)
                result = rates_106[rate->eht_gi];
        else if (rate->bw == RATE_INFO_BW_EHT_RU &&
                 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52P26)
                result = rates_52[rate->eht_gi] + rates_26[rate->eht_gi];
        else if (rate->bw == RATE_INFO_BW_EHT_RU &&
                 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52)
                result = rates_52[rate->eht_gi];
        else if (rate->bw == RATE_INFO_BW_EHT_RU &&
                 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_26)
                result = rates_26[rate->eht_gi];
        else {
                WARN(1, "invalid EHT MCS: bw:%d, ru:%d\n",
                     rate->bw, rate->eht_ru_alloc);
                return 0;
        }

        /* now scale to the appropriate MCS */
        tmp = result;
        tmp *= SCALE;
        do_div(tmp, mcs_divisors[rate->mcs]);

        /* and take NSS */
        tmp *= rate->nss;
        do_div(tmp, 8);

        result = tmp;

        return result / 10000;
}

/*
 * Compute the bitrate for an S1G rate description.
 *
 * The per-MCS base value is selected by rate->bw (1, 2, 4, 8 or 16 MHz) and
 * rate->mcs, multiplied by rate->nss, scaled by 10/9 when
 * RATE_INFO_FLAGS_SHORT_GI is set, and finally divided by 100000 with
 * rounding to nearest (the table values look like bit/s, which would make
 * the result a multiple of 100 kbit/s -- confirm against the callers).
 *
 * Returns 0 after a one-time warning when rate->mcs is 11 or larger or when
 * rate->bw is not one of the five S1G bandwidths. Table slots that are left
 * out (MCS 9 at 2 MHz, MCS 10 above 1 MHz) are implicitly zero and therefore
 * also produce 0, but without a warning.
 */
static u32 cfg80211_calculate_bitrate_s1g(struct rate_info *rate)
{
        /* For 1, 2, 4, 8 and 16 MHz channels */
        static const u32 base[5][11] = {
                {  300000,
                   600000,
                   900000,
                  1200000,
                  1800000,
                  2400000,
                  2700000,
                  3000000,
                  3600000,
                  4000000,
                  /* MCS 10 supported in 1 MHz only */
                  150000,
                },
                {  650000,
                  1300000,
                  1950000,
                  2600000,
                  3900000,
                  5200000,
                  5850000,
                  6500000,
                  7800000,
                  /* MCS 9 not valid */
                },
                {  1350000,
                   2700000,
                   4050000,
                   5400000,
                   8100000,
                  10800000,
                  12150000,
                  13500000,
                  16200000,
                  18000000,
                },
                {  2925000,
                   5850000,
                   8775000,
                  11700000,
                  17550000,
                  23400000,
                  26325000,
                  29250000,
                  35100000,
                  39000000,
                },
                /*
                 * 16 MHz: every entry is twice the 8 MHz one, so MCS 0 is
                 * 5850000 (half of MCS 1), not 8580000.
                 */
                {  5850000,
                  11700000,
                  17550000,
                  23400000,
                  35100000,
                  46800000,
                  52650000,
                  58500000,
                  70200000,
                  78000000,
                },
        };
        u32 bitrate;
        /* default is 1 MHz index */
        int idx = 0;

        /* base[][] only has columns for MCS 0..10 */
        if (rate->mcs >= 11)
                goto warn;

        switch (rate->bw) {
        case RATE_INFO_BW_16:
                idx = 4;
                break;
        case RATE_INFO_BW_8:
                idx = 3;
                break;
        case RATE_INFO_BW_4:
                idx = 2;
                break;
        case RATE_INFO_BW_2:
                idx = 1;
                break;
        case RATE_INFO_BW_1:
                idx = 0;
                break;
        /* bandwidths that have no row in the S1G table */
        case RATE_INFO_BW_5:
        case RATE_INFO_BW_10:
        case RATE_INFO_BW_20:
        case RATE_INFO_BW_40:
        case RATE_INFO_BW_80:
        case RATE_INFO_BW_160:
        default:
                goto warn;
        }

        bitrate = base[idx][rate->mcs];
        bitrate *= rate->nss;

        /* short guard interval: scale by 10/9, dividing first */
        if (rate->flags & RATE_INFO_FLAGS_SHORT_GI)
                bitrate = (bitrate / 9) * 10;
        /* do NOT round down here */
        return (bitrate + 50000) / 100000;
warn:
        WARN_ONCE(1, "invalid rate bw=%d, mcs=%d, nss=%d\n",
                  rate->bw, rate->mcs, rate->nss);
        return 0;
}

/*
 * Turn a rate description into a bitrate.
 *
 * rate->flags is tested against the MCS-type flags in the fixed order below
 * and the first match decides which calculator handles the rate. When none
 * of these flags is set, rate->legacy is returned as is.
 */
u32 cfg80211_calculate_bitrate(struct rate_info *rate)
{
        u32 bitrate;

        if (rate->flags & RATE_INFO_FLAGS_MCS)
                bitrate = cfg80211_calculate_bitrate_ht(rate);
        else if (rate->flags & RATE_INFO_FLAGS_DMG)
                bitrate = cfg80211_calculate_bitrate_dmg(rate);
        else if (rate->flags & RATE_INFO_FLAGS_EXTENDED_SC_DMG)
                bitrate = cfg80211_calculate_bitrate_extended_sc_dmg(rate);
        else if (rate->flags & RATE_INFO_FLAGS_EDMG)
                bitrate = cfg80211_calculate_bitrate_edmg(rate);
        else if (rate->flags & RATE_INFO_FLAGS_VHT_MCS)
                bitrate = cfg80211_calculate_bitrate_vht(rate);
        else if (rate->flags & RATE_INFO_FLAGS_HE_MCS)
                bitrate = cfg80211_calculate_bitrate_he(rate);
        else if (rate->flags & RATE_INFO_FLAGS_EHT_MCS)
                bitrate = cfg80211_calculate_bitrate_eht(rate);
        else if (rate->flags & RATE_INFO_FLAGS_S1G_MCS)
                bitrate = cfg80211_calculate_bitrate_s1g(rate);
        else
                bitrate = rate->legacy;

        return bitrate;
}
EXPORT_SYMBOL(cfg80211_calculate_bitrate);

/*
 * Extract one P2P attribute from a buffer of information elements.
 *
 * @ies/@len:     buffer of elements, each laid out as ID byte, length byte,
 *                payload
 * @attr:         ID of the attribute to look for
 * @buf/@bufsize: optional output buffer; when @buf is NULL nothing is
 *                copied and only the length is computed
 *
 * Only vendor-specific elements whose payload starts with 50 6f 9a 09 are
 * examined. Inside those, attributes are laid out as a 1-byte ID followed by
 * a 16-bit little-endian length and the data. An attribute whose data does
 * not fit into the rest of its element is continued in the following
 * matching element(s); attr_remaining tracks how many bytes are still due.
 *
 * Returns the accumulated length of the requested attribute (which may
 * exceed @bufsize; at most @bufsize bytes are ever written), -EILSEQ when
 * the buffer is malformed or the requested attribute is cut short, and
 * -ENOENT when the attribute is not present.
 */
int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len,
                          enum ieee80211_p2p_attr_id attr,
                          u8 *buf, unsigned int bufsize)
{
        u8 *out = buf;
        /* bytes of the current attribute still expected in later elements */
        u16 attr_remaining = 0;
        /* whether the attribute currently being walked is the requested one */
        bool desired_attr = false;
        /* total data length of the requested attribute seen so far */
        u16 desired_len = 0;

        while (len > 0) {
                unsigned int iedatalen;
                unsigned int copy;
                const u8 *iedata;

                /* element header (ID + length) and payload must fit */
                if (len < 2)
                        return -EILSEQ;
                iedatalen = ies[1];
                if (iedatalen + 2 > len)
                        return -EILSEQ;

                if (ies[0] != WLAN_EID_VENDOR_SPECIFIC)
                        goto cont;

                /* too short to carry the 4-byte OUI + type prefix */
                if (iedatalen < 4)
                        goto cont;

                iedata = ies + 2;

                /* check WFA OUI, P2P subtype */
                if (iedata[0] != 0x50 || iedata[1] != 0x6f ||
                    iedata[2] != 0x9a || iedata[3] != 0x09)
                        goto cont;

                /* step over the OUI + type prefix */
                iedatalen -= 4;
                iedata += 4;

                /* check attribute continuation into this IE */
                copy = min_t(unsigned int, attr_remaining, iedatalen);
                if (copy && desired_attr) {
                        desired_len += copy;
                        if (out) {
                                /* never write more than the caller's buffer holds */
                                memcpy(out, iedata, min(bufsize, copy));
                                out += min(bufsize, copy);
                                bufsize -= min(bufsize, copy);
                        }


                        /* the continued attribute is now complete */
                        if (copy == attr_remaining)
                                return desired_len;
                }

                attr_remaining -= copy;
                /* this element was consumed entirely by the continuation */
                if (attr_remaining)
                        goto cont;

                iedatalen -= copy;
                iedata += copy;

                /* walk the attributes that start in this element */
                while (iedatalen > 0) {
                        u16 attr_len;

                        /* P2P attribute ID & size must fit */
                        if (iedatalen < 3)
                                return -EILSEQ;
                        desired_attr = iedata[0] == attr;
                        attr_len = get_unaligned_le16(iedata + 1);
                        iedatalen -= 3;
                        iedata += 3;

                        /* data available here; the rest, if any, follows later */
                        copy = min_t(unsigned int, attr_len, iedatalen);

                        if (desired_attr) {
                                desired_len += copy;
                                if (out) {
                                        memcpy(out, iedata, min(bufsize, copy));
                                        out += min(bufsize, copy);
                                        bufsize -= min(bufsize, copy);
                                }

                                /* attribute fully contained in this element */
                                if (copy == attr_len)
                                        return desired_len;
                        }

                        iedata += copy;
                        iedatalen -= copy;
                        /* non-zero only when the element ended mid-attribute */
                        attr_remaining = attr_len - copy;
                }

 cont:
                /* advance to the next element using the untouched length byte */
                len -= ies[1] + 2;
                ies += ies[1] + 2;
        }

        /* the requested attribute started but its tail never arrived */
        if (attr_remaining && desired_attr)
                return -EILSEQ;

        return -ENOENT;
}
EXPORT_SYMBOL(cfg80211_get_p2p_attr);

/*
 * Check whether an element ID is contained in a list of IDs.
 *
 * @ids/@n_ids: the list; an entry equal to WLAN_EID_EXTENSION is a prefix
 *              and the byte following it is an extension element ID
 * @id:         ID to look for
 * @id_ext:     true when @id is an extension element ID, in which case only
 *              prefixed entries can match; false to match only plain entries
 *
 * Returns true when @id is found. An empty or NULL list never matches. A
 * list that ends in a WLAN_EID_EXTENSION prefix without the ID byte is
 * rejected with a warning.
 */
static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id, bool id_ext)
{
        int i;

        /* nothing to search; also keeps ids[n_ids - 1] below in bounds */
        if (!ids || n_ids <= 0)
                return false;

        /* Make sure array values are legal */
        if (WARN_ON(ids[n_ids - 1] == WLAN_EID_EXTENSION))
                return false;

        i = 0;
        while (i < n_ids) {
                if (ids[i] == WLAN_EID_EXTENSION) {
                        /* ids[i + 1] exists thanks to the check above */
                        if (id_ext && (ids[i + 1] == id))
                                return true;

                        /* skip the prefix and the extension ID */
                        i += 2;
                        continue;
                }

                if (ids[i] == id && !id_ext)
                        return true;

                i++;
        }
        return false;
}

/*
 * Return the offset of the first byte after the element that starts at
 * @pos. If that element has the maximum payload length of 255, any
 * WLAN_EID_FRAGMENT elements directly following it are skipped as well.
 *
 * The buffer is assumed to be well formed; the element header at @pos is
 * read without a bounds check.
 */
static size_t skip_ie(const u8 *ies, size_t ielen, size_t pos)
{
        u8 payload = ies[pos + 1];
        size_t next = pos + 2 + payload;

        /* only a completely filled element can be followed by fragments */
        if (payload == 255) {
                while (next < ielen && ies[next] == WLAN_EID_FRAGMENT)
                        next += 2 + ies[next + 1];
        }

        return next;
}

/*
 * Find the offset at which to split a buffer of elements.
 *
 * Starting at @offset, elements are skipped for as long as their ID (for
 * WLAN_EID_EXTENSION elements: the extension ID stored two bytes further)
 * is listed in @ids. If such an element is WLAN_EID_RIC_DATA and
 * @n_after_ric is non-zero, the elements after it are skipped too, up to
 * the first one whose ID is listed in @after_ric.
 *
 * Returns the offset of the first element that was not skipped, or the
 * position reached when the buffer ran out.
 */
size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
                              const u8 *ids, int n_ids,
                              const u8 *after_ric, int n_after_ric,
                              size_t offset)
{
        size_t pos;

        for (pos = offset; pos < ielen; ) {
                bool is_ext = ies[pos] == WLAN_EID_EXTENSION;
                size_t id_pos = pos + (is_ext ? 2 : 0);
                bool ric;

                /* the byte holding the ID lies outside the buffer */
                if (id_pos >= ielen)
                        break;

                if (!ieee80211_id_in_list(ids, n_ids, ies[id_pos], is_ext))
                        break;

                /* decide before pos moves on */
                ric = ies[pos] == WLAN_EID_RIC_DATA && n_after_ric;

                pos = skip_ie(ies, ielen, pos);
                if (!ric)
                        continue;

                /* swallow what follows the RIC element up to an after_ric ID */
                while (pos < ielen) {
                        is_ext = ies[pos] == WLAN_EID_EXTENSION;
                        id_pos = pos + (is_ext ? 2 : 0);

                        if (id_pos >= ielen)
                                break;

                        if (ieee80211_id_in_list(after_ric, n_after_ric,
                                                 ies[id_pos], is_ext))
                                break;

                        pos = skip_ie(ies, ielen, pos);
                }
        }

        return pos;
}
EXPORT_SYMBOL(ieee80211_ie_split_ric);

/*
 * Finalize the length of the element that occupies the tail of @skb,
 * splitting it into fragments when its payload exceeds 255 bytes.
 *
 * @len_pos points at the element's length byte; everything from the byte
 * after it up to the end of the skb data is taken as payload. For every
 * full 255 bytes that are followed by more data, a two-byte fragment header
 * (@frag_id plus a length byte) is inserted, growing the skb by two bytes
 * each time. Nothing is done when @len_pos is NULL.
 */
void ieee80211_fragment_element(struct sk_buff *skb, u8 *len_pos, u8 frag_id)
{
        unsigned int remaining;
        u8 *cur_len = len_pos;

        if (!cur_len)
                return;

        /* payload bytes between the length byte and the end of the data */
        remaining = skb->data + skb->len - cur_len - 1;

        while (remaining > 255) {
                /* where the next fragment header has to go */
                u8 *frag_hdr = cur_len + 1 + 255;

                /* the current (sub)element is completely full */
                *cur_len = 255;
                remaining -= 255;

                /* grow the skb and open a two-byte gap for the header */
                skb_put(skb, 2);
                memmove(frag_hdr + 2, frag_hdr, remaining);

                frag_hdr[0] = frag_id;
                /* its length byte is filled in on the next pass or below */
                cur_len = frag_hdr + 1;
        }

        *cur_len = remaining;
}
EXPORT_SYMBOL(ieee80211_fragment_element);

/*
 * Map an operating class number to the band it belongs to.
 *
 * Handled classes: 81..84 (2 GHz), 112 and 115..130 (5 GHz), 131..137
 * (6 GHz) and 180 (60 GHz). Class 136 is included so that the
 * "6 GHz band; 20 MHz; channel 2" case of
 * ieee80211_operating_class_to_chandef() is reachable.
 *
 * Returns true and stores the band in *@band for a known class; returns
 * false and leaves *@band untouched otherwise.
 */
bool ieee80211_operating_class_to_band(u8 operating_class,
                                       enum nl80211_band *band)
{
        switch (operating_class) {
        case 112:
        case 115 ... 127:
        case 128 ... 130:
                *band = NL80211_BAND_5GHZ;
                return true;
        case 131 ... 137:
                *band = NL80211_BAND_6GHZ;
                return true;
        case 81:
        case 82:
        case 83:
        case 84:
                *band = NL80211_BAND_2GHZ;
                return true;
        case 180:
                *band = NL80211_BAND_60GHZ;
                return true;
        }

        return false;
}
EXPORT_SYMBOL(ieee80211_operating_class_to_band);

/*
 * Build a channel definition from an operating class and a control channel.
 *
 * @operating_class: operating class number
 * @chan:            control (primary) channel; may be NULL
 * @chandef:         filled in on success
 *
 * Returns false when the class is unknown, @chan is NULL, @chan is not in
 * the band of the class, or the class does not determine the channel
 * completely (80+80 MHz and 320 MHz classes). Note that chandef->chan is
 * already written once the band check has passed, even if false is
 * returned afterwards.
 */
bool ieee80211_operating_class_to_chandef(u8 operating_class,
                                          struct ieee80211_channel *chan,
                                          struct cfg80211_chan_def *chandef)
{
        enum nl80211_band band;
        u32 primary, center, slot = 0;

        if (!ieee80211_operating_class_to_band(operating_class, &band))
                return false;
        if (!chan || chan->band != band)
                return false;

        primary = chan->center_freq;
        chandef->chan = chan;

        /*
         * Position of the primary channel, in 20 MHz steps, relative to the
         * start of its block (5955, 5745 or 5180 MHz); stays 0 below 5180.
         */
        if (primary >= 5955)
                slot = (primary - 5955) / 20;
        else if (primary >= 5745)
                slot = (primary - 5745) / 20;
        else if (primary >= 5180)
                slot = (primary - 5180) / 20;

        switch (operating_class) {
        /* 20 MHz: center equals the primary channel */
        case 81:  /* 2 GHz band; channels 1..13 */
        case 82:  /* 2 GHz band; channel 14 */
        case 115: /* 5 GHz band; channels 36,40,44,48 */
        case 118: /* 5 GHz band; channels 52,56,60,64 */
        case 121: /* 5 GHz band; channels 100..144 */
        case 124: /* 5 GHz band; channels 149,153,157,161 */
        case 125: /* 5 GHz band; channels 149..177 */
        case 131: /* 6 GHz band; channels 1..233 */
        case 136: /* 6 GHz band; channel 2 */
                chandef->width = NL80211_CHAN_WIDTH_20;
                center = primary;
                break;
        /* 40 MHz, secondary channel above the primary */
        case 83:  /* 2 GHz band; channels 1..9 */
        case 116: /* 5 GHz band; channels 36,44 */
        case 119: /* 5 GHz band; channels 52,60 */
        case 122: /* 5 GHz band; channels 100,108,116,124,132,140 */
        case 126: /* 5 GHz band; channels 149,157,165,173 */
                chandef->width = NL80211_CHAN_WIDTH_40;
                center = primary + 10;
                break;
        /* 40 MHz, secondary channel below the primary */
        case 84:  /* 2 GHz band; channels 5..13 */
        case 117: /* 5 GHz band; channels 40,48 */
        case 120: /* 5 GHz band; channels 56,64 */
        case 123: /* 5 GHz band; channels 104,112,120,128,136,144 */
        case 127: /* 5 GHz band; channels 153,161,169,177 */
                chandef->width = NL80211_CHAN_WIDTH_40;
                center = primary - 10;
                break;
        /* 40 MHz, side derived from the position inside the 40 MHz block */
        case 132: /* 6 GHz band; channels 1,5,..,229 */
                chandef->width = NL80211_CHAN_WIDTH_40;
                center = primary + 10 - (slot & 1) * 20;
                break;
        /* 80 MHz */
        case 128: /* 5 GHz band; channels 36..64,100..144,149..177 */
        case 133: /* 6 GHz band; channels 1,5,..,229 */
                chandef->width = NL80211_CHAN_WIDTH_80;
                center = primary + 30 - (slot & 3) * 20;
                break;
        /* 160 MHz */
        case 129: /* 5 GHz band; channels 36..64,100..144,149..177 */
        case 134: /* 6 GHz band; channels 1,5,..,229 */
                chandef->width = NL80211_CHAN_WIDTH_160;
                center = primary + 70 - (slot & 7) * 20;
                break;
        case 130: /* 5 GHz band; 80+80 MHz: center_freq2 is unknown */
        case 135: /* 6 GHz band; 80+80 MHz: center_freq2 is unknown */
        case 137: /* 6 GHz band; 320 MHz: 320-1 or 320-2 is unknown */
        default:
                return false;
        }

        chandef->center_freq1 = center;
        return true;
}
EXPORT_SYMBOL(ieee80211_operating_class_to_chandef);

/*
 * Derive the operating class for a channel definition.
 *
 * The class is chosen from chandef->center_freq1 and chandef->width; for
 * 40 MHz channels the position of center_freq1 relative to
 * chandef->chan->center_freq selects between the "+" and "-" class.
 *
 * Returns true and stores the class in *@op_class on success. Returns false
 * for 5 and 10 MHz widths, for width/frequency pairs rejected below, and
 * for frequencies outside the ranges handled here.
 */
bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
                                          u8 *op_class)
{
        u32 freq = chandef->center_freq1;
        u8 wide_class, class_20, class_40_plus, class_40_minus;

        /* 2.407 GHz, channels 1..13 */
        if (freq >= 2412 && freq <= 2472) {
                if (chandef->width > NL80211_CHAN_WIDTH_40)
                        return false;

                if (chandef->width != NL80211_CHAN_WIDTH_40)
                        *op_class = 81;
                else if (freq > chandef->chan->center_freq)
                        *op_class = 83; /* HT40+ */
                else
                        *op_class = 84; /* HT40- */

                return true;
        }

        /* channel 14 is only for IEEE 802.11b */
        if (freq == 2484) {
                if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT)
                        return false;

                *op_class = 82;
                return true;
        }

        /* widths of 80 MHz and up have one class for all 5 GHz sub-bands */
        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_5:
        case NL80211_CHAN_WIDTH_10:
                return false; /* unsupported for now */
        case NL80211_CHAN_WIDTH_80:
                wide_class = 128;
                break;
        case NL80211_CHAN_WIDTH_160:
                wide_class = 129;
                break;
        case NL80211_CHAN_WIDTH_80P80:
                wide_class = 130;
                break;
        default:
                wide_class = 0;
                break;
        }

        /* pick the 20 MHz / 40 MHz+ / 40 MHz- classes of the sub-band */
        if (freq >= 5180 && freq <= 5240) {
                /* 5 GHz, channels 36..48 */
                class_20 = 115;
                class_40_plus = 116;
                class_40_minus = 117;
        } else if (freq >= 5260 && freq <= 5320) {
                /* 5 GHz, channels 52..64 */
                class_20 = 118;
                class_40_plus = 119;
                class_40_minus = 120;
        } else if (freq >= 5500 && freq <= 5720) {
                /* 5 GHz, channels 100..144 */
                class_20 = 121;
                class_40_plus = 122;
                class_40_minus = 123;
        } else if (freq >= 5745 && freq <= 5845) {
                /* 5 GHz, channels 149..169 */
                class_20 = freq <= 5805 ? 124 : 125;
                class_40_plus = 126;
                class_40_minus = 127;
        } else {
                /* 56.16 GHz base, 2160 MHz steps: multiples 1 through 6 */
                if (freq < 56160 + 2160 * 1 || freq > 56160 + 2160 * 6)
                        return false; /* not supported yet */

                if (chandef->width >= NL80211_CHAN_WIDTH_40)
                        return false;

                *op_class = 180;
                return true;
        }

        if (wide_class)
                *op_class = wide_class;
        else if (chandef->width != NL80211_CHAN_WIDTH_40)
                *op_class = class_20;
        else if (freq > chandef->chan->center_freq)
                *op_class = class_40_plus;
        else
                *op_class = class_40_minus;

        return true;
}
EXPORT_SYMBOL(ieee80211_chandef_to_operating_class);

/*
 * Return the beacon interval stored for @wdev: link 0's AP value for AP and
 * P2P-GO interfaces (warning if valid_links is set), the mesh or IBSS value
 * for mesh point and ad-hoc interfaces, and 0 for every other type.
 */
static int cfg80211_wdev_bi(struct wireless_dev *wdev)
{
        int bi = 0;

        switch (wdev->iftype) {
        case NL80211_IFTYPE_ADHOC:
                bi = wdev->u.ibss.beacon_interval;
                break;
        case NL80211_IFTYPE_MESH_POINT:
                bi = wdev->u.mesh.beacon_interval;
                break;
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_P2P_GO:
                WARN_ON(wdev->valid_links);
                bi = wdev->links[0].ap.beacon_interval;
                break;
        default:
                break;
        }

        return bi;
}

/*
 * Summarize the beacon intervals in use on @wiphy.
 *
 * Walks all wireless devices of the wiphy, ignoring those with valid_links
 * set and those without a beacon interval, and folds in @new_beacon_int
 * when it is non-zero.
 *
 * *@beacon_int_gcd receives the greatest common divisor of the intervals
 * (0 when there are none); *@beacon_int_different is set to true when at
 * least two differing intervals were seen.
 */
static void cfg80211_calculate_bi_data(struct wiphy *wiphy, u32 new_beacon_int,
                                       u32 *beacon_int_gcd,
                                       bool *beacon_int_different)
{
        struct wireless_dev *wdev;
        u32 common = 0;
        bool differ = false;

        list_for_each_entry(wdev, &wiphy->wdev_list, list) {
                int bi;

                /* this feature isn't supported with MLO */
                if (wdev->valid_links)
                        continue;

                bi = cfg80211_wdev_bi(wdev);
                if (!bi)
                        continue;

                if (!common) {
                        /* first interval seen */
                        common = bi;
                } else if (bi != common) {
                        differ = true;
                        common = gcd(common, bi);
                }
        }

        if (new_beacon_int && common != new_beacon_int) {
                /* only a difference if there was something to differ from */
                if (common)
                        differ = true;
                common = gcd(common, new_beacon_int);
        }

        *beacon_int_gcd = common;
        *beacon_int_different = differ;
}

/*
 * Basic sanity check of a beacon interval: accepts 10..10000 inclusive and
 * returns -EINVAL for anything else. @rdev and @iftype are not looked at.
 *
 * This is only a pre-condition check. Where interface combinations are
 * possible, the driver must already be calling
 * cfg80211_check_combinations(), and further validation then happens
 * through cfg80211_calculate_bi_data() and cfg80211_iter_combinations().
 */
int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
                                 enum nl80211_iftype iftype, u32 beacon_int)
{
        return (beacon_int >= 10 && beacon_int <= 10000) ? 0 : -EINVAL;
}

/*
 * Call @iter for every interface combination of @wiphy that can
 * accommodate the request described by @params.
 *
 * A combination is passed to @iter (together with @data) only if all of
 * the following hold:
 *  - the total number of requested interfaces does not exceed its
 *    max_interfaces,
 *  - params->num_different_channels does not exceed its
 *    num_different_channels,
 *  - for every accounted interface type, each limit entry listing that type
 *    still has room for the requested count,
 *  - all requested radar detection widths are in radar_detect_widths and,
 *    if the combination restricts radar_detect_regions, the current DFS
 *    region is among them,
 *  - every accounted interface type in use appears in some limit entry,
 *  - the beacon interval GCD constraints are met.
 *
 * Interface types for which cfg80211_iftype_allowed(wiphy, iftype, 0, 1)
 * returns true are left out of the per-type accounting.
 *
 * Returns 0 after all combinations were examined, or -ENOMEM if the
 * scratch copy of a combination's limits could not be allocated (in which
 * case later combinations are not examined).
 */
int cfg80211_iter_combinations(struct wiphy *wiphy,
                               struct iface_combination_params *params,
                               void (*iter)(const struct ieee80211_iface_combination *c,
                                            void *data),
                               void *data)
{
        const struct ieee80211_regdomain *regdom;
        enum nl80211_dfs_regions region = 0;
        int i, j, iftype;
        int num_interfaces = 0;
        u32 used_iftypes = 0;
        u32 beacon_int_gcd;
        bool beacon_int_different;

        /*
         * This is a bit strange, since the iteration used to rely only on
         * the data given by the driver, but here it now relies on context,
         * in form of the currently operating interfaces.
         * This is OK for all current users, and saves us from having to
         * push the GCD calculations into all the drivers.
         * In the future, this should probably rely more on data that's in
         * cfg80211 already - the only thing not would appear to be any new
         * interfaces (while being brought up) and channel/radar data.
         */
        cfg80211_calculate_bi_data(wiphy, params->new_beacon_int,
                                   &beacon_int_gcd, &beacon_int_different);

        /* the DFS region is only needed when radar detection is requested */
        if (params->radar_detect) {
                rcu_read_lock();
                regdom = rcu_dereference(cfg80211_regdomain);
                if (regdom)
                        region = regdom->dfs_region;
                rcu_read_unlock();
        }

        /* total interface count and bitmask of the accounted types in use */
        for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) {
                num_interfaces += params->iftype_num[iftype];
                if (params->iftype_num[iftype] > 0 &&
                    !cfg80211_iftype_allowed(wiphy, iftype, 0, 1))
                        used_iftypes |= BIT(iftype);
        }

        for (i = 0; i < wiphy->n_iface_combinations; i++) {
                const struct ieee80211_iface_combination *c;
                struct ieee80211_iface_limit *limits;
                u32 all_iftypes = 0;

                c = &wiphy->iface_combinations[i];

                if (num_interfaces > c->max_interfaces)
                        continue;
                if (params->num_different_channels > c->num_different_channels)
                        continue;

                /* scratch copy: the max counters are decremented below */
                limits = kmemdup(c->limits, sizeof(limits[0]) * c->n_limits,
                                 GFP_KERNEL);
                if (!limits)
                        return -ENOMEM;

                for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) {
                        if (cfg80211_iftype_allowed(wiphy, iftype, 0, 1))
                                continue;
                        for (j = 0; j < c->n_limits; j++) {
                                /* collect every type this combination lists */
                                all_iftypes |= limits[j].types;
                                if (!(limits[j].types & BIT(iftype)))
                                        continue;
                                /* not enough room left in this limit entry */
                                if (limits[j].max < params->iftype_num[iftype])
                                        goto cont;
                                /* consume the room so shared entries add up */
                                limits[j].max -= params->iftype_num[iftype];
                        }
                }

                /* every requested radar width must be supported */
                if (params->radar_detect !=
                        (c->radar_detect_widths & params->radar_detect))
                        goto cont;

                /* a non-zero region mask restricts radar detection to it */
                if (params->radar_detect && c->radar_detect_regions &&
                    !(c->radar_detect_regions & BIT(region)))
                        goto cont;

                /* Finally check that all iftypes that we're currently
                 * using are actually part of this combination. If they
                 * aren't then we can't use this combination and have
                 * to continue to the next.
                 */
                if ((all_iftypes & used_iftypes) != used_iftypes)
                        goto cont;

                if (beacon_int_gcd) {
                        /* a minimum GCD is set and ours falls below it */
                        if (c->beacon_int_min_gcd &&
                            beacon_int_gcd < c->beacon_int_min_gcd)
                                goto cont;
                        /* no minimum GCD set: all intervals must be equal */
                        if (!c->beacon_int_min_gcd && beacon_int_different)
                                goto cont;
                }

                /* This combination covered all interface types and
                 * supported the requested numbers, so we're good.
                 */

                (*iter)(c, data);
 cont:
                /* reached on both the match and the reject paths */
                kfree(limits);
        }

        return 0;
}
EXPORT_SYMBOL(cfg80211_iter_combinations);

/*
 * Iterator callback that counts how often it is invoked: @data points to
 * an int which is incremented by one; the combination @c is not inspected.
 */
static void
cfg80211_iter_sum_ifcombs(const struct ieee80211_iface_combination *c,
                          void *data)
{
        int *counter = data;

        *counter += 1;
}

/*
 * Check whether at least one interface combination of @wiphy matches
 * @params.
 *
 * Returns 0 if there is a match, -EBUSY if there is none, or the error
 * reported by cfg80211_iter_combinations().
 */
int cfg80211_check_combinations(struct wiphy *wiphy,
                                struct iface_combination_params *params)
{
        int matches = 0;
        int err;

        err = cfg80211_iter_combinations(wiphy, params,
                                         cfg80211_iter_sum_ifcombs, &matches);
        if (err)
                return err;

        return matches ? 0 : -EBUSY;
}
EXPORT_SYMBOL(cfg80211_check_combinations);

/*
 * Convert an array of rate bytes into a bitmask of @sband bitrate indices.
 *
 * Each entry of @rates is masked with 0x7f and multiplied by 5 before it is
 * compared against sband->bitrates[].bitrate; bit j of *@mask is set when
 * bitrate j matches.
 *
 * Returns 0 on success. Returns -EINVAL when @sband is NULL, @n_rates is 0
 * or larger than NL80211_MAX_SUPP_RATES, or a rate has no counterpart in
 * @sband (in that last case *@mask holds the bits collected so far).
 */
int ieee80211_get_ratemask(struct ieee80211_supported_band *sband,
                           const u8 *rates, unsigned int n_rates,
                           u32 *mask)
{
        unsigned int r;

        if (!sband || n_rates == 0 || n_rates > NL80211_MAX_SUPP_RATES)
                return -EINVAL;

        *mask = 0;

        for (r = 0; r < n_rates; r++) {
                int wanted = (rates[r] & 0x7f) * 5;
                int idx;

                /* linear search for the matching bitrate entry */
                for (idx = 0; idx < sband->n_bitrates; idx++) {
                        if (sband->bitrates[idx].bitrate == wanted)
                                break;
                }

                /* ran off the end: the band does not know this rate */
                if (idx >= sband->n_bitrates)
                        return -EINVAL;

                *mask |= BIT(idx);
        }

        /*
         * At least one bit is set at this point: an empty rates array was
         * refused above and so was every entry without a match.
         */
        return 0;
}

/*
 * Return the sum of n_channels over all bands that @wiphy has registered
 * (entries of wiphy->bands[] that are NULL are skipped).
 */
unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy)
{
        unsigned int total = 0;
        int b;

        for (b = 0; b < NUM_NL80211_BANDS; b++) {
                if (!wiphy->bands[b])
                        continue;

                total += wiphy->bands[b]->n_channels;
        }

        return total;
}
EXPORT_SYMBOL(ieee80211_get_num_supported_channels);

/*
 * Query station information for @mac_addr on @dev.
 *
 * @sinfo is zeroed and then filled through rdev_get_station(), which is
 * called with the wiphy lock held.
 *
 * Returns the result of rdev_get_station(), or -EOPNOTSUPP when @dev has
 * no ieee80211_ptr or the driver has no get_station operation (@sinfo is
 * left untouched in both of these cases).
 */
int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr,
                         struct station_info *sinfo)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev;
        int ret = -EOPNOTSUPP;

        if (wdev) {
                rdev = wiphy_to_rdev(wdev->wiphy);

                if (rdev->ops->get_station) {
                        memset(sinfo, 0, sizeof(*sinfo));

                        wiphy_lock(&rdev->wiphy);
                        ret = rdev_get_station(rdev, dev, mac_addr, sinfo);
                        wiphy_unlock(&rdev->wiphy);
                }
        }

        return ret;
}
EXPORT_SYMBOL(cfg80211_get_station);

/*
 * Free a NAN function together with everything it owns: the per-entry
 * filter buffers of the rx and tx filter arrays, the arrays themselves,
 * srf_macs, srf_bf, serv_spec_info and finally @f. A NULL @f is ignored.
 */
void cfg80211_free_nan_func(struct cfg80211_nan_func *f)
{
        int i;

        if (!f)
                return;

        /* filter arrays: first the entries' buffers, then the array */
        for (i = 0; i < f->num_tx_filters; i++)
                kfree(f->tx_filters[i].filter);
        kfree(f->tx_filters);

        for (i = 0; i < f->num_rx_filters; i++)
                kfree(f->rx_filters[i].filter);
        kfree(f->rx_filters);

        kfree(f->srf_macs);
        kfree(f->srf_bf);
        kfree(f->serv_spec_info);

        kfree(f);
}
EXPORT_SYMBOL(cfg80211_free_nan_func);

/*
 * Tell whether a channel of width @bw_khz centered at @center_freq_khz lies
 * completely inside @freq_range, i.e. whether both center - bw/2 and
 * center + bw/2 are within [start_freq_khz, end_freq_khz].
 */
bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
                                u32 center_freq_khz, u32 bw_khz)
{
        u32 half_bw_khz = bw_khz / 2;
        u32 lower_edge_khz = center_freq_khz - half_bw_khz;
        u32 upper_edge_khz = center_freq_khz + half_bw_khz;

        return lower_edge_khz >= freq_range->start_freq_khz &&
               upper_edge_khz <= freq_range->end_freq_khz;
}

/*
 * Allocate the zero-initialized per-TID statistics array of @sinfo with
 * IEEE80211_NUM_TIDS + 1 entries, using allocation flags @gfp.
 *
 * Returns 0 on success or -ENOMEM if the allocation failed.
 */
int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp)
{
        sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1,
                                sizeof(*(sinfo->pertid)),
                                gfp);

        return sinfo->pertid ? 0 : -ENOMEM;
}
EXPORT_SYMBOL(cfg80211_sinfo_alloc_tid_stats);

/* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
/*
 * Ethernet-II snap header (RFC1042 for most EtherTypes).
 * Six bytes, 2-byte aligned; identical to bridge_tunnel_header except for
 * the last byte (0x00 here).
 */
const unsigned char rfc1042_header[] __aligned(2) =
        { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 };
EXPORT_SYMBOL(rfc1042_header);

/*
 * Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX).
 * Six bytes, 2-byte aligned; identical to rfc1042_header except for the
 * last byte (0xf8 here).
 */
const unsigned char bridge_tunnel_header[] __aligned(2) =
        { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 };
EXPORT_SYMBOL(bridge_tunnel_header);

/*
 * Layer 2 Update frame (802.2 Type 1 LLC XID Update response).
 * Packed on-the-wire layout; the values noted below are the ones written
 * by cfg80211_send_layer2_update().
 */
struct iapp_layer2_update {
        u8 da[ETH_ALEN];        /* broadcast */
        u8 sa[ETH_ALEN];        /* STA addr */
        __be16 len;                /* 6 */
        u8 dsap;                /* 0 */
        u8 ssap;                /* 0x01 */
        u8 control;             /* 0xaf */
        u8 xid_info[3];         /* 0x81, 1, 0 */
} __packed;

/*
 * Build a Level 2 Update frame on behalf of the station @addr and pass it to
 * netif_rx() for @dev, so that layer 2 bridge devices update their
 * forwarding tables.
 *
 * The frame is an 802.2 Type 1 Logical Link Control (LLC) Exchange
 * Identifier (XID) Update response (IEEE Std 802.2-1998, 5.4.1.2.1) with
 * @addr as source and the broadcast address as destination. When no skb can
 * be allocated the function returns without sending anything.
 */
void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr)
{
        struct iapp_layer2_update *update;
        struct sk_buff *skb;

        skb = dev_alloc_skb(sizeof(struct iapp_layer2_update));
        if (!skb)
                return;

        update = skb_put(skb, sizeof(struct iapp_layer2_update));

        /* addressing: from the station to everybody */
        ether_addr_copy(update->sa, addr);
        eth_broadcast_addr(update->da);

        /* length of the LLC part: dsap + ssap + control + 3 XID bytes */
        update->len = htons(6);

        update->dsap = 0;
        /* NULL LSAP, CR Bit: Response */
        update->ssap = 0x01;
        /*
         * XID response lsb.1111F101.
         * F=0 (no poll command; unsolicited frame)
         */
        update->control = 0xaf;

        /* XID format identifier */
        update->xid_info[0] = 0x81;
        /* LLC types/classes: Type 1 LLC */
        update->xid_info[1] = 1;
        /* XID sender's receive window size (RW) */
        update->xid_info[2] = 0;

        skb->dev = dev;
        skb->protocol = eth_type_trans(skb, dev);
        memset(skb->cb, 0, sizeof(skb->cb));
        netif_rx(skb);
}
EXPORT_SYMBOL(cfg80211_send_layer2_update);

/*
 * Compute the maximum number of spatial streams (NSS) that the VHT
 * capabilities in @cap allow for MCS @mcs at channel width @bw.
 *
 * @cap: VHT capabilities; the RX MCS map, the "extended NSS BW capable" bit
 *       of the TX MCS map and the supported-channel-width / extended-NSS-BW
 *       fields of vht_cap_info are read
 * @bw: channel width to evaluate
 * @mcs: VHT MCS index; values above 9 trigger a WARN and yield 0
 * @ext_nss_bw_capable: when false, the extended NSS BW field is treated as 0
 * @max_vht_nss: NSS to start from (at most 8, larger values trigger a WARN
 *       and yield 0); pass 0 to have it derived from the RX MCS map
 *
 * Returns the resulting NSS, or 0 when nothing is supported or the
 * capability fields form an invalid combination.
 */
int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
                              enum ieee80211_vht_chanwidth bw,
                              int mcs, bool ext_nss_bw_capable,
                              unsigned int max_vht_nss)
{
        u16 rx_map = le16_to_cpu(cap->supp_mcs.rx_mcs_map);
        int supp_width, ext_nss_bw;
        int mcs_encoding;
        int nss;

        /* all eight 2-bit entries are 3: nothing usable */
        if (rx_map == 0xffff)
                return 0;

        if (WARN_ON(mcs > 9 || max_vht_nss > 8))
                return 0;

        /* value a 2-bit map entry must reach: 0 up to MCS 7, 1 for 8, 2 for 9 */
        mcs_encoding = mcs <= 7 ? 0 : (mcs == 8 ? 1 : 2);

        if (!max_vht_nss) {
                /*
                 * find max_vht_nss for the given MCS: the highest stream
                 * count whose entry is not 3 and reaches mcs_encoding
                 */
                for (nss = 8; nss >= 1; nss--) {
                        int supp = (rx_map >> (2 * (nss - 1))) & 3;

                        if (supp != 3 && supp >= mcs_encoding) {
                                max_vht_nss = nss;
                                break;
                        }
                }
        }

        /* without the ext NSS BW capable bit no scaling applies */
        if (!(cap->supp_mcs.tx_mcs_map &
              cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE)))
                return max_vht_nss;

        supp_width = le32_get_bits(cap->vht_cap_info,
                                   IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK);

        /* if not capable, treat ext_nss_bw as 0 */
        if (ext_nss_bw_capable)
                ext_nss_bw = le32_get_bits(cap->vht_cap_info,
                                           IEEE80211_VHT_CAP_EXT_NSS_BW_MASK);
        else
                ext_nss_bw = 0;

        /*
         * supp_width 3 is invalid, and so is supp_width 2 combined with
         * ext_nss_bw 1 or 2: pretend nothing is supported
         */
        if (supp_width == 3 ||
            (supp_width == 2 && (ext_nss_bw == 1 || ext_nss_bw == 2)))
                return 0;

        /*
         * Cover all the special cases according to IEEE 802.11-2016
         * Table 9-250. All other cases are either factor of 1 or not
         * valid/supported.
         */
        switch (bw) {
        case IEEE80211_VHT_CHANWIDTH_USE_HT:
        case IEEE80211_VHT_CHANWIDTH_80MHZ:
                if (ext_nss_bw == 3 && (supp_width == 1 || supp_width == 2))
                        return 2 * max_vht_nss;
                break;
        case IEEE80211_VHT_CHANWIDTH_160MHZ:
                if (supp_width == 0) {
                        if (ext_nss_bw == 1 || ext_nss_bw == 2)
                                return max_vht_nss / 2;
                        if (ext_nss_bw == 3)
                                return (3 * max_vht_nss) / 4;
                } else if (supp_width == 1 && ext_nss_bw == 3) {
                        return 2 * max_vht_nss;
                }
                break;
        case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
                if (supp_width == 0) {
                        switch (ext_nss_bw) {
                        case 1:
                                return 0; /* not possible */
                        case 2:
                                return max_vht_nss / 2;
                        case 3:
                                return (3 * max_vht_nss) / 4;
                        default:
                                break;
                        }
                } else if (supp_width == 1) {
                        switch (ext_nss_bw) {
                        case 0:
                                return 0; /* not possible */
                        case 1:
                                return max_vht_nss / 2;
                        case 2:
                                return (3 * max_vht_nss) / 4;
                        default:
                                break;
                        }
                }
                break;
        }

        /* not covered or invalid combination received */
        return max_vht_nss;
}
EXPORT_SYMBOL(ieee80211_get_vht_max_nss);

/*
 * Report whether interface type @iftype is allowed on @wiphy.
 *
 * @check_swif selects which bitmap is consulted:
 *   0 - wiphy->interface_modes; a 4-address AP_VLAN (@is_4addr set) is
 *       instead decided by WIPHY_FLAG_4ADDR_AP
 *   1 - wiphy->software_iftypes; an AP_VLAN that is missing from that
 *       bitmap is instead decided by WIPHY_FLAG_4ADDR_AP
 * Any other @check_swif value yields false.
 */
bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype,
                             bool is_4addr, u8 check_swif)
{
        const bool vlan = iftype == NL80211_IFTYPE_AP_VLAN;

        if (check_swif == 0) {
                if (vlan && is_4addr)
                        return wiphy->flags & WIPHY_FLAG_4ADDR_AP;

                return wiphy->interface_modes & BIT(iftype);
        }

        if (check_swif == 1) {
                if (vlan && !(wiphy->software_iftypes & BIT(iftype)))
                        return wiphy->flags & WIPHY_FLAG_4ADDR_AP;

                return wiphy->software_iftypes & BIT(iftype);
        }

        return false;
}
EXPORT_SYMBOL(cfg80211_iftype_allowed);

void cfg80211_remove_link(struct wireless_dev *wdev, unsigned int link_id)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);

        lockdep_assert_wiphy(wdev->wiphy);

        switch (wdev->iftype) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_P2P_GO:
                cfg80211_stop_ap(rdev, wdev->netdev, link_id, true);
                break;
        default:
                /* per-link not relevant */
                break;
        }

        wdev->valid_links &= ~BIT(link_id);

        rdev_del_intf_link(rdev, wdev, link_id);

        eth_zero_addr(wdev->links[link_id].addr);
}

/*
 * Remove every valid link of @wdev by calling cfg80211_remove_link() on
 * each. Only AP interfaces are handled; for any other interface type, or
 * when no link is valid, this is a no-op.
 */
void cfg80211_remove_links(struct wireless_dev *wdev)
{
        unsigned int link_id;

        /*
         * links are controlled by upper layers (userspace/cfg)
         * only for AP mode, so only remove them here for AP
         */
        if (wdev->iftype != NL80211_IFTYPE_AP)
                return;

        /* nothing to remove when the interface has no valid links */
        if (!wdev->valid_links)
                return;

        for_each_valid_link(wdev, link_id)
                cfg80211_remove_link(wdev, link_id);
}

/*
 * Delete the virtual interface @wdev of @rdev: its links are removed with
 * cfg80211_remove_links() first, then the driver is asked to delete the
 * interface. Returns whatever rdev_del_virtual_intf() returns.
 */
int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev,
                                 struct wireless_dev *wdev)
{
        int ret;

        cfg80211_remove_links(wdev);
        ret = rdev_del_virtual_intf(rdev, wdev);

        return ret;
}

const struct wiphy_iftype_ext_capab *
cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type)
{
        int i;

        for (i = 0; i < wiphy->num_iftype_ext_capab; i++) {
                if (wiphy->iftype_ext_capab[i].iftype == type)
                        return &wiphy->iftype_ext_capab[i];
        }

        return NULL;
}
EXPORT_SYMBOL(cfg80211_get_iftype_ext_capa);
















































































    8 



















































































































































































































































































    1 
    1 
    1 






















    1 

    1 
    1 





















    1 
    1 


















    1 
















    1 













    1 







































    1 












    1 













    1 




    1 












    1 




    1 

















    1 





















    1 


    1 


















    1 






    1 






































































































    1 




















    1 
















































    1 















































































































































    1 

















    1 






























    1 


    1 




















    1 




    1 

    1 



































































































    1 





















    1 

    1 






    1 


































    1 
    1 
























    1 







    1 






















    1 






    1 






































































    1 









    1 



    1 

    1 

















































































































































































    1 



    1 







    1 












    1 


    1 







    1 













































































































    1 
































































    1 

    1 

























    1 































    1 



    1 

















































    1 











































    1 
















    1 

































    1 
    1 

    1 
    1 























    1 
























    1 




    1 










    1 
    1 
    1 




    1 














    1 















    1 









































































































































































































































































































    1 


    1 














    1 





    1 


    1 

















    1 

























    1 


    1 







    1 












    1 

    1 






































































































































































































































































































































































































































































































































































































































































    1 









    1 



























    1 









































































    1 


























































































































    1 






















    1 













    1 





































































































































































    1 

















    1 



























    1 


































































    1 













    1 



    1 


    1 































    1 





    1 





    1 
















































    1 

















    1 






























































    1 












    1 
































































    1 


















    1 







































    1 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1993  Linus Torvalds
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 *  Numa awareness, Christoph Lameter, SGI, June 2005
 *  Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <linux/io.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/memcontrol.h>
#include <linux/llist.h>
#include <linux/uio.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
#include <linux/pgtable.h>
#include <linux/hugetlb.h>
#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
#include <linux/page_owner.h>

#define CREATE_TRACE_POINTS
#include <trace/events/vmalloc.h>

#include "internal.h"
#include "pgalloc-track.h"

/*
 * ioremap_max_page_shift is the largest page shift passed down by
 * vmap_page_range() when building ioremap mappings.
 *
 * With CONFIG_HAVE_ARCH_HUGE_VMAP it starts at BITS_PER_LONG - 1 and the
 * "nohugeiomap" early parameter lowers it to PAGE_SHIFT. Without that
 * config option it is a constant PAGE_SHIFT.
 */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;

/* Handler for the "nohugeiomap" early parameter; @str is not looked at. */
static int __init set_nohugeiomap(char *str)
{
        ioremap_max_page_shift = PAGE_SHIFT;
        return 0;
}
early_param("nohugeiomap", set_nohugeiomap);
#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMAP */

/*
 * vmap_allow_huge is true by default when CONFIG_HAVE_ARCH_HUGE_VMALLOC is
 * set and is cleared by the "nohugevmalloc" early parameter. Without that
 * config option it is a constant false.
 *
 * NOTE(review): the readers of this flag are outside this part of the file.
 */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
static bool __ro_after_init vmap_allow_huge = true;

/* Handler for the "nohugevmalloc" early parameter; @str is not looked at. */
static int __init set_nohugevmalloc(char *str)
{
        vmap_allow_huge = false;
        return 0;
}
early_param("nohugevmalloc", set_nohugevmalloc);
#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
static const bool vmap_allow_huge = false;
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */

bool is_vmalloc_addr(const void *x)
{
        unsigned long addr = (unsigned long)kasan_reset_tag(x);

        return addr >= VMALLOC_START && addr < VMALLOC_END;
}
EXPORT_SYMBOL(is_vmalloc_addr);

/*
 * Per-CPU pairing of a lock-less list head with a work item.
 *
 * NOTE(review): presumably used to queue frees that cannot be done
 * immediately and to process them from the work item; the users are outside
 * this part of the file - confirm there.
 */
struct vfree_deferred {
        struct llist_head list;
        struct work_struct wq;
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

/*** Page table manipulation functions ***/
/*
 * Fill the PTEs covering [addr, end) below @pmd so that they map the
 * physically contiguous range starting at @phys_addr with protection @prot.
 *
 * The PTE table is obtained with pte_alloc_kernel_track(); -ENOMEM is
 * returned when that fails, 0 otherwise. Hitting a PTE that is not
 * pte_none() ends in BUG(), after dumping the target page if its pfn is
 * valid. On success PGTBL_PTE_MODIFIED is OR-ed into @mask.
 */
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        pte_t *pte;
        u64 pfn;
        struct page *page;
        unsigned long size = PAGE_SIZE;

        pfn = phys_addr >> PAGE_SHIFT;
        pte = pte_alloc_kernel_track(pmd, addr, mask);
        if (!pte)
                return -ENOMEM;
        do {
                /* Every slot must still be empty; mapping over a live PTE is fatal. */
                if (!pte_none(ptep_get(pte))) {
                        if (pfn_valid(pfn)) {
                                page = pfn_to_page(pfn);
                                dump_page(page, "remapping already mapped page");
                        }
                        BUG();
                }

#ifdef CONFIG_HUGETLB_PAGE
                /*
                 * Let the architecture pick the size of this entry. Any
                 * size other than PAGE_SIZE is installed as a huge PTE.
                 */
                size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
                if (size != PAGE_SIZE) {
                        pte_t entry = pfn_pte(pfn, prot);

                        entry = arch_make_huge_pte(entry, ilog2(size), 0);
                        set_huge_pte_at(&init_mm, addr, pte, entry, size);
                        pfn += PFN_DOWN(size);
                        /*
                         * "continue" still evaluates the loop condition, so
                         * pte and addr are advanced by @size there.
                         */
                        continue;
                }
#endif
                set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
                pfn++;
        } while (pte += PFN_DOWN(size), addr += size, addr != end);
        *mask |= PGTBL_PTE_MODIFIED;
        return 0;
}

/*
 * Attempt to map [addr, end) with a single huge entry at the PMD level.
 *
 * Returns 0 when a huge mapping is not used: @max_page_shift is below
 * PMD_SHIFT, the architecture refuses @prot, the range is not exactly
 * PMD_SIZE, @addr or @phys_addr is not PMD_SIZE aligned, or a present entry
 * could not be released by pmd_free_pte_page(). Otherwise returns whatever
 * pmd_set_huge() returns.
 */
static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift)
{
        /* Both the caller's shift limit and the architecture must allow it. */
        if (max_page_shift < PMD_SHIFT || !arch_vmap_pmd_supported(prot))
                return 0;

        /* Exactly one PMD worth of space, aligned virtually and physically. */
        if (end - addr != PMD_SIZE ||
            !IS_ALIGNED(addr, PMD_SIZE) ||
            !IS_ALIGNED(phys_addr, PMD_SIZE))
                return 0;

        /* Give up if a present entry cannot be released. */
        if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
                return 0;

        return pmd_set_huge(pmd, phys_addr, prot);
}

/*
 * Map [addr, end) to @phys_addr onwards at the PMD level below @pud.
 *
 * Each PMD-sized step is first offered to vmap_try_huge_pmd(); if that
 * declines, the step is mapped through vmap_pte_range(). Returns 0 on
 * success and -ENOMEM if the PMD table cannot be allocated or the PTE level
 * reports a failure. PGTBL_PMD_MODIFIED is OR-ed into @mask for every huge
 * entry installed.
 */
static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        unsigned long stop;
        pmd_t *entry = pmd_alloc_track(&init_mm, pud, addr, mask);

        if (!entry)
                return -ENOMEM;

        do {
                stop = pmd_addr_end(addr, end);

                if (vmap_try_huge_pmd(entry, addr, stop, phys_addr, prot,
                                      max_page_shift))
                        *mask |= PGTBL_PMD_MODIFIED;
                else if (vmap_pte_range(entry, addr, stop, phys_addr, prot,
                                        max_page_shift, mask))
                        return -ENOMEM;
        } while (entry++, phys_addr += (stop - addr), addr = stop, addr != end);

        return 0;
}

/*
 * Attempt to map [addr, end) with a single huge entry at the PUD level.
 *
 * Returns 0 when a huge mapping is not used: @max_page_shift is below
 * PUD_SHIFT, the architecture refuses @prot, the range is not exactly
 * PUD_SIZE, @addr or @phys_addr is not PUD_SIZE aligned, or a present entry
 * could not be released by pud_free_pmd_page(). Otherwise returns whatever
 * pud_set_huge() returns.
 */
static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift)
{
        /* Both the caller's shift limit and the architecture must allow it. */
        if (max_page_shift < PUD_SHIFT || !arch_vmap_pud_supported(prot))
                return 0;

        /* Exactly one PUD worth of space, aligned virtually and physically. */
        if (end - addr != PUD_SIZE ||
            !IS_ALIGNED(addr, PUD_SIZE) ||
            !IS_ALIGNED(phys_addr, PUD_SIZE))
                return 0;

        /* Give up if a present entry cannot be released. */
        if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
                return 0;

        return pud_set_huge(pud, phys_addr, prot);
}

/*
 * Map [addr, end) to @phys_addr onwards at the PUD level below @p4d.
 *
 * Each PUD-sized step is first offered to vmap_try_huge_pud(); if that
 * declines, the step is mapped through vmap_pmd_range(). Returns 0 on
 * success and -ENOMEM if the PUD table cannot be allocated or the lower
 * level reports a failure. PGTBL_PUD_MODIFIED is OR-ed into @mask for every
 * huge entry installed.
 */
static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        unsigned long stop;
        pud_t *entry = pud_alloc_track(&init_mm, p4d, addr, mask);

        if (!entry)
                return -ENOMEM;

        do {
                stop = pud_addr_end(addr, end);

                if (vmap_try_huge_pud(entry, addr, stop, phys_addr, prot,
                                      max_page_shift))
                        *mask |= PGTBL_PUD_MODIFIED;
                else if (vmap_pmd_range(entry, addr, stop, phys_addr, prot,
                                        max_page_shift, mask))
                        return -ENOMEM;
        } while (entry++, phys_addr += (stop - addr), addr = stop, addr != end);

        return 0;
}

/*
 * Attempt to map [addr, end) with a single huge entry at the P4D level.
 *
 * Returns 0 when a huge mapping is not used: @max_page_shift is below
 * P4D_SHIFT, the architecture refuses @prot, the range is not exactly
 * P4D_SIZE, @addr or @phys_addr is not P4D_SIZE aligned, or a present entry
 * could not be released by p4d_free_pud_page(). Otherwise returns whatever
 * p4d_set_huge() returns.
 */
static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift)
{
        /* Both the caller's shift limit and the architecture must allow it. */
        if (max_page_shift < P4D_SHIFT || !arch_vmap_p4d_supported(prot))
                return 0;

        /* Exactly one P4D worth of space, aligned virtually and physically. */
        if (end - addr != P4D_SIZE ||
            !IS_ALIGNED(addr, P4D_SIZE) ||
            !IS_ALIGNED(phys_addr, P4D_SIZE))
                return 0;

        /* Give up if a present entry cannot be released. */
        if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
                return 0;

        return p4d_set_huge(p4d, phys_addr, prot);
}

/*
 * Map [addr, end) to @phys_addr onwards at the P4D level below @pgd.
 *
 * Each P4D-sized step is first offered to vmap_try_huge_p4d(); if that
 * declines, the step is mapped through vmap_pud_range(). Returns 0 on
 * success and -ENOMEM if the P4D table cannot be allocated or the lower
 * level reports a failure. PGTBL_P4D_MODIFIED is OR-ed into @mask for every
 * huge entry installed.
 */
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        unsigned long stop;
        p4d_t *entry = p4d_alloc_track(&init_mm, pgd, addr, mask);

        if (!entry)
                return -ENOMEM;

        do {
                stop = p4d_addr_end(addr, end);

                if (vmap_try_huge_p4d(entry, addr, stop, phys_addr, prot,
                                      max_page_shift))
                        *mask |= PGTBL_P4D_MODIFIED;
                else if (vmap_pud_range(entry, addr, stop, phys_addr, prot,
                                        max_page_shift, mask))
                        return -ENOMEM;
        } while (entry++, phys_addr += (stop - addr), addr = stop, addr != end);

        return 0;
}

/*
 * Map the kernel virtual range [addr, end) to the physical range starting
 * at @phys_addr with protection @prot, using page sizes up to
 * @max_page_shift. No cache flush is done here.
 *
 * May sleep. @addr must be below @end (BUG otherwise). If any modified
 * level is covered by ARCH_PAGE_TABLE_SYNC_MASK, arch_sync_kernel_mappings()
 * is called for the whole original range, on failure as well as on success.
 *
 * Returns 0 on success or the error reported by vmap_p4d_range().
 */
static int vmap_range_noflush(unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift)
{
        const unsigned long first = addr;
        pgtbl_mod_mask modified = 0;
        unsigned long stop;
        pgd_t *entry;
        int ret;

        might_sleep();
        BUG_ON(addr >= end);

        entry = pgd_offset_k(addr);
        for (;;) {
                stop = pgd_addr_end(addr, end);
                ret = vmap_p4d_range(entry, addr, stop, phys_addr, prot,
                                     max_page_shift, &modified);
                /* Stop on the first error or once the last step is mapped. */
                if (ret || stop == end)
                        break;

                entry++;
                phys_addr += stop - addr;
                addr = stop;
        }

        if (modified & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(first, end);

        return ret;
}

/*
 * Map [addr, end) to the physical range starting at @phys_addr.
 *
 * The mapping is created with pgprot_nx(@prot) and page sizes up to
 * ioremap_max_page_shift. flush_cache_vmap() is called whether or not the
 * mapping succeeded. On success kmsan_ioremap_page_range() is then invoked
 * with the unmodified @prot and its result is returned.
 *
 * Returns 0 on success, -errno on failure.
 */
int vmap_page_range(unsigned long addr, unsigned long end,
                    phys_addr_t phys_addr, pgprot_t prot)
{
        int ret = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
                                     ioremap_max_page_shift);

        flush_cache_vmap(addr, end);
        if (ret)
                return ret;

        return kmsan_ioremap_page_range(addr, end, phys_addr, prot,
                                        ioremap_max_page_shift);
}

/*
 * ioremap_page_range - map a physical range into a VM_IOREMAP area
 * @addr: start of the virtual range; must be the start of a vm area
 * @end: end of the virtual range (non-inclusive); must be the end of that
 *       area as given by get_vm_area_size()
 * @phys_addr: physical address to map
 * @prot: page protection to use
 *
 * The request must cover exactly one vm area that is flagged VM_IOREMAP.
 *
 * Returns -EINVAL (with a one-time warning) if no area is found at @addr or
 * it is not VM_IOREMAP, -ERANGE (with a one-time warning) if the request
 * does not match the area bounds, otherwise the result of vmap_page_range().
 */
int ioremap_page_range(unsigned long addr, unsigned long end,
                phys_addr_t phys_addr, pgprot_t prot)
{
        struct vm_struct *area;
        unsigned long area_start, area_end;

        area = find_vm_area((void *)addr);
        if (!area || !(area->flags & VM_IOREMAP)) {
                WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr);
                return -EINVAL;
        }

        /*
         * Compute the bounds once as unsigned long so that the comparison
         * and the %lx conversions below all use the matching type.
         */
        area_start = (unsigned long)area->addr;
        area_end = area_start + get_vm_area_size(area);
        if (addr != area_start || end != area_end) {
                WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n",
                          addr, end, area_start, area_end);
                return -ERANGE;
        }
        return vmap_page_range(addr, end, phys_addr, prot);
}

/*
 * Clear every PTE covering [addr, end) below @pmd, one PAGE_SIZE step at a
 * time, and OR PGTBL_PTE_MODIFIED into @mask.
 *
 * A warning is raised for any cleared entry that was neither empty nor
 * present.
 */
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        pte_t *slot = pte_offset_kernel(pmd, addr);

        for (;;) {
                pte_t old = ptep_get_and_clear(&init_mm, addr, slot);

                WARN_ON(!pte_none(old) && !pte_present(old));

                addr += PAGE_SIZE;
                if (addr == end)
                        break;
                slot++;
        }

        *mask |= PGTBL_PTE_MODIFIED;
}

/*
 * Unmap [addr, end) at the PMD level below @pud.
 *
 * For each step pmd_clear_huge() is tried first. PGTBL_PMD_MODIFIED is
 * OR-ed into @mask when that call returns non-zero or when the entry is
 * pmd_bad(). If nothing was cleared as huge and pmd_none_or_clear_bad()
 * does not reject the entry, the PTE level is unmapped and cond_resched()
 * is called.
 */
static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        pmd_t *entry = pmd_offset(pud, addr);
        unsigned long stop;

        do {
                int huge_cleared;

                stop = pmd_addr_end(addr, end);

                huge_cleared = pmd_clear_huge(entry);
                if (huge_cleared || pmd_bad(*entry))
                        *mask |= PGTBL_PMD_MODIFIED;

                if (!huge_cleared && !pmd_none_or_clear_bad(entry)) {
                        vunmap_pte_range(entry, addr, stop, mask);

                        cond_resched();
                }
        } while (entry++, addr = stop, addr != end);
}

/*
 * Unmap [addr, end) at the PUD level below @p4d.
 *
 * For each step pud_clear_huge() is tried first. PGTBL_PUD_MODIFIED is
 * OR-ed into @mask when that call returns non-zero or when the entry is
 * pud_bad(). If nothing was cleared as huge and pud_none_or_clear_bad()
 * does not reject the entry, the walk descends into vunmap_pmd_range().
 */
static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        pud_t *entry = pud_offset(p4d, addr);
        unsigned long stop;

        do {
                int huge_cleared;

                stop = pud_addr_end(addr, end);

                huge_cleared = pud_clear_huge(entry);
                if (huge_cleared || pud_bad(*entry))
                        *mask |= PGTBL_PUD_MODIFIED;

                if (!huge_cleared && !pud_none_or_clear_bad(entry))
                        vunmap_pmd_range(entry, addr, stop, mask);
        } while (entry++, addr = stop, addr != end);
}

/*
 * Unmap [addr, end) at the P4D level below @pgd.
 *
 * For each step p4d_clear_huge() is called (its result is not used), then
 * PGTBL_P4D_MODIFIED is OR-ed into @mask if the entry is p4d_bad(). Unless
 * p4d_none_or_clear_bad() rejects the entry, the walk descends into
 * vunmap_pud_range().
 */
static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        p4d_t *entry = p4d_offset(pgd, addr);
        unsigned long stop;

        do {
                stop = p4d_addr_end(addr, end);

                p4d_clear_huge(entry);
                if (p4d_bad(*entry))
                        *mask |= PGTBL_P4D_MODIFIED;

                if (!p4d_none_or_clear_bad(entry))
                        vunmap_pud_range(entry, addr, stop, mask);
        } while (entry++, addr = stop, addr != end);
}

/*
 * __vunmap_range_noflush is similar to vunmap_range, but does not
 * flush caches or TLBs.
 *
 * The caller is responsible for calling flush_cache_vunmap() before calling
 * this function, and flush_tlb_kernel_range after it has returned
 * successfully (and before the addresses are expected to cause a page fault
 * or be re-mapped for something else, if TLB flushes are being delayed or
 * coalesced).
 *
 * @start must be below @end (BUG otherwise). If any modified page table
 * level is covered by ARCH_PAGE_TABLE_SYNC_MASK, arch_sync_kernel_mappings()
 * is called for the whole range once the walk is done.
 *
 * This is an internal function only. Do not use outside mm/.
 */
void __vunmap_range_noflush(unsigned long start, unsigned long end)
{
        unsigned long next;
        pgd_t *pgd;
        unsigned long addr = start;
        pgtbl_mod_mask mask = 0;

        BUG_ON(addr >= end);
        pgd = pgd_offset_k(addr);
        do {
                next = pgd_addr_end(addr, end);
                /* Record the bad entry before pgd_none_or_clear_bad() handles it. */
                if (pgd_bad(*pgd))
                        mask |= PGTBL_PGD_MODIFIED;
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                vunmap_p4d_range(pgd, addr, next, &mask);
        } while (pgd++, addr = next, addr != end);

        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, end);
}

/*
 * Unmap [start, end) without flushing caches or TLBs: calls
 * kmsan_vunmap_range_noflush() for the range first and then
 * __vunmap_range_noflush(), which clears the page table entries.
 */
void vunmap_range_noflush(unsigned long start, unsigned long end)
{
        kmsan_vunmap_range_noflush(start, end);
        __vunmap_range_noflush(start, end);
}

/**
 * vunmap_range - unmap kernel virtual addresses
 * @addr: start of the VM area to unmap
 * @end: end of the VM area to unmap (non-inclusive)
 *
 * Clears any present PTEs in the virtual address range, flushes TLBs and
 * caches. Any subsequent access to the address before it has been re-mapped
 * is a kernel bug.
 *
 * The order is: flush_cache_vunmap(), then the page table teardown via
 * vunmap_range_noflush(), then flush_tlb_kernel_range().
 */
void vunmap_range(unsigned long addr, unsigned long end)
{
        flush_cache_vunmap(addr, end);
        vunmap_range_noflush(addr, end);
        flush_tlb_kernel_range(addr, end);
}

/*
 * Map consecutive entries of @pages, starting at index *@nr, into the PTEs
 * covering [addr, end) below @pmd with protection @prot.
 *
 * Returns 0 on success and ORs PGTBL_PTE_MODIFIED into @mask. Failures,
 * each with a warning except the allocation failure:
 *   -ENOMEM  the PTE table could not be allocated, or a @pages entry is NULL
 *   -EBUSY   a PTE in the range is already populated
 *   -EINVAL  a page's pfn is not valid
 * PTEs written before a failure are left in place and *@nr is not rewound.
 */
static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        pte_t *pte;

        /*
         * nr is a running index into the array which helps higher level
         * callers keep track of where we're up to.
         */

        pte = pte_alloc_kernel_track(pmd, addr, mask);
        if (!pte)
                return -ENOMEM;
        do {
                struct page *page = pages[*nr];

                if (WARN_ON(!pte_none(ptep_get(pte))))
                        return -EBUSY;
                if (WARN_ON(!page))
                        return -ENOMEM;
                if (WARN_ON(!pfn_valid(page_to_pfn(page))))
                        return -EINVAL;

                set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
                (*nr)++;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        *mask |= PGTBL_PTE_MODIFIED;
        return 0;
}

/*
 * Map entries of @pages (continuing at index *@nr) into [addr, end) at the
 * PMD level below @pud, one PMD-sized step at a time.
 *
 * Returns 0 on success. Returns -ENOMEM if the PMD table cannot be
 * allocated or if vmap_pages_pte_range() fails for any reason; the
 * lower-level error code is not passed on.
 */
static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        pmd_t *entry = pmd_alloc_track(&init_mm, pud, addr, mask);

        if (!entry)
                return -ENOMEM;

        for (;;) {
                unsigned long stop = pmd_addr_end(addr, end);

                if (vmap_pages_pte_range(entry, addr, stop, prot, pages, nr, mask))
                        return -ENOMEM;
                if (stop == end)
                        break;

                entry++;
                addr = stop;
        }

        return 0;
}

/*
 * Map entries of @pages (continuing at index *@nr) into [addr, end) at the
 * PUD level below @p4d, one PUD-sized step at a time.
 *
 * Returns 0 on success. Returns -ENOMEM if the PUD table cannot be
 * allocated or if vmap_pages_pmd_range() reports a failure.
 */
static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        pud_t *entry = pud_alloc_track(&init_mm, p4d, addr, mask);

        if (!entry)
                return -ENOMEM;

        for (;;) {
                unsigned long stop = pud_addr_end(addr, end);

                if (vmap_pages_pmd_range(entry, addr, stop, prot, pages, nr, mask))
                        return -ENOMEM;
                if (stop == end)
                        break;

                entry++;
                addr = stop;
        }

        return 0;
}

/*
 * Map entries of @pages (continuing at index *@nr) into [addr, end) at the
 * P4D level below @pgd, one P4D-sized step at a time.
 *
 * Returns 0 on success. Returns -ENOMEM if the P4D table cannot be
 * allocated or if vmap_pages_pud_range() reports a failure.
 */
static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        p4d_t *entry = p4d_alloc_track(&init_mm, pgd, addr, mask);

        if (!entry)
                return -ENOMEM;

        for (;;) {
                unsigned long stop = p4d_addr_end(addr, end);

                if (vmap_pages_pud_range(entry, addr, stop, prot, pages, nr, mask))
                        return -ENOMEM;
                if (stop == end)
                        break;

                entry++;
                addr = stop;
        }

        return 0;
}

/*
 * Map @pages, one PAGE_SIZE page per PTE, into the kernel virtual range
 * [addr, end) with protection @prot. No cache flush is done here.
 *
 * @addr must be below @end (BUG otherwise).
 *
 * Page table levels may already have been allocated or modified when a
 * lower level fails part-way through. The walk therefore breaks out of the
 * loop instead of returning directly, so that arch_sync_kernel_mappings()
 * still runs for whatever was changed, as vmap_range_noflush() does.
 *
 * Returns 0 on success or the error reported by vmap_pages_p4d_range().
 */
static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages)
{
        unsigned long start = addr;
        pgd_t *pgd;
        unsigned long next;
        int err = 0;
        int nr = 0;     /* running index into @pages, advanced by the PTE level */
        pgtbl_mod_mask mask = 0;

        BUG_ON(addr >= end);
        pgd = pgd_offset_k(addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_bad(*pgd))
                        mask |= PGTBL_PGD_MODIFIED;
                err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        /* Sync on the error path too: earlier steps may have changed tables. */
        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, end);

        return err;
}

/*
 * __vmap_pages_range_noflush is similar to vmap_pages_range, but does not
 * flush caches.
 *
 * The caller is responsible for calling flush_cache_vmap() after this
 * function returns successfully and before the addresses are accessed.
 *
 * When huge vmalloc is not configured, or @page_shift equals PAGE_SHIFT,
 * the whole range is mapped with base pages. Otherwise the range is mapped
 * in chunks of 1 << @page_shift bytes, each starting at the physical address
 * of the first page of that chunk in @pages.
 *
 * A @page_shift below PAGE_SHIFT only triggers a warning.
 *
 * Returns 0 on success, -errno from the first failing step otherwise.
 *
 * This is an internal function only. Do not use outside mm/.
 */
int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        unsigned int total = (end - addr) >> PAGE_SHIFT;
        unsigned int idx = 0;

        WARN_ON(page_shift < PAGE_SHIFT);

        if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
            page_shift == PAGE_SHIFT)
                return vmap_small_pages_range_noflush(addr, end, prot, pages);

        while (idx < total) {
                unsigned long chunk_end = addr + (1UL << page_shift);
                int ret;

                ret = vmap_range_noflush(addr, chunk_end,
                                         page_to_phys(pages[idx]), prot,
                                         page_shift);
                if (ret)
                        return ret;

                addr = chunk_end;
                /* Skip the base pages covered by the chunk just mapped. */
                idx += 1U << (page_shift - PAGE_SHIFT);
        }

        return 0;
}

/*
 * Map @pages into [addr, end) without flushing caches.
 *
 * kmsan_vmap_pages_range_noflush() is called first with the same arguments;
 * if it returns non-zero that value is returned and nothing else is done.
 * Otherwise the result of __vmap_pages_range_noflush() is returned.
 */
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        int ret;

        ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
                                             page_shift);
        if (!ret)
                ret = __vmap_pages_range_noflush(addr, end, prot, pages,
                                                 page_shift);

        return ret;
}

/**
 * vmap_pages_range - map pages to a kernel virtual address
 * @addr: start of the VM area to map
 * @end: end of the VM area to map (non-inclusive)
 * @prot: page protection flags to use
 * @pages: pages to map (always PAGE_SIZE pages)
 * @page_shift: maximum shift that the pages may be mapped with, @pages must
 * be aligned and contiguous up to at least this shift.
 *
 * flush_cache_vmap() is called for the range whether or not the mapping
 * step succeeded.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
static int vmap_pages_range(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        int err;

        err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
        flush_cache_vmap(addr, end);
        return err;
}

/*
 * Validate a request to map or unmap [start, end) inside a sparse vm area.
 * May sleep.
 *
 * Returns 0 when the request is acceptable, otherwise:
 *   -EINVAL  @area has VM_FLUSH_RESET_PERMS or VM_NO_GUARD set, or lacks
 *            VM_SPARSE (each case warns once)
 *   -E2BIG   the range spans more pages than totalram_pages()
 *   -ERANGE  the range is not fully inside
 *            [area->addr, area->addr + get_vm_area_size(area))
 */
static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
                                unsigned long end)
{
        might_sleep();
        if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS))
                return -EINVAL;
        if (WARN_ON_ONCE(area->flags & VM_NO_GUARD))
                return -EINVAL;
        if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
                return -EINVAL;
        if ((end - start) >> PAGE_SHIFT > totalram_pages())
                return -E2BIG;
        if (start < (unsigned long)area->addr ||
            (void *)end > area->addr + get_vm_area_size(area))
                return -ERANGE;
        return 0;
}

/**
 * vm_area_map_pages - map pages inside given sparse vm_area
 * @area: vm_area
 * @start: start address inside vm_area
 * @end: end address inside vm_area
 * @pages: pages to map (always PAGE_SIZE pages)
 *
 * The request is validated with check_sparse_vm_area() and then mapped with
 * PAGE_KERNEL protection using base pages.
 *
 * Return: 0 on success, -errno from the validation or the mapping step.
 */
int vm_area_map_pages(struct vm_struct *area, unsigned long start,
                      unsigned long end, struct page **pages)
{
        int ret = check_sparse_vm_area(area, start, end);

        if (!ret)
                ret = vmap_pages_range(start, end, PAGE_KERNEL, pages,
                                       PAGE_SHIFT);

        return ret;
}

/**
 * vm_area_unmap_pages - unmap pages inside given sparse vm_area
 * @area: vm_area
 * @start: start address inside vm_area
 * @end: end address inside vm_area
 *
 * Nothing is unmapped, and no error is reported to the caller, when
 * check_sparse_vm_area() rejects the request.
 */
void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
                         unsigned long end)
{
        if (!check_sparse_vm_area(area, start, end))
                vunmap_range(start, end);
}

/*
 * Test whether @x is a module-space or vmalloc-space address.
 *
 * Returns 1 when CONFIG_EXECMEM and MODULES_VADDR are both defined and the
 * tag-reset address lies in [MODULES_VADDR, MODULES_END); otherwise returns
 * the result of is_vmalloc_addr().
 */
int is_vmalloc_or_module_addr(const void *x)
{
        /*
         * ARM, x86-64 and sparc64 put modules in a special place,
         * and fall back on vmalloc() if that fails. Others
         * just put it in the vmalloc space.
         */
#if defined(CONFIG_EXECMEM) && defined(MODULES_VADDR)
        unsigned long addr = (unsigned long)kasan_reset_tag(x);
        if (addr >= MODULES_VADDR && addr < MODULES_END)
                return 1;
#endif
        return is_vmalloc_addr(x);
}
EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);

/*
 * Walk a vmap address to the struct page it maps. Huge vmap mappings will
 * return the tail page that corresponds to the base page address, which
 * matches small vmap mappings.
 *
 * The kernel page tables are walked from the PGD downwards. NULL is
 * returned when an entry on the way is none, when a level is "bad" or the
 * PGD is a leaf (both warn once), or when the final PTE is not present.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
        unsigned long addr = (unsigned long) vmalloc_addr;
        struct page *page = NULL;
        pgd_t *pgd = pgd_offset_k(addr);
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *ptep, pte;

        /*
         * XXX we might need to change this if we add VIRTUAL_BUG_ON for
         * architectures that do not vmalloc module space
         */
        VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

        if (pgd_none(*pgd))
                return NULL;
        if (WARN_ON_ONCE(pgd_leaf(*pgd)))
                return NULL; /* XXX: no allowance for huge pgd */
        if (WARN_ON_ONCE(pgd_bad(*pgd)))
                return NULL;

        p4d = p4d_offset(pgd, addr);
        if (p4d_none(*p4d))
                return NULL;
        /* Huge mapping: head page of the entry plus the page offset within it. */
        if (p4d_leaf(*p4d))
                return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
        if (WARN_ON_ONCE(p4d_bad(*p4d)))
                return NULL;

        pud = pud_offset(p4d, addr);
        if (pud_none(*pud))
                return NULL;
        /* Same leaf handling at the PUD level. */
        if (pud_leaf(*pud))
                return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
        if (WARN_ON_ONCE(pud_bad(*pud)))
                return NULL;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return NULL;
        /* Same leaf handling at the PMD level. */
        if (pmd_leaf(*pmd))
                return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        if (WARN_ON_ONCE(pmd_bad(*pmd)))
                return NULL;

        ptep = pte_offset_kernel(pmd, addr);
        pte = ptep_get(ptep);
        /* page stays NULL when the PTE is not present. */
        if (pte_present(pte))
                page = pte_page(pte);

        return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 *
 * The page is looked up with vmalloc_to_page() and converted with
 * page_to_pfn().
 *
 * NOTE(review): vmalloc_to_page() can return NULL and the result is not
 * checked here, so this looks like it expects a mapped address - confirm
 * against callers.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
        struct page *page = vmalloc_to_page(vmalloc_addr);

        return page_to_pfn(page);
}
EXPORT_SYMBOL(vmalloc_to_pfn);


/*** Global kva allocator ***/

/*
 * Debug switches, both disabled here (0). The checks they gate are outside
 * this part of the file.
 */
#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0


/*
 * NOTE(review): presumably free_vmap_area_lock protects free_vmap_area_root
 * and free_vmap_area_list below - confirm against their users.
 */
static DEFINE_SPINLOCK(free_vmap_area_lock);
static bool vmap_initialized __read_mostly;

/*
 * This kmem_cache is used for vmap_area objects. Instead of
 * allocating from slab we reuse an object from this cache to
 * make things faster. Especially in "no edge" splitting of
 * free block.
 */
static struct kmem_cache *vmap_area_cachep;

/*
 * This linked list is used in pair with free_vmap_area_root.
 * It gives O(1) access to prev/next to perform fast coalescing.
 */
static LIST_HEAD(free_vmap_area_list);

/*
 * This augmented red-black tree represents the free vmap space.
 * All vmap_area objects in this tree are sorted by va->va_start
 * address. It is used for allocation and merging when a vmap
 * object is released.
 *
 * Each vmap_area node contains a maximum available free block
 * of its sub-tree, right or left. Therefore it is possible to
 * find a lowest match of free area.
 */
static struct rb_root free_vmap_area_root = RB_ROOT;

/*
 * Preload a CPU with one object for "no edge" split case. The
 * aim is to get rid of allocations from the atomic context, thus
 * to use more permissive allocation masks.
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);

/*
 * This structure defines a single, solid model where a list and
 * rb-tree are part of one entity protected by the lock. Nodes are
 * sorted in ascending order, thus for O(1) access to left/right
 * neighbors a list is used as well as for sequential traversal.
 *
 * @root: rb-tree of the vmap_area objects.
 * @head: list of the same objects, kept in address order.
 * @lock: spinlock protecting both @root and @head.
 */
struct rb_list {
        struct rb_root root;
        struct list_head head;
        spinlock_t lock;
};

/*
 * A fast size storage contains VAs of up to MAX_VA_SIZE_PAGES pages
 * (1M with 4K pages). A pool consists of ready-to-go VAs of one
 * particular size linked to each other.
 *
 * The index in the pool-array is (size - 1) / PAGE_SIZE, i.e. the
 * number of pages minus one, see size_to_va_pool().
 */
#define MAX_VA_SIZE_PAGES 256

/*
 * @head: list of the VAs kept in this pool (linked via va->list).
 * @len:  number of VAs on @head; node_pool_add_va() updates it with
 *        WRITE_ONCE() under the node's pool_lock.
 */
struct vmap_pool {
        struct list_head head;
        unsigned long len;
};

/*
 * An effective vmap-node logic. Users make use of nodes instead
 * of a global heap. It allows to balance an access and mitigate
 * contention.
 *
 * "single" is the statically allocated node that vmap_nodes points
 * to initially.
 */
static struct vmap_node {
        /*
         * Simple size segregated storage. pool[] is indexed by
         * size_to_va_pool(); pool_lock is taken by node_pool_add_va()
         * when a VA is added to a pool.
         * NOTE(review): the use of skip_populate is not visible in this
         * part of the file - confirm against its users.
         */
        struct vmap_pool pool[MAX_VA_SIZE_PAGES];
        spinlock_t pool_lock;
        bool skip_populate;

        /*
         * Bookkeeping data of this node. "busy" holds the VAs that are
         * currently allocated (see free_vmap_area()); "lazy" presumably
         * holds lazily freed VAs - TODO confirm against its users.
         */
        struct rb_list busy;
        struct rb_list lazy;

        /*
         * Ready-to-free areas.
         */
        struct list_head purge_list;
        struct work_struct purge_work;
        unsigned long nr_purged;
} single;

/*
 * Initial setup consists of one single node, i.e. a balancing
 * is fully disabled. Later on, after vmap is initialized these
 * parameters are updated based on a system capacity.
 *
 * With the initial values (one node, zone size of 1) every address
 * maps to node 0, see addr_to_node_id().
 */
static struct vmap_node *vmap_nodes = &single;
static __read_mostly unsigned int nr_vmap_nodes = 1;
static __read_mostly unsigned int vmap_zone_size = 1;

/*
 * Map an address to a node index: the address space is cut into zones of
 * vmap_zone_size bytes and the zones are spread over the nodes round-robin.
 */
static inline unsigned int
addr_to_node_id(unsigned long addr)
{
        unsigned long zone = addr / vmap_zone_size;

        return zone % nr_vmap_nodes;
}

/* Return the vmap node an address belongs to, see addr_to_node_id(). */
static inline struct vmap_node *
addr_to_node(unsigned long addr)
{
        unsigned int id = addr_to_node_id(addr);

        return vmap_nodes + id;
}

/*
 * Return the vmap node for a node-id. The id is wrapped modulo
 * nr_vmap_nodes, so any value yields a valid node.
 */
static inline struct vmap_node *
id_to_node(unsigned int id)
{
        unsigned int idx = id % nr_vmap_nodes;

        return vmap_nodes + idx;
}

/*
 * Encode a node-id so that it can be stored next to other data.
 *
 * The value 0 means "no node", therefore the node-id is incremented
 * by one before it is shifted left by BITS_PER_BYTE. The result is
 * always greater than 0 for a valid node-id, i.e. one within
 * [0:nr_vmap_nodes - 1]. For an invalid node-id a warning is issued
 * once and 0 is returned.
 */
static unsigned int
encode_vn_id(unsigned int node_id)
{
        if (node_id >= nr_vmap_nodes) {
                /* Warn and no node encoded. */
                WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id);
                return 0;
        }

        return (node_id + 1) << BITS_PER_BYTE;
}

/*
 * Decode a value produced by encode_vn_id() back into a node-id.
 *
 * Returns the decoded node-id when it is within [0:nr_vmap_nodes - 1].
 * Otherwise nr_vmap_nodes is returned, which is_vn_id_valid() rejects.
 */
static unsigned int
decode_vn_id(unsigned int val)
{
        unsigned int node_id = (val >> BITS_PER_BYTE) - 1;

        if (node_id < nr_vmap_nodes)
                return node_id;

        /*
         * An encoded "no node" (zero) wraps around to UINT_MAX here and
         * is expected, anything else is a corrupted value. node_id is
         * unsigned, so it is printed with %u as encode_vn_id() does.
         */
        WARN_ONCE(node_id != UINT_MAX,
                "Decode wrong node id (%u)\n", node_id);

        return nr_vmap_nodes;
}

/* A node-id is valid when it is within [0:nr_vmap_nodes - 1]. */
static bool
is_vn_id_valid(unsigned int node_id)
{
        return node_id < nr_vmap_nodes;
}

/* Size of the range covered by @va, i.e. va_end - va_start. */
static __always_inline unsigned long
va_size(struct vmap_area *va)
{
        unsigned long start = va->va_start;
        unsigned long end = va->va_end;

        return end - start;
}

/*
 * Return the subtree_max_size stored in the vmap_area that embeds @node,
 * or 0 when @node is NULL (no child).
 */
static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
        if (!node)
                return 0;

        return rb_entry(node, struct vmap_area, rb_node)->subtree_max_size;
}

/*
 * Instantiate the augmented rb-tree callbacks for the free tree under the
 * name free_vmap_area_rb_augment_cb. They operate on struct vmap_area via
 * its rb_node member, store the augmented value in subtree_max_size and
 * use va_size() to compute the per-node value. This file passes
 * &free_vmap_area_rb_augment_cb to rb_insert_augmented() and
 * rb_erase_augmented(), and calls free_vmap_area_rb_augment_cb_propagate().
 */
RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
        struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)

/* Forward declarations and work/notifier objects defined later in the file. */
static void reclaim_and_purge_vmap_areas(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static void drain_vmap_area_work(struct work_struct *work);
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);

/* Page counter reported by vmalloc_nr_pages(). */
static atomic_long_t nr_vmalloc_pages;

/* Return the current value of the nr_vmalloc_pages counter. */
unsigned long vmalloc_nr_pages(void)
{
        long nr = atomic_long_read(&nr_vmalloc_pages);

        return nr;
}

/*
 * Look up the VA in @root whose range [va_start, va_end) contains @addr.
 * The KASAN tag is stripped from @addr first. Returns NULL if there is
 * no such VA.
 */
static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
{
        struct vmap_area *cur;
        struct rb_node *node;

        addr = (unsigned long)kasan_reset_tag((void *)addr);

        for (node = root->rb_node; node; ) {
                cur = rb_entry(node, struct vmap_area, rb_node);

                if (addr >= cur->va_start && addr < cur->va_end)
                        return cur;

                /* Not in this VA: descend towards the side @addr lies on. */
                node = (addr < cur->va_start) ? node->rb_left : node->rb_right;
        }

        return NULL;
}

/*
 * Look up the first (lowest) VA in @root which satisfies addr < va_end.
 * The KASAN tag is stripped from @addr first. Returns NULL if none.
 */
static struct vmap_area *
__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
{
        struct rb_node *node = root->rb_node;
        struct vmap_area *best = NULL;

        addr = (unsigned long)kasan_reset_tag((void *)addr);

        while (node) {
                struct vmap_area *cur;

                cur = rb_entry(node, struct vmap_area, rb_node);
                if (cur->va_end <= addr) {
                        /* Entirely below @addr, only the right side can match. */
                        node = node->rb_right;
                        continue;
                }

                /* A candidate. It is final if @addr falls inside of it. */
                best = cur;
                if (cur->va_start <= addr)
                        break;

                /* There might be a lower candidate on the left. */
                node = node->rb_left;
        }

        return best;
}

/*
 * Find, across all nodes, the busy VA with the lowest va_start that
 * satisfies addr < va_end.
 *
 * On success the node which owns that VA is returned with its busy.lock
 * held and *va points to the VA. The caller is responsible for dropping
 * the lock once the VA is no longer accessed.
 *
 * Returns NULL, without any lock held, if nothing is found.
 */
static struct vmap_node *
find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
{
        unsigned long lowest;
        struct vmap_node *vn;
        int i;

        for (;;) {
                /* Pass one: scan every node for its best candidate. */
                lowest = 0;

                for (i = 0; i < nr_vmap_nodes; i++) {
                        vn = &vmap_nodes[i];

                        spin_lock(&vn->busy.lock);
                        *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);

                        if (*va && (!lowest || (*va)->va_start < lowest))
                                lowest = (*va)->va_start;
                        spin_unlock(&vn->busy.lock);
                }

                if (!lowest)
                        return NULL;

                /*
                 * Pass two: the locks were dropped in between, so the VA
                 * might have gone away. Look it up again under the lock of
                 * its node. If it was removed concurrently, which is a rare
                 * case, start over to proceed to the next one.
                 */
                vn = addr_to_node(lowest);

                spin_lock(&vn->busy.lock);
                *va = __find_vmap_area(lowest, &vn->busy.root);
                if (*va)
                        return vn;

                spin_unlock(&vn->busy.lock);
        }
}

/*
 * This function returns back addresses of parent node
 * and its left or right link for further processing.
 *
 * Otherwise NULL is returned. In that case all further
 * steps regarding inserting of conflicting overlap range
 * have to be declined and actually considered as a bug.
 */
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
        struct rb_root *root, struct rb_node *from,
        struct rb_node **parent)
{
        struct vmap_area *tmp_va;
        struct rb_node **link;

        if (root) {
                link = &root->rb_node;
                if (unlikely(!*link)) {
                        *parent = NULL;
                        return link;
                }
        } else {
                link = &from;
        }

        /*
         * Go to the bottom of the tree. When we hit the last point
         * we end up with parent rb_node and correct direction, i name
         * it link, where the new va->rb_node will be attached to.
         */
        do {
                tmp_va = rb_entry(*link, struct vmap_area, rb_node);

                /*
                 * During the traversal we also do some sanity check.
                 * Trigger the BUG() if there are sides(left/right)
                 * or full overlaps.
                 */
                if (va->va_end <= tmp_va->va_start)
                        link = &(*link)->rb_left;
                else if (va->va_start >= tmp_va->va_end)
                        link = &(*link)->rb_right;
                else {
                        WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
                                va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);

                        return NULL;
                }
        } while (*link);

        *parent = &tmp_va->rb_node;
        return link;
}

/*
 * Return the list entry that would follow a node attached to @parent via
 * @link: the entry after the parent for a right link, the parent's own
 * entry otherwise. Returns NULL when there is no parent.
 */
static __always_inline struct list_head *
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
{
        struct list_head *entry;

        /*
         * No parent means the tree we look for neighbors in is empty,
         * i.e. there is no free vmap space. Normally that does not
         * happen but it is handled anyway.
         */
        if (unlikely(!parent))
                return NULL;

        entry = &rb_entry(parent, struct vmap_area, rb_node)->list;
        if (&parent->rb_right == link)
                return entry->next;

        return entry;
}

/*
 * Attach @va to the tree at @parent/@link and add it to the address
 * sorted list. With @augment set the augmented insert is used and
 * subtree_max_size of @va is reset to zero afterwards.
 */
static __always_inline void
__link_va(struct vmap_area *va, struct rb_root *root,
        struct rb_node *parent, struct rb_node **link,
        struct list_head *head, bool augment)
{
        struct list_head *prev = head;

        /*
         * VA is not in the list yet, but its future previous entry is
         * known: the parent for a right link, the entry in front of the
         * parent otherwise. Without a parent the list head is used.
         */
        if (likely(parent)) {
                prev = &rb_entry(parent, struct vmap_area, rb_node)->list;
                if (&parent->rb_right != link)
                        prev = prev->prev;
        }

        /* Insert to the rb-tree */
        rb_link_node(&va->rb_node, parent, link);
        if (!augment) {
                rb_insert_color(&va->rb_node, root);
        } else {
                /*
                 * Only a simple insertion is done here. The tree is
                 * populated from the bottom to parent levels once the
                 * node _is_ in the tree, therefore subtree_max_size is
                 * zeroed after the insertion and left for
                 * augment_tree_propagate_from() to fix up later on.
                 */
                rb_insert_augmented(&va->rb_node,
                        root, &free_vmap_area_rb_augment_cb);
                va->subtree_max_size = 0;
        }

        /* Address-sort this list */
        list_add(&va->list, prev);
}

/* Link @va into a plain (non-augmented) tree and its list, see __link_va(). */
static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
        struct rb_node *parent, struct rb_node **link,
        struct list_head *head)
{
        __link_va(va, root, parent, link, head, false);
}

/*
 * Link @va into an augmented tree and its list, see __link_va(). The
 * subtree_max_size of @va is left at zero, so the caller has to run
 * augment_tree_propagate_from() afterwards.
 */
static __always_inline void
link_va_augment(struct vmap_area *va, struct rb_root *root,
        struct rb_node *parent, struct rb_node **link,
        struct list_head *head)
{
        __link_va(va, root, parent, link, head, true);
}

/*
 * Remove @va from the tree and from its list. With @augment set the
 * augmented erase is used. A VA that is not linked into a tree triggers
 * a warning and is left untouched.
 */
static __always_inline void
__unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
{
        struct rb_node *node = &va->rb_node;

        if (WARN_ON(RB_EMPTY_NODE(node)))
                return;

        if (!augment)
                rb_erase(node, root);
        else
                rb_erase_augmented(node,
                        root, &free_vmap_area_rb_augment_cb);

        list_del_init(&va->list);

        /* Mark the node as not linked so a repeated unlink is caught above. */
        RB_CLEAR_NODE(node);
}

/* Unlink @va from a plain (non-augmented) tree and its list. */
static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
        __unlink_va(va, root, false);
}

/* Unlink @va from an augmented tree and its list. */
static __always_inline void
unlink_va_augment(struct vmap_area *va, struct rb_root *root)
{
        __unlink_va(va, root, true);
}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
/*
 * Compute what subtree_max_size of @va is expected to be: the biggest of
 * its own size and the values stored in its left and right children.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
        unsigned long left = get_subtree_max_size(va->rb_node.rb_left);
        unsigned long right = get_subtree_max_size(va->rb_node.rb_right);

        return max3(va_size(va), left, right);
}

/*
 * Debug helper: walk the free list and report every VA whose stored
 * subtree_max_size differs from the freshly computed one.
 */
static void
augment_tree_propagate_check(void)
{
        struct vmap_area *va;

        list_for_each_entry(va, &free_vmap_area_list, list) {
                unsigned long expected = compute_subtree_max_size(va);

                if (expected == va->subtree_max_size)
                        continue;

                pr_emerg("tree is corrupted: %lu, %lu\n",
                        va_size(va), va->subtree_max_size);
        }
}
#endif

/*
 * This function populates subtree_max_size from bottom to upper
 * levels starting from VA point. The propagation must be done
 * when VA size is modified by changing its va_start/va_end. Or
 * in case of newly inserting of VA to the tree.
 *
 * It means that augment_tree_propagate_from() must be called:
 * - After VA has been inserted to the tree(free path);
 * - After VA has been shrunk(allocation path);
 * - After VA has been increased(merging path).
 *
 * Please note that, it does not mean that upper parent nodes
 * and their subtree_max_size are recalculated all the time up
 * to the root node.
 *
 *       4--8
 *        /\
 *       /  \
 *      /    \
 *    2--2  8--8
 *
 * Each node above is written as "size--subtree_max_size".
 *
 * For example if we modify the node 4, shrinking it to 2, then
 * no modification is required. If we shrink the node 2 to 1
 * its subtree_max_size is updated only, and set to 1. If we shrink
 * the node 8 to 6, then its subtree_max_size is set to 6 and parent
 * node becomes 4--6.
 */
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
        /*
         * Populate the tree from bottom towards the root until
         * the calculated maximum available size of checked node
         * is equal to its current one.
         */
        free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);

        /* Optional self-check of the whole free list, off by default. */
#if DEBUG_AUGMENT_PROPAGATE_CHECK
        augment_tree_propagate_check();
#endif
}

/*
 * Insert @va into a plain (non-augmented) tree and its list. Nothing is
 * inserted if @va overlaps an existing area, find_va_links() warns then.
 */
static void
insert_vmap_area(struct vmap_area *va,
        struct rb_root *root, struct list_head *head)
{
        struct rb_node *parent;
        struct rb_node **link = find_va_links(va, root, NULL, &parent);

        if (!link)
                return;

        link_va(va, root, parent, link, head);
}

/*
 * Insert @va into an augmented tree and its list and propagate its
 * subtree_max_size upwards. The place is searched starting from the
 * @from node when it is given, from the top of @root otherwise. Nothing
 * is inserted if @va overlaps an existing area.
 */
static void
insert_vmap_area_augment(struct vmap_area *va,
        struct rb_node *from, struct rb_root *root,
        struct list_head *head)
{
        struct rb_node *parent;
        struct rb_node **link;

        if (!from)
                link = find_va_links(va, root, NULL, &parent);
        else
                link = find_va_links(va, NULL, from, &parent);

        if (!link)
                return;

        link_va_augment(va, root, parent, link, head);
        augment_tree_propagate_from(va);
}

/*
 * Merge de-allocated chunk of VA memory with previous
 * and next free blocks. If coalesce is not done a new
 * free area is inserted. If VA has been merged, it is
 * freed.
 *
 * @va:      the area to give back; it must not be used by the caller
 *           afterwards, use the returned pointer instead.
 * @root:    tree to merge/insert into.
 * @head:    list that is used in pair with @root.
 * @augment: true if @root is an augmented tree.
 *
 * Returns the area that now covers the range of @va: either @va itself
 * (inserted) or the neighbor it has been merged into.
 *
 * Please note, it can return NULL in case of overlap
 * ranges, followed by WARN() report. Despite it is a
 * buggy behaviour, a system can be alive and keep
 * ongoing.
 */
static __always_inline struct vmap_area *
__merge_or_add_vmap_area(struct vmap_area *va,
        struct rb_root *root, struct list_head *head, bool augment)
{
        struct vmap_area *sibling;
        struct list_head *next;
        struct rb_node **link;
        struct rb_node *parent;
        bool merged = false;

        /*
         * Find a place in the tree where VA potentially will be
         * inserted, unless it is merged with its sibling/siblings.
         */
        link = find_va_links(va, root, NULL, &parent);
        if (!link)
                return NULL;

        /*
         * Get next node of VA to check if merging can be done.
         * NULL means the tree is empty, so there is nothing to merge with.
         */
        next = get_va_next_sibling(parent, link);
        if (unlikely(next == NULL))
                goto insert;

        /*
         * start            end
         * |                |
         * |<------VA------>|<-----Next----->|
         *                  |                |
         *                  start            end
         */
        if (next != head) {
                sibling = list_entry(next, struct vmap_area, list);
                if (sibling->va_start == va->va_end) {
                        /* Grow "next" downwards so it covers VA as well. */
                        sibling->va_start = va->va_start;

                        /* Free vmap_area object. */
                        kmem_cache_free(vmap_area_cachep, va);

                        /* Point to the new merged area. */
                        va = sibling;
                        merged = true;
                }
        }

        /*
         * start            end
         * |                |
         * |<-----Prev----->|<------VA------>|
         *                  |                |
         *                  start            end
         */
        if (next->prev != head) {
                sibling = list_entry(next->prev, struct vmap_area, list);
                if (sibling->va_end == va->va_start) {
                        /*
                         * If both neighbors are coalesced, it is important
                         * to unlink the "next" node first, followed by merging
                         * with "previous" one. Otherwise the tree might not be
                         * fully populated if a sibling's augmented value is
                         * "normalized" because of rotation operations.
                         *
                         * At this point "va" is the "next" node if the first
                         * merge has happened.
                         */
                        if (merged)
                                __unlink_va(va, root, augment);

                        /* Grow "prev" upwards so it covers VA as well. */
                        sibling->va_end = va->va_end;

                        /* Free vmap_area object. */
                        kmem_cache_free(vmap_area_cachep, va);

                        /* Point to the new merged area. */
                        va = sibling;
                        merged = true;
                }
        }

insert:
        /* No neighbor to coalesce with, link VA as a new free area. */
        if (!merged)
                __link_va(va, root, parent, link, head, augment);

        return va;
}

/*
 * Merge or insert @va into a plain (non-augmented) tree and its list,
 * see __merge_or_add_vmap_area() for the return value.
 */
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
        struct rb_root *root, struct list_head *head)
{
        return __merge_or_add_vmap_area(va, root, head, false);
}

static __always_inline struct vmap_area *
merge_or_add_vmap_area_augment(struct vmap_area *va,
        struct rb_root *root, struct list_head *head)
{
        va = __merge_or_add_vmap_area(va, root, head, true);
        if (va)
                augment_tree_propagate_from(va);

        return va;
}

/*
 * Check whether a request of @size bytes aligned to @align and placed
 * not lower than @vstart fits into the free area @va.
 */
static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
        unsigned long align, unsigned long vstart)
{
        unsigned long lowest = (va->va_start > vstart) ? va->va_start : vstart;
        unsigned long nva_start_addr = ALIGN(lowest, align);
        unsigned long nva_end_addr = nva_start_addr + size;

        /* Can be overflowed due to big size or alignment. */
        if (nva_end_addr < nva_start_addr || nva_start_addr < vstart)
                return false;

        return nva_end_addr <= va->va_end;
}

/*
 * Find the first free block(lowest start address) in the tree,
 * that will accomplish the request corresponding to passing
 * parameters. Please note, with an alignment bigger than PAGE_SIZE,
 * a search length is adjusted to account for worst case alignment
 * overhead.
 *
 * @root:               the augmented tree of free areas to search.
 * @size:               requested size.
 * @align:              requested alignment.
 * @vstart:             lowest address the allocation may start at.
 * @adjust_search_size: if true, sub-trees are pruned using
 *                      size + align - 1 instead of size.
 *
 * Returns the matching free area or NULL if there is none.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(struct rb_root *root, unsigned long size,
        unsigned long align, unsigned long vstart, bool adjust_search_size)
{
        struct vmap_area *va;
        struct rb_node *node;
        unsigned long length;

        /* Start from the root. */
        node = root->rb_node;

        /* Adjust the search size for alignment overhead. */
        length = adjust_search_size ? size + align - 1 : size;

        while (node) {
                va = rb_entry(node, struct vmap_area, rb_node);

                /*
                 * Prefer the left sub-tree (lower addresses) as long as it
                 * holds a big enough block and is not cut off by "vstart".
                 */
                if (get_subtree_max_size(node->rb_left) >= length &&
                                vstart < va->va_start) {
                        node = node->rb_left;
                } else {
                        if (is_within_this_va(va, size, align, vstart))
                                return va;

                        /*
                         * Does not make sense to go deeper towards the right
                         * sub-tree if it does not have a free block that is
                         * equal or bigger to the requested search length.
                         */
                        if (get_subtree_max_size(node->rb_right) >= length) {
                                node = node->rb_right;
                                continue;
                        }

                        /*
                         * OK. We roll back and find the first right sub-tree,
                         * that will satisfy the search criteria. It can happen
                         * due to "vstart" restriction or an alignment overhead
                         * that is bigger than PAGE_SIZE.
                         *
                         * If the root is passed without a match, "node" becomes
                         * NULL and the outer loop terminates as well.
                         */
                        while ((node = rb_parent(node))) {
                                va = rb_entry(node, struct vmap_area, rb_node);
                                if (is_within_this_va(va, size, align, vstart))
                                        return va;

                                if (get_subtree_max_size(node->rb_right) >= length &&
                                                vstart <= va->va_start) {
                                        /*
                                         * Shift the vstart forward. Please note, we update it with
                                         * parent's start address adding "1" because we do not want
                                         * to enter same sub-tree after it has already been checked
                                         * and no suitable free block found there.
                                         */
                                        vstart = va->va_start + 1;
                                        node = node->rb_right;
                                        break;
                                }
                        }
                }
        }

        return NULL;
}

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>

/*
 * Debug helper: find the lowest matching free area by walking the list
 * linearly instead of using the tree. Returns NULL if nothing fits.
 */
static struct vmap_area *
find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
        unsigned long align, unsigned long vstart)
{
        struct vmap_area *va;

        list_for_each_entry(va, head, list) {
                if (is_within_this_va(va, size, align, vstart))
                        return va;
        }

        return NULL;
}

/*
 * Debug helper: pick a random "vstart" above VMALLOC_START and verify
 * that the tree search and the linear list search agree on the result.
 * A mismatch is reported via pr_emerg().
 */
static void
find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
                             unsigned long size, unsigned long align)
{
        struct vmap_area *tree_va, *list_va;
        unsigned long vstart;
        unsigned int rnd;

        get_random_bytes(&rnd, sizeof(rnd));
        vstart = VMALLOC_START + rnd;

        tree_va = find_vmap_lowest_match(root, size, align, vstart, false);
        list_va = find_vmap_lowest_linear_match(head, size, align, vstart);

        if (tree_va == list_va)
                return;

        pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
                tree_va, list_va, vstart);
}
#endif

/*
 * How a requested range is placed within the free area it is carved
 * from, see classify_va_fit_type() and va_clip().
 */
enum fit_type {
        NOTHING_FIT = 0,        /* the range is not within the free area */
        FL_FIT_TYPE = 1,        /* full fit */
        LE_FIT_TYPE = 2,        /* left edge fit */
        RE_FIT_TYPE = 3,        /* right edge fit */
        NE_FIT_TYPE = 4                /* no edge fit */
};

static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
        unsigned long nva_start_addr, unsigned long size)
{
        enum fit_type type;

        /* Check if it is within VA. */
        if (nva_start_addr < va->va_start ||
                        nva_start_addr + size > va->va_end)
                return NOTHING_FIT;

        /* Now classify. */
        if (va->va_start == nva_start_addr) {
                if (va->va_end == nva_start_addr + size)
                        type = FL_FIT_TYPE;
                else
                        type = LE_FIT_TYPE;
        } else if (va->va_end == nva_start_addr + size) {
                type = RE_FIT_TYPE;
        } else {
                type = NE_FIT_TYPE;
        }

        return type;
}

/*
 * Carve the range [nva_start_addr, nva_start_addr + size) out of the
 * free area @va and update the free tree/list accordingly.
 *
 * Returns 0 on success, -1 if the range does not fit into @va or if an
 * extra vmap_area object needed for a "no edge" split can not be
 * allocated.
 */
static __always_inline int
va_clip(struct rb_root *root, struct list_head *head,
                struct vmap_area *va, unsigned long nva_start_addr,
                unsigned long size)
{
        struct vmap_area *lva = NULL;

        switch (classify_va_fit_type(va, nva_start_addr, size)) {
        case FL_FIT_TYPE:
                /*
                 * No need to split VA, it fully fits.
                 *
                 * |               |
                 * V      NVA      V
                 * |---------------|
                 */
                unlink_va_augment(va, root);
                kmem_cache_free(vmap_area_cachep, va);
                return 0;

        case LE_FIT_TYPE:
                /*
                 * Split left edge of fit VA.
                 *
                 * |       |
                 * V  NVA  V   R
                 * |-------|-------|
                 */
                va->va_start += size;
                break;

        case RE_FIT_TYPE:
                /*
                 * Split right edge of fit VA.
                 *
                 *         |       |
                 *     L   V  NVA  V
                 * |-------|-------|
                 */
                va->va_end = nva_start_addr;
                break;

        case NE_FIT_TYPE:
                /*
                 * Split no edge of fit VA.
                 *
                 *     |       |
                 *   L V  NVA  V R
                 * |---|-------|---|
                 *
                 * An extra object is needed for the left remainder. Take
                 * the one this CPU has been preloaded with, if any.
                 */
                lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
                if (unlikely(!lva)) {
                        /*
                         * This CPU was not preloaded. The percpu allocator
                         * does no pre-allocation since it most likely never
                         * ends up with a NE_FIT_TYPE split: its offsets and
                         * sizes are aligned to a fixed align request, so
                         * RE_FIT_TYPE and FL_FIT_TYPE are its main cases. An
                         * exception is e.g. the first allocation (early boot
                         * up) when "one" big free space has to be split.
                         *
                         * A regular "vmap" allocation can get here as well,
                         * see the comment in alloc_vmap_area() why. That is
                         * rare; GFP_NOWAIT is used to get the extra object.
                         *
                         * If the allocation fails, an "overflow" path is
                         * triggered to purge lazily freed areas, followed by
                         * a "retry". See alloc_vmap_area() for details.
                         */
                        lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
                        if (!lva)
                                return -1;
                }

                /* Build the remainder. */
                lva->va_start = va->va_start;
                lva->va_end = nva_start_addr;

                /* Shrink this VA to remaining size. */
                va->va_start = nva_start_addr + size;
                break;

        default:
                return -1;
        }

        /* VA has been resized but is still in the tree, fix up the sizes. */
        augment_tree_propagate_from(va);

        if (lva)        /* NE_FIT_TYPE only */
                insert_vmap_area_augment(lva, &va->rb_node, root, head);

        return 0;
}

/*
 * Allocate @size bytes aligned to @align from the free area @va, not
 * lower than @vstart and not beyond @vend.
 *
 * Returns the start address of the allocated range, or @vend on failure.
 */
static unsigned long
va_alloc(struct vmap_area *va,
                struct rb_root *root, struct list_head *head,
                unsigned long size, unsigned long align,
                unsigned long vstart, unsigned long vend)
{
        unsigned long lowest = (va->va_start > vstart) ? va->va_start : vstart;
        unsigned long nva_start_addr = ALIGN(lowest, align);
        int ret;

        /* Check the "vend" restriction. */
        if (nva_start_addr + size > vend)
                return vend;

        /* Update the free vmap_area. */
        ret = va_clip(root, head, va, nva_start_addr, size);

        return WARN_ON_ONCE(ret) ? vend : nva_start_addr;
}

/*
 * Allocate @size bytes aligned to @align within [vstart:vend] from the
 * free space described by @root/@head.
 *
 * Returns a start address of the newly allocated area, if success.
 * Otherwise a vend is returned that indicates failure.
 */
static __always_inline unsigned long
__alloc_vmap_area(struct rb_root *root, struct list_head *head,
        unsigned long size, unsigned long align,
        unsigned long vstart, unsigned long vend)
{
        unsigned long nva_start_addr;
        bool adjust_search_size;
        struct vmap_area *va;

        /*
         * The search size is adjusted for alignment overhead only when
         * both of these hold:
         *   a) align > PAGE_SIZE. Otherwise it does not make any sense,
         *      all blocks(their start addresses) are at least PAGE_SIZE
         *      aligned anyway;
         *   b) the requested size does not correspond to exactly the
         *      specified [vstart:vend] interval. With an adjusted search
         *      length such a short range allocation would not succeed.
         */
        adjust_search_size = align > PAGE_SIZE && (vend - vstart) != size;

        va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
        if (unlikely(!va))
                return vend;

        nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
        if (nva_start_addr != vend)
                find_vmap_lowest_match_check(root, head, size, align);
#endif

        return nva_start_addr;
}

/*
 * Free a region of KVA allocated by alloc_vmap_area
 *
 * The area is first unlinked from the busy tree/list of the node it
 * belongs to and then merged into, or inserted to, the global free
 * tree/list. Each step runs under its own lock; the two locks are never
 * held at the same time here.
 *
 * The merge may free @va, so it must not be used after this call.
 */
static void free_vmap_area(struct vmap_area *va)
{
        struct vmap_node *vn = addr_to_node(va->va_start);

        /*
         * Remove from the busy tree/list.
         */
        spin_lock(&vn->busy.lock);
        unlink_va(va, &vn->busy.root);
        spin_unlock(&vn->busy.lock);

        /*
         * Insert/Merge it back to the free tree/list.
         */
        spin_lock(&free_vmap_area_lock);
        merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
        spin_unlock(&free_vmap_area_lock);
}

/*
 * Make sure this CPU has a spare vmap_area object cached in
 * ne_fit_preload_node and then take "lock". The function returns with
 * "lock" held.
 *
 * The spare object is needed when the fit type of a free area is
 * NE_FIT_TYPE. It is allocated before the lock is taken, i.e. in
 * non-atomic context, so a more permissive allocation mask can be used,
 * which is more stable under low memory and high memory pressure.
 */
static inline void
preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
{
        struct vmap_area *spare;

        if (this_cpu_read(ne_fit_preload_node))
                spare = NULL;
        else
                spare = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);

        spin_lock(lock);

        if (!spare)
                return;

        /* The slot got filled in the meantime, drop our extra object. */
        if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, spare))
                kmem_cache_free(vmap_area_cachep, spare);
}

/*
 * Map an area size in bytes to the pool of "vn" that caches areas of that
 * size. Pool index i holds areas of (i + 1) pages.
 *
 * Returns NULL when the size is not served by any pool, i.e. when it is
 * larger than MAX_VA_SIZE_PAGES pages.
 */
static struct vmap_pool *
size_to_va_pool(struct vmap_node *vn, unsigned long size)
{
        /*
         * Keep the index as wide as "size". A narrower type would wrap
         * for very large sizes and could alias one of the small pools.
         */
        unsigned long idx = (size - 1) / PAGE_SIZE;

        if (idx < MAX_VA_SIZE_PAGES)
                return &vn->pool[idx];

        return NULL;
}

static bool
node_pool_add_va(struct vmap_node *n, struct vmap_area *va)
{
        struct vmap_pool *vp;

        vp = size_to_va_pool(n, va_size(va));
        if (!vp)
                return false;

        spin_lock(&n->pool_lock);
        list_add(&va->list, &vp->head);
        WRITE_ONCE(vp->len, vp->len + 1);
        spin_unlock(&n->pool_lock);

        return true;
}

/*
 * Try to take a cached area of exactly "size" bytes from the pools of
 * "vn". Only the first entry of the matching pool is examined:
 *
 *   - if it is not aligned to "align" it is rotated to the tail of the
 *     pool and NULL is returned;
 *   - if it fails the sanity checks (wrong size or outside of the
 *     [vstart:vend] range) a one-time warning is emitted, the entry is
 *     left in place and NULL is returned;
 *   - otherwise it is removed from the pool and returned.
 */
static struct vmap_area *
node_pool_del_va(struct vmap_node *vn, unsigned long size,
                unsigned long align, unsigned long vstart,
                unsigned long vend)
{
        struct vmap_area *va = NULL;
        struct vmap_pool *vp;
        int mismatch;

        vp = size_to_va_pool(vn, size);
        if (!vp || list_empty(&vp->head))
                return NULL;

        spin_lock(&vn->pool_lock);

        /* Re-check under the lock, the lockless test above can race. */
        if (list_empty(&vp->head))
                goto unlock;

        va = list_first_entry(&vp->head, struct vmap_area, list);

        if (!IS_ALIGNED(va->va_start, align)) {
                list_move_tail(&va->list, &vp->head);
                va = NULL;
                goto unlock;
        }

        /* Sanity checks, any hit is reported once and rejects the area. */
        mismatch = (va_size(va) != size) |
                   (va->va_start < vstart) |
                   (va->va_end > vend);

        if (WARN_ON_ONCE(mismatch)) {
                va = NULL;
                goto unlock;
        }

        list_del_init(&va->list);
        WRITE_ONCE(vp->len, vp->len - 1);

unlock:
        spin_unlock(&vn->pool_lock);
        return va;
}

static struct vmap_area *
node_alloc(unsigned long size, unsigned long align,
                unsigned long vstart, unsigned long vend,
                unsigned long *addr, unsigned int *vn_id)
{
        struct vmap_area *va;

        *vn_id = 0;
        *addr = vend;

        /*
         * Fallback to a global heap if not vmalloc or there
         * is only one node.
         */
        if (vstart != VMALLOC_START || vend != VMALLOC_END ||
                        nr_vmap_nodes == 1)
                return NULL;

        *vn_id = raw_smp_processor_id() % nr_vmap_nodes;
        va = node_pool_del_va(id_to_node(*vn_id), size, align, vstart, vend);
        *vn_id = encode_vn_id(*vn_id);

        if (va)
                *addr = va->va_start;

        return va;
}

/*
 * Bind "vm" and "va" to each other: the vm_struct takes the address
 * range of the vmap_area together with the given flags and caller, and
 * the vmap_area gets a back pointer to the vm_struct.
 */
static inline void setup_vmalloc_vm(struct vm_struct *vm,
        struct vmap_area *va, unsigned long flags, const void *caller)
{
        /* Describe the range covered by the area. */
        vm->addr = (void *)va->va_start;
        vm->size = va->va_end - va->va_start;

        /* Record how and by whom it was requested. */
        vm->flags = flags;
        vm->caller = caller;

        va->vm = vm;
}

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend. If vm is passed in, the two will also be bound.
 *
 * The node pool of the current CPU is tried first, then the global free
 * tree. If the global allocation fails, lazily freed areas are purged and
 * the registered vmap purge notifiers are called before giving up.
 *
 * Returns the new vmap_area, already inserted into the busy tree of its
 * node, or an ERR_PTR():
 *   -EINVAL  size is zero or not page aligned, or align is not a power
 *            of two;
 *   -EBUSY   vmap is not initialized yet, or no address space was found;
 *   -ENOMEM  the vmap_area object could not be allocated;
 *   the error returned by kasan_populate_vmalloc() if that fails.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
                                unsigned long align,
                                unsigned long vstart, unsigned long vend,
                                int node, gfp_t gfp_mask,
                                unsigned long va_flags, struct vm_struct *vm)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        unsigned long freed;
        unsigned long addr;
        unsigned int vn_id;
        int purged = 0;
        int ret;

        if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
                return ERR_PTR(-EINVAL);

        if (unlikely(!vmap_initialized))
                return ERR_PTR(-EBUSY);

        might_sleep();

        /*
         * If a VA is obtained from a global heap(if it fails here)
         * it is anyway marked with this "vn_id" so it is returned
         * to this pool's node later. Such way gives a possibility
         * to populate pools based on users demand.
         *
         * On success a ready to go VA is returned.
         */
        va = node_alloc(size, align, vstart, vend, &addr, &vn_id);
        if (!va) {
                /* No cached area: a fresh vmap_area object is needed. */
                gfp_mask = gfp_mask & GFP_RECLAIM_MASK;

                va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
                if (unlikely(!va))
                        return ERR_PTR(-ENOMEM);

                /*
                 * Only scan the relevant parts containing pointers to other objects
                 * to avoid false negatives.
                 */
                kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
        }

retry:
        /*
         * "addr == vend" means no address has been obtained yet, either
         * because the node pool was empty or because a previous attempt
         * on the global free tree failed.
         */
        if (addr == vend) {
                /* Takes free_vmap_area_lock, released right below. */
                preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
                addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
                        size, align, vstart, vend);
                spin_unlock(&free_vmap_area_lock);
        }

        trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);

        /*
         * If an allocation fails, the "vend" address is
         * returned. Therefore trigger the overflow path.
         */
        if (unlikely(addr == vend))
                goto overflow;

        va->va_start = addr;
        va->va_end = addr + size;
        va->vm = NULL;
        /* The node id is kept in the flags so the VA can go back to it. */
        va->flags = (va_flags | vn_id);

        if (vm) {
                vm->addr = (void *)va->va_start;
                vm->size = va->va_end - va->va_start;
                va->vm = vm;
        }

        /* The busy tree is selected by address, not by "vn_id". */
        vn = addr_to_node(va->va_start);

        spin_lock(&vn->busy.lock);
        insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
        spin_unlock(&vn->busy.lock);

        BUG_ON(!IS_ALIGNED(va->va_start, align));
        BUG_ON(va->va_start < vstart);
        BUG_ON(va->va_end > vend);

        ret = kasan_populate_vmalloc(addr, size);
        if (ret) {
                /* Undo the busy-tree insertion and return the range. */
                free_vmap_area(va);
                return ERR_PTR(ret);
        }

        return va;

overflow:
        /* First failure: purge lazily freed areas and try again. */
        if (!purged) {
                reclaim_and_purge_vmap_areas();
                purged = 1;
                goto retry;
        }

        /*
         * Second failure: ask the registered notifiers to release
         * something. If any of them reports progress via "freed", start
         * over including a new purge.
         */
        freed = 0;
        blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);

        if (freed > 0) {
                purged = 0;
                goto retry;
        }

        if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
                pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
                        size);

        kmem_cache_free(vmap_area_cachep, va);
        return ERR_PTR(-EBUSY);
}

/*
 * Add "nb" to the vmap_notify_list chain, which alloc_vmap_area() calls
 * when it runs out of address space.
 *
 * Returns the result of blocking_notifier_chain_register().
 */
int register_vmap_purge_notifier(struct notifier_block *nb)
{
        int ret;

        ret = blocking_notifier_chain_register(&vmap_notify_list, nb);

        return ret;
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);

/*
 * Remove "nb" from the vmap_notify_list chain.
 *
 * Returns the result of blocking_notifier_chain_unregister().
 */
int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
        int ret;

        ret = blocking_notifier_chain_unregister(&vmap_notify_list, nb);

        return ret;
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);

/*
 * lazy_max_pages is the maximum amount of virtual address space, in
 * pages, that is gathered up before a purge with a TLB flush is
 * attempted.
 *
 * The value is a tradeoff. A larger number covers more kernel page
 * tables and takes slightly longer to purge, but it linearly reduces the
 * number of global TLB flushes. Scaling it linearly with the number of
 * CPUs would seem natural, since vmap activity could scale that way too,
 * but in practice workloads are likely constrained in other ways. To stay
 * conservative and not introduce a big latency on huge systems, a less
 * aggressive log scale is used: 32MB worth of pages per fls() step of
 * the number of online CPUs. The scale factor is simple to change should
 * it become a problem on bigger systems.
 */
static unsigned long lazy_max_pages(void)
{
        const unsigned long pages_per_step = 32UL * 1024 * 1024 / PAGE_SIZE;
        unsigned int steps = fls(num_online_cpus());

        return steps * pages_per_step;
}

/*
 * Number of pages sitting in lazily freed areas. Increased by
 * free_vmap_area_noflush() and decreased by purge_vmap_node().
 */
static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);

/*
 * Serialize vmap purging.  There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make the pcpu_get_vm_areas more deterministic.
 */
static DEFINE_MUTEX(vmap_purge_lock);

/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
/*
 * Bitmask of vmap node indexes that have something to purge. It is
 * rebuilt by __purge_vmap_area_lazy() on every invocation; a cpumask is
 * used as the container, the bits are node indexes.
 */
static cpumask_t purge_nodes;

/*
 * Give every area on "head" back to the global free tree/list, merging
 * with neighbours where possible. An empty list is a no-op and does not
 * take free_vmap_area_lock.
 */
static void
reclaim_list_global(struct list_head *head)
{
        struct vmap_area *cur, *next;

        if (!list_empty(head)) {
                spin_lock(&free_vmap_area_lock);

                list_for_each_entry_safe(cur, next, head, list)
                        merge_or_add_vmap_area_augment(cur,
                                &free_vmap_area_root, &free_vmap_area_list);

                spin_unlock(&free_vmap_area_lock);
        }
}

/*
 * Shrink the per-size pools of "vn" and hand the removed areas back to
 * the global free tree/list.
 *
 * If "full_decay" is true every pool is emptied. Otherwise each pool
 * loses len / 4 of its objects, rounded down, so pools holding fewer
 * than four objects are left untouched.
 */
static void
decay_va_pool_node(struct vmap_node *vn, bool full_decay)
{
        struct vmap_area *va, *nva;
        struct list_head decay_list;
        struct rb_root decay_root;
        unsigned long n_decay;
        int i;

        decay_root = RB_ROOT;
        INIT_LIST_HEAD(&decay_list);

        for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
                struct list_head tmp_list;

                if (list_empty(&vn->pool[i].head))
                        continue;

                INIT_LIST_HEAD(&tmp_list);

                /* Detach the pool, so no-one can access it. */
                spin_lock(&vn->pool_lock);
                list_replace_init(&vn->pool[i].head, &tmp_list);
                spin_unlock(&vn->pool_lock);

                if (full_decay)
                        WRITE_ONCE(vn->pool[i].len, 0);

                /* Decay a pool by ~25% out of left objects. */
                n_decay = vn->pool[i].len >> 2;

                list_for_each_entry_safe(va, nva, &tmp_list, list) {
                        if (!full_decay) {
                                /*
                                 * Check the budget before removing. It
                                 * can be zero from the start for short
                                 * pools, and decrementing it first would
                                 * wrap around and drain the whole pool.
                                 */
                                if (!n_decay)
                                        break;

                                n_decay--;
                                WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1);
                        }

                        list_del_init(&va->list);
                        merge_or_add_vmap_area(va, &decay_root, &decay_list);
                }

                /*
                 * Attach the pool back if it has been partly decayed.
                 * Please note, it is supposed that nobody(other contexts)
                 * can populate the pool therefore a simple list replace
                 * operation takes place here.
                 */
                if (!full_decay && !list_empty(&tmp_list)) {
                        spin_lock(&vn->pool_lock);
                        list_replace_init(&tmp_list, &vn->pool[i].head);
                        spin_unlock(&vn->pool_lock);
                }
        }

        reclaim_list_global(&decay_list);
}

static void purge_vmap_node(struct work_struct *work)
{
        struct vmap_node *vn = container_of(work,
                struct vmap_node, purge_work);
        struct vmap_area *va, *n_va;
        LIST_HEAD(local_list);

        vn->nr_purged = 0;

        list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
                unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
                unsigned long orig_start = va->va_start;
                unsigned long orig_end = va->va_end;
                unsigned int vn_id = decode_vn_id(va->flags);

                list_del_init(&va->list);

                if (is_vmalloc_or_module_addr((void *)orig_start))
                        kasan_release_vmalloc(orig_start, orig_end,
                                              va->va_start, va->va_end);

                atomic_long_sub(nr, &vmap_lazy_nr);
                vn->nr_purged++;

                if (is_vn_id_valid(vn_id) && !vn->skip_populate)
                        if (node_pool_add_va(vn, va))
                                continue;

                /* Go back to global. */
                list_add(&va->list, &local_list);
        }

        reclaim_list_global(&local_list);
}

/*
 * Purges all lazily-freed vmap areas.
 *
 * "start" and "end" give the initial TLB flush range, which is widened
 * to cover the purge list of every node that has lazy areas. With
 * "full_pool_decay" set, node pools are emptied completely and purged
 * areas are not put back into them.
 *
 * Must be called with vmap_purge_lock held. Returns true if at least one
 * area was purged.
 */
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
                bool full_pool_decay)
{
        unsigned long nr_purged_areas = 0;
        unsigned int nr_purge_helpers;
        unsigned int nr_purge_nodes;
        struct vmap_node *vn;
        int i;

        lockdep_assert_held(&vmap_purge_lock);

        /*
         * Use cpumask to mark which node has to be processed.
         */
        purge_nodes = CPU_MASK_NONE;

        for (i = 0; i < nr_vmap_nodes; i++) {
                vn = &vmap_nodes[i];

                INIT_LIST_HEAD(&vn->purge_list);
                vn->skip_populate = full_pool_decay;
                decay_va_pool_node(vn, full_pool_decay);

                if (RB_EMPTY_ROOT(&vn->lazy.root))
                        continue;

                /* Move the whole lazy set of the node to its purge list. */
                spin_lock(&vn->lazy.lock);
                WRITE_ONCE(vn->lazy.root.rb_node, NULL);
                list_replace_init(&vn->lazy.head, &vn->purge_list);
                spin_unlock(&vn->lazy.lock);

                /* Widen the flush range by the first and last list entries. */
                start = min(start, list_first_entry(&vn->purge_list,
                        struct vmap_area, list)->va_start);

                end = max(end, list_last_entry(&vn->purge_list,
                        struct vmap_area, list)->va_end);

                cpumask_set_cpu(i, &purge_nodes);
        }

        nr_purge_nodes = cpumask_weight(&purge_nodes);
        if (nr_purge_nodes > 0) {
                flush_tlb_kernel_range(start, end);

                /* One extra worker is per a lazy_max_pages() full set minus one. */
                nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages();
                nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1;

                for_each_cpu(i, &purge_nodes) {
                        vn = &vmap_nodes[i];

                        if (nr_purge_helpers > 0) {
                                /* Offload this node to a worker. */
                                INIT_WORK(&vn->purge_work, purge_vmap_node);

                                if (cpumask_test_cpu(i, cpu_online_mask))
                                        schedule_work_on(i, &vn->purge_work);
                                else
                                        schedule_work(&vn->purge_work);

                                nr_purge_helpers--;
                        } else {
                                /*
                                 * Purge in this context. A NULL func marks
                                 * the node as done so the loop below does
                                 * not wait for it.
                                 */
                                vn->purge_work.func = NULL;
                                purge_vmap_node(&vn->purge_work);
                                nr_purged_areas += vn->nr_purged;
                        }
                }

                /* Wait for the offloaded nodes and collect their counts. */
                for_each_cpu(i, &purge_nodes) {
                        vn = &vmap_nodes[i];

                        if (vn->purge_work.func) {
                                flush_work(&vn->purge_work);
                                nr_purged_areas += vn->nr_purged;
                        }
                }
        }

        trace_purge_vmap_area_lazy(start, end, nr_purged_areas);
        return nr_purged_areas > 0;
}

/*
 * Reclaim vmap areas: purge the fragmented per-CPU blocks first and then
 * all lazily freed areas, with a full decay of the node pools. The whole
 * sequence runs under vmap_purge_lock.
 */
static void reclaim_and_purge_vmap_areas(void)
{
        mutex_lock(&vmap_purge_lock);

        purge_fragmented_blocks_allcpus();
        __purge_vmap_area_lazy(ULONG_MAX, 0, true);

        mutex_unlock(&vmap_purge_lock);
}

/*
 * Work handler that purges the lazily freed areas under vmap_purge_lock.
 * Node pools are only partially decayed (full_pool_decay == false). The
 * "work" argument is not used.
 */
static void drain_vmap_area_work(struct work_struct *work)
{
        mutex_lock(&vmap_purge_lock);
        /* ULONG_MAX/0 is an empty range, it is widened by the purge itself. */
        __purge_vmap_area_lazy(ULONG_MAX, 0, false);
        mutex_unlock(&vmap_purge_lock);
}

/*
 * Queue a vmap area for lazy freeing. The caller ensures that the area
 * has been unmapped and unlinked, and that flush_cache_vunmap() has been
 * called for the correct range beforehand.
 *
 * A drain work is scheduled once the amount of lazily freed pages goes
 * above lazy_max_pages().
 */
static void free_vmap_area_noflush(struct vmap_area *va)
{
        unsigned long lazy_limit, lazy_now, start;
        struct vmap_node *vn;
        unsigned int vn_id;

        lazy_limit = lazy_max_pages();
        start = va->va_start;
        vn_id = decode_vn_id(va->flags);

        if (WARN_ON_ONCE(!list_empty(&va->list)))
                return;

        lazy_now = atomic_long_add_return((va->va_end - start) >> PAGE_SHIFT,
                                          &vmap_lazy_nr);

        /*
         * An area that was requested by a certain node goes to the lazy
         * set of that node, so it can end up in its pool for later
         * reuse. Otherwise the node is derived from the address.
         */
        if (is_vn_id_valid(vn_id))
                vn = id_to_node(vn_id);
        else
                vn = addr_to_node(start);

        spin_lock(&vn->lazy.lock);
        insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head);
        spin_unlock(&vn->lazy.lock);

        /*
         * Only the values cached above are used from here on, "va" may
         * be freed at any time once a purge runs.
         */
        trace_free_vmap_area_noflush(start, lazy_now, lazy_limit);

        if (unlikely(lazy_now > lazy_limit))
                schedule_work(&drain_vmap_work);
}

/*
 * Unmap a vmap area and queue it for lazy freeing. The TLB is flushed
 * right away only when debug_pagealloc is enabled.
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
        unsigned long start = va->va_start;
        unsigned long end = va->va_end;

        flush_cache_vunmap(start, end);
        vunmap_range_noflush(start, end);

        if (debug_pagealloc_enabled_static())
                flush_tlb_kernel_range(start, end);

        free_vmap_area_noflush(va);
}

/*
 * Look up the busy vmap area that contains "addr".
 *
 * Returns the area, or NULL if vmap is not initialized yet or no node
 * has a matching area in its busy tree.
 */
struct vmap_area *find_vmap_area(unsigned long addr)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        int first, idx;

        if (unlikely(!vmap_initialized))
                return NULL;

        /*
         * addr_to_node_id(addr) gives the index of the node "addr" hashes
         * to. An area is stored in the node of its va_start, so when the
         * area spans several zones and "addr" is not its va_start (which
         * is not common) other nodes have to be scanned too. Example:
         *
         *      <----va---->
         * -|-----|-----|-----|-----|-
         *     1     2     0     1
         *
         * The VA resides in node 1 whereas it spans 1, 2 and 0. If the
         * passed addr is within the 2 or 0 zones, extra work is needed.
         */
        first = addr_to_node_id(addr);
        idx = first;

        for (;;) {
                vn = &vmap_nodes[idx];

                spin_lock(&vn->busy.lock);
                va = __find_vmap_area(addr, &vn->busy.root);
                spin_unlock(&vn->busy.lock);

                if (va)
                        return va;

                /* Move on to the next node, stop after one full round. */
                idx = (idx + 1) % nr_vmap_nodes;
                if (idx == first)
                        break;
        }

        return NULL;
}

/*
 * Look up the busy vmap area that contains "addr" and remove it from the
 * busy tree of its node. Lookup and removal happen under the same hold
 * of the node's busy lock.
 *
 * Returns the unlinked area or NULL if there is none.
 */
static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        int first, idx;

        /*
         * The nodes are walked round-robin starting from the one "addr"
         * hashes to, for the reason explained in find_vmap_area().
         */
        first = addr_to_node_id(addr);
        idx = first;

        for (;;) {
                vn = &vmap_nodes[idx];

                spin_lock(&vn->busy.lock);
                va = __find_vmap_area(addr, &vn->busy.root);
                if (va)
                        unlink_va(va, &vn->busy.root);
                spin_unlock(&vn->busy.lock);

                if (va)
                        return va;

                idx = (idx + 1) % nr_vmap_nodes;
                if (idx == first)
                        break;
        }

        return NULL;
}

/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE                (VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea): 128MB on 32-bit and 128GB on 64-bit.
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE                (128UL*1024*1024)
#else
#define VMALLOC_SPACE                (128UL*1024*1024*1024)
#endif

/*
 * VMAP_MAX_ALLOC is the largest request, in pages, served from a vmap
 * block. VMAP_BBMAP_BITS is the number of pages in one vmap block: the
 * guessed number of vmalloc pages divided by the CPU count (rounded up to
 * a power of two) and by 16, clamped to the
 * [VMAP_BBMAP_BITS_MIN, VMAP_BBMAP_BITS_MAX] range.
 */
#define VMALLOC_PAGES                (VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC                BITS_PER_LONG        /* 256K with 4K pages on 64-bit */
#define VMAP_BBMAP_BITS_MAX        1024        /* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN        (VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)                ((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)                ((x) > (y) ? (x) : (y)) /* can't use max() */
#define VMAP_BBMAP_BITS                \
                VMAP_MIN(VMAP_BBMAP_BITS_MAX,        \
                VMAP_MAX(VMAP_BBMAP_BITS_MIN,        \
                        VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

/* Size of one vmap block in bytes. */
#define VMAP_BLOCK_SIZE                (VMAP_BBMAP_BITS * PAGE_SIZE)

/*
 * Purge threshold to prevent overeager purging of fragmented blocks for
 * regular operations: Purge if vb->free is less than 1/4 of the capacity.
 */
#define VMAP_PURGE_THRESHOLD        (VMAP_BBMAP_BITS / 4)

/* Flag bits kept in vmap_area.flags; VMAP_FLAGS_MASK covers both. */
#define VMAP_RAM                0x1 /* indicates vm_map_ram area*/
#define VMAP_BLOCK                0x2 /* mark out the vmap_block sub-type*/
#define VMAP_FLAGS_MASK                0x3

/*
 * Per-CPU state of the vmap block allocator: a list of blocks that can
 * still serve allocations plus an xarray used to look blocks up by
 * address.
 */
struct vmap_block_queue {
        /* Held while adding to or removing from the "free" list. */
        spinlock_t lock;
        /* Blocks available for allocation, linked via vmap_block.free_list. */
        struct list_head free;

        /*
         * An xarray requires an extra memory dynamically to
         * be allocated. If it is an issue, we can use rb-tree
         * instead.
         */
        struct xarray vmap_blocks;
};

/*
 * One vmap block: a VMAP_BLOCK_SIZE chunk of KVA that is handed out in
 * power-of-two page runs by vb_alloc() and released by vb_free().
 */
struct vmap_block {
        /* Protects the counters, the bitmap and the dirty range. */
        spinlock_t lock;
        /* The vmap area that backs this block. */
        struct vmap_area *va;
        /*
         * free:  pages not handed out yet, allocation proceeds linearly
         *        from offset VMAP_BBMAP_BITS - free;
         * dirty: pages that were handed out and released again.
         */
        unsigned long free, dirty;
        /* One bit per page, set while the page is handed out. */
        DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
        unsigned long dirty_min, dirty_max; /*< dirty range */
        /* Entry in vmap_block_queue.free, traversed under RCU. */
        struct list_head free_list;
        /* Used to free the block via kfree_rcu(). */
        struct rcu_head rcu_head;
        /* Entry in a temporary list of blocks selected for purging. */
        struct list_head purge;
};

/*
 * Queue of free and dirty vmap blocks, for allocation and flushing purposes.
 * The per-CPU array is also indexed as a hash table by addr_to_vb_xa().
 */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);

/*
 * In order to fast access to any "vmap_block" associated with a
 * specific address, we use a hash.
 *
 * A per-cpu vmap_block_queue is used in both ways, to serialize
 * an access to free block chains among CPUs(alloc path) and it
 * also acts as a vmap_block hash(alloc/free paths). It means we
 * overload it, since we already have the per-cpu array which is
 * used as a hash table. When used as a hash a 'cpu' passed to
 * per_cpu() is not actually a CPU but rather a hash index.
 *
 * A hash function is addr_to_vb_xa() which hashes any address
 * to a specific index(in a hash) it belongs to. This then uses a
 * per_cpu() macro to access an array with generated index.
 *
 * An example:
 *
 *  CPU_1  CPU_2  CPU_0
 *    |      |      |
 *    V      V      V
 * 0     10     20     30     40     50     60
 * |------|------|------|------|------|------|...<vmap address space>
 *   CPU0   CPU1   CPU2   CPU0   CPU1   CPU2
 *
 * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus
 *   it access: CPU0/INDEX0 -> vmap_blocks -> xa_lock;
 *
 * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus
 *   it access: CPU1/INDEX1 -> vmap_blocks -> xa_lock;
 *
 * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus
 *   it access: CPU2/INDEX2 -> vmap_blocks -> xa_lock.
 *
 * This technique almost always avoids lock contention on insert/remove,
 * however xarray spinlocks protect against any contention that remains.
 */
static struct xarray *
addr_to_vb_xa(unsigned long addr)
{
        int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus();

        return &per_cpu(vmap_block_queue, index).vmap_blocks;
}

/*
 * We should probably have a fallback mechanism to allocate virtual memory
 * out of partially filled vmap blocks. However vmap block sizing should be
 * fairly reasonable according to the vmalloc size, so it shouldn't be a
 * big problem.
 */

/*
 * Convert an address to the index of the vmap block it belongs to. The
 * index counts VMAP_BLOCK_SIZE units from VMALLOC_START rounded down to
 * a block boundary.
 */
static unsigned long addr_to_vb_idx(unsigned long addr)
{
        unsigned long base = VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);

        return (addr - base) / VMAP_BLOCK_SIZE;
}

/*
 * Return the virtual address that lies "pages_off" pages after
 * "va_start". It is a BUG if the result is not in the same vmap block as
 * "va_start".
 */
static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
{
        unsigned long vaddr = va_start + (pages_off << PAGE_SHIFT);

        BUG_ON(addr_to_vb_idx(vaddr) != addr_to_vb_idx(va_start));

        return (void *)vaddr;
}

/**
 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
 *                  block. Of course pages number can't exceed VMAP_BBMAP_BITS
 * @order:    how many 2^order pages should be occupied in newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * The occupied pages are the first 2^order pages of the block. The new
 * block is inserted into the xarray selected by its address and appended
 * to the free list of the current CPU's queue.
 *
 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
        struct vmap_block_queue *vbq;
        struct vmap_block *vb;
        struct vmap_area *va;
        struct xarray *xa;
        unsigned long vb_idx;
        int node, err;
        void *vaddr;

        node = numa_node_id();

        vb = kmalloc_node(sizeof(struct vmap_block),
                        gfp_mask & GFP_RECLAIM_MASK, node);
        if (unlikely(!vb))
                return ERR_PTR(-ENOMEM);

        /* The backing area is block sized and block aligned. */
        va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
                                        VMALLOC_START, VMALLOC_END,
                                        node, gfp_mask,
                                        VMAP_RAM|VMAP_BLOCK, NULL);
        if (IS_ERR(va)) {
                kfree(vb);
                return ERR_CAST(va);
        }

        vaddr = vmap_block_vaddr(va->va_start, 0);
        spin_lock_init(&vb->lock);
        vb->va = va;
        /* At least something should be left free */
        BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
        bitmap_zero(vb->used_map, VMAP_BBMAP_BITS);
        vb->free = VMAP_BBMAP_BITS - (1UL << order);
        vb->dirty = 0;
        /* Empty dirty range: min above max. */
        vb->dirty_min = VMAP_BBMAP_BITS;
        vb->dirty_max = 0;
        bitmap_set(vb->used_map, 0, (1UL << order));
        INIT_LIST_HEAD(&vb->free_list);

        /* The block is fully set up by now, make it findable by address. */
        xa = addr_to_vb_xa(va->va_start);
        vb_idx = addr_to_vb_idx(va->va_start);
        err = xa_insert(xa, vb_idx, vb, gfp_mask);
        if (err) {
                kfree(vb);
                free_vmap_area(va);
                return ERR_PTR(err);
        }

        /* Offer the remaining space to further allocations on this CPU. */
        vbq = raw_cpu_ptr(&vmap_block_queue);
        spin_lock(&vbq->lock);
        list_add_tail_rcu(&vb->free_list, &vbq->free);
        spin_unlock(&vbq->lock);

        return vaddr;
}

static void free_vmap_block(struct vmap_block *vb)
{
        struct vmap_node *vn;
        struct vmap_block *tmp;
        struct xarray *xa;

        xa = addr_to_vb_xa(vb->va->va_start);
        tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
        BUG_ON(tmp != vb);

        vn = addr_to_node(vb->va->va_start);
        spin_lock(&vn->busy.lock);
        unlink_va(vb->va, &vn->busy.root);
        spin_unlock(&vn->busy.lock);

        free_vmap_area_noflush(vb->va);
        kfree_rcu(vb, rcu_head);
}

/*
 * Select "vb" for purging if it qualifies: it must consist of free and
 * dirty pages only, must not be fully dirty already, and must either be
 * below VMAP_PURGE_THRESHOLD free pages or "force_purge" must be set.
 *
 * A selected block is marked as fully dirty with nothing free, removed
 * from the free list of "vbq" and appended to "purge_list".
 *
 * The caller visible in this file holds vb->lock around the call.
 *
 * Returns true if the block was moved to "purge_list".
 */
static bool purge_fragmented_block(struct vmap_block *vb,
                struct vmap_block_queue *vbq, struct list_head *purge_list,
                bool force_purge)
{
        bool only_free_and_dirty, fully_dirty;

        only_free_and_dirty = (vb->free + vb->dirty == VMAP_BBMAP_BITS);
        fully_dirty = (vb->dirty == VMAP_BBMAP_BITS);

        if (!only_free_and_dirty || fully_dirty)
                return false;

        /* Don't overeagerly purge usable blocks unless requested */
        if (!force_purge && vb->free >= VMAP_PURGE_THRESHOLD)
                return false;

        /* No further allocations from this block once the lock is gone. */
        WRITE_ONCE(vb->free, 0);
        /* A fully dirty block is never selected again. */
        WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS);
        vb->dirty_min = 0;
        vb->dirty_max = VMAP_BBMAP_BITS;

        spin_lock(&vbq->lock);
        list_del_rcu(&vb->free_list);
        spin_unlock(&vbq->lock);

        list_add_tail(&vb->purge, purge_list);

        return true;
}

static void free_purged_blocks(struct list_head *purge_list)
{
        struct vmap_block *vb, *n_vb;

        list_for_each_entry_safe(vb, n_vb, purge_list, purge) {
                list_del(&vb->purge);
                free_vmap_block(vb);
        }
}

/*
 * Purge the fragmented blocks on the free list of the given CPU's queue.
 * Candidates are picked with a lockless pre-check under RCU, re-checked
 * and selected with the block lock held, and freed after the RCU section
 * has ended.
 */
static void purge_fragmented_blocks(int cpu)
{
        struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
        struct vmap_block *vb;
        LIST_HEAD(purge);

        rcu_read_lock();
        list_for_each_entry_rcu(vb, &vbq->free, free_list) {
                unsigned long nr_free = READ_ONCE(vb->free);
                unsigned long nr_dirty = READ_ONCE(vb->dirty);

                /* Cheap test first, the real decision is made under the lock. */
                if (nr_free + nr_dirty == VMAP_BBMAP_BITS &&
                    nr_dirty != VMAP_BBMAP_BITS) {
                        spin_lock(&vb->lock);
                        purge_fragmented_block(vb, vbq, &purge, true);
                        spin_unlock(&vb->lock);
                }
        }
        rcu_read_unlock();

        free_purged_blocks(&purge);
}

/*
 * Purge the fragmented blocks of every possible CPU, one queue after
 * another.
 */
static void purge_fragmented_blocks_allcpus(void)
{
        int i;

        for_each_possible_cpu(i)
                purge_fragmented_blocks(i);
}

/*
 * Allocate "size" bytes of KVA from a vmap block.
 *
 * "size" must be a multiple of the page size and at most VMAP_MAX_ALLOC
 * pages, both are enforced with BUG_ON(). The request is rounded up to
 * 2^order pages and served from the first block on the current CPU's
 * free list that has enough free pages, or from a new block if there is
 * none.
 *
 * Returns the virtual address, ERR_PTR(-EINVAL) for a zero size, or the
 * ERR_PTR() produced by new_vmap_block().
 */
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
        struct vmap_block_queue *vbq;
        struct vmap_block *vb;
        void *vaddr = NULL;
        unsigned int order;

        BUG_ON(offset_in_page(size));
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
        if (WARN_ON(size == 0)) {
                /*
                 * Allocating 0 bytes isn't what caller wants since
                 * get_order(0) returns funny result. Just warn and terminate
                 * early.
                 */
                return ERR_PTR(-EINVAL);
        }
        order = get_order(size);

        rcu_read_lock();
        vbq = raw_cpu_ptr(&vmap_block_queue);
        list_for_each_entry_rcu(vb, &vbq->free, free_list) {
                unsigned long pages_off;

                /* Lockless pre-check, repeated below under the block lock. */
                if (READ_ONCE(vb->free) < (1UL << order))
                        continue;

                spin_lock(&vb->lock);
                if (vb->free < (1UL << order)) {
                        spin_unlock(&vb->lock);
                        continue;
                }

                /* Pages are handed out linearly from the start of the block. */
                pages_off = VMAP_BBMAP_BITS - vb->free;
                vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
                WRITE_ONCE(vb->free, vb->free - (1UL << order));
                bitmap_set(vb->used_map, pages_off, (1UL << order));
                /* A block without free pages leaves the free list. */
                if (vb->free == 0) {
                        spin_lock(&vbq->lock);
                        list_del_rcu(&vb->free_list);
                        spin_unlock(&vbq->lock);
                }

                spin_unlock(&vb->lock);
                break;
        }

        rcu_read_unlock();

        /* Allocate new block if nothing was found */
        if (!vaddr)
                vaddr = new_vmap_block(order, gfp_mask);

        return vaddr;
}

/*
 * vb_free - give a range handed out by vb_alloc() back to its vmap block
 * @addr: start address of the range
 * @size: size of the range in bytes; page aligned and no larger than
 *        PAGE_SIZE * VMAP_MAX_ALLOC
 *
 * Clears the range in the block's used_map, unmaps it without flushing the
 * TLB (unless debug_pagealloc is enabled) and accounts the pages as dirty.
 * Once every page of the block is dirty the block itself is released via
 * free_vmap_block().
 */
static void vb_free(unsigned long addr, unsigned long size)
{
        const unsigned long end = addr + size;
        unsigned long first_bit, nr_bits;
        struct vmap_block *block;
        struct xarray *blocks;
        int fully_dirty;

        BUG_ON(offset_in_page(size));
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

        flush_cache_vunmap(addr, end);

        nr_bits = 1UL << get_order(size);
        /* Page index of @addr inside its VMAP_BLOCK_SIZE-sized block. */
        first_bit = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;

        blocks = addr_to_vb_xa(addr);
        block = xa_load(blocks, addr_to_vb_idx(addr));

        spin_lock(&block->lock);
        bitmap_clear(block->used_map, first_bit, nr_bits);
        spin_unlock(&block->lock);

        vunmap_range_noflush(addr, end);

        if (debug_pagealloc_enabled_static())
                flush_tlb_kernel_range(addr, end);

        spin_lock(&block->lock);

        /* Grow the dirty window that still awaits a TLB flush. */
        block->dirty_min = min(block->dirty_min, first_bit);
        block->dirty_max = max(block->dirty_max, first_bit + nr_bits);

        WRITE_ONCE(block->dirty, block->dirty + nr_bits);
        fully_dirty = (block->dirty == VMAP_BBMAP_BITS);
        if (fully_dirty)
                BUG_ON(block->free);
        spin_unlock(&block->lock);

        /* The whole block has been used up and freed again: retire it. */
        if (fully_dirty)
                free_vmap_block(block);
}

/*
 * _vm_unmap_aliases - purge fragmented vmap blocks and flush lazy aliases
 * @start: lower bound of a range the caller already wants flushed
 * @end: upper bound of that range
 * @flush: non-zero if [@start, @end) must be flushed even when no dirty
 *         vmap block is found
 *
 * Visits every vmap block of every possible CPU under vmap_purge_lock. A
 * block is either purged or, if it has a pending dirty window, has that
 * window folded into [@start, @end) and reset. The accumulated range is
 * then handed to __purge_vmap_area_lazy(); if that returns zero and a flush
 * is still wanted, the range is flushed directly.
 */
static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
        LIST_HEAD(purge_list);
        int cpu;

        if (unlikely(!vmap_initialized))
                return;

        mutex_lock(&vmap_purge_lock);

        for_each_possible_cpu(cpu) {
                struct vmap_block_queue *queue = &per_cpu(vmap_block_queue, cpu);
                struct vmap_block *block;
                unsigned long index;

                rcu_read_lock();
                xa_for_each(&queue->vmap_blocks, index, block) {
                        unsigned long base, dirty_start, dirty_end;

                        spin_lock(&block->lock);

                        /*
                         * Purging a fragmented block takes priority. A block
                         * that was purged, has no dirty window, or is dirty
                         * in its entirety needs no range bookkeeping here.
                         */
                        if (purge_fragmented_block(block, queue, &purge_list, false) ||
                            !block->dirty_max || block->dirty == VMAP_BBMAP_BITS) {
                                spin_unlock(&block->lock);
                                continue;
                        }

                        base = block->va->va_start;
                        dirty_start = base + (block->dirty_min << PAGE_SHIFT);
                        dirty_end = base + (block->dirty_max << PAGE_SHIFT);

                        start = min(dirty_start, start);
                        end   = max(dirty_end, end);

                        /* Reset the window so it is not flushed a second time. */
                        block->dirty_min = VMAP_BBMAP_BITS;
                        block->dirty_max = 0;

                        flush = 1;
                        spin_unlock(&block->lock);
                }
                rcu_read_unlock();
        }
        free_purged_blocks(&purge_list);

        if (!__purge_vmap_area_lazy(start, end, false) && flush)
                flush_tlb_kernel_range(start, end);
        mutex_unlock(&vmap_purge_lock);
}

/**
 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
 *
 * Kernel virtual mappings are torn down lazily by the vmap/vmalloc layer,
 * mainly to amortize the cost of TLB flushes. As a consequence, a page you
 * hold now may earlier have been mapped at a kernel virtual address by the
 * vmap layer, and some CPUs may still have TLB entries referring to it (in
 * addition to the regular 1:1 kernel mapping).
 *
 * vm_unmap_aliases() flushes all such lazy mappings. Once it returns, none
 * of the pages we control have any remaining aliases from the vmap layer.
 */
void vm_unmap_aliases(void)
{
        /* Empty range and no forced flush: only pending work is processed. */
        _vm_unmap_aliases(ULONG_MAX, 0, 0);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);

/**
 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
 * @mem: the pointer returned by vm_map_ram
 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
 *
 * Mappings of at most VMAP_MAX_ALLOC pages are returned to their vmap block
 * via vb_free(); larger ones are looked up, unlinked and released as a
 * whole vmap area.
 */
void vm_unmap_ram(const void *mem, unsigned int count)
{
        unsigned long nbytes = (unsigned long)count << PAGE_SHIFT;
        unsigned long start = (unsigned long)kasan_reset_tag(mem);

        might_sleep();
        BUG_ON(!start);
        BUG_ON(start < VMALLOC_START);
        BUG_ON(start > VMALLOC_END);
        BUG_ON(!PAGE_ALIGNED(start));

        kasan_poison_vmalloc(mem, nbytes);

        if (likely(count <= VMAP_MAX_ALLOC)) {
                /* Small mapping: it lives inside a vmap block. */
                debug_check_no_locks_freed(mem, nbytes);
                vb_free(start, nbytes);
        } else {
                /* Large mapping: it owns a vmap area of its own. */
                struct vmap_area *va = find_unlink_vmap_area(start);

                if (!WARN_ON_ONCE(!va)) {
                        debug_check_no_locks_freed((void *)va->va_start,
                                                    (va->va_end - va->va_start));
                        free_unmap_vmap_area(va);
                }
        }
}
EXPORT_SYMBOL(vm_unmap_ram);

/**
 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
 * @pages: an array of pointers to the pages to be mapped
 * @count: number of pages
 * @node: prefer to allocate data structures on this node
 *
 * For fewer than VMAP_MAX_ALLOC pages this can be faster than vmap(), which
 * makes it a good fit for such mappings. Mixing long-lived and short-lived
 * objects, however, can burn through a lot of address space because of
 * fragmentation (especially on a 32bit machine) and may eventually lead to
 * failures. Please use this function for short-lived objects.
 *
 * Returns: a pointer to the address that has been mapped, or %NULL on failure
 */
void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
        unsigned long nbytes = (unsigned long)count << PAGE_SHIFT;
        unsigned long start;
        void *mem;

        if (likely(count <= VMAP_MAX_ALLOC)) {
                /* Small request: served from a vmap block. */
                mem = vb_alloc(nbytes, GFP_KERNEL);
                if (IS_ERR(mem))
                        return NULL;
                start = (unsigned long)mem;
        } else {
                /* Large request: gets a dedicated vmap area. */
                struct vmap_area *va = alloc_vmap_area(nbytes, PAGE_SIZE,
                                VMALLOC_START, VMALLOC_END,
                                node, GFP_KERNEL, VMAP_RAM,
                                NULL);

                if (IS_ERR(va))
                        return NULL;

                start = va->va_start;
                mem = (void *)start;
        }

        if (vmap_pages_range(start, start + nbytes, PAGE_KERNEL,
                                pages, PAGE_SHIFT) < 0) {
                /* Mapping failed: release the address range again. */
                vm_unmap_ram(mem, count);
                return NULL;
        }

        /*
         * The pages are mapped now, so mark them as accessible.
         * With hardware tag-based KASAN, marking is skipped for
         * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
         */
        return kasan_unpoison_vmalloc(mem, nbytes, KASAN_VMALLOC_PROT_NORMAL);
}
EXPORT_SYMBOL(vm_map_ram);

/*
 * Singly linked list (via vm_struct::next) of the vm areas added during
 * early boot by vm_area_add_early() and vm_area_register_early(). Both
 * insert in ascending address order.
 */
static struct vm_struct *vmlist __initdata;

/*
 * Page order recorded for @vm. Without CONFIG_HAVE_ARCH_HUGE_VMALLOC
 * nothing is recorded and the order is always 0.
 */
static inline unsigned int vm_area_page_order(struct vm_struct *vm)
{
#ifndef CONFIG_HAVE_ARCH_HUGE_VMALLOC
        return 0;
#else
        return vm->page_order;
#endif
}

/*
 * Record the page order of @vm. Without CONFIG_HAVE_ARCH_HUGE_VMALLOC
 * there is nowhere to store it, so any order other than 0 is a bug.
 */
static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
{
#ifndef CONFIG_HAVE_ARCH_HUGE_VMALLOC
        BUG_ON(order != 0);
#else
        vm->page_order = order;
#endif
}

/**
 * vm_area_add_early - add vmap area early during boot
 * @vm: vm_struct to add
 *
 * Adds a fixed kernel vm area to vmlist before vmalloc_init() has been
 * called.  @vm->addr, @vm->size, and @vm->flags must hold proper values;
 * all other fields must be zero.  The list is kept sorted by address and
 * an overlap with an existing entry is treated as a bug.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_add_early(struct vm_struct *vm)
{
        struct vm_struct **link = &vmlist;
        struct vm_struct *cur;

        BUG_ON(vmap_initialized);

        while ((cur = *link) != NULL) {
                if (cur->addr >= vm->addr) {
                        /* Insertion point found; @vm must end before @cur. */
                        BUG_ON(cur->addr < vm->addr + vm->size);
                        break;
                }

                /* @cur lies below @vm and must not reach into it. */
                BUG_ON(cur->addr + cur->size > vm->addr);
                link = &cur->next;
        }

        vm->next = *link;
        *link = vm;
}

/**
 * vm_area_register_early - register vmap area early during boot
 * @vm: vm_struct to register
 * @align: requested alignment
 *
 * Registers a kernel vm area before vmalloc_init() has been called.
 * @vm->size and @vm->flags must hold proper values on entry; all other
 * fields must be zero.  On return, vm->addr contains the allocated address.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
        unsigned long candidate = ALIGN(VMALLOC_START, align);
        struct vm_struct **link = &vmlist;
        struct vm_struct *entry;

        BUG_ON(vmap_initialized);

        /* First fit: stop at the first gap that can hold vm->size bytes. */
        while ((entry = *link) != NULL) {
                if ((unsigned long)entry->addr - candidate >= vm->size)
                        break;

                /* Gap too small; continue the search right after @entry. */
                candidate = ALIGN((unsigned long)entry->addr + entry->size, align);
                link = &entry->next;
        }

        BUG_ON(candidate > VMALLOC_END - vm->size);

        vm->addr = (void *)candidate;
        vm->next = *link;
        *link = vm;
        kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}

/*
 * Drop VM_UNINITIALIZED from @vm once its fields hold their final values.
 */
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
        /*
         * All earlier stores to @vm must be visible before the flag is
         * seen as cleared. Pairs with smp_rmb() in show_numa_info().
         */
        smp_wmb();
        vm->flags = vm->flags & ~VM_UNINITIALIZED;
}

/*
 * __get_vm_area_node - allocate a vm_struct and reserve address space for it
 * @size: requested size in bytes; rounded up to a multiple of 1 << @shift
 * @align: alignment of the reservation (overridden for VM_IOREMAP)
 * @shift: log2 of the granularity @size is rounded up to
 * @flags: vm area flags stored in the descriptor
 * @start: lowest address that may be used
 * @end: highest address that may be used
 * @node: NUMA node preferred for the descriptor allocation
 * @gfp_mask: allocation flags
 * @caller: recorded as the descriptor's caller
 *
 * Unless VM_NO_GUARD is set, one extra page is reserved on top of the
 * rounded size. Must not be called from interrupt context.
 *
 * Return: the new descriptor, or NULL if the rounded size is zero or an
 * allocation fails.
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
                unsigned long align, unsigned long shift, unsigned long flags,
                unsigned long start, unsigned long end, int node,
                gfp_t gfp_mask, const void *caller)
{
        const unsigned long unaligned_size = size;
        struct vm_struct *vm;
        struct vmap_area *va;

        BUG_ON(in_interrupt());

        size = ALIGN(size, 1ul << shift);
        if (unlikely(size == 0))
                return NULL;

        if (flags & VM_IOREMAP) {
                int order = clamp_t(int, get_count_order_long(size),
                                    PAGE_SHIFT, IOREMAP_MAX_ORDER);

                align = 1ul << order;
        }

        vm = kzalloc_node(sizeof(*vm), gfp_mask & GFP_RECLAIM_MASK, node);
        if (unlikely(!vm))
                return NULL;

        vm->flags = flags;
        vm->caller = caller;

        /* Leave room for a trailing guard page unless told otherwise. */
        if (!(flags & VM_NO_GUARD))
                size += PAGE_SIZE;

        va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, vm);
        if (IS_ERR(va)) {
                kfree(vm);
                return NULL;
        }

        /*
         * Pages of non-VM_ALLOC mappings are marked accessible right away,
         * on a best-effort basis, because they may get mapped outside of
         * the vmalloc code. For VM_ALLOC mappings this happens later, after
         * the pages have been mapped in __vmalloc_node_range().
         * With hardware tag-based KASAN, marking is skipped for
         * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
         */
        if (!(flags & VM_ALLOC))
                vm->addr = kasan_unpoison_vmalloc(vm->addr, unaligned_size,
                                                  KASAN_VMALLOC_PROT_NORMAL);

        return vm;
}

/*
 * Reserve a vm area of @size bytes within [@start, @end), recording @caller.
 * Uses page granularity, alignment 1, no node preference and GFP_KERNEL.
 */
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
                                       unsigned long start, unsigned long end,
                                       const void *caller)
{
        struct vm_struct *vm;

        vm = __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
                                NUMA_NO_NODE, GFP_KERNEL, caller);
        return vm;
}

/**
 * get_vm_area - reserve a contiguous kernel virtual area
 * @size:         size of the area
 * @flags:         %VM_IOREMAP for I/O mappings or VM_ALLOC
 *
 * Search for an area of @size in the kernel virtual mapping area and
 * reserve it for our purposes.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
        /* Taken in this frame so it names whoever called get_vm_area(). */
        const void *caller = __builtin_return_address(0);

        return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
                                  VMALLOC_START, VMALLOC_END,
                                  NUMA_NO_NODE, GFP_KERNEL, caller);
}

/*
 * Same as get_vm_area(), except that the recorded caller is supplied
 * explicitly through @caller.
 */
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
                                const void *caller)
{
        struct vm_struct *vm;

        vm = __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
                                VMALLOC_START, VMALLOC_END,
                                NUMA_NO_NODE, GFP_KERNEL, caller);
        return vm;
}

/**
 * find_vm_area - find a continuous kernel virtual area
 * @addr:          base address
 *
 * Look up the kernel VM area starting at @addr and return it.
 * Keeping the returned pointer valid is the caller's job: it has to
 * provide whatever locking that requires.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *find_vm_area(const void *addr)
{
        struct vmap_area *va = find_vmap_area((unsigned long)addr);

        return va ? va->vm : NULL;
}

/**
 * remove_vm_area - find and remove a continuous kernel virtual area
 * @addr:            base address
 *
 * Look up the kernel VM area starting at @addr and remove it.
 * The found VM area is returned, but on SMP machines it is NOT safe to
 * use anything of it other than its size or flags.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *remove_vm_area(const void *addr)
{
        struct vmap_area *va;
        struct vm_struct *area;

        might_sleep();

        if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
                        addr))
                return NULL;

        va = find_unlink_vmap_area((unsigned long)addr);
        if (!va)
                return NULL;

        /* A vmap area without an attached vm_struct is not ours to remove. */
        area = va->vm;
        if (!area)
                return NULL;

        debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
        debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
        kasan_free_module_shadow(area);
        kasan_poison_vmalloc(area->addr, get_vm_area_size(area));

        free_unmap_vmap_area(va);
        return area;
}

/*
 * Apply @set_direct_map to every page of @area for which page_address()
 * returns a non-NULL address.
 */
static inline void set_area_direct_map(const struct vm_struct *area,
                                       int (*set_direct_map)(struct page *page))
{
        int idx;

        /* HUGE_VMALLOC passes small pages to set_direct_map */
        for (idx = 0; idx < area->nr_pages; idx++) {
                struct page *page = area->pages[idx];

                if (!page_address(page))
                        continue;

                set_direct_map(page);
        }
}

/*
 * Flush the vm mapping and reset the direct map.
 */
static void vm_reset_perms(struct vm_struct *area)
{
        const unsigned int order = vm_area_page_order(area);
        const unsigned long chunk = PAGE_SIZE << order;
        unsigned long lo = ULONG_MAX, hi = 0;
        int have_dmap = 0;
        int idx;

        /*
         * Work out the span covered by the direct mappings so that the
         * flush done by _vm_unmap_aliases() includes the direct map too.
         */
        for (idx = 0; idx < area->nr_pages; idx += 1U << order) {
                unsigned long kaddr = (unsigned long)page_address(area->pages[idx]);

                if (!kaddr)
                        continue;

                lo = min(kaddr, lo);
                hi = max(kaddr + chunk, hi);
                have_dmap = 1;
        }

        /*
         * First make the direct map invalid, so nothing gets cached by
         * accesses that happen after the TLB flush; then flush the TLB;
         * finally restore the default direct map permissions.
         */
        set_area_direct_map(area, set_direct_map_invalid_noflush);
        _vm_unmap_aliases(lo, hi, have_dmap);
        set_area_direct_map(area, set_direct_map_default_noflush);
}

/*
 * Work handler: detach everything queued on the vfree_deferred list that
 * embeds @w and vfree() each entry.
 */
static void delayed_vfree_work(struct work_struct *w)
{
        struct vfree_deferred *deferred = container_of(w, struct vfree_deferred, wq);
        struct llist_node *batch = llist_del_all(&deferred->list);
        struct llist_node *node, *next;

        /* vfree() releases the node itself, hence the _safe iterator. */
        llist_for_each_safe(node, next, batch)
                vfree(node);
}

/**
 * vfree_atomic - release memory allocated by vmalloc()
 * @addr:          memory base address
 *
 * Behaves like vfree() but may be called from any atomic context
 * except NMIs.
 */
void vfree_atomic(const void *addr)
{
        struct vfree_deferred *deferred = raw_cpu_ptr(&vfree_deferred);

        BUG_ON(in_nmi());
        kmemleak_free(addr);

        if (!addr)
                return;

        /*
         * raw_cpu_ptr() is used because we may be running in preemptible
         * context. Being preempted is harmless: llist_add() is lockless
         * and therefore works even when we end up adding to another cpu's
         * list. schedule_work() should be fine with this too.
         */
        if (llist_add((struct llist_node *)addr, &deferred->list))
                schedule_work(&deferred->wq);
}

/**
 * vfree - Release memory allocated by vmalloc()
 * @addr:  Memory base address
 *
 * Free the virtually continuous memory area starting at @addr, as obtained
 * from one of the vmalloc() family of APIs.  Usually the physical memory
 * backing the virtual allocation is freed as well, but since that memory is
 * reference counted it is only released once the last user is gone.
 *
 * If @addr is NULL, no operation is performed.
 *
 * Context:
 * May sleep if called *not* from interrupt context.
 * Must not be called in NMI context (strictly speaking, it could be
 * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
 * conventions for vfree() arch-dependent would be a really bad idea).
 */
void vfree(const void *addr)
{
        struct vm_struct *area;
        int idx;

        /* From interrupt context the real work is deferred. */
        if (unlikely(in_interrupt())) {
                vfree_atomic(addr);
                return;
        }

        BUG_ON(in_nmi());
        kmemleak_free(addr);
        might_sleep();

        if (!addr)
                return;

        area = remove_vm_area(addr);
        if (unlikely(!area)) {
                WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
                                addr);
                return;
        }

        if (unlikely(area->flags & VM_FLUSH_RESET_PERMS))
                vm_reset_perms(area);

        for (idx = 0; idx < area->nr_pages; idx++) {
                struct page *page = area->pages[idx];

                BUG_ON(!page);
                mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
                /*
                 * Huge vmallocs split their high-order allocations, so
                 * the pages can be freed one order-0 page at a time.
                 */
                __free_page(page);
                cond_resched();
        }

        atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
        kvfree(area->pages);
        kfree(area);
}
EXPORT_SYMBOL(vfree);

/**
 * vunmap - release virtual mapping obtained by vmap()
 * @addr:   memory base address
 *
 * Free the virtually contiguous memory area starting at @addr,
 * which was created from the page array passed to vmap().
 *
 * Must not be called in interrupt context.
 */
void vunmap(const void *addr)
{
        struct vm_struct *area;

        BUG_ON(in_interrupt());
        might_sleep();

        if (!addr)
                return;

        area = remove_vm_area(addr);
        if (unlikely(!area)) {
                WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n",
                                addr);
                return;
        }

        /* Only the descriptor is freed here, not the mapped pages. */
        kfree(area);
}
EXPORT_SYMBOL(vunmap);

/**
 * vmap - map an array of pages into virtually contiguous space
 * @pages: array of page pointers
 * @count: number of pages to map
 * @flags: vm_area->flags
 * @prot: page protection for the mapping
 *
 * Maps @count pages from @pages into contiguous kernel virtual space.
 * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
 * (which must be kmalloc or vmalloc memory) and one reference per pages in it
 * are transferred from the caller to vmap(), and will be freed / dropped when
 * vfree() is called on the return value.
 *
 * Return: the address of the area or %NULL on failure
 */
void *vmap(struct page **pages, unsigned int count,
           unsigned long flags, pgprot_t prot)
{
        struct vm_struct *vm;
        unsigned long base;
        unsigned long nbytes;                /* In bytes */

        might_sleep();

        if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
                return NULL;

        /*
         * Your top guard doubles as someone else's bottom guard, so going
         * without one would compromise their mapping as well.
         */
        if (WARN_ON_ONCE(flags & VM_NO_GUARD))
                flags &= ~VM_NO_GUARD;

        if (count > totalram_pages())
                return NULL;

        nbytes = (unsigned long)count << PAGE_SHIFT;
        vm = get_vm_area_caller(nbytes, flags, __builtin_return_address(0));
        if (!vm)
                return NULL;

        base = (unsigned long)vm->addr;
        if (vmap_pages_range(base, base + nbytes, pgprot_nx(prot),
                                pages, PAGE_SHIFT) < 0) {
                vunmap(vm->addr);
                return NULL;
        }

        /* Hand the page array over so that vfree() can release it. */
        if (flags & VM_MAP_PUT_PAGES) {
                vm->pages = pages;
                vm->nr_pages = count;
        }

        return vm->addr;
}
EXPORT_SYMBOL(vmap);

#ifdef CONFIG_VMAP_PFN
/* Cursor state handed to vmap_pfn_apply() through apply_to_page_range(). */
struct vmap_pfn_data {
        /* PFN array being mapped */
        unsigned long        *pfns;
        /* protection used for every PTE that is installed */
        pgprot_t        prot;
        /* index into @pfns of the next PFN to map */
        unsigned int        idx;
};

/*
 * Per-PTE callback for vmap_pfn(): install a special PTE for the next PFN
 * of the array described by @private (a struct vmap_pfn_data) at @addr.
 *
 * Return: 0 on success, -EINVAL (with a one-time warning) if pfn_valid()
 * is true for the PFN.
 */
static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
{
        struct vmap_pfn_data *cursor = private;
        unsigned long pfn = cursor->pfns[cursor->idx];

        if (WARN_ON_ONCE(pfn_valid(pfn)))
                return -EINVAL;

        set_pte_at(&init_mm, addr, pte,
                   pte_mkspecial(pfn_pte(pfn, cursor->prot)));

        /* Advance to the PFN for the next PTE. */
        cursor->idx++;
        return 0;
}

/**
 * vmap_pfn - map an array of PFNs into virtually contiguous space
 * @pfns: array of PFNs
 * @count: number of pages to map
 * @prot: page protection for the mapping
 *
 * Maps @count PFNs from @pfns into contiguous kernel virtual space.
 *
 * Return: the start address of the mapping, or %NULL if the area could not
 * be reserved or a PFN could not be mapped.
 */
void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
{
        struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
        const unsigned long nbytes = count * PAGE_SIZE;
        struct vm_struct *vm;
        unsigned long base;

        vm = get_vm_area_caller(nbytes, VM_IOREMAP,
                        __builtin_return_address(0));
        if (!vm)
                return NULL;

        base = (unsigned long)vm->addr;
        if (apply_to_page_range(&init_mm, base, nbytes,
                        vmap_pfn_apply, &data)) {
                free_vm_area(vm);
                return NULL;
        }

        flush_cache_vmap(base, base + nbytes);

        return vm->addr;
}
EXPORT_SYMBOL_GPL(vmap_pfn);
#endif /* CONFIG_VMAP_PFN */

/*
 * vm_area_alloc_pages - fill a page array for a vmalloc area
 * @gfp: allocation flags
 * @nid: node to allocate from, or NUMA_NO_NODE
 * @order: order of each individual allocation
 * @nr_pages: number of PAGE_SIZE entries to fill in @pages
 * @pages: output array; receives one entry per PAGE_SIZE page
 *
 * Order-0 requests go through the bulk allocator first. Whatever is still
 * missing afterwards, and every request with a non-zero @order, is served
 * by the page-at-a-time loop. If @gfp contains __GFP_NOFAIL and a
 * non-zero-order allocation fails, that loop switches to order 0 for the
 * remainder, so the resulting array is then no longer made up of
 * order-sized physically contiguous runs.
 *
 * Return: the number of entries of @pages that were filled; this is less
 * than @nr_pages if an allocation failed or a fatal signal is pending.
 */
static inline unsigned int
vm_area_alloc_pages(gfp_t gfp, int nid,
                unsigned int order, unsigned int nr_pages, struct page **pages)
{
        unsigned int nr_allocated = 0;
        gfp_t alloc_gfp = gfp;
        bool nofail = gfp & __GFP_NOFAIL;
        struct page *page;
        int i;

        /*
         * For order-0 pages we make use of the bulk allocator. If
         * the page array is only partly populated, or not at all,
         * because of failures, fall back to the single page
         * allocator, which is more permissive.
         */
        if (!order) {
                /* bulk allocator doesn't support nofail req. officially */
                gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;

                while (nr_allocated < nr_pages) {
                        unsigned int nr, nr_pages_request;

                        /*
                         * A maximum allowed request is hard-coded and is 100
                         * pages per call. That is done in order to prevent a
                         * long preemption off scenario in the bulk-allocator
                         * so the range is [1:100].
                         */
                        nr_pages_request = min(100U, nr_pages - nr_allocated);

                        /*
                         * Memory allocation should honour the mempolicy: we
                         * must not simply use the nearest node when
                         * nid == NUMA_NO_NODE, otherwise all memory may come
                         * from a single node although the mempolicy asks for
                         * interleaving.
                         */
                        if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
                                nr = alloc_pages_bulk_array_mempolicy_noprof(bulk_gfp,
                                                        nr_pages_request,
                                                        pages + nr_allocated);

                        else
                                nr = alloc_pages_bulk_array_node_noprof(bulk_gfp, nid,
                                                        nr_pages_request,
                                                        pages + nr_allocated);

                        nr_allocated += nr;
                        cond_resched();

                        /*
                         * If zero or pages were obtained partly,
                         * fallback to a single page allocator.
                         */
                        if (nr != nr_pages_request)
                                break;
                }
        } else if (gfp & __GFP_NOFAIL) {
                /*
                 * Higher order nofail allocations are really expensive and
                 * potentially dangerous (pre-mature OOM, disruptive reclaim
                 * and compaction etc.), so the first attempt is made
                 * without __GFP_NOFAIL.
                 */
                alloc_gfp &= ~__GFP_NOFAIL;
        }

        /* High-order pages or fallback path if "bulk" fails. */
        while (nr_allocated < nr_pages) {
                /* Give up early on a fatal signal unless failure is forbidden. */
                if (!nofail && fatal_signal_pending(current))
                        break;

                if (nid == NUMA_NO_NODE)
                        page = alloc_pages_noprof(alloc_gfp, order);
                else
                        page = alloc_pages_node_noprof(nid, alloc_gfp, order);
                if (unlikely(!page)) {
                        if (!nofail)
                                break;

                        /* fall back to the zero order allocations */
                        alloc_gfp |= __GFP_NOFAIL;
                        order = 0;
                        continue;
                }

                /*
                 * Higher order allocations must be able to be treated as
                 * independent small pages by callers (as they can with
                 * small-page vmallocs). Some drivers do their own refcounting
                 * on vmalloc_to_page() pages, some use page->mapping,
                 * page->lru, etc.
                 */
                if (order)
                        split_page(page, order);

                /*
                 * Careful, we allocate and map page-order pages, but
                 * tracking is done per PAGE_SIZE page so as to keep the
                 * vm_struct APIs independent of the physical/mapped size.
                 */
                for (i = 0; i < (1U << order); i++)
                        pages[nr_allocated + i] = page + i;

                cond_resched();
                nr_allocated += 1U << order;
        }

        return nr_allocated;
}

/*
 * __vmalloc_area_node - allocate the backing pages of a vm area and map them
 * @area: vm area, already reserved, that is to be populated
 * @gfp_mask: flags for the page level allocator
 * @prot: protection used for the mapping
 * @page_shift: log2 of the size of each physically contiguous chunk that is
 *              allocated and mapped (PAGE_SHIFT for ordinary pages)
 * @node: node to allocate from, or NUMA_NO_NODE
 *
 * Allocates the page pointer array, fills it through vm_area_alloc_pages()
 * and maps the pages at @area->addr with a granularity of @page_shift.
 *
 * With @page_shift > PAGE_SHIFT the pages are allocated *without*
 * __GFP_NOFAIL, so a failed huge-page attempt makes this function fail
 * rather than return an array of unrelated order-0 pages that would then be
 * mapped as if they were contiguous huge pages.
 *
 * Return: @area->addr on success. On failure NULL is returned and @area has
 * been released (free_vm_area() or vfree()).
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                                 pgprot_t prot, unsigned int page_shift,
                                 int node)
{
        const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
        bool nofail = gfp_mask & __GFP_NOFAIL;
        unsigned long addr = (unsigned long)area->addr;
        unsigned long size = get_vm_area_size(area);
        unsigned long array_size;
        unsigned int nr_small_pages = size >> PAGE_SHIFT;
        unsigned int page_order;
        /* Only meaningful when one of the memalloc_*_save() calls ran. */
        unsigned int flags = 0;
        gfp_t alloc_gfp;
        int ret;

        array_size = (unsigned long)nr_small_pages * sizeof(struct page *);

        if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
                gfp_mask |= __GFP_HIGHMEM;

        /* Please note that the recursion is strictly bounded. */
        if (array_size > PAGE_SIZE) {
                area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node,
                                        area->caller);
        } else {
                area->pages = kmalloc_node_noprof(array_size, nested_gfp, node);
        }

        if (!area->pages) {
                warn_alloc(gfp_mask, NULL,
                        "vmalloc error: size %lu, failed to allocated page array size %lu",
                        nr_small_pages * PAGE_SIZE, array_size);
                free_vm_area(area);
                return NULL;
        }

        set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
        page_order = vm_area_page_order(area);

        /*
         * High-order __GFP_NOFAIL allocations are expensive and potentially
         * dangerous. More importantly, given __GFP_NOFAIL,
         * vm_area_alloc_pages() reacts to a failed high-order allocation by
         * silently continuing with order-0 pages, while the mapping below
         * would still use @page_shift and treat those pages as physically
         * contiguous. So never pass __GFP_NOFAIL for a high-order attempt:
         * let it fail and rely on the order-0 retry (see the failure
         * comment below), where __GFP_NOFAIL is honoured.
         */
        alloc_gfp = gfp_mask | __GFP_NOWARN;
        if (page_order)
                alloc_gfp &= ~__GFP_NOFAIL;

        area->nr_pages = vm_area_alloc_pages(alloc_gfp,
                node, page_order, nr_small_pages, area->pages);

        atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
        if (gfp_mask & __GFP_ACCOUNT) {
                int i;

                for (i = 0; i < area->nr_pages; i++)
                        mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
        }

        /*
         * If not enough pages were obtained to accomplish an
         * allocation request, free them via vfree() if any.
         */
        if (area->nr_pages != nr_small_pages) {
                /*
                 * vm_area_alloc_pages() can fail due to insufficient memory but
                 * also:-
                 *
                 * - a pending fatal signal
                 * - insufficient huge page-order pages
                 *
                 * Since we always retry allocations at order-0 in the huge page
                 * case a warning for either is spurious.
                 */
                if (!fatal_signal_pending(current) && page_order == 0)
                        warn_alloc(gfp_mask, NULL,
                                "vmalloc error: size %lu, failed to allocate pages",
                                area->nr_pages * PAGE_SIZE);
                goto fail;
        }

        /*
         * page tables allocations ignore external gfp mask, enforce it
         * by the scope API
         */
        if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
                flags = memalloc_nofs_save();
        else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
                flags = memalloc_noio_save();

        /* With __GFP_NOFAIL keep retrying the mapping until it succeeds. */
        do {
                ret = vmap_pages_range(addr, addr + size, prot, area->pages,
                        page_shift);
                if (nofail && (ret < 0))
                        schedule_timeout_uninterruptible(1);
        } while (nofail && (ret < 0));

        if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
                memalloc_nofs_restore(flags);
        else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
                memalloc_noio_restore(flags);

        if (ret < 0) {
                warn_alloc(gfp_mask, NULL,
                        "vmalloc error: size %lu, failed to map pages",
                        area->nr_pages * PAGE_SIZE);
                goto fail;
        }

        return area->addr;

fail:
        /* Releases the pages obtained so far, the page array and @area. */
        vfree(area->addr);
        return NULL;
}

/**
 * __vmalloc_node_range - allocate virtually contiguous memory
 * @size:                  allocation size
 * @align:                  desired alignment
 * @start:                  vm area range start
 * @end:                  vm area range end
 * @gfp_mask:                  flags for the page level allocator
 * @prot:                  protection mask for the allocated pages
 * @vm_flags:                  additional vm area flags (e.g. %VM_NO_GUARD)
 * @node:                  node to use for allocation or NUMA_NO_NODE
 * @caller:                  caller's return address
 *
 * Allocate enough pages to cover @size from the page level
 * allocator with @gfp_mask flags. Please note that the full set of gfp
 * flags is not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
 * supported.
 * Zone modifiers are not supported. From the reclaim modifiers
 * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
 * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
 * __GFP_RETRY_MAYFAIL are not supported).
 *
 * __GFP_NOWARN can be used to suppress failure messages.
 *
 * Map them into contiguous kernel virtual space, using a pagetable
 * protection of @prot.
 *
 * When huge mappings are allowed (vmap_allow_huge and %VM_ALLOW_HUGE_VMAP in
 * @vm_flags) a larger mapping shift is tried first; if that attempt fails
 * the allocation is retried with PAGE_SHIFT and the caller's original
 * @size and @align.
 *
 * Return: the address of the area or %NULL on failure
 */
void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
                        unsigned long start, unsigned long end, gfp_t gfp_mask,
                        pgprot_t prot, unsigned long vm_flags, int node,
                        const void *caller)
{
        struct vm_struct *area;
        void *ret;
        kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
        /*
         * Keep the caller's original size and alignment: the huge-mapping
         * setup below may enlarge both, and the fail path restores them.
         */
        unsigned long real_size = size;
        unsigned long real_align = align;
        unsigned int shift = PAGE_SHIFT;

        /* A zero-sized request is a caller bug: warn once and refuse. */
        if (WARN_ON_ONCE(!size))
                return NULL;

        /* Refuse requests that need more pages than totalram_pages() reports. */
        if ((size >> PAGE_SHIFT) > totalram_pages()) {
                warn_alloc(gfp_mask, NULL,
                        "vmalloc error: size %lu, exceeds total pages",
                        real_size);
                return NULL;
        }

        if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
                unsigned long size_per_node;

                /*
                 * Try huge pages. Only try for PAGE_KERNEL allocations,
                 * others like modules don't yet expect huge pages in
                 * their allocations due to apply_to_page_range not
                 * supporting them.
                 */

                /*
                 * With no node preference the size is divided by the number
                 * of online nodes before choosing the mapping shift.
                 */
                size_per_node = size;
                if (node == NUMA_NO_NODE)
                        size_per_node /= num_online_nodes();
                if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
                        shift = PMD_SHIFT;
                else
                        shift = arch_vmap_pte_supported_shift(size_per_node);

                /* Round alignment and size up to the chosen mapping size. */
                align = max(real_align, 1UL << shift);
                size = ALIGN(real_size, 1UL << shift);
        }

again:
        /*
         * Set up the vm_struct for the request. Note that real_size, not
         * the rounded-up size, is what gets passed here.
         */
        area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
                                  VM_UNINITIALIZED | vm_flags, start, end, node,
                                  gfp_mask, caller);
        if (!area) {
                bool nofail = gfp_mask & __GFP_NOFAIL;
                warn_alloc(gfp_mask, NULL,
                        "vmalloc error: size %lu, vm_struct allocation failed%s",
                        real_size, (nofail) ? ". Retrying." : "");
                /* __GFP_NOFAIL: sleep briefly and try again, never give up. */
                if (nofail) {
                        schedule_timeout_uninterruptible(1);
                        goto again;
                }
                goto fail;
        }

        /*
         * Prepare arguments for __vmalloc_area_node() and
         * kasan_unpoison_vmalloc().
         */
        if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
                if (kasan_hw_tags_enabled()) {
                        /*
                         * Modify protection bits to allow tagging.
                         * This must be done before mapping.
                         */
                        prot = arch_vmap_pgprot_tagged(prot);

                        /*
                         * Skip page_alloc poisoning and zeroing for physical
                         * pages backing VM_ALLOC mapping. Memory is instead
                         * poisoned and zeroed by kasan_unpoison_vmalloc().
                         */
                        gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO;
                }

                /* Take note that the mapping is PAGE_KERNEL. */
                kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
        }

        /* Allocate physical pages and map them into vmalloc space. */
        ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
        /*
         * NOTE(review): nothing is freed here on failure; presumably
         * __vmalloc_area_node() releases @area itself - confirm.
         */
        if (!ret)
                goto fail;

        /*
         * Mark the pages as accessible, now that they are mapped.
         * The condition for setting KASAN_VMALLOC_INIT should complement the
         * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
         * to make sure that memory is initialized under the same conditions.
         * Tag-based KASAN modes only assign tags to normal non-executable
         * allocations, see __kasan_unpoison_vmalloc().
         */
        kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
        if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
            (gfp_mask & __GFP_SKIP_ZERO))
                kasan_flags |= KASAN_VMALLOC_INIT;
        /* KASAN_VMALLOC_PROT_NORMAL already set if required. */
        area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);

        /*
         * In this function, newly allocated vm_struct has VM_UNINITIALIZED
         * flag. It means that vm_struct is not fully initialized.
         * Now, it is fully initialized, so remove this flag here.
         */
        clear_vm_uninitialized_flag(area);

        /* Report the page-aligned size to kmemleak unless the caller deferred it. */
        size = PAGE_ALIGN(size);
        if (!(vm_flags & VM_DEFER_KMEMLEAK))
                kmemleak_vmalloc(area, size, gfp_mask);

        return area->addr;

fail:
        /*
         * If a mapping larger than PAGE_SHIFT was attempted, fall back to
         * base pages with the caller's original size and alignment and try
         * again. On that retry shift equals PAGE_SHIFT, so a further
         * failure returns NULL.
         */
        if (shift > PAGE_SHIFT) {
                shift = PAGE_SHIFT;
                align = real_align;
                size = real_size;
                goto again;
        }

        return NULL;
}

/**
 * __vmalloc_node - allocate virtually contiguous memory
 * @size:     allocation size
 * @align:    desired alignment
 * @gfp_mask: flags for the page level allocator
 * @node:     node to use for allocation or NUMA_NO_NODE
 * @caller:   caller's return address
 *
 * Allocate enough pages to cover @size from the page level allocator with
 * @gfp_mask flags and map them into contiguous kernel virtual space. This
 * is a thin wrapper around __vmalloc_node_range_noprof() that uses the
 * VMALLOC_START..VMALLOC_END range, PAGE_KERNEL protection and no
 * additional vm area flags.
 *
 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
 * and __GFP_NOFAIL are not supported
 *
 * Any use of gfp flags outside of GFP_KERNEL should be consulted
 * with mm people.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *__vmalloc_node_noprof(unsigned long size, unsigned long align,
                            gfp_t gfp_mask, int node, const void *caller)
{
        void *mem;

        mem = __vmalloc_node_range_noprof(size, align,
                                          VMALLOC_START, VMALLOC_END,
                                          gfp_mask, PAGE_KERNEL,
                                          0, node, caller);
        return mem;
}
/*
 * Exported only when the vmalloc test module is configured: it is meant for
 * performance analysis and stress testing of vmalloc, so do not use it for
 * anything else.
 */
#ifdef CONFIG_TEST_VMALLOC_MODULE
EXPORT_SYMBOL_GPL(__vmalloc_node_noprof);
#endif

/*
 * Allocate @size bytes with caller-chosen @gfp_mask, no node preference and
 * an alignment argument of 1; the immediate caller is recorded as the
 * allocation's origin.
 */
void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(__vmalloc_noprof);

/**
 * vmalloc - allocate virtually contiguous memory
 * @size:    allocation size
 *
 * Allocate enough pages to cover @size from the page level allocator
 * (using GFP_KERNEL, no node preference) and map them into contiguous
 * kernel virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_noprof(unsigned long size)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(vmalloc_noprof);

/**
 * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
 * @size:      allocation size
 * @gfp_mask:  flags for the page level allocator
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 * The request is made with %VM_ALLOW_HUGE_VMAP, so if @size is greater
 * than or equal to PMD_SIZE, huge pages may be used for the memory.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_range_noprof(size, 1,
                                           VMALLOC_START, VMALLOC_END,
                                           gfp_mask, PAGE_KERNEL,
                                           VM_ALLOW_HUGE_VMAP, NUMA_NO_NODE,
                                           caller);
}
EXPORT_SYMBOL_GPL(vmalloc_huge_noprof);

/**
 * vzalloc - allocate virtually contiguous memory with zero fill
 * @size:    allocation size
 *
 * Same as vmalloc() except that __GFP_ZERO is added to the GFP_KERNEL
 * flags, so the returned memory is set to zero.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vzalloc_noprof(unsigned long size)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO,
                                     NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(vzalloc_noprof);

/**
 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
 * @size: allocation size
 *
 * The area is requested with SHMLBA alignment and the %VM_USERMAP flag, and
 * the memory is zeroed (__GFP_ZERO) so it can be mapped to userspace
 * without leaking data.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_user_noprof(unsigned long size)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_range_noprof(size, SHMLBA,
                                           VMALLOC_START, VMALLOC_END,
                                           GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
                                           VM_USERMAP, NUMA_NO_NODE,
                                           caller);
}
EXPORT_SYMBOL(vmalloc_user_noprof);

/**
 * vmalloc_node - allocate memory on a specific node
 * @size:          allocation size
 * @node:          numa node
 *
 * Allocate enough pages to cover @size from the page level allocator
 * (GFP_KERNEL, requesting @node) and map them into contiguous kernel
 * virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_node_noprof(unsigned long size, int node)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node, caller);
}
EXPORT_SYMBOL(vmalloc_node_noprof);

/**
 * vzalloc_node - allocate memory on a specific node with zero fill
 * @size:        allocation size
 * @node:        numa node
 *
 * Same as vmalloc_node() except that __GFP_ZERO is added to the GFP_KERNEL
 * flags, so the returned memory is set to zero.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vzalloc_node_noprof(unsigned long size, int node)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node,
                                     caller);
}
EXPORT_SYMBOL(vzalloc_node_noprof);

/*
 * GFP flags used for the 32bit-addressable vmalloc variants.
 *
 * GFP_DMA is chosen only on 64b configurations that have a DMA zone but no
 * DMA32 zone. Every other configuration uses GFP_DMA32: 64b systems should
 * always have either DMA or DMA32 zones, and for others GFP_DMA32 should do
 * the right thing and use the normal zone.
 */
#if defined(CONFIG_64BIT) && !defined(CONFIG_ZONE_DMA32) && \
    defined(CONFIG_ZONE_DMA)
#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
#else
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#endif

/**
 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
 * @size:        allocation size
 *
 * Allocate enough 32bit PA addressable pages (GFP_VMALLOC32) to cover @size
 * from the page level allocator and map them into contiguous kernel
 * virtual space.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_32_noprof(unsigned long size)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
                                     caller);
}
EXPORT_SYMBOL(vmalloc_32_noprof);

/**
 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
 * @size:             allocation size
 *
 * The area is requested with GFP_VMALLOC32 | __GFP_ZERO, SHMLBA alignment
 * and the %VM_USERMAP flag: the resulting memory is 32bit addressable and
 * zeroed so it can be mapped to userspace without leaking data.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_32_user_noprof(unsigned long size)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_range_noprof(size, SHMLBA,
                                           VMALLOC_START, VMALLOC_END,
                                           GFP_VMALLOC32 | __GFP_ZERO,
                                           PAGE_KERNEL,
                                           VM_USERMAP, NUMA_NO_NODE,
                                           caller);
}
EXPORT_SYMBOL(vmalloc_32_user_noprof);

/*
 * Zero-fill up to @count bytes of the iterator by copying from the zero
 * page with copy_page_to_iter_nofault(), at most PAGE_SIZE bytes per step.
 * Stops early as soon as one step copies less than requested.
 *
 * Returns the number of zeroed bytes.
 */
static size_t zero_iter(struct iov_iter *iter, size_t count)
{
        size_t done = 0;

        while (done < count) {
                size_t chunk = min_t(size_t, count - done, PAGE_SIZE);
                size_t got;

                got = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, chunk, iter);
                done += got;

                /* Short copy: the iterator could not take any more. */
                if (got < chunk)
                        break;
        }

        return done;
}

/*
 * Copy @count bytes starting at vmalloc address @addr into the iterator,
 * one page (or partial page) at a time. Any page that vmalloc_to_page()
 * cannot resolve is emitted as zeroes instead.
 *
 * Returns the number of copied bytes, which is less than @count if a step
 * came up short.
 */
static size_t aligned_vread_iter(struct iov_iter *iter,
                                 const char *addr, size_t count)
{
        size_t done = 0;

        while (done < count) {
                unsigned long offset = offset_in_page(addr);
                unsigned long length = PAGE_SIZE - offset;
                struct page *page;
                size_t copied;

                /* Never go past the end of the request. */
                if (length > count - done)
                        length = count - done;

                page = vmalloc_to_page(addr);
                /*
                 * Accessing this _mapped_ area safely would need a lock,
                 * but taking one here would add overhead to every
                 * vmalloc()/vfree() call for the sake of a rarely used
                 * _debug_ interface. Instead the page is read through
                 * copy_page_to_iter_nofault(), accepting a small overhead
                 * in this access function.
                 */
                if (!page)
                        copied = zero_iter(iter, length);
                else
                        copied = copy_page_to_iter_nofault(page, offset,
                                                           length, iter);

                addr += copied;
                done += copied;

                if (copied != length)
                        break;
        }

        return done;
}

/*
 * Read from a vm_map_ram region of memory.
 *
 * @iter:  destination iterator
 * @addr:  address inside the vm_map_ram area to start reading at
 * @count: number of bytes to read
 * @flags: the vmap_area flags of the area; VMAP_BLOCK selects the
 *         vmap_block-managed path
 *
 * Used ranges of a vmap_block are copied out; everything else is
 * zero-filled.
 *
 * Returns the number of copied bytes.
 */
static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
                                  size_t count, unsigned long flags)
{
        char *start;
        struct vmap_block *vb;
        struct xarray *xa;
        unsigned long offset;
        unsigned int rs, re;
        size_t remains, n;

        /*
         * If it's area created by vm_map_ram() interface directly, but
         * not further subdividing and delegating management to vmap_block,
         * handle it here.
         */
        if (!(flags & VMAP_BLOCK))
                return aligned_vread_iter(iter, addr, count);

        remains = count;

        /*
         * Area is split into regions and tracked with vmap_block, read out
         * each region and zero fill the hole between regions.
         */
        xa = addr_to_vb_xa((unsigned long) addr);
        vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr));
        /* No vmap_block found for this address: the whole range reads as zero. */
        if (!vb)
                goto finished_zero;

        spin_lock(&vb->lock);
        /* Nothing in use in this block: drop the lock and zero-fill everything. */
        if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
                spin_unlock(&vb->lock);
                goto finished_zero;
        }

        /* Walk every run of set bits in used_map while holding vb->lock. */
        for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
                size_t copied;

                if (remains == 0)
                        goto finished;

                start = vmap_block_vaddr(vb->va->va_start, rs);

                /* Zero-fill the gap between the read position and this run. */
                if (addr < start) {
                        size_t to_zero = min_t(size_t, start - addr, remains);
                        size_t zeroed = zero_iter(iter, to_zero);

                        addr += zeroed;
                        remains -= zeroed;

                        if (remains == 0 || zeroed != to_zero)
                                goto finished;
                }

                /*it could start reading from the middle of used region*/
                /*
                 * NOTE(review): the page count below is (re - rs + 1). If
                 * for_each_set_bitrange() reports @re as the first clear bit
                 * (exclusive end), this covers one page more than the used
                 * run - confirm against the bitmap helper's documentation.
                 */
                offset = offset_in_page(addr);
                n = ((re - rs + 1) << PAGE_SHIFT) - offset;
                if (n > remains)
                        n = remains;

                copied = aligned_vread_iter(iter, start + offset, n);

                addr += copied;
                remains -= copied;

                /* Short copy: stop and report what was transferred so far. */
                if (copied != n)
                        goto finished;
        }

        spin_unlock(&vb->lock);

        /* Reached without vb->lock held (either never taken or already dropped). */
finished_zero:
        /* zero-fill the left dirty or free regions */
        return count - remains + zero_iter(iter, remains);
        /* Reached from inside the loop, so vb->lock is still held here. */
finished:
        /* We couldn't copy/zero everything */
        spin_unlock(&vb->lock);
        return count - remains;
}

/**
 * vread_iter() - read vmalloc area in a safe way to an iterator.
 * @iter:         the iterator to which data should be written.
 * @addr:         vm address.
 * @count:        number of bytes to be read.
 *
 * This function walks the vmap areas that follow @addr and copies data
 * from them to @iter. If the given memory range of [addr...addr+count)
 * includes some valid address, data is copied to the matching position in
 * @iter. If there are memory holes, they'll be zero-filled.
 * IOREMAP and SPARSE areas are treated as memory holes: they are
 * zero-filled and no copy is done.
 *
 * If [addr...addr+count) doesn't intersect with any alive vmap area, the
 * whole range is zero-filled.
 *
 * Note: In usual ops, vread() is never necessary because the caller
 * should know vmalloc() area is valid and can use memcpy().
 * This is for routines which have to access vmalloc area without
 * any information, as /proc/kcore.
 *
 * Return: number of bytes written to @iter (copied or zero-filled). This
 * is less than @count when a copy or zero-fill into @iter comes up short.
 */
long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        struct vm_struct *vm;
        char *vaddr;
        size_t n, size, flags, remains;
        unsigned long next;

        addr = kasan_reset_tag(addr);

        /* Don't allow overflow */
        /* Clamp @count so that addr + count does not wrap around. */
        if ((unsigned long) addr + count < count)
                count = -(unsigned long) addr;

        remains = count;

        /*
         * A non-NULL @vn comes back with vn->busy.lock held: every exit
         * path below that still has a @vn unlocks it.
         */
        vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va);
        if (!vn)
                goto finished_zero;

        /* no intersects with alive vmap_area */
        if ((unsigned long)addr + remains <= va->va_start)
                goto finished_zero;

        do {
                size_t copied;

                if (remains == 0)
                        goto finished;

                vm = va->vm;
                flags = va->flags & VMAP_FLAGS_MASK;
                /*
                 * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need
                 * be set together with VMAP_RAM.
                 */
                WARN_ON(flags == VMAP_BLOCK);

                /* Neither a vm_struct nor vm_map_ram flags: nothing to read here. */
                if (!vm && !flags)
                        goto next_va;

                /* Skip areas whose vm_struct is still being set up. */
                if (vm && (vm->flags & VM_UNINITIALIZED))
                        goto next_va;

                /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
                smp_rmb();

                vaddr = (char *) va->va_start;
                size = vm ? get_vm_area_size(vm) : va_size(va);

                /* The read position is already past this area. */
                if (addr >= vaddr + size)
                        goto next_va;

                /* Zero-fill the hole between the read position and this area. */
                if (addr < vaddr) {
                        size_t to_zero = min_t(size_t, vaddr - addr, remains);
                        size_t zeroed = zero_iter(iter, to_zero);

                        addr += zeroed;
                        remains -= zeroed;

                        if (remains == 0 || zeroed != to_zero)
                                goto finished;
                }

                /* Bytes of this area from @addr to its end, capped at @remains. */
                n = vaddr + size - addr;
                if (n > remains)
                        n = remains;

                if (flags & VMAP_RAM)
                        copied = vmap_ram_vread_iter(iter, addr, n, flags);
                else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE))))
                        copied = aligned_vread_iter(iter, addr, n);
                else /* IOREMAP | SPARSE area is treated as memory hole */
                        copied = zero_iter(iter, n);

                addr += copied;
                remains -= copied;

                if (copied != n)
                        goto finished;

        next_va:
                /*
                 * Drop the current node's lock before looking up the next
                 * area, starting the search from this area's end.
                 */
                next = va->va_end;
                spin_unlock(&vn->busy.lock);
        } while ((vn = find_vmap_area_exceed_addr_lock(next, &va)));

finished_zero:
        if (vn)
                spin_unlock(&vn->busy.lock);

        /* zero-fill memory holes */
        return count - remains + zero_iter(iter, remains);
finished:
        /* Nothing remains, or We couldn't copy/zero everything. */
        if (vn)
                spin_unlock(&vn->busy.lock);

        return count - remains;
}

/**
 * remap_vmalloc_range_partial - map vmalloc pages to userspace
 * @vma:                vma to cover
 * @uaddr:                target user address to start at
 * @kaddr:                virtual address of vmalloc kernel memory
 * @pgoff:                offset from @kaddr to start at
 * @size:                size of map area
 *
 * Returns:        0 for success, -Exxx on failure
 *
 * This function checks that @kaddr is a valid vmalloc'ed area,
 * and that it is big enough to cover the range starting at
 * @uaddr in @vma. Will return failure if that criteria isn't
 * met.
 *
 * @size is rounded up to a whole number of pages. A request that is empty
 * after rounding (zero @size, or a @size so large that the rounding wraps
 * to zero) is rejected with -EINVAL. @uaddr and @kaddr must be page
 * aligned, and the area must carry %VM_USERMAP or %VM_DMA_COHERENT.
 *
 * Similar to remap_pfn_range() (see mm/memory.c)
 */
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
                                void *kaddr, unsigned long pgoff,
                                unsigned long size)
{
        struct vm_struct *area;
        unsigned long off;
        unsigned long end_index;

        /* Byte offset into the area; reject a @pgoff that overflows it. */
        if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
                return -EINVAL;

        size = PAGE_ALIGN(size);

        /*
         * The insertion loop below always maps at least one page and counts
         * @size down in an unsigned variable. Entering it with a zero size
         * would wrap the counter and keep inserting pages beyond the range
         * validated here, so refuse an empty (or wrapped) request up front.
         */
        if (!size)
                return -EINVAL;

        if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
                return -EINVAL;

        area = find_vm_area(kaddr);
        if (!area)
                return -EINVAL;

        /* Only areas explicitly marked as user-mappable may be remapped. */
        if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
                return -EINVAL;

        /* [off, off + size) must lie entirely inside the vmalloc area. */
        if (check_add_overflow(size, off, &end_index) ||
            end_index > get_vm_area_size(area))
                return -EINVAL;
        kaddr += off;

        /* @size is a non-zero multiple of PAGE_SIZE here, so this terminates. */
        do {
                struct page *page = vmalloc_to_page(kaddr);
                int ret;

                ret = vm_insert_page(vma, uaddr, page);
                if (ret)
                        return ret;

                uaddr += PAGE_SIZE;
                kaddr += PAGE_SIZE;
                size -= PAGE_SIZE;
        } while (size > 0);

        vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);

        return 0;
}

/**
 * remap_vmalloc_range - map vmalloc pages to userspace
 * @vma:                vma to cover (map full range of vma)
 * @addr:                vmalloc memory
 * @pgoff:                number of pages into addr before first page to map
 *
 * Returns:        0 for success, -Exxx on failure
 *
 * Maps the whole of @vma, from vma->vm_start to vma->vm_end, by handing the
 * request to remap_vmalloc_range_partial(). That function checks that @addr
 * is a valid vmalloc'ed area and that it is big enough to cover the vma,
 * and fails if those criteria aren't met.
 *
 * Similar to remap_pfn_range() (see mm/memory.c)
 */
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
                                                unsigned long pgoff)
{
        unsigned long uaddr = vma->vm_start;
        unsigned long len = vma->vm_end - vma->vm_start;

        return remap_vmalloc_range_partial(vma, uaddr, addr, pgoff, len);
}
EXPORT_SYMBOL(remap_vmalloc_range);

/*
 * Remove the vm area at area->addr and free its vm_struct. It is a BUG if
 * remove_vm_area() does not hand back the very same @area.
 */
void free_vm_area(struct vm_struct *area)
{
        struct vm_struct *removed = remove_vm_area(area->addr);

        BUG_ON(removed != area);
        kfree(area);
}
EXPORT_SYMBOL_GPL(free_vm_area);

#ifdef CONFIG_SMP
/* Convert an rb_node into its containing vmap_area via rb_entry_safe(). */
static struct vmap_area *node_to_va(struct rb_node *n)
{
        struct vmap_area *va = rb_entry_safe(n, struct vmap_area, rb_node);

        return va;
}

/**
 * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
 * @addr: target address
 *
 * Searches the free vmap area tree.
 *
 * Returns: the vmap_area with va_start <= @addr <= va_end if there is one.
 *   Otherwise the last area visited on the way down whose va_start is not
 *   above @addr (such an area ends below @addr), or NULL if no visited
 *   area starts at or below @addr.
 */
static struct vmap_area *
pvm_find_va_enclose_addr(unsigned long addr)
{
        struct rb_node *node = free_vmap_area_root.rb_node;
        struct vmap_area *best = NULL;

        while (node) {
                struct vmap_area *cur;

                cur = rb_entry(node, struct vmap_area, rb_node);

                /* Starts above @addr: only the left subtree can match. */
                if (cur->va_start > addr) {
                        node = node->rb_left;
                        continue;
                }

                /* Starts at or below @addr: best candidate so far. */
                best = cur;
                if (cur->va_end >= addr)
                        break;

                node = node->rb_right;
        }

        return best;
}

/**
 * pvm_determine_end_from_reverse - find the highest aligned address
 * of free block below VMALLOC_END
 * @va:
 *   in - the VA we start the search(reverse order);
 *   out - the VA with the highest aligned end address.
 * @align: alignment for required highest address
 *
 * Walks the free vmap area list backwards from *@va. For each area the end
 * address is aligned down to @align and capped at the aligned-down
 * VMALLOC_END; the first area that still starts below that address wins.
 *
 * Returns: determined end address within vmap_area, or 0 if *@va is NULL
 *   or no area on the walk qualifies
 */
static unsigned long
pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
{
        unsigned long ceiling = VMALLOC_END & ~(align - 1);
        unsigned long candidate;

        /* Nothing to walk when the caller has no starting VA. */
        if (!likely(*va))
                return 0;

        list_for_each_entry_from_reverse((*va),
                        &free_vmap_area_list, list) {
                candidate = min((*va)->va_end & ~(align - 1), ceiling);
                if (candidate > (*va)->va_start)
                        return candidate;
        }

        return 0;
}

/**
 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
 * @offsets: array containing offset of each area
 * @sizes: array containing size of each area
 * @nr_vms: the number of areas to allocate
 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
 *
 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
 *            vm_structs on success, %NULL on failure
 *
 * Percpu allocator wants to use congruent vm areas so that it can
 * maintain the offsets among percpu areas.  This function allocates
 * congruent vmalloc areas for it with GFP_KERNEL.  These areas tend to
 * be scattered pretty far, distance between two areas easily going up
 * to gigabytes.  To avoid interacting with regular vmallocs, these
 * areas are allocated from top.
 *
 * Despite its complicated look, this allocator is rather simple. It
 * does everything top-down and scans free blocks from the end looking
 * for matching base. While scanning, if any of the areas do not fit the
 * base address is pulled down to fit the area. Scanning is repeated till
 * all the areas fit and then all necessary data structures are inserted
 * and the result is returned.
 */
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
                                     const size_t *sizes, int nr_vms,
                                     size_t align)
{
        const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
        const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
        struct vmap_area **vas, *va;
        struct vm_struct **vms;
        int area, area2, last_area, term_area;
        unsigned long base, start, size, end, last_end, orig_start, orig_end;
        bool purged = false;

        /* verify parameters and allocate data structures */
        BUG_ON(offset_in_page(align) || !is_power_of_2(align));
        for (last_area = 0, area = 0; area < nr_vms; area++) {
                start = offsets[area];
                end = start + sizes[area];

                /* is everything aligned properly? */
                BUG_ON(!IS_ALIGNED(offsets[area], align));
                BUG_ON(!IS_ALIGNED(sizes[area], align));

                /* detect the area with the highest address */
                if (start > offsets[last_area])
                        last_area = area;

                for (area2 = area + 1; area2 < nr_vms; area2++) {
                        unsigned long start2 = offsets[area2];
                        unsigned long end2 = start2 + sizes[area2];

                        BUG_ON(start2 < end && start < end2);
                }
        }
        last_end = offsets[last_area] + sizes[last_area];

        if (vmalloc_end - vmalloc_start < last_end) {
                WARN_ON(true);
                return NULL;
        }

        vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
        vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
        if (!vas || !vms)
                goto err_free2;

        for (area = 0; area < nr_vms; area++) {
                vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
                vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
                if (!vas[area] || !vms[area])
                        goto err_free;
        }
retry:
        spin_lock(&free_vmap_area_lock);

        /* start scanning - we scan from the top, begin with the last area */
        area = term_area = last_area;
        start = offsets[area];
        end = start + sizes[area];

        va = pvm_find_va_enclose_addr(vmalloc_end);
        base = pvm_determine_end_from_reverse(&va, align) - end;

        while (true) {
                /*
                 * base might have underflowed, add last_end before
                 * comparing.
                 */
                if (base + last_end < vmalloc_start + last_end)
                        goto overflow;

                /*
                 * Fitting base has not been found.
                 */
                if (va == NULL)
                        goto overflow;

                /*
                 * If required width exceeds current VA block, move
                 * base downwards and then recheck.
                 */
                if (base + end > va->va_end) {
                        base = pvm_determine_end_from_reverse(&va, align) - end;
                        term_area = area;
                        continue;
                }

                /*
                 * If this VA does not fit, move base downwards and recheck.
                 */
                if (base + start < va->va_start) {
                        va = node_to_va(rb_prev(&va->rb_node));
                        base = pvm_determine_end_from_reverse(&va, align) - end;
                        term_area = area;
                        continue;
                }

                /*
                 * This area fits, move on to the previous one.  If
                 * the previous one is the terminal one, we're done.
                 */
                area = (area + nr_vms - 1) % nr_vms;
                if (area == term_area)
                        break;

                start = offsets[area];
                end = start + sizes[area];
                va = pvm_find_va_enclose_addr(base + end);
        }

        /* we've found a fitting base, insert all va's */
        for (area = 0; area < nr_vms; area++) {
                int ret;

                start = base + offsets[area];
                size = sizes[area];

                va = pvm_find_va_enclose_addr(start);
                if (WARN_ON_ONCE(va == NULL))
                        /* It is a BUG(), but trigger recovery instead. */
                        goto recovery;

                ret = va_clip(&free_vmap_area_root,
                        &free_vmap_area_list, va, start, size);
                if (WARN_ON_ONCE(unlikely(ret)))
                        /* It is a BUG(), but trigger recovery instead. */
                        goto recovery;

                /* Allocated area. */
                va = vas[area];
                va->va_start = start;
                va->va_end = start + size;
        }

        spin_unlock(&free_vmap_area_lock);

        /* populate the kasan shadow space */
        for (area = 0; area < nr_vms; area++) {
                if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
                        goto err_free_shadow;
        }

        /* insert all vm's */
        for (area = 0; area < nr_vms; area++) {
                struct vmap_node *vn = addr_to_node(vas[area]->va_start);

                spin_lock(&vn->busy.lock);
                insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
                setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
                                 pcpu_get_vm_areas);
                spin_unlock(&vn->busy.lock);
        }

        /*
         * Mark allocated areas as accessible. Do it now as a best-effort
         * approach, as they can be mapped outside of vmalloc code.
         * With hardware tag-based KASAN, marking is skipped for
         * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
         */
        for (area = 0; area < nr_vms; area++)
                vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
                                vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);

        kfree(vas);
        return vms;

recovery:
        /*
         * Remove previously allocated areas. There is no
         * need in removing these areas from the busy tree,
         * because they are inserted only on the final step
         * and when pcpu_get_vm_areas() is success.
         */
        while (area--) {
                orig_start = vas[area]->va_start;
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
                                &free_vmap_area_list);
                if (va)
                        kasan_release_vmalloc(orig_start, orig_end,
                                va->va_start, va->va_end);
                vas[area] = NULL;
        }

overflow:
        spin_unlock(&free_vmap_area_lock);
        if (!purged) {
                reclaim_and_purge_vmap_areas();
                purged = true;

                /* Before "retry", check if we recover. */
                for (area = 0; area < nr_vms; area++) {
                        if (vas[area])
                                continue;

                        vas[area] = kmem_cache_zalloc(
                                vmap_area_cachep, GFP_KERNEL);
                        if (!vas[area])
                                goto err_free;
                }

                goto retry;
        }

err_free:
        for (area = 0; area < nr_vms; area++) {
                if (vas[area])
                        kmem_cache_free(vmap_area_cachep, vas[area]);

                kfree(vms[area]);
        }
err_free2:
        kfree(vas);
        kfree(vms);
        return NULL;

err_free_shadow:
        spin_lock(&free_vmap_area_lock);
        /*
         * We release all the vmalloc shadows, even the ones for regions that
         * hadn't been successfully added. This relies on kasan_release_vmalloc
         * being able to tolerate this case.
         */
        for (area = 0; area < nr_vms; area++) {
                orig_start = vas[area]->va_start;
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
                                &free_vmap_area_list);
                if (va)
                        kasan_release_vmalloc(orig_start, orig_end,
                                va->va_start, va->va_end);
                vas[area] = NULL;
                kfree(vms[area]);
        }
        spin_unlock(&free_vmap_area_lock);
        kfree(vas);
        kfree(vms);
        return NULL;
}

/**
 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
 * @nr_vms: the number of allocated areas
 *
 * Hands every vm_struct in @vms to free_vm_area() and then releases the
 * pointer array itself with kfree().
 */
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
{
        int area = 0;

        /* Release each area first; the array that tracks them goes last. */
        while (area < nr_vms) {
                free_vm_area(vms[area]);
                area++;
        }

        kfree(vms);
}
#endif        /* CONFIG_SMP */

#ifdef CONFIG_PRINTK
bool vmalloc_dump_obj(void *object)
{
        const void *caller;
        struct vm_struct *vm;
        struct vmap_area *va;
        struct vmap_node *vn;
        unsigned long addr;
        unsigned int nr_pages;

        addr = PAGE_ALIGN((unsigned long) object);
        vn = addr_to_node(addr);

        if (!spin_trylock(&vn->busy.lock))
                return false;

        va = __find_vmap_area(addr, &vn->busy.root);
        if (!va || !va->vm) {
                spin_unlock(&vn->busy.lock);
                return false;
        }

        vm = va->vm;
        addr = (unsigned long) vm->addr;
        caller = vm->caller;
        nr_pages = vm->nr_pages;
        spin_unlock(&vn->busy.lock);

        pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
                nr_pages, addr, caller);

        return true;
}
#endif

#ifdef CONFIG_PROC_FS
static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
        if (IS_ENABLED(CONFIG_NUMA)) {
                unsigned int nr, *counters = m->private;
                unsigned int step = 1U << vm_area_page_order(v);

                if (!counters)
                        return;

                if (v->flags & VM_UNINITIALIZED)
                        return;
                /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
                smp_rmb();

                memset(counters, 0, nr_node_ids * sizeof(unsigned int));

                for (nr = 0; nr < v->nr_pages; nr += step)
                        counters[page_to_nid(v->pages[nr])] += step;
                for_each_node_state(nr, N_HIGH_MEMORY)
                        if (counters[nr])
                                seq_printf(m, " N%u=%u", nr, counters[nr]);
        }
}

static void show_purge_info(struct seq_file *m)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        int i;

        for (i = 0; i < nr_vmap_nodes; i++) {
                vn = &vmap_nodes[i];

                spin_lock(&vn->lazy.lock);
                list_for_each_entry(va, &vn->lazy.head, list) {
                        seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
                                (void *)va->va_start, (void *)va->va_end,
                                va->va_end - va->va_start);
                }
                spin_unlock(&vn->lazy.lock);
        }
}

static int vmalloc_info_show(struct seq_file *m, void *p)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        struct vm_struct *v;
        int i;

        for (i = 0; i < nr_vmap_nodes; i++) {
                vn = &vmap_nodes[i];

                spin_lock(&vn->busy.lock);
                list_for_each_entry(va, &vn->busy.head, list) {
                        if (!va->vm) {
                                if (va->flags & VMAP_RAM)
                                        seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
                                                (void *)va->va_start, (void *)va->va_end,
                                                va->va_end - va->va_start);

                                continue;
                        }

                        v = va->vm;

                        seq_printf(m, "0x%pK-0x%pK %7ld",
                                v->addr, v->addr + v->size, v->size);

                        if (v->caller)
                                seq_printf(m, " %pS", v->caller);

                        if (v->nr_pages)
                                seq_printf(m, " pages=%d", v->nr_pages);

                        if (v->phys_addr)
                                seq_printf(m, " phys=%pa", &v->phys_addr);

                        if (v->flags & VM_IOREMAP)
                                seq_puts(m, " ioremap");

                        if (v->flags & VM_SPARSE)
                                seq_puts(m, " sparse");

                        if (v->flags & VM_ALLOC)
                                seq_puts(m, " vmalloc");

                        if (v->flags & VM_MAP)
                                seq_puts(m, " vmap");

                        if (v->flags & VM_USERMAP)
                                seq_puts(m, " user");

                        if (v->flags & VM_DMA_COHERENT)
                                seq_puts(m, " dma-coherent");

                        if (is_vmalloc_addr(v->pages))
                                seq_puts(m, " vpages");

                        show_numa_info(m, v);
                        seq_putc(m, '\n');
                }
                spin_unlock(&vn->busy.lock);
        }

        /*
         * As a final step, dump "unpurged" areas.
         */
        show_purge_info(m);
        return 0;
}

/*
 * Create the "vmallocinfo" proc entry (mode 0400), served by
 * vmalloc_info_show().  Always returns 0.
 */
static int __init proc_vmalloc_init(void)
{
        void *priv_data = NULL;

        /*
         * Scratch array of per-node page counters, handed to the show
         * callback as its private data.  kmalloc_array() is used so the
         * count * size multiplication is overflow-checked rather than
         * open-coded.  A failed allocation is tolerated: show_numa_info()
         * returns early when it finds no counter array.
         */
        if (IS_ENABLED(CONFIG_NUMA))
                priv_data = kmalloc_array(nr_node_ids, sizeof(unsigned int),
                                          GFP_KERNEL);

        proc_create_single_data("vmallocinfo",
                0400, NULL, vmalloc_info_show, priv_data);

        return 0;
}
module_init(proc_vmalloc_init);

#endif

/*
 * Populate the free vmap tree/list with every gap that is not covered by
 * an entry of vmlist, walking vmlist in list order:
 *
 *     B     F     B     B     B     F
 * -|-----|.....|-----|-----|-----|.....|-
 *  |           The KVA space           |
 *  |<--------------------------------->|
 *
 * The space considered runs from 1 to ULONG_MAX.
 */
static void __init vmap_init_free_space(void)
{
        const unsigned long vmap_end = ULONG_MAX;
        unsigned long gap_start = 1;
        unsigned long busy_start;
        struct vm_struct *busy = vmlist;
        struct vmap_area *free;

        while (busy) {
                busy_start = (unsigned long) busy->addr;

                /* Unsigned arithmetic: a non-zero gap means "not equal". */
                if (busy_start != gap_start) {
                        free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
                        if (!WARN_ON_ONCE(!free)) {
                                free->va_start = gap_start;
                                free->va_end = busy_start;

                                insert_vmap_area_augment(free, NULL,
                                        &free_vmap_area_root,
                                                &free_vmap_area_list);
                        }
                }

                /* The next candidate gap begins right after this busy area. */
                gap_start = busy_start + busy->size;
                busy = busy->next;
        }

        /* Whatever remains after the last busy area is free as well. */
        if (gap_start != vmap_end) {
                free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
                if (!WARN_ON_ONCE(!free)) {
                        free->va_start = gap_start;
                        free->va_end = vmap_end;

                        insert_vmap_area_augment(free, NULL,
                                &free_vmap_area_root,
                                        &free_vmap_area_list);
                }
        }
}

/*
 * Size the vmap node array (64-bit builds only) and initialise the
 * busy/lazy trees, lists, locks and per-size pools of every node.
 */
static void vmap_init_nodes(void)
{
        struct vmap_node *vn;
        int slot, n;

#if BITS_PER_LONG == 64
        /*
         * The number of nodes follows the number of possible CPUs but is
         * capped at 128, so the scale factor is 1 on systems with no more
         * cores than that cap.
         *
         * A note on NUMA awareness: on bigger systems, for example
         * multi-socket NUMA machines that can end up with thousands of
         * cores in total, a "sub-numa-clustering" should be added.
         *
         * In that case a NUMA domain is treated as a single entity with
         * dedicated sub-nodes, each describing one group or set of cores.
         * Per-domain purging and per-domain balancing are then supposed
         * to be added as well.
         */
        n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);

        if (n > 1) {
                vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
                if (!vn) {
                        /* Keep the current nr_vmap_nodes/vmap_nodes. */
                        pr_err("Failed to allocate an array. Disable a node layer\n");
                } else {
                        /* Node partition is 16 pages. */
                        vmap_zone_size = (1 << 4) * PAGE_SIZE;
                        nr_vmap_nodes = n;
                        vmap_nodes = vn;
                }
        }
#endif

        for (n = 0; n < nr_vmap_nodes; n++) {
                vn = &vmap_nodes[n];

                /* Busy areas. */
                spin_lock_init(&vn->busy.lock);
                INIT_LIST_HEAD(&vn->busy.head);
                vn->busy.root = RB_ROOT;

                /* Lazily freed areas. */
                spin_lock_init(&vn->lazy.lock);
                INIT_LIST_HEAD(&vn->lazy.head);
                vn->lazy.root = RB_ROOT;

                /* One initially empty pool per slot. */
                spin_lock_init(&vn->pool_lock);
                for (slot = 0; slot < MAX_VA_SIZE_PAGES; slot++) {
                        INIT_LIST_HEAD(&vn->pool[slot].head);
                        WRITE_ONCE(vn->pool[slot].len, 0);
                }
        }
}

/*
 * Shrinker "count" callback: sum the pool lengths of every node.
 * Returns that sum, or SHRINK_EMPTY when it is zero.
 */
static unsigned long
vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
        unsigned long total = 0;
        struct vmap_node *vn;
        int node, slot;

        for (node = 0; node < nr_vmap_nodes; node++) {
                vn = &vmap_nodes[node];

                /* Lengths are sampled with READ_ONCE(); no lock is taken. */
                for (slot = 0; slot < MAX_VA_SIZE_PAGES; slot++)
                        total += READ_ONCE(vn->pool[slot].len);
        }

        if (!total)
                return SHRINK_EMPTY;

        return total;
}

/*
 * Shrinker "scan" callback: run decay_va_pool_node() on every node,
 * passing true as its second argument, then report SHRINK_STOP.
 */
static unsigned long
vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
        int node = 0;

        while (node < nr_vmap_nodes) {
                decay_va_pool_node(&vmap_nodes[node], true);
                node++;
        }

        return SHRINK_STOP;
}

/*
 * Boot-time setup of the vmalloc subsystem: create the vmap_area cache,
 * initialise per-CPU state and the vmap nodes, import the areas already
 * recorded on vmlist, build the free space and register the "vmap-node"
 * shrinker.
 */
void __init vmalloc_init(void)
{
        struct shrinker *vmap_node_shrinker;
        struct vm_struct *early;
        struct vmap_area *va;
        struct vmap_node *vn;
        int cpu;

        /*
         * Create the cache for vmap_area objects.
         */
        vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);

        for_each_possible_cpu(cpu) {
                struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
                struct vfree_deferred *p = &per_cpu(vfree_deferred, cpu);

                /* Per-CPU vmap block queue. */
                spin_lock_init(&vbq->lock);
                INIT_LIST_HEAD(&vbq->free);
                xa_init(&vbq->vmap_blocks);

                /* Per-CPU deferred vfree list and its work item. */
                init_llist_head(&p->list);
                INIT_WORK(&p->wq, delayed_vfree_work);
        }

        /*
         * Setup nodes before importing vmlist.
         */
        vmap_init_nodes();

        /* Import existing vmlist entries. */
        for (early = vmlist; early; early = early->next) {
                va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
                if (WARN_ON_ONCE(!va))
                        continue;

                va->va_start = (unsigned long)early->addr;
                va->va_end = va->va_start + early->size;
                va->vm = early;

                vn = addr_to_node(va->va_start);
                insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
        }

        /*
         * Now we can initialize a free vmap space.
         */
        vmap_init_free_space();
        vmap_initialized = true;

        /* Initialisation is already complete if the shrinker is unavailable. */
        vmap_node_shrinker = shrinker_alloc(0, "vmap-node");
        if (vmap_node_shrinker) {
                vmap_node_shrinker->count_objects = vmap_node_shrink_count;
                vmap_node_shrinker->scan_objects = vmap_node_shrink_scan;
                shrinker_register(vmap_node_shrinker);
        } else {
                pr_err("Failed to allocate vmap-node shrinker!\n");
        }
}


































































































































































































    1 













































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2008 IBM Corporation
 *
 * Author: Mimi Zohar <zohar@us.ibm.com>
 *
 * File: ima_api.c
 *        Implements must_appraise_or_measure, collect_measurement,
 *        appraise_measurement, store_measurement and store_template.
 */
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/xattr.h>
#include <linux/evm.h>
#include <linux/fsverity.h>

#include "ima.h"

/*
 * ima_free_template_entry - free an existing template entry
 *
 * Frees the data buffer of every template field, then the digests array,
 * then the entry itself.
 */
void ima_free_template_entry(struct ima_template_entry *entry)
{
        int field = 0;

        while (field < entry->template_desc->num_fields) {
                kfree(entry->template_data[field].data);
                field++;
        }

        kfree(entry->digests);
        kfree(entry);
}

/*
 * ima_alloc_init_template - create and initialize a new template entry
 *
 * Uses @desc when given, otherwise the current template descriptor.
 * On success *@entry points to the new entry and 0 is returned.  On any
 * failure *@entry is set to NULL and the error is returned: -ENOMEM for
 * allocation failures, or the non-zero result of a field's field_init().
 */
int ima_alloc_init_template(struct ima_event_data *event_data,
                            struct ima_template_entry **entry,
                            struct ima_template_desc *desc)
{
        struct ima_template_desc *template_desc =
                desc ? desc : ima_template_desc_current();
        struct ima_template_entry *new_entry;
        int field, rc;

        /* Nothing is handed back to the caller until setup succeeds. */
        *entry = NULL;

        new_entry = kzalloc(struct_size(new_entry, template_data,
                                        template_desc->num_fields), GFP_NOFS);
        if (!new_entry)
                return -ENOMEM;

        new_entry->digests = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots,
                                     sizeof(*new_entry->digests), GFP_NOFS);
        if (!new_entry->digests) {
                kfree(new_entry);
                return -ENOMEM;
        }

        new_entry->template_desc = template_desc;

        for (field = 0; field < template_desc->num_fields; field++) {
                u32 len;

                rc = template_desc->fields[field]->field_init(event_data,
                                        &new_entry->template_data[field]);
                if (rc != 0) {
                        /* Releases any field data set up so far as well. */
                        ima_free_template_entry(new_entry);
                        return rc;
                }

                /* Each field accounts for its length word plus its data. */
                len = new_entry->template_data[field].len;
                new_entry->template_data_len += sizeof(len) + len;
        }

        *entry = new_entry;
        return 0;
}

/*
 * ima_store_template - store ima template measurements
 *
 * Calculate the hash of a template entry, add the template entry
 * to an ordered list of measurement entries maintained inside the kernel,
 * and also update the aggregate integrity value (maintained inside the
 * configured TPM PCR) over the hashes of the current list of measurement
 * entries.
 *
 * Applications retrieve the current kernel-held measurement list through
 * the securityfs entries in /sys/kernel/security/ima. The signed aggregate
 * TPM PCR (called quote) can be retrieved using a TPM user space library
 * and is used to validate the measurement list.
 *
 * Returns 0 on success, error code otherwise
 */
int ima_store_template(struct ima_template_entry *entry,
                       int violation, struct inode *inode,
                       const unsigned char *filename, int pcr)
{
        static const char op[] = "add_template_measure";
        static const char audit_cause[] = "hashing_error";
        char *template_name = entry->template_desc->name;
        int rc;

        /* Violation entries skip the template hash calculation. */
        rc = violation ? 0 : ima_calc_field_array_hash(entry->template_data,
                                                       entry);
        if (rc < 0) {
                integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode,
                                    template_name, op,
                                    audit_cause, rc, 0);
                return rc;
        }

        entry->pcr = pcr;
        return ima_add_template_entry(entry, violation, op, inode, filename);
}

/*
 * ima_add_violation - add violation to measurement list.
 *
 * Violations are flagged in the measurement list with zero hash values.
 * By extending the PCR with 0xFF's instead of with zeroes, the PCR
 * value is invalidated.
 *
 * The outcome is always reported through integrity_audit_msg(); a
 * failure to build the template entry is reported as -ENOMEM.
 */
void ima_add_violation(struct file *file, const unsigned char *filename,
                       struct ima_iint_cache *iint, const char *op,
                       const char *cause)
{
        struct ima_event_data event_data = { .iint = iint,
                                             .file = file,
                                             .filename = filename,
                                             .violation = cause };
        struct inode *inode = file_inode(file);
        struct ima_template_entry *entry;
        int rc;

        /* can overflow, only indicator */
        atomic_long_inc(&ima_htable.violations);

        if (ima_alloc_init_template(&event_data, &entry, NULL) < 0) {
                rc = -ENOMEM;
        } else {
                rc = ima_store_template(entry, 1, inode, filename,
                                        CONFIG_IMA_MEASURE_PCR_IDX);
                /* On failure the entry is still ours to free. */
                if (rc < 0)
                        ima_free_template_entry(entry);
        }

        integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename,
                            op, cause, rc, 0);
}

/**
 * ima_get_action - appraise & measure decision based on policy.
 * @idmap: idmap of the mount the inode was found from
 * @inode: pointer to the inode associated with the object being validated
 * @cred: pointer to credentials structure to validate
 * @secid: secid of the task being validated
 * @mask: contains the permission mask (MAY_READ, MAY_WRITE, MAY_EXEC,
 *        MAY_APPEND)
 * @func: caller identifier
 * @pcr: pointer filled in if matched measure policy sets pcr=
 * @template_desc: pointer filled in if matched measure policy sets template=
 * @func_data: func specific data, may be NULL
 * @allowed_algos: allowlist of hash algorithms for the IMA xattr
 *
 * The policy is defined in terms of keypairs:
 *                subj=, obj=, type=, func=, mask=, fsmagic=
 *        subj,obj, and type: are LSM specific.
 *        func: FILE_CHECK | BPRM_CHECK | CREDS_CHECK | MMAP_CHECK | MODULE_CHECK
 *        | KEXEC_CMDLINE | KEY_CHECK | CRITICAL_DATA | SETXATTR_CHECK
 *        | MMAP_CHECK_REQPROT
 *        mask: contains the permission mask
 *        fsmagic: hex value
 *
 * Returns IMA_MEASURE, IMA_APPRAISE mask.
 *
 */
int ima_get_action(struct mnt_idmap *idmap, struct inode *inode,
                   const struct cred *cred, u32 secid, int mask,
                   enum ima_hooks func, int *pcr,
                   struct ima_template_desc **template_desc,
                   const char *func_data, unsigned int *allowed_algos)
{
        /* Only actions enabled in the loaded policy are worth matching. */
        int flags = (IMA_MEASURE | IMA_AUDIT | IMA_APPRAISE | IMA_HASH) &
                    ima_policy_flag;

        return ima_match_policy(idmap, inode, cred, secid, func, mask,
                                flags, pcr, template_desc, func_data,
                                allowed_algos);
}

/*
 * Fetch the fs-verity digest of @inode into @hash.
 *
 * Returns false, leaving the @hash header untouched, when
 * fsverity_get_digest() reports a zero-length digest; otherwise records
 * the digest algorithm and length in the header and returns true.
 */
static bool ima_get_verity_digest(struct ima_iint_cache *iint,
                                  struct inode *inode,
                                  struct ima_max_digest_data *hash)
{
        enum hash_algo alg;
        int digest_len = fsverity_get_digest(inode, hash->digest, NULL, &alg);

        /*
         * On failure, 'measure' policy rules will result in a file data
         * hash containing 0's.
         */
        if (!digest_len)
                return false;

        /*
         * Unlike the case where the file hash is actually calculated, the
         * verity digest is returned for inclusion in the measurement list
         * whatever its hash algorithm is. A mismatch between the verity
         * algorithm and the xattr signature algorithm, if one exists, is
         * detected later.
         */
        hash->hdr.length = digest_len;
        hash->hdr.algo = alg;
        return true;
}

/*
 * ima_collect_measurement - collect file measurement
 *
 * Calculate the file hash, if it doesn't already exist,
 * storing the measurement and i_version in the iint.
 *
 * The digest comes from one of three sources: the fs-verity digest when
 * IMA_VERITY_REQUIRED is set in iint->flags, a hash of @buf/@size when
 * @buf is non-NULL, or a hash of the file contents otherwise.  When
 * @modsig is non-NULL it is collected from @buf/@size on every call,
 * even if the digest was already collected.
 *
 * Any non-zero result is reported through integrity_audit_msg() before
 * returning.
 *
 * Must be called with iint->mutex held.
 *
 * Return 0 on success, error code otherwise
 */
int ima_collect_measurement(struct ima_iint_cache *iint, struct file *file,
                            void *buf, loff_t size, enum hash_algo algo,
                            struct modsig *modsig)
{
        const char *audit_cause = "failed";
        struct inode *inode = file_inode(file);
        struct inode *real_inode = d_real_inode(file_dentry(file));
        struct ima_max_digest_data hash;
        struct ima_digest_data *hash_hdr = container_of(&hash.hdr,
                                                struct ima_digest_data, hdr);
        struct name_snapshot filename;
        struct kstat stat;
        int result = 0;
        int length;
        void *tmpbuf;
        u64 i_version = 0;

        /*
         * Always collect the modsig, because IMA might have already collected
         * the file digest without collecting the modsig in a previous
         * measurement rule.
         */
        if (modsig)
                ima_collect_modsig(modsig, buf, size);

        /* Digest already collected: result is still 0, so this returns 0. */
        if (iint->flags & IMA_COLLECTED)
                goto out;

        /*
         * Detecting file change is based on i_version. On filesystems
         * which do not support i_version, support was originally limited
         * to an initial measurement/appraisal/audit, but was modified to
         * assume the file changed.
         */
        result = vfs_getattr_nosec(&file->f_path, &stat, STATX_CHANGE_COOKIE,
                                   AT_STATX_SYNC_AS_STAT);
        /* i_version stays 0 unless the change cookie was actually returned. */
        if (!result && (stat.result_mask & STATX_CHANGE_COOKIE))
                i_version = stat.change_cookie;
        hash.hdr.algo = algo;
        hash.hdr.length = hash_digest_size[algo];

        /* Initialize hash digest to 0's in case of failure */
        memset(&hash.digest, 0, sizeof(hash.digest));

        /*
         * NOTE(review): when the verity digest is obtained successfully,
         * result is not reassigned here and still holds the return value
         * of vfs_getattr_nosec() above -- confirm this is intended.
         */
        if (iint->flags & IMA_VERITY_REQUIRED) {
                if (!ima_get_verity_digest(iint, inode, &hash)) {
                        audit_cause = "no-verity-digest";
                        result = -ENODATA;
                }
        } else if (buf) {
                result = ima_calc_buffer_hash(buf, size, hash_hdr);
        } else {
                result = ima_calc_file_hash(file, hash_hdr);
        }

        /*
         * -EBADF and -EINVAL do not abort: the hash buffer is still stored
         * in the iint below, but IMA_COLLECTED is not set and the error is
         * still returned and audited.
         */
        if (result && result != -EBADF && result != -EINVAL)
                goto out;

        length = sizeof(hash.hdr) + hash.hdr.length;
        /* Go through tmpbuf so iint->ima_hash is not lost if krealloc() fails. */
        tmpbuf = krealloc(iint->ima_hash, length, GFP_NOFS);
        if (!tmpbuf) {
                result = -ENOMEM;
                goto out;
        }

        iint->ima_hash = tmpbuf;
        memcpy(iint->ima_hash, &hash, length);
        /*
         * When the real inode differs from the file's inode, record its
         * attributes along with the version; otherwise only the version.
         */
        if (real_inode == inode)
                iint->real_inode.version = i_version;
        else
                integrity_inode_attrs_store(&iint->real_inode, i_version,
                                            real_inode);

        /* Possibly temporary failure due to type of read (eg. O_DIRECT) */
        if (!result)
                iint->flags |= IMA_COLLECTED;
out:
        if (result) {
                if (file->f_flags & O_DIRECT)
                        audit_cause = "failed(directio)";

                /* Audit against a snapshot of the dentry name. */
                take_dentry_name_snapshot(&filename, file->f_path.dentry);

                integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode,
                                    filename.name.name, "collect_data",
                                    audit_cause, result, 0);

                release_dentry_name_snapshot(&filename);
        }
        return result;
}

/*
 * ima_store_measurement - store file measurement
 *
 * Create an "ima" template and then store the template by calling
 * ima_store_template.
 *
 * We only get here if the inode has not already been measured,
 * but the measurement could already exist:
 *        - multiple copies of the same file on either the same or
 *          different filesystems.
 *        - the inode was previously flushed as well as the iint info,
 *          containing the hashing info.
 *
 * Must be called with iint->mutex held.
 */
void ima_store_measurement(struct ima_iint_cache *iint, struct file *file,
                           const unsigned char *filename,
                           struct evm_ima_xattr_data *xattr_value,
                           int xattr_len, const struct modsig *modsig, int pcr,
                           struct ima_template_desc *template_desc)
{
        static const char op[] = "add_template_measure";
        static const char audit_cause[] = "ENOMEM";
        struct ima_event_data event_data = { .iint = iint,
                                             .file = file,
                                             .filename = filename,
                                             .xattr_value = xattr_value,
                                             .xattr_len = xattr_len,
                                             .modsig = modsig };
        struct inode *inode = file_inode(file);
        struct ima_template_entry *entry;
        int rc;

        /*
         * With a MODSIG the measurement still has to be stored: its
         * contents are only available at appraisal time, even though an
         * earlier file measurement might already be in the list.
         */
        if ((iint->measured_pcrs & (0x1 << pcr)) && !modsig)
                return;

        rc = ima_alloc_init_template(&event_data, &entry, template_desc);
        if (rc < 0) {
                integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename,
                                    op, audit_cause, rc, 0);
                return;
        }

        rc = ima_store_template(entry, 0, inode, filename, pcr);

        /*
         * Success and -EEXIST both mark this PCR as measured, except for
         * O_DIRECT opens.
         */
        if (!(file->f_flags & O_DIRECT) && (rc == 0 || rc == -EEXIST)) {
                iint->flags |= IMA_MEASURED;
                iint->measured_pcrs |= (0x1 << pcr);
        }

        /* Any error, -EEXIST included, leaves the entry with us to free. */
        if (rc < 0)
                ima_free_template_entry(entry);
}

/*
 * ima_audit_measurement - emit an audit record carrying the file hash
 * @iint: integrity cache entry holding the hash and the IMA_AUDITED flag
 * @filename: name written to the record as an untrusted string
 *
 * Logs one AUDIT_INTEGRITY_RULE record of the form
 * file=<name> hash="<algo>:<hex digest>" followed by the task information.
 * IMA_AUDITED is set once the record has been emitted, and the function
 * returns immediately whenever that flag is already set.  Allocation or
 * audit-buffer failures are silent: nothing is logged and the flag stays clear.
 */
void ima_audit_measurement(struct ima_iint_cache *iint,
                           const unsigned char *filename)
{
        const char *algo_name = hash_algo_name[iint->ima_hash->algo];
        struct audit_buffer *ab;
        char *hex;
        int pos;

        if (iint->flags & IMA_AUDITED)
                return;

        /* Two hex characters per digest byte plus the terminating NUL. */
        hex = kzalloc((iint->ima_hash->length * 2) + 1, GFP_KERNEL);
        if (!hex)
                return;

        for (pos = 0; pos < iint->ima_hash->length; pos++)
                hex_byte_pack(hex + (pos * 2), iint->ima_hash->digest[pos]);
        hex[pos * 2] = '\0';

        ab = audit_log_start(audit_context(), GFP_KERNEL,
                             AUDIT_INTEGRITY_RULE);
        if (ab) {
                audit_log_format(ab, "file=");
                audit_log_untrustedstring(ab, filename);
                audit_log_format(ab, " hash=\"%s:%s\"", algo_name, hex);

                audit_log_task_info(ab);
                audit_log_end(ab);

                iint->flags |= IMA_AUDITED;
        }

        kfree(hex);
}

/*
 * ima_d_path - return a pointer to the full pathname
 *
 * Attempt to return a pointer to the full pathname for use in the
 * IMA measurement list, IMA audit records, and auditing logs.
 *
 * On failure, return a pointer to a copy of the filename (stored in
 * namebuf) rather than to the dentry's d_name: a pointer to d_name could
 * end up being used after that memory has been freed.
 */
const char *ima_d_path(const struct path *path, char **pathbuf, char *namebuf)
{
        struct name_snapshot filename;
        char *pathname = NULL;

        *pathbuf = __getname();
        if (*pathbuf) {
                pathname = d_absolute_path(path, *pathbuf, PATH_MAX);
                if (IS_ERR(pathname)) {
                        __putname(*pathbuf);
                        *pathbuf = NULL;
                        pathname = NULL;
                }
        }

        if (!pathname) {
                take_dentry_name_snapshot(&filename, path->dentry);
                strscpy(namebuf, filename.name.name, NAME_MAX);
                release_dentry_name_snapshot(&filename);

                pathname = namebuf;
        }

        return pathname;
}




























































































































































































































































































































































































































































































































































































    1 


























































































































































    1 



    1 













    1 


    1 






    1 



    1 


















































































    1 











    1 










































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Symmetric key cipher operations.
 *
 * Generic encrypt/decrypt wrapper for ciphers, handles operations across
 * multiple page boundaries by using temporary blocks.  In user context,
 * the kernel is given a chance to schedule us once per page.
 *
 * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/internal/aead.h>
#include <crypto/internal/cipher.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <linux/bug.h>
#include <linux/cryptouser.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <net/netlink.h>
#include "skcipher.h"

#define CRYPTO_ALG_TYPE_SKCIPHER_MASK        0x0000000e

/* Bits kept in skcipher_walk::flags describing the walk and its current step. */
enum {
        /* Set by skcipher_walk_async(); cleared by the virt and AEAD walk setup. */
        SKCIPHER_WALK_PHYS = 1 << 0,
        /* Current step uses the bounce block set up by skcipher_next_slow(). */
        SKCIPHER_WALK_SLOW = 1 << 1,
        /* Current step works on a copy of the input placed in walk->page. */
        SKCIPHER_WALK_COPY = 1 << 2,
        /* Source and destination were mapped separately for this step. */
        SKCIPHER_WALK_DIFF = 1 << 3,
        /* Allocate with GFP_KERNEL rather than GFP_ATOMIC; yield between steps. */
        SKCIPHER_WALK_SLEEP = 1 << 4,
};

/*
 * One deferred output write.  Entries are appended to skcipher_walk::buffers
 * by skcipher_queue_write() and flushed (or discarded on error) by
 * skcipher_walk_complete().
 */
struct skcipher_walk_buffer {
        struct list_head entry; /* link in walk->buffers */
        struct scatter_walk dst; /* copy of walk->out taken when queued */
        unsigned int len; /* number of bytes to copy to dst */
        u8 *data; /* bytes to copy; NULL means they live in buffer[] */
        u8 buffer[]; /* trailing storage used by skcipher_next_slow() */
};

static const struct crypto_type crypto_skcipher_type;

static int skcipher_walk_next(struct skcipher_walk *walk);

/* Map the current input position and record the address in walk->src. */
static inline void skcipher_map_src(struct skcipher_walk *walk)
{
        void *vaddr = scatterwalk_map(&walk->in);

        walk->src.virt.addr = vaddr;
}

/* Map the current output position and record the address in walk->dst. */
static inline void skcipher_map_dst(struct skcipher_walk *walk)
{
        void *vaddr = scatterwalk_map(&walk->out);

        walk->dst.virt.addr = vaddr;
}

/* Release the mapping whose address is stored in walk->src.virt.addr. */
static inline void skcipher_unmap_src(struct skcipher_walk *walk)
{
        void *vaddr = walk->src.virt.addr;

        scatterwalk_unmap(vaddr);
}

/* Release the mapping whose address is stored in walk->dst.virt.addr. */
static inline void skcipher_unmap_dst(struct skcipher_walk *walk)
{
        void *vaddr = walk->dst.virt.addr;

        scatterwalk_unmap(vaddr);
}

/* Allocation flags for this walk: GFP_KERNEL if it may sleep, else GFP_ATOMIC. */
static inline gfp_t skcipher_walk_gfp(struct skcipher_walk *walk)
{
        if (walk->flags & SKCIPHER_WALK_SLEEP)
                return GFP_KERNEL;

        return GFP_ATOMIC;
}

/*
 * Pick an address for @len bytes that does not cross a page boundary.
 *
 * If [start, start + len) stays within one page, @start itself is returned;
 * otherwise the result is the beginning of the page holding the last byte.
 * The caller must have reserved enough room behind @start for the shifted
 * placement.
 */
static inline u8 *skcipher_get_spot(u8 *start, unsigned int len)
{
        unsigned long last_byte = (unsigned long)(start + len - 1);
        u8 *last_page = (u8 *)(last_byte & PAGE_MASK);

        return last_page > start ? last_page : start;
}

/* Convert a generic crypto_alg into the skcipher_alg that embeds it as .base. */
static inline struct skcipher_alg *__crypto_skcipher_alg(
        struct crypto_alg *alg)
{
        struct skcipher_alg *skalg;

        skalg = container_of(alg, struct skcipher_alg, base);
        return skalg;
}

/*
 * Finish a slow-path step: locate the bounce block inside walk->buffer (same
 * alignment and page-straddle adjustment as when it was set up) and hand
 * @bsize bytes to scatterwalk_copychunks() on the output cursor.
 *
 * The copy mode is 2 for SKCIPHER_WALK_PHYS walks and 1 otherwise.
 * NOTE(review): the meaning of the mode values is defined by the scatterwalk
 * helpers, not here - confirm before relying on it.
 *
 * Always returns 0; skcipher_walk_done() stores that as its advance count.
 */
static int skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize)
{
        int mode = (walk->flags & SKCIPHER_WALK_PHYS) ? 2 : 1;
        u8 *spot;

        spot = (u8 *)ALIGN((unsigned long)walk->buffer, walk->alignmask + 1);
        spot = skcipher_get_spot(spot, bsize);

        scatterwalk_copychunks(spot, &walk->out, bsize, mode);

        return 0;
}

/*
 * skcipher_walk_done - finish the current step of a walk and start the next
 * @walk: walk state
 * @err: negative errno, or the number of bytes of this step left unprocessed
 *
 * Releases whatever the current step mapped or bounced, advances both
 * scatterlist cursors and, if data remains, sets up the next step through
 * skcipher_walk_next().  When the walk ends (no data left, or an error),
 * the IV is copied back if it had been relocated and the walk's buffer and
 * page are freed.  SKCIPHER_WALK_PHYS walks skip that final cleanup.
 *
 * Returns a negative errno, 0, or the result of skcipher_walk_next().
 */
int skcipher_walk_done(struct skcipher_walk *walk, int err)
{
        unsigned int n = walk->nbytes;
        unsigned int nbytes = 0;

        /* No step in flight: only the final cleanup remains. */
        if (!n)
                goto finish;

        if (likely(err >= 0)) {
                /* n: bytes consumed by this step; nbytes: total still to do. */
                n -= err;
                nbytes = walk->total - n;
        }

        /* With none of these flags set, only the source mapping is released. */
        if (likely(!(walk->flags & (SKCIPHER_WALK_PHYS |
                                    SKCIPHER_WALK_SLOW |
                                    SKCIPHER_WALK_COPY |
                                    SKCIPHER_WALK_DIFF)))) {
unmap_src:
                skcipher_unmap_src(walk);
        } else if (walk->flags & SKCIPHER_WALK_DIFF) {
                /* Separate mappings: drop dst here, then src via the label. */
                skcipher_unmap_dst(walk);
                goto unmap_src;
        } else if (walk->flags & SKCIPHER_WALK_COPY) {
                /* Write the n bytes held in the bounce page to the output. */
                skcipher_map_dst(walk);
                memcpy(walk->dst.virt.addr, walk->page, n);
                skcipher_unmap_dst(walk);
        } else if (unlikely(walk->flags & SKCIPHER_WALK_SLOW)) {
                /*
                 * skcipher_done_slow() returns 0, so in the else branch n
                 * becomes 0 and the scatterwalk_advance() calls below move
                 * nothing.
                 */
                if (err > 0) {
                        /*
                         * Didn't process all bytes.  Either the algorithm is
                         * broken, or this was the last step and it turned out
                         * the message wasn't evenly divisible into blocks but
                         * the algorithm requires it.
                         */
                        err = -EINVAL;
                        nbytes = 0;
                } else
                        n = skcipher_done_slow(walk, n);
        }

        /* A positive leftover count is not an error for the caller. */
        if (err > 0)
                err = 0;

        walk->total = nbytes;
        walk->nbytes = 0;

        scatterwalk_advance(&walk->in, n);
        scatterwalk_advance(&walk->out, n);
        scatterwalk_done(&walk->in, 0, nbytes);
        scatterwalk_done(&walk->out, 1, nbytes);

        if (nbytes) {
                /* More data: optionally yield, then set up the next step. */
                crypto_yield(walk->flags & SKCIPHER_WALK_SLEEP ?
                             CRYPTO_TFM_REQ_MAY_SLEEP : 0);
                return skcipher_walk_next(walk);
        }

finish:
        /* Short-circuit for the common/fast path. */
        if (!((unsigned long)walk->buffer | (unsigned long)walk->page))
                goto out;

        /* PHYS walks leave the cleanup below to another path. */
        if (walk->flags & SKCIPHER_WALK_PHYS)
                goto out;

        /* The IV was relocated by skcipher_copy_iv(): hand it back. */
        if (walk->iv != walk->oiv)
                memcpy(walk->oiv, walk->iv, walk->ivsize);
        /* walk->buffer may alias walk->page; then only free_page() applies. */
        if (walk->buffer != walk->page)
                kfree(walk->buffer);
        if (walk->page)
                free_page((unsigned long)walk->page);

out:
        return err;
}
EXPORT_SYMBOL_GPL(skcipher_walk_done);

/*
 * skcipher_walk_complete - flush or discard the writes queued by a walk
 * @walk: walk whose ->buffers list holds the queued writes
 * @err: 0 to copy the queued data to its destination, non-zero to discard it
 *
 * Every queued entry is unlinked and freed.  When @err is 0 its data is first
 * copied to the output position recorded when it was queued.
 *
 * skcipher_next_copy() passes ownership of a bounce page to the queue entry
 * that uses up the page (it clears walk->page at that point).  Such pages are
 * released here regardless of @err; previously the error path skipped the
 * free_page() call and leaked them.
 *
 * Finally the IV is copied back (success only) and the walk's own buffer and
 * page are freed.
 */
void skcipher_walk_complete(struct skcipher_walk *walk, int err)
{
        struct skcipher_walk_buffer *p, *tmp;

        list_for_each_entry_safe(p, tmp, &walk->buffers, entry) {
                if (!err) {
                        u8 *data = p->data;

                        /* No external data: the bytes sit in p->buffer. */
                        if (!data) {
                                data = PTR_ALIGN(&p->buffer[0],
                                                 walk->alignmask + 1);
                                data = skcipher_get_spot(data, walk->stride);
                        }

                        scatterwalk_copychunks(data, &p->dst, p->len, 1);
                }

                /*
                 * Same test skcipher_next_copy() used when it gave the page
                 * away, so it is true exactly for the entry owning the page.
                 * Must run on the error path as well.
                 */
                if (offset_in_page(p->data) + p->len + walk->stride >
                    PAGE_SIZE)
                        free_page((unsigned long)p->data);

                list_del(&p->entry);
                kfree(p);
        }

        if (!err && walk->iv != walk->oiv)
                memcpy(walk->oiv, walk->iv, walk->ivsize);
        /* walk->buffer may alias walk->page; then only free_page() applies. */
        if (walk->buffer != walk->page)
                kfree(walk->buffer);
        if (walk->page)
                free_page((unsigned long)walk->page);
}
EXPORT_SYMBOL_GPL(skcipher_walk_complete);

/*
 * Record the current output position in @p and append @p to the walk's list
 * of pending writes.
 */
static void skcipher_queue_write(struct skcipher_walk *walk,
                                 struct skcipher_walk_buffer *p)
{
        struct list_head *pending = &walk->buffers;

        p->dst = walk->out;
        list_add_tail(&p->entry, pending);
}

/*
 * Set up one step of @bsize bytes on a linear bounce block.
 *
 * For non-PHYS walks the block lives in walk->buffer, which is reused if it
 * exists (falling back to walk->page) and allocated otherwise.  PHYS walks
 * always allocate a fresh skcipher_walk_buffer, queue it as a pending write
 * and use its trailing buffer[].
 *
 * @bsize bytes of input are copied into the block, walk->src and walk->dst
 * both point at it, and SKCIPHER_WALK_SLOW is set.
 *
 * Returns 0, or the result of skcipher_walk_done(walk, -ENOMEM) when the
 * allocation fails.
 */
static int skcipher_next_slow(struct skcipher_walk *walk, unsigned int bsize)
{
        bool phys = walk->flags & SKCIPHER_WALK_PHYS;
        unsigned alignmask = walk->alignmask;
        struct skcipher_walk_buffer *p;
        unsigned a;
        unsigned n;
        u8 *buffer;
        void *v;

        if (!phys) {
                /* Reuse an existing buffer, or borrow the bounce page. */
                if (!walk->buffer)
                        walk->buffer = walk->page;
                buffer = walk->buffer;
                if (buffer)
                        goto ok;
        }

        /* Start with the minimum alignment of kmalloc. */
        a = crypto_tfm_ctx_alignment() - 1;
        n = bsize;

        if (phys) {
                /* Calculate the minimum alignment of p->buffer. */
                a &= (sizeof(*p) ^ (sizeof(*p) - 1)) >> 1;
                /* Room for the queue entry header in front of the data. */
                n += sizeof(*p);
        }

        /* Minimum size to align p->buffer by alignmask. */
        n += alignmask & ~a;

        /* Minimum size to ensure p->buffer does not straddle a page. */
        n += (bsize - 1) & ~(alignmask | a);

        v = kzalloc(n, skcipher_walk_gfp(walk));
        if (!v)
                return skcipher_walk_done(walk, -ENOMEM);

        if (phys) {
                /* kzalloc() left p->data NULL, marking buffer[] as the source. */
                p = v;
                p->len = bsize;
                skcipher_queue_write(walk, p);
                buffer = p->buffer;
        } else {
                walk->buffer = v;
                buffer = v;
        }

ok:
        /* Align, then shift to a page start if the block would straddle. */
        walk->dst.virt.addr = PTR_ALIGN(buffer, alignmask + 1);
        walk->dst.virt.addr = skcipher_get_spot(walk->dst.virt.addr, bsize);
        walk->src.virt.addr = walk->dst.virt.addr;

        /* Pull bsize bytes of input into the bounce block. */
        scatterwalk_copychunks(walk->src.virt.addr, &walk->in, bsize, 0);

        walk->nbytes = bsize;
        walk->flags |= SKCIPHER_WALK_SLOW;

        return 0;
}

/*
 * Set up a step that works on a copy of the input held in walk->page.
 *
 * walk->nbytes bytes are copied from the mapped source into the bounce page,
 * and walk->src / walk->dst are both pointed at that copy.  For non-PHYS
 * walks that is all.
 *
 * For SKCIPHER_WALK_PHYS walks a queue entry describing the region is added
 * to the pending writes, and walk->page is either advanced past the region or
 * set to NULL when the page cannot hold another stride.
 *
 * Returns 0 on success or -ENOMEM if the queue entry cannot be allocated.
 */
static int skcipher_next_copy(struct skcipher_walk *walk)
{
        struct skcipher_walk_buffer *pending;
        u8 *bounce = walk->page;
        unsigned long used;

        skcipher_map_src(walk);
        memcpy(bounce, walk->src.virt.addr, walk->nbytes);
        skcipher_unmap_src(walk);

        walk->src.virt.addr = bounce;
        walk->dst.virt.addr = bounce;

        if (!(walk->flags & SKCIPHER_WALK_PHYS))
                return 0;

        pending = kmalloc(sizeof(*pending), skcipher_walk_gfp(walk));
        if (!pending)
                return -ENOMEM;

        pending->data = walk->page;
        pending->len = walk->nbytes;
        skcipher_queue_write(walk, pending);

        /* Is there room for one more stride after this region? */
        used = offset_in_page(walk->page) + walk->nbytes + walk->stride;
        if (used > PAGE_SIZE)
                walk->page = NULL;
        else
                walk->page += walk->nbytes;

        return 0;
}

static int skcipher_next_fast(struct skcipher_walk *walk)
{
        unsigned long diff;

        walk->src.phys.page = scatterwalk_page(&walk->in);
        walk->src.phys.offset = offset_in_page(walk->in.offset);
        walk->dst.phys.page = scatterwalk_page(&walk->out);
        walk->dst.phys.offset = offset_in_page(walk->out.offset);

        if (walk->flags & SKCIPHER_WALK_PHYS)
                return 0;

        diff = walk->src.phys.offset - walk->dst.phys.offset;
        diff |= walk->src.virt.page - walk->dst.virt.page;

        skcipher_map_src(walk);
        walk->dst.virt.addr = walk->src.virt.addr;

        if (diff) {
                walk->flags |= SKCIPHER_WALK_DIFF;
                skcipher_map_dst(walk);
        }

        return 0;
}

/*
 * Choose and set up the next step of the walk.
 *
 * Three strategies, tried in this order:
 *  - slow: fewer than bsize bytes are available after clamping to both
 *    cursors, so a linear bounce block is used (skcipher_next_slow());
 *  - copy: either cursor offset violates the alignment mask, so the input is
 *    copied into a bounce page (skcipher_next_copy()); if that page cannot be
 *    allocated the slow strategy is used instead;
 *  - fast: operate on the scatterlist pages directly (skcipher_next_fast()).
 *
 * Returns 0 or a negative errno.  A total shorter than one block ends the
 * walk through skcipher_walk_done(walk, -EINVAL).
 */
static int skcipher_walk_next(struct skcipher_walk *walk)
{
        unsigned int bsize;
        unsigned int n;
        int err;

        /* Per-step flags are recomputed from scratch. */
        walk->flags &= ~(SKCIPHER_WALK_SLOW | SKCIPHER_WALK_COPY |
                         SKCIPHER_WALK_DIFF);

        n = walk->total;
        /* Step size wanted: one stride, but never less than one block. */
        bsize = min(walk->stride, max(n, walk->blocksize));
        /* NOTE(review): presumably limits n to what each cursor can supply
         * contiguously - confirm in the scatterwalk helpers. */
        n = scatterwalk_clamp(&walk->in, n);
        n = scatterwalk_clamp(&walk->out, n);

        if (unlikely(n < bsize)) {
                if (unlikely(walk->total < walk->blocksize))
                        return skcipher_walk_done(walk, -EINVAL);

slow_path:
                err = skcipher_next_slow(walk, bsize);
                goto set_phys_lowmem;
        }

        if (unlikely((walk->in.offset | walk->out.offset) & walk->alignmask)) {
                if (!walk->page) {
                        gfp_t gfp = skcipher_walk_gfp(walk);

                        walk->page = (void *)__get_free_page(gfp);
                        /* No bounce page available: fall back to slow path. */
                        if (!walk->page)
                                goto slow_path;
                }

                /* Limit the step to what is left of the bounce page. */
                walk->nbytes = min_t(unsigned, n,
                                     PAGE_SIZE - offset_in_page(walk->page));
                walk->flags |= SKCIPHER_WALK_COPY;
                err = skcipher_next_copy(walk);
                goto set_phys_lowmem;
        }

        walk->nbytes = n;

        return skcipher_next_fast(walk);

set_phys_lowmem:
        /*
         * The slow and copy strategies produce virtual addresses; for PHYS
         * walks convert them into a page pointer plus in-page offset.
         */
        if (!err && (walk->flags & SKCIPHER_WALK_PHYS)) {
                walk->src.phys.page = virt_to_page(walk->src.virt.addr);
                walk->dst.phys.page = virt_to_page(walk->dst.virt.addr);
                walk->src.phys.offset &= PAGE_SIZE - 1;
                walk->dst.phys.offset &= PAGE_SIZE - 1;
        }
        return err;
}

/*
 * Move the IV into a freshly allocated walk->buffer.
 *
 * The allocation covers alignment slack plus the IV.  For non-PHYS walks it
 * also covers one aligned stride in front of the IV and the slack needed to
 * keep that stride from straddling a page.  walk->iv is redirected to the
 * copy; walk->oiv is left untouched.
 *
 * Returns 0 on success or -ENOMEM.
 */
static int skcipher_copy_iv(struct skcipher_walk *walk)
{
        const unsigned kmalloc_mask = crypto_tfm_ctx_alignment() - 1;
        const unsigned alignmask = walk->alignmask;
        const unsigned stride = walk->stride;
        const unsigned aligned_stride = ALIGN(stride, alignmask + 1);
        unsigned size;
        u8 *iv;

        /* Minimum size to align buffer by alignmask, plus the IV itself. */
        size = alignmask & ~kmalloc_mask;
        size += walk->ivsize;

        if (!(walk->flags & SKCIPHER_WALK_PHYS)) {
                size += aligned_stride;

                /* Minimum size to ensure buffer does not straddle a page. */
                size += (stride - 1) & ~(alignmask | kmalloc_mask);
        }

        walk->buffer = kmalloc(size, skcipher_walk_gfp(walk));
        if (!walk->buffer)
                return -ENOMEM;

        /* The IV sits right behind the (possibly shifted) aligned stride. */
        iv = PTR_ALIGN(walk->buffer, alignmask + 1);
        iv = skcipher_get_spot(iv, stride) + aligned_stride;

        walk->iv = memcpy(iv, walk->iv, walk->ivsize);
        return 0;
}

/*
 * Initialise the walk's buffer and page pointers and set up the first step.
 *
 * Refuses to run in hard-IRQ context (-EDEADLK, with a one-time warning).
 * An IV that violates the alignment mask is first relocated by
 * skcipher_copy_iv(), whose error is returned as is.
 */
static int skcipher_walk_first(struct skcipher_walk *walk)
{
        int err;

        if (WARN_ON_ONCE(in_hardirq()))
                return -EDEADLK;

        walk->buffer = NULL;

        if (unlikely((unsigned long)walk->iv & walk->alignmask)) {
                err = skcipher_copy_iv(walk);
                if (err)
                        return err;
        }

        walk->page = NULL;

        return skcipher_walk_next(walk);
}

/*
 * Common walk setup for an skcipher request.
 *
 * Length and IV are taken from @req.  An empty request returns 0 right away,
 * before any further walk state is initialised.  Otherwise both scatterlist
 * cursors are started, SKCIPHER_WALK_SLEEP mirrors CRYPTO_TFM_REQ_MAY_SLEEP,
 * the geometry is read from the transform, and the first step is set up.
 *
 * The stride is the algorithm's walksize when its type is
 * crypto_skcipher_type, and its common chunksize otherwise.
 */
static int skcipher_walk_skcipher(struct skcipher_walk *walk,
                                  struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct skcipher_alg *alg = crypto_skcipher_alg(tfm);

        walk->total = req->cryptlen;
        walk->nbytes = 0;
        walk->oiv = req->iv;
        walk->iv = req->iv;

        if (unlikely(!walk->total))
                return 0;

        scatterwalk_start(&walk->in, req->src);
        scatterwalk_start(&walk->out, req->dst);

        if (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP)
                walk->flags |= SKCIPHER_WALK_SLEEP;
        else
                walk->flags &= ~SKCIPHER_WALK_SLEEP;

        walk->blocksize = crypto_skcipher_blocksize(tfm);
        walk->ivsize = crypto_skcipher_ivsize(tfm);
        walk->alignmask = crypto_skcipher_alignmask(tfm);

        if (alg->co.base.cra_type == &crypto_skcipher_type)
                walk->stride = alg->walksize;
        else
                walk->stride = alg->co.chunksize;

        return skcipher_walk_first(walk);
}

/*
 * skcipher_walk_virt - begin a virtually-mapped walk over an skcipher request
 * @walk: walk state to initialise
 * @req: request supplying scatterlists, length and IV
 * @atomic: if true, SKCIPHER_WALK_SLEEP is cleared once the first step is set
 *          up, whatever the request flags say
 *
 * Returns the result of the common walk setup.
 */
int skcipher_walk_virt(struct skcipher_walk *walk,
                       struct skcipher_request *req, bool atomic)
{
        int err;

        might_sleep_if(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP);

        walk->flags &= ~SKCIPHER_WALK_PHYS;

        err = skcipher_walk_skcipher(walk, req);

        if (atomic)
                walk->flags &= ~SKCIPHER_WALK_SLEEP;

        return err;
}
EXPORT_SYMBOL_GPL(skcipher_walk_virt);

/*
 * skcipher_walk_async - begin a SKCIPHER_WALK_PHYS walk over an skcipher request
 * @walk: walk state to initialise
 * @req: request supplying scatterlists, length and IV
 *
 * Prepares the empty list of pending writes, marks the walk as PHYS and runs
 * the common setup, whose result is returned.
 */
int skcipher_walk_async(struct skcipher_walk *walk,
                        struct skcipher_request *req)
{
        INIT_LIST_HEAD(&walk->buffers);

        walk->flags |= SKCIPHER_WALK_PHYS;

        return skcipher_walk_skcipher(walk, req);
}
EXPORT_SYMBOL_GPL(skcipher_walk_async);

/*
 * Common walk setup for AEAD requests; the caller has already stored the
 * payload length in walk->total.
 *
 * An empty payload returns 0 right away.  Otherwise both cursors are started
 * and req->assoclen bytes are passed over on each of them before the walk
 * geometry is read from the AEAD transform and the first step is set up.
 * NOTE(review): the assoclen pass uses copy mode 2 with a NULL buffer, which
 * looks like "advance without copying" - confirm in the scatterwalk helpers.
 *
 * With @atomic set, SKCIPHER_WALK_SLEEP is cleared after the first step.
 */
static int skcipher_walk_aead_common(struct skcipher_walk *walk,
                                     struct aead_request *req, bool atomic)
{
        struct crypto_aead *tfm = crypto_aead_reqtfm(req);
        int err;

        walk->nbytes = 0;
        walk->oiv = req->iv;
        walk->iv = req->iv;

        if (unlikely(!walk->total))
                return 0;

        walk->flags &= ~SKCIPHER_WALK_PHYS;

        scatterwalk_start(&walk->in, req->src);
        scatterwalk_start(&walk->out, req->dst);

        scatterwalk_copychunks(NULL, &walk->in, req->assoclen, 2);
        scatterwalk_copychunks(NULL, &walk->out, req->assoclen, 2);

        scatterwalk_done(&walk->in, 0, walk->total);
        scatterwalk_done(&walk->out, 0, walk->total);

        walk->flags &= ~SKCIPHER_WALK_SLEEP;
        if (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP)
                walk->flags |= SKCIPHER_WALK_SLEEP;

        walk->blocksize = crypto_aead_blocksize(tfm);
        walk->stride = crypto_aead_chunksize(tfm);
        walk->ivsize = crypto_aead_ivsize(tfm);
        walk->alignmask = crypto_aead_alignmask(tfm);

        err = skcipher_walk_first(walk);

        if (atomic)
                walk->flags &= ~SKCIPHER_WALK_SLEEP;

        return err;
}

/*
 * skcipher_walk_aead_encrypt - begin a walk over an AEAD encryption request
 * @walk: walk state to initialise
 * @req: AEAD request
 * @atomic: forwarded to the common AEAD setup
 *
 * The walk covers req->cryptlen bytes.
 */
int skcipher_walk_aead_encrypt(struct skcipher_walk *walk,
                               struct aead_request *req, bool atomic)
{
        unsigned int payload = req->cryptlen;

        walk->total = payload;

        return skcipher_walk_aead_common(walk, req, atomic);
}
EXPORT_SYMBOL_GPL(skcipher_walk_aead_encrypt);

/*
 * skcipher_walk_aead_decrypt - begin a walk over an AEAD decryption request
 * @walk: walk state to initialise
 * @req: AEAD request
 * @atomic: forwarded to the common AEAD setup
 *
 * The walk covers req->cryptlen minus the transform's authentication size.
 */
int skcipher_walk_aead_decrypt(struct skcipher_walk *walk,
                               struct aead_request *req, bool atomic)
{
        struct crypto_aead *tfm = crypto_aead_reqtfm(req);
        unsigned int authsize = crypto_aead_authsize(tfm);

        walk->total = req->cryptlen - authsize;

        return skcipher_walk_aead_common(walk, req, atomic);
}
EXPORT_SYMBOL_GPL(skcipher_walk_aead_decrypt);

/* Flag the transform as needing a key, unless its maximum key size is 0. */
static void skcipher_set_needkey(struct crypto_skcipher *tfm)
{
        if (crypto_skcipher_max_keysize(tfm) == 0)
                return;

        crypto_skcipher_set_flags(tfm, CRYPTO_TFM_NEED_KEY);
}

static int skcipher_setkey_unaligned(struct crypto_skcipher *tfm,
                                     const u8 *key, unsigned int keylen)
{
        unsigned long alignmask = crypto_skcipher_alignmask(tfm);
        struct skcipher_alg *cipher = crypto_skcipher_alg(tfm);
        u8 *buffer, *alignbuffer;
        unsigned long absize;
        int ret;

        absize = keylen + alignmask;
        buffer = kmalloc(absize, GFP_ATOMIC);
        if (!buffer)
                return -ENOMEM;

        alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
        memcpy(alignbuffer, key, keylen);
        ret = cipher->setkey(tfm, alignbuffer, keylen);
        kfree_sensitive(buffer);
        return ret;
}

/*
 * crypto_skcipher_setkey - install a key on an skcipher transform
 * @tfm: transform
 * @key: key material
 * @keylen: key length in bytes
 *
 * If the algorithm's type is not crypto_skcipher_type, the request flags are
 * mirrored onto the lskcipher stored in the transform context and the key is
 * set there.  Otherwise the length is checked against the algorithm's
 * min/max key size (-EINVAL, leaving the transform flags untouched) and
 * ->setkey() is called, through an aligned copy when @key violates the
 * alignment mask.
 *
 * On failure the need-key state is re-evaluated and the error returned; on
 * success CRYPTO_TFM_NEED_KEY is cleared and 0 returned.
 */
int crypto_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
                           unsigned int keylen)
{
        struct skcipher_alg *cipher = crypto_skcipher_alg(tfm);
        unsigned long alignmask = crypto_skcipher_alignmask(tfm);
        int err;

        if (cipher->co.base.cra_type != &crypto_skcipher_type) {
                struct crypto_lskcipher **ctx = crypto_skcipher_ctx(tfm);

                crypto_lskcipher_clear_flags(*ctx, CRYPTO_TFM_REQ_MASK);
                crypto_lskcipher_set_flags(*ctx,
                                           crypto_skcipher_get_flags(tfm) &
                                           CRYPTO_TFM_REQ_MASK);
                err = crypto_lskcipher_setkey(*ctx, key, keylen);
        } else {
                if (keylen < cipher->min_keysize ||
                    keylen > cipher->max_keysize)
                        return -EINVAL;

                if ((unsigned long)key & alignmask)
                        err = skcipher_setkey_unaligned(tfm, key, keylen);
                else
                        err = cipher->setkey(tfm, key, keylen);
        }

        if (unlikely(err)) {
                skcipher_set_needkey(tfm);
                return err;
        }

        crypto_skcipher_clear_flags(tfm, CRYPTO_TFM_NEED_KEY);
        return 0;
}
EXPORT_SYMBOL_GPL(crypto_skcipher_setkey);

/*
 * crypto_skcipher_encrypt - encrypt according to @req
 *
 * Returns -ENOKEY while the transform still needs a key.  Otherwise the
 * request goes to the algorithm's ->encrypt(), or to
 * crypto_lskcipher_encrypt_sg() when the algorithm's type is not
 * crypto_skcipher_type.
 */
int crypto_skcipher_encrypt(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct skcipher_alg *alg;

        if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        alg = crypto_skcipher_alg(tfm);
        if (alg->co.base.cra_type == &crypto_skcipher_type)
                return alg->encrypt(req);

        return crypto_lskcipher_encrypt_sg(req);
}
EXPORT_SYMBOL_GPL(crypto_skcipher_encrypt);

/*
 * crypto_skcipher_decrypt - decrypt according to @req
 *
 * Returns -ENOKEY while the transform still needs a key.  Otherwise the
 * request goes to the algorithm's ->decrypt(), or to
 * crypto_lskcipher_decrypt_sg() when the algorithm's type is not
 * crypto_skcipher_type.
 */
int crypto_skcipher_decrypt(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct skcipher_alg *alg;

        if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        alg = crypto_skcipher_alg(tfm);
        if (alg->co.base.cra_type == &crypto_skcipher_type)
                return alg->decrypt(req);

        return crypto_lskcipher_decrypt_sg(req);
}
EXPORT_SYMBOL_GPL(crypto_skcipher_decrypt);

/*
 * Copy the state stored in the request context to @out.
 *
 * The context is aligned to the transform's alignment mask; the state sits
 * ivsize bytes into the aligned area and is statesize bytes long.
 * Always returns 0.
 */
static int crypto_lskcipher_export(struct skcipher_request *req, void *out)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        u8 *ctx = skcipher_request_ctx(req);
        const u8 *state;

        ctx = PTR_ALIGN(ctx, crypto_skcipher_alignmask(tfm) + 1);
        state = ctx + crypto_skcipher_ivsize(tfm);

        memcpy(out, state, crypto_skcipher_statesize(tfm));

        return 0;
}

/*
 * Load the state in the request context from @in.
 *
 * The context is aligned to the transform's alignment mask; the state sits
 * ivsize bytes into the aligned area and is statesize bytes long.
 * Always returns 0.
 */
static int crypto_lskcipher_import(struct skcipher_request *req, const void *in)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        u8 *ctx = skcipher_request_ctx(req);
        u8 *state;

        ctx = PTR_ALIGN(ctx, crypto_skcipher_alignmask(tfm) + 1);
        state = ctx + crypto_skcipher_ivsize(tfm);

        memcpy(state, in, crypto_skcipher_statesize(tfm));

        return 0;
}

/* Export stub: writes nothing to @out, ignores @req and reports success. */
static int skcipher_noexport(struct skcipher_request *req, void *out)
{
        return 0;
}

/* Import stub: reads nothing from @in, ignores @req and reports success. */
static int skcipher_noimport(struct skcipher_request *req, const void *in)
{
        return 0;
}

/*
 * crypto_skcipher_export - export the request state to @out
 *
 * Uses the algorithm's ->export(), or the request-context based
 * crypto_lskcipher_export() when the algorithm's type is not
 * crypto_skcipher_type.
 */
int crypto_skcipher_export(struct skcipher_request *req, void *out)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct skcipher_alg *alg = crypto_skcipher_alg(tfm);

        if (alg->co.base.cra_type == &crypto_skcipher_type)
                return alg->export(req, out);

        return crypto_lskcipher_export(req, out);
}
EXPORT_SYMBOL_GPL(crypto_skcipher_export);

/*
 * crypto_skcipher_import - load the request state from @in
 *
 * Uses the algorithm's ->import(), or the request-context based
 * crypto_lskcipher_import() when the algorithm's type is not
 * crypto_skcipher_type.
 */
int crypto_skcipher_import(struct skcipher_request *req, const void *in)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct skcipher_alg *alg = crypto_skcipher_alg(tfm);

        if (alg->co.base.cra_type == &crypto_skcipher_type)
                return alg->import(req, in);

        return crypto_lskcipher_import(req, in);
}
EXPORT_SYMBOL_GPL(crypto_skcipher_import);

static void crypto_skcipher_exit_tfm(struct crypto_tfm *tfm)
{
        struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm);
        struct skcipher_alg *alg = crypto_skcipher_alg(skcipher);

        alg->exit(skcipher);
}

/*
 * Transform initialisation for the skcipher frontend.
 *
 * The transform starts out flagged as needing a key (when the algorithm takes
 * one).  For an algorithm whose type is crypto_skcipher_type, the exit hook
 * is installed if the algorithm has ->exit(), and ->init() is run if present.
 * For any other type the request size is set to alignment slack + ivsize +
 * statesize and initialisation continues in crypto_init_lskcipher_ops_sg().
 */
static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm)
{
        struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm);
        struct skcipher_alg *alg = crypto_skcipher_alg(skcipher);
        unsigned int reqsize;

        skcipher_set_needkey(skcipher);

        if (tfm->__crt_alg->cra_type == &crypto_skcipher_type) {
                if (alg->exit)
                        skcipher->base.exit = crypto_skcipher_exit_tfm;

                return alg->init ? alg->init(skcipher) : 0;
        }

        /* Slack beyond what the context alignment already guarantees. */
        reqsize = crypto_skcipher_alignmask(skcipher) &
                  ~(crypto_tfm_ctx_alignment() - 1);
        reqsize += crypto_skcipher_ivsize(skcipher);
        reqsize += crypto_skcipher_statesize(skcipher);
        crypto_skcipher_set_reqsize(skcipher, reqsize);

        return crypto_init_lskcipher_ops_sg(tfm);
}

/*
 * Extra context size for a transform of @alg: the generic
 * crypto_alg_extsize() for algorithms of crypto_skcipher_type, otherwise just
 * room for one struct crypto_lskcipher pointer.
 */
static unsigned int crypto_skcipher_extsize(struct crypto_alg *alg)
{
        if (alg->cra_type == &crypto_skcipher_type)
                return crypto_alg_extsize(alg);

        return sizeof(struct crypto_lskcipher *);
}

/* Release a template instance through the skcipher_instance's ->free(). */
static void crypto_skcipher_free_instance(struct crypto_instance *inst)
{
        struct skcipher_instance *sinst;

        sinst = container_of(inst, struct skcipher_instance, s.base);
        sinst->free(sinst);
}

static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
        __maybe_unused;
static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
{
        struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg);

        seq_printf(m, "type         : skcipher\n");
        seq_printf(m, "async        : %s\n",
                   alg->cra_flags & CRYPTO_ALG_ASYNC ?  "yes" : "no");
        seq_printf(m, "blocksize    : %u\n", alg->cra_blocksize);
        seq_printf(m, "min keysize  : %u\n", skcipher->min_keysize);
        seq_printf(m, "max keysize  : %u\n", skcipher->max_keysize);
        seq_printf(m, "ivsize       : %u\n", skcipher->ivsize);
        seq_printf(m, "chunksize    : %u\n", skcipher->chunksize);
        seq_printf(m, "walksize     : %u\n", skcipher->walksize);
        seq_printf(m, "statesize    : %u\n", skcipher->statesize);
}

/*
 * Fill a crypto_report_blkcipher describing @alg and append it to @skb as a
 * CRYPTOCFGA_REPORT_BLKCIPHER attribute.
 *
 * Return: the result of nla_put().
 */
static int __maybe_unused crypto_skcipher_report(
        struct sk_buff *skb, struct crypto_alg *alg)
{
        struct crypto_report_blkcipher report;
        struct skcipher_alg *salg = __crypto_skcipher_alg(alg);

        /*
         * The whole structure is handed to nla_put() by size, so clear all
         * of it first rather than relying on per-field assignment.
         */
        memset(&report, 0, sizeof(report));

        report.blocksize = alg->cra_blocksize;
        report.min_keysize = salg->min_keysize;
        report.max_keysize = salg->max_keysize;
        report.ivsize = salg->ivsize;

        strscpy(report.type, "skcipher", sizeof(report.type));
        strscpy(report.geniv, "<none>", sizeof(report.geniv));

        return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER,
                       sizeof(report), &report);
}

/*
 * Type descriptor ("frontend") for skcipher transforms.  It wires up the
 * extsize/init_tfm/free callbacks defined in this file and is the object
 * passed to crypto_alloc_tfm(), crypto_type_has_alg() and stored in
 * spawn->base.frontend by the helpers below.
 */
static const struct crypto_type crypto_skcipher_type = {
        .extsize = crypto_skcipher_extsize,
        .init_tfm = crypto_skcipher_init_tfm,
        .free = crypto_skcipher_free_instance,
        /* The show/report hooks are only installed when their config is on. */
#ifdef CONFIG_PROC_FS
        .show = crypto_skcipher_show,
#endif
#if IS_ENABLED(CONFIG_CRYPTO_USER)
        .report = crypto_skcipher_report,
#endif
        .maskclear = ~CRYPTO_ALG_TYPE_MASK,
        .maskset = CRYPTO_ALG_TYPE_SKCIPHER_MASK,
        .type = CRYPTO_ALG_TYPE_SKCIPHER,
        /* Bytes of struct crypto_skcipher that precede its embedded base tfm. */
        .tfmsize = offsetof(struct crypto_skcipher, base),
};

/*
 * Grab the algorithm called @name for @spawn on behalf of @inst.  The spawn
 * is tagged with the skcipher frontend before the generic
 * crypto_grab_spawn() is invoked with @type and @mask.
 *
 * Return: the result of crypto_grab_spawn().
 */
int crypto_grab_skcipher(struct crypto_skcipher_spawn *spawn,
                         struct crypto_instance *inst,
                         const char *name, u32 type, u32 mask)
{
        struct crypto_spawn *base = &spawn->base;

        base->frontend = &crypto_skcipher_type;

        return crypto_grab_spawn(base, inst, name, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_grab_skcipher);

/*
 * Allocate an skcipher transform for @alg_name, filtered by @type and @mask.
 *
 * Return: whatever crypto_alloc_tfm() returns for the skcipher frontend.
 */
struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name,
                                              u32 type, u32 mask)
{
        struct crypto_skcipher *tfm;

        tfm = crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask);

        return tfm;
}
EXPORT_SYMBOL_GPL(crypto_alloc_skcipher);

/*
 * Allocate a synchronous skcipher transform for @alg_name.
 *
 * CRYPTO_ALG_ASYNC and CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE are added to @mask
 * before the lookup.  A successfully allocated tfm whose request size exceeds
 * MAX_SYNC_SKCIPHER_REQSIZE triggers a WARN, is freed again, and -EINVAL is
 * returned instead.
 *
 * Return: the tfm cast to struct crypto_sync_skcipher *, or an ERR_PTR().
 */
struct crypto_sync_skcipher *crypto_alloc_sync_skcipher(
                                const char *alg_name, u32 type, u32 mask)
{
        struct crypto_skcipher *tfm;

        /* Only sync algorithms allowed. */
        mask |= CRYPTO_ALG_ASYNC | CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE;

        tfm = crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask);
        if (IS_ERR(tfm))
                return (struct crypto_sync_skcipher *)tfm;

        /*
         * The result may be used with an on-stack request, so refuse
         * anything whose request size is above the fixed limit.
         */
        if (WARN_ON(crypto_skcipher_reqsize(tfm) >
                    MAX_SYNC_SKCIPHER_REQSIZE)) {
                crypto_free_skcipher(tfm);
                return ERR_PTR(-EINVAL);
        }

        return (struct crypto_sync_skcipher *)tfm;
}
EXPORT_SYMBOL_GPL(crypto_alloc_sync_skcipher);

/*
 * Query whether an skcipher algorithm matching @alg_name, @type and @mask is
 * available.
 *
 * Return: the result of crypto_type_has_alg() for the skcipher frontend.
 */
int crypto_has_skcipher(const char *alg_name, u32 type, u32 mask)
{
        int found;

        found = crypto_type_has_alg(alg_name, &crypto_skcipher_type, type, mask);

        return found;
}
EXPORT_SYMBOL_GPL(crypto_has_skcipher);

/*
 * Validate and normalise the fields of @alg that are shared between skcipher
 * flavours.
 *
 * ivsize and chunksize may each be at most PAGE_SIZE / 8; statesize, and
 * ivsize + statesize, at most PAGE_SIZE / 2.  A zero chunksize defaults to
 * the block size, and the type bits are cleared from cra_flags.
 *
 * Return: 0 on success, -EINVAL if any size limit is exceeded.
 */
int skcipher_prepare_alg_common(struct skcipher_alg_common *alg)
{
        struct crypto_alg *base = &alg->base;

        if (alg->ivsize > PAGE_SIZE / 8)
                return -EINVAL;
        if (alg->chunksize > PAGE_SIZE / 8)
                return -EINVAL;
        if (alg->statesize > PAGE_SIZE / 2)
                return -EINVAL;
        if ((alg->ivsize + alg->statesize) > PAGE_SIZE / 2)
                return -EINVAL;

        if (alg->chunksize == 0)
                alg->chunksize = base->cra_blocksize;

        base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;

        return 0;
}

/*
 * Prepare an skcipher_alg for registration.
 *
 * Runs the common checks first, then bounds walksize by PAGE_SIZE / 8 and
 * defaults a zero walksize to chunksize.  An algorithm without state gets the
 * skcipher_noimport/skcipher_noexport stubs; one with state must supply both
 * ->import and ->export.  Finally the skcipher type and type flag are set.
 *
 * Return: 0 on success or a negative errno.
 */
static int skcipher_prepare_alg(struct skcipher_alg *alg)
{
        struct crypto_alg *base = &alg->base;
        int err = skcipher_prepare_alg_common(&alg->co);

        if (err)
                return err;

        if (alg->walksize > PAGE_SIZE / 8)
                return -EINVAL;

        if (alg->walksize == 0)
                alg->walksize = alg->chunksize;

        if (alg->statesize) {
                /* Stateful algorithms must be able to save and restore. */
                if (!alg->import || !alg->export)
                        return -EINVAL;
        } else {
                alg->import = skcipher_noimport;
                alg->export = skcipher_noexport;
        }

        base->cra_type = &crypto_skcipher_type;
        base->cra_flags |= CRYPTO_ALG_TYPE_SKCIPHER;

        return 0;
}

/*
 * Register a single skcipher algorithm: prepare @alg, then hand its base
 * crypto_alg to crypto_register_alg().
 *
 * Return: 0 on success or the first negative errno encountered.
 */
int crypto_register_skcipher(struct skcipher_alg *alg)
{
        int err = skcipher_prepare_alg(alg);

        return err ? err : crypto_register_alg(&alg->base);
}
EXPORT_SYMBOL_GPL(crypto_register_skcipher);

/* Unregister @alg by passing its base crypto_alg to crypto_unregister_alg(). */
void crypto_unregister_skcipher(struct skcipher_alg *alg)
{
        struct crypto_alg *base = &alg->base;

        crypto_unregister_alg(base);
}
EXPORT_SYMBOL_GPL(crypto_unregister_skcipher);

/*
 * Register the first @count entries of @algs, in order.
 *
 * If one registration fails, every entry registered before it is
 * unregistered again in reverse order and the failure code is returned.
 *
 * Return: 0 when all registrations succeed, otherwise the first error.
 */
int crypto_register_skciphers(struct skcipher_alg *algs, int count)
{
        int i, ret;

        for (i = 0; i < count; i++) {
                ret = crypto_register_skcipher(&algs[i]);
                if (!ret)
                        continue;

                /* Roll back entries [0, i) from the most recent one down. */
                while (--i >= 0)
                        crypto_unregister_skcipher(&algs[i]);

                return ret;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_skciphers);

/* Unregister the first @count entries of @algs, from the last one down to 0. */
void crypto_unregister_skciphers(struct skcipher_alg *algs, int count)
{
        int i = count;

        while (--i >= 0)
                crypto_unregister_skcipher(&algs[i]);
}
EXPORT_SYMBOL_GPL(crypto_unregister_skciphers);

/*
 * Register the template instance @inst under @tmpl.
 *
 * The instance must provide a ->free() callback (a missing one triggers a
 * WARN and -EINVAL).  Its algorithm is prepared before the generic instance
 * registration runs.
 *
 * Return: 0 on success or a negative errno.
 */
int skcipher_register_instance(struct crypto_template *tmpl,
                           struct skcipher_instance *inst)
{
        int err;

        if (WARN_ON(!inst->free))
                return -EINVAL;

        err = skcipher_prepare_alg(&inst->alg);
        if (!err)
                err = crypto_register_instance(tmpl,
                                               skcipher_crypto_instance(inst));

        return err;
}
EXPORT_SYMBOL_GPL(skcipher_register_instance);

static int skcipher_setkey_simple(struct crypto_skcipher *tfm, const u8 *key,
                                  unsigned int keylen)
{
        struct crypto_cipher *cipher = skcipher_cipher_simple(tfm);

        crypto_cipher_clear_flags(cipher, CRYPTO_TFM_REQ_MASK);
        crypto_cipher_set_flags(cipher, crypto_skcipher_get_flags(tfm) &
                                CRYPTO_TFM_REQ_MASK);
        return crypto_cipher_setkey(cipher, key, keylen);
}

/*
 * Default ->init() for simple modes: instantiate the cipher from the spawn
 * stored in the instance context and remember it in the tfm context.
 *
 * Return: 0 on success, or the error carried by crypto_spawn_cipher().
 */
static int skcipher_init_tfm_simple(struct crypto_skcipher *tfm)
{
        struct skcipher_ctx_simple *ctx = crypto_skcipher_ctx(tfm);
        struct skcipher_instance *inst = skcipher_alg_instance(tfm);
        struct crypto_cipher *child;

        child = crypto_spawn_cipher(skcipher_instance_ctx(inst));
        if (IS_ERR(child))
                return PTR_ERR(child);

        ctx->cipher = child;

        return 0;
}

/* Default ->exit() for simple modes: free the cipher held in the tfm context. */
static void skcipher_exit_tfm_simple(struct crypto_skcipher *tfm)
{
        struct skcipher_ctx_simple *simple_ctx = crypto_skcipher_ctx(tfm);
        struct crypto_cipher *child = simple_ctx->cipher;

        crypto_free_cipher(child);
}

/*
 * ->free() for instances made by skcipher_alloc_instance_simple(): drop the
 * cipher spawn kept in the instance context, then free the instance itself.
 */
static void skcipher_free_instance_simple(struct skcipher_instance *inst)
{
        struct crypto_cipher_spawn *spawn = skcipher_instance_ctx(inst);

        crypto_drop_cipher(spawn);
        kfree(inst);
}

/**
 * skcipher_alloc_instance_simple - allocate instance of simple block cipher mode
 *
 * Allocate an skcipher_instance for a simple block cipher mode of operation,
 * e.g. cbc or ecb.  The instance context will have just a single crypto_spawn,
 * that for the underlying cipher.  The {min,max}_keysize, ivsize, blocksize,
 * alignmask, and priority are set from the underlying cipher but can be
 * overridden if needed.  The tfm context defaults to skcipher_ctx_simple, and
 * default ->setkey(), ->init(), and ->exit() methods are installed.
 *
 * @tmpl: the template being instantiated
 * @tb: the template parameters
 *
 * Return: a pointer to the new instance, or an ERR_PTR().  The caller still
 *           needs to register the instance.
 */
struct skcipher_instance *skcipher_alloc_instance_simple(
        struct crypto_template *tmpl, struct rtattr **tb)
{
        u32 mask;
        struct skcipher_instance *inst;
        struct crypto_cipher_spawn *spawn;
        struct crypto_alg *cipher_alg;
        int err;

        err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER, &mask);
        if (err)
                return ERR_PTR(err);

        inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
        if (!inst)
                return ERR_PTR(-ENOMEM);
        spawn = skcipher_instance_ctx(inst);

        err = crypto_grab_cipher(spawn, skcipher_crypto_instance(inst),
                                 crypto_attr_alg_name(tb[1]), 0, mask);
        if (err)
                goto err_free_inst;
        cipher_alg = crypto_spawn_cipher_alg(spawn);

        err = crypto_inst_setname(skcipher_crypto_instance(inst), tmpl->name,
                                  cipher_alg);
        if (err)
                goto err_free_inst;

        inst->free = skcipher_free_instance_simple;

        /* Default algorithm properties, can be overridden */
        inst->alg.base.cra_blocksize = cipher_alg->cra_blocksize;
        inst->alg.base.cra_alignmask = cipher_alg->cra_alignmask;
        inst->alg.base.cra_priority = cipher_alg->cra_priority;
        inst->alg.min_keysize = cipher_alg->cra_cipher.cia_min_keysize;
        inst->alg.max_keysize = cipher_alg->cra_cipher.cia_max_keysize;
        inst->alg.ivsize = cipher_alg->cra_blocksize;

        /* Use skcipher_ctx_simple by default, can be overridden */
        inst->alg.base.cra_ctxsize = sizeof(struct skcipher_ctx_simple);
        inst->alg.setkey = skcipher_setkey_simple;
        inst->alg.init = skcipher_init_tfm_simple;
        inst->alg.exit = skcipher_exit_tfm_simple;

        return inst;

err_free_inst:
        skcipher_free_instance_simple(inst);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(skcipher_alloc_instance_simple);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Symmetric key cipher type");
MODULE_IMPORT_NS(CRYPTO_INTERNAL);













































































































































































































































































































































































































































































    2 




    2 




















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_DEFS_H
#define _ASM_X86_PGTABLE_DEFS_H

#include <linux/const.h>
#include <linux/mem_encrypt.h>

#include <asm/page_types.h>

/*
 * Bit positions (not masks) of the individual page-table entry flags.
 * Bit 7 has two names: _PAGE_BIT_PSE and _PAGE_BIT_PAT; on large pages the
 * PAT bit lives at _PAGE_BIT_PAT_LARGE instead.
 */
#define _PAGE_BIT_PRESENT        0        /* is present */
#define _PAGE_BIT_RW                1        /* writeable */
#define _PAGE_BIT_USER                2        /* userspace addressable */
#define _PAGE_BIT_PWT                3        /* page write through */
#define _PAGE_BIT_PCD                4        /* page cache disabled */
#define _PAGE_BIT_ACCESSED        5        /* was accessed (raised by CPU) */
#define _PAGE_BIT_DIRTY                6        /* was written to (raised by CPU) */
#define _PAGE_BIT_PSE                7        /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT                7        /* PAT bit on 4KB pages */
#define _PAGE_BIT_GLOBAL        8        /* Global TLB entry PPro+ */
#define _PAGE_BIT_SOFTW1        9        /* available for programmer */
#define _PAGE_BIT_SOFTW2        10        /* available for programmer */
#define _PAGE_BIT_SOFTW3        11        /* available for programmer */
#define _PAGE_BIT_PAT_LARGE        12        /* PAT bit on 2MB or 1GB pages */
#define _PAGE_BIT_SOFTW4        57        /* available for programmer */
#define _PAGE_BIT_SOFTW5        58        /* available for programmer */
#define _PAGE_BIT_PKEY_BIT0        59        /* Protection Keys, bit 1/4 */
#define _PAGE_BIT_PKEY_BIT1        60        /* Protection Keys, bit 2/4 */
#define _PAGE_BIT_PKEY_BIT2        61        /* Protection Keys, bit 3/4 */
#define _PAGE_BIT_PKEY_BIT3        62        /* Protection Keys, bit 4/4 */
#define _PAGE_BIT_NX                63        /* No execute: only valid after cpuid check */

/*
 * Kernel uses of the software-available bits.  Note that _PAGE_BIT_SPECIAL
 * and _PAGE_BIT_CPA_TEST are both mapped onto _PAGE_BIT_SOFTW1.
 */
#define _PAGE_BIT_SPECIAL        _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST        _PAGE_BIT_SOFTW1
#define _PAGE_BIT_UFFD_WP        _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
#define _PAGE_BIT_SOFT_DIRTY        _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_DEVMAP        _PAGE_BIT_SOFTW4

#ifdef CONFIG_X86_64
#define _PAGE_BIT_SAVED_DIRTY        _PAGE_BIT_SOFTW5 /* Saved Dirty bit */
#else
/* Shared with _PAGE_BIT_UFFD_WP, which is not supported on 32 bit */
#define _PAGE_BIT_SAVED_DIRTY        _PAGE_BIT_SOFTW2 /* Saved Dirty bit */
#endif

/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE        _PAGE_BIT_GLOBAL

/*
 * Single-bit masks for the bit positions defined above, each built as a
 * pteval_t constant via _AT().
 */
#define _PAGE_PRESENT        (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
#define _PAGE_RW        (_AT(pteval_t, 1) << _PAGE_BIT_RW)
#define _PAGE_USER        (_AT(pteval_t, 1) << _PAGE_BIT_USER)
#define _PAGE_PWT        (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
#define _PAGE_PCD        (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
#define _PAGE_ACCESSED        (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
#define _PAGE_DIRTY        (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE        (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#define _PAGE_GLOBAL        (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_SOFTW1        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
#define _PAGE_SOFTW3        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3)
#define _PAGE_PAT        (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL        (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST        (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
/*
 * Without CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS the four key masks are
 * zero, which makes _PAGE_PKEY_MASK zero as well.
 */
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
#define _PAGE_PKEY_BIT0        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
#define _PAGE_PKEY_BIT1        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
#define _PAGE_PKEY_BIT2        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2)
#define _PAGE_PKEY_BIT3        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3)
#else
#define _PAGE_PKEY_BIT0        (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT1        (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT2        (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT3        (_AT(pteval_t, 0))
#endif

/* All four protection-key bits combined. */
#define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \
                         _PAGE_PKEY_BIT1 | \
                         _PAGE_PKEY_BIT2 | \
                         _PAGE_PKEY_BIT3)

/*
 * Dirty|Accessed on 64-bit and PAE builds, 0 otherwise.
 * NOTE(review): the name suggests a Knights Landing erratum workaround;
 * the erratum itself is not described here.
 */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)
#else
#define _PAGE_KNL_ERRATUM_MASK 0
#endif

/* Soft-dirty mask; collapses to 0 when CONFIG_MEM_SOFT_DIRTY is off. */
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SOFT_DIRTY        (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
#else
#define _PAGE_SOFT_DIRTY        (_AT(pteval_t, 0))
#endif

/*
 * Tracking the soft dirty bit when a page goes to swap is tricky.
 * We need a bit which can be stored in the pte _and_ does not conflict
 * with the swap entry format. On x86 bits 1-4 are *not* involved
 * in swap entry computation, but bit 7 is used for thp migration,
 * so we borrow bit 1 for soft dirty tracking.
 *
 * Please note that this bit must be treated as the swap dirty page
 * mark if and only if the PTE/PMD has the present bit clear!
 */
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SWP_SOFT_DIRTY        _PAGE_RW
#else
#define _PAGE_SWP_SOFT_DIRTY        (_AT(pteval_t, 0))
#endif

/*
 * userfaultfd write-protect masks: _PAGE_UFFD_WP uses its software bit,
 * while _PAGE_SWP_UFFD_WP reuses _PAGE_USER.  Both are 0 when
 * CONFIG_HAVE_ARCH_USERFAULTFD_WP is off.
 */
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
#define _PAGE_UFFD_WP                (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP)
#define _PAGE_SWP_UFFD_WP        _PAGE_USER
#else
#define _PAGE_UFFD_WP                (_AT(pteval_t, 0))
#define _PAGE_SWP_UFFD_WP        (_AT(pteval_t, 0))
#endif

/*
 * The masks for bits 57 and 63 are only non-zero on 64-bit and PAE builds;
 * elsewhere they are defined as 0.
 */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_NX        (_AT(pteval_t, 1) << _PAGE_BIT_NX)
#define _PAGE_DEVMAP        (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
#define _PAGE_SOFTW4        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4)
#else
#define _PAGE_NX        (_AT(pteval_t, 0))
#define _PAGE_DEVMAP        (_AT(pteval_t, 0))
#define _PAGE_SOFTW4        (_AT(pteval_t, 0))
#endif

/*
 * The hardware requires shadow stack to be Write=0,Dirty=1. However,
 * there are valid cases where the kernel might create read-only PTEs that
 * are dirty (e.g., fork(), mprotect(), uffd-wp(), soft-dirty tracking). In
 * this case, the _PAGE_SAVED_DIRTY bit is used instead of the HW-dirty bit,
 * to avoid creating a wrong "shadow stack" PTEs. Such PTEs have
 * (Write=0,SavedDirty=1,Dirty=0) set.
 */
#define _PAGE_SAVED_DIRTY        (_AT(pteval_t, 1) << _PAGE_BIT_SAVED_DIRTY)

#define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_SAVED_DIRTY)

#define _PAGE_PROTNONE        (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)

/*
 * Set of bits not changed in pte_modify.  The pte's
 * protection key is treated like _PAGE_RW, for
 * instance, and is *not* included in this mask since
 * pte_modify() does modify it.
 *
 * _PAGE_CHG_MASK adds the 4KB PAT bit to the common set; _HPAGE_CHG_MASK
 * adds _PAGE_PSE and the large-page PAT bit instead.
 */
#define _COMMON_PAGE_CHG_MASK        (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |        \
                                 _PAGE_SPECIAL | _PAGE_ACCESSED |        \
                                 _PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY |        \
                                 _PAGE_DEVMAP | _PAGE_CC | _PAGE_UFFD_WP)
#define _PAGE_CHG_MASK        (_COMMON_PAGE_CHG_MASK | _PAGE_PAT)
#define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE)

/*
 * The cache modes defined here are used to translate between pure SW usage
 * and the HW defined cache mode bits and/or PAT entries.
 *
 * The resulting bits for PWT, PCD and PAT should be chosen in a way
 * to have the WB mode at index 0 (all bits clear). This is the default
 * right now and likely would break too much if changed.
 */
#ifndef __ASSEMBLY__
enum page_cache_mode {
        _PAGE_CACHE_MODE_WB       = 0,  /* write-back */
        _PAGE_CACHE_MODE_WC       = 1,  /* write-combining */
        _PAGE_CACHE_MODE_UC_MINUS = 2,  /* UC- */
        _PAGE_CACHE_MODE_UC       = 3,  /* uncached */
        _PAGE_CACHE_MODE_WT       = 4,  /* write-through */
        _PAGE_CACHE_MODE_WP       = 5,  /* write-protect */

        /* Count of slots; values 6 and 7 have no named mode here. */
        _PAGE_CACHE_MODE_NUM      = 8
};
#endif

/*
 * Masks taken from the variables cc_mask and sme_me_mask, so unlike the
 * flags above these are not compile-time constants.
 */
#define _PAGE_CC                (_AT(pteval_t, cc_mask))
#define _PAGE_ENC                (_AT(pteval_t, sme_me_mask))

/* The three cache-attribute bits, for 4KB and for large pages. */
#define _PAGE_CACHE_MASK        (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
#define _PAGE_LARGE_CACHE_MASK        (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE)

/* Protection bits for the UC and WP cache modes, looked up at run time. */
#define _PAGE_NOCACHE                (cachemode2protval(_PAGE_CACHE_MODE_UC))
#define _PAGE_CACHE_WP                (cachemode2protval(_PAGE_CACHE_MODE_WP))

/*
 * Four-character shorthands so that the protection tables below line up
 * in columns.
 */
#define __PP _PAGE_PRESENT
#define __RW _PAGE_RW
#define _USR _PAGE_USER
#define ___A _PAGE_ACCESSED
#define ___D _PAGE_DIRTY
#define ___G _PAGE_GLOBAL
#define __NX _PAGE_NX

#define _ENC _PAGE_ENC
#define __WP _PAGE_CACHE_WP
#define __NC _PAGE_NOCACHE
#define _PSE _PAGE_PSE

/* Accessor and constructors for pgprot_t; __pg() is a short alias of __pgprot(). */
#define pgprot_val(x)                ((x).pgprot)
#define __pgprot(x)                ((pgprot_t) { (x) } )
#define __pg(x)                        __pgprot(x)

/*
 * User-space page protections.  Columns, left to right:
 * present | writable | user | accessed | no-exec | dirty | PSE | global.
 * PAGE_NONE has no present bit; its ___G is the bit that also serves as
 * _PAGE_PROTNONE.
 */
#define PAGE_NONE             __pg(   0|   0|   0|___A|   0|   0|   0|___G)
#define PAGE_SHARED             __pg(__PP|__RW|_USR|___A|__NX|   0|   0|   0)
#define PAGE_SHARED_EXEC     __pg(__PP|__RW|_USR|___A|   0|   0|   0|   0)
#define PAGE_COPY_NOEXEC     __pg(__PP|   0|_USR|___A|__NX|   0|   0|   0)
#define PAGE_COPY_EXEC             __pg(__PP|   0|_USR|___A|   0|   0|   0|   0)
#define PAGE_COPY             __pg(__PP|   0|_USR|___A|__NX|   0|   0|   0)
#define PAGE_READONLY             __pg(__PP|   0|_USR|___A|__NX|   0|   0|   0)
#define PAGE_READONLY_EXEC   __pg(__PP|   0|_USR|___A|   0|   0|   0|   0)

/*
 * Raw (pteval_t) kernel protections, using the same column layout as the
 * user-space table: present | writable | user | accessed | no-exec | dirty |
 * PSE | global, optionally followed by an encryption or cache-mode term.
 */
#define __PAGE_KERNEL                 (__PP|__RW|   0|___A|__NX|___D|   0|___G)
#define __PAGE_KERNEL_EXEC         (__PP|__RW|   0|___A|   0|___D|   0|___G)

/*
 * Page tables need to have Write=1 in order for any lower PTEs to be
 * writable. This includes shadow stack memory (Write=0, Dirty=1)
 */
#define _KERNPG_TABLE_NOENC         (__PP|__RW|   0|___A|   0|___D|   0|   0)
#define _KERNPG_TABLE                 (__PP|__RW|   0|___A|   0|___D|   0|   0| _ENC)
#define _PAGE_TABLE_NOENC         (__PP|__RW|_USR|___A|   0|___D|   0|   0)
#define _PAGE_TABLE                 (__PP|__RW|_USR|___A|   0|___D|   0|   0| _ENC)

/*
 * __PAGE_KERNEL and __PAGE_KERNEL_EXEC are repeated below with the same
 * replacement text as above, so the redefinition is harmless.
 */
#define __PAGE_KERNEL_RO         (__PP|   0|   0|___A|__NX|   0|   0|___G)
#define __PAGE_KERNEL_ROX         (__PP|   0|   0|___A|   0|   0|   0|___G)
#define __PAGE_KERNEL                 (__PP|__RW|   0|___A|__NX|___D|   0|___G)
#define __PAGE_KERNEL_EXEC         (__PP|__RW|   0|___A|   0|___D|   0|___G)
#define __PAGE_KERNEL_NOCACHE         (__PP|__RW|   0|___A|__NX|___D|   0|___G| __NC)
#define __PAGE_KERNEL_VVAR         (__PP|   0|_USR|___A|__NX|   0|   0|___G)
#define __PAGE_KERNEL_LARGE         (__PP|__RW|   0|___A|__NX|___D|_PSE|___G)
#define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW|   0|___A|   0|___D|_PSE|___G)
#define __PAGE_KERNEL_WP         (__PP|__RW|   0|___A|__NX|___D|   0|___G| __WP)


/* The I/O variants are plain aliases of the corresponding kernel values. */
#define __PAGE_KERNEL_IO                __PAGE_KERNEL
#define __PAGE_KERNEL_IO_NOCACHE        __PAGE_KERNEL_NOCACHE

#ifndef __ASSEMBLY__

/* Raw kernel protections with (_ENC) and without (| 0) the encryption mask. */
#define __PAGE_KERNEL_ENC        (__PAGE_KERNEL    | _ENC)
#define __PAGE_KERNEL_ENC_WP        (__PAGE_KERNEL_WP | _ENC)
#define __PAGE_KERNEL_NOENC        (__PAGE_KERNEL    |    0)
#define __PAGE_KERNEL_NOENC_WP        (__PAGE_KERNEL_WP |    0)

/* Build a pgprot_t from @x, keeping only bits set in __default_kernel_pte_mask. */
#define __pgprot_mask(x)        __pgprot((x) & __default_kernel_pte_mask)

/*
 * pgprot_t kernel protections.  Each one is filtered through
 * __pgprot_mask(); the I/O variants are the only ones without an explicit
 * "| _ENC" or "| 0" term.
 */
#define PAGE_KERNEL                __pgprot_mask(__PAGE_KERNEL            | _ENC)
#define PAGE_KERNEL_NOENC        __pgprot_mask(__PAGE_KERNEL            |    0)
#define PAGE_KERNEL_RO                __pgprot_mask(__PAGE_KERNEL_RO         | _ENC)
#define PAGE_KERNEL_EXEC        __pgprot_mask(__PAGE_KERNEL_EXEC       | _ENC)
#define PAGE_KERNEL_EXEC_NOENC        __pgprot_mask(__PAGE_KERNEL_EXEC       |    0)
#define PAGE_KERNEL_ROX                __pgprot_mask(__PAGE_KERNEL_ROX        | _ENC)
#define PAGE_KERNEL_NOCACHE        __pgprot_mask(__PAGE_KERNEL_NOCACHE    | _ENC)
#define PAGE_KERNEL_LARGE        __pgprot_mask(__PAGE_KERNEL_LARGE      | _ENC)
#define PAGE_KERNEL_LARGE_EXEC        __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC)
#define PAGE_KERNEL_VVAR        __pgprot_mask(__PAGE_KERNEL_VVAR       | _ENC)

#define PAGE_KERNEL_IO                __pgprot_mask(__PAGE_KERNEL_IO)
#define PAGE_KERNEL_IO_NOCACHE        __pgprot_mask(__PAGE_KERNEL_IO_NOCACHE)

#endif        /* __ASSEMBLY__ */

/*
 * Early identity mapping pte attribute macros.  64-bit reuses
 * __PAGE_KERNEL_LARGE_EXEC; other builds use the literal values below.
 */
#ifdef CONFIG_X86_64
#define __PAGE_KERNEL_IDENT_LARGE_EXEC        __PAGE_KERNEL_LARGE_EXEC
#else
#define PTE_IDENT_ATTR         0x003                /* PRESENT+RW */
#define PDE_IDENT_ATTR         0x063                /* PRESENT+RW+DIRTY+ACCESSED */
#define PGD_IDENT_ATTR         0x001                /* PRESENT (no other attributes) */
#endif

#ifdef CONFIG_X86_32
# include <asm/pgtable_32_types.h>
#else
# include <asm/pgtable_64_types.h>
#endif

#ifndef __ASSEMBLY__

#include <linux/types.h>

/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */
#define PTE_PFN_MASK                ((pteval_t)PHYSICAL_PAGE_MASK)

/*
 * Extracts the flags from a (pte|pmd|pud|pgd)val_t.
 * This includes the protection key value.  It is the exact complement of
 * PTE_PFN_MASK.
 */
#define PTE_FLAGS_MASK                (~PTE_PFN_MASK)

/* Single-member wrapper structs around the raw protection and pgd values. */
typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;

typedef struct { pgdval_t pgd; } pgd_t;

/* Return a copy of @prot with _PAGE_NX additionally set. */
static inline pgprot_t pgprot_nx(pgprot_t prot)
{
        /* @prot is passed by value, so updating it in place is local. */
        pgprot_val(prot) |= _PAGE_NX;

        return prot;
}
/* Defined to itself so the preprocessor can see that pgprot_nx exists. */
#define pgprot_nx pgprot_nx

/*
 * PGD_ALLOWED_BITS is the set of bits native_make_pgd()/native_pgd_val()
 * keep in a pgd value: restricted under PAE, everything otherwise.
 */
#ifdef CONFIG_X86_PAE

/*
 * PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't
 * use it here.
 */

#define PGD_PAE_PAGE_MASK        ((signed long)PAGE_MASK)
#define PGD_PAE_PHYS_MASK        (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK)

/*
 * PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries.
 * All other bits are Reserved MBZ
 */
#define PGD_ALLOWED_BITS        (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \
                                 _PAGE_PWT | _PAGE_PCD | \
                                 _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3)

#else
/* No need to mask any bits for !PAE */
#define PGD_ALLOWED_BITS        (~0ULL)
#endif

/* Wrap @val in a pgd_t, keeping only the bits in PGD_ALLOWED_BITS. */
static inline pgd_t native_make_pgd(pgdval_t val)
{
        val &= PGD_ALLOWED_BITS;

        return (pgd_t) { val };
}

/* Return the raw value of @pgd, restricted to PGD_ALLOWED_BITS. */
static inline pgdval_t native_pgd_val(pgd_t pgd)
{
        pgdval_t raw = pgd.pgd;

        return raw & PGD_ALLOWED_BITS;
}

/* Return only the flag bits (PTE_FLAGS_MASK) of @pgd. */
static inline pgdval_t pgd_flags(pgd_t pgd)
{
        pgdval_t raw = native_pgd_val(pgd);

        return raw & PTE_FLAGS_MASK;
}

/*
 * p4d constructor/accessor.  With more than four page-table levels p4d_t is
 * a real single-member struct; otherwise the generic "nop4d" header is used
 * and the helpers go through the pgd helpers on the embedded pgd.
 *
 * NOTE(review): @val is declared pudval_t rather than p4dval_t in both
 * variants; presumably the two typedefs are identical; confirm.
 */
#if CONFIG_PGTABLE_LEVELS > 4
typedef struct { p4dval_t p4d; } p4d_t;

static inline p4d_t native_make_p4d(pudval_t val)
{
        p4d_t entry = { .p4d = val };

        return entry;
}

static inline p4dval_t native_p4d_val(p4d_t p4d)
{
        p4dval_t raw = p4d.p4d;

        return raw;
}
#else
#include <asm-generic/pgtable-nop4d.h>

static inline p4d_t native_make_p4d(pudval_t val)
{
        p4d_t entry = { .pgd = native_make_pgd((pgdval_t)val) };

        return entry;
}

static inline p4dval_t native_p4d_val(p4d_t p4d)
{
        pgd_t inner = p4d.pgd;

        return native_pgd_val(inner);
}
#endif

/*
 * pud constructor/accessor.  With more than three page-table levels pud_t is
 * a real single-member struct; otherwise the generic "nopud" header is used
 * and the helpers go through the pgd helpers on the embedded p4d.pgd.
 *
 * NOTE(review): in the > 3 levels variant @val is declared pmdval_t rather
 * than pudval_t; presumably the typedefs are identical; confirm.
 */
#if CONFIG_PGTABLE_LEVELS > 3
typedef struct { pudval_t pud; } pud_t;

static inline pud_t native_make_pud(pmdval_t val)
{
        pud_t entry = { .pud = val };

        return entry;
}

static inline pudval_t native_pud_val(pud_t pud)
{
        pudval_t raw = pud.pud;

        return raw;
}
#else
#include <asm-generic/pgtable-nopud.h>

static inline pud_t native_make_pud(pudval_t val)
{
        pud_t entry = { .p4d.pgd = native_make_pgd(val) };

        return entry;
}

static inline pudval_t native_pud_val(pud_t pud)
{
        pgd_t inner = pud.p4d.pgd;

        return native_pgd_val(inner);
}
#endif

/*
 * pmd constructor/accessor.  With more than two page-table levels the pmd
 * value is stored directly in the .pmd member; otherwise the generic "nopmd"
 * header is used and the helpers go through the pgd helpers on the embedded
 * pud.p4d.pgd.
 */
#if CONFIG_PGTABLE_LEVELS > 2
static inline pmd_t native_make_pmd(pmdval_t val)
{
        pmd_t entry = { .pmd = val };

        return entry;
}

static inline pmdval_t native_pmd_val(pmd_t pmd)
{
        pmdval_t raw = pmd.pmd;

        return raw;
}
#else
#include <asm-generic/pgtable-nopmd.h>

static inline pmd_t native_make_pmd(pmdval_t val)
{
        pmd_t entry = { .pud.p4d.pgd = native_make_pgd(val) };

        return entry;
}

static inline pmdval_t native_pmd_val(pmd_t pmd)
{
        pgd_t inner = pmd.pud.p4d.pgd;

        return native_pgd_val(inner);
}
#endif

/*
 * Mask selecting the PFN bits of a p4d entry.  @p4d is not inspected: the
 * result is always PTE_PFN_MASK, unlike the pud/pmd variants which check
 * _PAGE_PSE.
 */
static inline p4dval_t p4d_pfn_mask(p4d_t p4d)
{
        /* No 512 GiB huge pages yet */
        return PTE_PFN_MASK;
}

/* Mask selecting the flag bits of @p4d: the complement of its PFN mask. */
static inline p4dval_t p4d_flags_mask(p4d_t p4d)
{
        p4dval_t pfn_bits = p4d_pfn_mask(p4d);

        return ~pfn_bits;
}

/* Return the flag bits of @p4d, i.e. its value with the PFN bits removed. */
static inline p4dval_t p4d_flags(p4d_t p4d)
{
        p4dval_t raw = native_p4d_val(p4d);

        return raw & p4d_flags_mask(p4d);
}

/*
 * Mask selecting the PFN bits of @pud: PHYSICAL_PUD_PAGE_MASK when the entry
 * has _PAGE_PSE set, PTE_PFN_MASK otherwise.
 */
static inline pudval_t pud_pfn_mask(pud_t pud)
{
        if (!(native_pud_val(pud) & _PAGE_PSE))
                return PTE_PFN_MASK;

        return PHYSICAL_PUD_PAGE_MASK;
}

/* Mask selecting the flag bits of @pud: the complement of its PFN mask. */
static inline pudval_t pud_flags_mask(pud_t pud)
{
        pudval_t pfn_bits = pud_pfn_mask(pud);

        return ~pfn_bits;
}

/* Return the flag bits of @pud, i.e. its value with the PFN bits removed. */
static inline pudval_t pud_flags(pud_t pud)
{
        pudval_t raw = native_pud_val(pud);

        return raw & pud_flags_mask(pud);
}

/*
 * Mask selecting the PFN bits of @pmd: PHYSICAL_PMD_PAGE_MASK when the entry
 * has _PAGE_PSE set, PTE_PFN_MASK otherwise.
 */
static inline pmdval_t pmd_pfn_mask(pmd_t pmd)
{
        if (!(native_pmd_val(pmd) & _PAGE_PSE))
                return PTE_PFN_MASK;

        return PHYSICAL_PMD_PAGE_MASK;
}

/* Mask selecting the flag bits of @pmd: the complement of its PFN mask. */
static inline pmdval_t pmd_flags_mask(pmd_t pmd)
{
        pmdval_t pfn_bits = pmd_pfn_mask(pmd);

        return ~pfn_bits;
}

/* Return the flag bits of @pmd, i.e. its value with the PFN bits removed. */
static inline pmdval_t pmd_flags(pmd_t pmd)
{
        pmdval_t raw = native_pmd_val(pmd);

        return raw & pmd_flags_mask(pmd);
}

/* Wrap the raw value @val in a pte_t without modifying it. */
static inline pte_t native_make_pte(pteval_t val)
{
        pte_t entry = { .pte = val };

        return entry;
}

/* Return the raw value stored in @pte. */
static inline pteval_t native_pte_val(pte_t pte)
{
        pteval_t raw = pte.pte;

        return raw;
}

/* Return only the flag bits (PTE_FLAGS_MASK) of @pte. */
static inline pteval_t pte_flags(pte_t pte)
{
        pteval_t raw = native_pte_val(pte);

        return raw & PTE_FLAGS_MASK;
}

/*
 * Convert between the cache bits of a 4KB pte and a 3-bit index:
 * PAT -> index bit 2, PCD -> index bit 1, PWT -> index bit 0.
 * __pte2cm_idx() packs the bits of @cb into the index; __cm_idx2pte()
 * spreads index @i back out to the pte bit positions.  Both macros evaluate
 * their argument three times.
 */
#define __pte2cm_idx(cb)                                \
        ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) |                \
         (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) |                \
         (((cb) >> _PAGE_BIT_PWT) & 1))
#define __cm_idx2pte(i)                                        \
        ((((i) & 4) << (_PAGE_BIT_PAT - 2)) |                \
         (((i) & 2) << (_PAGE_BIT_PCD - 1)) |                \
         (((i) & 1) << _PAGE_BIT_PWT))

/* Defined elsewhere; used by _PAGE_NOCACHE and _PAGE_CACHE_WP in this header. */
unsigned long cachemode2protval(enum page_cache_mode pcm);

/*
 * Convert 4KB-page protection bits to the large-page layout: the bit at
 * _PAGE_BIT_PAT is moved up to _PAGE_BIT_PAT_LARGE and both PAT positions
 * are cleared from the rest of @val.
 */
static inline pgprotval_t protval_4k_2_large(pgprotval_t val)
{
        pgprotval_t kept = val & ~(_PAGE_PAT | _PAGE_PAT_LARGE);
        pgprotval_t moved = (val & _PAGE_PAT) <<
                            (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT);

        return kept | moved;
}
/* pgprot_t wrapper around protval_4k_2_large(). */
static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot)
{
        pgprotval_t large = protval_4k_2_large(pgprot_val(pgprot));

        return __pgprot(large);
}
/*
 * Convert large-page protection bits to the 4KB layout: the bit at
 * _PAGE_BIT_PAT_LARGE is moved down to _PAGE_BIT_PAT and both PAT positions
 * are cleared from the rest of @val.
 */
static inline pgprotval_t protval_large_2_4k(pgprotval_t val)
{
        pgprotval_t kept = val & ~(_PAGE_PAT | _PAGE_PAT_LARGE);
        pgprotval_t moved = (val & _PAGE_PAT_LARGE) >>
                            (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT);

        return kept | moved;
}
/* pgprot_t wrapper around protval_large_2_4k(). */
static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot)
{
        pgprotval_t small = protval_large_2_4k(pgprot_val(pgprot));

        return __pgprot(small);
}


typedef struct page *pgtable_t;

extern pteval_t __supported_pte_mask;
extern pteval_t __default_kernel_pte_mask;
extern void set_nx(void);
extern int nx_enabled;

#define pgprot_writecombine        pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);

#define pgprot_writethrough        pgprot_writethrough
extern pgprot_t pgprot_writethrough(pgprot_t prot);

/* Indicate that x86 has its own track and untrack pfn vma functions */
#define __HAVE_PFNMAP_TRACKING

#define __HAVE_PHYS_MEM_ACCESS_PROT
struct file;
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                              unsigned long size, pgprot_t vma_prot);

/* Install a pte for a particular vaddr in kernel space. */
void set_pte_vaddr(unsigned long vaddr, pte_t pte);

#ifdef CONFIG_X86_32
extern void native_pagetable_init(void);
#else
#define native_pagetable_init        paging_init
#endif

/*
 * Page-table mapping levels, named after the size mapped at each level.
 * PG_LEVEL_NONE is 0 and PG_LEVEL_NUM is the number of enumerators before it.
 */
enum pg_level {
        PG_LEVEL_NONE,
        PG_LEVEL_4K,
        PG_LEVEL_2M,
        PG_LEVEL_1G,
        PG_LEVEL_512G,
        PG_LEVEL_NUM
};

/*
 * update_page_count() is provided externally when CONFIG_PROC_FS is set;
 * without it the call compiles to an empty inline stub.
 */
#ifdef CONFIG_PROC_FS
extern void update_page_count(int level, unsigned long pages);
#else
static inline void update_page_count(int level, unsigned long pages) { }
#endif

/*
 * Helper function that returns the kernel pagetable entry controlling
 * the virtual address 'address'. NULL means no pagetable entry present.
 * NOTE: the return type is pte_t but if the pmd is PSE then we return it
 * as a pte too.
 * NOTE(review): '*level' presumably receives the enum pg_level of the entry
 * that was found -- confirm against the definition.
 */
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
/* Same signature as lookup_address(), plus the pgd to look in. */
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
                                    unsigned int *level);
/*
 * Takes the same arguments as lookup_address_in_pgd() plus two bool
 * pointers, 'nx' and 'rw'.
 * NOTE(review): these look like output parameters describing the mapping's
 * attributes -- confirm against the definition.
 */
pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
                                  unsigned int *level, bool *nx, bool *rw);
extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
/*
 * The two helpers below are marked __init.
 * NOTE(review): 'numpages' is declared 'unsigned' for the map helper but
 * 'unsigned long' for the unmap helper -- confirm whether the mismatch is
 * intentional.
 */
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
                                          unsigned long address,
                                          unsigned numpages,
                                          unsigned long page_flags);
extern int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
                                            unsigned long numpages);
#endif        /* !__ASSEMBLY__ */

#endif /* _ASM_X86_PGTABLE_DEFS_H */

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 








    1 









    1 





    1 

    1 













































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mac80211_hwsim - software simulator of 802.11 radio(s) for mac80211
 * Copyright (c) 2008, Jouni Malinen <j@w1.fi>
 * Copyright (c) 2011, Javier Lopez <jlopex@gmail.com>
 * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
 * Copyright (C) 2018 - 2024 Intel Corporation
 */

/*
 * TODO:
 * - Add TSF sync and fix IBSS beacon transmission by adding
 *   competition for "air time" at TBTT
 * - RX filtering based on filter configuration (data->rx_filter)
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/mac80211.h>
#include <net/ieee80211_radiotap.h>
#include <linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/etherdevice.h>
#include <linux/platform_device.h>
#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/ktime.h>
#include <net/genetlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <linux/rhashtable.h>
#include <linux/nospec.h>
#include <linux/virtio.h>
#include <linux/virtio_ids.h>
#include <linux/virtio_config.h>
#include "mac80211_hwsim.h"

/*
 * Queue-length thresholds. Their users are outside this part of the file;
 * presumably WARN_QUEUE is a warning level and MAX_QUEUE a hard limit for
 * a frame queue -- TODO confirm against the users.
 */
#define WARN_QUEUE 100
#define MAX_QUEUE 200

/*
 * Module metadata and load-time parameters.
 *
 * All parameters are exported with mode 0444 (read-only) except paged_rx,
 * which uses 0644. Defaults: radios = 2, channels = 1, paged_rx = false,
 * rctbl = false, support_p2p_device = true, mlo = false (zero-initialized
 * static).
 */
MODULE_AUTHOR("Jouni Malinen");
MODULE_DESCRIPTION("Software simulator of 802.11 radio(s) for mac80211");
MODULE_LICENSE("GPL");

static int radios = 2;
module_param(radios, int, 0444);
MODULE_PARM_DESC(radios, "Number of simulated radios");

static int channels = 1;
module_param(channels, int, 0444);
MODULE_PARM_DESC(channels, "Number of concurrent channels");

static bool paged_rx = false;
module_param(paged_rx, bool, 0644);
MODULE_PARM_DESC(paged_rx, "Use paged SKBs for RX instead of linear ones");

static bool rctbl = false;
module_param(rctbl, bool, 0444);
MODULE_PARM_DESC(rctbl, "Handle rate control table");

static bool support_p2p_device = true;
module_param(support_p2p_device, bool, 0444);
MODULE_PARM_DESC(support_p2p_device, "Support P2P-Device interface type");

static bool mlo;
module_param(mlo, bool, 0444);
MODULE_PARM_DESC(mlo, "Support MLO");

/**
 * enum hwsim_regtest - the type of regulatory tests we offer
 *
 * @HWSIM_REGTEST_DISABLED: No regulatory tests are performed,
 *	this is the default value.
 * @HWSIM_REGTEST_DRIVER_REG_FOLLOW: Used for testing the driver regulatory
 *	hint. Only one driver regulatory hint will be sent; as such, the
 *	secondary radios are expected to follow.
 * @HWSIM_REGTEST_DRIVER_REG_ALL: Used for testing the driver regulatory
 *	request with all radios reporting the same regulatory domain.
 * @HWSIM_REGTEST_DIFF_COUNTRY: Used for testing drivers requesting
 *	different regulatory domains. Expected behaviour is for
 *	an intersection to occur but each device will still use its
 *	respective requested regulatory domain. Subsequent radios will
 *	use the resulting intersection.
 * @HWSIM_REGTEST_WORLD_ROAM: Used for testing world roaming. We accomplish
 *	this by using a custom beacon-capable regulatory domain for the first
 *	radio. All other devices will world roam.
 * @HWSIM_REGTEST_CUSTOM_WORLD: Used for testing the custom world regulatory
 *	domain requests. All radios will adhere to this custom world regulatory
 *	domain.
 * @HWSIM_REGTEST_CUSTOM_WORLD_2: Used for testing 2 custom world regulatory
 *	domain requests. The first radio will adhere to the first custom world
 *	regulatory domain, the second one to the second custom world regulatory
 *	domain. All other devices will world roam.
 * @HWSIM_REGTEST_STRICT_FOLLOW: Used for testing strict regulatory domain
 *	settings. Only the first radio will send a regulatory domain request
 *	and use strict settings. The rest of the radios are expected to follow.
 * @HWSIM_REGTEST_STRICT_ALL: Used for testing strict regulatory domain
 *	settings. All radios will adhere to this.
 * @HWSIM_REGTEST_STRICT_AND_DRIVER_REG: Used for testing strict regulatory
 *	domain settings, combined with secondary driver regulatory domain
 *	settings. The first radio will get a strict regulatory domain setting
 *	using the first driver regulatory request and the second radio will use
 *	non-strict settings using the second driver regulatory request. All
 *	other devices should follow the intersection created between the
 *	first two.
 * @HWSIM_REGTEST_ALL: Used for testing every possible mix. You will need
 *	at least 6 radios for a complete test. We will test in this order:
 *	1 - driver custom world regulatory domain
 *	2 - second custom world regulatory domain
 *	3 - first driver regulatory domain request
 *	4 - second driver regulatory domain request
 *	5 - strict regulatory domain settings using the third driver regulatory
 *	    domain request
 *	6 and on - should follow the intersection of the 3rd, 4th and 5th radio
 *	           regulatory requests.
 *
 * These are the different values you can use for the regtest
 * module parameter. This is useful to help test world roaming
 * and the driver regulatory_hint() call and combinations of these.
 * If you want to do specific alpha2 regulatory domain tests simply
 * use the userspace regulatory request as that will be respected as
 * well without the need of this module parameter. This is designed
 * only for testing the driver regulatory request, world roaming
 * and all possible combinations.
 */
enum hwsim_regtest {
        HWSIM_REGTEST_DISABLED = 0,
        HWSIM_REGTEST_DRIVER_REG_FOLLOW = 1,
        HWSIM_REGTEST_DRIVER_REG_ALL = 2,
        HWSIM_REGTEST_DIFF_COUNTRY = 3,
        HWSIM_REGTEST_WORLD_ROAM = 4,
        HWSIM_REGTEST_CUSTOM_WORLD = 5,
        HWSIM_REGTEST_CUSTOM_WORLD_2 = 6,
        HWSIM_REGTEST_STRICT_FOLLOW = 7,
        HWSIM_REGTEST_STRICT_ALL = 8,
        HWSIM_REGTEST_STRICT_AND_DRIVER_REG = 9,
        HWSIM_REGTEST_ALL = 10,
};

/*
 * Regulatory test mode selected at module load time. Set to one of the
 * HWSIM_REGTEST_* values above; defaults to HWSIM_REGTEST_DISABLED and is
 * exported read-only (0444).
 */
static int regtest = HWSIM_REGTEST_DISABLED;
module_param(regtest, int, 0444);
MODULE_PARM_DESC(regtest, "The type of regulatory test we want to run");

/*
 * Two-letter country codes. The users of this table are outside this part
 * of the file; presumably they feed regulatory hints for the regtest modes
 * -- TODO confirm. Note that "AL" appears twice (entries 1 and 5).
 */
static const char *hwsim_alpha2s[] = {
        "FI",
        "AL",
        "US",
        "DE",
        "JP",
        "AL",
};

/*
 * Custom regulatory domain #1: five rules covering 2.4 GHz channels
 * (2412-2462 and 2484 MHz centers) and the 5 GHz ranges 5150-5240,
 * 5745-5825 and 5855-5925 MHz, each widened by 10 MHz on both sides.
 * REG_RULE() arguments are assumed to follow cfg80211's order
 * (start, end, bandwidth, gain, EIRP, flags) -- confirm in cfg80211.h.
 * n_reg_rules must equal the number of entries in reg_rules.
 */
static const struct ieee80211_regdomain hwsim_world_regdom_custom_01 = {
        .n_reg_rules = 5,
        .alpha2 =  "99",
        .reg_rules = {
                REG_RULE(2412-10, 2462+10, 40, 0, 20, 0),
                REG_RULE(2484-10, 2484+10, 40, 0, 20, 0),
                REG_RULE(5150-10, 5240+10, 40, 0, 30, 0),
                REG_RULE(5745-10, 5825+10, 40, 0, 30, 0),
                REG_RULE(5855-10, 5925+10, 40, 0, 33, 0),
        }
};

/*
 * Custom regulatory domain #2: three rules covering 2412-2462 MHz,
 * 5725-5850 MHz (flagged NL80211_RRF_NO_IR) and 5855-5925 MHz, each
 * widened by 10 MHz on both sides. n_reg_rules must equal the number of
 * entries in reg_rules.
 */
static const struct ieee80211_regdomain hwsim_world_regdom_custom_02 = {
        .n_reg_rules = 3,
        .alpha2 =  "99",
        .reg_rules = {
                REG_RULE(2412-10, 2462+10, 40, 0, 20, 0),
                REG_RULE(5725-10, 5850+10, 40, 0, 30,
                         NL80211_RRF_NO_IR),
                REG_RULE(5855-10, 5925+10, 40, 0, 33, 0),
        }
};

/*
 * Custom regulatory domain #3: the same five ranges as custom_01 plus a
 * sixth rule for 5955-7125 MHz with a bandwidth argument of 320. Every
 * range is widened by 10 MHz on both sides. n_reg_rules must equal the
 * number of entries in reg_rules.
 */
static const struct ieee80211_regdomain hwsim_world_regdom_custom_03 = {
        .n_reg_rules = 6,
        .alpha2 =  "99",
        .reg_rules = {
                REG_RULE(2412 - 10, 2462 + 10, 40, 0, 20, 0),
                REG_RULE(2484 - 10, 2484 + 10, 40, 0, 20, 0),
                REG_RULE(5150 - 10, 5240 + 10, 40, 0, 30, 0),
                REG_RULE(5745 - 10, 5825 + 10, 40, 0, 30, 0),
                REG_RULE(5855 - 10, 5925 + 10, 40, 0, 33, 0),
                REG_RULE(5955 - 10, 7125 + 10, 320, 0, 33, 0),
        }
};

/*
 * Custom regulatory domain #4: seven rules, including two DFS ranges
 * (5260-5320 and 5500-5720 MHz) flagged NL80211_RRF_DFS_CONCURRENT.
 * Every range is widened by 10 MHz on both sides.
 *
 * n_reg_rules must equal the number of entries in reg_rules; it was
 * previously 6, which left the final 5855-5925 MHz rule unused.
 */
static const struct ieee80211_regdomain hwsim_world_regdom_custom_04 = {
        .n_reg_rules = 7,
        .alpha2 =  "99",
        .reg_rules = {
                REG_RULE(2412 - 10, 2462 + 10, 40, 0, 20, 0),
                REG_RULE(2484 - 10, 2484 + 10, 40, 0, 20, 0),
                REG_RULE(5150 - 10, 5240 + 10, 80, 0, 30, NL80211_RRF_AUTO_BW),
                REG_RULE(5260 - 10, 5320 + 10, 80, 0, 30,
                         NL80211_RRF_DFS_CONCURRENT | NL80211_RRF_DFS |
                         NL80211_RRF_AUTO_BW),
                REG_RULE(5500 - 10, 5720 + 10, 160, 0, 30,
                         NL80211_RRF_DFS_CONCURRENT | NL80211_RRF_DFS),
                REG_RULE(5745 - 10, 5825 + 10, 80, 0, 30, 0),
                REG_RULE(5855 - 10, 5925 + 10, 80, 0, 33, 0),
        }
};

/*
 * Index of the custom regulatory domains defined above, in definition
 * order (entry 0 is custom_01, entry 3 is custom_04).
 */
static const struct ieee80211_regdomain *hwsim_world_regdom_custom[] = {
        &hwsim_world_regdom_custom_01,
        &hwsim_world_regdom_custom_02,
        &hwsim_world_regdom_custom_03,
        &hwsim_world_regdom_custom_04,
};

/*
 * Driver-private per-interface data. The hwsim_*_magic() helpers access it
 * by casting ieee80211_vif::drv_priv to this type.
 */
struct hwsim_vif_priv {
        /* HWSIM_VIF_MAGIC after hwsim_set_magic(), 0 after hwsim_clear_magic() */
        u32 magic;
        /* one entry per possible MLD link; presumably beacons left to skip -- TODO confirm */
        u32 skip_beacons[IEEE80211_MLD_MAX_NUM_LINKS];
        /* the remaining fields are written outside this part of the file */
        u8 bssid[ETH_ALEN];
        bool assoc;
        bool bcn_en;
        u16 aid;
};

/* Marker value stored in hwsim_vif_priv::magic and verified by hwsim_check_magic() */
#define HWSIM_VIF_MAGIC        0x69537748

/*
 * Verify that @vif's private area carries HWSIM_VIF_MAGIC. On mismatch a
 * WARN is emitted that reports the vif pointer, the magic value found, the
 * interface address and the type/p2p fields. Nothing is returned.
 */
static inline void hwsim_check_magic(struct ieee80211_vif *vif)
{
        struct hwsim_vif_priv *vp;

        vp = (struct hwsim_vif_priv *)vif->drv_priv;
        WARN(vp->magic != HWSIM_VIF_MAGIC,
             "Invalid VIF (%p) magic %#x, %pM, %d/%d\n",
             vif, vp->magic, vif->addr, vif->type, vif->p2p);
}

/* Stamp @vif's private area with HWSIM_VIF_MAGIC. */
static inline void hwsim_set_magic(struct ieee80211_vif *vif)
{
        ((struct hwsim_vif_priv *)vif->drv_priv)->magic = HWSIM_VIF_MAGIC;
}

/* Reset the magic in @vif's private area to 0 so hwsim_check_magic() warns. */
static inline void hwsim_clear_magic(struct ieee80211_vif *vif)
{
        ((struct hwsim_vif_priv *)vif->drv_priv)->magic = 0;
}

/*
 * Driver-private per-station data. The hwsim_*_sta_magic() helpers access
 * it by casting ieee80211_sta::drv_priv to this type.
 */
struct hwsim_sta_priv {
        /* HWSIM_STA_MAGIC after hwsim_set_sta_magic(), 0 after hwsim_clear_sta_magic() */
        u32 magic;
        /* the fields below are used outside this part of the file */
        unsigned int last_link;
        u16 active_links_rx;
};

/* Marker value stored in hwsim_sta_priv::magic and verified by hwsim_check_sta_magic() */
#define HWSIM_STA_MAGIC        0x6d537749

/* Trigger WARN_ON() unless @sta's private area carries HWSIM_STA_MAGIC. */
static inline void hwsim_check_sta_magic(struct ieee80211_sta *sta)
{
        struct hwsim_sta_priv *sp;

        sp = (struct hwsim_sta_priv *)sta->drv_priv;
        WARN_ON(sp->magic != HWSIM_STA_MAGIC);
}

/* Stamp @sta's private area with HWSIM_STA_MAGIC. */
static inline void hwsim_set_sta_magic(struct ieee80211_sta *sta)
{
        ((struct hwsim_sta_priv *)sta->drv_priv)->magic = HWSIM_STA_MAGIC;
}

/* Reset the magic in @sta's private area to 0. */
static inline void hwsim_clear_sta_magic(struct ieee80211_sta *sta)
{
        ((struct hwsim_sta_priv *)sta->drv_priv)->magic = 0;
}

/*
 * Driver-private per-channel-context data. The hwsim_*_chanctx_magic()
 * helpers access it by casting ieee80211_chanctx_conf::drv_priv to this
 * type.
 */
struct hwsim_chanctx_priv {
        /* HWSIM_CHANCTX_MAGIC after hwsim_set_chanctx_magic(), 0 after clear */
        u32 magic;
};

/* Marker value stored in hwsim_chanctx_priv::magic */
#define HWSIM_CHANCTX_MAGIC 0x6d53774a

/* Trigger WARN_ON() unless @c's private area carries HWSIM_CHANCTX_MAGIC. */
static inline void hwsim_check_chanctx_magic(struct ieee80211_chanctx_conf *c)
{
        struct hwsim_chanctx_priv *cp;

        cp = (struct hwsim_chanctx_priv *)c->drv_priv;
        WARN_ON(cp->magic != HWSIM_CHANCTX_MAGIC);
}

/* Stamp @c's private area with HWSIM_CHANCTX_MAGIC. */
static inline void hwsim_set_chanctx_magic(struct ieee80211_chanctx_conf *c)
{
        ((struct hwsim_chanctx_priv *)c->drv_priv)->magic = HWSIM_CHANCTX_MAGIC;
}

/* Reset the magic in @c's private area to 0. */
static inline void hwsim_clear_chanctx_magic(struct ieee80211_chanctx_conf *c)
{
        ((struct hwsim_chanctx_priv *)c->drv_priv)->magic = 0;
}

/* Key passed to net_generic() to look up this module's struct hwsim_net */
static unsigned int hwsim_net_id;

/* ID allocator used by hwsim_net_set_netgroup() */
static DEFINE_IDA(hwsim_netgroup_ida);

/* State kept per struct net, retrieved through net_generic(net, hwsim_net_id) */
struct hwsim_net {
        /* result of ida_alloc() on hwsim_netgroup_ida; negative if that failed */
        int netgroup;
        /* port ID stored by hwsim_net_set_wmediumd() */
        u32 wmediumd;
};

static inline int hwsim_net_get_netgroup(struct net *net)
{
        struct hwsim_net *hwsim_net = net_generic(net, hwsim_net_id);

        return hwsim_net->netgroup;
}

/*
 * Allocate a netgroup ID from hwsim_netgroup_ida and record it for @net.
 *
 * Returns 0 on success. If ida_alloc() fails, the negative value is still
 * stored in the netgroup field and -ENOMEM is returned.
 */
static inline int hwsim_net_set_netgroup(struct net *net)
{
        struct hwsim_net *hn = net_generic(net, hwsim_net_id);
        int id = ida_alloc(&hwsim_netgroup_ida, GFP_KERNEL);

        hn->netgroup = id;
        if (id < 0)
                return -ENOMEM;

        return 0;
}

static inline u32 hwsim_net_get_wmediumd(struct net *net)
{
        struct hwsim_net *hwsim_net = net_generic(net, hwsim_net_id);

        return hwsim_net->wmediumd;
}

/* Record @portid as the wmediumd port ID for @net. */
static inline void hwsim_net_set_wmediumd(struct net *net, u32 portid)
{
        struct hwsim_net *hn = net_generic(net, hwsim_net_id);

        hn->wmediumd = portid;
}

/* Device class pointer; it is assigned outside this part of the file */
static struct class *hwsim_class;

static struct net_device *hwsim_mon; /* global monitor netdev */

/*
 * Initializer helpers for struct ieee80211_channel table entries, one per
 * band. Each sets the band, uses @_freq as the center frequency and also
 * stores @_freq as hw_value.
 */
#define CHAN2G(_freq)  { \
        .band = NL80211_BAND_2GHZ, \
        .center_freq = (_freq), \
        .hw_value = (_freq), \
}

#define CHAN5G(_freq) { \
        .band = NL80211_BAND_5GHZ, \
        .center_freq = (_freq), \
        .hw_value = (_freq), \
}

#define CHAN6G(_freq) { \
        .band = NL80211_BAND_6GHZ, \
        .center_freq = (_freq), \
        .hw_value = (_freq), \
}

/* Template table of the 14 channels in the 2.4 GHz band (center frequencies in MHz) */
static const struct ieee80211_channel hwsim_channels_2ghz[] = {
        CHAN2G(2412), /* Channel 1 */
        CHAN2G(2417), /* Channel 2 */
        CHAN2G(2422), /* Channel 3 */
        CHAN2G(2427), /* Channel 4 */
        CHAN2G(2432), /* Channel 5 */
        CHAN2G(2437), /* Channel 6 */
        CHAN2G(2442), /* Channel 7 */
        CHAN2G(2447), /* Channel 8 */
        CHAN2G(2452), /* Channel 9 */
        CHAN2G(2457), /* Channel 10 */
        CHAN2G(2462), /* Channel 11 */
        CHAN2G(2467), /* Channel 12 */
        CHAN2G(2472), /* Channel 13 */
        CHAN2G(2484), /* Channel 14 */
};

/*
 * Template table of 5 GHz band channels (center frequencies in MHz),
 * from channel 36 (5180) up to channel 185 (5925).
 */
static const struct ieee80211_channel hwsim_channels_5ghz[] = {
        CHAN5G(5180), /* Channel 36 */
        CHAN5G(5200), /* Channel 40 */
        CHAN5G(5220), /* Channel 44 */
        CHAN5G(5240), /* Channel 48 */

        CHAN5G(5260), /* Channel 52 */
        CHAN5G(5280), /* Channel 56 */
        CHAN5G(5300), /* Channel 60 */
        CHAN5G(5320), /* Channel 64 */

        CHAN5G(5500), /* Channel 100 */
        CHAN5G(5520), /* Channel 104 */
        CHAN5G(5540), /* Channel 108 */
        CHAN5G(5560), /* Channel 112 */
        CHAN5G(5580), /* Channel 116 */
        CHAN5G(5600), /* Channel 120 */
        CHAN5G(5620), /* Channel 124 */
        CHAN5G(5640), /* Channel 128 */
        CHAN5G(5660), /* Channel 132 */
        CHAN5G(5680), /* Channel 136 */
        CHAN5G(5700), /* Channel 140 */

        CHAN5G(5745), /* Channel 149 */
        CHAN5G(5765), /* Channel 153 */
        CHAN5G(5785), /* Channel 157 */
        CHAN5G(5805), /* Channel 161 */
        CHAN5G(5825), /* Channel 165 */
        CHAN5G(5845), /* Channel 169 */

        CHAN5G(5855), /* Channel 171 */
        CHAN5G(5860), /* Channel 172 */
        CHAN5G(5865), /* Channel 173 */
        CHAN5G(5870), /* Channel 174 */

        CHAN5G(5875), /* Channel 175 */
        CHAN5G(5880), /* Channel 176 */
        CHAN5G(5885), /* Channel 177 */
        CHAN5G(5890), /* Channel 178 */
        CHAN5G(5895), /* Channel 179 */
        CHAN5G(5900), /* Channel 180 */
        CHAN5G(5905), /* Channel 181 */

        CHAN5G(5910), /* Channel 182 */
        CHAN5G(5915), /* Channel 183 */
        CHAN5G(5920), /* Channel 184 */
        CHAN5G(5925), /* Channel 185 */
};

/*
 * Template table of 6 GHz band channels (center frequencies in MHz), in
 * 20 MHz steps from 5955 to 7115. The channel numbers in the comments
 * follow (freq - 5950) / 5.
 */
static const struct ieee80211_channel hwsim_channels_6ghz[] = {
        CHAN6G(5955), /* Channel 1 */
        CHAN6G(5975), /* Channel 5 */
        CHAN6G(5995), /* Channel 9 */
        CHAN6G(6015), /* Channel 13 */
        CHAN6G(6035), /* Channel 17 */
        CHAN6G(6055), /* Channel 21 */
        CHAN6G(6075), /* Channel 25 */
        CHAN6G(6095), /* Channel 29 */
        CHAN6G(6115), /* Channel 33 */
        CHAN6G(6135), /* Channel 37 */
        CHAN6G(6155), /* Channel 41 */
        CHAN6G(6175), /* Channel 45 */
        CHAN6G(6195), /* Channel 49 */
        CHAN6G(6215), /* Channel 53 */
        CHAN6G(6235), /* Channel 57 */
        CHAN6G(6255), /* Channel 61 */
        CHAN6G(6275), /* Channel 65 */
        CHAN6G(6295), /* Channel 69 */
        CHAN6G(6315), /* Channel 73 */
        CHAN6G(6335), /* Channel 77 */
        CHAN6G(6355), /* Channel 81 */
        CHAN6G(6375), /* Channel 85 */
        CHAN6G(6395), /* Channel 89 */
        CHAN6G(6415), /* Channel 93 */
        CHAN6G(6435), /* Channel 97 */
        CHAN6G(6455), /* Channel 101 */
        CHAN6G(6475), /* Channel 105 */
        CHAN6G(6495), /* Channel 109 */
        CHAN6G(6515), /* Channel 113 */
        CHAN6G(6535), /* Channel 117 */
        CHAN6G(6555), /* Channel 121 */
        CHAN6G(6575), /* Channel 125 */
        CHAN6G(6595), /* Channel 129 */
        CHAN6G(6615), /* Channel 133 */
        CHAN6G(6635), /* Channel 137 */
        CHAN6G(6655), /* Channel 141 */
        CHAN6G(6675), /* Channel 145 */
        CHAN6G(6695), /* Channel 149 */
        CHAN6G(6715), /* Channel 153 */
        CHAN6G(6735), /* Channel 157 */
        CHAN6G(6755), /* Channel 161 */
        CHAN6G(6775), /* Channel 165 */
        CHAN6G(6795), /* Channel 169 */
        CHAN6G(6815), /* Channel 173 */
        CHAN6G(6835), /* Channel 177 */
        CHAN6G(6855), /* Channel 181 */
        CHAN6G(6875), /* Channel 185 */
        CHAN6G(6895), /* Channel 189 */
        CHAN6G(6915), /* Channel 193 */
        CHAN6G(6935), /* Channel 197 */
        CHAN6G(6955), /* Channel 201 */
        CHAN6G(6975), /* Channel 205 */
        CHAN6G(6995), /* Channel 209 */
        CHAN6G(7015), /* Channel 213 */
        CHAN6G(7035), /* Channel 217 */
        CHAN6G(7055), /* Channel 221 */
        CHAN6G(7075), /* Channel 225 */
        CHAN6G(7095), /* Channel 229 */
        CHAN6G(7115), /* Channel 233 */
};

/*
 * S1G channel table. Unlike the other band tables it is not statically
 * initialized; hwsim_init_s1g_channels() fills NUM_S1G_CHANS_US entries.
 */
#define NUM_S1G_CHANS_US 51
static struct ieee80211_channel hwsim_channels_s1g[NUM_S1G_CHANS_US];

/*
 * S1G station capabilities: .cap holds 10 capability bytes and .nss_mcs
 * holds 5 bytes describing the supported MCS/NSS set. The inline comments
 * in .nss_mcs name the bit fields packed into each byte.
 */
static const struct ieee80211_sta_s1g_cap hwsim_s1g_cap = {
        .s1g = true,
        .cap = { S1G_CAP0_SGI_1MHZ | S1G_CAP0_SGI_2MHZ,
                 0,
                 0,
                 S1G_CAP3_MAX_MPDU_LEN,
                 0,
                 S1G_CAP5_AMPDU,
                 0,
                 S1G_CAP7_DUP_1MHZ,
                 S1G_CAP8_TWT_RESPOND | S1G_CAP8_TWT_REQUEST,
                 0},
        .nss_mcs = { 0xfc | 1, /* MCS 7 for 1 SS */
        /* RX Highest Supported Long GI Data Rate 0:7 */
                     0,
        /* RX Highest Supported Long GI Data Rate 0:7 */
        /* TX S1G MCS Map 0:6 */
                     0xfa,
        /* TX S1G MCS Map :7 */
        /* TX Highest Supported Long GI Data Rate 0:6 */
                     0x80,
        /* TX Highest Supported Long GI Data Rate 7:8 */
        /* Rx Single spatial stream and S1G-MCS Map for 1MHz */
        /* Tx Single spatial stream and S1G-MCS Map for 1MHz */
                     0 },
};

/*
 * Populate @chans, which must have room for NUM_S1G_CHANS_US entries, with
 * the S1G channel set.
 *
 * Entry i describes channel number i + 1 at 902000 + (i + 1) * 500 kHz.
 * The frequency is split into whole MHz (center_freq) and the remaining
 * kHz (freq_offset); hw_value is the channel number.
 */
static void hwsim_init_s1g_channels(struct ieee80211_channel *chans)
{
        int idx;

        for (idx = 0; idx < NUM_S1G_CHANS_US; idx++) {
                struct ieee80211_channel *chan = &chans[idx];
                int chan_num = idx + 1;
                int freq_khz = 902000 + chan_num * 500;

                chan->band = NL80211_BAND_S1GHZ;
                chan->center_freq = KHZ_TO_MHZ(freq_khz);
                chan->freq_offset = freq_khz % 1000;
                chan->hw_value = chan_num;
        }
}

/*
 * Template table of legacy bitrates. Values are presumably in cfg80211's
 * units of 100 kbit/s (10 = 1 Mbit/s ... 540 = 54 Mbit/s) -- confirm
 * against struct ieee80211_rate. Entries 1-3 carry the short-preamble
 * flag.
 */
static const struct ieee80211_rate hwsim_rates[] = {
        { .bitrate = 10 },
        { .bitrate = 20, .flags = IEEE80211_RATE_SHORT_PREAMBLE },
        { .bitrate = 55, .flags = IEEE80211_RATE_SHORT_PREAMBLE },
        { .bitrate = 110, .flags = IEEE80211_RATE_SHORT_PREAMBLE },
        { .bitrate = 60 },
        { .bitrate = 90 },
        { .bitrate = 120 },
        { .bitrate = 180 },
        { .bitrate = 240 },
        { .bitrate = 360 },
        { .bitrate = 480 },
        { .bitrate = 540 }
};

/* Default receive signal strength; presumably in dBm -- TODO confirm at the users */
#define DEFAULT_RX_RSSI -50

/*
 * Template list of cipher suite selectors (WEP, TKIP, CCMP/GCMP and the
 * CMAC/GMAC BIP variants). struct mac80211_hwsim_data sizes its own
 * ciphers[] array from this table.
 */
static const u32 hwsim_ciphers[] = {
        WLAN_CIPHER_SUITE_WEP40,
        WLAN_CIPHER_SUITE_WEP104,
        WLAN_CIPHER_SUITE_TKIP,
        WLAN_CIPHER_SUITE_CCMP,
        WLAN_CIPHER_SUITE_CCMP_256,
        WLAN_CIPHER_SUITE_GCMP,
        WLAN_CIPHER_SUITE_GCMP_256,
        WLAN_CIPHER_SUITE_AES_CMAC,
        WLAN_CIPHER_SUITE_BIP_CMAC_256,
        WLAN_CIPHER_SUITE_BIP_GMAC_128,
        WLAN_CIPHER_SUITE_BIP_GMAC_256,
};

/*
 * Identifiers for the vendor test command: vendor OUI, the sub-command
 * number, and the attribute IDs carried in the vendor data.
 * NOTE(review): the enum is named "..._subcmds" but its members are
 * attribute IDs (QCA_WLAN_VENDOR_ATTR_*).
 */
#define OUI_QCA 0x001374
#define QCA_NL80211_SUBCMD_TEST 1
enum qca_nl80211_vendor_subcmds {
        QCA_WLAN_VENDOR_ATTR_TEST = 8,
        QCA_WLAN_VENDOR_ATTR_MAX = QCA_WLAN_VENDOR_ATTR_TEST
};

/*
 * Netlink attribute policy for the vendor test command. The only entry is
 * at index QCA_WLAN_VENDOR_ATTR_MAX, which equals QCA_WLAN_VENDOR_ATTR_TEST,
 * and requires a u32.
 */
static const struct nla_policy
hwsim_vendor_test_policy[QCA_WLAN_VENDOR_ATTR_MAX + 1] = {
        [QCA_WLAN_VENDOR_ATTR_MAX] = { .type = NLA_U32 },
};

/*
 * Handler for the QCA test vendor command.
 *
 * Parses @data (@data_len bytes) against hwsim_vendor_test_policy and reads
 * the u32 QCA_WLAN_VENDOR_ATTR_TEST value. It then emits a vendor event
 * (event index 0 in mac80211_hwsim_vendor_events) carrying value + 1, and
 * replies to the command with value + 2.
 *
 * Returns the parse error if parsing fails, -EINVAL if the test attribute
 * is absent, -ENOMEM if the reply skb cannot be allocated, and otherwise
 * the result of cfg80211_vendor_cmd_reply(). A failure to allocate the
 * event skb is not an error; the event is simply skipped.
 *
 * NOTE(review): the return values of both nla_put_u32() calls are ignored.
 */
static int mac80211_hwsim_vendor_cmd_test(struct wiphy *wiphy,
                                          struct wireless_dev *wdev,
                                          const void *data, int data_len)
{
        struct nlattr *attrs[QCA_WLAN_VENDOR_ATTR_MAX + 1];
        struct sk_buff *event, *reply;
        u32 test_val;
        int ret;

        ret = nla_parse_deprecated(attrs, QCA_WLAN_VENDOR_ATTR_MAX, data,
                                   data_len, hwsim_vendor_test_policy, NULL);
        if (ret)
                return ret;

        if (!attrs[QCA_WLAN_VENDOR_ATTR_TEST])
                return -EINVAL;

        test_val = nla_get_u32(attrs[QCA_WLAN_VENDOR_ATTR_TEST]);
        wiphy_dbg(wiphy, "%s: test=%u\n", __func__, test_val);

        /*
         * Emit a vendor event as a test. A real driver would normally send
         * events from some other trigger rather than from a command
         * handler; the command doubles as the trigger here for simplicity.
         * Anything put into the skb ends up inside NL80211_ATTR_VENDOR_DATA.
         */
        event = cfg80211_vendor_event_alloc(wiphy, wdev, 100, 0, GFP_KERNEL);
        if (event) {
                nla_put_u32(event, QCA_WLAN_VENDOR_ATTR_TEST, test_val + 1);

                /* sending the event also closes the nested attribute */
                cfg80211_vendor_event(event, GFP_KERNEL);
        }

        /* Build the command reply; its payload is vendor data as well. */
        reply = cfg80211_vendor_cmd_alloc_reply_skb(wiphy, 10);
        if (!reply)
                return -ENOMEM;

        nla_put_u32(reply, QCA_WLAN_VENDOR_ATTR_TEST, test_val + 2);

        return cfg80211_vendor_cmd_reply(reply);
}

/*
 * Vendor commands offered by hwsim: a single QCA test sub-command handled
 * by mac80211_hwsim_vendor_cmd_test(), flagged as needing a netdev and
 * validated against hwsim_vendor_test_policy.
 */
static struct wiphy_vendor_command mac80211_hwsim_vendor_commands[] = {
        {
                .info = { .vendor_id = OUI_QCA,
                          .subcmd = QCA_NL80211_SUBCMD_TEST },
                .flags = WIPHY_VENDOR_CMD_NEED_NETDEV,
                .doit = mac80211_hwsim_vendor_cmd_test,
                .policy = hwsim_vendor_test_policy,
                .maxattr = QCA_WLAN_VENDOR_ATTR_MAX,
        }
};

/*
 * Advertise support for vendor-specific events. Entry 0 is the event that
 * mac80211_hwsim_vendor_cmd_test() sends (it passes event index 0).
 */
static const struct nl80211_vendor_cmd_info mac80211_hwsim_vendor_events[] = {
        { .vendor_id = OUI_QCA, .subcmd = 1 },
};

/*
 * Global radio bookkeeping: a spinlock, a list head, a hash table, an
 * index counter (starts at 0) and a generation counter (starts at 1).
 * Their users are outside this part of the file; presumably
 * hwsim_radio_lock protects the list and table -- TODO confirm.
 */
static DEFINE_SPINLOCK(hwsim_radio_lock);
static LIST_HEAD(hwsim_radios);
static struct rhashtable hwsim_radios_rht;
static int hwsim_radio_idx;
static int hwsim_radios_generation = 1;

/* Platform driver object; only the driver name is filled in here. */
static struct platform_driver mac80211_hwsim_driver = {
        .driver = {
                .name = "mac80211_hwsim",
        },
};

/*
 * Per-link state of a radio: the link id, the beacon interval and the
 * high-resolution timer associated with beaconing on that link.
 */
struct mac80211_hwsim_link_data {
        u32 link_id;
        u64 beacon_int        /* beacon interval in us */;
        struct hrtimer beacon_timer;
};

/*
 * Per-radio driver state. It is reached through hw->priv by the mac80211
 * callbacks in this file and through the opaque data pointer by the debugfs
 * handlers.
 */
struct mac80211_hwsim_data {
        struct list_head list;
        /* hash table linkage, see hwsim_rht_params */
        struct rhash_head rht;
        struct ieee80211_hw *hw;
        struct device *dev;
        struct ieee80211_supported_band bands[NUM_NL80211_BANDS];
        /* per-radio copies sized after the file-level channel/rate tables */
        struct ieee80211_channel channels_2ghz[ARRAY_SIZE(hwsim_channels_2ghz)];
        struct ieee80211_channel channels_5ghz[ARRAY_SIZE(hwsim_channels_5ghz)];
        struct ieee80211_channel channels_6ghz[ARRAY_SIZE(hwsim_channels_6ghz)];
        struct ieee80211_channel channels_s1g[ARRAY_SIZE(hwsim_channels_s1g)];
        struct ieee80211_rate rates[ARRAY_SIZE(hwsim_rates)];
        struct ieee80211_iface_combination if_combination;
        struct ieee80211_iface_limit if_limits[3];
        int n_if_limits;

        u32 ciphers[ARRAY_SIZE(hwsim_ciphers)];

        /*
         * addresses[1] is the hash key of the radio (hwsim_rht_params) and
         * the address reported as HWSIM_ATTR_ADDR_TRANSMITTER over netlink.
         */
        struct mac_address addresses[2];
        int channels, idx;
        bool use_chanctx;
        bool destroy_on_close;
        u32 portid;
        char alpha2[2];
        const struct ieee80211_regdomain *regd;

        struct ieee80211_channel *tmp_chan;
        struct ieee80211_channel *roc_chan;
        u32 roc_duration;
        struct delayed_work roc_start;
        struct delayed_work roc_done;
        struct delayed_work hw_scan;
        struct cfg80211_scan_request *hw_scan_request;
        struct ieee80211_vif *hw_scan_vif;
        int scan_chan_idx;
        /* address matched as "own" while ->scanning is set */
        u8 scan_addr[ETH_ALEN];
        /* one slot per 2 GHz, 5 GHz and 6 GHz channel (S1G is not counted) */
        struct {
                struct ieee80211_channel *channel;
                unsigned long next_start, start, end;
        } survey_data[ARRAY_SIZE(hwsim_channels_2ghz) +
                      ARRAY_SIZE(hwsim_channels_5ghz) +
                      ARRAY_SIZE(hwsim_channels_6ghz)];

        struct ieee80211_channel *channel;
        enum nl80211_chan_width bw;
        unsigned int rx_filter;
        bool started, idle, scanning;
        struct mutex mutex;
        /* power-save mode; read and written through the hwsim_fops_ps attribute */
        enum ps_mode {
                PS_DISABLED, PS_ENABLED, PS_AUTO_POLL, PS_MANUAL_POLL
        } ps;
        /* consumed by hwsim_ps_rx_ok() in PS_MANUAL_POLL mode */
        bool ps_poll_pending;
        struct dentry *debugfs;

        atomic_t pending_cookie;
        struct sk_buff_head pending;        /* packets pending */
        /*
         * Only radios in the same group can communicate together (the
         * channel has to match too). Each bit represents a group. A
         * radio can be in more than one group.
         */
        u64 group;

        /* group shared by radios created in the same netns */
        int netgroup;
        /* wmediumd portid responsible for netgroup of this radio */
        u32 wmediumd;

        /* difference between this hw's clock and the real clock, in usecs */
        s64 tsf_offset;
        s64 bcn_delta;
        /* absolute beacon transmission time. Used to cover up "tx" delay. */
        u64 abs_bcn_ts;

        /* Stats */
        u64 tx_pkts;
        u64 rx_pkts;
        u64 tx_bytes;
        u64 rx_bytes;
        u64 tx_dropped;
        u64 tx_failed;

        /* RSSI in rx status of the receiver */
        int rx_rssi;

        /* only used when pmsr capability is supplied */
        struct cfg80211_pmsr_capabilities pmsr_capa;
        struct cfg80211_pmsr_request *pmsr_request;
        struct wireless_dev *pmsr_request_wdev;

        /* per-link state, indexed by link number */
        struct mac80211_hwsim_link_data link_data[IEEE80211_MLD_MAX_NUM_LINKS];
};

/*
 * Hash table parameters for hwsim_radios_rht: entries are
 * struct mac80211_hwsim_data objects keyed by the ETH_ALEN bytes of
 * addresses[1] and chained through their ->rht member.
 */
static const struct rhashtable_params hwsim_rht_params = {
        .nelem_hint = 2,
        .automatic_shrinking = true,
        .key_len = ETH_ALEN,
        .key_offset = offsetof(struct mac80211_hwsim_data, addresses[1]),
        .head_offset = offsetof(struct mac80211_hwsim_data, rht),
};

/*
 * Radiotap header prepended to frames copied to the monitor device by
 * mac80211_hwsim_monitor_rx(). The fields after @hdr match the present bits
 * set there: TSFT, FLAGS, RATE and CHANNEL (frequency plus channel flags).
 */
struct hwsim_radiotap_hdr {
        struct ieee80211_radiotap_header hdr;
        __le64 rt_tsft;
        u8 rt_flags;
        u8 rt_rate;
        __le16 rt_channel;
        __le16 rt_chbitmask;
} __packed;

/*
 * Radiotap header used by mac80211_hwsim_monitor_ack() for synthesized ACK
 * frames. Only FLAGS and CHANNEL are marked present; @pad is written as 0.
 */
struct hwsim_radiotap_ack_hdr {
        struct ieee80211_radiotap_header hdr;
        u8 rt_flags;
        u8 pad;
        __le16 rt_channel;
        __le16 rt_chbitmask;
} __packed;

/*
 * Look up a radio in hwsim_radios_rht by MAC address.
 *
 * @addr: ETH_ALEN-byte key, compared against the table's key field
 *        (see hwsim_rht_params).
 *
 * Returns the result of rhashtable_lookup_fast() for that key.
 */
static struct mac80211_hwsim_data *get_hwsim_data_ref_from_addr(const u8 *addr)
{
        struct mac80211_hwsim_data *radio;

        radio = rhashtable_lookup_fast(&hwsim_radios_rht, addr,
                                       hwsim_rht_params);

        return radio;
}

/* MAC80211_HWSIM netlink family (declared here for the code further down) */
static struct genl_family hwsim_genl_family;

/* Indices into hwsim_mcgrps[] */
enum hwsim_multicast_groups {
        HWSIM_MCGRP_CONFIG,
};

/* Multicast groups of the family; the only group is named "config" */
static const struct genl_multicast_group hwsim_mcgrps[] = {
        [HWSIM_MCGRP_CONFIG] = { .name = "config", },
};

/* MAC80211_HWSIM netlink policy */

/*
 * Policy for the HWSIM_RATE_INFO_ATTR_* attributes. It is used as the nested
 * policy for the TX_RATE/RX_RATE attributes of hwsim_ftm_result_policy.
 * All attributes are u8 except LEGACY, which is u16.
 */
static const struct nla_policy
hwsim_rate_info_policy[HWSIM_RATE_INFO_ATTR_MAX + 1] = {
        [HWSIM_RATE_INFO_ATTR_FLAGS] = { .type = NLA_U8 },
        [HWSIM_RATE_INFO_ATTR_MCS] = { .type = NLA_U8 },
        [HWSIM_RATE_INFO_ATTR_LEGACY] = { .type = NLA_U16 },
        [HWSIM_RATE_INFO_ATTR_NSS] = { .type = NLA_U8 },
        [HWSIM_RATE_INFO_ATTR_BW] = { .type = NLA_U8 },
        [HWSIM_RATE_INFO_ATTR_HE_GI] = { .type = NLA_U8 },
        [HWSIM_RATE_INFO_ATTR_HE_DCM] = { .type = NLA_U8 },
        [HWSIM_RATE_INFO_ATTR_HE_RU_ALLOC] = { .type = NLA_U8 },
        [HWSIM_RATE_INFO_ATTR_N_BOUNDED_CH] = { .type = NLA_U8 },
        [HWSIM_RATE_INFO_ATTR_EHT_GI] = { .type = NLA_U8 },
        [HWSIM_RATE_INFO_ATTR_EHT_RU_ALLOC] = { .type = NLA_U8 },
};

/*
 * Policy for the NL80211_PMSR_FTM_RESP_ATTR_* attributes of an FTM result.
 * It is the nested policy for NL80211_PMSR_TYPE_FTM in
 * hwsim_pmsr_resp_type_policy; the TX/RX rate attributes nest
 * hwsim_rate_info_policy.
 */
static const struct nla_policy
hwsim_ftm_result_policy[NL80211_PMSR_FTM_RESP_ATTR_MAX + 1] = {
        [NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON] = { .type = NLA_U32 },
        [NL80211_PMSR_FTM_RESP_ATTR_BURST_INDEX] = { .type = NLA_U16 },
        [NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_ATTEMPTS] = { .type = NLA_U32 },
        [NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_SUCCESSES] = { .type = NLA_U32 },
        [NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME] = { .type = NLA_U8 },
        [NL80211_PMSR_FTM_RESP_ATTR_NUM_BURSTS_EXP] = { .type = NLA_U8 },
        [NL80211_PMSR_FTM_RESP_ATTR_BURST_DURATION] = { .type = NLA_U8 },
        [NL80211_PMSR_FTM_RESP_ATTR_FTMS_PER_BURST] = { .type = NLA_U8 },
        [NL80211_PMSR_FTM_RESP_ATTR_RSSI_AVG] = { .type = NLA_U32 },
        [NL80211_PMSR_FTM_RESP_ATTR_RSSI_SPREAD] = { .type = NLA_U32 },
        [NL80211_PMSR_FTM_RESP_ATTR_TX_RATE] = NLA_POLICY_NESTED(hwsim_rate_info_policy),
        [NL80211_PMSR_FTM_RESP_ATTR_RX_RATE] = NLA_POLICY_NESTED(hwsim_rate_info_policy),
        [NL80211_PMSR_FTM_RESP_ATTR_RTT_AVG] = { .type = NLA_U64 },
        [NL80211_PMSR_FTM_RESP_ATTR_RTT_VARIANCE] = { .type = NLA_U64 },
        [NL80211_PMSR_FTM_RESP_ATTR_RTT_SPREAD] = { .type = NLA_U64 },
        [NL80211_PMSR_FTM_RESP_ATTR_DIST_AVG] = { .type = NLA_U64 },
        [NL80211_PMSR_FTM_RESP_ATTR_DIST_VARIANCE] = { .type = NLA_U64 },
        [NL80211_PMSR_FTM_RESP_ATTR_DIST_SPREAD] = { .type = NLA_U64 },
        [NL80211_PMSR_FTM_RESP_ATTR_LCI] = { .type = NLA_STRING },
        [NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC] = { .type = NLA_STRING },
};

/*
 * Policy for per-measurement-type result data. FTM is the only type with
 * an entry, and it nests hwsim_ftm_result_policy.
 */
static const struct nla_policy
hwsim_pmsr_resp_type_policy[NL80211_PMSR_TYPE_MAX + 1] = {
        [NL80211_PMSR_TYPE_FTM] = NLA_POLICY_NESTED(hwsim_ftm_result_policy),
};

/*
 * Policy for the NL80211_PMSR_RESP_ATTR_* attributes of one peer response;
 * the DATA attribute nests hwsim_pmsr_resp_type_policy.
 */
static const struct nla_policy
hwsim_pmsr_resp_policy[NL80211_PMSR_RESP_ATTR_MAX + 1] = {
        [NL80211_PMSR_RESP_ATTR_STATUS] = { .type = NLA_U32 },
        [NL80211_PMSR_RESP_ATTR_HOST_TIME] = { .type = NLA_U64 },
        [NL80211_PMSR_RESP_ATTR_AP_TSF] = { .type = NLA_U64 },
        [NL80211_PMSR_RESP_ATTR_FINAL] = { .type = NLA_FLAG },
        [NL80211_PMSR_RESP_ATTR_DATA] = NLA_POLICY_NESTED(hwsim_pmsr_resp_type_policy),
};

/*
 * Policy for one peer entry of a measurement result: the peer address and a
 * nested response are accepted, the CHAN and REQ attributes are rejected.
 */
static const struct nla_policy
hwsim_pmsr_peer_result_policy[NL80211_PMSR_PEER_ATTR_MAX + 1] = {
        [NL80211_PMSR_PEER_ATTR_ADDR] = NLA_POLICY_ETH_ADDR_COMPAT,
        [NL80211_PMSR_PEER_ATTR_CHAN] = { .type = NLA_REJECT },
        [NL80211_PMSR_PEER_ATTR_REQ] = { .type = NLA_REJECT },
        [NL80211_PMSR_PEER_ATTR_RESP] = NLA_POLICY_NESTED(hwsim_pmsr_resp_policy),
};

/*
 * Policy for HWSIM_ATTR_PMSR_RESULT (see hwsim_genl_policy): only the PEERS
 * array is accepted; the capability-style attributes are rejected.
 */
static const struct nla_policy
hwsim_pmsr_peers_result_policy[NL80211_PMSR_ATTR_MAX + 1] = {
        [NL80211_PMSR_ATTR_MAX_PEERS] = { .type = NLA_REJECT },
        [NL80211_PMSR_ATTR_REPORT_AP_TSF] = { .type = NLA_REJECT },
        [NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR] = { .type = NLA_REJECT },
        [NL80211_PMSR_ATTR_TYPE_CAPA] = { .type = NLA_REJECT },
        [NL80211_PMSR_ATTR_PEERS] = NLA_POLICY_NESTED_ARRAY(hwsim_pmsr_peer_result_policy),
};

/*
 * Policy for the NL80211_PMSR_FTM_CAPA_ATTR_* capability attributes.
 * MAX_BURSTS_EXPONENT is limited to 15 and MAX_FTMS_PER_BURST to 31.
 */
static const struct nla_policy
hwsim_ftm_capa_policy[NL80211_PMSR_FTM_CAPA_ATTR_MAX + 1] = {
        [NL80211_PMSR_FTM_CAPA_ATTR_ASAP] = { .type = NLA_FLAG },
        [NL80211_PMSR_FTM_CAPA_ATTR_NON_ASAP] = { .type = NLA_FLAG },
        [NL80211_PMSR_FTM_CAPA_ATTR_REQ_LCI] = { .type = NLA_FLAG },
        [NL80211_PMSR_FTM_CAPA_ATTR_REQ_CIVICLOC] = { .type = NLA_FLAG },
        [NL80211_PMSR_FTM_CAPA_ATTR_PREAMBLES] = { .type = NLA_U32 },
        [NL80211_PMSR_FTM_CAPA_ATTR_BANDWIDTHS] = { .type = NLA_U32 },
        [NL80211_PMSR_FTM_CAPA_ATTR_MAX_BURSTS_EXPONENT] = NLA_POLICY_MAX(NLA_U8, 15),
        [NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST] = NLA_POLICY_MAX(NLA_U8, 31),
        [NL80211_PMSR_FTM_CAPA_ATTR_TRIGGER_BASED] = { .type = NLA_FLAG },
        [NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED] = { .type = NLA_FLAG },
};

/*
 * Policy for per-measurement-type capabilities. FTM is the only type with
 * an entry, and it nests hwsim_ftm_capa_policy.
 */
static const struct nla_policy
hwsim_pmsr_capa_type_policy[NL80211_PMSR_TYPE_MAX + 1] = {
        [NL80211_PMSR_TYPE_FTM] = NLA_POLICY_NESTED(hwsim_ftm_capa_policy),
};

/*
 * Policy for HWSIM_ATTR_PMSR_SUPPORT (see hwsim_genl_policy): peer
 * measurement capabilities. The PEERS attribute is rejected here.
 */
static const struct nla_policy
hwsim_pmsr_capa_policy[NL80211_PMSR_ATTR_MAX + 1] = {
        [NL80211_PMSR_ATTR_MAX_PEERS] = { .type = NLA_U32 },
        [NL80211_PMSR_ATTR_REPORT_AP_TSF] = { .type = NLA_FLAG },
        [NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR] = { .type = NLA_FLAG },
        [NL80211_PMSR_ATTR_TYPE_CAPA] = NLA_POLICY_NESTED(hwsim_pmsr_capa_type_policy),
        [NL80211_PMSR_ATTR_PEERS] = { .type = NLA_REJECT }, // only for request.
};

/*
 * Top-level attribute policy of the hwsim generic netlink family.
 * Length limits visible here: FRAME is capped at IEEE80211_MAX_DATA_LEN,
 * TX_INFO at IEEE80211_TX_MAX_RATES rate entries and REG_HINT_ALPHA2 at two
 * characters. The PMSR attributes nest the capability and result policies.
 */
static const struct nla_policy hwsim_genl_policy[HWSIM_ATTR_MAX + 1] = {
        [HWSIM_ATTR_ADDR_RECEIVER] = NLA_POLICY_ETH_ADDR_COMPAT,
        [HWSIM_ATTR_ADDR_TRANSMITTER] = NLA_POLICY_ETH_ADDR_COMPAT,
        [HWSIM_ATTR_FRAME] = { .type = NLA_BINARY,
                               .len = IEEE80211_MAX_DATA_LEN },
        [HWSIM_ATTR_FLAGS] = { .type = NLA_U32 },
        [HWSIM_ATTR_RX_RATE] = { .type = NLA_U32 },
        [HWSIM_ATTR_SIGNAL] = { .type = NLA_U32 },
        [HWSIM_ATTR_TX_INFO] = { .type = NLA_BINARY,
                                 .len = IEEE80211_TX_MAX_RATES *
                                        sizeof(struct hwsim_tx_rate)},
        [HWSIM_ATTR_COOKIE] = { .type = NLA_U64 },
        [HWSIM_ATTR_CHANNELS] = { .type = NLA_U32 },
        [HWSIM_ATTR_RADIO_ID] = { .type = NLA_U32 },
        [HWSIM_ATTR_REG_HINT_ALPHA2] = { .type = NLA_STRING, .len = 2 },
        [HWSIM_ATTR_REG_CUSTOM_REG] = { .type = NLA_U32 },
        [HWSIM_ATTR_REG_STRICT_REG] = { .type = NLA_FLAG },
        [HWSIM_ATTR_SUPPORT_P2P_DEVICE] = { .type = NLA_FLAG },
        [HWSIM_ATTR_USE_CHANCTX] = { .type = NLA_FLAG },
        [HWSIM_ATTR_DESTROY_RADIO_ON_CLOSE] = { .type = NLA_FLAG },
        [HWSIM_ATTR_RADIO_NAME] = { .type = NLA_STRING },
        [HWSIM_ATTR_NO_VIF] = { .type = NLA_FLAG },
        [HWSIM_ATTR_FREQ] = { .type = NLA_U32 },
        [HWSIM_ATTR_TX_INFO_FLAGS] = { .type = NLA_BINARY },
        [HWSIM_ATTR_PERM_ADDR] = NLA_POLICY_ETH_ADDR_COMPAT,
        [HWSIM_ATTR_IFTYPE_SUPPORT] = { .type = NLA_U32 },
        [HWSIM_ATTR_CIPHER_SUPPORT] = { .type = NLA_BINARY },
        [HWSIM_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG },
        [HWSIM_ATTR_PMSR_SUPPORT] = NLA_POLICY_NESTED(hwsim_pmsr_capa_policy),
        [HWSIM_ATTR_PMSR_RESULT] = NLA_POLICY_NESTED(hwsim_pmsr_peers_result_policy),
};

#if IS_REACHABLE(CONFIG_VIRTIO)

/* MAC80211_HWSIM virtio queues */
static struct virtqueue *hwsim_vqs[HWSIM_NUM_VQS];
/* checked under hwsim_virtio_lock before queueing in hwsim_tx_virtio() */
static bool hwsim_virtio_enabled;
static DEFINE_SPINLOCK(hwsim_virtio_lock);

/* RX work item; the handler is defined further down in the file */
static void hwsim_virtio_rx_work(struct work_struct *work);
static DECLARE_WORK(hwsim_virtio_rx, hwsim_virtio_rx_work);

/*
 * Queue a netlink message on the virtio TX queue.
 *
 * @data: radio the message belongs to (not used by this function)
 * @skb:  message to send; it is freed here on every failure path
 *
 * The whole buffer from skb->head up to the end offset is handed to the
 * queue as a single scatterlist entry, with @skb as the queue token.
 *
 * Returns 0 on success, -ENODEV when virtio is not enabled, or the error
 * from virtqueue_add_outbuf().
 */
static int hwsim_tx_virtio(struct mac80211_hwsim_data *data,
                           struct sk_buff *skb)
{
        struct scatterlist sg[1];
        unsigned long irq_flags;
        int ret = -ENODEV;

        spin_lock_irqsave(&hwsim_virtio_lock, irq_flags);
        if (hwsim_virtio_enabled) {
                sg_init_one(sg, skb->head, skb_end_offset(skb));
                ret = virtqueue_add_outbuf(hwsim_vqs[HWSIM_VQ_TX], sg, 1, skb,
                                           GFP_ATOMIC);
                /* only notify the other side once the buffer is queued */
                if (!ret)
                        virtqueue_kick(hwsim_vqs[HWSIM_VQ_TX]);
        }
        spin_unlock_irqrestore(&hwsim_virtio_lock, irq_flags);

        /* on failure the message was not queued and is still ours to free */
        if (ret)
                nlmsg_free(skb);

        return ret;
}
#else
/*
 * Without virtio there is no definition of hwsim_tx_virtio(); this bare
 * declaration will
 * cause a linker error if this ends up being needed
 * while hwsim_virtio_enabled is the constant false so that guarded calls
 * can be compiled out.
 */
extern int hwsim_tx_virtio(struct mac80211_hwsim_data *data,
                           struct sk_buff *skb);
#define hwsim_virtio_enabled false
#endif

/*
 * Translate an nl80211 channel width into its width in MHz.
 *
 * 20_NOHT maps to 20 and 80P80 maps to 160. A value that is not one of the
 * enumerators handled below yields INT_MAX.
 */
static int hwsim_get_chanwidth(enum nl80211_chan_width bw)
{
        int mhz = INT_MAX;

        /* no default label: keeps the compiler's missing-enumerator check */
        switch (bw) {
        case NL80211_CHAN_WIDTH_1:
                mhz = 1;
                break;
        case NL80211_CHAN_WIDTH_2:
                mhz = 2;
                break;
        case NL80211_CHAN_WIDTH_4:
                mhz = 4;
                break;
        case NL80211_CHAN_WIDTH_5:
                mhz = 5;
                break;
        case NL80211_CHAN_WIDTH_8:
                mhz = 8;
                break;
        case NL80211_CHAN_WIDTH_10:
                mhz = 10;
                break;
        case NL80211_CHAN_WIDTH_16:
                mhz = 16;
                break;
        case NL80211_CHAN_WIDTH_20_NOHT:
        case NL80211_CHAN_WIDTH_20:
                mhz = 20;
                break;
        case NL80211_CHAN_WIDTH_40:
                mhz = 40;
                break;
        case NL80211_CHAN_WIDTH_80:
                mhz = 80;
                break;
        case NL80211_CHAN_WIDTH_80P80:
        case NL80211_CHAN_WIDTH_160:
                mhz = 160;
                break;
        case NL80211_CHAN_WIDTH_320:
                mhz = 320;
                break;
        }

        return mhz;
}

static void mac80211_hwsim_tx_frame(struct ieee80211_hw *hw,
                                    struct sk_buff *skb,
                                    struct ieee80211_channel *chan);

/* sysfs attributes */
/*
 * Interface iterator callback: build a PS-Poll control frame for @vif and
 * transmit it on the channel of the vif's channel context.
 *
 * @dat: the radio (struct mac80211_hwsim_data *)
 * @mac: address copied into the frame's TA field
 * @vif: interface to poll for; nothing is sent unless it is associated
 *
 * An skb allocation failure is silently ignored.
 */
static void hwsim_send_ps_poll(void *dat, u8 *mac, struct ieee80211_vif *vif)
{
        struct hwsim_vif_priv *vif_priv = (void *)vif->drv_priv;
        struct mac80211_hwsim_data *radio = dat;
        struct ieee80211_channel *chan;
        struct ieee80211_pspoll *poll;
        struct sk_buff *frame;

        if (!vif_priv->assoc)
                return;

        wiphy_dbg(radio->hw->wiphy,
                  "%s: send PS-Poll to %pM for aid %d\n",
                  __func__, vif_priv->bssid, vif_priv->aid);

        frame = dev_alloc_skb(sizeof(*poll));
        if (!frame)
                return;

        poll = skb_put(frame, sizeof(*poll));
        memcpy(poll->ta, mac, ETH_ALEN);
        memcpy(poll->bssid, vif_priv->bssid, ETH_ALEN);
        /* the AID is sent with its two top bits set */
        poll->aid = cpu_to_le16(0xc000 | vif_priv->aid);
        poll->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL |
                                          IEEE80211_STYPE_PSPOLL |
                                          IEEE80211_FCTL_PM);

        rcu_read_lock();
        chan = rcu_dereference(vif->bss_conf.chanctx_conf)->def.chan;
        mac80211_hwsim_tx_frame(radio->hw, frame, chan);
        rcu_read_unlock();
}

/*
 * Build a three-address data/nullfunc frame towards the associated BSS and
 * transmit it on the channel of the vif's channel context.
 *
 * @data: radio to transmit from
 * @mac:  address copied into addr2 (the transmitter)
 * @vif:  interface to send for; nothing is sent unless it is associated
 * @ps:   non-zero sets the power-management bit in the frame control field
 *
 * An skb allocation failure is silently ignored.
 */
static void hwsim_send_nullfunc(struct mac80211_hwsim_data *data, u8 *mac,
                                struct ieee80211_vif *vif, int ps)
{
        struct hwsim_vif_priv *vp = (void *)vif->drv_priv;
        struct sk_buff *skb;
        struct ieee80211_hdr *hdr;
        struct ieee80211_tx_info *cb;

        if (!vp->assoc)
                return;

        wiphy_dbg(data->hw->wiphy,
                  "%s: send data::nullfunc to %pM ps=%d\n",
                  __func__, vp->bssid, ps);

        skb = dev_alloc_skb(sizeof(*hdr));
        if (!skb)
                return;
        /* three-address header: everything except the trailing addr4 */
        hdr = skb_put(skb, sizeof(*hdr) - ETH_ALEN);
        hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA |
                                         IEEE80211_STYPE_NULLFUNC |
                                         IEEE80211_FCTL_TODS |
                                         (ps ? IEEE80211_FCTL_PM : 0));
        hdr->duration_id = cpu_to_le16(0);
        memcpy(hdr->addr1, vp->bssid, ETH_ALEN);
        memcpy(hdr->addr2, mac, ETH_ALEN);
        memcpy(hdr->addr3, vp->bssid, ETH_ALEN);
        /*
         * seq_ctrl lies inside the bytes added by skb_put() above and the
         * skb data area is not zeroed on allocation, so set it explicitly
         * rather than sending out whatever was in memory.
         */
        hdr->seq_ctrl = cpu_to_le16(0);

        /* a single attempt at rate index 0; idx -1 ends the rate list */
        cb = IEEE80211_SKB_CB(skb);
        cb->control.rates[0].count = 1;
        cb->control.rates[1].idx = -1;

        rcu_read_lock();
        mac80211_hwsim_tx_frame(data->hw, skb,
                                rcu_dereference(vif->bss_conf.chanctx_conf)->def.chan);
        rcu_read_unlock();
}


/*
 * Interface iterator callback: send a nullfunc frame with the
 * power-management bit set (ps = 1). @dat is the radio.
 */
static void hwsim_send_nullfunc_ps(void *dat, u8 *mac,
                                   struct ieee80211_vif *vif)
{
        hwsim_send_nullfunc(dat, mac, vif, 1);
}

/*
 * Interface iterator callback: send a nullfunc frame with the
 * power-management bit clear (ps = 0). @dat is the radio.
 */
static void hwsim_send_nullfunc_no_ps(void *dat, u8 *mac,
                                      struct ieee80211_vif *vif)
{
        hwsim_send_nullfunc(dat, mac, vif, 0);
}

/* Attribute getter: report the radio's current power-save mode in @val. */
static int hwsim_fops_ps_read(void *dat, u64 *val)
{
        const struct mac80211_hwsim_data *radio = dat;

        *val = radio->ps;

        return 0;
}

/*
 * Attribute setter for the power-save mode.
 *
 * @dat: the radio
 * @val: one of the enum ps_mode values; anything else yields -EINVAL
 *
 * PS_MANUAL_POLL is a one-shot trigger and not a stored mode: it is only
 * accepted while the mode is PS_ENABLED, sends a PS-Poll on every active
 * interface and leaves the stored mode untouched.
 *
 * For the other values the mode is stored, and a transition into or out of
 * PS_DISABLED is announced with a nullfunc frame on every active interface
 * (PM bit set when entering power save, clear when leaving it).
 *
 * Returns 0 on success or -EINVAL.
 */
static int hwsim_fops_ps_write(void *dat, u64 val)
{
        struct mac80211_hwsim_data *data = dat;
        bool was_off, now_off;

        switch (val) {
        case PS_DISABLED:
        case PS_ENABLED:
        case PS_AUTO_POLL:
                break;
        case PS_MANUAL_POLL:
                if (data->ps != PS_ENABLED)
                        return -EINVAL;
                local_bh_disable();
                ieee80211_iterate_active_interfaces_atomic(
                        data->hw, IEEE80211_IFACE_ITER_NORMAL,
                        hwsim_send_ps_poll, data);
                local_bh_enable();
                return 0;
        default:
                return -EINVAL;
        }

        was_off = data->ps == PS_DISABLED;
        now_off = val == PS_DISABLED;
        data->ps = val;

        local_bh_disable();
        if (was_off != now_off) {
                ieee80211_iterate_active_interfaces_atomic(
                        data->hw, IEEE80211_IFACE_ITER_NORMAL,
                        now_off ? hwsim_send_nullfunc_no_ps :
                                  hwsim_send_nullfunc_ps,
                        data);
        }
        local_bh_enable();

        return 0;
}

/* power-save mode attribute, shown as an unsigned decimal */
DEFINE_DEBUGFS_ATTRIBUTE(hwsim_fops_ps, hwsim_fops_ps_read, hwsim_fops_ps_write,
                         "%llu\n");

/*
 * Attribute setter: any write reports a radar detection on the radio's hw.
 * The written value itself is ignored. Always returns 0.
 */
static int hwsim_write_simulate_radar(void *dat, u64 val)
{
        struct mac80211_hwsim_data *radio = dat;
        struct ieee80211_hw *hw = radio->hw;

        ieee80211_radar_detected(hw);
        return 0;
}

/* write-only attribute (no getter): a write triggers a simulated radar event */
DEFINE_DEBUGFS_ATTRIBUTE(hwsim_simulate_radar, NULL,
                         hwsim_write_simulate_radar, "%llu\n");

/* Attribute getter: report the radio's group bitmask in @val. */
static int hwsim_fops_group_read(void *dat, u64 *val)
{
        const struct mac80211_hwsim_data *radio = dat;

        *val = radio->group;

        return 0;
}

/* Attribute setter: replace the radio's group bitmask with @val. */
static int hwsim_fops_group_write(void *dat, u64 val)
{
        struct mac80211_hwsim_data *radio = dat;

        radio->group = val;

        return 0;
}

/* group bitmask attribute, shown in hexadecimal */
DEFINE_DEBUGFS_ATTRIBUTE(hwsim_fops_group,
                         hwsim_fops_group_read, hwsim_fops_group_write,
                         "%llx\n");

/*
 * Attribute getter: report the radio's configured RX RSSI in @val. The
 * stored value is a signed int and is widened into the u64 result.
 */
static int hwsim_fops_rx_rssi_read(void *dat, u64 *val)
{
        const struct mac80211_hwsim_data *radio = dat;

        *val = radio->rx_rssi;

        return 0;
}

/*
 * Attribute setter for the RX RSSI.
 *
 * @val is narrowed to int; only values in the range -100..-1 are stored.
 * Returns 0 on success and -EINVAL for anything outside that range.
 */
static int hwsim_fops_rx_rssi_write(void *dat, u64 val)
{
        struct mac80211_hwsim_data *radio = dat;
        int rssi = (int)val;

        if (rssi < 0 && rssi >= -100) {
                radio->rx_rssi = rssi;
                return 0;
        }

        return -EINVAL;
}

/*
 * RX RSSI attribute. The write handler only accepts negative values
 * (-100..-1), so the input has to be parsed as a signed number: the plain
 * DEFINE_DEBUGFS_ATTRIBUTE() variant parses writes as unsigned and rejects
 * a leading '-'.
 */
DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(hwsim_fops_rx_rssi,
                                hwsim_fops_rx_rssi_read,
                                hwsim_fops_rx_rssi_write,
                                "%lld\n");

/*
 * Transmit handler of the monitor net device: every frame handed to it is
 * dropped (freed) and reported as sent.
 */
static netdev_tx_t hwsim_mon_xmit(struct sk_buff *skb,
                                        struct net_device *dev)
{
        /* TODO: allow packet injection */
        dev_kfree_skb(skb);
        return NETDEV_TX_OK;
}

static inline u64 mac80211_hwsim_get_tsf_raw(void)
{
        return ktime_to_us(ktime_get_real());
}

/*
 * TSF of a radio as a little-endian value: the raw clock plus the radio's
 * tsf_offset.
 */
static __le64 __mac80211_hwsim_get_tsf(struct mac80211_hwsim_data *data)
{
        u64 tsf = mac80211_hwsim_get_tsf_raw();

        tsf += data->tsf_offset;

        return cpu_to_le64(tsf);
}

static u64 mac80211_hwsim_get_tsf(struct ieee80211_hw *hw,
                                  struct ieee80211_vif *vif)
{
        struct mac80211_hwsim_data *data = hw->priv;
        return le64_to_cpu(__mac80211_hwsim_get_tsf(data));
}

/*
 * Set the radio's TSF to @tsf.
 *
 * The radio clock is the real clock plus tsf_offset, so the offset is moved
 * by the distance between the requested and the current TSF. bcn_delta
 * records that distance modulo the beacon interval of link 0, signed by the
 * direction of the jump.
 *
 * If no beacon interval is configured (it is 0) only the offset is updated
 * and bcn_delta is cleared; taking the remainder would divide by zero.
 */
static void mac80211_hwsim_set_tsf(struct ieee80211_hw *hw,
                struct ieee80211_vif *vif, u64 tsf)
{
        struct mac80211_hwsim_data *data = hw->priv;
        u64 now = mac80211_hwsim_get_tsf(hw, vif);
        /* MLD not supported here */
        u32 bcn_int = data->link_data[0].beacon_int;
        u64 delta = abs(tsf - now);
        s64 bcn_delta = 0;

        if (bcn_int) {
                /* do_div() turns its first argument into the quotient */
                u64 quot = delta;

                bcn_delta = do_div(quot, bcn_int);
        }

        /* adjust after beaconing with new timestamp at old TBTT */
        if (tsf > now) {
                data->tsf_offset += delta;
                data->bcn_delta = bcn_delta;
        } else {
                data->tsf_offset -= delta;
                data->bcn_delta = -bcn_delta;
        }
}

static void mac80211_hwsim_monitor_rx(struct ieee80211_hw *hw,
                                      struct sk_buff *tx_skb,
                                      struct ieee80211_channel *chan)
{
        struct mac80211_hwsim_data *data = hw->priv;
        struct sk_buff *skb;
        struct hwsim_radiotap_hdr *hdr;
        u16 flags, bitrate;
        struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx_skb);
        struct ieee80211_rate *txrate = ieee80211_get_tx_rate(hw, info);

        if (!txrate)
                bitrate = 0;
        else
                bitrate = txrate->bitrate;

        if (!netif_running(hwsim_mon))
                return;

        skb = skb_copy_expand(tx_skb, sizeof(*hdr), 0, GFP_ATOMIC);
        if (skb == NULL)
                return;

        hdr = skb_push(skb, sizeof(*hdr));
        hdr->hdr.it_version = PKTHDR_RADIOTAP_VERSION;
        hdr->hdr.it_pad = 0;
        hdr->hdr.it_len = cpu_to_le16(sizeof(*hdr));
        hdr->hdr.it_present = cpu_to_le32((1 << IEEE80211_RADIOTAP_FLAGS) |
                                          (1 << IEEE80211_RADIOTAP_RATE) |
                                          (1 << IEEE80211_RADIOTAP_TSFT) |
                                          (1 << IEEE80211_RADIOTAP_CHANNEL));
        hdr->rt_tsft = __mac80211_hwsim_get_tsf(data);
        hdr->rt_flags = 0;
        hdr->rt_rate = bitrate / 5;
        hdr->rt_channel = cpu_to_le16(chan->center_freq);
        flags = IEEE80211_CHAN_2GHZ;
        if (txrate && txrate->flags & IEEE80211_RATE_ERP_G)
                flags |= IEEE80211_CHAN_OFDM;
        else
                flags |= IEEE80211_CHAN_CCK;
        hdr->rt_chbitmask = cpu_to_le16(flags);

        skb->dev = hwsim_mon;
        skb_reset_mac_header(skb);
        skb->ip_summed = CHECKSUM_UNNECESSARY;
        skb->pkt_type = PACKET_OTHERHOST;
        skb->protocol = htons(ETH_P_802_2);
        memset(skb->cb, 0, sizeof(skb->cb));
        netif_rx(skb);
}


static void mac80211_hwsim_monitor_ack(struct ieee80211_channel *chan,
                                       const u8 *addr)
{
        struct sk_buff *skb;
        struct hwsim_radiotap_ack_hdr *hdr;
        u16 flags;
        struct ieee80211_hdr *hdr11;

        if (!netif_running(hwsim_mon))
                return;

        skb = dev_alloc_skb(100);
        if (skb == NULL)
                return;

        hdr = skb_put(skb, sizeof(*hdr));
        hdr->hdr.it_version = PKTHDR_RADIOTAP_VERSION;
        hdr->hdr.it_pad = 0;
        hdr->hdr.it_len = cpu_to_le16(sizeof(*hdr));
        hdr->hdr.it_present = cpu_to_le32((1 << IEEE80211_RADIOTAP_FLAGS) |
                                          (1 << IEEE80211_RADIOTAP_CHANNEL));
        hdr->rt_flags = 0;
        hdr->pad = 0;
        hdr->rt_channel = cpu_to_le16(chan->center_freq);
        flags = IEEE80211_CHAN_2GHZ;
        hdr->rt_chbitmask = cpu_to_le16(flags);

        hdr11 = skb_put(skb, 10);
        hdr11->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL |
                                           IEEE80211_STYPE_ACK);
        hdr11->duration_id = cpu_to_le16(0);
        memcpy(hdr11->addr1, addr, ETH_ALEN);

        skb->dev = hwsim_mon;
        skb_reset_mac_header(skb);
        skb->ip_summed = CHECKSUM_UNNECESSARY;
        skb->pkt_type = PACKET_OTHERHOST;
        skb->protocol = htons(ETH_P_802_2);
        memset(skb->cb, 0, sizeof(skb->cb));
        netif_rx(skb);
}

/*
 * Iterator context for mac80211_hwsim_addr_match(): the address being
 * searched for and the result, set to true by mac80211_hwsim_addr_iter()
 * once an interface or link address matches.
 */
struct mac80211_hwsim_addr_match_data {
        u8 addr[ETH_ALEN];
        bool ret;
};

/*
 * Interface iterator callback for mac80211_hwsim_addr_match().
 *
 * @data: struct mac80211_hwsim_addr_match_data with the wanted address
 * @mac:  address of the interface being visited
 * @vif:  the interface; its per-link addresses are checked as well
 *
 * Sets ->ret when either the interface address or the address of any
 * configured link equals the wanted address; ->ret is never cleared here.
 */
static void mac80211_hwsim_addr_iter(void *data, u8 *mac,
                                     struct ieee80211_vif *vif)
{
        struct mac80211_hwsim_addr_match_data *match = data;
        unsigned int link;

        if (!memcmp(mac, match->addr, ETH_ALEN))
                goto matched;

        /* Match the link address */
        for (link = 0; link < ARRAY_SIZE(vif->link_conf); link++) {
                struct ieee80211_bss_conf *link_conf =
                        rcu_dereference(vif->link_conf[link]);

                if (link_conf &&
                    !memcmp(link_conf->addr, match->addr, ETH_ALEN))
                        goto matched;
        }

        return;

matched:
        match->ret = true;
}

static bool mac80211_hwsim_addr_match(struct mac80211_hwsim_data *data,
                                      const u8 *addr)
{
        struct mac80211_hwsim_addr_match_data md = {
                .ret = false,
        };

        if (data->scanning && memcmp(addr, data->scan_addr, ETH_ALEN) == 0)
                return true;

        memcpy(md.addr, addr, ETH_ALEN);

        ieee80211_iterate_active_interfaces_atomic(data->hw,
                                                   IEEE80211_IFACE_ITER_NORMAL,
                                                   mac80211_hwsim_addr_iter,
                                                   &md);

        return md.ret;
}

static bool hwsim_ps_rx_ok(struct mac80211_hwsim_data *data,
                           struct sk_buff *skb)
{
        switch (data->ps) {
        case PS_DISABLED:
                return true;
        case PS_ENABLED:
                return false;
        case PS_AUTO_POLL:
                /* TODO: accept (some) Beacons by default and other frames only
                 * if pending PS-Poll has been sent */
                return true;
        case PS_MANUAL_POLL:
                /* Allow unicast frames to own address if there is a pending
                 * PS-Poll */
                if (data->ps_poll_pending &&
                    mac80211_hwsim_addr_match(data, skb->data + 4)) {
                        data->ps_poll_pending = false;
                        return true;
                }
                return false;
        }

        return true;
}

/*
 * Unicast a generic netlink message to @portid in the first network
 * namespace whose netgroup equals the radio's.
 *
 * @skb is handed to genlmsg_unicast() when such a namespace exists;
 * otherwise it is freed here.
 *
 * Returns the result of genlmsg_unicast(), or -ENOENT when no namespace
 * matched.
 */
static int hwsim_unicast_netgroup(struct mac80211_hwsim_data *data,
                                  struct sk_buff *skb, int portid)
{
        struct net *ns;
        int ret = -ENOENT;
        bool sent = false;

        rcu_read_lock();
        for_each_net_rcu(ns) {
                if (hwsim_net_get_netgroup(ns) != data->netgroup)
                        continue;

                ret = genlmsg_unicast(ns, skb, portid);
                sent = true;
                break;
        }
        rcu_read_unlock();

        if (!sent)
                nlmsg_free(skb);

        return ret;
}

/*
 * Announce the addition or removal of a MAC address on a radio.
 *
 * @hw:   the radio
 * @addr: address being added or removed; a warning is raised if it is not a
 *        valid unicast Ethernet address
 * @add:  true sends HWSIM_CMD_ADD_MAC_ADDR, false HWSIM_CMD_DEL_MAC_ADDR
 *
 * The message carries the radio's addresses[1] as transmitter and @addr as
 * receiver. It goes out over virtio when that is enabled, otherwise by
 * netlink unicast to the radio's wmediumd port. Nothing is sent when neither
 * transport is available; allocation and encoding failures are dropped
 * silently, as are send errors.
 */
static void mac80211_hwsim_config_mac_nl(struct ieee80211_hw *hw,
                                         const u8 *addr, bool add)
{
        struct mac80211_hwsim_data *data = hw->priv;
        u32 wmediumd_portid = READ_ONCE(data->wmediumd);
        int cmd = add ? HWSIM_CMD_ADD_MAC_ADDR : HWSIM_CMD_DEL_MAC_ADDR;
        struct sk_buff *msg;
        void *genl_hdr;

        WARN_ON(!is_valid_ether_addr(addr));

        if (!wmediumd_portid && !hwsim_virtio_enabled)
                return;

        msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_ATOMIC);
        if (!msg)
                return;

        genl_hdr = genlmsg_put(msg, 0, 0, &hwsim_genl_family, 0, cmd);
        if (!genl_hdr) {
                pr_debug("mac80211_hwsim: problem with msg_head\n");
                goto free_msg;
        }

        if (nla_put(msg, HWSIM_ATTR_ADDR_TRANSMITTER,
                    ETH_ALEN, data->addresses[1].addr) ||
            nla_put(msg, HWSIM_ATTR_ADDR_RECEIVER, ETH_ALEN, addr))
                goto free_msg;

        genlmsg_end(msg, genl_hdr);

        /* both transports take over the message from here */
        if (hwsim_virtio_enabled)
                hwsim_tx_virtio(data, msg);
        else
                hwsim_unicast_netgroup(data, msg, wmediumd_portid);
        return;

free_msg:
        nlmsg_free(msg);
}

static inline u16 trans_tx_rate_flags_ieee2hwsim(struct ieee80211_tx_rate *rate)
{
        u16 result = 0;

        if (rate->flags & IEEE80211_TX_RC_USE_RTS_CTS)
                result |= MAC80211_HWSIM_TX_RC_USE_RTS_CTS;
        if (rate->flags & IEEE80211_TX_RC_USE_CTS_PROTECT)
                result |= MAC80211_HWSIM_TX_RC_USE_CTS_PROTECT;
        if (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE)
                result |= MAC80211_HWSIM_TX_RC_USE_SHORT_PREAMBLE;
        if (rate->flags & IEEE80211_TX_RC_MCS)
                result |= MAC80211_HWSIM_TX_RC_MCS;
        if (rate->flags & IEEE80211_TX_RC_GREEN_FIELD)
                result |= MAC80211_HWSIM_TX_RC_GREEN_FIELD;
        if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
                result |= MAC80211_HWSIM_TX_RC_40_MHZ_WIDTH;
        if (rate->flags & IEEE80211_TX_RC_DUP_DATA)
                result |= MAC80211_HWSIM_TX_RC_DUP_DATA;
        if (rate->flags & IEEE80211_TX_RC_SHORT_GI)
                result |= MAC80211_HWSIM_TX_RC_SHORT_GI;
        if (rate->flags & IEEE80211_TX_RC_VHT_MCS)
                result |= MAC80211_HWSIM_TX_RC_VHT_MCS;
        if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH)
                result |= MAC80211_HWSIM_TX_RC_80_MHZ_WIDTH;
        if (rate->flags & IEEE80211_TX_RC_160_MHZ_WIDTH)
                result |= MAC80211_HWSIM_TX_RC_160_MHZ_WIDTH;

        return result;
}

/*
 * Forward a TX frame to the external medium as a HWSIM_CMD_FRAME generic
 * netlink message, sent either through virtio (when hwsim_virtio_enabled)
 * or as a netlink unicast to @dst_portid.
 *
 * @hw:         transmitting radio
 * @my_skb:     802.11 frame to transmit
 * @dst_portid: netlink port id passed to hwsim_unicast_netgroup()
 * @channel:    channel whose center_freq is reported in HWSIM_ATTR_FREQ
 *
 * The message carries the transmitter address, the raw frame, translated
 * TX flags, the frequency, the per-rate attempt tables and a cookie. The
 * same cookie is stored in info->rate_driver_data[0] of @my_skb.
 *
 * On success @my_skb is appended to data->pending and the tx_pkts/tx_bytes
 * counters are updated. On any failure @my_skb is released with
 * ieee80211_free_txskb() and data->tx_failed is incremented.
 */
static void mac80211_hwsim_tx_frame_nl(struct ieee80211_hw *hw,
                                       struct sk_buff *my_skb,
                                       int dst_portid,
                                       struct ieee80211_channel *channel)
{
        struct sk_buff *skb;
        struct mac80211_hwsim_data *data = hw->priv;
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) my_skb->data;
        struct ieee80211_tx_info *info = IEEE80211_SKB_CB(my_skb);
        void *msg_head;
        unsigned int hwsim_flags = 0;
        int i;
        struct hwsim_tx_rate tx_attempts[IEEE80211_TX_MAX_RATES];
        struct hwsim_tx_rate_flag tx_attempts_flags[IEEE80211_TX_MAX_RATES];
        uintptr_t cookie;

        /* advertise power save in the frame header unless PS is disabled */
        if (data->ps != PS_DISABLED)
                hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM);
        /* If the queue contains MAX_QUEUE skb's drop some */
        if (skb_queue_len(&data->pending) >= MAX_QUEUE) {
                /* Dropping until WARN_QUEUE level */
                while (skb_queue_len(&data->pending) >= WARN_QUEUE) {
                        ieee80211_free_txskb(hw, skb_dequeue(&data->pending));
                        data->tx_dropped++;
                }
        }

        skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_ATOMIC);
        /*
         * skb is NULL on this path, so the error label ends up calling
         * nlmsg_free(NULL).
         * NOTE(review): relies on nlmsg_free() tolerating NULL - confirm.
         */
        if (skb == NULL)
                goto nla_put_failure;

        msg_head = genlmsg_put(skb, 0, 0, &hwsim_genl_family, 0,
                               HWSIM_CMD_FRAME);
        if (msg_head == NULL) {
                pr_debug("mac80211_hwsim: problem with msg_head\n");
                goto nla_put_failure;
        }

        if (nla_put(skb, HWSIM_ATTR_ADDR_TRANSMITTER,
                    ETH_ALEN, data->addresses[1].addr))
                goto nla_put_failure;

        /* We get the skb->data */
        if (nla_put(skb, HWSIM_ATTR_FRAME, my_skb->len, my_skb->data))
                goto nla_put_failure;

        /* We get the flags for this transmission, and we translate them to
           wmediumd flags  */

        if (info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS)
                hwsim_flags |= HWSIM_TX_CTL_REQ_TX_STATUS;

        if (info->flags & IEEE80211_TX_CTL_NO_ACK)
                hwsim_flags |= HWSIM_TX_CTL_NO_ACK;

        if (nla_put_u32(skb, HWSIM_ATTR_FLAGS, hwsim_flags))
                goto nla_put_failure;

        if (nla_put_u32(skb, HWSIM_ATTR_FREQ, channel->center_freq))
                goto nla_put_failure;

        /* We get the tx control (rate and retries) info*/

        for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
                tx_attempts[i].idx = info->status.rates[i].idx;
                tx_attempts_flags[i].idx = info->status.rates[i].idx;
                tx_attempts[i].count = info->status.rates[i].count;
                tx_attempts_flags[i].flags =
                                trans_tx_rate_flags_ieee2hwsim(
                                                &info->status.rates[i]);
        }

        if (nla_put(skb, HWSIM_ATTR_TX_INFO,
                    sizeof(struct hwsim_tx_rate)*IEEE80211_TX_MAX_RATES,
                    tx_attempts))
                goto nla_put_failure;

        if (nla_put(skb, HWSIM_ATTR_TX_INFO_FLAGS,
                    sizeof(struct hwsim_tx_rate_flag) * IEEE80211_TX_MAX_RATES,
                    tx_attempts_flags))
                goto nla_put_failure;

        /* We create a cookie to identify this skb */
        cookie = atomic_inc_return(&data->pending_cookie);
        info->rate_driver_data[0] = (void *)cookie;
        if (nla_put_u64_64bit(skb, HWSIM_ATTR_COOKIE, cookie, HWSIM_ATTR_PAD))
                goto nla_put_failure;

        genlmsg_end(skb, msg_head);

        /*
         * A send failure jumps past nlmsg_free(): the netlink skb is not
         * freed here after it has been handed to the send helper.
         */
        if (hwsim_virtio_enabled) {
                if (hwsim_tx_virtio(data, skb))
                        goto err_free_txskb;
        } else {
                if (hwsim_unicast_netgroup(data, skb, dst_portid))
                        goto err_free_txskb;
        }

        /* Enqueue the packet */
        skb_queue_tail(&data->pending, my_skb);
        data->tx_pkts++;
        data->tx_bytes += my_skb->len;
        return;

nla_put_failure:
        /* building the message failed: drop the netlink skb ourselves */
        nlmsg_free(skb);
err_free_txskb:
        pr_debug("mac80211_hwsim: error occurred in %s\n", __func__);
        ieee80211_free_txskb(hw, my_skb);
        data->tx_failed++;
}

/*
 * Two channels are compatible when both are non-NULL and share the same
 * center frequency.
 */
static bool hwsim_chans_compat(struct ieee80211_channel *c1,
                               struct ieee80211_channel *c2)
{
        /* a missing channel on either side never matches */
        return c1 && c2 && c1->center_freq == c2->center_freq;
}

/*
 * Argument block for mac80211_hwsim_tx_iter().
 */
struct tx_iter_data {
        /* channel the frame is being transmitted on (input) */
        struct ieee80211_channel *channel;
        /* set to true by the iterator when a link is on a compatible channel */
        bool receive;
};

/*
 * Interface iterator callback: flag tx_iter_data::receive as soon as any
 * link of @vif has a channel context whose channel is compatible with
 * tx_iter_data::channel. @addr is unused.
 */
static void mac80211_hwsim_tx_iter(void *_data, u8 *addr,
                                   struct ieee80211_vif *vif)
{
        struct tx_iter_data *iter = _data;
        int link;

        for (link = 0; link < ARRAY_SIZE(vif->link_conf); link++) {
                struct ieee80211_bss_conf *link_conf =
                        rcu_dereference(vif->link_conf[link]);
                struct ieee80211_chanctx_conf *ctx;

                /* link slot not populated */
                if (!link_conf)
                        continue;

                ctx = rcu_dereference(link_conf->chanctx_conf);
                if (ctx && hwsim_chans_compat(iter->channel, ctx->def.chan)) {
                        /* first match is enough, stop scanning */
                        iter->receive = true;
                        return;
                }
        }
}

/*
 * Prepend a vendor-namespace radiotap TLV carrying fixed test data to @skb
 * and flag the RX control block with RX_FLAG_RADIOTAP_TLV_AT_END.
 *
 * The whole body is compiled only when HWSIM_RADIOTAP_OUI is defined;
 * otherwise this function does nothing.
 */
static void mac80211_hwsim_add_vendor_rtap(struct sk_buff *skb)
{
        /*
         * To enable this code, #define the HWSIM_RADIOTAP_OUI,
         * e.g. like this:
         * #define HWSIM_RADIOTAP_OUI "\x02\x00\x00"
         * (but you should use a valid OUI, not that)
         *
         * If anyone wants to 'donate' a radiotap OUI/subns code
         * please send a patch removing this #ifdef and changing
         * the values accordingly.
         */
#ifdef HWSIM_RADIOTAP_OUI
        struct ieee80211_radiotap_vendor_tlv *rtap;
        static const char vendor_data[8] = "ABCDEFGH";

        // Make sure no padding is needed
        BUILD_BUG_ON(sizeof(vendor_data) % 4);
        /* this is last radiotap info before the mac header, so
         * skb_reset_mac_header for mac80211 to know the end of
         * the radiotap TLV/beginning of the 802.11 header
         */
        skb_reset_mac_header(skb);

        /*
         * Note that this code requires the headroom in the SKB
         * that was allocated earlier.
         */
        rtap = skb_push(skb, sizeof(*rtap) + sizeof(vendor_data));

        /* TLV length: everything after the generic TLV header */
        rtap->len = cpu_to_le16(sizeof(*rtap) -
                                sizeof(struct ieee80211_radiotap_tlv) +
                                sizeof(vendor_data));
        rtap->type = cpu_to_le16(IEEE80211_RADIOTAP_VENDOR_NAMESPACE);

        rtap->content.oui[0] = HWSIM_RADIOTAP_OUI[0];
        rtap->content.oui[1] = HWSIM_RADIOTAP_OUI[1];
        rtap->content.oui[2] = HWSIM_RADIOTAP_OUI[2];
        rtap->content.oui_subtype = 127;
        /* clear reserved field */
        rtap->content.reserved = 0;
        rtap->content.vendor_type = 0;
        memcpy(rtap->content.data, vendor_data, sizeof(vendor_data));

        IEEE80211_SKB_RXCB(skb)->flag |= RX_FLAG_RADIOTAP_TLV_AT_END;
#endif
}

/*
 * Deliver a received frame to mac80211 on behalf of radio @data.
 *
 * For unicast, non-fragmented management or data frames the sending
 * station is looked up by link addresses; if found, its active_links_rx
 * bitmap is updated from the frame's PM bit and the link id is recorded
 * in @rx_status. @rx_status is then copied into the skb control block,
 * the optional vendor radiotap data is added, RX counters are bumped and
 * the skb is passed to ieee80211_rx_irqsafe().
 */
static void mac80211_hwsim_rx(struct mac80211_hwsim_data *data,
                              struct ieee80211_rx_status *rx_status,
                              struct sk_buff *skb)
{
        struct ieee80211_hdr *hdr = (void *)skb->data;
        struct ieee80211_sta *sta;
        unsigned int link_id;

        /* only the final fragment of a unicast frame is of interest */
        if (ieee80211_has_morefrags(hdr->frame_control) ||
            is_multicast_ether_addr(hdr->addr1))
                goto deliver;

        /* ... and only management or data frames */
        if (!ieee80211_is_mgmt(hdr->frame_control) &&
            !ieee80211_is_data(hdr->frame_control))
                goto deliver;

        rcu_read_lock();
        sta = ieee80211_find_sta_by_link_addrs(data->hw, hdr->addr2,
                                               hdr->addr1, &link_id);
        if (sta) {
                struct hwsim_sta_priv *sp = (void *)sta->drv_priv;

                /* PM bit set: link no longer usable for TX to this sta */
                if (ieee80211_has_pm(hdr->frame_control))
                        sp->active_links_rx &= ~BIT(link_id);
                else
                        sp->active_links_rx |= BIT(link_id);

                rx_status->link_valid = true;
                rx_status->link_id = link_id;
        }
        rcu_read_unlock();

deliver:
        memcpy(IEEE80211_SKB_RXCB(skb), rx_status, sizeof(*rx_status));

        mac80211_hwsim_add_vendor_rtap(skb);

        data->rx_pkts++;
        data->rx_bytes += skb->len;
        ieee80211_rx_irqsafe(data->hw, skb);
}

/*
 * In-kernel medium simulation: deliver a copy of @skb to every other
 * registered radio that is able to receive on @chan.
 *
 * @hw:   transmitting radio
 * @skb:  frame being transmitted; it is copied, not consumed, here
 * @chan: channel the frame is sent on
 *
 * An RX status is derived from the first TX rate entry of the frame, then
 * hwsim_radios is walked under hwsim_radio_lock. A radio is skipped when
 * it is the sender, not started, idle without a temporary channel, fails
 * the hwsim_ps_rx_ok() check, shares no group bit or netgroup with the
 * sender, or has no compatible channel.
 *
 * Returns true if at least one radio that received a copy matches the
 * frame's addr1 (i.e. the frame would have been acknowledged).
 */
static bool mac80211_hwsim_tx_frame_no_nl(struct ieee80211_hw *hw,
                                          struct sk_buff *skb,
                                          struct ieee80211_channel *chan)
{
        struct mac80211_hwsim_data *data = hw->priv, *data2;
        bool ack = false;
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
        struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
        struct ieee80211_rx_status rx_status;
        u64 now;

        /* build the RX status template shared by all receivers */
        memset(&rx_status, 0, sizeof(rx_status));
        rx_status.flag |= RX_FLAG_MACTIME_START;
        rx_status.freq = chan->center_freq;
        rx_status.freq_offset = chan->freq_offset ? 1 : 0;
        rx_status.band = chan->band;
        if (info->control.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) {
                rx_status.rate_idx =
                        ieee80211_rate_get_vht_mcs(&info->control.rates[0]);
                rx_status.nss =
                        ieee80211_rate_get_vht_nss(&info->control.rates[0]);
                rx_status.encoding = RX_ENC_VHT;
        } else {
                rx_status.rate_idx = info->control.rates[0].idx;
                if (info->control.rates[0].flags & IEEE80211_TX_RC_MCS)
                        rx_status.encoding = RX_ENC_HT;
        }
        /* bandwidth follows the first rate's width flag, 20 MHz by default */
        if (info->control.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
                rx_status.bw = RATE_INFO_BW_40;
        else if (info->control.rates[0].flags & IEEE80211_TX_RC_80_MHZ_WIDTH)
                rx_status.bw = RATE_INFO_BW_80;
        else if (info->control.rates[0].flags & IEEE80211_TX_RC_160_MHZ_WIDTH)
                rx_status.bw = RATE_INFO_BW_160;
        else
                rx_status.bw = RATE_INFO_BW_20;
        if (info->control.rates[0].flags & IEEE80211_TX_RC_SHORT_GI)
                rx_status.enc_flags |= RX_ENC_FLAG_SHORT_GI;
        /* TODO: simulate optional packet loss */
        rx_status.signal = data->rx_rssi;
        if (info->control.vif)
                rx_status.signal += info->control.vif->bss_conf.txpower;

        if (data->ps != PS_DISABLED)
                hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM);

        /* release the skb's source info */
        skb_orphan(skb);
        skb_dst_drop(skb);
        skb->mark = 0;
        skb_ext_reset(skb);
        nf_reset_ct(skb);

        /*
         * Get absolute mactime here so all HWs RX at the "same time", and
         * absolute TX time for beacon mactime so the timestamp matches.
         * Giving beacons a different mactime than non-beacons looks messy, but
         * it helps the Toffset be exact and a ~10us mactime discrepancy
         * probably doesn't really matter.
         */
        if (ieee80211_is_beacon(hdr->frame_control) ||
            ieee80211_is_probe_resp(hdr->frame_control)) {
                rx_status.boottime_ns = ktime_get_boottime_ns();
                now = data->abs_bcn_ts;
        } else {
                now = mac80211_hwsim_get_tsf_raw();
        }

        /* Copy skb to all enabled radios that are on the current frequency */
        spin_lock(&hwsim_radio_lock);
        list_for_each_entry(data2, &hwsim_radios, list) {
                struct sk_buff *nskb;
                struct tx_iter_data tx_iter_data = {
                        .receive = false,
                        .channel = chan,
                };

                /* never deliver a frame back to its sender */
                if (data == data2)
                        continue;

                if (!data2->started || (data2->idle && !data2->tmp_chan) ||
                    !hwsim_ps_rx_ok(data2, skb))
                        continue;

                if (!(data->group & data2->group))
                        continue;

                if (data->netgroup != data2->netgroup)
                        continue;

                /*
                 * Neither the temporary nor the configured channel matches:
                 * fall back to checking every active interface's links.
                 */
                if (!hwsim_chans_compat(chan, data2->tmp_chan) &&
                    !hwsim_chans_compat(chan, data2->channel)) {
                        ieee80211_iterate_active_interfaces_atomic(
                                data2->hw, IEEE80211_IFACE_ITER_NORMAL,
                                mac80211_hwsim_tx_iter, &tx_iter_data);
                        if (!tx_iter_data.receive)
                                continue;
                }

                /*
                 * reserve some space for our vendor and the normal
                 * radiotap header, since we're copying anyway
                 */
                if (skb->len < PAGE_SIZE && paged_rx) {
                        /* paged RX: frame data goes into a page fragment */
                        struct page *page = alloc_page(GFP_ATOMIC);

                        if (!page)
                                continue;

                        nskb = dev_alloc_skb(128);
                        if (!nskb) {
                                __free_page(page);
                                continue;
                        }

                        memcpy(page_address(page), skb->data, skb->len);
                        skb_add_rx_frag(nskb, 0, page, 0, skb->len, skb->len);
                } else {
                        nskb = skb_copy(skb, GFP_ATOMIC);
                        if (!nskb)
                                continue;
                }

                if (mac80211_hwsim_addr_match(data2, hdr->addr1))
                        ack = true;

                /* each receiver sees the time in its own TSF domain */
                rx_status.mactime = now + data2->tsf_offset;

                mac80211_hwsim_rx(data2, &rx_status, nskb);
        }
        spin_unlock(&hwsim_radio_lock);

        return ack;
}

/*
 * Pick the link to transmit on when the frame does not name one.
 *
 * @data:     radio private data (unused here)
 * @vif:      transmitting interface
 * @sta:      destination station; may be NULL
 * @hdr:      802.11 header of the frame (only used for a sanity WARN)
 * @link_sta: out parameter, set to the link station of the chosen link
 *
 * For a non-MLD interface, or when @sta is NULL or has no valid links
 * (which triggers a one-time warning), &vif->bss_conf is returned.
 * Otherwise the link ids are walked round-robin, starting after the last
 * one used for this station, and the first link is returned that is
 * active on the vif, marked active for RX on the station, has a link
 * station, a link conf and a channel context.
 *
 * Returns the chosen link's bss_conf, or NULL if no link qualifies.
 */
static struct ieee80211_bss_conf *
mac80211_hwsim_select_tx_link(struct mac80211_hwsim_data *data,
                              struct ieee80211_vif *vif,
                              struct ieee80211_sta *sta,
                              struct ieee80211_hdr *hdr,
                              struct ieee80211_link_sta **link_sta)
{
        struct hwsim_sta_priv *sp;
        int i;

        if (!ieee80211_vif_is_mld(vif))
                return &vif->bss_conf;

        WARN_ON(is_multicast_ether_addr(hdr->addr1));

        if (WARN_ON_ONCE(!sta || !sta->valid_links))
                return &vif->bss_conf;

        /* only touch the station's private area once sta is known non-NULL */
        sp = (void *)sta->drv_priv;

        for (i = 0; i < ARRAY_SIZE(vif->link_conf); i++) {
                struct ieee80211_bss_conf *bss_conf;
                unsigned int link_id;

                /* round-robin the available link IDs */
                link_id = (sp->last_link + i + 1) % ARRAY_SIZE(vif->link_conf);

                if (!(vif->active_links & BIT(link_id)))
                        continue;

                if (!(sp->active_links_rx & BIT(link_id)))
                        continue;

                *link_sta = rcu_dereference(sta->link[link_id]);
                if (!*link_sta)
                        continue;

                bss_conf = rcu_dereference(vif->link_conf[link_id]);
                if (WARN_ON_ONCE(!bss_conf))
                        continue;

                /* can happen while switching links */
                if (!rcu_access_pointer(bss_conf->chanctx_conf))
                        continue;

                sp->last_link = link_id;
                return bss_conf;
        }

        return NULL;
}

/*
 * TX entry point for frames handed down by mac80211.
 *
 * @hw:      transmitting radio
 * @control: TX control, supplies the destination station (may be NULL)
 * @skb:     frame to transmit
 *
 * Steps:
 *  1. Determine the channel and configured bandwidth: the radio's own
 *     channel without channel contexts, data->tmp_chan for hw_queue 4,
 *     otherwise the channel context of the (given or selected) link. For
 *     MLO stations the header addresses are translated to link addresses.
 *  2. Reject the frame if there is no channel or the radio is idle.
 *  3. Check that no TX rate asks for more bandwidth than configured.
 *  4. Fill in the timestamp of probe responses.
 *  5. Pass the frame to the netlink/virtio medium if one is in use, or
 *     simulate a perfect medium in-kernel and report TX status.
 *
 * Every path that drops the frame releases it with ieee80211_free_txskb().
 */
static void mac80211_hwsim_tx(struct ieee80211_hw *hw,
                              struct ieee80211_tx_control *control,
                              struct sk_buff *skb)
{
        struct mac80211_hwsim_data *data = hw->priv;
        struct ieee80211_tx_info *txi = IEEE80211_SKB_CB(skb);
        struct ieee80211_hdr *hdr = (void *)skb->data;
        struct ieee80211_chanctx_conf *chanctx_conf;
        struct ieee80211_channel *channel;
        bool ack;
        enum nl80211_chan_width confbw = NL80211_CHAN_WIDTH_20_NOHT;
        u32 _portid, i;

        if (WARN_ON(skb->len < 10)) {
                /* Should not happen; just a sanity check for addr1 use */
                ieee80211_free_txskb(hw, skb);
                return;
        }

        if (!data->use_chanctx) {
                channel = data->channel;
                confbw = data->bw;
        } else if (txi->hw_queue == 4) {
                channel = data->tmp_chan;
        } else {
                u8 link = u32_get_bits(IEEE80211_SKB_CB(skb)->control.flags,
                                       IEEE80211_TX_CTRL_MLO_LINK);
                struct ieee80211_vif *vif = txi->control.vif;
                struct ieee80211_link_sta *link_sta = NULL;
                struct ieee80211_sta *sta = control->sta;
                struct ieee80211_bss_conf *bss_conf;

                if (link != IEEE80211_LINK_UNSPECIFIED) {
                        bss_conf = rcu_dereference(txi->control.vif->link_conf[link]);
                        if (sta)
                                link_sta = rcu_dereference(sta->link[link]);
                } else {
                        bss_conf = mac80211_hwsim_select_tx_link(data, vif, sta,
                                                                 hdr, &link_sta);
                }

                if (unlikely(!bss_conf)) {
                        /* if it's an MLO STA, it might have deactivated all
                         * links temporarily - but we don't handle real PS in
                         * this code yet, so just drop the frame in that case
                         */
                        WARN(link != IEEE80211_LINK_UNSPECIFIED || !sta || !sta->mlo,
                             "link:%d, sta:%pM, sta->mlo:%d\n",
                             link, sta ? sta->addr : NULL, sta ? sta->mlo : -1);
                        ieee80211_free_txskb(hw, skb);
                        return;
                }

                if (sta && sta->mlo) {
                        if (WARN_ON(!link_sta)) {
                                ieee80211_free_txskb(hw, skb);
                                return;
                        }
                        /* address translation to link addresses on TX */
                        ether_addr_copy(hdr->addr1, link_sta->addr);
                        ether_addr_copy(hdr->addr2, bss_conf->addr);
                        /* translate A3 only if it's the BSSID */
                        if (!ieee80211_has_tods(hdr->frame_control) &&
                            !ieee80211_has_fromds(hdr->frame_control)) {
                                if (ether_addr_equal(hdr->addr3, sta->addr))
                                        ether_addr_copy(hdr->addr3, link_sta->addr);
                                else if (ether_addr_equal(hdr->addr3, vif->addr))
                                        ether_addr_copy(hdr->addr3, bss_conf->addr);
                        }
                        /* no need to look at A4, if present it's SA */
                }

                chanctx_conf = rcu_dereference(bss_conf->chanctx_conf);
                if (chanctx_conf) {
                        channel = chanctx_conf->def.chan;
                        confbw = chanctx_conf->def.width;
                } else {
                        channel = NULL;
                }
        }

        if (WARN(!channel, "TX w/o channel - queue = %d\n", txi->hw_queue)) {
                ieee80211_free_txskb(hw, skb);
                return;
        }

        if (data->idle && !data->tmp_chan) {
                wiphy_dbg(hw->wiphy, "Trying to TX when idle - reject\n");
                ieee80211_free_txskb(hw, skb);
                return;
        }

        if (txi->control.vif)
                hwsim_check_magic(txi->control.vif);
        if (control->sta)
                hwsim_check_sta_magic(control->sta);

        if (ieee80211_hw_check(hw, SUPPORTS_RC_TABLE))
                ieee80211_get_tx_rates(txi->control.vif, control->sta, skb,
                                       txi->control.rates,
                                       ARRAY_SIZE(txi->control.rates));

        for (i = 0; i < ARRAY_SIZE(txi->control.rates); i++) {
                u16 rflags = txi->control.rates[i].flags;
                /* initialize to data->bw for 5/10 MHz handling */
                enum nl80211_chan_width bw = data->bw;

                if (txi->control.rates[i].idx == -1)
                        break;

                if (rflags & IEEE80211_TX_RC_40_MHZ_WIDTH)
                        bw = NL80211_CHAN_WIDTH_40;
                else if (rflags & IEEE80211_TX_RC_80_MHZ_WIDTH)
                        bw = NL80211_CHAN_WIDTH_80;
                else if (rflags & IEEE80211_TX_RC_160_MHZ_WIDTH)
                        bw = NL80211_CHAN_WIDTH_160;

                if (WARN_ON(hwsim_get_chanwidth(bw) > hwsim_get_chanwidth(confbw))) {
                        /*
                         * The frame is dropped here, so release it like
                         * every other drop path instead of leaking it.
                         */
                        ieee80211_free_txskb(hw, skb);
                        return;
                }
        }

        if (skb->len >= 24 + 8 &&
            ieee80211_is_probe_resp(hdr->frame_control)) {
                /* fake header transmission time */
                struct ieee80211_mgmt *mgmt;
                struct ieee80211_rate *txrate;
                /* TODO: get MCS */
                int bitrate = 100;
                u64 ts;

                mgmt = (struct ieee80211_mgmt *)skb->data;
                txrate = ieee80211_get_tx_rate(hw, txi);
                if (txrate)
                        bitrate = txrate->bitrate;
                ts = mac80211_hwsim_get_tsf_raw();
                mgmt->u.probe_resp.timestamp =
                        cpu_to_le64(ts + data->tsf_offset +
                                    24 * 8 * 10 / bitrate);
        }

        mac80211_hwsim_monitor_rx(hw, skb, channel);

        /* wmediumd mode check */
        _portid = READ_ONCE(data->wmediumd);

        if (_portid || hwsim_virtio_enabled)
                return mac80211_hwsim_tx_frame_nl(hw, skb, _portid, channel);

        /* NO wmediumd detected, perfect medium simulation */
        data->tx_pkts++;
        data->tx_bytes += skb->len;
        ack = mac80211_hwsim_tx_frame_no_nl(hw, skb, channel);

        if (ack && skb->len >= 16)
                mac80211_hwsim_monitor_ack(channel, hdr->addr2);

        ieee80211_tx_info_clear_status(txi);

        /* frame was transmitted at most favorable rate at first attempt */
        txi->control.rates[0].count = 1;
        txi->control.rates[1].idx = -1;

        if (!(txi->flags & IEEE80211_TX_CTL_NO_ACK) && ack)
                txi->flags |= IEEE80211_TX_STAT_ACK;
        ieee80211_tx_status_irqsafe(hw, skb);
}


/* Mark the simulated radio as started. Always returns 0. */
static int mac80211_hwsim_start(struct ieee80211_hw *hw)
{
        struct mac80211_hwsim_data *hwsim = hw->priv;

        wiphy_dbg(hw->wiphy, "%s\n", __func__);

        hwsim->started = true;

        return 0;
}


/*
 * Stop the simulated radio: clear the started flag, cancel the beacon
 * timer of every link and release all frames still queued as pending.
 */
static void mac80211_hwsim_stop(struct ieee80211_hw *hw)
{
        struct mac80211_hwsim_data *hwsim = hw->priv;
        int link;

        hwsim->started = false;

        /* no more beacons on any link */
        for (link = 0; link < ARRAY_SIZE(hwsim->link_data); link++)
                hrtimer_cancel(&hwsim->link_data[link].beacon_timer);

        /* drain frames that were still queued on the pending list */
        while (!skb_queue_empty(&hwsim->pending))
                ieee80211_free_txskb(hw, skb_dequeue(&hwsim->pending));

        wiphy_dbg(hw->wiphy, "%s\n", __func__);
}


/*
 * Set up a newly added interface: stamp the hwsim magic on it, announce
 * its MAC address via mac80211_hwsim_config_mac_nl() unless it is a
 * monitor interface, and assign the hardware queue mapping.
 * Always returns 0.
 */
static int mac80211_hwsim_add_interface(struct ieee80211_hw *hw,
                                        struct ieee80211_vif *vif)
{
        wiphy_dbg(hw->wiphy, "%s (type=%d mac_addr=%pM)\n",
                  __func__, ieee80211_vif_type_p2p(vif), vif->addr);

        hwsim_set_magic(vif);

        /* monitor interfaces do not register their address */
        if (vif->type != NL80211_IFTYPE_MONITOR)
                mac80211_hwsim_config_mac_nl(hw, vif->addr, true);

        /* content-after-beacon queue, then one hw queue per access category */
        vif->cab_queue = 0;

        vif->hw_queue[IEEE80211_AC_VO] = 0;
        vif->hw_queue[IEEE80211_AC_VI] = 1;
        vif->hw_queue[IEEE80211_AC_BE] = 2;
        vif->hw_queue[IEEE80211_AC_BK] = 3;

        return 0;
}

#ifdef CONFIG_MAC80211_DEBUGFS
static void
mac80211_hwsim_link_add_debugfs(struct ieee80211_hw *hw,
                                struct ieee80211_vif *vif,
                                struct ieee80211_bss_conf *link_conf,
                                struct dentry *dir)
{
        struct hwsim_vif_priv *vp = (void *)vif->drv_priv;

        debugfs_create_u32("skip_beacons", 0600, dir,
                           &vp->skip_beacons[link_conf->link_id]);
}
#endif

/*
 * Interface type change: log the old and new (P2P-aware) types, verify
 * the hwsim magic and reset the content-after-beacon queue.
 * Always returns 0.
 */
static int mac80211_hwsim_change_interface(struct ieee80211_hw *hw,
                                           struct ieee80211_vif *vif,
                                           enum nl80211_iftype newtype,
                                           bool newp2p)
{
        /* fold the p2p flag into the type used for logging */
        newtype = ieee80211_iftype_p2p(newtype, newp2p);

        wiphy_dbg(hw->wiphy,
                  "%s (old type=%d, new type=%d, mac_addr=%pM)\n",
                  __func__, ieee80211_vif_type_p2p(vif), newtype, vif->addr);

        hwsim_check_magic(vif);

        /*
         * The interface may have switched from non-AP to AP, in which
         * case the CAB queue has to be configured again.
         */
        vif->cab_queue = 0;

        return 0;
}

/*
 * Tear down an interface: verify and clear its hwsim magic, then withdraw
 * its MAC address via mac80211_hwsim_config_mac_nl() unless it is a
 * monitor interface.
 */
static void mac80211_hwsim_remove_interface(
        struct ieee80211_hw *hw, struct ieee80211_vif *vif)
{
        wiphy_dbg(hw->wiphy, "%s (type=%d mac_addr=%pM)\n",
                  __func__, ieee80211_vif_type_p2p(vif), vif->addr);

        hwsim_check_magic(vif);
        hwsim_clear_magic(vif);

        /* monitor interfaces never registered an address */
        if (vif->type == NL80211_IFTYPE_MONITOR)
                return;

        mac80211_hwsim_config_mac_nl(hw, vif->addr, false);
}

/*
 * Transmit a driver-generated frame on @chan.
 *
 * The frame goes to the netlink/virtio medium when one is in use (which
 * then owns @skb); otherwise it is delivered through the in-kernel medium
 * and freed here.
 */
static void mac80211_hwsim_tx_frame(struct ieee80211_hw *hw,
                                    struct sk_buff *skb,
                                    struct ieee80211_channel *chan)
{
        struct mac80211_hwsim_data *hwsim = hw->priv;
        u32 portid = READ_ONCE(hwsim->wmediumd);
        struct ieee80211_tx_info *txi = IEEE80211_SKB_CB(skb);

        /* with a rate control table the rates must be fetched explicitly */
        if (ieee80211_hw_check(hw, SUPPORTS_RC_TABLE))
                ieee80211_get_tx_rates(txi->control.vif, NULL, skb,
                                       txi->control.rates,
                                       ARRAY_SIZE(txi->control.rates));

        mac80211_hwsim_monitor_rx(hw, skb, chan);

        if (portid || hwsim_virtio_enabled) {
                mac80211_hwsim_tx_frame_nl(hw, skb, portid, chan);
                return;
        }

        hwsim->tx_pkts++;
        hwsim->tx_bytes += skb->len;

        /* receivers got copies, so the original can go */
        mac80211_hwsim_tx_frame_no_nl(hw, skb, chan);
        dev_kfree_skb(skb);
}

/*
 * Transmit one beacon frame for a link.
 *
 * @link_conf: link the beacon belongs to
 * @data:      radio private data; abs_bcn_ts is updated here
 * @hw:        transmitting radio
 * @vif:       interface the beacon belongs to
 * @skb:       beacon frame; consumed by this function
 *
 * If the per-link skip_beacons counter is non-zero it is decremented and
 * the beacon is dropped. Otherwise the beacon timestamp field is filled in
 * from the raw TSF plus the radio's TSF offset plus a header transmission
 * time derived from the bitrate, and the frame is sent on the link's
 * channel.
 */
static void __mac80211_hwsim_beacon_tx(struct ieee80211_bss_conf *link_conf,
                                       struct mac80211_hwsim_data *data,
                                       struct ieee80211_hw *hw,
                                       struct ieee80211_vif *vif,
                                       struct sk_buff *skb)
{
        struct hwsim_vif_priv *vp = (void *)vif->drv_priv;
        struct ieee80211_tx_info *info;
        struct ieee80211_rate *txrate;
        struct ieee80211_mgmt *mgmt;
        /* TODO: get MCS */
        int bitrate = 100;

        /* test hook: swallow this beacon and count down */
        if (vp->skip_beacons[link_conf->link_id]) {
                vp->skip_beacons[link_conf->link_id]--;
                dev_kfree_skb(skb);
                return;
        }

        info = IEEE80211_SKB_CB(skb);
        if (ieee80211_hw_check(hw, SUPPORTS_RC_TABLE))
                ieee80211_get_tx_rates(vif, NULL, skb,
                                       info->control.rates,
                                       ARRAY_SIZE(info->control.rates));

        /* keep the default bitrate when no TX rate is available */
        txrate = ieee80211_get_tx_rate(hw, info);
        if (txrate)
                bitrate = txrate->bitrate;

        mgmt = (struct ieee80211_mgmt *) skb->data;
        /* fake header transmission time */
        data->abs_bcn_ts = mac80211_hwsim_get_tsf_raw();
        if (ieee80211_is_s1g_beacon(mgmt->frame_control)) {
                struct ieee80211_ext *ext = (void *) mgmt;

                /* S1G beacons carry a 32-bit timestamp and a 10-byte header */
                ext->u.s1g_beacon.timestamp = cpu_to_le32(data->abs_bcn_ts +
                                                          data->tsf_offset +
                                                          10 * 8 * 10 /
                                                          bitrate);
        } else {
                mgmt->u.beacon.timestamp = cpu_to_le64(data->abs_bcn_ts +
                                                       data->tsf_offset +
                                                       24 * 8 * 10 /
                                                       bitrate);
        }

        /*
         * NOTE(review): the channel context pointer is dereferenced without
         * a NULL check - presumably a beaconing link always has one; confirm.
         */
        mac80211_hwsim_tx_frame(hw, skb,
                        rcu_dereference(link_conf->chanctx_conf)->def.chan);
}

/*
 * Interface iterator callback that sends the beacon(s) of one link.
 *
 * @arg: the struct mac80211_hwsim_link_data of the link whose timer fired
 * @mac: unused
 * @vif: interface being visited
 *
 * Nothing is sent when the interface has no conf for this link, is not of
 * a beaconing type (AP, mesh point, ad-hoc, OCB), or is a non-transmitting
 * member of a multiple-BSSID set. After the beacon(s), buffered
 * broadcast/multicast frames are sent and pending channel-switch or
 * color-change countdowns are finished once complete.
 */
static void mac80211_hwsim_beacon_tx(void *arg, u8 *mac,
                                     struct ieee80211_vif *vif)
{
        struct mac80211_hwsim_link_data *link_data = arg;
        u32 link_id = link_data->link_id;
        struct ieee80211_bss_conf *link_conf;
        /* link_data is element link_id of the radio's link_data[] array */
        struct mac80211_hwsim_data *data =
                container_of(link_data, struct mac80211_hwsim_data,
                             link_data[link_id]);
        struct ieee80211_hw *hw = data->hw;
        struct sk_buff *skb;

        hwsim_check_magic(vif);

        link_conf = rcu_dereference(vif->link_conf[link_id]);
        if (!link_conf)
                return;

        if (vif->type != NL80211_IFTYPE_AP &&
            vif->type != NL80211_IFTYPE_MESH_POINT &&
            vif->type != NL80211_IFTYPE_ADHOC &&
            vif->type != NL80211_IFTYPE_OCB)
                return;

        /* only the transmitting vif of an MBSSID set sends beacons */
        if (vif->mbssid_tx_vif && vif->mbssid_tx_vif != vif)
                return;

        if (vif->bss_conf.ema_ap) {
                struct ieee80211_ema_beacons *ema;
                u8 i = 0;

                /* EMA AP: send every beacon template of the list */
                ema = ieee80211_beacon_get_template_ema_list(hw, vif, link_id);
                if (!ema || !ema->cnt)
                        return;

                for (i = 0; i < ema->cnt; i++) {
                        __mac80211_hwsim_beacon_tx(link_conf, data, hw, vif,
                                                   ema->bcn[i].skb);
                        ema->bcn[i].skb = NULL; /* Already freed */
                }
                ieee80211_beacon_free_ema_list(ema);
        } else {
                skb = ieee80211_beacon_get(hw, vif, link_id);
                if (!skb)
                        return;

                __mac80211_hwsim_beacon_tx(link_conf, data, hw, vif, skb);
        }

        /* flush frames buffered for delivery after the beacon */
        while ((skb = ieee80211_get_buffered_bc(hw, vif)) != NULL) {
                mac80211_hwsim_tx_frame(hw, skb,
                        rcu_dereference(link_conf->chanctx_conf)->def.chan);
        }

        if (link_conf->csa_active && ieee80211_beacon_cntdwn_is_complete(vif, link_id))
                ieee80211_csa_finish(vif, link_id);

        if (link_conf->color_change_active &&
            ieee80211_beacon_cntdwn_is_complete(vif, link_id))
                ieee80211_color_change_finish(vif, link_id);
}

/*
 * Per-link beacon hrtimer callback.
 *
 * Sends beacons on all active interfaces for the link owning @timer and
 * re-arms the timer one beacon interval ahead, shortened once by any
 * pending data->bcn_delta. The timer is not restarted when the radio is
 * no longer started.
 */
static enum hrtimer_restart
mac80211_hwsim_beacon(struct hrtimer *timer)
{
        struct mac80211_hwsim_link_data *ld =
                container_of(timer, struct mac80211_hwsim_link_data, beacon_timer);
        struct mac80211_hwsim_data *hwsim =
                container_of(ld, struct mac80211_hwsim_data,
                             link_data[ld->link_id]);
        struct ieee80211_hw *hw = hwsim->hw;
        u64 interval_us = ld->beacon_int;

        if (!hwsim->started)
                return HRTIMER_NORESTART;

        ieee80211_iterate_active_interfaces_atomic(hw,
                                                   IEEE80211_IFACE_ITER_NORMAL,
                                                   mac80211_hwsim_beacon_tx,
                                                   ld);

        /* next beacon at new TBTT + beacon interval; apply the delta once */
        if (hwsim->bcn_delta) {
                interval_us -= hwsim->bcn_delta;
                hwsim->bcn_delta = 0;
        }

        hrtimer_forward_now(&ld->beacon_timer,
                            ns_to_ktime(interval_us * NSEC_PER_USEC));

        return HRTIMER_RESTART;
}

static const char * const hwsim_chanwidths[] = {
        [NL80211_CHAN_WIDTH_5] = "ht5",
        [NL80211_CHAN_WIDTH_10] = "ht10",
        [NL80211_CHAN_WIDTH_20_NOHT] = "noht",
        [NL80211_CHAN_WIDTH_20] = "ht20",
        [NL80211_CHAN_WIDTH_40] = "ht40",
        [NL80211_CHAN_WIDTH_80] = "vht80",
        [NL80211_CHAN_WIDTH_80P80] = "vht80p80",
        [NL80211_CHAN_WIDTH_160] = "vht160",
        [NL80211_CHAN_WIDTH_1] = "1MHz",
        [NL80211_CHAN_WIDTH_2] = "2MHz",
        [NL80211_CHAN_WIDTH_4] = "4MHz",
        [NL80211_CHAN_WIDTH_8] = "8MHz",
        [NL80211_CHAN_WIDTH_16] = "16MHz",
};

/*
 * Apply the current hw->conf to the simulated radio.
 *
 * Logs the configuration, records the idle flag, channel and bandwidth,
 * maintains the per-channel survey bookkeeping while a software scan is
 * running and (re)arms or cancels the per-link beacon timers.
 *
 * @changed is not looked at; the whole configuration is re-applied.
 * Always returns 0.
 */
static int mac80211_hwsim_config(struct ieee80211_hw *hw, u32 changed)
{
        struct mac80211_hwsim_data *data = hw->priv;
        struct ieee80211_conf *conf = &hw->conf;
        /* printable names for conf->smps_mode, debug output only */
        static const char *smps_modes[IEEE80211_SMPS_NUM_MODES] = {
                [IEEE80211_SMPS_AUTOMATIC] = "auto",
                [IEEE80211_SMPS_OFF] = "off",
                [IEEE80211_SMPS_STATIC] = "static",
                [IEEE80211_SMPS_DYNAMIC] = "dynamic",
        };
        int idx;

        if (conf->chandef.chan)
                wiphy_dbg(hw->wiphy,
                          "%s (freq=%d(%d - %d)/%s idle=%d ps=%d smps=%s)\n",
                          __func__,
                          conf->chandef.chan->center_freq,
                          conf->chandef.center_freq1,
                          conf->chandef.center_freq2,
                          hwsim_chanwidths[conf->chandef.width],
                          !!(conf->flags & IEEE80211_CONF_IDLE),
                          !!(conf->flags & IEEE80211_CONF_PS),
                          smps_modes[conf->smps_mode]);
        else
                wiphy_dbg(hw->wiphy,
                          "%s (freq=0 idle=%d ps=%d smps=%s)\n",
                          __func__,
                          !!(conf->flags & IEEE80211_CONF_IDLE),
                          !!(conf->flags & IEEE80211_CONF_PS),
                          smps_modes[conf->smps_mode]);

        data->idle = !!(conf->flags & IEEE80211_CONF_IDLE);

        /* a channel in hw->conf is not expected when channel contexts are used */
        WARN_ON(conf->chandef.chan && data->use_chanctx);

        mutex_lock(&data->mutex);
        if (data->scanning && conf->chandef.chan) {
                /*
                 * Leaving the old channel during a scan: close its survey
                 * entry, the visit lasted from next_start until now.
                 */
                for (idx = 0; idx < ARRAY_SIZE(data->survey_data); idx++) {
                        if (data->survey_data[idx].channel == data->channel) {
                                data->survey_data[idx].start =
                                        data->survey_data[idx].next_start;
                                data->survey_data[idx].end = jiffies;
                                break;
                        }
                }

                data->channel = conf->chandef.chan;
                data->bw = conf->chandef.width;

                /*
                 * Entering the new channel: reuse its survey entry if one
                 * exists, otherwise claim the first unused one, and stamp
                 * the time the visit begins.
                 */
                for (idx = 0; idx < ARRAY_SIZE(data->survey_data); idx++) {
                        if (data->survey_data[idx].channel &&
                            data->survey_data[idx].channel != data->channel)
                                continue;
                        data->survey_data[idx].channel = data->channel;
                        data->survey_data[idx].next_start = jiffies;
                        break;
                }
        } else {
                /* not scanning (or no channel): just track channel and width */
                data->channel = conf->chandef.chan;
                data->bw = conf->chandef.width;
        }
        mutex_unlock(&data->mutex);

        for (idx = 0; idx < ARRAY_SIZE(data->link_data); idx++) {
                struct mac80211_hwsim_link_data *link_data =
                        &data->link_data[idx];

                if (!data->started || !link_data->beacon_int) {
                        /* radio stopped or link not beaconing: no timer */
                        hrtimer_cancel(&link_data->beacon_timer);
                } else if (!hrtimer_is_queued(&link_data->beacon_timer)) {
                        /*
                         * Arm the timer for the next TBTT: do_div() returns
                         * the remainder of tsf / bcn_int, i.e. the time
                         * already elapsed in the current beacon interval.
                         */
                        u64 tsf = mac80211_hwsim_get_tsf(hw, NULL);
                        u32 bcn_int = link_data->beacon_int;
                        u64 until_tbtt = bcn_int - do_div(tsf, bcn_int);

                        hrtimer_start(&link_data->beacon_timer,
                                      ns_to_ktime(until_tbtt * NSEC_PER_USEC),
                                      HRTIMER_MODE_REL_SOFT);
                }
        }

        return 0;
}


/*
 * RX filter configuration callback.
 *
 * Only FIF_ALLMULTI and FIF_MCAST_ACTION are honoured: the requested flags
 * are masked down to those two, stored in the radio's rx_filter and
 * reported back through @total_flags. @changed_flags and @multicast are
 * not used.
 */
static void mac80211_hwsim_configure_filter(struct ieee80211_hw *hw,
                                            unsigned int changed_flags,
                                            unsigned int *total_flags,
                                            u64 multicast)
{
        const unsigned int supported = FIF_ALLMULTI | FIF_MCAST_ACTION;
        struct mac80211_hwsim_data *hwsim = hw->priv;

        wiphy_dbg(hw->wiphy, "%s\n", __func__);

        hwsim->rx_filter = *total_flags & supported;
        *total_flags = hwsim->rx_filter;
}

/*
 * Interface iterator: counts interfaces that have beaconing enabled.
 * @data points to the unsigned int counter to increment; @mac is unused.
 */
static void mac80211_hwsim_bcn_en_iter(void *data, u8 *mac,
                                       struct ieee80211_vif *vif)
{
        struct hwsim_vif_priv *priv = (void *)vif->drv_priv;
        unsigned int *n_beaconing = data;

        if (!priv->bcn_en)
                return;

        *n_beaconing += 1;
}

/*
 * Interface-level (non per-link) configuration change callback.
 *
 * Mirrors association state and AID into the vif private data and, for
 * station interfaces, asks mac80211 to activate all usable links whenever
 * the valid-links set or the TID-to-link mapping changed.
 */
static void mac80211_hwsim_vif_info_changed(struct ieee80211_hw *hw,
                                            struct ieee80211_vif *vif,
                                            u64 changed)
{
        const u64 link_set_bits = BSS_CHANGED_MLD_VALID_LINKS |
                                  BSS_CHANGED_MLD_TTLM;
        struct hwsim_vif_priv *priv = (void *)vif->drv_priv;
        u16 usable;

        hwsim_check_magic(vif);

        wiphy_dbg(hw->wiphy, "%s(changed=0x%llx vif->addr=%pM)\n",
                  __func__, changed, vif->addr);

        if (changed & BSS_CHANGED_ASSOC) {
                wiphy_dbg(hw->wiphy, "  ASSOC: assoc=%d aid=%d\n",
                          vif->cfg.assoc, vif->cfg.aid);
                priv->assoc = vif->cfg.assoc;
                priv->aid = vif->cfg.aid;
        }

        /* link activation is only driven from here for station interfaces */
        if (vif->type != NL80211_IFTYPE_STATION)
                return;

        if (!(changed & link_set_bits))
                return;

        usable = ieee80211_vif_usable_links(vif);
        if (usable != vif->active_links)
                ieee80211_set_active_links_async(vif, usable);
}

/*
 * Per-link BSS configuration change callback.
 *
 * Tracks the BSSID and beacon-enable state in the vif private data,
 * starts or stops the link's beacon timer accordingly and logs the
 * remaining changes (ERP, HT, basic rates, TX power) for debugging.
 */
static void mac80211_hwsim_link_info_changed(struct ieee80211_hw *hw,
                                             struct ieee80211_vif *vif,
                                             struct ieee80211_bss_conf *info,
                                             u64 changed)
{
        struct hwsim_vif_priv *vp = (void *)vif->drv_priv;
        struct mac80211_hwsim_data *data = hw->priv;
        unsigned int link_id = info->link_id;
        struct mac80211_hwsim_link_data *link_data = &data->link_data[link_id];

        hwsim_check_magic(vif);

        wiphy_dbg(hw->wiphy, "%s(changed=0x%llx vif->addr=%pM, link id %u)\n",
                  __func__, (unsigned long long)changed, vif->addr, link_id);

        if (changed & BSS_CHANGED_BSSID) {
                wiphy_dbg(hw->wiphy, "%s: BSSID changed: %pM\n",
                          __func__, info->bssid);
                memcpy(vp->bssid, info->bssid, ETH_ALEN);
        }

        if (changed & BSS_CHANGED_BEACON_ENABLED) {
                wiphy_dbg(hw->wiphy, "  BCN EN: %d (BI=%u)\n",
                          info->enable_beacon, info->beacon_int);
                vp->bcn_en = info->enable_beacon;
                if (data->started &&
                    !hrtimer_is_queued(&link_data->beacon_timer) &&
                    info->enable_beacon) {
                        u64 tsf, until_tbtt;
                        u32 bcn_int;

                        /* info->beacon_int is scaled by 1024 for the timer */
                        link_data->beacon_int = info->beacon_int * 1024;
                        tsf = mac80211_hwsim_get_tsf(hw, vif);
                        bcn_int = link_data->beacon_int;
                        /*
                         * do_div() returns the remainder, i.e. how far we
                         * are into the current beacon interval.
                         */
                        until_tbtt = bcn_int - do_div(tsf, bcn_int);

                        hrtimer_start(&link_data->beacon_timer,
                                      ns_to_ktime(until_tbtt * NSEC_PER_USEC),
                                      HRTIMER_MODE_REL_SOFT);
                } else if (!info->enable_beacon) {
                        unsigned int count = 0;

                        /*
                         * Only stop the timer once no active interface on
                         * this radio has beaconing enabled any more.
                         */
                        ieee80211_iterate_active_interfaces_atomic(
                                data->hw, IEEE80211_IFACE_ITER_NORMAL,
                                mac80211_hwsim_bcn_en_iter, &count);
                        wiphy_dbg(hw->wiphy, "  beaconing vifs remaining: %u\n",
                                  count);
                        if (count == 0) {
                                hrtimer_cancel(&link_data->beacon_timer);
                                link_data->beacon_int = 0;
                        }
                }
        }

        if (changed & BSS_CHANGED_ERP_CTS_PROT) {
                wiphy_dbg(hw->wiphy, "  ERP_CTS_PROT: %d\n",
                          info->use_cts_prot);
        }

        if (changed & BSS_CHANGED_ERP_PREAMBLE) {
                wiphy_dbg(hw->wiphy, "  ERP_PREAMBLE: %d\n",
                          info->use_short_preamble);
        }

        if (changed & BSS_CHANGED_ERP_SLOT) {
                wiphy_dbg(hw->wiphy, "  ERP_SLOT: %d\n", info->use_short_slot);
        }

        if (changed & BSS_CHANGED_HT) {
                wiphy_dbg(hw->wiphy, "  HT: op_mode=0x%x\n",
                          info->ht_operation_mode);
        }

        if (changed & BSS_CHANGED_BASIC_RATES) {
                wiphy_dbg(hw->wiphy, "  BASIC_RATES: 0x%llx\n",
                          (unsigned long long) info->basic_rates);
        }

        if (changed & BSS_CHANGED_TXPOWER)
                wiphy_dbg(hw->wiphy, "  TX Power: %d dBm\n", info->txpower);
}

/*
 * Rate-control update callback: sanity-check the station bandwidth.
 *
 * For every link the station has, translate the link's RX bandwidth to
 * MHz and warn if it exceeds the width of the channel the link operates
 * on (the radio-wide width without channel contexts, the link's channel
 * context width otherwise). @changed is not used.
 */
static void
mac80211_hwsim_sta_rc_update(struct ieee80211_hw *hw,
                             struct ieee80211_vif *vif,
                             struct ieee80211_sta *sta,
                             u32 changed)
{
        struct mac80211_hwsim_data *data = hw->priv;
        int link_id;

        rcu_read_lock();
        for (link_id = 0;
             link_id < ARRAY_SIZE(vif->link_conf);
             link_id++) {
                enum nl80211_chan_width confbw = NL80211_CHAN_WIDTH_20_NOHT;
                struct ieee80211_bss_conf *vif_conf;
                struct ieee80211_link_sta *link_sta;
                /*
                 * Start from "invalid" for every link so that an unhandled
                 * bandwidth value trips the warning below instead of
                 * silently reusing the previous link's result.
                 */
                u32 bw = U32_MAX;

                link_sta = rcu_dereference(sta->link[link_id]);

                if (!link_sta)
                        continue;

                switch (link_sta->bandwidth) {
#define C(_bw) case IEEE80211_STA_RX_BW_##_bw: bw = _bw; break
                C(20);
                C(40);
                C(80);
                C(160);
                C(320);
#undef C
                }

                if (!data->use_chanctx) {
                        confbw = data->bw;
                } else {
                        struct ieee80211_chanctx_conf *chanctx_conf;

                        vif_conf = rcu_dereference(vif->link_conf[link_id]);
                        if (WARN_ON(!vif_conf))
                                continue;

                        chanctx_conf = rcu_dereference(vif_conf->chanctx_conf);

                        /* falls back to 20 MHz no-HT without a context */
                        if (!WARN_ON(!chanctx_conf))
                                confbw = chanctx_conf->def.width;
                }

                /* report the values that were actually compared */
                WARN(bw > hwsim_get_chanwidth(confbw),
                     "intf %pM [link=%d]: bad STA %pM bandwidth %d MHz (%d) > channel config %d MHz (%d)\n",
                     vif->addr, link_id, sta->addr, bw, link_sta->bandwidth,
                     hwsim_get_chanwidth(confbw), confbw);
        }
        rcu_read_unlock();
}

/*
 * Set up driver state for a newly added station.
 *
 * Marks the station private data with the hwsim magic, runs the bandwidth
 * sanity check and, for a station with valid links, records those links
 * as the active RX links (warning if there is more than one).
 * Always returns 0.
 */
static int mac80211_hwsim_sta_add(struct ieee80211_hw *hw,
                                  struct ieee80211_vif *vif,
                                  struct ieee80211_sta *sta)
{
        struct hwsim_sta_priv *priv = (void *)sta->drv_priv;

        hwsim_check_magic(vif);
        hwsim_set_sta_magic(sta);
        mac80211_hwsim_sta_rc_update(hw, vif, sta, 0);

        if (!sta->valid_links)
                return 0;

        WARN(hweight16(sta->valid_links) > 1,
             "expect to add STA with single link, have 0x%x\n",
             sta->valid_links);
        priv->active_links_rx = sta->valid_links;

        return 0;
}

/*
 * Tear down driver state for a station: verify the vif magic and clear
 * the station's magic marker. @hw is unused. Always returns 0.
 */
static int mac80211_hwsim_sta_remove(struct ieee80211_hw *hw,
                                     struct ieee80211_vif *vif,
                                     struct ieee80211_sta *sta)
{
        int ret = 0;

        hwsim_check_magic(vif);
        hwsim_clear_sta_magic(sta);

        return ret;
}

/*
 * Station state transition callback.
 *
 * Transitions to/from IEEE80211_STA_NOTEXIST are mapped onto the station
 * remove/add helpers. For any other transition the only action taken is
 * enabling all usable links once the AP station of an MLO client
 * connection becomes authorized.
 */
static int mac80211_hwsim_sta_state(struct ieee80211_hw *hw,
                                    struct ieee80211_vif *vif,
                                    struct ieee80211_sta *sta,
                                    enum ieee80211_sta_state old_state,
                                    enum ieee80211_sta_state new_state)
{
        if (new_state == IEEE80211_STA_NOTEXIST)
                return mac80211_hwsim_sta_remove(hw, vif, sta);

        if (old_state == IEEE80211_STA_NOTEXIST)
                return mac80211_hwsim_sta_add(hw, vif, sta);

        /* everything below only concerns MLO client (station) interfaces */
        if (!ieee80211_vif_is_mld(vif))
                return 0;

        if (vif->type != NL80211_IFTYPE_STATION)
                return 0;

        /* TDLS peers do not trigger link activation */
        if (new_state != IEEE80211_STA_AUTHORIZED || sta->tdls)
                return 0;

        /* client is authorized (AP station marked as such): enable all links */
        ieee80211_set_active_links_async(vif,
                                         ieee80211_vif_usable_links(vif));

        return 0;
}

/*
 * Station power-save notification callback.
 *
 * Sleep/awake notifications are accepted but currently ignored; any other
 * command value triggers a warning.
 */
static void mac80211_hwsim_sta_notify(struct ieee80211_hw *hw,
                                      struct ieee80211_vif *vif,
                                      enum sta_notify_cmd cmd,
                                      struct ieee80211_sta *sta)
{
        hwsim_check_magic(vif);

        if (cmd == STA_NOTIFY_SLEEP || cmd == STA_NOTIFY_AWAKE) {
                /* TODO: make good use of these flags */
                return;
        }

        WARN(1, "Invalid sta notify: %d\n", cmd);
}

/*
 * TIM update callback. Nothing is programmed; the station's magic marker
 * is verified and 0 is returned. @hw and @set are unused.
 */
static int mac80211_hwsim_set_tim(struct ieee80211_hw *hw,
                                  struct ieee80211_sta *sta,
                                  bool set)
{
        int ret = 0;

        hwsim_check_sta_magic(sta);

        return ret;
}

/*
 * TX queue parameter callback. The parameters are only logged, nothing is
 * stored. @vif and @link_id are unused. Always returns 0.
 */
static int mac80211_hwsim_conf_tx(struct ieee80211_hw *hw,
                                  struct ieee80211_vif *vif,
                                  unsigned int link_id, u16 queue,
                                  const struct ieee80211_tx_queue_params *params)
{
        int ret = 0;

        wiphy_dbg(hw->wiphy,
                  "%s (queue=%d txop=%d cw_min=%d cw_max=%d aifs=%d)\n",
                  __func__,
                  queue,
                  params->txop,
                  params->cw_min,
                  params->cw_max,
                  params->aifs);

        return ret;
}

/*
 * Survey dump callback.
 *
 * Fills @survey from survey slot @idx. Returns -ENOENT when @idx is out
 * of range or the slot has no channel recorded, 0 otherwise.
 */
static int mac80211_hwsim_get_survey(struct ieee80211_hw *hw, int idx,
                                     struct survey_info *survey)
{
        struct mac80211_hwsim_data *hwsim = hw->priv;
        int ret = -ENOENT;

        if (idx < 0 || idx >= ARRAY_SIZE(hwsim->survey_data))
                return ret;

        mutex_lock(&hwsim->mutex);

        survey->channel = hwsim->survey_data[idx].channel;
        if (!survey->channel)
                goto unlock;

        /*
         * The noise and busy-time figures below are made up, which is only
         * acceptable because this is simulated hardware. A real driver
         * that cannot measure such values MUST NOT report them at all.
         */
        survey->filled = SURVEY_INFO_NOISE_DBM |
                         SURVEY_INFO_TIME |
                         SURVEY_INFO_TIME_BUSY;
        survey->noise = -92;
        survey->time = jiffies_to_msecs(hwsim->survey_data[idx].end -
                                        hwsim->survey_data[idx].start);
        /* report 12.5% of channel time is used */
        survey->time_busy = survey->time / 8;
        ret = 0;

unlock:
        mutex_unlock(&hwsim->mutex);
        return ret;
}

/*
 * Decide whether a negotiated TID-to-link mapping is acceptable.
 *
 * For testing purposes the mapping is accepted only when every TID maps
 * to the same link set in both directions; anything else is rejected.
 * @hw and @vif are unused.
 */
static enum ieee80211_neg_ttlm_res
mac80211_hwsim_can_neg_ttlm(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
                            struct ieee80211_neg_ttlm *neg_ttlm)
{
        u32 tid;

        for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++) {
                /* uplink and downlink must agree for this TID ... */
                if (neg_ttlm->downlink[tid] != neg_ttlm->uplink[tid])
                        return NEG_TTLM_RES_REJECT;

                /* ... and match the link set used by TID 0 */
                if (neg_ttlm->downlink[tid] != neg_ttlm->downlink[0])
                        return NEG_TTLM_RES_REJECT;
        }

        return NEG_TTLM_RES_ACCEPT;
}

#ifdef CONFIG_NL80211_TESTMODE
/*
 * This section contains example code for using netlink
 * attributes with the testmode command in nl80211.
 */

/* These enums need to be kept in sync with userspace */
/*
 * Netlink attribute types carried in a hwsim testmode message; both
 * payload attributes are parsed as u32 (see hwsim_testmode_policy).
 */
enum hwsim_testmode_attr {
        __HWSIM_TM_ATTR_INVALID        = 0,
        /* which enum hwsim_testmode_cmd to execute */
        HWSIM_TM_ATTR_CMD        = 1,
        /* power-save value, argument of SET_PS and reply of GET_PS */
        HWSIM_TM_ATTR_PS        = 2,

        /* keep last */
        __HWSIM_TM_ATTR_AFTER_LAST,
        HWSIM_TM_ATTR_MAX        = __HWSIM_TM_ATTR_AFTER_LAST - 1
};

/* Commands selectable through HWSIM_TM_ATTR_CMD in a testmode message. */
enum hwsim_testmode_cmd {
        /* set the power-save value from HWSIM_TM_ATTR_PS */
        HWSIM_TM_CMD_SET_PS                = 0,
        /* reply with the current power-save value in HWSIM_TM_ATTR_PS */
        HWSIM_TM_CMD_GET_PS                = 1,
        /* stop all TX queues of the radio */
        HWSIM_TM_CMD_STOP_QUEUES        = 2,
        /* wake all TX queues of the radio */
        HWSIM_TM_CMD_WAKE_QUEUES        = 3,
};

/* Validation policy for testmode attributes: both are 32-bit unsigned. */
static const struct nla_policy hwsim_testmode_policy[HWSIM_TM_ATTR_MAX + 1] = {
        [HWSIM_TM_ATTR_CMD] = { .type = NLA_U32 },
        [HWSIM_TM_ATTR_PS] = { .type = NLA_U32 },
};

/*
 * nl80211 testmode command handler.
 *
 * Parses the attribute blob in @data/@len and executes the command given
 * by HWSIM_TM_ATTR_CMD. Returns 0 or the result of the invoked helper on
 * success, a negative errno otherwise:
 *   -EINVAL      a required attribute is missing,
 *   -ENOMEM      the reply skb could not be allocated,
 *   -ENOBUFS     the reply attribute did not fit,
 *   -EOPNOTSUPP  unknown command.
 * @vif is unused.
 */
static int mac80211_hwsim_testmode_cmd(struct ieee80211_hw *hw,
                                       struct ieee80211_vif *vif,
                                       void *data, int len)
{
        struct mac80211_hwsim_data *hwsim = hw->priv;
        struct nlattr *attrs[HWSIM_TM_ATTR_MAX + 1];
        struct sk_buff *reply;
        int ret, ps;

        ret = nla_parse_deprecated(attrs, HWSIM_TM_ATTR_MAX, data, len,
                                   hwsim_testmode_policy, NULL);
        if (ret)
                return ret;

        if (!attrs[HWSIM_TM_ATTR_CMD])
                return -EINVAL;

        switch (nla_get_u32(attrs[HWSIM_TM_ATTR_CMD])) {
        case HWSIM_TM_CMD_SET_PS:
                if (!attrs[HWSIM_TM_ATTR_PS])
                        return -EINVAL;
                ps = nla_get_u32(attrs[HWSIM_TM_ATTR_PS]);
                return hwsim_fops_ps_write(hwsim, ps);

        case HWSIM_TM_CMD_GET_PS:
                reply = cfg80211_testmode_alloc_reply_skb(hw->wiphy,
                                                nla_total_size(sizeof(u32)));
                if (!reply)
                        return -ENOMEM;
                if (nla_put_u32(reply, HWSIM_TM_ATTR_PS, hwsim->ps)) {
                        /* the reply was never handed off, drop it here */
                        kfree_skb(reply);
                        return -ENOBUFS;
                }
                return cfg80211_testmode_reply(reply);

        case HWSIM_TM_CMD_STOP_QUEUES:
                ieee80211_stop_queues(hw);
                return 0;

        case HWSIM_TM_CMD_WAKE_QUEUES:
                ieee80211_wake_queues(hw);
                return 0;

        default:
                return -EOPNOTSUPP;
        }
}
#endif

/*
 * A-MPDU session management callback.
 *
 * TX session start is acknowledged immediately, TX stop requests are
 * completed right away through the irqsafe stop callback, and the
 * remaining known actions need no work. Unknown actions yield
 * -EOPNOTSUPP. @hw is unused.
 */
static int mac80211_hwsim_ampdu_action(struct ieee80211_hw *hw,
                                       struct ieee80211_vif *vif,
                                       struct ieee80211_ampdu_params *params)
{
        switch (params->action) {
        case IEEE80211_AMPDU_TX_START:
                return IEEE80211_AMPDU_TX_START_IMMEDIATE;

        case IEEE80211_AMPDU_TX_STOP_CONT:
        case IEEE80211_AMPDU_TX_STOP_FLUSH:
        case IEEE80211_AMPDU_TX_STOP_FLUSH_CONT:
                ieee80211_stop_tx_ba_cb_irqsafe(vif, params->sta->addr,
                                                params->tid);
                return 0;

        case IEEE80211_AMPDU_TX_OPERATIONAL:
        case IEEE80211_AMPDU_RX_START:
        case IEEE80211_AMPDU_RX_STOP:
                /* nothing to do */
                return 0;

        default:
                return -EOPNOTSUPP;
        }
}

/*
 * Flush callback: intentionally a no-op, all parameters are unused.
 */
static void mac80211_hwsim_flush(struct ieee80211_hw *hw,
                                 struct ieee80211_vif *vif,
                                 u32 queues, bool drop)
{
        /* Not implemented, queues only on kernel side */
        return;
}

/*
 * Delayed work driving a simulated hardware scan, one channel per run.
 *
 * Each invocation either finishes the scan (all requested channels were
 * visited) or "tunes" to the next channel, optionally sends probe
 * requests, records a survey entry for the channel and re-queues itself
 * after the dwell time.
 */
static void hw_scan_work(struct work_struct *work)
{
        struct mac80211_hwsim_data *hwsim =
                container_of(work, struct mac80211_hwsim_data, hw_scan.work);
        /* NOTE(review): sampled before hwsim->mutex is taken below */
        struct cfg80211_scan_request *req = hwsim->hw_scan_request;
        int dwell, i;

        mutex_lock(&hwsim->mutex);
        if (hwsim->scan_chan_idx >= req->n_channels) {
                /* every requested channel was visited: report completion */
                struct cfg80211_scan_info info = {
                        .aborted = false,
                };

                wiphy_dbg(hwsim->hw->wiphy, "hw scan complete\n");
                ieee80211_scan_completed(hwsim->hw, &info);
                hwsim->hw_scan_request = NULL;
                hwsim->hw_scan_vif = NULL;
                hwsim->tmp_chan = NULL;
                mutex_unlock(&hwsim->mutex);
                /* counterpart of the 'true' call made when the scan began */
                mac80211_hwsim_config_mac_nl(hwsim->hw, hwsim->scan_addr,
                                             false);
                return;
        }

        wiphy_dbg(hwsim->hw->wiphy, "hw scan %d MHz\n",
                  req->channels[hwsim->scan_chan_idx]->center_freq);

        hwsim->tmp_chan = req->channels[hwsim->scan_chan_idx];
        if (hwsim->tmp_chan->flags & (IEEE80211_CHAN_NO_IR |
                                      IEEE80211_CHAN_RADAR) ||
            !req->n_ssids) {
                /* no probing on this channel: stay longer (120 ms) */
                dwell = 120;
        } else {
                /* probing: shorter dwell (30 ms), one probe per SSID */
                dwell = 30;
                /* send probes */
                for (i = 0; i < req->n_ssids; i++) {
                        struct sk_buff *probe;
                        struct ieee80211_mgmt *mgmt;

                        probe = ieee80211_probereq_get(hwsim->hw,
                                                       hwsim->scan_addr,
                                                       req->ssids[i].ssid,
                                                       req->ssids[i].ssid_len,
                                                       req->ie_len);
                        if (!probe)
                                continue;

                        /* address the probe to the BSSID given in the request */
                        mgmt = (struct ieee80211_mgmt *) probe->data;
                        memcpy(mgmt->da, req->bssid, ETH_ALEN);
                        memcpy(mgmt->bssid, req->bssid, ETH_ALEN);

                        /* append the extra IEs supplied with the request */
                        if (req->ie_len)
                                skb_put_data(probe, req->ie, req->ie_len);

                        rcu_read_lock();
                        if (!ieee80211_tx_prepare_skb(hwsim->hw,
                                                      hwsim->hw_scan_vif,
                                                      probe,
                                                      hwsim->tmp_chan->band,
                                                      NULL)) {
                                rcu_read_unlock();
                                kfree_skb(probe);
                                continue;
                        }

                        /* the TX path is invoked with bottom halves disabled */
                        local_bh_disable();
                        mac80211_hwsim_tx_frame(hwsim->hw, probe,
                                                hwsim->tmp_chan);
                        rcu_read_unlock();
                        local_bh_enable();
                }
        }
        /* come back for the next channel once the dwell time has elapsed */
        ieee80211_queue_delayed_work(hwsim->hw, &hwsim->hw_scan,
                                     msecs_to_jiffies(dwell));
        /* survey slot N describes the N-th scanned channel */
        hwsim->survey_data[hwsim->scan_chan_idx].channel = hwsim->tmp_chan;
        hwsim->survey_data[hwsim->scan_chan_idx].start = jiffies;
        hwsim->survey_data[hwsim->scan_chan_idx].end =
                jiffies + msecs_to_jiffies(dwell);
        hwsim->scan_chan_idx++;
        mutex_unlock(&hwsim->mutex);
}

static int mac80211_hwsim_hw_scan(struct ieee80211_hw *hw,
                                  struct ieee80211_vif *vif,
                                  struct ieee80211_scan_request *hw_req)
{
        struct mac80211_hwsim_data *hwsim = hw->priv;
        struct cfg80211_scan_request *req = &hw_req->req;

        mutex_lock(&hwsim->mutex);
        if (WARN_ON(hwsim->tmp_chan || hwsim->hw_scan_request)) {
                mutex_unlock(&hwsim->mutex);
                return -EBUSY;
        }
        hwsim->hw_scan_request = req;
        hwsim->hw_scan_vif = vif;
        hwsim->scan_chan_idx = 0;
        if (req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR)
                get_random_mask_addr(hwsim->scan_addr,
                                     hw_req->req.mac_addr,
                                     hw_req->req.mac_addr_mask);
        else
                memcpy(hwsim->scan_addr, vif->addr, ETH_ALEN);
        memset(hwsim->survey_data, 0, sizeof(hwsim->survey_data));
        mutex_unlock(&hwsim->mutex);

        mac80211_hwsim_config_mac_nl(hw, hwsim->scan_addr, true);
        wiphy_dbg(hw->wiphy, "hwsim hw_scan request\n");

        ieee80211_queue_delayed_work(hwsim->hw, &hwsim->hw_scan, 0);

        return 0;
}

static void mac80211_hwsim_cancel_hw_scan(struct ieee80211_hw *hw,
                                          struct ieee80211_vif *vif)
{
        struct mac80211_hwsim_data *hwsim = hw->priv;
        struct cfg80211_scan_info info = {
                .aborted = true,
        };

        wiphy_dbg(hw->wiphy, "hwsim cancel_hw_scan\n");

        cancel_delayed_work_sync(&hwsim->hw_scan);

        mutex_lock(&hwsim->mutex);
        ieee80211_scan_completed(hwsim->hw, &info);
        hwsim->tmp_chan = NULL;
        hwsim->hw_scan_request = NULL;
        hwsim->hw_scan_vif = NULL;
        mutex_unlock(&hwsim->mutex);
}

/*
 * Software scan start notification.
 *
 * Remembers the scan source address, marks the radio as scanning and
 * resets the survey data. A nested start while already scanning is only
 * logged. @vif is unused.
 */
static void mac80211_hwsim_sw_scan(struct ieee80211_hw *hw,
                                   struct ieee80211_vif *vif,
                                   const u8 *mac_addr)
{
        struct mac80211_hwsim_data *hwsim = hw->priv;

        mutex_lock(&hwsim->mutex);

        if (hwsim->scanning) {
                pr_debug("two hwsim sw_scans detected!\n");
        } else {
                pr_debug("hwsim sw_scan request, prepping stuff\n");

                memcpy(hwsim->scan_addr, mac_addr, ETH_ALEN);
                mac80211_hwsim_config_mac_nl(hw, hwsim->scan_addr, true);
                hwsim->scanning = true;
                memset(hwsim->survey_data, 0, sizeof(hwsim->survey_data));
        }

        mutex_unlock(&hwsim->mutex);
}

static void mac80211_hwsim_sw_scan_complete(struct ieee80211_hw *hw,
                                            struct ieee80211_vif *vif)
{
        struct mac80211_hwsim_data *hwsim = hw->priv;

        mutex_lock(&hwsim->mutex);

        pr_debug("hwsim sw_scan_complete\n");
        hwsim->scanning = false;
        mac80211_hwsim_config_mac_nl(hw, hwsim->scan_addr, false);
        eth_zero_addr(hwsim->scan_addr);

        mutex_unlock(&hwsim->mutex);
}

/*
 * Delayed work starting a remain-on-channel period: switches the
 * temporary channel to the requested one, tells mac80211 the radio is on
 * channel and schedules the expiry work after the requested duration.
 */
static void hw_roc_start(struct work_struct *work)
{
        struct mac80211_hwsim_data *hwsim;
        unsigned long expiry;

        hwsim = container_of(work, struct mac80211_hwsim_data, roc_start.work);

        mutex_lock(&hwsim->mutex);

        wiphy_dbg(hwsim->hw->wiphy, "hwsim ROC begins\n");
        hwsim->tmp_chan = hwsim->roc_chan;
        ieee80211_ready_on_channel(hwsim->hw);

        expiry = msecs_to_jiffies(hwsim->roc_duration);
        ieee80211_queue_delayed_work(hwsim->hw, &hwsim->roc_done, expiry);

        mutex_unlock(&hwsim->mutex);
}

/*
 * Delayed work ending a remain-on-channel period: notifies mac80211 that
 * the period expired and drops the temporary channel.
 */
static void hw_roc_done(struct work_struct *work)
{
        struct mac80211_hwsim_data *hwsim;

        hwsim = container_of(work, struct mac80211_hwsim_data, roc_done.work);

        mutex_lock(&hwsim->mutex);
        ieee80211_remain_on_channel_expired(hwsim->hw);
        hwsim->tmp_chan = NULL;
        mutex_unlock(&hwsim->mutex);

        wiphy_dbg(hwsim->hw->wiphy, "hwsim ROC expired\n");
}

static int mac80211_hwsim_roc(struct ieee80211_hw *hw,
                              struct ieee80211_vif *vif,
                              struct ieee80211_channel *chan,
                              int duration,
                              enum ieee80211_roc_type type)
{
        struct mac80211_hwsim_data *hwsim = hw->priv;

        mutex_lock(&hwsim->mutex);
        if (WARN_ON(hwsim->tmp_chan || hwsim->hw_scan_request)) {
                mutex_unlock(&hwsim->mutex);
                return -EBUSY;
        }

        hwsim->roc_chan = chan;
        hwsim->roc_duration = duration;
        mutex_unlock(&hwsim->mutex);

        wiphy_dbg(hw->wiphy, "hwsim ROC (%d MHz, %d ms)\n",
                  chan->center_freq, duration);
        ieee80211_queue_delayed_work(hw, &hwsim->roc_start, HZ/50);

        return 0;
}

/*
 * Cancel remain-on-channel: waits for both the start and the expiry work
 * to be cancelled, then drops the temporary channel. @vif is unused.
 * Always returns 0.
 */
static int mac80211_hwsim_croc(struct ieee80211_hw *hw,
                               struct ieee80211_vif *vif)
{
        struct mac80211_hwsim_data *hwsim = hw->priv;
        int ret = 0;

        cancel_delayed_work_sync(&hwsim->roc_start);
        cancel_delayed_work_sync(&hwsim->roc_done);

        mutex_lock(&hwsim->mutex);
        hwsim->tmp_chan = NULL;
        mutex_unlock(&hwsim->mutex);

        wiphy_dbg(hw->wiphy, "hwsim ROC canceled\n");

        return ret;
}

static int mac80211_hwsim_add_chanctx(struct ieee80211_hw *hw,
                                      struct ieee80211_chanctx_conf *ctx)
{
        hwsim_set_chanctx_magic(ctx);
        wiphy_dbg(hw->wiphy,
                  "add channel context control: %d MHz/width: %d/cfreqs:%d/%d MHz\n",
                  ctx->def.chan->center_freq, ctx->def.width,
                  ctx->def.center_freq1, ctx->def.center_freq2);
        return 0;
}

static void mac80211_hwsim_remove_chanctx(struct ieee80211_hw *hw,
                                          struct ieee80211_chanctx_conf *ctx)
{
        wiphy_dbg(hw->wiphy,
                  "remove channel context control: %d MHz/width: %d/cfreqs:%d/%d MHz\n",
                  ctx->def.chan->center_freq, ctx->def.width,
                  ctx->def.center_freq1, ctx->def.center_freq2);
        hwsim_check_chanctx_magic(ctx);
        hwsim_clear_chanctx_magic(ctx);
}

/*
 * Channel context change callback: verifies the context's hwsim magic and
 * logs the (new) channel definition. @changed is unused.
 */
static void mac80211_hwsim_change_chanctx(struct ieee80211_hw *hw,
                                          struct ieee80211_chanctx_conf *ctx,
                                          u32 changed)
{
        const struct cfg80211_chan_def *def = &ctx->def;

        hwsim_check_chanctx_magic(ctx);

        wiphy_dbg(hw->wiphy,
                  "change channel context control: %d MHz/width: %d/cfreqs:%d/%d MHz\n",
                  def->chan->center_freq, def->width,
                  def->center_freq1, def->center_freq2);
}

/*
 * Called when a link of @vif gets a channel context assigned.
 *
 * On an associated station interface a nullfunc frame is transmitted on
 * the context's channel so that a link activated after association is
 * woken up. Always returns 0.
 */
static int mac80211_hwsim_assign_vif_chanctx(struct ieee80211_hw *hw,
                                             struct ieee80211_vif *vif,
                                             struct ieee80211_bss_conf *link_conf,
                                             struct ieee80211_chanctx_conf *ctx)
{
        struct sk_buff *nullfunc;

        hwsim_check_magic(vif);
        hwsim_check_chanctx_magic(ctx);

        /* only an already associated station needs the wake-up frame */
        if (vif->type != NL80211_IFTYPE_STATION || !vif->cfg.assoc)
                return 0;

        nullfunc = ieee80211_nullfunc_get(hw, vif, link_conf->link_id, true);
        if (!nullfunc)
                return 0;

        local_bh_disable();
        mac80211_hwsim_tx_frame(hw, nullfunc, ctx->def.chan);
        local_bh_enable();

        return 0;
}

/*
 * .unassign_vif_chanctx callback: after checking the vif and chanctx
 * markers, an associated station interface transmits a nullfunc frame with
 * the power-management bit set on the context's channel, suspending the link
 * before it is deactivated.  A failure to obtain the frame is ignored.
 */
static void mac80211_hwsim_unassign_vif_chanctx(struct ieee80211_hw *hw,
                                                struct ieee80211_vif *vif,
                                                struct ieee80211_bss_conf *link_conf,
                                                struct ieee80211_chanctx_conf *ctx)
{
        struct ieee80211_hdr *hdr;
        struct sk_buff *ps_frame;

        hwsim_check_magic(vif);
        hwsim_check_chanctx_magic(ctx);

        /* nothing to suspend unless this is an associated station */
        if (vif->type != NL80211_IFTYPE_STATION || !vif->cfg.assoc)
                return;

        ps_frame = ieee80211_nullfunc_get(hw, vif, link_conf->link_id, true);
        if (!ps_frame)
                return;

        /* flag the nullfunc frame as "entering power save" */
        hdr = (void *)ps_frame->data;
        hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM);

        local_bh_disable();
        mac80211_hwsim_tx_frame(hw, ps_frame, ctx->def.chan);
        local_bh_enable();
}

static int mac80211_hwsim_switch_vif_chanctx(struct ieee80211_hw *hw,
                                             struct ieee80211_vif_chanctx_switch *vifs,
                                             int n_vifs,
                                             enum ieee80211_chanctx_switch_mode mode)
{
        int i;

        if (n_vifs <= 0)
                return -EINVAL;

        wiphy_dbg(hw->wiphy,
                  "switch vif channel context mode: %u\n", mode);

        for (i = 0; i < n_vifs; i++) {
                hwsim_check_chanctx_magic(vifs[i].old_ctx);
                wiphy_dbg(hw->wiphy,
                          "switch vif channel context: %d MHz/width: %d/cfreqs:%d/%d MHz -> %d MHz/width: %d/cfreqs:%d/%d MHz\n",
                          vifs[i].old_ctx->def.chan->center_freq,
                          vifs[i].old_ctx->def.width,
                          vifs[i].old_ctx->def.center_freq1,
                          vifs[i].old_ctx->def.center_freq2,
                          vifs[i].new_ctx->def.chan->center_freq,
                          vifs[i].new_ctx->def.width,
                          vifs[i].new_ctx->def.center_freq1,
                          vifs[i].new_ctx->def.center_freq2);

                switch (mode) {
                case CHANCTX_SWMODE_REASSIGN_VIF:
                        hwsim_check_chanctx_magic(vifs[i].new_ctx);
                        break;
                case CHANCTX_SWMODE_SWAP_CONTEXTS:
                        hwsim_set_chanctx_magic(vifs[i].new_ctx);
                        hwsim_clear_chanctx_magic(vifs[i].old_ctx);
                        break;
                default:
                        WARN_ON("Invalid mode");
                }
        }
        return 0;
}

/*
 * Names of the ethtool statistics exported by hwsim, copied out verbatim by
 * mac80211_hwsim_get_et_strings().  mac80211_hwsim_get_et_stats() fills in
 * one value per entry, in this order, and warns if the counts differ.
 */
static const char mac80211_hwsim_gstrings_stats[][ETH_GSTRING_LEN] = {
        "tx_pkts_nic",
        "tx_bytes_nic",
        "rx_pkts_nic",
        "rx_bytes_nic",
        "d_tx_dropped",
        "d_tx_failed",
        "d_ps_mode",
        "d_group",
};

/* number of entries in mac80211_hwsim_gstrings_stats */
#define MAC80211_HWSIM_SSTATS_LEN ARRAY_SIZE(mac80211_hwsim_gstrings_stats)

/*
 * .get_et_strings callback: for the ETH_SS_STATS string set, copy the whole
 * mac80211_hwsim_gstrings_stats table into @data; any other @sset leaves
 * @data untouched.
 */
static void mac80211_hwsim_get_et_strings(struct ieee80211_hw *hw,
                                          struct ieee80211_vif *vif,
                                          u32 sset, u8 *data)
{
        if (sset != ETH_SS_STATS)
                return;

        memcpy(data, mac80211_hwsim_gstrings_stats,
               sizeof(mac80211_hwsim_gstrings_stats));
}

/*
 * .get_et_sset_count callback: report MAC80211_HWSIM_SSTATS_LEN entries for
 * the ETH_SS_STATS string set and 0 for every other set.
 */
static int mac80211_hwsim_get_et_sset_count(struct ieee80211_hw *hw,
                                            struct ieee80211_vif *vif, int sset)
{
        if (sset != ETH_SS_STATS)
                return 0;

        return MAC80211_HWSIM_SSTATS_LEN;
}

/*
 * .get_et_stats callback: write the radio's counters into @data in the same
 * order as the names in mac80211_hwsim_gstrings_stats, and warn if the
 * number of values written does not match MAC80211_HWSIM_SSTATS_LEN.
 */
static void mac80211_hwsim_get_et_stats(struct ieee80211_hw *hw,
                                        struct ieee80211_vif *vif,
                                        struct ethtool_stats *stats, u64 *data)
{
        struct mac80211_hwsim_data *hwsim = hw->priv;
        int n = 0;

        /* traffic counters */
        data[n++] = hwsim->tx_pkts;
        data[n++] = hwsim->tx_bytes;
        data[n++] = hwsim->rx_pkts;
        data[n++] = hwsim->rx_bytes;

        /* "d_*" entries */
        data[n++] = hwsim->tx_dropped;
        data[n++] = hwsim->tx_failed;
        data[n++] = hwsim->ps;
        data[n++] = hwsim->group;

        WARN_ON(n != MAC80211_HWSIM_SSTATS_LEN);
}

/*
 * .tx_last_beacon callback (installed through HWSIM_COMMON_OPS):
 * unconditionally returns 1, whatever the state of @hw.
 */
static int mac80211_hwsim_tx_last_beacon(struct ieee80211_hw *hw)
{
        return 1;
}

/*
 * .set_rts_threshold callback (installed in mac80211_hwsim_mlo_ops): the
 * requested @value is ignored and -EOPNOTSUPP is always returned.
 */
static int mac80211_hwsim_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
{
        return -EOPNOTSUPP;
}

/*
 * .change_vif_links callback: call mac80211_hwsim_config_mac_nl() with
 * "false" for the address of every link that disappears and with "true" for
 * the address of every link that appears.
 *
 * @old_links/@new_links: link bitmaps before and after the change; an empty
 *                        bitmap is handled as if only bit 0 were set
 * @old:                  link configurations indexed by link id, used for
 *                        the links being removed
 *
 * Always returns 0.  A new link without a link configuration triggers a
 * warning and is skipped.
 */
static int mac80211_hwsim_change_vif_links(struct ieee80211_hw *hw,
                                           struct ieee80211_vif *vif,
                                           u16 old_links, u16 new_links,
                                           struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS])
{
        unsigned long removed = old_links & ~new_links;
        unsigned long added = new_links & ~old_links;
        int link_id;

        if (!old_links)
                removed |= BIT(0);
        if (!new_links)
                added |= BIT(0);

        for_each_set_bit(link_id, &removed, IEEE80211_MLD_MAX_NUM_LINKS) {
                mac80211_hwsim_config_mac_nl(hw, old[link_id]->addr, false);
        }

        for_each_set_bit(link_id, &added, IEEE80211_MLD_MAX_NUM_LINKS) {
                struct ieee80211_bss_conf *conf;

                conf = link_conf_dereference_protected(vif, link_id);
                if (!WARN_ON(!conf))
                        mac80211_hwsim_config_mac_nl(hw, conf->addr, true);
        }

        return 0;
}

/*
 * .change_sta_links callback: after checking the station's hwsim marker,
 * record @new_links as the station's active_links_rx bitmap when the
 * interface is of station type.  @old_links is not used.
 *
 * Always returns 0.
 */
static int mac80211_hwsim_change_sta_links(struct ieee80211_hw *hw,
                                           struct ieee80211_vif *vif,
                                           struct ieee80211_sta *sta,
                                           u16 old_links, u16 new_links)
{
        struct hwsim_sta_priv *priv = (void *)sta->drv_priv;

        hwsim_check_sta_magic(sta);

        if (vif->type != NL80211_IFTYPE_STATION)
                return 0;

        priv->active_links_rx = new_links;
        return 0;
}

static int mac80211_hwsim_send_pmsr_ftm_request_peer(struct sk_buff *msg,
                                                     struct cfg80211_pmsr_ftm_request_peer *request)
{
        struct nlattr *ftm;

        if (!request->requested)
                return -EINVAL;

        ftm = nla_nest_start(msg, NL80211_PMSR_TYPE_FTM);
        if (!ftm)
                return -ENOBUFS;

        if (nla_put_u32(msg, NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE, request->preamble))
                return -ENOBUFS;

        if (nla_put_u16(msg, NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD, request->burst_period))
                return -ENOBUFS;

        if (request->asap && nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_ASAP))
                return -ENOBUFS;

        if (request->request_lci && nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI))
                return -ENOBUFS;

        if (request->request_civicloc &&
            nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC))
                return -ENOBUFS;

        if (request->trigger_based && nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED))
                return -ENOBUFS;

        if (request->non_trigger_based &&
            nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED))
                return -ENOBUFS;

        if (request->lmr_feedback && nla_put_flag(msg, NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK))
                return -ENOBUFS;

        if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP, request->num_bursts_exp))
                return -ENOBUFS;

        if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION, request->burst_duration))
                return -ENOBUFS;

        if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST, request->ftms_per_burst))
                return -ENOBUFS;

        if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES, request->ftmr_retries))
                return -ENOBUFS;

        if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION, request->burst_duration))
                return -ENOBUFS;

        if (nla_put_u8(msg, NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR, request->bss_color))
                return -ENOBUFS;

        nla_nest_end(msg, ftm);

        return 0;
}

/*
 * Serialize one peer of a measurement request into @msg as a nested
 * NL80211_PMSR_ATTR_PEERS attribute holding the peer address, its channel
 * definition and the request data (optional AP TSF flag plus the FTM
 * parameters).
 *
 * Returns 0 on success, -ENOBUFS when @msg is full, or the error reported by
 * nl80211_send_chandef() / mac80211_hwsim_send_pmsr_ftm_request_peer().
 * Nests are not closed on the error paths.
 */
static int mac80211_hwsim_send_pmsr_request_peer(struct sk_buff *msg,
                                                 struct cfg80211_pmsr_request_peer *request)
{
        struct nlattr *peer_nest, *chan_nest, *req_nest, *data_nest;
        int ret;

        peer_nest = nla_nest_start(msg, NL80211_PMSR_ATTR_PEERS);
        if (!peer_nest)
                return -ENOBUFS;

        if (nla_put(msg, NL80211_PMSR_PEER_ATTR_ADDR, ETH_ALEN,
                    request->addr))
                return -ENOBUFS;

        /* channel the measurement should use */
        chan_nest = nla_nest_start(msg, NL80211_PMSR_PEER_ATTR_CHAN);
        if (!chan_nest)
                return -ENOBUFS;

        ret = nl80211_send_chandef(msg, &request->chandef);
        if (ret)
                return ret;

        nla_nest_end(msg, chan_nest);

        /* the request proper: common flags, then type specific data */
        req_nest = nla_nest_start(msg, NL80211_PMSR_PEER_ATTR_REQ);
        if (!req_nest)
                return -ENOBUFS;

        if (request->report_ap_tsf &&
            nla_put_flag(msg, NL80211_PMSR_REQ_ATTR_GET_AP_TSF))
                return -ENOBUFS;

        data_nest = nla_nest_start(msg, NL80211_PMSR_REQ_ATTR_DATA);
        if (!data_nest)
                return -ENOBUFS;

        ret = mac80211_hwsim_send_pmsr_ftm_request_peer(msg, &request->ftm);
        if (ret)
                return ret;

        /* close the nests innermost first */
        nla_nest_end(msg, data_nest);
        nla_nest_end(msg, req_nest);
        nla_nest_end(msg, peer_nest);

        return 0;
}

/*
 * Serialize a complete peer measurement request into @msg as a nested
 * NL80211_ATTR_PEER_MEASUREMENTS attribute: the timeout, the MAC address and
 * mask (only when the address is non-zero) and one entry per peer.
 *
 * Returns 0 on success, -ENOBUFS when @msg is full, or the error returned
 * while serializing a peer.  The nest is not closed on the error paths.
 */
static int mac80211_hwsim_send_pmsr_request(struct sk_buff *msg,
                                            struct cfg80211_pmsr_request *request)
{
        struct nlattr *nest;
        int ret;
        int idx;

        nest = nla_nest_start(msg, NL80211_ATTR_PEER_MEASUREMENTS);
        if (!nest)
                return -ENOBUFS;

        if (nla_put_u32(msg, NL80211_ATTR_TIMEOUT, request->timeout))
                return -ENOBUFS;

        if (!is_zero_ether_addr(request->mac_addr)) {
                if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN,
                            request->mac_addr))
                        return -ENOBUFS;

                if (nla_put(msg, NL80211_ATTR_MAC_MASK, ETH_ALEN,
                            request->mac_addr_mask))
                        return -ENOBUFS;
        }

        for (idx = 0; idx < request->n_peers; idx++) {
                ret = mac80211_hwsim_send_pmsr_request_peer(msg,
                                                            &request->peers[idx]);
                if (ret)
                        return ret;
        }

        nla_nest_end(msg, nest);

        return 0;
}

/*
 * .start_pmsr callback: forward a peer measurement request to the medium
 * simulation as a HWSIM_CMD_START_PMSR message and remember it as the
 * radio's pending request.
 *
 * @hw:      radio the request is issued on
 * @vif:     interface the request belongs to
 * @request: the measurement request to forward
 *
 * The message is sent over virtio when hwsim_virtio_enabled is set, otherwise
 * unicast to the registered wmediumd port id.  Only one request may be
 * pending per radio; data->mutex serializes access to it.
 *
 * Returns 0 on success, -EOPNOTSUPP when there is neither a wmediumd port
 * nor virtio, -EBUSY when a request is already pending, -ENOMEM when the
 * message cannot be allocated or filled in, or the error returned while
 * serializing the request.
 */
static int mac80211_hwsim_start_pmsr(struct ieee80211_hw *hw,
                                     struct ieee80211_vif *vif,
                                     struct cfg80211_pmsr_request *request)
{
        struct mac80211_hwsim_data *data;
        struct sk_buff *skb = NULL;
        struct nlattr *pmsr;
        void *msg_head;
        u32 _portid;
        int err = 0;

        data = hw->priv;
        _portid = READ_ONCE(data->wmediumd);
        if (!_portid && !hwsim_virtio_enabled)
                return -EOPNOTSUPP;

        mutex_lock(&data->mutex);

        if (data->pmsr_request) {
                err = -EBUSY;
                goto out_free;
        }

        skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);

        if (!skb) {
                err = -ENOMEM;
                goto out_free;
        }

        msg_head = genlmsg_put(skb, 0, 0, &hwsim_genl_family, 0, HWSIM_CMD_START_PMSR);
        /* genlmsg_put() returns NULL when the header does not fit */
        if (!msg_head) {
                err = -ENOMEM;
                goto out_free;
        }

        if (nla_put(skb, HWSIM_ATTR_ADDR_TRANSMITTER,
                    ETH_ALEN, data->addresses[1].addr)) {
                err = -ENOMEM;
                goto out_free;
        }

        pmsr = nla_nest_start(skb, HWSIM_ATTR_PMSR_REQUEST);
        if (!pmsr) {
                err = -ENOMEM;
                goto out_free;
        }

        err = mac80211_hwsim_send_pmsr_request(skb, request);
        if (err)
                goto out_free;

        nla_nest_end(skb, pmsr);

        genlmsg_end(skb, msg_head);
        if (hwsim_virtio_enabled)
                hwsim_tx_virtio(data, skb);
        else
                hwsim_unicast_netgroup(data, skb, _portid);

        /* the request is now pending until a report or abort clears it */
        data->pmsr_request = request;
        data->pmsr_request_wdev = ieee80211_vif_to_wdev(vif);

out_free:
        /* on success the skb has been handed to the transmit helper above */
        if (err && skb)
                nlmsg_free(skb);

        mutex_unlock(&data->mutex);
        return err;
}

/*
 * .abort_pmsr callback: tell the medium simulation to abort a pending peer
 * measurement by sending a HWSIM_CMD_ABORT_PMSR message that carries the
 * original request.
 *
 * @hw:      radio the request was issued on
 * @vif:     interface the request belongs to (unused)
 * @request: the request to abort; must be the radio's pending request
 *
 * Nothing is sent when there is neither a wmediumd port nor virtio, when
 * @request is not the pending request, or when the message cannot be built.
 * The pending-request bookkeeping is not modified here.
 */
static void mac80211_hwsim_abort_pmsr(struct ieee80211_hw *hw,
                                      struct ieee80211_vif *vif,
                                      struct cfg80211_pmsr_request *request)
{
        struct mac80211_hwsim_data *data;
        struct sk_buff *skb = NULL;
        struct nlattr *pmsr;
        void *msg_head;
        u32 _portid;
        int err = 0;

        data = hw->priv;
        _portid = READ_ONCE(data->wmediumd);
        if (!_portid && !hwsim_virtio_enabled)
                return;

        mutex_lock(&data->mutex);

        if (data->pmsr_request != request) {
                err = -EINVAL;
                goto out;
        }

        skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb) {
                err = -ENOMEM;
                goto out;
        }

        msg_head = genlmsg_put(skb, 0, 0, &hwsim_genl_family, 0, HWSIM_CMD_ABORT_PMSR);
        /* genlmsg_put() returns NULL when the header does not fit */
        if (!msg_head) {
                err = -ENOMEM;
                goto out;
        }

        /* err must be set here, otherwise the skb is leaked at "out" */
        if (nla_put(skb, HWSIM_ATTR_ADDR_TRANSMITTER, ETH_ALEN, data->addresses[1].addr)) {
                err = -ENOMEM;
                goto out;
        }

        pmsr = nla_nest_start(skb, HWSIM_ATTR_PMSR_REQUEST);
        if (!pmsr) {
                err = -ENOMEM;
                goto out;
        }

        err = mac80211_hwsim_send_pmsr_request(skb, request);
        if (err)
                goto out;

        /*
         * nla_nest_end() returns the message length, not an error code, so
         * its result must not be used to decide whether to bail out.
         */
        nla_nest_end(skb, pmsr);

        genlmsg_end(skb, msg_head);
        if (hwsim_virtio_enabled)
                hwsim_tx_virtio(data, skb);
        else
                hwsim_unicast_netgroup(data, skb, _portid);

out:
        /* on success the skb has been handed to the transmit helper above */
        if (err && skb)
                nlmsg_free(skb);

        mutex_unlock(&data->mutex);
}

/*
 * Parse a nested HWSIM_RATE_INFO_ATTR_* attribute into @rate_info.
 *
 * @rateattr:  nested attribute holding the rate description
 * @rate_info: destination; only fields whose attribute is present are
 *             written, the others keep their previous value
 * @info:      genetlink request info, used for extended ack reporting
 *
 * Returns 0 on success or the error from nla_parse_nested() (validated
 * against hwsim_rate_info_policy).
 */
static int mac80211_hwsim_parse_rate_info(struct nlattr *rateattr,
                                          struct rate_info *rate_info,
                                          struct genl_info *info)
{
        struct nlattr *attrs[HWSIM_RATE_INFO_ATTR_MAX + 1];
        struct nlattr *cur;
        int err;

        err = nla_parse_nested(attrs, HWSIM_RATE_INFO_ATTR_MAX,
                               rateattr, hwsim_rate_info_policy, info->extack);
        if (err)
                return err;

        cur = attrs[HWSIM_RATE_INFO_ATTR_FLAGS];
        if (cur)
                rate_info->flags = nla_get_u8(cur);

        cur = attrs[HWSIM_RATE_INFO_ATTR_MCS];
        if (cur)
                rate_info->mcs = nla_get_u8(cur);

        cur = attrs[HWSIM_RATE_INFO_ATTR_LEGACY];
        if (cur)
                rate_info->legacy = nla_get_u16(cur);

        cur = attrs[HWSIM_RATE_INFO_ATTR_NSS];
        if (cur)
                rate_info->nss = nla_get_u8(cur);

        cur = attrs[HWSIM_RATE_INFO_ATTR_BW];
        if (cur)
                rate_info->bw = nla_get_u8(cur);

        cur = attrs[HWSIM_RATE_INFO_ATTR_HE_GI];
        if (cur)
                rate_info->he_gi = nla_get_u8(cur);

        cur = attrs[HWSIM_RATE_INFO_ATTR_HE_DCM];
        if (cur)
                rate_info->he_dcm = nla_get_u8(cur);

        cur = attrs[HWSIM_RATE_INFO_ATTR_HE_RU_ALLOC];
        if (cur)
                rate_info->he_ru_alloc = nla_get_u8(cur);

        cur = attrs[HWSIM_RATE_INFO_ATTR_N_BOUNDED_CH];
        if (cur)
                rate_info->n_bonded_ch = nla_get_u8(cur);

        cur = attrs[HWSIM_RATE_INFO_ATTR_EHT_GI];
        if (cur)
                rate_info->eht_gi = nla_get_u8(cur);

        cur = attrs[HWSIM_RATE_INFO_ATTR_EHT_RU_ALLOC];
        if (cur)
                rate_info->eht_ru_alloc = nla_get_u8(cur);

        return 0;
}

static int mac80211_hwsim_parse_ftm_result(struct nlattr *ftm,
                                           struct cfg80211_pmsr_ftm_result *result,
                                           struct genl_info *info)
{
        struct nlattr *tb[NL80211_PMSR_FTM_RESP_ATTR_MAX + 1];
        int ret;

        ret = nla_parse_nested(tb, NL80211_PMSR_FTM_RESP_ATTR_MAX,
                               ftm, hwsim_ftm_result_policy, info->extack);
        if (ret)
                return ret;

        if (tb[NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON])
                result->failure_reason = nla_get_u32(tb[NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON]);

        if (tb[NL80211_PMSR_FTM_RESP_ATTR_BURST_INDEX])
                result->burst_index = nla_get_u16(tb[NL80211_PMSR_FTM_RESP_ATTR_BURST_INDEX]);

        if (tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_ATTEMPTS]) {
                result->num_ftmr_attempts_valid = 1;
                result->num_ftmr_attempts =
                        nla_get_u32(tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_ATTEMPTS]);
        }

        if (tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_SUCCESSES]) {
                result->num_ftmr_successes_valid = 1;
                result->num_ftmr_successes =
                        nla_get_u32(tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_SUCCESSES]);
        }

        if (tb[NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME])
                result->busy_retry_time =
                        nla_get_u8(tb[NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME]);

        if (tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_BURSTS_EXP])
                result->num_bursts_exp = nla_get_u8(tb[NL80211_PMSR_FTM_RESP_ATTR_NUM_BURSTS_EXP]);

        if (tb[NL80211_PMSR_FTM_RESP_ATTR_BURST_DURATION])
                result->burst_duration = nla_get_u8(tb[NL80211_PMSR_FTM_RESP_ATTR_BURST_DURATION]);

        if (tb[NL80211_PMSR_FTM_RESP_ATTR_FTMS_PER_BURST])
                result->ftms_per_burst = nla_get_u8(tb[NL80211_PMSR_FTM_RESP_ATTR_FTMS_PER_BURST]);

        if (tb[NL80211_PMSR_FTM_RESP_ATTR_RSSI_AVG]) {
                result->rssi_avg_valid = 1;
                result->rssi_avg = nla_get_s32(tb[NL80211_PMSR_FTM_RESP_ATTR_RSSI_AVG]);
        }
        if (tb[NL80211_PMSR_FTM_RESP_ATTR_RSSI_SPREAD]) {
                result->rssi_spread_valid = 1;
                result->rssi_spread =
                        nla_get_s32(tb[NL80211_PMSR_FTM_RESP_ATTR_RSSI_SPREAD]);
        }

        if (tb[NL80211_PMSR_FTM_RESP_ATTR_TX_RATE]) {
                result->tx_rate_valid = 1;
                ret = mac80211_hwsim_parse_rate_info(tb[NL80211_PMSR_FTM_RESP_ATTR_TX_RATE],
                                                     &result->tx_rate, info);
                if (ret)
                        return ret;
        }

        if (tb[NL80211_PMSR_FTM_RESP_ATTR_RX_RATE]) {
                result->rx_rate_valid = 1;
                ret = mac80211_hwsim_parse_rate_info(tb[NL80211_PMSR_FTM_RESP_ATTR_RX_RATE],
                                                     &result->rx_rate, info);
                if (ret)
                        return ret;
        }

        if (tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_AVG]) {
                result->rtt_avg_valid = 1;
                result->rtt_avg =
                        nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_AVG]);
        }
        if (tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_VARIANCE]) {
                result->rtt_variance_valid = 1;
                result->rtt_variance =
                        nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_VARIANCE]);
        }
        if (tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_SPREAD]) {
                result->rtt_spread_valid = 1;
                result->rtt_spread =
                        nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_RTT_SPREAD]);
        }
        if (tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_AVG]) {
                result->dist_avg_valid = 1;
                result->dist_avg =
                        nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_AVG]);
        }
        if (tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_VARIANCE]) {
                result->dist_variance_valid = 1;
                result->dist_variance =
                        nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_VARIANCE]);
        }
        if (tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_SPREAD]) {
                result->dist_spread_valid = 1;
                result->dist_spread =
                        nla_get_u64(tb[NL80211_PMSR_FTM_RESP_ATTR_DIST_SPREAD]);
        }

        if (tb[NL80211_PMSR_FTM_RESP_ATTR_LCI]) {
                result->lci = nla_data(tb[NL80211_PMSR_FTM_RESP_ATTR_LCI]);
                result->lci_len = nla_len(tb[NL80211_PMSR_FTM_RESP_ATTR_LCI]);
        }

        if (tb[NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC]) {
                result->civicloc = nla_data(tb[NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC]);
                result->civicloc_len = nla_len(tb[NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC]);
        }

        return 0;
}

/*
 * Parse a nested NL80211_PMSR_RESP_ATTR_* attribute into @result.
 *
 * @resp:   nested attribute holding one peer's response
 * @result: destination; status, host time and AP TSF are written only when
 *          present, ->final always reflects whether the FINAL attribute was
 *          there
 * @info:   genetlink request info, used for extended ack reporting
 *
 * Every entry nested in the DATA attribute must be of type
 * NL80211_PMSR_TYPE_FTM; it is parsed into result->ftm.
 *
 * Returns 0 on success (including when there is no DATA attribute), -EINVAL
 * for an unknown measurement type, or the error from parsing.
 */
static int mac80211_hwsim_parse_pmsr_resp(struct nlattr *resp,
                                          struct cfg80211_pmsr_result *result,
                                          struct genl_info *info)
{
        struct nlattr *attrs[NL80211_PMSR_RESP_ATTR_MAX + 1];
        struct nlattr *entry;
        int remaining;
        int err;

        err = nla_parse_nested(attrs, NL80211_PMSR_RESP_ATTR_MAX, resp,
                               hwsim_pmsr_resp_policy, info->extack);
        if (err)
                return err;

        if (attrs[NL80211_PMSR_RESP_ATTR_STATUS])
                result->status =
                        nla_get_u32(attrs[NL80211_PMSR_RESP_ATTR_STATUS]);

        if (attrs[NL80211_PMSR_RESP_ATTR_HOST_TIME])
                result->host_time =
                        nla_get_u64(attrs[NL80211_PMSR_RESP_ATTR_HOST_TIME]);

        if (attrs[NL80211_PMSR_RESP_ATTR_AP_TSF]) {
                result->ap_tsf_valid = 1;
                result->ap_tsf =
                        nla_get_u64(attrs[NL80211_PMSR_RESP_ATTR_AP_TSF]);
        }

        result->final = !!attrs[NL80211_PMSR_RESP_ATTR_FINAL];

        if (!attrs[NL80211_PMSR_RESP_ATTR_DATA])
                return 0;

        nla_for_each_nested(entry, attrs[NL80211_PMSR_RESP_ATTR_DATA],
                            remaining) {
                /* FTM is the only measurement type understood here */
                if (nla_type(entry) != NL80211_PMSR_TYPE_FTM) {
                        NL_SET_ERR_MSG_ATTR(info->extack, entry, "Unknown pmsr resp type");
                        return -EINVAL;
                }

                result->type = NL80211_PMSR_TYPE_FTM;
                err = mac80211_hwsim_parse_ftm_result(entry, &result->ftm,
                                                      info);
                if (err)
                        return err;
        }

        return 0;
}

/*
 * Parse one peer entry of a measurement report into @result.
 *
 * @peer:   nested NL80211_PMSR_PEER_ATTR_* attribute, may be NULL
 * @result: destination; the peer address and the response are filled in
 *          only when the corresponding attribute is present
 * @info:   genetlink request info, used for extended ack reporting
 *
 * Returns 0 on success, -EINVAL when @peer is NULL, or the error from
 * nla_parse_nested() (validated against hwsim_pmsr_peer_result_policy) or
 * from parsing the response.
 */
static int mac80211_hwsim_parse_pmsr_result(struct nlattr *peer,
                                            struct cfg80211_pmsr_result *result,
                                            struct genl_info *info)
{
        struct nlattr *tb[NL80211_PMSR_PEER_ATTR_MAX + 1];
        int ret;

        if (!peer)
                return -EINVAL;

        ret = nla_parse_nested(tb, NL80211_PMSR_PEER_ATTR_MAX, peer,
                               hwsim_pmsr_peer_result_policy, info->extack);
        if (ret)
                return ret;

        if (tb[NL80211_PMSR_PEER_ATTR_ADDR])
                memcpy(result->addr, nla_data(tb[NL80211_PMSR_PEER_ATTR_ADDR]),
                       ETH_ALEN);

        if (tb[NL80211_PMSR_PEER_ATTR_RESP]) {
                ret = mac80211_hwsim_parse_pmsr_resp(tb[NL80211_PMSR_PEER_ATTR_RESP], result, info);
                if (ret)
                        return ret;
        }

        return 0;
}

/*
 * Genetlink handler for a peer measurement report coming back from the
 * medium simulation.
 *
 * The radio is looked up from the HWSIM_ATTR_ADDR_TRANSMITTER attribute.
 * Under the radio's mutex, every peer nested in HWSIM_ATTR_PMSR_RESULT is
 * parsed and passed to cfg80211_pmsr_report(), after which the request is
 * completed with cfg80211_pmsr_complete().
 *
 * Once the radio has been found, the pending request bookkeeping is cleared
 * on every path, including the error paths.
 *
 * Returns 0 on success, -EINVAL for a missing attribute, an unknown radio or
 * no pending request, or the error from parsing a peer entry.
 */
static int hwsim_pmsr_report_nl(struct sk_buff *msg, struct genl_info *info)
{
        struct nlattr *addr_attr = info->attrs[HWSIM_ATTR_ADDR_TRANSMITTER];
        struct nlattr *result_attr, *peer_list, *entry;
        struct mac80211_hwsim_data *hwsim;
        int remaining;
        int ret = -EINVAL;

        if (!addr_attr)
                return -EINVAL;

        hwsim = get_hwsim_data_ref_from_addr(nla_data(addr_attr));
        if (!hwsim)
                return -EINVAL;

        mutex_lock(&hwsim->mutex);

        /* each of these failures leaves ret at -EINVAL */
        if (!hwsim->pmsr_request)
                goto out;

        result_attr = info->attrs[HWSIM_ATTR_PMSR_RESULT];
        if (!result_attr)
                goto out;

        peer_list = nla_find_nested(result_attr, NL80211_PMSR_ATTR_PEERS);
        if (!peer_list)
                goto out;

        nla_for_each_nested(entry, peer_list, remaining) {
                struct cfg80211_pmsr_result result = {};

                ret = mac80211_hwsim_parse_pmsr_result(entry, &result, info);
                if (ret)
                        goto out;

                cfg80211_pmsr_report(hwsim->pmsr_request_wdev,
                                     hwsim->pmsr_request, &result, GFP_KERNEL);
        }

        cfg80211_pmsr_complete(hwsim->pmsr_request_wdev, hwsim->pmsr_request,
                               GFP_KERNEL);

        ret = 0;
out:
        hwsim->pmsr_request = NULL;
        hwsim->pmsr_request_wdev = NULL;

        mutex_unlock(&hwsim->mutex);
        return ret;
}

/*
 * Initializer fragment for struct ieee80211_ops: installs the
 * .link_add_debugfs callback when CONFIG_MAC80211_DEBUGFS is enabled and
 * expands to nothing otherwise.
 */
#ifdef CONFIG_MAC80211_DEBUGFS
#define HWSIM_DEBUGFS_OPS                                        \
        .link_add_debugfs = mac80211_hwsim_link_add_debugfs,
#else
#define HWSIM_DEBUGFS_OPS
#endif

/*
 * Initializer fragment with the callbacks shared by all three hwsim
 * ieee80211_ops tables below (mac80211_hwsim_ops, mac80211_hwsim_mchan_ops
 * and mac80211_hwsim_mlo_ops).  Ends with HWSIM_DEBUGFS_OPS, so it must be
 * placed where further designated initializers may follow.
 */
#define HWSIM_COMMON_OPS                                        \
        .tx = mac80211_hwsim_tx,                                \
        .wake_tx_queue = ieee80211_handle_wake_tx_queue,        \
        .start = mac80211_hwsim_start,                                \
        .stop = mac80211_hwsim_stop,                                \
        .add_interface = mac80211_hwsim_add_interface,                \
        .change_interface = mac80211_hwsim_change_interface,        \
        .remove_interface = mac80211_hwsim_remove_interface,        \
        .config = mac80211_hwsim_config,                        \
        .configure_filter = mac80211_hwsim_configure_filter,        \
        .vif_cfg_changed = mac80211_hwsim_vif_info_changed,        \
        .link_info_changed = mac80211_hwsim_link_info_changed,  \
        .tx_last_beacon = mac80211_hwsim_tx_last_beacon,        \
        .sta_notify = mac80211_hwsim_sta_notify,                \
        .sta_rc_update = mac80211_hwsim_sta_rc_update,                \
        .conf_tx = mac80211_hwsim_conf_tx,                        \
        .get_survey = mac80211_hwsim_get_survey,                \
        CFG80211_TESTMODE_CMD(mac80211_hwsim_testmode_cmd)        \
        .ampdu_action = mac80211_hwsim_ampdu_action,                \
        .flush = mac80211_hwsim_flush,                                \
        .get_et_sset_count = mac80211_hwsim_get_et_sset_count,        \
        .get_et_stats = mac80211_hwsim_get_et_stats,                \
        .get_et_strings = mac80211_hwsim_get_et_strings,        \
        .start_pmsr = mac80211_hwsim_start_pmsr,                \
        .abort_pmsr = mac80211_hwsim_abort_pmsr,                \
        HWSIM_DEBUGFS_OPS

/*
 * Initializer fragment with the station, TIM and TSF callbacks used by the
 * two ops tables that are not the MLO one (mac80211_hwsim_ops and
 * mac80211_hwsim_mchan_ops); mac80211_hwsim_mlo_ops leaves these out.
 */
#define HWSIM_NON_MLO_OPS                                        \
        .sta_add = mac80211_hwsim_sta_add,                        \
        .sta_remove = mac80211_hwsim_sta_remove,                \
        .set_tim = mac80211_hwsim_set_tim,                        \
        .get_tsf = mac80211_hwsim_get_tsf,                        \
        .set_tsf = mac80211_hwsim_set_tsf,

/*
 * Ops table built from the common and non-MLO callbacks plus software scan
 * notifications; the channel context callbacks are the generic
 * ieee80211_emulate_*_chanctx helpers rather than hwsim's own.
 */
static const struct ieee80211_ops mac80211_hwsim_ops = {
        HWSIM_COMMON_OPS
        HWSIM_NON_MLO_OPS
        .sw_scan_start = mac80211_hwsim_sw_scan,
        .sw_scan_complete = mac80211_hwsim_sw_scan_complete,
        .add_chanctx = ieee80211_emulate_add_chanctx,
        .remove_chanctx = ieee80211_emulate_remove_chanctx,
        .change_chanctx = ieee80211_emulate_change_chanctx,
        .switch_vif_chanctx = ieee80211_emulate_switch_vif_chanctx,
};

/*
 * Initializer fragment for the ops tables that use hwsim's own channel
 * context callbacks: hardware scan, remain-on-channel and the
 * add/remove/change/assign/unassign/switch chanctx handlers.
 */
#define HWSIM_CHANCTX_OPS                                        \
        .hw_scan = mac80211_hwsim_hw_scan,                        \
        .cancel_hw_scan = mac80211_hwsim_cancel_hw_scan,        \
        .remain_on_channel = mac80211_hwsim_roc,                \
        .cancel_remain_on_channel = mac80211_hwsim_croc,        \
        .add_chanctx = mac80211_hwsim_add_chanctx,                \
        .remove_chanctx = mac80211_hwsim_remove_chanctx,        \
        .change_chanctx = mac80211_hwsim_change_chanctx,        \
        .assign_vif_chanctx = mac80211_hwsim_assign_vif_chanctx,\
        .unassign_vif_chanctx = mac80211_hwsim_unassign_vif_chanctx, \
        .switch_vif_chanctx = mac80211_hwsim_switch_vif_chanctx,

/*
 * Ops table combining the common and non-MLO callbacks with hwsim's own
 * channel context callbacks (HWSIM_CHANCTX_OPS).
 */
static const struct ieee80211_ops mac80211_hwsim_mchan_ops = {
        HWSIM_COMMON_OPS
        HWSIM_NON_MLO_OPS
        HWSIM_CHANCTX_OPS
};

/*
 * Ops table for MLO radios: common and channel context callbacks, without
 * HWSIM_NON_MLO_OPS, plus the link-change, sta_state, RTS threshold and
 * TTLM negotiation callbacks that only appear in this table.
 */
static const struct ieee80211_ops mac80211_hwsim_mlo_ops = {
        HWSIM_COMMON_OPS
        HWSIM_CHANCTX_OPS
        .set_rts_threshold = mac80211_hwsim_set_rts_threshold,
        .change_vif_links = mac80211_hwsim_change_vif_links,
        .change_sta_links = mac80211_hwsim_change_sta_links,
        .sta_state = mac80211_hwsim_sta_state,
        .can_neg_ttlm = mac80211_hwsim_can_neg_ttlm,
};

/*
 * Parameters describing a radio to be created.  append_radio_msg() reports a
 * subset of them (channels, reg_alpha2, regd, reg_strict, p2p_device,
 * use_chanctx and hwname) in the new-radio netlink notification.
 */
struct hwsim_new_radio_params {
        unsigned int channels;          /* reported only when non-zero */
        const char *reg_alpha2;         /* two bytes are reported when set */
        const struct ieee80211_regdomain *regd; /* reported as its index in hwsim_world_regdom_custom[] */
        bool reg_strict;
        bool p2p_device;
        bool use_chanctx;
        bool destroy_on_close;
        const char *hwname;             /* reported without its terminating NUL */
        bool no_vif;
        const u8 *perm_addr;
        u32 iftypes;
        u32 *ciphers;
        u8 n_ciphers;
        bool mlo;
        const struct cfg80211_pmsr_capabilities *pmsr_capa;
};

/*
 * Send @mcast_skb to the HWSIM_MCGRP_CONFIG multicast group.  With a
 * genetlink request context (@info) the message goes out through
 * genl_notify(); without one it is sent with genlmsg_multicast() and a
 * port id of 0.  The result of either call is ignored.
 */
static void hwsim_mcast_config_msg(struct sk_buff *mcast_skb,
                                   struct genl_info *info)
{
        if (!info) {
                genlmsg_multicast(&hwsim_genl_family, mcast_skb, 0,
                                  HWSIM_MCGRP_CONFIG, GFP_KERNEL);
                return;
        }

        genl_notify(&hwsim_genl_family, mcast_skb, info,
                    HWSIM_MCGRP_CONFIG, GFP_KERNEL);
}

/*
 * Append the description of radio @id to @skb.
 *
 * The radio id is always written.  The channel count, regulatory alpha2
 * hint, custom regdomain index, the strict-reg / P2P-device / chanctx flags
 * and the radio name are written only when set in @param.  A regdomain that
 * is not found in hwsim_world_regdom_custom[] is silently left out.
 *
 * Returns 0 on success or the negative error of the first attribute that
 * could not be added.
 */
static int append_radio_msg(struct sk_buff *skb, int id,
                            struct hwsim_new_radio_params *param)
{
        int err;

        err = nla_put_u32(skb, HWSIM_ATTR_RADIO_ID, id);
        if (err < 0)
                return err;

        if (param->channels) {
                err = nla_put_u32(skb, HWSIM_ATTR_CHANNELS, param->channels);
                if (err < 0)
                        return err;
        }

        if (param->reg_alpha2) {
                err = nla_put(skb, HWSIM_ATTR_REG_HINT_ALPHA2, 2,
                              param->reg_alpha2);
                if (err < 0)
                        return err;
        }

        if (param->regd) {
                int idx;

                /* report the regdomain by its position in the custom table */
                for (idx = 0; idx < ARRAY_SIZE(hwsim_world_regdom_custom); idx++) {
                        if (hwsim_world_regdom_custom[idx] == param->regd) {
                                err = nla_put_u32(skb,
                                                  HWSIM_ATTR_REG_CUSTOM_REG,
                                                  idx);
                                if (err < 0)
                                        return err;
                                break;
                        }
                }
        }

        if (param->reg_strict) {
                err = nla_put_flag(skb, HWSIM_ATTR_REG_STRICT_REG);
                if (err < 0)
                        return err;
        }

        if (param->p2p_device) {
                err = nla_put_flag(skb, HWSIM_ATTR_SUPPORT_P2P_DEVICE);
                if (err < 0)
                        return err;
        }

        if (param->use_chanctx) {
                err = nla_put_flag(skb, HWSIM_ATTR_USE_CHANCTX);
                if (err < 0)
                        return err;
        }

        if (param->hwname) {
                /* the name is sent without its terminating NUL */
                err = nla_put(skb, HWSIM_ATTR_RADIO_NAME,
                              strlen(param->hwname), param->hwname);
                if (err < 0)
                        return err;
        }

        return 0;
}

/*
 * Build a HWSIM_CMD_NEW_RADIO generic netlink message for radio @id and
 * pass it to hwsim_mcast_config_msg() together with @info.
 *
 * The function is best effort: if the message cannot be allocated or
 * filled in, it is dropped and nothing is reported to the caller.
 */
static void hwsim_mcast_new_radio(int id, struct genl_info *info,
                                  struct hwsim_new_radio_params *param)
{
        struct sk_buff *msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
        void *hdr;

        if (!msg)
                return;

        hdr = genlmsg_put(msg, 0, 0, &hwsim_genl_family, 0,
                          HWSIM_CMD_NEW_RADIO);

        /* Header or attribute failure: release the half-built message. */
        if (!hdr || append_radio_msg(msg, id, param) < 0) {
                nlmsg_free(msg);
                return;
        }

        genlmsg_end(msg, hdr);

        /* Not freed here after the hand-off. */
        hwsim_mcast_config_msg(msg, info);
}

/*
 * Per-interface-type HE/EHT capability table, named for the 2 GHz band
 * (presumably attached to the 2.4 GHz supported band; the user of this
 * table is outside this block - confirm at the point of registration).
 *
 * Entries, selected by .types_mask:
 *  - STATION / P2P_CLIENT: HE and EHT capabilities.
 *  - AP / P2P_GO: the same HE and EHT values as the station entry.
 *  - MESH_POINT (only built with CONFIG_MAC80211_MESH): HE only, with
 *    fewer MAC capability bits and phy_cap_info[2] cleared; no eht_cap.
 *
 * Array members that are not named in an initializer are zero.
 */
static const struct ieee80211_sband_iftype_data sband_capa_2ghz[] = {
        {
                /* Entry 0: client-side interface types */
                .types_mask = BIT(NL80211_IFTYPE_STATION) |
                              BIT(NL80211_IFTYPE_P2P_CLIENT),
                .he_cap = {
                        .has_he = true,
                        .he_cap_elem = {
                                .mac_cap_info[0] =
                                        IEEE80211_HE_MAC_CAP0_HTC_HE,
                                .mac_cap_info[1] =
                                        IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US |
                                        IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8,
                                .mac_cap_info[2] =
                                        IEEE80211_HE_MAC_CAP2_BSR |
                                        IEEE80211_HE_MAC_CAP2_MU_CASCADING |
                                        IEEE80211_HE_MAC_CAP2_ACK_EN,
                                .mac_cap_info[3] =
                                        IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
                                        IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
                                .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU,
                                .phy_cap_info[0] =
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G,
                                .phy_cap_info[1] =
                                        IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
                                        IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A |
                                        IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD |
                                        IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS,
                                .phy_cap_info[2] =
                                        IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US |
                                        IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ |
                                        IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ |
                                        IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO |
                                        IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO,

                                /* Leave all the other PHY capability bytes
                                 * unset, as DCM, beam forming, RU and PPE
                                 * threshold information are not supported
                                 */
                        },
                        /* The 80 MHz maps differ from the 160 and 80+80 MHz
                         * maps (0xfffa vs 0xffff); presumably 0xffff marks
                         * the wider widths as unsupported - TODO confirm
                         * against the HE MCS/NSS map encoding.
                         */
                        .he_mcs_nss_supp = {
                                .rx_mcs_80 = cpu_to_le16(0xfffa),
                                .tx_mcs_80 = cpu_to_le16(0xfffa),
                                .rx_mcs_160 = cpu_to_le16(0xffff),
                                .tx_mcs_160 = cpu_to_le16(0xffff),
                                .rx_mcs_80p80 = cpu_to_le16(0xffff),
                                .tx_mcs_80p80 = cpu_to_le16(0xffff),
                        },
                },
                .eht_cap = {
                        .has_eht = true,
                        /* phy_cap_info[1] and [2] are not named here and so
                         * stay zero.
                         */
                        .eht_cap_elem = {
                                .mac_cap_info[0] =
                                        IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS |
                                        IEEE80211_EHT_MAC_CAP0_OM_CONTROL |
                                        IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1,
                                .phy_cap_info[0] =
                                        IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ |
                                        IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI |
                                        IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO |
                                        IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER |
                                        IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE,
                                .phy_cap_info[3] =
                                        IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK,
                                .phy_cap_info[4] =
                                        IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO |
                                        IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP |
                                        IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP |
                                        IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI |
                                        IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK,
                                .phy_cap_info[5] =
                                        IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP |
                                        IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP |
                                        IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT |
                                        IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK |
                                        IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK,
                                .phy_cap_info[6] =
                                        IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK |
                                        IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK,
                                .phy_cap_info[7] =
                                        IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW,
                        },

                        /* For all MCS and bandwidth, set 8 NSS for both Tx and
                         * Rx
                         */
                        .eht_mcs_nss_supp = {
                                /*
                                 * Since B0, B1, B2 and B3 are not set in
                                 * the supported channel width set field in the
                                 * HE PHY capabilities information field the
                                 * device is a 20MHz only device on 2.4GHz band.
                                 *
                                 * NOTE(review): he_cap.phy_cap_info[0] of this
                                 * entry sets
                                 * IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G,
                                 * which looks at odds with the statement
                                 * above - confirm which one is intended.
                                 */
                                .only_20mhz = {
                                        .rx_tx_mcs7_max_nss = 0x88,
                                        .rx_tx_mcs9_max_nss = 0x88,
                                        .rx_tx_mcs11_max_nss = 0x88,
                                        .rx_tx_mcs13_max_nss = 0x88,
                                },
                        },
                        /* PPE threshold information is not supported
                         *
                         * NOTE(review): phy_cap_info[5] above sets
                         * IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT while
                         * no threshold data is provided here - confirm.
                         */
                },
        },
        {
                /* Entry 1: AP-side interface types; values mirror entry 0 */
                .types_mask = BIT(NL80211_IFTYPE_AP) |
                              BIT(NL80211_IFTYPE_P2P_GO),
                .he_cap = {
                        .has_he = true,
                        .he_cap_elem = {
                                .mac_cap_info[0] =
                                        IEEE80211_HE_MAC_CAP0_HTC_HE,
                                .mac_cap_info[1] =
                                        IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US |
                                        IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8,
                                .mac_cap_info[2] =
                                        IEEE80211_HE_MAC_CAP2_BSR |
                                        IEEE80211_HE_MAC_CAP2_MU_CASCADING |
                                        IEEE80211_HE_MAC_CAP2_ACK_EN,
                                .mac_cap_info[3] =
                                        IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
                                        IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
                                .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU,
                                .phy_cap_info[0] =
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G,
                                .phy_cap_info[1] =
                                        IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
                                        IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A |
                                        IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD |
                                        IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS,
                                .phy_cap_info[2] =
                                        IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US |
                                        IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ |
                                        IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ |
                                        IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO |
                                        IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO,

                                /* Leave all the other PHY capability bytes
                                 * unset, as DCM, beam forming, RU and PPE
                                 * threshold information are not supported
                                 */
                        },
                        /* The 80 MHz maps differ from the 160 and 80+80 MHz
                         * maps (0xfffa vs 0xffff); presumably 0xffff marks
                         * the wider widths as unsupported - TODO confirm
                         * against the HE MCS/NSS map encoding.
                         */
                        .he_mcs_nss_supp = {
                                .rx_mcs_80 = cpu_to_le16(0xfffa),
                                .tx_mcs_80 = cpu_to_le16(0xfffa),
                                .rx_mcs_160 = cpu_to_le16(0xffff),
                                .tx_mcs_160 = cpu_to_le16(0xffff),
                                .rx_mcs_80p80 = cpu_to_le16(0xffff),
                                .tx_mcs_80p80 = cpu_to_le16(0xffff),
                        },
                },
                .eht_cap = {
                        .has_eht = true,
                        /* phy_cap_info[1] and [2] are not named here and so
                         * stay zero.
                         */
                        .eht_cap_elem = {
                                .mac_cap_info[0] =
                                        IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS |
                                        IEEE80211_EHT_MAC_CAP0_OM_CONTROL |
                                        IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1,
                                .phy_cap_info[0] =
                                        IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ |
                                        IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI |
                                        IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO |
                                        IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER |
                                        IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE,
                                .phy_cap_info[3] =
                                        IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK,
                                .phy_cap_info[4] =
                                        IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO |
                                        IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP |
                                        IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP |
                                        IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI |
                                        IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK,
                                .phy_cap_info[5] =
                                        IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP |
                                        IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP |
                                        IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT |
                                        IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK |
                                        IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK,
                                .phy_cap_info[6] =
                                        IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK |
                                        IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK,
                                .phy_cap_info[7] =
                                        IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW,
                        },

                        /* For all MCS and bandwidth, set 8 NSS for both Tx and
                         * Rx
                         */
                        .eht_mcs_nss_supp = {
                                /*
                                 * Since B0, B1, B2 and B3 are not set in
                                 * the supported channel width set field in the
                                 * HE PHY capabilities information field the
                                 * device is a 20MHz only device on 2.4GHz band.
                                 *
                                 * NOTE(review): he_cap.phy_cap_info[0] of this
                                 * entry sets
                                 * IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G,
                                 * which looks at odds with the statement
                                 * above - confirm which one is intended.
                                 */
                                .only_20mhz = {
                                        .rx_tx_mcs7_max_nss = 0x88,
                                        .rx_tx_mcs9_max_nss = 0x88,
                                        .rx_tx_mcs11_max_nss = 0x88,
                                        .rx_tx_mcs13_max_nss = 0x88,
                                },
                        },
                        /* PPE threshold information is not supported
                         *
                         * NOTE(review): phy_cap_info[5] above sets
                         * IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT while
                         * no threshold data is provided here - confirm.
                         */
                },
        },
#ifdef CONFIG_MAC80211_MESH
        {
                /* Entry 2: mesh point; HE only, no eht_cap initializer.
                 * Compared with the entries above, mac_cap_info[1] and [2]
                 * carry fewer bits and phy_cap_info[2] is explicitly zero.
                 */
                .types_mask = BIT(NL80211_IFTYPE_MESH_POINT),
                .he_cap = {
                        .has_he = true,
                        .he_cap_elem = {
                                .mac_cap_info[0] =
                                        IEEE80211_HE_MAC_CAP0_HTC_HE,
                                .mac_cap_info[1] =
                                        IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8,
                                .mac_cap_info[2] =
                                        IEEE80211_HE_MAC_CAP2_ACK_EN,
                                .mac_cap_info[3] =
                                        IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
                                        IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
                                .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU,
                                .phy_cap_info[0] =
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G,
                                .phy_cap_info[1] =
                                        IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
                                        IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A |
                                        IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD |
                                        IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS,
                                .phy_cap_info[2] = 0,

                                /* Leave all the other PHY capability bytes
                                 * unset, as DCM, beam forming, RU and PPE
                                 * threshold information are not supported
                                 */
                        },
                        .he_mcs_nss_supp = {
                                .rx_mcs_80 = cpu_to_le16(0xfffa),
                                .tx_mcs_80 = cpu_to_le16(0xfffa),
                                .rx_mcs_160 = cpu_to_le16(0xffff),
                                .tx_mcs_160 = cpu_to_le16(0xffff),
                                .rx_mcs_80p80 = cpu_to_le16(0xffff),
                                .tx_mcs_80p80 = cpu_to_le16(0xffff),
                        },
                },
        },
#endif
};

static const struct ieee80211_sband_iftype_data sband_capa_5ghz[] = {
        {
                .types_mask = BIT(NL80211_IFTYPE_STATION) |
                              BIT(NL80211_IFTYPE_P2P_CLIENT),
                .he_cap = {
                        .has_he = true,
                        .he_cap_elem = {
                                .mac_cap_info[0] =
                                        IEEE80211_HE_MAC_CAP0_HTC_HE,
                                .mac_cap_info[1] =
                                        IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US |
                                        IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8,
                                .mac_cap_info[2] =
                                        IEEE80211_HE_MAC_CAP2_BSR |
                                        IEEE80211_HE_MAC_CAP2_MU_CASCADING |
                                        IEEE80211_HE_MAC_CAP2_ACK_EN,
                                .mac_cap_info[3] =
                                        IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
                                        IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
                                .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU,
                                .phy_cap_info[0] =
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G |
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G,
                                .phy_cap_info[1] =
                                        IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
                                        IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A |
                                        IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD |
                                        IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS,
                                .phy_cap_info[2] =
                                        IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US |
                                        IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ |
                                        IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ |
                                        IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO |
                                        IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO,

                                /* Leave all the other PHY capability bytes
                                 * unset, as DCM, beam forming, RU and PPE
                                 * threshold information are not supported
                                 */
                        },
                        .he_mcs_nss_supp = {
                                .rx_mcs_80 = cpu_to_le16(0xfffa),
                                .tx_mcs_80 = cpu_to_le16(0xfffa),
                                .rx_mcs_160 = cpu_to_le16(0xfffa),
                                .tx_mcs_160 = cpu_to_le16(0xfffa),
                                .rx_mcs_80p80 = cpu_to_le16(0xfffa),
                                .tx_mcs_80p80 = cpu_to_le16(0xfffa),
                        },
                },
                .eht_cap = {
                        .has_eht = true,
                        .eht_cap_elem = {
                                .mac_cap_info[0] =
                                        IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS |
                                        IEEE80211_EHT_MAC_CAP0_OM_CONTROL |
                                        IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1,
                                .phy_cap_info[0] =
                                        IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ |
                                        IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI |
                                        IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO |
                                        IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER |
                                        IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE |
                                        IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK,
                                .phy_cap_info[1] =
                                        IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK |
                                        IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK,
                                .phy_cap_info[2] =
                                        IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK |
                                        IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK,
                                .phy_cap_info[3] =
                                        IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK,
                                .phy_cap_info[4] =
                                        IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO |
                                        IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP |
                                        IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP |
                                        IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI |
                                        IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK,
                                .phy_cap_info[5] =
                                        IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP |
                                        IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP |
                                        IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT |
                                        IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK |
                                        IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK,
                                .phy_cap_info[6] =
                                        IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK |
                                        IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK,
                                .phy_cap_info[7] =
                                        IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW |
                                        IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ |
                                        IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ |
                                        IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ |
                                        IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ,
                        },

                        /* For all MCS and bandwidth, set 8 NSS for both Tx and
                         * Rx
                         */
                        .eht_mcs_nss_supp = {
                                /*
                                 * As B1 and B2 are set in the supported
                                 * channel width set field in the HE PHY
                                 * capabilities information field include all
                                 * the following MCS/NSS.
                                 */
                                .bw._80 = {
                                        .rx_tx_mcs9_max_nss = 0x88,
                                        .rx_tx_mcs11_max_nss = 0x88,
                                        .rx_tx_mcs13_max_nss = 0x88,
                                },
                                .bw._160 = {
                                        .rx_tx_mcs9_max_nss = 0x88,
                                        .rx_tx_mcs11_max_nss = 0x88,
                                        .rx_tx_mcs13_max_nss = 0x88,
                                },
                        },
                        /* PPE threshold information is not supported */
                },
        },
        {
                .types_mask = BIT(NL80211_IFTYPE_AP) |
                              BIT(NL80211_IFTYPE_P2P_GO),
                .he_cap = {
                        .has_he = true,
                        .he_cap_elem = {
                                .mac_cap_info[0] =
                                        IEEE80211_HE_MAC_CAP0_HTC_HE,
                                .mac_cap_info[1] =
                                        IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US |
                                        IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8,
                                .mac_cap_info[2] =
                                        IEEE80211_HE_MAC_CAP2_BSR |
                                        IEEE80211_HE_MAC_CAP2_MU_CASCADING |
                                        IEEE80211_HE_MAC_CAP2_ACK_EN,
                                .mac_cap_info[3] =
                                        IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
                                        IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
                                .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU,
                                .phy_cap_info[0] =
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G |
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G,
                                .phy_cap_info[1] =
                                        IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
                                        IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A |
                                        IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD |
                                        IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS,
                                .phy_cap_info[2] =
                                        IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US |
                                        IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ |
                                        IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ |
                                        IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO |
                                        IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO,

                                /* Leave all the other PHY capability bytes
                                 * unset, as DCM, beam forming, RU and PPE
                                 * threshold information are not supported
                                 */
                        },
                        .he_mcs_nss_supp = {
                                .rx_mcs_80 = cpu_to_le16(0xfffa),
                                .tx_mcs_80 = cpu_to_le16(0xfffa),
                                .rx_mcs_160 = cpu_to_le16(0xfffa),
                                .tx_mcs_160 = cpu_to_le16(0xfffa),
                                .rx_mcs_80p80 = cpu_to_le16(0xfffa),
                                .tx_mcs_80p80 = cpu_to_le16(0xfffa),
                        },
                },
                .eht_cap = {
                        .has_eht = true,
                        .eht_cap_elem = {
                                .mac_cap_info[0] =
                                        IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS |
                                        IEEE80211_EHT_MAC_CAP0_OM_CONTROL |
                                        IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1,
                                .phy_cap_info[0] =
                                        IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ |
                                        IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI |
                                        IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO |
                                        IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER |
                                        IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE |
                                        IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK,
                                .phy_cap_info[1] =
                                        IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK |
                                        IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK,
                                .phy_cap_info[2] =
                                        IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK |
                                        IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK,
                                .phy_cap_info[3] =
                                        IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK,
                                .phy_cap_info[4] =
                                        IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO |
                                        IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP |
                                        IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP |
                                        IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI |
                                        IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK,
                                .phy_cap_info[5] =
                                        IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP |
                                        IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP |
                                        IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT |
                                        IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK |
                                        IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK,
                                .phy_cap_info[6] =
                                        IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK |
                                        IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK,
                                .phy_cap_info[7] =
                                        IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW |
                                        IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ |
                                        IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ |
                                        IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ |
                                        IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ,
                        },

                        /* For all MCS and bandwidth, set 8 NSS for both Tx and
                         * Rx
                         */
                        .eht_mcs_nss_supp = {
                                /*
                                 * As B1 and B2 are set in the supported
                                 * channel width set field in the HE PHY
                                 * capabilities information field include all
                                 * the following MCS/NSS.
                                 */
                                .bw._80 = {
                                        .rx_tx_mcs9_max_nss = 0x88,
                                        .rx_tx_mcs11_max_nss = 0x88,
                                        .rx_tx_mcs13_max_nss = 0x88,
                                },
                                .bw._160 = {
                                        .rx_tx_mcs9_max_nss = 0x88,
                                        .rx_tx_mcs11_max_nss = 0x88,
                                        .rx_tx_mcs13_max_nss = 0x88,
                                },
                        },
                        /* PPE threshold information is not supported */
                },
        },
#ifdef CONFIG_MAC80211_MESH
        {
                /* TODO: should we support other types, e.g., IBSS?*/
                .types_mask = BIT(NL80211_IFTYPE_MESH_POINT),
                .he_cap = {
                        .has_he = true,
                        .he_cap_elem = {
                                .mac_cap_info[0] =
                                        IEEE80211_HE_MAC_CAP0_HTC_HE,
                                .mac_cap_info[1] =
                                        IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8,
                                .mac_cap_info[2] =
                                        IEEE80211_HE_MAC_CAP2_ACK_EN,
                                .mac_cap_info[3] =
                                        IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
                                        IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
                                .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU,
                                .phy_cap_info[0] =
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G |
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G,
                                .phy_cap_info[1] =
                                        IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
                                        IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A |
                                        IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD |
                                        IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS,
                                .phy_cap_info[2] = 0,

                                /* Leave all the other PHY capability bytes
                                 * unset, as DCM, beam forming, RU and PPE
                                 * threshold information are not supported
                                 */
                        },
                        .he_mcs_nss_supp = {
                                .rx_mcs_80 = cpu_to_le16(0xfffa),
                                .tx_mcs_80 = cpu_to_le16(0xfffa),
                                .rx_mcs_160 = cpu_to_le16(0xfffa),
                                .tx_mcs_160 = cpu_to_le16(0xfffa),
                                .rx_mcs_80p80 = cpu_to_le16(0xfffa),
                                .tx_mcs_80p80 = cpu_to_le16(0xfffa),
                        },
                },
        },
#endif
};

/*
 * Per-interface-type capability data for the 6 GHz band; this is the table
 * that mac80211_hwsim_sband_capab() installs for NL80211_BAND_6GHZ.
 *
 * There is one entry per group of interface types (see each entry's
 * .types_mask):
 *   - station and P2P client,
 *   - AP and P2P GO,
 *   - mesh point, present only when CONFIG_MAC80211_MESH is set.
 *
 * The first two entries fill in HE 6 GHz, HE and EHT capabilities (including
 * the 320 MHz EHT MCS/NSS set); the mesh entry fills in HE 6 GHz and HE
 * capabilities only and has no .eht_cap initializer.
 */
static const struct ieee80211_sband_iftype_data sband_capa_6ghz[] = {
        {
                /* Entry for station and P2P client interfaces */
                .types_mask = BIT(NL80211_IFTYPE_STATION) |
                              BIT(NL80211_IFTYPE_P2P_CLIENT),
                .he_6ghz_capa = {
                        .capa = cpu_to_le16(IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START |
                                            IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP |
                                            IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN |
                                            IEEE80211_HE_6GHZ_CAP_SM_PS |
                                            IEEE80211_HE_6GHZ_CAP_RD_RESPONDER |
                                            IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS |
                                            IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS),
                },
                .he_cap = {
                        .has_he = true,
                        .he_cap_elem = {
                                .mac_cap_info[0] =
                                        IEEE80211_HE_MAC_CAP0_HTC_HE,
                                .mac_cap_info[1] =
                                        IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US |
                                        IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8,
                                .mac_cap_info[2] =
                                        IEEE80211_HE_MAC_CAP2_BSR |
                                        IEEE80211_HE_MAC_CAP2_MU_CASCADING |
                                        IEEE80211_HE_MAC_CAP2_ACK_EN,
                                .mac_cap_info[3] =
                                        IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
                                        IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
                                .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU,
                                .phy_cap_info[0] =
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G |
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G,
                                .phy_cap_info[1] =
                                        IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
                                        IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A |
                                        IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD |
                                        IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS,
                                .phy_cap_info[2] =
                                        IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US |
                                        IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ |
                                        IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ |
                                        IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO |
                                        IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO,

                                /* Leave all the other PHY capability bytes
                                 * unset, as DCM, beam forming, RU and PPE
                                 * threshold information are not supported
                                 */
                        },
                        /* The same value is used for every HE Rx/Tx MCS map */
                        .he_mcs_nss_supp = {
                                .rx_mcs_80 = cpu_to_le16(0xfffa),
                                .tx_mcs_80 = cpu_to_le16(0xfffa),
                                .rx_mcs_160 = cpu_to_le16(0xfffa),
                                .tx_mcs_160 = cpu_to_le16(0xfffa),
                                .rx_mcs_80p80 = cpu_to_le16(0xfffa),
                                .tx_mcs_80p80 = cpu_to_le16(0xfffa),
                        },
                },
                .eht_cap = {
                        .has_eht = true,
                        .eht_cap_elem = {
                                .mac_cap_info[0] =
                                        IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS |
                                        IEEE80211_EHT_MAC_CAP0_OM_CONTROL |
                                        IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1,
                                .phy_cap_info[0] =
                                        IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ |
                                        IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ |
                                        IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI |
                                        IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO |
                                        IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER |
                                        IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE |
                                        IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK,
                                .phy_cap_info[1] =
                                        IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK |
                                        IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK |
                                        IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK,
                                .phy_cap_info[2] =
                                        IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK |
                                        IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK |
                                        IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK,
                                .phy_cap_info[3] =
                                        IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK,
                                .phy_cap_info[4] =
                                        IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO |
                                        IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP |
                                        IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP |
                                        IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI |
                                        IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK,
                                .phy_cap_info[5] =
                                        IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP |
                                        IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP |
                                        IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT |
                                        IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK |
                                        IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK,
                                .phy_cap_info[6] =
                                        IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK |
                                        IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK |
                                        IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP,
                                .phy_cap_info[7] =
                                        IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW |
                                        IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ |
                                        IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ |
                                        IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ |
                                        IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ |
                                        IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ |
                                        IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ,
                        },

                        /* For all MCS and bandwidth, set 8 NSS for both Tx and
                         * Rx (0x88 stores the value 8 in both 4-bit halves of
                         * each byte)
                         */
                        .eht_mcs_nss_supp = {
                                /*
                                 * As B1 and B2 are set in the supported
                                 * channel width set field in the HE PHY
                                 * capabilities information field and 320MHz in
                                 * 6GHz is supported include all the following
                                 * MCS/NSS.
                                 */
                                .bw._80 = {
                                        .rx_tx_mcs9_max_nss = 0x88,
                                        .rx_tx_mcs11_max_nss = 0x88,
                                        .rx_tx_mcs13_max_nss = 0x88,
                                },
                                .bw._160 = {
                                        .rx_tx_mcs9_max_nss = 0x88,
                                        .rx_tx_mcs11_max_nss = 0x88,
                                        .rx_tx_mcs13_max_nss = 0x88,
                                },
                                .bw._320 = {
                                        .rx_tx_mcs9_max_nss = 0x88,
                                        .rx_tx_mcs11_max_nss = 0x88,
                                        .rx_tx_mcs13_max_nss = 0x88,
                                },
                        },
                        /* PPE threshold information is not supported.
                         * NOTE(review): phy_cap_info[5] above nevertheless
                         * sets IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT
                         * while no PPE threshold data is initialized in this
                         * entry - confirm this combination is intended.
                         */
                },
        },
        {
                /* Entry for AP and P2P GO interfaces */
                .types_mask = BIT(NL80211_IFTYPE_AP) |
                              BIT(NL80211_IFTYPE_P2P_GO),
                .he_6ghz_capa = {
                        .capa = cpu_to_le16(IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START |
                                            IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP |
                                            IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN |
                                            IEEE80211_HE_6GHZ_CAP_SM_PS |
                                            IEEE80211_HE_6GHZ_CAP_RD_RESPONDER |
                                            IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS |
                                            IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS),
                },
                .he_cap = {
                        .has_he = true,
                        .he_cap_elem = {
                                .mac_cap_info[0] =
                                        IEEE80211_HE_MAC_CAP0_HTC_HE,
                                .mac_cap_info[1] =
                                        IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US |
                                        IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8,
                                .mac_cap_info[2] =
                                        IEEE80211_HE_MAC_CAP2_BSR |
                                        IEEE80211_HE_MAC_CAP2_MU_CASCADING |
                                        IEEE80211_HE_MAC_CAP2_ACK_EN,
                                .mac_cap_info[3] =
                                        IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
                                        IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
                                .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU,
                                .phy_cap_info[0] =
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G |
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G,
                                .phy_cap_info[1] =
                                        IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
                                        IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A |
                                        IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD |
                                        IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS,
                                .phy_cap_info[2] =
                                        IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US |
                                        IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ |
                                        IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ |
                                        IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO |
                                        IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO,

                                /* Leave all the other PHY capability bytes
                                 * unset, as DCM, beam forming, RU and PPE
                                 * threshold information are not supported
                                 */
                        },
                        /* The same value is used for every HE Rx/Tx MCS map */
                        .he_mcs_nss_supp = {
                                .rx_mcs_80 = cpu_to_le16(0xfffa),
                                .tx_mcs_80 = cpu_to_le16(0xfffa),
                                .rx_mcs_160 = cpu_to_le16(0xfffa),
                                .tx_mcs_160 = cpu_to_le16(0xfffa),
                                .rx_mcs_80p80 = cpu_to_le16(0xfffa),
                                .tx_mcs_80p80 = cpu_to_le16(0xfffa),
                        },
                },
                .eht_cap = {
                        .has_eht = true,
                        .eht_cap_elem = {
                                .mac_cap_info[0] =
                                        IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS |
                                        IEEE80211_EHT_MAC_CAP0_OM_CONTROL |
                                        IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1,
                                .phy_cap_info[0] =
                                        IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ |
                                        IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ |
                                        IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI |
                                        IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO |
                                        IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER |
                                        IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE |
                                        IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK,
                                .phy_cap_info[1] =
                                        IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK |
                                        IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK |
                                        IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK,
                                .phy_cap_info[2] =
                                        IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK |
                                        IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK |
                                        IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK,
                                .phy_cap_info[3] =
                                        IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK |
                                        IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK,
                                .phy_cap_info[4] =
                                        IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO |
                                        IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP |
                                        IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP |
                                        IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI |
                                        IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK,
                                .phy_cap_info[5] =
                                        IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK |
                                        IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP |
                                        IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP |
                                        IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT |
                                        IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK |
                                        IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK,
                                .phy_cap_info[6] =
                                        IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK |
                                        IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK |
                                        IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP,
                                .phy_cap_info[7] =
                                        IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW |
                                        IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ |
                                        IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ |
                                        IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ |
                                        IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ |
                                        IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ |
                                        IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ,
                        },

                        /* For all MCS and bandwidth, set 8 NSS for both Tx and
                         * Rx (0x88 stores the value 8 in both 4-bit halves of
                         * each byte)
                         */
                        .eht_mcs_nss_supp = {
                                /*
                                 * As B1 and B2 are set in the supported
                                 * channel width set field in the HE PHY
                                 * capabilities information field and 320MHz in
                                 * 6GHz is supported include all the following
                                 * MCS/NSS.
                                 */
                                .bw._80 = {
                                        .rx_tx_mcs9_max_nss = 0x88,
                                        .rx_tx_mcs11_max_nss = 0x88,
                                        .rx_tx_mcs13_max_nss = 0x88,
                                },
                                .bw._160 = {
                                        .rx_tx_mcs9_max_nss = 0x88,
                                        .rx_tx_mcs11_max_nss = 0x88,
                                        .rx_tx_mcs13_max_nss = 0x88,
                                },
                                .bw._320 = {
                                        .rx_tx_mcs9_max_nss = 0x88,
                                        .rx_tx_mcs11_max_nss = 0x88,
                                        .rx_tx_mcs13_max_nss = 0x88,
                                },
                        },
                        /* PPE threshold information is not supported.
                         * NOTE(review): phy_cap_info[5] above nevertheless
                         * sets IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT
                         * while no PPE threshold data is initialized in this
                         * entry - confirm this combination is intended.
                         */
                },
        },
#ifdef CONFIG_MAC80211_MESH
        {
                /* Entry for mesh point interfaces: HE 6 GHz and HE
                 * capabilities only, no EHT capabilities are filled in.
                 */
                /* TODO: should we support other types, e.g., IBSS? */
                .types_mask = BIT(NL80211_IFTYPE_MESH_POINT),
                .he_6ghz_capa = {
                        .capa = cpu_to_le16(IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START |
                                            IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP |
                                            IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN |
                                            IEEE80211_HE_6GHZ_CAP_SM_PS |
                                            IEEE80211_HE_6GHZ_CAP_RD_RESPONDER |
                                            IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS |
                                            IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS),
                },
                .he_cap = {
                        .has_he = true,
                        .he_cap_elem = {
                                .mac_cap_info[0] =
                                        IEEE80211_HE_MAC_CAP0_HTC_HE,
                                .mac_cap_info[1] =
                                        IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8,
                                .mac_cap_info[2] =
                                        IEEE80211_HE_MAC_CAP2_ACK_EN,
                                .mac_cap_info[3] =
                                        IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
                                        IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
                                .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU,
                                .phy_cap_info[0] =
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G |
                                        IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G,
                                .phy_cap_info[1] =
                                        IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
                                        IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A |
                                        IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD |
                                        IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS,
                                /* Unlike the entries above, byte 2 is
                                 * explicitly cleared for mesh.
                                 */
                                .phy_cap_info[2] = 0,

                                /* Leave all the other PHY capability bytes
                                 * unset, as DCM, beam forming, RU and PPE
                                 * threshold information are not supported
                                 */
                        },
                        /* The same value is used for every HE Rx/Tx MCS map */
                        .he_mcs_nss_supp = {
                                .rx_mcs_80 = cpu_to_le16(0xfffa),
                                .tx_mcs_80 = cpu_to_le16(0xfffa),
                                .rx_mcs_160 = cpu_to_le16(0xfffa),
                                .tx_mcs_160 = cpu_to_le16(0xfffa),
                                .rx_mcs_80p80 = cpu_to_le16(0xfffa),
                                .tx_mcs_80p80 = cpu_to_le16(0xfffa),
                        },
                },
        },
#endif
};

/*
 * Install the per-interface-type capability table that belongs to the
 * band described by @sband:
 *
 *   NL80211_BAND_2GHZ -> sband_capa_2ghz
 *   NL80211_BAND_5GHZ -> sband_capa_5ghz
 *   NL80211_BAND_6GHZ -> sband_capa_6ghz
 *
 * @sband is left untouched for any other band.
 *
 * Each table is passed by its array name directly at the call site rather
 * than through a pointer, in case ieee80211_set_sband_iftype_data() derives
 * the element count from the array type - TODO confirm against its
 * definition.
 */
static void mac80211_hwsim_sband_capab(struct ieee80211_supported_band *sband)
{
        if (sband->band == NL80211_BAND_2GHZ) {
                ieee80211_set_sband_iftype_data(sband, sband_capa_2ghz);
                return;
        }

        if (sband->band == NL80211_BAND_5GHZ) {
                ieee80211_set_sband_iftype_data(sband, sband_capa_5ghz);
                return;
        }

        if (sband->band == NL80211_BAND_6GHZ) {
                ieee80211_set_sband_iftype_data(sband, sband_capa_6ghz);
                return;
        }

        /* No capability table exists for the remaining bands. */
}

/*
 * Interface-type bit for mesh points.  It collapses to 0 when
 * CONFIG_MAC80211_MESH is not set, so users can OR it into a mask
 * without an #ifdef of their own.
 */
#ifndef CONFIG_MAC80211_MESH
#define HWSIM_MESH_BIT 0
#else
#define HWSIM_MESH_BIT BIT(NL80211_IFTYPE_MESH_POINT)
#endif

/*
 * Bitmask of interface types: station and AP, their P2P counterparts
 * (P2P client and P2P GO), plus mesh point when HWSIM_MESH_BIT is
 * non-zero.
 */
#define HWSIM_DEFAULT_IF_LIMIT                                  \
        (HWSIM_MESH_BIT |                                       \
         BIT(NL80211_IFTYPE_AP) | BIT(NL80211_IFTYPE_P2P_GO) |  \
         BIT(NL80211_IFTYPE_STATION) | BIT(NL80211_IFTYPE_P2P_CLIENT))

/*
 * Bitmask of interface types: ad-hoc, station, AP, mesh point, P2P
 * client, P2P GO and OCB.  The mesh point bit is included here
 * regardless of CONFIG_MAC80211_MESH.
 */
#define HWSIM_IFTYPE_SUPPORT_MASK                                       \
        (BIT(NL80211_IFTYPE_ADHOC) | BIT(NL80211_IFTYPE_OCB) |          \
         BIT(NL80211_IFTYPE_MESH_POINT) |                               \
         BIT(NL80211_IFTYPE_STATION) | BIT(NL80211_IFTYPE_P2P_CLIENT) | \
         BIT(NL80211_IFTYPE_AP) | BIT(NL80211_IFTYPE_P2P_GO))

/*
 * Extended-capabilities byte array referenced by the AP entry of
 * mac80211_hwsim_iftypes_ext_capa[].  Each WLAN_EXT_CAPA<N>_* flag is
 * stored at index N - 1; indices without an initializer are implicitly
 * zero, and the array length is set by the highest index used.
 */
static const u8 iftypes_ext_capa_ap[] = {
        [0] = WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING,
        [2] = WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT,
        /* The max-MSDU-in-A-MSDU value is split: LSB here, MSB in [8] */
        [7] = WLAN_EXT_CAPA8_OPMODE_NOTIF |
              WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB,
        [8] = WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB,
        [9] = WLAN_EXT_CAPA10_TWT_RESPONDER_SUPPORT,
};

/*
 * Constant used for .mld_capa_and_ops in mac80211_hwsim_iftypes_ext_capa[].
 * It packs two fields with FIELD_PREP_CONST():
 *   - IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP, set to the _SAME value
 *   - IEEE80211_MLD_CAP_OP_MAX_SIMUL_LINKS, set to
 *     IEEE80211_MLD_MAX_NUM_LINKS - 1
 *
 * The whole expansion is parenthesized so the macro acts as a single
 * operand; without that, "MACRO & mask" would bind the mask to the second
 * field only, because '&' has higher precedence than '|'.
 */
#define MAC80211_HWSIM_MLD_CAPA_OPS                                \
        (FIELD_PREP_CONST(IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP, \
                          IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_SAME) | \
         FIELD_PREP_CONST(IEEE80211_MLD_CAP_OP_MAX_SIMUL_LINKS, \
                          IEEE80211_MLD_MAX_NUM_LINKS - 1))

/*
 * Per-interface-type extended capabilities.  Only an entry for
 * NL80211_IFTYPE_AP is defined here.
 */
static const struct wiphy_iftype_ext_capab mac80211_hwsim_iftypes_ext_capa[] = {
        {
                .iftype = NL80211_IFTYPE_AP,

                /* EML and MLD capability/operation fields */
                .mld_capa_and_ops = MAC80211_HWSIM_MLD_CAPA_OPS,
                .eml_capabilities = IEEE80211_EML_CAP_EMLSR_SUPP |
                                    IEEE80211_EML_CAP_EMLMR_SUPPORT,

                /*
                 * The same byte array serves as both the capability
                 * values and their mask, so the mask covers exactly the
                 * bits that are set.
                 */
                .extended_capabilities_len = sizeof(iftypes_ext_capa_ap),
                .extended_capabilities_mask = iftypes_ext_capa_ap,
                .extended_capabilities = iftypes_ext_capa_ap,
        },
};

/*
 * mac80211_hwsim_new_radio - allocate, configure and register a simulated radio
 * @info: generic netlink request that triggered the creation, or NULL. With
 *        NULL the radio is placed in init_net and no owning portid is recorded.
 * @param: creation parameters. param->hwname is overwritten with the name the
 *         wiphy actually got.
 *
 * Sets up the backing device, MAC addresses, interface limits and
 * combinations, hardware flags, per-band capabilities, regulatory settings
 * and debugfs entries, registers the hardware with mac80211 and finally
 * inserts the radio into the global hash table and list.
 *
 * Returns the index of the new radio on success or a negative errno on
 * failure. On failure everything set up so far is torn down again through
 * the label chain at the end of the function.
 */
static int mac80211_hwsim_new_radio(struct genl_info *info,
                                    struct hwsim_new_radio_params *param)
{
        int err;
        u8 addr[ETH_ALEN];
        struct mac80211_hwsim_data *data;
        struct ieee80211_hw *hw;
        enum nl80211_band band;
        const struct ieee80211_ops *ops = &mac80211_hwsim_ops;
        struct net *net;
        int idx, i;
        int n_limits = 0;

        /* more than one channel is only valid together with channel contexts */
        if (WARN_ON(param->channels > 1 && !param->use_chanctx))
                return -EINVAL;

        /* reserve a radio index; it stays consumed even if creation fails */
        spin_lock_bh(&hwsim_radio_lock);
        idx = hwsim_radio_idx++;
        spin_unlock_bh(&hwsim_radio_lock);

        /* pick the ops table: MLO takes precedence over plain chanctx */
        if (param->mlo)
                ops = &mac80211_hwsim_mlo_ops;
        else if (param->use_chanctx)
                ops = &mac80211_hwsim_mchan_ops;
        hw = ieee80211_alloc_hw_nm(sizeof(*data), ops, param->hwname);
        if (!hw) {
                pr_debug("mac80211_hwsim: ieee80211_alloc_hw failed\n");
                err = -ENOMEM;
                goto failed;
        }

        /* ieee80211_alloc_hw_nm may have used a default name */
        param->hwname = wiphy_name(hw->wiphy);

        if (info)
                net = genl_info_net(info);
        else
                net = &init_net;
        wiphy_net_set(hw->wiphy, net);

        /* the driver private area of hw holds the hwsim radio state */
        data = hw->priv;
        data->hw = hw;

        data->dev = device_create(hwsim_class, NULL, 0, hw, "hwsim%d", idx);
        if (IS_ERR(data->dev)) {
                printk(KERN_DEBUG
                       "mac80211_hwsim: device_create failed (%ld)\n",
                       PTR_ERR(data->dev));
                /* the specific error is only logged; -ENOMEM is returned */
                err = -ENOMEM;
                goto failed_drvdata;
        }
        data->dev->driver = &mac80211_hwsim_driver.driver;
        err = device_bind_driver(data->dev);
        if (err != 0) {
                pr_debug("mac80211_hwsim: device_bind_driver failed (%d)\n",
                       err);
                goto failed_bind;
        }

        skb_queue_head_init(&data->pending);

        SET_IEEE80211_DEV(hw, data->dev);
        if (!param->perm_addr) {
                /*
                 * No address given: generate 02:00:00:xx:xx:00 with the
                 * radio index in bytes 3 and 4.
                 */
                eth_zero_addr(addr);
                addr[0] = 0x02;
                addr[3] = idx >> 8;
                addr[4] = idx;
                memcpy(data->addresses[0].addr, addr, ETH_ALEN);
                /* Why is a second address needed here? */
                memcpy(data->addresses[1].addr, addr, ETH_ALEN);
                data->addresses[1].addr[0] |= 0x40;
                hw->wiphy->n_addresses = 2;
                hw->wiphy->addresses = data->addresses;
                /* possible address clash is checked at hash table insertion */
        } else {
                memcpy(data->addresses[0].addr, param->perm_addr, ETH_ALEN);
                /* compatibility with automatically generated mac addr */
                memcpy(data->addresses[1].addr, param->perm_addr, ETH_ALEN);
                hw->wiphy->n_addresses = 2;
                hw->wiphy->addresses = data->addresses;
        }

        data->channels = param->channels;
        data->use_chanctx = param->use_chanctx;
        data->idx = idx;
        data->destroy_on_close = param->destroy_on_close;
        if (info)
                data->portid = info->snd_portid;

        /* setup interface limits, only on interface types we support */
        if (param->iftypes & BIT(NL80211_IFTYPE_ADHOC)) {
                data->if_limits[n_limits].max = 1;
                data->if_limits[n_limits].types = BIT(NL80211_IFTYPE_ADHOC);
                n_limits++;
        }

        if (param->iftypes & HWSIM_DEFAULT_IF_LIMIT) {
                data->if_limits[n_limits].max = 2048;
                /*
                 * For this case, we may only support a subset of
                 * HWSIM_DEFAULT_IF_LIMIT, therefore we only want to add the
                 * bits that both param->iftypes & HWSIM_DEFAULT_IF_LIMIT have.
                 */
                data->if_limits[n_limits].types =
                                        HWSIM_DEFAULT_IF_LIMIT & param->iftypes;
                n_limits++;
        }

        if (param->iftypes & BIT(NL80211_IFTYPE_P2P_DEVICE)) {
                data->if_limits[n_limits].max = 1;
                data->if_limits[n_limits].types =
                                                BIT(NL80211_IFTYPE_P2P_DEVICE);
                n_limits++;
        }

        /*
         * With channel contexts: several channels, no radar detection.
         * Without: a single channel, radar detection on the listed widths.
         */
        if (data->use_chanctx) {
                hw->wiphy->max_scan_ssids = 255;
                hw->wiphy->max_scan_ie_len = IEEE80211_MAX_DATA_LEN;
                hw->wiphy->max_remain_on_channel_duration = 1000;
                data->if_combination.radar_detect_widths = 0;
                data->if_combination.num_different_channels = data->channels;
        } else {
                data->if_combination.num_different_channels = 1;
                data->if_combination.radar_detect_widths =
                                        BIT(NL80211_CHAN_WIDTH_5) |
                                        BIT(NL80211_CHAN_WIDTH_10) |
                                        BIT(NL80211_CHAN_WIDTH_20_NOHT) |
                                        BIT(NL80211_CHAN_WIDTH_20) |
                                        BIT(NL80211_CHAN_WIDTH_40) |
                                        BIT(NL80211_CHAN_WIDTH_80) |
                                        BIT(NL80211_CHAN_WIDTH_160);
        }

        /* none of the requested interface types produced a limit entry */
        if (!n_limits) {
                err = -EINVAL;
                goto failed_hw;
        }

        /* the combination allows as many interfaces as all limits together */
        data->if_combination.max_interfaces = 0;
        for (i = 0; i < n_limits; i++)
                data->if_combination.max_interfaces +=
                        data->if_limits[i].max;

        data->if_combination.n_limits = n_limits;
        data->if_combination.limits = data->if_limits;

        /*
         * If we actually were asked to support combinations,
         * advertise them - if there's only a single thing like
         * only IBSS then don't advertise it as combinations.
         */
        if (data->if_combination.max_interfaces > 1) {
                hw->wiphy->iface_combinations = &data->if_combination;
                hw->wiphy->n_iface_combinations = 1;
        }

        /*
         * Copy the cipher list into the radio's own storage.
         * NOTE(review): n_ciphers is not bounds-checked here; presumably
         * the caller validated it against the size of data->ciphers.
         */
        if (param->ciphers) {
                memcpy(data->ciphers, param->ciphers,
                       param->n_ciphers * sizeof(u32));
                hw->wiphy->cipher_suites = data->ciphers;
                hw->wiphy->n_cipher_suites = param->n_ciphers;
        }

        hw->wiphy->mbssid_max_interfaces = 8;
        hw->wiphy->ema_max_profile_periodicity = 3;

        data->rx_rssi = DEFAULT_RX_RSSI;

        INIT_DELAYED_WORK(&data->roc_start, hw_roc_start);
        INIT_DELAYED_WORK(&data->roc_done, hw_roc_done);
        INIT_DELAYED_WORK(&data->hw_scan, hw_scan_work);

        /* five queues in total; queue index 4 is used for off-channel TX */
        hw->queues = 5;
        hw->offchannel_tx_hw_queue = 4;

        /* hardware flags common to all radio flavours */
        ieee80211_hw_set(hw, SUPPORT_FAST_XMIT);
        ieee80211_hw_set(hw, CHANCTX_STA_CSA);
        ieee80211_hw_set(hw, SUPPORTS_HT_CCK_RATES);
        ieee80211_hw_set(hw, QUEUE_CONTROL);
        ieee80211_hw_set(hw, WANT_MONITOR_VIF);
        ieee80211_hw_set(hw, AMPDU_AGGREGATION);
        ieee80211_hw_set(hw, MFP_CAPABLE);
        ieee80211_hw_set(hw, SIGNAL_DBM);
        ieee80211_hw_set(hw, SUPPORTS_PS);
        ieee80211_hw_set(hw, REPORTS_TX_ACK_STATUS);
        ieee80211_hw_set(hw, TDLS_WIDER_BW);
        ieee80211_hw_set(hw, SUPPORTS_MULTI_BSSID);

        /* MLO and non-MLO radios get different hardware flag sets */
        if (param->mlo) {
                hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_MLO;
                ieee80211_hw_set(hw, HAS_RATE_CONTROL);
                ieee80211_hw_set(hw, SUPPORTS_DYNAMIC_PS);
                ieee80211_hw_set(hw, CONNECTION_MONITOR);
                ieee80211_hw_set(hw, AP_LINK_PS);

                hw->wiphy->iftype_ext_capab = mac80211_hwsim_iftypes_ext_capa;
                hw->wiphy->num_iftype_ext_capab =
                        ARRAY_SIZE(mac80211_hwsim_iftypes_ext_capa);
        } else {
                ieee80211_hw_set(hw, HOST_BROADCAST_PS_BUFFERING);
                ieee80211_hw_set(hw, PS_NULLFUNC_STACK);
                if (rctbl)
                        ieee80211_hw_set(hw, SUPPORTS_RC_TABLE);
        }

        hw->wiphy->flags &= ~WIPHY_FLAG_PS_ON_BY_DEFAULT;
        hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_TDLS |
                            WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL |
                            WIPHY_FLAG_AP_UAPSD |
                            WIPHY_FLAG_SUPPORTS_5_10_MHZ |
                            WIPHY_FLAG_HAS_CHANNEL_SWITCH;
        hw->wiphy->features |= NL80211_FEATURE_ACTIVE_MONITOR |
                               NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE |
                               NL80211_FEATURE_STATIC_SMPS |
                               NL80211_FEATURE_DYNAMIC_SMPS |
                               NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR;
        wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_VHT_IBSS);
        wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION);
        wiphy_ext_feature_set(hw->wiphy,
                              NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS);
        wiphy_ext_feature_set(hw->wiphy,
                              NL80211_EXT_FEATURE_BEACON_RATE_LEGACY);
        wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER);

        wiphy_ext_feature_set(hw->wiphy,
                              NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT);
        wiphy_ext_feature_set(hw->wiphy,
                              NL80211_EXT_FEATURE_BSS_COLOR);

        hw->wiphy->interface_modes = param->iftypes;

        /* ask mac80211 to reserve space for magic */
        hw->vif_data_size = sizeof(struct hwsim_vif_priv);
        hw->sta_data_size = sizeof(struct hwsim_sta_priv);
        hw->chanctx_data_size = sizeof(struct hwsim_chanctx_priv);

        /* each radio works on its own copy of the channel and rate tables */
        memcpy(data->channels_2ghz, hwsim_channels_2ghz,
                sizeof(hwsim_channels_2ghz));
        memcpy(data->channels_5ghz, hwsim_channels_5ghz,
                sizeof(hwsim_channels_5ghz));
        memcpy(data->channels_6ghz, hwsim_channels_6ghz,
                sizeof(hwsim_channels_6ghz));
        memcpy(data->channels_s1g, hwsim_channels_s1g,
               sizeof(hwsim_channels_s1g));
        memcpy(data->rates, hwsim_rates, sizeof(hwsim_rates));

        /* fill in and publish one supported-band structure per known band */
        for (band = NL80211_BAND_2GHZ; band < NUM_NL80211_BANDS; band++) {
                struct ieee80211_supported_band *sband = &data->bands[band];

                sband->band = band;

                switch (band) {
                case NL80211_BAND_2GHZ:
                        sband->channels = data->channels_2ghz;
                        sband->n_channels = ARRAY_SIZE(hwsim_channels_2ghz);
                        sband->bitrates = data->rates;
                        sband->n_bitrates = ARRAY_SIZE(hwsim_rates);
                        break;
                case NL80211_BAND_5GHZ:
                        sband->channels = data->channels_5ghz;
                        sband->n_channels = ARRAY_SIZE(hwsim_channels_5ghz);
                        /* 5 GHz skips the first four entries of the rate table */
                        sband->bitrates = data->rates + 4;
                        sband->n_bitrates = ARRAY_SIZE(hwsim_rates) - 4;

                        sband->vht_cap.vht_supported = true;
                        sband->vht_cap.cap =
                                IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 |
                                IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ |
                                IEEE80211_VHT_CAP_RXLDPC |
                                IEEE80211_VHT_CAP_SHORT_GI_80 |
                                IEEE80211_VHT_CAP_SHORT_GI_160 |
                                IEEE80211_VHT_CAP_TXSTBC |
                                IEEE80211_VHT_CAP_RXSTBC_4 |
                                IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK;
                        /* the same MCS support value in all eight 2-bit fields */
                        sband->vht_cap.vht_mcs.rx_mcs_map =
                                cpu_to_le16(IEEE80211_VHT_MCS_SUPPORT_0_9 << 0 |
                                            IEEE80211_VHT_MCS_SUPPORT_0_9 << 2 |
                                            IEEE80211_VHT_MCS_SUPPORT_0_9 << 4 |
                                            IEEE80211_VHT_MCS_SUPPORT_0_9 << 6 |
                                            IEEE80211_VHT_MCS_SUPPORT_0_9 << 8 |
                                            IEEE80211_VHT_MCS_SUPPORT_0_9 << 10 |
                                            IEEE80211_VHT_MCS_SUPPORT_0_9 << 12 |
                                            IEEE80211_VHT_MCS_SUPPORT_0_9 << 14);
                        sband->vht_cap.vht_mcs.tx_mcs_map =
                                sband->vht_cap.vht_mcs.rx_mcs_map;
                        break;
                case NL80211_BAND_6GHZ:
                        sband->channels = data->channels_6ghz;
                        sband->n_channels = ARRAY_SIZE(hwsim_channels_6ghz);
                        sband->bitrates = data->rates + 4;
                        sband->n_bitrates = ARRAY_SIZE(hwsim_rates) - 4;
                        break;
                case NL80211_BAND_S1GHZ:
                        memcpy(&sband->s1g_cap, &hwsim_s1g_cap,
                               sizeof(sband->s1g_cap));
                        sband->channels = data->channels_s1g;
                        sband->n_channels = ARRAY_SIZE(hwsim_channels_s1g);
                        break;
                default:
                        /* any other band is not registered with the wiphy */
                        continue;
                }

                /* HT capabilities are set on every handled band except 6 GHz */
                if (band != NL80211_BAND_6GHZ){
                        sband->ht_cap.ht_supported = true;
                        sband->ht_cap.cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 |
                                            IEEE80211_HT_CAP_GRN_FLD |
                                            IEEE80211_HT_CAP_SGI_20 |
                                            IEEE80211_HT_CAP_SGI_40 |
                                            IEEE80211_HT_CAP_DSSSCCK40;
                        sband->ht_cap.ampdu_factor = 0x3;
                        sband->ht_cap.ampdu_density = 0x6;
                        memset(&sband->ht_cap.mcs, 0,
                               sizeof(sband->ht_cap.mcs));
                        sband->ht_cap.mcs.rx_mask[0] = 0xff;
                        sband->ht_cap.mcs.rx_mask[1] = 0xff;
                        sband->ht_cap.mcs.tx_params = IEEE80211_HT_MCS_TX_DEFINED;
                }

                mac80211_hwsim_sband_capab(sband);

                hw->wiphy->bands[band] = sband;
        }

        /* By default all radios belong to the first group */
        data->group = 1;
        mutex_init(&data->mutex);

        data->netgroup = hwsim_net_get_netgroup(net);
        data->wmediumd = hwsim_net_get_wmediumd(net);

        /* Enable frame retransmissions for lossy channels */
        hw->max_rates = 4;
        hw->max_rate_tries = 11;

        hw->wiphy->vendor_commands = mac80211_hwsim_vendor_commands;
        hw->wiphy->n_vendor_commands =
                ARRAY_SIZE(mac80211_hwsim_vendor_commands);
        hw->wiphy->vendor_events = mac80211_hwsim_vendor_events;
        hw->wiphy->n_vendor_events = ARRAY_SIZE(mac80211_hwsim_vendor_events);

        if (param->reg_strict)
                hw->wiphy->regulatory_flags |= REGULATORY_STRICT_REG;
        if (param->regd) {
                data->regd = param->regd;
                hw->wiphy->regulatory_flags |= REGULATORY_CUSTOM_REG;
                wiphy_apply_custom_regulatory(hw->wiphy, param->regd);
                /* give the regulatory workqueue a chance to run */
                schedule_timeout_interruptible(1);
        }

        wiphy_ext_feature_set(hw->wiphy,
                              NL80211_EXT_FEATURE_DFS_CONCURRENT);

        if (param->no_vif)
                ieee80211_hw_set(hw, NO_AUTO_VIF);

        wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_CQM_RSSI_LIST);

        /* one beacon timer per link slot; link_id mirrors the array index */
        for (i = 0; i < ARRAY_SIZE(data->link_data); i++) {
                hrtimer_init(&data->link_data[i].beacon_timer, CLOCK_MONOTONIC,
                             HRTIMER_MODE_ABS_SOFT);
                data->link_data[i].beacon_timer.function =
                        mac80211_hwsim_beacon;
                data->link_data[i].link_id = i;
        }

        err = ieee80211_register_hw(hw);
        if (err < 0) {
                pr_debug("mac80211_hwsim: ieee80211_register_hw failed (%d)\n",
                       err);
                goto failed_hw;
        }

        wiphy_dbg(hw->wiphy, "hwaddr %pM registered\n", hw->wiphy->perm_addr);

        /* remember the requested country code and pass it on as a hint */
        if (param->reg_alpha2) {
                data->alpha2[0] = param->reg_alpha2[0];
                data->alpha2[1] = param->reg_alpha2[1];
                regulatory_hint(hw->wiphy, param->reg_alpha2);
        }

        /* per-radio debugfs controls below the wiphy's debugfs directory */
        data->debugfs = debugfs_create_dir("hwsim", hw->wiphy->debugfsdir);
        debugfs_create_file("ps", 0666, data->debugfs, data, &hwsim_fops_ps);
        debugfs_create_file("group", 0666, data->debugfs, data,
                            &hwsim_fops_group);
        debugfs_create_file("rx_rssi", 0666, data->debugfs, data,
                            &hwsim_fops_rx_rssi);
        /* the radar simulation file exists only for non-chanctx radios */
        if (!data->use_chanctx)
                debugfs_create_file("dfs_simulate_radar", 0222,
                                    data->debugfs,
                                    data, &hwsim_simulate_radar);

        /* keep a private copy of the peer measurement capabilities */
        if (param->pmsr_capa) {
                data->pmsr_capa = *param->pmsr_capa;
                hw->wiphy->pmsr_capa = &data->pmsr_capa;
        }

        /* publish the radio; a failing insert is reported as an address clash */
        spin_lock_bh(&hwsim_radio_lock);
        err = rhashtable_insert_fast(&hwsim_radios_rht, &data->rht,
                                     hwsim_rht_params);
        if (err < 0) {
                if (info) {
                        GENL_SET_ERR_MSG(info, "perm addr already present");
                        NL_SET_BAD_ATTR(info->extack,
                                        info->attrs[HWSIM_ATTR_PERM_ADDR]);
                }
                spin_unlock_bh(&hwsim_radio_lock);
                goto failed_final_insert;
        }

        list_add_tail(&data->list, &hwsim_radios);
        hwsim_radios_generation++;
        spin_unlock_bh(&hwsim_radio_lock);

        hwsim_mcast_new_radio(idx, info, param);

        return idx;

        /* error unwinding: each label undoes one more setup stage */
failed_final_insert:
        debugfs_remove_recursive(data->debugfs);
        ieee80211_unregister_hw(data->hw);
failed_hw:
        device_release_driver(data->dev);
failed_bind:
        device_unregister(data->dev);
failed_drvdata:
        ieee80211_free_hw(hw);
failed:
        return err;
}

/*
 * Build a HWSIM_CMD_DEL_RADIO message carrying the radio id and name and
 * pass it to hwsim_mcast_config_msg(). Failures are silent: if the message
 * cannot be allocated or filled in, it is freed and nothing is sent.
 */
static void hwsim_mcast_del_radio(int id, const char *hwname,
                                  struct genl_info *info)
{
        struct sk_buff *msg;
        void *hdr;

        msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg)
                return;

        hdr = genlmsg_put(msg, 0, 0, &hwsim_genl_family, 0,
                          HWSIM_CMD_DEL_RADIO);
        if (!hdr)
                goto free_msg;

        /* the name is only attempted once the id went in successfully */
        if (nla_put_u32(msg, HWSIM_ATTR_RADIO_ID, id) < 0 ||
            nla_put(msg, HWSIM_ATTR_RADIO_NAME, strlen(hwname), hwname) < 0)
                goto free_msg;

        genlmsg_end(msg, hdr);
        hwsim_mcast_config_msg(msg, info);
        return;

free_msg:
        nlmsg_free(msg);
}

/*
 * Tear down one radio: announce the removal, drop its debugfs entries,
 * unregister it from mac80211, release and unregister the backing device
 * and finally free the hardware structure.
 */
static void mac80211_hwsim_del_radio(struct mac80211_hwsim_data *data,
                                     const char *hwname,
                                     struct genl_info *info)
{
        struct ieee80211_hw *hw = data->hw;
        struct device *dev = data->dev;

        hwsim_mcast_del_radio(data->idx, hwname, info);
        debugfs_remove_recursive(data->debugfs);

        ieee80211_unregister_hw(hw);

        device_release_driver(dev);
        device_unregister(dev);

        ieee80211_free_hw(hw);
}

/*
 * Append a HWSIM_CMD_GET_RADIO message describing @data to @skb.
 *
 * The radio's current settings are collected into a temporary parameter
 * block which append_radio_msg() turns into attributes. @cb is non-NULL
 * for dumps and is used for the dump consistency check.
 *
 * Returns 0 on success, -EMSGSIZE if the message header does not fit, or
 * the negative result of append_radio_msg(), in which case the partially
 * written message is cancelled.
 */
static int mac80211_hwsim_get_radio(struct sk_buff *skb,
                                    struct mac80211_hwsim_data *data,
                                    u32 portid, u32 seq,
                                    struct netlink_callback *cb, int flags)
{
        struct wiphy *wiphy = data->hw->wiphy;
        struct hwsim_new_radio_params param = { };
        void *hdr;
        int res;

        hdr = genlmsg_put(skb, portid, seq, &hwsim_genl_family, flags,
                          HWSIM_CMD_GET_RADIO);
        if (!hdr)
                return -EMSGSIZE;

        if (cb)
                genl_dump_check_consistent(cb, hdr);

        /* the country code is reported only if both characters are set */
        if (data->alpha2[0] && data->alpha2[1])
                param.reg_alpha2 = data->alpha2;

        param.hwname = wiphy_name(wiphy);
        param.channels = data->channels;
        param.use_chanctx = data->use_chanctx;
        param.regd = data->regd;
        param.pmsr_capa = &data->pmsr_capa;
        param.reg_strict =
                !!(wiphy->regulatory_flags & REGULATORY_STRICT_REG);
        param.p2p_device =
                !!(wiphy->interface_modes & BIT(NL80211_IFTYPE_P2P_DEVICE));

        res = append_radio_msg(skb, data->idx, &param);
        if (res < 0) {
                genlmsg_cancel(skb, hdr);
                return res;
        }

        genlmsg_end(skb, hdr);
        return 0;
}

/*
 * Remove every radio on the global list, then destroy the hwsim class.
 *
 * Each radio is unlinked while holding hwsim_radio_lock, but torn down
 * with the lock released.
 */
static void mac80211_hwsim_free(void)
{
        struct mac80211_hwsim_data *radio;

        for (;;) {
                spin_lock_bh(&hwsim_radio_lock);
                radio = list_first_entry_or_null(&hwsim_radios,
                                                 struct mac80211_hwsim_data,
                                                 list);
                if (radio)
                        list_del(&radio->list);
                spin_unlock_bh(&hwsim_radio_lock);

                if (!radio)
                        break;

                mac80211_hwsim_del_radio(radio,
                                         wiphy_name(radio->hw->wiphy),
                                         NULL);
        }

        class_destroy(hwsim_class);
}

/*
 * Net device operations installed by hwsim_mon_setup(): a driver-specific
 * transmit handler plus the generic Ethernet helpers for setting and
 * validating the MAC address.
 */
static const struct net_device_ops hwsim_netdev_ops = {
        .ndo_start_xmit         = hwsim_mon_xmit,
        .ndo_set_mac_address         = eth_mac_addr,
        .ndo_validate_addr        = eth_validate_addr,
};

/*
 * Setup callback for the hwsim monitor net device: installs the netdev
 * ops, applies the Ethernet defaults and then overrides them so the device
 * is a queue-less radiotap interface with the address 12:00:00:00:00:00.
 */
static void hwsim_mon_setup(struct net_device *dev)
{
        /* first octet 0x12, remaining octets zero */
        u8 mon_addr[ETH_ALEN] = { 0x12 };

        dev->netdev_ops = &hwsim_netdev_ops;
        dev->needs_free_netdev = true;

        /* these overrides have to come after the generic Ethernet setup */
        ether_setup(dev);
        dev->priv_flags |= IFF_NO_QUEUE;
        dev->type = ARPHRD_IEEE80211_RADIOTAP;

        eth_hw_addr_set(dev, mon_addr);
}

/*
 * Record @portid as the wmediumd port for @net and for every existing
 * radio whose netgroup matches that of @net.
 */
static void hwsim_register_wmediumd(struct net *net, u32 portid)
{
        struct mac80211_hwsim_data *radio;

        hwsim_net_set_wmediumd(net, portid);

        spin_lock_bh(&hwsim_radio_lock);
        list_for_each_entry(radio, &hwsim_radios, list) {
                /* radios of other netgroups keep their current setting */
                if (radio->netgroup != hwsim_net_get_netgroup(net))
                        continue;

                radio->wmediumd = portid;
        }
        spin_unlock_bh(&hwsim_radio_lock);
}

static int hwsim_tx_info_frame_received_nl(struct sk_buff *skb_2,
                                           struct genl_info *info)
{

        struct ieee80211_hdr *hdr;
        struct mac80211_hwsim_data *data2;
        struct ieee80211_tx_info *txi;
        struct hwsim_tx_rate *tx_attempts;
        u64 ret_skb_cookie;
        struct sk_buff *skb, *tmp;
        const u8 *src;
        unsigned int hwsim_flags;
        int i;
        unsigned long flags;
        bool found = false;

        if (!info->attrs[HWSIM_ATTR_ADDR_TRANSMITTER] ||
            !info->attrs[HWSIM_ATTR_FLAGS] ||
            !info->attrs[HWSIM_ATTR_COOKIE] ||
            !info->attrs[HWSIM_ATTR_SIGNAL] ||
            !info->attrs[HWSIM_ATTR_TX_INFO])
                goto out;

        src = (void *)nla_data(info->attrs[HWSIM_ATTR_ADDR_TRANSMITTER]);
        hwsim_flags = nla_get_u32(info->attrs[HWSIM_ATTR_FLAGS]);
        ret_skb_cookie = nla_get_u64(info->attrs[HWSIM_ATTR_COOKIE]);

        data2 = get_hwsim_data_ref_from_addr(src);
        if (!data2)
                goto out;

        if (!hwsim_virtio_enabled) {
                if (hwsim_net_get_netgroup(genl_info_net(info)) !=
                    data2->netgroup)
                        goto out;

                if (info->snd_portid != data2->wmediumd)
                        goto out;
        }

        /* look for the skb matching the cookie passed back from user */
        spin_lock_irqsave(&data2->pending.lock, flags);
        skb_queue_walk_safe(&data2->pending, skb, tmp) {
                uintptr_t skb_cookie;

                txi = IEEE80211_SKB_CB(skb);
                skb_cookie = (uintptr_t)txi->rate_driver_data[0];

                if (skb_cookie == ret_skb_cookie) {
                        __skb_unlink(skb, &data2->pending);
                        found = true;
                        break;
                }
        }
        spin_unlock_irqrestore(&data2->pending.lock, flags);

        /* not found */
        if (!found)
                goto out;

        /* Tx info received because the frame was broadcasted on user space,
         so we get all the necessary info: tx attempts and skb control buff */

        tx_attempts = (struct hwsim_tx_rate *)nla_data(
                       info->attrs[HWSIM_ATTR_TX_INFO]);

        /* now send back TX status */
        txi = IEEE80211_SKB_CB(skb);

        ieee80211_tx_info_clear_status(txi);

        for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
                txi->status.rates[i].idx = tx_attempts[i].idx;
                txi->status.rates[i].count = tx_attempts[i].count;
        }

        txi->status.ack_signal = nla_get_u32(info->attrs[HWSIM_ATTR_SIGNAL]);

        if (!(hwsim_flags & HWSIM_TX_CTL_NO_ACK) &&
           (hwsim_flags & HWSIM_TX_STAT_ACK)) {
                if (skb->len >= 16) {
                        hdr = (struct ieee80211_hdr *) skb->data;
                        mac80211_hwsim_monitor_ack(data2->channel,
                                                   hdr->addr2);
                }
                txi->flags |= IEEE80211_TX_STAT_ACK;
        }

        if (hwsim_flags & HWSIM_TX_CTL_NO_ACK)
                txi->flags |= IEEE80211_TX_STAT_NOACK_TRANSMITTED;

        ieee80211_tx_status_irqsafe(data2->hw, skb);
        return 0;
out:
        return -EINVAL;

}

/*
 * Netlink handler for a frame that user space delivers to a radio.
 *
 * The frame is copied into a freshly allocated skb, the receiving radio is
 * looked up by address, and after validating the sender, the radio state,
 * the channel and the rate index the frame is passed to
 * mac80211_hwsim_rx().
 *
 * Returns 0 when the frame was passed on and -EINVAL in every other case;
 * the skb, if one was allocated, is freed on all failure paths.
 */
static int hwsim_cloned_frame_received_nl(struct sk_buff *skb_2,
                                          struct genl_info *info)
{
        struct mac80211_hwsim_data *data2;
        struct ieee80211_rx_status rx_status;
        struct ieee80211_hdr *hdr;
        const u8 *dst;
        int frame_data_len;
        void *frame_data;
        struct sk_buff *skb = NULL;
        struct ieee80211_channel *channel = NULL;

        if (!info->attrs[HWSIM_ATTR_ADDR_RECEIVER] ||
            !info->attrs[HWSIM_ATTR_FRAME] ||
            !info->attrs[HWSIM_ATTR_RX_RATE] ||
            !info->attrs[HWSIM_ATTR_SIGNAL])
                goto out;

        dst = (void *)nla_data(info->attrs[HWSIM_ATTR_ADDR_RECEIVER]);
        frame_data_len = nla_len(info->attrs[HWSIM_ATTR_FRAME]);
        frame_data = (void *)nla_data(info->attrs[HWSIM_ATTR_FRAME]);

        /* reject frames shorter than a 3-address header or longer than the maximum */
        if (frame_data_len < sizeof(struct ieee80211_hdr_3addr) ||
            frame_data_len > IEEE80211_MAX_DATA_LEN)
                goto err;

        /* Allocate new skb here */
        skb = alloc_skb(frame_data_len, GFP_KERNEL);
        if (skb == NULL)
                goto err;

        /* Copy the data */
        skb_put_data(skb, frame_data, frame_data_len);

        data2 = get_hwsim_data_ref_from_addr(dst);
        if (!data2)
                goto out;

        /*
         * Reference channel for the checks below: with channel contexts
         * only a temporary channel counts (channel stays NULL without
         * one), otherwise the radio's current channel.
         */
        if (data2->use_chanctx) {
                if (data2->tmp_chan)
                        channel = data2->tmp_chan;
        } else {
                channel = data2->channel;
        }

        /*
         * Unless virtio is in use, only the wmediumd instance registered
         * for the radio's netgroup may inject frames.
         */
        if (!hwsim_virtio_enabled) {
                if (hwsim_net_get_netgroup(genl_info_net(info)) !=
                    data2->netgroup)
                        goto out;

                if (info->snd_portid != data2->wmediumd)
                        goto out;
        }

        /* check if radio is configured properly */

        if ((data2->idle && !data2->tmp_chan) || !data2->started)
                goto out;

        /* A frame is received from user space */
        memset(&rx_status, 0, sizeof(rx_status));
        if (info->attrs[HWSIM_ATTR_FREQ]) {
                struct tx_iter_data iter_data = {};

                /* throw away off-channel packets, but allow both the temporary
                 * ("hw" scan/remain-on-channel), regular channels and links,
                 * since the internal datapath also allows this
                 */
                rx_status.freq = nla_get_u32(info->attrs[HWSIM_ATTR_FREQ]);

                /* the frequency must map to a channel known to this wiphy */
                iter_data.channel = ieee80211_get_channel(data2->hw->wiphy,
                                                          rx_status.freq);
                if (!iter_data.channel)
                        goto out;
                rx_status.band = iter_data.channel->band;

                /*
                 * If the frequency does not match the reference channel,
                 * let the interface iterator decide whether the frame is
                 * received.
                 */
                mutex_lock(&data2->mutex);
                if (!hwsim_chans_compat(iter_data.channel, channel)) {
                        ieee80211_iterate_active_interfaces_atomic(
                                data2->hw, IEEE80211_IFACE_ITER_NORMAL,
                                mac80211_hwsim_tx_iter, &iter_data);
                        if (!iter_data.receive) {
                                mutex_unlock(&data2->mutex);
                                goto out;
                        }
                }
                mutex_unlock(&data2->mutex);
        } else if (!channel) {
                /* no frequency given and no reference channel to fall back to */
                goto out;
        } else {
                rx_status.freq = channel->center_freq;
                rx_status.band = channel->band;
        }

        /* drop frames whose rate index is out of range for the band */
        rx_status.rate_idx = nla_get_u32(info->attrs[HWSIM_ATTR_RX_RATE]);
        if (rx_status.rate_idx >= data2->hw->wiphy->bands[rx_status.band]->n_bitrates)
                goto out;
        rx_status.signal = nla_get_u32(info->attrs[HWSIM_ATTR_SIGNAL]);

        hdr = (void *)skb->data;

        /* beacons and probe responses get a boottime receive timestamp */
        if (ieee80211_is_beacon(hdr->frame_control) ||
            ieee80211_is_probe_resp(hdr->frame_control))
                rx_status.boottime_ns = ktime_get_boottime_ns();

        mac80211_hwsim_rx(data2, &rx_status, skb);

        return 0;
err:
        /* like "out", but additionally logs a debug message */
        pr_debug("mac80211_hwsim: error occurred in %s\n", __func__);
out:
        /* skb is still NULL here if the failure happened before allocation */
        dev_kfree_skb(skb);
        return -EINVAL;
}

static int hwsim_register_received_nl(struct sk_buff *skb_2,
                                      struct genl_info *info)
{
        struct net *net = genl_info_net(info);
        struct mac80211_hwsim_data *data;
        int chans = 1;

        spin_lock_bh(&hwsim_radio_lock);
        list_for_each_entry(data, &hwsim_radios, list)
                chans = max(chans, data->channels);
        spin_unlock_bh(&hwsim_radio_lock);

        /* In the future we should revise the userspace API and allow it
         * to set a flag that it does support multi-channel, then we can
         * let this pass conditionally on the flag.
         * For current userspace, prohibit it since it won't work right.
         */
        if (chans > 1)
                return -EOPNOTSUPP;

        if (hwsim_net_get_wmediumd(net))
                return -EBUSY;

        hwsim_register_wmediumd(net, info->snd_portid);

        pr_debug("mac80211_hwsim: received a REGISTER, "
               "switching to wmediumd mode with pid %d\n", info->snd_portid);

        return 0;
}

/*
 * Check that every entry of @ciphers (of length @n_ciphers) also appears in
 * the 'hwsim_ciphers' array. Returns true if so, which includes the case
 * of an empty list, and false as soon as an unknown cipher is found.
 */
static bool hwsim_known_ciphers(const u32 *ciphers, int n_ciphers)
{
        int idx;

        for (idx = 0; idx < n_ciphers; idx++) {
                int k = 0;

                /* advance until a match or the end of the known list */
                while (k < ARRAY_SIZE(hwsim_ciphers) &&
                       hwsim_ciphers[k] != ciphers[idx])
                        k++;

                if (k == ARRAY_SIZE(hwsim_ciphers))
                        return false;
        }

        return true;
}

/*
 * Decode the nested FTM capability attribute @ftm_capa into out->ftm.
 *
 * The nested attributes are parsed against hwsim_ftm_capa_policy.  Value
 * attributes are only written to @out when present; flag attributes are
 * always written, set when the attribute is present and cleared otherwise.
 *
 * Returns 0 on success, or -EINVAL (with an extack message pointing at
 * @ftm_capa) when the nested parse fails.
 */
static int parse_ftm_capa(const struct nlattr *ftm_capa, struct cfg80211_pmsr_capabilities *out,
                          struct genl_info *info)
{
        struct nlattr *tb[NL80211_PMSR_FTM_CAPA_ATTR_MAX + 1];
        const struct nlattr *attr;
        int err;

        err = nla_parse_nested(tb, NL80211_PMSR_FTM_CAPA_ATTR_MAX, ftm_capa,
                               hwsim_ftm_capa_policy, NULL);
        if (err) {
                NL_SET_ERR_MSG_ATTR(info->extack, ftm_capa, "malformed FTM capability");
                return -EINVAL;
        }

        out->ftm.supported = 1;

        /* Flag attributes: presence alone decides the value. */
        out->ftm.asap = tb[NL80211_PMSR_FTM_CAPA_ATTR_ASAP] != NULL;
        out->ftm.non_asap = tb[NL80211_PMSR_FTM_CAPA_ATTR_NON_ASAP] != NULL;
        out->ftm.request_lci = tb[NL80211_PMSR_FTM_CAPA_ATTR_REQ_LCI] != NULL;
        out->ftm.request_civicloc = tb[NL80211_PMSR_FTM_CAPA_ATTR_REQ_CIVICLOC] != NULL;
        out->ftm.trigger_based = tb[NL80211_PMSR_FTM_CAPA_ATTR_TRIGGER_BASED] != NULL;
        out->ftm.non_trigger_based = tb[NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED] != NULL;

        /* Value attributes: leave the field untouched when absent. */
        attr = tb[NL80211_PMSR_FTM_CAPA_ATTR_PREAMBLES];
        if (attr)
                out->ftm.preambles = nla_get_u32(attr);

        attr = tb[NL80211_PMSR_FTM_CAPA_ATTR_BANDWIDTHS];
        if (attr)
                out->ftm.bandwidths = nla_get_u32(attr);

        attr = tb[NL80211_PMSR_FTM_CAPA_ATTR_MAX_BURSTS_EXPONENT];
        if (attr)
                out->ftm.max_bursts_exponent = nla_get_u8(attr);

        attr = tb[NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST];
        if (attr)
                out->ftm.max_ftms_per_burst = nla_get_u8(attr);

        return 0;
}

/*
 * Decode the nested peer-measurement capability attribute @pmsr_capa
 * into @out.
 *
 * The nested attributes are parsed against hwsim_pmsr_capa_policy.
 * max_peers is only written when its attribute is present; the two flag
 * fields are always written.  NL80211_PMSR_ATTR_TYPE_CAPA is mandatory and
 * each entry nested in it must be a measurement type handled here
 * (currently only NL80211_PMSR_TYPE_FTM).
 *
 * Returns 0 on success.  Returns -EINVAL with an extack message when the
 * attribute is malformed, the type list is missing, or an unsupported
 * measurement type is listed; an error from parse_ftm_capa() is returned
 * unchanged.
 */
static int parse_pmsr_capa(const struct nlattr *pmsr_capa, struct cfg80211_pmsr_capabilities *out,
                           struct genl_info *info)
{
        struct nlattr *tb[NL80211_PMSR_ATTR_MAX + 1];
        struct nlattr *nla;
        int size;
        int ret;

        ret = nla_parse_nested(tb, NL80211_PMSR_ATTR_MAX, pmsr_capa, hwsim_pmsr_capa_policy, NULL);
        if (ret) {
                NL_SET_ERR_MSG_ATTR(info->extack, pmsr_capa, "malformed PMSR capability");
                return -EINVAL;
        }

        if (tb[NL80211_PMSR_ATTR_MAX_PEERS])
                out->max_peers = nla_get_u32(tb[NL80211_PMSR_ATTR_MAX_PEERS]);
        out->report_ap_tsf = !!tb[NL80211_PMSR_ATTR_REPORT_AP_TSF];
        out->randomize_mac_addr = !!tb[NL80211_PMSR_ATTR_RANDOMIZE_MAC_ADDR];

        if (!tb[NL80211_PMSR_ATTR_TYPE_CAPA]) {
                /* The type list itself is absent, so report the enclosing
                 * attribute as the offending one instead of a NULL pointer.
                 */
                NL_SET_ERR_MSG_ATTR(info->extack, pmsr_capa,
                                    "malformed PMSR type");
                return -EINVAL;
        }

        nla_for_each_nested(nla, tb[NL80211_PMSR_ATTR_TYPE_CAPA], size) {
                switch (nla_type(nla)) {
                case NL80211_PMSR_TYPE_FTM:
                        /* Do not accept a capability set whose FTM part
                         * failed to parse.
                         */
                        ret = parse_ftm_capa(nla, out, info);
                        if (ret)
                                return ret;
                        break;
                default:
                        NL_SET_ERR_MSG_ATTR(info->extack, nla, "unsupported measurement type");
                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * HWSIM_CMD_NEW_RADIO handler: translate the request attributes into a
 * struct hwsim_new_radio_params and hand it to mac80211_hwsim_new_radio().
 *
 * Validation failures that occur before any memory is allocated return
 * directly; once the radio name or the PMSR capabilities have been
 * allocated, every exit goes through out_free so both are released.
 *
 * Returns the result of mac80211_hwsim_new_radio() on success paths,
 * -EINVAL for invalid attributes, -ENOMEM on allocation failure, or the
 * error reported by parse_pmsr_capa().
 */
static int hwsim_new_radio_nl(struct sk_buff *msg, struct genl_info *info)
{
        struct hwsim_new_radio_params param = { 0 };
        const char *hwname = NULL;
        int ret;

        /* Flag-style attributes: presence means "enabled". */
        param.reg_strict = info->attrs[HWSIM_ATTR_REG_STRICT_REG];
        param.p2p_device = info->attrs[HWSIM_ATTR_SUPPORT_P2P_DEVICE];
        /* Start from the file-scope default, overridden by the attribute. */
        param.channels = channels;
        param.destroy_on_close =
                info->attrs[HWSIM_ATTR_DESTROY_RADIO_ON_CLOSE];

        if (info->attrs[HWSIM_ATTR_CHANNELS])
                param.channels = nla_get_u32(info->attrs[HWSIM_ATTR_CHANNELS]);

        if (param.channels < 1) {
                GENL_SET_ERR_MSG(info, "must have at least one channel");
                return -EINVAL;
        }

        if (info->attrs[HWSIM_ATTR_NO_VIF])
                param.no_vif = true;

        /* Channel contexts are used when requested or with >1 channel. */
        if (info->attrs[HWSIM_ATTR_USE_CHANCTX])
                param.use_chanctx = true;
        else
                param.use_chanctx = (param.channels > 1);

        if (info->attrs[HWSIM_ATTR_REG_HINT_ALPHA2])
                param.reg_alpha2 =
                        nla_data(info->attrs[HWSIM_ATTR_REG_HINT_ALPHA2]);

        if (info->attrs[HWSIM_ATTR_REG_CUSTOM_REG]) {
                u32 idx = nla_get_u32(info->attrs[HWSIM_ATTR_REG_CUSTOM_REG]);

                if (idx >= ARRAY_SIZE(hwsim_world_regdom_custom))
                        return -EINVAL;

                /* Clamp the user-supplied index under speculation too. */
                idx = array_index_nospec(idx,
                                         ARRAY_SIZE(hwsim_world_regdom_custom));
                param.regd = hwsim_world_regdom_custom[idx];
        }

        if (info->attrs[HWSIM_ATTR_PERM_ADDR]) {
                if (!is_valid_ether_addr(
                                nla_data(info->attrs[HWSIM_ATTR_PERM_ADDR]))) {
                        GENL_SET_ERR_MSG(info, "MAC is no valid source addr");
                        NL_SET_BAD_ATTR(info->extack,
                                        info->attrs[HWSIM_ATTR_PERM_ADDR]);
                        return -EINVAL;
                }

                param.perm_addr = nla_data(info->attrs[HWSIM_ATTR_PERM_ADDR]);
        }

        if (info->attrs[HWSIM_ATTR_IFTYPE_SUPPORT]) {
                param.iftypes =
                        nla_get_u32(info->attrs[HWSIM_ATTR_IFTYPE_SUPPORT]);

                if (param.iftypes & ~HWSIM_IFTYPE_SUPPORT_MASK) {
                        NL_SET_ERR_MSG_ATTR(info->extack,
                                            info->attrs[HWSIM_ATTR_IFTYPE_SUPPORT],
                                            "cannot support more iftypes than kernel");
                        return -EINVAL;
                }
        } else {
                param.iftypes = HWSIM_IFTYPE_SUPPORT_MASK;
        }

        /* ensure both flag and iftype support is honored */
        if (param.p2p_device ||
            param.iftypes & BIT(NL80211_IFTYPE_P2P_DEVICE)) {
                param.iftypes |= BIT(NL80211_IFTYPE_P2P_DEVICE);
                param.p2p_device = true;
        }

        if (info->attrs[HWSIM_ATTR_CIPHER_SUPPORT]) {
                u32 len = nla_len(info->attrs[HWSIM_ATTR_CIPHER_SUPPORT]);

                param.ciphers =
                        nla_data(info->attrs[HWSIM_ATTR_CIPHER_SUPPORT]);

                /* The payload must be a whole number of u32 cipher ids. */
                if (len % sizeof(u32)) {
                        NL_SET_ERR_MSG_ATTR(info->extack,
                                            info->attrs[HWSIM_ATTR_CIPHER_SUPPORT],
                                            "bad cipher list length");
                        return -EINVAL;
                }

                param.n_ciphers = len / sizeof(u32);

                if (param.n_ciphers > ARRAY_SIZE(hwsim_ciphers)) {
                        NL_SET_ERR_MSG_ATTR(info->extack,
                                            info->attrs[HWSIM_ATTR_CIPHER_SUPPORT],
                                            "too many ciphers specified");
                        return -EINVAL;
                }

                if (!hwsim_known_ciphers(param.ciphers, param.n_ciphers)) {
                        NL_SET_ERR_MSG_ATTR(info->extack,
                                            info->attrs[HWSIM_ATTR_CIPHER_SUPPORT],
                                            "unsupported ciphers specified");
                        return -EINVAL;
                }
        }

        param.mlo = info->attrs[HWSIM_ATTR_MLO_SUPPORT];

        /* MLO forces channel-context operation. */
        if (param.mlo)
                param.use_chanctx = true;

        /* From here on allocations exist: leave only via out_free. */
        if (info->attrs[HWSIM_ATTR_RADIO_NAME]) {
                hwname = kstrndup((char *)nla_data(info->attrs[HWSIM_ATTR_RADIO_NAME]),
                                  nla_len(info->attrs[HWSIM_ATTR_RADIO_NAME]),
                                  GFP_KERNEL);
                if (!hwname)
                        return -ENOMEM;
                param.hwname = hwname;
        }

        if (info->attrs[HWSIM_ATTR_PMSR_SUPPORT]) {
                struct cfg80211_pmsr_capabilities *pmsr_capa;

                /* Zero the structure: parse_pmsr_capa() only writes the
                 * fields whose attributes are present, so anything left
                 * out of the request must not keep stale heap contents.
                 */
                pmsr_capa = kzalloc(sizeof(*pmsr_capa), GFP_KERNEL);
                if (!pmsr_capa) {
                        ret = -ENOMEM;
                        goto out_free;
                }
                param.pmsr_capa = pmsr_capa;

                ret = parse_pmsr_capa(info->attrs[HWSIM_ATTR_PMSR_SUPPORT], pmsr_capa, info);
                if (ret)
                        goto out_free;
        }

        ret = mac80211_hwsim_new_radio(info, &param);

out_free:
        kfree(hwname);
        kfree(param.pmsr_capa);
        return ret;
}

static int hwsim_del_radio_nl(struct sk_buff *msg, struct genl_info *info)
{
        struct mac80211_hwsim_data *data;
        s64 idx = -1;
        const char *hwname = NULL;

        if (info->attrs[HWSIM_ATTR_RADIO_ID]) {
                idx = nla_get_u32(info->attrs[HWSIM_ATTR_RADIO_ID]);
        } else if (info->attrs[HWSIM_ATTR_RADIO_NAME]) {
                hwname = kstrndup((char *)nla_data(info->attrs[HWSIM_ATTR_RADIO_NAME]),
                                  nla_len(info->attrs[HWSIM_ATTR_RADIO_NAME]),
                                  GFP_KERNEL);
                if (!hwname)
                        return -ENOMEM;
        } else
                return -EINVAL;

        spin_lock_bh(&hwsim_radio_lock);
        list_for_each_entry(data, &hwsim_radios, list) {
                if (idx >= 0) {
                        if (data->idx != idx)
                                continue;
                } else {
                        if (!hwname ||
                            strcmp(hwname, wiphy_name(data->hw->wiphy)))
                                continue;
                }

                if (!net_eq(wiphy_net(data->hw->wiphy), genl_info_net(info)))
                        continue;

                list_del(&data->list);
                rhashtable_remove_fast(&hwsim_radios_rht, &data->rht,
                                       hwsim_rht_params);
                hwsim_radios_generation++;
                spin_unlock_bh(&hwsim_radio_lock);
                mac80211_hwsim_del_radio(data, wiphy_name(data->hw->wiphy),
                                         info);
                kfree(hwname);
                return 0;
        }
        spin_unlock_bh(&hwsim_radio_lock);

        kfree(hwname);
        return -ENODEV;
}

/*
 * HWSIM_CMD_GET_RADIO (doit) handler: reply with the description of the
 * radio whose index equals HWSIM_ATTR_RADIO_ID and whose wiphy is in the
 * requester's network namespace.
 *
 * Returns -EINVAL when the id attribute is missing, -ENODEV when no radio
 * matches, -ENOMEM when the reply cannot be allocated, a negative error
 * from mac80211_hwsim_get_radio(), or otherwise the result of
 * genlmsg_reply().
 */
static int hwsim_get_radio_nl(struct sk_buff *msg, struct genl_info *info)
{
        struct mac80211_hwsim_data *radio;
        struct sk_buff *reply;
        int ret = -ENODEV;
        int idx;

        if (!info->attrs[HWSIM_ATTR_RADIO_ID])
                return -EINVAL;
        idx = nla_get_u32(info->attrs[HWSIM_ATTR_RADIO_ID]);

        spin_lock_bh(&hwsim_radio_lock);
        list_for_each_entry(radio, &hwsim_radios, list) {
                if (radio->idx != idx ||
                    !net_eq(wiphy_net(radio->hw->wiphy), genl_info_net(info)))
                        continue;

                /* The radio lock is held here, hence GFP_ATOMIC. */
                reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
                if (!reply) {
                        ret = -ENOMEM;
                        break;
                }

                ret = mac80211_hwsim_get_radio(reply, radio, info->snd_portid,
                                               info->snd_seq, NULL, 0);
                if (ret < 0)
                        nlmsg_free(reply);
                else
                        ret = genlmsg_reply(reply, info);

                /* Only the first matching radio is reported. */
                break;
        }
        spin_unlock_bh(&hwsim_radio_lock);

        return ret;
}

/*
 * HWSIM_CMD_GET_RADIO (dumpit) handler: append one message per radio that
 * belongs to the requesting socket's network namespace to @skb.
 *
 * cb->args[0] carries "index of the last radio dumped, plus one" between
 * invocations, so a later call resumes after the radios already sent.
 *
 * Returns a non-zero res if one was recorded, otherwise skb->len.
 */
static int hwsim_dump_radio_nl(struct sk_buff *skb,
                               struct netlink_callback *cb)
{
        /* Index of the last radio already dumped (-1 on the first call). */
        int last_idx = cb->args[0] - 1;
        struct mac80211_hwsim_data *data = NULL;
        int res = 0;
        void *hdr;

        spin_lock_bh(&hwsim_radio_lock);
        /* Publish the current list generation for consistency checking. */
        cb->seq = hwsim_radios_generation;

        /* Nothing beyond what was already dumped: finish immediately. */
        if (last_idx >= hwsim_radio_idx-1)
                goto done;

        list_for_each_entry(data, &hwsim_radios, list) {
                /* Skip radios that were sent by an earlier invocation. */
                if (data->idx <= last_idx)
                        continue;

                /* Skip radios that live in another network namespace. */
                if (!net_eq(wiphy_net(data->hw->wiphy), sock_net(skb->sk)))
                        continue;

                res = mac80211_hwsim_get_radio(skb, data,
                                               NETLINK_CB(cb->skb).portid,
                                               cb->nlh->nlmsg_seq, cb,
                                               NLM_F_MULTI);
                /* Stop at the first failure; last_idx is not advanced. */
                if (res < 0)
                        break;

                last_idx = data->idx;
        }

        /* Remember where to resume on the next invocation. */
        cb->args[0] = last_idx + 1;

        /* list changed, but no new element sent, set interrupted flag */
        if (skb->len == 0 && cb->prev_seq && cb->seq != cb->prev_seq) {
                hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
                                  cb->nlh->nlmsg_seq, &hwsim_genl_family,
                                  NLM_F_MULTI, HWSIM_CMD_GET_RADIO);
                if (hdr) {
                        genl_dump_check_consistent(cb, hdr);
                        genlmsg_end(skb, hdr);
                } else {
                        res = -EMSGSIZE;
                }
        }

done:
        spin_unlock_bh(&hwsim_radio_lock);
        /* A non-zero res wins; otherwise report the bytes placed in skb. */
        return res ?: skb->len;
}

/*
 * Generic Netlink operations array
 *
 * One entry per HWSIM_CMD_* command handled by this family.  All entries
 * opt out of strict and dump validation.  REGISTER, NEW_RADIO and
 * DEL_RADIO additionally carry GENL_UNS_ADMIN_PERM; FRAME, TX_INFO_FRAME,
 * GET_RADIO and REPORT_PMSR set no permission flag.
 */
static const struct genl_small_ops hwsim_ops[] = {
        {
                /* Register the sender as wmediumd for its namespace. */
                .cmd = HWSIM_CMD_REGISTER,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = hwsim_register_received_nl,
                .flags = GENL_UNS_ADMIN_PERM,
        },
        {
                .cmd = HWSIM_CMD_FRAME,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = hwsim_cloned_frame_received_nl,
        },
        {
                .cmd = HWSIM_CMD_TX_INFO_FRAME,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = hwsim_tx_info_frame_received_nl,
        },
        {
                /* Create a radio from the request attributes. */
                .cmd = HWSIM_CMD_NEW_RADIO,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = hwsim_new_radio_nl,
                .flags = GENL_UNS_ADMIN_PERM,
        },
        {
                /* Delete a radio selected by id or by name. */
                .cmd = HWSIM_CMD_DEL_RADIO,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = hwsim_del_radio_nl,
                .flags = GENL_UNS_ADMIN_PERM,
        },
        {
                /* Single-radio query (doit) and full listing (dumpit). */
                .cmd = HWSIM_CMD_GET_RADIO,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = hwsim_get_radio_nl,
                .dumpit = hwsim_dump_radio_nl,
        },
        {
                .cmd = HWSIM_CMD_REPORT_PMSR,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = hwsim_pmsr_report_nl,
        },
};

/*
 * Generic Netlink family "MAC80211_HWSIM": ties together the attribute
 * policy (hwsim_genl_policy), the command table (hwsim_ops) and the
 * multicast groups (hwsim_mcgrps).  Registered by hwsim_init_netlink().
 */
static struct genl_family hwsim_genl_family __ro_after_init = {
        .name = "MAC80211_HWSIM",
        .version = 1,
        .maxattr = HWSIM_ATTR_MAX,
        .policy = hwsim_genl_policy,
        /* The family may be used from network namespaces. */
        .netnsok = true,
        .module = THIS_MODULE,
        .small_ops = hwsim_ops,
        .n_small_ops = ARRAY_SIZE(hwsim_ops),
        .resv_start_op = HWSIM_CMD_REPORT_PMSR + 1, // match with __HWSIM_CMD_MAX
        .mcgrps = hwsim_mcgrps,
        .n_mcgrps = ARRAY_SIZE(hwsim_mcgrps),
};

/*
 * Destroy every radio that is marked destroy_on_close and whose recorded
 * portid equals @portid.
 *
 * Matching radios are first unlinked onto a private list while holding
 * hwsim_radio_lock, then torn down one by one after the lock is dropped.
 */
static void remove_user_radios(u32 portid)
{
        struct mac80211_hwsim_data *radio, *next;
        LIST_HEAD(doomed);

        spin_lock_bh(&hwsim_radio_lock);
        list_for_each_entry_safe(radio, next, &hwsim_radios, list) {
                if (!radio->destroy_on_close || radio->portid != portid)
                        continue;

                list_move(&radio->list, &doomed);
                rhashtable_remove_fast(&hwsim_radios_rht, &radio->rht,
                                       hwsim_rht_params);
                hwsim_radios_generation++;
        }
        spin_unlock_bh(&hwsim_radio_lock);

        /* Drain the private list from the front. */
        while (!list_empty(&doomed)) {
                radio = list_first_entry(&doomed,
                                         struct mac80211_hwsim_data, list);
                list_del(&radio->list);
                mac80211_hwsim_del_radio(radio, wiphy_name(radio->hw->wiphy),
                                         NULL);
        }
}

/*
 * Netlink notifier callback.  Only NETLINK_URELEASE events are acted on:
 * radios tied to the released port are removed, and if that port was the
 * registered wmediumd of its namespace the registration is reset to 0.
 *
 * Always returns NOTIFY_DONE.
 */
static int mac80211_hwsim_netlink_notify(struct notifier_block *nb,
                                         unsigned long state,
                                         void *_notify)
{
        struct netlink_notify *notify = _notify;

        if (state == NETLINK_URELEASE) {
                remove_user_radios(notify->portid);

                if (notify->portid == hwsim_net_get_wmediumd(notify->net)) {
                        printk(KERN_INFO "mac80211_hwsim: wmediumd released netlink"
                               " socket, switching to perfect channel medium\n");
                        hwsim_register_wmediumd(notify->net, 0);
                }
        }

        return NOTIFY_DONE;
}

/* Notifier block that routes netlink events to mac80211_hwsim_netlink_notify(). */
static struct notifier_block hwsim_netlink_notifier = {
        .notifier_call = mac80211_hwsim_netlink_notify,
};

/*
 * Register the hwsim Generic Netlink family and the netlink notifier.
 *
 * If the notifier cannot be registered the family is unregistered again,
 * so on failure nothing stays registered.
 *
 * Returns 0 on success, or the error code reported by the registration
 * call that failed.
 */
static int __init hwsim_init_netlink(void)
{
        int rc;

        printk(KERN_INFO "mac80211_hwsim: initializing netlink\n");

        rc = genl_register_family(&hwsim_genl_family);
        if (rc)
                goto failure;

        rc = netlink_register_notifier(&hwsim_netlink_notifier);
        if (rc) {
                genl_unregister_family(&hwsim_genl_family);
                goto failure;
        }

        return 0;

failure:
        pr_debug("mac80211_hwsim: error occurred in %s\n", __func__);
        /* Hand the real cause to the caller instead of a blanket -EINVAL. */
        return rc;
}

/*
 * Per-namespace init hook: delegates to hwsim_net_set_netgroup() and
 * returns its result.
 */
static __net_init int hwsim_init_net(struct net *net)
{
        return hwsim_net_set_netgroup(net);
}

/*
 * Per-namespace exit hook: destroy the radios that belong to @net, except
 * those whose netgroup equals init_net's netgroup, then release the
 * namespace's netgroup id back to hwsim_netgroup_ida.
 *
 * Radios are unlinked onto a private list under hwsim_radio_lock and torn
 * down after the lock has been released.
 */
static void __net_exit hwsim_exit_net(struct net *net)
{
        struct mac80211_hwsim_data *radio, *next;
        LIST_HEAD(doomed);

        spin_lock_bh(&hwsim_radio_lock);
        list_for_each_entry_safe(radio, next, &hwsim_radios, list) {
                /* Radios created in init_net are returned to init_net. */
                if (!net_eq(wiphy_net(radio->hw->wiphy), net) ||
                    radio->netgroup == hwsim_net_get_netgroup(&init_net))
                        continue;

                list_move(&radio->list, &doomed);
                rhashtable_remove_fast(&hwsim_radios_rht, &radio->rht,
                                       hwsim_rht_params);
                hwsim_radios_generation++;
        }
        spin_unlock_bh(&hwsim_radio_lock);

        /* Drain the private list from the front. */
        while (!list_empty(&doomed)) {
                radio = list_first_entry(&doomed,
                                         struct mac80211_hwsim_data, list);
                list_del(&radio->list);
                mac80211_hwsim_del_radio(radio,
                                         wiphy_name(radio->hw->wiphy),
                                         NULL);
        }

        ida_free(&hwsim_netgroup_ida, hwsim_net_get_netgroup(net));
}

/*
 * Per-network-namespace operations: init/exit hooks plus the id and size
 * of the per-namespace struct hwsim_net storage.
 */
static struct pernet_operations hwsim_net_ops = {
        .init = hwsim_init_net,
        .exit = hwsim_exit_net,
        .id   = &hwsim_net_id,
        .size = sizeof(struct hwsim_net),
};

/*
 * Undo hwsim_init_netlink(): the notifier is removed first, then the
 * Generic Netlink family (reverse of the registration order).
 */
static void hwsim_exit_netlink(void)
{
        /* unregister the notifier */
        netlink_unregister_notifier(&hwsim_netlink_notifier);
        /* unregister the family */
        genl_unregister_family(&hwsim_genl_family);
}

#if IS_REACHABLE(CONFIG_VIRTIO)
/*
 * TX virtqueue callback: reclaim every buffer the queue hands back and
 * free the skb it refers to.  Runs with hwsim_virtio_lock held and
 * interrupts saved/disabled for the duration of the drain.
 */
static void hwsim_virtio_tx_done(struct virtqueue *vq)
{
        unsigned long irqflags;
        unsigned int used_len;
        struct sk_buff *done;

        spin_lock_irqsave(&hwsim_virtio_lock, irqflags);
        for (;;) {
                done = virtqueue_get_buf(vq, &used_len);
                if (!done)
                        break;
                dev_kfree_skb_irq(done);
        }
        spin_unlock_irqrestore(&hwsim_virtio_lock, irqflags);
}

/*
 * Parse a netlink message received over virtio and dispatch it to the
 * matching hwsim command handler.  The handlers' own return values are
 * not propagated.
 *
 * Returns 0 once a handler has been called, -EINVAL when the skb is
 * shorter than the length announced in the netlink header, the
 * genlmsg_parse() error when attribute parsing fails, or -EPROTO for a
 * command that is not accepted on this path.
 */
static int hwsim_virtio_handle_cmd(struct sk_buff *skb)
{
        struct nlattr *attrs[HWSIM_ATTR_MAX + 1];
        struct nlmsghdr *nlh = nlmsg_hdr(skb);
        struct genlmsghdr *gnlh = nlmsg_data(nlh);
        struct genl_info info = {};
        int ret;

        if (skb->len < nlh->nlmsg_len)
                return -EINVAL;

        ret = genlmsg_parse(nlh, &hwsim_genl_family, attrs, HWSIM_ATTR_MAX,
                            hwsim_genl_policy, NULL);
        if (ret) {
                pr_err_ratelimited("hwsim: genlmsg_parse returned %d\n", ret);
                return ret;
        }

        /* Handlers only get the parsed attributes; the rest stays zero. */
        info.attrs = attrs;

        if (gnlh->cmd == HWSIM_CMD_FRAME) {
                hwsim_cloned_frame_received_nl(skb, &info);
        } else if (gnlh->cmd == HWSIM_CMD_TX_INFO_FRAME) {
                hwsim_tx_info_frame_received_nl(skb, &info);
        } else if (gnlh->cmd == HWSIM_CMD_REPORT_PMSR) {
                hwsim_pmsr_report_nl(skb, &info);
        } else {
                pr_err_ratelimited("hwsim: invalid cmd: %d\n", gnlh->cmd);
                return -EPROTO;
        }

        return 0;
}

/*
 * Work item that processes one buffer from the RX virtqueue per run.
 *
 * A used buffer is taken from the queue under hwsim_virtio_lock, handled
 * with the lock dropped, and then put back on the RX queue under the lock.
 * The work re-schedules itself after requeueing; a run that finds virtio
 * disabled or no used buffer just returns.
 */
static void hwsim_virtio_rx_work(struct work_struct *work)
{
        struct virtqueue *vq;
        unsigned int len;
        struct sk_buff *skb;
        struct scatterlist sg[1];
        int err;
        unsigned long flags;

        spin_lock_irqsave(&hwsim_virtio_lock, flags);
        if (!hwsim_virtio_enabled)
                goto out_unlock;

        skb = virtqueue_get_buf(hwsim_vqs[HWSIM_VQ_RX], &len);
        if (!skb)
                goto out_unlock;
        /* Command handling happens without the virtio lock held. */
        spin_unlock_irqrestore(&hwsim_virtio_lock, flags);

        /* Rewind the skb to its head and expose the len bytes reported. */
        skb->data = skb->head;
        skb_reset_tail_pointer(skb);
        skb_put(skb, len);
        hwsim_virtio_handle_cmd(skb);

        spin_lock_irqsave(&hwsim_virtio_lock, flags);
        /* Re-check under the lock: the flag may have been cleared meanwhile. */
        if (!hwsim_virtio_enabled) {
                dev_kfree_skb_irq(skb);
                goto out_unlock;
        }
        /* Give the whole buffer (head..end) back to the device for reuse. */
        vq = hwsim_vqs[HWSIM_VQ_RX];
        sg_init_one(sg, skb->head, skb_end_offset(skb));
        err = virtqueue_add_inbuf(vq, sg, 1, skb, GFP_ATOMIC);
        if (WARN(err, "virtqueue_add_inbuf returned %d\n", err))
                dev_kfree_skb_irq(skb);
        else
                virtqueue_kick(vq);
        /* Run again to pick up any further used buffers. */
        schedule_work(&hwsim_virtio_rx);

out_unlock:
        spin_unlock_irqrestore(&hwsim_virtio_lock, flags);
}

/* RX virtqueue callback: defer the actual processing to the hwsim_virtio_rx work item. */
static void hwsim_virtio_rx_done(struct virtqueue *vq)
{
        schedule_work(&hwsim_virtio_rx);
}

/*
 * Look up the HWSIM_NUM_VQS virtqueues of @vdev ("tx" and "rx") and store
 * them in hwsim_vqs, attaching the matching completion callbacks.
 *
 * Returns the result of virtio_find_vqs().
 */
static int init_vqs(struct virtio_device *vdev)
{
        /* Completion callback per queue, indexed by HWSIM_VQ_*. */
        vq_callback_t *callbacks[HWSIM_NUM_VQS] = {
                [HWSIM_VQ_TX] = hwsim_virtio_tx_done,
                [HWSIM_VQ_RX] = hwsim_virtio_rx_done,
        };
        /* Queue names, indexed by HWSIM_VQ_*. */
        const char *names[HWSIM_NUM_VQS] = {
                [HWSIM_VQ_TX] = "tx",
                [HWSIM_VQ_RX] = "rx",
        };

        return virtio_find_vqs(vdev, HWSIM_NUM_VQS,
                               hwsim_vqs, callbacks, names, NULL);
}

/*
 * Post one freshly allocated receive buffer per ring slot of @vq, then
 * kick the queue.
 *
 * Returns 0 on success, -ENOMEM when an skb cannot be allocated, or the
 * virtqueue_add_inbuf() error.  Buffers that were queued before a failure
 * are left on the queue; the queue is not kicked in that case.
 */
static int fill_vq(struct virtqueue *vq)
{
        struct scatterlist sg[1];
        struct sk_buff *buf;
        int posted = 0;
        int ret;

        while (posted < virtqueue_get_vring_size(vq)) {
                buf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
                if (!buf)
                        return -ENOMEM;

                /* Expose the whole skb data area (head..end) to the device. */
                sg_init_one(sg, buf->head, skb_end_offset(buf));
                ret = virtqueue_add_inbuf(vq, sg, 1, buf, GFP_KERNEL);
                if (ret) {
                        nlmsg_free(buf);
                        return ret;
                }

                posted++;
        }

        virtqueue_kick(vq);
        return 0;
}

/*
 * Tear down the virtqueues of @vdev: reset the device, free every buffer
 * still attached to any queue in hwsim_vqs, then delete the queues.
 */
static void remove_vqs(struct virtio_device *vdev)
{
        struct sk_buff *unused;
        int q;

        virtio_reset_device(vdev);

        for (q = 0; q < ARRAY_SIZE(hwsim_vqs); q++) {
                struct virtqueue *vq = hwsim_vqs[q];

                for (;;) {
                        unused = virtqueue_detach_unused_buf(vq);
                        if (!unused)
                                break;
                        nlmsg_free(unused);
                }
        }

        vdev->config->del_vqs(vdev);
}

static int hwsim_virtio_probe(struct virtio_device *vdev)
{
        int err;
        unsigned long flags;

        spin_lock_irqsave(&hwsim_virtio_lock, flags);
        if (hwsim_virtio_enabled) {
                spin_unlock_irqrestore(&hwsim_virtio_lock, flags);
                return -EEXIST;
        }
        spin_unlock_irqrestore(&hwsim_virtio_lock, flags);

        err = init_vqs(vdev);
        if (err)
                return err;

        virtio_device_ready(vdev);

        err = fill_vq(hwsim_vqs[HWSIM_VQ_RX]);
        if (err)
                goto out_remove;

        spin_lock_irqsave(&hwsim_virtio_lock, flags);
        hwsim_virtio_enabled = true;
        spin_unlock_irqrestore(&hwsim_virtio_lock, flags);

        schedule_work(&hwsim_virtio_rx);
        return 0;

out_remove:
        remove_vqs(vdev);
        return err;
}

/*
 * Virtio remove: clear the enabled flag, wait for the RX work to finish,
 * then tear down the virtqueues.
 */
static void hwsim_virtio_remove(struct virtio_device *vdev)
{
        /* NOTE(review): written without hwsim_virtio_lock, unlike probe — confirm intended. */
        hwsim_virtio_enabled = false;

        cancel_work_sync(&hwsim_virtio_rx);

        remove_vqs(vdev);
}

/*
 * MAC80211_HWSIM virtio device id table
 *
 * Matches VIRTIO_ID_MAC80211_HWSIM devices from any vendor; the zero entry
 * terminates the table.
 */
static const struct virtio_device_id id_table[] = {
        { VIRTIO_ID_MAC80211_HWSIM, VIRTIO_DEV_ANY_ID },
        { 0 }
};
MODULE_DEVICE_TABLE(virtio, id_table);

/* Virtio driver description: binds id_table to the hwsim probe/remove callbacks. */
static struct virtio_driver virtio_hwsim = {
        .driver.name = KBUILD_MODNAME,
        .id_table = id_table,
        .probe = hwsim_virtio_probe,
        .remove = hwsim_virtio_remove,
};

/* Register the hwsim virtio driver; returns the result of register_virtio_driver(). */
static int hwsim_register_virtio_driver(void)
{
        return register_virtio_driver(&virtio_hwsim);
}

/* Unregister the hwsim virtio driver. */
static void hwsim_unregister_virtio_driver(void)
{
        unregister_virtio_driver(&virtio_hwsim);
}
#else
/* Stub used when CONFIG_VIRTIO is not reachable: nothing to register, always succeeds. */
static inline int hwsim_register_virtio_driver(void)
{
        return 0;
}

/* Stub used when CONFIG_VIRTIO is not reachable: nothing to unregister. */
static inline void hwsim_unregister_virtio_driver(void)
{
}
#endif

/*
 * Module init: validate the 'radios' and 'channels' settings, bring up the
 * supporting infrastructure (rhashtable, pernet ops, platform driver,
 * netlink, virtio driver, device class), create 'radios' simulated radios
 * configured according to 'regtest', and register the hwsim monitor
 * netdevice.
 *
 * Returns 0 on success or a negative error code; on failure the steps that
 * already succeeded are undone through the label ladder at the end.
 */
static int __init init_mac80211_hwsim(void)
{
        int i, err;

        if (radios < 0 || radios > 100)
                return -EINVAL;

        if (channels < 1)
                return -EINVAL;

        err = rhashtable_init(&hwsim_radios_rht, &hwsim_rht_params);
        if (err)
                return err;

        err = register_pernet_device(&hwsim_net_ops);
        if (err)
                goto out_free_rht;

        err = platform_driver_register(&mac80211_hwsim_driver);
        if (err)
                goto out_unregister_pernet;

        err = hwsim_init_netlink();
        if (err)
                goto out_unregister_driver;

        err = hwsim_register_virtio_driver();
        if (err)
                goto out_exit_netlink;

        hwsim_class = class_create("mac80211_hwsim");
        if (IS_ERR(hwsim_class)) {
                err = PTR_ERR(hwsim_class);
                goto out_exit_virtio;
        }

        hwsim_init_s1g_channels(hwsim_channels_s1g);

        /* Create the initial radios; 'regtest' selects per-radio regulatory setup. */
        for (i = 0; i < radios; i++) {
                struct hwsim_new_radio_params param = { 0 };

                param.channels = channels;

                switch (regtest) {
                case HWSIM_REGTEST_DIFF_COUNTRY:
                        /* One alpha2 per radio, as long as the table lasts. */
                        if (i < ARRAY_SIZE(hwsim_alpha2s))
                                param.reg_alpha2 = hwsim_alpha2s[i];
                        break;
                case HWSIM_REGTEST_DRIVER_REG_FOLLOW:
                        /* Only the first radio gets an alpha2 hint. */
                        if (!i)
                                param.reg_alpha2 = hwsim_alpha2s[0];
                        break;
                case HWSIM_REGTEST_STRICT_ALL:
                        param.reg_strict = true;
                        fallthrough;
                case HWSIM_REGTEST_DRIVER_REG_ALL:
                        /* Every radio gets the first alpha2. */
                        param.reg_alpha2 = hwsim_alpha2s[0];
                        break;
                case HWSIM_REGTEST_WORLD_ROAM:
                        if (i == 0)
                                param.regd = &hwsim_world_regdom_custom_01;
                        break;
                case HWSIM_REGTEST_CUSTOM_WORLD:
                        param.regd = &hwsim_world_regdom_custom_03;
                        break;
                case HWSIM_REGTEST_CUSTOM_WORLD_2:
                        if (i == 0)
                                param.regd = &hwsim_world_regdom_custom_03;
                        else if (i == 1)
                                param.regd = &hwsim_world_regdom_custom_02;
                        break;
                case HWSIM_REGTEST_STRICT_FOLLOW:
                        if (i == 0) {
                                param.reg_strict = true;
                                param.reg_alpha2 = hwsim_alpha2s[0];
                        }
                        break;
                case HWSIM_REGTEST_STRICT_AND_DRIVER_REG:
                        if (i == 0) {
                                param.reg_strict = true;
                                param.reg_alpha2 = hwsim_alpha2s[0];
                        } else if (i == 1) {
                                param.reg_alpha2 = hwsim_alpha2s[1];
                        }
                        break;
                case HWSIM_REGTEST_ALL:
                        /* Radios 0-4 each get a different setup; others none. */
                        switch (i) {
                        case 0:
                                param.regd = &hwsim_world_regdom_custom_01;
                                break;
                        case 1:
                                param.regd = &hwsim_world_regdom_custom_02;
                                break;
                        case 2:
                                param.reg_alpha2 = hwsim_alpha2s[0];
                                break;
                        case 3:
                                param.reg_alpha2 = hwsim_alpha2s[1];
                                break;
                        case 4:
                                param.reg_strict = true;
                                param.reg_alpha2 = hwsim_alpha2s[2];
                                break;
                        }
                        break;
                default:
                        break;
                }

                param.p2p_device = support_p2p_device;
                param.mlo = mlo;
                /* Channel contexts are needed for multi-channel or MLO. */
                param.use_chanctx = channels > 1 || mlo;
                param.iftypes = HWSIM_IFTYPE_SUPPORT_MASK;
                if (param.p2p_device)
                        param.iftypes |= BIT(NL80211_IFTYPE_P2P_DEVICE);

                err = mac80211_hwsim_new_radio(NULL, &param);
                if (err < 0)
                        goto out_free_radios;
        }

        /* Allocate and register the "hwsim%d" monitor netdevice. */
        hwsim_mon = alloc_netdev(0, "hwsim%d", NET_NAME_UNKNOWN,
                                 hwsim_mon_setup);
        if (hwsim_mon == NULL) {
                err = -ENOMEM;
                goto out_free_radios;
        }

        /* Name allocation and registration are done under the rtnl lock. */
        rtnl_lock();
        err = dev_alloc_name(hwsim_mon, hwsim_mon->name);
        if (err < 0) {
                rtnl_unlock();
                goto out_free_mon;
        }

        err = register_netdevice(hwsim_mon);
        if (err < 0) {
                rtnl_unlock();
                goto out_free_mon;
        }
        rtnl_unlock();

        return 0;

        /* Error unwinding: each label undoes one step and falls through. */
out_free_mon:
        free_netdev(hwsim_mon);
out_free_radios:
        mac80211_hwsim_free();
out_exit_virtio:
        hwsim_unregister_virtio_driver();
out_exit_netlink:
        hwsim_exit_netlink();
out_unregister_driver:
        platform_driver_unregister(&mac80211_hwsim_driver);
out_unregister_pernet:
        unregister_pernet_device(&hwsim_net_ops);
out_free_rht:
        rhashtable_destroy(&hwsim_radios_rht);
        return err;
}
module_init(init_mac80211_hwsim);

/*
 * Module exit handler.
 *
 * Tears down, in this order: the virtio driver, the netlink interface, the
 * radios (mac80211_hwsim_free(), the same call the init error path uses
 * under its out_free_radios label), the radio hash table, the hwsim_mon
 * netdev, the platform driver and finally the pernet device operations.
 */
static void __exit exit_mac80211_hwsim(void)
{
        pr_debug("mac80211_hwsim: unregister radios\n");

        hwsim_unregister_virtio_driver();
        hwsim_exit_netlink();

        mac80211_hwsim_free();

        rhashtable_destroy(&hwsim_radios_rht);
        unregister_netdev(hwsim_mon);
        platform_driver_unregister(&mac80211_hwsim_driver);
        unregister_pernet_device(&hwsim_net_ops);
}
module_exit(exit_mac80211_hwsim);



















































    2 

















































































































































































































































































































































































































































































































































































































































































































































































    2 















































































    2 







    2 













    2 


    2 












































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ioctl.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/compat.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/falloc.h>
#include <linux/sched/signal.h>
#include <linux/fiemap.h>
#include <linux/mount.h>
#include <linux/fscrypt.h>
#include <linux/fileattr.h>

#include "internal.h"

#include <asm/ioctls.h>

/*
 * Largest extent count whose total size in bytes, i.e.
 * count * sizeof(struct fiemap_extent), still fits in an unsigned int.
 * Capping requests at this value means the fiemap access checks can't
 * overflow on 32 bit machines.
 */
#define FIEMAP_MAX_EXTENTS        (UINT_MAX / sizeof(struct fiemap_extent))

/**
 * vfs_ioctl - dispatch an ioctl to the file's own handler
 * @filp:        open file whose ->unlocked_ioctl method is invoked
 * @cmd:        ioctl command number
 * @arg:        command-specific argument, passed through untouched
 *
 * Calls ->unlocked_ioctl when the file provides one.  A missing method, or
 * a method that answers -ENOIOCTLCMD, is reported to the caller as -ENOTTY.
 *
 * Returns the handler's result, or -ENOTTY as described above.
 */
long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
        int ret;

        if (!filp->f_op->unlocked_ioctl)
                return -ENOTTY;

        ret = filp->f_op->unlocked_ioctl(filp, cmd, arg);
        return ret == -ENOIOCTLCMD ? -ENOTTY : ret;
}
EXPORT_SYMBOL(vfs_ioctl);

/*
 * FIBMAP: read a block number from @p, map it with bmap() and write the
 * result back to @p.
 *
 * Requires CAP_SYS_RAWIO (-EPERM otherwise).  A negative input is rejected
 * with -EINVAL.  If the mapped block does not fit in an int the call fails
 * with -ERANGE and a rate-limited warning is logged.  Whenever an error is
 * pending, 0 is written back instead of a block number.  A failing write
 * back to userspace yields -EFAULT.
 */
static int ioctl_fibmap(struct file *filp, int __user *p)
{
        struct inode *inode = file_inode(filp);
        struct super_block *sb = inode->i_sb;
        sector_t blk;
        int res, user_blk;

        if (!capable(CAP_SYS_RAWIO))
                return -EPERM;

        res = get_user(user_blk, p);
        if (res)
                return res;

        if (user_blk < 0)
                return -EINVAL;

        blk = user_blk;
        res = bmap(inode, &blk);

        /* The answer has to fit the int that userspace handed in. */
        if (blk > INT_MAX) {
                res = -ERANGE;
                pr_warn_ratelimited("[%s/%d] FS: %s File: %pD4 would truncate fibmap result\n",
                                    current->comm, task_pid_nr(current),
                                    sb->s_id, filp);
        }

        user_blk = res ? 0 : (int)blk;

        return put_user(user_blk, p) ? -EFAULT : res;
}

/**
 * fiemap_fill_next_extent - hand one extent record to the fiemap caller
 * @fieinfo:        Fiemap context passed into ->fiemap
 * @logical:        Logical start offset of the extent, in bytes
 * @phys:        Physical start offset of the extent, in bytes
 * @len:        Length of the extent, in bytes
 * @flags:        FIEMAP_EXTENT flags describing the extent
 *
 * Meant to be called from a file system's ->fiemap callback.  When the
 * context has room for no extents (fi_extents_max == 0) the extent is only
 * counted.  Otherwise the record is built, copied to the next free slot of
 * the user array and the mapped-extent count is bumped.
 *
 * Returns 0 on success, -EFAULT if the copy to user memory fails, and 1
 * when no further extent should be supplied: the user array is (or already
 * was) full, or @flags carries FIEMAP_EXTENT_LAST.
 */
int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
                            u64 phys, u64 len, u32 flags)
{
        struct fiemap_extent __user *slot;
        struct fiemap_extent ext;

        /* Counting mode: nothing is copied out, the extent is just tallied. */
        if (!fieinfo->fi_extents_max) {
                fieinfo->fi_extents_mapped++;
                goto report_last;
        }

        if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
                return 1;

#define SET_UNKNOWN_FLAGS        (FIEMAP_EXTENT_DELALLOC)
#define SET_NO_UNMOUNTED_IO_FLAGS        (FIEMAP_EXTENT_DATA_ENCRYPTED)
#define SET_NOT_ALIGNED_FLAGS        (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)

        /* Some flags imply a more general one; set the implied flag too. */
        if (flags & SET_UNKNOWN_FLAGS)
                flags |= FIEMAP_EXTENT_UNKNOWN;
        if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
                flags |= FIEMAP_EXTENT_ENCODED;
        if (flags & SET_NOT_ALIGNED_FLAGS)
                flags |= FIEMAP_EXTENT_NOT_ALIGNED;

        /* Start from an all-zero record so unset fields are copied as 0. */
        memset(&ext, 0, sizeof(ext));
        ext.fe_flags = flags;
        ext.fe_length = len;
        ext.fe_physical = phys;
        ext.fe_logical = logical;

        slot = fieinfo->fi_extents_start + fieinfo->fi_extents_mapped;
        if (copy_to_user(slot, &ext, sizeof(ext)))
                return -EFAULT;

        if (++fieinfo->fi_extents_mapped == fieinfo->fi_extents_max)
                return 1;

report_last:
        return !!(flags & FIEMAP_EXTENT_LAST);
}
EXPORT_SYMBOL(fiemap_fill_next_extent);

/**
 * fiemap_prep - validate and normalise a fiemap request
 * @inode:        Inode the request operates on
 * @fieinfo:        Fiemap context passed into ->fiemap
 * @start:        First byte of the range to map
 * @len:        Length of the range; shortened in place when the range would
 *                reach past the file system's s_maxbytes
 * @supported_flags:        Fiemap flags the file system understands
 *
 * Every ->fiemap instance must call this to check the request against the
 * file system parameters.  FIEMAP_FLAG_SYNC is always treated as supported
 * and, when requested, triggers a write-and-wait on the inode's mapping.
 * If unsupported flags were requested, they are stored back into
 * @fieinfo->fi_flags and -EBADR is returned.
 *
 * Returns 0 on success, or a negative error on failure: -EINVAL for a zero
 * length, -EFBIG if @start is at or beyond s_maxbytes, -EBADR as above, or
 * whatever filemap_write_and_wait() reports.
 */
int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo,
                u64 start, u64 *len, u32 supported_flags)
{
        u64 limit = inode->i_sb->s_maxbytes;
        u32 allowed, rejected;

        if (!*len)
                return -EINVAL;
        if (start >= limit)
                return -EFBIG;

        /* Clamp the range so it ends no later than the fs limit. */
        if (*len > limit || start > limit - *len)
                *len = limit - start;

        allowed = (supported_flags | FIEMAP_FLAG_SYNC) & FIEMAP_FLAGS_COMPAT;
        rejected = fieinfo->fi_flags & ~allowed;
        if (rejected) {
                fieinfo->fi_flags = rejected;
                return -EBADR;
        }

        if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
                return 0;

        return filemap_write_and_wait(inode->i_mapping);
}
EXPORT_SYMBOL(fiemap_prep);

/*
 * FS_IOC_FIEMAP: copy the request header in, run the inode's ->fiemap
 * method on it and copy the updated header (flags and number of mapped
 * extents) back out.
 *
 * Returns -EOPNOTSUPP when the inode has no ->fiemap, -EFAULT when either
 * copy fails, -EINVAL when the requested extent count exceeds
 * FIEMAP_MAX_EXTENTS, and otherwise the result of ->fiemap.
 */
static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap)
{
        struct inode *inode = file_inode(filp);
        struct fiemap_extent_info info = { 0, };
        struct fiemap req;
        int ret;

        if (!inode->i_op->fiemap)
                return -EOPNOTSUPP;

        if (copy_from_user(&req, ufiemap, sizeof(req)))
                return -EFAULT;

        if (req.fm_extent_count > FIEMAP_MAX_EXTENTS)
                return -EINVAL;

        info.fi_extents_start = ufiemap->fm_extents;
        info.fi_extents_max = req.fm_extent_count;
        info.fi_flags = req.fm_flags;

        ret = inode->i_op->fiemap(inode, &info, req.fm_start, req.fm_length);

        /* The header is written back even when ->fiemap reported an error. */
        req.fm_mapped_extents = info.fi_extents_mapped;
        req.fm_flags = info.fi_flags;

        return copy_to_user(ufiemap, &req, sizeof(req)) ? -EFAULT : ret;
}

/*
 * Clone @olen bytes at @off of the file behind descriptor @srcfd into
 * @dst_file at @destoff, using vfs_clone_file_range().
 *
 * Returns -EBADF if @srcfd does not resolve to a file, the negative result
 * of the clone if it fails, -EINVAL if a non-zero @olen was requested but
 * a different number of bytes was cloned, and 0 otherwise.
 */
static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
                             u64 off, u64 olen, u64 destoff)
{
        struct fd src = fdget(srcfd);
        loff_t done;
        int status = 0;

        if (!src.file)
                return -EBADF;

        done = vfs_clone_file_range(src.file, off, dst_file, destoff, olen, 0);
        if (done < 0)
                status = done;
        else if (olen && done != olen)
                status = -EINVAL;

        fdput(src);
        return status;
}

/*
 * FICLONERANGE: fetch the struct file_clone_range from userspace and pass
 * its fields on to ioctl_file_clone().  Returns -EFAULT if the argument
 * cannot be read.
 */
static long ioctl_file_clone_range(struct file *file,
                                   struct file_clone_range __user *argp)
{
        struct file_clone_range req;

        if (copy_from_user(&req, argp, sizeof(req)) != 0)
                return -EFAULT;

        return ioctl_file_clone(file, req.src_fd, req.src_offset,
                                req.src_length, req.dest_offset);
}

/*
 * Compatibility shim for the legacy XFS pre-allocation ioctls, which are
 * older than the fallocate syscall.
 *
 * Of 'struct space_resv' only l_start, l_len and l_whence are looked at.
 * l_whence selects what l_start is relative to (start of file, current
 * file position or current file size); any other value gives -EINVAL.
 * The request is then carried out by vfs_fallocate() with
 * FALLOC_FL_KEEP_SIZE added to @mode.
 */
static int ioctl_preallocate(struct file *filp, int mode, void __user *argp)
{
        struct inode *inode = file_inode(filp);
        struct space_resv sr;

        if (copy_from_user(&sr, argp, sizeof(sr)))
                return -EFAULT;

        /* SEEK_SET needs no adjustment; unknown values are rejected. */
        if (sr.l_whence == SEEK_CUR)
                sr.l_start += filp->f_pos;
        else if (sr.l_whence == SEEK_END)
                sr.l_start += i_size_read(inode);
        else if (sr.l_whence != SEEK_SET)
                return -EINVAL;

        return vfs_fallocate(filp, mode | FALLOC_FL_KEEP_SIZE, sr.l_start,
                        sr.l_len);
}

/* on ia32 l_start is on a 32-bit boundary */
#if defined CONFIG_COMPAT && defined(CONFIG_X86_64)
/*
 * Same job as ioctl_preallocate(), but reads the argument through
 * 'struct space_resv_32' to account for the different alignment.
 */
static int compat_ioctl_preallocate(struct file *file, int mode,
                                    struct space_resv_32 __user *argp)
{
        struct inode *inode = file_inode(file);
        struct space_resv_32 sr;

        if (copy_from_user(&sr, argp, sizeof(sr)))
                return -EFAULT;

        /* SEEK_SET needs no adjustment; unknown values are rejected. */
        if (sr.l_whence == SEEK_CUR)
                sr.l_start += file->f_pos;
        else if (sr.l_whence == SEEK_END)
                sr.l_start += i_size_read(inode);
        else if (sr.l_whence != SEEK_SET)
                return -EINVAL;

        return vfs_fallocate(file, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len);
}
#endif

/*
 * Dispatch FIBMAP and the space reservation ioctls.  The reservation
 * commands differ only in the fallocate mode handed to
 * ioctl_preallocate().  Any other command yields -ENOIOCTLCMD.
 */
static int file_ioctl(struct file *filp, unsigned int cmd, int __user *p)
{
        int mode;

        if (cmd == FIBMAP)
                return ioctl_fibmap(filp, p);

        switch (cmd) {
        case FS_IOC_RESVSP:
        case FS_IOC_RESVSP64:
                mode = 0;
                break;
        case FS_IOC_UNRESVSP:
        case FS_IOC_UNRESVSP64:
                mode = FALLOC_FL_PUNCH_HOLE;
                break;
        case FS_IOC_ZERO_RANGE:
                mode = FALLOC_FL_ZERO_RANGE;
                break;
        default:
                return -ENOIOCTLCMD;
        }

        return ioctl_preallocate(filp, mode, p);
}

/*
 * FIONBIO: read an int from @argp and set (non-zero) or clear (zero) the
 * non-blocking flag in filp->f_flags, under filp->f_lock.  Returns the
 * get_user() error if the argument cannot be read, 0 otherwise.
 */
static int ioctl_fionbio(struct file *filp, int __user *argp)
{
        unsigned int mask = O_NONBLOCK;
        int enable, ret;

        ret = get_user(enable, argp);
        if (ret)
                return ret;

#ifdef __sparc__
        /* SunOS compatibility item. */
        if (O_NONBLOCK != O_NDELAY)
                mask |= O_NDELAY;
#endif
        spin_lock(&filp->f_lock);
        if (enable)
                filp->f_flags |= mask;
        else
                filp->f_flags &= ~mask;
        spin_unlock(&filp->f_lock);

        return 0;
}

/*
 * FIOASYNC: read an int from @argp and, if that changes the file's FASYNC
 * state, ask the file's ->fasync method to switch it.
 *
 * Returns the get_user() error if the argument cannot be read, -ENOTTY if
 * a change is needed but the file has no ->fasync, a negative ->fasync
 * result as is, and 0 in every other case.
 */
static int ioctl_fioasync(unsigned int fd, struct file *filp,
                          int __user *argp)
{
        unsigned int wanted;
        int enable, ret;

        ret = get_user(enable, argp);
        if (ret)
                return ret;

        wanted = enable ? FASYNC : 0;

        /* Nothing to do unless the FASYNC bit actually flips. */
        if (!((wanted ^ filp->f_flags) & FASYNC))
                return 0;

        if (!filp->f_op->fasync)
                return -ENOTTY;

        /* fasync() adjusts filp->f_flags */
        ret = filp->f_op->fasync(fd, filp, enable);
        return ret < 0 ? ret : 0;
}

/*
 * FIFREEZE: freeze the superblock the file lives on, on behalf of
 * userspace.
 *
 * The caller needs CAP_SYS_ADMIN in the superblock's user namespace
 * (-EPERM otherwise).  A superblock that provides neither ->freeze_super
 * nor ->freeze_fs gets -EOPNOTSUPP.
 */
static int ioctl_fsfreeze(struct file *filp)
{
        struct super_block *sb = file_inode(filp)->i_sb;

        if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        /* A superblock-provided ->freeze_super takes precedence. */
        if (sb->s_op->freeze_super)
                return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE);

        /* Neither hook is present: freezing is not supported. */
        if (!sb->s_op->freeze_fs)
                return -EOPNOTSUPP;

        return freeze_super(sb, FREEZE_HOLDER_USERSPACE);
}

/*
 * FITHAW: thaw the superblock the file lives on, on behalf of userspace.
 *
 * The caller needs CAP_SYS_ADMIN in the superblock's user namespace
 * (-EPERM otherwise).  The superblock's own ->thaw_super is used when
 * there is one, thaw_super() otherwise.
 */
static int ioctl_fsthaw(struct file *filp)
{
        struct super_block *sb = file_inode(filp)->i_sb;

        if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        if (!sb->s_op->thaw_super)
                return thaw_super(sb, FREEZE_HOLDER_USERSPACE);

        return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE);
}

/*
 * FIDEDUPERANGE: copy the variable-length struct file_dedupe_range in,
 * run vfs_dedupe_file_range() on it and, on success, copy the structure
 * back out.
 *
 * Returns -EFAULT if userspace memory cannot be accessed, -ENOMEM if the
 * structure (header plus dest_count info entries) is larger than
 * PAGE_SIZE, the memdup_user() error if the copy-in fails, or the result
 * of vfs_dedupe_file_range().
 */
static int ioctl_file_dedupe_range(struct file *file,
                                   struct file_dedupe_range __user *argp)
{
        struct file_dedupe_range *req;
        unsigned long bytes;
        u16 ndest;
        int ret;

        if (get_user(ndest, &argp->dest_count))
                return -EFAULT;

        bytes = offsetof(struct file_dedupe_range, info[ndest]);
        if (bytes > PAGE_SIZE)
                return -ENOMEM;

        req = memdup_user(argp, bytes);
        if (IS_ERR(req))
                return PTR_ERR(req);

        /*
         * The copy above read dest_count from userspace a second time;
         * force it back to the value the size was computed from.
         */
        req->dest_count = ndest;

        ret = vfs_dedupe_file_range(file, req);
        if (!ret && copy_to_user(argp, req, bytes))
                ret = -EFAULT;

        kfree(req);
        return ret;
}

/**
 * fileattr_fill_xflags - set up a fileattr from FS_XFLAG_* flags
 * @fa:                fileattr to initialize
 * @xflags:        FS_XFLAG_* flags
 *
 * Clears @fa completely, then fills in ->fsx_valid, ->fsx_xflags and
 * ->flags, the latter holding the FS_*_FL translation of @xflags.
 */
void fileattr_fill_xflags(struct fileattr *fa, u32 xflags)
{
        u32 fl = 0;

        /* Translate every xflag that has an FS_*_FL counterpart. */
        if (xflags & FS_XFLAG_IMMUTABLE)
                fl |= FS_IMMUTABLE_FL;
        if (xflags & FS_XFLAG_APPEND)
                fl |= FS_APPEND_FL;
        if (xflags & FS_XFLAG_SYNC)
                fl |= FS_SYNC_FL;
        if (xflags & FS_XFLAG_NOATIME)
                fl |= FS_NOATIME_FL;
        if (xflags & FS_XFLAG_NODUMP)
                fl |= FS_NODUMP_FL;
        if (xflags & FS_XFLAG_DAX)
                fl |= FS_DAX_FL;
        if (xflags & FS_XFLAG_PROJINHERIT)
                fl |= FS_PROJINHERIT_FL;

        memset(fa, 0, sizeof(*fa));
        fa->fsx_valid = true;
        fa->fsx_xflags = xflags;
        fa->flags = fl;
}
EXPORT_SYMBOL(fileattr_fill_xflags);

/**
 * fileattr_fill_flags - set up a fileattr from FS_*_FL flags
 * @fa:                fileattr to initialize
 * @flags:        FS_*_FL flags
 *
 * Clears @fa completely, then fills in ->flags_valid, ->flags and
 * ->fsx_xflags, the latter holding the FS_XFLAG_* translation of @flags.
 */
void fileattr_fill_flags(struct fileattr *fa, u32 flags)
{
        u32 xfl = 0;

        /* Translate every flag that has an FS_XFLAG_* counterpart. */
        if (flags & FS_SYNC_FL)
                xfl |= FS_XFLAG_SYNC;
        if (flags & FS_IMMUTABLE_FL)
                xfl |= FS_XFLAG_IMMUTABLE;
        if (flags & FS_APPEND_FL)
                xfl |= FS_XFLAG_APPEND;
        if (flags & FS_NODUMP_FL)
                xfl |= FS_XFLAG_NODUMP;
        if (flags & FS_NOATIME_FL)
                xfl |= FS_XFLAG_NOATIME;
        if (flags & FS_DAX_FL)
                xfl |= FS_XFLAG_DAX;
        if (flags & FS_PROJINHERIT_FL)
                xfl |= FS_XFLAG_PROJINHERIT;

        memset(fa, 0, sizeof(*fa));
        fa->flags_valid = true;
        fa->flags = flags;
        fa->fsx_xflags = xfl;
}
EXPORT_SYMBOL(fileattr_fill_flags);

/**
 * vfs_fileattr_get - fetch miscellaneous file attributes
 * @dentry:        the object whose attributes are wanted
 * @fa:                fileattr to be filled in by the callback
 *
 * Forwards to the inode's i_op->fileattr_get() callback when one exists.
 *
 * Return: the callback's result (0 on success, negative error on failure),
 * or -ENOIOCTLCMD if the inode has no such callback.
 */
int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
        struct inode *inode = d_inode(dentry);

        if (inode->i_op->fileattr_get)
                return inode->i_op->fileattr_get(dentry, fa);

        return -ENOIOCTLCMD;
}
EXPORT_SYMBOL(vfs_fileattr_get);

/**
 * copy_fsxattr_to_user - export a fileattr as a struct fsxattr
 * @fa:                fileattr to read from
 * @ufa:        userspace struct fsxattr to write to
 *
 * Only the fsx_* fields are transferred; everything else in the struct
 * handed to userspace is zero.
 *
 * Return: 0 on success, or -EFAULT on failure.
 */
int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa)
{
        struct fsxattr out;

        /* Zero the whole struct first so no stale stack bytes are copied. */
        memset(&out, 0, sizeof(out));
        out.fsx_cowextsize = fa->fsx_cowextsize;
        out.fsx_projid = fa->fsx_projid;
        out.fsx_nextents = fa->fsx_nextents;
        out.fsx_extsize = fa->fsx_extsize;
        out.fsx_xflags = fa->fsx_xflags;

        return copy_to_user(ufa, &out, sizeof(out)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(copy_fsxattr_to_user);

/*
 * Read a struct fsxattr from userspace and turn it into a fileattr.
 * Returns 0 on success, or -EFAULT if the user memory cannot be read.
 */
static int copy_fsxattr_from_user(struct fileattr *fa,
                                  struct fsxattr __user *ufa)
{
        struct fsxattr in;

        if (copy_from_user(&in, ufa, sizeof(in)) != 0)
                return -EFAULT;

        /* fileattr_fill_xflags() zeroes *fa, so it has to run first. */
        fileattr_fill_xflags(fa, in.fsx_xflags);
        fa->fsx_cowextsize = in.fsx_cowextsize;
        fa->fsx_projid = in.fsx_projid;
        fa->fsx_nextents = in.fsx_nextents;
        fa->fsx_extsize = in.fsx_extsize;

        return 0;
}

/*
 * Common validation of FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values: compares
 * the requested attributes in @fa with the current ones in @old_ma and
 * rejects invalid configurations.  May clear extent-size xflags in @fa
 * whose matching size hint is zero.
 *
 * Returns 0 if the change is acceptable, -EPERM or -EINVAL as detailed
 * below, or the error from fscrypt_prepare_setflags().
 *
 * Note: must be called with inode lock held.
 */
static int fileattr_set_prepare(struct inode *inode,
                              const struct fileattr *old_ma,
                              struct fileattr *fa)
{
        int projid_changed = old_ma->fsx_projid != fa->fsx_projid;
        int is_reg = S_ISREG(inode->i_mode);
        int is_dir = S_ISDIR(inode->i_mode);
        int ret;

        /*
         * Changing IMMUTABLE or APPEND_ONLY requires CAP_LINUX_IMMUTABLE.
         * The capability is only queried when one of them really changes.
         */
        if (((fa->flags ^ old_ma->flags) &
             (FS_APPEND_FL | FS_IMMUTABLE_FL)) &&
            !capable(CAP_LINUX_IMMUTABLE))
                return -EPERM;

        ret = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags);
        if (ret)
                return ret;

        if (current_user_ns() == &init_user_ns) {
                /*
                 * The init namespace may change the project ID, but a new
                 * value has to be a valid one.
                 */
                if (projid_changed &&
                    !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid)))
                        return -EINVAL;
        } else if (projid_changed ||
                   ((old_ma->fsx_xflags ^ fa->fsx_xflags) &
                    FS_XFLAG_PROJINHERIT)) {
                /*
                 * Outside the init namespace the project quota ID state
                 * must stay as it is.  Everything else is allowed in user
                 * namespaces.
                 */
                return -EINVAL;
        }

        /* Extent size hints: regular files only. */
        if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !is_reg)
                return -EINVAL;

        /* Inherited extent size hints: directories only. */
        if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && !is_dir)
                return -EINVAL;

        /* CoW extent size hints and DAX: regular files and directories. */
        if ((fa->fsx_xflags & (FS_XFLAG_COWEXTSIZE | FS_XFLAG_DAX)) &&
            !is_reg && !is_dir)
                return -EINVAL;

        /* Extent size hints of zero turn off the flags. */
        if (!fa->fsx_extsize)
                fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
        if (!fa->fsx_cowextsize)
                fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;

        return 0;
}

/**
 * vfs_fileattr_set - change miscellaneous file attributes
 * @idmap:        idmap of the mount
 * @dentry:        the object to change
 * @fa:                fileattr pointer
 *
 * After verifying permissions, call i_op->fileattr_set() callback, if
 * exists.
 *
 * Verifying attributes involves retrieving current attributes with
 * i_op->fileattr_get(), this also allows initializing attributes that have
 * not been set by the caller to current values.  Inode lock is held
 * thoughout to prevent racing with another instance.
 *
 * Return: 0 on success, or a negative error on failure.
 */
int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
                     struct fileattr *fa)
{
        struct inode *inode = d_inode(dentry);
        struct fileattr old_ma = {};
        int err;

        if (!inode->i_op->fileattr_set)
                return -ENOIOCTLCMD;

        if (!inode_owner_or_capable(idmap, inode))
                return -EPERM;

        inode_lock(inode);
        err = vfs_fileattr_get(dentry, &old_ma);
        if (!err) {
                /* initialize missing bits from old_ma */
                if (fa->flags_valid) {
                        fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON;
                        fa->fsx_extsize = old_ma.fsx_extsize;
                        fa->fsx_nextents = old_ma.fsx_nextents;
                        fa->fsx_projid = old_ma.fsx_projid;
                        fa->fsx_cowextsize = old_ma.fsx_cowextsize;
                } else {
                        fa->flags |= old_ma.flags & ~FS_COMMON_FL;
                }
                err = fileattr_set_prepare(inode, &old_ma, fa);
                if (!err)
                        err = inode->i_op->fileattr_set(idmap, dentry, fa);
        }
        inode_unlock(inode);

        return err;
}
EXPORT_SYMBOL(vfs_fileattr_set);

/*
 * FS_IOC_GETFLAGS: fetch the file's attributes and store their ->flags
 * word at @argp.  Returns the vfs_fileattr_get() error, or the result of
 * put_user().
 */
static int ioctl_getflags(struct file *file, unsigned int __user *argp)
{
        struct fileattr fa = { .flags_valid = true }; /* hint only */
        int ret = vfs_fileattr_get(file->f_path.dentry, &fa);

        if (ret)
                return ret;

        return put_user(fa.flags, argp);
}

/*
 * FS_IOC_SETFLAGS: read a flags word from @argp and apply it through
 * vfs_fileattr_set(), holding write access to the mount for the duration.
 * Returns the first error met, or the result of vfs_fileattr_set().
 */
static int ioctl_setflags(struct file *file, unsigned int __user *argp)
{
        struct mnt_idmap *idmap = file_mnt_idmap(file);
        struct dentry *dentry = file->f_path.dentry;
        struct fileattr fa;
        unsigned int flags;
        int ret;

        ret = get_user(flags, argp);
        if (ret)
                return ret;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        fileattr_fill_flags(&fa, flags);
        ret = vfs_fileattr_set(idmap, dentry, &fa);
        mnt_drop_write_file(file);

        return ret;
}

/*
 * FS_IOC_FSGETXATTR: fetch the file's attributes and copy them to @argp
 * as a struct fsxattr.  Returns the vfs_fileattr_get() error, or the
 * result of copy_fsxattr_to_user().
 */
static int ioctl_fsgetxattr(struct file *file, void __user *argp)
{
        struct fileattr fa = { .fsx_valid = true }; /* hint only */
        int ret = vfs_fileattr_get(file->f_path.dentry, &fa);

        if (ret)
                return ret;

        return copy_fsxattr_to_user(&fa, argp);
}

/*
 * FS_IOC_FSSETXATTR: read a struct fsxattr from @argp and apply it through
 * vfs_fileattr_set(), holding write access to the mount for the duration.
 * Returns the first error met, or the result of vfs_fileattr_set().
 */
static int ioctl_fssetxattr(struct file *file, void __user *argp)
{
        struct mnt_idmap *idmap = file_mnt_idmap(file);
        struct dentry *dentry = file->f_path.dentry;
        struct fileattr fa;
        int ret;

        ret = copy_fsxattr_from_user(&fa, argp);
        if (ret)
                return ret;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        ret = vfs_fileattr_set(idmap, dentry, &fa);
        mnt_drop_write_file(file);

        return ret;
}

/*
 * Copy the superblock's UUID and its length to @argp as a struct fsuuid2.
 * Returns -ENOTTY when the superblock has no UUID (s_uuid_len is zero) and
 * -EFAULT when the copy to userspace fails.
 */
static int ioctl_getfsuuid(struct file *file, void __user *argp)
{
        struct super_block *sb = file_inode(file)->i_sb;
        struct fsuuid2 out = { .len = sb->s_uuid_len, };

        if (sb->s_uuid_len == 0)
                return -ENOTTY;

        memcpy(out.uuid, &sb->s_uuid, sb->s_uuid_len);

        if (copy_to_user(argp, &out, sizeof(out)))
                return -EFAULT;

        return 0;
}

/*
 * Report the superblock's sysfs location to @argp as a struct
 * fs_sysfs_path holding "<fs type name>/<s_sysfs_name>" and its length.
 * Returns -ENOTTY when s_sysfs_name is empty and -EFAULT when the copy to
 * userspace fails.
 */
static int ioctl_get_fs_sysfs_path(struct file *file, void __user *argp)
{
        struct super_block *sb = file_inode(file)->i_sb;
        struct fs_sysfs_path out = {};

        if (strlen(sb->s_sysfs_name) == 0)
                return -ENOTTY;

        out.len = scnprintf(out.name, sizeof(out.name), "%s/%s",
                            sb->s_type->name, sb->s_sysfs_name);

        if (copy_to_user(argp, &out, sizeof(out)))
                return -EFAULT;

        return 0;
}

/*
 * do_vfs_ioctl() is a plain helper shared by sys_ioctl and compat_sys_ioctl.
 * It is not for drivers and not intended to be EXPORT_SYMBOL()'d.
 *
 * Commands listed in the switch are handled here. Any other command is
 * handed to file_ioctl() when the file is a regular file; for all other
 * file types -ENOIOCTLCMD is returned.
 *
 * When you add any new common ioctls to the switches above and below,
 * please ensure they have compatible arguments in compat mode.
 *
 * The LSM mailing list should also be notified of any command additions or
 * changes, as specific LSMs may be affected.
 */
static int do_vfs_ioctl(struct file *filp, unsigned int fd,
                        unsigned int cmd, unsigned long arg)
{
        struct inode *inode = file_inode(filp);
        void __user *argp = (void __user *)arg;

        switch (cmd) {
        /* Set or clear the close-on-exec flag of the descriptor. */
        case FIOCLEX:
                set_close_on_exec(fd, 1);
                return 0;
        case FIONCLEX:
                set_close_on_exec(fd, 0);
                return 0;

        case FIONBIO:
                return ioctl_fionbio(filp, argp);
        case FIOASYNC:
                return ioctl_fioasync(fd, filp, argp);

        case FIOQSIZE: {
                loff_t res;

                /* Only directories, regular files and symlinks are served. */
                if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
                    !S_ISLNK(inode->i_mode))
                        return -ENOTTY;

                res = inode_get_bytes(inode);
                if (copy_to_user(argp, &res, sizeof(res)))
                        return -EFAULT;
                return 0;
        }

        case FIFREEZE:
                return ioctl_fsfreeze(filp);
        case FITHAW:
                return ioctl_fsthaw(filp);

        case FS_IOC_FIEMAP:
                return ioctl_fiemap(filp, argp);

        case FIGETBSZ:
                /* anon_bdev filesystems may not have a block size */
                if (inode->i_sb->s_blocksize)
                        return put_user(inode->i_sb->s_blocksize,
                                        (int __user *)argp);
                return -EINVAL;

        case FICLONE:
                return ioctl_file_clone(filp, arg, 0, 0, 0);
        case FICLONERANGE:
                return ioctl_file_clone_range(filp, argp);
        case FIDEDUPERANGE:
                return ioctl_file_dedupe_range(filp, argp);

        case FIONREAD:
                /* Regular files: size minus current position. */
                if (S_ISREG(inode->i_mode))
                        return put_user(i_size_read(inode) - filp->f_pos,
                                        (int __user *)argp);
                /* Everything else is left to the file's own handler. */
                return vfs_ioctl(filp, cmd, arg);

        case FS_IOC_GETFLAGS:
                return ioctl_getflags(filp, argp);
        case FS_IOC_SETFLAGS:
                return ioctl_setflags(filp, argp);

        case FS_IOC_FSGETXATTR:
                return ioctl_fsgetxattr(filp, argp);
        case FS_IOC_FSSETXATTR:
                return ioctl_fssetxattr(filp, argp);

        case FS_IOC_GETFSUUID:
                return ioctl_getfsuuid(filp, argp);
        case FS_IOC_GETFSSYSFSPATH:
                return ioctl_get_fs_sysfs_path(filp, argp);

        default:
                break;
        }

        /* Not a common command: regular files get one more chance. */
        if (S_ISREG(inode->i_mode))
                return file_ioctl(filp, cmd, argp);

        return -ENOIOCTLCMD;
}

/*
 * ioctl(2) entry point.
 *
 * Resolves @fd, asks the security hook for permission, then tries the
 * common commands in do_vfs_ioctl(). If that answers -ENOIOCTLCMD the
 * request is passed on to vfs_ioctl(). The descriptor reference taken by
 * fdget() is dropped on every path that obtained one.
 *
 * Returns -EBADF when @fd does not resolve to a file, otherwise the error
 * from the security hook or the result of the handler that ran.
 */
SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
        struct fd f = fdget(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = security_file_ioctl(f.file, cmd, arg);
        if (!error) {
                error = do_vfs_ioctl(f.file, fd, cmd, arg);
                if (error == -ENOIOCTLCMD)
                        error = vfs_ioctl(f.file, cmd, arg);
        }

        fdput(f);
        return error;
}

#ifdef CONFIG_COMPAT
/**
 * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation
 * @file: The file to operate on.
 * @cmd: The ioctl command number.
 * @arg: The argument to the ioctl.
 *
 * Rather than being called directly, this function is normally installed
 * in struct file_operations as
 *
 *     .compat_ioctl = compat_ptr_ioctl,
 *
 * It forwards @cmd to the file's ->unlocked_ioctl handler with @arg run
 * through compat_ptr(). On most architectures that passes all arguments
 * along unchanged. The exception is arch/s390, where compat_ptr() clears
 * the top bit of a 32-bit pointer value, so user space pointers to the
 * second 2GB alias the first 2GB, as is the case for native 32-bit s390
 * user space.
 *
 * The compat_ptr_ioctl() function must therefore be used only with ioctl
 * functions that either ignore the argument or pass a pointer to a
 * compatible data type.
 *
 * If any ioctl command handled by fops->unlocked_ioctl passes a plain
 * integer instead of a pointer, or any of the passed data types
 * is incompatible between 32-bit and 64-bit architectures, a proper
 * handler is required instead of compat_ptr_ioctl.
 *
 * Return: -ENOIOCTLCMD if the file has no ->unlocked_ioctl handler,
 * otherwise whatever that handler returns.
 */
long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        if (file->f_op->unlocked_ioctl)
                return file->f_op->unlocked_ioctl(file, cmd,
                                                  (unsigned long)compat_ptr(arg));

        return -ENOIOCTLCMD;
}
EXPORT_SYMBOL(compat_ptr_ioctl);

/*
 * Compat entry point for ioctl(2).
 *
 * Resolves @fd, consults security_file_ioctl_compat(), then dispatches:
 *  - FICLONE is handled directly with the raw integer argument;
 *  - on CONFIG_X86_64 the *_32 preallocation commands go through
 *    compat_ioctl_preallocate();
 *  - FS_IOC32_GETFLAGS/SETFLAGS are rewritten to their native command
 *    numbers and then share the default path;
 *  - everything else is tried in do_vfs_ioctl() with a compat_ptr()
 *    converted argument and, if that answers -ENOIOCTLCMD, in the file's
 *    ->compat_ioctl handler (with the unconverted argument).
 *
 * Returns -EBADF when @fd does not resolve to a file. A final
 * -ENOIOCTLCMD on the default path (no ->compat_ioctl handler, or the
 * handler did not recognise the command) is reported as -ENOTTY.
 */
COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
                       compat_ulong_t, arg)
{
        struct fd f = fdget(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = security_file_ioctl_compat(f.file, cmd, arg);
        if (error)
                goto out;

        switch (cmd) {
        /* FICLONE takes an int argument, so don't use compat_ptr() */
        case FICLONE:
                error = ioctl_file_clone(f.file, arg, 0, 0, 0);
                break;

#if defined(CONFIG_X86_64)
        /* these get messy on amd64 due to alignment differences */
        case FS_IOC_RESVSP_32:
        case FS_IOC_RESVSP64_32:
                error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg));
                break;
        case FS_IOC_UNRESVSP_32:
        case FS_IOC_UNRESVSP64_32:
                error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE,
                                compat_ptr(arg));
                break;
        case FS_IOC_ZERO_RANGE_32:
                error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE,
                                compat_ptr(arg));
                break;
#endif

        /*
         * These access 32-bit values anyway so no further handling is
         * necessary.
         */
        case FS_IOC32_GETFLAGS:
        case FS_IOC32_SETFLAGS:
                /* Translate to the native command number before dispatch. */
                cmd = (cmd == FS_IOC32_GETFLAGS) ?
                        FS_IOC_GETFLAGS : FS_IOC_SETFLAGS;
                fallthrough;
        /*
         * everything else in do_vfs_ioctl() takes either a compatible
         * pointer argument or no argument -- call it with a modified
         * argument.
         */
        default:
                error = do_vfs_ioctl(f.file, fd, cmd,
                                     (unsigned long)compat_ptr(arg));
                if (error != -ENOIOCTLCMD)
                        break;

                /* Not a common command: try the file's compat handler. */
                if (f.file->f_op->compat_ioctl)
                        error = f.file->f_op->compat_ioctl(f.file, cmd, arg);
                /* Still unhandled (or no handler at all): report -ENOTTY. */
                if (error == -ENOIOCTLCMD)
                        error = -ENOTTY;
                break;
        }

 out:
        fdput(f);

        return error;
}
#endif


































































































    4 








    4 


































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_LWTUNNEL_H
#define __NET_LWTUNNEL_H 1

#include <linux/lwtunnel.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <net/route.h>

/*
 * LWTUNNEL_HASH_SIZE is 2^LWTUNNEL_HASH_BITS, i.e. 128.
 * NOTE(review): no user of these two constants is visible in this header.
 */
#define LWTUNNEL_HASH_BITS   7
#define LWTUNNEL_HASH_SIZE   (1 << LWTUNNEL_HASH_BITS)

/*
 * lw tunnel state flags: bits of lwtunnel_state.flags, each tested by the
 * matching lwtunnel_{output,input,xmit}_redirect() helper below.
 */
#define LWTUNNEL_STATE_OUTPUT_REDIRECT        BIT(0)
#define LWTUNNEL_STATE_INPUT_REDIRECT        BIT(1)
#define LWTUNNEL_STATE_XMIT_REDIRECT        BIT(2)

/* LWTUNNEL_XMIT_CONTINUE should be distinguishable from dst_output return
 * values (NET_XMIT_xxx and NETDEV_TX_xxx in linux/netdevice.h) for safety.
 *
 * NOTE(review): presumably these are the result codes of the xmit hook
 * (lwtunnel_xmit() / lwtunnel_encap_ops.xmit) -- confirm against callers.
 */
enum {
        /* Implicitly 0. */
        LWTUNNEL_XMIT_DONE,
        /* Explicit 0x100, chosen to stay clear of the values named above. */
        LWTUNNEL_XMIT_CONTINUE = 0x100,
};


/*
 * Reference-counted tunnel state attached to a dst_entry (see
 * lwtunnel_set_redirect(), which reads dst->lwtstate).
 */
struct lwtunnel_state {
        /* NOTE(review): presumably the encapsulation type id -- confirm. */
        __u16                type;
        /* LWTUNNEL_STATE_*_REDIRECT bits. */
        __u16                flags;
        /* Value reported by lwtunnel_headroom() when it is below the MTU. */
        __u16                headroom;
        /* Incremented by lwtstate_get(), dropped by lwtstate_put(). */
        atomic_t        refcnt;
        /* dst->output as it was before lwtunnel_set_redirect() replaced it. */
        int                (*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb);
        /* dst->input as it was before lwtunnel_set_redirect() replaced it. */
        int                (*orig_input)(struct sk_buff *);
        /* NOTE(review): presumably used for RCU-deferred freeing -- confirm. */
        struct                rcu_head rcu;
        /* Flexible array member; contents are not defined by this header. */
        __u8            data[];
};

/*
 * Callback table for one encapsulation type. Instances are passed to
 * lwtunnel_encap_add_ops() / lwtunnel_encap_del_ops().
 *
 * NOTE(review): the callback names mirror the lwtunnel_*() entry points
 * declared in this header (build_state, fill_encap, get_encap_size,
 * cmp_encap, output, input, xmit); the exact dispatch rules live in the
 * implementation and are not visible here -- confirm before relying on
 * which callbacks may be NULL.
 */
struct lwtunnel_encap_ops {
        int (*build_state)(struct net *net, struct nlattr *encap,
                           unsigned int family, const void *cfg,
                           struct lwtunnel_state **ts,
                           struct netlink_ext_ack *extack);
        void (*destroy_state)(struct lwtunnel_state *lws);
        int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
        int (*input)(struct sk_buff *skb);
        int (*fill_encap)(struct sk_buff *skb,
                          struct lwtunnel_state *lwtstate);
        int (*get_encap_size)(struct lwtunnel_state *lwtstate);
        int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b);
        int (*xmit)(struct sk_buff *skb);

        /* NOTE(review): presumably the module providing these ops -- confirm. */
        struct module *owner;
};

#ifdef CONFIG_LWTUNNEL

/*
 * Static key declared with a default of false.
 * NOTE(review): its users are outside this header; presumably it gates
 * netfilter hooks for lwtunnel traffic -- confirm.
 */
DECLARE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled);

/* Called by lwtstate_put() once the last reference has been dropped. */
void lwtstate_free(struct lwtunnel_state *lws);

/*
 * Take a reference on @lws.
 *
 * A NULL @lws is accepted and returned unchanged; otherwise refcnt is
 * incremented and @lws is returned.
 */
static inline struct lwtunnel_state *
lwtstate_get(struct lwtunnel_state *lws)
{
        if (!lws)
                return NULL;

        atomic_inc(&lws->refcnt);
        return lws;
}

/*
 * Drop a reference on @lws (NULL is ignored). When the count reaches
 * zero the state is released through lwtstate_free().
 */
static inline void lwtstate_put(struct lwtunnel_state *lws)
{
        if (lws && atomic_dec_and_test(&lws->refcnt))
                lwtstate_free(lws);
}

/* True when @lwtstate is non-NULL and has LWTUNNEL_STATE_OUTPUT_REDIRECT set. */
static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate)
{
        return lwtstate && (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT);
}

/* True when @lwtstate is non-NULL and has LWTUNNEL_STATE_INPUT_REDIRECT set. */
static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate)
{
        return lwtstate && (lwtstate->flags & LWTUNNEL_STATE_INPUT_REDIRECT);
}

/* True when @lwtstate is non-NULL and has LWTUNNEL_STATE_XMIT_REDIRECT set. */
static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate)
{
        return lwtstate && (lwtstate->flags & LWTUNNEL_STATE_XMIT_REDIRECT);
}

/*
 * Headroom recorded in @lwtstate, or 0.
 *
 * The stored headroom is returned only when the state redirects xmit or
 * output and the headroom is strictly smaller than @mtu; in every other
 * case (including a NULL @lwtstate) the result is 0.
 */
static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate,
                                             unsigned int mtu)
{
        bool redirects = lwtunnel_xmit_redirect(lwtstate) ||
                         lwtunnel_output_redirect(lwtstate);

        /* A true 'redirects' also guarantees lwtstate is non-NULL. */
        if (!redirects)
                return 0;

        if (lwtstate->headroom >= mtu)
                return 0;

        return lwtstate->headroom;
}

/*
 * Out-of-line lwtunnel API, declared only when CONFIG_LWTUNNEL is enabled.
 * The #else branch of this header supplies inline stubs with the same
 * names for the disabled case (bpf_lwt_push_ip_encap() has no stub).
 */
int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
                           unsigned int num);
int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
                           unsigned int num);
int lwtunnel_valid_encap_type(u16 encap_type,
                              struct netlink_ext_ack *extack);
int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len,
                                   struct netlink_ext_ack *extack);
int lwtunnel_build_state(struct net *net, u16 encap_type,
                         struct nlattr *encap,
                         unsigned int family, const void *cfg,
                         struct lwtunnel_state **lws,
                         struct netlink_ext_ack *extack);
int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate,
                        int encap_attr, int encap_type_attr);
int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate);
struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len);
int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
/* Installed as dst->output / dst->input by lwtunnel_set_redirect(). */
int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int lwtunnel_input(struct sk_buff *skb);
int lwtunnel_xmit(struct sk_buff *skb);
int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
                          bool ingress);

static inline void lwtunnel_set_redirect(struct dst_entry *dst)
{
        if (lwtunnel_output_redirect(dst->lwtstate)) {
                dst->lwtstate->orig_output = dst->output;
                dst->output = lwtunnel_output;
        }
        if (lwtunnel_input_redirect(dst->lwtstate)) {
                dst->lwtstate->orig_input = dst->input;
                dst->input = lwtunnel_input;
        }
}
#else

/*
 * Stubs used when CONFIG_LWTUNNEL is disabled.
 *
 * They keep callers compiling without #ifdefs: reference counting and
 * redirect setup become no-ops, the redirect predicates are always false,
 * headroom and encap size are 0, and the operations that would need real
 * tunnel support fail with -EOPNOTSUPP.
 */
static inline void lwtstate_free(struct lwtunnel_state *lws)
{
}

/* No reference is taken; @lws is handed back unchanged. */
static inline struct lwtunnel_state *
lwtstate_get(struct lwtunnel_state *lws)
{
        return lws;
}

static inline void lwtstate_put(struct lwtunnel_state *lws)
{
}

static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate)
{
        return false;
}

static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate)
{
        return false;
}

static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate)
{
        return false;
}

static inline void lwtunnel_set_redirect(struct dst_entry *dst)
{
}

static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate,
                                             unsigned int mtu)
{
        return 0;
}

static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
                                         unsigned int num)
{
        return -EOPNOTSUPP;

}

static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
                                         unsigned int num)
{
        return -EOPNOTSUPP;
}

/* Always fails, and records the reason in @extack. */
static inline int lwtunnel_valid_encap_type(u16 encap_type,
                                            struct netlink_ext_ack *extack)
{
        NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel");
        return -EOPNOTSUPP;
}
static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len,
                                                 struct netlink_ext_ack *extack)
{
        /* return 0 since we are not walking attr looking for
         * RTA_ENCAP_TYPE attribute on nexthops.
         */
        return 0;
}

static inline int lwtunnel_build_state(struct net *net, u16 encap_type,
                                       struct nlattr *encap,
                                       unsigned int family, const void *cfg,
                                       struct lwtunnel_state **lws,
                                       struct netlink_ext_ack *extack)
{
        return -EOPNOTSUPP;
}

static inline int lwtunnel_fill_encap(struct sk_buff *skb,
                                      struct lwtunnel_state *lwtstate,
                                      int encap_attr, int encap_type_attr)
{
        return 0;
}

static inline int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate)
{
        return 0;
}

/* Never allocates: callers always get NULL. */
static inline struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len)
{
        return NULL;
}

static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a,
                                     struct lwtunnel_state *b)
{
        return 0;
}

static inline int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        return -EOPNOTSUPP;
}

static inline int lwtunnel_input(struct sk_buff *skb)
{
        return -EOPNOTSUPP;
}

static inline int lwtunnel_xmit(struct sk_buff *skb)
{
        return -EOPNOTSUPP;
}

#endif /* CONFIG_LWTUNNEL */

/*
 * Declare the module alias "rtnl-lwt-<encap_type>", where <encap_type> is
 * the stringified macro argument.
 */
#define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" __stringify(encap_type))

#endif /* __NET_LWTUNNEL_H */












































    2 
    1 




















    5 

















    6 



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_POLL_H
#define _LINUX_POLL_H


#include <linux/compiler.h>
#include <linux/ktime.h>
#include <linux/wait.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <uapi/linux/poll.h>
#include <uapi/linux/eventpoll.h>

/*
 * Stack budget for sys_select/sys_poll: at most ~832 bytes of stack space
 * are used before additional memory is allocated.
 *
 * FRONTEND_STACK_ALLOC (256 bytes) is the value behind both
 * SELECT_STACK_ALLOC and POLL_STACK_ALLOC. The remainder,
 * WQUEUES_STACK_ALLOC (832 - 256 = 576 bytes), determines how many
 * struct poll_table_entry fit into poll_wqueues.inline_entries[].
 */
#define MAX_STACK_ALLOC 832
#define FRONTEND_STACK_ALLOC        256
#define SELECT_STACK_ALLOC        FRONTEND_STACK_ALLOC
#define POLL_STACK_ALLOC        FRONTEND_STACK_ALLOC
#define WQUEUES_STACK_ALLOC        (MAX_STACK_ALLOC - FRONTEND_STACK_ALLOC)
#define N_INLINE_POLL_ENTRIES        (WQUEUES_STACK_ALLOC / sizeof(struct poll_table_entry))

/* Mask returned by vfs_poll() for files that have no ->poll method. */
#define DEFAULT_POLLMASK (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)

struct poll_table_struct;

/*
 * structures and helpers for f_op->poll implementations
 *
 * poll_queue_proc is the callback type stored in poll_table._qproc;
 * poll_wait() invokes it with the file, the wait queue head and the
 * poll_table itself.
 */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

/*
 * Do not touch the structure directly, use the access functions
 * poll_does_not_wait() and poll_requested_events() instead.
 */
typedef struct poll_table_struct {
        /* Queueing callback; NULL means poll_wait() does nothing. */
        poll_queue_proc _qproc;
        /* Event mask reported by poll_requested_events(). */
        __poll_t _key;
} poll_table;

/*
 * Hand @wait_address to the queueing callback stored in @p.
 * Nothing happens unless @p, its _qproc callback and @wait_address are
 * all non-NULL.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
        if (!p || !p->_qproc || !wait_address)
                return;

        p->_qproc(filp, wait_address, p);
}

/*
 * Tell whether poll is guaranteed not to wait, i.e. @p is NULL or carries
 * no queueing callback. This is the case if the poll() of another file
 * descriptor in the set got an event, so there is no need for waiting.
 */
static inline bool poll_does_not_wait(const poll_table *p)
{
        return !p || !p->_qproc;
}

/*
 * Report the events the application wants to poll for: the key stored in
 * @p, or all bits set when @p is NULL.
 *
 * This is useful for drivers that need to know whether a DMA transfer has
 * to be started implicitly on poll(). You typically only want to do that
 * if the application is actually polling for POLLIN and/or POLLOUT.
 */
static inline __poll_t poll_requested_events(const poll_table *p)
{
        if (!p)
                return ~(__poll_t)0;

        return p->_key;
}

/*
 * Initialise @pt: install @qproc as the queueing callback and request
 * every event.
 */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
        pt->_key   = ~(__poll_t)0; /* all events enabled */
        pt->_qproc = qproc;
}

/* True when the file's operations provide a ->poll method. */
static inline bool file_can_poll(struct file *file)
{
        return file->f_op->poll != NULL;
}

/*
 * Poll @file through its ->poll method. Files without one report
 * DEFAULT_POLLMASK; that case is marked as unlikely.
 */
static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
        return unlikely(!file->f_op->poll) ? DEFAULT_POLLMASK
                                           : file->f_op->poll(file, pt);
}

/*
 * One slot of poll_wqueues.inline_entries[]; its size also fixes
 * N_INLINE_POLL_ENTRIES.
 *
 * NOTE(review): the users of the fields are outside this header;
 * presumably each entry records one wait-queue registration made while
 * polling @filp -- confirm against the select/poll implementation.
 */
struct poll_table_entry {
        struct file *filp;
        __poll_t key;
        wait_queue_entry_t wait;
        wait_queue_head_t *wait_address;
};

/*
 * Structures and helpers for select/poll syscall
 *
 * poll_wqueues embeds the poll_table and carries
 * N_INLINE_POLL_ENTRIES poll_table_entry slots inline. It is set up and
 * torn down with poll_initwait() / poll_freewait().
 *
 * NOTE(review): presumably @table holds overflow entries once the inline
 * slots are used up and @inline_index counts the used inline slots;
 * the code managing them is not in this header -- confirm.
 */
struct poll_wqueues {
        poll_table pt;
        struct poll_table_page *table;
        struct task_struct *polling_task;
        int triggered;
        int error;
        int inline_index;
        struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};

/* Set up / tear down a struct poll_wqueues (defined outside this header). */
extern void poll_initwait(struct poll_wqueues *pwq);
extern void poll_freewait(struct poll_wqueues *pwq);
extern u64 select_estimate_accuracy(struct timespec64 *tv);

/* (largest positive s64 value / HZ) - 1 */
#define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)

extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
                           fd_set __user *exp, struct timespec64 *end_time);

extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec,
                                   long nsec);

/*
 * __MAP(v, from, to): isolate the bits of @v selected by mask @from and
 * rescale them by the ratio between @to and @from -- multiplied by
 * to/from when from < to, divided by from/to otherwise.
 * NOTE(review): this relocates a flag correctly only if @from and @to are
 * single-bit (power-of-two) constants; the definitions of the POLL and
 * EPOLL flag constants are not visible here -- confirm. The macro is
 * private to the two helpers below and is #undef'd after them.
 */
#define __MAP(v, from, to) \
        (from < to ? (v & from) * (to/from) : (v & from) / (from/to))

/*
 * Convert an EPOLL*-encoded __poll_t mask into the 16-bit POLL* encoding,
 * flag by flag. Only the flags listed in the return expression are
 * carried over.
 */
static inline __u16 mangle_poll(__poll_t val)
{
        __u16 v = (__force __u16)val;
#define M(X) __MAP(v, (__force __u16)EPOLL##X, POLL##X)
        return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) |
                M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) |
                M(HUP) | M(RDHUP) | M(MSG);
#undef M
}

/*
 * Inverse of mangle_poll(): convert a 16-bit POLL*-encoded mask into the
 * EPOLL*-encoded __poll_t, covering the same list of flags.
 */
static inline __poll_t demangle_poll(u16 val)
{
#define M(X) (__force __poll_t)__MAP(val, POLL##X, (__force __u16)EPOLL##X)
        return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) |
                M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) |
                M(HUP) | M(RDHUP) | M(MSG);
#undef M
}
#undef __MAP


#endif /* _LINUX_POLL_H */
















































































































    2 





















    2 
    1 





    2 



    1 
    1 



    1 



    1 





























































    4 





















    2 





    2 
    2 












    3 















    3 







    2 













    4 




    2 



    1 







    3 








    1 













    3 















    2 












    2 































    1 






    4 


    2 


    1 


























    2 










    2 



















    2 





























































































































































































































    3 






    4 

    3 












































    2 


    2 
















    2 











    4 



    4 





    3 



    3 










































    3 

    2 





















































































































    2 


























    3 


    3 


    3 





































































































































































    3 

























    2 





    2 



















    1 




















    2 








    3 
























    3 
    3 












    1 



    3 

















    1 










    2 

    3 
    3 

    3 







































    2 

































































































































































































































































    2 







    2 

















    3 





































































    3 












    2 





    2 













    2 
    2 
    2 


    2 

    3 









    3 























    2 















    3 






























    4 

    5 
    3 


    2 
    2 
    1 











    3 





    3 







    3 




























































    2 

















    1 









    1 









    4 






    4 















    4 











    5 








    3 







    4 



    2 









    2 






    3 

    1 

    3 












    2 





    3 
    2 

    2 
    2 


    2 























































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
// SPDX-License-Identifier: GPL-2.0-only
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>
#include <net/ip.h>

#include "nf_internals.h"

/* Array of CONNTRACK_LOCKS bucket spinlocks.  A hash value selects its lock
 * via "hash % CONNTRACK_LOCKS" (see nf_conntrack_double_lock()).
 */
__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);

/* NOTE(review): presumably guards the expectation tables; none of its users
 * are in this part of the file -- confirm before relying on that.
 */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

/* The conntrack hash table: an array of nulls-list bucket heads that both
 * tuplehash nodes of an entry are linked into.
 */
struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);

/* State carried by the conntrack garbage-collection delayed work.
 *
 * NOTE(review): the worker that uses these fields is not part of this
 * section of the file; the per-field notes below are inferred from the
 * names and must be confirmed against the gc worker.
 */
struct conntrack_gc_work {
        /* the delayed work item itself */
        struct delayed_work        dwork;
        /* presumably the bucket index at which the next scan resumes */
        u32                        next_bucket;
        /* presumably a running average of the timeouts seen while scanning */
        u32                        avg_timeout;
        /* presumably the number of entries seen in the current cycle */
        u32                        count;
        /* presumably the timestamp at which the current scan started */
        u32                        start_time;
        /* presumably set on teardown so the work stops rescheduling itself */
        bool                        exiting;
        /* presumably requests more aggressive eviction -- TODO confirm */
        bool                        early_drop;
};

/* slab cache handle; NOTE(review): presumably backs nf_conn objects -- the
 * cache creation is outside this part of the file.
 */
static __read_mostly struct kmem_cache *nf_conntrack_cachep;
/* Global lock held for the duration of a "lock all buckets" section, see
 * nf_conntrack_all_lock() / nf_conntrack_all_unlock().
 */
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
/* True while a "lock all buckets" section is active; forces
 * nf_conntrack_lock() onto its slow path.
 */
static __read_mostly bool nf_conntrack_locks_all;

/* serialize hash resizes and nf_ct_iterate_cleanup */
static DEFINE_MUTEX(nf_conntrack_mutex);

/* gc scan interval bounds, in jiffies: 60 seconds and 1 second */
#define GC_SCAN_INTERVAL_MAX        (60ul * HZ)
#define GC_SCAN_INTERVAL_MIN        (1ul * HZ)

/* clamp timeouts to this value (TCP unacked) */
#define GC_SCAN_INTERVAL_CLAMP        (300ul * HZ)

/* Initial bias pretending we have 100 entries at the upper bound so we don't
 * wakeup often just because we have three entries with a 1s timeout while still
 * allowing non-idle machines to wakeup more often when needed.
 */
#define GC_SCAN_INITIAL_COUNT        100
#define GC_SCAN_INTERVAL_INIT        GC_SCAN_INTERVAL_MAX

/* 10 milliseconds expressed in jiffies */
#define GC_SCAN_MAX_DURATION        msecs_to_jiffies(10)
/* 64000 / HZ; NOTE(review): presumably a per-run cap on evictions -- confirm */
#define GC_SCAN_EXPIRED_MAX        (64000u / HZ)

/* Chain length limits: MAX_CHAINLEN evaluates to 30u, so MIN + MAX is 80.
 * NOTE(review): how the two are combined is decided by code outside this
 * section.
 */
#define MIN_CHAINLEN        50u
#define MAX_CHAINLEN        (80u - MIN_CHAINLEN)

/* the single instance of the gc work state */
static struct conntrack_gc_work conntrack_gc_work;

/* Acquire one conntrack bucket lock, backing off while a "lock everything"
 * section (nf_conntrack_all_lock()) is active.
 *
 * Fast path: take @lock and return if nf_conntrack_locks_all reads false.
 * Slow path: drop @lock again, queue up behind nf_conntrack_locks_all_lock,
 * re-take @lock and only then release the global lock.
 *
 * Always returns with @lock held.
 */
void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
        /* 1) Acquire the lock */
        spin_lock(lock);

        /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
         * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
         */
        if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
                return;

        /* fast path failed, unlock */
        spin_unlock(lock);

        /* Slow path 1) get global lock */
        spin_lock(&nf_conntrack_locks_all_lock);

        /* Slow path 2) get the lock we want */
        spin_lock(lock);

        /* Slow path 3) release the global lock */
        spin_unlock(&nf_conntrack_locks_all_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

/* Release the bucket lock(s) selected by two hash values.
 *
 * Both hashes are reduced modulo CONNTRACK_LOCKS; when they select the same
 * lock it is released only once.
 */
static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
        const unsigned int first = h1 % CONNTRACK_LOCKS;
        const unsigned int second = h2 % CONNTRACK_LOCKS;

        spin_unlock(&nf_conntrack_locks[first]);
        if (second == first)
                return;

        spin_unlock(&nf_conntrack_locks[second]);
}

/* Take the bucket lock(s) for two hash values, lowest lock index first.
 *
 * Returns true if nf_conntrack_generation changed since @sequence was
 * sampled (e.g. the hash table was resized): the locks have then already
 * been dropped again and the caller must recompute its hashes.  Returns
 * false with the lock(s) held.
 */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
                                     unsigned int h2, unsigned int sequence)
{
        const unsigned int idx1 = h1 % CONNTRACK_LOCKS;
        const unsigned int idx2 = h2 % CONNTRACK_LOCKS;
        const unsigned int lo = idx1 <= idx2 ? idx1 : idx2;
        const unsigned int hi = idx1 <= idx2 ? idx2 : idx1;

        /* Only the first acquisition goes through nf_conntrack_lock(). */
        nf_conntrack_lock(&nf_conntrack_locks[lo]);
        if (lo != hi)
                spin_lock_nested(&nf_conntrack_locks[hi],
                                 SINGLE_DEPTH_NESTING);

        if (!read_seqcount_retry(&nf_conntrack_generation, sequence))
                return false;

        nf_conntrack_double_unlock(idx1, idx2);
        return true;
}

/* Enter a "lock everything" section.
 *
 * Takes nf_conntrack_locks_all_lock, sets nf_conntrack_locks_all, then
 * acquires and immediately releases every bucket lock in turn.  Returns
 * with nf_conntrack_locks_all_lock still held; it is dropped again by
 * nf_conntrack_all_unlock().
 */
static void nf_conntrack_all_lock(void)
        __acquires(&nf_conntrack_locks_all_lock)
{
        int i;

        spin_lock(&nf_conntrack_locks_all_lock);

        /* For nf_conntrack_locks_all, only the latest time when another
         * CPU will see an update is controlled, by the "release" of the
         * spin_lock below.
         * The earliest time is not controlled, and thus KCSAN could detect
         * a race when nf_conntrack_lock() reads the variable.
         * WRITE_ONCE() is used to ensure the compiler will not
         * optimize the write.
         */
        WRITE_ONCE(nf_conntrack_locks_all, true);

        for (i = 0; i < CONNTRACK_LOCKS; i++) {
                spin_lock(&nf_conntrack_locks[i]);

                /* This spin_unlock provides the "release" to ensure that
                 * nf_conntrack_locks_all==true is visible to everyone that
                 * acquired spin_lock(&nf_conntrack_locks[]).
                 */
                spin_unlock(&nf_conntrack_locks[i]);
        }
}

/* Leave the "lock everything" section entered by nf_conntrack_all_lock():
 * clear nf_conntrack_locks_all with release semantics, then drop the
 * global lock.
 */
static void nf_conntrack_all_unlock(void)
        __releases(&nf_conntrack_locks_all_lock)
{
        /* All prior stores must be complete before we clear
         * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
         * might observe the false value but not the entire
         * critical section.
         * It pairs with the smp_load_acquire() in nf_conntrack_lock()
         */
        smp_store_release(&nf_conntrack_locks_all, false);
        spin_unlock(&nf_conntrack_locks_all_lock);
}

/* Number of buckets that scale_hash() maps a raw hash onto. */
unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

/* NOTE(review): presumably the upper bound on tracked connections; it is not
 * referenced in this part of the file -- confirm.
 */
unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
/* Sequence counter sampled before hashes are computed and re-checked in
 * nf_conntrack_double_lock() to detect that the hashes went stale.
 */
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
/* siphash key for bucket hashing; filled lazily by hash_conntrack_raw() */
static siphash_aligned_key_t nf_conntrack_hash_rnd;

static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
                              unsigned int zoneid,
                              const struct net *net)
{
        siphash_key_t key;

        get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));

        key = nf_conntrack_hash_rnd;

        key.key[0] ^= zoneid;
        key.key[1] ^= net_hash_mix(net);

        return siphash((void *)tuple,
                        offsetofend(struct nf_conntrack_tuple, dst.__nfct_hash_offsetend),
                        &key);
}

/* Map a raw 32-bit hash onto a bucket index of the current hash table. */
static u32 scale_hash(u32 hash)
{
        const u32 bucket = reciprocal_scale(hash, nf_conntrack_htable_size);

        return bucket;
}

/* Hash a tuple and scale the result to a table of @size buckets. */
static u32 __hash_conntrack(const struct net *net,
                            const struct nf_conntrack_tuple *tuple,
                            unsigned int zoneid,
                            unsigned int size)
{
        const u32 raw = hash_conntrack_raw(tuple, zoneid, net);

        return reciprocal_scale(raw, size);
}

/* Hash a tuple and scale the result to the current hash table size. */
static u32 hash_conntrack(const struct net *net,
                          const struct nf_conntrack_tuple *tuple,
                          unsigned int zoneid)
{
        const u32 raw = hash_conntrack_raw(tuple, zoneid, net);

        return scale_hash(raw);
}

static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
                                  unsigned int dataoff,
                                  struct nf_conntrack_tuple *tuple)
{        struct {
                __be16 sport;
                __be16 dport;
        } _inet_hdr, *inet_hdr;

        /* Actually only need first 4 bytes to get ports. */
        inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
        if (!inet_hdr)
                return false;

        tuple->src.u.udp.port = inet_hdr->sport;
        tuple->dst.u.udp.port = inet_hdr->dport;
        return true;
}

/* Build the original-direction tuple for a packet.
 *
 * @nhoff:    offset of the network header in @skb
 * @dataoff:  offset of the transport header in @skb
 * @l3num:    NFPROTO_IPV4 or NFPROTO_IPV6; for any other value only
 *            src.l3num is set and true is returned
 * @protonum: layer 4 protocol number
 *
 * Returns false when the addresses or the protocol specific part cannot
 * be extracted from the skb, true otherwise.
 */
static bool
nf_ct_get_tuple(const struct sk_buff *skb,
                unsigned int nhoff,
                unsigned int dataoff,
                u_int16_t l3num,
                u_int8_t protonum,
                struct net *net,
                struct nf_conntrack_tuple *tuple)
{
        __be32 addr_buf[8];
        const __be32 *addrs;
        unsigned int addr_off = nhoff;
        unsigned int addr_len;

        memset(tuple, 0, sizeof(*tuple));
        tuple->src.l3num = l3num;

        /* Locate the saddr/daddr pair, which is contiguous in both headers. */
        if (l3num == NFPROTO_IPV4) {
                addr_off += offsetof(struct iphdr, saddr);
                addr_len = 2 * sizeof(__be32);
        } else if (l3num == NFPROTO_IPV6) {
                addr_off += offsetof(struct ipv6hdr, saddr);
                addr_len = sizeof(addr_buf);
        } else {
                return true;
        }

        addrs = skb_header_pointer(skb, addr_off, addr_len, addr_buf);
        if (!addrs)
                return false;

        /* Only IPv4 and IPv6 can get here. */
        if (l3num == NFPROTO_IPV4) {
                tuple->src.u3.ip = addrs[0];
                tuple->dst.u3.ip = addrs[1];
        } else {
                memcpy(tuple->src.u3.ip6, addrs, sizeof(tuple->src.u3.ip6));
                memcpy(tuple->dst.u3.ip6, addrs + 4, sizeof(tuple->dst.u3.ip6));
        }

        tuple->dst.protonum = protonum;
        tuple->dst.dir = IP_CT_DIR_ORIGINAL;

        switch (protonum) {
#if IS_ENABLED(CONFIG_IPV6)
        case IPPROTO_ICMPV6:
                return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
        case IPPROTO_ICMP:
                return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
#ifdef CONFIG_NF_CT_PROTO_GRE
        case IPPROTO_GRE:
                return gre_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
        /* All port based protocols share one extraction helper. */
        case IPPROTO_TCP:
        case IPPROTO_UDP:
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
        case IPPROTO_UDPLITE:
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
        case IPPROTO_SCTP:
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
        case IPPROTO_DCCP:
#endif
                return nf_ct_get_tuple_ports(skb, dataoff, tuple);
        default:
                break;
        }

        /* Other protocols: the tuple has no layer 4 part. */
        return true;
}

/* Find the transport header of an IPv4 packet.
 *
 * Stores the protocol number in @protonum and returns the offset of the
 * transport header in @skb.  Returns -1 if the IP header cannot be read,
 * if the packet has a non-zero fragment offset, or if the computed offset
 * lies beyond the end of the skb.
 */
static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
                            u_int8_t *protonum)
{
        struct iphdr hdr_copy;
        const struct iphdr *ip4h;
        int l4off;

        ip4h = skb_header_pointer(skb, nhoff, sizeof(hdr_copy), &hdr_copy);
        if (!ip4h)
                return -1;

        /* Conntrack defragments packets, but fragments can still be seen
         * embedded inside ICMP packets.
         */
        if (ip4h->frag_off & htons(IP_OFFSET))
                return -1;

        *protonum = ip4h->protocol;
        l4off = nhoff + (ip4h->ihl << 2);

        /* Reject headers that claim to extend past the packet. */
        if (l4off > skb->len) {
                pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
                         nhoff, ip4h->ihl << 2, skb->len);
                return -1;
        }

        return l4off;
}

#if IS_ENABLED(CONFIG_IPV6)
/* Find the transport header of an IPv6 packet, skipping extension headers.
 *
 * Stores the final next-header value in @protonum and returns the offset
 * of the transport header in @skb.  Returns -1 if the next-header byte
 * cannot be read, if the extension headers cannot be skipped, or if
 * frag_off has any bit above the low three set (presumably a non-first
 * fragment).
 */
static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
                            u8 *protonum)
{
        const unsigned int nexthdr_off = nhoff + offsetof(struct ipv6hdr, nexthdr);
        const unsigned int ext_start = nhoff + sizeof(struct ipv6hdr);
        __be16 frag_off;
        u8 nexthdr;
        int l4off;

        if (skb_copy_bits(skb, nexthdr_off, &nexthdr, sizeof(nexthdr)) != 0) {
                pr_debug("can't get nexthdr\n");
                return -1;
        }

        l4off = ipv6_skip_exthdr(skb, ext_start, &nexthdr, &frag_off);

        /* l4off == skb->len means the packet carries no data, only the
         * IPv6 header and possibly extension headers; such packets are
         * tracked anyway.
         */
        if (l4off < 0 || (frag_off & htons(~0x7)) != 0) {
                pr_debug("can't find proto in pkt\n");
                return -1;
        }

        *protonum = nexthdr;
        return l4off;
}
#endif

/* Dispatch to the per-family transport header lookup.
 *
 * Returns the transport header offset, or -1 on failure.  For a family
 * that is not handled here, *@l4num is set to 0 and -1 is returned.
 */
static int get_l4proto(const struct sk_buff *skb,
                       unsigned int nhoff, u8 pf, u8 *l4num)
{
        if (pf == NFPROTO_IPV4)
                return ipv4_get_l4proto(skb, nhoff, l4num);
#if IS_ENABLED(CONFIG_IPV6)
        if (pf == NFPROTO_IPV6)
                return ipv6_get_l4proto(skb, nhoff, l4num);
#endif

        *l4num = 0;
        return -1;
}

/* Build a tuple for the packet whose network header starts at @nhoff,
 * discovering the layer 4 protocol and its offset first.
 *
 * Returns false if no positive transport header offset is found or if
 * the tuple cannot be extracted.
 */
bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
                       u_int16_t l3num,
                       struct net *net, struct nf_conntrack_tuple *tuple)
{
        u8 l4proto;
        int l4off = get_l4proto(skb, nhoff, l3num, &l4proto);

        if (l4off > 0)
                return nf_ct_get_tuple(skb, nhoff, l4off, l3num, l4proto,
                                       net, tuple);

        return false;
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

/* Build in @inverse the tuple for the opposite direction of @orig.
 *
 * Addresses are swapped and the direction is flipped.  ICMP and ICMPv6
 * delegate the protocol part to their own helpers and return that helper's
 * result; every other protocol swaps the generic layer 4 value and
 * returns true.
 */
bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
                   const struct nf_conntrack_tuple *orig)
{
        memset(inverse, 0, sizeof(*inverse));

        /* Family and protocol carry over, the direction is negated. */
        inverse->src.l3num = orig->src.l3num;
        inverse->dst.protonum = orig->dst.protonum;
        inverse->dst.dir = !orig->dst.dir;

        /* Swap addresses; unknown families keep the zeroed addresses. */
        if (orig->src.l3num == NFPROTO_IPV4) {
                inverse->src.u3.ip = orig->dst.u3.ip;
                inverse->dst.u3.ip = orig->src.u3.ip;
        } else if (orig->src.l3num == NFPROTO_IPV6) {
                inverse->src.u3.in6 = orig->dst.u3.in6;
                inverse->dst.u3.in6 = orig->src.u3.in6;
        }

        switch (orig->dst.protonum) {
        case IPPROTO_ICMP:
                return nf_conntrack_invert_icmp_tuple(inverse, orig);
#if IS_ENABLED(CONFIG_IPV6)
        case IPPROTO_ICMPV6:
                return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
#endif
        default:
                break;
        }

        inverse->src.u.all = orig->dst.u.all;
        inverse->dst.u.all = orig->src.u.all;

        return true;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

/* Generate a almost-unique pseudo-id for a given conntrack.
 *
 * intentionally doesn't re-use any of the seeds used for hash
 * table location, we assume id gets exposed to userspace.
 *
 * Following nf_conn items do not change throughout lifetime
 * of the nf_conn:
 *
 * 1. nf_conn address
 * 2. nf_conn->master address (normally NULL)
 * 3. the associated net namespace
 * 4. the original direction tuple
 */
u32 nf_ct_get_id(const struct nf_conn *ct)
{
        static siphash_aligned_key_t ct_id_seed;
        unsigned long a, b, c, d;

        net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));

        a = (unsigned long)ct;
        b = (unsigned long)ct->master;
        c = (unsigned long)nf_ct_net(ct);
        d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                                   sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
                                   &ct_id_seed);
#ifdef CONFIG_64BIT
        return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
#else
        return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
#endif
}
EXPORT_SYMBOL_GPL(nf_ct_get_id);

static void
clean_from_lists(struct nf_conn *ct)
{
        hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
        hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

        /* Destroy all pending expectations */
        nf_ct_remove_expectations(ct);
}

/* Round @len up to the next multiple of (NFCT_INFOMASK + 1). */
#define NFCT_ALIGN(len)        (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)

/* Allocate a zeroed conntrack template for @net and @zone.
 *
 * When kmalloc's minimum alignment does not already cover NFCT_INFOMASK,
 * NFCT_INFOMASK extra bytes are allocated and the object is placed at the
 * first suitably aligned address inside the buffer.  The distance from the
 * start of the allocation is recorded in proto.tmpl_padto so that
 * nf_ct_tmpl_free() can recover the pointer to hand back to kfree().
 *
 * The template is returned with IPS_TEMPLATE as its status and a reference
 * count of 1.  Returns NULL if the allocation fails.
 *
 * Released via nf_ct_destroy()
 */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
                                 const struct nf_conntrack_zone *zone,
                                 gfp_t flags)
{
        struct nf_conn *tmpl, *p;

        if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
                p = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
                if (!p)
                        return NULL;

                tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
                /* tmpl_padto stays 0 (from kzalloc) when p is already aligned */
                if (tmpl != p)
                        tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
        } else {
                tmpl = kzalloc(sizeof(*tmpl), flags);
                if (!tmpl)
                        return NULL;
        }

        tmpl->status = IPS_TEMPLATE;
        write_pnet(&tmpl->ct_net, net);
        nf_ct_zone_add(tmpl, zone);
        refcount_set(&tmpl->ct_general.use, 1);

        return tmpl;
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);

/* Free a conntrack template together with its extension area.
 *
 * When the alignment workaround applies, the pointer passed to kfree() is
 * the template address moved back by the proto.tmpl_padto bytes recorded
 * in the template.
 */
void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
        void *base = tmpl;

        kfree(tmpl->ext);

        if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
                base = (char *)tmpl - tmpl->proto.tmpl_padto;

        kfree(base);
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);

/* Destroy the GRE keymap of the master of @ct, if it has a master.
 * Does nothing when GRE tracking is not built in.
 */
static void destroy_gre_conntrack(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CT_PROTO_GRE
        if (ct->master)
                nf_ct_gre_keymap_destroy(ct->master);
#endif
}

/* Final teardown of a conntrack object.
 *
 * Warns if the reference count is not zero.  Templates are simply freed
 * via nf_ct_tmpl_free().  For regular entries: GRE entries get their
 * keymap handling, remaining expectations are removed, the reference on
 * the master (if any) is dropped and the object is handed to
 * nf_conntrack_free().
 */
void nf_ct_destroy(struct nf_conntrack *nfct)
{
        struct nf_conn *ct = (struct nf_conn *)nfct;

        WARN_ON(refcount_read(&nfct->use) != 0);

        if (unlikely(nf_ct_is_template(ct))) {
                nf_ct_tmpl_free(ct);
                return;
        }

        if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
                destroy_gre_conntrack(ct);

        /* Expectations will have been removed in clean_from_lists,
         * except TFTP can create an expectation on the first packet,
         * before connection is in the list, so we need to clean here,
         * too.
         */
        nf_ct_remove_expectations(ct);

        if (ct->master)
                nf_ct_put(ct->master);

        nf_conntrack_free(ct);
}
EXPORT_SYMBOL(nf_ct_destroy);

/* Unlink @ct from the hash table under both of its bucket locks.
 *
 * The callers in this file disable bottom halves around this function.
 */
static void __nf_ct_delete_from_lists(struct nf_conn *ct)
{
        struct net *net = nf_ct_net(ct);
        unsigned int hash, reply_hash;
        unsigned int sequence;

        /* Recompute both hashes until the locks were taken without
         * nf_conntrack_generation changing in between; on a change
         * nf_conntrack_double_lock() has already dropped the locks.
         */
        do {
                sequence = read_seqcount_begin(&nf_conntrack_generation);
                hash = hash_conntrack(net,
                                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                                      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
                reply_hash = hash_conntrack(net,
                                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
                                           nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
        } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

        clean_from_lists(ct);
        nf_conntrack_double_unlock(hash, reply_hash);
}

/* Run the helper destroy hook for @ct, then unlink it from the hash table
 * with bottom halves disabled.
 */
static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
        nf_ct_helper_destroy(ct);
        local_bh_disable();

        __nf_ct_delete_from_lists(ct);

        local_bh_enable();
}

/* Queue @ct on the per-netns ecache dying list.
 *
 * The ORIGINAL-direction hash node is reused as the list linkage, so the
 * entry must already be unlinked from the hash table.  The list is
 * modified under ecache.dying_lock.  Compiles to nothing without
 * CONFIG_NF_CONNTRACK_EVENTS.
 */
static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
        struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));

        spin_lock(&cnet->ecache.dying_lock);
        hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
                                 &cnet->ecache.dying_list);
        spin_unlock(&cnet->ecache.dying_lock);
#endif
}

/* Mark @ct as dying, report its destruction and remove it from the table.
 *
 * @portid and @report are passed through to the destroy event report.
 *
 * Returns false if the entry was already marked dying, or if the destroy
 * event could not be delivered; in the latter case the entry is unlinked
 * and parked on the ecache dying list for redelivery and the reference is
 * not dropped here.  Returns true once the entry has been unlinked and the
 * caller-visible reference dropped via nf_ct_put().
 */
bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
        struct nf_conn_tstamp *tstamp;
        struct net *net;

        /* only the first caller to set the dying bit proceeds */
        if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
                return false;

        tstamp = nf_conn_tstamp_find(ct);
        if (tstamp) {
                /* remaining lifetime; negative once the timeout has passed */
                s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;

                tstamp->stop = ktime_get_real_ns();
                /* already expired: move the stop time back by the overshoot */
                if (timeout < 0)
                        tstamp->stop -= jiffies_to_nsecs(-timeout);
        }

        if (nf_conntrack_event_report(IPCT_DESTROY, ct,
                                    portid, report) < 0) {
                /* destroy event was not delivered. nf_ct_put will
                 * be done by event cache worker on redelivery.
                 */
                nf_ct_helper_destroy(ct);
                local_bh_disable();
                __nf_ct_delete_from_lists(ct);
                nf_ct_add_to_ecache_list(ct);
                local_bh_enable();

                nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
                return false;
        }

        net = nf_ct_net(ct);
        if (nf_conntrack_ecache_dwork_pending(net))
                nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT);
        nf_ct_delete_from_lists(ct);
        nf_ct_put(ct);
        return true;
}
EXPORT_SYMBOL_GPL(nf_ct_delete);

static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
                const struct nf_conntrack_tuple *tuple,
                const struct nf_conntrack_zone *zone,
                const struct net *net)
{
        struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

        /* A conntrack can be recreated with the equal tuple,
         * so we need to check that the conntrack is confirmed
         */
        return nf_ct_tuple_equal(tuple, &h->tuple) &&
               nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
               nf_ct_is_confirmed(ct) &&
               net_eq(net, nf_ct_net(ct));
}

static inline bool
nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
{
        return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                                 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
               nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
                                 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
               nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
               nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
               net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
}

/* Opportunistically reap an expired entry found during a lookup.
 *
 * A temporary reference is taken first; entries whose refcount has
 * already dropped to zero are left alone.
 *
 * caller must hold rcu readlock and none of the nf_conntrack_locks
 */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
        if (!refcount_inc_not_zero(&ct->ct_general.use))
                return;

        /* load ->status after refcount increase */
        smp_acquire__after_ctrl_dep();

        /* re-check now that the reference is held */
        if (nf_ct_should_gc(ct))
                nf_ct_kill(ct);

        nf_ct_put(ct);
}

/*
 * Lockless lookup of @tuple in the bucket selected by the raw @hash.
 * Expired entries met on the way are handed to nf_ct_gc_expired() and
 * skipped.  Returns the matching hash entry or NULL.
 *
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 */
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
                      const struct nf_conntrack_tuple *tuple, u32 hash)
{
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_head *ct_hash;
        struct hlist_nulls_node *n;
        unsigned int bucket, hsize;

begin:
        /* fetch table and size again on every (re)start of the walk */
        nf_conntrack_get_ht(&ct_hash, &hsize);
        bucket = reciprocal_scale(hash, hsize);

        hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
                struct nf_conn *ct;

                ct = nf_ct_tuplehash_to_ctrack(h);
                if (nf_ct_is_expired(ct)) {
                        nf_ct_gc_expired(ct);
                        continue;
                }

                if (nf_ct_key_equal(h, tuple, zone, net))
                        return h;
        }
        /*
         * if the nulls value we got at the end of this lookup is
         * not the expected one, we must restart lookup.
         * We probably met an item that was moved to another chain.
         */
        if (get_nulls_value(n) != bucket) {
                NF_CT_STAT_INC_ATOMIC(net, search_restart);
                goto begin;
        }

        return NULL;
}

/* Find a connection corresponding to a tuple.
 *
 * On success the hash entry is returned with a reference held on its
 * conntrack.  Returns NULL if nothing matches, if the candidate's
 * refcount had already reached zero, or if the candidate no longer
 * matches once the reference has been taken.
 */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
                        const struct nf_conntrack_tuple *tuple, u32 hash)
{
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;

        h = ____nf_conntrack_find(net, zone, tuple, hash);
        if (h) {
                /* We have a candidate that matches the tuple we're interested
                 * in, try to obtain a reference and re-check tuple
                 */
                ct = nf_ct_tuplehash_to_ctrack(h);
                if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
                        /* re-check key after refcount */
                        smp_acquire__after_ctrl_dep();

                        if (likely(nf_ct_key_equal(h, tuple, zone, net)))
                                return h;

                        /* TYPESAFE_BY_RCU recycled the candidate */
                        nf_ct_put(ct);
                }

                /* candidate could not be pinned or no longer matches */
                h = NULL;
        }

        return h;
}

/* Look up @tuple in @net/@zone and return its tuplehash with a reference
 * held on the owning conntrack, or NULL if there is no match.
 *
 * The lookup is first done with the hash derived from the zone id of the
 * ORIGINAL direction.  If that fails and the zone id of the REPLY direction
 * differs, a second lookup is done with the hash derived from that id.
 */
struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
                      const struct nf_conntrack_tuple *tuple)
{
        unsigned int orig_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
        struct nf_conntrack_tuple_hash *thash;
        unsigned int reply_id;

        rcu_read_lock();

        thash = __nf_conntrack_find_get(net, zone, tuple,
                                        hash_conntrack_raw(tuple, orig_id, net));
        if (!thash) {
                reply_id = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
                if (reply_id != orig_id)
                        thash = __nf_conntrack_find_get(net, zone, tuple,
                                                        hash_conntrack_raw(tuple, reply_id, net));
        }

        rcu_read_unlock();
        return thash;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

static void __nf_conntrack_hash_insert(struct nf_conn *ct,
                                       unsigned int hash,
                                       unsigned int reply_hash)
{
        hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
                           &nf_conntrack_hash[hash]);
        hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
                           &nf_conntrack_hash[reply_hash]);
}

static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext)
{
        /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions
         * may contain stale pointers to e.g. helper that has been removed.
         *
         * The helper can't clear this because the nf_conn object isn't in
         * any hash and synchronize_rcu() isn't enough because associated skb
         * might sit in a queue.
         */
        return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid);
}

/* Final generation-id check for an extension area.
 *
 * Returns false if @ext exists and its gen_id differs from the current
 * nf_conntrack_ext_genid.  Otherwise returns true; when @ext exists its
 * gen_id is reset to 0 first.
 */
static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext)
{
        if (ext) {
                if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid))
                        return false;

                /* The conntrack is in the table now, so
                 * nf_ct_iterate_cleanup() will find it.  A gen_id of 0
                 * disables the nf_ct_ext_find() id check.
                 */
                WRITE_ONCE(ext->gen_id, 0);
        }

        return true;
}

/*
 * Insert @ct into the conntrack table unless an entry with an identical
 * ORIGINAL or REPLY tuple (same zone and netns) is already present.
 *
 * Both buckets are locked with nf_conntrack_double_lock(); the hashes are
 * recomputed and the locking retried for as long as that call returns
 * non-zero.  Bottom halves are disabled for the duration.
 *
 * Returns:
 *   0        - @ct was linked in both directions, refcount set to 2
 *   -EEXIST  - an equal tuple was found in one of the two chains
 *   -ENOSPC  - one of the chains exceeded the randomised length limit
 *   -EAGAIN  - the extension generation id check failed (before or after
 *              taking the locks)
 */
int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
        const struct nf_conntrack_zone *zone;
        struct net *net = nf_ct_net(ct);
        unsigned int hash, reply_hash;
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_node *n;
        unsigned int max_chainlen;
        unsigned int chainlen = 0;
        unsigned int sequence;
        int err = -EEXIST;

        zone = nf_ct_zone(ct);

        /* Cheap early rejection, done without holding any lock. */
        if (!nf_ct_ext_valid_pre(ct->ext))
                return -EAGAIN;

        local_bh_disable();
        do {
                sequence = read_seqcount_begin(&nf_conntrack_generation);
                hash = hash_conntrack(net,
                                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                                      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
                reply_hash = hash_conntrack(net,
                                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
                                           nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
        } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

        /* The chain length limit is randomised per insertion attempt. */
        max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);

        /* See if there's one in the list already, including reverse */
        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                                    zone, net))
                        goto out;

                if (chainlen++ > max_chainlen)
                        goto chaintoolong;
        }

        /* The limit applies to each chain separately. */
        chainlen = 0;

        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
                                    zone, net))
                        goto out;
                if (chainlen++ > max_chainlen)
                        goto chaintoolong;
        }

        /* If genid has changed, we can't insert anymore because ct
         * extensions could have stale pointers and nf_ct_iterate_destroy
         * might have completed its table scan already.
         *
         * Increment of the ext genid right after this check is fine:
         * nf_ct_iterate_destroy blocks until locks are released.
         */
        if (!nf_ct_ext_valid_post(ct->ext)) {
                err = -EAGAIN;
                goto out;
        }

        smp_wmb();
        /* The caller holds a reference to this object */
        refcount_set(&ct->ct_general.use, 2);
        __nf_conntrack_hash_insert(ct, hash, reply_hash);
        nf_conntrack_double_unlock(hash, reply_hash);
        NF_CT_STAT_INC(net, insert);
        local_bh_enable();

        return 0;
chaintoolong:
        NF_CT_STAT_INC(net, chaintoolong);
        err = -ENOSPC;
out:
        /* Common failure exit: err is -EEXIST unless overridden above. */
        nf_conntrack_double_unlock(hash, reply_hash);
        local_bh_enable();
        return err;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);

/* Add @packets and @bytes to the accounting counters of @ct for direction
 * @dir.  Does nothing when @ct has no accounting extension.
 */
void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
                    unsigned int bytes)
{
        struct nf_conn_acct *acct = nf_conn_acct_find(ct);
        struct nf_conn_counter *dir_counter;

        if (!acct)
                return;

        dir_counter = &acct->counter[dir];
        atomic64_add(packets, &dir_counter->packets);
        atomic64_add(bytes, &dir_counter->bytes);
}
EXPORT_SYMBOL_GPL(nf_ct_acct_add);

/* Credit @ct with the byte count that @loser_ct accumulated in the
 * direction given by @ctinfo.  Does nothing when @loser_ct carries no
 * accounting extension.
 */
static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                             const struct nf_conn *loser_ct)
{
        struct nf_conn_acct *loser_acct;
        unsigned int bytes;

        loser_acct = nf_conn_acct_find(loser_ct);
        if (!loser_acct)
                return;

        /* u32 should be fine since we must have seen one packet. */
        bytes = atomic64_read(&loser_acct->counter[CTINFO2DIR(ctinfo)].bytes);
        nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
}

/* Common preparation done right before a conntrack is linked into the
 * table: take one more reference on @ct and, if the timestamp extension is
 * present, record the current wall-clock time as the start time.
 */
static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
{
        struct nf_conn_tstamp *ts_ext;

        refcount_inc(&ct->ct_general.use);

        /* The timestamp extension only exists when the feature is enabled. */
        ts_ext = nf_conn_tstamp_find(ct);
        if (!ts_ext)
                return;

        ts_ext->start = ktime_get_real_ns();
}

/* Try to resolve a clash by attaching @skb to the entry that is already in
 * the table (@h) and discarding the skb's own, unconfirmed conntrack.
 *
 * This is only done when the existing entry is not dying and either has no
 * NAT-done bit set or nf_ct_match() reports it as matching the loser.
 * Returns NF_ACCEPT when the skb was re-attached, NF_DROP otherwise.
 *
 * caller must hold locks to prevent concurrent changes
 */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
                                 struct nf_conntrack_tuple_hash *h)
{
        /* The entry already in the hashes, i.e. the one that won the race. */
        struct nf_conn *winner = nf_ct_tuplehash_to_ctrack(h);
        enum ip_conntrack_info ctinfo;
        struct nf_conn *loser_ct;
        struct net *net;

        loser_ct = nf_ct_get(skb, &ctinfo);

        if (nf_ct_is_dying(winner))
                return NF_DROP;

        if ((winner->status & IPS_NAT_DONE_MASK) != 0 &&
            !nf_ct_match(winner, loser_ct))
                return NF_DROP;

        net = nf_ct_net(winner);

        /* Reference for the skb, taken before the loser's one is dropped. */
        nf_conntrack_get(&winner->ct_general);

        nf_ct_acct_merge(winner, ctinfo, loser_ct);
        nf_ct_put(loser_ct);
        nf_ct_set(skb, winner, ctinfo);

        NF_CT_STAT_INC(net, clash_resolve);
        return NF_ACCEPT;
}

/**
 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
 *
 * @skb: skb that causes the collision
 * @repl_idx: hash slot for reply direction
 *
 * Called when origin or reply direction had a clash.
 * The skb can be handled without packet drop provided the reply direction
 * is unique or the existing entry has the identical tuple in both
 * directions.
 *
 * If the reply chain already holds an entry equal to the loser's reply
 * tuple, the decision is delegated to __nf_ct_resolve_clash().  Otherwise
 * the loser is given a one second timeout, flagged with IPS_FIXED_TIMEOUT
 * and IPS_NAT_CLASH, and linked into the table in the REPLY direction only.
 *
 * Caller must hold conntrack table locks to prevent concurrent updates.
 *
 * Returns NF_DROP if the clash could not be handled.
 */
static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
{
        struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
        const struct nf_conntrack_zone *zone;
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_node *n;
        struct net *net;

        zone = nf_ct_zone(loser_ct);
        net = nf_ct_net(loser_ct);

        /* Reply direction must never result in a clash, unless both origin
         * and reply tuples are identical.
         */
        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
                if (nf_ct_key_equal(h,
                                    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
                                    zone, net))
                        return __nf_ct_resolve_clash(skb, h);
        }

        /* We want the clashing entry to go away real soon: 1 second timeout. */
        WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);

        /* IPS_NAT_CLASH removes the entry automatically on the first
         * reply.  Also prevents UDP tracker from moving the entry to
         * ASSURED state, i.e. the entry can always be evicted under
         * pressure.
         */
        loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;

        /* Takes the extra reference and sets the start timestamp. */
        __nf_conntrack_insert_prepare(loser_ct);

        /* fake add for ORIGINAL dir: we want lookups to only find the entry
         * already in the table.  This also hides the clashing entry from
         * ctnetlink iteration, i.e. conntrack -L won't show them.
         */
        hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

        hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
                                 &nf_conntrack_hash[repl_idx]);

        NF_CT_STAT_INC(net, clash_resolve);
        return NF_ACCEPT;
}

/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted to the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the associated, unconfirmed conntrack) has
 * to be dropped.  In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop.
 *
 * If @skb has no NAT transformation or if the colliding entries are
 * exactly the same, only the to-be-confirmed conntrack entry is discarded
 * and @skb is associated with the conntrack entry already in the table.
 *
 * Failing that, the new, unconfirmed conntrack is still added to the table
 * provided that the collision only occurs in the ORIGINAL direction.
 * The new entry will be added only in the non-clashing REPLY direction,
 * so packets in the ORIGINAL direction will continue to match the existing
 * entry.  The new entry will also have a fixed timeout so it expires --
 * due to the collision, it will only see reply traffic.
 *
 * Clash resolution is attempted only if the l4 protocol tracker of the
 * existing entry sets ->allow_clash.
 *
 * Returns NF_DROP if the clash could not be resolved.
 */
static __cold noinline int
nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
                    u32 reply_hash)
{
        /* The entry already in the hashes, i.e. the one that won the race. */
        struct nf_conn *winner = nf_ct_tuplehash_to_ctrack(h);
        const struct nf_conntrack_l4proto *l4proto;
        enum ip_conntrack_info ctinfo;
        struct nf_conn *loser_ct;
        struct net *net;

        /* Fetch the netns up front, before any resolution attempt runs. */
        loser_ct = nf_ct_get(skb, &ctinfo);
        net = nf_ct_net(loser_ct);

        l4proto = nf_ct_l4proto_find(nf_ct_protonum(winner));
        if (l4proto->allow_clash) {
                /* First choice: reuse the entry that is already there. */
                if (__nf_ct_resolve_clash(skb, h) == NF_ACCEPT)
                        return NF_ACCEPT;

                /* Second choice: insert the loser in REPLY direction only. */
                if (nf_ct_resolve_clash_harder(skb, reply_hash) == NF_ACCEPT)
                        return NF_ACCEPT;
        }

        NF_CT_STAT_INC(net, drop);
        NF_CT_STAT_INC(net, insert_failed);
        return NF_DROP;
}

/* Confirm a connection given skb; places it in hash table.
 *
 * Packets whose conntrack info is not in the ORIGINAL direction are
 * accepted without touching the table.  Otherwise both buckets are locked
 * (bottom halves disabled), both chains are searched for an entry with an
 * identical tuple, and the conntrack attached to @skb is inserted if none
 * is found.
 *
 * Returns NF_ACCEPT when the conntrack was inserted or a clash was
 * resolved by nf_ct_resolve_clash().  Returns NF_DROP when the conntrack
 * was already confirmed, its extension generation id is stale, it is dying,
 * a hash chain is too long, or the clash could not be resolved.
 */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
        unsigned int chainlen = 0, sequence, max_chainlen;
        const struct nf_conntrack_zone *zone;
        unsigned int hash, reply_hash;
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
        struct nf_conn_help *help;
        struct hlist_nulls_node *n;
        enum ip_conntrack_info ctinfo;
        struct net *net;
        int ret = NF_DROP;

        ct = nf_ct_get(skb, &ctinfo);
        net = nf_ct_net(ct);

        /* ipt_REJECT uses nf_conntrack_attach to attach related
           ICMP/TCP RST packets in other direction.  Actual packet
           which created connection will be IP_CT_NEW or for an
           expected connection, IP_CT_RELATED. */
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return NF_ACCEPT;

        zone = nf_ct_zone(ct);
        local_bh_disable();

        do {
                sequence = read_seqcount_begin(&nf_conntrack_generation);
                /* reuse the hash saved before: __nf_conntrack_alloc() stored
                 * it in the (otherwise unused) REPLY hnnode.pprev slot.
                 */
                hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
                hash = scale_hash(hash);
                reply_hash = hash_conntrack(net,
                                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
                                           nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
        } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

        /* We're not in hash table, and we refuse to set up related
         * connections for unconfirmed conns.  But packet copies and
         * REJECT will give spurious warnings here.
         */

        /* Another skb with the same unconfirmed conntrack may
         * win the race. This may happen for bridge(br_flood)
         * or broadcast/multicast packets do skb_clone with
         * unconfirmed conntrack.
         */
        if (unlikely(nf_ct_is_confirmed(ct))) {
                WARN_ON_ONCE(1);
                nf_conntrack_double_unlock(hash, reply_hash);
                local_bh_enable();
                return NF_DROP;
        }

        /* Stale extension generation id: ret is still NF_DROP here. */
        if (!nf_ct_ext_valid_pre(ct->ext)) {
                NF_CT_STAT_INC(net, insert_failed);
                goto dying;
        }

        /* We have to check the DYING flag after unlink to prevent
         * a race against nf_ct_get_next_corpse() possibly called from
         * user context, else we insert an already 'dead' hash, blocking
         * further use of that particular connection -JM.
         */
        ct->status |= IPS_CONFIRMED;

        if (unlikely(nf_ct_is_dying(ct))) {
                NF_CT_STAT_INC(net, insert_failed);
                goto dying;
        }

        /* The chain length limit is randomised per confirmation. */
        max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost race. */
        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                                    zone, net))
                        goto out;
                if (chainlen++ > max_chainlen)
                        goto chaintoolong;
        }

        chainlen = 0;
        hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
                if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
                                    zone, net))
                        goto out;
                if (chainlen++ > max_chainlen) {
                        /* This label is also the jump target of the
                         * ORIGINAL-direction loop above.
                         */
chaintoolong:
                        NF_CT_STAT_INC(net, chaintoolong);
                        NF_CT_STAT_INC(net, insert_failed);
                        ret = NF_DROP;
                        goto dying;
                }
        }

        /* Timer relative to confirmation time, not original
           setting time, otherwise we'd get timer wrap in
           weird delay cases. */
        ct->timeout += nfct_time_stamp;

        __nf_conntrack_insert_prepare(ct);

        /* Since the lookup is lockless, hash insertion must be done after
         * starting the timer and setting the CONFIRMED bit. The RCU barriers
         * guarantee that no other CPU can find the conntrack before the above
         * stores are visible.
         */
        __nf_conntrack_hash_insert(ct, hash, reply_hash);
        nf_conntrack_double_unlock(hash, reply_hash);
        local_bh_enable();

        /* ext area is still valid (rcu read lock is held,
         * but will go out of scope soon, we need to remove
         * this conntrack again.
         */
        if (!nf_ct_ext_valid_post(ct->ext)) {
                nf_ct_kill(ct);
                NF_CT_STAT_INC_ATOMIC(net, drop);
                return NF_DROP;
        }

        /* Queue the events describing the newly inserted entry. */
        help = nfct_help(ct);
        if (help && help->helper)
                nf_conntrack_event_cache(IPCT_HELPER, ct);

        nf_conntrack_event_cache(master_ct(ct) ?
                                 IPCT_RELATED : IPCT_NEW, ct);
        return NF_ACCEPT;

out:
        /* An identical tuple is already in the table: h is that entry. */
        ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
        nf_conntrack_double_unlock(hash, reply_hash);
        local_bh_enable();
        return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

/* Returns true if a connection corresponds to the tuple (required
 * for NAT).
 *
 * @tuple:             tuple to look for
 * @ignored_conntrack: conntrack that is skipped during the search; it also
 *                     supplies the netns and zone used for the lookup
 *
 * The bucket is walked under rcu_read_lock().  Expired entries met on the
 * way are handed to nf_ct_gc_expired() and skipped.  Returns 1 if another
 * conntrack already uses @tuple, 0 otherwise.
 */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
                         const struct nf_conn *ignored_conntrack)
{
        struct net *net = nf_ct_net(ignored_conntrack);
        const struct nf_conntrack_zone *zone;
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_head *ct_hash;
        unsigned int hash, hsize;
        struct hlist_nulls_node *n;
        struct nf_conn *ct;

        zone = nf_ct_zone(ignored_conntrack);

        rcu_read_lock();
 begin:
        /* Table pointer and size are re-read on every (re)start. */
        nf_conntrack_get_ht(&ct_hash, &hsize);
        hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);

        hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
                ct = nf_ct_tuplehash_to_ctrack(h);

                if (ct == ignored_conntrack)
                        continue;

                if (nf_ct_is_expired(ct)) {
                        nf_ct_gc_expired(ct);
                        continue;
                }

                if (nf_ct_key_equal(h, tuple, zone, net)) {
                        /* Tuple is taken already, so caller will need to find
                         * a new source port to use.
                         *
                         * Only exception:
                         * If the *original tuples* are identical, then both
                         * conntracks refer to the same flow.
                         * This is a rare situation, it can occur e.g. when
                         * more than one UDP packet is sent from same socket
                         * in different threads.
                         *
                         * Let nf_ct_resolve_clash() deal with this later.
                         */
                        if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                                              &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
                                              nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
                                continue;

                        NF_CT_STAT_INC_ATOMIC(net, found);
                        rcu_read_unlock();
                        return 1;
                }
        }

        /* The walk ended on a nulls marker that does not belong to this
         * bucket: restart the lookup from the beginning.
         */
        if (get_nulls_value(n) != hash) {
                NF_CT_STAT_INC_ATOMIC(net, search_restart);
                goto begin;
        }

        rcu_read_unlock();

        return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

/* Number of consecutive hash buckets early_drop() scans for a victim. */
#define NF_CT_EVICTION_RANGE        8

/* Walk one hash chain and delete entries of @net that are neither assured
 * nor dying.  Expired entries met on the way are handed to
 * nf_ct_gc_expired().  Returns the number of entries deleted through
 * nf_ct_delete().  early_drop() calls this under rcu_read_lock().
 *
 * There's a small race here where we may free a just-assured
 * connection.  Too bad: we're in trouble anyway.
 */
static unsigned int early_drop_list(struct net *net,
                                    struct hlist_nulls_head *head)
{
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_node *n;
        unsigned int drops = 0;
        struct nf_conn *tmp;

        hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
                tmp = nf_ct_tuplehash_to_ctrack(h);

                if (nf_ct_is_expired(tmp)) {
                        nf_ct_gc_expired(tmp);
                        continue;
                }

                /* Unlocked pre-filter; the netns is checked again below
                 * once a reference is held.
                 */
                if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
                    !net_eq(nf_ct_net(tmp), net) ||
                    nf_ct_is_dying(tmp))
                        continue;

                if (!refcount_inc_not_zero(&tmp->ct_general.use))
                        continue;

                /* load ->ct_net and ->status after refcount increase */
                smp_acquire__after_ctrl_dep();

                /* kill only if still in same netns -- might have moved due to
                 * SLAB_TYPESAFE_BY_RCU rules.
                 *
                 * We steal the timer reference.  If that fails timer has
                 * already fired or someone else deleted it. Just drop ref
                 * and move to next entry.
                 */
                if (net_eq(nf_ct_net(tmp), net) &&
                    nf_ct_is_confirmed(tmp) &&
                    nf_ct_delete(tmp, 0, 0))
                        drops++;

                nf_ct_put(tmp);
        }

        return drops;
}

/* Try to make room in the table by evicting entries of @net.
 *
 * Scans up to NF_CT_EVICTION_RANGE consecutive buckets, starting at the
 * bucket @hash scales to and wrapping around at the end of the table.  The
 * scan stops at the first bucket where early_drop_list() deleted something.
 * Returns true if at least one entry was dropped, false otherwise.
 */
static noinline int early_drop(struct net *net, unsigned int hash)
{
        unsigned int attempt, bucket;

        for (attempt = 0; attempt < NF_CT_EVICTION_RANGE; attempt++) {
                struct hlist_nulls_head *ct_hash;
                unsigned int hsize, drops;

                rcu_read_lock();

                /* Table and size are re-read on every attempt. */
                nf_conntrack_get_ht(&ct_hash, &hsize);
                if (attempt == 0)
                        bucket = reciprocal_scale(hash, hsize);
                else
                        bucket = (bucket + 1) % hsize;

                drops = early_drop_list(net, &ct_hash[bucket]);

                rcu_read_unlock();

                if (!drops)
                        continue;

                NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
                return true;
        }

        return false;
}

static bool gc_worker_skip_ct(const struct nf_conn *ct)
{
        return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
}

/* Decide whether the gc worker may evict @ct before it expires.
 *
 * Entries without the ASSURED bit can always be dropped.  Assured entries
 * can only be dropped when the l4 protocol tracker provides a
 * ->can_early_drop() callback and that callback agrees.
 */
static bool gc_worker_can_early_drop(const struct nf_conn *ct)
{
        const struct nf_conntrack_l4proto *l4proto;

        if (!test_bit(IPS_ASSURED_BIT, &ct->status))
                return true;

        l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));

        return l4proto->can_early_drop && l4proto->can_early_drop(ct);
}

/* Delayed-work handler that garbage-collects the conntrack table.
 *
 * The scan is resumable: it starts at gc_work->next_bucket and stores the
 * bucket to continue from whenever it stops early, which happens when more
 * than GC_SCAN_EXPIRED_MAX entries were reaped or when the run lasted
 * longer than GC_SCAN_MAX_DURATION.  While scanning, a running average of
 * the (clamped) remaining lifetime of the entries is kept and used to pick
 * the delay until the next run.
 *
 * When gc_work->early_drop is set, entries of a netns whose count is at or
 * above 95% of nf_conntrack_max may additionally be killed if
 * gc_worker_can_early_drop() allows it.
 *
 * The work requeues itself unless gc_work->exiting is set.
 */
static void gc_worker(struct work_struct *work)
{
        unsigned int i, hashsz, nf_conntrack_max95 = 0;
        u32 end_time, start_time = nfct_time_stamp;
        struct conntrack_gc_work *gc_work;
        unsigned int expired_count = 0;
        unsigned long next_run;
        s32 delta_time;
        long count;

        gc_work = container_of(work, struct conntrack_gc_work, dwork.work);

        i = gc_work->next_bucket;
        /* A value of 0 disables the early-drop part of this run. */
        if (gc_work->early_drop)
                nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;

        /* Starting a new full cycle: reset the per-cycle state. */
        if (i == 0) {
                gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
                gc_work->count = GC_SCAN_INITIAL_COUNT;
                gc_work->start_time = start_time;
        }

        next_run = gc_work->avg_timeout;
        count = gc_work->count;

        end_time = start_time + GC_SCAN_MAX_DURATION;

        do {
                struct nf_conntrack_tuple_hash *h;
                struct hlist_nulls_head *ct_hash;
                struct hlist_nulls_node *n;
                struct nf_conn *tmp;

                rcu_read_lock();

                /* The table is re-read for every bucket, so hashsz may
                 * differ from the value seen on a previous iteration.
                 */
                nf_conntrack_get_ht(&ct_hash, &hashsz);
                if (i >= hashsz) {
                        rcu_read_unlock();
                        break;
                }

                hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
                        struct nf_conntrack_net *cnet;
                        struct net *net;
                        long expires;

                        tmp = nf_ct_tuplehash_to_ctrack(h);

                        /* Offloaded entries are only examined further when
                         * early drop is active.
                         */
                        if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
                                nf_ct_offload_timeout(tmp);
                                if (!nf_conntrack_max95)
                                        continue;
                        }

                        /* Reaped enough in this run: save the position and
                         * the averaging state, then reschedule.
                         */
                        if (expired_count > GC_SCAN_EXPIRED_MAX) {
                                rcu_read_unlock();

                                gc_work->next_bucket = i;
                                gc_work->avg_timeout = next_run;
                                gc_work->count = count;

                                delta_time = nfct_time_stamp - gc_work->start_time;

                                /* re-sched immediately if total cycle time is exceeded */
                                next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
                                goto early_exit;
                        }

                        if (nf_ct_is_expired(tmp)) {
                                nf_ct_gc_expired(tmp);
                                expired_count++;
                                continue;
                        }

                        /* Incremental update of the running average:
                         * avg += (sample - avg) / n.
                         */
                        expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
                        expires = (expires - (long)next_run) / ++count;
                        next_run += expires;

                        if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
                                continue;

                        /* Early drop only applies to a netns that is close
                         * to the limit.
                         */
                        net = nf_ct_net(tmp);
                        cnet = nf_ct_pernet(net);
                        if (atomic_read(&cnet->count) < nf_conntrack_max95)
                                continue;

                        /* need to take reference to avoid possible races */
                        if (!refcount_inc_not_zero(&tmp->ct_general.use))
                                continue;

                        /* load ->status after refcount increase */
                        smp_acquire__after_ctrl_dep();

                        /* Re-check now that the object cannot go away. */
                        if (gc_worker_skip_ct(tmp)) {
                                nf_ct_put(tmp);
                                continue;
                        }

                        if (gc_worker_can_early_drop(tmp)) {
                                nf_ct_kill(tmp);
                                expired_count++;
                        }

                        nf_ct_put(tmp);
                }

                /* could check get_nulls_value() here and restart if ct
                 * was moved to another chain.  But given gc is best-effort
                 * we will just continue with next hash slot.
                 */
                rcu_read_unlock();
                cond_resched();
                i++;

                /* Time budget for this run used up with buckets left:
                 * save state and requeue with no delay.
                 */
                delta_time = nfct_time_stamp - end_time;
                if (delta_time > 0 && i < hashsz) {
                        gc_work->avg_timeout = next_run;
                        gc_work->count = count;
                        gc_work->next_bucket = i;
                        next_run = 0;
                        goto early_exit;
                }
        } while (i < hashsz);

        /* Full cycle completed: the next run starts from bucket 0. */
        gc_work->next_bucket = 0;

        next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);

        /* Deduct the time this cycle took, but wait at least one tick. */
        delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
        if (next_run > (unsigned long)delta_time)
                next_run -= delta_time;
        else
                next_run = 1;

early_exit:
        if (gc_work->exiting)
                return;

        /* Early-drop mode is cleared unless the work is requeued with
         * no delay.
         */
        if (next_run)
                gc_work->early_drop = false;

        queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}

/* Prepare @gc_work for use: clear the exiting flag that gc_worker() tests
 * before requeueing itself, and set gc_worker() as the handler of the
 * delayed work item.
 */
static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
        gc_work->exiting = false;
        INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
}

/* Allocate and initialise a new, not yet inserted conntrack.
 *
 * @net:  netns the conntrack belongs to
 * @zone: conntrack zone to attach
 * @orig: tuple for the ORIGINAL direction
 * @repl: tuple for the REPLY direction
 * @gfp:  allocation flags for the slab allocation
 * @hash: hash value, passed to early_drop() when the table is full and
 *        saved in the object for reuse at confirmation time
 *
 * The per-netns entry counter is incremented first.  If it then exceeds a
 * non-zero nf_conntrack_max, early_drop() is asked to evict an entry; when
 * that fails the gc worker's early_drop mode is switched on and the
 * allocation is refused.
 *
 * Returns the new conntrack with a reference count of 0, or
 * ERR_PTR(-ENOMEM) if the table is full or the slab allocation failed.
 */
static struct nf_conn *
__nf_conntrack_alloc(struct net *net,
                     const struct nf_conntrack_zone *zone,
                     const struct nf_conntrack_tuple *orig,
                     const struct nf_conntrack_tuple *repl,
                     gfp_t gfp, u32 hash)
{
        struct nf_conntrack_net *cnet = nf_ct_pernet(net);
        unsigned int ct_count;
        struct nf_conn *ct;

        /* We don't want any race condition at early drop stage */
        ct_count = atomic_inc_return(&cnet->count);

        if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
                if (!early_drop(net, hash)) {
                        /* Ask the gc worker to evict entries as well. */
                        if (!conntrack_gc_work.early_drop)
                                conntrack_gc_work.early_drop = true;
                        atomic_dec(&cnet->count);
                        net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
                        return ERR_PTR(-ENOMEM);
                }
        }

        /*
         * Do not use kmem_cache_zalloc(), as this cache uses
         * SLAB_TYPESAFE_BY_RCU.
         */
        ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
        if (ct == NULL)
                goto out;

        /* Fields up to __nfct_init_offset are set one by one; everything
         * after that offset is zeroed by memset_after() below.
         */
        spin_lock_init(&ct->lock);
        ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
        ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
        ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
        /* save hash for reusing when confirming */
        *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
        ct->status = 0;
        WRITE_ONCE(ct->timeout, 0);
        write_pnet(&ct->ct_net, net);
        memset_after(ct, 0, __nfct_init_offset);

        nf_ct_zone_add(ct, zone);

        /* Because we use RCU lookups, we set ct_general.use to zero before
         * this is inserted in any list.
         */
        refcount_set(&ct->ct_general.use, 0);
        return ct;
out:
        /* Undo the counter increment done at the top. */
        atomic_dec(&cnet->count);
        return ERR_PTR(-ENOMEM);
}

/* Exported allocation entry point: same as __nf_conntrack_alloc() with a
 * hash value of 0.  Returns the new conntrack or an ERR_PTR() on failure.
 */
struct nf_conn *nf_conntrack_alloc(struct net *net,
                                   const struct nf_conntrack_zone *zone,
                                   const struct nf_conntrack_tuple *orig,
                                   const struct nf_conntrack_tuple *repl,
                                   gfp_t gfp)
{
        const u32 hash = 0;

        return __nf_conntrack_alloc(net, zone, orig, repl, gfp, hash);
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

/* Release a conntrack whose reference count has dropped to zero.
 *
 * If source NAT was set up for @ct, the NAT hook (when registered) is asked
 * to remove its by-source entry.  The extension area and the object itself
 * are then freed, and finally the per-netns entry counter is decremented.
 */
void nf_conntrack_free(struct nf_conn *ct)
{
        /* Fetched up front: @ct must not be touched once it is freed. */
        struct net *net = nf_ct_net(ct);
        struct nf_conntrack_net *cnet;

        /* A freed object has refcnt == 0, that's
         * the golden rule for SLAB_TYPESAFE_BY_RCU
         */
        WARN_ON(refcount_read(&ct->ct_general.use) != 0);

        if (ct->status & IPS_SRC_NAT_DONE) {
                const struct nf_nat_hook *nat_hook;

                rcu_read_lock();
                nat_hook = rcu_dereference(nf_nat_hook);
                if (nat_hook)
                        nat_hook->remove_nat_bysrc(ct);
                rcu_read_unlock();
        }

        kfree(ct->ext);
        kmem_cache_free(nf_conntrack_cachep, ct);
        cnet = nf_ct_pernet(net);

        smp_mb__before_atomic();
        atomic_dec(&cnet->count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);


/* Allocate and initialise a new, not yet confirmed conntrack for @tuple.
 *
 * @net:     network namespace the entry belongs to
 * @tmpl:    optional template conntrack (may be NULL); when present it
 *           supplies the zone, timeout policy, event masks and helper
 * @tuple:   original-direction tuple extracted from the packet
 * @skb:     packet being tracked (used for zone selection)
 * @dataoff: passed in by the caller; not referenced in this function body
 * @hash:    hash value handed to __nf_conntrack_alloc()
 *
 * Returns:
 *  - NULL if @tuple cannot be inverted (the packet is unclassifiable),
 *  - an ERR_PTR() if allocation failed: the error from
 *    __nf_conntrack_alloc() is passed through, and ERR_PTR(-ENOMEM) is
 *    returned when the synproxy or event-cache extension cannot be added,
 *  - otherwise the original-direction tuplehash of the new conntrack,
 *    whose refcount has been set to 1.
 */
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
               const struct nf_conntrack_tuple *tuple,
               struct sk_buff *skb,
               unsigned int dataoff, u32 hash)
{
        struct nf_conn *ct;
        struct nf_conn_help *help;
        struct nf_conntrack_tuple repl_tuple;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
        struct nf_conntrack_ecache *ecache;
#endif
        struct nf_conntrack_expect *exp = NULL;
        const struct nf_conntrack_zone *zone;
        struct nf_conn_timeout *timeout_ext;
        struct nf_conntrack_zone tmp;
        struct nf_conntrack_net *cnet;

        /* Build the reply tuple; give up if it cannot be derived. */
        if (!nf_ct_invert_tuple(&repl_tuple, tuple))
                return NULL;

        zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
        ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
                                  hash);
        if (IS_ERR(ct))
                return (struct nf_conntrack_tuple_hash *)ct;

        if (!nf_ct_add_synproxy(ct, tmpl)) {
                nf_conntrack_free(ct);
                return ERR_PTR(-ENOMEM);
        }

        /* Inherit the timeout policy from the template, if it has one. */
        timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;

        if (timeout_ext)
                nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
                                      GFP_ATOMIC);

        /* Return values of these extension adds are not checked here. */
        nf_ct_acct_ext_add(ct, GFP_ATOMIC);
        nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
        nf_ct_labels_ext_add(ct);

#ifdef CONFIG_NF_CONNTRACK_EVENTS
        ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;

        /* An event cache is wanted when the template carries one or the
         * per-netns sysctl enables events; failing to add it is fatal for
         * this conntrack.
         */
        if ((ecache || net->ct.sysctl_events) &&
            !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
                                  ecache ? ecache->expmask : 0,
                                  GFP_ATOMIC)) {
                nf_conntrack_free(ct);
                return ERR_PTR(-ENOMEM);
        }
#endif

        cnet = nf_ct_pernet(net);
        /* Only take the expectation lock when expectations exist at all. */
        if (cnet->expect_count) {
                spin_lock_bh(&nf_conntrack_expect_lock);
                exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl));
                if (exp) {
                        /* Welcome, Mr. Bond.  We've been expecting you... */
                        __set_bit(IPS_EXPECTED_BIT, &ct->status);
                        /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
                        ct->master = exp->master;
                        if (exp->helper) {
                                help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
                                if (help)
                                        rcu_assign_pointer(help->helper, exp->helper);
                        }

                        /* Copy mark/secmark from the master connection. */
#ifdef CONFIG_NF_CONNTRACK_MARK
                        ct->mark = READ_ONCE(exp->master->mark);
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
                        ct->secmark = exp->master->secmark;
#endif
                        NF_CT_STAT_INC(net, expect_new);
                }
                spin_unlock_bh(&nf_conntrack_expect_lock);
        }
        /* No expectation matched: let the template pick a helper, if any. */
        if (!exp && tmpl)
                __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);

        /* Other CPU might have obtained a pointer to this object before it was
         * released.  Because refcount is 0, refcount_inc_not_zero() will fail.
         *
         * After refcount_set(1) it will succeed; ensure that zeroing of
         * ct->status and the correct ct->net pointer are visible; else other
         * core might observe CONFIRMED bit which means the entry is valid and
         * in the hash table, but it's not (anymore).
         */
        smp_wmb();

        /* Now it is going to be associated with an sk_buff, set refcount to 1. */
        refcount_set(&ct->ct_general.use, 1);

        if (exp) {
                /* Run the expectation callback, then drop the expectation. */
                if (exp->expectfn)
                        exp->expectfn(ct, exp);
                nf_ct_expect_put(exp);
        }

        return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* Look up (or create) the conntrack matching @skb and attach it.
 *
 * Returns 0 when a conntrack was attached (skb->_nfct | ctinfo set), and
 * also when no tuple could be extracted or init_conntrack() returned NULL;
 * in those two cases nothing is attached.  An error pointer from
 * init_conntrack() is returned as its negative errno.
 */
static int
resolve_normal_ct(struct nf_conn *tmpl,
                  struct sk_buff *skb,
                  unsigned int dataoff,
                  u_int8_t protonum,
                  const struct nf_hook_state *state)
{
        struct nf_conntrack_tuple_hash *h;
        const struct nf_conntrack_zone *zone;
        struct nf_conntrack_zone zone_buf;
        struct nf_conntrack_tuple tuple;
        enum ip_conntrack_info ctinfo;
        u32 orig_id, reply_id, hash;
        unsigned long status;
        struct nf_conn *ct;

        if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff,
                             state->pf, protonum, state->net, &tuple))
                return 0;

        zone = nf_ct_zone_tmpl(tmpl, skb, &zone_buf);

        /* First attempt: hash with the original-direction zone id. */
        orig_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
        hash = hash_conntrack_raw(&tuple, orig_id, state->net);
        h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);

        /* Second attempt: hash with the reply-direction zone id, but only
         * when it differs from the one already tried.
         */
        if (!h) {
                reply_id = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
                if (reply_id != orig_id) {
                        u32 reply_hash = hash_conntrack_raw(&tuple, reply_id,
                                                            state->net);

                        h = __nf_conntrack_find_get(state->net, zone, &tuple,
                                                    reply_hash);
                }
        }

        /* Nothing found: create a fresh entry using the first hash. */
        if (!h) {
                h = init_conntrack(state->net, tmpl, &tuple, skb, dataoff,
                                   hash);
                if (!h)
                        return 0;
                if (IS_ERR(h))
                        return PTR_ERR(h);
        }

        /* It exists; we have (non-exclusive) reference. */
        ct = nf_ct_tuplehash_to_ctrack(h);

        if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
                ctinfo = IP_CT_ESTABLISHED_REPLY;
        } else {
                status = READ_ONCE(ct->status);

                /* Once we've had two way comms, always ESTABLISHED. */
                if (likely(status & IPS_SEEN_REPLY))
                        ctinfo = IP_CT_ESTABLISHED;
                else if (status & IPS_EXPECTED)
                        ctinfo = IP_CT_RELATED;
                else
                        ctinfo = IP_CT_NEW;
        }

        nf_ct_set(skb, ct, ctinfo);
        return 0;
}

/*
 * ICMP and ICMPv6 need special treatment so that error messages can be
 * related to the connection they refer to.
 *
 * The packet is handed to the IPv4 or IPv6 ICMP error handler that matches
 * both the hook family and the protocol number; any other combination is
 * accepted right away.  A handler result <= 0 bumps the "error" counter.
 *
 * Callers need to check if skb has a conntrack assigned when this
 * helper returns; in such case skb belongs to an already known connection.
 */
static unsigned int __cold
nf_conntrack_handle_icmp(struct nf_conn *tmpl,
                         struct sk_buff *skb,
                         unsigned int dataoff,
                         u8 protonum,
                         const struct nf_hook_state *state)
{
        int verdict;

        if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP) {
                verdict = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
#if IS_ENABLED(CONFIG_IPV6)
        } else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6) {
                verdict = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
#endif
        } else {
                /* Family and protocol do not match: nothing to do here. */
                return NF_ACCEPT;
        }

        if (verdict <= 0)
                NF_CT_STAT_INC_ATOMIC(state->net, error);

        return verdict;
}

static int generic_packet(struct nf_conn *ct, struct sk_buff *skb,
                          enum ip_conntrack_info ctinfo)
{
        const unsigned int *timeout = nf_ct_timeout_lookup(ct);

        if (!timeout)
                timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;

        nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
        return NF_ACCEPT;
}

/* Dispatch @skb to the tracker for the conntrack's L4 protocol.
 *
 * Protocols without a dedicated (or compiled-in) tracker fall through to
 * generic_packet().  Returns verdict for packet, or -1 for invalid.
 */
static int nf_conntrack_handle_packet(struct nf_conn *ct,
                                      struct sk_buff *skb,
                                      unsigned int dataoff,
                                      enum ip_conntrack_info ctinfo,
                                      const struct nf_hook_state *state)
{
        switch (nf_ct_protonum(ct)) {
        case IPPROTO_TCP:
                return nf_conntrack_tcp_packet(ct, skb, dataoff, ctinfo,
                                               state);
        case IPPROTO_UDP:
                return nf_conntrack_udp_packet(ct, skb, dataoff, ctinfo,
                                               state);
        case IPPROTO_ICMP:
                return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
#if IS_ENABLED(CONFIG_IPV6)
        case IPPROTO_ICMPV6:
                return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
        case IPPROTO_UDPLITE:
                return nf_conntrack_udplite_packet(ct, skb, dataoff, ctinfo,
                                                   state);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
        case IPPROTO_SCTP:
                return nf_conntrack_sctp_packet(ct, skb, dataoff, ctinfo,
                                                state);
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
        case IPPROTO_DCCP:
                return nf_conntrack_dccp_packet(ct, skb, dataoff, ctinfo,
                                                state);
#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
        case IPPROTO_GRE:
                return nf_conntrack_gre_packet(ct, skb, dataoff, ctinfo,
                                               state);
#endif
        default:
                /* No specific tracker: handled generically below. */
                break;
        }

        return generic_packet(ct, skb, ctinfo);
}

/* Run connection tracking on @skb and return a netfilter verdict.
 *
 * A conntrack already attached to the skb is treated as a template when
 * nf_ct_is_template() says so; any other attached conntrack, or an
 * untracked skb, is accepted without further work.  Otherwise the L4
 * header is located, ICMP/ICMPv6 get a first pass through the error
 * handlers, and the packet is resolved to a conntrack and handed to the
 * per-protocol tracker.  A tracker result <= 0 detaches the conntrack
 * and is negated to form the verdict, except for -NF_REPEAT which
 * restarts resolution.  The template reference, if any, is dropped on
 * every path that goes through "out".
 */
unsigned int
nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
{
        enum ip_conntrack_info ctinfo;
        struct nf_conn *ct, *tmpl;
        u_int8_t protonum;
        int dataoff, ret;

        tmpl = nf_ct_get(skb, &ctinfo);
        if (tmpl || ctinfo == IP_CT_UNTRACKED) {
                /* Previously seen (loopback or untracked)?  Ignore. */
                if ((tmpl && !nf_ct_is_template(tmpl)) ||
                     ctinfo == IP_CT_UNTRACKED)
                        return NF_ACCEPT;
                /* Detach the template; its reference is dropped at "out". */
                skb->_nfct = 0;
        }

        /* rcu_read_lock()ed by nf_hook_thresh */
        dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
        if (dataoff <= 0) {
                NF_CT_STAT_INC_ATOMIC(state->net, invalid);
                ret = NF_ACCEPT;
                goto out;
        }

        if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
                ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
                                               protonum, state);
                if (ret <= 0) {
                        /* Negated handler result becomes the verdict. */
                        ret = -ret;
                        goto out;
                }
                /* ICMP[v6] protocol trackers may assign one conntrack. */
                if (skb->_nfct)
                        goto out;
        }
repeat:
        ret = resolve_normal_ct(tmpl, skb, dataoff,
                                protonum, state);
        if (ret < 0) {
                /* Too stressed to deal. */
                NF_CT_STAT_INC_ATOMIC(state->net, drop);
                ret = NF_DROP;
                goto out;
        }

        ct = nf_ct_get(skb, &ctinfo);
        if (!ct) {
                /* Not valid part of a connection */
                NF_CT_STAT_INC_ATOMIC(state->net, invalid);
                ret = NF_ACCEPT;
                goto out;
        }

        ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state);
        if (ret <= 0) {
                /* Invalid: inverse of the return code tells
                 * the netfilter core what to do */
                nf_ct_put(ct);
                skb->_nfct = 0;
                /* Special case: TCP tracker reports an attempt to reopen a
                 * closed/aborted connection. We have to go back and create a
                 * fresh conntrack.
                 */
                if (ret == -NF_REPEAT)
                        goto repeat;

                NF_CT_STAT_INC_ATOMIC(state->net, invalid);
                /* NOTE(review): ret is still un-negated (<= 0) here, unlike
                 * the -NF_REPEAT test above; this comparison can only match
                 * if NF_DROP is 0 -- confirm against the verdict definitions.
                 */
                if (ret == NF_DROP)
                        NF_CT_STAT_INC_ATOMIC(state->net, drop);

                ret = -ret;
                goto out;
        }

        /* First packet seen in the reply direction: set the flag once and
         * queue an IPCT_REPLY event.
         */
        if (ctinfo == IP_CT_ESTABLISHED_REPLY &&
            !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
                nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
        if (tmpl)
                nf_ct_put(tmpl);

        return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_in);

/* Refresh the conntrack timeout to @extra_jiffies and, when @do_acct is
 * true, account @skb's length in the direction given by @ctinfo.
 *
 * Entries flagged IPS_FIXED_TIMEOUT keep their timeout untouched; only
 * the accounting step applies to them.
 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
                          enum ip_conntrack_info ctinfo,
                          const struct sk_buff *skb,
                          u32 extra_jiffies,
                          bool do_acct)
{
        if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
                /* If not in hash table, timer will not be active yet, so
                 * the value is only made absolute for confirmed entries.
                 */
                if (nf_ct_is_confirmed(ct))
                        extra_jiffies += nfct_time_stamp;

                /* Skip the store when the value would not change. */
                if (READ_ONCE(ct->timeout) != extra_jiffies)
                        WRITE_ONCE(ct->timeout, extra_jiffies);
        }

        if (do_acct)
                nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);

/* Account @skb against @ct in the direction given by @ctinfo, then delete
 * the conntrack.  Returns whatever nf_ct_delete(ct, 0, 0) returns.
 */
bool nf_ct_kill_acct(struct nf_conn *ct,
                     enum ip_conntrack_info ctinfo,
                     const struct sk_buff *skb)
{
        nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);

        return nf_ct_delete(ct, 0, 0);
}
EXPORT_SYMBOL_GPL(nf_ct_kill_acct);

#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/mutex.h>

/* Generic function for tcp/udp/sctp/dccp and alike.
 *
 * Emits the tuple's source and destination ports into @skb as
 * CTA_PROTO_SRC_PORT / CTA_PROTO_DST_PORT attributes.  Returns 0 on
 * success and -1 as soon as one of the attributes cannot be added.
 */
int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
                               const struct nf_conntrack_tuple *tuple)
{
        if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port))
                return -1;

        if (nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
                return -1;

        return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);

/* Netlink attribute policy for port-based protocols: both
 * CTA_PROTO_SRC_PORT and CTA_PROTO_DST_PORT are declared as NLA_U16.
 * All other CTA_PROTO_* slots are left zero-initialised.
 */
const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
        [CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
        [CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
};
EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);

/* Fill the port fields of @t from parsed netlink attributes.
 *
 * @flags selects which ports are wanted; a port that is requested but
 * whose attribute is absent from @tb yields -EINVAL.  The source port is
 * processed before the destination port, so @t may already hold the
 * source port when the destination lookup fails.  Returns 0 on success.
 */
int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
                               struct nf_conntrack_tuple *t,
                               u_int32_t flags)
{
        const struct nlattr *attr;

        if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) {
                attr = tb[CTA_PROTO_SRC_PORT];
                if (!attr)
                        return -EINVAL;

                t->src.u.tcp.port = nla_get_be16(attr);
        }

        if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) {
                attr = tb[CTA_PROTO_DST_PORT];
                if (!attr)
                        return -EINVAL;

                t->dst.u.tcp.port = nla_get_be16(attr);
        }

        return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);

unsigned int nf_ct_port_nlattr_tuple_size(void)
{
        static unsigned int size __read_mostly;

        if (!size)
                size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);

        return size;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
#endif

/* Used by ipt_REJECT and ip6t_REJECT. */
static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
{
        struct nf_conn *ct;
        enum ip_conntrack_info ctinfo;

        /* This ICMP is in reverse direction to the packet which caused it */
        ct = nf_ct_get(skb, &ctinfo);
        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach to new skbuff, and increment count */
        nf_ct_set(nskb, ct, ctinfo);
        nf_conntrack_get(skb_nfct(nskb));
}

/* Re-resolve the conntrack attached to @skb against the hash table.
 *
 * The tuple is rebuilt from the packet; where @ct has IPS_SRC_NAT or
 * IPS_DST_NAT set, the corresponding address and L4 part are replaced by
 * those of @ct's original-direction tuple before the lookup.  If the
 * lookup finds an entry, the reference on @ct is dropped, the found
 * entry is attached to @skb with the same @ctinfo, and the NAT hook's
 * manip_pkt() is run for each NAT type that was set on @ct.
 *
 * Returns NF_DROP if the L4 header or tuple cannot be obtained, the
 * manip_pkt() verdict if it is not NF_ACCEPT, and NF_ACCEPT otherwise
 * (including when no entry is found or no NAT hook is registered).
 */
static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
                                 struct nf_conn *ct,
                                 enum ip_conntrack_info ctinfo)
{
        const struct nf_nat_hook *nat_hook;
        struct nf_conntrack_tuple_hash *h;
        struct nf_conntrack_tuple tuple;
        unsigned int status;
        int dataoff;
        u16 l3num;
        u8 l4num;

        l3num = nf_ct_l3num(ct);

        dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
        if (dataoff <= 0)
                return NF_DROP;

        if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
                             l4num, net, &tuple))
                return NF_DROP;

        /* Use the original-direction source recorded in ct for the lookup. */
        if (ct->status & IPS_SRC_NAT) {
                memcpy(tuple.src.u3.all,
                       ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
                       sizeof(tuple.src.u3.all));
                tuple.src.u.all =
                        ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
        }

        /* Likewise for the destination. */
        if (ct->status & IPS_DST_NAT) {
                memcpy(tuple.dst.u3.all,
                       ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
                       sizeof(tuple.dst.u3.all));
                tuple.dst.u.all =
                        ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
        }

        h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
        if (!h)
                return NF_ACCEPT;

        /* Store status bits of the conntrack that is clashing to re-do NAT
         * mangling according to what it has been done already to this packet.
         */
        status = ct->status;

        /* Swap the skb over to the entry found in the table. */
        nf_ct_put(ct);
        ct = nf_ct_tuplehash_to_ctrack(h);
        nf_ct_set(skb, ct, ctinfo);

        nat_hook = rcu_dereference(nf_nat_hook);
        if (!nat_hook)
                return NF_ACCEPT;

        if (status & IPS_SRC_NAT) {
                unsigned int verdict = nat_hook->manip_pkt(skb, ct,
                                                           NF_NAT_MANIP_SRC,
                                                           IP_CT_DIR_ORIGINAL);
                if (verdict != NF_ACCEPT)
                        return verdict;
        }

        if (status & IPS_DST_NAT) {
                unsigned int verdict = nat_hook->manip_pkt(skb, ct,
                                                           NF_NAT_MANIP_DST,
                                                           IP_CT_DIR_ORIGINAL);
                if (verdict != NF_ACCEPT)
                        return verdict;
        }

        return NF_ACCEPT;
}

/* This packet is coming from userspace via nf_queue, complete the packet
 * processing after the helper invocation in nf_confirm().
 *
 * Only conntracks whose helper has NF_CT_HELPER_F_USERSPACE set are
 * processed; everything else is accepted untouched.  For those, the
 * transport header offset is computed, sequence adjustment is applied
 * when IPS_SEQ_ADJUST is set (and the packet is not loopback), and the
 * conntrack is confirmed.
 *
 * Returns NF_DROP if sequence adjustment fails, NF_ACCEPT on the early
 * exits, otherwise the result of nf_conntrack_confirm().
 */
static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
                               enum ip_conntrack_info ctinfo)
{
        const struct nf_conntrack_helper *helper;
        const struct nf_conn_help *help;
        int protoff;

        help = nfct_help(ct);
        if (!help)
                return NF_ACCEPT;

        helper = rcu_dereference(help->helper);
        if (!helper)
                return NF_ACCEPT;

        if (!(helper->flags & NF_CT_HELPER_F_USERSPACE))
                return NF_ACCEPT;

        /* Locate the transport header for the L3 protocol in use. */
        switch (nf_ct_l3num(ct)) {
        case NFPROTO_IPV4:
                protoff = skb_network_offset(skb) + ip_hdrlen(skb);
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case NFPROTO_IPV6: {
                __be16 frag_off;
                u8 pnum;

                pnum = ipv6_hdr(skb)->nexthdr;
                protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
                                           &frag_off);
                /* Accept as-is if the header walk failed or the fragment
                 * offset bits (all but the low three) are non-zero.
                 */
                if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
                        return NF_ACCEPT;
                break;
        }
#endif
        default:
                return NF_ACCEPT;
        }

        if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
            !nf_is_loopback_packet(skb)) {
                if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
                        NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
                        return NF_DROP;
                }
        }

        /* We've seen it coming out the other side: confirm it */
        return nf_conntrack_confirm(skb);
}

/* Finish conntrack processing for @skb.
 *
 * An skb without a conntrack is accepted.  An unconfirmed conntrack is
 * first re-resolved through __nf_conntrack_update(); any verdict other
 * than NF_ACCEPT from that step is returned directly, and the conntrack
 * is re-read from the skb afterwards since it may have been replaced.
 * The final verdict comes from nf_confirm_cthelper().
 */
static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
{
        enum ip_conntrack_info ctinfo;
        struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
        int verdict;

        if (!ct)
                return NF_ACCEPT;

        if (nf_ct_is_confirmed(ct))
                return nf_confirm_cthelper(skb, ct, ctinfo);

        verdict = __nf_conntrack_update(net, skb, ct, ctinfo);
        if (verdict != NF_ACCEPT)
                return verdict;

        ct = nf_ct_get(skb, &ctinfo);
        if (!ct)
                return NF_ACCEPT;

        return nf_confirm_cthelper(skb, ct, ctinfo);
}

/* Copy the conntrack tuple that describes @skb into @dst_tuple.
 *
 * If a conntrack is attached to the skb, its tuple for the packet's
 * direction is used.  Otherwise a tuple is parsed from the packet as
 * NFPROTO_IPV4, looked up in the default zone of the device's namespace,
 * and the tuple of the opposite direction of the match is copied out.
 *
 * Returns true when @dst_tuple was filled in, false otherwise.
 */
static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
                                       const struct sk_buff *skb)
{
        const struct nf_conntrack_tuple_hash *hash;
        const struct nf_conntrack_tuple *found;
        struct nf_conntrack_tuple srctuple;
        enum ip_conntrack_info ctinfo;
        struct nf_conn *ct;
        struct net *net;

        ct = nf_ct_get(skb, &ctinfo);
        if (ct) {
                found = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
                memcpy(dst_tuple, found, sizeof(*dst_tuple));
                return true;
        }

        /* No conntrack attached: fall back to a table lookup. */
        net = dev_net(skb->dev);

        if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), NFPROTO_IPV4,
                               net, &srctuple))
                return false;

        hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &srctuple);
        if (!hash)
                return false;

        ct = nf_ct_tuplehash_to_ctrack(hash);
        found = nf_ct_tuple(ct, !hash->tuple.dst.dir);
        memcpy(dst_tuple, found, sizeof(*dst_tuple));

        /* Drop the reference taken by the lookup. */
        nf_ct_put(ct);

        return true;
}

/* Bring out ya dead!
 *
 * Scan the conntrack hash table, starting at *@bucket, for the next entry
 * that @iter selects (returns non-zero for).
 *
 * @iter:      predicate called with each candidate and iter_data->data
 * @iter_data: iteration context; when iter_data->net is set, entries
 *             belonging to other namespaces are skipped
 * @bucket:    in/out scan position; left at the bucket of the match so
 *             the caller can resume the scan from there
 *
 * Each non-empty bucket is walked with BHs disabled and the bucket's
 * lock held.  Returns the selected conntrack with its refcount
 * incremented, or NULL once the end of the table is reached.
 */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
                const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
{
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
        struct hlist_nulls_node *n;
        spinlock_t *lockp;

        for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
                struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];

                /* Unlocked emptiness check avoids taking the lock for
                 * buckets with nothing in them.
                 */
                if (hlist_nulls_empty(hslot))
                        continue;

                lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
                local_bh_disable();
                nf_conntrack_lock(lockp);
                hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
                        if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
                                continue;
                        /* All nf_conn objects are added to hash table twice, one
                         * for original direction tuple, once for the reply tuple.
                         *
                         * Exception: In the IPS_NAT_CLASH case, only the reply
                         * tuple is added (the original tuple already existed for
                         * a different object).
                         *
                         * We only need to call the iterator once for each
                         * conntrack, so we just use the 'reply' direction
                         * tuple while iterating.
                         */
                        ct = nf_ct_tuplehash_to_ctrack(h);

                        if (iter_data->net &&
                            !net_eq(iter_data->net, nf_ct_net(ct)))
                                continue;

                        if (iter(ct, iter_data->data))
                                goto found;
                }
                spin_unlock(lockp);
                local_bh_enable();
                cond_resched();
        }

        return NULL;
found:
        /* Reached with the bucket lock held and BHs disabled: take the
         * reference before letting go of the lock.
         */
        refcount_inc(&ct->ct_general.use);
        spin_unlock(lockp);
        local_bh_enable();
        return ct;
}

/* Delete every conntrack selected by @iter.
 *
 * Runs under nf_conntrack_mutex and may sleep.  Each entry returned by
 * get_next_corpse() is deleted, reporting with iter_data->portid and
 * iter_data->report, and the reference obtained from get_next_corpse()
 * is then dropped.
 */
static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
                                  const struct nf_ct_iter_data *iter_data)
{
        unsigned int bucket = 0;
        struct nf_conn *ct;

        might_sleep();

        mutex_lock(&nf_conntrack_mutex);
        for (;;) {
                ct = get_next_corpse(iter, iter_data, &bucket);
                if (!ct)
                        break;

                /* Time to push up daisies... */
                nf_ct_delete(ct, iter_data->portid, iter_data->report);
                nf_ct_put(ct);
                cond_resched();
        }
        mutex_unlock(&nf_conntrack_mutex);
}

/* Per-namespace front end to nf_ct_iterate_cleanup(): the table walk is
 * skipped entirely when the namespace in iter_data->net currently has no
 * conntrack entries.  May sleep.
 */
void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
                               const struct nf_ct_iter_data *iter_data)
{
        struct nf_conntrack_net *cnet = nf_ct_pernet(iter_data->net);

        might_sleep();

        if (atomic_read(&cnet->count) != 0)
                nf_ct_iterate_cleanup(iter, iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);

/**
 * nf_ct_iterate_destroy - flush queued packets, then iterate the table
 * @iter: callback to invoke for each conntrack
 * @data: data to pass to @iter
 *
 * Like nf_ct_iterate_cleanup, but first drops queued packets in every
 * network namespace that holds conntrack entries, waits for namespace
 * cleanup and in-flight packets, and bumps the extension generation id.
 * The walk itself is not restricted to a single namespace.
 *
 * NOTE(review): earlier wording described marking entries on an
 * "unconfirmed list" as dying; no such list is handled in this function
 * as written -- confirm whether nf_ct_ext_bump_genid() now covers that.
 *
 * Can only be called in module exit path.
 */
void
nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
{
        struct nf_ct_iter_data iter_data = {};
        struct net *net;

        down_read(&net_rwsem);
        for_each_net(net) {
                struct nf_conntrack_net *cnet = nf_ct_pernet(net);

                /* Nothing tracked in this namespace: nothing to drop. */
                if (atomic_read(&cnet->count) == 0)
                        continue;
                nf_queue_nf_hook_drop(net);
        }
        up_read(&net_rwsem);

        /* Need to wait for netns cleanup worker to finish, if it's
         * running -- it might have deleted a net namespace from
         * the global list, so hook drop above might not have
         * affected all namespaces.
         */
        net_ns_barrier();

        /* a skb w. unconfirmed conntrack could have been reinjected just
         * before we called nf_queue_nf_hook_drop().
         *
         * This makes sure it's inserted into conntrack table.
         */
        synchronize_net();

        nf_ct_ext_bump_genid();
        iter_data.data = data;
        nf_ct_iterate_cleanup(iter, &iter_data);

        /* Another cpu might be in a rcu read section with
         * rcu protected pointer cleared in iter callback
         * or hidden via nf_ct_ext_bump_genid() above.
         *
         * Wait until those are done.
         */
        synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);

/* Iterator callback that selects every conntrack: always returns 1,
 * ignoring both arguments.
 */
static int kill_all(struct nf_conn *i, void *data)
{
        return 1;
}

/* First stage of conntrack teardown: run the BPF cleanup hook and flag
 * the garbage-collection work as exiting.
 */
void nf_conntrack_cleanup_start(void)
{
        cleanup_nf_conntrack_bpf();
        conntrack_gc_work.exiting = true;
}

/* Final stage of conntrack teardown: unpublish nf_ct_hook, stop the
 * garbage-collection work, free the hash table, shut down the protocol,
 * helper and expectation subsystems, and destroy the conntrack slab
 * cache.
 */
void nf_conntrack_cleanup_end(void)
{
        RCU_INIT_POINTER(nf_ct_hook, NULL);
        /* Wait for a running GC pass to finish before freeing the table. */
        cancel_delayed_work_sync(&conntrack_gc_work.dwork);
        kvfree(nf_conntrack_hash);

        nf_conntrack_proto_fini();
        nf_conntrack_helper_fini();
        nf_conntrack_expect_fini();

        kmem_cache_destroy(nf_conntrack_cachep);
}

/*
 * Mishearing the voices in his head, our hero wonders how he's
 * supposed to kill the mall.
 *
 * Clean up conntrack state for a single namespace by placing @net on a
 * temporary one-element exit list and handing that list to
 * nf_conntrack_cleanup_net_list().
 */
void nf_conntrack_cleanup_net(struct net *net)
{
        LIST_HEAD(single);

        list_add(&net->exit_list, &single);
        nf_conntrack_cleanup_net_list(&single);
}

void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
        struct nf_ct_iter_data iter_data = {};
        struct net *net;
        int busy;

        /*
         * This makes sure all current packets have passed through
         *  netfilter framework.  Roll on, two-stage module
         *  delete...
         */
        synchronize_rcu_expedited();
i_see_dead_people:
        busy = 0;
        list_for_each_entry(net, net_exit_list, exit_list) {
                struct nf_conntrack_net *cnet = nf_ct_pernet(net);

                iter_data.net = net;
                nf_ct_iterate_cleanup_net(kill_all, &iter_data);
                if (atomic_read(&cnet->count) != 0)
                        busy = 1;
        }
        if (busy) {
                schedule();
                goto i_see_dead_people;
        }

        list_for_each_entry(net, net_exit_list, exit_list) {
                nf_conntrack_ecache_pernet_fini(net);
                nf_conntrack_expect_pernet_fini(net);
                free_percpu(net->ct.stat);
        }
}

/* Allocate a zeroed array of hash bucket heads.
 *
 * @sizep: in: requested number of buckets; out: that number rounded up
 *         to a multiple of PAGE_SIZE / sizeof(struct hlist_nulls_head).
 *         Left untouched when the request is rejected as too large.
 * @nulls: when non-zero, every bucket is initialised as an hlist_nulls
 *         head whose nulls value is the bucket index.
 *
 * Returns the new array, or NULL if the request is too large or the
 * allocation fails.
 */
void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
{
        struct hlist_nulls_head *table;
        unsigned int nr_slots, slot;

        if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
                return NULL;

        BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));

        nr_slots = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
        *sizep = nr_slots;

        table = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL);
        if (!table || !nulls)
                return table;

        for (slot = 0; slot < nr_slots; slot++)
                INIT_HLIST_NULLS_HEAD(&table[slot], slot);

        return table;
}
EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);

/* Resize the global conntrack hash table to (at least) @hashsize buckets.
 *
 * A new table is allocated first (the size is rounded up by
 * nf_ct_alloc_hashtable()).  Under nf_conntrack_mutex, with BHs disabled,
 * all bucket locks held and inside a nf_conntrack_generation write
 * section, every entry is unlinked from the old table and re-inserted
 * into the new one.  The old table is freed after synchronize_net().
 *
 * Returns 0 on success or when the rounded size equals the current size,
 * -EINVAL if @hashsize is 0, and -ENOMEM if the new table cannot be
 * allocated.
 */
int nf_conntrack_hash_resize(unsigned int hashsize)
{
        int i, bucket;
        unsigned int old_size;
        struct hlist_nulls_head *hash, *old_hash;
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;

        if (!hashsize)
                return -EINVAL;

        hash = nf_ct_alloc_hashtable(&hashsize, 1);
        if (!hash)
                return -ENOMEM;

        mutex_lock(&nf_conntrack_mutex);
        old_size = nf_conntrack_htable_size;
        /* Same size after rounding: discard the new table, nothing to do. */
        if (old_size == hashsize) {
                mutex_unlock(&nf_conntrack_mutex);
                kvfree(hash);
                return 0;
        }

        local_bh_disable();
        nf_conntrack_all_lock();
        write_seqcount_begin(&nf_conntrack_generation);

        /* Lookups in the old hash might happen in parallel, which means we
         * might get false negatives during connection lookup. New connections
         * created because of a false negative won't make it into the hash
         * though since that required taking the locks.
         */

        for (i = 0; i < nf_conntrack_htable_size; i++) {
                /* Drain the bucket from its head until it is empty. */
                while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
                        unsigned int zone_id;

                        h = hlist_nulls_entry(nf_conntrack_hash[i].first,
                                              struct nf_conntrack_tuple_hash, hnnode);
                        ct = nf_ct_tuplehash_to_ctrack(h);
                        hlist_nulls_del_rcu(&h->hnnode);

                        /* Recompute the bucket for the new table size. */
                        zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h));
                        bucket = __hash_conntrack(nf_ct_net(ct),
                                                  &h->tuple, zone_id, hashsize);
                        hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
                }
        }
        old_hash = nf_conntrack_hash;

        /* Publish the new table and its size. */
        nf_conntrack_hash = hash;
        nf_conntrack_htable_size = hashsize;

        write_seqcount_end(&nf_conntrack_generation);
        nf_conntrack_all_unlock();
        local_bh_enable();

        mutex_unlock(&nf_conntrack_mutex);

        /* Wait before freeing: lookups may still be walking the old table. */
        synchronize_net();
        kvfree(old_hash);
        return 0;
}

/*
 * Parameter setter for the conntrack hash table size.
 *
 * @val: textual value supplied for the parameter
 * @kp:  kernel_param descriptor, forwarded to param_set_uint()
 *
 * Returns -EOPNOTSUPP when the caller is not in the initial network
 * namespace, the kstrtouint() error when @val cannot be parsed, and
 * otherwise whatever param_set_uint() or nf_conntrack_hash_resize() returns.
 */
int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
{
        unsigned int new_size;
        int err;

        if (current->nsproxy->net_ns != &init_net)
                return -EOPNOTSUPP;

        if (nf_conntrack_hash) {
                /* A table is already installed: parse the value and resize. */
                err = kstrtouint(val, 0, &new_size);
                return err ? err : nf_conntrack_hash_resize(new_size);
        }

        /* On boot, we can set this without any fancy locking. */
        return param_set_uint(val, kp);
}

/*
 * First stage of conntrack initialisation.
 *
 * Initialises the generation seqcount and the bucket spinlocks, picks a
 * default hash table size when none was configured, allocates the hash
 * table, derives nf_conntrack_max, creates the "nf_conntrack" slab cache,
 * initialises the expect, helper and proto parts, queues the gc work and
 * registers the conntrack BPF support.
 *
 * Returns 0 on success or a negative errno.  On failure every step that
 * already succeeded is undone in reverse order through the err_* labels.
 */
int nf_conntrack_init_start(void)
{
        unsigned long nr_pages = totalram_pages();
        int max_factor = 8;
        int ret = -ENOMEM;
        int i;

        seqcount_spinlock_init(&nf_conntrack_generation,
                               &nf_conntrack_locks_all_lock);

        for (i = 0; i < CONNTRACK_LOCKS; i++)
                spin_lock_init(&nf_conntrack_locks[i]);

        /* No size configured: derive one from the amount of RAM. */
        if (!nf_conntrack_htable_size) {
                /* One bucket head per 16384 bytes of RAM as the starting point. */
                nf_conntrack_htable_size
                        = (((nr_pages << PAGE_SHIFT) / 16384)
                           / sizeof(struct hlist_head));
                /* Fixed sizes for machines with more than 4 GiB (64-bit only)
                 * or more than 1 GiB worth of pages.
                 */
                if (BITS_PER_LONG >= 64 &&
                    nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
                        nf_conntrack_htable_size = 262144;
                else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        nf_conntrack_htable_size = 65536;

                /* Never go below 1024 buckets. */
                if (nf_conntrack_htable_size < 1024)
                        nf_conntrack_htable_size = 1024;
                /* Use a max. factor of one by default to keep the average
                 * hash chain length at 2 entries.  Each entry has to be added
                 * twice (once for original direction, once for reply).
                 * When a table size is given we use the old value of 8 to
                 * avoid implicit reduction of the max entries setting.
                 */
                max_factor = 1;
        }

        /* The allocator receives a pointer to the size and may update it. */
        nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
        if (!nf_conntrack_hash)
                return -ENOMEM;

        nf_conntrack_max = max_factor * nf_conntrack_htable_size;

        nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
                                                sizeof(struct nf_conn),
                                                NFCT_INFOMASK + 1,
                                                SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
        /* ret still holds its initial -ENOMEM on this path. */
        if (!nf_conntrack_cachep)
                goto err_cachep;

        ret = nf_conntrack_expect_init();
        if (ret < 0)
                goto err_expect;

        ret = nf_conntrack_helper_init();
        if (ret < 0)
                goto err_helper;

        ret = nf_conntrack_proto_init();
        if (ret < 0)
                goto err_proto;

        /* Schedule the first gc run HZ jiffies from now. */
        conntrack_gc_work_init(&conntrack_gc_work);
        queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);

        ret = register_nf_conntrack_bpf();
        if (ret < 0)
                goto err_kfunc;

        return 0;

        /* Unwind in reverse order of setup; each label falls through. */
err_kfunc:
        cancel_delayed_work_sync(&conntrack_gc_work.dwork);
        nf_conntrack_proto_fini();
err_proto:
        nf_conntrack_helper_fini();
err_helper:
        nf_conntrack_expect_fini();
err_expect:
        kmem_cache_destroy(nf_conntrack_cachep);
err_cachep:
        /* NOTE(review): nf_conntrack_hash keeps pointing at the freed table
         * after this; nf_conntrack_set_hashsize() tests that pointer for
         * NULL - confirm the setter cannot run after a failed init.
         */
        kvfree(nf_conntrack_hash);
        return ret;
}

/*
 * set_closing hook: forward the request to the TCP tracker when the
 * conntrack entry behind @nfct is a TCP one; every other protocol is
 * left untouched.
 */
static void nf_conntrack_set_closing(struct nf_conntrack *nfct)
{
        struct nf_conn *conn = nf_ct_to_nf_conn(nfct);

        if (nf_ct_protonum(conn) == IPPROTO_TCP)
                nf_conntrack_tcp_set_closing(conn);
}

/* Conntrack callbacks; nf_conntrack_init_end() publishes this table via nf_ct_hook. */
static const struct nf_ct_hook nf_conntrack_hook = {
        .attach         = nf_conntrack_attach,
        .confirm        = __nf_conntrack_confirm,
        .destroy        = nf_ct_destroy,
        .get_tuple_skb  = nf_conntrack_get_tuple_skb,
        .set_closing    = nf_conntrack_set_closing,
        .update         = nf_conntrack_update,
};

/*
 * Final stage of conntrack initialisation: make nf_ct_hook point at the
 * nf_conntrack_hook callback table.
 */
void nf_conntrack_init_end(void)
{
        RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
}

/*
 * Special "nulls" value, chosen so that it is not one of the values used by
 * the hash table.
 * NOTE(review): no user of this macro is visible in this part of the file -
 * confirm it is still referenced.
 */
#define UNCONFIRMED_NULLS_VAL        ((1<<30)+0)

/*
 * Per-network-namespace conntrack setup.
 *
 * Resets the namespace's conntrack counter, allocates the per-CPU
 * statistics and runs the per-net init of the expect, acct, tstamp, ecache
 * and proto parts.
 *
 * Returns 0 on success, -ENOMEM when the statistics cannot be allocated, or
 * the negative error from nf_conntrack_expect_pernet_init(), in which case
 * the statistics are freed again.
 */
int nf_conntrack_init_net(struct net *net)
{
        struct nf_conntrack_net *pernet = nf_ct_pernet(net);
        int err;

        /* Compile-time checks on the ctinfo enum and the lock array size. */
        BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
        BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);

        atomic_set(&pernet->count, 0);

        net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
        if (!net->ct.stat)
                return -ENOMEM;

        err = nf_conntrack_expect_pernet_init(net);
        if (err < 0) {
                /* Only the per-CPU stats exist at this point; drop them. */
                free_percpu(net->ct.stat);
                return err;
        }

        nf_conntrack_acct_pernet_init(net);
        nf_conntrack_tstamp_pernet_init(net);
        nf_conntrack_ecache_pernet_init(net);
        nf_conntrack_proto_pernet_init(net);

        return 0;
}

/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */

/*
 * Apply a new timeout to @ct.
 *
 * Returns -EPERM, without touching the entry, when IPS_FIXED_TIMEOUT_BIT is
 * set.  Otherwise the timeout is stored and the result is -ETIME if
 * IPS_DYING_BIT is set at that point, or 0.
 */
int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
{
        if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
                return -EPERM;

        __nf_ct_set_timeout(ct, timeout);

        /* The dying check is made only after the timeout has been written. */
        return test_bit(IPS_DYING_BIT, &ct->status) ? -ETIME : 0;
}
EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);

/*
 * Set and clear status bits on @ct.
 *
 * @on:  mask of bits to set
 * @off: mask of bits to clear
 *
 * Bits covered by IPS_UNCHANGEABLE_MASK are dropped from both masks first.
 * When a bit appears in both @on and @off, it is set.
 */
void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off)
{
        unsigned int pos;

        /* Ignore the unchangeable bits */
        on &= ~IPS_UNCHANGEABLE_MASK;
        off &= ~IPS_UNCHANGEABLE_MASK;

        for (pos = 0; pos < __IPS_MAX_BIT; pos++) {
                const unsigned long flag = 1 << pos;

                if (on & flag) {
                        set_bit(pos, &ct->status);
                        continue;
                }
                if (off & flag)
                        clear_bit(pos, &ct->status);
        }
}
EXPORT_SYMBOL_GPL(__nf_ct_change_status);

/*
 * Validate a requested status word against the current one and apply it.
 *
 * @ct:     conntrack entry to update
 * @status: requested status bits
 *
 * Returns -EBUSY when the request would change IPS_EXPECTED, IPS_CONFIRMED
 * or IPS_DYING, or would clear IPS_SEEN_REPLY or IPS_ASSURED.  Otherwise
 * the bits in @status are passed to __nf_ct_change_status() as the "on"
 * mask (nothing is turned off) and 0 is returned.
 */
int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status)
{
        /* Bits that differ between the current and the requested status. */
        const unsigned long changed = ct->status ^ status;

        /* These bits are unchangeable. */
        if (changed & (IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING))
                return -EBUSY;

        /* SEEN_REPLY bit can only be set */
        if ((changed & IPS_SEEN_REPLY) && !(status & IPS_SEEN_REPLY))
                return -EBUSY;

        /* ASSURED bit can only be set */
        if ((changed & IPS_ASSURED) && !(status & IPS_ASSURED))
                return -EBUSY;

        __nf_ct_change_status(ct, status, 0);
        return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_change_status_common);






































































































































































































































































    1 


























































































    1 



    1 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 













    1 



    1 
    1 



    1 





























































































































































































































































    1 







    1 





















    1 




































































































































































































































































































































































































































































































































































































































































    1 







    1 











    1 







    1 



























    1 


























    1 










































































    1 


    1 







    1 







    1 















    1 





































































































































































































































































































































































    1 








































    1 
    1 

















    1 

























    1 







    1 








    1 






    1 
    1 



    1 










    1 




    1 







    1 












    1 



    1 



























































































    1 









    1 












































































    1 















    1 



















    1 









    1 
























    1 










    1 

    1 








    1 






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






















































































































































































































































































































































































































































































































    1 




























































































































































































































































































































































































































































































































































































































































































































































    1 






































































































































































































    1 












































    1 





    1 







    1 


    1 




















    1 













































    1 







    1 












    1 






    1 
    1 
    1 





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 



    1 


    1 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

    1 













    1 

































































































































































































    1 





    1 





    1 
    1 




















    1 



    1 




    1 








    1 








    1 

























































































































































































    1 



    1 























    1 


















































    1 









    1 





















































































    1 




















    1 


    1 
    1 































    1 







    1 
    1 




















    1 






































































































































































































    1 























































































































































































































    1 






















    1 



    1 











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


















    1 

















    1 


















    1 








    1 


    1 








    1 

















    1 






    1 
    1 
































































































    1 


























































    1 






    1 




    1 




    1 







































    1 



    1 
    1 




























    1 


































































































































































































































































































    1 








    1 
    1 




















    1 









































































































































































    1 

























































































































































































































































































































    1 





    1 










    1 
    1 






















    1 



    1 

    1 


















    1 
    1 



























    1 





    1 




    1 
























































































    1 
























    1 



    1 





















































































































































































    1 






































































































































































































































































    1 



































































































































































































































    1 













    1 

















    1 
    1 











    1 













































    1 






























    1 


















    1 





















































































































































































































































































































































































































































































































































































    1 
    1 










    1 



    1 



















    1 















    1 

















































































    1 























    1 



    1 
















    1 
    1 


















    1 










    1 


    1 










    1 

    1 



















    1 



    1 


    1 





































































    1 
















    1 


    1 

    1 
    1 
























    1 




    1 



























































    1 


































































































































































































































































































































































































































































    1 





















    1 








































































    1 

























    1 


    1 




    1 
    1 





    1 



























    1 









    1 



    1 



















    1 



















    1 



    1 

    1 

    1 

    1 

    1 







    1 


    1 





    1 




    1 
    1 


    1 







    1 









    1 














    1 













    1 






    1 






    1 

    1 




    1 






    1 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
11112
11113
11114
11115
11116
11117
11118
11119
11120
11121
11122
11123
11124
11125
11126
11127
11128
11129
11130
11131
11132
11133
11134
11135
11136
11137
11138
11139
11140
11141
11142
11143
11144
11145
11146
11147
11148
11149
11150
11151
11152
11153
11154
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165
11166
11167
11168
11169
11170
11171
11172
11173
11174
11175
11176
11177
11178
11179
11180
11181
11182
11183
11184
11185
11186
11187
11188
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
11202
11203
11204
11205
11206
11207
11208
11209
11210
11211
11212
11213
11214
11215
11216
11217
11218
11219
11220
11221
11222
11223
11224
11225
11226
11227
11228
11229
11230
11231
11232
11233
11234
11235
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245
11246
11247
11248
11249
11250
11251
11252
11253
11254
11255
11256
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343
11344
11345
11346
11347
11348
11349
11350
11351
11352
11353
11354
11355
11356
11357
11358
11359
11360
11361
11362
11363
11364
11365
11366
11367
11368
11369
11370
11371
11372
11373
11374
11375
11376
11377
11378
11379
11380
11381
11382
11383
11384
11385
11386
11387
11388
11389
11390
11391
11392
11393
11394
11395
11396
11397
11398
11399
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409
11410
11411
11412
11413
11414
11415
11416
11417
11418
11419
11420
11421
11422
11423
11424
11425
11426
11427
11428
11429
11430
11431
11432
11433
11434
11435
11436
11437
11438
11439
11440
11441
11442
11443
11444
11445
11446
11447
11448
11449
11450
11451
11452
11453
11454
11455
11456
11457
11458
11459
11460
11461
11462
11463
11464
11465
11466
11467
11468
11469
11470
11471
11472
11473
11474
11475
11476
11477
11478
11479
11480
11481
11482
11483
11484
11485
11486
11487
11488
11489
11490
11491
11492
11493
11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510
11511
11512
11513
11514
11515
11516
11517
11518
11519
11520
11521
11522
11523
11524
11525
11526
11527
11528
11529
11530
11531
11532
11533
11534
11535
11536
11537
11538
11539
11540
11541
11542
11543
11544
11545
11546
11547
11548
11549
11550
11551
11552
11553
11554
11555
11556
11557
11558
11559
11560
11561
11562
11563
11564
11565
11566
11567
11568
11569
11570
11571
11572
11573
11574
11575
11576
11577
11578
11579
11580
11581
11582
11583
11584
11585
11586
11587
11588
11589
11590
11591
11592
11593
11594
11595
11596
11597
11598
11599
11600
11601
11602
11603
11604
11605
11606
11607
11608
11609
11610
11611
11612
11613
11614
11615
11616
11617
11618
11619
11620
11621
11622
11623
11624
11625
11626
11627
11628
11629
11630
11631
11632
11633
11634
11635
11636
11637
11638
11639
11640
11641
11642
11643
11644
11645
11646
11647
11648
11649
11650
11651
11652
11653
11654
11655
11656
11657
11658
11659
11660
11661
11662
11663
11664
11665
11666
11667
11668
11669
11670
11671
11672
11673
11674
11675
11676
11677
11678
11679
11680
11681
11682
11683
11684
11685
11686
11687
11688
11689
11690
11691
11692
11693
11694
11695
11696
11697
11698
11699
11700
11701
11702
11703
11704
11705
11706
11707
11708
11709
11710
11711
11712
11713
11714
11715
11716
11717
11718
11719
11720
11721
11722
11723
11724
11725
11726
11727
11728
11729
11730
11731
11732
11733
11734
11735
11736
11737
11738
11739
11740
11741
11742
11743
11744
11745
11746
11747
11748
11749
11750
11751
11752
11753
11754
11755
11756
11757
11758
11759
11760
11761
11762
11763
11764
11765
11766
11767
11768
11769
11770
11771
11772
11773
11774
11775
11776
11777
11778
11779
11780
11781
11782
11783
11784
11785
11786
11787
11788
11789
11790
11791
11792
11793
11794
11795
11796
11797
11798
11799
11800
11801
11802
11803
11804
11805
11806
11807
11808
11809
11810
11811
11812
11813
11814
11815
11816
11817
11818
11819
11820
11821
11822
11823
11824
11825
11826
11827
11828
11829
11830
11831
11832
11833
11834
11835
11836
11837
11838
11839
11840
11841
11842
11843
11844
11845
11846
11847
11848
11849
11850
11851
11852
11853
11854
11855
11856
11857
11858
11859
11860
11861
11862
11863
11864
11865
11866
11867
11868
11869
11870
11871
11872
11873
11874
11875
11876
11877
11878
11879
11880
11881
11882
11883
11884
11885
11886
11887
11888
11889
11890
11891
11892
11893
11894
11895
11896
11897
11898
11899
11900
11901
11902
11903
11904
11905
11906
11907
11908
11909
11910
11911
11912
11913
11914
11915
11916
11917
11918
11919
11920
11921
11922
11923
11924
11925
11926
11927
11928
11929
11930
11931
11932
11933
11934
11935
11936
11937
11938
11939
11940
11941
11942
11943
11944
11945
11946
11947
11948
11949
11950
11951
11952
11953
11954
11955
11956
11957
11958
11959
11960
11961
11962
11963
11964
11965
11966
11967
11968
11969
11970
11971
11972
11973
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985
11986
11987
11988
11989
11990
11991
11992
11993
11994
11995
11996
11997
11998
11999
12000
12001
12002
12003
12004
12005
12006
12007
12008
12009
12010
12011
12012
12013
12014
12015
12016
12017
12018
12019
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031
12032
12033
12034
12035
12036
12037
12038
12039
12040
12041
12042
12043
12044
12045
12046
12047
12048
12049
12050
12051
12052
12053
12054
12055
12056
12057
12058
12059
12060
12061
12062
12063
12064
12065
12066
12067
12068
12069
12070
12071
12072
12073
12074
12075
12076
12077
12078
12079
12080
12081
12082
12083
12084
12085
12086
12087
12088
12089
12090
12091
12092
12093
12094
12095
12096
12097
12098
12099
12100
12101
12102
12103
12104
12105
12106
12107
12108
12109
12110
12111
12112
12113
12114
12115
12116
12117
12118
12119
12120
12121
12122
12123
12124
12125
12126
12127
12128
12129
12130
12131
12132
12133
12134
12135
12136
12137
12138
12139
12140
12141
12142
12143
12144
12145
12146
12147
12148
12149
12150
12151
12152
12153
12154
12155
12156
12157
12158
12159
12160
12161
12162
12163
12164
12165
12166
12167
12168
12169
12170
12171
12172
12173
12174
12175
12176
12177
12178
12179
12180
12181
12182
12183
12184
12185
12186
12187
12188
12189
12190
12191
12192
12193
12194
12195
12196
12197
12198
12199
12200
12201
12202
12203
12204
12205
12206
12207
12208
12209
12210
12211
12212
12213
12214
12215
12216
12217
12218
12219
12220
12221
12222
12223
12224
12225
12226
12227
12228
12229
12230
12231
12232
12233
12234
12235
12236
12237
12238
12239
12240
12241
12242
12243
12244
12245
12246
12247
12248
12249
12250
12251
12252
12253
12254
12255
12256
12257
12258
12259
12260
12261
12262
12263
12264
12265
12266
12267
12268
12269
12270
12271
12272
12273
12274
12275
12276
12277
12278
12279
12280
12281
12282
12283
12284
12285
12286
12287
12288
12289
12290
12291
12292
12293
12294
12295
12296
12297
12298
12299
12300
12301
12302
12303
12304
12305
12306
12307
12308
12309
12310
12311
12312
12313
12314
12315
12316
12317
12318
12319
12320
12321
12322
12323
12324
12325
12326
12327
12328
12329
12330
12331
12332
12333
12334
12335
12336
12337
12338
12339
12340
12341
12342
12343
12344
12345
12346
12347
12348
12349
12350
12351
12352
12353
12354
12355
12356
12357
12358
12359
12360
12361
12362
12363
12364
12365
12366
12367
12368
12369
12370
12371
12372
12373
12374
12375
12376
12377
12378
12379
12380
12381
12382
12383
12384
12385
12386
12387
12388
12389
12390
12391
12392
12393
12394
12395
12396
12397
12398
12399
12400
12401
12402
12403
12404
12405
12406
12407
12408
12409
12410
12411
12412
12413
12414
12415
12416
12417
12418
12419
12420
12421
12422
12423
12424
12425
12426
12427
12428
12429
12430
12431
12432
12433
12434
12435
12436
12437
12438
12439
12440
12441
12442
12443
12444
12445
12446
12447
12448
12449
12450
12451
12452
12453
12454
12455
12456
12457
12458
12459
12460
12461
12462
12463
12464
12465
12466
12467
12468
12469
12470
12471
12472
12473
12474
12475
12476
12477
12478
12479
12480
12481
12482
12483
12484
12485
12486
12487
12488
12489
12490
12491
12492
12493
12494
12495
12496
12497
12498
12499
12500
12501
12502
12503
12504
12505
12506
12507
12508
12509
12510
12511
12512
12513
12514
12515
12516
12517
12518
12519
12520
12521
12522
12523
12524
12525
12526
12527
12528
12529
12530
12531
12532
12533
12534
12535
12536
12537
12538
12539
12540
12541
12542
12543
12544
12545
12546
12547
12548
12549
12550
12551
12552
12553
12554
12555
12556
12557
12558
12559
12560
12561
12562
12563
12564
12565
12566
12567
12568
12569
12570
12571
12572
12573
12574
12575
12576
12577
12578
12579
12580
12581
12582
12583
12584
12585
12586
12587
12588
12589
12590
12591
12592
12593
12594
12595
12596
12597
12598
12599
12600
12601
12602
12603
12604
12605
12606
12607
12608
12609
12610
12611
12612
12613
12614
12615
12616
12617
12618
12619
12620
12621
12622
12623
12624
12625
12626
12627
12628
12629
12630
12631
12632
12633
12634
12635
12636
12637
12638
12639
12640
12641
12642
12643
12644
12645
12646
12647
12648
12649
12650
12651
12652
12653
12654
12655
12656
12657
12658
12659
12660
12661
12662
12663
12664
12665
12666
12667
12668
12669
12670
12671
12672
12673
12674
12675
12676
12677
12678
12679
12680
12681
12682
12683
12684
12685
12686
12687
12688
12689
12690
12691
12692
12693
12694
12695
12696
12697
12698
12699
12700
12701
12702
12703
12704
12705
12706
12707
12708
12709
12710
12711
12712
12713
12714
12715
12716
12717
12718
12719
12720
12721
12722
12723
12724
12725
12726
12727
12728
12729
12730
12731
12732
12733
12734
12735
12736
12737
12738
12739
12740
12741
12742
12743
12744
12745
12746
12747
12748
12749
12750
12751
12752
12753
12754
12755
12756
12757
12758
12759
12760
12761
12762
12763
12764
12765
12766
12767
12768
12769
12770
12771
12772
12773
12774
12775
12776
12777
12778
12779
12780
12781
12782
12783
12784
12785
12786
12787
12788
12789
12790
12791
12792
12793
12794
12795
12796
12797
12798
12799
12800
12801
12802
12803
12804
12805
12806
12807
12808
12809
12810
12811
12812
12813
12814
12815
12816
12817
12818
12819
12820
12821
12822
12823
12824
12825
12826
12827
12828
12829
12830
12831
12832
12833
12834
12835
12836
12837
12838
12839
12840
12841
12842
12843
12844
12845
12846
12847
12848
12849
12850
12851
12852
12853
12854
12855
12856
12857
12858
12859
12860
12861
12862
12863
12864
12865
12866
12867
12868
12869
12870
12871
12872
12873
12874
12875
12876
12877
12878
12879
12880
12881
12882
12883
12884
12885
12886
12887
12888
12889
12890
12891
12892
12893
12894
12895
12896
12897
12898
12899
12900
12901
12902
12903
12904
12905
12906
12907
12908
12909
12910
12911
12912
12913
12914
12915
12916
12917
12918
12919
12920
12921
12922
12923
12924
12925
12926
12927
12928
12929
12930
12931
12932
12933
12934
12935
12936
12937
12938
12939
12940
12941
12942
12943
12944
12945
12946
12947
12948
12949
12950
12951
12952
12953
12954
12955
12956
12957
12958
12959
12960
12961
12962
12963
12964
12965
12966
12967
12968
12969
12970
12971
12972
12973
12974
12975
12976
12977
12978
12979
12980
12981
12982
12983
12984
12985
12986
12987
12988
12989
12990
12991
12992
12993
12994
12995
12996
12997
12998
12999
13000
13001
13002
13003
13004
13005
13006
13007
13008
13009
13010
13011
13012
13013
13014
13015
13016
13017
13018
13019
13020
13021
13022
13023
13024
13025
13026
13027
13028
13029
13030
13031
13032
13033
13034
13035
13036
13037
13038
13039
13040
13041
13042
13043
13044
13045
13046
13047
13048
13049
13050
13051
13052
13053
13054
13055
13056
13057
13058
13059
13060
13061
13062
13063
13064
13065
13066
13067
13068
13069
13070
13071
13072
13073
13074
13075
13076
13077
13078
13079
13080
13081
13082
13083
13084
13085
13086
13087
13088
13089
13090
13091
13092
13093
13094
13095
13096
13097
13098
13099
13100
13101
13102
13103
13104
13105
13106
13107
13108
13109
13110
13111
13112
13113
13114
13115
13116
13117
13118
13119
13120
13121
13122
13123
13124
13125
13126
13127
13128
13129
13130
13131
13132
13133
13134
13135
13136
13137
13138
13139
13140
13141
13142
13143
13144
13145
13146
13147
13148
13149
13150
13151
13152
13153
13154
13155
13156
13157
13158
13159
13160
13161
13162
13163
13164
13165
13166
13167
13168
13169
13170
13171
13172
13173
13174
13175
13176
13177
13178
13179
13180
13181
13182
13183
13184
13185
13186
13187
13188
13189
13190
13191
13192
13193
13194
13195
13196
13197
13198
13199
13200
13201
13202
13203
13204
13205
13206
13207
13208
13209
13210
13211
13212
13213
13214
13215
13216
13217
13218
13219
13220
13221
13222
13223
13224
13225
13226
13227
13228
13229
13230
13231
13232
13233
13234
13235
13236
13237
13238
13239
13240
13241
13242
13243
13244
13245
13246
13247
13248
13249
13250
13251
13252
13253
13254
13255
13256
13257
13258
13259
13260
13261
13262
13263
13264
13265
13266
13267
13268
13269
13270
13271
13272
13273
13274
13275
13276
13277
13278
13279
13280
13281
13282
13283
13284
13285
13286
13287
13288
13289
13290
13291
13292
13293
13294
13295
13296
13297
13298
13299
13300
13301
13302
13303
13304
13305
13306
13307
13308
13309
13310
13311
13312
13313
13314
13315
13316
13317
13318
13319
13320
13321
13322
13323
13324
13325
13326
13327
13328
13329
13330
13331
13332
13333
13334
13335
13336
13337
13338
13339
13340
13341
13342
13343
13344
13345
13346
13347
13348
13349
13350
13351
13352
13353
13354
13355
13356
13357
13358
13359
13360
13361
13362
13363
13364
13365
13366
13367
13368
13369
13370
13371
13372
13373
13374
13375
13376
13377
13378
13379
13380
13381
13382
13383
13384
13385
13386
13387
13388
13389
13390
13391
13392
13393
13394
13395
13396
13397
13398
13399
13400
13401
13402
13403
13404
13405
13406
13407
13408
13409
13410
13411
13412
13413
13414
13415
13416
13417
13418
13419
13420
13421
13422
13423
13424
13425
13426
13427
13428
13429
13430
13431
13432
13433
13434
13435
13436
13437
13438
13439
13440
13441
13442
13443
13444
13445
13446
13447
13448
13449
13450
13451
13452
13453
13454
13455
13456
13457
13458
13459
13460
13461
13462
13463
13464
13465
13466
13467
13468
13469
13470
13471
13472
13473
13474
13475
13476
13477
13478
13479
13480
13481
13482
13483
13484
13485
13486
13487
13488
13489
13490
13491
13492
13493
13494
13495
13496
13497
13498
13499
13500
13501
13502
13503
13504
13505
13506
13507
13508
13509
13510
13511
13512
13513
13514
13515
13516
13517
13518
13519
13520
13521
13522
13523
13524
13525
13526
13527
13528
13529
13530
13531
13532
13533
13534
13535
13536
13537
13538
13539
13540
13541
13542
13543
13544
13545
13546
13547
13548
13549
13550
13551
13552
13553
13554
13555
13556
13557
13558
13559
13560
13561
13562
13563
13564
13565
13566
13567
13568
13569
13570
13571
13572
13573
13574
13575
13576
13577
13578
13579
13580
13581
13582
13583
13584
13585
13586
13587
13588
13589
13590
13591
13592
13593
13594
13595
13596
13597
13598
13599
13600
13601
13602
13603
13604
13605
13606
13607
13608
13609
13610
13611
13612
13613
13614
13615
13616
13617
13618
13619
13620
13621
13622
13623
13624
13625
13626
13627
13628
13629
13630
13631
13632
13633
13634
13635
13636
13637
13638
13639
13640
13641
13642
13643
13644
13645
13646
13647
13648
13649
13650
13651
13652
13653
13654
13655
13656
13657
13658
13659
13660
13661
13662
13663
13664
13665
13666
13667
13668
13669
13670
13671
13672
13673
13674
13675
13676
13677
13678
13679
13680
13681
13682
13683
13684
13685
13686
13687
13688
13689
13690
13691
13692
13693
13694
13695
13696
13697
13698
13699
13700
13701
13702
13703
13704
13705
13706
13707
13708
13709
13710
13711
13712
13713
13714
13715
13716
13717
13718
13719
13720
13721
13722
13723
13724
13725
13726
13727
13728
13729
13730
13731
13732
13733
13734
13735
13736
13737
13738
13739
13740
13741
13742
13743
13744
13745
13746
13747
13748
13749
13750
13751
13752
13753
13754
13755
13756
13757
13758
13759
13760
13761
13762
13763
13764
13765
13766
13767
13768
13769
13770
13771
13772
13773
13774
13775
13776
13777
13778
13779
13780
13781
13782
13783
13784
13785
13786
13787
13788
13789
13790
13791
13792
13793
13794
13795
13796
13797
13798
13799
13800
13801
13802
13803
13804
13805
13806
13807
13808
13809
13810
13811
13812
13813
13814
13815
13816
13817
13818
13819
13820
13821
13822
13823
13824
13825
13826
13827
13828
13829
13830
13831
13832
13833
13834
13835
13836
13837
13838
13839
13840
13841
13842
13843
13844
13845
13846
13847
13848
13849
13850
13851
13852
13853
13854
13855
13856
13857
13858
13859
13860
13861
13862
13863
13864
13865
13866
13867
13868
13869
13870
13871
13872
13873
13874
13875
13876
13877
13878
13879
13880
13881
13882
13883
13884
13885
13886
13887
13888
13889
13890
13891
13892
13893
13894
13895
13896
13897
13898
13899
13900
13901
13902
13903
13904
13905
13906
13907
13908
13909
13910
13911
13912
13913
13914
13915
13916
13917
13918
13919
13920
13921
13922
13923
13924
13925
13926
13927
13928
13929
13930
13931
13932
13933
13934
13935
13936
13937
13938
13939
13940
13941
13942
13943
13944
13945
13946
13947
13948
13949
13950
13951
13952
13953
13954
13955
13956
13957
13958
13959
13960
13961
13962
13963
13964
13965
13966
13967
13968
13969
13970
13971
13972
13973
13974
13975
13976
13977
13978
13979
13980
13981
13982
13983
13984
13985
13986
13987
13988
13989
13990
13991
13992
13993
13994
13995
13996
13997
13998
13999
14000
14001
14002
14003
14004
14005
14006
14007
14008
14009
14010
14011
14012
14013
14014
14015
14016
14017
14018
14019
14020
14021
14022
14023
14024
14025
14026
14027
14028
14029
14030
14031
14032
14033
14034
14035
14036
14037
14038
14039
14040
14041
14042
14043
14044
14045
14046
14047
14048
14049
14050
14051
14052
14053
14054
14055
14056
14057
14058
14059
14060
14061
14062
14063
14064
14065
14066
14067
14068
14069
14070
14071
14072
14073
14074
14075
14076
14077
14078
14079
14080
14081
14082
14083
14084
14085
14086
14087
14088
14089
14090
14091
14092
14093
14094
14095
14096
14097
14098
14099
14100
14101
14102
14103
14104
14105
14106
14107
14108
14109
14110
14111
14112
14113
14114
14115
14116
14117
14118
14119
14120
14121
14122
14123
14124
14125
14126
14127
14128
14129
14130
14131
14132
14133
14134
14135
14136
14137
14138
14139
14140
14141
14142
14143
14144
14145
14146
14147
14148
14149
14150
14151
14152
14153
14154
14155
14156
14157
14158
14159
14160
14161
14162
14163
14164
14165
14166
14167
14168
14169
14170
14171
14172
14173
14174
14175
14176
14177
14178
14179
14180
14181
14182
14183
14184
14185
14186
14187
14188
14189
14190
14191
14192
14193
14194
14195
14196
14197
14198
14199
14200
14201
14202
14203
14204
14205
14206
14207
14208
14209
14210
14211
14212
14213
14214
14215
14216
14217
14218
14219
14220
14221
14222
14223
14224
14225
14226
14227
14228
14229
14230
14231
14232
14233
14234
14235
14236
14237
14238
14239
14240
14241
14242
14243
14244
14245
14246
14247
14248
14249
14250
14251
14252
14253
14254
14255
14256
14257
14258
14259
14260
14261
14262
14263
14264
14265
14266
14267
14268
14269
14270
14271
14272
14273
14274
14275
14276
14277
14278
14279
14280
14281
14282
14283
14284
14285
14286
14287
14288
14289
14290
14291
14292
14293
14294
14295
14296
14297
14298
14299
14300
14301
14302
14303
14304
14305
14306
14307
14308
14309
14310
14311
14312
14313
14314
14315
14316
14317
14318
14319
14320
14321
14322
14323
14324
14325
14326
14327
14328
14329
14330
14331
14332
14333
14334
14335
14336
14337
14338
14339
14340
14341
14342
14343
14344
14345
14346
14347
14348
14349
14350
14351
14352
14353
14354
14355
14356
14357
14358
14359
14360
14361
14362
14363
14364
14365
14366
14367
14368
14369
14370
14371
14372
14373
14374
14375
14376
14377
14378
14379
14380
14381
14382
14383
14384
14385
14386
14387
14388
14389
14390
14391
14392
14393
14394
14395
14396
14397
14398
14399
14400
14401
14402
14403
14404
14405
14406
14407
14408
14409
14410
14411
14412
14413
14414
14415
14416
14417
14418
14419
14420
14421
14422
14423
14424
14425
14426
14427
14428
14429
14430
14431
14432
14433
14434
14435
14436
14437
14438
14439
14440
14441
14442
14443
14444
14445
14446
14447
14448
14449
14450
14451
14452
14453
14454
14455
14456
14457
14458
14459
14460
14461
14462
14463
14464
14465
14466
14467
14468
14469
14470
14471
14472
14473
14474
14475
14476
14477
14478
14479
14480
14481
14482
14483
14484
14485
14486
14487
14488
14489
14490
14491
14492
14493
14494
14495
14496
14497
14498
14499
14500
14501
14502
14503
14504
14505
14506
14507
14508
14509
14510
14511
14512
14513
14514
14515
14516
14517
14518
14519
14520
14521
14522
14523
14524
14525
14526
14527
14528
14529
14530
14531
14532
14533
14534
14535
14536
14537
14538
14539
14540
14541
14542
14543
14544
14545
14546
14547
14548
14549
14550
14551
14552
14553
14554
14555
14556
14557
14558
14559
14560
14561
14562
14563
14564
14565
14566
14567
14568
14569
14570
14571
14572
14573
14574
14575
14576
14577
14578
14579
14580
14581
14582
14583
14584
14585
14586
14587
14588
14589
14590
14591
14592
14593
14594
14595
14596
14597
14598
14599
14600
14601
14602
14603
14604
14605
14606
14607
14608
14609
14610
14611
14612
14613
14614
14615
14616
14617
14618
14619
14620
14621
14622
14623
14624
14625
14626
14627
14628
14629
14630
14631
14632
14633
14634
14635
14636
14637
14638
14639
14640
14641
14642
14643
14644
14645
14646
14647
14648
14649
14650
14651
14652
14653
14654
14655
14656
14657
14658
14659
14660
14661
14662
14663
14664
14665
14666
14667
14668
14669
14670
14671
14672
14673
14674
14675
14676
14677
14678
14679
14680
14681
14682
14683
14684
14685
14686
14687
14688
14689
14690
14691
14692
14693
14694
14695
14696
14697
14698
14699
14700
14701
14702
14703
14704
14705
14706
14707
14708
14709
14710
14711
14712
14713
14714
14715
14716
14717
14718
14719
14720
14721
14722
14723
14724
14725
14726
14727
14728
14729
14730
14731
14732
14733
14734
14735
14736
14737
14738
14739
14740
14741
14742
14743
14744
14745
14746
14747
14748
14749
14750
14751
14752
14753
14754
14755
14756
14757
14758
14759
14760
14761
14762
14763
14764
14765
14766
14767
14768
14769
14770
14771
14772
14773
14774
14775
14776
14777
14778
14779
14780
14781
14782
14783
14784
14785
14786
14787
14788
14789
14790
14791
14792
14793
14794
14795
14796
14797
14798
14799
14800
14801
14802
14803
14804
14805
14806
14807
14808
14809
14810
14811
14812
14813
14814
14815
14816
14817
14818
14819
14820
14821
14822
14823
14824
14825
14826
14827
14828
14829
14830
14831
14832
14833
14834
14835
14836
14837
14838
14839
14840
14841
14842
14843
14844
14845
14846
14847
14848
14849
14850
14851
14852
14853
14854
14855
14856
14857
14858
14859
14860
14861
14862
14863
14864
14865
14866
14867
14868
14869
14870
14871
14872
14873
14874
14875
14876
14877
14878
14879
14880
14881
14882
14883
14884
14885
14886
14887
14888
14889
14890
14891
14892
14893
14894
14895
14896
14897
14898
14899
14900
14901
14902
14903
14904
14905
14906
14907
14908
14909
14910
14911
14912
14913
14914
14915
14916
14917
14918
14919
14920
14921
14922
14923
14924
14925
14926
14927
14928
14929
14930
14931
14932
14933
14934
14935
14936
14937
14938
14939
14940
14941
14942
14943
14944
14945
14946
14947
14948
14949
14950
14951
14952
14953
14954
14955
14956
14957
14958
14959
14960
14961
14962
14963
14964
14965
14966
14967
14968
14969
14970
14971
14972
14973
14974
14975
14976
14977
14978
14979
14980
14981
14982
14983
14984
14985
14986
14987
14988
14989
14990
14991
14992
14993
14994
14995
14996
14997
14998
14999
15000
15001
15002
15003
15004
15005
15006
15007
15008
15009
15010
15011
15012
15013
15014
15015
15016
15017
15018
15019
15020
15021
15022
15023
15024
15025
15026
15027
15028
15029
15030
15031
15032
15033
15034
15035
15036
15037
15038
15039
15040
15041
15042
15043
15044
15045
15046
15047
15048
15049
15050
15051
15052
15053
15054
15055
15056
15057
15058
15059
15060
15061
15062
15063
15064
15065
15066
15067
15068
15069
15070
15071
15072
15073
15074
15075
15076
15077
15078
15079
15080
15081
15082
15083
15084
15085
15086
15087
15088
15089
15090
15091
15092
15093
15094
15095
15096
15097
15098
15099
15100
15101
15102
15103
15104
15105
15106
15107
15108
15109
15110
15111
15112
15113
15114
15115
15116
15117
15118
15119
15120
15121
15122
15123
15124
15125
15126
15127
15128
15129
15130
15131
15132
15133
15134
15135
15136
15137
15138
15139
15140
15141
15142
15143
15144
15145
15146
15147
15148
15149
15150
15151
15152
15153
15154
15155
15156
15157
15158
15159
15160
15161
15162
15163
15164
15165
15166
15167
15168
15169
15170
15171
15172
15173
15174
15175
15176
15177
15178
15179
15180
15181
15182
15183
15184
15185
15186
15187
15188
15189
15190
15191
15192
15193
15194
15195
15196
15197
15198
15199
15200
15201
15202
15203
15204
15205
15206
15207
15208
15209
15210
15211
15212
15213
15214
15215
15216
15217
15218
15219
15220
15221
15222
15223
15224
15225
15226
15227
15228
15229
15230
15231
15232
15233
15234
15235
15236
15237
15238
15239
15240
15241
15242
15243
15244
15245
15246
15247
15248
15249
15250
15251
15252
15253
15254
15255
15256
15257
15258
15259
15260
15261
15262
15263
15264
15265
15266
15267
15268
15269
15270
15271
15272
15273
15274
15275
15276
15277
15278
15279
15280
15281
15282
15283
15284
15285
15286
15287
15288
15289
15290
15291
15292
15293
15294
15295
15296
15297
15298
15299
15300
15301
15302
15303
15304
15305
15306
15307
15308
15309
15310
15311
15312
15313
15314
15315
15316
15317
15318
15319
15320
15321
15322
15323
15324
15325
15326
15327
15328
15329
15330
15331
15332
15333
15334
15335
15336
15337
15338
15339
15340
15341
15342
15343
15344
15345
15346
15347
15348
15349
15350
15351
15352
15353
15354
15355
15356
15357
15358
15359
15360
15361
15362
15363
15364
15365
15366
15367
15368
15369
15370
15371
15372
15373
15374
15375
15376
15377
15378
15379
15380
15381
15382
15383
15384
15385
15386
15387
15388
15389
15390
15391
15392
15393
15394
15395
15396
15397
15398
15399
15400
15401
15402
15403
15404
15405
15406
15407
15408
15409
15410
15411
15412
15413
15414
15415
15416
15417
15418
15419
15420
15421
15422
15423
15424
15425
15426
15427
15428
15429
15430
15431
15432
15433
15434
15435
15436
15437
15438
15439
15440
15441
15442
15443
15444
15445
15446
15447
15448
15449
15450
15451
15452
15453
15454
15455
15456
15457
15458
15459
15460
15461
15462
15463
15464
15465
15466
15467
15468
15469
15470
15471
15472
15473
15474
15475
15476
15477
15478
15479
15480
15481
15482
15483
15484
15485
15486
15487
15488
15489
15490
15491
15492
15493
15494
15495
15496
15497
15498
15499
15500
15501
15502
15503
15504
15505
15506
15507
15508
15509
15510
15511
15512
15513
15514
15515
15516
15517
15518
15519
15520
15521
15522
15523
15524
15525
15526
15527
15528
15529
15530
15531
15532
15533
15534
15535
15536
15537
15538
15539
15540
15541
15542
15543
15544
15545
15546
15547
15548
15549
15550
15551
15552
15553
15554
15555
15556
15557
15558
15559
15560
15561
15562
15563
15564
15565
15566
15567
15568
15569
15570
15571
15572
15573
15574
15575
15576
15577
15578
15579
15580
15581
15582
15583
15584
15585
15586
15587
15588
15589
15590
15591
15592
15593
15594
15595
15596
15597
15598
15599
15600
15601
15602
15603
15604
15605
15606
15607
15608
15609
15610
15611
15612
15613
15614
15615
15616
15617
15618
15619
15620
15621
15622
15623
15624
15625
15626
15627
15628
15629
15630
15631
15632
15633
15634
15635
15636
15637
15638
15639
15640
15641
15642
15643
15644
15645
15646
15647
15648
15649
15650
15651
15652
15653
15654
15655
15656
15657
15658
15659
15660
15661
15662
15663
15664
15665
15666
15667
15668
15669
15670
15671
15672
15673
15674
15675
15676
15677
15678
15679
15680
15681
15682
15683
15684
15685
15686
15687
15688
15689
15690
15691
15692
15693
15694
15695
15696
15697
15698
15699
15700
15701
15702
15703
15704
15705
15706
15707
15708
15709
15710
15711
15712
15713
15714
15715
15716
15717
15718
15719
15720
15721
15722
15723
15724
15725
15726
15727
15728
15729
15730
15731
15732
15733
15734
15735
15736
15737
15738
15739
15740
15741
15742
15743
15744
15745
15746
15747
15748
15749
15750
15751
15752
15753
15754
15755
15756
15757
15758
15759
15760
15761
15762
15763
15764
15765
15766
15767
15768
15769
15770
15771
15772
15773
15774
15775
15776
15777
15778
15779
15780
15781
15782
15783
15784
15785
15786
15787
15788
15789
15790
15791
15792
15793
15794
15795
15796
15797
15798
15799
15800
15801
15802
15803
15804
15805
15806
15807
15808
15809
15810
15811
15812
15813
15814
15815
15816
15817
15818
15819
15820
15821
15822
15823
15824
15825
15826
15827
15828
15829
15830
15831
15832
15833
15834
15835
15836
15837
15838
15839
15840
15841
15842
15843
15844
15845
15846
15847
15848
15849
15850
15851
15852
15853
15854
15855
15856
15857
15858
15859
15860
15861
15862
15863
15864
15865
15866
15867
15868
15869
15870
15871
15872
15873
15874
15875
15876
15877
15878
15879
15880
15881
15882
15883
15884
15885
15886
15887
15888
15889
15890
15891
15892
15893
15894
15895
15896
15897
15898
15899
15900
15901
15902
15903
15904
15905
15906
15907
15908
15909
15910
15911
15912
15913
15914
15915
15916
15917
15918
15919
15920
15921
15922
15923
15924
15925
15926
15927
15928
15929
15930
15931
15932
15933
15934
15935
15936
15937
15938
15939
15940
15941
15942
15943
15944
15945
15946
15947
15948
15949
15950
15951
15952
15953
15954
15955
15956
15957
15958
15959
15960
15961
15962
15963
15964
15965
15966
15967
15968
15969
15970
15971
15972
15973
15974
15975
15976
15977
15978
15979
15980
15981
15982
15983
15984
15985
15986
15987
15988
15989
15990
15991
15992
15993
15994
15995
15996
15997
15998
15999
16000
16001
16002
16003
16004
16005
16006
16007
16008
16009
16010
16011
16012
16013
16014
16015
16016
16017
16018
16019
16020
16021
16022
16023
16024
16025
16026
16027
16028
16029
16030
16031
16032
16033
16034
16035
16036
16037
16038
16039
16040
16041
16042
16043
16044
16045
16046
16047
16048
16049
16050
16051
16052
16053
16054
16055
16056
16057
16058
16059
16060
16061
16062
16063
16064
16065
16066
16067
16068
16069
16070
16071
16072
16073
16074
16075
16076
16077
16078
16079
16080
16081
16082
16083
16084
16085
16086
16087
16088
16089
16090
16091
16092
16093
16094
16095
16096
16097
16098
16099
16100
16101
16102
16103
16104
16105
16106
16107
16108
16109
16110
16111
16112
16113
16114
16115
16116
16117
16118
16119
16120
16121
16122
16123
16124
16125
16126
16127
16128
16129
16130
16131
16132
16133
16134
16135
16136
16137
16138
16139
16140
16141
16142
16143
16144
16145
16146
16147
16148
16149
16150
16151
16152
16153
16154
16155
16156
16157
16158
16159
16160
16161
16162
16163
16164
16165
16166
16167
16168
16169
16170
16171
16172
16173
16174
16175
16176
16177
16178
16179
16180
16181
16182
16183
16184
16185
16186
16187
16188
16189
16190
16191
16192
16193
16194
16195
16196
16197
16198
16199
16200
16201
16202
16203
16204
16205
16206
16207
16208
16209
16210
16211
16212
16213
16214
16215
16216
16217
16218
16219
16220
16221
16222
16223
16224
16225
16226
16227
16228
16229
16230
16231
16232
16233
16234
16235
16236
16237
16238
16239
16240
16241
16242
16243
16244
16245
16246
16247
16248
16249
16250
16251
16252
16253
16254
16255
16256
16257
16258
16259
16260
16261
16262
16263
16264
16265
16266
16267
16268
16269
16270
16271
16272
16273
16274
16275
16276
16277
16278
16279
16280
16281
16282
16283
16284
16285
16286
16287
16288
16289
16290
16291
16292
16293
16294
16295
16296
16297
16298
16299
16300
16301
16302
16303
16304
16305
16306
16307
16308
16309
16310
16311
16312
16313
16314
16315
16316
16317
16318
16319
16320
16321
16322
16323
16324
16325
16326
16327
16328
16329
16330
16331
16332
16333
16334
16335
16336
16337
16338
16339
16340
16341
16342
16343
16344
16345
16346
16347
16348
16349
16350
16351
16352
16353
16354
16355
16356
16357
16358
16359
16360
16361
16362
16363
16364
16365
16366
16367
16368
16369
16370
16371
16372
16373
16374
16375
16376
16377
16378
16379
16380
16381
16382
16383
16384
16385
16386
16387
16388
16389
16390
16391
16392
16393
16394
16395
16396
16397
16398
16399
16400
16401
16402
16403
16404
16405
16406
16407
16408
16409
16410
16411
16412
16413
16414
16415
16416
16417
16418
16419
16420
16421
16422
16423
16424
16425
16426
16427
16428
16429
16430
16431
16432
16433
16434
16435
16436
16437
16438
16439
16440
16441
16442
16443
16444
16445
16446
16447
16448
16449
16450
16451
16452
16453
16454
16455
16456
16457
16458
16459
16460
16461
16462
16463
16464
16465
16466
16467
16468
16469
16470
16471
16472
16473
16474
16475
16476
16477
16478
16479
16480
16481
16482
16483
16484
16485
16486
16487
16488
16489
16490
16491
16492
16493
16494
16495
16496
16497
16498
16499
16500
16501
16502
16503
16504
16505
16506
16507
16508
16509
16510
16511
16512
16513
16514
16515
16516
16517
16518
16519
16520
16521
16522
16523
16524
16525
16526
16527
16528
16529
16530
16531
16532
16533
16534
16535
16536
16537
16538
16539
16540
16541
16542
16543
16544
16545
16546
16547
16548
16549
16550
16551
16552
16553
16554
16555
16556
16557
16558
16559
16560
16561
16562
16563
16564
16565
16566
16567
16568
16569
16570
16571
16572
16573
16574
16575
16576
16577
16578
16579
16580
16581
16582
16583
16584
16585
16586
16587
16588
16589
16590
16591
16592
16593
16594
16595
16596
16597
16598
16599
16600
16601
16602
16603
16604
16605
16606
16607
16608
16609
16610
16611
16612
16613
16614
16615
16616
16617
16618
16619
16620
16621
16622
16623
16624
16625
16626
16627
16628
16629
16630
16631
16632
16633
16634
16635
16636
16637
16638
16639
16640
16641
16642
16643
16644
16645
16646
16647
16648
16649
16650
16651
16652
16653
16654
16655
16656
16657
16658
16659
16660
16661
16662
16663
16664
16665
16666
16667
16668
16669
16670
16671
16672
16673
16674
16675
16676
16677
16678
16679
16680
16681
16682
16683
16684
16685
16686
16687
16688
16689
16690
16691
16692
16693
16694
16695
16696
16697
16698
16699
16700
16701
16702
16703
16704
16705
16706
16707
16708
16709
16710
16711
16712
16713
16714
16715
16716
16717
16718
16719
16720
16721
16722
16723
16724
16725
16726
16727
16728
16729
16730
16731
16732
16733
16734
16735
16736
16737
16738
16739
16740
16741
16742
16743
16744
16745
16746
16747
16748
16749
16750
16751
16752
16753
16754
16755
16756
16757
16758
16759
16760
16761
16762
16763
16764
16765
16766
16767
16768
16769
16770
16771
16772
16773
16774
16775
16776
16777
16778
16779
16780
16781
16782
16783
16784
16785
16786
16787
16788
16789
16790
16791
16792
16793
16794
16795
16796
16797
16798
16799
16800
16801
16802
16803
16804
16805
16806
16807
16808
16809
16810
16811
16812
16813
16814
16815
16816
16817
16818
16819
16820
16821
16822
16823
16824
16825
16826
16827
16828
16829
16830
16831
16832
16833
16834
16835
16836
16837
16838
16839
16840
16841
16842
16843
16844
16845
16846
16847
16848
16849
16850
16851
16852
16853
16854
16855
16856
16857
16858
16859
16860
16861
16862
16863
16864
16865
16866
16867
16868
16869
16870
16871
16872
16873
16874
16875
16876
16877
16878
16879
16880
16881
16882
16883
16884
16885
16886
16887
16888
16889
16890
16891
16892
16893
16894
16895
16896
16897
16898
16899
16900
16901
16902
16903
16904
16905
16906
16907
16908
16909
16910
16911
16912
16913
16914
16915
16916
16917
16918
16919
16920
16921
16922
16923
16924
16925
16926
16927
16928
16929
16930
16931
16932
16933
16934
16935
16936
16937
16938
16939
16940
16941
16942
16943
16944
16945
16946
16947
16948
16949
16950
16951
16952
16953
16954
16955
16956
16957
16958
16959
16960
16961
16962
16963
16964
16965
16966
16967
16968
16969
16970
16971
16972
16973
16974
16975
16976
16977
16978
16979
16980
16981
16982
16983
16984
16985
16986
16987
16988
16989
16990
16991
16992
16993
16994
16995
16996
16997
16998
16999
17000
17001
17002
17003
17004
17005
17006
17007
17008
17009
17010
17011
17012
17013
17014
17015
17016
17017
17018
17019
17020
17021
17022
17023
17024
17025
17026
17027
17028
17029
17030
17031
17032
17033
17034
17035
17036
17037
17038
17039
17040
17041
17042
17043
17044
17045
17046
17047
17048
17049
17050
17051
17052
17053
17054
17055
17056
17057
17058
17059
17060
17061
17062
17063
17064
17065
17066
17067
17068
17069
17070
17071
17072
17073
17074
17075
17076
17077
17078
17079
17080
17081
17082
17083
17084
17085
17086
17087
17088
17089
17090
17091
17092
17093
17094
17095
17096
17097
17098
17099
17100
17101
17102
17103
17104
17105
17106
17107
17108
17109
17110
17111
17112
17113
17114
17115
17116
17117
17118
17119
17120
17121
17122
17123
17124
17125
17126
17127
17128
17129
17130
17131
17132
17133
17134
17135
17136
17137
17138
17139
17140
17141
17142
17143
17144
17145
17146
17147
17148
17149
17150
17151
17152
17153
17154
17155
17156
17157
17158
17159
17160
17161
17162
17163
17164
17165
17166
17167
17168
17169
17170
17171
17172
17173
17174
17175
17176
17177
17178
17179
17180
17181
17182
17183
17184
17185
17186
17187
17188
17189
17190
17191
17192
17193
17194
17195
17196
17197
17198
17199
17200
17201
17202
17203
17204
17205
17206
17207
17208
17209
17210
17211
17212
17213
17214
17215
17216
17217
17218
17219
17220
17221
17222
17223
17224
17225
17226
17227
17228
17229
17230
17231
17232
17233
17234
17235
17236
17237
17238
17239
17240
17241
17242
17243
17244
17245
17246
17247
17248
17249
17250
17251
17252
17253
17254
17255
17256
17257
17258
17259
17260
17261
17262
17263
17264
17265
17266
17267
17268
17269
17270
17271
17272
17273
17274
17275
17276
17277
17278
17279
17280
17281
17282
17283
17284
17285
17286
17287
17288
17289
17290
17291
17292
17293
17294
17295
17296
17297
17298
17299
17300
17301
17302
17303
17304
17305
17306
17307
17308
17309
17310
17311
17312
17313
17314
17315
17316
17317
17318
17319
17320
17321
17322
17323
17324
17325
17326
17327
17328
17329
17330
17331
17332
17333
17334
17335
17336
17337
17338
17339
17340
17341
17342
17343
17344
17345
17346
17347
17348
17349
17350
17351
17352
17353
17354
17355
17356
17357
17358
17359
17360
17361
17362
17363
17364
17365
17366
17367
17368
17369
17370
17371
17372
17373
17374
17375
17376
17377
17378
17379
17380
17381
17382
17383
17384
17385
17386
17387
17388
17389
17390
17391
17392
17393
17394
17395
17396
17397
17398
17399
17400
17401
17402
17403
17404
17405
17406
17407
17408
17409
17410
17411
17412
17413
17414
17415
17416
17417
17418
17419
17420
17421
17422
17423
17424
17425
17426
17427
17428
17429
17430
17431
17432
17433
17434
17435
17436
17437
17438
17439
17440
17441
17442
17443
17444
17445
17446
17447
17448
17449
17450
17451
17452
17453
17454
17455
17456
17457
17458
17459
17460
17461
17462
17463
17464
17465
17466
17467
17468
17469
17470
17471
17472
17473
17474
17475
17476
17477
17478
17479
17480
17481
17482
17483
17484
17485
17486
17487
17488
17489
17490
17491
17492
17493
17494
17495
17496
17497
17498
17499
17500
17501
17502
17503
17504
17505
17506
17507
17508
17509
17510
17511
17512
17513
17514
17515
17516
17517
17518
17519
17520
17521
17522
17523
17524
17525
17526
17527
17528
17529
17530
17531
17532
17533
17534
17535
17536
17537
17538
17539
17540
17541
17542
17543
17544
17545
17546
17547
17548
17549
17550
17551
17552
17553
17554
17555
17556
17557
17558
17559
17560
17561
17562
17563
17564
17565
17566
17567
17568
17569
17570
17571
17572
17573
17574
17575
17576
17577
17578
17579
17580
17581
17582
17583
17584
17585
17586
17587
17588
17589
17590
17591
17592
17593
17594
17595
17596
17597
17598
17599
17600
17601
17602
17603
17604
17605
17606
17607
17608
17609
17610
17611
17612
17613
17614
17615
17616
17617
17618
17619
17620
17621
17622
17623
17624
17625
17626
17627
17628
17629
17630
17631
17632
17633
17634
17635
17636
17637
17638
17639
17640
17641
17642
17643
17644
17645
17646
17647
17648
17649
17650
17651
17652
17653
17654
17655
17656
17657
17658
17659
17660
17661
17662
17663
17664
17665
17666
17667
17668
17669
17670
17671
17672
17673
17674
17675
17676
17677
17678
17679
17680
17681
17682
17683
17684
17685
17686
17687
17688
17689
17690
17691
17692
17693
17694
17695
17696
17697
17698
17699
17700
17701
17702
17703
17704
17705
17706
17707
17708
17709
17710
17711
17712
17713
17714
17715
17716
17717
17718
17719
17720
17721
17722
17723
17724
17725
17726
17727
17728
17729
17730
17731
17732
17733
17734
17735
17736
17737
17738
17739
17740
17741
17742
17743
17744
17745
17746
17747
17748
17749
17750
17751
17752
17753
17754
17755
17756
17757
17758
17759
17760
17761
17762
17763
17764
17765
17766
17767
17768
17769
17770
17771
17772
17773
17774
17775
17776
17777
17778
17779
17780
17781
17782
17783
17784
17785
17786
17787
17788
17789
17790
17791
17792
17793
17794
17795
17796
17797
17798
17799
17800
17801
17802
17803
17804
17805
17806
17807
17808
17809
17810
17811
17812
17813
17814
17815
17816
17817
17818
17819
17820
17821
17822
17823
17824
17825
17826
17827
17828
17829
17830
17831
17832
17833
17834
17835
17836
17837
17838
17839
17840
17841
17842
17843
17844
17845
17846
17847
17848
17849
17850
17851
17852
17853
17854
17855
17856
17857
17858
17859
17860
17861
17862
17863
17864
17865
17866
17867
17868
17869
17870
17871
17872
17873
17874
17875
17876
17877
17878
17879
17880
17881
17882
17883
17884
17885
17886
17887
17888
17889
17890
17891
17892
17893
17894
17895
17896
17897
17898
17899
17900
17901
17902
17903
17904
17905
17906
17907
17908
17909
17910
17911
17912
17913
17914
17915
17916
17917
17918
17919
17920
17921
17922
17923
17924
17925
17926
17927
17928
17929
17930
17931
17932
17933
17934
17935
17936
17937
17938
17939
17940
17941
17942
17943
17944
17945
17946
17947
17948
17949
17950
17951
17952
17953
17954
17955
17956
17957
17958
17959
17960
17961
17962
17963
17964
17965
17966
17967
17968
17969
17970
17971
17972
17973
17974
17975
17976
17977
17978
17979
17980
17981
17982
17983
17984
17985
17986
17987
17988
17989
17990
17991
17992
17993
17994
17995
17996
17997
17998
17999
18000
18001
18002
18003
18004
18005
18006
18007
18008
18009
18010
18011
18012
18013
18014
18015
18016
18017
18018
18019
18020
18021
18022
18023
18024
18025
18026
18027
18028
18029
18030
18031
18032
18033
18034
18035
18036
18037
18038
18039
18040
18041
18042
18043
18044
18045
18046
18047
18048
18049
18050
18051
18052
18053
18054
18055
18056
18057
18058
18059
18060
18061
18062
18063
18064
18065
18066
18067
18068
18069
18070
18071
18072
18073
18074
18075
18076
18077
18078
18079
18080
18081
18082
18083
18084
18085
18086
18087
18088
18089
18090
18091
18092
18093
18094
18095
18096
18097
18098
18099
18100
18101
18102
18103
18104
18105
18106
18107
18108
18109
18110
18111
18112
18113
18114
18115
18116
18117
18118
18119
18120
18121
18122
18123
18124
18125
18126
18127
18128
18129
18130
18131
18132
18133
18134
18135
18136
18137
18138
18139
18140
18141
18142
18143
18144
18145
18146
18147
18148
18149
18150
18151
18152
18153
18154
18155
18156
18157
18158
18159
18160
18161
18162
18163
18164
18165
18166
18167
18168
18169
18170
18171
18172
18173
18174
18175
18176
18177
18178
18179
18180
18181
18182
18183
18184
18185
18186
18187
18188
18189
18190
18191
18192
18193
18194
18195
18196
18197
18198
18199
18200
18201
18202
18203
18204
18205
18206
18207
18208
18209
18210
18211
18212
18213
18214
18215
18216
18217
18218
18219
18220
18221
18222
18223
18224
18225
18226
18227
18228
18229
18230
18231
18232
18233
18234
18235
18236
18237
18238
18239
18240
18241
18242
18243
18244
18245
18246
18247
18248
18249
18250
18251
18252
18253
18254
18255
18256
18257
18258
18259
18260
18261
18262
18263
18264
18265
18266
18267
18268
18269
18270
18271
18272
18273
18274
18275
18276
18277
18278
18279
18280
18281
18282
18283
18284
18285
18286
18287
18288
18289
18290
18291
18292
18293
18294
18295
18296
18297
18298
18299
18300
18301
18302
18303
18304
18305
18306
18307
18308
18309
18310
18311
18312
18313
18314
18315
18316
18317
18318
18319
18320
18321
18322
18323
18324
18325
18326
18327
18328
18329
18330
18331
18332
18333
18334
18335
18336
18337
18338
18339
18340
18341
18342
18343
18344
18345
18346
18347
18348
18349
18350
18351
18352
18353
18354
18355
18356
18357
18358
18359
18360
18361
18362
18363
18364
18365
18366
18367
18368
18369
18370
18371
18372
18373
18374
18375
18376
18377
18378
18379
18380
18381
18382
18383
18384
18385
18386
18387
18388
18389
18390
18391
18392
18393
18394
18395
18396
18397
18398
18399
18400
18401
18402
18403
18404
18405
18406
18407
18408
18409
18410
18411
18412
18413
18414
18415
18416
18417
18418
18419
18420
18421
18422
18423
18424
18425
18426
18427
18428
18429
18430
18431
18432
18433
18434
18435
18436
18437
18438
18439
18440
18441
18442
18443
18444
18445
18446
18447
18448
18449
18450
18451
18452
18453
18454
18455
18456
18457
18458
18459
18460
18461
18462
18463
18464
18465
18466
18467
18468
18469
18470
18471
18472
18473
18474
18475
18476
18477
18478
18479
18480
18481
18482
18483
18484
18485
18486
18487
18488
18489
18490
18491
18492
18493
18494
18495
18496
18497
18498
18499
18500
18501
18502
18503
18504
18505
18506
18507
18508
18509
18510
18511
18512
18513
18514
18515
18516
18517
18518
18519
18520
18521
18522
18523
18524
18525
18526
18527
18528
18529
18530
18531
18532
18533
18534
18535
18536
18537
18538
18539
18540
18541
18542
18543
18544
18545
18546
18547
18548
18549
18550
18551
18552
18553
18554
18555
18556
18557
18558
18559
18560
18561
18562
18563
18564
18565
18566
18567
18568
18569
18570
18571
18572
18573
18574
18575
18576
18577
18578
18579
18580
18581
18582
18583
18584
18585
18586
18587
18588
18589
18590
18591
18592
18593
18594
18595
18596
18597
18598
18599
18600
18601
18602
18603
18604
18605
18606
18607
18608
18609
18610
18611
18612
18613
18614
18615
18616
18617
18618
18619
18620
18621
18622
18623
18624
18625
18626
18627
18628
18629
18630
18631
18632
18633
18634
18635
18636
18637
18638
18639
18640
18641
18642
18643
18644
18645
18646
18647
18648
18649
18650
18651
18652
18653
18654
18655
18656
18657
18658
18659
18660
18661
18662
18663
18664
18665
18666
18667
18668
18669
18670
18671
18672
18673
18674
18675
18676
18677
18678
18679
18680
18681
18682
18683
18684
18685
18686
18687
18688
18689
18690
18691
18692
18693
18694
18695
18696
18697
18698
18699
18700
18701
18702
18703
18704
18705
18706
18707
18708
18709
18710
18711
18712
18713
18714
18715
18716
18717
18718
18719
18720
18721
18722
18723
18724
18725
18726
18727
18728
18729
18730
18731
18732
18733
18734
18735
18736
18737
18738
18739
18740
18741
18742
18743
18744
18745
18746
18747
18748
18749
18750
18751
18752
18753
18754
18755
18756
18757
18758
18759
18760
18761
18762
18763
18764
18765
18766
18767
18768
18769
18770
18771
18772
18773
18774
18775
18776
18777
18778
18779
18780
18781
18782
18783
18784
18785
18786
18787
18788
18789
18790
18791
18792
18793
18794
18795
18796
18797
18798
18799
18800
18801
18802
18803
18804
18805
18806
18807
18808
18809
18810
18811
18812
18813
18814
18815
18816
18817
18818
18819
18820
18821
18822
18823
18824
18825
18826
18827
18828
18829
18830
18831
18832
18833
18834
18835
18836
18837
18838
18839
18840
18841
18842
18843
18844
18845
18846
18847
18848
18849
18850
18851
18852
18853
18854
18855
18856
18857
18858
18859
18860
18861
18862
18863
18864
18865
18866
18867
18868
18869
18870
18871
18872
18873
18874
18875
18876
18877
18878
18879
18880
18881
18882
18883
18884
18885
18886
18887
18888
18889
18890
18891
18892
18893
18894
18895
18896
18897
18898
18899
18900
18901
18902
18903
18904
18905
18906
18907
18908
18909
18910
18911
18912
18913
18914
18915
18916
18917
18918
18919
18920
18921
18922
18923
18924
18925
18926
18927
18928
18929
18930
18931
18932
18933
18934
18935
18936
18937
18938
18939
18940
18941
18942
18943
18944
18945
18946
18947
18948
18949
18950
18951
18952
18953
18954
18955
18956
18957
18958
18959
18960
18961
18962
18963
18964
18965
18966
18967
18968
18969
18970
18971
18972
18973
18974
18975
18976
18977
18978
18979
18980
18981
18982
18983
18984
18985
18986
18987
18988
18989
18990
18991
18992
18993
18994
18995
18996
18997
18998
18999
19000
19001
19002
19003
19004
19005
19006
19007
19008
19009
19010
19011
19012
19013
19014
19015
19016
19017
19018
19019
19020
19021
19022
19023
19024
19025
19026
19027
19028
19029
19030
19031
19032
19033
19034
19035
19036
19037
19038
19039
19040
19041
19042
19043
19044
19045
19046
19047
19048
19049
19050
19051
19052
19053
19054
19055
19056
19057
19058
19059
19060
19061
19062
19063
19064
19065
19066
19067
19068
19069
19070
19071
19072
19073
19074
19075
19076
19077
19078
19079
19080
19081
19082
19083
19084
19085
19086
19087
19088
19089
19090
19091
19092
19093
19094
19095
19096
19097
19098
19099
19100
19101
19102
19103
19104
19105
19106
19107
19108
19109
19110
19111
19112
19113
19114
19115
19116
19117
19118
19119
19120
19121
19122
19123
19124
19125
19126
19127
19128
19129
19130
19131
19132
19133
19134
19135
19136
19137
19138
19139
19140
19141
19142
19143
19144
19145
19146
19147
19148
19149
19150
19151
19152
19153
19154
19155
19156
19157
19158
19159
19160
19161
19162
19163
19164
19165
19166
19167
19168
19169
19170
19171
19172
19173
19174
19175
19176
19177
19178
19179
19180
19181
19182
19183
19184
19185
19186
19187
19188
19189
19190
19191
19192
19193
19194
19195
19196
19197
19198
19199
19200
19201
19202
19203
19204
19205
19206
19207
19208
19209
19210
19211
19212
19213
19214
19215
19216
19217
19218
19219
19220
19221
19222
19223
19224
19225
19226
19227
19228
19229
19230
19231
19232
19233
19234
19235
19236
19237
19238
19239
19240
19241
19242
19243
19244
19245
19246
19247
19248
19249
19250
19251
19252
19253
19254
19255
19256
19257
19258
19259
19260
19261
19262
19263
19264
19265
19266
19267
19268
19269
19270
19271
19272
19273
19274
19275
19276
19277
19278
19279
19280
19281
19282
19283
19284
19285
19286
19287
19288
19289
19290
19291
19292
19293
19294
19295
19296
19297
19298
19299
19300
19301
19302
19303
19304
19305
19306
19307
19308
19309
19310
19311
19312
19313
19314
19315
19316
19317
19318
19319
19320
19321
19322
19323
19324
19325
19326
19327
19328
19329
19330
19331
19332
19333
19334
19335
19336
19337
19338
19339
19340
19341
19342
19343
19344
19345
19346
19347
19348
19349
19350
19351
19352
19353
19354
19355
19356
19357
19358
19359
19360
19361
19362
19363
19364
19365
19366
19367
19368
19369
19370
19371
19372
19373
19374
19375
19376
19377
19378
19379
19380
19381
19382
19383
19384
19385
19386
19387
19388
19389
19390
19391
19392
19393
19394
19395
19396
19397
19398
19399
19400
19401
19402
19403
19404
19405
19406
19407
19408
19409
19410
19411
19412
19413
19414
19415
19416
19417
19418
19419
19420
19421
19422
19423
19424
19425
19426
19427
19428
19429
19430
19431
19432
19433
19434
19435
19436
19437
19438
19439
19440
19441
19442
19443
19444
19445
19446
19447
19448
19449
19450
19451
19452
19453
19454
19455
19456
19457
19458
19459
19460
19461
19462
19463
19464
19465
19466
19467
19468
19469
19470
19471
19472
19473
19474
19475
19476
19477
19478
19479
19480
19481
19482
19483
19484
19485
19486
19487
19488
19489
19490
19491
19492
19493
19494
19495
19496
19497
19498
19499
19500
19501
19502
19503
19504
19505
19506
19507
19508
19509
19510
19511
19512
19513
19514
19515
19516
19517
19518
19519
19520
19521
19522
19523
19524
19525
19526
19527
19528
19529
19530
19531
19532
19533
19534
19535
19536
19537
19538
19539
19540
19541
19542
19543
19544
19545
19546
19547
19548
19549
19550
19551
19552
19553
19554
19555
19556
19557
19558
19559
19560
19561
19562
19563
19564
19565
19566
19567
19568
19569
19570
19571
19572
19573
19574
19575
19576
19577
19578
19579
19580
19581
19582
19583
19584
19585
19586
19587
19588
19589
19590
19591
19592
19593
19594
19595
19596
19597
19598
19599
19600
19601
19602
19603
19604
19605
19606
19607
19608
19609
19610
19611
19612
19613
19614
19615
19616
19617
19618
19619
19620
19621
19622
19623
19624
19625
19626
19627
19628
19629
19630
19631
19632
19633
19634
19635
19636
19637
19638
19639
19640
19641
19642
19643
19644
19645
19646
19647
19648
19649
19650
19651
19652
19653
19654
19655
19656
19657
19658
19659
19660
19661
19662
19663
19664
19665
19666
19667
19668
19669
19670
19671
19672
19673
19674
19675
19676
19677
19678
19679
19680
19681
19682
19683
19684
19685
19686
19687
19688
19689
19690
19691
19692
19693
19694
19695
19696
19697
19698
19699
19700
19701
19702
19703
19704
19705
19706
19707
19708
19709
19710
19711
19712
19713
19714
19715
19716
19717
19718
19719
19720
19721
19722
19723
19724
19725
19726
19727
19728
19729
19730
19731
19732
19733
19734
19735
19736
19737
19738
19739
19740
19741
19742
19743
19744
19745
19746
19747
19748
19749
19750
19751
19752
19753
19754
19755
19756
19757
19758
19759
19760
19761
19762
19763
19764
19765
19766
19767
19768
19769
19770
19771
19772
19773
19774
19775
19776
19777
19778
19779
19780
19781
19782
19783
19784
19785
19786
19787
19788
19789
19790
19791
19792
19793
19794
19795
19796
19797
19798
19799
19800
19801
19802
19803
19804
19805
19806
19807
19808
19809
19810
19811
19812
19813
19814
19815
19816
19817
19818
19819
19820
19821
19822
19823
19824
19825
19826
19827
19828
19829
19830
19831
19832
19833
19834
19835
19836
19837
19838
19839
19840
19841
19842
19843
19844
19845
19846
19847
19848
19849
19850
19851
19852
19853
19854
19855
19856
19857
19858
19859
19860
19861
19862
19863
19864
19865
19866
19867
19868
19869
19870
19871
19872
19873
19874
19875
19876
19877
19878
19879
19880
19881
19882
19883
19884
19885
19886
19887
19888
19889
19890
19891
19892
19893
19894
19895
19896
19897
19898
19899
19900
19901
19902
19903
19904
19905
19906
19907
19908
19909
19910
19911
19912
19913
19914
19915
19916
19917
19918
19919
19920
19921
19922
19923
19924
19925
19926
19927
19928
19929
19930
19931
19932
19933
19934
19935
19936
19937
19938
19939
19940
19941
19942
19943
19944
19945
19946
19947
19948
19949
19950
19951
19952
19953
19954
19955
19956
19957
19958
19959
19960
19961
19962
19963
19964
19965
19966
19967
19968
19969
19970
19971
19972
19973
19974
19975
19976
19977
19978
19979
19980
19981
19982
19983
19984
19985
19986
19987
19988
19989
19990
19991
19992
19993
19994
19995
19996
19997
19998
19999
20000
20001
20002
20003
20004
20005
20006
20007
20008
20009
20010
20011
20012
20013
20014
20015
20016
20017
20018
20019
20020
20021
20022
20023
20024
20025
20026
20027
20028
20029
20030
20031
20032
20033
20034
20035
20036
20037
20038
20039
20040
20041
20042
20043
20044
20045
20046
20047
20048
20049
20050
20051
20052
20053
20054
20055
20056
20057
20058
20059
20060
20061
20062
20063
20064
20065
20066
20067
20068
20069
20070
20071
20072
20073
20074
20075
20076
20077
20078
20079
20080
20081
20082
20083
20084
20085
20086
20087
20088
20089
20090
20091
20092
20093
20094
20095
20096
20097
20098
20099
20100
20101
20102
20103
20104
20105
20106
20107
20108
20109
20110
20111
20112
20113
20114
20115
20116
20117
20118
20119
20120
20121
20122
20123
20124
20125
20126
20127
20128
20129
20130
20131
20132
20133
20134
20135
20136
20137
20138
20139
20140
20141
20142
20143
20144
20145
20146
20147
20148
20149
20150
20151
20152
20153
20154
20155
20156
20157
20158
20159
20160
20161
20162
20163
20164
20165
20166
20167
20168
20169
20170
20171
20172
20173
20174
20175
20176
20177
20178
20179
20180
20181
20182
20183
20184
20185
20186
20187
20188
20189
20190
20191
20192
20193
20194
20195
20196
20197
20198
20199
20200
20201
20202
20203
20204
20205
20206
20207
20208
20209
20210
20211
20212
20213
20214
20215
20216
20217
20218
20219
20220
20221
20222
20223
20224
20225
20226
20227
20228
20229
20230
20231
20232
20233
20234
20235
20236
20237
20238
20239
20240
20241
20242
20243
20244
20245
20246
20247
20248
20249
20250
20251
20252
20253
20254
20255
20256
20257
20258
20259
20260
20261
20262
20263
20264
20265
20266
20267
20268
20269
20270
20271
20272
20273
20274
20275
20276
20277
20278
20279
20280
20281
20282
20283
20284
20285
20286
20287
20288
20289
20290
20291
20292
20293
20294
20295
20296
20297
20298
20299
20300
20301
20302
20303
20304
20305
20306
20307
20308
20309
20310
20311
20312
20313
20314
20315
20316
20317
20318
20319
20320
20321
20322
20323
20324
20325
20326
20327
20328
20329
20330
20331
20332
20333
20334
20335
20336
20337
20338
20339
20340
20341
20342
20343
20344
20345
20346
20347
20348
20349
20350
20351
20352
20353
20354
20355
20356
20357
20358
20359
20360
20361
20362
20363
20364
20365
20366
20367
20368
20369
20370
20371
20372
20373
20374
20375
20376
20377
20378
20379
20380
20381
20382
20383
20384
20385
20386
20387
20388
20389
20390
20391
20392
20393
20394
20395
20396
20397
20398
20399
20400
20401
20402
20403
20404
20405
20406
20407
20408
20409
20410
20411
20412
20413
20414
20415
20416
20417
20418
20419
20420
20421
20422
20423
20424
20425
20426
20427
20428
20429
20430
20431
20432
20433
20434
20435
20436
20437
20438
20439
20440
20441
20442
20443
20444
20445
20446
20447
20448
20449
20450
20451
20452
20453
20454
20455
20456
20457
20458
20459
20460
20461
20462
20463
20464
20465
20466
20467
20468
20469
20470
20471
20472
20473
20474
20475
20476
20477
20478
20479
20480
20481
20482
20483
20484
20485
20486
20487
20488
20489
20490
20491
20492
20493
20494
20495
20496
20497
20498
20499
20500
20501
20502
20503
20504
20505
20506
20507
20508
20509
20510
20511
20512
20513
20514
20515
20516
20517
20518
20519
20520
20521
20522
20523
20524
20525
20526
20527
20528
20529
20530
20531
20532
20533
20534
20535
20536
20537
20538
20539
20540
20541
20542
20543
20544
20545
20546
20547
20548
20549
20550
20551
20552
20553
20554
20555
20556
20557
20558
20559
20560
20561
20562
20563
20564
20565
20566
20567
20568
20569
20570
20571
20572
20573
20574
20575
20576
20577
20578
20579
20580
20581
20582
20583
20584
20585
20586
20587
20588
20589
20590
20591
20592
20593
20594
20595
20596
20597
20598
20599
20600
20601
20602
20603
20604
20605
20606
20607
20608
20609
20610
20611
20612
20613
20614
20615
20616
20617
20618
20619
20620
20621
20622
20623
20624
20625
20626
20627
20628
20629
20630
20631
20632
20633
20634
20635
20636
20637
20638
20639
20640
20641
20642
20643
20644
20645
20646
20647
20648
20649
20650
20651
20652
20653
20654
20655
20656
20657
20658
20659
20660
20661
20662
20663
20664
20665
20666
20667
20668
20669
20670
20671
20672
20673
20674
20675
20676
20677
20678
20679
20680
20681
20682
20683
20684
20685
20686
20687
20688
20689
20690
20691
20692
20693
20694
20695
20696
20697
20698
20699
20700
20701
20702
20703
20704
20705
20706
20707
20708
20709
20710
20711
20712
20713
20714
20715
20716
20717
20718
20719
20720
20721
20722
20723
20724
20725
20726
20727
20728
20729
20730
20731
20732
20733
20734
20735
20736
20737
20738
20739
20740
20741
20742
20743
20744
20745
20746
20747
20748
20749
20750
20751
20752
20753
20754
20755
20756
20757
20758
20759
20760
20761
20762
20763
20764
20765
20766
20767
20768
20769
20770
20771
20772
20773
20774
20775
20776
20777
20778
20779
20780
20781
20782
20783
20784
20785
20786
20787
20788
20789
20790
20791
20792
20793
20794
20795
20796
20797
20798
20799
20800
20801
20802
20803
20804
20805
20806
20807
20808
20809
20810
20811
20812
20813
20814
20815
20816
20817
20818
20819
20820
20821
20822
20823
20824
20825
20826
20827
20828
20829
20830
20831
20832
20833
20834
20835
20836
20837
20838
20839
20840
20841
20842
20843
20844
20845
20846
20847
20848
20849
20850
20851
20852
20853
20854
20855
20856
20857
20858
20859
20860
20861
20862
20863
20864
20865
20866
20867
20868
20869
20870
20871
20872
20873
20874
20875
20876
20877
20878
20879
20880
20881
20882
20883
20884
20885
20886
20887
20888
20889
20890
20891
20892
20893
20894
20895
20896
20897
20898
20899
20900
20901
20902
20903
20904
20905
20906
20907
20908
20909
20910
20911
20912
20913
20914
20915
20916
20917
20918
20919
20920
20921
20922
20923
20924
20925
20926
20927
20928
20929
20930
20931
20932
20933
20934
20935
20936
20937
20938
20939
20940
20941
20942
20943
20944
20945
20946
20947
20948
20949
20950
20951
20952
20953
20954
20955
20956
20957
20958
20959
20960
20961
20962
20963
20964
20965
20966
20967
20968
20969
20970
20971
20972
20973
20974
20975
20976
20977
20978
20979
20980
20981
20982
20983
20984
20985
20986
20987
20988
20989
20990
20991
20992
20993
20994
20995
20996
20997
20998
20999
21000
21001
21002
21003
21004
21005
21006
21007
21008
21009
21010
21011
21012
21013
21014
21015
21016
21017
21018
21019
21020
21021
21022
21023
21024
21025
21026
21027
21028
21029
21030
21031
21032
21033
21034
21035
21036
21037
21038
21039
21040
21041
21042
21043
21044
21045
21046
21047
21048
21049
21050
21051
21052
21053
21054
21055
21056
21057
21058
21059
21060
21061
21062
21063
21064
21065
21066
21067
21068
21069
21070
21071
21072
21073
21074
21075
21076
21077
21078
21079
21080
21081
21082
21083
21084
21085
21086
21087
21088
21089
21090
21091
21092
21093
21094
21095
21096
21097
21098
21099
21100
21101
21102
21103
21104
21105
21106
21107
21108
21109
21110
21111
21112
21113
21114
21115
21116
21117
21118
21119
21120
21121
21122
21123
21124
21125
21126
21127
21128
21129
21130
21131
21132
21133
21134
21135
21136
21137
21138
21139
21140
21141
21142
21143
21144
21145
21146
21147
21148
21149
21150
21151
21152
21153
21154
21155
21156
21157
21158
21159
21160
21161
21162
21163
21164
21165
21166
21167
21168
21169
21170
21171
21172
21173
21174
21175
21176
21177
21178
21179
21180
21181
21182
21183
21184
21185
21186
21187
21188
21189
21190
21191
21192
21193
21194
21195
21196
21197
21198
21199
21200
21201
21202
21203
21204
21205
21206
21207
21208
21209
21210
21211
21212
21213
21214
21215
21216
21217
21218
21219
21220
21221
21222
21223
21224
21225
21226
21227
21228
21229
21230
21231
21232
21233
21234
21235
21236
21237
21238
21239
21240
21241
21242
21243
21244
21245
21246
21247
21248
21249
21250
21251
21252
21253
21254
21255
21256
21257
21258
21259
21260
21261
21262
21263
21264
21265
21266
21267
21268
21269
21270
21271
21272
21273
21274
21275
21276
21277
21278
21279
21280
21281
21282
21283
21284
21285
21286
21287
21288
21289
21290
21291
21292
21293
21294
21295
21296
21297
21298
21299
21300
21301
21302
21303
21304
21305
21306
21307
21308
21309
21310
21311
21312
21313
21314
21315
21316
21317
21318
21319
21320
21321
21322
21323
21324
21325
21326
21327
21328
21329
21330
21331
21332
21333
21334
21335
21336
21337
21338
21339
21340
21341
21342
21343
21344
21345
21346
21347
21348
21349
21350
21351
21352
21353
21354
21355
21356
21357
21358
21359
21360
21361
21362
21363
21364
21365
21366
21367
21368
21369
21370
21371
21372
21373
21374
21375
21376
21377
21378
21379
21380
21381
21382
21383
21384
21385
21386
21387
21388
21389
21390
21391
21392
21393
21394
21395
21396
21397
21398
21399
21400
21401
21402
21403
21404
21405
21406
21407
21408
21409
21410
21411
21412
21413
21414
21415
21416
21417
21418
21419
21420
21421
21422
21423
21424
21425
21426
21427
21428
21429
21430
21431
21432
21433
21434
21435
21436
21437
21438
21439
21440
21441
21442
21443
21444
21445
21446
21447
21448
21449
21450
21451
21452
21453
21454
21455
21456
21457
21458
21459
21460
21461
21462
21463
21464
21465
21466
21467
21468
21469
21470
21471
21472
21473
21474
21475
21476
21477
21478
21479
21480
21481
21482
21483
21484
21485
21486
21487
21488
21489
21490
21491
21492
21493
21494
21495
21496
21497
21498
21499
21500
21501
21502
21503
21504
21505
21506
21507
21508
21509
21510
21511
21512
21513
21514
21515
21516
21517
21518
21519
21520
21521
21522
21523
21524
21525
21526
21527
21528
21529
21530
21531
21532
21533
21534
21535
21536
21537
21538
21539
21540
21541
21542
21543
21544
21545
21546
21547
21548
21549
21550
21551
21552
21553
21554
21555
21556
21557
21558
21559
21560
21561
21562
21563
21564
21565
21566
21567
21568
21569
21570
21571
21572
21573
21574
21575
21576
21577
21578
21579
21580
21581
21582
21583
21584
21585
21586
21587
21588
21589
21590
21591
21592
21593
21594
21595
21596
21597
21598
21599
21600
21601
21602
21603
21604
21605
21606
21607
21608
21609
21610
21611
21612
21613
21614
21615
21616
21617
21618
21619
21620
21621
21622
21623
21624
21625
21626
21627
21628
21629
21630
21631
21632
21633
21634
21635
21636
21637
21638
21639
21640
21641
21642
21643
21644
21645
21646
21647
21648
21649
21650
21651
21652
21653
21654
21655
21656
21657
21658
21659
21660
21661
21662
21663
21664
21665
21666
21667
21668
21669
21670
21671
21672
21673
21674
21675
21676
21677
21678
21679
21680
21681
21682
21683
21684
21685
21686
21687
21688
21689
21690
21691
21692
21693
21694
21695
21696
21697
21698
21699
21700
21701
21702
21703
21704
21705
21706
21707
21708
21709
21710
21711
21712
21713
21714
21715
21716
21717
21718
21719
21720
21721
21722
21723
21724
21725
21726
21727
21728
21729
21730
21731
21732
21733
21734
21735
21736
21737
21738
21739
21740
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
 */
#include <uapi/linux/btf.h>
#include <linux/bpf-cgroup.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/bpf_verifier.h>
#include <linux/filter.h>
#include <net/netlink.h>
#include <linux/file.h>
#include <linux/vmalloc.h>
#include <linux/stringify.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <linux/perf_event.h>
#include <linux/ctype.h>
#include <linux/error-injection.h>
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include <linux/poison.h>
#include <linux/module.h>
#include <linux/cpumask.h>
#include <linux/bpf_mem_alloc.h>
#include <net/xdp.h>

#include "disasm.h"

/*
 * Table of verifier operations, indexed by the id passed to BPF_PROG_TYPE().
 *
 * <linux/bpf_types.h> is included with BPF_PROG_TYPE() defined so that each
 * invocation emits one designated initializer of the form
 * "[_id] = &<_name>_verifier_ops,". BPF_MAP_TYPE() and BPF_LINK_TYPE() are
 * defined to expand to nothing, so only BPF_PROG_TYPE() entries end up in the
 * array. All three helper macros are undefined again right after the include.
 */
static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
        [_id] = & _name ## _verifier_ops,
#define BPF_MAP_TYPE(_id, _ops)
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
};

/* A bpf_mem_alloc instance with external linkage, plus a file-local flag.
 * NOTE(review): presumably the flag records whether bpf_global_percpu_ma has
 * been set up; the code that writes it is outside this chunk - confirm.
 */
struct bpf_mem_alloc bpf_global_percpu_ma;
static bool bpf_global_percpu_ma_set;

/* bpf_check() is a static code analyzer that walks eBPF program
 * instruction by instruction and updates register/stack state.
 * All paths of conditional branches are analyzed until 'bpf_exit' insn.
 *
 * The first pass is depth-first-search to check that the program is a DAG.
 * It rejects the following programs:
 * - larger than BPF_MAXINSNS insns
 * - if loop is present (detected via back-edge)
 * - unreachable insns exist (shouldn't be a forest. program = one function)
 * - out of bounds or malformed jumps
 * The second pass is all possible path descent from the 1st insn.
 * Since it's analyzing all paths through the program, the length of the
 * analysis is limited to 64k insn, which may be hit even if total number of
 * insn is less than 4K, but there are too many branches that change stack/regs.
 * Number of 'branches to be analyzed' is limited to 1k
 *
 * On entry to each instruction, each register has a type, and the instruction
 * changes the types of the registers depending on instruction semantics.
 * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is
 * copied to R1.
 *
 * All registers are 64-bit.
 * R0 - return register
 * R1-R5 argument passing registers
 * R6-R9 callee saved registers
 * R10 - frame pointer read-only
 *
 * At the start of BPF program the register R1 contains a pointer to bpf_context
 * and has type PTR_TO_CTX.
 *
 * Verifier tracks arithmetic operations on pointers in case:
 *    BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
 *    BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20),
 * 1st insn copies R10 (which has FRAME_PTR) type into R1
 * and 2nd arithmetic instruction is pattern matched to recognize
 * that it wants to construct a pointer to some element within stack.
 * So after 2nd insn, the register R1 has type PTR_TO_STACK
 * (and -20 constant is saved for further stack bounds checking).
 * Meaning that this reg is a pointer to stack plus known immediate constant.
 *
 * Most of the time the registers have SCALAR_VALUE type, which
 * means the register has some value, but it's not a valid pointer.
 * (like pointer plus pointer becomes SCALAR_VALUE type)
 *
 * When verifier sees load or store instructions the type of base register
 * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK, PTR_TO_SOCKET. These are
 * four pointer types recognized by check_mem_access() function.
 *
 * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
 * and the range of [ptr, ptr + map's value_size) is accessible.
 *
 * registers used to pass values to function calls are checked against
 * function argument constraints.
 *
 * ARG_PTR_TO_MAP_KEY is one of such argument constraints.
 * It means that the register type passed to this function must be
 * PTR_TO_STACK and it will be used inside the function as
 * 'pointer to map element key'
 *
 * For example the argument constraints for bpf_map_lookup_elem():
 *   .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
 *   .arg1_type = ARG_CONST_MAP_PTR,
 *   .arg2_type = ARG_PTR_TO_MAP_KEY,
 *
 * ret_type says that this function returns 'pointer to map elem value or null'
 * function expects 1st argument to be a const pointer to 'struct bpf_map' and
 * 2nd argument should be a pointer to stack, which will be used inside
 * the helper function as a pointer to map element key.
 *
 * On the kernel side the helper function looks like:
 * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 * {
 *    struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
 *    void *key = (void *) (unsigned long) r2;
 *    void *value;
 *
 *    here kernel can access 'key' and 'map' pointers safely, knowing that
 *    [key, key + map->key_size) bytes are valid and were initialized on
 *    the stack of eBPF program.
 * }
 *
 * Corresponding eBPF program may look like:
 *    BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),  // after this insn R2 type is FRAME_PTR
 *    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK
 *    BPF_LD_MAP_FD(BPF_REG_1, map_fd),      // after this insn R1 type is CONST_PTR_TO_MAP
 *    BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 * here verifier looks at prototype of map_lookup_elem() and sees:
 * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok,
 * Now verifier knows that this map has key of R1->map_ptr->key_size bytes
 *
 * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far,
 * Now verifier checks that [R2, R2 + map's key_size) are within stack limits
 * and were initialized prior to this call.
 * If it's ok, then verifier allows this BPF_CALL insn and looks at
 * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
 * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
 * returns either pointer to map value or NULL.
 *
 * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
 * insn, the register holding that pointer in the true branch changes state to
 * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false
 * branch. See check_cond_jmp_op().
 *
 * After the call R0 is set to return type of the function and registers R1-R5
 * are set to NOT_INIT to indicate that they are no longer readable.
 *
 * The following reference types represent a potential reference to a kernel
 * resource which, after first being allocated, must be checked and freed by
 * the BPF program:
 * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET
 *
 * When the verifier sees a helper call return a reference type, it allocates a
 * pointer id for the reference and stores it in the current function state.
 * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into
 * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
 * passes through a NULL-check conditional. For the branch wherein the state is
 * changed to CONST_IMM, the verifier releases the reference.
 *
 * For each helper function that allocates a reference, such as
 * bpf_sk_lookup_tcp(), there is a corresponding release function, such as
 * bpf_sk_release(). When a reference type passes into the release function,
 * the verifier also releases the reference. If any unchecked or unreleased
 * reference remains at the end of the program, the verifier rejects it.
 */

/* verifier_state + insn_idx are pushed to stack when branch is encountered */
struct bpf_verifier_stack_elem {
        /* verifier state is 'st'
         * before processing instruction 'insn_idx'
         * and after processing instruction 'prev_insn_idx'
         */
        struct bpf_verifier_state st;
        int insn_idx;
        int prev_insn_idx;
        /* link to another stack element
         * NOTE(review): the push/pop code is outside this chunk; presumably
         * this points at the element pushed just before this one - confirm.
         */
        struct bpf_verifier_stack_elem *next;
        /* length of verifier log at the time this state was pushed on stack */
        u32 log_pos;
};

/* NOTE(review): complexity limits; their users are outside this chunk, so the
 * exact quantities they bound should be confirmed at the use sites.
 */
#define BPF_COMPLEXITY_LIMIT_JMP_SEQ        8192
#define BPF_COMPLEXITY_LIMIT_STATES        64

/* Flag bits stored in the two topmost bits (63 and 62) of
 * bpf_insn_aux_data::map_key_state. bpf_map_key_immediate() masks both of
 * them off to recover the remaining low bits.
 */
#define BPF_MAP_KEY_POISON        (1ULL << 63)
#define BPF_MAP_KEY_SEEN        (1ULL << 62)

/* NOTE(review): presumably a size cap related to bpf_global_percpu_ma; the
 * unit and the check that uses it are not visible here - confirm.
 */
#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE  512

/* Prototypes for file-local (static) helpers, declared here ahead of their
 * definitions, which are not part of this section of the file.
 */
static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
static int ref_set_non_owning(struct bpf_verifier_env *env,
                              struct bpf_reg_state *reg);
static void specialize_kfunc(struct bpf_verifier_env *env,
                             u32 func_id, u16 offset, unsigned long *addr);
static bool is_trusted_reg(const struct bpf_reg_state *reg);

/* Return the 'poison' flag of the map-pointer state recorded in @aux. */
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
{
        const bool is_poisoned = aux->map_ptr_state.poison;

        return is_poisoned;
}

/* Return the 'unpriv' flag of the map-pointer state recorded in @aux. */
static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
{
        const bool is_unpriv = aux->map_ptr_state.unpriv;

        return is_unpriv;
}

/* Record @map together with its @poison and @unpriv flags in
 * @aux->map_ptr_state.
 *
 * The 'unpriv' flag is sticky: if it is already set in @aux it stays set,
 * whatever the caller passes in @unpriv. The 'poison' flag and the map
 * pointer are simply overwritten.
 */
static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
                              struct bpf_map *map,
                              bool unpriv, bool poison)
{
        /* keep a previously recorded unpriv marking */
        if (bpf_map_ptr_unpriv(aux))
                unpriv = true;

        aux->map_ptr_state.map_ptr = map;
        aux->map_ptr_state.poison = poison;
        aux->map_ptr_state.unpriv = unpriv;
}

/* True when the BPF_MAP_KEY_POISON bit is set in @aux->map_key_state. */
static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
{
        return (aux->map_key_state & BPF_MAP_KEY_POISON) != 0;
}

/* True when the BPF_MAP_KEY_SEEN bit is NOT set in @aux->map_key_state. */
static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux)
{
        return (aux->map_key_state & BPF_MAP_KEY_SEEN) == 0;
}

/* Return @aux->map_key_state with the BPF_MAP_KEY_SEEN and
 * BPF_MAP_KEY_POISON flag bits cleared, leaving only the value bits.
 */
static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux)
{
        const u64 flag_bits = BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON;

        return aux->map_key_state & ~flag_bits;
}

/* Overwrite @aux->map_key_state with @state and mark it as seen.
 *
 * BPF_MAP_KEY_SEEN is always set in the stored value. BPF_MAP_KEY_POISON is
 * carried over when it was already set in @aux before this call, so poisoning
 * is never cleared here.
 */
static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
{
        u64 new_state = state | BPF_MAP_KEY_SEEN;

        /* sample the old poison bit before the field is overwritten */
        if (bpf_map_key_poisoned(aux))
                new_state |= BPF_MAP_KEY_POISON;

        aux->map_key_state = new_state;
}

/* True when @insn has opcode BPF_JMP | BPF_CALL and a src_reg of 0. */
static bool bpf_helper_call(const struct bpf_insn *insn)
{
        if (insn->code != (BPF_JMP | BPF_CALL))
                return false;

        return insn->src_reg == 0;
}

/* True when @insn has opcode BPF_JMP | BPF_CALL and its src_reg equals
 * BPF_PSEUDO_CALL.
 */
static bool bpf_pseudo_call(const struct bpf_insn *insn)
{
        if (insn->code != (BPF_JMP | BPF_CALL))
                return false;

        return insn->src_reg == BPF_PSEUDO_CALL;
}

/* True when @insn has opcode BPF_JMP | BPF_CALL and its src_reg equals
 * BPF_PSEUDO_KFUNC_CALL.
 */
static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
{
        if (insn->code != (BPF_JMP | BPF_CALL))
                return false;

        return insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
}

/* Metadata about the arguments of a call.
 * NOTE(review): the code that fills in and consumes this structure is outside
 * this chunk; presumably it is used while checking helper-call arguments.
 * Individual field meanings are not documented here because they cannot be
 * confirmed from this section of the file.
 */
struct bpf_call_arg_meta {
        struct bpf_map *map_ptr;
        bool raw_mode;
        bool pkt_access;
        u8 release_regno;
        int regno;
        int access_size;
        int mem_size;
        u64 msize_max_value;
        int ref_obj_id;
        int dynptr_id;
        int map_uid;
        int func_id;
        struct btf *btf;
        u32 btf_id;
        struct btf *ret_btf;
        u32 ret_btf_id;
        u32 subprogno;
        struct btf_field *kptr_field;
};

/* Bookkeeping bundle for a single kfunc call, split into inputs that identify
 * the kfunc and outputs collected about its arguments and return value.
 *
 * NOTE(review): most users of this struct lie outside this part of the file;
 * field meanings beyond the comments below should be confirmed there.
 */
struct bpf_kfunc_call_arg_meta {
        /* In parameters */
        struct btf *btf;
        u32 func_id;
        u32 kfunc_flags;
        const struct btf_type *func_proto;
        const char *func_name;
        /* Out parameters */
        u32 ref_obj_id;
        u8 release_regno;
        bool r0_rdonly;
        u32 ret_btf_id;
        u64 r0_size;
        u32 subprogno;
        struct {
                u64 value;
                bool found;
        } arg_constant;

        /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
         * generally to pass info about user-defined local kptr types to later
         * verification logic
         *   bpf_obj_drop/bpf_percpu_obj_drop
         *     Record the local kptr type to be drop'd
         *   bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
         *     Record the local kptr type to be refcount_incr'd and use
         *     arg_owning_ref to determine whether refcount_acquire should be
         *     fallible
         */
        struct btf *arg_btf;
        u32 arg_btf_id;
        bool arg_owning_ref;

        struct {
                struct btf_field *field;
        } arg_list_head;
        struct {
                struct btf_field *field;
        } arg_rbtree_root;
        struct {
                enum bpf_dynptr_type type;
                u32 id;
                u32 ref_obj_id;
        } initialized_dynptr;
        struct {
                u8 spi;
                u8 frameno;
        } iter;
        struct {
                struct bpf_map *ptr;
                int uid;
        } map;
        u64 mem_size;
};

/* Global (non-static) BTF handle; it is only defined here and is not assigned
 * in this part of the file.
 * NOTE(review): presumably the BTF of the running kernel image - confirm.
 */
struct btf *btf_vmlinux;

/* Look up type @id in @btf and return the name found at that type's name
 * offset.
 */
static const char *btf_type_name(const struct btf *btf, u32 id)
{
        u32 name_off = btf_type_by_id(btf, id)->name_off;

        return btf_name_by_offset(btf, name_off);
}

/* File-local mutexes. Neither is taken in this part of the file.
 * NOTE(review): judging by the names, bpf_verifier_lock serializes verifier
 * runs and bpf_percpu_ma_lock guards per-CPU allocator setup - confirm at the
 * lock sites.
 */
static DEFINE_MUTEX(bpf_verifier_lock);
static DEFINE_MUTEX(bpf_percpu_ma_lock);

/* printf-style logging into the verifier log.
 *
 * @private_data is the struct bpf_verifier_env whose log is written to.
 * Nothing is formatted when bpf_verifier_log_needed() reports that the log is
 * not needed.
 */
__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
{
        struct bpf_verifier_env *env = private_data;
        va_list ap;

        if (bpf_verifier_log_needed(&env->log)) {
                va_start(ap, fmt);
                bpf_verifier_vlog(&env->log, fmt, ap);
                va_end(ap);
        }
}

/* Log a one-line complaint that scalar register @reg_name is outside the
 * expected return value @range.
 *
 * The signed bounds of @reg are printed only when they are tighter than
 * S64_MIN / S64_MAX; if neither is, the value is reported as unknown.
 * @ctx is a caller-supplied prefix for the message.
 */
static void verbose_invalid_scalar(struct bpf_verifier_env *env,
                                   struct bpf_reg_state *reg,
                                   struct bpf_retval_range range, const char *ctx,
                                   const char *reg_name)
{
        bool has_smin = reg->smin_value > S64_MIN;
        bool has_smax = reg->smax_value < S64_MAX;

        verbose(env, "%s the register %s has", ctx, reg_name);
        if (has_smin)
                verbose(env, " smin=%lld", reg->smin_value);
        if (has_smax)
                verbose(env, " smax=%lld", reg->smax_value);
        if (!has_smin && !has_smax)
                verbose(env, " unknown scalar value");
        verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval);
}

/* True when @type carries the PTR_MAYBE_NULL flag. */
static bool type_may_be_null(u32 type)
{
        return (type & PTR_MAYBE_NULL) != 0;
}

/* True when @reg's type guarantees a non-NULL pointer.
 *
 * Anything flagged PTR_MAYBE_NULL is rejected first. Of the remaining base
 * types only the ones listed below qualify; PTR_TO_BTF_ID additionally
 * requires the register to be trusted.
 */
static bool reg_not_null(const struct bpf_reg_state *reg)
{
        if (type_may_be_null(reg->type))
                return false;

        switch (base_type(reg->type)) {
        case PTR_TO_SOCKET:
        case PTR_TO_TCP_SOCK:
        case PTR_TO_MAP_VALUE:
        case PTR_TO_MAP_KEY:
        case PTR_TO_SOCK_COMMON:
        case PTR_TO_MEM:
                return true;
        case PTR_TO_BTF_ID:
                return is_trusted_reg(reg);
        default:
                return false;
        }
}

/* Return the btf_record describing the object @reg points to, or NULL.
 *
 * For PTR_TO_MAP_VALUE the record comes from the map; for allocated-object
 * pointers it comes from the struct meta found for (reg->btf, reg->btf_id),
 * if any. Every other register type yields NULL.
 */
static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
{
        struct btf_struct_meta *meta;

        if (reg->type == PTR_TO_MAP_VALUE)
                return reg->map_ptr->record;

        if (!type_is_ptr_alloc_obj(reg->type))
                return NULL;

        meta = btf_find_struct_meta(reg->btf, reg->btf_id);
        return meta ? meta->record : NULL;
}

static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
{
        struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux;

        return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL;
}

static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
{
        struct bpf_func_info *info;

        if (!env->prog->aux->func_info)
                return "";

        info = &env->prog->aux->func_info[subprog];
        return btf_type_name(env->prog->aux->btf, info->type_id);
}

/* Flag @subprog as an exception callback. Besides is_exception_cb this also
 * sets the is_cb and is_async_cb flags of the subprog info.
 */
static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
{
        struct bpf_subprog_info *sub = subprog_info(env, subprog);

        sub->is_cb = true;
        sub->is_async_cb = true;
        sub->is_exception_cb = true;
}

static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog)
{
        return subprog_info(env, subprog)->is_exception_cb;
}

/* True when the btf_record of the object @reg points to contains a
 * BPF_SPIN_LOCK field.
 */
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
{
        struct btf_record *rec = reg_btf_record(reg);

        return btf_record_has_field(rec, BPF_SPIN_LOCK);
}

/* True when @type carries the MEM_RDONLY flag. */
static bool type_is_rdonly_mem(u32 type)
{
        return (type & MEM_RDONLY) != 0;
}

/* True when helper @func_id is treated as acquiring a reference.
 *
 * A fixed set of helpers always qualifies. bpf_map_lookup_elem qualifies only
 * when @map is a SOCKMAP or SOCKHASH; a NULL @map is treated as
 * BPF_MAP_TYPE_UNSPEC.
 */
static bool is_acquire_function(enum bpf_func_id func_id,
                                const struct bpf_map *map)
{
        enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC;

        switch (func_id) {
        case BPF_FUNC_sk_lookup_tcp:
        case BPF_FUNC_sk_lookup_udp:
        case BPF_FUNC_skc_lookup_tcp:
        case BPF_FUNC_ringbuf_reserve:
        case BPF_FUNC_kptr_xchg:
                return true;
        case BPF_FUNC_map_lookup_elem:
                return map_type == BPF_MAP_TYPE_SOCKMAP ||
                       map_type == BPF_MAP_TYPE_SOCKHASH;
        default:
                return false;
        }
}

/* True when helper @func_id is one of the socket pointer-cast helpers listed
 * below.
 */
static bool is_ptr_cast_function(enum bpf_func_id func_id)
{
        switch (func_id) {
        case BPF_FUNC_tcp_sock:
        case BPF_FUNC_sk_fullsock:
        case BPF_FUNC_skc_to_tcp_sock:
        case BPF_FUNC_skc_to_tcp6_sock:
        case BPF_FUNC_skc_to_udp6_sock:
        case BPF_FUNC_skc_to_mptcp_sock:
        case BPF_FUNC_skc_to_tcp_timewait_sock:
        case BPF_FUNC_skc_to_tcp_request_sock:
                return true;
        default:
                return false;
        }
}

/* True only for the bpf_dynptr_data helper. */
static bool is_dynptr_ref_function(enum bpf_func_id func_id)
{
        if (func_id == BPF_FUNC_dynptr_data)
                return true;

        return false;
}

/* Forward declarations of kfunc classification helpers. They are static, so
 * their definitions live elsewhere in this file; the sync/async variants are
 * already needed by the instruction predicates further down.
 */
static bool is_sync_callback_calling_kfunc(u32 btf_id);
static bool is_async_callback_calling_kfunc(u32 btf_id);
static bool is_callback_calling_kfunc(u32 btf_id);
static bool is_bpf_throw_kfunc(struct bpf_insn *insn);

static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id);

/* True when helper @func_id is one of the helpers classified here as calling
 * a callback synchronously.
 */
static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
{
        switch (func_id) {
        case BPF_FUNC_for_each_map_elem:
        case BPF_FUNC_find_vma:
        case BPF_FUNC_loop:
        case BPF_FUNC_user_ringbuf_drain:
                return true;
        default:
                return false;
        }
}

/* True only for the bpf_timer_set_callback helper. */
static bool is_async_callback_calling_function(enum bpf_func_id func_id)
{
        if (func_id == BPF_FUNC_timer_set_callback)
                return true;

        return false;
}

/* True when helper @func_id is classified as callback-calling, either
 * synchronously or asynchronously.
 */
static bool is_callback_calling_function(enum bpf_func_id func_id)
{
        if (is_sync_callback_calling_function(func_id))
                return true;

        return is_async_callback_calling_function(func_id);
}

/* True when @insn is a helper call or a kfunc call whose target (insn->imm)
 * is classified as calling a callback synchronously.
 */
static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
{
        if (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm))
                return true;

        return bpf_pseudo_kfunc_call(insn) &&
               is_sync_callback_calling_kfunc(insn->imm);
}

/* True when @insn is a helper call or a kfunc call whose target (insn->imm)
 * is classified as calling a callback asynchronously.
 */
static bool is_async_callback_calling_insn(struct bpf_insn *insn)
{
        if (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm))
                return true;

        return bpf_pseudo_kfunc_call(insn) &&
               is_async_callback_calling_kfunc(insn->imm);
}

/* True for a BPF_JMP | BPF_JCOND instruction whose src_reg is BPF_MAY_GOTO. */
static bool is_may_goto_insn(struct bpf_insn *insn)
{
        if (insn->code != (BPF_JMP | BPF_JCOND))
                return false;

        return insn->src_reg == BPF_MAY_GOTO;
}

/* Same check as is_may_goto_insn(), applied to the program instruction at
 * index @insn_idx.
 */
static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx)
{
        struct bpf_insn *insn = &env->prog->insnsi[insn_idx];

        return is_may_goto_insn(insn);
}

/* True when helper @func_id is one of the *_storage_get helpers (sk, inode,
 * task or cgroup).
 */
static bool is_storage_get_function(enum bpf_func_id func_id)
{
        switch (func_id) {
        case BPF_FUNC_sk_storage_get:
        case BPF_FUNC_inode_storage_get:
        case BPF_FUNC_task_storage_get:
        case BPF_FUNC_cgrp_storage_get:
                return true;
        default:
                return false;
        }
}

/* True when helper @func_id falls into more than one of the categories
 * "pointer cast", "acquire" and "dynptr ref", each of which counts as one use
 * of a ref_obj_id.
 */
static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
                                        const struct bpf_map *map)
{
        int uses = 0;

        uses += is_ptr_cast_function(func_id) ? 1 : 0;
        uses += is_acquire_function(func_id, map) ? 1 : 0;
        uses += is_dynptr_ref_function(func_id) ? 1 : 0;

        return uses >= 2;
}

/* True for a BPF_STX instruction in BPF_ATOMIC mode whose imm selects
 * BPF_CMPXCHG.
 */
static bool is_cmpxchg_insn(const struct bpf_insn *insn)
{
        if (BPF_CLASS(insn->code) != BPF_STX)
                return false;
        if (BPF_MODE(insn->code) != BPF_ATOMIC)
                return false;

        return insn->imm == BPF_CMPXCHG;
}

/* Convert stack offset @off into a stack slot index: (-off - 1) divided by
 * BPF_REG_SIZE.
 */
static int __get_spi(s32 off)
{
        s32 byte_idx = -off - 1;

        return byte_idx / BPF_REG_SIZE;
}

/* Return the frame of the current verifier state that @reg's frameno refers
 * to.
 */
static struct bpf_func_state *func(struct bpf_verifier_env *env,
                                   const struct bpf_reg_state *reg)
{
        return env->cur_state->frame[reg->frameno];
}

/* Check that the @nr_slots slots ending at @spi, i.e. the slot indices
 * [spi - nr_slots + 1, spi], all lie inside the allocated stack of @state.
 *
 * Slot indices grow downwards: a two-slot object such as a dynptr has its
 * first slot at spi and its second slot at spi - 1.
 */
static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
{
        int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
        int lowest_spi = spi - nr_slots + 1;

        if (lowest_spi < 0)
                return false;

        return spi < allocated_slots;
}

/* Translate the stack pointer held in @reg into the slot index of an object
 * occupying @nr_slots stack slots.
 *
 * @obj_kind is only used in log messages.
 *
 * Returns the slot index on success, -EINVAL when the offset is not constant,
 * not a multiple of BPF_REG_SIZE or leaves too few slots for the object, and
 * -ERANGE when the slots are not inside the frame's allocated stack.
 */
static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
                                  const char *obj_kind, int nr_slots)
{
        int off, spi;

        if (!tnum_is_const(reg->var_off)) {
                verbose(env, "%s has to be at a constant offset\n", obj_kind);
                return -EINVAL;
        }

        off = reg->off + reg->var_off.value;
        if (off % BPF_REG_SIZE)
                goto bad_offset;

        spi = __get_spi(off);
        if (spi + 1 < nr_slots)
                goto bad_offset;

        if (!is_spi_bounds_valid(func(env, reg), spi, nr_slots))
                return -ERANGE;

        return spi;

bad_offset:
        verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
        return -EINVAL;
}

/* stack_slot_obj_get_spi() for a dynptr, which spans BPF_DYNPTR_NR_SLOTS
 * slots.
 */
static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
        int spi = stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS);

        return spi;
}

/* stack_slot_obj_get_spi() for an iterator spanning @nr_slots slots. */
static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots)
{
        int spi = stack_slot_obj_get_spi(env, reg, "iter", nr_slots);

        return spi;
}

static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
{
        switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
        case DYNPTR_TYPE_LOCAL:
                return BPF_DYNPTR_TYPE_LOCAL;
        case DYNPTR_TYPE_RINGBUF:
                return BPF_DYNPTR_TYPE_RINGBUF;
        case DYNPTR_TYPE_SKB:
                return BPF_DYNPTR_TYPE_SKB;
        case DYNPTR_TYPE_XDP:
                return BPF_DYNPTR_TYPE_XDP;
        default:
                return BPF_DYNPTR_TYPE_INVALID;
        }
}

static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
{
        switch (type) {
        case BPF_DYNPTR_TYPE_LOCAL:
                return DYNPTR_TYPE_LOCAL;
        case BPF_DYNPTR_TYPE_RINGBUF:
                return DYNPTR_TYPE_RINGBUF;
        case BPF_DYNPTR_TYPE_SKB:
                return DYNPTR_TYPE_SKB;
        case BPF_DYNPTR_TYPE_XDP:
                return DYNPTR_TYPE_XDP;
        default:
                return 0;
        }
}

/* Only ringbuf dynptrs are treated as refcounted. */
static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
{
        if (type == BPF_DYNPTR_TYPE_RINGBUF)
                return true;

        return false;
}

/* Forward declarations: both helpers are static and defined elsewhere in this
 * file, but are already needed by the dynptr helpers below.
 */
static void __mark_dynptr_reg(struct bpf_reg_state *reg,
                              enum bpf_dynptr_type type,
                              bool first_slot, int dynptr_id);

static void __mark_reg_not_init(const struct bpf_verifier_env *env,
                                struct bpf_reg_state *reg);

/* Mark @sreg1 as the first and @sreg2 as the second slot of a dynptr of
 * @type. Both slots share one freshly generated id.
 */
static void mark_dynptr_stack_regs(struct bpf_verifier_env *env,
                                   struct bpf_reg_state *sreg1,
                                   struct bpf_reg_state *sreg2,
                                   enum bpf_dynptr_type type)
{
        int dynptr_id;

        env->id_gen++;
        dynptr_id = env->id_gen;

        __mark_dynptr_reg(sreg1, type, true, dynptr_id);
        __mark_dynptr_reg(sreg2, type, false, dynptr_id);
}

/* Mark @reg as the first slot of a dynptr of @type with a freshly generated
 * id.
 */
static void mark_dynptr_cb_reg(struct bpf_verifier_env *env,
                               struct bpf_reg_state *reg,
                               enum bpf_dynptr_type type)
{
        int dynptr_id;

        env->id_gen++;
        dynptr_id = env->id_gen;

        __mark_dynptr_reg(reg, type, true, dynptr_id);
}

/* Forward declaration; the definition follows further down in this file and
 * mark_stack_slots_dynptr() needs it before then.
 */
static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
                                        struct bpf_func_state *state, int spi);

/* Turn the two stack slots addressed by @reg into an initialized dynptr of
 * the type encoded in @arg_type.
 *
 * Any dynptr already occupying either slot is destroyed first. For refcounted
 * dynptr types both slots get a ref_obj_id: @clone_ref_obj_id when it is
 * non-zero, otherwise a reference freshly acquired for @insn_idx.
 *
 * Returns 0 on success or a negative error: whatever dynptr_get_spi(),
 * destroy_if_dynptr_stack_slot() or acquire_reference_state() report, or
 * -EINVAL when @arg_type does not encode a known dynptr type.
 */
static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
                                   enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id)
{
        struct bpf_func_state *state = func(env, reg);
        enum bpf_dynptr_type type;
        int spi, i, err;

        spi = dynptr_get_spi(env, reg);
        if (spi < 0)
                return spi;

        /* We cannot assume both spi and spi - 1 belong to the same dynptr,
         * hence we need to call destroy_if_dynptr_stack_slot twice for both,
         * to ensure that for the following example:
         *        [d1][d1][d2][d2]
         * spi    3   2   1   0
         * So marking spi = 2 should lead to destruction of both d1 and d2. In
         * case they do belong to same dynptr, second call won't see slot_type
         * as STACK_DYNPTR and will simply skip destruction.
         */
        err = destroy_if_dynptr_stack_slot(env, state, spi);
        if (err)
                return err;
        err = destroy_if_dynptr_stack_slot(env, state, spi - 1);
        if (err)
                return err;

        /* Every byte of both slots is tagged, so the dynptr marking is never
         * partial.
         */
        for (i = 0; i < BPF_REG_SIZE; i++) {
                state->stack[spi].slot_type[i] = STACK_DYNPTR;
                state->stack[spi - 1].slot_type[i] = STACK_DYNPTR;
        }

        type = arg_to_dynptr_type(arg_type);
        if (type == BPF_DYNPTR_TYPE_INVALID)
                return -EINVAL;

        /* spi is the first slot of the dynptr, spi - 1 the second one */
        mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr,
                               &state->stack[spi - 1].spilled_ptr, type);

        if (dynptr_type_refcounted(type)) {
                /* The id is used to track proper releasing */
                int id;

                if (clone_ref_obj_id)
                        id = clone_ref_obj_id;
                else
                        id = acquire_reference_state(env, insn_idx);

                if (id < 0)
                        return id;

                state->stack[spi].spilled_ptr.ref_obj_id = id;
                state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
        }

        state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
        state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;

        return 0;
}

/* Reset the two stack slots of the dynptr whose first slot is @spi: every
 * byte of slots spi and spi - 1 becomes STACK_INVALID, both spilled register
 * states are marked not-init, and both are flagged REG_LIVE_WRITTEN.
 */
static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi)
{
        int i;

        for (i = 0; i < BPF_REG_SIZE; i++) {
                state->stack[spi].slot_type[i] = STACK_INVALID;
                state->stack[spi - 1].slot_type[i] = STACK_INVALID;
        }

        __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
        __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);

        /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot?
         *
         * While we don't allow reading STACK_INVALID, it is still possible to
         * do <8 byte writes marking some but not all slots as STACK_MISC. Then,
         * helpers or insns can do partial read of that part without failing,
         * but check_stack_range_initialized, check_stack_read_var_off, and
         * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of
         * the slot conservatively. Hence we need to prevent those liveness
         * marking walks.
         *
         * This was not a problem before because STACK_INVALID is only set by
         * default (where the default reg state has its reg->parent as NULL), or
         * in clean_live_states after REG_LIVE_DONE (at which point
         * mark_reg_read won't walk reg->parent chain), but not randomly during
         * verifier state exploration (like we did above). Hence, for our case
         * parentage chain will still be live (i.e. reg->parent may be
         * non-NULL), while earlier reg->parent was NULL, so we need
         * REG_LIVE_WRITTEN to screen off read marker propagation when it is
         * done later on reads or by mark_dynptr_read; otherwise registers in
         * the verifier state would be marked as read unnecessarily.
         */
        state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
        state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;
}

/* Invalidate the dynptr stored in the stack slots addressed by @reg.
 *
 * A non-refcounted dynptr is simply invalidated. For a refcounted one the
 * reference is released and every first-slot dynptr in this frame's stack
 * that carries the same ref_obj_id is invalidated as well.
 *
 * Returns 0 on success, a negative error from dynptr_get_spi(), or -EFAULT
 * when a slot with a matching ref_obj_id turns out not to be a dynptr slot.
 */
static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
        struct bpf_func_state *state = func(env, reg);
        int spi, ref_obj_id, i;

        spi = dynptr_get_spi(env, reg);
        if (spi < 0)
                return spi;

        if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
                invalidate_dynptr(env, state, spi);
                return 0;
        }

        ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id;

        /* If the dynptr has a ref_obj_id, then we need to invalidate
         * two things:
         *
         * 1) Any dynptrs with a matching ref_obj_id (clones)
         * 2) Any slices derived from this dynptr.
         */

        /* Invalidate any slices associated with this dynptr */
        WARN_ON_ONCE(release_reference(env, ref_obj_id));

        /* Invalidate any dynptr clones.
         * NOTE(review): the scan starts at slot 1, presumably because
         * invalidate_dynptr(i) also writes slot i - 1, so slot 0 can only
         * ever be the second slot of a dynptr - confirm.
         */
        for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) {
                if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id)
                        continue;

                /* it should always be the case that if the ref obj id
                 * matches then the stack slot also belongs to a
                 * dynptr
                 */
                if (state->stack[i].slot_type[0] != STACK_DYNPTR) {
                        verbose(env, "verifier internal error: misconfigured ref_obj_id\n");
                        return -EFAULT;
                }
                if (state->stack[i].spilled_ptr.dynptr.first_slot)
                        invalidate_dynptr(env, state, i);
        }

        return 0;
}

/* Forward declaration; static, so the definition lives elsewhere in this
 * file. mark_reg_invalid() below needs it.
 */
static void __mark_reg_unknown(const struct bpf_verifier_env *env,
                               struct bpf_reg_state *reg);

/* Invalidate @reg: it becomes an unknown value when env->allow_ptr_leaks is
 * set and a not-init register otherwise.
 */
static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
        if (env->allow_ptr_leaks) {
                __mark_reg_unknown(env, reg);
                return;
        }

        __mark_reg_not_init(env, reg);
}

/* If stack slot @spi of @state belongs to a dynptr, destroy that dynptr.
 *
 * Both of the dynptr's slots are reset to STACK_INVALID / not-init, and every
 * PTR_TO_MEM (optionally PTR_MAYBE_NULL) register in the current verifier
 * state whose dynptr_id matches is invalidated. No reference state is
 * released.
 *
 * Returns 0 when the slot is not a dynptr slot or the dynptr was destroyed,
 * and -EINVAL when the dynptr is of a refcounted type and therefore must not
 * be overwritten.
 */
static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
                                        struct bpf_func_state *state, int spi)
{
        struct bpf_func_state *fstate;
        struct bpf_reg_state *dreg;
        int i, dynptr_id;

        /* We always ensure that STACK_DYNPTR is never set partially,
         * hence just checking for slot_type[0] is enough. This is
         * different for STACK_SPILL, where it may be only set for
         * 1 byte, so code has to use is_spilled_reg.
         */
        if (state->stack[spi].slot_type[0] != STACK_DYNPTR)
                return 0;

        /* Reposition spi to first slot */
        if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
                spi = spi + 1;

        if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
                verbose(env, "cannot overwrite referenced dynptr\n");
                return -EINVAL;
        }

        mark_stack_slot_scratched(env, spi);
        mark_stack_slot_scratched(env, spi - 1);

        /* Writing partially to one dynptr stack slot destroys both. */
        for (i = 0; i < BPF_REG_SIZE; i++) {
                state->stack[spi].slot_type[i] = STACK_INVALID;
                state->stack[spi - 1].slot_type[i] = STACK_INVALID;
        }

        /* Read the id before the spilled register state is reset below */
        dynptr_id = state->stack[spi].spilled_ptr.id;
        /* Invalidate any slices associated with this dynptr */
        bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({
                /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */
                if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM)
                        continue;
                if (dreg->dynptr_id == dynptr_id)
                        mark_reg_invalid(env, dreg);
        }));

        /* Do not release reference state, we are destroying dynptr on stack,
         * not using some helper to release it. Just reset register.
         */
        __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
        __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);

        /* Same reason as unmark_stack_slots_dynptr above */
        state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
        state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN;

        return 0;
}

/* Check whether @reg may be used as the destination of a dynptr
 * initialization.
 *
 * A CONST_PTR_TO_DYNPTR register never qualifies. A stack pointer qualifies
 * when its slot index can be computed, or when the only problem is -ERANGE.
 */
static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
        int spi;

        if (reg->type == CONST_PTR_TO_DYNPTR)
                return false;

        spi = dynptr_get_spi(env, reg);

        /* -ERANGE only means the slots are not inside the allocated stack
         * yet, i.e. the stack state has not been updated so far.
         * check_mem_access will check and update the stack bounds later, so
         * this is not treated as an error here.
         *
         * There is no need to look at what the slots currently hold either:
         * overwriting existing unreferenced STACK_DYNPTR slots is allowed.
         * mark_stack_slots_dynptr calls destroy_if_dynptr_stack_slot so that
         * dynptr objects in the touched slots are completely destructed
         * before they are reinitialized. For referenced dynptrs
         * destroy_if_dynptr_stack_slot fails early instead of leaving the
         * user with a late "Unreleased reference" error.
         */
        return spi >= 0 || spi == -ERANGE;
}

/* Check whether @reg refers to a fully initialized dynptr.
 *
 * For a stack pointer this requires a valid slot index, the first_slot flag
 * on that slot, and every byte of both slots being tagged STACK_DYNPTR.
 */
static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
        struct bpf_func_state *state = func(env, reg);
        int spi, byte;

        /* A CONST_PTR_TO_DYNPTR register already stands for the first slot of
         * an initialized bpf_dynptr. check_func_arg_reg_off's logic ensures
         * its fixed offset and var_off are 0, so neither offset nor alignment
         * has to be checked here.
         */
        if (reg->type == CONST_PTR_TO_DYNPTR)
                return true;

        spi = dynptr_get_spi(env, reg);
        if (spi < 0)
                return false;

        if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
                return false;

        for (byte = 0; byte < BPF_REG_SIZE; byte++) {
                if (state->stack[spi].slot_type[byte] != STACK_DYNPTR)
                        return false;
                if (state->stack[spi - 1].slot_type[byte] != STACK_DYNPTR)
                        return false;
        }

        return true;
}

static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
                                    enum bpf_arg_type arg_type)
{
        struct bpf_func_state *state = func(env, reg);
        enum bpf_dynptr_type dynptr_type;
        int spi;

        /* ARG_PTR_TO_DYNPTR takes any type of dynptr */
        if (arg_type == ARG_PTR_TO_DYNPTR)
                return true;

        dynptr_type = arg_to_dynptr_type(arg_type);
        if (reg->type == CONST_PTR_TO_DYNPTR) {
                return reg->dynptr.type == dynptr_type;
        } else {
                spi = dynptr_get_spi(env, reg);
                if (spi < 0)
                        return false;
                return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
        }
}

/* Forward declarations of static helpers defined elsewhere in this file;
 * mark_stack_slots_iter() below uses all three.
 */
static void __mark_reg_known_zero(struct bpf_reg_state *reg);

static bool in_rcu_cs(struct bpf_verifier_env *env);

static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta);

/* Turn the @nr_slots stack slots addressed by @reg into an active iterator
 * of BTF type (@btf, @btf_id).
 *
 * A reference is acquired for @insn_idx and stored as ref_obj_id in the first
 * slot only. When the kfunc described by @meta is RCU protected, the slots
 * are tagged MEM_RCU inside an RCU critical section and PTR_UNTRUSTED outside
 * of one.
 *
 * Returns 0 on success or a negative error from iter_get_spi() or
 * acquire_reference_state().
 */
static int mark_stack_slots_iter(struct bpf_verifier_env *env,
                                 struct bpf_kfunc_call_arg_meta *meta,
                                 struct bpf_reg_state *reg, int insn_idx,
                                 struct btf *btf, u32 btf_id, int nr_slots)
{
        struct bpf_func_state *state = func(env, reg);
        int spi, i, byte, id;

        spi = iter_get_spi(env, reg, nr_slots);
        if (spi < 0)
                return spi;

        id = acquire_reference_state(env, insn_idx);
        if (id < 0)
                return id;

        for (i = 0; i < nr_slots; i++) {
                int cur_spi = spi - i;
                struct bpf_stack_state *slot = &state->stack[cur_spi];
                struct bpf_reg_state *st = &slot->spilled_ptr;

                __mark_reg_known_zero(st);
                /* there is no dedicated reg type for iterators */
                st->type = PTR_TO_STACK;
                if (is_kfunc_rcu_protected(meta))
                        st->type |= in_rcu_cs(env) ? MEM_RCU : PTR_UNTRUSTED;
                st->live |= REG_LIVE_WRITTEN;

                /* only the first slot carries the reference id */
                if (i == 0)
                        st->ref_obj_id = id;
                else
                        st->ref_obj_id = 0;

                st->iter.btf = btf;
                st->iter.btf_id = btf_id;
                st->iter.state = BPF_ITER_STATE_ACTIVE;
                st->iter.depth = 0;

                for (byte = 0; byte < BPF_REG_SIZE; byte++)
                        slot->slot_type[byte] = STACK_ITER;

                mark_stack_slot_scratched(env, cur_spi);
        }

        return 0;
}

/* Tear down the iterator occupying the @nr_slots stack slots addressed by
 * @reg: the reference held by the first slot is released and every slot is
 * reset to STACK_INVALID / not-init.
 *
 * Returns 0 on success or a negative error from iter_get_spi().
 */
static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
                                   struct bpf_reg_state *reg, int nr_slots)
{
        struct bpf_func_state *state = func(env, reg);
        int spi, i, byte;

        spi = iter_get_spi(env, reg, nr_slots);
        if (spi < 0)
                return spi;

        for (i = 0; i < nr_slots; i++) {
                int cur_spi = spi - i;
                struct bpf_stack_state *slot = &state->stack[cur_spi];
                struct bpf_reg_state *st = &slot->spilled_ptr;

                /* the reference id lives in the first slot only */
                if (i == 0)
                        WARN_ON_ONCE(release_reference(env, st->ref_obj_id));

                __mark_reg_not_init(env, st);

                /* REG_LIVE_WRITTEN is set for the same reason as in
                 * unmark_stack_slots_dynptr()
                 */
                st->live |= REG_LIVE_WRITTEN;

                for (byte = 0; byte < BPF_REG_SIZE; byte++)
                        slot->slot_type[byte] = STACK_INVALID;

                mark_stack_slot_scratched(env, cur_spi);
        }

        return 0;
}

/* Check whether the @nr_slots stack slots addressed by @reg may receive a new
 * iterator, i.e. none of their bytes is currently tagged STACK_ITER.
 */
static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
                                     struct bpf_reg_state *reg, int nr_slots)
{
        struct bpf_func_state *state = func(env, reg);
        int spi, slot_idx, byte;

        spi = iter_get_spi(env, reg, nr_slots);

        /* -ERANGE means the slots are not inside the allocated stack yet;
         * check_mem_access will check and update the stack bounds later, so
         * accept that case.
         */
        if (spi == -ERANGE)
                return true;
        if (spi < 0)
                return false;

        for (slot_idx = spi; slot_idx > spi - nr_slots; slot_idx--) {
                for (byte = 0; byte < BPF_REG_SIZE; byte++) {
                        if (state->stack[slot_idx].slot_type[byte] == STACK_ITER)
                                return false;
                }
        }

        return true;
}

/* Check that the @nr_slots stack slots addressed by @reg hold an initialized
 * iterator of BTF type (@btf, @btf_id).
 *
 * Returns 0 when they do, -EPROTO when a slot is tagged PTR_UNTRUSTED, and
 * -EINVAL for every other mismatch (bad slot index, wrong ref_obj_id layout,
 * wrong BTF type, or a byte that is not STACK_ITER).
 */
static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
                                   struct btf *btf, u32 btf_id, int nr_slots)
{
        struct bpf_func_state *state = func(env, reg);
        int spi, i, byte;

        spi = iter_get_spi(env, reg, nr_slots);
        if (spi < 0)
                return -EINVAL;

        for (i = 0; i < nr_slots; i++) {
                struct bpf_stack_state *slot = &state->stack[spi - i];
                struct bpf_reg_state *st = &slot->spilled_ptr;
                bool is_first = i == 0;
                bool has_ref = st->ref_obj_id != 0;

                if (st->type & PTR_UNTRUSTED)
                        return -EPROTO;

                /* the first slot, and only the first, must carry ref_obj_id */
                if (is_first != has_ref)
                        return -EINVAL;

                if (st->iter.btf != btf || st->iter.btf_id != btf_id)
                        return -EINVAL;

                for (byte = 0; byte < BPF_REG_SIZE; byte++) {
                        if (slot->slot_type[byte] != STACK_ITER)
                                return -EINVAL;
                }
        }

        return 0;
}

/* A stack slot counts as "special" when its last slot_type byte marks it as
 * one of:
 *   - spilled register state (STACK_SPILL);
 *   - dynptr state (STACK_DYNPTR);
 *   - iter state (STACK_ITER).
 * An unrecognized slot type triggers a one-time warning and is treated as
 * special too.
 */
static bool is_stack_slot_special(const struct bpf_stack_state *stack)
{
        enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1];

        if (type == STACK_SPILL || type == STACK_DYNPTR || type == STACK_ITER)
                return true;

        if (type == STACK_INVALID || type == STACK_MISC || type == STACK_ZERO)
                return false;

        WARN_ONCE(1, "unknown stack slot type %d\n", type);
        return true;
}

/* The reg state of a pointer or a bounded scalar was saved when
 * it was spilled to the stack.
 */
static bool is_spilled_reg(const struct bpf_stack_state *stack)
{
        return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL;
}

/* True when the slot holds a spilled register (judged by its last slot_type
 * byte) and that register is a SCALAR_VALUE.
 */
static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack)
{
        if (!is_spilled_reg(stack))
                return false;

        return stack->spilled_ptr.type == SCALAR_VALUE;
}

/* Like is_spilled_scalar_reg(), but judged by the first slot_type byte
 * instead of the last one.
 */
static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack)
{
        if (stack->slot_type[0] != STACK_SPILL)
                return false;

        return stack->spilled_ptr.type == SCALAR_VALUE;
}

/* Downgrade one slot_type byte to STACK_MISC, with two exceptions:
 *   - STACK_ZERO is kept, because it is the more precise state;
 *   - STACK_INVALID is kept when env->allow_ptr_leaks is set, since the two
 *     are equivalent in that case.
 * In unprivileged mode leaving STACK_INVALID in place would be wrong, so
 * there the byte is forced to STACK_MISC.
 */
static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype)
{
        bool keep = *stype == STACK_ZERO ||
                    (env->allow_ptr_leaks && *stype == STACK_INVALID);

        if (!keep)
                *stype = STACK_MISC;
}

/* Turn one slot_type byte into STACK_MISC unless it is STACK_INVALID. */
static void scrub_spilled_slot(u8 *stype)
{
        if (*stype == STACK_INVALID)
                return;

        *stype = STACK_MISC;
}

/* Copy array src of n elements of size bytes each into dst. dst is
 * reallocated if it is too small to hold src. This is different from krealloc
 * since the old contents of dst are not meant to be preserved.
 *
 * @dst:   destination buffer, may be NULL or ZERO_SIZE_PTR
 * @src:   source array
 * @n:     number of elements in src
 * @size:  size of one element in bytes
 * @flags: allocation flags passed on to krealloc
 *
 * Leaves dst untouched if src is NULL or ZERO_SIZE_PTR; in that case dst is
 * returned, with a NULL dst turned into ZERO_SIZE_PTR.
 *
 * Returns NULL if n * size overflows or memory could not be allocated. On
 * every NULL return the original dst has been freed, so a caller that
 * overwrites its pointer with the result does not leak the old buffer.
 */
static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t flags)
{
        size_t alloc_bytes;
        void *orig = dst;
        size_t bytes;

        if (ZERO_OR_NULL_PTR(src))
                goto out;

        /* Free orig here as well, so that both NULL-returning paths hand the
         * caller the same ownership state.
         */
        if (unlikely(check_mul_overflow(n, size, &bytes)))
                goto err_free;

        /* Never shrink: keep at least the capacity dst already has */
        alloc_bytes = max(ksize(orig), kmalloc_size_roundup(bytes));
        dst = krealloc(orig, alloc_bytes, flags);
        if (!dst)
                goto err_free;

        memcpy(dst, src, bytes);
out:
        return dst ? dst : ZERO_SIZE_PTR;

err_free:
        kfree(orig);
        return NULL;
}

/* Resize arr from old_n to new_n elements of size bytes each. Nothing is
 * done when new_n is zero or equal to old_n. When the array grows, the newly
 * added tail elements are zero-filled.
 *
 * Contrary to krealloc_array(), arr is not freed when new_n is zero.
 *
 * Returns the (possibly moved) array, ZERO_SIZE_PTR if the result would be
 * NULL without an allocation failure, or NULL on allocation failure, in
 * which case arr has been freed.
 */
static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size)
{
        void *resized;
        size_t want;

        if (new_n && new_n != old_n) {
                want = kmalloc_size_roundup(size_mul(new_n, size));
                resized = krealloc(arr, want, GFP_KERNEL);
                if (!resized) {
                        kfree(arr);
                        return NULL;
                }
                arr = resized;

                /* zero only the freshly added elements */
                if (old_n < new_n)
                        memset(arr + old_n * size, 0, (new_n - old_n) * size);
        }

        return arr ? arr : ZERO_SIZE_PTR;
}

/* Duplicate src's reference array and its acquired_refs count into dst.
 * Returns 0 on success or -ENOMEM when copy_array() fails; in the failure
 * case dst->refs is left NULL and dst->acquired_refs is not updated.
 */
static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_func_state *src)
{
        void *refs;

        refs = copy_array(dst->refs, src->refs, src->acquired_refs,
                          sizeof(struct bpf_reference_state), GFP_KERNEL);
        dst->refs = refs;
        if (!refs)
                return -ENOMEM;

        dst->acquired_refs = src->acquired_refs;
        return 0;
}

/* Duplicate src's stack slot array (allocated_stack / BPF_REG_SIZE entries)
 * and its allocated_stack size into dst. Returns 0 on success or -ENOMEM
 * when copy_array() fails; in the failure case dst->stack is left NULL and
 * dst->allocated_stack is not updated.
 */
static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_state *src)
{
        size_t slots = src->allocated_stack / BPF_REG_SIZE;
        void *stack;

        stack = copy_array(dst->stack, src->stack, slots, sizeof(struct bpf_stack_state),
                           GFP_KERNEL);
        dst->stack = stack;
        if (!stack)
                return -ENOMEM;

        dst->allocated_stack = src->allocated_stack;
        return 0;
}

/* Resize state->refs to hold n entries and record n as acquired_refs.
 * Returns 0 on success or -ENOMEM when realloc_array() fails; in the failure
 * case state->refs is left NULL and acquired_refs is not updated.
 */
static int resize_reference_state(struct bpf_func_state *state, size_t n)
{
        void *refs;

        refs = realloc_array(state->refs, state->acquired_refs, n,
                             sizeof(struct bpf_reference_state));
        state->refs = refs;
        if (!refs)
                return -ENOMEM;

        state->acquired_refs = n;
        return 0;
}

/* Make sure state->allocated_stack covers at least size bytes (rounded up to
 * a multiple of BPF_REG_SIZE). The stack is never shrunk. When it does grow,
 * the high-water mark (stack_depth) in the function's subprog_info entry is
 * raised too if the new size exceeds it.
 *
 * Returns 0 on success (including when no growth was needed) or -ENOMEM.
 */
static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size)
{
        size_t have_slots = state->allocated_stack / BPF_REG_SIZE;
        size_t want_slots;

        /* The stack size is always a multiple of BPF_REG_SIZE. */
        size = round_up(size, BPF_REG_SIZE);
        want_slots = size / BPF_REG_SIZE;

        if (want_slots <= have_slots)
                return 0;

        state->stack = realloc_array(state->stack, have_slots, want_slots,
                                     sizeof(struct bpf_stack_state));
        if (!state->stack)
                return -ENOMEM;

        state->allocated_stack = size;

        /* bump the known max for this subprogram if we went past it */
        if (env->subprog_info[state->subprogno].stack_depth < size)
                env->subprog_info[state->subprogno].stack_depth = size;

        return 0;
}

/* Take a fresh pointer id from env->id_gen and append a matching entry to
 * the current function's state->refs array.
 * The new entry records the id, insn_idx and, when the function state is a
 * callback, the frame number in callback_ref (0 otherwise).
 * On success, returns the new pointer id to associate with the register.
 * On failure, returns a negative errno.
 */
static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
{
        struct bpf_func_state *state = cur_func(env);
        int slot = state->acquired_refs;
        int err, id;

        err = resize_reference_state(state, state->acquired_refs + 1);
        if (err)
                return err;

        env->id_gen++;
        id = env->id_gen;

        state->refs[slot].id = id;
        state->refs[slot].insn_idx = insn_idx;
        if (state->in_callback_fn)
                state->refs[slot].callback_ref = state->frameno;
        else
                state->refs[slot].callback_ref = 0;

        return id;
}

/* Release function corresponding to acquire_reference_state().
 * Finds the entry whose id equals ptr_id, overwrites it with the last entry
 * of the array (unless it already is the last one), clears the last entry
 * and decrements acquired_refs.
 * Returns 0 on success. Returns -EINVAL when no entry matches ptr_id, or
 * when called for a callback frame on a reference whose callback_ref is not
 * this frame (caller references cannot be released in callbacks).
 */
static int release_reference_state(struct bpf_func_state *state, int ptr_id)
{
        int last = state->acquired_refs - 1;
        int idx;

        for (idx = 0; idx < state->acquired_refs; idx++) {
                if (state->refs[idx].id != ptr_id)
                        continue;

                /* Cannot release caller references in callbacks */
                if (state->in_callback_fn && state->refs[idx].callback_ref != state->frameno)
                        return -EINVAL;

                /* fill the hole with the tail entry, then wipe the tail */
                if (last && idx != last)
                        memcpy(&state->refs[idx], &state->refs[last],
                               sizeof(*state->refs));
                memset(&state->refs[last], 0, sizeof(*state->refs));
                state->acquired_refs--;
                return 0;
        }

        return -EINVAL;
}

/* Free a function state together with its refs and stack arrays.
 * A NULL state is accepted and ignored.
 */
static void free_func_state(struct bpf_func_state *state)
{
        if (state) {
                kfree(state->refs);
                kfree(state->stack);
                kfree(state);
        }
}

/* Free the jump history buffer of state and reset the pointer and the
 * entry count so the state reads as having no history.
 */
static void clear_jmp_history(struct bpf_verifier_state *state)
{
        kfree(state->jmp_history);
        state->jmp_history_cnt = 0;
        state->jmp_history = NULL;
}

/* Free every frame in state->frame[0..curframe] (NULLing each slot) and the
 * jump history. The state structure itself is freed only when free_self is
 * true.
 */
static void free_verifier_state(struct bpf_verifier_state *state,
                                bool free_self)
{
        int fr = 0;

        while (fr <= state->curframe) {
                free_func_state(state->frame[fr]);
                state->frame[fr] = NULL;
                fr++;
        }

        clear_jmp_history(state);

        if (!free_self)
                return;
        kfree(state);
}

/* Copy function state from src to dst, growing dst's arrays when necessary
 * to accommodate a larger src.
 * Every field located before acquired_refs is copied with a single memcpy();
 * the refs and stack arrays are then duplicated through their own helpers.
 * Returns 0 on success or the error from the first helper that fails.
 */
static int copy_func_state(struct bpf_func_state *dst,
                           const struct bpf_func_state *src)
{
        int err;

        memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs));

        err = copy_reference_state(dst, src);
        if (!err)
                err = copy_stack_state(dst, src);

        return err;
}

static int copy_verifier_state(struct bpf_verifier_state *dst_state,
                               const struct bpf_verifier_state *src)
{
        struct bpf_func_state *dst;
        int i, err;

        dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
                                          src->jmp_history_cnt, sizeof(*dst_state->jmp_history),
                                          GFP_USER);
        if (!dst_state->jmp_history)
                return -ENOMEM;
        dst_state->jmp_history_cnt = src->jmp_history_cnt;

        /* if dst has more stack frames then src frame, free them, this is also
         * necessary in case of exceptional exits using bpf_throw.
         */
        for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
                free_func_state(dst_state->frame[i]);
                dst_state->frame[i] = NULL;
        }
        dst_state->speculative = src->speculative;
        dst_state->active_rcu_lock = src->active_rcu_lock;
        dst_state->active_preempt_lock = src->active_preempt_lock;
        dst_state->in_sleepable = src->in_sleepable;
        dst_state->curframe = src->curframe;
        dst_state->active_lock.ptr = src->active_lock.ptr;
        dst_state->active_lock.id = src->active_lock.id;
        dst_state->branches = src->branches;
        dst_state->parent = src->parent;
        dst_state->first_insn_idx = src->first_insn_idx;
        dst_state->last_insn_idx = src->last_insn_idx;
        dst_state->dfs_depth = src->dfs_depth;
        dst_state->callback_unroll_depth = src->callback_unroll_depth;
        dst_state->used_as_loop_entry = src->used_as_loop_entry;
        dst_state->may_goto_depth = src->may_goto_depth;
        for (i = 0; i <= src->curframe; i++) {
                dst = dst_state->frame[i];
                if (!dst) {
                        dst = kzalloc(sizeof(*dst), GFP_KERNEL);
                        if (!dst)
                                return -ENOMEM;
                        dst_state->frame[i] = dst;
                }
                err = copy_func_state(dst, src->frame[i]);
                if (err)
                        return err;
        }
        return 0;
}

/* Size of the explored-states hash table: equal to env->prog->len
 * (presumably the program's instruction count -- confirm against
 * struct bpf_prog). explored_state() uses it as the modulus when picking
 * a bucket.
 */
static u32 state_htab_size(struct bpf_verifier_env *env)
{
        return env->prog->len;
}

/* Return the env->explored_states bucket for instruction index idx.
 * The bucket index mixes idx with the callsite of the current state's
 * topmost frame and is reduced modulo state_htab_size().
 */
static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx)
{
        struct bpf_verifier_state *vstate = env->cur_state;
        struct bpf_func_state *top = vstate->frame[vstate->curframe];

        return env->explored_states + ((idx ^ top->callsite) % state_htab_size(env));
}

/* Two states have the same callsites when they are at the same frame depth
 * and every frame pair, from the current frame down to frame 0, has an
 * equal callsite.
 */
static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_state *b)
{
        int fr;

        if (a->curframe != b->curframe)
                return false;

        fr = a->curframe;
        while (fr >= 0) {
                if (a->frame[fr]->callsite != b->frame[fr]->callsite)
                        return false;
                fr--;
        }

        return true;
}

/* Open coded iterators allow back-edges in the state graph in order to
 * check unbounded loops that use iterators.
 *
 * In is_state_visited() it is necessary to know if explored states are
 * part of some loops in order to decide whether non-exact states
 * comparison could be used:
 * - non-exact states comparison establishes sub-state relation and uses
 *   read and precision marks to do so, these marks are propagated from
 *   children states and thus are not guaranteed to be final in a loop;
 * - exact states comparison just checks if current and explored states
 *   are identical (and thus form a back-edge).
 *
 * Paper "A New Algorithm for Identifying Loops in Decompilation"
 * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient
 * algorithm for loop structure detection and gives an overview of
 * relevant terminology. It also has helpful illustrations.
 *
 * [1] https://api.semanticscholar.org/CorpusID:15784067
 *
 * We use a similar algorithm but because loop nested structure is
 * irrelevant for verifier ours is significantly simpler and resembles
 * strongly connected components algorithm from Sedgewick's textbook.
 *
 * Define topmost loop entry as a first node of the loop traversed in a
 * depth first search starting from initial state. The goal of the loop
 * tracking algorithm is to associate topmost loop entries with states
 * derived from these entries.
 *
 * For each step in the DFS states traversal algorithm needs to identify
 * the following situations:
 *
 *          initial                     initial                   initial
 *            |                           |                         |
 *            V                           V                         V
 *           ...                         ...           .---------> hdr
 *            |                           |            |            |
 *            V                           V            |            V
 *           cur                     .-> succ          |    .------...
 *            |                      |    |            |    |       |
 *            V                      |    V            |    V       V
 *           succ                    '-- cur           |   ...     ...
 *                                                     |    |       |
 *                                                     |    V       V
 *                                                     |   succ <- cur
 *                                                     |    |
 *                                                     |    V
 *                                                     |   ...
 *                                                     |    |
 *                                                     '----'
 *
 *  (A) successor state of cur   (B) successor state of cur or its entry
 *      not yet traversed            are in current DFS path, thus cur and succ
 *                                   are members of the same outermost loop
 *
 *                      initial                  initial
 *                        |                        |
 *                        V                        V
 *                       ...                      ...
 *                        |                        |
 *                        V                        V
 *                .------...               .------...
 *                |       |                |       |
 *                V       V                V       V
 *           .-> hdr     ...              ...     ...
 *           |    |       |                |       |
 *           |    V       V                V       V
 *           |   succ <- cur              succ <- cur
 *           |    |                        |
 *           |    V                        V
 *           |   ...                      ...
 *           |    |                        |
 *           '----'                       exit
 *
 * (C) successor state of cur is a part of some loop but this loop
 *     does not include cur or successor state is not in a loop at all.
 *
 * Algorithm could be described as the following python code:
 *
 *     traversed = set()   # Set of traversed nodes
 *     entries = {}        # Mapping from node to loop entry
 *     depths = {}         # Depth level assigned to graph node
 *     path = set()        # Current DFS path
 *
 *     # Find outermost loop entry known for n
 *     def get_loop_entry(n):
 *         h = entries.get(n, None)
 *         while h in entries and entries[h] != h:
 *             h = entries[h]
 *         return h
 *
 *     # Update n's loop entry if h's outermost entry comes
 *     # before n's outermost entry in current DFS path.
 *     def update_loop_entry(n, h):
 *         n1 = get_loop_entry(n) or n
 *         h1 = get_loop_entry(h) or h
 *         if h1 in path and depths[h1] <= depths[n1]:
 *             entries[n] = h1
 *
 *     def dfs(n, depth):
 *         traversed.add(n)
 *         path.add(n)
 *         depths[n] = depth
 *         for succ in G.successors(n):
 *             if succ not in traversed:
 *                 # Case A: explore succ and update cur's loop entry
 *                 #         only if succ's entry is in current DFS path.
 *                 dfs(succ, depth + 1)
 *                 h = get_loop_entry(succ)
 *                 update_loop_entry(n, h)
 *             else:
 *                 # Case B or C depending on `h1 in path` check in update_loop_entry().
 *                 update_loop_entry(n, succ)
 *         path.remove(n)
 *
 * To adapt this algorithm for use with verifier:
 * - use st->branches == 0 as a signal that DFS of succ had been finished
 *   and cur's loop entry has to be updated (case A), handle this in
 *   update_branch_counts();
 * - use st->branches > 0 as a signal that st is in the current DFS path;
 * - handle cases B and C in is_state_visited();
 * - update topmost loop entry for intermediate states in get_loop_entry().
 */
static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st)
{
        struct bpf_verifier_state *root = st->loop_entry;
        struct bpf_verifier_state *next;

        /* Follow the loop_entry chain until it ends or reaches a state that
         * is its own loop entry.
         */
        for (; root && root->loop_entry && root->loop_entry != root;
             root = root->loop_entry)
                ;

        /* Point every intermediate state straight at the topmost entry so
         * that future get_loop_entry() calls can skip this traversal.
         */
        while (st && st->loop_entry != root) {
                next = st->loop_entry;
                st->loop_entry = root;
                st = next;
        }

        return root;
}

/* Record hdr as cur's loop entry when hdr's topmost loop entry is still on
 * the current DFS path and is not deeper than cur's topmost loop entry.
 */
static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
{
        struct bpf_verifier_state *cur_top, *hdr_top;

        /* a state without a known loop entry stands in for itself */
        cur_top = get_loop_entry(cur);
        if (!cur_top)
                cur_top = cur;
        hdr_top = get_loop_entry(hdr);
        if (!hdr_top)
                hdr_top = hdr;

        /* The hdr1->branches check decides between cases B and C in the
         * comment for get_loop_entry(). If it is zero then hdr's topmost
         * loop entry is not in the current DFS path, hence 'cur' and 'hdr'
         * are not in the same loop and there is no need to update
         * cur->loop_entry.
         */
        if (!hdr_top->branches)
                return;
        if (hdr_top->dfs_depth > cur_top->dfs_depth)
                return;

        cur->loop_entry = hdr;
        hdr->used_as_loop_entry = true;
}

/* Decrement the branches counter of st and keep walking up the parent chain
 * for as long as each decremented counter reaches zero.
 */
static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{
        for (; st; st = st->parent) {
                u32 br;

                st->branches--;
                br = st->branches;

                /* br == 0 signals that DFS exploration for 'st' is finished,
                 * thus it is necessary to update parent's loop entry if it
                 * turned out that st is a part of some loop.
                 * This is a part of 'case A' in get_loop_entry() comment.
                 */
                if (br == 0 && st->parent && st->loop_entry)
                        update_loop_entry(st->parent, st->loop_entry);

                /* WARN_ON(br > 1) technically makes sense here,
                 * but see comment in push_stack(), hence:
                 */
                WARN_ONCE((int)br < 0,
                          "BUG update_branch_counts:branches_to_explore=%d\n",
                          br);

                /* this state still has branches pending: stop climbing */
                if (br != 0)
                        break;
        }
}

/* Pop the top element of env's pending-state stack.
 * If env->cur_state is set, the popped state is copied into it first. When
 * pop_log is true the log is rewound to the position saved in the element.
 * insn_idx / prev_insn_idx, when non-NULL, receive the element's saved
 * instruction indices.
 * Returns 0 on success, -ENOENT when the stack is empty, or the error from
 * copy_verifier_state() (in which case the element is left on the stack).
 */
static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
                     int *insn_idx, bool pop_log)
{
        struct bpf_verifier_state *cur = env->cur_state;
        struct bpf_verifier_stack_elem *top = env->head;
        int err;

        if (!top)
                return -ENOENT;

        if (cur) {
                err = copy_verifier_state(cur, &top->st);
                if (err)
                        return err;
        }

        if (pop_log)
                bpf_vlog_reset(&env->log, top->log_pos);
        if (insn_idx)
                *insn_idx = top->insn_idx;
        if (prev_insn_idx)
                *prev_insn_idx = top->prev_insn_idx;

        /* unlink, then release the element and the state embedded in it */
        env->head = top->next;
        free_verifier_state(&top->st, false);
        kfree(top);
        env->stack_size--;

        return 0;
}

/* Push a copy of the current verifier state onto env's pending-state stack,
 * tagged with the instruction indices to resume from and the current log
 * position.
 *
 * Returns a pointer to the state embedded in the new stack element, or NULL
 * on failure (allocation failure, state copy failure, or the stack growing
 * past BPF_COMPLEXITY_LIMIT_JMP_SEQ). On failure env->cur_state is freed and
 * set to NULL and every element of the stack is popped.
 */
static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
                                             int insn_idx, int prev_insn_idx,
                                             bool speculative)
{
        struct bpf_verifier_state *cur = env->cur_state;
        struct bpf_verifier_stack_elem *elem;
        int err;

        elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
        if (!elem)
                goto err;

        elem->insn_idx = insn_idx;
        elem->prev_insn_idx = prev_insn_idx;
        elem->next = env->head;
        elem->log_pos = env->log.end_pos;
        /* Link elem before copying the state: if anything below fails, the
         * pop_stack() loop under 'err' is what frees it.
         */
        env->head = elem;
        env->stack_size++;
        err = copy_verifier_state(&elem->st, cur);
        if (err)
                goto err;
        /* the pushed state is speculative if cur was or if the caller asks */
        elem->st.speculative |= speculative;
        if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
                verbose(env, "The sequence of %d jumps is too complex.\n",
                        env->stack_size);
                goto err;
        }
        if (elem->st.parent) {
                ++elem->st.parent->branches;
                /* WARN_ON(branches > 2) technically makes sense here,
                 * but
                 * 1. speculative states will bump 'branches' for non-branch
                 * instructions
                 * 2. is_state_visited() heuristics may decide not to create
                 * a new state for a sequence of branches and all such current
                 * and cloned states will be pointing to a single parent state
                 * which might have large 'branches' count.
                 */
        }
        return &elem->st;
err:
        free_verifier_state(env->cur_state, true);
        env->cur_state = NULL;
        /* pop all elements and return */
        while (!pop_stack(env, NULL, NULL, false));
        return NULL;
}

/* Registers BPF_REG_0 through BPF_REG_5, listed as the caller-saved set.
 * CALLER_SAVED_REGS is the number of entries in the table.
 */
#define CALLER_SAVED_REGS 6
static const int caller_saved[CALLER_SAVED_REGS] = {
        BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};

/* Collapse the tnum and all 64-bit and 32-bit bounds of reg onto the single
 * constant imm. This helper doesn't clear reg->id.
 */
static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
        /* 32-bit bounds take the truncated value */
        reg->u32_min_value = (u32)imm;
        reg->u32_max_value = (u32)imm;
        reg->s32_min_value = (s32)imm;
        reg->s32_max_value = (s32)imm;

        /* 64-bit bounds */
        reg->umin_value = imm;
        reg->umax_value = imm;
        reg->smin_value = (s64)imm;
        reg->smax_value = (s64)imm;

        reg->var_off = tnum_const(imm);
}

/* Mark the unknown part of a register (variable offset or scalar value) as
 * known to have the value @imm. Everything stored between the type field
 * and var_off is zeroed, and id and ref_obj_id are reset as well.
 */
static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
        u8 *after_type = ((u8 *)reg) + sizeof(reg->type);
        size_t span = offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type);

        /* Clear off and union(map_ptr, range) */
        memset(after_type, 0, span);

        reg->ref_obj_id = 0;
        reg->id = 0;

        ___mark_reg_known(reg, imm);
}

/* Mark only the 32-bit subregister of reg as known to hold imm: the 32-bit
 * bounds collapse onto imm and var_off is updated via tnum_const_subreg().
 * The 64-bit bounds are not touched here.
 */
static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm)
{
        reg->u32_min_value = (u32)imm;
        reg->u32_max_value = (u32)imm;
        reg->s32_min_value = (s32)imm;
        reg->s32_max_value = (s32)imm;

        reg->var_off = tnum_const_subreg(reg->var_off, imm);
}

/* Mark the 'variable offset' part of a register as zero.  This should be
 * used only on registers holding a pointer type.
 *
 * Thin wrapper around __mark_reg_known(reg, 0); reg->type is not changed
 * here.
 */
static void __mark_reg_known_zero(struct bpf_reg_state *reg)
{
        __mark_reg_known(reg, 0);
}

/* Turn reg into a SCALAR_VALUE known to be the constant zero. */
static void __mark_reg_const_zero(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
        __mark_reg_known(reg, 0);

        /* all scalars are assumed imprecise initially (unless unprivileged,
         * in which case everything is forced to be precise)
         */
        reg->precise = !env->bpf_capable;
        reg->type = SCALAR_VALUE;
}

/* Bounds-checked front end for __mark_reg_known_zero() on regs[regno].
 * An out-of-range regno triggers a warning, is reported through verbose(),
 * and every register in regs is marked as not initialised instead.
 */
static void mark_reg_known_zero(struct bpf_verifier_env *env,
                                struct bpf_reg_state *regs, u32 regno)
{
        if (!WARN_ON(regno >= MAX_BPF_REG)) {
                __mark_reg_known_zero(regs + regno);
                return;
        }

        verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
        /* Something bad happened, let's kill all regs */
        for (regno = 0; regno < MAX_BPF_REG; regno++)
                __mark_reg_not_init(env, &regs[regno]);
}

/* Initialise reg as a dynptr of the given type: known-zero offset, type
 * CONST_PTR_TO_DYNPTR, id set to dynptr_id, plus the dynptr type and
 * first_slot flag.
 */
static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type,
                              bool first_slot, int dynptr_id)
{
        __mark_reg_known_zero(reg);

        reg->dynptr.first_slot = first_slot;
        reg->dynptr.type = type;
        /* Give each dynptr a unique id to uniquely associate slices to it. */
        reg->id = dynptr_id;
        /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for
         * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply
         * set it unconditionally as it is ignored for STACK_DYNPTR anyway.
         */
        reg->type = CONST_PTR_TO_DYNPTR;
}

/* Convert a maybe-NULL pointer register into its non-NULL form.
 * For anything whose base type is not PTR_TO_MAP_VALUE this just clears
 * PTR_MAYBE_NULL. For map values the resulting type depends on the map:
 * a map with inner_map_meta yields CONST_PTR_TO_MAP pointing at the inner
 * map, XSKMAP yields PTR_TO_XDP_SOCK, SOCKMAP/SOCKHASH yield PTR_TO_SOCKET,
 * and every other map yields a plain PTR_TO_MAP_VALUE.
 */
static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
{
        const struct bpf_map *map;

        if (base_type(reg->type) != PTR_TO_MAP_VALUE) {
                reg->type &= ~PTR_MAYBE_NULL;
                return;
        }

        map = reg->map_ptr;

        if (map->inner_map_meta) {
                reg->type = CONST_PTR_TO_MAP;
                reg->map_ptr = map->inner_map_meta;
                /* transfer reg's id which is unique for every map_lookup_elem
                 * as UID of the inner map.
                 */
                if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER))
                        reg->map_uid = reg->id;
                if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE))
                        reg->map_uid = reg->id;
                return;
        }

        switch (map->map_type) {
        case BPF_MAP_TYPE_XSKMAP:
                reg->type = PTR_TO_XDP_SOCK;
                break;
        case BPF_MAP_TYPE_SOCKMAP:
        case BPF_MAP_TYPE_SOCKHASH:
                reg->type = PTR_TO_SOCKET;
                break;
        default:
                reg->type = PTR_TO_MAP_VALUE;
                break;
        }
}

/* Set regs[regno] up as a PTR_TO_BTF_ID | MEM_ALLOC pointer described by
 * ds_head: its btf, value_btf_id and node_offset (stored in reg->off).
 */
static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno,
                                struct btf_field_graph_root *ds_head)
{
        struct bpf_reg_state *reg = regs + regno;

        __mark_reg_known_zero(reg);

        reg->off = ds_head->node_offset;
        reg->btf_id = ds_head->value_btf_id;
        reg->btf = ds_head->btf;
        reg->type = PTR_TO_BTF_ID | MEM_ALLOC;
}

/* True when type_is_pkt_pointer() accepts reg->type. */
static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
{
        return type_is_pkt_pointer(reg->type);
}

/* Like reg_is_pkt_pointer(), but additionally accepts PTR_TO_PACKET_END. */
static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
{
        if (reg_is_pkt_pointer(reg))
                return true;

        return reg->type == PTR_TO_PACKET_END;
}

/* True for a PTR_TO_MEM register whose type carries the DYNPTR_TYPE_SKB or
 * DYNPTR_TYPE_XDP flag.
 */
static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg)
{
        if (base_type(reg->type) != PTR_TO_MEM)
                return false;

        return (reg->type & (DYNPTR_TYPE_SKB | DYNPTR_TYPE_XDP)) != 0;
}

/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access: the type
 * equals which, and id, off and the variable offset are all zero.
 */
static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
                                    enum bpf_reg_type which)
{
        /* The register can already have a range from prior markings.
         * This is fine as long as it hasn't been advanced from its
         * origin.
         */
        if (reg->type != which)
                return false;
        if (reg->id != 0 || reg->off != 0)
                return false;

        return tnum_equals_const(reg->var_off, 0);
}

/* Reset every min/max bound of a register, 64-bit and 32-bit, signed and
 * unsigned, to the widest possible range.
 */
static void __mark_reg_unbounded(struct bpf_reg_state *reg)
{
        /* unsigned ranges: [0, MAX] */
        reg->umin_value = 0;
        reg->umax_value = U64_MAX;
        reg->u32_min_value = 0;
        reg->u32_max_value = U32_MAX;

        /* signed ranges: [MIN, MAX] */
        reg->smin_value = S64_MIN;
        reg->smax_value = S64_MAX;
        reg->s32_min_value = S32_MIN;
        reg->s32_max_value = S32_MAX;
}

/* Reset only the 64-bit signed and unsigned bounds of reg to the widest
 * possible range; the 32-bit bounds are left alone.
 */
static void __mark_reg64_unbounded(struct bpf_reg_state *reg)
{
        reg->umin_value = 0;
        reg->umax_value = U64_MAX;
        reg->smin_value = S64_MIN;
        reg->smax_value = S64_MAX;
}

/* Reset only the 32-bit signed and unsigned bounds of reg to the widest
 * possible range; the 64-bit bounds are left alone.
 */
static void __mark_reg32_unbounded(struct bpf_reg_state *reg)
{
        reg->u32_min_value = 0;
        reg->u32_max_value = U32_MAX;
        reg->s32_min_value = S32_MIN;
        reg->s32_max_value = S32_MAX;
}

static void __update_reg32_bounds(struct bpf_reg_state *reg)
{
        struct tnum var32_off = tnum_subreg(reg->var_off);

        /* min signed is max(sign bit) | min(other bits) */
        reg->s32_min_value = max_t(s32, reg->s32_min_value,
                        var32_off.value | (var32_off.mask & S32_MIN));
        /* max signed is min(sign bit) | max(other bits) */
        reg->s32_max_value = min_t(s32, reg->s32_max_value,
                        var32_off.value | (var32_off.mask & S32_MAX));
        reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value);
        reg->u32_max_value = min(reg->u32_max_value,
                                 (u32)(var32_off.value | var32_off.mask));
}

static void __update_reg64_bounds(struct bpf_reg_state *reg)
{
        /* min signed is max(sign bit) | min(other bits) */
        reg->smin_value = max_t(s64, reg->smin_value,
                                reg->var_off.value | (reg->var_off.mask & S64_MIN));
        /* max signed is min(sign bit) | max(other bits) */
        reg->smax_value = min_t(s64, reg->smax_value,
                                reg->var_off.value | (reg->var_off.mask & S64_MAX));
        reg->umin_value = max(reg->umin_value, reg->var_off.value);
        reg->umax_value = min(reg->umax_value,
                              reg->var_off.value | reg->var_off.mask);
}

/* Refresh both the 32-bit and the 64-bit bounds of reg from its var_off by
 * calling __update_reg32_bounds() and then __update_reg64_bounds().
 */
static void __update_reg_bounds(struct bpf_reg_state *reg)
{
        __update_reg32_bounds(reg);
        __update_reg64_bounds(reg);
}

/* Uses signed min/max values to inform unsigned, and vice-versa.
 *
 * Refines the 32-bit (subregister) bounds of reg from its 64-bit bounds and
 * from each other. Every step only narrows a range (max of minimums, min of
 * maximums). Later steps read bounds that earlier steps may have tightened,
 * so the order of the steps matters.
 */
static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
{
        /* If upper 32 bits of u64/s64 range don't change, we can use lower 32
         * bits to improve our u32/s32 boundaries.
         *
         * E.g., the case where we have upper 32 bits as zero ([10, 20] in
         * u64) is pretty trivial, it's obvious that in u32 we'll also have
         * [10, 20] range. But this property holds for any 64-bit range as
         * long as upper 32 bits in that entire range of values stay the same.
         *
         * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311]
         * in decimal) has the same upper 32 bits throughout all the values in
         * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15])
         * range.
         *
         * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32,
         * following the rules outlined below about u64/s64 correspondence
         * (which equally applies to u32 vs s32 correspondence). In general it
         * depends on actual hexadecimal values of 32-bit range. They can form
         * only valid u32, or only valid s32 ranges in some cases.
         *
         * So we use all these insights to derive bounds for subregisters here.
         */
        if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) {
                /* u64 to u32 casting preserves validity of low 32 bits as
                 * a range, if upper 32 bits are the same
                 */
                reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value);
                reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value);

                /* low 32 bits also usable as s32 only if they stay ordered */
                if ((s32)reg->umin_value <= (s32)reg->umax_value) {
                        reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
                        reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
                }
        }
        if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) {
                /* low 32 bits should form a proper u32 range */
                if ((u32)reg->smin_value <= (u32)reg->smax_value) {
                        reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value);
                        reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value);
                }
                /* low 32 bits should form a proper s32 range */
                if ((s32)reg->smin_value <= (s32)reg->smax_value) {
                        reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
                        reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
                }
        }
        /* Special case where upper bits form a small sequence of two
         * sequential numbers (in 32-bit unsigned space, so 0xffffffff to
         * 0x00000000 is also valid), while lower bits form a proper s32 range
         * going from negative numbers to positive numbers. E.g., let's say we
         * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]).
         * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff,
         * 0x0000000000000000, 0x0000000000000001}). Ignoring upper 32 bits,
         * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]).
         * Note that it doesn't have to be 0xffffffff going to 0x00000000 in
         * upper 32 bits. As a random example, s64 range
         * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range
         * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister.
         */
        if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) &&
            (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) {
                reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
                reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
        }
        if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) &&
            (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) {
                reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
                reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
        }
        /* if u32 range forms a valid s32 range (due to matching sign bit),
         * try to learn from that
         */
        if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) {
                reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value);
                reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value);
        }
        /* If we cannot cross the sign boundary, then signed and unsigned bounds
         * are the same, so combine.  This works even in the negative case, e.g.
         * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
         */
        if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
                reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value);
                reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value);
        }
}

static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
{
        /* If u64 range forms a valid s64 range (due to matching sign bit),
         * try to learn from that. Let's do a bit of ASCII art to see when
         * this is happening. Let's take u64 range first:
         *
         * 0             0x7fffffffffffffff 0x8000000000000000        U64_MAX
         * |-------------------------------|--------------------------------|
         *
         * Valid u64 range is formed when umin and umax are anywhere in the
         * range [0, U64_MAX], and umin <= umax. u64 case is simple and
         * straightforward. Let's see how s64 range maps onto the same range
         * of values, annotated below the line for comparison:
         *
         * 0             0x7fffffffffffffff 0x8000000000000000        U64_MAX
         * |-------------------------------|--------------------------------|
         * 0                        S64_MAX S64_MIN                        -1
         *
         * So s64 values basically start in the middle and they are logically
         * contiguous to the right of it, wrapping around from -1 to 0, and
         * then finishing as S64_MAX (0x7fffffffffffffff) right before
         * S64_MIN. We can try drawing the continuity of u64 vs s64 values
         * more visually as mapped to sign-agnostic range of hex values.
         *
         *  u64 start                                               u64 end
         *  _______________________________________________________________
         * /                                                               \
         * 0             0x7fffffffffffffff 0x8000000000000000        U64_MAX
         * |-------------------------------|--------------------------------|
         * 0                        S64_MAX S64_MIN                        -1
         *                                / \
         * >------------------------------   ------------------------------->
         * s64 continues...        s64 end   s64 start          s64 "midpoint"
         *
         * What this means is that, in general, we can't always derive
         * something new about u64 from any random s64 range, and vice versa.
         *
         * But we can do that in two particular cases. One is when entire
         * u64/s64 range is *entirely* contained within left half of the above
         * diagram or when it is *entirely* contained in the right half. I.e.:
         *
         * |-------------------------------|--------------------------------|
         *     ^                   ^            ^                 ^
         *     A                   B            C                 D
         *
         * [A, B] and [C, D] are contained entirely in their respective halves
         * and form valid contiguous ranges as both u64 and s64 values. [A, B]
         * will be non-negative both as u64 and s64 (and in fact it will be
         * identical ranges no matter the signedness). [C, D] treated as s64
         * will be a range of negative values, while in u64 it will be
         * non-negative range of values larger than 0x8000000000000000.
         *
         * Now, any other range here can't be represented in both u64 and s64
         * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid
         * contiguous u64 ranges, but they are discontinuous in s64. [B, C]
         * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX],
         * for example. Similarly, valid s64 range [D, A] (going from negative
         * to positive values), would be two separate [D, U64_MAX] and [0, A]
         * ranges as u64. Currently reg_state can't represent two segments per
         * numeric domain, so in such situations we can only derive maximal
         * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64).
         *
         * So we use these facts to derive umin/umax from smin/smax and vice
         * versa only if they stay within the same "half". This is equivalent
         * to checking sign bit: lower half will have sign bit as zero, upper
         * half have sign bit 1. Below in code we simplify this by just
         * casting umin/umax as smin/smax and checking if they form valid
         * range, and vice versa. Those are equivalent checks.
         */
        if ((s64)reg->umin_value <= (s64)reg->umax_value) {
                reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value);
                reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value);
        }
        /* If we cannot cross the sign boundary, then signed and unsigned bounds
         * are the same, so combine.  This works even in the negative case, e.g.
         * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
         */
        if ((u64)reg->smin_value <= (u64)reg->smax_value) {
                reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value);
                reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value);
        }
}

static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
{
        /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit
         * values on both sides of 64-bit range in hope to have tighter range.
         * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from
         * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff].
         * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound
         * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of
         * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a
         * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff].
         * We just need to make sure that derived bounds we are intersecting
         * with are well-formed ranges in respective s64 or u64 domain, just
         * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments.
         */
        __u64 new_umin, new_umax;
        __s64 new_smin, new_smax;

        /* u32 -> u64 tightening, it's always well-formed */
        new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value;
        new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value;
        reg->umin_value = max_t(u64, reg->umin_value, new_umin);
        reg->umax_value = min_t(u64, reg->umax_value, new_umax);
        /* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */
        new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value;
        new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value;
        reg->smin_value = max_t(s64, reg->smin_value, new_smin);
        reg->smax_value = min_t(s64, reg->smax_value, new_smax);

        /* if s32 can be treated as valid u32 range, we can use it as well */
        if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
                /* s32 -> u64 tightening */
                new_umin = (reg->umin_value & ~0xffffffffULL) | (u32)reg->s32_min_value;
                new_umax = (reg->umax_value & ~0xffffffffULL) | (u32)reg->s32_max_value;
                reg->umin_value = max_t(u64, reg->umin_value, new_umin);
                reg->umax_value = min_t(u64, reg->umax_value, new_umax);
                /* s32 -> s64 tightening */
                new_smin = (reg->smin_value & ~0xffffffffULL) | (u32)reg->s32_min_value;
                new_smax = (reg->smax_value & ~0xffffffffULL) | (u32)reg->s32_max_value;
                reg->smin_value = max_t(s64, reg->smin_value, new_smin);
                reg->smax_value = min_t(s64, reg->smax_value, new_smax);
        }
}

/* Run one round of bounds deduction on @reg: first among the 32-bit
 * subregister bounds, then among the 64-bit bounds, and finally from the
 * 32-bit bounds into the 64-bit ones.
 */
static void __reg_deduce_bounds(struct bpf_reg_state *reg)
{
        __reg32_deduce_bounds(reg);
        __reg64_deduce_bounds(reg);
        __reg_deduce_mixed_bounds(reg);
}

/* Attempts to improve var_off based on unsigned min/max information */
static void __reg_bound_offset(struct bpf_reg_state *reg)
{
        struct tnum var64_off = tnum_intersect(reg->var_off,
                                               tnum_range(reg->umin_value,
                                                          reg->umax_value));
        struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off),
                                               tnum_range(reg->u32_min_value,
                                                          reg->u32_max_value));

        reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}

/* Bring the range bounds and var_off of @reg back in agreement with each
 * other after either of them may have changed.
 */
static void reg_bounds_sync(struct bpf_reg_state *reg)
{
        int pass;

        /* var_off may imply new bounds */
        __update_reg_bounds(reg);

        /* the sign bit may now be known; deduction is run twice in a row */
        for (pass = 0; pass < 2; pass++)
                __reg_deduce_bounds(reg);

        /* the bounds may in turn pin down some bits */
        __reg_bound_offset(reg);

        /* Intersecting with the previous var_off can improve the bounds a
         * little further. E.g. with umax 0x7f...f and var_off (0; 0xf...fc)
         * the new var_off is (0; 0x7f...fc), which lowers umax.
         */
        __update_reg_bounds(reg);
}

static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
                                   struct bpf_reg_state *reg, const char *ctx)
{
        const char *msg;

        if (reg->umin_value > reg->umax_value ||
            reg->smin_value > reg->smax_value ||
            reg->u32_min_value > reg->u32_max_value ||
            reg->s32_min_value > reg->s32_max_value) {
                    msg = "range bounds violation";
                    goto out;
        }

        if (tnum_is_const(reg->var_off)) {
                u64 uval = reg->var_off.value;
                s64 sval = (s64)uval;

                if (reg->umin_value != uval || reg->umax_value != uval ||
                    reg->smin_value != sval || reg->smax_value != sval) {
                        msg = "const tnum out of sync with range bounds";
                        goto out;
                }
        }

        if (tnum_subreg_is_const(reg->var_off)) {
                u32 uval32 = tnum_subreg(reg->var_off).value;
                s32 sval32 = (s32)uval32;

                if (reg->u32_min_value != uval32 || reg->u32_max_value != uval32 ||
                    reg->s32_min_value != sval32 || reg->s32_max_value != sval32) {
                        msg = "const subreg tnum out of sync with range bounds";
                        goto out;
                }
        }

        return 0;
out:
        verbose(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] "
                "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n",
                ctx, msg, reg->umin_value, reg->umax_value,
                reg->smin_value, reg->smax_value,
                reg->u32_min_value, reg->u32_max_value,
                reg->s32_min_value, reg->s32_max_value,
                reg->var_off.value, reg->var_off.mask);
        if (env->test_reg_invariants)
                return -EFAULT;
        __mark_reg_unbounded(reg);
        return 0;
}

/* True when the 32-bit signed bound @a lies in [0, S32_MAX]. */
static bool __reg32_bound_s64(s32 a)
{
        if (a < 0)
                return false;
        return a <= S32_MAX;
}

static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
{
        reg->umin_value = reg->u32_min_value;
        reg->umax_value = reg->u32_max_value;

        /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must
         * be positive otherwise set to worse case bounds and refine later
         * from tnum.
         */
        if (__reg32_bound_s64(reg->s32_min_value) &&
            __reg32_bound_s64(reg->s32_max_value)) {
                reg->smin_value = reg->s32_min_value;
                reg->smax_value = reg->s32_max_value;
        } else {
                reg->smin_value = 0;
                reg->smax_value = U32_MAX;
        }
}

/* Mark a register as having a completely unknown (scalar) value. */
static void __mark_reg_unknown_imprecise(struct bpf_reg_state *reg)
{
        /*
         * Clear type, off, and union(map_ptr, range) and
         * padding between 'type' and union
         */
        memset(reg, 0, offsetof(struct bpf_reg_state, var_off));
        reg->type = SCALAR_VALUE;
        reg->id = 0;
        reg->ref_obj_id = 0;
        reg->var_off = tnum_unknown;
        reg->frameno = 0;
        reg->precise = false;
        __mark_reg_unbounded(reg);
}

/* Mark a register as having a completely unknown (scalar) value,
 * initialize .precise as true when not bpf capable.
 *
 * Same as __mark_reg_unknown_imprecise(), except that .precise is then set
 * to the negation of env->bpf_capable.
 */
static void __mark_reg_unknown(const struct bpf_verifier_env *env,
                               struct bpf_reg_state *reg)
{
        __mark_reg_unknown_imprecise(reg);
        reg->precise = !env->bpf_capable;
}

/* Mark register @regno of @regs as an unknown scalar.
 *
 * An out-of-range @regno triggers a WARN, is logged, and causes every register
 * below BPF_REG_FP to be marked as not initialized instead.
 */
static void mark_reg_unknown(struct bpf_verifier_env *env,
                             struct bpf_reg_state *regs, u32 regno)
{
        u32 i;

        if (!WARN_ON(regno >= MAX_BPF_REG)) {
                __mark_reg_unknown(env, &regs[regno]);
                return;
        }

        verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
        /* Something bad happened, let's kill all regs except FP */
        for (i = 0; i < BPF_REG_FP; i++)
                __mark_reg_not_init(env, &regs[i]);
}

/* Mark @reg as not initialized: it is first reset exactly as
 * __mark_reg_unknown() would, then its type is overridden with NOT_INIT.
 */
static void __mark_reg_not_init(const struct bpf_verifier_env *env,
                                struct bpf_reg_state *reg)
{
        __mark_reg_unknown(env, reg);
        reg->type = NOT_INIT;
}

/* Mark register @regno of @regs as not initialized.
 *
 * An out-of-range @regno triggers a WARN, is logged, and causes every register
 * below BPF_REG_FP to be marked as not initialized instead.
 */
static void mark_reg_not_init(struct bpf_verifier_env *env,
                              struct bpf_reg_state *regs, u32 regno)
{
        u32 i;

        if (!WARN_ON(regno >= MAX_BPF_REG)) {
                __mark_reg_not_init(env, &regs[regno]);
                return;
        }

        verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
        /* Something bad happened, let's kill all regs except FP */
        for (i = 0; i < BPF_REG_FP; i++)
                __mark_reg_not_init(env, &regs[i]);
}

/* Set up register @regno as the result of a BTF-typed load.
 *
 * For SCALAR_VALUE the register simply becomes an unknown scalar. Otherwise
 * it becomes a known-zero PTR_TO_BTF_ID carrying @flag, @btf and @btf_id, and
 * gets a fresh id when @flag says the pointer may be NULL.
 */
static void mark_btf_ld_reg(struct bpf_verifier_env *env,
                            struct bpf_reg_state *regs, u32 regno,
                            enum bpf_reg_type reg_type,
                            struct btf *btf, u32 btf_id,
                            enum bpf_type_flag flag)
{
        struct bpf_reg_state *dst;

        if (reg_type == SCALAR_VALUE) {
                mark_reg_unknown(env, regs, regno);
                return;
        }

        mark_reg_known_zero(env, regs, regno);

        dst = &regs[regno];
        dst->type = PTR_TO_BTF_ID | flag;
        dst->btf = btf;
        dst->btf_id = btf_id;
        if (type_may_be_null(flag))
                dst->id = ++env->id_gen;
}

#define DEF_NOT_SUBREG        (0)
/* Put every register of @state into its initial condition: all registers are
 * marked not initialized with no liveness, no parent and subreg_def set to
 * DEF_NOT_SUBREG; the frame pointer is then made a known-zero PTR_TO_STACK
 * belonging to this frame.
 */
static void init_reg_state(struct bpf_verifier_env *env,
                           struct bpf_func_state *state)
{
        struct bpf_reg_state *regs = state->regs;
        struct bpf_reg_state *fp = &regs[BPF_REG_FP];
        int regno;

        for (regno = 0; regno < MAX_BPF_REG; regno++) {
                struct bpf_reg_state *reg = &regs[regno];

                mark_reg_not_init(env, regs, regno);
                reg->live = REG_LIVE_NONE;
                reg->parent = NULL;
                reg->subreg_def = DEF_NOT_SUBREG;
        }

        /* frame pointer */
        fp->type = PTR_TO_STACK;
        mark_reg_known_zero(env, regs, BPF_REG_FP);
        fp->frameno = state->frameno;
}

/* Build a struct bpf_retval_range from @minval and @maxval, in that order. */
static struct bpf_retval_range retval_range(s32 minval, s32 maxval)
{
        struct bpf_retval_range range = { minval, maxval };

        return range;
}

/* Callsite value used for a frame that was not entered through a call insn. */
#define BPF_MAIN_FUNC (-1)
/* Initialize a function frame.
 *
 * Records @callsite, @frameno and @subprogno in @state, resets the expected
 * callback return range to [0, 0], initializes all registers, and marks the
 * verifier state as scratched.
 */
static void init_func_state(struct bpf_verifier_env *env,
                            struct bpf_func_state *state,
                            int callsite, int frameno, int subprogno)
{
        state->callsite = callsite;
        /* must be set before init_reg_state(), which copies it into FP */
        state->frameno = frameno;
        state->subprogno = subprogno;
        state->callback_ret_range = retval_range(0, 0);
        init_reg_state(env, state);
        mark_verifier_state_scratched(env);
}

/* Similar to push_stack(), but for async callbacks.
 *
 * Pushes a new element for @insn_idx / @prev_insn_idx onto env's stack and
 * gives it a brand new state consisting of a single frame for @subprog.
 * @is_sleepable is recorded in the new state.
 *
 * Returns the new verifier state, or NULL on failure. On failure the current
 * state is freed, env->cur_state is cleared and the whole stack is popped.
 */
static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
                                                int insn_idx, int prev_insn_idx,
                                                int subprog, bool is_sleepable)
{
        struct bpf_verifier_stack_elem *entry;
        struct bpf_func_state *root_frame;

        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                goto err;

        /* fill in the element and link it in as the new head */
        entry->insn_idx = insn_idx;
        entry->prev_insn_idx = prev_insn_idx;
        entry->log_pos = env->log.end_pos;
        entry->next = env->head;
        env->head = entry;

        env->stack_size += 1;
        if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
                verbose(env,
                        "The sequence of %d jumps is too complex for async cb.\n",
                        env->stack_size);
                goto err;
        }

        /* Unlike push_stack() there is no copy_verifier_state() here: the
         * caller's state is irrelevant. An async callback starts on a fresh
         * stack, so set the state up similar to do_check_common().
         */
        entry->st.branches = 1;
        entry->st.in_sleepable = is_sleepable;

        root_frame = kzalloc(sizeof(*root_frame), GFP_KERNEL);
        if (!root_frame)
                goto err;

        init_func_state(env, root_frame,
                        BPF_MAIN_FUNC /* callsite */,
                        0 /* frameno within this callchain */,
                        subprog /* subprog number within this prog */);
        entry->st.frame[0] = root_frame;

        return &entry->st;

err:
        free_verifier_state(env->cur_state, true);
        env->cur_state = NULL;
        /* pop all elements and return */
        while (!pop_stack(env, NULL, NULL, false))
                ;
        return NULL;
}


/* Describes how a register operand is used. */
enum reg_arg_type {
        SRC_OP,                /* register is used as source operand */
        DST_OP,                /* register is used as destination operand */
        DST_OP_NO_MARK        /* same as above, check only, don't mark */
};

static int cmp_subprogs(const void *a, const void *b)
{
        return ((struct bpf_subprog_info *)a)->start -
               ((struct bpf_subprog_info *)b)->start;
}

/* Binary-search env->subprog_info for a subprog via cmp_subprogs(), with
 * &@off passed as the search key.
 *
 * Returns the index of the matching entry, or -ENOENT if there is none.
 */
static int find_subprog(struct bpf_verifier_env *env, int off)
{
        struct bpf_subprog_info *hit;

        hit = bsearch(&off, env->subprog_info, env->subprog_cnt,
                      sizeof(env->subprog_info[0]), cmp_subprogs);

        return hit ? hit - env->subprog_info : -ENOENT;
}

/* Register a subprog starting at instruction @off, unless one is already
 * known there.
 *
 * Returns the index of an already known subprog, or subprog_cnt - 1 after
 * adding a new one. Fails with -EINVAL when @off is outside the program and
 * with -E2BIG when BPF_MAX_SUBPROGS entries already exist.
 */
static int add_subprog(struct bpf_verifier_env *env, int off)
{
        int insn_cnt = env->prog->len;
        int idx;

        if (off < 0 || off >= insn_cnt) {
                verbose(env, "call to invalid destination\n");
                return -EINVAL;
        }

        idx = find_subprog(env, off);
        if (idx >= 0)
                return idx;

        if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
                verbose(env, "too many subprograms\n");
                return -E2BIG;
        }

        /* determine subprog starts. The end is one before the next starts */
        env->subprog_info[env->subprog_cnt].start = off;
        env->subprog_cnt++;

        /* keep the array ordered by start so find_subprog() can bsearch it */
        sort(env->subprog_info, env->subprog_cnt,
             sizeof(env->subprog_info[0]), cmp_subprogs, NULL);

        return env->subprog_cnt - 1;
}

/* Find the instruction offset of the exception callback named by the
 * "exception_callback:" BTF decl tag on the main subprog.
 *
 * Returns 0 when the program has no func_info or no such tag, a positive
 * instruction offset when a callback was found, and a negative errno
 * otherwise (bad BTF id, several tags, callback missing from BTF or from
 * func_info, callback without global linkage, or callback at insn_off 0).
 */
static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env)
{
        struct bpf_prog_aux *aux = env->prog->aux;
        struct btf *btf = aux->btf;
        const struct btf_type *type;
        const char *cb_name;
        u32 main_id, cb_id;
        int insn_off, err, idx;

        /* Non-zero func_info_cnt implies valid btf */
        if (!aux->func_info_cnt)
                return 0;

        main_id = aux->func_info[0].type_id;
        type = btf_type_by_id(btf, main_id);
        if (!type) {
                verbose(env, "invalid btf id for main subprog in func_info\n");
                return -EINVAL;
        }

        cb_name = btf_find_decl_tag_value(btf, type, -1, "exception_callback:");
        if (IS_ERR(cb_name)) {
                err = PTR_ERR(cb_name);
                switch (err) {
                case -ENOENT:
                        /* If there is no tag present, there is no exception callback */
                        return 0;
                case -EEXIST:
                        verbose(env, "multiple exception callback tags for main subprog\n");
                        break;
                default:
                        break;
                }
                return err;
        }

        err = btf_find_by_name_kind(btf, cb_name, BTF_KIND_FUNC);
        if (err < 0) {
                verbose(env, "exception callback '%s' could not be found in BTF\n", cb_name);
                return err;
        }
        cb_id = err;

        type = btf_type_by_id(btf, cb_id);
        if (btf_func_linkage(type) != BTF_FUNC_GLOBAL) {
                verbose(env, "exception callback '%s' must have global linkage\n", cb_name);
                return -EINVAL;
        }

        insn_off = 0;
        for (idx = 0; idx < aux->func_info_cnt; idx++) {
                if (aux->func_info[idx].type_id == cb_id) {
                        /* Further func_info and subprog checks will also
                         * happen later, so assume this is the right insn_off
                         * for now.
                         */
                        insn_off = aux->func_info[idx].insn_off;
                        if (insn_off == 0) {
                                verbose(env, "invalid exception callback insn_off in func_info: 0\n");
                                insn_off = -EINVAL;
                        }
                }
        }

        if (insn_off == 0) {
                verbose(env, "exception callback type id not found in func_info\n");
                insn_off = -EINVAL;
        }
        return insn_off;
}

/* Capacity of the descs[] array in struct bpf_kfunc_desc_tab. */
#define MAX_KFUNC_DESCS 256
/* Capacity of the descs[] array in struct bpf_kfunc_btf_tab. */
#define MAX_KFUNC_BTFS        256

/* One kernel function (kfunc) called by a program, as recorded by
 * add_kfunc_call().
 */
struct bpf_kfunc_desc {
        /* filled in by btf_distill_func_proto() from the kfunc's prototype */
        struct btf_func_model func_model;
        /* BTF type id of the function */
        u32 func_id;
        /* func_id if the JIT supports far kfunc calls, else BPF_CALL_IMM(addr) */
        s32 imm;
        /* fd_array offset taken from the call insn */
        u16 offset;
        /* address obtained from kallsyms_lookup_name(), possibly adjusted
         * by specialize_kfunc()
         */
        unsigned long addr;
};

/* Cache entry tying an fd_array offset to the module BTF found there. The
 * btf and module held here are released by bpf_free_kfunc_btf_tab().
 */
struct bpf_kfunc_btf {
        struct btf *btf;
        struct module *module;
        /* index into fd_array that the BTF fd was read from */
        u16 offset;
};

/* Per-program table of kfunc descriptors, searched with bsearch(). */
struct bpf_kfunc_desc_tab {
        /* Sorted by func_id (BTF ID) and offset (fd_array offset) during
         * verification. JITs do lookups by bpf_insn, where func_id may not be
         * available, therefore at the end of verification do_misc_fixups()
         * sorts this by imm and offset.
         */
        struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
        /* number of entries in use at the front of descs[] */
        u32 nr_descs;
};

/* Per-program cache of module BTFs used by kfunc calls. Kept sorted by
 * offset (see kfunc_btf_cmp_by_off()) so that it can be searched with
 * bsearch().
 */
struct bpf_kfunc_btf_tab {
        struct bpf_kfunc_btf descs[MAX_KFUNC_BTFS];
        /* number of entries in use at the front of descs[] */
        u32 nr_descs;
};

/* Comparator over struct bpf_kfunc_desc: orders by func_id first and by
 * offset second.
 */
static int kfunc_desc_cmp_by_id_off(const void *a, const void *b)
{
        const struct bpf_kfunc_desc *lhs = a;
        const struct bpf_kfunc_desc *rhs = b;
        int by_id;

        /* func_id is not greater than BTF_MAX_TYPE */
        by_id = lhs->func_id - rhs->func_id;
        if (by_id)
                return by_id;

        return lhs->offset - rhs->offset;
}

/* Comparator over struct bpf_kfunc_btf: orders entries by offset. */
static int kfunc_btf_cmp_by_off(const void *a, const void *b)
{
        const struct bpf_kfunc_btf *lhs = a;
        const struct bpf_kfunc_btf *rhs = b;
        int delta = lhs->offset - rhs->offset;

        return delta;
}

/* Look up the kfunc descriptor for (@func_id, @offset) in @prog's kfunc table
 * using kfunc_desc_cmp_by_id_off(). Returns NULL when there is no match.
 * prog->aux->kfunc_tab is dereferenced without a NULL check.
 */
static const struct bpf_kfunc_desc *
find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
{
        struct bpf_kfunc_desc_tab *tab = prog->aux->kfunc_tab;
        struct bpf_kfunc_desc key = {
                .func_id = func_id,
                .offset = offset,
        };
        const struct bpf_kfunc_desc *hit;

        hit = bsearch(&key, tab->descs, tab->nr_descs,
                      sizeof(tab->descs[0]), kfunc_desc_cmp_by_id_off);
        return hit;
}

/* Store in *@func_addr the recorded address of the kfunc identified by
 * @func_id and @btf_fd_idx.
 *
 * Returns 0 on success, or -EFAULT (leaving *@func_addr untouched) when no
 * matching descriptor exists.
 */
int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
                       u16 btf_fd_idx, u8 **func_addr)
{
        const struct bpf_kfunc_desc *found;

        found = find_kfunc_desc(prog, func_id, btf_fd_idx);
        if (found) {
                *func_addr = (u8 *)found->addr;
                return 0;
        }

        return -EFAULT;
}

/* Look up, or load and cache, the module BTF referenced by a kfunc call.
 *
 * @offset is the index into env->fd_array of the slot holding the BTF fd.
 * Offsets already seen are served from prog->aux->kfunc_btf_tab, which is kept
 * sorted by offset so that bsearch() can be used. On a miss the fd is read
 * from fd_array, the BTF and its module are looked up, and the pair is added
 * to the table; both are released later by bpf_free_kfunc_btf_tab().
 *
 * Returns the BTF on success, or an ERR_PTR():
 *   -E2BIG   the table already holds MAX_KFUNC_BTFS entries
 *   -EPROTO  no fd_array was supplied
 *   -EFAULT  the fd could not be copied from fd_array
 *   -EINVAL  the fd does not refer to a module BTF
 *   -ENXIO   the owning module could not be obtained
 * or whatever error btf_get_by_fd() produced.
 */
static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
                                         s16 offset)
{
        struct bpf_kfunc_btf kf_btf = { .offset = offset };
        struct bpf_kfunc_btf_tab *tab;
        struct bpf_kfunc_btf *b;
        struct module *mod;
        struct btf *btf;
        int btf_fd;

        tab = env->prog->aux->kfunc_btf_tab;
        b = bsearch(&kf_btf, tab->descs, tab->nr_descs,
                    sizeof(tab->descs[0]), kfunc_btf_cmp_by_off);
        if (b)
                return b->btf;

        if (tab->nr_descs == MAX_KFUNC_BTFS) {
                verbose(env, "too many different module BTFs\n");
                return ERR_PTR(-E2BIG);
        }

        if (bpfptr_is_null(env->fd_array)) {
                verbose(env, "kfunc offset > 0 without fd_array is invalid\n");
                return ERR_PTR(-EPROTO);
        }

        if (copy_from_bpfptr_offset(&btf_fd, env->fd_array,
                                    offset * sizeof(btf_fd),
                                    sizeof(btf_fd)))
                return ERR_PTR(-EFAULT);

        btf = btf_get_by_fd(btf_fd);
        if (IS_ERR(btf)) {
                verbose(env, "invalid module BTF fd specified\n");
                return btf;
        }

        if (!btf_is_module(btf)) {
                verbose(env, "BTF fd for kfunc is not a module BTF\n");
                btf_put(btf);
                return ERR_PTR(-EINVAL);
        }

        mod = btf_try_get_module(btf);
        if (!mod) {
                btf_put(btf);
                return ERR_PTR(-ENXIO);
        }

        b = &tab->descs[tab->nr_descs++];
        b->btf = btf;
        b->module = mod;
        b->offset = offset;

        /* sort() rearranges the entries by value, so once it has run 'b' may
         * point at a different entry than the one just filled in. Return the
         * local 'btf' rather than b->btf to avoid handing back the BTF of
         * some other module.
         */
        sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
             kfunc_btf_cmp_by_off, NULL);

        return btf;
}

void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
{
        if (!tab)
                return;

        while (tab->nr_descs--) {
                module_put(tab->descs[tab->nr_descs].module);
                btf_put(tab->descs[tab->nr_descs].btf);
        }
        kfree(tab);
}

/* Resolve the BTF that describes a kfunc call.
 *
 * An @offset of zero selects btf_vmlinux (ERR_PTR(-ENOENT) if it is not
 * available). A positive @offset selects a module BTF through
 * __find_kfunc_desc_btf(). A negative @offset is rejected with -EINVAL.
 */
static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset)
{
        if (!offset)
                return btf_vmlinux ?: ERR_PTR(-ENOENT);

        if (offset < 0) {
                /* In the future, this can be allowed to increase limit
                 * of fd index into fd_array, interpreted as u16.
                 */
                verbose(env, "negative offset disallowed for kernel module function call\n");
                return ERR_PTR(-EINVAL);
        }

        return __find_kfunc_desc_btf(env, offset);
}

/* Record a kfunc call (@func_id, @offset) in the program's kfunc table.
 *
 * The first call for a program allocates the table after checking that kfunc
 * calls are usable at all (vmlinux BTF present, JIT requested and capable,
 * GPL-compatible program). A call with both @func_id and @offset zero is
 * accepted without being recorded. A (func_id, offset) pair that is already
 * in the table is not added again.
 *
 * For a new pair, the function and its prototype are validated in the
 * relevant BTF, its address is resolved through kallsyms, and a descriptor is
 * appended; the table is then re-sorted by func_id and offset.
 *
 * Returns 0 on success or a negative errno.
 */
static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
{
        const struct btf_type *func, *func_proto;
        struct bpf_kfunc_btf_tab *btf_tab;
        struct bpf_kfunc_desc_tab *tab;
        struct bpf_prog_aux *prog_aux;
        struct bpf_kfunc_desc *desc;
        const char *func_name;
        struct btf *desc_btf;
        unsigned long call_imm;
        unsigned long addr;
        int err;

        prog_aux = env->prog->aux;
        tab = prog_aux->kfunc_tab;
        btf_tab = prog_aux->kfunc_btf_tab;
        /* first kfunc call of this program: check prerequisites, allocate */
        if (!tab) {
                if (!btf_vmlinux) {
                        verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n");
                        return -ENOTSUPP;
                }

                if (!env->prog->jit_requested) {
                        verbose(env, "JIT is required for calling kernel function\n");
                        return -ENOTSUPP;
                }

                if (!bpf_jit_supports_kfunc_call()) {
                        verbose(env, "JIT does not support calling kernel function\n");
                        return -ENOTSUPP;
                }

                if (!env->prog->gpl_compatible) {
                        verbose(env, "cannot call kernel function from non-GPL compatible program\n");
                        return -EINVAL;
                }

                tab = kzalloc(sizeof(*tab), GFP_KERNEL);
                if (!tab)
                        return -ENOMEM;
                prog_aux->kfunc_tab = tab;
        }

        /* func_id == 0 is always invalid, but instead of returning an error, be
         * conservative and wait until the code elimination pass before returning
         * error, so that invalid calls that get pruned out can be in BPF programs
         * loaded from userspace.  It is also required that offset be untouched
         * for such calls.
         */
        if (!func_id && !offset)
                return 0;

        /* the module BTF cache is only needed for non-zero offsets */
        if (!btf_tab && offset) {
                btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL);
                if (!btf_tab)
                        return -ENOMEM;
                prog_aux->kfunc_btf_tab = btf_tab;
        }

        desc_btf = find_kfunc_desc_btf(env, offset);
        if (IS_ERR(desc_btf)) {
                verbose(env, "failed to find BTF for kernel function\n");
                return PTR_ERR(desc_btf);
        }

        /* already recorded by an earlier call insn */
        if (find_kfunc_desc(env->prog, func_id, offset))
                return 0;

        if (tab->nr_descs == MAX_KFUNC_DESCS) {
                verbose(env, "too many different kernel function calls\n");
                return -E2BIG;
        }

        func = btf_type_by_id(desc_btf, func_id);
        if (!func || !btf_type_is_func(func)) {
                verbose(env, "kernel btf_id %u is not a function\n",
                        func_id);
                return -EINVAL;
        }
        func_proto = btf_type_by_id(desc_btf, func->type);
        if (!func_proto || !btf_type_is_func_proto(func_proto)) {
                verbose(env, "kernel function btf_id %u does not have a valid func_proto\n",
                        func_id);
                return -EINVAL;
        }

        func_name = btf_name_by_offset(desc_btf, func->name_off);
        addr = kallsyms_lookup_name(func_name);
        if (!addr) {
                verbose(env, "cannot find address for kernel function %s\n",
                        func_name);
                return -EINVAL;
        }
        /* may replace addr; see specialize_kfunc() */
        specialize_kfunc(env, func_id, offset, &addr);

        if (bpf_jit_supports_far_kfunc_call()) {
                call_imm = func_id;
        } else {
                call_imm = BPF_CALL_IMM(addr);
                /* Check whether the relative offset overflows desc->imm */
                if ((unsigned long)(s32)call_imm != call_imm) {
                        verbose(env, "address of kernel function %s is out of range\n",
                                func_name);
                        return -EINVAL;
                }
        }

        if (bpf_dev_bound_kfunc_id(func_id)) {
                err = bpf_dev_bound_kfunc_check(&env->log, prog_aux);
                if (err)
                        return err;
        }

        /* append the new descriptor; the slot is claimed before the func
         * model is distilled
         */
        desc = &tab->descs[tab->nr_descs++];
        desc->func_id = func_id;
        desc->imm = call_imm;
        desc->offset = offset;
        desc->addr = addr;
        err = btf_distill_func_proto(&env->log, desc_btf,
                                     func_proto, func_name,
                                     &desc->func_model);
        /* restore the (func_id, offset) order find_kfunc_desc() relies on;
         * 'desc' must not be used past this point as entries get moved
         */
        if (!err)
                sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
                     kfunc_desc_cmp_by_id_off, NULL);
        return err;
}

/* Comparator over struct bpf_kfunc_desc: orders by imm first and by offset
 * second. Returns -1, 0 or 1.
 */
static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
{
        const struct bpf_kfunc_desc *lhs = a;
        const struct bpf_kfunc_desc *rhs = b;

        if (lhs->imm < rhs->imm)
                return -1;
        if (lhs->imm > rhs->imm)
                return 1;

        if (lhs->offset < rhs->offset)
                return -1;
        if (lhs->offset > rhs->offset)
                return 1;

        return 0;
}

/* Re-sort @prog's kfunc descriptor table by imm and offset, the order that
 * bpf_jit_find_kfunc_model() searches in. Does nothing when the program has
 * no kfunc table.
 */
static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog)
{
        struct bpf_kfunc_desc_tab *tab = prog->aux->kfunc_tab;

        if (tab)
                sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
                     kfunc_desc_cmp_by_imm_off, NULL);
}

/* Return true when @prog has a kfunc descriptor table attached. */
bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
{
        return prog->aux->kfunc_tab != NULL;
}

const struct btf_func_model *
bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
                         const struct bpf_insn *insn)
{
        const struct bpf_kfunc_desc desc = {
                .imm = insn->imm,
                .offset = insn->off,
        };
        const struct bpf_kfunc_desc *res;
        struct bpf_kfunc_desc_tab *tab;

        tab = prog->aux->kfunc_tab;
        res = bsearch(&desc, tab->descs, tab->nr_descs,
                      sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);

        return res ? &res->func_model : NULL;
}

/* Scan the whole program and register every subprogram start and every
 * kernel function call that it references.
 *
 * Subprog 0 (the entry function) is added first. Each pseudo func / pseudo
 * call instruction then contributes its target as a subprog start, and each
 * pseudo kfunc call is recorded via add_kfunc_call(). An exception callback,
 * if one is found, is added as a subprog and marked. Finally a sentinel
 * entry whose start equals the instruction count is written after the last
 * subprog.
 *
 * Returns 0 on success, -EPERM when such instructions are used without
 * env->bpf_capable, or the negative error from one of the helpers.
 */
static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
{
        struct bpf_subprog_info *subprog = env->subprog_info;
        struct bpf_insn *insns = env->prog->insnsi;
        int insn_cnt = env->prog->len;
        int idx, rc, ex_cb_insn;

        /* Add entry function. */
        rc = add_subprog(env, 0);
        if (rc)
                return rc;

        for (idx = 0; idx < insn_cnt; idx++) {
                struct bpf_insn *cur = &insns[idx];
                bool refs_subprog = bpf_pseudo_func(cur) || bpf_pseudo_call(cur);

                /* Only subprog references and kfunc calls are of interest. */
                if (!refs_subprog && !bpf_pseudo_kfunc_call(cur))
                        continue;

                if (!env->bpf_capable) {
                        verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
                        return -EPERM;
                }

                if (refs_subprog)
                        rc = add_subprog(env, idx + cur->imm + 1);
                else
                        rc = add_kfunc_call(env, cur->imm, cur->off);

                if (rc < 0)
                        return rc;
        }

        rc = bpf_find_exception_callback_insn_off(env);
        if (rc < 0)
                return rc;
        ex_cb_insn = rc;

        /* A non-zero ex_cb_insn means the main program has a subprog marked
         * using BTF decl tag to serve as the exception callback.
         */
        if (ex_cb_insn) {
                rc = add_subprog(env, ex_cb_insn);
                if (rc < 0)
                        return rc;
                /* Find the subprog that starts at the callback and mark it. */
                for (idx = 1; idx < env->subprog_cnt; idx++) {
                        if (env->subprog_info[idx].start == ex_cb_insn) {
                                env->exception_callback_subprog = idx;
                                mark_subprog_exc_cb(env, idx);
                                break;
                        }
                }
        }

        /* Append a fake 'exit' subprog so that iteration over subprogs can
         * always read the start of the "next" one. 'subprog_cnt' is
         * deliberately not increased.
         */
        subprog[env->subprog_cnt].start = insn_cnt;

        if (env->log.level & BPF_LOG_LEVEL2) {
                for (idx = 0; idx < env->subprog_cnt; idx++)
                        verbose(env, "func#%d @%d\n", idx, subprog[idx].start);
        }

        return 0;
}

/* Single pass over all instructions that
 *  - records per-subprog has_tail_call / has_ld_abs flags,
 *  - rejects any jump whose target lies outside the subprog containing it,
 *  - rejects a subprog whose last instruction is neither an exit nor an
 *    unconditional jump.
 *
 * Subprog boundaries are taken from env->subprog_info[], including the
 * sentinel entry written by add_subprog_and_kfunc() after the last subprog.
 *
 * Returns 0 on success or -EINVAL after logging the offending condition.
 */
static int check_subprogs(struct bpf_verifier_env *env)
{
        int i, subprog_start, subprog_end, off, cur_subprog = 0;
        struct bpf_subprog_info *subprog = env->subprog_info;
        struct bpf_insn *insn = env->prog->insnsi;
        int insn_cnt = env->prog->len;

        /* now check that all jumps are within the same subprog */
        subprog_start = subprog[cur_subprog].start;
        subprog_end = subprog[cur_subprog + 1].start;
        for (i = 0; i < insn_cnt; i++) {
                u8 code = insn[i].code;

                /* helper call (src_reg == 0) to bpf_tail_call */
                if (code == (BPF_JMP | BPF_CALL) &&
                    insn[i].src_reg == 0 &&
                    insn[i].imm == BPF_FUNC_tail_call)
                        subprog[cur_subprog].has_tail_call = true;
                if (BPF_CLASS(code) == BPF_LD &&
                    (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
                        subprog[cur_subprog].has_ld_abs = true;
                /* only jump-class insns have a target to validate */
                if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
                        goto next;
                /* exit and call have no in-subprog jump target */
                if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
                        goto next;
                /* JMP32|JA keeps its displacement in imm, others in off */
                if (code == (BPF_JMP32 | BPF_JA))
                        off = i + insn[i].imm + 1;
                else
                        off = i + insn[i].off + 1;
                if (off < subprog_start || off >= subprog_end) {
                        verbose(env, "jump out of range from insn %d to %d\n", i, off);
                        return -EINVAL;
                }
next:
                if (i == subprog_end - 1) {
                        /* to avoid fall-through from one subprog into another
                         * the last insn of the subprog should be either exit
                         * or unconditional jump back or bpf_throw call
                         *
                         * NOTE(review): the check below only accepts exit and
                         * the two JA encodings; no bpf_throw case is visible
                         * here - confirm whether the comment is stale.
                         */
                        if (code != (BPF_JMP | BPF_EXIT) &&
                            code != (BPF_JMP32 | BPF_JA) &&
                            code != (BPF_JMP | BPF_JA)) {
                                verbose(env, "last insn is not an exit or jmp\n");
                                return -EINVAL;
                        }
                        /* advance to the next subprog; the guard avoids
                         * reading past the sentinel entry
                         */
                        subprog_start = subprog_end;
                        cur_subprog++;
                        if (cur_subprog < env->subprog_cnt)
                                subprog_end = subprog[cur_subprog + 1].start;
                }
        }
        return 0;
}

/* Parentage chain of this register (or stack slot) should take care of all
 * issues like callee-saved registers, stack slot allocation time, etc.
 *
 * Propagate a read mark up the parent chain.
 *
 * @state:  register (or spilled stack slot) state that is being read
 * @parent: first ancestor to mark
 * @flag:   read mark to set; the callers in this file pass REG_LIVE_READ32
 *          or REG_LIVE_READ64
 *
 * The walk stops at the end of the chain, at a state that carries
 * REG_LIVE_WRITTEN (the write screens the read from older states), or at a
 * parent that already carries an equal or stronger read mark.
 *
 * Returns 0, or -EFAULT if a parent marked REG_LIVE_DONE is reached.
 */
static int mark_reg_read(struct bpf_verifier_env *env,
                         const struct bpf_reg_state *state,
                         struct bpf_reg_state *parent, u8 flag)
{
        /* write marks of @state itself are honoured only when @parent is
         * its direct parent; after the first step they always are
         */
        bool writes = parent == state->parent; /* Observe write marks */
        int cnt = 0;

        while (parent) {
                /* if read wasn't screened by an earlier write ... */
                if (writes && state->live & REG_LIVE_WRITTEN)
                        break;
                if (parent->live & REG_LIVE_DONE) {
                        verbose(env, "verifier BUG type %s var_off %lld off %d\n",
                                reg_type_str(env, parent->type),
                                parent->var_off.value, parent->off);
                        return -EFAULT;
                }
                /* The first condition is more likely to be true than the
                 * second, checked it first.
                 */
                if ((parent->live & REG_LIVE_READ) == flag ||
                    parent->live & REG_LIVE_READ64)
                        /* The parentage chain never changes and
                         * this parent was already marked as LIVE_READ.
                         * There is no need to keep walking the chain again and
                         * keep re-marking all parents as LIVE_READ.
                         * This case happens when the same register is read
                         * multiple times without writes into it in-between.
                         * Also, if parent has the stronger REG_LIVE_READ64 set,
                         * then no need to set the weak REG_LIVE_READ32.
                         */
                        break;
                /* ... then we depend on parent's value */
                parent->live |= flag;
                /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
                if (flag == REG_LIVE_READ64)
                        parent->live &= ~REG_LIVE_READ32;
                /* step one level up the chain */
                state = parent;
                parent = state->parent;
                writes = true;
                cnt++;
        }

        /* remember the longest chain walked so far */
        if (env->longest_mark_read_walk < cnt)
                env->longest_mark_read_walk = cnt;
        return 0;
}

/* Mark both stack slots of the dynptr referenced by @reg as read (64-bit).
 *
 * Returns 0 on success, the negative value from dynptr_get_spi(), or the
 * error from mark_reg_read().
 */
static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
        struct bpf_func_state *state = func(env, reg);
        struct bpf_reg_state *slot;
        int spi, err;

        /* For CONST_PTR_TO_DYNPTR the read mark must have already been set
         * by check_reg_arg in check_helper_call and mark_btf_func_reg_size
         * in check_kfunc_call.
         */
        if (reg->type == CONST_PTR_TO_DYNPTR)
                return 0;

        spi = dynptr_get_spi(env, reg);
        if (spi < 0)
                return spi;

        /* Caller ensures dynptr is valid and initialized, which means spi is
         * in bounds and spi is the first dynptr slot. Simply mark the two
         * stack slots (spi and spi - 1) as read.
         */
        slot = &state->stack[spi].spilled_ptr;
        err = mark_reg_read(env, slot, slot->parent, REG_LIVE_READ64);
        if (err)
                return err;

        slot = &state->stack[spi - 1].spilled_ptr;
        return mark_reg_read(env, slot, slot->parent, REG_LIVE_READ64);
}

/* Mark @nr_slots stack slots, starting at @spi and going down to
 * @spi - @nr_slots + 1, as read (64-bit) and as scratched.
 *
 * Returns 0 on success or the first error from mark_reg_read().
 */
static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
                          int spi, int nr_slots)
{
        struct bpf_func_state *state = func(env, reg);
        struct bpf_reg_state *slot_reg;
        int i, slot, err;

        for (i = 0; i < nr_slots; i++) {
                slot = spi - i;
                slot_reg = &state->stack[slot].spilled_ptr;

                err = mark_reg_read(env, slot_reg, slot_reg->parent, REG_LIVE_READ64);
                if (err)
                        return err;

                mark_stack_slot_scratched(env, slot);
        }

        return 0;
}

/* This function is supposed to be used by the following 32-bit optimization
 * code only. It returns TRUE if the source or destination register operates
 * on 64-bit, otherwise return FALSE.
 *
 * @insn:  instruction the operand belongs to
 * @regno: register number of the operand (only consulted for BPF_LD)
 * @reg:   register state of the operand; only dereferenced for BPF_STX when
 *         @t is SRC_OP, so callers asking about a destination may pass NULL
 * @t:     whether the operand is read (SRC_OP) or written
 *
 * @env is not used by the body.
 */
static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
                     u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
{
        u8 code, class, op;

        code = insn->code;
        class = BPF_CLASS(code);
        op = BPF_OP(code);
        if (class == BPF_JMP) {
                /* BPF_EXIT for "main" will reach here. Return TRUE
                 * conservatively.
                 */
                if (op == BPF_EXIT)
                        return true;
                if (op == BPF_CALL) {
                        /* BPF to BPF call will reach here because of marking
                         * caller saved clobber with DST_OP_NO_MARK for which we
                         * don't care the register def because they are anyway
                         * marked as NOT_INIT already.
                         */
                        if (insn->src_reg == BPF_PSEUDO_CALL)
                                return false;
                        /* Helper call will reach here because of arg type
                         * check, conservatively return TRUE.
                         */
                        if (t == SRC_OP)
                                return true;

                        return false;
                }
        }

        /* ALU64 BPF_END with a 16- or 32-bit width is not a 64-bit operation */
        if (class == BPF_ALU64 && op == BPF_END && (insn->imm == 16 || insn->imm == 32))
                return false;

        /* remaining ALU64 and JMP insns, plus ALU BPF_END with width 64 */
        if (class == BPF_ALU64 || class == BPF_JMP ||
            (class == BPF_ALU && op == BPF_END && insn->imm == 64))
                return true;

        if (class == BPF_ALU || class == BPF_JMP32)
                return false;

        if (class == BPF_LDX) {
                /* destination is 64-bit for a DW load or a sign-extending load */
                if (t != SRC_OP)
                        return BPF_SIZE(code) == BPF_DW || BPF_MODE(code) == BPF_MEMSX;
                /* LDX source must be ptr. */
                return true;
        }

        if (class == BPF_STX) {
                /* BPF_STX (including atomic variants) has multiple source
                 * operands, one of which is a ptr. Check whether the caller is
                 * asking about it.
                 */
                if (t == SRC_OP && reg->type != SCALAR_VALUE)
                        return true;
                return BPF_SIZE(code) == BPF_DW;
        }

        if (class == BPF_LD) {
                u8 mode = BPF_MODE(code);

                /* LD_IMM64 */
                if (mode == BPF_IMM)
                        return true;

                /* Both LD_IND and LD_ABS return 32-bit data. */
                if (t != SRC_OP)
                        return  false;

                /* Implicit ctx ptr. */
                if (regno == BPF_REG_6)
                        return true;

                /* Explicit source could be any width. */
                return true;
        }

        if (class == BPF_ST)
                /* The only source register for BPF_ST is a ptr. */
                return true;

        /* Conservatively return true at default. */
        return true;
}

/* Return the regno defined by the insn, or -1. */
static int insn_def_regno(const struct bpf_insn *insn)
{
        switch (BPF_CLASS(insn->code)) {
        case BPF_JMP:
        case BPF_JMP32:
        case BPF_ST:
                return -1;
        case BPF_STX:
                if (BPF_MODE(insn->code) == BPF_ATOMIC &&
                    (insn->imm & BPF_FETCH)) {
                        if (insn->imm == BPF_CMPXCHG)
                                return BPF_REG_0;
                        else
                                return insn->src_reg;
                } else {
                        return -1;
                }
        default:
                return insn->dst_reg;
        }
}

/* Return TRUE if INSN has defined any 32-bit value explicitly, i.e. it
 * defines a register and that definition is not a 64-bit one.
 */
static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
        int def_reg = insn_def_regno(insn);

        /* No register state is passed: is_reg64() does not need one for a
         * DST_OP query.
         */
        return def_reg != -1 && !is_reg64(env, insn, def_reg, NULL, DST_OP);
}

/* If @reg currently holds a sub-register definition, flag the defining
 * instruction as needing zero extension and clear the sub-register marker.
 * subreg_def holds the defining instruction index plus one, hence the "- 1".
 */
static void mark_insn_zext(struct bpf_verifier_env *env,
                           struct bpf_reg_state *reg)
{
        s32 def_idx = reg->subreg_def;

        if (def_idx != DEF_NOT_SUBREG) {
                env->insn_aux_data[def_idx - 1].zext_dst = true;
                /* The dst will be zero extended, so it is no longer a
                 * sub-register.
                 */
                reg->subreg_def = DEF_NOT_SUBREG;
        }
}

/* Validate one register operand of the instruction at env->insn_idx and
 * update liveness and sub-register tracking for it.
 *
 * @regs:  register array the operand lives in
 * @regno: register number; must be below MAX_BPF_REG
 * @t:     SRC_OP for a read; any other value is handled as a write, and
 *         only DST_OP additionally marks the register as unknown
 *
 * Returns 0 on success, -EINVAL for an invalid register number, -EACCES
 * when reading an uninitialized register or writing the frame pointer, or
 * the error returned by mark_reg_read().
 */
static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno,
                           enum reg_arg_type t)
{
        struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
        struct bpf_reg_state *reg;
        bool rw64;

        if (regno >= MAX_BPF_REG) {
                verbose(env, "R%d is invalid\n", regno);
                return -EINVAL;
        }

        mark_reg_scratched(env, regno);

        reg = &regs[regno];
        /* does this operand read/write the full 64 bits? */
        rw64 = is_reg64(env, insn, regno, reg, t);
        if (t == SRC_OP) {
                /* check whether register used as source operand can be read */
                if (reg->type == NOT_INIT) {
                        verbose(env, "R%d !read_ok\n", regno);
                        return -EACCES;
                }
                /* We don't need to worry about FP liveness because it's read-only */
                if (regno == BPF_REG_FP)
                        return 0;

                /* a 64-bit read of a sub-register definition requires the
                 * defining insn to zero-extend
                 */
                if (rw64)
                        mark_insn_zext(env, reg);

                return mark_reg_read(env, reg, reg->parent,
                                     rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
        } else {
                /* check whether register used as dest operand can be written to */
                if (regno == BPF_REG_FP) {
                        verbose(env, "frame pointer is read only\n");
                        return -EACCES;
                }
                reg->live |= REG_LIVE_WRITTEN;
                /* a 32-bit definition records insn_idx + 1 as its definer */
                reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1;
                if (t == DST_OP)
                        mark_reg_unknown(env, regs, regno);
        }
        return 0;
}

/* Convenience wrapper around __check_reg_arg() that operates on the
 * registers of the current frame of the current verifier state.
 */
static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
                         enum reg_arg_type t)
{
        struct bpf_verifier_state *cur = env->cur_state;
        struct bpf_func_state *frame = cur->frame[cur->curframe];

        return __check_reg_arg(env, frame->regs, regno, t);
}

/* Pack a stack access record into instruction flags: the
 * INSN_F_STACK_ACCESS marker, @spi shifted by INSN_F_SPI_SHIFT, and
 * @frameno in the low bits.
 */
static int insn_stack_access_flags(int frameno, int spi)
{
        int flags = INSN_F_STACK_ACCESS;

        flags |= spi << INSN_F_SPI_SHIFT;
        flags |= frameno;

        return flags;
}

/* Extract the stack slot index packed by insn_stack_access_flags(). */
static int insn_stack_access_spi(int insn_flags)
{
        int shifted = insn_flags >> INSN_F_SPI_SHIFT;

        return shifted & INSN_F_SPI_MASK;
}

/* Extract the frame number packed by insn_stack_access_flags(). */
static int insn_stack_access_frameno(int insn_flags)
{
        int frameno = insn_flags & INSN_F_FRAMENO_MASK;

        return frameno;
}

/* Flag instruction @idx as a jump point in its per-instruction aux data. */
static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
{
        env->insn_aux_data[idx].jmp_point = true;
}

/* Return whether instruction @insn_idx was flagged by mark_jmp_point(). */
static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
{
        return env->insn_aux_data[insn_idx].jmp_point;
}

/* for any branch, call, exit record the history of jmps in the given state
 *
 * @cur:        state whose jmp_history array is extended
 * @insn_flags: flags to store with (or merge into) the entry
 *
 * If an entry was already recorded for the current instruction
 * (env->cur_hist_ent is set), @insn_flags is OR-ed into it and nothing is
 * allocated. Otherwise the array grows by one entry describing
 * env->insn_idx / env->prev_insn_idx, and env->cur_hist_ent is pointed at it.
 *
 * Returns 0 on success or -ENOMEM when the array cannot be grown; in that
 * case cur->jmp_history and cur->jmp_history_cnt are left unchanged.
 */
static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
                            int insn_flags)
{
        u32 cnt = cur->jmp_history_cnt;
        struct bpf_jmp_history_entry *p;
        size_t alloc_size;

        /* combine instruction flags if we already recorded this instruction */
        if (env->cur_hist_ent) {
                /* atomic instructions push insn_flags twice, for READ and
                 * WRITE sides, but they should agree on stack slot
                 */
                WARN_ONCE((env->cur_hist_ent->flags & insn_flags) &&
                          (env->cur_hist_ent->flags & insn_flags) != insn_flags,
                          "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n",
                          env->insn_idx, env->cur_hist_ent->flags, insn_flags);
                env->cur_hist_ent->flags |= insn_flags;
                return 0;
        }

        cnt++;
        /* size_mul() guards the multiplication; the size is rounded up to
         * what the allocator would hand out anyway
         */
        alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
        p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
        if (!p)
                return -ENOMEM;
        /* only overwrite the stored pointer once krealloc() has succeeded */
        cur->jmp_history = p;

        /* fill in the newly added last entry */
        p = &cur->jmp_history[cnt - 1];
        p->idx = env->insn_idx;
        p->prev_idx = env->prev_insn_idx;
        p->flags = insn_flags;
        cur->jmp_history_cnt = cnt;
        env->cur_hist_ent = p;

        return 0;
}

/* Return the history entry just below @hist_end if it was recorded for
 * instruction @insn_idx, otherwise NULL (also when @hist_end is 0).
 */
static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
                                                        u32 hist_end, int insn_idx)
{
        struct bpf_jmp_history_entry *last;

        if (hist_end == 0)
                return NULL;

        last = &st->jmp_history[hist_end - 1];
        return last->idx == insn_idx ? last : NULL;
}

/* Backtrack one insn at a time. If idx is not at the top of recorded
 * history then previous instruction came from straight line execution.
 * Return -ENOENT if we exhausted all instructions within given state.
 *
 * It's legal to have a bit of a looping with the same starting and ending
 * insn index within the same state, e.g.: 3->4->5->3, so just because current
 * instruction index is the same as state's first_idx doesn't mean we are
 * done. If there is still some jump history left, we should keep going. We
 * need to take into account that we might have a jump history between given
 * state's parent and itself, due to checkpointing. In this case, we'll have
 * history entry recording a jump from last instruction of parent state and
 * first instruction of given state.
 *
 * @st:      state whose jump history is consulted
 * @i:       index of the instruction currently being processed
 * @history: in/out count of history entries not yet consumed; decremented
 *           when the top entry is used
 *
 * Returns the index of the previous instruction, or -ENOENT.
 */
static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
                             u32 *history)
{
        u32 cnt = *history;

        if (i == st->first_insn_idx) {
                /* at the first insn with no history left: done */
                if (cnt == 0)
                        return -ENOENT;
                /* the only remaining entry is the one for this first insn */
                if (cnt == 1 && st->jmp_history[0].idx == i)
                        return -ENOENT;
        }

        if (cnt && st->jmp_history[cnt - 1].idx == i) {
                /* top history entry is for this insn: follow it back */
                i = st->jmp_history[cnt - 1].prev_idx;
                (*history)--;
        } else {
                /* straight-line execution: previous insn is i - 1 */
                i--;
        }
        return i;
}

/* Disassembler callback resolving the name of the kfunc called by @insn.
 *
 * @data is passed straight to find_kfunc_desc_btf(). Returns NULL when
 * @insn is not a kfunc call, the string "<error>" when the BTF lookup
 * fails, and otherwise the name found for BTF type id insn->imm.
 */
static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
{
        const struct btf_type *kfunc_type;
        struct btf *kfunc_btf;

        if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
                kfunc_btf = find_kfunc_desc_btf(data, insn->off);
                if (IS_ERR(kfunc_btf))
                        return "<error>";

                kfunc_type = btf_type_by_id(kfunc_btf, insn->imm);
                return btf_name_by_offset(kfunc_btf, kfunc_type->name_off);
        }

        return NULL;
}

/* Set the frame the backtracking state currently refers to. No other field
 * of @bt is touched.
 */
static inline void bt_init(struct backtrack_state *bt, u32 frame)
{
        bt->frame = frame;
}

/* Zero the whole backtracking state while keeping its env pointer. */
static inline void bt_reset(struct backtrack_state *bt)
{
        struct bpf_verifier_env *saved_env;

        saved_env = bt->env;
        memset(bt, 0, sizeof(*bt));
        bt->env = saved_env;
}

/* Return 1 when no register and no stack slot is set in any frame from 0
 * up to and including bt->frame, 0 otherwise.
 */
static inline u32 bt_empty(struct backtrack_state *bt)
{
        u32 fr;

        for (fr = 0; fr <= bt->frame; fr++) {
                if (bt->reg_masks[fr] || bt->stack_masks[fr])
                        return 0;
        }

        return 1;
}

/* Move the backtracking state one frame deeper.
 * Returns 0, or -EFAULT (after logging and warning) when the state is
 * already at the last frame, MAX_CALL_FRAMES - 1.
 */
static inline int bt_subprog_enter(struct backtrack_state *bt)
{
        if (bt->frame != MAX_CALL_FRAMES - 1) {
                bt->frame++;
                return 0;
        }

        verbose(bt->env, "BUG subprog enter from frame %d\n", bt->frame);
        WARN_ONCE(1, "verifier backtracking bug");
        return -EFAULT;
}

/* Move the backtracking state one frame up.
 * Returns 0, or -EFAULT (after logging and warning) when the state is
 * already at frame 0.
 */
static inline int bt_subprog_exit(struct backtrack_state *bt)
{
        if (bt->frame != 0) {
                bt->frame--;
                return 0;
        }

        verbose(bt->env, "BUG subprog exit from frame 0\n");
        WARN_ONCE(1, "verifier backtracking bug");
        return -EFAULT;
}

/* Set the bit for register @reg in the register mask of frame @frame. */
static inline void bt_set_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
{
        u32 bit = 1 << reg;

        bt->reg_masks[frame] |= bit;
}

/* Clear the bit for register @reg in the register mask of frame @frame. */
static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
{
        u32 bit = 1 << reg;

        bt->reg_masks[frame] &= ~bit;
}

/* Set the bit for register @reg in the mask of the current frame. */
static inline void bt_set_reg(struct backtrack_state *bt, u32 reg)
{
        bt->reg_masks[bt->frame] |= 1 << reg;
}

/* Clear the bit for register @reg in the mask of the current frame. */
static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg)
{
        bt->reg_masks[bt->frame] &= ~(1 << reg);
}

/* Set the bit for stack slot @slot in the stack mask of frame @frame. */
static inline void bt_set_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
{
        u64 bit = 1ull << slot;

        bt->stack_masks[frame] |= bit;
}

/* Clear the bit for stack slot @slot in the stack mask of frame @frame. */
static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
{
        u64 bit = 1ull << slot;

        bt->stack_masks[frame] &= ~bit;
}

/* Return the register bitmask stored for frame @frame. */
static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame)
{
        return bt->reg_masks[frame];
}

/* Return the register bitmask of the current frame. */
static inline u32 bt_reg_mask(struct backtrack_state *bt)
{
        return bt_frame_reg_mask(bt, bt->frame);
}

/* Return the stack slot bitmask stored for frame @frame. */
static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame)
{
        return bt->stack_masks[frame];
}

/* Return the stack slot bitmask of the current frame. */
static inline u64 bt_stack_mask(struct backtrack_state *bt)
{
        return bt_frame_stack_mask(bt, bt->frame);
}

/* Return true when register @reg is set in the current frame's mask. */
static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
{
        return (bt_reg_mask(bt) & (1 << reg)) != 0;
}

/* Return true when stack slot @slot is set in the mask of frame @frame. */
static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
{
        return (bt_frame_stack_mask(bt, frame) & (1ull << slot)) != 0;
}

/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask
 *
 * Writes a comma-separated list into @buf (at most @buf_sz bytes) and stops
 * as soon as the remaining space goes negative. @buf always starts out as
 * an empty string, so an all-zero mask yields "".
 */
static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask)
{
        DECLARE_BITMAP(mask, 64);
        const char *sep = "";
        int bit, len;

        buf[0] = '\0';

        bitmap_from_u64(mask, reg_mask);
        for_each_set_bit(bit, mask, 32) {
                len = snprintf(buf, buf_sz, "%sr%d", sep, bit);
                /* every entry after the first is preceded by a comma */
                sep = ",";
                buf += len;
                buf_sz -= len;
                if (buf_sz < 0)
                        break;
        }
}
/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask
 *
 * Bit i is printed as the offset -(i + 1) * 8. Writes a comma-separated
 * list into @buf (at most @buf_sz bytes) and stops as soon as the remaining
 * space goes negative. @buf always starts out as an empty string.
 */
static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
{
        DECLARE_BITMAP(mask, 64);
        const char *sep = "";
        int bit, len;

        buf[0] = '\0';

        bitmap_from_u64(mask, stack_mask);
        for_each_set_bit(bit, mask, 64) {
                len = snprintf(buf, buf_sz, "%s%d", sep, -(bit + 1) * 8);
                /* every entry after the first is preceded by a comma */
                sep = ",";
                buf += len;
                buf_sz -= len;
                if (buf_sz < 0)
                        break;
        }
}

/* Forward declaration; the definition is not visible at this point of the file. */
static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);

/* For given verifier state backtrack_insn() is called from the last insn to
 * the first insn. Its purpose is to compute a bitmask of registers and
 * stack slots that needs precision in the parent verifier state.
 *
 * @idx is an index of the instruction we are currently processing;
 * @subseq_idx is an index of the subsequent instruction that:
 *   - *would be* executed next, if jump history is viewed in forward order;
 *   - *was* processed previously during backtracking.
 * @hist is the jump history entry consulted for this instruction's stack
 *   access information (INSN_F_STACK_ACCESS flag, slot and frame number);
 *   it may be NULL, in which case loads/stores are treated as non-stack.
 * @bt holds the per-frame register and stack slot masks that are updated.
 *
 * Returns 0 on success, -ENOTSUPP when the instruction cannot be backtracked
 * through (store via a tracked scalar dst_reg, ld_abs/ld_ind, kfunc call with
 * imm == 0), or -EFAULT when an internal consistency check fails.
 */
static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
                          struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
{
        /* callbacks used by print_bpf_insn() when logging the instruction */
        const struct bpf_insn_cbs cbs = {
                .cb_call        = disasm_kfunc_name,
                .cb_print        = verbose,
                .private_data        = env,
        };
        struct bpf_insn *insn = env->prog->insnsi + idx;
        u8 class = BPF_CLASS(insn->code);
        u8 opcode = BPF_OP(insn->code);
        u8 mode = BPF_MODE(insn->code);
        u32 dreg = insn->dst_reg;
        u32 sreg = insn->src_reg;
        u32 spi, i, fr;

        /* nothing to backtrack through for an all-zero opcode
         * NOTE(review): presumably the second slot of ld_imm64 - confirm
         */
        if (insn->code == 0)
                return 0;
        /* at log level 2 dump the masks tracked so far and the insn itself */
        if (env->log.level & BPF_LOG_LEVEL2) {
                fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt));
                verbose(env, "mark_precise: frame%d: regs=%s ",
                        bt->frame, env->tmp_str_buf);
                fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
                verbose(env, "stack=%s before ", env->tmp_str_buf);
                verbose(env, "%d: ", idx);
                print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
        }

        if (class == BPF_ALU || class == BPF_ALU64) {
                /* dreg is not being tracked, this insn is irrelevant */
                if (!bt_is_reg_set(bt, dreg))
                        return 0;
                if (opcode == BPF_END || opcode == BPF_NEG) {
                        /* sreg is reserved and unused
                         * dreg still needs precision before this insn
                         */
                        return 0;
                } else if (opcode == BPF_MOV) {
                        if (BPF_SRC(insn->code) == BPF_X) {
                                /* dreg = sreg or dreg = (s8, s16, s32)sreg
                                 * dreg needs precision after this insn
                                 * sreg needs precision before this insn
                                 */
                                bt_clear_reg(bt, dreg);
                                /* the frame pointer is never added to the mask */
                                if (sreg != BPF_REG_FP)
                                        bt_set_reg(bt, sreg);
                        } else {
                                /* dreg = K
                                 * dreg needs precision after this insn.
                                 * Corresponding register is already marked
                                 * as precise=true in this verifier state.
                                 * No further markings in parent are necessary
                                 */
                                bt_clear_reg(bt, dreg);
                        }
                } else {
                        if (BPF_SRC(insn->code) == BPF_X) {
                                /* dreg += sreg
                                 * both dreg and sreg need precision
                                 * before this insn
                                 */
                                if (sreg != BPF_REG_FP)
                                        bt_set_reg(bt, sreg);
                        } /* else dreg += K
                           * dreg still needs precision before this insn
                           */
                }
        } else if (class == BPF_LDX) {
                /* dreg is not being tracked, this insn is irrelevant */
                if (!bt_is_reg_set(bt, dreg))
                        return 0;
                bt_clear_reg(bt, dreg);

                /* scalars can only be spilled into stack w/o losing precision.
                 * Load from any other memory can be zero extended.
                 * The desire to keep that precision is already indicated
                 * by 'precise' mark in corresponding register of this state.
                 * No further tracking necessary.
                 */
                if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
                        return 0;
                /* dreg = *(u64 *)[fp - off] was a fill from the stack.
                 * that [fp - off] slot contains scalar that needs to be
                 * tracked with precision
                 */
                spi = insn_stack_access_spi(hist->flags);
                fr = insn_stack_access_frameno(hist->flags);
                bt_set_frame_slot(bt, fr, spi);
        } else if (class == BPF_STX || class == BPF_ST) {
                if (bt_is_reg_set(bt, dreg))
                        /* stx & st shouldn't be using _scalar_ dst_reg
                         * to access memory. It means backtracking
                         * encountered a case of pointer subtraction.
                         */
                        return -ENOTSUPP;
                /* scalars can only be spilled into stack */
                if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
                        return 0;
                spi = insn_stack_access_spi(hist->flags);
                fr = insn_stack_access_frameno(hist->flags);
                /* the written slot is not one we are tracking */
                if (!bt_is_frame_slot_set(bt, fr, spi))
                        return 0;
                /* this store produced the slot's value, so the slot no longer
                 * needs precision before this insn; for a register store the
                 * requirement moves to the stored register instead
                 */
                bt_clear_frame_slot(bt, fr, spi);
                if (class == BPF_STX)
                        bt_set_reg(bt, sreg);
        } else if (class == BPF_JMP || class == BPF_JMP32) {
                if (bpf_pseudo_call(insn)) {
                        int subprog_insn_idx, subprog;

                        /* call target is encoded relative to the next insn */
                        subprog_insn_idx = idx + insn->imm + 1;
                        subprog = find_subprog(env, subprog_insn_idx);
                        if (subprog < 0)
                                return -EFAULT;

                        if (subprog_is_global(env, subprog)) {
                                /* check that jump history doesn't have any
                                 * extra instructions from subprog; the next
                                 * instruction after call to global subprog
                                 * should be literally next instruction in
                                 * caller program
                                 */
                                WARN_ONCE(idx + 1 != subseq_idx, "verifier backtracking bug");
                                /* r1-r5 are invalidated after subprog call,
                                 * so for global func call it shouldn't be set
                                 * anymore
                                 */
                                if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
                                        verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
                                        WARN_ONCE(1, "verifier backtracking bug");
                                        return -EFAULT;
                                }
                                /* global subprog always sets R0 */
                                bt_clear_reg(bt, BPF_REG_0);
                                return 0;
                        } else {
                                /* static subprog call instruction, which
                                 * means that we are exiting current subprog,
                                 * so only r1-r5 could be still requested as
                                 * precise, r0 and r6-r10 or any stack slot in
                                 * the current frame should be zero by now
                                 */
                                if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
                                        verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
                                        WARN_ONCE(1, "verifier backtracking bug");
                                        return -EFAULT;
                                }
                                /* we are now tracking register spills correctly,
                                 * so any instance of leftover slots is a bug
                                 */
                                if (bt_stack_mask(bt) != 0) {
                                        verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
                                        WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)");
                                        return -EFAULT;
                                }
                                /* propagate r1-r5 to the caller */
                                for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
                                        if (bt_is_reg_set(bt, i)) {
                                                bt_clear_reg(bt, i);
                                                bt_set_frame_reg(bt, bt->frame - 1, i);
                                        }
                                }
                                /* continue backtracking in the caller's frame */
                                if (bt_subprog_exit(bt))
                                        return -EFAULT;
                                return 0;
                        }
                } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) {
                        /* exit from callback subprog to callback-calling helper or
                         * kfunc call. Use idx/subseq_idx check to discern it from
                         * straight line code backtracking.
                         * Unlike the subprog call handling above, we shouldn't
                         * propagate precision of r1-r5 (if any requested), as they are
                         * not actually arguments passed directly to callback subprogs
                         */
                        if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
                                verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
                                WARN_ONCE(1, "verifier backtracking bug");
                                return -EFAULT;
                        }
                        if (bt_stack_mask(bt) != 0) {
                                verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
                                WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)");
                                return -EFAULT;
                        }
                        /* clear r1-r5 in callback subprog's mask */
                        for (i = BPF_REG_1; i <= BPF_REG_5; i++)
                                bt_clear_reg(bt, i);
                        /* continue backtracking in the caller's frame */
                        if (bt_subprog_exit(bt))
                                return -EFAULT;
                        return 0;
                } else if (opcode == BPF_CALL) {
                        /* kfunc with imm==0 is invalid and fixup_kfunc_call will
                         * catch this error later. Make backtracking conservative
                         * with ENOTSUPP.
                         */
                        if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
                                return -ENOTSUPP;
                        /* regular helper call sets R0 */
                        bt_clear_reg(bt, BPF_REG_0);
                        if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
                                /* if backtracking was looking for registers R1-R5
                                 * they should have been found already.
                                 */
                                verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
                                WARN_ONCE(1, "verifier backtracking bug");
                                return -EFAULT;
                        }
                } else if (opcode == BPF_EXIT) {
                        bool r0_precise;

                        /* Backtracking to a nested function call, 'idx' is a part of
                         * the inner frame 'subseq_idx' is a part of the outer frame.
                         * In case of a regular function call, instructions giving
                         * precision to registers R1-R5 should have been found already.
                         * In case of a callback, it is ok to have R1-R5 marked for
                         * backtracking, as these registers are set by the function
                         * invoking callback.
                         */
                        if (subseq_idx >= 0 && calls_callback(env, subseq_idx))
                                for (i = BPF_REG_1; i <= BPF_REG_5; i++)
                                        bt_clear_reg(bt, i);
                        if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
                                verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
                                WARN_ONCE(1, "verifier backtracking bug");
                                return -EFAULT;
                        }

                        /* BPF_EXIT in subprog or callback always returns
                         * right after the call instruction, so by checking
                         * whether the instruction at subseq_idx-1 is subprog
                         * call or not we can distinguish actual exit from
                         * *subprog* from exit from *callback*. In the former
                         * case, we need to propagate r0 precision, if
                         * necessary. In the latter we never do that.
                         */
                        r0_precise = subseq_idx - 1 >= 0 &&
                                     bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) &&
                                     bt_is_reg_set(bt, BPF_REG_0);

                        /* r0 is cleared in the outer frame's mask before
                         * switching to the inner (callee) frame
                         */
                        bt_clear_reg(bt, BPF_REG_0);
                        if (bt_subprog_enter(bt))
                                return -EFAULT;

                        /* request r0 precision in the frame just entered */
                        if (r0_precise)
                                bt_set_reg(bt, BPF_REG_0);
                        /* r6-r9 and stack slots will stay set in caller frame
                         * bitmasks until we return back from callee(s)
                         */
                        return 0;
                } else if (BPF_SRC(insn->code) == BPF_X) {
                        /* neither operand is tracked, this insn is irrelevant */
                        if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg))
                                return 0;
                        /* dreg <cond> sreg
                         * Both dreg and sreg need precision before
                         * this insn. If only sreg was marked precise
                         * before it would be equally necessary to
                         * propagate it to dreg.
                         */
                        bt_set_reg(bt, dreg);
                        bt_set_reg(bt, sreg);
                         /* else dreg <cond> K
                          * Only dreg still needs precision before
                          * this insn, so for the K-based conditional
                          * there is nothing new to be marked.
                          */
                }
        } else if (class == BPF_LD) {
                /* dreg is not being tracked, this insn is irrelevant */
                if (!bt_is_reg_set(bt, dreg))
                        return 0;
                bt_clear_reg(bt, dreg);
                /* It's ld_imm64 or ld_abs or ld_ind.
                 * For ld_imm64 no further tracking of precision
                 * into parent is necessary
                 */
                if (mode == BPF_IND || mode == BPF_ABS)
                        /* to be analyzed */
                        return -ENOTSUPP;
        }
        return 0;
}

/* the scalar precision tracking algorithm:
 * . at the start all registers have precise=false.
 * . scalar ranges are tracked as normal through alu and jmp insns.
 * . once precise value of the scalar register is used in:
 *   .  ptr + scalar alu
 *   . if (scalar cond K|scalar)
 *   .  helper_call(.., scalar, ...) where ARG_CONST is expected
 *   backtrack through the verifier states and mark all registers and
 *   stack slots with spilled constants that these scalar registers
 *   should be precise.
 * . during state pruning two registers (or spilled stack slots)
 *   are equivalent if both are not precise.
 *
 * Note the verifier cannot simply walk register parentage chain,
 * since many different registers and stack slots could have been
 * used to compute single precise scalar.
 *
 * The approach of starting with precise=true for all registers and then
 * backtrack to mark a register as not precise when the verifier detects
 * that program doesn't care about specific value (e.g., when helper
 * takes register as ARG_ANYTHING parameter) is not safe.
 *
 * It's ok to walk single parentage chain of the verifier states.
 * It's possible that this backtracking will go all the way till 1st insn.
 * All other branches will be explored for needing precision later.
 *
 * The backtracking needs to deal with cases like:
 *   R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0)
 * r9 -= r8
 * r5 = r9
 * if r5 > 0x79f goto pc+7
 *    R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff))
 * r5 += 1
 * ...
 * call bpf_perf_event_output#25
 *   where .arg5_type = ARG_CONST_SIZE_OR_ZERO
 *
 * and this case:
 * r6 = 1
 * call foo // uses callee's r6 inside to compute r0
 * r0 += r6
 * if r0 == 0 goto
 *
 * to track above reg_mask/stack_mask needs to be independent for each frame.
 *
 * Also if parent's curframe > frame where backtracking started,
 * the verifier needs to mark registers in both frames, otherwise callees
 * may incorrectly prune callers. This is similar to
 * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
 *
 * For now backtracking falls back into conservative marking.
 */
/* Conservative fallback: force every scalar register (index below BPF_REG_FP)
 * and every spilled scalar stack slot to precise in all frames of all parent
 * states of @st. @st itself is left untouched. At log level 2 each newly
 * forced register/slot is reported.
 */
static void mark_all_scalars_precise(struct bpf_verifier_env *env,
                                     struct bpf_verifier_state *st)
{
        struct bpf_func_state *frame_state;
        struct bpf_reg_state *r;
        int fr, k;

        if (env->log.level & BPF_LOG_LEVEL2)
                verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n",
                        st->curframe);

        /* Big hammer: everything scalar on this path becomes precise
         * (pop_stack may still hand out !precise scalars). The walk starts
         * at the first parent because precision marks in the current,
         * not yet checkpointed state are not needed; the comment above
         * __mark_chain_precision explains why.
         */
        st = st->parent;
        while (st) {
                for (fr = 0; fr <= st->curframe; fr++) {
                        frame_state = st->frame[fr];

                        /* registers below the frame pointer */
                        for (k = 0; k < BPF_REG_FP; k++) {
                                r = &frame_state->regs[k];
                                if (r->type == SCALAR_VALUE && !r->precise) {
                                        r->precise = true;
                                        if (env->log.level & BPF_LOG_LEVEL2)
                                                verbose(env, "force_precise: frame%d: forcing r%d to be precise\n",
                                                        fr, k);
                                }
                        }

                        /* spilled registers living in stack slots */
                        for (k = 0; k < frame_state->allocated_stack / BPF_REG_SIZE; k++) {
                                if (!is_spilled_reg(&frame_state->stack[k]))
                                        continue;
                                r = &frame_state->stack[k].spilled_ptr;
                                if (r->type == SCALAR_VALUE && !r->precise) {
                                        r->precise = true;
                                        if (env->log.level & BPF_LOG_LEVEL2)
                                                verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n",
                                                        fr, -(k + 1) * 8);
                                }
                        }
                }
                st = st->parent;
        }
}

/* Drop the precise mark from every scalar register (index below BPF_REG_FP)
 * and every spilled scalar stack slot in frames 0..st->curframe of @st.
 * Non-scalar registers and slots are left as they are. @env is not used.
 */
static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{
        struct bpf_func_state *frame_state;
        struct bpf_reg_state *r;
        int fr, k;

        for (fr = 0; fr <= st->curframe; fr++) {
                frame_state = st->frame[fr];

                /* registers below the frame pointer */
                for (k = 0; k < BPF_REG_FP; k++) {
                        r = &frame_state->regs[k];
                        if (r->type == SCALAR_VALUE)
                                r->precise = false;
                }

                /* spilled registers living in stack slots */
                for (k = 0; k < frame_state->allocated_stack / BPF_REG_SIZE; k++) {
                        if (!is_spilled_reg(&frame_state->stack[k]))
                                continue;
                        r = &frame_state->stack[k].spilled_ptr;
                        if (r->type == SCALAR_VALUE)
                                r->precise = false;
                }
        }
}

/* Linear search: return true if @id is among the first s->count entries of
 * s->ids, false otherwise.
 */
static bool idset_contains(struct bpf_idset *s, u32 id)
{
        u32 pos = 0;

        while (pos < s->count) {
                if (s->ids[pos] == id)
                        return true;
                ++pos;
        }

        return false;
}

/* Append @id to the set. No duplicate check is performed.
 * Returns 0 on success, or -EFAULT (after a one-time warning) when the
 * fixed-size s->ids array is already full.
 */
static int idset_push(struct bpf_idset *s, u32 id)
{
        if (!WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids))) {
                s->ids[s->count] = id;
                s->count++;
                return 0;
        }

        return -EFAULT;
}

/* Empty the set by zeroing its element count; the ids array itself is not
 * cleared.
 */
static void idset_reset(struct bpf_idset *s)
{
        s->count = 0;
}

/* Collect a set of IDs for all registers currently marked as precise in env->bt.
 * Mark all registers with these IDs as precise.
 */
static int mark_precise_scalar_ids(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{
        struct bpf_idset *precise_ids = &env->idset_scratch;
        struct backtrack_state *bt = &env->bt;
        struct bpf_func_state *func;
        struct bpf_reg_state *reg;
        DECLARE_BITMAP(mask, 64);
        int i, fr;

        idset_reset(precise_ids);

        for (fr = bt->frame; fr >= 0; fr--) {
                func = st->frame[fr];

                bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
                for_each_set_bit(i, mask, 32) {
                        reg = &func->regs[i];
                        if (!reg->id || reg->type != SCALAR_VALUE)
                                continue;
                        if (idset_push(precise_ids, reg->id))
                                return -EFAULT;
                }

                bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
                for_each_set_bit(i, mask, 64) {
                        if (i >= func->allocated_stack / BPF_REG_SIZE)
                                break;
                        if (!is_spilled_scalar_reg(&func->stack[i]))
                                continue;
                        reg = &func->stack[i].spilled_ptr;
                        if (!reg->id)
                                continue;
                        if (idset_push(precise_ids, reg->id))
                                return -EFAULT;
                }
        }

        for (fr = 0; fr <= st->curframe; ++fr) {
                func = st->frame[fr];

                for (i = BPF_REG_0; i < BPF_REG_10; ++i) {
                        reg = &func->regs[i];
                        if (!reg->id)
                                continue;
                        if (!idset_contains(precise_ids, reg->id))
                                continue;
                        bt_set_frame_reg(bt, fr, i);
                }
                for (i = 0; i < func->allocated_stack / BPF_REG_SIZE; ++i) {
                        if (!is_spilled_scalar_reg(&func->stack[i]))
                                continue;
                        reg = &func->stack[i].spilled_ptr;
                        if (!reg->id)
                                continue;
                        if (!idset_contains(precise_ids, reg->id))
                                continue;
                        bt_set_frame_slot(bt, fr, i);
                }
        }

        return 0;
}

/*
 * __mark_chain_precision() backtracks BPF program instruction sequence and
 * chain of verifier states making sure that register *regno* (if regno >= 0)
 * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
 * SCALARS, as well as any other registers and slots that contribute to
 * a tracked state of given registers/stack slots, depending on specific BPF
 * assembly instructions (see backtrack_insn() for exact instruction handling
 * logic). This backtracking relies on recorded jmp_history and is able to
 * traverse entire chain of parent states. This process ends only when all the
 * necessary registers/slots and their transitive dependencies are marked as
 * precise.
 *
 * One important and subtle aspect is that precise marks *do not matter* in
 * the currently verified state (current state). It is important to understand
 * why this is the case.
 *
 * First, note that current state is the state that is not yet "checkpointed",
 * i.e., it is not yet put into env->explored_states, and it has no children
 * states as well. It's ephemeral, and can end up either a) being discarded if
 * compatible explored state is found at some point or BPF_EXIT instruction is
 * reached or b) checkpointed and put into env->explored_states, branching out
 * into one or more children states.
 *
 * In the former case, precise markings in current state are completely
 * ignored by state comparison code (see regsafe() for details). Only
 * checkpointed ("old") state precise markings are important, and if old
 * state's register/slot is precise, regsafe() assumes current state's
 * register/slot as precise and checks value ranges exactly and precisely. If
 * states turn out to be compatible, current state's necessary precise
 * markings and any required parent states' precise markings are enforced
 * after the fact with propagate_precision() logic. But it's
 * important to realize that in this case, even after marking current state
 * registers/slots as precise, we immediately discard current state. So what
 * actually matters is any of the precise markings propagated into current
 * state's parent states, which are always checkpointed (due to b) case above).
 * As such, for scenario a) it doesn't matter if current state has precise
 * markings set or not.
 *
 * Now, for the scenario b), checkpointing and forking into child(ren)
 * state(s). Note that before current state gets to checkpointing step, any
 * processed instruction always assumes precise SCALAR register/slot
 * knowledge: if precise value or range is useful to prune jump branch, BPF
 * verifier takes this opportunity enthusiastically. Similarly, when
 * register's value is used to calculate offset or memory address, exact
 * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
 * what we mentioned above about state comparison ignoring precise markings
 * during state comparison, BPF verifier ignores and also assumes precise
 * markings *at will* during instruction verification process. But as verifier
 * assumes precision, it also propagates any precision dependencies across
 * parent states, which are not yet finalized, so can be further restricted
 * based on new knowledge gained from restrictions enforced by their children
 * states. This is so that once those parent states are finalized, i.e., when
 * they have no more active children state, state comparison logic in
 * is_state_visited() would enforce strict and precise SCALAR ranges, if
 * required for correctness.
 *
 * To build a bit more intuition, note also that once a state is checkpointed,
 * the path we took to get to that state is not important. This is crucial
 * property for state pruning. When state is checkpointed and finalized at
 * some instruction index, it can be correctly and safely used to "short
 * circuit" any *compatible* state that reaches exactly the same instruction
 * index. I.e., if we jumped to that instruction from a completely different
 * code path than original finalized state was derived from, it doesn't
 * matter, current state can be discarded because from that instruction
 * forward having a compatible state will ensure we will safely reach the
 * exit. States describe preconditions for further exploration, but completely
 * forget the history of how we got here.
 *
 * This also means that even if we needed precise SCALAR range to get to
 * finalized state, but from that point forward *that same* SCALAR register is
 * never used in a precise context (i.e., its precise value is not needed for
 * correctness), it's correct and safe to mark such register as "imprecise"
 * (i.e., precise marking set to false). This is what we rely on when we do
 * not set precise marking in current state. If no child state requires
 * precision for any given SCALAR register, it's safe to dictate that it can
 * be imprecise. If any child state does require this register to be precise,
 * we'll mark it precise later retroactively during precise markings
 * propagation from child state to parent states.
 *
 * Skipping precise marking setting in current state is a mild version of
 * relying on the above observation. But we can utilize this property even
 * more aggressively by proactively forgetting any precise marking in the
 * current state (which we inherited from the parent state), right before we
 * checkpoint it and branch off into new child state. This is done by
 * mark_all_scalars_imprecise() to hopefully get more permissive and generic
 * finalized states which help in short circuiting more future states.
 */
/* Propagate "precise" marks backwards from the current verifier state.
 *
 * @env:   verifier environment; the backtracking state lives in env->bt
 * @regno: register of the current frame to start from, or a negative value
 *         to start only from what is already recorded in env->bt
 *
 * Returns 0 on success (including the early exits when env->bpf_capable is
 * not set or when there is nothing to track), -EFAULT on the internal
 * consistency failures reported below, or any other error returned by
 * backtrack_insn() (-ENOTSUPP is handled here and not propagated).
 */
static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
        struct backtrack_state *bt = &env->bt;
        struct bpf_verifier_state *st = env->cur_state;
        int first_idx = st->first_insn_idx;
        int last_idx = env->insn_idx;
        int subseq_idx = -1;
        struct bpf_func_state *func;
        struct bpf_reg_state *reg;
        bool skip_first = true;
        int i, fr, err;

        /* precision is not tracked at all unless env->bpf_capable is set */
        if (!env->bpf_capable)
                return 0;

        /* set frame number from which we are starting to backtrack */
        bt_init(bt, env->cur_state->curframe);

        /* Do sanity checks against current state of register and/or stack
         * slot, but don't set precise flag in current state, as precision
         * tracking in the current state is unnecessary.
         */
        func = st->frame[bt->frame];
        if (regno >= 0) {
                reg = &func->regs[regno];
                if (reg->type != SCALAR_VALUE) {
                        WARN_ONCE(1, "backtracing misuse");
                        return -EFAULT;
                }
                bt_set_reg(bt, regno);
        }

        /* nothing was requested, neither through regno nor through env->bt */
        if (bt_empty(bt))
                return 0;

        /* Each iteration handles one state of the chain: backtrack through
         * its instructions, then move to its parent state.
         */
        for (;;) {
                DECLARE_BITMAP(mask, 64);
                /* cursor into st's jump history, updated by get_prev_insn_idx() */
                u32 history = st->jmp_history_cnt;
                struct bpf_jmp_history_entry *hist;

                if (env->log.level & BPF_LOG_LEVEL2) {
                        verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
                                bt->frame, last_idx, first_idx, subseq_idx);
                }

                /* If some register with scalar ID is marked as precise,
                 * make sure that all registers sharing this ID are also precise.
                 * This is needed to estimate effect of find_equal_scalars().
                 * Do this at the last instruction of each state,
                 * bpf_reg_state::id fields are valid for these instructions.
                 *
                 * Allows to track precision in situation like below:
                 *
                 *     r2 = unknown value
                 *     ...
                 *   --- state #0 ---
                 *     ...
                 *     r1 = r2                 // r1 and r2 now share the same ID
                 *     ...
                 *   --- state #1 {r1.id = A, r2.id = A} ---
                 *     ...
                 *     if (r2 > 10) goto exit; // find_equal_scalars() assigns range to r1
                 *     ...
                 *   --- state #2 {r1.id = A, r2.id = A} ---
                 *     r3 = r10
                 *     r3 += r1                // need to mark both r1 and r2
                 */
                if (mark_precise_scalar_ids(env, st))
                        return -EFAULT;

                if (last_idx < 0) {
                        /* we are at the entry into subprog, which
                         * is expected for global funcs, but only if
                         * requested precise registers are R1-R5
                         * (which are global func's input arguments)
                         */
                        if (st->curframe == 0 &&
                            st->frame[0]->subprogno > 0 &&
                            st->frame[0]->callsite == BPF_MAIN_FUNC &&
                            bt_stack_mask(bt) == 0 &&
                            (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) {
                                bitmap_from_u64(mask, bt_reg_mask(bt));
                                /* mark the requested scalar arguments precise
                                 * and drop them from the tracking mask
                                 */
                                for_each_set_bit(i, mask, 32) {
                                        reg = &st->frame[0]->regs[i];
                                        bt_clear_reg(bt, i);
                                        if (reg->type == SCALAR_VALUE)
                                                reg->precise = true;
                                }
                                return 0;
                        }

                        verbose(env, "BUG backtracking func entry subprog %d reg_mask %x stack_mask %llx\n",
                                st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
                        WARN_ONCE(1, "verifier backtracking bug");
                        return -EFAULT;
                }

                /* walk instructions of this state backwards, from last_idx */
                for (i = last_idx;;) {
                        if (skip_first) {
                                /* only on the very first iteration: the insn
                                 * at last_idx is not passed to backtrack_insn()
                                 */
                                err = 0;
                                skip_first = false;
                        } else {
                                hist = get_jmp_hist_entry(st, history, i);
                                err = backtrack_insn(env, i, subseq_idx, hist, bt);
                        }
                        if (err == -ENOTSUPP) {
                                /* backtrack_insn() could not handle this insn:
                                 * fall back to marking all scalars precise in
                                 * the current state and stop tracking
                                 */
                                mark_all_scalars_precise(env, env->cur_state);
                                bt_reset(bt);
                                return 0;
                        } else if (err) {
                                return err;
                        }
                        if (bt_empty(bt))
                                /* Found assignment(s) into tracked register in this state.
                                 * Since this state is already marked, just return.
                                 * Nothing to be tracked further in the parent state.
                                 */
                                return 0;
                        subseq_idx = i;
                        i = get_prev_insn_idx(st, i, &history);
                        /* -ENOENT ends the walk of this state; continue with
                         * the parent state below
                         */
                        if (i == -ENOENT)
                                break;
                        if (i >= env->prog->len) {
                                /* This can happen if backtracking reached insn 0
                                 * and there are still reg_mask or stack_mask
                                 * to backtrack.
                                 * It means the backtracking missed the spot where
                                 * particular register was initialized with a constant.
                                 */
                                verbose(env, "BUG backtracking idx %d\n", i);
                                WARN_ONCE(1, "verifier backtracking bug");
                                return -EFAULT;
                        }
                }
                st = st->parent;
                /* no parent left: leftover bits are handled after the loop */
                if (!st)
                        break;

                /* Apply the pending masks to the parent state, frame by frame.
                 * Entries that are not scalars or are already precise are
                 * dropped from the mask; the rest are marked precise and stay
                 * in the mask for further backtracking.
                 */
                for (fr = bt->frame; fr >= 0; fr--) {
                        func = st->frame[fr];
                        bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
                        for_each_set_bit(i, mask, 32) {
                                reg = &func->regs[i];
                                if (reg->type != SCALAR_VALUE) {
                                        bt_clear_frame_reg(bt, fr, i);
                                        continue;
                                }
                                if (reg->precise)
                                        bt_clear_frame_reg(bt, fr, i);
                                else
                                        reg->precise = true;
                        }

                        bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
                        for_each_set_bit(i, mask, 64) {
                                /* a tracked slot must lie within the stack
                                 * allocated for this frame
                                 */
                                if (i >= func->allocated_stack / BPF_REG_SIZE) {
                                        verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n",
                                                i, func->allocated_stack / BPF_REG_SIZE);
                                        WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)");
                                        return -EFAULT;
                                }

                                if (!is_spilled_scalar_reg(&func->stack[i])) {
                                        bt_clear_frame_slot(bt, fr, i);
                                        continue;
                                }
                                reg = &func->stack[i].spilled_ptr;
                                if (reg->precise)
                                        bt_clear_frame_slot(bt, fr, i);
                                else
                                        reg->precise = true;
                        }
                        if (env->log.level & BPF_LOG_LEVEL2) {
                                fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
                                             bt_frame_reg_mask(bt, fr));
                                verbose(env, "mark_precise: frame%d: parent state regs=%s ",
                                        fr, env->tmp_str_buf);
                                fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
                                               bt_frame_stack_mask(bt, fr));
                                verbose(env, "stack=%s: ", env->tmp_str_buf);
                                print_verifier_state(env, func, true);
                        }
                }

                if (bt_empty(bt))
                        return 0;

                /* continue within the instruction range of the parent state;
                 * the insn that followed it is the first insn of the state
                 * we just finished
                 */
                subseq_idx = first_idx;
                last_idx = st->last_insn_idx;
                first_idx = st->first_insn_idx;
        }

        /* if we still have requested precise regs or slots, we missed
         * something (e.g., stack access through non-r10 register), so
         * fallback to marking all precise
         */
        if (!bt_empty(bt)) {
                mark_all_scalars_precise(env, env->cur_state);
                bt_reset(bt);
        }

        return 0;
}

/* Request precision tracking starting from register 'regno' of the current
 * frame. Thin wrapper that hands the request to __mark_chain_precision() and
 * returns its result unchanged.
 */
int mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
        int ret = __mark_chain_precision(env, regno);

        return ret;
}

/* Batch variant of mark_chain_precision(): no single register is named
 * (regno is passed as -1). The caller is expected to have filled env->bt
 * with the desired reg and stack masks across all relevant frames.
 */
static int mark_chain_precision_batch(struct bpf_verifier_env *env)
{
        const int no_regno = -1;

        return __mark_chain_precision(env, no_regno);
}

/* Tell whether a register of the given type is one of the pointer types
 * enumerated below. The decision is made on the value returned by
 * base_type(); every other type yields false.
 */
static bool is_spillable_regtype(enum bpf_reg_type type)
{
        bool spillable = false;

        switch (base_type(type)) {
        case PTR_TO_MAP_VALUE:
        case PTR_TO_STACK:
        case PTR_TO_CTX:
        case PTR_TO_PACKET:
        case PTR_TO_PACKET_META:
        case PTR_TO_PACKET_END:
        case PTR_TO_FLOW_KEYS:
        case CONST_PTR_TO_MAP:
        case PTR_TO_SOCKET:
        case PTR_TO_SOCK_COMMON:
        case PTR_TO_TCP_SOCK:
        case PTR_TO_XDP_SOCK:
        case PTR_TO_BTF_ID:
        case PTR_TO_BUF:
        case PTR_TO_MEM:
        case PTR_TO_FUNC:
        case PTR_TO_MAP_KEY:
        case PTR_TO_ARENA:
                spillable = true;
                break;
        default:
                break;
        }

        return spillable;
}

/* Return true only for a SCALAR_VALUE register whose var_off is the
 * constant zero.
 */
static bool register_is_null(struct bpf_reg_state *reg)
{
        if (reg->type != SCALAR_VALUE)
                return false;

        return tnum_equals_const(reg->var_off, 0);
}

/* Return true when 'reg' is a SCALAR_VALUE holding a constant. With
 * 'subreg32' set only the tnum_subreg() part of var_off is examined,
 * otherwise the whole var_off.
 */
static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32)
{
        if (reg->type != SCALAR_VALUE)
                return false;

        if (subreg32)
                return tnum_is_const(tnum_subreg(reg->var_off));

        return tnum_is_const(reg->var_off);
}

/* Return the constant held by 'reg'; only meaningful when is_reg_const()
 * is true for the same 'subreg32'. With 'subreg32' set the value is taken
 * from tnum_subreg(var_off), otherwise from var_off itself.
 */
static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32)
{
        if (subreg32)
                return tnum_subreg(reg->var_off).value;

        return reg->var_off.value;
}

/* Return true when 'reg' must be treated as a pointer value: pointer leaks
 * are not allowed and the register is anything other than SCALAR_VALUE.
 * When 'allow_ptr_leaks' is set the answer is always false.
 */
static bool __is_pointer_value(bool allow_ptr_leaks,
                               const struct bpf_reg_state *reg)
{
        return !allow_ptr_leaks && reg->type != SCALAR_VALUE;
}

/* Give 'src_reg' a fresh ID from env->id_gen if it is a non-constant
 * SCALAR_VALUE that does not have one yet. The ID is then copied to the
 * destination of the move and used by find_equal_scalars() to propagate
 * min/max range.
 */
static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,
                                        struct bpf_reg_state *src_reg)
{
        /* only scalars take part in ID-based range propagation */
        if (src_reg->type != SCALAR_VALUE)
                return;

        /* an existing ID is kept as is */
        if (src_reg->id)
                return;

        /* constants are left without an ID */
        if (tnum_is_const(src_reg->var_off))
                return;

        src_reg->id = ++env->id_gen;
}

/* Copy src state preserving dst->parent and dst->live fields */
static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
{
        struct bpf_reg_state *parent = dst->parent;
        enum bpf_reg_liveness live = dst->live;

        *dst = *src;
        dst->parent = parent;
        dst->live = live;
}

static void save_register_state(struct bpf_verifier_env *env,
                                struct bpf_func_state *state,
                                int spi, struct bpf_reg_state *reg,
                                int size)
{
        int i;

        copy_register_state(&state->stack[spi].spilled_ptr, reg);
        if (size == BPF_REG_SIZE)
                state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;

        for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--)
                state->stack[spi].slot_type[i - 1] = STACK_SPILL;

        /* size < 8 bytes spill */
        for (; i; i--)
                mark_stack_slot_misc(env, &state->stack[spi].slot_type[i - 1]);
}

/* Return true when 'insn' has class BPF_ST and mode BPF_MEM. */
static bool is_bpf_st_mem(struct bpf_insn *insn)
{
        if (BPF_CLASS(insn->code) != BPF_ST)
                return false;

        return BPF_MODE(insn->code) == BPF_MEM;
}

/* Return fls64() of the register's umax_value, i.e. the position of the
 * highest set bit of its unsigned upper bound.
 */
static int get_reg_width(struct bpf_reg_state *reg)
{
        int width = fls64(reg->umax_value);

        return width;
}

/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
 * stack boundary and alignment are checked in check_mem_access()
 *
 * Track the effect of a 'size'-byte write at fixed stack offset 'off' in
 * frame 'state'. 'value_regno' is the source register in the current frame,
 * or negative when no register is supplied (reg stays NULL below).
 * 'insn_idx' selects the instruction and its insn_aux_data entry.
 *
 * Returns 0 on success, -EACCES or -EINVAL when the write is rejected, or an
 * error from destroy_if_dynptr_stack_slot(), mark_chain_precision() or
 * push_jmp_history().
 */
static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
                                       /* stack frame we're writing to */
                                       struct bpf_func_state *state,
                                       int off, int size, int value_regno,
                                       int insn_idx)
{
        struct bpf_func_state *cur; /* state of the current function */
        /* slot is the byte index counted from the top of the stack
         * (off == -1 gives slot 0); spi is the index of the
         * BPF_REG_SIZE-byte stack slot containing that byte
         */
        int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
        struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
        struct bpf_reg_state *reg = NULL;
        int insn_flags = insn_stack_access_flags(state->frameno, spi);

        /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
         * so it's aligned access and [off, off + size) are within stack limits
         */
        if (!env->allow_ptr_leaks &&
            is_spilled_reg(&state->stack[spi]) &&
            size != BPF_REG_SIZE) {
                verbose(env, "attempt to corrupt spilled pointer on stack\n");
                return -EACCES;
        }

        cur = env->cur_state->frame[env->cur_state->curframe];
        if (value_regno >= 0)
                reg = &cur->regs[value_regno];
        if (!env->bypass_spec_v4) {
                /* flag the insn for sanitation when a spillable pointer is
                 * stored, or when any overwritten byte is neither STACK_MISC
                 * nor STACK_ZERO
                 */
                bool sanitize = reg && is_spillable_regtype(reg->type);

                for (i = 0; i < size; i++) {
                        u8 type = state->stack[spi].slot_type[i];

                        if (type != STACK_MISC && type != STACK_ZERO) {
                                sanitize = true;
                                break;
                        }
                }

                if (sanitize)
                        env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
        }

        err = destroy_if_dynptr_stack_slot(env, state, spi);
        if (err)
                return err;

        mark_stack_slot_scratched(env, spi);
        if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) {
                /* scalar register stored at a slot-aligned offset: keep it
                 * as a register spill
                 */
                bool reg_value_fits;

                reg_value_fits = get_reg_width(reg) <= BITS_PER_BYTE * size;
                /* Make sure that reg had an ID to build a relation on spill. */
                if (reg_value_fits)
                        assign_scalar_id_before_mov(env, reg);
                save_register_state(env, state, spi, reg, size);
                /* Break the relation on a narrowing spill. */
                if (!reg_value_fits)
                        state->stack[spi].spilled_ptr.id = 0;
        } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
                   env->bpf_capable) {
                /* BPF_ST store of an immediate at a slot-aligned offset:
                 * build a temporary scalar register known to equal insn->imm
                 * and save it as a spill
                 */
                struct bpf_reg_state *tmp_reg = &env->fake_reg[0];

                memset(tmp_reg, 0, sizeof(*tmp_reg));
                __mark_reg_known(tmp_reg, insn->imm);
                tmp_reg->type = SCALAR_VALUE;
                save_register_state(env, state, spi, tmp_reg, size);
        } else if (reg && is_spillable_regtype(reg->type)) {
                /* register containing pointer is being spilled into stack */
                if (size != BPF_REG_SIZE) {
                        verbose_linfo(env, insn_idx, "; ");
                        verbose(env, "invalid size of register spill\n");
                        return -EACCES;
                }
                /* state != cur means the target frame is not the current one */
                if (state != cur && reg->type == PTR_TO_STACK) {
                        verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
                        return -EINVAL;
                }
                save_register_state(env, state, spi, reg, size);
        } else {
                u8 type = STACK_MISC;

                /* regular write of data into stack destroys any spilled ptr */
                state->stack[spi].spilled_ptr.type = NOT_INIT;
                /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
                if (is_stack_slot_special(&state->stack[spi]))
                        for (i = 0; i < BPF_REG_SIZE; i++)
                                scrub_spilled_slot(&state->stack[spi].slot_type[i]);

                /* only mark the slot as written if all 8 bytes were written
                 * otherwise read propagation may incorrectly stop too soon
                 * when stack slots are partially written.
                 * This heuristic means that read propagation will be
                 * conservative, since it will add reg_live_read marks
                 * to stack slots all the way to first state when programs
                 * writes+reads less than 8 bytes
                 */
                if (size == BPF_REG_SIZE)
                        state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;

                /* when we zero initialize stack slots mark them as such */
                if ((reg && register_is_null(reg)) ||
                    (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
                        /* STACK_ZERO case happened because register spill
                         * wasn't properly aligned at the stack slot boundary,
                         * so it's not a register spill anymore; force
                         * originating register to be precise to make
                         * STACK_ZERO correct for subsequent states
                         */
                        err = mark_chain_precision(env, value_regno);
                        if (err)
                                return err;
                        type = STACK_ZERO;
                }

                /* Mark slots affected by this stack write. */
                for (i = 0; i < size; i++)
                        state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = type;
                insn_flags = 0; /* not a register spill */
        }

        /* insn_flags is still set only on the register spill paths above */
        if (insn_flags)
                return push_jmp_history(env, env->cur_state, insn_flags);
        return 0;
}

/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is
 * known to contain a variable offset.
 * This function checks whether the write is permitted and conservatively
 * tracks the effects of the write, considering that each stack slot in the
 * dynamic range is potentially written to.
 *
 * 'off' includes 'regno->off'.
 * 'value_regno' can be -1, meaning that an unknown value is being written to
 * the stack.
 *
 * Spilled pointers in range are not marked as written because we don't know
 * what's going to be actually written. This means that read propagation for
 * future reads cannot be terminated by this write.
 *
 * For privileged programs, uninitialized stack slots are considered
 * initialized by this write (even though we don't know exactly what offsets
 * are going to be written to). The idea is that we don't want the verifier to
 * reject future reads that access slots written to through variable offsets.
 *
 * Returns 0 on success, -EINVAL when the write is rejected, or an error from
 * destroy_if_dynptr_stack_slot() or mark_chain_precision().
 */
static int check_stack_write_var_off(struct bpf_verifier_env *env,
                                     /* func where register points to */
                                     struct bpf_func_state *state,
                                     int ptr_regno, int off, int size,
                                     int value_regno, int insn_idx)
{
        struct bpf_func_state *cur; /* state of the current function */
        int min_off, max_off;
        int i, err;
        struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
        struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
        bool writing_zero = false;
        /* set if the fact that we're writing a zero is used to let any
         * stack slots remain STACK_ZERO
         */
        bool zero_used = false;

        cur = env->cur_state->frame[env->cur_state->curframe];
        ptr_reg = &cur->regs[ptr_regno];
        /* [min_off, max_off) is the range of stack byte offsets that the
         * write may touch, derived from the pointer's signed bounds
         */
        min_off = ptr_reg->smin_value + off;
        max_off = ptr_reg->smax_value + off + size;
        if (value_regno >= 0)
                value_reg = &cur->regs[value_regno];
        /* a zero is written either from a register known to be zero or by
         * a BPF_ST instruction with a zero immediate
         */
        if ((value_reg && register_is_null(value_reg)) ||
            (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))
                writing_zero = true;

        /* first pass: handle dynptr slots anywhere in the range before any
         * slot type is modified
         */
        for (i = min_off; i < max_off; i++) {
                int spi;

                spi = __get_spi(i);
                err = destroy_if_dynptr_stack_slot(env, state, spi);
                if (err)
                        return err;
        }

        /* Variable offset writes destroy any spilled pointers in range. */
        for (i = min_off; i < max_off; i++) {
                u8 new_type, *stype;
                int slot, spi;

                slot = -i - 1;
                spi = slot / BPF_REG_SIZE;
                stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
                mark_stack_slot_scratched(env, spi);

                if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) {
                        /* Reject the write if range we may write to has not
                         * been initialized beforehand. If we didn't reject
                         * here, the ptr status would be erased below (even
                         * though not all slots are actually overwritten),
                         * possibly opening the door to leaks.
                         *
                         * We do however catch STACK_INVALID case below, and
                         * only allow reading possibly uninitialized memory
                         * later for CAP_PERFMON, as the write may not happen to
                         * that slot.
                         */
                        verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
                                insn_idx, i);
                        return -EINVAL;
                }

                /* If writing_zero and the spi slot contains a spill of value 0,
                 * maintain the spill type.
                 */
                if (writing_zero && *stype == STACK_SPILL &&
                    is_spilled_scalar_reg(&state->stack[spi])) {
                        struct bpf_reg_state *spill_reg = &state->stack[spi].spilled_ptr;

                        if (tnum_is_const(spill_reg->var_off) && spill_reg->var_off.value == 0) {
                                zero_used = true;
                                continue;
                        }
                }

                /* Erase all other spilled pointers. */
                state->stack[spi].spilled_ptr.type = NOT_INIT;

                /* Update the slot type. */
                new_type = STACK_MISC;
                if (writing_zero && *stype == STACK_ZERO) {
                        new_type = STACK_ZERO;
                        zero_used = true;
                }
                /* If the slot is STACK_INVALID, we check whether it's OK to
                 * pretend that it will be initialized by this write. The slot
                 * might not actually be written to, and so if we mark it as
                 * initialized future reads might leak uninitialized memory.
                 * For privileged programs, we will accept such reads to slots
                 * that may or may not be written because, if we were to reject
                 * them, the error would be too confusing.
                 */
                if (*stype == STACK_INVALID && !env->allow_uninit_stack) {
                        verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d",
                                        insn_idx, i);
                        return -EINVAL;
                }
                *stype = new_type;
        }
        if (zero_used) {
                /* backtracking doesn't work for STACK_ZERO yet. */
                err = mark_chain_precision(env, value_regno);
                if (err)
                        return err;
        }
        return 0;
}

/* Set the state of register 'dst_regno' (in the current frame) after it has
 * been loaded from stack bytes [min_off, max_off) of 'ptr_state'.
 *
 * If every byte in the range is STACK_ZERO the register becomes a known
 * zero; otherwise it becomes an unknown SCALAR. Register fills are not
 * handled here: the caller must already have marked all spilled registers
 * in the range as read. The destination is always marked REG_LIVE_WRITTEN.
 */
static void mark_reg_stack_read(struct bpf_verifier_env *env,
                                /* func where src register points to */
                                struct bpf_func_state *ptr_state,
                                int min_off, int max_off, int dst_regno)
{
        struct bpf_verifier_state *cur_vstate = env->cur_state;
        struct bpf_func_state *cur_func = cur_vstate->frame[cur_vstate->curframe];
        int off = min_off;
        int zero_bytes = 0;

        /* count leading STACK_ZERO bytes; stop at the first other type */
        while (off < max_off) {
                int slot = -off - 1;
                int spi = slot / BPF_REG_SIZE;
                u8 *stype;

                mark_stack_slot_scratched(env, spi);
                stype = ptr_state->stack[spi].slot_type;
                if (stype[slot % BPF_REG_SIZE] != STACK_ZERO)
                        break;
                zero_bytes++;
                off++;
        }

        if (zero_bytes != max_off - min_off) {
                /* have read misc data from the stack */
                mark_reg_unknown(env, cur_func->regs, dst_regno);
        } else {
                /* Any access_size read into register is zero extended,
                 * so the whole register == const_zero.
                 */
                __mark_reg_const_zero(env, &cur_func->regs[dst_regno]);
        }
        cur_func->regs[dst_regno].live |= REG_LIVE_WRITTEN;
}

/* Read the stack at 'off' and put the results into the register indicated by
 * 'dst_regno'. It handles reg filling if the addressed stack slot is a
 * spilled reg.
 *
 * 'dst_regno' can be -1, meaning that the read value is not going to a
 * register.
 *
 * The access is assumed to be within the current stack bounds.
 *
 * 'reg_state' is the frame whose stack is read; the destination register
 * belongs to the current frame. 'size' is the access size in bytes.
 *
 * Returns 0 on success, -EACCES when the read is rejected, or the result of
 * push_jmp_history() when the read is recorded as a register fill.
 */
static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
                                      /* func where src register points to */
                                      struct bpf_func_state *reg_state,
                                      int off, int size, int dst_regno)
{
        struct bpf_verifier_state *vstate = env->cur_state;
        struct bpf_func_state *state = vstate->frame[vstate->curframe];
        /* slot: byte index from the top of the stack; spi: stack slot index */
        int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
        struct bpf_reg_state *reg;
        u8 *stype, type;
        int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);

        stype = reg_state->stack[spi].slot_type;
        reg = &reg_state->stack[spi].spilled_ptr;

        mark_stack_slot_scratched(env, spi);

        if (is_spilled_reg(&reg_state->stack[spi])) {
                u8 spill_size = 1;

                /* count contiguous STACK_SPILL bytes going down from the
                 * top byte of the slot; the top byte itself is accounted
                 * for by the initial value of 1
                 */
                for (i = BPF_REG_SIZE - 1; i > 0 && stype[i - 1] == STACK_SPILL; i--)
                        spill_size++;

                if (size != BPF_REG_SIZE || spill_size != BPF_REG_SIZE) {
                        /* partial read and/or partial spill: only allowed
                         * when the spilled register is a scalar
                         */
                        if (reg->type != SCALAR_VALUE) {
                                verbose_linfo(env, env->insn_idx, "; ");
                                verbose(env, "invalid size of register fill\n");
                                return -EACCES;
                        }

                        mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
                        if (dst_regno < 0)
                                return 0;

                        if (size <= spill_size &&
                            bpf_stack_narrow_access_ok(off, size, spill_size)) {
                                /* The earlier check_reg_arg() has decided the
                                 * subreg_def for this insn.  Save it first.
                                 */
                                s32 subreg_def = state->regs[dst_regno].subreg_def;

                                copy_register_state(&state->regs[dst_regno], reg);
                                state->regs[dst_regno].subreg_def = subreg_def;

                                /* Break the relation on a narrowing fill.
                                 * coerce_reg_to_size will adjust the boundaries.
                                 */
                                if (get_reg_width(reg) > size * BITS_PER_BYTE)
                                        state->regs[dst_regno].id = 0;
                        } else {
                                int spill_cnt = 0, zero_cnt = 0;

                                /* classify every byte being read; any type
                                 * not accepted below rejects the read
                                 */
                                for (i = 0; i < size; i++) {
                                        type = stype[(slot - i) % BPF_REG_SIZE];
                                        if (type == STACK_SPILL) {
                                                spill_cnt++;
                                                continue;
                                        }
                                        if (type == STACK_MISC)
                                                continue;
                                        if (type == STACK_ZERO) {
                                                zero_cnt++;
                                                continue;
                                        }
                                        if (type == STACK_INVALID && env->allow_uninit_stack)
                                                continue;
                                        verbose(env, "invalid read from stack off %d+%d size %d\n",
                                                off, i, size);
                                        return -EACCES;
                                }

                                if (spill_cnt == size &&
                                    tnum_is_const(reg->var_off) && reg->var_off.value == 0) {
                                        __mark_reg_const_zero(env, &state->regs[dst_regno]);
                                        /* this IS register fill, so keep insn_flags */
                                } else if (zero_cnt == size) {
                                        /* similarly to mark_reg_stack_read(), preserve zeroes */
                                        __mark_reg_const_zero(env, &state->regs[dst_regno]);
                                        insn_flags = 0; /* not restoring original register state */
                                } else {
                                        mark_reg_unknown(env, state->regs, dst_regno);
                                        insn_flags = 0; /* not restoring original register state */
                                }
                        }
                        state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
                } else if (dst_regno >= 0) {
                        /* restore register state from stack */
                        copy_register_state(&state->regs[dst_regno], reg);
                        /* mark reg as written since spilled pointer state likely
                         * has its liveness marks cleared by is_state_visited()
                         * which resets stack/reg liveness for state transitions
                         */
                        state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
                } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) {
                        /* If dst_regno==-1, the caller is asking us whether
                         * it is acceptable to use this value as a SCALAR_VALUE
                         * (e.g. for XADD).
                         * We must not allow unprivileged callers to do that
                         * with spilled pointers.
                         */
                        verbose(env, "leaking pointer from stack off %d\n",
                                off);
                        return -EACCES;
                }
                mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
        } else {
                /* slot does not hold a spilled register: every byte read
                 * must be STACK_MISC, STACK_ZERO, or (when uninit stack is
                 * allowed) STACK_INVALID
                 */
                for (i = 0; i < size; i++) {
                        type = stype[(slot - i) % BPF_REG_SIZE];
                        if (type == STACK_MISC)
                                continue;
                        if (type == STACK_ZERO)
                                continue;
                        if (type == STACK_INVALID && env->allow_uninit_stack)
                                continue;
                        verbose(env, "invalid read from stack off %d+%d size %d\n",
                                off, i, size);
                        return -EACCES;
                }
                mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
                if (dst_regno >= 0)
                        mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
                insn_flags = 0; /* we are not restoring spilled register */
        }
        /* insn_flags is still set only when a register fill took place */
        if (insn_flags)
                return push_jmp_history(env, env->cur_state, insn_flags);
        return 0;
}

/* Identifies who performs a memory access that is being checked. */
enum bpf_access_src {
        ACCESS_DIRECT = 1,  /* the access is performed by an instruction */
        ACCESS_HELPER = 2,  /* the access is performed by a helper */
};

/* Forward declaration so that check_stack_read_var_off() below can call it. */
static int check_stack_range_initialized(struct bpf_verifier_env *env,
                                         int regno, int off, int access_size,
                                         bool zero_size_allowed,
                                         enum bpf_access_src type,
                                         struct bpf_call_arg_meta *meta);

static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
{
        return cur_regs(env) + regno;
}

/* Handle a stack read through 'ptr_regno' whose offset is not a known
 * constant, placing the result into register 'dst_regno'.
 *
 * 'off' includes the pointer register's fixed offset (i.e. 'ptr_regno.off'),
 * but not its variable offset.
 * 'size' is assumed to be <= reg size and the access is assumed to be aligned.
 *
 * Unlike check_stack_read_fixed_off, this function does not fill registers:
 * a read of a spilled register cannot be detected when the offset is not
 * fixed, so 'dst_regno' is conservatively treated as containing
 * SCALAR_VALUE. Callers that have a fixed offset are expected to use
 * check_stack_read_fixed_off instead.
 *
 * Returns 0 on success, or the error reported by
 * check_stack_range_initialized().
 */
static int check_stack_read_var_off(struct bpf_verifier_env *env,
                                    int ptr_regno, int off, int size, int dst_regno)
{
        struct bpf_reg_state *ptr_reg = reg_state(env, ptr_regno);
        struct bpf_func_state *frame = func(env, ptr_reg);
        int lowest, highest;
        int ret;

        /* A NULL meta is passed, so raw access will not be permitted. */
        ret = check_stack_range_initialized(env, ptr_regno, off, size,
                                            false, ACCESS_DIRECT, NULL);
        if (ret)
                return ret;

        /* Report the whole range the pointer may cover as read. */
        lowest = ptr_reg->smin_value + off;
        highest = ptr_reg->smax_value + off;
        mark_reg_stack_read(env, frame, lowest, highest + size, dst_regno);

        return 0;
}

/* Dispatch a stack read to check_stack_read_fixed_off() or
 * check_stack_read_var_off(), depending on whether the pointer register's
 * var_off is a known constant.
 *
 * The caller must ensure that the offset falls within the allocated stack
 * bounds.
 *
 * 'dst_regno' is the register which will receive the value from the stack. It
 * can be -1, meaning that the read value is not going to a register; in that
 * case a variable offset is rejected with -EACCES.
 */
static int check_stack_read(struct bpf_verifier_env *env,
                            int ptr_regno, int off, int size,
                            int dst_regno)
{
        struct bpf_reg_state *ptr_reg = reg_state(env, ptr_regno);
        struct bpf_func_state *frame = func(env, ptr_reg);
        /* Some accesses are only permitted with a static offset. */
        bool fixed = tnum_is_const(ptr_reg->var_off);

        /* Variable offset is prohibited for unprivileged mode for simplicity
         * since it requires corresponding support in Spectre masking for stack
         * ALU. See also retrieve_ptr_limit(). The check in
         * check_stack_access_for_ptr_arithmetic() called by
         * adjust_ptr_min_max_vals() prevents users from creating stack pointers
         * with variable offsets, therefore no check is required here. Further,
         * just checking it here would be insufficient as speculative stack
         * writes could still lead to unsafe speculative behaviour.
         */
        if (fixed) {
                /* Fold the constant part of var_off into the offset. */
                off += ptr_reg->var_off.value;
                return check_stack_read_fixed_off(env, frame, off, size,
                                                  dst_regno);
        }

        /* The offset is required to be static when reads don't go to a
         * register, in order to not leak pointers (see
         * check_stack_read_fixed_off).
         */
        if (dst_regno < 0) {
                char tn_buf[48];

                tnum_strn(tn_buf, sizeof(tn_buf), ptr_reg->var_off);
                verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n",
                        tn_buf, off, size);
                return -EACCES;
        }

        /* Variable offset stack reads need more conservative handling
         * than fixed offset ones. dst_regno >= 0 is guaranteed here.
         */
        return check_stack_read_var_off(env, ptr_regno, off, size, dst_regno);
}


/* Dispatch a stack write to check_stack_write_fixed_off() or
 * check_stack_write_var_off(), depending on whether the pointer register's
 * var_off is a known constant.
 *
 * 'ptr_regno' is the register used as a pointer into the stack.
 * 'off' includes 'ptr_regno->off', but not its variable offset (if any).
 * 'value_regno' is the register whose value we're writing to the stack. It can
 * be -1, meaning that we're not writing from a register.
 *
 * The caller must ensure that the offset falls within the maximum stack size.
 */
static int check_stack_write(struct bpf_verifier_env *env,
                             int ptr_regno, int off, int size,
                             int value_regno, int insn_idx)
{
        struct bpf_reg_state *ptr_reg = reg_state(env, ptr_regno);
        struct bpf_func_state *frame = func(env, ptr_reg);

        /* Variable offset stack writes need more conservative handling
         * than fixed offset ones.
         */
        if (!tnum_is_const(ptr_reg->var_off))
                return check_stack_write_var_off(env, frame,
                                                 ptr_regno, off, size,
                                                 value_regno, insn_idx);

        /* Fold the constant part of var_off into the offset. */
        off += ptr_reg->var_off.value;
        return check_stack_write_fixed_off(env, frame, off, size,
                                           value_regno, insn_idx);
}

/* Check that the map referenced by register 'regno' permits an access of
 * kind 'type', based on the capabilities derived from the map's flags.
 *
 * 'off' and 'size' are only used for the log message.
 * Returns 0 if the access kind is permitted, -EACCES otherwise.
 */
static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
                                 int off, int size, enum bpf_access_type type)
{
        struct bpf_map *map = cur_regs(env)[regno].map_ptr;
        u32 cap = bpf_map_flags_to_cap(map);

        switch (type) {
        case BPF_WRITE:
                if (cap & BPF_MAP_CAN_WRITE)
                        break;
                verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n",
                        map->value_size, off, size);
                return -EACCES;
        case BPF_READ:
                if (cap & BPF_MAP_CAN_READ)
                        break;
                verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n",
                        map->value_size, off, size);
                return -EACCES;
        default:
                /* Any other access kind is not restricted here. */
                break;
        }

        return 0;
}

/* Check a read/write of 'size' bytes at offset 'off' into a memory region of
 * 'mem_size' bytes (e.g., map value, ringbuf sample, etc).
 *
 * The access is accepted when 'off' is non-negative, 'size' is positive (or
 * zero with 'zero_size_allowed') and off + size does not exceed 'mem_size'.
 * Otherwise a message that depends on the type of register 'regno' is logged
 * and -EACCES is returned.
 */
static int __check_mem_access(struct bpf_verifier_env *env, int regno,
                              int off, int size, u32 mem_size,
                              bool zero_size_allowed)
{
        struct bpf_reg_state *reg;

        /* Negative sizes are never valid; zero only when explicitly allowed. */
        if (size < 0 || (size == 0 && !zero_size_allowed))
                goto reject;
        /* The sum is done in 64 bits so that it cannot wrap. */
        if (off < 0 || (u64)off + size > mem_size)
                goto reject;

        return 0;

reject:
        reg = cur_regs(env) + regno;
        switch (reg->type) {
        case PTR_TO_MAP_KEY:
                verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n",
                        mem_size, off, size);
                break;
        case PTR_TO_MAP_VALUE:
                verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
                        mem_size, off, size);
                break;
        case PTR_TO_PACKET:
        case PTR_TO_PACKET_META:
        case PTR_TO_PACKET_END:
                verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
                        off, size, regno, reg->id, off, mem_size);
                break;
        case PTR_TO_MEM:
        default:
                verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n",
                        mem_size, off, size);
                break;
        }

        return -EACCES;
}

/* Check a read/write into a memory region of 'mem_size' bytes through
 * register 'regno', whose offset may be variable.
 *
 * Both ends of the register's range are tested: 'off' plus the signed
 * minimum and 'off' plus the unsigned maximum must each pass
 * __check_mem_access(). Returns 0 when both do, a negative error otherwise.
 */
static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
                                   int off, int size, u32 mem_size,
                                   bool zero_size_allowed)
{
        struct bpf_verifier_state *cur = env->cur_state;
        struct bpf_func_state *frame = cur->frame[cur->curframe];
        struct bpf_reg_state *reg = &frame->regs[regno];
        s64 lowest;
        int ret;

        /* We may have adjusted the register pointing to memory region, so we
         * need to try adding each of min_value and max_value to off
         * to make sure our theoretical access will be safe.
         *
         * The minimum value is only important with signed comparisons where
         * we can't assume the floor of a value is 0. A negative minimum is
         * rejected if it is S64_MIN, if off + min does not fit in 32 bits, or
         * if off + min is itself negative.
         */
        if (reg->smin_value < 0) {
                if (reg->smin_value == S64_MIN)
                        goto negative_min;
                lowest = off + reg->smin_value;
                if (lowest != (s64)(s32)lowest || lowest < 0)
                        goto negative_min;
        }

        ret = __check_mem_access(env, regno, reg->smin_value + off, size,
                                 mem_size, zero_size_allowed);
        if (ret) {
                verbose(env, "R%d min value is outside of the allowed memory range\n",
                        regno);
                return ret;
        }

        /* If we haven't set a max value then we need to bail since we can't be
         * sure we won't do bad things.
         * If reg->umax_value + off could overflow, treat that as unbounded too.
         */
        if (reg->umax_value >= BPF_MAX_VAR_OFF) {
                verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n",
                        regno);
                return -EACCES;
        }

        ret = __check_mem_access(env, regno, reg->umax_value + off, size,
                                 mem_size, zero_size_allowed);
        if (ret) {
                verbose(env, "R%d max value is outside of the allowed memory range\n",
                        regno);
                return ret;
        }

        return 0;

negative_min:
        verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
                regno);
        return -EACCES;
}

/* Check that pointer register 'reg' (number 'regno') is in a form that may be
 * accessed or passed to a helper.
 *
 * A negative fixed offset is always rejected. A non-zero fixed offset is
 * rejected unless 'fixed_off_ok' is set. The variable offset must be the
 * constant zero. Returns 0 when acceptable, -EACCES otherwise.
 */
static int __check_ptr_off_reg(struct bpf_verifier_env *env,
                               const struct bpf_reg_state *reg, int regno,
                               bool fixed_off_ok)
{
        char tn_buf[48];

        if (reg->off < 0) {
                verbose(env, "negative offset %s ptr R%d off=%d disallowed\n",
                        reg_type_str(env, reg->type), regno, reg->off);
                return -EACCES;
        }

        if (reg->off && !fixed_off_ok) {
                verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
                        reg_type_str(env, reg->type), regno, reg->off);
                return -EACCES;
        }

        /* Only a var_off that is exactly the constant 0 is acceptable. */
        if (tnum_is_const(reg->var_off) && !reg->var_off.value)
                return 0;

        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
        verbose(env, "variable %s access var_off=%s disallowed\n",
                reg_type_str(env, reg->type), tn_buf);
        return -EACCES;
}

static int check_ptr_off_reg(struct bpf_verifier_env *env,
                             const struct bpf_reg_state *reg, int regno)
{
        return __check_ptr_off_reg(env, reg, regno, false);
}

/* Check that register 'regno' (state 'reg') holds a pointer whose type may be
 * stored into the map kptr slot described by 'kptr_field'.
 *
 * Returns 0 on a match, -EACCES when __check_ptr_off_reg() rejects the
 * register's offset, and -EINVAL (after logging the register's type and the
 * expected type) when the type does not match.
 */
static int map_kptr_match_type(struct bpf_verifier_env *env,
                               struct btf_field *kptr_field,
                               struct bpf_reg_state *reg, u32 regno)
{
        const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
        int perm_flags;
        /* Stays empty if we bail out before the register's BTF type name has
         * been looked up, so the log message then shows no register type name.
         */
        const char *reg_name = "";

        /* Build the set of type flags the register is permitted to carry. */
        if (btf_is_kernel(reg->btf)) {
                perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;

                /* Only unreferenced case accepts untrusted pointers */
                if (kptr_field->type == BPF_KPTR_UNREF)
                        perm_flags |= PTR_UNTRUSTED;
        } else {
                perm_flags = PTR_MAYBE_NULL | MEM_ALLOC;
                if (kptr_field->type == BPF_KPTR_PERCPU)
                        perm_flags |= MEM_PERCPU;
        }

        /* Reject anything that is not a PTR_TO_BTF_ID, or that carries a flag
         * outside the permitted set.
         */
        if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
                goto bad_type;

        /* We need to verify reg->type and reg->btf, before accessing reg->btf */
        reg_name = btf_type_name(reg->btf, reg->btf_id);

        /* For ref_ptr case, release function check should ensure we get one
         * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the
         * normal store of unreferenced kptr, we must ensure var_off is zero.
         * Since ref_ptr cannot be accessed directly by BPF insns, checks for
         * reg->off and reg->ref_obj_id are not needed here.
         */
        if (__check_ptr_off_reg(env, reg, regno, true))
                return -EACCES;

        /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and
         * we also need to take into account the reg->off.
         *
         * We want to support cases like:
         *
         * struct foo {
         *         struct bar br;
         *         struct baz bz;
         * };
         *
         * struct foo *v;
         * v = func();              // PTR_TO_BTF_ID
         * val->foo = v;      // reg->off is zero, btf and btf_id match type
         * val->bar = &v->br; // reg->off is still zero, but we need to retry with
         *                    // first member type of struct after comparison fails
         * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked
         *                    // to match type
         *
         * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off
         * is zero. We must also ensure that btf_struct_ids_match does not walk
         * the struct to match type against first member of struct, i.e. reject
         * second case from above. Hence, when type is BPF_KPTR_REF, we set
         * strict mode to true for type match.
         */
        if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
                                  kptr_field->kptr.btf, kptr_field->kptr.btf_id,
                                  kptr_field->type != BPF_KPTR_UNREF))
                goto bad_type;
        return 0;
bad_type:
        /* Log the register's type followed by the type(s) that were expected. */
        verbose(env, "invalid kptr access, R%d type=%s%s ", regno,
                reg_type_str(env, reg->type), reg_name);
        verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name);
        if (kptr_field->type == BPF_KPTR_UNREF)
                verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED),
                        targ_name);
        else
                verbose(env, "\n");
        return -EINVAL;
}

static bool in_sleepable(struct bpf_verifier_env *env)
{
        return env->prog->sleepable ||
               (env->cur_state && env->cur_state->in_sleepable);
}

/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
 * can dereference RCU protected pointers and result is PTR_TRUSTED.
 */
static bool in_rcu_cs(struct bpf_verifier_env *env)
{
        return env->cur_state->active_rcu_lock ||
               env->cur_state->active_lock.ptr ||
               !in_sleepable(env);
}

/* Set of struct BTF IDs that rcu_protected_object() looks up for kernel BTF.
 * The cgroup and bpf_cpumask entries are only present when CONFIG_CGROUPS and
 * CONFIG_BPF_JIT, respectively, are enabled.
 *
 * Once GCC supports btf_type_tag the following mechanism will be replaced
 * with tag check.
 */
BTF_SET_START(rcu_protected_types)
BTF_ID(struct, prog_test_ref_kfunc)
#ifdef CONFIG_CGROUPS
BTF_ID(struct, cgroup)
#endif
#ifdef CONFIG_BPF_JIT
BTF_ID(struct, bpf_cpumask)
#endif
BTF_ID(struct, task_struct)
BTF_ID(struct, bpf_crypto_ctx)
BTF_SET_END(rcu_protected_types)

/* Types from non-kernel BTF are always treated as RCU protected; kernel types
 * only when their ID is listed in rcu_protected_types.
 */
static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
{
        return !btf_is_kernel(btf) ||
               btf_id_set_contains(&rcu_protected_types, btf_id);
}

/* Return the btf_record of the struct a kptr field points to, or NULL when
 * the pointee's BTF is kernel BTF or no struct meta is found for it.
 */
static struct btf_record *kptr_pointee_btf_record(struct btf_field *kptr_field)
{
        struct btf_struct_meta *meta = NULL;

        if (!btf_is_kernel(kptr_field->kptr.btf))
                meta = btf_find_struct_meta(kptr_field->kptr.btf,
                                            kptr_field->kptr.btf_id);

        if (!meta)
                return NULL;

        return meta->record;
}

static bool rcu_safe_kptr(const struct btf_field *field)
{
        const struct btf_field_kptr *kptr = &field->kptr;

        return field->type == BPF_KPTR_PERCPU ||
               (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id));
}

/* Compute the type flags for a pointer loaded from the kptr field
 * 'kptr_field'.
 *
 * The result always contains PTR_MAYBE_NULL. If the field is RCU safe and we
 * are in an RCU critical section, MEM_RCU is added together with MEM_PERCPU
 * (percpu kptr) or MEM_ALLOC (pointee described by non-kernel BTF), plus
 * NON_OWN_REF when the pointee's record has a BPF_GRAPH_NODE field.
 * Otherwise the pointer is flagged PTR_UNTRUSTED.
 */
static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
{
        struct btf_record *rec;
        u32 flags = PTR_MAYBE_NULL;

        if (!rcu_safe_kptr(kptr_field) || !in_rcu_cs(env))
                return flags | PTR_UNTRUSTED;

        flags |= MEM_RCU;
        if (kptr_field->type == BPF_KPTR_PERCPU)
                flags |= MEM_PERCPU;
        else if (!btf_is_kernel(kptr_field->kptr.btf))
                flags |= MEM_ALLOC;

        rec = kptr_pointee_btf_record(kptr_field);
        if (rec && btf_record_has_field(rec, BPF_GRAPH_NODE))
                flags |= NON_OWN_REF;

        return flags;
}

/* Check a direct load/store instruction that targets a kptr field inside a
 * map value.
 *
 * 'regno' is the register holding the map value pointer (not inspected here),
 * 'value_regno' is the destination register for a load or the source
 * register for a BPF_STX store, 'insn_idx' selects the instruction and
 * 'kptr_field' describes the kptr slot being accessed.
 *
 * For a load, 'value_regno' is marked as PTR_TO_BTF_ID with flags chosen by
 * btf_ld_kptr_type(). For BPF_STX, the stored register must be NULL or match
 * the field's type. For BPF_ST, only an immediate of 0 is accepted.
 *
 * Returns 0 on success, -EACCES otherwise.
 */
static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
                                 int value_regno, int insn_idx,
                                 struct btf_field *kptr_field)
{
        struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
        int class = BPF_CLASS(insn->code);
        struct bpf_reg_state *val_reg;

        /* Things we already checked for in check_map_access and caller:
         *  - Reject cases where variable offset may touch kptr
         *  - size of access (must be BPF_DW)
         *  - tnum_is_const(reg->var_off)
         *  - kptr_field->offset == off + reg->var_off.value
         */
        /* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */
        if (BPF_MODE(insn->code) != BPF_MEM) {
                verbose(env, "kptr in map can only be accessed using BPF_MEM instruction mode\n");
                return -EACCES;
        }

        /* Referenced and percpu kptrs may only be loaded by an instruction,
         * never stored to directly. The flags of the loaded pointer are
         * decided by btf_ld_kptr_type().
         */
        if (class != BPF_LDX &&
            (kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_PERCPU)) {
                verbose(env, "store to referenced kptr disallowed\n");
                return -EACCES;
        }

        if (class == BPF_LDX) {
                /* We can simply mark the value_regno receiving the pointer
                 * value from map as PTR_TO_BTF_ID, with the correct type.
                 */
                mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
                                kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field));
        } else if (class == BPF_STX) {
                /* Storing NULL is always fine; anything else must type-match. */
                val_reg = reg_state(env, value_regno);
                if (!register_is_null(val_reg) &&
                    map_kptr_match_type(env, kptr_field, val_reg, value_regno))
                        return -EACCES;
        } else if (class == BPF_ST) {
                if (insn->imm) {
                        verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n",
                                kptr_field->offset);
                        return -EACCES;
                }
        } else {
                verbose(env, "kptr in map can only be accessed using BPF_LDX/BPF_STX/BPF_ST\n");
                return -EACCES;
        }
        return 0;
}

/* Check a read/write into a map element through register 'regno', whose
 * offset may be variable.
 *
 * The access must first pass check_mem_region_access() against the map's
 * value_size. If the map has a btf_record, any special field that the access
 * range may overlap is then examined: kptr fields may only be touched
 * directly (src == ACCESS_DIRECT), at a constant offset equal to the field
 * offset and with a BPF_DW sized access; overlapping any other field type is
 * rejected. Returns 0 on success, a negative error otherwise.
 */
static int check_map_access(struct bpf_verifier_env *env, u32 regno,
                            int off, int size, bool zero_size_allowed,
                            enum bpf_access_src src)
{
        struct bpf_verifier_state *cur = env->cur_state;
        struct bpf_func_state *frame = cur->frame[cur->curframe];
        struct bpf_reg_state *reg = &frame->regs[regno];
        struct bpf_map *map = reg->map_ptr;
        struct btf_record *rec;
        int ret, i;

        ret = check_mem_region_access(env, regno, off, size, map->value_size,
                                      zero_size_allowed);
        if (ret)
                return ret;

        rec = map->record;
        if (IS_ERR_OR_NULL(rec))
                return 0;

        for (i = 0; i < rec->cnt; i++) {
                struct btf_field *field = &rec->fields[i];
                u32 p = field->offset;

                /* [x1, x2) overlaps [y1, y2) iff x1 < y2 && y1 < x2. Fields
                 * that no part of the access can touch are skipped.
                 */
                if (reg->smin_value + off >= p + field->size ||
                    p >= reg->umax_value + off + size)
                        continue;

                switch (field->type) {
                case BPF_KPTR_UNREF:
                case BPF_KPTR_REF:
                case BPF_KPTR_PERCPU:
                        break;
                default:
                        verbose(env, "%s cannot be accessed directly by load/store\n",
                                btf_field_type_name(field->type));
                        return -EACCES;
                }

                /* Only kptr fields reach this point. */
                if (src != ACCESS_DIRECT) {
                        verbose(env, "kptr cannot be accessed indirectly by helper\n");
                        return -EACCES;
                }
                if (!tnum_is_const(reg->var_off)) {
                        verbose(env, "kptr access cannot have variable offset\n");
                        return -EACCES;
                }
                if (p != off + reg->var_off.value) {
                        verbose(env, "kptr access misaligned expected=%u off=%llu\n",
                                p, off + reg->var_off.value);
                        return -EACCES;
                }
                if (size != bpf_size_to_bytes(BPF_DW)) {
                        verbose(env, "kptr access size must be BPF_DW\n");
                        return -EACCES;
                }
        }

        return 0;
}

/* Upper bound on a packet offset; the overflow reasoning in
 * check_packet_access() refers to this value.
 */
#define MAX_PACKET_OFF 0xffff

/* Decide whether the program may access packet data directly with an access
 * of kind 't'.
 *
 * Program types fall into three groups: read-only, read + write, and
 * BPF_PROG_TYPE_CGROUP_SOCKOPT; every other type gets no direct access.
 * When 'meta' is non-NULL the answer for the first two groups is
 * meta->pkt_access. Otherwise env->seen_direct_write is set and true is
 * returned; note that this also happens for a read by a read-only type.
 */
static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
                                       const struct bpf_call_arg_meta *meta,
                                       enum bpf_access_type t)
{
        bool read_only = false;

        switch (resolve_prog_type(env->prog)) {
        /* Program types only with direct read access go here! */
        case BPF_PROG_TYPE_LWT_IN:
        case BPF_PROG_TYPE_LWT_OUT:
        case BPF_PROG_TYPE_LWT_SEG6LOCAL:
        case BPF_PROG_TYPE_SK_REUSEPORT:
        case BPF_PROG_TYPE_FLOW_DISSECTOR:
        case BPF_PROG_TYPE_CGROUP_SKB:
                read_only = true;
                break;

        /* Program types with direct read + write access go here! */
        case BPF_PROG_TYPE_SCHED_CLS:
        case BPF_PROG_TYPE_SCHED_ACT:
        case BPF_PROG_TYPE_XDP:
        case BPF_PROG_TYPE_LWT_XMIT:
        case BPF_PROG_TYPE_SK_SKB:
        case BPF_PROG_TYPE_SK_MSG:
                break;

        case BPF_PROG_TYPE_CGROUP_SOCKOPT:
                if (t == BPF_WRITE)
                        env->seen_direct_write = true;
                return true;

        default:
                return false;
        }

        if (read_only && t == BPF_WRITE)
                return false;

        if (meta)
                return meta->pkt_access;

        env->seen_direct_write = true;
        return true;
}

/* Check an access of 'size' bytes at fixed offset 'off' through the packet
 * pointer in register 'regno', and record the highest packet offset seen in
 * the program's aux data.
 *
 * Returns 0 on success; -EACCES if the register's minimum value is negative
 * or __check_mem_access() rejects the access; -EINVAL if reg->range is
 * negative.
 */
static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
                               int size, bool zero_size_allowed)
{
        struct bpf_reg_state *reg = &cur_regs(env)[regno];
        int err;

        /* We may have added a variable offset to the packet pointer; but any
         * reg->range we have comes after that.  We are only checking the fixed
         * offset.
         */

        /* We don't allow negative numbers, because we aren't tracking enough
         * detail to prove they're safe.
         */
        if (reg->smin_value < 0) {
                verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
                        regno);
                return -EACCES;
        }

        if (reg->range < 0)
                err = -EINVAL;
        else
                err = __check_mem_access(env, regno, off, size, reg->range,
                                         zero_size_allowed);
        if (err) {
                verbose(env, "R%d offset is outside of the packet\n", regno);
                return err;
        }

        /* __check_mem_access has made sure "off + size - 1" is within u16.
         * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
         * otherwise find_good_pkt_pointers would have refused to set range
         * info and __check_mem_access would have rejected this pkt access.
         * Therefore, "off + reg->umax_value + size - 1" won't overflow u32.
         */
        env->prog->aux->max_pkt_offset =
                max_t(u32, env->prog->aux->max_pkt_offset,
                      off + reg->umax_value + size - 1);

        return 0;
}

/* Check access to 'struct bpf_context' fields.  Supports fixed offsets only.
 *
 * The decision is delegated to the program type's is_valid_access()
 * callback. On success '*reg_type' is updated from the callback's result;
 * for PTR_TO_BTF_ID results '*btf' and '*btf_id' are filled in, otherwise the
 * instruction's ctx_field_size is recorded. The program's max_ctx_offset is
 * raised to off + size if that is larger.
 *
 * Returns 0 on success, -EACCES if there is no callback or it rejects the
 * access.
 */
static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
                            enum bpf_access_type t, enum bpf_reg_type *reg_type,
                            struct btf **btf, u32 *btf_id)
{
        struct bpf_insn_access_aux info = {
                .reg_type = *reg_type,
                .log = &env->log,
        };

        if (!env->ops->is_valid_access ||
            !env->ops->is_valid_access(off, size, t, env->prog, &info)) {
                verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size);
                return -EACCES;
        }

        /* A non zero info.ctx_field_size indicates that this field is a
         * candidate for later verifier transformation to load the whole
         * field and then apply a mask when accessed with a narrower
         * access than actual ctx access size. A zero info.ctx_field_size
         * will only allow for whole field access and rejects any other
         * type of narrower access.
         */
        *reg_type = info.reg_type;

        if (base_type(*reg_type) != PTR_TO_BTF_ID) {
                env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
        } else {
                *btf = info.btf;
                *btf_id = info.btf_id;
        }

        /* remember the offset of last byte accessed in ctx */
        if (env->prog->aux->max_ctx_offset < off + size)
                env->prog->aux->max_ctx_offset = off + size;

        return 0;
}

/* Check that [off, off + size) lies within 'struct bpf_flow_keys'.
 * Returns 0 if so, -EACCES (after logging) otherwise.
 */
static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
                                  int size)
{
        /* The sum is done in 64 bits so that it cannot wrap. */
        if (size >= 0 && off >= 0 &&
            (u64)off + size <= sizeof(struct bpf_flow_keys))
                return 0;

        verbose(env, "invalid access to flow keys off=%d size=%d\n",
                off, size);
        return -EACCES;
}

/* Check an access of 'size' bytes at offset 'off' through the socket pointer
 * in register 'regno'.
 *
 * The register's minimum value must not be negative. Validity is then decided
 * by the *_is_valid_access() helper matching the register's socket pointer
 * type; any other register type is invalid. On success the instruction's
 * ctx_field_size is recorded and 0 is returned, otherwise -EACCES.
 */
static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
                             u32 regno, int off, int size,
                             enum bpf_access_type t)
{
        struct bpf_reg_state *reg = &cur_regs(env)[regno];
        struct bpf_insn_access_aux info = {};
        bool valid;

        if (reg->smin_value < 0) {
                verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
                        regno);
                return -EACCES;
        }

        switch (reg->type) {
        case PTR_TO_SOCK_COMMON:
                valid = bpf_sock_common_is_valid_access(off, size, t, &info);
                break;
        case PTR_TO_SOCKET:
                valid = bpf_sock_is_valid_access(off, size, t, &info);
                break;
        case PTR_TO_TCP_SOCK:
                valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
                break;
        case PTR_TO_XDP_SOCK:
                valid = bpf_xdp_sock_is_valid_access(off, size, t, &info);
                break;
        default:
                valid = false;
                break;
        }

        if (!valid) {
                verbose(env, "R%d invalid %s access off=%d size=%d\n",
                        regno, reg_type_str(env, reg->type), off, size);
                return -EACCES;
        }

        env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
        return 0;
}

/* Apply __is_pointer_value() to register 'regno', using the environment's
 * allow_ptr_leaks setting.
 */
static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
{
        struct bpf_reg_state *reg = reg_state(env, regno);

        return __is_pointer_value(env->allow_ptr_leaks, reg);
}

static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
{
        const struct bpf_reg_state *reg = reg_state(env, regno);

        return reg->type == PTR_TO_CTX;
}

static bool is_sk_reg(struct bpf_verifier_env *env, int regno)
{
        const struct bpf_reg_state *reg = reg_state(env, regno);

        return type_is_sk_pointer(reg->type);
}

static bool is_pkt_reg(struct bpf_verifier_env *env, int regno)
{
        const struct bpf_reg_state *reg = reg_state(env, regno);

        return type_is_pkt_pointer(reg->type);
}

static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
{
        const struct bpf_reg_state *reg = reg_state(env, regno);

        /* Separate to is_ctx_reg() since we still want to allow BPF_ST here. */
        return reg->type == PTR_TO_FLOW_KEYS;
}

static bool is_arena_reg(struct bpf_verifier_env *env, int regno)
{
        const struct bpf_reg_state *reg = reg_state(env, regno);

        return reg->type == PTR_TO_ARENA;
}

/* Table indexed by register base type, giving a pointer to a BTF ID for that
 * type. Entries not listed are NULL. is_trusted_reg() consults it: a type
 * with an entry here is considered trusted when it carries no unsafe
 * modifiers. The socket entries exist only when CONFIG_NET is enabled.
 */
static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
#ifdef CONFIG_NET
        [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
        [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
        [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
#endif
        [CONST_PTR_TO_MAP] = btf_bpf_map_id,
};

static bool is_trusted_reg(const struct bpf_reg_state *reg)
{
        /* A referenced register is always trusted. */
        if (reg->ref_obj_id)
                return true;

        /* Types listed in the reg2btf_ids are always trusted */
        if (reg2btf_ids[base_type(reg->type)] &&
            !bpf_type_has_unsafe_modifiers(reg->type))
                return true;

        /* If a register is not referenced, it is trusted if it has the
         * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the
         * other type modifiers may be safe, but we elect to take an opt-in
         * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are
         * not.
         *
         * Eventually, we should make PTR_TRUSTED the single source of truth
         * for whether a register is trusted.
         */
        return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS &&
               !bpf_type_has_unsafe_modifiers(reg->type);
}

/* True when the MEM_RCU flag is set in @reg's type. */
static bool is_rcu_reg(const struct bpf_reg_state *reg)
{
        return (reg->type & MEM_RCU) != 0;
}

/* Strip the trusted modifiers and MEM_RCU from *@flag, leaving every other
 * bit as it was.
 */
static void clear_trusted_flags(enum bpf_type_flag *flag)
{
        *flag = *flag & ~(MEM_RCU | BPF_REG_TRUSTED_MODIFIERS);
}

/* Alignment check for packet (and packet metadata) pointers.
 *
 * Returns 0 when the access is acceptable, -EACCES (after logging) when the
 * combined offset is not aligned to @size.  Nothing is checked unless
 * @strict is set, and single-byte accesses always pass.
 */
static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
                                   const struct bpf_reg_state *reg,
                                   int off, int size, bool strict)
{
        /* For platforms that do not have a Kconfig enabling
         * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of
         * NET_IP_ALIGN is universally set to '2'.  And on platforms
         * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get
         * to this code only in strict mode where we want to emulate
         * the NET_IP_ALIGN==2 checking.  Therefore use an
         * unconditional IP align value of '2'.
         */
        const int ip_align = 2;
        struct tnum total_off;
        char tn_buf[48];

        /* Byte size accesses are always allowed. */
        if (size == 1 || !strict)
                return 0;

        total_off = tnum_add(reg->var_off,
                             tnum_const(ip_align + reg->off + off));
        if (tnum_is_aligned(total_off, size))
                return 0;

        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
        verbose(env,
                "misaligned packet access off %d+%s+%d+%d size %d\n",
                ip_align, tn_buf, reg->off, off, size);
        return -EACCES;
}

/* Alignment check for non-packet pointers.
 *
 * @pointer_desc is spliced into the log message (e.g. "stack ", "value ").
 * Returns 0 when the access is acceptable, -EACCES (after logging) when
 * var_off + reg->off + @off is not aligned to @size.  Nothing is checked
 * unless @strict is set, and single-byte accesses always pass.
 */
static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
                                       const struct bpf_reg_state *reg,
                                       const char *pointer_desc,
                                       int off, int size, bool strict)
{
        struct tnum total_off;
        char tn_buf[48];

        /* Byte size accesses are always allowed. */
        if (size == 1 || !strict)
                return 0;

        total_off = tnum_add(reg->var_off, tnum_const(reg->off + off));
        if (tnum_is_aligned(total_off, size))
                return 0;

        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
        verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
                pointer_desc, tn_buf, reg->off, off, size);
        return -EACCES;
}

/* Pick the right alignment check for an access of @size bytes at @off
 * through @reg, based on the register's pointer type.
 *
 * Strict checking is applied when either env->strict_alignment or
 * @strict_alignment_once is set, and always for stack pointers.  Arena
 * pointers are never checked.  Returns 0 or -EACCES.
 */
static int check_ptr_alignment(struct bpf_verifier_env *env,
                               const struct bpf_reg_state *reg, int off,
                               int size, bool strict_alignment_once)
{
        bool strict = strict_alignment_once || env->strict_alignment;
        const char *pointer_desc;

        switch (reg->type) {
        case PTR_TO_ARENA:
                return 0;
        case PTR_TO_PACKET:
        case PTR_TO_PACKET_META:
                /* Special case, because of NET_IP_ALIGN. Given metadata sits
                 * right in front, treat it the very same way.
                 */
                return check_pkt_ptr_alignment(env, reg, off, size, strict);
        case PTR_TO_STACK:
                /* The stack spill tracking logic in check_stack_write_fixed_off()
                 * and check_stack_read_fixed_off() relies on stack accesses being
                 * aligned.
                 */
                strict = true;
                pointer_desc = "stack ";
                break;
        case PTR_TO_CTX:
                pointer_desc = "context ";
                break;
        case PTR_TO_MAP_KEY:
                pointer_desc = "key ";
                break;
        case PTR_TO_MAP_VALUE:
                pointer_desc = "value ";
                break;
        case PTR_TO_FLOW_KEYS:
                pointer_desc = "flow keys ";
                break;
        case PTR_TO_SOCKET:
                pointer_desc = "sock ";
                break;
        case PTR_TO_SOCK_COMMON:
                pointer_desc = "sock_common ";
                break;
        case PTR_TO_TCP_SOCK:
                pointer_desc = "tcp_sock ";
                break;
        case PTR_TO_XDP_SOCK:
                pointer_desc = "xdp_sock ";
                break;
        default:
                pointer_desc = "";
                break;
        }

        return check_generic_ptr_alignment(env, reg, pointer_desc, off, size,
                                           strict);
}

/* Round @stack_depth up to the granularity used for stack accounting:
 * 16 bytes when the program requested JIT, otherwise 32 bytes (the
 * granularity of the interpreter stack size) with a minimum of 32.
 */
static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
{
        if (!env->prog->jit_requested)
                return round_up(max_t(u32, stack_depth, 1), 32);

        return round_up(stack_depth, 16);
}

/* Starting from subprog @idx, walk all instructions of the function
 * and recursively walk all callees that given function can call.
 * Ignore jump and exit insns.
 * Since recursion is prevented by check_cfg() this algorithm
 * only needs a local stack of MAX_CALL_FRAMES to remember callsites
 *
 * Returns 0 when every call chain is acceptable.  Errors:
 *   -EACCES  combined stack depth exceeds MAX_BPF_STACK, or a subprog with
 *            a tail call is entered with >= 256 bytes of caller stack
 *   -E2BIG   the call chain reaches MAX_CALL_FRAMES
 *   -EINVAL  bpf_throw reachable from a callback subprog, a direct call to
 *            an exception callback, or a tail call within an exception cb
 *   -EFAULT  internal inconsistency (no subprog at the call target, or an
 *            async callback that has a tail call)
 *
 * Side effects: sets tail_call_reachable on the subprogs of the current
 * call chain (and on env->prog->aux when subprog 0 is marked).
 */
static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
{
        struct bpf_subprog_info *subprog = env->subprog_info;
        struct bpf_insn *insn = env->prog->insnsi;
        int depth = 0, frame = 0, i, subprog_end;
        bool tail_call_reachable = false;
        /* explicit call stack: insn to resume at and subprog to resume in */
        int ret_insn[MAX_CALL_FRAMES];
        int ret_prog[MAX_CALL_FRAMES];
        int j;

        i = subprog[idx].start;
process_func:
        /* protect against potential stack overflow that might happen when
         * bpf2bpf calls get combined with tailcalls. Limit the caller's stack
         * depth for such case down to 256 so that the worst case scenario
         * would result in 8k stack size (32 which is tailcall limit * 256 =
         * 8k).
         *
         * To get the idea what might happen, see an example:
         * func1 -> sub rsp, 128
         *  subfunc1 -> sub rsp, 256
         *  tailcall1 -> add rsp, 256
         *   func2 -> sub rsp, 192 (total stack size = 128 + 192 = 320)
         *   subfunc2 -> sub rsp, 64
         *   subfunc22 -> sub rsp, 128
         *   tailcall2 -> add rsp, 128
         *    func3 -> sub rsp, 32 (total stack size 128 + 192 + 64 + 32 = 416)
         *
         * tailcall will unwind the current stack frame but it will not get rid
         * of caller's stack as shown on the example above.
         */
        if (idx && subprog[idx].has_tail_call && depth >= 256) {
                verbose(env,
                        "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n",
                        depth);
                return -EACCES;
        }
        /* account for the (rounded-up) frame of the subprog just entered */
        depth += round_up_stack_depth(env, subprog[idx].stack_depth);
        if (depth > MAX_BPF_STACK) {
                verbose(env, "combined stack size of %d calls is %d. Too large\n",
                        frame + 1, depth);
                return -EACCES;
        }
continue_func:
        /* the current subprog ends where the next one starts */
        subprog_end = subprog[idx + 1].start;
        for (; i < subprog_end; i++) {
                int next_insn, sidx;

                if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) {
                        bool err = false;

                        /* of these kfunc calls only bpf_throw is checked:
                         * it is rejected when the current subprog, or any
                         * subprog on the current call chain, is a callback
                         */
                        if (!is_bpf_throw_kfunc(insn + i))
                                continue;
                        if (subprog[idx].is_cb)
                                err = true;
                        for (int c = 0; c < frame && !err; c++) {
                                if (subprog[ret_prog[c]].is_cb) {
                                        err = true;
                                        break;
                                }
                        }
                        if (!err)
                                continue;
                        verbose(env,
                                "bpf_throw kfunc (insn %d) cannot be called from callback subprog %d\n",
                                i, idx);
                        return -EINVAL;
                }

                if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
                        continue;
                /* remember insn and function to return to */
                ret_insn[frame] = i + 1;
                ret_prog[frame] = idx;

                /* find the callee */
                next_insn = i + insn[i].imm + 1;
                sidx = find_subprog(env, next_insn);
                if (sidx < 0) {
                        WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
                                  next_insn);
                        return -EFAULT;
                }
                if (subprog[sidx].is_async_cb) {
                        if (subprog[sidx].has_tail_call) {
                                verbose(env, "verifier bug. subprog has tail_call and async cb\n");
                                return -EFAULT;
                        }
                        /* async callbacks don't increase bpf prog stack size unless called directly */
                        if (!bpf_pseudo_call(insn + i))
                                continue;
                        if (subprog[sidx].is_exception_cb) {
                                verbose(env, "insn %d cannot call exception cb directly\n", i);
                                return -EINVAL;
                        }
                }
                /* descend into the callee */
                i = next_insn;
                idx = sidx;

                if (subprog[idx].has_tail_call)
                        tail_call_reachable = true;

                frame++;
                if (frame >= MAX_CALL_FRAMES) {
                        verbose(env, "the call stack of %d frames is too deep !\n",
                                frame);
                        return -E2BIG;
                }
                goto process_func;
        }
        /* if tail call got detected across bpf2bpf calls then mark each of the
         * currently present subprog frames as tail call reachable subprogs;
         * this info will be utilized by JIT so that we will be preserving the
         * tail call counter throughout bpf2bpf calls combined with tailcalls
         */
        if (tail_call_reachable)
                for (j = 0; j < frame; j++) {
                        if (subprog[ret_prog[j]].is_exception_cb) {
                                verbose(env, "cannot tail call within exception cb\n");
                                return -EINVAL;
                        }
                        subprog[ret_prog[j]].tail_call_reachable = true;
                }
        if (subprog[0].tail_call_reachable)
                env->prog->aux->tail_call_reachable = true;

        /* end of for() loop means the last insn of the 'subprog'
         * was reached. Doesn't matter whether it was JA or EXIT
         */
        if (frame == 0)
                return 0;
        /* pop this frame: give back its stack and resume in the caller
         * right after the call insn
         */
        depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
        frame--;
        i = ret_insn[frame];
        idx = ret_prog[frame];
        goto continue_func;
}

/* Run the stack depth walk from every entry point: subprog 0 and every
 * subprog flagged as an async callback.  Stops at and returns the first
 * negative error; returns 0 otherwise.
 */
static int check_max_stack_depth(struct bpf_verifier_env *env)
{
        struct bpf_subprog_info *si = env->subprog_info;
        int i, err;

        for (i = 0; i < env->subprog_cnt; i++) {
                /* other subprogs are not treated as entry points here */
                if (i && !si[i].is_async_cb)
                        continue;

                err = check_max_stack_depth_subprog(env, i);
                if (err < 0)
                        return err;
        }
        return 0;
}

#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/* Return the stack depth of the subprog targeted by the call @insn located
 * at instruction index @idx, or -EFAULT (with a one-time warning) when no
 * subprog starts at the computed target.
 */
static int get_callee_stack_depth(struct bpf_verifier_env *env,
                                  const struct bpf_insn *insn, int idx)
{
        int target = idx + insn->imm + 1;
        int sidx = find_subprog(env, target);

        if (sidx >= 0)
                return env->subprog_info[sidx].stack_depth;

        WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
                  target);
        return -EFAULT;
}
#endif

/* Common sanity checks for buffer accesses through register @regno.
 *
 * Rejects (-EACCES, after logging) a negative @off and any register whose
 * var_off is not the constant zero.  @buf_info only labels the log message.
 * Returns 0 when both checks pass.
 */
static int __check_buffer_access(struct bpf_verifier_env *env,
                                 const char *buf_info,
                                 const struct bpf_reg_state *reg,
                                 int regno, int off, int size)
{
        char tn_buf[48];

        if (off < 0) {
                verbose(env,
                        "R%d invalid %s buffer access: off=%d, size=%d\n",
                        regno, buf_info, off, size);
                return -EACCES;
        }

        /* only a known-zero variable offset is acceptable */
        if (tnum_is_const(reg->var_off) && !reg->var_off.value)
                return 0;

        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
        verbose(env,
                "R%d invalid variable buffer offset: off=%d, var_off=%s\n",
                regno, off, tn_buf);
        return -EACCES;
}

/* Validate a tracepoint buffer access and, on success, grow
 * env->prog->aux->max_tp_access so that it covers @off + @size.
 * Returns 0 or the error from __check_buffer_access().
 */
static int check_tp_buffer_access(struct bpf_verifier_env *env,
                                  const struct bpf_reg_state *reg,
                                  int regno, int off, int size)
{
        int err, end;

        err = __check_buffer_access(env, "tracepoint", reg, regno, off, size);
        if (err)
                return err;

        end = off + size;
        if (end > env->prog->aux->max_tp_access)
                env->prog->aux->max_tp_access = end;

        return 0;
}

/* Validate a read-only or read-write buffer access and, on success, grow
 * *@max_access so that it covers @off + @size.
 *
 * The log label is "rdonly" or "rdwr" depending on type_is_rdonly_mem().
 * @zero_size_allowed is accepted for the callers' benefit but is not
 * consulted here.  Returns 0 or the error from __check_buffer_access().
 */
static int check_buffer_access(struct bpf_verifier_env *env,
                               const struct bpf_reg_state *reg,
                               int regno, int off, int size,
                               bool zero_size_allowed,
                               u32 *max_access)
{
        const char *buf_info = "rdwr";
        int err, end;

        if (type_is_rdonly_mem(reg->type))
                buf_info = "rdonly";

        err = __check_buffer_access(env, buf_info, reg, regno, off, size);
        if (err)
                return err;

        end = off + size;
        if (end > *max_access)
                *max_access = end;

        return 0;
}

/* BPF architecture zero extends alu32 ops into 64-bit registesr */
static void zext_32_to_64(struct bpf_reg_state *reg)
{
        reg->var_off = tnum_subreg(reg->var_off);
        __reg_assign_32_into_64(reg);
}

/* truncate register to smaller size (in bytes)
 * must be called with size < BPF_REG_SIZE
 */
static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
{
        u64 mask;

        /* clear high bits in bit representation */
        reg->var_off = tnum_cast(reg->var_off, size);

        /* fix arithmetic bounds */
        mask = ((u64)1 << (size * 8)) - 1;
        if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
                reg->umin_value &= mask;
                reg->umax_value &= mask;
        } else {
                reg->umin_value = 0;
                reg->umax_value = mask;
        }
        reg->smin_value = reg->umin_value;
        reg->smax_value = reg->umax_value;

        /* If size is smaller than 32bit register the 32bit register
         * values are also truncated so we push 64-bit bounds into
         * 32-bit bounds. Above were truncated < 32-bits already.
         */
        if (size < 4)
                __mark_reg32_unbounded(reg);

        reg_bounds_sync(reg);
}

static void set_sext64_default_val(struct bpf_reg_state *reg, int size)
{
        if (size == 1) {
                reg->smin_value = reg->s32_min_value = S8_MIN;
                reg->smax_value = reg->s32_max_value = S8_MAX;
        } else if (size == 2) {
                reg->smin_value = reg->s32_min_value = S16_MIN;
                reg->smax_value = reg->s32_max_value = S16_MAX;
        } else {
                /* size == 4 */
                reg->smin_value = reg->s32_min_value = S32_MIN;
                reg->smax_value = reg->s32_max_value = S32_MAX;
        }
        reg->umin_value = reg->u32_min_value = 0;
        reg->umax_value = U64_MAX;
        reg->u32_max_value = U32_MAX;
        reg->var_off = tnum_unknown;
}

/* Sign-extend the low @size bytes (1, 2 or 4) of @reg to 64 bits and update
 * var_off together with the signed/unsigned 64-bit and 32-bit bounds.
 *
 * - A constant register is folded to the sign-extended constant.
 * - If smin and smax agree in all bits above the source width and their
 *   sign-extended values have the same sign, the bounds are narrowed to
 *   that range.
 * - Otherwise the register falls back to set_sext64_default_val().
 */
static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size)
{
        s64 init_s64_max, init_s64_min, s64_max, s64_min, u64_cval;
        u64 top_smax_value, top_smin_value;
        u64 num_bits = size * 8;

        if (tnum_is_const(reg->var_off)) {
                u64_cval = reg->var_off.value;
                if (size == 1)
                        reg->var_off = tnum_const((s8)u64_cval);
                else if (size == 2)
                        reg->var_off = tnum_const((s16)u64_cval);
                else
                        /* size == 4 */
                        reg->var_off = tnum_const((s32)u64_cval);

                u64_cval = reg->var_off.value;
                reg->smax_value = reg->smin_value = u64_cval;
                reg->umax_value = reg->umin_value = u64_cval;
                reg->s32_max_value = reg->s32_min_value = u64_cval;
                reg->u32_max_value = reg->u32_min_value = u64_cval;
                return;
        }

        /* bits above the source width must match for the range to survive */
        top_smax_value = ((u64)reg->smax_value >> num_bits) << num_bits;
        top_smin_value = ((u64)reg->smin_value >> num_bits) << num_bits;

        if (top_smax_value != top_smin_value)
                goto out;

        /* find the s64_max and s64_min after sign extension */
        if (size == 1) {
                init_s64_max = (s8)reg->smax_value;
                init_s64_min = (s8)reg->smin_value;
        } else if (size == 2) {
                init_s64_max = (s16)reg->smax_value;
                init_s64_min = (s16)reg->smin_value;
        } else {
                init_s64_max = (s32)reg->smax_value;
                init_s64_min = (s32)reg->smin_value;
        }

        s64_max = max(init_s64_max, init_s64_min);
        s64_min = min(init_s64_max, init_s64_min);

        /* both of s64_max/s64_min positive or negative */
        if ((s64_max >= 0) == (s64_min >= 0)) {
                /* Write the 64-bit field first in each chain.  A chained
                 * assignment takes its value from the field just written,
                 * so going through the 32-bit field first would truncate a
                 * negative bound and then zero-extend it into umin/umax.
                 */
                reg->s32_min_value = reg->smin_value = s64_min;
                reg->s32_max_value = reg->smax_value = s64_max;
                reg->u32_min_value = reg->umin_value = s64_min;
                reg->u32_max_value = reg->umax_value = s64_max;
                reg->var_off = tnum_range(s64_min, s64_max);
                return;
        }

out:
        set_sext64_default_val(reg, size);
}

static void set_sext32_default_val(struct bpf_reg_state *reg, int size)
{
        if (size == 1) {
                reg->s32_min_value = S8_MIN;
                reg->s32_max_value = S8_MAX;
        } else {
                /* size == 2 */
                reg->s32_min_value = S16_MIN;
                reg->s32_max_value = S16_MAX;
        }
        reg->u32_min_value = 0;
        reg->u32_max_value = U32_MAX;
}

/* Sign-extend the low @size bytes (1 or 2) of @reg within its 32-bit
 * subregister and update var_off and the 32-bit bounds.
 *
 * - A constant register is folded to the sign-extended constant.
 * - If s32_min and s32_max agree in all bits above the source width and
 *   their sign-extended values have the same sign, the 32-bit bounds and
 *   var_off are narrowed to that range.
 * - Otherwise the register falls back to set_sext32_default_val().
 */
static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size)
{
        s32 init_s32_max, init_s32_min, s32_max, s32_min, u32_val;
        u32 top_smax_value, top_smin_value;
        u32 num_bits = size * 8;

        if (tnum_is_const(reg->var_off)) {
                u32_val = reg->var_off.value;
                if (size == 1)
                        reg->var_off = tnum_const((s8)u32_val);
                else
                        reg->var_off = tnum_const((s16)u32_val);

                u32_val = reg->var_off.value;
                reg->s32_min_value = reg->s32_max_value = u32_val;
                reg->u32_min_value = reg->u32_max_value = u32_val;
                return;
        }

        /* bits above the source width must match for the range to survive */
        top_smax_value = ((u32)reg->s32_max_value >> num_bits) << num_bits;
        top_smin_value = ((u32)reg->s32_min_value >> num_bits) << num_bits;

        if (top_smax_value != top_smin_value)
                goto out;

        /* find the s32_max and s32_min after sign extension */
        if (size == 1) {
                init_s32_max = (s8)reg->s32_max_value;
                init_s32_min = (s8)reg->s32_min_value;
        } else {
                /* size == 2 */
                init_s32_max = (s16)reg->s32_max_value;
                init_s32_min = (s16)reg->s32_min_value;
        }
        s32_max = max(init_s32_max, init_s32_min);
        s32_min = min(init_s32_max, init_s32_min);

        if ((s32_min >= 0) == (s32_max >= 0)) {
                reg->s32_min_value = s32_min;
                reg->s32_max_value = s32_max;
                reg->u32_min_value = (u32)s32_min;
                reg->u32_max_value = (u32)s32_max;
                /* Keep the tnum consistent with the new bounds; without
                 * this it would still describe the pre-extension value.
                 */
                reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max));
                return;
        }

out:
        set_sext32_default_val(reg, size);
}

static bool bpf_map_is_rdonly(const struct bpf_map *map)
{
        /* A map is considered read-only if the following condition are true:
         *
         * 1) BPF program side cannot change any of the map content. The
         *    BPF_F_RDONLY_PROG flag is throughout the lifetime of a map
         *    and was set at map creation time.
         * 2) The map value(s) have been initialized from user space by a
         *    loader and then "frozen", such that no new map update/delete
         *    operations from syscall side are possible for the rest of
         *    the map's lifetime from that point onwards.
         * 3) Any parallel/pending map update/delete operations from syscall
         *    side have been completed. Only after that point, it's safe to
         *    assume that map value(s) are immutable.
         */
        return (map->map_flags & BPF_F_RDONLY_PROG) &&
               READ_ONCE(map->frozen) &&
               !bpf_map_write_active(map);
}

/* Read @size bytes at offset @off of @map's directly addressable value and
 * store the result in *@val, sign-extended when @is_ldsx is set and
 * zero-extended otherwise.
 *
 * Returns 0 on success, the error from ->map_direct_value_addr(), or
 * -EINVAL when @size is not 1, 2, 4 or 8.
 */
static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val,
                               bool is_ldsx)
{
        void *ptr;
        u64 addr;
        int err;

        err = map->ops->map_direct_value_addr(map, &addr, off);
        if (err)
                return err;
        ptr = (void *)(long)addr + off;

        switch (size) {
        case sizeof(u8):
                if (is_ldsx)
                        *val = (s64)*(s8 *)ptr;
                else
                        *val = (u64)*(u8 *)ptr;
                break;
        case sizeof(u16):
                if (is_ldsx)
                        *val = (s64)*(s16 *)ptr;
                else
                        *val = (u64)*(u16 *)ptr;
                break;
        case sizeof(u32):
                if (is_ldsx)
                        *val = (s64)*(s32 *)ptr;
                else
                        *val = (u64)*(u32 *)ptr;
                break;
        case sizeof(u64):
                /* full width: nothing to extend */
                *val = *(u64 *)ptr;
                break;
        default:
                return -EINVAL;
        }
        return 0;
}

/* Build the name of a companion type by pasting a suffix onto @__type.
 * The same suffixes ("__safe_rcu", "__safe_rcu_or_null", "__safe_trusted",
 * "__safe_trusted_or_null") are passed as strings to
 * btf_nested_type_is_trusted() by the type_is_*() helpers below.
 */
#define BTF_TYPE_SAFE_RCU(__type)  __PASTE(__type, __safe_rcu)
#define BTF_TYPE_SAFE_RCU_OR_NULL(__type)  __PASTE(__type, __safe_rcu_or_null)
#define BTF_TYPE_SAFE_TRUSTED(__type)  __PASTE(__type, __safe_trusted)
#define BTF_TYPE_SAFE_TRUSTED_OR_NULL(__type)  __PASTE(__type, __safe_trusted_or_null)

/*
 * Allow-list a few fields as RCU trusted or fully trusted.
 * Each struct below names the fields of the corresponding kernel type that
 * fall into that category.
 * This logic doesn't allow mixed tagging and will be removed once GCC
 * supports btf_type_tag.
 */

/* RCU trusted: these fields are trusted in RCU CS and never NULL */
BTF_TYPE_SAFE_RCU(struct task_struct) {
        const cpumask_t *cpus_ptr;
        struct css_set __rcu *cgroups;
        struct task_struct __rcu *real_parent;
        struct task_struct *group_leader;
};

BTF_TYPE_SAFE_RCU(struct cgroup) {
        /* cgrp->kn is always accessible as documented in kernel/cgroup/cgroup.c */
        struct kernfs_node *kn;
};

BTF_TYPE_SAFE_RCU(struct css_set) {
        struct cgroup *dfl_cgrp;
};

/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
        struct file __rcu *exe_file;
};

/* skb->sk, req->sk are not RCU protected, but we mark them as such
 * because bpf prog accessible sockets are SOCK_RCU_FREE.
 */
BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff) {
        struct sock *sk;
};

BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock) {
        struct sock *sk;
};

/* full trusted: these fields are trusted even outside of RCU CS and never NULL */
BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) {
        struct seq_file *seq;
};

BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) {
        struct bpf_iter_meta *meta;
        struct task_struct *task;
};

BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) {
        struct file *file;
};

BTF_TYPE_SAFE_TRUSTED(struct file) {
        struct inode *f_inode;
};

BTF_TYPE_SAFE_TRUSTED(struct dentry) {
        /* no negative dentry-s in places where bpf can see it */
        struct inode *d_inode;
};

/* full trusted: these fields are trusted even outside of RCU CS but can be NULL */
BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) {
        struct sock *sk;
};

/* Ask btf_nested_type_is_trusted() whether @field_name of the type behind
 * @reg is listed in the matching "__safe_rcu" companion type.  The
 * BTF_TYPE_EMIT() lines make sure those companion types are emitted.
 */
static bool type_is_rcu(struct bpf_verifier_env *env,
                        struct bpf_reg_state *reg,
                        const char *field_name, u32 btf_id)
{
        const char *suffix = "__safe_rcu";

        BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
        BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup));
        BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));

        return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
                                          suffix);
}

/* Ask btf_nested_type_is_trusted() whether @field_name of the type behind
 * @reg is listed in the matching "__safe_rcu_or_null" companion type.  The
 * BTF_TYPE_EMIT() lines make sure those companion types are emitted.
 */
static bool type_is_rcu_or_null(struct bpf_verifier_env *env,
                                struct bpf_reg_state *reg,
                                const char *field_name, u32 btf_id)
{
        const char *suffix = "__safe_rcu_or_null";

        BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct));
        BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff));
        BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock));

        return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
                                          suffix);
}

/* Ask btf_nested_type_is_trusted() whether @field_name of the type behind
 * @reg is listed in the matching "__safe_trusted" companion type.  The
 * BTF_TYPE_EMIT() lines make sure those companion types are emitted.
 */
static bool type_is_trusted(struct bpf_verifier_env *env,
                            struct bpf_reg_state *reg,
                            const char *field_name, u32 btf_id)
{
        const char *suffix = "__safe_trusted";

        BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta));
        BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task));
        BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm));
        BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file));
        BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));

        return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
                                          suffix);
}

/* Ask btf_nested_type_is_trusted() whether @field_name of the type behind
 * @reg is listed in the matching "__safe_trusted_or_null" companion type.
 * The BTF_TYPE_EMIT() line makes sure that companion type is emitted.
 */
static bool type_is_trusted_or_null(struct bpf_verifier_env *env,
                                    struct bpf_reg_state *reg,
                                    const char *field_name, u32 btf_id)
{
        const char *suffix = "__safe_trusted_or_null";

        BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket));

        return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
                                          suffix);
}

/* Validate an access of @size bytes at @off through register @regno, which
 * points to an object described by BTF (reg->btf / reg->btf_id).
 *
 * Rejected up front:
 *   -EPERM   env->allow_ptr_leaks is not set
 *   -EINVAL  non-GPL-compatible program touching kernel BTF
 *   -EACCES  negative @off, non-zero or variable var_off, MEM_USER or
 *            MEM_PERCPU pointers, or a non-read access where only reads
 *            are supported
 *   -EFAULT  internal inconsistencies
 *
 * The field lookup is delegated to env->ops->btf_struct_access() for
 * writes to non-allocated objects when the program type provides that
 * callback, and to btf_struct_access() otherwise.  When the lookup yields
 * PTR_TO_BTF_ID, the type flags of the loaded pointer are derived from how
 * much the base register is trusted (see the chain below).
 *
 * On a BPF_READ with @value_regno >= 0 the destination register is set up
 * through mark_btf_ld_reg().  Returns 0 on success or a negative errno.
 */
static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
                                   struct bpf_reg_state *regs,
                                   int regno, int off, int size,
                                   enum bpf_access_type atype,
                                   int value_regno)
{
        struct bpf_reg_state *reg = regs + regno;
        const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
        const char *tname = btf_name_by_offset(reg->btf, t->name_off);
        const char *field_name = NULL;
        enum bpf_type_flag flag = 0;
        u32 btf_id = 0;
        int ret;

        if (!env->allow_ptr_leaks) {
                verbose(env,
                        "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
                        tname);
                return -EPERM;
        }
        if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) {
                verbose(env,
                        "Cannot access kernel 'struct %s' from non-GPL compatible program\n",
                        tname);
                return -EINVAL;
        }
        if (off < 0) {
                verbose(env,
                        "R%d is ptr_%s invalid negative access: off=%d\n",
                        regno, tname, off);
                return -EACCES;
        }
        /* only a known-zero variable offset is acceptable */
        if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
                char tn_buf[48];

                tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
                verbose(env,
                        "R%d is ptr_%s invalid variable offset: off=%d, var_off=%s\n",
                        regno, tname, off, tn_buf);
                return -EACCES;
        }

        if (reg->type & MEM_USER) {
                verbose(env,
                        "R%d is ptr_%s access user memory: off=%d\n",
                        regno, tname, off);
                return -EACCES;
        }

        if (reg->type & MEM_PERCPU) {
                verbose(env,
                        "R%d is ptr_%s access percpu memory: off=%d\n",
                        regno, tname, off);
                return -EACCES;
        }

        /* writes to non-allocated objects go through the program type's
         * own btf_struct_access callback when it provides one
         */
        if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) {
                if (!btf_is_kernel(reg->btf)) {
                        verbose(env, "verifier internal error: reg->btf must be kernel btf\n");
                        return -EFAULT;
                }
                ret = env->ops->btf_struct_access(&env->log, reg, off, size);
        } else {
                /* Writes are permitted with default btf_struct_access for
                 * program allocated objects (which always have ref_obj_id > 0),
                 * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC.
                 */
                if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) {
                        verbose(env, "only read is supported\n");
                        return -EACCES;
                }

                if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
                    !(reg->type & MEM_RCU) && !reg->ref_obj_id) {
                        verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n");
                        return -EFAULT;
                }

                ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name);
        }

        if (ret < 0)
                return ret;

        /* From here on, compute the type flags of the value being loaded. */
        if (ret != PTR_TO_BTF_ID) {
                /* just mark; */

        } else if (type_flag(reg->type) & PTR_UNTRUSTED) {
                /* If this is an untrusted pointer, all pointers formed by walking it
                 * also inherit the untrusted flag.
                 */
                flag = PTR_UNTRUSTED;

        } else if (is_trusted_reg(reg) || is_rcu_reg(reg)) {
                /* By default any pointer obtained from walking a trusted pointer is no
                 * longer trusted, unless the field being accessed has explicitly been
                 * marked as inheriting its parent's state of trust (either full or RCU).
                 * For example:
                 * 'cgroups' pointer is untrusted if task->cgroups dereference
                 * happened in a sleepable program outside of bpf_rcu_read_lock()
                 * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU).
                 * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED.
                 *
                 * A regular RCU-protected pointer with __rcu tag can also be deemed
                 * trusted if we are in an RCU CS. Such pointer can be NULL.
                 */
                if (type_is_trusted(env, reg, field_name, btf_id)) {
                        flag |= PTR_TRUSTED;
                } else if (type_is_trusted_or_null(env, reg, field_name, btf_id)) {
                        flag |= PTR_TRUSTED | PTR_MAYBE_NULL;
                } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
                        if (type_is_rcu(env, reg, field_name, btf_id)) {
                                /* ignore __rcu tag and mark it MEM_RCU */
                                flag |= MEM_RCU;
                        } else if (flag & MEM_RCU ||
                                   type_is_rcu_or_null(env, reg, field_name, btf_id)) {
                                /* __rcu tagged pointers can be NULL */
                                flag |= MEM_RCU | PTR_MAYBE_NULL;

                                /* We always trust them */
                                if (type_is_rcu_or_null(env, reg, field_name, btf_id) &&
                                    flag & PTR_UNTRUSTED)
                                        flag &= ~PTR_UNTRUSTED;
                        } else if (flag & (MEM_PERCPU | MEM_USER)) {
                                /* keep as-is */
                        } else {
                                /* walking unknown pointers yields old deprecated PTR_TO_BTF_ID */
                                clear_trusted_flags(&flag);
                        }
                } else {
                        /*
                         * If not in RCU CS or MEM_RCU pointer can be NULL then
                         * aggressively mark as untrusted otherwise such
                         * pointers will be plain PTR_TO_BTF_ID without flags
                         * and will be allowed to be passed into helpers for
                         * compat reasons.
                         */
                        flag = PTR_UNTRUSTED;
                }
        } else {
                /* Old compat. Deprecated */
                clear_trusted_flags(&flag);
        }

        /* only loads into a real destination register update register state */
        if (atype == BPF_READ && value_regno >= 0)
                mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);

        return 0;
}

static int check_ptr_to_map_access(struct bpf_verifier_env *env,
                                   struct bpf_reg_state *regs,
                                   int regno, int off, int size,
                                   enum bpf_access_type atype,
                                   int value_regno)
{
        struct bpf_reg_state *reg = regs + regno;
        struct bpf_map *map = reg->map_ptr;
        struct bpf_reg_state map_reg;
        enum bpf_type_flag flag = 0;
        const struct btf_type *t;
        const char *tname;
        u32 btf_id;
        int ret;

        if (!btf_vmlinux) {
                verbose(env, "map_ptr access not supported without CONFIG_DEBUG_INFO_BTF\n");
                return -ENOTSUPP;
        }

        if (!map->ops->map_btf_id || !*map->ops->map_btf_id) {
                verbose(env, "map_ptr access not supported for map type %d\n",
                        map->map_type);
                return -ENOTSUPP;
        }

        t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id);
        tname = btf_name_by_offset(btf_vmlinux, t->name_off);

        if (!env->allow_ptr_leaks) {
                verbose(env,
                        "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
                        tname);
                return -EPERM;
        }

        if (off < 0) {
                verbose(env, "R%d is %s invalid negative access: off=%d\n",
                        regno, tname, off);
                return -EACCES;
        }

        if (atype != BPF_READ) {
                verbose(env, "only read from %s is supported\n", tname);
                return -EACCES;
        }

        /* Simulate access to a PTR_TO_BTF_ID */
        memset(&map_reg, 0, sizeof(map_reg));
        mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0);
        ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL);
        if (ret < 0)
                return ret;

        if (value_regno >= 0)
                mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag);

        return 0;
}

/* Validate a single stack offset against the permitted stack window.
 *
 * The highest valid offset is always -1. The lowest valid offset is
 * -MAX_BPF_STACK for writes, and also for any access when
 * env->allow_uninit_stack is set; otherwise it is -state->allocated_stack.
 *
 * Returns 0 when 'off' lies inside the window and -EACCES when it does not.
 */
static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
                                          s64 off,
                                          struct bpf_func_state *state,
                                          enum bpf_access_type t)
{
        bool whole_stack = t == BPF_WRITE || env->allow_uninit_stack;
        int lowest_off = whole_stack ? -MAX_BPF_STACK : -state->allocated_stack;

        if (off >= lowest_off && off <= -1)
                return 0;

        return -EACCES;
}

/* Check that the stack access at 'regno + off' falls within the maximum stack
 * bounds.
 *
 * 'off' includes `regno->offset`, but not its dynamic part (if any).
 */
static int check_stack_access_within_bounds(
                struct bpf_verifier_env *env,
                int regno, int off, int access_size,
                enum bpf_access_src src, enum bpf_access_type type)
{
        struct bpf_reg_state *regs = cur_regs(env);
        struct bpf_reg_state *reg = regs + regno;
        struct bpf_func_state *state = func(env, reg);
        s64 min_off, max_off;
        int err;
        char *err_extra;

        if (src == ACCESS_HELPER)
                /* We don't know if helpers are reading or writing (or both). */
                err_extra = " indirect access to";
        else if (type == BPF_READ)
                err_extra = " read from";
        else
                err_extra = " write to";

        if (tnum_is_const(reg->var_off)) {
                min_off = (s64)reg->var_off.value + off;
                max_off = min_off + access_size;
        } else {
                if (reg->smax_value >= BPF_MAX_VAR_OFF ||
                    reg->smin_value <= -BPF_MAX_VAR_OFF) {
                        verbose(env, "invalid unbounded variable-offset%s stack R%d\n",
                                err_extra, regno);
                        return -EACCES;
                }
                min_off = reg->smin_value + off;
                max_off = reg->smax_value + off + access_size;
        }

        err = check_stack_slot_within_bounds(env, min_off, state, type);
        if (!err && max_off > 0)
                err = -EINVAL; /* out of stack access into non-negative offsets */
        if (!err && access_size < 0)
                /* access_size should not be negative (or overflow an int); others checks
                 * along the way should have prevented such an access.
                 */
                err = -EFAULT; /* invalid negative access size; integer overflow? */

        if (err) {
                if (tnum_is_const(reg->var_off)) {
                        verbose(env, "invalid%s stack R%d off=%d size=%d\n",
                                err_extra, regno, off, access_size);
                } else {
                        char tn_buf[48];

                        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
                        verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n",
                                err_extra, regno, tn_buf, off, access_size);
                }
                return err;
        }

        /* Note that there is no stack access with offset zero, so the needed stack
         * size is -min_off, not -min_off+1.
         */
        return grow_stack_state(env, state, -min_off /* size */);
}

/* check whether memory at (regno + off) is accessible for t = (read | write)
 * if t==write, value_regno is a register which value is stored into memory
 * if t==read, value_regno is a register which will receive the value from memory
 * if t==write && value_regno==-1, some unknown value is stored into memory
 * if t==read && value_regno==-1, don't care what we read from memory
 *
 * 'bpf_size' is the instruction's size encoding and is converted to a byte
 * count with bpf_size_to_bytes(). 'strict_alignment_once' is forwarded to
 * check_ptr_alignment(). 'is_ldsx' selects sign extension instead of zero
 * extension when a loaded scalar narrower than BPF_REG_SIZE is coerced at
 * the end of the function.
 *
 * The function dispatches on the type of the pointer register 'regno'; each
 * branch performs the checks specific to that pointer type and, for loads,
 * sets the state of 'value_regno'.
 *
 * Returns 0 if the access is allowed, a negative errno otherwise.
 */
static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
                            int off, int bpf_size, enum bpf_access_type t,
                            int value_regno, bool strict_alignment_once, bool is_ldsx)
{
        struct bpf_reg_state *regs = cur_regs(env);
        struct bpf_reg_state *reg = regs + regno;
        int size, err = 0;

        /* a negative result is the error code for an invalid size encoding */
        size = bpf_size_to_bytes(bpf_size);
        if (size < 0)
                return size;

        /* alignment checks will add in reg->off themselves */
        err = check_ptr_alignment(env, reg, off, size, strict_alignment_once);
        if (err)
                return err;

        /* for access checks, reg->off is just part of off */
        off += reg->off;

        if (reg->type == PTR_TO_MAP_KEY) {
                /* writes through a map key pointer are rejected; reads are
                 * bounds-checked against the map's key_size
                 */
                if (t == BPF_WRITE) {
                        verbose(env, "write to change key R%d not allowed\n", regno);
                        return -EACCES;
                }

                err = check_mem_region_access(env, regno, off, size,
                                              reg->map_ptr->key_size, false);
                if (err)
                        return err;
                if (value_regno >= 0)
                        mark_reg_unknown(env, regs, value_regno);
        } else if (reg->type == PTR_TO_MAP_VALUE) {
                struct btf_field *kptr_field = NULL;

                /* storing a pointer-typed register into a map value is refused */
                if (t == BPF_WRITE && value_regno >= 0 &&
                    is_pointer_value(env, value_regno)) {
                        verbose(env, "R%d leaks addr into map\n", value_regno);
                        return -EACCES;
                }
                err = check_map_access_type(env, regno, off, size, t);
                if (err)
                        return err;
                err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT);
                if (err)
                        return err;
                /* a kptr field is only looked up when the offset is constant */
                if (tnum_is_const(reg->var_off))
                        kptr_field = btf_record_find(reg->map_ptr->record,
                                                     off + reg->var_off.value, BPF_KPTR);
                if (kptr_field) {
                        err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field);
                } else if (t == BPF_READ && value_regno >= 0) {
                        struct bpf_map *map = reg->map_ptr;

                        /* if map is read-only, track its contents as scalars */
                        if (tnum_is_const(reg->var_off) &&
                            bpf_map_is_rdonly(map) &&
                            map->ops->map_direct_value_addr) {
                                int map_off = off + reg->var_off.value;
                                u64 val = 0;

                                err = bpf_map_direct_read(map, map_off, size,
                                                          &val, is_ldsx);
                                if (err)
                                        return err;

                                /* the destination becomes a known scalar
                                 * holding the value read from the map
                                 */
                                regs[value_regno].type = SCALAR_VALUE;
                                __mark_reg_known(&regs[value_regno], val);
                        } else {
                                mark_reg_unknown(env, regs, value_regno);
                        }
                }
        } else if (base_type(reg->type) == PTR_TO_MEM) {
                bool rdonly_mem = type_is_rdonly_mem(reg->type);

                /* a possibly-NULL pointer must not be dereferenced */
                if (type_may_be_null(reg->type)) {
                        verbose(env, "R%d invalid mem access '%s'\n", regno,
                                reg_type_str(env, reg->type));
                        return -EACCES;
                }

                if (t == BPF_WRITE && rdonly_mem) {
                        verbose(env, "R%d cannot write into %s\n",
                                regno, reg_type_str(env, reg->type));
                        return -EACCES;
                }

                if (t == BPF_WRITE && value_regno >= 0 &&
                    is_pointer_value(env, value_regno)) {
                        verbose(env, "R%d leaks addr into mem\n", value_regno);
                        return -EACCES;
                }

                err = check_mem_region_access(env, regno, off, size,
                                              reg->mem_size, false);
                /* writes to read-only memory already returned above, so
                 * rdonly_mem implies a read at this point
                 */
                if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
                        mark_reg_unknown(env, regs, value_regno);
        } else if (reg->type == PTR_TO_CTX) {
                /* filled in by check_ctx_access() for the accessed field */
                enum bpf_reg_type reg_type = SCALAR_VALUE;
                struct btf *btf = NULL;
                u32 btf_id = 0;

                if (t == BPF_WRITE && value_regno >= 0 &&
                    is_pointer_value(env, value_regno)) {
                        verbose(env, "R%d leaks addr into ctx\n", value_regno);
                        return -EACCES;
                }

                err = check_ptr_off_reg(env, reg, regno);
                if (err < 0)
                        return err;

                err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf,
                                       &btf_id);
                if (err)
                        verbose_linfo(env, insn_idx, "; ");
                if (!err && t == BPF_READ && value_regno >= 0) {
                        /* ctx access returns either a scalar, or a
                         * PTR_TO_PACKET[_META,_END]. In the latter
                         * case, we know the offset is zero.
                         */
                        if (reg_type == SCALAR_VALUE) {
                                mark_reg_unknown(env, regs, value_regno);
                        } else {
                                mark_reg_known_zero(env, regs,
                                                    value_regno);
                                /* possibly-NULL results get a fresh id */
                                if (type_may_be_null(reg_type))
                                        regs[value_regno].id = ++env->id_gen;
                                /* A load of ctx field could have different
                                 * actual load size with the one encoded in the
                                 * insn. When the dst is PTR, it is for sure not
                                 * a sub-register.
                                 */
                                regs[value_regno].subreg_def = DEF_NOT_SUBREG;
                                if (base_type(reg_type) == PTR_TO_BTF_ID) {
                                        regs[value_regno].btf = btf;
                                        regs[value_regno].btf_id = btf_id;
                                }
                        }
                        /* set last: the mark_reg_*() calls above ran first */
                        regs[value_regno].type = reg_type;
                }

        } else if (reg->type == PTR_TO_STACK) {
                /* Basic bounds checks. */
                err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t);
                if (err)
                        return err;

                /* the stack read/write helpers take care of value_regno */
                if (t == BPF_READ)
                        err = check_stack_read(env, regno, off, size,
                                               value_regno);
                else
                        err = check_stack_write(env, regno, off, size,
                                                value_regno, insn_idx);
        } else if (reg_is_pkt_pointer(reg)) {
                if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
                        verbose(env, "cannot write into packet\n");
                        return -EACCES;
                }
                if (t == BPF_WRITE && value_regno >= 0 &&
                    is_pointer_value(env, value_regno)) {
                        verbose(env, "R%d leaks addr into packet\n",
                                value_regno);
                        return -EACCES;
                }
                err = check_packet_access(env, regno, off, size, false);
                if (!err && t == BPF_READ && value_regno >= 0)
                        mark_reg_unknown(env, regs, value_regno);
        } else if (reg->type == PTR_TO_FLOW_KEYS) {
                if (t == BPF_WRITE && value_regno >= 0 &&
                    is_pointer_value(env, value_regno)) {
                        verbose(env, "R%d leaks addr into flow keys\n",
                                value_regno);
                        return -EACCES;
                }

                err = check_flow_keys_access(env, off, size);
                if (!err && t == BPF_READ && value_regno >= 0)
                        mark_reg_unknown(env, regs, value_regno);
        } else if (type_is_sk_pointer(reg->type)) {
                /* socket pointers are read-only here: any write is rejected */
                if (t == BPF_WRITE) {
                        verbose(env, "R%d cannot write into %s\n",
                                regno, reg_type_str(env, reg->type));
                        return -EACCES;
                }
                err = check_sock_access(env, insn_idx, regno, off, size, t);
                if (!err && value_regno >= 0)
                        mark_reg_unknown(env, regs, value_regno);
        } else if (reg->type == PTR_TO_TP_BUFFER) {
                err = check_tp_buffer_access(env, reg, regno, off, size);
                if (!err && t == BPF_READ && value_regno >= 0)
                        mark_reg_unknown(env, regs, value_regno);
        } else if (base_type(reg->type) == PTR_TO_BTF_ID &&
                   !type_may_be_null(reg->type)) {
                /* a possibly-NULL PTR_TO_BTF_ID does not match here and
                 * falls through to the remaining branches
                 */
                err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
                                              value_regno);
        } else if (reg->type == CONST_PTR_TO_MAP) {
                err = check_ptr_to_map_access(env, regs, regno, off, size, t,
                                              value_regno);
        } else if (base_type(reg->type) == PTR_TO_BUF) {
                bool rdonly_mem = type_is_rdonly_mem(reg->type);
                u32 *max_access;

                /* pick the per-program counter handed to
                 * check_buffer_access(): read-only and read-write buffers
                 * use separate fields of prog->aux
                 */
                if (rdonly_mem) {
                        if (t == BPF_WRITE) {
                                verbose(env, "R%d cannot write into %s\n",
                                        regno, reg_type_str(env, reg->type));
                                return -EACCES;
                        }
                        max_access = &env->prog->aux->max_rdonly_access;
                } else {
                        max_access = &env->prog->aux->max_rdwr_access;
                }

                err = check_buffer_access(env, reg, regno, off, size, false,
                                          max_access);

                if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
                        mark_reg_unknown(env, regs, value_regno);
        } else if (reg->type == PTR_TO_ARENA) {
                /* no bounds check is done in this branch; a loaded value is
                 * simply treated as unknown
                 */
                if (t == BPF_READ && value_regno >= 0)
                        mark_reg_unknown(env, regs, value_regno);
        } else {
                /* any other register type cannot be dereferenced */
                verbose(env, "R%d invalid mem access '%s'\n", regno,
                        reg_type_str(env, reg->type));
                return -EACCES;
        }

        /* a successful load narrower than a full register into a scalar
         * destination has its bounds narrowed to the loaded size
         */
        if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
            regs[value_regno].type == SCALAR_VALUE) {
                if (!is_ldsx)
                        /* b/h/w load zero-extends, mark upper bits as known 0 */
                        coerce_reg_to_size(&regs[value_regno], size);
                else
                        coerce_reg_to_size_sx(&regs[value_regno], size);
        }
        return err;
}

/* Forward declaration: check_atomic() below calls save_aux_ptr_type(), whose
 * definition is not located before this point in the file.
 */
static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
                             bool allow_trust_mismatch);

static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
{
        int load_reg;
        int err;

        switch (insn->imm) {
        case BPF_ADD:
        case BPF_ADD | BPF_FETCH:
        case BPF_AND:
        case BPF_AND | BPF_FETCH:
        case BPF_OR:
        case BPF_OR | BPF_FETCH:
        case BPF_XOR:
        case BPF_XOR | BPF_FETCH:
        case BPF_XCHG:
        case BPF_CMPXCHG:
                break;
        default:
                verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm);
                return -EINVAL;
        }

        if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) {
                verbose(env, "invalid atomic operand size\n");
                return -EINVAL;
        }

        /* check src1 operand */
        err = check_reg_arg(env, insn->src_reg, SRC_OP);
        if (err)
                return err;

        /* check src2 operand */
        err = check_reg_arg(env, insn->dst_reg, SRC_OP);
        if (err)
                return err;

        if (insn->imm == BPF_CMPXCHG) {
                /* Check comparison of R0 with memory location */
                const u32 aux_reg = BPF_REG_0;

                err = check_reg_arg(env, aux_reg, SRC_OP);
                if (err)
                        return err;

                if (is_pointer_value(env, aux_reg)) {
                        verbose(env, "R%d leaks addr into mem\n", aux_reg);
                        return -EACCES;
                }
        }

        if (is_pointer_value(env, insn->src_reg)) {
                verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
                return -EACCES;
        }

        if (is_ctx_reg(env, insn->dst_reg) ||
            is_pkt_reg(env, insn->dst_reg) ||
            is_flow_key_reg(env, insn->dst_reg) ||
            is_sk_reg(env, insn->dst_reg) ||
            (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) {
                verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
                        insn->dst_reg,
                        reg_type_str(env, reg_state(env, insn->dst_reg)->type));
                return -EACCES;
        }

        if (insn->imm & BPF_FETCH) {
                if (insn->imm == BPF_CMPXCHG)
                        load_reg = BPF_REG_0;
                else
                        load_reg = insn->src_reg;

                /* check and record load of old value */
                err = check_reg_arg(env, load_reg, DST_OP);
                if (err)
                        return err;
        } else {
                /* This instruction accesses a memory location but doesn't
                 * actually load it into a register.
                 */
                load_reg = -1;
        }

        /* Check whether we can read the memory, with second call for fetch
         * case to simulate the register fill.
         */
        err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
                               BPF_SIZE(insn->code), BPF_READ, -1, true, false);
        if (!err && load_reg >= 0)
                err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
                                       BPF_SIZE(insn->code), BPF_READ, load_reg,
                                       true, false);
        if (err)
                return err;

        if (is_arena_reg(env, insn->dst_reg)) {
                err = save_aux_ptr_type(env, PTR_TO_ARENA, false);
                if (err)
                        return err;
        }
        /* Check whether we can write into the same memory. */
        err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
                               BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
        if (err)
                return err;
        return 0;
}

/* When register 'regno' is used to read the stack (either directly or through
 * a helper function) make sure that it's within stack boundary and, depending
 * on the access type and privileges, that all elements of the stack are
 * initialized.
 *
 * 'off' includes 'regno->off', but not its dynamic part (if any).
 *
 * 'access_size' is the number of bytes accessed; a size of zero is rejected
 * unless 'zero_size_allowed' is set. 'type' tells whether the access comes
 * from a helper (ACCESS_HELPER) or directly from the program. 'meta' may be
 * NULL; when it is set with raw_mode and the register offset is constant,
 * the initialization checks are skipped and access_size/regno are recorded
 * in 'meta' instead.
 *
 * All registers that have been spilled on the stack in the slots within the
 * read offsets are marked as read.
 *
 * Returns 0 on success, -EACCES when the access is not allowed, -EFAULT on
 * an internal inconsistency, or the error reported by
 * check_stack_access_within_bounds().
 */
static int check_stack_range_initialized(
                struct bpf_verifier_env *env, int regno, int off,
                int access_size, bool zero_size_allowed,
                enum bpf_access_src type, struct bpf_call_arg_meta *meta)
{
        struct bpf_reg_state *reg = reg_state(env, regno);
        struct bpf_func_state *state = func(env, reg);
        int err, min_off, max_off, i, j, slot, spi;
        char *err_extra = type == ACCESS_HELPER ? " indirect" : "";
        enum bpf_access_type bounds_check_type;
        /* Some accesses can write anything into the stack, others are
         * read-only.
         */
        bool clobber = false;

        if (access_size == 0 && !zero_size_allowed) {
                verbose(env, "invalid zero-sized read\n");
                return -EACCES;
        }

        if (type == ACCESS_HELPER) {
                /* The bounds checks for writes are more permissive than for
                 * reads. However, if raw_mode is not set, we'll do extra
                 * checks below.
                 */
                bounds_check_type = BPF_WRITE;
                clobber = true;
        } else {
                bounds_check_type = BPF_READ;
        }
        err = check_stack_access_within_bounds(env, regno, off, access_size,
                                               type, bounds_check_type);
        if (err)
                return err;


        if (tnum_is_const(reg->var_off)) {
                min_off = max_off = reg->var_off.value + off;
        } else {
                /* Variable offset is prohibited for unprivileged mode for
                 * simplicity since it requires corresponding support in
                 * Spectre masking for stack ALU.
                 * See also retrieve_ptr_limit().
                 */
                if (!env->bypass_spec_v1) {
                        char tn_buf[48];

                        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
                        verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n",
                                regno, err_extra, tn_buf);
                        return -EACCES;
                }
                /* Only initialized buffer on stack is allowed to be accessed
                 * with variable offset. With uninitialized buffer it's hard to
                 * guarantee that whole memory is marked as initialized on
                 * helper return since specific bounds are unknown what may
                 * cause uninitialized stack leaking.
                 */
                if (meta && meta->raw_mode)
                        meta = NULL;

                min_off = reg->smin_value + off;
                max_off = reg->smax_value + off;
        }

        if (meta && meta->raw_mode) {
                /* Ensure we won't be overwriting dynptrs when simulating byte
                 * by byte access in check_helper_call using meta.access_size.
                 * This would be a problem if we have a helper in the future
                 * which takes:
                 *
                 *        helper(uninit_mem, len, dynptr)
                 *
                 * Now, uninit_mem may overlap with dynptr pointer. Hence, it
                 * may end up writing to dynptr itself when touching memory from
                 * arg 1. This can be relaxed on a case by case basis for known
                 * safe cases, but reject due to the possibility of aliasing by
                 * default.
                 */
                for (i = min_off; i < max_off + access_size; i++) {
                        int stack_off = -i - 1;

                        spi = __get_spi(i);
                        /* raw_mode may write past allocated_stack */
                        if (state->allocated_stack <= stack_off)
                                continue;
                        if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) {
                                verbose(env, "potential write to dynptr at off=%d disallowed\n", i);
                                return -EACCES;
                        }
                }
                /* Record the raw access in 'meta' and skip the per-byte
                 * initialization checks below.
                 */
                meta->access_size = access_size;
                meta->regno = regno;
                return 0;
        }

        /* Walk every byte of the possible access range and make sure it may
         * be read.
         */
        for (i = min_off; i < max_off + access_size; i++) {
                u8 *stype;

                /* byte index counted from the top of the stack, and the
                 * 8-byte slot it belongs to
                 */
                slot = -i - 1;
                spi = slot / BPF_REG_SIZE;
                if (state->allocated_stack <= slot) {
                        /* Newline-terminated like every other verifier log
                         * message, so later output starts on its own line.
                         */
                        verbose(env, "verifier bug: allocated_stack too small\n");
                        return -EFAULT;
                }

                stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
                if (*stype == STACK_MISC)
                        goto mark;
                if ((*stype == STACK_ZERO) ||
                    (*stype == STACK_INVALID && env->allow_uninit_stack)) {
                        if (clobber) {
                                /* helper can write anything into the stack */
                                *stype = STACK_MISC;
                        }
                        goto mark;
                }

                /* A spilled register may be read if it is a scalar, or if
                 * pointer leaks are allowed.
                 */
                if (is_spilled_reg(&state->stack[spi]) &&
                    (state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
                     env->allow_ptr_leaks)) {
                        if (clobber) {
                                /* the helper may overwrite the spill, so its
                                 * tracked value is forgotten
                                 */
                                __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
                                for (j = 0; j < BPF_REG_SIZE; j++)
                                        scrub_spilled_slot(&state->stack[spi].slot_type[j]);
                        }
                        goto mark;
                }

                /* None of the readable cases matched: reject the access. */
                if (tnum_is_const(reg->var_off)) {
                        verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
                                err_extra, regno, min_off, i - min_off, access_size);
                } else {
                        char tn_buf[48];

                        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
                        verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n",
                                err_extra, regno, tn_buf, i - min_off, access_size);
                }
                return -EACCES;
mark:
                /* reading any byte out of 8-byte 'spill_slot' will cause
                 * the whole slot to be marked as 'read'
                 */
                mark_reg_read(env, &state->stack[spi].spilled_ptr,
                              state->stack[spi].spilled_ptr.parent,
                              REG_LIVE_READ64);
                /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not
                 * be sure that whether stack slot is written to or not. Hence,
                 * we must still conservatively propagate reads upwards even if
                 * helper may write to the entire memory range.
                 */
        }
        return 0;
}

/* Check that a helper may access 'access_size' bytes through the pointer held
 * in register 'regno', dispatching on the pointer's base type.
 *
 * 'meta' may be NULL. When it is set with raw_mode, the access is treated as
 * a write: it is refused for map keys and for read-only memory/buffers, and
 * map values and contexts are checked with BPF_WRITE instead of BPF_READ.
 *
 * 'zero_size_allowed' is forwarded to the checks that accept it and permits
 * a zero-sized access through a NULL register.
 *
 * Returns 0 if the access is allowed, a negative errno otherwise.
 */
static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
                                   int access_size, bool zero_size_allowed,
                                   struct bpf_call_arg_meta *meta)
{
        struct bpf_reg_state *regs = cur_regs(env);
        struct bpf_reg_state *reg = regs + regno;
        const bool raw_write = meta && meta->raw_mode;
        u32 *max_access;

        switch (base_type(reg->type)) {
        case PTR_TO_PACKET:
        case PTR_TO_PACKET_META:
                return check_packet_access(env, regno, reg->off, access_size,
                                           zero_size_allowed);
        case PTR_TO_MAP_KEY:
                if (raw_write)
                        goto reject_write;
                return check_mem_region_access(env, regno, reg->off, access_size,
                                               reg->map_ptr->key_size, false);
        case PTR_TO_MAP_VALUE:
                if (check_map_access_type(env, regno, reg->off, access_size,
                                          raw_write ? BPF_WRITE : BPF_READ))
                        return -EACCES;
                return check_map_access(env, regno, reg->off, access_size,
                                        zero_size_allowed, ACCESS_HELPER);
        case PTR_TO_MEM:
                if (raw_write && type_is_rdonly_mem(reg->type))
                        goto reject_write;
                return check_mem_region_access(env, regno, reg->off,
                                               access_size, reg->mem_size,
                                               zero_size_allowed);
        case PTR_TO_BUF:
                if (type_is_rdonly_mem(reg->type)) {
                        if (raw_write)
                                goto reject_write;
                        max_access = &env->prog->aux->max_rdonly_access;
                } else {
                        max_access = &env->prog->aux->max_rdwr_access;
                }
                return check_buffer_access(env, reg, regno, reg->off,
                                           access_size, zero_size_allowed,
                                           max_access);
        case PTR_TO_STACK:
                return check_stack_range_initialized(env, regno, reg->off,
                                                     access_size,
                                                     zero_size_allowed,
                                                     ACCESS_HELPER, meta);
        case PTR_TO_BTF_ID:
                return check_ptr_to_btf_access(env, regs, regno, reg->off,
                                               access_size, BPF_READ, -1);
        case PTR_TO_CTX:
                /* in case the function doesn't know how to access the context,
                 * (because we are in a program of type SYSCALL for example), we
                 * can not statically check its size.
                 * Dynamically check it now.
                 */
                if (!env->ops->convert_ctx_access) {
                        /* Allow zero-byte read from PTR_TO_CTX */
                        if (access_size == 0)
                                return zero_size_allowed ? 0 : -EACCES;

                        /* Probe the last byte of the requested range. */
                        return check_mem_access(env, env->insn_idx, regno,
                                                access_size - 1, BPF_B,
                                                raw_write ? BPF_WRITE : BPF_READ,
                                                -1, false, false);
                }

                fallthrough;
        default: /* scalar_value or invalid ptr */
                /* Allow zero-byte read from NULL, regardless of pointer type */
                if (zero_size_allowed && access_size == 0 &&
                    register_is_null(reg))
                        return 0;

                verbose(env, "R%d type=%s ", regno,
                        reg_type_str(env, reg->type));
                verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK));
                return -EACCES;
        }

reject_write:
        /* Shared exit for write attempts on memory that must stay read-only. */
        verbose(env, "R%d cannot write into %s\n", regno,
                reg_type_str(env, reg->type));
        return -EACCES;
}

/* Validate a (pointer, size) argument pair of a helper or kfunc call.
 *
 * @regno is the register carrying the access size; the pointer the size
 * applies to is held in the preceding register, regno-1.
 *
 * Returns 0 if the sized access is acceptable, otherwise a negative error.
 */
static int check_mem_size_reg(struct bpf_verifier_env *env,
                              struct bpf_reg_state *reg, u32 regno,
                              bool zero_size_allowed,
                              struct bpf_call_arg_meta *meta)
{
        int ret;

        /* Remember the unsigned upper bound of the size. Helpers that use
         * this value as an upper bound on their return value get r0 refined
         * from it, see do_refine_retval_range(). The helper's C type is u32,
         * hence umax_value; a possibly negative size is rejected below. Only
         * an upper bound can be learned because the retval is an int and
         * negative retvals are allowed.
         */
        meta->msize_max_value = reg->umax_value;

        /* The size register is a SCALAR_VALUE, so the access is checked
         * against its bounds. If the size is not a known constant, drop
         * @meta: for unprivileged variable accesses this disables raw mode,
         * so the program must initialize all the memory that the helper
         * could just partially fill up.
         */
        if (!tnum_is_const(reg->var_off)) {
                meta = NULL;
        }

        if (reg->smin_value < 0) {
                verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
                        regno);
                return -EACCES;
        }

        if (!zero_size_allowed && reg->umin_value == 0) {
                verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n",
                        regno, reg->umin_value, reg->umax_value);
                return -EACCES;
        }

        if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
                verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
                        regno);
                return -EACCES;
        }

        /* Check the pointer in regno-1 against the largest possible size. */
        ret = check_helper_mem_access(env, regno - 1, reg->umax_value,
                                      zero_size_allowed, meta);
        if (ret)
                return ret;

        return mark_chain_precision(env, regno);
}

/* Check that @mem_size bytes behind the pointer in @regno can be both read
 * and written. A register that is known to be NULL is accepted as is.
 *
 * Returns 0 on success or the error from check_helper_mem_access().
 */
static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
                         u32 regno, u32 mem_size)
{
        bool nullable = type_may_be_null(reg->type);
        struct bpf_call_arg_meta meta;
        struct bpf_reg_state backup;
        int ret;

        if (register_is_null(reg))
                return 0;

        memset(&meta, 0, sizeof(meta));

        /* The checks below assume the register holds a value. For a
         * maybe-NULL pointer, temporarily convert it to its non-NULL form;
         * the original state is restored before returning so the conversion
         * is not visible to the caller.
         */
        if (nullable) {
                backup = *reg;
                mark_ptr_not_null_reg(reg);
        }

        /* First pass, with raw_mode still cleared by the memset above. */
        ret = check_helper_mem_access(env, regno, mem_size, true, &meta);

        /* Second pass checks the access for BPF_WRITE. */
        meta.raw_mode = true;
        if (!ret)
                ret = check_helper_mem_access(env, regno, mem_size, true, &meta);

        if (nullable)
                *reg = backup;

        return ret;
}

/* kfunc flavour of the (pointer, size) pair check: @regno holds the size and
 * regno-1 holds the memory pointer. The pair is verified twice through
 * check_mem_size_reg(), once with raw_mode cleared and once with it set, with
 * zero sizes allowed in both passes.
 */
static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
                                    u32 regno)
{
        struct bpf_reg_state *ptr_reg = &cur_regs(env)[regno - 1];
        bool nullable = type_may_be_null(ptr_reg->type);
        struct bpf_call_arg_meta meta;
        struct bpf_reg_state backup;
        int ret;

        /* The size register must have a pointer register in front of it. */
        WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5);

        memset(&meta, 0, sizeof(meta));

        /* Treat a maybe-NULL pointer as non-NULL for the duration of the
         * checks; its saved state is put back before returning.
         */
        if (nullable) {
                backup = *ptr_reg;
                mark_ptr_not_null_reg(ptr_reg);
        }

        ret = check_mem_size_reg(env, reg, regno, true, &meta);

        /* Check access for BPF_WRITE */
        meta.raw_mode = true;
        if (!ret)
                ret = check_mem_size_reg(env, reg, regno, true, &meta);

        if (nullable)
                *ptr_reg = backup;

        return ret;
}

/* Implementation details:
 * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL.
 * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL.
 * Two bpf_map_lookups (even with the same key) will have different reg->id.
 * Two separate bpf_obj_new will also have different reg->id.
 * For traditional PTR_TO_MAP_VALUE or PTR_TO_BTF_ID | MEM_ALLOC, the verifier
 * clears reg->id after value_or_null->value transition, since the verifier only
 * cares about the range of access to valid map value pointer and doesn't care
 * about actual address of the map element.
 * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
 * reg->id > 0 after value_or_null->value transition. By doing so
 * two bpf_map_lookups will be considered two different pointers that
 * point to different bpf_spin_locks. Likewise for pointers to allocated objects
 * returned from bpf_obj_new.
 * The verifier allows taking only one bpf_spin_lock at a time to avoid
 * dead-locks.
 * Since only one bpf_spin_lock is allowed the checks are simpler than
 * reg_is_refcounted() logic. The verifier needs to remember only
 * one spin_lock instead of array of acquired_refs.
 * cur_state->active_lock remembers which map value element or allocated
 * object got locked and clears it after bpf_spin_unlock.
 */
static int process_spin_lock(struct bpf_verifier_env *env, int regno,
                             bool is_lock)
{
        struct bpf_reg_state *reg = &cur_regs(env)[regno];
        struct bpf_verifier_state *cur = env->cur_state;
        u64 val = reg->var_off.value;
        struct bpf_map *map = NULL;
        struct btf *btf = NULL;
        struct btf_record *rec;
        void *owner;

        if (!tnum_is_const(reg->var_off)) {
                verbose(env,
                        "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
                        regno);
                return -EINVAL;
        }

        /* The lock lives either in a map value or in an object described by
         * reg->btf; exactly one of @map / @btf ends up being set.
         */
        if (reg->type == PTR_TO_MAP_VALUE) {
                map = reg->map_ptr;
                if (!map->btf) {
                        verbose(env,
                                "map '%s' has to have BTF in order to use bpf_spin_lock\n",
                                map->name);
                        return -EINVAL;
                }
        } else {
                btf = reg->btf;
        }
        /* Identity of the lock's container, paired with reg->id below. */
        owner = map ? (void *)map : (void *)btf;

        rec = reg_btf_record(reg);
        if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) {
                verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local",
                        map ? map->name : "kptr");
                return -EINVAL;
        }
        if (rec->spin_lock_off != val + reg->off) {
                verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n",
                        val + reg->off, rec->spin_lock_off);
                return -EINVAL;
        }

        if (is_lock) {
                /* Only a single lock may be held at any time. */
                if (cur->active_lock.ptr) {
                        verbose(env,
                                "Locking two bpf_spin_locks are not allowed\n");
                        return -EINVAL;
                }
                cur->active_lock.ptr = owner;
                cur->active_lock.id = reg->id;
                return 0;
        }

        /* Unlock: it has to match the lock that is currently held. */
        if (!cur->active_lock.ptr) {
                verbose(env, "bpf_spin_unlock without taking a lock\n");
                return -EINVAL;
        }
        if (cur->active_lock.ptr != owner ||
            cur->active_lock.id != reg->id) {
                verbose(env, "bpf_spin_unlock of different lock\n");
                return -EINVAL;
        }

        invalidate_non_owning_refs(env);

        cur->active_lock.ptr = NULL;
        cur->active_lock.id = 0;
        return 0;
}

static int process_timer_func(struct bpf_verifier_env *env, int regno,
                              struct bpf_call_arg_meta *meta)
{
        struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
        bool is_const = tnum_is_const(reg->var_off);
        struct bpf_map *map = reg->map_ptr;
        u64 val = reg->var_off.value;

        if (!is_const) {
                verbose(env,
                        "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
                        regno);
                return -EINVAL;
        }
        if (!map->btf) {
                verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
                        map->name);
                return -EINVAL;
        }
        if (!btf_record_has_field(map->record, BPF_TIMER)) {
                verbose(env, "map '%s' has no valid bpf_timer\n", map->name);
                return -EINVAL;
        }
        if (map->record->timer_off != val + reg->off) {
                verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
                        val + reg->off, map->record->timer_off);
                return -EINVAL;
        }
        if (meta->map_ptr) {
                verbose(env, "verifier bug. Two map pointers in a timer helper\n");
                return -EFAULT;
        }
        meta->map_uid = reg->map_uid;
        meta->map_ptr = map;
        return 0;
}

static int process_wq_func(struct bpf_verifier_env *env, int regno,
                           struct bpf_kfunc_call_arg_meta *meta)
{
        struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
        struct bpf_map *map = reg->map_ptr;
        u64 val = reg->var_off.value;

        if (map->record->wq_off != val + reg->off) {
                verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n",
                        val + reg->off, map->record->wq_off);
                return -EINVAL;
        }
        meta->map.uid = reg->map_uid;
        meta->map.ptr = map;
        return 0;
}

static int process_kptr_func(struct bpf_verifier_env *env, int regno,
                             struct bpf_call_arg_meta *meta)
{
        struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
        struct bpf_map *map_ptr = reg->map_ptr;
        struct btf_field *kptr_field;
        u32 kptr_off;

        if (!tnum_is_const(reg->var_off)) {
                verbose(env,
                        "R%d doesn't have constant offset. kptr has to be at the constant offset\n",
                        regno);
                return -EINVAL;
        }
        if (!map_ptr->btf) {
                verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
                        map_ptr->name);
                return -EINVAL;
        }
        if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) {
                verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);
                return -EINVAL;
        }

        meta->map_ptr = map_ptr;
        kptr_off = reg->off + reg->var_off.value;
        kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR);
        if (!kptr_field) {
                verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
                return -EACCES;
        }
        if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_PERCPU) {
                verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
                return -EACCES;
        }
        meta->kptr_field = kptr_field;
        return 0;
}

/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK
 * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR.
 *
 * In both cases we deal with the first 8 bytes, but need to mark the next 8
 * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of
 * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object.
 *
 * Mutability of bpf_dynptr is at two levels, one is at the level of struct
 * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct
 * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can
 * mutate the view of the dynptr and also possibly destroy it. In the latter
 * case, it cannot mutate the bpf_dynptr itself but it can still mutate the
 * memory that dynptr points to.
 *
 * The verifier will keep track of both levels of mutation (bpf_dynptr's in
 * reg->type and the memory's in reg->dynptr.type), but there is no support for
 * readonly dynptr view yet, hence only the first case is tracked and checked.
 *
 * This is consistent with how C applies the const modifier to a struct object,
 * where the pointer itself inside bpf_dynptr becomes const but not what it
 * points to.
 *
 * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
 * type, and declare it as 'const struct bpf_dynptr *' in their prototype.
 */
static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx,
                               enum bpf_arg_type arg_type, int clone_ref_obj_id)
{
        struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
        int err;

        /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
         * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
         */
        if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) {
                verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n");
                return -EFAULT;
        }

        /*  MEM_UNINIT - Points to memory that is an appropriate candidate for
         *                 constructing a mutable bpf_dynptr object.
         *
         *                 Currently, this is only possible with PTR_TO_STACK
         *                 pointing to a region of at least 16 bytes which doesn't
         *                 contain an existing bpf_dynptr.
         *
         *  MEM_RDONLY - Points to a initialized bpf_dynptr that will not be
         *                 mutated or destroyed. However, the memory it points to
         *                 may be mutated.
         *
         *  None       - Points to a initialized dynptr that can be mutated and
         *                 destroyed, including mutation of the memory it points
         *                 to.
         */
        if (arg_type & MEM_UNINIT) {
                int i;

                if (!is_dynptr_reg_valid_uninit(env, reg)) {
                        verbose(env, "Dynptr has to be an uninitialized dynptr\n");
                        return -EINVAL;
                }

                /* we write BPF_DW bits (8 bytes) at a time */
                for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
                        err = check_mem_access(env, insn_idx, regno,
                                               i, BPF_DW, BPF_WRITE, -1, false, false);
                        if (err)
                                return err;
                }

                err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id);
        } else /* MEM_RDONLY and None case from above */ {
                /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
                if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
                        verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
                        return -EINVAL;
                }

                if (!is_dynptr_reg_valid_init(env, reg)) {
                        verbose(env,
                                "Expected an initialized dynptr as arg #%d\n",
                                regno);
                        return -EINVAL;
                }

                /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */
                if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
                        verbose(env,
                                "Expected a dynptr of type %s as arg #%d\n",
                                dynptr_type_str(arg_to_dynptr_type(arg_type)), regno);
                        return -EINVAL;
                }

                err = mark_dynptr_read(env, reg);
        }
        return err;
}

static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
{
        struct bpf_func_state *state = func(env, reg);

        return state->stack[spi].spilled_ptr.ref_obj_id;
}

/* True if the kfunc carries any of the KF_ITER_NEW, KF_ITER_NEXT or
 * KF_ITER_DESTROY flags.
 */
static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta)
{
        return !!(meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY));
}

/* True if the kfunc is flagged KF_ITER_NEW. */
static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta)
{
        return !!(meta->kfunc_flags & KF_ITER_NEW);
}

/* True if the kfunc is flagged KF_ITER_NEXT. */
static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta)
{
        return !!(meta->kfunc_flags & KF_ITER_NEXT);
}

/* True if the kfunc is flagged KF_ITER_DESTROY. */
static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
{
        return !!(meta->kfunc_flags & KF_ITER_DESTROY);
}

static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg)
{
        /* btf_check_iter_kfuncs() guarantees that first argument of any iter
         * kfunc is iter state pointer
         */
        return arg == 0 && is_iter_kfunc(meta);
}

static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
                            struct bpf_kfunc_call_arg_meta *meta)
{
        struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
        const struct btf_type *t;
        const struct btf_param *arg;
        int spi, err, i, nr_slots;
        u32 btf_id;

        /* btf_check_iter_kfuncs() ensures we don't need to validate anything here */
        arg = &btf_params(meta->func_proto)[0];
        t = btf_type_skip_modifiers(meta->btf, arg->type, NULL);        /* PTR */
        t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id);        /* STRUCT */
        nr_slots = t->size / BPF_REG_SIZE;

        if (is_iter_new_kfunc(meta)) {
                /* bpf_iter_<type>_new() expects pointer to uninit iter state */
                if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
                        verbose(env, "expected uninitialized iter_%s as arg #%d\n",
                                iter_type_str(meta->btf, btf_id), regno);
                        return -EINVAL;
                }

                for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
                        err = check_mem_access(env, insn_idx, regno,
                                               i, BPF_DW, BPF_WRITE, -1, false, false);
                        if (err)
                                return err;
                }

                err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots);
                if (err)
                        return err;
        } else {
                /* iter_next() or iter_destroy() expect initialized iter state*/
                err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots);
                switch (err) {
                case 0:
                        break;
                case -EINVAL:
                        verbose(env, "expected an initialized iter_%s as arg #%d\n",
                                iter_type_str(meta->btf, btf_id), regno);
                        return err;
                case -EPROTO:
                        verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
                        return err;
                default:
                        return err;
                }

                spi = iter_get_spi(env, reg, nr_slots);
                if (spi < 0)
                        return spi;

                err = mark_iter_read(env, reg, spi, nr_slots);
                if (err)
                        return err;

                /* remember meta->iter info for process_iter_next_call() */
                meta->iter.spi = spi;
                meta->iter.frameno = reg->frameno;
                meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);

                if (is_iter_destroy_kfunc(meta)) {
                        err = unmark_stack_slots_iter(env, reg, nr_slots);
                        if (err)
                                return err;
                }
        }

        return 0;
}

/* Find the previous loop entry at @insn_idx: the nearest parent state that
 * stopped at @insn_idx and whose callsites match those of @cur.
 *
 * Returns the matching state, or NULL if there is none.
 */
static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
                                                  struct bpf_verifier_state *cur,
                                                  int insn_idx)
{
        struct bpf_verifier_state_list *sl;
        struct bpf_verifier_state *st;

        /* Explored states are pushed in stack order, so the most recent
         * states are visited first.
         */
        for (sl = *explored_state(env, insn_idx); sl; sl = sl->next) {
                st = &sl->state;

                if (st->insn_idx != insn_idx)
                        continue;
                /* A state with st->branches != 0 is part of the current DFS
                 * verification path, hence cur and st form a loop.
                 */
                if (!st->branches)
                        continue;
                if (!same_callsites(st, cur))
                        continue;
                if (st->dfs_depth >= cur->dfs_depth)
                        continue;

                return st;
        }

        return NULL;
}

static void reset_idmap_scratch(struct bpf_verifier_env *env);
static bool regs_exact(const struct bpf_reg_state *rold,
                       const struct bpf_reg_state *rcur,
                       struct bpf_idmap *idmap);

/* Mark @rcur as an unknown scalar when both @rold and @rcur are scalars,
 * neither is precise, and the two are not exactly equal under @idmap.
 */
static void maybe_widen_reg(struct bpf_verifier_env *env,
                            struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
                            struct bpf_idmap *idmap)
{
        /* Only a pair of scalars is a widening candidate. */
        if (rold->type != SCALAR_VALUE || rold->type != rcur->type)
                return;

        /* Precise values have to keep their range. */
        if (rold->precise || rcur->precise)
                return;

        /* Nothing to forget if both registers already agree. */
        if (regs_exact(rold, rcur, idmap))
                return;

        __mark_reg_unknown(env, rcur);
}

/* Widen the imprecise scalars of @cur that differ from those in @old.
 *
 * For every frame from old->curframe down to 0, each register and each stack
 * slot that holds a spilled register in both states is passed to
 * maybe_widen_reg(), which may turn the @cur copy into an unknown scalar.
 *
 * @env: verifier environment, provides the idmap scratch area
 * @old: state of a previous loop entry
 * @cur: state to be widened
 *
 * Always returns 0.
 */
static int widen_imprecise_scalars(struct bpf_verifier_env *env,
                                   struct bpf_verifier_state *old,
                                   struct bpf_verifier_state *cur)
{
        struct bpf_func_state *fold, *fcur;
        int i, fr, num_slots;

        reset_idmap_scratch(env);
        for (fr = old->curframe; fr >= 0; fr--) {
                fold = old->frame[fr];
                fcur = cur->frame[fr];

                for (i = 0; i < MAX_BPF_REG; i++)
                        maybe_widen_reg(env,
                                        &fold->regs[i],
                                        &fcur->regs[i],
                                        &env->idmap_scratch);

                /* @old and @cur need not have the same amount of stack
                 * allocated in a given frame. Only slots present in both
                 * states can be compared; walking up to the old depth alone
                 * would index fcur->stack[] out of bounds whenever @cur has
                 * the smaller allocation.
                 */
                num_slots = fold->allocated_stack / BPF_REG_SIZE;
                if (fcur->allocated_stack / BPF_REG_SIZE < num_slots)
                        num_slots = fcur->allocated_stack / BPF_REG_SIZE;

                for (i = 0; i < num_slots; i++) {
                        if (!is_spilled_reg(&fold->stack[i]) ||
                            !is_spilled_reg(&fcur->stack[i]))
                                continue;

                        maybe_widen_reg(env,
                                        &fold->stack[i].spilled_ptr,
                                        &fcur->stack[i].spilled_ptr,
                                        &env->idmap_scratch);
                }
        }
        return 0;
}

/* process_iter_next_call() is called when verifier gets to iterator's next
 * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
 * to it as just "iter_next()" in comments below.
 *
 * BPF verifier relies on a crucial contract for any iter_next()
 * implementation: it should *eventually* return NULL, and once that happens
 * it should keep returning NULL. That is, once iterator exhausts elements to
 * iterate, it should never reset or spuriously return new elements.
 *
 * With the assumption of such contract, process_iter_next_call() simulates
 * a fork in the verifier state to validate loop logic correctness and safety
 * without having to simulate infinite amount of iterations.
 *
 * In current state, we first assume that iter_next() returned NULL and
 * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such
 * conditions we should not form an infinite loop and should eventually reach
 * exit.
 *
 * Besides that, we also fork current state and enqueue it for later
 * verification. In a forked state we keep iterator state as ACTIVE
 * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We
 * also bump iteration depth to prevent erroneous infinite loop detection
 * later on (see iter_active_depths_differ() comment for details). In this
 * state we assume that we'll eventually loop back to another iter_next()
 * calls (it could be in exactly same location or in some other instruction,
 * it doesn't matter, we don't make any unnecessary assumptions about this,
 * everything revolves around iterator state in a stack slot, not which
 * instruction is calling iter_next()). When that happens, we either will come
 * to iter_next() with equivalent state and can conclude that next iteration
 * will proceed in exactly the same way as we just verified, so it's safe to
 * assume that loop converges. If not, we'll go on another iteration
 * simulation with a different input state, until all possible starting states
 * are validated or we reach maximum number of instructions limit.
 *
 * This way, we will either exhaustively discover all possible input states
 * that iterator loop can start with and eventually will converge, or we'll
 * effectively regress into bounded loop simulation logic and either reach
 * maximum number of instructions if loop is not provably convergent, or there
 * is some statically known limit on number of iterations (e.g., if there is
 * an explicit `if n > 100 then break;` statement somewhere in the loop).
 *
 * Iteration convergence logic in is_state_visited() relies on exact
 * states comparison, which ignores read and precision marks.
 * This is necessary because read and precision marks are not finalized
 * while in the loop. Exact comparison might preclude convergence for
 * simple programs like below:
 *
 *     i = 0;
 *     while(iter_next(&it))
 *       i++;
 *
 * At each iteration step i++ would produce a new distinct state and
 * eventually instruction processing limit would be reached.
 *
 * To avoid such behavior speculatively forget (widen) range for
 * imprecise scalar registers, if those registers were not precise at the
 * end of the previous iteration and do not match exactly.
 *
 * This is a conservative heuristic that allows to verify wide range of programs,
 * however it precludes verification of programs that conjure an
 * imprecise value on the first loop iteration and use it as precise on a second.
 * For example, the following safe program would fail to verify:
 *
 *     struct bpf_num_iter it;
 *     int arr[10];
 *     int i = 0, a = 0;
 *     bpf_iter_num_new(&it, 0, 10);
 *     while (bpf_iter_num_next(&it)) {
 *       if (a == 0) {
 *         a = 1;
 *         i = 7; // Because i changed verifier would forget
 *                // its range on second loop entry.
 *       } else {
 *         arr[i] = 42; // This would fail to verify.
 *       }
 *     }
 *     bpf_iter_num_destroy(&it);
 */
static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
                                  struct bpf_kfunc_call_arg_meta *meta)
{
        struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
        struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
        struct bpf_reg_state *cur_iter, *queued_iter;
        int iter_frameno = meta->iter.frameno;
        int iter_spi = meta->iter.spi;

        BTF_TYPE_EMIT(struct bpf_iter);

        /* Iterator state lives in the spilled register of the stack slot
         * recorded in meta->iter.
         */
        cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;

        if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
            cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
                verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n",
                        cur_iter->iter.state, iter_state_str(cur_iter->iter.state));
                return -EFAULT;
        }

        if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
                /* Because iter_next() call is a checkpoint is_state_visited()
                 * should guarantee parent state with same call sites and insn_idx.
                 */
                if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx ||
                    !same_callsites(cur_st->parent, cur_st)) {
                        /* Terminate the line like every other verifier
                         * message, so that following log output is not
                         * appended to it.
                         */
                        verbose(env, "bug: bad parent state for iter next call\n");
                        return -EFAULT;
                }
                /* Note cur_st->parent in the call below, it is necessary to skip
                 * checkpoint created for cur_st by is_state_visited()
                 * right at this instruction.
                 */
                prev_st = find_prev_entry(env, cur_st->parent, insn_idx);
                /* branch out active iter state */
                queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
                if (!queued_st)
                        return -ENOMEM;

                queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
                queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
                queued_iter->iter.depth++;
                /* Forget ranges of imprecise scalars that changed since the
                 * previous loop entry.
                 */
                if (prev_st)
                        widen_imprecise_scalars(env, prev_st, queued_st);

                /* In the forked state iter_next() returned non-NULL. */
                queued_fr = queued_st->frame[queued_st->curframe];
                mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
        }

        /* switch to DRAINED state, but keep the depth unchanged */
        /* mark current iter state as drained and assume returned NULL */
        cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
        __mark_reg_const_zero(env, &cur_fr->regs[BPF_REG_0]);

        return 0;
}

static bool arg_type_is_mem_size(enum bpf_arg_type type)
{
        return type == ARG_CONST_SIZE ||
               type == ARG_CONST_SIZE_OR_ZERO;
}

/* True if @type has the OBJ_RELEASE flag set. */
static bool arg_type_is_release(enum bpf_arg_type type)
{
        return !!(type & OBJ_RELEASE);
}

static bool arg_type_is_dynptr(enum bpf_arg_type type)
{
        return base_type(type) == ARG_PTR_TO_DYNPTR;
}

/* Size in bytes of the integer that an ARG_PTR_TO_INT or ARG_PTR_TO_LONG
 * argument points to; -EINVAL for any other argument type.
 */
static int int_ptr_type_to_size(enum bpf_arg_type type)
{
        if (type == ARG_PTR_TO_INT)
                return sizeof(u32);

        if (type == ARG_PTR_TO_LONG)
                return sizeof(u64);

        return -EINVAL;
}

/* Adjust *@arg_type according to the type of the map recorded in @meta.
 *
 * - sockmap/sockhash: ARG_PTR_TO_MAP_VALUE becomes
 *   ARG_PTR_TO_BTF_ID_SOCK_COMMON, anything else is rejected with -EINVAL.
 * - bloom filter: for BPF_FUNC_map_peek_elem the type is set to
 *   ARG_PTR_TO_MAP_VALUE.
 * - all other map types leave *@arg_type untouched.
 *
 * Returns 0 on success and -EACCES if @meta carries no map pointer.
 */
static int resolve_map_arg_type(struct bpf_verifier_env *env,
                                 const struct bpf_call_arg_meta *meta,
                                 enum bpf_arg_type *arg_type)
{
        if (!meta->map_ptr) {
                /* kernel subsystem misconfigured verifier */
                verbose(env, "invalid map_ptr to access map->type\n");
                return -EACCES;
        }

        switch (meta->map_ptr->map_type) {
        case BPF_MAP_TYPE_SOCKMAP:
        case BPF_MAP_TYPE_SOCKHASH:
                if (*arg_type != ARG_PTR_TO_MAP_VALUE) {
                        verbose(env, "invalid arg_type for sockmap/sockhash\n");
                        return -EINVAL;
                }
                *arg_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON;
                break;
        case BPF_MAP_TYPE_BLOOM_FILTER:
                if (meta->func_id == BPF_FUNC_map_peek_elem)
                        *arg_type = ARG_PTR_TO_MAP_VALUE;
                break;
        default:
                break;
        }

        return 0;
}

/* A list of register types, optionally paired with a BTF id.
 *
 * NOTE(review): presumably each table describes the register types accepted
 * for one kind of argument; the consumer is not visible here - confirm.
 */
struct bpf_reg_types {
        /* Up to 10 register types; the tables in this file fill fewer
         * entries and leave the remaining ones zero-initialized.
         */
        const enum bpf_reg_type types[10];
        /* Optional pointer to a BTF id; tables that do not set it leave it NULL. */
        u32 *btf_id;
};

/* Register types accepted for ARG_PTR_TO_SOCK_COMMON (see compatible_reg_types[]). */
static const struct bpf_reg_types sock_types = {
        .types = {
                PTR_TO_SOCK_COMMON,
                PTR_TO_SOCKET,
                PTR_TO_TCP_SOCK,
                PTR_TO_XDP_SOCK,
        },
};

#ifdef CONFIG_NET
/* Register types accepted for ARG_PTR_TO_BTF_ID_SOCK_COMMON: the socket
 * pointer types plus (trusted) PTR_TO_BTF_ID. For the BTF variants,
 * check_reg_type() matches the pointee against .btf_id when the helper
 * prototype provides no BTF ID. Only defined when CONFIG_NET is set.
 */
static const struct bpf_reg_types btf_id_sock_common_types = {
        .types = {
                PTR_TO_SOCK_COMMON,
                PTR_TO_SOCKET,
                PTR_TO_TCP_SOCK,
                PTR_TO_XDP_SOCK,
                PTR_TO_BTF_ID,
                PTR_TO_BTF_ID | PTR_TRUSTED,
        },
        .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
};
#endif

/* Register types accepted for the memory-like argument types
 * ARG_PTR_TO_MAP_KEY, ARG_PTR_TO_MAP_VALUE and ARG_PTR_TO_MEM.
 *
 * check_reg_type() identifies this table by address: a trusted PTR_TO_BTF_ID
 * register matched through it is only allowed when the argument type carries
 * MEM_RDONLY.
 */
static const struct bpf_reg_types mem_types = {
        .types = {
                PTR_TO_STACK,
                PTR_TO_PACKET,
                PTR_TO_PACKET_META,
                PTR_TO_MAP_KEY,
                PTR_TO_MAP_VALUE,
                PTR_TO_MEM,
                PTR_TO_MEM | MEM_RINGBUF,
                PTR_TO_BUF,
                PTR_TO_BTF_ID | PTR_TRUSTED,
        },
};

/* Register types accepted for ARG_PTR_TO_INT and ARG_PTR_TO_LONG. */
static const struct bpf_reg_types int_ptr_types = {
        .types = {
                PTR_TO_STACK,
                PTR_TO_PACKET,
                PTR_TO_PACKET_META,
                PTR_TO_MAP_KEY,
                PTR_TO_MAP_VALUE,
        },
};

/* Register types accepted for ARG_PTR_TO_SPIN_LOCK: a map value or an
 * allocated (MEM_ALLOC) BTF object.
 */
static const struct bpf_reg_types spin_lock_types = {
        .types = {
                PTR_TO_MAP_VALUE,
                PTR_TO_BTF_ID | MEM_ALLOC,
        }
};

/* Single-entry tables; the argument types they serve are listed in
 * compatible_reg_types[]:
 *   fullsock_types       - ARG_PTR_TO_SOCKET
 *   scalar_types         - ARG_CONST_SIZE, ARG_CONST_SIZE_OR_ZERO,
 *                          ARG_CONST_ALLOC_SIZE_OR_ZERO
 *   context_types        - ARG_PTR_TO_CTX
 *   ringbuf_mem_types    - ARG_PTR_TO_RINGBUF_MEM
 *   const_map_ptr_types  - ARG_CONST_MAP_PTR
 */
static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } };
static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } };
static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } };
static const struct bpf_reg_types ringbuf_mem_types = { .types = { PTR_TO_MEM | MEM_RINGBUF } };
static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
/* Register types accepted for ARG_PTR_TO_BTF_ID: plain, trusted or RCU
 * protected BTF pointers.
 */
static const struct bpf_reg_types btf_ptr_types = {
        .types = {
                PTR_TO_BTF_ID,
                PTR_TO_BTF_ID | PTR_TRUSTED,
                PTR_TO_BTF_ID | MEM_RCU,
        },
};
/* Register types accepted for ARG_PTR_TO_PERCPU_BTF_ID: per-CPU BTF pointers,
 * optionally RCU protected or trusted.
 */
static const struct bpf_reg_types percpu_btf_ptr_types = {
        .types = {
                PTR_TO_BTF_ID | MEM_PERCPU,
                PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU,
                PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED,
        }
};
/* Single-entry tables; the argument types they serve are listed in
 * compatible_reg_types[]:
 *   func_ptr_types       - ARG_PTR_TO_FUNC
 *   stack_ptr_types      - ARG_PTR_TO_STACK
 *   const_str_ptr_types  - ARG_PTR_TO_CONST_STR
 *   timer_types          - ARG_PTR_TO_TIMER
 *   kptr_types           - ARG_PTR_TO_KPTR
 * The last three all require a map value pointer; the argument-specific
 * checks are done later in check_func_arg().
 */
static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
/* Register types accepted for ARG_PTR_TO_DYNPTR: a stack pointer or a
 * CONST_PTR_TO_DYNPTR register.
 */
static const struct bpf_reg_types dynptr_types = {
        .types = {
                PTR_TO_STACK,
                CONST_PTR_TO_DYNPTR,
        }
};

/* Lookup table from helper argument type to the set of register types it
 * accepts. check_reg_type() indexes it with base_type(arg_type); an argument
 * type without an entry here (NULL slot) is reported there as an unsupported
 * arg type.
 */
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
        [ARG_PTR_TO_MAP_KEY]                = &mem_types,
        [ARG_PTR_TO_MAP_VALUE]                = &mem_types,
        [ARG_CONST_SIZE]                = &scalar_types,
        [ARG_CONST_SIZE_OR_ZERO]        = &scalar_types,
        [ARG_CONST_ALLOC_SIZE_OR_ZERO]        = &scalar_types,
        [ARG_CONST_MAP_PTR]                = &const_map_ptr_types,
        [ARG_PTR_TO_CTX]                = &context_types,
        [ARG_PTR_TO_SOCK_COMMON]        = &sock_types,
#ifdef CONFIG_NET
        [ARG_PTR_TO_BTF_ID_SOCK_COMMON]        = &btf_id_sock_common_types,
#endif
        [ARG_PTR_TO_SOCKET]                = &fullsock_types,
        [ARG_PTR_TO_BTF_ID]                = &btf_ptr_types,
        [ARG_PTR_TO_SPIN_LOCK]                = &spin_lock_types,
        [ARG_PTR_TO_MEM]                = &mem_types,
        [ARG_PTR_TO_RINGBUF_MEM]        = &ringbuf_mem_types,
        [ARG_PTR_TO_INT]                = &int_ptr_types,
        [ARG_PTR_TO_LONG]                = &int_ptr_types,
        [ARG_PTR_TO_PERCPU_BTF_ID]        = &percpu_btf_ptr_types,
        [ARG_PTR_TO_FUNC]                = &func_ptr_types,
        [ARG_PTR_TO_STACK]                = &stack_ptr_types,
        [ARG_PTR_TO_CONST_STR]                = &const_str_ptr_types,
        [ARG_PTR_TO_TIMER]                = &timer_types,
        [ARG_PTR_TO_KPTR]                = &kptr_types,
        [ARG_PTR_TO_DYNPTR]                = &dynptr_types,
};

/* Check that register @regno holds a type acceptable for helper argument
 * type @arg_type.
 *
 * Stage 1 looks up the accepted register types in compatible_reg_types[] by
 * base_type(@arg_type) and compares the register type against them after
 * folding away the flags the argument type tolerates.
 *
 * Stage 2 only applies to PTR_TO_BTF_ID registers: the pointee type is checked
 * against @arg_btf_id (or, when that is NULL, the table's fallback btf_id),
 * with helper-specific exceptions for bpf_kptr_xchg, bpf_spin_lock/unlock and
 * bpf_sk_release.
 *
 * @meta supplies the helper id and, for bpf_kptr_xchg, the kptr field.
 *
 * Returns 0 when the register is acceptable, otherwise -EACCES or -EFAULT
 * (-EFAULT is only used on "verifier internal error" paths).
 */
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
                          enum bpf_arg_type arg_type,
                          const u32 *arg_btf_id,
                          struct bpf_call_arg_meta *meta)
{
        struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
        enum bpf_reg_type expected, type = reg->type;
        const struct bpf_reg_types *compatible;
        int i, j;

        compatible = compatible_reg_types[base_type(arg_type)];
        if (!compatible) {
                verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type);
                return -EFAULT;
        }

        /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY,
         * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY
         *
         * Same for MAYBE_NULL:
         *
         * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL,
         * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL
         *
         * ARG_PTR_TO_MEM is compatible with PTR_TO_MEM that is tagged with a dynptr type.
         *
         * Therefore we fold these flags depending on the arg_type before comparison.
         */
        if (arg_type & MEM_RDONLY)
                type &= ~MEM_RDONLY;
        if (arg_type & PTR_MAYBE_NULL)
                type &= ~PTR_MAYBE_NULL;
        if (base_type(arg_type) == ARG_PTR_TO_MEM)
                type &= ~DYNPTR_TYPE_FLAG_MASK;

        /* For bpf_kptr_xchg, also fold MEM_ALLOC and MEM_PERCPU out of the
         * compared type so allocated objects match the table entries.
         */
        if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) {
                type &= ~MEM_ALLOC;
                type &= ~MEM_PERCPU;
        }

        /* Scan the accepted types; on a miss, i is left at the number of
         * entries examined and is reused below to print the expected list.
         */
        for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
                expected = compatible->types[i];
                if (expected == NOT_INIT)
                        break;

                if (type == expected)
                        goto found;
        }

        /* No match: print the register's real (unfolded) type followed by the
         * comma-separated list of accepted types.
         */
        verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type));
        for (j = 0; j + 1 < i; j++)
                verbose(env, "%s, ", reg_type_str(env, compatible->types[j]));
        verbose(env, "%s\n", reg_type_str(env, compatible->types[j]));
        return -EACCES;

found:
        /* Only BTF pointers need the additional pointee-type checks below. */
        if (base_type(reg->type) != PTR_TO_BTF_ID)
                return 0;

        /* A BTF pointer matched as plain memory is accepted only when the
         * argument is read-only.
         */
        if (compatible == &mem_types) {
                if (!(arg_type & MEM_RDONLY)) {
                        verbose(env,
                                "%s() may write into memory pointed by R%d type=%s\n",
                                func_id_name(meta->func_id),
                                regno, reg_type_str(env, reg->type));
                        return -EACCES;
                }
                return 0;
        }

        /* Dispatch on the register's full, unfolded type. */
        switch ((int)reg->type) {
        case PTR_TO_BTF_ID:
        case PTR_TO_BTF_ID | PTR_TRUSTED:
        case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL:
        case PTR_TO_BTF_ID | MEM_RCU:
        case PTR_TO_BTF_ID | PTR_MAYBE_NULL:
        case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU:
        {
                /* For bpf_sk_release, it needs to match against first member
                 * 'struct sock_common', hence make an exception for it. This
                 * allows bpf_sk_release to work for multiple socket types.
                 */
                bool strict_type_match = arg_type_is_release(arg_type) &&
                                         meta->func_id != BPF_FUNC_sk_release;

                /* A possibly-NULL register is rejected unless the argument
                 * allows NULL, and always rejected for release arguments.
                 */
                if (type_may_be_null(reg->type) &&
                    (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) {
                        verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno);
                        return -EACCES;
                }

                /* Fall back to the table's BTF ID when the helper prototype
                 * did not provide one.
                 */
                if (!arg_btf_id) {
                        if (!compatible->btf_id) {
                                verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
                                return -EFAULT;
                        }
                        arg_btf_id = compatible->btf_id;
                }

                if (meta->func_id == BPF_FUNC_kptr_xchg) {
                        /* bpf_kptr_xchg matches against the kptr field type
                         * recorded in meta rather than against arg_btf_id.
                         */
                        if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
                                return -EACCES;
                } else {
                        if (arg_btf_id == BPF_PTR_POISON) {
                                verbose(env, "verifier internal error:");
                                verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n",
                                        regno);
                                return -EACCES;
                        }

                        if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
                                                  btf_vmlinux, *arg_btf_id,
                                                  strict_type_match)) {
                                verbose(env, "R%d is of type %s but %s is expected\n",
                                        regno, btf_type_name(reg->btf, reg->btf_id),
                                        btf_type_name(btf_vmlinux, *arg_btf_id));
                                return -EACCES;
                        }
                }
                break;
        }
        case PTR_TO_BTF_ID | MEM_ALLOC:
        case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC:
                /* Allocated objects are only handled for spin lock helpers
                 * and bpf_kptr_xchg.
                 */
                if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
                    meta->func_id != BPF_FUNC_kptr_xchg) {
                        verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
                        return -EFAULT;
                }
                if (meta->func_id == BPF_FUNC_kptr_xchg) {
                        if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
                                return -EACCES;
                }
                break;
        case PTR_TO_BTF_ID | MEM_PERCPU:
        case PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU:
        case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
                /* Handled by helper specific checks */
                break;
        default:
                verbose(env, "verifier internal error: invalid PTR_TO_BTF_ID register for type match\n");
                return -EFAULT;
        }
        return 0;
}

/* Look up the special BTF field of kind(s) @fields located at offset @off in
 * the record associated with @reg.
 *
 * Returns the field, or NULL when @reg has no BTF record or the record has no
 * matching field.
 */
static struct btf_field *
reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields)
{
        struct btf_record *record = reg_btf_record(reg);

        if (!record)
                return NULL;

        /* btf_record_find() already yields NULL when nothing matches. */
        return btf_record_find(record, off, fields);
}

static int check_func_arg_reg_off(struct bpf_verifier_env *env,
                                  const struct bpf_reg_state *reg, int regno,
                                  enum bpf_arg_type arg_type)
{
        u32 type = reg->type;

        /* When referenced register is passed to release function, its fixed
         * offset must be 0.
         *
         * We will check arg_type_is_release reg has ref_obj_id when storing
         * meta->release_regno.
         */
        if (arg_type_is_release(arg_type)) {
                /* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it
                 * may not directly point to the object being released, but to
                 * dynptr pointing to such object, which might be at some offset
                 * on the stack. In that case, we simply to fallback to the
                 * default handling.
                 */
                if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK)
                        return 0;

                /* Doing check_ptr_off_reg check for the offset will catch this
                 * because fixed_off_ok is false, but checking here allows us
                 * to give the user a better error message.
                 */
                if (reg->off) {
                        verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n",
                                regno);
                        return -EINVAL;
                }
                return __check_ptr_off_reg(env, reg, regno, false);
        }

        switch (type) {
        /* Pointer types where both fixed and variable offset is explicitly allowed: */
        case PTR_TO_STACK:
        case PTR_TO_PACKET:
        case PTR_TO_PACKET_META:
        case PTR_TO_MAP_KEY:
        case PTR_TO_MAP_VALUE:
        case PTR_TO_MEM:
        case PTR_TO_MEM | MEM_RDONLY:
        case PTR_TO_MEM | MEM_RINGBUF:
        case PTR_TO_BUF:
        case PTR_TO_BUF | MEM_RDONLY:
        case PTR_TO_ARENA:
        case SCALAR_VALUE:
                return 0;
        /* All the rest must be rejected, except PTR_TO_BTF_ID which allows
         * fixed offset.
         */
        case PTR_TO_BTF_ID:
        case PTR_TO_BTF_ID | MEM_ALLOC:
        case PTR_TO_BTF_ID | PTR_TRUSTED:
        case PTR_TO_BTF_ID | MEM_RCU:
        case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
        case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
                /* When referenced PTR_TO_BTF_ID is passed to release function,
                 * its fixed offset must be 0. In the other cases, fixed offset
                 * can be non-zero. This was already checked above. So pass
                 * fixed_off_ok as true to allow fixed offset for all other
                 * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we
                 * still need to do checks instead of returning.
                 */
                return __check_ptr_off_reg(env, reg, regno, true);
        default:
                return __check_ptr_off_reg(env, reg, regno, false);
        }
}

/* Find the register that carries the single dynptr argument of helper @fn.
 *
 * Scans all MAX_BPF_FUNC_REG_ARGS argument slots of @fn; argument i lives in
 * @regs[BPF_REG_1 + i].
 *
 * Returns the register state, or NULL (after logging a verifier internal
 * error) when the helper has no dynptr argument or more than one.
 */
static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env,
                                                const struct bpf_func_proto *fn,
                                                struct bpf_reg_state *regs)
{
        struct bpf_reg_state *found = NULL;
        int arg;

        for (arg = 0; arg < MAX_BPF_FUNC_REG_ARGS; arg++) {
                if (!arg_type_is_dynptr(fn->arg_type[arg]))
                        continue;

                if (found) {
                        verbose(env, "verifier internal error: multiple dynptr args\n");
                        return NULL;
                }
                found = &regs[BPF_REG_1 + arg];
        }

        if (!found)
                verbose(env, "verifier internal error: no dynptr arg found\n");

        return found;
}

/* Return the id of the dynptr referenced by @reg.
 *
 * For a CONST_PTR_TO_DYNPTR register the id is stored in the register itself;
 * otherwise it is read from the stack slot that dynptr_get_spi() resolves.
 * A negative value is the error returned by dynptr_get_spi().
 */
static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
        struct bpf_func_state *frame = func(env, reg);
        int slot;

        if (reg->type != CONST_PTR_TO_DYNPTR) {
                slot = dynptr_get_spi(env, reg);
                if (slot >= 0)
                        return frame->stack[slot].spilled_ptr.id;
                return slot;
        }

        return reg->id;
}

/* Return the ref_obj_id of the dynptr referenced by @reg.
 *
 * For a CONST_PTR_TO_DYNPTR register the value is stored in the register
 * itself; otherwise it is read from the stack slot that dynptr_get_spi()
 * resolves. A negative value is the error returned by dynptr_get_spi().
 */
static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
        struct bpf_func_state *frame = func(env, reg);
        int slot;

        if (reg->type != CONST_PTR_TO_DYNPTR) {
                slot = dynptr_get_spi(env, reg);
                if (slot >= 0)
                        return frame->stack[slot].spilled_ptr.ref_obj_id;
                return slot;
        }

        return reg->ref_obj_id;
}

/* Return the dynptr type of the dynptr referenced by @reg.
 *
 * For a CONST_PTR_TO_DYNPTR register the type is stored in the register
 * itself; otherwise it is read from the stack slot computed by __get_spi()
 * from the register's fixed offset. A negative slot index is logged as a
 * verifier internal error and yields BPF_DYNPTR_TYPE_INVALID.
 */
static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
                                            struct bpf_reg_state *reg)
{
        struct bpf_func_state *frame = func(env, reg);
        int slot;

        if (reg->type == CONST_PTR_TO_DYNPTR)
                return reg->dynptr.type;

        slot = __get_spi(reg->off);
        if (slot >= 0)
                return frame->stack[slot].spilled_ptr.dynptr.type;

        verbose(env, "verifier internal error: invalid spi when querying dynptr type\n");
        return BPF_DYNPTR_TYPE_INVALID;
}

/* Check that @reg (register number @regno) points to a constant,
 * NUL-terminated string stored in a read-only map value.
 *
 * The checks run in this order: the register is a PTR_TO_MAP_VALUE, the map
 * is read-only, the address is a known constant, the map supports direct
 * value access, the range from the register offset to the end of the value is
 * accessible, and finally a NUL byte exists between the string start and the
 * end of the value.
 *
 * Returns 0 on success, -EINVAL or -EACCES for a rejected register, or the
 * error reported by check_map_access() / map_direct_value_addr().
 */
static int check_reg_const_str(struct bpf_verifier_env *env,
                               struct bpf_reg_state *reg, u32 regno)
{
        struct bpf_map *map = reg->map_ptr;
        u64 value_addr;
        char *value;
        int off, ret;

        if (reg->type != PTR_TO_MAP_VALUE)
                return -EINVAL;

        if (!bpf_map_is_rdonly(map)) {
                verbose(env, "R%d does not point to a readonly map'\n", regno);
                return -EACCES;
        }

        if (!tnum_is_const(reg->var_off)) {
                verbose(env, "R%d is not a constant address'\n", regno);
                return -EACCES;
        }

        if (!map->ops->map_direct_value_addr) {
                verbose(env, "no direct value access support for this map type\n");
                return -EACCES;
        }

        /* Everything from the fixed offset to the end of the value must be
         * readable by a helper.
         */
        ret = check_map_access(env, regno, reg->off,
                               map->value_size - reg->off, false,
                               ACCESS_HELPER);
        if (ret)
                return ret;

        /* Total offset of the string inside the map value. */
        off = reg->off + reg->var_off.value;
        ret = map->ops->map_direct_value_addr(map, &value_addr, off);
        if (ret) {
                verbose(env, "direct value access on string failed\n");
                return ret;
        }

        /* Look for the terminating NUL within the remainder of the value. */
        value = (char *)(long)(value_addr);
        if (strnchr(value + off, map->value_size - off, 0))
                return 0;

        verbose(env, "string is not zero-terminated\n");
        return -EINVAL;
}

/* Verify one argument of a helper call.
 *
 * @arg is the zero-based argument index; the register checked is
 * BPF_REG_1 + @arg. @fn supplies the expected argument type and, for some
 * types, a BTF ID or a fixed size. State that later arguments or the caller
 * rely on (map pointer and uid, release register, ref_obj_id, raw mode,
 * memory size, subprogram number, returned BTF type) is recorded in @meta.
 * @insn_idx is forwarded to process_dynptr_func().
 *
 * The function first validates the register's type and offset, then records
 * release/reference bookkeeping, and finally runs the per-argument-type
 * checks.
 *
 * Returns 0 on success or a negative errno.
 */
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
                          struct bpf_call_arg_meta *meta,
                          const struct bpf_func_proto *fn,
                          int insn_idx)
{
        u32 regno = BPF_REG_1 + arg;
        struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
        enum bpf_arg_type arg_type = fn->arg_type[arg];
        enum bpf_reg_type type = reg->type;
        u32 *arg_btf_id = NULL;
        int err = 0;

        if (arg_type == ARG_DONTCARE)
                return 0;

        err = check_reg_arg(env, regno, SRC_OP);
        if (err)
                return err;

        /* ARG_ANYTHING accepts any value that is not a pointer. */
        if (arg_type == ARG_ANYTHING) {
                if (is_pointer_value(env, regno)) {
                        verbose(env, "R%d leaks addr into helper function\n",
                                regno);
                        return -EACCES;
                }
                return 0;
        }

        if (type_is_pkt_pointer(type) &&
            !may_access_direct_pkt_data(env, meta, BPF_READ)) {
                verbose(env, "helper access to the packet is not allowed\n");
                return -EACCES;
        }

        /* Map value arguments may be retyped depending on the map type. */
        if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) {
                err = resolve_map_arg_type(env, meta, &arg_type);
                if (err)
                        return err;
        }

        if (register_is_null(reg) && type_may_be_null(arg_type))
                /* A NULL register has a SCALAR_VALUE type, so skip
                 * type checking.
                 */
                goto skip_type_check;

        /* arg_btf_id and arg_size are in a union. */
        if (base_type(arg_type) == ARG_PTR_TO_BTF_ID ||
            base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK)
                arg_btf_id = fn->arg_btf_id[arg];

        err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
        if (err)
                return err;

        err = check_func_arg_reg_off(env, reg, regno, arg_type);
        if (err)
                return err;

skip_type_check:
        /* Release arguments: make sure something releasable is passed and
         * remember which register it is. Only one release argument is allowed.
         */
        if (arg_type_is_release(arg_type)) {
                if (arg_type_is_dynptr(arg_type)) {
                        struct bpf_func_state *state = func(env, reg);
                        int spi;

                        /* Only dynptr created on stack can be released, thus
                         * the get_spi and stack state checks for spilled_ptr
                         * should only be done before process_dynptr_func for
                         * PTR_TO_STACK.
                         */
                        if (reg->type == PTR_TO_STACK) {
                                spi = dynptr_get_spi(env, reg);
                                if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) {
                                        verbose(env, "arg %d is an unacquired reference\n", regno);
                                        return -EINVAL;
                                }
                        } else {
                                verbose(env, "cannot release unowned const bpf_dynptr\n");
                                return -EINVAL;
                        }
                } else if (!reg->ref_obj_id && !register_is_null(reg)) {
                        verbose(env, "R%d must be referenced when passed to release function\n",
                                regno);
                        return -EINVAL;
                }
                if (meta->release_regno) {
                        verbose(env, "verifier internal error: more than one release argument\n");
                        return -EFAULT;
                }
                meta->release_regno = regno;
        }

        /* At most one argument of a helper call may carry a ref_obj_id. */
        if (reg->ref_obj_id) {
                if (meta->ref_obj_id) {
                        verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
                                regno, reg->ref_obj_id,
                                meta->ref_obj_id);
                        return -EFAULT;
                }
                meta->ref_obj_id = reg->ref_obj_id;
        }

        /* Per-argument-type checks. Argument types without a case here need
         * nothing beyond the type and offset checks above.
         */
        switch (base_type(arg_type)) {
        case ARG_CONST_MAP_PTR:
                /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
                if (meta->map_ptr) {
                        /* Use map_uid (which is unique id of inner map) to reject:
                         * inner_map1 = bpf_map_lookup_elem(outer_map, key1)
                         * inner_map2 = bpf_map_lookup_elem(outer_map, key2)
                         * if (inner_map1 && inner_map2) {
                         *     timer = bpf_map_lookup_elem(inner_map1);
                         *     if (timer)
                         *         // mismatch would have been allowed
                         *         bpf_timer_init(timer, inner_map2);
                         * }
                         *
                         * Comparing map_ptr is enough to distinguish normal and outer maps.
                         */
                        if (meta->map_ptr != reg->map_ptr ||
                            meta->map_uid != reg->map_uid) {
                                verbose(env,
                                        "timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
                                        meta->map_uid, reg->map_uid);
                                return -EINVAL;
                        }
                }
                meta->map_ptr = reg->map_ptr;
                meta->map_uid = reg->map_uid;
                break;
        case ARG_PTR_TO_MAP_KEY:
                /* bpf_map_xxx(..., map_ptr, ..., key) call:
                 * check that [key, key + map->key_size) are within
                 * stack limits and initialized
                 */
                if (!meta->map_ptr) {
                        /* in function declaration map_ptr must come before
                         * map_key, so that it's verified and known before
                         * we have to check map_key here. Otherwise it means
                         * that kernel subsystem misconfigured verifier
                         */
                        verbose(env, "invalid map_ptr to access map->key\n");
                        return -EACCES;
                }
                err = check_helper_mem_access(env, regno,
                                              meta->map_ptr->key_size, false,
                                              NULL);
                break;
        case ARG_PTR_TO_MAP_VALUE:
                if (type_may_be_null(arg_type) && register_is_null(reg))
                        return 0;

                /* bpf_map_xxx(..., map_ptr, ..., value) call:
                 * check [value, value + map->value_size) validity
                 */
                if (!meta->map_ptr) {
                        /* kernel subsystem misconfigured verifier */
                        verbose(env, "invalid map_ptr to access map->value\n");
                        return -EACCES;
                }
                meta->raw_mode = arg_type & MEM_UNINIT;
                err = check_helper_mem_access(env, regno,
                                              meta->map_ptr->value_size, false,
                                              meta);
                break;
        case ARG_PTR_TO_PERCPU_BTF_ID:
                if (!reg->btf_id) {
                        verbose(env, "Helper has invalid btf_id in R%d\n", regno);
                        return -EACCES;
                }
                /* Remember the BTF type of the per-CPU object in meta. */
                meta->ret_btf = reg->btf;
                meta->ret_btf_id = reg->btf_id;
                break;
        case ARG_PTR_TO_SPIN_LOCK:
                if (in_rbtree_lock_required_cb(env)) {
                        verbose(env, "can't spin_{lock,unlock} in rbtree cb\n");
                        return -EACCES;
                }
                if (meta->func_id == BPF_FUNC_spin_lock) {
                        err = process_spin_lock(env, regno, true);
                        if (err)
                                return err;
                } else if (meta->func_id == BPF_FUNC_spin_unlock) {
                        err = process_spin_lock(env, regno, false);
                        if (err)
                                return err;
                } else {
                        verbose(env, "verifier internal error\n");
                        return -EFAULT;
                }
                break;
        case ARG_PTR_TO_TIMER:
                err = process_timer_func(env, regno, meta);
                if (err)
                        return err;
                break;
        case ARG_PTR_TO_FUNC:
                meta->subprogno = reg->subprogno;
                break;
        case ARG_PTR_TO_MEM:
                /* The access to this pointer is only checked when we hit the
                 * next is_mem_size argument below.
                 */
                meta->raw_mode = arg_type & MEM_UNINIT;
                /* Fixed-size memory has no companion size argument, so it is
                 * checked right here against the size from the prototype.
                 */
                if (arg_type & MEM_FIXED_SIZE) {
                        err = check_helper_mem_access(env, regno,
                                                      fn->arg_size[arg], false,
                                                      meta);
                }
                break;
        case ARG_CONST_SIZE:
                err = check_mem_size_reg(env, reg, regno, false, meta);
                break;
        case ARG_CONST_SIZE_OR_ZERO:
                err = check_mem_size_reg(env, reg, regno, true, meta);
                break;
        case ARG_PTR_TO_DYNPTR:
                err = process_dynptr_func(env, regno, insn_idx, arg_type, 0);
                if (err)
                        return err;
                break;
        case ARG_CONST_ALLOC_SIZE_OR_ZERO:
                /* The allocation size must be a compile-time known constant. */
                if (!tnum_is_const(reg->var_off)) {
                        verbose(env, "R%d is not a known constant'\n",
                                regno);
                        return -EACCES;
                }
                meta->mem_size = reg->var_off.value;
                err = mark_chain_precision(env, regno);
                if (err)
                        return err;
                break;
        case ARG_PTR_TO_INT:
        case ARG_PTR_TO_LONG:
        {
                /* NOTE(review): int_ptr_type_to_size() compares the full
                 * arg_type, so it returns -EINVAL if flag bits are set -
                 * confirm no prototype passes flagged INT/LONG types.
                 */
                int size = int_ptr_type_to_size(arg_type);

                err = check_helper_mem_access(env, regno, size, false, meta);
                if (err)
                        return err;
                err = check_ptr_alignment(env, reg, 0, size, true);
                break;
        }
        case ARG_PTR_TO_CONST_STR:
        {
                err = check_reg_const_str(env, reg, regno);
                if (err)
                        return err;
                break;
        }
        case ARG_PTR_TO_KPTR:
                err = process_kptr_func(env, regno, meta);
                if (err)
                        return err;
                break;
        }

        return err;
}

/* Decide whether the current program may call the generic map update/delete
 * helper @func_id on a sockmap or sockhash.
 *
 * Anything other than bpf_map_update_elem / bpf_map_delete_elem is refused
 * silently. For those two helpers the answer depends on the resolved program
 * type (and, for tracing programs, the expected attach type); a refusal is
 * logged to the verifier log.
 */
static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
{
        enum bpf_attach_type eatype = env->prog->expected_attach_type;
        enum bpf_prog_type type = resolve_prog_type(env->prog);
        bool allowed = false;

        if (!(func_id == BPF_FUNC_map_update_elem ||
              func_id == BPF_FUNC_map_delete_elem))
                return false;

        /* It's not possible to get access to a locked struct sock in these
         * contexts, so updating is safe.
         */
        switch (type) {
        case BPF_PROG_TYPE_TRACING:
                /* Only iterator programs qualify. */
                allowed = (eatype == BPF_TRACE_ITER);
                break;
        case BPF_PROG_TYPE_SOCK_OPS:
                /* map_update allowed only via dedicated helpers with event type checks */
                allowed = (func_id == BPF_FUNC_map_delete_elem);
                break;
        case BPF_PROG_TYPE_SOCKET_FILTER:
        case BPF_PROG_TYPE_SCHED_CLS:
        case BPF_PROG_TYPE_SCHED_ACT:
        case BPF_PROG_TYPE_XDP:
        case BPF_PROG_TYPE_SK_REUSEPORT:
        case BPF_PROG_TYPE_FLOW_DISSECTOR:
        case BPF_PROG_TYPE_SK_LOOKUP:
                allowed = true;
                break;
        default:
                break;
        }

        if (!allowed)
                verbose(env, "cannot update sockmap in this context\n");

        return allowed;
}

static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env)
{
        return env->prog->jit_requested &&
               bpf_jit_supports_subprog_tailcalls();
}

/* Check that helper 'func_id' may be called with a map of 'map''s type.
 *
 * The check is done in both directions: first, specialised map types list
 * the only helpers that may touch them; second, specialised helpers list the
 * only map types they accept. Map types and helpers not listed in the
 * respective switch are unrestricted from that side.
 *
 * Returns 0 when the combination is acceptable (or when 'map' is NULL) and
 * -EINVAL otherwise, after writing the reason to the verifier log.
 */
static int check_map_func_compatibility(struct bpf_verifier_env *env,
                                        struct bpf_map *map, int func_id)
{
        /* nothing to check when the call carries no map */
        if (!map)
                return 0;

        /* We need a two way check, first is from map perspective ... */
        switch (map->map_type) {
        case BPF_MAP_TYPE_PROG_ARRAY:
                if (func_id != BPF_FUNC_tail_call)
                        goto error;
                break;
        case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
                if (func_id != BPF_FUNC_perf_event_read &&
                    func_id != BPF_FUNC_perf_event_output &&
                    func_id != BPF_FUNC_skb_output &&
                    func_id != BPF_FUNC_perf_event_read_value &&
                    func_id != BPF_FUNC_xdp_output)
                        goto error;
                break;
        case BPF_MAP_TYPE_RINGBUF:
                if (func_id != BPF_FUNC_ringbuf_output &&
                    func_id != BPF_FUNC_ringbuf_reserve &&
                    func_id != BPF_FUNC_ringbuf_query &&
                    func_id != BPF_FUNC_ringbuf_reserve_dynptr &&
                    func_id != BPF_FUNC_ringbuf_submit_dynptr &&
                    func_id != BPF_FUNC_ringbuf_discard_dynptr)
                        goto error;
                break;
        case BPF_MAP_TYPE_USER_RINGBUF:
                if (func_id != BPF_FUNC_user_ringbuf_drain)
                        goto error;
                break;
        case BPF_MAP_TYPE_STACK_TRACE:
                if (func_id != BPF_FUNC_get_stackid)
                        goto error;
                break;
        case BPF_MAP_TYPE_CGROUP_ARRAY:
                if (func_id != BPF_FUNC_skb_under_cgroup &&
                    func_id != BPF_FUNC_current_task_under_cgroup)
                        goto error;
                break;
        case BPF_MAP_TYPE_CGROUP_STORAGE:
        case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
                if (func_id != BPF_FUNC_get_local_storage)
                        goto error;
                break;
        case BPF_MAP_TYPE_DEVMAP:
        case BPF_MAP_TYPE_DEVMAP_HASH:
                if (func_id != BPF_FUNC_redirect_map &&
                    func_id != BPF_FUNC_map_lookup_elem)
                        goto error;
                break;
        /* Restrict bpf side of cpumap and xskmap, open when use-cases
         * appear.
         */
        case BPF_MAP_TYPE_CPUMAP:
                if (func_id != BPF_FUNC_redirect_map)
                        goto error;
                break;
        case BPF_MAP_TYPE_XSKMAP:
                if (func_id != BPF_FUNC_redirect_map &&
                    func_id != BPF_FUNC_map_lookup_elem)
                        goto error;
                break;
        case BPF_MAP_TYPE_ARRAY_OF_MAPS:
        case BPF_MAP_TYPE_HASH_OF_MAPS:
                if (func_id != BPF_FUNC_map_lookup_elem)
                        goto error;
                break;
        case BPF_MAP_TYPE_SOCKMAP:
                /* may_update_sockmap() is deliberately the last operand: it
                 * is only evaluated (and may only log) when func_id is none
                 * of the dedicated helpers listed before it.
                 */
                if (func_id != BPF_FUNC_sk_redirect_map &&
                    func_id != BPF_FUNC_sock_map_update &&
                    func_id != BPF_FUNC_msg_redirect_map &&
                    func_id != BPF_FUNC_sk_select_reuseport &&
                    func_id != BPF_FUNC_map_lookup_elem &&
                    !may_update_sockmap(env, func_id))
                        goto error;
                break;
        case BPF_MAP_TYPE_SOCKHASH:
                /* same structure as the SOCKMAP case, with the *_hash helpers */
                if (func_id != BPF_FUNC_sk_redirect_hash &&
                    func_id != BPF_FUNC_sock_hash_update &&
                    func_id != BPF_FUNC_msg_redirect_hash &&
                    func_id != BPF_FUNC_sk_select_reuseport &&
                    func_id != BPF_FUNC_map_lookup_elem &&
                    !may_update_sockmap(env, func_id))
                        goto error;
                break;
        case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
                if (func_id != BPF_FUNC_sk_select_reuseport)
                        goto error;
                break;
        case BPF_MAP_TYPE_QUEUE:
        case BPF_MAP_TYPE_STACK:
                if (func_id != BPF_FUNC_map_peek_elem &&
                    func_id != BPF_FUNC_map_pop_elem &&
                    func_id != BPF_FUNC_map_push_elem)
                        goto error;
                break;
        case BPF_MAP_TYPE_SK_STORAGE:
                if (func_id != BPF_FUNC_sk_storage_get &&
                    func_id != BPF_FUNC_sk_storage_delete &&
                    func_id != BPF_FUNC_kptr_xchg)
                        goto error;
                break;
        case BPF_MAP_TYPE_INODE_STORAGE:
                if (func_id != BPF_FUNC_inode_storage_get &&
                    func_id != BPF_FUNC_inode_storage_delete &&
                    func_id != BPF_FUNC_kptr_xchg)
                        goto error;
                break;
        case BPF_MAP_TYPE_TASK_STORAGE:
                if (func_id != BPF_FUNC_task_storage_get &&
                    func_id != BPF_FUNC_task_storage_delete &&
                    func_id != BPF_FUNC_kptr_xchg)
                        goto error;
                break;
        case BPF_MAP_TYPE_CGRP_STORAGE:
                if (func_id != BPF_FUNC_cgrp_storage_get &&
                    func_id != BPF_FUNC_cgrp_storage_delete &&
                    func_id != BPF_FUNC_kptr_xchg)
                        goto error;
                break;
        case BPF_MAP_TYPE_BLOOM_FILTER:
                /* pop is not in the list: only peek and push are accepted */
                if (func_id != BPF_FUNC_map_peek_elem &&
                    func_id != BPF_FUNC_map_push_elem)
                        goto error;
                break;
        default:
                /* map types not listed above accept any helper from this side */
                break;
        }

        /* ... and second from the function itself. */
        switch (func_id) {
        case BPF_FUNC_tail_call:
                if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
                        goto error;
                /* This rejection has its own message and therefore returns
                 * directly instead of going through the 'error' label.
                 */
                if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) {
                        verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
                        return -EINVAL;
                }
                break;
        case BPF_FUNC_perf_event_read:
        case BPF_FUNC_perf_event_output:
        case BPF_FUNC_perf_event_read_value:
        case BPF_FUNC_skb_output:
        case BPF_FUNC_xdp_output:
                if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
                        goto error;
                break;
        case BPF_FUNC_ringbuf_output:
        case BPF_FUNC_ringbuf_reserve:
        case BPF_FUNC_ringbuf_query:
        case BPF_FUNC_ringbuf_reserve_dynptr:
        case BPF_FUNC_ringbuf_submit_dynptr:
        case BPF_FUNC_ringbuf_discard_dynptr:
                if (map->map_type != BPF_MAP_TYPE_RINGBUF)
                        goto error;
                break;
        case BPF_FUNC_user_ringbuf_drain:
                if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
                        goto error;
                break;
        case BPF_FUNC_get_stackid:
                if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
                        goto error;
                break;
        case BPF_FUNC_current_task_under_cgroup:
        case BPF_FUNC_skb_under_cgroup:
                if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
                        goto error;
                break;
        case BPF_FUNC_redirect_map:
                if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
                    map->map_type != BPF_MAP_TYPE_DEVMAP_HASH &&
                    map->map_type != BPF_MAP_TYPE_CPUMAP &&
                    map->map_type != BPF_MAP_TYPE_XSKMAP)
                        goto error;
                break;
        case BPF_FUNC_sk_redirect_map:
        case BPF_FUNC_msg_redirect_map:
        case BPF_FUNC_sock_map_update:
                if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
                        goto error;
                break;
        case BPF_FUNC_sk_redirect_hash:
        case BPF_FUNC_msg_redirect_hash:
        case BPF_FUNC_sock_hash_update:
                if (map->map_type != BPF_MAP_TYPE_SOCKHASH)
                        goto error;
                break;
        case BPF_FUNC_get_local_storage:
                if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
                    map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
                        goto error;
                break;
        case BPF_FUNC_sk_select_reuseport:
                if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY &&
                    map->map_type != BPF_MAP_TYPE_SOCKMAP &&
                    map->map_type != BPF_MAP_TYPE_SOCKHASH)
                        goto error;
                break;
        case BPF_FUNC_map_pop_elem:
                if (map->map_type != BPF_MAP_TYPE_QUEUE &&
                    map->map_type != BPF_MAP_TYPE_STACK)
                        goto error;
                break;
        case BPF_FUNC_map_peek_elem:
        case BPF_FUNC_map_push_elem:
                if (map->map_type != BPF_MAP_TYPE_QUEUE &&
                    map->map_type != BPF_MAP_TYPE_STACK &&
                    map->map_type != BPF_MAP_TYPE_BLOOM_FILTER)
                        goto error;
                break;
        case BPF_FUNC_map_lookup_percpu_elem:
                if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
                    map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
                    map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH)
                        goto error;
                break;
        case BPF_FUNC_sk_storage_get:
        case BPF_FUNC_sk_storage_delete:
                if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
                        goto error;
                break;
        case BPF_FUNC_inode_storage_get:
        case BPF_FUNC_inode_storage_delete:
                if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE)
                        goto error;
                break;
        case BPF_FUNC_task_storage_get:
        case BPF_FUNC_task_storage_delete:
                if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
                        goto error;
                break;
        case BPF_FUNC_cgrp_storage_get:
        case BPF_FUNC_cgrp_storage_delete:
                if (map->map_type != BPF_MAP_TYPE_CGRP_STORAGE)
                        goto error;
                break;
        default:
                /* helpers not listed above accept any map type from this side */
                break;
        }

        return 0;
error:
        /* common rejection path: report the offending map type / helper pair */
        verbose(env, "cannot pass map_type %d into func %s#%d\n",
                map->map_type, func_id_name(func_id), func_id);
        return -EINVAL;
}

/* Sanity-check a helper prototype: at most one of its five arguments may be
 * declared as ARG_PTR_TO_UNINIT_MEM.
 *
 * We only support one arg being in raw mode at the moment, which is
 * sufficient for the helper functions we have right now.
 */
static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
{
        int raw_args;

        /* each comparison contributes 0 or 1 to the total */
        raw_args = (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM) +
                   (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM) +
                   (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM) +
                   (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM) +
                   (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM);

        return raw_args <= 1;
}

static bool check_args_pair_invalid(const struct bpf_func_proto *fn, int arg)
{
        bool is_fixed = fn->arg_type[arg] & MEM_FIXED_SIZE;
        bool has_size = fn->arg_size[arg] != 0;
        bool is_next_size = false;

        if (arg + 1 < ARRAY_SIZE(fn->arg_type))
                is_next_size = arg_type_is_mem_size(fn->arg_type[arg + 1]);

        if (base_type(fn->arg_type[arg]) != ARG_PTR_TO_MEM)
                return is_next_size;

        return has_size == is_next_size || is_next_size == is_fixed;
}

/* A bpf_xxx(..., buf, len) call accesses 'len' bytes of memory 'buf', so the
 * two argument types have to come as a pair. Validate the whole prototype to
 * catch a buggy helper specification.
 */
static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
{
        int arg;

        /* a size argument in the first slot has nothing in front of it */
        if (arg_type_is_mem_size(fn->arg1_type))
                return false;

        /* argument slots 0..4 correspond to arg1..arg5 */
        for (arg = 0; arg < 5; arg++) {
                if (check_args_pair_invalid(fn, arg))
                        return false;
        }

        return true;
}

static bool check_btf_id_ok(const struct bpf_func_proto *fn)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
                if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID)
                        return !!fn->arg_btf_id[i];
                if (base_type(fn->arg_type[i]) == ARG_PTR_TO_SPIN_LOCK)
                        return fn->arg_btf_id[i] == BPF_PTR_POISON;
                if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] &&
                    /* arg_btf_id and arg_size are in a union. */
                    (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM ||
                     !(fn->arg_type[i] & MEM_FIXED_SIZE)))
                        return false;
        }

        return true;
}

/* Run all static sanity checks on a helper prototype.
 * Returns 0 when every check passes and -EINVAL as soon as one fails.
 * 'func_id' is not consulted by any of the checks.
 */
static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
{
        if (!check_raw_mode_ok(fn))
                return -EINVAL;
        if (!check_arg_pair_ok(fn))
                return -EINVAL;
        if (!check_btf_id_ok(fn))
                return -EINVAL;

        return 0;
}

/* Packet data might have moved, so every existing PTR_TO_PACKET[_META,_END]
 * is stale and gets turned into an unknown SCALAR_VALUE.
 *
 * Dynptr slices that belong to skb and xdp dynptrs point to packet data as
 * well, so they are invalidated in the same pass.
 */
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
        struct bpf_func_state *frame;
        struct bpf_reg_state *r;

        bpf_for_each_reg_in_vstate(env->cur_state, frame, r, ({
                if (reg_is_pkt_pointer_any(r))
                        mark_reg_invalid(env, r);
                else if (reg_is_dynptr_slice_pkt(r))
                        mark_reg_invalid(env, r);
        }));
}

/* Negative marker values that mark_pkt_end() stores in a PTR_TO_PACKET
 * register's 'range' field after a comparison against pkt_end.
 */
enum {
        /* set when !range_open: the pkt >= pkt_end case */
        AT_PKT_END = -1,
        /* set when range_open: pkt > pkt_end, at least 1 byte past pkt_end */
        BEYOND_PKT_END = -2,
};

/* Record on register 'regn' of the current frame that it was found to be at
 * or beyond pkt_end. Registers that are not PTR_TO_PACKET are left untouched.
 */
static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range_open)
{
        struct bpf_reg_state *reg = &vstate->frame[vstate->curframe]->regs[regn];

        /* PTR_TO_PACKET_META is not supported yet */
        if (reg->type != PTR_TO_PACKET)
                return;

        /* The 'reg' is pkt > pkt_end or pkt >= pkt_end; how far beyond
         * pkt_end it goes is unknown.
         *   range_open:  pkt >  pkt_end, so the pointer is at least 1 byte
         *                bigger than pkt_end
         *   !range_open: pkt >= pkt_end
         */
        reg->range = range_open ? BEYOND_PKT_END : AT_PKT_END;
}

/* The pointer identified by 'ref_obj_id' has given up its reference to a
 * kernel resource. Drop the reference from the current function state, then
 * find every register copy carrying the same id and invalidate it.
 *
 * Returns 0 on success, or the error from release_reference_state(), in
 * which case no register is touched.
 */
static int release_reference(struct bpf_verifier_env *env,
                             int ref_obj_id)
{
        struct bpf_func_state *frame;
        struct bpf_reg_state *r;
        int ret = release_reference_state(cur_func(env), ref_obj_id);

        if (ret)
                return ret;

        bpf_for_each_reg_in_vstate(env->cur_state, frame, r, ({
                if (r->ref_obj_id == ref_obj_id)
                        mark_reg_invalid(env, r);
        }));

        return 0;
}

/* Invalidate every register in the current verifier state whose type is a
 * non-owning reference.
 */
static void invalidate_non_owning_refs(struct bpf_verifier_env *env)
{
        struct bpf_func_state *frame; /* required by the iterator, not read here */
        struct bpf_reg_state *r;

        bpf_for_each_reg_in_vstate(env->cur_state, frame, r, ({
                if (type_is_non_owning_ref(r->type))
                        mark_reg_invalid(env, r);
        }));
}

/* After a call the caller-saved registers (r0 - r5) were scratched: mark
 * each of them uninitialized and record the write via __check_reg_arg().
 */
static void clear_caller_saved_regs(struct bpf_verifier_env *env,
                                    struct bpf_reg_state *regs)
{
        int idx = 0;

        while (idx < CALLER_SAVED_REGS) {
                int regno = caller_saved[idx++];

                mark_reg_not_init(env, regs, regno);
                __check_reg_arg(env, regs, regno, DST_OP_NO_MARK);
        }
}

/* Callback that fills in the register state of a freshly created callee
 * frame from the caller's frame.
 *
 * @env:      verifier environment
 * @caller:   frame performing the call
 * @callee:   new frame whose argument registers are to be set up
 * @insn_idx: index of the calling instruction
 *
 * Returns 0 on success; any non-zero value is treated as an error by the
 * code that invokes the callback.
 */
typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
                                   struct bpf_func_state *caller,
                                   struct bpf_func_state *callee,
                                   int insn_idx);

/* Forward declaration: the definition appears further down in this file,
 * but check_func_call() needs to pass it to setup_func_entry() earlier.
 */
static int set_callee_state(struct bpf_verifier_env *env,
                            struct bpf_func_state *caller,
                            struct bpf_func_state *callee, int insn_idx);

/* Allocate and initialise the frame for a call into 'subprog' on top of
 * 'state', and make it the current frame.
 *
 * @callsite:            instruction index of the call, remembered in the new
 *                       frame for use by bpf_exit
 * @set_callee_state_cb: fills in the callee's registers from the caller's
 *
 * Returns 0 on success. On failure 'state' keeps its current frame:
 *   -E2BIG  the call chain would exceed MAX_CALL_FRAMES
 *   -EFAULT the next frame slot is unexpectedly occupied (verifier bug)
 *   -ENOMEM the frame could not be allocated
 *   or the error from copy_reference_state() / the callback.
 */
static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int callsite,
                            set_callee_state_fn set_callee_state_cb,
                            struct bpf_verifier_state *state)
{
        struct bpf_func_state *caller, *callee;
        int ret;

        if (state->curframe + 1 >= MAX_CALL_FRAMES) {
                verbose(env, "the call stack of %d frames is too deep\n",
                        state->curframe + 2);
                return -E2BIG;
        }

        if (state->frame[state->curframe + 1]) {
                verbose(env, "verifier bug. Frame %d already allocated\n",
                        state->curframe + 1);
                return -EFAULT;
        }

        caller = state->frame[state->curframe];
        callee = kzalloc(sizeof(*callee), GFP_KERNEL);
        if (!callee)
                return -ENOMEM;
        state->frame[state->curframe + 1] = callee;

        /* callee cannot access r0, r6 - r9 for reading and has to write
         * into its own stack before reading from it.
         * callee can read/write into caller's stack
         */
        init_func_state(env, callee,
                        /* remember the callsite, it will be used by bpf_exit */
                        callsite,
                        state->curframe + 1 /* frameno within this callchain */,
                        subprog /* subprog number within this prog */);

        /* Transfer references to the callee, then let the callback set up
         * the callee's registers; the callback is skipped if the copy failed.
         */
        ret = copy_reference_state(callee, caller);
        if (!ret)
                ret = set_callee_state_cb(env, caller, callee, callsite);
        if (ret) {
                /* undo the allocation and unhook the half-built frame */
                free_func_state(callee);
                state->frame[state->curframe + 1] = NULL;
                return ret;
        }

        /* only increment it after check_reg_arg() finished */
        state->curframe++;

        return 0;
}

/* Check that the registers the caller passes to 'subprog' match the argument
 * types derived from the subprogram's BTF.
 *
 * @env:     verifier environment
 * @subprog: index of the subprogram being called
 * @btf:     BTF of the program (not consulted by this function itself)
 * @regs:    caller's registers; argument i is expected in R(i + 1)
 *
 * Returns 0 when all arguments match. Otherwise returns the error from
 * btf_prepare_func_args() or from the per-argument check that failed
 * (-EINVAL for a type mismatch), or -EFAULT when an argument has a type
 * this function does not know how to check (verifier bug).
 */
static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
                                    const struct btf *btf,
                                    struct bpf_reg_state *regs)
{
        struct bpf_subprog_info *sub = subprog_info(env, subprog);
        struct bpf_verifier_log *log = &env->log;
        u32 i;
        int ret;

        ret = btf_prepare_func_args(env, subprog);
        if (ret)
                return ret;

        /* check that BTF function arguments match actual types that the
         * verifier sees.
         */
        for (i = 0; i < sub->arg_cnt; i++) {
                u32 regno = i + 1;
                struct bpf_reg_state *reg = &regs[regno];
                struct bpf_subprog_arg_info *arg = &sub->args[i];

                if (arg->arg_type == ARG_ANYTHING) {
                        if (reg->type != SCALAR_VALUE) {
                                bpf_log(log, "R%d is not a scalar\n", regno);
                                return -EINVAL;
                        }
                } else if (arg->arg_type == ARG_PTR_TO_CTX) {
                        ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
                        if (ret < 0)
                                return ret;
                        /* If function expects ctx type in BTF check that caller
                         * is passing PTR_TO_CTX.
                         */
                        if (reg->type != PTR_TO_CTX) {
                                bpf_log(log, "arg#%d expects pointer to ctx\n", i);
                                return -EINVAL;
                        }
                } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
                        ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
                        if (ret < 0)
                                return ret;
                        if (check_mem_reg(env, reg, regno, arg->mem_size))
                                return -EINVAL;
                        if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) {
                                bpf_log(log, "arg#%d is expected to be non-NULL\n", i);
                                return -EINVAL;
                        }
                } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
                        /*
                         * Can pass any value and the kernel won't crash, but
                         * only PTR_TO_ARENA or SCALAR make sense. Everything
                         * else is a bug in the bpf program. Point it out to
                         * the user at the verification time instead of
                         * run-time debug nightmare.
                         */
                        if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) {
                                bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno);
                                return -EINVAL;
                        }
                } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
                        /* Validate the register offset first, like every
                         * other pointer argument above. Without this a
                         * dynptr pointer whose offset was modified by the
                         * caller would be accepted and then used by helpers
                         * inside the global function, leading to
                         * out-of-bounds accesses.
                         */
                        ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR);
                        if (ret)
                                return ret;

                        ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
                        if (ret)
                                return ret;
                } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
                        struct bpf_call_arg_meta meta;
                        int err;

                        /* a NULL register is fine for a nullable argument */
                        if (register_is_null(reg) && type_may_be_null(arg->arg_type))
                                continue;

                        memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */
                        err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta);
                        err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type);
                        if (err)
                                return err;
                } else {
                        bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n",
                                i, arg->arg_type);
                        return -EFAULT;
                }
        }

        return 0;
}

/* Compare the BTF of the function being called with the given register
 * state.
 * Returns:
 * -EFAULT - the subprogram has no BTF type id recorded (verifier bug), or
 *           the argument matching itself reported a verifier bug. Abort
 *           verification.
 * -EINVAL - func_info is not available, the subprogram was already marked
 *           unreliable, or there is a type mismatch.
 * 0       - BTF matches with what bpf_reg_state expects.
 * Other errors from btf_check_func_arg_match() are passed through.
 *
 * Any failure of the argument matching marks the subprogram as unreliable.
 */
static int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
                                  struct bpf_reg_state *regs)
{
        struct bpf_prog *prog = env->prog;
        int ret;

        if (!prog->aux->func_info)
                return -EINVAL;

        if (!prog->aux->func_info[subprog].type_id)
                return -EFAULT;

        if (prog->aux->func_info_aux[subprog].unreliable)
                return -EINVAL;

        ret = btf_check_func_arg_match(env, subprog, prog->aux->btf, regs);
        if (!ret)
                return 0;

        /* Compiler optimizations can remove arguments from static functions
         * or mismatched type can be passed into a global function.
         * In such cases mark the function as unreliable from BTF point of view.
         */
        prog->aux->func_info_aux[subprog].unreliable = true;
        return ret;
}

/* Handle a helper or kfunc call at 'insn_idx' that takes 'subprog' as a
 * callback.
 *
 * For asynchronous callbacks a separate async state is pushed and its first
 * frame is set up through 'set_callee_state_cb'. For synchronous callbacks a
 * new state starting at the callback's first instruction is pushed on the
 * exploration stack and a callee frame is set up in it, while verification
 * of the current state proceeds with the next instruction.
 *
 * Returns 0 on success, -EFAULT on a verifier bug or when the async state
 * cannot be pushed, -ENOMEM when the callback state cannot be pushed, or the
 * error reported while setting up the callee.
 */
static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
                              int insn_idx, int subprog,
                              set_callee_state_fn set_callee_state_cb)
{
        struct bpf_verifier_state *cur = env->cur_state;
        struct bpf_func_state *caller = cur->frame[cur->curframe];
        struct bpf_verifier_state *cb_state;
        int ret;

        /* only -EFAULT aborts here; any other result is not acted upon */
        ret = btf_check_subprog_call(env, subprog, caller->regs);
        if (ret == -EFAULT)
                return ret;

        /* set_callee_state is used for direct subprog calls, but we are
         * interested in validating only BPF helpers that can call subprogs as
         * callbacks
         */
        env->subprog_info[subprog].is_cb = true;

        if (bpf_pseudo_kfunc_call(insn)) {
                if (!is_callback_calling_kfunc(insn->imm)) {
                        verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
                                func_id_name(insn->imm), insn->imm);
                        return -EFAULT;
                }
        } else if (!is_callback_calling_function(insn->imm)) { /* helper */
                verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
                        func_id_name(insn->imm), insn->imm);
                return -EFAULT;
        }

        if (is_async_callback_calling_insn(insn)) {
                struct bpf_verifier_state *async_cb;
                struct bpf_func_state *async_frame;

                /* there is no real recursion here. timer and workqueue callbacks are async */
                env->subprog_info[subprog].is_async_cb = true;
                async_cb = push_async_cb(env, env->subprog_info[subprog].start,
                                         insn_idx, subprog,
                                         is_bpf_wq_set_callback_impl_kfunc(insn->imm));
                if (!async_cb)
                        return -EFAULT;

                async_frame = async_cb->frame[0];
                async_frame->async_entry_cnt = caller->async_entry_cnt + 1;

                /* Convert bpf_timer_set_callback() args into timer callback args */
                return set_callee_state_cb(env, caller, async_frame, insn_idx);
        }

        /* for callback functions enqueue entry to callback and
         * proceed with next instruction within current frame.
         */
        cb_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false);
        if (!cb_state)
                return -ENOMEM;

        ret = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb,
                               cb_state);
        if (ret)
                return ret;

        cb_state->callback_unroll_depth++;
        /* curframe now names the callback frame; the one below it is the
         * calling frame inside the pushed state
         */
        cb_state->frame[cb_state->curframe - 1]->callback_depth++;
        caller->callback_depth = 0;
        return 0;
}

static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
                           int *insn_idx)
{
        struct bpf_verifier_state *state = env->cur_state;
        struct bpf_func_state *caller;
        int err, subprog, target_insn;

        target_insn = *insn_idx + insn->imm + 1;
        subprog = find_subprog(env, target_insn);
        if (subprog < 0) {
                verbose(env, "verifier bug. No program starts at insn %d\n", target_insn);
                return -EFAULT;
        }

        caller = state->frame[state->curframe];
        err = btf_check_subprog_call(env, subprog, caller->regs);
        if (err == -EFAULT)
                return err;
        if (subprog_is_global(env, subprog)) {
                const char *sub_name = subprog_name(env, subprog);

                /* Only global subprogs cannot be called with a lock held. */
                if (env->cur_state->active_lock.ptr) {
                        verbose(env, "global function calls are not allowed while holding a lock,\n"
                                     "use static function instead\n");
                        return -EINVAL;
                }

                /* Only global subprogs cannot be called with preemption disabled. */
                if (env->cur_state->active_preempt_lock) {
                        verbose(env, "global function calls are not allowed with preemption disabled,\n"
                                     "use static function instead\n");
                        return -EINVAL;
                }

                if (err) {
                        verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
                                subprog, sub_name);
                        return err;
                }

                verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
                        subprog, sub_name);
                /* mark global subprog for verifying after main prog */
                subprog_aux(env, subprog)->called = true;
                clear_caller_saved_regs(env, caller->regs);

                /* All global functions return a 64-bit SCALAR_VALUE */
                mark_reg_unknown(env, caller->regs, BPF_REG_0);
                caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;

                /* continue with next insn after call */
                return 0;
        }

        /* for regular function entry setup new frame and continue
         * from that frame.
         */
        err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state);
        if (err)
                return err;

        clear_caller_saved_regs(env, caller->regs);

        /* and go analyze first insn of the callee */
        *insn_idx = env->subprog_info[subprog].start - 1;

        if (env->log.level & BPF_LOG_LEVEL) {
                verbose(env, "caller:\n");
                print_verifier_state(env, caller, true);
                verbose(env, "callee:\n");
                print_verifier_state(env, state->frame[state->curframe], true);
        }

        return 0;
}

int map_set_for_each_callback_args(struct bpf_verifier_env *env,
                                   struct bpf_func_state *caller,
                                   struct bpf_func_state *callee)
{
        /* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn,
         *      void *callback_ctx, u64 flags);
         * callback_fn(struct bpf_map *map, void *key, void *value,
         *      void *callback_ctx);
         */
        callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];

        callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
        __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
        callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr;

        callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
        __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
        callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr;

        /* pointer to stack or null */
        callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3];

        /* unused */
        __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
        return 0;
}

static int set_callee_state(struct bpf_verifier_env *env,
                            struct bpf_func_state *caller,
                            struct bpf_func_state *callee, int insn_idx)
{
        int i;

        /* copy r1 - r5 args that callee can access.  The copy includes parent
         * pointers, which connects us up to the liveness chain
         */
        for (i = BPF_REG_1; i <= BPF_REG_5; i++)
                callee->regs[i] = caller->regs[i];
        return 0;
}

static int set_map_elem_callback_state(struct bpf_verifier_env *env,
                                       struct bpf_func_state *caller,
                                       struct bpf_func_state *callee,
                                       int insn_idx)
{
        struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx];
        struct bpf_map *map;
        int err;

        /* valid map_ptr and poison value does not matter */
        map = insn_aux->map_ptr_state.map_ptr;
        if (!map->ops->map_set_for_each_callback_args ||
            !map->ops->map_for_each_callback) {
                verbose(env, "callback function not allowed for map\n");
                return -ENOTSUPP;
        }

        err = map->ops->map_set_for_each_callback_args(env, caller, callee);
        if (err)
                return err;

        callee->in_callback_fn = true;
        callee->callback_ret_range = retval_range(0, 1);
        return 0;
}

static int set_loop_callback_state(struct bpf_verifier_env *env,
                                   struct bpf_func_state *caller,
                                   struct bpf_func_state *callee,
                                   int insn_idx)
{
        /* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
         *            u64 flags);
         * callback_fn(u32 index, void *callback_ctx);
         */
        callee->regs[BPF_REG_1].type = SCALAR_VALUE;
        callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];

        /* unused */
        __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
        __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
        __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);

        callee->in_callback_fn = true;
        callee->callback_ret_range = retval_range(0, 1);
        return 0;
}

static int set_timer_callback_state(struct bpf_verifier_env *env,
                                    struct bpf_func_state *caller,
                                    struct bpf_func_state *callee,
                                    int insn_idx)
{
        struct bpf_map *map_ptr = caller->regs[BPF_REG_1].map_ptr;

        /* bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn);
         * callback_fn(struct bpf_map *map, void *key, void *value);
         */
        callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
        __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
        callee->regs[BPF_REG_1].map_ptr = map_ptr;

        callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
        __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
        callee->regs[BPF_REG_2].map_ptr = map_ptr;

        callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
        __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
        callee->regs[BPF_REG_3].map_ptr = map_ptr;

        /* unused */
        __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
        __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
        callee->in_async_callback_fn = true;
        callee->callback_ret_range = retval_range(0, 1);
        return 0;
}

static int set_find_vma_callback_state(struct bpf_verifier_env *env,
                                       struct bpf_func_state *caller,
                                       struct bpf_func_state *callee,
                                       int insn_idx)
{
        /* bpf_find_vma(struct task_struct *task, u64 addr,
         *               void *callback_fn, void *callback_ctx, u64 flags)
         * (callback_fn)(struct task_struct *task,
         *               struct vm_area_struct *vma, void *callback_ctx);
         */
        callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];

        callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID;
        __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
        callee->regs[BPF_REG_2].btf =  btf_vmlinux;
        callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];

        /* pointer to stack or null */
        callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];

        /* unused */
        __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
        __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
        callee->in_callback_fn = true;
        callee->callback_ret_range = retval_range(0, 1);
        return 0;
}

static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
                                           struct bpf_func_state *caller,
                                           struct bpf_func_state *callee,
                                           int insn_idx)
{
        /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void
         *                          callback_ctx, u64 flags);
         * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx);
         */
        __mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
        mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
        callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];

        /* unused */
        __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
        __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
        __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);

        callee->in_callback_fn = true;
        callee->callback_ret_range = retval_range(0, 1);
        return 0;
}

static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
                                         struct bpf_func_state *caller,
                                         struct bpf_func_state *callee,
                                         int insn_idx)
{
        /* void bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
         *                     bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b));
         *
         * 'struct bpf_rb_node *node' arg to bpf_rbtree_add_impl is the same PTR_TO_BTF_ID w/ offset
         * that 'less' callback args will be receiving. However, 'node' arg was release_reference'd
         * by this point, so look at 'root'
         */
        struct btf_field *field;

        field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off,
                                      BPF_RB_ROOT);
        if (!field || !field->graph_root.value_btf_id)
                return -EFAULT;

        mark_reg_graph_node(callee->regs, BPF_REG_1, &field->graph_root);
        ref_set_non_owning(env, &callee->regs[BPF_REG_1]);
        mark_reg_graph_node(callee->regs, BPF_REG_2, &field->graph_root);
        ref_set_non_owning(env, &callee->regs[BPF_REG_2]);

        __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
        __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
        __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
        callee->in_callback_fn = true;
        callee->callback_ret_range = retval_range(0, 1);
        return 0;
}

static bool is_rbtree_lock_required_kfunc(u32 btf_id);

/* Tell whether the frame being verified is the callback of an rbtree kfunc
 * that must be called with the lock held. When it is, an unreleased lock is
 * expected and callers need not complain about it.
 *
 * Returns false in frame 0 and in any frame that is not a callback;
 * otherwise the answer comes from is_rbtree_lock_required_kfunc() applied
 * to the imm field of the instruction at the frame's callsite.
 */
static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
{
        struct bpf_verifier_state *vstate = env->cur_state;
        struct bpf_insn *insns = env->prog->insnsi;
        struct bpf_func_state *cur_frame;

        if (vstate->curframe) {
                cur_frame = vstate->frame[vstate->curframe];
                if (cur_frame->in_callback_fn)
                        return is_rbtree_lock_required_kfunc(insns[cur_frame->callsite].imm);
        }

        return false;
}

static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg)
{
        return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
}

/* Handle the exit of the current (innermost) frame.
 *
 * Validates R0 of the exiting frame, hands the result (or, for callbacks,
 * nothing) back to the caller frame, stores the next instruction index to
 * verify in *insn_idx, frees the exiting frame and pops it off the frame
 * stack. For callback frames, imprecise scalars of the resulting state are
 * additionally widened against an earlier state found for the callsite.
 *
 * Returns 0 on success or a negative errno: -EINVAL for a returned stack
 * pointer or an out-of-range callback return value, -EACCES for a
 * non-scalar callback return value, -EFAULT for an inconsistent callsite,
 * or whatever error the helpers called below report.
 */
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
        struct bpf_verifier_state *state = env->cur_state, *prev_st;
        struct bpf_func_state *caller, *callee;
        struct bpf_reg_state *r0;
        bool in_callback_fn;
        int err;

        /* the frame that is exiting and its return register */
        callee = state->frame[state->curframe];
        r0 = &callee->regs[BPF_REG_0];
        if (r0->type == PTR_TO_STACK) {
                /* technically it's ok to return caller's stack pointer
                 * (or caller's caller's pointer) back to the caller,
                 * since these pointers are valid. Only current stack
                 * pointer will be invalid as soon as function exits,
                 * but let's be conservative
                 */
                verbose(env, "cannot return stack pointer to the caller\n");
                return -EINVAL;
        }

        /* the frame we are returning into */
        caller = state->frame[state->curframe - 1];
        if (callee->in_callback_fn) {
                /* callbacks must return a scalar */
                if (r0->type != SCALAR_VALUE) {
                        verbose(env, "R0 not a scalar value\n");
                        return -EACCES;
                }

                /* we are going to rely on register's precise value */
                err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64);
                /* only mark precision if the read mark succeeded */
                err = err ?: mark_chain_precision(env, BPF_REG_0);
                if (err)
                        return err;

                /* enforce R0 return value range */
                if (!retval_range_within(callee->callback_ret_range, r0)) {
                        verbose_invalid_scalar(env, r0, callee->callback_ret_range,
                                               "At callback return", "R0");
                        return -EINVAL;
                }
                /* sanity check: the recorded callsite must be an instruction
                 * that calls a callback
                 */
                if (!calls_callback(env, callee->callsite)) {
                        verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n",
                                *insn_idx, callee->callsite);
                        return -EFAULT;
                }
        } else {
                /* return to the caller whatever r0 had in the callee */
                caller->regs[BPF_REG_0] = *r0;
        }

        /* callback_fn frame should have released its own additions to parent's
         * reference state at this point, or check_reference_leak would
         * complain, hence it must be the same as the caller. There is no need
         * to copy it back.
         */
        if (!callee->in_callback_fn) {
                /* Transfer references to the caller */
                err = copy_reference_state(caller, callee);
                if (err)
                        return err;
        }

        /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to the
         * callsite, where the function call logic would reschedule the callback
         * visit. If the iteration converges, is_state_visited() would prune that
         * visit eventually.
         */
        /* remember the flag now: callee is freed before its last use below */
        in_callback_fn = callee->in_callback_fn;
        if (in_callback_fn)
                *insn_idx = callee->callsite;
        else
                *insn_idx = callee->callsite + 1;

        if (env->log.level & BPF_LOG_LEVEL) {
                verbose(env, "returning from callee:\n");
                print_verifier_state(env, callee, true);
                verbose(env, "to caller at %d:\n", *insn_idx);
                print_verifier_state(env, caller, true);
        }
        /* clear everything in the callee. In case of exceptional exits using
         * bpf_throw, this will be done by copy_verifier_state for extra frames. */
        free_func_state(callee);
        /* clear the frame slot, then pop the frame (post-decrement) */
        state->frame[state->curframe--] = NULL;

        /* for callbacks widen imprecise scalars to make programs like below verify:
         *
         *   struct ctx { int i; }
         *   void cb(int idx, struct ctx *ctx) { ctx->i++; ... }
         *   ...
         *   struct ctx = { .i = 0; }
         *   bpf_loop(100, cb, &ctx, 0);
         *
         * This is similar to what is done in process_iter_next_call() for open
         * coded iterators.
         */
        /* only callbacks look up an earlier state for the callsite */
        prev_st = in_callback_fn ? find_prev_entry(env, state, *insn_idx) : NULL;
        if (prev_st) {
                err = widen_imprecise_scalars(env, prev_st, state);
                if (err)
                        return err;
        }
        return 0;
}

/* Tighten the bounds of R0 for helpers whose integer return value has a
 * known range.
 *
 * Nothing is done unless @ret_type is RET_INTEGER. For the stack and
 * string-probe helpers listed below, the signed bounds become
 * [-MAX_ERRNO, meta->msize_max_value]; for bpf_get_smp_processor_id() all
 * bounds become [0, nr_cpu_ids - 1]. Any other helper leaves R0 as is.
 *
 * Returns 0 when @ret_type is not RET_INTEGER, otherwise the result of
 * reg_bounds_sanity_check() on R0.
 */
static int do_refine_retval_range(struct bpf_verifier_env *env,
                                  struct bpf_reg_state *regs, int ret_type,
                                  int func_id,
                                  struct bpf_call_arg_meta *meta)
{
        struct bpf_reg_state *r0 = &regs[BPF_REG_0];

        if (ret_type != RET_INTEGER)
                return 0;

        if (func_id == BPF_FUNC_get_stack ||
            func_id == BPF_FUNC_get_task_stack ||
            func_id == BPF_FUNC_probe_read_str ||
            func_id == BPF_FUNC_probe_read_kernel_str ||
            func_id == BPF_FUNC_probe_read_user_str) {
                /* upper bound comes from the size argument seen earlier,
                 * lower bound is the smallest errno
                 */
                r0->smax_value = meta->msize_max_value;
                r0->s32_max_value = meta->msize_max_value;
                r0->smin_value = -MAX_ERRNO;
                r0->s32_min_value = -MAX_ERRNO;
                reg_bounds_sync(r0);
        } else if (func_id == BPF_FUNC_get_smp_processor_id) {
                /* result is bounded by the number of CPU ids */
                r0->umax_value = nr_cpu_ids - 1;
                r0->u32_max_value = nr_cpu_ids - 1;
                r0->smax_value = nr_cpu_ids - 1;
                r0->s32_max_value = nr_cpu_ids - 1;
                r0->umin_value = 0;
                r0->u32_min_value = 0;
                r0->smin_value = 0;
                r0->s32_min_value = 0;
                reg_bounds_sync(r0);
        }

        return reg_bounds_sanity_check(env, r0, "retval");
}

/* Record, in the aux data of the call instruction at @insn_idx, which map a
 * map-related helper is being called with.
 *
 * Helpers other than the ones listed in the first switch are ignored. For
 * the listed helpers, a missing map is reported as a misconfiguration, and
 * modifying helpers are rejected for maps flagged BPF_F_RDONLY_PROG.
 *
 * Returns 0 on success (or when the helper is not tracked), -EINVAL when
 * meta->map_ptr is NULL, -EACCES for a write into a read-only map.
 */
static int
record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
                int func_id, int insn_idx)
{
        struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
        struct bpf_map *map = meta->map_ptr;
        const struct bpf_map *recorded;

        switch (func_id) {
        case BPF_FUNC_tail_call:
        case BPF_FUNC_map_lookup_elem:
        case BPF_FUNC_map_update_elem:
        case BPF_FUNC_map_delete_elem:
        case BPF_FUNC_map_push_elem:
        case BPF_FUNC_map_pop_elem:
        case BPF_FUNC_map_peek_elem:
        case BPF_FUNC_for_each_map_elem:
        case BPF_FUNC_redirect_map:
        case BPF_FUNC_map_lookup_percpu_elem:
                break;
        default:
                /* not a helper whose map is tracked here */
                return 0;
        }

        if (!map) {
                verbose(env, "kernel subsystem misconfigured verifier\n");
                return -EINVAL;
        }

        /* A map that is read-only from the program side must not be
         * altered by the program, so the modifying helpers are refused.
         */
        if (map->map_flags & BPF_F_RDONLY_PROG) {
                switch (func_id) {
                case BPF_FUNC_map_delete_elem:
                case BPF_FUNC_map_update_elem:
                case BPF_FUNC_map_push_elem:
                case BPF_FUNC_map_pop_elem:
                        verbose(env, "write into map forbidden\n");
                        return -EACCES;
                default:
                        break;
                }
        }

        /* First map seen for this insn: store it with the last argument
         * false. A different map than the one stored before: store again
         * with the last argument true. Same map: nothing to do.
         */
        recorded = aux->map_ptr_state.map_ptr;
        if (!recorded)
                bpf_map_ptr_store(aux, map, !map->bypass_spec_v1, false);
        else if (recorded != map)
                bpf_map_ptr_store(aux, map, !map->bypass_spec_v1, true);

        return 0;
}

/* Record, in the aux data of the call instruction at @insn_idx, the index
 * (R3) that bpf_tail_call() is invoked with.
 *
 * Only BPF_FUNC_tail_call is handled; the map must be a
 * BPF_MAP_TYPE_PROG_ARRAY. If R3 is not a constant below max_entries, or a
 * different constant was stored for this insn before, the stored key is
 * set to BPF_MAP_KEY_POISON.
 *
 * Returns 0 on success (or for other helpers), -EINVAL for a missing or
 * wrongly typed map, or the error from mark_chain_precision().
 */
static int
record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
                int func_id, int insn_idx)
{
        struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
        struct bpf_reg_state *index_reg = &cur_regs(env)[BPF_REG_3];
        struct bpf_map *map = meta->map_ptr;
        u64 index, nr_entries;
        int ret;

        if (func_id != BPF_FUNC_tail_call)
                return 0;

        if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
                verbose(env, "kernel subsystem misconfigured verifier\n");
                return -EINVAL;
        }

        index = index_reg->var_off.value;
        nr_entries = map->max_entries;

        /* unknown or out-of-range index: nothing useful to remember */
        if (!is_reg_const(index_reg, false) || index >= nr_entries) {
                bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
                return 0;
        }

        ret = mark_chain_precision(env, BPF_REG_3);
        if (ret)
                return ret;

        if (bpf_map_key_unseen(aux)) {
                /* first constant index seen at this call site */
                bpf_map_key_store(aux, index);
                return 0;
        }

        /* a different constant than the one stored earlier poisons the key */
        if (!bpf_map_key_poisoned(aux) && bpf_map_key_immediate(aux) != index)
                bpf_map_key_store(aux, BPF_MAP_KEY_POISON);

        return 0;
}

/* Report references still held by the current frame.
 *
 * On a regular (non-exception) exit:
 *  - a frame other than frame 0 that is not a callback is not checked at
 *    all and 0 is returned;
 *  - inside a callback, only references whose callback_ref equals this
 *    frame's number are reported.
 * On an exception exit every acquired reference is reported.
 *
 * Returns -EINVAL if at least one reference was reported, 0 otherwise.
 */
static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit)
{
        struct bpf_func_state *frame = cur_func(env);
        bool regular_exit = !exception_exit;
        int ret = 0;
        int idx;

        if (regular_exit && frame->frameno && !frame->in_callback_fn)
                return 0;

        for (idx = 0; idx < frame->acquired_refs; idx++) {
                /* skip references this callback frame did not add itself */
                if (regular_exit && frame->in_callback_fn &&
                    frame->refs[idx].callback_ref != frame->frameno)
                        continue;

                verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
                        frame->refs[idx].id, frame->refs[idx].insn_idx);
                ret = -EINVAL;
        }

        return ret;
}

static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
                                   struct bpf_reg_state *regs)
{
        struct bpf_reg_state *fmt_reg = &regs[BPF_REG_3];
        struct bpf_reg_state *data_len_reg = &regs[BPF_REG_5];
        struct bpf_map *fmt_map = fmt_reg->map_ptr;
        struct bpf_bprintf_data data = {};
        int err, fmt_map_off, num_args;
        u64 fmt_addr;
        char *fmt;

        /* data must be an array of u64 */
        if (data_len_reg->var_off.value % 8)
                return -EINVAL;
        num_args = data_len_reg->var_off.value / 8;

        /* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const
         * and map_direct_value_addr is set.
         */
        fmt_map_off = fmt_reg->off + fmt_reg->var_off.value;
        err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr,
                                                  fmt_map_off);
        if (err) {
                verbose(env, "verifier bug\n");
                return -EFAULT;
        }
        fmt = (char *)(long)fmt_addr + fmt_map_off;

        /* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we
         * can focus on validating the format specifiers.
         */
        err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, num_args, &data);
        if (err < 0)
                verbose(env, "Invalid format string\n");

        return err;
}

/* Decide whether bpf_get_func_ip() may be used by the program being
 * verified.
 *
 * Allowed for BPF_PROG_TYPE_KPROBE programs and for BPF_PROG_TYPE_TRACING
 * programs for which bpf_prog_has_trampoline() is true.
 *
 * Returns 0 when allowed, -ENOTSUPP otherwise.
 */
static int check_get_func_ip(struct bpf_verifier_env *env)
{
        enum bpf_prog_type type = resolve_prog_type(env->prog);
        int func_id = BPF_FUNC_get_func_ip;

        switch (type) {
        case BPF_PROG_TYPE_KPROBE:
                return 0;
        case BPF_PROG_TYPE_TRACING:
                if (bpf_prog_has_trampoline(env->prog))
                        return 0;
                verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n",
                        func_id_name(func_id), func_id);
                return -ENOTSUPP;
        default:
                break;
        }

        verbose(env, "func %s#%d not supported for program type %d\n",
                func_id_name(func_id), func_id, type);
        return -ENOTSUPP;
}

/* Return the aux data entry of the instruction at env->insn_idx. */
static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
{
        struct bpf_insn_aux_data *aux_table = env->insn_aux_data;

        return aux_table + env->insn_idx;
}

static bool loop_flag_is_zero(struct bpf_verifier_env *env)
{
        struct bpf_reg_state *regs = cur_regs(env);
        struct bpf_reg_state *reg = &regs[BPF_REG_4];
        bool reg_is_null = register_is_null(reg);

        if (reg_is_null)
                mark_chain_precision(env, BPF_REG_4);

        return reg_is_null;
}

/* Update the loop inline state kept in the aux data of the current
 * instruction.
 *
 * On the first visit the state is initialized: fit_for_inline reflects
 * whether the flags register is zero and @subprogno is remembered. On later
 * visits fit_for_inline can only be cleared: it stays set only while the
 * flags register is zero and @subprogno matches the remembered one.
 */
static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
{
        struct bpf_loop_inline_state *inl = &cur_aux(env)->loop_inline_state;

        if (inl->initialized) {
                /* already ruled out earlier, nothing can bring it back */
                if (inl->fit_for_inline)
                        inl->fit_for_inline = (loop_flag_is_zero(env) &&
                                               inl->callback_subprogno == subprogno);
                return;
        }

        /* first visit of this call instruction */
        inl->initialized = 1;
        inl->fit_for_inline = loop_flag_is_zero(env);
        inl->callback_subprogno = subprogno;
}

static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
                             int *insn_idx_p)
{
        enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
        bool returns_cpu_specific_alloc_ptr = false;
        const struct bpf_func_proto *fn = NULL;
        enum bpf_return_type ret_type;
        enum bpf_type_flag ret_flag;
        struct bpf_reg_state *regs;
        struct bpf_call_arg_meta meta;
        int insn_idx = *insn_idx_p;
        bool changes_data;
        int i, err, func_id;

        /* find function prototype */
        func_id = insn->imm;
        if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
                verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
                        func_id);
                return -EINVAL;
        }

        if (env->ops->get_func_proto)
                fn = env->ops->get_func_proto(func_id, env->prog);
        if (!fn) {
                verbose(env, "program of this type cannot use helper %s#%d\n",
                        func_id_name(func_id), func_id);
                return -EINVAL;
        }

        /* eBPF programs must be GPL compatible to use GPL-ed functions */
        if (!env->prog->gpl_compatible && fn->gpl_only) {
                verbose(env, "cannot call GPL-restricted function from non-GPL compatible program\n");
                return -EINVAL;
        }

        if (fn->allowed && !fn->allowed(env->prog)) {
                verbose(env, "helper call is not allowed in probe\n");
                return -EINVAL;
        }

        if (!in_sleepable(env) && fn->might_sleep) {
                verbose(env, "helper call might sleep in a non-sleepable prog\n");
                return -EINVAL;
        }

        /* With LD_ABS/IND some JITs save/restore skb from r1. */
        changes_data = bpf_helper_changes_pkt_data(fn->func);
        if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
                verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n",
                        func_id_name(func_id), func_id);
                return -EINVAL;
        }

        memset(&meta, 0, sizeof(meta));
        meta.pkt_access = fn->pkt_access;

        err = check_func_proto(fn, func_id);
        if (err) {
                verbose(env, "kernel subsystem misconfigured func %s#%d\n",
                        func_id_name(func_id), func_id);
                return err;
        }

        if (env->cur_state->active_rcu_lock) {
                if (fn->might_sleep) {
                        verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n",
                                func_id_name(func_id), func_id);
                        return -EINVAL;
                }

                if (in_sleepable(env) && is_storage_get_function(func_id))
                        env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
        }

        if (env->cur_state->active_preempt_lock) {
                if (fn->might_sleep) {
                        verbose(env, "sleepable helper %s#%d in non-preemptible region\n",
                                func_id_name(func_id), func_id);
                        return -EINVAL;
                }

                if (in_sleepable(env) && is_storage_get_function(func_id))
                        env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
        }

        meta.func_id = func_id;
        /* check args */
        for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
                err = check_func_arg(env, i, &meta, fn, insn_idx);
                if (err)
                        return err;
        }

        err = record_func_map(env, &meta, func_id, insn_idx);
        if (err)
                return err;

        err = record_func_key(env, &meta, func_id, insn_idx);
        if (err)
                return err;

        /* Mark slots with STACK_MISC in case of raw mode, stack offset
         * is inferred from register state.
         */
        for (i = 0; i < meta.access_size; i++) {
                err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B,
                                       BPF_WRITE, -1, false, false);
                if (err)
                        return err;
        }

        regs = cur_regs(env);

        if (meta.release_regno) {
                err = -EINVAL;
                /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
                 * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
                 * is safe to do directly.
                 */
                if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) {
                        if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) {
                                verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n");
                                return -EFAULT;
                        }
                        err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
                } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) {
                        u32 ref_obj_id = meta.ref_obj_id;
                        bool in_rcu = in_rcu_cs(env);
                        struct bpf_func_state *state;
                        struct bpf_reg_state *reg;

                        err = release_reference_state(cur_func(env), ref_obj_id);
                        if (!err) {
                                bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
                                        if (reg->ref_obj_id == ref_obj_id) {
                                                if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) {
                                                        reg->ref_obj_id = 0;
                                                        reg->type &= ~MEM_ALLOC;
                                                        reg->type |= MEM_RCU;
                                                } else {
                                                        mark_reg_invalid(env, reg);
                                                }
                                        }
                                }));
                        }
                } else if (meta.ref_obj_id) {
                        err = release_reference(env, meta.ref_obj_id);
                } else if (register_is_null(&regs[meta.release_regno])) {
                        /* meta.ref_obj_id can only be 0 if register that is meant to be
                         * released is NULL, which must be > R0.
                         */
                        err = 0;
                }
                if (err) {
                        verbose(env, "func %s#%d reference has not been acquired before\n",
                                func_id_name(func_id), func_id);
                        return err;
                }
        }

        switch (func_id) {
        case BPF_FUNC_tail_call:
                err = check_reference_leak(env, false);
                if (err) {
                        verbose(env, "tail_call would lead to reference leak\n");
                        return err;
                }
                break;
        case BPF_FUNC_get_local_storage:
                /* check that flags argument in get_local_storage(map, flags) is 0,
                 * this is required because get_local_storage() can't return an error.
                 */
                if (!register_is_null(&regs[BPF_REG_2])) {
                        verbose(env, "get_local_storage() doesn't support non-zero flags\n");
                        return -EINVAL;
                }
                break;
        case BPF_FUNC_for_each_map_elem:
                err = push_callback_call(env, insn, insn_idx, meta.subprogno,
                                         set_map_elem_callback_state);
                break;
        case BPF_FUNC_timer_set_callback:
                err = push_callback_call(env, insn, insn_idx, meta.subprogno,
                                         set_timer_callback_state);
                break;
        case BPF_FUNC_find_vma:
                err = push_callback_call(env, insn, insn_idx, meta.subprogno,
                                         set_find_vma_callback_state);
                break;
        case BPF_FUNC_snprintf:
                err = check_bpf_snprintf_call(env, regs);
                break;
        case BPF_FUNC_loop:
                update_loop_inline_state(env, meta.subprogno);
                /* Verifier relies on R1 value to determine if bpf_loop() iteration
                 * is finished, thus mark it precise.
                 */
                err = mark_chain_precision(env, BPF_REG_1);
                if (err)
                        return err;
                if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) {
                        err = push_callback_call(env, insn, insn_idx, meta.subprogno,
                                                 set_loop_callback_state);
                } else {
                        cur_func(env)->callback_depth = 0;
                        if (env->log.level & BPF_LOG_LEVEL2)
                                verbose(env, "frame%d bpf_loop iteration limit reached\n",
                                        env->cur_state->curframe);
                }
                break;
        case BPF_FUNC_dynptr_from_mem:
                if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
                        verbose(env, "Unsupported reg type %s for bpf_dynptr_from_mem data\n",
                                reg_type_str(env, regs[BPF_REG_1].type));
                        return -EACCES;
                }
                break;
        case BPF_FUNC_set_retval:
                if (prog_type == BPF_PROG_TYPE_LSM &&
                    env->prog->expected_attach_type == BPF_LSM_CGROUP) {
                        if (!env->prog->aux->attach_func_proto->type) {
                                /* Make sure programs that attach to void
                                 * hooks don't try to modify return value.
                                 */
                                verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
                                return -EINVAL;
                        }
                }
                break;
        case BPF_FUNC_dynptr_data:
        {
                struct bpf_reg_state *reg;
                int id, ref_obj_id;

                reg = get_dynptr_arg_reg(env, fn, regs);
                if (!reg)
                        return -EFAULT;


                if (meta.dynptr_id) {
                        verbose(env, "verifier internal error: meta.dynptr_id already set\n");
                        return -EFAULT;
                }
                if (meta.ref_obj_id) {
                        verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
                        return -EFAULT;
                }

                id = dynptr_id(env, reg);
                if (id < 0) {
                        verbose(env, "verifier internal error: failed to obtain dynptr id\n");
                        return id;
                }

                ref_obj_id = dynptr_ref_obj_id(env, reg);
                if (ref_obj_id < 0) {
                        verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
                        return ref_obj_id;
                }

                meta.dynptr_id = id;
                meta.ref_obj_id = ref_obj_id;

                break;
        }
        case BPF_FUNC_dynptr_write:
        {
                enum bpf_dynptr_type dynptr_type;
                struct bpf_reg_state *reg;

                reg = get_dynptr_arg_reg(env, fn, regs);
                if (!reg)
                        return -EFAULT;

                dynptr_type = dynptr_get_type(env, reg);
                if (dynptr_type == BPF_DYNPTR_TYPE_INVALID)
                        return -EFAULT;

                if (dynptr_type == BPF_DYNPTR_TYPE_SKB)
                        /* this will trigger clear_all_pkt_pointers(), which will
                         * invalidate all dynptr slices associated with the skb
                         */
                        changes_data = true;

                break;
        }
        case BPF_FUNC_per_cpu_ptr:
        case BPF_FUNC_this_cpu_ptr:
        {
                struct bpf_reg_state *reg = &regs[BPF_REG_1];
                const struct btf_type *type;

                if (reg->type & MEM_RCU) {
                        type = btf_type_by_id(reg->btf, reg->btf_id);
                        if (!type || !btf_type_is_struct(type)) {
                                verbose(env, "Helper has invalid btf/btf_id in R1\n");
                                return -EFAULT;
                        }
                        returns_cpu_specific_alloc_ptr = true;
                        env->insn_aux_data[insn_idx].call_with_percpu_alloc_ptr = true;
                }
                break;
        }
        case BPF_FUNC_user_ringbuf_drain:
                err = push_callback_call(env, insn, insn_idx, meta.subprogno,
                                         set_user_ringbuf_callback_state);
                break;
        }

        if (err)
                return err;

        /* reset caller saved regs */
        for (i = 0; i < CALLER_SAVED_REGS; i++) {
                mark_reg_not_init(env, regs, caller_saved[i]);
                check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
        }

        /* helper call returns 64-bit value. */
        regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;

        /* update return register (already marked as written above) */
        ret_type = fn->ret_type;
        ret_flag = type_flag(ret_type);

        switch (base_type(ret_type)) {
        case RET_INTEGER:
                /* sets type to SCALAR_VALUE */
                mark_reg_unknown(env, regs, BPF_REG_0);
                break;
        case RET_VOID:
                regs[BPF_REG_0].type = NOT_INIT;
                break;
        case RET_PTR_TO_MAP_VALUE:
                /* There is no offset yet applied, variable or fixed */
                mark_reg_known_zero(env, regs, BPF_REG_0);
                /* remember map_ptr, so that check_map_access()
                 * can check 'value_size' boundary of memory access
                 * to map element returned from bpf_map_lookup_elem()
                 */
                if (meta.map_ptr == NULL) {
                        verbose(env,
                                "kernel subsystem misconfigured verifier\n");
                        return -EINVAL;
                }
                regs[BPF_REG_0].map_ptr = meta.map_ptr;
                regs[BPF_REG_0].map_uid = meta.map_uid;
                regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
                if (!type_may_be_null(ret_type) &&
                    btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) {
                        regs[BPF_REG_0].id = ++env->id_gen;
                }
                break;
        case RET_PTR_TO_SOCKET:
                mark_reg_known_zero(env, regs, BPF_REG_0);
                regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag;
                break;
        case RET_PTR_TO_SOCK_COMMON:
                mark_reg_known_zero(env, regs, BPF_REG_0);
                regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag;
                break;
        case RET_PTR_TO_TCP_SOCK:
                mark_reg_known_zero(env, regs, BPF_REG_0);
                regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
                break;
        case RET_PTR_TO_MEM:
                mark_reg_known_zero(env, regs, BPF_REG_0);
                regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
                regs[BPF_REG_0].mem_size = meta.mem_size;
                break;
        case RET_PTR_TO_MEM_OR_BTF_ID:
        {
                const struct btf_type *t;

                mark_reg_known_zero(env, regs, BPF_REG_0);
                t = btf_type_skip_modifiers(meta.ret_btf, meta.ret_btf_id, NULL);
                if (!btf_type_is_struct(t)) {
                        u32 tsize;
                        const struct btf_type *ret;
                        const char *tname;

                        /* resolve the type size of ksym. */
                        ret = btf_resolve_size(meta.ret_btf, t, &tsize);
                        if (IS_ERR(ret)) {
                                tname = btf_name_by_offset(meta.ret_btf, t->name_off);
                                verbose(env, "unable to resolve the size of type '%s': %ld\n",
                                        tname, PTR_ERR(ret));
                                return -EINVAL;
                        }
                        regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
                        regs[BPF_REG_0].mem_size = tsize;
                } else {
                        if (returns_cpu_specific_alloc_ptr) {
                                regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU;
                        } else {
                                /* MEM_RDONLY may be carried from ret_flag, but it
                                 * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
                                 * it will confuse the check of PTR_TO_BTF_ID in
                                 * check_mem_access().
                                 */
                                ret_flag &= ~MEM_RDONLY;
                                regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
                        }

                        regs[BPF_REG_0].btf = meta.ret_btf;
                        regs[BPF_REG_0].btf_id = meta.ret_btf_id;
                }
                break;
        }
        case RET_PTR_TO_BTF_ID:
        {
                struct btf *ret_btf;
                int ret_btf_id;

                mark_reg_known_zero(env, regs, BPF_REG_0);
                regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
                if (func_id == BPF_FUNC_kptr_xchg) {
                        ret_btf = meta.kptr_field->kptr.btf;
                        ret_btf_id = meta.kptr_field->kptr.btf_id;
                        if (!btf_is_kernel(ret_btf)) {
                                regs[BPF_REG_0].type |= MEM_ALLOC;
                                if (meta.kptr_field->type == BPF_KPTR_PERCPU)
                                        regs[BPF_REG_0].type |= MEM_PERCPU;
                        }
                } else {
                        if (fn->ret_btf_id == BPF_PTR_POISON) {
                                verbose(env, "verifier internal error:");
                                verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n",
                                        func_id_name(func_id));
                                return -EINVAL;
                        }
                        ret_btf = btf_vmlinux;
                        ret_btf_id = *fn->ret_btf_id;
                }
                if (ret_btf_id == 0) {
                        verbose(env, "invalid return type %u of func %s#%d\n",
                                base_type(ret_type), func_id_name(func_id),
                                func_id);
                        return -EINVAL;
                }
                regs[BPF_REG_0].btf = ret_btf;
                regs[BPF_REG_0].btf_id = ret_btf_id;
                break;
        }
        default:
                verbose(env, "unknown return type %u of func %s#%d\n",
                        base_type(ret_type), func_id_name(func_id), func_id);
                return -EINVAL;
        }

        if (type_may_be_null(regs[BPF_REG_0].type))
                regs[BPF_REG_0].id = ++env->id_gen;

        if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) {
                verbose(env, "verifier internal error: func %s#%d sets ref_obj_id more than once\n",
                        func_id_name(func_id), func_id);
                return -EFAULT;
        }

        if (is_dynptr_ref_function(func_id))
                regs[BPF_REG_0].dynptr_id = meta.dynptr_id;

        if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
                /* For release_reference() */
                regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
        } else if (is_acquire_function(func_id, meta.map_ptr)) {
                int id = acquire_reference_state(env, insn_idx);

                if (id < 0)
                        return id;
                /* For mark_ptr_or_null_reg() */
                regs[BPF_REG_0].id = id;
                /* For release_reference() */
                regs[BPF_REG_0].ref_obj_id = id;
        }

        err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta);
        if (err)
                return err;

        err = check_map_func_compatibility(env, meta.map_ptr, func_id);
        if (err)
                return err;

        if ((func_id == BPF_FUNC_get_stack ||
             func_id == BPF_FUNC_get_task_stack) &&
            !env->prog->has_callchain_buf) {
                const char *err_str;

#ifdef CONFIG_PERF_EVENTS
                err = get_callchain_buffers(sysctl_perf_event_max_stack);
                err_str = "cannot get callchain buffer for func %s#%d\n";
#else
                err = -ENOTSUPP;
                err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n";
#endif
                if (err) {
                        verbose(env, err_str, func_id_name(func_id), func_id);
                        return err;
                }

                env->prog->has_callchain_buf = true;
        }

        if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
                env->prog->call_get_stack = true;

        if (func_id == BPF_FUNC_get_func_ip) {
                if (check_get_func_ip(env))
                        return -ENOTSUPP;
                env->prog->call_get_func_ip = true;
        }

        if (changes_data)
                clear_all_pkt_pointers(env);
        return 0;
}

/* mark_btf_func_reg_size() records liveness/sub-register information for a
 * register whose size comes from a BTF func_proto (its return value size or
 * one of its arguments).
 *
 * @regno == BPF_REG_0 is treated as the function return value: the register
 * is marked written and subreg_def is set according to @reg_size.
 * Any other @regno is treated as a function argument: the register is marked
 * as read, either as a full 64-bit read or as a 32-bit read.
 */
static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
                                   size_t reg_size)
{
        struct bpf_reg_state *reg = &cur_regs(env)[regno];
        bool full_width = reg_size == sizeof(u64);

        if (regno != BPF_REG_0) {
                /* Function argument */
                if (!full_width) {
                        mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32);
                        return;
                }

                mark_insn_zext(env, reg);
                mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
                return;
        }

        /* Function return value */
        reg->live |= REG_LIVE_WRITTEN;
        if (full_width)
                reg->subreg_def = DEF_NOT_SUBREG;
        else
                reg->subreg_def = env->insn_idx + 1;
}

/* Report whether the kfunc described by @meta carries the KF_ACQUIRE flag. */
static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
{
        return !!(meta->kfunc_flags & KF_ACQUIRE);
}

/* Report whether the kfunc described by @meta carries the KF_RELEASE flag. */
static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
{
        return !!(meta->kfunc_flags & KF_RELEASE);
}

/* Report whether the kfunc described by @meta is flagged KF_TRUSTED_ARGS.
 * A release kfunc (see is_kfunc_release()) is treated the same way even
 * without the explicit flag.
 */
static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta)
{
        if (meta->kfunc_flags & KF_TRUSTED_ARGS)
                return true;

        return is_kfunc_release(meta);
}

/* Report whether the kfunc described by @meta carries the KF_SLEEPABLE flag. */
static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
{
        return !!(meta->kfunc_flags & KF_SLEEPABLE);
}

/* Report whether the kfunc described by @meta carries the KF_DESTRUCTIVE flag. */
static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta)
{
        return !!(meta->kfunc_flags & KF_DESTRUCTIVE);
}

/* Report whether the kfunc described by @meta carries the KF_RCU flag. */
static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
{
        return !!(meta->kfunc_flags & KF_RCU);
}

/* Report whether the kfunc described by @meta carries the KF_RCU_PROTECTED flag. */
static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta)
{
        return !!(meta->kfunc_flags & KF_RCU_PROTECTED);
}

/* Report whether @arg is a "__sz" size argument.
 *
 * This requires all of: the argument's BTF type (modifiers skipped) is a
 * scalar, the register @reg passed for it is a SCALAR_VALUE, and the
 * parameter name matches the "__sz" suffix.
 */
static bool is_kfunc_arg_mem_size(const struct btf *btf,
                                  const struct btf_param *arg,
                                  const struct bpf_reg_state *reg)
{
        const struct btf_type *arg_type;

        arg_type = btf_type_skip_modifiers(btf, arg->type, NULL);
        if (btf_type_is_scalar(arg_type) && reg->type == SCALAR_VALUE)
                return btf_param_match_suffix(btf, arg, "__sz");

        return false;
}

/* Report whether @arg is a "__szk" size argument.
 *
 * This requires all of: the argument's BTF type (modifiers skipped) is a
 * scalar, the register @reg passed for it is a SCALAR_VALUE, and the
 * parameter name matches the "__szk" suffix.
 */
static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
                                        const struct btf_param *arg,
                                        const struct bpf_reg_state *reg)
{
        const struct btf_type *arg_type;

        arg_type = btf_type_skip_modifiers(btf, arg->type, NULL);
        if (btf_type_is_scalar(arg_type) && reg->type == SCALAR_VALUE)
                return btf_param_match_suffix(btf, arg, "__szk");

        return false;
}

/* Report whether btf_param_match_suffix() matches @arg's name against "__opt". */
static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg)
{
        const char *suffix = "__opt";

        return btf_param_match_suffix(btf, arg, suffix);
}

/* Report whether btf_param_match_suffix() matches @arg's name against "__k". */
static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
{
        const char *suffix = "__k";

        return btf_param_match_suffix(btf, arg, suffix);
}

/* Report whether btf_param_match_suffix() matches @arg's name against "__ign". */
static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg)
{
        const char *suffix = "__ign";

        return btf_param_match_suffix(btf, arg, suffix);
}

/* Report whether btf_param_match_suffix() matches @arg's name against "__map". */
static bool is_kfunc_arg_map(const struct btf *btf, const struct btf_param *arg)
{
        const char *suffix = "__map";

        return btf_param_match_suffix(btf, arg, suffix);
}

/* Report whether btf_param_match_suffix() matches @arg's name against "__alloc". */
static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg)
{
        const char *suffix = "__alloc";

        return btf_param_match_suffix(btf, arg, suffix);
}

/* Report whether btf_param_match_suffix() matches @arg's name against "__uninit". */
static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg)
{
        const char *suffix = "__uninit";

        return btf_param_match_suffix(btf, arg, suffix);
}

/* Report whether btf_param_match_suffix() matches @arg's name against
 * "__refcounted_kptr".
 */
static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf_param *arg)
{
        const char *suffix = "__refcounted_kptr";

        return btf_param_match_suffix(btf, arg, suffix);
}

/* Report whether btf_param_match_suffix() matches @arg's name against "__nullable". */
static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg)
{
        const char *suffix = "__nullable";

        return btf_param_match_suffix(btf, arg, suffix);
}

/* Report whether btf_param_match_suffix() matches @arg's name against "__str". */
static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg)
{
        const char *suffix = "__str";

        return btf_param_match_suffix(btf, arg, suffix);
}

/* Report whether the BTF parameter @arg is named exactly @name.
 *
 * The parameter name is looked up with btf_name_by_offset(); a name that
 * str_is_empty() reports as empty never matches.
 */
static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
                                          const struct btf_param *arg,
                                          const char *name)
{
        const char *param_name = btf_name_by_offset(btf, arg->name_off);

        if (str_is_empty(param_name))
                return false;

        /* strcmp() is non-zero for any length mismatch as well */
        return !strcmp(param_name, name);
}

/* Indices into kf_arg_btf_ids[]. __is_kfunc_ptr_arg_type() takes one of these
 * values and uses it to pick the BTF id a kfunc pointer argument is compared
 * against.
 * NOTE(review): the enumerator order presumably has to mirror the order of
 * the BTF_ID() entries in kf_arg_btf_ids — confirm when adding an entry.
 */
enum {
        KF_ARG_DYNPTR_ID,
        KF_ARG_LIST_HEAD_ID,
        KF_ARG_LIST_NODE_ID,
        KF_ARG_RB_ROOT_ID,
        KF_ARG_RB_NODE_ID,
        KF_ARG_WORKQUEUE_ID,
};

/* BTF ids of the struct types that __is_kfunc_ptr_arg_type() recognises as
 * special kfunc pointer arguments. The list is indexed with the KF_ARG_*_ID
 * constants, so each entry sits at the position of its matching enumerator.
 */
BTF_ID_LIST(kf_arg_btf_ids)
BTF_ID(struct, bpf_dynptr_kern)
BTF_ID(struct, bpf_list_head)
BTF_ID(struct, bpf_list_node)
BTF_ID(struct, bpf_rb_root)
BTF_ID(struct, bpf_rb_node)
BTF_ID(struct, bpf_wq)

/* Report whether @arg is a pointer whose pointee type is the struct recorded
 * at kf_arg_btf_ids[@type].
 *
 * Modifiers are skipped both on the argument type and on the pointee type.
 * Returns false if either type cannot be resolved or if the argument is not
 * a pointer; otherwise returns the verdict of btf_types_are_same() against
 * the btf_vmlinux id.
 */
static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
                                    const struct btf_param *arg, int type)
{
        const struct btf_type *ptr_t, *pointee_t;
        u32 pointee_id;

        ptr_t = btf_type_skip_modifiers(btf, arg->type, NULL);
        if (!ptr_t || !btf_type_is_ptr(ptr_t))
                return false;

        pointee_t = btf_type_skip_modifiers(btf, ptr_t->type, &pointee_id);
        if (!pointee_t)
                return false;

        return btf_types_are_same(btf, pointee_id, btf_vmlinux, kf_arg_btf_ids[type]);
}

/* Report whether @arg is a pointer matching the KF_ARG_DYNPTR_ID entry of
 * kf_arg_btf_ids (struct bpf_dynptr_kern).
 */
static bool is_kfunc_arg_dynptr(const struct btf *btf, const struct btf_param *arg)
{
        const int id_idx = KF_ARG_DYNPTR_ID;

        return __is_kfunc_ptr_arg_type(btf, arg, id_idx);
}

/* Report whether @arg is a pointer matching the KF_ARG_LIST_HEAD_ID entry of
 * kf_arg_btf_ids (struct bpf_list_head).
 */
static bool is_kfunc_arg_list_head(const struct btf *btf, const struct btf_param *arg)
{
        const int id_idx = KF_ARG_LIST_HEAD_ID;

        return __is_kfunc_ptr_arg_type(btf, arg, id_idx);
}

/* Report whether @arg is a pointer matching the KF_ARG_LIST_NODE_ID entry of
 * kf_arg_btf_ids (struct bpf_list_node).
 */
static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param *arg)
{
        const int id_idx = KF_ARG_LIST_NODE_ID;

        return __is_kfunc_ptr_arg_type(btf, arg, id_idx);
}

/* Report whether @arg is a pointer matching the KF_ARG_RB_ROOT_ID entry of
 * kf_arg_btf_ids (struct bpf_rb_root).
 */
static bool is_kfunc_arg_rbtree_root(const struct btf *btf, const struct btf_param *arg)
{
        const int id_idx = KF_ARG_RB_ROOT_ID;

        return __is_kfunc_ptr_arg_type(btf, arg, id_idx);
}

/* Report whether @arg is a pointer matching the KF_ARG_RB_NODE_ID entry of
 * kf_arg_btf_ids (struct bpf_rb_node).
 */
static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_param *arg)
{
        const int id_idx = KF_ARG_RB_NODE_ID;

        return __is_kfunc_ptr_arg_type(btf, arg, id_idx);
}

/* Report whether @arg is a pointer matching the KF_ARG_WORKQUEUE_ID entry of
 * kf_arg_btf_ids (struct bpf_wq).
 */
static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg)
{
        const int id_idx = KF_ARG_WORKQUEUE_ID;

        return __is_kfunc_ptr_arg_type(btf, arg, id_idx);
}

/* Report whether @arg's type resolves to a function pointer, i.e. whether
 * btf_type_resolve_func_ptr() returns a non-NULL type for it.
 * @env is not used by this helper.
 */
static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf,
                                  const struct btf_param *arg)
{
        return btf_type_resolve_func_ptr(btf, arg->type, NULL) != NULL;
}

/* Returns true if struct @t is composed only of scalars, 4 levels of nesting
 * allowed.
 *
 * Each member (modifiers skipped) must be one of:
 *  - a scalar;
 *  - an array with a non-zero element count whose element type is a scalar;
 *  - a struct that itself passes this check; @rec is the current nesting
 *    depth, and a nested struct met at @rec >= 3 is rejected with a verifier
 *    log message.
 *
 * Returns false if @t is not a struct at all.
 */
static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,
                                        const struct btf *btf,
                                        const struct btf_type *t, int rec)
{
        const struct btf_member *member;
        u32 i;

        if (!btf_type_is_struct(t))
                return false;

        for_each_member(i, t, member) {
                const struct btf_type *mtype;

                mtype = btf_type_skip_modifiers(btf, member->type, NULL);

                if (btf_type_is_struct(mtype)) {
                        if (rec >= 3) {
                                verbose(env, "max struct nesting depth exceeded\n");
                                return false;
                        }
                        if (__btf_type_is_scalar_struct(env, btf, mtype, rec + 1))
                                continue;
                        return false;
                }

                if (btf_type_is_array(mtype)) {
                        const struct btf_array *array = btf_array(mtype);

                        if (!array->nelems)
                                return false;
                        /* judge the array by its element type below */
                        mtype = btf_type_skip_modifiers(btf, array->type, NULL);
                }

                if (!btf_type_is_scalar(mtype))
                        return false;
        }

        return true;
}

/* Classification of a kfunc pointer argument, as produced by
 * get_kfunc_ptr_arg_type() from the argument's BTF type, its name suffix and
 * the register passed for it.
 */
enum kfunc_ptr_arg_type {
        KF_ARG_PTR_TO_CTX,
        KF_ARG_PTR_TO_ALLOC_BTF_ID,    /* Allocated object */
        KF_ARG_PTR_TO_REFCOUNTED_KPTR, /* Refcounted local kptr */
        KF_ARG_PTR_TO_DYNPTR,
        KF_ARG_PTR_TO_ITER,
        KF_ARG_PTR_TO_LIST_HEAD,
        KF_ARG_PTR_TO_LIST_NODE,
        KF_ARG_PTR_TO_BTF_ID,               /* Also covers reg2btf_ids conversions */
        KF_ARG_PTR_TO_MEM,
        KF_ARG_PTR_TO_MEM_SIZE,               /* Size derived from next argument, skip it */
        KF_ARG_PTR_TO_CALLBACK,
        KF_ARG_PTR_TO_RB_ROOT,
        KF_ARG_PTR_TO_RB_NODE,
        KF_ARG_PTR_TO_NULL,
        KF_ARG_PTR_TO_CONST_STR,
        KF_ARG_PTR_TO_MAP,
        KF_ARG_PTR_TO_WORKQUEUE,
};

/* Indices into special_kfunc_list[]: KF_<name> selects the BTF id of kfunc
 * <name>, e.g. special_kfunc_list[KF_bpf_rcu_read_lock].
 * NOTE(review): the enumerator order presumably has to stay in step with the
 * entry order of special_kfunc_list — confirm when adding an entry.
 */
enum special_kfunc_type {
        KF_bpf_obj_new_impl,
        KF_bpf_obj_drop_impl,
        KF_bpf_refcount_acquire_impl,
        KF_bpf_list_push_front_impl,
        KF_bpf_list_push_back_impl,
        KF_bpf_list_pop_front,
        KF_bpf_list_pop_back,
        KF_bpf_cast_to_kern_ctx,
        KF_bpf_rdonly_cast,
        KF_bpf_rcu_read_lock,
        KF_bpf_rcu_read_unlock,
        KF_bpf_rbtree_remove,
        KF_bpf_rbtree_add_impl,
        KF_bpf_rbtree_first,
        KF_bpf_dynptr_from_skb,
        KF_bpf_dynptr_from_xdp,
        KF_bpf_dynptr_slice,
        KF_bpf_dynptr_slice_rdwr,
        KF_bpf_dynptr_clone,
        KF_bpf_percpu_obj_new_impl,
        KF_bpf_percpu_obj_drop_impl,
        KF_bpf_throw,
        KF_bpf_wq_set_callback_impl,
        KF_bpf_preempt_disable,
        KF_bpf_preempt_enable,
        KF_bpf_iter_css_task_new,
        KF_bpf_session_cookie,
};

/* BTF id set of kfuncs, named special_kfunc_set.
 * It is not a one-to-one copy of special_kfunc_list: it has no entries for
 * bpf_rcu_read_lock, bpf_rcu_read_unlock, bpf_preempt_disable,
 * bpf_preempt_enable or bpf_session_cookie, and bpf_iter_css_task_new is
 * only a member when CONFIG_CGROUPS is enabled.
 */
BTF_SET_START(special_kfunc_set)
BTF_ID(func, bpf_obj_new_impl)
BTF_ID(func, bpf_obj_drop_impl)
BTF_ID(func, bpf_refcount_acquire_impl)
BTF_ID(func, bpf_list_push_front_impl)
BTF_ID(func, bpf_list_push_back_impl)
BTF_ID(func, bpf_list_pop_front)
BTF_ID(func, bpf_list_pop_back)
BTF_ID(func, bpf_cast_to_kern_ctx)
BTF_ID(func, bpf_rdonly_cast)
BTF_ID(func, bpf_rbtree_remove)
BTF_ID(func, bpf_rbtree_add_impl)
BTF_ID(func, bpf_rbtree_first)
BTF_ID(func, bpf_dynptr_from_skb)
BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
BTF_ID(func, bpf_percpu_obj_new_impl)
BTF_ID(func, bpf_percpu_obj_drop_impl)
BTF_ID(func, bpf_throw)
BTF_ID(func, bpf_wq_set_callback_impl)
#ifdef CONFIG_CGROUPS
BTF_ID(func, bpf_iter_css_task_new)
#endif
BTF_SET_END(special_kfunc_set)

/* BTF ids of the kfuncs the verifier singles out by identity. The list is
 * indexed with enum special_kfunc_type, so entries appear in the same order
 * as the KF_* enumerators.
 * NOTE(review): BTF_ID_UNUSED presumably reserves the slot when the config
 * option is off, so that later indices do not shift — confirm against the
 * BTF_ID_UNUSED definition.
 */
BTF_ID_LIST(special_kfunc_list)
BTF_ID(func, bpf_obj_new_impl)
BTF_ID(func, bpf_obj_drop_impl)
BTF_ID(func, bpf_refcount_acquire_impl)
BTF_ID(func, bpf_list_push_front_impl)
BTF_ID(func, bpf_list_push_back_impl)
BTF_ID(func, bpf_list_pop_front)
BTF_ID(func, bpf_list_pop_back)
BTF_ID(func, bpf_cast_to_kern_ctx)
BTF_ID(func, bpf_rdonly_cast)
BTF_ID(func, bpf_rcu_read_lock)
BTF_ID(func, bpf_rcu_read_unlock)
BTF_ID(func, bpf_rbtree_remove)
BTF_ID(func, bpf_rbtree_add_impl)
BTF_ID(func, bpf_rbtree_first)
BTF_ID(func, bpf_dynptr_from_skb)
BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
BTF_ID(func, bpf_percpu_obj_new_impl)
BTF_ID(func, bpf_percpu_obj_drop_impl)
BTF_ID(func, bpf_throw)
BTF_ID(func, bpf_wq_set_callback_impl)
BTF_ID(func, bpf_preempt_disable)
BTF_ID(func, bpf_preempt_enable)
#ifdef CONFIG_CGROUPS
BTF_ID(func, bpf_iter_css_task_new)
#else
BTF_ID_UNUSED
#endif
#ifdef CONFIG_BPF_EVENTS
BTF_ID(func, bpf_session_cookie)
#else
BTF_ID_UNUSED
#endif

/* Report whether the kfunc described by @meta is flagged KF_RET_NULL.
 *
 * Exception: for bpf_refcount_acquire_impl with meta->arg_owning_ref set,
 * the answer is always false regardless of the flag.
 */
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
        bool is_refcount_acquire =
                meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];

        if (is_refcount_acquire && meta->arg_owning_ref)
                return false;

        return !!(meta->kfunc_flags & KF_RET_NULL);
}

/* Report whether @meta refers to the bpf_rcu_read_lock kfunc. */
static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
{
        return special_kfunc_list[KF_bpf_rcu_read_lock] == meta->func_id;
}

/* Report whether @meta refers to the bpf_rcu_read_unlock kfunc. */
static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
{
        return special_kfunc_list[KF_bpf_rcu_read_unlock] == meta->func_id;
}

/* Report whether @meta refers to the bpf_preempt_disable kfunc. */
static bool is_kfunc_bpf_preempt_disable(struct bpf_kfunc_call_arg_meta *meta)
{
        return special_kfunc_list[KF_bpf_preempt_disable] == meta->func_id;
}

/* Report whether @meta refers to the bpf_preempt_enable kfunc. */
static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta)
{
        return special_kfunc_list[KF_bpf_preempt_enable] == meta->func_id;
}

/* Classify pointer argument number @argno of the kfunc described by @meta.
 *
 * @t:         BTF type of the argument, handed to btf_is_prog_ctx_type()
 * @ref_t:     BTF type used for the struct/scalar checks below
 *             (presumably the pointee of @t — confirm against the caller)
 * @ref_tname: name of @ref_t, used only in log messages
 * @args:      the kfunc's BTF parameter array
 * @nargs:     number of entries in @args
 *
 * The register examined is R(@argno + 1). The checks run in a fixed order
 * and the first one that matches decides the result, so their order is a
 * precedence list.
 *
 * Returns a kfunc_ptr_arg_type value, or -EINVAL after logging a message
 * when the pointee type is not supported. Note that -EINVAL is not one of
 * the enumerators, although the function's return type is the enum.
 */
static enum kfunc_ptr_arg_type
get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
                       struct bpf_kfunc_call_arg_meta *meta,
                       const struct btf_type *t, const struct btf_type *ref_t,
                       const char *ref_tname, const struct btf_param *args,
                       int argno, int nargs)
{
        u32 regno = argno + 1;
        struct bpf_reg_state *regs = cur_regs(env);
        struct bpf_reg_state *reg = &regs[regno];
        bool arg_mem_size = false;

        /* bpf_cast_to_kern_ctx always classifies its argument as ctx */
        if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx])
                return KF_ARG_PTR_TO_CTX;

        /* In this function, we verify the kfunc's BTF as per the argument type,
         * leaving the rest of the verification with respect to the register
         * type to our caller. When a set of conditions hold in the BTF type of
         * arguments, we resolve it to a known kfunc_ptr_arg_type.
         */
        if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
                return KF_ARG_PTR_TO_CTX;

        /* Classification by parameter name suffix or by pointee struct type */
        if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
                return KF_ARG_PTR_TO_ALLOC_BTF_ID;

        if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno]))
                return KF_ARG_PTR_TO_REFCOUNTED_KPTR;

        if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
                return KF_ARG_PTR_TO_DYNPTR;

        if (is_kfunc_arg_iter(meta, argno))
                return KF_ARG_PTR_TO_ITER;

        if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
                return KF_ARG_PTR_TO_LIST_HEAD;

        if (is_kfunc_arg_list_node(meta->btf, &args[argno]))
                return KF_ARG_PTR_TO_LIST_NODE;

        if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno]))
                return KF_ARG_PTR_TO_RB_ROOT;

        if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno]))
                return KF_ARG_PTR_TO_RB_NODE;

        if (is_kfunc_arg_const_str(meta->btf, &args[argno]))
                return KF_ARG_PTR_TO_CONST_STR;

        if (is_kfunc_arg_map(meta->btf, &args[argno]))
                return KF_ARG_PTR_TO_MAP;

        if (is_kfunc_arg_wq(meta->btf, &args[argno]))
                return KF_ARG_PTR_TO_WORKQUEUE;

        /* Register-driven case: the register is PTR_TO_BTF_ID, or its base
         * type has an entry in reg2btf_ids. The pointee must then be a struct.
         */
        if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
                if (!btf_type_is_struct(ref_t)) {
                        verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
                                meta->func_name, argno, btf_type_str(ref_t), ref_tname);
                        return -EINVAL;
                }
                return KF_ARG_PTR_TO_BTF_ID;
        }

        if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
                return KF_ARG_PTR_TO_CALLBACK;

        /* A "__nullable" argument counts as NULL only if the register is null */
        if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
                return KF_ARG_PTR_TO_NULL;

        /* Memory pointer followed by a "__sz"/"__szk" size argument? The
         * bounds check on argno + 1 keeps the look-ahead inside @args.
         */
        if (argno + 1 < nargs &&
            (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
             is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
                arg_mem_size = true;

        /* This is the catch all argument type of register types supported by
         * check_helper_mem_access. However, we only allow when argument type is
         * pointer to scalar, or struct composed (recursively) of scalars. When
         * arg_mem_size is true, the pointer can be void *.
         */
        if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) &&
            (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
                verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
                        argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
                return -EINVAL;
        }
        return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM;
}

/* Check that the BTF type carried by register @reg is acceptable for kfunc
 * argument @argno, whose expected type is @ref_t / @ref_id in meta->btf.
 *
 * For a PTR_TO_BTF_ID register the type comes from reg->btf / reg->btf_id;
 * for any other register it comes from btf_vmlinux via reg2btf_ids[].
 *
 * Returns 0 when btf_struct_ids_match() accepts the pair, otherwise logs the
 * expected and actual type names and returns -EINVAL.
 */
static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
                                        struct bpf_reg_state *reg,
                                        const struct btf_type *ref_t,
                                        const char *ref_tname, u32 ref_id,
                                        struct bpf_kfunc_call_arg_meta *meta,
                                        int argno)
{
        const struct btf_type *reg_ref_t;
        const char *reg_ref_tname;
        const struct btf *reg_btf;
        bool strict_type_match;
        u32 reg_ref_id;

        if (base_type(reg->type) != PTR_TO_BTF_ID) {
                reg_btf = btf_vmlinux;
                reg_ref_id = *reg2btf_ids[base_type(reg->type)];
        } else {
                reg_btf = reg->btf;
                reg_ref_id = reg->btf_id;
        }

        /* Strict type matching is enforced for kfuncs that acquire a
         * reference, for release kfuncs when the register holds a reference
         * (ref_obj_id set), and for no-cast alias types. Plain
         * KF_TRUSTED_ARGS kfuncs are deliberately _not_ strict by default, so
         * that BPF programs can pass bitwise-equivalent types without an
         * explicit cast such as bpf_cast_to_kern_ctx().
         *
         * Example:
         *
         * struct bpf_cpumask {
         *        cpumask_t cpumask;
         *        refcount_t usage;
         * };
         *
         * <linux/cpumask.h> typedefs cpumask_t to struct cpumask, so passing
         * a struct bpf_cpumask * to a kfunc that expects a struct cpumask *
         * is safe.
         *
         * This mirrors how scalars of different types but equal size are
         * accepted by kfuncs; the only difference is that
         * btf_struct_ids_match() is allowed to walk the struct at offset 0
         * and resolve types.
         */
        strict_type_match =
                is_kfunc_acquire(meta) ||
                (is_kfunc_release(meta) && reg->ref_obj_id) ||
                btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id);

        WARN_ON_ONCE(is_kfunc_trusted_args(meta) && reg->off);

        reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
        reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
        if (btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match))
                return 0;

        verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
                meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
                btf_type_str(reg_ref_t), reg_ref_tname);
        return -EINVAL;
}

/* Mark @reg as a non-owning reference by setting NON_OWN_REF in its type.
 * MEM_RCU is set as well when the register's btf_record has a
 * refcount_off >= 0.
 *
 * Returns 0 on success, or -EFAULT (with a verifier log message) if no lock
 * is active in the current verifier state or if NON_OWN_REF is already set.
 */
static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
        struct btf_record *rec = reg_btf_record(reg);

        if (!env->cur_state->active_lock.ptr) {
                verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
                return -EFAULT;
        }

        if (type_flag(reg->type) & NON_OWN_REF) {
                verbose(env, "verifier internal error: NON_OWN_REF already set\n");
                return -EFAULT;
        }

        reg->type |= NON_OWN_REF;
        if (rec->refcount_off < 0)
                return 0;

        reg->type |= MEM_RCU;
        return 0;
}

/* Convert every register holding the owning reference @ref_obj_id into a
 * non-owning reference.
 *
 * The reference must be present in the current function's acquired refs.
 * Each register (in any frame of the current verifier state) whose
 * ref_obj_id equals @ref_obj_id has its ref_obj_id cleared and is then
 * handed to ref_set_non_owning().
 *
 * Return: 0 on success; -EFAULT if @ref_obj_id is zero, if no matching
 * reference state exists, or if ref_set_non_owning() reported an internal
 * error for any of the converted registers.
 */
static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id)
{
        struct bpf_func_state *state, *unused;
        struct bpf_reg_state *reg;
        int i, err = 0;

        state = cur_func(env);

        if (!ref_obj_id) {
                verbose(env, "verifier internal error: ref_obj_id is zero for "
                             "owning -> non-owning conversion\n");
                return -EFAULT;
        }

        for (i = 0; i < state->acquired_refs; i++) {
                if (state->refs[i].id != ref_obj_id)
                        continue;

                /* Clear ref_obj_id here so release_reference doesn't clobber
                 * the whole reg
                 */
                bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
                        if (reg->ref_obj_id == ref_obj_id) {
                                int ret;

                                reg->ref_obj_id = 0;
                                /* Keep converting the remaining registers, but
                                 * remember the first failure so an internal
                                 * error is not silently swallowed.
                                 */
                                ret = ref_set_non_owning(env, reg);
                                if (ret && !err)
                                        err = ret;
                        }
                }));
                return err;
        }

        verbose(env, "verifier internal error: ref state missing for ref_obj_id\n");
        return -EFAULT;
}

/* Implementation details:
 *
 * Each register points to some region of memory, which we define as an
 * allocation. Each allocation may embed a bpf_spin_lock which protects any
 * special BPF objects (bpf_list_head, bpf_rb_root, etc.) part of the same
 * allocation. The lock and the data it protects are colocated in the same
 * memory region.
 *
 * Hence, every time a register holds a pointer value pointing to such an
 * allocation, the verifier preserves a unique reg->id for it.
 *
 * The verifier remembers the lock 'ptr' and the lock 'id' whenever
 * bpf_spin_lock is called.
 *
 * To enable this, lock state in the verifier captures two values:
 *        active_lock.ptr = Register's type specific pointer
 *        active_lock.id  = A unique ID for each register pointer value
 *
 * Currently, PTR_TO_MAP_VALUE and PTR_TO_BTF_ID | MEM_ALLOC are the two
 * supported register types.
 *
 * The active_lock.ptr in case of map values is the reg->map_ptr, and in case of
 * allocated objects is the reg->btf pointer.
 *
 * The active_lock.id is non-unique for maps supporting direct_value_addr, as we
 * can establish the provenance of the map value statically for each distinct
 * lookup into such maps. They always contain a single map value hence unique
 * IDs for each pseudo load pessimizes the algorithm and rejects valid programs.
 *
 * So, in case of global variables, they use array maps with max_entries = 1,
 * hence their active_lock.ptr becomes map_ptr and id = 0 (since they all point
 * into the same map value as max_entries is 1, as described above).
 *
 * In case of inner map lookups, the inner map pointer has same map_ptr as the
 * outer map pointer (in verifier context), but each lookup into an inner map
 * assigns a fresh reg->id to the lookup, so while lookups into distinct inner
 * maps from the same outer map share the same map_ptr as active_lock.ptr, they
 * will get different reg->id assigned to each lookup, hence different
 * active_lock.id.
 *
 * In case of allocated objects, active_lock.ptr is the reg->btf, and the
 * reg->id is a unique ID preserved after the NULL pointer check on the pointer
 * returned from bpf_obj_new. Each allocation receives a new reg->id.
 */
static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
        void *ptr;
        u32 id;

        switch ((int)reg->type) {
        case PTR_TO_MAP_VALUE:
                ptr = reg->map_ptr;
                break;
        case PTR_TO_BTF_ID | MEM_ALLOC:
                ptr = reg->btf;
                break;
        default:
                verbose(env, "verifier internal error: unknown reg type for lock check\n");
                return -EFAULT;
        }
        id = reg->id;

        if (!env->cur_state->active_lock.ptr)
                return -EINVAL;
        if (env->cur_state->active_lock.ptr != ptr ||
            env->cur_state->active_lock.id != id) {
                verbose(env, "held lock and object are not in the same allocation\n");
                return -EINVAL;
        }
        return 0;
}

/* Return true if @btf_id is one of the bpf_list push/pop kfuncs. */
static bool is_bpf_list_api_kfunc(u32 btf_id)
{
        if (btf_id == special_kfunc_list[KF_bpf_list_push_front_impl])
                return true;
        if (btf_id == special_kfunc_list[KF_bpf_list_push_back_impl])
                return true;
        if (btf_id == special_kfunc_list[KF_bpf_list_pop_front])
                return true;

        return btf_id == special_kfunc_list[KF_bpf_list_pop_back];
}

/* Return true if @btf_id is one of the bpf_rbtree add/remove/first kfuncs. */
static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
{
        if (btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl])
                return true;
        if (btf_id == special_kfunc_list[KF_bpf_rbtree_remove])
                return true;

        return btf_id == special_kfunc_list[KF_bpf_rbtree_first];
}

/* Return true if @btf_id is a list kfunc, an rbtree kfunc, or
 * bpf_refcount_acquire_impl.
 */
static bool is_bpf_graph_api_kfunc(u32 btf_id)
{
        if (is_bpf_list_api_kfunc(btf_id))
                return true;
        if (is_bpf_rbtree_api_kfunc(btf_id))
                return true;

        return btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
}

/* Return true if @btf_id is bpf_rbtree_add_impl, the only kfunc this helper
 * treats as calling a callback synchronously.
 */
static bool is_sync_callback_calling_kfunc(u32 btf_id)
{
        const u32 rbtree_add_id = special_kfunc_list[KF_bpf_rbtree_add_impl];

        return btf_id == rbtree_add_id;
}

/* Return true if @btf_id is bpf_wq_set_callback_impl, the only kfunc this
 * helper treats as calling a callback asynchronously.
 */
static bool is_async_callback_calling_kfunc(u32 btf_id)
{
        const u32 wq_set_cb_id = special_kfunc_list[KF_bpf_wq_set_callback_impl];

        return btf_id == wq_set_cb_id;
}

static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
{
        return bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
               insn->imm == special_kfunc_list[KF_bpf_throw];
}

/* Return true if @btf_id is the BTF id of bpf_wq_set_callback_impl. */
static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id)
{
        const u32 wq_set_cb_id = special_kfunc_list[KF_bpf_wq_set_callback_impl];

        return btf_id == wq_set_cb_id;
}

/* Return true if @btf_id is a kfunc that calls a callback, either
 * synchronously or asynchronously.
 */
static bool is_callback_calling_kfunc(u32 btf_id)
{
        if (is_sync_callback_calling_kfunc(btf_id))
                return true;

        return is_async_callback_calling_kfunc(btf_id);
}

/* Return true if @btf_id is one of the bpf_rbtree API kfuncs; this helper
 * currently just defers to is_bpf_rbtree_api_kfunc().
 */
static bool is_rbtree_lock_required_kfunc(u32 btf_id)
{
        bool is_rbtree_api = is_bpf_rbtree_api_kfunc(btf_id);

        return is_rbtree_api;
}

/* Check that kfunc @kfunc_btf_id belongs to the API family matching the graph
 * root type @head_field_type (list kfuncs for BPF_LIST_HEAD, rbtree kfuncs
 * for BPF_RB_ROOT).
 *
 * Return: true on a match; false, after logging a verifier internal error,
 * for a mismatch or an unexpected root type.
 */
static bool check_kfunc_is_graph_root_api(struct bpf_verifier_env *env,
                                          enum btf_field_type head_field_type,
                                          u32 kfunc_btf_id)
{
        bool matches;

        if (head_field_type == BPF_LIST_HEAD) {
                matches = is_bpf_list_api_kfunc(kfunc_btf_id);
        } else if (head_field_type == BPF_RB_ROOT) {
                matches = is_bpf_rbtree_api_kfunc(kfunc_btf_id);
        } else {
                verbose(env, "verifier internal error: unexpected graph root argument type %s\n",
                        btf_field_type_name(head_field_type));
                return false;
        }

        if (matches)
                return true;

        verbose(env, "verifier internal error: %s head arg for unknown kfunc\n",
                btf_field_type_name(head_field_type));
        return false;
}

/* Check that kfunc @kfunc_btf_id is one that takes a graph node argument of
 * type @node_field_type: list push_front/push_back for BPF_LIST_NODE, rbtree
 * remove/add for BPF_RB_NODE.
 *
 * Return: true on a match; false, after logging a verifier internal error,
 * for a mismatch or an unexpected node type.
 */
static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
                                          enum btf_field_type node_field_type,
                                          u32 kfunc_btf_id)
{
        bool matches;

        if (node_field_type == BPF_LIST_NODE) {
                matches = kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
                          kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl];
        } else if (node_field_type == BPF_RB_NODE) {
                matches = kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
                          kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
        } else {
                verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
                        btf_field_type_name(node_field_type));
                return false;
        }

        if (matches)
                return true;

        verbose(env, "verifier internal error: %s node arg for unknown kfunc\n",
                btf_field_type_name(node_field_type));
        return false;
}

/* Validate a kfunc argument that points at a graph root (@head_field_type)
 * and record the matching btf_field in *@head_field.
 *
 * Checks, in order: the kfunc's BTF is btf_vmlinux, the kfunc belongs to the
 * API for this root type, @reg has a constant offset, a field of the
 * requested type exists at that offset in the register's btf_record, the
 * allocation's lock is held, and *@head_field has not been set already.
 *
 * Return: 0 on success, -EINVAL for a rejected program, -EFAULT for a
 * verifier internal error.
 */
static int
__process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
                                   struct bpf_reg_state *reg, u32 regno,
                                   struct bpf_kfunc_call_arg_meta *meta,
                                   enum btf_field_type head_field_type,
                                   struct btf_field **head_field)
{
        struct btf_field *root_field;
        struct btf_record *record;
        const char *root_name;
        u32 root_off;

        if (meta->btf != btf_vmlinux) {
                verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n");
                return -EFAULT;
        }

        if (!check_kfunc_is_graph_root_api(env, head_field_type, meta->func_id))
                return -EFAULT;

        root_name = btf_field_type_name(head_field_type);
        if (!tnum_is_const(reg->var_off)) {
                verbose(env,
                        "R%d doesn't have constant offset. %s has to be at the constant offset\n",
                        regno, root_name);
                return -EINVAL;
        }

        /* Look the root up at the register's fixed plus constant offset. */
        record = reg_btf_record(reg);
        root_off = reg->off + reg->var_off.value;
        root_field = btf_record_find(record, root_off, head_field_type);
        if (!root_field) {
                verbose(env, "%s not found at offset=%u\n", root_name, root_off);
                return -EINVAL;
        }

        /* Graph roots may only be used while the allocation's bpf_spin_lock is held */
        if (check_reg_allocation_locked(env, reg)) {
                verbose(env, "bpf_spin_lock at off=%d must be held for %s\n",
                        record->spin_lock_off, root_name);
                return -EINVAL;
        }

        /* A second root argument for the same call is an internal error. */
        if (*head_field) {
                verbose(env, "verifier internal error: repeating %s arg\n", root_name);
                return -EFAULT;
        }

        *head_field = root_field;
        return 0;
}

/* Validate a bpf_list_head kfunc argument; the field that is found gets
 * stored in meta->arg_list_head.field.
 */
static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env,
                                           struct bpf_reg_state *reg, u32 regno,
                                           struct bpf_kfunc_call_arg_meta *meta)
{
        struct btf_field **head_slot = &meta->arg_list_head.field;

        return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta,
                                                  BPF_LIST_HEAD, head_slot);
}

/* Validate a bpf_rb_root kfunc argument; the field that is found gets stored
 * in meta->arg_rbtree_root.field.
 */
static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env,
                                             struct bpf_reg_state *reg, u32 regno,
                                             struct bpf_kfunc_call_arg_meta *meta)
{
        struct btf_field **root_slot = &meta->arg_rbtree_root.field;

        return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta,
                                                  BPF_RB_ROOT, root_slot);
}

/* Validate a kfunc argument that points at a graph node (@node_field_type)
 * against the graph root recorded for the same call.
 *
 * Despite its name, *@node_field is read as the root's btf_field (its
 * graph_root member is used); the callers in this file pass the slot that
 * __process_kf_arg_ptr_to_graph_root() fills in.
 *
 * Checks, in order: the kfunc's BTF is btf_vmlinux, the kfunc takes this node
 * type, @reg has a constant offset, a node field exists at that offset, the
 * register's struct type matches the root's value type, and the node offset
 * equals the offset the root expects. On a type match, the register's BTF and
 * BTF id are stored in @meta.
 *
 * Return: 0 on success, -EINVAL for a rejected program, -EFAULT for a
 * verifier internal error.
 */
static int
__process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
                                   struct bpf_reg_state *reg, u32 regno,
                                   struct bpf_kfunc_call_arg_meta *meta,
                                   enum btf_field_type head_field_type,
                                   enum btf_field_type node_field_type,
                                   struct btf_field **node_field)
{
        const struct btf_type *root_value_t, *reg_t;
        struct btf_field *root_field;
        const char *node_name;
        u32 node_off;

        if (meta->btf != btf_vmlinux) {
                verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n");
                return -EFAULT;
        }

        if (!check_kfunc_is_graph_node_api(env, node_field_type, meta->func_id))
                return -EFAULT;

        node_name = btf_field_type_name(node_field_type);
        if (!tnum_is_const(reg->var_off)) {
                verbose(env,
                        "R%d doesn't have constant offset. %s has to be at the constant offset\n",
                        regno, node_name);
                return -EINVAL;
        }

        /* Only the existence of a node field at this offset matters here. */
        node_off = reg->off + reg->var_off.value;
        if (!reg_find_field_offset(reg, node_off, node_field_type)) {
                verbose(env, "%s not found at offset=%u\n", node_name, node_off);
                return -EINVAL;
        }

        /* From here on, work with the root field recorded for this call. */
        root_field = *node_field;

        root_value_t = btf_type_by_id(root_field->graph_root.btf,
                                      root_field->graph_root.value_btf_id);
        reg_t = btf_type_by_id(reg->btf, reg->btf_id);
        if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0,
                                  root_field->graph_root.btf,
                                  root_field->graph_root.value_btf_id, true)) {
                verbose(env, "operation on %s expects arg#1 %s at offset=%d "
                        "in struct %s, but arg is at offset=%d in struct %s\n",
                        btf_field_type_name(head_field_type),
                        btf_field_type_name(node_field_type),
                        root_field->graph_root.node_offset,
                        btf_name_by_offset(root_field->graph_root.btf,
                                           root_value_t->name_off),
                        node_off, btf_name_by_offset(reg->btf, reg_t->name_off));
                return -EINVAL;
        }

        meta->arg_btf = reg->btf;
        meta->arg_btf_id = reg->btf_id;

        if (node_off == root_field->graph_root.node_offset)
                return 0;

        verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n",
                node_off, btf_field_type_name(node_field_type),
                root_field->graph_root.node_offset,
                btf_name_by_offset(root_field->graph_root.btf, root_value_t->name_off));
        return -EINVAL;
}

/* Validate a bpf_list_node kfunc argument against the list head field held
 * in meta->arg_list_head.field.
 */
static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
                                           struct bpf_reg_state *reg, u32 regno,
                                           struct bpf_kfunc_call_arg_meta *meta)
{
        struct btf_field **head_slot = &meta->arg_list_head.field;

        return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
                                                  BPF_LIST_HEAD, BPF_LIST_NODE,
                                                  head_slot);
}

/* Validate a bpf_rb_node kfunc argument against the rbtree root field held
 * in meta->arg_rbtree_root.field.
 */
static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
                                             struct bpf_reg_state *reg, u32 regno,
                                             struct bpf_kfunc_call_arg_meta *meta)
{
        struct btf_field **root_slot = &meta->arg_rbtree_root.field;

        return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
                                                  BPF_RB_ROOT, BPF_RB_NODE,
                                                  root_slot);
}

/*
 * css_task iter allowlist is needed to avoid deadlocking on css_set_lock.
 * LSM hooks and iters (both sleepable and non-sleepable) are safe.
 * Any sleepable progs are also safe since bpf_check_attach_target() enforces
 * that they can only be attached to some specific hook points.
 */
static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env)
{
        enum bpf_prog_type prog_type = resolve_prog_type(env->prog);

        switch (prog_type) {
        case BPF_PROG_TYPE_LSM:
                return true;
        case BPF_PROG_TYPE_TRACING:
                if (env->prog->expected_attach_type == BPF_TRACE_ITER)
                        return true;
                fallthrough;
        default:
                return in_sleepable(env);
        }
}

static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
                            int insn_idx)
{
        const char *func_name = meta->func_name, *ref_tname;
        const struct btf *btf = meta->btf;
        const struct btf_param *args;
        struct btf_record *rec;
        u32 i, nargs;
        int ret;

        args = (const struct btf_param *)(meta->func_proto + 1);
        nargs = btf_type_vlen(meta->func_proto);
        if (nargs > MAX_BPF_FUNC_REG_ARGS) {
                verbose(env, "Function %s has %d > %d args\n", func_name, nargs,
                        MAX_BPF_FUNC_REG_ARGS);
                return -EINVAL;
        }

        /* Check that BTF function arguments match actual types that the
         * verifier sees.
         */
        for (i = 0; i < nargs; i++) {
                struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[i + 1];
                const struct btf_type *t, *ref_t, *resolve_ret;
                enum bpf_arg_type arg_type = ARG_DONTCARE;
                u32 regno = i + 1, ref_id, type_size;
                bool is_ret_buf_sz = false;
                int kf_arg_type;

                t = btf_type_skip_modifiers(btf, args[i].type, NULL);

                if (is_kfunc_arg_ignore(btf, &args[i]))
                        continue;

                if (btf_type_is_scalar(t)) {
                        if (reg->type != SCALAR_VALUE) {
                                verbose(env, "R%d is not a scalar\n", regno);
                                return -EINVAL;
                        }

                        if (is_kfunc_arg_constant(meta->btf, &args[i])) {
                                if (meta->arg_constant.found) {
                                        verbose(env, "verifier internal error: only one constant argument permitted\n");
                                        return -EFAULT;
                                }
                                if (!tnum_is_const(reg->var_off)) {
                                        verbose(env, "R%d must be a known constant\n", regno);
                                        return -EINVAL;
                                }
                                ret = mark_chain_precision(env, regno);
                                if (ret < 0)
                                        return ret;
                                meta->arg_constant.found = true;
                                meta->arg_constant.value = reg->var_off.value;
                        } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdonly_buf_size")) {
                                meta->r0_rdonly = true;
                                is_ret_buf_sz = true;
                        } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdwr_buf_size")) {
                                is_ret_buf_sz = true;
                        }

                        if (is_ret_buf_sz) {
                                if (meta->r0_size) {
                                        verbose(env, "2 or more rdonly/rdwr_buf_size parameters for kfunc");
                                        return -EINVAL;
                                }

                                if (!tnum_is_const(reg->var_off)) {
                                        verbose(env, "R%d is not a const\n", regno);
                                        return -EINVAL;
                                }

                                meta->r0_size = reg->var_off.value;
                                ret = mark_chain_precision(env, regno);
                                if (ret)
                                        return ret;
                        }
                        continue;
                }

                if (!btf_type_is_ptr(t)) {
                        verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t));
                        return -EINVAL;
                }

                if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
                    (register_is_null(reg) || type_may_be_null(reg->type)) &&
                        !is_kfunc_arg_nullable(meta->btf, &args[i])) {
                        verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
                        return -EACCES;
                }

                if (reg->ref_obj_id) {
                        if (is_kfunc_release(meta) && meta->ref_obj_id) {
                                verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
                                        regno, reg->ref_obj_id,
                                        meta->ref_obj_id);
                                return -EFAULT;
                        }
                        meta->ref_obj_id = reg->ref_obj_id;
                        if (is_kfunc_release(meta))
                                meta->release_regno = regno;
                }

                ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
                ref_tname = btf_name_by_offset(btf, ref_t->name_off);

                kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs);
                if (kf_arg_type < 0)
                        return kf_arg_type;

                switch (kf_arg_type) {
                case KF_ARG_PTR_TO_NULL:
                        continue;
                case KF_ARG_PTR_TO_MAP:
                        if (!reg->map_ptr) {
                                verbose(env, "pointer in R%d isn't map pointer\n", regno);
                                return -EINVAL;
                        }
                        if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) {
                                /* Use map_uid (which is unique id of inner map) to reject:
                                 * inner_map1 = bpf_map_lookup_elem(outer_map, key1)
                                 * inner_map2 = bpf_map_lookup_elem(outer_map, key2)
                                 * if (inner_map1 && inner_map2) {
                                 *     wq = bpf_map_lookup_elem(inner_map1);
                                 *     if (wq)
                                 *         // mismatch would have been allowed
                                 *         bpf_wq_init(wq, inner_map2);
                                 * }
                                 *
                                 * Comparing map_ptr is enough to distinguish normal and outer maps.
                                 */
                                if (meta->map.ptr != reg->map_ptr ||
                                    meta->map.uid != reg->map_uid) {
                                        verbose(env,
                                                "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
                                                meta->map.uid, reg->map_uid);
                                        return -EINVAL;
                                }
                        }
                        meta->map.ptr = reg->map_ptr;
                        meta->map.uid = reg->map_uid;
                        fallthrough;
                case KF_ARG_PTR_TO_ALLOC_BTF_ID:
                case KF_ARG_PTR_TO_BTF_ID:
                        if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
                                break;

                        if (!is_trusted_reg(reg)) {
                                if (!is_kfunc_rcu(meta)) {
                                        verbose(env, "R%d must be referenced or trusted\n", regno);
                                        return -EINVAL;
                                }
                                if (!is_rcu_reg(reg)) {
                                        verbose(env, "R%d must be a rcu pointer\n", regno);
                                        return -EINVAL;
                                }
                        }

                        fallthrough;
                case KF_ARG_PTR_TO_CTX:
                        /* Trusted arguments have the same offset checks as release arguments */
                        arg_type |= OBJ_RELEASE;
                        break;
                case KF_ARG_PTR_TO_DYNPTR:
                case KF_ARG_PTR_TO_ITER:
                case KF_ARG_PTR_TO_LIST_HEAD:
                case KF_ARG_PTR_TO_LIST_NODE:
                case KF_ARG_PTR_TO_RB_ROOT:
                case KF_ARG_PTR_TO_RB_NODE:
                case KF_ARG_PTR_TO_MEM:
                case KF_ARG_PTR_TO_MEM_SIZE:
                case KF_ARG_PTR_TO_CALLBACK:
                case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
                case KF_ARG_PTR_TO_CONST_STR:
                case KF_ARG_PTR_TO_WORKQUEUE:
                        /* Trusted by default */
                        break;
                default:
                        WARN_ON_ONCE(1);
                        return -EFAULT;
                }

                if (is_kfunc_release(meta) && reg->ref_obj_id)
                        arg_type |= OBJ_RELEASE;
                ret = check_func_arg_reg_off(env, reg, regno, arg_type);
                if (ret < 0)
                        return ret;

                switch (kf_arg_type) {
                case KF_ARG_PTR_TO_CTX:
                        if (reg->type != PTR_TO_CTX) {
                                verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t));
                                return -EINVAL;
                        }

                        if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
                                ret = get_kern_ctx_btf_id(&env->log, resolve_prog_type(env->prog));
                                if (ret < 0)
                                        return -EINVAL;
                                meta->ret_btf_id  = ret;
                        }
                        break;
                case KF_ARG_PTR_TO_ALLOC_BTF_ID:
                        if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) {
                                if (meta->func_id != special_kfunc_list[KF_bpf_obj_drop_impl]) {
                                        verbose(env, "arg#%d expected for bpf_obj_drop_impl()\n", i);
                                        return -EINVAL;
                                }
                        } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) {
                                if (meta->func_id != special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
                                        verbose(env, "arg#%d expected for bpf_percpu_obj_drop_impl()\n", i);
                                        return -EINVAL;
                                }
                        } else {
                                verbose(env, "arg#%d expected pointer to allocated object\n", i);
                                return -EINVAL;
                        }
                        if (!reg->ref_obj_id) {
                                verbose(env, "allocated object must be referenced\n");
                                return -EINVAL;
                        }
                        if (meta->btf == btf_vmlinux) {
                                meta->arg_btf = reg->btf;
                                meta->arg_btf_id = reg->btf_id;
                        }
                        break;
                case KF_ARG_PTR_TO_DYNPTR:
                {
                        enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
                        int clone_ref_obj_id = 0;

                        if (reg->type != PTR_TO_STACK &&
                            reg->type != CONST_PTR_TO_DYNPTR) {
                                verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i);
                                return -EINVAL;
                        }

                        if (reg->type == CONST_PTR_TO_DYNPTR)
                                dynptr_arg_type |= MEM_RDONLY;

                        if (is_kfunc_arg_uninit(btf, &args[i]))
                                dynptr_arg_type |= MEM_UNINIT;

                        if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
                                dynptr_arg_type |= DYNPTR_TYPE_SKB;
                        } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) {
                                dynptr_arg_type |= DYNPTR_TYPE_XDP;
                        } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] &&
                                   (dynptr_arg_type & MEM_UNINIT)) {
                                enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type;

                                if (parent_type == BPF_DYNPTR_TYPE_INVALID) {
                                        verbose(env, "verifier internal error: no dynptr type for parent of clone\n");
                                        return -EFAULT;
                                }

                                dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type);
                                clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id;
                                if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) {
                                        verbose(env, "verifier internal error: missing ref obj id for parent of clone\n");
                                        return -EFAULT;
                                }
                        }

                        ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id);
                        if (ret < 0)
                                return ret;

                        if (!(dynptr_arg_type & MEM_UNINIT)) {
                                int id = dynptr_id(env, reg);

                                if (id < 0) {
                                        verbose(env, "verifier internal error: failed to obtain dynptr id\n");
                                        return id;
                                }
                                meta->initialized_dynptr.id = id;
                                meta->initialized_dynptr.type = dynptr_get_type(env, reg);
                                meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg);
                        }

                        break;
                }
                case KF_ARG_PTR_TO_ITER:
                        if (meta->func_id == special_kfunc_list[KF_bpf_iter_css_task_new]) {
                                if (!check_css_task_iter_allowlist(env)) {
                                        verbose(env, "css_task_iter is only allowed in bpf_lsm, bpf_iter and sleepable progs\n");
                                        return -EINVAL;
                                }
                        }
                        ret = process_iter_arg(env, regno, insn_idx, meta);
                        if (ret < 0)
                                return ret;
                        break;
                case KF_ARG_PTR_TO_LIST_HEAD:
                        if (reg->type != PTR_TO_MAP_VALUE &&
                            reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
                                verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
                                return -EINVAL;
                        }
                        if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
                                verbose(env, "allocated object must be referenced\n");
                                return -EINVAL;
                        }
                        ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta);
                        if (ret < 0)
                                return ret;
                        break;
                case KF_ARG_PTR_TO_RB_ROOT:
                        if (reg->type != PTR_TO_MAP_VALUE &&
                            reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
                                verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
                                return -EINVAL;
                        }
                        if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
                                verbose(env, "allocated object must be referenced\n");
                                return -EINVAL;
                        }
                        ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta);
                        if (ret < 0)
                                return ret;
                        break;
                case KF_ARG_PTR_TO_LIST_NODE:
                        if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
                                verbose(env, "arg#%d expected pointer to allocated object\n", i);
                                return -EINVAL;
                        }
                        if (!reg->ref_obj_id) {
                                verbose(env, "allocated object must be referenced\n");
                                return -EINVAL;
                        }
                        ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta);
                        if (ret < 0)
                                return ret;
                        break;
                case KF_ARG_PTR_TO_RB_NODE:
                        if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) {
                                if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) {
                                        verbose(env, "rbtree_remove node input must be non-owning ref\n");
                                        return -EINVAL;
                                }
                                if (in_rbtree_lock_required_cb(env)) {
                                        verbose(env, "rbtree_remove not allowed in rbtree cb\n");
                                        return -EINVAL;
                                }
                        } else {
                                if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
                                        verbose(env, "arg#%d expected pointer to allocated object\n", i);
                                        return -EINVAL;
                                }
                                if (!reg->ref_obj_id) {
                                        verbose(env, "allocated object must be referenced\n");
                                        return -EINVAL;
                                }
                        }

                        ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta);
                        if (ret < 0)
                                return ret;
                        break;
                case KF_ARG_PTR_TO_MAP:
                        /* If argument has '__map' suffix expect 'struct bpf_map *' */
                        ref_id = *reg2btf_ids[CONST_PTR_TO_MAP];
                        ref_t = btf_type_by_id(btf_vmlinux, ref_id);
                        ref_tname = btf_name_by_offset(btf, ref_t->name_off);
                        fallthrough;
                case KF_ARG_PTR_TO_BTF_ID:
                        /* Only base_type is checked, further checks are done here */
                        if ((base_type(reg->type) != PTR_TO_BTF_ID ||
                             (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) &&
                            !reg2btf_ids[base_type(reg->type)]) {
                                verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type));
                                verbose(env, "expected %s or socket\n",
                                        reg_type_str(env, base_type(reg->type) |
                                                          (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS)));
                                return -EINVAL;
                        }
                        ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i);
                        if (ret < 0)
                                return ret;
                        break;
                case KF_ARG_PTR_TO_MEM:
                        resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
                        if (IS_ERR(resolve_ret)) {
                                verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
                                        i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret));
                                return -EINVAL;
                        }
                        ret = check_mem_reg(env, reg, regno, type_size);
                        if (ret < 0)
                                return ret;
                        break;
                case KF_ARG_PTR_TO_MEM_SIZE:
                {
                        struct bpf_reg_state *buff_reg = &regs[regno];
                        const struct btf_param *buff_arg = &args[i];
                        struct bpf_reg_state *size_reg = &regs[regno + 1];
                        const struct btf_param *size_arg = &args[i + 1];

                        if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) {
                                ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
                                if (ret < 0) {
                                        verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
                                        return ret;
                                }
                        }

                        if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) {
                                if (meta->arg_constant.found) {
                                        verbose(env, "verifier internal error: only one constant argument permitted\n");
                                        return -EFAULT;
                                }
                                if (!tnum_is_const(size_reg->var_off)) {
                                        verbose(env, "R%d must be a known constant\n", regno + 1);
                                        return -EINVAL;
                                }
                                meta->arg_constant.found = true;
                                meta->arg_constant.value = size_reg->var_off.value;
                        }

                        /* Skip next '__sz' or '__szk' argument */
                        i++;
                        break;
                }
                case KF_ARG_PTR_TO_CALLBACK:
                        if (reg->type != PTR_TO_FUNC) {
                                verbose(env, "arg%d expected pointer to func\n", i);
                                return -EINVAL;
                        }
                        meta->subprogno = reg->subprogno;
                        break;
                case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
                        if (!type_is_ptr_alloc_obj(reg->type)) {
                                verbose(env, "arg#%d is neither owning or non-owning ref\n", i);
                                return -EINVAL;
                        }
                        if (!type_is_non_owning_ref(reg->type))
                                meta->arg_owning_ref = true;

                        rec = reg_btf_record(reg);
                        if (!rec) {
                                verbose(env, "verifier internal error: Couldn't find btf_record\n");
                                return -EFAULT;
                        }

                        if (rec->refcount_off < 0) {
                                verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i);
                                return -EINVAL;
                        }

                        meta->arg_btf = reg->btf;
                        meta->arg_btf_id = reg->btf_id;
                        break;
                case KF_ARG_PTR_TO_CONST_STR:
                        if (reg->type != PTR_TO_MAP_VALUE) {
                                verbose(env, "arg#%d doesn't point to a const string\n", i);
                                return -EINVAL;
                        }
                        ret = check_reg_const_str(env, reg, regno);
                        if (ret)
                                return ret;
                        break;
                case KF_ARG_PTR_TO_WORKQUEUE:
                        if (reg->type != PTR_TO_MAP_VALUE) {
                                verbose(env, "arg#%d doesn't point to a map value\n", i);
                                return -EINVAL;
                        }
                        ret = process_wq_func(env, regno, meta);
                        if (ret < 0)
                                return ret;
                        break;
                }
        }

        if (is_kfunc_release(meta) && !meta->release_regno) {
                verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
                        func_name);
                return -EINVAL;
        }

        return 0;
}

/* Look up the kfunc targeted by a call instruction and populate @meta.
 *
 * @env:        verifier environment (supplies the program for the
 *              allow-list lookup)
 * @insn:       call instruction; insn->imm is used as the BTF id of the
 *              kfunc and insn->off selects the BTF it is described in
 * @meta:       output; zeroed and then filled with btf, func_id,
 *              kfunc_flags, func_proto and func_name on success
 * @kfunc_name: optional output; set to NULL on entry and to the kfunc
 *              name as soon as it has been resolved, so that it is
 *              available to the caller even when -EACCES is returned
 *
 * Returns 0 on success, -EINVAL when insn->imm is zero, the error
 * encoded in the pointer returned by find_kfunc_desc_btf(), or -EACCES
 * when btf_kfunc_id_set_contains() reports no flags for this kfunc.
 * @meta is left untouched on every error path.
 */
static int fetch_kfunc_meta(struct bpf_verifier_env *env,
                            struct bpf_insn *insn,
                            struct bpf_kfunc_call_arg_meta *meta,
                            const char **kfunc_name)
{
        const struct btf_type *kfunc_t, *proto_t;
        struct btf *btf;
        const char *name;
        u32 *flags;
        u32 id;

        if (kfunc_name)
                *kfunc_name = NULL;

        if (!insn->imm)
                return -EINVAL;

        btf = find_kfunc_desc_btf(env, insn->off);
        if (IS_ERR(btf))
                return PTR_ERR(btf);

        id = insn->imm;
        kfunc_t = btf_type_by_id(btf, id);
        name = btf_name_by_offset(btf, kfunc_t->name_off);

        /* Publish the name before the permission check below, so the
         * caller can mention it in its diagnostics.
         */
        if (kfunc_name)
                *kfunc_name = name;
        proto_t = btf_type_by_id(btf, kfunc_t->type);

        flags = btf_kfunc_id_set_contains(btf, id, env->prog);
        if (!flags)
                return -EACCES;

        memset(meta, 0, sizeof(*meta));
        meta->btf = btf;
        meta->func_id = id;
        meta->kfunc_flags = *flags;
        meta->func_proto = proto_t;
        meta->func_name = name;

        return 0;
}

/* Forward declaration: check_kfunc_call() below calls this for the
 * bpf_throw kfunc (with BPF_REG_1 / "R1") before a definition is visible.
 */
static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name);

/* Verify one kfunc call instruction and model its effect on the current
 * verifier state.
 *
 * @env:        verifier environment
 * @insn:       the call instruction; insn->imm identifies the kfunc
 * @insn_idx_p: points at the index of @insn; only read by this function
 *
 * The function proceeds in this order:
 *   1. resolve the kfunc via fetch_kfunc_meta() and reject calls that are
 *      not allowed, destructive without CAP_SYS_BOOT, or sleepable from a
 *      context where in_sleepable() is false;
 *   2. check the arguments with check_kfunc_args();
 *   3. push callback verification for bpf_rbtree_add_impl and the
 *      bpf_wq set-callback kfunc;
 *   4. update the RCU read-lock and preempt-disable tracking in
 *      env->cur_state;
 *   5. release references (release kfuncs, and list/rbtree insertion
 *      which converts the owning reference to a non-owning one first);
 *   6. handle bpf_throw;
 *   7. clobber the caller-saved registers and set up R0 according to the
 *      kfunc's BTF return type;
 *   8. record the register size of every argument and, for iterator
 *      "next" kfuncs, run process_iter_next_call().
 *
 * Returns 0 on success (including the deferred case where insn->imm is 0)
 * or an error code from this function or one of its callees.
 */
static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
                            int *insn_idx_p)
{
        bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable;
        u32 i, nargs, ptr_type_id, release_ref_obj_id;
        struct bpf_reg_state *regs = cur_regs(env);
        const char *func_name, *ptr_type_name;
        const struct btf_type *t, *ptr_type;
        struct bpf_kfunc_call_arg_meta meta;
        struct bpf_insn_aux_data *insn_aux;
        int err, insn_idx = *insn_idx_p;
        const struct btf_param *args;
        const struct btf_type *ret_t;
        struct btf *desc_btf;

        /* skip for now, but return error when we find this in fixup_kfunc_call */
        if (!insn->imm)
                return 0;

        /* fetch_kfunc_meta() sets func_name before its permission check, so
         * the name is available for the -EACCES message.
         */
        err = fetch_kfunc_meta(env, insn, &meta, &func_name);
        if (err == -EACCES && func_name)
                verbose(env, "calling kernel function %s is not allowed\n", func_name);
        if (err)
                return err;
        desc_btf = meta.btf;
        insn_aux = &env->insn_aux_data[insn_idx];

        insn_aux->is_iter_next = is_iter_next_kfunc(&meta);

        if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
                verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
                return -EACCES;
        }

        sleepable = is_kfunc_sleepable(&meta);
        if (sleepable && !in_sleepable(env)) {
                verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name);
                return -EACCES;
        }

        /* Check the arguments */
        err = check_kfunc_args(env, &meta, insn_idx);
        if (err < 0)
                return err;

        /* bpf_rbtree_add_impl takes a callback: queue it for verification. */
        if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
                err = push_callback_call(env, insn, insn_idx, meta.subprogno,
                                         set_rbtree_add_callback_state);
                if (err) {
                        verbose(env, "kfunc %s#%d failed callback verification\n",
                                func_name, meta.func_id);
                        return err;
                }
        }

        /* bpf_session_cookie: pre-set the R0 memory size (u64, writable) that
         * the non-struct pointer return path below consumes.
         */
        if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) {
                meta.r0_size = sizeof(u64);
                meta.r0_rdonly = false;
        }

        /* The bpf_wq set-callback kfunc also takes a callback to verify. */
        if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) {
                err = push_callback_call(env, insn, insn_idx, meta.subprogno,
                                         set_timer_callback_state);
                if (err) {
                        verbose(env, "kfunc %s#%d failed callback verification\n",
                                func_name, meta.func_id);
                        return err;
                }
        }

        rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
        rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);

        preempt_disable = is_kfunc_bpf_preempt_disable(&meta);
        preempt_enable = is_kfunc_bpf_preempt_enable(&meta);

        /* RCU read-lock tracking. active_rcu_lock is a boolean, so nesting is
         * rejected. On unlock, every register selected by clear_mask that
         * carries MEM_RCU loses MEM_RCU and PTR_MAYBE_NULL and becomes
         * PTR_UNTRUSTED. Sleepable kfuncs are rejected while the lock is held.
         */
        if (env->cur_state->active_rcu_lock) {
                struct bpf_func_state *state;
                struct bpf_reg_state *reg;
                u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);

                if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
                        verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
                        return -EACCES;
                }

                if (rcu_lock) {
                        verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
                        return -EINVAL;
                } else if (rcu_unlock) {
                        bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({
                                if (reg->type & MEM_RCU) {
                                        reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
                                        reg->type |= PTR_UNTRUSTED;
                                }
                        }));
                        env->cur_state->active_rcu_lock = false;
                } else if (sleepable) {
                        verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
                        return -EACCES;
                }
        } else if (rcu_lock) {
                env->cur_state->active_rcu_lock = true;
        } else if (rcu_unlock) {
                verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
                return -EINVAL;
        }

        /* Preempt-disable tracking. Unlike the RCU flag above,
         * active_preempt_lock is a counter, so disable/enable pairs may nest.
         * Sleepable kfuncs are rejected while the counter is non-zero.
         */
        if (env->cur_state->active_preempt_lock) {
                if (preempt_disable) {
                        env->cur_state->active_preempt_lock++;
                } else if (preempt_enable) {
                        env->cur_state->active_preempt_lock--;
                } else if (sleepable) {
                        verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name);
                        return -EACCES;
                }
        } else if (preempt_disable) {
                env->cur_state->active_preempt_lock++;
        } else if (preempt_enable) {
                verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name);
                return -EINVAL;
        }

        /* In case of release function, we get register number of refcounted
         * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
         */
        if (meta.release_regno) {
                err = release_reference(env, regs[meta.release_regno].ref_obj_id);
                if (err) {
                        verbose(env, "kfunc %s#%d reference has not been acquired before\n",
                                func_name, meta.func_id);
                        return err;
                }
        }

        /* List push / rbtree add take the node in R2. Remember its offset and
         * struct meta in the insn aux data, convert the owning reference to a
         * non-owning one, and only then release the reference.
         */
        if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
            meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
            meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
                release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
                insn_aux->insert_off = regs[BPF_REG_2].off;
                insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id);
                err = ref_convert_owning_non_owning(env, release_ref_obj_id);
                if (err) {
                        verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
                                func_name, meta.func_id);
                        return err;
                }

                err = release_reference(env, release_ref_obj_id);
                if (err) {
                        verbose(env, "kfunc %s#%d reference has not been acquired before\n",
                                func_name, meta.func_id);
                        return err;
                }
        }

        if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
                if (!bpf_jit_supports_exceptions()) {
                        verbose(env, "JIT does not support calling kfunc %s#%d\n",
                                func_name, meta.func_id);
                        return -ENOTSUPP;
                }
                env->seen_exception = true;

                /* In the case of the default callback, the cookie value passed
                 * to bpf_throw becomes the return value of the program.
                 */
                if (!env->exception_callback_subprog) {
                        err = check_return_code(env, BPF_REG_1, "R1");
                        if (err < 0)
                                return err;
                }
        }

        /* Argument registers are no longer needed: mark every caller-saved
         * register as uninitialized. R0 is rebuilt from the return type below.
         */
        for (i = 0; i < CALLER_SAVED_REGS; i++)
                mark_reg_not_init(env, regs, caller_saved[i]);

        /* Check return type */
        t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);

        if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
                /* Only exception is bpf_obj_new_impl */
                if (meta.btf != btf_vmlinux ||
                    (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] &&
                     meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] &&
                     meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) {
                        verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
                        return -EINVAL;
                }
        }

        /* Set up R0: scalar, pointer (special kfuncs, void *, non-struct
         * pointee, struct pointee), or void.
         */
        if (btf_type_is_scalar(t)) {
                mark_reg_unknown(env, regs, BPF_REG_0);
                mark_btf_func_reg_size(env, BPF_REG_0, t->size);
        } else if (btf_type_is_ptr(t)) {
                ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);

                if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
                        if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
                            meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
                                struct btf_struct_meta *struct_meta;
                                struct btf *ret_btf;
                                u32 ret_btf_id;

                                if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
                                        return -ENOMEM;

                                /* The constant argument is used as a type ID in
                                 * the program's BTF; it must fit in 32 bits.
                                 */
                                if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
                                        verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
                                        return -EINVAL;
                                }

                                ret_btf = env->prog->aux->btf;
                                ret_btf_id = meta.arg_constant.value;

                                /* This may be NULL due to user not supplying a BTF */
                                if (!ret_btf) {
                                        verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n");
                                        return -EINVAL;
                                }

                                ret_t = btf_type_by_id(ret_btf, ret_btf_id);
                                if (!ret_t || !__btf_type_is_struct(ret_t)) {
                                        verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n");
                                        return -EINVAL;
                                }

                                if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
                                        if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
                                                verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
                                                        ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
                                                return -EINVAL;
                                        }

                                        /* Initialize the global percpu allocator on
                                         * first use; the flag is re-checked under
                                         * bpf_percpu_ma_lock before initializing.
                                         */
                                        if (!bpf_global_percpu_ma_set) {
                                                mutex_lock(&bpf_percpu_ma_lock);
                                                if (!bpf_global_percpu_ma_set) {
                                                        /* Charge memory allocated with bpf_global_percpu_ma to
                                                         * root memcg. The obj_cgroup for root memcg is NULL.
                                                         */
                                                        err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
                                                        if (!err)
                                                                bpf_global_percpu_ma_set = true;
                                                }
                                                mutex_unlock(&bpf_percpu_ma_lock);
                                                if (err)
                                                        return err;
                                        }

                                        mutex_lock(&bpf_percpu_ma_lock);
                                        err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
                                        mutex_unlock(&bpf_percpu_ma_lock);
                                        if (err)
                                                return err;
                                }

                                struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
                                if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
                                        if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
                                                verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
                                                return -EINVAL;
                                        }

                                        if (struct_meta) {
                                                verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n");
                                                return -EINVAL;
                                        }
                                }

                                mark_reg_known_zero(env, regs, BPF_REG_0);
                                regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
                                regs[BPF_REG_0].btf = ret_btf;
                                regs[BPF_REG_0].btf_id = ret_btf_id;
                                if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
                                        regs[BPF_REG_0].type |= MEM_PERCPU;

                                insn_aux->obj_new_size = ret_t->size;
                                insn_aux->kptr_struct_meta = struct_meta;
                        } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
                                /* R0 gets the same BTF type as the argument that
                                 * check_kfunc_args() recorded in meta.
                                 */
                                mark_reg_known_zero(env, regs, BPF_REG_0);
                                regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
                                regs[BPF_REG_0].btf = meta.arg_btf;
                                regs[BPF_REG_0].btf_id = meta.arg_btf_id;

                                insn_aux->kptr_struct_meta =
                                        btf_find_struct_meta(meta.arg_btf,
                                                             meta.arg_btf_id);
                        } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
                                   meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
                                struct btf_field *field = meta.arg_list_head.field;

                                mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
                        } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
                                   meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
                                struct btf_field *field = meta.arg_rbtree_root.field;

                                mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
                        } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
                                mark_reg_known_zero(env, regs, BPF_REG_0);
                                regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
                                regs[BPF_REG_0].btf = desc_btf;
                                regs[BPF_REG_0].btf_id = meta.ret_btf_id;
                        } else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
                                /* The constant argument is the target type ID,
                                 * looked up in the kfunc's own BTF.
                                 */
                                ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value);
                                if (!ret_t || !btf_type_is_struct(ret_t)) {
                                        verbose(env,
                                                "kfunc bpf_rdonly_cast type ID argument must be of a struct\n");
                                        return -EINVAL;
                                }

                                mark_reg_known_zero(env, regs, BPF_REG_0);
                                regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
                                regs[BPF_REG_0].btf = desc_btf;
                                regs[BPF_REG_0].btf_id = meta.arg_constant.value;
                        } else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
                                   meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
                                enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type);

                                mark_reg_known_zero(env, regs, BPF_REG_0);

                                if (!meta.arg_constant.found) {
                                        verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n");
                                        return -EFAULT;
                                }

                                regs[BPF_REG_0].mem_size = meta.arg_constant.value;

                                /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
                                regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;

                                if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
                                        regs[BPF_REG_0].type |= MEM_RDONLY;
                                } else {
                                        /* this will set env->seen_direct_write to true */
                                        if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
                                                verbose(env, "the prog does not allow writes to packet data\n");
                                                return -EINVAL;
                                        }
                                }

                                if (!meta.initialized_dynptr.id) {
                                        verbose(env, "verifier internal error: no dynptr id\n");
                                        return -EFAULT;
                                }
                                regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id;

                                /* we don't need to set BPF_REG_0's ref obj id
                                 * because packet slices are not refcounted (see
                                 * dynptr_type_refcounted)
                                 */
                        } else {
                                verbose(env, "kernel function %s unhandled dynamic return type\n",
                                        meta.func_name);
                                return -EFAULT;
                        }
                } else if (btf_type_is_void(ptr_type)) {
                        /* kfunc returning 'void *' is equivalent to returning scalar */
                        mark_reg_unknown(env, regs, BPF_REG_0);
                } else if (!__btf_type_is_struct(ptr_type)) {
                        /* Pointer to a non-struct type: R0 becomes PTR_TO_MEM.
                         * Use a size already stored in meta if there is one,
                         * otherwise the BTF size of the pointee (read-only).
                         */
                        if (!meta.r0_size) {
                                __u32 sz;

                                if (!IS_ERR(btf_resolve_size(desc_btf, ptr_type, &sz))) {
                                        meta.r0_size = sz;
                                        meta.r0_rdonly = true;
                                }
                        }
                        if (!meta.r0_size) {
                                ptr_type_name = btf_name_by_offset(desc_btf,
                                                                   ptr_type->name_off);
                                verbose(env,
                                        "kernel function %s returns pointer type %s %s is not supported\n",
                                        func_name,
                                        btf_type_str(ptr_type),
                                        ptr_type_name);
                                return -EINVAL;
                        }

                        mark_reg_known_zero(env, regs, BPF_REG_0);
                        regs[BPF_REG_0].type = PTR_TO_MEM;
                        regs[BPF_REG_0].mem_size = meta.r0_size;

                        if (meta.r0_rdonly)
                                regs[BPF_REG_0].type |= MEM_RDONLY;

                        /* Ensures we don't access the memory after a release_reference() */
                        if (meta.ref_obj_id)
                                regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
                } else {
                        /* Pointer to a struct: plain PTR_TO_BTF_ID in the kfunc's BTF. */
                        mark_reg_known_zero(env, regs, BPF_REG_0);
                        regs[BPF_REG_0].btf = desc_btf;
                        regs[BPF_REG_0].type = PTR_TO_BTF_ID;
                        regs[BPF_REG_0].btf_id = ptr_type_id;
                }

                if (is_kfunc_ret_null(&meta)) {
                        regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
                        /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
                        regs[BPF_REG_0].id = ++env->id_gen;
                }
                mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
                /* Acquire kfuncs: R0 carries a new reference. For a nullable
                 * return, the id assigned just above is overwritten with the
                 * reference id, so id and ref_obj_id are equal.
                 */
                if (is_kfunc_acquire(&meta)) {
                        int id = acquire_reference_state(env, insn_idx);

                        if (id < 0)
                                return id;
                        if (is_kfunc_ret_null(&meta))
                                regs[BPF_REG_0].id = id;
                        regs[BPF_REG_0].ref_obj_id = id;
                } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
                        ref_set_non_owning(env, &regs[BPF_REG_0]);
                }

                /* A register that may point to a spin lock needs a non-zero id. */
                if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
                        regs[BPF_REG_0].id = ++env->id_gen;
        } else if (btf_type_is_void(t)) {
                /* void return: only the obj_drop kfuncs need extra bookkeeping,
                 * namely the struct meta of the dropped object's type.
                 */
                if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
                        if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
                            meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
                                insn_aux->kptr_struct_meta =
                                        btf_find_struct_meta(meta.arg_btf,
                                                             meta.arg_btf_id);
                        }
                }
        }

        /* Record the size of each argument register (argument i lives in
         * register i + 1): pointer-sized for pointers, BTF size otherwise.
         */
        nargs = btf_type_vlen(meta.func_proto);
        args = (const struct btf_param *)(meta.func_proto + 1);
        for (i = 0; i < nargs; i++) {
                u32 regno = i + 1;

                t = btf_type_skip_modifiers(desc_btf, args[i].type, NULL);
                if (btf_type_is_ptr(t))
                        mark_btf_func_reg_size(env, regno, sizeof(void *));
                else
                        /* scalar. ensured by btf_check_kfunc_arg_match() */
                        mark_btf_func_reg_size(env, regno, t->size);
        }

        if (is_iter_next_kfunc(&meta)) {
                err = process_iter_next_call(env, insn_idx, &meta);
                if (err)
                        return err;
        }

        return 0;
}

/* Return true when the s64 addition a + b would overflow.
 *
 * The sum is formed on u64 operands, where wraparound is well-defined, and
 * then reinterpreted as s64. Adding a negative 'b' must never produce a
 * result larger than 'a', and adding a non-negative 'b' must never produce
 * one smaller than 'a'; either outcome means the addition wrapped.
 */
static bool signed_add_overflows(s64 a, s64 b)
{
        const s64 wrapped_sum = (s64)((u64)a + (u64)b);

        return b < 0 ? wrapped_sum > a : wrapped_sum < a;
}

/* 32-bit counterpart of signed_add_overflows(): return true when a + b does
 * not fit in s32. The addition itself is carried out on u32 operands so that
 * wraparound is well-defined.
 */
static bool signed_add32_overflows(s32 a, s32 b)
{
        const s32 wrapped_sum = (s32)((u32)a + (u32)b);

        if (b >= 0)
                return wrapped_sum < a;
        return wrapped_sum > a;
}

/* Return true when the s64 subtraction a - b would overflow.
 *
 * The difference is formed on u64 operands, where wraparound is well-defined.
 * Subtracting a negative 'b' must not produce a result below 'a', and
 * subtracting a non-negative 'b' must not produce one above 'a'.
 */
static bool signed_sub_overflows(s64 a, s64 b)
{
        const s64 wrapped_diff = (s64)((u64)a - (u64)b);

        return b < 0 ? wrapped_diff < a : wrapped_diff > a;
}

/* 32-bit counterpart of signed_sub_overflows(): return true when a - b does
 * not fit in s32. The subtraction itself is carried out on u32 operands so
 * that wraparound is well-defined.
 */
static bool signed_sub32_overflows(s32 a, s32 b)
{
        const s32 wrapped_diff = (s32)((u32)a - (u32)b);

        if (b >= 0)
                return wrapped_diff > a;
        return wrapped_diff < a;
}

/* Sanity-check the offsets carried by 'reg' ahead of pointer arithmetic.
 *
 * The checks run in this order, and the first one that fails logs a message
 * through verbose() and makes the function return false:
 *  1. a constant var_off whose value is at or beyond +/-BPF_MAX_VAR_OFF;
 *  2. a fixed offset (reg->off) at or beyond +/-BPF_MAX_VAR_OFF;
 *  3. smin_value equal to S64_MIN, i.e. no lower bound at all;
 *  4. smin_value at or beyond +/-BPF_MAX_VAR_OFF.
 *
 * 'type' is only used to name the pointer type in the messages.
 * Returns true when every check passes.
 */
static bool check_reg_sane_offset(struct bpf_verifier_env *env,
                                  const struct bpf_reg_state *reg,
                                  enum bpf_reg_type type)
{
        const bool is_const = tnum_is_const(reg->var_off);
        const s64 const_val = reg->var_off.value;
        const s64 min_val = reg->smin_value;

        if (is_const &&
            (const_val >= BPF_MAX_VAR_OFF || const_val <= -BPF_MAX_VAR_OFF)) {
                verbose(env, "math between %s pointer and %lld is not allowed\n",
                        reg_type_str(env, type), const_val);
                return false;
        }

        if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
                verbose(env, "%s pointer offset %d is not allowed\n",
                        reg_type_str(env, type), reg->off);
                return false;
        }

        if (min_val == S64_MIN) {
                verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
                        reg_type_str(env, type));
                return false;
        }

        if (min_val >= BPF_MAX_VAR_OFF || min_val <= -BPF_MAX_VAR_OFF) {
                verbose(env, "value %lld makes %s pointer be out of bounds\n",
                        min_val, reg_type_str(env, type));
                return false;
        }

        return true;
}

/* Failure reasons returned by the pointer ALU sanitation helpers.
 * sanitize_err() turns each of them into a verifier log message and -EACCES.
 */
enum {
        /* offset is an unknown scalar whose signed bounds have mixed signs */
        REASON_BOUNDS        = -1,
        /* pointer type is not one retrieve_ptr_limit() can compute a limit for */
        REASON_TYPE        = -2,
        /* insn already carries a different alu_state/alu_limit from another path */
        REASON_PATHS        = -3,
        /* computed pointer limit is not below the maximum allowed for the type */
        REASON_LIMIT        = -4,
        /* the speculative path could not be pushed for verification */
        REASON_STACK        = -5,
};

/* Compute the masking limit for an ALU operation on 'ptr_reg'.
 *
 * Only PTR_TO_STACK and PTR_TO_MAP_VALUE are handled; any other pointer type
 * yields REASON_TYPE. 'mask_to_left' selects the direction the pointer is
 * being moved in. On success the limit is stored in *alu_limit and 0 is
 * returned; a limit that is not strictly below the per-type maximum yields
 * REASON_LIMIT and leaves *alu_limit untouched.
 */
static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
                              u32 *alu_limit, bool mask_to_left)
{
        u32 bound, limit;

        switch (ptr_reg->type) {
        case PTR_TO_STACK:
                /* Offset 0 is out-of-bounds, yet it is an acceptable starting
                 * point when moving left, see BPF_REG_FP. Unknown scalar
                 * offsets, which would require handling min/max bounds, are
                 * currently prohibited for unprivileged.
                 */
                bound = MAX_BPF_STACK + mask_to_left;
                limit = -(ptr_reg->var_off.value + ptr_reg->off);
                break;
        case PTR_TO_MAP_VALUE:
                bound = ptr_reg->map_ptr->value_size;
                if (mask_to_left)
                        limit = ptr_reg->smin_value + ptr_reg->off;
                else
                        limit = ptr_reg->umax_value + ptr_reg->off;
                break;
        default:
                return REASON_TYPE;
        }

        if (limit < bound) {
                *alu_limit = limit;
                return 0;
        }
        return REASON_LIMIT;
}

/* ALU sanitation is unnecessary when env->bypass_spec_v1 is set, or when the
 * instruction takes its source operand as an immediate (BPF_K).
 */
static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
                                    const struct bpf_insn *insn)
{
        if (env->bypass_spec_v1)
                return true;

        return BPF_SRC(insn->code) == BPF_K;
}

/* Record the sanitation state and limit for an instruction in 'aux'.
 *
 * If 'aux' already carries a non-zero alu_state, the new state and limit must
 * match what is stored: reaching the same instruction from different branches
 * with different masking requirements cannot be expressed, so REASON_PATHS is
 * returned. Otherwise the values are stored and 0 is returned.
 */
static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
                                       u32 alu_state, u32 alu_limit)
{
        if (aux->alu_state) {
                bool same_state = aux->alu_state == alu_state;
                bool same_limit = aux->alu_limit == alu_limit;

                if (!same_state || !same_limit)
                        return REASON_PATHS;
        }

        /* The matching rewrite happens in do_misc_fixups(). */
        aux->alu_state = alu_state;
        aux->alu_limit = alu_limit;
        return 0;
}

/* Mark the current instruction as a non-pointer ALU operation that needs
 * sanitation, unless sanitation can be skipped for it. Returns 0 on success
 * or the error from update_alu_sanitation_state().
 */
static int sanitize_val_alu(struct bpf_verifier_env *env,
                            struct bpf_insn *insn)
{
        struct bpf_insn_aux_data *insn_aux = cur_aux(env);

        return can_skip_alu_sanitation(env, insn) ? 0 :
               update_alu_sanitation_state(insn_aux, BPF_ALU_NON_POINTER, 0);
}

/* Only BPF_ADD and BPF_SUB are subject to pointer ALU sanitation. */
static bool sanitize_needed(u8 opcode)
{
        switch (opcode) {
        case BPF_ADD:
        case BPF_SUB:
                return true;
        default:
                return false;
        }
}

/* Scratch state shared between the two sanitize_ptr_alu() passes that
 * adjust_ptr_min_max_vals() makes for a single pointer ALU instruction.
 */
struct bpf_sanitize_info {
        /* alu_state/alu_limit recorded by the first (!commit_window) pass */
        struct bpf_insn_aux_data aux;
        /* masking direction chosen in the first pass and reused by the second */
        bool mask_to_left;
};

/* Push a speculative branch state for later verification.
 *
 * The new state is queued via push_stack() to continue at 'next_idx'. When an
 * instruction is supplied, the registers it involves are marked unknown in
 * the pushed state: the destination register always, plus the source register
 * for register-sourced (BPF_X) instructions.
 *
 * Returns the pushed state, or whatever push_stack() returned on failure.
 */
static struct bpf_verifier_state *
sanitize_speculative_path(struct bpf_verifier_env *env,
                          const struct bpf_insn *insn,
                          u32 next_idx, u32 curr_idx)
{
        struct bpf_verifier_state *spec_state;
        struct bpf_reg_state *spec_regs;

        spec_state = push_stack(env, next_idx, curr_idx, true);
        if (!spec_state || !insn)
                return spec_state;

        spec_regs = spec_state->frame[spec_state->curframe]->regs;
        switch (BPF_SRC(insn->code)) {
        case BPF_K:
                mark_reg_unknown(env, spec_regs, insn->dst_reg);
                break;
        case BPF_X:
                mark_reg_unknown(env, spec_regs, insn->dst_reg);
                mark_reg_unknown(env, spec_regs, insn->src_reg);
                break;
        default:
                break;
        }

        return spec_state;
}

/* Set up masking for a pointer +/- scalar ALU instruction so that it stays
 * within bounds under speculation (skipped when env->bypass_spec_v1 is set or
 * the source operand is an immediate, see can_skip_alu_sanitation()).
 *
 * adjust_ptr_min_max_vals() calls this twice per instruction:
 *  - commit_window == false, before the operation is simulated: the direction
 *    is stored in info->mask_to_left, state and limit are recorded in the
 *    scratch info->aux, and for non-constant offsets a speculative path is
 *    pushed for verification.
 *  - commit_window == true, after the operation, with ptr_reg == dst_reg: the
 *    limit is narrowed to the distance from the first pass's limit and stored
 *    in the instruction's real aux data (cur_aux()).
 *
 * Returns 0 on success or one of the negative REASON_* codes.
 */
static int sanitize_ptr_alu(struct bpf_verifier_env *env,
                            struct bpf_insn *insn,
                            const struct bpf_reg_state *ptr_reg,
                            const struct bpf_reg_state *off_reg,
                            struct bpf_reg_state *dst_reg,
                            struct bpf_sanitize_info *info,
                            const bool commit_window)
{
        struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : &info->aux;
        struct bpf_verifier_state *vstate = env->cur_state;
        bool off_is_imm = tnum_is_const(off_reg->var_off);
        bool off_is_neg = off_reg->smin_value < 0;
        bool ptr_is_dst_reg = ptr_reg == dst_reg;
        u8 opcode = BPF_OP(insn->code);
        u32 alu_state, alu_limit;
        struct bpf_reg_state tmp;
        bool ret;
        int err;

        if (can_skip_alu_sanitation(env, insn))
                return 0;

        /* We already marked aux for masking from non-speculative
         * paths, thus we got here in the first place. We only care
         * to explore bad access from here.
         */
        if (vstate->speculative)
                goto do_sim;

        if (!commit_window) {
                /* An unknown scalar whose signed range straddles zero gives no
                 * single direction to mask in.
                 */
                if (!tnum_is_const(off_reg->var_off) &&
                    (off_reg->smin_value < 0) != (off_reg->smax_value < 0))
                        return REASON_BOUNDS;

                /* Adding a negative or subtracting a non-negative offset
                 * moves the pointer to the left.
                 */
                info->mask_to_left = (opcode == BPF_ADD &&  off_is_neg) ||
                                     (opcode == BPF_SUB && !off_is_neg);
        }

        err = retrieve_ptr_limit(ptr_reg, &alu_limit, info->mask_to_left);
        if (err < 0)
                return err;

        if (commit_window) {
                /* In commit phase we narrow the masking window based on
                 * the observed pointer move after the simulated operation.
                 */
                alu_state = info->aux.alu_state;
                alu_limit = abs(info->aux.alu_limit - alu_limit);
        } else {
                alu_state  = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
                alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
                alu_state |= ptr_is_dst_reg ?
                             BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;

                /* Limit pruning on unknown scalars to enable deep search for
                 * potential masking differences from other program paths.
                 */
                if (!off_is_imm)
                        env->explore_alu_limits = true;
        }

        err = update_alu_sanitation_state(aux, alu_state, alu_limit);
        if (err < 0)
                return err;
do_sim:
        /* If we're in commit phase, we're done here given we already
         * pushed the truncated dst_reg into the speculative verification
         * stack.
         *
         * Also, when register is a known constant, we rewrite register-based
         * operation to immediate-based, and thus do not need masking (and as
         * a consequence, do not need to simulate the zero-truncation either).
         */
        if (commit_window || off_is_imm)
                return 0;

        /* Simulate and find potential out-of-bounds access under
         * speculative execution from truncation as a result of
         * masking when off was not within expected range. If off
         * sits in dst, then we temporarily need to move ptr there
         * to simulate dst (== 0) +/-= ptr. Needed, for example,
         * for cases where we use K-based arithmetic in one direction
         * and truncated reg-based in the other in order to explore
         * bad access.
         */
        if (!ptr_is_dst_reg) {
                tmp = *dst_reg;
                copy_register_state(dst_reg, ptr_reg);
        }
        /* The returned state pointer is only kept as a success flag. */
        ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
                                        env->insn_idx);
        /* Undo the temporary swap; on failure dst_reg is left as is. */
        if (!ptr_is_dst_reg && ret)
                *dst_reg = tmp;
        return !ret ? REASON_STACK : 0;
}

/* Stamp the current instruction as seen in this verification pass.
 *
 * Instructions reached only while simulating a speculative path are
 * deliberately left unstamped, so that paths which are unreachable in the
 * non-speculative domain can still be rewritten by sanitize_dead_code().
 */
static void sanitize_mark_insn_seen(struct bpf_verifier_env *env)
{
        if (env->cur_state->speculative)
                return;

        env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
}

/* Log the verifier message matching a REASON_* code from the pointer ALU
 * sanitation helpers.
 *
 * 'off_reg' and 'dst_reg' are compared to work out whether the scalar offset
 * lives in the instruction's destination or source register, which decides
 * the register number printed for REASON_BOUNDS and REASON_TYPE.
 *
 * Always returns -EACCES.
 */
static int sanitize_err(struct bpf_verifier_env *env,
                        const struct bpf_insn *insn, int reason,
                        const struct bpf_reg_state *off_reg,
                        const struct bpf_reg_state *dst_reg)
{
        static const char *suffix = "pointer arithmetic with it prohibited for !root";
        const bool off_in_dst = off_reg == dst_reg;
        const u32 dst_regno = insn->dst_reg;
        const u32 src_regno = insn->src_reg;
        const char *op_name;

        if (BPF_OP(insn->code) == BPF_ADD)
                op_name = "add";
        else
                op_name = "sub";

        switch (reason) {
        case REASON_BOUNDS:
                /* names the register holding the scalar offset */
                verbose(env, "R%d has unknown scalar with mixed signed bounds, %s\n",
                        off_in_dst ? dst_regno : src_regno, suffix);
                break;
        case REASON_TYPE:
                /* names the register holding the pointer */
                verbose(env, "R%d has pointer with unsupported alu operation, %s\n",
                        off_in_dst ? src_regno : dst_regno, suffix);
                break;
        case REASON_PATHS:
                verbose(env, "R%d tried to %s from different maps, paths or scalars, %s\n",
                        dst_regno, op_name, suffix);
                break;
        case REASON_LIMIT:
                verbose(env, "R%d tried to %s beyond pointer bounds, %s\n",
                        dst_regno, op_name, suffix);
                break;
        case REASON_STACK:
                verbose(env, "R%d could not be pushed for speculative verification, %s\n",
                        dst_regno, suffix);
                break;
        default:
                verbose(env, "verifier internal error: unknown reason (%d)\n",
                        reason);
                break;
        }

        return -EACCES;
}

/* Validate a stack pointer produced by pointer arithmetic.
 *
 * The register must have a constant var_off, and 'off' must fall inside
 * [-MAX_BPF_STACK, 0). Variable offsets are prohibited for unprivileged mode
 * for simplicity, since allowing them would require corresponding support in
 * the Spectre masking for stack ALU. See also retrieve_ptr_limit().
 *
 * 'off' includes 'reg->off'.
 *
 * Returns 0 when the access is acceptable, -EACCES (after logging) otherwise.
 */
static int check_stack_access_for_ptr_arithmetic(
                                struct bpf_verifier_env *env,
                                int regno,
                                const struct bpf_reg_state *reg,
                                int off)
{
        char var_off_str[48];

        if (!tnum_is_const(reg->var_off)) {
                tnum_strn(var_off_str, sizeof(var_off_str), reg->var_off);
                verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n",
                        regno, var_off_str, off);
                return -EACCES;
        }

        if (off < 0 && off >= -MAX_BPF_STACK)
                return 0;

        verbose(env, "R%d stack pointer arithmetic goes out of range, "
                "prohibited for !root; off=%d\n", regno, off);
        return -EACCES;
}

/* Require that the pointer resulting from an ALU operation is in bounds.
 *
 * This is skipped entirely when env->bypass_spec_v1 is set. Otherwise, stack
 * and map value pointers must point inside their object so that the access
 * can be sanitized later on; other pointer types are not checked here.
 *
 * Returns 0 when acceptable, -EACCES otherwise.
 */
static int sanitize_check_bounds(struct bpf_verifier_env *env,
                                 const struct bpf_insn *insn,
                                 const struct bpf_reg_state *dst_reg)
{
        const u32 regno = insn->dst_reg;

        if (env->bypass_spec_v1)
                return 0;

        if (dst_reg->type == PTR_TO_STACK) {
                if (check_stack_access_for_ptr_arithmetic(env, regno, dst_reg,
                                        dst_reg->off + dst_reg->var_off.value))
                        return -EACCES;
        } else if (dst_reg->type == PTR_TO_MAP_VALUE) {
                if (check_map_access(env, regno, dst_reg->off, 1, false, ACCESS_HELPER)) {
                        verbose(env, "R%d pointer arithmetic of map value goes out of range, "
                                "prohibited for !root\n", regno);
                        return -EACCES;
                }
        }

        return 0;
}

/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
 * Caller should also handle BPF_MOV case separately.
 * If we return -EACCES, caller may want to try again treating pointer as a
 * scalar.  So we only emit a diagnostic if !env->allow_ptr_leaks.
 * NOTE(review): the verbose() calls in this function are not gated on
 * env->allow_ptr_leaks, so the sentence above may be stale - confirm.
 *
 * @env:     verifier environment; registers of the current frame are updated
 * @insn:    the ALU instruction; insn->dst_reg selects the result register
 * @ptr_reg: the pointer operand
 * @off_reg: the scalar operand
 *
 * Returns 0 on success (including the cases where the destination is simply
 * marked as an unknown scalar), -EACCES when the arithmetic is prohibited or
 * sanitation fails, and -EINVAL when check_reg_sane_offset() rejects an
 * operand or the result.
 */
static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
                                   struct bpf_insn *insn,
                                   const struct bpf_reg_state *ptr_reg,
                                   const struct bpf_reg_state *off_reg)
{
        struct bpf_verifier_state *vstate = env->cur_state;
        struct bpf_func_state *state = vstate->frame[vstate->curframe];
        struct bpf_reg_state *regs = state->regs, *dst_reg;
        bool known = tnum_is_const(off_reg->var_off);
        s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
            smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
        u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
            umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
        struct bpf_sanitize_info info = {};
        u8 opcode = BPF_OP(insn->code);
        u32 dst = insn->dst_reg;
        int ret;

        dst_reg = &regs[dst];

        /* A constant whose min and max disagree, or a range whose min exceeds
         * its max, means the scalar's bounds are inconsistent.
         */
        if ((known && (smin_val != smax_val || umin_val != umax_val)) ||
            smin_val > smax_val || umin_val > umax_val) {
                /* Taint dst register if offset had invalid bounds derived from
                 * e.g. dead branches.
                 */
                __mark_reg_unknown(env, dst_reg);
                return 0;
        }

        if (BPF_CLASS(insn->code) != BPF_ALU64) {
                /* 32-bit ALU ops on pointers produce (meaningless) scalars */
                if (opcode == BPF_SUB && env->allow_ptr_leaks) {
                        __mark_reg_unknown(env, dst_reg);
                        return 0;
                }

                verbose(env,
                        "R%d 32-bit pointer arithmetic prohibited\n",
                        dst);
                return -EACCES;
        }

        if (ptr_reg->type & PTR_MAYBE_NULL) {
                verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
                        dst, reg_type_str(env, ptr_reg->type));
                return -EACCES;
        }

        /* Only the pointer types listed first allow arithmetic unconditionally.
         * PTR_TO_FLOW_KEYS additionally requires a known offset, and
         * CONST_PTR_TO_MAP only tolerates adding a known zero.
         */
        switch (base_type(ptr_reg->type)) {
        case PTR_TO_CTX:
        case PTR_TO_MAP_VALUE:
        case PTR_TO_MAP_KEY:
        case PTR_TO_STACK:
        case PTR_TO_PACKET_META:
        case PTR_TO_PACKET:
        case PTR_TO_TP_BUFFER:
        case PTR_TO_BTF_ID:
        case PTR_TO_MEM:
        case PTR_TO_BUF:
        case PTR_TO_FUNC:
        case CONST_PTR_TO_DYNPTR:
                break;
        case PTR_TO_FLOW_KEYS:
                if (known)
                        break;
                fallthrough;
        case CONST_PTR_TO_MAP:
                /* smin_val represents the known value */
                if (known && smin_val == 0 && opcode == BPF_ADD)
                        break;
                fallthrough;
        default:
                verbose(env, "R%d pointer arithmetic on %s prohibited\n",
                        dst, reg_type_str(env, ptr_reg->type));
                return -EACCES;
        }

        /* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
         * The id may be overwritten later if we create a new variable offset.
         */
        dst_reg->type = ptr_reg->type;
        dst_reg->id = ptr_reg->id;

        if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
            !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
                return -EINVAL;

        /* pointer types do not carry 32-bit bounds at the moment. */
        __mark_reg32_unbounded(dst_reg);

        /* First sanitation pass (commit_window == false): runs before the
         * operation is simulated and fills in 'info' for the second pass.
         */
        if (sanitize_needed(opcode)) {
                ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg,
                                       &info, false);
                if (ret < 0)
                        return sanitize_err(env, insn, ret, off_reg, dst_reg);
        }

        switch (opcode) {
        case BPF_ADD:
                /* We can take a fixed offset as long as it doesn't overflow
                 * the s32 'off' field
                 */
                if (known && (ptr_reg->off + smin_val ==
                              (s64)(s32)(ptr_reg->off + smin_val))) {
                        /* pointer += K.  Accumulate it into fixed offset */
                        dst_reg->smin_value = smin_ptr;
                        dst_reg->smax_value = smax_ptr;
                        dst_reg->umin_value = umin_ptr;
                        dst_reg->umax_value = umax_ptr;
                        dst_reg->var_off = ptr_reg->var_off;
                        dst_reg->off = ptr_reg->off + smin_val;
                        dst_reg->raw = ptr_reg->raw;
                        break;
                }
                /* A new variable offset is created.  Note that off_reg->off
                 * == 0, since it's a scalar.
                 * dst_reg gets the pointer type and since some positive
                 * integer value was added to the pointer, give it a new 'id'
                 * if it's a PTR_TO_PACKET.
                 * this creates a new 'base' pointer, off_reg (variable) gets
                 * added into the variable offset, and we copy the fixed offset
                 * from ptr_reg.
                 */
                if (signed_add_overflows(smin_ptr, smin_val) ||
                    signed_add_overflows(smax_ptr, smax_val)) {
                        dst_reg->smin_value = S64_MIN;
                        dst_reg->smax_value = S64_MAX;
                } else {
                        dst_reg->smin_value = smin_ptr + smin_val;
                        dst_reg->smax_value = smax_ptr + smax_val;
                }
                /* Unsigned sums that wrap come out smaller than an operand. */
                if (umin_ptr + umin_val < umin_ptr ||
                    umax_ptr + umax_val < umax_ptr) {
                        dst_reg->umin_value = 0;
                        dst_reg->umax_value = U64_MAX;
                } else {
                        dst_reg->umin_value = umin_ptr + umin_val;
                        dst_reg->umax_value = umax_ptr + umax_val;
                }
                dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
                dst_reg->off = ptr_reg->off;
                dst_reg->raw = ptr_reg->raw;
                if (reg_is_pkt_pointer(ptr_reg)) {
                        dst_reg->id = ++env->id_gen;
                        /* something was added to pkt_ptr, set range to zero */
                        memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
                }
                break;
        case BPF_SUB:
                if (dst_reg == off_reg) {
                        /* scalar -= pointer.  Creates an unknown scalar */
                        verbose(env, "R%d tried to subtract pointer from scalar\n",
                                dst);
                        return -EACCES;
                }
                /* We don't allow subtraction from FP, because (according to
                 * test_verifier.c test "invalid fp arithmetic") JITs might not
                 * be able to deal with it.
                 */
                if (ptr_reg->type == PTR_TO_STACK) {
                        verbose(env, "R%d subtraction from stack pointer prohibited\n",
                                dst);
                        return -EACCES;
                }
                if (known && (ptr_reg->off - smin_val ==
                              (s64)(s32)(ptr_reg->off - smin_val))) {
                        /* pointer -= K.  Subtract it from fixed offset */
                        dst_reg->smin_value = smin_ptr;
                        dst_reg->smax_value = smax_ptr;
                        dst_reg->umin_value = umin_ptr;
                        dst_reg->umax_value = umax_ptr;
                        dst_reg->var_off = ptr_reg->var_off;
                        dst_reg->id = ptr_reg->id;
                        dst_reg->off = ptr_reg->off - smin_val;
                        dst_reg->raw = ptr_reg->raw;
                        break;
                }
                /* A new variable offset is created.  If the subtrahend is known
                 * nonnegative, then any reg->range we had before is still good.
                 */
                if (signed_sub_overflows(smin_ptr, smax_val) ||
                    signed_sub_overflows(smax_ptr, smin_val)) {
                        /* Overflow possible, we know nothing */
                        dst_reg->smin_value = S64_MIN;
                        dst_reg->smax_value = S64_MAX;
                } else {
                        dst_reg->smin_value = smin_ptr - smax_val;
                        dst_reg->smax_value = smax_ptr - smin_val;
                }
                if (umin_ptr < umax_val) {
                        /* Overflow possible, we know nothing */
                        dst_reg->umin_value = 0;
                        dst_reg->umax_value = U64_MAX;
                } else {
                        /* Cannot overflow (as long as bounds are consistent) */
                        dst_reg->umin_value = umin_ptr - umax_val;
                        dst_reg->umax_value = umax_ptr - umin_val;
                }
                dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
                dst_reg->off = ptr_reg->off;
                dst_reg->raw = ptr_reg->raw;
                if (reg_is_pkt_pointer(ptr_reg)) {
                        dst_reg->id = ++env->id_gen;
                        /* a possibly negative subtrahend means something may
                         * have been added to pkt_ptr, set range to zero
                         */
                        if (smin_val < 0)
                                memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
                }
                break;
        case BPF_AND:
        case BPF_OR:
        case BPF_XOR:
                /* bitwise ops on pointers are troublesome, prohibit. */
                verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
                        dst, bpf_alu_string[opcode >> 4]);
                return -EACCES;
        default:
                /* other operators (e.g. MUL,LSH) produce non-pointer results */
                verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
                        dst, bpf_alu_string[opcode >> 4]);
                return -EACCES;
        }

        if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
                return -EINVAL;
        reg_bounds_sync(dst_reg);
        if (sanitize_check_bounds(env, insn, dst_reg) < 0)
                return -EACCES;
        /* Second sanitation pass (commit_window == true): dst_reg now holds
         * the result and is passed as the pointer operand.
         */
        if (sanitize_needed(opcode)) {
                ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg,
                                       &info, true);
                if (ret < 0)
                        return sanitize_err(env, insn, ret, off_reg, dst_reg);
        }

        return 0;
}

/* Update the 32-bit signed and unsigned bounds of dst_reg for dst += src.
 *
 * Each pair of bounds is handled on its own: if adding either end of the
 * source range could wrap, that pair is widened to the full range, otherwise
 * the source bounds are added end to end. The source bounds are read up front
 * so they are unaffected by the updates made to dst_reg.
 */
static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,
                                 struct bpf_reg_state *src_reg)
{
        const s32 src_smin = src_reg->s32_min_value;
        const s32 src_smax = src_reg->s32_max_value;
        const u32 src_umin = src_reg->u32_min_value;
        const u32 src_umax = src_reg->u32_max_value;
        bool wraps;

        wraps = signed_add32_overflows(dst_reg->s32_min_value, src_smin) ||
                signed_add32_overflows(dst_reg->s32_max_value, src_smax);
        if (!wraps) {
                dst_reg->s32_min_value += src_smin;
                dst_reg->s32_max_value += src_smax;
        } else {
                dst_reg->s32_min_value = S32_MIN;
                dst_reg->s32_max_value = S32_MAX;
        }

        /* An unsigned sum that wrapped is smaller than its addend. */
        wraps = dst_reg->u32_min_value + src_umin < src_umin ||
                dst_reg->u32_max_value + src_umax < src_umax;
        if (!wraps) {
                dst_reg->u32_min_value += src_umin;
                dst_reg->u32_max_value += src_umax;
        } else {
                dst_reg->u32_min_value = 0;
                dst_reg->u32_max_value = U32_MAX;
        }
}

/* Update the 64-bit signed and unsigned bounds of dst_reg for dst += src.
 *
 * Each pair of bounds is handled on its own: if adding either end of the
 * source range could wrap, that pair is widened to the full range, otherwise
 * the source bounds are added end to end. The source bounds are read up front
 * so they are unaffected by the updates made to dst_reg.
 */
static void scalar_min_max_add(struct bpf_reg_state *dst_reg,
                               struct bpf_reg_state *src_reg)
{
        const s64 src_smin = src_reg->smin_value;
        const s64 src_smax = src_reg->smax_value;
        const u64 src_umin = src_reg->umin_value;
        const u64 src_umax = src_reg->umax_value;
        bool wraps;

        wraps = signed_add_overflows(dst_reg->smin_value, src_smin) ||
                signed_add_overflows(dst_reg->smax_value, src_smax);
        if (!wraps) {
                dst_reg->smin_value += src_smin;
                dst_reg->smax_value += src_smax;
        } else {
                dst_reg->smin_value = S64_MIN;
                dst_reg->smax_value = S64_MAX;
        }

        /* An unsigned sum that wrapped is smaller than its addend. */
        wraps = dst_reg->umin_value + src_umin < src_umin ||
                dst_reg->umax_value + src_umax < src_umax;
        if (!wraps) {
                dst_reg->umin_value += src_umin;
                dst_reg->umax_value += src_umax;
        } else {
                dst_reg->umin_value = 0;
                dst_reg->umax_value = U64_MAX;
        }
}

/* Update the 32-bit signed and unsigned bounds of dst_reg for dst -= src.
 *
 * The new minimum is the old minimum minus the source maximum, and the new
 * maximum is the old maximum minus the source minimum. Whenever one of those
 * subtractions could overflow, the affected pair of bounds is widened to the
 * full range instead. The source bounds are read up front so they are
 * unaffected by the updates made to dst_reg.
 */
static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
                                 struct bpf_reg_state *src_reg)
{
        const s32 src_smin = src_reg->s32_min_value;
        const s32 src_smax = src_reg->s32_max_value;
        const u32 src_umin = src_reg->u32_min_value;
        const u32 src_umax = src_reg->u32_max_value;
        bool s_overflow;

        s_overflow = signed_sub32_overflows(dst_reg->s32_min_value, src_smax) ||
                     signed_sub32_overflows(dst_reg->s32_max_value, src_smin);
        if (!s_overflow) {
                dst_reg->s32_min_value -= src_smax;
                dst_reg->s32_max_value -= src_smin;
        } else {
                /* Overflow possible, nothing is known */
                dst_reg->s32_min_value = S32_MIN;
                dst_reg->s32_max_value = S32_MAX;
        }

        if (dst_reg->u32_min_value >= src_umax) {
                /* Cannot overflow (as long as bounds are consistent) */
                dst_reg->u32_min_value -= src_umax;
                dst_reg->u32_max_value -= src_umin;
        } else {
                /* Overflow possible, nothing is known */
                dst_reg->u32_min_value = 0;
                dst_reg->u32_max_value = U32_MAX;
        }
}

/* Update the 64-bit signed and unsigned bounds of dst_reg for dst -= src.
 *
 * The new minimum is the old minimum minus the source maximum, and the new
 * maximum is the old maximum minus the source minimum. Whenever one of those
 * subtractions could overflow, the affected pair of bounds is widened to the
 * full range instead. The source bounds are read up front so they are
 * unaffected by the updates made to dst_reg.
 */
static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
                               struct bpf_reg_state *src_reg)
{
        const s64 src_smin = src_reg->smin_value;
        const s64 src_smax = src_reg->smax_value;
        const u64 src_umin = src_reg->umin_value;
        const u64 src_umax = src_reg->umax_value;
        bool s_overflow;

        s_overflow = signed_sub_overflows(dst_reg->smin_value, src_smax) ||
                     signed_sub_overflows(dst_reg->smax_value, src_smin);
        if (!s_overflow) {
                dst_reg->smin_value -= src_smax;
                dst_reg->smax_value -= src_smin;
        } else {
                /* Overflow possible, nothing is known */
                dst_reg->smin_value = S64_MIN;
                dst_reg->smax_value = S64_MAX;
        }

        if (dst_reg->umin_value >= src_umax) {
                /* Cannot overflow (as long as bounds are consistent) */
                dst_reg->umin_value -= src_umax;
                dst_reg->umax_value -= src_umin;
        } else {
                /* Overflow possible, nothing is known */
                dst_reg->umin_value = 0;
                dst_reg->umax_value = U64_MAX;
        }
}

/* Update the 32-bit bounds of dst_reg for dst *= src.
 *
 * Bounds are only tracked when both operands are known non-negative and both
 * unsigned maxima fit in 16 bits, so that the product of the maxima cannot
 * exceed 32 bits. In every other case the 32-bit bounds are marked unbounded.
 * The signed bounds mirror the unsigned ones unless the new unsigned maximum
 * exceeds S32_MAX.
 */
static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg,
                                 struct bpf_reg_state *src_reg)
{
        const s32 src_smin = src_reg->s32_min_value;
        const u32 src_umin = src_reg->u32_min_value;
        const u32 src_umax = src_reg->u32_max_value;

        /* Possibly negative operand: sign handling is not attempted. */
        if (src_smin < 0 || dst_reg->s32_min_value < 0)
                goto unbounded;

        /* Both operands are non-negative, so work on the unsigned bounds;
         * give up if the product of the maxima might not fit.
         */
        if (src_umax > U16_MAX || dst_reg->u32_max_value > U16_MAX)
                goto unbounded;

        dst_reg->u32_min_value *= src_umin;
        dst_reg->u32_max_value *= src_umax;
        if (dst_reg->u32_max_value > S32_MAX) {
                /* Result may not be representable as s32, nothing is known */
                dst_reg->s32_min_value = S32_MIN;
                dst_reg->s32_max_value = S32_MAX;
                return;
        }
        dst_reg->s32_min_value = dst_reg->u32_min_value;
        dst_reg->s32_max_value = dst_reg->u32_max_value;
        return;

unbounded:
        __mark_reg32_unbounded(dst_reg);
}

/* Update the 64-bit bounds of dst_reg for dst *= src.
 *
 * Bounds are only tracked when both operands are known non-negative and both
 * unsigned maxima fit in 32 bits, so that the product of the maxima cannot
 * exceed 64 bits. In every other case the 64-bit bounds are marked unbounded.
 * The signed bounds mirror the unsigned ones unless the new unsigned maximum
 * exceeds S64_MAX.
 */
static void scalar_min_max_mul(struct bpf_reg_state *dst_reg,
                               struct bpf_reg_state *src_reg)
{
        const s64 src_smin = src_reg->smin_value;
        const u64 src_umin = src_reg->umin_value;
        const u64 src_umax = src_reg->umax_value;

        /* Possibly negative operand: sign handling is not attempted. */
        if (src_smin < 0 || dst_reg->smin_value < 0)
                goto unbounded;

        /* Both operands are non-negative, so work on the unsigned bounds;
         * give up if the product of the maxima might not fit.
         */
        if (src_umax > U32_MAX || dst_reg->umax_value > U32_MAX)
                goto unbounded;

        dst_reg->umin_value *= src_umin;
        dst_reg->umax_value *= src_umax;
        if (dst_reg->umax_value > S64_MAX) {
                /* Result may not be representable as s64, nothing is known */
                dst_reg->smin_value = S64_MIN;
                dst_reg->smax_value = S64_MAX;
                return;
        }
        dst_reg->smin_value = dst_reg->umin_value;
        dst_reg->smax_value = dst_reg->umax_value;
        return;

unbounded:
        __mark_reg64_unbounded(dst_reg);
}

static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
                                 struct bpf_reg_state *src_reg)
{
        bool src_known = tnum_subreg_is_const(src_reg->var_off);
        bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
        struct tnum var32_off = tnum_subreg(dst_reg->var_off);
        u32 umax_val = src_reg->u32_max_value;

        if (src_known && dst_known) {
                __mark_reg32_known(dst_reg, var32_off.value);
                return;
        }

        /* We get our minimum from the var_off, since that's inherently
         * bitwise.  Our maximum is the minimum of the operands' maxima.
         */
        dst_reg->u32_min_value = var32_off.value;
        dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val);

        /* Safe to set s32 bounds by casting u32 result into s32 when u32
         * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
         */
        if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
                dst_reg->s32_min_value = dst_reg->u32_min_value;
                dst_reg->s32_max_value = dst_reg->u32_max_value;
        } else {
                dst_reg->s32_min_value = S32_MIN;
                dst_reg->s32_max_value = S32_MAX;
        }
}

static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
                               struct bpf_reg_state *src_reg)
{
        bool src_known = tnum_is_const(src_reg->var_off);
        bool dst_known = tnum_is_const(dst_reg->var_off);
        u64 umax_val = src_reg->umax_value;

        if (src_known && dst_known) {
                __mark_reg_known(dst_reg, dst_reg->var_off.value);
                return;
        }

        /* We get our minimum from the var_off, since that's inherently
         * bitwise.  Our maximum is the minimum of the operands' maxima.
         */
        dst_reg->umin_value = dst_reg->var_off.value;
        dst_reg->umax_value = min(dst_reg->umax_value, umax_val);

        /* Safe to set s64 bounds by casting u64 result into s64 when u64
         * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
         */
        if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
                dst_reg->smin_value = dst_reg->umin_value;
                dst_reg->smax_value = dst_reg->umax_value;
        } else {
                dst_reg->smin_value = S64_MIN;
                dst_reg->smax_value = S64_MAX;
        }
        /* We may learn something more from the var_off */
        __update_reg_bounds(dst_reg);
}

static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
                                struct bpf_reg_state *src_reg)
{
        bool src_known = tnum_subreg_is_const(src_reg->var_off);
        bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
        struct tnum var32_off = tnum_subreg(dst_reg->var_off);
        u32 umin_val = src_reg->u32_min_value;

        if (src_known && dst_known) {
                __mark_reg32_known(dst_reg, var32_off.value);
                return;
        }

        /* We get our maximum from the var_off, and our minimum is the
         * maximum of the operands' minima
         */
        dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val);
        dst_reg->u32_max_value = var32_off.value | var32_off.mask;

        /* Safe to set s32 bounds by casting u32 result into s32 when u32
         * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
         */
        if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
                dst_reg->s32_min_value = dst_reg->u32_min_value;
                dst_reg->s32_max_value = dst_reg->u32_max_value;
        } else {
                dst_reg->s32_min_value = S32_MIN;
                dst_reg->s32_max_value = S32_MAX;
        }
}

static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
                              struct bpf_reg_state *src_reg)
{
        bool src_known = tnum_is_const(src_reg->var_off);
        bool dst_known = tnum_is_const(dst_reg->var_off);
        u64 umin_val = src_reg->umin_value;

        if (src_known && dst_known) {
                __mark_reg_known(dst_reg, dst_reg->var_off.value);
                return;
        }

        /* We get our maximum from the var_off, and our minimum is the
         * maximum of the operands' minima
         */
        dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
        dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;

        /* Safe to set s64 bounds by casting u64 result into s64 when u64
         * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
         */
        if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
                dst_reg->smin_value = dst_reg->umin_value;
                dst_reg->smax_value = dst_reg->umax_value;
        } else {
                dst_reg->smin_value = S64_MIN;
                dst_reg->smax_value = S64_MAX;
        }
        /* We may learn something more from the var_off */
        __update_reg_bounds(dst_reg);
}

static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
                                 struct bpf_reg_state *src_reg)
{
        bool src_known = tnum_subreg_is_const(src_reg->var_off);
        bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
        struct tnum var32_off = tnum_subreg(dst_reg->var_off);

        if (src_known && dst_known) {
                __mark_reg32_known(dst_reg, var32_off.value);
                return;
        }

        /* We get both minimum and maximum from the var32_off. */
        dst_reg->u32_min_value = var32_off.value;
        dst_reg->u32_max_value = var32_off.value | var32_off.mask;

        /* Safe to set s32 bounds by casting u32 result into s32 when u32
         * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
         */
        if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
                dst_reg->s32_min_value = dst_reg->u32_min_value;
                dst_reg->s32_max_value = dst_reg->u32_max_value;
        } else {
                dst_reg->s32_min_value = S32_MIN;
                dst_reg->s32_max_value = S32_MAX;
        }
}

static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
                               struct bpf_reg_state *src_reg)
{
        bool src_known = tnum_is_const(src_reg->var_off);
        bool dst_known = tnum_is_const(dst_reg->var_off);

        if (src_known && dst_known) {
                /* dst_reg->var_off.value has been updated earlier */
                __mark_reg_known(dst_reg, dst_reg->var_off.value);
                return;
        }

        /* We get both minimum and maximum from the var_off. */
        dst_reg->umin_value = dst_reg->var_off.value;
        dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;

        /* Safe to set s64 bounds by casting u64 result into s64 when u64
         * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
         */
        if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
                dst_reg->smin_value = dst_reg->umin_value;
                dst_reg->smax_value = dst_reg->umax_value;
        } else {
                dst_reg->smin_value = S64_MIN;
                dst_reg->smax_value = S64_MAX;
        }

        __update_reg_bounds(dst_reg);
}

/* Update dst_reg's 32-bit bounds for a left shift whose amount lies in
 * [umin_val, umax_val]. var_off is not touched here.
 */
static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
                                   u64 umin_val, u64 umax_val)
{
        bool may_lose_top_bit;

        /* All sign bit information is lost (except what can later be
         * picked up from var_off).
         */
        dst_reg->s32_min_value = S32_MIN;
        dst_reg->s32_max_value = S32_MAX;

        /* The shift-amount test must come first: it keeps (31 - umax_val)
         * from being evaluated for amounts above 31.
         */
        may_lose_top_bit = umax_val > 31 ||
                           dst_reg->u32_max_value > 1ULL << (31 - umax_val);

        if (!may_lose_top_bit) {
                dst_reg->u32_min_value <<= umin_val;
                dst_reg->u32_max_value <<= umax_val;
                return;
        }

        /* The top bit might be shifted out, so we know nothing */
        dst_reg->u32_min_value = 0;
        dst_reg->u32_max_value = U32_MAX;
}

static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
                                 struct bpf_reg_state *src_reg)
{
        u32 umax_val = src_reg->u32_max_value;
        u32 umin_val = src_reg->u32_min_value;
        /* u32 alu operation will zext upper bits */
        struct tnum subreg = tnum_subreg(dst_reg->var_off);

        __scalar32_min_max_lsh(dst_reg, umin_val, umax_val);
        dst_reg->var_off = tnum_subreg(tnum_lshift(subreg, umin_val));
        /* Not required but being careful mark reg64 bounds as unknown so
         * that we are forced to pick them up from tnum and zext later and
         * if some path skips this step we are still safe.
         */
        __mark_reg64_unbounded(dst_reg);
        __update_reg32_bounds(dst_reg);
}

/* Update dst_reg's 64-bit bounds for a left shift whose amount lies in
 * [umin_val, umax_val]. Reads the (unshifted) s32 bounds of dst_reg.
 */
static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg,
                                   u64 umin_val, u64 umax_val)
{
        /* "<<32" is special-cased because compilers commonly sign extend a
         * subreg with "<<32 s>>32". When a 32-bit signed bound is
         * non-negative, the shifted bound is non-negative too and can be
         * tracked exactly. In every other case sign bit information is lost
         * except for what var_off still provides. This could perhaps be
         * generalized to shifts of any length later.
         */
        bool shift_is_32 = umin_val == 32 && umax_val == 32;

        dst_reg->smax_value = (shift_is_32 && dst_reg->s32_max_value >= 0) ?
                              ((s64)dst_reg->s32_max_value << 32) : S64_MAX;
        dst_reg->smin_value = (shift_is_32 && dst_reg->s32_min_value >= 0) ?
                              ((s64)dst_reg->s32_min_value << 32) : S64_MIN;

        if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
                /* The top bit might be shifted out, so we know nothing */
                dst_reg->umin_value = 0;
                dst_reg->umax_value = U64_MAX;
                return;
        }
        dst_reg->umin_value <<= umin_val;
        dst_reg->umax_value <<= umax_val;
}

/* Handle a 64-bit left shift of dst_reg by the amount described by
 * src_reg's unsigned bounds: updates the 64-bit and 32-bit bounds and
 * var_off.
 */
static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg,
                               struct bpf_reg_state *src_reg)
{
        u64 shift_min = src_reg->umin_value;
        u64 shift_max = src_reg->umax_value;

        /* The 64-bit helper reads the still-unshifted 32-bit bounds, so it
         * has to run before the 32-bit helper overwrites them.
         */
        __scalar64_min_max_lsh(dst_reg, shift_min, shift_max);
        __scalar32_min_max_lsh(dst_reg, shift_min, shift_max);

        dst_reg->var_off = tnum_lshift(dst_reg->var_off, shift_min);
        /* var_off may still tighten the bounds further */
        __update_reg_bounds(dst_reg);
}

static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg,
                                 struct bpf_reg_state *src_reg)
{
        struct tnum subreg = tnum_subreg(dst_reg->var_off);
        u32 umax_val = src_reg->u32_max_value;
        u32 umin_val = src_reg->u32_min_value;

        /* BPF_RSH is an unsigned shift.  If the value in dst_reg might
         * be negative, then either:
         * 1) src_reg might be zero, so the sign bit of the result is
         *    unknown, so we lose our signed bounds
         * 2) it's known negative, thus the unsigned bounds capture the
         *    signed bounds
         * 3) the signed bounds cross zero, so they tell us nothing
         *    about the result
         * If the value in dst_reg is known nonnegative, then again the
         * unsigned bounds capture the signed bounds.
         * Thus, in all cases it suffices to blow away our signed bounds
         * and rely on inferring new ones from the unsigned bounds and
         * var_off of the result.
         */
        dst_reg->s32_min_value = S32_MIN;
        dst_reg->s32_max_value = S32_MAX;

        dst_reg->var_off = tnum_rshift(subreg, umin_val);
        dst_reg->u32_min_value >>= umax_val;
        dst_reg->u32_max_value >>= umin_val;

        __mark_reg64_unbounded(dst_reg);
        __update_reg32_bounds(dst_reg);
}

/* Handle a 64-bit logical right shift of dst_reg by the amount described
 * by src_reg's unsigned bounds.
 */
static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg,
                               struct bpf_reg_state *src_reg)
{
        u64 shift_min = src_reg->umin_value;
        u64 shift_max = src_reg->umax_value;

        /* The smallest result pairs the minimum with the largest shift,
         * the largest result pairs the maximum with the smallest shift.
         */
        dst_reg->var_off = tnum_rshift(dst_reg->var_off, shift_min);
        dst_reg->umin_value >>= shift_max;
        dst_reg->umax_value >>= shift_min;

        /* BPF_RSH is an unsigned shift.  If the value in dst_reg might
         * be negative, then either:
         * 1) src_reg might be zero, so the sign bit of the result is
         *    unknown, so we lose our signed bounds
         * 2) it's known negative, thus the unsigned bounds capture the
         *    signed bounds
         * 3) the signed bounds cross zero, so they tell us nothing
         *    about the result
         * If the value in dst_reg is known nonnegative, then again the
         * unsigned bounds capture the signed bounds.
         * So in every case the signed bounds are simply discarded here and
         * new ones are inferred from the unsigned bounds and var_off of
         * the result.
         */
        dst_reg->smin_value = S64_MIN;
        dst_reg->smax_value = S64_MAX;

        /* It's not easy to operate on alu32 bounds here because they depend
         * on the bits being shifted in. Take the easy way out and mark them
         * unbounded so they can be recalculated later from the tnum.
         */
        __mark_reg32_unbounded(dst_reg);
        __update_reg_bounds(dst_reg);
}

/* Handle a 32-bit arithmetic right shift of dst_reg. Only src_reg's u32
 * minimum is read as the shift amount.
 */
static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg,
                                  struct bpf_reg_state *src_reg)
{
        /* By the time we get here the shift amount is a known constant,
         * i.e. its minimum and maximum are equal.
         */
        u64 shift = src_reg->u32_min_value;

        dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), shift, 32);

        dst_reg->s32_min_value = (u32)(((s32)dst_reg->s32_min_value) >> shift);
        dst_reg->s32_max_value = (u32)(((s32)dst_reg->s32_max_value) >> shift);

        /* Drop the unsigned 32-bit bounds and let var_off refine the
         * result.
         */
        dst_reg->u32_min_value = 0;
        dst_reg->u32_max_value = U32_MAX;

        __mark_reg64_unbounded(dst_reg);
        __update_reg32_bounds(dst_reg);
}

/* Handle a 64-bit arithmetic right shift of dst_reg. Only src_reg's
 * unsigned minimum is read as the shift amount.
 */
static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg,
                                struct bpf_reg_state *src_reg)
{
        /* By the time we get here the shift amount is a known constant,
         * i.e. its minimum and maximum are equal.
         */
        u64 shift = src_reg->umin_value;

        dst_reg->var_off = tnum_arshift(dst_reg->var_off, shift, 64);

        dst_reg->smin_value = dst_reg->smin_value >> shift;
        dst_reg->smax_value = dst_reg->smax_value >> shift;

        /* Drop the unsigned 64-bit bounds and let var_off refine the
         * result.
         */
        dst_reg->umin_value = 0;
        dst_reg->umax_value = U64_MAX;

        /* It's not easy to operate on alu32 bounds here because they depend
         * on bits being shifted in from the upper 32 bits. Take the easy way
         * out and mark them unbounded so they can be recalculated later
         * from the tnum.
         */
        __mark_reg32_unbounded(dst_reg);
        __update_reg_bounds(dst_reg);
}

/* Decide whether a destination range can be computed for the scalar ALU
 * instruction insn given the state of its source operand.
 *
 * Returns true for ADD/SUB/AND/XOR/OR/MUL; for the shifts, true only when
 * the source is a constant below the instruction width; false for every
 * other opcode.
 */
static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
                                             const struct bpf_reg_state *src_reg)
{
        bool is_alu64 = BPF_CLASS(insn->code) == BPF_ALU64;
        u64 insn_bitness = is_alu64 ? 64 : 32;
        bool src_is_const;

        /* "Constant" requires the tnum and both signed and unsigned bounds
         * to agree, at the width the instruction operates on.
         */
        if (is_alu64)
                src_is_const = tnum_is_const(src_reg->var_off) &&
                               src_reg->smin_value == src_reg->smax_value &&
                               src_reg->umin_value == src_reg->umax_value;
        else
                src_is_const = tnum_subreg_is_const(src_reg->var_off) &&
                               src_reg->s32_min_value == src_reg->s32_max_value &&
                               src_reg->u32_min_value == src_reg->u32_max_value;

        switch (BPF_OP(insn->code)) {
        case BPF_ADD:
        case BPF_SUB:
        case BPF_AND:
        case BPF_XOR:
        case BPF_OR:
        case BPF_MUL:
                return true;

        /* A shift's range is only computable when the shift amount is a
         * constant. Shifts greater than 31 or 63 are undefined, which also
         * covers shifts by a negative number.
         */
        case BPF_LSH:
        case BPF_RSH:
        case BPF_ARSH:
                return src_is_const && src_reg->umax_value < insn_bitness;
        default:
                return false;
        }
}

/* Compute the new bounds and var_off of dst_reg for a scalar-by-scalar ALU
 * operation described by insn. src_reg is passed by value, so the helpers
 * below work on a private copy of the source operand's state.
 *
 * Returns 0 on success, or the value produced by sanitize_err() when
 * sanitize_val_alu() reports a failure.
 *
 * WARNING: This function does calculations on 64-bit values, but the actual
 * execution may occur on 32-bit values. Therefore, things like bitshifts
 * need extra checks in the 32-bit case.
 */
static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
                                      struct bpf_insn *insn,
                                      struct bpf_reg_state *dst_reg,
                                      struct bpf_reg_state src_reg)
{
        u8 opcode = BPF_OP(insn->code);
        bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
        int ret;

        /* Unsupported opcode, or a shift whose amount is not a usable
         * constant: forget what we know about dst_reg instead of computing
         * a range. This is not an error.
         */
        if (!is_safe_to_compute_dst_reg_range(insn, &src_reg)) {
                __mark_reg_unknown(env, dst_reg);
                return 0;
        }

        if (sanitize_needed(opcode)) {
                ret = sanitize_val_alu(env, insn);
                if (ret < 0)
                        return sanitize_err(env, insn, ret, NULL, NULL);
        }

        /* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops.
         * There are two classes of instructions: for the first class we track
         * both alu32 and alu64 sign/unsigned bounds independently, which
         * provides the greatest amount of precision when alu operations are
         * mixed with jmp32 operations. These operations are BPF_ADD, BPF_SUB,
         * BPF_MUL, BPF_AND, BPF_OR, and BPF_XOR. This is possible because these
         * ops have fairly easy to understand and calculate behavior in both
         * 32-bit and 64-bit alu ops.
         * See alu32 verifier tests for examples. The second class of
         * operations, BPF_LSH, BPF_RSH, and BPF_ARSH, however are not so easy
         * with regards to tracking sign/unsigned bounds because the bits may
         * cross subreg boundaries in the alu64 case. When this happens we mark
         * the reg unbounded in the subreg bound space and use the resulting
         * tnum to calculate an approximation of the sign/unsigned bounds.
         */
        switch (opcode) {
        case BPF_ADD:
                /* bounds first, then var_off */
                scalar32_min_max_add(dst_reg, &src_reg);
                scalar_min_max_add(dst_reg, &src_reg);
                dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
                break;
        case BPF_SUB:
                scalar32_min_max_sub(dst_reg, &src_reg);
                scalar_min_max_sub(dst_reg, &src_reg);
                dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
                break;
        case BPF_MUL:
                dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
                scalar32_min_max_mul(dst_reg, &src_reg);
                scalar_min_max_mul(dst_reg, &src_reg);
                break;
        case BPF_AND:
                /* var_off is updated first: the AND/OR/XOR helpers read the
                 * result tnum from dst_reg->var_off.
                 */
                dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
                scalar32_min_max_and(dst_reg, &src_reg);
                scalar_min_max_and(dst_reg, &src_reg);
                break;
        case BPF_OR:
                dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
                scalar32_min_max_or(dst_reg, &src_reg);
                scalar_min_max_or(dst_reg, &src_reg);
                break;
        case BPF_XOR:
                dst_reg->var_off = tnum_xor(dst_reg->var_off, src_reg.var_off);
                scalar32_min_max_xor(dst_reg, &src_reg);
                scalar_min_max_xor(dst_reg, &src_reg);
                break;
        case BPF_LSH:
                /* shifts: only the helper matching the insn width runs, and
                 * it updates var_off itself
                 */
                if (alu32)
                        scalar32_min_max_lsh(dst_reg, &src_reg);
                else
                        scalar_min_max_lsh(dst_reg, &src_reg);
                break;
        case BPF_RSH:
                if (alu32)
                        scalar32_min_max_rsh(dst_reg, &src_reg);
                else
                        scalar_min_max_rsh(dst_reg, &src_reg);
                break;
        case BPF_ARSH:
                if (alu32)
                        scalar32_min_max_arsh(dst_reg, &src_reg);
                else
                        scalar_min_max_arsh(dst_reg, &src_reg);
                break;
        default:
                break;
        }

        /* ALU32 ops are zero extended into 64bit register */
        if (alu32)
                zext_32_to_64(dst_reg);
        reg_bounds_sync(dst_reg);
        return 0;
}

/* Handles ALU ops other than BPF_END, BPF_NEG and BPF_MOV: computes new min/max
 * and var_off.
 */
/* Classifies the operands of insn (arena pointer, pointer or scalar) and
 * dispatches to adjust_ptr_min_max_vals() or adjust_scalar_min_max_vals().
 *
 * Returns 0 on success, -EACCES for prohibited pointer-with-pointer math,
 * -EINVAL on an internal inconsistency, or the error returned by
 * mark_chain_precision() or by the helper that was dispatched to.
 */
static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
                                   struct bpf_insn *insn)
{
        struct bpf_verifier_state *vstate = env->cur_state;
        struct bpf_func_state *state = vstate->frame[vstate->curframe];
        struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
        struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
        u8 opcode = BPF_OP(insn->code);
        int err;

        dst_reg = &regs[insn->dst_reg];
        src_reg = NULL;

        if (dst_reg->type == PTR_TO_ARENA) {
                struct bpf_insn_aux_data *aux = cur_aux(env);

                if (BPF_CLASS(insn->code) == BPF_ALU64)
                        /*
                         * 32-bit operations zero upper bits automatically.
                         * 64-bit operations need to be converted to 32.
                         */
                        aux->needs_zext = true;

                /* Any arithmetic operations are allowed on arena pointers */
                return 0;
        }

        /* Remember whether the destination is a pointer; ptr_reg stays NULL
         * for a scalar destination.
         */
        if (dst_reg->type != SCALAR_VALUE)
                ptr_reg = dst_reg;
        else
                /* Make sure ID is cleared otherwise dst_reg min/max could be
                 * incorrectly propagated into other registers by find_equal_scalars()
                 */
                dst_reg->id = 0;
        if (BPF_SRC(insn->code) == BPF_X) {
                /* Register source operand */
                src_reg = &regs[insn->src_reg];
                if (src_reg->type != SCALAR_VALUE) {
                        if (dst_reg->type != SCALAR_VALUE) {
                                /* Combining two pointers by any ALU op yields
                                 * an arbitrary scalar. Disallow all math except
                                 * pointer subtraction
                                 */
                                if (opcode == BPF_SUB && env->allow_ptr_leaks) {
                                        mark_reg_unknown(env, regs, insn->dst_reg);
                                        return 0;
                                }
                                verbose(env, "R%d pointer %s pointer prohibited\n",
                                        insn->dst_reg,
                                        bpf_alu_string[opcode >> 4]);
                                return -EACCES;
                        } else {
                                /* scalar += pointer
                                 * This is legal, but we have to reverse our
                                 * src/dest handling in computing the range
                                 */
                                err = mark_chain_precision(env, insn->dst_reg);
                                if (err)
                                        return err;
                                return adjust_ptr_min_max_vals(env, insn,
                                                               src_reg, dst_reg);
                        }
                } else if (ptr_reg) {
                        /* pointer += scalar */
                        err = mark_chain_precision(env, insn->src_reg);
                        if (err)
                                return err;
                        return adjust_ptr_min_max_vals(env, insn,
                                                       dst_reg, src_reg);
                } else if (dst_reg->precise) {
                        /* if dst_reg is precise, src_reg should be precise as well */
                        err = mark_chain_precision(env, insn->src_reg);
                        if (err)
                                return err;
                }
        } else {
                /* Pretend the src is a reg with a known value, since we only
                 * need to be able to read from this state.
                 */
                off_reg.type = SCALAR_VALUE;
                __mark_reg_known(&off_reg, insn->imm);
                src_reg = &off_reg;
                if (ptr_reg) /* pointer += K */
                        return adjust_ptr_min_max_vals(env, insn,
                                                       ptr_reg, src_reg);
        }

        /* Got here implies adding two SCALAR_VALUEs */
        /* Every pointer case returned above, so a non-NULL ptr_reg or a
         * NULL src_reg here is reported as a verifier internal error.
         */
        if (WARN_ON_ONCE(ptr_reg)) {
                print_verifier_state(env, state, true);
                verbose(env, "verifier internal error: unexpected ptr_reg\n");
                return -EINVAL;
        }
        if (WARN_ON(!src_reg)) {
                print_verifier_state(env, state, true);
                verbose(env, "verifier internal error: no src_reg\n");
                return -EINVAL;
        }
        /* src_reg is passed by value (copied) to the scalar handler */
        return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
}

/* check validity of 32-bit and 64-bit arithmetic operations
 *
 * Validates the reserved fields and the operands of one BPF_ALU / BPF_ALU64
 * instruction and updates the tracked state of the destination register.
 * Three opcode groups are handled:
 *   - BPF_END and BPF_NEG: single-operand operations on dst_reg;
 *   - BPF_MOV: register or immediate moves, including sign-extending moves
 *     (insn->off of 8/16/32) and the addr_space_cast form;
 *   - the remaining opcodes below BPF_END: two-operand arithmetic, whose
 *     bounds tracking is delegated to adjust_reg_min_max_vals().
 *
 * Returns 0 on success, -EINVAL for a malformed instruction, -EACCES for a
 * prohibited operation on a pointer, or the error returned by a helper.
 */
static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
        struct bpf_reg_state *regs = cur_regs(env);
        u8 opcode = BPF_OP(insn->code);
        int err;

        if (opcode == BPF_END || opcode == BPF_NEG) {
                if (opcode == BPF_NEG) {
                        if (BPF_SRC(insn->code) != BPF_K ||
                            insn->src_reg != BPF_REG_0 ||
                            insn->off != 0 || insn->imm != 0) {
                                verbose(env, "BPF_NEG uses reserved fields\n");
                                return -EINVAL;
                        }
                } else {
                        if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
                            (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
                            (BPF_CLASS(insn->code) == BPF_ALU64 &&
                             BPF_SRC(insn->code) != BPF_TO_LE)) {
                                verbose(env, "BPF_END uses reserved fields\n");
                                return -EINVAL;
                        }
                }

                /* check src operand */
                err = check_reg_arg(env, insn->dst_reg, SRC_OP);
                if (err)
                        return err;

                if (is_pointer_value(env, insn->dst_reg)) {
                        verbose(env, "R%d pointer arithmetic prohibited\n",
                                insn->dst_reg);
                        return -EACCES;
                }

                /* check dest operand */
                err = check_reg_arg(env, insn->dst_reg, DST_OP);
                if (err)
                        return err;

        } else if (opcode == BPF_MOV) {

                if (BPF_SRC(insn->code) == BPF_X) {
                        if (BPF_CLASS(insn->code) == BPF_ALU) {
                                /* 32-bit move: off selects plain (0) or
                                 * sign-extending (8, 16) variants
                                 */
                                if ((insn->off != 0 && insn->off != 8 && insn->off != 16) ||
                                    insn->imm) {
                                        verbose(env, "BPF_MOV uses reserved fields\n");
                                        return -EINVAL;
                                }
                        } else if (insn->off == BPF_ADDR_SPACE_CAST) {
                                if (insn->imm != 1 && insn->imm != 1u << 16) {
                                        verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n");
                                        return -EINVAL;
                                }
                                if (!env->prog->aux->arena) {
                                        verbose(env, "addr_space_cast insn can only be used in a program that has an associated arena\n");
                                        return -EINVAL;
                                }
                        } else {
                                /* 64-bit move: off selects plain (0) or
                                 * sign-extending (8, 16, 32) variants
                                 */
                                if ((insn->off != 0 && insn->off != 8 && insn->off != 16 &&
                                     insn->off != 32) || insn->imm) {
                                        verbose(env, "BPF_MOV uses reserved fields\n");
                                        return -EINVAL;
                                }
                        }

                        /* check src operand */
                        err = check_reg_arg(env, insn->src_reg, SRC_OP);
                        if (err)
                                return err;
                } else {
                        if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
                                verbose(env, "BPF_MOV uses reserved fields\n");
                                return -EINVAL;
                        }
                }

                /* check dest operand, mark as required later */
                err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
                if (err)
                        return err;

                if (BPF_SRC(insn->code) == BPF_X) {
                        struct bpf_reg_state *src_reg = regs + insn->src_reg;
                        struct bpf_reg_state *dst_reg = regs + insn->dst_reg;

                        if (BPF_CLASS(insn->code) == BPF_ALU64) {
                                if (insn->imm) {
                                        /* off == BPF_ADDR_SPACE_CAST */
                                        mark_reg_unknown(env, regs, insn->dst_reg);
                                        if (insn->imm == 1) { /* cast from as(1) to as(0) */
                                                dst_reg->type = PTR_TO_ARENA;
                                                /* PTR_TO_ARENA is 32-bit */
                                                dst_reg->subreg_def = env->insn_idx + 1;
                                        }
                                } else if (insn->off == 0) {
                                        /* case: R1 = R2
                                         * copy register state to dest reg
                                         */
                                        assign_scalar_id_before_mov(env, src_reg);
                                        copy_register_state(dst_reg, src_reg);
                                        dst_reg->live |= REG_LIVE_WRITTEN;
                                        dst_reg->subreg_def = DEF_NOT_SUBREG;
                                } else {
                                        /* case: R1 = (s8, s16 s32)R2 */
                                        if (is_pointer_value(env, insn->src_reg)) {
                                                verbose(env,
                                                        "R%d sign-extension part of pointer\n",
                                                        insn->src_reg);
                                                return -EACCES;
                                        } else if (src_reg->type == SCALAR_VALUE) {
                                                bool no_sext;

                                                /* the source's sign bit for this width
                                                 * cannot be set, so sign extension
                                                 * leaves the value unchanged
                                                 */
                                                no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
                                                if (no_sext)
                                                        assign_scalar_id_before_mov(env, src_reg);
                                                copy_register_state(dst_reg, src_reg);
                                                /* value may differ from src: drop the shared id */
                                                if (!no_sext)
                                                        dst_reg->id = 0;
                                                coerce_reg_to_size_sx(dst_reg, insn->off >> 3);
                                                dst_reg->live |= REG_LIVE_WRITTEN;
                                                dst_reg->subreg_def = DEF_NOT_SUBREG;
                                        } else {
                                                mark_reg_unknown(env, regs, insn->dst_reg);
                                        }
                                }
                        } else {
                                /* R1 = (u32) R2 */
                                if (is_pointer_value(env, insn->src_reg)) {
                                        verbose(env,
                                                "R%d partial copy of pointer\n",
                                                insn->src_reg);
                                        return -EACCES;
                                } else if (src_reg->type == SCALAR_VALUE) {
                                        if (insn->off == 0) {
                                                bool is_src_reg_u32 = get_reg_width(src_reg) <= 32;

                                                if (is_src_reg_u32)
                                                        assign_scalar_id_before_mov(env, src_reg);
                                                copy_register_state(dst_reg, src_reg);
                                                /* Make sure ID is cleared if src_reg is not in u32
                                                 * range otherwise dst_reg min/max could be incorrectly
                                                 * propagated into src_reg by find_equal_scalars()
                                                 */
                                                if (!is_src_reg_u32)
                                                        dst_reg->id = 0;
                                                dst_reg->live |= REG_LIVE_WRITTEN;
                                                dst_reg->subreg_def = env->insn_idx + 1;
                                        } else {
                                                /* case: W1 = (s8, s16)W2 */
                                                bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));

                                                if (no_sext)
                                                        assign_scalar_id_before_mov(env, src_reg);
                                                copy_register_state(dst_reg, src_reg);
                                                if (!no_sext)
                                                        dst_reg->id = 0;
                                                dst_reg->live |= REG_LIVE_WRITTEN;
                                                dst_reg->subreg_def = env->insn_idx + 1;
                                                coerce_subreg_to_size_sx(dst_reg, insn->off >> 3);
                                        }
                                } else {
                                        mark_reg_unknown(env, regs,
                                                         insn->dst_reg);
                                }
                                zext_32_to_64(dst_reg);
                                reg_bounds_sync(dst_reg);
                        }
                } else {
                        /* case: R = imm
                         * remember the value we stored into this reg
                         */
                        /* clear any state __mark_reg_known doesn't set */
                        mark_reg_unknown(env, regs, insn->dst_reg);
                        regs[insn->dst_reg].type = SCALAR_VALUE;
                        if (BPF_CLASS(insn->code) == BPF_ALU64) {
                                __mark_reg_known(regs + insn->dst_reg,
                                                 insn->imm);
                        } else {
                                __mark_reg_known(regs + insn->dst_reg,
                                                 (u32)insn->imm);
                        }
                }

        } else if (opcode > BPF_END) {
                verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
                return -EINVAL;

        } else {        /* all other ALU ops: and, sub, xor, add, ... */

                /* insn->off is a signed field: the only defined encodings are
                 * 0 and, for BPF_DIV/BPF_MOD only, 1. Testing "off > 1" alone
                 * would let negative values through, so compare against the
                 * two valid values explicitly.
                 */
                if (BPF_SRC(insn->code) == BPF_X) {
                        if (insn->imm != 0 || (insn->off != 0 && insn->off != 1) ||
                            (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
                                verbose(env, "BPF_ALU uses reserved fields\n");
                                return -EINVAL;
                        }
                        /* check src1 operand */
                        err = check_reg_arg(env, insn->src_reg, SRC_OP);
                        if (err)
                                return err;
                } else {
                        if (insn->src_reg != BPF_REG_0 ||
                            (insn->off != 0 && insn->off != 1) ||
                            (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
                                verbose(env, "BPF_ALU uses reserved fields\n");
                                return -EINVAL;
                        }
                }

                /* check src2 operand */
                err = check_reg_arg(env, insn->dst_reg, SRC_OP);
                if (err)
                        return err;

                /* division/modulo by an immediate zero is rejected up front */
                if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
                    BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
                        verbose(env, "div by zero\n");
                        return -EINVAL;
                }

                /* immediate shift counts must be within [0, operand width) */
                if ((opcode == BPF_LSH || opcode == BPF_RSH ||
                     opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
                        int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;

                        if (insn->imm < 0 || insn->imm >= size) {
                                verbose(env, "invalid shift %d\n", insn->imm);
                                return -EINVAL;
                        }
                }

                /* check dest operand */
                err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
                err = err ?: adjust_reg_min_max_vals(env, insn);
                if (err)
                        return err;
        }

        return reg_bounds_sanity_check(env, &regs[insn->dst_reg], "alu");
}

/* Propagate a freshly verified packet range to every register that aliases
 * the same packet pointer as @dst_reg.
 *
 * @vstate:           verifier state whose registers are scanned
 * @dst_reg:          packet pointer that was compared against pkt_end
 * @type:             register type that qualifies for the update
 * @range_right_open: when true the derived range is dst_reg->off + 1,
 *                    otherwise it is dst_reg->off
 */
static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
                                   struct bpf_reg_state *dst_reg,
                                   enum bpf_reg_type type,
                                   bool range_right_open)
{
        struct bpf_func_state *state;
        struct bpf_reg_state *reg;
        int new_range;

        /* Neither a negative fixed offset nor a zero offset combined with
         * a right-open comparison gives us any range.
         */
        if (dst_reg->off < 0)
                return;
        if (range_right_open && dst_reg->off == 0)
                return;

        /* Risk of overflow.  For instance, ptr + (1<<63) may be less than
         * pkt_end, but only because it is also less than pkt.
         */
        if (dst_reg->umax_value > MAX_PACKET_OFF)
                return;
        if (dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF)
                return;

        new_range = range_right_open ? dst_reg->off + 1 : dst_reg->off;

        /* Register markings this function deals with, by example.
         *
         * Packet pointer in the dst register of the compare
         * (r2 == dst_reg, pkt_end == src_reg):
         *
         *   r2 = r3;
         *   r2 += 8;
         *   if (r2 > pkt_end) goto <handle exception>
         *   <access okay>
         *
         *   r2 = r3;
         *   r2 += 8;
         *   if (r2 < pkt_end) goto <access okay>
         *   <handle exception>
         *
         * Packet pointer in the src register of the compare
         * (pkt_end == dst_reg, r2 == src_reg):
         *
         *   r2 = r3;
         *   r2 += 8;
         *   if (pkt_end >= r2) goto <access okay>
         *   <handle exception>
         *
         *   r2 = r3;
         *   r2 += 8;
         *   if (pkt_end <= r2) goto <handle exception>
         *   <access okay>
         *
         * In all four snippets the registers start out as
         *   r2=pkt(id=n,off=8,r=0)
         *   r3=pkt(id=n,off=0,r=0)
         *
         * The task is to find r3 and mark it r3=pkt(id=n,off=0,r=8) or
         * r3=pkt(id=n,off=0,r=8-1), so that bytes [r3, r3 + 8) or
         * [r3, r3 + 8-1) respectively become safe to access, depending on
         * which check was performed.
         */

        /* Matching ids imply the same max_value.  The other register's
         * fixed offset is irrelevant here: if it is too large, the range
         * simply will not permit any access.
         * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
         */
        bpf_for_each_reg_in_vstate(vstate, state, reg, ({
                if (reg->type == type && reg->id == dst_reg->id) {
                        /* never shrink a range that was already checked */
                        if (reg->range < new_range)
                                reg->range = new_range;
                }
        }));
}

/*
 * Predict the outcome of the scalar comparison <reg1> <op> <reg2> from the
 * tracked tnums and unsigned/signed bounds of both registers.
 *
 * With @is_jmp32 set, the 32-bit sub-register bounds and the sub-register
 * part of the tnums are compared instead of the 64-bit ones.
 *
 * Returns 1 if the condition always holds, 0 if it never holds and -1 if
 * the tracked state does not decide it.
 */
static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
                                  u8 opcode, bool is_jmp32)
{
        struct tnum t1, t2;
        u64 umin1, umax1, umin2, umax2;
        s64 smin1, smax1, smin2, smax2;
        bool disjoint;

        if (is_jmp32) {
                t1 = tnum_subreg(reg1->var_off);
                t2 = tnum_subreg(reg2->var_off);
                umin1 = (u64)reg1->u32_min_value;
                umax1 = (u64)reg1->u32_max_value;
                smin1 = (s64)reg1->s32_min_value;
                smax1 = (s64)reg1->s32_max_value;
                umin2 = (u64)reg2->u32_min_value;
                umax2 = (u64)reg2->u32_max_value;
                smin2 = (s64)reg2->s32_min_value;
                smax2 = (s64)reg2->s32_max_value;
        } else {
                t1 = reg1->var_off;
                t2 = reg2->var_off;
                umin1 = reg1->umin_value;
                umax1 = reg1->umax_value;
                smin1 = reg1->smin_value;
                smax1 = reg1->smax_value;
                umin2 = reg2->umin_value;
                umax2 = reg2->umax_value;
                smin2 = reg2->smin_value;
                smax2 = reg2->smax_value;
        }

        switch (opcode) {
        case BPF_JEQ:
        case BPF_JNE:
                /* Two constants: compare them directly. The umin/umax and
                 * smin/smax checks would be redundant here because they
                 * all should match.
                 */
                if (tnum_is_const(t1) && tnum_is_const(t2))
                        return (t1.value == t2.value) == (opcode == BPF_JEQ);

                /* non-overlapping ranges can never be equal */
                disjoint = umin1 > umax2 || umax1 < umin2 ||
                           smin1 > smax2 || smax1 < smin2;

                /* if 64-bit ranges are inconclusive, see if we can utilize
                 * 32-bit subrange knowledge to eliminate branches that
                 * can't be taken a priori
                 */
                if (!disjoint && !is_jmp32)
                        disjoint = reg1->u32_min_value > reg2->u32_max_value ||
                                   reg1->u32_max_value < reg2->u32_min_value ||
                                   reg1->s32_min_value > reg2->s32_max_value ||
                                   reg1->s32_max_value < reg2->s32_min_value;

                /* provably different: JEQ is never taken, JNE always is */
                if (disjoint)
                        return opcode == BPF_JNE;
                break;
        case BPF_JSET:
                /* the test needs a constant on the right-hand side */
                if (!is_reg_const(reg2, is_jmp32)) {
                        swap(reg1, reg2);
                        swap(t1, t2);
                }
                if (!is_reg_const(reg2, is_jmp32))
                        return -1;
                /* some bit known to be set in t1 is also set in t2 */
                if ((~t1.mask & t1.value) & t2.value)
                        return 1;
                /* no bit that may be set in t1 is set in t2 */
                if (!((t1.mask | t1.value) & t2.value))
                        return 0;
                break;
        case BPF_JGT:
                if (umin1 > umax2)
                        return 1;
                if (umax1 <= umin2)
                        return 0;
                break;
        case BPF_JSGT:
                if (smin1 > smax2)
                        return 1;
                if (smax1 <= smin2)
                        return 0;
                break;
        case BPF_JLT:
                if (umax1 < umin2)
                        return 1;
                if (umin1 >= umax2)
                        return 0;
                break;
        case BPF_JSLT:
                if (smax1 < smin2)
                        return 1;
                if (smin1 >= smax2)
                        return 0;
                break;
        case BPF_JGE:
                if (umin1 >= umax2)
                        return 1;
                if (umax1 < umin2)
                        return 0;
                break;
        case BPF_JSGE:
                if (smin1 >= smax2)
                        return 1;
                if (smax1 < smin2)
                        return 0;
                break;
        case BPF_JLE:
                if (umax1 <= umin2)
                        return 1;
                if (umin1 > umax2)
                        return 0;
                break;
        case BPF_JSLE:
                if (smax1 <= smin2)
                        return 1;
                if (smin1 > smax2)
                        return 0;
                break;
        default:
                break;
        }

        return -1;
}

/* Map the jump opcode of "a <op> b" to the opcode that expresses the same
 * relation as "b <op> a". Only the bits above the low nibble of @opcode
 * take part in the selection; opcodes without an entry yield 0.
 */
static int flip_opcode(u32 opcode)
{
        switch (opcode >> 4) {
        /* symmetric relations are their own mirror image */
        case BPF_JEQ  >> 4:
                return BPF_JEQ;
        case BPF_JNE  >> 4:
                return BPF_JNE;
        case BPF_JSET >> 4:
                return BPF_JSET;
        /* ordered relations trade "greater" for "lesser" and vice versa */
        case BPF_JGE  >> 4:
                return BPF_JLE;
        case BPF_JGT  >> 4:
                return BPF_JLT;
        case BPF_JLE  >> 4:
                return BPF_JGE;
        case BPF_JLT  >> 4:
                return BPF_JGT;
        case BPF_JSGE >> 4:
                return BPF_JSLE;
        case BPF_JSGT >> 4:
                return BPF_JSLT;
        case BPF_JSLE >> 4:
                return BPF_JSGE;
        case BPF_JSLT >> 4:
                return BPF_JSGT;
        default:
                return 0;
        }
}

/* Predict a comparison between a packet pointer and pkt_end.
 *
 * Exactly one of @dst_reg / @src_reg is expected to be PTR_TO_PACKET_END;
 * the other one is the packet pointer under test. Returns 1 if the branch
 * is known to be taken, 0 if it is known not to be taken, -1 otherwise.
 */
static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg,
                                   struct bpf_reg_state *src_reg,
                                   u8 opcode)
{
        struct bpf_reg_state *pkt;
        bool beyond_end, at_end;

        /* normalize to "pkt <opcode> pkt_end" */
        if (src_reg->type != PTR_TO_PACKET_END) {
                if (dst_reg->type != PTR_TO_PACKET_END)
                        return -1;
                pkt = src_reg;
                opcode = flip_opcode(opcode);
        } else {
                pkt = dst_reg;
        }

        /* only the negative range markers carry a verdict */
        if (pkt->range >= 0)
                return -1;

        beyond_end = pkt->range == BEYOND_PKT_END;
        at_end = pkt->range == AT_PKT_END;

        if (opcode == BPF_JLE || opcode == BPF_JGT) {
                /* pkt <= pkt_end, pkt > pkt_end:
                 * pkt has at least one extra byte beyond pkt_end
                 */
                if (beyond_end)
                        return opcode == BPF_JGT;
        } else if (opcode == BPF_JLT || opcode == BPF_JGE) {
                /* pkt < pkt_end, pkt >= pkt_end */
                if (beyond_end || at_end)
                        return opcode == BPF_JGE;
        }

        return -1;
}

/* Work out which way "if (<reg1> opcode <reg2>) goto target;" goes.
 *
 * Return value:
 *  1 - the branch is taken, "goto target" executes
 *  0 - the branch is not taken, execution falls through to the next insn
 * -1 - cannot tell. Example: "if (reg1 < 5)" is unknown when the register
 *      value range is [0,10]
 */
static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
                           u8 opcode, bool is_jmp32)
{
        /* packet pointer against packet pointer, 64-bit compares only */
        if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32)
                return is_pkt_ptr_branch_taken(reg1, reg2, opcode);

        /* two scalars, not necessarily constants */
        if (!__is_pointer_value(false, reg1) && !__is_pointer_value(false, reg2))
                return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32);

        /* At least one side is a pointer. Arrange that reg2 is the scalar
         * and reg1 the pointer, then insist that reg2 is a constant.
         */
        if (!is_reg_const(reg2, is_jmp32)) {
                opcode = flip_opcode(opcode);
                swap(reg1, reg2);
        }
        if (!is_reg_const(reg2, is_jmp32))
                return -1;

        if (!reg_not_null(reg1))
                return -1;

        /* A valid pointer never compares equal to zero, which lets a test
         * against zero decide the branch. Any other constant is of no help.
         */
        if (reg_const_value(reg2, is_jmp32) != 0)
                return -1;

        if (opcode == BPF_JEQ)
                return 0;
        if (opcode == BPF_JNE)
                return 1;
        return -1;
}

/* Return the opcode describing the *false* outcome of a branch condition.
 * E.g., when "r1 < r2" does not hold, "r1 >= r2" does.
 * Opcodes without an entry map to 0.
 */
static u8 rev_opcode(u8 opcode)
{
        u8 negated = 0;

        switch (opcode) {
        case BPF_JEQ:
                negated = BPF_JNE;
                break;
        case BPF_JNE:
                negated = BPF_JEQ;
                break;
        /* BPF has no opcode for the reverse of JSET, so the BPF_X flag is
         * added to denote the reverse of that operation
         */
        case BPF_JSET:
                negated = BPF_JSET | BPF_X;
                break;
        case BPF_JSET | BPF_X:
                negated = BPF_JSET;
                break;
        case BPF_JGE:
                negated = BPF_JLT;
                break;
        case BPF_JGT:
                negated = BPF_JLE;
                break;
        case BPF_JLE:
                negated = BPF_JGT;
                break;
        case BPF_JLT:
                negated = BPF_JGE;
                break;
        case BPF_JSGE:
                negated = BPF_JSLT;
                break;
        case BPF_JSGT:
                negated = BPF_JSLE;
                break;
        case BPF_JSLE:
                negated = BPF_JSGT;
                break;
        case BPF_JSLT:
                negated = BPF_JSGE;
                break;
        default:
                break;
        }

        return negated;
}

/* Refine range knowledge for <reg1> <op> <reg2> conditional operation.
 *
 * Narrows, in place, the tracked bounds and/or tnum of @reg1 and @reg2
 * under the assumption that "<reg1> <opcode> <reg2>" holds. With @is_jmp32
 * set only the 32-bit bounds and the sub-register part of var_off are
 * touched, otherwise the 64-bit bounds and the full var_off.
 *
 * Besides the regular jump opcodes, BPF_JSET | BPF_X is accepted as the
 * negation of BPF_JSET (see rev_opcode()). Opcodes without a case below
 * leave both registers untouched.
 */
static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
                                u8 opcode, bool is_jmp32)
{
        struct tnum t;
        u64 val;

        /* In case of GE/GT/SGE/SGT, reuse LE/LT/SLE/SLT logic from below:
         * "reg1 >= reg2" is handled as "reg2 <= reg1" by flipping the opcode
         * and swapping the (local) register pointers.
         */
        switch (opcode) {
        case BPF_JGE:
        case BPF_JGT:
        case BPF_JSGE:
        case BPF_JSGT:
                opcode = flip_opcode(opcode);
                swap(reg1, reg2);
                break;
        default:
                break;
        }

        switch (opcode) {
        case BPF_JEQ:
                /* Equal registers share the intersection of their ranges:
                 * tighten reg1 with reg2's bounds, then copy the result to
                 * reg2, and intersect the tnums likewise.
                 */
                if (is_jmp32) {
                        reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
                        reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
                        reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
                        reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
                        reg2->u32_min_value = reg1->u32_min_value;
                        reg2->u32_max_value = reg1->u32_max_value;
                        reg2->s32_min_value = reg1->s32_min_value;
                        reg2->s32_max_value = reg1->s32_max_value;

                        /* only the low 32 bits of var_off are known equal */
                        t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off));
                        reg1->var_off = tnum_with_subreg(reg1->var_off, t);
                        reg2->var_off = tnum_with_subreg(reg2->var_off, t);
                } else {
                        reg1->umin_value = max(reg1->umin_value, reg2->umin_value);
                        reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
                        reg1->smin_value = max(reg1->smin_value, reg2->smin_value);
                        reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
                        reg2->umin_value = reg1->umin_value;
                        reg2->umax_value = reg1->umax_value;
                        reg2->smin_value = reg1->smin_value;
                        reg2->smax_value = reg1->smax_value;

                        reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off);
                        reg2->var_off = reg1->var_off;
                }
                break;
        case BPF_JNE:
                /* make reg2 the constant operand if either one is */
                if (!is_reg_const(reg2, is_jmp32))
                        swap(reg1, reg2);
                if (!is_reg_const(reg2, is_jmp32))
                        break;

                /* try to recompute the bound of reg1 if reg2 is a const and
                 * is exactly the edge of reg1.
                 */
                val = reg_const_value(reg2, is_jmp32);
                if (is_jmp32) {
                        /* u32_min_value is not equal to 0xffffffff at this point,
                         * because otherwise u32_max_value is 0xffffffff as well,
                         * in such a case both reg1 and reg2 would be constants,
                         * jump would be predicted and reg_set_min_max() won't
                         * be called.
                         *
                         * Same reasoning works for all {u,s}{min,max}{32,64} cases
                         * below.
                         */
                        if (reg1->u32_min_value == (u32)val)
                                reg1->u32_min_value++;
                        if (reg1->u32_max_value == (u32)val)
                                reg1->u32_max_value--;
                        if (reg1->s32_min_value == (s32)val)
                                reg1->s32_min_value++;
                        if (reg1->s32_max_value == (s32)val)
                                reg1->s32_max_value--;
                } else {
                        if (reg1->umin_value == (u64)val)
                                reg1->umin_value++;
                        if (reg1->umax_value == (u64)val)
                                reg1->umax_value--;
                        if (reg1->smin_value == (s64)val)
                                reg1->smin_value++;
                        if (reg1->smax_value == (s64)val)
                                reg1->smax_value--;
                }
                break;
        case BPF_JSET:
                /* make reg2 the constant operand if either one is */
                if (!is_reg_const(reg2, is_jmp32))
                        swap(reg1, reg2);
                if (!is_reg_const(reg2, is_jmp32))
                        break;
                val = reg_const_value(reg2, is_jmp32);
                /* BPF_JSET (i.e., TRUE branch, *not* BPF_JSET | BPF_X)
                 * requires single bit to learn something useful. E.g., if we
                 * know that `r1 & 0x3` is true, then which bits (0, 1, or both)
                 * are actually set? We can learn something definite only if
                 * it's a single-bit value to begin with.
                 *
                 * BPF_JSET | BPF_X (i.e., negation of BPF_JSET) doesn't have
                 * this restriction. I.e., !(r1 & 0x3) means neither bit 0 nor
                 * bit 1 is set, which we can readily use in adjustments.
                 */
                if (!is_power_of_2(val))
                        break;
                /* the single tested bit is known to be set in reg1 */
                if (is_jmp32) {
                        t = tnum_or(tnum_subreg(reg1->var_off), tnum_const(val));
                        reg1->var_off = tnum_with_subreg(reg1->var_off, t);
                } else {
                        reg1->var_off = tnum_or(reg1->var_off, tnum_const(val));
                }
                break;
        case BPF_JSET | BPF_X: /* reverse of BPF_JSET, see rev_opcode() */
                /* make reg2 the constant operand if either one is */
                if (!is_reg_const(reg2, is_jmp32))
                        swap(reg1, reg2);
                if (!is_reg_const(reg2, is_jmp32))
                        break;
                val = reg_const_value(reg2, is_jmp32);
                /* every bit of val is known to be clear in reg1 */
                if (is_jmp32) {
                        t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val));
                        reg1->var_off = tnum_with_subreg(reg1->var_off, t);
                } else {
                        reg1->var_off = tnum_and(reg1->var_off, tnum_const(~val));
                }
                break;
        case BPF_JLE:
                /* reg1 <= reg2 (unsigned): reg1 cannot exceed reg2's max,
                 * reg2 cannot be below reg1's min
                 */
                if (is_jmp32) {
                        reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
                        reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
                } else {
                        reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
                        reg2->umin_value = max(reg1->umin_value, reg2->umin_value);
                }
                break;
        case BPF_JLT:
                /* reg1 < reg2 (unsigned): same as JLE, but strict, hence
                 * the -1 / +1 adjustments
                 */
                if (is_jmp32) {
                        reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1);
                        reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value);
                } else {
                        reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1);
                        reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value);
                }
                break;
        case BPF_JSLE:
                /* reg1 <= reg2 (signed) */
                if (is_jmp32) {
                        reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
                        reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
                } else {
                        reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
                        reg2->smin_value = max(reg1->smin_value, reg2->smin_value);
                }
                break;
        case BPF_JSLT:
                /* reg1 < reg2 (signed) */
                if (is_jmp32) {
                        reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1);
                        reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value);
                } else {
                        reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1);
                        reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value);
                }
                break;
        default:
                /* nothing to learn from any other opcode */
                return;
        }
}

/* Refine the bounds of both compare operands for each outcome of a
 * conditional jump. This applies when both operands are SCALAR_VALUE
 * registers (or when a BPF_K check supplies a fake SCALAR_VALUE that
 * stands in for insn->imm). Similar adjustments would technically be
 * possible for pointers into the same object, but that is not supported
 * right now.
 *
 * @true_reg1/@true_reg2 receive what is learned when the branch is taken,
 * @false_reg1/@false_reg2 what is learned on fall-through.
 *
 * Returns 0 on success or the first non-zero result of
 * reg_bounds_sanity_check().
 */
static int reg_set_min_max(struct bpf_verifier_env *env,
                           struct bpf_reg_state *true_reg1,
                           struct bpf_reg_state *true_reg2,
                           struct bpf_reg_state *false_reg1,
                           struct bpf_reg_state *false_reg2,
                           u8 opcode, bool is_jmp32)
{
        bool both_scalar = false_reg1->type == SCALAR_VALUE &&
                           false_reg2->type == SCALAR_VALUE;
        int err;

        /* A compare teaches us nothing about a pointer's variable offset
         * (unless both were pointers into the same object, which we don't
         * bother with), so bail out if either operand is one.
         */
        if (!both_scalar)
                return 0;

        /* fallthrough (FALSE) branch: the reversed condition holds */
        regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32);
        reg_bounds_sync(false_reg1);
        reg_bounds_sync(false_reg2);

        /* jump (TRUE) branch: the condition itself holds */
        regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32);
        reg_bounds_sync(true_reg1);
        reg_bounds_sync(true_reg2);

        /* stop at the first register whose refined bounds are inconsistent */
        err = reg_bounds_sanity_check(env, true_reg1, "true_reg1");
        if (err)
                return err;
        err = reg_bounds_sanity_check(env, true_reg2, "true_reg2");
        if (err)
                return err;
        err = reg_bounds_sanity_check(env, false_reg1, "false_reg1");
        if (err)
                return err;
        return reg_bounds_sanity_check(env, false_reg2, "false_reg2");
}

/* Resolve the nullness of a single register once a NULL check on @id has
 * been decided for the current path.
 *
 * Only registers whose type may be NULL and whose id equals @id are touched.
 * With @is_null the register becomes a SCALAR_VALUE and drops its id and
 * ref_obj_id; otherwise it is marked not-NULL and its id is cleared unless
 * the register may point to a spin lock.
 *
 * @state is not used by this function.
 */
static void mark_ptr_or_null_reg(struct bpf_func_state *state,
                                 struct bpf_reg_state *reg, u32 id,
                                 bool is_null)
{
        bool off_allowed;

        if (!type_may_be_null(reg->type) || reg->id != id)
                return;

        /* Apart from RCU registers, a zero id here is unexpected. */
        if (!is_rcu_reg(reg) && WARN_ON_ONCE(!reg->id))
                return;

        /* The variable part of the old offset should be known-zero, since
         * pointer arithmetic is not allowed on pointers that might be NULL.
         * If it is not, leave the register unconverted.
         */
        if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0)))
                return;

        /* The fixed offset should be zero as well, except that some helpers
         * returning local kptrs advance the offset of the returned pointer;
         * for allocated objects and non-owning refs a non-zero reg->off is
         * therefore tolerated.
         */
        off_allowed = type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type);
        if (!off_allowed && WARN_ON_ONCE(reg->off))
                return;

        if (!is_null) {
                mark_ptr_not_null_reg(reg);

                /* For a not-NULL pointer, reg->ref_obj_id is reset later in
                 * release_reference(). reg->id is still needed by spin_lock
                 * pointers; for every other type it can be dropped now.
                 */
                if (!reg_may_point_to_spin_lock(reg))
                        reg->id = 0;
                return;
        }

        /* NULL: id and ref_obj_id are no longer needed, so clear them to
         * give state pruning a chance to take effect.
         */
        reg->type = SCALAR_VALUE;
        reg->id = 0;
        reg->ref_obj_id = 0;
}

/* Apply the outcome of a NULL check on register @regno of the current frame
 * to every register in @vstate, via mark_ptr_or_null_reg() with that
 * register's id.
 *
 * The logic resembles find_good_pkt_pointers(); the two could eventually be
 * folded together.
 */
static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
                                  bool is_null)
{
        struct bpf_func_state *cur_frame = vstate->frame[vstate->curframe];
        struct bpf_reg_state *checked = &cur_frame->regs[regno];
        struct bpf_func_state *frame;
        struct bpf_reg_state *reg;
        u32 ref_obj_id = checked->ref_obj_id;
        u32 id = checked->id;

        if (is_null && ref_obj_id && ref_obj_id == id) {
                /* regs[regno] is in the " == NULL" branch.
                 * Nobody could have freed the reference state before the
                 * NULL check was done, so releasing it must succeed.
                 */
                WARN_ON_ONCE(release_reference_state(cur_frame, id));
        }

        bpf_for_each_reg_in_vstate(vstate, frame, reg, ({
                mark_ptr_or_null_reg(frame, reg, id, is_null);
        }));
}

static bool try_match_pkt_pointers(const struct bpf_insn *insn,
                                   struct bpf_reg_state *dst_reg,
                                   struct bpf_reg_state *src_reg,
                                   struct bpf_verifier_state *this_branch,
                                   struct bpf_verifier_state *other_branch)
{
        if (BPF_SRC(insn->code) != BPF_X)
                return false;

        /* Pointers are always 64-bit. */
        if (BPF_CLASS(insn->code) == BPF_JMP32)
                return false;

        switch (BPF_OP(insn->code)) {
        case BPF_JGT:
                if ((dst_reg->type == PTR_TO_PACKET &&
                     src_reg->type == PTR_TO_PACKET_END) ||
                    (dst_reg->type == PTR_TO_PACKET_META &&
                     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
                        /* pkt_data' > pkt_end, pkt_meta' > pkt_data */
                        find_good_pkt_pointers(this_branch, dst_reg,
                                               dst_reg->type, false);
                        mark_pkt_end(other_branch, insn->dst_reg, true);
                } else if ((dst_reg->type == PTR_TO_PACKET_END &&
                            src_reg->type == PTR_TO_PACKET) ||
                           (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
                            src_reg->type == PTR_TO_PACKET_META)) {
                        /* pkt_end > pkt_data', pkt_data > pkt_meta' */
                        find_good_pkt_pointers(other_branch, src_reg,
                                               src_reg->type, true);
                        mark_pkt_end(this_branch, insn->src_reg, false);
                } else {
                        return false;
                }
                break;
        case BPF_JLT:
                if ((dst_reg->type == PTR_TO_PACKET &&
                     src_reg->type == PTR_TO_PACKET_END) ||
                    (dst_reg->type == PTR_TO_PACKET_META &&
                     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
                        /* pkt_data' < pkt_end, pkt_meta' < pkt_data */
                        find_good_pkt_pointers(other_branch, dst_reg,
                                               dst_reg->type, true);
                        mark_pkt_end(this_branch, insn->dst_reg, false);
                } else if ((dst_reg->type == PTR_TO_PACKET_END &&
                            src_reg->type == PTR_TO_PACKET) ||
                           (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
                            src_reg->type == PTR_TO_PACKET_META)) {
                        /* pkt_end < pkt_data', pkt_data > pkt_meta' */
                        find_good_pkt_pointers(this_branch, src_reg,
                                               src_reg->type, false);
                        mark_pkt_end(other_branch, insn->src_reg, true);
                } else {
                        return false;
                }
                break;
        case BPF_JGE:
                if ((dst_reg->type == PTR_TO_PACKET &&
                     src_reg->type == PTR_TO_PACKET_END) ||
                    (dst_reg->type == PTR_TO_PACKET_META &&
                     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
                        /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */
                        find_good_pkt_pointers(this_branch, dst_reg,
                                               dst_reg->type, true);
                        mark_pkt_end(other_branch, insn->dst_reg, false);
                } else if ((dst_reg->type == PTR_TO_PACKET_END &&
                            src_reg->type == PTR_TO_PACKET) ||
                           (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
                            src_reg->type == PTR_TO_PACKET_META)) {
                        /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */
                        find_good_pkt_pointers(other_branch, src_reg,
                                               src_reg->type, false);
                        mark_pkt_end(this_branch, insn->src_reg, true);
                } else {
                        return false;
                }
                break;
        case BPF_JLE:
                if ((dst_reg->type == PTR_TO_PACKET &&
                     src_reg->type == PTR_TO_PACKET_END) ||
                    (dst_reg->type == PTR_TO_PACKET_META &&
                     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
                        /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */
                        find_good_pkt_pointers(other_branch, dst_reg,
                                               dst_reg->type, false);
                        mark_pkt_end(this_branch, insn->dst_reg, true);
                } else if ((dst_reg->type == PTR_TO_PACKET_END &&
                            src_reg->type == PTR_TO_PACKET) ||
                           (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
                            src_reg->type == PTR_TO_PACKET_META)) {
                        /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */
                        find_good_pkt_pointers(this_branch, src_reg,
                                               src_reg->type, true);
                        mark_pkt_end(other_branch, insn->src_reg, false);
                } else {
                        return false;
                }
                break;
        default:
                return false;
        }

        return true;
}

/* Copy the state of @known_reg into every SCALAR_VALUE register of @vstate
 * that carries the same id as @known_reg.
 */
static void find_equal_scalars(struct bpf_verifier_state *vstate,
                               struct bpf_reg_state *known_reg)
{
        struct bpf_func_state *frame;
        struct bpf_reg_state *cand;

        bpf_for_each_reg_in_vstate(vstate, frame, cand, ({
                if (cand->type == SCALAR_VALUE) {
                        if (cand->id == known_reg->id)
                                copy_register_state(cand, known_reg);
                }
        }));
}

/* Verify one conditional jump instruction (BPF_JMP / BPF_JMP32 class, any
 * opcode other than BPF_JA up to and including BPF_JCOND).
 *
 * @env:      verifier environment; env->cur_state is the state being verified.
 * @insn:     the jump instruction.
 * @insn_idx: in/out index of @insn. It is advanced by insn->off when the
 *            current state continues at the jump target (branch known to be
 *            taken, or may_goto); it is left untouched when the current state
 *            continues with the fall-through instruction.
 *
 * When the outcome of the comparison cannot be decided, the jump target is
 * queued via push_stack() as a second state ("other_branch") and both states
 * get their register bounds / pointer nullness / packet ranges refined
 * according to the branch they represent.
 *
 * Returns 0 on success or a negative error: -EINVAL for a malformed
 * instruction, -EACCES for a prohibited pointer comparison, -EFAULT when
 * sanitize_speculative_path() or push_stack() fails on the regular path,
 * -ENOMEM when push_stack() fails for may_goto, or whatever error
 * check_reg_arg(), mark_chain_precision() or reg_set_min_max() report.
 */
static int check_cond_jmp_op(struct bpf_verifier_env *env,
                             struct bpf_insn *insn, int *insn_idx)
{
        struct bpf_verifier_state *this_branch = env->cur_state;
        struct bpf_verifier_state *other_branch;
        struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
        struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
        struct bpf_reg_state *eq_branch_regs;
        u8 opcode = BPF_OP(insn->code);
        bool is_jmp32;
        int pred = -1;
        int err;

        /* Only conditional jumps are expected to reach here. */
        if (opcode == BPF_JA || opcode > BPF_JCOND) {
                verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
                return -EINVAL;
        }

        /* may_goto: no operands are compared. The fall-through instruction is
         * queued as a separate state and the current state continues at the
         * jump target.
         */
        if (opcode == BPF_JCOND) {
                struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
                int idx = *insn_idx;

                /* Must be a 64-bit BPF_JCOND with src_reg == BPF_MAY_GOTO,
                 * all other fields zero and a non-zero jump offset.
                 */
                if (insn->code != (BPF_JMP | BPF_JCOND) ||
                    insn->src_reg != BPF_MAY_GOTO ||
                    insn->dst_reg || insn->imm || insn->off == 0) {
                        verbose(env, "invalid may_goto off %d imm %d\n",
                                insn->off, insn->imm);
                        return -EINVAL;
                }
                prev_st = find_prev_entry(env, cur_st->parent, idx);

                /* branch out 'fallthrough' insn as a new state to explore */
                queued_st = push_stack(env, idx + 1, idx, false);
                if (!queued_st)
                        return -ENOMEM;

                queued_st->may_goto_depth++;
                /* Only widen when a previous entry for this insn was found. */
                if (prev_st)
                        widen_imprecise_scalars(env, prev_st, queued_st);
                *insn_idx += insn->off;
                return 0;
        }

        /* check src2 operand (dst_reg is only read by a conditional jump,
         * hence SRC_OP)
         */
        err = check_reg_arg(env, insn->dst_reg, SRC_OP);
        if (err)
                return err;

        dst_reg = &regs[insn->dst_reg];
        if (BPF_SRC(insn->code) == BPF_X) {
                /* Register form: the immediate is reserved and must be 0. */
                if (insn->imm != 0) {
                        verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
                        return -EINVAL;
                }

                /* check src1 operand */
                err = check_reg_arg(env, insn->src_reg, SRC_OP);
                if (err)
                        return err;

                src_reg = &regs[insn->src_reg];
                /* Comparing against a pointer in src_reg is only let through
                 * here when both operands are packet pointers of some kind.
                 */
                if (!(reg_is_pkt_pointer_any(dst_reg) && reg_is_pkt_pointer_any(src_reg)) &&
                    is_pointer_value(env, insn->src_reg)) {
                        verbose(env, "R%d pointer comparison prohibited\n",
                                insn->src_reg);
                        return -EACCES;
                }
        } else {
                /* Immediate form: src_reg is reserved and must be BPF_REG_0. */
                if (insn->src_reg != BPF_REG_0) {
                        verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
                        return -EINVAL;
                }
                /* Represent the immediate as a known-constant scalar in
                 * env->fake_reg[0] so the code below can treat both forms
                 * as register-vs-register.
                 */
                src_reg = &env->fake_reg[0];
                memset(src_reg, 0, sizeof(*src_reg));
                src_reg->type = SCALAR_VALUE;
                __mark_reg_known(src_reg, insn->imm);
        }

        is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
        /* pred is handled below as: 1 - follow only the jump, 0 - follow only
         * the fall-through, any other value - explore both branches.
         */
        pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32);
        if (pred >= 0) {
                /* If we get here with a dst_reg pointer type it is because
                 * above is_branch_taken() special cased the 0 comparison.
                 */
                if (!__is_pointer_value(false, dst_reg))
                        err = mark_chain_precision(env, insn->dst_reg);
                if (BPF_SRC(insn->code) == BPF_X && !err &&
                    !__is_pointer_value(false, src_reg))
                        err = mark_chain_precision(env, insn->src_reg);
                if (err)
                        return err;
        }

        if (pred == 1) {
                /* Only follow the goto, ignore fall-through. If needed, push
                 * the fall-through branch for simulation under speculative
                 * execution.
                 */
                if (!env->bypass_spec_v1 &&
                    !sanitize_speculative_path(env, insn, *insn_idx + 1,
                                               *insn_idx))
                        return -EFAULT;
                if (env->log.level & BPF_LOG_LEVEL)
                        print_insn_state(env, this_branch->frame[this_branch->curframe]);
                *insn_idx += insn->off;
                return 0;
        } else if (pred == 0) {
                /* Only follow the fall-through branch, since that's where the
                 * program will go. If needed, push the goto branch for
                 * simulation under speculative execution.
                 */
                if (!env->bypass_spec_v1 &&
                    !sanitize_speculative_path(env, insn,
                                               *insn_idx + insn->off + 1,
                                               *insn_idx))
                        return -EFAULT;
                if (env->log.level & BPF_LOG_LEVEL)
                        print_insn_state(env, this_branch->frame[this_branch->curframe]);
                return 0;
        }

        /* Outcome unknown: queue the jump target as other_branch. The current
         * state (this_branch) carries on as the fall-through.
         */
        other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
                                  false);
        if (!other_branch)
                return -EFAULT;
        other_branch_regs = other_branch->frame[other_branch->curframe]->regs;

        /* In both calls below the other_branch registers are passed as the
         * "true" pair (jump taken) and the current registers as the "false"
         * pair (fall-through).
         */
        if (BPF_SRC(insn->code) == BPF_X) {
                err = reg_set_min_max(env,
                                      &other_branch_regs[insn->dst_reg],
                                      &other_branch_regs[insn->src_reg],
                                      dst_reg, src_reg, opcode, is_jmp32);
        } else /* BPF_SRC(insn->code) == BPF_K */ {
                /* reg_set_min_max() can mangle the fake_reg. Make a copy
                 * so that these are two different memory locations. The
                 * src_reg is not used beyond here in context of K.
                 */
                memcpy(&env->fake_reg[1], &env->fake_reg[0],
                       sizeof(env->fake_reg[0]));
                err = reg_set_min_max(env,
                                      &other_branch_regs[insn->dst_reg],
                                      &env->fake_reg[0],
                                      dst_reg, &env->fake_reg[1],
                                      opcode, is_jmp32);
        }
        if (err)
                return err;

        /* Copy the refined state of each scalar operand that has an id to
         * every other scalar with the same id, separately in each branch.
         * The id is expected to be identical in both branches.
         */
        if (BPF_SRC(insn->code) == BPF_X &&
            src_reg->type == SCALAR_VALUE && src_reg->id &&
            !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
                find_equal_scalars(this_branch, src_reg);
                find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
        }
        if (dst_reg->type == SCALAR_VALUE && dst_reg->id &&
            !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) {
                find_equal_scalars(this_branch, dst_reg);
                find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
        }

        /* if one pointer register is compared to another pointer
         * register check if PTR_MAYBE_NULL could be lifted.
         * E.g. register A - maybe null
         *      register B - not null
         * for JNE A, B, ... - A is not null in the false branch;
         * for JEQ A, B, ... - A is not null in the true branch.
         *
         * Since PTR_TO_BTF_ID points to a kernel struct that does
         * not need to be null checked by the BPF program, i.e.,
         * could be null even without PTR_MAYBE_NULL marking, so
         * only propagate nullness when neither reg is that type.
         */
        if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X &&
            __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) &&
            type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type) &&
            base_type(src_reg->type) != PTR_TO_BTF_ID &&
            base_type(dst_reg->type) != PTR_TO_BTF_ID) {
                /* eq_branch_regs: register file of the branch in which the
                 * two pointers compared equal (jump target for JEQ,
                 * fall-through for JNE); NULL for any other opcode.
                 */
                eq_branch_regs = NULL;
                switch (opcode) {
                case BPF_JEQ:
                        eq_branch_regs = other_branch_regs;
                        break;
                case BPF_JNE:
                        eq_branch_regs = regs;
                        break;
                default:
                        /* do nothing */
                        break;
                }
                if (eq_branch_regs) {
                        /* Exactly one operand may be NULL (checked above);
                         * that is the one to mark not-NULL.
                         */
                        if (type_may_be_null(src_reg->type))
                                mark_ptr_not_null_reg(&eq_branch_regs[insn->src_reg]);
                        else
                                mark_ptr_not_null_reg(&eq_branch_regs[insn->dst_reg]);
                }
        }

        /* detect if R == 0 where R is returned from bpf_map_lookup_elem().
         * NOTE: these optimizations below are related with pointer comparison
         *       which will never be JMP32.
         */
        if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
            insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
            type_may_be_null(dst_reg->type)) {
                /* Mark all identical registers in each branch as either
                 * safe or unknown depending R == 0 or R != 0 conditional.
                 * The fall-through (this_branch) is the NULL side for JNE,
                 * the jump target (other_branch) is the NULL side for JEQ.
                 */
                mark_ptr_or_null_regs(this_branch, insn->dst_reg,
                                      opcode == BPF_JNE);
                mark_ptr_or_null_regs(other_branch, insn->dst_reg,
                                      opcode == BPF_JEQ);
        } else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg],
                                           this_branch, other_branch) &&
                   is_pointer_value(env, insn->dst_reg)) {
                /* Neither a NULL check nor a recognised packet pointer
                 * comparison, yet dst_reg holds a pointer value.
                 */
                verbose(env, "R%d pointer comparison prohibited\n",
                        insn->dst_reg);
                return -EACCES;
        }
        if (env->log.level & BPF_LOG_LEVEL)
                print_insn_state(env, this_branch->frame[this_branch->curframe]);
        return 0;
}

/* verify BPF_LD_IMM64 instruction */
static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
        struct bpf_insn_aux_data *aux = cur_aux(env);
        struct bpf_reg_state *regs = cur_regs(env);
        struct bpf_reg_state *dst_reg;
        struct bpf_map *map;
        int err;

        if (BPF_SIZE(insn->code) != BPF_DW) {
                verbose(env, "invalid BPF_LD_IMM insn\n");
                return -EINVAL;
        }
        if (insn->off != 0) {
                verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
                return -EINVAL;
        }

        err = check_reg_arg(env, insn->dst_reg, DST_OP);
        if (err)
                return err;

        dst_reg = &regs[insn->dst_reg];
        if (insn->src_reg == 0) {
                u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;

                dst_reg->type = SCALAR_VALUE;
                __mark_reg_known(&regs[insn->dst_reg], imm);
                return 0;
        }

        /* All special src_reg cases are listed below. From this point onwards
         * we either succeed and assign a corresponding dst_reg->type after
         * zeroing the offset, or fail and reject the program.
         */
        mark_reg_known_zero(env, regs, insn->dst_reg);

        if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
                dst_reg->type = aux->btf_var.reg_type;
                switch (base_type(dst_reg->type)) {
                case PTR_TO_MEM:
                        dst_reg->mem_size = aux->btf_var.mem_size;
                        break;
                case PTR_TO_BTF_ID:
                        dst_reg->btf = aux->btf_var.btf;
                        dst_reg->btf_id = aux->btf_var.btf_id;
                        break;
                default:
                        verbose(env, "bpf verifier is misconfigured\n");
                        return -EFAULT;
                }
                return 0;
        }

        if (insn->src_reg == BPF_PSEUDO_FUNC) {
                struct bpf_prog_aux *aux = env->prog->aux;
                u32 subprogno = find_subprog(env,
                                             env->insn_idx + insn->imm + 1);

                if (!aux->func_info) {
                        verbose(env, "missing btf func_info\n");
                        return -EINVAL;
                }
                if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) {
                        verbose(env, "callback function not static\n");
                        return -EINVAL;
                }

                dst_reg->type = PTR_TO_FUNC;
                dst_reg->subprogno = subprogno;
                return 0;
        }

        map = env->used_maps[aux->map_index];
        dst_reg->map_ptr = map;

        if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
            insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
                if (map->map_type == BPF_MAP_TYPE_ARENA) {
                        __mark_reg_unknown(env, dst_reg);
                        return 0;
                }
                dst_reg->type = PTR_TO_MAP_VALUE;
                dst_reg->off = aux->map_off;
                WARN_ON_ONCE(map->max_entries != 1);
                /* We want reg->id to be same (0) as map_value is not distinct */
        } else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
                   insn->src_reg == BPF_PSEUDO_MAP_IDX) {
                dst_reg->type = CONST_PTR_TO_MAP;
        } else {
                verbose(env, "bpf verifier is misconfigured\n");
                return -EINVAL;
        }

        return 0;
}

/* Return true for the program types accepted by this check: socket filter,
 * sched_cls and sched_act. Every other type yields false.
 */
static bool may_access_skb(enum bpf_prog_type type)
{
        return type == BPF_PROG_TYPE_SOCKET_FILTER ||
               type == BPF_PROG_TYPE_SCHED_CLS ||
               type == BPF_PROG_TYPE_SCHED_ACT;
}

/* verify safety of LD_ABS|LD_IND instructions:
 * - they can only appear in the programs where ctx == skb
 * - since they are wrappers of function calls, they scratch R1-R5 registers,
 *   preserve R6-R9, and store return value into R0
 *
 * Implicit input:
 *   ctx == skb == R6 == CTX
 *
 * Explicit input:
 *   SRC == any register
 *   IMM == 32-bit immediate
 *
 * Output:
 *   R0 - 8/16/32-bit skb data converted to cpu endianness
 */
static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
        struct bpf_reg_state *regs = cur_regs(env);
        static const int ctx_reg = BPF_REG_6;
        u8 mode = BPF_MODE(insn->code);
        int i, err;

        if (!may_access_skb(resolve_prog_type(env->prog))) {
                verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
                return -EINVAL;
        }

        if (!env->ops->gen_ld_abs) {
                verbose(env, "bpf verifier is misconfigured\n");
                return -EINVAL;
        }

        if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
            BPF_SIZE(insn->code) == BPF_DW ||
            (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
                verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
                return -EINVAL;
        }

        /* check whether implicit source operand (register R6) is readable */
        err = check_reg_arg(env, ctx_reg, SRC_OP);
        if (err)
                return err;

        /* Disallow usage of BPF_LD_[ABS|IND] with reference tracking, as
         * gen_ld_abs() may terminate the program at runtime, leading to
         * reference leak.
         */
        err = check_reference_leak(env, false);
        if (err) {
                verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
                return err;
        }

        if (env->cur_state->active_lock.ptr) {
                verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
                return -EINVAL;
        }

        if (env->cur_state->active_rcu_lock) {
                verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n");
                return -EINVAL;
        }

        if (env->cur_state->active_preempt_lock) {
                verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_preempt_disable-ed region\n");
                return -EINVAL;
        }

        if (regs[ctx_reg].type != PTR_TO_CTX) {
                verbose(env,
                        "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
                return -EINVAL;
        }

        if (mode == BPF_IND) {
                /* check explicit source operand */
                err = check_reg_arg(env, insn->src_reg, SRC_OP);
                if (err)
                        return err;
        }

        err = check_ptr_off_reg(env, &regs[ctx_reg], ctx_reg);
        if (err < 0)
                return err;

        /* reset caller saved regs to unreadable */
        for (i = 0; i < CALLER_SAVED_REGS; i++) {
                mark_reg_not_init(env, regs, caller_saved[i]);
                check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
        }

        /* mark destination R0 register as readable, since it contains
         * the value fetched from the packet.
         * Already marked as written above.
         */
        mark_reg_unknown(env, regs, BPF_REG_0);
        /* ld_abs load up to 32-bit skb data. */
        regs[BPF_REG_0].subreg_def = env->insn_idx + 1;
        return 0;
}

/* Verify that the value held in register @regno is a legal return value at the
 * exit being processed.
 *
 * @env:      verifier environment
 * @regno:    register that carries the return value
 * @reg_name: register name forwarded to verbose_invalid_scalar() for the
 *            diagnostic
 *
 * The kind of exit is taken from frame[0] of the current state: main program,
 * subprogram, async callback or exception callback.  Depending on the program
 * type and expected attach type the value is either left unconstrained or must
 * be a scalar inside a specific range (0..1 unless overridden below).
 *
 * Return: 0 if the return value is acceptable; -EACCES if is_pointer_value()
 * reports the register as a pointer; -EINVAL if it is not a scalar or lies
 * outside the permitted range; -ENOTSUPP for a tracing attach type not handled
 * here; otherwise the error from check_reg_arg() or mark_chain_precision().
 */
static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name)
{
        const char *exit_ctx = "At program exit";
        struct tnum enforce_attach_type_range = tnum_unknown;
        const struct bpf_prog *prog = env->prog;
        struct bpf_reg_state *reg;
        /* default permitted range; replaced per program/attach type below */
        struct bpf_retval_range range = retval_range(0, 1);
        enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
        int err;
        struct bpf_func_state *frame = env->cur_state->frame[0];
        const bool is_subprog = frame->subprogno;

        /* LSM and struct_ops func-ptr's return type could be "void" */
        if (!is_subprog || frame->in_exception_callback_fn) {
                switch (prog_type) {
                case BPF_PROG_TYPE_LSM:
                        if (prog->expected_attach_type == BPF_LSM_CGROUP)
                                /* See below, can be 0 or 0-1 depending on hook. */
                                break;
                        fallthrough;
                case BPF_PROG_TYPE_STRUCT_OPS:
                        /* no return type in the attach prototype: R0 is not
                         * inspected at all
                         */
                        if (!prog->aux->attach_func_proto->type)
                                return 0;
                        break;
                default:
                        break;
                }
        }

        /* eBPF calling convention is such that R0 is used
         * to return the value from eBPF program.
         * Make sure that it's readable at this time
         * of bpf_exit, which means that program wrote
         * something into it earlier
         */
        err = check_reg_arg(env, regno, SRC_OP);
        if (err)
                return err;

        if (is_pointer_value(env, regno)) {
                verbose(env, "R%d leaks addr as return value\n", regno);
                return -EACCES;
        }

        reg = cur_regs(env) + regno;

        if (frame->in_async_callback_fn) {
                /* enforce return zero from async callbacks like timer */
                exit_ctx = "At async callback return";
                range = retval_range(0, 0);
                goto enforce_retval;
        }

        /* Plain subprogram exit: the value only has to be a scalar, no range
         * is enforced.
         */
        if (is_subprog && !frame->in_exception_callback_fn) {
                if (reg->type != SCALAR_VALUE) {
                        verbose(env, "At subprogram exit the register R%d is not a scalar value (%s)\n",
                                regno, reg_type_str(env, reg->type));
                        return -EINVAL;
                }
                return 0;
        }

        /* Select the permitted range for this program type.  A "break" falls
         * through to the range enforcement at enforce_retval; a "return 0"
         * accepts any value that passed the checks above.
         */
        switch (prog_type) {
        case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
                /* recvmsg/getpeername/getsockname hooks: exactly 1 */
                if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
                    env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
                    env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG ||
                    env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
                    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
                    env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME ||
                    env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
                    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME ||
                    env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME)
                        range = retval_range(1, 1);
                /* bind hooks: 0..3 */
                if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
                    env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
                        range = retval_range(0, 3);
                break;
        case BPF_PROG_TYPE_CGROUP_SKB:
                if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
                        range = retval_range(0, 3);
                        /* consulted after the range check, see the end of
                         * this function
                         */
                        enforce_attach_type_range = tnum_range(2, 3);
                }
                break;
        case BPF_PROG_TYPE_CGROUP_SOCK:
        case BPF_PROG_TYPE_SOCK_OPS:
        case BPF_PROG_TYPE_CGROUP_DEVICE:
        case BPF_PROG_TYPE_CGROUP_SYSCTL:
        case BPF_PROG_TYPE_CGROUP_SOCKOPT:
                /* keep the default 0..1 range */
                break;
        case BPF_PROG_TYPE_RAW_TRACEPOINT:
                if (!env->prog->aux->attach_btf_id)
                        return 0;
                range = retval_range(0, 0);
                break;
        case BPF_PROG_TYPE_TRACING:
                switch (env->prog->expected_attach_type) {
                case BPF_TRACE_FENTRY:
                case BPF_TRACE_FEXIT:
                        range = retval_range(0, 0);
                        break;
                case BPF_TRACE_RAW_TP:
                case BPF_MODIFY_RETURN:
                        return 0;
                case BPF_TRACE_ITER:
                        /* keep the default 0..1 range */
                        break;
                default:
                        return -ENOTSUPP;
                }
                break;
        case BPF_PROG_TYPE_SK_LOOKUP:
                range = retval_range(SK_DROP, SK_PASS);
                break;

        case BPF_PROG_TYPE_LSM:
                if (env->prog->expected_attach_type != BPF_LSM_CGROUP) {
                        /* Regular BPF_PROG_TYPE_LSM programs can return
                         * any value.
                         */
                        return 0;
                }
                if (!env->prog->aux->attach_func_proto->type) {
                        /* Make sure programs that attach to void
                         * hooks don't try to modify return value.
                         */
                        range = retval_range(1, 1);
                }
                break;

        case BPF_PROG_TYPE_NETFILTER:
                range = retval_range(NF_DROP, NF_ACCEPT);
                break;
        case BPF_PROG_TYPE_EXT:
                /* freplace program can return anything as its return value
                 * depends on the to-be-replaced kernel func or bpf program.
                 */
        default:
                return 0;
        }

enforce_retval:
        if (reg->type != SCALAR_VALUE) {
                verbose(env, "%s the register R%d is not a known value (%s)\n",
                        exit_ctx, regno, reg_type_str(env, reg->type));
                return -EINVAL;
        }

        err = mark_chain_precision(env, regno);
        if (err)
                return err;

        if (!retval_range_within(range, reg)) {
                verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name);
                /* extra hint for BPF_LSM_CGROUP programs attached to hooks
                 * without a return type
                 */
                if (!is_subprog &&
                    prog->expected_attach_type == BPF_LSM_CGROUP &&
                    prog_type == BPF_PROG_TYPE_LSM &&
                    !prog->aux->attach_func_proto->type)
                        verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
                return -EINVAL;
        }

        /* Only the CGROUP_SKB egress case above sets a range here (2..3); when
         * tnum_in() accepts the register's var_off against it, flag the
         * program via enforce_expected_attach_type.
         */
        if (!tnum_is_unknown(enforce_attach_type_range) &&
            tnum_in(enforce_attach_type_range, reg->var_off))
                env->prog->enforce_expected_attach_type = 1;
        return 0;
}

/* non-recursive DFS pseudo code
 * 1  procedure DFS-iterative(G,v):
 * 2      label v as discovered
 * 3      let S be a stack
 * 4      S.push(v)
 * 5      while S is not empty
 * 6            t <- S.peek()
 * 7            if t is what we're looking for:
 * 8                return t
 * 9            for all edges e in G.adjacentEdges(t) do
 * 10               if edge e is already labelled
 * 11                   continue with the next edge
 * 12               w <- G.adjacentVertex(t,e)
 * 13               if vertex w is not discovered and not explored
 * 14                   label e as tree-edge
 * 15                   label w as discovered
 * 16                   S.push(w)
 * 17                   continue at 5
 * 18               else if vertex w is discovered
 * 19                   label e as back-edge
 * 20               else
 * 21                   // vertex w is explored
 * 22                   label e as forward- or cross-edge
 * 23           label t as explored
 * 24           S.pop()
 *
 * convention:
 * 0x10 - discovered
 * 0x11 - discovered and fall-through edge labelled
 * 0x12 - discovered and fall-through and branch edges labelled
 * 0x20 - explored
 */

/* Values kept in env->cfg.insn_state[] while walking the CFG, plus the edge
 * kinds passed to push_insn().  An edge kind is or'ed into DISCOVERED once
 * that edge of the instruction has been labelled.
 */
enum {
        DISCOVERED = 0x10,      /* insn was pushed on the DFS stack */
        EXPLORED = 0x20,        /* insn was popped: all its edges were handled */
        FALLTHROUGH = 1,        /* edge kind: fall-through edge */
        BRANCH = 2,             /* edge kind: branch edge */
};

/* Flag instruction @idx as a prune point in its per-insn aux data. */
static void mark_prune_point(struct bpf_verifier_env *env, int idx)
{
        (env->insn_aux_data + idx)->prune_point = true;
}

/* Report whether instruction @insn_idx has its prune_point flag set. */
static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
{
        return (env->insn_aux_data + insn_idx)->prune_point;
}

/* Set the force_checkpoint flag in the aux data of instruction @idx. */
static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx)
{
        (env->insn_aux_data + idx)->force_checkpoint = true;
}

/* Report whether instruction @insn_idx has its force_checkpoint flag set. */
static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
{
        return (env->insn_aux_data + insn_idx)->force_checkpoint;
}

/* Set the calls_callback flag in the aux data of instruction @idx. */
static void mark_calls_callback(struct bpf_verifier_env *env, int idx)
{
        (env->insn_aux_data + idx)->calls_callback = true;
}

/* Report whether instruction @insn_idx has its calls_callback flag set. */
static bool calls_callback(struct bpf_verifier_env *env, int insn_idx)
{
        return (env->insn_aux_data + insn_idx)->calls_callback;
}

/* Non-negative results of push_insn() and visit_insn(); negative results are
 * error codes.
 */
enum {
        DONE_EXPLORING = 0,     /* nothing new was pushed on the DFS stack */
        KEEP_EXPLORING = 1,     /* a new insn was pushed and must be visited first */
};

/* Handle one CFG edge; names follow the DFS pseudo-code above:
 * t - index of the instruction the edge leaves
 * w - index of the instruction the edge enters
 * e - edge kind, FALLTHROUGH or BRANCH
 *
 * Returns KEEP_EXPLORING when w was newly discovered and pushed on the stack,
 * DONE_EXPLORING when nothing was pushed, -EINVAL for an out-of-range target
 * or a rejected back-edge, -E2BIG when the stack is full and -EFAULT on an
 * inconsistent insn state.
 */
static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
{
        int *state = env->cfg.insn_state;
        int *stack = env->cfg.insn_stack;
        int w_state;

        /* this edge of t has been labelled before */
        if ((e == FALLTHROUGH || e == BRANCH) && state[t] >= (DISCOVERED | e))
                return DONE_EXPLORING;

        if (w < 0 || w >= env->prog->len) {
                verbose_linfo(env, t, "%d: ", t);
                verbose(env, "jump out of range from insn %d to %d\n", t, w);
                return -EINVAL;
        }

        /* branch targets become prune points and jump points */
        if (e == BRANCH) {
                mark_prune_point(env, w);
                mark_jmp_point(env, w);
        }

        w_state = state[w];

        if (w_state == 0) {
                /* tree-edge: w is seen for the first time */
                state[t] = DISCOVERED | e;
                state[w] = DISCOVERED;
                if (env->cfg.cur_stack >= env->prog->len)
                        return -E2BIG;
                stack[env->cfg.cur_stack++] = w;
                return KEEP_EXPLORING;
        }

        if ((w_state & 0xF0) == DISCOVERED) {
                /* back-edge: tolerated only when bpf_capable is set */
                if (env->bpf_capable)
                        return DONE_EXPLORING;
                verbose_linfo(env, t, "%d: ", t);
                verbose_linfo(env, w, "%d: ", w);
                verbose(env, "back-edge from insn %d to %d\n", t, w);
                return -EINVAL;
        }

        if (w_state != EXPLORED) {
                verbose(env, "insn state internal bug\n");
                return -EFAULT;
        }

        /* forward- or cross-edge */
        state[t] = DISCOVERED | e;
        return DONE_EXPLORING;
}

/* Walk the edges of call-like instruction @t.
 *
 * The fall-through edge goes to the next instruction (two slots ahead for an
 * ldimm64), which is also marked as prune point and jump point.  When
 * @visit_callee is set, @t itself becomes a prune point and the target at
 * t + imm + 1 is walked as a BRANCH edge.
 *
 * Returns whatever push_insn() returned for the last edge handled.
 */
static int visit_func_call_insn(int t, struct bpf_insn *insns,
                                struct bpf_verifier_env *env,
                                bool visit_callee)
{
        int next, err;

        next = t + (bpf_is_ldimm64(&insns[t]) ? 2 : 1);

        err = push_insn(t, next, FALLTHROUGH, env);
        if (err)
                return err;

        mark_prune_point(env, next);
        /* when we exit from subprog, we need to record non-linear history */
        mark_jmp_point(env, next);

        if (!visit_callee)
                return err;

        mark_prune_point(env, t);
        return push_insn(t, t + insns[t].imm + 1, BRANCH, env);
}

/* Visits the instruction at index t and returns one of the following:
 *  < 0 - an error occurred
 *  DONE_EXPLORING - the instruction was fully explored
 *  KEEP_EXPLORING - there is still work to be done before it is fully explored
 *
 * @t:   index of the instruction to visit
 * @env: verifier environment; env->prog->insnsi is the instruction array
 *
 * Besides walking the outgoing edges through push_insn(), this marks prune
 * points, jump points, forced checkpoints and callback-calling instructions
 * in the per-insn aux data.
 */
static int visit_insn(int t, struct bpf_verifier_env *env)
{
        struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
        int ret, off, insn_sz;

        /* pseudo-func insn: both the next insn and the target are walked */
        if (bpf_pseudo_func(insn))
                return visit_func_call_insn(t, insns, env, true);

        /* All non-branch instructions have a single fall-through edge. */
        if (BPF_CLASS(insn->code) != BPF_JMP &&
            BPF_CLASS(insn->code) != BPF_JMP32) {
                insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
                return push_insn(t, t + insn_sz, FALLTHROUGH, env);
        }

        /* From here on the instruction is of class BPF_JMP or BPF_JMP32. */
        switch (BPF_OP(insn->code)) {
        case BPF_EXIT:
                /* no outgoing edges */
                return DONE_EXPLORING;

        case BPF_CALL:
                if (is_async_callback_calling_insn(insn))
                        /* Mark this call insn as a prune point to trigger
                         * is_state_visited() check before call itself is
                         * processed by __check_func_call(). Otherwise new
                         * async state will be pushed for further exploration.
                         */
                        mark_prune_point(env, t);
                /* For functions that invoke callbacks it is not known how many times
                 * callback would be called. Verifier models callback calling functions
                 * by repeatedly visiting callback bodies and returning to origin call
                 * instruction.
                 * In order to stop such iteration verifier needs to identify when a
                 * state identical to some state from a previous iteration is reached.
                 * Check below forces creation of checkpoint before callback calling
                 * instruction to allow search for such identical states.
                 */
                if (is_sync_callback_calling_insn(insn)) {
                        mark_calls_callback(env, t);
                        mark_force_checkpoint(env, t);
                        mark_prune_point(env, t);
                        mark_jmp_point(env, t);
                }
                if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
                        struct bpf_kfunc_call_arg_meta meta;

                        /* A failing lookup is not reported here; the call is
                         * then simply not treated as an iter_next kfunc.
                         */
                        ret = fetch_kfunc_meta(env, insn, &meta, NULL);
                        if (ret == 0 && is_iter_next_kfunc(&meta)) {
                                mark_prune_point(env, t);
                                /* Checking and saving state checkpoints at iter_next() call
                                 * is crucial for fast convergence of open-coded iterator loop
                                 * logic, so we need to force it. If we don't do that,
                                 * is_state_visited() might skip saving a checkpoint, causing
                                 * unnecessarily long sequence of not checkpointed
                                 * instructions and jumps, leading to exhaustion of jump
                                 * history buffer, and potentially other undesired outcomes.
                                 * It is expected that with correct open-coded iterators
                                 * convergence will happen quickly, so we don't run a risk of
                                 * exhausting memory.
                                 */
                                mark_force_checkpoint(env, t);
                        }
                }
                /* the call target is walked only for BPF_PSEUDO_CALL */
                return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);

        case BPF_JA:
                if (BPF_SRC(insn->code) != BPF_K)
                        return -EINVAL;

                /* BPF_JMP class keeps the displacement in off, the other
                 * class reaching this point (BPF_JMP32) keeps it in imm
                 */
                if (BPF_CLASS(insn->code) == BPF_JMP)
                        off = insn->off;
                else
                        off = insn->imm;

                /* unconditional jump with single edge */
                ret = push_insn(t, t + off + 1, FALLTHROUGH, env);
                if (ret)
                        return ret;

                /* reached only when push_insn() returned DONE_EXPLORING */
                mark_prune_point(env, t + off + 1);
                mark_jmp_point(env, t + off + 1);

                return ret;

        default:
                /* conditional jump with two edges */
                mark_prune_point(env, t);
                if (is_may_goto_insn(insn))
                        mark_force_checkpoint(env, t);

                /* the branch edge is walked only after the fall-through edge
                 * no longer pushes anything new
                 */
                ret = push_insn(t, t + 1, FALLTHROUGH, env);
                if (ret)
                        return ret;

                return push_insn(t, t + insn->off + 1, BRANCH, env);
        }
}

/* non-recursive depth-first-search to detect loops in BPF program
 * loop == back-edge in directed graph
 *
 * @env: verifier environment; env->cfg.insn_state and env->cfg.insn_stack are
 *       allocated here (one int per instruction) and freed again before
 *       returning, on success as well as on error.
 *
 * Returns 0 when the CFG is acceptable, -ENOMEM on allocation failure,
 * -EINVAL for an unreachable instruction or a jump into the second half of an
 * ldimm64, -EFAULT on internal inconsistencies, or the error from
 * visit_insn().
 */
static int check_cfg(struct bpf_verifier_env *env)
{
        int insn_cnt = env->prog->len;
        int *insn_stack, *insn_state;
        int ex_insn_beg, i, ret = 0;
        bool ex_done = false;

        insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
        if (!insn_state)
                return -ENOMEM;

        insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
        if (!insn_stack) {
                kvfree(insn_state);
                return -ENOMEM;
        }

        insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
        insn_stack[0] = 0; /* 0 is the first instruction */
        env->cfg.cur_stack = 1;

walk_cfg:
        /* Visit the insn on top of the stack until the stack is empty. */
        while (env->cfg.cur_stack > 0) {
                int t = insn_stack[env->cfg.cur_stack - 1];

                ret = visit_insn(t, env);
                switch (ret) {
                case DONE_EXPLORING:
                        /* all edges of t are handled: pop it */
                        insn_state[t] = EXPLORED;
                        env->cfg.cur_stack--;
                        break;
                case KEEP_EXPLORING:
                        /* a new insn was pushed; t stays on the stack and is
                         * revisited later
                         */
                        break;
                default:
                        /* negative: error from visit_insn(); any other
                         * positive value is unexpected
                         */
                        if (ret > 0) {
                                verbose(env, "visit_insn internal bug\n");
                                ret = -EFAULT;
                        }
                        goto err_free;
                }
        }

        if (env->cfg.cur_stack < 0) {
                verbose(env, "pop stack internal bug\n");
                ret = -EFAULT;
                goto err_free;
        }

        /* Second walk, done at most once: start again from the first insn of
         * the exception callback subprog, if there is one.
         */
        if (env->exception_callback_subprog && !ex_done) {
                ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start;

                insn_state[ex_insn_beg] = DISCOVERED;
                insn_stack[0] = ex_insn_beg;
                env->cfg.cur_stack = 1;
                ex_done = true;
                goto walk_cfg;
        }

        /* Every insn must have been explored, except the second slot of an
         * ldimm64, which must never have been touched.
         */
        for (i = 0; i < insn_cnt; i++) {
                struct bpf_insn *insn = &env->prog->insnsi[i];

                if (insn_state[i] != EXPLORED) {
                        verbose(env, "unreachable insn %d\n", i);
                        ret = -EINVAL;
                        goto err_free;
                }
                if (bpf_is_ldimm64(insn)) {
                        if (insn_state[i + 1] != 0) {
                                verbose(env, "jump into the middle of ldimm64 insn %d\n", i);
                                ret = -EINVAL;
                                goto err_free;
                        }
                        i++; /* skip second half of ldimm64 */
                }
        }
        ret = 0; /* cfg looks good */

err_free:
        /* shared exit path: also taken on success */
        kvfree(insn_state);
        kvfree(insn_stack);
        env->cfg.insn_state = env->cfg.insn_stack = NULL;
        return ret;
}

/* Reject LD_ABS and tail calls in subprograms (index 1 and up); the callers in
 * this file invoke it when no func_info records were supplied.
 *
 * Returns 0 if no subprogram uses either, -EINVAL after logging the first
 * offender otherwise.
 */
static int check_abnormal_return(struct bpf_verifier_env *env)
{
        int i = 1;

        while (i < env->subprog_cnt) {
                if (env->subprog_info[i].has_ld_abs) {
                        verbose(env, "LD_ABS is not allowed in subprogs without BTF\n");
                        return -EINVAL;
                }

                if (env->subprog_info[i].has_tail_call) {
                        verbose(env, "tail_call is not allowed in subprogs without BTF\n");
                        return -EINVAL;
                }

                i++;
        }

        return 0;
}

/* The minimum supported BTF func info size */
#define MIN_BPF_FUNCINFO_SIZE        8
/* Largest func_info_rec_size accepted from the caller; also reused as the
 * upper bound for the line_info and core_relo record sizes.
 */
#define MAX_FUNCINFO_REC_SIZE        252

/* Copy the caller's func_info records into a kernel array and run the checks
 * that need nothing but BTF.
 *
 * @env:   verifier environment
 * @attr:  program load attributes; func_info, func_info_cnt and
 *         func_info_rec_size are read
 * @uattr: pointer to the caller's copy of @attr; func_info_rec_size is written
 *         back through it when a record has a nonzero tail
 *
 * Checked per record: the tail beyond the kernel's record size is zero, the
 * first insn_off is 0 and later ones strictly increase, and type_id names a
 * BTF FUNC whose type is a FUNC_PROTO.
 *
 * On success the array is stored in prog->aux->func_info / func_info_cnt and
 * 0 is returned.  With no records the result of check_abnormal_return()
 * decides between 0 and -EINVAL.  Errors: -EINVAL, -ENOMEM, -EFAULT, or the
 * error from bpf_check_uarg_tail_zero().
 */
static int check_btf_func_early(struct bpf_verifier_env *env,
                                const union bpf_attr *attr,
                                bpfptr_t uattr)
{
        const struct btf_type *func_type, *proto_type;
        u32 kernel_sz = sizeof(struct bpf_func_info);
        u32 idx, cnt, user_sz, copy_sz;
        struct bpf_func_info *recs, *rec;
        struct bpf_prog *prog;
        const struct btf *btf;
        u32 last_off = 0;
        bpfptr_t ucur;
        int ret = -ENOMEM;

        cnt = attr->func_info_cnt;
        if (!cnt)
                return check_abnormal_return(env) ? -EINVAL : 0;

        user_sz = attr->func_info_rec_size;
        if (user_sz < MIN_BPF_FUNCINFO_SIZE ||
            user_sz > MAX_FUNCINFO_REC_SIZE ||
            user_sz % sizeof(u32)) {
                verbose(env, "invalid func info rec size %u\n", user_sz);
                return -EINVAL;
        }

        prog = env->prog;
        btf = prog->aux->btf;

        ucur = make_bpfptr(attr->func_info, uattr.is_kernel);
        /* never copy more than either side's record holds */
        copy_sz = min_t(u32, kernel_sz, user_sz);

        recs = kvcalloc(cnt, kernel_sz, GFP_KERNEL | __GFP_NOWARN);
        if (!recs)
                return -ENOMEM;

        for (idx = 0; idx < cnt; idx++) {
                rec = &recs[idx];

                ret = bpf_check_uarg_tail_zero(ucur, kernel_sz, user_sz);
                if (ret == -E2BIG) {
                        verbose(env, "nonzero tailing record in func info");
                        /* set the size kernel expects so loader can zero
                         * out the rest of the record.
                         */
                        if (copy_to_bpfptr_offset(uattr,
                                                  offsetof(union bpf_attr, func_info_rec_size),
                                                  &copy_sz, sizeof(copy_sz)))
                                ret = -EFAULT;
                }
                if (ret)
                        goto err_free;

                if (copy_from_bpfptr(rec, ucur, copy_sz)) {
                        ret = -EFAULT;
                        goto err_free;
                }

                /* every remaining check of this record fails with -EINVAL */
                ret = -EINVAL;

                /* insn_off: 0 for the first record, strictly increasing after */
                if (idx == 0 && rec->insn_off) {
                        verbose(env,
                                "nonzero insn_off %u for the first func info record",
                                rec->insn_off);
                        goto err_free;
                }
                if (idx != 0 && rec->insn_off <= last_off) {
                        verbose(env,
                                "same or smaller insn offset (%u) than previous func info record (%u)",
                                rec->insn_off, last_off);
                        goto err_free;
                }

                /* type_id must name a FUNC */
                func_type = btf_type_by_id(btf, rec->type_id);
                if (!func_type || !btf_type_is_func(func_type)) {
                        verbose(env, "invalid type id %d in func info",
                                rec->type_id);
                        goto err_free;
                }

                /* btf_func_check() already verified it during BTF load */
                proto_type = btf_type_by_id(btf, func_type->type);
                if (unlikely(!proto_type || !btf_type_is_func_proto(proto_type)))
                        goto err_free;

                last_off = rec->insn_off;
                bpfptr_add(&ucur, user_sz);
        }

        prog->aux->func_info = recs;
        prog->aux->func_info_cnt = cnt;
        return 0;

err_free:
        kvfree(recs);
        return ret;
}

/* Cross-check the func_info records stored in prog->aux->func_info against the
 * verifier's subprog layout and build the per-function aux array.
 *
 * @env:   verifier environment
 * @attr:  program load attributes; only func_info_cnt is read here
 * @uattr: unused; kept so the signature stays unchanged for callers
 *
 * The record count must equal env->subprog_cnt and every record's insn_off
 * must equal the start of the subprog with the same index.  For subprogs other
 * than the first, LD_ABS and tail calls are allowed only when the BTF return
 * type is a small int or an enum.
 *
 * On success prog->aux->func_info_aux is set and 0 is returned.  With no
 * records the result of check_abnormal_return() decides between 0 and
 * -EINVAL.  Errors: -EINVAL on any mismatch, -ENOMEM on allocation failure.
 */
static int check_btf_func(struct bpf_verifier_env *env,
                          const union bpf_attr *attr,
                          bpfptr_t uattr)
{
        const struct btf_type *type, *func_proto, *ret_type;
        u32 i, nfuncs;
        struct bpf_func_info *krecord;
        struct bpf_func_info_aux *info_aux = NULL;
        struct bpf_prog *prog;
        const struct btf *btf;
        bool scalar_return;
        int ret = -ENOMEM;

        nfuncs = attr->func_info_cnt;
        if (!nfuncs) {
                if (check_abnormal_return(env))
                        return -EINVAL;
                return 0;
        }
        if (nfuncs != env->subprog_cnt) {
                verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
                return -EINVAL;
        }

        prog = env->prog;
        btf = prog->aux->btf;

        /* Records come from prog->aux->func_info, not from the caller's
         * buffer, so no user pointer is walked in this function.
         */
        krecord = prog->aux->func_info;
        info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
        if (!info_aux)
                return -ENOMEM;

        for (i = 0; i < nfuncs; i++) {
                /* check insn_off */
                ret = -EINVAL;

                if (env->subprog_info[i].start != krecord[i].insn_off) {
                        verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
                        goto err_free;
                }

                /* Already checked type_id */
                type = btf_type_by_id(btf, krecord[i].type_id);
                /* linkage is taken from the vlen bits of the type's info word */
                info_aux[i].linkage = BTF_INFO_VLEN(type->info);
                /* Already checked func_proto */
                func_proto = btf_type_by_id(btf, type->type);

                ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
                scalar_return =
                        btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type);
                /* index 0 is exempt from both restrictions below */
                if (i && !scalar_return && env->subprog_info[i].has_ld_abs) {
                        verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n");
                        goto err_free;
                }
                if (i && !scalar_return && env->subprog_info[i].has_tail_call) {
                        verbose(env, "tail_call is only allowed in functions that return 'int'.\n");
                        goto err_free;
                }
        }

        prog->aux->func_info_aux = info_aux;
        return 0;

err_free:
        kfree(info_aux);
        return ret;
}

/* Rewrite func_info[i].insn_off from the current start of subprog i.
 *
 * Does nothing when the program carries no func_info.  Hidden subprogs are
 * left out because they have no func_info entry.
 */
static void adjust_btf_func(struct bpf_verifier_env *env)
{
        struct bpf_prog_aux *aux = env->prog->aux;
        int i = 0;

        if (aux->func_info) {
                /* func_info is not available for hidden subprogs */
                while (i < env->subprog_cnt - env->hidden_subprog_cnt) {
                        aux->func_info[i].insn_off = env->subprog_info[i].start;
                        i++;
                }
        }
}

/* Bounds for line_info_rec_size: a record must reach at least to the end of
 * the line_col member of struct bpf_line_info; the upper bound is shared with
 * func_info.
 */
#define MIN_BPF_LINEINFO_SIZE        offsetofend(struct bpf_line_info, line_col)
#define MAX_LINEINFO_REC_SIZE        MAX_FUNCINFO_REC_SIZE

/* Copy and validate the BTF line_info records supplied in @attr.
 *
 * @env:   verifier environment
 * @attr:  program load attributes; line_info, line_info_cnt and
 *         line_info_rec_size are read
 * @uattr: pointer to the caller's copy of @attr; line_info_rec_size is written
 *         back through it when a record has a nonzero tail
 *
 * On success the validated array is stored in prog->aux->linfo / nr_linfo and
 * each subprog's linfo_idx is set to the index of the record whose insn_off
 * equals the subprog's start.
 *
 * Returns 0 on success (also when no line info was supplied), -EINVAL on
 * malformed input, -ENOMEM on allocation failure, -EFAULT on copy failure, or
 * the error from bpf_check_uarg_tail_zero().
 */
static int check_btf_line(struct bpf_verifier_env *env,
                          const union bpf_attr *attr,
                          bpfptr_t uattr)
{
        u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
        struct bpf_subprog_info *sub;
        struct bpf_line_info *linfo;
        struct bpf_prog *prog;
        const struct btf *btf;
        bpfptr_t ulinfo;
        int err;

        nr_linfo = attr->line_info_cnt;
        if (!nr_linfo)
                return 0;
        /* reject record counts above INT_MAX / sizeof(struct bpf_line_info) */
        if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info))
                return -EINVAL;

        /* record size: within [MIN, MAX] and a multiple of sizeof(u32) */
        rec_size = attr->line_info_rec_size;
        if (rec_size < MIN_BPF_LINEINFO_SIZE ||
            rec_size > MAX_LINEINFO_REC_SIZE ||
            rec_size & (sizeof(u32) - 1))
                return -EINVAL;

        /* Need to zero it in case the userspace may
         * pass in a smaller bpf_line_info object.
         */
        linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info),
                         GFP_KERNEL | __GFP_NOWARN);
        if (!linfo)
                return -ENOMEM;

        prog = env->prog;
        btf = prog->aux->btf;

        /* s: index of the next subprog still waiting for its first record */
        s = 0;
        sub = env->subprog_info;
        ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel);
        expected_size = sizeof(struct bpf_line_info);
        /* never copy more than either side's record holds */
        ncopy = min_t(u32, expected_size, rec_size);
        for (i = 0; i < nr_linfo; i++) {
                err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size);
                if (err) {
                        if (err == -E2BIG) {
                                verbose(env, "nonzero tailing record in line_info");
                                /* report the record size the kernel expects */
                                if (copy_to_bpfptr_offset(uattr,
                                                          offsetof(union bpf_attr, line_info_rec_size),
                                                          &expected_size, sizeof(expected_size)))
                                        err = -EFAULT;
                        }
                        goto err_free;
                }

                if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) {
                        err = -EFAULT;
                        goto err_free;
                }

                /*
                 * Check insn_off to ensure
                 * 1) strictly increasing AND
                 * 2) bounded by prog->len
                 *
                 * The linfo[0].insn_off == 0 check logically falls into
                 * the later "missing bpf_line_info for func..." case
                 * because the first linfo[0].insn_off must be the
                 * first sub also and the first sub must have
                 * subprog_info[0].start == 0.
                 */
                if ((i && linfo[i].insn_off <= prev_offset) ||
                    linfo[i].insn_off >= prog->len) {
                        verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n",
                                i, linfo[i].insn_off, prev_offset,
                                prog->len);
                        err = -EINVAL;
                        goto err_free;
                }

                /* the instruction a record points at must have a nonzero
                 * opcode
                 */
                if (!prog->insnsi[linfo[i].insn_off].code) {
                        verbose(env,
                                "Invalid insn code at line_info[%u].insn_off\n",
                                i);
                        err = -EINVAL;
                        goto err_free;
                }

                /* both string offsets must resolve through
                 * btf_name_by_offset()
                 */
                if (!btf_name_by_offset(btf, linfo[i].line_off) ||
                    !btf_name_by_offset(btf, linfo[i].file_name_off)) {
                        verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);
                        err = -EINVAL;
                        goto err_free;
                }

                /* Match records to subprog starts in order: a record beyond
                 * the start of subprog s means that subprog has no record at
                 * its first instruction.
                 */
                if (s != env->subprog_cnt) {
                        if (linfo[i].insn_off == sub[s].start) {
                                sub[s].linfo_idx = i;
                                s++;
                        } else if (sub[s].start < linfo[i].insn_off) {
                                verbose(env, "missing bpf_line_info for func#%u\n", s);
                                err = -EINVAL;
                                goto err_free;
                        }
                }

                prev_offset = linfo[i].insn_off;
                bpfptr_add(&ulinfo, rec_size);
        }

        /* records ran out before every subprog was matched */
        if (s != env->subprog_cnt) {
                verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n",
                        env->subprog_cnt - s, s);
                err = -EINVAL;
                goto err_free;
        }

        prog->aux->linfo = linfo;
        prog->aux->nr_linfo = nr_linfo;

        return 0;

err_free:
        kvfree(linfo);
        return err;
}

/* Bounds on attr->core_relo_rec_size accepted by check_core_relo(): a record
 * must be at least sizeof(struct bpf_core_relo) bytes and at most
 * MAX_FUNCINFO_REC_SIZE bytes.
 */
#define MIN_CORE_RELO_SIZE        sizeof(struct bpf_core_relo)
#define MAX_CORE_RELO_SIZE        MAX_FUNCINFO_REC_SIZE

/* Validate and apply the CO-RE relocation records described by @attr.
 *
 * attr->core_relos points at attr->core_relo_cnt records, each
 * attr->core_relo_rec_size bytes long. Records are copied in and passed to
 * bpf_core_apply() one at a time; bytes of a record that lie beyond
 * sizeof(struct bpf_core_relo) must be zero.
 *
 * When a record has a non-zero tail (-E2BIG), the record size this kernel
 * expects is written back to core_relo_rec_size in @uattr.
 *
 * Returns 0 when there are no records or all of them were applied, -EINVAL
 * for a bad record count, record size or insn_off, -EFAULT when a copy
 * fails, or the error returned by bpf_check_uarg_tail_zero() or
 * bpf_core_apply().
 */
static int check_core_relo(struct bpf_verifier_env *env,
                           const union bpf_attr *attr,
                           bpfptr_t uattr)
{
        u32 i, nr_core_relo, ncopy, expected_size, rec_size;
        struct bpf_core_relo core_relo = {};
        struct bpf_prog *prog = env->prog;
        const struct btf *btf = prog->aux->btf;
        struct bpf_core_ctx ctx = {
                .log = &env->log,
                .btf = btf,
        };
        bpfptr_t u_core_relo;
        int err;

        nr_core_relo = attr->core_relo_cnt;
        if (!nr_core_relo)
                return 0;
        if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo))
                return -EINVAL;

        rec_size = attr->core_relo_rec_size;
        if (rec_size < MIN_CORE_RELO_SIZE ||
            rec_size > MAX_CORE_RELO_SIZE ||
            rec_size % sizeof(u32))
                return -EINVAL;

        u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel);
        expected_size = sizeof(struct bpf_core_relo);
        /* never copy more than the kernel's view of the record */
        ncopy = min_t(u32, expected_size, rec_size);

        /* Unlike func_info and line_info, copy and apply each CO-RE
         * relocation record one at a time.
         */
        for (i = 0; i < nr_core_relo; i++) {
                /* future proofing when sizeof(bpf_core_relo) changes */
                err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size);
                if (err) {
                        if (err == -E2BIG) {
                                /* terminate the line so that later log
                                 * output is not appended to this message
                                 */
                                verbose(env, "nonzero tailing record in core_relo\n");
                                if (copy_to_bpfptr_offset(uattr,
                                                          offsetof(union bpf_attr, core_relo_rec_size),
                                                          &expected_size, sizeof(expected_size)))
                                        err = -EFAULT;
                        }
                        break;
                }

                if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) {
                        err = -EFAULT;
                        break;
                }

                /* insn_off is a byte offset: it must be a multiple of 8 and
                 * insn_off / 8 must index an instruction inside the program
                 */
                if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) {
                        verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n",
                                i, core_relo.insn_off, prog->len);
                        err = -EINVAL;
                        break;
                }

                err = bpf_core_apply(&ctx, &core_relo, i,
                                     &prog->insnsi[core_relo.insn_off / 8]);
                if (err)
                        break;
                bpfptr_add(&u_core_relo, rec_size);
        }
        return err;
}

/* Early BTF setup for a program being loaded.
 *
 * With neither func_info nor line_info supplied there is nothing to attach;
 * only check_abnormal_return() is consulted. Otherwise the BTF object named
 * by attr->prog_btf_fd is looked up, kernel BTF is refused with -EACCES
 * (dropping the reference just taken), the BTF is stored in
 * env->prog->aux->btf and check_btf_func_early() is run.
 *
 * Returns 0 on success or a negative error code.
 */
static int check_btf_info_early(struct bpf_verifier_env *env,
                                const union bpf_attr *attr,
                                bpfptr_t uattr)
{
        struct btf *prog_btf;

        if (!attr->func_info_cnt && !attr->line_info_cnt)
                return check_abnormal_return(env) ? -EINVAL : 0;

        prog_btf = btf_get_by_fd(attr->prog_btf_fd);
        if (IS_ERR(prog_btf))
                return PTR_ERR(prog_btf);

        if (btf_is_kernel(prog_btf)) {
                btf_put(prog_btf);
                return -EACCES;
        }

        env->prog->aux->btf = prog_btf;

        return check_btf_func_early(env, attr, uattr);
}

/* Run the BTF-related checks in order: func_info, then line_info, then CO-RE
 * relocations, stopping at the first failure.
 *
 * When neither func_info nor line_info was supplied, none of them run and
 * only check_abnormal_return() is consulted.
 *
 * Returns 0 on success or the first negative error code encountered.
 */
static int check_btf_info(struct bpf_verifier_env *env,
                          const union bpf_attr *attr,
                          bpfptr_t uattr)
{
        int ret;

        if (!attr->func_info_cnt && !attr->line_info_cnt)
                return check_abnormal_return(env) ? -EINVAL : 0;

        ret = check_btf_func(env, attr, uattr);
        if (!ret)
                ret = check_btf_line(env, attr, uattr);
        if (!ret)
                ret = check_core_relo(env, attr, uattr);

        return ret;
}

/* Return true when every bound tracked for %cur lies inside the matching
 * bound of %old: 64-bit unsigned and signed, then 32-bit unsigned and signed.
 */
static bool range_within(const struct bpf_reg_state *old,
                         const struct bpf_reg_state *cur)
{
        /* 64-bit unsigned range */
        if (cur->umin_value < old->umin_value ||
            cur->umax_value > old->umax_value)
                return false;

        /* 64-bit signed range */
        if (cur->smin_value < old->smin_value ||
            cur->smax_value > old->smax_value)
                return false;

        /* 32-bit unsigned range */
        if (cur->u32_min_value < old->u32_min_value ||
            cur->u32_max_value > old->u32_max_value)
                return false;

        /* 32-bit signed range */
        if (cur->s32_min_value < old->s32_min_value ||
            cur->s32_max_value > old->s32_max_value)
                return false;

        return true;
}

/* Registers that shared an id in the old state must share an id in the new
 * state too, although the numeric value of that id may differ between the two
 * states. The idmap records which new id each old id has been paired with:
 * once old id 5 has been seen together with new id 9, every later register
 * with old id 5 must carry new id 9 for the new state to be safe.
 *
 * The map is scanned from the start. An empty slot means this old id has not
 * been seen yet, so the pair is recorded there. A slot holding the same old
 * id must hold the same new id. A slot holding the same new id under a
 * different old id is a mismatch.
 */
static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
{
        unsigned int idx;

        /* ids must be both zero (trivially fine) or both non-zero */
        if (!old_id || !cur_id)
                return !old_id && !cur_id;

        for (idx = 0; idx < BPF_ID_MAP_SIZE; idx++) {
                struct bpf_id_pair *slot = &idmap->map[idx];

                if (!slot->old) {
                        /* first empty slot: remember this new pairing */
                        slot->old = old_id;
                        slot->cur = cur_id;
                        return true;
                }
                if (slot->old == old_id)
                        return slot->cur == cur_id;
                if (slot->cur == cur_id)
                        return false;
        }

        /* Running out of idmap slots should be impossible */
        WARN_ON_ONCE(1);
        return false;
}

/* Variant of check_ids() for scalars: an id of zero on either side is first
 * replaced by a fresh temporary id taken from idmap->tmp_id_gen, so that
 * pairs such as '0 vs unique ID' and 'unique ID vs 0' are accepted.
 */
static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
{
        if (!old_id)
                old_id = ++idmap->tmp_id_gen;
        if (!cur_id)
                cur_id = ++idmap->tmp_id_gen;

        return check_ids(old_id, cur_id, idmap);
}

static void clean_func_state(struct bpf_verifier_env *env,
                             struct bpf_func_state *st)
{
        enum bpf_reg_liveness live;
        int i, j;

        for (i = 0; i < BPF_REG_FP; i++) {
                live = st->regs[i].live;
                /* liveness must not touch this register anymore */
                st->regs[i].live |= REG_LIVE_DONE;
                if (!(live & REG_LIVE_READ))
                        /* since the register is unused, clear its state
                         * to make further comparison simpler
                         */
                        __mark_reg_not_init(env, &st->regs[i]);
        }

        for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
                live = st->stack[i].spilled_ptr.live;
                /* liveness must not touch this stack slot anymore */
                st->stack[i].spilled_ptr.live |= REG_LIVE_DONE;
                if (!(live & REG_LIVE_READ)) {
                        __mark_reg_not_init(env, &st->stack[i].spilled_ptr);
                        for (j = 0; j < BPF_REG_SIZE; j++)
                                st->stack[i].slot_type[j] = STACK_INVALID;
                }
        }
}

/* Apply clean_func_state() to every frame of @st, from frame 0 up to
 * st->curframe. A REG_LIVE_DONE mark on r0 of frame 0 is taken to mean the
 * whole state was already processed, in which case nothing is done.
 */
static void clean_verifier_state(struct bpf_verifier_env *env,
                                 struct bpf_verifier_state *st)
{
        int fr;

        if (!(st->frame[0]->regs[0].live & REG_LIVE_DONE)) {
                for (fr = 0; fr <= st->curframe; fr++)
                        clean_func_state(env, st->frame[fr]);
        }
}

/* Finalize liveness of already-explored states recorded at @insn.
 *
 * Verifier states are linked into a tree through their parentage chains.
 * They are stored in per-instruction state lists and also pushed onto the
 * state stack for later exploration. When the verifier reaches a bpf_exit
 * insn, some of the stored states already have their final liveness marks,
 * but many will still be revised while other branches are explored.
 * Example:
 *
 *   1: r0 = 1
 *   2: if r1 == 100 goto pc+1
 *   3: r0 = 2
 *   4: exit
 *
 * On first reaching the exit insn, r0 in the state stored for insn 2 is
 * seen as !REG_LIVE_READ. The verifier then pops the other branch of insn 2
 * and keeps exploring; at insn 4 it walks the parentage chain from insn 4
 * back to insn 2 and marks r0 as REG_LIVE_READ.
 *
 * Branch states are pushed as they are encountered, so walking a branch
 * instruction for the second time means that all states below that branch
 * have been explored and their final liveness marks have been propagated.
 * Therefore, once is_state_visited() has finished searching the state list,
 * this function can mark those states REG_LIVE_DONE to indicate that the
 * 'parent' pointers of 'struct bpf_reg_state' will not be used any more.
 * Registers and stack slots that were not read are cleared as well, which
 * simplifies state merging.
 *
 * Important: walking the same branch instruction inside a callee does not
 * by itself mean the states are done; the callsites have to be compared too.
 * States whose 'branches' counter is non-zero, whose insn_idx differs from
 * @insn, or whose callsites differ from @cur are left untouched.
 */
static void clean_live_states(struct bpf_verifier_env *env, int insn,
                              struct bpf_verifier_state *cur)
{
        struct bpf_verifier_state_list *entry;

        for (entry = *explored_state(env, insn); entry; entry = entry->next) {
                if (entry->state.branches)
                        continue;
                if (entry->state.insn_idx != insn)
                        continue;
                if (!same_callsites(&entry->state, cur))
                        continue;
                clean_verifier_state(env, &entry->state);
        }
}

/* Strict register comparison: all bytes of struct bpf_reg_state that precede
 * the 'id' member must be identical, and both 'id' and 'ref_obj_id' must map
 * consistently through @idmap.
 */
static bool regs_exact(const struct bpf_reg_state *rold,
                       const struct bpf_reg_state *rcur,
                       struct bpf_idmap *idmap)
{
        if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) != 0)
                return false;
        if (!check_ids(rold->id, rcur->id, idmap))
                return false;
        return check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
}

/* Strictness of the old-vs-current comparison done by regsafe() and
 * stacksafe().
 */
enum exact_level {
        /* regsafe() accepts, without comparing, registers the old state did
         * not read and scalars that are not precise; stacksafe() skips stack
         * slots the old state did not read
         */
        NOT_EXACT,
        /* regsafe() defers entirely to regs_exact(); stacksafe() requires
         * identical slot types
         */
        EXACT,
        /* the NOT_EXACT shortcuts are disabled, so scalars are always
         * compared via range_within()/tnum_in(); stacksafe() requires
         * identical slot types
         */
        RANGE_WITHIN
};

/* Returns true if (rold safe implies rcur safe) */
static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
                    struct bpf_reg_state *rcur, struct bpf_idmap *idmap,
                    enum exact_level exact)
{
        if (exact == EXACT)
                return regs_exact(rold, rcur, idmap);

        if (!(rold->live & REG_LIVE_READ) && exact == NOT_EXACT)
                /* explored state didn't use this */
                return true;
        if (rold->type == NOT_INIT) {
                if (exact == NOT_EXACT || rcur->type == NOT_INIT)
                        /* explored state can't have used this */
                        return true;
        }

        /* Enforce that register types have to match exactly, including their
         * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
         * rule.
         *
         * One can make a point that using a pointer register as unbounded
         * SCALAR would be technically acceptable, but this could lead to
         * pointer leaks because scalars are allowed to leak while pointers
         * are not. We could make this safe in special cases if root is
         * calling us, but it's probably not worth the hassle.
         *
         * Also, register types that are *not* MAYBE_NULL could technically be
         * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE
         * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point
         * to the same map).
         * However, if the old MAYBE_NULL register then got NULL checked,
         * doing so could have affected others with the same id, and we can't
         * check for that because we lost the id when we converted to
         * a non-MAYBE_NULL variant.
         * So, as a general rule we don't allow mixing MAYBE_NULL and
         * non-MAYBE_NULL registers as well.
         */
        if (rold->type != rcur->type)
                return false;

        switch (base_type(rold->type)) {
        case SCALAR_VALUE:
                if (env->explore_alu_limits) {
                        /* explore_alu_limits disables tnum_in() and range_within()
                         * logic and requires everything to be strict
                         */
                        return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
                               check_scalar_ids(rold->id, rcur->id, idmap);
                }
                if (!rold->precise && exact == NOT_EXACT)
                        return true;
                /* Why check_ids() for scalar registers?
                 *
                 * Consider the following BPF code:
                 *   1: r6 = ... unbound scalar, ID=a ...
                 *   2: r7 = ... unbound scalar, ID=b ...
                 *   3: if (r6 > r7) goto +1
                 *   4: r6 = r7
                 *   5: if (r6 > X) goto ...
                 *   6: ... memory operation using r7 ...
                 *
                 * First verification path is [1-6]:
                 * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7;
                 * - at (5) r6 would be marked <= X, find_equal_scalars() would also mark
                 *   r7 <= X, because r6 and r7 share same id.
                 * Next verification path is [1-4, 6].
                 *
                 * Instruction (6) would be reached in two states:
                 *   I.  r6{.id=b}, r7{.id=b} via path 1-6;
                 *   II. r6{.id=a}, r7{.id=b} via path 1-4, 6.
                 *
                 * Use check_ids() to distinguish these states.
                 * ---
                 * Also verify that new value satisfies old value range knowledge.
                 */
                return range_within(rold, rcur) &&
                       tnum_in(rold->var_off, rcur->var_off) &&
                       check_scalar_ids(rold->id, rcur->id, idmap);
        case PTR_TO_MAP_KEY:
        case PTR_TO_MAP_VALUE:
        case PTR_TO_MEM:
        case PTR_TO_BUF:
        case PTR_TO_TP_BUFFER:
                /* If the new min/max/var_off satisfy the old ones and
                 * everything else matches, we are OK.
                 */
                return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
                       range_within(rold, rcur) &&
                       tnum_in(rold->var_off, rcur->var_off) &&
                       check_ids(rold->id, rcur->id, idmap) &&
                       check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
        case PTR_TO_PACKET_META:
        case PTR_TO_PACKET:
                /* We must have at least as much range as the old ptr
                 * did, so that any accesses which were safe before are
                 * still safe.  This is true even if old range < old off,
                 * since someone could have accessed through (ptr - k), or
                 * even done ptr -= k in a register, to get a safe access.
                 */
                if (rold->range > rcur->range)
                        return false;
                /* If the offsets don't match, we can't trust our alignment;
                 * nor can we be sure that we won't fall out of range.
                 */
                if (rold->off != rcur->off)
                        return false;
                /* id relations must be preserved */
                if (!check_ids(rold->id, rcur->id, idmap))
                        return false;
                /* new val must satisfy old val knowledge */
                return range_within(rold, rcur) &&
                       tnum_in(rold->var_off, rcur->var_off);
        case PTR_TO_STACK:
                /* two stack pointers are equal only if they're pointing to
                 * the same stack frame, since fp-8 in foo != fp-8 in bar
                 */
                return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
        case PTR_TO_ARENA:
                return true;
        default:
                return regs_exact(rold, rcur, idmap);
        }
}

/* Stand-in register handed out by scalar_reg_for_stack() for a stack slot
 * that is_stack_all_misc() accepts, so that such a slot can be passed to
 * regsafe() like a spilled scalar.
 */
static struct bpf_reg_state unbound_reg;

/* One-time setup of unbound_reg, registered as a late initcall: initialize it
 * with __mark_reg_unknown_imprecise() and set REG_LIVE_READ on it.
 * NOTE(review): the READ mark presumably keeps regsafe() from skipping the
 * register as "not read" when it stands in for the old state - confirm.
 * Always returns 0.
 */
static __init int unbound_reg_init(void)
{
        __mark_reg_unknown_imprecise(&unbound_reg);
        unbound_reg.live |= REG_LIVE_READ;
        return 0;
}
late_initcall(unbound_reg_init);

/* Return true when every byte of the stack slot is STACK_MISC, where
 * STACK_INVALID bytes also count if env->allow_uninit_stack is set.
 */
static bool is_stack_all_misc(struct bpf_verifier_env *env,
                              struct bpf_stack_state *stack)
{
        u32 byte;

        for (byte = 0; byte < ARRAY_SIZE(stack->slot_type); ++byte) {
                bool is_misc = stack->slot_type[byte] == STACK_MISC;
                bool uninit_ok = stack->slot_type[byte] == STACK_INVALID &&
                                 env->allow_uninit_stack;

                if (!is_misc && !uninit_ok)
                        return false;
        }

        return true;
}

/* Pick a register that represents the stack slot as a scalar: the spilled
 * register itself when is_spilled_scalar_reg64() accepts the slot, the shared
 * unbound_reg when is_stack_all_misc() accepts it, and NULL otherwise.
 */
static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env,
                                                  struct bpf_stack_state *stack)
{
        if (is_spilled_scalar_reg64(stack))
                return &stack->spilled_ptr;

        return is_stack_all_misc(env, stack) ? &unbound_reg : NULL;
}

/* Compare the stack of the explored frame @old against the current frame
 * @cur, byte by byte over the first old->allocated_stack bytes.
 *
 * With exact == NOT_EXACT, slots the old state never read are skipped and
 * @cur only has to be at least as safe as @old. With any other level the
 * slot types of both stacks must be identical for every byte that @old has
 * allocated.
 *
 * @idmap carries the old-id -> current-id pairing shared with the register
 * comparison.
 *
 * Returns true when @cur's stack is covered by @old's, false otherwise.
 */
static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
                      struct bpf_func_state *cur, struct bpf_idmap *idmap,
                      enum exact_level exact)
{
        int i, spi;

        /* walk slots of the explored stack and ignore any additional
         * slots in the current stack, since explored(safe) state
         * didn't use them
         */
        for (i = 0; i < old->allocated_stack; i++) {
                struct bpf_reg_state *old_reg, *cur_reg;

                spi = i / BPF_REG_SIZE;

                /* In the exact modes the slot types have to be identical.
                 * The current stack may be smaller than the explored one,
                 * so check the bound before indexing cur->stack[]: a slot
                 * that the current state has not allocated cannot match.
                 */
                if (exact != NOT_EXACT &&
                    (i >= cur->allocated_stack ||
                     old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
                     cur->stack[spi].slot_type[i % BPF_REG_SIZE]))
                        return false;

                if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)
                    && exact == NOT_EXACT) {
                        i += BPF_REG_SIZE - 1;
                        /* explored state didn't use this */
                        continue;
                }

                if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
                        continue;

                if (env->allow_uninit_stack &&
                    old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
                        continue;

                /* explored stack has more populated slots than current stack
                 * and these slots were used
                 */
                if (i >= cur->allocated_stack)
                        return false;

                /* 64-bit scalar spill vs all slots MISC and vice versa.
                 * Load from all slots MISC produces unbound scalar.
                 * Construct a fake register for such stack and call
                 * regsafe() to ensure scalar ids are compared.
                 */
                old_reg = scalar_reg_for_stack(env, &old->stack[spi]);
                cur_reg = scalar_reg_for_stack(env, &cur->stack[spi]);
                if (old_reg && cur_reg) {
                        if (!regsafe(env, old_reg, cur_reg, idmap, exact))
                                return false;
                        /* whole slot handled, move on to the next one */
                        i += BPF_REG_SIZE - 1;
                        continue;
                }

                /* if old state was safe with misc data in the stack
                 * it will be safe with zero-initialized stack.
                 * The opposite is not true
                 */
                if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC &&
                    cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO)
                        continue;
                if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
                    cur->stack[spi].slot_type[i % BPF_REG_SIZE])
                        /* Ex: old explored (safe) state has STACK_SPILL in
                         * this stack slot, but current has STACK_MISC ->
                         * this verifier states are not equivalent,
                         * return false to continue verification of this path
                         */
                        return false;
                /* the per-slot checks below run once, on the slot's last byte */
                if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
                        continue;
                /* Both old and cur are having same slot_type */
                switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) {
                case STACK_SPILL:
                        /* when explored and current stack slot are both storing
                         * spilled registers, check that stored pointers types
                         * are the same as well.
                         * Ex: explored safe path could have stored
                         * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
                         * but current path has stored:
                         * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
                         * such verifier states are not equivalent.
                         * return false to continue verification of this path
                         */
                        if (!regsafe(env, &old->stack[spi].spilled_ptr,
                                     &cur->stack[spi].spilled_ptr, idmap, exact))
                                return false;
                        break;
                case STACK_DYNPTR:
                        old_reg = &old->stack[spi].spilled_ptr;
                        cur_reg = &cur->stack[spi].spilled_ptr;
                        if (old_reg->dynptr.type != cur_reg->dynptr.type ||
                            old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
                            !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
                                return false;
                        break;
                case STACK_ITER:
                        old_reg = &old->stack[spi].spilled_ptr;
                        cur_reg = &cur->stack[spi].spilled_ptr;
                        /* iter.depth is not compared between states as it
                         * doesn't matter for correctness and would otherwise
                         * prevent convergence; we maintain it only to prevent
                         * infinite loop check triggering, see
                         * iter_active_depths_differ()
                         */
                        if (old_reg->iter.btf != cur_reg->iter.btf ||
                            old_reg->iter.btf_id != cur_reg->iter.btf_id ||
                            old_reg->iter.state != cur_reg->iter.state ||
                            /* ignore {old_reg,cur_reg}->iter.depth, see above */
                            !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
                                return false;
                        break;
                case STACK_MISC:
                case STACK_ZERO:
                case STACK_INVALID:
                        continue;
                /* Ensure that new unhandled slot types return false by default */
                default:
                        return false;
                }
        }
        return true;
}

/* Compare acquired references of two frames: both must hold the same number
 * of references, and the ids at each index must map consistently through
 * @idmap.
 */
static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
                    struct bpf_idmap *idmap)
{
        int idx;

        if (old->acquired_refs != cur->acquired_refs)
                return false;

        for (idx = 0; idx < old->acquired_refs; idx++) {
                bool ids_match = check_ids(old->refs[idx].id,
                                           cur->refs[idx].id, idmap);

                if (!ids_match)
                        return false;
        }

        return true;
}

/* Compare one frame of two verifier states.
 *
 * Every state kept in state_list is known to be valid, because the verifier
 * reached a 'bpf_exit' instruction through it.
 *
 * This is called while the verifier explores a branch of execution popped
 * from the state stack. If an old state with a more strict register state
 * and a more strict stack state is found, the current branch needs no
 * further exploration: the verifier already concluded that the more strict
 * state leads to a valid finish.
 *
 * Two states are therefore treated as equivalent when the explored register
 * state and the explored stack state are more conservative than the current
 * ones. Example:
 *       explored                   current
 * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
 * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
 *
 * Put differently: when the current stack (the one being explored) has more
 * valid slots than the old one that already passed validation, exploration
 * can stop and the current state is valid too.
 *
 * The same goes for registers: if a register's type is invalid in the
 * explored state but meaningful in the current state, the current state
 * will reach 'bpf_exit' safely.
 */
static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
                              struct bpf_func_state *cur, enum exact_level exact)
{
        struct bpf_idmap *idmap = &env->idmap_scratch;
        int regno;

        if (cur->callback_depth < old->callback_depth)
                return false;

        for (regno = 0; regno < MAX_BPF_REG; regno++) {
                if (!regsafe(env, &old->regs[regno], &cur->regs[regno],
                             idmap, exact))
                        return false;
        }

        return stacksafe(env, old, cur, idmap, exact) &&
               refsafe(old, cur, idmap);
}

static void reset_idmap_scratch(struct bpf_verifier_env *env)
{
        env->idmap_scratch.tmp_id_gen = env->id_gen;
        memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
}

static bool states_equal(struct bpf_verifier_env *env,
                         struct bpf_verifier_state *old,
                         struct bpf_verifier_state *cur,
                         enum exact_level exact)
{
        int i;

        if (old->curframe != cur->curframe)
                return false;

        reset_idmap_scratch(env);

        /* Verification state from speculative execution simulation
         * must never prune a non-speculative execution one.
         */
        if (old->speculative && !cur->speculative)
                return false;

        if (old->active_lock.ptr != cur->active_lock.ptr)
                return false;

        /* Old and cur active_lock's have to be either both present
         * or both absent.
         */
        if (!!old->active_lock.id != !!cur->active_lock.id)
                return false;

        if (old->active_lock.id &&
            !check_ids(old->active_lock.id, cur->active_lock.id, &env->idmap_scratch))
                return false;

        if (old->active_rcu_lock != cur->active_rcu_lock)
                return false;

        if (old->active_preempt_lock != cur->active_preempt_lock)
                return false;

        if (old->in_sleepable != cur->in_sleepable)
                return false;

        /* for states to be equal callsites have to be the same
         * and all frame states need to be equivalent
         */
        for (i = 0; i <= old->curframe; i++) {
                if (old->frame[i]->callsite != cur->frame[i]->callsite)
                        return false;
                if (!func_states_equal(env, old->frame[i], cur->frame[i], exact))
                        return false;
        }
        return true;
}

/* Propagate the read mark of @reg to @parent_reg.
 *
 * Returns 0 if nothing was propagated, a negative error code if
 * mark_reg_read() failed, and otherwise the read flag that was propagated.
 */
static int propagate_liveness_reg(struct bpf_verifier_env *env,
                                  struct bpf_reg_state *reg,
                                  struct bpf_reg_state *parent_reg)
{
        u8 parent_read = parent_reg->live & REG_LIVE_READ;
        u8 child_read = reg->live & REG_LIVE_READ;
        int ret;

        /* Each read flag is one of REG_LIVE_READ64, REG_LIVE_READ32 or
         * REG_LIVE_NONE at this point.
         */

        /* the parent already carries the strongest mark */
        if (parent_read == REG_LIVE_READ64)
                return 0;
        /* the child has no read mark to pass on */
        if (!child_read)
                return 0;
        /* the parent already has the same mark as the child */
        if (parent_read == child_read)
                return 0;

        ret = mark_reg_read(env, reg, parent_reg, child_read);

        return ret ? ret : child_read;
}

/* Propagate read marks from @vstate into @vparent, frame by frame.
 *
 * A write screens off any subsequent reads, but write marks describe only
 * the straight-line code between a state and its parent. An equivalent state
 * (a jump target or similar) was not reached through that straight-line
 * code, so its read marks have to reach the parent regardless of the state's
 * own write marks. That is the purpose of the 'parent == state->parent'
 * comparison in mark_reg_read().
 *
 * Returns 0 on success, -EFAULT when the two states differ in frame depth,
 * or the negative error code from propagate_liveness_reg().
 */
static int propagate_liveness(struct bpf_verifier_env *env,
                              const struct bpf_verifier_state *vstate,
                              struct bpf_verifier_state *vparent)
{
        struct bpf_func_state *child, *par;
        int slot, regno, frame, ret = 0;

        if (vparent->curframe != vstate->curframe) {
                WARN(1, "propagate_live: parent frame %d current frame %d\n",
                     vparent->curframe, vstate->curframe);
                return -EFAULT;
        }

        /* The register loop below stops before BPF_REG_FP */
        BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
        for (frame = 0; frame <= vstate->curframe; frame++) {
                par = vparent->frame[frame];
                child = vstate->frame[frame];

                /* Registers: frames below the current one start at
                 * BPF_REG_6, the current frame starts at r0. FP liveness
                 * needs no handling because FP is read-only.
                 */
                regno = frame < vstate->curframe ? BPF_REG_6 : 0;
                for (; regno < BPF_REG_FP; regno++) {
                        ret = propagate_liveness_reg(env, &child->regs[regno],
                                                     &par->regs[regno]);
                        if (ret < 0)
                                return ret;
                        if (ret == REG_LIVE_READ64)
                                mark_insn_zext(env, &par->regs[regno]);
                }

                /* Stack slots that both states have allocated */
                for (slot = 0; slot < child->allocated_stack / BPF_REG_SIZE &&
                               slot < par->allocated_stack / BPF_REG_SIZE; slot++) {
                        ret = propagate_liveness_reg(env,
                                                     &child->stack[slot].spilled_ptr,
                                                     &par->stack[slot].spilled_ptr);
                        if (ret < 0)
                                return ret;
                }
        }
        return 0;
}

/* Find precise scalars in the previous equivalent state and propagate them
 * into the current state.
 *
 * Walks every frame of @old from the current frame down to frame 0. Each of
 * the registers below BPF_REG_FP and each spilled stack slot that is a
 * SCALAR_VALUE, has ->precise set and carries REG_LIVE_READ is recorded in
 * env->bt; mark_chain_precision_batch() is then invoked once for the whole
 * collected set.
 *
 * With BPF_LOG_LEVEL2 enabled, one "frame N: propagating ..." line is logged
 * per frame that contributed at least one register or slot.
 *
 * Returns 0 on success, or the negative error reported by
 * mark_chain_precision_batch().
 */
static int propagate_precision(struct bpf_verifier_env *env,
                               const struct bpf_verifier_state *old)
{
        struct bpf_reg_state *state_reg;
        struct bpf_func_state *state;
        int i, err = 0, fr;
        bool first;

        for (fr = old->curframe; fr >= 0; fr--) {
                state = old->frame[fr];
                state_reg = state->regs;
                first = true;
                for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
                        if (state_reg->type != SCALAR_VALUE ||
                            !state_reg->precise ||
                            !(state_reg->live & REG_LIVE_READ))
                                continue;
                        if (env->log.level & BPF_LOG_LEVEL2) {
                                if (first)
                                        verbose(env, "frame %d: propagating r%d", fr, i);
                                else
                                        verbose(env, ",r%d", i);
                        }
                        bt_set_frame_reg(&env->bt, fr, i);
                        first = false;
                }

                for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
                        if (!is_spilled_reg(&state->stack[i]))
                                continue;
                        state_reg = &state->stack[i].spilled_ptr;
                        if (state_reg->type != SCALAR_VALUE ||
                            !state_reg->precise ||
                            !(state_reg->live & REG_LIVE_READ))
                                continue;
                        if (env->log.level & BPF_LOG_LEVEL2) {
                                if (first)
                                        verbose(env, "frame %d: propagating fp%d",
                                                fr, (-i - 1) * BPF_REG_SIZE);
                                else
                                        verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE);
                        }
                        bt_set_frame_slot(&env->bt, fr, i);
                        first = false;
                }
                /* 'first' is cleared independently of the log level, so only
                 * terminate the line when LEVEL2 logging actually started one
                 * above; otherwise a stray empty line ends up in the log.
                 */
                if (!first && (env->log.level & BPF_LOG_LEVEL2))
                        verbose(env, "\n");
        }

        err = mark_chain_precision_batch(env);
        if (err < 0)
                return err;

        return 0;
}

/* Cheap pre-filter used before the expensive infinite loop check: returns
 * true only when both states sit at the same frame depth and every register
 * of that frame is bytewise identical in the part of struct bpf_reg_state
 * that precedes the 'parent' member.
 */
static bool states_maybe_looping(struct bpf_verifier_state *old,
                                 struct bpf_verifier_state *cur)
{
        struct bpf_func_state *old_fn, *cur_fn;
        int regno, depth = cur->curframe;

        if (old->curframe != depth)
                return false;

        old_fn = old->frame[depth];
        cur_fn = cur->frame[depth];

        regno = 0;
        while (regno < MAX_BPF_REG) {
                /* compare everything laid out before 'parent' */
                if (memcmp(&old_fn->regs[regno], &cur_fn->regs[regno],
                           offsetof(struct bpf_reg_state, parent)) != 0)
                        return false;
                regno++;
        }

        return true;
}

/* Report whether the per-instruction aux data for insn_idx has its
 * is_iter_next flag set.
 */
static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
{
        bool flagged = env->insn_aux_data[insn_idx].is_iter_next;

        return flagged;
}

/* is_state_visited() handles iter_next() (see process_iter_next_call() for
 * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
 * states to match, which otherwise would look like an infinite loop. So while
 * iter_next() calls are taken care of, we still need to be careful and
 * prevent erroneous and too eager declaration of "infinite loop", when
 * iterators are involved.
 *
 * Here's a situation in pseudo-BPF assembly form:
 *
 *   0: again:                          ; set up iter_next() call args
 *   1:   r1 = &it                      ; <CHECKPOINT HERE>
 *   2:   call bpf_iter_num_next        ; this is iter_next() call
 *   3:   if r0 == 0 goto done
 *   4:   ... something useful here ...
 *   5:   goto again                    ; another iteration
 *   6: done:
 *   7:   r1 = &it
 *   8:   call bpf_iter_num_destroy     ; clean up iter state
 *   9:   exit
 *
 * This is a typical loop. Let's assume that we have a prune point at 1:,
 * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
 * again`, assuming other heuristics don't get in a way).
 *
 * When we first time come to 1:, let's say we have some state X. We proceed
 * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
 * Now we come back to validate that forked ACTIVE state. We proceed through
 * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
 * are converging. But the problem is that we don't know that yet, as this
 * convergence has to happen at iter_next() call site only. So if nothing is
 * done, at 1: verifier will use bounded loop logic and declare infinite
 * looping (and would be *technically* correct, if not for iterator's
 * "eventual sticky NULL" contract, see process_iter_next_call()). But we
 * don't want that. So what we do in process_iter_next_call() when we go on
 * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
 * a different iteration. So when we suspect an infinite loop, we additionally
 * check if any of the *ACTIVE* iterator states depths differ. If yes, we
 * pretend we are not looping and wait for next iter_next() call.
 *
 * This only applies to ACTIVE state. In DRAINED state we don't expect to
 * loop, because that would actually mean infinite loop, as DRAINED state is
 * "sticky", and so we'll keep returning into the same instruction with the
 * same state (at least in one of possible code paths).
 *
 * This approach allows us to keep the infinite loop heuristic even in the
 * face of an active iterator. E.g., the C snippet below is and will be
 * detected as infinitely looping:
 *
 *   struct bpf_iter_num it;
 *   int *p, x;
 *
 *   bpf_iter_num_new(&it, 0, 10);
 *   while ((p = bpf_iter_num_next(&it))) {
 *       x = *p;
 *       while (x--) {} // <<-- infinite loop here
 *   }
 *   }
 *
 */
static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
{
        int fr, spi;

        /* walk frames from the innermost one down to frame 0 */
        for (fr = old->curframe; fr >= 0; fr--) {
                struct bpf_func_state *old_fn = old->frame[fr];

                for (spi = 0; spi < old_fn->allocated_stack / BPF_REG_SIZE; spi++) {
                        struct bpf_reg_state *old_it, *cur_it;

                        /* only slots holding an iterator are of interest */
                        if (old_fn->stack[spi].slot_type[0] != STACK_ITER)
                                continue;

                        /* ... and among those only ACTIVE ones */
                        old_it = &old_fn->stack[spi].spilled_ptr;
                        if (old_it->iter.state != BPF_ITER_STATE_ACTIVE)
                                continue;

                        /* same slot in the current state, different depth? */
                        cur_it = &cur->frame[fr]->stack[spi].spilled_ptr;
                        if (old_it->iter.depth != cur_it->iter.depth)
                                return true;
                }
        }

        return false;
}

/* Decide whether the current verifier state (env->cur_state) at insn_idx is
 * covered by a state already recorded for this instruction.
 *
 * Walks the list returned by explored_state() for insn_idx and compares each
 * entry with the current state:
 *  - entries that still have unexplored branches are used for infinite loop
 *    detection and for the iterator / may_goto / callback convergence checks;
 *  - entries with no remaining branches are compared with states_equal() and,
 *    on a match, liveness and precision marks are propagated and the search
 *    is pruned.
 * Entries that miss too often relative to their hits are unlinked from the
 * list along the way.
 *
 * If nothing matched and the heuristics allow it, a copy of the current state
 * is added to the head of the list and becomes the parent of the current
 * state.
 *
 * Returns 1 when an equivalent state was found (caller may prune), 0 when
 * verification has to continue from the current state, or a negative error:
 * -EINVAL when an infinite loop is detected, -ENOMEM when the new list entry
 * cannot be allocated, or whatever error a called helper reported.
 */
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
        struct bpf_verifier_state_list *new_sl;
        struct bpf_verifier_state_list *sl, **pprev;
        struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
        int i, j, n, err, states_cnt = 0;
        bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
        bool add_new_state = force_new_state;
        bool force_exact;

        /* bpf progs typically have pruning point every 4 instructions
         * http://vger.kernel.org/bpfconf2019.html#session-1
         * Do not add new state for future pruning if the verifier hasn't seen
         * at least 2 jumps and at least 8 instructions.
         * This heuristics helps decrease 'total_states' and 'peak_states' metric.
         * In tests that amounts to up to 50% reduction into total verifier
         * memory consumption and 20% verifier time speedup.
         */
        if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
            env->insn_processed - env->prev_insn_processed >= 8)
                add_new_state = true;

        /* pprev always points at the link that references 'sl', so an entry
         * can be unlinked in place while walking the list.
         */
        pprev = explored_state(env, insn_idx);
        sl = *pprev;

        clean_live_states(env, insn_idx, cur);

        while (sl) {
                states_cnt++;
                /* the list may hold states recorded for other instructions */
                if (sl->state.insn_idx != insn_idx)
                        goto next;

                /* The recorded state still has branches to explore: apart from
                 * the explicit 'goto hit' cases below it always ends in 'miss'.
                 */
                if (sl->state.branches) {
                        struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];

                        if (frame->in_async_callback_fn &&
                            frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) {
                                /* Different async_entry_cnt means that the verifier is
                                 * processing another entry into async callback.
                                 * Seeing the same state is not an indication of infinite
                                 * loop or infinite recursion.
                                 * But finding the same state doesn't mean that it's safe
                                 * to stop processing the current state. The previous state
                                 * hasn't yet reached bpf_exit, since state.branches > 0.
                                 * Checking in_async_callback_fn alone is not enough either.
                                 * Since the verifier still needs to catch infinite loops
                                 * inside async callbacks.
                                 */
                                goto skip_inf_loop_check;
                        }
                        /* BPF open-coded iterators loop detection is special.
                         * states_maybe_looping() logic is too simplistic in detecting
                         * states that *might* be equivalent, because it doesn't know
                         * about ID remapping, so don't even perform it.
                         * See process_iter_next_call() and iter_active_depths_differ()
                         * for overview of the logic. When current and one of parent
                         * states are detected as equivalent, it's a good thing: we prove
                         * convergence and can stop simulating further iterations.
                         * It's safe to assume that iterator loop will finish, taking into
                         * account iter_next() contract of eventually returning
                         * sticky NULL result.
                         *
                         * Note, that states have to be compared exactly in this case because
                         * read and precision marks might not be finalized inside the loop.
                         * E.g. as in the program below:
                         *
                         *     1. r7 = -16
                         *     2. r6 = bpf_get_prandom_u32()
                         *     3. while (bpf_iter_num_next(&fp[-8])) {
                         *     4.   if (r6 != 42) {
                         *     5.     r7 = -32
                         *     6.     r6 = bpf_get_prandom_u32()
                         *     7.     continue
                         *     8.   }
                         *     9.   r0 = r10
                         *    10.   r0 += r7
                         *    11.   r8 = *(u64 *)(r0 + 0)
                         *    12.   r6 = bpf_get_prandom_u32()
                         *    13. }
                         *
                         * Here verifier would first visit path 1-3, create a checkpoint at 3
                         * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does
                         * not have read or precision mark for r7 yet, thus inexact states
                         * comparison would discard current state with r7=-32
                         * => unsafe memory access at 11 would not be caught.
                         */
                        if (is_iter_next_insn(env, insn_idx)) {
                                if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
                                        struct bpf_func_state *cur_frame;
                                        struct bpf_reg_state *iter_state, *iter_reg;
                                        int spi;

                                        cur_frame = cur->frame[cur->curframe];
                                        /* btf_check_iter_kfuncs() enforces that
                                         * iter state pointer is always the first arg
                                         */
                                        iter_reg = &cur_frame->regs[BPF_REG_1];
                                        /* current state is valid due to states_equal(),
                                         * so we can assume valid iter and reg state,
                                         * no need for extra (re-)validations
                                         */
                                        spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
                                        iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
                                        if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
                                                update_loop_entry(cur, &sl->state);
                                                goto hit;
                                        }
                                }
                                goto skip_inf_loop_check;
                        }
                        if (is_may_goto_insn_at(env, insn_idx)) {
                                if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
                                        update_loop_entry(cur, &sl->state);
                                        goto hit;
                                }
                                goto skip_inf_loop_check;
                        }
                        if (calls_callback(env, insn_idx)) {
                                if (states_equal(env, &sl->state, cur, RANGE_WITHIN))
                                        goto hit;
                                goto skip_inf_loop_check;
                        }
                        /* attempt to detect infinite loop to avoid unnecessary doomed work */
                        if (states_maybe_looping(&sl->state, cur) &&
                            states_equal(env, &sl->state, cur, EXACT) &&
                            !iter_active_depths_differ(&sl->state, cur) &&
                            sl->state.may_goto_depth == cur->may_goto_depth &&
                            sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
                                verbose_linfo(env, insn_idx, "; ");
                                verbose(env, "infinite loop detected at insn %d\n", insn_idx);
                                verbose(env, "cur state:");
                                print_verifier_state(env, cur->frame[cur->curframe], true);
                                verbose(env, "old state:");
                                /* states_maybe_looping() returned true, so both
                                 * states have the same curframe and indexing the
                                 * old state with cur->curframe is fine.
                                 */
                                print_verifier_state(env, sl->state.frame[cur->curframe], true);
                                return -EINVAL;
                        }
                        /* if the verifier is processing a loop, avoid adding new state
                         * too often, since different loop iterations have distinct
                         * states and may not help future pruning.
                         * This threshold shouldn't be too low to make sure that
                         * a loop with large bound will be rejected quickly.
                         * The most abusive loop will be:
                         * r1 += 1
                         * if r1 < 1000000 goto pc-2
                         * 1M insn_processed limit / 100 == 10k peak states.
                         * This threshold shouldn't be too high either, since states
                         * at the end of the loop are likely to be useful in pruning.
                         */
skip_inf_loop_check:
                        if (!force_new_state &&
                            env->jmps_processed - env->prev_jmps_processed < 20 &&
                            env->insn_processed - env->prev_insn_processed < 100)
                                add_new_state = false;
                        goto miss;
                }
                /* If sl->state is a part of a loop and this loop's entry is a part of
                 * current verification path then states have to be compared exactly.
                 * 'force_exact' is needed to catch the following case:
                 *
                 *                initial     Here state 'succ' was processed first,
                 *                  |         it was eventually tracked to produce a
                 *                  V         state identical to 'hdr'.
                 *     .---------> hdr        All branches from 'succ' had been explored
                 *     |            |         and thus 'succ' has its .branches == 0.
                 *     |            V
                 *     |    .------...        Suppose states 'cur' and 'succ' correspond
                 *     |    |       |         to the same instruction + callsites.
                 *     |    V       V         In such case it is necessary to check
                 *     |   ...     ...        if 'succ' and 'cur' are states_equal().
                 *     |    |       |         If 'succ' and 'cur' are a part of the
                 *     |    V       V         same loop exact flag has to be set.
                 *     |   succ <- cur        To check if that is the case, verify
                 *     |    |                 if loop entry of 'succ' is in current
                 *     |    V                 DFS path.
                 *     |   ...
                 *     |    |
                 *     '----'
                 *
                 * Additional details are in the comment before get_loop_entry().
                 */
                loop_entry = get_loop_entry(&sl->state);
                force_exact = loop_entry && loop_entry->branches > 0;
                if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) {
                        if (force_exact)
                                update_loop_entry(cur, loop_entry);
hit:
                        sl->hit_cnt++;
                        /* reached equivalent register/stack state,
                         * prune the search.
                         * Registers read by the continuation are read by us.
                         * If we have any write marks in env->cur_state, they
                         * will prevent corresponding reads in the continuation
                         * from reaching our parent (an explored_state).  Our
                         * own state will get the read marks recorded, but
                         * they'll be immediately forgotten as we're pruning
                         * this state and will pop a new one.
                         */
                        err = propagate_liveness(env, &sl->state, cur);

                        /* if previous state reached the exit with precision and
                         * current state is equivalent to it (except precision marks)
                         * the precision needs to be propagated back in
                         * the current state.
                         */
                        /* 'err ? : x' keeps the first error and skips the
                         * remaining steps once one of them has failed.
                         */
                        if (is_jmp_point(env, env->insn_idx))
                                err = err ? : push_jmp_history(env, cur, 0);
                        err = err ? : propagate_precision(env, &sl->state);
                        if (err)
                                return err;
                        return 1;
                }
miss:
                /* when new state is not going to be added do not increase miss count.
                 * Otherwise several loop iterations will remove the state
                 * recorded earlier. The goal of these heuristics is to have
                 * states from some iterations of the loop (some in the beginning
                 * and some at the end) to help pruning.
                 */
                if (add_new_state)
                        sl->miss_cnt++;
                /* heuristic to determine whether this state is beneficial
                 * to keep checking from state equivalence point of view.
                 * Higher numbers increase max_states_per_insn and verification time,
                 * but do not meaningfully decrease insn_processed.
                 * 'n' controls how many times state could miss before eviction.
                 * Use bigger 'n' for checkpoints because evicting checkpoint states
                 * too early would hinder iterator convergence.
                 */
                n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3;
                if (sl->miss_cnt > sl->hit_cnt * n + n) {
                        /* the state is unlikely to be useful. Remove it to
                         * speed up verification
                         */
                        *pprev = sl->next;
                        if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE &&
                            !sl->state.used_as_loop_entry) {
                                u32 br = sl->state.branches;

                                WARN_ONCE(br,
                                          "BUG live_done but branches_to_explore %d\n",
                                          br);
                                free_verifier_state(&sl->state, false);
                                kfree(sl);
                                env->peak_states--;
                        } else {
                                /* cannot free this state, since parentage chain may
                                 * walk it later. Add it for free_list instead to
                                 * be freed at the end of verification
                                 */
                                sl->next = env->free_list;
                                env->free_list = sl;
                        }
                        /* pprev is unchanged: it now references the successor */
                        sl = *pprev;
                        continue;
                }
next:
                pprev = &sl->next;
                sl = *pprev;
        }

        if (env->max_states_per_insn < states_cnt)
                env->max_states_per_insn = states_cnt;

        /* without bpf_capable, stop recording new states once more than
         * BPF_COMPLEXITY_LIMIT_STATES entries were walked for this insn
         */
        if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
                return 0;

        if (!add_new_state)
                return 0;

        /* There were no equivalent states, remember the current one.
         * Technically the current state is not proven to be safe yet,
         * but it will either reach outer most bpf_exit (which means it's safe)
         * or it will be rejected. When there are no loops the verifier won't be
         * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
         * again on the way to bpf_exit.
         * When looping the sl->state.branches will be > 0 and this state
         * will not be considered for equivalence until branches == 0.
         */
        new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
        if (!new_sl)
                return -ENOMEM;
        env->total_states++;
        env->peak_states++;
        /* restart the "jumps/insns since last checkpoint" counters used by
         * the add_new_state heuristics above
         */
        env->prev_jmps_processed = env->jmps_processed;
        env->prev_insn_processed = env->insn_processed;

        /* forget precise markings we inherited, see __mark_chain_precision */
        if (env->bpf_capable)
                mark_all_scalars_imprecise(env, cur);

        /* add new state to the head of linked list */
        new = &new_sl->state;
        err = copy_verifier_state(new, cur);
        if (err) {
                /* release whatever the partial copy allocated; 'new' is
                 * embedded in new_sl, which is freed separately
                 */
                free_verifier_state(new, false);
                kfree(new_sl);
                return err;
        }
        new->insn_idx = insn_idx;
        WARN_ONCE(new->branches != 1,
                  "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx);

        /* the recorded copy becomes the parent of the state being verified */
        cur->parent = new;
        cur->first_insn_idx = insn_idx;
        cur->dfs_depth = new->dfs_depth + 1;
        clear_jmp_history(cur);
        new_sl->next = *explored_state(env, insn_idx);
        *explored_state(env, insn_idx) = new_sl;
        /* connect new state to parentage chain. Current frame needs all
         * registers connected. Only r6 - r9 of the callers are alive (pushed
         * to the stack implicitly by JITs) so in callers' frames connect just
         * r6 - r9 as an optimization. Callers will have r1 - r5 connected to
         * the state of the call instruction (with WRITTEN set), and r0 comes
         * from callee with its full parentage chain, anyway.
         */
        /* clear write marks in current state: the writes we did are not writes
         * our child did, so they don't screen off its reads from us.
         * (There are no read marks in current state, because reads always mark
         * their parent and current state never has children yet.  Only
         * explored_states can get read marks.)
         */
        for (j = 0; j <= cur->curframe; j++) {
                for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
                        cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
                for (i = 0; i < BPF_REG_FP; i++)
                        cur->frame[j]->regs[i].live = REG_LIVE_NONE;
        }

        /* all stack frames are accessible from callee, clear them all */
        for (j = 0; j <= cur->curframe; j++) {
                struct bpf_func_state *frame = cur->frame[j];
                struct bpf_func_state *newframe = new->frame[j];

                for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) {
                        frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
                        frame->stack[i].spilled_ptr.parent =
                                                &newframe->stack[i].spilled_ptr;
                }
        }
        return 0;
}

/* Tell whether a single instruction may legitimately see this register type
 * on one path and a different one on another. The base types listed below
 * are the ones for which that is not acceptable.
 */
static bool reg_type_mismatch_ok(enum bpf_reg_type type)
{
        bool mismatch_ok = true;

        switch (base_type(type)) {
        case PTR_TO_CTX:
        case PTR_TO_SOCKET:
        case PTR_TO_SOCK_COMMON:
        case PTR_TO_TCP_SOCK:
        case PTR_TO_XDP_SOCK:
        case PTR_TO_BTF_ID:
        case PTR_TO_ARENA:
                mismatch_ok = false;
                break;
        default:
                break;
        }

        return mismatch_ok;
}

/* The same instruction can be reached with different pointer types in its
 * source register, and what is fine for one type may not be fine for the
 * other:
 *
 * R1 = sock_ptr
 * goto X;
 * ...
 * R1 = some_other_valid_ptr;
 * goto X;
 * ...
 * R2 = *(u32 *)(R1 + 0);
 *
 * Report a mismatch when the two types differ and at least one of them is a
 * type for which reg_type_mismatch_ok() says mixing is not allowed.
 */
static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
{
        /* identical types never conflict */
        if (src == prev)
                return false;

        if (!reg_type_mismatch_ok(src))
                return true;

        return !reg_type_mismatch_ok(prev);
}

/* Record in the aux data of the current instruction (env->insn_idx) the
 * pointer type it was used with, and validate it against the type recorded
 * by an earlier path.
 *
 * Returns 0 when the type was stored or is compatible with the stored one,
 * -EINVAL when the instruction is being used with conflicting pointer types.
 */
static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
                             bool allow_trust_mismatch)
{
        enum bpf_reg_type *seen = &env->insn_aux_data[env->insn_idx].ptr_type;

        /* First valid use of
         * dst_reg = *(u32 *)(src_reg + off)
         * remember the type so intersecting paths can be validated.
         */
        if (*seen == NOT_INIT) {
                *seen = type;
                return 0;
        }

        if (!reg_type_mismatch(type, *seen))
                return 0;

        /* The program is trying to use the same insn
         * dst_reg = *(u32*) (src_reg + off)
         * with different pointer types, e.g. src_reg == ctx in one branch
         * and src_reg == stack|map in another. Reject it, unless the caller
         * tolerates a trust mismatch and both types are PTR_TO_BTF_ID based.
         */
        if (!allow_trust_mismatch ||
            base_type(type) != PTR_TO_BTF_ID ||
            base_type(*seen) != PTR_TO_BTF_ID) {
                verbose(env, "same insn cannot be used with different pointers\n");
                return -EINVAL;
        }

        /* One path through the program yields a TRUSTED pointer while
         * another yields an UNTRUSTED one. Fall back to UNTRUSTED to
         * generate BPF_PROBE_MEM/BPF_PROBE_MEMSX.
         */
        *seen = PTR_TO_BTF_ID | PTR_UNTRUSTED;
        return 0;
}

/* Main verification loop.
 *
 * Repeatedly processes the instruction at env->insn_idx, dispatching on its
 * BPF class to the matching check_*() routine. When a path ends (BPF_EXIT in
 * the outermost frame, a state found equivalent by is_state_visited(), or a
 * kfunc call for which is_bpf_throw_kfunc() is true), the next pending state
 * is fetched with pop_stack(). The loop terminates when pop_stack() returns
 * -ENOENT.
 *
 * Returns 0 once the loop terminates that way, or a negative errno from the
 * first failing check (-EFAULT for an out-of-range insn index, -E2BIG when
 * BPF_COMPLEXITY_LIMIT_INSNS is exceeded, -EAGAIN when a signal is pending,
 * -EINVAL for malformed instructions, or whatever a helper returned).
 */
static int do_check(struct bpf_verifier_env *env)
{
        /* forwarded to pop_stack(); true unless BPF_LOG_LEVEL2 is enabled */
        bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
        struct bpf_verifier_state *state = env->cur_state;
        struct bpf_insn *insns = env->prog->insnsi;
        struct bpf_reg_state *regs;
        int insn_cnt = env->prog->len;
        /* set after switching to another state/frame so that the next
         * iteration logs a "from X to Y" header
         */
        bool do_print_state = false;
        int prev_insn_idx = -1;

        for (;;) {
                bool exception_exit = false;
                struct bpf_insn *insn;
                u8 class;
                int err;

                /* reset current history entry on each new instruction */
                env->cur_hist_ent = NULL;

                env->prev_insn_idx = prev_insn_idx;
                if (env->insn_idx >= insn_cnt) {
                        verbose(env, "invalid insn idx %d insn_cnt %d\n",
                                env->insn_idx, insn_cnt);
                        return -EFAULT;
                }

                insn = &insns[env->insn_idx];
                class = BPF_CLASS(insn->code);

                /* bound the total amount of work spent on one program */
                if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
                        verbose(env,
                                "BPF program is too large. Processed %d insn\n",
                                env->insn_processed);
                        return -E2BIG;
                }

                state->last_insn_idx = env->prev_insn_idx;

                if (is_prune_point(env, env->insn_idx)) {
                        err = is_state_visited(env, env->insn_idx);
                        if (err < 0)
                                return err;
                        if (err == 1) {
                                /* found equivalent state, can prune the search */
                                if (env->log.level & BPF_LOG_LEVEL) {
                                        if (do_print_state)
                                                verbose(env, "\nfrom %d to %d%s: safe\n",
                                                        env->prev_insn_idx, env->insn_idx,
                                                        env->cur_state->speculative ?
                                                        " (speculative execution)" : "");
                                        else
                                                verbose(env, "%d: safe\n", env->insn_idx);
                                }
                                goto process_bpf_exit;
                        }
                }

                if (is_jmp_point(env, env->insn_idx)) {
                        err = push_jmp_history(env, state, 0);
                        if (err)
                                return err;
                }

                /* abort with -EAGAIN when the current task has a signal pending */
                if (signal_pending(current))
                        return -EAGAIN;

                if (need_resched())
                        cond_resched();

                if (env->log.level & BPF_LOG_LEVEL2 && do_print_state) {
                        verbose(env, "\nfrom %d to %d%s:",
                                env->prev_insn_idx, env->insn_idx,
                                env->cur_state->speculative ?
                                " (speculative execution)" : "");
                        print_verifier_state(env, state->frame[state->curframe], true);
                        do_print_state = false;
                }

                /* log the instruction about to be verified */
                if (env->log.level & BPF_LOG_LEVEL) {
                        const struct bpf_insn_cbs cbs = {
                                .cb_call        = disasm_kfunc_name,
                                .cb_print        = verbose,
                                .private_data        = env,
                        };

                        if (verifier_state_scratched(env))
                                print_insn_state(env, state->frame[state->curframe]);

                        verbose_linfo(env, env->insn_idx, "; ");
                        env->prev_log_pos = env->log.end_pos;
                        verbose(env, "%d: ", env->insn_idx);
                        print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
                        env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos;
                        env->prev_log_pos = env->log.end_pos;
                }

                if (bpf_prog_is_offloaded(env->prog->aux)) {
                        err = bpf_prog_offload_verify_insn(env, env->insn_idx,
                                                           env->prev_insn_idx);
                        if (err)
                                return err;
                }

                regs = cur_regs(env);
                sanitize_mark_insn_seen(env);
                prev_insn_idx = env->insn_idx;

                if (class == BPF_ALU || class == BPF_ALU64) {
                        err = check_alu_op(env, insn);
                        if (err)
                                return err;

                } else if (class == BPF_LDX) {
                        enum bpf_reg_type src_reg_type;

                        /* check for reserved fields is already done */

                        /* check src operand */
                        err = check_reg_arg(env, insn->src_reg, SRC_OP);
                        if (err)
                                return err;

                        err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
                        if (err)
                                return err;

                        /* sample the type before check_mem_access() runs; for
                         * a load with dst_reg == src_reg that call updates
                         * the very same register
                         */
                        src_reg_type = regs[insn->src_reg].type;

                        /* check that memory (src_reg + off) is readable,
                         * the state of dst_reg will be updated by this func
                         */
                        err = check_mem_access(env, env->insn_idx, insn->src_reg,
                                               insn->off, BPF_SIZE(insn->code),
                                               BPF_READ, insn->dst_reg, false,
                                               BPF_MODE(insn->code) == BPF_MEMSX);
                        /* each step below runs only if the previous one succeeded */
                        err = err ?: save_aux_ptr_type(env, src_reg_type, true);
                        err = err ?: reg_bounds_sanity_check(env, &regs[insn->dst_reg], "ldx");
                        if (err)
                                return err;
                } else if (class == BPF_STX) {
                        enum bpf_reg_type dst_reg_type;

                        /* atomic operations are handled entirely by
                         * check_atomic(); the plain store checks below are
                         * skipped for them
                         */
                        if (BPF_MODE(insn->code) == BPF_ATOMIC) {
                                err = check_atomic(env, env->insn_idx, insn);
                                if (err)
                                        return err;
                                env->insn_idx++;
                                continue;
                        }

                        if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) {
                                verbose(env, "BPF_STX uses reserved fields\n");
                                return -EINVAL;
                        }

                        /* check src1 operand */
                        err = check_reg_arg(env, insn->src_reg, SRC_OP);
                        if (err)
                                return err;
                        /* check src2 operand */
                        err = check_reg_arg(env, insn->dst_reg, SRC_OP);
                        if (err)
                                return err;

                        dst_reg_type = regs[insn->dst_reg].type;

                        /* check that memory (dst_reg + off) is writeable */
                        err = check_mem_access(env, env->insn_idx, insn->dst_reg,
                                               insn->off, BPF_SIZE(insn->code),
                                               BPF_WRITE, insn->src_reg, false, false);
                        if (err)
                                return err;

                        err = save_aux_ptr_type(env, dst_reg_type, false);
                        if (err)
                                return err;
                } else if (class == BPF_ST) {
                        enum bpf_reg_type dst_reg_type;

                        if (BPF_MODE(insn->code) != BPF_MEM ||
                            insn->src_reg != BPF_REG_0) {
                                verbose(env, "BPF_ST uses reserved fields\n");
                                return -EINVAL;
                        }
                        /* check src operand */
                        err = check_reg_arg(env, insn->dst_reg, SRC_OP);
                        if (err)
                                return err;

                        dst_reg_type = regs[insn->dst_reg].type;

                        /* check that memory (dst_reg + off) is writeable */
                        err = check_mem_access(env, env->insn_idx, insn->dst_reg,
                                               insn->off, BPF_SIZE(insn->code),
                                               BPF_WRITE, -1, false, false);
                        if (err)
                                return err;

                        err = save_aux_ptr_type(env, dst_reg_type, false);
                        if (err)
                                return err;
                } else if (class == BPF_JMP || class == BPF_JMP32) {
                        u8 opcode = BPF_OP(insn->code);

                        env->jmps_processed++;
                        if (opcode == BPF_CALL) {
                                /* only kfunc calls may carry a non-zero off;
                                 * src_reg must name one of the known call
                                 * kinds; BPF_JMP32 calls are rejected
                                 */
                                if (BPF_SRC(insn->code) != BPF_K ||
                                    (insn->src_reg != BPF_PSEUDO_KFUNC_CALL
                                     && insn->off != 0) ||
                                    (insn->src_reg != BPF_REG_0 &&
                                     insn->src_reg != BPF_PSEUDO_CALL &&
                                     insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
                                    insn->dst_reg != BPF_REG_0 ||
                                    class == BPF_JMP32) {
                                        verbose(env, "BPF_CALL uses reserved fields\n");
                                        return -EINVAL;
                                }

                                /* while a lock is held, the only helper call
                                 * accepted is BPF_FUNC_spin_unlock and the
                                 * only kfuncs accepted are those with off == 0
                                 * for which is_bpf_graph_api_kfunc() is true
                                 */
                                if (env->cur_state->active_lock.ptr) {
                                        if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
                                            (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
                                             (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) {
                                                verbose(env, "function calls are not allowed while holding a lock\n");
                                                return -EINVAL;
                                        }
                                }
                                if (insn->src_reg == BPF_PSEUDO_CALL) {
                                        err = check_func_call(env, insn, &env->insn_idx);
                                } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
                                        err = check_kfunc_call(env, insn, &env->insn_idx);
                                        /* a successfully checked throwing
                                         * kfunc ends this path like an exit
                                         */
                                        if (!err && is_bpf_throw_kfunc(insn)) {
                                                exception_exit = true;
                                                goto process_bpf_exit_full;
                                        }
                                } else {
                                        err = check_helper_call(env, insn, &env->insn_idx);
                                }
                                if (err)
                                        return err;

                                mark_reg_scratched(env, BPF_REG_0);
                        } else if (opcode == BPF_JA) {
                                if (BPF_SRC(insn->code) != BPF_K ||
                                    insn->src_reg != BPF_REG_0 ||
                                    insn->dst_reg != BPF_REG_0 ||
                                    (class == BPF_JMP && insn->imm != 0) ||
                                    (class == BPF_JMP32 && insn->off != 0)) {
                                        verbose(env, "BPF_JA uses reserved fields\n");
                                        return -EINVAL;
                                }

                                /* BPF_JMP keeps the jump displacement in off,
                                 * BPF_JMP32 keeps it in imm
                                 */
                                if (class == BPF_JMP)
                                        env->insn_idx += insn->off + 1;
                                else
                                        env->insn_idx += insn->imm + 1;
                                continue;

                        } else if (opcode == BPF_EXIT) {
                                if (BPF_SRC(insn->code) != BPF_K ||
                                    insn->imm != 0 ||
                                    insn->src_reg != BPF_REG_0 ||
                                    insn->dst_reg != BPF_REG_0 ||
                                    class == BPF_JMP32) {
                                        verbose(env, "BPF_EXIT uses reserved fields\n");
                                        return -EINVAL;
                                }
                                /* also entered from the kfunc call branch
                                 * above with exception_exit set
                                 */
process_bpf_exit_full:
                                if (env->cur_state->active_lock.ptr && !env->cur_state->curframe) {
                                        verbose(env, "bpf_spin_unlock is missing\n");
                                        return -EINVAL;
                                }

                                if (env->cur_state->active_rcu_lock && !env->cur_state->curframe) {
                                        verbose(env, "bpf_rcu_read_unlock is missing\n");
                                        return -EINVAL;
                                }

                                if (env->cur_state->active_preempt_lock && !env->cur_state->curframe) {
                                        verbose(env, "%d bpf_preempt_enable%s missing\n",
                                                env->cur_state->active_preempt_lock,
                                                env->cur_state->active_preempt_lock == 1 ? " is" : "(s) are");
                                        return -EINVAL;
                                }

                                /* We must do check_reference_leak here before
                                 * prepare_func_exit to handle the case when
                                 * state->curframe > 0, it may be a callback
                                 * function, for which reference_state must
                                 * match caller reference state when it exits.
                                 */
                                err = check_reference_leak(env, exception_exit);
                                if (err)
                                        return err;

                                /* The side effect of the prepare_func_exit
                                 * which is being skipped is that it frees
                                 * bpf_func_state. Typically, process_bpf_exit
                                 * will only be hit with outermost exit.
                                 * copy_verifier_state in pop_stack will handle
                                 * freeing of any extra bpf_func_state left over
                                 * from not processing all nested function
                                 * exits. We also skip return code checks as
                                 * they are not needed for exceptional exits.
                                 */
                                if (exception_exit)
                                        goto process_bpf_exit;

                                if (state->curframe) {
                                        /* exit from nested function */
                                        err = prepare_func_exit(env, &env->insn_idx);
                                        if (err)
                                                return err;
                                        do_print_state = true;
                                        continue;
                                }

                                err = check_return_code(env, BPF_REG_0, "R0");
                                if (err)
                                        return err;
                                /* common end-of-path handling; also entered
                                 * when a state is pruned and on exception exit
                                 */
process_bpf_exit:
                                mark_verifier_state_scratched(env);
                                update_branch_counts(env, env->cur_state);
                                err = pop_stack(env, &prev_insn_idx,
                                                &env->insn_idx, pop_log);
                                if (err < 0) {
                                        /* -ENOENT ends the loop and makes
                                         * do_check() return 0; any other
                                         * error is propagated
                                         */
                                        if (err != -ENOENT)
                                                return err;
                                        break;
                                } else {
                                        do_print_state = true;
                                        continue;
                                }
                        } else {
                                err = check_cond_jmp_op(env, insn, &env->insn_idx);
                                if (err)
                                        return err;
                        }
                } else if (class == BPF_LD) {
                        u8 mode = BPF_MODE(insn->code);

                        if (mode == BPF_ABS || mode == BPF_IND) {
                                err = check_ld_abs(env, insn);
                                if (err)
                                        return err;

                        } else if (mode == BPF_IMM) {
                                err = check_ld_imm(env, insn);
                                if (err)
                                        return err;

                                /* ld_imm64 occupies two insn slots: step onto
                                 * the second one here (the common increment
                                 * at the bottom of the loop moves past it)
                                 * and, presumably, mark that slot as seen too
                                 */
                                env->insn_idx++;
                                sanitize_mark_insn_seen(env);
                        } else {
                                verbose(env, "invalid BPF_LD mode\n");
                                return -EINVAL;
                        }
                } else {
                        verbose(env, "unknown insn class %d\n", class);
                        return -EINVAL;
                }

                /* fall through to the next sequential instruction */
                env->insn_idx++;
        }

        return 0;
}

/* Look up the BTF type id of the ".data..percpu" DATASEC in @btf.
 *
 * Returns the matching type id, or -ENOENT when @btf has no such DATASEC.
 */
static int find_btf_percpu_datasec(struct btf *btf)
{
        int nr_types = btf_nr_types(btf);
        int id;

        /*
         * vmlinux and every module carry their own ".data..percpu" DATASEC.
         * For a module BTF the scan starts right after the vmlinux types so
         * that only the module's own types are inspected.
         */
        id = btf_is_module(btf) ? btf_nr_types(btf_vmlinux) : 1;

        while (id < nr_types) {
                const struct btf_type *type = btf_type_by_id(btf, id);

                if (BTF_INFO_KIND(type->info) == BTF_KIND_DATASEC &&
                    strcmp(btf_name_by_offset(btf, type->name_off),
                           ".data..percpu") == 0)
                        return id;

                id++;
        }

        return -ENOENT;
}

/* replace pseudo btf_id with kernel symbol address */
static int check_pseudo_btf_id(struct bpf_verifier_env *env,
                               struct bpf_insn *insn,
                               struct bpf_insn_aux_data *aux)
{
        const struct btf_var_secinfo *vsi;
        const struct btf_type *datasec;
        struct btf_mod_pair *btf_mod;
        const struct btf_type *t;
        const char *sym_name;
        bool percpu = false;
        u32 type, id = insn->imm;
        struct btf *btf;
        s32 datasec_id;
        u64 addr;
        int i, btf_fd, err;

        btf_fd = insn[1].imm;
        if (btf_fd) {
                btf = btf_get_by_fd(btf_fd);
                if (IS_ERR(btf)) {
                        verbose(env, "invalid module BTF object FD specified.\n");
                        return -EINVAL;
                }
        } else {
                if (!btf_vmlinux) {
                        verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
                        return -EINVAL;
                }
                btf = btf_vmlinux;
                btf_get(btf);
        }

        t = btf_type_by_id(btf, id);
        if (!t) {
                verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id);
                err = -ENOENT;
                goto err_put;
        }

        if (!btf_type_is_var(t) && !btf_type_is_func(t)) {
                verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id);
                err = -EINVAL;
                goto err_put;
        }

        sym_name = btf_name_by_offset(btf, t->name_off);
        addr = kallsyms_lookup_name(sym_name);
        if (!addr) {
                verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n",
                        sym_name);
                err = -ENOENT;
                goto err_put;
        }
        insn[0].imm = (u32)addr;
        insn[1].imm = addr >> 32;

        if (btf_type_is_func(t)) {
                aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
                aux->btf_var.mem_size = 0;
                goto check_btf;
        }

        datasec_id = find_btf_percpu_datasec(btf);
        if (datasec_id > 0) {
                datasec = btf_type_by_id(btf, datasec_id);
                for_each_vsi(i, datasec, vsi) {
                        if (vsi->type == id) {
                                percpu = true;
                                break;
                        }
                }
        }

        type = t->type;
        t = btf_type_skip_modifiers(btf, type, NULL);
        if (percpu) {
                aux->btf_var.reg_type = PTR_TO_BTF_ID | MEM_PERCPU;
                aux->btf_var.btf = btf;
                aux->btf_var.btf_id = type;
        } else if (!btf_type_is_struct(t)) {
                const struct btf_type *ret;
                const char *tname;
                u32 tsize;

                /* resolve the type size of ksym. */
                ret = btf_resolve_size(btf, t, &tsize);
                if (IS_ERR(ret)) {
                        tname = btf_name_by_offset(btf, t->name_off);
                        verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n",
                                tname, PTR_ERR(ret));
                        err = -EINVAL;
                        goto err_put;
                }
                aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
                aux->btf_var.mem_size = tsize;
        } else {
                aux->btf_var.reg_type = PTR_TO_BTF_ID;
                aux->btf_var.btf = btf;
                aux->btf_var.btf_id = type;
        }
check_btf:
        /* check whether we recorded this BTF (and maybe module) already */
        for (i = 0; i < env->used_btf_cnt; i++) {
                if (env->used_btfs[i].btf == btf) {
                        btf_put(btf);
                        return 0;
                }
        }

        if (env->used_btf_cnt >= MAX_USED_BTFS) {
                err = -E2BIG;
                goto err_put;
        }

        btf_mod = &env->used_btfs[env->used_btf_cnt];
        btf_mod->btf = btf;
        btf_mod->module = NULL;

        /* if we reference variables from kernel module, bump its refcount */
        if (btf_is_module(btf)) {
                btf_mod->module = btf_try_get_module(btf);
                if (!btf_mod->module) {
                        err = -ENXIO;
                        goto err_put;
                }
        }

        env->used_btf_cnt++;

        return 0;
err_put:
        btf_put(btf);
        return err;
}

/* Return true when @type is one of the program types this file treats as
 * "tracing": kprobe, tracepoint, perf_event, raw_tracepoint and
 * raw_tracepoint_writable.
 */
static bool is_tracing_prog_type(enum bpf_prog_type type)
{
        return type == BPF_PROG_TYPE_KPROBE ||
               type == BPF_PROG_TYPE_TRACEPOINT ||
               type == BPF_PROG_TYPE_PERF_EVENT ||
               type == BPF_PROG_TYPE_RAW_TRACEPOINT ||
               type == BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE;
}

static int check_map_prog_compatibility(struct bpf_verifier_env *env,
                                        struct bpf_map *map,
                                        struct bpf_prog *prog)

{
        enum bpf_prog_type prog_type = resolve_prog_type(prog);

        if (btf_record_has_field(map->record, BPF_LIST_HEAD) ||
            btf_record_has_field(map->record, BPF_RB_ROOT)) {
                if (is_tracing_prog_type(prog_type)) {
                        verbose(env, "tracing progs cannot use bpf_{list_head,rb_root} yet\n");
                        return -EINVAL;
                }
        }

        if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
                if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
                        verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n");
                        return -EINVAL;
                }

                if (is_tracing_prog_type(prog_type)) {
                        verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
                        return -EINVAL;
                }
        }

        if (btf_record_has_field(map->record, BPF_TIMER)) {
                if (is_tracing_prog_type(prog_type)) {
                        verbose(env, "tracing progs cannot use bpf_timer yet\n");
                        return -EINVAL;
                }
        }

        if (btf_record_has_field(map->record, BPF_WORKQUEUE)) {
                if (is_tracing_prog_type(prog_type)) {
                        verbose(env, "tracing progs cannot use bpf_wq yet\n");
                        return -EINVAL;
                }
        }

        if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
            !bpf_offload_prog_map_match(prog, map)) {
                verbose(env, "offload device mismatch between prog and map\n");
                return -EINVAL;
        }

        if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
                verbose(env, "bpf_struct_ops map cannot be used in prog\n");
                return -EINVAL;
        }

        if (prog->sleepable)
                switch (map->map_type) {
                case BPF_MAP_TYPE_HASH:
                case BPF_MAP_TYPE_LRU_HASH:
                case BPF_MAP_TYPE_ARRAY:
                case BPF_MAP_TYPE_PERCPU_HASH:
                case BPF_MAP_TYPE_PERCPU_ARRAY:
                case BPF_MAP_TYPE_LRU_PERCPU_HASH:
                case BPF_MAP_TYPE_ARRAY_OF_MAPS:
                case BPF_MAP_TYPE_HASH_OF_MAPS:
                case BPF_MAP_TYPE_RINGBUF:
                case BPF_MAP_TYPE_USER_RINGBUF:
                case BPF_MAP_TYPE_INODE_STORAGE:
                case BPF_MAP_TYPE_SK_STORAGE:
                case BPF_MAP_TYPE_TASK_STORAGE:
                case BPF_MAP_TYPE_CGRP_STORAGE:
                case BPF_MAP_TYPE_QUEUE:
                case BPF_MAP_TYPE_STACK:
                case BPF_MAP_TYPE_ARENA:
                        break;
                default:
                        verbose(env,
                                "Sleepable programs can only use array, hash, ringbuf and local storage maps\n");
                        return -EINVAL;
                }

        return 0;
}

/* Return true when @map is a cgroup storage map, either the shared or the
 * per-CPU variant.
 */
static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
{
        switch (map->map_type) {
        case BPF_MAP_TYPE_CGROUP_STORAGE:
        case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
                return true;
        default:
                return false;
        }
}

/* find and rewrite pseudo imm in ld_imm64 instructions:
 *
 * 1. if it accesses map FD, replace it with actual map pointer.
 * 2. if it accesses btf_id of a VAR, replace it with pointer to the var.
 * 3. if it is a BPF_PSEUDO_FUNC load, only the insn's aux ptr_type is
 *    marked as PTR_TO_FUNC; the immediate is left untouched here.
 *
 * Along the way the pass also rejects BPF_LDX insns that use reserved
 * fields, malformed ld_imm64 pairs and opcodes that
 * bpf_opcode_in_insntable() does not know about.
 *
 * Every distinct map referenced is recorded once in env->used_maps and a
 * reference is taken on it via bpf_map_inc().
 *
 * Returns 0 on success or a negative errno on the first problem found.
 *
 * NOTE: btf_vmlinux is required for converting pseudo btf_id.
 */
static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
{
        struct bpf_insn *insn = env->prog->insnsi;
        int insn_cnt = env->prog->len;
        int i, j, err;

        err = bpf_prog_calc_tag(env->prog);
        if (err)
                return err;

        for (i = 0; i < insn_cnt; i++, insn++) {
                /* LDX only supports the MEM and MEMSX modes and must have
                 * a zero imm field.
                 */
                if (BPF_CLASS(insn->code) == BPF_LDX &&
                    ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) ||
                    insn->imm != 0)) {
                        verbose(env, "BPF_LDX uses reserved fields\n");
                        return -EINVAL;
                }

                if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
                        struct bpf_insn_aux_data *aux;
                        struct bpf_map *map;
                        struct fd f;
                        u64 addr;
                        u32 fd;

                        /* ld_imm64 occupies two slots: a second insn must
                         * exist and everything in it except imm must be
                         * zero.
                         */
                        if (i == insn_cnt - 1 || insn[1].code != 0 ||
                            insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
                            insn[1].off != 0) {
                                verbose(env, "invalid bpf_ld_imm64 insn\n");
                                return -EINVAL;
                        }

                        if (insn[0].src_reg == 0)
                                /* valid generic load 64-bit imm */
                                goto next_insn;

                        if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) {
                                aux = &env->insn_aux_data[i];
                                err = check_pseudo_btf_id(env, insn, aux);
                                if (err)
                                        return err;
                                goto next_insn;
                        }

                        if (insn[0].src_reg == BPF_PSEUDO_FUNC) {
                                aux = &env->insn_aux_data[i];
                                aux->ptr_type = PTR_TO_FUNC;
                                goto next_insn;
                        }

                        /* In final convert_pseudo_ld_imm64() step, this is
                         * converted into regular 64-bit imm load insn.
                         *
                         * Only the four map flavours remain valid from here
                         * on. The *_VALUE forms use insn[1].imm as an offset
                         * (see below); the plain FD/IDX forms require it to
                         * be zero.
                         */
                        switch (insn[0].src_reg) {
                        case BPF_PSEUDO_MAP_VALUE:
                        case BPF_PSEUDO_MAP_IDX_VALUE:
                                break;
                        case BPF_PSEUDO_MAP_FD:
                        case BPF_PSEUDO_MAP_IDX:
                                if (insn[1].imm == 0)
                                        break;
                                fallthrough;
                        default:
                                verbose(env, "unrecognized bpf_ld_imm64 insn\n");
                                return -EINVAL;
                        }

                        /* Obtain the map fd: the *_IDX forms index into the
                         * fd_array supplied with the program, the others
                         * carry the fd directly in insn[0].imm.
                         */
                        switch (insn[0].src_reg) {
                        case BPF_PSEUDO_MAP_IDX_VALUE:
                        case BPF_PSEUDO_MAP_IDX:
                                if (bpfptr_is_null(env->fd_array)) {
                                        verbose(env, "fd_idx without fd_array is invalid\n");
                                        return -EPROTO;
                                }
                                if (copy_from_bpfptr_offset(&fd, env->fd_array,
                                                            insn[0].imm * sizeof(fd),
                                                            sizeof(fd)))
                                        return -EFAULT;
                                break;
                        default:
                                fd = insn[0].imm;
                                break;
                        }

                        f = fdget(fd);
                        map = __bpf_map_get(f);
                        if (IS_ERR(map)) {
                                /* NOTE(review): there is no fdput() on this
                                 * path; presumably __bpf_map_get() drops f
                                 * itself when it fails - confirm.
                                 */
                                verbose(env, "fd %d is not pointing to valid bpf_map\n", fd);
                                return PTR_ERR(map);
                        }

                        err = check_map_prog_compatibility(env, map, env->prog);
                        if (err) {
                                fdput(f);
                                return err;
                        }

                        aux = &env->insn_aux_data[i];
                        if (insn[0].src_reg == BPF_PSEUDO_MAP_FD ||
                            insn[0].src_reg == BPF_PSEUDO_MAP_IDX) {
                                /* plain map reference: load the map pointer */
                                addr = (unsigned long)map;
                        } else {
                                /* direct value access: load the address of
                                 * the map value plus the offset in insn[1].imm
                                 */
                                u32 off = insn[1].imm;

                                if (off >= BPF_MAX_VAR_OFF) {
                                        verbose(env, "direct value offset of %u is not allowed\n", off);
                                        fdput(f);
                                        return -EINVAL;
                                }

                                if (!map->ops->map_direct_value_addr) {
                                        verbose(env, "no direct value access support for this map type\n");
                                        fdput(f);
                                        return -EINVAL;
                                }

                                err = map->ops->map_direct_value_addr(map, &addr, off);
                                if (err) {
                                        verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n",
                                                map->value_size, off);
                                        fdput(f);
                                        return err;
                                }

                                aux->map_off = off;
                                addr += off;
                        }

                        /* split the 64-bit address across the two imm fields */
                        insn[0].imm = (u32)addr;
                        insn[1].imm = addr >> 32;

                        /* check whether we recorded this map already */
                        for (j = 0; j < env->used_map_cnt; j++) {
                                if (env->used_maps[j] == map) {
                                        aux->map_index = j;
                                        fdput(f);
                                        goto next_insn;
                                }
                        }

                        if (env->used_map_cnt >= MAX_USED_MAPS) {
                                verbose(env, "The total number of maps per program has reached the limit of %u\n",
                                        MAX_USED_MAPS);
                                fdput(f);
                                return -E2BIG;
                        }

                        if (env->prog->sleepable)
                                atomic64_inc(&map->sleepable_refcnt);
                        /* hold the map. If the program is rejected by verifier,
                         * the map will be released by release_maps() or it
                         * will be used by the valid program until it's unloaded
                         * and all maps are released in bpf_free_used_maps()
                         */
                        bpf_map_inc(map);

                        aux->map_index = env->used_map_cnt;
                        env->used_maps[env->used_map_cnt++] = map;

                        /* The checks below run after the map was added to
                         * used_maps, so their error paths only fdput() and
                         * leave the map recorded in used_maps.
                         */
                        if (bpf_map_is_cgroup_storage(map) &&
                            bpf_cgroup_storage_assign(env->prog->aux, map)) {
                                verbose(env, "only one cgroup storage of each type is allowed\n");
                                fdput(f);
                                return -EBUSY;
                        }
                        if (map->map_type == BPF_MAP_TYPE_ARENA) {
                                if (env->prog->aux->arena) {
                                        verbose(env, "Only one arena per program\n");
                                        fdput(f);
                                        return -EBUSY;
                                }
                                if (!env->allow_ptr_leaks || !env->bpf_capable) {
                                        verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n");
                                        fdput(f);
                                        return -EPERM;
                                }
                                if (!env->prog->jit_requested) {
                                        verbose(env, "JIT is required to use arena\n");
                                        fdput(f);
                                        return -EOPNOTSUPP;
                                }
                                if (!bpf_jit_supports_arena()) {
                                        verbose(env, "JIT doesn't support arena\n");
                                        fdput(f);
                                        return -EOPNOTSUPP;
                                }
                                env->prog->aux->arena = (void *)map;
                                if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) {
                                        verbose(env, "arena's user address must be set via map_extra or mmap()\n");
                                        fdput(f);
                                        return -EINVAL;
                                }
                        }

                        fdput(f);
next_insn:
                        /* step over the second slot of the ld_imm64 pair */
                        insn++;
                        i++;
                        continue;
                }

                /* Basic sanity check before we invest more work here. */
                if (!bpf_opcode_in_insntable(insn->code)) {
                        verbose(env, "unknown opcode %02x\n", insn->code);
                        return -EINVAL;
                }
        }

        /* now all pseudo BPF_LD_IMM64 instructions load valid
         * 'struct bpf_map *' into a register instead of user map_fd.
         * These pointers will be used later by verifier to validate map access.
         */
        return 0;
}

/* Give back the references on all maps collected in env->used_maps;
 * meant for a program that the verifier rejected.
 */
static void release_maps(struct bpf_verifier_env *env)
{
        struct bpf_prog_aux *prog_aux = env->prog->aux;

        __bpf_free_used_maps(prog_aux, env->used_maps, env->used_map_cnt);
}

/* Give back the references on all BTF objects collected in
 * env->used_btfs; meant for a program that the verifier rejected.
 */
static void release_btfs(struct bpf_verifier_env *env)
{
        struct bpf_prog_aux *prog_aux = env->prog->aux;

        __bpf_free_used_btfs(prog_aux, env->used_btfs, env->used_btf_cnt);
}

/* Turn every pseudo BPF_LD_IMM64 into a generic one by clearing its
 * src_reg. BPF_PSEUDO_FUNC loads are the exception and keep their marker.
 */
static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
{
        struct bpf_insn *insns = env->prog->insnsi;
        int nr_insns = env->prog->len;
        int idx;

        for (idx = 0; idx < nr_insns; idx++) {
                struct bpf_insn *cur = &insns[idx];

                if (cur->code == (BPF_LD | BPF_IMM | BPF_DW) &&
                    cur->src_reg != BPF_PSEUDO_FUNC)
                        cur->src_reg = 0;
        }
}

/* One instruction, env->prog->insnsi[off], has been replaced by the range
 * insnsi[off, off + cnt) in @new_prog. Rebuild the aux data to match:
 * entries [0, off) are copied unchanged and entries [off, end) are copied
 * to the end of the patched range, so the old entry for @off lands on the
 * last patched insn. The first cnt - 1 patched slots keep the zeroes of
 * @new_data except for 'seen' and 'zext_dst', which are filled in here.
 *
 * On the cnt == 1 path @new_data is not touched and only the zext_dst flag
 * of the existing entry is refreshed. Otherwise @new_data becomes
 * env->insn_aux_data and the old array is released with vfree().
 */
static void adjust_insn_aux_data(struct bpf_verifier_env *env,
                                 struct bpf_insn_aux_data *new_data,
                                 struct bpf_prog *new_prog, u32 off, u32 cnt)
{
        struct bpf_insn_aux_data *old_data = env->insn_aux_data;
        struct bpf_insn *insns = new_prog->insnsi;
        u32 seen_at_off = old_data[off].seen;
        u32 patched_end, tail_cnt, idx;

        /* The insn that ends up owning the old aux entry is not
         * necessarily the original one, not even on the cnt == 1 fast
         * path, so its zext_dst is always recomputed.
         */
        old_data[off].zext_dst = insn_has_def32(env, insns + off + cnt - 1);

        if (cnt == 1)
                return;

        patched_end = off + cnt - 1;
        tail_cnt = new_prog->len - off - cnt + 1;

        memcpy(new_data, old_data, sizeof(*old_data) * off);
        memcpy(new_data + off + cnt - 1, old_data + off,
               sizeof(*old_data) * tail_cnt);

        for (idx = off; idx < patched_end; idx++) {
                /* Every newly inserted insn inherits insnsi[off]'s seen
                 * count.
                 */
                new_data[idx].seen = seen_at_off;
                new_data[idx].zext_dst = insn_has_def32(env, insns + idx);
        }

        env->insn_aux_data = new_data;
        vfree(old_data);
}

/* After the insn at @off was replaced by @len insns, push forward the
 * start of every subprog that begins after @off by the len - 1 extra
 * insns. Nothing to do when the patch did not change the program length.
 */
static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
{
        const u32 grown_by = len - 1;
        int idx;

        if (!grown_by)
                return;

        /* '<=' on purpose: the fake 'exit' subprog entry at index
         * subprog_cnt has to move as well.
         */
        for (idx = 0; idx <= env->subprog_cnt; idx++) {
                if (env->subprog_info[idx].start > off)
                        env->subprog_info[idx].start += grown_by;
        }
}

/* After the insn at @off was replaced by @len insns, shift the insn_idx
 * of every poke descriptor that points past @off by len - 1.
 */
static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len)
{
        struct bpf_jit_poke_descriptor *pokes = prog->aux->poke_tab;
        int nr_pokes = prog->aux->size_poke_tab;
        int idx;

        for (idx = 0; idx < nr_pokes; idx++) {
                if (pokes[idx].insn_idx > off)
                        pokes[idx].insn_idx += len - 1;
        }
}

/* Replace the insn at @off with the @len insns in @patch and keep the
 * verifier's side tables (insn aux data, subprog starts, poke descriptors)
 * in step with the new layout.
 *
 * Returns the patched program, or NULL if either the aux data allocation
 * or the patching itself failed. An -ERANGE failure from
 * bpf_patch_insn_single() is additionally reported in the verifier log.
 */
static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
                                            const struct bpf_insn *patch, u32 len)
{
        struct bpf_insn_aux_data *new_data = NULL;
        struct bpf_prog *new_prog;

        /* A fresh aux array is only needed when the program grows. */
        if (len > 1) {
                new_data = vzalloc(array_size(env->prog->len + len - 1,
                                              sizeof(*new_data)));
                if (!new_data)
                        return NULL;
        }

        new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
        if (!IS_ERR(new_prog)) {
                adjust_insn_aux_data(env, new_data, new_prog, off, len);
                adjust_subprog_starts(env, off, len);
                adjust_poke_descs(new_prog, off, len);
                return new_prog;
        }

        if (PTR_ERR(new_prog) == -ERANGE)
                verbose(env,
                        "insn %d cannot be patched due to 16-bit range\n",
                        env->insn_aux_data[off].orig_idx);
        vfree(new_data);
        return NULL;
}

/* Update env->subprog_info (and prog->aux->func_info, when present) after
 * the insns [off, off + cnt) were removed from the program.
 *
 * Subprogs that lie entirely inside the removed range are dropped from both
 * tables; the start of every remaining subprog located after the removed
 * range, including the fake 'exit' entry at index subprog_cnt, is pulled in
 * by @cnt.
 *
 * Always returns 0.
 */
static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
                                              u32 off, u32 cnt)
{
        int i, j;

        /* find first prog starting at or after off (first to remove) */
        for (i = 0; i < env->subprog_cnt; i++)
                if (env->subprog_info[i].start >= off)
                        break;
        /* find first prog starting at or after off + cnt (first to stay) */
        for (j = i; j < env->subprog_cnt; j++)
                if (env->subprog_info[j].start >= off + cnt)
                        break;
        /* if j doesn't start exactly at off + cnt, we are just removing
         * the front of previous prog
         */
        if (env->subprog_info[j].start != off + cnt)
                j--;

        /* j > i means the entries [i, j) are completely gone */
        if (j > i) {
                struct bpf_prog_aux *aux = env->prog->aux;
                int move;

                /* move fake 'exit' subprog as well */
                move = env->subprog_cnt + 1 - j;

                memmove(env->subprog_info + i,
                        env->subprog_info + j,
                        sizeof(*env->subprog_info) * move);
                env->subprog_cnt -= j - i;

                /* remove func_info */
                if (aux->func_info) {
                        move = aux->func_info_cnt - j;

                        memmove(aux->func_info + i,
                                aux->func_info + j,
                                sizeof(*aux->func_info) * move);
                        aux->func_info_cnt -= j - i;
                        /* func_info->insn_off is set after all code rewrites,
                         * in adjust_btf_func() - no need to adjust
                         */
                }
        } else {
                /* convert i from "first prog to remove" to "first to adjust" */
                if (env->subprog_info[i].start == off)
                        i++;
        }

        /* update fake 'exit' subprog as well */
        for (; i <= env->subprog_cnt; i++)
                env->subprog_info[i].start -= cnt;

        return 0;
}

/* Update prog->aux->linfo after the insns [off, off + cnt) were removed
 * from the program (prog->len already reflects the removal).
 *
 * Line info records that pointed into the removed range are dropped, the
 * insn_off of the records behind it is reduced by @cnt, and the linfo_idx
 * of every subprog (including the fake 'exit' one) is fixed up to match.
 *
 * Does nothing when the program carries no line info. Always returns 0.
 */
static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
                                      u32 cnt)
{
        struct bpf_prog *prog = env->prog;
        u32 i, l_off, l_cnt, nr_linfo;
        struct bpf_line_info *linfo;

        nr_linfo = prog->aux->nr_linfo;
        if (!nr_linfo)
                return 0;

        linfo = prog->aux->linfo;

        /* find first line info to remove, count lines to be removed */
        for (i = 0; i < nr_linfo; i++)
                if (linfo[i].insn_off >= off)
                        break;

        /* l_off: index of the first affected record, l_cnt: how many
         * records fall inside the removed range; i ends up one past the
         * last of them.
         */
        l_off = i;
        l_cnt = 0;
        for (; i < nr_linfo; i++)
                if (linfo[i].insn_off < off + cnt)
                        l_cnt++;
                else
                        break;

        /* First live insn doesn't match first live linfo, it needs to "inherit"
         * last removed linfo.  prog is already modified, so prog->len == off
         * means no live instructions after (tail of the program was removed).
         */
        if (prog->len != off && l_cnt &&
            (i == nr_linfo || linfo[i].insn_off != off + cnt)) {
                l_cnt--;
                linfo[--i].insn_off = off + cnt;
        }

        /* remove the line info which refer to the removed instructions */
        if (l_cnt) {
                memmove(linfo + l_off, linfo + i,
                        sizeof(*linfo) * (nr_linfo - i));

                prog->aux->nr_linfo -= l_cnt;
                nr_linfo = prog->aux->nr_linfo;
        }

        /* pull all linfo[i].insn_off >= off + cnt in by cnt */
        for (i = l_off; i < nr_linfo; i++)
                linfo[i].insn_off -= cnt;

        /* fix up all subprogs (incl. 'exit') which start >= off */
        for (i = 0; i <= env->subprog_cnt; i++)
                if (env->subprog_info[i].linfo_idx > l_off) {
                        /* program may have started in the removed region but
                         * may not be fully removed
                         */
                        if (env->subprog_info[i].linfo_idx >= l_off + l_cnt)
                                env->subprog_info[i].linfo_idx -= l_cnt;
                        else
                                env->subprog_info[i].linfo_idx = l_off;
                }

        return 0;
}

/* Remove the @cnt insns starting at @off from env->prog and bring the
 * verifier's bookkeeping (offload state, subprog starts, line info and
 * insn aux data) in line with the shortened program.
 *
 * Returns 0 on success or the first error reported by a helper.
 */
static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
{
        struct bpf_insn_aux_data *aux = env->insn_aux_data;
        /* sampled up front: bpf_remove_insns() changes prog->len */
        unsigned int len_before = env->prog->len;
        int ret;

        if (bpf_prog_is_offloaded(env->prog->aux))
                bpf_prog_offload_remove_insns(env, off, cnt);

        ret = bpf_remove_insns(env->prog, off, cnt);
        if (!ret)
                ret = adjust_subprog_starts_after_remove(env, off, cnt);
        if (!ret)
                ret = bpf_adj_linfo_after_remove(env, off, cnt);
        if (ret)
                return ret;

        /* close the gap in the aux data array */
        memmove(aux + off, aux + off + cnt,
                sizeof(*aux) * (len_before - off - cnt));

        return 0;
}

/* The verifier's data flow analysis goes further than llvm's, so it never
 * walks branches that cannot be taken at run time. Such dead code may also
 * be planted by a malicious program, hence every insn the verifier did not
 * see is overwritten with 'ja -1'.
 *
 * Plain nops would be a worse choice: if they sat at the very end of the
 * program and some other bug let execution reach them, we would run off
 * the end of program memory. Returning an exception code is no option
 * either, because the dead code may live inside a subprog.
 */
static void sanitize_dead_code(struct bpf_verifier_env *env)
{
        const struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1);
        struct bpf_insn_aux_data *aux = env->insn_aux_data;
        struct bpf_insn *insns = env->prog->insnsi;
        const int nr_insns = env->prog->len;
        int idx;

        for (idx = 0; idx < nr_insns; idx++) {
                if (!aux[idx].seen) {
                        memcpy(&insns[idx], &trap, sizeof(trap));
                        aux[idx].zext_dst = false;
                }
        }
}

/* Return true when opcode @code is a conditional jump: any BPF_JMP32 op
 * other than BPF_JA, or any BPF_JMP op other than BPF_JA, BPF_EXIT and
 * BPF_CALL.
 */
static bool insn_is_cond_jump(u8 code)
{
        const u8 op = BPF_OP(code);

        switch (BPF_CLASS(code)) {
        case BPF_JMP32:
                return op != BPF_JA;
        case BPF_JMP:
                return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
        default:
                return false;
        }
}

/* Rewrite each conditional jump that has one side the verifier never saw
 * into an unconditional 'ja' to the other side. Offloaded programs are
 * told about every replacement.
 */
static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
{
        struct bpf_insn_aux_data *aux = env->insn_aux_data;
        struct bpf_insn *insns = env->prog->insnsi;
        struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
        const int nr_insns = env->prog->len;
        int idx;

        for (idx = 0; idx < nr_insns; idx++) {
                struct bpf_insn *cur = &insns[idx];

                if (!insn_is_cond_jump(cur->code))
                        continue;

                if (!aux[idx + 1].seen) {
                        /* fall-through is dead: always jump to the target */
                        ja.off = cur->off;
                } else if (!aux[idx + 1 + cur->off].seen) {
                        /* target is dead: always fall through */
                        ja.off = 0;
                } else {
                        continue;
                }

                if (bpf_prog_is_offloaded(env->prog->aux))
                        bpf_prog_offload_replace_insn(env, idx, &ja);

                memcpy(cur, &ja, sizeof(ja));
        }
}

/* Delete every run of consecutive insns that the verifier never saw.
 *
 * Returns 0 on success or the error from verifier_remove_insns().
 */
static int opt_remove_dead_code(struct bpf_verifier_env *env)
{
        struct bpf_insn_aux_data *aux = env->insn_aux_data;
        int nr_insns = env->prog->len;
        int start, err;

        for (start = 0; start < nr_insns; start++) {
                int run;

                /* measure the unseen run beginning at 'start' */
                for (run = 0; start + run < nr_insns; run++)
                        if (aux[start + run].seen)
                                break;

                if (run) {
                        err = verifier_remove_insns(env, start, run);
                        if (err)
                                return err;
                        nr_insns = env->prog->len;
                }
        }

        return 0;
}

static int opt_remove_nops(struct bpf_verifier_env *env)
{
        const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
        struct bpf_insn *insn = env->prog->insnsi;
        int insn_cnt = env->prog->len;
        int i, err;

        for (i = 0; i < insn_cnt; i++) {
                if (memcmp(&insn[i], &ja, sizeof(ja)))
                        continue;

                err = verifier_remove_insns(env, i, 1);
                if (err)
                        return err;
                insn_cnt--;
                i--;
        }

        return 0;
}

/* Patch the program around insns that define a 32-bit sub-register:
 *
 * - If the insn's aux zext_dst flag is set and either the JIT asks for
 *   explicit zero extension or the insn is a CMPXCHG, a BPF_ZEXT_REG of the
 *   defined register is inserted right after the insn.
 * - If zext_dst is not set and BPF_F_TEST_RND_HI32 is given in
 *   attr->prog_flags, three insns are inserted after the insn which OR a
 *   random 32-bit value, shifted left by 32, into the defined register.
 *
 * Returns 0 on success, -ENOMEM if patching failed, or -EFAULT if zext_dst
 * is set on an insn that defines no register.
 */
static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
                                         const union bpf_attr *attr)
{
        struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4];
        struct bpf_insn_aux_data *aux = env->insn_aux_data;
        int i, patch_len, delta = 0, len = env->prog->len;
        struct bpf_insn *insns = env->prog->insnsi;
        struct bpf_prog *new_prog;
        bool rnd_hi32;

        rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32;
        /* Slot 0 of each patch buffer is filled with the current insn
         * inside the loop; the trailing slots are prepared once here and
         * only have their register / imm fields updated per insn.
         */
        zext_patch[1] = BPF_ZEXT_REG(0);
        rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0);
        rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
        rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX);
        /* i walks the original program; delta is the number of insns added
         * so far, so adj_idx is the position in the patched program.
         */
        for (i = 0; i < len; i++) {
                int adj_idx = i + delta;
                struct bpf_insn insn;
                int load_reg;

                insn = insns[adj_idx];
                load_reg = insn_def_regno(&insn);
                if (!aux[adj_idx].zext_dst) {
                        u8 code, class;
                        u32 imm_rnd;

                        if (!rnd_hi32)
                                continue;

                        code = insn.code;
                        class = BPF_CLASS(code);
                        if (load_reg == -1)
                                continue;

                        /* NOTE: arg "reg" (the fourth one) is only used for
                         *       BPF_STX + SRC_OP, so it is safe to pass NULL
                         *       here.
                         */
                        if (is_reg64(env, &insn, load_reg, NULL, DST_OP)) {
                                /* a BPF_LD | BPF_IMM insn takes two slots;
                                 * skip the second one too
                                 */
                                if (class == BPF_LD &&
                                    BPF_MODE(code) == BPF_IMM)
                                        i++;
                                continue;
                        }

                        /* ctx load could be transformed into wider load. */
                        if (class == BPF_LDX &&
                            aux[adj_idx].ptr_type == PTR_TO_CTX)
                                continue;

                        imm_rnd = get_random_u32();
                        rnd_hi32_patch[0] = insn;
                        rnd_hi32_patch[1].imm = imm_rnd;
                        rnd_hi32_patch[3].dst_reg = load_reg;
                        patch = rnd_hi32_patch;
                        patch_len = 4;
                        goto apply_patch_buffer;
                }

                /* Add in an zero-extend instruction if a) the JIT has requested
                 * it or b) it's a CMPXCHG.
                 *
                 * The latter is because: BPF_CMPXCHG always loads a value into
                 * R0, therefore always zero-extends. However some archs'
                 * equivalent instruction only does this load when the
                 * comparison is successful. This detail of CMPXCHG is
                 * orthogonal to the general zero-extension behaviour of the
                 * CPU, so it's treated independently of bpf_jit_needs_zext.
                 */
                if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
                        continue;

                /* Zero-extension is done by the caller. */
                if (bpf_pseudo_kfunc_call(&insn))
                        continue;

                if (WARN_ON(load_reg == -1)) {
                        verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n");
                        return -EFAULT;
                }

                zext_patch[0] = insn;
                zext_patch[1].dst_reg = load_reg;
                zext_patch[1].src_reg = load_reg;
                patch = zext_patch;
                patch_len = 2;
apply_patch_buffer:
                new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len);
                if (!new_prog)
                        return -ENOMEM;
                /* patching may have replaced the program and its aux array,
                 * so refresh the cached pointers
                 */
                env->prog = new_prog;
                insns = new_prog->insnsi;
                aux = env->insn_aux_data;
                delta += patch_len - 1;
        }

        return 0;
}

/* convert load and store instructions that access fields of a context type
 * into a sequence of instructions that access fields of the underlying
 * structure:
 *     struct __sk_buff    -> struct sk_buff
 *     struct bpf_sock_ops -> struct sock
 *
 * The same walk over the program also:
 *  - patches in the instructions produced by ops->gen_prologue(), if any,
 *    at instruction 0;
 *  - appends BPF_ST_NOSPEC() after stores whose aux data has
 *    sanitize_stack_spill set;
 *  - rewrites the mode of loads through PTR_TO_BTF_ID (and its untrusted
 *    variants) to BPF_PROBE_MEM/BPF_PROBE_MEMSX, of loads and stores through
 *    PTR_TO_ARENA to BPF_PROBE_MEM32, and of 32/64-bit atomics on
 *    PTR_TO_ARENA to BPF_PROBE_ATOMIC, bumping prog->aux->num_exentries for
 *    each rewritten instruction.
 *
 * Returns 0 on success, -EINVAL when a callback returns an instruction count
 * or target size this function cannot accept, -ENOMEM when
 * bpf_patch_insn_data() fails, and -EOPNOTSUPP for a sign-extending load
 * from an arena pointer.
 */
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
        const struct bpf_verifier_ops *ops = env->ops;
        int i, cnt, size, ctx_field_size, delta = 0;
        /* sampled before any patching: the loop below counts original insns */
        const int insn_cnt = env->prog->len;
        struct bpf_insn insn_buf[16], *insn;
        u32 target_size, size_default, off;
        struct bpf_prog *new_prog;
        enum bpf_access_type type;
        bool is_narrower_load;

        if (ops->gen_prologue || env->seen_direct_write) {
                /* direct writes were seen but there is no prologue generator */
                if (!ops->gen_prologue) {
                        verbose(env, "bpf verifier is misconfigured\n");
                        return -EINVAL;
                }
                cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
                                        env->prog);
                if (cnt >= ARRAY_SIZE(insn_buf)) {
                        verbose(env, "bpf verifier is misconfigured\n");
                        return -EINVAL;
                } else if (cnt) {
                        new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        env->prog = new_prog;
                        /* delta = number of insns added so far; i + delta
                         * indexes the patched program and its aux data
                         */
                        delta += cnt - 1;
                }
        }

        /* offloaded programs get the prologue only; no per-insn conversion */
        if (bpf_prog_is_offloaded(env->prog->aux))
                return 0;

        insn = env->prog->insnsi + delta;

        for (i = 0; i < insn_cnt; i++, insn++) {
                bpf_convert_ctx_access_t convert_ctx_access;
                u8 mode;

                /* classify the instruction; anything that is not a plain
                 * load/store (or an arena atomic) is left untouched
                 */
                if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
                    insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
                    insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
                    insn->code == (BPF_LDX | BPF_MEM | BPF_DW) ||
                    insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) ||
                    insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) ||
                    insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) {
                        type = BPF_READ;
                } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
                           insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
                           insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
                           insn->code == (BPF_STX | BPF_MEM | BPF_DW) ||
                           insn->code == (BPF_ST | BPF_MEM | BPF_B) ||
                           insn->code == (BPF_ST | BPF_MEM | BPF_H) ||
                           insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
                           insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
                        type = BPF_WRITE;
                } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) ||
                            insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) &&
                           env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) {
                        /* arena atomic: switch to the probing mode in place */
                        insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code);
                        env->prog->aux->num_exentries++;
                        continue;
                } else {
                        continue;
                }

                /* flagged store: re-emit it followed by BPF_ST_NOSPEC() and
                 * do no further conversion on it
                 */
                if (type == BPF_WRITE &&
                    env->insn_aux_data[i + delta].sanitize_stack_spill) {
                        struct bpf_insn patch[] = {
                                *insn,
                                BPF_ST_NOSPEC(),
                        };

                        cnt = ARRAY_SIZE(patch);
                        new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta    += cnt - 1;
                        env->prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
                        continue;
                }

                /* pick the converter from the pointer type recorded for this
                 * instruction; types without a converter are skipped
                 */
                switch ((int)env->insn_aux_data[i + delta].ptr_type) {
                case PTR_TO_CTX:
                        if (!ops->convert_ctx_access)
                                continue;
                        convert_ctx_access = ops->convert_ctx_access;
                        break;
                case PTR_TO_SOCKET:
                case PTR_TO_SOCK_COMMON:
                        convert_ctx_access = bpf_sock_convert_ctx_access;
                        break;
                case PTR_TO_TCP_SOCK:
                        convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
                        break;
                case PTR_TO_XDP_SOCK:
                        convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
                        break;
                case PTR_TO_BTF_ID:
                case PTR_TO_BTF_ID | PTR_UNTRUSTED:
                /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike
                 * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot
                 * be said once it is marked PTR_UNTRUSTED, hence we must handle
                 * any faults for loads into such types. BPF_WRITE is disallowed
                 * for this case.
                 */
                case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
                        /* only loads are rewritten; the sign-extending mode
                         * is preserved in its probing counterpart
                         */
                        if (type == BPF_READ) {
                                if (BPF_MODE(insn->code) == BPF_MEM)
                                        insn->code = BPF_LDX | BPF_PROBE_MEM |
                                                     BPF_SIZE((insn)->code);
                                else
                                        insn->code = BPF_LDX | BPF_PROBE_MEMSX |
                                                     BPF_SIZE((insn)->code);
                                env->prog->aux->num_exentries++;
                        }
                        continue;
                case PTR_TO_ARENA:
                        if (BPF_MODE(insn->code) == BPF_MEMSX) {
                                verbose(env, "sign extending loads from arena are not supported yet\n");
                                return -EOPNOTSUPP;
                        }
                        /* keep class and size, replace the mode */
                        insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
                        env->prog->aux->num_exentries++;
                        continue;
                default:
                        continue;
                }

                ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
                size = BPF_LDST_BYTES(insn);
                /* remember the original mode: a narrow load rewrites
                 * insn->code below
                 */
                mode = BPF_MODE(insn->code);

                /* If the read access is a narrower load of the field,
                 * convert to a 4/8-byte load, to minimize program type specific
                 * convert_ctx_access changes. If conversion is successful,
                 * we will apply proper mask to the result.
                 */
                is_narrower_load = size < ctx_field_size;
                size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
                off = insn->off;
                if (is_narrower_load) {
                        u8 size_code;

                        if (type == BPF_WRITE) {
                                verbose(env, "bpf verifier narrow ctx access misconfigured\n");
                                return -EINVAL;
                        }

                        /* widen to the field's size: 2 bytes unless the
                         * field is 4 or 8 bytes wide
                         */
                        size_code = BPF_H;
                        if (ctx_field_size == 4)
                                size_code = BPF_W;
                        else if (ctx_field_size == 8)
                                size_code = BPF_DW;

                        /* clear the low offset bits (aligns down when
                         * size_default is a power of two)
                         */
                        insn->off = off & ~(size_default - 1);
                        insn->code = BPF_LDX | BPF_MEM | size_code;
                }

                target_size = 0;
                cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
                                         &target_size);
                /* the converter must emit at least one insn, leave room in
                 * insn_buf, and report a target size for sized ctx fields
                 */
                if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
                    (ctx_field_size && !target_size)) {
                        verbose(env, "bpf verifier is misconfigured\n");
                        return -EINVAL;
                }

                if (is_narrower_load && size < target_size) {
                        /* bit position of the requested bytes inside the
                         * wider value that was loaded
                         */
                        u8 shift = bpf_ctx_narrow_access_offset(
                                off, size, size_default) * 8;
                        if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
                                verbose(env, "bpf verifier narrow ctx load misconfigured\n");
                                return -EINVAL;
                        }
                        /* shift the wanted bytes down (if needed), then mask
                         * the result to 'size' bytes
                         */
                        if (ctx_field_size <= 4) {
                                if (shift)
                                        insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
                                                                        insn->dst_reg,
                                                                        shift);
                                insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
                                                                (1 << size * 8) - 1);
                        } else {
                                if (shift)
                                        insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
                                                                        insn->dst_reg,
                                                                        shift);
                                insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
                                                                (1ULL << size * 8) - 1);
                        }
                }
                /* the original load was sign-extending: append a reg-to-reg
                 * ALU64 MOV of dst_reg carrying size * 8 (presumably the
                 * sign-extending move encoding - confirm against the ISA).
                 * NOTE(review): this append has no bounds check of its own
                 * against ARRAY_SIZE(insn_buf); confirm converters never
                 * return enough insns for it to overflow.
                 */
                if (mode == BPF_MEMSX)
                        insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X,
                                                       insn->dst_reg, insn->dst_reg,
                                                       size * 8, 0);

                new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
                if (!new_prog)
                        return -ENOMEM;

                delta += cnt - 1;

                /* keep walking new program and skip insns we just inserted */
                env->prog = new_prog;
                insn      = new_prog->insnsi + i + delta;
        }

        return 0;
}

/* JIT-compile a program made of more than one subprogram.
 *
 * Every subprogram is copied into its own struct bpf_prog (func[i]) and
 * compiled twice: a first pass so that each function gets an address, then,
 * once call and ld_imm64 immediates have been pointed at those addresses, a
 * final pass that must return the same prog with an unchanged bpf_func.
 *
 * Returns 0 on success, or right away when there is at most one subprog.
 * Returns -EFAULT when a call target does not start a subprog; in that case
 * no cleanup is done. Any other failure frees the per-function progs, clears
 * prog->jit_requested and prog->blinding_requested, restores the pseudo-call
 * instructions of the main prog and returns a negative errno.
 */
static int jit_subprogs(struct bpf_verifier_env *env)
{
        struct bpf_prog *prog = env->prog, **func, *tmp;
        int i, j, subprog_start, subprog_end = 0, len, subprog;
        struct bpf_map *map_ptr;
        struct bpf_insn *insn;
        void *old_bpf_func;
        int err, num_exentries;

        if (env->subprog_cnt <= 1)
                return 0;

        for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
                if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn))
                        continue;

                /* Upon error here we cannot fall back to interpreter but
                 * need a hard reject of the program. Thus -EFAULT is
                 * propagated in any case.
                 */
                subprog = find_subprog(env, i + insn->imm + 1);
                if (subprog < 0) {
                        WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
                                  i + insn->imm + 1);
                        return -EFAULT;
                }
                /* temporarily remember subprog id inside insn instead of
                 * aux_data, since next loop will split up all insns into funcs
                 */
                insn->off = subprog;
                /* remember original imm in case JIT fails and fallback
                 * to interpreter will be needed
                 */
                env->insn_aux_data[i].call_imm = insn->imm;
                /* point imm to __bpf_call_base+1 from JITs point of view */
                insn->imm = 1;
                if (bpf_pseudo_func(insn)) {
#if defined(MODULES_VADDR)
                        u64 addr = MODULES_VADDR;
#else
                        u64 addr = VMALLOC_START;
#endif
                        /* jit (e.g. x86_64) may emit fewer instructions
                         * if it learns a u32 imm is the same as a u64 imm.
                         * Set close enough to possible prog address.
                         */
                        insn[0].imm = (u32)addr;
                        insn[1].imm = addr >> 32;
                }
        }

        err = bpf_prog_alloc_jited_linfo(prog);
        if (err)
                goto out_undo_insn;

        /* default for the failures below that jump out without setting err
         * (kcalloc, bpf_prog_alloc_no_stats, bpf_prog_calc_tag)
         */
        err = -ENOMEM;
        func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL);
        if (!func)
                goto out_undo_insn;

        /* first pass: build and JIT one prog per subprogram */
        for (i = 0; i < env->subprog_cnt; i++) {
                /* subprog i ends where subprog i + 1 starts; the first one
                 * begins at insn 0
                 */
                subprog_start = subprog_end;
                subprog_end = env->subprog_info[i + 1].start;

                len = subprog_end - subprog_start;
                /* bpf_prog_run() doesn't call subprogs directly,
                 * hence main prog stats include the runtime of subprogs.
                 * subprogs don't have IDs and not reachable via prog_get_next_id
                 * func[i]->stats will never be accessed and stays NULL
                 */
                func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
                if (!func[i])
                        goto out_free;
                memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
                       len * sizeof(struct bpf_insn));
                func[i]->type = prog->type;
                func[i]->len = len;
                if (bpf_prog_calc_tag(func[i]))
                        goto out_free;
                func[i]->is_func = 1;
                func[i]->sleepable = prog->sleepable;
                func[i]->aux->func_idx = i;
                /* Below members will be freed only at prog->aux */
                func[i]->aux->btf = prog->aux->btf;
                func[i]->aux->func_info = prog->aux->func_info;
                func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
                func[i]->aux->poke_tab = prog->aux->poke_tab;
                func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;

                /* hand the poke descriptors that fall inside this subprog's
                 * instruction range over to its aux
                 */
                for (j = 0; j < prog->aux->size_poke_tab; j++) {
                        struct bpf_jit_poke_descriptor *poke;

                        poke = &prog->aux->poke_tab[j];
                        if (poke->insn_idx < subprog_end &&
                            poke->insn_idx >= subprog_start)
                                poke->aux = func[i]->aux;
                }

                func[i]->aux->name[0] = 'F';
                func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
                func[i]->jit_requested = 1;
                func[i]->blinding_requested = prog->blinding_requested;
                func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
                func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab;
                func[i]->aux->linfo = prog->aux->linfo;
                func[i]->aux->nr_linfo = prog->aux->nr_linfo;
                func[i]->aux->jited_linfo = prog->aux->jited_linfo;
                func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
                func[i]->aux->arena = prog->aux->arena;
                /* count this subprog's instructions that use one of the
                 * BPF_PROBE_* modes; the total becomes aux->num_exentries
                 */
                num_exentries = 0;
                insn = func[i]->insnsi;
                for (j = 0; j < func[i]->len; j++, insn++) {
                        if (BPF_CLASS(insn->code) == BPF_LDX &&
                            (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
                             BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
                             BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
                                num_exentries++;
                        if ((BPF_CLASS(insn->code) == BPF_STX ||
                             BPF_CLASS(insn->code) == BPF_ST) &&
                             BPF_MODE(insn->code) == BPF_PROBE_MEM32)
                                num_exentries++;
                        if (BPF_CLASS(insn->code) == BPF_STX &&
                             BPF_MODE(insn->code) == BPF_PROBE_ATOMIC)
                                num_exentries++;
                }
                func[i]->aux->num_exentries = num_exentries;
                func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
                func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
                /* only the main prog (index 0) is an exception boundary */
                if (!i)
                        func[i]->aux->exception_boundary = env->seen_exception;
                func[i] = bpf_int_jit_compile(func[i]);
                if (!func[i]->jited) {
                        err = -ENOTSUPP;
                        goto out_free;
                }
                cond_resched();
        }

        /* at this point all bpf functions were successfully JITed
         * now populate all bpf_calls with correct addresses and
         * run last pass of JIT
         */
        for (i = 0; i < env->subprog_cnt; i++) {
                insn = func[i]->insnsi;
                for (j = 0; j < func[i]->len; j++, insn++) {
                        /* insn->off still holds the subprog id stored by the
                         * first loop of this function
                         */
                        if (bpf_pseudo_func(insn)) {
                                subprog = insn->off;
                                insn[0].imm = (u32)(long)func[subprog]->bpf_func;
                                insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
                                continue;
                        }
                        if (!bpf_pseudo_call(insn))
                                continue;
                        subprog = insn->off;
                        insn->imm = BPF_CALL_IMM(func[subprog]->bpf_func);
                }

                /* we use the aux data to keep a list of the start addresses
                 * of the JITed images for each function in the program
                 *
                 * for some architectures, such as powerpc64, the imm field
                 * might not be large enough to hold the offset of the start
                 * address of the callee's JITed image from __bpf_call_base
                 *
                 * in such cases, we can lookup the start address of a callee
                 * by using its subprog id, available from the off field of
                 * the call instruction, as an index for this list
                 */
                func[i]->aux->func = func;
                func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
                func[i]->aux->real_func_cnt = env->subprog_cnt;
        }
        /* final pass: the JIT has to hand back the same prog with the same
         * entry address, otherwise the addresses patched in above are stale
         */
        for (i = 0; i < env->subprog_cnt; i++) {
                old_bpf_func = func[i]->bpf_func;
                tmp = bpf_int_jit_compile(func[i]);
                if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
                        verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
                        err = -ENOTSUPP;
                        goto out_free;
                }
                cond_resched();
        }

        /* finally lock prog and jit images for all functions and
         * populate kallsyms. Begin at the first subprogram, since
         * bpf_prog_load will add the kallsyms for the main program.
         */
        for (i = 1; i < env->subprog_cnt; i++) {
                err = bpf_prog_lock_ro(func[i]);
                if (err)
                        goto out_free;
        }

        for (i = 1; i < env->subprog_cnt; i++)
                bpf_prog_kallsyms_add(func[i]);

        /* Last step: make now unused interpreter insns from main
         * prog consistent for later dump requests, so they can
         * later look the same as if they were interpreted only.
         */
        for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
                if (bpf_pseudo_func(insn)) {
                        insn[0].imm = env->insn_aux_data[i].call_imm;
                        insn[1].imm = insn->off;
                        insn->off = 0;
                        continue;
                }
                if (!bpf_pseudo_call(insn))
                        continue;
                insn->off = env->insn_aux_data[i].call_imm;
                subprog = find_subprog(env, i + insn->off + 1);
                insn->imm = subprog;
        }

        /* publish func[0] as the main program's JITed image */
        prog->jited = 1;
        prog->bpf_func = func[0]->bpf_func;
        prog->jited_len = func[0]->jited_len;
        prog->aux->extable = func[0]->aux->extable;
        prog->aux->num_exentries = func[0]->aux->num_exentries;
        prog->aux->func = func;
        prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
        prog->aux->real_func_cnt = env->subprog_cnt;
        prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
        prog->aux->exception_boundary = func[0]->aux->exception_boundary;
        bpf_prog_jit_attempt_done(prog);
        return 0;
out_free:
        /* We failed JIT'ing, so at this point we need to unregister poke
         * descriptors from subprogs, so that kernel is not attempting to
         * patch it anymore as we're freeing the subprog JIT memory.
         */
        for (i = 0; i < prog->aux->size_poke_tab; i++) {
                map_ptr = prog->aux->poke_tab[i].tail_call.map;
                map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
        }
        /* At this point we're guaranteed that poke descriptors are not
         * live anymore. We can just unlink its descriptor table as it's
         * released with the main prog.
         */
        for (i = 0; i < env->subprog_cnt; i++) {
                /* func[] came from kcalloc, so unallocated slots are NULL */
                if (!func[i])
                        continue;
                func[i]->aux->poke_tab = NULL;
                bpf_jit_free(func[i]);
        }
        kfree(func);
out_undo_insn:
        /* cleanup main prog to be interpreted */
        prog->jit_requested = 0;
        prog->blinding_requested = 0;
        /* NOTE(review): only pseudo-call insns are restored here; the
         * off/imm rewrites applied to pseudo-func insns in the first loop
         * are left in place - confirm that is intended.
         */
        for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
                if (!bpf_pseudo_call(insn))
                        continue;
                insn->off = 0;
                insn->imm = env->insn_aux_data[i].call_imm;
        }
        bpf_prog_jit_attempt_done(prog);
        return err;
}

/* Resolve bpf-to-bpf calls: JIT the subprograms when a JIT was requested,
 * otherwise (and only when the interpreter is compiled in) patch each call
 * for interpreted execution.
 *
 * Returns 0 on success. A jit_subprogs() result of -EFAULT is always
 * propagated. Without CONFIG_BPF_JIT_ALWAYS_ON, any other JIT failure falls
 * through to the interpreter path, which rejects programs it cannot run
 * with -EINVAL; with it, the JIT error is returned as is.
 */
static int fixup_call_args(struct bpf_verifier_env *env)
{
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
        struct bpf_prog *main_prog = env->prog;
        struct bpf_insn *cur = main_prog->insnsi;
        bool uses_kfuncs = bpf_prog_has_kfunc_call(main_prog);
        int idx, callee_depth;
#endif
        int ret = 0;

        if (env->prog->jit_requested &&
            !bpf_prog_is_offloaded(env->prog->aux)) {
                ret = jit_subprogs(env);
                /* success and the hard-reject error are both final */
                if (!ret || ret == -EFAULT)
                        return ret;
        }
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
        if (uses_kfuncs) {
                verbose(env, "calling kernel functions are not allowed in non-JITed programs\n");
                return -EINVAL;
        }
        if (env->prog->aux->tail_call_reachable && env->subprog_cnt > 1) {
                /* When JIT fails the progs with bpf2bpf calls and tail_calls
                 * have to be rejected, since interpreter doesn't support them yet.
                 */
                verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
                return -EINVAL;
        }
        for (idx = 0; idx < main_prog->len; idx++, cur++) {
                if (bpf_pseudo_func(cur)) {
                        /* When JIT fails the progs with callback calls
                         * have to be rejected, since interpreter doesn't support them yet.
                         */
                        verbose(env, "callbacks are not allowed in non-JITed programs\n");
                        return -EINVAL;
                }

                if (bpf_pseudo_call(cur)) {
                        callee_depth = get_callee_stack_depth(env, cur, idx);
                        if (callee_depth < 0)
                                return callee_depth;
                        bpf_patch_call_args(cur, callee_depth);
                }
        }
        /* every call was patched: discard any earlier JIT error */
        ret = 0;
#endif
        return ret;
}

/* Replace a generic kfunc with a specialized version if necessary.
 *
 * *addr is overwritten only when a specialization applies:
 *  - a device-bound kfunc for which bpf_dev_bound_resolve_kfunc() returns
 *    an implementation;
 *  - bpf_dynptr_from_skb (with a zero offset) in a program that may not
 *    write packet data directly, which gets the read-only variant.
 */
static void specialize_kfunc(struct bpf_verifier_env *env,
                             u32 func_id, u16 offset, unsigned long *addr)
{
        bool saved_direct_write;
        void *dev_kfunc;

        if (bpf_dev_bound_kfunc_id(func_id)) {
                dev_kfunc = bpf_dev_bound_resolve_kfunc(env->prog, func_id);
                if (dev_kfunc) {
                        *addr = (unsigned long)dev_kfunc;
                        return;
                }
                /* fallback to default kfunc when not supported by netdev */
        }

        if (offset)
                return;

        if (func_id != special_kfunc_list[KF_bpf_dynptr_from_skb])
                return;

        /* may_access_direct_pkt_data mutates env->seen_direct_write, so
         * take a snapshot before probing and put it back afterwards
         */
        saved_direct_write = env->seen_direct_write;

        if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE))
                *addr = (unsigned long)bpf_dynptr_from_skb_rdonly;

        env->seen_direct_write = saved_direct_write;
}

/* Emit the replacement sequence for a collection-insert kfunc call into
 * insn_buf: load insn_aux->kptr_struct_meta into struct_meta_reg, load
 * insn_aux->insert_off into node_offset_reg, then the original call *insn.
 * *cnt receives the number of instructions written (4).
 */
static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
                                            u16 struct_meta_reg,
                                            u16 node_offset_reg,
                                            struct bpf_insn *insn,
                                            struct bpf_insn *insn_buf,
                                            int *cnt)
{
        /* the 64-bit immediate load takes two instruction slots */
        struct bpf_insn ld_meta[2] = {
                BPF_LD_IMM64(struct_meta_reg, (long)insn_aux->kptr_struct_meta)
        };
        struct bpf_insn *out = insn_buf;

        *out++ = ld_meta[0];
        *out++ = ld_meta[1];
        *out++ = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off);
        *out++ = *insn;

        *cnt = (int)(out - insn_buf);
}

static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
                            struct bpf_insn *insn_buf, int insn_idx, int *cnt)
{
        const struct bpf_kfunc_desc *desc;

        if (!insn->imm) {
                verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
                return -EINVAL;
        }

        *cnt = 0;

        /* insn->imm has the btf func_id. Replace it with an offset relative to
         * __bpf_call_base, unless the JIT needs to call functions that are
         * further than 32 bits away (bpf_jit_supports_far_kfunc_call()).
         */
        desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
        if (!desc) {
                verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n",
                        insn->imm);
                return -EFAULT;
        }

        if (!bpf_jit_supports_far_kfunc_call())
                insn->imm = BPF_CALL_IMM(desc->addr);
        if (insn->off)
                return 0;
        if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
            desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
                struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
                struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
                u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;

                if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) {
                        verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n",
                                insn_idx);
                        return -EFAULT;
                }

                insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
                insn_buf[1] = addr[0];
                insn_buf[2] = addr[1];
                insn_buf[3] = *insn;
                *cnt = 4;
        } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
                   desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] ||
                   desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
                struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
                struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };

                if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) {
                        verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n",
                                insn_idx);
                        return -EFAULT;
                }

                if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
                    !kptr_struct_meta) {
                        verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
                                insn_idx);
                        return -EFAULT;
                }

                insn_buf[0] = addr[0];
                insn_buf[1] = addr[1];
                insn_buf[2] = *insn;
                *cnt = 3;
        } else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
                   desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
                   desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
                struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
                int struct_meta_reg = BPF_REG_3;
                int node_offset_reg = BPF_REG_4;

                /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
                if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
                        struct_meta_reg = BPF_REG_4;
                        node_offset_reg = BPF_REG_5;
                }

                if (!kptr_struct_meta) {
                        verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
                                insn_idx);
                        return -EFAULT;
                }

                __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
                                                node_offset_reg, insn, insn_buf, cnt);
        } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
                   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
                insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
                *cnt = 1;
        } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) {
                struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) };

                insn_buf[0] = ld_addrs[0];
                insn_buf[1] = ld_addrs[1];
                insn_buf[2] = *insn;
                *cnt = 3;
        }
        return 0;
}

/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */
static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
{
        struct bpf_subprog_info *info = env->subprog_info;
        int cnt = env->subprog_cnt;
        struct bpf_prog *prog;

        /* We only reserve one slot for hidden subprogs in subprog_info. */
        if (env->hidden_subprog_cnt) {
                verbose(env, "verifier internal error: only one hidden subprog supported\n");
                return -EFAULT;
        }
        /* We're not patching any existing instruction, just appending the new
         * ones for the hidden subprog. Hence all of the adjustment operations
         * in bpf_patch_insn_data are no-ops.
         */
        prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len);
        if (!prog)
                return -ENOMEM;
        env->prog = prog;
        info[cnt + 1].start = info[cnt].start;
        info[cnt].start = prog->len - len + 1;
        env->subprog_cnt++;
        env->hidden_subprog_cnt++;
        return 0;
}

/* Do various post-verification rewrites in a single program pass.
 * These rewrites simplify JIT and interpreter implementations.
 */
static int do_misc_fixups(struct bpf_verifier_env *env)
{
        struct bpf_prog *prog = env->prog;
        enum bpf_attach_type eatype = prog->expected_attach_type;
        enum bpf_prog_type prog_type = resolve_prog_type(prog);
        struct bpf_insn *insn = prog->insnsi;
        const struct bpf_func_proto *fn;
        const int insn_cnt = prog->len;
        const struct bpf_map_ops *ops;
        struct bpf_insn_aux_data *aux;
        struct bpf_insn insn_buf[16];
        struct bpf_prog *new_prog;
        struct bpf_map *map_ptr;
        int i, ret, cnt, delta = 0, cur_subprog = 0;
        struct bpf_subprog_info *subprogs = env->subprog_info;
        u16 stack_depth = subprogs[cur_subprog].stack_depth;
        u16 stack_depth_extra = 0;

        if (env->seen_exception && !env->exception_callback_subprog) {
                struct bpf_insn patch[] = {
                        env->prog->insnsi[insn_cnt - 1],
                        BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
                        BPF_EXIT_INSN(),
                };

                ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch));
                if (ret < 0)
                        return ret;
                prog = env->prog;
                insn = prog->insnsi;

                env->exception_callback_subprog = env->subprog_cnt - 1;
                /* Don't update insn_cnt, as add_hidden_subprog always appends insns */
                mark_subprog_exc_cb(env, env->exception_callback_subprog);
        }

        for (i = 0; i < insn_cnt;) {
                if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) {
                        if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) ||
                            (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) {
                                /* convert to 32-bit mov that clears upper 32-bit */
                                insn->code = BPF_ALU | BPF_MOV | BPF_X;
                                /* clear off and imm, so it's a normal 'wX = wY' from JIT pov */
                                insn->off = 0;
                                insn->imm = 0;
                        } /* cast from as(0) to as(1) should be handled by JIT */
                        goto next_insn;
                }

                if (env->insn_aux_data[i + delta].needs_zext)
                        /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */
                        insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code);

                /* Make divide-by-zero exceptions impossible. */
                if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
                    insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
                    insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
                    insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
                        bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
                        bool isdiv = BPF_OP(insn->code) == BPF_DIV;
                        struct bpf_insn *patchlet;
                        struct bpf_insn chk_and_div[] = {
                                /* [R,W]x div 0 -> 0 */
                                BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
                                             BPF_JNE | BPF_K, insn->src_reg,
                                             0, 2, 0),
                                BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
                                BPF_JMP_IMM(BPF_JA, 0, 0, 1),
                                *insn,
                        };
                        struct bpf_insn chk_and_mod[] = {
                                /* [R,W]x mod 0 -> [R,W]x */
                                BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
                                             BPF_JEQ | BPF_K, insn->src_reg,
                                             0, 1 + (is64 ? 0 : 1), 0),
                                *insn,
                                BPF_JMP_IMM(BPF_JA, 0, 0, 1),
                                BPF_MOV32_REG(insn->dst_reg, insn->dst_reg),
                        };

                        patchlet = isdiv ? chk_and_div : chk_and_mod;
                        cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
                                      ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0);

                        new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
                        goto next_insn;
                }

                /* Make it impossible to de-reference a userspace address */
                if (BPF_CLASS(insn->code) == BPF_LDX &&
                    (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
                     BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
                        struct bpf_insn *patch = &insn_buf[0];
                        u64 uaddress_limit = bpf_arch_uaddress_limit();

                        if (!uaddress_limit)
                                goto next_insn;

                        *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
                        if (insn->off)
                                *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
                        *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
                        *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
                        *patch++ = *insn;
                        *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
                        *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);

                        cnt = patch - insn_buf;
                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
                        goto next_insn;
                }

                /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
                if (BPF_CLASS(insn->code) == BPF_LD &&
                    (BPF_MODE(insn->code) == BPF_ABS ||
                     BPF_MODE(insn->code) == BPF_IND)) {
                        cnt = env->ops->gen_ld_abs(insn, insn_buf);
                        if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
                                verbose(env, "bpf verifier is misconfigured\n");
                                return -EINVAL;
                        }

                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
                        goto next_insn;
                }

                /* Rewrite pointer arithmetic to mitigate speculation attacks. */
                if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
                    insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
                        const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
                        const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
                        struct bpf_insn *patch = &insn_buf[0];
                        bool issrc, isneg, isimm;
                        u32 off_reg;

                        aux = &env->insn_aux_data[i + delta];
                        if (!aux->alu_state ||
                            aux->alu_state == BPF_ALU_NON_POINTER)
                                goto next_insn;

                        isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
                        issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
                                BPF_ALU_SANITIZE_SRC;
                        isimm = aux->alu_state & BPF_ALU_IMMEDIATE;

                        off_reg = issrc ? insn->src_reg : insn->dst_reg;
                        if (isimm) {
                                *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
                        } else {
                                if (isneg)
                                        *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
                                *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
                                *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
                                *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
                                *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
                                *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
                                *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg);
                        }
                        if (!issrc)
                                *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg);
                        insn->src_reg = BPF_REG_AX;
                        if (isneg)
                                insn->code = insn->code == code_add ?
                                             code_sub : code_add;
                        *patch++ = *insn;
                        if (issrc && isneg && !isimm)
                                *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
                        cnt = patch - insn_buf;

                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
                        goto next_insn;
                }

                if (is_may_goto_insn(insn)) {
                        int stack_off = -stack_depth - 8;

                        stack_depth_extra = 8;
                        insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off);
                        insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2);
                        insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
                        insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off);
                        cnt = 4;

                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta += cnt - 1;
                        env->prog = prog = new_prog;
                        insn = new_prog->insnsi + i + delta;
                        goto next_insn;
                }

                if (insn->code != (BPF_JMP | BPF_CALL))
                        goto next_insn;
                if (insn->src_reg == BPF_PSEUDO_CALL)
                        goto next_insn;
                if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
                        ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
                        if (ret)
                                return ret;
                        if (cnt == 0)
                                goto next_insn;

                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta         += cnt - 1;
                        env->prog = prog = new_prog;
                        insn          = new_prog->insnsi + i + delta;
                        goto next_insn;
                }

                /* Skip inlining the helper call if the JIT does it. */
                if (bpf_jit_inlines_helper_call(insn->imm))
                        goto next_insn;

                if (insn->imm == BPF_FUNC_get_route_realm)
                        prog->dst_needed = 1;
                if (insn->imm == BPF_FUNC_get_prandom_u32)
                        bpf_user_rnd_init_once();
                if (insn->imm == BPF_FUNC_override_return)
                        prog->kprobe_override = 1;
                if (insn->imm == BPF_FUNC_tail_call) {
                        /* If we tail call into other programs, we
                         * cannot make any assumptions since they can
                         * be replaced dynamically during runtime in
                         * the program array.
                         */
                        prog->cb_access = 1;
                        if (!allow_tail_call_in_subprogs(env))
                                prog->aux->stack_depth = MAX_BPF_STACK;
                        prog->aux->max_pkt_offset = MAX_PACKET_OFF;

                        /* mark bpf_tail_call as different opcode to avoid
                         * conditional branch in the interpreter for every normal
                         * call and to prevent accidental JITing by JIT compiler
                         * that doesn't support bpf_tail_call yet
                         */
                        insn->imm = 0;
                        insn->code = BPF_JMP | BPF_TAIL_CALL;

                        aux = &env->insn_aux_data[i + delta];
                        if (env->bpf_capable && !prog->blinding_requested &&
                            prog->jit_requested &&
                            !bpf_map_key_poisoned(aux) &&
                            !bpf_map_ptr_poisoned(aux) &&
                            !bpf_map_ptr_unpriv(aux)) {
                                struct bpf_jit_poke_descriptor desc = {
                                        .reason = BPF_POKE_REASON_TAIL_CALL,
                                        .tail_call.map = aux->map_ptr_state.map_ptr,
                                        .tail_call.key = bpf_map_key_immediate(aux),
                                        .insn_idx = i + delta,
                                };

                                ret = bpf_jit_add_poke_descriptor(prog, &desc);
                                if (ret < 0) {
                                        verbose(env, "adding tail call poke descriptor failed\n");
                                        return ret;
                                }

                                insn->imm = ret + 1;
                                goto next_insn;
                        }

                        if (!bpf_map_ptr_unpriv(aux))
                                goto next_insn;

                        /* instead of changing every JIT dealing with tail_call
                         * emit two extra insns:
                         * if (index >= max_entries) goto out;
                         * index &= array->index_mask;
                         * to avoid out-of-bounds cpu speculation
                         */
                        if (bpf_map_ptr_poisoned(aux)) {
                                verbose(env, "tail_call abusing map_ptr\n");
                                return -EINVAL;
                        }

                        map_ptr = aux->map_ptr_state.map_ptr;
                        insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
                                                  map_ptr->max_entries, 2);
                        insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
                                                    container_of(map_ptr,
                                                                 struct bpf_array,
                                                                 map)->index_mask);
                        insn_buf[2] = *insn;
                        cnt = 3;
                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
                        goto next_insn;
                }

                if (insn->imm == BPF_FUNC_timer_set_callback) {
                        /* The verifier will process callback_fn as many times as necessary
                         * with different maps and the register states prepared by
                         * set_timer_callback_state will be accurate.
                         *
                         * The following use case is valid:
                         *   map1 is shared by prog1, prog2, prog3.
                         *   prog1 calls bpf_timer_init for some map1 elements
                         *   prog2 calls bpf_timer_set_callback for some map1 elements.
                         *     Those that were not bpf_timer_init-ed will return -EINVAL.
                         *   prog3 calls bpf_timer_start for some map1 elements.
                         *     Those that were not both bpf_timer_init-ed and
                         *     bpf_timer_set_callback-ed will return -EINVAL.
                         */
                        struct bpf_insn ld_addrs[2] = {
                                BPF_LD_IMM64(BPF_REG_3, (long)prog->aux),
                        };

                        insn_buf[0] = ld_addrs[0];
                        insn_buf[1] = ld_addrs[1];
                        insn_buf[2] = *insn;
                        cnt = 3;

                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
                        goto patch_call_imm;
                }

                if (is_storage_get_function(insn->imm)) {
                        if (!in_sleepable(env) ||
                            env->insn_aux_data[i + delta].storage_get_func_atomic)
                                insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
                        else
                                insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
                        insn_buf[1] = *insn;
                        cnt = 2;

                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta += cnt - 1;
                        env->prog = prog = new_prog;
                        insn = new_prog->insnsi + i + delta;
                        goto patch_call_imm;
                }

                /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */
                if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) {
                        /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data,
                         * bpf_mem_alloc() returns a ptr to the percpu data ptr.
                         */
                        insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
                        insn_buf[1] = *insn;
                        cnt = 2;

                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta += cnt - 1;
                        env->prog = prog = new_prog;
                        insn = new_prog->insnsi + i + delta;
                        goto patch_call_imm;
                }

                /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
                 * and other inlining handlers are currently limited to 64 bit
                 * only.
                 */
                if (prog->jit_requested && BITS_PER_LONG == 64 &&
                    (insn->imm == BPF_FUNC_map_lookup_elem ||
                     insn->imm == BPF_FUNC_map_update_elem ||
                     insn->imm == BPF_FUNC_map_delete_elem ||
                     insn->imm == BPF_FUNC_map_push_elem   ||
                     insn->imm == BPF_FUNC_map_pop_elem    ||
                     insn->imm == BPF_FUNC_map_peek_elem   ||
                     insn->imm == BPF_FUNC_redirect_map    ||
                     insn->imm == BPF_FUNC_for_each_map_elem ||
                     insn->imm == BPF_FUNC_map_lookup_percpu_elem)) {
                        aux = &env->insn_aux_data[i + delta];
                        if (bpf_map_ptr_poisoned(aux))
                                goto patch_call_imm;

                        map_ptr = aux->map_ptr_state.map_ptr;
                        ops = map_ptr->ops;
                        if (insn->imm == BPF_FUNC_map_lookup_elem &&
                            ops->map_gen_lookup) {
                                cnt = ops->map_gen_lookup(map_ptr, insn_buf);
                                if (cnt == -EOPNOTSUPP)
                                        goto patch_map_ops_generic;
                                if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) {
                                        verbose(env, "bpf verifier is misconfigured\n");
                                        return -EINVAL;
                                }

                                new_prog = bpf_patch_insn_data(env, i + delta,
                                                               insn_buf, cnt);
                                if (!new_prog)
                                        return -ENOMEM;

                                delta    += cnt - 1;
                                env->prog = prog = new_prog;
                                insn      = new_prog->insnsi + i + delta;
                                goto next_insn;
                        }

                        BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
                                     (void *(*)(struct bpf_map *map, void *key))NULL));
                        BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
                                     (long (*)(struct bpf_map *map, void *key))NULL));
                        BUILD_BUG_ON(!__same_type(ops->map_update_elem,
                                     (long (*)(struct bpf_map *map, void *key, void *value,
                                              u64 flags))NULL));
                        BUILD_BUG_ON(!__same_type(ops->map_push_elem,
                                     (long (*)(struct bpf_map *map, void *value,
                                              u64 flags))NULL));
                        BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
                                     (long (*)(struct bpf_map *map, void *value))NULL));
                        BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
                                     (long (*)(struct bpf_map *map, void *value))NULL));
                        BUILD_BUG_ON(!__same_type(ops->map_redirect,
                                     (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
                        BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
                                     (long (*)(struct bpf_map *map,
                                              bpf_callback_t callback_fn,
                                              void *callback_ctx,
                                              u64 flags))NULL));
                        BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem,
                                     (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL));

patch_map_ops_generic:
                        switch (insn->imm) {
                        case BPF_FUNC_map_lookup_elem:
                                insn->imm = BPF_CALL_IMM(ops->map_lookup_elem);
                                goto next_insn;
                        case BPF_FUNC_map_update_elem:
                                insn->imm = BPF_CALL_IMM(ops->map_update_elem);
                                goto next_insn;
                        case BPF_FUNC_map_delete_elem:
                                insn->imm = BPF_CALL_IMM(ops->map_delete_elem);
                                goto next_insn;
                        case BPF_FUNC_map_push_elem:
                                insn->imm = BPF_CALL_IMM(ops->map_push_elem);
                                goto next_insn;
                        case BPF_FUNC_map_pop_elem:
                                insn->imm = BPF_CALL_IMM(ops->map_pop_elem);
                                goto next_insn;
                        case BPF_FUNC_map_peek_elem:
                                insn->imm = BPF_CALL_IMM(ops->map_peek_elem);
                                goto next_insn;
                        case BPF_FUNC_redirect_map:
                                insn->imm = BPF_CALL_IMM(ops->map_redirect);
                                goto next_insn;
                        case BPF_FUNC_for_each_map_elem:
                                insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
                                goto next_insn;
                        case BPF_FUNC_map_lookup_percpu_elem:
                                insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem);
                                goto next_insn;
                        }

                        goto patch_call_imm;
                }

                /* Implement bpf_jiffies64 inline. */
                if (prog->jit_requested && BITS_PER_LONG == 64 &&
                    insn->imm == BPF_FUNC_jiffies64) {
                        struct bpf_insn ld_jiffies_addr[2] = {
                                BPF_LD_IMM64(BPF_REG_0,
                                             (unsigned long)&jiffies),
                        };

                        insn_buf[0] = ld_jiffies_addr[0];
                        insn_buf[1] = ld_jiffies_addr[1];
                        insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0,
                                                  BPF_REG_0, 0);
                        cnt = 3;

                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
                                                       cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
                        goto next_insn;
                }

#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
                /* Implement bpf_get_smp_processor_id() inline. */
                if (insn->imm == BPF_FUNC_get_smp_processor_id &&
                    prog->jit_requested && bpf_jit_supports_percpu_insn()) {
                        /* BPF_FUNC_get_smp_processor_id inlining is an
                         * optimization, so if pcpu_hot.cpu_number is ever
                         * changed in some incompatible and hard to support
                         * way, it's fine to back out this inlining logic
                         */
                        insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number);
                        insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
                        insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0);
                        cnt = 3;

                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
                        goto next_insn;
                }
#endif
                /* Implement bpf_get_func_arg inline. */
                if (prog_type == BPF_PROG_TYPE_TRACING &&
                    insn->imm == BPF_FUNC_get_func_arg) {
                        /* Load nr_args from ctx - 8 */
                        insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
                        insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
                        insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
                        insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
                        insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
                        insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
                        insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
                        insn_buf[7] = BPF_JMP_A(1);
                        insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
                        cnt = 9;

                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
                        goto next_insn;
                }

                /* Implement bpf_get_func_ret inline. */
                if (prog_type == BPF_PROG_TYPE_TRACING &&
                    insn->imm == BPF_FUNC_get_func_ret) {
                        if (eatype == BPF_TRACE_FEXIT ||
                            eatype == BPF_MODIFY_RETURN) {
                                /* Load nr_args from ctx - 8 */
                                insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
                                insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
                                insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
                                insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
                                insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
                                insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0);
                                cnt = 6;
                        } else {
                                insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP);
                                cnt = 1;
                        }

                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
                        goto next_insn;
                }

                /* Implement get_func_arg_cnt inline. */
                if (prog_type == BPF_PROG_TYPE_TRACING &&
                    insn->imm == BPF_FUNC_get_func_arg_cnt) {
                        /* Load nr_args from ctx - 8 */
                        insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);

                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
                        if (!new_prog)
                                return -ENOMEM;

                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
                        goto next_insn;
                }

                /* Implement bpf_get_func_ip inline. */
                if (prog_type == BPF_PROG_TYPE_TRACING &&
                    insn->imm == BPF_FUNC_get_func_ip) {
                        /* Load IP address from ctx - 16 */
                        insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16);

                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
                        if (!new_prog)
                                return -ENOMEM;

                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
                        goto next_insn;
                }

                /* Implement bpf_get_branch_snapshot inline. */
                if (IS_ENABLED(CONFIG_PERF_EVENTS) &&
                    prog->jit_requested && BITS_PER_LONG == 64 &&
                    insn->imm == BPF_FUNC_get_branch_snapshot) {
                        /* We are dealing with the following func protos:
                         * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
                         * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
                         */
                        const u32 br_entry_size = sizeof(struct perf_branch_entry);

                        /* struct perf_branch_entry is part of UAPI and is
                         * used as an array element, so extremely unlikely to
                         * ever grow or shrink
                         */
                        BUILD_BUG_ON(br_entry_size != 24);

                        /* if (unlikely(flags)) return -EINVAL */
                        insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);

                        /* Transform size (bytes) into number of entries (cnt = size / 24).
                         * But to avoid expensive division instruction, we implement
                         * divide-by-3 through multiplication, followed by further
                         * division by 8 through 3-bit right shift.
                         * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
                         * p. 227, chapter "Unsigned Division by 3" for details and proofs.
                         *
                         * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
                         */
                        insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
                        insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0);
                        insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);

                        /* call perf_snapshot_branch_stack implementation */
                        insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
                        /* if (entry_cnt == 0) return -ENOENT */
                        insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
                        /* return entry_cnt * sizeof(struct perf_branch_entry) */
                        insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
                        insn_buf[7] = BPF_JMP_A(3);
                        /* return -EINVAL; */
                        insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
                        insn_buf[9] = BPF_JMP_A(1);
                        /* return -ENOENT; */
                        insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
                        cnt = 11;

                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
                        continue;
                }

                /* Implement bpf_kptr_xchg inline */
                if (prog->jit_requested && BITS_PER_LONG == 64 &&
                    insn->imm == BPF_FUNC_kptr_xchg &&
                    bpf_jit_supports_ptr_xchg()) {
                        insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_2);
                        insn_buf[1] = BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_0, 0);
                        cnt = 2;

                        new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
                        if (!new_prog)
                                return -ENOMEM;

                        delta    += cnt - 1;
                        env->prog = prog = new_prog;
                        insn      = new_prog->insnsi + i + delta;
                        goto next_insn;
                }
patch_call_imm:
                fn = env->ops->get_func_proto(insn->imm, env->prog);
                /* all functions that have prototype and verifier allowed
                 * programs to call them, must be real in-kernel functions
                 */
                if (!fn->func) {
                        verbose(env,
                                "kernel subsystem misconfigured func %s#%d\n",
                                func_id_name(insn->imm), insn->imm);
                        return -EFAULT;
                }
                insn->imm = fn->func - __bpf_call_base;
next_insn:
                if (subprogs[cur_subprog + 1].start == i + delta + 1) {
                        subprogs[cur_subprog].stack_depth += stack_depth_extra;
                        subprogs[cur_subprog].stack_extra = stack_depth_extra;
                        cur_subprog++;
                        stack_depth = subprogs[cur_subprog].stack_depth;
                        stack_depth_extra = 0;
                }
                i++;
                insn++;
        }

        env->prog->aux->stack_depth = subprogs[0].stack_depth;
        for (i = 0; i < env->subprog_cnt; i++) {
                int subprog_start = subprogs[i].start;
                int stack_slots = subprogs[i].stack_extra / 8;

                if (!stack_slots)
                        continue;
                if (stack_slots > 1) {
                        verbose(env, "verifier bug: stack_slots supports may_goto only\n");
                        return -EFAULT;
                }

                /* Add ST insn to subprog prologue to init extra stack */
                insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP,
                                         -subprogs[i].stack_depth, BPF_MAX_LOOPS);
                /* Copy first actual insn to preserve it */
                insn_buf[1] = env->prog->insnsi[subprog_start];

                new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2);
                if (!new_prog)
                        return -ENOMEM;
                env->prog = prog = new_prog;
        }

        /* Since poke tab is now finalized, publish aux to tracker. */
        for (i = 0; i < prog->aux->size_poke_tab; i++) {
                map_ptr = prog->aux->poke_tab[i].tail_call.map;
                if (!map_ptr->ops->map_poke_track ||
                    !map_ptr->ops->map_poke_untrack ||
                    !map_ptr->ops->map_poke_run) {
                        verbose(env, "bpf verifier is misconfigured\n");
                        return -EINVAL;
                }

                ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux);
                if (ret < 0) {
                        verbose(env, "tracking tail call prog failed\n");
                        return ret;
                }
        }

        sort_kfunc_descs_by_imm_off(env->prog);

        return 0;
}

/* Replace the instruction at @position with an open-coded loop that calls
 * the callback subprog directly.
 *
 * @env:                verifier environment whose program gets patched
 * @position:           index of the instruction to be replaced
 * @stack_base:         offset (relative to R10) of the first of three
 *                      consecutive BPF_REG_SIZE slots used to spill
 *                      R6, R7 and R8
 * @callback_subprogno: index into env->subprog_info of the loop callback
 * @cnt:                out parameter; set to the number of instructions in
 *                      the patch before patching is attempted, so it is
 *                      written even when patching fails
 *
 * Returns the program returned by bpf_patch_insn_data(), or NULL when
 * patching failed.
 */
static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
                                        int position,
                                        s32 stack_base,
                                        u32 callback_subprogno,
                                        u32 *cnt)
{
        /* Three adjacent spill slots, one per saved register */
        s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
        s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
        s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
        /* Registers holding the loop bound, counter and callback context */
        int reg_loop_max = BPF_REG_6;
        int reg_loop_cnt = BPF_REG_7;
        int reg_loop_ctx = BPF_REG_8;

        struct bpf_prog *new_prog;
        u32 callback_start;
        u32 call_insn_offset;
        s32 callback_offset;

        /* This represents an inlined version of bpf_iter.c:bpf_loop,
         * be careful to modify this code in sync.
         *
         * All jump offsets below are relative and depend on the exact
         * number and order of the entries in this array; the same holds
         * for the hard-coded index 12 of the BPF_CALL_REL entry used
         * after patching.
         */
        struct bpf_insn insn_buf[] = {
                /* Return error and jump to the end of the patch if
                 * expected number of iterations is too big.
                 * The +16 jump skips the 16 entries that follow it,
                 * i.e. everything up to the end of this array.
                 */
                BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2),
                BPF_MOV32_IMM(BPF_REG_0, -E2BIG),
                BPF_JMP_IMM(BPF_JA, 0, 0, 16),
                /* spill R6, R7, R8 to use these as loop vars */
                BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset),
                BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset),
                BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset),
                /* initialize loop vars */
                BPF_MOV64_REG(reg_loop_max, BPF_REG_1),
                BPF_MOV32_IMM(reg_loop_cnt, 0),
                BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3),
                /* loop header,
                 * if reg_loop_cnt >= reg_loop_max skip the loop body
                 * (+5 lands on the "set R0" entry below)
                 */
                BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5),
                /* callback call,
                 * correct callback offset would be set after patching
                 */
                BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt),
                BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx),
                BPF_CALL_REL(0),
                /* increment loop counter */
                BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1),
                /* jump to loop header if callback returned 0
                 * (-6 lands on the BPF_JGE loop header above)
                 */
                BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6),
                /* return value of bpf_loop,
                 * set R0 to the number of iterations
                 */
                BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt),
                /* restore original values of R6, R7, R8 */
                BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset),
                BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset),
                BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset),
        };

        *cnt = ARRAY_SIZE(insn_buf);
        new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
        if (!new_prog)
                return new_prog;

        /* callback start is known only after patching */
        callback_start = env->subprog_info[callback_subprogno].start;
        /* Note: insn_buf[12] is the BPF_CALL_REL instruction, so its index
         * in the patched program is position + 12.
         */
        call_insn_offset = position + 12;
        /* NOTE(review): the extra -1 presumably reflects that the call
         * target is counted from the instruction following the call.
         */
        callback_offset = callback_start - call_insn_offset - 1;
        new_prog->insnsi[call_insn_offset].imm = callback_offset;

        return new_prog;
}

/* Return true when @insn is a BPF_JMP | BPF_CALL instruction with
 * src_reg == 0 whose imm selects BPF_FUNC_loop.
 */
static bool is_bpf_loop_call(struct bpf_insn *insn)
{
        if (insn->code != (BPF_JMP | BPF_CALL))
                return false;
        if (insn->src_reg != 0)
                return false;

        return insn->imm == BPF_FUNC_loop;
}

/* Walk every instruction of the program (main and all sub-programs) and
 * consult insn_aux_data for bpf_loop calls marked as fit for inlining.
 * Each such call is replaced by the instruction sequence produced by
 * `inline_bpf_loop`, and the stack_depth of the enclosing subprog grows
 * by the size of 3 registers (plus padding up to a multiple of 8).
 * The extra stack space holds the spilled values of R6, R7, R8, which
 * the inlined sequence uses for the loop bound, counter and context.
 *
 * Returns 0 on success or -ENOMEM when patching fails.
 */
static int optimize_bpf_loop(struct bpf_verifier_env *env)
{
        struct bpf_subprog_info *subprogs = env->subprog_info;
        struct bpf_insn *insn = env->prog->insnsi;
        int insn_cnt = env->prog->len;
        int cur_subprog = 0, delta = 0;
        int i, cnt;
        u16 depth = subprogs[cur_subprog].stack_depth;
        u16 pad = round_up(depth, 8) - depth;
        u16 extra = 0;

        for (i = 0; i < insn_cnt; i++, insn++) {
                struct bpf_loop_inline_state *loop_state;

                loop_state = &env->insn_aux_data[i + delta].loop_inline_state;

                if (is_bpf_loop_call(insn) && loop_state->fit_for_inline) {
                        struct bpf_prog *patched;

                        /* room for R6-R8 on top of the aligned stack */
                        extra = BPF_REG_SIZE * 3 + pad;
                        patched = inline_bpf_loop(env, i + delta,
                                                  -(depth + extra),
                                                  loop_state->callback_subprogno,
                                                  &cnt);
                        if (!patched)
                                return -ENOMEM;

                        /* one insn was replaced by cnt insns */
                        delta += cnt - 1;
                        env->prog = patched;
                        insn = patched->insnsi + i + delta;
                }

                /* Still inside the current subprog? Nothing more to do. */
                if (subprogs[cur_subprog + 1].start != i + delta + 1)
                        continue;

                /* Crossing into the next subprog: commit the extra stack
                 * of the one just finished and reload per-subprog state.
                 */
                subprogs[cur_subprog].stack_depth += extra;
                cur_subprog++;
                depth = subprogs[cur_subprog].stack_depth;
                pad = round_up(depth, 8) - depth;
                extra = 0;
        }

        env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;

        return 0;
}

/* Release every verifier state list entry owned by @env: first the
 * entries chained on env->free_list, then, if env->explored_states is
 * allocated, every chain in that table. All list heads are left NULL.
 */
static void free_states(struct bpf_verifier_env *env)
{
        struct bpf_verifier_state_list *cur, *next;
        int bucket;

        for (cur = env->free_list; cur; cur = next) {
                next = cur->next;
                free_verifier_state(&cur->state, false);
                kfree(cur);
        }
        env->free_list = NULL;

        if (!env->explored_states)
                return;

        for (bucket = 0; bucket < state_htab_size(env); bucket++) {
                for (cur = env->explored_states[bucket]; cur; cur = next) {
                        next = cur->next;
                        free_verifier_state(&cur->state, false);
                        kfree(cur);
                }
                env->explored_states[bucket] = NULL;
        }
}

/* Run one verification pass starting at subprog @subprog.
 *
 * Allocates a fresh verifier state with a single frame, sets up the
 * argument registers (from the subprog's argument info for subprogs and
 * BPF_PROG_TYPE_EXT programs, or R1 = PTR_TO_CTX for a regular main
 * program), calls do_check() and then tears everything down again:
 * current state, pending stack entries and explored states.
 *
 * Returns 0 on success or a negative error code (-ENOMEM on allocation
 * failure, -EINVAL / -EFAULT on argument set-up problems, or whatever
 * btf_prepare_func_args() / do_check() returned).
 */
static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
        /* Reset the log after a successful pass unless LEVEL2 is requested */
        bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
        struct bpf_subprog_info *sub = subprog_info(env, subprog);
        struct bpf_verifier_state *state;
        struct bpf_reg_state *regs;
        int ret, i;

        env->prev_linfo = NULL;
        env->pass_cnt++;

        state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
        if (!state)
                return -ENOMEM;
        state->curframe = 0;
        state->speculative = false;
        state->branches = 1;
        state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
        if (!state->frame[0]) {
                kfree(state);
                return -ENOMEM;
        }
        /* From here on the state is owned by env and released at "out" */
        env->cur_state = state;
        init_func_state(env, state->frame[0],
                        BPF_MAIN_FUNC /* callsite */,
                        0 /* frameno */,
                        subprog);
        state->first_insn_idx = env->subprog_info[subprog].start;
        state->last_insn_idx = -1;

        regs = state->frame[state->curframe]->regs;
        if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
                const char *sub_name = subprog_name(env, subprog);
                struct bpf_subprog_arg_info *arg;
                struct bpf_reg_state *reg;

                verbose(env, "Validating %s() func#%d...\n", sub_name, subprog);
                ret = btf_prepare_func_args(env, subprog);
                if (ret)
                        goto out;

                if (subprog_is_exc_cb(env, subprog)) {
                        state->frame[0]->in_exception_callback_fn = true;
                        /* We have already ensured that the callback returns an integer, just
                         * like all global subprogs. We need to determine it only has a single
                         * scalar argument.
                         */
                        if (sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_ANYTHING) {
                                verbose(env, "exception cb only supports single integer argument\n");
                                ret = -EINVAL;
                                goto out;
                        }
                }
                /* Argument N lives in register BPF_REG_1 + N, so i doubles
                 * as register number and (offset by BPF_REG_1) arg index.
                 */
                for (i = BPF_REG_1; i <= sub->arg_cnt; i++) {
                        arg = &sub->args[i - BPF_REG_1];
                        reg = &regs[i];

                        if (arg->arg_type == ARG_PTR_TO_CTX) {
                                reg->type = PTR_TO_CTX;
                                mark_reg_known_zero(env, regs, i);
                        } else if (arg->arg_type == ARG_ANYTHING) {
                                reg->type = SCALAR_VALUE;
                                mark_reg_unknown(env, regs, i);
                        } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
                                /* assume unspecial LOCAL dynptr type */
                                __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen);
                        } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
                                reg->type = PTR_TO_MEM;
                                if (arg->arg_type & PTR_MAYBE_NULL)
                                        reg->type |= PTR_MAYBE_NULL;
                                mark_reg_known_zero(env, regs, i);
                                reg->mem_size = arg->mem_size;
                                reg->id = ++env->id_gen;
                        } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
                                reg->type = PTR_TO_BTF_ID;
                                /* carry the pointer flags over from the arg type */
                                if (arg->arg_type & PTR_MAYBE_NULL)
                                        reg->type |= PTR_MAYBE_NULL;
                                if (arg->arg_type & PTR_UNTRUSTED)
                                        reg->type |= PTR_UNTRUSTED;
                                if (arg->arg_type & PTR_TRUSTED)
                                        reg->type |= PTR_TRUSTED;
                                mark_reg_known_zero(env, regs, i);
                                reg->btf = bpf_get_btf_vmlinux(); /* can't fail at this point */
                                reg->btf_id = arg->btf_id;
                                reg->id = ++env->id_gen;
                        } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
                                /* caller can pass either PTR_TO_ARENA or SCALAR */
                                mark_reg_unknown(env, regs, i);
                        } else {
                                WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n",
                                          i - BPF_REG_1, arg->arg_type);
                                ret = -EFAULT;
                                goto out;
                        }
                }
        } else {
                /* if main BPF program has associated BTF info, validate that
                 * it's matching expected signature, and otherwise mark BTF
                 * info for main program as unreliable
                 */
                if (env->prog->aux->func_info_aux) {
                        ret = btf_prepare_func_args(env, 0);
                        if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX)
                                env->prog->aux->func_info_aux[0].unreliable = true;
                }

                /* 1st arg to a function */
                regs[BPF_REG_1].type = PTR_TO_CTX;
                mark_reg_known_zero(env, regs, BPF_REG_1);
        }

        ret = do_check(env);
out:
        /* check for NULL is necessary, since cur_state can be freed inside
         * do_check() under memory pressure.
         */
        if (env->cur_state) {
                free_verifier_state(env->cur_state, true);
                env->cur_state = NULL;
        }
        /* Drain whatever is still queued: loop until pop_stack() returns non-zero */
        while (!pop_stack(env, NULL, NULL, false));
        if (!ret && pop_log)
                bpf_vlog_reset(&env->log, 0);
        free_states(env);
        return ret;
}

/* Verify global functions lazily, based on their BTF, but only those that
 * are marked as called (from the main program or, transitively, from other
 * subprograms). Global subprogs reachable only from dead code are skipped.
 * Every callable global function must verify successfully, otherwise the
 * whole program is rejected.
 *
 * Example:
 * int bar(int);
 * int foo(int f)
 * {
 *    return bar(f);
 * }
 * int bar(int b)
 * {
 *    ...
 * }
 * foo() is verified first with R1=any_scalar_value. While doing so, bar()
 * is assumed to verify successfully and the call from foo() is only checked
 * for matching types. bar() is verified later on its own, to make sure it is
 * safe for R1=any_scalar_value.
 *
 * Returns 0 on success or the first error reported by do_check_common().
 */
static int do_check_subprogs(struct bpf_verifier_env *env)
{
        struct bpf_prog_aux *aux = env->prog->aux;
        struct bpf_func_info_aux *info;
        int idx, err, verified_now;

        if (!aux->func_info)
                return 0;

        /* exception callback is presumed to be always called */
        if (env->exception_callback_subprog)
                subprog_aux(env, env->exception_callback_subprog)->called = true;

        /* Verifying a global subprog may mark further global subprogs as
         * called, so keep sweeping until a full pass verifies nothing new.
         * Each repeated pass verified at least one subprog, hence the loop
         * terminates.
         */
        do {
                verified_now = 0;

                for (idx = 1; idx < env->subprog_cnt; idx++) {
                        if (!subprog_is_global(env, idx))
                                continue;

                        info = subprog_aux(env, idx);
                        if (!info->called || info->verified)
                                continue;

                        env->insn_idx = env->subprog_info[idx].start;
                        WARN_ON_ONCE(env->insn_idx == 0);
                        err = do_check_common(env, idx);
                        if (err)
                                return err;

                        if (env->log.level & BPF_LOG_LEVEL)
                                verbose(env, "Func#%d ('%s') is safe for any args that match its prototype\n",
                                        idx, subprog_name(env, idx));

                        info->verified = true;
                        verified_now++;
                }
        } while (verified_now);

        return 0;
}

/* Verify the main program (subprog 0) starting at instruction 0.
 * On success, record subprog 0's stack depth in the program's aux data.
 * Returns 0 or the error from do_check_common().
 */
static int do_check_main(struct bpf_verifier_env *env)
{
        int err;

        env->insn_idx = 0;
        err = do_check_common(env, 0);
        if (err)
                return err;

        env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
        return 0;
}


/* Emit verification statistics to the verifier log.
 *
 * With BPF_LOG_STATS set, the verification time (in usec) and the stack
 * depth of every subprog, joined by '+', are printed first. The summary
 * line with processed instruction and state counters is printed always.
 */
static void print_verification_stats(struct bpf_verifier_env *env)
{
        int idx;

        if (env->log.level & BPF_LOG_STATS) {
                verbose(env, "verification time %lld usec\n",
                        div_u64(env->verification_time, 1000));
                verbose(env, "stack depth ");
                idx = 0;
                while (idx < env->subprog_cnt) {
                        u32 depth = env->subprog_info[idx].stack_depth;

                        verbose(env, "%d", depth);
                        idx++;
                        /* separator only between entries, not after the last */
                        if (idx < env->subprog_cnt)
                                verbose(env, "+");
                }
                verbose(env, "\n");
        }

        verbose(env, "processed %d insns (limit %d) max_states_per_insn %d "
                "total_states %d peak_states %d mark_read %d\n",
                env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS,
                env->max_states_per_insn, env->total_states,
                env->peak_states, env->longest_mark_read_walk);
}

static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
{
        const struct btf_type *t, *func_proto;
        const struct bpf_struct_ops_desc *st_ops_desc;
        const struct bpf_struct_ops *st_ops;
        const struct btf_member *member;
        struct bpf_prog *prog = env->prog;
        u32 btf_id, member_idx;
        struct btf *btf;
        const char *mname;

        if (!prog->gpl_compatible) {
                verbose(env, "struct ops programs must have a GPL compatible license\n");
                return -EINVAL;
        }

        if (!prog->aux->attach_btf_id)
                return -ENOTSUPP;

        btf = prog->aux->attach_btf;
        if (btf_is_module(btf)) {
                /* Make sure st_ops is valid through the lifetime of env */
                env->attach_btf_mod = btf_try_get_module(btf);
                if (!env->attach_btf_mod) {
                        verbose(env, "struct_ops module %s is not found\n",
                                btf_get_name(btf));
                        return -ENOTSUPP;
                }
        }

        btf_id = prog->aux->attach_btf_id;
        st_ops_desc = bpf_struct_ops_find(btf, btf_id);
        if (!st_ops_desc) {
                verbose(env, "attach_btf_id %u is not a supported struct\n",
                        btf_id);
                return -ENOTSUPP;
        }
        st_ops = st_ops_desc->st_ops;

        t = st_ops_desc->type;
        member_idx = prog->expected_attach_type;
        if (member_idx >= btf_type_vlen(t)) {
                verbose(env, "attach to invalid member idx %u of struct %s\n",
                        member_idx, st_ops->name);
                return -EINVAL;
        }

        member = &btf_type_member(t)[member_idx];
        mname = btf_name_by_offset(btf, member->name_off);
        func_proto = btf_type_resolve_func_ptr(btf, member->type,
                                               NULL);
        if (!func_proto) {
                verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n",
                        mname, member_idx, st_ops->name);
                return -EINVAL;
        }

        if (st_ops->check_member) {
                int err = st_ops->check_member(t, member, prog);

                if (err) {
                        verbose(env, "attach to unsupported member %s of struct %s\n",
                                mname, st_ops->name);
                        return err;
                }
        }

        /* btf_ctx_access() used this to provide argument type info */
        prog->aux->ctx_arg_info =
                st_ops_desc->arg_info[member_idx].info;
        prog->aux->ctx_arg_info_size =
                st_ops_desc->arg_info[member_idx].cnt;

        prog->aux->attach_func_proto = func_proto;
        prog->aux->attach_func_name = mname;
        env->ops = st_ops->verifier_ops;

        return 0;
}
#define SECURITY_PREFIX "security_"

/* Decide whether a function may be used as a modify-return attach target.
 *
 * Returns 0 when @addr is on the error injection list or @func_name starts
 * with SECURITY_PREFIX, and -EINVAL otherwise.
 */
static int check_attach_modify_return(unsigned long addr, const char *func_name)
{
        if (within_error_injection_list(addr))
                return 0;

        /* sizeof() counts the trailing NUL, so compare one byte less */
        if (strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1) == 0)
                return 0;

        return -EINVAL;
}

/* Set of BTF ids of functions that are on the ALLOW_ERROR_INJECTION list
 * but must nevertheless be treated as non-sleepable. The set is queried
 * by check_non_sleepable_error_inject().
 */
BTF_SET_START(btf_non_sleepable_error_inject)
/* The three functions below can be called from both sleepable and
 * non-sleepable context. Assume non-sleepable from the bpf safety point
 * of view.
 */
BTF_ID(func, __filemap_add_folio)
BTF_ID(func, should_fail_alloc_page)
BTF_ID(func, should_failslab)
BTF_SET_END(btf_non_sleepable_error_inject)

/* Return the result of looking up @btf_id in the
 * btf_non_sleepable_error_inject set (non-zero when it is a member).
 */
static int check_non_sleepable_error_inject(u32 btf_id)
{
        int listed;

        listed = btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id);

        return listed;
}

/*
 * bpf_check_attach_target - validate the attach target of @prog and describe
 * it in @tgt_info.
 * @log:      verifier log that receives the reason for a rejection
 * @prog:     program that is being attached
 * @tgt_prog: BPF program to attach to, or NULL when the target is looked up
 *            in prog->aux->attach_btf instead
 * @btf_id:   BTF type id of the attach target; must be non-zero
 * @tgt_info: output; on success tgt_addr, tgt_name, tgt_type and tgt_mod are
 *            filled in, and fmodel is filled in for the function-based
 *            attach types
 *
 * Returns 0 on success. Failures are reported as a negative errno: -EINVAL
 * for most rejections, -ENOENT when the address of the target function
 * cannot be found, or the error returned by btf_distill_func_proto().
 *
 * A module reference obtained through btf_try_get_module() is dropped with
 * module_put() on every later error path and is handed to the caller via
 * tgt_info->tgt_mod on success.
 */
int bpf_check_attach_target(struct bpf_verifier_log *log,
                            const struct bpf_prog *prog,
                            const struct bpf_prog *tgt_prog,
                            u32 btf_id,
                            struct bpf_attach_target_info *tgt_info)
{
        bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
        bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING;
        const char prefix[] = "btf_trace_";
        int ret = 0, subprog = -1, i;
        const struct btf_type *t;
        bool conservative = true;
        const char *tname;
        struct btf *btf;
        long addr = 0;
        struct module *mod = NULL;

        if (!btf_id) {
                bpf_log(log, "Tracing programs must provide btf_id\n");
                return -EINVAL;
        }
        /* the target type lives in the target program's BTF when there is one */
        btf = tgt_prog ? tgt_prog->aux->btf : prog->aux->attach_btf;
        if (!btf) {
                bpf_log(log,
                        "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n");
                return -EINVAL;
        }
        t = btf_type_by_id(btf, btf_id);
        if (!t) {
                bpf_log(log, "attach_btf_id %u is invalid\n", btf_id);
                return -EINVAL;
        }
        tname = btf_name_by_offset(btf, t->name_off);
        if (!tname) {
                bpf_log(log, "attach_btf_id %u doesn't have a name\n", btf_id);
                return -EINVAL;
        }
        if (tgt_prog) {
                struct bpf_prog_aux *aux = tgt_prog->aux;

                if (bpf_prog_is_dev_bound(prog->aux) &&
                    !bpf_prog_dev_bound_match(prog, tgt_prog)) {
                        bpf_log(log, "Target program bound device mismatch\n");
                        return -EINVAL;
                }

                /* map btf_id to the index of the matching subprogram */
                for (i = 0; i < aux->func_info_cnt; i++)
                        if (aux->func_info[i].type_id == btf_id) {
                                subprog = i;
                                break;
                        }
                if (subprog == -1) {
                        bpf_log(log, "Subprog %s doesn't exist\n", tname);
                        return -EINVAL;
                }
                if (aux->func && aux->func[subprog]->aux->exception_cb) {
                        bpf_log(log,
                                "%s programs cannot attach to exception callback\n",
                                prog_extension ? "Extension" : "FENTRY/FEXIT");
                        return -EINVAL;
                }
                conservative = aux->func_info_aux[subprog].unreliable;
                if (prog_extension) {
                        if (conservative) {
                                bpf_log(log,
                                        "Cannot replace static functions\n");
                                return -EINVAL;
                        }
                        if (!prog->jit_requested) {
                                bpf_log(log,
                                        "Extension programs should be JITed\n");
                                return -EINVAL;
                        }
                }
                if (!tgt_prog->jited) {
                        bpf_log(log, "Can attach to only JITed progs\n");
                        return -EINVAL;
                }
                if (prog_tracing) {
                        if (aux->attach_tracing_prog) {
                                /*
                                 * Target program is an fentry/fexit which is already attached
                                 * to another tracing program. More levels of nesting
                                 * attachment are not allowed.
                                 */
                                bpf_log(log, "Cannot nest tracing program attach more than once\n");
                                return -EINVAL;
                        }
                } else if (tgt_prog->type == prog->type) {
                        /*
                         * To avoid potential call chain cycles, prevent attaching of a
                         * program extension to another extension. It's ok to attach
                         * fentry/fexit to extension program.
                         */
                        bpf_log(log, "Cannot recursively attach\n");
                        return -EINVAL;
                }
                if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
                    prog_extension &&
                    (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
                     tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) {
                        /* Program extensions can extend all program types
                         * except fentry/fexit. The reason is the following.
                         * The fentry/fexit programs are used for performance
                         * analysis, stats and can be attached to any program
                         * type. When extension program is replacing XDP function
                         * it is necessary to allow performance analysis of all
                         * functions. Both original XDP program and its program
                         * extension. Hence attaching fentry/fexit to
                         * BPF_PROG_TYPE_EXT is allowed. If extending of
                         * fentry/fexit was allowed it would be possible to create
                         * long call chain fentry->extension->fentry->extension
                         * beyond reasonable stack size. Hence extending fentry
                         * is not allowed.
                         */
                        bpf_log(log, "Cannot extend fentry/fexit\n");
                        return -EINVAL;
                }
        } else {
                if (prog_extension) {
                        bpf_log(log, "Cannot replace kernel functions\n");
                        return -EINVAL;
                }
        }

        switch (prog->expected_attach_type) {
        case BPF_TRACE_RAW_TP:
                if (tgt_prog) {
                        bpf_log(log,
                                "Only FENTRY/FEXIT progs are attachable to another BPF prog\n");
                        return -EINVAL;
                }
                if (!btf_type_is_typedef(t)) {
                        bpf_log(log, "attach_btf_id %u is not a typedef\n",
                                btf_id);
                        return -EINVAL;
                }
                if (strncmp(prefix, tname, sizeof(prefix) - 1)) {
                        bpf_log(log, "attach_btf_id %u points to wrong type name %s\n",
                                btf_id, tname);
                        return -EINVAL;
                }
                /* report the name without the "btf_trace_" prefix */
                tname += sizeof(prefix) - 1;
                /* the typedef must resolve to a pointer to a function prototype */
                t = btf_type_by_id(btf, t->type);
                if (!btf_type_is_ptr(t))
                        /* should never happen in valid vmlinux build */
                        return -EINVAL;
                t = btf_type_by_id(btf, t->type);
                if (!btf_type_is_func_proto(t))
                        /* should never happen in valid vmlinux build */
                        return -EINVAL;

                break;
        case BPF_TRACE_ITER:
                if (!btf_type_is_func(t)) {
                        bpf_log(log, "attach_btf_id %u is not a function\n",
                                btf_id);
                        return -EINVAL;
                }
                t = btf_type_by_id(btf, t->type);
                if (!btf_type_is_func_proto(t))
                        return -EINVAL;
                ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel);
                if (ret)
                        return ret;
                break;
        default:
                /* only extension programs may carry any other attach type */
                if (!prog_extension)
                        return -EINVAL;
                fallthrough;
        case BPF_MODIFY_RETURN:
        case BPF_LSM_MAC:
        case BPF_LSM_CGROUP:
        case BPF_TRACE_FENTRY:
        case BPF_TRACE_FEXIT:
                if (!btf_type_is_func(t)) {
                        bpf_log(log, "attach_btf_id %u is not a function\n",
                                btf_id);
                        return -EINVAL;
                }
                if (prog_extension &&
                    btf_check_type_match(log, prog, btf, t))
                        return -EINVAL;
                t = btf_type_by_id(btf, t->type);
                if (!btf_type_is_func_proto(t))
                        return -EINVAL;

                /* a saved destination type/attach type must match the current target */
                if ((prog->aux->saved_dst_prog_type || prog->aux->saved_dst_attach_type) &&
                    (!tgt_prog || prog->aux->saved_dst_prog_type != tgt_prog->type ||
                     prog->aux->saved_dst_attach_type != tgt_prog->expected_attach_type))
                        return -EINVAL;

                /* the prototype of an unreliable subprog is not passed on */
                if (tgt_prog && conservative)
                        t = NULL;

                ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel);
                if (ret < 0)
                        return ret;

                if (tgt_prog) {
                        /* subprog 0 is the target program itself */
                        if (subprog == 0)
                                addr = (long) tgt_prog->bpf_func;
                        else
                                addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
                } else {
                        if (btf_is_module(btf)) {
                                /* this reference is released on the error paths below */
                                mod = btf_try_get_module(btf);
                                if (mod)
                                        addr = find_kallsyms_symbol_value(mod, tname);
                                else
                                        addr = 0;
                        } else {
                                addr = kallsyms_lookup_name(tname);
                        }
                        if (!addr) {
                                module_put(mod);
                                bpf_log(log,
                                        "The address of function %s cannot be found\n",
                                        tname);
                                return -ENOENT;
                        }
                }

                if (prog->sleepable) {
                        /* rejected unless one of the cases below clears ret */
                        ret = -EINVAL;
                        switch (prog->type) {
                        case BPF_PROG_TYPE_TRACING:

                                /* fentry/fexit/fmod_ret progs can be sleepable if they are
                                 * attached to ALLOW_ERROR_INJECTION and are not in denylist.
                                 */
                                if (!check_non_sleepable_error_inject(btf_id) &&
                                    within_error_injection_list(addr))
                                        ret = 0;
                                /* fentry/fexit/fmod_ret progs can also be sleepable if they are
                                 * in the fmodret id set with the KF_SLEEPABLE flag.
                                 */
                                else {
                                        u32 *flags = btf_kfunc_is_modify_return(btf, btf_id,
                                                                                prog);

                                        if (flags && (*flags & KF_SLEEPABLE))
                                                ret = 0;
                                }
                                break;
                        case BPF_PROG_TYPE_LSM:
                                /* LSM progs check that they are attached to bpf_lsm_*() funcs.
                                 * Only some of them are sleepable.
                                 */
                                if (bpf_lsm_is_sleepable_hook(btf_id))
                                        ret = 0;
                                break;
                        default:
                                break;
                        }
                        if (ret) {
                                module_put(mod);
                                bpf_log(log, "%s is not sleepable\n", tname);
                                return ret;
                        }
                } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
                        if (tgt_prog) {
                                module_put(mod);
                                bpf_log(log, "can't modify return codes of BPF programs\n");
                                return -EINVAL;
                        }
                        ret = -EINVAL;
                        if (btf_kfunc_is_modify_return(btf, btf_id, prog) ||
                            !check_attach_modify_return(addr, tname))
                                ret = 0;
                        if (ret) {
                                module_put(mod);
                                bpf_log(log, "%s() is not modifiable\n", tname);
                                return ret;
                        }
                }

                break;
        }
        /* addr and mod keep their initial 0/NULL for RAW_TP and ITER targets */
        tgt_info->tgt_addr = addr;
        tgt_info->tgt_name = tname;
        tgt_info->tgt_type = t;
        tgt_info->tgt_mod = mod;
        return 0;
}

/* Attach targets that are off limits to BPF_PROG_TYPE_TRACING programs:
 * check_attach_btf_id() returns -EINVAL when the attach_btf_id of such a
 * program is a member of this set. Most entries are only listed under the
 * kernel configuration named by the surrounding preprocessor conditionals.
 */
BTF_SET_START(btf_id_deny)
BTF_ID_UNUSED
#ifdef CONFIG_SMP
BTF_ID(func, migrate_disable)
BTF_ID(func, migrate_enable)
#endif
#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
BTF_ID(func, rcu_read_unlock_strict)
#endif
#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
BTF_ID(func, preempt_count_add)
BTF_ID(func, preempt_count_sub)
#endif
#ifdef CONFIG_PREEMPT_RCU
BTF_ID(func, __rcu_read_lock)
BTF_ID(func, __rcu_read_unlock)
#endif
BTF_SET_END(btf_id_deny)

/*
 * Tell whether a program of this type (and, for tracing programs, this
 * expected attach type) is allowed to be sleepable.
 *
 * LSM, KPROBE and STRUCT_OPS programs always qualify. Tracing programs
 * qualify only for the FENTRY, FEXIT, MODIFY_RETURN and ITER attach types.
 * Every other program type is refused.
 */
static bool can_be_sleepable(struct bpf_prog *prog)
{
        switch (prog->type) {
        case BPF_PROG_TYPE_TRACING:
                return prog->expected_attach_type == BPF_TRACE_FENTRY ||
                       prog->expected_attach_type == BPF_TRACE_FEXIT ||
                       prog->expected_attach_type == BPF_MODIFY_RETURN ||
                       prog->expected_attach_type == BPF_TRACE_ITER;
        case BPF_PROG_TYPE_LSM:
        case BPF_PROG_TYPE_KPROBE: /* only for uprobes */
        case BPF_PROG_TYPE_STRUCT_OPS:
                return true;
        default:
                return false;
        }
}

static int check_attach_btf_id(struct bpf_verifier_env *env)
{
        struct bpf_prog *prog = env->prog;
        struct bpf_prog *tgt_prog = prog->aux->dst_prog;
        struct bpf_attach_target_info tgt_info = {};
        u32 btf_id = prog->aux->attach_btf_id;
        struct bpf_trampoline *tr;
        int ret;
        u64 key;

        if (prog->type == BPF_PROG_TYPE_SYSCALL) {
                if (prog->sleepable)
                        /* attach_btf_id checked to be zero already */
                        return 0;
                verbose(env, "Syscall programs can only be sleepable\n");
                return -EINVAL;
        }

        if (prog->sleepable && !can_be_sleepable(prog)) {
                verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
                return -EINVAL;
        }

        if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
                return check_struct_ops_btf_id(env);

        if (prog->type != BPF_PROG_TYPE_TRACING &&
            prog->type != BPF_PROG_TYPE_LSM &&
            prog->type != BPF_PROG_TYPE_EXT)
                return 0;

        ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info);
        if (ret)
                return ret;

        if (tgt_prog && prog->type == BPF_PROG_TYPE_EXT) {
                /* to make freplace equivalent to their targets, they need to
                 * inherit env->ops and expected_attach_type for the rest of the
                 * verification
                 */
                env->ops = bpf_verifier_ops[tgt_prog->type];
                prog->expected_attach_type = tgt_prog->expected_attach_type;
        }

        /* store info about the attachment target that will be used later */
        prog->aux->attach_func_proto = tgt_info.tgt_type;
        prog->aux->attach_func_name = tgt_info.tgt_name;
        prog->aux->mod = tgt_info.tgt_mod;

        if (tgt_prog) {
                prog->aux->saved_dst_prog_type = tgt_prog->type;
                prog->aux->saved_dst_attach_type = tgt_prog->expected_attach_type;
        }

        if (prog->expected_attach_type == BPF_TRACE_RAW_TP) {
                prog->aux->attach_btf_trace = true;
                return 0;
        } else if (prog->expected_attach_type == BPF_TRACE_ITER) {
                if (!bpf_iter_prog_supported(prog))
                        return -EINVAL;
                return 0;
        }

        if (prog->type == BPF_PROG_TYPE_LSM) {
                ret = bpf_lsm_verify_prog(&env->log, prog);
                if (ret < 0)
                        return ret;
        } else if (prog->type == BPF_PROG_TYPE_TRACING &&
                   btf_id_set_contains(&btf_id_deny, btf_id)) {
                return -EINVAL;
        }

        key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
        tr = bpf_trampoline_get(key, &tgt_info);
        if (!tr)
                return -ENOMEM;

        if (tgt_prog && tgt_prog->aux->tail_call_reachable)
                tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX;

        prog->aux->dst_trampoline = tr;
        return 0;
}

/*
 * Return the btf_vmlinux pointer, parsing the vmlinux BTF on first use.
 *
 * Parsing is attempted only when CONFIG_DEBUG_INFO_BTF is enabled and
 * btf_vmlinux is still unset. The pointer is tested again with
 * bpf_verifier_lock held before btf_parse_vmlinux() is called. Whatever
 * btf_vmlinux holds afterwards is returned as is.
 */
struct btf *bpf_get_btf_vmlinux(void)
{
        if (btf_vmlinux || !IS_ENABLED(CONFIG_DEBUG_INFO_BTF))
                return btf_vmlinux;

        mutex_lock(&bpf_verifier_lock);
        if (!btf_vmlinux)
                btf_vmlinux = btf_parse_vmlinux();
        mutex_unlock(&bpf_verifier_lock);

        return btf_vmlinux;
}

/*
 * bpf_check - verify the program in *@prog and apply the post-verification
 * rewrites.
 * @prog:       in/out program pointer; it is read at entry and reassigned
 *              from env->prog on the err_release_maps path
 *              (NOTE(review): presumably because instruction patching can
 *              replace env->prog - confirm)
 * @attr:       load attributes; log_level, log_buf, log_size, prog_flags and
 *              fd_array are read here, and @attr is passed on to helpers
 * @uattr:      pointer to the caller's copy of the attributes; log_true_size
 *              is written back through it
 * @uattr_size: size of the caller's attributes; the log_true_size write-back
 *              only happens when it is large enough to hold that field
 *
 * Returns 0 when the program passed verification and all later passes, or a
 * negative errno otherwise.
 */
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
        u64 start_time = ktime_get_ns();
        struct bpf_verifier_env *env;
        int i, len, ret = -EINVAL, err;
        u32 log_true_size;
        bool is_priv;

        /* no program is valid */
        if (ARRAY_SIZE(bpf_verifier_ops) == 0)
                return -EINVAL;

        /* 'struct bpf_verifier_env' can be global, but since it's not small,
         * allocate/free it every time bpf_check() is called
         */
        env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
        if (!env)
                return -ENOMEM;

        env->bt.env = env;

        /* one aux record per instruction, each tagged with its original index */
        len = (*prog)->len;
        env->insn_aux_data =
                vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));
        ret = -ENOMEM;
        if (!env->insn_aux_data)
                goto err_free_env;
        for (i = 0; i < len; i++)
                env->insn_aux_data[i].orig_idx = i;
        env->prog = *prog;
        env->ops = bpf_verifier_ops[env->prog->type];
        env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);

        /* capability-dependent settings, derived from the program's token */
        env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token);
        env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token);
        env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token);
        env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token);
        env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF);

        /* return value unused; the btf_vmlinux global is inspected below */
        bpf_get_btf_vmlinux();

        /* grab the mutex to protect few globals used by verifier */
        if (!is_priv)
                mutex_lock(&bpf_verifier_lock);

        /* user could have requested verbose verifier output
         * and supplied buffer to store the verification trace
         */
        ret = bpf_vlog_init(&env->log, attr->log_level,
                            (char __user *) (unsigned long) attr->log_buf,
                            attr->log_size);
        if (ret)
                goto err_unlock;

        mark_verifier_state_clean(env);

        if (IS_ERR(btf_vmlinux)) {
                /* Either gcc or pahole or kernel are broken. */
                verbose(env, "in-kernel BTF is malformed\n");
                ret = PTR_ERR(btf_vmlinux);
                goto skip_full_check;
        }

        /* BPF_F_ANY_ALIGNMENT is tested last and therefore overrides the rest */
        env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
                env->strict_alignment = true;
        if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
                env->strict_alignment = false;

        if (is_priv)
                env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
        env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;

        env->explored_states = kvcalloc(state_htab_size(env),
                                       sizeof(struct bpf_verifier_state_list *),
                                       GFP_USER);
        ret = -ENOMEM;
        if (!env->explored_states)
                goto skip_full_check;

        /* preparation steps; the first failure jumps to skip_full_check */
        ret = check_btf_info_early(env, attr, uattr);
        if (ret < 0)
                goto skip_full_check;

        ret = add_subprog_and_kfunc(env);
        if (ret < 0)
                goto skip_full_check;

        ret = check_subprogs(env);
        if (ret < 0)
                goto skip_full_check;

        ret = check_btf_info(env, attr, uattr);
        if (ret < 0)
                goto skip_full_check;

        ret = check_attach_btf_id(env);
        if (ret)
                goto skip_full_check;

        ret = resolve_pseudo_ldimm64(env);
        if (ret < 0)
                goto skip_full_check;

        if (bpf_prog_is_offloaded(env->prog->aux)) {
                ret = bpf_prog_offload_verifier_prep(env->prog);
                if (ret)
                        goto skip_full_check;
        }

        ret = check_cfg(env);
        if (ret < 0)
                goto skip_full_check;

        /* subprograms are only checked when the main program was accepted */
        ret = do_check_main(env);
        ret = ret ?: do_check_subprogs(env);

        if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux))
                ret = bpf_prog_offload_finalize(env);

skip_full_check:
        kvfree(env->explored_states);

        /* each pass below runs only while ret is still 0 */
        if (ret == 0)
                ret = check_max_stack_depth(env);

        /* instruction rewrites happen after this point */
        if (ret == 0)
                ret = optimize_bpf_loop(env);

        if (is_priv) {
                if (ret == 0)
                        opt_hard_wire_dead_code_branches(env);
                if (ret == 0)
                        ret = opt_remove_dead_code(env);
                if (ret == 0)
                        ret = opt_remove_nops(env);
        } else {
                if (ret == 0)
                        sanitize_dead_code(env);
        }

        if (ret == 0)
                /* program is valid, convert *(u32*)(ctx + off) accesses */
                ret = convert_ctx_accesses(env);

        if (ret == 0)
                ret = do_misc_fixups(env);

        /* do 32-bit optimization after insn patching has done so those patched
         * insns could be handled correctly.
         */
        if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) {
                ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
                env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
                                                                     : false;
        }

        if (ret == 0)
                ret = fixup_call_args(env);

        /* statistics are recorded for accepted and rejected programs alike */
        env->verification_time = ktime_get_ns() - start_time;
        print_verification_stats(env);
        env->prog->aux->verified_insns = env->insn_processed;

        /* preserve original error even if log finalization is successful */
        err = bpf_vlog_finalize(&env->log, &log_true_size);
        if (err)
                ret = err;

        /* report log_true_size back, if the caller's attr has room for it */
        if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
            copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
                                  &log_true_size, sizeof(log_true_size))) {
                ret = -EFAULT;
                goto err_release_maps;
        }

        if (ret)
                goto err_release_maps;

        if (env->used_map_cnt) {
                /* if program passed verifier, update used_maps in bpf_prog_info */
                env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
                                                          sizeof(env->used_maps[0]),
                                                          GFP_KERNEL);

                if (!env->prog->aux->used_maps) {
                        ret = -ENOMEM;
                        goto err_release_maps;
                }

                memcpy(env->prog->aux->used_maps, env->used_maps,
                       sizeof(env->used_maps[0]) * env->used_map_cnt);
                env->prog->aux->used_map_cnt = env->used_map_cnt;
        }
        if (env->used_btf_cnt) {
                /* if program passed verifier, update used_btfs in bpf_prog_aux */
                env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt,
                                                          sizeof(env->used_btfs[0]),
                                                          GFP_KERNEL);
                if (!env->prog->aux->used_btfs) {
                        ret = -ENOMEM;
                        goto err_release_maps;
                }

                memcpy(env->prog->aux->used_btfs, env->used_btfs,
                       sizeof(env->used_btfs[0]) * env->used_btf_cnt);
                env->prog->aux->used_btf_cnt = env->used_btf_cnt;
        }
        if (env->used_map_cnt || env->used_btf_cnt) {
                /* program is valid. Convert pseudo bpf_ld_imm64 into generic
                 * bpf_ld_imm64 instructions
                 */
                convert_pseudo_ld_imm64(env);
        }

        adjust_btf_func(env);

        /* reached on success as well as on failure */
err_release_maps:
        if (!env->prog->aux->used_maps)
                /* if we didn't copy map pointers into bpf_prog_info, release
                 * them now. Otherwise free_used_maps() will release them.
                 */
                release_maps(env);
        if (!env->prog->aux->used_btfs)
                release_btfs(env);

        /* extension progs temporarily inherit the attach_type of their
         * targets for verification purposes, so set it back to zero before
         * returning
         */
        if (env->prog->type == BPF_PROG_TYPE_EXT)
                env->prog->expected_attach_type = 0;

        *prog = env->prog;

        module_put(env->attach_btf_mod);
err_unlock:
        /* mirrors the conditional mutex_lock() taken above */
        if (!is_priv)
                mutex_unlock(&bpf_verifier_lock);
        vfree(env->insn_aux_data);
err_free_env:
        kfree(env);
        return ret;
}























































































































    2 






    2 





    2 








































































































































































































































































































































































































































































































































































































































































    2 



































    1 







    2 

    2 





    2 























































































































































    1 
















    1 









































































































































    1 





    1 



















































    1 

























































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
 */
#include <linux/sched.h>                /* test_thread_flag(), ...        */
#include <linux/sched/task_stack.h>        /* task_stack_*(), ...                */
#include <linux/kdebug.h>                /* oops_begin/end, ...                */
#include <linux/extable.h>                /* search_exception_tables        */
#include <linux/memblock.h>                /* max_low_pfn                        */
#include <linux/kfence.h>                /* kfence_handle_page_fault        */
#include <linux/kprobes.h>                /* NOKPROBE_SYMBOL, ...                */
#include <linux/mmiotrace.h>                /* kmmio_handler, ...                */
#include <linux/perf_event.h>                /* perf_sw_event                */
#include <linux/hugetlb.h>                /* hstate_index_to_shift        */
#include <linux/prefetch.h>                /* prefetchw                        */
#include <linux/context_tracking.h>        /* exception_enter(), ...        */
#include <linux/uaccess.h>                /* faulthandler_disabled()        */
#include <linux/efi.h>                        /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>
#include <linux/mm.h>                        /* find_and_lock_vma() */
#include <linux/vmalloc.h>

#include <asm/cpufeature.h>                /* boot_cpu_has, ...                */
#include <asm/traps.h>                        /* dotraplinkage, ...                */
#include <asm/fixmap.h>                        /* VSYSCALL_ADDR                */
#include <asm/vsyscall.h>                /* emulate_vsyscall                */
#include <asm/vm86.h>                        /* struct vm86                        */
#include <asm/mmu_context.h>                /* vma_pkey()                        */
#include <asm/efi.h>                        /* efi_crash_gracefully_on_page_fault()*/
#include <asm/desc.h>                        /* store_idt(), ...                */
#include <asm/cpu_entry_area.h>                /* exception stack                */
#include <asm/pgtable_areas.h>                /* VMALLOC_START, ...                */
#include <asm/kvm_para.h>                /* kvm_handle_async_pf                */
#include <asm/vdso.h>                        /* fixup_vdso_exception()        */
#include <asm/irq_stack.h>
#include <asm/fred.h>
#include <asm/sev.h>                        /* snp_dump_hva_rmpentry()        */

#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>

/*
 * Returns 0 if mmiotrace is disabled, or if the fault is not
 * handled by mmiotrace:
 */
static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
        /*
         * The mmiotrace handler is only consulted while tracing is active;
         * a handler result of 1 is reported to the caller as -1.
         */
        if (unlikely(is_kmmio_active()) && kmmio_handler(regs, addr) == 1)
                return -1;

        return 0;
}

/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 *   Check that here and ignore it.  This is AMD erratum #91.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
                      unsigned char opcode, int *prefetch)
{
        /*
         * @opcode is the byte just fetched, @instr points at the byte that
         * follows it.  A non-zero return tells the caller to keep scanning
         * (the byte was a prefix); zero tells it to stop.  *prefetch is only
         * written when the two-byte opcode could be examined.
         */
        const unsigned char hi = opcode & 0xf0;
        const unsigned char lo = opcode & 0x0f;
        unsigned char next;

        switch (hi) {
        case 0x20:
        case 0x30:
                /*
                 * 0x26, 0x2E, 0x36 and 0x3E are valid x86 prefixes.
                 * In X86_64 long mode the CPU signals invalid opcode
                 * when some of these prefixes are present, so X86_64
                 * never gets here anyway.
                 */
                return (lo & 7) == 0x6;
#ifdef CONFIG_X86_64
        case 0x40:
                /* 0x40..0x4F are valid REX prefixes in 64-bit mode. */
                return !user_mode(regs) || user_64bit_mode(regs);
#endif
        case 0x60:
                /* 0x64 through 0x67 are valid prefixes in all modes. */
                return lo >= 0x4 && lo <= 0x7;
        case 0xF0:
                /* 0xF0, 0xF2 and 0xF3 are valid prefixes in all modes. */
                return lo == 0x0 || lo == 0x2 || lo == 0x3;
        case 0x00:
                /* The prefetch instruction is 0x0F0D or 0x0F18. */
                if (get_kernel_nofault(next, instr))
                        return 0;

                *prefetch = (lo == 0xF) && (next == 0x0D || next == 0x18);
                return 0;
        default:
                return 0;
        }
}

static bool is_amd_k8_pre_npt(void)
{
        struct cpuinfo_x86 *c = &boot_cpu_data;

        return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
                        c->x86_vendor == X86_VENDOR_AMD &&
                        c->x86 == 0xf && c->x86_model < 0x40);
}

static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
        unsigned char *cursor;
        unsigned char *limit;
        int found = 0;

        /* Erratum #91 affects AMD K8, pre-NPT CPUs only. */
        if (!is_amd_k8_pre_npt())
                return 0;

        /*
         * An exec (instruction fetch) fault on an NX page must never be
         * ignored.
         */
        if (error_code & X86_PF_INSTR)
                return 0;

        /* Scan at most 15 bytes starting at the faulting instruction. */
        cursor = (void *)convert_ip_to_linear(current, regs);
        limit = cursor + 15;

        /*
         * Historically this code has always bailed out when IP points to a
         * not-present page (e.g. due to a race), and nobody has complained.
         */
        pagefault_disable();

        while (cursor < limit) {
                unsigned char byte;
                long err;

                if (user_mode(regs))
                        err = get_user(byte, (unsigned char __user *) cursor);
                else
                        err = get_kernel_nofault(byte, cursor);
                if (err)
                        break;

                /* The checker gets a pointer to the byte after this one. */
                cursor++;

                if (!check_prefetch_opcode(regs, cursor, byte, &found))
                        break;
        }

        pagefault_enable();
        return found;
}

/*
 * pgd_lock is held while pgd_list is walked; the list entries are struct
 * pages linked through their ->lru member (see the 32-bit
 * arch_sync_kernel_mappings() in this file for one such walk).
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
/*
 * Copy the PMD entry covering @address from the reference page table
 * (init_mm) into the page table rooted at @pgd when their present state
 * differs.  Returns the reference PMD, or NULL when any level of the
 * reference table is not present for @address.
 */
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
        unsigned idx = pgd_index(address);
        pgd_t *task_pgd = pgd + idx;
        pgd_t *ref_pgd = init_mm.pgd + idx;
        p4d_t *task_p4d, *ref_p4d;
        pud_t *task_pud, *ref_pud;
        pmd_t *task_pmd, *ref_pmd;

        if (!pgd_present(*ref_pgd))
                return NULL;

        /*
         * A set_pgd(task_pgd, *ref_pgd) here would be useless on PAE and
         * redundant with the set_pmd() below on non-PAE.  The same goes
         * for set_p4d/set_pud.
         */
        task_p4d = p4d_offset(task_pgd, address);
        ref_p4d = p4d_offset(ref_pgd, address);
        if (!p4d_present(*ref_p4d))
                return NULL;

        task_pud = pud_offset(task_p4d, address);
        ref_pud = pud_offset(ref_p4d, address);
        if (!pud_present(*ref_pud))
                return NULL;

        task_pmd = pmd_offset(task_pud, address);
        ref_pmd = pmd_offset(ref_pud, address);

        if (pmd_present(*task_pmd) != pmd_present(*ref_pmd))
                set_pmd(task_pmd, *ref_pmd);

        if (!pmd_present(*ref_pmd))
                return NULL;

        /* Both tables must now point at the same page frame. */
        BUG_ON(pmd_pfn(*task_pmd) != pmd_pfn(*ref_pmd));

        return ref_pmd;
}

/*
 *   Handle a fault on the vmalloc or module mapping area
 *
 *   This is needed because there is a race condition between the time
 *   when the vmalloc mapping code updates the PMD to the point in time
 *   where it synchronizes this update with the other page-tables in the
 *   system.
 *
 *   In this race window another thread/CPU can map an area on the same
 *   PMD, finds it already present and does not synchronize it with the
 *   rest of the system yet. As a result v[mz]alloc might return areas
 *   which are not mapped in every page-table in the system, causing an
 *   unhandled page-fault when they are accessed.
 */
static noinline int vmalloc_fault(unsigned long address)
{
        pmd_t *pmd_k;
        pte_t *pte_k;

        /* Only addresses inside the vmalloc area are handled here. */
        if (address < VMALLOC_START || address >= VMALLOC_END)
                return -1;

        /*
         * Synchronize the top level page-table that is currently loaded
         * with the 'reference' page table.
         *
         * "current" must _not_ be used for this: we might be inside an
         * interrupt in the middle of a task switch.
         */
        pmd_k = vmalloc_sync_one(__va(read_cr3_pa()), address);
        if (!pmd_k)
                return -1;

        /* A leaf PMD has no PTE level to check. */
        if (pmd_leaf(*pmd_k))
                return 0;

        pte_k = pte_offset_kernel(pmd_k, address);

        return pte_present(*pte_k) ? 0 : -1;
}
NOKPROBE_SYMBOL(vmalloc_fault);

/*
 * Run vmalloc_sync_one() on every page table found on pgd_list, one PMD-sized
 * step at a time, starting at the PMD containing @start.
 *
 * NOTE(review): @end is not consulted; the walk continues while the address
 * stays at or above TASK_SIZE_MAX and below VMALLOC_END.
 */
void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
        unsigned long addr = start & PMD_MASK;

        while (addr >= TASK_SIZE_MAX && addr < VMALLOC_END) {
                struct page *page;

                spin_lock(&pgd_lock);
                list_for_each_entry(page, &pgd_list, lru) {
                        /* the pgt_lock only for Xen */
                        spinlock_t *pgt_lock =
                                &pgd_page_get_mm(page)->page_table_lock;

                        spin_lock(pgt_lock);
                        vmalloc_sync_one(page_address(page), addr);
                        spin_unlock(pgt_lock);
                }
                spin_unlock(&pgd_lock);

                addr += PMD_SIZE;
        }
}

/* True when @pfn lies below max_low_pfn. */
static bool low_pfn(unsigned long pfn)
{
        return max_low_pfn > pfn;
}

/*
 * Print the page-table entries that map @address in the page table
 * currently loaded in CR3 (32-bit variant).  The output is a single log
 * line: an optional "*pdpt" entry on PAE, then "*pde", then "*pte" when
 * the PTE page can be read safely.
 */
static void dump_pagetable(unsigned long address)
{
        pgd_t *base = __va(read_cr3_pa());
        pgd_t *pgd = &base[pgd_index(address)];
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

#ifdef CONFIG_X86_PAE
        pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
        if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
                goto out;
        /* The pdpt print already opened the line, so the PDE continues it. */
#define pr_pde pr_cont
#else
        /* Without PAE the PDE is the first thing printed on the line. */
#define pr_pde pr_info
#endif
        p4d = p4d_offset(pgd, address);
        pud = pud_offset(p4d, address);
        pmd = pmd_offset(pud, address);
        /* Field width is two hex digits per byte of the entry. */
        pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
#undef pr_pde

        /*
         * We must not directly access the pte in the highpte
         * case if the page table is located in highmem.
         * And let's rather not kmap-atomic the pte, just in case
         * it's allocated already:
         */
        if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_leaf(*pmd))
                goto out;

        pte = pte_offset_kernel(pmd, address);
        pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
        /* Terminate the log line on every path. */
        pr_cont("\n");
}

#else /* CONFIG_X86_64: */

#ifdef CONFIG_CPU_SUP_AMD
/*
 * KERN_ERR-level message that is_errata93() emits via printk_once() when
 * it applies the K8 erratum #93 workaround.
 */
static const char errata93_warning[] =
KERN_ERR 
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/*
 * Try to read one unsigned long from kernel address @p without faulting.
 * The get_kernel_nofault() result is passed through; callers in this file
 * treat a non-zero value as "unreadable".
 */
static int bad_address(void *p)
{
        unsigned long probe;
        long rc = get_kernel_nofault(probe, (unsigned long *)p);

        return rc;
}

/*
 * Print the page-table entries that map @address in the page table
 * currently loaded in CR3 (64-bit variant).  Each level is probed with
 * bad_address() before it is dereferenced; the walk stops at the first
 * entry that is not present or is a leaf.
 */
static void dump_pagetable(unsigned long address)
{
        pgd_t *top = __va(read_cr3_pa());
        pgd_t *pgd = &top[pgd_index(address)];
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (bad_address(pgd))
                goto unreadable;

        pr_info("PGD %lx ", pgd_val(*pgd));
        if (!pgd_present(*pgd))
                goto finish;

        p4d = p4d_offset(pgd, address);
        if (bad_address(p4d))
                goto unreadable;

        pr_cont("P4D %lx ", p4d_val(*p4d));
        if (!p4d_present(*p4d) || p4d_leaf(*p4d))
                goto finish;

        pud = pud_offset(p4d, address);
        if (bad_address(pud))
                goto unreadable;

        pr_cont("PUD %lx ", pud_val(*pud));
        if (!pud_present(*pud) || pud_leaf(*pud))
                goto finish;

        pmd = pmd_offset(pud, address);
        if (bad_address(pmd))
                goto unreadable;

        pr_cont("PMD %lx ", pmd_val(*pmd));
        if (!pmd_present(*pmd) || pmd_leaf(*pmd))
                goto finish;

        pte = pte_offset_kernel(pmd, address);
        if (bad_address(pte))
                goto unreadable;

        pr_cont("PTE %lx", pte_val(*pte));
finish:
        /* Terminate the line started by the PGD print. */
        pr_cont("\n");
        return;
unreadable:
        pr_info("BAD\n");
}

#endif /* CONFIG_X86_64 */

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 *
 * A lot of BIOS that didn't get tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 * Try to work around it here.
 *
 * Note we only handle faults in kernel here.
 * Does nothing on 32-bit.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
        bool in_kernel_text, in_modules;

        /* Only AMD family 0xf CPUs are considered. */
        if (!(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
              boot_cpu_data.x86 == 0xf))
                return 0;

        /*
         * The fault must come from kernel mode, the faulting address must
         * be the saved instruction pointer, and its upper 32 bits must be
         * clear.
         */
        if (user_mode(regs) || address != regs->ip || (address >> 32) != 0)
                return 0;

        /* Restore the upper half and see whether that lands in kernel code. */
        address |= 0xffffffffUL << 32;

        in_kernel_text = address >= (u64)_stext && address <= (u64)_etext;
        in_modules = address >= MODULES_VADDR && address <= MODULES_END;
        if (!in_kernel_text && !in_modules)
                return 0;

        printk_once(errata93_warning);
        regs->ip = address;
        return 1;
#else
        return 0;
#endif
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
 * to illegal addresses >4GB.
 *
 * We catch this in the page fault handler because these addresses
 * are not reachable. Just detect this case and return.  Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
        /* CS is either __USER32_CS or has bit 2 set. */
        bool compat_cs = regs->cs == __USER32_CS || (regs->cs & (1<<2));

        /* Such a code segment cannot reach an address above 4GB. */
        if (compat_cs && (address >> 32))
                return 1;
#endif
        return 0;
}

/* Pentium F0 0F C7 C8 bug workaround: */
static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
                       unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
        /* Only relevant on CPUs flagged with the F00F bug. */
        if (!boot_cpu_has_bug(X86_BUG_F00F))
                return 0;

        /* Faults with the user bit set in the error code are not handled. */
        if (error_code & X86_PF_USER)
                return 0;

        if (!idt_is_f00f_address(address))
                return 0;

        /* Treat the fault as an invalid-opcode exception instead. */
        handle_invalid_op(regs);
        return 1;
#else
        return 0;
#endif
}

/*
 * Print the base and limit of the LDT/TSS descriptor that selector @index
 * refers to in the descriptor table described by @gdt.  @name labels the
 * output line.  The descriptor is read with copy_from_kernel_nofault(), so
 * an unreadable table is reported instead of faulting.
 */
static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
{
        struct ldttss_desc entry;
        unsigned long base;
        u32 byte_off;
        u32 limit;

        if (index == 0) {
                pr_alert("%s: NULL\n", name);
                return;
        }

        /* Selector bits 3 and up give the descriptor slot number. */
        byte_off = (index >> 3) * sizeof(struct desc_struct);

        /*
         * NOTE(review): gdt->size is printed as "limit" by the caller in this
         * file; if it is a limit rather than a byte count, this comparison
         * is conservative for descriptors at the very end -- confirm.
         */
        if (byte_off + sizeof(entry) >= gdt->size) {
                pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
                return;
        }

        if (copy_from_kernel_nofault(&entry,
                                     (void *)(gdt->address + byte_off),
                                     sizeof(entry))) {
                pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
                         name, index);
                return;
        }

        /* Reassemble the base address from its split descriptor fields. */
        base = entry.base0 | (entry.base1 << 16);
        base |= (unsigned long)entry.base2 << 24;
#ifdef CONFIG_X86_64
        base |= ((u64)entry.base3 << 32);
#endif
        limit = entry.limit0 | (entry.limit1 << 16);

        pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
                 name, index, base, limit);
}

/*
 * Print the human-readable description of a fatal page fault: NX/SMEP
 * hints for instruction fetches, the "BUG:" headline, a decoded error
 * code, descriptor-table details for implicit supervisor accesses from
 * user mode, and finally the page-table walk for @address.
 * Does nothing when oops_may_print() says output is suppressed.
 */
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
        const char *origin, *access, *cpu_mode, *reason;

        if (!oops_may_print())
                return;

        if (error_code & X86_PF_INSTR) {
                unsigned int level;
                bool nx, rw;
                pgd_t *pgd = __va(read_cr3_pa());
                pte_t *pte;

                pgd += pgd_index(address);
                pte = lookup_address_in_pgd_attr(pgd, address, &level, &nx, &rw);

                if (pte && pte_present(*pte)) {
                        if (!pte_exec(*pte) || nx) {
                                pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
                                        from_kuid(&init_user_ns, current_uid()));
                        } else if ((pgd_flags(*pgd) & _PAGE_USER) &&
                                   (__read_cr4() & X86_CR4_SMEP)) {
                                /* Executable mapping, but user-owned with SMEP on. */
                                pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
                                        from_kuid(&init_user_ns, current_uid()));
                        }
                }
        }

        if (address < PAGE_SIZE && !user_mode(regs))
                pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
                        (void *)address);
        else
                pr_alert("BUG: unable to handle page fault for address: %px\n",
                        (void *)address);

        /* Decode who accessed what, and in which CPU mode. */
        origin = (error_code & X86_PF_USER) ? "user" : "supervisor";

        if (error_code & X86_PF_INSTR)
                access = "instruction fetch";
        else if (error_code & X86_PF_WRITE)
                access = "write access";
        else
                access = "read access";

        cpu_mode = user_mode(regs) ? "user" : "kernel";

        pr_alert("#PF: %s %s in %s mode\n", origin, access, cpu_mode);

        /* Decode why the access faulted; the first matching bit wins. */
        if (!(error_code & X86_PF_PROT))
                reason = "not-present page";
        else if (error_code & X86_PF_RSVD)
                reason = "reserved bit violation";
        else if (error_code & X86_PF_PK)
                reason = "protection keys violation";
        else if (error_code & X86_PF_RMP)
                reason = "RMP violation";
        else
                reason = "permissions violation";

        pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code, reason);

        if (!(error_code & X86_PF_USER) && user_mode(regs)) {
                struct desc_ptr idt, gdt;
                u16 ldtr, tr;

                /*
                 * A supervisor access while the saved registers say user
                 * mode can have quite a few causes.  The more obvious ones
                 * are faults accessing the GDT or LDT.  Perhaps
                 * surprisingly, if the CPU tries to deliver a benign or
                 * contributory exception from user code and takes a page
                 * fault during delivery, that page fault can be delivered
                 * as though it originated directly from user code.  Wrong
                 * permissions on the IDT, GDT, LDT, TSS, or kernel or IST
                 * stack could lead to this.
                 */
                store_idt(&idt);

                /* Usable even on Xen PV -- it's just slow. */
                native_store_gdt(&gdt);

                pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
                         idt.address, idt.size, gdt.address, gdt.size);

                store_ldt(ldtr);
                show_ldttss(&gdt, "LDTR", ldtr);

                store_tr(tr);
                show_ldttss(&gdt, "TR", tr);
        }

        dump_pagetable(address);

        if (error_code & X86_PF_RMP)
                snp_dump_hva_rmpentry(address);
}

/*
 * Report a corrupted page table for @address: print the task name and the
 * page-table walk, then run the "Bad pagetable" die sequence.
 */
static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
            unsigned long address)
{
        unsigned long flags = oops_begin();
        struct task_struct *tsk = current;
        int sig;

        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
               tsk->comm, address);
        dump_pagetable(address);

        /* A non-zero __die() result means no signal is handed to oops_end(). */
        sig = __die("Bad pagetable", regs, error_code) ? 0 : SIGKILL;

        oops_end(flags, regs, sig);
}

/*
 * Set X86_PF_PROT in *@error_code when @address is at or above
 * TASK_SIZE_MAX; lower addresses leave the error code untouched.
 */
static void sanitize_error_code(unsigned long address,
                                unsigned long *error_code)
{
        if (address < TASK_SIZE_MAX)
                return;

        /*
         * Pretend that user-mode accesses to kernel addresses are always
         * protection faults, so that nothing about the kernel page table
         * layout leaks out.
         *
         * NB: failed vsyscalls with vsyscall=none therefore carry the PROT
         * bit too.  That leaks no information and does not appear to cause
         * any problems.
         */
        *error_code |= X86_PF_PROT;
}

static void set_signal_archinfo(unsigned long address,
                                unsigned long error_code)
{
        struct task_struct *tsk = current;

        tsk->thread.trap_nr = X86_TRAP_PF;
        tsk->thread.error_code = error_code | X86_PF_USER;
        tsk->thread.cr2 = address;
}

/*
 * Last-resort handling of a page fault that nothing else could resolve.
 *
 * For faults taken in kernel mode this first checks for a stack-guard hit
 * (CONFIG_VMAP_STACK), gives the EFI code a chance to react, and lets KFENCE
 * claim not-present faults.  If none of those take over, or if the fault
 * came from user mode, the oops is printed and oops_end() is called.
 */
static noinline void
page_fault_oops(struct pt_regs *regs, unsigned long error_code,
                unsigned long address)
{
#ifdef CONFIG_VMAP_STACK
        struct stack_info info;
#endif
        unsigned long flags;
        int sig;

        if (user_mode(regs)) {
                /*
                 * Implicit kernel access from user mode?  Skip the stack
                 * overflow and EFI special cases.
                 */
                goto oops;
        }

#ifdef CONFIG_VMAP_STACK
        /*
         * Stack overflow?  During boot, we can fault near the initial
         * stack in the direct map, but that's not an overflow -- check
         * that we're in vmalloc space to avoid this.
         */
        if (is_vmalloc_addr((void *)address) &&
            get_stack_guard_info((void *)address, &info)) {
                /*
                 * We're likely to be running with very little stack space
                 * left.  It's plausible that we'd hit this condition but
                 * double-fault even before we get this far, in which case
                 * we're fine: the double-fault handler will deal with it.
                 *
                 * We don't want to make it all the way into the oops code
                 * and then double-fault, though, because we're likely to
                 * break the console driver and lose most of the stack dump.
                 */
                call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*),
                              handle_stack_overflow,
                              ASM_CALL_ARG3,
                              , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));

                /* handle_stack_overflow() is not expected to come back. */
                unreachable();
        }
#endif

        /*
         * Buggy firmware could access regions which might page fault.  If
         * this happens, EFI has a special OOPS path that will try to
         * avoid hanging the system.
         */
        if (IS_ENABLED(CONFIG_EFI))
                efi_crash_gracefully_on_page_fault(address);

        /* Only not-present faults should be handled by KFENCE. */
        if (!(error_code & X86_PF_PROT) &&
            kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
                return;

oops:
        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice:
         */
        flags = oops_begin();

        show_fault_oops(regs, error_code, address);

        if (task_stack_end_corrupted(current))
                printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

        /* A non-zero __die() result means no signal is handed to oops_end(). */
        sig = SIGKILL;
        if (__die("Oops", regs, error_code))
                sig = 0;

        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_DEFAULT "CR2: %016lx\n", address);

        oops_end(flags, regs, sig);
}

/*
 * Handle a fault taken in kernel mode: try the exception fixup first, then
 * the prefetch erratum check, and oops when neither applies.
 * @signal, @si_code and @pkey are accepted but not used here.
 */
static noinline void
kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
                         unsigned long address, int signal, int si_code,
                         u32 pkey)
{
        WARN_ON_ONCE(user_mode(regs));

        /*
         * Done if the kernel was prepared to handle this fault, or if it is
         * the spurious page fault on a PREFETCH instruction that AMD
         * erratum #91 produces.
         */
        if (fixup_exception(regs, X86_TRAP_PF, error_code, address) ||
            is_prefetch(regs, error_code, address))
                return;

        page_fault_oops(regs, error_code, address);
}

/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set:
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
                unsigned long address, struct task_struct *tsk)
{
        /* Tasks with pid 1 or lower are logged at KERN_EMERG, others at KERN_INFO. */
        const char *level = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
        /* A racy snapshot of the CPU number, but better than nothing. */
        int cpu_id = raw_smp_processor_id();

        /* Skip handled signals, and honour the printk rate limit. */
        if (!unhandled_signal(tsk, SIGSEGV) || !printk_ratelimit())
                return;

        printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
                level, tsk->comm, task_pid_nr(tsk), address,
                (void *)regs->ip, (void *)regs->sp, error_code);

        print_vma_addr(KERN_CONT " in ", regs->ip);

        /*
         * Name the CPU the fatal segfault most likely happened on; this
         * can help identify faulty hardware.
         */
        printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu_id,
               topology_core_id(cpu_id), topology_physical_package_id(cpu_id));

        printk(KERN_CONT "\n");

        show_opcodes(regs, level);
}

/*
 * Deliver the outcome of a bad-area fault when no mm lock is held.
 *
 * Kernel-mode faults go to kernelmode_fixup_or_oops(); implicit supervisor
 * accesses from user mode oops; genuine user accesses get a SIGSEGV (or a
 * pkey error signal when @si_code is SEGV_PKUERR) unless one of the erratum
 * or vDSO fixups takes care of the fault first.
 */
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                       unsigned long address, u32 pkey, int si_code)
{
        struct task_struct *task = current;
        void __user *fault_addr = (void __user *)address;

        if (!user_mode(regs)) {
                kernelmode_fixup_or_oops(regs, error_code, address,
                                         SIGSEGV, si_code, pkey);
                return;
        }

        if (!(error_code & X86_PF_USER)) {
                /* Implicit user access to kernel memory -- just oops */
                page_fault_oops(regs, error_code, address);
                return;
        }

        /*
         * From here on this is a user mode access, which just causes a
         * SIGSEGV.  Interrupts may be off at this point:
         */
        local_irq_enable();

        /*
         * Taking another page fault is fine here because this one came
         * from user space.
         */
        if (is_prefetch(regs, error_code, address) ||
            is_errata100(regs, address))
                return;

        sanitize_error_code(address, &error_code);

        if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
                return;

        if (likely(show_unhandled_signals))
                show_signal_msg(regs, error_code, address, task);

        set_signal_archinfo(address, error_code);

        if (si_code != SEGV_PKUERR)
                force_sig_fault(SIGSEGV, si_code, fault_addr);
        else
                force_sig_pkuerr(fault_addr, pkey);

        local_irq_disable();
}

static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                     unsigned long address)
{
        __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
}

/*
 * Release whichever read-side lock the caller holds and then report the bad
 * access: mmap_read_unlock() when @mm is non-NULL, vma_end_read() on @vma
 * otherwise.
 */
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
           unsigned long address, struct mm_struct *mm,
           struct vm_area_struct *vma, u32 pkey, int si_code)
{
        /*
         * Something tried to access memory that isn't in our memory map.
         * Drop the lock first; whether the access came from kernel or user
         * mode is sorted out by the callee.
         */
        if (!mm)
                vma_end_read(vma);
        else
                mmap_read_unlock(mm);

        __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
}

/*
 * Decide whether an access error should be attributed to protection keys:
 * never when OSPKE is not enabled, always when the hardware set X86_PF_PK,
 * and otherwise when the VMA's key-based permissions reject the access.
 */
static inline bool bad_area_access_from_pkeys(unsigned long error_code,
                struct vm_area_struct *vma)
{
        if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
                return false;

        if (error_code & X86_PF_PK)
                return true;

        /*
         * Check the permission keys on the VMA.  This code is always called
         * on the current mm, so the access is never a foreign one.
         */
        return !arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
                                          (error_code & X86_PF_INSTR), false);
}

/*
 * Report an access (permission) error on an existing VMA.
 *
 * Picks SEGV_PKUERR plus the VMA's pkey when the failure is attributed to
 * protection keys, SEGV_ACCERR with pkey 0 otherwise, and hands off to
 * __bad_area(), which also releases the lock described by @mm / @vma.
 */
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
                      unsigned long address, struct mm_struct *mm,
                      struct vm_area_struct *vma)
{
        int si_code = SEGV_ACCERR;
        u32 pkey = 0;

        /*
         * The OSPKE check inside bad_area_access_from_pkeys() is not
         * strictly required at runtime, but it lets the compiler drop this
         * branch when pkeys are compiled out.
         */
        if (bad_area_access_from_pkeys(error_code, vma)) {
                /*
                 * A protection key fault means the PKRU value denied access
                 * to some PTE.  Userspace can recover PKRU from the XSAVE
                 * state; the pkey captured from the vma here tells it which
                 * protection key was set on the PTE.
                 *
                 * Getting here means the hardware signaled X86_PF_PK and a
                 * VMA existed once the fault handler ran.  It does *not*
                 * guarantee that this VMA is the one that was faulted on:
                 *
                 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
                 * 2. T1   : set PKRU to deny access to pkey=4, touches page
                 * 3. T1   : faults...
                 * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
                 * 5. T1   : enters fault handler, takes mmap_lock, etc...
                 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
                 *             faulted on a pte with its pkey=4.
                 */
                pkey = vma_pkey(vma);
                si_code = SEGV_PKUERR;
        }

        __bad_area(regs, error_code, address, mm, vma, pkey, si_code);
}

/*
 * Deliver SIGBUS for a fault at @address.
 *
 * A fault taken in kernel mode is handed to kernelmode_fixup_or_oops()
 * instead of signalling.  In user mode the function returns without a
 * signal when is_prefetch() or fixup_vdso_exception() report the fault as
 * dealt with.  With CONFIG_MEMORY_FAILURE, a hardware-poison @fault is
 * logged and reported through force_sig_mceerr(); every other case ends in
 * a plain BUS_ADRERR SIGBUS.
 */
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
          vm_fault_t fault)
{
        /* Kernel mode? Handle exceptions or die: */
        if (!user_mode(regs)) {
                kernelmode_fixup_or_oops(regs, error_code, address,
                                         SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY);
                return;
        }

        /* User-space => ok to do another page fault: */
        if (is_prefetch(regs, error_code, address))
                return;

        /* May rewrite error_code in place before it is used below. */
        sanitize_error_code(address, &error_code);

        if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
                return;

        set_signal_archinfo(address, error_code);

#ifdef CONFIG_MEMORY_FAILURE
        if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
                struct task_struct *tsk = current;
                unsigned lsb = 0;

                pr_err(
        "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
                        tsk->comm, tsk->pid, address);
                /*
                 * lsb is the shift passed on to force_sig_mceerr(): derived
                 * from the hstate index for the LARGE flavour, PAGE_SHIFT for
                 * the plain one.  The plain check runs second, so it wins if
                 * both bits are set.
                 */
                if (fault & VM_FAULT_HWPOISON_LARGE)
                        lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
                if (fault & VM_FAULT_HWPOISON)
                        lsb = PAGE_SHIFT;
                force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
                return;
        }
#endif
        force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
}

/*
 * Check whether the page table entry at @pte permits the access described
 * by @error_code.
 *
 * Returns 0 when a write fault hits an entry that is not writable, or an
 * instruction fetch hits an entry that is not executable; 1 otherwise.
 */
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
        /* Each permission is only inspected if the matching fault bit is set. */
        return (!(error_code & X86_PF_WRITE) || pte_write(*pte)) &&
               (!(error_code & X86_PF_INSTR) || pte_exec(*pte));
}

/*
 * Handle a spurious fault caused by a stale TLB entry.
 *
 * This allows us to lazily refresh the TLB when increasing the
 * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
 * eagerly is very expensive since that implies doing a full
 * cross-processor TLB flush, even if no stale TLB entries exist
 * on other processors.
 *
 * Spurious faults may only occur if the TLB contains an entry with
 * fewer permission than the page table entry.  Non-present (P = 0)
 * and reserved bit (R = 1) faults are never spurious.
 *
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 *
 * Returns non-zero if a spurious fault was handled, zero otherwise.
 *
 * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
 * (Optional Invalidation).
 */
static noinline int
spurious_kernel_fault(unsigned long error_code, unsigned long address)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        int ret;

        /*
         * Only writes to RO or instruction fetches from NX may cause
         * spurious faults.
         *
         * These could be from user or supervisor accesses but the TLB
         * is only lazily flushed after a kernel mapping protection
         * change, so user accesses are not expected to cause spurious
         * faults.
         */
        if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
            error_code != (X86_PF_INSTR | X86_PF_PROT))
                return 0;

        /*
         * Walk init_mm's page tables for @address one level at a time.
         * A non-present entry at any level means the fault is not spurious;
         * a leaf (large) entry is permission-checked directly at that level.
         */
        pgd = init_mm.pgd + pgd_index(address);
        if (!pgd_present(*pgd))
                return 0;

        p4d = p4d_offset(pgd, address);
        if (!p4d_present(*p4d))
                return 0;

        if (p4d_leaf(*p4d))
                return spurious_kernel_fault_check(error_code, (pte_t *) p4d);

        pud = pud_offset(p4d, address);
        if (!pud_present(*pud))
                return 0;

        if (pud_leaf(*pud))
                return spurious_kernel_fault_check(error_code, (pte_t *) pud);

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return 0;

        if (pmd_leaf(*pmd))
                return spurious_kernel_fault_check(error_code, (pte_t *) pmd);

        pte = pte_offset_kernel(pmd, address);
        if (!pte_present(*pte))
                return 0;

        /* The PTE itself must already allow the access. */
        ret = spurious_kernel_fault_check(error_code, pte);
        if (!ret)
                return 0;

        /*
         * Make sure we have permissions in PMD.
         * If not, then there's a bug in the page tables:
         */
        ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
        WARN_ONCE(!ret, "PMD has incorrect permission bits\n");

        return ret;
}
NOKPROBE_SYMBOL(spurious_kernel_fault);

/*
 * Non-zero (the default) makes the bad-area path call show_signal_msg()
 * before forcing the signal; see the likely(show_unhandled_signals) test.
 */
int show_unhandled_signals = 1;

/*
 * Decide whether the access described by @error_code is an error for @vma.
 *
 * Returns 1 when the fault must be treated as an access error and 0 when
 * the access is compatible with the VMA and the fault can be handled.
 */
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
        /* Only the current mm is ever checked here, hence never foreign. */
        bool foreign = false;

        /*
         * A read or write blocked by protection keys is unconditionally an
         * error; no follow-up action (such as COW) can resolve it.
         */
        if (error_code & X86_PF_PK)
                return 1;

        /*
         * SGX hardware blocked the access, typically because the enclave
         * memory contents were destroyed (e.g. by a suspend/resume cycle).
         * The kernel cannot repair that, so report an access error even
         * though no real access violation occurred; userspace can rebuild
         * the enclave in response to the signal.
         */
        if (unlikely(error_code & X86_PF_SGX))
                return 1;

        /*
         * Consult the VMA up front so that we do not service a fault only
         * to hit X86_PF_PK the moment the page is filled in.
         */
        if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
                                       (error_code & X86_PF_INSTR), foreign))
                return 1;

        /*
         * Shadow stack accesses (PF_SHSTK=1) are only legal on a writable
         * shadow stack VMA; anything else is an error.
         */
        if (error_code & X86_PF_SHSTK) {
                if (likely((vma->vm_flags & VM_SHADOW_STACK) &&
                           (vma->vm_flags & VM_WRITE)))
                        return 0;
                return 1;
        }

        /*
         * Ordinary write (present or not): rejected on shadow stack VMAs
         * and on VMAs without VM_WRITE.
         */
        if (error_code & X86_PF_WRITE) {
                if (unlikely((vma->vm_flags & VM_SHADOW_STACK) ||
                             !(vma->vm_flags & VM_WRITE)))
                        return 1;
                return 0;
        }

        /* A read of a present page that still faulted is an error. */
        if (unlikely(error_code & X86_PF_PROT))
                return 1;

        /* A read of a non-present page is fine iff the VMA is accessible. */
        return !vma_is_accessible(vma);
}

/*
 * Tell whether @address belongs to the kernel part of the address space.
 *
 * Returns true for addresses at or above TASK_SIZE_MAX, except for the
 * 64-bit vsyscall page, which is reported as not-kernel.
 */
bool fault_in_kernel_space(unsigned long address)
{
        /*
         * On 64-bit the vsyscall page lives above TASK_SIZE_MAX but is not
         * considered part of the kernel address space.
         */
        const bool vsyscall = IS_ENABLED(CONFIG_X86_64) &&
                              is_vsyscall_vaddr(address);

        return !vsyscall && address >= TASK_SIZE_MAX;
}

/*
 * Called for all faults where 'address' is part of the kernel address
 * space.  Might get called for faults that originate from *code* that
 * ran in userspace or the kernel.
 */
static void
do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
                   unsigned long address)
{
        /*
         * Protection keys exceptions only happen on user pages.  We
         * have no user pages in the kernel portion of the address
         * space, so do not expect them here.
         */
        WARN_ON_ONCE(hw_error_code & X86_PF_PK);

#ifdef CONFIG_X86_32
        /*
         * We can fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * Before doing this on-demand faulting, ensure that the
         * fault is not any of the following:
         * 1. A fault on a PTE with a reserved bit set.
         * 2. A fault caused by a user-mode access.  (Do not demand-
         *    fault kernel memory due to user-mode accesses).
         * 3. A fault caused by a page-level protection violation.
         *    (A demand fault would be on a non-present page which
         *     would have X86_PF_PROT==0).
         *
         * This is only needed to close a race condition on x86-32 in
         * the vmalloc mapping/unmapping code. See the comment above
         * vmalloc_fault() for details. On x86-64 the race does not
         * exist as the vmalloc mappings don't need to be synchronized
         * there.
         */
        if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
                /* A non-negative result means there is nothing left to do. */
                if (vmalloc_fault(address) >= 0)
                        return;
        }
#endif

        /* Nothing more to do here when is_f00f_bug() claims the fault. */
        if (is_f00f_bug(regs, hw_error_code, address))
                return;

        /* Was the fault spurious, caused by lazy TLB invalidation? */
        if (spurious_kernel_fault(hw_error_code, address))
                return;

        /* kprobes don't want to hook the spurious faults: */
        if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
                return;

        /*
         * Note, despite being a "bad area", there are quite a few
         * acceptable reasons to get here, such as erratum fixups
         * and handling kernel code that can fault, like get_user().
         *
         * Don't take the mm semaphore here. If we fixup a prefetch
         * fault we could otherwise deadlock:
         */
        bad_area_nosemaphore(regs, hw_error_code, address);
}
NOKPROBE_SYMBOL(do_kern_addr_fault);

/*
 * Handle faults in the user portion of the address space.  Nothing in here
 * should check X86_PF_USER without a specific justification: for almost
 * all purposes, we should treat a normal kernel access to user memory
 * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction.
 * The one exception is AC flag handling, which is, per the x86
 * architecture, special for WRUSS.
 */
static inline
void do_user_addr_fault(struct pt_regs *regs,
                        unsigned long error_code,
                        unsigned long address)
{
        struct vm_area_struct *vma;
        struct task_struct *tsk;
        struct mm_struct *mm;
        vm_fault_t fault;
        /* Starting fault flags; extended below from error_code and regs. */
        unsigned int flags = FAULT_FLAG_DEFAULT;

        tsk = current;
        mm = tsk->mm;

        if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) {
                /*
                 * Whoops, this is kernel mode code trying to execute from
                 * user memory.  Unless this is AMD erratum #93, which
                 * corrupts RIP such that it looks like a user address,
                 * this is unrecoverable.  Don't even try to look up the
                 * VMA or look for extable entries.
                 */
                if (is_errata93(regs, address))
                        return;

                page_fault_oops(regs, error_code, address);
                return;
        }

        /* kprobes don't want to hook the spurious faults: */
        if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
                return;

        /*
         * Reserved bits are never expected to be set on
         * entries in the user portion of the page tables.
         */
        if (unlikely(error_code & X86_PF_RSVD))
                pgtable_bad(regs, error_code, address);

        /*
         * If SMAP is on, check for invalid kernel (supervisor) access to user
         * pages in the user address space.  The odd case here is WRUSS,
         * which, according to the preliminary documentation, does not respect
         * SMAP and will have the USER bit set so, in all cases, SMAP
         * enforcement appears to be consistent with the USER bit.
         */
        if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
                     !(error_code & X86_PF_USER) &&
                     !(regs->flags & X86_EFLAGS_AC))) {
                /*
                 * No extable entry here.  This was a kernel access to an
                 * invalid pointer.  get_kernel_nofault() will not get here.
                 */
                page_fault_oops(regs, error_code, address);
                return;
        }

        /*
         * If we're in an interrupt, have no user context or are running
         * in a region with pagefaults disabled then we must not take the fault
         */
        if (unlikely(faulthandler_disabled() || !mm)) {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /* Legacy check - remove this after verifying that it doesn't trigger */
        if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        local_irq_enable();

        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

        /*
         * Read-only permissions can not be expressed in shadow stack PTEs.
         * Treat all shadow stack accesses as WRITE faults. This ensures
         * that the MM will prepare everything (e.g., break COW) such that
         * maybe_mkwrite() can create a proper shadow stack PTE.
         */
        if (error_code & X86_PF_SHSTK)
                flags |= FAULT_FLAG_WRITE;
        if (error_code & X86_PF_WRITE)
                flags |= FAULT_FLAG_WRITE;
        if (error_code & X86_PF_INSTR)
                flags |= FAULT_FLAG_INSTRUCTION;

        /*
         * We set FAULT_FLAG_USER based on the register state, not
         * based on X86_PF_USER. User space accesses that cause
         * system page faults are still user accesses.
         */
        if (user_mode(regs))
                flags |= FAULT_FLAG_USER;

#ifdef CONFIG_X86_64
        /*
         * Faults in the vsyscall page might need emulation.  The
         * vsyscall page is at a high address (>PAGE_OFFSET), but is
         * considered to be part of the user address space.
         *
         * The vsyscall page does not have a "real" VMA, so do this
         * emulation before we go searching for VMAs.
         *
         * PKRU never rejects instruction fetches, so we don't need
         * to consider the PF_PK bit.
         */
        if (is_vsyscall_vaddr(address)) {
                if (emulate_vsyscall(error_code, regs, address))
                        return;
        }
#endif

        /*
         * The per-VMA lock attempt below is only made for faults that were
         * flagged FAULT_FLAG_USER above; all others go straight to the
         * mmap_lock path.
         */
        if (!(flags & FAULT_FLAG_USER))
                goto lock_mmap;

        /* No VMA from this lookup: fall back to the mmap_lock path. */
        vma = lock_vma_under_rcu(mm, address);
        if (!vma)
                goto lock_mmap;

        if (unlikely(access_error(error_code, vma))) {
                /*
                 * Passing a NULL mm makes __bad_area() end the VMA read
                 * side instead of unlocking mmap_lock.
                 */
                bad_area_access_error(regs, error_code, address, NULL, vma);
                count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
                return;
        }
        fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
        /*
         * The VMA read side is ended here only when the fault neither asks
         * for a retry nor reports completion (presumably it has already
         * been dropped by the mm core in those two cases - confirm).
         */
        if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
                vma_end_read(vma);

        if (!(fault & VM_FAULT_RETRY)) {
                count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
                goto done;
        }
        count_vm_vma_lock_event(VMA_LOCK_RETRY);
        if (fault & VM_FAULT_MAJOR)
                flags |= FAULT_FLAG_TRIED;

        /* Quick path to respond to signals */
        if (fault_signal_pending(fault, regs)) {
                if (!user_mode(regs))
                        kernelmode_fixup_or_oops(regs, error_code, address,
                                                 SIGBUS, BUS_ADRERR,
                                                 ARCH_DEFAULT_PKEY);
                return;
        }
lock_mmap:

retry:
        /* A NULL result is reported without unlocking anything here. */
        vma = lock_mm_and_find_vma(mm, address, regs);
        if (unlikely(!vma)) {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /*
         * Ok, we have a good vm_area for this memory access, so
         * we can handle it..
         */
        if (unlikely(access_error(error_code, vma))) {
                bad_area_access_error(regs, error_code, address, mm, vma);
                return;
        }

        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
         * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked.
         *
         * Note that handle_userfault() may also release and reacquire mmap_lock
         * (and not return with VM_FAULT_RETRY), when returning to userland to
         * repeat the page fault later with a VM_FAULT_NOPAGE retval
         * (potentially after handling any pending signal during the return to
         * userland). The return to userland is identified whenever
         * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
         */
        fault = handle_mm_fault(vma, address, flags, regs);

        if (fault_signal_pending(fault, regs)) {
                /*
                 * Quick path to respond to signals.  The core mm code
                 * has unlocked the mm for us if we get here.
                 */
                if (!user_mode(regs))
                        kernelmode_fixup_or_oops(regs, error_code, address,
                                                 SIGBUS, BUS_ADRERR,
                                                 ARCH_DEFAULT_PKEY);
                return;
        }

        /* The fault is fully completed (including releasing mmap lock) */
        if (fault & VM_FAULT_COMPLETED)
                return;

        /*
         * If we need to retry the mmap_lock has already been released,
         * and if there is a fatal signal pending there is no guarantee
         * that we made any progress. Handle this case first.
         */
        if (unlikely(fault & VM_FAULT_RETRY)) {
                flags |= FAULT_FLAG_TRIED;
                goto retry;
        }

        mmap_read_unlock(mm);
done:
        /*
         * Reached from the per-VMA path (goto done) and from the mmap_lock
         * path after mmap_read_unlock() just above.  Only VM_FAULT_ERROR
         * results need any further handling.
         */
        if (likely(!(fault & VM_FAULT_ERROR)))
                return;

        if (fatal_signal_pending(current) && !user_mode(regs)) {
                kernelmode_fixup_or_oops(regs, error_code, address,
                                         0, 0, ARCH_DEFAULT_PKEY);
                return;
        }

        if (fault & VM_FAULT_OOM) {
                /* Kernel mode? Handle exceptions or die: */
                if (!user_mode(regs)) {
                        kernelmode_fixup_or_oops(regs, error_code, address,
                                                 SIGSEGV, SEGV_MAPERR,
                                                 ARCH_DEFAULT_PKEY);
                        return;
                }

                /*
                 * We ran out of memory, call the OOM killer, and return the
                 * userspace (which will retry the fault, or kill us if we got
                 * oom-killed):
                 */
                pagefault_out_of_memory();
        } else {
                if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
                             VM_FAULT_HWPOISON_LARGE))
                        do_sigbus(regs, error_code, address, fault);
                else if (fault & VM_FAULT_SIGSEGV)
                        bad_area_nosemaphore(regs, error_code, address);
                else
                        /* An error bit this function has no handling for. */
                        BUG();
        }
}
NOKPROBE_SYMBOL(do_user_addr_fault);

/*
 * Emit the page-fault tracepoint matching the faulting context: the user
 * variant when @regs is from user mode, the kernel variant otherwise.
 * Does nothing unless trace_pagefault_enabled() says tracing is on.
 */
static __always_inline void
trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
                         unsigned long address)
{
        if (trace_pagefault_enabled()) {
                if (!user_mode(regs))
                        trace_page_fault_kernel(address, regs, error_code);
                else
                        trace_page_fault_user(address, regs, error_code);
        }
}

/*
 * Common page-fault dispatch: trace the fault, give kmmio a chance to
 * claim it, then route it to the kernel- or user-address handler based on
 * fault_in_kernel_space().
 */
static __always_inline void
handle_page_fault(struct pt_regs *regs, unsigned long error_code,
                              unsigned long address)
{
        trace_page_fault_entries(regs, error_code, address);

        if (unlikely(kmmio_fault(regs, address)))
                return;

        /* Fault on the kernel-controlled part of the address space? */
        if (unlikely(fault_in_kernel_space(address))) {
                do_kern_addr_fault(regs, error_code, address);
                return;
        }

        do_user_addr_fault(regs, error_code, address);

        /*
         * The user-address handler may have re-enabled interrupts.
         * Patching up every exit point of do_user_addr_fault() and its
         * leaf functions is not doable without creating an unholy mess or
         * turning the code upside down, so disable them once, here.
         */
        local_irq_disable();
}

/*
 * #PF exception entry point.
 *
 * Fetches the faulting address (from the FRED event data when
 * X86_FEATURE_FRED is enabled, from CR2 otherwise), lets KVM claim
 * async page fault events, and otherwise runs handle_page_fault()
 * between irqentry_enter() and irqentry_exit().
 */
DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
{
        irqentry_state_t state;
        unsigned long address;

        address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2();

        /* Prefetch hint only; issued before anything else is decided. */
        prefetchw(&current->mm->mmap_lock);

        /*
         * KVM uses #PF vector to deliver 'page not present' events to guests
         * (asynchronous page fault mechanism). The event happens when a
         * userspace task is trying to access some valid (from guest's point of
         * view) memory which is not currently mapped by the host (e.g. the
         * memory is swapped out). Note, the corresponding "page ready" event
         * which is injected when the memory becomes available, is delivered via
         * an interrupt mechanism and not a #PF exception
         * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
         *
         * We are relying on the interrupted context being sane (valid RSP,
         * relevant locks not held, etc.), which is fine as long as the
         * interrupted context had IF=1.  We are also relying on the KVM
         * async pf type field and CR2 being read consistently instead of
         * getting values from real and async page faults mixed up.
         *
         * Fingers crossed.
         *
         * The async #PF handling code takes care of idtentry handling
         * itself.
         */
        if (kvm_handle_async_pf(regs, (u32)address))
                return;

        /*
         * Entry handling for valid #PF from kernel mode is slightly
         * different: RCU is already watching and ct_irq_enter() must not
         * be invoked because a kernel fault on a user space address might
         * sleep.
         *
         * In case the fault hit a RCU idle region the conditional entry
         * code reenabled RCU to avoid subsequent wreckage which helps
         * debuggability.
         */
        state = irqentry_enter(regs);

        /* The actual fault handling runs inside the instrumentable region. */
        instrumentation_begin();
        handle_page_fault(regs, error_code, address);
        instrumentation_end();

        irqentry_exit(regs, state);
}














































    1 















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_KSM_H
#define __LINUX_KSM_H
/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork().
 */

#include <linux/bitops.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/sched.h>
#include <linux/sched/coredump.h>

#ifdef CONFIG_KSM
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                unsigned long end, int advice, unsigned long *vm_flags);

void ksm_add_vma(struct vm_area_struct *vma);
int ksm_enable_merge_any(struct mm_struct *mm);
int ksm_disable_merge_any(struct mm_struct *mm);
int ksm_disable(struct mm_struct *mm);

int __ksm_enter(struct mm_struct *mm);
void __ksm_exit(struct mm_struct *mm);
/*
 * To identify zeropages that were mapped by KSM, we reuse the dirty bit
 * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when
 * deduplicating memory.
 */
#define is_ksm_zero_pte(pte)        (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte))

extern atomic_long_t ksm_zero_pages;

/*
 * Increment both the global ksm_zero_pages counter and @mm's own
 * ksm_zero_pages counter.  Going by the name, this is meant to be called
 * when KSM maps a zero page into @mm - confirm at the call sites.
 */
static inline void ksm_map_zero_page(struct mm_struct *mm)
{
        atomic_long_inc(&ksm_zero_pages);
        atomic_long_inc(&mm->ksm_zero_pages);
}

/*
 * Undo the zero-page accounting for @pte if it is a KSM zero-page PTE
 * (see is_ksm_zero_pte()): decrements the global and the per-mm
 * ksm_zero_pages counters.  Any other PTE leaves both untouched.
 */
static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
{
        if (!is_ksm_zero_pte(pte))
                return;

        atomic_long_dec(&ksm_zero_pages);
        atomic_long_dec(&mm->ksm_zero_pages);
}

/* Return the current value of @mm's ksm_zero_pages counter. */
static inline long mm_ksm_zero_pages(struct mm_struct *mm)
{
        return atomic_long_read(&mm->ksm_zero_pages);
}

/*
 * Fork hook: when the parent mm (@oldmm) has MMF_VM_MERGEABLE set, enter
 * the new @mm into KSM and return __ksm_enter()'s result; otherwise
 * return 0 without doing anything.
 */
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
        if (!test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
                return 0;

        return __ksm_enter(mm);
}

/*
 * Exec hook: when @mm has MMF_VM_MERGE_ANY set, enter it into KSM and
 * return __ksm_enter()'s result; otherwise return 0.
 */
static inline int ksm_execve(struct mm_struct *mm)
{
        return test_bit(MMF_VM_MERGE_ANY, &mm->flags) ? __ksm_enter(mm) : 0;
}

/*
 * Exit hook: call __ksm_exit() for @mm, but only if the mm has
 * MMF_VM_MERGEABLE set.
 */
static inline void ksm_exit(struct mm_struct *mm)
{
        if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
                return;

        __ksm_exit(mm);
}

/*
 * When do_swap_page() first faults in from swap what used to be a KSM page,
 * no problem, it will be assigned to this vma's anon_vma; but thereafter,
 * it might be faulted into a different anon_vma (or perhaps to a different
 * offset in the same anon_vma).  do_swap_page() cannot do all the locking
 * needed to reconstitute a cross-anon_vma KSM page: for now it has to make
 * a copy, and leave remerging the pages to a later pass of ksmd.
 *
 * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
 * but what if the vma was unmerged while the page was swapped out?
 */
struct folio *ksm_might_need_to_copy(struct folio *folio,
                        struct vm_area_struct *vma, unsigned long addr);

void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc);
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
void collect_procs_ksm(struct folio *folio, struct page *page,
                struct list_head *to_kill, int force_early);
long ksm_process_profit(struct mm_struct *);

#else  /* !CONFIG_KSM */

/*
 * CONFIG_KSM is off: the helpers below are empty stand-ins for their
 * CONFIG_KSM counterparts.  The void ones do nothing and the int ones
 * always return 0.
 */

/* No-op without KSM. */
static inline void ksm_add_vma(struct vm_area_struct *vma)
{
}

/* Always returns 0 without KSM. */
static inline int ksm_disable(struct mm_struct *mm)
{
        return 0;
}

/* Always returns 0 without KSM; neither mm is touched. */
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
        return 0;
}

/* Always returns 0 without KSM. */
static inline int ksm_execve(struct mm_struct *mm)
{
        return 0;
}

/* No-op without KSM. */
static inline void ksm_exit(struct mm_struct *mm)
{
}

/* No-op without KSM: there is no zero-page accounting to undo. */
static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
{
}

/* No-op without KSM: nothing is added to @to_kill. */
static inline void collect_procs_ksm(struct folio *folio, struct page *page,
                                     struct list_head *to_kill, int force_early)
{
}

#ifdef CONFIG_MMU
/* Without KSM: always returns 0 and leaves *vm_flags untouched. */
static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                unsigned long end, int advice, unsigned long *vm_flags)
{
        return 0;
}

/* Without KSM: no copy is ever made; the caller's @folio is returned as is. */
static inline struct folio *ksm_might_need_to_copy(struct folio *folio,
                        struct vm_area_struct *vma, unsigned long addr)
{
        return folio;
}

/* No-op without KSM. */
static inline void rmap_walk_ksm(struct folio *folio,
                        struct rmap_walk_control *rwc)
{
}

/* No-op without KSM. */
static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old)
{
}
#endif /* CONFIG_MMU */
#endif /* !CONFIG_KSM */

#endif /* __LINUX_KSM_H */




































































    1 

































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#ifndef _LINUX_HASH_H
#define _LINUX_HASH_H
/* Fast hashing routine for ints,  longs and pointers.
   (C) 2002 Nadia Yvette Chambers, IBM */

#include <asm/types.h>
#include <linux/compiler.h>

/*
 * The "GOLDEN_RATIO_PRIME" is used in fs/btrfs/btrfs_inode.h and
 * fs/inode.c.  It's not actually prime any more (the previous primes
 * were actively bad for hashing), but the name remains.
 */
#if BITS_PER_LONG == 32
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32
#define hash_long(val, bits) hash_32(val, bits)
#elif BITS_PER_LONG == 64
#define hash_long(val, bits) hash_64(val, bits)
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64
#else
#error Wordsize not 32 or 64
#endif

/*
 * This hash multiplies the input by a large odd number and takes the
 * high bits.  Since multiplication propagates changes to the most
 * significant end only, it is essential that the high bits of the
 * product be used for the hash value.
 *
 * Chuck Lever verified the effectiveness of this technique:
 * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
 *
 * Although a random odd number will do, it turns out that the golden
 * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
 * properties.  (See Knuth vol 3, section 6.4, exercise 9.)
 *
 * These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2,
 * which is very slightly easier to multiply by and makes no
 * difference to the hash distribution.
 */
#define GOLDEN_RATIO_32 0x61C88647
#define GOLDEN_RATIO_64 0x61C8864680B583EBull

#ifdef CONFIG_HAVE_ARCH_HASH
/* This header may use the GOLDEN_RATIO_xx constants */
#include <asm/hash.h>
#endif

/*
 * The _generic versions exist only so lib/test_hash.c can compare
 * the arch-optimized versions with the generic.
 *
 * Note that if you change these, any <asm/hash.h> that aren't updated
 * to match need to have their HAVE_ARCH_* define values updated so the
 * self-test will not false-positive.
 */
#ifndef HAVE_ARCH__HASH_32
#define __hash_32 __hash_32_generic
#endif
/*
 * Generic 32-bit multiplicative hash step: multiply the input by
 * GOLDEN_RATIO_32.  The u32 result keeps only the low 32 bits of the
 * product.
 */
static inline u32 __hash_32_generic(u32 val)
{
        u32 product = val * GOLDEN_RATIO_32;

        return product;
}

/*
 * Reduce a 32-bit value to a @bits-wide hash by keeping the top @bits
 * bits of __hash_32(val).  The shift count is (32 - bits), so @bits == 0
 * would shift by the full type width, which C leaves undefined.
 */
static inline u32 hash_32(u32 val, unsigned int bits)
{
        unsigned int shift = 32 - bits;
        u32 mixed = __hash_32(val);

        /* The upper bits of the product are the better-mixed ones. */
        return mixed >> shift;
}

#ifndef HAVE_ARCH_HASH_64
#define hash_64 hash_64_generic
#endif
/*
 * Generic 64-bit hash: reduce @val to a @bits-wide value.  64-bit builds
 * multiply by GOLDEN_RATIO_64 and keep the top @bits bits; other builds
 * fold the two 32-bit halves together and defer to hash_32().
 */
static __always_inline u32 hash_64_generic(u64 val, unsigned int bits)
{
#if BITS_PER_LONG == 64
        /* A full 64x64-bit multiply is cheap on every 64-bit processor. */
        u64 product = val * GOLDEN_RATIO_64;

        return product >> (64 - bits);
#else
        /* Only 32x32-bit multiplies: mix the high word, XOR in the low. */
        u32 folded = (u32)val ^ __hash_32(val >> 32);

        return hash_32(folded, bits);
#endif
}

/* Hash a pointer to @bits bits by feeding its address to hash_long(). */
static inline u32 hash_ptr(const void *ptr, unsigned int bits)
{
        unsigned long addr = (unsigned long)ptr;

        return hash_long(addr, bits);
}

/* This really should be called fold32_ptr; it does no hashing to speak of. */
static inline u32 hash32_ptr(const void *ptr)
{
        unsigned long val = (unsigned long)ptr;

#if BITS_PER_LONG == 64
        val ^= (val >> 32);
#endif
        return (u32)val;
}

#endif /* _LINUX_HASH_H */




















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
 */
#ifndef __IPVLAN_H
#define __IPVLAN_H

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/if_arp.h>
#include <linux/if_link.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/inetdevice.h>
#include <linux/netfilter.h>
#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/route.h>
#include <net/addrconf.h>
#include <net/l3mdev.h>

#define IPVLAN_DRV        "ipvlan"
#define IPV_DRV_VER        "0.1"

#define IPVLAN_HASH_SIZE        (1 << BITS_PER_BYTE)
#define IPVLAN_HASH_MASK        (IPVLAN_HASH_SIZE - 1)

#define IPVLAN_MAC_FILTER_BITS        8
#define IPVLAN_MAC_FILTER_SIZE        (1 << IPVLAN_MAC_FILTER_BITS)
#define IPVLAN_MAC_FILTER_MASK        (IPVLAN_MAC_FILTER_SIZE - 1)

#define IPVLAN_QBACKLOG_LIMIT        1000

/* Header/address type tags; the numeric values are consecutive from 0. */
typedef enum {
        IPVL_IPV6   = 0,
        IPVL_ICMPV6 = 1,
        IPVL_IPV4   = 2,
        IPVL_ARP    = 3,
} ipvl_hdr_type;

/*
 * Traffic counters kept per CPU (struct ipvl_dev references this type
 * through a __percpu pointer).
 * NOTE(review): presumably syncp guards the u64_stats_t counters while
 * the two u32 error/drop counters are updated without it - confirm
 * against the code that updates them.
 */
struct ipvl_pcpu_stats {
        u64_stats_t                rx_pkts;
        u64_stats_t                rx_bytes;
        u64_stats_t                rx_mcast;
        u64_stats_t                tx_pkts;
        u64_stats_t                tx_bytes;
        struct u64_stats_sync        syncp;
        u32                        rx_errs;
        u32                        tx_drps;
};

struct ipvl_port;

/*
 * State of one ipvlan device.  It points at its own net_device, at the
 * ipvl_port it belongs to and at a second net_device (phy_dev), and it
 * carries an address list, per-CPU statistics and a MAC filter bitmap of
 * IPVLAN_MAC_FILTER_SIZE bits.
 * NOTE(review): presumably pnode links the device into ipvl_port.ipvlans
 * and addrs_lock protects the addrs list - confirm against the users.
 */
struct ipvl_dev {
        struct net_device        *dev;
        struct list_head        pnode;
        struct ipvl_port        *port;
        struct net_device        *phy_dev;
        struct list_head        addrs;
        struct ipvl_pcpu_stats        __percpu *pcpu_stats;
        DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE);
        netdev_features_t        sfeatures;
        u32                        msg_enable;
        spinlock_t                addrs_lock;
};

/*
 * One IPv4 or IPv6 address owned by an ipvlan device.  The address is
 * stored in the ipu union (accessible through the ip6addr/ip4addr
 * shorthands defined below) and atype holds its ipvl_hdr_type tag.
 * The entry is linked both into a hash table (hlnode) and into a
 * per-interface list (anode); rcu is an rcu_head embedded in the entry.
 */
struct ipvl_addr {
        struct ipvl_dev                *master; /* Back pointer to master */
        union {
                struct in6_addr        ip6;         /* IPv6 address on logical interface */
                struct in_addr        ip4;         /* IPv4 address on logical interface */
        } ipu;
#define ip6addr        ipu.ip6
#define ip4addr ipu.ip4
        struct hlist_node        hlnode;  /* Hash-table linkage */
        struct list_head        anode;   /* logical-interface linkage */
        ipvl_hdr_type                atype;
        struct rcu_head                rcu;
};

/*
 * Port state attached to a net_device: it holds a hash table with
 * IPVLAN_HASH_SIZE buckets (hlhead), a list head for ipvlan devices,
 * mode and flag words (the IPVLAN_F_PRIVATE / IPVLAN_F_VEPA helpers in
 * this header operate on flags), plus a work item and an skb backlog
 * queue.
 * NOTE(review): the port is presumably the object published through
 * net_device::rx_handler_data - see the ipvlan_port_get_*() helpers.
 */
struct ipvl_port {
        struct net_device        *dev;
        possible_net_t                pnet;
        struct hlist_head        hlhead[IPVLAN_HASH_SIZE];
        struct list_head        ipvlans;
        u16                        mode;
        u16                        flags;
        u16                        dev_id_start;
        struct work_struct        wq;
        struct sk_buff_head        backlog;
        int                        count;
        struct ida                ida;
        netdevice_tracker        dev_tracker;
};

/*
 * Per-skb private data.  IPVL_SKB_CB() reinterprets the start of the
 * skb's cb[] area as this structure.
 */
struct ipvl_skb_cb {
        bool tx_pkt;
};
#define IPVL_SKB_CB(_skb) ((struct ipvl_skb_cb *)&((_skb)->cb[0]))

/* Read the port stored in @d->rx_handler_data using rcu_dereference(). */
static inline struct ipvl_port *ipvlan_port_get_rcu(const struct net_device *d)
{
        struct ipvl_port *port = rcu_dereference(d->rx_handler_data);

        return port;
}

/* Read the port stored in @d->rx_handler_data using rcu_dereference_bh(). */
static inline struct ipvl_port *ipvlan_port_get_rcu_bh(const struct net_device *d)
{
        struct ipvl_port *port = rcu_dereference_bh(d->rx_handler_data);

        return port;
}

/* Read the port stored in @d->rx_handler_data using rtnl_dereference(). */
static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d)
{
        struct ipvl_port *port = rtnl_dereference(d->rx_handler_data);

        return port;
}

/* True when IPVLAN_F_PRIVATE is set in @port->flags. */
static inline bool ipvlan_is_private(const struct ipvl_port *port)
{
        return (port->flags & IPVLAN_F_PRIVATE) != 0;
}

/* Set IPVLAN_F_PRIVATE in @port->flags; other flag bits are untouched. */
static inline void ipvlan_mark_private(struct ipvl_port *port)
{
        port->flags = port->flags | IPVLAN_F_PRIVATE;
}

/* Clear IPVLAN_F_PRIVATE in @port->flags; other flag bits are untouched. */
static inline void ipvlan_clear_private(struct ipvl_port *port)
{
        port->flags = port->flags & ~IPVLAN_F_PRIVATE;
}

/* True when IPVLAN_F_VEPA is set in @port->flags. */
static inline bool ipvlan_is_vepa(const struct ipvl_port *port)
{
        return (port->flags & IPVLAN_F_VEPA) != 0;
}

/* Set IPVLAN_F_VEPA in @port->flags; other flag bits are untouched. */
static inline void ipvlan_mark_vepa(struct ipvl_port *port)
{
        port->flags = port->flags | IPVLAN_F_VEPA;
}

/* Clear IPVLAN_F_VEPA in @port->flags; other flag bits are untouched. */
static inline void ipvlan_clear_vepa(struct ipvl_port *port)
{
        port->flags = port->flags & ~IPVLAN_F_VEPA;
}

/*
 * Functions shared between the ipvlan source files.  They are only
 * declared in this header; the definitions live elsewhere.
 */
void ipvlan_init_secret(void);
unsigned int ipvlan_mac_hash(const unsigned char *addr);
rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb);
void ipvlan_process_multicast(struct work_struct *work);
int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev);
void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr);
struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan,
                                   const void *iaddr, bool is_v6);
bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6);
void ipvlan_ht_addr_del(struct ipvl_addr *addr);
struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h,
                                     int addr_type, bool use_dest);
void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type);
void ipvlan_count_rx(const struct ipvl_dev *ipvlan,
                     unsigned int len, bool success, bool mcast);
int ipvlan_link_new(struct net *src_net, struct net_device *dev,
                    struct nlattr *tb[], struct nlattr *data[],
                    struct netlink_ext_ack *extack);
void ipvlan_link_delete(struct net_device *dev, struct list_head *head);
void ipvlan_link_setup(struct net_device *dev);
int ipvlan_link_register(struct rtnl_link_ops *ops);
/*
 * L3S hooks.  With CONFIG_IPVLAN_L3S they are real functions defined
 * elsewhere; without it they collapse to the inline stubs below so that
 * callers need no #ifdefs of their own.
 */
#ifdef CONFIG_IPVLAN_L3S
int ipvlan_l3s_register(struct ipvl_port *port);
void ipvlan_l3s_unregister(struct ipvl_port *port);
void ipvlan_migrate_l3s_hook(struct net *oldnet, struct net *newnet);
int ipvlan_l3s_init(void);
void ipvlan_l3s_cleanup(void);
#else
/* Stub: registration always fails with -ENOTSUPP. */
static inline int ipvlan_l3s_register(struct ipvl_port *port)
{
        return -ENOTSUPP;
}

/* Stub: nothing to unregister. */
static inline void ipvlan_l3s_unregister(struct ipvl_port *port)
{
}

/* Stub: no hook to migrate between namespaces. */
static inline void ipvlan_migrate_l3s_hook(struct net *oldnet,
                                           struct net *newnet)
{
}

/* Stub: initialisation trivially succeeds (returns 0). */
static inline int ipvlan_l3s_init(void)
{
        return 0;
}

/* Stub: nothing to clean up. */
static inline void ipvlan_l3s_cleanup(void)
{
}
#endif /* CONFIG_IPVLAN_L3S */

/* True when @dev's rx_handler is ipvlan_handle_frame(). */
static inline bool netif_is_ipvlan_port(const struct net_device *dev)
{
        return ipvlan_handle_frame == rcu_access_pointer(dev->rx_handler);
}

#endif /* __IPVLAN_H */
























































    1 


    1 














    1 


    1 


    1 











    1 















    1 













    1 









    1 








    1 







    1 



    1 




    1 


    1 






















    1 




    1 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001 Intel Corp.
 *
 * This file is part of the SCTP kernel implementation
 *
 * This file contains sctp stream manipulation primitives and helpers.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    Xin Long <lucien.xin@gmail.com>
 */

#include <linux/list.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <net/sctp/stream_sched.h>

/* Drop every chunk on the association's out_chunk_list whose stream id
 * is @outcnt or higher.  Each such chunk is dequeued, reported as a
 * failed send with SCTP_ERROR_INV_STRM and then freed.
 */
static void sctp_stream_shrink_out(struct sctp_stream *stream, __u16 outcnt)
{
        struct sctp_association *asoc =
                container_of(stream, struct sctp_association, stream);
        struct sctp_outq *q = &asoc->outqueue;
        struct sctp_chunk *chunk, *next;

        list_for_each_entry_safe(chunk, next, &q->out_chunk_list, list) {
                __u16 sid = sctp_chunk_stream_no(chunk);

                if (sid >= outcnt) {
                        /* dequeue_done is not needed here: these chunks
                         * have not been scheduled yet.
                         */
                        sctp_sched_dequeue_common(q, chunk);

                        /* Report the chunk as a failed send. */
                        sctp_chunk_fail(chunk,
                                        (__force __u32)SCTP_ERROR_INV_STRM);
                        if (asoc->peer.prsctp_capable &&
                            SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
                                asoc->sent_cnt_removable--;

                        sctp_chunk_free(chunk);
                }
        }
}

/* Release the ext structure of outbound stream @sid, if it has one:
 * the scheduler's free_sid() hook runs first, then ext is freed and the
 * pointer is reset to NULL.
 */
static void sctp_stream_free_ext(struct sctp_stream *stream, __u16 sid)
{
        struct sctp_sched_ops *ops;

        if (SCTP_SO(stream, sid)->ext) {
                ops = sctp_sched_ops_from_stream(stream);
                ops->free_sid(stream, sid);

                kfree(SCTP_SO(stream, sid)->ext);
                SCTP_SO(stream, sid)->ext = NULL;
        }
}

/* Adapt the outbound stream state to a stream count of @outcnt.
 * When the count shrinks, chunks queued for streams at or above @outcnt
 * are removed.  When @new is given, the ext data of the streams below
 * @outcnt is handed over to @new (this is not done across
 * associations).  Finally the ext data of all streams from @outcnt
 * upwards is freed.
 */
static void sctp_stream_outq_migrate(struct sctp_stream *stream,
                                     struct sctp_stream *new, __u16 outcnt)
{
        int sid;

        if (outcnt < stream->outcnt)
                sctp_stream_shrink_out(stream, outcnt);

        if (new) {
                /* The old ext data is kept: move it into the new buffer
                 * now, sctp_stream_update will swap the ->out pointers
                 * afterwards.
                 */
                for (sid = 0; sid < outcnt; sid++) {
                        sctp_stream_free_ext(new, sid);
                        SCTP_SO(new, sid)->ext = SCTP_SO(stream, sid)->ext;
                        SCTP_SO(stream, sid)->ext = NULL;
                }
        }

        sid = outcnt;
        while (sid < stream->outcnt) {
                sctp_stream_free_ext(stream, sid);
                sid++;
        }
}

/* Set the outbound stream count to @outcnt.  Storage is preallocated
 * with genradix_prealloc() only when the count grows; on allocation
 * failure its error is returned and stream->outcnt is left unchanged.
 * Returns 0 on success.
 */
static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
                                 gfp_t gfp)
{
        if (outcnt > stream->outcnt) {
                int err = genradix_prealloc(&stream->out, outcnt, gfp);

                if (err)
                        return err;
        }

        stream->outcnt = outcnt;
        return 0;
}

/* Set the inbound stream count to @incnt.  Storage is preallocated
 * with genradix_prealloc() only when the count grows; on allocation
 * failure its error is returned and stream->incnt is left unchanged.
 * Returns 0 on success.
 */
static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt,
                                gfp_t gfp)
{
        if (incnt > stream->incnt) {
                int err = genradix_prealloc(&stream->in, incnt, gfp);

                if (err)
                        return err;
        }

        stream->incnt = incnt;
        return 0;
}

/* (Re)initialise @stream for @outcnt outbound and @incnt inbound
 * streams.  Allocations are made with __GFP_NOWARN added to @gfp.
 * Returns 0 on success or the error of the failing allocation helper.
 */
int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
                     gfp_t gfp)
{
        struct sctp_sched_ops *ops = sctp_sched_ops_from_stream(stream);
        int sid, err;

        gfp |= __GFP_NOWARN;

        /* The outbound side is only touched when its count changes.  The
         * initial stream->out size may be very big, so it is resized to
         * the new outcnt to save memory when needed.
         */
        if (outcnt != stream->outcnt) {
                /* Filter out chunks queued on streams that will no
                 * longer exist.
                 */
                ops->unsched_all(stream);
                sctp_stream_outq_migrate(stream, NULL, outcnt);
                ops->sched_all(stream);

                err = sctp_stream_alloc_out(stream, outcnt, gfp);
                if (err)
                        return err;

                for (sid = 0; sid < stream->outcnt; sid++)
                        SCTP_SO(stream, sid)->state = SCTP_STREAM_OPEN;
        }

        sctp_stream_interleave_init(stream);
        if (incnt == 0)
                return 0;

        return sctp_stream_alloc_in(stream, incnt, gfp);
}

/* Allocate a zeroed ext structure for outbound stream @sid and let the
 * scheduler initialise it through sctp_sched_init_sid().  Returns 0 on
 * success, -ENOMEM when the allocation fails, or the scheduler's error,
 * in which case ext is freed again and the pointer reset to NULL.
 */
int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid)
{
        struct sctp_stream_out_ext *ext = kzalloc(sizeof(*ext), GFP_KERNEL);
        int err;

        if (!ext)
                return -ENOMEM;
        SCTP_SO(stream, sid)->ext = ext;

        err = sctp_sched_init_sid(stream, sid, GFP_KERNEL);
        if (!err)
                return 0;

        /* Scheduler setup failed: undo the allocation. */
        kfree(SCTP_SO(stream, sid)->ext);
        SCTP_SO(stream, sid)->ext = NULL;
        return err;
}

/* Tear down @stream: unschedule everything, release the ext data of
 * every outbound stream and free both the out and in radix trees.
 */
void sctp_stream_free(struct sctp_stream *stream)
{
        struct sctp_sched_ops *ops = sctp_sched_ops_from_stream(stream);
        int sid = 0;

        ops->unsched_all(stream);

        while (sid < stream->outcnt) {
                sctp_stream_free_ext(stream, sid);
                sid++;
        }

        genradix_free(&stream->out);
        genradix_free(&stream->in);
}

/* Reset the message ids of all streams to zero: mid for every inbound
 * stream, mid and mid_uo for every outbound stream.
 */
void sctp_stream_clear(struct sctp_stream *stream)
{
        int sid;

        for (sid = 0; sid < stream->incnt; sid++)
                SCTP_SI(stream, sid)->mid = 0;

        for (sid = 0; sid < stream->outcnt; sid++) {
                SCTP_SO(stream, sid)->mid_uo = 0;
                SCTP_SO(stream, sid)->mid = 0;
        }
}

/* Replace the contents of @stream with those of @new.
 *
 * The ext data of the first new->outcnt outbound streams is carried
 * over into @new, the old trees of @stream are freed, and @stream then
 * takes over @new's out/in trees and counts.  @new is left empty (NULL
 * tree roots, zero counts) so that it no longer refers to the trees now
 * owned by @stream.
 */
void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new)
{
        struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);

        /* Unschedule first, then move the ext data and drop the old trees. */
        sched->unsched_all(stream);
        sctp_stream_outq_migrate(stream, new, new->outcnt);
        sctp_stream_free(stream);

        /* Take over new's trees and counts by plain structure copy. */
        stream->out = new->out;
        stream->in  = new->in;
        stream->outcnt = new->outcnt;
        stream->incnt  = new->incnt;

        /* Reschedule on top of the streams that were just installed. */
        sched->sched_all(stream);

        /* Detach the trees from new; they are referenced by stream now. */
        new->out.tree.root = NULL;
        new->in.tree.root  = NULL;
        new->outcnt = 0;
        new->incnt  = 0;
}

/* Hand @chunk to sctp_primitive_RECONF().  If that fails the chunk is
 * freed here; the primitive's return value is passed back unchanged.
 */
static int sctp_send_reconf(struct sctp_association *asoc,
                            struct sctp_chunk *chunk)
{
        int err = sctp_primitive_RECONF(asoc->base.net, asoc, chunk);

        if (err)
                sctp_chunk_free(chunk);

        return err;
}

/* Tell whether nothing is queued for the given outbound streams.
 * @str_list holds @str_nums stream ids in network byte order.  Returns
 * true when the association's out queue length is zero, or when none of
 * the listed streams has chunks on its ext->outq.  With an empty list
 * and a non-empty out queue the answer is false.
 */
static bool sctp_stream_outq_is_empty(struct sctp_stream *stream,
                                      __u16 str_nums, __be16 *str_list)
{
        struct sctp_association *asoc =
                container_of(stream, struct sctp_association, stream);
        __u16 idx;

        /* Nothing is queued on the association at all. */
        if (asoc->outqueue.out_qlen == 0)
                return true;

        /* No stream list to narrow the check down, and data is queued. */
        if (str_nums == 0)
                return false;

        for (idx = 0; idx < str_nums; idx++) {
                __u16 sid = ntohs(str_list[idx]);

                /* A stream without ext has no queue to inspect. */
                if (!SCTP_SO(stream, sid)->ext)
                        continue;
                if (!list_empty(&SCTP_SO(stream, sid)->ext->outq))
                        return false;
        }

        return true;
}

/* Build and send a RECONF request that resets streams in the outgoing
 * and/or incoming direction, as selected by params->srs_flags.  The
 * streams are those listed in params->srs_stream_list; an empty list
 * makes the outgoing handling below act on every outbound stream.
 *
 * Returns 0 on success, or:
 *   -ENOPROTOOPT  the peer is not reconf capable or stream reset
 *                 requests are not enabled on this association,
 *   -EINPROGRESS  another reset request is still outstanding,
 *   -EINVAL       no direction flag is set, a listed stream id is out
 *                 of range, or the parameters do not fit into a chunk,
 *   -ENOMEM       an allocation failed,
 *   -EAGAIN       an outgoing reset was requested while data is still
 *                 queued on the affected streams,
 *   or the error returned by sctp_send_reconf().
 */
int sctp_send_reset_streams(struct sctp_association *asoc,
                            struct sctp_reset_streams *params)
{
        struct sctp_stream *stream = &asoc->stream;
        __u16 i, str_nums, *str_list;
        struct sctp_chunk *chunk;
        int retval = -EINVAL;
        __be16 *nstr_list;
        bool out, in;

        if (!asoc->peer.reconf_capable ||
            !(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) {
                retval = -ENOPROTOOPT;
                goto out;
        }

        if (asoc->strreset_outstanding) {
                retval = -EINPROGRESS;
                goto out;
        }

        out = params->srs_flags & SCTP_STREAM_RESET_OUTGOING;
        in  = params->srs_flags & SCTP_STREAM_RESET_INCOMING;
        if (!out && !in)
                goto out;

        str_nums = params->srs_number_streams;
        str_list = params->srs_stream_list;
        if (str_nums) {
                int param_len = 0;

                /* Validate the ids against the relevant stream count and
                 * add up the size of the request parameter(s) to be built.
                 */
                if (out) {
                        for (i = 0; i < str_nums; i++)
                                if (str_list[i] >= stream->outcnt)
                                        goto out;

                        param_len = str_nums * sizeof(__u16) +
                                    sizeof(struct sctp_strreset_outreq);
                }

                if (in) {
                        for (i = 0; i < str_nums; i++)
                                if (str_list[i] >= stream->incnt)
                                        goto out;

                        param_len += str_nums * sizeof(__u16) +
                                     sizeof(struct sctp_strreset_inreq);
                }

                /* Everything has to fit into a single RECONF chunk. */
                if (param_len > SCTP_MAX_CHUNK_LEN -
                                sizeof(struct sctp_reconf_chunk))
                        goto out;
        }

        /* Temporary copy of the stream list in network byte order. */
        nstr_list = kcalloc(str_nums, sizeof(__be16), GFP_KERNEL);
        if (!nstr_list) {
                retval = -ENOMEM;
                goto out;
        }

        for (i = 0; i < str_nums; i++)
                nstr_list[i] = htons(str_list[i]);

        /* An outgoing reset is refused while data is still queued. */
        if (out && !sctp_stream_outq_is_empty(stream, str_nums, nstr_list)) {
                kfree(nstr_list);
                retval = -EAGAIN;
                goto out;
        }

        chunk = sctp_make_strreset_req(asoc, str_nums, nstr_list, out, in);

        kfree(nstr_list);

        if (!chunk) {
                retval = -ENOMEM;
                goto out;
        }

        /* Mark the affected outbound streams as closed before sending. */
        if (out) {
                if (str_nums)
                        for (i = 0; i < str_nums; i++)
                                SCTP_SO(stream, str_list[i])->state =
                                                       SCTP_STREAM_CLOSED;
                else
                        for (i = 0; i < stream->outcnt; i++)
                                SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED;
        }

        /* Keep a held reference to the request in the association. */
        asoc->strreset_chunk = chunk;
        sctp_chunk_hold(asoc->strreset_chunk);

        retval = sctp_send_reconf(asoc, chunk);
        if (retval) {
                /* Sending failed: drop the reference taken above and
                 * reopen the streams that were closed for this request.
                 */
                sctp_chunk_put(asoc->strreset_chunk);
                asoc->strreset_chunk = NULL;
                if (!out)
                        goto out;

                if (str_nums)
                        for (i = 0; i < str_nums; i++)
                                SCTP_SO(stream, str_list[i])->state =
                                                       SCTP_STREAM_OPEN;
                else
                        for (i = 0; i < stream->outcnt; i++)
                                SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;

                goto out;
        }

        /* One outstanding request per direction that was asked for. */
        asoc->strreset_outstanding = out + in;

out:
        return retval;
}

/* Send a request built by sctp_make_strreset_tsnreq() to the peer.
 *
 * Returns 0 on success, -ENOPROTOOPT when the peer is not reconf
 * capable or association reset requests are not enabled, -EINPROGRESS
 * when another request is outstanding, -EAGAIN while the out queue is
 * not empty, -ENOMEM when the chunk cannot be built, or the error
 * returned by sctp_send_reconf().
 */
int sctp_send_reset_assoc(struct sctp_association *asoc)
{
        struct sctp_stream *stream = &asoc->stream;
        struct sctp_chunk *req;
        __u16 sid;
        int err;

        if (!asoc->peer.reconf_capable)
                return -ENOPROTOOPT;
        if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ))
                return -ENOPROTOOPT;

        if (asoc->strreset_outstanding)
                return -EINPROGRESS;

        if (!sctp_outq_is_empty(&asoc->outqueue))
                return -EAGAIN;

        req = sctp_make_strreset_tsnreq(asoc);
        if (!req)
                return -ENOMEM;

        /* No further data may go out until this request has completed. */
        for (sid = 0; sid < stream->outcnt; sid++)
                SCTP_SO(stream, sid)->state = SCTP_STREAM_CLOSED;

        asoc->strreset_chunk = req;
        sctp_chunk_hold(asoc->strreset_chunk);

        err = sctp_send_reconf(asoc, req);
        if (!err) {
                asoc->strreset_outstanding = 1;
                return 0;
        }

        /* Sending failed: drop our reference and reopen the streams. */
        sctp_chunk_put(asoc->strreset_chunk);
        asoc->strreset_chunk = NULL;

        for (sid = 0; sid < stream->outcnt; sid++)
                SCTP_SO(stream, sid)->state = SCTP_STREAM_OPEN;

        return err;
}

/* Ask the peer to add params->sas_outstrms outbound and
 * params->sas_instrms inbound streams.
 *
 * Returns 0 on success, -ENOPROTOOPT when the peer is not reconf
 * capable or such requests are not enabled, -EINPROGRESS when another
 * request is outstanding, -EINVAL when nothing is to be added or a
 * resulting count would exceed SCTP_MAX_STREAM, -ENOMEM when the chunk
 * cannot be built, or the error of sctp_stream_alloc_out() /
 * sctp_send_reconf().
 *
 * NOTE(review): sctp_stream_alloc_out() already raises stream->outcnt
 * before the request is sent, and the failure paths below do not lower
 * it again - confirm that this is intended.
 */
int sctp_send_add_streams(struct sctp_association *asoc,
                          struct sctp_add_streams *params)
{
        struct sctp_stream *stream = &asoc->stream;
        struct sctp_chunk *req;
        __u32 new_out, new_in;
        __u16 out, in;
        int err;

        if (!asoc->peer.reconf_capable ||
            !(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ))
                return -ENOPROTOOPT;

        if (asoc->strreset_outstanding)
                return -EINPROGRESS;

        out = params->sas_outstrms;
        in  = params->sas_instrms;
        if (!out && !in)
                return -EINVAL;

        /* 32-bit sums, so that the limit check cannot wrap around. */
        new_out = stream->outcnt + out;
        new_in = stream->incnt + in;
        if (new_out > SCTP_MAX_STREAM || new_in > SCTP_MAX_STREAM)
                return -EINVAL;

        if (out) {
                err = sctp_stream_alloc_out(stream, new_out, GFP_KERNEL);
                if (err)
                        return err;
        }

        req = sctp_make_strreset_addstrm(asoc, out, in);
        if (!req)
                return -ENOMEM;

        asoc->strreset_chunk = req;
        sctp_chunk_hold(asoc->strreset_chunk);

        err = sctp_send_reconf(asoc, req);
        if (err) {
                sctp_chunk_put(asoc->strreset_chunk);
                asoc->strreset_chunk = NULL;
                return err;
        }

        /* One outstanding request per direction that adds streams. */
        asoc->strreset_outstanding = !!out + !!in;
        return 0;
}

/* Search the parameters of the association's pending strreset chunk.
 * A parameter matches when its request_seq equals @resp_seq and its
 * type equals @type; a zero @resp_seq or @type matches anything.
 * Returns the first matching parameter, or NULL when there is no
 * pending chunk or no parameter matches.
 */
static struct sctp_paramhdr *sctp_chunk_lookup_strreset_param(
                        struct sctp_association *asoc, __be32 resp_seq,
                        __be16 type)
{
        struct sctp_chunk *pending = asoc->strreset_chunk;
        struct sctp_reconf_chunk *reconf;
        union sctp_params param;

        if (!pending)
                return NULL;

        reconf = (struct sctp_reconf_chunk *)pending->chunk_hdr;
        sctp_walk_params(param, reconf) {
                /* Every stream reconf parameter starts with the layout
                 * of sctp_strreset_tsnreq, so request_seq can safely be
                 * read through it.
                 */
                struct sctp_strreset_tsnreq *req = param.v;

                if (resp_seq && req->request_seq != resp_seq)
                        continue;
                if (type && req->param_hdr.type != type)
                        continue;

                return param.v;
        }

        return NULL;
}

/* Record @result as the most recent strreset result: slot 0 takes the
 * new value and its previous content moves to slot 1.
 */
static void sctp_update_strreset_result(struct sctp_association *asoc,
                                        __u32 result)
{
        __u32 previous = asoc->strreset_result[0];

        asoc->strreset_result[1] = previous;
        asoc->strreset_result[0] = result;
}

/* Process a received outgoing-stream reset request parameter.
 *
 * @param points at the request.  On success the mid of the affected
 * inbound streams (all of them when the request lists none) is reset to
 * zero and *evp is set to the event created by
 * sctp_ulpevent_make_stream_reset_event().
 *
 * Returns the response chunk built by sctp_make_strreset_resp(), which
 * carries the result code and the request sequence number.
 */
struct sctp_chunk *sctp_process_strreset_outreq(
                                struct sctp_association *asoc,
                                union sctp_params param,
                                struct sctp_ulpevent **evp)
{
        struct sctp_strreset_outreq *outreq = param.v;
        struct sctp_stream *stream = &asoc->stream;
        __u32 result = SCTP_STRRESET_DENIED;
        __be16 *str_p = NULL;
        __u32 request_seq;
        __u16 i, nums;

        request_seq = ntohl(outreq->request_seq);

        /* The request names a TSN beyond the value reported by
         * sctp_tsnmap_get_ctsn(): answer "in progress" without touching
         * any state.
         */
        if (ntohl(outreq->send_reset_at_tsn) >
            sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map)) {
                result = SCTP_STRRESET_IN_PROGRESS;
                goto err;
        }

        /* Sequence check: a number ahead of strreset_inseq or more than
         * two behind it is rejected; one of the last two numbers gets
         * the result stored for it replayed (i is 0 or 1 here).
         */
        if (TSN_lt(asoc->strreset_inseq, request_seq) ||
            TSN_lt(request_seq, asoc->strreset_inseq - 2)) {
                result = SCTP_STRRESET_ERR_BAD_SEQNO;
                goto err;
        } else if (TSN_lt(request_seq, asoc->strreset_inseq)) {
                i = asoc->strreset_inseq - request_seq - 1;
                result = asoc->strreset_result[i];
                goto err;
        }
        asoc->strreset_inseq++;

        /* Check strreset_enable after inseq inc, as sender cannot tell
         * the peer doesn't enable strreset after receiving response with
         * result denied, as well as to keep consistent with bsd.
         */
        if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ))
                goto out;

        /* The stream ids follow the fixed part of the parameter. */
        nums = (ntohs(param.p->length) - sizeof(*outreq)) / sizeof(__u16);
        str_p = outreq->list_of_streams;
        for (i = 0; i < nums; i++) {
                if (ntohs(str_p[i]) >= stream->incnt) {
                        result = SCTP_STRRESET_ERR_WRONG_SSN;
                        goto out;
                }
        }

        /* A request of our own is pending: this one is only accepted if
         * its response_seq matches an incoming-reset request parameter
         * in our pending chunk, in which case that request is counted
         * as answered.
         */
        if (asoc->strreset_chunk) {
                if (!sctp_chunk_lookup_strreset_param(
                                asoc, outreq->response_seq,
                                SCTP_PARAM_RESET_IN_REQUEST)) {
                        /* same process with outstanding isn't 0 */
                        result = SCTP_STRRESET_ERR_IN_PROGRESS;
                        goto out;
                }

                asoc->strreset_outstanding--;
                asoc->strreset_outseq++;

                /* Nothing left outstanding: stop the reconf timer and
                 * release the pending chunk.
                 */
                if (!asoc->strreset_outstanding) {
                        struct sctp_transport *t;

                        t = asoc->strreset_chunk->transport;
                        if (del_timer(&t->reconf_timer))
                                sctp_transport_put(t);

                        sctp_chunk_put(asoc->strreset_chunk);
                        asoc->strreset_chunk = NULL;
                }
        }

        /* Reset the listed inbound streams, or all when none are listed. */
        if (nums)
                for (i = 0; i < nums; i++)
                        SCTP_SI(stream, ntohs(str_p[i]))->mid = 0;
        else
                for (i = 0; i < stream->incnt; i++)
                        SCTP_SI(stream, i)->mid = 0;

        result = SCTP_STRRESET_PERFORMED;

        *evp = sctp_ulpevent_make_stream_reset_event(asoc,
                SCTP_STREAM_RESET_INCOMING_SSN, nums, str_p, GFP_ATOMIC);

        /* "out" stores the result so that it can be replayed later;
         * "err" only sends the response.
         */
out:
        sctp_update_strreset_result(asoc, result);
err:
        return sctp_make_strreset_resp(asoc, result, request_seq);
}

/*
 * Process a received stream-reset "in" request (struct sctp_strreset_inreq).
 *
 * The request sequence number is compared with asoc->strreset_inseq first:
 * a number ahead of it, or more than two behind it, is answered with
 * SCTP_STRRESET_ERR_BAD_SEQNO; a number one or two behind it is answered
 * with the result saved in asoc->strreset_result[], except that a saved
 * SCTP_STRRESET_PERFORMED produces no reply at all (NULL is returned).
 *
 * For a new request every listed stream id must be below stream->outcnt and
 * sctp_stream_outq_is_empty() must hold for the listed streams.  On success
 * a request chunk is built with sctp_make_strreset_req(), the affected
 * outgoing streams are set to SCTP_STREAM_CLOSED and the chunk is stored,
 * with an extra reference, in asoc->strreset_chunk.
 *
 * Returns the freshly built request chunk on success; otherwise the value of
 * sctp_make_strreset_resp() for the computed result code.  @evp is not
 * written by this function.
 */
struct sctp_chunk *sctp_process_strreset_inreq(
                                struct sctp_association *asoc,
                                union sctp_params param,
                                struct sctp_ulpevent **evp)
{
        struct sctp_strreset_inreq *req = param.v;
        struct sctp_stream *strm = &asoc->stream;
        struct sctp_chunk *reply = NULL;
        __u32 status = SCTP_STRRESET_DENIED;
        __u32 seq = ntohl(req->request_seq);
        __u16 n, cnt;
        __be16 *sids;

        /* Sequence number outside the accepted window. */
        if (TSN_lt(asoc->strreset_inseq, seq) ||
            TSN_lt(seq, asoc->strreset_inseq - 2)) {
                status = SCTP_STRRESET_ERR_BAD_SEQNO;
                goto respond;
        }

        /* Repeat of one of the last two requests: replay the saved result. */
        if (TSN_lt(seq, asoc->strreset_inseq)) {
                n = asoc->strreset_inseq - seq - 1;
                status = asoc->strreset_result[n];
                if (status == SCTP_STRRESET_PERFORMED)
                        return NULL;
                goto respond;
        }

        asoc->strreset_inseq++;

        if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ))
                goto record;

        if (asoc->strreset_outstanding) {
                status = SCTP_STRRESET_ERR_IN_PROGRESS;
                goto record;
        }

        /* The stream id list fills the parameter after the fixed header. */
        sids = req->list_of_streams;
        cnt = (ntohs(param.p->length) - sizeof(*req)) / sizeof(__u16);
        for (n = 0; n < cnt; n++) {
                if (ntohs(sids[n]) < strm->outcnt)
                        continue;

                status = SCTP_STRRESET_ERR_WRONG_SSN;
                goto record;
        }

        if (!sctp_stream_outq_is_empty(strm, cnt, sids)) {
                /* Undo the increment above and skip recording a result;
                 * presumably so that a retransmission is handled as new.
                 */
                asoc->strreset_inseq--;
                status = SCTP_STRRESET_IN_PROGRESS;
                goto respond;
        }

        reply = sctp_make_strreset_req(asoc, cnt, sids, 1, 0);
        if (!reply)
                goto record;

        /* An empty list means every outgoing stream is affected. */
        if (cnt) {
                for (n = 0; n < cnt; n++)
                        SCTP_SO(strm, ntohs(sids[n]))->state =
                                               SCTP_STREAM_CLOSED;
        } else {
                for (n = 0; n < strm->outcnt; n++)
                        SCTP_SO(strm, n)->state = SCTP_STREAM_CLOSED;
        }

        asoc->strreset_outstanding = 1;
        asoc->strreset_chunk = reply;
        sctp_chunk_hold(reply);

        status = SCTP_STRRESET_PERFORMED;

record:
        sctp_update_strreset_result(asoc, status);
respond:
        if (reply)
                return reply;

        return sctp_make_strreset_resp(asoc, status, seq);
}

struct sctp_chunk *sctp_process_strreset_tsnreq(
                                struct sctp_association *asoc,
                                union sctp_params param,
                                struct sctp_ulpevent **evp)
{
        __u32 init_tsn = 0, next_tsn = 0, max_tsn_seen;
        struct sctp_strreset_tsnreq *tsnreq = param.v;
        struct sctp_stream *stream = &asoc->stream;
        __u32 result = SCTP_STRRESET_DENIED;
        __u32 request_seq;
        __u16 i;

        request_seq = ntohl(tsnreq->request_seq);
        if (TSN_lt(asoc->strreset_inseq, request_seq) ||
            TSN_lt(request_seq, asoc->strreset_inseq - 2)) {
                result = SCTP_STRRESET_ERR_BAD_SEQNO;
                goto err;
        } else if (TSN_lt(request_seq, asoc->strreset_inseq)) {
                i = asoc->strreset_inseq - request_seq - 1;
                result = asoc->strreset_result[i];
                if (result == SCTP_STRRESET_PERFORMED) {
                        next_tsn = asoc->ctsn_ack_point + 1;
                        init_tsn =
                                sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + 1;
                }
                goto err;
        }

        if (!sctp_outq_is_empty(&asoc->outqueue)) {
                result = SCTP_STRRESET_IN_PROGRESS;
                goto err;
        }

        asoc->strreset_inseq++;

        if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ))
                goto out;

        if (asoc->strreset_outstanding) {
                result = SCTP_STRRESET_ERR_IN_PROGRESS;
                goto out;
        }

        /* G4: The same processing as though a FWD-TSN chunk (as defined in
         *     [RFC3758]) with all streams affected and a new cumulative TSN
         *     ACK of the Receiver's Next TSN minus 1 were received MUST be
         *     performed.
         */
        max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map);
        asoc->stream.si->report_ftsn(&asoc->ulpq, max_tsn_seen);

        /* G1: Compute an appropriate value for the Receiver's Next TSN -- the
         *     TSN that the peer should use to send the next DATA chunk.  The
         *     value SHOULD be the smallest TSN not acknowledged by the
         *     receiver of the request plus 2^31.
         */
        init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1 << 31);
        sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL,
                         init_tsn, GFP_ATOMIC);

        /* G3: The same processing as though a SACK chunk with no gap report
         *     and a cumulative TSN ACK of the Sender's Next TSN minus 1 were
         *     received MUST be performed.
         */
        sctp_outq_free(&asoc->outqueue);

        /* G2: Compute an appropriate value for the local endpoint's next TSN,
         *     i.e., the next TSN assigned by the receiver of the SSN/TSN reset
         *     chunk.  The value SHOULD be the highest TSN sent by the receiver
         *     of the request plus 1.
         */
        next_tsn = asoc->next_tsn;
        asoc->ctsn_ack_point = next_tsn - 1;
        asoc->adv_peer_ack_point = asoc->ctsn_ack_point;

        /* G5:  The next expected and outgoing SSNs MUST be reset to 0 for all
         *      incoming and outgoing streams.
         */
        for (i = 0; i < stream->outcnt; i++) {
                SCTP_SO(stream, i)->mid = 0;
                SCTP_SO(stream, i)->mid_uo = 0;
        }
        for (i = 0; i < stream->incnt; i++)
                SCTP_SI(stream, i)->mid = 0;

        result = SCTP_STRRESET_PERFORMED;

        *evp = sctp_ulpevent_make_assoc_reset_event(asoc, 0, init_tsn,
                                                    next_tsn, GFP_ATOMIC);

out:
        sctp_update_strreset_result(asoc, result);
err:
        return sctp_make_strreset_tsnresp(asoc, result, request_seq,
                                          next_tsn, init_tsn);
}

/*
 * Process a received "add outgoing streams" request
 * (struct sctp_strreset_addstrm), which grows this side's incoming streams.
 *
 * The request sequence number is compared with asoc->strreset_inseq: a
 * number ahead of it, or more than two behind it, yields
 * SCTP_STRRESET_ERR_BAD_SEQNO; a number one or two behind it replays the
 * result saved in asoc->strreset_result[].
 *
 * A new request is denied when the feature bit is off, when the requested
 * count is zero, when the new total would exceed SCTP_MAX_STREAM, or when
 * sctp_stream_alloc_in() fails.  If a request of our own is still recorded
 * in asoc->strreset_chunk, it must contain an
 * SCTP_PARAM_RESET_ADD_IN_STREAMS parameter; that parameter is then
 * accounted as answered.  On success stream->incnt is raised and a stream
 * change event is stored in *@evp.
 *
 * Returns the value of sctp_make_strreset_resp() for the computed result.
 */
struct sctp_chunk *sctp_process_strreset_addstrm_out(
                                struct sctp_association *asoc,
                                union sctp_params param,
                                struct sctp_ulpevent **evp)
{
        struct sctp_strreset_addstrm *req = param.v;
        struct sctp_stream *strm = &asoc->stream;
        __u32 status = SCTP_STRRESET_DENIED;
        __u32 seq = ntohl(req->request_seq);
        __u32 new_incnt;
        __u16 added, slot;

        /* Sequence number outside the accepted window. */
        if (TSN_lt(asoc->strreset_inseq, seq) ||
            TSN_lt(seq, asoc->strreset_inseq - 2)) {
                status = SCTP_STRRESET_ERR_BAD_SEQNO;
                goto respond;
        }

        /* Repeat of one of the last two requests: replay the saved result. */
        if (TSN_lt(seq, asoc->strreset_inseq)) {
                slot = asoc->strreset_inseq - seq - 1;
                status = asoc->strreset_result[slot];
                goto respond;
        }

        asoc->strreset_inseq++;

        if (!(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ))
                goto record;

        added = ntohs(req->number_of_streams);
        if (!added)
                goto record;

        new_incnt = strm->incnt + added;
        if (new_incnt > SCTP_MAX_STREAM)
                goto record;

        if (sctp_stream_alloc_in(strm, new_incnt, GFP_ATOMIC))
                goto record;

        if (asoc->strreset_chunk) {
                struct sctp_transport *t;

                if (!sctp_chunk_lookup_strreset_param(
                        asoc, 0, SCTP_PARAM_RESET_ADD_IN_STREAMS)) {
                        /* same process with outstanding isn't 0 */
                        status = SCTP_STRRESET_ERR_IN_PROGRESS;
                        goto record;
                }

                asoc->strreset_outseq++;
                asoc->strreset_outstanding--;

                /* Nothing left pending: retire the stored request chunk. */
                if (asoc->strreset_outstanding == 0) {
                        t = asoc->strreset_chunk->transport;
                        if (del_timer(&t->reconf_timer))
                                sctp_transport_put(t);

                        sctp_chunk_put(asoc->strreset_chunk);
                        asoc->strreset_chunk = NULL;
                }
        }

        strm->incnt = new_incnt;

        status = SCTP_STRRESET_PERFORMED;

        *evp = sctp_ulpevent_make_stream_change_event(asoc,
                0, added, 0, GFP_ATOMIC);

record:
        sctp_update_strreset_result(asoc, status);
respond:
        return sctp_make_strreset_resp(asoc, status, seq);
}

/*
 * Process a received "add incoming streams" request
 * (struct sctp_strreset_addstrm), which asks this side to add outgoing
 * streams.
 *
 * The request sequence number is compared with asoc->strreset_inseq: a
 * number ahead of it, or more than two behind it, yields
 * SCTP_STRRESET_ERR_BAD_SEQNO; a number one or two behind it replays the
 * result saved in asoc->strreset_result[], except that a saved
 * SCTP_STRRESET_PERFORMED produces no reply at all (NULL is returned).
 *
 * A new request is denied when the feature bit is off, when the requested
 * count is zero, when the new total would exceed SCTP_MAX_STREAM, or when
 * allocation of the streams or of the outgoing chunk fails.  On success an
 * add-streams chunk built by sctp_make_strreset_addstrm() is stored, with an
 * extra reference, in asoc->strreset_chunk and stream->outcnt is raised.
 *
 * Returns that chunk on success; otherwise the value of
 * sctp_make_strreset_resp() for the computed result code.  @evp is not
 * written by this function.
 */
struct sctp_chunk *sctp_process_strreset_addstrm_in(
                                struct sctp_association *asoc,
                                union sctp_params param,
                                struct sctp_ulpevent **evp)
{
        struct sctp_strreset_addstrm *req = param.v;
        struct sctp_stream *strm = &asoc->stream;
        struct sctp_chunk *reply = NULL;
        __u32 status = SCTP_STRRESET_DENIED;
        __u32 seq = ntohl(req->request_seq);
        __u32 new_outcnt;
        __u16 added, slot;

        /* Sequence number outside the accepted window. */
        if (TSN_lt(asoc->strreset_inseq, seq) ||
            TSN_lt(seq, asoc->strreset_inseq - 2)) {
                status = SCTP_STRRESET_ERR_BAD_SEQNO;
                goto respond;
        }

        /* Repeat of one of the last two requests: replay the saved result. */
        if (TSN_lt(seq, asoc->strreset_inseq)) {
                slot = asoc->strreset_inseq - seq - 1;
                status = asoc->strreset_result[slot];
                if (status == SCTP_STRRESET_PERFORMED)
                        return NULL;
                goto respond;
        }

        asoc->strreset_inseq++;

        if (!(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ))
                goto record;

        if (asoc->strreset_outstanding) {
                status = SCTP_STRRESET_ERR_IN_PROGRESS;
                goto record;
        }

        added = ntohs(req->number_of_streams);
        if (!added)
                goto record;

        new_outcnt = strm->outcnt + added;
        if (new_outcnt > SCTP_MAX_STREAM)
                goto record;

        if (sctp_stream_alloc_out(strm, new_outcnt, GFP_ATOMIC))
                goto record;

        reply = sctp_make_strreset_addstrm(asoc, added, 0);
        if (!reply)
                goto record;

        /* Keep the outgoing request around until the peer answers it. */
        asoc->strreset_chunk = reply;
        asoc->strreset_outstanding = 1;
        sctp_chunk_hold(reply);

        strm->outcnt = new_outcnt;

        status = SCTP_STRRESET_PERFORMED;

record:
        sctp_update_strreset_result(asoc, status);
respond:
        if (reply)
                return reply;

        return sctp_make_strreset_resp(asoc, status, seq);
}

/*
 * Process a received re-configuration response parameter
 * (struct sctp_strreset_resp).
 *
 * The response is matched against the locally stored request via
 * sctp_chunk_lookup_strreset_param() using resp->response_seq; if no
 * matching request parameter is found the response is ignored.  Depending on
 * the type of the matched request and on the result code, stream counters
 * and states are updated and a notification event is stored in *@evp.
 * Finally the request is accounted as answered and, once nothing is
 * outstanding, the stored request chunk is released.
 *
 * Every path of this function returns NULL.
 */
struct sctp_chunk *sctp_process_strreset_resp(
                                struct sctp_association *asoc,
                                union sctp_params param,
                                struct sctp_ulpevent **evp)
{
        struct sctp_stream *stream = &asoc->stream;
        struct sctp_strreset_resp *resp = param.v;
        struct sctp_transport *t;
        __u16 i, nums, flags = 0;
        struct sctp_paramhdr *req;
        __u32 result;

        /* Find the request parameter this response refers to. */
        req = sctp_chunk_lookup_strreset_param(asoc, resp->response_seq, 0);
        if (!req)
                return NULL;

        /* Translate a non-success result into the event flag reported
         * to the upper layer.
         */
        result = ntohl(resp->result);
        if (result != SCTP_STRRESET_PERFORMED) {
                /* if in progress, do nothing but retransmit */
                if (result == SCTP_STRRESET_IN_PROGRESS)
                        return NULL;
                else if (result == SCTP_STRRESET_DENIED)
                        flags = SCTP_STREAM_RESET_DENIED;
                else
                        flags = SCTP_STREAM_RESET_FAILED;
        }

        if (req->type == SCTP_PARAM_RESET_OUT_REQUEST) {
                struct sctp_strreset_outreq *outreq;
                __be16 *str_p;

                /* The stream id list fills the request after its header. */
                outreq = (struct sctp_strreset_outreq *)req;
                str_p = outreq->list_of_streams;
                nums = (ntohs(outreq->param_hdr.length) - sizeof(*outreq)) /
                       sizeof(__u16);

                /* On success clear mid/mid_uo of the listed outgoing
                 * streams, or of all of them when the list is empty.
                 */
                if (result == SCTP_STRRESET_PERFORMED) {
                        struct sctp_stream_out *sout;
                        if (nums) {
                                for (i = 0; i < nums; i++) {
                                        sout = SCTP_SO(stream, ntohs(str_p[i]));
                                        sout->mid = 0;
                                        sout->mid_uo = 0;
                                }
                        } else {
                                for (i = 0; i < stream->outcnt; i++) {
                                        sout = SCTP_SO(stream, i);
                                        sout->mid = 0;
                                        sout->mid_uo = 0;
                                }
                        }
                }

                flags |= SCTP_STREAM_RESET_OUTGOING_SSN;

                /* Regardless of the result, every outgoing stream is put
                 * back into the open state.
                 */
                for (i = 0; i < stream->outcnt; i++)
                        SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;

                *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags,
                        nums, str_p, GFP_ATOMIC);
        } else if (req->type == SCTP_PARAM_RESET_IN_REQUEST) {
                struct sctp_strreset_inreq *inreq;
                __be16 *str_p;

                /* if the result is performed, it's impossible for inreq */
                if (result == SCTP_STRRESET_PERFORMED)
                        return NULL;

                inreq = (struct sctp_strreset_inreq *)req;
                str_p = inreq->list_of_streams;
                nums = (ntohs(inreq->param_hdr.length) - sizeof(*inreq)) /
                       sizeof(__u16);

                flags |= SCTP_STREAM_RESET_INCOMING_SSN;

                /* Only a denied/failed outcome reaches this point. */
                *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags,
                        nums, str_p, GFP_ATOMIC);
        } else if (req->type == SCTP_PARAM_RESET_TSN_REQUEST) {
                struct sctp_strreset_resptsn *resptsn;
                __u32 stsn, rtsn;

                /* check for resptsn, as sctp_verify_reconf didn't do it*/
                if (ntohs(param.p->length) != sizeof(*resptsn))
                        return NULL;

                resptsn = (struct sctp_strreset_resptsn *)resp;
                stsn = ntohl(resptsn->senders_next_tsn);
                rtsn = ntohl(resptsn->receivers_next_tsn);

                if (result == SCTP_STRRESET_PERFORMED) {
                        __u32 mtsn = sctp_tsnmap_get_max_tsn_seen(
                                                &asoc->peer.tsn_map);
                        LIST_HEAD(temp);

                        asoc->stream.si->report_ftsn(&asoc->ulpq, mtsn);

                        /* Restart the peer TSN map at the peer's next TSN. */
                        sctp_tsnmap_init(&asoc->peer.tsn_map,
                                         SCTP_TSN_MAP_INITIAL,
                                         stsn, GFP_ATOMIC);

                        /* Clean up sacked and abandoned queues only. As the
                         * out_chunk_list may not be empty, splice it to temp,
                         * then get it back after sctp_outq_free is done.
                         */
                        list_splice_init(&asoc->outqueue.out_chunk_list, &temp);
                        sctp_outq_free(&asoc->outqueue);
                        list_splice_init(&temp, &asoc->outqueue.out_chunk_list);

                        /* Continue sending from the TSN the peer expects. */
                        asoc->next_tsn = rtsn;
                        asoc->ctsn_ack_point = asoc->next_tsn - 1;
                        asoc->adv_peer_ack_point = asoc->ctsn_ack_point;

                        /* Clear the mid counters of all streams. */
                        for (i = 0; i < stream->outcnt; i++) {
                                SCTP_SO(stream, i)->mid = 0;
                                SCTP_SO(stream, i)->mid_uo = 0;
                        }
                        for (i = 0; i < stream->incnt; i++)
                                SCTP_SI(stream, i)->mid = 0;
                }

                /* Regardless of the result, every outgoing stream is put
                 * back into the open state.
                 */
                for (i = 0; i < stream->outcnt; i++)
                        SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;

                *evp = sctp_ulpevent_make_assoc_reset_event(asoc, flags,
                        stsn, rtsn, GFP_ATOMIC);
        } else if (req->type == SCTP_PARAM_RESET_ADD_OUT_STREAMS) {
                struct sctp_strreset_addstrm *addstrm;
                __u16 number;

                /* number is the outgoing stream count without the streams
                 * that our request asked to add.
                 */
                addstrm = (struct sctp_strreset_addstrm *)req;
                nums = ntohs(addstrm->number_of_streams);
                number = stream->outcnt - nums;

                if (result == SCTP_STRRESET_PERFORMED) {
                        /* Accepted: open the streams at index number and up. */
                        for (i = number; i < stream->outcnt; i++)
                                SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
                } else {
                        /* Not accepted: drop them again. */
                        sctp_stream_shrink_out(stream, number);
                        stream->outcnt = number;
                }

                *evp = sctp_ulpevent_make_stream_change_event(asoc, flags,
                        0, nums, GFP_ATOMIC);
        } else if (req->type == SCTP_PARAM_RESET_ADD_IN_STREAMS) {
                struct sctp_strreset_addstrm *addstrm;

                /* if the result is performed, it's impossible for addstrm in
                 * request.
                 */
                if (result == SCTP_STRRESET_PERFORMED)
                        return NULL;

                addstrm = (struct sctp_strreset_addstrm *)req;
                nums = ntohs(addstrm->number_of_streams);

                *evp = sctp_ulpevent_make_stream_change_event(asoc, flags,
                        nums, 0, GFP_ATOMIC);
        }

        /* This request parameter has now been answered. */
        asoc->strreset_outstanding--;
        asoc->strreset_outseq++;

        /* remove everything for this reconf request */
        if (!asoc->strreset_outstanding) {
                t = asoc->strreset_chunk->transport;
                /* NOTE(review): the transport reference dropped here is
                 * presumably the one taken when reconf_timer was armed.
                 */
                if (del_timer(&t->reconf_timer))
                        sctp_transport_put(t);

                sctp_chunk_put(asoc->strreset_chunk);
                asoc->strreset_chunk = NULL;
        }

        return NULL;
}






    1 





    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __LICENSE_H
#define __LICENSE_H

/*
 * Report whether @license is one of the licence identifier strings treated
 * as GPL compatible.  The comparison is an exact, case-sensitive string
 * match.  Returns 1 on a match and 0 otherwise.
 */
static inline int license_is_gpl_compatible(const char *license)
{
        static const char * const gpl_idents[] = {
                "GPL",
                "GPL v2",
                "GPL and additional rights",
                "Dual BSD/GPL",
                "Dual MIT/GPL",
                "Dual MPL/GPL",
        };
        unsigned int k;

        for (k = 0; k < sizeof(gpl_idents) / sizeof(gpl_idents[0]); k++) {
                if (strcmp(license, gpl_idents[k]) == 0)
                        return 1;
        }

        return 0;
}

#endif
























































































    1 


    1 












































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
// SPDX-License-Identifier: GPL-2.0-only
/*
 * lib/hexdump.c
 */

#include <linux/types.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/minmax.h>
#include <linux/export.h>
#include <asm/unaligned.h>

/* The sixteen hexadecimal digit characters, indexed by value: lower case. */
const char hex_asc[] = "0123456789abcdef";
EXPORT_SYMBOL(hex_asc);
/* Same table with upper-case letters. */
const char hex_asc_upper[] = "0123456789ABCDEF";
EXPORT_SYMBOL(hex_asc_upper);

/**
 * hex_to_bin - convert a hex digit to its real value
 * @ch: ASCII character that represents a hex digit
 *
 * hex_to_bin() converts one hex digit to its actual value or -1 in case of bad
 * input.
 *
 * This function is used to load cryptographic keys, so it is coded in such a
 * way that there are no conditions or memory accesses that depend on data.
 *
 * Explanation of the logic:
 * (ch - '9' - 1) is negative if ch <= '9'
 * ('0' - 1 - ch) is negative if ch >= '0'
 * we "and" these two values, so the result is negative if ch is in the range
 *        '0' ... '9'
 * we are only interested in the sign, so we do a shift ">> 8"; note that right
 *        shift of a negative value is implementation-defined, so we cast the
 *        value to (unsigned) before the shift --- we have 0xffffff if ch is in
 *        the range '0' ... '9', 0 otherwise
 * we "and" this value with (ch - '0' + 1) --- we have a value 1 ... 10 if ch is
 *        in the range '0' ... '9', 0 otherwise
 * we add this value to -1 --- we have a value 0 ... 9 if ch is in the range '0'
 *        ... '9', -1 otherwise
 * the next line is similar to the previous one, but we need to decode both
 *        uppercase and lowercase letters, so we use (ch & 0xdf), which converts
 *        lowercase to uppercase
 */
int hex_to_bin(unsigned char ch)
{
        /* Clearing bit 5 maps 'a'..'f' onto 'A'..'F'. */
        unsigned char cu = ch & 0xdf;
        /* Each mask is 0xffffff when its range test holds, 0 otherwise. */
        unsigned int digit_mask =
                (unsigned)((ch - '9' - 1) & ('0' - 1 - ch)) >> 8;
        unsigned int alpha_mask =
                (unsigned)((cu - 'F' - 1) & ('A' - 1 - cu)) >> 8;

        /* At most one term is non-zero; no data-dependent branch is used. */
        return -1 +
                ((ch - '0' +  1) & digit_mask) +
                ((cu - 'A' + 11) & alpha_mask);
}
EXPORT_SYMBOL(hex_to_bin);

/**
 * hex2bin - convert an ascii hexadecimal string to its binary representation
 * @dst: binary result
 * @src: ascii hexadecimal string
 * @count: result length
 *
 * Return 0 on success, -EINVAL in case of bad input.
 */
int hex2bin(u8 *dst, const char *src, size_t count)
{
        size_t n;

        /* Every output byte consumes two consecutive input characters. */
        for (n = 0; n < count; n++) {
                int lo;
                int hi = hex_to_bin(src[2 * n]);

                /* Stop at the first bad digit without reading any further. */
                if (unlikely(hi < 0))
                        return -EINVAL;

                lo = hex_to_bin(src[2 * n + 1]);
                if (unlikely(lo < 0))
                        return -EINVAL;

                dst[n] = (hi << 4) | lo;
        }

        return 0;
}
EXPORT_SYMBOL(hex2bin);

/**
 * bin2hex - convert binary data to an ascii hexadecimal string
 * @dst: ascii hexadecimal result
 * @src: binary data
 * @count: binary data length
 */
char *bin2hex(char *dst, const void *src, size_t count)
{
        const unsigned char *bytes = src;
        size_t n;

        /* hex_byte_pack() hands back the write position for the next byte. */
        for (n = 0; n < count; n++)
                dst = hex_byte_pack(dst, bytes[n]);

        /* The value returned is the position after the last packed byte. */
        return dst;
}
EXPORT_SYMBOL(bin2hex);

/**
 * hex_dump_to_buffer - convert a blob of data to "hex ASCII" in memory
 * @buf: data blob to dump
 * @len: number of bytes in the @buf
 * @rowsize: number of bytes to print per line; must be 16 or 32
 * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1)
 * @linebuf: where to put the converted data
 * @linebuflen: total size of @linebuf, including space for terminating NUL
 * @ascii: include ASCII after the hex output
 *
 * hex_dump_to_buffer() works on one "line" of output at a time, i.e.,
 * 16 or 32 bytes of input data converted to hex + ASCII output.
 *
 * Given a buffer of u8 data, hex_dump_to_buffer() converts the input data
 * to a hex + ASCII dump at the supplied memory location.
 * The converted output is always NUL-terminated.
 *
 * E.g.:
 *   hex_dump_to_buffer(frame->data, frame->len, 16, 1,
 *                        linebuf, sizeof(linebuf), true);
 *
 * example output buffer:
 * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f  @ABCDEFGHIJKLMNO
 *
 * Return:
 * The number of bytes placed in the buffer, not counting the terminating NUL. If the
 * output was truncated, then the return value is the number of bytes
 * (excluding the terminating NUL) which would have been written to the final
 * string if enough space had been available.
 */
int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize,
                       char *linebuf, size_t linebuflen, bool ascii)
{
        const u8 *ptr = buf;
        int ngroups;
        u8 ch;
        int j, lx = 0;        /* lx: next write position in linebuf */
        int ascii_column;
        int ret;

        /* Any row size other than 16 or 32 falls back to 16. */
        if (rowsize != 16 && rowsize != 32)
                rowsize = 16;

        if (len > rowsize)                /* limit to one line at a time */
                len = rowsize;
        /* Unsupported group sizes fall back to single bytes. */
        if (!is_power_of_2(groupsize) || groupsize > 8)
                groupsize = 1;
        if ((len % groupsize) != 0)        /* no mixed size output */
                groupsize = 1;

        ngroups = len / groupsize;
        /* Column at which the ASCII part starts; the hex part is padded
         * with spaces up to here.
         */
        ascii_column = rowsize * 2 + rowsize / groupsize + 1;

        /* No room at all: write nothing, only report the needed length. */
        if (!linebuflen)
                goto overflow1;

        /* Nothing to dump: produce an empty string. */
        if (!len)
                goto nil;

        if (groupsize == 8) {
                const u64 *ptr8 = buf;

                /* Groups are separated by one space; a result that does not
                 * fit in the remaining space counts as truncation.
                 */
                for (j = 0; j < ngroups; j++) {
                        ret = snprintf(linebuf + lx, linebuflen - lx,
                                       "%s%16.16llx", j ? " " : "",
                                       get_unaligned(ptr8 + j));
                        if (ret >= linebuflen - lx)
                                goto overflow1;
                        lx += ret;
                }
        } else if (groupsize == 4) {
                const u32 *ptr4 = buf;

                for (j = 0; j < ngroups; j++) {
                        ret = snprintf(linebuf + lx, linebuflen - lx,
                                       "%s%8.8x", j ? " " : "",
                                       get_unaligned(ptr4 + j));
                        if (ret >= linebuflen - lx)
                                goto overflow1;
                        lx += ret;
                }
        } else if (groupsize == 2) {
                const u16 *ptr2 = buf;

                for (j = 0; j < ngroups; j++) {
                        ret = snprintf(linebuf + lx, linebuflen - lx,
                                       "%s%4.4x", j ? " " : "",
                                       get_unaligned(ptr2 + j));
                        if (ret >= linebuflen - lx)
                                goto overflow1;
                        lx += ret;
                }
        } else {
                /* Single bytes are emitted by hand as two hex digits and a
                 * space.  Each "lx + 2" check keeps room for the character
                 * about to be written plus the terminating NUL.
                 */
                for (j = 0; j < len; j++) {
                        if (linebuflen < lx + 2)
                                goto overflow2;
                        ch = ptr[j];
                        linebuf[lx++] = hex_asc_hi(ch);
                        if (linebuflen < lx + 2)
                                goto overflow2;
                        linebuf[lx++] = hex_asc_lo(ch);
                        if (linebuflen < lx + 2)
                                goto overflow2;
                        linebuf[lx++] = ' ';
                }
                /* Step back over the space written after the last byte. */
                if (j)
                        lx--;
        }
        if (!ascii)
                goto nil;

        /* Pad with spaces up to the ASCII column. */
        while (lx < ascii_column) {
                if (linebuflen < lx + 2)
                        goto overflow2;
                linebuf[lx++] = ' ';
        }
        /* Copy printable ASCII bytes as-is and show every other byte as '.'. */
        for (j = 0; j < len; j++) {
                if (linebuflen < lx + 2)
                        goto overflow2;
                ch = ptr[j];
                linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.';
        }
nil:
        /* Complete output: terminate and return the length written. */
        linebuf[lx] = '\0';
        return lx;
overflow2:
        /* Truncated by the hand-written paths: terminate what was written. */
        linebuf[lx++] = '\0';
overflow1:
        /* Truncated: return the length the full line would have had. */
        return ascii ? ascii_column + len : (groupsize * 2 + 1) * ngroups - 1;
}
EXPORT_SYMBOL(hex_dump_to_buffer);

#ifdef CONFIG_PRINTK
/**
 * print_hex_dump - print a text hex dump to syslog for a binary blob of data
 * @level: kernel log level (e.g. KERN_DEBUG)
 * @prefix_str: string to prefix each line with;
 *  caller supplies trailing spaces for alignment if desired
 * @prefix_type: controls whether prefix of an offset, address, or none
 *  is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE)
 * @rowsize: number of bytes to print per line; must be 16 or 32
 * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1)
 * @buf: data blob to dump
 * @len: number of bytes in the @buf
 * @ascii: include ASCII after the hex output
 *
 * Given a buffer of u8 data, print_hex_dump() prints a hex + ASCII dump
 * to the kernel log at the specified kernel log level, with an optional
 * leading prefix.
 *
 * print_hex_dump() works on one "line" of output at a time, i.e.,
 * 16 or 32 bytes of input data converted to hex + ASCII output.
 * print_hex_dump() iterates over the entire input @buf, breaking it into
 * "line size" chunks to format and print.
 *
 * E.g.:
 *   print_hex_dump(KERN_DEBUG, "raw data: ", DUMP_PREFIX_ADDRESS,
 *                    16, 1, frame->data, frame->len, true);
 *
 * Example output using %DUMP_PREFIX_OFFSET and 1-byte mode:
 * 0009ab42: 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f  @ABCDEFGHIJKLMNO
 * Example output using %DUMP_PREFIX_ADDRESS and 4-byte mode:
 * ffffffff88089af0: 73727170 77767574 7b7a7978 7f7e7d7c  pqrstuvwxyz{|}~.
 */
void print_hex_dump(const char *level, const char *prefix_str, int prefix_type,
                    int rowsize, int groupsize,
                    const void *buf, size_t len, bool ascii)
{
        /* Sized for a 32-byte row: hex digits with separators, the gap
         * before the ASCII column, the ASCII text and the NUL.
         */
        unsigned char line[32 * 3 + 2 + 32 + 1];
        const u8 *bytes = buf;
        int left = len;
        int off;

        /* Any row size other than 16 or 32 falls back to 16. */
        if (rowsize != 16 && rowsize != 32)
                rowsize = 16;

        /* Format and print one row per iteration; the last may be short. */
        for (off = 0; off < len; off += rowsize, left -= rowsize) {
                int chunk = min(left, rowsize);

                hex_dump_to_buffer(bytes + off, chunk, rowsize, groupsize,
                                   line, sizeof(line), ascii);

                if (prefix_type == DUMP_PREFIX_ADDRESS)
                        printk("%s%s%p: %s\n",
                               level, prefix_str, bytes + off, line);
                else if (prefix_type == DUMP_PREFIX_OFFSET)
                        printk("%s%s%.8x: %s\n", level, prefix_str, off, line);
                else
                        printk("%s%s%s\n", level, prefix_str, line);
        }
}
EXPORT_SYMBOL(print_hex_dump);

#endif /* defined(CONFIG_PRINTK) */














































































































































































































































































    2 















    2 


    2 









    1 






    2 

    2 

    2 

    2 























    1 























































































































































































































































































    3 











































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Linux INET6 implementation 
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>        
 */

#ifndef _IP6_FIB_H
#define _IP6_FIB_H

#include <linux/ipv6_route.h>
#include <linux/rtnetlink.h>
#include <linux/spinlock.h>
#include <linux/notifier.h>
#include <net/dst.h>
#include <net/flow.h>
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/inetpeer.h>
#include <net/fib_notifier.h>
#include <linux/indirect_call_wrapper.h>
#include <uapi/linux/bpf.h>

/* FIB6_TABLE_HASHSZ is 256 when CONFIG_IPV6_MULTIPLE_TABLES is enabled and
 * 1 otherwise.
 * NOTE(review): presumably the number of buckets used to hash routing
 * tables by id; the user of this constant is not visible here -- confirm.
 */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
#define FIB6_TABLE_HASHSZ 256
#else
#define FIB6_TABLE_HASHSZ 1
#endif

/* NOTE(review): presumably a debug verbosity level; not referenced in the
 * visible part of this header -- confirm against the .c users.
 */
#define RT6_DEBUG 2

struct rt6_info;
struct fib6_info;

/* Bundle of fc_* parameters describing one route configuration.
 * A pointer to this structure is what fib6_nh_init() takes as @cfg.
 * NOTE(review): presumably filled in from a netlink/ioctl route request;
 * the producers are not visible in this header -- confirm.
 */
struct fib6_config {
        u32                fc_table;
        u32                fc_metric;
        int                fc_dst_len;
        int                fc_src_len;
        int                fc_ifindex;
        u32                fc_flags;
        u32                fc_protocol;
        u16                fc_type;        /* only 8 bits are used */
        /* single-bit flags packed into one u16; the remaining 14 bits are
         * reserved via __unused
         */
        u16                fc_delete_all_nh : 1,
                        fc_ignore_dev_down:1,
                        __unused : 14;
        u32                fc_nh_id;

        struct in6_addr        fc_dst;
        struct in6_addr        fc_src;
        struct in6_addr        fc_prefsrc;
        struct in6_addr        fc_gateway;

        unsigned long        fc_expires;
        /* attribute pointer + length pairs (fc_mx/fc_mx_len, fc_mp/fc_mp_len);
         * NOTE(review): presumably metrics and multipath payloads -- confirm.
         */
        struct nlattr        *fc_mx;
        int                fc_mx_len;
        int                fc_mp_len;
        struct nlattr        *fc_mp;

        struct nl_info        fc_nlinfo;
        struct nlattr        *fc_encap;
        u16                fc_encap_type;
        bool                fc_is_fdb;
};

/* One node of the fib6 tree.
 *
 * Nodes are linked through RCU-annotated parent/left/right pointers, plus a
 * subtree pointer when CONFIG_IPV6_SUBTREES is enabled (see FIB6_SUBTREE()).
 * @leaf heads the chain of fib6_info entries that is walked via
 * fib6_info::fib6_next by for_each_fib6_node_rt_rcu().
 * @fn_sernum is the value fib6_get_cookie_safe() hands out as a cookie.
 */
struct fib6_node {
        struct fib6_node __rcu        *parent;
        struct fib6_node __rcu        *left;
        struct fib6_node __rcu        *right;
#ifdef CONFIG_IPV6_SUBTREES
        struct fib6_node __rcu        *subtree;
#endif
        struct fib6_info __rcu        *leaf;

        __u16                        fn_bit;                /* bit key */
        /* NOTE(review): presumably holds the RTN_* bits defined in this
         * header -- confirm.
         */
        __u16                        fn_flags;
        int                        fn_sernum;
        struct fib6_info __rcu        *rr_ptr;
        struct rcu_head                rcu;
};

/* Pair of integers named timeout/more.
 * NOTE(review): presumably the argument block handed to the garbage
 * collector (see fib6_run_gc()); field semantics are not visible in this
 * header -- confirm in the implementation.
 */
struct fib6_gc_args {
        int                        timeout;
        int                        more;
};

#ifndef CONFIG_IPV6_SUBTREES
/* Without CONFIG_IPV6_SUBTREES a node has no subtree member, so the
 * accessor always evaluates to NULL (and does not evaluate @fn).
 */
#define FIB6_SUBTREE(fn)        NULL

/* Variant built when CONFIG_IPV6_SUBTREES is not defined: always reports
 * false and ignores @net.
 */
static inline bool fib6_routes_require_src(const struct net *net)
{
        return false;
}

/* No-op counterparts of the counter helpers, used when CONFIG_IPV6_SUBTREES
 * is not defined.
 */
static inline void fib6_routes_require_src_inc(struct net *net) {}
static inline void fib6_routes_require_src_dec(struct net *net) {}

#else

/* Returns true while the per-netns counter net->ipv6.fib6_routes_require_src
 * is positive, i.e. after more calls to fib6_routes_require_src_inc() than to
 * fib6_routes_require_src_dec().
 */
static inline bool fib6_routes_require_src(const struct net *net)
{
        return net->ipv6.fib6_routes_require_src > 0;
}

/* Bumps the per-netns counter tested by fib6_routes_require_src().
 * The increment is a plain (non-atomic) ++.
 * NOTE(review): callers presumably serialize updates -- confirm.
 */
static inline void fib6_routes_require_src_inc(struct net *net)
{
        net->ipv6.fib6_routes_require_src++;
}

/* Drops the per-netns counter tested by fib6_routes_require_src().
 * The decrement is a plain (non-atomic) --.
 * NOTE(review): callers presumably serialize updates -- confirm.
 */
static inline void fib6_routes_require_src_dec(struct net *net)
{
        net->ipv6.fib6_routes_require_src--;
}

/* Fetch (fn)->subtree. The protection condition passed to
 * rcu_dereference_protected() is the constant 1, so no lockdep check is
 * expressed here.
 */
#define FIB6_SUBTREE(fn)        (rcu_dereference_protected((fn)->subtree, 1))
#endif

/*
 *        routing information
 *
 */

/* An IPv6 address together with a prefix length.
 * fib6_requires_src() treats plen > 0 in a route's source key as "a source
 * prefix is set".
 */
struct rt6key {
        struct in6_addr        addr;
        int                plen;
};

struct fib6_table;

/* One hash bucket: an hlist head plus an integer depth.
 * NOTE(review): depth presumably counts the entries on @chain and is bounded
 * by FIB6_MAX_DEPTH -- confirm in the exception-table code.
 */
struct rt6_exception_bucket {
        struct hlist_head        chain;
        int                        depth;
};

/* Entry that can be linked into an hlist through @hlist and carries a
 * pointer to an rt6_info, a timestamp and an rcu_head.
 * NOTE(review): presumably chained on rt6_exception_bucket::chain and freed
 * via RCU -- confirm in the exception-table code.
 */
struct rt6_exception {
        struct hlist_node        hlist;
        struct rt6_info                *rt6i;
        unsigned long                stamp;
        struct rcu_head                rcu;
};

/* FIB6_EXCEPTION_BUCKET_SIZE evaluates to 1 << 10 = 1024.
 * NOTE(review): FIB6_MAX_DEPTH presumably limits
 * rt6_exception_bucket::depth -- confirm at the users.
 */
#define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10
#define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT)
#define FIB6_MAX_DEPTH 5

/* IPv6 nexthop: embeds the generic fib_nh_common and adds IPv6-specific
 * state. Instances are initialised by fib6_nh_init() and torn down by
 * fib6_nh_release() (prototypes in this header).
 */
struct fib6_nh {
        struct fib_nh_common        nh_common;

#ifdef CONFIG_IPV6_ROUTER_PREF
        /* only present with CONFIG_IPV6_ROUTER_PREF */
        unsigned long                last_probe;
#endif

        /* per-CPU slot holding a pointer to an rt6_info */
        struct rt6_info * __percpu *rt6i_pcpu;
        /* RCU-annotated pointer to exception bucket(s) */
        struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
};

/* One IPv6 FIB entry.
 *
 * Entries hanging off the same fib6_node are chained through @fib6_next
 * (see for_each_fib6_node_rt_rcu()). Lifetime is controlled by @fib6_ref via
 * fib6_info_hold(), fib6_info_hold_safe() and fib6_info_release(); the final
 * release frees the entry through @rcu.
 */
struct fib6_info {
        struct fib6_table                *fib6_table;
        struct fib6_info __rcu                *fib6_next;
        /* NULL when the entry is not (or no longer) in a table, see
         * fib6_add_gc_list()
         */
        struct fib6_node __rcu                *fib6_node;

        /* Multipath routes:
         * siblings is a list of fib6_info that have the same metric/weight,
         * destination, but not the same gateway. nsiblings is just a cache
         * to speed up lookup.
         */
        union {
                struct list_head        fib6_siblings;
                struct list_head        nh_list;
        };
        unsigned int                        fib6_nsiblings;

        refcount_t                        fib6_ref;
        /* only meaningful while RTF_EXPIRES is set in fib6_flags, see
         * fib6_set_expires() / fib6_check_expired()
         */
        unsigned long                        expires;

        /* linkage on fib6_table::tb6_gc_hlist, see fib6_add_gc_list() */
        struct hlist_node                gc_link;

        struct dst_metrics                *fib6_metrics;
/* shorthand for the RTAX_MTU slot of the metrics array */
#define fib6_pmtu                fib6_metrics->metrics[RTAX_MTU-1]

        struct rt6key                        fib6_dst;
        u32                                fib6_flags;
        struct rt6key                        fib6_src;
        struct rt6key                        fib6_prefsrc;

        u32                                fib6_metric;
        u8                                fib6_protocol;
        u8                                fib6_type;

        /* NOTE(review): presumably the hardware-offload state written by
         * fib6_info_hw_flags_set() -- confirm.
         */
        u8                                offload;
        u8                                trap;
        u8                                offload_failed;

        u8                                should_flush:1,
                                        dst_nocount:1,
                                        dst_nopolicy:1,
                                        fib6_destroying:1,
                                        unused:4;

        struct rcu_head                        rcu;
        struct nexthop                        *nh;
        /* flexible array member; NOTE(review): presumably only allocated
         * when fib6_info_alloc() is called with with_fib6_nh -- confirm.
         */
        struct fib6_nh                        fib6_nh[];
};

/* IPv6 route entry built around a dst_entry.
 *
 * @dst must remain the first member: ip6_rt_put() enforces offset 0 with a
 * BUILD_BUG_ON, and dst_rt6_info() converts a dst pointer back with
 * container_of_const().
 */
struct rt6_info {
        struct dst_entry                dst;
        /* RCU-protected back pointer to the FIB entry; read under
         * rcu_read_lock() by rt6_get_cookie() and rt6_get_prefsrc()
         */
        struct fib6_info __rcu                *from;
        /* when non-zero, rt6_get_cookie() returns this value directly */
        int                                sernum;

        struct rt6key                        rt6i_dst;
        struct rt6key                        rt6i_src;
        struct in6_addr                        rt6i_gateway;
        struct inet6_dev                *rt6i_idev;
        u32                                rt6i_flags;

        /* more non-fragment space at head required */
        unsigned short                        rt6i_nfheader_len;
};

/* Result record passed by pointer to fib6_lookup(), fib6_table_lookup() and
 * fib6_select_path() (see their prototypes in this header).
 * NOTE(review): fib6_flags/fib6_type presumably mirror the fields of the
 * selected fib6_info -- confirm in the lookup code.
 */
struct fib6_result {
        struct fib6_nh                *nh;
        struct fib6_info        *f6i;
        u32                        fib6_flags;
        u8                        fib6_type;
        struct rt6_info                *rt6;
};

/* Iterate over the fib6_info chain starting at (fn)->leaf and following
 * fib6_next, using rcu_dereference() for every step. The loop variable is a
 * caller-declared variable that must be named "rt".
 */
#define for_each_fib6_node_rt_rcu(fn)                                        \
        for (rt = rcu_dereference((fn)->leaf); rt;                        \
             rt = rcu_dereference(rt->fib6_next))

/* Iterate from (w)->leaf along fib6_next; the walker's leaf is read as a
 * plain pointer and each next pointer with rcu_dereference_protected(.., 1),
 * i.e. without a lockdep condition. Also relies on a caller-declared "rt".
 */
#define for_each_fib6_walker_rt(w)                                        \
        for (rt = (w)->leaf; rt;                                        \
             rt = rcu_dereference_protected(rt->fib6_next, 1))

/* Map a pointer to the embedded dst_entry back to its rt6_info. */
#define dst_rt6_info(_ptr) container_of_const(_ptr, struct rt6_info, dst)

/* Return the rt6i_idev pointer of the rt6_info that embeds @dst.
 * @dst is converted with dst_rt6_info(), so it must be the dst member of a
 * struct rt6_info.
 */
static inline struct inet6_dev *ip6_dst_idev(const struct dst_entry *dst)
{
        return dst_rt6_info(dst)->rt6i_idev;
}

/* True when @rt carries a source key with a non-zero prefix length
 * (fib6_src.plen > 0).
 */
static inline bool fib6_requires_src(const struct fib6_info *rt)
{
        return rt->fib6_src.plen > 0;
}

/* Clear the expiry of @f6i: drops RTF_EXPIRES from fib6_flags first and then
 * resets the stored expiry time to 0.
 *
 * The callers should hold f6i->fib6_table->tb6_lock if a route has ever
 * been added to a table before.
 */
static inline void fib6_clean_expires(struct fib6_info *f6i)
{
        f6i->fib6_flags &= ~RTF_EXPIRES;
        f6i->expires = 0;
}

/* Arm the expiry of @f6i: stores @expires (compared against jiffies by
 * fib6_check_expired()) and then sets RTF_EXPIRES in fib6_flags.
 *
 * The callers should hold f6i->fib6_table->tb6_lock if a route has ever
 * been added to a table before.
 */
static inline void fib6_set_expires(struct fib6_info *f6i,
                                    unsigned long expires)
{
        f6i->expires = expires;
        f6i->fib6_flags |= RTF_EXPIRES;
}

/* Report whether @f6i has passed its expiry time.
 *
 * Entries without RTF_EXPIRES never expire; for the others the stored
 * expiry is compared against the current jiffies value.
 */
static inline bool fib6_check_expired(const struct fib6_info *f6i)
{
        if (!(f6i->fib6_flags & RTF_EXPIRES))
                return false;

        return time_after(jiffies, f6i->expires);
}

/* Fetch the serial number of the tree node @f6i is attached to.
 *
 * On success the node's fn_sernum is written to @cookie and true is
 * returned. If @f6i has no node, @cookie is left untouched and false is
 * returned. The node pointer is read with rcu_dereference().
 */
static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i,
                                        u32 *cookie)
{
        struct fib6_node *node = rcu_dereference(f6i->fib6_node);

        if (!node)
                return false;

        *cookie = READ_ONCE(node->fn_sernum);
        /* pairs with smp_wmb() in __fib6_update_sernum_upto_root() */
        smp_rmb();

        return true;
}

/* Compute the cookie for @rt.
 *
 * A non-zero rt->sernum is returned as is. Otherwise the cookie is taken,
 * under rcu_read_lock(), from the node of the FIB entry @rt originates
 * from; it stays 0 when there is no such entry or the entry has no node.
 */
static inline u32 rt6_get_cookie(const struct rt6_info *rt)
{
        u32 cookie = 0;

        if (!rt->sernum) {
                struct fib6_info *origin;

                rcu_read_lock();

                origin = rcu_dereference(rt->from);
                if (origin)
                        fib6_get_cookie_safe(origin, &cookie);

                rcu_read_unlock();

                return cookie;
        }

        return rt->sernum;
}

/* Drop a reference on @rt by releasing its embedded dst_entry.
 * @rt may be NULL (see the comment in the body).
 */
static inline void ip6_rt_put(struct rt6_info *rt)
{
        /* dst_release() accepts a NULL parameter.
         * We rely on dst being first structure in struct rt6_info
         */
        /* with dst at offset 0, &rt->dst of a NULL rt is itself NULL */
        BUILD_BUG_ON(offsetof(struct rt6_info, dst) != 0);
        dst_release(&rt->dst);
}

struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh);
void fib6_info_destroy_rcu(struct rcu_head *head);

/* Take an additional reference on @f6i (unconditional refcount_inc() on
 * fib6_ref). @f6i must not be NULL.
 */
static inline void fib6_info_hold(struct fib6_info *f6i)
{
        refcount_inc(&f6i->fib6_ref);
}

/* Try to take a reference on @f6i using refcount_inc_not_zero(); the
 * boolean result of that call is returned to the caller, who must check it.
 */
static inline bool fib6_info_hold_safe(struct fib6_info *f6i)
{
        return refcount_inc_not_zero(&f6i->fib6_ref);
}

/* Drop one reference on @f6i; a NULL @f6i is ignored.
 *
 * When the last reference goes away the entry is queued for destruction
 * through call_rcu_hurry() with fib6_info_destroy_rcu() as callback. A debug
 * warning fires if the entry is still linked on a gc list at that point.
 */
static inline void fib6_info_release(struct fib6_info *f6i)
{
        if (!f6i)
                return;

        if (!refcount_dec_and_test(&f6i->fib6_ref))
                return;

        DEBUG_NET_WARN_ON_ONCE(!hlist_unhashed(&f6i->gc_link));
        call_rcu_hurry(&f6i->rcu, fib6_info_destroy_rcu);
}

/* States stored in fib6_walker::state.
 * FWS_S only exists with CONFIG_IPV6_SUBTREES, so the numeric values of the
 * remaining enumerators depend on that option.
 * NOTE(review): the suffixes presumably stand for Subtree/Left/Right/
 * Current/Up -- confirm in the walker implementation.
 */
enum fib6_walk_state {
#ifdef CONFIG_IPV6_SUBTREES
        FWS_S,
#endif
        FWS_L,
        FWS_R,
        FWS_C,
        FWS_U
};

/* Tree-walk cursor.
 * @leaf is the starting point used by for_each_fib6_walker_rt(); @func is a
 * callback that receives the walker itself and returns an int; @args is an
 * opaque pointer for that callback.
 * NOTE(review): semantics of skip/count/skip_in_node are not visible in this
 * header -- confirm in the walker implementation.
 */
struct fib6_walker {
        struct list_head lh;
        struct fib6_node *root, *node;
        struct fib6_info *leaf;
        enum fib6_walk_state state;
        unsigned int skip;
        unsigned int count;
        unsigned int skip_in_node;
        int (*func)(struct fib6_walker *);
        void *args;
};

/* Counters describing the IPv6 FIB; see the per-field comments. */
struct rt6_statistics {
        __u32                fib_nodes;                /* all fib6 nodes */
        __u32                fib_route_nodes;        /* intermediate nodes */
        __u32                fib_rt_entries;                /* rt entries in fib table */
        __u32                fib_rt_cache;                /* cached rt entries in exception table */
        __u32                fib_discarded_routes;        /* total number of routes deleted */

        /* The following stat is not protected by any lock */
        atomic_t        fib_rt_alloc;                /* total number of routes allocated */
};

/* Single-bit flag values.
 * NOTE(review): presumably stored in fib6_node::fn_flags; RTN_TL_ROOT
 * presumably marks the top-level root -- confirm at the users.
 */
#define RTN_TL_ROOT        0x0001
#define RTN_ROOT        0x0002                /* tree root node                */
#define RTN_RTINFO        0x0004                /* node with valid routing info        */

/*
 *        priority levels (or metrics)
 *
 */


/* One IPv6 routing table.
 * @tb6_lock is the lock the expiry and gc-list helpers in this header ask
 * callers to hold; @tb6_root is the embedded root node of the table's tree;
 * @tb6_gc_hlist collects entries added by fib6_add_gc_list().
 */
struct fib6_table {
        struct hlist_node        tb6_hlist;
        u32                        tb6_id;
        spinlock_t                tb6_lock;
        struct fib6_node        tb6_root;
        struct inet_peer_base        tb6_peers;
        /* NOTE(review): presumably holds RT6_TABLE_HAS_DFLT_ROUTER -- confirm */
        unsigned int                flags;
        unsigned int                fib_seq;
        struct hlist_head       tb6_gc_hlist;        /* GC candidates */
#define RT6_TABLE_HAS_DFLT_ROUTER        BIT(0)
};

/* IPv6 table ids expressed in terms of the generic RT_TABLE_* values.
 * DFLT, INFO and PREFIX are all aliases of RT6_TABLE_MAIN.
 */
#define RT6_TABLE_UNSPEC        RT_TABLE_UNSPEC
#define RT6_TABLE_MAIN                RT_TABLE_MAIN
#define RT6_TABLE_DFLT                RT6_TABLE_MAIN
#define RT6_TABLE_INFO                RT6_TABLE_MAIN
#define RT6_TABLE_PREFIX        RT6_TABLE_MAIN

/* With multiple tables the valid id range is 1..RT_TABLE_MAX and the local
 * table is distinct; otherwise the range collapses to the main table and
 * RT6_TABLE_LOCAL aliases it.
 */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
#define FIB6_TABLE_MIN                1
#define FIB6_TABLE_MAX                RT_TABLE_MAX
#define RT6_TABLE_LOCAL                RT_TABLE_LOCAL
#else
#define FIB6_TABLE_MIN                RT_TABLE_MAIN
#define FIB6_TABLE_MAX                FIB6_TABLE_MIN
#define RT6_TABLE_LOCAL                RT6_TABLE_MAIN
#endif

/* Signature of a per-table lookup callback: (net, table, flow, skb, flags)
 * returning an rt6_info pointer. fib6_rule_lookup() takes one as its last
 * argument and pol_lookup_func() invokes one.
 */
typedef struct rt6_info *(*pol_lookup_t)(struct net *,
                                         struct fib6_table *,
                                         struct flowi6 *,
                                         const struct sk_buff *, int);

/* Notifier payload that starts with the generic fib_notifier_info and adds
 * the affected fib6_info plus a sibling count.
 * NOTE(review): "must be first" presumably allows converting between the
 * generic info pointer and this structure -- confirm at the users.
 */
struct fib6_entry_notifier_info {
        struct fib_notifier_info info; /* must be first */
        struct fib6_info *rt;
        unsigned int nsiblings;
};

/*
 *        exported functions
 */

struct fib6_table *fib6_get_table(struct net *net, u32 id);
struct fib6_table *fib6_new_table(struct net *net, u32 id);
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
                                   const struct sk_buff *skb,
                                   int flags, pol_lookup_t lookup);

/* called with rcu lock held; the outcome is reported through the int return
 * value (the prototype returns int, not an error pointer);
 * caller needs to select path
 */
int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
                struct fib6_result *res, int flags);

/* called with rcu lock held; caller needs to select path */
int fib6_table_lookup(struct net *net, struct fib6_table *table,
                      int oif, struct flowi6 *fl6, struct fib6_result *res,
                      int strict);

void fib6_select_path(const struct net *net, struct fib6_result *res,
                      struct flowi6 *fl6, int oif, bool have_oif_match,
                      const struct sk_buff *skb, int strict);
struct fib6_node *fib6_node_lookup(struct fib6_node *root,
                                   const struct in6_addr *daddr,
                                   const struct in6_addr *saddr);

struct fib6_node *fib6_locate(struct fib6_node *root,
                              const struct in6_addr *daddr, int dst_len,
                              const struct in6_addr *saddr, int src_len,
                              bool exact_match);

void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg),
                    void *arg);
void fib6_clean_all_skip_notify(struct net *net,
                                int (*func)(struct fib6_info *, void *arg),
                                void *arg);

int fib6_add(struct fib6_node *root, struct fib6_info *rt,
             struct nl_info *info, struct netlink_ext_ack *extack);
int fib6_del(struct fib6_info *rt, struct nl_info *info);

static inline
void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
{
        const struct fib6_info *from;

        rcu_read_lock();

        from = rcu_dereference(rt->from);
        if (from)
                *addr = from->fib6_prefsrc.addr;
        else
                *addr = in6addr_any;

        rcu_read_unlock();
}

int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
                 struct fib6_config *cfg, gfp_t gfp_flags,
                 struct netlink_ext_ack *extack);
void fib6_nh_release(struct fib6_nh *fib6_nh);
void fib6_nh_release_dsts(struct fib6_nh *fib6_nh);

int call_fib6_entry_notifiers(struct net *net,
                              enum fib_event_type event_type,
                              struct fib6_info *rt,
                              struct netlink_ext_ack *extack);
int call_fib6_multipath_entry_notifiers(struct net *net,
                                        enum fib_event_type event_type,
                                        struct fib6_info *rt,
                                        unsigned int nsiblings,
                                        struct netlink_ext_ack *extack);
int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt);
void fib6_rt_update(struct net *net, struct fib6_info *rt,
                    struct nl_info *info);
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
                     unsigned int flags);

void fib6_run_gc(unsigned long expires, struct net *net, bool force);

void fib6_gc_cleanup(void);

int fib6_init(void);

/* Put @f6i on its table's list of GC candidates unless it is already
 * linked there.
 *
 * The callers should hold f6i->fib6_table->tb6_lock.
 */
static inline void fib6_add_gc_list(struct fib6_info *f6i)
{
        /* A NULL fib6_node means the entry is not in the table (never
         * inserted, or already removed).
         *
         * The caller may have found f6i in the table before it took the
         * tb6_lock, so the entry can have left the table in between. This
         * check keeps such an entry off the gc list.
         */
        if (!rcu_dereference_protected(f6i->fib6_node,
                                       lockdep_is_held(&f6i->fib6_table->tb6_lock)))
                return;

        /* already linked: nothing to do */
        if (!hlist_unhashed(&f6i->gc_link))
                return;

        hlist_add_head(&f6i->gc_link, &f6i->fib6_table->tb6_gc_hlist);
}

/* Unlink @f6i from the gc list; does nothing when it is not linked.
 *
 * The callers should hold f6i->fib6_table->tb6_lock.
 */
static inline void fib6_remove_gc_list(struct fib6_info *f6i)
{
        if (hlist_unhashed(&f6i->gc_link))
                return;

        hlist_del_init(&f6i->gc_link);
}

/* Iterator state that embeds a seq_net_private and a fib6_walker.
 * NOTE(review): presumably the private data behind ipv6_route_seq_ops
 * (declared below) -- confirm in the seq_file implementation.
 */
struct ipv6_route_iter {
        struct seq_net_private p;
        struct fib6_walker w;
        loff_t skip;
        struct fib6_table *tbl;
        int sernum;
};

extern const struct seq_operations ipv6_route_seq_ops;

int call_fib6_notifier(struct notifier_block *nb,
                       enum fib_event_type event_type,
                       struct fib_notifier_info *info);
int call_fib6_notifiers(struct net *net, enum fib_event_type event_type,
                        struct fib_notifier_info *info);

int __net_init fib6_notifier_init(struct net *net);
void __net_exit fib6_notifier_exit(struct net *net);

unsigned int fib6_tables_seq_read(struct net *net);
int fib6_tables_dump(struct net *net, struct notifier_block *nb,
                     struct netlink_ext_ack *extack);

void fib6_update_sernum(struct net *net, struct fib6_info *rt);
void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt);
void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i);

void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val);
/* Test whether bit @metric is set in the RTAX_LOCK slot of the metrics
 * array attached to @f6i.
 */
static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric)
{
        const int bit = 1 << metric;

        return (f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & bit) != 0;
}
void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
                            bool offload, bool trap, bool offload_failed);

/* Only defined when IPv6 is built in and CONFIG_BPF_SYSCALL is enabled.
 * Carries two pointers wrapped in __bpf_md_ptr(): iterator metadata and the
 * fib6_info being visited.
 * NOTE(review): presumably the context type of the ipv6_route BPF iterator
 * -- confirm at the registration site.
 */
#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
struct bpf_iter__ipv6_route {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        __bpf_md_ptr(struct fib6_info *, rt);
};
#endif

INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_output(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags));
INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_input(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags));
INDIRECT_CALLABLE_DECLARE(struct rt6_info *__ip6_route_redirect(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags));
INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags));
/* Invoke @lookup with (net, table, fl6, skb, flags) and return its result.
 * The call goes through INDIRECT_CALL_4() with the four ip6 route lookup
 * functions declared above as candidate targets.
 * NOTE(review): the macro presumably turns a match against one of the
 * candidates into a direct call -- see linux/indirect_call_wrapper.h.
 */
static inline struct rt6_info *pol_lookup_func(pol_lookup_t lookup,
                                                struct net *net,
                                                struct fib6_table *table,
                                                struct flowi6 *fl6,
                                                const struct sk_buff *skb,
                                                int flags)
{
        return INDIRECT_CALL_4(lookup,
                               ip6_pol_route_output,
                               ip6_pol_route_input,
                               ip6_pol_route_lookup,
                               __ip6_route_redirect,
                               net, table, fl6, skb, flags);
}

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Return the per-netns fib6_has_custom_rules value of @net
 * (CONFIG_IPV6_MULTIPLE_TABLES build).
 */
static inline bool fib6_has_custom_rules(const struct net *net)
{
        return net->ipv6.fib6_has_custom_rules;
}

int fib6_rules_init(void);
void fib6_rules_cleanup(void);
bool fib6_rule_default(const struct fib_rule *rule);
int fib6_rules_dump(struct net *net, struct notifier_block *nb,
                    struct netlink_ext_ack *extack);
unsigned int fib6_rules_seq_read(struct net *net);

/* Run the flow dissector on @skb ahead of the rule lookup, if required.
 *
 * Returns false without touching @flkeys or @fl6 when
 * net->ipv6.fib6_rules_require_fldissect is not set. Otherwise @flkeys is
 * zeroed and filled by __skb_flow_dissect() (with
 * FLOW_DISSECTOR_F_STOP_AT_ENCAP), the dissected ports and IP protocol are
 * copied into @fl6, and true is returned.
 */
static inline bool fib6_rules_early_flow_dissect(struct net *net,
                                                 struct sk_buff *skb,
                                                 struct flowi6 *fl6,
                                                 struct flow_keys *flkeys)
{
        if (!net->ipv6.fib6_rules_require_fldissect)
                return false;

        memset(flkeys, 0, sizeof(*flkeys));
        __skb_flow_dissect(net, skb, &flow_keys_dissector, flkeys,
                           NULL, 0, 0, 0, FLOW_DISSECTOR_F_STOP_AT_ENCAP);

        fl6->flowi6_proto = flkeys->basic.ip_proto;
        fl6->fl6_sport = flkeys->ports.src;
        fl6->fl6_dport = flkeys->ports.dst;

        return true;
}
#else
/* Variant built without CONFIG_IPV6_MULTIPLE_TABLES: always false, @net is
 * ignored.
 */
static inline bool fib6_has_custom_rules(const struct net *net)
{
        return false;
}
/* Variant built without CONFIG_IPV6_MULTIPLE_TABLES: does nothing and
 * always returns 0.
 */
static inline int fib6_rules_init(void)
{
        return 0;
}
/* Variant built without CONFIG_IPV6_MULTIPLE_TABLES: intentionally empty. */
static inline void fib6_rules_cleanup(void)
{
}
/* Variant built without CONFIG_IPV6_MULTIPLE_TABLES: every rule is reported
 * as default (always true, @rule is ignored).
 */
static inline bool fib6_rule_default(const struct fib_rule *rule)
{
        return true;
}
/* Variant built without CONFIG_IPV6_MULTIPLE_TABLES: does nothing and
 * returns 0; all arguments are ignored.
 */
static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb,
                                  struct netlink_ext_ack *extack)
{
        return 0;
}
/* Variant built without CONFIG_IPV6_MULTIPLE_TABLES: always returns 0,
 * @net is ignored.
 */
static inline unsigned int fib6_rules_seq_read(struct net *net)
{
        return 0;
}
/* Variant built without CONFIG_IPV6_MULTIPLE_TABLES: never dissects, leaves
 * @fl6 and @flkeys untouched and returns false.
 */
static inline bool fib6_rules_early_flow_dissect(struct net *net,
                                                 struct sk_buff *skb,
                                                 struct flowi6 *fl6,
                                                 struct flow_keys *flkeys)
{
        return false;
}
#endif
#endif























































































    3 


























    3 































    2 
    2 






















































































    1 





    1 



























    1 
    1 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/anon_inodes.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 *  Thanks to Arnd Bergmann for code review and suggestions.
 *  More changes for Thomas Gleixner suggestions.
 *
 */

#include <linux/cred.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/anon_inodes.h>
#include <linux/pseudo_fs.h>

#include <linux/uaccess.h>

static struct vfsmount *anon_inode_mnt __ro_after_init;
static struct inode *anon_inode_inode __ro_after_init;

/*
 * anon_inodefs_dname() is called from d_path().
 *
 * It formats the name of @dentry as "anon_inode:<name>" into @buffer (of
 * size @buflen) by way of dynamic_dname() and returns that helper's result.
 */
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
        return dynamic_dname(buffer, buflen, "anon_inode:%s",
                                dentry->d_name.name);
}

/*
 * Dentry operations installed by anon_inodefs_init_fs_context(); only the
 * d_dname hook is provided.
 */
static const struct dentry_operations anon_inodefs_dentry_operations = {
        .d_dname        = anon_inodefs_dname,
};

static int anon_inodefs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);
        if (!ctx)
                return -ENOMEM;
        ctx->dops = &anon_inodefs_dentry_operations;
        return 0;
}

/*
 * Filesystem type "anon_inodefs": contexts are initialised by
 * anon_inodefs_init_fs_context() and superblocks are torn down with
 * kill_anon_super().
 */
static struct file_system_type anon_inode_fs_type = {
        .name                = "anon_inodefs",
        .init_fs_context = anon_inodefs_init_fs_context,
        .kill_sb        = kill_anon_super,
};

/*
 * Allocate a fresh anonymous inode on the anon_inode mount's superblock,
 * clear S_PRIVATE on it and run the security_inode_init_security_anon()
 * hook with @name and @context_inode.
 *
 * Returns the new inode, the error pointer from alloc_anon_inode(), or an
 * ERR_PTR() carrying the hook's error (the inode is dropped with iput() in
 * that case).
 */
static struct inode *anon_inode_make_secure_inode(
        const char *name,
        const struct inode *context_inode)
{
        const struct qstr qname = QSTR_INIT(name, strlen(name));
        struct inode *inode;
        int ret;

        inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
        if (IS_ERR(inode))
                return inode;

        inode->i_flags &= ~S_PRIVATE;

        ret = security_inode_init_security_anon(inode, &qname, context_inode);
        if (!ret)
                return inode;

        /* the security hook refused the inode: give it back */
        iput(inode);
        return ERR_PTR(ret);
}

/*
 * Common worker behind the anon_inode_getfile*() entry points.
 *
 * @name:          name handed to alloc_file_pseudo() (and to the security
 *                 hook when a new inode is made)
 * @fops:          file operations for the new file; a reference on
 *                 fops->owner is taken and dropped again on every error path
 * @priv:          stored in the new file's private_data
 * @flags:         only the O_ACCMODE and O_NONBLOCK bits are passed on
 * @context_inode: forwarded to anon_inode_make_secure_inode()
 * @make_inode:    true to create a dedicated inode, false to share the
 *                 singleton anon_inode_inode
 *
 * Returns the new file or an error pointer: -ENOENT when the module
 * reference cannot be taken, -ENODEV when the singleton inode is an error
 * pointer, otherwise whatever the inode or file allocation reported.
 */
static struct file *__anon_inode_getfile(const char *name,
                                         const struct file_operations *fops,
                                         void *priv, int flags,
                                         const struct inode *context_inode,
                                         bool make_inode)
{
        struct file *filp;
        struct inode *ino;

        if (fops->owner && !try_module_get(fops->owner))
                return ERR_PTR(-ENOENT);

        if (!make_inode) {
                ino = anon_inode_inode;
                if (IS_ERR(ino)) {
                        filp = ERR_PTR(-ENODEV);
                        goto out_put_module;
                }
                /*
                 * The shared anon inode always has a reference count above
                 * zero, which makes ihold() safe here.
                 */
                ihold(ino);
        } else {
                ino = anon_inode_make_secure_inode(name, context_inode);
                if (IS_ERR(ino)) {
                        filp = ERR_CAST(ino);
                        goto out_put_module;
                }
        }

        filp = alloc_file_pseudo(ino, anon_inode_mnt, name,
                                 flags & (O_ACCMODE | O_NONBLOCK), fops);
        if (IS_ERR(filp))
                goto out_put_inode;

        filp->f_mapping = ino->i_mapping;
        filp->private_data = priv;

        return filp;

out_put_inode:
        iput(ino);
out_put_module:
        module_put(fops->owner);
        return filp;
}

/**
 * anon_inode_getfile - creates a new file instance by hooking it up to an
 *                      anonymous inode, and a dentry that describes the
 *                      "class" of the file
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 *
 * Creates a new file by hooking it on a single inode. This is useful for files
 * that do not need to have a full-fledged inode in order to operate correctly.
 * All the files created with anon_inode_getfile() will share a single inode,
 * hence saving memory and avoiding code duplication for the file/inode/dentry
 * setup.  Returns the newly created file* or an error pointer.
 */
struct file *anon_inode_getfile(const char *name,
                                const struct file_operations *fops,
                                void *priv, int flags)
{
        /* no context inode, and reuse the shared inode (make_inode == false) */
        return __anon_inode_getfile(name, fops, priv, flags, NULL, false);
}
EXPORT_SYMBOL_GPL(anon_inode_getfile);

/**
 * anon_inode_getfile_fmode - creates a new file instance by hooking it up to an
 *                      anonymous inode, and a dentry that describes the "class"
 *                      of the file
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 * @f_mode:  [in]    fmode
 *
 * Creates a new file by hooking it on a single inode. This is useful for files
 * that do not need to have a full-fledged inode in order to operate correctly.
 * All the files created with anon_inode_getfile() will share a single inode,
 * hence saving memory and avoiding code duplication for the file/inode/dentry
 * setup. Allows setting the fmode. Returns the newly created file* or an error
 * pointer.
 */
struct file *anon_inode_getfile_fmode(const char *name,
                                const struct file_operations *fops,
                                void *priv, int flags, fmode_t f_mode)
{
        struct file *new_file;

        new_file = __anon_inode_getfile(name, fops, priv, flags, NULL, false);

        /* Error pointers are handed back untouched. */
        if (IS_ERR(new_file))
                return new_file;

        /* OR the caller's extra mode bits into the freshly created file. */
        new_file->f_mode |= f_mode;
        return new_file;
}
EXPORT_SYMBOL_GPL(anon_inode_getfile_fmode);

/**
 * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new
 *                             !S_PRIVATE anon inode rather than reuse the
 *                             singleton anon inode and calls the
 *                             inode_init_security_anon() LSM hook.
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 * @context_inode:
 *           [in]    the logical relationship with the new inode (optional)
 *
 * Create a new anonymous inode and file pair.  This can be done for two
 * reasons:
 *
 * - for the inode to have its own security context, so that LSMs can enforce
 *   policy on the inode's creation;
 *
 * - if the caller needs a unique inode, for example in order to customize
 *   the size returned by fstat()
 *
 * The LSM may use @context_inode in inode_init_security_anon(), but a
 * reference to it is not held.
 *
 * Returns the newly created file* or an error pointer.
 */
struct file *anon_inode_create_getfile(const char *name,
                                       const struct file_operations *fops,
                                       void *priv, int flags,
                                       const struct inode *context_inode)
{
        return __anon_inode_getfile(name, fops, priv, flags,
                                    context_inode, true);
}
EXPORT_SYMBOL_GPL(anon_inode_create_getfile);

/*
 * Common helper behind the *_getfd() entry points: reserve a descriptor,
 * create the anon-inode file, and install it.  Returns the new descriptor
 * or a negative error code; on failure the reserved descriptor is released.
 */
static int __anon_inode_getfd(const char *name,
                              const struct file_operations *fops,
                              void *priv, int flags,
                              const struct inode *context_inode,
                              bool make_inode)
{
        struct file *file;
        int fd;

        /* Reserve the descriptor first so that nothing is visible on failure. */
        fd = get_unused_fd_flags(flags);
        if (fd < 0)
                return fd;

        file = __anon_inode_getfile(name, fops, priv, flags, context_inode,
                                    make_inode);
        if (IS_ERR(file)) {
                /* Give the reserved slot back before reporting the error. */
                put_unused_fd(fd);
                return PTR_ERR(file);
        }

        fd_install(fd, file);
        return fd;
}

/**
 * anon_inode_getfd - creates a new file instance by hooking it up to
 *                    an anonymous inode and a dentry that describe
 *                    the "class" of the file
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 *
 * Creates a new file by hooking it on a single inode. This is
 * useful for files that do not need to have a full-fledged inode in
 * order to operate correctly.  All the files created with
 * anon_inode_getfd() will use the same singleton inode, reducing
 * memory use and avoiding code duplication for the file/inode/dentry
 * setup.  Returns a newly created file descriptor or an error code.
 */
int anon_inode_getfd(const char *name, const struct file_operations *fops,
                     void *priv, int flags)
{
        return __anon_inode_getfd(name, fops, priv, flags, NULL, false);
}
EXPORT_SYMBOL_GPL(anon_inode_getfd);

/**
 * anon_inode_create_getfd - Like anon_inode_getfd(), but creates a new
 * !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls
 * the inode_init_security_anon() LSM hook.
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 * @context_inode:
 *           [in]    the logical relationship with the new inode (optional)
 *
 * Create a new anonymous inode and file pair.  This can be done for two
 * reasons:
 *
 * - for the inode to have its own security context, so that LSMs can enforce
 *   policy on the inode's creation;
 *
 * - if the caller needs a unique inode, for example in order to customize
 *   the size returned by fstat()
 *
 * The LSM may use @context_inode in inode_init_security_anon(), but a
 * reference to it is not held.
 *
 * Returns a newly created file descriptor or an error code.
 */
int anon_inode_create_getfd(const char *name, const struct file_operations *fops,
                            void *priv, int flags,
                            const struct inode *context_inode)
{
        return __anon_inode_getfd(name, fops, priv, flags, context_inode, true);
}


/*
 * Boot-time setup: mount the anon_inode filesystem and allocate the inode
 * stored in anon_inode_inode.  Either failure panics, so the function only
 * ever returns 0.
 */
static int __init anon_inode_init(void)
{
        /* Step 1: the kernel-internal mount. */
        anon_inode_mnt = kern_mount(&anon_inode_fs_type);
        if (IS_ERR(anon_inode_mnt)) {
                panic("anon_inode_init() kernel mount failed (%ld)\n",
                      PTR_ERR(anon_inode_mnt));
        }

        /* Step 2: the inode, allocated on that mount's superblock. */
        anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
        if (IS_ERR(anon_inode_inode)) {
                panic("anon_inode_init() inode allocation failed (%ld)\n",
                      PTR_ERR(anon_inode_inode));
        }

        return 0;
}

fs_initcall(anon_inode_init);










































































































    2 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_NSPROXY_H
#define _LINUX_NSPROXY_H

#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/sched.h>

struct mnt_namespace;
struct uts_namespace;
struct ipc_namespace;
struct pid_namespace;
struct cgroup_namespace;
struct fs_struct;

/*
 * A structure to contain pointers to all per-process
 * namespaces - fs (mount), uts, network, sysvipc, etc.
 *
 * The pid namespace is an exception -- it's accessed using
 * task_active_pid_ns.  The pid namespace here is the
 * namespace that children will use.
 *
 * 'count' is the number of tasks holding a reference.
 * The count for each namespace, then, will be the number
 * of nsproxies pointing to it, not the number of tasks.
 *
 * The nsproxy is shared by tasks which share all namespaces.
 * As soon as a single namespace is cloned or unshared, the
 * nsproxy is copied.
 */
struct nsproxy {
        /* Reference count; dropped via put_nsproxy(), taken via get_nsproxy(). */
        refcount_t count;
        struct uts_namespace *uts_ns;
        struct ipc_namespace *ipc_ns;
        struct mnt_namespace *mnt_ns;
        /* Not the task's own pid namespace: the one its children will use. */
        struct pid_namespace *pid_ns_for_children;
        struct net              *net_ns;
        struct time_namespace *time_ns;
        /* NOTE(review): by analogy with pid_ns_for_children, presumably the
         * time namespace handed to children - confirm. */
        struct time_namespace *time_ns_for_children;
        struct cgroup_namespace *cgroup_ns;
};
/* Defined outside this header. */
extern struct nsproxy init_nsproxy;

/*
 * A structure to encompass all bits needed to install
 * a partial or complete new set of namespaces.
 *
 * If a new user namespace is requested cred will
 * point to a modifiable set of credentials. If a pointer
 * to a modifiable set is needed nsset_cred() must be
 * used and tested.
 */
struct nsset {
        /* CLONE_* bits; nsset_cred() tests CLONE_NEWUSER here. */
        unsigned flags;
        struct nsproxy *nsproxy;
        struct fs_struct *fs;
        /* const by default; only writable through nsset_cred(). */
        const struct cred *cred;
};

/*
 * Return the credentials of @set as a modifiable pointer, or NULL when
 * CLONE_NEWUSER is not set in set->flags.  Callers must test the result.
 */
static inline struct cred *nsset_cred(struct nsset *set)
{
        /* Without CLONE_NEWUSER the credentials must not be written to. */
        if (!(set->flags & CLONE_NEWUSER))
                return NULL;

        return (struct cred *)set->cred;
}

/*
 * the namespaces access rules are:
 *
 *  1. only current task is allowed to change tsk->nsproxy pointer or
 *     any pointer on the nsproxy itself.  Current must hold the task_lock
 *     when changing tsk->nsproxy.
 *
 *  2. when accessing (i.e. reading) current task's namespaces - no
 *     precautions should be taken - just dereference the pointers
 *
 *  3. the access to other task namespaces is performed like this
 *     task_lock(task);
 *     nsproxy = task->nsproxy;
 *     if (nsproxy != NULL) {
 *             / *
 *               * work with the namespaces here
 *               * e.g. get the reference on one of them
 *               * /
 *     } / *
 *         * NULL task->nsproxy means that this task is
 *         * almost dead (zombie)
 *         * /
 *     task_unlock(task);
 *
 */

/*
 * nsproxy management entry points.  Only the declarations live in this
 * header; the implementations are elsewhere.  free_nsproxy() is what
 * put_nsproxy() calls once the last reference is dropped.
 */
int copy_namespaces(unsigned long flags, struct task_struct *tsk);
void exit_task_namespaces(struct task_struct *tsk);
void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
int exec_task_namespaces(void);
void free_nsproxy(struct nsproxy *ns);
int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
        struct cred *, struct fs_struct *);
int __init nsproxy_cache_init(void);

/* Drop one reference on @ns; the final put frees it via free_nsproxy(). */
static inline void put_nsproxy(struct nsproxy *ns)
{
        /* Other holders remain: nothing more to do. */
        if (!refcount_dec_and_test(&ns->count))
                return;

        free_nsproxy(ns);
}

/* Take one additional reference on @ns; paired with put_nsproxy(). */
static inline void get_nsproxy(struct nsproxy *ns)
{
        refcount_inc(&ns->count);
}

#endif





































































































































































    3 
    3 



























    3 






































    3 

































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/unistd.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>

#include <asm/ucontext.h>
#include <asm/fpu/signal.h>
#include <asm/sighandling.h>

#include <asm/syscall.h>
#include <asm/sigframe.h>
#include <asm/signal.h>

/*
 * If regs->ss will cause an IRET fault, change it.  Otherwise leave it
 * alone.  Using this generally makes no sense unless
 * user_64bit_mode(regs) would return true.
 */
static void force_valid_ss(struct pt_regs *regs)
{
        u32 ar;
        /*
         * Look up the access-rights word for the selector in regs->ss.
         * When the lookup reports the selector as invalid, ar is zeroed so
         * that the comparison below cannot match.
         */
        asm volatile ("lar %[old_ss], %[ar]\n\t"
                      "jz 1f\n\t"                /* If invalid: */
                      "xorl %[ar], %[ar]\n\t"        /* set ar = 0 */
                      "1:"
                      : [ar] "=r" (ar)
                      : [old_ss] "rm" ((u16)regs->ss));

        /*
         * For a valid 64-bit user context, we need DPL 3, type
         * read-write data or read-write exp-down data, and S and P
         * set.  We can't use VERW because VERW doesn't check the
         * P bit.
         */
        ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK;
        /* Anything other than the two accepted encodings is replaced. */
        if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) &&
            ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN))
                regs->ss = __USER_DS;
}

/*
 * Reload @regs from the user-space sigcontext @usc.
 *
 * @regs:     register frame to overwrite
 * @usc:      user pointer to the saved sigcontext
 * @uc_flags: ucontext flags read by the caller; only UC_STRICT_RESTORE_SS
 *            is examined here
 *
 * Returns false if the sigcontext cannot be copied from user space,
 * otherwise whatever fpu__restore_sig() returns for the saved fpstate.
 */
static bool restore_sigcontext(struct pt_regs *regs,
                               struct sigcontext __user *usc,
                               unsigned long uc_flags)
{
        struct sigcontext sc;

        /* Always make any pending restarted system calls return -EINTR */
        current->restart_block.fn = do_no_restart_syscall;

        /* Only the fields that precede 'reserved1' are copied in. */
        if (copy_from_user(&sc, usc, offsetof(struct sigcontext, reserved1)))
                return false;

        regs->bx = sc.bx;
        regs->cx = sc.cx;
        regs->dx = sc.dx;
        regs->si = sc.si;
        regs->di = sc.di;
        regs->bp = sc.bp;
        regs->ax = sc.ax;
        regs->sp = sc.sp;
        regs->ip = sc.ip;
        regs->r8 = sc.r8;
        regs->r9 = sc.r9;
        regs->r10 = sc.r10;
        regs->r11 = sc.r11;
        regs->r12 = sc.r12;
        regs->r13 = sc.r13;
        regs->r14 = sc.r14;
        regs->r15 = sc.r15;

        /* Get CS/SS and force CPL3 */
        regs->cs = sc.cs | 0x03;
        regs->ss = sc.ss | 0x03;

        /* Only the FIX_EFLAGS bits are taken from the user-supplied flags. */
        regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS);
        /* disable syscall checks */
        regs->orig_ax = -1;

        /*
         * Fix up SS if needed for the benefit of old DOSEMU and
         * CRIU.
         */
        if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs)))
                force_valid_ss(regs);

        return fpu__restore_sig((void __user *)sc.fpstate, 0);
}

/*
 * Store the register state from @regs, the per-thread trap information,
 * the @fpstate pointer and the old signal @mask into the user sigcontext
 * @sc using unsafe_put_user().
 *
 * The unsafe_*() accessors are used, so the callers in this file wrap the
 * call in user_access_begin()/user_access_end().
 *
 * Returns 0 on success or -EFAULT if any store faults.
 */
static __always_inline int
__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
                     struct pt_regs *regs, unsigned long mask)
{
        unsafe_put_user(regs->di, &sc->di, Efault);
        unsafe_put_user(regs->si, &sc->si, Efault);
        unsafe_put_user(regs->bp, &sc->bp, Efault);
        unsafe_put_user(regs->sp, &sc->sp, Efault);
        unsafe_put_user(regs->bx, &sc->bx, Efault);
        unsafe_put_user(regs->dx, &sc->dx, Efault);
        unsafe_put_user(regs->cx, &sc->cx, Efault);
        unsafe_put_user(regs->ax, &sc->ax, Efault);
        unsafe_put_user(regs->r8, &sc->r8, Efault);
        unsafe_put_user(regs->r9, &sc->r9, Efault);
        unsafe_put_user(regs->r10, &sc->r10, Efault);
        unsafe_put_user(regs->r11, &sc->r11, Efault);
        unsafe_put_user(regs->r12, &sc->r12, Efault);
        unsafe_put_user(regs->r13, &sc->r13, Efault);
        unsafe_put_user(regs->r14, &sc->r14, Efault);
        unsafe_put_user(regs->r15, &sc->r15, Efault);

        unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault);
        unsafe_put_user(current->thread.error_code, &sc->err, Efault);
        unsafe_put_user(regs->ip, &sc->ip, Efault);
        unsafe_put_user(regs->flags, &sc->flags, Efault);
        unsafe_put_user(regs->cs, &sc->cs, Efault);
        /* gs and fs are always written as 0 here. */
        unsafe_put_user(0, &sc->gs, Efault);
        unsafe_put_user(0, &sc->fs, Efault);
        unsafe_put_user(regs->ss, &sc->ss, Efault);

        unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault);

        /* non-iBCS2 extensions.. */
        unsafe_put_user(mask, &sc->oldmask, Efault);
        unsafe_put_user(current->thread.cr2, &sc->cr2, Efault);
        return 0;
Efault:
        return -EFAULT;
}

/*
 * Write the sigcontext for @regs via __unsafe_setup_sigcontext() and jump
 * to @label if that fails.  @set points to the signal mask; its first word
 * is passed on as the mask argument.
 *
 * There is deliberately no trailing semicolon after while (0): the caller
 * supplies it, so the macro acts as exactly one statement and is safe as
 * the body of an unbraced if/else.  @set is parenthesized so that any
 * pointer expression may be passed.
 */
#define unsafe_put_sigcontext(sc, fp, regs, set, label)                        \
do {                                                                        \
        if (__unsafe_setup_sigcontext(sc, fp, regs, (set)->sig[0]))        \
                goto label;                                                \
} while (0)

/*
 * Store the first 64 bits of the signal mask @set into
 * @frame->uc.uc_sigmask with unsafe_put_user(), jumping to @label on a
 * fault.  Must be used inside a user_access_begin() section.
 */
#define unsafe_put_sigmask(set, frame, label) \
        unsafe_put_user(*(__u64 *)(set), \
                        (__u64 __user *)&(frame)->uc.uc_sigmask, \
                        label)

/*
 * Compute the uc_flags value stored in a new signal frame:
 * UC_SIGCONTEXT_SS is always set, UC_FP_XSTATE is added when the CPU has
 * XSAVE, and UC_STRICT_RESTORE_SS is added when @regs is in 64-bit user
 * mode.
 */
static unsigned long frame_uc_flags(struct pt_regs *regs)
{
        unsigned long uc_flags = UC_SIGCONTEXT_SS;

        if (boot_cpu_has(X86_FEATURE_XSAVE))
                uc_flags |= UC_FP_XSTATE;

        if (likely(user_64bit_mode(regs)))
                uc_flags |= UC_STRICT_RESTORE_SS;

        return uc_flags;
}

/*
 * Build the 64-bit rt signal frame for @ksig on the user stack and point
 * @regs at the handler.
 *
 * Returns 0 on success, -EFAULT if the handler has no SA_RESTORER, if any
 * write to the user frame faults, or if the shadow-stack setup fails.
 */
int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
{
        sigset_t *set = sigmask_to_save();
        struct rt_sigframe __user *frame;
        void __user *fp = NULL;
        unsigned long uc_flags;

        /* x86-64 should always use SA_RESTORER. */
        if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
                return -EFAULT;

        frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp);
        uc_flags = frame_uc_flags(regs);

        /* Everything up to user_access_end() uses the unsafe_*() accessors. */
        if (!user_access_begin(frame, sizeof(*frame)))
                return -EFAULT;

        /* Create the ucontext.  */
        unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
        unsafe_put_user(0, &frame->uc.uc_link, Efault);
        unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);

        /* Set up to return from userspace.  If provided, use a stub
           already in userspace.  */
        unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault);
        unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
        unsafe_put_sigmask(set, frame, Efault);
        user_access_end();

        /* siginfo is copied with the regular accessor, outside the section. */
        if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
                if (copy_siginfo_to_user(&frame->info, &ksig->info))
                        return -EFAULT;
        }

        if (setup_signal_shadow_stack(ksig))
                return -EFAULT;

        /* Set up registers for signal handler */
        regs->di = ksig->sig;
        /* In case the signal handler was declared without prototypes */
        regs->ax = 0;

        /* This also works for non SA_SIGINFO handlers because they expect the
           next argument after the signal number on the stack. */
        regs->si = (unsigned long)&frame->info;
        regs->dx = (unsigned long)&frame->uc;
        regs->ip = (unsigned long) ksig->ka.sa.sa_handler;

        regs->sp = (unsigned long)frame;

        /*
         * Set up the CS and SS registers to run signal handlers in
         * 64-bit mode, even if the handler happens to be interrupting
         * 32-bit or 16-bit code.
         *
         * SS is subtle.  In 64-bit mode, we don't need any particular
         * SS descriptor, but we do need SS to be valid.  It's possible
         * that the old SS is entirely bogus -- this can happen if the
         * signal we're trying to deliver is #GP or #SS caused by a bad
         * SS value.  We also have a compatibility issue here: DOSEMU
         * relies on the contents of the SS register indicating the
         * SS value at the time of the signal, even though that code in
         * DOSEMU predates sigreturn's ability to restore SS.  (DOSEMU
         * avoids relying on sigreturn to restore SS; instead it uses
         * a trampoline.)  So we do our best: if the old SS was valid,
         * we keep it.  Otherwise we replace it.
         */
        regs->cs = __USER_CS;

        if (unlikely(regs->ss != __USER_DS))
                force_valid_ss(regs);

        return 0;

Efault:
        /* A faulting unsafe_*() store lands here with user access still open. */
        user_access_end();
        return -EFAULT;
}

/*
 * Do a signal return; undo the signal stack.
 */
/*
 * rt_sigreturn: reload the blocked-signal mask, register state, shadow
 * stack and alternate stack settings from the rt_sigframe on the user
 * stack.  Returns the restored regs->ax; any failure is reported through
 * signal_fault() and 0 is returned.
 */
SYSCALL_DEFINE0(rt_sigreturn)
{
        struct pt_regs *regs = current_pt_regs();
        struct rt_sigframe __user *frame;
        sigset_t set;
        unsigned long uc_flags;

        /*
         * The frame starts sizeof(long) below the current user sp.
         * NOTE(review): presumably because returning from the handler
         * already popped 'pretcode' - confirm.
         */
        frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
        if (!access_ok(frame, sizeof(*frame)))
                goto badframe;
        if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask))
                goto badframe;
        if (__get_user(uc_flags, &frame->uc.uc_flags))
                goto badframe;

        set_current_blocked(&set);

        if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
                goto badframe;

        if (restore_signal_shadow_stack())
                goto badframe;

        if (restore_altstack(&frame->uc.uc_stack))
                goto badframe;

        return regs->ax;

badframe:
        signal_fault(regs, frame, "rt_sigreturn");
        return 0;
}

#ifdef CONFIG_X86_X32_ABI
/*
 * Convert @from to the compat siginfo layout and copy it to user space at
 * @to.  For SIGCHLD the utime/stime values are stored in the x32-specific
 * _sigchld_x32 member.  Returns 0 on success or -EFAULT if the copy faults.
 */
static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to,
                const struct kernel_siginfo *from)
{
        struct compat_siginfo ext;

        copy_siginfo_to_external32(&ext, from);

        /* Overwrite the child times using the x32 layout. */
        if (from->si_signo == SIGCHLD) {
                ext._sifields._sigchld_x32._utime = from->si_utime;
                ext._sifields._sigchld_x32._stime = from->si_stime;
        }

        return copy_to_user(to, &ext, sizeof(ext)) ? -EFAULT : 0;
}

/*
 * Copy @from to the compat siginfo at @to, using the x32 variant when the
 * current syscall is an x32 one and the generic compat copy otherwise.
 */
int copy_siginfo_to_user32(struct compat_siginfo __user *to,
                           const struct kernel_siginfo *from)
{
        return in_x32_syscall() ? x32_copy_siginfo_to_user(to, from)
                                : __copy_siginfo_to_user32(to, from);
}

/*
 * Build the x32 rt signal frame for @ksig on the user stack and point
 * @regs at the handler.
 *
 * Returns 0 on success, -EFAULT if the handler has no SA_RESTORER, if the
 * shadow-stack setup fails, or if any write to the user frame faults.
 */
int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
{
        compat_sigset_t *set = (compat_sigset_t *) sigmask_to_save();
        struct rt_sigframe_x32 __user *frame;
        unsigned long uc_flags;
        void __user *restorer;
        void __user *fp = NULL;

        if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
                return -EFAULT;

        frame = get_sigframe(ksig, regs, sizeof(*frame), &fp);

        uc_flags = frame_uc_flags(regs);

        /* Here the shadow stack is set up before the frame is written. */
        if (setup_signal_shadow_stack(ksig))
                return -EFAULT;

        /* Everything up to user_access_end() uses the unsafe_*() accessors. */
        if (!user_access_begin(frame, sizeof(*frame)))
                return -EFAULT;

        /* Create the ucontext.  */
        unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
        unsafe_put_user(0, &frame->uc.uc_link, Efault);
        unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
        unsafe_put_user(0, &frame->uc.uc__pad0, Efault);
        restorer = ksig->ka.sa.sa_restorer;
        unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault);
        unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
        unsafe_put_sigmask(set, frame, Efault);
        user_access_end();

        /* siginfo is copied with the regular accessor, outside the section. */
        if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
                if (x32_copy_siginfo_to_user(&frame->info, &ksig->info))
                        return -EFAULT;
        }

        /* Set up registers for signal handler */
        regs->sp = (unsigned long) frame;
        regs->ip = (unsigned long) ksig->ka.sa.sa_handler;

        /* We use the x32 calling convention here... */
        regs->di = ksig->sig;
        regs->si = (unsigned long) &frame->info;
        regs->dx = (unsigned long) &frame->uc;

        loadsegment(ds, __USER_DS);
        loadsegment(es, __USER_DS);

        /* Unlike the x64 path, SS is unconditionally set to __USER_DS. */
        regs->cs = __USER_CS;
        regs->ss = __USER_DS;

        return 0;

Efault:
        /* A faulting unsafe_*() store lands here with user access still open. */
        user_access_end();
        return -EFAULT;
}

/*
 * x32 rt_sigreturn: reload the blocked-signal mask, register state, shadow
 * stack and alternate stack settings from the rt_sigframe_x32 on the user
 * stack.  Returns the restored regs->ax; any failure is reported through
 * signal_fault() and 0 is returned.
 */
COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
{
        struct pt_regs *regs = current_pt_regs();
        struct rt_sigframe_x32 __user *frame;
        sigset_t set;
        unsigned long uc_flags;

        /* The frame starts 8 bytes below the current user sp. */
        frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);

        if (!access_ok(frame, sizeof(*frame)))
                goto badframe;
        if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask))
                goto badframe;
        if (__get_user(uc_flags, &frame->uc.uc_flags))
                goto badframe;

        set_current_blocked(&set);

        if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
                goto badframe;

        if (restore_signal_shadow_stack())
                goto badframe;

        if (compat_restore_altstack(&frame->uc.uc_stack))
                goto badframe;

        return regs->ax;

badframe:
        signal_fault(regs, frame, "x32 rt_sigreturn");
        return 0;
}
#endif /* CONFIG_X86_X32_ABI */

#ifdef CONFIG_COMPAT
/*
 * Tag a new sigaction with the ABI of the syscall that installs it:
 * SA_IA32_ABI for an ia32 syscall, SA_X32_ABI for an x32 syscall.
 * Nothing happens when @act is NULL; @oact is not used.
 */
void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
{
        if (act) {
                if (in_ia32_syscall())
                        act->sa.sa_flags |= SA_IA32_ABI;
                if (in_x32_syscall())
                        act->sa.sa_flags |= SA_X32_ABI;
        }
}
#endif /* CONFIG_COMPAT */

/*
* If adding a new si_code, there is probably new data in
* the siginfo.  Make sure folks bumping the si_code
* limits also have to look at this code.  Make sure any
* new fields are handled in copy_siginfo_to_user32()!
*/
static_assert(NSIGILL  == 11);
static_assert(NSIGFPE  == 15);
static_assert(NSIGSEGV == 10);
static_assert(NSIGBUS  == 5);
static_assert(NSIGTRAP == 6);
static_assert(NSIGCHLD == 6);
static_assert(NSIGSYS  == 2);

/* This is part of the ABI and can never change in size: */
static_assert(sizeof(siginfo_t) == 128);

/* This is a part of the ABI and can never change in alignment */
static_assert(__alignof__(siginfo_t) == 8);

/*
* The offsets of all the (unioned) si_fields are fixed
* in the ABI, of course.  Make sure none of them ever
* move and are always at the beginning:
*/
static_assert(offsetof(siginfo_t, si_signo) == 0);
static_assert(offsetof(siginfo_t, si_errno) == 4);
static_assert(offsetof(siginfo_t, si_code)  == 8);

/*
* Ensure that the size of each si_field never changes.
* If it does, it is a sign that the
* copy_siginfo_to_user32() code needs to be updated
* along with the size in the CHECK_SI_SIZE().
*
* We repeat this check for both the generic and compat
* siginfos.
*
* Note: it is OK for these to grow as long as the whole
* structure stays within the padding size (checked
* above).
*/

/* Assert that union member @name starts at the very beginning of _sifields. */
#define CHECK_SI_OFFSET(name)                                                \
        static_assert(offsetof(siginfo_t, _sifields) ==                 \
                      offsetof(siginfo_t, _sifields.name))
/* Assert that union member @name is exactly @size bytes. */
#define CHECK_SI_SIZE(name, size)                                        \
        static_assert(sizeof_field(siginfo_t, _sifields.name) == size)

/* _kill: si_pid, si_uid */
CHECK_SI_OFFSET(_kill);
CHECK_SI_SIZE  (_kill, 2*sizeof(int));
static_assert(offsetof(siginfo_t, si_pid) == 0x10);
static_assert(offsetof(siginfo_t, si_uid) == 0x14);

/* _timer: si_tid, si_overrun, si_value */
CHECK_SI_OFFSET(_timer);
CHECK_SI_SIZE  (_timer, 6*sizeof(int));
static_assert(offsetof(siginfo_t, si_tid)     == 0x10);
static_assert(offsetof(siginfo_t, si_overrun) == 0x14);
static_assert(offsetof(siginfo_t, si_value)   == 0x18);

/* _rt: si_pid, si_uid, si_value */
CHECK_SI_OFFSET(_rt);
CHECK_SI_SIZE  (_rt, 4*sizeof(int));
static_assert(offsetof(siginfo_t, si_pid)   == 0x10);
static_assert(offsetof(siginfo_t, si_uid)   == 0x14);
static_assert(offsetof(siginfo_t, si_value) == 0x18);

/* _sigchld: si_pid, si_uid, si_status, si_utime, si_stime */
CHECK_SI_OFFSET(_sigchld);
CHECK_SI_SIZE  (_sigchld, 8*sizeof(int));
static_assert(offsetof(siginfo_t, si_pid)    == 0x10);
static_assert(offsetof(siginfo_t, si_uid)    == 0x14);
static_assert(offsetof(siginfo_t, si_status) == 0x18);
static_assert(offsetof(siginfo_t, si_utime)  == 0x20);
static_assert(offsetof(siginfo_t, si_stime)  == 0x28);

#ifdef CONFIG_X86_X32_ABI
/* no _sigchld_x32 in the generic siginfo_t */
static_assert(sizeof_field(compat_siginfo_t, _sifields._sigchld_x32) ==
              7*sizeof(int));
static_assert(offsetof(compat_siginfo_t, _sifields) ==
              offsetof(compat_siginfo_t, _sifields._sigchld_x32));
static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime)  == 0x18);
static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime)  == 0x20);
#endif

/* _sigfault: si_addr followed by several overlapping per-cause fields */
CHECK_SI_OFFSET(_sigfault);
CHECK_SI_SIZE  (_sigfault, 8*sizeof(int));
static_assert(offsetof(siginfo_t, si_addr)        == 0x10);

static_assert(offsetof(siginfo_t, si_trapno)        == 0x18);

static_assert(offsetof(siginfo_t, si_addr_lsb)        == 0x18);

static_assert(offsetof(siginfo_t, si_lower)        == 0x20);
static_assert(offsetof(siginfo_t, si_upper)        == 0x28);

static_assert(offsetof(siginfo_t, si_pkey)        == 0x20);

static_assert(offsetof(siginfo_t, si_perf_data)         == 0x18);
static_assert(offsetof(siginfo_t, si_perf_type)         == 0x20);
static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24);

/* _sigpoll: si_band, si_fd */
CHECK_SI_OFFSET(_sigpoll);
CHECK_SI_SIZE  (_sigpoll, 4*sizeof(int));
static_assert(offsetof(siginfo_t, si_band) == 0x10);
static_assert(offsetof(siginfo_t, si_fd)   == 0x18);

/* _sigsys: si_call_addr, si_syscall, si_arch */
CHECK_SI_OFFSET(_sigsys);
CHECK_SI_SIZE  (_sigsys, 4*sizeof(int));
static_assert(offsetof(siginfo_t, si_call_addr) == 0x10);
static_assert(offsetof(siginfo_t, si_syscall)   == 0x18);
static_assert(offsetof(siginfo_t, si_arch)      == 0x1C);

/* any new si_fields should be added here */







































































































































































    1 

    1 









































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/kernfs/mount.c - kernfs mount implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 */

#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/init.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/seq_file.h>
#include <linux/exportfs.h>
#include <linux/uuid.h>
#include <linux/statfs.h>

#include "kernfs-internal.h"

/* Slab cache sized for struct kernfs_node; created in kernfs_init(). */
struct kmem_cache *kernfs_node_cache __ro_after_init;
/* Slab cache sized for struct kernfs_iattrs; created in kernfs_init(). */
struct kmem_cache *kernfs_iattrs_cache __ro_after_init;
/* Global lock table (holds open_file_mutex[]); allocated in kernfs_lock_init(). */
struct kernfs_global_locks *kernfs_locks __ro_after_init;

/*
 * ->show_options() super operation.  Forwards to the kernfs root's
 * syscall_ops->show_options() when the root provides one; otherwise emits
 * nothing and returns 0.
 */
static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
{
        struct kernfs_root *kf_root = kernfs_root(kernfs_dentry_node(dentry));
        struct kernfs_syscall_ops *ops = kf_root->syscall_ops;

        if (!ops || !ops->show_options)
                return 0;

        return ops->show_options(sf, kf_root);
}

/*
 * ->show_path() super operation.  Forwards to the kernfs root's
 * syscall_ops->show_path() when the root provides one; otherwise prints
 * the dentry path via seq_dentry(), escaping space, tab, newline and
 * backslash, and returns 0.
 */
static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry)
{
        struct kernfs_node *kn = kernfs_dentry_node(dentry);
        struct kernfs_root *kf_root = kernfs_root(kn);
        struct kernfs_syscall_ops *ops = kf_root->syscall_ops;

        if (!ops || !ops->show_path) {
                seq_dentry(sf, dentry, " \t\n\\");
                return 0;
        }

        return ops->show_path(sf, kn, kf_root);
}

static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        simple_statfs(dentry, buf);
        buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
        return 0;
}

/*
 * Superblock operations installed on kernfs super_blocks by
 * kernfs_fill_super().  The address of this table is also what
 * kernfs_root_from_sb() and kernfs_node_dentry() compare sb->s_op against
 * to recognise a kernfs super_block.
 */
const struct super_operations kernfs_sops = {
        .statfs                = kernfs_statfs,
        .drop_inode        = generic_delete_inode,
        .evict_inode        = kernfs_evict_inode,

        /* Both forward to root->syscall_ops when the root supplies a hook. */
        .show_options        = kernfs_sop_show_options,
        .show_path        = kernfs_sop_show_path,
};

/*
 * ->encode_fh() export operation: store the 64-bit kernfs node ID of
 * @inode into @fh.
 *
 * The handle always occupies two 32-bit words, and *@max_len is set to 2
 * on every path.  Returns FILEID_INVALID when the caller's buffer is
 * shorter than that, FILEID_KERNFS otherwise.  @parent is not used.
 */
static int kernfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
                            struct inode *parent)
{
        struct kernfs_node *kn = inode->i_private;
        int avail = *max_len;

        /* Report the required length whether or not the buffer is big enough. */
        *max_len = 2;
        if (avail < 2)
                return FILEID_INVALID;

        *(u64 *)fh = kn->id;
        return FILEID_KERNFS;
}

/*
 * Common helper behind ->fh_to_dentry() and ->fh_to_parent(): decode a file
 * handle into a kernfs node ID, look the node up under this superblock's
 * root and return a dentry for it, or for its parent when @get_parent is
 * true.
 *
 * Returns NULL when the handle is shorter than two words or has an
 * unsupported @fh_type, ERR_PTR(-ESTALE) when the node (or the requested
 * parent) cannot be found, and otherwise the result of d_obtain_alias().
 */
static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb,
                                            struct fid *fid, int fh_len,
                                            int fh_type, bool get_parent)
{
        struct kernfs_super_info *info = kernfs_info(sb);
        struct kernfs_node *kn;
        struct inode *inode;
        u64 id;

        if (fh_len < 2)
                return NULL;

        switch (fh_type) {
        case FILEID_KERNFS:
                /* Native handle: the two words are the raw 64-bit ID. */
                id = *(u64 *)fid;
                break;
        case FILEID_INO32_GEN:
        case FILEID_INO32_GEN_PARENT:
                /*
                 * blk_log_action() exposes "LOW32,HIGH32" pair without
                 * type and userland can call us with generic fid
                 * constructed from them.  Combine it back to ID.  See
                 * blk_log_action().
                 */
                id = ((u64)fid->i32.gen << 32) | fid->i32.ino;
                break;
        default:
                return NULL;
        }

        kn = kernfs_find_and_get_node_by_id(info->root, id);
        if (!kn)
                return ERR_PTR(-ESTALE);

        if (get_parent) {
                struct kernfs_node *parent;

                /*
                 * Swap the child for its parent.  NOTE(review): presumably
                 * kernfs_get_parent() returns its own reference, which the
                 * kernfs_put() further down releases - confirm.
                 */
                parent = kernfs_get_parent(kn);
                kernfs_put(kn);
                kn = parent;
                if (!kn)
                        return ERR_PTR(-ESTALE);
        }

        /* The node reference is dropped once the inode has been obtained. */
        inode = kernfs_get_inode(sb, kn);
        kernfs_put(kn);
        return d_obtain_alias(inode);
}

static struct dentry *kernfs_fh_to_dentry(struct super_block *sb,
                                          struct fid *fid, int fh_len,
                                          int fh_type)
{
        return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, false);
}

static struct dentry *kernfs_fh_to_parent(struct super_block *sb,
                                          struct fid *fid, int fh_len,
                                          int fh_type)
{
        return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, true);
}

static struct dentry *kernfs_get_parent_dentry(struct dentry *child)
{
        struct kernfs_node *kn = kernfs_dentry_node(child);

        return d_obtain_alias(kernfs_get_inode(child->d_sb, kn->parent));
}

/*
 * File-handle export operations.  kernfs_fill_super() installs them only
 * when the root has KERNFS_ROOT_SUPPORT_EXPORTOP set.
 */
static const struct export_operations kernfs_export_ops = {
        .encode_fh        = kernfs_encode_fh,
        .fh_to_dentry        = kernfs_fh_to_dentry,
        .fh_to_parent        = kernfs_fh_to_parent,
        .get_parent        = kernfs_get_parent_dentry,
};

/**
 * kernfs_root_from_sb - determine kernfs_root associated with a super_block
 * @sb: the super_block in question
 *
 * A super_block is recognised as a kernfs one by its ->s_op pointing at
 * kernfs_sops.
 *
 * Return: the kernfs_root associated with @sb, or %NULL if @sb is not a
 * kernfs super_block.
 */
struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
{
        if (sb->s_op != &kernfs_sops)
                return NULL;

        return kernfs_info(sb)->root;
}

/*
 * find the next ancestor in the path down to @child, where @parent was the
 * ancestor whose descendant we want to find.
 *
 * Say the path is /a/b/c/d.  @child is d, @parent is %NULL.  We return the root
 * node.  If @parent is b, then we return the node for c.
 * Passing in d as @parent is not ok.
 *
 * Returns the ancestor of @child (possibly @child itself) whose ->parent is
 * @parent, or %NULL if @child == @parent or if @parent is not found while
 * walking up from @child.
 */
static struct kernfs_node *find_next_ancestor(struct kernfs_node *child,
                                              struct kernfs_node *parent)
{
        if (child == parent) {
                /* Log messages must be newline-terminated. */
                pr_crit_once("BUG in find_next_ancestor: called with parent == child\n");
                return NULL;
        }

        while (child->parent != parent) {
                /* Reached the top without meeting @parent: not an ancestor. */
                if (!child->parent)
                        return NULL;
                child = child->parent;
        }

        return child;
}

/**
 * kernfs_node_dentry - get a dentry for the given kernfs_node
 * @kn: kernfs_node for which a dentry is needed
 * @sb: the kernfs super_block
 *
 * Walks from the superblock's root dentry down to @kn one path component
 * at a time, looking each component up by name.  @sb must be a kernfs
 * super_block (enforced with BUG_ON()).
 *
 * Return: the dentry pointer with a reference held, ERR_PTR(-EINVAL) if
 * the ancestor chain of @kn cannot be resolved, or the error returned by
 * the lookup of an intermediate component.
 */
struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
                                  struct super_block *sb)
{
        struct dentry *dentry;
        struct kernfs_node *knparent;

        BUG_ON(sb->s_op != &kernfs_sops);

        /* Start at the root dentry, holding a reference on it. */
        dentry = dget(sb->s_root);

        /* Check if this is the root kernfs_node */
        if (!kn->parent)
                return dentry;

        /* With a NULL parent this yields the topmost ancestor of @kn. */
        knparent = find_next_ancestor(kn, NULL);
        if (WARN_ON(!knparent)) {
                dput(dentry);
                return ERR_PTR(-EINVAL);
        }

        do {
                struct dentry *dtmp;
                struct kernfs_node *kntmp;

                /* @dentry now corresponds to @kn itself: done. */
                if (kn == knparent)
                        return dentry;
                /* Next node on the path from knparent down to kn. */
                kntmp = find_next_ancestor(kn, knparent);
                if (WARN_ON(!kntmp)) {
                        dput(dentry);
                        return ERR_PTR(-EINVAL);
                }
                dtmp = lookup_positive_unlocked(kntmp->name, dentry,
                                               strlen(kntmp->name));
                /* The parent dentry reference is dropped whether or not the lookup succeeded. */
                dput(dentry);
                if (IS_ERR(dtmp))
                        return dtmp;
                knparent = kntmp;
                dentry = dtmp;
        } while (true);
}

/*
 * Initialise a freshly created kernfs super_block: set its basic
 * parameters and operation tables, then create the root inode and root
 * dentry.
 *
 * Returns 0 on success or -ENOMEM if the root inode or root dentry cannot
 * be obtained.
 */
static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc)
{
        struct kernfs_super_info *info = kernfs_info(sb);
        struct kernfs_root *kf_root = kfc->root;
        struct inode *inode;
        struct dentry *root;

        info->sb = sb;
        /* Userspace would break if executables or devices appear on sysfs */
        sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
        sb->s_blocksize = PAGE_SIZE;
        sb->s_blocksize_bits = PAGE_SHIFT;
        sb->s_magic = kfc->magic;
        sb->s_op = &kernfs_sops;
        sb->s_xattr = kernfs_xattr_handlers;
        /* File-handle export is opt-in per root. */
        if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP)
                sb->s_export_op = &kernfs_export_ops;
        sb->s_time_gran = 1;

        /* sysfs dentries and inodes don't require IO to create */
        sb->s_shrink->seeks = 0;

        /* get root inode, initialize and unlock it */
        down_read(&kf_root->kernfs_rwsem);
        inode = kernfs_get_inode(sb, info->root->kn);
        up_read(&kf_root->kernfs_rwsem);
        if (!inode) {
                pr_debug("kernfs: could not get root inode\n");
                return -ENOMEM;
        }

        /* instantiate and link root dentry */
        root = d_make_root(inode);
        if (!root) {
                pr_debug("%s: could not get root dentry!\n", __func__);
                return -ENOMEM;
        }
        sb->s_root = root;
        sb->s_d_op = &kernfs_dops;
        return 0;
}

static int kernfs_test_super(struct super_block *sb, struct fs_context *fc)
{
        struct kernfs_super_info *sb_info = kernfs_info(sb);
        struct kernfs_super_info *info = fc->s_fs_info;

        return sb_info->root == info->root && sb_info->ns == info->ns;
}

/*
 * sget_fc() set callback: clear the namespace tag held in the kernfs
 * fs_context, then perform the generic anonymous super_block setup.
 */
static int kernfs_set_super(struct super_block *sb, struct fs_context *fc)
{
        struct kernfs_fs_context *ctx = fc->fs_private;

        ctx->ns_tag = NULL;

        return set_anon_super_fc(sb, fc);
}

/**
 * kernfs_super_ns - determine the namespace tag of a kernfs super_block
 * @sb: super_block of interest
 *
 * Return: the namespace tag associated with kernfs super_block @sb.
 */
const void *kernfs_super_ns(struct super_block *sb)
{
        struct kernfs_super_info *info = kernfs_info(sb);

        return info->ns;
}

/**
 * kernfs_get_tree - kernfs filesystem access/retrieval helper
 * @fc: The filesystem context.
 *
 * This is to be called from each kernfs user's fs_context->ops->get_tree()
 * implementation, which should set the specified ->@fs_type and ->@flags, and
 * specify the hierarchy and namespace tag to mount via ->@root and ->@ns,
 * respectively.
 *
 * A kernfs_super_info describing the requested root/namespace is allocated
 * and handed to sget_fc() through @fc->s_fs_info.  If the super_block
 * that comes back has no root dentry yet, it is filled in, given a freshly
 * generated UUID and added to the root's list of super_blocks.
 *
 * Return: %0 on success, -errno on failure.
 */
int kernfs_get_tree(struct fs_context *fc)
{
        struct kernfs_fs_context *kfc = fc->fs_private;
        struct kernfs_super_info *new_info;
        struct kernfs_super_info *sb_info;
        struct kernfs_root *kf_root;
        struct super_block *sb;
        uuid_t uuid;
        int error;

        new_info = kzalloc(sizeof(*new_info), GFP_KERNEL);
        if (!new_info)
                return -ENOMEM;

        new_info->root = kfc->root;
        new_info->ns = kfc->ns_tag;
        INIT_LIST_HEAD(&new_info->node);

        fc->s_fs_info = new_info;
        sb = sget_fc(fc, kernfs_test_super, kernfs_set_super);
        if (IS_ERR(sb))
                return PTR_ERR(sb);

        /* A root dentry is already present: nothing left to set up. */
        if (sb->s_root)
                goto out;

        sb_info = kernfs_info(sb);
        kf_root = kfc->root;

        kfc->new_sb_created = true;

        error = kernfs_fill_super(sb, kfc);
        if (error) {
                deactivate_locked_super(sb);
                return error;
        }
        sb->s_flags |= SB_ACTIVE;

        uuid_gen(&uuid);
        super_set_uuid(sb, uuid.b, sizeof(uuid));

        /* Publish the new super_block on its root's list. */
        down_write(&kf_root->kernfs_supers_rwsem);
        list_add(&sb_info->node, &sb_info->root->supers);
        up_write(&kf_root->kernfs_supers_rwsem);

out:
        fc->root = dget(sb->s_root);
        return 0;
}

/*
 * Release whatever kernfs_super_info is still attached to @fc and clear
 * the pointer.
 */
void kernfs_free_fs_context(struct fs_context *fc)
{
        struct kernfs_super_info *info = fc->s_fs_info;

        /* Note that we don't deal with kfc->ns_tag here. */
        fc->s_fs_info = NULL;
        kfree(info);
}

/**
 * kernfs_kill_sb - kill_sb for kernfs
 * @sb: super_block being killed
 *
 * This can be used directly for file_system_type->kill_sb().  If a kernfs
 * user needs extra cleanup, it can implement its own kill_sb() and call
 * this function at the end.
 */
void kernfs_kill_sb(struct super_block *sb)
{
        struct kernfs_super_info *sb_info = kernfs_info(sb);
        struct kernfs_root *kf_root = sb_info->root;

        /* Unlink from the root's list of super_blocks under the write lock. */
        down_write(&kf_root->kernfs_supers_rwsem);
        list_del(&sb_info->node);
        up_write(&kf_root->kernfs_supers_rwsem);

        /*
         * Remove the superblock from fs_supers/s_instances
         * so we can't find it, before freeing kernfs_super_info.
         */
        kill_anon_super(sb);
        kfree(sb_info);
}

/* Initialise every open_file_mutex slot of the global kernfs lock table. */
static void __init kernfs_mutex_init(void)
{
        int i = 0;

        while (i < NR_KERNFS_LOCKS) {
                mutex_init(&kernfs_locks->open_file_mutex[i]);
                i++;
        }
}

/*
 * Allocate the global kernfs lock table and initialise its mutexes.
 * An allocation failure is only reported with WARN_ON(); initialisation
 * proceeds regardless.
 */
static void __init kernfs_lock_init(void)
{
        kernfs_locks = kmalloc(sizeof(*kernfs_locks), GFP_KERNEL);
        WARN_ON(!kernfs_locks);

        kernfs_mutex_init();
}

/*
 * One-time kernfs initialisation: create the slab caches for kernfs nodes
 * and inode attributes (both with SLAB_PANIC), then set up the global
 * lock table.
 */
void __init kernfs_init(void)
{
        /* Slab cache for struct kernfs_node objects */
        kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
                                              sizeof(struct kernfs_node),
                                              0, SLAB_PANIC, NULL);

        /* Slab cache for struct kernfs_iattrs objects */
        kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache",
                                                sizeof(struct kernfs_iattrs),
                                                0, SLAB_PANIC, NULL);

        kernfs_lock_init();
}


































































   23 

















    2 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Prevent the compiler from merging or refetching reads or writes. The
 * compiler is also forbidden from reordering successive instances of
 * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some
 * particular ordering. One way to make the compiler aware of ordering is to
 * put the two invocations of READ_ONCE or WRITE_ONCE in different C
 * statements.
 *
 * These two macros will also work on aggregate data types like structs or
 * unions.
 *
 * Their two major use cases are: (1) Mediating communication between
 * process-level code and irq/NMI handlers, all running on the same CPU,
 * and (2) Ensuring that the compiler does not fold, spindle, or otherwise
 * mutilate accesses that either do not require ordering or that interact
 * with an explicit memory barrier or atomic instruction that provides the
 * required ordering.
 */
#ifndef __ASM_GENERIC_RWONCE_H
#define __ASM_GENERIC_RWONCE_H

#ifndef __ASSEMBLY__

#include <linux/compiler_types.h>
#include <linux/kasan-checks.h>
#include <linux/kcsan-checks.h>

/*
 * Yes, this permits 64-bit accesses on 32-bit architectures. These will
 * actually be atomic in some cases (namely Armv7 + LPAE), but for others we
 * rely on the access being split into 2x32-bit accesses for a 32-bit quantity
 * (e.g. a virtual address) and a strong prevailing wind.
 *
 * The build fails unless @t is a native word or has exactly the size of
 * long long.
 */
#define compiletime_assert_rwonce_type(t)                                        \
        compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),        \
                "Unsupported access size for {READ,WRITE}_ONCE().")

/*
 * Use __READ_ONCE() instead of READ_ONCE() if you do not require any
 * atomicity. Note that this may result in tears!
 *
 * Reads @x through a const volatile pointer to its unqualified scalar type.
 * No access-size check is performed here.  A definition made before this
 * point takes precedence (see the #ifndef guard).
 */
#ifndef __READ_ONCE
#define __READ_ONCE(x)        (*(const volatile __unqual_scalar_typeof(x) *)&(x))
#endif

/*
 * Checked read: assert at compile time that the type of @x has a supported
 * access size, then evaluate to the value loaded by __READ_ONCE(x).
 */
#define READ_ONCE(x)                                                        \
({                                                                        \
        compiletime_assert_rwonce_type(x);                                \
        __READ_ONCE(x);                                                        \
})

/*
 * Store @val into @x through a volatile pointer to typeof(x).  No
 * access-size check is performed here; see WRITE_ONCE() for that.
 */
#define __WRITE_ONCE(x, val)                                                \
do {                                                                        \
        *(volatile typeof(x) *)&(x) = (val);                                \
} while (0)

/*
 * Checked write: assert at compile time that the type of @x has a supported
 * access size, then store @val via __WRITE_ONCE().
 */
#define WRITE_ONCE(x, val)                                                \
do {                                                                        \
        compiletime_assert_rwonce_type(x);                                \
        __WRITE_ONCE(x, val);                                                \
} while (0)

/*
 * Load one unsigned long from @addr with __READ_ONCE().  Marked
 * __no_sanitize_or_inline; this is the helper behind READ_ONCE_NOCHECK().
 */
static __no_sanitize_or_inline
unsigned long __read_once_word_nocheck(const void *addr)
{
        return __READ_ONCE(*(unsigned long *)addr);
}

/*
 * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a
 * word from memory atomically but without telling KASAN/KCSAN. This is
 * usually used by unwinding code when walking the stack of a running process.
 *
 * @x must have exactly the size of unsigned long (checked at compile time);
 * the loaded word is cast back to typeof(x).
 */
#define READ_ONCE_NOCHECK(x)                                                \
({                                                                        \
        compiletime_assert(sizeof(x) == sizeof(unsigned long),                \
                "Unsupported access size for READ_ONCE_NOCHECK().");        \
        (typeof(x))__read_once_word_nocheck(&(x));                        \
})

/*
 * Load a full unsigned long from @addr with a plain (non-volatile) read.
 * Only the first byte is reported to KASAN via kasan_check_read(addr, 1),
 * even though the whole word is read.
 */
static __no_kasan_or_inline
unsigned long read_word_at_a_time(const void *addr)
{
        kasan_check_read(addr, 1);
        return *(unsigned long *)addr;
}

#endif /* __ASSEMBLY__ */
#endif        /* __ASM_GENERIC_RWONCE_H */















































    1 













    5 



    1 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NF_CONNTRACK_EXTEND_H
#define _NF_CONNTRACK_EXTEND_H

#include <linux/slab.h>

#include <net/netfilter/nf_conntrack.h>

/*
 * Identifiers of the conntrack extensions.  Each value indexes
 * nf_ct_ext::offset[]; entries guarded by a config option exist only when
 * that option is enabled, so the numeric values depend on the kernel
 * configuration.  NF_CT_EXT_NUM is the number of extensions compiled in.
 */
enum nf_ct_ext_id {
        NF_CT_EXT_HELPER,
#if IS_ENABLED(CONFIG_NF_NAT)
        NF_CT_EXT_NAT,
#endif
        NF_CT_EXT_SEQADJ,
        NF_CT_EXT_ACCT,
#ifdef CONFIG_NF_CONNTRACK_EVENTS
        NF_CT_EXT_ECACHE,
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
        NF_CT_EXT_TSTAMP,
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
        NF_CT_EXT_TIMEOUT,
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
        NF_CT_EXT_LABELS,
#endif
#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
        NF_CT_EXT_SYNPROXY,
#endif
#if IS_ENABLED(CONFIG_NET_ACT_CT)
        NF_CT_EXT_ACT_CT,
#endif
        NF_CT_EXT_NUM,
};

/*
 * Extensions: optional stuff which isn't permanently in struct.
 *
 * @offset: per-extension byte offset from the start of this struct to the
 *          extension's data; 0 means the extension is not present
 * @len:    NOTE(review): presumably the number of bytes currently in use -
 *          confirm against nf_ct_ext_add()
 * @gen_id: when non-zero, nf_ct_ext_find() defers to __nf_ct_ext_find()
 * @data:   8-byte aligned storage for the extensions
 */
struct nf_ct_ext {
        u8 offset[NF_CT_EXT_NUM];
        u8 len;
        unsigned int gen_id;
        char data[] __aligned(8);
};

/* True when extension @id has a non-zero offset recorded in @ext. */
static inline bool __nf_ct_ext_exist(const struct nf_ct_ext *ext, u8 id)
{
        return ext->offset[id] != 0;
}

/* True when @ct has an extension area and extension @id is present in it. */
static inline bool nf_ct_ext_exist(const struct nf_conn *ct, u8 id)
{
        if (!ct->ext)
                return false;

        return __nf_ct_ext_exist(ct->ext, id);
}

void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id);

/*
 * Look up the data of extension @id on conntrack @ct.
 *
 * Returns NULL when @ct has no extension area or the extension is not
 * present.  When the area carries a non-zero gen_id the lookup is deferred
 * to __nf_ct_ext_find(); otherwise the pointer is computed directly from
 * the recorded offset.
 */
static inline void *nf_ct_ext_find(const struct nf_conn *ct, u8 id)
{
        struct nf_ct_ext *ext = ct->ext;

        if (!ext || !__nf_ct_ext_exist(ext, id))
                return NULL;

        /* Uncommon case: a generation id is set, take the out-of-line path. */
        if (unlikely(ext->gen_id))
                return __nf_ct_ext_find(ext, id);

        /* The offset is in bytes from the start of the extension area. */
        return (void *)ct->ext + ct->ext->offset[id];
}

/* Add this type, returns pointer to data or NULL. */
void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp);

/* Extension generation id.  If ext->gen_id != nf_conntrack_ext_genid, the
 * extensions cannot be used anymore unless the conntrack has the CONFIRMED
 * bit set.
 */
extern atomic_t nf_conntrack_ext_genid;
void nf_ct_ext_bump_genid(void);

#endif /* _NF_CONNTRACK_EXTEND_H */



































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_FRAG_H__
#define __NET_FRAG_H__

#include <linux/rhashtable-types.h>
#include <linux/completion.h>
#include <linux/in6.h>
#include <linux/rbtree_types.h>
#include <linux/refcount.h>
#include <net/dropreason-core.h>

/* Per netns frag queues directory */
struct fqdir {
        /* sysctls */
        long                        high_thresh;        /* zeroed by fqdir_pre_exit() to stop new frags */
        long                        low_thresh;
        int                        timeout;
        int                        max_dist;
        struct inet_frags        *f;
        struct net                *net;
        bool                        dead;                /* set to true by fqdir_pre_exit() */

        struct rhashtable       rhashtable ____cacheline_aligned_in_smp;

        /* Keep atomic mem on separate cachelines in structs that include it */
        /* Counter read/adjusted by frag_mem_limit(), add_/sub_frag_mem_limit() */
        atomic_long_t                mem ____cacheline_aligned_in_smp;
        struct work_struct        destroy_work;
        struct llist_node        free_list;
};

/**
 * enum: fragment queue flags
 *
 * Each flag is a distinct bit, so several can be set at the same time.
 *
 * @INET_FRAG_FIRST_IN: first fragment has arrived
 * @INET_FRAG_LAST_IN: final fragment has arrived
 * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction
 * @INET_FRAG_HASH_DEAD: inet_frag_kill() has not removed fq from rhashtable
 * @INET_FRAG_DROP: if skbs must be dropped (instead of being consumed)
 */
enum {
        INET_FRAG_FIRST_IN        = BIT(0),
        INET_FRAG_LAST_IN        = BIT(1),
        INET_FRAG_COMPLETE        = BIT(2),
        INET_FRAG_HASH_DEAD        = BIT(3),
        INET_FRAG_DROP                = BIT(4),
};

/* IPv4 member of the key union in struct inet_frag_queue. */
struct frag_v4_compare_key {
        __be32                saddr;
        __be32                daddr;
        u32                user;
        u32                vif;
        __be16                id;
        u16                protocol;
};

/* IPv6 member of the key union in struct inet_frag_queue. */
struct frag_v6_compare_key {
        struct in6_addr        saddr;
        struct in6_addr        daddr;
        u32                user;
        __be32                id;
        u32                iif;
};

/**
 * struct inet_frag_queue - fragment queue
 *
 * @node: rhash node
 * @key: keys identifying this frag.
 * @timer: queue expiration timer
 * @lock: spinlock protecting this frag
 * @refcnt: reference count of the queue
 * @rb_fragments: received fragments rb-tree root
 * @fragments_tail: received fragments tail
 * @last_run_head: the head of the last "run". see ip_fragment.c
 * @stamp: timestamp of the last received fragment
 * @len: total length of the original datagram
 * @meat: length of received fragments so far
 * @tstamp_type: stamp has a mono delivery time (EDT)
 * @flags: fragment queue flags
 * @max_size: maximum received fragment size
 * @fqdir: pointer to struct fqdir
 * @rcu: rcu head for freeing deferral
 */
struct inet_frag_queue {
        struct rhash_head        node;
        union {
                struct frag_v4_compare_key v4;
                struct frag_v6_compare_key v6;
        } key;
        struct timer_list        timer;
        spinlock_t                lock;
        refcount_t                refcnt;
        struct rb_root                rb_fragments;
        struct sk_buff                *fragments_tail;
        struct sk_buff                *last_run_head;
        ktime_t                        stamp;
        int                        len;
        int                        meat;
        u8                        tstamp_type;
        __u8                        flags;
        u16                        max_size;
        struct fqdir                *fqdir;
        struct rcu_head                rcu;
};

/*
 * Description of one fragment-queue user: callbacks plus the slab cache
 * and rhashtable parameters for its queues.  Set up and torn down with
 * inet_frags_init() / inet_frags_fini().
 * NOTE(review): @qsize is presumably the size of the user's queue object
 * embedding struct inet_frag_queue - confirm against inet_frags_init().
 */
struct inet_frags {
        unsigned int                qsize;

        void                        (*constructor)(struct inet_frag_queue *q,
                                               const void *arg);
        void                        (*destructor)(struct inet_frag_queue *);
        void                        (*frag_expire)(struct timer_list *t);
        struct kmem_cache        *frags_cachep;
        const char                *frags_cache_name;
        struct rhashtable_params rhash_params;
        refcount_t                refcnt;
        struct completion        completion;
};

int inet_frags_init(struct inet_frags *);
void inet_frags_fini(struct inet_frags *);

int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net);

/*
 * First stage of tearing down @fqdir: zero high_thresh and mark the
 * directory dead.  Both stores use WRITE_ONCE() to pair with the lockless
 * readers named below.
 */
static inline void fqdir_pre_exit(struct fqdir *fqdir)
{
        /* Prevent creation of new frags.
         * Pairs with READ_ONCE() in inet_frag_find().
         */
        WRITE_ONCE(fqdir->high_thresh, 0);

        /* Pairs with READ_ONCE() in inet_frag_kill(), ip_expire()
         * and ip6frag_expire_frag_queue().
         */
        WRITE_ONCE(fqdir->dead, true);
}
void fqdir_exit(struct fqdir *fqdir);

void inet_frag_kill(struct inet_frag_queue *q);
void inet_frag_destroy(struct inet_frag_queue *q);
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key);

/* Free all skbs in the queue; return the sum of their truesizes. */
unsigned int inet_frag_rbtree_purge(struct rb_root *root,
                                    enum skb_drop_reason reason);

/* Drop one reference on @q and destroy the queue when it was the last one. */
static inline void inet_frag_put(struct inet_frag_queue *q)
{
        if (!refcount_dec_and_test(&q->refcnt))
                return;

        inet_frag_destroy(q);
}

/* Memory Tracking Functions. */

static inline long frag_mem_limit(const struct fqdir *fqdir)
{
        return atomic_long_read(&fqdir->mem);
}

static inline void sub_frag_mem_limit(struct fqdir *fqdir, long val)
{
        atomic_long_sub(val, &fqdir->mem);
}

static inline void add_frag_mem_limit(struct fqdir *fqdir, long val)
{
        atomic_long_add(val, &fqdir->mem);
}

/* RFC 3168 support:
 * We want to check the ECN values of all fragments, to detect invalid
 * combinations.
 * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
 * Each value below is a distinct bit so that they can be OR-ed together.
 */
#define        IPFRAG_ECN_NOT_ECT        0x01 /* one frag had ECN_NOT_ECT */
#define        IPFRAG_ECN_ECT_1        0x02 /* one frag had ECN_ECT_1 */
#define        IPFRAG_ECN_ECT_0        0x04 /* one frag had ECN_ECT_0 */
#define        IPFRAG_ECN_CE                0x08 /* one frag had ECN_CE */

extern const u8 ip_frag_ecn_table[16];

/*
 * Return values of inet_frag_queue_insert().
 * NOTE(review): going by the names, DUP presumably reports a duplicate
 * fragment and OVERLAP an overlapping one - confirm against the
 * implementation.
 */
#define IPFRAG_OK        0
#define IPFRAG_DUP        1
#define IPFRAG_OVERLAP        2
int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
                           int offset, int end);
void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
                              struct sk_buff *parent);
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
                            void *reasm_data, bool try_coalesce);
struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q);

#endif







































































    2 







    2 








    2 




































    1 









    1 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#ifndef _LINUX_JHASH_H
#define _LINUX_JHASH_H

/* jhash.h: Jenkins hash support.
 *
 * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
 *
 * https://burtleburtle.net/bob/hash/
 *
 * These are the credits from Bob's sources:
 *
 * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
 *
 * These are functions for producing 32-bit hashes for hash table lookup.
 * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
 * are externally useful functions.  Routines to test the hash are included
 * if SELF_TEST is defined.  You can use this free for any purpose.  It's in
 * the public domain.  It has no warranty.
 *
 * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org)
 *
 * I've modified Bob's hash to be useful in the Linux kernel, and
 * any bugs present are my fault.
 * Jozsef
 */
#include <linux/bitops.h>
#include <linux/unaligned/packed_struct.h>

/* Best hash sizes are a power of two: jhash_size(n) is 2^n as a u32 */
#define jhash_size(n)   ((u32)1<<(n))
/* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */
#define jhash_mask(n)   (jhash_size(n)-1)

/* __jhash_mix -- mix 3 32-bit values reversibly.
 *
 * Expands to a bare brace block that updates a, b and c in place.  Each
 * argument is evaluated many times, so pass plain lvalues without side
 * effects.
 */
#define __jhash_mix(a, b, c)                        \
{                                                \
        a -= c;  a ^= rol32(c, 4);  c += b;        \
        b -= a;  b ^= rol32(a, 6);  a += c;        \
        c -= b;  c ^= rol32(b, 8);  b += a;        \
        a -= c;  a ^= rol32(c, 16); c += b;        \
        b -= a;  b ^= rol32(a, 19); a += c;        \
        c -= b;  c ^= rol32(b, 4);  b += a;        \
}

/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c
 *
 * Expands to a bare brace block that updates a, b and c in place.  Each
 * argument is evaluated many times, so pass plain lvalues without side
 * effects.
 */
#define __jhash_final(a, b, c)                        \
{                                                \
        c ^= b; c -= rol32(b, 14);                \
        a ^= c; a -= rol32(c, 11);                \
        b ^= a; b -= rol32(a, 25);                \
        c ^= b; c -= rol32(b, 16);                \
        a ^= c; a -= rol32(c, 4);                \
        b ^= a; b -= rol32(a, 14);                \
        c ^= b; c -= rol32(b, 24);                \
}

/* An arbitrary initial parameter */
#define JHASH_INITVAL                0xdeadbeef

/* jhash - hash an arbitrary key
 * @key: sequence of bytes as key
 * @length: the length of the key in bytes
 * @initval: the previous hash, or an arbitrary value
 *
 * The generic version, hashes an arbitrary sequence of bytes.
 * No alignment or length assumptions are made about the input key.
 *
 * Returns the hash value of the key. The result depends on endianness.
 * For a zero-length key the initial state word is returned without any
 * final mixing.
 */
static inline u32 jhash(const void *key, u32 length, u32 initval)
{
        u32 a, b, c;
        const u8 *k = key;

        /* Set up the internal state */
        a = b = c = JHASH_INITVAL + length + initval;

        /* All but the last block: affect some 32 bits of (a,b,c) */
        while (length > 12) {
                a += __get_unaligned_cpu32(k);
                b += __get_unaligned_cpu32(k + 4);
                c += __get_unaligned_cpu32(k + 8);
                __jhash_mix(a, b, c);
                length -= 12;
                k += 12;
        }
        /* Last block: affect all 32 bits of (c) */
        /* 1..12 bytes remain; each case adds one byte and falls through. */
        switch (length) {
        case 12: c += (u32)k[11]<<24;        fallthrough;
        case 11: c += (u32)k[10]<<16;        fallthrough;
        case 10: c += (u32)k[9]<<8;        fallthrough;
        case 9:  c += k[8];                fallthrough;
        case 8:  b += (u32)k[7]<<24;        fallthrough;
        case 7:  b += (u32)k[6]<<16;        fallthrough;
        case 6:  b += (u32)k[5]<<8;        fallthrough;
        case 5:  b += k[4];                fallthrough;
        case 4:  a += (u32)k[3]<<24;        fallthrough;
        case 3:  a += (u32)k[2]<<16;        fallthrough;
        case 2:  a += (u32)k[1]<<8;        fallthrough;
        case 1:  a += k[0];
                 __jhash_final(a, b, c);
                 break;
        case 0: /* Nothing left to add */
                break;
        }

        return c;
}

/* jhash2 - hash an array of u32's
 * @k: the key which must be an array of u32's
 * @length: the number of u32's in the key
 * @initval: the previous hash, or an arbitrary value
 *
 * Returns the hash value of the key.  For an empty key the initial state
 * word is returned without any final mixing.
 */
static inline u32 jhash2(const u32 *k, u32 length, u32 initval)
{
        u32 a, b, c;

        /* Seed all three state words with the same value */
        c = JHASH_INITVAL + (length<<2) + initval;
        b = c;
        a = c;

        /* Consume three words per round while more than three remain */
        for (; length > 3; length -= 3, k += 3) {
                a += k[0];
                b += k[1];
                c += k[2];
                __jhash_mix(a, b, c);
        }

        /* Nothing left to add */
        if (length == 0)
                return c;

        /* Fold in the last 1..3 words, then do the final mix */
        if (length == 3)
                c += k[2];
        if (length >= 2)
                b += k[1];
        a += k[0];
        __jhash_final(a, b, c);

        return c;
}


/*
 * __jhash_nwords - hash exactly 3, 2 or 1 word(s)
 *
 * Adds @initval to each of the three lanes, runs the final mix and returns
 * the resulting c lane.  Callers with fewer than three words pass 0 for the
 * unused lanes.
 */
static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
{
        c += initval;
        b += initval;
        a += initval;
        __jhash_final(a, b, c);
        return c;
}

/* Hash three words; the seed folds in the key size in bytes (3 << 2). */
static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
{
        const u32 seed = initval + JHASH_INITVAL + (3 << 2);

        return __jhash_nwords(a, b, c, seed);
}

/* Hash two words; the third lane is zero and the seed folds in (2 << 2). */
static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
{
        const u32 seed = initval + JHASH_INITVAL + (2 << 2);

        return __jhash_nwords(a, b, 0, seed);
}

/* Hash one word; the other two lanes are zero and the seed folds in (1 << 2). */
static inline u32 jhash_1word(u32 a, u32 initval)
{
        const u32 seed = initval + JHASH_INITVAL + (1 << 2);

        return __jhash_nwords(a, 0, 0, seed);
}

#endif /* _LINUX_JHASH_H */





















































































































































































    1 














































































































































    1 






































    1 
















    1 




    1 












































































































































































































































































































































































































































































































































































































    1 










    1 























































































































































































































































































    1 





    1 

















    1 





    1 


    1 
















































































































































































































































































    1 


























    1 













    1 








    1 








    1 



    1 






    1 












































































































































































































































































































































































    1 







    1 















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
// SPDX-License-Identifier: GPL-2.0-only
/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
#include <linux/bvec.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/gfp.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/sched/signal.h>

#include "internal.h"

/*
 * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to
 * indicate they support non-blocking reads or writes, we must clear it
 * here if set to avoid blocking other users of this pipe if splice is
 * being done on it.
 */
static noinline void noinline pipe_clear_nowait(struct file *file)
{
        fmode_t fmode = READ_ONCE(file->f_mode);

        do {
                if (!(fmode & FMODE_NOWAIT))
                        break;
        } while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT));
}

/*
 * Try to take ownership of the page behind a page-cache pipe buffer so the
 * caller can reuse it for another destination.  This should perhaps live in
 * a vm helper; remove_mapping() already does most of the work.
 *
 * Returns true when the folio was removed from its mapping; in that case the
 * buffer is flagged PIPE_BUF_FLAG_LRU and the folio is left locked.  Returns
 * false, with the folio unlocked again, in every other case.
 */
static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        struct folio *folio = page_folio(buf->page);
        struct address_space *mapping;

        folio_lock(folio);

        /* No mapping any more: we raced with truncate */
        mapping = folio_mapping(folio);
        if (!mapping)
                goto fail;

        WARN_ON(!folio_test_uptodate(folio));

        /*
         * Writeback must have finished before the folio leaves the
         * pagecache (needed at least for ext2 with the nobh option).
         * Otherwise truncate would not wait on the folio and the disk
         * blocks could be handed to someone else before our data has
         * actually reached them, corrupting the filesystem.
         */
        folio_wait_writeback(folio);

        if (!filemap_release_folio(folio, GFP_KERNEL))
                goto fail;

        /* Could not detach the folio from its current address space */
        if (!remove_mapping(mapping, folio))
                goto fail;

        /* Mapping removed: set the LRU flag and report success */
        buf->flags |= PIPE_BUF_FLAG_LRU;
        return true;

fail:
        folio_unlock(folio);
        return false;
}

/*
 * Release callback for page backed pipe buffers: drop the reference on the
 * buffer's page and clear PIPE_BUF_FLAG_LRU on the buffer.
 */
static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
                                        struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        put_page(page);
        buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

/*
 * Verify that the contents of @buf may be accessed.  The buffer refers to a
 * page cache page, so IO on it may still be in flight.
 *
 * Returns 0 when the folio is uptodate, -ENODATA when it has lost its
 * mapping (truncated/unhashed) and -EIO when it is still not uptodate once
 * the folio lock has been taken (read error from disk).
 */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
                                       struct pipe_buffer *buf)
{
        struct folio *folio = page_folio(buf->page);
        int err = 0;

        /* Fast path: no need to take the folio lock */
        if (folio_test_uptodate(folio))
                return 0;

        folio_lock(folio);
        if (!folio->mapping) {
                /*
                 * Folio got truncated/unhashed. This will cause a 0-byte
                 * splice, if this is the first page.
                 */
                err = -ENODATA;
        } else if (!folio_test_uptodate(folio)) {
                /* Uh oh, read-error from disk. */
                err = -EIO;
        }
        /* Otherwise the folio turned out fine after all */
        folio_unlock(folio);

        return err;
}

/* Pipe buffer operations for buffers that reference page cache pages. */
const struct pipe_buf_operations page_cache_pipe_buf_ops = {
        .get = generic_pipe_buf_get,
        .confirm = page_cache_pipe_buf_confirm,
        .try_steal = page_cache_pipe_buf_try_steal,
        .release = page_cache_pipe_buf_release,
};

/*
 * Steal callback for user page buffers.  Only buffers carrying
 * PIPE_BUF_FLAG_GIFT may be stolen; for those the LRU flag is set and the
 * decision is delegated to generic_pipe_buf_try_steal().
 */
static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        if (buf->flags & PIPE_BUF_FLAG_GIFT) {
                buf->flags |= PIPE_BUF_FLAG_LRU;
                return generic_pipe_buf_try_steal(pipe, buf);
        }

        return false;
}

/* Pipe buffer operations for user pages; no ->confirm callback is set. */
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
        .get = generic_pipe_buf_get,
        .try_steal = user_page_pipe_buf_try_steal,
        .release = page_cache_pipe_buf_release,
};

/*
 * Wake tasks sleeping on the pipe's read wait queue, if there are any, and
 * send SIGIO/POLL_IN to the pipe's fasync readers.
 */
static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
        /*
         * Full memory barrier before the waitqueue_active() check.
         * NOTE(review): presumably this orders the caller's earlier pipe
         * updates against the waiter check so that a task about to sleep is
         * not missed - confirm against the waitqueue_active() documentation.
         */
        smp_mb();
        if (waitqueue_active(&pipe->rd_wait))
                wake_up_interruptible(&pipe->rd_wait);
        /* Signalled regardless of whether anyone was on the wait queue */
        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}

/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe:        pipe to fill
 * @spd:        data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe, one pipe buffer per
 *    page, until the pipe is full or @spd is exhausted.  @spd->nr_pages
 *    is decremented for every page that was linked, and every page that
 *    was not linked is handed to @spd->spd_release().
 *
 *    Returns the number of bytes linked into the pipe, 0 if @spd held
 *    no pages, -EPIPE (after sending SIGPIPE to the current task) if
 *    the pipe has no readers, or -EAGAIN if no bytes could be added.
 */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
                       struct splice_pipe_desc *spd)
{
        unsigned int nr_supplied = spd->nr_pages;
        unsigned int tail = pipe->tail;
        unsigned int head = pipe->head;
        unsigned int mask = pipe->ring_size - 1;
        ssize_t spliced = 0;
        int idx = 0;

        if (nr_supplied == 0)
                return 0;

        if (unlikely(!pipe->readers)) {
                send_sig(SIGPIPE, current, 0);
                spliced = -EPIPE;
                goto release_unused;
        }

        while (!pipe_full(head, tail, pipe->max_usage)) {
                struct pipe_buffer *slot = &pipe->bufs[head & mask];

                /* Describe page @idx in the next free ring slot */
                slot->page = spd->pages[idx];
                slot->offset = spd->partial[idx].offset;
                slot->len = spd->partial[idx].len;
                slot->private = spd->partial[idx].private;
                slot->ops = spd->ops;
                slot->flags = 0;

                /* Publish the slot and account for it */
                pipe->head = ++head;
                idx++;
                spliced += slot->len;

                spd->nr_pages--;
                if (spd->nr_pages == 0)
                        break;
        }

        if (spliced == 0)
                spliced = -EAGAIN;

release_unused:
        /* Give back every page that did not make it into the pipe */
        for (; idx < nr_supplied; idx++)
                spd->spd_release(spd, idx);

        return spliced;
}
EXPORT_SYMBOL_GPL(splice_to_pipe);

/*
 * Append a single, already set up buffer to @pipe.
 *
 * On success *@buf is copied into the ring slot at the head, the head is
 * advanced and @buf->len is returned.  On failure the buffer is released
 * with pipe_buf_release() and a negative error is returned: -EPIPE (after
 * SIGPIPE has been sent to the current task) when the pipe has no readers,
 * -EAGAIN when the pipe is full.
 */
ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
        unsigned int head = pipe->head;
        unsigned int tail = pipe->tail;
        unsigned int mask = pipe->ring_size - 1;
        int ret;

        if (unlikely(!pipe->readers)) {
                send_sig(SIGPIPE, current, 0);
                ret = -EPIPE;
                goto drop;
        }

        if (pipe_full(head, tail, pipe->max_usage)) {
                ret = -EAGAIN;
                goto drop;
        }

        pipe->bufs[head & mask] = *buf;
        pipe->head = head + 1;
        return buf->len;

drop:
        pipe_buf_release(pipe, buf);
        return ret;
}
EXPORT_SYMBOL(add_to_pipe);

/*
 * Size the page and partial-page arrays of @spd for @pipe.
 *
 * @spd->nr_pages_max is set to the pipe's max_usage.  While that does not
 * exceed PIPE_DEF_BUFFERS nothing is allocated and the arrays already in
 * @spd are kept; otherwise both arrays are replaced by heap allocations of
 * max_usage entries.
 *
 * Returns 0 on success, or -ENOMEM if either allocation failed (both are
 * freed again in that case).
 */
int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
{
        unsigned int max_usage = READ_ONCE(pipe->max_usage);

        spd->nr_pages_max = max_usage;
        if (max_usage <= PIPE_DEF_BUFFERS)
                return 0;

        spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
        spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
                                     GFP_KERNEL);

        if (!spd->pages || !spd->partial) {
                kfree(spd->pages);
                kfree(spd->partial);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Counterpart of splice_grow_spd(): free the page and partial-page arrays,
 * but only when nr_pages_max shows that they were heap allocated (i.e. it
 * exceeds PIPE_DEF_BUFFERS).
 */
void splice_shrink_spd(struct splice_pipe_desc *spd)
{
        if (spd->nr_pages_max > PIPE_DEF_BUFFERS) {
                kfree(spd->pages);
                kfree(spd->partial);
        }
}

/**
 * copy_splice_read -  Copy data from a file and splice the copy into a pipe
 * @in: The file to read from
 * @ppos: Pointer to the file position to read from
 * @pipe: The pipe to splice into
 * @len: The amount to splice
 * @flags: The SPLICE_F_* flags (not consulted by this function)
 *
 * This function allocates a bunch of pages sufficient to hold the requested
 * amount of data (but limited by the remaining pipe capacity), passes it to
 * the file's ->read_iter() to read into and then splices the used pages into
 * the pipe.
 *
 * Return: On success, the number of bytes read will be returned and *@ppos
 * will be updated if appropriate; 0 will be returned if there is no more data
 * to be read; -EAGAIN will be returned if the pipe had no space, and some
 * other negative error code will be returned on error.  A short read may occur
 * if the pipe has insufficient space, we reach the end of the data or we hit a
 * hole.
 */
ssize_t copy_splice_read(struct file *in, loff_t *ppos,
                         struct pipe_inode_info *pipe,
                         size_t len, unsigned int flags)
{
        struct iov_iter to;
        struct bio_vec *bv;
        struct kiocb kiocb;
        struct page **pages;
        ssize_t ret;
        size_t used, npages, chunk, remain, keep = 0;
        int i;

        /* Work out how much data we can actually add into the pipe */
        used = pipe_occupancy(pipe->head, pipe->tail);
        npages = max_t(ssize_t, pipe->max_usage - used, 0);
        len = min_t(size_t, len, npages * PAGE_SIZE);
        npages = DIV_ROUND_UP(len, PAGE_SIZE);

        /*
         * A single allocation holds the bio_vec array followed directly by
         * the array of page pointers.
         */
        bv = kzalloc(array_size(npages, sizeof(bv[0])) +
                     array_size(npages, sizeof(struct page *)), GFP_KERNEL);
        if (!bv)
                return -ENOMEM;

        pages = (struct page **)(bv + npages);
        /* npages becomes the count returned by the bulk allocator */
        npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
        if (!npages) {
                kfree(bv);
                return -ENOMEM;
        }

        /* Clamp the request to what the allocated pages can hold */
        remain = len = min_t(size_t, len, npages * PAGE_SIZE);

        /* One bio_vec per page; only the last one may be shorter than a page */
        for (i = 0; i < npages; i++) {
                chunk = min_t(size_t, PAGE_SIZE, remain);
                bv[i].bv_page = pages[i];
                bv[i].bv_offset = 0;
                bv[i].bv_len = chunk;
                remain -= chunk;
        }

        /* Do the I/O */
        iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
        init_sync_kiocb(&kiocb, in);
        kiocb.ki_pos = *ppos;
        ret = in->f_op->read_iter(&kiocb, &to);

        /* keep = number of pages that received data; stays 0 if ret <= 0 */
        if (ret > 0) {
                keep = DIV_ROUND_UP(ret, PAGE_SIZE);
                *ppos = kiocb.ki_pos;
        }

        /*
         * Callers of ->splice_read() expect -EAGAIN on "can't put anything in
         * there", rather than -EFAULT.
         */
        if (ret == -EFAULT)
                ret = -EAGAIN;

        /* Free any pages that didn't get touched at all. */
        if (keep < npages)
                release_pages(pages + keep, npages - keep);

        /*
         * Push the remaining pages into the pipe.  The loop only runs when
         * keep > 0, i.e. when ret is positive.  Pipe fullness is not
         * re-checked here; the page count was bounded by the free slots
         * computed at the top.
         */
        remain = ret;
        for (i = 0; i < keep; i++) {
                struct pipe_buffer *buf = pipe_head_buf(pipe);

                chunk = min_t(size_t, remain, PAGE_SIZE);
                *buf = (struct pipe_buffer) {
                        .ops        = &default_pipe_buf_ops,
                        .page        = bv[i].bv_page,
                        .offset        = 0,
                        .len        = chunk,
                };
                pipe->head++;
                remain -= chunk;
        }

        kfree(bv);
        return ret;
}
EXPORT_SYMBOL(copy_splice_read);

/* Default pipe buffer operations: all callbacks are the generic helpers. */
const struct pipe_buf_operations default_pipe_buf_ops = {
        .get = generic_pipe_buf_get,
        .try_steal = generic_pipe_buf_try_steal,
        .release = generic_pipe_buf_release,
};

/*
 * Pipe buffer operations for a socket and similar: the same generic
 * callbacks as the default operations, but without a ->try_steal method.
 */
const struct pipe_buf_operations nosteal_pipe_buf_ops = {
        .get = generic_pipe_buf_get,
        .release = generic_pipe_buf_release,
};
EXPORT_SYMBOL(nosteal_pipe_buf_ops);

/*
 * Wake tasks sleeping on the pipe's write wait queue, if there are any, and
 * send SIGIO/POLL_OUT to the pipe's fasync writers.
 */
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
        /*
         * Full memory barrier before the waitqueue_active() check.
         * NOTE(review): presumably this orders the caller's earlier pipe
         * updates against the waiter check so that a task about to sleep is
         * not missed - confirm against the waitqueue_active() documentation.
         */
        smp_mb();
        if (waitqueue_active(&pipe->wr_wait))
                wake_up_interruptible(&pipe->wr_wait);
        /* Signalled regardless of whether anyone was on the wait queue */
        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}

/**
 * splice_from_pipe_feed - feed available data from a pipe to a file
 * @pipe:        pipe to splice from
 * @sd:                information to @actor
 * @actor:        handler that splices the data
 *
 * Description:
 *    Walks the pipe from its tail and hands one struct pipe_buffer at
 *    a time to @actor, which does the actual moving of the data to the
 *    desired destination.  The walk stops when there are no more
 *    buffers left in the pipe or when the requested number of bytes
 *    (@sd->total_len) have been copied.  Buffers that have been
 *    consumed completely are released and the tail is advanced.
 *
 *    Returns a positive number (one) if the pipe needs to be filled
 *    with more data, zero if the required number of bytes have been
 *    copied, and otherwise whatever non-positive value the buffer
 *    confirmation or @actor produced (-ENODATA from the confirmation
 *    is reported as zero).
 *
 *    This, together with splice_from_pipe_{begin,end,next}, may be
 *    used to implement the functionality of __splice_from_pipe() when
 *    locking is required around copying the pipe buffers to the
 *    destination.
 */
static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
                          splice_actor *actor)
{
        unsigned int head = pipe->head;
        unsigned int tail = pipe->tail;
        unsigned int mask = pipe->ring_size - 1;

        while (!pipe_empty(head, tail)) {
                struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                int n;

                /* Offer the whole buffer, capped at what is still wanted */
                sd->len = buf->len;
                if (sd->total_len < sd->len)
                        sd->len = sd->total_len;

                n = pipe_buf_confirm(pipe, buf);
                if (unlikely(n))
                        return n == -ENODATA ? 0 : n;

                n = actor(pipe, buf, sd);
                if (n <= 0)
                        return n;

                /* Account for the n bytes the actor consumed */
                buf->offset += n;
                buf->len -= n;
                sd->num_spliced += n;
                sd->len -= n;
                sd->pos += n;
                sd->total_len -= n;

                /* Buffer drained: release it and free up its ring slot */
                if (buf->len == 0) {
                        pipe_buf_release(pipe, buf);
                        pipe->tail = ++tail;
                        if (pipe->files)
                                sd->need_wakeup = true;
                }

                if (sd->total_len == 0)
                        return 0;
        }

        return 1;
}

/*
 * The caller knows the pipe holds a buffer, but it may be a zero-length one.
 * If so, release it, advance the tail past it and return true; otherwise
 * leave the pipe alone and return false.
 */
static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
{
        unsigned int tail = pipe->tail;
        unsigned int mask = pipe->ring_size - 1;
        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
        bool eaten = false;

        if (unlikely(!buf->len)) {
                pipe_buf_release(pipe, buf);
                pipe->tail = tail + 1;
                eaten = true;
        }

        return eaten;
}

/**
 * splice_from_pipe_next - wait for some data to splice from
 * @pipe:        pipe to splice from
 * @sd:                information about the splice operation
 *
 * Description:
 *    This function will wait for some data and return a positive
 *    value (one) if pipe buffers are available.  It will return zero
 *    or -errno if no more data needs to be spliced: zero when the
 *    pipe is empty and either has no writers or something has already
 *    been spliced, -EAGAIN when it is empty and SPLICE_F_NONBLOCK is
 *    set, and -ERESTARTSYS when a signal is pending.
 */
static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
        /*
         * Check for signal early to make process killable when there are
         * always buffers available
         */
        if (signal_pending(current))
                return -ERESTARTSYS;

        do {
                while (pipe_empty(pipe->head, pipe->tail)) {
                        /* No writers left, or partial progress to report */
                        if (!pipe->writers || sd->num_spliced)
                                return 0;

                        if (sd->flags & SPLICE_F_NONBLOCK)
                                return -EAGAIN;

                        if (signal_pending(current))
                                return -ERESTARTSYS;

                        /* Flush a deferred writer wakeup before sleeping */
                        if (sd->need_wakeup) {
                                wakeup_pipe_writers(pipe);
                                sd->need_wakeup = false;
                        }

                        pipe_wait_readable(pipe);
                }
                /* Discard a zero-length buffer at the tail and look again */
        } while (eat_empty_buffer(pipe));

        return 1;
}

/**
 * splice_from_pipe_begin - start splicing from pipe
 * @sd:                information about the splice operation
 *
 * Description:
 *    Call this before a loop containing splice_from_pipe_next() and
 *    splice_from_pipe_feed().  It resets the progress fields of @sd:
 *    no bytes spliced yet and no writer wakeup pending.
 */
static void splice_from_pipe_begin(struct splice_desc *sd)
{
        sd->need_wakeup = false;
        sd->num_spliced = 0;
}

/**
 * splice_from_pipe_end - finish splicing from pipe
 * @pipe:        pipe to splice from
 * @sd:                information about the splice operation
 *
 * Description:
 *    Call this after a loop containing splice_from_pipe_next() and
 *    splice_from_pipe_feed().  It wakes up the pipe writers if @sd
 *    still has a wakeup pending.
 */
static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
        if (!sd->need_wakeup)
                return;

        wakeup_pipe_writers(pipe);
}

/**
 * __splice_from_pipe - splice data from a pipe to given actor
 * @pipe:        pipe to splice from
 * @sd:                information to @actor
 * @actor:        handler that splices the data
 *
 * Description:
 *    Repeatedly waits for pipe data with splice_from_pipe_next() and
 *    feeds it to @actor through splice_from_pipe_feed() until either of
 *    them returns zero or an error. The actual moving of a single
 *    struct pipe_buffer to the destination is done by @actor. See
 *    pipe_to_file, pipe_to_sendmsg, or pipe_to_user.
 *
 * Return: @sd->num_spliced if it is non-zero, otherwise the last status
 * (zero or a negative errno) reported by the loop.
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
                           splice_actor *actor)
{
        int ret;

        splice_from_pipe_begin(sd);
        for (;;) {
                cond_resched();

                ret = splice_from_pipe_next(pipe, sd);
                if (ret <= 0)
                        break;

                ret = splice_from_pipe_feed(pipe, sd, actor);
                if (ret <= 0)
                        break;
        }
        splice_from_pipe_end(pipe, sd);

        if (sd->num_spliced)
                return sd->num_spliced;

        return ret;
}
EXPORT_SYMBOL(__splice_from_pipe);

/**
 * splice_from_pipe - splice data from a pipe to a file
 * @pipe:        pipe to splice from
 * @out:        file to splice to
 * @ppos:        position in @out
 * @len:        how many bytes to splice
 * @flags:        splice modifier flags
 * @actor:        handler that splices the data
 *
 * Description:
 *    Locked variant of __splice_from_pipe(): builds a splice descriptor
 *    from the arguments, takes the pipe lock, runs __splice_from_pipe()
 *    and drops the lock again.
 *
 * Return: whatever __splice_from_pipe() returned.
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
                         loff_t *ppos, size_t len, unsigned int flags,
                         splice_actor *actor)
{
        struct splice_desc desc = {
                .u.file = out,
                .pos = *ppos,
                .flags = flags,
                .total_len = len,
        };
        ssize_t spliced;

        pipe_lock(pipe);
        spliced = __splice_from_pipe(pipe, &desc, actor);
        pipe_unlock(pipe);

        return spliced;
}

/**
 * iter_file_splice_write - splice data from a pipe to a file
 * @pipe:        pipe info
 * @out:        file to write to
 * @ppos:        position in @out
 * @len:        number of bytes to splice
 * @flags:        splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *    This one is ->write_iter-based.
 *
 *    The pipe lock is held for the whole transfer. *@ppos is updated after
 *    every successful ->write_iter() call.
 *
 * Return: the number of bytes written if any were written; otherwise 0
 * (including when @len is 0) or a negative errno: -EINVAL if @out has no
 * ->write_iter, -ENOMEM if the bio_vec array cannot be allocated, or the
 * error reported while waiting for data, confirming a buffer or writing.
 */
ssize_t
iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
                          loff_t *ppos, size_t len, unsigned int flags)
{
        struct splice_desc sd = {
                .total_len = len,
                .flags = flags,
                .pos = *ppos,
                .u.file = out,
        };
        int nbufs = pipe->max_usage;
        struct bio_vec *array;
        /*
         * Must start at 0: with @len == 0 the loop below never runs and
         * sd.num_spliced stays 0, so this value is what gets returned.
         */
        ssize_t ret = 0;

        if (!out->f_op->write_iter)
                return -EINVAL;

        array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL);
        if (unlikely(!array))
                return -ENOMEM;

        pipe_lock(pipe);

        splice_from_pipe_begin(&sd);
        while (sd.total_len) {
                struct kiocb kiocb;
                struct iov_iter from;
                unsigned int head, tail, mask;
                size_t left;
                int n;

                /* Wait for data; <= 0 means nothing more to splice or error. */
                ret = splice_from_pipe_next(pipe, &sd);
                if (ret <= 0)
                        break;

                /*
                 * The array was sized from an earlier pipe->max_usage; if
                 * the limit is larger now, reallocate to match it.
                 */
                if (unlikely(nbufs < pipe->max_usage)) {
                        kfree(array);
                        nbufs = pipe->max_usage;
                        array = kcalloc(nbufs, sizeof(struct bio_vec),
                                        GFP_KERNEL);
                        if (!array) {
                                ret = -ENOMEM;
                                break;
                        }
                }

                head = pipe->head;
                tail = pipe->tail;
                mask = pipe->ring_size - 1;

                /* build the vector */
                left = sd.total_len;
                for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        size_t this_len = buf->len;

                        /* zero-length bvecs are not supported, skip them */
                        if (!this_len)
                                continue;
                        this_len = min(this_len, left);

                        ret = pipe_buf_confirm(pipe, buf);
                        if (unlikely(ret)) {
                                /* -ENODATA is reported to the caller as 0 */
                                if (ret == -ENODATA)
                                        ret = 0;
                                goto done;
                        }

                        bvec_set_page(&array[n], buf->page, this_len,
                                      buf->offset);
                        left -= this_len;
                        n++;
                }

                /* Write the gathered segments in one ->write_iter() call. */
                iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
                init_sync_kiocb(&kiocb, out);
                kiocb.ki_pos = sd.pos;
                ret = out->f_op->write_iter(&kiocb, &from);
                sd.pos = kiocb.ki_pos;
                if (ret <= 0)
                        break;

                sd.num_spliced += ret;
                sd.total_len -= ret;
                *ppos = sd.pos;

                /* dismiss the fully eaten buffers, adjust the partial one */
                tail = pipe->tail;
                while (ret) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        if (ret >= buf->len) {
                                ret -= buf->len;
                                buf->len = 0;
                                pipe_buf_release(pipe, buf);
                                tail++;
                                pipe->tail = tail;
                                /* a slot was freed: writers need a wakeup */
                                if (pipe->files)
                                        sd.need_wakeup = true;
                        } else {
                                buf->offset += ret;
                                buf->len -= ret;
                                ret = 0;
                        }
                }
        }
done:
        /* kfree(NULL) is fine if the in-loop reallocation failed */
        kfree(array);
        splice_from_pipe_end(pipe, &sd);

        pipe_unlock(pipe);

        /* Partial progress takes precedence over a trailing status code. */
        if (sd.num_spliced)
                ret = sd.num_spliced;

        return ret;
}

EXPORT_SYMBOL(iter_file_splice_write);

#ifdef CONFIG_NET
/**
 * splice_to_socket - splice data from a pipe to a socket
 * @pipe:        pipe to splice from
 * @out:        socket to write to
 * @ppos:        position in @out (not referenced by this function)
 * @len:        number of bytes to splice
 * @flags:        splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket. No data copying
 *    is involved.
 *
 *    The pipe lock is held for the whole transfer. Pipe buffers are gathered
 *    into batches of at most ARRAY_SIZE(bvec) segments and each batch is
 *    passed to sock_sendmsg() with MSG_SPLICE_PAGES set.
 *
 * Return: the number of bytes sent if any were sent; otherwise 0 or a
 * negative errno (-ERESTARTSYS, -EAGAIN, or an error from
 * pipe_buf_confirm() or sock_sendmsg()).
 */
ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
                         loff_t *ppos, size_t len, unsigned int flags)
{
        struct socket *sock = sock_from_file(out);
        struct bio_vec bvec[16];
        struct msghdr msg = {};
        ssize_t ret = 0;
        size_t spliced = 0;
        bool need_wakeup = false;

        pipe_lock(pipe);

        while (len > 0) {
                unsigned int head, tail, mask, bc = 0;
                size_t remain = len;

                /*
                 * Check for signal early to make process killable when there
                 * are always buffers available
                 */
                ret = -ERESTARTSYS;
                if (signal_pending(current))
                        break;

                /*
                 * Wait for data. ret is preloaded before each test so that
                 * the "goto out" paths return the matching status.
                 */
                while (pipe_empty(pipe->head, pipe->tail)) {
                        ret = 0;
                        if (!pipe->writers)
                                goto out;

                        /* Already sent something: report that, don't block. */
                        if (spliced)
                                goto out;

                        ret = -EAGAIN;
                        if (flags & SPLICE_F_NONBLOCK)
                                goto out;

                        ret = -ERESTARTSYS;
                        if (signal_pending(current))
                                goto out;

                        /* Flush a deferred writer wakeup before sleeping. */
                        if (need_wakeup) {
                                wakeup_pipe_writers(pipe);
                                need_wakeup = false;
                        }

                        pipe_wait_readable(pipe);
                }

                head = pipe->head;
                tail = pipe->tail;
                mask = pipe->ring_size - 1;

                /*
                 * Gather segments into bvec[] using a local copy of tail;
                 * pipe->tail itself only moves after a successful send.
                 */
                while (!pipe_empty(head, tail)) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        size_t seg;

                        /* Skip zero-length buffers. */
                        if (!buf->len) {
                                tail++;
                                continue;
                        }

                        seg = min_t(size_t, remain, buf->len);

                        ret = pipe_buf_confirm(pipe, buf);
                        if (unlikely(ret)) {
                                /* -ENODATA is reported to the caller as 0 */
                                if (ret == -ENODATA)
                                        ret = 0;
                                break;
                        }

                        bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset);
                        remain -= seg;
                        /* Stop once the request is covered or bvec[] is full. */
                        if (remain == 0 || bc >= ARRAY_SIZE(bvec))
                                break;
                        tail++;
                }

                /* Nothing gathered: leave with ret as set above. */
                if (!bc)
                        break;

                msg.msg_flags = MSG_SPLICE_PAGES;
                if (flags & SPLICE_F_MORE)
                        msg.msg_flags |= MSG_MORE;
                /* More requested and buffers remain past the local tail. */
                if (remain && pipe_occupancy(pipe->head, tail) > 0)
                        msg.msg_flags |= MSG_MORE;
                if (out->f_flags & O_NONBLOCK)
                        msg.msg_flags |= MSG_DONTWAIT;

                iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc,
                              len - remain);
                ret = sock_sendmsg(sock, &msg);
                if (ret <= 0)
                        break;

                spliced += ret;
                len -= ret;
                /*
                 * Consume the bytes that were actually sent, starting again
                 * from the real pipe->tail; fully drained buffers are
                 * released, a partially sent one keeps its remainder.
                 */
                tail = pipe->tail;
                while (ret > 0) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        size_t seg = min_t(size_t, ret, buf->len);

                        buf->offset += seg;
                        buf->len -= seg;
                        ret -= seg;

                        if (!buf->len) {
                                pipe_buf_release(pipe, buf);
                                tail++;
                        }
                }

                /* Publish the new tail; freed slots mean writers may run. */
                if (tail != pipe->tail) {
                        pipe->tail = tail;
                        if (pipe->files)
                                need_wakeup = true;
                }
        }

out:
        pipe_unlock(pipe);
        if (need_wakeup)
                wakeup_pipe_writers(pipe);
        /* Partial progress takes precedence over a trailing status code. */
        return spliced ?: ret;
}
#endif

static int warn_unsupported(struct file *file, const char *op)
{
        pr_debug_ratelimited(
                "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
                op, file, current->pid, current->comm);
        return -EINVAL;
}

/*
 * Hand a pipe-to-file splice over to the output file's ->splice_write()
 * method.  Files without that method are rejected via warn_unsupported().
 */
static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out,
                              loff_t *ppos, size_t len, unsigned int flags)
{
        if (likely(out->f_op->splice_write))
                return out->f_op->splice_write(pipe, out, ppos, len, flags);

        return warn_unsupported(out, "write");
}

/*
 * Tell the caller that reading from the source hit a premature EOF and the
 * caller had not indicated that more data would follow.  This invokes the
 * optional ->splice_eof() hook of @sd; without a hook it does nothing.
 */
static void do_splice_eof(struct splice_desc *sd)
{
        if (!sd->splice_eof)
                return;

        sd->splice_eof(sd);
}

/*
 * Read from @in into @pipe without calling rw_verify_area(): callers have
 * already verified the entire range, so sub ranges need no further check.
 *
 * Returns -EBADF if @in is not open for reading, 0 for a zero-length
 * request, the result of warn_unsupported() if @in has no ->splice_read(),
 * and otherwise whatever the chosen read routine returns.
 */
static ssize_t do_splice_read(struct file *in, loff_t *ppos,
                              struct pipe_inode_info *pipe, size_t len,
                              unsigned int flags)
{
        unsigned int room;

        if (unlikely(!(in->f_mode & FMODE_READ)))
                return -EBADF;
        if (len == 0)
                return 0;

        /* Don't try to read more than the pipe has space for. */
        room = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
        len = min_t(size_t, len, room << PAGE_SHIFT);

        /* ... nor more than a single read/write call may transfer. */
        len = min_t(size_t, len, MAX_RW_COUNT);

        if (unlikely(!in->f_op->splice_read))
                return warn_unsupported(in, "read");

        /* The common case goes straight to the file's own method. */
        if (!(in->f_flags & O_DIRECT) && !IS_DAX(in->f_mapping->host))
                return in->f_op->splice_read(in, ppos, pipe, len, flags);

        /*
         * O_DIRECT and DAX don't deal with the pagecache, so we allocate a
         * buffer, copy into it and splice that into the pipe.
         */
        return copy_splice_read(in, ppos, pipe, len, flags);
}

/**
 * vfs_splice_read - Read data from a file and splice it into a pipe
 * @in:                File to splice from
 * @ppos:        Input file offset
 * @pipe:        Pipe to splice to
 * @len:        Number of bytes to splice
 * @flags:        Splice modifier flags (SPLICE_F_*)
 *
 * Verify the requested range with rw_verify_area() and then splice the
 * requested amount of data from the input file to the pipe.  This is
 * synchronous as the caller must hold the pipe lock across the entire
 * operation.
 *
 * If successful, it returns the amount of data spliced, 0 if it hit the EOF or
 * a hole and a negative error code otherwise.
 */
ssize_t vfs_splice_read(struct file *in, loff_t *ppos,
                        struct pipe_inode_info *pipe, size_t len,
                        unsigned int flags)
{
        ssize_t err = rw_verify_area(READ, in, ppos, len);

        if (likely(err >= 0))
                return do_splice_read(in, ppos, pipe, len, flags);

        return err;
}
EXPORT_SYMBOL_GPL(vfs_splice_read);

/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in:                file to splice from
 * @sd:                actor information on where to splice to
 * @actor:        handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 *    @sd is modified along the way: SPLICE_F_NONBLOCK is cleared and
 *    SPLICE_F_MORE is set in @sd->flags, @sd->total_len is overwritten
 *    with the size of each chunk read, and @sd->pos tracks the input
 *    position.
 *
 * Return: the number of bytes passed on by @actor if any; otherwise the
 * status of the failing read or @actor call (0 or a negative errno).
 * -EINVAL if @in is not seekable, -ENOMEM if the internal pipe cannot be
 * allocated.
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
                               splice_direct_actor *actor)
{
        struct pipe_inode_info *pipe;
        ssize_t ret, bytes;
        size_t len;
        int i, flags, more;

        /*
         * We require the input to be seekable, as we don't want to randomly
         * drop data for eg socket -> socket splicing. Use the piped splicing
         * for that!
         */
        if (unlikely(!(in->f_mode & FMODE_LSEEK)))
                return -EINVAL;

        /*
         * neither in nor out is a pipe, setup an internal pipe attached to
         * 'out' and transfer the wanted data from 'in' to 'out' through that
         */
        pipe = current->splice_pipe;
        if (unlikely(!pipe)) {
                pipe = alloc_pipe_info();
                if (!pipe)
                        return -ENOMEM;

                /*
                 * We don't have an immediate reader, but we'll read the stuff
                 * out of the pipe right after the splice_to_pipe(). So set
                 * PIPE_READERS appropriately.
                 */
                pipe->readers = 1;

                current->splice_pipe = pipe;
        }

        /*
         * Do the splice.
         */
        bytes = 0;
        len = sd->total_len;

        /*
         * Don't block on output, we have to drain the direct pipe.
         * The caller's original flags are kept in 'flags' for the read side.
         */
        flags = sd->flags;
        sd->flags &= ~SPLICE_F_NONBLOCK;

        /*
         * We signal MORE until we've read sufficient data to fulfill the
         * request and we keep signalling it if the caller set it.
         */
        more = sd->flags & SPLICE_F_MORE;
        sd->flags |= SPLICE_F_MORE;

        WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));

        while (len) {
                size_t read_len;
                loff_t pos = sd->pos, prev_pos = pos;

                /* Fill the internal pipe from the input file. */
                ret = do_splice_read(in, &pos, pipe, len, flags);
                if (unlikely(ret <= 0))
                        goto read_failure;

                /* The actor is asked to move exactly what was just read. */
                read_len = ret;
                sd->total_len = read_len;

                /*
                 * If we now have sufficient data to fulfill the request then
                 * we clear SPLICE_F_MORE if it was not set initially.
                 */
                if (read_len >= len && !more)
                        sd->flags &= ~SPLICE_F_MORE;

                /*
                 * NOTE: nonblocking mode only applies to the input. We
                 * must not do the output in nonblocking mode as then we
                 * could get stuck data in the internal pipe:
                 */
                ret = actor(pipe, sd);
                if (unlikely(ret <= 0)) {
                        /* Nothing was consumed: rewind to before the read. */
                        sd->pos = prev_pos;
                        goto out_release;
                }

                bytes += ret;
                len -= ret;
                sd->pos = pos;

                /* Short output: position only past what the actor took. */
                if (ret < read_len) {
                        sd->pos = prev_pos + ret;
                        goto out_release;
                }
        }

done:
        /* Reset the internal pipe's ring indices before returning. */
        pipe->tail = pipe->head = 0;
        file_accessed(in);
        return bytes;

read_failure:
        /*
         * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
         * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
         * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
         * least 1 byte *then* we will also do the ->splice_eof() call.
         */
        if (ret == 0 && !more && len > 0 && bytes)
                do_splice_eof(sd);
out_release:
        /*
         * If we did an incomplete transfer we must release
         * the pipe buffers in question:
         */
        for (i = 0; i < pipe->ring_size; i++) {
                struct pipe_buffer *buf = &pipe->bufs[i];

                if (buf->ops)
                        pipe_buf_release(pipe, buf);
        }

        /* With no progress at all, report the failing status instead. */
        if (!bytes)
                bytes = ret;

        goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);

/*
 * Actor for do_splice_direct(): drain @pipe into the output file stored in
 * @sd, bracketing the write with file_start_write()/file_end_write().
 */
static int direct_splice_actor(struct pipe_inode_info *pipe,
                               struct splice_desc *sd)
{
        struct file *out = sd->u.file;
        long written;

        file_start_write(out);
        written = do_splice_from(pipe, out, sd->opos, sd->total_len, sd->flags);
        file_end_write(out);

        return written;
}

/*
 * Actor for splice_file_range(): drain @pipe into the output file stored in
 * @sd.  Unlike direct_splice_actor() it does not call file_start_write()
 * or file_end_write() itself.
 */
static int splice_file_range_actor(struct pipe_inode_info *pipe,
                                        struct splice_desc *sd)
{
        return do_splice_from(pipe, sd->u.file, sd->opos, sd->total_len,
                              sd->flags);
}

/*
 * ->splice_eof hook used for direct splices: forward the notification to
 * the output file's own ->splice_eof() method, if it has one.
 */
static void direct_file_splice_eof(struct splice_desc *sd)
{
        struct file *out = sd->u.file;

        if (!out->f_op->splice_eof)
                return;

        out->f_op->splice_eof(out);
}

/*
 * Common body of do_splice_direct() and splice_file_range(): build the
 * splice descriptor for an @in -> @out transfer and run it through
 * splice_direct_to_actor() with the given @actor.
 *
 * Returns -EBADF if @out is not open for writing and -EINVAL if it was
 * opened with O_APPEND.  *@ppos is only updated when data was transferred.
 */
static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos,
                                      struct file *out, loff_t *opos,
                                      size_t len, unsigned int flags,
                                      splice_direct_actor *actor)
{
        struct splice_desc desc = {
                .u.file         = out,
                .opos           = opos,
                .pos            = *ppos,
                .len            = len,
                .total_len      = len,
                .flags          = flags,
                .splice_eof     = direct_file_splice_eof,
        };
        ssize_t spliced;

        if (unlikely(!(out->f_mode & FMODE_WRITE)))
                return -EBADF;
        if (unlikely(out->f_flags & O_APPEND))
                return -EINVAL;

        spliced = splice_direct_to_actor(in, &desc, actor);
        if (spliced <= 0)
                return spliced;

        *ppos = desc.pos;
        return spliced;
}
/**
 * do_splice_direct - splices data directly between two files
 * @in:                file to splice from
 * @ppos:        input file offset
 * @out:        file to splice to
 * @opos:        output file offset
 * @len:        number of bytes to splice
 * @flags:        splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
 *
 *    The work is done by do_splice_direct_actor() with
 *    direct_splice_actor() as the output stage.
 *
 * Callers already called rw_verify_area() on the entire range.
 */
ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
                         loff_t *opos, size_t len, unsigned int flags)
{
        splice_direct_actor *actor = direct_splice_actor;

        return do_splice_direct_actor(in, ppos, out, opos, len, flags, actor);
}
EXPORT_SYMBOL(do_splice_direct);

/**
 * splice_file_range - splices data between two files for copy_file_range()
 * @in:                file to splice from
 * @ppos:        input file offset
 * @out:        file to splice to
 * @opos:        output file offset
 * @len:        number of bytes to splice
 *
 * Description:
 *    For use by ->copy_file_range() methods.
 *    Like do_splice_direct(), but vfs_copy_file_range() already holds
 *    start_file_write() on @out file.  The request is capped at
 *    MAX_RW_COUNT bytes and issued with no splice flags.
 *
 * Callers already called rw_verify_area() on the entire range.
 */
ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out,
                          loff_t *opos, size_t len)
{
        size_t capped = min_t(size_t, len, MAX_RW_COUNT);

        lockdep_assert(file_write_started(out));

        return do_splice_direct_actor(in, ppos, out, opos, capped, 0,
                                      splice_file_range_actor);
}
EXPORT_SYMBOL(splice_file_range);

/*
 * Wait until @pipe is no longer full.
 *
 * Returns 0 once there is room, -EAGAIN if the pipe is full and
 * SPLICE_F_NONBLOCK is set in @flags, -ERESTARTSYS if a signal is pending,
 * and -EPIPE (after sending SIGPIPE to the current task) if the pipe has
 * no readers.
 */
static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
{
        while (likely(pipe->readers)) {
                if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                        return 0;

                if (flags & SPLICE_F_NONBLOCK)
                        return -EAGAIN;

                if (signal_pending(current))
                        return -ERESTARTSYS;

                pipe_wait_writable(pipe);
        }

        /* Nobody is left to read from the pipe. */
        send_sig(SIGPIPE, current, 0);
        return -EPIPE;
}

static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
                               struct pipe_inode_info *opipe,
                               size_t len, unsigned int flags);

/*
 * Splice up to @len bytes from @in at *@offset into @opipe.
 *
 * Takes the pipe lock, waits for free space (subject to @flags), performs
 * the read and drops the lock.  Pipe readers are woken only if data was
 * actually added.  Returns the status of the wait or of the read.
 */
ssize_t splice_file_to_pipe(struct file *in,
                            struct pipe_inode_info *opipe,
                            loff_t *offset,
                            size_t len, unsigned int flags)
{
        ssize_t ret;

        pipe_lock(opipe);
        ret = wait_for_space(opipe, flags);
        if (ret == 0)
                ret = do_splice_read(in, offset, opipe, len, flags);
        pipe_unlock(opipe);

        if (ret <= 0)
                return ret;

        wakeup_pipe_readers(opipe);
        return ret;
}

/*
 * Determine where to splice to/from.
 *
 * Dispatches on which of @in and @out is a pipe:
 *   - both:      splice_pipe_to_pipe(); no offsets are allowed
 *   - only @in:  pipe -> file via do_splice_from()
 *   - only @out: file -> pipe via splice_file_to_pipe()
 *   - neither:   -EINVAL
 *
 * @off_in / @off_out are kernel pointers; when one is NULL the file's own
 * f_pos is used and updated instead. An offset given for a pipe end yields
 * -ESPIPE. Returns the number of bytes spliced or a negative errno.
 */
ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out,
                  loff_t *off_out, size_t len, unsigned int flags)
{
        struct pipe_inode_info *ipipe;
        struct pipe_inode_info *opipe;
        loff_t offset;
        ssize_t ret;

        /* The source must be readable and the destination writable. */
        if (unlikely(!(in->f_mode & FMODE_READ) ||
                     !(out->f_mode & FMODE_WRITE)))
                return -EBADF;

        ipipe = get_pipe_info(in, true);
        opipe = get_pipe_info(out, true);

        if (ipipe && opipe) {
                /* pipe -> pipe */
                if (off_in || off_out)
                        return -ESPIPE;

                /* Splicing to self would be fun, but... */
                if (ipipe == opipe)
                        return -EINVAL;

                /* O_NONBLOCK on either end makes the splice non-blocking. */
                if ((in->f_flags | out->f_flags) & O_NONBLOCK)
                        flags |= SPLICE_F_NONBLOCK;

                ret = splice_pipe_to_pipe(ipipe, opipe, len, flags);
        } else if (ipipe) {
                /* pipe -> file */
                if (off_in)
                        return -ESPIPE;
                if (off_out) {
                        /* An explicit offset needs a pwrite-capable file. */
                        if (!(out->f_mode & FMODE_PWRITE))
                                return -EINVAL;
                        offset = *off_out;
                } else {
                        offset = out->f_pos;
                }

                if (unlikely(out->f_flags & O_APPEND))
                        return -EINVAL;

                ret = rw_verify_area(WRITE, out, &offset, len);
                if (unlikely(ret < 0))
                        return ret;

                if (in->f_flags & O_NONBLOCK)
                        flags |= SPLICE_F_NONBLOCK;

                file_start_write(out);
                ret = do_splice_from(ipipe, out, &offset, len, flags);
                file_end_write(out);

                /* Store the resulting position back where it came from. */
                if (!off_out)
                        out->f_pos = offset;
                else
                        *off_out = offset;
        } else if (opipe) {
                /* file -> pipe */
                if (off_out)
                        return -ESPIPE;
                if (off_in) {
                        /* An explicit offset needs a pread-capable file. */
                        if (!(in->f_mode & FMODE_PREAD))
                                return -EINVAL;
                        offset = *off_in;
                } else {
                        offset = in->f_pos;
                }

                ret = rw_verify_area(READ, in, &offset, len);
                if (unlikely(ret < 0))
                        return ret;

                if (out->f_flags & O_NONBLOCK)
                        flags |= SPLICE_F_NONBLOCK;

                ret = splice_file_to_pipe(in, opipe, &offset, len, flags);

                /* Store the resulting position back where it came from. */
                if (!off_in)
                        in->f_pos = offset;
                else
                        *off_in = offset;
        } else {
                /* Neither end is a pipe. */
                ret = -EINVAL;
        }

        if (ret > 0) {
                /*
                 * Generate modify out before access in:
                 * do_splice_from() may've already sent modify out,
                 * and this ensures the events get merged.
                 */
                fsnotify_modify(out);
                fsnotify_access(in);
        }

        return ret;
}

/*
 * Userspace-pointer front end for do_splice().
 *
 * Rejects an offset supplied for a pipe end with -ESPIPE, copies a
 * supplied offset in from userspace, runs do_splice() and, unless it
 * failed, copies the resulting offset back out.  A fault on either copy
 * yields -EFAULT.  A single kernel-side loff_t backs both offsets.
 */
static ssize_t __do_splice(struct file *in, loff_t __user *off_in,
                           struct file *out, loff_t __user *off_out,
                           size_t len, unsigned int flags)
{
        struct pipe_inode_info *src_pipe = get_pipe_info(in, true);
        struct pipe_inode_info *dst_pipe = get_pipe_info(out, true);
        loff_t *kern_in = NULL;
        loff_t *kern_out = NULL;
        loff_t offset;
        ssize_t ret;

        if (src_pipe) {
                if (off_in)
                        return -ESPIPE;
                pipe_clear_nowait(in);
        }

        if (dst_pipe) {
                if (off_out)
                        return -ESPIPE;
                pipe_clear_nowait(out);
        }

        if (off_out) {
                if (copy_from_user(&offset, off_out, sizeof(loff_t)))
                        return -EFAULT;
                kern_out = &offset;
        }

        if (off_in) {
                if (copy_from_user(&offset, off_in, sizeof(loff_t)))
                        return -EFAULT;
                kern_in = &offset;
        }

        ret = do_splice(in, kern_in, out, kern_out, len, flags);
        if (ret < 0)
                return ret;

        /* Report the updated offset back to userspace. */
        if (kern_out && copy_to_user(off_out, kern_out, sizeof(loff_t)))
                return -EFAULT;

        if (kern_in && copy_to_user(off_in, kern_in, sizeof(loff_t)))
                return -EFAULT;

        return ret;
}

/*
 * Add the pages described by @from to @pipe as pipe buffers that use
 * user_page_pipe_buf_ops and carry @flags as their buffer flags.
 *
 * Pages are fetched in batches of up to 16 with iov_iter_get_pages2() and
 * queued one by one with add_to_pipe(). Returns the total of the
 * add_to_pipe() results if that is non-zero, otherwise the last status
 * (0 or a negative errno).
 */
static ssize_t iter_to_pipe(struct iov_iter *from,
                            struct pipe_inode_info *pipe,
                            unsigned int flags)
{
        struct pipe_buffer buf = {
                .ops = &user_page_pipe_buf_ops,
                .flags = flags
        };
        size_t total = 0;
        ssize_t ret = 0;

        while (iov_iter_count(from)) {
                struct page *pages[16];
                ssize_t left;
                size_t start;
                int i, n;

                /*
                 * 'left' is the byte count covered by this batch and
                 * 'start' the offset into the first page.
                 * NOTE(review): presumably this also takes a reference on
                 * each page and advances @from - confirm against
                 * iov_iter_get_pages2().
                 */
                left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
                if (left <= 0) {
                        ret = left;
                        break;
                }

                /* Number of pages spanned by [start, start + left). */
                n = DIV_ROUND_UP(left + start, PAGE_SIZE);
                for (i = 0; i < n; i++) {
                        int size = min_t(int, left, PAGE_SIZE - start);

                        buf.page = pages[i];
                        buf.offset = start;
                        buf.len = size;
                        ret = add_to_pipe(pipe, &buf);
                        if (unlikely(ret < 0)) {
                                /* Give back the bytes that were not queued. */
                                iov_iter_revert(from, left);
                                // this one got dropped by add_to_pipe()
                                while (++i < n)
                                        put_page(pages[i]);
                                goto out;
                        }
                        total += ret;
                        left -= size;
                        /* Only the first page of a batch has an offset. */
                        start = 0;
                }
        }
out:
        return total ? total : ret;
}

/*
 * Splice actor that copies @sd->len bytes of @buf's page into the iov_iter
 * at @sd->u.data.  Returns the number of bytes copied, or -EFAULT if fewer
 * than @sd->len bytes could be copied.
 */
static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
                        struct splice_desc *sd)
{
        int copied = copy_page_to_iter(buf->page, buf->offset, sd->len,
                                       sd->u.data);

        if (copied != sd->len)
                return -EFAULT;

        return copied;
}

/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipes pages to the user iov.
 */
static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter,
                                unsigned int flags)
{
        struct pipe_inode_info *pipe = get_pipe_info(file, true);
        struct splice_desc sd = {
                .total_len = iov_iter_count(iter),
                .flags = flags,
                .u.data = iter
        };
        ssize_t ret = 0;

        if (!pipe)
                return -EBADF;

        pipe_clear_nowait(file);

        if (sd.total_len) {
                pipe_lock(pipe);
                ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
                pipe_unlock(pipe);
        }

        if (ret > 0)
                fsnotify_access(file);

        return ret;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 *
 * Returns -EBADF if @file is not a pipe, otherwise the status of waiting
 * for pipe space or the result of iter_to_pipe().  When data was added,
 * pipe readers are woken and a modify notification is generated.
 */
static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
                                unsigned int flags)
{
        /* SPLICE_F_GIFT is passed on as a per-buffer flag. */
        unsigned buf_flag = (flags & SPLICE_F_GIFT) ? PIPE_BUF_FLAG_GIFT : 0;
        struct pipe_inode_info *pipe = get_pipe_info(file, true);
        ssize_t ret;

        if (!pipe)
                return -EBADF;

        pipe_clear_nowait(file);

        pipe_lock(pipe);
        ret = wait_for_space(pipe, flags);
        if (ret == 0)
                ret = iter_to_pipe(iter, pipe, buf_flag);
        pipe_unlock(pipe);

        if (ret <= 0)
                return ret;

        wakeup_pipe_readers(pipe);
        fsnotify_modify(file);
        return ret;
}

/*
 * Derive the iov_iter direction for vmsplice() from the file mode:
 * a writable file means user memory is the source, a read-only file
 * means user memory is the destination.
 *
 * Returns 0 with *type set, or -EBADF. When a file is present but is
 * neither readable nor writable, the reference in @f is dropped here,
 * so the caller must not fdput() again after a failure.
 */
static int vmsplice_type(struct fd f, int *type)
{
        if (!f.file)
                return -EBADF;

        if (f.file->f_mode & FMODE_WRITE) {
                *type = ITER_SOURCE;
                return 0;
        }

        if (f.file->f_mode & FMODE_READ) {
                *type = ITER_DEST;
                return 0;
        }

        fdput(f);
        return -EBADF;
}

/*
 * vmsplice() only performs true splicing in one direction: _from_ user
 * memory _to_ a pipe. That direction is simple and needs no funky alignment
 * restrictions or nasty vm tricks; the user memory is mapped and the pages
 * are filled into the pipe. The reverse is not as easy. Two approaches are
 * conceivable:
 *
 *        - memcpy() the data internally, at which point one might as well
 *          do a regular read() on the buffer anyway.
 *        - Lots of nasty vm tricks that are neither fast nor flexible (they
 *          impose restrictions on both ends of the pipe).
 *
 * Currently we punt and implement it as a normal copy, see pipe_to_user().
 */
SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
                unsigned long, nr_segs, unsigned int, flags)
{
        struct iovec stack_iov[UIO_FASTIOV];
        struct iovec *iov = stack_iov;
        struct iov_iter iter;
        struct fd f;
        ssize_t ret;
        int type;

        if (unlikely(flags & ~SPLICE_F_ALL))
                return -EINVAL;

        f = fdget(fd);
        /* On failure vmsplice_type() has already released @f if needed. */
        ret = vmsplice_type(f, &type);
        if (ret)
                return ret;

        ret = import_iovec(type, uiov, nr_segs,
                           ARRAY_SIZE(stack_iov), &iov, &iter);
        if (ret >= 0) {
                if (!iov_iter_count(&iter))
                        ret = 0;
                else if (type == ITER_SOURCE)
                        ret = vmsplice_to_pipe(f.file, &iter, flags);
                else
                        ret = vmsplice_to_user(f.file, &iter, flags);

                kfree(iov);
        }

        fdput(f);
        return ret;
}

/*
 * splice(2) entry point: validate the arguments, resolve both descriptors
 * and hand the real work to __do_splice(). A zero @len returns 0 before
 * the flags or descriptors are looked at.
 */
SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
                int, fd_out, loff_t __user *, off_out,
                size_t, len, unsigned int, flags)
{
        struct fd in, out;
        ssize_t ret = -EBADF;

        if (unlikely(!len))
                return 0;

        if (unlikely(flags & ~SPLICE_F_ALL))
                return -EINVAL;

        in = fdget(fd_in);
        if (!in.file)
                return ret;

        out = fdget(fd_out);
        if (out.file) {
                ret = __do_splice(in.file, off_in, out.file, off_out,
                                  len, flags);
                fdput(out);
        }

        fdput(in);
        return ret;
}

/*
 * Make sure the input pipe has data to read, waiting for it when allowed.
 *
 * Returns 0 when there is data or when no writers remain, -ERESTARTSYS
 * if a signal is pending, and -EAGAIN if the pipe is empty and
 * SPLICE_F_NONBLOCK is set.
 */
static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
        int ret = 0;

        /*
         * Peek at the occupancy before taking the pipe lock. The whole
         * function is speculative anyway, so a stale answer is harmless.
         */
        if (!pipe_empty(pipe->head, pipe->tail))
                return 0;

        pipe_lock(pipe);

        for (;;) {
                if (!pipe_empty(pipe->head, pipe->tail))
                        break;

                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }

                /* Empty with no writers left: report success, not an error. */
                if (!pipe->writers)
                        break;

                if (flags & SPLICE_F_NONBLOCK) {
                        ret = -EAGAIN;
                        break;
                }

                pipe_wait_readable(pipe);
        }

        pipe_unlock(pipe);
        return ret;
}

/*
 * Make sure the output pipe has room to write, waiting for it when allowed.
 *
 * Returns 0 when there is room, -EPIPE (after raising SIGPIPE) if the
 * pipe is full with no readers, -EAGAIN if it is full and
 * SPLICE_F_NONBLOCK is set, and -ERESTARTSYS if a signal is pending.
 */
static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
        int ret = 0;

        /*
         * Peek at the occupancy before taking the pipe lock. The whole
         * function is speculative anyway, so a stale answer is harmless.
         */
        if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                return 0;

        pipe_lock(pipe);

        for (;;) {
                if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                        break;

                if (!pipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        ret = -EPIPE;
                        break;
                }

                if (flags & SPLICE_F_NONBLOCK) {
                        ret = -EAGAIN;
                        break;
                }

                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }

                pipe_wait_writable(pipe);
        }

        pipe_unlock(pipe);
        return ret;
}

/*
 * Splice contents of ipipe to opipe.
 *
 * Moves up to @len bytes from the tail of @ipipe to the head of @opipe.
 * Buffers that fit entirely within the remaining @len are moved as a whole
 * (the input slot is consumed); a buffer larger than the remaining @len is
 * split: the output slot gets a new reference covering the first @len bytes
 * and the input buffer is advanced past them.
 *
 * Returns the number of bytes transferred. If nothing was transferred the
 * return value is 0 when the input is empty with no writers, or a negative
 * errno: the result of ipipe_prep()/opipe_prep(), -EPIPE (with SIGPIPE) when
 * @opipe has no readers, -EAGAIN when no progress is possible and
 * SPLICE_F_NONBLOCK is set, or -EFAULT when pipe_buf_get() fails.
 */
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
                               struct pipe_inode_info *opipe,
                               size_t len, unsigned int flags)
{
        struct pipe_buffer *ibuf, *obuf;
        unsigned int i_head, o_head;
        unsigned int i_tail, o_tail;
        unsigned int i_mask, o_mask;
        int ret = 0;
        bool input_wakeup = false;


retry:
        /* Both prep helpers may sleep; they are called with no pipe lock held. */
        ret = ipipe_prep(ipipe, flags);
        if (ret)
                return ret;

        ret = opipe_prep(opipe, flags);
        if (ret)
                return ret;

        /*
         * Potential ABBA deadlock, work around it by ordering lock
         * grabbing by pipe info address. Otherwise two different processes
         * could deadlock (one doing tee from A -> B, the other from B -> A).
         */
        pipe_double_lock(ipipe, opipe);

        /* Local cursors; written back to the pipes as buffers are moved. */
        i_tail = ipipe->tail;
        i_mask = ipipe->ring_size - 1;
        o_head = opipe->head;
        o_mask = opipe->ring_size - 1;

        do {
                size_t o_len;

                if (!opipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        /* Keep a partial byte count if some data already moved. */
                        if (!ret)
                                ret = -EPIPE;
                        break;
                }

                i_head = ipipe->head;
                o_tail = opipe->tail;

                /* Input drained and nobody can refill it: stop (0 if nothing moved). */
                if (pipe_empty(i_head, i_tail) && !ipipe->writers)
                        break;

                /*
                 * Cannot make any progress, because either the input
                 * pipe is empty or the output pipe is full.
                 */
                if (pipe_empty(i_head, i_tail) ||
                    pipe_full(o_head, o_tail, opipe->max_usage)) {
                        /* Already processed some buffers, break */
                        if (ret)
                                break;

                        if (flags & SPLICE_F_NONBLOCK) {
                                ret = -EAGAIN;
                                break;
                        }

                        /*
                         * We raced with another reader/writer and haven't
                         * managed to process any buffers.  A zero return
                         * value means EOF, so retry instead.
                         */
                        pipe_unlock(ipipe);
                        pipe_unlock(opipe);
                        goto retry;
                }

                ibuf = &ipipe->bufs[i_tail & i_mask];
                obuf = &opipe->bufs[o_head & o_mask];

                if (len >= ibuf->len) {
                        /*
                         * Simply move the whole buffer from ipipe to opipe
                         */
                        *obuf = *ibuf;
                        /* The input slot no longer owns the buffer. */
                        ibuf->ops = NULL;
                        i_tail++;
                        ipipe->tail = i_tail;
                        /* A slot was freed in ipipe; its writers get woken below. */
                        input_wakeup = true;
                        o_len = obuf->len;
                        o_head++;
                        opipe->head = o_head;
                } else {
                        /*
                         * Get a reference to this pipe buffer,
                         * so we can copy the contents over.
                         */
                        if (!pipe_buf_get(ipipe, ibuf)) {
                                if (ret == 0)
                                        ret = -EFAULT;
                                break;
                        }
                        *obuf = *ibuf;

                        /*
                         * Don't inherit the gift and merge flags, we need to
                         * prevent multiple steals of this page.
                         */
                        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
                        obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

                        /* Output gets the first @len bytes; input keeps the rest. */
                        obuf->len = len;
                        ibuf->offset += len;
                        ibuf->len -= len;
                        o_len = len;
                        o_head++;
                        opipe->head = o_head;
                }
                ret += o_len;
                len -= o_len;
        } while (len);

        pipe_unlock(ipipe);
        pipe_unlock(opipe);

        /*
         * If we put data in the output pipe, wakeup any potential readers.
         */
        if (ret > 0)
                wakeup_pipe_readers(opipe);

        if (input_wakeup)
                wakeup_pipe_writers(ipipe);

        return ret;
}

/*
 * Link contents of ipipe to opipe.
 *
 * Duplicates up to @len bytes of @ipipe's buffers into @opipe by taking an
 * extra reference on each input buffer and copying its descriptor. The
 * input pipe is left untouched: only a local tail cursor is advanced and
 * ipipe->tail is never written here.
 *
 * @flags is not consulted in this function; it never sleeps, it stops as
 * soon as the input is exhausted or the output is full.
 *
 * Returns the number of bytes linked, 0 if nothing could be linked, -EPIPE
 * (with SIGPIPE) when @opipe has no readers and nothing was linked, or
 * -EFAULT when pipe_buf_get() fails before anything was linked.
 */
static ssize_t link_pipe(struct pipe_inode_info *ipipe,
                         struct pipe_inode_info *opipe,
                         size_t len, unsigned int flags)
{
        struct pipe_buffer *ibuf, *obuf;
        unsigned int i_head, o_head;
        unsigned int i_tail, o_tail;
        unsigned int i_mask, o_mask;
        ssize_t ret = 0;

        /*
         * Potential ABBA deadlock, work around it by ordering lock
         * grabbing by pipe info address. Otherwise two different processes
         * could deadlock (one doing tee from A -> B, the other from B -> A).
         */
        pipe_double_lock(ipipe, opipe);

        i_tail = ipipe->tail;
        i_mask = ipipe->ring_size - 1;
        o_head = opipe->head;
        o_mask = opipe->ring_size - 1;

        do {
                if (!opipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        /* Keep a partial byte count if some data was already linked. */
                        if (!ret)
                                ret = -EPIPE;
                        break;
                }

                i_head = ipipe->head;
                o_tail = opipe->tail;

                /*
                 * If we have iterated all input buffers or run out of
                 * output room, break.
                 */
                if (pipe_empty(i_head, i_tail) ||
                    pipe_full(o_head, o_tail, opipe->max_usage))
                        break;

                ibuf = &ipipe->bufs[i_tail & i_mask];
                obuf = &opipe->bufs[o_head & o_mask];

                /*
                 * Get a reference to this pipe buffer,
                 * so we can copy the contents over.
                 */
                if (!pipe_buf_get(ipipe, ibuf)) {
                        if (ret == 0)
                                ret = -EFAULT;
                        break;
                }

                *obuf = *ibuf;

                /*
                 * Don't inherit the gift and merge flag, we need to prevent
                 * multiple steals of this page.
                 */
                obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
                obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

                /* Clamp the last buffer so no more than @len bytes are linked. */
                if (obuf->len > len)
                        obuf->len = len;
                ret += obuf->len;
                len -= obuf->len;

                o_head++;
                opipe->head = o_head;
                /* Local cursor only: the input pipe still holds this buffer. */
                i_tail++;
        } while (len);

        pipe_unlock(ipipe);
        pipe_unlock(opipe);

        /*
         * If we put data in the output pipe, wakeup any potential readers.
         */
        if (ret > 0)
                wakeup_pipe_readers(opipe);

        return ret;
}

/*
 * tee(1) for pipes: no data is copied, the 'in' pages are merely
 * referenced from the 'out' pipe. 'flags' takes the SPLICE_F_* values;
 * the only one that currently matters is SPLICE_F_NONBLOCK.
 *
 * Returns the number of bytes duplicated, -EBADF if 'in' is not readable
 * or 'out' is not writable, -EINVAL unless both files are distinct pipes,
 * or the error reported by ipipe_prep()/opipe_prep()/link_pipe().
 */
ssize_t do_tee(struct file *in, struct file *out, size_t len,
               unsigned int flags)
{
        struct pipe_inode_info *ipipe = get_pipe_info(in, true);
        struct pipe_inode_info *opipe = get_pipe_info(out, true);
        ssize_t ret;

        if (unlikely(!(in->f_mode & FMODE_READ) ||
                     !(out->f_mode & FMODE_WRITE)))
                return -EBADF;

        /* Both ends must be pipes, and they must not be the same pipe. */
        if (!ipipe || !opipe || ipipe == opipe)
                return -EINVAL;

        if ((in->f_flags | out->f_flags) & O_NONBLOCK)
                flags |= SPLICE_F_NONBLOCK;

        /*
         * Keep going unless an error shows up. The order in which the
         * two pipes are prepared doesn't really matter.
         */
        ret = ipipe_prep(ipipe, flags);
        if (ret == 0)
                ret = opipe_prep(opipe, flags);
        if (ret == 0)
                ret = link_pipe(ipipe, opipe, len, flags);

        if (ret > 0) {
                fsnotify_access(in);
                fsnotify_modify(out);
        }

        return ret;
}

/*
 * tee(2) entry point: validate the flags, resolve both descriptors and
 * defer to do_tee(). A zero @len returns 0 once the flags have been
 * checked.
 */
SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
        struct fd in, out;
        ssize_t ret = -EBADF;

        if (unlikely(flags & ~SPLICE_F_ALL))
                return -EINVAL;

        if (unlikely(!len))
                return 0;

        in = fdget(fdin);
        if (!in.file)
                return ret;

        out = fdget(fdout);
        if (out.file) {
                ret = do_tee(in.file, out.file, len, flags);
                fdput(out);
        }

        fdput(in);
        return ret;
}

















































































    1 

    1 








    1 







    1 
    1 
    1 


    1 









    1 
    1 












    1 
    1 




    1 

    1 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET        An implementation of the TCP/IP protocol suite for the LINUX
 *             operating system.  INET is implemented using the  BSD Socket
 *             interface as the means of communication with the user level.
 *
 *             Support for INET6 connection oriented protocols.
 *
 * Authors:    See the TCPv6 sources
 */

#include <linux/module.h>
#include <linux/in6.h>
#include <linux/ipv6.h>
#include <linux/jhash.h>
#include <linux/slab.h>

#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_ecn.h>
#include <net/inet_hashtables.h>
#include <net/ip6_route.h>
#include <net/sock.h>
#include <net/inet6_connection_sock.h>
#include <net/sock_reuseport.h>

/*
 * Fill @fl6 from the connection request @req (addresses, ports, incoming
 * interface, mark) and the listener @sk (options, uid), then look up a
 * route for it.
 *
 * Returns the dst entry, or NULL if ip6_dst_lookup_flow() reports an error.
 */
struct dst_entry *inet6_csk_route_req(const struct sock *sk,
                                      struct flowi6 *fl6,
                                      const struct request_sock *req,
                                      u8 proto)
{
        struct inet_request_sock *ireq = inet_rsk(req);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr final, *final_p;
        struct dst_entry *dst;

        memset(fl6, 0, sizeof(*fl6));
        fl6->flowi6_proto = proto;
        fl6->daddr = ireq->ir_v6_rmt_addr;

        /* np->opt is only dereferenced inside the RCU read section. */
        rcu_read_lock();
        final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
        rcu_read_unlock();

        fl6->saddr = ireq->ir_v6_loc_addr;
        fl6->flowi6_oif = ireq->ir_iif;
        fl6->flowi6_mark = ireq->ir_mark;
        fl6->fl6_dport = ireq->ir_rmt_port;
        fl6->fl6_sport = htons(ireq->ir_num);
        fl6->flowi6_uid = sk->sk_uid;
        security_req_classify_flow(req, flowi6_to_flowi_common(fl6));

        dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);

        return IS_ERR(dst) ? NULL : dst;
}
EXPORT_SYMBOL(inet6_csk_route_req);

/*
 * Fill @uaddr, viewed as a sockaddr_in6, with the peer address and port
 * of @sk. The flow info is always reported as zero.
 */
void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
{
        struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) uaddr;

        addr6->sin6_family = AF_INET6;
        addr6->sin6_port = inet_sk(sk)->inet_dport;
        addr6->sin6_addr = sk->sk_v6_daddr;

        /* We do not store received flowlabel for TCP */
        addr6->sin6_flowinfo = 0;

        /* Computed from the address stored just above. */
        addr6->sin6_scope_id = ipv6_iface_scope_id(&addr6->sin6_addr,
                                                   sk->sk_bound_dev_if);
}
EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);

/*
 * Thin wrapper around __sk_dst_check(): returns whatever it returns for
 * @sk and @cookie.
 */
static inline
struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie)
{
        return __sk_dst_check(sk, cookie);
}

/*
 * Fill @fl6 from the connected socket @sk and return a route for it.
 *
 * The dst returned by __inet6_csk_dst_check() is used when there is one;
 * otherwise a fresh lookup is done and, on success, stored on the socket
 * with ip6_dst_store().
 *
 * Returns the dst entry or an ERR_PTR() from ip6_dst_lookup_flow().
 */
static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
                                                struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct inet_sock *inet = inet_sk(sk);
        struct in6_addr final, *final_p;
        struct dst_entry *dst;

        memset(fl6, 0, sizeof(*fl6));
        fl6->flowi6_proto = sk->sk_protocol;
        fl6->daddr = sk->sk_v6_daddr;
        fl6->saddr = np->saddr;
        fl6->flowlabel = np->flow_label;
        IP6_ECN_flow_xmit(sk, fl6->flowlabel);
        fl6->flowi6_oif = sk->sk_bound_dev_if;
        fl6->flowi6_mark = sk->sk_mark;
        fl6->fl6_sport = inet->inet_sport;
        fl6->fl6_dport = inet->inet_dport;
        fl6->flowi6_uid = sk->sk_uid;
        security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));

        /* np->opt is only dereferenced inside the RCU read section. */
        rcu_read_lock();
        final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
        rcu_read_unlock();

        dst = __inet6_csk_dst_check(sk, np->dst_cookie);
        if (dst)
                return dst;

        dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
        if (!IS_ERR(dst))
                ip6_dst_store(sk, dst, NULL, NULL);

        return dst;
}

/*
 * Route and transmit @skb on the connected IPv6 socket @sk. @fl_unused
 * is not read; the flow is rebuilt from the socket.
 *
 * If no route is available the skb is freed, the positive errno is
 * recorded in sk->sk_err_soft, sk->sk_route_caps is cleared and the
 * negative errno is returned. Otherwise the result of ip6_xmit() is
 * returned.
 */
int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct dst_entry *dst;
        struct flowi6 fl6;
        int res;

        dst = inet6_csk_route_socket(sk, &fl6);
        if (IS_ERR(dst)) {
                long err = PTR_ERR(dst);

                WRITE_ONCE(sk->sk_err_soft, -err);
                sk->sk_route_caps = 0;
                kfree_skb(skb);
                return err;
        }

        /* The dst is attached without a reference, so stay under RCU. */
        rcu_read_lock();
        skb_dst_set_noref(skb, dst);

        /* Restore final destination back after routing done */
        fl6.daddr = sk->sk_v6_daddr;

        res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
                       np->tclass, READ_ONCE(sk->sk_priority));
        rcu_read_unlock();

        return res;
}
EXPORT_SYMBOL_GPL(inet6_csk_xmit);

/*
 * Apply a new path MTU to the socket's route, then route the socket
 * again and return that second result.
 *
 * Returns the dst entry from the second lookup, or NULL if either
 * lookup fails.
 */
struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu)
{
        struct dst_entry *dst;
        struct flowi6 fl6;

        dst = inet6_csk_route_socket(sk, &fl6);
        if (IS_ERR(dst))
                return NULL;

        dst->ops->update_pmtu(dst, sk, NULL, mtu, true);

        /* Look the route up again after the update. */
        dst = inet6_csk_route_socket(sk, &fl6);
        if (IS_ERR(dst))
                return NULL;

        return dst;
}
EXPORT_SYMBOL_GPL(inet6_csk_update_pmtu);




































    1 






    1 











    2 




    1 







    1 




    2 
















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 */

#include <linux/types.h>
#include <linux/ipv6.h>
#include <linux/in6.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/icmp.h>
#include <linux/rcupdate.h>
#include <linux/sysctl.h>
#include <net/ipv6_frag.h>

#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_bridge.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
#endif
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>

/*
 * Held by nf_defrag_ipv6_enable()/nf_defrag_ipv6_disable() around updates
 * to net->nf.defrag_ipv6_users and the matching hook (un)registration.
 */
static DEFINE_MUTEX(defrag6_mutex);

/*
 * Select the defrag "user" value for @skb: one of the
 * IP6_DEFRAG_CONNTRACK_* bases, offset by the conntrack zone id.
 *
 * The zone id is NF_CT_DEFAULT_ZONE_ID unless conntrack is enabled and
 * the skb already has a conntrack attached. Bridge prerouting takes
 * precedence over the hook number when choosing the base.
 */
static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
                                                struct sk_buff *skb)
{
        u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
        int base;

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        if (skb_nfct(skb)) {
                enum ip_conntrack_info ctinfo;
                const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

                zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
        }
#endif
        if (nf_bridge_in_prerouting(skb))
                base = IP6_DEFRAG_CONNTRACK_BRIDGE_IN;
        else if (hooknum == NF_INET_PRE_ROUTING)
                base = IP6_DEFRAG_CONNTRACK_IN;
        else
                base = IP6_DEFRAG_CONNTRACK_OUT;

        return base + zone_id;
}

/*
 * Netfilter hook: feed @skb to nf_ct_frag6_gather().
 *
 * With conntrack enabled, packets that already carry a non-template
 * conntrack, or that are marked untracked, are accepted untouched.
 *
 * Verdict: NF_STOLEN when the gather call returns -EINPROGRESS (the
 * fragment was queued), NF_ACCEPT when it returns 0, NF_DROP otherwise.
 */
static unsigned int ipv6_defrag(void *priv,
                                struct sk_buff *skb,
                                const struct nf_hook_state *state)
{
        int err;

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        /* Previously seen (loopback)?        */
        if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
                return NF_ACCEPT;

        if (skb->_nfct == IP_CT_UNTRACKED)
                return NF_ACCEPT;
#endif

        err = nf_ct_frag6_gather(state->net, skb,
                                 nf_ct6_defrag_user(state->hook, skb));

        /* queued */
        if (err == -EINPROGRESS)
                return NF_STOLEN;

        if (err)
                return NF_DROP;

        return NF_ACCEPT;
}

/*
 * ipv6_defrag() is attached at two IPv6 hook points, PRE_ROUTING and
 * LOCAL_OUT, both at NF_IP6_PRI_CONNTRACK_DEFRAG priority.
 */
static const struct nf_hook_ops ipv6_defrag_ops[] = {
        {
                .pf = NFPROTO_IPV6,
                .hooknum = NF_INET_PRE_ROUTING,
                .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
                .hook = ipv6_defrag,
        },
        {
                .pf = NFPROTO_IPV6,
                .hooknum = NF_INET_LOCAL_OUT,
                .priority = NF_IP6_PRI_CONNTRACK_DEFRAG,
                .hook = ipv6_defrag,
        },
};

/*
 * Per-netns exit: if the defrag hooks are still registered for @net
 * (non-zero user count), unregister them and reset the count.
 */
static void __net_exit defrag6_net_exit(struct net *net)
{
        if (!net->nf.defrag_ipv6_users)
                return;

        nf_unregister_net_hooks(net, ipv6_defrag_ops,
                                ARRAY_SIZE(ipv6_defrag_ops));
        net->nf.defrag_ipv6_users = 0;
}

/*
 * Enable/disable callbacks of this module; published through
 * nf_defrag_v6_hook by nf_defrag_init() and withdrawn by nf_defrag_fini().
 */
static const struct nf_defrag_hook defrag_hook = {
        .owner = THIS_MODULE,
        .enable = nf_defrag_ipv6_enable,
        .disable = nf_defrag_ipv6_disable,
};

/* Per-netns operations: only an exit handler is provided. */
static struct pernet_operations defrag6_net_ops = {
        .exit = defrag6_net_exit,
};

/*
 * Module init: set up the frag6 engine, register the per-netns
 * operations, then publish defrag_hook through nf_defrag_v6_hook.
 *
 * Returns 0 on success or the negative error of the step that failed;
 * the frag6 engine is torn down again if the pernet registration fails.
 */
static int __init nf_defrag_init(void)
{
        int ret;

        ret = nf_ct_frag6_init();
        if (ret < 0) {
                pr_err("nf_defrag_ipv6: can't initialize frag6.\n");
                return ret;
        }

        ret = register_pernet_subsys(&defrag6_net_ops);
        if (ret < 0) {
                pr_err("nf_defrag_ipv6: can't register pernet ops\n");
                nf_ct_frag6_cleanup();
                return ret;
        }

        rcu_assign_pointer(nf_defrag_v6_hook, &defrag_hook);

        return ret;
}

/*
 * Module exit: undo nf_defrag_init() in reverse order: withdraw the
 * published hook pointer, unregister the per-netns operations, then tear
 * down the frag6 engine.
 */
static void __exit nf_defrag_fini(void)
{
        rcu_assign_pointer(nf_defrag_v6_hook, NULL);
        unregister_pernet_subsys(&defrag6_net_ops);
        nf_ct_frag6_cleanup();
}

/*
 * Take one reference on IPv6 defragmentation for @net. The hooks are
 * registered when the first reference is taken; later callers only bump
 * the counter.
 *
 * Returns 0 on success, -EOVERFLOW if the counter is already UINT_MAX,
 * or the error from nf_register_net_hooks().
 */
int nf_defrag_ipv6_enable(struct net *net)
{
        int err = 0;

        mutex_lock(&defrag6_mutex);

        if (net->nf.defrag_ipv6_users == UINT_MAX) {
                err = -EOVERFLOW;
        } else if (net->nf.defrag_ipv6_users) {
                net->nf.defrag_ipv6_users++;
        } else {
                err = nf_register_net_hooks(net, ipv6_defrag_ops,
                                            ARRAY_SIZE(ipv6_defrag_ops));
                if (err == 0)
                        net->nf.defrag_ipv6_users = 1;
        }

        mutex_unlock(&defrag6_mutex);
        return err;
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable);

/*
 * Drop one reference on IPv6 defragmentation for @net. The hooks are
 * unregistered when the last reference goes away; calling this with a
 * zero count does nothing.
 */
void nf_defrag_ipv6_disable(struct net *net)
{
        mutex_lock(&defrag6_mutex);

        if (net->nf.defrag_ipv6_users &&
            --net->nf.defrag_ipv6_users == 0)
                nf_unregister_net_hooks(net, ipv6_defrag_ops,
                                        ARRAY_SIZE(ipv6_defrag_ops));

        mutex_unlock(&defrag6_mutex);
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv6_disable);

/* Module entry/exit points and metadata. */
module_init(nf_defrag_init);
module_exit(nf_defrag_fini);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("IPv6 defragmentation support");


















































































































































































































































































































































































































    3 




































    1 














    3 


















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * IPv6 fragment reassembly for connection tracking
 *
 * Copyright (C)2004 USAGI/WIDE Project
 *
 * Author:
 *        Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
 *
 * Based on: net/ipv6/reassembly.c
 */

#define pr_fmt(fmt) "IPv6-nf: " fmt

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/ipv6.h>
#include <linux/slab.h>

#include <net/ipv6_frag.h>

#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
#include <linux/sysctl.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/netns/generic.h>

/* NOTE(review): presumably the slab cache name for nf_frags; confirm at its use site. */
static const char nf_frags_cache_name[] = "nf-frags";

/* Id passed to net_generic() to find this module's per-netns data. */
static unsigned int nf_frag_pernet_id __read_mostly;
static struct inet_frags nf_frags;

/* Return this module's per-netns state for @net, looked up via net_generic(). */
static struct nft_ct_frag6_pernet *nf_frag_pernet(struct net *net)
{
        return net_generic(net, nf_frag_pernet_id);
}

#ifdef CONFIG_SYSCTL

/*
 * Sysctl template for the fragment-queue tunables. No .data/.extra1/.extra2
 * pointers are set here: nf_ct_frag6_sysctl_register() fills them in per
 * netns, addressing the entries by index. Keep the entry order in sync
 * with that function.
 */
static struct ctl_table nf_ct_frag6_sysctl_table[] = {
        /* [0] timeout, handled by proc_dointvec_jiffies */
        {
                .procname        = "nf_conntrack_frag6_timeout",
                .maxlen                = sizeof(unsigned int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_jiffies,
        },
        /* [1] low threshold; the register function bounds it above by high_thresh */
        {
                .procname        = "nf_conntrack_frag6_low_thresh",
                .maxlen                = sizeof(unsigned long),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
        },
        /* [2] high threshold; the register function bounds it below by low_thresh */
        {
                .procname        = "nf_conntrack_frag6_high_thresh",
                .maxlen                = sizeof(unsigned long),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
        },
};

/*
 * Register the fragment-queue sysctls for @net under "net/netfilter".
 *
 * init_net uses the static template directly; every other netns gets its
 * own kmemdup()'d copy. The table entries are then pointed at this
 * netns' fqdir fields.
 *
 * Returns 0 on success, -ENOMEM if the copy or the registration fails.
 */
static int nf_ct_frag6_sysctl_register(struct net *net)
{
        struct ctl_table *table = nf_ct_frag6_sysctl_table;
        struct nft_ct_frag6_pernet *nf_frag;
        struct ctl_table_header *hdr;

        if (!net_eq(net, &init_net)) {
                table = kmemdup(table, sizeof(nf_ct_frag6_sysctl_table),
                                GFP_KERNEL);
                if (!table)
                        return -ENOMEM;
        }

        nf_frag = nf_frag_pernet(net);

        /* Indices follow the entry order of nf_ct_frag6_sysctl_table. */
        table[0].data = &nf_frag->fqdir->timeout;
        table[1].data = &nf_frag->fqdir->low_thresh;
        table[1].extra2 = &nf_frag->fqdir->high_thresh;
        table[2].data = &nf_frag->fqdir->high_thresh;
        table[2].extra1 = &nf_frag->fqdir->low_thresh;

        hdr = register_net_sysctl_sz(net, "net/netfilter", table,
                                     ARRAY_SIZE(nf_ct_frag6_sysctl_table));
        if (!hdr) {
                /* Only the duplicated table is ours to free. */
                if (!net_eq(net, &init_net))
                        kfree(table);
                return -ENOMEM;
        }

        nf_frag->nf_frag_frags_hdr = hdr;
        return 0;
}

/*
 * Undo nf_ct_frag6_sysctl_register() for @net: unregister the sysctl header
 * and, for any namespace other than init_net, free the duplicated table.
 */
static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
{
        struct ctl_table_header *header = nf_frag_pernet(net)->nf_frag_frags_hdr;
        /* Fetch the table pointer before the header is unregistered. */
        const struct ctl_table *tbl = header->ctl_table_arg;

        unregister_net_sysctl_table(header);

        if (net_eq(net, &init_net))
                return;

        kfree(tbl);
}

#else
/*
 * No-op variant used in the #else configuration (the governing #if is above
 * this view; presumably sysctl support is compiled out): always succeeds.
 */
static int nf_ct_frag6_sysctl_register(struct net *net)
{
        return 0;
}
/* No-op counterpart of the stub register function: nothing to tear down. */
static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
{
}
#endif

/* Forward declaration: nf_ct_frag6_queue() calls this before its definition. */
static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
                             struct sk_buff *prev_tail, struct net_device *dev);

/*
 * Map the ECN bits of @ipv6h's DS field to a one-hot value: bit N is set
 * when the ECN codepoint is N.  Callers OR these together per queue.
 */
static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
{
        const u8 ecn_bits = ipv6_get_dsfield(ipv6h) & INET_ECN_MASK;

        return 1 << ecn_bits;
}

static void nf_ct_frag6_expire(struct timer_list *t)
{
        struct inet_frag_queue *frag = from_timer(frag, t, timer);
        struct frag_queue *fq;

        fq = container_of(frag, struct frag_queue, q);

        ip6frag_expire_frag_queue(fq->q.fqdir->net, fq);
}

/* Creation primitives. */
/*
 * Look up the reassembly queue for a fragment in @net's fqdir.
 *
 * The lookup key is built from the fragment identification, the source and
 * destination addresses taken from @hdr, the conntrack @user and @iif.
 *
 * Returns the matching frag_queue, or NULL when inet_frag_find() yields
 * nothing (the caller logs this as "can't find and can't create").
 */
static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
                                  const struct ipv6hdr *hdr, int iif)
{
        struct frag_v6_compare_key lookup = {
                .id = id,
                .saddr = hdr->saddr,
                .daddr = hdr->daddr,
                .user = user,
                .iif = iif,
        };
        struct inet_frag_queue *found;

        found = inet_frag_find(nf_frag_pernet(net)->fqdir, &lookup);

        return found ? container_of(found, struct frag_queue, q) : NULL;
}


/*
 * Validate one fragment and insert it into reassembly queue @fq.
 *
 * @fq:    queue the fragment belongs to (nf_ct_frag6_gather() calls this
 *         with fq->q.lock held)
 * @skb:   the fragment
 * @fhdr:  fragment header inside @skb
 * @nhoff: offset of the "Next Header" byte preceding the fragment header;
 *         recorded in fq->nhoffset when the offset-0 fragment arrives
 *
 * Returns:
 *   0            - this fragment completed the datagram and reassembly
 *                  succeeded
 *   -EINPROGRESS - the fragment was consumed: queued, dropped as a
 *                  duplicate, or reassembly failed after the queue had
 *                  taken ownership of the skb
 *   -EPROTO      - non-final fragment whose end is not a multiple of 8;
 *                  the queue is killed
 *   -EINVAL      - any other rejected fragment
 */
static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
                             const struct frag_hdr *fhdr, int nhoff)
{
        unsigned int payload_len;
        struct net_device *dev;
        struct sk_buff *prev;
        int offset, end, err;
        u8 ecn;

        if (fq->q.flags & INET_FRAG_COMPLETE) {
                pr_debug("Already completed\n");
                goto err;
        }

        payload_len = ntohs(ipv6_hdr(skb)->payload_len);

        /* Mask off the low three bits of frag_off to get the byte offset. */
        offset = ntohs(fhdr->frag_off) & ~0x7;
        /* end = offset + payload_len minus everything between the end of the
         * fixed IPv6 header and the end of the fragment header.
         */
        end = offset + (payload_len -
                        ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));

        /* The unsigned compare also rejects a negative 'end'. */
        if ((unsigned int)end > IPV6_MAXPLEN) {
                pr_debug("offset is too large.\n");
                return -EINVAL;
        }

        ecn = ip6_frag_ecn(ipv6_hdr(skb));

        /* Remove the headers up to and including the fragment header from a
         * complete checksum, since they are pulled off below.
         */
        if (skb->ip_summed == CHECKSUM_COMPLETE) {
                const unsigned char *nh = skb_network_header(skb);
                skb->csum = csum_sub(skb->csum,
                                     csum_partial(nh, (u8 *)(fhdr + 1) - nh,
                                                  0));
        }

        /* Is this the final fragment? */
        if (!(fhdr->frag_off & htons(IP6_MF))) {
                /* If we already have some bits beyond end
                 * or have different end, the segment is corrupted.
                 */
                if (end < fq->q.len ||
                    ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) {
                        pr_debug("already received last fragment\n");
                        goto err;
                }
                fq->q.flags |= INET_FRAG_LAST_IN;
                fq->q.len = end;
        } else {
                /* Check if the fragment is rounded to 8 bytes.
                 * Required by the RFC.
                 */
                if (end & 0x7) {
                        /* RFC2460 says always send parameter problem in
                         * this case. -DaveM
                         */
                        pr_debug("end of fragment not rounded to 8 bytes.\n");
                        inet_frag_kill(&fq->q);
                        return -EPROTO;
                }
                if (end > fq->q.len) {
                        /* Some bits beyond end -> corruption. */
                        if (fq->q.flags & INET_FRAG_LAST_IN) {
                                pr_debug("last packet already reached.\n");
                                goto err;
                        }
                        fq->q.len = end;
                }
        }

        /* A fragment that carries no data is rejected. */
        if (end == offset)
                goto err;

        /* Point into the IP datagram 'data' part. */
        if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) {
                pr_debug("queue: message is too short.\n");
                goto err;
        }
        if (pskb_trim_rcsum(skb, end - offset)) {
                pr_debug("Can't trim\n");
                goto err;
        }

        /* Note : skb->rbnode and skb->dev share the same location. */
        dev = skb->dev;
        /* Makes sure compiler wont do silly aliasing games */
        barrier();

        /* Remember the tail before insertion; reassembly needs it. */
        prev = fq->q.fragments_tail;
        err = inet_frag_queue_insert(&fq->q, skb, offset, end);
        if (err) {
                if (err == IPFRAG_DUP) {
                        /* No error for duplicates, pretend they got queued. */
                        kfree_skb_reason(skb, SKB_DROP_REASON_DUP_FRAG);
                        return -EINPROGRESS;
                }
                goto insert_error;
        }

        if (dev)
                fq->iif = dev->ifindex;

        fq->q.stamp = skb->tstamp;
        fq->q.tstamp_type = skb->tstamp_type;
        fq->q.meat += skb->len;
        fq->ecn |= ecn;
        if (payload_len > fq->q.max_size)
                fq->q.max_size = payload_len;
        add_frag_mem_limit(fq->q.fqdir, skb->truesize);

        /* The first fragment.
         * nhoffset is obtained from the first fragment, of course.
         */
        if (offset == 0) {
                fq->nhoffset = nhoff;
                fq->q.flags |= INET_FRAG_FIRST_IN;
        }

        /* First and last fragments seen and every byte accounted for. */
        if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
            fq->q.meat == fq->q.len) {
                unsigned long orefdst = skb->_skb_refdst;

                /* Hide the dst reference while reassembling, then restore it. */
                skb->_skb_refdst = 0UL;
                err = nf_ct_frag6_reasm(fq, skb, prev, dev);
                skb->_skb_refdst = orefdst;

                /* After queue has assumed skb ownership, only 0 or
                 * -EINPROGRESS must be returned.
                 */
                return err ? -EINPROGRESS : 0;
        }

        skb_dst_drop(skb);
        skb_orphan(skb);
        return -EINPROGRESS;

insert_error:
        inet_frag_kill(&fq->q);
err:
        skb_dst_drop(skb);
        return -EINVAL;
}

/*
 *        Check if this packet is complete.
 *
 *        It is called with locked fq, and caller must check that
 *        queue is eligible for reassembly i.e. it is not COMPLETE,
 *        the last and the first frames arrived and all the bits are here.
 *
 *        @fq:        queue to reassemble; it is killed on entry
 *        @skb:       the fragment that completed the queue
 *        @prev_tail: fq->q.fragments_tail as it was before @skb was inserted
 *        @dev:       device saved by the caller, restored into skb->dev
 *
 *        Returns 0 on success.  Returns -EINVAL when the accumulated ECN
 *        state maps to 0xff in ip_frag_ecn_table, when
 *        inet_frag_reasm_prepare() fails, or when the reassembled payload
 *        would exceed IPV6_MAXPLEN.
 */
static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
                             struct sk_buff *prev_tail, struct net_device *dev)
{
        void *reasm_data;
        int payload_len;
        u8 ecn;

        inet_frag_kill(&fq->q);

        ecn = ip_frag_ecn_table[fq->ecn];
        if (unlikely(ecn == 0xff))
                goto err;

        reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail);
        if (!reasm_data)
                goto err;

        /* Payload length of the reassembled packet: what lies between the
         * network header and skb->data, minus the fixed IPv6 header and the
         * fragment header, plus the total fragment payload.
         */
        payload_len = -skb_network_offset(skb) -
                       sizeof(struct ipv6hdr) + fq->q.len -
                       sizeof(struct frag_hdr);
        if (payload_len > IPV6_MAXPLEN) {
                net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n",
                                    payload_len);
                goto err;
        }

        /* We have to remove fragment header from datagram and to relocate
         * header in order to calculate ICV correctly. */
        /* Overwrite the recorded Next Header byte with the first byte of the
         * fragment header, then slide everything in front of skb->data
         * forward by sizeof(struct frag_hdr).
         */
        skb_network_header(skb)[fq->nhoffset] = skb_transport_header(skb)[0];
        memmove(skb->head + sizeof(struct frag_hdr), skb->head,
                (skb->data - skb->head) - sizeof(struct frag_hdr));
        skb->mac_header += sizeof(struct frag_hdr);
        skb->network_header += sizeof(struct frag_hdr);

        skb_reset_transport_header(skb);

        inet_frag_reasm_finish(&fq->q, skb, reasm_data, false);

        skb->ignore_df = 1;
        skb->dev = dev;
        ipv6_hdr(skb)->payload_len = htons(payload_len);
        ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn);
        IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
        IP6CB(skb)->flags |= IP6SKB_FRAGMENTED;

        /* Yes, and fold redundant checksum back. 8) */
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->csum = csum_partial(skb_network_header(skb),
                                         skb_network_header_len(skb),
                                         skb->csum);

        /* The queue no longer references any fragments. */
        fq->q.rb_fragments = RB_ROOT;
        fq->q.fragments_tail = NULL;
        fq->q.last_run_head = NULL;

        return 0;

err:
        inet_frag_kill(&fq->q);
        return -EINVAL;
}

/*
 * find the header just before Fragment Header.
 *
 * Walks the extension header chain that follows the fixed IPv6 header until
 * a Fragment Header is reached.
 *
 * if success return 0 and set ...
 * (*prevhdrp): the value of "Next Header Field" in the header
 *                just before Fragment Header.
 * (*prevhoff): the offset of "Next Header Field" in the header
 *                just before Fragment Header.
 * (*fhoff)   : the offset of Fragment Header.
 *
 * Returns -1 (outputs untouched) when the chain hits a non-extension
 * header or NEXTHDR_NONE first, or when the packet is too short.
 *
 * Based on ipv6_skip_hdr() in net/ipv6/exthdr.c
 *
 */
static int
find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
{
        u8 nexthdr = ipv6_hdr(skb)->nexthdr;
        const int netoff = skb_network_offset(skb);
        /* Must be as wide as 'start': extension headers may span more than
         * 255 bytes, and a u8 here would silently truncate the offset that
         * reassembly later uses to patch the Next Header byte.
         */
        int prev_nhoff = netoff + offsetof(struct ipv6hdr, nexthdr);
        int start = netoff + sizeof(struct ipv6hdr);
        int len = skb->len - start;
        u8 prevhdr = NEXTHDR_IPV6;

        while (nexthdr != NEXTHDR_FRAGMENT) {
                struct ipv6_opt_hdr hdr;
                int hdrlen;

                if (!ipv6_ext_hdr(nexthdr)) {
                        return -1;
                }
                if (nexthdr == NEXTHDR_NONE) {
                        pr_debug("next header is none\n");
                        return -1;
                }
                if (len < (int)sizeof(struct ipv6_opt_hdr)) {
                        pr_debug("too short\n");
                        return -1;
                }
                /* The length check above guarantees these bytes exist. */
                if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
                        BUG();
                if (nexthdr == NEXTHDR_AUTH)
                        hdrlen = ipv6_authlen(&hdr);
                else
                        hdrlen = ipv6_optlen(&hdr);

                /* The header just parsed becomes the "previous" one; it
                 * begins at 'start', which is what gets recorded.
                 */
                prevhdr = nexthdr;
                prev_nhoff = start;

                nexthdr = hdr.nexthdr;
                len -= hdrlen;
                start += hdrlen;
        }

        /* The last header claimed more bytes than the packet holds. */
        if (len < 0)
                return -1;

        *prevhdrp = prevhdr;
        *prevhoff = prev_nhoff;
        *fhoff = start;

        return 0;
}

/*
 * Feed one IPv6 packet into conntrack fragment reassembly.
 *
 * @net:  network namespace whose fragment queues are used
 * @skb:  packet to inspect
 * @user: conntrack user id, part of the queue lookup key
 *
 * Returns:
 *   0            - nothing was queued: zero payload length, no fragment
 *                  header found, truncated first fragment, or the queue
 *                  step returned -EPROTO (the transport header offset is
 *                  restored in that case); also returned when this
 *                  fragment completed reassembly
 *   -EINPROGRESS - fragment consumed, datagram not complete yet
 *   -ENOMEM      - fragment header could not be pulled, or no queue
 *   -EINVAL      - fragment rejected by nf_ct_frag6_queue()
 */
int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
{
        const u16 saved_thoff = skb->transport_header;
        u8 thdr_proto = NEXTHDR_FRAGMENT;
        int frag_off, prev_nh_off, rc;
        struct frag_queue *queue;
        struct frag_hdr *frag;
        struct ipv6hdr *ip6h;
        u8 prev_proto;
        int iif;

        /* Jumbo payload inhibits frag. header */
        if (!ipv6_hdr(skb)->payload_len) {
                pr_debug("payload len = 0\n");
                return 0;
        }

        if (find_prev_fhdr(skb, &prev_proto, &prev_nh_off, &frag_off) < 0)
                return 0;

        /* Discard the first fragment if it does not include all headers
         * RFC 8200, Section 4.5
         */
        if (ipv6frag_thdr_truncated(skb, frag_off, &thdr_proto)) {
                pr_debug("Drop incomplete fragment\n");
                return 0;
        }

        if (!pskb_may_pull(skb, frag_off + sizeof(*frag)))
                return -ENOMEM;

        /* Header pointers are taken only after the pull above. */
        skb_set_transport_header(skb, frag_off);
        ip6h = ipv6_hdr(skb);
        frag = (struct frag_hdr *)skb_transport_header(skb);

        iif = skb->dev ? skb->dev->ifindex : 0;
        queue = fq_find(net, frag->identification, user, ip6h, iif);
        if (!queue) {
                pr_debug("Can't find and can't create new queue\n");
                return -ENOMEM;
        }

        spin_lock_bh(&queue->q.lock);

        rc = nf_ct_frag6_queue(queue, skb, frag, prev_nh_off);
        if (rc == -EPROTO) {
                /* Put the transport header back and report "not handled". */
                skb->transport_header = saved_thoff;
                rc = 0;
        }

        spin_unlock_bh(&queue->q.lock);
        inet_frag_put(&queue->q);
        return rc;
}
EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);

static int nf_ct_net_init(struct net *net)
{
        struct nft_ct_frag6_pernet *nf_frag  = nf_frag_pernet(net);
        int res;

        res = fqdir_init(&nf_frag->fqdir, &nf_frags, net);
        if (res < 0)
                return res;

        nf_frag->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
        nf_frag->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
        nf_frag->fqdir->timeout = IPV6_FRAG_TIMEOUT;

        res = nf_ct_frag6_sysctl_register(net);
        if (res < 0)
                fqdir_exit(nf_frag->fqdir);
        return res;
}

static void nf_ct_net_pre_exit(struct net *net)
{
        struct nft_ct_frag6_pernet *nf_frag  = nf_frag_pernet(net);

        fqdir_pre_exit(nf_frag->fqdir);
}

static void nf_ct_net_exit(struct net *net)
{
        struct nft_ct_frag6_pernet *nf_frag  = nf_frag_pernet(net);

        nf_ct_frags6_sysctl_unregister(net);
        fqdir_exit(nf_frag->fqdir);
}

/*
 * Per-network-namespace hooks, registered by nf_ct_frag6_init() and removed
 * by nf_ct_frag6_cleanup().  .id/.size describe the per-netns private area
 * (struct nft_ct_frag6_pernet) - presumably what nf_frag_pernet() returns;
 * confirm against its definition.
 */
static struct pernet_operations nf_ct_net_ops = {
        .init                = nf_ct_net_init,
        .pre_exit        = nf_ct_net_pre_exit,
        .exit                = nf_ct_net_exit,
        .id                = &nf_frag_pernet_id,
        .size                = sizeof(struct nft_ct_frag6_pernet),
};

/*
 * Hash table parameters for the fragment queues: entries are linked through
 * inet_frag_queue.node and hashed/compared with the shared ip6frag helpers.
 * Copied into nf_frags.rhash_params by nf_ct_frag6_init().
 */
static const struct rhashtable_params nfct_rhash_params = {
        .head_offset                = offsetof(struct inet_frag_queue, node),
        .hashfn                        = ip6frag_key_hashfn,
        .obj_hashfn                = ip6frag_obj_hashfn,
        .obj_cmpfn                = ip6frag_obj_cmpfn,
        .automatic_shrinking        = true,
};

/*
 * Set up the nf_frags descriptor, initialise it and register the
 * per-namespace operations.  If the pernet registration fails the frags
 * descriptor is torn down again.  Returns 0 or the error from the failing
 * step.
 */
int nf_ct_frag6_init(void)
{
        int err;

        nf_frags.constructor = ip6frag_init;
        nf_frags.destructor = NULL;
        nf_frags.qsize = sizeof(struct frag_queue);
        nf_frags.frag_expire = nf_ct_frag6_expire;
        nf_frags.frags_cache_name = nf_frags_cache_name;
        nf_frags.rhash_params = nfct_rhash_params;

        err = inet_frags_init(&nf_frags);
        if (err)
                return err;

        err = register_pernet_subsys(&nf_ct_net_ops);
        if (err)
                inet_frags_fini(&nf_frags);

        return err;
}

/*
 * Reverse of nf_ct_frag6_init(): unregister the per-namespace operations,
 * then tear down the nf_frags descriptor (opposite order of setup).
 */
void nf_ct_frag6_cleanup(void)
{
        unregister_pernet_subsys(&nf_ct_net_ops);
        inet_frags_fini(&nf_frags);
}








































































































































































































































































































































































































































































































































































































































































    2 



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef MM_SLAB_H
#define MM_SLAB_H

#include <linux/reciprocal_div.h>
#include <linux/list_lru.h>
#include <linux/local_lock.h>
#include <linux/random.h>
#include <linux/kobject.h>
#include <linux/sched/mm.h>
#include <linux/memcontrol.h>
#include <linux/kfence.h>
#include <linux/kasan.h>

/*
 * Internal slab definitions
 */

/*
 * Pick the double-width cmpxchg flavour used for the freelist/counter pair:
 * 128-bit on CONFIG_64BIT, 64-bit otherwise.  system_has_freelist_aba() and
 * try_cmpxchg_freelist are only defined when the matching
 * system_has_cmpxchg{128,64} macro exists; this_cpu_try_cmpxchg_freelist and
 * freelist_full_t are defined unconditionally for the chosen width.
 */
#ifdef CONFIG_64BIT
# ifdef system_has_cmpxchg128
# define system_has_freelist_aba()        system_has_cmpxchg128()
# define try_cmpxchg_freelist                try_cmpxchg128
# endif
#define this_cpu_try_cmpxchg_freelist        this_cpu_try_cmpxchg128
typedef u128 freelist_full_t;
#else /* CONFIG_64BIT */
# ifdef system_has_cmpxchg64
# define system_has_freelist_aba()        system_has_cmpxchg64()
# define try_cmpxchg_freelist                try_cmpxchg64
# endif
#define this_cpu_try_cmpxchg_freelist        this_cpu_try_cmpxchg64
typedef u64 freelist_full_t;
#endif /* CONFIG_64BIT */

/* Withdraw the capability when CONFIG_HAVE_ALIGNED_STRUCT_PAGE is not set. */
#if defined(system_has_freelist_aba) && !defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
#undef system_has_freelist_aba
#endif

/*
 * Freelist pointer and counter to cmpxchg together, avoids the typical ABA
 * problems with cmpxchg of just a pointer.
 *
 * The anonymous struct gives access to the two halves; @full views the same
 * storage as the single double-width value of type freelist_full_t.
 */
typedef union {
        struct {
                void *freelist;
                unsigned long counter;
        };
        freelist_full_t full;
} freelist_aba_t;

/*
 * Reuses the bits in struct page
 *
 * Field placement is checked against struct page by the SLAB_MATCH
 * static_asserts that follow this definition, so members must not be
 * reordered or resized without updating those checks.
 */
struct slab {
        unsigned long __page_flags;        /* overlays page->flags */

        struct kmem_cache *slab_cache;        /* overlays page->compound_head */
        union {
                struct {
                        union {
                                struct list_head slab_list;
#ifdef CONFIG_SLUB_CPU_PARTIAL
                                struct {
                                        struct slab *next;
                                        int slabs;        /* Nr of slabs left */
                                };
#endif
                        };
                        /* Double-word boundary */
                        union {
                                struct {
                                        void *freelist;                /* first free object */
                                        union {
                                                /* all three bitfields below, viewed as one word */
                                                unsigned long counters;
                                                struct {
                                                        unsigned inuse:16;
                                                        unsigned objects:15;
                                                        unsigned frozen:1;
                                                };
                                        };
                                };
#ifdef system_has_freelist_aba
                                /* freelist + counters as one cmpxchg-able unit */
                                freelist_aba_t freelist_counter;
#endif
                        };
                };
                /* shares storage with the list/freelist state above */
                struct rcu_head rcu_head;
        };

        unsigned int __page_type;
        atomic_t __page_refcount;        /* overlays page->_refcount */
#ifdef CONFIG_SLAB_OBJ_EXT
        unsigned long obj_exts;                /* overlays page->memcg_data */
#endif
};

/*
 * Compile-time layout checks: each listed struct slab field must sit at the
 * same offset as its struct page counterpart, struct slab must not be larger
 * than struct page, and - when the double-width cmpxchg is available - the
 * freelist member must be aligned to sizeof(freelist_aba_t).
 */
#define SLAB_MATCH(pg, sl)                                                \
        static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
SLAB_MATCH(flags, __page_flags);
SLAB_MATCH(compound_head, slab_cache);        /* Ensure bit 0 is clear */
SLAB_MATCH(_refcount, __page_refcount);
#ifdef CONFIG_SLAB_OBJ_EXT
SLAB_MATCH(memcg_data, obj_exts);
#endif
#undef SLAB_MATCH
static_assert(sizeof(struct slab) <= sizeof(struct page));
#if defined(system_has_freelist_aba)
static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)));
#endif

/**
 * folio_slab - Converts from folio to slab.
 * @folio: The folio.
 *
 * Currently struct slab is a different representation of a folio where
 * folio_test_slab() is true.
 *
 * This is a pure pointer cast selected with _Generic, so the constness of
 * @folio is carried over to the result and any other argument type fails
 * to compile.
 *
 * Return: The slab which contains this folio.
 */
#define folio_slab(folio)        (_Generic((folio),                        \
        const struct folio *:        (const struct slab *)(folio),                \
        struct folio *:                (struct slab *)(folio)))

/**
 * slab_folio - The folio allocated for a slab
 * @s: The slab.
 *
 * Slabs are allocated as folios that contain the individual objects and are
 * using some fields in the first struct page of the folio - those fields are
 * now accessed by struct slab. It is occasionally necessary to convert back to
 * a folio in order to communicate with the rest of the mm.  Please use this
 * helper function instead of casting yourself, as the implementation may change
 * in the future.
 *
 * This is a pure pointer cast selected with _Generic: the constness of @s is
 * carried over to the result. @s is parenthesized in the expansion so that
 * expression arguments such as "slab + 1" are cast as a whole.
 */
#define slab_folio(s)                (_Generic((s),                                \
        const struct slab *:        (const struct folio *)(s),                \
        struct slab *:                (struct folio *)(s)))

/**
 * page_slab - Converts from first struct page to slab.
 * @p: The first (either head of compound or single) page of slab.
 *
 * A temporary wrapper to convert struct page to struct slab in situations where
 * we know the page is the compound head, or single order-0 page.
 *
 * Long-term ideally everything would work with struct slab directly or go
 * through folio to struct slab.
 *
 * No check is performed here: this is a plain pointer cast selected with
 * _Generic, and the constness of @p is carried over to the result.
 *
 * Return: The slab which contains this page
 */
#define page_slab(p)                (_Generic((p),                                \
        const struct page *:        (const struct slab *)(p),                \
        struct page *:                (struct slab *)(p)))

/**
 * slab_page - The first struct page allocated for a slab
 * @s: The slab.
 *
 * A convenience wrapper for converting slab to the first struct page of the
 * underlying folio, to communicate with code not yet converted to folio or
 * struct slab.
 *
 * Expands to folio_page() with index 0 on the result of slab_folio().
 */
#define slab_page(s) folio_page(slab_folio(s), 0)

/*
 * If network-based swap is enabled, sl*b must keep track of whether pages
 * were allocated from pfmemalloc reserves.
 */
/*
 * Report whether @slab carries the pfmemalloc mark, which is kept in the
 * "active" flag of the backing folio.
 */
static inline bool slab_test_pfmemalloc(const struct slab *slab)
{
        const struct folio *folio = slab_folio(slab);

        /* Drop const: folio_test_active() is called with a plain folio here. */
        return folio_test_active((struct folio *)folio);
}

/* Mark @slab as pfmemalloc by setting the backing folio's "active" flag. */
static inline void slab_set_pfmemalloc(struct slab *slab)
{
        struct folio *folio = slab_folio(slab);

        folio_set_active(folio);
}

/* Remove the pfmemalloc mark by clearing the backing folio's "active" flag. */
static inline void slab_clear_pfmemalloc(struct slab *slab)
{
        struct folio *folio = slab_folio(slab);

        folio_clear_active(folio);
}

/*
 * Same as slab_clear_pfmemalloc() but through __folio_clear_active()
 * (presumably the non-atomic flag helper - confirm against page-flags.h).
 */
static inline void __slab_clear_pfmemalloc(struct slab *slab)
{
        struct folio *folio = slab_folio(slab);

        __folio_clear_active(folio);
}

/* Return folio_address() of the folio backing @slab. */
static inline void *slab_address(const struct slab *slab)
{
        const struct folio *folio = slab_folio(slab);

        return folio_address(folio);
}

/* Return the node id (folio_nid()) of the folio backing @slab. */
static inline int slab_nid(const struct slab *slab)
{
        const struct folio *folio = slab_folio(slab);

        return folio_nid(folio);
}

/* Return the pg_data_t (folio_pgdat()) of the folio backing @slab. */
static inline pg_data_t *slab_pgdat(const struct slab *slab)
{
        const struct folio *folio = slab_folio(slab);

        return folio_pgdat(folio);
}

/*
 * Translate a kernel virtual address to the slab it lives in.
 *
 * Returns NULL when the folio containing @addr does not pass
 * folio_test_slab().
 */
static inline struct slab *virt_to_slab(const void *addr)
{
        struct folio *folio = virt_to_folio(addr);

        return folio_test_slab(folio) ? folio_slab(folio) : NULL;
}

/* Return the order (folio_order()) of the folio backing @slab. */
static inline int slab_order(const struct slab *slab)
{
        const struct folio *folio = slab_folio(slab);

        /* Drop const: folio_order() is called with a plain folio here. */
        return folio_order((struct folio *)folio);
}

/* Size of @slab in bytes: PAGE_SIZE shifted left by the slab's order. */
static inline size_t slab_size(const struct slab *slab)
{
        const int order = slab_order(slab);

        return PAGE_SIZE << order;
}

/*
 * Accessors for the per-cpu partial slab list, c->partial.
 *
 *   slub_percpu_partial(c)           - the list head itself (an lvalue)
 *   slub_set_percpu_partial(c, p)    - make the head point at p->next,
 *                                      i.e. step past slab p
 *   slub_percpu_partial_read_once(c) - READ_ONCE() of the head
 *
 * Without CONFIG_SLUB_CPU_PARTIAL the readers evaluate to NULL and the
 * setter expands to nothing.
 */
#ifdef CONFIG_SLUB_CPU_PARTIAL
#define slub_percpu_partial(c)                        ((c)->partial)

#define slub_set_percpu_partial(c, p)                \
({                                                \
        slub_percpu_partial(c) = (p)->next;        \
})

#define slub_percpu_partial_read_once(c)        READ_ONCE(slub_percpu_partial(c))
#else
#define slub_percpu_partial(c)                        NULL

#define slub_set_percpu_partial(c, p)

#define slub_percpu_partial_read_once(c)        NULL
#endif // CONFIG_SLUB_CPU_PARTIAL

/*
 * Word size structure that can be atomically updated or read and that
 * contains both the order and the number of objects that a slab of the
 * given order would contain.
 *
 * The packing of the two values into @x is defined by the helpers that
 * build and decode it, which are not part of this header excerpt.
 */
struct kmem_cache_order_objects {
        unsigned int x;        /* order and object count, packed together */
};

/*
 * Slab cache management.
 *
 * Several members exist only under specific config options; see the #ifdef
 * around each of them.
 */
struct kmem_cache {
#ifndef CONFIG_SLUB_TINY
        /* Per-cpu allocator state; compiled out with CONFIG_SLUB_TINY. */
        struct kmem_cache_cpu __percpu *cpu_slab;
#endif
        /* Used for retrieving partial slabs, etc. */
        slab_flags_t flags;
        unsigned long min_partial;
        unsigned int size;                /* Object size including metadata */
        unsigned int object_size;        /* Object size without metadata */
        /* Divisor handed to reciprocal_divide() by __obj_to_index(). */
        struct reciprocal_value reciprocal_size;
        unsigned int offset;                /* Free pointer offset */
#ifdef CONFIG_SLUB_CPU_PARTIAL
        /* Number of per cpu partial objects to keep around */
        unsigned int cpu_partial;
        /* Number of per cpu partial slabs to keep around */
        unsigned int cpu_partial_slabs;
#endif
        /* Packed order/object count; see struct kmem_cache_order_objects. */
        struct kmem_cache_order_objects oo;

        /* Allocation and freeing of slabs */
        struct kmem_cache_order_objects min;
        gfp_t allocflags;                /* gfp flags to use on each alloc */
        int refcount;                        /* Refcount for slab cache destroy */
        void (*ctor)(void *object);        /* Object constructor */
        unsigned int inuse;                /* Offset to metadata */
        unsigned int align;                /* Alignment */
        unsigned int red_left_pad;        /* Left redzone padding size */
        const char *name;                /* Name (only for display!) */
        struct list_head list;                /* List of slab caches */
#ifdef CONFIG_SYSFS
        struct kobject kobj;                /* For sysfs */
#endif
#ifdef CONFIG_SLAB_FREELIST_HARDENED
        /* NOTE(review): presumably a per-cache secret for freelist hardening - confirm */
        unsigned long random;
#endif

#ifdef CONFIG_NUMA
        /*
         * Defragmentation by allocating from a remote node.
         */
        unsigned int remote_node_defrag_ratio;
#endif

#ifdef CONFIG_SLAB_FREELIST_RANDOM
        /* NOTE(review): presumably the precomputed freelist shuffle order - confirm */
        unsigned int *random_seq;
#endif

#ifdef CONFIG_KASAN_GENERIC
        struct kasan_cache kasan_info;
#endif

#ifdef CONFIG_HARDENED_USERCOPY
        unsigned int useroffset;        /* Usercopy region offset */
        unsigned int usersize;                /* Usercopy region size */
#endif

        /* One slot per possible NUMA node id (MAX_NUMNODES entries). */
        struct kmem_cache_node *node[MAX_NUMNODES];
};

/*
 * sysfs hooks for slab caches.  Real implementations exist only with
 * CONFIG_SYSFS and without CONFIG_SLUB_TINY (SLAB_SUPPORTS_SYSFS is defined
 * in that case); otherwise both calls are empty inline stubs.
 */
#if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY)
#define SLAB_SUPPORTS_SYSFS
void sysfs_slab_unlink(struct kmem_cache *s);
void sysfs_slab_release(struct kmem_cache *s);
#else
static inline void sysfs_slab_unlink(struct kmem_cache *s) { }
static inline void sysfs_slab_release(struct kmem_cache *s) { }
#endif

void *fixup_red_left(struct kmem_cache *s, void *p);

/*
 * Find the object in @slab that contains address @x.
 *
 * @x is rounded down to a multiple of cache->size from the slab base and
 * clamped to the slab's last object; the result is then passed through
 * fixup_red_left().
 */
static inline void *nearest_obj(struct kmem_cache *cache,
                                const struct slab *slab, void *x)
{
        void *base = slab_address(slab);
        void *obj = x - (x - base) % cache->size;
        void *last = base + (slab->objects - 1) * cache->size;

        /* Addresses past the last object snap back to it. */
        if (unlikely(obj > last))
                obj = last;

        return fixup_red_left(cache, obj);
}

/* Determine object index from a given position */
static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
                                          void *addr, void *obj)
{
        return reciprocal_divide(kasan_reset_tag(obj) - addr,
                                 cache->reciprocal_size);
}

/*
 * Index of @obj within @slab.  KFENCE addresses always map to index 0;
 * everything else is computed relative to the slab's base address.
 */
static inline unsigned int obj_to_index(const struct kmem_cache *cache,
                                        const struct slab *slab, void *obj)
{
        return is_kfence_address(obj) ?
                0 : __obj_to_index(cache, slab_address(slab), obj);
}

/*
 * Number of objects in @slab, read from slab->objects.  @cache is not used
 * by this implementation.
 */
static inline int objs_per_slab(const struct kmem_cache *cache,
                                const struct slab *slab)
{
        return slab->objects;
}

/*
 * State of the slab allocator.
 *
 * This is used to describe the states of the allocator during bootup.
 * Allocators use this to gradually bootstrap themselves. Most allocators
 * have the problem that the structures used for managing slab caches are
 * allocated from slab caches themselves.
 *
 * The enumerators are listed from least to most functional, so their
 * numeric values increase as bootstrap progresses.
 */
enum slab_state {
        DOWN,                        /* No slab functionality yet */
        PARTIAL,                /* SLUB: kmem_cache_node available */
        UP,                        /* Slab caches usable but not all extras yet */
        FULL                        /* Everything is working */
};

/* Current bootstrap stage of the slab allocator (an enum slab_state value) */
extern enum slab_state slab_state;

/* The slab cache mutex protects the management structures during changes */
extern struct mutex slab_mutex;

/* The list of all slab caches on the system */
extern struct list_head slab_caches;

/* The slab cache that manages slab cache information */
extern struct kmem_cache *kmem_cache;

/*
 * A table of kmalloc cache names and sizes
 *
 * Each entry holds one name per kmalloc type (NR_KMALLOC_TYPES of them) and
 * a single size shared by those names.  The array bound is supplied by the
 * definition, which is not in this header.
 */
extern const struct kmalloc_info_struct {
        const char *name[NR_KMALLOC_TYPES];
        unsigned int size;
} kmalloc_info[];

/* Kmalloc array related functions */
void setup_kmalloc_cache_index_table(void);
void create_kmalloc_caches(void);

/*
 * Size-to-cache-index table used by kmalloc_slab() for sizes up to 192:
 * one entry per 8-byte step, addressed through size_index_elem()
 * (size 192 maps to the last slot, 23).
 */
extern u8 kmalloc_size_index[24];

/*
 * Slot in kmalloc_size_index[] for an allocation of @bytes: sizes 1..8 map
 * to slot 0, 9..16 to slot 1, and so on.  @bytes == 0 wraps around (unsigned
 * arithmetic), so callers must pass a non-zero size.
 */
static inline unsigned int size_index_elem(unsigned int bytes)
{
        const unsigned int last_byte = bytes - 1;

        return last_byte >> 3;
}

/*
 * Find the kmem_cache structure that serves a given size of
 * allocation
 *
 * Sizes up to 192 are resolved through the kmalloc_size_index[] table;
 * larger sizes use fls(size - 1) as the cache index. The cache array row is
 * chosen by kmalloc_type(flags, caller).
 *
 * This assumes size is larger than zero and not larger than
 * KMALLOC_MAX_CACHE_SIZE and the caller must check that.
 */
static inline struct kmem_cache *
kmalloc_slab(size_t size, gfp_t flags, unsigned long caller)
{
        const unsigned int index = size <= 192 ?
                kmalloc_size_index[size_index_elem(size)] : fls(size - 1);

        return kmalloc_caches[kmalloc_type(flags, caller)][index];
}

gfp_t kmalloc_fix_flags(gfp_t flags);

/* Functions provided by the slab allocators */
int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);

void __init kmem_cache_init(void);
extern void create_boot_cache(struct kmem_cache *, const char *name,
                        unsigned int size, slab_flags_t flags,
                        unsigned int useroffset, unsigned int usersize);

int slab_unmergeable(struct kmem_cache *s);
struct kmem_cache *find_mergeable(unsigned size, unsigned align,
                slab_flags_t flags, const char *name, void (*ctor)(void *));
struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
                   slab_flags_t flags, void (*ctor)(void *));

slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name);

/* Report whether the cache @s has SLAB_KMALLOC set in its flags. */
static inline bool is_kmalloc_cache(struct kmem_cache *s)
{
        return (s->flags & SLAB_KMALLOC) != 0;
}

/* Legal flag mask for kmem_cache_create(), for various configurations */
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
                         SLAB_CACHE_DMA32 | SLAB_PANIC | \
                         SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )

/*
 * Debug flags that are meaningful in this build: the full set when
 * CONFIG_SLUB_DEBUG is enabled, nothing otherwise.
 */
#ifdef CONFIG_SLUB_DEBUG
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
                          SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
#else
#define SLAB_DEBUG_FLAGS (0)
#endif

#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
                          SLAB_TEMPORARY | SLAB_ACCOUNT | \
                          SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)

/*
 * Common flags available with current configuration.
 * Because it is built from SLAB_DEBUG_FLAGS, the debug bits drop out of
 * this mask when CONFIG_SLUB_DEBUG is disabled.
 */
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)

/*
 * Common flags permitted for kmem_cache_create.
 * Unlike CACHE_CREATE_MASK, the debug bits are spelled out here
 * unconditionally, so this mask does not depend on CONFIG_SLUB_DEBUG.
 */
#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \
                              SLAB_RED_ZONE | \
                              SLAB_POISON | \
                              SLAB_STORE_USER | \
                              SLAB_TRACE | \
                              SLAB_CONSISTENCY_CHECKS | \
                              SLAB_NOLEAKTRACE | \
                              SLAB_RECLAIM_ACCOUNT | \
                              SLAB_TEMPORARY | \
                              SLAB_ACCOUNT | \
                              SLAB_KMALLOC | \
                              SLAB_NO_MERGE | \
                              SLAB_NO_USER_FLAGS)

bool __kmem_cache_empty(struct kmem_cache *);
int __kmem_cache_shutdown(struct kmem_cache *);
void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *);
void slab_kmem_cache_release(struct kmem_cache *);

struct seq_file;
struct file;

/*
 * Per-cache statistics record. get_slabinfo() takes a pointer to one of
 * these together with the cache it describes.
 * NOTE(review): the field comments below are inferred from the field names
 * only - confirm against the get_slabinfo() implementation.
 */
struct slabinfo {
        unsigned long active_objs;        /* presumably objects currently in use */
        unsigned long num_objs;                /* presumably total objects */
        unsigned long active_slabs;        /* presumably slabs currently in use */
        unsigned long num_slabs;        /* presumably total slabs */
        unsigned long shared_avail;
        unsigned int limit;
        unsigned int batchcount;
        unsigned int shared;
        unsigned int objects_per_slab;
        unsigned int cache_order;
};

void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo);

#ifdef CONFIG_SLUB_DEBUG
/*
 * slub_debug_enabled is a static key whose declared default follows
 * CONFIG_SLUB_DEBUG_ON: default-true when that option is set,
 * default-false otherwise.
 */
#ifdef CONFIG_SLUB_DEBUG_ON
DECLARE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
DECLARE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
extern void print_tracking(struct kmem_cache *s, void *object);
long validate_slab_cache(struct kmem_cache *s);
/* Test the slub_debug_enabled static key (branch hinted as unlikely). */
static inline bool __slub_debug_enabled(void)
{
        return static_branch_unlikely(&slub_debug_enabled);
}
#else
/* Without CONFIG_SLUB_DEBUG, print_tracking() is a no-op. */
static inline void print_tracking(struct kmem_cache *s, void *object)
{
}
/* Without CONFIG_SLUB_DEBUG, debugging is reported as never enabled. */
static inline bool __slub_debug_enabled(void)
{
        return false;
}
#endif

/*
 * Check whether the cache has at least one of the given slab_debug flags
 * set. Always false while the slub debug static key is off.
 *
 * Use only for flags parsed by setup_slub_debug() as it also enables
 * the static key. With CONFIG_SLUB_DEBUG, passing a mask that contains
 * none of SLAB_DEBUG_FLAGS triggers a one-time warning.
 */
static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags)
{
        if (IS_ENABLED(CONFIG_SLUB_DEBUG))
                VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS));

        if (!__slub_debug_enabled())
                return false;

        return s->flags & flags;
}

#ifdef CONFIG_SLAB_OBJ_EXT

/*
 * slab_obj_exts - fetch the object extension vector attached to a slab.
 * @slab: a pointer to the slab struct
 *
 * slab->obj_exts is read once and its OBJEXTS_FLAGS_MASK bits are cleared
 * before the value is handed back as a pointer.
 *
 * Returns a pointer to the object extension vector associated with the slab,
 * or NULL if no such vector has been associated yet.
 */
static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
{
        unsigned long raw = READ_ONCE(slab->obj_exts);

#ifdef CONFIG_MEMCG
        /* a non-zero value must be tagged as an objexts vector ... */
        VM_BUG_ON_PAGE(raw && !(raw & MEMCG_DATA_OBJEXTS), slab_page(slab));
        /* ... and must never carry the KMEM tag */
        VM_BUG_ON_PAGE(raw & MEMCG_DATA_KMEM, slab_page(slab));
#endif
        raw &= ~OBJEXTS_FLAGS_MASK;

        return (struct slabobj_ext *)raw;
}

int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
                        gfp_t gfp, bool new_slab);

#else /* CONFIG_SLAB_OBJ_EXT */

/* Without CONFIG_SLAB_OBJ_EXT a slab never has an extension vector. */
static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
{
        return NULL;
}

#endif /* CONFIG_SLAB_OBJ_EXT */

/*
 * Select the node stat item for a cache: the reclaimable counter when
 * the cache has SLAB_RECLAIM_ACCOUNT set, the unreclaimable one otherwise.
 */
static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s)
{
        if (s->flags & SLAB_RECLAIM_ACCOUNT)
                return NR_SLAB_RECLAIMABLE_B;

        return NR_SLAB_UNRECLAIMABLE_B;
}

#ifdef CONFIG_MEMCG_KMEM
bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
                                  gfp_t flags, size_t size, void **p);
void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
                            void **p, int objects, struct slabobj_ext *obj_exts);
#endif

size_t __ksize(const void *objp);

/*
 * Return how many bytes of an object in cache @s are usable, based on
 * the cache flags. The checks are ordered from most to least restrictive.
 */
static inline size_t slab_ksize(const struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
        /*
         * Red zoning and poisoning make use of the padding that follows
         * the object, so only the object itself is usable.
         */
        if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
                return s->object_size;
#endif
        if (s->flags & SLAB_KASAN)
                return s->object_size;

        /*
         * When the freelist pointer or user tracking data has to live
         * behind the object, only the space before that information is
         * usable; otherwise all the padding can be used as well.
         */
        return (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) ?
                s->inuse : s->size;
}

#ifdef CONFIG_SLUB_DEBUG
void dump_unreclaimable_slab(void);
#else
/* Without CONFIG_SLUB_DEBUG there is nothing to dump; this is a no-op. */
static inline void dump_unreclaimable_slab(void)
{
}
#endif

void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);

#ifdef CONFIG_SLAB_FREELIST_RANDOM
int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
                        gfp_t gfp);
void cache_random_seq_destroy(struct kmem_cache *cachep);
#else
/*
 * Stubs used when CONFIG_SLAB_FREELIST_RANDOM is off: creation reports
 * success (0) without doing anything, destruction is a no-op.
 */
static inline int cache_random_seq_create(struct kmem_cache *cachep,
                                        unsigned int count, gfp_t gfp)
{
        return 0;
}
static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
#endif /* CONFIG_SLAB_FREELIST_RANDOM */

/*
 * Decide whether an object allocated from @c with @flags must be
 * initialized at allocation time.
 *
 * With init_on_alloc off, this is simply whether __GFP_ZERO was asked
 * for. With it on, caches with a constructor are never initialized,
 * SLAB_TYPESAFE_BY_RCU / SLAB_POISON caches fall back to the __GFP_ZERO
 * request, and every other cache is always initialized.
 */
static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c)
{
        bool zero_requested = flags & __GFP_ZERO;

        if (!static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
                                 &init_on_alloc))
                return zero_requested;

        if (c->ctor)
                return false;

        if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))
                return zero_requested;

        return true;
}

/*
 * Decide whether objects of cache @c must be wiped when freed: only when
 * init_on_free is on and the cache has neither a constructor nor the
 * SLAB_TYPESAFE_BY_RCU / SLAB_POISON flags.
 */
static inline bool slab_want_init_on_free(struct kmem_cache *c)
{
        if (!static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
                                 &init_on_free))
                return false;

        if (c->ctor)
                return false;

        return !(c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON));
}

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
void debugfs_slab_release(struct kmem_cache *);
#else
/* No-op unless both CONFIG_DEBUG_FS and CONFIG_SLUB_DEBUG are enabled. */
static inline void debugfs_slab_release(struct kmem_cache *s) { }
#endif

#ifdef CONFIG_PRINTK
/* Number of entries in each of the two stack arrays below. */
#define KS_ADDRS_COUNT 16
/*
 * Information record about a slab object, passed to __kmem_obj_info()
 * along with the object and its slab.
 * NOTE(review): which fields are inputs and which are outputs is not
 * visible here - confirm against the __kmem_obj_info() implementation.
 */
struct kmem_obj_info {
        void *kp_ptr;
        struct slab *kp_slab;
        void *kp_objp;
        unsigned long kp_data_offset;
        struct kmem_cache *kp_slab_cache;
        void *kp_ret;
        void *kp_stack[KS_ADDRS_COUNT];
        void *kp_free_stack[KS_ADDRS_COUNT];
};
void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab);
#endif

void __check_heap_object(const void *ptr, unsigned long n,
                         const struct slab *slab, bool to_user);

#ifdef CONFIG_SLUB_DEBUG
void skip_orig_size_check(struct kmem_cache *s, const void *object);
#endif

#endif /* MM_SLAB_H */









































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Access to user system call parameters and results
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.  All rights reserved.
 *
 * See asm-generic/syscall.h for descriptions of what we must do here.
 */

#ifndef _ASM_X86_SYSCALL_H
#define _ASM_X86_SYSCALL_H

#include <uapi/linux/audit.h>
#include <linux/sched.h>
#include <linux/err.h>
#include <asm/thread_info.h>        /* for TS_COMPAT */
#include <asm/unistd.h>

/* This is used purely for kernel/trace/trace_syscalls.c */
typedef long (*sys_call_ptr_t)(const struct pt_regs *);
extern const sys_call_ptr_t sys_call_table[];

/*
 * These may not exist, but still put the prototypes in so we
 * can use IS_ENABLED().
 */
extern long ia32_sys_call(const struct pt_regs *, unsigned int nr);
extern long x32_sys_call(const struct pt_regs *, unsigned int nr);
extern long x64_sys_call(const struct pt_regs *, unsigned int nr);

/*
 * Return the system call number held in orig_ax.
 *
 * Only the low 32 bits of orig_ax are meaningful, so the value is
 * narrowed to int. On 64-bit this deliberately drops the high bits, so
 * comparisons see the sign-extended low 32 bits.
 */
static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
{
        return (int)regs->orig_ax;
}

/* Overwrite regs->ax with the value saved in regs->orig_ax. */
static inline void syscall_rollback(struct task_struct *task,
                                    struct pt_regs *regs)
{
        regs->ax = regs->orig_ax;
}

/*
 * Return the error code held in regs->ax, or 0 when the value is not in
 * the error range recognised by IS_ERR_VALUE().
 */
static inline long syscall_get_error(struct task_struct *task,
                                     struct pt_regs *regs)
{
        unsigned long ret = regs->ax;

#ifdef CONFIG_IA32_EMULATION
        /*
         * TS_COMPAT is set for 32-bit syscall entries and then
         * remains set until we return to user mode.
         *
         * For such tasks sign-extend the low 32 bits so that
         * (int)-EFOO becomes (long)-EFOO and compares correctly.
         */
        if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED))
                ret = (long) (int) ret;
#endif
        if (!IS_ERR_VALUE(ret))
                return 0;

        return ret;
}

/* Return the current content of regs->ax, unmodified. */
static inline long syscall_get_return_value(struct task_struct *task,
                                            struct pt_regs *regs)
{
        return regs->ax;
}

static inline void syscall_set_return_value(struct task_struct *task,
                                            struct pt_regs *regs,
                                            int error, long val)
{
        regs->ax = (long) error ?: val;
}

#ifdef CONFIG_X86_32

/*
 * Copy six consecutive words, starting at regs->bx, into @args.
 * NOTE(review): this relies on the six argument registers being laid out
 * back to back in struct pt_regs beginning at bx - confirm against the
 * 32-bit pt_regs definition.
 */
static inline void syscall_get_arguments(struct task_struct *task,
                                         struct pt_regs *regs,
                                         unsigned long *args)
{
        memcpy(args, &regs->bx, 6 * sizeof(args[0]));
}

/* On CONFIG_X86_32 every task is reported as AUDIT_ARCH_I386. */
static inline int syscall_get_arch(struct task_struct *task)
{
        return AUDIT_ARCH_I386;
}

#else         /* CONFIG_X86_64 */

/*
 * Fill @args[0..5] with the six system call arguments of @task.
 *
 * Tasks flagged TS_COMPAT (only checked with CONFIG_IA32_EMULATION) take
 * them from bx, cx, dx, si, di, bp; everything else takes them from
 * di, si, dx, r10, r8, r9.
 */
static inline void syscall_get_arguments(struct task_struct *task,
                                         struct pt_regs *regs,
                                         unsigned long *args)
{
# ifdef CONFIG_IA32_EMULATION
        if (task->thread_info.status & TS_COMPAT) {
                args[0] = regs->bx;
                args[1] = regs->cx;
                args[2] = regs->dx;
                args[3] = regs->si;
                args[4] = regs->di;
                args[5] = regs->bp;
                return;
        }
# endif
        args[0] = regs->di;
        args[1] = regs->si;
        args[2] = regs->dx;
        args[3] = regs->r10;
        args[4] = regs->r8;
        args[5] = regs->r9;
}

/*
 * Report the audit architecture of @task: AUDIT_ARCH_I386 for TS_COMPAT
 * tasks when CONFIG_IA32_EMULATION is enabled, AUDIT_ARCH_X86_64 otherwise.
 */
static inline int syscall_get_arch(struct task_struct *task)
{
        /* x32 tasks should be considered AUDIT_ARCH_X86_64. */
        if (IS_ENABLED(CONFIG_IA32_EMULATION) &&
            (task->thread_info.status & TS_COMPAT))
                return AUDIT_ARCH_I386;

        return AUDIT_ARCH_X86_64;
}

bool do_syscall_64(struct pt_regs *regs, int nr);
void do_int80_emulation(struct pt_regs *regs);

#endif        /* CONFIG_X86_32 */

void do_int80_syscall_32(struct pt_regs *regs);
bool do_fast_syscall_32(struct pt_regs *regs);
bool do_SYSENTER_32(struct pt_regs *regs);

#endif        /* _ASM_X86_SYSCALL_H */














































    1 






















    1 






















    1 











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * sha256_base.h - core logic for SHA-256 implementations
 *
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#ifndef _CRYPTO_SHA256_BASE_H
#define _CRYPTO_SHA256_BASE_H

#include <asm/byteorder.h>
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/sha2.h>
#include <linux/string.h>
#include <linux/types.h>

typedef void (sha256_block_fn)(struct sha256_state *sst, u8 const *src,
                               int blocks);

/* Run sha224_init() on the hash state stored in @desc. Always returns 0. */
static inline int sha224_base_init(struct shash_desc *desc)
{
        sha224_init(shash_desc_ctx(desc));

        return 0;
}

/* Run sha256_init() on the hash state stored in @desc. Always returns 0. */
static inline int sha256_base_init(struct shash_desc *desc)
{
        sha256_init(shash_desc_ctx(desc));

        return 0;
}

/*
 * Feed @len bytes from @data into the hash state @sctx.
 *
 * sctx->count is advanced by @len. Input that completes one or more
 * SHA256_BLOCK_SIZE blocks is passed to @block_fn; whatever is left over
 * is kept in sctx->buf for a later call. Always returns 0.
 */
static inline int lib_sha256_base_do_update(struct sha256_state *sctx,
                                            const u8 *data,
                                            unsigned int len,
                                            sha256_block_fn *block_fn)
{
        /* number of bytes already buffered in sctx->buf */
        unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;

        sctx->count += len;

        if (unlikely((partial + len) >= SHA256_BLOCK_SIZE)) {
                int blocks;

                if (partial) {
                        /* bytes needed to fill the buffered block */
                        int p = SHA256_BLOCK_SIZE - partial;

                        memcpy(sctx->buf + partial, data, p);
                        data += p;
                        len -= p;

                        block_fn(sctx, sctx->buf, 1);
                }

                /* whole blocks are processed straight from the input */
                blocks = len / SHA256_BLOCK_SIZE;
                len %= SHA256_BLOCK_SIZE;

                if (blocks) {
                        block_fn(sctx, data, blocks);
                        data += blocks * SHA256_BLOCK_SIZE;
                }
                /* the buffer has been consumed; the tail starts at offset 0 */
                partial = 0;
        }
        /* stash the remaining tail after any bytes still buffered */
        if (len)
                memcpy(sctx->buf + partial, data, len);

        return 0;
}

/*
 * shash front end for lib_sha256_base_do_update(): operates on the hash
 * state stored in @desc and returns that function's result.
 */
static inline int sha256_base_do_update(struct shash_desc *desc,
                                        const u8 *data,
                                        unsigned int len,
                                        sha256_block_fn *block_fn)
{
        return lib_sha256_base_do_update(shash_desc_ctx(desc), data, len,
                                         block_fn);
}

/*
 * Pad the buffered data and process the final block(s) through @block_fn.
 *
 * A 0x80 byte is appended, the block is zero-filled, and the last eight
 * bytes of the final block receive the message length in bits
 * (sctx->count << 3) in big-endian order. Always returns 0.
 */
static inline int lib_sha256_base_do_finalize(struct sha256_state *sctx,
                                              sha256_block_fn *block_fn)
{
        /* offset of the 64-bit length field inside a block */
        const int bit_offset = SHA256_BLOCK_SIZE - sizeof(__be64);
        __be64 *bits = (__be64 *)(sctx->buf + bit_offset);
        unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;

        sctx->buf[partial++] = 0x80;
        if (partial > bit_offset) {
                /*
                 * No room left for the length field: zero-fill and process
                 * this block, then continue with an empty one.
                 */
                memset(sctx->buf + partial, 0x0, SHA256_BLOCK_SIZE - partial);
                partial = 0;

                block_fn(sctx, sctx->buf, 1);
        }

        /* zero everything between the data and the length field */
        memset(sctx->buf + partial, 0x0, bit_offset - partial);
        *bits = cpu_to_be64(sctx->count << 3);
        block_fn(sctx, sctx->buf, 1);

        return 0;
}

/*
 * shash front end for lib_sha256_base_do_finalize(): operates on the hash
 * state stored in @desc and returns that function's result.
 */
static inline int sha256_base_do_finalize(struct shash_desc *desc,
                                          sha256_block_fn *block_fn)
{
        return lib_sha256_base_do_finalize(shash_desc_ctx(desc), block_fn);
}

/*
 * Write the first @digest_size bytes of the hash state to @out as
 * big-endian 32-bit words (unaligned stores), then wipe *@sctx.
 * Always returns 0.
 *
 * NOTE(review): the loop counts @digest_size down in steps of
 * sizeof(__be32) and only stops at exactly zero, so @digest_size is
 * expected to be a multiple of four.
 */
static inline int lib_sha256_base_finish(struct sha256_state *sctx, u8 *out,
                                         unsigned int digest_size)
{
        __be32 *dst = (__be32 *)out;
        int word = 0;

        while (digest_size > 0) {
                put_unaligned_be32(sctx->state[word], dst);
                word++;
                dst++;
                digest_size -= sizeof(__be32);
        }

        memzero_explicit(sctx, sizeof(*sctx));
        return 0;
}

/*
 * shash front end for lib_sha256_base_finish(): the digest size is taken
 * from the transform attached to @desc.
 */
static inline int sha256_base_finish(struct shash_desc *desc, u8 *out)
{
        return lib_sha256_base_finish(shash_desc_ctx(desc), out,
                                      crypto_shash_digestsize(desc->tfm));
}

#endif /* _CRYPTO_SHA256_BASE_H */


























































































    2 






    2 














    1 


























































    3 




    3 





    2 















    2 














































    2 























    3 


    2 




















    3 




    3 




    2 









    3 


    3 





    2 





































































    3 





    1 


    2 




    2 




    1 

    1 

    1 








    1 
























































































    3 











    3 
    3 


























































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
// SPDX-License-Identifier: GPL-2.0
/*
 *        SUCS NET3:
 *
 *        Generic datagram handling routines. These are generic for all
 *        protocols. Possibly a generic IP version on top of these would
 *        make sense. Not tonight however 8-).
 *        This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
 *        NetROM layer all have identical poll code and mostly
 *        identical recvmsg() code. So we share it here. The poll was
 *        shared before but buried in udp.c so I moved it.
 *
 *        Authors:        Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
 *                                                     udp.c code)
 *
 *        Fixes:
 *                Alan Cox        :        NULL return from skb_peek_copy()
 *                                        understood
 *                Alan Cox        :        Rewrote skb_read_datagram to avoid the
 *                                        skb_peek_copy stuff.
 *                Alan Cox        :        Added support for SOCK_SEQPACKET.
 *                                        IPX can no longer use the SO_TYPE hack
 *                                        but AX.25 now works right, and SPX is
 *                                        feasible.
 *                Alan Cox        :        Fixed write poll of non IP protocol
 *                                        crash.
 *                Florian  La Roche:        Changed for my new skbuff handling.
 *                Darryl Miles        :        Fixed non-blocking SOCK_SEQPACKET.
 *                Linus Torvalds        :        BSD semantic fixes.
 *                Alan Cox        :        Datagram iovec handling
 *                Darryl Miles        :        Fixed non-blocking SOCK_STREAM.
 *                Alan Cox        :        POSIXisms
 *                Pete Wyckoff    :       Unconnected accept() fix.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/iov_iter.h>
#include <linux/indirect_call_wrapper.h>

#include <net/protocol.h>
#include <linux/skbuff.h>

#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>
#include <crypto/hash.h>

/*
 *        Is a socket 'connection oriented' ?
 *        True (1) for SOCK_SEQPACKET and SOCK_STREAM, false (0) otherwise.
 */
static inline int connection_based(struct sock *sk)
{
        switch (sk->sk_type) {
        case SOCK_SEQPACKET:
        case SOCK_STREAM:
                return 1;
        default:
                return 0;
        }
}

/*
 * Wake callback for datagram receivers. A wakeup that carries a key is
 * passed on to autoremove_wake_function() only if the key signals
 * EPOLLIN or EPOLLERR; a wakeup without a key is always passed on.
 */
static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync,
                                  void *key)
{
        bool interesting = !key ||
                           (key_to_poll(key) & (EPOLLIN | EPOLLERR));

        /*
         * Avoid a wakeup if event not interesting for us
         */
        if (!interesting)
                return 0;

        return autoremove_wake_function(wait, mode, sync, key);
}
/*
 * Wait for the last received packet to be different from skb
 *
 * The task is queued as an exclusive, interruptible waiter on the socket's
 * wait queue and sleeps for at most *timeo_p; the remaining time is
 * written back to *timeo_p.
 *
 * Return value:
 *   0        - the queue tail already differs from @skb, or the sleep was
 *              performed; *err is not written
 *   1        - the receive side is shut down; *err is set to 0
 *   other    - a pending socket error, -ENOTCONN, or the signal error from
 *              sock_intr_errno(); the same value is stored in *err
 */
int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
                                int *err, long *timeo_p,
                                const struct sk_buff *skb)
{
        int error;
        DEFINE_WAIT_FUNC(wait, receiver_wake_function);

        prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

        /* Socket errors? */
        error = sock_error(sk);
        if (error)
                goto out_err;

        /* Something new was queued behind @skb: no need to sleep (error is 0) */
        if (READ_ONCE(queue->prev) != skb)
                goto out;

        /* Socket shut down? */
        if (sk->sk_shutdown & RCV_SHUTDOWN)
                goto out_noerr;

        /* Sequenced packets can come disconnected.
         * If so we report the problem
         */
        error = -ENOTCONN;
        if (connection_based(sk) &&
            !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
                goto out_err;

        /* handle signals */
        if (signal_pending(current))
                goto interrupted;

        error = 0;
        *timeo_p = schedule_timeout(*timeo_p);
out:
        /* every path leaves through here so the wait entry is always removed */
        finish_wait(sk_sleep(sk), &wait);
        return error;
interrupted:
        error = sock_intr_errno(*timeo_p);
out_err:
        *err = error;
        goto out;
out_noerr:
        *err = 0;
        error = 1;
        goto out;
}
EXPORT_SYMBOL(__skb_wait_for_more_packets);

/*
 * Mark @skb as peeked and return the skb that now sits in the queue.
 *
 * An skb that is already marked is returned untouched. A shared skb is
 * not modified: a clone takes over its place in the list, the original
 * reference is dropped and the clone is marked instead. Returns
 * ERR_PTR(-ENOMEM) if the clone cannot be allocated.
 */
static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
{
        if (skb->peeked)
                return skb;

        /* We have to unshare an skb before modifying it. */
        if (skb_shared(skb)) {
                struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);

                if (!clone)
                        return ERR_PTR(-ENOMEM);

                /* splice the clone into the list in place of the original */
                skb->prev->next = clone;
                skb->next->prev = clone;
                clone->prev = skb->prev;
                clone->next = skb->next;

                consume_skb(skb);
                skb = clone;
        }

        skb->peeked = 1;

        return skb;
}

/*
 * Pick the next skb from @queue without sleeping.
 *
 * Without MSG_PEEK the first skb is unlinked from the queue and returned.
 * With MSG_PEEK the skb stays queued and an extra reference is taken on
 * it; if *off is non-negative it is treated as a byte offset and skbs
 * lying entirely before that offset are skipped.
 *
 * *last is set to the queue tail before the walk. On success *off is
 * updated to the offset remaining inside the returned skb. Returns NULL
 * if no skb qualifies (*err untouched) or if marking a zero-length skb as
 * peeked fails (*err set to the error).
 *
 * No locking is done here; __skb_try_recv_datagram() in this file calls
 * it with queue->lock held.
 */
struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
                                          struct sk_buff_head *queue,
                                          unsigned int flags,
                                          int *off, int *err,
                                          struct sk_buff **last)
{
        bool peek_at_off = false;
        struct sk_buff *skb;
        int _off = 0;

        /* an offset is only honoured for peeks with a non-negative *off */
        if (unlikely(flags & MSG_PEEK && *off >= 0)) {
                peek_at_off = true;
                _off = *off;
        }

        *last = queue->prev;
        skb_queue_walk(queue, skb) {
                if (flags & MSG_PEEK) {
                        /*
                         * Skip an skb that ends at or before the offset.
                         * With a zero offset only skbs already marked as
                         * peeked are skipped (given the length test, those
                         * are zero-length skbs).
                         */
                        if (peek_at_off && _off >= skb->len &&
                            (_off || skb->peeked)) {
                                _off -= skb->len;
                                continue;
                        }
                        /* zero-length skb: mark it so a later peek skips it */
                        if (!skb->len) {
                                skb = skb_set_peeked(skb);
                                if (IS_ERR(skb)) {
                                        *err = PTR_ERR(skb);
                                        return NULL;
                                }
                        }
                        /* the skb stays on the queue; the caller gets its own reference */
                        refcount_inc(&skb->users);
                } else {
                        __skb_unlink(skb, queue);
                }
                *off = _off;
                return skb;
        }
        return NULL;
}

/**
 *        __skb_try_recv_datagram - Receive a datagram skbuff
 *        @sk: socket
 *        @queue: socket queue from which to receive
 *        @flags: MSG\_ flags
 *        @off: an offset in bytes to peek skb from. Returns an offset
 *              within an skb where data actually starts
 *        @err: error code returned
 *        @last: set to last peeked message to inform the wait function
 *               what to look for when peeking
 *
 *        Get a datagram skbuff, understands the peeking, nonblocking wakeups
 *        and possible races. This replaces identical code in packet, raw and
 *        udp, as well as the IPX AX.25 and Appletalk. It also finally fixes
 *        the long standing peek and read race for datagram sockets. If you
 *        alter this routine remember it must be re-entrant.
 *
 *        This function will lock the socket if a skb is returned, so
 *        the caller needs to unlock the socket in that case (usually by
 *        calling skb_free_datagram). Returns NULL with @err set to
 *        -EAGAIN if no data was available or to some other value if an
 *        error was detected.
 *
 *        * It does not lock socket since today. This function is
 *        * free of race conditions. This measure should/can improve
 *        * significantly datagram socket latencies at high loads,
 *        * when data copying to user space takes lots of time.
 *        * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
 *        *  8) Great win.)
 *        *                                            --ANK (980729)
 *
 *        The order of the tests when we find no data waiting are specified
 *        quite explicitly by POSIX 1003.1g, don't change them without having
 *        the standard around please.
 */
struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
                                        struct sk_buff_head *queue,
                                        unsigned int flags, int *off, int *err,
                                        struct sk_buff **last)
{
        struct sk_buff *skb;
        unsigned long cpu_flags;
        /*
         * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
         */
        int error = sock_error(sk);

        if (error)
                goto no_packet;

        do {
                /* Again only user level code calls this function, so nothing
                 * interrupt level will suddenly eat the receive_queue.
                 *
                 * Look at current nfs client by the way...
                 * However, this function was correct in any case. 8)
                 */
                spin_lock_irqsave(&queue->lock, cpu_flags);
                skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error,
                                                last);
                spin_unlock_irqrestore(&queue->lock, cpu_flags);
                if (error)
                        goto no_packet;
                if (skb)
                        return skb;

                if (!sk_can_busy_loop(sk))
                        break;

                sk_busy_loop(sk, flags & MSG_DONTWAIT);
        } while (READ_ONCE(queue->prev) != *last);

        error = -EAGAIN;

no_packet:
        *err = error;
        return NULL;
}
EXPORT_SYMBOL(__skb_try_recv_datagram);

/*
 * Receive a datagram from @sk_queue, sleeping up to the socket's receive
 * timeout (no sleeping at all when MSG_DONTWAIT is set in @flags).
 *
 * Returns the skb on success. Returns NULL with the reason left in @err by
 * __skb_try_recv_datagram() or __skb_wait_for_more_packets().
 */
struct sk_buff *__skb_recv_datagram(struct sock *sk,
                                    struct sk_buff_head *sk_queue,
                                    unsigned int flags, int *off, int *err)
{
        long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
        struct sk_buff *last;

        for (;;) {
                struct sk_buff *skb;

                skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err,
                                              &last);
                if (skb)
                        return skb;

                /* Anything other than "no data yet" is final. */
                if (*err != -EAGAIN)
                        return NULL;

                /* Out of time, or the wait itself reported a problem. */
                if (!timeo)
                        return NULL;
                if (__skb_wait_for_more_packets(sk, sk_queue, err,
                                                &timeo, last))
                        return NULL;
        }
}
EXPORT_SYMBOL(__skb_recv_datagram);

/*
 * Convenience wrapper: receive from the socket's own sk_receive_queue,
 * starting at peek offset zero.
 */
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
                                  int *err)
{
        struct sk_buff_head *rxq = &sk->sk_receive_queue;
        int peek_off = 0;

        return __skb_recv_datagram(sk, rxq, flags, &peek_off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);

/*
 * Release a datagram skb once the caller is done with it.
 * @sk is not used here; the skb is simply handed to consume_skb().
 */
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
        consume_skb(skb);
}
EXPORT_SYMBOL(skb_free_datagram);

/*
 * Account a dropped datagram and, for MSG_PEEK receives, take the skb off
 * @sk_queue if it is still linked there.
 *
 * With MSG_PEEK the skb may still be on the queue. Under the queue lock
 * (BH disabled) it is unlinked, one reference is dropped and the optional
 * @destructor is invoked. If the skb is no longer linked (skb->next is
 * NULL), -ENOENT is returned instead.
 *
 * sk->sk_drops is incremented in every case.
 *
 * Returns 0, or -ENOENT as described above.
 */
int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
                        struct sk_buff *skb, unsigned int flags,
                        void (*destructor)(struct sock *sk,
                                           struct sk_buff *skb))
{
        int err = 0;

        if (flags & MSG_PEEK) {
                spin_lock_bh(&sk_queue->lock);
                if (skb->next) {
                        __skb_unlink(skb, sk_queue);
                        refcount_dec(&skb->users);
                        if (destructor)
                                destructor(sk, skb);
                } else {
                        /* Someone else already removed it from the queue. */
                        err = -ENOENT;
                }
                spin_unlock_bh(&sk_queue->lock);
        }

        atomic_inc(&sk->sk_drops);
        return err;
}
EXPORT_SYMBOL(__sk_queue_drop_skb);

/**
 *        skb_kill_datagram - Free a datagram skbuff forcibly
 *        @sk: socket
 *        @skb: datagram skbuff
 *        @flags: MSG\_ flags
 *
 *        This function frees a datagram skbuff that was received by
 *        skb_recv_datagram.  The flags argument must match the one
 *        used for skb_recv_datagram.
 *
 *        If the MSG_PEEK flag is set, and the packet is still on the
 *        receive queue of the socket, it will be taken off the queue
 *        before it is freed.
 *
 *        This function currently only disables BH when acquiring the
 *        sk_receive_queue lock.  Therefore it must not be used in a
 *        context where that lock is acquired in an IRQ context.
 *
 *        It returns 0 if the packet was removed by us.
 */

int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
        int ret;

        /* Unlink from sk_receive_queue (MSG_PEEK only) and count the drop. */
        ret = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags,
                                  NULL);

        /* The skb is freed regardless of whether we unlinked it. */
        kfree_skb(skb);

        return ret;
}
EXPORT_SYMBOL(skb_kill_datagram);

/*
 * Forward declaration: __skb_datagram_iter() below names
 * simple_copy_to_iter() in INDIRECT_CALL_1() before the function is defined.
 */
INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr,
                                                size_t bytes,
                                                void *data __always_unused,
                                                struct iov_iter *i));

/*
 * __skb_datagram_iter - feed a byte range of an skb through a copy callback
 * @skb: buffer to read from
 * @offset: offset in @skb at which to start
 * @to: destination iterator, passed through to @cb
 * @len: number of bytes to process
 * @fault_short: if true, any short copy by @cb is reported as -EFAULT
 * @cb: copy routine; returns the number of bytes it actually processed
 * @data: opaque cookie handed to @cb
 *
 * Walks the linear head, then the page frags, then the frag list (by
 * recursion), calling @cb on each piece that overlaps [@offset, @offset+@len).
 *
 * Page frags are mapped and copied one page at a time via
 * skb_frag_foreach_page(). kmap_local_page() maps a single page only, so
 * mapping just the first page of a frag and then copying the whole frag
 * from it would run off the mapping when a frag spans several pages on a
 * highmem configuration.
 *
 * Returns 0 on success. On a short copy, returns 0 only when @fault_short
 * is false and @to has no room left; otherwise @to is reverted by the
 * number of bytes this call advanced and -EFAULT is returned. -EFAULT is
 * also returned when @len exceeds the data available in @skb.
 */
static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
                               struct iov_iter *to, int len, bool fault_short,
                               size_t (*cb)(const void *, size_t, void *,
                                            struct iov_iter *), void *data)
{
        int start = skb_headlen(skb);
        int i, copy = start - offset, start_off = offset, n;
        struct sk_buff *frag_iter;

        /* Copy header. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
                                    skb->data + offset, copy, data, to);
                offset += n;
                if (n != copy)
                        goto short_copy;
                if ((len -= copy) == 0)
                        return 0;
        }

        /* Copy paged appendix, one page of each fragment at a time. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;
                const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(frag);
                if ((copy = end - offset) > 0) {
                        u32 p_off, p_len, copied;
                        struct page *p;
                        u8 *vaddr;

                        if (copy > len)
                                copy = len;

                        n = 0;
                        skb_frag_foreach_page(frag,
                                              skb_frag_off(frag) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                size_t done;

                                vaddr = kmap_local_page(p);
                                done = INDIRECT_CALL_1(cb, simple_copy_to_iter,
                                                       vaddr + p_off, p_len,
                                                       data, to);
                                kunmap_local(vaddr);
                                n += done;
                                /* Stop at the first short page copy. */
                                if (done != p_len)
                                        break;
                        }
                        offset += n;
                        if (n != copy)
                                goto short_copy;
                        if (!(len -= copy))
                                return 0;
                }
                start = end;
        }

        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
                        if (__skb_datagram_iter(frag_iter, offset - start,
                                                to, copy, fault_short, cb, data))
                                goto fault;
                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                }
                start = end;
        }
        if (!len)
                return 0;

        /* This is not really a user copy fault, but rather someone
         * gave us a bogus length on the skb.  We should probably
         * print a warning here as it may indicate a kernel bug.
         */

fault:
        iov_iter_revert(to, offset - start_off);
        return -EFAULT;

short_copy:
        /* A short copy is tolerated only if the destination is simply full. */
        if (fault_short || iov_iter_count(to))
                goto fault;

        return 0;
}

/*
 * Copy callback for __skb_datagram_iter(): copy @bytes from @addr into @i
 * and feed the bytes that were actually copied into the ahash request
 * passed as @hashp. Returns the number of bytes copied.
 *
 * Without CONFIG_CRYPTO_HASH nothing is copied and 0 is returned.
 */
static size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
                                    struct iov_iter *i)
{
#ifdef CONFIG_CRYPTO_HASH
        struct ahash_request *req = hashp;
        struct scatterlist sgl;
        size_t done = copy_to_iter(addr, bytes, i);

        /* Hash only what reached the iterator. */
        sg_init_one(&sgl, addr, done);
        ahash_request_set_crypt(req, &sgl, NULL, done);
        crypto_ahash_update(req);

        return done;
#else
        return 0;
#endif
}

/**
 *        skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator
 *          and update a hash.
 *        @skb: buffer to copy
 *        @offset: offset in the buffer to start copying from
 *        @to: iovec iterator to copy to
 *        @len: amount of data to copy from buffer to iovec
 *        @hash: hash request to update
 *
 *        Any short copy is treated as a fault (fault_short is true).
 *
 *        Returns the result of __skb_datagram_iter(): 0 or -EFAULT.
 */
int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
                           struct iov_iter *to, int len,
                           struct ahash_request *hash)
{
        return __skb_datagram_iter(skb, offset, to, len, true,
                        hash_and_copy_to_iter, hash);
}
EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter);

/*
 * Plain copy callback for __skb_datagram_iter(): copy @bytes from @addr
 * into @i and return the number of bytes copy_to_iter() reports.
 * @data is unused.
 */
static size_t simple_copy_to_iter(const void *addr, size_t bytes,
                void *data __always_unused, struct iov_iter *i)
{
        return copy_to_iter(addr, bytes, i);
}

/**
 *        skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
 *        @skb: buffer to copy
 *        @offset: offset in the buffer to start copying from
 *        @to: iovec iterator to copy to
 *        @len: amount of data to copy from buffer to iovec
 *
 *        A short copy is not an error when @to has simply run out of
 *        room (fault_short is false).
 *
 *        Returns the result of __skb_datagram_iter(): 0 or -EFAULT.
 */
int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
                           struct iov_iter *to, int len)
{
        /* Emit the tracepoint before any data is copied. */
        trace_skb_copy_datagram_iovec(skb, len);
        return __skb_datagram_iter(skb, offset, to, len, false,
                        simple_copy_to_iter, NULL);
}
EXPORT_SYMBOL(skb_copy_datagram_iter);

/**
 *        skb_copy_datagram_from_iter - Copy a datagram from an iov_iter.
 *        @skb: buffer to copy into
 *        @offset: offset in the buffer to start copying to
 *        @from: the copy source
 *        @len: amount of data to copy to buffer from iovec
 *
 *        Fills the linear head first, then the page frags, then the
 *        skbs on the frag list (by recursion).
 *
 *        Returns 0 when @len bytes were stored, or -EFAULT if a copy
 *        came up short or @len exceeds the space available in @skb.
 */
int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
                                 struct iov_iter *from,
                                 int len)
{
        int start = skb_headlen(skb);
        int copy = start - offset;
        struct sk_buff *frag_iter;
        int i;

        /* Linear head. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                if (copy_from_iter(skb->data + offset, copy, from) != copy)
                        return -EFAULT;
                len -= copy;
                if (len == 0)
                        return 0;
                offset += copy;
        }

        /* Page fragments. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(frag);
                copy = end - offset;
                if (copy > 0) {
                        size_t copied;

                        if (copy > len)
                                copy = len;
                        copied = copy_page_from_iter(skb_frag_page(frag),
                                          skb_frag_off(frag) + offset - start,
                                          copy, from);
                        if (copied != copy)
                                return -EFAULT;

                        len -= copy;
                        if (!len)
                                return 0;
                        offset += copy;
                }
                start = end;
        }

        /* Chained skbs on the frag list. */
        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                copy = end - offset;
                if (copy > 0) {
                        if (copy > len)
                                copy = len;
                        if (skb_copy_datagram_from_iter(frag_iter,
                                                        offset - start,
                                                        from, copy))
                                return -EFAULT;
                        len -= copy;
                        if (len == 0)
                                return 0;
                        offset += copy;
                }
                start = end;
        }

        /* Leftover length means the skb was too small for the request. */
        return len ? -EFAULT : 0;
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);

/*
 * Attach up to @length bytes of the pages behind @from to @skb as page
 * frags, without copying the data.
 *
 * If @msg carries both msg_ubuf and an sg_from_iter callback, the whole job
 * is delegated to that callback.
 *
 * Otherwise pages are fetched with iov_iter_get_pages2() in batches and
 * appended after the skb's existing frags. skb->len, skb->data_len and
 * skb->truesize are updated per batch. The memory is charged to @sk for
 * SOCK_STREAM sockets, or to skb->sk->sk_wmem_alloc in all other cases
 * (including @sk == NULL).
 *
 * Returns 0 on success, -EMSGSIZE when the skb already holds MAX_SKB_FRAGS
 * frags and data remains, or -EFAULT when iov_iter_get_pages2() fails.
 */
int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
                            struct sk_buff *skb, struct iov_iter *from,
                            size_t length)
{
        int frag;

        if (msg && msg->msg_ubuf && msg->sg_from_iter)
                return msg->sg_from_iter(sk, skb, from, length);

        /* New frags are appended after the ones already present. */
        frag = skb_shinfo(skb)->nr_frags;

        while (length && iov_iter_count(from)) {
                struct page *head, *last_head = NULL;
                struct page *pages[MAX_SKB_FRAGS];
                int refs, order, n = 0;
                size_t start;
                ssize_t copied;
                unsigned long truesize;

                if (frag == MAX_SKB_FRAGS)
                        return -EMSGSIZE;

                /* Ask for at most as many pages as there are free frag slots. */
                copied = iov_iter_get_pages2(from, pages, length,
                                            MAX_SKB_FRAGS - frag, &start);
                if (copied < 0)
                        return -EFAULT;

                length -= copied;

                /* Account whole pages, including the offset into the first one. */
                truesize = PAGE_ALIGN(copied + start);
                skb->data_len += copied;
                skb->len += copied;
                skb->truesize += truesize;
                if (sk && sk->sk_type == SOCK_STREAM) {
                        sk_wmem_queued_add(sk, truesize);
                        if (!skb_zcopy_pure(skb))
                                sk_mem_charge(sk, truesize);
                } else {
                        refcount_add(truesize, &skb->sk->sk_wmem_alloc);
                }

                head = compound_head(pages[n]);
                order = compound_order(head);

                /* One pass per page; only the first page has a non-zero start. */
                for (refs = 0; copied != 0; start = 0) {
                        int size = min_t(int, copied, PAGE_SIZE - start);

                        /* Page lies outside the current compound page: re-derive head. */
                        if (pages[n] - head > (1UL << order) - 1) {
                                head = compound_head(pages[n]);
                                order = compound_order(head);
                        }

                        /* Express the offset relative to the compound head page. */
                        start += (pages[n] - head) << PAGE_SHIFT;
                        copied -= size;
                        n++;
                        if (frag) {
                                skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1];

                                /* Contiguous with the previous frag: extend it instead. */
                                if (head == skb_frag_page(last) &&
                                    start == skb_frag_off(last) + skb_frag_size(last)) {
                                        skb_frag_size_add(last, size);
                                        /* We combined this page, we need to release
                                         * a reference. Since compound pages refcount
                                         * is shared among many pages, batch the refcount
                                         * adjustments to limit false sharing.
                                         */
                                        last_head = head;
                                        refs++;
                                        continue;
                                }
                        }
                        /* Flush batched reference drops before starting a new frag. */
                        if (refs) {
                                page_ref_sub(last_head, refs);
                                refs = 0;
                        }
                        skb_fill_page_desc_noacc(skb, frag++, head, start, size);
                }
                /* Drop any references still batched from the last merged pages. */
                if (refs)
                        page_ref_sub(last_head, refs);
        }
        return 0;
}
EXPORT_SYMBOL(__zerocopy_sg_from_iter);

/**
 *        zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
 *        @skb: buffer to fill
 *        @from: the source to copy from
 *
 *        The function first copies up to skb_headlen(@skb) bytes into
 *        the skb, then hands the rest of @from to
 *        __zerocopy_sg_from_iter() to build page frags from it.
 *
 *        Returns 0, -EFAULT or -EMSGSIZE.
 */
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
{
        int head_bytes = min_t(int, skb_headlen(skb), iov_iter_count(from));
        int rc;

        /* Copied part: at most the skb's head length. */
        rc = skb_copy_datagram_from_iter(skb, 0, from, head_bytes);
        if (rc)
                return -EFAULT;

        /* Everything left in @from becomes page frags. */
        return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);

/*
 * Step callback for user-space destinations: copy @len bytes from
 * @from + @progress to @iter_to while checksumming them, and fold the
 * partial checksum into the accumulator at @priv2 at position @progress.
 *
 * Returns 0 when csum_and_copy_to_user() yields a non-zero checksum,
 * otherwise @len.
 * NOTE(review): presumably the return value is the number of bytes left
 * uncopied, with a zero checksum signalling a fault - confirm against
 * iterate_and_advance2() and csum_and_copy_to_user().
 */
static __always_inline
size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress,
                              size_t len, void *from, void *priv2)
{
        __wsum *acc = priv2;
        __wsum partial;

        partial = csum_and_copy_to_user(from + progress, iter_to, len);
        *acc = csum_block_add(*acc, partial, progress);

        if (partial)
                return 0;
        return len;
}

/*
 * Step callback for kernel-memory destinations: copy @len bytes from
 * @from + @progress to @iter_to while checksumming them, and fold the
 * partial checksum into the accumulator at @priv2 at position @progress.
 * Always returns 0.
 */
static __always_inline
size_t memcpy_to_iter_csum(void *iter_to, size_t progress,
                           size_t len, void *from, void *priv2)
{
        __wsum *acc = priv2;
        __wsum partial;

        partial = csum_partial_copy_nocheck(from + progress, iter_to, len);
        *acc = csum_block_add(*acc, partial, progress);

        return 0;
}

/* Running state carried across csum_and_copy_to_iter() calls. */
struct csum_state {
        /* checksum accumulated so far */
        __wsum csum;
        /* number of bytes already folded into @csum */
        size_t off;
};

/*
 * Copy callback for __skb_datagram_iter(): copy @bytes from @addr into @i
 * and extend the running checksum kept in the struct csum_state passed as
 * @_csstate.
 *
 * Returns the number of bytes processed. Returns 0 (after a one-time
 * warning) if @i is a data source.
 */
static size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
                                    struct iov_iter *i)
{
        struct csum_state *state = _csstate;
        __wsum sum;

        if (WARN_ON_ONCE(i->data_source))
                return 0;

        if (unlikely(iov_iter_is_discard(i))) {
                // can't use csum_memcpy() for that one - data is not copied
                __wsum part = csum_partial(addr, bytes, 0);

                state->csum = csum_block_add(state->csum, part, state->off);
                state->off += bytes;
                return bytes;
        }

        /* Shift the checksum by the running offset around the copy. */
        sum = csum_shift(state->csum, state->off);

        bytes = iterate_and_advance2(i, bytes, (void *)addr, &sum,
                                     copy_to_user_iter_csum,
                                     memcpy_to_iter_csum);

        state->csum = csum_shift(sum, state->off);
        state->off += bytes;

        return bytes;
}

/**
 *        skb_copy_and_csum_datagram - Copy datagram to an iovec iterator
 *          and update a checksum.
 *        @skb: buffer to copy
 *        @offset: offset in the buffer to start copying from
 *        @to: iovec iterator to copy to
 *        @len: amount of data to copy from buffer to iovec
 *        @csump: checksum pointer; read as the initial value and, on
 *                success only, updated with the resulting checksum
 *
 *        Any short copy is treated as a fault.
 *
 *        Returns 0 on success or the error from __skb_datagram_iter().
 */
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
                                      struct iov_iter *to, int len,
                                      __wsum *csump)
{
        struct csum_state csdata = { .csum = *csump };
        int ret = __skb_datagram_iter(skb, offset, to, len, true,
                                      csum_and_copy_to_iter, &csdata);

        /* Publish the checksum only if the whole copy succeeded. */
        if (!ret)
                *csump = csdata.csum;

        return ret;
}

/**
 *        skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec.
 *        @skb: skbuff
 *        @hlen: number of leading bytes of @skb that are checksummed
 *               but not copied
 *        @msg: destination
 *
 *        Caller _must_ check that skb will fit to this iovec.
 *
 *        If @msg has less room than the payload, the checksum is
 *        verified over the whole skb first and a plain copy follows.
 *        Otherwise the payload is checksummed while it is copied, and
 *        the iterator is rewound if the checksum turns out to be bad.
 *
 *        Returns: 0       - success.
 *                 -EINVAL - checksum failure.
 *                 -EFAULT - fault during copy.
 */
int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
                                   int hlen, struct msghdr *msg)
{
        int chunk = skb->len - hlen;
        __wsum csum;

        /* Nothing beyond the header: nothing to copy or verify. */
        if (!chunk)
                return 0;

        if (msg_data_left(msg) < chunk) {
                /* Truncating receive: verify up front, then copy. */
                if (__skb_checksum_complete(skb))
                        return -EINVAL;
                if (skb_copy_datagram_msg(skb, hlen, msg, chunk))
                        return -EFAULT;
                return 0;
        }

        /* Seed with the header's checksum, then checksum while copying. */
        csum = csum_partial(skb->data, hlen, skb->csum);
        if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter,
                                       chunk, &csum))
                return -EFAULT;

        if (csum_fold(csum)) {
                /* Bad checksum: take back what was just copied. */
                iov_iter_revert(&msg->msg_iter, chunk);
                return -EINVAL;
        }

        if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
            !skb->csum_complete_sw)
                netdev_rx_csum_fault(NULL, skb);

        return 0;
}
EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);

/**
 *         datagram_poll - generic datagram poll
 *        @file: file struct
 *        @sock: socket
 *        @wait: poll table
 *
 *        Datagram poll: Again totally generic. This also handles
 *        sequenced packet sockets providing the socket receive queue
 *        is only ever holding data ready to receive.
 *
 *        Note: when you *don't* use this routine for this protocol,
 *        and you use a different write policy from sock_writeable()
 *        then please supply your own write_space callback.
 */
__poll_t datagram_poll(struct file *file, struct socket *sock,
                           poll_table *wait)
{
        struct sock *sk = sock->sk;
        __poll_t mask = 0;
        u8 shutdown;

        sock_poll_wait(file, sock, wait);

        /* Pending socket error, or messages on the error queue. */
        if (READ_ONCE(sk->sk_err) ||
            !skb_queue_empty_lockless(&sk->sk_error_queue)) {
                mask |= EPOLLERR;
                if (sock_flag(sk, SOCK_SELECT_ERR_QUEUE))
                        mask |= EPOLLPRI;
        }

        /* Shutdown state. */
        shutdown = READ_ONCE(sk->sk_shutdown);
        if (shutdown & RCV_SHUTDOWN)
                mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
        if (shutdown == SHUTDOWN_MASK)
                mask |= EPOLLHUP;

        /* Data waiting on the receive queue. */
        if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
                mask |= EPOLLIN | EPOLLRDNORM;

        /* Connection-based need to check for termination and startup */
        if (connection_based(sk)) {
                int state = READ_ONCE(sk->sk_state);

                if (state == TCP_CLOSE)
                        mask |= EPOLLHUP;
                /* Connection still being set up: not writable yet. */
                if (state == TCP_SYN_SENT)
                        return mask;
        }

        /* Not writable: set the NOSPACE bit and report what we have. */
        if (!sock_writeable(sk)) {
                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                return mask;
        }

        return mask | EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
}
EXPORT_SYMBOL(datagram_poll);



































































































































































































































































    1 



















































































































































































































































































































































































































































































































































































    1 







































































































































































































    2 






















































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGEMAP_H
#define _LINUX_PAGEMAP_H

/*
 * Copyright 1995 Linus Torvalds
 */
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/compiler.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
#include <linux/bitops.h>
#include <linux/hardirq.h> /* for in_interrupt() */
#include <linux/hugetlb_inline.h>

struct folio_batch;

unsigned long invalidate_mapping_pages(struct address_space *mapping,
                                        pgoff_t start, pgoff_t end);

/*
 * Call invalidate_mapping_pages() on @inode's mapping, but only when the
 * inode is a regular file, a directory or a symlink; any other inode type
 * is left alone.
 */
static inline void invalidate_remote_inode(struct inode *inode)
{
        if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
            !S_ISLNK(inode->i_mode))
                return;

        /* The range is passed as start 0, end -1. */
        invalidate_mapping_pages(inode->i_mapping, 0, -1);
}
int invalidate_inode_pages2(struct address_space *mapping);
int invalidate_inode_pages2_range(struct address_space *mapping,
                pgoff_t start, pgoff_t end);
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count);
void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count);

int write_inode_now(struct inode *, int sync);
int filemap_fdatawrite(struct address_space *);
int filemap_flush(struct address_space *);
int filemap_fdatawait_keep_errors(struct address_space *mapping);
int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend);
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
                loff_t start_byte, loff_t end_byte);
int filemap_invalidate_inode(struct inode *inode, bool flush,
                             loff_t start, loff_t end);

/* filemap_fdatawait_range() over the byte range 0..LLONG_MAX of @mapping. */
static inline int filemap_fdatawait(struct address_space *mapping)
{
        int err = filemap_fdatawait_range(mapping, 0, LLONG_MAX);

        return err;
}

bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend);
int filemap_write_and_wait_range(struct address_space *mapping,
                loff_t lstart, loff_t lend);
int __filemap_fdatawrite_range(struct address_space *mapping,
                loff_t start, loff_t end, int sync_mode);
int filemap_fdatawrite_range(struct address_space *mapping,
                loff_t start, loff_t end);
int filemap_check_errors(struct address_space *mapping);
void __filemap_set_wb_err(struct address_space *mapping, int err);
int filemap_fdatawrite_wbc(struct address_space *mapping,
                           struct writeback_control *wbc);
int kiocb_write_and_wait(struct kiocb *iocb, size_t count);

/* filemap_write_and_wait_range() over the byte range 0..LLONG_MAX of @mapping. */
static inline int filemap_write_and_wait(struct address_space *mapping)
{
        int err = filemap_write_and_wait_range(mapping, 0, LLONG_MAX);

        return err;
}

/**
 * filemap_set_wb_err - set a writeback error on an address_space
 * @mapping: mapping in which to set writeback error
 * @err: error to be set in mapping
 *
 * When writeback fails in some way, we must record that error so that
 * userspace can be informed when fsync and the like are called.  We endeavor
 * to report errors on any file that was open at the time of the error.  Some
 * internal callers also need to know when writeback errors have occurred.
 *
 * When a writeback error occurs, most filesystems will want to call
 * filemap_set_wb_err to record the error in the mapping so that it will be
 * automatically reported whenever fsync is called on the file.
 */
static inline void filemap_set_wb_err(struct address_space *mapping, int err)
{
        /* Expected case: err is zero and there is nothing to record. */
        if (!unlikely(err))
                return;

        __filemap_set_wb_err(mapping, err);
}

/**
 * filemap_check_wb_err - has an error occurred since the mark was sampled?
 * @mapping: mapping to check for writeback errors
 * @since: previously-sampled errseq_t
 *
 * Grab the errseq_t value from the mapping, and see if it has changed "since"
 * the given value was sampled.
 *
 * If it has then report the latest error set, otherwise return 0.
 */
static inline int filemap_check_wb_err(struct address_space *mapping,
                                        errseq_t since)
{
        /* Compare the mapping's writeback error cursor against @since. */
        errseq_t *wb_err = &mapping->wb_err;

        return errseq_check(wb_err, since);
}

/**
 * filemap_sample_wb_err - sample the current errseq_t to test for later errors
 * @mapping: mapping to be sampled
 *
 * Writeback errors are always reported relative to a particular sample point
 * in the past. This function provides those sample points.
 */
static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
{
        /* The sample is taken from the mapping's own wb_err cursor. */
        errseq_t *wb_err = &mapping->wb_err;

        return errseq_sample(wb_err);
}

/**
 * file_sample_sb_err - sample the current errseq_t to test for later errors
 * @file: file pointer to be sampled
 *
 * Grab the most current superblock-level errseq_t value for the given
 * struct file.
 */
static inline errseq_t file_sample_sb_err(struct file *file)
{
        /* Walk file -> dentry -> superblock to reach the s_wb_err cursor. */
        errseq_t *sb_err = &file->f_path.dentry->d_sb->s_wb_err;

        return errseq_sample(sb_err);
}

/*
 * Flush file data before changing attributes.  Caller must hold any locks
 * required to prevent further writes to this file until we're done setting
 * flags.
 */
static inline int inode_drain_writes(struct inode *inode)
{
        int err;

        /* First inode_dio_wait(), then write back and wait on the mapping. */
        inode_dio_wait(inode);
        err = filemap_write_and_wait(inode->i_mapping);

        return err;
}

/* Report whether the i_pages xarray of @mapping is empty, per xa_empty(). */
static inline bool mapping_empty(struct address_space *mapping)
{
        bool empty = xa_empty(&mapping->i_pages);

        return empty;
}

/*
 * mapping_shrinkable - test if page cache state allows inode reclaim
 * @mapping: the page cache mapping
 *
 * This checks the mapping's cache state for the purpose of inode
 * reclaim and LRU management.
 *
 * The caller is expected to hold the i_lock, but is not required to
 * hold the i_pages lock, which usually protects cache state. That's
 * because the i_lock and the list_lru lock that protect the inode and
 * its LRU state don't nest inside the irq-safe i_pages lock.
 *
 * Cache deletions are performed under the i_lock, which ensures that
 * when an inode goes empty, it will reliably get queued on the LRU.
 *
 * Cache additions do not acquire the i_lock and may race with this
 * check, in which case we'll report the inode as shrinkable when it
 * has cache pages. This is okay: the shrinker also checks the
 * refcount and the referenced bit, which will be elevated or set in
 * the process of adding new cache pages to an inode.
 */
static inline bool mapping_shrinkable(struct address_space *mapping)
{
        void *entry;

        /*
         * With highmem, inodes can put pressure on lowmem before the
         * page cache puts pressure on highmem, so inodes are always
         * treated as shrinkable no matter what the cache holds.
         */
        if (IS_ENABLED(CONFIG_HIGHMEM))
                return true;

        entry = rcu_access_pointer(mapping->i_pages.xa_head);

        /*
         * Shrinkable in two cases:
         *  - the head pointer is NULL, i.e. the cache is completely empty;
         *  - the head pointer is a lone value entry rather than a node.
         *    The xarray keeps a single offset-0 entry directly in the head
         *    pointer, so such non-resident entries never appear on the
         *    shadow shrinker's list of xarray nodes and the inode shrinker
         *    has to pick them up under memory pressure.
         */
        return !entry || (!xa_is_node(entry) && xa_is_value(entry));
}

/*
 * Bits in mapping->flags.
 */
enum mapping_flags {
        /* IO error on async write */
        AS_EIO = 0,
        /* ENOSPC on async write */
        AS_ENOSPC = 1,
        /* under mm_take_all_locks() */
        AS_MM_ALL_LOCKS = 2,
        /* e.g., ramdisk, SHM_LOCK */
        AS_UNEVICTABLE = 3,
        /* final truncate in progress */
        AS_EXITING = 4,
        /* writeback related tags are not used */
        AS_NO_WRITEBACK_TAGS = 5,
        AS_LARGE_FOLIO_SUPPORT = 6,
        /* Call ->release_folio(), even if no private data */
        AS_RELEASE_ALWAYS = 7,
        /* must wait for writeback before modifying folio contents */
        AS_STABLE_WRITES = 8,
        /* The mapping cannot be moved, ever */
        AS_UNMOVABLE = 9,
};

/**
 * mapping_set_error - record a writeback error in the address_space
 * @mapping: the mapping in which an error should be set
 * @error: the error to set in the mapping
 *
 * When writeback fails in some way, we must record that error so that
 * userspace can be informed when fsync and the like are called.  We endeavor
 * to report errors on any file that was open at the time of the error.  Some
 * internal callers also need to know when writeback errors have occurred.
 *
 * When a writeback error occurs, most filesystems will want to call
 * mapping_set_error to record the error in the mapping so that it can be
 * reported when the application calls fsync(2).
 */
static inline void mapping_set_error(struct address_space *mapping, int error)
{
        if (likely(!error))
                return;

        /* Record in wb_err for checkers using errseq_t based tracking */
        __filemap_set_wb_err(mapping, error);

        /* Record it in superblock */
        if (mapping->host)
                errseq_set(&mapping->host->i_sb->s_wb_err, error);

        /* Record it in flags for now, for legacy callers */
        if (error == -ENOSPC)
                set_bit(AS_ENOSPC, &mapping->flags);
        else
                set_bit(AS_EIO, &mapping->flags);
}

/* Set the AS_UNEVICTABLE bit in @mapping's flags. */
static inline void mapping_set_unevictable(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        set_bit(AS_UNEVICTABLE, flags);
}

/* Clear the AS_UNEVICTABLE bit in @mapping's flags. */
static inline void mapping_clear_unevictable(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        clear_bit(AS_UNEVICTABLE, flags);
}

/*
 * Test the AS_UNEVICTABLE bit.  A NULL @mapping is accepted and reported
 * as not unevictable.
 */
static inline bool mapping_unevictable(struct address_space *mapping)
{
        if (!mapping)
                return false;

        return test_bit(AS_UNEVICTABLE, &mapping->flags);
}

/* Set the AS_EXITING bit in @mapping's flags. */
static inline void mapping_set_exiting(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        set_bit(AS_EXITING, flags);
}

/* Return the state of the AS_EXITING bit in @mapping's flags. */
static inline int mapping_exiting(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        return test_bit(AS_EXITING, flags);
}

/* Set the AS_NO_WRITEBACK_TAGS bit in @mapping's flags. */
static inline void mapping_set_no_writeback_tags(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        set_bit(AS_NO_WRITEBACK_TAGS, flags);
}

/* Return 1 unless AS_NO_WRITEBACK_TAGS is set on @mapping, in which case 0. */
static inline int mapping_use_writeback_tags(struct address_space *mapping)
{
        if (test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags))
                return 0;

        return 1;
}

/* Test the AS_RELEASE_ALWAYS bit in @mapping's flags. */
static inline bool mapping_release_always(const struct address_space *mapping)
{
        const unsigned long *flags = &mapping->flags;

        return test_bit(AS_RELEASE_ALWAYS, flags);
}

/* Set the AS_RELEASE_ALWAYS bit in @mapping's flags. */
static inline void mapping_set_release_always(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        set_bit(AS_RELEASE_ALWAYS, flags);
}

/* Clear the AS_RELEASE_ALWAYS bit in @mapping's flags. */
static inline void mapping_clear_release_always(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        clear_bit(AS_RELEASE_ALWAYS, flags);
}

/* Test the AS_STABLE_WRITES bit in @mapping's flags. */
static inline bool mapping_stable_writes(const struct address_space *mapping)
{
        const unsigned long *flags = &mapping->flags;

        return test_bit(AS_STABLE_WRITES, flags);
}

/* Set the AS_STABLE_WRITES bit in @mapping's flags. */
static inline void mapping_set_stable_writes(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        set_bit(AS_STABLE_WRITES, flags);
}

/* Clear the AS_STABLE_WRITES bit in @mapping's flags. */
static inline void mapping_clear_stable_writes(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        clear_bit(AS_STABLE_WRITES, flags);
}

/* Mark @mapping as both unevictable and unmovable. */
static inline void mapping_set_unmovable(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        /*
         * An unmovable mapping is expected to be unevictable as well: the
         * compaction migrate scanner (isolate_migratepages_block()) relies
         * on that to cut down on page locking.  Hence AS_UNEVICTABLE is
         * set here too, ahead of AS_UNMOVABLE.
         */
        set_bit(AS_UNEVICTABLE, flags);
        set_bit(AS_UNMOVABLE, flags);
}

/* Test the AS_UNMOVABLE bit in @mapping's flags. */
static inline bool mapping_unmovable(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        return test_bit(AS_UNMOVABLE, flags);
}

/* Return the gfp_mask stored in @mapping. */
static inline gfp_t mapping_gfp_mask(struct address_space *mapping)
{
        gfp_t mask = mapping->gfp_mask;

        return mask;
}

/* Restricts the given gfp_mask to what the mapping allows. */
static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
                gfp_t gfp_mask)
{
        /* Keep only the bits of @gfp_mask that the mapping's mask also has. */
        gfp_t allowed = mapping_gfp_mask(mapping);

        return allowed & gfp_mask;
}

/*
 * This is non-atomic.  Only to be used before the mapping is activated.
 * Probably needs a barrier...
 */
static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
{
        /* Plain store of @mask into the mapping; nothing else is done here. */
        m->gfp_mask = mask;
}

/*
 * There are some parts of the kernel which assume that PMD entries
 * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
 * limit the maximum allocation order to PMD size.  I'm not aware of any
 * assumptions about maximum order if THP are disabled, but 8 seems like
 * a good order (that's 1MB if you're using 4kB pages)
 */
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
/* No THP: fall back to a fixed order of 8. */
#define MAX_PAGECACHE_ORDER        8
#else
/* With THP: cap page cache folios at the PMD order. */
#define MAX_PAGECACHE_ORDER        HPAGE_PMD_ORDER
#endif

/**
 * mapping_set_large_folios() - Indicate the file supports large folios.
 * @mapping: The file.
 *
 * The filesystem should call this function in its inode constructor to
 * indicate that the VFS can use large folios to cache the contents of
 * the file.
 *
 * Context: This should not be called while the inode is active as it
 * is non-atomic.
 */
static inline void mapping_set_large_folios(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        /* __set_bit() is the non-atomic variant, as the Context note says. */
        __set_bit(AS_LARGE_FOLIO_SUPPORT, flags);
}

/*
 * Large folio support currently depends on THP.  These dependencies are
 * being worked on but are not yet fixed.
 */
static inline bool mapping_large_folio_support(struct address_space *mapping)
{
        /*
         * The flag only makes sense for pagecache folios, so warn when the
         * pointer carries the PAGE_MAPPING_ANON bit.
         */
        VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON,
                        "Anonymous mapping always supports large folio");

        /* Without THP the flag bit is not even consulted. */
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return false;

        return test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
}

/* Return the maximum folio size for this pagecache mapping, in bytes. */
static inline size_t mapping_max_folio_size(struct address_space *mapping)
{
        /* One page, unless the mapping supports large folios. */
        size_t max_bytes = PAGE_SIZE;

        if (mapping_large_folio_support(mapping))
                max_bytes = PAGE_SIZE << MAX_PAGECACHE_ORDER;

        return max_bytes;
}

/*
 * Read the mapping's nr_thps counter.  Without CONFIG_READ_ONLY_THP_FOR_FS
 * the counter is not consulted and 0 is returned.
 */
static inline int filemap_nr_thps(struct address_space *mapping)
{
#ifndef CONFIG_READ_ONLY_THP_FOR_FS
        return 0;
#else
        return atomic_read(&mapping->nr_thps);
#endif
}

/*
 * Increment the mapping's nr_thps counter, but only for mappings without
 * large folio support.  Without CONFIG_READ_ONLY_THP_FOR_FS nothing is
 * counted and a one-time warning fires if the mapping lacks large folio
 * support.
 */
static inline void filemap_nr_thps_inc(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        if (mapping_large_folio_support(mapping))
                return;

        atomic_inc(&mapping->nr_thps);
#else
        WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0);
#endif
}

/*
 * Decrement the mapping's nr_thps counter, but only for mappings without
 * large folio support.  Without CONFIG_READ_ONLY_THP_FOR_FS nothing is
 * counted and a one-time warning fires if the mapping lacks large folio
 * support.
 */
static inline void filemap_nr_thps_dec(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        if (mapping_large_folio_support(mapping))
                return;

        atomic_dec(&mapping->nr_thps);
#else
        WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0);
#endif
}

struct address_space *page_mapping(struct page *);
struct address_space *folio_mapping(struct folio *);
struct address_space *swapcache_mapping(struct folio *);

/**
 * folio_file_mapping - Find the mapping this folio belongs to.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the mapping that this
 * page belongs to.  Folios in the swap cache return the mapping of the
 * swap file or swap device where the data is stored.  This is different
 * from the mapping returned by folio_mapping().  The only reason to
 * use it is if, like NFS, you return 0 from ->activate_swapfile.
 *
 * Do not call this for folios which aren't in the page cache or swap cache.
 */
static inline struct address_space *folio_file_mapping(struct folio *folio)
{
        struct address_space *mapping;

        /* Swap cache folios are the rare case and use their own lookup. */
        if (unlikely(folio_test_swapcache(folio)))
                mapping = swapcache_mapping(folio);
        else
                mapping = folio->mapping;

        return mapping;
}

/**
 * folio_flush_mapping - Find the file mapping this folio belongs to.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the mapping that this
 * page belongs to.  Anonymous folios return NULL, even if they're in
 * the swap cache.  Other kinds of folio also return NULL.
 *
 * This is ONLY used by architecture cache flushing code.  If you aren't
 * writing cache flushing code, you want either folio_mapping() or
 * folio_file_mapping().
 */
static inline struct address_space *folio_flush_mapping(struct folio *folio)
{
        /* Swap cache folios yield NULL; everything else uses folio_mapping(). */
        return unlikely(folio_test_swapcache(folio)) ?
                NULL : folio_mapping(folio);
}

/* Page-based wrapper: folio_file_mapping() on the folio containing @page. */
static inline struct address_space *page_file_mapping(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_file_mapping(folio);
}

/**
 * folio_inode - Get the host inode for this folio.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the inode that this folio
 * belongs to.
 *
 * Do not call this for folios which aren't in the page cache.
 */
static inline struct inode *folio_inode(struct folio *folio)
{
        return folio->mapping->host;
}

/**
 * folio_attach_private - Attach private data to a folio.
 * @folio: Folio to attach data to.
 * @data: Data to attach to folio.
 *
 * Attaching private data to a folio increments the page's reference count.
 * The data must be detached before the folio will be freed.
 */
static inline void folio_attach_private(struct folio *folio, void *data)
{
        /* Take the reference that folio_detach_private() later drops. */
        folio_get(folio);
        /* Store the pointer, then set the folio's private flag. */
        folio->private = data;
        folio_set_private(folio);
}

/**
 * folio_change_private - Change private data on a folio.
 * @folio: Folio to change the data on.
 * @data: Data to set on the folio.
 *
 * Change the private data attached to a folio and return the old
 * data.  The page must previously have had data attached and the data
 * must be detached before the folio will be freed.
 *
 * Return: Data that was previously attached to the folio.
 */
static inline void *folio_change_private(struct folio *folio, void *data)
{
        /* Remember the current pointer before it is overwritten. */
        void *previous = folio_get_private(folio);

        /* Neither the refcount nor the private flag is touched here. */
        folio->private = data;

        return previous;
}

/**
 * folio_detach_private - Detach private data from a folio.
 * @folio: Folio to detach data from.
 *
 * Removes the data that was previously attached to the folio and decrements
 * the refcount on the page.
 *
 * Return: Data that was attached to the folio.
 */
static inline void *folio_detach_private(struct folio *folio)
{
        void *detached = folio_get_private(folio);

        if (folio_test_private(folio)) {
                /* Clear the flag and pointer, then drop one reference. */
                folio_clear_private(folio);
                folio->private = NULL;
                folio_put(folio);

                return detached;
        }

        /* The private flag was not set: nothing is detached. */
        return NULL;
}

/* Page-based wrapper: folio_attach_private() on the folio containing @page. */
static inline void attach_page_private(struct page *page, void *data)
{
        struct folio *folio = page_folio(page);

        folio_attach_private(folio, data);
}

/* Page-based wrapper: folio_detach_private() on the folio containing @page. */
static inline void *detach_page_private(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_detach_private(folio);
}

#ifndef CONFIG_NUMA
/* Without NUMA this is a direct call to folio_alloc_noprof(). */
static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
{
        return folio_alloc_noprof(gfp, order);
}
#else
/* With NUMA the function is only declared here and defined out of line. */
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
#endif

/*
 * Forwards its arguments to filemap_alloc_folio_noprof() and passes that
 * call through alloc_hooks().
 */
#define filemap_alloc_folio(...)                                \
        alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__))

/* Allocate an order-0 folio with @gfp and return the address of its page. */
static inline struct page *__page_cache_alloc(gfp_t gfp)
{
        struct folio *folio = filemap_alloc_folio(gfp, 0);

        return &folio->page;
}

/* The mapping's gfp mask with __GFP_NORETRY and __GFP_NOWARN added. */
static inline gfp_t readahead_gfp_mask(struct address_space *x)
{
        gfp_t gfp = mapping_gfp_mask(x);

        gfp |= __GFP_NORETRY | __GFP_NOWARN;

        return gfp;
}

typedef int filler_t(struct file *, struct folio *);

pgoff_t page_cache_next_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan);
pgoff_t page_cache_prev_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan);

/**
 * typedef fgf_t - Flags for getting folios from the page cache.
 *
 * Most users of the page cache will not need to use these flags;
 * there are convenience functions such as filemap_get_folio() and
 * filemap_lock_folio().  For users which need more control over exactly
 * what is done with the folios, these flags to __filemap_get_folio()
 * are available.
 *
 * * %FGP_ACCESSED - The folio will be marked accessed.
 * * %FGP_LOCK - The folio is returned locked.
 * * %FGP_CREAT - If no folio is present then a new folio is allocated,
 *   added to the page cache and the VM's LRU list.  The folio is
 *   returned locked.
 * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
 *   folio is already in cache.  If the folio was allocated, unlock it
 *   before returning so the caller can do the same dance.
 * * %FGP_WRITE - The folio will be written to by the caller.
 * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
 * * %FGP_NOWAIT - Don't block on the folio lock.
 * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
 * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin()
 *   implementation.
 */
typedef unsigned int __bitwise fgf_t;

#define FGP_ACCESSED                ((__force fgf_t)0x00000001)
#define FGP_LOCK                ((__force fgf_t)0x00000002)
#define FGP_CREAT                ((__force fgf_t)0x00000004)
#define FGP_WRITE                ((__force fgf_t)0x00000008)
#define FGP_NOFS                ((__force fgf_t)0x00000010)
#define FGP_NOWAIT                ((__force fgf_t)0x00000020)
#define FGP_FOR_MMAP                ((__force fgf_t)0x00000040)
#define FGP_STABLE                ((__force fgf_t)0x00000080)
/*
 * Extract the order stored in the top 6 bits of an fgf_t.  The argument is
 * parenthesized so the cast applies to the whole expression, even when the
 * caller passes something like (a | b).
 */
#define FGF_GET_ORDER(fgf)        (((__force unsigned)(fgf)) >> 26)        /* top 6 bits */

#define FGP_WRITEBEGIN                (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)

/**
 * fgf_set_order - Encode a length in the fgf_t flags.
 * @size: The suggested size of the folio to create.
 *
 * The caller of __filemap_get_folio() can use this to suggest a preferred
 * size for the folio that is created.  If there is already a folio at
 * the index, it will be returned, no matter what its size.  If a folio
 * is freshly created, it may be of a different size than requested
 * due to alignment constraints, memory pressure, or the presence of
 * other folios at nearby indices.
 */
static inline fgf_t fgf_set_order(size_t size)
{
        unsigned int shift;

        /*
         * log2(0) is undefined, so a zero size must not reach ilog2();
         * treat it as "no size hint", like any size of one page or less.
         */
        if (!size)
                return 0;

        shift = ilog2(size);
        if (shift <= PAGE_SHIFT)
                return 0;

        /* The order above a single page goes into the top bits of the flags. */
        return (__force fgf_t)((shift - PAGE_SHIFT) << 26);
}

void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
                fgf_t fgp_flags, gfp_t gfp);
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
                fgf_t fgp_flags, gfp_t gfp);

/**
 * filemap_get_folio - Find and get a folio.
 * @mapping: The address_space to search.
 * @index: The page index.
 *
 * Looks up the page cache entry at @mapping & @index.  If a folio is
 * present, it is returned with an increased refcount.
 *
 * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for
 * this index.  Will not return a shadow, swap or DAX entry.
 */
static inline struct folio *filemap_get_folio(struct address_space *mapping,
                                        pgoff_t index)
{
        /* Lookup with no FGP flags and a zero gfp mask. */
        struct folio *folio = __filemap_get_folio(mapping, index, 0, 0);

        return folio;
}

/**
 * filemap_lock_folio - Find and lock a folio.
 * @mapping: The address_space to search.
 * @index: The page index.
 *
 * Looks up the page cache entry at @mapping & @index.  If a folio is
 * present, it is returned locked with an increased refcount.
 *
 * Context: May sleep.
 * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for
 * this index.  Will not return a shadow, swap or DAX entry.
 */
static inline struct folio *filemap_lock_folio(struct address_space *mapping,
                                        pgoff_t index)
{
        /* FGP_LOCK is the only flag; the gfp mask is zero. */
        struct folio *folio = __filemap_get_folio(mapping, index, FGP_LOCK, 0);

        return folio;
}

/**
 * filemap_grab_folio - grab a folio from the page cache
 * @mapping: The address space to search
 * @index: The page index
 *
 * Looks up the page cache entry at @mapping & @index. If no folio is found,
 * a new folio is created. The folio is locked, marked as accessed, and
 * returned.
 *
 * Return: A found or created folio. ERR_PTR(-ENOMEM) if no folio is found
 * and failed to create a folio.
 */
static inline struct folio *filemap_grab_folio(struct address_space *mapping,
                                        pgoff_t index)
{
        struct folio *folio;

        /* Lock, mark accessed, create if missing; gfp mask from the mapping. */
        folio = __filemap_get_folio(mapping, index,
                                    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                    mapping_gfp_mask(mapping));

        return folio;
}

/**
 * find_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Looks up the page cache slot at @mapping & @offset.  If there is a
 * page cache page, it is returned with an increased refcount.
 *
 * Otherwise, %NULL is returned.
 */
static inline struct page *find_get_page(struct address_space *mapping,
                                        pgoff_t offset)
{
        /* Lookup with no FGP flags and a zero gfp mask. */
        struct page *page = pagecache_get_page(mapping, offset, 0, 0);

        return page;
}

/* Like find_get_page(), but with caller-supplied @fgp_flags; gfp mask is zero. */
static inline struct page *find_get_page_flags(struct address_space *mapping,
                                        pgoff_t offset, fgf_t fgp_flags)
{
        struct page *page = pagecache_get_page(mapping, offset, fgp_flags, 0);

        return page;
}

/**
 * find_lock_page - locate, pin and lock a pagecache page
 * @mapping: the address_space to search
 * @index: the page index
 *
 * Looks up the page cache entry at @mapping & @index.  If there is a
 * page cache page, it is returned locked and with an increased
 * refcount.
 *
 * Context: May sleep.
 * Return: A struct page or %NULL if there is no page in the cache for this
 * index.
 */
static inline struct page *find_lock_page(struct address_space *mapping,
                                        pgoff_t index)
{
        /* FGP_LOCK is the only flag; the gfp mask is zero. */
        struct page *page = pagecache_get_page(mapping, index, FGP_LOCK, 0);

        return page;
}

/**
 * find_or_create_page - locate or add a pagecache page
 * @mapping: the page's address_space
 * @index: the page's index into the mapping
 * @gfp_mask: page allocation mode
 *
 * Looks up the page cache slot at @mapping & @index.  If there is a
 * page cache page, it is returned locked and with an increased
 * refcount.
 *
 * If the page is not present, a new page is allocated using @gfp_mask
 * and added to the page cache and the VM's LRU list.  The page is
 * returned locked and with an increased refcount.
 *
 * On memory exhaustion, %NULL is returned.
 *
 * find_or_create_page() may sleep, even if @gfp_mask specifies an
 * atomic allocation!
 */
static inline struct page *find_or_create_page(struct address_space *mapping,
                                        pgoff_t index, gfp_t gfp_mask)
{
        struct page *page;

        /* Lock, mark accessed, and create the page if it is missing. */
        page = pagecache_get_page(mapping, index,
                                  FGP_LOCK|FGP_ACCESSED|FGP_CREAT,
                                  gfp_mask);

        return page;
}

/**
 * grab_cache_page_nowait - returns locked page at given index in given cache
 * @mapping: target address_space
 * @index: the page index
 *
 * Same as grab_cache_page(), but do not wait if the page is unavailable.
 * This is intended for speculative data generators, where the data can
 * be regenerated if the page couldn't be grabbed.  This routine should
 * be safe to call while holding the lock for another page.
 *
 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 * and deadlock against the caller's locked page.
 */
static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
                                pgoff_t index)
{
        struct page *page;

        /* FGP_NOFS and FGP_NOWAIT are passed along with lock and create. */
        page = pagecache_get_page(mapping, index,
                                  FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
                                  mapping_gfp_mask(mapping));

        return page;
}

/* Index used for swap cache folios: __page_file_index() on the folio's page. */
#define swapcache_index(folio)        __page_file_index(&(folio)->page)

/**
 * folio_index - File index of a folio.
 * @folio: The folio.
 *
 * For a folio which is either in the page cache or the swap cache,
 * return its index within the address_space it belongs to.  If you know
 * the page is definitely in the page cache, you can look at the folio's
 * index directly.
 *
 * Return: The index (offset in units of pages) of a folio in its file.
 */
static inline pgoff_t folio_index(struct folio *folio)
{
        /* Swap cache folios are the rare case and use swapcache_index(). */
        return unlikely(folio_test_swapcache(folio)) ?
                swapcache_index(folio) : folio->index;
}

/**
 * folio_next_index - Get the index of the next folio.
 * @folio: The current folio.
 *
 * Return: The index of the folio which follows this folio in the file.
 */
static inline pgoff_t folio_next_index(struct folio *folio)
{
        /* This folio's own index, advanced by the number of pages it spans. */
        pgoff_t first = folio->index;

        return first + folio_nr_pages(folio);
}

/**
 * folio_file_page - The page for a particular index.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * Sometimes after looking up a folio in the page cache, we need to
 * obtain the specific page for an index (eg a page fault).
 *
 * Return: The page containing the file data for this index.
 */
static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
{
        /* Mask @index down to an offset within the folio. */
        pgoff_t within = index & (folio_nr_pages(folio) - 1);

        return folio_page(folio, within);
}

/**
 * folio_contains - Does this folio contain this index?
 * @folio: The folio.
 * @index: The page index within the file.
 *
 * Context: The caller should have the page locked in order to prevent
 * (eg) shmem from moving the page between the page cache and swap cache
 * and changing its index in the middle of the operation.
 * Return: true or false.
 */
static inline bool folio_contains(struct folio *folio, pgoff_t index)
{
        /*
         * Distance of @index from the folio's first index.  One comparison
         * covers both ends of the range; presumably this relies on pgoff_t
         * being unsigned, so an @index below the folio wraps to a huge
         * value (confirm against the pgoff_t definition).
         */
        pgoff_t delta = index - folio_index(folio);

        return delta < folio_nr_pages(folio);
}

/*
 * Given the page we found in the page cache, return the page corresponding
 * to this index in the file
 */
static inline struct page *find_subpage(struct page *head, pgoff_t index)
{
        pgoff_t within;

        /* HugeTLBfs always gets the head page back. */
        if (PageHuge(head))
                return head;

        /* Otherwise step from the head by @index masked to the page count. */
        within = index & (thp_nr_pages(head) - 1);

        return head + within;
}

unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch);
unsigned filemap_get_folios_contig(struct address_space *mapping,
                pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch);

struct page *grab_cache_page_write_begin(struct address_space *mapping,
                        pgoff_t index);

/*
 * Returns locked page at given index in given cache, creating it if needed.
 */
static inline struct page *grab_cache_page(struct address_space *mapping,
                                                                pgoff_t index)
{
        struct page *page;

        /* find_or_create_page() with the mapping's own gfp mask. */
        page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));

        return page;
}

struct folio *read_cache_folio(struct address_space *, pgoff_t index,
                filler_t *filler, struct file *file);
struct folio *mapping_read_folio_gfp(struct address_space *, pgoff_t index,
                gfp_t flags);
struct page *read_cache_page(struct address_space *, pgoff_t index,
                filler_t *filler, struct file *file);
extern struct page * read_cache_page_gfp(struct address_space *mapping,
                                pgoff_t index, gfp_t gfp_mask);

/* read_cache_page() with a NULL filler. */
static inline struct page *read_mapping_page(struct address_space *mapping,
                                pgoff_t index, struct file *file)
{
        struct page *page = read_cache_page(mapping, index, NULL, file);

        return page;
}

/* read_cache_folio() with a NULL filler. */
static inline struct folio *read_mapping_folio(struct address_space *mapping,
                                pgoff_t index, struct file *file)
{
        struct folio *folio = read_cache_folio(mapping, index, NULL, file);

        return folio;
}

/*
 * Get the offset in PAGE_SIZE (even for hugetlb pages).
 */
static inline pgoff_t page_to_pgoff(struct page *page)
{
        struct page *head;

        if (likely(!PageTransTail(page)))
                return page->index;

        head = compound_head(page);
        /*
         *  We don't initialize ->index for tail pages: calculate based on
         *  head page
         */
        return head->index + page - head;
}

/*
 * Return byte-offset into filesystem object for page.
 */
static inline loff_t page_offset(struct page *page)
{
        return ((loff_t)page->index) << PAGE_SHIFT;
}

static inline loff_t page_file_offset(struct page *page)
{
        return ((loff_t)page_index(page)) << PAGE_SHIFT;
}

/**
 * folio_pos - Returns the byte position of this folio in its file.
 * @folio: The folio.
 */
static inline loff_t folio_pos(struct folio *folio)
{
        /* Computed as page_offset() of the folio's page member. */
        struct page *page = &folio->page;

        return page_offset(page);
}

/**
 * folio_file_pos - Byte position of a folio within its file.
 * @folio: The folio.
 *
 * The result differs from folio_pos() for folios which belong to a swap
 * file.  NFS is the only filesystem today which needs folio_file_pos().
 *
 * Return: page_file_offset() of the struct page embedded in @folio.
 */
static inline loff_t folio_file_pos(struct folio *folio)
{
        struct page *page = &folio->page;

        return page_file_offset(page);
}

/*
 * Offset of @folio in PAGE_SIZE units (even for hugetlb folios); this is
 * simply folio->index.
 */
static inline pgoff_t folio_pgoff(struct folio *folio)
{
        pgoff_t index = folio->index;

        return index;
}

/*
 * Translate @address inside @vma to a page offset: the number of whole
 * pages between vma->vm_start and @address, plus vma->vm_pgoff.
 */
static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
                                        unsigned long address)
{
        pgoff_t pages_into_vma = (address - vma->vm_start) >> PAGE_SHIFT;

        return pages_into_vma + vma->vm_pgoff;
}

/*
 * Key compared against each struct wait_page_queue by wake_page_match().
 */
struct wait_page_key {
        struct folio *folio;    /* folio the wakeup is for */
        int bit_nr;             /* bit number the wakeup is for */
        int page_match;         /* set to 1 by wake_page_match() when a waiter's folio equals ->folio */
};

/*
 * One waiter: the folio and bit number it waits on (both compared by
 * wake_page_match()) plus its wait queue entry.
 */
struct wait_page_queue {
        struct folio *folio;
        int bit_nr;
        wait_queue_entry_t wait;
};

/*
 * Decide whether the waiter @wait_page is the target of the wakeup
 * described by @key.
 *
 * Returns false if the folios differ.  Otherwise key->page_match is set
 * to 1 (even when the bit numbers turn out to differ) and the result is
 * whether the bit numbers are equal.
 */
static inline bool wake_page_match(struct wait_page_queue *wait_page,
                                  struct wait_page_key *key)
{
        if (wait_page->folio != key->folio)
                return false;

        /* Same folio: record that, whatever the bit comparison says. */
        key->page_match = 1;

        return wait_page->bit_nr == key->bit_nr;
}

/*
 * __folio_lock(), __folio_lock_killable() and __folio_lock_or_retry() are
 * the out-of-line paths that the inline folio_lock*() helpers below call
 * when folio_trylock() fails.
 */
void __folio_lock(struct folio *folio);
int __folio_lock_killable(struct folio *folio);
vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf);
void unlock_page(struct page *page);
void folio_unlock(struct folio *folio);

/**
 * folio_trylock() - Try to take the folio lock without waiting.
 * @folio: The folio to attempt to lock.
 *
 * Waiting for a folio to be unlocked is sometimes undesirable (eg when
 * the locks are being taken in the wrong order, or when making progress
 * through a batch of folios matters more than processing them in order).
 * Usually folio_lock() is the correct function to call.
 *
 * Context: Any context.
 * Return: Whether the lock was successfully acquired.
 */
static inline bool folio_trylock(struct folio *folio)
{
        /* Non-zero means PG_locked was already set, i.e. someone else holds it. */
        bool already_locked = test_and_set_bit_lock(PG_locked,
                                                    folio_flags(folio, 0));

        return likely(!already_locked);
}

/*
 * Try to lock the folio containing @page.
 * Returns true if the lock was successfully taken.
 */
static inline bool trylock_page(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_trylock(folio);
}

/**
 * folio_lock() - Lock this folio.
 * @folio: The folio to lock.
 *
 * The folio lock protects against many things, probably more than it
 * should.  Its primary use is while a folio is being brought uptodate,
 * either from its backing file or from swap.  It is also held while a
 * folio is being truncated from its address_space, so holding the lock
 * is sufficient to keep folio->mapping stable.
 *
 * write() also holds the folio lock while modifying the page, to provide
 * POSIX atomicity guarantees (as long as the write does not cross a page
 * boundary).  Other modifications to the data in the folio do not hold
 * the folio lock and can race with writes, eg DMA and stores to mapped
 * pages.
 *
 * Context: May sleep.  If you need to acquire the locks of two or
 * more folios, they must be in order of ascending index, if they are
 * in the same address_space.  If they are in different address_spaces,
 * acquire the lock of the folio which belongs to the address_space which
 * has the lowest address in memory first.
 */
static inline void folio_lock(struct folio *folio)
{
        might_sleep();

        /* Fast path: uncontended. */
        if (folio_trylock(folio))
                return;

        __folio_lock(folio);
}

/**
 * lock_page() - Lock the folio containing this page.
 * @page: The page to lock.
 *
 * What the lock protects is described at folio_lock().  This is a legacy
 * function; new code should probably use folio_lock() instead.
 *
 * Context: May sleep.  Pages in the same folio share a lock, so do not
 * attempt to lock two pages which share a folio.
 */
static inline void lock_page(struct page *page)
{
        struct folio *folio;

        might_sleep();

        folio = page_folio(page);
        /* Fast path: uncontended. */
        if (folio_trylock(folio))
                return;

        __folio_lock(folio);
}

/**
 * folio_lock_killable() - Lock this folio, interruptible by a fatal signal.
 * @folio: The folio to lock.
 *
 * Behaves like folio_lock(), except that the sleep needed to acquire the
 * lock can be interrupted by a fatal signal.
 *
 * Context: May sleep; see folio_lock().
 * Return: 0 if the lock was acquired; -EINTR if a fatal signal was received.
 */
static inline int folio_lock_killable(struct folio *folio)
{
        might_sleep();

        /* Fast path: uncontended. */
        if (folio_trylock(folio))
                return 0;

        return __folio_lock_killable(folio);
}

/*
 * folio_lock_or_retry - Lock the folio, unless doing so would block and
 * the caller indicated that it can handle a retry.
 *
 * Returns 0 when folio_trylock() succeeds.  Otherwise the return value and
 * the mmap_lock implications depend on flags; see __folio_lock_or_retry().
 */
static inline vm_fault_t folio_lock_or_retry(struct folio *folio,
                                             struct vm_fault *vmf)
{
        might_sleep();

        /* Fast path: uncontended. */
        if (folio_trylock(folio))
                return 0;

        return __folio_lock_or_retry(folio, vmf);
}

/*
 * This is exported only for folio_wait_locked/folio_wait_writeback, etc.,
 * and should not be used directly.
 */
void folio_wait_bit(struct folio *folio, int bit_nr);
int folio_wait_bit_killable(struct folio *folio, int bit_nr);

/*
 * Wait for a folio to be unlocked.
 *
 * The caller must be "holding" the folio, ie have raised its reference
 * count, so that the folio won't go away during the wait.
 */
static inline void folio_wait_locked(struct folio *folio)
{
        /* Nothing to wait for if the folio is not locked right now. */
        if (!folio_test_locked(folio))
                return;

        folio_wait_bit(folio, PG_locked);
}

/*
 * Like folio_wait_locked(), but waits through folio_wait_bit_killable()
 * and returns its result.  Returns 0 at once if the folio is not locked.
 */
static inline int folio_wait_locked_killable(struct folio *folio)
{
        if (folio_test_locked(folio))
                return folio_wait_bit_killable(folio, PG_locked);

        return 0;
}

/* Page wrapper: folio_wait_locked() on the folio containing @page. */
static inline void wait_on_page_locked(struct page *page)
{
        struct folio *folio = page_folio(page);

        folio_wait_locked(folio);
}

void folio_end_read(struct folio *folio, bool success);
void wait_on_page_writeback(struct page *page);
void folio_wait_writeback(struct folio *folio);
int folio_wait_writeback_killable(struct folio *folio);
void end_page_writeback(struct page *page);
void folio_end_writeback(struct folio *folio);
void wait_for_stable_page(struct page *page);
void folio_wait_stable(struct folio *folio);
void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn);
void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb);
void __folio_cancel_dirty(struct folio *folio);
/*
 * Call __folio_cancel_dirty() only when @folio currently tests dirty.
 */
static inline void folio_cancel_dirty(struct folio *folio)
{
        /* Skip the atomic ops, locking, etc. when there is nothing to do. */
        if (!folio_test_dirty(folio))
                return;

        __folio_cancel_dirty(folio);
}
bool folio_clear_dirty_for_io(struct folio *folio);
bool clear_page_dirty_for_io(struct page *page);
void folio_invalidate(struct folio *folio, size_t offset, size_t length);
bool noop_dirty_folio(struct address_space *mapping, struct folio *folio);

#ifdef CONFIG_MIGRATION
int filemap_migrate_folio(struct address_space *mapping, struct folio *dst,
                struct folio *src, enum migrate_mode mode);
#else
#define filemap_migrate_folio NULL
#endif
void folio_end_private_2(struct folio *folio);
void folio_wait_private_2(struct folio *folio);
int folio_wait_private_2_killable(struct folio *folio);

/*
 * Add an arbitrary waiter to a page's wait queue
 */
void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter);

/*
 * Fault in userspace address range.
 */
size_t fault_in_writeable(char __user *uaddr, size_t size);
size_t fault_in_subpage_writeable(char __user *uaddr, size_t size);
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size);
size_t fault_in_readable(const char __user *uaddr, size_t size);

int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                pgoff_t index, gfp_t gfp);
int filemap_add_folio(struct address_space *mapping, struct folio *folio,
                pgoff_t index, gfp_t gfp);
void filemap_remove_folio(struct folio *folio);
void __filemap_remove_folio(struct folio *folio, void *shadow);
void replace_page_cache_folio(struct folio *old, struct folio *new);
void delete_from_page_cache_batch(struct address_space *mapping,
                                  struct folio_batch *fbatch);
bool filemap_release_folio(struct folio *folio, gfp_t gfp);
loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
                int whence);

/* Must be non-static for BPF error injection */
int __filemap_add_folio(struct address_space *mapping, struct folio *folio,
                pgoff_t index, gfp_t gfp, void **shadowp);

bool filemap_range_has_writeback(struct address_space *mapping,
                                 loff_t start_byte, loff_t end_byte);

/**
 * filemap_range_needs_writeback - check if range potentially needs writeback
 * @mapping:           address space within which to check
 * @start_byte:        offset in bytes where the range starts
 * @end_byte:          offset in bytes where the range ends (inclusive)
 *
 * Look for at least one page in the supplied range, typically to find out
 * whether direct writing in this range will trigger a writeback. Used by
 * O_DIRECT read/write with IOCB_NOWAIT, to see if the caller needs to do
 * filemap_write_and_wait_range() before proceeding.
 *
 * Return: %true if the caller should do filemap_write_and_wait_range() before
 * doing O_DIRECT to a page in this range, %false otherwise.
 */
static inline bool filemap_range_needs_writeback(struct address_space *mapping,
                                                 loff_t start_byte,
                                                 loff_t end_byte)
{
        /* An empty mapping has nothing to write back. */
        if (mapping->nrpages == 0)
                return false;

        /* Only walk the range if something is tagged dirty or under writeback. */
        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
            mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
                return filemap_range_has_writeback(mapping, start_byte,
                                                   end_byte);

        return false;
}

/**
 * struct readahead_control - Describes a readahead request.
 *
 * A readahead request is for consecutive pages.  Filesystems which
 * implement the ->readahead method should call readahead_page() or
 * readahead_page_batch() in a loop and attempt to start I/O against
 * each page in the request.
 *
 * Most of the fields in this struct are private and should be accessed
 * by the functions below.
 *
 * @file: The file, used primarily by network filesystems for authentication.
 *          May be NULL if invoked internally by the filesystem.
 * @mapping: Readahead this filesystem object.
 * @ra: File readahead state.  May be NULL.
 */
struct readahead_control {
        struct file *file;
        struct address_space *mapping;
        struct file_ra_state *ra;
/* private: use the readahead_* accessors instead */
        pgoff_t _index;                 /* first page of the request; see readahead_index() */
        unsigned int _nr_pages;         /* pages in the request; see readahead_count() */
        unsigned int _batch_count;      /* pages in the current batch; see readahead_batch_length() */
        bool _workingset;
        unsigned long _pflags;
};

/*
 * Declare a struct readahead_control named @ractl with .file = @f,
 * .ra = @r, .mapping = @m and ._index = @i.  Fields not named in the
 * designated initialiser are zero-initialised.
 */
#define DEFINE_READAHEAD(ractl, f, r, m, i)                                \
        struct readahead_control ractl = {                                \
                .file = f,                                                \
                .mapping = m,                                                \
                .ra = r,                                                \
                ._index = i,                                                \
        }

/* Number of pages that make up 128 KiB (SZ_128K). */
#define VM_READAHEAD_PAGES        (SZ_128K / PAGE_SIZE)

void page_cache_ra_unbounded(struct readahead_control *,
                unsigned long nr_to_read, unsigned long lookahead_count);
void page_cache_sync_ra(struct readahead_control *, unsigned long req_count);
void page_cache_async_ra(struct readahead_control *, struct folio *,
                unsigned long req_count);
void readahead_expand(struct readahead_control *ractl,
                      loff_t new_start, size_t new_len);

/**
 * page_cache_sync_readahead - generic file readahead
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @file: Used by the filesystem for authentication.
 * @index: Index of first page to be read.
 * @req_count: Total number of pages being read by the caller.
 *
 * Call page_cache_sync_readahead() when a cache miss happened: it will
 * submit the read.  The readahead logic may decide to piggyback more pages
 * onto the read request if access patterns suggest it will improve
 * performance.
 *
 * Builds a readahead_control on the stack and hands it to
 * page_cache_sync_ra().
 */
static inline void
page_cache_sync_readahead(struct address_space *mapping,
                          struct file_ra_state *ra, struct file *file,
                          pgoff_t index, unsigned long req_count)
{
        DEFINE_READAHEAD(rac, file, ra, mapping, index);

        page_cache_sync_ra(&rac, req_count);
}

/**
 * page_cache_async_readahead - file readahead for marked pages
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @file: Used by the filesystem for authentication.
 * @folio: The folio at @index which triggered the readahead call.
 * @index: Index of first page to be read.
 * @req_count: Total number of pages being read by the caller.
 *
 * Call page_cache_async_readahead() when a page is used which is marked
 * as PageReadahead; this is a marker to suggest that the application has
 * used up enough of the readahead window that we should start pulling in
 * more pages.
 *
 * Builds a readahead_control on the stack and hands it, together with
 * @folio, to page_cache_async_ra().
 */
static inline void
page_cache_async_readahead(struct address_space *mapping,
                           struct file_ra_state *ra, struct file *file,
                           struct folio *folio, pgoff_t index,
                           unsigned long req_count)
{
        DEFINE_READAHEAD(rac, file, ra, mapping, index);

        page_cache_async_ra(&rac, folio, req_count);
}

/*
 * Step @ractl past the previously returned batch and look up the next
 * folio of the request.
 *
 * Returns NULL (with _batch_count reset to 0) once no pages remain.
 * Otherwise returns the folio stored at the new _index and records its
 * page count in _batch_count so the next call skips over it.
 */
static inline struct folio *__readahead_folio(struct readahead_control *ractl)
{
        struct folio *next = NULL;

        BUG_ON(ractl->_batch_count > ractl->_nr_pages);

        /* Consume the batch handed out by the previous call. */
        ractl->_index += ractl->_batch_count;
        ractl->_nr_pages -= ractl->_batch_count;

        if (ractl->_nr_pages) {
                next = xa_load(&ractl->mapping->i_pages, ractl->_index);
                VM_BUG_ON_FOLIO(!folio_test_locked(next), next);
                ractl->_batch_count = folio_nr_pages(next);
        } else {
                ractl->_batch_count = 0;
        }

        return next;
}

/**
 * readahead_page - Get the next page to read.
 * @ractl: The current readahead request.
 *
 * Context: The page is locked and has an elevated refcount.  The caller
 * should decreases the refcount once the page has been submitted for I/O
 * and unlock the page once all I/O to that page has completed.
 * Return: A pointer to the next page, or %NULL if we are done.
 */
static inline struct page *readahead_page(struct readahead_control *ractl)
{
        struct folio *folio = __readahead_folio(ractl);

        return &folio->page;
}

/**
 * readahead_folio - Get the next folio to read.
 * @ractl: The current readahead request.
 *
 * Context: The folio is locked.  The caller should unlock the folio once
 * all I/O to that folio has completed.
 * Return: A pointer to the next folio, or %NULL if we are done.
 */
static inline struct folio *readahead_folio(struct readahead_control *ractl)
{
        struct folio *folio = __readahead_folio(ractl);

        if (folio)
                folio_put(folio);
        return folio;
}

/*
 * Collect up to @array_sz pages of the readahead request @rac into @array.
 *
 * First steps @rac past the previous batch, then walks the mapping's
 * i_pages xarray from _index to the last index of the request under
 * rcu_read_lock().  _batch_count accumulates thp_nr_pages() of every page
 * stored, so the next call skips exactly what was handed out here.
 *
 * Returns the number of entries written to @array.
 */
static inline unsigned int __readahead_batch(struct readahead_control *rac,
                struct page **array, unsigned int array_sz)
{
        unsigned int i = 0;
        XA_STATE(xas, &rac->mapping->i_pages, 0);
        struct page *page;

        /* Consume the batch handed out by the previous call. */
        BUG_ON(rac->_batch_count > rac->_nr_pages);
        rac->_nr_pages -= rac->_batch_count;
        rac->_index += rac->_batch_count;
        rac->_batch_count = 0;

        xas_set(&xas, rac->_index);
        rcu_read_lock();
        xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) {
                /* Skip entries for which xas_retry() asks for a retry. */
                if (xas_retry(&xas, page))
                        continue;
                VM_BUG_ON_PAGE(!PageLocked(page), page);
                VM_BUG_ON_PAGE(PageTail(page), page);
                array[i++] = page;
                rac->_batch_count += thp_nr_pages(page);
                /* Stop as soon as the caller's array is full. */
                if (i == array_sz)
                        break;
        }
        rcu_read_unlock();

        return i;
}

/**
 * readahead_page_batch - Get a batch of pages to read.
 * @rac: The current readahead request.
 * @array: An array of pointers to struct page.
 *
 * @array must be a real array, not a pointer: its capacity is taken with
 * ARRAY_SIZE() and passed to __readahead_batch().
 *
 * Context: The pages are locked and have an elevated refcount.  The caller
 * should decrease the refcount once the page has been submitted for I/O
 * and unlock the page once all I/O to that page has completed.
 * Return: The number of pages placed in the array.  0 indicates the request
 * is complete.
 */
#define readahead_page_batch(rac, array)                                \
        __readahead_batch(rac, array, ARRAY_SIZE(array))

/**
 * readahead_pos - The byte offset into the file of this readahead request.
 * @rac: The readahead request.
 */
static inline loff_t readahead_pos(struct readahead_control *rac)
{
        return (loff_t)rac->_index * PAGE_SIZE;
}

/**
 * readahead_length - The number of bytes in this readahead request.
 * @rac: The readahead request.
 *
 * Return: The request's page count multiplied by PAGE_SIZE.
 */
static inline size_t readahead_length(struct readahead_control *rac)
{
        const unsigned int nr_pages = rac->_nr_pages;

        return nr_pages * PAGE_SIZE;
}

/**
 * readahead_index - The index of the first page in this readahead request.
 * @rac: The readahead request.
 *
 * Return: The request's current page index.
 */
static inline pgoff_t readahead_index(struct readahead_control *rac)
{
        pgoff_t first = rac->_index;

        return first;
}

/**
 * readahead_count - The number of pages in this readahead request.
 * @rac: The readahead request.
 *
 * Return: The request's page count.
 */
static inline unsigned int readahead_count(struct readahead_control *rac)
{
        unsigned int nr_pages = rac->_nr_pages;

        return nr_pages;
}

/**
 * readahead_batch_length - The number of bytes in the current batch.
 * @rac: The readahead request.
 *
 * Return: The current batch's page count multiplied by PAGE_SIZE.
 */
static inline size_t readahead_batch_length(struct readahead_control *rac)
{
        const unsigned int batch_pages = rac->_batch_count;

        return batch_pages * PAGE_SIZE;
}

/*
 * Number of pages needed to hold inode->i_size bytes, rounding up.  The
 * rounded byte count is converted to unsigned long before the shift.
 */
static inline unsigned long dir_pages(struct inode *inode)
{
        const unsigned long rounded =
                (unsigned long)(inode->i_size + PAGE_SIZE - 1);

        return rounded >> PAGE_SHIFT;
}

/**
 * folio_mkwrite_check_truncate - check if folio was truncated
 * @folio: the folio to check
 * @inode: the inode to check the folio against
 *
 * Return: the number of bytes in the folio up to EOF,
 * or -EFAULT if the folio was truncated.
 */
static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio,
                                              struct inode *inode)
{
        loff_t size = i_size_read(inode);
        pgoff_t index = size >> PAGE_SHIFT;
        size_t offset = offset_in_folio(folio, size);

        if (!folio->mapping)
                return -EFAULT;

        /* folio is wholly inside EOF */
        if (folio_next_index(folio) - 1 < index)
                return folio_size(folio);
        /* folio is wholly past EOF */
        if (folio->index > index || !offset)
                return -EFAULT;
        /* folio is partially inside EOF */
        return offset;
}

/**
 * page_mkwrite_check_truncate - check if page was truncated
 * @page: the page to check
 * @inode: the inode to check the page against
 *
 * Returns the number of bytes in the page up to EOF,
 * or -EFAULT if the page was truncated.
 */
static inline int page_mkwrite_check_truncate(struct page *page,
                                              struct inode *inode)
{
        loff_t isize = i_size_read(inode);
        pgoff_t eof_index = isize >> PAGE_SHIFT;
        int eof_offset = offset_in_page(isize);

        /* The page no longer belongs to this inode's mapping. */
        if (page->mapping != inode->i_mapping)
                return -EFAULT;

        /* Page lies before the EOF page: all of it counts. */
        if (page->index < eof_index)
                return PAGE_SIZE;

        /* Page is the EOF page with a non-zero offset: only part counts. */
        if (page->index <= eof_index && eof_offset)
                return eof_offset;

        /* Page lies wholly past EOF. */
        return -EFAULT;
}

/**
 * i_blocks_per_folio - How many blocks fit in this folio.
 * @inode: The inode which contains the blocks.
 * @folio: The folio.
 *
 * The result is zero when the block size is larger than the size of this
 * folio.
 *
 * Context: The caller should hold a refcount on the folio to prevent it
 * from being split.
 * Return: The number of filesystem blocks covered by this folio.
 */
static inline
unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio)
{
        const unsigned int blkbits = inode->i_blkbits;

        return folio_size(folio) >> blkbits;
}

/* Page wrapper: i_blocks_per_folio() on the folio containing @page. */
static inline
unsigned int i_blocks_per_page(struct inode *inode, struct page *page)
{
        struct folio *folio = page_folio(page);

        return i_blocks_per_folio(inode, folio);
}
#endif /* _LINUX_PAGEMAP_H */







































































































































































































































































































































































    2 





    3 






    1 
    2 






    3 





    3 






    2 
    2 
























    1 











    1 



































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
// SPDX-License-Identifier: GPL-2.0-only
/*
 * lib/bitmap.c
 * Helper functions for bitmap.h.
 */

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/ctype.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/slab.h>

/**
 * DOC: bitmap introduction
 *
 * bitmaps provide an array of bits, implemented using an
 * array of unsigned longs.  The number of valid bits in a
 * given bitmap does _not_ need to be an exact multiple of
 * BITS_PER_LONG.
 *
 * The possible unused bits in the last, partially used word
 * of a bitmap are 'don't care'.  The implementation makes
 * no particular effort to keep them zero.  It ensures that
 * their value will not affect the results of any operation.
 * The bitmap operations that return Boolean (bitmap_empty,
 * for example) or scalar (bitmap_weight, for example) results
 * carefully filter out these unused bits from impacting their
 * results.
 *
 * The byte ordering of bitmaps is more natural on little
 * endian architectures.  See the big-endian headers
 * include/asm-ppc64/bitops.h and include/asm-s390/bitops.h
 * for the best explanations of this ordering.
 */

/*
 * Compare the first @bits bits of @bitmap1 and @bitmap2.
 *
 * Whole words are compared directly; in a trailing partial word only the
 * bits selected by BITMAP_LAST_WORD_MASK(@bits) are compared.
 * Returns true if all compared bits are equal.
 */
bool __bitmap_equal(const unsigned long *bitmap1,
                    const unsigned long *bitmap2, unsigned int bits)
{
        unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int idx;

        for (idx = 0; idx < full_words; idx++) {
                if (bitmap1[idx] != bitmap2[idx])
                        return false;
        }

        /* No partial trailing word: everything has been compared. */
        if (bits % BITS_PER_LONG == 0)
                return true;

        return ((bitmap1[full_words] ^ bitmap2[full_words]) &
                BITMAP_LAST_WORD_MASK(bits)) == 0;
}
EXPORT_SYMBOL(__bitmap_equal);

/*
 * Check whether (@bitmap1 | @bitmap2) equals @bitmap3 over the first
 * @bits bits.
 *
 * Whole words are compared directly; in a trailing partial word only the
 * bits selected by BITMAP_LAST_WORD_MASK(@bits) are compared.
 */
bool __bitmap_or_equal(const unsigned long *bitmap1,
                       const unsigned long *bitmap2,
                       const unsigned long *bitmap3,
                       unsigned int bits)
{
        unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int idx;
        unsigned long diff;

        for (idx = 0; idx < full_words; idx++) {
                if ((bitmap1[idx] | bitmap2[idx]) != bitmap3[idx])
                        return false;
        }

        if (bits % BITS_PER_LONG) {
                /* Only the valid bits of the last word may differ. */
                diff = (bitmap1[full_words] | bitmap2[full_words]) ^
                       bitmap3[full_words];
                if (diff & BITMAP_LAST_WORD_MASK(bits))
                        return false;
        }

        return true;
}

/*
 * Store the bitwise complement of @src in @dst, one word at a time, for
 * all BITS_TO_LONGS(@bits) words.
 */
void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits)
{
        unsigned int nr_words = BITS_TO_LONGS(bits);
        unsigned int idx = 0;

        while (idx < nr_words) {
                dst[idx] = ~src[idx];
                idx++;
        }
}
EXPORT_SYMBOL(__bitmap_complement);

/**
 * __bitmap_shift_right - logical right shift of the bits in a bitmap
 *   @dst : destination bitmap
 *   @src : source bitmap
 *   @shift : shift by this many bits
 *   @nbits : bitmap size, in bits
 *
 * Shifting right (dividing) means moving bits in the MS -> LS bit
 * direction.  Zeros are fed into the vacated MS positions and the
 * LS bits shifted off the bottom are lost.
 */
void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
                        unsigned shift, unsigned nbits)
{
        unsigned k, lim = BITS_TO_LONGS(nbits);
        /* off: whole words to shift by; rem: remaining bit shift within a word */
        unsigned off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;
        /* mask selecting the valid bits of the last source word */
        unsigned long mask = BITMAP_LAST_WORD_MASK(nbits);
        /* Walk upwards: dst[k] is built from src[off + k] and src[off + k + 1]. */
        for (k = 0; off + k < lim; ++k) {
                unsigned long upper, lower;

                /*
                 * If shift is not word aligned, take lower rem bits of
                 * word above and make them the top rem bits of result.
                 */
                if (!rem || off + k + 1 >= lim)
                        upper = 0;
                else {
                        upper = src[off + k + 1];
                        /* The last source word may hold unused bits: drop them. */
                        if (off + k + 1 == lim - 1)
                                upper &= mask;
                        upper <<= (BITS_PER_LONG - rem);
                }
                lower = src[off + k];
                /* Likewise mask the last word before shifting it down. */
                if (off + k == lim - 1)
                        lower &= mask;
                lower >>= rem;
                dst[k] = lower | upper;
        }
        /* Zero the top off words, which received no source bits. */
        if (off)
                memset(&dst[lim - off], 0, off*sizeof(unsigned long));
}
EXPORT_SYMBOL(__bitmap_shift_right);


/**
 * __bitmap_shift_left - logical left shift of the bits in a bitmap
 *   @dst : destination bitmap
 *   @src : source bitmap
 *   @shift : shift by this many bits
 *   @nbits : bitmap size, in bits
 *
 * A left shift (multiplying) moves bits in the LS -> MS direction.
 * The vacated LS bit positions are filled with zeros and the MS bits
 * shifted off the top are lost.
 */

void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
                        unsigned int shift, unsigned int nbits)
{
        unsigned int nr_words = BITS_TO_LONGS(nbits);
        unsigned int word_shift = shift/BITS_PER_LONG;
        unsigned int bit_shift = shift % BITS_PER_LONG;
        int idx;

        /* Walk downwards: dst[idx + word_shift] is built from src[idx] and src[idx - 1]. */
        for (idx = nr_words - word_shift - 1; idx >= 0; idx--) {
                unsigned long carry_in = 0;

                /*
                 * For a shift that is not word aligned, the upper bit_shift
                 * bits of the word below become the bottom bits of the
                 * result.
                 */
                if (bit_shift && idx > 0)
                        carry_in = src[idx - 1] >> (BITS_PER_LONG - bit_shift);

                dst[idx + word_shift] = (src[idx] << bit_shift) | carry_in;
        }

        /* Zero the bottom word_shift words, which received no source bits. */
        if (word_shift)
                memset(dst, 0, word_shift*sizeof(unsigned long));
}
EXPORT_SYMBOL(__bitmap_shift_left);

/**
 * bitmap_cut() - remove bit region from bitmap and right shift remaining bits
 * @dst: destination bitmap, might overlap with src
 * @src: source bitmap
 * @first: start bit of region to be removed
 * @cut: number of bits to remove
 * @nbits: bitmap size, in bits
 *
 * Set the n-th bit of @dst iff the n-th bit of @src is set and
 * n is less than @first, or the m-th bit of @src is set for any
 * m such that @first <= n < nbits, and m = n + @cut.
 *
 * In pictures, example for a big-endian 32-bit architecture:
 *
 * The @src bitmap is::
 *
 *   31                                   63
 *   |                                    |
 *   10000000 11000001 11110010 00010101  10000000 11000001 01110010 00010101
 *                   |  |              |                                    |
 *                  16  14             0                                   32
 *
 * if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is::
 *
 *   31                                   63
 *   |                                    |
 *   10110000 00011000 00110010 00010101  00010000 00011000 00101110 01000010
 *                      |              |                                    |
 *                      14 (bit 17     0                                   32
 *                          from @src)
 *
 * Note that @dst and @src might overlap partially or entirely.
 *
 * This is implemented in the obvious way, with a shift and carry
 * step for each moved bit. Optimisation is left as an exercise
 * for the compiler.
 */
void bitmap_cut(unsigned long *dst, const unsigned long *src,
                unsigned int first, unsigned int cut, unsigned int nbits)
{
        const unsigned int words = BITS_TO_LONGS(nbits);
        const unsigned int first_word = first / BITS_PER_LONG;
        const unsigned int first_bit = first % BITS_PER_LONG;
        unsigned long saved = 0;
        unsigned int w;

        /*
         * Bits below @first that share a word with it must survive the
         * shifting below, so stash them before anything is modified.
         */
        if (first_bit)
                saved = src[first_word] & (~0UL >> (BITS_PER_LONG - first_bit));

        /* memmove() because @dst and @src are allowed to overlap. */
        memmove(dst, src, words * sizeof(*dst));

        /* One single-bit right shift of words [first_word, words) per cut bit. */
        for (; cut; cut--) {
                for (w = first_word; w < words; w++) {
                        unsigned long msb = 0;

                        /* Bit 0 of the next word becomes our top bit. */
                        if (w + 1 < words)
                                msb = dst[w + 1] & 1UL;

                        dst[w] = (dst[w] >> 1) | (msb << (BITS_PER_LONG - 1));
                }
        }

        /* Put the untouched low bits of the first word back in place. */
        dst[first_word] &= ~0UL << first_bit;
        dst[first_word] |= saved;
}
EXPORT_SYMBOL(bitmap_cut);

/*
 * dst = bitmap1 & bitmap2 over the first @bits bits.
 *
 * Whole words are ANDed directly; a trailing partial word is additionally
 * masked with BITMAP_LAST_WORD_MASK(bits) before it is stored.
 *
 * Returns true if any stored word is non-zero.
 */
bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned long seen = 0;
        unsigned long word;
        unsigned int i;

        for (i = 0; i < full_words; i++) {
                word = bitmap1[i] & bitmap2[i];
                dst[i] = word;
                seen |= word;
        }

        if (bits % BITS_PER_LONG) {
                word = bitmap1[i] & bitmap2[i] & BITMAP_LAST_WORD_MASK(bits);
                dst[i] = word;
                seen |= word;
        }

        return seen != 0;
}
EXPORT_SYMBOL(__bitmap_and);

/*
 * dst = bitmap1 | bitmap2, one word at a time.
 *
 * All BITS_TO_LONGS(bits) words are processed in ascending order; no
 * masking is applied to the last word.
 */
void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        unsigned int remaining = BITS_TO_LONGS(bits);

        while (remaining--)
                *dst++ = *bitmap1++ | *bitmap2++;
}
EXPORT_SYMBOL(__bitmap_or);

/*
 * dst = bitmap1 ^ bitmap2, one word at a time.
 *
 * All BITS_TO_LONGS(bits) words are processed in ascending order; no
 * masking is applied to the last word.
 */
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        unsigned int remaining = BITS_TO_LONGS(bits);

        while (remaining--)
                *dst++ = *bitmap1++ ^ *bitmap2++;
}
EXPORT_SYMBOL(__bitmap_xor);

/*
 * dst = bitmap1 & ~bitmap2 over the first @bits bits.
 *
 * Whole words are combined directly; a trailing partial word is additionally
 * masked with BITMAP_LAST_WORD_MASK(bits) before it is stored.
 *
 * Returns true if any stored word is non-zero.
 */
bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned long seen = 0;
        unsigned long word;
        unsigned int i;

        for (i = 0; i < full_words; i++) {
                word = bitmap1[i] & ~bitmap2[i];
                dst[i] = word;
                seen |= word;
        }

        if (bits % BITS_PER_LONG) {
                word = bitmap1[i] & ~bitmap2[i] & BITMAP_LAST_WORD_MASK(bits);
                dst[i] = word;
                seen |= word;
        }

        return seen != 0;
}
EXPORT_SYMBOL(__bitmap_andnot);

/*
 * Per-bit select: where @mask has a bit set the result takes the bit from
 * @new, elsewhere it takes the bit from @old.
 *
 * All BITS_TO_LONGS(nbits) words are processed; no masking is applied to
 * the last word.
 */
void __bitmap_replace(unsigned long *dst,
                      const unsigned long *old, const unsigned long *new,
                      const unsigned long *mask, unsigned int nbits)
{
        const unsigned int words = BITS_TO_LONGS(nbits);
        unsigned int i;

        for (i = 0; i < words; i++) {
                const unsigned long keep_old = old[i] & ~mask[i];
                const unsigned long take_new = new[i] & mask[i];

                dst[i] = keep_old | take_new;
        }
}
EXPORT_SYMBOL(__bitmap_replace);

/*
 * Returns true if @bitmap1 and @bitmap2 have at least one common set bit
 * within the first @bits bits.
 */
bool __bitmap_intersects(const unsigned long *bitmap1,
                         const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int i = 0;

        while (i < full_words) {
                if (bitmap1[i] & bitmap2[i])
                        return true;
                i++;
        }

        /* No partial trailing word: every bit has been examined. */
        if (!(bits % BITS_PER_LONG))
                return false;

        return (bitmap1[i] & bitmap2[i] & BITMAP_LAST_WORD_MASK(bits)) != 0;
}
EXPORT_SYMBOL(__bitmap_intersects);

/*
 * Returns true if, within the first @bits bits, every bit set in @bitmap1
 * is also set in @bitmap2.
 */
bool __bitmap_subset(const unsigned long *bitmap1,
                     const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int i = 0;

        while (i < full_words) {
                /* A bit in bitmap1 that bitmap2 lacks disproves the subset. */
                if (bitmap1[i] & ~bitmap2[i])
                        return false;
                i++;
        }

        /* No partial trailing word: every bit has been examined. */
        if (!(bits % BITS_PER_LONG))
                return true;

        return !(bitmap1[i] & ~bitmap2[i] & BITMAP_LAST_WORD_MASK(bits));
}
EXPORT_SYMBOL(__bitmap_subset);

/*
 * BITMAP_WEIGHT - sum hweight_long() over the first @bits bits
 * @FETCH: expression yielding one bitmap word; it is expanded inside the
 *         macro's loop, where the local variable idx is the word index, so
 *         callers write e.g. bitmap[idx]
 * @bits:  number of bits to account for; evaluated once into __bits
 *
 * Full words are added as they are; a trailing partial word is masked with
 * BITMAP_LAST_WORD_MASK() first.  The statement expression evaluates to the
 * accumulated total, w.
 */
#define BITMAP_WEIGHT(FETCH, bits)        \
({                                                                                \
        unsigned int __bits = (bits), idx, w = 0;                                \
                                                                                \
        for (idx = 0; idx < __bits / BITS_PER_LONG; idx++)                        \
                w += hweight_long(FETCH);                                        \
                                                                                \
        if (__bits % BITS_PER_LONG)                                                \
                w += hweight_long((FETCH) & BITMAP_LAST_WORD_MASK(__bits));        \
                                                                                \
        w;                                                                        \
})

/*
 * Weight (hweight_long() sum) of the first @bits bits of @bitmap.
 * idx is the word index supplied by BITMAP_WEIGHT().
 */
unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
{
        return BITMAP_WEIGHT(bitmap[idx], bits);
}
EXPORT_SYMBOL(__bitmap_weight);

/*
 * Weight of (@bitmap1 & @bitmap2) over the first @bits bits, computed word
 * by word without materialising the intersection.
 * idx is the word index supplied by BITMAP_WEIGHT().
 */
unsigned int __bitmap_weight_and(const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        return BITMAP_WEIGHT(bitmap1[idx] & bitmap2[idx], bits);
}
EXPORT_SYMBOL(__bitmap_weight_and);

/*
 * Weight of (@bitmap1 & ~@bitmap2) over the first @bits bits, computed word
 * by word without materialising the difference.
 * idx is the word index supplied by BITMAP_WEIGHT().
 */
unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        return BITMAP_WEIGHT(bitmap1[idx] & ~bitmap2[idx], bits);
}
EXPORT_SYMBOL(__bitmap_weight_andnot);

/*
 * Set @len consecutive bits of @map, beginning at bit @start.
 *
 * The range is handled one word at a time: a first word masked with
 * BITMAP_FIRST_WORD_MASK(start), then all-ones words, and finally a
 * remainder limited by BITMAP_LAST_WORD_MASK(start + len).
 */
void __bitmap_set(unsigned long *map, unsigned int start, int len)
{
        const unsigned int end = start + len;
        unsigned long *word = &map[BIT_WORD(start)];
        unsigned long mask = BITMAP_FIRST_WORD_MASK(start);
        /* Bits from @start up to the end of its word. */
        int chunk = BITS_PER_LONG - (start % BITS_PER_LONG);

        /* Words that are covered all the way to their top bit. */
        while (len >= chunk) {
                *word++ |= mask;
                len -= chunk;
                chunk = BITS_PER_LONG;
                mask = ~0UL;
        }

        /* Leftover bits that stop short of the end of a word. */
        if (len)
                *word |= mask & BITMAP_LAST_WORD_MASK(end);
}
EXPORT_SYMBOL(__bitmap_set);

/*
 * Clear @len consecutive bits of @map, beginning at bit @start.
 *
 * The range is handled one word at a time: a first word masked with
 * BITMAP_FIRST_WORD_MASK(start), then all-ones words, and finally a
 * remainder limited by BITMAP_LAST_WORD_MASK(start + len).
 */
void __bitmap_clear(unsigned long *map, unsigned int start, int len)
{
        const unsigned int end = start + len;
        unsigned long *word = &map[BIT_WORD(start)];
        unsigned long mask = BITMAP_FIRST_WORD_MASK(start);
        /* Bits from @start up to the end of its word. */
        int chunk = BITS_PER_LONG - (start % BITS_PER_LONG);

        /* Words that are covered all the way to their top bit. */
        while (len >= chunk) {
                *word++ &= ~mask;
                len -= chunk;
                chunk = BITS_PER_LONG;
                mask = ~0UL;
        }

        /* Leftover bits that stop short of the end of a word. */
        if (len)
                *word &= ~(mask & BITMAP_LAST_WORD_MASK(end));
}
EXPORT_SYMBOL(__bitmap_clear);

/**
 * bitmap_find_next_zero_area_off - find a contiguous aligned zero area
 * @map: The address to base the search on
 * @size: The bitmap size in bits
 * @start: The bitnumber to start searching at
 * @nr: The number of zeroed bits we're looking for
 * @align_mask: Alignment mask for zero area
 * @align_offset: Alignment offset for zero area.
 *
 * The @align_mask should be one less than a power of 2; the effect is that
 * the bit offset of all zero areas this function finds plus @align_offset
 * is multiple of that power of 2.
 */
unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
                                             unsigned long size,
                                             unsigned long start,
                                             unsigned int nr,
                                             unsigned long align_mask,
                                             unsigned long align_offset)
{
        unsigned long pos, limit, busy;

        for (;;) {
                pos = find_next_zero_bit(map, size, start);

                /* Round up so that pos + align_offset honours align_mask. */
                pos = __ALIGN_MASK(pos + align_offset, align_mask) - align_offset;

                /*
                 * The candidate area does not fit: hand back its end, which
                 * lies beyond @size, so the caller can tell nothing was found.
                 */
                limit = pos + nr;
                if (limit > size)
                        return limit;

                /* The area is usable if no set bit lies inside [pos, limit). */
                busy = find_next_bit(map, limit, pos);
                if (busy >= limit)
                        return pos;

                /* Retry the search just past the bit that got in the way. */
                start = busy + 1;
        }
}
EXPORT_SYMBOL(bitmap_find_next_zero_area_off);

/**
 * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap
 *        @buf: pointer to a bitmap
 *        @pos: a bit position in @buf (0 <= @pos < @nbits)
 *        @nbits: number of valid bit positions in @buf
 *
 * Map the bit at position @pos in @buf (of length @nbits) to the
 * ordinal of which set bit it is.  If it is not set or if @pos
 * is not a valid bit position, map to -1.
 *
 * If for example, just bits 4 through 7 are set in @buf, then @pos
 * values 4 through 7 will get mapped to 0 through 3, respectively,
 * and other @pos values will get mapped to -1.  When @pos value 7
 * gets mapped to (returns) @ord value 3 in this example, that means
 * that bit 7 is the 3rd (starting with 0th) set bit in @buf.
 *
 * The bit positions 0 through @nbits - 1 are valid positions in @buf.
 */
static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigned int nbits)
{
        /* The ordinal of a set bit is the number of set bits below it. */
        if (pos < nbits && test_bit(pos, buf))
                return bitmap_weight(buf, pos);

        /* Out of range, or the bit at @pos is clear. */
        return -1;
}

/**
 * bitmap_remap - Apply map defined by a pair of bitmaps to another bitmap
 *        @dst: remapped result
 *        @src: subset to be remapped
 *        @old: defines domain of map
 *        @new: defines range of map
 *        @nbits: number of bits in each of these bitmaps
 *
 * Let @old and @new define a mapping of bit positions, such that
 * whatever position is held by the n-th set bit in @old is mapped
 * to the n-th set bit in @new.  In the more general case, allowing
 * for the possibility that the weight 'w' of @new is less than the
 * weight of @old, map the position of the n-th set bit in @old to
 * the position of the m-th set bit in @new, where m == n % w.
 *
 * If either of the @old and @new bitmaps are empty, or if @src and
 * @dst point to the same location, then this routine copies @src
 * to @dst.
 *
 * The positions of unset bits in @old are mapped to themselves
 * (the identity map).
 *
 * Apply the above specified mapping to @src, placing the result in
 * @dst, clearing any bits previously set in @dst.
 *
 * For example, lets say that @old has bits 4 through 7 set, and
 * @new has bits 12 through 15 set.  This defines the mapping of bit
 * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other
 * bit positions unchanged.  So if say @src comes into this routine
 * with bits 1, 5 and 7 set, then @dst should leave with bits 1,
 * 13 and 15 set.
 */
void bitmap_remap(unsigned long *dst, const unsigned long *src,
                const unsigned long *old, const unsigned long *new,
                unsigned int nbits)
{
        unsigned int bit, new_weight;

        /* In-place remapping is not supported; @dst already equals @src. */
        if (dst == src)
                return;
        bitmap_zero(dst, nbits);

        new_weight = bitmap_weight(new, nbits);
        for_each_set_bit(bit, src, nbits) {
                unsigned long target = bit;        /* identity map by default */
                int ord = bitmap_pos_to_ord(old, bit, nbits);

                /* Only bits set in @old are redirected, and only if @new is non-empty. */
                if (ord >= 0 && new_weight != 0)
                        target = find_nth_bit(new, nbits, ord % new_weight);

                set_bit(target, dst);
        }
}
EXPORT_SYMBOL(bitmap_remap);

/**
 * bitmap_bitremap - Apply map defined by a pair of bitmaps to a single bit
 *        @oldbit: bit position to be mapped
 *        @old: defines domain of map
 *        @new: defines range of map
 *        @bits: number of bits in each of these bitmaps
 *
 * Let @old and @new define a mapping of bit positions, such that
 * whatever position is held by the n-th set bit in @old is mapped
 * to the n-th set bit in @new.  In the more general case, allowing
 * for the possibility that the weight 'w' of @new is less than the
 * weight of @old, map the position of the n-th set bit in @old to
 * the position of the m-th set bit in @new, where m == n % w.
 *
 * The positions of unset bits in @old are mapped to themselves
 * (the identity map).
 *
 * Apply the above specified mapping to bit position @oldbit, returning
 * the new bit position.
 *
 * For example, lets say that @old has bits 4 through 7 set, and
 * @new has bits 12 through 15 set.  This defines the mapping of bit
 * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other
 * bit positions unchanged.  So if say @oldbit is 5, then this routine
 * returns 13.
 */
int bitmap_bitremap(int oldbit, const unsigned long *old,
                                const unsigned long *new, int bits)
{
        int new_weight = bitmap_weight(new, bits);
        int ord = bitmap_pos_to_ord(old, oldbit, bits);

        /* @oldbit is set in @old and @new is non-empty: follow the mapping. */
        if (ord >= 0 && new_weight != 0)
                return find_nth_bit(new, bits, ord % new_weight);

        /* Otherwise the position maps to itself. */
        return oldbit;
}
EXPORT_SYMBOL(bitmap_bitremap);

#ifdef CONFIG_NUMA
/**
 * bitmap_onto - translate one bitmap relative to another
 *        @dst: resulting translated bitmap
 *         @orig: original untranslated bitmap
 *         @relmap: bitmap relative to which translated
 *        @bits: number of bits in each of these bitmaps
 *
 * Set the n-th bit of @dst iff there exists some m such that the
 * n-th bit of @relmap is set, the m-th bit of @orig is set, and
 * the n-th bit of @relmap is also the m-th _set_ bit of @relmap.
 * (If you understood the previous sentence the first time you
 * read it, you're overqualified for your current job.)
 *
 * In other words, @orig is mapped onto (surjectively) @dst,
 * using the map { <n, m> | the n-th bit of @relmap is the
 * m-th set bit of @relmap }.
 *
 * Any set bits in @orig above bit number W, where W is the
 * weight of (number of set bits in) @relmap are mapped nowhere.
 * In particular, if for all bits m set in @orig, m >= W, then
 * @dst will end up empty.  In situations where the possibility
 * of such an empty result is not desired, one way to avoid it is
 * to use the bitmap_fold() operator, below, to first fold the
 * @orig bitmap over itself so that all its set bits x are in the
 * range 0 <= x < W.  The bitmap_fold() operator does this by
 * setting the bit (m % W) in @dst, for each bit (m) set in @orig.
 *
 * Example [1] for bitmap_onto():
 *  Let's say @relmap has bits 30-39 set, and @orig has bits
 *  1, 3, 5, 7, 9 and 11 set.  Then on return from this routine,
 *  @dst will have bits 31, 33, 35, 37 and 39 set.
 *
 *  When bit 0 is set in @orig, it means turn on the bit in
 *  @dst corresponding to whatever is the first bit (if any)
 *  that is turned on in @relmap.  Since bit 0 was off in the
 *  above example, we leave off that bit (bit 30) in @dst.
 *
 *  When bit 1 is set in @orig (as in the above example), it
 *  means turn on the bit in @dst corresponding to whatever
 *  is the second bit that is turned on in @relmap.  The second
 *  bit in @relmap that was turned on in the above example was
 *  bit 31, so we turned on bit 31 in @dst.
 *
 *  Similarly, we turned on bits 33, 35, 37 and 39 in @dst,
 *  because they were the 4th, 6th, 8th and 10th set bits
 *  set in @relmap, and the 4th, 6th, 8th and 10th bits of
 *  @orig (i.e. bits 3, 5, 7 and 9) were also set.
 *
 *  When bit 11 is set in @orig, it means turn on the bit in
 *  @dst corresponding to whatever is the twelfth bit that is
 *  turned on in @relmap.  In the above example, there were
 *  only ten bits turned on in @relmap (30..39), so the fact that
 *  bit 11 was set in @orig had no effect on @dst.
 *
 * Example [2] for bitmap_fold() + bitmap_onto():
 *  Let's say @relmap has these ten bits set::
 *
 *                40 41 42 43 45 48 53 61 74 95
 *
 *  (for the curious, that's 40 plus the first ten terms of the
 *  Fibonacci sequence.)
 *
 *  Further lets say we use the following code, invoking
 *  bitmap_fold() then bitmap_onto, as suggested above to
 *  avoid the possibility of an empty @dst result::
 *
 *        unsigned long *tmp;        // a temporary bitmap's bits
 *
 *        bitmap_fold(tmp, orig, bitmap_weight(relmap, bits), bits);
 *        bitmap_onto(dst, tmp, relmap, bits);
 *
 *  Then this table shows what various values of @dst would be, for
 *  various @orig's.  I list the zero-based positions of each set bit.
 *  The tmp column shows the intermediate result, as computed by
 *  using bitmap_fold() to fold the @orig bitmap modulo ten
 *  (the weight of @relmap):
 *
 *      =============== ============== =================
 *      @orig           tmp            @dst
 *      0                0             40
 *      1                1             41
 *      9                9             95
 *      10               0             40 [#f1]_
 *      1 3 5 7          1 3 5 7       41 43 48 61
 *      0 1 2 3 4        0 1 2 3 4     40 41 42 43 45
 *      0 9 18 27        0 9 8 7       40 61 74 95
 *      0 10 20 30       0             40
 *      0 11 22 33       0 1 2 3       40 41 42 43
 *      0 12 24 36       0 2 4 6       40 42 45 53
 *      78 102 211       1 2 8         41 42 74 [#f1]_
 *      =============== ============== =================
 *
 * .. [#f1]
 *
 *     For these marked lines, if we hadn't first done bitmap_fold()
 *     into tmp, then the @dst result would have been empty.
 *
 * If either of @orig or @relmap is empty (no set bits), then @dst
 * will be returned empty.
 *
 * If (as explained above) the only set bits in @orig are in positions
 * m where m >= W, (where W is the weight of @relmap) then @dst will
 * once again be returned empty.
 *
 * All bits in @dst not set by the above rule are cleared.
 */
void bitmap_onto(unsigned long *dst, const unsigned long *orig,
                        const unsigned long *relmap, unsigned int bits)
{
        unsigned int pos;        /* 'n' in the comment above: a set bit of @relmap */
        unsigned int ord = 0;        /* 'm' in the comment above: ordinal of that bit */

        /* In-place mapping is not supported; leave the bitmap untouched. */
        if (dst == orig)
                return;
        bitmap_zero(dst, bits);

        /*
         * Visit the set bits of @relmap while counting them.  This is a
         * more efficient, but less obvious, equivalent to the loop:
         *        for (m = 0; m < bitmap_weight(relmap, bits); m++) {
         *                n = find_nth_bit(relmap, bits, m);
         *                if (test_bit(m, orig))
         *                        set_bit(n, dst);
         *        }
         */
        for_each_set_bit(pos, relmap, bits) {
                /* ord == bitmap_pos_to_ord(relmap, pos, bits) */
                if (test_bit(ord, orig))
                        set_bit(pos, dst);
                ord++;
        }
}

/**
 * bitmap_fold - fold larger bitmap into smaller, modulo specified size
 *        @dst: resulting smaller bitmap
 *        @orig: original larger bitmap
 *        @sz: specified size (the modulus); zero yields an empty @dst
 *        @nbits: number of bits in each of these bitmaps
 *
 * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst.
 * Clear all other bits in @dst.  See further the comment and
 * Example [2] for bitmap_onto() for why and how to use this.
 *
 * If @sz is zero there is no valid residue to fold onto, so @dst is
 * returned empty instead of performing a division by zero.
 *
 * In-place folding is not supported: if @dst and @orig are the same
 * pointer, the function returns without modifying the bitmap.
 */
void bitmap_fold(unsigned long *dst, const unsigned long *orig,
                        unsigned int sz, unsigned int nbits)
{
        unsigned int oldbit;

        if (dst == orig)        /* following doesn't handle inplace mappings */
                return;
        bitmap_zero(dst, nbits);

        if (!sz)                /* oldbit % 0 would be a division by zero */
                return;

        for_each_set_bit(oldbit, orig, nbits)
                set_bit(oldbit % sz, dst);
}
#endif /* CONFIG_NUMA */

/*
 * Allocate storage for a bitmap of @nbits bits: BITS_TO_LONGS(@nbits)
 * unsigned longs requested from kmalloc_array() with the caller's @flags.
 * The result of kmalloc_array() is returned unchanged.
 */
unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags)
{
        return kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long),
                             flags);
}
EXPORT_SYMBOL(bitmap_alloc);

/* Same as bitmap_alloc(), with __GFP_ZERO added to the caller's @flags. */
unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags)
{
        return bitmap_alloc(nbits, flags | __GFP_ZERO);
}
EXPORT_SYMBOL(bitmap_zalloc);

/*
 * Allocate storage for a bitmap of @nbits bits: BITS_TO_LONGS(@nbits)
 * unsigned longs requested from kmalloc_array_node(), passing @flags and
 * @node through.  The result of kmalloc_array_node() is returned unchanged.
 */
unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node)
{
        return kmalloc_array_node(BITS_TO_LONGS(nbits), sizeof(unsigned long),
                                  flags, node);
}
EXPORT_SYMBOL(bitmap_alloc_node);

/* Same as bitmap_alloc_node(), with __GFP_ZERO added to the caller's @flags. */
unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node)
{
        return bitmap_alloc_node(nbits, flags | __GFP_ZERO, node);
}
EXPORT_SYMBOL(bitmap_zalloc_node);

/* Release a bitmap by handing @bitmap to kfree(). */
void bitmap_free(const unsigned long *bitmap)
{
        kfree(bitmap);
}
EXPORT_SYMBOL(bitmap_free);

/*
 * Callback registered by devm_bitmap_alloc() through
 * devm_add_action_or_reset(): @data is the bitmap pointer that was
 * registered, and it is released with bitmap_free().
 */
static void devm_bitmap_free(void *data)
{
        unsigned long *bitmap = data;

        bitmap_free(bitmap);
}

/*
 * Allocate a bitmap of @nbits bits with bitmap_alloc() and register
 * devm_bitmap_free() for it on @dev via devm_add_action_or_reset().
 *
 * Returns the bitmap, or NULL if either the allocation or the registration
 * failed.
 * NOTE(review): on registration failure the bitmap is not freed here;
 * presumably the "_or_reset" helper has already run devm_bitmap_free() -
 * confirm against the devres API.
 */
unsigned long *devm_bitmap_alloc(struct device *dev,
                                 unsigned int nbits, gfp_t flags)
{
        unsigned long *map = bitmap_alloc(nbits, flags);

        if (map && devm_add_action_or_reset(dev, devm_bitmap_free, map))
                map = NULL;

        return map;
}
EXPORT_SYMBOL_GPL(devm_bitmap_alloc);

/* Same as devm_bitmap_alloc(), with __GFP_ZERO added to the caller's @flags. */
unsigned long *devm_bitmap_zalloc(struct device *dev,
                                  unsigned int nbits, gfp_t flags)
{
        return devm_bitmap_alloc(dev, nbits, flags | __GFP_ZERO);
}
EXPORT_SYMBOL_GPL(devm_bitmap_zalloc);

#if BITS_PER_LONG == 64
/**
 * bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap
 *        @bitmap: array of unsigned longs, the destination bitmap
 *        @buf: array of u32 (in host byte order), the source bitmap
 *        @nbits: number of bits in @bitmap
 */
void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits)
{
        const unsigned int count = DIV_ROUND_UP(nbits, 32);
        unsigned int lo;

        /* Consume the u32 array in pairs: even index -> low half, odd -> high half. */
        for (lo = 0; lo < count; lo += 2) {
                bitmap[lo / 2] = (unsigned long) buf[lo];
                if (lo + 1 < count)
                        bitmap[lo / 2] |= ((unsigned long) buf[lo + 1]) << 32;
        }

        /* Clear tail bits in last word beyond nbits. */
        if (nbits % BITS_PER_LONG)
                bitmap[(count - 1) / 2] &= BITMAP_LAST_WORD_MASK(nbits);
}
EXPORT_SYMBOL(bitmap_from_arr32);

/**
 * bitmap_to_arr32 - copy the contents of bitmap to a u32 array of bits
 *        @buf: array of u32 (in host byte order), the dest bitmap
 *        @bitmap: array of unsigned longs, the source bitmap
 *        @nbits: number of bits in @bitmap
 */
void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits)
{
        const unsigned int count = DIV_ROUND_UP(nbits, 32);
        unsigned int i;

        /* Even output slots take the low half of a word, odd slots the high half. */
        for (i = 0; i < count; i++) {
                if (i % 2)
                        buf[i] = (u32) (bitmap[i / 2] >> 32);
                else
                        buf[i] = (u32) (bitmap[i / 2] & UINT_MAX);
        }

        /* Clear tail bits in last element of array beyond nbits. */
        if (nbits % BITS_PER_LONG)
                buf[count - 1] &= (u32) (UINT_MAX >> ((-nbits) & 31));
}
EXPORT_SYMBOL(bitmap_to_arr32);
#endif

#if BITS_PER_LONG == 32
/**
 * bitmap_from_arr64 - copy the contents of u64 array of bits to bitmap
 *        @bitmap: array of unsigned longs, the destination bitmap
 *        @buf: array of u64 (in host byte order), the source bitmap
 *        @nbits: number of bits in @bitmap
 */
void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits)
{
        unsigned long *out = bitmap;
        int left = nbits;

        while (left > 0) {
                const u64 chunk = *buf++;

                /* Low half always; high half only if more than 32 bits remain. */
                *out++ = chunk;
                if (left > 32)
                        *out++ = chunk >> 32;

                left -= 64;
        }

        /*
         * Clear tail bits in the last word beyond nbits.
         *
         * out[-1] is the last word written: after the loop @out points one
         * past it.  For nbits == 0 nothing was written, but then the
         * condition below is false as well, so out[-1] is never touched.
         */
        if (nbits % BITS_PER_LONG)
                out[-1] &= BITMAP_LAST_WORD_MASK(nbits);
}
EXPORT_SYMBOL(bitmap_from_arr64);

/**
 * bitmap_to_arr64 - copy the contents of bitmap to a u64 array of bits
 *        @buf: array of u64 (in host byte order), the dest bitmap
 *        @bitmap: array of unsigned longs, the source bitmap
 *        @nbits: number of bits in @bitmap
 */
void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits)
{
        const unsigned int words = BITS_TO_LONGS(nbits);
        unsigned int i;

        /* Pack two bitmap words into each u64: even word low, odd word high. */
        for (i = 0; i < words; i += 2) {
                *buf = bitmap[i];
                if (i + 1 < words)
                        *buf |= (u64)bitmap[i + 1] << 32;
                buf++;
        }

        /* Clear tail bits in the last element of array beyond nbits. */
        if (nbits % 64)
                buf[-1] &= GENMASK_ULL((nbits - 1) % 64, 0);
}
EXPORT_SYMBOL(bitmap_to_arr64);
#endif












































































































































    4 


    3 
    2 
    3 



















    2 


























































    2 

    2 


    2 














    1 





































    1 


















    2 


    2 

    2 























































































    2 

    2 





    2 
    2 
    2 

























    2 


    2 







    2 







    2 





    2 
    2 























    2 








    2 
    2 
    2 







    2 















    2 











    2 

    2 

    2 






    2 

    2 

















    2 







    1 



    2 

    1 
    2 
    2 




    2 







    1 














    1 




    1 



















    2 













    2 
    2 

    2 










    2 

















    2 
    2 
    2 









    2 






    2 





    2 












    1 














    1 


    1 










    1 






















    1 


















    1 












    1 





    1 


    1 




































































    1 



    2 


















    1 
    1 















    2 







    2 














    2 




    1 


































    2 

























    2 













    2 









    2 


    2 




    2 


































    1 







    1 





    1 


















    1 






    1 












    1 








































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
// SPDX-License-Identifier: GPL-2.0
/*
 * This file contains the procedures for the handling of select and poll
 *
 * Created for Linux based loosely upon Mathius Lattner's minix
 * patches by Peter MacDonald. Heavily edited by Linus.
 *
 *  4 February 1994
 *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
 *     flag set in its personality we do *not* modify the given timeout
 *     parameter to reflect time remaining.
 *
 *  24 January 2000
 *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
 *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
 */

#include <linux/compat.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/personality.h> /* for STICKY_TIMEOUTS */
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/rcupdate.h>
#include <linux/hrtimer.h>
#include <linux/freezer.h>
#include <net/busy_poll.h>
#include <linux/vmalloc.h>

#include <linux/uaccess.h>


/*
 * Estimate the acceptable wake-up slack, in ns, for a relative timeout
 * given as a timespec64.
 *
 * The slack is 0.1% of the timeout (0.5% for tasks with a positive nice
 * value), capped at MAX_SLACK, i.e. 100 msec.  A negative timeout yields
 * no slack at all.
 *
 * Better heuristics are welcome.
 */

#define MAX_SLACK        (100 * NSEC_PER_MSEC)

static long __estimate_accuracy(struct timespec64 *tv)
{
        int divfactor;
        long slack;

        if (tv->tv_sec < 0)
                return 0;

        /* 1/1000 of the timeout normally, 1/200 for niced tasks. */
        divfactor = (task_nice(current) > 0) ? 1000 / 5 : 1000;

        /*
         * Timeouts this long hit the cap anyway; returning early keeps
         * the multiplication below bounded by MAX_SLACK.
         */
        if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC / divfactor))
                return MAX_SLACK;

        slack = tv->tv_sec * (NSEC_PER_SEC / divfactor) +
                tv->tv_nsec / divfactor;

        return slack > MAX_SLACK ? MAX_SLACK : slack;
}

u64 select_estimate_accuracy(struct timespec64 *tv)
{
        u64 ret;
        struct timespec64 now;

        /*
         * Realtime tasks get a slack of 0 for obvious reasons.
         */

        if (rt_task(current))
                return 0;

        ktime_get_ts64(&now);
        now = timespec64_sub(*tv, now);
        ret = __estimate_accuracy(&now);
        if (ret < current->timer_slack_ns)
                return current->timer_slack_ns;
        return ret;
}



/*
 * One page of overflow wait-queue entries, used once the inline entries
 * of a poll_wqueues are exhausted.  Pages are chained through ->next,
 * newest first.
 */
struct poll_table_page {
        struct poll_table_page *next;        /* previously allocated page, or NULL */
        struct poll_table_entry *entry;      /* next unused slot in entries[] */
        struct poll_table_entry entries[];
};

/* True when one more entry after (table)->entry would run past the page. */
#define POLL_TABLE_FULL(table) \
        (PAGE_SIZE + (unsigned long)(table) < (unsigned long)((table)->entry + 1))

/*
 * Ok, Peter made a complicated, but straightforward multiple_wait() function.
 * I have rewritten this, taking some shortcuts: This code may not be easy to
 * follow, but it should be free of race-conditions, and it's practical. If you
 * understand what I'm doing here, then you understand how the linux
 * sleep/wakeup mechanism works.
 *
 * Two very simple procedures, poll_wait() and poll_freewait() make all the
 * work.  poll_wait() is an inline-function defined in <linux/poll.h>,
 * as all select/poll functions have to call it to add an entry to the
 * poll table.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                       poll_table *p);

/*
 * Prepare @pwq for a fresh select/poll run: install __pollwait() as the
 * queueing callback, remember the calling task as the one to wake, and
 * reset the entry storage, error and triggered state.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
        init_poll_funcptr(&pwq->pt, __pollwait);

        /* No entries queued yet, neither inline nor in overflow pages. */
        pwq->inline_index = 0;
        pwq->table = NULL;

        pwq->error = 0;
        pwq->triggered = 0;
        pwq->polling_task = current;
}
EXPORT_SYMBOL(poll_initwait);

/*
 * Undo what __pollwait() did for one entry: take it off its wait queue
 * and drop the file reference held by the entry.
 */
static void free_poll_entry(struct poll_table_entry *entry)
{
        struct file *filp = entry->filp;

        /*
         * NOTE(review): unlinking happens before the fput(); presumably the
         * file reference keeps the wait queue head alive - confirm.
         */
        remove_wait_queue(entry->wait_address, &entry->wait);
        fput(filp);
}

/*
 * Tear down everything queued through @pwq: release all inline entries,
 * then every entry in every overflow page, freeing the pages as we go.
 */
void poll_freewait(struct poll_wqueues *pwq)
{
        struct poll_table_page *page = pwq->table;
        int idx;

        for (idx = 0; idx < pwq->inline_index; idx++)
                free_poll_entry(&pwq->inline_entries[idx]);

        while (page) {
                struct poll_table_page *next = page->next;
                struct poll_table_entry *entry = page->entry;

                /*
                 * page->entry points one past the last used slot; walk
                 * backwards down to the first slot of the page.
                 */
                do {
                        free_poll_entry(--entry);
                } while (entry > page->entries);

                free_page((unsigned long)page);
                page = next;
        }
}
EXPORT_SYMBOL(poll_freewait);

/*
 * Hand out the next free poll_table_entry of @p.
 *
 * Inline entries are used first.  After that, entries come from the most
 * recently allocated overflow page, and a new page is allocated when there
 * is none or it is full.  Returns NULL and sets p->error to -ENOMEM when
 * the page allocation fails.
 */
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
        struct poll_table_page *page;

        if (p->inline_index < N_INLINE_POLL_ENTRIES) {
                struct poll_table_entry *slot;

                slot = &p->inline_entries[p->inline_index];
                p->inline_index++;
                return slot;
        }

        page = p->table;
        if (page && !POLL_TABLE_FULL(page))
                return page->entry++;

        page = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
        if (!page) {
                p->error = -ENOMEM;
                return NULL;
        }

        /* Link the fresh page in at the head of the list. */
        page->entry = page->entries;
        page->next = p->table;
        p->table = page;

        return page->entry++;
}

/*
 * Wake-up helper for select/poll wait-queue entries.
 *
 * @wait->private is the poll_wqueues owning the entry (stored there by
 * __pollwait()).  The poll_wqueues is marked as triggered and its
 * polling_task is woken through default_wake_function(), whose return
 * value is handed back to the caller.
 */
static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
        struct poll_wqueues *pwq = wait->private;
        DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

        /*
         * Although this function is called under waitqueue lock, LOCK
         * doesn't imply write barrier and the users expect write
         * barrier semantics on wakeup functions.  The following
         * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
         * and is paired with smp_store_mb() in poll_schedule_timeout.
         */
        smp_wmb();
        pwq->triggered = 1;

        /*
         * Perform the default wake up operation using a dummy
         * waitqueue.
         *
         * TODO: This is hacky but there currently is no interface to
         * pass in @sync.  @sync is scheduled to be removed and once
         * that happens, wake_up_process() can be used directly.
         */
        return default_wake_function(&dummy_wait, mode, sync, key);
}

/*
 * Wait-queue callback installed by __pollwait().
 *
 * When the waker supplies a @key, the wake-up is ignored (0 is returned)
 * unless the key shares at least one event bit with the events recorded in
 * the entry.  Otherwise the wake-up is forwarded to __pollwake().
 */
static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
        struct poll_table_entry *entry =
                container_of(wait, struct poll_table_entry, wait);

        if (!key || (key_to_poll(key) & entry->key))
                return __pollwake(wait, mode, sync, key);

        return 0;
}

/*
 * Queueing callback: add a new wait entry for @filp on @wait_address.
 *
 * The entry takes its own reference on @filp, remembers the event key that
 * is currently set in the poll_table, and is hooked onto the wait queue
 * with pollwake() as its wake function.  If no entry can be obtained the
 * call silently does nothing; poll_get_entry() has recorded the error.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                                poll_table *p)
{
        struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
        struct poll_table_entry *entry;

        entry = poll_get_entry(pwq);
        if (!entry)
                return;

        entry->key = p->_key;
        entry->wait_address = wait_address;
        entry->filp = get_file(filp);

        init_waitqueue_func_entry(&entry->wait, pollwake);
        entry->wait.private = pwq;
        add_wait_queue(wait_address, &entry->wait);
}

/*
 * Sleep in @state until a wake-up has been recorded in pwq->triggered or
 * the absolute timeout @expires (with @slack) is over.
 *
 * Returns -EINTR when pwq->triggered was already set and no sleep took
 * place, otherwise whatever schedule_hrtimeout_range() returned.  The
 * callers in this file treat a return value of 0 as "timed out".
 * pwq->triggered is cleared again before returning.
 */
static int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
                          ktime_t *expires, unsigned long slack)
{
        int rc = -EINTR;

        /* Set the task state first, then test triggered, then sleep. */
        set_current_state(state);
        if (!pwq->triggered)
                rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
        __set_current_state(TASK_RUNNING);

        /*
         * Prepare for the next iteration.
         *
         * The following smp_store_mb() serves two purposes.  First, it's
         * the counterpart rmb of the wmb in pollwake() such that data
         * written before wake up is always visible after wake up.
         * Second, the full barrier guarantees that triggered clearing
         * doesn't pass event check of the next iteration.  Note that
         * this problem doesn't exist for the first iteration as
         * add_wait_queue() has full barrier semantics.
         */
        smp_store_mb(pwq->triggered, 0);

        return rc;
}

/**
 * poll_select_set_timeout - helper function to setup the timeout value
 * @to:                pointer to timespec64 variable for the final timeout
 * @sec:        seconds (from user space)
 * @nsec:        nanoseconds (from user space)
 *
 * Note: the user space value is deliberately not passed as a timespec, so
 * that the function can be used for timeval and compat interfaces as well.
 *
 * A zero timeout is stored as zero; any other timeout is stored as the
 * current time plus the given interval.
 *
 * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
 */
int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
{
        struct timespec64 interval = { .tv_sec = sec, .tv_nsec = nsec };

        if (!timespec64_valid(&interval))
                return -EINVAL;

        /* Optimize for the zero timeout value here */
        if (sec == 0 && nsec == 0) {
                to->tv_sec = 0;
                to->tv_nsec = 0;
                return 0;
        }

        ktime_get_ts64(to);
        *to = timespec64_add_safe(*to, interval);
        return 0;
}

/*
 * Layout of the user space timeout object that poll_select_finish() has to
 * write the remaining time back to.
 */
enum poll_time_type {
        PT_TIMEVAL = 0,                /* struct __kernel_old_timeval */
        PT_OLD_TIMEVAL = 1,        /* struct old_timeval32 */
        PT_TIMESPEC = 2,        /* written with put_timespec64() */
        PT_OLD_TIMESPEC = 3,        /* written with put_old_timespec32() */
};

/*
 * Common epilogue of the select/poll style syscalls.
 *
 * Restores the saved signal mask unless @ret is -ERESTARTNOHAND and, when
 * the caller passed a timeout object @p, writes the time remaining until
 * @end_time back to user space in the layout selected by @pt_type.
 *
 * Returns @ret unchanged when nothing had to be written or the write-back
 * succeeded.  When the write-back is skipped because of STICKY_TIMEOUTS,
 * or when it fails, -ERESTARTNOHAND is converted to -EINTR.
 */
static int poll_select_finish(struct timespec64 *end_time,
                              void __user *p,
                              enum poll_time_type pt_type, int ret)
{
        struct timespec64 rts;
        int failed = 1;

        restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);

        if (!p)
                return ret;

        if (current->personality & STICKY_TIMEOUTS)
                goto sticky;

        /* No update for zero timeout */
        if (end_time->tv_sec == 0 && end_time->tv_nsec == 0)
                return ret;

        /* Remaining time, clamped to zero once the deadline has passed. */
        ktime_get_ts64(&rts);
        rts = timespec64_sub(*end_time, rts);
        if (rts.tv_sec < 0) {
                rts.tv_sec = 0;
                rts.tv_nsec = 0;
        }

        switch (pt_type) {
        case PT_TIMEVAL: {
                struct __kernel_old_timeval rtv;

                /*
                 * The whole struct is copied out; if it contains padding,
                 * clear it first so no uninitialised bytes are copied.
                 */
                if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
                        memset(&rtv, 0, sizeof(rtv));
                rtv.tv_sec = rts.tv_sec;
                rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
                failed = copy_to_user(p, &rtv, sizeof(rtv)) != 0;
                break;
        }
        case PT_OLD_TIMEVAL: {
                struct old_timeval32 rtv;

                rtv.tv_sec = rts.tv_sec;
                rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
                failed = copy_to_user(p, &rtv, sizeof(rtv)) != 0;
                break;
        }
        case PT_TIMESPEC:
                failed = put_timespec64(&rts, p) != 0;
                break;
        case PT_OLD_TIMESPEC:
                failed = put_old_timespec32(&rts, p) != 0;
                break;
        default:
                BUG();
        }

        if (!failed)
                return ret;

        /*
         * If an application puts its timeval in read-only memory, we
         * don't want the Linux-specific update to the timeval to
         * cause a fault after the select has completed
         * successfully. However, because we're not updating the
         * timeval, we can't restart the system call.
         */

sticky:
        if (ret == -ERESTARTNOHAND)
                ret = -EINTR;
        return ret;
}

/*
 * Scalable version of the fd_set.
 */

typedef struct {
        /* Sets requested by the caller: readable, writable, exceptional. */
        unsigned long *in;
        unsigned long *out;
        unsigned long *ex;
        /* Matching result sets that are handed back to the caller. */
        unsigned long *res_in;
        unsigned long *res_out;
        unsigned long *res_ex;
} fd_set_bits;

/*
 * How many longwords for "nr" bits?
 */
/* Bits in one long, and longs / bytes needed to hold "nr" bits. */
#define FDS_BITPERLONG        (sizeof(long) * 8)
#define FDS_LONGS(nr)        (((nr) + (FDS_BITPERLONG - 1)) / FDS_BITPERLONG)
#define FDS_BYTES(nr)        (sizeof(long) * FDS_LONGS(nr))

/*
 * Fetch an fd_set of @nr bits from user space into @fdset.
 *
 * The copy is done in whole longs so that user-mode fd_sets may be
 * long-aligned.  A NULL @ufdset is treated as an empty set: @fdset is
 * zeroed instead.  Returns 0 on success or -EFAULT when the copy fails.
 */
static inline
int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
        unsigned long bytes = FDS_BYTES(nr);

        if (!ufdset) {
                memset(fdset, 0, bytes);
                return 0;
        }

        if (copy_from_user(fdset, ufdset, bytes))
                return -EFAULT;

        return 0;
}

/*
 * Write the @nr-bit result set @fdset back to user space.  A NULL @ufdset
 * means the caller did not ask for this set, so nothing is copied.
 * Returns the value of __copy_to_user(), i.e. 0 on success.
 */
static inline unsigned long __must_check
set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
        if (!ufdset)
                return 0;

        return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
}

/* Clear the first @nr bits of @fdset, rounded up to whole longs. */
static inline
void zero_fd_set(unsigned long nr, unsigned long *fdset)
{
        unsigned long bytes = FDS_BYTES(nr);

        memset(fdset, 0, bytes);
}

/*
 * Address of the n-th long of the in/out/ex set of an fd_set_bits.
 * Both arguments are parenthesized so that arbitrary expressions (for
 * example "&set" or "a & b") expand correctly.
 */
#define FDS_IN(fds, n)                ((fds)->in + (n))
#define FDS_OUT(fds, n)                ((fds)->out + (n))
#define FDS_EX(fds, n)                ((fds)->ex + (n))

/* Union of the requested in/out/ex bits in the n-th long. */
#define BITS(fds, n)        (*FDS_IN(fds, n) | *FDS_OUT(fds, n) | *FDS_EX(fds, n))

/*
 * Validate the requested descriptors and find the highest one.
 *
 * Looks at the first @n bits of the in/out/ex sets of @fds.  Returns
 * -EBADF if any requested bit is not set in the open_fds bitmap of the
 * current task's fdtable.  Otherwise returns the highest requested fd
 * plus one, or 0 when no bit is requested at all.
 *
 * do_select() calls this with rcu_read_lock() held.
 */
static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
        unsigned long *open_fds;
        unsigned long set;
        int max;
        struct fdtable *fdt;

        /* handle last in-complete long-word first */
        /* mask covering the low (n % BITS_PER_LONG) bits; 0 if n is aligned */
        set = ~(~0UL << (n & (BITS_PER_LONG-1)));
        n /= BITS_PER_LONG;
        fdt = files_fdtable(current->files);
        open_fds = fdt->open_fds + n;
        max = 0;
        if (set) {
                set &= BITS(fds, n);
                if (set) {
                        /* every requested bit must belong to an open fd */
                        if (!(set & ~*open_fds))
                                goto get_max;
                        return -EBADF;
                }
        }
        /* walk the remaining full words from the highest down to word 0 */
        while (n) {
                open_fds--;
                n--;
                set = BITS(fds, n);
                if (!set)
                        continue;
                if (set & ~*open_fds)
                        return -EBADF;
                /* max is already known; lower words only need validating */
                if (max)
                        continue;
get_max:
                /* position of the highest set bit in this word, plus one */
                do {
                        max++;
                        set >>= 1;
                } while (set);
                max += n * BITS_PER_LONG;
        }

        return max;
}

/*
 * Poll event masks that count as "readable", "writable" and "exceptional"
 * for select(): do_select() sets an fd in the corresponding result set
 * when vfs_poll() reports any bit of the mask for it.
 */
#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\
                        EPOLLNVAL)
#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\
                         EPOLLNVAL)
#define POLLEX_SET (EPOLLPRI | EPOLLNVAL)

/*
 * Set the event key of @wait for the descriptor selected by @bit.
 *
 * The key always contains POLLEX_SET and @ll_flag; POLLIN_SET and
 * POLLOUT_SET are added when @bit is set in @in and @out respectively.
 */
static inline void wait_key_set(poll_table *wait, unsigned long in,
                                unsigned long out, unsigned long bit,
                                __poll_t ll_flag)
{
        __poll_t key = POLLEX_SET | ll_flag;

        if (in & bit)
                key |= POLLIN_SET;
        if (out & bit)
                key |= POLLOUT_SET;

        wait->_key = key;
}

/*
 * Core of select(): poll the first @n descriptors named in @fds until at
 * least one is ready, the timeout expires or a signal is pending.
 *
 * @fds holds the requested in/out/ex bitmaps and the res_* bitmaps that
 * receive the result.  @end_time is the absolute expiry time; NULL means
 * no expiry is passed to the sleep, and an all-zero value means "do not
 * wait at all".
 *
 * Returns the number of bits set in the result bitmaps, 0 when nothing
 * became ready, or a negative error (-EBADF from max_select_fd(), or the
 * error recorded in the poll table).
 */
static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
        ktime_t expire, *to = NULL;
        struct poll_wqueues table;
        poll_table *wait;
        int retval, i, timed_out = 0;
        u64 slack = 0;
        __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
        unsigned long busy_start = 0;

        rcu_read_lock();
        retval = max_select_fd(n, fds);
        rcu_read_unlock();

        if (retval < 0)
                return retval;
        /* only scan up to the highest descriptor actually requested */
        n = retval;

        poll_initwait(&table);
        wait = &table.pt;
        /* zero timeout: a single pass, without queueing on any wait queue */
        if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                wait->_qproc = NULL;
                timed_out = 1;
        }

        if (end_time && !timed_out)
                slack = select_estimate_accuracy(end_time);

        retval = 0;
        for (;;) {
                unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
                bool can_busy_loop = false;

                inp = fds->in; outp = fds->out; exp = fds->ex;
                rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

                /* one iteration per long of the bitmaps; i is the fd number */
                for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
                        unsigned long in, out, ex, all_bits, bit = 1, j;
                        unsigned long res_in = 0, res_out = 0, res_ex = 0;
                        __poll_t mask;

                        in = *inp++; out = *outp++; ex = *exp++;
                        all_bits = in | out | ex;
                        /* nothing requested in this long: skip it entirely */
                        if (all_bits == 0) {
                                i += BITS_PER_LONG;
                                continue;
                        }

                        for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
                                struct fd f;
                                if (i >= n)
                                        break;
                                if (!(bit & all_bits))
                                        continue;
                                /* reported as EPOLLNVAL if the fd has no file */
                                mask = EPOLLNVAL;
                                f = fdget(i);
                                if (f.file) {
                                        wait_key_set(wait, in, out, bit,
                                                     busy_flag);
                                        mask = vfs_poll(f.file, wait);

                                        fdput(f);
                                }
                                /*
                                 * Once something is ready there is no need
                                 * to queue further waiters: clear _qproc.
                                 */
                                if ((mask & POLLIN_SET) && (in & bit)) {
                                        res_in |= bit;
                                        retval++;
                                        wait->_qproc = NULL;
                                }
                                if ((mask & POLLOUT_SET) && (out & bit)) {
                                        res_out |= bit;
                                        retval++;
                                        wait->_qproc = NULL;
                                }
                                if ((mask & POLLEX_SET) && (ex & bit)) {
                                        res_ex |= bit;
                                        retval++;
                                        wait->_qproc = NULL;
                                }
                                /* got something, stop busy polling */
                                if (retval) {
                                        can_busy_loop = false;
                                        busy_flag = 0;

                                /*
                                 * only remember a returned
                                 * POLL_BUSY_LOOP if we asked for it
                                 */
                                } else if (busy_flag & mask)
                                        can_busy_loop = true;

                        }
                        if (res_in)
                                *rinp = res_in;
                        if (res_out)
                                *routp = res_out;
                        if (res_ex)
                                *rexp = res_ex;
                        cond_resched();
                }
                /* the first pass queued the waiters; do not queue them again */
                wait->_qproc = NULL;
                if (retval || timed_out || signal_pending(current))
                        break;
                if (table.error) {
                        retval = table.error;
                        break;
                }

                /* only if found POLL_BUSY_LOOP sockets && not out of time */
                if (can_busy_loop && !need_resched()) {
                        if (!busy_start) {
                                busy_start = busy_loop_current_time();
                                continue;
                        }
                        if (!busy_loop_timeout(busy_start))
                                continue;
                }
                busy_flag = 0;

                /*
                 * If this is the first loop and we have a timeout
                 * given, then we convert to ktime_t and set the to
                 * pointer to the expiry value.
                 */
                if (end_time && !to) {
                        expire = timespec64_to_ktime(*end_time);
                        to = &expire;
                }

                /* a return of 0 means the timeout expired: one last pass */
                if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
                                           to, slack))
                        timed_out = 1;
        }

        poll_freewait(&table);

        return retval;
}

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 */
/*
 * Copy the three fd_sets in from user space, run do_select() on them and
 * copy the result sets back.
 *
 * @n is clamped to the current max_fds of the task's fdtable.  The six
 * bitmaps live on the stack when they are small enough and are allocated
 * with kvmalloc() otherwise.
 *
 * Returns the do_select() result, -EINVAL for a negative @n, -ENOMEM when
 * the bitmaps cannot be allocated, -EFAULT on a failed user copy, or
 * -ERESTARTNOHAND when nothing is ready and a signal is pending.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
                           fd_set __user *exp, struct timespec64 *end_time)
{
        /* Allocate small arguments on the stack to save memory and be faster */
        long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
        struct fdtable *fdt;
        fd_set_bits fds;
        void *bits = stack_fds;
        size_t size;
        int ret, max_fds;

        if (n < 0)
                return -EINVAL;

        /* max_fds can increase, so grab it once to avoid race */
        rcu_read_lock();
        fdt = files_fdtable(current->files);
        max_fds = fdt->max_fds;
        rcu_read_unlock();
        if (n > max_fds)
                n = max_fds;

        /*
         * Six bitmaps are needed (in/out/ex, each for the incoming and
         * the outgoing direction).  fd_sets are handled in units of
         * long-words, hence FDS_BYTES().
         */
        size = FDS_BYTES(n);
        if (size > sizeof(stack_fds) / 6) {
                /* Not enough space in on-stack array; must use kmalloc */
                if (size > (SIZE_MAX / 6))
                        return -ENOMEM;

                bits = kvmalloc(6 * size, GFP_KERNEL);
                if (!bits)
                        return -ENOMEM;
        }

        fds.in      = bits;
        fds.out     = bits +   size;
        fds.ex      = bits + 2*size;
        fds.res_in  = bits + 3*size;
        fds.res_out = bits + 4*size;
        fds.res_ex  = bits + 5*size;

        ret = get_fd_set(n, inp, fds.in);
        if (ret)
                goto out;
        ret = get_fd_set(n, outp, fds.out);
        if (ret)
                goto out;
        ret = get_fd_set(n, exp, fds.ex);
        if (ret)
                goto out;

        zero_fd_set(n, fds.res_in);
        zero_fd_set(n, fds.res_out);
        zero_fd_set(n, fds.res_ex);

        ret = do_select(n, &fds, end_time);
        if (ret < 0)
                goto out;

        /* Nothing ready but a signal is pending: results are not copied. */
        if (ret == 0 && signal_pending(current)) {
                ret = -ERESTARTNOHAND;
                goto out;
        }

        if (set_fd_set(n, inp, fds.res_in) ||
            set_fd_set(n, outp, fds.res_out) ||
            set_fd_set(n, exp, fds.res_ex))
                ret = -EFAULT;

out:
        if (bits != stack_fds)
                kvfree(bits);
        return ret;
}

/*
 * select() with a struct __kernel_old_timeval timeout.
 *
 * Reads the optional timeout from @tvp, converts it into an absolute
 * expiry time, runs core_sys_select() and lets poll_select_finish() write
 * the remaining time back.  Returns -EFAULT when @tvp cannot be read and
 * -EINVAL when the timeout is rejected by poll_select_set_timeout().
 */
static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
                       fd_set __user *exp, struct __kernel_old_timeval __user *tvp)
{
        struct timespec64 end_time, *to = NULL;
        int ret;

        if (tvp) {
                struct __kernel_old_timeval tv;
                time64_t sec;
                long nsec;

                if (copy_from_user(&tv, tvp, sizeof(tv)))
                        return -EFAULT;

                /* Fold whole seconds contained in tv_usec into the seconds. */
                sec = tv.tv_sec + (tv.tv_usec / USEC_PER_SEC);
                nsec = (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC;

                to = &end_time;
                if (poll_select_set_timeout(to, sec, nsec))
                        return -EINVAL;
        }

        ret = core_sys_select(n, inp, outp, exp, to);

        return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret);
}

/* select() system call entry point; all work is done in kern_select(). */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
                fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp)
{
        return kern_select(n, inp, outp, exp, tvp);
}

/*
 * Common implementation of the pselect6 variants.
 *
 * @tsp is an optional user space timeout whose layout is given by @type
 * (PT_TIMESPEC or PT_OLD_TIMESPEC; anything else is a BUG).  @sigmask and
 * @sigsetsize are handed to set_user_sigmask() before selecting.
 *
 * Returns -EFAULT when the timeout cannot be read, -EINVAL when it is
 * rejected by poll_select_set_timeout(), the set_user_sigmask() error if
 * that fails, and otherwise the result of poll_select_finish().
 */
static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
                       fd_set __user *exp, void __user *tsp,
                       const sigset_t __user *sigmask, size_t sigsetsize,
                       enum poll_time_type type)
{
        struct timespec64 ts, end_time, *to = NULL;
        int ret;

        if (tsp) {
                int err = 0;

                switch (type) {
                case PT_TIMESPEC:
                        err = get_timespec64(&ts, tsp);
                        break;
                case PT_OLD_TIMESPEC:
                        err = get_old_timespec32(&ts, tsp);
                        break;
                default:
                        BUG();
                }
                if (err)
                        return -EFAULT;

                to = &end_time;
                if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
                        return -EINVAL;
        }

        ret = set_user_sigmask(sigmask, sigsetsize);
        if (ret)
                return ret;

        ret = core_sys_select(n, inp, outp, exp, to);

        return poll_select_finish(&end_time, tsp, type, ret);
}

/*
 * Most architectures can't handle 7-argument syscalls. So we provide a
 * 6-argument version where the sixth argument is a pointer to a structure
 * which has a pointer to the sigset_t itself followed by a size_t containing
 * the sigset size.
 */
/* Sixth pselect6 argument as laid out in user space. */
struct sigset_argpack {
        sigset_t __user *p;        /* user pointer to the signal mask */
        size_t size;                /* passed on as sigsetsize to do_pselect() */
};

/*
 * Fetch the pselect6 sigset argument pack from user space.
 *
 * @to:   kernel copy to fill in; left untouched when @from is NULL
 * @from: user pointer to the pack, may be NULL
 *
 * Returns 0 on success (including a NULL @from) and -EFAULT when the user
 * memory cannot be accessed.
 *
 * The access window is opened with user_read_access_begin() and must
 * therefore be closed with user_read_access_end() on every path,
 * including the fault path.
 */
static inline int get_sigset_argpack(struct sigset_argpack *to,
                                     struct sigset_argpack __user *from)
{
        // the path is hot enough for overhead of copy_from_user() to matter
        if (from) {
                if (!user_read_access_begin(from, sizeof(*from)))
                        return -EFAULT;
                unsafe_get_user(to->p, &from->p, Efault);
                unsafe_get_user(to->size, &from->size, Efault);
                user_read_access_end();
        }
        return 0;
Efault:
        /* Reached only from unsafe_get_user() inside the read window. */
        user_read_access_end();
        return -EFAULT;
}

/*
 * pselect6() with a struct __kernel_timespec timeout.  @sig points to a
 * struct sigset_argpack in user space, or is NULL for "no signal mask".
 */
SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
                fd_set __user *, exp, struct __kernel_timespec __user *, tsp,
                void __user *, sig)
{
        struct sigset_argpack pack = { .p = NULL, .size = 0 };

        if (get_sigset_argpack(&pack, sig) != 0)
                return -EFAULT;

        return do_pselect(n, inp, outp, exp, tsp, pack.p, pack.size,
                          PT_TIMESPEC);
}

#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)

/*
 * pselect6() variant taking a struct old_timespec32 timeout.  @sig points
 * to a struct sigset_argpack in user space, or is NULL for "no signal
 * mask".
 */
SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, outp,
                fd_set __user *, exp, struct old_timespec32 __user *, tsp,
                void __user *, sig)
{
        struct sigset_argpack pack = { .p = NULL, .size = 0 };

        if (get_sigset_argpack(&pack, sig) != 0)
                return -EFAULT;

        return do_pselect(n, inp, outp, exp, tsp, pack.p, pack.size,
                          PT_OLD_TIMESPEC);
}

#endif

#ifdef __ARCH_WANT_SYS_OLD_SELECT
/* Argument block that old_select() reads from user space in one go. */
struct sel_arg_struct {
        unsigned long n;
        fd_set __user *inp;
        fd_set __user *outp;
        fd_set __user *exp;
        struct __kernel_old_timeval __user *tvp;
};

/*
 * Old-style select(): all five arguments are packed into one user space
 * struct sel_arg_struct.  Returns -EFAULT when that struct cannot be read,
 * otherwise the result of kern_select().
 */
SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
{
        struct sel_arg_struct args;

        if (copy_from_user(&args, arg, sizeof(args)) != 0)
                return -EFAULT;

        return kern_select(args.n, args.inp, args.outp, args.exp, args.tvp);
}
#endif

/*
 * One chunk of pollfd entries copied in from user space.  Chunks are
 * chained through ->next; do_poll() walks the chain until it hits NULL.
 */
struct poll_list {
        struct poll_list *next;                /* next chunk, or NULL */
        unsigned int len;                /* number of valid entries[] */
        struct pollfd entries[];
};

/* Number of pollfd entries that fit in one page after the list header. */
#define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))

/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.
 */
/*
 * Poll a single pollfd.
 *
 * A negative fd reports no events.  An fd without a file reports
 * EPOLLNVAL.  Otherwise the file is polled for the requested events plus
 * EPOLLERR and EPOLLHUP, and *@can_busy_poll is set when the file returned
 * @busy_flag.  The resulting mask is stored in pollfd->revents and also
 * returned.
 */
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
                                     bool *can_busy_poll,
                                     __poll_t busy_flag)
{
        __poll_t mask = 0;
        int fd = pollfd->fd;

        if (fd >= 0) {
                struct fd f = fdget(fd);

                if (!f.file) {
                        mask = EPOLLNVAL;
                } else {
                        /* userland u16 ->events contains POLL... bitmap */
                        __poll_t filter = demangle_poll(pollfd->events) |
                                          EPOLLERR | EPOLLHUP;

                        pwait->_key = filter | busy_flag;
                        mask = vfs_poll(f.file, pwait);
                        if (mask & busy_flag)
                                *can_busy_poll = true;
                        /* Mask out unneeded events. */
                        mask &= filter;
                        fdput(f);
                }
        }

        /* ... and so does ->revents */
        pollfd->revents = mangle_poll(mask);
        return mask;
}

/*
 * Core of poll(): scan every pollfd in the chunk list @list until at least
 * one reports an event, the timeout expires, an error was recorded in
 * @wait or a signal is pending.
 *
 * @end_time is the absolute expiry time; NULL means no expiry is passed to
 * the sleep, and an all-zero value means "do not wait at all".
 *
 * Returns the number of pollfds with events, 0 on timeout, wait->error if
 * one was recorded, or -ERESTARTNOHAND when nothing is ready and a signal
 * is pending.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
                   struct timespec64 *end_time)
{
        poll_table* pt = &wait->pt;
        ktime_t expire, *to = NULL;
        int timed_out = 0, count = 0;
        u64 slack = 0;
        __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
        unsigned long busy_start = 0;

        /* Optimise the no-wait case */
        if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                pt->_qproc = NULL;
                timed_out = 1;
        }

        if (end_time && !timed_out)
                slack = select_estimate_accuracy(end_time);

        for (;;) {
                struct poll_list *walk;
                bool can_busy_loop = false;

                for (walk = list; walk != NULL; walk = walk->next) {
                        struct pollfd * pfd, * pfd_end;

                        pfd = walk->entries;
                        pfd_end = pfd + walk->len;
                        for (; pfd != pfd_end; pfd++) {
                                /*
                                 * Fish for events. If we found one, record it
                                 * and kill poll_table->_qproc, so we don't
                                 * needlessly register any other waiters after
                                 * this. They'll get immediately deregistered
                                 * when we break out and return.
                                 */
                                if (do_pollfd(pfd, pt, &can_busy_loop,
                                              busy_flag)) {
                                        count++;
                                        pt->_qproc = NULL;
                                        /* found something, stop busy polling */
                                        busy_flag = 0;
                                        can_busy_loop = false;
                                }
                        }
                }
                /*
                 * All waiters have already been registered, so don't provide
                 * a poll_table->_qproc to them on the next loop iteration.
                 */
                pt->_qproc = NULL;
                /* nothing ready: report a recorded error or a pending signal */
                if (!count) {
                        count = wait->error;
                        if (signal_pending(current))
                                count = -ERESTARTNOHAND;
                }
                if (count || timed_out)
                        break;

                /* only if found POLL_BUSY_LOOP sockets && not out of time */
                if (can_busy_loop && !need_resched()) {
                        if (!busy_start) {
                                busy_start = busy_loop_current_time();
                                continue;
                        }
                        if (!busy_loop_timeout(busy_start))
                                continue;
                }
                busy_flag = 0;

                /*
                 * If this is the first loop and we have a timeout
                 * given, then we convert to ktime_t and set the to
                 * pointer to the expiry value.
                 */
                if (end_time && !to) {
                        expire = timespec64_to_ktime(*end_time);
                        to = &expire;
                }

                /* a return of 0 means the timeout expired: one last pass */
                if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
                        timed_out = 1;
        }
        return count;
}

/*
 * Number of struct pollfd entries that fit in the on-stack stack_pps buffer
 * once the struct poll_list header has been accounted for.  The expansion
 * names stack_pps directly, so it is only usable where a variable of that
 * name is in scope.
 */
#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
                        sizeof(struct pollfd))

/*
 * do_sys_poll - shared implementation behind the poll()/ppoll() entry points.
 * @ufds:     user-space array of @nfds struct pollfd entries
 * @nfds:     number of entries in @ufds
 * @end_time: handed to do_poll() unchanged; callers pass NULL when they have
 *            no timeout
 *
 * Copies the user array into a chain of struct poll_list chunks (the first
 * one in the on-stack stack_pps buffer, any further ones allocated with
 * kmalloc()), runs do_poll() over the chain and then stores every entry's
 * revents back into the user array.
 *
 * Returns the value produced by do_poll() once the revents have been written
 * back, -EINVAL if @nfds exceeds RLIMIT_NOFILE, -EFAULT if the user array
 * cannot be read or written, or -ENOMEM if a chunk cannot be allocated.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
                struct timespec64 *end_time)
{
        struct poll_wqueues table;
        int err = -EFAULT, fdcount;
        /* Allocate small arguments on the stack to save memory and be
           faster - use long to make sure the buffer is aligned properly
           on 64 bit archs to avoid unaligned access */
        long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
        struct poll_list *const head = (struct poll_list *)stack_pps;
         struct poll_list *walk = head;
        unsigned int todo = nfds;
        unsigned int len;

        /* Refuse more entries than the caller's RLIMIT_NOFILE allows. */
        if (nfds > rlimit(RLIMIT_NOFILE))
                return -EINVAL;

        /*
         * Copy the user array chunk by chunk.  The first chunk is the
         * on-stack head and holds at most N_STACK_PPS entries; every later
         * chunk is allocated separately and holds at most POLLFD_PER_PAGE
         * entries.  'todo' counts the entries not yet placed in a
         * completed chunk.
         */
        len = min_t(unsigned int, nfds, N_STACK_PPS);
        for (;;) {
                walk->next = NULL;
                walk->len = len;
                if (!len)
                        break;

                /* nfds - todo entries have already been copied. */
                if (copy_from_user(walk->entries, ufds + nfds-todo,
                                        sizeof(struct pollfd) * walk->len))
                        goto out_fds;

                if (walk->len >= todo)
                        break;
                todo -= walk->len;

                len = min(todo, POLLFD_PER_PAGE);
                walk = walk->next = kmalloc(struct_size(walk, entries, len),
                                            GFP_KERNEL);
                if (!walk) {
                        err = -ENOMEM;
                        goto out_fds;
                }
        }

        poll_initwait(&table);
        fdcount = do_poll(head, &table, end_time);
        poll_freewait(&table);

        /* err is still -EFAULT here if the write window cannot be opened. */
        if (!user_write_access_begin(ufds, nfds * sizeof(*ufds)))
                goto out_fds;

        /* Only the revents field of each entry is written back. */
        for (walk = head; walk; walk = walk->next) {
                struct pollfd *fds = walk->entries;
                unsigned int j;

                for (j = walk->len; j; fds++, ufds++, j--)
                        unsafe_put_user(fds->revents, &ufds->revents, Efault);
          }
        user_write_access_end();

        err = fdcount;
out_fds:
        /* Free every chunk except the on-stack head. */
        walk = head->next;
        while (walk) {
                struct poll_list *pos = walk;
                walk = walk->next;
                kfree(pos);
        }

        return err;

Efault:
        /*
         * Target of unsafe_put_user() above: close the write window, then
         * share the chunk cleanup.
         */
        user_write_access_end();
        err = -EFAULT;
        goto out_fds;
}

static long do_restart_poll(struct restart_block *restart_block)
{
        struct pollfd __user *ufds = restart_block->poll.ufds;
        int nfds = restart_block->poll.nfds;
        struct timespec64 *to = NULL, end_time;
        int ret;

        if (restart_block->poll.has_timeout) {
                end_time.tv_sec = restart_block->poll.tv_sec;
                end_time.tv_nsec = restart_block->poll.tv_nsec;
                to = &end_time;
        }

        ret = do_sys_poll(ufds, nfds, to);

        if (ret == -ERESTARTNOHAND)
                ret = set_restart_fn(restart_block, do_restart_poll);

        return ret;
}

/*
 * poll() entry point.  A non-negative @timeout_msecs is converted into an
 * end time with poll_select_set_timeout(); a negative value passes no end
 * time to do_sys_poll().  When do_sys_poll() reports -ERESTARTNOHAND the
 * arguments (and the end time, if any) are stored in the task's restart
 * block and do_restart_poll() is installed as the restart handler.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                int, timeout_msecs)
{
        struct restart_block *rb;
        struct timespec64 deadline;
        struct timespec64 *deadline_ptr = NULL;
        int rc;

        if (timeout_msecs >= 0) {
                deadline_ptr = &deadline;
                poll_select_set_timeout(deadline_ptr, timeout_msecs / MSEC_PER_SEC,
                        NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
        }

        rc = do_sys_poll(ufds, nfds, deadline_ptr);
        if (rc != -ERESTARTNOHAND)
                return rc;

        /* Interrupted: remember everything needed to run the call again. */
        rb = &current->restart_block;
        rb->poll.ufds = ufds;
        rb->poll.nfds = nfds;

        if (timeout_msecs < 0) {
                rb->poll.has_timeout = 0;
        } else {
                rb->poll.tv_sec = deadline.tv_sec;
                rb->poll.tv_nsec = deadline.tv_nsec;
                rb->poll.has_timeout = 1;
        }

        rc = set_restart_fn(rb, do_restart_poll);
        return rc;
}

/*
 * ppoll() entry point taking a struct __kernel_timespec timeout.  Reads the
 * optional timeout, installs the caller-supplied signal mask, polls, and
 * lets poll_select_finish() produce the final return value.
 */
SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
                struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask,
                size_t, sigsetsize)
{
        struct timespec64 user_ts, deadline;
        struct timespec64 *deadline_ptr = NULL;
        int rc;

        if (tsp) {
                if (get_timespec64(&user_ts, tsp))
                        return -EFAULT;
                if (poll_select_set_timeout(&deadline, user_ts.tv_sec, user_ts.tv_nsec))
                        return -EINVAL;
                deadline_ptr = &deadline;
        }

        rc = set_user_sigmask(sigmask, sigsetsize);
        if (rc)
                return rc;

        rc = do_sys_poll(ufds, nfds, deadline_ptr);
        return poll_select_finish(&deadline, tsp, PT_TIMESPEC, rc);
}

#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)

/*
 * ppoll() entry point taking a struct old_timespec32 timeout.  Same flow as
 * ppoll: read the optional timeout, install the signal mask, poll, then hand
 * the result to poll_select_finish() with PT_OLD_TIMESPEC.
 */
SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
                struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask,
                size_t, sigsetsize)
{
        struct timespec64 requested, expiry;
        struct timespec64 *expiry_ptr = NULL;
        int err;

        if (tsp) {
                if (get_old_timespec32(&requested, tsp))
                        return -EFAULT;
                if (poll_select_set_timeout(&expiry, requested.tv_sec, requested.tv_nsec))
                        return -EINVAL;
                expiry_ptr = &expiry;
        }

        err = set_user_sigmask(sigmask, sigsetsize);
        if (err)
                return err;

        err = do_sys_poll(ufds, nfds, expiry_ptr);
        return poll_select_finish(&expiry, tsp, PT_OLD_TIMESPEC, err);
}
#endif

#ifdef CONFIG_COMPAT
/* Number of bits in one compat_ulong_t. */
#define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))

/*
 * Fetch an fd bitmap of @nr bits that user space stores in compat_ulong_t
 * units and widen it into the native unsigned long bitmap @fdset.  A NULL
 * @ufdset simply produces an all-zero @fdset.
 *
 * Returns 0 when @ufdset is NULL, otherwise whatever compat_get_bitmap()
 * returns.
 */
static
int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
                        unsigned long *fdset)
{
        if (!ufdset) {
                zero_fd_set(nr, fdset);
                return 0;
        }

        return compat_get_bitmap(fdset, ufdset, nr);
}

/*
 * Store the native bitmap @fdset (@nr bits) into the compat-layout user
 * bitmap @ufdset.  Nothing is written, and 0 is returned, when @ufdset is
 * NULL; otherwise the result of compat_put_bitmap() is returned.
 */
static
int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
                      unsigned long *fdset)
{
        return ufdset ? compat_put_bitmap(ufdset, fdset, nr) : 0;
}


/*
 * This is a virtual copy of sys_select from fs/select.c and probably
 * should be compared to it from time to time
 */

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 */
/*
 * compat_core_sys_select - select() core for callers using compat bitmaps.
 * @n:        number of descriptors to examine; clamped to the max_fds of the
 *            current task's fdtable
 * @inp:      user "in" set in compat_ulong_t units, may be NULL
 * @outp:     user "out" set in compat_ulong_t units, may be NULL
 * @exp:      user "ex" set in compat_ulong_t units, may be NULL
 * @end_time: handed to do_select() unchanged
 *
 * Lays out six kernel bitmaps (three inputs, three results) in stack_fds or,
 * when they do not fit there, in a kmalloc_array() buffer; loads the input
 * sets, runs do_select() and writes the result sets back to user space.
 *
 * Returns the do_select() result, or -EINVAL for a negative @n, -ENOMEM if
 * the bitmap buffer cannot be allocated, the error from compat_get_fd_set(),
 * -ERESTARTNOHAND when do_select() returned 0 with a signal pending, or
 * -EFAULT when a result set cannot be written back.
 */
static int compat_core_sys_select(int n, compat_ulong_t __user *inp,
        compat_ulong_t __user *outp, compat_ulong_t __user *exp,
        struct timespec64 *end_time)
{
        fd_set_bits fds;
        void *bits;
        int size, max_fds, ret = -EINVAL;
        struct fdtable *fdt;
        long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

        if (n < 0)
                goto out_nofds;

        /* max_fds can increase, so grab it once to avoid race */
        rcu_read_lock();
        fdt = files_fdtable(current->files);
        max_fds = fdt->max_fds;
        rcu_read_unlock();
        if (n > max_fds)
                n = max_fds;

        /*
         * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
         * since we used fdset we need to allocate memory in units of
         * long-words.
         */
        size = FDS_BYTES(n);
        bits = stack_fds;
        /* Fall back to the heap when six bitmaps do not fit in stack_fds. */
        if (size > sizeof(stack_fds) / 6) {
                bits = kmalloc_array(6, size, GFP_KERNEL);
                ret = -ENOMEM;
                if (!bits)
                        goto out_nofds;
        }
        /* Carve the buffer into six consecutive bitmaps of 'size' bytes. */
        fds.in      = (unsigned long *)  bits;
        fds.out     = (unsigned long *) (bits +   size);
        fds.ex      = (unsigned long *) (bits + 2*size);
        fds.res_in  = (unsigned long *) (bits + 3*size);
        fds.res_out = (unsigned long *) (bits + 4*size);
        fds.res_ex  = (unsigned long *) (bits + 5*size);

        /* Stop at the first input set that fails to load. */
        if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
            (ret = compat_get_fd_set(n, outp, fds.out)) ||
            (ret = compat_get_fd_set(n, exp, fds.ex)))
                goto out;
        zero_fd_set(n, fds.res_in);
        zero_fd_set(n, fds.res_out);
        zero_fd_set(n, fds.res_ex);

        ret = do_select(n, &fds, end_time);

        if (ret < 0)
                goto out;
        if (!ret) {
                /*
                 * Nothing ready: report -ERESTARTNOHAND if a signal is
                 * pending, otherwise fall through with 0 and still write
                 * the (empty) result sets back.
                 */
                ret = -ERESTARTNOHAND;
                if (signal_pending(current))
                        goto out;
                ret = 0;
        }

        if (compat_set_fd_set(n, inp, fds.res_in) ||
            compat_set_fd_set(n, outp, fds.res_out) ||
            compat_set_fd_set(n, exp, fds.res_ex))
                ret = -EFAULT;
out:
        /* Only a heap buffer needs freeing. */
        if (bits != stack_fds)
                kfree(bits);
out_nofds:
        return ret;
}

/*
 * Common body of the compat select entry points.  Reads the optional
 * struct old_timeval32 timeout from @tvp, turns it into an end time, runs
 * compat_core_sys_select() and lets poll_select_finish() compute the final
 * return value.
 *
 * Returns -EFAULT if @tvp cannot be read, -EINVAL if
 * poll_select_set_timeout() rejects the timeout, otherwise the result of
 * poll_select_finish().
 */
static int do_compat_select(int n, compat_ulong_t __user *inp,
        compat_ulong_t __user *outp, compat_ulong_t __user *exp,
        struct old_timeval32 __user *tvp)
{
        struct timespec64 deadline;
        struct timespec64 *deadline_ptr = NULL;
        struct old_timeval32 user_tv;
        int rc;

        if (tvp) {
                if (copy_from_user(&user_tv, tvp, sizeof(user_tv)))
                        return -EFAULT;

                /* Whole seconds hidden in tv_usec are folded into tv_sec. */
                if (poll_select_set_timeout(&deadline,
                                user_tv.tv_sec + (user_tv.tv_usec / USEC_PER_SEC),
                                (user_tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
                        return -EINVAL;

                deadline_ptr = &deadline;
        }

        rc = compat_core_sys_select(n, inp, outp, exp, deadline_ptr);
        return poll_select_finish(&deadline, tvp, PT_OLD_TIMEVAL, rc);
}

/* Compat select() entry point: forwards all arguments to do_compat_select(). */
COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
        compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
        struct old_timeval32 __user *, tvp)
{
        return do_compat_select(n, inp, outp, exp, tvp);
}

/*
 * Argument block that the compat old_select syscall copies in from user
 * space.  The compat_uptr_t members are turned into pointers with
 * compat_ptr() before being passed on.
 */
struct compat_sel_arg_struct {
        compat_ulong_t n;       /* passed on as the descriptor count */
        compat_uptr_t inp;      /* user "in" bitmap */
        compat_uptr_t outp;     /* user "out" bitmap */
        compat_uptr_t exp;      /* user "ex" bitmap */
        compat_uptr_t tvp;      /* user timeout */
};

/*
 * Compat old_select() entry point: all arguments arrive packed in one
 * user-space struct compat_sel_arg_struct.  Returns -EFAULT if that struct
 * cannot be copied in, otherwise the result of do_compat_select().
 */
COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
{
        struct compat_sel_arg_struct packed;

        if (copy_from_user(&packed, arg, sizeof(packed)) != 0)
                return -EFAULT;

        return do_compat_select(packed.n,
                                compat_ptr(packed.inp),
                                compat_ptr(packed.outp),
                                compat_ptr(packed.exp),
                                compat_ptr(packed.tvp));
}

/*
 * Common body of the compat pselect6 entry points.
 *
 * @tsp is an optional user timeout whose layout is selected by @type
 * (PT_OLD_TIMESPEC or PT_TIMESPEC; any other value hits BUG()).  After the
 * timeout has been read, the compat signal mask is installed, the select is
 * run and poll_select_finish() produces the final return value.
 *
 * Returns -EFAULT if the timeout cannot be read, -EINVAL if
 * poll_select_set_timeout() rejects it, the error from
 * set_compat_user_sigmask(), or the result of poll_select_finish().
 */
static long do_compat_pselect(int n, compat_ulong_t __user *inp,
        compat_ulong_t __user *outp, compat_ulong_t __user *exp,
        void __user *tsp, compat_sigset_t __user *sigmask,
        compat_size_t sigsetsize, enum poll_time_type type)
{
        struct timespec64 user_ts, deadline;
        struct timespec64 *deadline_ptr = NULL;
        int rc;

        if (tsp) {
                if (type == PT_OLD_TIMESPEC) {
                        if (get_old_timespec32(&user_ts, tsp))
                                return -EFAULT;
                } else if (type == PT_TIMESPEC) {
                        if (get_timespec64(&user_ts, tsp))
                                return -EFAULT;
                } else {
                        BUG();
                }

                if (poll_select_set_timeout(&deadline, user_ts.tv_sec, user_ts.tv_nsec))
                        return -EINVAL;
                deadline_ptr = &deadline;
        }

        rc = set_compat_user_sigmask(sigmask, sigsetsize);
        if (rc)
                return rc;

        rc = compat_core_sys_select(n, inp, outp, exp, deadline_ptr);
        return poll_select_finish(&deadline, tsp, type, rc);
}

/*
 * Pair read from user space by get_compat_sigset_argpack(): a compat pointer
 * and a size.  The compat pselect6 entry points pass them on as the signal
 * mask pointer and its size.
 */
struct compat_sigset_argpack {
        compat_uptr_t p;
        compat_size_t size;
};
/*
 * get_compat_sigset_argpack - fetch a compat sigset argument pack from user space
 * @to:   kernel copy to fill in
 * @from: user pointer to the pack; may be NULL
 *
 * When @from is NULL, @to is left untouched and 0 is returned, so callers
 * must initialise @to themselves.  Otherwise both members are read inside a
 * single user_read_access_begin()/user_read_access_end() window.
 *
 * Returns 0 on success, -EFAULT if the user memory cannot be accessed.
 */
static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to,
                                            struct compat_sigset_argpack __user *from)
{
        if (from) {
                if (!user_read_access_begin(from, sizeof(*from)))
                        return -EFAULT;
                unsafe_get_user(to->p, &from->p, Efault);
                unsafe_get_user(to->size, &from->size, Efault);
                user_read_access_end();
        }
        return 0;
Efault:
        /*
         * The window was opened with user_read_access_begin(), so the fault
         * path must close it with the matching read variant.  The generic
         * user_access_end() is not its counterpart on architectures that
         * manage read and write access windows separately and would leave
         * the begin/end calls unbalanced there.
         */
        user_read_access_end();
        return -EFAULT;
}

/*
 * Compat pselect6 entry point taking a struct __kernel_timespec timeout.
 * @sig points at an optional compat_sigset_argpack; when it is NULL the
 * zero-initialised pack is used.  Returns -EFAULT if the pack cannot be
 * read, otherwise the result of do_compat_pselect().
 */
COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp,
        compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
        struct __kernel_timespec __user *, tsp, void __user *, sig)
{
        struct compat_sigset_argpack pack = { .p = 0, .size = 0 };

        if (get_compat_sigset_argpack(&pack, sig) != 0)
                return -EFAULT;

        return do_compat_pselect(n, inp, outp, exp, tsp,
                                 compat_ptr(pack.p), pack.size,
                                 PT_TIMESPEC);
}

#if defined(CONFIG_COMPAT_32BIT_TIME)

/*
 * Compat pselect6 entry point taking a struct old_timespec32 timeout.
 * @sig points at an optional compat_sigset_argpack; when it is NULL the
 * zero-initialised pack is used.  Returns -EFAULT if the pack cannot be
 * read, otherwise the result of do_compat_pselect().
 */
COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp,
        compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
        struct old_timespec32 __user *, tsp, void __user *, sig)
{
        struct compat_sigset_argpack pack = { .p = 0, .size = 0 };

        if (get_compat_sigset_argpack(&pack, sig) != 0)
                return -EFAULT;

        return do_compat_pselect(n, inp, outp, exp, tsp,
                                 compat_ptr(pack.p), pack.size,
                                 PT_OLD_TIMESPEC);
}

#endif

#if defined(CONFIG_COMPAT_32BIT_TIME)
/*
 * Compat ppoll entry point taking a struct old_timespec32 timeout and a
 * compat signal mask.  Reads the optional timeout, installs the mask, polls
 * and lets poll_select_finish() produce the final return value.
 */
COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
        unsigned int,  nfds, struct old_timespec32 __user *, tsp,
        const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
        struct timespec64 requested, expiry;
        struct timespec64 *expiry_ptr = NULL;
        int err;

        if (tsp) {
                if (get_old_timespec32(&requested, tsp))
                        return -EFAULT;
                if (poll_select_set_timeout(&expiry, requested.tv_sec, requested.tv_nsec))
                        return -EINVAL;
                expiry_ptr = &expiry;
        }

        err = set_compat_user_sigmask(sigmask, sigsetsize);
        if (err)
                return err;

        err = do_sys_poll(ufds, nfds, expiry_ptr);
        return poll_select_finish(&expiry, tsp, PT_OLD_TIMESPEC, err);
}
#endif

/*
 * Compat ppoll entry point for 64-bit time_t: the timeout is a
 * struct __kernel_timespec, the signal mask a compat_sigset_t.  Reads the
 * optional timeout, installs the mask, polls and lets poll_select_finish()
 * produce the final return value.
 */
COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
        unsigned int,  nfds, struct __kernel_timespec __user *, tsp,
        const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
        struct timespec64 requested, expiry;
        struct timespec64 *expiry_ptr = NULL;
        int err;

        if (tsp) {
                if (get_timespec64(&requested, tsp))
                        return -EFAULT;
                if (poll_select_set_timeout(&expiry, requested.tv_sec, requested.tv_nsec))
                        return -EINVAL;
                expiry_ptr = &expiry;
        }

        err = set_compat_user_sigmask(sigmask, sigsetsize);
        if (err)
                return err;

        err = do_sys_poll(ufds, nfds, expiry_ptr);
        return poll_select_finish(&expiry, tsp, PT_TIMESPEC, err);
}

#endif


































































































































































































































































































































    7 







































    2 





    2 





    2 

















    1 









































    2 


































    3 




























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* audit.h -- Auditing support
 *
 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
 * All Rights Reserved.
 *
 * Written by Rickard E. (Rik) Faith <faith@redhat.com>
 */
#ifndef _LINUX_AUDIT_H_
#define _LINUX_AUDIT_H_

#include <linux/sched.h>
#include <linux/ptrace.h>
#include <linux/audit_arch.h>
#include <uapi/linux/audit.h>
#include <uapi/linux/netfilter/nf_tables.h>
#include <uapi/linux/fanotify.h>

#define AUDIT_INO_UNSET ((unsigned long)-1)
#define AUDIT_DEV_UNSET ((dev_t)-1)

/*
 * A uid/pid pair followed by a variable-length ctx[] byte array (flexible
 * array member, so the struct must be allocated with room for the payload).
 * NOTE(review): presumably identifies the sender of a signal and carries its
 * security context string in ctx - confirm against the users in audit.c.
 */
struct audit_sig_info {
        uid_t                uid;
        pid_t                pid;
        char                ctx[];
};

struct audit_buffer;
struct audit_context;
struct inode;
struct netlink_skb_parms;
struct path;
struct linux_binprm;
struct mq_attr;
struct mqstat;
struct audit_watch;
struct audit_tree;
struct sk_buff;
struct kern_ipc_perm;

/*
 * Kernel-side audit rule.  Holds the rule's flags/action words, a syscall
 * bitmask of AUDIT_BITMASK_SIZE words, a pointer to its audit_field entries
 * together with field_count, shortcut pointers to selected fields, and the
 * list linkage described on the individual members.
 * NOTE(review): the members are filled in and interpreted outside this
 * header; the per-member notes are the original author's.
 */
struct audit_krule {
        u32                        pflags;
        u32                        flags;
        u32                        listnr;
        u32                        action;
        u32                        mask[AUDIT_BITMASK_SIZE];
        u32                        buflen; /* for data alloc on list rules */
        u32                        field_count;
        char                        *filterkey; /* ties events to rules */
        struct audit_field        *fields;
        struct audit_field        *arch_f; /* quick access to arch field */
        struct audit_field        *inode_f; /* quick access to an inode field */
        struct audit_watch        *watch;        /* associated watch */
        struct audit_tree        *tree;        /* associated watched tree */
        struct audit_fsnotify_mark        *exe;
        struct list_head        rlist;        /* entry in audit_{watch,tree}.rules list */
        struct list_head        list;        /* for AUDIT_LIST* purposes only */
        u64                        prio;
};

/* Flag to indicate legacy AUDIT_LOGINUID unset usage */
#define AUDIT_LOGINUID_LEGACY                0x1

/*
 * One field of an audit rule: a field type, an operator (op) and a value.
 * The value lives in an anonymous union and is one of a plain u32, a kuid_t,
 * a kgid_t, or an LSM string together with an opaque LSM rule pointer.
 * NOTE(review): which union member is valid presumably follows from 'type' -
 * confirm against the rule parsing code.
 */
struct audit_field {
        u32                                type;
        union {
                u32                        val;
                kuid_t                        uid;
                kgid_t                        gid;
                struct {
                        char                *lsm_str;
                        void                *lsm_rule;
                };
        };
        u32                                op;
};

/*
 * Index values for the vals[] array of struct audit_ntp_data.
 * AUDIT_NTP_NVALS is not a value of its own but the number of entries, and
 * must stay last.
 */
enum audit_ntp_type {
        AUDIT_NTP_OFFSET,
        AUDIT_NTP_FREQ,
        AUDIT_NTP_STATUS,
        AUDIT_NTP_TAI,
        AUDIT_NTP_TICK,
        AUDIT_NTP_ADJUST,

        AUDIT_NTP_NVALS /* count */
};

/*
 * With CONFIG_AUDITSYSCALL, struct audit_ntp_data holds one old/new value
 * pair per enum audit_ntp_type entry; without it the struct is empty so that
 * code declaring one still compiles.
 */
#ifdef CONFIG_AUDITSYSCALL
/* Old and new value of a single NTP parameter. */
struct audit_ntp_val {
        long long oldval, newval;
};

struct audit_ntp_data {
        struct audit_ntp_val vals[AUDIT_NTP_NVALS];
};
#else
struct audit_ntp_data {};
#endif

/*
 * Operation codes: three AUDIT_XT_OP_* values followed by the
 * AUDIT_NFT_OP_* values, terminated by AUDIT_NFT_OP_INVALID.
 * NOTE(review): presumably the operation reported in netfilter configuration
 * audit records (xtables and nf_tables respectively) - confirm against the
 * callers.  The numeric values follow declaration order, so new entries
 * should not be inserted without checking any table indexed by this enum.
 */
enum audit_nfcfgop {
        AUDIT_XT_OP_REGISTER,
        AUDIT_XT_OP_REPLACE,
        AUDIT_XT_OP_UNREGISTER,
        AUDIT_NFT_OP_TABLE_REGISTER,
        AUDIT_NFT_OP_TABLE_UNREGISTER,
        AUDIT_NFT_OP_CHAIN_REGISTER,
        AUDIT_NFT_OP_CHAIN_UNREGISTER,
        AUDIT_NFT_OP_RULE_REGISTER,
        AUDIT_NFT_OP_RULE_UNREGISTER,
        AUDIT_NFT_OP_SET_REGISTER,
        AUDIT_NFT_OP_SET_UNREGISTER,
        AUDIT_NFT_OP_SETELEM_REGISTER,
        AUDIT_NFT_OP_SETELEM_UNREGISTER,
        AUDIT_NFT_OP_GEN_REGISTER,
        AUDIT_NFT_OP_OBJ_REGISTER,
        AUDIT_NFT_OP_OBJ_UNREGISTER,
        AUDIT_NFT_OP_OBJ_RESET,
        AUDIT_NFT_OP_FLOWTABLE_REGISTER,
        AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
        AUDIT_NFT_OP_SETELEM_RESET,
        AUDIT_NFT_OP_RULE_RESET,
        AUDIT_NFT_OP_INVALID,
};

extern int __init audit_register_class(int class, unsigned *list);
extern int audit_classify_syscall(int abi, unsigned syscall);
extern int audit_classify_arch(int arch);
/* only for compat system calls */
extern unsigned compat_write_class[];
extern unsigned compat_read_class[];
extern unsigned compat_dir_class[];
extern unsigned compat_chattr_class[];
extern unsigned compat_signal_class[];

/* audit_names->type values */
#define        AUDIT_TYPE_UNKNOWN        0        /* we don't know yet */
#define        AUDIT_TYPE_NORMAL        1        /* a "normal" audit record */
#define        AUDIT_TYPE_PARENT        2        /* a parent audit record */
#define        AUDIT_TYPE_CHILD_DELETE 3        /* a child being deleted */
#define        AUDIT_TYPE_CHILD_CREATE 4        /* a child being created */

/* maximum number of arguments that audit_socketcall can process */
#define AUDITSC_ARGS                6

/* bit values for ->signal->audit_tty */
#define AUDIT_TTY_ENABLE        BIT(0)
#define AUDIT_TTY_LOG_PASSWD        BIT(1)

struct filename;

#define AUDIT_OFF        0
#define AUDIT_ON        1
#define AUDIT_LOCKED        2
#ifdef CONFIG_AUDIT
/* These are defined in audit.c */
                                /* Public API */
extern __printf(4, 5)
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
               const char *fmt, ...);

extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type);
extern __printf(2, 3)
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...);
extern void                    audit_log_end(struct audit_buffer *ab);
extern bool                    audit_string_contains_control(const char *string,
                                                          size_t len);
extern void                    audit_log_n_hex(struct audit_buffer *ab,
                                          const unsigned char *buf,
                                          size_t len);
extern void                    audit_log_n_string(struct audit_buffer *ab,
                                               const char *buf,
                                               size_t n);
extern void                    audit_log_n_untrustedstring(struct audit_buffer *ab,
                                                        const char *string,
                                                        size_t n);
extern void                    audit_log_untrustedstring(struct audit_buffer *ab,
                                                      const char *string);
extern void                    audit_log_d_path(struct audit_buffer *ab,
                                             const char *prefix,
                                             const struct path *path);
extern void                    audit_log_key(struct audit_buffer *ab,
                                          char *key);
extern void                    audit_log_path_denied(int type,
                                                  const char *operation);
extern void                    audit_log_lost(const char *message);

extern int audit_log_task_context(struct audit_buffer *ab);
extern void audit_log_task_info(struct audit_buffer *ab);

extern int                    audit_update_lsm_rules(void);

                                /* Private API (for audit.c only) */
extern int audit_rule_change(int type, int seq, void *data, size_t datasz);
extern int audit_list_rules_send(struct sk_buff *request_skb, int seq);

extern int audit_set_loginuid(kuid_t loginuid);

/* Returns the loginuid stored in @tsk. */
static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
{
        return tsk->loginuid;
}

/* Returns the sessionid stored in @tsk. */
static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
{
        return tsk->sessionid;
}

extern u32 audit_enabled;

extern int audit_signal_info(int sig, struct task_struct *t);

#else /* CONFIG_AUDIT */
/*
 * CONFIG_AUDIT disabled: the entry points below are inline stubs so callers
 * need no #ifdefs.  The logging helpers have empty bodies,
 * audit_log_start() always returns NULL, the int-returning helpers return 0,
 * the id getters return their "unset" constants, and audit_enabled is the
 * constant AUDIT_OFF.
 */
static inline __printf(4, 5)
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
               const char *fmt, ...)
{ }
/* No buffer is ever handed out when auditing is compiled out. */
static inline struct audit_buffer *audit_log_start(struct audit_context *ctx,
                                                   gfp_t gfp_mask, int type)
{
        return NULL;
}
static inline __printf(2, 3)
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
{ }
static inline void audit_log_end(struct audit_buffer *ab)
{ }
static inline void audit_log_n_hex(struct audit_buffer *ab,
                                   const unsigned char *buf, size_t len)
{ }
static inline void audit_log_n_string(struct audit_buffer *ab,
                                      const char *buf, size_t n)
{ }
static inline void  audit_log_n_untrustedstring(struct audit_buffer *ab,
                                                const char *string, size_t n)
{ }
static inline void audit_log_untrustedstring(struct audit_buffer *ab,
                                             const char *string)
{ }
static inline void audit_log_d_path(struct audit_buffer *ab,
                                    const char *prefix,
                                    const struct path *path)
{ }
static inline void audit_log_key(struct audit_buffer *ab, char *key)
{ }
static inline void audit_log_path_denied(int type, const char *operation)
{ }
static inline int audit_log_task_context(struct audit_buffer *ab)
{
        return 0;
}
static inline void audit_log_task_info(struct audit_buffer *ab)
{ }

/* Stub: always reports INVALID_UID, @tsk is not examined. */
static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
{
        return INVALID_UID;
}

/* Stub: always reports AUDIT_SID_UNSET, @tsk is not examined. */
static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
{
        return AUDIT_SID_UNSET;
}

/* A constant here, so tests of audit_enabled can be folded at compile time. */
#define audit_enabled AUDIT_OFF

/* Stub: always returns 0. */
static inline int audit_signal_info(int sig, struct task_struct *t)
{
        return 0;
}

#endif /* CONFIG_AUDIT */

/*
 * audit_is_compat(arch): with CONFIG_AUDIT_COMPAT_GENERIC, true when @arch
 * does not have the __AUDIT_ARCH_64BIT bit set; without it, always false.
 */
#ifdef CONFIG_AUDIT_COMPAT_GENERIC
#define audit_is_compat(arch)  (!((arch) & __AUDIT_ARCH_64BIT))
#else
#define audit_is_compat(arch)  false
#endif

#define AUDIT_INODE_PARENT        1        /* dentry represents the parent */
#define AUDIT_INODE_HIDDEN        2        /* audit record should be hidden */
#define AUDIT_INODE_NOEVAL        4        /* audit record incomplete */

#ifdef CONFIG_AUDITSYSCALL
#include <asm/syscall.h> /* for syscall_get_arch() */

/* These are defined in auditsc.c */
                                /* Public API */
extern int  audit_alloc(struct task_struct *task);
extern void __audit_free(struct task_struct *task);
extern void __audit_uring_entry(u8 op);
extern void __audit_uring_exit(int success, long code);
extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1,
                                  unsigned long a2, unsigned long a3);
extern void __audit_syscall_exit(int ret_success, long ret_value);
extern struct filename *__audit_reusename(const __user char *uptr);
extern void __audit_getname(struct filename *name);
extern void __audit_inode(struct filename *name, const struct dentry *dentry,
                                unsigned int flags);
extern void __audit_file(const struct file *);
extern void __audit_inode_child(struct inode *parent,
                                const struct dentry *dentry,
                                const unsigned char type);
extern void audit_seccomp(unsigned long syscall, long signr, int code);
extern void audit_seccomp_actions_logged(const char *names,
                                         const char *old_names, int res);
extern void __audit_ptrace(struct task_struct *t);

/* Stores @ctx as the audit context of @task; any previous value is overwritten. */
static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx)
{
        task->audit_context = ctx;
}

/* Returns the audit context pointer of the current task (may be NULL). */
static inline struct audit_context *audit_context(void)
{
        return current->audit_context;
}

/*
 * Returns true when the current task has no audit context at all, or when
 * the int stored at the very start of that context is non-zero.
 * NOTE(review): that leading int is presumably the context's "dummy" flag -
 * confirm against the definition of struct audit_context.
 */
static inline bool audit_dummy_context(void)
{
        void *ctx = audit_context();

        if (!ctx)
                return true;

        return *(int *)ctx != 0;
}
/* Calls __audit_free() for @task, but only if it has an audit context. */
static inline void audit_free(struct task_struct *task)
{
        if (unlikely(task->audit_context))
                __audit_free(task);
}
/*
 * Forwards @op to __audit_uring_entry() when the current task has an audit
 * context and audit_enabled is non-zero.
 */
static inline void audit_uring_entry(u8 op)
{
        /*
         * We intentionally check audit_context() before audit_enabled as most
         * Linux systems (as of ~2021) rely on systemd which forces audit to
         * be enabled regardless of the user's audit configuration.
         */
        if (unlikely(audit_context() && audit_enabled))
                __audit_uring_entry(op);
}
/*
 * Forwards @success and @code to __audit_uring_exit() when the current task
 * has an audit context.  Unlike audit_uring_entry(), audit_enabled is not
 * consulted here.
 */
static inline void audit_uring_exit(int success, long code)
{
        if (unlikely(audit_context()))
                __audit_uring_exit(success, code);
}
/*
 * Forwards @major and the four argument words @a0..@a3 to
 * __audit_syscall_entry() when the current task has an audit context.
 */
static inline void audit_syscall_entry(int major, unsigned long a0,
                                       unsigned long a1, unsigned long a2,
                                       unsigned long a3)
{
        if (unlikely(audit_context()))
                __audit_syscall_entry(major, a0, a1, a2, a3);
}
/*
 * When the current task has an audit context, derives the success flag and
 * the return value from @pt_regs (via is_syscall_success() and
 * regs_return_value()) and reports both to __audit_syscall_exit().
 */
static inline void audit_syscall_exit(void *pt_regs)
{
        if (unlikely(audit_context()))
                __audit_syscall_exit(is_syscall_success(pt_regs),
                                     regs_return_value(pt_regs));
}
/*
 * Returns the result of __audit_reusename(@name) when audit_dummy_context()
 * is false, and NULL otherwise.
 */
static inline struct filename *audit_reusename(const __user char *name)
{
        return unlikely(!audit_dummy_context()) ? __audit_reusename(name) : NULL;
}
/* Forwards @name to __audit_getname() unless audit_dummy_context() is true. */
static inline void audit_getname(struct filename *name)
{
        if (unlikely(!audit_dummy_context()))
                __audit_getname(name);
}
/*
 * Forward @name, @dentry and @aflags to __audit_inode() unless the current
 * context is a dummy.
 */
static inline void audit_inode(struct filename *name,
                                const struct dentry *dentry,
                                unsigned int aflags)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_inode(name, dentry, aflags);
}
/* Pass @file to __audit_file() unless the current context is a dummy. */
static inline void audit_file(struct file *file)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_file(file);
}
/*
 * Same as audit_inode(), but always with the flag combination
 * AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN.
 */
static inline void audit_inode_parent_hidden(struct filename *name,
                                                const struct dentry *dentry)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_inode(name, dentry,
                              AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN);
}
/*
 * Forward @parent, @dentry and @type to __audit_inode_child() unless the
 * current context is a dummy.
 */
static inline void audit_inode_child(struct inode *parent,
                                     const struct dentry *dentry,
                                     const unsigned char type)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_inode_child(parent, dentry, type);
}
void audit_core_dumps(long signr);

/* Pass task @t to __audit_ptrace() unless the current context is a dummy. */
static inline void audit_ptrace(struct task_struct *t)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_ptrace(t);
}

                                /* Private API (for audit.c only) */
extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
extern void __audit_bprm(struct linux_binprm *bprm);
extern int __audit_socketcall(int nargs, unsigned long *args);
extern int __audit_sockaddr(int len, void *addr);
extern void __audit_fd_pair(int fd1, int fd2);
extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr);
extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout);
extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification);
extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                  const struct cred *new,
                                  const struct cred *old);
extern void __audit_log_capset(const struct cred *new, const struct cred *old);
extern void __audit_mmap_fd(int fd, int flags);
extern void __audit_openat2_how(struct open_how *how);
extern void __audit_log_kern_module(char *name);
extern void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar);
extern void __audit_tk_injoffset(struct timespec64 offset);
extern void __audit_ntp_log(const struct audit_ntp_data *ad);
extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
                              enum audit_nfcfgop op, gfp_t gfp);

/* Pass @ipcp to __audit_ipc_obj() unless the current context is a dummy. */
static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_ipc_obj(ipcp);
}
/*
 * Pass the descriptor pair (@fd1, @fd2) to __audit_fd_pair() unless the
 * current context is a dummy.
 */
static inline void audit_fd_pair(int fd1, int fd2)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_fd_pair(fd1, fd2);
}
/*
 * Forward @qbytes, @uid, @gid and @mode to __audit_ipc_set_perm() unless the
 * current context is a dummy.
 */
static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_ipc_set_perm(qbytes, uid, gid, mode);
}
/* Pass @bprm to __audit_bprm() unless the current context is a dummy. */
static inline void audit_bprm(struct linux_binprm *bprm)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_bprm(bprm);
}
/*
 * Return the result of __audit_socketcall() for the @nargs arguments in
 * @args when the current context is not a dummy; otherwise return 0.
 */
static inline int audit_socketcall(int nargs, unsigned long *args)
{
        const bool recording = !audit_dummy_context();

        return unlikely(recording) ? __audit_socketcall(nargs, args) : 0;
}

/*
 * Compat variant of audit_socketcall(): widen the 32-bit arguments in @args
 * to unsigned long and hand them to __audit_socketcall().
 *
 * Returns 0 without touching @args when the current context is a dummy,
 * otherwise the return value of __audit_socketcall().
 */
static inline int audit_socketcall_compat(int nargs, u32 *args)
{
        unsigned long a[AUDITSC_ARGS];
        int i;

        if (audit_dummy_context())
                return 0;

        /*
         * a[] has only AUDITSC_ARGS slots, so never copy more than that even
         * if a caller passes a larger @nargs.  @nargs itself is forwarded
         * unchanged so the callee can still apply its own validation (not
         * visible from this header).
         */
        for (i = 0; i < nargs && i < AUDITSC_ARGS; i++)
                a[i] = (unsigned long)args[i];
        return __audit_socketcall(nargs, a);
}

/*
 * Return the result of __audit_sockaddr() for the @len bytes at @addr when
 * the current context is not a dummy; otherwise return 0.
 */
static inline int audit_sockaddr(int len, void *addr)
{
        const bool recording = !audit_dummy_context();

        return unlikely(recording) ? __audit_sockaddr(len, addr) : 0;
}
/*
 * Forward @oflag, @mode and @attr to __audit_mq_open() unless the current
 * context is a dummy.
 */
static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_mq_open(oflag, mode, attr);
}
/*
 * Forward the message-queue send/receive parameters to
 * __audit_mq_sendrecv() unless the current context is a dummy.
 */
static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_mq_sendrecv(mqdes, msg_len, msg_prio, abs_timeout);
}
/*
 * Forward @mqdes and @notification to __audit_mq_notify() unless the
 * current context is a dummy.
 */
static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_mq_notify(mqdes, notification);
}
/*
 * Forward @mqdes and @mqstat to __audit_mq_getsetattr() unless the current
 * context is a dummy.
 */
static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_mq_getsetattr(mqdes, mqstat);
}

/*
 * Return the result of __audit_log_bprm_fcaps() for @bprm and the @new/@old
 * credentials when the current context is not a dummy; otherwise return 0.
 */
static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                       const struct cred *new,
                                       const struct cred *old)
{
        const bool recording = !audit_dummy_context();

        return unlikely(recording) ? __audit_log_bprm_fcaps(bprm, new, old) : 0;
}

/*
 * Pass the @new and @old credentials to __audit_log_capset() unless the
 * current context is a dummy.
 */
static inline void audit_log_capset(const struct cred *new,
                                    const struct cred *old)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_log_capset(new, old);
}

/* Pass @fd and @flags to __audit_mmap_fd() unless the context is a dummy. */
static inline void audit_mmap_fd(int fd, int flags)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_mmap_fd(fd, flags);
}

/* Pass @how to __audit_openat2_how() unless the current context is a dummy. */
static inline void audit_openat2_how(struct open_how *how)
{
        const bool recording = !audit_dummy_context();

        if (unlikely(recording))
                __audit_openat2_how(how);
}

/* Pass @name to __audit_log_kern_module() unless the context is a dummy. */
static inline void audit_log_kern_module(char *name)
{
        if (audit_dummy_context())
                return;

        __audit_log_kern_module(name);
}

/*
 * Forward @response and @friar to __audit_fanotify() unless the current
 * context is a dummy.
 */
static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
{
        if (audit_dummy_context())
                return;

        __audit_fanotify(response, friar);
}

/*
 * Pass a time offset to __audit_tk_injoffset().  Nothing is forwarded when
 * @offset is exactly zero (both tv_sec and tv_nsec are 0) or when the
 * current context is a dummy.
 */
static inline void audit_tk_injoffset(struct timespec64 offset)
{
        const bool noop = !offset.tv_sec && !offset.tv_nsec;

        /* the zero-offset test comes first, so no-op events skip the context check */
        if (noop || audit_dummy_context())
                return;

        __audit_tk_injoffset(offset);
}

/* Reset @ad by zeroing the entire structure. */
static inline void audit_ntp_init(struct audit_ntp_data *ad)
{
        memset(ad, 0, sizeof(*ad));
}

/* Store @val as the old value of the @type entry in ad->vals[]. */
static inline void audit_ntp_set_old(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{
        ad->vals[type].oldval = val;
}

/* Store @val as the new value of the @type entry in ad->vals[]. */
static inline void audit_ntp_set_new(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{
        ad->vals[type].newval = val;
}

/* Pass @ad to __audit_ntp_log() unless the current context is a dummy. */
static inline void audit_ntp_log(const struct audit_ntp_data *ad)
{
        if (audit_dummy_context())
                return;

        __audit_ntp_log(ad);
}

/*
 * Forward a netfilter configuration event to __audit_log_nfcfg().  Unlike
 * most wrappers in this header, the gate is the global audit_enabled flag
 * rather than the current task's audit context.
 */
static inline void audit_log_nfcfg(const char *name, u8 af,
                                   unsigned int nentries,
                                   enum audit_nfcfgop op, gfp_t gfp)
{
        if (!audit_enabled)
                return;

        __audit_log_nfcfg(name, af, nentries, op, gfp);
}

extern int audit_n_rules;
extern int audit_signals;
#else /* CONFIG_AUDITSYSCALL */
/* Stub used when CONFIG_AUDITSYSCALL is not set: nothing to allocate, always returns 0. */
static inline int audit_alloc(struct task_struct *task)
{
        return 0;
}
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_free(struct task_struct *task)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_uring_entry(u8 op)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_uring_exit(int success, long code)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_syscall_entry(int major, unsigned long a0,
                                       unsigned long a1, unsigned long a2,
                                       unsigned long a3)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_syscall_exit(void *pt_regs)
{ }
/* Stub used when CONFIG_AUDITSYSCALL is not set: every context counts as a dummy. */
static inline bool audit_dummy_context(void)
{
        return true;
}
/* No-op stub used when CONFIG_AUDITSYSCALL is not set; @ctx is not stored. */
static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx)
{ }
/* Stub used when CONFIG_AUDITSYSCALL is not set: there is never a context, returns NULL. */
static inline struct audit_context *audit_context(void)
{
        return NULL;
}
/* Stub used when CONFIG_AUDITSYSCALL is not set: always returns NULL. */
static inline struct filename *audit_reusename(const __user char *name)
{
        return NULL;
}
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_getname(struct filename *name)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_inode(struct filename *name,
                                const struct dentry *dentry,
                                unsigned int aflags)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_file(struct file *file)
{
}
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_inode_parent_hidden(struct filename *name,
                                const struct dentry *dentry)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_inode_child(struct inode *parent,
                                     const struct dentry *dentry,
                                     const unsigned char type)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_core_dumps(long signr)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_seccomp(unsigned long syscall, long signr, int code)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_seccomp_actions_logged(const char *names,
                                                const char *old_names, int res)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
                                        gid_t gid, umode_t mode)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_bprm(struct linux_binprm *bprm)
{ }
/* Stub used when CONFIG_AUDITSYSCALL is not set: always returns 0. */
static inline int audit_socketcall(int nargs, unsigned long *args)
{
        return 0;
}

/* Stub used when CONFIG_AUDITSYSCALL is not set: always returns 0. */
static inline int audit_socketcall_compat(int nargs, u32 *args)
{
        return 0;
}

/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_fd_pair(int fd1, int fd2)
{ }
/* Stub used when CONFIG_AUDITSYSCALL is not set: always returns 0. */
static inline int audit_sockaddr(int len, void *addr)
{
        return 0;
}
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len,
                                     unsigned int msg_prio,
                                     const struct timespec64 *abs_timeout)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_mq_notify(mqd_t mqdes,
                                   const struct sigevent *notification)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{ }
/* Stub used when CONFIG_AUDITSYSCALL is not set: always returns 0. */
static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                       const struct cred *new,
                                       const struct cred *old)
{
        return 0;
}
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_log_capset(const struct cred *new,
                                    const struct cred *old)
{ }
/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_mmap_fd(int fd, int flags)
{ }

/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_openat2_how(struct open_how *how)
{ }

/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_log_kern_module(char *name)
{
}

/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
{ }

/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_tk_injoffset(struct timespec64 offset)
{ }

/* No-op stub used when CONFIG_AUDITSYSCALL is not set; @ad is left untouched. */
static inline void audit_ntp_init(struct audit_ntp_data *ad)
{ }

/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_ntp_set_old(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{ }

/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_ntp_set_new(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{ }

/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_ntp_log(const struct audit_ntp_data *ad)
{ }

/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_ptrace(struct task_struct *t)
{ }

/* No-op stub used when CONFIG_AUDITSYSCALL is not set. */
static inline void audit_log_nfcfg(const char *name, u8 af,
                                   unsigned int nentries,
                                   enum audit_nfcfgop op, gfp_t gfp)
{ }

#define audit_n_rules 0
#define audit_signals 0
#endif /* CONFIG_AUDITSYSCALL */

/*
 * Return true when the login uid obtained from audit_get_loginuid(@tsk)
 * passes uid_valid().  Defined outside the CONFIG_AUDITSYSCALL conditional,
 * so it is available in both configurations.
 */
static inline bool audit_loginuid_set(struct task_struct *tsk)
{
        return uid_valid(audit_get_loginuid(tsk));
}

#endif









































































    2 


























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_PGALLOC_H
#define __ASM_GENERIC_PGALLOC_H

#ifdef CONFIG_MMU

/*
 * GFP masks for page-table allocations in this header: kernel tables use
 * GFP_KERNEL plus __GFP_ZERO (zeroed pages); user tables use the same mask
 * with __GFP_ACCOUNT added.
 */
#define GFP_PGTABLE_KERNEL        (GFP_KERNEL | __GFP_ZERO)
#define GFP_PGTABLE_USER        (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)

/**
 * __pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
 * @mm: the mm_struct of the current context (not used by this implementation)
 *
 * Building block for architectures whose pte_alloc_one_kernel() needs
 * anything beyond simple page allocation.  The page table is requested with
 * %GFP_PGTABLE_KERNEL with %__GFP_HIGHMEM masked out, at order 0.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm)
{
        struct ptdesc *pt;

        pt = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, 0);

        return pt ? ptdesc_address(pt) : NULL;
}
#define __pte_alloc_one_kernel(...)        alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__))

#ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
/**
 * pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
 * @mm: the mm_struct of the current context
 *
 * Generic default, used unless the architecture defines
 * __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL; it simply defers to
 * __pte_alloc_one_kernel_noprof().
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
{
        pte_t *table = __pte_alloc_one_kernel_noprof(mm);

        return table;
}
#define pte_alloc_one_kernel(...)        alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
#endif

/**
 * pte_free_kernel - free PTE-level kernel page table memory
 * @mm: the mm_struct of the current context (not used by this implementation)
 * @pte: pointer to the memory containing the page table
 *
 * Looks up the ptdesc for @pte and releases it with pagetable_free().
 */
static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
        struct ptdesc *pt = virt_to_ptdesc(pte);

        pagetable_free(pt);
}

/**
 * __pte_alloc_one - allocate memory for a PTE-level user page table
 * @mm: the mm_struct of the current context (not used by this implementation)
 * @gfp: GFP flags to use for the allocation
 *
 * Allocate an order-0 page table with its ptdesc and run
 * pagetable_pte_ctor() on it.  If the constructor fails, the page table is
 * freed again before returning.
 *
 * This function is intended for architectures that need
 * anything beyond simple page allocation or must have custom GFP flags.
 *
 * Return: `struct page` referencing the ptdesc or %NULL on error
 */
static inline pgtable_t __pte_alloc_one_noprof(struct mm_struct *mm, gfp_t gfp)
{
        struct ptdesc *pt = pagetable_alloc_noprof(gfp, 0);

        if (!pt)
                return NULL;

        if (pagetable_pte_ctor(pt))
                return ptdesc_page(pt);

        /* constructor failed: give the page back */
        pagetable_free(pt);
        return NULL;
}
#define __pte_alloc_one(...)        alloc_hooks(__pte_alloc_one_noprof(__VA_ARGS__))

#ifndef __HAVE_ARCH_PTE_ALLOC_ONE
/**
 * pte_alloc_one - allocate a page for PTE-level user page table
 * @mm: the mm_struct of the current context
 *
 * Generic default, used unless the architecture defines
 * __HAVE_ARCH_PTE_ALLOC_ONE: calls __pte_alloc_one_noprof() with
 * %GFP_PGTABLE_USER.
 *
 * Return: `struct page` referencing the ptdesc or %NULL on error
 */
static inline pgtable_t pte_alloc_one_noprof(struct mm_struct *mm)
{
        pgtable_t table = __pte_alloc_one_noprof(mm, GFP_PGTABLE_USER);

        return table;
}
#define pte_alloc_one(...)        alloc_hooks(pte_alloc_one_noprof(__VA_ARGS__))
#endif

/*
 * Should really implement gc for free page table pages. This could be
 * done with a reference count in struct page.
 */

/**
 * pte_free - free PTE-level user page table memory
 * @mm: the mm_struct of the current context (not used by this implementation)
 * @pte_page: the `struct page` referencing the ptdesc
 *
 * Runs pagetable_pte_dtor() on the ptdesc and then frees it.
 */
static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
{
        struct ptdesc *pt;

        pt = page_ptdesc(pte_page);
        pagetable_pte_dtor(pt);
        pagetable_free(pt);
}


#if CONFIG_PGTABLE_LEVELS > 2

#ifndef __HAVE_ARCH_PMD_ALLOC_ONE
/**
 * pmd_alloc_one - allocate memory for a PMD-level page table
 * @mm: the mm_struct of the current context
 * @addr: not used by this generic implementation
 *
 * Allocate an order-0 page table with its ptdesc and run
 * pagetable_pmd_ctor() on it; the page table is freed again if the
 * constructor fails.
 *
 * Allocations use %GFP_PGTABLE_KERNEL when @mm is &init_mm and
 * %GFP_PGTABLE_USER for every other mm.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
        gfp_t gfp = (mm == &init_mm) ? GFP_PGTABLE_KERNEL : GFP_PGTABLE_USER;
        struct ptdesc *pt = pagetable_alloc_noprof(gfp, 0);

        if (!pt)
                return NULL;

        if (pagetable_pmd_ctor(pt))
                return ptdesc_address(pt);

        /* constructor failed: give the page back */
        pagetable_free(pt);
        return NULL;
}
#define pmd_alloc_one(...)        alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__))
#endif

#ifndef __HAVE_ARCH_PMD_FREE
/*
 * Free a PMD-level page table: BUG if @pmd is not PAGE_SIZE aligned, then
 * run pagetable_pmd_dtor() on its ptdesc and free it.  @mm is not used.
 */
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
        struct ptdesc *pt = virt_to_ptdesc(pmd);
        unsigned long offset_in_page = (unsigned long)pmd & (PAGE_SIZE-1);

        BUG_ON(offset_in_page);
        pagetable_pmd_dtor(pt);
        pagetable_free(pt);
}
#endif

#endif /* CONFIG_PGTABLE_LEVELS > 2 */

#if CONFIG_PGTABLE_LEVELS > 3

/*
 * Allocate an order-0 PUD-level page table and run pagetable_pud_ctor() on
 * its ptdesc.  Uses GFP_PGTABLE_KERNEL when @mm is &init_mm and
 * GFP_PGTABLE_USER otherwise, in both cases with __GFP_HIGHMEM masked out.
 * @addr is not used.  Returns the table's address, or NULL if the
 * allocation fails.
 */
static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
        gfp_t gfp = (mm == &init_mm) ? GFP_PGTABLE_KERNEL : GFP_PGTABLE_USER;
        struct ptdesc *pt;

        gfp &= ~__GFP_HIGHMEM;
        pt = pagetable_alloc_noprof(gfp, 0);
        if (!pt)
                return NULL;

        pagetable_pud_ctor(pt);
        return ptdesc_address(pt);
}
#define __pud_alloc_one(...)        alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__))

#ifndef __HAVE_ARCH_PUD_ALLOC_ONE
/**
 * pud_alloc_one - allocate memory for a PUD-level page table
 * @mm: the mm_struct of the current context
 * @addr: passed through to __pud_alloc_one_noprof()
 *
 * Generic default, used unless the architecture defines
 * __HAVE_ARCH_PUD_ALLOC_ONE.  Allocates a page table using
 * %GFP_PGTABLE_USER for user context and %GFP_PGTABLE_KERNEL for kernel
 * context.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pud_t *pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
        pud_t *table = __pud_alloc_one_noprof(mm, addr);

        return table;
}
#define pud_alloc_one(...)        alloc_hooks(pud_alloc_one_noprof(__VA_ARGS__))
#endif

/*
 * Free a PUD-level page table: BUG if @pud is not PAGE_SIZE aligned, then
 * run pagetable_pud_dtor() on its ptdesc and free it.  @mm is not used.
 */
static inline void __pud_free(struct mm_struct *mm, pud_t *pud)
{
        struct ptdesc *pt = virt_to_ptdesc(pud);
        unsigned long offset_in_page = (unsigned long)pud & (PAGE_SIZE-1);

        BUG_ON(offset_in_page);
        pagetable_pud_dtor(pt);
        pagetable_free(pt);
}

#ifndef __HAVE_ARCH_PUD_FREE
/*
 * Generic pud_free(), used unless the architecture defines
 * __HAVE_ARCH_PUD_FREE: delegates to __pud_free().
 */
static inline void pud_free(struct mm_struct *mm, pud_t *pud)
{
        __pud_free(mm, pud);
}
#endif

#endif /* CONFIG_PGTABLE_LEVELS > 3 */

#ifndef __HAVE_ARCH_PGD_FREE
/*
 * Generic pgd_free(), used unless the architecture defines
 * __HAVE_ARCH_PGD_FREE: look up the ptdesc for @pgd and free it.
 * @mm is not used.
 */
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        struct ptdesc *pt = virt_to_ptdesc(pgd);

        pagetable_free(pt);
}
#endif

#endif /* CONFIG_MMU */

#endif /* __ASM_GENERIC_PGALLOC_H */













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


    1 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
/*
 * net/tipc/name_table.c: TIPC name table code
 *
 * Copyright (c) 2000-2006, 2014-2018, Ericsson AB
 * Copyright (c) 2004-2008, 2010-2014, Wind River Systems
 * Copyright (c) 2020-2021, Red Hat Inc
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <net/sock.h>
#include <linux/list_sort.h>
#include <linux/rbtree_augmented.h>
#include "core.h"
#include "netlink.h"
#include "name_table.h"
#include "name_distr.h"
#include "subscr.h"
#include "bcast.h"
#include "addr.h"
#include "node.h"
#include "group.h"

/**
 * struct service_range - container for all bindings of a service range
 * @lower: service range lower bound
 * @upper: service range upper bound
 * @tree_node: member of service range RB tree
 * @max: largest 'upper' in this node subtree; this is the augmented value
 *   declared through RB_DECLARE_CALLBACKS_MAX (sr_callbacks)
 * @local_publ: list of identical publications made from this node
 *   Used by closest_first lookup and multicast lookup algorithm
 * @all_publ: all publications identical to this one, whatever node and scope
 *   Used by round-robin lookup algorithm
 */
struct service_range {
        u32 lower;
        u32 upper;
        struct rb_node tree_node;
        u32 max;
        struct list_head local_publ;
        struct list_head all_publ;
};

/**
 * struct tipc_service - container for all published instances of a service type
 * @type: 32 bit 'type' value for service
 * @publ_cnt: increasing counter for publications in this service
 * @ranges: rb tree containing all service ranges for this service; its
 *   entries are struct service_range, linked through their tree_node member
 * @service_list: links to adjacent name ranges in hash chain
 * @subscriptions: list of subscriptions for this service type
 * @lock: spinlock controlling access to pertaining service ranges/publications
 * @rcu: RCU callback head used for deferred freeing
 */
struct tipc_service {
        u32 type;
        u32 publ_cnt;
        struct rb_root ranges;
        struct hlist_node service_list;
        struct list_head subscriptions;
        spinlock_t lock; /* Covers service range list */
        struct rcu_head rcu;
};

/* Accessor handed to RB_DECLARE_CALLBACKS_MAX below: a range's own upper bound */
#define service_range_upper(sr) ((sr)->upper)
/*
 * Declare the static 'sr_callbacks' augmented-rbtree callbacks for
 * struct service_range (linked via tree_node), with the u32 'max' member as
 * the augmented value computed from service_range_upper().
 */
RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks,
                         struct service_range, tree_node, u32, max,
                         service_range_upper)

/* Map a struct rb_node pointer back to the struct service_range embedding it */
#define service_range_entry(rbtree_node)                                \
        (container_of(rbtree_node, struct service_range, tree_node))

/*
 * True when the closed interval [sr->lower, sr->upper] intersects the closed
 * interval [start, end].  Note that 'sr' is evaluated twice.
 */
#define service_range_overlap(sr, start, end)                                \
        ((sr)->lower <= (end) && (sr)->upper >= (start))

/**
 * service_range_foreach_match - iterate over tipc service rbtree for each
 *                               range match
 * @sr: the service range pointer as a loop cursor
 * @sc: the pointer to tipc service which holds the service range rbtree
 * @start: beginning of the search range (end >= start) for matching
 * @end: end of the search range (end >= start) for matching
 *
 * Expands to a for loop that starts at service_range_match_first() and
 * advances with service_range_match_next() until that returns NULL.
 * @start and @end appear in both the init and the step expression, so they
 * are evaluated on every iteration.
 */
#define service_range_foreach_match(sr, sc, start, end)                        \
        for (sr = service_range_match_first((sc)->ranges.rb_node,        \
                                            start,                        \
                                            end);                        \
             sr;                                                        \
             sr = service_range_match_next(&(sr)->tree_node,                \
                                           start,                        \
                                           end))

/**
 * service_range_match_first - find first service range matching a range
 * @n: the root node of service range rbtree for searching
 * @start: beginning of the search range (end >= start) for matching
 * @end: end of the search range (end >= start) for matching
 *
 * Descends from @n, using the 'max' value cached in each node to skip
 * subtrees in which every range ends before @start.
 *
 * Return: the leftmost service range node in the rbtree that overlaps the
 * specific range if any. Otherwise, returns NULL.
 */
static struct service_range *service_range_match_first(struct rb_node *n,
                                                       u32 start, u32 end)
{
        /* Empty tree, or no range in it reaches @start: no overlap at all */
        if (!n || service_range_entry(n)->max < start)
                return NULL;

        for (;;) {
                struct rb_node *left = n->rb_left;
                struct rb_node *right;
                struct service_range *cur;

                /* A leftmost overlap range node must be one in the left
                 * subtree. If not, it has lower > end, then nodes on
                 * the right side cannot satisfy the condition either.
                 */
                if (left && service_range_entry(left)->max >= start) {
                        n = left;
                        continue;
                }

                /* Nothing in the left subtree can match, so this node is
                 * the leftmost overlap if it overlaps at all.
                 */
                cur = service_range_entry(n);
                if (service_range_overlap(cur, start, end))
                        return cur;

                /* Only the right side is left to try; give up when this
                 * node already starts beyond @end or when the right
                 * subtree is missing or ends before @start.
                 */
                right = n->rb_right;
                if (cur->lower > end)
                        return NULL;
                if (!right || service_range_entry(right)->max < start)
                        return NULL;

                n = right;
        }
}

/**
 * service_range_match_next - find next service range matching a range
 * @n: tree node of the range from which the search continues
 * @start: beginning of the search range (end >= start) for matching
 * @end: end of the search range (end >= start) for matching
 *
 * Return: the next service range after @n in the rbtree that overlaps
 * [@start, @end], or NULL if there is none.
 */
static struct service_range *service_range_match_next(struct rb_node *n,
                                                      u32 start, u32 end)
{
        struct service_range *cand;
        struct rb_node *up, *right;

        while (n) {
                /* If the right subtree reaches @start, the next match (if
                 * there is one at all) is the leftmost overlap inside it.
                 */
                right = n->rb_right;
                if (right && service_range_entry(right)->max >= start)
                        return service_range_match_first(right, start, end);

                /* Right subtree is exhausted: climb until we arrive at a
                 * parent from its left-hand child.
                 */
                for (up = rb_parent(n); up && n == up->rb_right;
                     up = rb_parent(n))
                        n = up;
                if (!up)
                        return NULL;

                cand = service_range_entry(up);
                if (service_range_overlap(cand, start, end))
                        return cand;

                /* An ancestor starting past @end ends the search */
                if (cand->lower > end)
                        return NULL;

                /* Keep looking from this ancestor */
                n = up;
        }

        return NULL;
}

/* Reduce a service type to an index into the name table's bucket array */
static int hash(int x)
{
        const int mask = TIPC_NAMETBL_SIZE - 1;

        return x & mask;
}

/**
 * tipc_publ_create - create a publication structure
 * @ua: the service range the user is binding to
 * @sk: the address of the socket that is bound
 * @key: publication key
 *
 * Return: a zero-initialized publication carrying copies of @ua's range and
 * scope, @sk and @key, with all list heads initialized; NULL if the atomic
 * allocation fails.
 */
static struct publication *tipc_publ_create(struct tipc_uaddr *ua,
                                            struct tipc_socket_addr *sk,
                                            u32 key)
{
        struct publication *publ;

        publ = kzalloc(sizeof(*publ), GFP_ATOMIC);
        if (publ) {
                publ->key = key;
                publ->scope = ua->scope;
                publ->sk = *sk;
                publ->sr = ua->sr;
                INIT_LIST_HEAD(&publ->binding_sock);
                INIT_LIST_HEAD(&publ->binding_node);
                INIT_LIST_HEAD(&publ->local_publ);
                INIT_LIST_HEAD(&publ->all_publ);
                INIT_LIST_HEAD(&publ->list);
        }
        return publ;
}

/**
 * tipc_service_create - create a service structure for the specified 'type'
 * @net: network namespace
 * @ua: address representing the service to be bound
 *
 * Allocates a zeroed service item with an empty range tree and an empty
 * subscription list, then links it into the name table bucket selected by
 * its type.
 *
 * Return: the new service, or NULL if the allocation fails.
 */
static struct tipc_service *tipc_service_create(struct net *net,
                                                struct tipc_uaddr *ua)
{
        struct name_table *nt = tipc_name_table(net);
        struct tipc_service *sc;

        sc = kzalloc(sizeof(*sc), GFP_ATOMIC);
        if (!sc) {
                pr_warn("Service creation failed, no memory\n");
                return NULL;
        }

        /* Finish initialization before the item becomes visible in the table */
        spin_lock_init(&sc->lock);
        sc->type = ua->sr.type;
        sc->ranges = RB_ROOT;
        INIT_HLIST_NODE(&sc->service_list);
        INIT_LIST_HEAD(&sc->subscriptions);

        hlist_add_head_rcu(&sc->service_list,
                           &nt->services[hash(ua->sr.type)]);
        return sc;
}

/* tipc_service_find_range - find the service range whose bounds are exactly
 * those given in @ua; returns NULL if no such range exists in @sc
 */
static struct service_range *tipc_service_find_range(struct tipc_service *sc,
                                                     struct tipc_uaddr *ua)
{
        struct service_range *sr;

        service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) {
                /* Overlapping is not enough, both bounds must be identical */
                if (sr->lower != ua->sr.lower)
                        continue;
                if (sr->upper != ua->sr.upper)
                        continue;
                return sr;
        }

        return NULL;
}

/* tipc_service_create_range - return the range of @sc whose bounds equal
 * those of publication @p, inserting a new empty one if none exists.
 * Returns NULL only if a new range was needed and could not be allocated.
 */
static struct service_range *tipc_service_create_range(struct tipc_service *sc,
                                                       struct publication *p)
{
        struct rb_node **link = &sc->ranges.rb_node;
        struct rb_node *parent = NULL;
        struct service_range *sr;
        u32 lower = p->sr.lower;
        u32 upper = p->sr.upper;

        while (*link) {
                parent = *link;
                sr = service_range_entry(parent);

                /* Exact range is already present */
                if (sr->lower == lower && sr->upper == upper)
                        return sr;

                /* Raise 'max' along the descent path for the new upper bound */
                if (upper > sr->max)
                        sr->max = upper;

                link = (lower <= sr->lower) ? &parent->rb_left
                                            : &parent->rb_right;
        }

        sr = kzalloc(sizeof(*sr), GFP_ATOMIC);
        if (!sr)
                return NULL;

        sr->lower = lower;
        sr->upper = upper;
        sr->max = upper;
        INIT_LIST_HEAD(&sr->local_publ);
        INIT_LIST_HEAD(&sr->all_publ);
        rb_link_node(&sr->tree_node, parent, link);
        rb_insert_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks);
        return sr;
}

static bool tipc_service_insert_publ(struct net *net,
                                     struct tipc_service *sc,
                                     struct publication *p)
{
        struct tipc_subscription *sub, *tmp;
        struct service_range *sr;
        struct publication *_p;
        u32 node = p->sk.node;
        bool first = false;
        bool res = false;
        u32 key = p->key;

        spin_lock_bh(&sc->lock);
        sr = tipc_service_create_range(sc, p);
        if (!sr)
                goto  exit;

        first = list_empty(&sr->all_publ);

        /* Return if the publication already exists */
        list_for_each_entry(_p, &sr->all_publ, all_publ) {
                if (_p->key == key && (!_p->sk.node || _p->sk.node == node)) {
                        pr_debug("Failed to bind duplicate %u,%u,%u/%u:%u/%u\n",
                                 p->sr.type, p->sr.lower, p->sr.upper,
                                 node, p->sk.ref, key);
                        goto exit;
                }
        }

        if (in_own_node(net, p->sk.node))
                list_add(&p->local_publ, &sr->local_publ);
        list_add(&p->all_publ, &sr->all_publ);
        p->id = sc->publ_cnt++;

        /* Any subscriptions waiting for notification?  */
        list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) {
                tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, first);
        }
        res = true;
exit:
        if (!res)
                pr_warn("Failed to bind to %u,%u,%u\n",
                        p->sr.type, p->sr.lower, p->sr.upper);
        spin_unlock_bh(&sc->lock);
        return res;
}

/**
 * tipc_service_remove_publ - remove a publication from a service
 * @r: service_range to remove publication from
 * @sk: address publishing socket
 * @key: target publication key
 *
 * The first publication in @r whose key equals @key is selected; when
 * @sk->node is non-zero the publication's node must match it as well.
 *
 * Return: the publication, unlinked from both the 'all_publ' and
 * 'local_publ' lists, or NULL if nothing matched.
 */
static struct publication *tipc_service_remove_publ(struct service_range *r,
                                                    struct tipc_socket_addr *sk,
                                                    u32 key)
{
        struct publication *publ;
        u32 node = sk->node;

        list_for_each_entry(publ, &r->all_publ, all_publ) {
                if (publ->key == key && (!node || node == publ->sk.node)) {
                        list_del(&publ->all_publ);
                        list_del(&publ->local_publ);
                        return publ;
                }
        }
        return NULL;
}

/*
 * Publication ids are compared with time_after32(), reused here to order
 * two publications by their 'id' field.
 */
#define publication_after(pa, pb) time_after32((pa)->id, (pb)->id)

/* list_sort() comparison callback: orders entries linked through
 * publication::list by publication_after()
 */
static int tipc_publ_sort(void *priv, const struct list_head *a,
                          const struct list_head *b)
{
        struct publication *left = container_of(a, struct publication, list);
        struct publication *right = container_of(b, struct publication, list);

        return publication_after(left, right);
}

/**
 * tipc_service_subscribe - attach a subscription, and optionally
 * issue the prescribed number of events if there is any service
 * range overlapping with the requested range
 * @service: the tipc_service to attach the @sub to
 * @sub: the subscription to attach
 */
static void tipc_service_subscribe(struct tipc_service *service,
                                   struct tipc_subscription *sub)
{
        struct publication *p, *first, *tmp;
        struct list_head publ_list;
        struct service_range *sr;
        u32 filter, lower, upper;

        filter = sub->s.filter;
        lower = sub->s.seq.lower;
        upper = sub->s.seq.upper;

        tipc_sub_get(sub);
        list_add(&sub->service_list, &service->subscriptions);

        if (filter & TIPC_SUB_NO_STATUS)
                return;

        INIT_LIST_HEAD(&publ_list);
        service_range_foreach_match(sr, service, lower, upper) {
                first = NULL;
                list_for_each_entry(p, &sr->all_publ, all_publ) {
                        if (filter & TIPC_SUB_PORTS)
                                list_add_tail(&p->list, &publ_list);
                        else if (!first || publication_after(first, p))
                                /* Pick this range's *first* publication */
                                first = p;
                }
                if (first)
                        list_add_tail(&first->list, &publ_list);
        }

        /* Sort the publications before reporting */
        list_sort(NULL, &publ_list, tipc_publ_sort);
        list_for_each_entry_safe(p, tmp, &publ_list, list) {
                tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, true);
                list_del_init(&p->list);
        }
}

/* tipc_service_find - look up the service item for the type given in @ua
 *
 * Walks the name table bucket selected by hash(@ua->sr.type) with the RCU
 * list iterator and returns the first service whose type matches, or NULL
 * if the bucket holds no such service.
 */
static struct tipc_service *tipc_service_find(struct net *net,
                                              struct tipc_uaddr *ua)
{
        struct name_table *nt = tipc_name_table(net);
        struct hlist_head *service_head;
        struct tipc_service *service;

        service_head = &nt->services[hash(ua->sr.type)];
        hlist_for_each_entry_rcu(service, service_head, service_list) {
                if (service->type == ua->sr.type)
                        return service;
        }
        return NULL;
}

/* tipc_nametbl_insert_publ - create a publication and add it to the table
 *
 * The service item for the publication's type is looked up and created on
 * demand. Returns the new publication, or NULL (with the publication freed)
 * if allocation, service creation or insertion fails.
 */
struct publication *tipc_nametbl_insert_publ(struct net *net,
                                             struct tipc_uaddr *ua,
                                             struct tipc_socket_addr *sk,
                                             u32 key)
{
        struct publication *publ;
        struct tipc_service *sc;

        publ = tipc_publ_create(ua, sk, key);
        if (!publ)
                return NULL;

        sc = tipc_service_find(net, ua);
        if (!sc)
                sc = tipc_service_create(net, ua);

        if (!sc || !tipc_service_insert_publ(net, sc, publ)) {
                kfree(publ);
                return NULL;
        }
        return publ;
}

/* tipc_nametbl_remove_publ - unlink a publication from the name table
 *
 * Looks up the service for @ua's type, then the range with exactly @ua's
 * bounds, and removes the publication matching @sk/@key from it. Attached
 * subscriptions are notified with TIPC_WITHDRAWN. A range left without
 * publications is erased and freed, and a service left without ranges and
 * subscriptions is unhashed and freed via RCU.
 *
 * Returns the unlinked publication (not freed here), or NULL after logging
 * an error if no matching binding was found.
 */
struct publication *tipc_nametbl_remove_publ(struct net *net,
                                             struct tipc_uaddr *ua,
                                             struct tipc_socket_addr *sk,
                                             u32 key)
{
        struct tipc_subscription *sub, *tmp;
        struct publication *p = NULL;
        struct service_range *sr;
        struct tipc_service *sc;
        bool last;

        sc = tipc_service_find(net, ua);
        if (!sc)
                goto exit;

        spin_lock_bh(&sc->lock);
        sr = tipc_service_find_range(sc, ua);
        if (!sr)
                goto unlock;
        p = tipc_service_remove_publ(sr, sk, key);
        if (!p)
                goto unlock;

        /* Notify any waiting subscriptions */
        last = list_empty(&sr->all_publ);
        list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) {
                tipc_sub_report_overlap(sub, p, TIPC_WITHDRAWN, last);
        }

        /* Remove service range item if this was its last publication */
        if (list_empty(&sr->all_publ)) {
                rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks);
                kfree(sr);
        }

        /* Delete service item if no more publications and subscriptions */
        if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) {
                hlist_del_init_rcu(&sc->service_list);
                /* Freeing is deferred, so sc->lock is still unlocked below */
                kfree_rcu(sc, rcu);
        }
unlock:
        spin_unlock_bh(&sc->lock);
exit:
        if (!p) {
                pr_err("Failed to remove unknown binding: %u,%u,%u/%u:%u/%u\n",
                       ua->sr.type, ua->sr.lower, ua->sr.upper,
                       sk->node, sk->ref, key);
        }
        return p;
}

/**
 * tipc_nametbl_lookup_anycast - perform service instance to socket translation
 * @net: network namespace
 * @ua: service address to look up
 * @sk: address to socket we want to find
 *
 * On entry, a non-zero 'sk->node' indicates the node where we want lookup to be
 * performed, which may not be this one.
 *
 * On exit:
 *
 * - If lookup is deferred to another node, leave 'sk->node' unchanged and
 *   return 'true'.
 * - If lookup is successful, set the 'sk->node' and 'sk->ref' (== portid) which
 *   represent the bound socket and return 'true'.
 * - If lookup fails, return 'false'
 *
 * Note that for legacy users (node configured with Z.C.N address format) the
 * 'closest-first' lookup algorithm must be maintained, i.e., if sk.node is 0
 * we must look in the local binding list first
 */
bool tipc_nametbl_lookup_anycast(struct net *net,
                                 struct tipc_uaddr *ua,
                                 struct tipc_socket_addr *sk)
{
        struct tipc_net *tn = tipc_net(net);
        bool legacy = tn->legacy_addr_format;
        u32 self = tipc_own_addr(net);
        u32 inst = ua->sa.instance;
        struct service_range *r;
        struct tipc_service *sc;
        struct publication *p;
        bool found = false;

        if (!tipc_in_scope(legacy, sk->node, self))
                return true;

        rcu_read_lock();
        sc = tipc_service_find(net, ua);
        if (unlikely(!sc))
                goto exit;

        spin_lock_bh(&sc->lock);
        service_range_foreach_match(r, sc, inst, inst) {
                bool have_local = !list_empty(&r->local_publ);

                /* A strictly local lookup skips ranges without local bindings */
                if (sk->node == self && !have_local)
                        continue;

                /* Local or legacy closest-first lookups rotate the local
                 * list; everything else round-robins over all bindings.
                 */
                if (sk->node == self || (legacy && !sk->node && have_local)) {
                        p = list_first_entry(&r->local_publ,
                                             struct publication, local_publ);
                        list_move_tail(&p->local_publ, &r->local_publ);
                } else {
                        p = list_first_entry(&r->all_publ,
                                             struct publication, all_publ);
                        list_move_tail(&p->all_publ, &r->all_publ);
                }
                *sk = p->sk;
                found = true;
                /* Todo: as for legacy, pick the first matching range only, a
                 * "true" round-robin will be performed as needed.
                 */
                break;
        }
        spin_unlock_bh(&sc->lock);

exit:
        rcu_read_unlock();
        return found;
}

/* tipc_nametbl_lookup_group(): lookup destination(s) in a communication group
 * Pushes one (== group anycast) or more (== group multicast) destination
 * socket/node pairs matching the given address onto @dsts and counts them in
 * @dstcnt. The requester may exclude its own socket through @exclude.
 * Returns true if @dsts is non-empty on exit.
 */
bool tipc_nametbl_lookup_group(struct net *net, struct tipc_uaddr *ua,
                               struct list_head *dsts, int *dstcnt,
                               u32 exclude, bool mcast)
{
        u32 self = tipc_own_addr(net);
        u32 inst = ua->sa.instance;
        struct service_range *sr;
        struct tipc_service *sc;
        struct publication *p;

        *dstcnt = 0;
        rcu_read_lock();
        sc = tipc_service_find(net, ua);
        if (unlikely(!sc))
                goto exit;

        spin_lock_bh(&sc->lock);

        /* Todo: a full search i.e. service_range_foreach_match() instead? */
        sr = service_range_match_first(sc->ranges.rb_node, inst, inst);
        if (sr) {
                list_for_each_entry(p, &sr->all_publ, all_publ) {
                        if (p->scope != ua->scope)
                                continue;
                        /* Leave out the requester's own socket */
                        if (p->sk.node == self && p->sk.ref == exclude)
                                continue;
                        tipc_dest_push(dsts, p->sk.node, p->sk.ref);
                        (*dstcnt)++;
                        if (!mcast) {
                                /* Anycast: rotate the list and stop at one */
                                list_move_tail(&p->all_publ, &sr->all_publ);
                                break;
                        }
                }
        }
        spin_unlock_bh(&sc->lock);
exit:
        rcu_read_unlock();
        return !list_empty(dsts);
}

/* tipc_nametbl_lookup_mcast_sockets(): look up node local destination sockets
 *                                      matching the given address
 * Used on nodes which have received a multicast/broadcast message
 * Pushes the matching local sockets onto @dports
 */
void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua,
                                       struct list_head *dports)
{
        struct service_range *sr;
        struct tipc_service *sc;
        struct publication *p;
        u8 scope = ua->scope;

        rcu_read_lock();
        sc = tipc_service_find(net, ua);
        if (sc) {
                spin_lock_bh(&sc->lock);
                service_range_foreach_match(sr, sc, ua->sr.lower,
                                            ua->sr.upper) {
                        list_for_each_entry(p, &sr->local_publ, local_publ) {
                                /* Scope must match unless any scope is fine */
                                if (scope != p->scope &&
                                    scope != TIPC_ANY_SCOPE)
                                        continue;
                                tipc_dest_push(dports, 0, p->sk.ref);
                        }
                }
                spin_unlock_bh(&sc->lock);
        }
        rcu_read_unlock();
}

/* tipc_nametbl_lookup_mcast_nodes(): look up all destination nodes matching
 *                                    the given address. Used in sending node.
 * Used on nodes which are sending out a multicast/broadcast message
 * Adds the node of every publication in each matching range to @nodes
 */
void tipc_nametbl_lookup_mcast_nodes(struct net *net, struct tipc_uaddr *ua,
                                     struct tipc_nlist *nodes)
{
        struct service_range *sr;
        struct tipc_service *sc;
        struct publication *p;

        rcu_read_lock();
        sc = tipc_service_find(net, ua);
        if (sc) {
                spin_lock_bh(&sc->lock);
                service_range_foreach_match(sr, sc, ua->sr.lower,
                                            ua->sr.upper) {
                        list_for_each_entry(p, &sr->all_publ, all_publ)
                                tipc_nlist_add(nodes, p->sk.node);
                }
                spin_unlock_bh(&sc->lock);
        }
        rcu_read_unlock();
}

/* tipc_nametbl_build_group - build list of communication group members
 *
 * Walks every range of the service addressed by @ua and adds each publication
 * whose scope equals @ua->scope to @grp as a member.
 */
void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
                              struct tipc_uaddr *ua)
{
        struct service_range *sr;
        struct tipc_service *sc;
        struct publication *p;
        struct rb_node *node;

        rcu_read_lock();
        sc = tipc_service_find(net, ua);
        if (sc) {
                spin_lock_bh(&sc->lock);
                for (node = rb_first(&sc->ranges); node;
                     node = rb_next(node)) {
                        sr = service_range_entry(node);
                        list_for_each_entry(p, &sr->all_publ, all_publ) {
                                if (p->scope == ua->scope)
                                        tipc_group_add_member(grp, p->sk.node,
                                                              p->sk.ref,
                                                              p->sr.lower);
                        }
                }
                spin_unlock_bh(&sc->lock);
        }
        rcu_read_unlock();
}

/* tipc_nametbl_publish - add service binding to name table
 *
 * Under the name table lock the binding is refused once TIPC_MAX_PUBL local
 * publications exist; otherwise it is inserted, counted and a publish message
 * is built. The message, if any, is broadcast after the lock is dropped.
 * Returns the new publication or NULL on failure.
 */
struct publication *tipc_nametbl_publish(struct net *net, struct tipc_uaddr *ua,
                                         struct tipc_socket_addr *sk, u32 key)
{
        struct name_table *nt = tipc_name_table(net);
        struct tipc_net *tn = tipc_net(net);
        struct sk_buff *msg = NULL;
        struct publication *publ;
        u32 dests;

        spin_lock_bh(&tn->nametbl_lock);

        if (nt->local_publ_count >= TIPC_MAX_PUBL) {
                pr_warn("Bind failed, max limit %u reached\n", TIPC_MAX_PUBL);
                spin_unlock_bh(&tn->nametbl_lock);
                return NULL;
        }

        publ = tipc_nametbl_insert_publ(net, ua, sk, key);
        if (publ) {
                nt->local_publ_count++;
                msg = tipc_named_publish(net, publ);
        }
        /* Sample the destination count while still holding the lock */
        dests = nt->rc_dests;
        spin_unlock_bh(&tn->nametbl_lock);

        if (msg)
                tipc_node_broadcast(net, msg, dests);
        return publ;
}

/**
 * tipc_nametbl_withdraw - withdraw a service binding
 * @net: network namespace
 * @ua: service address/range being unbound
 * @sk: address of the socket being unbound from
 * @key: target publication key
 *
 * The publication is removed and scheduled for RCU freeing under the name
 * table lock; a withdraw message, if one was built, is broadcast after the
 * lock has been released.
 */
void tipc_nametbl_withdraw(struct net *net, struct tipc_uaddr *ua,
                           struct tipc_socket_addr *sk, u32 key)
{
        struct name_table *nt = tipc_name_table(net);
        struct tipc_net *tn = tipc_net(net);
        struct sk_buff *msg = NULL;
        struct publication *publ;
        u32 dests;

        spin_lock_bh(&tn->nametbl_lock);

        publ = tipc_nametbl_remove_publ(net, ua, sk, key);
        if (!publ)
                goto unlock;

        nt->local_publ_count--;
        msg = tipc_named_withdraw(net, publ);
        list_del_init(&publ->binding_sock);
        kfree_rcu(publ, rcu);
unlock:
        /* Sample the destination count while still holding the lock */
        dests = nt->rc_dests;
        spin_unlock_bh(&tn->nametbl_lock);

        if (msg)
                tipc_node_broadcast(net, msg, dests);
}

/**
 * tipc_nametbl_subscribe - add a subscription object to the name table
 * @sub: subscription to add
 *
 * The service item for the subscribed type is created if it does not exist.
 *
 * Return: true if the subscription was attached, false if no service item
 * could be found or created.
 */
bool tipc_nametbl_subscribe(struct tipc_subscription *sub)
{
        struct tipc_net *tn = tipc_net(sub->net);
        u32 type = sub->s.seq.type;
        struct tipc_service *sc;
        struct tipc_uaddr ua;
        bool attached = false;

        tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, type,
                   sub->s.seq.lower, sub->s.seq.upper);

        spin_lock_bh(&tn->nametbl_lock);
        sc = tipc_service_find(sub->net, &ua);
        if (!sc)
                sc = tipc_service_create(sub->net, &ua);
        if (!sc) {
                pr_warn("Failed to subscribe for {%u,%u,%u}\n",
                        type, sub->s.seq.lower, sub->s.seq.upper);
                goto unlock;
        }

        spin_lock_bh(&sc->lock);
        tipc_service_subscribe(sc, sub);
        spin_unlock_bh(&sc->lock);
        attached = true;
unlock:
        spin_unlock_bh(&tn->nametbl_lock);
        return attached;
}

/**
 * tipc_nametbl_unsubscribe - remove a subscription object from name table
 * @sub: subscription to remove
 *
 * Unlinks @sub from its service, drops the reference held on it and deletes
 * the service item when it is left without ranges and subscriptions. Nothing
 * is done if no service exists for the subscribed type.
 */
void tipc_nametbl_unsubscribe(struct tipc_subscription *sub)
{
        struct tipc_net *tn = tipc_net(sub->net);
        struct tipc_service *sc;
        struct tipc_uaddr ua;

        tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE,
                   sub->s.seq.type, sub->s.seq.lower, sub->s.seq.upper);

        spin_lock_bh(&tn->nametbl_lock);
        sc = tipc_service_find(sub->net, &ua);
        if (sc) {
                spin_lock_bh(&sc->lock);
                list_del_init(&sub->service_list);
                tipc_sub_put(sub);

                /* Drop the service once nothing refers to it any longer */
                if (RB_EMPTY_ROOT(&sc->ranges) &&
                    list_empty(&sc->subscriptions)) {
                        hlist_del_init_rcu(&sc->service_list);
                        kfree_rcu(sc, rcu);
                }
                spin_unlock_bh(&sc->lock);
        }
        spin_unlock_bh(&tn->nametbl_lock);
}

/* tipc_nametbl_init - allocate and initialize the name table of a namespace
 *
 * Returns 0 on success or -ENOMEM if the table cannot be allocated.
 */
int tipc_nametbl_init(struct net *net)
{
        struct tipc_net *tn = tipc_net(net);
        struct name_table *table;
        int bucket;

        table = kzalloc(sizeof(*table), GFP_KERNEL);
        if (!table)
                return -ENOMEM;

        for (bucket = 0; bucket < TIPC_NAMETBL_SIZE; bucket++)
                INIT_HLIST_HEAD(&table->services[bucket]);

        INIT_LIST_HEAD(&table->node_scope);
        INIT_LIST_HEAD(&table->cluster_scope);
        rwlock_init(&table->cluster_scope_lock);

        tn->nametbl = table;
        spin_lock_init(&tn->nametbl_lock);
        return 0;
}

/**
 * tipc_service_delete - purge all publications for a service and delete it
 * @net: the associated network namespace
 * @sc: tipc_service to delete
 */
static void tipc_service_delete(struct net *net, struct tipc_service *sc)
{
        struct service_range *sr, *tmpr;
        struct publication *p, *tmp;

        spin_lock_bh(&sc->lock);
        rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) {
                list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) {
                        tipc_service_remove_publ(sr, &p->sk, p->key);
                        kfree_rcu(p, rcu);
                }
                rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks);
                kfree(sr);
        }
        hlist_del_init_rcu(&sc->service_list);
        spin_unlock_bh(&sc->lock);
        kfree_rcu(sc, rcu);
}

void tipc_nametbl_stop(struct net *net)
{
        struct name_table *nt = tipc_name_table(net);
        struct tipc_net *tn = tipc_net(net);
        struct hlist_head *service_head;
        struct tipc_service *service;
        u32 i;

        /* Verify name table is empty and purge any lingering
         * publications, then release the name table
         */
        spin_lock_bh(&tn->nametbl_lock);
        for (i = 0; i < TIPC_NAMETBL_SIZE; i++) {
                if (hlist_empty(&nt->services[i]))
                        continue;
                service_head = &nt->services[i];
                hlist_for_each_entry_rcu(service, service_head, service_list) {
                        tipc_service_delete(net, service);
                }
        }
        spin_unlock_bh(&tn->nametbl_lock);

        synchronize_net();
        kfree(nt);
}

/* __tipc_nl_add_nametable_publ - emit one netlink message per publication
 * of service range @sr into @msg
 *
 * A non-zero *@last_key resumes at the publication with that key; if it is
 * no longer in the range, -EPIPE is returned. While emitting, *@last_key
 * tracks the publication being written, so that on -EMSGSIZE it names the
 * entry that did not fit. After a complete pass *@last_key is reset to 0
 * and 0 is returned.
 */
static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg,
                                        struct tipc_service *service,
                                        struct service_range *sr,
                                        u32 *last_key)
{
        struct publication *p;
        struct nlattr *attrs;
        struct nlattr *b;
        void *hdr;

        if (*last_key) {
                /* Resume: locate the publication we stopped at last time */
                list_for_each_entry(p, &sr->all_publ, all_publ)
                        if (p->key == *last_key)
                                break;
                if (list_entry_is_head(p, &sr->all_publ, all_publ))
                        return -EPIPE;
        } else {
                p = list_first_entry(&sr->all_publ,
                                     struct publication,
                                     all_publ);
        }

        list_for_each_entry_from(p, &sr->all_publ, all_publ) {
                /* Record the resume point before anything can fail */
                *last_key = p->key;

                hdr = genlmsg_put(msg->skb, msg->portid, msg->seq,
                                  &tipc_genl_family, NLM_F_MULTI,
                                  TIPC_NL_NAME_TABLE_GET);
                if (!hdr)
                        return -EMSGSIZE;

                attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NAME_TABLE);
                if (!attrs)
                        goto msg_full;

                b = nla_nest_start_noflag(msg->skb, TIPC_NLA_NAME_TABLE_PUBL);
                if (!b)
                        goto attr_msg_full;

                if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_TYPE, service->type))
                        goto publ_msg_full;
                if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_LOWER, sr->lower))
                        goto publ_msg_full;
                if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_UPPER, sr->upper))
                        goto publ_msg_full;
                if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_SCOPE, p->scope))
                        goto publ_msg_full;
                if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_NODE, p->sk.node))
                        goto publ_msg_full;
                if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->sk.ref))
                        goto publ_msg_full;
                if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_KEY, p->key))
                        goto publ_msg_full;

                nla_nest_end(msg->skb, b);
                nla_nest_end(msg->skb, attrs);
                genlmsg_end(msg->skb, hdr);
        }
        /* Whole range emitted: nothing to resume from */
        *last_key = 0;

        return 0;

        /* Unwind the partially built message, innermost nest first */
publ_msg_full:
        nla_nest_cancel(msg->skb, b);
attr_msg_full:
        nla_nest_cancel(msg->skb, attrs);
msg_full:
        genlmsg_cancel(msg->skb, hdr);

        return -EMSGSIZE;
}

/* __tipc_nl_service_range_list - dump the publications of every range of @sc
 *
 * Ranges whose lower bound is below *@last_lower are skipped. If emitting a
 * range fails, its lower bound is stored in *@last_lower and the error is
 * returned; after a complete pass *@last_lower is reset to 0.
 */
static int __tipc_nl_service_range_list(struct tipc_nl_msg *msg,
                                        struct tipc_service *sc,
                                        u32 *last_lower, u32 *last_key)
{
        struct service_range *sr;
        struct rb_node *node;
        int rc;

        for (node = rb_first(&sc->ranges); node; node = rb_next(node)) {
                sr = service_range_entry(node);
                if (sr->lower >= *last_lower) {
                        rc = __tipc_nl_add_nametable_publ(msg, sc, sr,
                                                          last_key);
                        if (rc) {
                                *last_lower = sr->lower;
                                return rc;
                        }
                }
        }
        *last_lower = 0;
        return 0;
}

/*
 * Dump the services of the name table of @net into @msg.
 *
 * *@last_type, *@last_lower and *@last_key are resume cursors that are read
 * on entry and updated on exit. The walk starts in the hash bucket of
 * *@last_type when that is non-zero, otherwise in bucket 0, and proceeds
 * through all remaining buckets.
 *
 * Returns 0 when every bucket has been walked, -EPIPE when the service to
 * resume from can no longer be found, or the error returned by
 * __tipc_nl_service_range_list() (in which case *@last_type is set to the
 * type of the service that was being dumped).
 *
 * The list is traversed with the _rcu iterators; tipc_nl_name_table_dump()
 * calls this function under rcu_read_lock().
 */
static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg,
                                u32 *last_type, u32 *last_lower, u32 *last_key)
{
        struct tipc_net *tn = tipc_net(net);
        struct tipc_service *service = NULL;
        struct hlist_head *head;
        struct tipc_uaddr ua;
        int err;
        int i;

        /* Resume in the bucket of the last dumped type, else start over. */
        if (*last_type)
                i = hash(*last_type);
        else
                i = 0;

        for (; i < TIPC_NAMETBL_SIZE; i++) {
                head = &tn->nametbl->services[i];

                if (*last_type ||
                    (!i && *last_key && (*last_lower == *last_key))) {
                        /* Resuming: look the interrupted service up again. */
                        tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE,
                                   *last_type, *last_lower, *last_lower);
                        service = tipc_service_find(net, &ua);
                        if (!service)
                                return -EPIPE;
                } else {
                        /* Run the iterator for one step only; presumably this
                         * leaves 'service' at the bucket's first entry, or
                         * NULL for an empty bucket (see the check below).
                         */
                        hlist_for_each_entry_rcu(service, head, service_list)
                                break;
                        if (!service)
                                continue;
                }

                /* Continue from 'service' to the end of this bucket. */
                hlist_for_each_entry_from_rcu(service, service_list) {
                        spin_lock_bh(&service->lock);
                        err = __tipc_nl_service_range_list(msg, service,
                                                           last_lower,
                                                           last_key);

                        if (err) {
                                /* Record the type so the next call resumes here. */
                                *last_type = service->type;
                                spin_unlock_bh(&service->lock);
                                return err;
                        }
                        spin_unlock_bh(&service->lock);
                }
                /* Bucket finished: following buckets start from their head. */
                *last_type = 0;
        }
        return 0;
}

/*
 * Netlink dump callback for the TIPC name table.
 *
 * The resume state lives in cb->args[]: [0] last service type, [1] last
 * lower bound, [2] last publication key, [3] "dump finished" flag. Each call
 * loads the cursors, lets tipc_nl_service_list() fill @skb and stores the
 * updated cursors back.
 *
 * Returns 0 once the dump has been marked finished by an earlier call,
 * otherwise skb->len.
 */
int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        u32 type_cursor = cb->args[0];
        u32 lower_cursor = cb->args[1];
        u32 key_cursor = cb->args[2];
        int finished = cb->args[3];
        struct tipc_nl_msg msg;
        int rc;

        if (finished)
                return 0;

        msg.skb = skb;
        msg.portid = NETLINK_CB(cb->skb).portid;
        msg.seq = cb->nlh->nlmsg_seq;

        rcu_read_lock();
        rc = tipc_nl_service_list(net, &msg, &type_cursor,
                                  &lower_cursor, &key_cursor);
        switch (rc) {
        case 0:
                finished = 1;
                break;
        case -EMSGSIZE:
                /* skb is full; the cursors tell the next call where to resume. */
                break;
        default:
                /* We never set cb->seq or call nl_dump_check_consistent(), so
                 * setting prev_seq here makes the consistency check in the
                 * netlink callback handler fail. As a result the NLMSG_DONE
                 * message carries the NLM_F_DUMP_INTR flag when we hit an
                 * error.
                 */
                cb->prev_seq = 1;
                break;
        }
        rcu_read_unlock();

        cb->args[0] = type_cursor;
        cb->args[1] = lower_cursor;
        cb->args[2] = key_cursor;
        cb->args[3] = finished;

        return skb->len;
}

/*
 * Look up the destination entry matching both @node and @port in list @l.
 *
 * Returns the first matching entry, or NULL if there is none.
 */
struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port)
{
        struct tipc_dest *entry;

        list_for_each_entry(entry, l, list) {
                if (entry->node != node)
                        continue;
                if (entry->port != port)
                        continue;
                return entry;
        }
        return NULL;
}

/*
 * Add a (@node, @port) destination at the head of list @l.
 *
 * Returns true if a new entry was added; false if an identical entry is
 * already on the list or the GFP_ATOMIC allocation failed.
 */
bool tipc_dest_push(struct list_head *l, u32 node, u32 port)
{
        struct tipc_dest *entry;

        /* Duplicates are rejected rather than added twice. */
        if (tipc_dest_find(l, node, port))
                return false;

        entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
        if (likely(entry)) {
                entry->node = node;
                entry->port = port;
                list_add(&entry->list, l);
                return true;
        }
        return false;
}

/*
 * Remove and free the first destination entry of list @l.
 *
 * @node and @port are optional outputs: each one that is non-NULL receives
 * the corresponding field of the removed entry.
 *
 * Returns false if the list is empty, true otherwise.
 */
bool tipc_dest_pop(struct list_head *l, u32 *node, u32 *port)
{
        struct tipc_dest *first;

        if (list_empty(l))
                return false;

        first = list_first_entry(l, typeof(*first), list);

        /* Copy out the requested fields before the entry is freed. */
        if (port)
                *port = first->port;
        if (node)
                *node = first->node;

        list_del(&first->list);
        kfree(first);
        return true;
}

/*
 * Remove and free the destination entry matching (@node, @port) in list @l.
 *
 * Returns true if an entry was found and deleted, false otherwise.
 */
bool tipc_dest_del(struct list_head *l, u32 node, u32 port)
{
        struct tipc_dest *match = tipc_dest_find(l, node, port);

        if (match) {
                list_del(&match->list);
                kfree(match);
                return true;
        }
        return false;
}

/*
 * Unlink and free every destination entry on list @l.
 */
void tipc_dest_list_purge(struct list_head *l)
{
        struct tipc_dest *entry;
        struct tipc_dest *next;

        /* The _safe iterator is used because each entry is freed in the loop. */
        list_for_each_entry_safe(entry, next, l, list) {
                list_del(&entry->list);
                kfree(entry);
        }
}























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 1994 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * General FPU state handling cleanups
 *        Gareth Hughes <gareth@valinux.com>, May 2000
 * x86-64 work by Andi Kleen 2002
 */

#ifndef _ASM_X86_FPU_API_H
#define _ASM_X86_FPU_API_H
#include <linux/bottom_half.h>

#include <asm/fpu/types.h>

/*
 * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
 * disables preemption so be careful if you intend to use it for long periods
 * of time.
 * If you intend to use the FPU in irq/softirq you need to check first with
 * irq_fpu_usable() if it is possible.
 */

/* Kernel FPU states to initialize in kernel_fpu_begin_mask() */
#define KFPU_387        _BITUL(0)        /* 387 state will be initialized */
#define KFPU_MXCSR        _BITUL(1)        /* MXCSR will be initialized */

extern void kernel_fpu_begin_mask(unsigned int kfpu_mask);
extern void kernel_fpu_end(void);
extern bool irq_fpu_usable(void);
extern void fpregs_mark_activate(void);

/*
 * Code that is unaware of kernel_fpu_begin_mask() can use this.
 *
 * It calls kernel_fpu_begin_mask() with KFPU_MXCSR, plus KFPU_387 on
 * kernels built without CONFIG_X86_64.
 */
static inline void kernel_fpu_begin(void)
{
        /* MXCSR initialization is requested on every configuration. */
        unsigned int kfpu_mask = KFPU_MXCSR;

#ifndef CONFIG_X86_64
        /*
         * 32-bit kernel code may use 387 operations as well as SSE2, etc,
         * as long as it checks that the CPU has the required capability.
         * Any 64-bit code that uses 387 instructions must explicitly
         * request KFPU_387 instead.
         */
        kfpu_mask |= KFPU_387;
#endif
        kernel_fpu_begin_mask(kfpu_mask);
}

/*
 * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate.
 * A context switch will (and softirq might) save CPU's FPU registers to
 * fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in
 * a random state.
 *
 * local_bh_disable() protects against both preemption and soft interrupts
 * on !RT kernels.
 *
 * On RT kernels local_bh_disable() is not sufficient because it only
 * serializes soft interrupt related sections via a local lock, but stays
 * preemptible. Disabling preemption is the right choice here as bottom
 * half processing is always in thread context on RT kernels so it
 * implicitly prevents bottom half processing as well.
 *
 * Disabling preemption also serializes against kernel_fpu_begin().
 */
static inline void fpregs_lock(void)
{
        /* RT kernels disable preemption; all others disable bottom halves. */
        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        else
                local_bh_disable();
}

/* Undo fpregs_lock(): re-enable whatever that function disabled. */
static inline void fpregs_unlock(void)
{
        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable();
        else
                local_bh_enable();
}

/*
 * FPU state gets lazily restored before returning to userspace. So when in the
 * kernel, the valid FPU state may be kept in the buffer. This function will force
 * restore all the fpu state to the registers early if needed, and lock them from
 * being automatically saved/restored. Then FPU state can be modified safely in the
 * registers, before unlocking with fpregs_unlock().
 */
void fpregs_lock_and_load(void);

#ifdef CONFIG_X86_DEBUG_FPU
extern void fpregs_assert_state_consistent(void);
#else
static inline void fpregs_assert_state_consistent(void) { }
#endif

/*
 * Load the task FPU state before returning to userspace.
 */
extern void switch_fpu_return(void);

/*
 * Query the presence of one or more xfeatures. Works on any legacy CPU as well.
 *
 * If 'feature_name' is set then put a human-readable description of
 * the feature there as well - this can be used to print error (or success)
 * messages.
 */
extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);

/* Trap handling */
extern int  fpu__exception_code(struct fpu *fpu, int trap_nr);
extern void fpu_sync_fpstate(struct fpu *fpu);
extern void fpu_reset_from_exception_fixup(void);

/* Boot, hotplug and resume */
extern void fpu__init_cpu(void);
extern void fpu__init_system(void);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);

#ifdef CONFIG_MATH_EMULATION
extern void fpstate_init_soft(struct swregs_state *soft);
#else
static inline void fpstate_init_soft(struct swregs_state *soft) {}
#endif

/* State tracking */
DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);

/* Process cleanup */
#ifdef CONFIG_X86_64
extern void fpstate_free(struct fpu *fpu);
#else
static inline void fpstate_free(struct fpu *fpu) { }
#endif

/* fpstate-related functions which are exported to KVM */
extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature);

extern u64 xstate_get_guest_group_perm(void);

extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);


/* KVM specific functions */
extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu);
extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu);
extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest);
extern int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures);

#ifdef CONFIG_X86_64
extern void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd);
extern void fpu_sync_guest_vmexit_xfd_state(void);
#else
static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { }
static inline void fpu_sync_guest_vmexit_xfd_state(void) { }
#endif

extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
                                           unsigned int size, u64 xfeatures, u32 pkru);
extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru);

/* Set the is_confidential flag on the fpstate that @gfpu points to. */
static inline void fpstate_set_confidential(struct fpu_guest *gfpu)
{
        gfpu->fpstate->is_confidential = true;
}

/* Return the is_confidential flag of the fpstate that @gfpu points to. */
static inline bool fpstate_is_confidential(struct fpu_guest *gfpu)
{
        return gfpu->fpstate->is_confidential;
}

/* prctl */
extern long fpu_xstate_prctl(int option, unsigned long arg2);

extern void fpu_idle_fpregs(void);

#endif /* _ASM_X86_FPU_API_H */








































































































































































































































































































































































    1 











































    1 








    1 
    1 

    1 









    1 


    1 









    1 

    1 






    1 
    1 












    1 













    1 







    1 



    1 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Functions to manage eBPF programs attached to cgroups
 *
 * Copyright (c) 2016 Daniel Mack
 */

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/filter.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/string.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <linux/bpf_lsm.h>
#include <linux/bpf_verifier.h>
#include <net/sock.h>
#include <net/bpf_sk_storage.h>

#include "../cgroup/cgroup-internal.h"

DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);

/* Run every program in @cgrp's effective array for @atype.
 *
 * @retval seeds the result. Each program is invoked through @run_prog with
 * @ctx. When @ret_flags is non-NULL, the bits above bit 0 of a program's
 * return value are OR-ed into *@ret_flags and only bit 0 is kept as the
 * verdict. A zero verdict turns the result into -EPERM unless the run
 * context's retval already is an error value.
 *
 * Returns the retval held by the run context after the last program.
 *
 * __always_inline is necessary to prevent indirect call through run_prog
 * function pointer.
 */
static __always_inline int
bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
                      enum cgroup_bpf_attach_type atype,
                      const void *ctx, bpf_prog_run_fn run_prog,
                      int retval, u32 *ret_flags)
{
        const struct bpf_prog_array_item *cur;
        const struct bpf_prog_array *progs;
        const struct bpf_prog *prog;
        struct bpf_run_ctx *saved_ctx;
        struct bpf_cg_run_ctx run_ctx;
        u32 verdict;

        run_ctx.retval = retval;
        migrate_disable();
        rcu_read_lock();
        progs = rcu_dereference(cgrp->effective[atype]);
        saved_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);

        /* The array is terminated by an item whose prog pointer is NULL. */
        for (cur = &progs->items[0]; (prog = READ_ONCE(cur->prog)); cur++) {
                run_ctx.prog_item = cur;
                verdict = run_prog(prog, ctx);
                if (ret_flags) {
                        /* Upper bits are flags, bit 0 is the verdict. */
                        *ret_flags |= verdict >> 1;
                        verdict &= 1;
                }
                if (verdict || IS_ERR_VALUE((long)run_ctx.retval))
                        continue;
                run_ctx.retval = -EPERM;
        }

        bpf_reset_run_ctx(saved_ctx);
        rcu_read_unlock();
        migrate_enable();
        return run_ctx.retval;
}

/*
 * Run the cgroup programs for an LSM hook whose first argument is a
 * struct sock pointer.
 *
 * @ctx is read as an array of u64 arguments; element 0 holds the sock.
 * @insn is the insnsi member of the shim program, from which the program
 * itself (and its cgroup attach type) is recovered.
 *
 * Returns 0 if the socket has no cgroup, otherwise the result of
 * bpf_prog_run_array_cg().
 */
unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
                                       const struct bpf_insn *insn)
{
        const struct bpf_prog *shim_prog;
        struct cgroup *cgrp;
        struct sock *sk;
        u64 *args = (u64 *)ctx;

        sk = (void *)(unsigned long)args[0];

        /* Open-coded container_of(insn, struct bpf_prog, insnsi). */
        shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));

        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        if (unlikely(!cgrp))
                return 0;

        return bpf_prog_run_array_cg(&cgrp->bpf,
                                     shim_prog->aux->cgroup_atype,
                                     ctx, bpf_prog_run, 0, NULL);
}

/*
 * Run the cgroup programs for an LSM hook whose first argument is a
 * struct socket pointer.
 *
 * @ctx is read as an array of u64 arguments; element 0 holds the socket,
 * and the cgroup is taken from its ->sk. @insn is the insnsi member of the
 * shim program, from which the program itself (and its cgroup attach type)
 * is recovered.
 *
 * Returns 0 if the socket has no cgroup, otherwise the result of
 * bpf_prog_run_array_cg().
 */
unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx,
                                         const struct bpf_insn *insn)
{
        const struct bpf_prog *shim_prog;
        struct cgroup *cgrp;
        struct socket *sock;
        u64 *args = (u64 *)ctx;

        sock = (void *)(unsigned long)args[0];

        /* Open-coded container_of(insn, struct bpf_prog, insnsi). */
        shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));

        cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data);
        if (unlikely(!cgrp))
                return 0;

        return bpf_prog_run_array_cg(&cgrp->bpf,
                                     shim_prog->aux->cgroup_atype,
                                     ctx, bpf_prog_run, 0, NULL);
}

/*
 * Run the cgroup programs for an LSM hook against the default-hierarchy
 * cgroup of the current task.
 *
 * @insn is the insnsi member of the shim program, from which the program
 * itself (and its cgroup attach type) is recovered.
 *
 * Returns 0 if the task has no such cgroup, otherwise the result of
 * bpf_prog_run_array_cg().
 */
unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
                                          const struct bpf_insn *insn)
{
        const struct bpf_prog *shim_prog;
        struct cgroup *cgrp;

        /* Open-coded container_of(insn, struct bpf_prog, insnsi). */
        shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));

        /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */
        cgrp = task_dfl_cgroup(current);
        if (unlikely(!cgrp))
                return 0;

        return bpf_prog_run_array_cg(&cgrp->bpf,
                                     shim_prog->aux->cgroup_atype,
                                     ctx, bpf_prog_run, 0, NULL);
}

#ifdef CONFIG_BPF_LSM
/*
 * One slot of the LSM attach-type table. Slot i corresponds to the cgroup
 * attach type CGROUP_LSM_START + i.
 */
struct cgroup_lsm_atype {
        /* BTF id bound to this slot; 0 means the slot is free. */
        u32 attach_btf_id;
        /* Raised by bpf_cgroup_atype_get(), dropped by bpf_cgroup_atype_put(). */
        int refcnt;
};

/* Slot table; bpf_cgroup_atype_find()/_get() assert cgroup_mutex is held and
 * bpf_cgroup_atype_put() takes cgroup_lock() around its update.
 */
static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];

/*
 * Map a BPF attach type (and, for BPF_LSM_CGROUP, a BTF id) to a cgroup
 * attach type.
 *
 * Non-LSM attach types are translated by to_cgroup_bpf_attach_type(). For
 * BPF_LSM_CGROUP the slot already bound to @attach_btf_id is preferred;
 * failing that, the lowest free slot (attach_btf_id == 0) is returned. The
 * slot is not claimed here. Returns -E2BIG when no slot matches and none is
 * free.
 *
 * Must be called with cgroup_mutex held.
 */
static enum cgroup_bpf_attach_type
bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
{
        int free_slot = -1;
        int slot;

        lockdep_assert_held(&cgroup_mutex);

        if (attach_type != BPF_LSM_CGROUP)
                return to_cgroup_bpf_attach_type(attach_type);

        for (slot = 0; slot < ARRAY_SIZE(cgroup_lsm_atype); slot++) {
                u32 bound_id = cgroup_lsm_atype[slot].attach_btf_id;

                /* An existing binding always wins over a free slot. */
                if (bound_id == attach_btf_id)
                        return CGROUP_LSM_START + slot;

                /* Remember only the first free slot seen. */
                if (!bound_id && free_slot < 0)
                        free_slot = slot;
        }

        if (free_slot >= 0)
                return CGROUP_LSM_START + free_slot;

        return -E2BIG;
}

/*
 * Bind @attach_btf_id to the LSM slot of @cgroup_atype and take a reference
 * on that slot.
 *
 * Warns once if the slot is already bound to a different BTF id. Must be
 * called with cgroup_mutex held.
 */
void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype)
{
        struct cgroup_lsm_atype *slot =
                &cgroup_lsm_atype[cgroup_atype - CGROUP_LSM_START];

        lockdep_assert_held(&cgroup_mutex);

        WARN_ON_ONCE(slot->attach_btf_id &&
                     slot->attach_btf_id != attach_btf_id);

        slot->attach_btf_id = attach_btf_id;
        slot->refcnt++;
}

void bpf_cgroup_atype_put(int cgroup_atype)
{
        int i = cgroup_atype - CGROUP_LSM_START;

        cgroup_lock();
        if (--cgroup_lsm_atype[i].refcnt <= 0)
                cgroup_lsm_atype[i].attach_btf_id = 0;
        WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0);
        cgroup_unlock();
}
#else
/* Variant used when CONFIG_BPF_LSM is disabled: BPF_LSM_CGROUP is rejected
 * with -EOPNOTSUPP, every other attach type is translated directly.
 */
static enum cgroup_bpf_attach_type
bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
{
        if (attach_type == BPF_LSM_CGROUP)
                return -EOPNOTSUPP;

        return to_cgroup_bpf_attach_type(attach_type);
}
#endif /* CONFIG_BPF_LSM */

/**
 * cgroup_bpf_offline() - start tearing down the bpf state of a cgroup
 * @cgrp: the cgroup going offline
 *
 * Takes a reference on @cgrp and kills cgrp->bpf.refcnt.
 * cgroup_bpf_release(), the work scheduled from that refcount's release
 * callback, ends with a cgroup_put() on the same cgroup.
 */
void cgroup_bpf_offline(struct cgroup *cgrp)
{
        cgroup_get(cgrp);
        percpu_ref_kill(&cgrp->bpf.refcnt);
}

static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
{
        enum bpf_cgroup_storage_type stype;

        for_each_cgroup_storage_type(stype)
                bpf_cgroup_storage_free(storages[stype]);
}

/* Fill @storages with one storage per cgroup storage map used by @prog.
 *
 * An existing storage for the (@cgrp, @type) key is reused. Otherwise a new
 * one is allocated and additionally recorded in @new_storages, so the caller
 * can tell which entries it owns. Storage types @prog has no map for are
 * left untouched.
 *
 * Returns 0 on success. On allocation failure everything recorded in
 * @new_storages is freed and -ENOMEM is returned.
 */
static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
                                     struct bpf_cgroup_storage *new_storages[],
                                     enum bpf_attach_type type,
                                     struct bpf_prog *prog,
                                     struct cgroup *cgrp)
{
        struct bpf_cgroup_storage_key key;
        enum bpf_cgroup_storage_type stype;

        key.cgroup_inode_id = cgroup_id(cgrp);
        key.attach_type = type;

        for_each_cgroup_storage_type(stype) {
                struct bpf_map *map = prog->aux->cgroup_storage[stype];
                struct bpf_cgroup_storage *found;

                if (!map)
                        continue;

                found = cgroup_storage_lookup((void *)map, &key, false);
                if (found) {
                        /* reuse the storage that is already there */
                        storages[stype] = found;
                        continue;
                }

                storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
                if (IS_ERR(storages[stype])) {
                        bpf_cgroup_storages_free(new_storages);
                        return -ENOMEM;
                }

                new_storages[stype] = storages[stype];
        }

        return 0;
}

static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
                                       struct bpf_cgroup_storage *src[])
{
        enum bpf_cgroup_storage_type stype;

        for_each_cgroup_storage_type(stype)
                dst[stype] = src[stype];
}

static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
                                     struct cgroup *cgrp,
                                     enum bpf_attach_type attach_type)
{
        enum bpf_cgroup_storage_type stype;

        for_each_cgroup_storage_type(stype)
                bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
}

/* Called when bpf_cgroup_link is auto-detached from dying cgroup.
 * It drops the link's cgroup reference and clears link->cgroup, which marks
 * the link as defunct: cgroup_bpf_replace() treats a NULL link->cgroup as
 * "already auto-released" and fails with -ENOLINK.
 * It doesn't free link memory, which will eventually be done by bpf_link's
 * release() callback, when its last FD is closed.
 */
static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
{
        cgroup_put(link->cgroup);
        link->cgroup = NULL;
}

/**
 * cgroup_bpf_release() - put references of all bpf programs and
 *                        release all cgroup bpf data
 * @work: work structure embedded into the cgroup to modify
 *
 * Queued by cgroup_bpf_release_fn(). Detaches every program and link, frees
 * the effective arrays and the cgroup storages under the cgroup lock, then
 * drops the bpf references on all ancestors and finally the cgroup reference
 * on the cgroup itself.
 */
static void cgroup_bpf_release(struct work_struct *work)
{
        struct cgroup *p, *cgrp = container_of(work, struct cgroup,
                                               bpf.release_work);
        struct bpf_prog_array *old_array;
        struct list_head *storages = &cgrp->bpf.storages;
        struct bpf_cgroup_storage *storage, *stmp;

        unsigned int atype;

        cgroup_lock();

        for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
                struct hlist_head *progs = &cgrp->bpf.progs[atype];
                struct bpf_prog_list *pl;
                struct hlist_node *pltmp;

                /* _safe variant: entries are unlinked and freed while walking */
                hlist_for_each_entry_safe(pl, pltmp, progs, node) {
                        hlist_del(&pl->node);
                        /* directly attached program: drop shim (LSM only) and ref */
                        if (pl->prog) {
                                if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
                                        bpf_trampoline_unlink_cgroup_shim(pl->prog);
                                bpf_prog_put(pl->prog);
                        }
                        /* link-based attachment: the link stays alive but is
                         * detached from this cgroup
                         */
                        if (pl->link) {
                                if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
                                        bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
                                bpf_cgroup_link_auto_detach(pl->link);
                        }
                        kfree(pl);
                        /* one decrement per list entry removed */
                        static_branch_dec(&cgroup_bpf_enabled_key[atype]);
                }
                old_array = rcu_dereference_protected(
                                cgrp->bpf.effective[atype],
                                lockdep_is_held(&cgroup_mutex));
                bpf_prog_array_free(old_array);
        }

        list_for_each_entry_safe(storage, stmp, storages, list_cg) {
                bpf_cgroup_storage_unlink(storage);
                bpf_cgroup_storage_free(storage);
        }

        cgroup_unlock();

        /* mirrors the cgroup_bpf_get() on every ancestor in cgroup_bpf_inherit() */
        for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
                cgroup_bpf_put(p);

        percpu_ref_exit(&cgrp->bpf.refcnt);
        cgroup_put(cgrp);
}

/**
 * cgroup_bpf_release_fn() - release callback of a cgroup's bpf refcount
 * @ref: the percpu ref counter embedded in the cgroup's bpf state
 *
 * Does no teardown itself; it only queues cgroup_bpf_release() on
 * system_wq.
 */
static void cgroup_bpf_release_fn(struct percpu_ref *ref)
{
        struct cgroup *cgrp;

        cgrp = container_of(ref, struct cgroup, bpf.refcnt);

        INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
        queue_work(system_wq, &cgrp->bpf.release_work);
}

/* Return the bpf_prog behind a bpf_prog_list entry: the directly attached
 * program if there is one, otherwise the program of the attached link.
 * NULL means the entry currently carries neither.
 */
static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
{
        if (pl->prog)
                return pl->prog;

        return pl->link ? pl->link->link.prog : NULL;
}

/* Count the entries of the list that actually carry a program.
 * This walks the whole list, which is fine because the list cannot be long.
 */
static u32 prog_list_length(struct hlist_head *head)
{
        struct bpf_prog_list *entry;
        u32 nr = 0;

        hlist_for_each_entry(entry, head, node) {
                if (prog_list_prog(entry))
                        nr++;
        }

        return nr;
}

/* Walk the ancestors of @cgrp, nearest first, to decide whether a program
 * may be attached to @cgrp for @atype:
 *  - an ancestor in multi-prog mode allows it;
 *  - an ancestor with one program attached allows it only if that program
 *    is overridable;
 *  - ancestors with nothing attached are skipped.
 * With no ancestors, or none that decides, attaching is allowed.
 */
static bool hierarchy_allows_attach(struct cgroup *cgrp,
                                    enum cgroup_bpf_attach_type atype)
{
        struct cgroup *ancestor;

        for (ancestor = cgroup_parent(cgrp); ancestor;
             ancestor = cgroup_parent(ancestor)) {
                u32 flags = ancestor->bpf.flags[atype];
                u32 cnt;

                if (flags & BPF_F_ALLOW_MULTI)
                        return true;

                /* without ALLOW_MULTI at most one program is expected */
                cnt = prog_list_length(&ancestor->bpf.progs[atype]);
                WARN_ON_ONCE(cnt > 1);
                if (cnt == 1)
                        return !!(flags & BPF_F_ALLOW_OVERRIDE);
        }

        return true;
}

/* Build the array of effective programs of @cgrp for @atype.
 *
 * The cgroup's own programs come first, followed by those of its ancestors,
 * nearest first. Once at least one program has been collected, an ancestor
 * contributes only if it has BPF_F_ALLOW_MULTI set; this is how a parent's
 * F_ALLOW_OVERRIDE program yields to programs in this cgroup.
 *
 * On success the new array is stored in *@array and 0 is returned;
 * -ENOMEM is returned if the array cannot be allocated.
 */
static int compute_effective_progs(struct cgroup *cgrp,
                                   enum cgroup_bpf_attach_type atype,
                                   struct bpf_prog_array **array)
{
        struct bpf_prog_array *progs;
        struct cgroup *p;
        int cnt = 0;

        /* first pass: size the array */
        p = cgrp;
        do {
                if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
                        cnt += prog_list_length(&p->bpf.progs[atype]);
                p = cgroup_parent(p);
        } while (p);

        progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
        if (!progs)
                return -ENOMEM;

        /* second pass: fill it using the same selection rule */
        cnt = 0;
        p = cgrp;
        do {
                if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) {
                        struct bpf_prog_list *pl;

                        hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
                                struct bpf_prog *prog = prog_list_prog(pl);
                                struct bpf_prog_array_item *item;

                                if (!prog)
                                        continue;

                                item = &progs->items[cnt++];
                                item->prog = prog;
                                bpf_cgroup_storages_assign(item->cgroup_storage,
                                                           pl->storage);
                        }
                }
                p = cgroup_parent(p);
        } while (p);

        *array = progs;
        return 0;
}

/* Publish a new effective array for (@cgrp, @atype) and free the previous one.
 *
 * Despite its name, @old_array is the array to install; the array it
 * replaces is the one that gets freed.
 */
static void activate_effective_progs(struct cgroup *cgrp,
                                     enum cgroup_bpf_attach_type atype,
                                     struct bpf_prog_array *old_array)
{
        struct bpf_prog_array *replaced;

        replaced = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
                                       lockdep_is_held(&cgroup_mutex));

        /* free prog array after grace period, since __cgroup_bpf_run_*()
         * might be still walking the array
         */
        bpf_prog_array_free(replaced);
}

/**
 * cgroup_bpf_inherit() - inherit effective programs from parent
 * @cgrp: the cgroup to modify
 *
 * Initializes the bpf refcount, program lists and storage list of @cgrp,
 * takes a bpf reference on every ancestor, then computes and activates the
 * effective program array of each attach type.
 *
 * Return: 0 on success, the percpu_ref_init() error if the refcount cannot
 * be set up, or -ENOMEM if an effective array cannot be computed (in which
 * case the ancestor references and the refcount are released again).
 */
int cgroup_bpf_inherit(struct cgroup *cgrp)
{
/* has to use macro instead of const int, since compiler thinks
 * that array below is variable length
 */
#define        NR ARRAY_SIZE(cgrp->bpf.effective)
        struct bpf_prog_array *arrays[NR] = {};
        struct cgroup *ancestor;
        int err, idx;

        err = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
                              GFP_KERNEL);
        if (err)
                return err;

        for (ancestor = cgroup_parent(cgrp); ancestor;
             ancestor = cgroup_parent(ancestor))
                cgroup_bpf_get(ancestor);

        for (idx = 0; idx < NR; idx++)
                INIT_HLIST_HEAD(&cgrp->bpf.progs[idx]);

        INIT_LIST_HEAD(&cgrp->bpf.storages);

        /* compute everything first so that a failure activates nothing */
        for (idx = 0; idx < NR; idx++) {
                if (compute_effective_progs(cgrp, idx, &arrays[idx]))
                        goto cleanup;
        }

        for (idx = 0; idx < NR; idx++)
                activate_effective_progs(cgrp, idx, arrays[idx]);

        return 0;

cleanup:
        for (idx = 0; idx < NR; idx++)
                bpf_prog_array_free(arrays[idx]);

        for (ancestor = cgroup_parent(cgrp); ancestor;
             ancestor = cgroup_parent(ancestor))
                cgroup_bpf_put(ancestor);

        percpu_ref_exit(&cgrp->bpf.refcnt);

        return -ENOMEM;
}

/* Recompute the effective program arrays of @cgrp and all its descendants
 * for @atype.
 *
 * Works in two phases so that a failure leaves every active array
 * untouched: all new arrays are first computed into ->bpf.inactive, and only
 * when every computation has succeeded are they activated. Descendants
 * whose bpf refcount is already zero are skipped.
 *
 * Returns 0 on success or the compute_effective_progs() error.
 */
static int update_effective_progs(struct cgroup *cgrp,
                                  enum cgroup_bpf_attach_type atype)
{
        struct cgroup_subsys_state *pos;
        int err;

        /* phase 1: allocate and recompute effective prog arrays */
        css_for_each_descendant_pre(pos, &cgrp->self) {
                struct cgroup *child = container_of(pos, struct cgroup, self);

                if (!percpu_ref_is_zero(&child->bpf.refcnt)) {
                        err = compute_effective_progs(child, atype,
                                                      &child->bpf.inactive);
                        if (err)
                                goto cleanup;
                }
        }

        /* phase 2: all allocations were successful, activate the arrays */
        css_for_each_descendant_pre(pos, &cgrp->self) {
                struct cgroup *child = container_of(pos, struct cgroup, self);

                if (!percpu_ref_is_zero(&child->bpf.refcnt)) {
                        activate_effective_progs(child, atype,
                                                 child->bpf.inactive);
                        child->bpf.inactive = NULL;
                        continue;
                }

                /* dead cgroup: drop an array left in ->inactive, if any */
                if (unlikely(child->bpf.inactive)) {
                        bpf_prog_array_free(child->bpf.inactive);
                        child->bpf.inactive = NULL;
                }
        }

        return 0;

cleanup:
        /* oom while computing effective. Free all computed effective arrays
         * since they were not activated
         */
        css_for_each_descendant_pre(pos, &cgrp->self) {
                struct cgroup *child = container_of(pos, struct cgroup, self);

                bpf_prog_array_free(child->bpf.inactive);
                child->bpf.inactive = NULL;
        }

        return err;
}

/* __cgroup_bpf_attach() fails with -E2BIG once a cgroup's program list for
 * one attach type already holds this many programs.
 */
#define BPF_CGROUP_MAX_PROGS 64

/* Find the list entry an attach operation should reuse.
 *
 * Single-attach mode (@allow_multi false): the first entry of @progs, or
 * NULL if the list is empty.
 *
 * Multi-attach mode: ERR_PTR(-EINVAL) if @prog (unless it is also
 * @replace_prog) or @link is already on the list; otherwise the entry
 * holding @replace_prog when a replacement is requested,
 * ERR_PTR(-ENOENT) if that program is not on the list, and NULL when no
 * replacement is requested (the caller then adds a new entry).
 */
static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
                                               struct bpf_prog *prog,
                                               struct bpf_cgroup_link *link,
                                               struct bpf_prog *replace_prog,
                                               bool allow_multi)
{
        struct bpf_prog_list *pl;

        /* single-attach case */
        if (!allow_multi)
                return hlist_empty(progs) ?
                        NULL : hlist_entry(progs->first, typeof(*pl), node);

        /* the duplicate check covers the whole list before any replacement
         * lookup
         */
        hlist_for_each_entry(pl, progs, node) {
                bool dup_prog = prog && pl->prog == prog &&
                                prog != replace_prog;
                bool dup_link = link && pl->link == link;

                /* disallow attaching the same prog or link twice */
                if (dup_prog || dup_link)
                        return ERR_PTR(-EINVAL);
        }

        if (!replace_prog)
                return NULL;

        /* direct prog multi-attach w/ replacement case */
        hlist_for_each_entry(pl, progs, node) {
                if (pl->prog == replace_prog)
                        return pl;
        }

        /* prog to replace not found for cgroup */
        return ERR_PTR(-ENOENT);
}

/**
 * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup which descendants to traverse
 * @prog: A program to attach
 * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
 * @link: A link to attach
 * @type: Type of attach operation
 * @flags: Option flags
 *
 * Exactly one of @prog or @link can be non-null.
 * Must be called with cgroup_mutex held.
 *
 * Return: 0 on success. -EINVAL for invalid flag/argument combinations, an
 * attach type that cannot be resolved, or a duplicate prog/link; -EPERM if
 * the hierarchy or the cgroup's current flags forbid the attach; -E2BIG if
 * the list is full; -ENOENT if @replace_prog is not attached; -ENOMEM on
 * allocation failure; otherwise the error reported by
 * bpf_trampoline_link_cgroup_shim() or update_effective_progs().
 */
static int __cgroup_bpf_attach(struct cgroup *cgrp,
                               struct bpf_prog *prog, struct bpf_prog *replace_prog,
                               struct bpf_cgroup_link *link,
                               enum bpf_attach_type type, u32 flags)
{
        /* only the override/multi bits are remembered per cgroup and atype */
        u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
        struct bpf_prog *old_prog = NULL;
        struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
        struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
        /* the program being attached, whether directly or through the link */
        struct bpf_prog *new_prog = prog ? : link->link.prog;
        enum cgroup_bpf_attach_type atype;
        struct bpf_prog_list *pl;
        struct hlist_head *progs;
        int err;

        if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
            ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
                /* invalid combination */
                return -EINVAL;
        if (link && (prog || replace_prog))
                /* only either link or prog/replace_prog can be specified */
                return -EINVAL;
        if (!!replace_prog != !!(flags & BPF_F_REPLACE))
                /* replace_prog implies BPF_F_REPLACE, and vice versa */
                return -EINVAL;

        /* any lookup failure is reported to the caller as -EINVAL */
        atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
        if (atype < 0)
                return -EINVAL;

        progs = &cgrp->bpf.progs[atype];

        if (!hierarchy_allows_attach(cgrp, atype))
                return -EPERM;

        if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
                /* Disallow attaching non-overridable on top
                 * of existing overridable in this cgroup.
                 * Disallow attaching multi-prog if overridable or none
                 */
                return -EPERM;

        if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
                return -E2BIG;

        /* pl: entry to reuse, NULL to add a new one, or an error pointer */
        pl = find_attach_entry(progs, prog, link, replace_prog,
                               flags & BPF_F_ALLOW_MULTI);
        if (IS_ERR(pl))
                return PTR_ERR(pl);

        if (bpf_cgroup_storages_alloc(storage, new_storage, type,
                                      prog ? : link->link.prog, cgrp))
                return -ENOMEM;

        if (pl) {
                /* reusing an entry: remember its program for rollback/put */
                old_prog = pl->prog;
        } else {
                struct hlist_node *last = NULL;

                pl = kmalloc(sizeof(*pl), GFP_KERNEL);
                if (!pl) {
                        bpf_cgroup_storages_free(new_storage);
                        return -ENOMEM;
                }
                /* append the new entry at the tail of the list */
                if (hlist_empty(progs))
                        hlist_add_head(&pl->node, progs);
                else
                        hlist_for_each(last, progs) {
                                if (last->next)
                                        continue;
                                hlist_add_behind(&pl->node, last);
                                break;
                        }
        }

        pl->prog = prog;
        pl->link = link;
        bpf_cgroup_storages_assign(pl->storage, storage);
        cgrp->bpf.flags[atype] = saved_flags;

        if (type == BPF_LSM_CGROUP) {
                err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
                if (err)
                        goto cleanup;
        }

        err = update_effective_progs(cgrp, atype);
        if (err)
                goto cleanup_trampoline;

        if (old_prog) {
                /* replacement: release the program that was swapped out */
                if (type == BPF_LSM_CGROUP)
                        bpf_trampoline_unlink_cgroup_shim(old_prog);
                bpf_prog_put(old_prog);
        } else {
                /* a new entry was added; cgroup_bpf_release() and
                 * __cgroup_bpf_detach() decrement per entry removed
                 */
                static_branch_inc(&cgroup_bpf_enabled_key[atype]);
        }
        bpf_cgroup_storages_link(new_storage, cgrp, type);
        return 0;

cleanup_trampoline:
        if (type == BPF_LSM_CGROUP)
                bpf_trampoline_unlink_cgroup_shim(new_prog);

cleanup:
        /* old_prog set: restore the reused entry; otherwise drop the entry
         * that was added above
         */
        if (old_prog) {
                pl->prog = old_prog;
                pl->link = NULL;
        }
        bpf_cgroup_storages_free(new_storage);
        if (!old_prog) {
                hlist_del(&pl->node);
                kfree(pl);
        }
        return err;
}

/* Locked wrapper: runs __cgroup_bpf_attach() with the cgroup lock held and
 * returns its result unchanged.
 */
static int cgroup_bpf_attach(struct cgroup *cgrp,
                             struct bpf_prog *prog, struct bpf_prog *replace_prog,
                             struct bpf_cgroup_link *link,
                             enum bpf_attach_type type,
                             u32 flags)
{
        int err;

        cgroup_lock();
        err = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
        cgroup_unlock();

        return err;
}

/* Swap updated BPF program for given link in effective program arrays across
 * all descendant cgroups. This function is guaranteed to succeed.
 *
 * @cgrp:  cgroup the link is attached to; it and its descendants are updated
 * @atype: attach type whose effective arrays are patched
 * @link:  link whose ->link.prog already points at the new program
 *
 * No array is reallocated: the link's slot is located by replaying the walk
 * compute_effective_progs() uses and is overwritten in place.
 */
static void replace_effective_prog(struct cgroup *cgrp,
                                   enum cgroup_bpf_attach_type atype,
                                   struct bpf_cgroup_link *link)
{
        struct bpf_prog_array_item *item;
        struct cgroup_subsys_state *css;
        struct bpf_prog_array *progs;
        struct bpf_prog_list *pl;
        struct hlist_head *head;
        struct cgroup *cg;
        int pos;

        css_for_each_descendant_pre(css, &cgrp->self) {
                struct cgroup *desc = container_of(css, struct cgroup, self);

                /* skip descendants whose bpf refcount already dropped to zero */
                if (percpu_ref_is_zero(&desc->bpf.refcnt))
                        continue;

                /* find position of link in effective progs array */
                for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
                        /* same rule as compute_effective_progs(): once a
                         * program was counted, non-multi ancestors add nothing
                         */
                        if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
                                continue;

                        head = &cg->bpf.progs[atype];
                        hlist_for_each_entry(pl, head, node) {
                                if (!prog_list_prog(pl))
                                        continue;
                                if (pl->link == link)
                                        goto found;
                                pos++;
                        }
                }
found:
                /* cg is NULL only if the walk ran out of ancestors without
                 * finding the link
                 */
                BUG_ON(!cg);
                progs = rcu_dereference_protected(
                                desc->bpf.effective[atype],
                                lockdep_is_held(&cgroup_mutex));
                item = &progs->items[pos];
                WRITE_ONCE(item->prog, link->link.prog);
        }
}

/**
 * __cgroup_bpf_replace() - Replace link's program and propagate the change
 *                          to descendants
 * @cgrp: The cgroup which descendants to traverse
 * @link: A link for which to replace BPF program
 * @new_prog: &struct bpf_prog for the target BPF program with its refcnt
 *            incremented
 *
 * Must be called with cgroup_mutex held.
 *
 * Return: 0 on success, -EINVAL if the attach type cannot be resolved or
 * @new_prog has a different program type than the current one, -ENOENT if
 * @link is not attached to @cgrp.
 */
static int __cgroup_bpf_replace(struct cgroup *cgrp,
                                struct bpf_cgroup_link *link,
                                struct bpf_prog *new_prog)
{
        enum cgroup_bpf_attach_type atype;
        struct bpf_prog_list *entry;
        struct bpf_prog *prev_prog;
        bool attached = false;

        atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
        if (atype < 0)
                return -EINVAL;

        if (link->link.prog->type != new_prog->type)
                return -EINVAL;

        hlist_for_each_entry(entry, &cgrp->bpf.progs[atype], node) {
                if (entry->link != link)
                        continue;
                attached = true;
                break;
        }
        if (!attached)
                return -ENOENT;

        /* switch the link over first, then patch the effective arrays */
        prev_prog = xchg(&link->link.prog, new_prog);
        replace_effective_prog(cgrp, atype, link);
        bpf_prog_put(prev_prog);

        return 0;
}

/* Replace the program of a cgroup link, under the cgroup lock.
 *
 * Returns -ENOLINK if the link no longer has a cgroup, -EPERM if @old_prog
 * is given and is not the link's current program, otherwise the result of
 * __cgroup_bpf_replace().
 */
static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
                              struct bpf_prog *old_prog)
{
        struct bpf_cgroup_link *cg_link =
                container_of(link, struct bpf_cgroup_link, link);
        int ret;

        cgroup_lock();

        if (!cg_link->cgroup)
                /* link might have been auto-released by dying cgroup, so fail */
                ret = -ENOLINK;
        else if (old_prog && link->prog != old_prog)
                ret = -EPERM;
        else
                ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);

        cgroup_unlock();

        return ret;
}

/* Find the list entry a detach operation should remove.
 *
 * Multi-attach mode: @prog or @link must be given (-EINVAL otherwise) and
 * the entry matching both pointers is returned, or -ENOENT if none matches.
 *
 * Single-attach mode: the first entry is returned whatever @prog and @link
 * are, or -ENOENT if nothing is attached.
 */
static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
                                               struct bpf_prog *prog,
                                               struct bpf_cgroup_link *link,
                                               bool allow_multi)
{
        struct bpf_prog_list *pl;

        if (allow_multi) {
                /* to detach MULTI prog the user has to specify valid FD
                 * of the program or link to be detached
                 */
                if (!prog && !link)
                        return ERR_PTR(-EINVAL);

                /* find the prog or link and detach it */
                hlist_for_each_entry(pl, progs, node) {
                        if (pl->prog == prog && pl->link == link)
                                return pl;
                }

                return ERR_PTR(-ENOENT);
        }

        /* report error when trying to detach and nothing is attached */
        if (hlist_empty(progs))
                return ERR_PTR(-ENOENT);

        /* to maintain backward compatibility NONE and OVERRIDE cgroups
         * allow detaching with invalid FD (prog==NULL) in legacy mode
         */
        return hlist_entry(progs->first, typeof(*pl), node);
}

/**
 * purge_effective_progs() - After compute_effective_progs fails to alloc new
 *                           cgrp->bpf.inactive table we can recover by
 *                           recomputing the array in place.
 *
 * @cgrp: The cgroup which descendants to traverse
 * @prog: A program to detach or NULL
 * @link: A link to detach or NULL
 * @atype: Type of detach operation
 *
 * Fallback used by __cgroup_bpf_detach() when update_effective_progs()
 * fails. It allocates nothing: in every live descendant the slot of the
 * entry matching both @prog and @link is located and removed from the
 * existing effective array.
 */
static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
                                  struct bpf_cgroup_link *link,
                                  enum cgroup_bpf_attach_type atype)
{
        struct cgroup_subsys_state *css;
        struct bpf_prog_array *progs;
        struct bpf_prog_list *pl;
        struct hlist_head *head;
        struct cgroup *cg;
        int pos;

        /* recompute effective prog array in place */
        css_for_each_descendant_pre(css, &cgrp->self) {
                struct cgroup *desc = container_of(css, struct cgroup, self);

                /* skip descendants whose bpf refcount already dropped to zero */
                if (percpu_ref_is_zero(&desc->bpf.refcnt))
                        continue;

                /* find position of link or prog in effective progs array */
                for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
                        /* same rule as compute_effective_progs(): once a
                         * program was counted, non-multi ancestors add nothing
                         */
                        if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
                                continue;

                        head = &cg->bpf.progs[atype];
                        hlist_for_each_entry(pl, head, node) {
                                if (!prog_list_prog(pl))
                                        continue;
                                if (pl->prog == prog && pl->link == link)
                                        goto found;
                                pos++;
                        }
                }

                /* no link or prog match, skip the cgroup of this layer */
                continue;
found:
                progs = rcu_dereference_protected(
                                desc->bpf.effective[atype],
                                lockdep_is_held(&cgroup_mutex));

                /* Remove the program from the array */
                /* NOTE(review): a non-zero return is assumed to mean failure,
                 * as the warning text suggests - confirm against the helper
                 */
                WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
                          "Failed to purge a prog from array at index %d", pos);
        }
}

/**
 * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup which descendants to traverse
 * @prog: A program to detach or NULL
 * @link: A link to detach or NULL
 * @type: Type of detach operation
 *
 * At most one of @prog or @link can be non-NULL.
 * Must be called with cgroup_mutex held.
 *
 * Return: 0 on success, -EINVAL if the attach type cannot be resolved or
 * both @prog and @link are given, otherwise the error reported by
 * find_detach_entry(). Once the entry has been found the detach always
 * completes.
 */
static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
                               struct bpf_cgroup_link *link, enum bpf_attach_type type)
{
        enum cgroup_bpf_attach_type atype;
        struct bpf_prog *old_prog;
        struct bpf_prog_list *pl;
        struct hlist_head *progs;
        u32 attach_btf_id = 0;
        u32 flags;

        /* attach_btf_id stays 0 when neither @prog nor @link is given */
        if (prog)
                attach_btf_id = prog->aux->attach_btf_id;
        if (link)
                attach_btf_id = link->link.prog->aux->attach_btf_id;

        atype = bpf_cgroup_atype_find(type, attach_btf_id);
        if (atype < 0)
                return -EINVAL;

        progs = &cgrp->bpf.progs[atype];
        flags = cgrp->bpf.flags[atype];

        if (prog && link)
                /* only one of prog or link can be specified */
                return -EINVAL;

        pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI);
        if (IS_ERR(pl))
                return PTR_ERR(pl);

        /* mark it deleted, so it's ignored while recomputing effective */
        old_prog = pl->prog;
        pl->prog = NULL;
        pl->link = NULL;

        if (update_effective_progs(cgrp, atype)) {
                /* recomputing failed: restore the entry so that
                 * purge_effective_progs() can find it and remove the program
                 * from the existing effective arrays in place
                 */
                pl->prog = old_prog;
                pl->link = link;
                purge_effective_progs(cgrp, old_prog, link, atype);
        }

        /* now can actually delete it from this cgroup list */
        hlist_del(&pl->node);

        kfree(pl);
        if (hlist_empty(progs))
                /* last program was detached, reset flags to zero */
                cgrp->bpf.flags[atype] = 0;
        /* only a directly attached program is put here; nothing is put for
         * a link-based entry
         */
        if (old_prog) {
                if (type == BPF_LSM_CGROUP)
                        bpf_trampoline_unlink_cgroup_shim(old_prog);
                bpf_prog_put(old_prog);
        }
        static_branch_dec(&cgroup_bpf_enabled_key[atype]);
        return 0;
}

/* Locked wrapper: detaches a directly attached program (no link) through
 * __cgroup_bpf_detach() with the cgroup lock held.
 */
static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
                             enum bpf_attach_type type)
{
        int err;

        cgroup_lock();
        err = __cgroup_bpf_detach(cgrp, prog, NULL, type);
        cgroup_unlock();

        return err;
}

/* Report the programs attached to (or effective in) @cgrp to user space.
 *
 * @cgrp:  cgroup being queried
 * @attr:  kernel copy of the request; the query.* fields are read
 * @uattr: user-space request; query.attach_flags and query.prog_cnt are
 *         written back
 *
 * For BPF_LSM_CGROUP every LSM attach type from CGROUP_LSM_START to
 * CGROUP_LSM_END is covered and the results are concatenated; any other
 * type maps to exactly one cgroup attach type.
 *
 * Returns 0 on success, -EINVAL for invalid argument combinations or an
 * unknown attach type, -EFAULT if user memory cannot be written, and
 * -ENOSPC if the user buffer holds fewer ids than there are programs (the
 * ids that fit are still copied). An error from
 * bpf_prog_array_copy_to_user() other than -ENOSPC is returned as is.
 *
 * Must be called with cgroup_mutex held to avoid races.
 */
static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
                              union bpf_attr __user *uattr)
{
        __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
        bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
        __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
        enum bpf_attach_type type = attr->query.attach_type;
        enum cgroup_bpf_attach_type from_atype, to_atype;
        enum cgroup_bpf_attach_type atype;
        struct bpf_prog_array *effective;
        int cnt, ret = 0, i;
        int total_cnt = 0;
        u32 flags;

        /* per-program attach flags are not reported for effective queries */
        if (effective_query && prog_attach_flags)
                return -EINVAL;

        if (type == BPF_LSM_CGROUP) {
                /* flags differ per LSM attach type, so a non-effective query
                 * that asks for ids must also supply prog_attach_flags
                 */
                if (!effective_query && attr->query.prog_cnt &&
                    prog_ids && !prog_attach_flags)
                        return -EINVAL;

                from_atype = CGROUP_LSM_START;
                to_atype = CGROUP_LSM_END;
                flags = 0;
        } else {
                from_atype = to_cgroup_bpf_attach_type(type);
                if (from_atype < 0)
                        return -EINVAL;
                to_atype = from_atype;
                flags = cgrp->bpf.flags[from_atype];
        }

        /* first pass: count the programs over the whole attach type range */
        for (atype = from_atype; atype <= to_atype; atype++) {
                if (effective_query) {
                        effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
                                                              lockdep_is_held(&cgroup_mutex));
                        total_cnt += bpf_prog_array_length(effective);
                } else {
                        total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
                }
        }

        /* always output uattr->query.attach_flags as 0 during effective query */
        flags = effective_query ? 0 : flags;
        if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
                return -EFAULT;
        if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
                return -EFAULT;
        if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
                /* return early if user requested only program count + flags */
                return 0;

        if (attr->query.prog_cnt < total_cnt) {
                /* copy what fits, but tell the caller the list is truncated */
                total_cnt = attr->query.prog_cnt;
                ret = -ENOSPC;
        }

        /* second pass: copy ids (and flags) until the budget is used up */
        for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
                if (effective_query) {
                        int err;

                        effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
                                                              lockdep_is_held(&cgroup_mutex));
                        cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
                        err = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
                        /* Do not let the result for one attach type overwrite
                         * an earlier one: -ENOSPC stays sticky for the whole
                         * query, and any other error ends it right away, as
                         * the non-effective branch below does.
                         */
                        if (err == -ENOSPC)
                                ret = err;
                        else if (err)
                                return err;
                } else {
                        struct hlist_head *progs;
                        struct bpf_prog_list *pl;
                        struct bpf_prog *prog;
                        u32 id;

                        progs = &cgrp->bpf.progs[atype];
                        cnt = min_t(int, prog_list_length(progs), total_cnt);
                        i = 0;
                        hlist_for_each_entry(pl, progs, node) {
                                prog = prog_list_prog(pl);
                                id = prog->aux->id;
                                if (copy_to_user(prog_ids + i, &id, sizeof(id)))
                                        return -EFAULT;
                                if (++i == cnt)
                                        break;
                        }

                        if (prog_attach_flags) {
                                /* every program of one attach type reports the
                                 * same per-cgroup flags
                                 */
                                flags = cgrp->bpf.flags[atype];

                                for (i = 0; i < cnt; i++)
                                        if (copy_to_user(prog_attach_flags + i,
                                                         &flags, sizeof(flags)))
                                                return -EFAULT;
                                prog_attach_flags += cnt;
                        }
                }

                prog_ids += cnt;
                total_cnt -= cnt;
        }
        return ret;
}

/* Locked front end for __cgroup_bpf_query(): the cgroup lock is held for the
 * whole query and the inner call's result is passed through unchanged.
 */
static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
                            union bpf_attr __user *uattr)
{
        int err;

        cgroup_lock();
        err = __cgroup_bpf_query(cgrp, attr, uattr);
        cgroup_unlock();

        return err;
}

int cgroup_bpf_prog_attach(const union bpf_attr *attr,
                           enum bpf_prog_type ptype, struct bpf_prog *prog)
{
        struct bpf_prog *replace_prog = NULL;
        struct cgroup *cgrp;
        int ret;

        cgrp = cgroup_get_from_fd(attr->target_fd);
        if (IS_ERR(cgrp))
                return PTR_ERR(cgrp);

        if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
            (attr->attach_flags & BPF_F_REPLACE)) {
                replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
                if (IS_ERR(replace_prog)) {
                        cgroup_put(cgrp);
                        return PTR_ERR(replace_prog);
                }
        }

        ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
                                attr->attach_type, attr->attach_flags);

        if (replace_prog)
                bpf_prog_put(replace_prog);
        cgroup_put(cgrp);
        return ret;
}

/* Detach a program of type @ptype from the cgroup named by attr->target_fd.
 *
 * attr->attach_bpf_fd is resolved on a best-effort basis: if it does not yield
 * a program of the requested type, cgroup_bpf_detach() is called with a NULL
 * program instead of failing here.
 *
 * Returns the result of cgroup_bpf_detach(), or the error from resolving the
 * cgroup fd.
 */
int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
{
        struct bpf_prog *victim;
        struct cgroup *target;
        int err;

        target = cgroup_get_from_fd(attr->target_fd);
        if (IS_ERR(target))
                return PTR_ERR(target);

        victim = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
        victim = IS_ERR(victim) ? NULL : victim;

        err = cgroup_bpf_detach(target, victim, attr->attach_type);

        if (victim)
                bpf_prog_put(victim);
        cgroup_put(target);

        return err;
}

static void bpf_cgroup_link_release(struct bpf_link *link)
{
        struct bpf_cgroup_link *cg_link =
                container_of(link, struct bpf_cgroup_link, link);
        struct cgroup *cg;

        /* link might have been auto-detached by dying cgroup already,
         * in that case our work is done here
         */
        if (!cg_link->cgroup)
                return;

        cgroup_lock();

        /* re-check cgroup under lock again */
        if (!cg_link->cgroup) {
                cgroup_unlock();
                return;
        }

        WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
                                    cg_link->type));
        if (cg_link->type == BPF_LSM_CGROUP)
                bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);

        cg = cg_link->cgroup;
        cg_link->cgroup = NULL;

        cgroup_unlock();

        cgroup_put(cg);
}

/* Dealloc callback: free the bpf_cgroup_link that embeds @link. */
static void bpf_cgroup_link_dealloc(struct bpf_link *link)
{
        kfree(container_of(link, struct bpf_cgroup_link, link));
}

/* Detach callback: performs the same work as the release callback and always
 * reports success.
 */
static int bpf_cgroup_link_detach(struct bpf_link *link)
{
        bpf_cgroup_link_release(link);
        return 0;
}

/* fdinfo callback: print the id of the cgroup the link is attached to (0 when
 * the link no longer has a cgroup) and the link's attach type.
 */
static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
                                        struct seq_file *seq)
{
        struct bpf_cgroup_link *cg_link =
                container_of(link, struct bpf_cgroup_link, link);
        u64 id;

        /* cg_link->cgroup is sampled with the cgroup lock held */
        cgroup_lock();
        id = cg_link->cgroup ? cgroup_id(cg_link->cgroup) : 0;
        cgroup_unlock();

        seq_printf(seq,
                   "cgroup_id:\t%llu\n"
                   "attach_type:\t%d\n",
                   id, cg_link->type);
}

/* fill_link_info callback: report the attached cgroup's id (0 when the link
 * no longer has a cgroup) and the attach type in @info. Always returns 0.
 */
static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
                                          struct bpf_link_info *info)
{
        struct bpf_cgroup_link *cg_link =
                container_of(link, struct bpf_cgroup_link, link);
        u64 id;

        /* cg_link->cgroup is sampled with the cgroup lock held */
        cgroup_lock();
        id = cg_link->cgroup ? cgroup_id(cg_link->cgroup) : 0;
        cgroup_unlock();

        info->cgroup.attach_type = cg_link->type;
        info->cgroup.cgroup_id = id;

        return 0;
}

/* Link operations installed on every cgroup link by cgroup_bpf_link_attach().
 * Note that .detach is implemented in terms of the .release handler.
 */
static const struct bpf_link_ops bpf_cgroup_link_lops = {
        .release = bpf_cgroup_link_release,
        .dealloc = bpf_cgroup_link_dealloc,
        .detach = bpf_cgroup_link_detach,
        .update_prog = cgroup_bpf_replace,
        .show_fdinfo = bpf_cgroup_link_show_fdinfo,
        .fill_link_info = bpf_cgroup_link_fill_link_info,
};

int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
        struct bpf_link_primer link_primer;
        struct bpf_cgroup_link *link;
        struct cgroup *cgrp;
        int err;

        if (attr->link_create.flags)
                return -EINVAL;

        cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
        if (IS_ERR(cgrp))
                return PTR_ERR(cgrp);

        link = kzalloc(sizeof(*link), GFP_USER);
        if (!link) {
                err = -ENOMEM;
                goto out_put_cgroup;
        }
        bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
                      prog);
        link->cgroup = cgrp;
        link->type = attr->link_create.attach_type;

        err = bpf_link_prime(&link->link, &link_primer);
        if (err) {
                kfree(link);
                goto out_put_cgroup;
        }

        err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
                                link->type, BPF_F_ALLOW_MULTI);
        if (err) {
                bpf_link_cleanup(&link_primer);
                goto out_put_cgroup;
        }

        return bpf_link_settle(&link_primer);

out_put_cgroup:
        cgroup_put(cgrp);
        return err;
}

/* Resolve attr->query.target_fd to a cgroup, run cgroup_bpf_query() on it and
 * drop the cgroup reference again. Returns the query result, or the error
 * from resolving the fd.
 */
int cgroup_bpf_prog_query(const union bpf_attr *attr,
                          union bpf_attr __user *uattr)
{
        struct cgroup *target = cgroup_get_from_fd(attr->query.target_fd);
        int err;

        if (IS_ERR(target))
                return PTR_ERR(target);

        err = cgroup_bpf_query(target, attr, uattr);
        cgroup_put(target);

        return err;
}

/**
 * __cgroup_bpf_run_filter_skb() - Run cgroup packet-filter programs on an skb
 * @sk: The socket sending or receiving traffic
 * @skb: The skb that is being sent or received
 * @atype: The attach type whose programs are executed
 *
 * Sockets whose family is neither AF_INET nor AF_INET6 are ignored and 0 is
 * returned. @sk is dereferenced unconditionally, so it must not be NULL.
 *
 * @atype must be suitable for network filtering; this is not verified here.
 *
 * While the programs run, skb->sk temporarily points at @sk and the skb is
 * pushed by the negated network offset; both are undone before returning.
 *
 * For %CGROUP_INET_EGRESS the return value is one of:
 *   NET_XMIT_SUCCESS - continue with packet output
 *   NET_XMIT_DROP    - drop packet and notify TCP to call cwr
 *   NET_XMIT_CN      - continue with packet output and notify TCP to call cwr
 *   -err             - drop packet
 *
 * For every other attach type the return value is 0 or a negative errno; a
 * non-zero result of the program run that is not an error value is reported
 * as -EFAULT.
 */
int __cgroup_bpf_run_filter_skb(struct sock *sk,
                                struct sk_buff *skb,
                                enum cgroup_bpf_attach_type atype)
{
        unsigned int offset = -skb_network_offset(skb);
        void *saved_data_end;
        struct sock *orig_sk;
        struct cgroup *cgrp;
        int ret;

        if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
                return 0;

        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);

        /* present the skb to the programs as owned by @sk */
        orig_sk = skb->sk;
        skb->sk = sk;
        __skb_push(skb, offset);

        /* compute pointers for the bpf prog */
        bpf_compute_and_save_data_end(skb, &saved_data_end);

        if (atype == CGROUP_INET_EGRESS) {
                u32 flags = 0;

                ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
                                            __bpf_prog_run_save_cb, 0, &flags);

                /* anything that is neither 0 nor an errno is bogus */
                if (ret && !IS_ERR_VALUE((long)ret))
                        ret = -EFAULT;

                /* Fold the congestion-notification bit collected in @flags
                 * into the NET_XMIT code:
                 *   pass + cn    -> NET_XMIT_CN
                 *   pass         -> NET_XMIT_SUCCESS
                 *   error + cn   -> NET_XMIT_DROP
                 *   error        -> the error itself (drop, no cn)
                 */
                if (flags & BPF_RET_SET_CN)
                        ret = ret ? NET_XMIT_DROP : NET_XMIT_CN;
                else if (!ret)
                        ret = NET_XMIT_SUCCESS;
        } else {
                ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
                                            skb, __bpf_prog_run_save_cb, 0,
                                            NULL);

                /* anything that is neither 0 nor an errno is bogus */
                if (ret && !IS_ERR_VALUE((long)ret))
                        ret = -EFAULT;
        }

        /* undo the temporary skb changes in reverse order */
        bpf_restore_data_end(skb, saved_data_end);
        __skb_pull(skb, offset);
        skb->sk = orig_sk;

        return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);

/**
 * __cgroup_bpf_run_filter_sk() - Run a program on a sock
 * @sk: sock structure to manipulate
 * @atype: The attach type whose programs are executed
 *
 * The socket passed in is expected to be of type INET or INET6.
 *
 * @atype must be suitable for sock filtering; this is not verified here.
 *
 * The programs attached to the socket's cgroup for @atype are run with @sk as
 * context, and the result of that run is returned: %-EPERM if an attached
 * program was found and it returned != 1, 0 otherwise.
 */
int __cgroup_bpf_run_filter_sk(struct sock *sk,
                               enum cgroup_bpf_attach_type atype)
{
        struct cgroup *sk_cgrp;

        sk_cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);

        return bpf_prog_run_array_cg(&sk_cgrp->bpf, atype, sk, bpf_prog_run,
                                     0, NULL);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);

/**
 * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
 *                                       a sockaddr provided by the user
 * @sk: sock struct that will use sockaddr
 * @uaddr: sockaddr struct provided by user, may be NULL
 * @uaddrlen: Pointer to the size of the sockaddr struct provided by user.
 *            Read as the initial length when @uaddr is non-NULL and written
 *            back with the program-visible length on success.
 * @atype: The attach type whose programs are executed
 * @t_ctx: Pointer to attach type specific context
 * @flags: Pointer to u32 which contains higher bits of BPF program
 *         return value (OR'ed together).
 *
 * Sockets of a family other than AF_INET, AF_INET6 or AF_UNIX are ignored
 * and 0 is returned.
 *
 * When @uaddr is NULL the programs see a zeroed, on-stack address with a
 * length of 0, and nothing is written back.
 *
 * Return: %-EPERM if an attached program is found and it returned != 1
 * during execution, 0 otherwise.
 */
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
                                      struct sockaddr *uaddr,
                                      int *uaddrlen,
                                      enum cgroup_bpf_attach_type atype,
                                      void *t_ctx,
                                      u32 *flags)
{
        struct bpf_sock_addr_kern ctx = {
                .sk = sk,
                .uaddr = uaddr,
                .t_ctx = t_ctx,
        };
        struct sockaddr_storage unspec;
        struct cgroup *cgrp;
        int ret;

        /* Only these families are handled; any other socket is let through
         * untouched.
         */
        switch (sk->sk_family) {
        case AF_INET:
        case AF_INET6:
        case AF_UNIX:
                break;
        default:
                return 0;
        }

        if (uaddr) {
                ctx.uaddrlen = *uaddrlen;
        } else {
                /* no address supplied: hand the programs a blank one */
                memset(&unspec, 0, sizeof(unspec));
                ctx.uaddr = (struct sockaddr *)&unspec;
                ctx.uaddrlen = 0;
        }

        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
                                    0, flags);

        /* publish a possibly updated length only for a real, accepted addr */
        if (!ret && uaddr)
                *uaddrlen = ctx.uaddrlen;

        return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);

/**
 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
 * @sk: socket to get cgroup from
 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
 * sk with connection information (IP addresses, etc.) May not contain
 * cgroup info if it is a req sock.
 * @atype: The attach type whose programs are executed
 *
 * The socket passed in is expected to be of type INET or INET6.
 *
 * @atype must be suitable for sock_ops filtering; this is not verified here.
 *
 * The cgroup is always taken from @sk, while @sock_ops is what the programs
 * receive as context. Returns %-EPERM if an attached program was found and it
 * returned != 1 during execution, 0 otherwise.
 */
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
                                     struct bpf_sock_ops_kern *sock_ops,
                                     enum cgroup_bpf_attach_type atype)
{
        struct cgroup *sk_cgrp;

        sk_cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);

        return bpf_prog_run_array_cg(&sk_cgrp->bpf, atype, sock_ops,
                                     bpf_prog_run, 0, NULL);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);

/* Run the @atype programs of the current task's default-hierarchy cgroup on a
 * device access described by @dev_type, @major, @minor and @access.
 *
 * The context's access_type packs @access into the upper 16 bits and
 * @dev_type into the lower bits. The cgroup lookup and the program run happen
 * inside an RCU read-side section. Returns the result of the program run.
 */
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
                                      short access, enum cgroup_bpf_attach_type atype)
{
        struct bpf_cgroup_dev_ctx ctx = {
                .access_type = (access << 16) | dev_type,
                .major = major,
                .minor = minor,
        };
        struct cgroup *task_cgrp;
        int verdict;

        rcu_read_lock();
        task_cgrp = task_dfl_cgroup(current);
        verdict = bpf_prog_run_array_cg(&task_cgrp->bpf, atype, &ctx,
                                        bpf_prog_run, 0, NULL);
        rcu_read_unlock();

        return verdict;
}

/* Helper: return a pointer into the cgroup storage that belongs to the
 * currently running program item and to the storage type of @map.
 *
 * @flags is currently unused; it exists so the API can be extended, and the
 * verifier checks that its value is correct.
 */
BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
{
        /* the active run context tells us which program item is executing */
        struct bpf_cg_run_ctx *run_ctx =
                container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
        enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
        struct bpf_cgroup_storage *storage;

        storage = run_ctx->prog_item->cgroup_storage[stype];

        /* per-cpu storage resolves to this CPU's copy ... */
        if (stype != BPF_CGROUP_STORAGE_SHARED)
                return (unsigned long)this_cpu_ptr(storage->percpu_buf);

        /* ... shared storage to the start of the current buffer's data */
        return (unsigned long)&READ_ONCE(storage->buf)->data[0];
}

/* Prototype of the bpf_get_local_storage() helper: takes a constant map
 * pointer plus an unconstrained flags argument, returns a pointer to a map
 * value, and is not restricted to GPL programs.
 */
const struct bpf_func_proto bpf_get_local_storage_proto = {
        .func                = bpf_get_local_storage,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_MAP_VALUE,
        .arg1_type        = ARG_CONST_MAP_PTR,
        .arg2_type        = ARG_ANYTHING,
};

/* Helper: read the retval stored in the current cgroup BPF run context. */
BPF_CALL_0(bpf_get_retval)
{
        struct bpf_cg_run_ctx *run_ctx;

        run_ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx,
                               run_ctx);
        return run_ctx->retval;
}

/* Prototype of the bpf_get_retval() helper: no arguments, integer result,
 * not restricted to GPL programs.
 */
const struct bpf_func_proto bpf_get_retval_proto = {
        .func                = bpf_get_retval,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
};

/* Helper: overwrite the retval stored in the current cgroup BPF run context
 * with @retval. Always returns 0.
 */
BPF_CALL_1(bpf_set_retval, int, retval)
{
        struct bpf_cg_run_ctx *run_ctx;

        run_ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx,
                               run_ctx);
        run_ctx->retval = retval;

        return 0;
}

/* Prototype of the bpf_set_retval() helper: one unconstrained argument,
 * integer result, not restricted to GPL programs.
 */
const struct bpf_func_proto bpf_set_retval_proto = {
        .func                = bpf_set_retval,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_ANYTHING,
};

/* Helper lookup for cgroup device programs. Resolution order: the common
 * cgroup helpers, then the "current" cgroup helpers, then
 * BPF_FUNC_perf_event_output, and finally the base helper set.
 */
static const struct bpf_func_proto *
cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        proto = cgroup_common_func_proto(func_id, prog);
        if (!proto)
                proto = cgroup_current_func_proto(func_id, prog);
        if (proto)
                return proto;

        if (func_id == BPF_FUNC_perf_event_output)
                return &bpf_event_output_data_proto;

        return bpf_base_func_proto(func_id, prog);
}

/* Context-access check for cgroup device programs.
 *
 * Writes are never allowed. Reads must lie inside struct bpf_cgroup_dev_ctx
 * and be aligned to their own size. The access_type field may additionally be
 * read with a narrower size; every other field only as a full __u32.
 */
static bool cgroup_dev_is_valid_access(int off, int size,
                                       enum bpf_access_type type,
                                       const struct bpf_prog *prog,
                                       struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        if (type == BPF_WRITE)
                return false;

        if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
                return false;

        /* The verifier guarantees that size > 0. */
        if (off % size != 0)
                return false;

        switch (off) {
        case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
                bpf_ctx_record_field_size(info, size_default);
                return bpf_ctx_narrow_access_ok(off, size, size_default);
        default:
                return size == size_default;
        }
}

/* Program ops for cgroup device programs; no callbacks are provided. */
const struct bpf_prog_ops cg_dev_prog_ops = {
};

/* Verifier ops for cgroup device programs: helper lookup and context-access
 * validation.
 */
const struct bpf_verifier_ops cg_dev_verifier_ops = {
        .get_func_proto                = cgroup_dev_func_proto,
        .is_valid_access        = cgroup_dev_is_valid_access,
};

/**
 * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
 *
 * @head: sysctl table header
 * @table: sysctl table
 * @write: sysctl is being read (= 0) or written (= 1)
 * @buf: pointer to buffer (in and out)
 * @pcount: value-result argument: value is size of buffer pointed to by @buf,
 *        result is size of @new_buf if program set new value, initial value
 *        otherwise
 * @ppos: value-result argument: value is position at which read from or write
 *        to sysctl is happening, result is new position if program overrode it,
 *        initial value otherwise
 * @atype: type of program to be executed
 *
 * Program is run when sysctl is being accessed, either read or written, and
 * can allow or deny such access.
 *
 * Before the programs run, the current sysctl value is read into a temporary
 * PAGE_SIZE buffer through @table->proc_handler, and, for writes with a
 * non-empty @buf, up to PAGE_SIZE bytes of the value being written are copied
 * into a second temporary buffer. If the access is allowed (the program run
 * yields 0) and a program replaced the value being written, *@buf is freed
 * and replaced by that second buffer and *@pcount is set to its length.
 *
 * This function will return %-EPERM if an attached program is found and
 * returned value != 1 during execution. In all other cases 0 is returned.
 */
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
                                   struct ctl_table *table, int write,
                                   char **buf, size_t *pcount, loff_t *ppos,
                                   enum cgroup_bpf_attach_type atype)
{
        struct bpf_sysctl_kern ctx = {
                .head = head,
                .table = table,
                .write = write,
                .ppos = ppos,
                .cur_val = NULL,
                .cur_len = PAGE_SIZE,
                .new_val = NULL,
                .new_len = 0,
                .new_updated = 0,
        };
        struct cgroup *cgrp;
        loff_t pos = 0;
        int ret;

        /* Snapshot the current value so programs can inspect it. */
        ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
        if (!ctx.cur_val ||
            table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
                /* Let BPF program decide how to proceed. */
                ctx.cur_len = 0;
        }

        if (write && *buf && *pcount) {
                /* BPF program should be able to override new value with a
                 * buffer bigger than provided by user.
                 */
                ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
                ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
                if (ctx.new_val) {
                        memcpy(ctx.new_val, *buf, ctx.new_len);
                } else {
                        /* Let BPF program decide how to proceed. */
                        ctx.new_len = 0;
                }
        }

        rcu_read_lock();
        cgrp = task_dfl_cgroup(current);
        ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
                                    NULL);
        rcu_read_unlock();

        kfree(ctx.cur_val);

        /* The program run reports "allowed" as 0 (this function returns @ret
         * unchanged and documents 0 as success), so the overridden value has
         * to be exported on !ret. Testing for ret == 1 here would discard
         * every override on the normal allow path.
         *
         * NOTE(review): *buf is released with kfree(); confirm that callers
         * always pass a kmalloc-family allocation.
         */
        if (!ret && ctx.new_updated) {
                kfree(*buf);
                *buf = ctx.new_val;
                *pcount = ctx.new_len;
        } else {
                kfree(ctx.new_val);
        }

        return ret;
}

#ifdef CONFIG_NET
/* Pick the buffer that backs ctx->optval for a sockopt program run.
 *
 * @max_optlen is rejected with -EINVAL when negative and capped at PAGE_SIZE.
 * Requests that fit into @buf->data use that caller-provided storage; larger
 * ones get a zeroed heap allocation (-ENOMEM on failure). ctx->optval_end is
 * set @max_optlen bytes past ctx->optval.
 *
 * Returns the (possibly capped) buffer length or a negative errno.
 */
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
                             struct bpf_sockopt_buf *buf)
{
        if (unlikely(max_optlen < 0))
                return -EINVAL;

        /* We don't expose optvals that are greater than PAGE_SIZE
         * to the BPF program.
         */
        if (unlikely(max_optlen > PAGE_SIZE))
                max_optlen = PAGE_SIZE;

        if (max_optlen > sizeof(buf->data)) {
                ctx->optval = kzalloc(max_optlen, GFP_USER);
                if (!ctx->optval)
                        return -ENOMEM;
        } else {
                /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
                 * bytes avoid the cost of kzalloc.
                 */
                ctx->optval = buf->data;
        }

        ctx->optval_end = ctx->optval + max_optlen;

        return max_optlen;
}

/* Free ctx->optval unless it points at the caller-provided @buf->data. */
static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
                             struct bpf_sockopt_buf *buf)
{
        if (ctx->optval != buf->data)
                kfree(ctx->optval);
}

/* True when ctx->optval does not point at the caller-provided @buf->data. */
static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
                                  struct bpf_sockopt_buf *buf)
{
        return !(ctx->optval == buf->data);
}

/* Run the CGROUP_SETSOCKOPT programs of @sk's cgroup on a setsockopt request.
 *
 * @sk: socket the option is being set on
 * @level: in/out; overwritten with the program-visible level when the result
 *         is "run the kernel handler"
 * @optname: in/out; handled like @level
 * @optval: source of the option value
 * @optlen: in/out; overwritten with the program-provided length when that
 *          length is non-zero and within bounds
 * @kernel_optval: out; set to a heap buffer holding the program-visible value
 *                 when the program left a non-zero, in-bounds optlen. The
 *                 buffer is not freed here (presumably the caller owns it;
 *                 verify against the caller).
 *
 * Returns 0 to continue with the kernel handler, 1 when a program set optlen
 * to -1 (bypass the kernel handler), or a negative errno.
 */
int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
                                       int *optname, sockptr_t optval,
                                       int *optlen, char **kernel_optval)
{
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        struct bpf_sockopt_buf buf = {};
        struct bpf_sockopt_kern ctx = {
                .sk = sk,
                .level = *level,
                .optname = *optname,
        };
        int ret, max_optlen;

        /* Allocate a bit more than the initial user buffer for
         * BPF program. The canonical use case is overriding
         * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
         */
        max_optlen = max_t(int, 16, *optlen);
        max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
        if (max_optlen < 0)
                return max_optlen;

        ctx.optlen = *optlen;

        /* Only as much as fits into the (possibly capped) buffer is copied. */
        if (copy_from_sockptr(ctx.optval, optval,
                              min(*optlen, max_optlen))) {
                ret = -EFAULT;
                goto out;
        }

        /* The programs run with the socket locked. */
        lock_sock(sk);
        ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
                                    &ctx, bpf_prog_run, 0, NULL);
        release_sock(sk);

        if (ret)
                goto out;

        if (ctx.optlen == -1) {
                /* optlen set to -1, bypass kernel */
                ret = 1;
        } else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
                /* optlen is out of bounds */
                if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
                        /* The user buffer was larger than what was exposed to
                         * the program; ignore the program's buffer and let
                         * the kernel handler use the original data.
                         */
                        pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
                                     ctx.optlen, max_optlen);
                        ret = 0;
                        goto out;
                }
                ret = -EFAULT;
        } else {
                /* optlen within bounds, run kernel handler */
                ret = 0;

                /* export any potential modifications */
                *level = ctx.level;
                *optname = ctx.optname;

                /* optlen == 0 from BPF indicates that we should
                 * use original userspace data.
                 */
                if (ctx.optlen != 0) {
                        *optlen = ctx.optlen;
                        /* We've used bpf_sockopt_kern->buf as an intermediary
                         * storage, but the BPF program indicates that we need
                         * to pass this data to the kernel setsockopt handler.
                         * No way to export on-stack buf, have to allocate a
                         * new buffer.
                         */
                        if (!sockopt_buf_allocated(&ctx, &buf)) {
                                void *p = kmalloc(ctx.optlen, GFP_USER);

                                if (!p) {
                                        ret = -ENOMEM;
                                        goto out;
                                }
                                memcpy(p, ctx.optval, ctx.optlen);
                                *kernel_optval = p;
                        } else {
                                *kernel_optval = ctx.optval;
                        }
                        /* export and don't free sockopt buf */
                        return 0;
                }
        }

out:
        /* Frees ctx.optval only if it was heap-allocated. */
        sockopt_free_buf(&ctx, &buf);
        return ret;
}

/* Run the CGROUP_GETSOCKOPT programs of @sk's cgroup on a getsockopt result.
 *
 * @sk: socket the option was read from
 * @level: option level passed to the programs
 * @optname: option name passed to the programs
 * @optval: buffer holding the value returned by the kernel handler; receives
 *          the program-visible value on success (may be a null sockptr)
 * @optlen: location of the value length; read when @retval is 0 and written
 *          back when the programs leave a non-zero length
 * @max_optlen: size of the buffer behind @optval
 * @retval: result of the kernel getsockopt handler, passed on as the initial
 *          retval of the program run
 *
 * Returns the result of the program run, @retval when an oversized program
 * buffer is ignored, or a negative errno.
 */
int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
                                       int optname, sockptr_t optval,
                                       sockptr_t optlen, int max_optlen,
                                       int retval)
{
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        struct bpf_sockopt_buf buf = {};
        struct bpf_sockopt_kern ctx = {
                .sk = sk,
                .level = level,
                .optname = optname,
                .current_task = current,
        };
        int orig_optlen;
        int ret;

        /* Remember the caller's length before sockopt_alloc_buf() caps it. */
        orig_optlen = max_optlen;
        ctx.optlen = max_optlen;
        max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
        if (max_optlen < 0)
                return max_optlen;

        if (!retval) {
                /* If kernel getsockopt finished successfully,
                 * copy whatever was returned to the user back
                 * into our temporary buffer. Set optlen to the
                 * one that kernel returned as well to let
                 * BPF programs inspect the value.
                 */
                if (copy_from_sockptr(&ctx.optlen, optlen,
                                      sizeof(ctx.optlen))) {
                        ret = -EFAULT;
                        goto out;
                }

                if (ctx.optlen < 0) {
                        ret = -EFAULT;
                        goto out;
                }
                orig_optlen = ctx.optlen;

                if (copy_from_sockptr(ctx.optval, optval,
                                      min(ctx.optlen, max_optlen))) {
                        ret = -EFAULT;
                        goto out;
                }
        }

        /* The programs run with the socket locked. */
        lock_sock(sk);
        ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
                                    &ctx, bpf_prog_run, retval, NULL);
        release_sock(sk);

        if (ret < 0)
                goto out;

        if (!sockptr_is_null(optval) &&
            (ctx.optlen > max_optlen || ctx.optlen < 0)) {
                if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) {
                        /* The original value was larger than what was exposed
                         * to the programs; keep the kernel's result as is.
                         */
                        pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
                                     ctx.optlen, max_optlen);
                        ret = retval;
                        goto out;
                }
                ret = -EFAULT;
                goto out;
        }

        /* A zero optlen from the programs leaves @optval/@optlen untouched. */
        if (ctx.optlen != 0) {
                if (!sockptr_is_null(optval) &&
                    copy_to_sockptr(optval, ctx.optval, ctx.optlen)) {
                        ret = -EFAULT;
                        goto out;
                }
                if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) {
                        ret = -EFAULT;
                        goto out;
                }
        }

out:
        /* Frees ctx.optval only if it was heap-allocated. */
        sockopt_free_buf(&ctx, &buf);
        return ret;
}

/* Variant of the getsockopt hook for a value that lives in kernel memory:
 * the programs operate directly on @optval (length *@optlen), with @retval as
 * the initial retval of the run.
 *
 * Returns the result of the program run, or -EFAULT if a program grew optlen
 * beyond *@optlen. A non-zero program-provided optlen is written back to
 * *@optlen.
 */
int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
                                            int optname, void *optval,
                                            int *optlen, int retval)
{
        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
        struct bpf_sockopt_kern ctx = {
                .sk = sk,
                .level = level,
                .optname = optname,
                .optlen = *optlen,
                .optval = optval,
                .optval_end = optval + *optlen,
                .current_task = current,
        };
        int err;

        /* Unlike __cgroup_bpf_run_filter_getsockopt(), which skips copying
         * the user data into the BPF buffer when retval != 0 (an optimization
         * that assumes the kernel does not populate data on error), the data
         * is always passed here. memset() should be called by the caller if
         * that data shouldn't be "exported".
         */
        err = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
                                    &ctx, bpf_prog_run, retval, NULL);
        if (err < 0)
                return err;

        /* growing the buffer is not possible, only shrinking is */
        if (ctx.optlen > *optlen)
                return -EFAULT;

        if (ctx.optlen)
                *optlen = ctx.optlen;

        return err;
}
#endif

/* Append the path of @dir to the buffer at *@bufp, parents first, each
 * non-empty path followed by a "/".
 *
 * *@bufp and *@lenp are advanced past everything that was written. Returns
 * the total number of characters written, or the negative strscpy() error if
 * the buffer is too small.
 */
static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
                              size_t *lenp)
{
        ssize_t parent_len = 0, name_len, sep_len;

        /* emit the ancestors before this directory's own name */
        if (dir->header.parent) {
                parent_len = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
                if (parent_len < 0)
                        return parent_len;
        }

        name_len = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
        if (name_len < 0)
                return name_len;
        *bufp += name_len;
        *lenp -= name_len;
        name_len += parent_len;

        /* Avoid leading slash. */
        if (!name_len)
                return name_len;

        sep_len = strscpy(*bufp, "/", *lenp);
        if (sep_len < 0)
                return sep_len;
        *bufp += sep_len;
        *lenp -= sep_len;

        return name_len + sep_len;
}

/* Helper: copy the name of the sysctl being accessed into @buf.
 *
 * With BPF_F_SYSCTL_BASE_NAME set in @flags only the entry's own name is
 * copied; otherwise it is prefixed with the directory path built by
 * sysctl_cpy_dir(). Returns the number of characters copied, -EINVAL for a
 * NULL @buf or a missing table header, or the negative error of a failed
 * copy.
 */
BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
           size_t, buf_len, u64, flags)
{
        ssize_t dir_len = 0, name_len;

        if (!buf)
                return -EINVAL;

        if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
                if (!ctx->head)
                        return -EINVAL;

                /* advances buf/buf_len past the directory prefix */
                dir_len = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
                if (dir_len < 0)
                        return dir_len;
        }

        name_len = strscpy(buf, ctx->table->procname, buf_len);
        if (name_len < 0)
                return name_len;

        return dir_len + name_len;
}

/* Prototype of the bpf_sysctl_get_name() helper: context pointer, a memory
 * buffer with its constant size, and an unconstrained flags argument; integer
 * result; not restricted to GPL programs.
 */
static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
        .func                = bpf_sysctl_get_name,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/*
 * Copy a sysctl value of @src_len bytes from @src into the @dst_len byte
 * buffer @dst, always leaving @dst fully initialised.
 *
 * Returns:
 *   @src_len  the whole value fit with room to spare; the rest of @dst
 *             is zero-filled (so the result is NUL-terminated)
 *   -E2BIG    @dst_len is 0, or the value did not fit; in the latter case
 *             @dst holds the truncated value with its last byte set to NUL
 *   -EINVAL   @dst is NULL, or there is no source value (@src NULL or
 *             @src_len 0); in the latter case @dst is zeroed
 */
static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
                             size_t src_len)
{
        size_t ncopy;

        if (!dst)
                return -EINVAL;
        if (dst_len == 0)
                return -E2BIG;

        if (!src || src_len == 0) {
                memset(dst, 0, dst_len);
                return -EINVAL;
        }

        ncopy = min(dst_len, src_len);
        memcpy(dst, src, ncopy);

        if (src_len >= dst_len) {
                /* No room for a terminator after the data: truncate. */
                dst[dst_len - 1] = '\0';
                return -E2BIG;
        }

        /* Zero the tail so the caller never sees stale bytes. */
        memset(dst + src_len, '\0', dst_len - src_len);
        return src_len;
}

/*
 * BPF helper: copy the current sysctl value (ctx->cur_val, ctx->cur_len)
 * into @buf.  Return values are those of copy_sysctl_value().
 */
BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
           char *, buf, size_t, buf_len)
{
        int rc;

        rc = copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);

        return rc;
}

/*
 * Helper prototype for bpf_sysctl_get_current_value(): arguments are
 * (ctx, buf, buf_len).  The buffer is declared ARG_PTR_TO_UNINIT_MEM;
 * copy_sysctl_value() fills or zeroes all of it whenever it is non-NULL
 * and buf_len is non-zero.
 */
static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
        .func                = bpf_sysctl_get_current_value,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type        = ARG_CONST_SIZE,
};

/*
 * BPF helper: copy the value being written to the sysctl (ctx->new_val,
 * ctx->new_len) into @buf.
 *
 * Only meaningful when ctx->write is set.  Otherwise @buf, if usable, is
 * zeroed and -EINVAL is returned.  On the write path the return values
 * are those of copy_sysctl_value().
 */
BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
           size_t, buf_len)
{
        if (ctx->write)
                return copy_sysctl_value(buf, buf_len, ctx->new_val,
                                         ctx->new_len);

        /* Not a write: there is no new value, hand back an all-zero buffer. */
        if (buf && buf_len)
                memset(buf, '\0', buf_len);

        return -EINVAL;
}

/*
 * Helper prototype for bpf_sysctl_get_new_value(): arguments are
 * (ctx, buf, buf_len), with the buffer declared ARG_PTR_TO_UNINIT_MEM.
 */
static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
        .func                = bpf_sysctl_get_new_value,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type        = ARG_CONST_SIZE,
};

/*
 * BPF helper: replace the value being written to the sysctl with the
 * @buf_len bytes at @buf.
 *
 * The bytes are copied into ctx->new_val, ctx->new_len is set to @buf_len
 * and ctx->new_updated is raised so the override can be noticed later.
 *
 * Returns 0 on success, -EINVAL when this is not a write, when there is
 * no new-value buffer/length in @ctx, or when @buf/@buf_len are empty,
 * and -E2BIG when @buf_len exceeds PAGE_SIZE - 1.
 */
BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
           const char *, buf, size_t, buf_len)
{
        if (!ctx->write)
                return -EINVAL;
        if (!ctx->new_val || !ctx->new_len)
                return -EINVAL;
        if (!buf || !buf_len)
                return -EINVAL;

        /* Largest accepted override is PAGE_SIZE - 1 bytes. */
        if (buf_len >= PAGE_SIZE)
                return -E2BIG;

        memcpy(ctx->new_val, buf, buf_len);
        ctx->new_len = buf_len;
        ctx->new_updated = 1;

        return 0;
}

/*
 * Helper prototype for bpf_sysctl_set_new_value(): arguments are
 * (ctx, buf, buf_len).  The buffer is only read by the helper, hence
 * ARG_PTR_TO_MEM | MEM_RDONLY.
 */
static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
        .func                = bpf_sysctl_set_new_value,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
};

/*
 * Map helper id @func_id to its prototype for cgroup sysctl programs.
 *
 * Lookup order: the common cgroup helpers, then the helpers that need a
 * valid current task, then the sysctl-specific helpers listed below, and
 * finally bpf_base_func_proto().
 */
static const struct bpf_func_proto *
sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        proto = cgroup_common_func_proto(func_id, prog);
        if (!proto)
                proto = cgroup_current_func_proto(func_id, prog);
        if (proto)
                return proto;

        switch (func_id) {
        case BPF_FUNC_sysctl_get_name:
                return &bpf_sysctl_get_name_proto;
        case BPF_FUNC_sysctl_get_current_value:
                return &bpf_sysctl_get_current_value_proto;
        case BPF_FUNC_sysctl_get_new_value:
                return &bpf_sysctl_get_new_value_proto;
        case BPF_FUNC_sysctl_set_new_value:
                return &bpf_sysctl_set_new_value_proto;
        case BPF_FUNC_ktime_get_coarse_ns:
                return &bpf_ktime_get_coarse_ns_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
        default:
                break;
        }

        /* Not sysctl specific: defer to the base helper set. */
        return bpf_base_func_proto(func_id, prog);
}

/*
 * Decide whether a program may access struct bpf_sysctl at [@off, @off + @size).
 *
 * The access must lie inside the structure and @off must be a multiple of
 * @size.  Per field:
 *   write    - loads only; narrow loads are accepted
 *   file_pos - loads may be narrow; stores must be exactly sizeof(__u32)
 * Anything else is rejected.  For accepted loads the default field size
 * is recorded in @info.
 */
static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
                                   const struct bpf_prog *prog,
                                   struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        if (off < 0)
                return false;
        if (off + size > sizeof(struct bpf_sysctl))
                return false;
        if (off % size)
                return false;

        switch (off) {
        case bpf_ctx_range(struct bpf_sysctl, write):
                if (type != BPF_READ)
                        return false;
                break;
        case bpf_ctx_range(struct bpf_sysctl, file_pos):
                /* Stores are full width only. */
                if (type != BPF_READ)
                        return size == size_default;
                break;
        default:
                return false;
        }

        /* Remaining cases are loads, which may be narrower than the field. */
        bpf_ctx_record_field_size(info, size_default);
        return bpf_ctx_narrow_access_ok(off, size, size_default);
}

/*
 * Rewrite one program access to struct bpf_sysctl into instructions that
 * operate on the kernel-side struct bpf_sysctl_kern.
 *
 * @type:        BPF_WRITE selects the store sequence for file_pos; any
 *               other value takes the load path
 * @si:          the instruction being rewritten; its offset selects the
 *               field
 * @insn_buf:    output buffer for the replacement instructions
 * @prog:        not used by this function
 * @target_size: set to the size of the kernel field that is accessed
 *
 * Returns the number of instructions written to @insn_buf; offsets other
 * than write and file_pos produce none.
 */
static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
                                     const struct bpf_insn *si,
                                     struct bpf_insn *insn_buf,
                                     struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        u32 read_size;

        switch (si->off) {
        case offsetof(struct bpf_sysctl, write):
                /* Single load of bpf_sysctl_kern::write at the size the
                 * program asked for; bpf_target_off() also fills in
                 * *target_size.
                 */
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(struct bpf_sysctl_kern, write,
                                       sizeof_field(struct bpf_sysctl_kern,
                                                    write),
                                       target_size));
                break;
        case offsetof(struct bpf_sysctl, file_pos):
                /* ppos is a pointer so it should be accessed via indirect
                 * loads and stores. Also for stores additional temporary
                 * register is used since neither src_reg nor dst_reg can be
                 * overridden.
                 */
                if (type == BPF_WRITE) {
                        /* Pick a scratch register that is neither src_reg
                         * nor dst_reg: start at R9 and step down at most
                         * twice.
                         */
                        int treg = BPF_REG_9;

                        if (si->src_reg == treg || si->dst_reg == treg)
                                --treg;
                        if (si->src_reg == treg || si->dst_reg == treg)
                                --treg;
                        /* Save the scratch register in ctx->tmp_reg. */
                        *insn++ = BPF_STX_MEM(
                                BPF_DW, si->dst_reg, treg,
                                offsetof(struct bpf_sysctl_kern, tmp_reg));
                        /* treg = ctx->ppos */
                        *insn++ = BPF_LDX_MEM(
                                BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
                                treg, si->dst_reg,
                                offsetof(struct bpf_sysctl_kern, ppos));
                        /* Store 32 bits through treg, keeping the class of
                         * the original instruction and its src_reg/imm.
                         */
                        *insn++ = BPF_RAW_INSN(
                                BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32),
                                treg, si->src_reg,
                                bpf_ctx_narrow_access_offset(
                                        0, sizeof(u32), sizeof(loff_t)),
                                si->imm);
                        /* Restore the scratch register from ctx->tmp_reg. */
                        *insn++ = BPF_LDX_MEM(
                                BPF_DW, treg, si->dst_reg,
                                offsetof(struct bpf_sysctl_kern, tmp_reg));
                } else {
                        /* dst_reg = ctx->ppos */
                        *insn++ = BPF_LDX_MEM(
                                BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
                                si->dst_reg, si->src_reg,
                                offsetof(struct bpf_sysctl_kern, ppos));
                        read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
                        /* dst_reg = value read through ppos at the
                         * requested size.
                         */
                        *insn++ = BPF_LDX_MEM(
                                BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
                                bpf_ctx_narrow_access_offset(
                                        0, read_size, sizeof(loff_t)));
                }
                *target_size = sizeof(u32);
                break;
        }

        return insn - insn_buf;
}

/*
 * Verifier callbacks for cgroup sysctl programs: helper lookup, context
 * access validation and context access rewriting.
 */
const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
        .get_func_proto                = sysctl_func_proto,
        .is_valid_access        = sysctl_is_valid_access,
        .convert_ctx_access        = sysctl_convert_ctx_access,
};

/* Program ops for cgroup sysctl programs; no callbacks are set. */
const struct bpf_prog_ops cg_sysctl_prog_ops = {
};

#ifdef CONFIG_NET
/*
 * BPF helper: return the cookie of a network namespace.
 *
 * With a non-NULL @ctx the namespace is that of ctx->sk; with a NULL
 * @ctx it is init_net.
 */
BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
{
        const struct net *net = &init_net;

        if (ctx)
                net = sock_net(ctx->sk);

        return net->net_cookie;
}

/*
 * Helper prototype for bpf_get_netns_cookie_sockopt().  The single
 * argument is ARG_PTR_TO_CTX_OR_NULL, matching the helper's handling of
 * a NULL ctx.
 */
static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
        .func                = bpf_get_netns_cookie_sockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX_OR_NULL,
};
#endif

/*
 * Map helper id @func_id to its prototype for cgroup sockopt programs.
 *
 * Lookup order: the common cgroup helpers, then the helpers that need a
 * valid current task, then the sockopt-specific helpers below, and
 * finally bpf_base_func_proto().  bpf_setsockopt()/bpf_getsockopt() are
 * only handed out to programs whose expected attach type is
 * BPF_CGROUP_SETSOCKOPT.
 */
static const struct bpf_func_proto *
cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        proto = cgroup_common_func_proto(func_id, prog);
        if (!proto)
                proto = cgroup_current_func_proto(func_id, prog);
        if (proto)
                return proto;

        switch (func_id) {
#ifdef CONFIG_NET
        case BPF_FUNC_get_netns_cookie:
                return &bpf_get_netns_cookie_sockopt_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
        case BPF_FUNC_setsockopt:
                return prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT ?
                        &bpf_sk_setsockopt_proto : NULL;
        case BPF_FUNC_getsockopt:
                return prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT ?
                        &bpf_sk_getsockopt_proto : NULL;
#endif
#ifdef CONFIG_INET
        case BPF_FUNC_tcp_sock:
                return &bpf_tcp_sock_proto;
#endif
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
        default:
                break;
        }

        /* Not sockopt specific: defer to the base helper set. */
        return bpf_base_func_proto(func_id, prog);
}

/*
 * Decide whether a program may access struct bpf_sockopt at @off with an
 * access of @size bytes.
 *
 * @off must be inside the structure and a multiple of @size.
 *
 * Stores (always sizeof(__u32)):
 *   retval          - only for BPF_CGROUP_GETSOCKOPT programs
 *   level, optname  - only for BPF_CGROUP_SETSOCKOPT programs
 *   optlen          - always
 *
 * Loads:
 *   sk, optval, optval_end - sizeof(__u64); the register type recorded in
 *                            @info is PTR_TO_SOCKET, PTR_TO_PACKET and
 *                            PTR_TO_PACKET_END respectively
 *   retval                 - sizeof(__u32), BPF_CGROUP_GETSOCKOPT only
 *   anything else          - sizeof(__u32)
 */
static bool cg_sockopt_is_valid_access(int off, int size,
                                       enum bpf_access_type type,
                                       const struct bpf_prog *prog,
                                       struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        if (off < 0 || off >= sizeof(struct bpf_sockopt))
                return false;
        if (off % size != 0)
                return false;

        if (type == BPF_WRITE) {
                switch (off) {
                case offsetof(struct bpf_sockopt, retval):
                        return size == size_default &&
                               prog->expected_attach_type ==
                                        BPF_CGROUP_GETSOCKOPT;
                case offsetof(struct bpf_sockopt, optname):
                case offsetof(struct bpf_sockopt, level):
                        return size == size_default &&
                               prog->expected_attach_type ==
                                        BPF_CGROUP_SETSOCKOPT;
                case offsetof(struct bpf_sockopt, optlen):
                        return size == size_default;
                default:
                        return false;
                }
        }

        /* Everything below is a load. */
        switch (off) {
        case offsetof(struct bpf_sockopt, sk):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCKET;
                return true;
        case offsetof(struct bpf_sockopt, optval):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_PACKET;
                return true;
        case offsetof(struct bpf_sockopt, optval_end):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_PACKET_END;
                return true;
        case offsetof(struct bpf_sockopt, retval):
                return size == size_default &&
                       prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
        default:
                return size == size_default;
        }
}

/*
 * Build a load of field F of struct bpf_sockopt_kern, sized to that field,
 * from the address in si->src_reg into si->dst_reg.  Expects a variable
 * named "si" (the instruction being rewritten) at the expansion site.
 */
#define CG_SOCKOPT_READ_FIELD(F)                                        \
        BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),        \
                    si->dst_reg, si->src_reg,                                \
                    offsetof(struct bpf_sockopt_kern, F))

/*
 * Build a store to field F of struct bpf_sockopt_kern, sized to that
 * field.  The instruction class, registers and immediate are taken from
 * the original instruction, so a variable named "si" must exist at the
 * expansion site.
 */
#define CG_SOCKOPT_WRITE_FIELD(F)                                        \
        BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) |        \
                      BPF_MEM | BPF_CLASS(si->code)),                        \
                     si->dst_reg, si->src_reg,                                \
                     offsetof(struct bpf_sockopt_kern, F),                \
                     si->imm)

/*
 * Rewrite one program access to struct bpf_sockopt into instructions that
 * operate on the kernel-side struct bpf_sockopt_kern.
 *
 * sk, optval and optval_end are always rewritten as loads.  level,
 * optname and optlen become a load or a store of the matching kernel
 * field depending on @type.  retval is not stored in bpf_sockopt_kern:
 * it is reached through ctx->current_task->bpf_ctx and accessed as the
 * retval member of struct bpf_cg_run_ctx.
 *
 * @type:        BPF_WRITE selects the store variants
 * @si:          the instruction being rewritten; its offset selects the
 *               field
 * @insn_buf:    output buffer for the replacement instructions
 * @prog:        not used by this function
 * @target_size: not written by this function
 *
 * Returns the number of instructions written to @insn_buf.
 */
static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
                                         const struct bpf_insn *si,
                                         struct bpf_insn *insn_buf,
                                         struct bpf_prog *prog,
                                         u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

        switch (si->off) {
        case offsetof(struct bpf_sockopt, sk):
                *insn++ = CG_SOCKOPT_READ_FIELD(sk);
                break;
        case offsetof(struct bpf_sockopt, level):
                if (type == BPF_WRITE)
                        *insn++ = CG_SOCKOPT_WRITE_FIELD(level);
                else
                        *insn++ = CG_SOCKOPT_READ_FIELD(level);
                break;
        case offsetof(struct bpf_sockopt, optname):
                if (type == BPF_WRITE)
                        *insn++ = CG_SOCKOPT_WRITE_FIELD(optname);
                else
                        *insn++ = CG_SOCKOPT_READ_FIELD(optname);
                break;
        case offsetof(struct bpf_sockopt, optlen):
                if (type == BPF_WRITE)
                        *insn++ = CG_SOCKOPT_WRITE_FIELD(optlen);
                else
                        *insn++ = CG_SOCKOPT_READ_FIELD(optlen);
                break;
        case offsetof(struct bpf_sockopt, retval):
                /* The retval offset below is applied directly to the
                 * task's bpf_ctx pointer, which relies on run_ctx being
                 * the first member of struct bpf_cg_run_ctx.
                 */
                BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);

                if (type == BPF_WRITE) {
                        /* Pick a scratch register that is neither src_reg
                         * nor dst_reg: start at R9 and step down at most
                         * twice.
                         */
                        int treg = BPF_REG_9;

                        if (si->src_reg == treg || si->dst_reg == treg)
                                --treg;
                        if (si->src_reg == treg || si->dst_reg == treg)
                                --treg;
                        /* Save the scratch register in ctx->tmp_reg. */
                        *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
                                              offsetof(struct bpf_sockopt_kern, tmp_reg));
                        /* treg = ctx->current_task */
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
                                              treg, si->dst_reg,
                                              offsetof(struct bpf_sockopt_kern, current_task));
                        /* treg = treg->bpf_ctx */
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
                                              treg, treg,
                                              offsetof(struct task_struct, bpf_ctx));
                        /* Store to retval through treg, keeping the class
                         * of the original instruction and its src_reg/imm.
                         */
                        *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM |
                                               BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
                                               treg, si->src_reg,
                                               offsetof(struct bpf_cg_run_ctx, retval),
                                               si->imm);
                        /* Restore the scratch register from ctx->tmp_reg. */
                        *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
                                              offsetof(struct bpf_sockopt_kern, tmp_reg));
                } else {
                        /* Three chained loads through dst_reg:
                         * ctx->current_task, then ->bpf_ctx, then retval.
                         */
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
                                              si->dst_reg, si->src_reg,
                                              offsetof(struct bpf_sockopt_kern, current_task));
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
                                              si->dst_reg, si->dst_reg,
                                              offsetof(struct task_struct, bpf_ctx));
                        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
                                              si->dst_reg, si->dst_reg,
                                              offsetof(struct bpf_cg_run_ctx, retval));
                }
                break;
        case offsetof(struct bpf_sockopt, optval):
                *insn++ = CG_SOCKOPT_READ_FIELD(optval);
                break;
        case offsetof(struct bpf_sockopt, optval_end):
                *insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
                break;
        }

        return insn - insn_buf;
}

/*
 * Prologue generator for cgroup sockopt programs.  Writes nothing to
 * @insn_buf and returns 0, i.e. no prologue instructions; @direct_write
 * and @prog are ignored.
 */
static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
                                   bool direct_write,
                                   const struct bpf_prog *prog)
{
        /* Nothing to do for the sockopt argument: its data is allocated
         * with kzalloc().
         */
        return 0;
}

/*
 * Verifier callbacks for cgroup sockopt programs: helper lookup, context
 * access validation, context access rewriting and the (empty) prologue
 * generator.
 */
const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
        .get_func_proto                = cg_sockopt_func_proto,
        .is_valid_access        = cg_sockopt_is_valid_access,
        .convert_ctx_access        = cg_sockopt_convert_ctx_access,
        .gen_prologue                = cg_sockopt_get_prologue,
};

/* Program ops for cgroup sockopt programs; no callbacks are set. */
const struct bpf_prog_ops cg_sockopt_prog_ops = {
};

/* Common helpers for cgroup hooks. */
/*
 * Returns the prototype for @func_id when it is one of the helpers shared
 * by cgroup hooks, or NULL when it is not (or not permitted).
 *
 * bpf_get_local_storage() is always available.  bpf_get_retval() and
 * bpf_set_retval() are refused for the attach types listed below and
 * handed out for every other expected attach type.
 */
const struct bpf_func_proto *
cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_get_local_storage:
                return &bpf_get_local_storage_proto;
        case BPF_FUNC_get_retval:
        case BPF_FUNC_set_retval:
                /* Both retval helpers share one deny list. */
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET_INGRESS:
                case BPF_CGROUP_INET_EGRESS:
                case BPF_CGROUP_SOCK_OPS:
                case BPF_CGROUP_UDP4_RECVMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                case BPF_CGROUP_UNIX_RECVMSG:
                case BPF_CGROUP_INET4_GETPEERNAME:
                case BPF_CGROUP_INET6_GETPEERNAME:
                case BPF_CGROUP_UNIX_GETPEERNAME:
                case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UNIX_GETSOCKNAME:
                        return NULL;
                default:
                        break;
                }
                return func_id == BPF_FUNC_get_retval ?
                        &bpf_get_retval_proto : &bpf_set_retval_proto;
        default:
                return NULL;
        }
}

/* Common helpers for cgroup hooks with valid process context. */
/*
 * Returns the prototype for @func_id when it is one of the helpers that
 * report on the current task (uid/gid, comm and, with
 * CONFIG_CGROUP_NET_CLASSID, the cgroup classid), or NULL otherwise.
 */
const struct bpf_func_proto *
cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto = NULL;

        switch (func_id) {
        case BPF_FUNC_get_current_uid_gid:
                proto = &bpf_get_current_uid_gid_proto;
                break;
        case BPF_FUNC_get_current_comm:
                proto = &bpf_get_current_comm_proto;
                break;
#ifdef CONFIG_CGROUP_NET_CLASSID
        case BPF_FUNC_get_cgroup_classid:
                proto = &bpf_get_cgroup_classid_curr_proto;
                break;
#endif
        default:
                break;
        }

        return proto;
}













































    1 



















    1 

    1 





















































































    1 












    1 





    1 

    1 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
// SPDX-License-Identifier: GPL-2.0
#include <linux/proc_fs.h>
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/utsname.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include "internal.h"


/*
 * Namespace kinds exposed as entries of the namespace directory; which
 * ones are present depends on the kernel configuration.
 *
 * proc_ns_dir_readdir() maps directory position (pos - 2) to an index in
 * this array, so the order here is the order entries are listed in, and
 * proc_ns_dir_lookup() matches names against each entry's ->name.
 */
static const struct proc_ns_operations *ns_entries[] = {
#ifdef CONFIG_NET_NS
        &netns_operations,
#endif
#ifdef CONFIG_UTS_NS
        &utsns_operations,
#endif
#ifdef CONFIG_IPC_NS
        &ipcns_operations,
#endif
#ifdef CONFIG_PID_NS
        &pidns_operations,
        &pidns_for_children_operations,
#endif
#ifdef CONFIG_USER_NS
        &userns_operations,
#endif
        &mntns_operations,
#ifdef CONFIG_CGROUPS
        &cgroupns_operations,
#endif
#ifdef CONFIG_TIME_NS
        &timens_operations,
        &timens_for_children_operations,
#endif
};

/*
 * ->get_link for namespace symlinks: jump to the namespace object of the
 * task behind @inode.
 *
 * Returns ERR_PTR(-ECHILD) when called without a dentry, and
 * ERR_PTR(-EACCES) when the task is gone or the caller fails the
 * PTRACE_MODE_READ_FSCREDS check.  Otherwise the result of ns_get_path()
 * and, if that succeeded, of nd_jump_link() is returned wrapped in
 * ERR_PTR().
 */
static const char *proc_ns_get_link(struct dentry *dentry,
                                    struct inode *inode,
                                    struct delayed_call *done)
{
        const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
        struct task_struct *task;
        struct path ns_path;
        int err = -EACCES;

        if (!dentry)
                return ERR_PTR(-ECHILD);

        task = get_proc_task(inode);
        if (!task)
                return ERR_PTR(-EACCES);

        if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
                err = ns_get_path(&ns_path, task, ns_ops);
                if (!err)
                        err = nd_jump_link(&ns_path);
        }

        /* Drop the reference taken by get_proc_task(). */
        put_task_struct(task);
        return ERR_PTR(err);
}

/*
 * ->readlink for namespace symlinks: format the namespace name with
 * ns_get_name() and copy it to the user buffer.
 *
 * Returns -EACCES when the task is gone or the caller fails the
 * PTRACE_MODE_READ_FSCREDS check, a negative ns_get_name() result, or
 * whatever readlink_copy() returns.
 */
static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
        struct inode *inode = d_inode(dentry);
        const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
        struct task_struct *task;
        char name[50];
        int res;

        task = get_proc_task(inode);
        if (!task)
                return -EACCES;

        if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
                res = -EACCES;
                goto out_put;
        }

        res = ns_get_name(name, sizeof(name), task, ns_ops);
        if (res < 0)
                goto out_put;

        res = readlink_copy(buffer, buflen, name);
out_put:
        put_task_struct(task);
        return res;
}

/*
 * Inode operations installed on each namespace symlink by
 * proc_ns_instantiate().
 */
static const struct inode_operations proc_ns_link_inode_operations = {
        .readlink        = proc_ns_readlink,
        .get_link        = proc_ns_get_link,
        .setattr        = proc_setattr,
};

static struct dentry *proc_ns_instantiate(struct dentry *dentry,
        struct task_struct *task, const void *ptr)
{
        const struct proc_ns_operations *ns_ops = ptr;
        struct inode *inode;
        struct proc_inode *ei;

        inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO);
        if (!inode)
                return ERR_PTR(-ENOENT);

        ei = PROC_I(inode);
        inode->i_op = &proc_ns_link_inode_operations;
        ei->ns_ops = ns_ops;
        pid_update_inode(task, inode);

        d_set_d_op(dentry, &pid_dentry_operations);
        return d_splice_alias(inode, dentry);
}

/*
 * ->iterate_shared for the namespace directory.
 *
 * Emits "." and "..", then one entry per element of ns_entries[],
 * resuming at element (ctx->pos - 2).  ctx->pos is advanced once per
 * entry that proc_fill_cache() accepts; iteration stops at the first
 * entry it refuses.
 *
 * Returns -ENOENT when the task is gone, 0 otherwise.
 */
static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
{
        struct task_struct *task = get_proc_task(file_inode(file));
        size_t idx;

        if (!task)
                return -ENOENT;

        if (!dir_emit_dots(file, ctx))
                goto out;
        if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries))
                goto out;

        for (idx = ctx->pos - 2; idx < ARRAY_SIZE(ns_entries); idx++) {
                const struct proc_ns_operations *ops = ns_entries[idx];

                if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name),
                                     proc_ns_instantiate, task, ops))
                        break;
                ctx->pos++;
        }
out:
        put_task_struct(task);
        return 0;
}

/* File operations of the namespace directory itself. */
const struct file_operations proc_ns_dir_operations = {
        .read                = generic_read_dir,
        .iterate_shared        = proc_ns_dir_readdir,
        .llseek                = generic_file_llseek,
};

/*
 * ->lookup for the namespace directory.
 *
 * Finds the ns_entries[] element whose ->name equals the dentry name
 * (same length, same bytes) and instantiates the symlink for it.
 *
 * Returns ERR_PTR(-ENOENT) when the task is gone or no entry matches,
 * otherwise the result of proc_ns_instantiate().
 */
static struct dentry *proc_ns_dir_lookup(struct inode *dir,
                                struct dentry *dentry, unsigned int flags)
{
        struct task_struct *task = get_proc_task(dir);
        struct dentry *found = ERR_PTR(-ENOENT);
        unsigned int name_len = dentry->d_name.len;
        size_t i;

        if (!task)
                return found;

        for (i = 0; i < ARRAY_SIZE(ns_entries); i++) {
                const struct proc_ns_operations *ops = ns_entries[i];

                if (strlen(ops->name) != name_len)
                        continue;
                if (memcmp(dentry->d_name.name, ops->name, name_len))
                        continue;

                /* First match wins. */
                found = proc_ns_instantiate(dentry, task, ops);
                break;
        }

        put_task_struct(task);
        return found;
}

/* Inode operations of the namespace directory itself. */
const struct inode_operations proc_ns_dir_inode_operations = {
        .lookup                = proc_ns_dir_lookup,
        .getattr        = pid_getattr,
        .setattr        = proc_setattr,
};


























































































































































































































    1 














































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __CGROUP_INTERNAL_H
#define __CGROUP_INTERNAL_H

#include <linux/cgroup.h>
#include <linux/kernfs.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/fs_parser.h>

/* Size in bytes of the shared trace_cgroup_path buffer. */
#define TRACE_CGROUP_PATH_LEN 1024
/* Held by TRACE_CGROUP_PATH() while trace_cgroup_path is in use. */
extern spinlock_t trace_cgroup_path_lock;
/* Scratch buffer that TRACE_CGROUP_PATH() fills through cgroup_path(). */
extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
extern void __init enable_debug_cgroup(void);

/*
 * cgroup_path() takes a spin lock. It is good practice not to take
 * spin locks within trace point handlers, as they are mostly hidden
 * from normal view. As cgroup_path() can take the kernfs_rename_lock
 * spin lock, it is best to not call that function from the trace event
 * handler.
 *
 * This macro therefore renders the path at the call site instead: when
 * the event is enabled it takes trace_cgroup_path_lock (with interrupts
 * saved and disabled), formats the path of @cgrp into the shared
 * trace_cgroup_path buffer, fires trace_cgroup_<type>() with that buffer
 * and any extra arguments, and then releases the lock.
 *
 * Note: trace_cgroup_##type##_enabled() is a static branch that will only
 *       be set when the trace event is enabled.
 */
#define TRACE_CGROUP_PATH(type, cgrp, ...)                                \
        do {                                                                \
                if (trace_cgroup_##type##_enabled()) {                        \
                        unsigned long flags;                                \
                        spin_lock_irqsave(&trace_cgroup_path_lock,        \
                                          flags);                        \
                        cgroup_path(cgrp, trace_cgroup_path,                \
                                    TRACE_CGROUP_PATH_LEN);                \
                        trace_cgroup_##type(cgrp, trace_cgroup_path,        \
                                            ##__VA_ARGS__);                \
                        spin_unlock_irqrestore(&trace_cgroup_path_lock, \
                                               flags);                        \
                }                                                        \
        } while (0)

/*
 * The cgroup filesystem superblock creation/mount context.
 *
 * The kernfs context is embedded as @kfc; cgroup_fc2context() uses that
 * member to get from a struct fs_context back to this structure.
 */
struct cgroup_fs_context {
        struct kernfs_fs_context kfc;
        /* NOTE(review): presumably the hierarchy root being mounted - confirm */
        struct cgroup_root        *root;
        /* NOTE(review): presumably the cgroup namespace of the mounter - confirm */
        struct cgroup_namespace        *ns;
        unsigned int        flags;                        /* CGRP_ROOT_* flags */

        /* cgroup1 bits */
        bool                cpuset_clone_children;
        bool                none;                        /* User explicitly requested empty subsystem */
        bool                all_ss;                        /* Seen 'all' option */
        u16                subsys_mask;                /* Selected subsystems */
        char                *name;                        /* Hierarchy name */
        char                *release_agent;                /* Path for release notifications */
};

/*
 * Map a filesystem context to the cgroup_fs_context that embeds its
 * kernfs context: fc->fs_private points at the kfc member, so step back
 * to the containing structure.
 */
static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
{
        struct kernfs_fs_context *kernfs_ctx = fc->fs_private;

        return container_of(kernfs_ctx, struct cgroup_fs_context, kfc);
}

struct cgroup_pidlist;

/*
 * Context attached to a cgroup file, grouped by the feature that uses
 * each part.
 * NOTE(review): appears to be per-open-file state - confirm against the
 * code that allocates it.
 */
struct cgroup_file_ctx {
        struct cgroup_namespace        *ns;

        /* pressure-stall trigger; opaque pointer here */
        struct {
                void                        *trigger;
        } psi;

        /* task iteration state; @started records whether @iter is in use */
        struct {
                bool                        started;
                struct css_task_iter        iter;
        } procs;

        /* NOTE(review): presumably the cgroup1 pidlist-based listing - confirm */
        struct {
                struct cgroup_pidlist        *pidlist;
        } procs1;
};

/*
 * A cgroup can be associated with multiple css_sets as different tasks may
 * belong to different cgroups on different hierarchies.  In the other
 * direction, a css_set is naturally associated with multiple cgroups.
 * This M:N relationship is represented by the following link structure
 * which exists for each association and allows traversing the associations
 * from both sides.
 *
 * Each link sits on two lists at once: one anchored at the cgroup
 * (@cset_link) and one anchored at the css_set (@cgrp_link).
 */
struct cgrp_cset_link {
        /* the cgroup and css_set this link associates */
        struct cgroup                *cgrp;
        struct css_set                *cset;

        /* list of cgrp_cset_links anchored at cgrp->cset_links */
        struct list_head        cset_link;

        /* list of cgrp_cset_links anchored at css_set->cgrp_links */
        struct list_head        cgrp_link;
};

/* used to track tasks and csets during migration */
struct cgroup_taskset {
        /* the src and dst cset list running through cset->mg_node */
        struct list_head        src_csets;
        struct list_head        dst_csets;

        /* the number of tasks in the set */
        int                        nr_tasks;

        /* the subsys currently being processed */
        int                        ssid;

        /*
         * Fields for cgroup_taskset_*() iteration.
         *
         * Before migration is committed, the target migration tasks are on
         * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
         * the csets on ->dst_csets.  ->csets points to either ->src_csets
         * or ->dst_csets depending on whether migration is committed
         * (CGROUP_TASKSET_INIT() starts it at ->src_csets).
         *
         * ->cur_cset and ->cur_task point to the current task position
         * during iteration.
         */
        struct list_head        *csets;
        struct css_set                *cur_cset;
        struct task_struct        *cur_task;
};

/*
 * migration context also tracks preloading
 *
 * Instances are normally created with DEFINE_CGROUP_MGCTX(), which
 * initializes the two preload lists and ->tset.
 */
struct cgroup_mgctx {
        /*
         * Preloaded source and destination csets.  Used to guarantee
         * atomic success or failure on actual migration.
         */
        struct list_head        preloaded_src_csets;
        struct list_head        preloaded_dst_csets;

        /* tasks and csets to migrate */
        struct cgroup_taskset        tset;

        /*
         * subsystems affected by migration; CGROUP_MGCTX_INIT() gives no
         * initializer for this field, so it starts out as zero
         */
        u16                        ss_mask;
};

/*
 * Static initializer for a struct cgroup_taskset.
 *
 * Both cset lists are set up with LIST_HEAD_INIT() and ->csets points at
 * ->src_csets; every field not named here is implicitly zero.  @tset is
 * pasted directly in front of '.', so pass a plain lvalue expression that
 * names the taskset being initialized.
 */
#define CGROUP_TASKSET_INIT(tset)                                                \
{                                                                                \
        .src_csets                = LIST_HEAD_INIT(tset.src_csets),                \
        .dst_csets                = LIST_HEAD_INIT(tset.dst_csets),                \
        .csets                        = &tset.src_csets,                                \
}

/*
 * Static initializer for the struct cgroup_mgctx variable called @name.
 *
 * The two preload lists are set up with LIST_HEAD_INIT() and the embedded
 * taskset with CGROUP_TASKSET_INIT().  Fields that are not named here
 * (->ss_mask) are implicitly zero.
 *
 * Designated initializers are used, matching CGROUP_TASKSET_INIT(), so the
 * macro does not depend on the order in which struct cgroup_mgctx declares
 * its fields.
 */
#define CGROUP_MGCTX_INIT(name)                                                        \
{                                                                                \
        .preloaded_src_csets        = LIST_HEAD_INIT(name.preloaded_src_csets),        \
        .preloaded_dst_csets        = LIST_HEAD_INIT(name.preloaded_dst_csets),        \
        .tset                        = CGROUP_TASKSET_INIT(name.tset),                \
}

/*
 * Declare a struct cgroup_mgctx variable called @name and initialize it
 * with CGROUP_MGCTX_INIT().  No storage class or trailing semicolon is
 * supplied; the caller adds both as needed.
 */
#define DEFINE_CGROUP_MGCTX(name)                                                \
        struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)

/* table of subsystem pointers, indexed by subsystem id (see for_each_subsys()) */
extern struct cgroup_subsys *cgroup_subsys[];
/* list head walked by for_each_root() */
extern struct list_head cgroup_roots;

/*
 * iterate across the hierarchies
 *
 * @root is the cursor; entries are linked through their ->root_list member.
 * The walk uses the RCU list iterator, with lockdep told that holding
 * cgroup_mutex is also an acceptable way to protect the traversal.
 */
#define for_each_root(root)                                                \
        list_for_each_entry_rcu((root), &cgroup_roots, root_list,        \
                                lockdep_is_held(&cgroup_mutex))

/**
 * for_each_subsys - iterate over every entry of cgroup_subsys[]
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 *
 * Every index in [0, CGROUP_SUBSYS_COUNT) is visited; no
 * cgroup_ssid_enabled() style filtering is applied here.  The "|| true"
 * keeps the loop condition true even when the assigned @ss is NULL, so the
 * assignment never terminates the loop early.
 */
#define for_each_subsys(ss, ssid)                                        \
        for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&                \
             (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)

/* A cgroup counts as dead once its self css no longer has CSS_ONLINE set. */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
        return (cgrp->self.flags & CSS_ONLINE) == 0;
}

/* Report whether CGRP_NOTIFY_ON_RELEASE is set in @cgrp's flags. */
static inline bool notify_on_release(const struct cgroup *cgrp)
{
        bool notify = test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

        return notify;
}

void put_css_set_locked(struct css_set *cset);

/*
 * Drop one reference on @cset.
 *
 * The refcount must not hit zero while any readers can see the css_set,
 * so the last reference is only dropped with css_set_lock held.  This is
 * similar to atomic_dec_and_lock(): as long as the count is not one it is
 * decremented locklessly; otherwise the lock is taken (IRQs disabled) and
 * put_css_set_locked() does the final put.
 */
static inline void put_css_set(struct css_set *cset)
{
        unsigned long irq_flags;

        if (!refcount_dec_not_one(&cset->refcount)) {
                /* possibly the last reference: finish under the lock */
                spin_lock_irqsave(&css_set_lock, irq_flags);
                put_css_set_locked(cset);
                spin_unlock_irqrestore(&css_set_lock, irq_flags);
        }
}

/*
 * refcounted get/put for css_set objects
 *
 * get_css_set() takes one reference on @cset by incrementing
 * cset->refcount.  Unlike put_css_set() it takes no lock.
 */
static inline void get_css_set(struct css_set *cset)
{
        refcount_inc(&cset->refcount);
}

/*
 * NOTE(review): none of the functions below are defined in this header.
 * The grouping comments only restate what the signatures show.
 */
bool cgroup_ssid_enabled(int ssid);
bool cgroup_on_dfl(const struct cgroup *cgrp);

/* lookups from kernfs objects and tasks, plus kernfs-node lock/unlock */
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
struct cgroup *task_cgroup_from_root(struct task_struct *task,
                                     struct cgroup_root *root);
struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline);
void cgroup_kn_unlock(struct kernfs_node *kn);
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
                          struct cgroup_namespace *ns);

/* operations on a struct cgroup_root / struct cgroup_fs_context */
void cgroup_favor_dynmods(struct cgroup_root *root, bool favor);
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_fs_context *ctx);
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
int cgroup_do_get_tree(struct fs_context *fc);

/* migration helpers; all but the first take a struct cgroup_mgctx */
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
                            struct cgroup_mgctx *mgctx);
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx);
int cgroup_migrate(struct task_struct *leader, bool threadgroup,
                   struct cgroup_mgctx *mgctx);

/*
 * task attachment; the procs_write start/finish pair is annotated as
 * acquiring and releasing cgroup_threadgroup_rwsem respectively
 */
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
                       bool threadgroup);
void cgroup_attach_lock(bool lock_threadgroup);
void cgroup_attach_unlock(bool lock_threadgroup);
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
                                             bool *locked)
        __acquires(&cgroup_threadgroup_rwsem);
void cgroup_procs_write_finish(struct task_struct *task, bool locked)
        __releases(&cgroup_threadgroup_rwsem);

void cgroup_lock_and_drain_offline(struct cgroup *cgrp);

/* kernfs-facing operations: they take kernfs nodes, not cgroups */
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode);
int cgroup_rmdir(struct kernfs_node *kn);
int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
                     struct kernfs_root *kf_root);

/* NOTE(review): the "__" variant presumably expects the caller to hold a lock - confirm */
int __cgroup_task_count(const struct cgroup *cgrp);
int cgroup_task_count(const struct cgroup *cgrp);

/*
 * rstat.c
 *
 * Declarations grouped under the source file named above; init/exit take
 * the cgroup they act on, boot takes no argument.
 */
int cgroup_rstat_init(struct cgroup *cgrp);
void cgroup_rstat_exit(struct cgroup *cgrp);
void cgroup_rstat_boot(void);
void cgroup_base_stat_cputime_show(struct seq_file *seq);

/*
 * namespace.c
 *
 * Read-only proc_ns_operations table, declared here and defined elsewhere.
 */
extern const struct proc_ns_operations cgroupns_operations;

/*
 * cgroup-v1.c
 *
 * Tables and entry points grouped under the source file named above.
 */
extern struct cftype cgroup1_base_files[];
extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
extern const struct fs_parameter_spec cgroup1_fs_parameters[];

int proc_cgroupstats_show(struct seq_file *m, void *v);
bool cgroup1_ssid_disabled(int ssid);
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
void cgroup1_release_agent(struct work_struct *work);
void cgroup1_check_for_release(struct cgroup *cgrp);
/* fs_context hooks: each takes the struct fs_context being operated on */
int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param);
int cgroup1_get_tree(struct fs_context *fc);
int cgroup1_reconfigure(struct fs_context *ctx);

#endif /* __CGROUP_INTERNAL_H */














































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/* SPDX-License-Identifier: GPL-2.0-or-later */

#ifndef _NET_GSO_H
#define _NET_GSO_H

#include <linux/skbuff.h>

/* GSO state that is accessed through SKB_GSO_CB().
 *
 * mac_offset keeps track of the mac header offset relative to skb->head.
 * It is useful for TSO of tunneling protocols, e.g. GRE.
 * For a non-tunnel skb it points to skb_mac_header() and for a
 * tunnel skb it points to the outer mac header.
 * encap_level keeps track of the level of encapsulation of network headers.
 */
struct skb_gso_cb {
        union {
                int        mac_offset;        /* adjusted by gso_pskb_expand_head() */
                int        data_offset;        /* shares storage with mac_offset */
        };
        int        encap_level;
        __wsum        csum;                /* written by gso_reset_checksum()/gso_make_checksum() */
        __u16        csum_start;        /* offset from skb->head, see the same helpers */
};
/*
 * The GSO control block is not placed at the start of skb->cb but
 * SKB_GSO_CB_OFFSET elements into it (bytes, assuming cb is the usual
 * char array - TODO confirm).  SKB_GSO_CB() returns a pointer to it.
 */
#define SKB_GSO_CB_OFFSET        32
#define SKB_GSO_CB(skb) ((struct skb_gso_cb *)((skb)->cb + SKB_GSO_CB_OFFSET))

/*
 * Distance between the mac header of @inner_skb and the offset recorded in
 * its GSO control block: (mac header - head) - mac_offset.
 */
static inline int skb_tnl_header_len(const struct sk_buff *inner_skb)
{
        const unsigned char *inner_mac = skb_mac_header(inner_skb);
        int recorded_off = SKB_GSO_CB(inner_skb)->mac_offset;

        return (inner_mac - inner_skb->head) - recorded_off;
}

/*
 * Call pskb_expand_head(skb, extra, 0, GFP_ATOMIC) and, on success, shift
 * the recorded mac_offset by however much the headroom changed.
 *
 * Returns 0 on success, or the non-zero value from pskb_expand_head(), in
 * which case mac_offset is left untouched.
 */
static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra)
{
        int headroom_before = skb_headroom(skb);
        int err = pskb_expand_head(skb, extra, 0, GFP_ATOMIC);

        if (!err) {
                int headroom_after = skb_headroom(skb);

                /* mac_offset is relative to skb->head, which just moved */
                SKB_GSO_CB(skb)->mac_offset += (headroom_after - headroom_before);
        }

        return err;
}

/*
 * Record @res as the GSO checksum and the current checksum start (as an
 * offset from skb->head) in the GSO control block.
 */
static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res)
{
        /* Partial checksums are left alone while remote checksum is enabled. */
        if (!skb->remcsum_offload) {
                SKB_GSO_CB(skb)->csum = res;
                SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head;
        }
}

/* Compute the checksum for a gso segment.
 *
 * The bytes from the start of the transport header up to the previously
 * recorded SKB_GSO_CB(skb)->csum_start are summed on top of the recorded
 * SKB_GSO_CB(skb)->csum (the checksum from csum_start to the end of the
 * packet), and the folded result is returned.
 *
 * As a side effect the control block is re-anchored at the transport
 * header: csum_start becomes the transport header offset and csum becomes
 * @res (normally zero, or the ~ of the checksum of a pseudo header).
 */
static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res)
{
        unsigned char *th = skb_transport_header(skb);
        unsigned char *old_start = skb->head + SKB_GSO_CB(skb)->csum_start;
        __wsum tail_csum = SKB_GSO_CB(skb)->csum;
        int gap_len = old_start - th;
        __wsum full_csum;

        /* the old values were captured above; now overwrite them */
        SKB_GSO_CB(skb)->csum_start = th - skb->head;
        SKB_GSO_CB(skb)->csum = res;

        full_csum = csum_partial(th, gap_len, tail_csum);
        return csum_fold(full_csum);
}

struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
                                  netdev_features_t features, bool tx_path);

/* Convenience wrapper: __skb_gso_segment() with tx_path fixed to true. */
static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb,
                                              netdev_features_t features)
{
        struct sk_buff *segs = __skb_gso_segment(skb, features, true);

        return segs;
}

/*
 * NOTE(review): the functions below are only declared here; what follows
 * restates their signatures and nothing more.
 */

/* segmentation entry point that additionally takes a __be16 @type */
struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb,
                                    netdev_features_t features, __be16 type);

struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
                                    netdev_features_t features);

/* boolean check of @skb against @mtu */
bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu);

/* boolean check of @skb against @len */
bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len);

/*
 * Restore header state on @skb: push @pulled_hlen bytes back, reset the
 * transport header, and rewrite the mac/network header offsets, mac_len,
 * protocol and the encapsulation flag from the arguments.
 */
static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol,
                                        int pulled_hlen, u16 mac_offset,
                                        int mac_len)
{
        /* the transport header reset must follow the push */
        skb_push(skb, pulled_hlen);
        skb_reset_transport_header(skb);

        /* network header is placed mac_len past the restored mac header */
        skb->mac_len = mac_len;
        skb->mac_header = mac_offset;
        skb->network_header = skb->mac_header + mac_len;

        skb->encapsulation = 1;
        skb->protocol = protocol;
}

#endif /* _NET_GSO_H */

































































































































































































































































































































































    1 


    1 




























































































































    1 































































































































































































    1 






    1 

    1 
    1 


    1 




    1 
















    1 

    1 






    1 






    1 


    1 

    1 





















































































































































    2 











    1 

    1 























    1 



    1 














    1 














    1 




























    1 















    1 






    1 





    1 







    1 







    1 
















    1 






    1 





























    1 





    1 








    1 







    1 






    1 













    1 








    1 





    1 





    1 



    1 


    1 



    1 


    1 

    1 


    1 

















    1 










    1 




    1 

    1 








    1 





    1 



    1 










    1 



















































    1 
    1 







    1 




    1 



    1 

    1 
    1 




    1 








    1 




    1 




    1 























    1 









































































































    1 







    1 













































    1 









    1 































































































































































































































    1 



    1 







































































































































































































































































































































































































































































































    1 



    1 
    1 






































    1 


    1 






    1 








































    1 
















    1 



    1 





















    1 





    1 





    1 











    1 











    1 














    1 






































































    1 
















    1 













    1 



    1 





    1 









    1 








    1 






    1 
















    1 






    1 










    1 
    1 











    1 



































    2 


    2 










    1 





    1 


    2 




































    1 













    1 








    1 



    1 
    1 
























































































































































































































































    1 












    1 



    1 



    1 













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 








    1 



    1 
    1 





    1 

























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Mark Evans, <evansmp@uhura.aston.ac.uk>
 *                Corey Minyard <wf-rch!minyard@relay.EU.net>
 *                Florian La Roche, <flla@stud.uni-sb.de>
 *                Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *                Linus Torvalds, <torvalds@cs.helsinki.fi>
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *                Matthew Dillon, <dillon@apollo.west.oic.com>
 *                Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *                Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Fixes:
 *                Alan Cox        :        Numerous verify_area() calls
 *                Alan Cox        :        Set the ACK bit on a reset
 *                Alan Cox        :        Stopped it crashing if it closed while
 *                                        sk->inuse=1 and was trying to connect
 *                                        (tcp_err()).
 *                Alan Cox        :        All icmp error handling was broken
 *                                        pointers passed were wrong and the
 *                                        socket was looked up backwards. Nobody
 *                                        tested any icmp error code obviously.
 *                Alan Cox        :        tcp_err() now handled properly. It
 *                                        wakes people on errors. poll
 *                                        behaves and the icmp error race
 *                                        has gone by moving it into sock.c
 *                Alan Cox        :        tcp_send_reset() fixed to work for
 *                                        everything not just packets for
 *                                        unknown sockets.
 *                Alan Cox        :        tcp option processing.
 *                Alan Cox        :        Reset tweaked (still not 100%) [Had
 *                                        syn rule wrong]
 *                Herp Rosmanith  :        More reset fixes
 *                Alan Cox        :        No longer acks invalid rst frames.
 *                                        Acking any kind of RST is right out.
 *                Alan Cox        :        Sets an ignore me flag on an rst
 *                                        receive otherwise odd bits of prattle
 *                                        escape still
 *                Alan Cox        :        Fixed another acking RST frame bug.
 *                                        Should stop LAN workplace lockups.
 *                Alan Cox        :         Some tidyups using the new skb list
 *                                        facilities
 *                Alan Cox        :        sk->keepopen now seems to work
 *                Alan Cox        :        Pulls options out correctly on accepts
 *                Alan Cox        :        Fixed assorted sk->rqueue->next errors
 *                Alan Cox        :        PSH doesn't end a TCP read. Switched a
 *                                        bit to skb ops.
 *                Alan Cox        :        Tidied tcp_data to avoid a potential
 *                                        nasty.
 *                Alan Cox        :        Added some better commenting, as the
 *                                        tcp is hard to follow
 *                Alan Cox        :        Removed incorrect check for 20 * psh
 *        Michael O'Reilly        :        ack < copied bug fix.
 *        Johannes Stille                :        Misc tcp fixes (not all in yet).
 *                Alan Cox        :        FIN with no memory -> CRASH
 *                Alan Cox        :        Added socket option proto entries.
 *                                        Also added awareness of them to accept.
 *                Alan Cox        :        Added TCP options (SOL_TCP)
 *                Alan Cox        :        Switched wakeup calls to callbacks,
 *                                        so the kernel can layer network
 *                                        sockets.
 *                Alan Cox        :        Use ip_tos/ip_ttl settings.
 *                Alan Cox        :        Handle FIN (more) properly (we hope).
 *                Alan Cox        :        RST frames sent on unsynchronised
 *                                        state ack error.
 *                Alan Cox        :        Put in missing check for SYN bit.
 *                Alan Cox        :        Added tcp_select_window() aka NET2E
 *                                        window non shrink trick.
 *                Alan Cox        :        Added a couple of small NET2E timer
 *                                        fixes
 *                Charles Hedrick :        TCP fixes
 *                Toomas Tamm        :        TCP window fixes
 *                Alan Cox        :        Small URG fix to rlogin ^C ack fight
 *                Charles Hedrick        :        Rewrote most of it to actually work
 *                Linus                :        Rewrote tcp_read() and URG handling
 *                                        completely
 *                Gerhard Koerting:        Fixed some missing timer handling
 *                Matthew Dillon  :        Reworked TCP machine states as per RFC
 *                Gerhard Koerting:        PC/TCP workarounds
 *                Adam Caldwell        :        Assorted timer/timing errors
 *                Matthew Dillon        :        Fixed another RST bug
 *                Alan Cox        :        Move to kernel side addressing changes.
 *                Alan Cox        :        Beginning work on TCP fastpathing
 *                                        (not yet usable)
 *                Arnt Gulbrandsen:        Turbocharged tcp_check() routine.
 *                Alan Cox        :        TCP fast path debugging
 *                Alan Cox        :        Window clamping
 *                Michael Riepe        :        Bug in tcp_check()
 *                Matt Dillon        :        More TCP improvements and RST bug fixes
 *                Matt Dillon        :        Yet more small nasties removed from the
 *                                        TCP code (Be very nice to this man if
 *                                        tcp finally works 100%) 8)
 *                Alan Cox        :        BSD accept semantics.
 *                Alan Cox        :        Reset on closedown bug.
 *        Peter De Schrijver        :        ENOTCONN check missing in tcp_sendto().
 *                Michael Pall        :        Handle poll() after URG properly in
 *                                        all cases.
 *                Michael Pall        :        Undo the last fix in tcp_read_urg()
 *                                        (multi URG PUSH broke rlogin).
 *                Michael Pall        :        Fix the multi URG PUSH problem in
 *                                        tcp_readable(), poll() after URG
 *                                        works now.
 *                Michael Pall        :        recv(...,MSG_OOB) never blocks in the
 *                                        BSD api.
 *                Alan Cox        :        Changed the semantics of sk->socket to
 *                                        fix a race and a signal problem with
 *                                        accept() and async I/O.
 *                Alan Cox        :        Relaxed the rules on tcp_sendto().
 *                Yury Shevchuk        :        Really fixed accept() blocking problem.
 *                Craig I. Hagan  :        Allow for BSD compatible TIME_WAIT for
 *                                        clients/servers which listen in on
 *                                        fixed ports.
 *                Alan Cox        :        Cleaned the above up and shrank it to
 *                                        a sensible code size.
 *                Alan Cox        :        Self connect lockup fix.
 *                Alan Cox        :        No connect to multicast.
 *                Ross Biro        :        Close unaccepted children on master
 *                                        socket close.
 *                Alan Cox        :        Reset tracing code.
 *                Alan Cox        :        Spurious resets on shutdown.
 *                Alan Cox        :        Giant 15 minute/60 second timer error
 *                Alan Cox        :        Small whoops in polling before an
 *                                        accept.
 *                Alan Cox        :        Kept the state trace facility since
 *                                        it's handy for debugging.
 *                Alan Cox        :        More reset handler fixes.
 *                Alan Cox        :        Started rewriting the code based on
 *                                        the RFC's for other useful protocol
 *                                        references see: Comer, KA9Q NOS, and
 *                                        for a reference on the difference
 *                                        between specifications and how BSD
 *                                        works see the 4.4lite source.
 *                A.N.Kuznetsov        :        Don't time wait on completion of tidy
 *                                        close.
 *                Linus Torvalds        :        Fin/Shutdown & copied_seq changes.
 *                Linus Torvalds        :        Fixed BSD port reuse to work first syn
 *                Alan Cox        :        Reimplemented timers as per the RFC
 *                                        and using multiple timers for sanity.
 *                Alan Cox        :        Small bug fixes, and a lot of new
 *                                        comments.
 *                Alan Cox        :        Fixed dual reader crash by locking
 *                                        the buffers (much like datagram.c)
 *                Alan Cox        :        Fixed stuck sockets in probe. A probe
 *                                        now gets fed up of retrying without
 *                                        (even a no space) answer.
 *                Alan Cox        :        Extracted closing code better
 *                Alan Cox        :        Fixed the closing state machine to
 *                                        resemble the RFC.
 *                Alan Cox        :        More 'per spec' fixes.
 *                Jorge Cwik        :        Even faster checksumming.
 *                Alan Cox        :        tcp_data() doesn't ack illegal PSH
 *                                        only frames. At least one pc tcp stack
 *                                        generates them.
 *                Alan Cox        :        Cache last socket.
 *                Alan Cox        :        Per route irtt.
 *                Matt Day        :        poll()->select() match BSD precisely on error
 *                Alan Cox        :        New buffers
 *                Marc Tamsky        :        Various sk->prot->retransmits and
 *                                        sk->retransmits misupdating fixed.
 *                                        Fixed tcp_write_timeout: stuck close,
 *                                        and TCP syn retries gets used now.
 *                Mark Yarvis        :        In tcp_read_wakeup(), don't send an
 *                                        ack if state is TCP_CLOSED.
 *                Alan Cox        :        Look up device on a retransmit - routes may
 *                                        change. Doesn't yet cope with MSS shrink right
 *                                        but it's a start!
 *                Marc Tamsky        :        Closing in closing fixes.
 *                Mike Shaver        :        RFC1122 verifications.
 *                Alan Cox        :        rcv_saddr errors.
 *                Alan Cox        :        Block double connect().
 *                Alan Cox        :        Small hooks for enSKIP.
 *                Alexey Kuznetsov:        Path MTU discovery.
 *                Alan Cox        :        Support soft errors.
 *                Alan Cox        :        Fix MTU discovery pathological case
 *                                        when the remote claims no mtu!
 *                Marc Tamsky        :        TCP_CLOSE fix.
 *                Colin (G3TNE)        :        Send a reset on syn ack replies in
 *                                        window but wrong (fixes NT lpd problems)
 *                Pedro Roque        :        Better TCP window handling, delayed ack.
 *                Joerg Reuter        :        No modification of locked buffers in
 *                                        tcp_do_retransmit()
 *                Eric Schenk        :        Changed receiver side silly window
 *                                        avoidance algorithm to BSD style
 *                                        algorithm. This doubles throughput
 *                                        against machines running Solaris,
 *                                        and seems to result in general
 *                                        improvement.
 *        Stefan Magdalinski        :        adjusted tcp_readable() to fix FIONREAD
 *        Willy Konynenberg        :        Transparent proxying support.
 *        Mike McLagan                :        Routing by source
 *                Keith Owens        :        Do proper merging with partial SKB's in
 *                                        tcp_do_sendmsg to avoid burstiness.
 *                Eric Schenk        :        Fix fast close down bug with
 *                                        shutdown() followed by close().
 *                Andi Kleen         :        Make poll agree with SIGIO
 *        Salvatore Sanfilippo        :        Support SO_LINGER with linger == 1 and
 *                                        lingertime == 0 (RFC 793 ABORT Call)
 *        Hirokazu Takahashi        :        Use copy_from_user() instead of
 *                                        csum_and_copy_from_user() if possible.
 *
 * Description of States:
 *
 *        TCP_SYN_SENT                sent a connection request, waiting for ack
 *
 *        TCP_SYN_RECV                received a connection request, sent ack,
 *                                waiting for final ack in three-way handshake.
 *
 *        TCP_ESTABLISHED                connection established
 *
 *        TCP_FIN_WAIT1                our side has shutdown, waiting to complete
 *                                transmission of remaining buffered data
 *
 *        TCP_FIN_WAIT2                all buffered data sent, waiting for remote
 *                                to shutdown
 *
 *        TCP_CLOSING                both sides have shutdown but we still have
 *                                data we have to finish sending
 *
 *        TCP_TIME_WAIT                timeout to catch resent junk before entering
 *                                closed, can only be entered from FIN_WAIT2
 *                                or CLOSING.  Required because the other end
 *                                may not have gotten our last ACK causing it
 *                                to retransmit the data packet (which we ignore)
 *
 *        TCP_CLOSE_WAIT                remote side has shutdown and is waiting for
 *                                us to finish writing our data and to shutdown
 *                                (we have to close() to move on to LAST_ACK)
 *
 *        TCP_LAST_ACK                our side has shutdown after remote has
 *                                shutdown.  There may still be data in our
 *                                buffer that we have to finish sending
 *
 *        TCP_CLOSE                socket is finished
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/inet_diag.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/errqueue.h>
#include <linux/static_key.h>
#include <linux/btf.h>

#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include <net/proto_memory.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/sock.h>
#include <net/rstreason.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>
#include <net/hotdata.h>
#include <trace/events/tcp.h>
#include <net/rps.h>

/* Track pending CMSGs.
 *
 * The two enumerators use distinct bits (1 and 2).
 * NOTE(review): presumably they are OR-ed together into one "pending cmsg"
 * mask -- the use sites are outside this excerpt, confirm there.
 */
enum {
        TCP_CMSG_INQ = 1,       /* NOTE(review): name suggests the TCP_INQ cmsg */
        TCP_CMSG_TS = 2         /* NOTE(review): name suggests the timestamp cmsg */
};

/* Per-CPU unsigned counter, exported to GPL modules.
 * NOTE(review): presumably the number of orphaned TCP sockets; the sites
 * that update it are outside this excerpt.
 */
DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);

/* Per-CPU u32, exported to GPL modules.
 * NOTE(review): presumably an initial sequence number handed over from
 * TIME_WAIT processing -- confirm against its readers and writers.
 */
DEFINE_PER_CPU(u32, tcp_tw_isn);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn);

/* Three-element sysctl array.
 * NOTE(review): presumably the low/pressure/high thresholds of the tcp_mem
 * sysctl -- confirm against the sysctl table.
 */
long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);

atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;        /* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);
/* Per-CPU int companion to tcp_memory_allocated.
 * NOTE(review): presumably a per-CPU forward-allocation reserve used to
 * batch updates of the global counter -- confirm in the memory accounting
 * helpers.
 */
DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);

/* Static key, initially false; only defined when CONFIG_SMC is enabled. */
#if IS_ENABLED(CONFIG_SMC)
DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
EXPORT_SYMBOL(tcp_have_smc);
#endif

/*
 * Current number of TCP sockets.
 */
struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
EXPORT_SYMBOL(tcp_sockets_allocated);

/*
 * TCP splice context
 *
 * NOTE(review): the code that fills in and consumes this struct is outside
 * this excerpt; the field notes below are inferred from names and types
 * only and should be confirmed against the splice read path.
 */
struct tcp_splice_state {
        struct pipe_inode_info *pipe;   /* pipe taking part in the splice (presumably the destination) */
        size_t len;                     /* byte count (presumably what is still left to splice) */
        unsigned int flags;             /* flags (presumably the caller's splice flags) */
};

/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the __sk_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 *
 * Encoding: 0 means "not under memory pressure".  Any other value is the
 * jiffies stamp stored by tcp_enter_memory_pressure() (a jiffies reading of
 * 0 is stored as the all-ones value so it stays distinguishable from "no
 * pressure").  tcp_leave_memory_pressure() swaps the value back to 0 and
 * uses the old stamp to account how long the pressure period lasted.
 */
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);

/* Mark TCP as being under memory pressure.
 *
 * @sk: socket whose network namespace receives the statistics update.
 *
 * Does nothing when tcp_memory_pressure is already non-zero.  Otherwise the
 * current jiffies value is stored as the pressure stamp; because 0 means
 * "no pressure", a jiffies reading of 0 is replaced by the all-ones value.
 * Only the caller whose cmpxchg() actually performs the 0 -> stamp
 * transition bumps LINUX_MIB_TCPMEMORYPRESSURES.
 */
void tcp_enter_memory_pressure(struct sock *sk)
{
        unsigned long stamp;

        if (READ_ONCE(tcp_memory_pressure))
                return;

        stamp = jiffies;
        if (stamp == 0)
                stamp = ~0UL;   /* keep the stored stamp non-zero */

        /* Somebody else set the flag between the check and here. */
        if (cmpxchg(&tcp_memory_pressure, 0, stamp) != 0)
                return;

        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
}
EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);

/* Clear the TCP memory pressure flag.
 *
 * @sk: socket whose network namespace receives the statistics update.
 *
 * Does nothing when tcp_memory_pressure is already 0.  Otherwise the flag
 * is atomically exchanged with 0; if the previous value was a non-zero
 * stamp, the time elapsed since that stamp (converted from jiffies to
 * milliseconds) is added to LINUX_MIB_TCPMEMORYPRESSURESCHRONO.
 */
void tcp_leave_memory_pressure(struct sock *sk)
{
        unsigned long since;

        if (!READ_ONCE(tcp_memory_pressure))
                return;

        since = xchg(&tcp_memory_pressure, 0);
        if (!since)
                return;         /* another context cleared it first */

        NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
                      jiffies_to_msecs(jiffies - since));
}
EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);

/* Convert a duration in seconds into a number of retransmits.
 *
 * @seconds: time budget; values <= 0 yield 0.
 * @timeout: length of the first interval, in the same unit as @seconds;
 *           it is doubled for every further retransmit.
 * @rto_max: upper bound applied to the doubled interval.
 *
 * Returns how many back-off intervals (the first being @timeout, each next
 * one twice the previous but never above @rto_max) are needed until their
 * running sum is no longer smaller than @seconds.  The count saturates at
 * 255 so that it fits the u8 return type.
 */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
        u8 count;
        int elapsed;

        if (seconds <= 0)
                return 0;

        elapsed = timeout;
        for (count = 1; seconds > elapsed && count < 255; count++) {
                timeout <<= 1;
                timeout = timeout > rto_max ? rto_max : timeout;
                elapsed += timeout;
        }

        return count;
}

/* Convert a number of retransmits into the time they span.
 *
 * @retrans: retransmit count; 0 yields 0.
 * @timeout: length of the first interval; doubled for every further
 *           retransmit.
 * @rto_max: upper bound applied to the doubled interval.
 *
 * Returns the sum of @retrans back-off intervals, the first being @timeout
 * and each following one twice the previous but never above @rto_max.
 */
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
        int total;
        u8 left;

        if (!retrans)
                return 0;

        /* The first interval is taken as-is; the rest are backed off. */
        total = timeout;
        for (left = retrans - 1; left; left--) {
                timeout <<= 1;
                timeout = timeout > rto_max ? rto_max : timeout;
                total += timeout;
        }

        return total;
}

static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
{
        u32 rate = READ_ONCE(tp->rate_delivered);
        u32 intv = READ_ONCE(tp->rate_interval_us);
        u64 rate64 = 0;

        if (rate && intv) {
                rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
                do_div(rate64, intv);
        }
        return rate64;
}

/* Address-family independent initialization for a tcp_sock.
 *
 * NOTE: A lot of things set to zero explicitly by call to
 *       sk_alloc() so need not be done here.
 *
 * @sk: socket to initialize; it is accessed here both through inet_csk()
 *      and through tcp_sk().
 *
 * Sets up the queues and timers, the RTO/RTT defaults, the congestion
 * control defaults and the send/receive buffer sizes, and finally accounts
 * the socket via sk_sockets_allocated_inc().
 */
void tcp_init_sock(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int rto_min_us;

        /* Start with empty out-of-order and retransmit rb-trees. */
        tp->out_of_order_queue = RB_ROOT;
        sk->tcp_rtx_queue = RB_ROOT;
        tcp_init_xmit_timers(sk);
        INIT_LIST_HEAD(&tp->tsq_node);
        INIT_LIST_HEAD(&tp->tsorted_sent_queue);

        /* The initial RTO is the compile-time default; the minimum RTO is
         * read from the per-netns sysctl (in microseconds) and converted
         * to jiffies.
         */
        icsk->icsk_rto = TCP_TIMEOUT_INIT;
        rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us);
        icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us);
        icsk->icsk_delack_max = TCP_DELACK_MAX;
        /* Seed mdev_us with the initial timeout expressed in microseconds. */
        tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
        /* Reset the rtt_min tracker at the current tcp_jiffies32 with an
         * all-ones value.
         */
        minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);

        /* So many TCP implementations out there (incorrectly) count the
         * initial SYN frame in their delayed-ACK and congestion control
         * algorithms that we must have the following bandaid to talk
         * efficiently to them.  -DaveM
         */
        tcp_snd_cwnd_set(tp, TCP_INIT_CWND);

        /* There's a bubble in the pipe until at least the first ACK. */
        tp->app_limited = ~0U;
        tp->rate_app_limited = 1;

        /* See draft-stevens-tcpca-spec-01 for discussion of the
         * initialization of these values.
         */
        tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
        tp->snd_cwnd_clamp = ~0;
        tp->mss_cache = TCP_MSS_DEFAULT;

        /* Reordering starts at the per-netns sysctl value. */
        tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
        tcp_assign_congestion_control(sk);

        tp->tsoffset = 0;
        tp->rack.reo_wnd_steps = 1;

        sk->sk_write_space = sk_stream_write_space;
        sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

        icsk->icsk_sync_mss = tcp_sync_mss;

        /* Default buffer sizes are element [1] of the per-netns tcp_wmem
         * and tcp_rmem sysctl arrays.
         */
        WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
        WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
        tcp_scaling_ratio_init(sk);

        /* Sets SOCK_SUPPORT_ZC on the owning struct socket; note that this
         * dereferences sk->sk_socket, so it must be non-NULL here.
         */
        set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
        sk_sockets_allocated_inc(sk);
}
EXPORT_SYMBOL(tcp_init_sock);

/* Apply the requested TX timestamping flags to the skb currently at the
 * tail of the write queue. Does nothing when @tsflags is zero or the
 * write queue has no tail skb.
 */
static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
{
        struct sk_buff *tail = tcp_write_queue_tail(sk);
        struct skb_shared_info *si;
        struct tcp_skb_cb *cb;

        if (!tsflags || !tail)
                return;

        si = skb_shinfo(tail);
        cb = TCP_SKB_CB(tail);

        sock_tx_timestamp(sk, tsflags, &si->tx_flags);

        if (tsflags & SOF_TIMESTAMPING_TX_ACK)
                cb->txstamp_ack = 1;

        /* The key is the sequence number of the last byte of this skb. */
        if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
                si->tskey = cb->seq + tail->len - 1;
}

/* True when tcp_epollin_ready() reports the socket ready for @target
 * bytes, or failing that, when sk_is_readable() says so.
 */
static bool tcp_stream_is_readable(struct sock *sk, int target)
{
        return tcp_epollin_ready(sk, target) || sk_is_readable(sk);
}

/*
 *        Wait for a TCP event.
 *
 *        Builds and returns the poll event mask (EPOLLIN, EPOLLOUT, EPOLLHUP,
 *        EPOLLPRI, EPOLLERR, ...) for the socket behind @sock. Listening
 *        sockets are delegated to inet_csk_listen_poll().
 *
 *        Note that we don't need to lock the socket, as the upper poll layers
 *        take care of normal races (between the test and the event) and we don't
 *        go look at any of the socket buffers directly.
 */
__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
        __poll_t mask;
        struct sock *sk = sock->sk;
        const struct tcp_sock *tp = tcp_sk(sk);
        u8 shutdown;
        int state;

        sock_poll_wait(file, sock, wait);

        /* Snapshot the state once; every later test uses this value. */
        state = inet_sk_state_load(sk);
        if (state == TCP_LISTEN)
                return inet_csk_listen_poll(sk);

        /* Socket is not locked. We are protected from async events
         * by poll logic and correct handling of state changes
         * made by other threads is impossible in any case.
         */

        mask = 0;

        /*
         * EPOLLHUP is certainly not done right. But poll() doesn't
         * have a notion of HUP in just one direction, and for a
         * socket the read side is more interesting.
         *
         * Some poll() documentation says that EPOLLHUP is incompatible
         * with the EPOLLOUT/POLLWR flags, so somebody should check this
         * all. But careful, it tends to be safer to return too many
         * bits than too few, and you can easily break real applications
         * if you don't tell them that something has hung up!
         *
         * Check-me.
         *
         * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and
         * our fs/select.c). It means that after we received EOF,
         * poll always returns immediately, making impossible poll() on write()
         * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP
         * if and only if shutdown has been made in both directions.
         * Actually, it is interesting to look how Solaris and DUX
         * solve this dilemma. I would prefer, if EPOLLHUP were maskable,
         * then we could set it on SND_SHUTDOWN. BTW examples given
         * in Stevens' books assume exactly this behaviour, it explains
         * why EPOLLHUP is incompatible with EPOLLOUT.        --ANK
         *
         * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
         * blocking on fresh not-connected or disconnected socket. --ANK
         */
        shutdown = READ_ONCE(sk->sk_shutdown);
        if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
                mask |= EPOLLHUP;
        if (shutdown & RCV_SHUTDOWN)
                mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

        /* Connected or passive Fast Open socket? */
        if (state != TCP_SYN_SENT &&
            (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
                int target = sock_rcvlowat(sk, 0, INT_MAX);
                u16 urg_data = READ_ONCE(tp->urg_data);

                /* Raise the readable threshold by one when urgent data is
                 * pending exactly at the current read position and is not
                 * delivered inline.
                 */
                if (unlikely(urg_data) &&
                    READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
                    !sock_flag(sk, SOCK_URGINLINE))
                        target++;

                if (tcp_stream_is_readable(sk, target))
                        mask |= EPOLLIN | EPOLLRDNORM;

                if (!(shutdown & SEND_SHUTDOWN)) {
                        if (__sk_stream_is_writeable(sk, 1)) {
                                mask |= EPOLLOUT | EPOLLWRNORM;
                        } else {  /* send SIGIO later */
                                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

                                /* Race breaker. If space is freed after
                                 * wspace test but before the flags are set,
                                 * IO signal will be lost. Memory barrier
                                 * pairs with the input side.
                                 */
                                smp_mb__after_atomic();
                                if (__sk_stream_is_writeable(sk, 1))
                                        mask |= EPOLLOUT | EPOLLWRNORM;
                        }
                } else
                        /* Send side is shut down: always report writable. */
                        mask |= EPOLLOUT | EPOLLWRNORM;

                if (urg_data & TCP_URG_VALID)
                        mask |= EPOLLPRI;
        } else if (state == TCP_SYN_SENT &&
                   inet_test_bit(DEFER_CONNECT, sk)) {
                /* Active TCP fastopen socket with defer_connect
                 * Return EPOLLOUT so application can call write()
                 * in order for kernel to generate SYN+data
                 */
                mask |= EPOLLOUT | EPOLLWRNORM;
        }
        /* This barrier is coupled with smp_wmb() in tcp_done_with_error() */
        smp_rmb();
        /* Report an error for a pending sk_err or a non-empty error queue. */
        if (READ_ONCE(sk->sk_err) ||
            !skb_queue_empty_lockless(&sk->sk_error_queue))
                mask |= EPOLLERR;

        return mask;
}
EXPORT_SYMBOL(tcp_poll);

/* TCP ioctl handler for the queue-inspection commands.
 *
 * SIOCINQ     - result of tcp_inq(), taken under the fast socket lock
 * SIOCATMARK  - non-zero when urgent data is pending at the read position
 * SIOCOUTQ    - write_seq - snd_una (0 in SYN_SENT/SYN_RECV)
 * SIOCOUTQNSD - write_seq - snd_nxt (0 in SYN_SENT/SYN_RECV)
 *
 * The answer is stored in *@karg and 0 is returned. Listening sockets get
 * -EINVAL for the three queue-size commands; any other @cmd yields
 * -ENOIOCTLCMD.
 */
int tcp_ioctl(struct sock *sk, int cmd, int *karg)
{
        struct tcp_sock *tp = tcp_sk(sk);
        bool slow;
        int answ;

        if (cmd == SIOCATMARK) {
                answ = READ_ONCE(tp->urg_data) &&
                       READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
        } else if (cmd == SIOCINQ || cmd == SIOCOUTQ || cmd == SIOCOUTQNSD) {
                /* Queue sizes are meaningless on a listener. */
                if (sk->sk_state == TCP_LISTEN)
                        return -EINVAL;

                if (cmd == SIOCINQ) {
                        slow = lock_sock_fast(sk);
                        answ = tcp_inq(sk);
                        unlock_sock_fast(sk, slow);
                } else if ((1 << sk->sk_state) &
                           (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
                        answ = 0;
                } else if (cmd == SIOCOUTQ) {
                        answ = READ_ONCE(tp->write_seq) - tp->snd_una;
                } else {
                        answ = READ_ONCE(tp->write_seq) -
                               READ_ONCE(tp->snd_nxt);
                }
        } else {
                return -ENOIOCTLCMD;
        }

        *karg = answ;
        return 0;
}
EXPORT_SYMBOL(tcp_ioctl);

void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
        TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
        tp->pushed_seq = tp->write_seq;
}

static inline bool forced_push(const struct tcp_sock *tp)
{
        return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}

void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

        tcb->seq     = tcb->end_seq = tp->write_seq;
        tcb->tcp_flags = TCPHDR_ACK;
        __skb_header_release(skb);
        tcp_add_write_queue_tail(sk, skb);
        sk_wmem_queued_add(sk, skb->truesize);
        sk_mem_charge(sk, skb->truesize);
        if (tp->nonagle & TCP_NAGLE_PUSH)
                tp->nonagle &= ~TCP_NAGLE_PUSH;

        tcp_slow_start_after_idle_check(sk);
}

/* For MSG_OOB sends, move the urgent pointer (snd_up) to write_seq. */
static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
        if (!(flags & MSG_OOB))
                return;

        tp->snd_up = tp->write_seq;
}

/* If a not yet filled skb is pushed, do not send it if
 * we have data packets in Qdisc or NIC queues :
 * Because TX completion will happen shortly, it gives a chance
 * to coalesce future sendmsg() payload into this skb, without
 * need for a timer, and with no latency trade off.
 * As packets containing data payload have a bigger truesize
 * than pure acks (dataless) packets, the last checks prevent
 * autocorking if we only have an ACK in Qdisc/NIC queues,
 * or if TX completion was delayed after we processed ACK packet.
 */
static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
                                int size_goal)
{
        return skb->len < size_goal &&
               READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
               !tcp_rtx_queue_empty(sk) &&
               refcount_read(&sk->sk_wmem_alloc) > skb->truesize &&
               tcp_skb_can_collapse_to(skb);
}

void tcp_push(struct sock *sk, int flags, int mss_now,
              int nonagle, int size_goal)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;

        skb = tcp_write_queue_tail(sk);
        if (!skb)
                return;
        if (!(flags & MSG_MORE) || forced_push(tp))
                tcp_mark_push(tp, skb);

        tcp_mark_urg(tp, flags);

        if (tcp_should_autocork(sk, skb, size_goal)) {

                /* avoid atomic op if TSQ_THROTTLED bit is already set */
                if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
                        set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
                        smp_mb__after_atomic();
                }
                /* It is possible TX completion already happened
                 * before we set TSQ_THROTTLED.
                 */
                if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
                        return;
        }

        if (flags & MSG_MORE)
                nonagle = TCP_NAGLE_CORK;

        __tcp_push_pending_frames(sk, mss_now, nonagle);
}

/* tcp_read_sock() callback: splice up to min(remaining count, @len) bytes
 * of @skb, starting at @offset, into the pipe stored in the splice state.
 * A positive result is subtracted from the descriptor's remaining count.
 */
static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
                                unsigned int offset, size_t len)
{
        struct tcp_splice_state *state = rd_desc->arg.data;
        int nbytes;

        nbytes = skb_splice_bits(skb, skb->sk, offset, state->pipe,
                                 min(rd_desc->count, len), state->flags);
        if (nbytes <= 0)
                return nbytes;

        rd_desc->count -= nbytes;
        return nbytes;
}

static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
{
        /* Store TCP splice context information in read_descriptor_t. */
        read_descriptor_t rd_desc = {
                .arg.data = tss,
                .count          = tss->len,
        };

        return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
}

/**
 *  tcp_splice_read - splice data from TCP socket to a pipe
 * @sock:        socket to splice from
 * @ppos:        position (not valid)
 * @pipe:        pipe to splice to
 * @len:        number of bytes to splice
 * @flags:        splice modifier flags
 *
 * Description:
 *    Will read pages from given socket and fill them into a pipe.
 *
 * Return:
 *    The number of bytes spliced when that is non-zero; otherwise the last
 *    status value, which is 0 or a negative errno (-ESPIPE for a non-zero
 *    *@ppos, -ENOTCONN, -EAGAIN, a pending socket error, or the result of
 *    sock_intr_errno()).
 *
 **/
ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
                        struct pipe_inode_info *pipe, size_t len,
                        unsigned int flags)
{
        struct sock *sk = sock->sk;
        struct tcp_splice_state tss = {
                .pipe = pipe,
                .len = len,
                .flags = flags,
        };
        long timeo;
        ssize_t spliced;
        int ret;

        sock_rps_record_flow(sk);
        /*
         * We can't seek on a socket input
         */
        if (unlikely(*ppos))
                return -ESPIPE;

        ret = spliced = 0;

        lock_sock(sk);

        /* Receive timeout; zero when the file is opened O_NONBLOCK
         * (presumably - confirm against sock_rcvtimeo()).
         */
        timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
        while (tss.len) {
                ret = __tcp_splice_read(sk, &tss);
                if (ret < 0)
                        break;
                else if (!ret) {
                        /* Nothing spliced in this pass: decide whether to
                         * stop, fail, or wait for more data.
                         */
                        if (spliced)
                                break;
                        if (sock_flag(sk, SOCK_DONE))
                                break;
                        if (sk->sk_err) {
                                ret = sock_error(sk);
                                break;
                        }
                        if (sk->sk_shutdown & RCV_SHUTDOWN)
                                break;
                        if (sk->sk_state == TCP_CLOSE) {
                                /*
                                 * This occurs when user tries to read
                                 * from never connected socket.
                                 */
                                ret = -ENOTCONN;
                                break;
                        }
                        if (!timeo) {
                                ret = -EAGAIN;
                                break;
                        }
                        /* if __tcp_splice_read() got nothing while we have
                         * an skb in receive queue, we do not want to loop.
                         * This might happen with URG data.
                         */
                        if (!skb_queue_empty(&sk->sk_receive_queue))
                                break;
                        ret = sk_wait_data(sk, &timeo, NULL);
                        if (ret < 0)
                                break;
                        if (signal_pending(current)) {
                                ret = sock_intr_errno(timeo);
                                break;
                        }
                        continue;
                }
                /* Progress was made: account for it. */
                tss.len -= ret;
                spliced += ret;

                if (!tss.len || !timeo)
                        break;
                /* Drop and immediately retake the socket lock between
                 * passes (presumably so queued backlog gets processed -
                 * confirm against release_sock()).
                 */
                release_sock(sk);
                lock_sock(sk);

                if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
                    (sk->sk_shutdown & RCV_SHUTDOWN) ||
                    signal_pending(current))
                        break;
        }

        release_sock(sk);

        /* Partial progress takes precedence over any error status. */
        if (spliced)
                return spliced;

        return ret;
}
EXPORT_SYMBOL(tcp_splice_read);

/* Allocate an skb for the TCP send path and schedule send memory for it.
 *
 * @sk:             socket the skb is allocated for
 * @gfp:            allocation flags passed to alloc_skb_fclone()
 * @force_schedule: when true, memory is scheduled with
 *                  sk_forced_mem_schedule() and is treated as granted
 *
 * Returns the skb with MAX_TCP_HEADER bytes of headroom reserved, or NULL
 * when either the allocation or the memory scheduling fails.
 */
struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
                                     bool force_schedule)
{
        struct sk_buff *skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
        bool scheduled;

        if (unlikely(!skb)) {
                /* Allocation failed: signal memory pressure, shrink sndbuf. */
                sk->sk_prot->enter_memory_pressure(sk);
                sk_stream_moderate_sndbuf(sk);
                return NULL;
        }

        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));

        if (force_schedule) {
                sk_forced_mem_schedule(sk, skb->truesize);
                scheduled = true;
        } else {
                scheduled = sk_wmem_schedule(sk, skb->truesize);
        }

        if (unlikely(!scheduled)) {
                __kfree_skb(skb);
                return NULL;
        }

        skb_reserve(skb, MAX_TCP_HEADER);
        skb->ip_summed = CHECKSUM_PARTIAL;
        INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
        return skb;
}

/* Compute the size goal for skbs built by the send path.
 *
 * @sk:            socket being written to
 * @mss_now:       current MSS
 * @large_allowed: when zero, the goal is a single MSS
 *
 * Returns @mss_now when large sends are not allowed. Otherwise returns a
 * multiple of @mss_now based on the cached tp->gso_segs, never smaller
 * than @mss_now. The cached segment count is recomputed only when the
 * bound derived from sk->sk_gso_max_size has moved outside
 * [size_goal, size_goal + mss_now).
 */
static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
                                       int large_allowed)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 new_size_goal, size_goal;

        if (!large_allowed)
                return mss_now;

        /* Note : tcp_tso_autosize() will eventually split this later */
        new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size);

        /* We try hard to avoid divides here */
        size_goal = tp->gso_segs * mss_now;
        if (unlikely(new_size_goal < size_goal ||
                     new_size_goal >= size_goal + mss_now)) {
                /* Compare as u32: casting the quotient to u16 before the
                 * comparison would drop its high bits, so a quotient above
                 * 65535 would wrap to a small (possibly zero) segment count
                 * instead of being clamped. The result cannot exceed
                 * sk->sk_gso_max_segs (assumed to be no wider than
                 * tp->gso_segs).
                 */
                tp->gso_segs = min_t(u32, new_size_goal / mss_now,
                                     sk->sk_gso_max_segs);
                size_goal = tp->gso_segs * mss_now;
        }

        return max(size_goal, mss_now);
}

/* Return the current MSS and store the matching size goal in *@size_goal.
 * Large (multi-MSS) goals are disabled for MSG_OOB sends.
 */
int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
        int mss = tcp_current_mss(sk);
        int large_ok = !(flags & MSG_OOB);

        *size_goal = tcp_xmit_size_goal(sk, mss, large_ok);
        return mss;
}

/* In some cases, sendmsg() could have added an skb to the write queue,
 * but failed adding payload on it. We need to remove it to consume less
 * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger
 * epoll() users. Another reason is that tcp_write_xmit() does not like
 * finding an empty skb in the write queue.
 */
void tcp_remove_empty_skb(struct sock *sk)
{
        struct sk_buff *skb = tcp_write_queue_tail(sk);

        if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
                tcp_unlink_write_queue(skb, sk);
                if (tcp_write_queue_empty(sk))
                        tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
                tcp_wmem_free_skb(sk, skb);
        }
}

/* An skb going from pure zerocopy to mixed content must have its
 * zerocopy part charged to the socket.
 *
 * Returns 0 on success (or when @skb is not pure zerocopy), -ENOMEM when
 * the extra memory cannot be scheduled.
 */
static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
{
        u32 uncharged;

        if (likely(!skb_zcopy_pure(skb)))
                return 0;

        /* Everything in truesize beyond the base skb size. */
        uncharged = skb->truesize - SKB_TRUESIZE(skb_end_offset(skb));
        if (!sk_wmem_schedule(sk, uncharged))
                return -ENOMEM;

        sk_mem_charge(sk, uncharged);
        skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
        return 0;
}


/* Schedule send memory for @copy bytes.
 *
 * Returns @copy when the regular scheduling succeeds. Otherwise falls
 * back to whatever remains under tcp_wmem[0] and returns the smaller of
 * @copy and sk->sk_forward_alloc.
 */
int tcp_wmem_schedule(struct sock *sk, int copy)
{
        int headroom;

        if (likely(sk_wmem_schedule(sk, copy)))
                return copy;

        /* We could be in trouble if we have nothing queued.
         * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0]
         * to guarantee some progress.
         */
        headroom = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]);
        headroom -= sk->sk_wmem_queued;
        if (headroom > 0)
                sk_forced_mem_schedule(sk, min(headroom, copy));

        return min(copy, sk->sk_forward_alloc);
}

/* Free the socket's pending Fast Open request, if any, and clear the
 * pointer.
 */
void tcp_free_fastopen_req(struct tcp_sock *tp)
{
        struct tcp_fastopen_request *req = tp->fastopen_req;

        if (!req)
                return;

        kfree(req);
        tp->fastopen_req = NULL;
}

/* Start a client-side TCP Fast Open connect carrying the data of @msg.
 *
 * @sk:     socket to connect
 * @msg:    message whose payload is attached to the Fast Open request
 * @copied: set to the request's copied count when the request is still
 *          attached to the socket after the connect attempt
 * @size:   payload size recorded in the request
 * @uarg:   zerocopy context recorded in the request
 *
 * Returns the result of __inet_stream_connect(), or -EOPNOTSUPP when
 * client Fast Open is disabled or the destination family is AF_UNSPEC,
 * -EALREADY when a request is already pending, -ENOBUFS on allocation
 * failure.
 */
int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
                         size_t size, struct ubuf_info *uarg)
{
        struct sockaddr *uaddr = msg->msg_name;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_fastopen_request *req;
        int err, flags;

        if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
              TFO_CLIENT_ENABLE))
                return -EOPNOTSUPP;

        if (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
            uaddr->sa_family == AF_UNSPEC)
                return -EOPNOTSUPP;

        /* Another Fast Open is in progress */
        if (tp->fastopen_req)
                return -EALREADY;

        req = kzalloc(sizeof(*req), sk->sk_allocation);
        tp->fastopen_req = req;
        if (unlikely(!req))
                return -ENOBUFS;

        req->data = msg;
        req->size = size;
        req->uarg = uarg;

        if (inet_test_bit(DEFER_CONNECT, sk)) {
                err = tcp_connect(sk);
                /* Same failure procedure as in tcp_v4/6_connect */
                if (err) {
                        tcp_set_state(sk, TCP_CLOSE);
                        inet->inet_dport = 0;
                        sk->sk_route_caps = 0;
                }
        }

        flags = 0;
        if (msg->msg_flags & MSG_DONTWAIT)
                flags = O_NONBLOCK;

        err = __inet_stream_connect(sk->sk_socket, uaddr,
                                    msg->msg_namelen, flags, 1);

        /* fastopen_req could already be freed in __inet_stream_connect
         * if the connection times out or gets rst, so re-read it.
         */
        req = tp->fastopen_req;
        if (req) {
                *copied = req->copied;
                tcp_free_fastopen_req(tp);
                inet_clear_bit(DEFER_CONNECT, sk);
        }

        return err;
}

/* Queue the payload of @msg on the socket's write queue and push it out.
 *
 * tcp_sendmsg() calls this between lock_sock() and release_sock().
 *
 * @sk:   socket to send on
 * @msg:  message to send; msg_flags selects zerocopy, splice, Fast Open,
 *        MSG_MORE, MSG_OOB and MSG_EOR handling
 * @size: payload size, used for zerocopy setup, Fast Open and repair mode
 *
 * Data is added in one of three ways depending on @zc: copied into the
 * socket page frag (zc == 0), attached as zerocopy user pages
 * (MSG_ZEROCOPY), or spliced from the iterator (MSG_SPLICE_PAGES).
 *
 * Returns the number of bytes queued (including bytes sent with the Fast
 * Open SYN) when any were queued, otherwise the negative error produced by
 * sk_stream_error().
 */
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct ubuf_info *uarg = NULL;
        struct sk_buff *skb;
        struct sockcm_cookie sockc;
        int flags, err, copied = 0;
        int mss_now = 0, size_goal, copied_syn = 0;
        int process_backlog = 0;
        int zc = 0;
        long timeo;

        flags = msg->msg_flags;

        /* Pick the data path: zc stays 0 (copy) unless the route supports
         * scatter-gather (NETIF_F_SG) and zerocopy or splice was requested.
         */
        if ((flags & MSG_ZEROCOPY) && size) {
                if (msg->msg_ubuf) {
                        uarg = msg->msg_ubuf;
                        if (sk->sk_route_caps & NETIF_F_SG)
                                zc = MSG_ZEROCOPY;
                } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
                        skb = tcp_write_queue_tail(sk);
                        uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
                        if (!uarg) {
                                err = -ENOBUFS;
                                goto out_err;
                        }
                        if (sk->sk_route_caps & NETIF_F_SG)
                                zc = MSG_ZEROCOPY;
                        else
                                uarg_to_msgzc(uarg)->zerocopy = 0;
                }
        } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
                if (sk->sk_route_caps & NETIF_F_SG)
                        zc = MSG_SPLICE_PAGES;
        }

        /* Fast Open: data may go out with the SYN. -EINPROGRESS with data
         * already copied counts as success.
         */
        if (unlikely(flags & MSG_FASTOPEN ||
                     inet_test_bit(DEFER_CONNECT, sk)) &&
            !tp->repair) {
                err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
                if (err == -EINPROGRESS && copied_syn > 0)
                        goto out;
                else if (err)
                        goto out_err;
        }

        timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

        tcp_rate_check_app_limited(sk);  /* is sending application-limited? */

        /* Wait for a connection to finish. One exception is TCP Fast Open
         * (passive side) where data is allowed to be sent before a connection
         * is fully established.
         */
        if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
            !tcp_passive_fastopen(sk)) {
                err = sk_stream_wait_connect(sk, &timeo);
                if (err != 0)
                        goto do_error;
        }

        /* Repair mode: data for the receive queue is injected directly;
         * having no repair queue selected is an error.
         */
        if (unlikely(tp->repair)) {
                if (tp->repair_queue == TCP_RECV_QUEUE) {
                        copied = tcp_send_rcvq(sk, msg, size);
                        goto out_nopush;
                }

                err = -EINVAL;
                if (tp->repair_queue == TCP_NO_QUEUE)
                        goto out_err;

                /* 'common' sending to sendq */
        }

        /* Any control-message parsing failure is reported as -EINVAL. */
        sockcm_init(&sockc, sk);
        if (msg->msg_controllen) {
                err = sock_cmsg_send(sk, msg, &sockc);
                if (unlikely(err)) {
                        err = -EINVAL;
                        goto out_err;
                }
        }

        /* This should be in poll */
        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

        /* Ok commence sending. */
        copied = 0;

restart:
        mss_now = tcp_send_mss(sk, &size_goal, flags);

        err = -EPIPE;
        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
                goto do_error;

        while (msg_data_left(msg)) {
                ssize_t copy = 0;

                /* Room left in the tail skb before it reaches size_goal. */
                skb = tcp_write_queue_tail(sk);
                if (skb)
                        copy = size_goal - skb->len;

                if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
                        bool first_skb;

new_segment:
                        if (!sk_stream_memory_free(sk))
                                goto wait_for_space;

                        /* After every 16 newly allocated skbs, try to flush
                         * the backlog; restart when something was flushed.
                         */
                        if (unlikely(process_backlog >= 16)) {
                                process_backlog = 0;
                                if (sk_flush_backlog(sk))
                                        goto restart;
                        }
                        first_skb = tcp_rtx_and_write_queues_empty(sk);
                        skb = tcp_stream_alloc_skb(sk, sk->sk_allocation,
                                                   first_skb);
                        if (!skb)
                                goto wait_for_space;

                        process_backlog++;

#ifdef CONFIG_SKB_DECRYPTED
                        skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
#endif
                        tcp_skb_entail(sk, skb);
                        copy = size_goal;

                        /* All packets are restored as if they have
                         * already been sent. skb_mstamp_ns isn't set to
                         * avoid wrong rtt estimation.
                         */
                        if (tp->repair)
                                TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
                }

                /* Try to append data to the end of skb. */
                if (copy > msg_data_left(msg))
                        copy = msg_data_left(msg);

                if (zc == 0) {
                        /* Copy path: data goes through the socket page frag. */
                        bool merge = true;
                        int i = skb_shinfo(skb)->nr_frags;
                        struct page_frag *pfrag = sk_page_frag(sk);

                        if (!sk_page_frag_refill(sk, pfrag))
                                goto wait_for_space;

                        if (!skb_can_coalesce(skb, i, pfrag->page,
                                              pfrag->offset)) {
                                /* A new frag is needed; start a new skb when
                                 * the frag limit is already reached.
                                 */
                                if (i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) {
                                        tcp_mark_push(tp, skb);
                                        goto new_segment;
                                }
                                merge = false;
                        }

                        copy = min_t(int, copy, pfrag->size - pfrag->offset);

                        if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) {
                                if (tcp_downgrade_zcopy_pure(sk, skb))
                                        goto wait_for_space;
                                skb_zcopy_downgrade_managed(skb);
                        }

                        copy = tcp_wmem_schedule(sk, copy);
                        if (!copy)
                                goto wait_for_space;

                        err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
                                                       pfrag->page,
                                                       pfrag->offset,
                                                       copy);
                        if (err)
                                goto do_error;

                        /* Update the skb. */
                        if (merge) {
                                skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
                        } else {
                                skb_fill_page_desc(skb, i, pfrag->page,
                                                   pfrag->offset, copy);
                                page_ref_inc(pfrag->page);
                        }
                        pfrag->offset += copy;
                } else if (zc == MSG_ZEROCOPY)  {
                        /* First append to a fragless skb builds initial
                         * pure zerocopy skb
                         */
                        if (!skb->len)
                                skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;

                        /* Memory is only scheduled here for skbs that are
                         * not pure zerocopy.
                         */
                        if (!skb_zcopy_pure(skb)) {
                                copy = tcp_wmem_schedule(sk, copy);
                                if (!copy)
                                        goto wait_for_space;
                        }

                        err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
                        if (err == -EMSGSIZE || err == -EEXIST) {
                                tcp_mark_push(tp, skb);
                                goto new_segment;
                        }
                        if (err < 0)
                                goto do_error;
                        copy = err;
                } else if (zc == MSG_SPLICE_PAGES) {
                        /* Splice in data if we can; copy if we can't. */
                        if (tcp_downgrade_zcopy_pure(sk, skb))
                                goto wait_for_space;
                        copy = tcp_wmem_schedule(sk, copy);
                        if (!copy)
                                goto wait_for_space;

                        err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
                                                   sk->sk_allocation);
                        if (err < 0) {
                                if (err == -EMSGSIZE) {
                                        tcp_mark_push(tp, skb);
                                        goto new_segment;
                                }
                                goto do_error;
                        }
                        copy = err;

                        if (!(flags & MSG_NO_SHARED_FRAGS))
                                skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;

                        sk_wmem_queued_add(sk, copy);
                        sk_mem_charge(sk, copy);
                }

                /* While nothing from this call has been queued yet, clear
                 * PSH on the skb being extended.
                 */
                if (!copied)
                        TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

                /* Advance the stream by the bytes just added. */
                WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
                TCP_SKB_CB(skb)->end_seq += copy;
                tcp_skb_pcount_set(skb, 0);

                copied += copy;
                if (!msg_data_left(msg)) {
                        if (unlikely(flags & MSG_EOR))
                                TCP_SKB_CB(skb)->eor = 1;
                        goto out;
                }

                /* Keep filling this skb unless it has reached size_goal. */
                if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
                        continue;

                if (forced_push(tp)) {
                        tcp_mark_push(tp, skb);
                        __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
                } else if (skb == tcp_send_head(sk))
                        tcp_push_one(sk, mss_now);
                continue;

wait_for_space:
                /* Out of memory or buffer space: push what is queued, then
                 * wait for memory before retrying.
                 */
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                tcp_remove_empty_skb(sk);
                if (copied)
                        tcp_push(sk, flags & ~MSG_MORE, mss_now,
                                 TCP_NAGLE_PUSH, size_goal);

                err = sk_stream_wait_memory(sk, &timeo);
                if (err != 0)
                        goto do_error;

                mss_now = tcp_send_mss(sk, &size_goal, flags);
        }

out:
        /* sockc is only read when copied is non-zero, which can only
         * happen after sockcm_init() above has run.
         */
        if (copied) {
                tcp_tx_timestamp(sk, sockc.tsflags);
                tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
        }
out_nopush:
        /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
        if (uarg && !msg->msg_ubuf)
                net_zcopy_put(uarg);
        return copied + copied_syn;

do_error:
        tcp_remove_empty_skb(sk);

        /* Partial progress is reported as success. */
        if (copied + copied_syn)
                goto out;
out_err:
        /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
        if (uarg && !msg->msg_ubuf)
                net_zcopy_put_abort(uarg, true);
        err = sk_stream_error(sk, flags, err);
        /* make sure we wake any epoll edge trigger waiter */
        if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
                sk->sk_write_space(sk);
                tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
        }
        return err;
}
EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);

/* Socket-locking front end for tcp_sendmsg_locked(): the socket lock is
 * held only around that call, and its return value is passed back as-is.
 */
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
        int rc;

        lock_sock(sk);
        rc = tcp_sendmsg_locked(sk, msg, size);
        release_sock(sk);

        return rc;
}
EXPORT_SYMBOL(tcp_sendmsg);

void tcp_splice_eof(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct tcp_sock *tp = tcp_sk(sk);
        int mss_now, size_goal;

        if (!tcp_write_queue_tail(sk))
                return;

        lock_sock(sk);
        mss_now = tcp_send_mss(sk, &size_goal, 0);
        tcp_push(sk, 0, mss_now, tp->nonagle, size_goal);
        release_sock(sk);
}
EXPORT_SYMBOL_GPL(tcp_splice_eof);

/*
 *        Handle reading urgent data. BSD has very simple semantics for
 *        this, no blocking and very strange errors 8)
 */

/* Deliver the single urgent byte, if any, to @msg.
 *
 * Returns 1 when the byte was delivered (or skipped because of MSG_TRUNC),
 * @len itself when @len <= 0 (MSG_TRUNC is then set in msg->msg_flags),
 * 0 on a closed/receive-shutdown socket without valid urgent data,
 * or a negative errno (-EINVAL, -ENOTCONN, -EFAULT, -EAGAIN).
 */
static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int copy_err = 0;
        char oob_byte;

        /* No URG data to read. */
        if (sock_flag(sk, SOCK_URGINLINE))
                return -EINVAL;
        if (!tp->urg_data || tp->urg_data == TCP_URG_READ)
                return -EINVAL;        /* Yes this is right ! */

        if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
                return -ENOTCONN;

        if (!(tp->urg_data & TCP_URG_VALID)) {
                if (sk->sk_state == TCP_CLOSE ||
                    (sk->sk_shutdown & RCV_SHUTDOWN))
                        return 0;

                /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
                 * the available implementations agree in this case:
                 * this call should never block, independent of the
                 * blocking state of the socket.
                 * Mike <pall@rz.uni-karlsruhe.de>
                 */
                return -EAGAIN;
        }

        /* Grab the byte before the state may be flipped to "already read". */
        oob_byte = tp->urg_data;

        if (!(flags & MSG_PEEK))
                WRITE_ONCE(tp->urg_data, TCP_URG_READ);

        /* Read urgent data. */
        msg->msg_flags |= MSG_OOB;

        if (len <= 0) {
                /* No room offered by the caller: flag truncation only. */
                msg->msg_flags |= MSG_TRUNC;
                return len;
        }

        if (!(flags & MSG_TRUNC))
                copy_err = memcpy_to_msg(msg, &oob_byte, 1);

        return copy_err ? -EFAULT : 1;
}

/* Copy the full payload of every skb on the retransmit queue, then of
 * every skb on the write queue, into @msg.
 *
 * Returns the total number of bytes copied, or the first error reported
 * by skb_copy_datagram_msg(). @len is not consulted.
 */
static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
{
        struct sk_buff *skb;
        int total = 0;
        int rc;

        skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
                rc = skb_copy_datagram_msg(skb, 0, msg, skb->len);
                if (rc)
                        return rc;
                total += skb->len;
        }

        skb_queue_walk(&sk->sk_write_queue, skb) {
                rc = skb_copy_datagram_msg(skb, 0, msg, skb->len);
                if (rc)
                        return rc;
                total += skb->len;
        }

        return total;
}

/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */
void __tcp_cleanup_rbuf(struct sock *sk, int copied)
{
        struct tcp_sock *tp = tcp_sk(sk);
        bool time_to_ack = false;

        if (inet_csk_ack_scheduled(sk)) {
                const struct inet_connection_sock *icsk = inet_csk(sk);

                if (tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss) {
                        /* Once-per-two-segments ACK was not sent by
                         * tcp_input.c
                         */
                        time_to_ack = true;
                } else if (copied > 0 &&
                           ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
                            ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
                             !inet_csk_in_pingpong_mode(sk))) &&
                           !atomic_read(&sk->sk_rmem_alloc)) {
                        /* If this read emptied read buffer, we send ACK, if
                         * connection is not bidirectional, user drained
                         * receive buffer and there was a small segment
                         * in queue.
                         */
                        time_to_ack = true;
                }
        }

        /* We send an ACK if we can now advertise a non-zero window
         * which has been raised "significantly".
         *
         * Even if window raised up to infinity, do not send window open ACK
         * in states, where we will not receive more. It is useless.
         */
        if (!time_to_ack && copied > 0 && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
                __u32 cur_win = tcp_receive_window(tp);

                /* Optimize, __tcp_select_window() is not cheap. */
                if (2 * cur_win <= tp->window_clamp) {
                        __u32 fresh_win = __tcp_select_window(sk);

                        /* Send ACK now, if this read freed lots of space
                         * in our buffer. The freshly selected window may be
                         * advertised now if it is not less than the current
                         * one. "Lots" means "at least twice" here.
                         */
                        if (fresh_win && fresh_win >= 2 * cur_win)
                                time_to_ack = true;
                }
        }

        if (!time_to_ack)
                return;

        tcp_send_ack(sk);
}

/* Sanity-checking wrapper around __tcp_cleanup_rbuf(): warns when the
 * head of the receive queue ends at or before copied_seq, then performs
 * the actual cleanup.
 */
void tcp_cleanup_rbuf(struct sock *sk, int copied)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *head = skb_peek(&sk->sk_receive_queue);

        WARN(head && !before(tp->copied_seq, TCP_SKB_CB(head)->end_seq),
             "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
             tp->copied_seq, TCP_SKB_CB(head)->end_seq, tp->rcv_nxt);

        __tcp_cleanup_rbuf(sk, copied);
}

/* Unlink @skb from the receive queue and release it.
 *
 * When the skb's destructor is sock_rfree, that destructor is run here,
 * the skb is detached from the socket and handed to
 * skb_attempt_defer_free(); any other skb goes through __kfree_skb().
 */
static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
{
        __skb_unlink(skb, &sk->sk_receive_queue);

        if (unlikely(skb->destructor != sock_rfree)) {
                __kfree_skb(skb);
                return;
        }

        sock_rfree(skb);
        skb->destructor = NULL;
        skb->sk = NULL;
        skb_attempt_defer_free(skb);
}

/* Find the skb at the head of the receive queue that covers sequence
 * number @seq, dropping head skbs that lie entirely before it.
 *
 * On success the skb is returned and *@off is the offset of @seq inside
 * it; an skb carrying FIN is returned even if @seq is past its data.
 * Returns NULL (leaving *@off untouched) once the queue is empty.
 */
struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
        struct sk_buff *skb;

        for (;;) {
                u32 delta;

                skb = skb_peek(&sk->sk_receive_queue);
                if (!skb)
                        return NULL;

                delta = seq - TCP_SKB_CB(skb)->seq;
                if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
                        pr_err_once("%s: found a SYN, please report !\n", __func__);
                        delta--;
                }

                if (delta < skb->len ||
                    (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
                        *off = delta;
                        return skb;
                }

                /* This looks weird, but this can happen if TCP collapsing
                 * split a fat GRO packet, while we released socket lock
                 * in skb_splice_bits()
                 */
                tcp_eat_recv_skb(sk, skb);
        }
}
EXPORT_SYMBOL(tcp_recv_skb);

/*
 * This routine provides an alternative to tcp_recvmsg() for routines
 * that would like to handle copying from skbuffs directly in 'sendfile'
 * fashion.
 * Note:
 *        - It is assumed that the socket was locked by the caller.
 *        - The routine does not block.
 *        - At present, there is no support for reading OOB data
 *          or for 'peeking' the socket using this routine
 *          (although both would be easy to implement).
 */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
                  sk_read_actor_t recv_actor)
{
        /* Returns the number of bytes consumed by @recv_actor, or, when
         * nothing was consumed, the actor's first non-positive return value;
         * -ENOTCONN for a listening socket.
         */
        struct sk_buff *skb;
        struct tcp_sock *tp = tcp_sk(sk);
        u32 seq = tp->copied_seq;
        u32 offset;
        int copied = 0;

        if (sk->sk_state == TCP_LISTEN)
                return -ENOTCONN;
        while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
                if (offset < skb->len) {
                        int used;
                        size_t len;

                        /* Bytes of this skb not yet handed to the actor. */
                        len = skb->len - offset;
                        /* Stop reading if we hit a patch of urgent data */
                        if (unlikely(tp->urg_data)) {
                                u32 urg_offset = tp->urg_seq - seq;
                                if (urg_offset < len)
                                        len = urg_offset;
                                if (!len)
                                        break;
                        }
                        used = recv_actor(desc, skb, offset, len);
                        if (used <= 0) {
                                /* Propagate the actor's result only if no
                                 * bytes were consumed before this call.
                                 */
                                if (!copied)
                                        copied = used;
                                break;
                        }
                        /* Clamp an actor claiming more than it was offered. */
                        if (WARN_ON_ONCE(used > len))
                                used = len;
                        seq += used;
                        copied += used;
                        offset += used;

                        /* If recv_actor drops the lock (e.g. TCP splice
                         * receive) the skb pointer might be invalid when
                         * getting here: tcp_collapse might have deleted it
                         * while aggregating skbs from the socket queue.
                         */
                        skb = tcp_recv_skb(sk, seq - 1, &offset);
                        if (!skb)
                                break;
                        /* TCP coalescing might have appended data to the skb.
                         * Try to splice more frags
                         */
                        if (offset + 1 != skb->len)
                                continue;
                }
                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
                        /* Drop the FIN skb, step seq past it and stop. */
                        tcp_eat_recv_skb(sk, skb);
                        ++seq;
                        break;
                }
                tcp_eat_recv_skb(sk, skb);
                /* Stop once the descriptor's count has reached zero. */
                if (!desc->count)
                        break;
                /* Publish progress after each fully consumed skb. */
                WRITE_ONCE(tp->copied_seq, seq);
        }
        WRITE_ONCE(tp->copied_seq, seq);

        tcp_rcv_space_adjust(sk);

        /* Clean up data we have read: This will do ACK frames. */
        if (copied > 0) {
                /* Return value unused: called so that head skbs lying
                 * entirely before seq are dropped before the cleanup.
                 */
                tcp_recv_skb(sk, seq, &offset);
                tcp_cleanup_rbuf(sk, copied);
        }
        return copied;
}
EXPORT_SYMBOL(tcp_read_sock);

/* Detach skbs from the receive queue one at a time and hand each to
 * @recv_actor.
 *
 * Stops when the queue is empty, when the actor returns a negative value,
 * or after an skb that carried FIN. Returns the sum of the actor's
 * results, or the actor's negative value if that sum is still zero;
 * -ENOTCONN for a listening socket.
 */
int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
        struct sk_buff *skb;
        int total = 0;

        if (sk->sk_state == TCP_LISTEN)
                return -ENOTCONN;

        for (skb = skb_peek(&sk->sk_receive_queue); skb != NULL;
             skb = skb_peek(&sk->sk_receive_queue)) {
                u8 flags_snapshot;
                int rc;

                __skb_unlink(skb, &sk->sk_receive_queue);
                WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));

                /* Sampled before the actor is invoked; the skb is not
                 * touched again afterwards.
                 */
                flags_snapshot = TCP_SKB_CB(skb)->tcp_flags;

                rc = recv_actor(sk, skb);
                if (rc < 0)
                        return total ? total : rc;
                total += rc;

                if (flags_snapshot & TCPHDR_FIN)
                        break;
        }

        return total;
}
EXPORT_SYMBOL(tcp_read_skb);

/* Mark up to @len bytes at the head of the receive queue as consumed:
 * advance copied_seq, drop skbs that are fully consumed, and run the
 * receive-buffer cleanup if any progress was made. A listening socket is
 * left untouched.
 */
void tcp_read_done(struct sock *sk, size_t len)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 seq = tp->copied_seq;
        size_t remaining = len;
        struct sk_buff *skb;
        u32 offset;

        if (sk->sk_state == TCP_LISTEN)
                return;

        while (remaining) {
                bool has_fin;
                int chunk;

                skb = tcp_recv_skb(sk, seq, &offset);
                if (!skb)
                        break;

                chunk = min_t(size_t, skb->len - offset, remaining);
                seq += chunk;
                remaining -= chunk;

                /* skb only partially consumed: keep it queued and stop. */
                if (skb->len > offset + chunk)
                        break;

                /* Read the flag first; the skb is gone after being eaten. */
                has_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
                tcp_eat_recv_skb(sk, skb);
                if (has_fin) {
                        ++seq;
                        break;
                }
        }
        WRITE_ONCE(tp->copied_seq, seq);

        tcp_rcv_space_adjust(sk);

        /* Clean up data we have read: This will do ACK frames. */
        if (remaining != len)
                tcp_cleanup_rbuf(sk, len - remaining);
}
EXPORT_SYMBOL(tcp_read_done);

/* Report tcp_inq() for the sock behind @sock. */
int tcp_peek_len(struct socket *sock)
{
        struct sock *sk = sock->sk;

        return tcp_inq(sk);
}
EXPORT_SYMBOL(tcp_peek_len);

/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
int tcp_set_rcvlowat(struct sock *sk, int val)
{
        int needed;
        int limit;

        /* The low-water mark is capped at half of the receive buffer when
         * the user locked its size, otherwise at half of tcp_rmem[2].
         */
        limit = (sk->sk_userlocks & SOCK_RCVBUF_LOCK) ?
                sk->sk_rcvbuf >> 1 :
                READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
        if (val > limit)
                val = limit;

        /* A resulting value of zero is stored as 1. */
        WRITE_ONCE(sk->sk_rcvlowat, val ? val : 1);

        /* Check if we need to signal EPOLLIN right now */
        tcp_data_ready(sk);

        /* Leave sk_rcvbuf alone when its size is locked by the user. */
        if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
                needed = tcp_space_from_win(sk, val);
                if (needed > sk->sk_rcvbuf) {
                        WRITE_ONCE(sk->sk_rcvbuf, needed);
                        WRITE_ONCE(tcp_sk(sk)->window_clamp, val);
                }
        }

        return 0;
}
EXPORT_SYMBOL(tcp_set_rcvlowat);

void tcp_update_recv_tstamps(struct sk_buff *skb,
                             struct scm_timestamping_internal *tss)
{
        if (skb->tstamp)
                tss->ts[0] = ktime_to_timespec64(skb->tstamp);
        else
                tss->ts[0] = (struct timespec64) {0};

        if (skb_hwtstamps(skb)->hwtstamp)
                tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
        else
                tss->ts[2] = (struct timespec64) {0};
}

#ifdef CONFIG_MMU
/* Empty ops table. tcp_mmap() installs its address in vma->vm_ops and
 * find_tcp_vma() compares against that address to recognise such VMAs.
 */
static const struct vm_operations_struct tcp_vm_ops = {
};

/* mmap handler: tag @vma with tcp_vm_ops.
 *
 * A mapping requested writable or executable is refused with -EPERM;
 * otherwise VM_MAYWRITE/VM_MAYEXEC are cleared and VM_MIXEDMAP is set
 * on the VMA. @file and @sock are not used.
 */
int tcp_mmap(struct file *file, struct socket *sock,
             struct vm_area_struct *vma)
{
        if (vma->vm_flags & (VM_EXEC | VM_WRITE))
                return -EPERM;

        vm_flags_clear(vma, VM_MAYEXEC | VM_MAYWRITE);

        /* Instruct vm_insert_page() to not mmap_read_lock(mm) */
        vm_flags_set(vma, VM_MIXEDMAP);

        vma->vm_ops = &tcp_vm_ops;

        return 0;
}
EXPORT_SYMBOL(tcp_mmap);

/* Locate the page fragment of @skb that contains byte @offset_skb.
 *
 * Returns the fragment and stores the offset within it in *@offset_frag.
 * Returns NULL (without writing *@offset_frag) when the offset is at or
 * beyond skb->len, lies inside the linear head, or the skb has a frag list.
 */
static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
                                       u32 *offset_frag)
{
        skb_frag_t *frag;

        if (unlikely(offset_skb >= skb->len))
                return NULL;

        /* Make the offset relative to the start of the paged data. */
        offset_skb -= skb_headlen(skb);
        if ((int)offset_skb < 0)
                return NULL;
        if (skb_has_frag_list(skb))
                return NULL;

        for (frag = skb_shinfo(skb)->frags; offset_skb; ++frag) {
                const u32 frag_len = skb_frag_size(frag);

                if (frag_len > offset_skb)
                        break;
                offset_skb -= frag_len;
        }

        /* Zero when the offset fell exactly on a fragment boundary. */
        *offset_frag = offset_skb;
        return frag;
}

/* A fragment is mappable only if it spans exactly one page starting at
 * offset zero, and that page is neither compound nor has a mapping set.
 */
static bool can_map_frag(const skb_frag_t *frag)
{
        struct page *page;

        if (skb_frag_size(frag) != PAGE_SIZE)
                return false;
        if (skb_frag_off(frag))
                return false;

        page = skb_frag_page(frag);

        return !PageCompound(page) && !page->mapping;
}

/* Return how many bytes must be skipped, starting at @frag, before a
 * mappable fragment is reached: 0 if @frag itself is mappable. The scan
 * stops once the skipped byte count reaches @remaining_in_skb.
 */
static int find_next_mappable_frag(const skb_frag_t *frag,
                                   int remaining_in_skb)
{
        int skipped;

        if (likely(can_map_frag(frag)))
                return 0;

        for (skipped = 0; skipped < remaining_in_skb; ++frag) {
                if (can_map_frag(frag))
                        break;
                skipped += skb_frag_size(frag);
        }

        return skipped;
}

/* Set zc->recv_skip_hint for a read position @offset bytes into @skb.
 * The hint starts out as "rest of the skb" and is narrowed to the bytes
 * up to the next mappable fragment when that can be determined.
 */
static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
                                          struct tcp_zerocopy_receive *zc,
                                          struct sk_buff *skb, u32 offset)
{
        u32 rest_of_frag = 0;
        u32 in_frag_off;
        skb_frag_t *frag;

        /* worst case: skip to next skb. try to improve on this case below */
        zc->recv_skip_hint = skb->len - offset;

        /* Find the frag containing this offset (and how far into that frag) */
        frag = skb_advance_to_frag(skb, offset, &in_frag_off);
        if (frag == NULL)
                return;

        if (in_frag_off != 0) {
                const struct skb_shared_info *shinfo = skb_shinfo(skb);
                const skb_frag_t *last = &shinfo->frags[shinfo->nr_frags - 1];

                /* We read part of the last frag, must recvmsg() rest of skb. */
                if (frag == last)
                        return;

                /* Else, we must at least read the remainder in this frag. */
                rest_of_frag = skb_frag_size(frag) - in_frag_off;
                zc->recv_skip_hint -= rest_of_frag;
                frag++;
        }

        /* rest_of_frag: bytes left in a partially read frag, which must be
         * read. The value added to it is the byte count until the next
         * mappable frag, *not* counting rest_of_frag.
         */
        zc->recv_skip_hint = find_next_mappable_frag(frag, zc->recv_skip_hint) +
                             rest_of_frag;
}

static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
                              int flags, struct scm_timestamping_internal *tss,
                              int *cmsg_flags);

/* Satisfy a zerocopy receive request by copying @inq bytes into the
 * user's copy buffer through tcp_recvmsg_locked() (non-blocking).
 *
 * zc->length and zc->recv_skip_hint are zeroed, zc->copybuf_len receives
 * the number of bytes copied, and when something was copied and more data
 * is queued the skip hint is recomputed for the next skb.
 * Returns 0 on success or a negative errno.
 */
static int receive_fallback_to_copy(struct sock *sk,
                                    struct tcp_zerocopy_receive *zc, int inq,
                                    struct scm_timestamping_internal *tss)
{
        unsigned long ubuf = (unsigned long)zc->copybuf_address;
        struct msghdr msg = {};
        struct sk_buff *next_skb;
        u32 next_off;
        int rc;

        zc->length = 0;
        zc->recv_skip_hint = 0;

        /* Reject an address that does not survive the cast above. */
        if (ubuf != zc->copybuf_address)
                return -EINVAL;

        rc = import_ubuf(ITER_DEST, (void __user *)ubuf, inq, &msg.msg_iter);
        if (rc)
                return rc;

        rc = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT, tss,
                                &zc->msg_flags);
        if (rc < 0)
                return rc;

        zc->copybuf_len = rc;
        if (unlikely(!zc->copybuf_len))
                return 0;

        next_skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &next_off);
        if (next_skb)
                tcp_zerocopy_set_hint_for_skb(sk, zc, next_skb, next_off);

        return 0;
}

/* Copy @copylen bytes of @skb, starting at *@offset, into the user copy
 * buffer described by zc->copybuf_address.
 *
 * On success *@offset and *@seq are advanced and zc->recv_skip_hint is
 * reduced by @copylen, and @copylen is returned. On failure a negative
 * errno is returned and none of those are modified.
 */
static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
                                   struct sk_buff *skb, u32 copylen,
                                   u32 *offset, u32 *seq)
{
        unsigned long ubuf = (unsigned long)zc->copybuf_address;
        struct msghdr msg = {};
        int rc;

        /* Reject an address that does not survive the cast above. */
        if (ubuf != zc->copybuf_address)
                return -EINVAL;

        rc = import_ubuf(ITER_DEST, (void __user *)ubuf, copylen,
                         &msg.msg_iter);
        if (!rc)
                rc = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
        if (rc)
                return rc;

        zc->recv_skip_hint -= copylen;
        *offset += copylen;
        *seq += copylen;

        return (__s32)copylen;
}

/* Copy bytes that could not be mapped into the user's copy buffer.
 *
 * The amount is the smaller of @copybuf_len and zc->recv_skip_hint.
 * zc->copybuf_len is set to the result of tcp_copy_straggler_data().
 * Returns the number of bytes copied, or 0 if there was nothing to copy
 * or the copy failed.
 */
static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
                                  struct sock *sk,
                                  struct sk_buff *skb,
                                  u32 *seq,
                                  s32 copybuf_len,
                                  struct scm_timestamping_internal *tss)
{
        u32 copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
        u32 offset;

        if (copylen == 0)
                return 0;

        if (!skb) {
                /* skb is null if inq < PAGE_SIZE.
                 * NOTE(review): the tcp_recv_skb() result is dereferenced
                 * without a NULL check; presumably callers only get here
                 * with data queued - confirm.
                 */
                skb = tcp_recv_skb(sk, *seq, &offset);
                if (TCP_SKB_CB(skb)->has_rxtstamp) {
                        tcp_update_recv_tstamps(skb, tss);
                        zc->msg_flags |= TCP_CMSG_TS;
                }
        } else {
                offset = *seq - TCP_SKB_CB(skb)->seq;
        }

        zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
                                                  seq);
        if (zc->copybuf_len < 0)
                return 0;

        return copylen;
}

/* Error path of tcp_zerocopy_vm_insert_batch(): vm_insert_pages() returned
 * @err and left @pages_remaining entries of @pending_pages unmapped.
 *
 * If @err is -EBUSY and the caller passed
 * TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT, the range is zapped and the
 * insert is retried once, advancing *@seq and *@address by whatever the
 * retry mapped. If an error remains, *@length is reduced and
 * zc->recv_skip_hint increased by the bytes still unmapped.
 *
 * Returns 0 if the retry mapped everything, otherwise the error.
 */
static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
                                              struct page **pending_pages,
                                              unsigned long pages_remaining,
                                              unsigned long *address,
                                              u32 *length,
                                              u32 *seq,
                                              struct tcp_zerocopy_receive *zc,
                                              u32 total_bytes_to_map,
                                              int err)
{
        /* At least one page did not map. Try zapping if we skipped earlier. */
        if (err == -EBUSY &&
            zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
                u32 maybe_zap_len;

                maybe_zap_len = total_bytes_to_map -  /* All bytes to map */
                                *length + /* Mapped or pending */
                                (pages_remaining * PAGE_SIZE); /* Failed map. */
                zap_page_range_single(vma, *address, maybe_zap_len, NULL);
                /* Clearing err makes the retry block below run. */
                err = 0;
        }

        if (!err) {
                unsigned long leftover_pages = pages_remaining;
                int bytes_mapped;

                /* We called zap_page_range_single, try to reinsert. */
                err = vm_insert_pages(vma, *address,
                                      pending_pages,
                                      &pages_remaining);
                /* pages_remaining was updated by the call; the difference
                 * is what this retry managed to map.
                 */
                bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
                *seq += bytes_mapped;
                *address += bytes_mapped;
        }
        if (err) {
                /* Either we were unable to zap, OR we zapped, retried an
                 * insert, and still had an issue. Either ways, pages_remaining
                 * is the number of pages we were unable to map, and we unroll
                 * some state we speculatively touched before.
                 */
                const int bytes_not_mapped = PAGE_SIZE * pages_remaining;

                *length -= bytes_not_mapped;
                zc->recv_skip_hint += bytes_not_mapped;
        }
        return err;
}

/* Insert a batch of @pages_to_map pages into @vma at *@address.
 *
 * *@seq and *@address are advanced by the bytes actually mapped, even
 * on partial failure. On error the remaining pages are handed to
 * tcp_zerocopy_vm_insert_batch_error(), whose result is returned.
 * Returns 0 when every page was inserted.
 */
static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
                                        struct page **pages,
                                        unsigned int pages_to_map,
                                        unsigned long *address,
                                        u32 *length,
                                        u32 *seq,
                                        struct tcp_zerocopy_receive *zc,
                                        u32 total_bytes_to_map)
{
        unsigned long not_inserted = pages_to_map;
        unsigned int inserted_bytes;
        unsigned int inserted;
        int err;

        err = vm_insert_pages(vma, *address, pages, &not_inserted);

        /* Even if vm_insert_pages fails, it may have partially succeeded in
         * mapping (some but not all of the pages).
         */
        inserted = pages_to_map - (unsigned int)not_inserted;
        inserted_bytes = PAGE_SIZE * inserted;
        *seq += inserted_bytes;
        *address += inserted_bytes;

        /* Error: maybe zap and retry + rollback state for failed inserts. */
        if (unlikely(err))
                return tcp_zerocopy_vm_insert_batch_error(vma,
                        pages + inserted, not_inserted, address, length,
                        seq, zc, total_bytes_to_map, err);

        return 0;
}

#define TCP_VALID_ZC_MSG_FLAGS   (TCP_CMSG_TS)
/* Emit the receive timestamps in @tss as control messages into the user
 * buffer described by zc->msg_control / zc->msg_controllen, then write
 * the updated control pointer, remaining length and flags back into @zc.
 *
 * zc->msg_flags is always reset to 0 first. Nothing else happens if the
 * control address or length do not survive conversion to native width.
 */
static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
                                      struct tcp_zerocopy_receive *zc,
                                      struct scm_timestamping_internal *tss)
{
        unsigned long ctl_addr = (unsigned long)zc->msg_control;
        struct msghdr cmsg_dummy;

        /* Only the fields set here are initialised in the dummy msghdr. */
        cmsg_dummy.msg_control_is_user = true;
        cmsg_dummy.msg_control_user = (void __user *)ctl_addr;
        cmsg_dummy.msg_controllen = (__kernel_size_t)zc->msg_controllen;
        if (in_compat_syscall())
                cmsg_dummy.msg_flags = MSG_CMSG_COMPAT;
        else
                cmsg_dummy.msg_flags = 0;

        zc->msg_flags = 0;

        if (zc->msg_control != ctl_addr ||
            zc->msg_controllen != cmsg_dummy.msg_controllen)
                return;

        tcp_recv_timestamp(&cmsg_dummy, sk, tss);

        zc->msg_control = (__u64)((uintptr_t)cmsg_dummy.msg_control_user);
        zc->msg_controllen = (__u64)cmsg_dummy.msg_controllen;
        zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
}

/* Look up the VMA containing @address and verify it was set up by
 * tcp_mmap() (its vm_ops must be tcp_vm_ops).
 *
 * lock_vma_under_rcu() is tried first; if it yields nothing, the lookup
 * is repeated under mmap_read_lock(). On success the VMA is returned with
 * the corresponding lock still held and *@mmap_locked tells the caller
 * which one to release. On failure NULL is returned with no lock held
 * and *@mmap_locked is not written.
 */
static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm,
                                           unsigned long address,
                                           bool *mmap_locked)
{
        struct vm_area_struct *vma = lock_vma_under_rcu(mm, address);

        if (!vma) {
                mmap_read_lock(mm);
                vma = vma_lookup(mm, address);
                if (vma && vma->vm_ops == &tcp_vm_ops) {
                        *mmap_locked = true;
                        return vma;
                }
                mmap_read_unlock(mm);
                return NULL;
        }

        if (vma->vm_ops == &tcp_vm_ops) {
                *mmap_locked = false;
                return vma;
        }

        vma_end_read(vma);
        return NULL;
}

#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
/* Map queued receive data, page by page, into the user VMA at zc->address,
 * and copy leftover bytes into the user's copy buffer when one is given.
 *
 * On return zc->length holds the number of bytes mapped, zc->copybuf_len
 * the result of any copy, and zc->recv_skip_hint a byte count that could
 * not be mapped. tp->copied_seq is advanced by what was mapped or copied.
 *
 * Returns 0 on success, -EINVAL for a misaligned/out-of-range address or
 * when no suitable VMA is found, -ENOTCONN on a listening socket, -EIO
 * when nothing was delivered and the socket has SOCK_DONE set, or an
 * error from the page-insert path when nothing was mapped or copied.
 */
static int tcp_zerocopy_receive(struct sock *sk,
                                struct tcp_zerocopy_receive *zc,
                                struct scm_timestamping_internal *tss)
{
        u32 length = 0, offset, vma_len, avail_len, copylen = 0;
        unsigned long address = (unsigned long)zc->address;
        struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
        s32 copybuf_len = zc->copybuf_len;
        struct tcp_sock *tp = tcp_sk(sk);
        const skb_frag_t *frags = NULL;
        unsigned int pages_to_map = 0;
        struct vm_area_struct *vma;
        struct sk_buff *skb = NULL;
        u32 seq = tp->copied_seq;
        u32 total_bytes_to_map;
        int inq = tcp_inq(sk);
        bool mmap_locked;
        int ret;

        zc->copybuf_len = 0;
        zc->msg_flags = 0;

        /* Address must be page aligned and must survive the cast above. */
        if (address & (PAGE_SIZE - 1) || address != zc->address)
                return -EINVAL;

        if (sk->sk_state == TCP_LISTEN)
                return -ENOTCONN;

        sock_rps_record_flow(sk);

        /* Everything readable fits in the copy buffer: just copy it. */
        if (inq && inq <= copybuf_len)
                return receive_fallback_to_copy(sk, zc, inq, tss);

        /* Less than one page readable: nothing can be mapped. */
        if (inq < PAGE_SIZE) {
                zc->length = 0;
                zc->recv_skip_hint = inq;
                if (!inq && sock_flag(sk, SOCK_DONE))
                        return -EIO;
                return 0;
        }

        /* On success the VMA comes back locked; mmap_locked records how. */
        vma = find_tcp_vma(current->mm, address, &mmap_locked);
        if (!vma)
                return -EINVAL;

        /* Bound the request by the end of the VMA and by the readable
         * bytes, then round down to whole pages.
         */
        vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
        avail_len = min_t(u32, vma_len, inq);
        total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
        if (total_bytes_to_map) {
                /* Without the TLB-clean hint the range is zapped up front. */
                if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
                        zap_page_range_single(vma, address, total_bytes_to_map,
                                              NULL);
                zc->length = total_bytes_to_map;
                zc->recv_skip_hint = 0;
        } else {
                zc->length = avail_len;
                zc->recv_skip_hint = avail_len;
        }
        ret = 0;
        /* Collect one page per iteration until zc->length is covered. */
        while (length + PAGE_SIZE <= zc->length) {
                int mappable_offset;
                struct page *page;

                /* Less than a page left in the current skb (or no skb yet):
                 * move on to the next one.
                 */
                if (zc->recv_skip_hint < PAGE_SIZE) {
                        u32 offset_frag;

                        if (skb) {
                                /* Unmapped tail bytes remain in this skb. */
                                if (zc->recv_skip_hint > 0)
                                        break;
                                skb = skb->next;
                                offset = seq - TCP_SKB_CB(skb)->seq;
                        } else {
                                skb = tcp_recv_skb(sk, seq, &offset);
                        }

                        if (TCP_SKB_CB(skb)->has_rxtstamp) {
                                tcp_update_recv_tstamps(skb, tss);
                                zc->msg_flags |= TCP_CMSG_TS;
                        }
                        zc->recv_skip_hint = skb->len - offset;
                        frags = skb_advance_to_frag(skb, offset, &offset_frag);
                        /* Stop unless the position is at the very start of a
                         * page fragment.
                         */
                        if (!frags || offset_frag)
                                break;
                }

                mappable_offset = find_next_mappable_frag(frags,
                                                          zc->recv_skip_hint);
                if (mappable_offset) {
                        /* Current fragment is not mappable: report how many
                         * bytes have to be skipped and stop.
                         */
                        zc->recv_skip_hint = mappable_offset;
                        break;
                }
                page = skb_frag_page(frags);
                prefetchw(page);
                pages[pages_to_map++] = page;
                length += PAGE_SIZE;
                zc->recv_skip_hint -= PAGE_SIZE;
                frags++;
                if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
                    zc->recv_skip_hint < PAGE_SIZE) {
                        /* Either full batch, or we're about to go to next skb
                         * (and we cannot unroll failed ops across skbs).
                         */
                        ret = tcp_zerocopy_vm_insert_batch(vma, pages,
                                                           pages_to_map,
                                                           &address, &length,
                                                           &seq, zc,
                                                           total_bytes_to_map);
                        if (ret)
                                goto out;
                        pages_to_map = 0;
                }
        }
        /* Flush a partially filled batch left over from the loop. */
        if (pages_to_map) {
                ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
                                                   &address, &length, &seq,
                                                   zc, total_bytes_to_map);
        }
out:
        /* Release whichever lock find_tcp_vma() took. */
        if (mmap_locked)
                mmap_read_unlock(current->mm);
        else
                vma_end_read(vma);
        /* Try to copy straggler data. */
        if (!ret)
                copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);

        if (length + copylen) {
                WRITE_ONCE(tp->copied_seq, seq);
                tcp_rcv_space_adjust(sk);

                /* Clean up data we have read: This will do ACK frames. */
                tcp_recv_skb(sk, seq, &offset);
                tcp_cleanup_rbuf(sk, length + copylen);
                /* Progress was made, so any earlier error is not reported. */
                ret = 0;
                if (length == zc->length)
                        zc->recv_skip_hint = 0;
        } else {
                if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
                        ret = -EIO;
        }
        zc->length = length;
        return ret;
}
#endif

/* Similar to __sock_recv_timestamp, but does not require an skb */
void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
                        struct scm_timestamping_internal *tss)
{
        int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
        bool has_timestamping = false;

        if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
                if (sock_flag(sk, SOCK_RCVTSTAMP)) {
                        if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
                                if (new_tstamp) {
                                        struct __kernel_timespec kts = {
                                                .tv_sec = tss->ts[0].tv_sec,
                                                .tv_nsec = tss->ts[0].tv_nsec,
                                        };
                                        put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
                                                 sizeof(kts), &kts);
                                } else {
                                        struct __kernel_old_timespec ts_old = {
                                                .tv_sec = tss->ts[0].tv_sec,
                                                .tv_nsec = tss->ts[0].tv_nsec,
                                        };
                                        put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
                                                 sizeof(ts_old), &ts_old);
                                }
                        } else {
                                if (new_tstamp) {
                                        struct __kernel_sock_timeval stv = {
                                                .tv_sec = tss->ts[0].tv_sec,
                                                .tv_usec = tss->ts[0].tv_nsec / 1000,
                                        };
                                        put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
                                                 sizeof(stv), &stv);
                                } else {
                                        struct __kernel_old_timeval tv = {
                                                .tv_sec = tss->ts[0].tv_sec,
                                                .tv_usec = tss->ts[0].tv_nsec / 1000,
                                        };
                                        put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
                                                 sizeof(tv), &tv);
                                }
                        }
                }

                if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_SOFTWARE)
                        has_timestamping = true;
                else
                        tss->ts[0] = (struct timespec64) {0};
        }

        if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
                if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_RAW_HARDWARE)
                        has_timestamping = true;
                else
                        tss->ts[2] = (struct timespec64) {0};
        }

        if (has_timestamping) {
                tss->ts[1] = (struct timespec64) {0};
                if (sock_flag(sk, SOCK_TSTAMP_NEW))
                        put_cmsg_scm_timestamping64(msg, tss);
                else
                        put_cmsg_scm_timestamping(msg, tss);
        }
}

/* Return the number of bytes queued between copied_seq and rcv_nxt.
 *
 * A lockless snapshot is tried first. If it looks inconsistent (negative
 * result, or copied_seq changed while we were reading), the value is
 * recomputed under the socket lock. A result of zero is reported as 1 when
 * SOCK_DONE is set.
 */
static int tcp_inq_hint(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        u32 snap_copied, snap_rcv_nxt;
        int pending;

        snap_copied = READ_ONCE(tp->copied_seq);
        snap_rcv_nxt = READ_ONCE(tp->rcv_nxt);
        pending = snap_rcv_nxt - snap_copied;

        if (unlikely(pending < 0 ||
                     snap_copied != READ_ONCE(tp->copied_seq))) {
                lock_sock(sk);
                pending = tp->rcv_nxt - tp->copied_seq;
                release_sock(sk);
        }

        if (pending)
                return pending;

        /* After receiving a FIN, tell the user-space to continue reading
         * by returning a non-zero inq.
         */
        return sock_flag(sk, SOCK_DONE) ? 1 : 0;
}

/*
 *        This routine copies from a sock struct into the user buffer.
 *
 *        @sk:         socket to read from; tcp_recvmsg() calls this between
 *                     lock_sock() and release_sock()
 *        @msg:        destination message
 *        @len:        maximum number of bytes to deliver
 *        @flags:      MSG_* receive flags; MSG_PEEK, MSG_OOB, MSG_TRUNC,
 *                     MSG_WAITALL and MSG_DONTWAIT are examined here
 *        @tss:        updated through tcp_update_recv_tstamps() for every
 *                     consumed skb that has has_rxtstamp set
 *        @cmsg_flags: out; TCP_CMSG_INQ and/or TCP_CMSG_TS tell the caller
 *                     which control messages to emit
 *
 *        Returns the number of bytes accounted as read (or peeked), or a
 *        negative errno.
 *
 *        Technical note: in 2.3 we work on _locked_ socket, so that
 *        tricks with *seq access order and skb->users are not required.
 *        Probably, code can be easily improved even more.
 */

static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
                              int flags, struct scm_timestamping_internal *tss,
                              int *cmsg_flags)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int copied = 0;
        u32 peek_seq;
        u32 *seq;
        unsigned long used;
        int err;
        int target;                /* Read at least this many bytes */
        long timeo;
        struct sk_buff *skb, *last;
        u32 peek_offset = 0;
        u32 urg_hole = 0;

        /* A listening socket has no data stream to read from. */
        err = -ENOTCONN;
        if (sk->sk_state == TCP_LISTEN)
                goto out;

        /* The socket asked for the queued-bytes hint with each recvmsg(). */
        if (tp->recvmsg_inq) {
                *cmsg_flags = TCP_CMSG_INQ;
                msg->msg_get_inq = 1;
        }
        timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

        /* Urgent data needs to be handled specially. */
        if (flags & MSG_OOB)
                goto recv_urg;

        /* In repair mode only MSG_PEEK is allowed, and the queue that is
         * peeked is selected by tp->repair_queue.
         */
        if (unlikely(tp->repair)) {
                err = -EPERM;
                if (!(flags & MSG_PEEK))
                        goto out;

                if (tp->repair_queue == TCP_SEND_QUEUE)
                        goto recv_sndq;

                err = -EINVAL;
                if (tp->repair_queue == TCP_NO_QUEUE)
                        goto out;

                /* 'common' recv queue MSG_PEEK-ing */
        }

        /* Normally *seq is the socket's own read pointer. For MSG_PEEK a
         * private copy is advanced instead, so tp->copied_seq is left alone.
         */
        seq = &tp->copied_seq;
        if (flags & MSG_PEEK) {
                peek_offset = max(sk_peek_offset(sk, flags), 0);
                peek_seq = tp->copied_seq + peek_offset;
                seq = &peek_seq;
        }

        target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

        do {
                u32 offset;

                /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
                if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
                        if (copied)
                                break;
                        if (signal_pending(current)) {
                                copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
                                break;
                        }
                }

                /* Next get a buffer. */

                /* 'last' is handed to sk_wait_data() below if we need to wait. */
                last = skb_peek_tail(&sk->sk_receive_queue);
                skb_queue_walk(&sk->sk_receive_queue, skb) {
                        last = skb;
                        /* Now that we have two receive queues this
                         * shouldn't happen.
                         */
                        if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
                                 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
                                 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
                                 flags))
                                break;

                        /* Offset of *seq within this skb's payload. */
                        offset = *seq - TCP_SKB_CB(skb)->seq;
                        if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
                                pr_err_once("%s: found a SYN, please report !\n", __func__);
                                offset--;
                        }
                        if (offset < skb->len)
                                goto found_ok_skb;
                        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                                goto found_fin_ok;
                        WARN(!(flags & MSG_PEEK),
                             "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
                             *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
                }

                /* Well, if we have backlog, try to process it now yet. */

                if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
                        break;

                if (copied) {
                        /* Some data was already read: return it rather than
                         * waiting once any of these conditions holds.
                         */
                        if (!timeo ||
                            sk->sk_err ||
                            sk->sk_state == TCP_CLOSE ||
                            (sk->sk_shutdown & RCV_SHUTDOWN) ||
                            signal_pending(current))
                                break;
                } else {
                        /* Nothing read yet: either return 0 or turn the
                         * condition into a negative errno in 'copied'.
                         */
                        if (sock_flag(sk, SOCK_DONE))
                                break;

                        if (sk->sk_err) {
                                copied = sock_error(sk);
                                break;
                        }

                        if (sk->sk_shutdown & RCV_SHUTDOWN)
                                break;

                        if (sk->sk_state == TCP_CLOSE) {
                                /* This occurs when user tries to read
                                 * from never connected socket.
                                 */
                                copied = -ENOTCONN;
                                break;
                        }

                        if (!timeo) {
                                copied = -EAGAIN;
                                break;
                        }

                        if (signal_pending(current)) {
                                copied = sock_intr_errno(timeo);
                                break;
                        }
                }

                if (copied >= target) {
                        /* Do not sleep, just process backlog. */
                        __sk_flush_backlog(sk);
                } else {
                        tcp_cleanup_rbuf(sk, copied);
                        err = sk_wait_data(sk, &timeo, last);
                        if (err < 0) {
                                /* Prefer reporting bytes already copied
                                 * over the wait error.
                                 */
                                err = copied ? : err;
                                goto out;
                        }
                }

                /* While peeking, tp->copied_seq must not have moved behind
                 * our back; if it did, resynchronise the private pointer.
                 */
                if ((flags & MSG_PEEK) &&
                    (peek_seq - peek_offset - copied - urg_hole != tp->copied_seq)) {
                        net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
                                            current->comm,
                                            task_pid_nr(current));
                        peek_seq = tp->copied_seq + peek_offset;
                }
                continue;

found_ok_skb:
                /* Ok so how much can we use? */
                used = skb->len - offset;
                if (len < used)
                        used = len;

                /* Do we have urgent data here? */
                if (unlikely(tp->urg_data)) {
                        u32 urg_offset = tp->urg_seq - *seq;
                        if (urg_offset < used) {
                                if (!urg_offset) {
                                        /* The urgent byte is next and is not
                                         * delivered inline: step over it and
                                         * remember the hole.
                                         */
                                        if (!sock_flag(sk, SOCK_URGINLINE)) {
                                                WRITE_ONCE(*seq, *seq + 1);
                                                urg_hole++;
                                                offset++;
                                                used--;
                                                if (!used)
                                                        goto skip_copy;
                                        }
                                } else
                                        /* Stop this copy just before the urgent byte. */
                                        used = urg_offset;
                        }
                }

                /* With MSG_TRUNC the payload is not copied, but it is still
                 * accounted as read below.
                 */
                if (!(flags & MSG_TRUNC)) {
                        err = skb_copy_datagram_msg(skb, offset, msg, used);
                        if (err) {
                                /* Exception. Bailout! */
                                if (!copied)
                                        copied = -EFAULT;
                                break;
                        }
                }

                WRITE_ONCE(*seq, *seq + used);
                copied += used;
                len -= used;
                if (flags & MSG_PEEK)
                        sk_peek_offset_fwd(sk, used);
                else
                        sk_peek_offset_bwd(sk, used);
                tcp_rcv_space_adjust(sk);

skip_copy:
                /* The read pointer has passed the urgent byte: clear the
                 * urgent state.
                 */
                if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
                        WRITE_ONCE(tp->urg_data, 0);
                        tcp_fast_path_check(sk);
                }

                if (TCP_SKB_CB(skb)->has_rxtstamp) {
                        tcp_update_recv_tstamps(skb, tss);
                        *cmsg_flags |= TCP_CMSG_TS;
                }

                /* skb not fully consumed yet: leave it on the queue. */
                if (used + offset < skb->len)
                        continue;

                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                        goto found_fin_ok;
                if (!(flags & MSG_PEEK))
                        tcp_eat_recv_skb(sk, skb);
                continue;

found_fin_ok:
                /* Process the FIN. */
                WRITE_ONCE(*seq, *seq + 1);
                if (!(flags & MSG_PEEK))
                        tcp_eat_recv_skb(sk, skb);
                break;
        } while (len > 0);

        /* According to UNIX98, msg_name/msg_namelen are ignored
         * on connected socket. I was just happy when found this 8) --ANK
         */

        /* Clean up data we have read: This will do ACK frames. */
        tcp_cleanup_rbuf(sk, copied);
        return copied;

out:
        return err;

recv_urg:
        /* MSG_OOB: out-of-band data is read by tcp_recv_urg(). */
        err = tcp_recv_urg(sk, msg, len, flags);
        goto out;

recv_sndq:
        /* Repair mode, send queue selected: peek at it instead. */
        err = tcp_peek_sndq(sk, msg, len);
        goto out;
}

/* recvmsg() entry point for TCP sockets.
 *
 * MSG_ERRQUEUE requests are handed to inet_recv_error(). Otherwise the
 * socket is optionally busy-polled, the data is read under the socket lock
 * by tcp_recvmsg_locked(), and on success the timestamp and in-queue
 * control messages requested during the read are emitted after the lock
 * has been dropped.
 *
 * Returns the value of tcp_recvmsg_locked() (bytes read or negative errno).
 */
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
                int *addr_len)
{
        struct scm_timestamping_internal tss;
        int cmsg_flags = 0;
        int ret;

        if (unlikely(flags & MSG_ERRQUEUE))
                return inet_recv_error(sk, msg, len, addr_len);

        /* Busy-poll only when nothing is queued on an established socket. */
        if (sk_can_busy_loop(sk) &&
            skb_queue_empty_lockless(&sk->sk_receive_queue) &&
            sk->sk_state == TCP_ESTABLISHED)
                sk_busy_loop(sk, flags & MSG_DONTWAIT);

        lock_sock(sk);
        ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags);
        release_sock(sk);

        /* No ancillary data on error, or when none was requested. */
        if (ret < 0)
                return ret;
        if (!cmsg_flags && !msg->msg_get_inq)
                return ret;

        if (cmsg_flags & TCP_CMSG_TS)
                tcp_recv_timestamp(msg, sk, &tss);

        if (!msg->msg_get_inq)
                return ret;

        msg->msg_inq = tcp_inq_hint(sk);
        if (cmsg_flags & TCP_CMSG_INQ)
                put_cmsg(msg, SOL_TCP, TCP_CM_INQ,
                         sizeof(msg->msg_inq), &msg->msg_inq);

        return ret;
}
EXPORT_SYMBOL(tcp_recvmsg);

void tcp_set_state(struct sock *sk, int state)
{
        int oldstate = sk->sk_state;

        /* We defined a new enum for TCP states that are exported in BPF
         * so as not force the internal TCP states to be frozen. The
         * following checks will detect if an internal state value ever
         * differs from the BPF value. If this ever happens, then we will
         * need to remap the internal value to the BPF value before calling
         * tcp_call_bpf_2arg.
         */
        BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
        BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
        BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
        BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
        BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
        BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
        BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
        BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
        BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
        BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
        BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
        BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
        BUILD_BUG_ON((int)BPF_TCP_BOUND_INACTIVE != (int)TCP_BOUND_INACTIVE);
        BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);

        /* bpf uapi header bpf.h defines an anonymous enum with values
         * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux
         * is able to emit this enum in DWARF due to the above BUILD_BUG_ON.
         * But clang built vmlinux does not have this enum in DWARF
         * since clang removes the above code before generating IR/debuginfo.
         * Let us explicitly emit the type debuginfo to ensure the
         * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF
         * regardless of which compiler is used.
         */
        BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED);

        if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
                tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);

        switch (state) {
        case TCP_ESTABLISHED:
                if (oldstate != TCP_ESTABLISHED)
                        TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
                break;
        case TCP_CLOSE_WAIT:
                if (oldstate == TCP_SYN_RECV)
                        TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
                break;

        case TCP_CLOSE:
                if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
                        TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);

                sk->sk_prot->unhash(sk);
                if (inet_csk(sk)->icsk_bind_hash &&
                    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
                        inet_put_port(sk);
                fallthrough;
        default:
                if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT)
                        TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
        }

        /* Change state AFTER socket is unhashed to avoid closed
         * socket sitting in hash tables.
         */
        inet_sk_state_store(sk, state);
}
EXPORT_SYMBOL_GPL(tcp_set_state);

/*
 *        State processing on a close. This implements the state shift for
 *        sending our FIN frame. Note that we only send a FIN for some
 *        states. A shutdown() may have already sent the FIN, or we may be
 *        closed.
 *
 *        The table is indexed by the current sk_state. Each entry holds the
 *        state to move to, optionally OR-ed with TCP_ACTION_FIN;
 *        tcp_close_state() splits the two with TCP_STATE_MASK and reports
 *        the TCP_ACTION_FIN bit to its caller. Indices without an explicit
 *        initializer are zero.
 */

static const unsigned char new_state[16] = {
  /* current state:        new state:      action:        */
  [0 /* (Invalid) */]        = TCP_CLOSE,
  [TCP_ESTABLISHED]        = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  [TCP_SYN_SENT]        = TCP_CLOSE,
  [TCP_SYN_RECV]        = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  [TCP_FIN_WAIT1]        = TCP_FIN_WAIT1,
  [TCP_FIN_WAIT2]        = TCP_FIN_WAIT2,
  [TCP_TIME_WAIT]        = TCP_CLOSE,
  [TCP_CLOSE]                = TCP_CLOSE,
  [TCP_CLOSE_WAIT]        = TCP_LAST_ACK  | TCP_ACTION_FIN,
  [TCP_LAST_ACK]        = TCP_LAST_ACK,
  [TCP_LISTEN]                = TCP_CLOSE,
  [TCP_CLOSING]                = TCP_CLOSING,
  [TCP_NEW_SYN_RECV]        = TCP_CLOSE,        /* should not happen ! */
};

/* Apply the close-time state transition from new_state[] to @sk.
 *
 * Returns non-zero (TCP_ACTION_FIN) when the table entry for the socket's
 * current state carries the TCP_ACTION_FIN bit, zero otherwise.
 */
static int tcp_close_state(struct sock *sk)
{
        const int entry = (int)new_state[sk->sk_state];

        tcp_set_state(sk, entry & TCP_STATE_MASK);

        return entry & TCP_ACTION_FIN;
}

/*
 *        Shutdown the sending side of a connection. Much like close except
 *        that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
 *
 *        Nothing is done unless @how contains SEND_SHUTDOWN and the socket
 *        is in ESTABLISHED, SYN_SENT or CLOSE_WAIT state.
 */

void tcp_shutdown(struct sock *sk, int how)
{
        const int fin_states = TCPF_ESTABLISHED | TCPF_SYN_SENT |
                               TCPF_CLOSE_WAIT;

        /*        We need to grab some memory, and put together a FIN,
         *        and then put it into the queue to be sent.
         *                Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
         */
        if (!(how & SEND_SHUTDOWN))
                return;

        /* If we've already sent a FIN, or it's a closed state, skip this. */
        if (!((1 << sk->sk_state) & fin_states))
                return;

        /* Clear out any half completed packets.  FIN if needed. */
        if (tcp_close_state(sk))
                tcp_send_fin(sk);
}
EXPORT_SYMBOL(tcp_shutdown);

int tcp_orphan_count_sum(void)
{
        int i, total = 0;

        for_each_possible_cpu(i)
                total += per_cpu(tcp_orphan_count, i);

        return max(total, 0);
}

/* Snapshot of tcp_orphan_count_sum(), refreshed by tcp_orphan_timer. */
static int tcp_orphan_cache;
static struct timer_list tcp_orphan_timer;
/* Interval between two refreshes of tcp_orphan_cache. */
#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)

/* Timer callback: refresh the cached orphan count and re-arm the timer
 * one period from now. The timer argument is not used.
 */
static void tcp_orphan_update(struct timer_list *unused)
{
        const int snapshot = tcp_orphan_count_sum();

        WRITE_ONCE(tcp_orphan_cache, snapshot);
        mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
}

/* Return true when the cached orphan count, scaled up by 2^@shift,
 * exceeds sysctl_tcp_max_orphans.
 */
static bool tcp_too_many_orphans(int shift)
{
        const int scaled_orphans = READ_ONCE(tcp_orphan_cache) << shift;

        return scaled_orphans > READ_ONCE(sysctl_tcp_max_orphans);
}

/* Return true when @sk has more than SOCK_MIN_SNDBUF bytes queued for
 * sending and the protocol's allocated memory exceeds limit index 2 of
 * sk_prot_mem_limits().
 */
static bool tcp_out_of_memory(const struct sock *sk)
{
        /* Sockets with little queued write memory are never blamed. */
        if (sk->sk_wmem_queued <= SOCK_MIN_SNDBUF)
                return false;

        return sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2);
}

/* Check both resource limits for @sk: the orphan count (scaled by @shift,
 * see tcp_too_many_orphans()) and socket memory (tcp_out_of_memory()).
 *
 * Each exceeded limit is logged, rate limited. Returns true if either
 * limit is exceeded.
 */
bool tcp_check_oom(const struct sock *sk, int shift)
{
        const bool orphan_limit_hit = tcp_too_many_orphans(shift);
        const bool mem_limit_hit = tcp_out_of_memory(sk);

        if (orphan_limit_hit)
                net_info_ratelimited("too many orphaned sockets\n");
        if (mem_limit_hit)
                net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");

        return orphan_limit_hit || mem_limit_hit;
}

/*
 * __tcp_close - close a TCP socket on behalf of its owner.
 * @sk:      socket to close; tcp_close() calls this between lock_sock()
 *           and release_sock()
 * @timeout: handed to sk_stream_wait_close()
 *
 * Marks both directions as shut down, discards unread receive data and
 * then either disconnects, resets, or starts an orderly FIN close,
 * depending on repair mode, unread data and the linger settings. After
 * that the socket is orphaned and, if it has reached TCP_CLOSE, destroyed;
 * otherwise it stays around until the protocol finishes the close.
 *
 * A reference is taken with sock_hold() on every path through this
 * function; tcp_close() calls sock_put() afterwards.
 */
void __tcp_close(struct sock *sk, long timeout)
{
        struct sk_buff *skb;
        int data_was_unread = 0;
        int state;

        WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);

        if (sk->sk_state == TCP_LISTEN) {
                tcp_set_state(sk, TCP_CLOSE);

                /* Special case. */
                inet_csk_listen_stop(sk);

                goto adjudge_to_death;
        }

        /*  We need to flush the recv. buffs.  We do this only on the
         *  descriptor close, not protocol-sourced closes, because the
         *  reader process may not have drained the data yet!
         */
        while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;

                /* Do not count the FIN's sequence number as unread data. */
                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                        len--;
                data_was_unread += len;
                __kfree_skb(skb);
        }

        /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
        if (sk->sk_state == TCP_CLOSE)
                goto adjudge_to_death;

        /* As outlined in RFC 2525, section 2.17, we send a RST here because
         * data was lost. To witness the awful effects of the old behavior of
         * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
         * GET in an FTP client, suspend the process, wait for the client to
         * advertise a zero window, then kill -9 the FTP client, wheee...
         * Note: timeout is always zero in such a case.
         */
        if (unlikely(tcp_sk(sk)->repair)) {
                /* Repair mode: disconnect without the RST/FIN paths below. */
                sk->sk_prot->disconnect(sk, 0);
        } else if (data_was_unread) {
                /* Unread data was tossed, zap the connection. */
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
                tcp_set_state(sk, TCP_CLOSE);
                tcp_send_active_reset(sk, sk->sk_allocation,
                                      SK_RST_REASON_NOT_SPECIFIED);
        } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
                /* Check zero linger _after_ checking for unread data. */
                sk->sk_prot->disconnect(sk, 0);
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
        } else if (tcp_close_state(sk)) {
                /* We FIN if the application ate all the data before
                 * zapping the connection.
                 */

                /* RED-PEN. Formally speaking, we have broken TCP state
                 * machine. State transitions:
                 *
                 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
                 * TCP_SYN_RECV        -> TCP_FIN_WAIT1 (it is difficult)
                 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
                 *
                 * are legal only when FIN has been sent (i.e. in window),
                 * rather than queued out of window. Purists blame.
                 *
                 * F.e. "RFC state" is ESTABLISHED,
                 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
                 *
                 * The visible declinations are that sometimes
                 * we enter time-wait state, when it is not required really
                 * (harmless), do not send active resets, when they are
                 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
                 * they look as CLOSING or LAST_ACK for Linux)
                 * Probably, I missed some more holelets.
                 *                                                 --ANK
                 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
                 * in a single packet! (May consider it later but will
                 * probably need API support or TCP_CORK SYN-ACK until
                 * data is written and socket is closed.)
                 */
                tcp_send_fin(sk);
        }

        sk_stream_wait_close(sk, timeout);

adjudge_to_death:
        /* Remember the state before the backlog is processed below, so
         * that a close caused by the backlog can be detected.
         */
        state = sk->sk_state;
        sock_hold(sk);
        sock_orphan(sk);

        local_bh_disable();
        bh_lock_sock(sk);
        /* remove backlog if any, without releasing ownership. */
        __release_sock(sk);

        /* Per-CPU counter that tcp_orphan_count_sum() adds up. */
        this_cpu_inc(tcp_orphan_count);

        /* Have we already been destroyed by a softirq or backlog? */
        if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
                goto out;

        /*        This is a (useful) BSD violating of the RFC. There is a
         *        problem with TCP as specified in that the other end could
         *        keep a socket open forever with no application left this end.
         *        We use a 1 minute timeout (about the same as BSD) then kill
         *        our end. If they send after that then tough - BUT: long enough
         *        that we won't make the old 4*rto = almost no time - whoops
         *        reset mistake.
         *
         *        Nope, it was not mistake. It is really desired behaviour
         *        f.e. on http servers, when such sockets are useless, but
         *        consume significant resources. Let's do it with special
         *        linger2        option.                                        --ANK
         */

        if (sk->sk_state == TCP_FIN_WAIT2) {
                struct tcp_sock *tp = tcp_sk(sk);
                if (READ_ONCE(tp->linger2) < 0) {
                        /* Negative linger2: abort instead of lingering. */
                        tcp_set_state(sk, TCP_CLOSE);
                        tcp_send_active_reset(sk, GFP_ATOMIC,
                                              SK_RST_REASON_NOT_SPECIFIED);
                        __NET_INC_STATS(sock_net(sk),
                                        LINUX_MIB_TCPABORTONLINGER);
                } else {
                        const int tmo = tcp_fin_time(sk);

                        /* If the FIN timeout is longer than
                         * TCP_TIMEWAIT_LEN, keep the socket and arm the
                         * keepalive timer for the excess; otherwise move
                         * to time-wait handling right away.
                         */
                        if (tmo > TCP_TIMEWAIT_LEN) {
                                inet_csk_reset_keepalive_timer(sk,
                                                tmo - TCP_TIMEWAIT_LEN);
                        } else {
                                tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
                                goto out;
                        }
                }
        }
        if (sk->sk_state != TCP_CLOSE) {
                /* Under orphan or memory pressure, abort with a reset
                 * rather than keeping the orphan around.
                 */
                if (tcp_check_oom(sk, 0)) {
                        tcp_set_state(sk, TCP_CLOSE);
                        tcp_send_active_reset(sk, GFP_ATOMIC,
                                              SK_RST_REASON_NOT_SPECIFIED);
                        __NET_INC_STATS(sock_net(sk),
                                        LINUX_MIB_TCPABORTONMEMORY);
                } else if (!check_net(sock_net(sk))) {
                        /* Not possible to send reset; just close */
                        tcp_set_state(sk, TCP_CLOSE);
                }
        }

        if (sk->sk_state == TCP_CLOSE) {
                struct request_sock *req;

                req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
                                                lockdep_sock_is_held(sk));
                /* We could get here with a non-NULL req if the socket is
                 * aborted (e.g., closed with unread data) before 3WHS
                 * finishes.
                 */
                if (req)
                        reqsk_fastopen_remove(sk, req, false);
                inet_csk_destroy_sock(sk);
        }
        /* Otherwise, socket is reprieved until protocol close. */

out:
        bh_unlock_sock(sk);
        local_bh_enable();
}

/*
 * tcp_close - close a TCP socket.
 * @sk:      socket to close
 * @timeout: passed through to __tcp_close()
 *
 * Runs __tcp_close() under the socket lock. Once the lock is released,
 * sockets whose sk_net_refcnt is zero get their transmit timers cleared
 * synchronously, and finally one reference on @sk is dropped.
 */
void tcp_close(struct sock *sk, long timeout)
{
        lock_sock(sk);
        __tcp_close(sk, timeout);
        release_sock(sk);
        /* NOTE(review): presumably sockets that hold no netns reference
         * must not leave timers running after close - confirm.
         */
        if (!sk->sk_net_refcnt)
                inet_csk_clear_xmit_timers_sync(sk);
        /* Presumably balances the sock_hold() taken in __tcp_close(). */
        sock_put(sk);
}
EXPORT_SYMBOL(tcp_close);

/* These states need RST on ABORT according to RFC793.
 *
 * Returns true when @state is one of ESTABLISHED, CLOSE_WAIT, FIN_WAIT1,
 * FIN_WAIT2 or SYN_RECV.
 */
static inline bool tcp_need_reset(int state)
{
        const int rst_states = TCPF_ESTABLISHED | TCPF_CLOSE_WAIT |
                               TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 |
                               TCPF_SYN_RECV;

        return ((1 << state) & rst_states) != 0;
}

/* Unlink and free every skb in the socket's retransmit rb-tree, and
 * forget the highest_sack hint.
 */
static void tcp_rtx_queue_purge(struct sock *sk)
{
        struct rb_node *node = rb_first(&sk->tcp_rtx_queue);
        struct rb_node *next;

        tcp_sk(sk)->highest_sack = NULL;
        for (; node; node = next) {
                struct sk_buff *skb = rb_to_skb(node);

                /* Fetch the successor before the node is unlinked. */
                next = rb_next(node);

                /* Since we are deleting whole queue, no need to
                 * list_del(&skb->tcp_tsorted_anchor)
                 */
                tcp_rtx_queue_unlink(skb, sk);
                tcp_wmem_free_skb(sk, skb);
        }
}

void tcp_write_queue_purge(struct sock *sk)
{
        struct sk_buff *skb;

        tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
        while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                tcp_skb_tsorted_anchor_cleanup(skb);
                tcp_wmem_free_skb(sk, skb);
        }
        tcp_rtx_queue_purge(sk);
        INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
        tcp_clear_all_retrans_hints(tcp_sk(sk));
        tcp_sk(sk)->packets_out = 0;
        inet_csk(sk)->icsk_backoff = 0;
}

/* Abort whatever connection @sk has and reset the socket for reuse.
 *
 * The socket is moved to TCP_CLOSE (if it is not there already).  Depending
 * on the state it was in, listening is stopped, a reset is sent and/or
 * sk_err is set.  The receive, write, retransmit and out-of-order queues are
 * purged, and the per-connection fields assigned below are put back to their
 * initial values.
 *
 * @sk:    socket to disconnect
 * @flags: not referenced by this function
 *
 * Always returns 0.
 */
int tcp_disconnect(struct sock *sk, int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int old_state = sk->sk_state;
        u32 seq;

        if (old_state != TCP_CLOSE)
                tcp_set_state(sk, TCP_CLOSE);

        /* ABORT function of RFC793 */
        if (old_state == TCP_LISTEN) {
                inet_csk_listen_stop(sk);
        } else if (unlikely(tp->repair)) {
                /* Sockets in repair mode get ECONNABORTED and no reset. */
                WRITE_ONCE(sk->sk_err, ECONNABORTED);
        } else if (tcp_need_reset(old_state) ||
                   (tp->snd_nxt != tp->write_seq &&
                    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
                /* The last check adjusts for discrepancy of Linux wrt. RFC
                 * states
                 */
                tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_NOT_SPECIFIED);
                WRITE_ONCE(sk->sk_err, ECONNRESET);
        } else if (old_state == TCP_SYN_SENT)
                WRITE_ONCE(sk->sk_err, ECONNRESET);

        /* Stop timers and drop all queued data in both directions. */
        tcp_clear_xmit_timers(sk);
        __skb_queue_purge(&sk->sk_receive_queue);
        WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
        WRITE_ONCE(tp->urg_data, 0);
        sk_set_peek_off(sk, -1);
        tcp_write_queue_purge(sk);
        tcp_fastopen_active_disable_ofo_check(sk);
        skb_rbtree_purge(&tp->out_of_order_queue);

        inet->inet_dport = 0;

        inet_bhash2_reset_saddr(sk);

        WRITE_ONCE(sk->sk_shutdown, 0);
        sock_reset_flag(sk, SOCK_DONE);
        tp->srtt_us = 0;
        tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
        tp->rcv_rtt_last_tsecr = 0;

        /* Advance write_seq by max_window + 2 relative to its old value; a
         * result of 0 is replaced by 1.
         */
        seq = tp->write_seq + tp->max_window + 2;
        if (!seq)
                seq = 1;
        WRITE_ONCE(tp->write_seq, seq);

        /* Retransmission, delayed-ACK and congestion state back to defaults. */
        icsk->icsk_backoff = 0;
        icsk->icsk_probes_out = 0;
        icsk->icsk_probes_tstamp = 0;
        icsk->icsk_rto = TCP_TIMEOUT_INIT;
        icsk->icsk_rto_min = TCP_RTO_MIN;
        icsk->icsk_delack_max = TCP_DELACK_MAX;
        tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
        tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
        tp->snd_cwnd_cnt = 0;
        tp->is_cwnd_limited = 0;
        tp->max_packets_out = 0;
        tp->window_clamp = 0;
        tp->delivered = 0;
        tp->delivered_ce = 0;
        /* Let the congestion control module release its state (if it has a
         * release hook) before its private area is zeroed.
         */
        if (icsk->icsk_ca_ops->release)
                icsk->icsk_ca_ops->release(sk);
        memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
        icsk->icsk_ca_initialized = 0;
        tcp_set_ca_state(sk, TCP_CA_Open);
        tp->is_sack_reneg = 0;
        tcp_clear_retrans(tp);
        tp->total_retrans = 0;
        inet_csk_delack_init(sk);
        /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
         * issue in __tcp_select_window()
         */
        icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
        memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
        /* Drop the cached output and input routes. */
        __sk_dst_reset(sk);
        dst_release(unrcu_pointer(xchg(&sk->sk_rx_dst, NULL)));
        tcp_saved_syn_free(tp);
        /* Per-connection counters and statistics start over from zero. */
        tp->compressed_ack = 0;
        tp->segs_in = 0;
        tp->segs_out = 0;
        tp->bytes_sent = 0;
        tp->bytes_acked = 0;
        tp->bytes_received = 0;
        tp->bytes_retrans = 0;
        tp->data_segs_in = 0;
        tp->data_segs_out = 0;
        tp->duplicate_sack[0].start_seq = 0;
        tp->duplicate_sack[0].end_seq = 0;
        tp->dsack_dups = 0;
        tp->reord_seen = 0;
        tp->retrans_out = 0;
        tp->sacked_out = 0;
        tp->tlp_high_seq = 0;
        tp->last_oow_ack_time = 0;
        tp->plb_rehash = 0;
        /* There's a bubble in the pipe until at least the first ACK. */
        tp->app_limited = ~0U;
        tp->rate_app_limited = 1;
        tp->rack.mstamp = 0;
        tp->rack.advanced = 0;
        tp->rack.reo_wnd_steps = 1;
        tp->rack.last_delivered = 0;
        tp->rack.reo_wnd_persist = 0;
        tp->rack.dsack_seen = 0;
        tp->syn_data_acked = 0;
        /* The three rx_opt stores below repeat what the memset() of rx_opt
         * above already did.
         */
        tp->rx_opt.saw_tstamp = 0;
        tp->rx_opt.dsack = 0;
        tp->rx_opt.num_sacks = 0;
        tp->rcv_ooopack = 0;


        /* Clean up fastopen related fields */
        tcp_free_fastopen_req(tp);
        inet_clear_bit(DEFER_CONNECT, sk);
        tp->fastopen_client_fail = 0;

        /* A socket that still has a local port must still have its bind
         * hash entry.
         */
        WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);

        /* Release the cached page fragment, if any. */
        if (sk->sk_frag.page) {
                put_page(sk->sk_frag.page);
                sk->sk_frag.page = NULL;
                sk->sk_frag.offset = 0;
        }
        sk_error_report(sk);
        return 0;
}
EXPORT_SYMBOL(tcp_disconnect);

/* May the caller switch @sk into or out of repair mode?
 *
 * Requires CAP_NET_ADMIN in the user namespace of the socket's netns and a
 * socket that is not in TCP_LISTEN.
 */
static inline bool tcp_can_repair_sock(const struct sock *sk)
{
        if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
                return false;

        return sk->sk_state != TCP_LISTEN;
}

/* Install window parameters supplied by user space on a socket under repair.
 *
 * @tp:     TCP socket; must have ->repair set
 * @optbuf: user/kernel pointer to a struct tcp_repair_window
 * @len:    size of the buffer; must equal sizeof(struct tcp_repair_window)
 *
 * Returns 0 on success, -EPERM if the socket is not in repair mode, -EFAULT
 * if the copy fails, and -EINVAL for a wrong @len or for values that fail
 * the consistency checks below.  Nothing is modified on failure.
 */
static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
{
        struct tcp_repair_window win;

        if (!tp->repair)
                return -EPERM;
        if (len != sizeof(win))
                return -EINVAL;
        if (copy_from_sockptr(&win, optbuf, sizeof(win)))
                return -EFAULT;

        /* Reject a send window larger than the advertised maximum, and
         * snd_wl1 / rcv_wup values that lie beyond what rcv_nxt allows.
         */
        if (win.max_window < win.snd_wnd ||
            after(win.snd_wl1, tp->rcv_nxt + win.rcv_wnd) ||
            after(win.rcv_wup, tp->rcv_nxt))
                return -EINVAL;

        /* Send side */
        tp->snd_wl1 = win.snd_wl1;
        tp->snd_wnd = win.snd_wnd;
        tp->max_window = win.max_window;

        /* Receive side */
        tp->rcv_wnd = win.rcv_wnd;
        tp->rcv_wup = win.rcv_wup;

        return 0;
}

/* Apply an array of struct tcp_repair_opt entries to an established socket.
 *
 * @sk:     socket being repaired
 * @optbuf: user/kernel pointer to the array
 * @len:    size of the array in bytes; trailing bytes that do not make up a
 *          whole entry are ignored
 *
 * Entries are applied in order.  Unknown opt_code values are skipped.  On an
 * error (-EFAULT for a failed copy, -EFBIG for an out-of-range window scale,
 * -EINVAL for a non-zero value on SACK_PERM/TIMESTAMP) processing stops, but
 * entries already applied stay in effect.  Returns 0 when all entries were
 * consumed.
 */
static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
                unsigned int len)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_repair_opt entry;
        size_t pos;

        for (pos = 0; len >= sizeof(entry);
             pos += sizeof(entry), len -= sizeof(entry)) {
                if (copy_from_sockptr_offset(&entry, optbuf, pos, sizeof(entry)))
                        return -EFAULT;

                if (entry.opt_code == TCPOPT_MSS) {
                        tp->rx_opt.mss_clamp = entry.opt_val;
                        tcp_mtup_init(sk);
                } else if (entry.opt_code == TCPOPT_WINDOW) {
                        /* Low 16 bits: send scale, high 16 bits: receive scale. */
                        u16 snd_wscale = entry.opt_val & 0xFFFF;
                        u16 rcv_wscale = entry.opt_val >> 16;

                        if (snd_wscale > TCP_MAX_WSCALE ||
                            rcv_wscale > TCP_MAX_WSCALE)
                                return -EFBIG;

                        tp->rx_opt.snd_wscale = snd_wscale;
                        tp->rx_opt.rcv_wscale = rcv_wscale;
                        tp->rx_opt.wscale_ok = 1;
                } else if (entry.opt_code == TCPOPT_SACK_PERM) {
                        if (entry.opt_val != 0)
                                return -EINVAL;

                        tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
                } else if (entry.opt_code == TCPOPT_TIMESTAMP) {
                        if (entry.opt_val != 0)
                                return -EINVAL;

                        tp->rx_opt.tstamp_ok = 1;
                }
        }

        return 0;
}

DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
EXPORT_SYMBOL(tcp_tx_delay_enabled);

/* Turn on the tcp_tx_delay_enabled static key the first time it is needed.
 *
 * The cmpxchg() on a private flag makes sure that only one caller ever
 * performs the static_branch_enable() and prints the message.
 */
static void tcp_enable_tx_delay(void)
{
        static int tx_delay_claimed = 0;

        if (static_branch_unlikely(&tcp_tx_delay_enabled))
                return;

        if (cmpxchg(&tx_delay_claimed, 0, 1) != 0)
                return;

        static_branch_enable(&tcp_tx_delay_enabled);
        pr_info("TCP_TX_DELAY enabled\n");
}

/* TCP_CORK: while set, non-full frames are always queued instead of sent.
 * When the user later clears the option, any pending partial frames in the
 * queue are transmitted.  The intended use is together with sendfile(): the
 * application first write()s its headers, then sendfile()s the payload, and
 * still gets properly filled frames.
 *
 * TCP_CORK may be combined with TCP_NODELAY and takes precedence over it.
 *
 * This helper does no locking of its own; see tcp_sock_set_cork() for the
 * variant that takes the socket lock.
 */
void __tcp_sock_set_cork(struct sock *sk, bool on)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (on) {
                tp->nonagle |= TCP_NAGLE_CORK;
                return;
        }

        /* Uncork: if TCP_NODELAY is also set, request a push as well, then
         * flush whatever has been held back.
         */
        tp->nonagle &= ~TCP_NAGLE_CORK;
        if (tp->nonagle & TCP_NAGLE_OFF)
                tp->nonagle |= TCP_NAGLE_PUSH;
        tcp_push_pending_frames(sk);
}

/* Set or clear TCP_CORK on @sk.
 *
 * Takes the socket lock around __tcp_sock_set_cork(), so it is meant for
 * callers that do not already hold it.
 *
 * @sk: socket to update
 * @on: true to cork, false to uncork
 */
void tcp_sock_set_cork(struct sock *sk, bool on)
{
        lock_sock(sk);
        __tcp_sock_set_cork(sk, on);
        release_sock(sk);
}
EXPORT_SYMBOL(tcp_sock_set_cork);

/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is
 * remembered, but it is not activated until cork is cleared.
 *
 * However, when TCP_NODELAY is set we make an explicit push, which overrides
 * even TCP_CORK for currently queued segments.
 */
void __tcp_sock_set_nodelay(struct sock *sk, bool on)
{
        if (on) {
                tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
                tcp_push_pending_frames(sk);
        } else {
                tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
        }
}

/* Enable TCP_NODELAY on @sk.
 *
 * Takes the socket lock around __tcp_sock_set_nodelay(sk, true).  This
 * wrapper can only turn the option on; there is no parameter to clear it.
 */
void tcp_sock_set_nodelay(struct sock *sk)
{
        lock_sock(sk);
        __tcp_sock_set_nodelay(sk, true);
        release_sock(sk);
}
EXPORT_SYMBOL(tcp_sock_set_nodelay);

/* Apply a TCP_QUICKACK value to @sk.  Does no locking of its own.
 *
 * @val == 0: enter pingpong mode and do nothing else.
 * @val != 0: leave pingpong mode.  If the socket is ESTABLISHED or CLOSE_WAIT
 *            and an ACK is currently scheduled, mark it as pushed and call
 *            tcp_cleanup_rbuf(); an even @val then re-enters pingpong mode
 *            right away.
 */
static void __tcp_sock_set_quickack(struct sock *sk, int val)
{
        if (!val) {
                inet_csk_enter_pingpong_mode(sk);
                return;
        }

        inet_csk_exit_pingpong_mode(sk);

        if (!((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
                return;
        if (!inet_csk_ack_scheduled(sk))
                return;

        inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
        tcp_cleanup_rbuf(sk, 1);
        if ((val & 1) == 0)
                inet_csk_enter_pingpong_mode(sk);
}

/* Apply a TCP_QUICKACK value to @sk.
 *
 * Takes the socket lock around __tcp_sock_set_quickack(); @val is passed
 * through unchanged.
 */
void tcp_sock_set_quickack(struct sock *sk, int val)
{
        lock_sock(sk);
        __tcp_sock_set_quickack(sk, val);
        release_sock(sk);
}
EXPORT_SYMBOL(tcp_sock_set_quickack);

/* Store @val in icsk_syn_retries of @sk.
 *
 * @val must lie in [1, MAX_TCP_SYNCNT]; anything else yields -EINVAL and
 * leaves the socket untouched.  Returns 0 on success.  The store uses
 * WRITE_ONCE() and no lock is taken here.
 */
int tcp_sock_set_syncnt(struct sock *sk, int val)
{
        if (val >= 1 && val <= MAX_TCP_SYNCNT) {
                WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val);
                return 0;
        }

        return -EINVAL;
}
EXPORT_SYMBOL(tcp_sock_set_syncnt);

/* Set the user timeout of @sk: the cap, in ms, on how long TCP keeps
 * retrying or probing the window before it gives up and aborts the
 * connection with ETIMEDOUT.
 *
 * Negative values are rejected with -EINVAL; otherwise @val is stored with
 * WRITE_ONCE() and 0 is returned.  No lock is taken here.
 */
int tcp_sock_set_user_timeout(struct sock *sk, int val)
{
        if (val >= 0) {
                WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val);
                return 0;
        }

        return -EINVAL;
}
EXPORT_SYMBOL(tcp_sock_set_user_timeout);

/* Set the keepalive idle time of @sk.  As the name says, the caller is
 * expected to hold the socket lock already.
 *
 * @val is scaled by HZ before being stored and must lie in
 * [1, MAX_TCP_KEEPIDLE], otherwise -EINVAL is returned.  When keepalives are
 * enabled (SOCK_KEEPOPEN) and the socket is neither CLOSE nor LISTEN, the
 * keepalive timer is re-armed for whatever is left of the new idle period.
 * Returns 0 on success.
 */
int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 elapsed, remaining;

        if (val < 1 || val > MAX_TCP_KEEPIDLE)
                return -EINVAL;

        /* Lockless readers are expected to pair with this WRITE_ONCE()
         * (see keepalive_time_when()).
         */
        WRITE_ONCE(tp->keepalive_time, val * HZ);

        if (!sock_flag(sk, SOCK_KEEPOPEN))
                return 0;
        if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
                return 0;

        /* Fire immediately if the connection has already been idle for at
         * least the new period.
         */
        elapsed = keepalive_time_elapsed(tp);
        remaining = tp->keepalive_time > elapsed ?
                        tp->keepalive_time - elapsed : 0;
        inet_csk_reset_keepalive_timer(sk, remaining);

        return 0;
}

/* Set the keepalive idle time of @sk.
 *
 * Takes the socket lock around tcp_sock_set_keepidle_locked() and returns
 * that function's result (0 or a negative errno).
 */
int tcp_sock_set_keepidle(struct sock *sk, int val)
{
        int err;

        lock_sock(sk);
        err = tcp_sock_set_keepidle_locked(sk, val);
        release_sock(sk);
        return err;
}
EXPORT_SYMBOL(tcp_sock_set_keepidle);

/* Set the keepalive interval of @sk.
 *
 * @val must lie in [1, MAX_TCP_KEEPINTVL] and is scaled by HZ before it is
 * stored with WRITE_ONCE().  Returns 0 on success, -EINVAL for an
 * out-of-range @val.  No lock is taken here.
 */
int tcp_sock_set_keepintvl(struct sock *sk, int val)
{
        if (val >= 1 && val <= MAX_TCP_KEEPINTVL) {
                WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ);
                return 0;
        }

        return -EINVAL;
}
EXPORT_SYMBOL(tcp_sock_set_keepintvl);

/* Set the keepalive probe count of @sk.
 *
 * @val must lie in [1, MAX_TCP_KEEPCNT].  Returns 0 on success, -EINVAL for
 * an out-of-range @val.  No lock is taken here.
 */
int tcp_sock_set_keepcnt(struct sock *sk, int val)
{
        if (val >= 1 && val <= MAX_TCP_KEEPCNT) {
                /* Paired with READ_ONCE() in keepalive_probes() */
                WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val);
                return 0;
        }

        return -EINVAL;
}
EXPORT_SYMBOL(tcp_sock_set_keepcnt);

/* Set the receive window clamp of @sk.
 *
 * @val == 0 removes the clamp, which is only allowed on a TCP_CLOSE socket
 * (-EINVAL otherwise).  A non-zero @val smaller than SOCK_MIN_RCVBUF / 2 is
 * raised to that minimum.  If the clamp actually changes, rcv_ssthresh is
 * adjusted to match.  Returns 0 on success.
 */
int tcp_set_window_clamp(struct sock *sk, int val)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 prev_clamp, clamp, grown_ssthresh;

        if (!val) {
                if (sk->sk_state != TCP_CLOSE)
                        return -EINVAL;
                WRITE_ONCE(tp->window_clamp, 0);
                return 0;
        }

        prev_clamp = tp->window_clamp;
        clamp = val < SOCK_MIN_RCVBUF / 2 ? SOCK_MIN_RCVBUF / 2 : val;
        if (clamp == prev_clamp)
                return 0;

        WRITE_ONCE(tp->window_clamp, clamp);

        if (clamp < prev_clamp) {
                /* The reserved-memory provisioning only has to be applied
                 * when the clamp shrinks.
                 */
                __tcp_adjust_rcv_ssthresh(sk, tp->window_clamp);
                return 0;
        }

        /* Clamp grew: rcv_ssthresh may rise to min(rcv_wnd, clamp) but is
         * never lowered here.
         */
        grown_ssthresh = min(tp->rcv_wnd, tp->window_clamp);
        tp->rcv_ssthresh = max(grown_ssthresh, tp->rcv_ssthresh);
        return 0;
}

/*
 *        Socket option code for TCP.
 *
 * Options are handled in three stages:
 *
 *  1. Data/string options (TCP_CONGESTION, TCP_ULP, TCP_FASTOPEN_KEY) copy
 *     their own buffer from @optval and return directly.
 *  2. For everything else @optlen must be at least sizeof(int) and the first
 *     int of @optval is copied into 'val'.  A first group of integer options
 *     is then applied without taking the socket lock.
 *  3. The remaining options are applied between sockopt_lock_sock() and
 *     sockopt_release_sock().  Some of them (TCP_REPAIR_OPTIONS,
 *     TCP_REPAIR_WINDOW, TCP_AO_*, TCP_MD5SIG*) hand @optval/@optlen on to a
 *     helper that parses a larger structure.
 *
 * @level is not examined in this function.
 *
 * Returns 0 on success or a negative errno; an option name that matches no
 * case ends up as -ENOPROTOOPT.
 */
int do_tcp_setsockopt(struct sock *sk, int level, int optname,
                      sockptr_t optval, unsigned int optlen)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct net *net = sock_net(sk);
        int val;
        int err = 0;

        /* These are data/string values, all the others are ints */
        switch (optname) {
        case TCP_CONGESTION: {
                char name[TCP_CA_NAME_MAX];

                if (optlen < 1)
                        return -EINVAL;

                /* Copy at most TCP_CA_NAME_MAX-1 bytes so that the explicit
                 * terminator below always fits in name[].
                 */
                val = strncpy_from_sockptr(name, optval,
                                        min_t(long, TCP_CA_NAME_MAX-1, optlen));
                if (val < 0)
                        return -EFAULT;
                name[val] = 0;

                sockopt_lock_sock(sk);
                err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(),
                                                 sockopt_ns_capable(sock_net(sk)->user_ns,
                                                                    CAP_NET_ADMIN));
                sockopt_release_sock(sk);
                return err;
        }
        case TCP_ULP: {
                char name[TCP_ULP_NAME_MAX];

                if (optlen < 1)
                        return -EINVAL;

                /* Same bounded copy + explicit termination as above. */
                val = strncpy_from_sockptr(name, optval,
                                        min_t(long, TCP_ULP_NAME_MAX - 1,
                                              optlen));
                if (val < 0)
                        return -EFAULT;
                name[val] = 0;

                sockopt_lock_sock(sk);
                err = tcp_set_ulp(sk, name);
                sockopt_release_sock(sk);
                return err;
        }
        case TCP_FASTOPEN_KEY: {
                __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
                __u8 *backup_key = NULL;

                /* Allow a backup key as well to facilitate key rotation
                 * First key is the active one.
                 */
                if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
                    optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
                        return -EINVAL;

                if (copy_from_sockptr(key, optval, optlen))
                        return -EFAULT;

                /* A full-size buffer carries the backup key right after the
                 * primary one.
                 */
                if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
                        backup_key = key + TCP_FASTOPEN_KEY_LENGTH;

                return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
        }
        default:
                /* fallthru */
                break;
        }

        /* Every remaining option starts with (or is) an int. */
        if (optlen < sizeof(int))
                return -EINVAL;

        if (copy_from_sockptr(&val, optval, sizeof(val)))
                return -EFAULT;

        /* Handle options that can be set without locking the socket. */
        switch (optname) {
        case TCP_SYNCNT:
                return tcp_sock_set_syncnt(sk, val);
        case TCP_USER_TIMEOUT:
                return tcp_sock_set_user_timeout(sk, val);
        case TCP_KEEPINTVL:
                return tcp_sock_set_keepintvl(sk, val);
        case TCP_KEEPCNT:
                return tcp_sock_set_keepcnt(sk, val);
        case TCP_LINGER2:
                /* Negative -> -1; values above TCP_FIN_TIMEOUT_MAX / HZ are
                 * capped at TCP_FIN_TIMEOUT_MAX; otherwise val * HZ.
                 */
                if (val < 0)
                        WRITE_ONCE(tp->linger2, -1);
                else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
                        WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
                else
                        WRITE_ONCE(tp->linger2, val * HZ);
                return 0;
        case TCP_DEFER_ACCEPT:
                /* Translate value in seconds to number of retransmits */
                WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
                           secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
                                           TCP_RTO_MAX / HZ));
                return 0;
        }

        /* From here on every path leaves through sockopt_release_sock() at
         * the bottom; cases report failure by setting err and breaking out.
         */
        sockopt_lock_sock(sk);

        switch (optname) {
        case TCP_MAXSEG:
                /* Values greater than interface MTU won't take effect. However
                 * at the point when this call is done we typically don't yet
                 * know which interface is going to be used
                 */
                if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
                        err = -EINVAL;
                        break;
                }
                tp->rx_opt.user_mss = val;
                break;

        case TCP_NODELAY:
                __tcp_sock_set_nodelay(sk, val);
                break;

        case TCP_THIN_LINEAR_TIMEOUTS:
                if (val < 0 || val > 1)
                        err = -EINVAL;
                else
                        tp->thin_lto = val;
                break;

        case TCP_THIN_DUPACK:
                /* The value is only range-checked; nothing is stored. */
                if (val < 0 || val > 1)
                        err = -EINVAL;
                break;

        case TCP_REPAIR:
                if (!tcp_can_repair_sock(sk))
                        err = -EPERM;
                else if (val == TCP_REPAIR_ON) {
                        tp->repair = 1;
                        sk->sk_reuse = SK_FORCE_REUSE;
                        tp->repair_queue = TCP_NO_QUEUE;
                } else if (val == TCP_REPAIR_OFF) {
                        tp->repair = 0;
                        sk->sk_reuse = SK_NO_REUSE;
                        tcp_send_window_probe(sk);
                } else if (val == TCP_REPAIR_OFF_NO_WP) {
                        /* Same as TCP_REPAIR_OFF but without the window probe. */
                        tp->repair = 0;
                        sk->sk_reuse = SK_NO_REUSE;
                } else
                        err = -EINVAL;

                break;

        case TCP_REPAIR_QUEUE:
                if (!tp->repair)
                        err = -EPERM;
                else if ((unsigned int)val < TCP_QUEUES_NR)
                        tp->repair_queue = val;
                else
                        err = -EINVAL;
                break;

        case TCP_QUEUE_SEQ:
                /* Only on a closed socket, and only while the selected queue
                 * is empty/fully consumed.
                 */
                if (sk->sk_state != TCP_CLOSE) {
                        err = -EPERM;
                } else if (tp->repair_queue == TCP_SEND_QUEUE) {
                        if (!tcp_rtx_queue_empty(sk))
                                err = -EPERM;
                        else
                                WRITE_ONCE(tp->write_seq, val);
                } else if (tp->repair_queue == TCP_RECV_QUEUE) {
                        if (tp->rcv_nxt != tp->copied_seq) {
                                err = -EPERM;
                        } else {
                                WRITE_ONCE(tp->rcv_nxt, val);
                                WRITE_ONCE(tp->copied_seq, val);
                        }
                } else {
                        err = -EINVAL;
                }
                break;

        case TCP_REPAIR_OPTIONS:
                /* Needs repair mode, an ESTABLISHED socket and no bytes sent
                 * so far.
                 */
                if (!tp->repair)
                        err = -EINVAL;
                else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent)
                        err = tcp_repair_options_est(sk, optval, optlen);
                else
                        err = -EPERM;
                break;

        case TCP_CORK:
                __tcp_sock_set_cork(sk, val);
                break;

        case TCP_KEEPIDLE:
                err = tcp_sock_set_keepidle_locked(sk, val);
                break;
        case TCP_SAVE_SYN:
                /* 0: disable, 1: enable, 2: start from ether_header */
                if (val < 0 || val > 2)
                        err = -EINVAL;
                else
                        tp->save_syn = val;
                break;

        case TCP_WINDOW_CLAMP:
                err = tcp_set_window_clamp(sk, val);
                break;

        case TCP_QUICKACK:
                __tcp_sock_set_quickack(sk, val);
                break;

        case TCP_AO_REPAIR:
                if (!tcp_can_repair_sock(sk)) {
                        err = -EPERM;
                        break;
                }
                err = tcp_ao_set_repair(sk, optval, optlen);
                break;
#ifdef CONFIG_TCP_AO
        case TCP_AO_ADD_KEY:
        case TCP_AO_DEL_KEY:
        case TCP_AO_INFO: {
                /* If this is the first TCP-AO setsockopt() on the socket,
                 * sk_state has to be LISTEN or CLOSE. Allow TCP_REPAIR
                 * in any state.
                 */
                if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
                        goto ao_parse;
                if (rcu_dereference_protected(tcp_sk(sk)->ao_info,
                                              lockdep_sock_is_held(sk)))
                        goto ao_parse;
                if (tp->repair)
                        goto ao_parse;
                err = -EISCONN;
                break;
ao_parse:
                err = tp->af_specific->ao_parse(sk, optname, optval, optlen);
                break;
        }
#endif
#ifdef CONFIG_TCP_MD5SIG
        case TCP_MD5SIG:
        case TCP_MD5SIG_EXT:
                err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
                break;
#endif
        case TCP_FASTOPEN:
                /* Only for a non-negative value on a CLOSE or LISTEN socket. */
                if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
                    TCPF_LISTEN))) {
                        tcp_fastopen_init_key_once(net);

                        fastopen_queue_tune(sk, val);
                } else {
                        err = -EINVAL;
                }
                break;
        case TCP_FASTOPEN_CONNECT:
                /* Boolean; needs TFO_CLIENT_ENABLE in the netns sysctl and a
                 * TCP_CLOSE socket.
                 */
                if (val > 1 || val < 0) {
                        err = -EINVAL;
                } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) &
                           TFO_CLIENT_ENABLE) {
                        if (sk->sk_state == TCP_CLOSE)
                                tp->fastopen_connect = val;
                        else
                                err = -EINVAL;
                } else {
                        err = -EOPNOTSUPP;
                }
                break;
        case TCP_FASTOPEN_NO_COOKIE:
                if (val > 1 || val < 0)
                        err = -EINVAL;
                else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
                        err = -EINVAL;
                else
                        tp->fastopen_no_cookie = val;
                break;
        case TCP_TIMESTAMP:
                if (!tp->repair) {
                        err = -EPERM;
                        break;
                }
                /* val is an opaque field,
                 * and low order bit contains usec_ts enable bit.
                 * Its a best effort, and we do not care if user makes an error.
                 */
                tp->tcp_usec_ts = val & 1;
                WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts));
                break;
        case TCP_REPAIR_WINDOW:
                err = tcp_repair_set_window(tp, optval, optlen);
                break;
        case TCP_NOTSENT_LOWAT:
                /* Invoke the write-space callback after changing the
                 * threshold.
                 */
                WRITE_ONCE(tp->notsent_lowat, val);
                sk->sk_write_space(sk);
                break;
        case TCP_INQ:
                if (val > 1 || val < 0)
                        err = -EINVAL;
                else
                        tp->recvmsg_inq = val;
                break;
        case TCP_TX_DELAY:
                /* The static key is switched on the first time a non-zero
                 * delay is requested.
                 */
                if (val)
                        tcp_enable_tx_delay();
                WRITE_ONCE(tp->tcp_tx_delay, val);
                break;
        default:
                err = -ENOPROTOOPT;
                break;
        }

        sockopt_release_sock(sk);
        return err;
}

/* setsockopt() entry point for TCP sockets.
 *
 * SOL_TCP options are handled by do_tcp_setsockopt(); every other level is
 * forwarded to the setsockopt hook of the socket's address-family ops.
 * Returns whatever the chosen handler returns.
 */
int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                   unsigned int optlen)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);

        if (level == SOL_TCP)
                return do_tcp_setsockopt(sk, level, optname, optval, optlen);

        /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */
        return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname,
                                                        optval, optlen);
}
EXPORT_SYMBOL(tcp_setsockopt);

/* Fill the chrono-derived fields of @info from @tp.
 *
 * For each chrono type from TCP_CHRONO_BUSY upwards, the accumulated tick
 * count is taken from chrono_stat[type - 1], topped up with the time since
 * chrono_start when that type is the one currently running, and multiplied
 * by USEC_PER_SEC / HZ.  tcpi_busy_time is the sum over all types; the
 * rwnd- and sndbuf-limited values are reported individually.
 */
static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
                                      struct tcp_info *info)
{
        u64 stats[__TCP_CHRONO_MAX], total = 0;
        enum tcp_chrono i;

        for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
                u64 ticks = tp->chrono_stat[i - 1];

                /* Include the still-running interval of the active chrono. */
                if (i == tp->chrono_type)
                        ticks += tcp_jiffies32 - tp->chrono_start;

                stats[i] = ticks * (USEC_PER_SEC / HZ);
                total += stats[i];
        }

        info->tcpi_busy_time = total;
        info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
        info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
}

/* Return information about state of tcp endpoint in API format.
 *
 * @sk:   socket to describe
 * @info: output; zeroed first, so any field not assigned below reads as 0
 *
 * Nothing beyond the zeroing is done for sockets whose sk_type is not
 * SOCK_STREAM.  The state, pacing rates, reordering and cwnd are filled in
 * without taking the socket lock.  Listening sockets return early after
 * reporting their accept backlog through the aliased tcpi_unacked /
 * tcpi_sacked fields.  All remaining fields are copied between
 * lock_sock_fast() and unlock_sock_fast().
 */
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
        const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
        const struct inet_connection_sock *icsk = inet_csk(sk);
        unsigned long rate;
        u32 now;
        u64 rate64;
        bool slow;

        memset(info, 0, sizeof(*info));
        if (sk->sk_type != SOCK_STREAM)
                return;

        info->tcpi_state = inet_sk_state_load(sk);

        /* Report meaningful fields for all TCP states, including listeners */
        /* An all-ones unsigned long rate is widened to an all-ones u64, so
         * the sentinel value stays all-ones regardless of the width of
         * unsigned long.
         */
        rate = READ_ONCE(sk->sk_pacing_rate);
        rate64 = (rate != ~0UL) ? rate : ~0ULL;
        info->tcpi_pacing_rate = rate64;

        rate = READ_ONCE(sk->sk_max_pacing_rate);
        rate64 = (rate != ~0UL) ? rate : ~0ULL;
        info->tcpi_max_pacing_rate = rate64;

        info->tcpi_reordering = tp->reordering;
        info->tcpi_snd_cwnd = tcp_snd_cwnd(tp);

        if (info->tcpi_state == TCP_LISTEN) {
                /* listeners aliased fields :
                 * tcpi_unacked -> Number of children ready for accept()
                 * tcpi_sacked  -> max backlog
                 */
                info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
                info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
                return;
        }

        slow = lock_sock_fast(sk);

        info->tcpi_ca_state = icsk->icsk_ca_state;
        info->tcpi_retransmits = icsk->icsk_retransmits;
        info->tcpi_probes = icsk->icsk_probes_out;
        info->tcpi_backoff = icsk->icsk_backoff;

        /* Negotiated options, reported as TCPI_OPT_* bits. */
        if (tp->rx_opt.tstamp_ok)
                info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
        if (tcp_is_sack(tp))
                info->tcpi_options |= TCPI_OPT_SACK;
        if (tp->rx_opt.wscale_ok) {
                info->tcpi_options |= TCPI_OPT_WSCALE;
                info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
                info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
        }

        if (tp->ecn_flags & TCP_ECN_OK)
                info->tcpi_options |= TCPI_OPT_ECN;
        if (tp->ecn_flags & TCP_ECN_SEEN)
                info->tcpi_options |= TCPI_OPT_ECN_SEEN;
        if (tp->syn_data_acked)
                info->tcpi_options |= TCPI_OPT_SYN_DATA;
        if (tp->tcp_usec_ts)
                info->tcpi_options |= TCPI_OPT_USEC_TS;

        info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
        /* The reported ato is capped at tcp_delack_max(). */
        info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato,
                                                tcp_delack_max(sk)));
        info->tcpi_snd_mss = tp->mss_cache;
        info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;

        info->tcpi_unacked = tp->packets_out;
        info->tcpi_sacked = tp->sacked_out;

        info->tcpi_lost = tp->lost_out;
        info->tcpi_retrans = tp->retrans_out;

        /* "Time since last ..." values, all relative to one snapshot of
         * tcp_jiffies32 and reported in milliseconds.
         */
        now = tcp_jiffies32;
        info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
        info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
        info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);

        info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
        info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
        /* NOTE(review): the shifts presumably undo the fixed-point scaling
         * of srtt_us (<<3) and mdev_us (<<2) - confirm against the RTT
         * estimator.
         */
        info->tcpi_rtt = tp->srtt_us >> 3;
        info->tcpi_rttvar = tp->mdev_us >> 2;
        info->tcpi_snd_ssthresh = tp->snd_ssthresh;
        info->tcpi_advmss = tp->advmss;

        info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
        info->tcpi_rcv_space = tp->rcvq_space.space;

        info->tcpi_total_retrans = tp->total_retrans;

        info->tcpi_bytes_acked = tp->bytes_acked;
        info->tcpi_bytes_received = tp->bytes_received;
        /* The difference is evaluated as a signed int and clamped at 0. */
        info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
        tcp_get_info_chrono_stats(tp, info);

        info->tcpi_segs_out = tp->segs_out;

        /* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */
        info->tcpi_segs_in = READ_ONCE(tp->segs_in);
        info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in);

        info->tcpi_min_rtt = tcp_min_rtt(tp);
        info->tcpi_data_segs_out = tp->data_segs_out;

        info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
        /* A zero rate is not stored; the field keeps its memset() value. */
        rate64 = tcp_compute_delivery_rate(tp);
        if (rate64)
                info->tcpi_delivery_rate = rate64;
        info->tcpi_delivered = tp->delivered;
        info->tcpi_delivered_ce = tp->delivered_ce;
        info->tcpi_bytes_sent = tp->bytes_sent;
        info->tcpi_bytes_retrans = tp->bytes_retrans;
        info->tcpi_dsack_dups = tp->dsack_dups;
        info->tcpi_reord_seen = tp->reord_seen;
        info->tcpi_rcv_ooopack = tp->rcv_ooopack;
        info->tcpi_snd_wnd = tp->snd_wnd;
        info->tcpi_rcv_wnd = tp->rcv_wnd;
        info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash;
        info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;

        info->tcpi_total_rto = tp->total_rto;
        info->tcpi_total_rto_recoveries = tp->total_rto_recoveries;
        info->tcpi_total_rto_time = tp->total_rto_time;
        /* A non-zero rto_stamp adds the time elapsed since that stamp. */
        if (tp->rto_stamp)
                info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;

        unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);

/* Number of bytes to reserve for the attributes emitted by
 * tcp_get_timestamping_opt_stats(): one entry per TCP_NLA_* attribute,
 * each sized with nla_total_size() or nla_total_size_64bit().
 */
static size_t tcp_opt_stats_get_size(void)
{
        size_t size = 0;

        size += nla_total_size_64bit(sizeof(u64));      /* TCP_NLA_BUSY */
        size += nla_total_size_64bit(sizeof(u64));      /* TCP_NLA_RWND_LIMITED */
        size += nla_total_size_64bit(sizeof(u64));      /* TCP_NLA_SNDBUF_LIMITED */
        size += nla_total_size_64bit(sizeof(u64));      /* TCP_NLA_DATA_SEGS_OUT */
        size += nla_total_size_64bit(sizeof(u64));      /* TCP_NLA_TOTAL_RETRANS */
        size += nla_total_size_64bit(sizeof(u64));      /* TCP_NLA_PACING_RATE */
        size += nla_total_size_64bit(sizeof(u64));      /* TCP_NLA_DELIVERY_RATE */
        size += nla_total_size(sizeof(u32));            /* TCP_NLA_SND_CWND */
        size += nla_total_size(sizeof(u32));            /* TCP_NLA_REORDERING */
        size += nla_total_size(sizeof(u32));            /* TCP_NLA_MIN_RTT */
        size += nla_total_size(sizeof(u8));             /* TCP_NLA_RECUR_RETRANS */
        size += nla_total_size(sizeof(u8));             /* TCP_NLA_DELIVERY_RATE_APP_LMT */
        size += nla_total_size(sizeof(u32));            /* TCP_NLA_SNDQ_SIZE */
        size += nla_total_size(sizeof(u8));             /* TCP_NLA_CA_STATE */
        size += nla_total_size(sizeof(u32));            /* TCP_NLA_SND_SSTHRESH */
        size += nla_total_size(sizeof(u32));            /* TCP_NLA_DELIVERED */
        size += nla_total_size(sizeof(u32));            /* TCP_NLA_DELIVERED_CE */
        size += nla_total_size_64bit(sizeof(u64));      /* TCP_NLA_BYTES_SENT */
        size += nla_total_size_64bit(sizeof(u64));      /* TCP_NLA_BYTES_RETRANS */
        size += nla_total_size(sizeof(u32));            /* TCP_NLA_DSACK_DUPS */
        size += nla_total_size(sizeof(u32));            /* TCP_NLA_REORD_SEEN */
        size += nla_total_size(sizeof(u32));            /* TCP_NLA_SRTT */
        size += nla_total_size(sizeof(u16));            /* TCP_NLA_TIMEOUT_REHASH */
        size += nla_total_size(sizeof(u32));            /* TCP_NLA_BYTES_NOTSENT */
        size += nla_total_size_64bit(sizeof(u64));      /* TCP_NLA_EDT */
        size += nla_total_size(sizeof(u8));             /* TCP_NLA_TTL */
        size += nla_total_size(sizeof(u32));            /* TCP_NLA_REHASH */

        return size;
}

/* Pick the IPv4 TTL or the IPv6 hop limit out of an incoming skb,
 * depending on skb->protocol.  Any other protocol yields 0.
 */
static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
{
        switch (skb->protocol) {
        case htons(ETH_P_IP):
                return ip_hdr(skb)->ttl;
        case htons(ETH_P_IPV6):
                return ipv6_hdr(skb)->hop_limit;
        default:
                return 0;
        }
}

struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
                                               const struct sk_buff *orig_skb,
                                               const struct sk_buff *ack_skb)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *stats;
        struct tcp_info info;
        unsigned long rate;
        u64 rate64;

        stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
        if (!stats)
                return NULL;

        tcp_get_info_chrono_stats(tp, &info);
        nla_put_u64_64bit(stats, TCP_NLA_BUSY,
                          info.tcpi_busy_time, TCP_NLA_PAD);
        nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
                          info.tcpi_rwnd_limited, TCP_NLA_PAD);
        nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
                          info.tcpi_sndbuf_limited, TCP_NLA_PAD);
        nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
                          tp->data_segs_out, TCP_NLA_PAD);
        nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
                          tp->total_retrans, TCP_NLA_PAD);

        rate = READ_ONCE(sk->sk_pacing_rate);
        rate64 = (rate != ~0UL) ? rate : ~0ULL;
        nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);

        rate64 = tcp_compute_delivery_rate(tp);
        nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);

        nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp));
        nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
        nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));

        nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
        nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
        nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
        nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
        nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);

        nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
        nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);

        nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
                          TCP_NLA_PAD);
        nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
                          TCP_NLA_PAD);
        nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
        nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
        nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
        nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
        nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
                    max_t(int, 0, tp->write_seq - tp->snd_nxt));
        nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
                          TCP_NLA_PAD);
        if (ack_skb)
                nla_put_u8(stats, TCP_NLA_TTL,
                           tcp_skb_ttl_or_hop_limit(ack_skb));

        nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash);
        return stats;
}

/* Read a TCP-level socket option.
 *
 * The caller-supplied length is fetched from @optlen, rejected with -EINVAL
 * when negative, and clamped to sizeof(int).  Options that yield a single
 * integer store it in 'val' and break out of the switch; the common tail
 * then writes the clamped length back to @optlen and copies 'len' bytes of
 * 'val' to @optval.  Options with larger or variable-sized results re-read
 * @optlen themselves and return directly from their case.
 *
 * @level is only forwarded to the BPF getsockopt hook in the
 * TCP_ZEROCOPY_RECEIVE case.
 *
 * Returns 0 on success, -EFAULT when a copy to or from the caller fails,
 * -EINVAL or -EPERM when the option rejects the request, -ENOPROTOOPT for
 * an unknown @optname, or the result of the helper a case delegates to.
 */
int do_tcp_getsockopt(struct sock *sk, int level,
                      int optname, sockptr_t optval, sockptr_t optlen)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        int val, len;

        if (copy_from_sockptr(&len, optlen, sizeof(int)))
                return -EFAULT;

        if (len < 0)
                return -EINVAL;

        /* Scalar options never return more than one int. */
        len = min_t(unsigned int, len, sizeof(int));

        switch (optname) {
        case TCP_MAXSEG:
                /* Default is the cached MSS; a user-set MSS is reported
                 * only in CLOSE or LISTEN state, and repair mode overrides
                 * both with the MSS clamp.
                 */
                val = tp->mss_cache;
                if (tp->rx_opt.user_mss &&
                    ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
                        val = tp->rx_opt.user_mss;
                if (tp->repair)
                        val = tp->rx_opt.mss_clamp;
                break;
        case TCP_NODELAY:
                val = !!(tp->nonagle&TCP_NAGLE_OFF);
                break;
        case TCP_CORK:
                val = !!(tp->nonagle&TCP_NAGLE_CORK);
                break;
        case TCP_KEEPIDLE:
                val = keepalive_time_when(tp) / HZ;
                break;
        case TCP_KEEPINTVL:
                val = keepalive_intvl_when(tp) / HZ;
                break;
        case TCP_KEEPCNT:
                val = keepalive_probes(tp);
                break;
        case TCP_SYNCNT:
                /* Per-socket value when non-zero, else the netns sysctl. */
                val = READ_ONCE(icsk->icsk_syn_retries) ? :
                        READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
                break;
        case TCP_LINGER2:
                /* Negative values are reported unchanged; zero falls back
                 * to the netns fin_timeout sysctl before the division by HZ.
                 */
                val = READ_ONCE(tp->linger2);
                if (val >= 0)
                        val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
                break;
        case TCP_DEFER_ACCEPT:
                val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept);
                val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ,
                                      TCP_RTO_MAX / HZ);
                break;
        case TCP_WINDOW_CLAMP:
                val = READ_ONCE(tp->window_clamp);
                break;
        case TCP_INFO: {
                struct tcp_info info;

                /* Re-read the length: the copy taken at the top has been
                 * clamped to sizeof(int).
                 */
                if (copy_from_sockptr(&len, optlen, sizeof(int)))
                        return -EFAULT;

                tcp_get_info(sk, &info);

                len = min_t(unsigned int, len, sizeof(info));
                if (copy_to_sockptr(optlen, &len, sizeof(int)))
                        return -EFAULT;
                if (copy_to_sockptr(optval, &info, len))
                        return -EFAULT;
                return 0;
        }
        case TCP_CC_INFO: {
                const struct tcp_congestion_ops *ca_ops;
                union tcp_cc_info info;
                size_t sz = 0;
                int attr;

                if (copy_from_sockptr(&len, optlen, sizeof(int)))
                        return -EFAULT;

                /* sz stays 0 when the congestion control has no get_info
                 * hook, so nothing from 'info' is copied out in that case.
                 */
                ca_ops = icsk->icsk_ca_ops;
                if (ca_ops && ca_ops->get_info)
                        sz = ca_ops->get_info(sk, ~0U, &attr, &info);

                len = min_t(unsigned int, len, sz);
                if (copy_to_sockptr(optlen, &len, sizeof(int)))
                        return -EFAULT;
                if (copy_to_sockptr(optval, &info, len))
                        return -EFAULT;
                return 0;
        }
        case TCP_QUICKACK:
                val = !inet_csk_in_pingpong_mode(sk);
                break;

        case TCP_CONGESTION:
                if (copy_from_sockptr(&len, optlen, sizeof(int)))
                        return -EFAULT;
                len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
                if (copy_to_sockptr(optlen, &len, sizeof(int)))
                        return -EFAULT;
                if (copy_to_sockptr(optval, icsk->icsk_ca_ops->name, len))
                        return -EFAULT;
                return 0;

        case TCP_ULP:
                if (copy_from_sockptr(&len, optlen, sizeof(int)))
                        return -EFAULT;
                len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
                /* No ULP attached: report a zero length and no name. */
                if (!icsk->icsk_ulp_ops) {
                        len = 0;
                        if (copy_to_sockptr(optlen, &len, sizeof(int)))
                                return -EFAULT;
                        return 0;
                }
                if (copy_to_sockptr(optlen, &len, sizeof(int)))
                        return -EFAULT;
                if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len))
                        return -EFAULT;
                return 0;

        case TCP_FASTOPEN_KEY: {
                u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
                unsigned int key_len;

                if (copy_from_sockptr(&len, optlen, sizeof(int)))
                        return -EFAULT;

                /* tcp_fastopen_get_cipher()'s result is scaled by the
                 * per-key length to get the number of bytes in 'key'.
                 */
                key_len = tcp_fastopen_get_cipher(net, icsk, key) *
                                TCP_FASTOPEN_KEY_LENGTH;
                len = min_t(unsigned int, len, key_len);
                if (copy_to_sockptr(optlen, &len, sizeof(int)))
                        return -EFAULT;
                if (copy_to_sockptr(optval, key, len))
                        return -EFAULT;
                return 0;
        }
        case TCP_THIN_LINEAR_TIMEOUTS:
                val = tp->thin_lto;
                break;

        case TCP_THIN_DUPACK:
                /* Always reported as 0. */
                val = 0;
                break;

        case TCP_REPAIR:
                val = tp->repair;
                break;

        case TCP_REPAIR_QUEUE:
                if (tp->repair)
                        val = tp->repair_queue;
                else
                        return -EINVAL;
                break;

        case TCP_REPAIR_WINDOW: {
                struct tcp_repair_window opt;

                if (copy_from_sockptr(&len, optlen, sizeof(int)))
                        return -EFAULT;

                /* The caller's buffer must match the structure exactly. */
                if (len != sizeof(opt))
                        return -EINVAL;

                if (!tp->repair)
                        return -EPERM;

                opt.snd_wl1        = tp->snd_wl1;
                opt.snd_wnd        = tp->snd_wnd;
                opt.max_window        = tp->max_window;
                opt.rcv_wnd        = tp->rcv_wnd;
                opt.rcv_wup        = tp->rcv_wup;

                if (copy_to_sockptr(optval, &opt, len))
                        return -EFAULT;
                return 0;
        }
        case TCP_QUEUE_SEQ:
                /* Which sequence number is reported depends on the queue
                 * currently selected in tp->repair_queue.
                 */
                if (tp->repair_queue == TCP_SEND_QUEUE)
                        val = tp->write_seq;
                else if (tp->repair_queue == TCP_RECV_QUEUE)
                        val = tp->rcv_nxt;
                else
                        return -EINVAL;
                break;

        case TCP_USER_TIMEOUT:
                val = READ_ONCE(icsk->icsk_user_timeout);
                break;

        case TCP_FASTOPEN:
                val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen);
                break;

        case TCP_FASTOPEN_CONNECT:
                val = tp->fastopen_connect;
                break;

        case TCP_FASTOPEN_NO_COOKIE:
                val = tp->fastopen_no_cookie;
                break;

        case TCP_TX_DELAY:
                val = READ_ONCE(tp->tcp_tx_delay);
                break;

        case TCP_TIMESTAMP:
                /* The lowest bit of the result mirrors tp->tcp_usec_ts:
                 * forced to 1 when it is set, cleared otherwise.
                 */
                val = tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset);
                if (tp->tcp_usec_ts)
                        val |= 1;
                else
                        val &= ~1;
                break;
        case TCP_NOTSENT_LOWAT:
                val = READ_ONCE(tp->notsent_lowat);
                break;
        case TCP_INQ:
                val = tp->recvmsg_inq;
                break;
        case TCP_SAVE_SYN:
                val = tp->save_syn;
                break;
        case TCP_SAVED_SYN: {
                if (copy_from_sockptr(&len, optlen, sizeof(int)))
                        return -EFAULT;

                sockopt_lock_sock(sk);
                if (tp->saved_syn) {
                        /* Buffer too small: report the required length
                         * and fail with -EINVAL, keeping the saved SYN.
                         */
                        if (len < tcp_saved_syn_len(tp->saved_syn)) {
                                len = tcp_saved_syn_len(tp->saved_syn);
                                if (copy_to_sockptr(optlen, &len, sizeof(int))) {
                                        sockopt_release_sock(sk);
                                        return -EFAULT;
                                }
                                sockopt_release_sock(sk);
                                return -EINVAL;
                        }
                        len = tcp_saved_syn_len(tp->saved_syn);
                        if (copy_to_sockptr(optlen, &len, sizeof(int))) {
                                sockopt_release_sock(sk);
                                return -EFAULT;
                        }
                        if (copy_to_sockptr(optval, tp->saved_syn->data, len)) {
                                sockopt_release_sock(sk);
                                return -EFAULT;
                        }
                        /* The saved SYN is freed once it has been copied
                         * out successfully.
                         */
                        tcp_saved_syn_free(tp);
                        sockopt_release_sock(sk);
                } else {
                        /* Nothing saved: report a zero length. */
                        sockopt_release_sock(sk);
                        len = 0;
                        if (copy_to_sockptr(optlen, &len, sizeof(int)))
                                return -EFAULT;
                }
                return 0;
        }
#ifdef CONFIG_MMU
        case TCP_ZEROCOPY_RECEIVE: {
                struct scm_timestamping_internal tss;
                struct tcp_zerocopy_receive zc = {};
                int err;

                if (copy_from_sockptr(&len, optlen, sizeof(int)))
                        return -EFAULT;
                /* The caller must supply at least the fields up to and
                 * including 'length'.
                 */
                if (len < 0 ||
                    len < offsetofend(struct tcp_zerocopy_receive, length))
                        return -EINVAL;
                /* Caller's structure is larger than ours: continue only
                 * when check_zeroed_sockptr() returns a positive value for
                 * the extra tail, then truncate len to what we understand.
                 */
                if (unlikely(len > sizeof(zc))) {
                        err = check_zeroed_sockptr(optval, sizeof(zc),
                                                   len - sizeof(zc));
                        if (err < 1)
                                return err == 0 ? -EINVAL : err;
                        len = sizeof(zc);
                        if (copy_to_sockptr(optlen, &len, sizeof(int)))
                                return -EFAULT;
                }
                if (copy_from_sockptr(&zc, optval, len))
                        return -EFAULT;
                if (zc.reserved)
                        return -EINVAL;
                if (zc.msg_flags &  ~(TCP_VALID_ZC_MSG_FLAGS))
                        return -EINVAL;
                sockopt_lock_sock(sk);
                err = tcp_zerocopy_receive(sk, &zc, &tss);
                err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
                                                          &zc, &len, err);
                sockopt_release_sock(sk);
                /* Dispatch on how much of the structure the caller passed:
                 * the labels below fall through into each other, so a
                 * larger len fills in progressively more output fields.
                 */
                if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
                        goto zerocopy_rcv_cmsg;
                switch (len) {
                /* NOTE(review): this first case looks unreachable, since
                 * len is below offsetofend(msg_flags) after the test above.
                 */
                case offsetofend(struct tcp_zerocopy_receive, msg_flags):
                        goto zerocopy_rcv_cmsg;
                case offsetofend(struct tcp_zerocopy_receive, msg_controllen):
                case offsetofend(struct tcp_zerocopy_receive, msg_control):
                case offsetofend(struct tcp_zerocopy_receive, flags):
                case offsetofend(struct tcp_zerocopy_receive, copybuf_len):
                case offsetofend(struct tcp_zerocopy_receive, copybuf_address):
                case offsetofend(struct tcp_zerocopy_receive, err):
                        goto zerocopy_rcv_sk_err;
                case offsetofend(struct tcp_zerocopy_receive, inq):
                        goto zerocopy_rcv_inq;
                case offsetofend(struct tcp_zerocopy_receive, length):
                default:
                        goto zerocopy_rcv_out;
                }
zerocopy_rcv_cmsg:
                if (zc.msg_flags & TCP_CMSG_TS)
                        tcp_zc_finalize_rx_tstamp(sk, &zc, &tss);
                else
                        zc.msg_flags = 0;
zerocopy_rcv_sk_err:
                if (!err)
                        zc.err = sock_error(sk);
zerocopy_rcv_inq:
                zc.inq = tcp_inq_hint(sk);
zerocopy_rcv_out:
                /* The structure is copied back only when no error occurred. */
                if (!err && copy_to_sockptr(optval, &zc, len))
                        err = -EFAULT;
                return err;
        }
#endif
        case TCP_AO_REPAIR:
                if (!tcp_can_repair_sock(sk))
                        return -EPERM;
                return tcp_ao_get_repair(sk, optval, optlen);
        case TCP_AO_GET_KEYS:
        case TCP_AO_INFO: {
                int err;

                /* Both TCP-AO queries run under the socket lock. */
                sockopt_lock_sock(sk);
                if (optname == TCP_AO_GET_KEYS)
                        err = tcp_ao_get_mkts(sk, optval, optlen);
                else
                        err = tcp_ao_get_sock_info(sk, optval, optlen);
                sockopt_release_sock(sk);

                return err;
        }
        case TCP_IS_MPTCP:
                val = 0;
                break;
        default:
                return -ENOPROTOOPT;
        }

        /* Common tail for scalar options: len is at most sizeof(int). */
        if (copy_to_sockptr(optlen, &len, sizeof(int)))
                return -EFAULT;
        if (copy_to_sockptr(optval, &val, len))
                return -EFAULT;
        return 0;
}

/* Report whether a getsockopt request should bypass the generic BPF hook.
 *
 * True only for SOL_TCP / TCP_ZEROCOPY_RECEIVE: do_tcp_getsockopt() has an
 * optimized implementation for that option which avoids an extra socket
 * lock.
 */
bool tcp_bpf_bypass_getsockopt(int level, int optname)
{
        return level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE;
}
EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);

/* getsockopt() entry point for TCP sockets.
 *
 * SOL_TCP requests are served by do_tcp_getsockopt() with the user pointers
 * wrapped as sockptr_t; every other level is handed to the getsockopt
 * handler of the socket's address-family operations.
 */
int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
                   int __user *optlen)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        if (level == SOL_TCP)
                return do_tcp_getsockopt(sk, level, optname,
                                         USER_SOCKPTR(optval),
                                         USER_SOCKPTR(optlen));

        /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */
        return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname,
                                                        optval, optlen);
}
EXPORT_SYMBOL(tcp_getsockopt);

#ifdef CONFIG_TCP_MD5SIG
/* Sigpool id used for TCP-MD5 hashing; -1 until the first allocation. */
int tcp_md5_sigpool_id = -1;
EXPORT_SYMBOL_GPL(tcp_md5_sigpool_id);

/* Allocate the "md5" ahash sigpool, with scratch space for one
 * union tcp_md5sum_block plus one struct tcphdr, and publish its id in
 * tcp_md5_sigpool_id.
 *
 * Returns 0 on success or the negative value returned by
 * tcp_sigpool_alloc_ahash().
 */
int tcp_md5_alloc_sigpool(void)
{
        int id;

        id = tcp_sigpool_alloc_ahash("md5", sizeof(union tcp_md5sum_block) +
                                            sizeof(struct tcphdr));
        if (id < 0)
                return id;

        /* As long as any md5 sigpool was allocated, the return
         * id would stay the same. Re-write the id only for the case
         * when previously all MD5 keys were deleted and this call
         * allocates the first MD5 key, which may return a different
         * sigpool id than was used previously.
         */
        WRITE_ONCE(tcp_md5_sigpool_id, id); /* Avoids the compiler potentially being smart here */
        return 0;
}

/* Call tcp_sigpool_release() on the currently published MD5 sigpool id. */
void tcp_md5_release_sigpool(void)
{
        int id = READ_ONCE(tcp_md5_sigpool_id);

        tcp_sigpool_release(id);
}

/* Call tcp_sigpool_get() on the currently published MD5 sigpool id. */
void tcp_md5_add_sigpool(void)
{
        int id = READ_ONCE(tcp_md5_sigpool_id);

        tcp_sigpool_get(id);
}

int tcp_md5_hash_key(struct tcp_sigpool *hp,
                     const struct tcp_md5sig_key *key)
{
        u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */
        struct scatterlist sg;

        sg_init_one(&sg, key->key, keylen);
        ahash_request_set_crypt(hp->req, &sg, NULL, keylen);

        /* We use data_race() because tcp_md5_do_add() might change
         * key->key under us
         */
        return data_race(crypto_ahash_update(hp->req));
}
EXPORT_SYMBOL(tcp_md5_hash_key);

/* Validate the TCP-MD5 signature carried by an inbound segment.
 *
 * This gets called for each TCP segment that has TCP-MD5 option.
 * We have 3 drop cases:
 * o No MD5 hash and one expected.
 * o MD5 hash and we're not expecting one.
 * o MD5 hash and its wrong.
 *
 * Returns SKB_NOT_DROPPED_YET when the signature matches, otherwise the
 * drop reason.  Called with rcu_read_lock().
 */
static enum skb_drop_reason
tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
                     const void *saddr, const void *daddr,
                     int family, int l3index, const __u8 *hash_location)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_md5sig_key *key;
        u8 computed[16];
        int hash_err;

        key = tcp_md5_do_lookup(sk, l3index, saddr, family);

        /* Segment is signed but no key is configured for this peer. */
        if (hash_location && !key) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
                trace_tcp_hash_md5_unexpected(sk, skb);
                return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
        }

        /* Check the signature.
         * To support dual stack listeners, we need to handle
         * IPv4-mapped case.
         */
        if (family == AF_INET)
                hash_err = tcp_v4_md5_hash_skb(computed, key, NULL, skb);
        else
                hash_err = tp->af_specific->calc_md5_hash(computed, key,
                                                          NULL, skb);

        /* Accept only when hashing succeeded and all 16 bytes agree. */
        if (!hash_err && memcmp(hash_location, computed, 16) == 0)
                return SKB_NOT_DROPPED_YET;

        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
        trace_tcp_hash_md5_mismatch(sk, skb);
        return SKB_DROP_REASON_TCP_MD5FAILURE;
}
#else
/* Stub used when CONFIG_TCP_MD5SIG is not defined: performs no MD5
 * validation and never asks for the segment to be dropped.
 */
static inline enum skb_drop_reason
tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
                     const void *saddr, const void *daddr,
                     int family, int l3index, const __u8 *hash_location)
{
        return SKB_NOT_DROPPED_YET;
}

#endif

/* Authenticate an inbound segment against the TCP-MD5 / TCP-AO
 * configuration of @sk (and of @req, when the segment belongs to a
 * request socket).
 *
 * Returns SKB_NOT_DROPPED_YET when the segment may be processed further,
 * otherwise the reason it must be dropped.
 *
 * Called with rcu_read_lock()
 */
enum skb_drop_reason
tcp_inbound_hash(struct sock *sk, const struct request_sock *req,
                 const struct sk_buff *skb,
                 const void *saddr, const void *daddr,
                 int family, int dif, int sdif)
{
        const struct tcp_ao_hdr *aoh;
        const __u8 *md5_location;
        int l3index = 0;

        /* Invalid option or two times meet any of auth options */
        if (tcp_parse_auth_options(tcp_hdr(skb), &md5_location, &aoh)) {
                trace_tcp_hash_bad_header(sk, skb);
                return SKB_DROP_REASON_TCP_AUTH_HDR;
        }

        /* For a request socket, the presence of a TCP-AO header on the
         * segment has to agree with tcp_rsk_used_ao(req).
         */
        if (req && tcp_rsk_used_ao(req) != !!aoh) {
                u8 keyid = 0, rnext = 0, maclen = 0;

                if (aoh) {
                        keyid = aoh->keyid;
                        rnext = aoh->rnext_keyid;
                        maclen = tcp_ao_hdr_maclen(aoh);
                }

                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
                trace_tcp_ao_handshake_failure(sk, skb, keyid, rnext, maclen);
                return SKB_DROP_REASON_TCP_AOFAILURE;
        }

        /* sdif set, means packet ingressed via a device
         * in an L3 domain and dif is set to the l3mdev
         */
        if (sdif)
                l3index = dif;

        /* Fast path: unsigned segments */
        if (likely(!md5_location && !aoh)) {
                /* Drop if there's TCP-MD5 or TCP-AO key with any rcvid/sndid
                 * for the remote peer. On TCP-AO established connection
                 * the last key is impossible to remove, so there's
                 * always at least one current_key.
                 */
                if (tcp_ao_required(sk, saddr, family, l3index, true)) {
                        trace_tcp_hash_ao_required(sk, skb);
                        return SKB_DROP_REASON_TCP_AONOTFOUND;
                }
                if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) {
                        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
                        trace_tcp_hash_md5_required(sk, skb);
                        return SKB_DROP_REASON_TCP_MD5NOTFOUND;
                }
                return SKB_NOT_DROPPED_YET;
        }

        /* Signed segment: a TCP-AO header takes the AO path, anything
         * else is verified as TCP-MD5.
         */
        if (aoh)
                return tcp_inbound_ao_hash(sk, skb, family, req, l3index, aoh);

        return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family,
                                    l3index, md5_location);
}
EXPORT_SYMBOL_GPL(tcp_inbound_hash);

/* Finish a TCP socket: move it to TCP_CLOSE, stop its transmit timers,
 * detach any fastopen request, mark both directions shut down, and then
 * either notify the owner or destroy the socket if it is already
 * SOCK_DEAD.
 */
void tcp_done(struct sock *sk)
{
        struct request_sock *fastopen_req;

        /* We might be called with a new socket, after
         * inet_csk_prepare_forced_close() has been called
         * so we can not use lockdep_sock_is_held(sk)
         */
        fastopen_req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);

        /* Closing before the handshake completed counts as a failed
         * connection attempt.
         */
        switch (sk->sk_state) {
        case TCP_SYN_SENT:
        case TCP_SYN_RECV:
                TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
                break;
        default:
                break;
        }

        tcp_set_state(sk, TCP_CLOSE);
        tcp_clear_xmit_timers(sk);
        if (fastopen_req)
                reqsk_fastopen_remove(sk, fastopen_req, false);

        WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);

        if (sock_flag(sk, SOCK_DEAD))
                inet_csk_destroy_sock(sk);
        else
                sk->sk_state_change(sk);
}
EXPORT_SYMBOL_GPL(tcp_done);

/* Abort the connection behind @sk, finishing it with error @err.
 *
 * Request sockets (TCP_NEW_SYN_RECV) are dropped from their listener's
 * queue and TIME_WAIT sockets are descheduled; both cases return early
 * without taking the socket lock.  For any other state the socket is
 * locked (unless running in a BPF context), a listener is closed and
 * stopped, and - unless the socket is already SOCK_DEAD - an active reset
 * is sent when tcp_need_reset() asks for one before the socket is finished
 * through tcp_done_with_error().  The write queue is purged afterwards.
 *
 * Always returns 0.
 */
int tcp_abort(struct sock *sk, int err)
{
        int state = inet_sk_state_load(sk);

        if (state == TCP_NEW_SYN_RECV) {
                struct request_sock *req = inet_reqsk(sk);

                local_bh_disable();
                inet_csk_reqsk_queue_drop(req->rsk_listener, req);
                local_bh_enable();
                return 0;
        }
        if (state == TCP_TIME_WAIT) {
                struct inet_timewait_sock *tw = inet_twsk(sk);

                /* NOTE(review): presumably this extra reference is the one
                 * consumed by inet_twsk_deschedule_put() - confirm.
                 */
                refcount_inc(&tw->tw_refcnt);
                local_bh_disable();
                inet_twsk_deschedule_put(tw);
                local_bh_enable();
                return 0;
        }

        /* BPF context ensures sock locking. */
        if (!has_current_bpf_ctx())
                /* Don't race with userspace socket closes such as tcp_close. */
                lock_sock(sk);

        /* sk_state is re-read here rather than reusing the 'state'
         * snapshot taken before the lock.
         */
        if (sk->sk_state == TCP_LISTEN) {
                tcp_set_state(sk, TCP_CLOSE);
                inet_csk_listen_stop(sk);
        }

        /* Don't race with BH socket closes such as inet_csk_listen_stop. */
        local_bh_disable();
        bh_lock_sock(sk);

        if (!sock_flag(sk, SOCK_DEAD)) {
                if (tcp_need_reset(sk->sk_state))
                        tcp_send_active_reset(sk, GFP_ATOMIC,
                                              SK_RST_REASON_NOT_SPECIFIED);
                tcp_done_with_error(sk, err);
        }

        bh_unlock_sock(sk);
        local_bh_enable();
        tcp_write_queue_purge(sk);
        /* Mirrors the conditional lock_sock() above. */
        if (!has_current_bpf_ctx())
                release_sock(sk);
        return 0;
}
EXPORT_SYMBOL_GPL(tcp_abort);

extern struct tcp_congestion_ops tcp_reno;

/* Value of the "thash_entries=" boot parameter. */
static __initdata unsigned long thash_entries;

/* Parse the "thash_entries=" boot parameter into thash_entries.
 *
 * Returns 1 when @str was parsed by kstrtoul() (base auto-detected),
 * 0 when @str is NULL or does not parse.
 */
static int __init set_thash_entries(char *str)
{
        if (!str)
                return 0;

        return kstrtoul(str, 0, &thash_entries) ? 0 : 1;
}
__setup("thash_entries=", set_thash_entries);

/* Derive the three sysctl_tcp_mem[] thresholds from the number of free
 * buffer pages.  The middle value is 1/16 of nr_free_buffer_pages() with a
 * floor of 128; the first is 3/4 of that and the last is twice the first.
 */
static void __init tcp_init_mem(void)
{
        unsigned long base = max(nr_free_buffer_pages() / 16, 128UL);

        sysctl_tcp_mem[0] = base / 4 * 3;                 /* 4.68 % */
        sysctl_tcp_mem[1] = base;                         /* 6.25 % */
        sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;        /* 9.37 % */
}

/*
 * Layout checks for struct tcp_sock.
 *
 * Each section below names one field group of struct tcp_sock
 * (tcp_sock_read_tx, tcp_sock_read_txrx, ...), lists the members that are
 * expected to belong to that group and then pins the size of the group.
 *
 * NOTE(review): the CACHELINE_ASSERT_GROUP_* macros are defined outside
 * this chunk; presumably they are build-time assertions and the sizes are
 * in bytes - confirm against their definitions.
 */
static void __init tcp_struct_check(void)
{
        /* TX read-mostly hotpath cache lines */
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, rcv_ssthresh);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, lost_skb_hint);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint);
        CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 40);

        /* TXRX read-mostly hotpath cache lines */
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_wnd);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, mss_cache);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_cwnd);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, prr_out);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio);
        CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_txrx, 32);

        /* RX read-mostly hotpath cache lines */
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rcv_tstamp);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, retrans_out);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, advmss);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, urg_data);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, lost);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh);
        CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69);

        /* TX read-write hotpath cache lines */
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, data_segs_out);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, bytes_sent);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, snd_sml);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_start);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_stat);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, write_seq);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, pushed_seq);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
        CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89);

        /* TXRX read-write hotpath cache lines */
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_clock_cache);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_mstamp);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, window_clamp);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, srtt_us);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, packets_out);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);

        /* 32bit arches with 8byte alignment on u64 fields might need padding
         * before tcp_clock_cache.
         */
        CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 92 + 4);

        /* RX read-write hotpath cache lines */
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, data_segs_in);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_wup);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, max_packets_out);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, cwnd_usage_seq);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space);
        CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 99);
}

/*
 * Boot-time initialisation of the TCP layer.
 *
 * In order: runs the build-time and struct layout checks, sets up the
 * socket counter and the orphan timer, creates the bind bucket slab
 * caches, allocates and initialises the established and bind hash tables
 * of tcp_hashinfo, derives the default orphan and memory limits for
 * init_net, and finally initialises the remaining TCP sub-components and
 * registers tcp_reno.
 */
void __init tcp_init(void)
{
        int max_rshare, max_wshare, cnt;
        unsigned long limit;
        unsigned int i;

        /* Build-time checks; the second requires tcp_skb_cb to fit in sk_buff's cb field. */
        BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
        BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
                     sizeof_field(struct sk_buff, cb));

        tcp_struct_check();

        percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);

        /* Deferrable orphan timer, first expiry one TCP_ORPHAN_TIMER_PERIOD from now. */
        timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
        mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);

        inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
                            thash_entries, 21,  /* one slot per 2 MB */
                            0, 64 * 1024);
        /*
         * NOTE(review): the results are not NULL-checked; presumably
         * SLAB_PANIC makes a failed cache creation panic - confirm.
         */
        tcp_hashinfo.bind_bucket_cachep =
                kmem_cache_create("tcp_bind_bucket",
                                  sizeof(struct inet_bind_bucket), 0,
                                  SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                  SLAB_ACCOUNT,
                                  NULL);
        tcp_hashinfo.bind2_bucket_cachep =
                kmem_cache_create("tcp_bind2_bucket",
                                  sizeof(struct inet_bind2_bucket), 0,
                                  SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                  SLAB_ACCOUNT,
                                  NULL);

        /* Size and allocate the main established and bind bucket
         * hash tables.
         *
         * The methodology is similar to that of the buffer cache.
         */
        tcp_hashinfo.ehash =
                alloc_large_system_hash("TCP established",
                                        sizeof(struct inet_ehash_bucket),
                                        thash_entries,
                                        17, /* one slot per 128 KB of memory */
                                        0,
                                        NULL,
                                        &tcp_hashinfo.ehash_mask,
                                        0,
                                        thash_entries ? 0 : 512 * 1024);
        /* ehash_mask + 1 buckets; each chain head is initialised with its own index. */
        for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
                INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);

        if (inet_ehash_locks_alloc(&tcp_hashinfo))
                panic("TCP: failed to alloc ehash_locks");
        /*
         * Each slot is sized for two inet_bind_hashbucket entries so that
         * this single allocation backs both bhash and bhash2 (see below).
         */
        tcp_hashinfo.bhash =
                alloc_large_system_hash("TCP bind",
                                        2 * sizeof(struct inet_bind_hashbucket),
                                        tcp_hashinfo.ehash_mask + 1,
                                        17, /* one slot per 128 KB of memory */
                                        0,
                                        &tcp_hashinfo.bhash_size,
                                        NULL,
                                        0,
                                        64 * 1024);
        /* bhash_size was filled in as a shift count; turn it into an entry count. */
        tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
        /* bhash2 occupies the second half of the allocation made for bhash. */
        tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size;
        for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
                spin_lock_init(&tcp_hashinfo.bhash[i].lock);
                INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
                spin_lock_init(&tcp_hashinfo.bhash2[i].lock);
                INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
        }

        tcp_hashinfo.pernet = false;

        /* Default orphan limit: half the number of established hash buckets. */
        cnt = tcp_hashinfo.ehash_mask + 1;
        sysctl_tcp_max_orphans = cnt / 2;

        tcp_init_mem();
        /* Set per-socket limits to no more than 1/128 the pressure threshold */
        limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
        max_wshare = min(4UL*1024*1024, limit);
        max_rshare = min(6UL*1024*1024, limit);

        /* Default tcp_wmem values for init_net. */
        init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
        init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
        init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);

        /* Default tcp_rmem values for init_net. */
        init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE;
        init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
        init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);

        pr_info("Hash tables configured (established %u bind %u)\n",
                tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);

        tcp_v4_init();
        tcp_metrics_init();
        /* Failing to register tcp_reno is treated as fatal. */
        BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
        tcp_tasklet_init();
        mptcp_init();
}































































    1 
    1 







































































































































































































































































































































































































































































    3 











































































































































































































































































































































































































































































































































































































































































































































    1 

    1 




































































































































    1 




    1 


    1 







    1 

















    1 
    1 







    1 





















    1 


    1 
    1 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
 *
 * Added 32k buffer block sizes - these are required on older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 */

#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
#include <linux/pagevec.h>
#include <linux/sched/mm.h>
#include <trace/events/block.h>
#include <linux/fscrypt.h>
#include <linux/fsverity.h>
#include <linux/sched/isolation.h>

#include "internal.h"

/* Forward declarations for helpers defined later in this file. */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
                          enum rw_hint hint, struct writeback_control *wbc);

/* Map a list node embedded as b_assoc_buffers back to its buffer_head. */
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)

/*
 * Emit the block_touch_buffer trace event for @bh and mark the folio it
 * points to (bh->b_folio) as accessed.
 */
inline void touch_buffer(struct buffer_head *bh)
{
        trace_block_touch_buffer(bh);
        folio_mark_accessed(bh->b_folio);
}
EXPORT_SYMBOL(touch_buffer);

/*
 * Take the buffer lock by acquiring the BH_Lock bit in bh->b_state through
 * wait_on_bit_lock_io(), waiting in TASK_UNINTERRUPTIBLE state.
 */
void __lock_buffer(struct buffer_head *bh)
{
        wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);

/*
 * Release the buffer lock: clear the BH_Lock bit in bh->b_state, then wake
 * up anything waiting on that bit.
 */
void unlock_buffer(struct buffer_head *bh)
{
        clear_bit_unlock(BH_Lock, &bh->b_state);
        /*
         * NOTE(review): presumably orders the bit clear before
         * wake_up_bit() looks for waiters - confirm.
         */
        smp_mb__after_atomic();
        wake_up_bit(&bh->b_state, BH_Lock);
}
EXPORT_SYMBOL(unlock_buffer);

/*
 * Report through @dirty and @writeback whether the locked @folio has dirty
 * or writeback buffers.  If all the buffers are unlocked and clean then the
 * folio_test_dirty information is stale.  If any of the buffers are locked,
 * it is assumed they are locked for IO.
 *
 * Both outputs are false when the folio has no buffers attached.
 */
void buffer_check_dirty_writeback(struct folio *folio,
                                     bool *dirty, bool *writeback)
{
        struct buffer_head *first, *cur;

        *dirty = false;
        *writeback = false;

        BUG_ON(!folio_test_locked(folio));

        first = folio_buffers(folio);
        if (!first)
                return;

        if (folio_test_writeback(folio))
                *writeback = true;

        /* The buffers form a ring through b_this_page; visit each one once. */
        cur = first;
        for (;;) {
                if (buffer_locked(cur))
                        *writeback = true;

                if (buffer_dirty(cur))
                        *dirty = true;

                cur = cur->b_this_page;
                if (cur == first)
                        break;
        }
}

/*
 * Sleep until the buffer is no longer locked.  Nothing prevents it from
 * being locked again right afterwards - callers that need the buffer's
 * state to stay put must take the lock themselves.
 */
void __wait_on_buffer(struct buffer_head *bh)
{
        unsigned long *state = &bh->b_state;

        wait_on_bit_io(state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__wait_on_buffer);

/*
 * Log a rate-limited "Buffer I/O error" message for @bh.
 *
 * @bh:  buffer whose device (b_bdev) and logical block (b_blocknr) are
 *       reported
 * @msg: suffix printed verbatim after the block number; it is only read,
 *       hence const (callers pass string literals)
 *
 * Nothing is printed when BH_Quiet is set in the buffer's state.
 */
static void buffer_io_error(struct buffer_head *bh, const char *msg)
{
        if (test_bit(BH_Quiet, &bh->b_state))
                return;

        printk_ratelimited(KERN_ERR
                "Buffer I/O error on dev %pg, logical block %llu%s\n",
                bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
}

/*
 * Helper for end-of-IO read handlers that must not touch the bh once it
 * has been unlocked.
 *
 * Strictly speaking unlock_buffer() does use the bh after clearing the lock
 * bit, but only its address (for hashing), never its contents, so that race
 * is benign.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
        /* Failure is expected now and then, e.g. failed read-ahead attempts. */
        if (!uptodate)
                clear_buffer_uptodate(bh);
        else
                set_buffer_uptodate(bh);

        unlock_buffer(bh);
}

/*
 * Default synchronous end-of-IO handler for reads.
 *
 * Sets or clears the buffer's up-to-date flag according to @uptodate and
 * unlocks it (both via __end_buffer_read_notouch()), then drops one
 * reference on @bh with put_bh().
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
        __end_buffer_read_notouch(bh, uptodate);
        /* NOTE(review): presumably the reference taken by the submitter - confirm. */
        put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);

/*
 * Synchronous end-of-IO handler for writes.
 *
 * On success the buffer is marked up-to-date.  On failure the error is
 * logged, recorded with mark_buffer_write_io_error() and the up-to-date flag
 * is cleared.  Either way the buffer is then unlocked and one reference is
 * dropped.
 */
void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
        if (!uptodate) {
                buffer_io_error(bh, ", lost sync page write");
                mark_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
        } else {
                set_buffer_uptodate(bh);
        }

        unlock_buffer(bh);
        put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);

/*
 * Look up the buffer_head for @block in the page cache of @bdev.
 *
 * Various filesystems appear to want __find_get_block to be non-blocking.
 * But it's the page lock which protects the buffers.  To get around this,
 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 * i_private_lock.
 *
 * Hack idea: for the blockdev mapping, i_private_lock contention
 * may be quite high.  This code could TryLock the page, and if that
 * succeeds, there is no need to take i_private_lock.
 *
 * Returns the matching buffer_head with an extra reference taken via
 * get_bh(), or NULL if the folio is not in the page cache, has no buffers,
 * or none of its mapped buffers has b_blocknr == @block.
 */
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
        struct address_space *bd_mapping = bdev->bd_mapping;
        const int blkbits = bd_mapping->host->i_blkbits;
        struct buffer_head *ret = NULL;
        pgoff_t index;
        struct buffer_head *bh;
        struct buffer_head *head;
        struct folio *folio;
        int all_mapped = 1;
        /* Rate limit for the diagnostic below: interval HZ, burst 1. */
        static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);

        /* Byte offset of the block on the device, converted to a page index. */
        index = ((loff_t)block << blkbits) / PAGE_SIZE;
        folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
        if (IS_ERR(folio))
                goto out;

        spin_lock(&bd_mapping->i_private_lock);
        head = folio_buffers(folio);
        if (!head)
                goto out_unlock;
        /* Walk the folio's ring of buffers looking for a mapped match. */
        bh = head;
        do {
                if (!buffer_mapped(bh))
                        all_mapped = 0;
                else if (bh->b_blocknr == block) {
                        ret = bh;
                        get_bh(bh);
                        goto out_unlock;
                }
                bh = bh->b_this_page;
        } while (bh != head);

        /* we might be here because some of the buffers on this page are
         * not mapped.  This is due to various races between
         * file io on the block device and getblk.  It gets dealt with
         * elsewhere, don't buffer_error if we had some unmapped buffers
         */
        ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
        if (all_mapped && __ratelimit(&last_warned)) {
                /*
                 * The loop above ran to completion, so bh == head here: the
                 * fields printed are those of the folio's first buffer.
                 */
                printk("__find_get_block_slow() failed. block=%llu, "
                       "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
                       "device %pg blocksize: %d\n",
                       (unsigned long long)block,
                       (unsigned long long)bh->b_blocknr,
                       bh->b_state, bh->b_size, bdev,
                       1 << blkbits);
        }
out_unlock:
        spin_unlock(&bd_mapping->i_private_lock);
        /* Drop the folio reference obtained from __filemap_get_folio(). */
        folio_put(folio);
out:
        return ret;
}

/*
 * Finish an asynchronous read of @bh.
 *
 * Updates the buffer's up-to-date flag according to @uptodate (logging and
 * flagging the folio with an error on failure), clears BH_Async_Read and
 * unlocks the buffer.  If no other buffer of the folio still has
 * BH_Async_Read set, the folio read is completed with folio_end_read(),
 * reporting success only if every buffer of the folio is up to date.
 *
 * @bh must have BH_Async_Read set on entry (enforced with BUG_ON).
 */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct folio *folio;
        int folio_uptodate = 1;

        BUG_ON(!buffer_async_read(bh));

        folio = bh->b_folio;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                clear_buffer_uptodate(bh);
                buffer_io_error(bh, ", async page read");
                folio_set_error(folio);
        }

        /*
         * Be _very_ careful from here on. Bad things can happen if
         * two buffer heads end IO at almost the same time and both
         * decide that the page is now completely done.
         */
        first = folio_buffers(folio);
        /* The first buffer's b_uptodate_lock serialises completions for the folio. */
        spin_lock_irqsave(&first->b_uptodate_lock, flags);
        clear_buffer_async_read(bh);
        unlock_buffer(bh);
        /* Scan the whole ring, starting with (and including) this buffer. */
        tmp = bh;
        do {
                if (!buffer_uptodate(tmp))
                        folio_uptodate = 0;
                if (buffer_async_read(tmp)) {
                        /* Another buffer is still under async read. */
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;
                }
                tmp = tmp->b_this_page;
        } while (tmp != bh);
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);

        /* This was the last outstanding buffer: complete the folio read. */
        folio_end_read(folio, folio_uptodate);
        return;

still_busy:
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        return;
}

/*
 * Context for deferred post-read processing of one buffer.  It is allocated
 * in end_buffer_async_read_io() and freed by decrypt_bh() or verify_bh().
 */
struct postprocess_bh_ctx {
        struct work_struct work;        /* work item; handler is decrypt_bh or verify_bh */
        struct buffer_head *bh;         /* buffer whose read is being post-processed */
};

/*
 * Work handler: run fsverity verification over the data of the buffer
 * recorded in the enclosing postprocess_bh_ctx, finish the async read with
 * the verification result, and free the context.
 */
static void verify_bh(struct work_struct *work)
{
        struct postprocess_bh_ctx *ctx;
        struct buffer_head *bh;
        bool ok;

        ctx = container_of(work, struct postprocess_bh_ctx, work);
        bh = ctx->bh;

        ok = fsverity_verify_blocks(bh->b_folio, bh->b_size, bh_offset(bh));
        end_buffer_async_read(bh, ok);

        kfree(ctx);
}

static bool need_fsverity(struct buffer_head *bh)
{
        struct folio *folio = bh->b_folio;
        struct inode *inode = folio->mapping->host;

        return fsverity_active(inode) &&
                /* needed by ext4 */
                folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
}

/*
 * Work handler: decrypt the buffer's data in the page cache.  If that
 * succeeds and the buffer also needs fsverity, the same context is re-queued
 * for verification; otherwise the async read is finished here and the
 * context is freed.
 */
static void decrypt_bh(struct work_struct *work)
{
        struct postprocess_bh_ctx *ctx =
                container_of(work, struct postprocess_bh_ctx, work);
        struct buffer_head *bh = ctx->bh;
        int err;

        err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
                                               bh_offset(bh));
        if (err != 0 || !need_fsverity(bh)) {
                /* Nothing further to do: report the decryption outcome. */
                end_buffer_async_read(bh, err == 0);
                kfree(ctx);
                return;
        }

        /*
         * Decryption and verity run on different work queues: verity may
         * have to read metadata pages that themselves need decryption, and
         * we must not recurse into the same workqueue.
         */
        INIT_WORK(&ctx->work, verify_bh);
        fsverity_enqueue_verify_work(&ctx->work);
}

/*
 * I/O completion handler for block_read_full_folio() - pages which come
 * unlocked at the end of I/O.
 *
 * A successful read that needs decryption (fscrypt) and/or verification
 * (fsverity) is handed to a work item.  If the work context cannot be
 * allocated the read is completed as failed.  All other reads are completed
 * directly with the status passed in.
 */
static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
{
        struct inode *inode = bh->b_folio->mapping->host;
        bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
        bool verify = need_fsverity(bh);
        struct postprocess_bh_ctx *ctx;

        /* Failed read, or no post-processing required: finish right away. */
        if (!uptodate || (!decrypt && !verify)) {
                end_buffer_async_read(bh, uptodate);
                return;
        }

        ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
        if (!ctx) {
                end_buffer_async_read(bh, 0);
                return;
        }

        ctx->bh = bh;
        /* Decryption goes first; decrypt_bh() chains to verity if needed. */
        if (decrypt) {
                INIT_WORK(&ctx->work, decrypt_bh);
                fscrypt_enqueue_decrypt_work(&ctx->work);
        } else {
                INIT_WORK(&ctx->work, verify_bh);
                fsverity_enqueue_verify_work(&ctx->work);
        }
}

/*
 * Completion handler for block_write_full_folio() - folios which are unlocked
 * during I/O, and which have the writeback flag cleared upon I/O completion.
 *
 * Updates the buffer state according to @uptodate (on failure: logs the
 * error, records it with mark_buffer_write_io_error() and flags the folio),
 * clears BH_Async_Write and unlocks the buffer.  Writeback on the folio is
 * ended once no other buffer of the folio has BH_Async_Write set.
 *
 * @bh must have BH_Async_Write set on entry (enforced with BUG_ON).
 */
static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct folio *folio;

        BUG_ON(!buffer_async_write(bh));

        folio = bh->b_folio;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                buffer_io_error(bh, ", lost async page write");
                mark_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
                folio_set_error(folio);
        }

        first = folio_buffers(folio);
        /* The first buffer's b_uptodate_lock serialises completions for the folio. */
        spin_lock_irqsave(&first->b_uptodate_lock, flags);

        clear_buffer_async_write(bh);
        unlock_buffer(bh);
        /* Check every other buffer in the ring for writes still in flight. */
        tmp = bh->b_this_page;
        while (tmp != bh) {
                if (buffer_async_write(tmp)) {
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;
                }
                tmp = tmp->b_this_page;
        }
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        /* This was the last outstanding buffer: writeback is finished. */
        folio_end_writeback(folio);
        return;

still_busy:
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        return;
}

/*
 * If a page's buffers are under async readin (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone from starting new async I/O reads against any
 * of the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 *
 * mark_buffer_async_read() installs end_buffer_async_read_io() as the
 * buffer's completion handler and sets BH_Async_Read on it.
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
        bh->b_end_io = end_buffer_async_read_io;
        set_buffer_async_read(bh);
}

/*
 * Install @handler as the completion callback of @bh and set BH_Async_Write
 * on the buffer.
 */
static void mark_buffer_async_write_endio(struct buffer_head *bh,
                                          bh_end_io_t *handler)
{
        bh->b_end_io = handler;
        set_buffer_async_write(bh);
}

/*
 * Mark @bh as being under async write, with end_buffer_async_write() as its
 * completion handler.
 */
void mark_buffer_async_write(struct buffer_head *bh)
{
        mark_buffer_async_write_endio(bh, end_buffer_async_write);
}
EXPORT_SYMBOL(mark_buffer_async_write);


/*
 * fs/buffer.c contains helper functions for buffer-backed address space's
 * fsync functions.  A common requirement for buffer-based filesystems is
 * that certain data from the backing blockdev needs to be written out for
 * a successful fsync().  For example, ext2 indirect blocks need to be
 * written back and waited upon before fsync() returns.
 *
 * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->i_private_list.
 *
 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 * from their controlling inode's queue when they are being freed.  But
 * try_to_free_buffers() will be operating against the *blockdev* mapping
 * at the time, not against the S_ISREG file which depends on those buffers.
 * So the locking for i_private_list is via the i_private_lock in the
 * address_space which backs the buffers, which is different from the
 * address_space against which the buffers are listed.  So for a particular
 * address_space, mapping->i_private_lock does *not* protect
 * mapping->i_private_list!  In fact, mapping->i_private_list will always be
 * protected by the backing blockdev's ->i_private_lock.
 *
 * Which introduces a requirement: all buffers on an address_space's
 * ->i_private_list must be from the same address_space: the blockdev's.
 *
 * address_spaces which do not place buffers at ->i_private_list via these
 * utility functions are free to use i_private_lock and i_private_list for
 * whatever they want.  The only requirement is that list_empty(i_private_list)
 * be true at clear_inode() time.
 *
 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 * filesystems should do that.  invalidate_inode_buffers() should just go
 * BUG_ON(!list_empty).
 *
 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 * take an address_space, not an inode.  And it should be called
 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 * queued up.
 *
 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 * list if it is already on a list.  Because if the buffer is on a list,
 * it *must* already be on the right one.  If not, the filesystem is being
 * silly.  This will save a ton of locking.  But first we have to ensure
 * that buffers are taken *off* the old inode's list when they are freed
 * (presumably in truncate).  That requires careful auditing of all
 * filesystems (do it inside bforget()).  It could also be done by bringing
 * b_inode back.
 */

/*
 * Take @bh off the associated-buffers list it is on and clear its
 * b_assoc_map back-pointer.  Warns if b_assoc_map was already NULL.
 *
 * The buffer's backing address_space's i_private_lock must be held
 */
static void __remove_assoc_queue(struct buffer_head *bh)
{
        /* list_del_init() leaves the node as an empty list of its own. */
        list_del_init(&bh->b_assoc_buffers);
        WARN_ON(!bh->b_assoc_map);
        bh->b_assoc_map = NULL;
}

/*
 * Return 1 if @inode has buffers queued on its i_data.i_private_list,
 * 0 if that list is empty.
 */
int inode_has_buffers(struct inode *inode)
{
        struct list_head *assoc = &inode->i_data.i_private_list;

        return list_empty(assoc) ? 0 : 1;
}

/*
 * osync is designed to support O_SYNC io.  It waits synchronously for
 * all already-submitted IO to complete, but does not queue any new
 * writes to the disk.
 *
 * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
 * as you dirty the buffers, and then use osync_inode_buffers to wait for
 * completion.  Any other dirty buffers which are not yet queued for
 * write will not be flushed to disk by the osync.
 *
 * @lock: spinlock protecting @list; taken and released here
 * @list: list of buffers linked through b_assoc_buffers
 *
 * Returns 0, or -EIO if any buffer that was waited on is not up to date
 * afterwards.
 */
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
        struct buffer_head *bh;
        struct list_head *p;
        int err = 0;

        spin_lock(lock);
repeat:
        list_for_each_prev(p, list) {
                bh = BH_ENTRY(p);
                if (buffer_locked(bh)) {
                        /* Hold a reference so bh stays valid while unlocked. */
                        get_bh(bh);
                        spin_unlock(lock);
                        wait_on_buffer(bh);
                        if (!buffer_uptodate(bh))
                                err = -EIO;
                        brelse(bh);
                        spin_lock(lock);
                        /*
                         * The list lock was dropped, so the walk is restarted
                         * from scratch rather than continued.
                         */
                        goto repeat;
                }
        }
        spin_unlock(lock);
        return err;
}

/**
 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 * @mapping: the mapping which wants those buffers written
 *
 * Starts I/O against the buffers at mapping->i_private_list, and waits upon
 * that I/O.
 *
 * Basically, this is a convenience function for fsync().
 * @mapping is a file or directory which needs those buffers to be written for
 * a successful fsync().
 *
 * Return: 0 if @mapping has no backing buffer mapping recorded in
 * i_private_data or its i_private_list is empty; otherwise the result of
 * fsync_buffers_list().
 */
int sync_mapping_buffers(struct address_space *mapping)
{
        struct address_space *buffer_mapping = mapping->i_private_data;

        if (!buffer_mapping)
                return 0;
        if (list_empty(&mapping->i_private_list))
                return 0;

        /* The list is protected by the *backing* mapping's i_private_lock. */
        return fsync_buffers_list(&buffer_mapping->i_private_lock,
                                        &mapping->i_private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);

/**
 * generic_buffers_fsync_noflush - generic buffer fsync implementation
 * for simple filesystems with no inode lock
 *
 * @file:        file to synchronize
 * @start:        start offset in bytes
 * @end:        end offset in bytes (inclusive)
 * @datasync:        only synchronize essential metadata if true
 *
 * This is a generic implementation of the fsync method for simple
 * filesystems which track all non-inode metadata in the buffers list
 * hanging off the address_space structure.
 *
 * Return: the error from writing and waiting on the data range if that
 * fails; otherwise the first non-zero result among syncing the associated
 * buffers, syncing the inode metadata (when performed) and the final
 * writeback error check, or 0 if all of them succeed.
 */
int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
                                  bool datasync)
{
        struct inode *inode = file->f_mapping->host;
        int ret;
        int err;

        err = file_write_and_wait_range(file, start, end);
        if (err)
                return err;

        ret = sync_mapping_buffers(inode->i_mapping);

        /*
         * The inode itself is written only if it is dirty and, for a
         * datasync, only if the dirtiness matters for data integrity.
         */
        if ((inode->i_state & I_DIRTY_ALL) &&
            (!datasync || (inode->i_state & I_DIRTY_DATASYNC))) {
                err = sync_inode_metadata(inode, 1);
                if (!ret)
                        ret = err;
        }

        /* check and advance again to catch errors after syncing out buffers */
        err = file_check_and_advance_wb_err(file);
        if (!ret)
                ret = err;

        return ret;
}
EXPORT_SYMBOL(generic_buffers_fsync_noflush);

/**
 * generic_buffers_fsync - generic buffer fsync implementation
 * for simple filesystems with no inode lock
 *
 * @file:        file to synchronize
 * @start:        start offset in bytes
 * @end:        end offset in bytes (inclusive)
 * @datasync:        only synchronize essential metadata if true
 *
 * This is a generic implementation of the fsync method for simple
 * filesystems which track all non-inode metadata in the buffers list
 * hanging off the address_space structure. This also makes sure that
 * a device cache flush operation is called at the end.
 *
 * Return: the non-zero result of generic_buffers_fsync_noflush() if it
 * fails (no flush is issued then), otherwise the result of
 * blkdev_issue_flush() on the superblock's block device.
 */
int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
                          bool datasync)
{
        struct inode *inode = file->f_mapping->host;
        int ret = generic_buffers_fsync_noflush(file, start, end, datasync);

        if (ret)
                return ret;

        return blkdev_issue_flush(inode->i_sb->s_bdev);
}
EXPORT_SYMBOL(generic_buffers_fsync);

/*
 * Called after block `bblock' has recently been written and is known to
 * belong to a buffer_boundary() buffer.  In that case the block at
 * `bblock + 1' is probably a dirty indirect block: look it up and, if it is
 * dirty, schedule it for IO so that indirects merge nicely with their data.
 */
void write_boundary_block(struct block_device *bdev,
                        sector_t bblock, unsigned blocksize)
{
        struct buffer_head *next = __find_get_block(bdev, bblock + 1, blocksize);

        if (!next)
                return;

        if (buffer_dirty(next))
                write_dirty_buffer(next, 0);

        /* Drop the reference obtained by the lookup. */
        put_bh(next);
}

/*
 * Dirty @bh and queue it on @inode's list of associated buffers
 * (inode->i_mapping->i_private_list).
 *
 * The first call for an inode records the buffer's backing mapping in
 * i_private_data; later calls BUG if a buffer from a different backing
 * mapping is passed.  A buffer that already has b_assoc_map set is left on
 * whatever list it is on.
 */
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
{
        struct address_space *mapping = inode->i_mapping;
        struct address_space *buffer_mapping = bh->b_folio->mapping;

        mark_buffer_dirty(bh);

        if (mapping->i_private_data) {
                BUG_ON(mapping->i_private_data != buffer_mapping);
        } else {
                mapping->i_private_data = buffer_mapping;
        }

        /* Lockless check; the list itself is only changed under the lock. */
        if (bh->b_assoc_map)
                return;

        spin_lock(&buffer_mapping->i_private_lock);
        list_move_tail(&bh->b_assoc_buffers,
                        &mapping->i_private_list);
        bh->b_assoc_map = mapping;
        spin_unlock(&buffer_mapping->i_private_lock);
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);

/**
 * block_dirty_folio - Mark a folio as dirty.
 * @mapping: The address space containing this folio.
 * @folio: The folio to mark dirty.
 *
 * Filesystems which use buffer_heads can use this function as their
 * ->dirty_folio implementation.  Some filesystems need to do a little
 * work before calling this function.  Filesystems which do not use
 * buffer_heads should call filemap_dirty_folio() instead.
 *
 * If the folio has buffers, all of its buffers are set dirty, to
 * preserve dirty-state coherency between the folio and the buffers.
 * Buffers added to a dirty folio are created dirty.
 *
 * The buffers are dirtied before the folio is dirtied.  There's a small
 * race window in which writeback may see the folio cleanness but not the
 * buffer dirtiness.  That's fine.  If this code were to set the folio
 * dirty before the buffers, writeback could clear the folio dirty flag,
 * see a bunch of clean buffers and we'd end up with dirty buffers/clean
 * folio on the dirty folio list.
 *
 * We use i_private_lock to lock against try_to_free_buffers() while
 * using the folio's buffer list.  This also prevents clean buffers
 * being added to the folio after it was set dirty.
 *
 * Context: May only be called from process context.  Does not sleep.
 * Caller must ensure that @folio cannot be truncated during this call,
 * typically by holding the folio lock or having a page in the folio
 * mapped and holding the page table lock.
 *
 * Return: True if the folio was dirtied; false if it was already dirtied.
 */
bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
{
        struct buffer_head *head;
        bool newly_dirty;

        spin_lock(&mapping->i_private_lock);
        head = folio_buffers(folio);
        if (head) {
                struct buffer_head *bh = head;

                /* Dirty every buffer in the folio's ring. */
                do {
                        set_buffer_dirty(bh);
                        bh = bh->b_this_page;
                } while (bh != head);
        }
        /*
         * Lock out page's memcg migration to keep PageDirty
         * synchronized with per-memcg dirty page counters.
         */
        folio_memcg_lock(folio);
        /* Set the folio dirty flag while i_private_lock is still held. */
        newly_dirty = !folio_test_set_dirty(folio);
        spin_unlock(&mapping->i_private_lock);

        if (newly_dirty)
                __folio_mark_dirty(folio, mapping, 1);

        folio_memcg_unlock(folio);

        /* The inode is marked dirty only after the memcg lock is released. */
        if (newly_dirty)
                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

        return newly_dirty;
}
EXPORT_SYMBOL(block_dirty_folio);

/*
 * Write out and wait upon a list of buffers.
 *
 * We have conflicting pressures: we want to make sure that all
 * initially dirty buffers get waited on, but that any subsequently
 * dirtied buffers don't.  After all, we don't want fsync to last
 * forever if somebody is actively writing to the file.
 *
 * Do this in two main stages: first we copy dirty buffers to a
 * temporary inode list, queueing the writes as we go.  Then we clean
 * up, waiting for those writes to complete.
 *
 * During this second stage, any subsequent updates to the file may end
 * up refiling the buffer on the original inode's dirty list again, so
 * there is a chance we will end up with a buffer queued for write but
 * not yet completed on that list.  So, as a final cleanup we go through
 * the osync code to catch these locked, dirty buffers without requeuing
 * any newly dirty buffers for write.
 *
 * @lock: spinlock protecting @list; taken and released here
 * @list: list of buffers linked through b_assoc_buffers
 *
 * Returns -EIO if a buffer waited on in the second stage is not up to date
 * afterwards; otherwise the result of osync_buffers_list().
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
        struct buffer_head *bh;
        struct list_head tmp;
        struct address_space *mapping;
        int err = 0, err2;
        struct blk_plug plug;

        INIT_LIST_HEAD(&tmp);
        blk_start_plug(&plug);

        /* Stage 1: move dirty/locked buffers to 'tmp', queueing writes. */
        spin_lock(lock);
        while (!list_empty(list)) {
                bh = BH_ENTRY(list->next);
                /* Remember the owner: __remove_assoc_queue() clears it. */
                mapping = bh->b_assoc_map;
                __remove_assoc_queue(bh);
                /* Avoid race with mark_buffer_dirty_inode() which does
                 * a lockless check and we rely on seeing the dirty bit */
                smp_mb();
                if (buffer_dirty(bh) || buffer_locked(bh)) {
                        list_add(&bh->b_assoc_buffers, &tmp);
                        bh->b_assoc_map = mapping;
                        if (buffer_dirty(bh)) {
                                /* Pin bh while the list lock is dropped. */
                                get_bh(bh);
                                spin_unlock(lock);
                                /*
                                 * Ensure any pending I/O completes so that
                                 * write_dirty_buffer() actually writes the
                                 * current contents - it is a noop if I/O is
                                 * still in flight on potentially older
                                 * contents.
                                 */
                                write_dirty_buffer(bh, REQ_SYNC);

                                /*
                                 * Kick off IO for the previous mapping. Note
                                 * that we will not run the very last mapping,
                                 * wait_on_buffer() will do that for us
                                 * through sync_buffer().
                                 *
                                 * NOTE(review): no per-mapping kick-off is
                                 * visible in this function; the paragraph
                                 * above may predate the current code -
                                 * confirm before relying on it.
                                 */
                                brelse(bh);
                                spin_lock(lock);
                        }
                }
        }

        spin_unlock(lock);
        blk_finish_plug(&plug);
        spin_lock(lock);

        /* Stage 2: wait for each buffer on 'tmp', oldest entry first. */
        while (!list_empty(&tmp)) {
                bh = BH_ENTRY(tmp.prev);
                get_bh(bh);
                mapping = bh->b_assoc_map;
                __remove_assoc_queue(bh);
                /* Avoid race with mark_buffer_dirty_inode() which does
                 * a lockless check and we rely on seeing the dirty bit */
                smp_mb();
                if (buffer_dirty(bh)) {
                        /* Redirtied meanwhile: put it back on its owner's list. */
                        list_add(&bh->b_assoc_buffers,
                                 &mapping->i_private_list);
                        bh->b_assoc_map = mapping;
                }
                spin_unlock(lock);
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh))
                        err = -EIO;
                brelse(bh);
                spin_lock(lock);
        }

        spin_unlock(lock);
        /* Final pass: wait for refiled buffers that are still locked. */
        err2 = osync_buffers_list(lock, list);
        if (err)
                return err;
        else
                return err2;
}

/*
 * Drop every buffer from @inode's associated-buffers list, dirty or not.
 * We are probably unmounting the fs, but that doesn't mean a sync() has
 * already been done; the buffers are simply taken off the inode list.
 *
 * NOTE: the lock taken is the i_private_lock of the mapping recorded in
 * i_private_data (the inode's blockdev's mapping), which assumes that all
 * the buffers are against the blockdev.  Not true for reiserfs.
 */
void invalidate_inode_buffers(struct inode *inode)
{
        struct address_space *mapping, *buffer_mapping;
        struct list_head *assoc;

        if (!inode_has_buffers(inode))
                return;

        mapping = &inode->i_data;
        assoc = &mapping->i_private_list;
        buffer_mapping = mapping->i_private_data;

        spin_lock(&buffer_mapping->i_private_lock);
        while (!list_empty(assoc))
                __remove_assoc_queue(BH_ENTRY(assoc->next));
        spin_unlock(&buffer_mapping->i_private_lock);
}
EXPORT_SYMBOL(invalidate_inode_buffers);

/*
 * Strip clean buffers from the head of @inode's associated-buffers list.
 * This is called when we're trying to free the inode itself; those buffers
 * can pin it.  Removal stops at the first dirty buffer found.
 *
 * Returns true (1) if all buffers were removed, 0 if a dirty one remains.
 */
int remove_inode_buffers(struct inode *inode)
{
        struct address_space *mapping, *buffer_mapping;
        struct list_head *assoc;
        int all_removed = 1;

        if (!inode_has_buffers(inode))
                return 1;

        mapping = &inode->i_data;
        assoc = &mapping->i_private_list;
        buffer_mapping = mapping->i_private_data;

        spin_lock(&buffer_mapping->i_private_lock);
        for (;;) {
                struct buffer_head *bh;

                if (list_empty(assoc))
                        break;

                bh = BH_ENTRY(assoc->next);
                if (buffer_dirty(bh)) {
                        all_removed = 0;
                        break;
                }
                __remove_assoc_queue(bh);
        }
        spin_unlock(&buffer_mapping->i_private_lock);

        return all_removed;
}

/*
 * Allocate the buffer_heads needed to cover @folio with buffers of @size
 * bytes each.  The new buffers are chained through b_this_page, ending in
 * NULL, and the head of that chain (the buffer at the lowest offset) is
 * returned.  Each buffer gets b_blocknr = -1, b_size = @size and is linked
 * to the folio at its offset.
 *
 * @gfp is passed to every alloc_buffer_head() call.  If one of them fails,
 * all buffers allocated so far are freed and NULL is returned.
 *
 * Allocation runs with the folio's memcg set as the active memcg; the
 * previous active memcg is restored before returning.
 */
struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
                                        gfp_t gfp)
{
        struct buffer_head *bh, *head = NULL;
        struct mem_cgroup *memcg, *old_memcg;
        long offset;

        /* The folio lock pins the memcg */
        memcg = folio_memcg(folio);
        old_memcg = set_active_memcg(memcg);

        /* Build the chain back to front so it ends up in offset order. */
        for (offset = folio_size(folio); (offset -= size) >= 0; ) {
                bh = alloc_buffer_head(gfp);
                if (!bh) {
                        /* Something failed: free everything we got. */
                        while (head) {
                                bh = head;
                                head = head->b_this_page;
                                free_buffer_head(bh);
                        }
                        break;
                }

                bh->b_this_page = head;
                bh->b_blocknr = -1;
                bh->b_size = size;
                head = bh;

                /* Link the buffer to its folio */
                folio_set_bh(bh, folio, offset);
        }

        set_active_memcg(old_memcg);
        return head;
}
EXPORT_SYMBOL_GPL(folio_alloc_buffers);

/*
 * Page-based wrapper around folio_alloc_buffers(): allocates buffers of
 * @size bytes for the folio containing @page using
 * GFP_NOFS | __GFP_ACCOUNT, with __GFP_NOFAIL added when @retry is true.
 */
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
                                       bool retry)
{
        gfp_t gfp = retry ? (GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL)
                          : (GFP_NOFS | __GFP_ACCOUNT);

        return folio_alloc_buffers(page_folio(page), size, gfp);
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);

/*
 * Turn the NULL-terminated b_this_page chain starting at @head into a ring
 * by pointing its last buffer back at @head, then attach @head to @folio as
 * the folio's private data.  @head must not be NULL.
 */
static inline void link_dev_buffers(struct folio *folio,
                struct buffer_head *head)
{
        struct buffer_head *tail = head;

        /* Find the last buffer of the chain. */
        while (tail->b_this_page)
                tail = tail->b_this_page;

        tail->b_this_page = head;
        folio_attach_private(folio, head);
}

/*
 * Return the number of whole blocks of @size bytes that fit in @bdev
 * (bdev_nr_bytes() shifted right by blksize_bits(@size)).  If the device
 * reports a size of zero, the all-ones sector_t value is returned instead.
 */
static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
{
        loff_t nr_bytes = bdev_nr_bytes(bdev);
        unsigned int sizebits;

        if (!nr_bytes)
                return ~((sector_t)0);

        sizebits = blksize_bits(size);
        return nr_bytes >> sizebits;
}

/*
 * Initialise the state of a blockdev folio's buffers.
 *
 * Every buffer of @folio that is not yet mapped is bound to @bdev and given
 * its block number (derived from the folio position and @size).  It is
 * marked up to date when the folio is, and marked mapped when its block
 * number lies below the device's block count.  Already-mapped buffers are
 * left untouched.
 *
 * Returns the device's block count as computed by blkdev_max_block(); the
 * caller needs to validate the requested block against it.
 */
static sector_t folio_init_buffers(struct folio *folio,
                struct block_device *bdev, unsigned size)
{
        struct buffer_head *head = folio_buffers(folio);
        struct buffer_head *cur = head;
        bool uptodate = folio_test_uptodate(folio);
        sector_t blocknr = div_u64(folio_pos(folio), size);
        sector_t end_block = blkdev_max_block(bdev, size);

        do {
                if (!buffer_mapped(cur)) {
                        cur->b_end_io = NULL;
                        cur->b_private = NULL;
                        cur->b_bdev = bdev;
                        cur->b_blocknr = blocknr;

                        if (uptodate)
                                set_buffer_uptodate(cur);
                        /* Blocks at or past the end of the device stay unmapped. */
                        if (blocknr < end_block)
                                set_buffer_mapped(cur);
                }

                blocknr++;
                cur = cur->b_this_page;
        } while (cur != head);

        return end_block;
}

/*
 * Create the page-cache folio that contains the requested block.
 *
 * This is used purely for blockdev mappings.
 *
 * @bdev:  block device whose page cache (bdev->bd_mapping) is populated
 * @block: block number the caller is looking for; only used to compute
 *         the return value
 * @index: page-cache index of the folio to find or create
 * @size:  buffer size in bytes
 * @gfp:   allocation flags for the folio and its buffer heads
 *
 * Returns false if we have a failure which cannot be cured by retrying
 * without sleeping.  Returns true if we succeeded, or the caller should retry.
 */
static bool grow_dev_folio(struct block_device *bdev, sector_t block,
                pgoff_t index, unsigned size, gfp_t gfp)
{
        struct address_space *mapping = bdev->bd_mapping;
        struct folio *folio;
        struct buffer_head *bh;
        /* Stays 0 if buffer allocation fails, making the result false. */
        sector_t end_block = 0;

        /* Find or create the folio; FGP_LOCK asks for it to be locked. */
        folio = __filemap_get_folio(mapping, index,
                        FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
        if (IS_ERR(folio))
                return false;

        bh = folio_buffers(folio);
        if (bh) {
                /* Buffers of the requested size exist: only (re)initialise. */
                if (bh->b_size == size) {
                        end_block = folio_init_buffers(folio, bdev, size);
                        goto unlock;
                }

                /*
                 * Retrying may succeed; for example the folio may finish
                 * writeback, or buffers may be cleaned.  This should not
                 * happen very often; maybe we have old buffers attached to
                 * this blockdev's page cache and we're trying to change
                 * the block size?
                 */
                if (!try_to_free_buffers(folio)) {
                        /*
                         * All-ones end_block makes "block < end_block" true
                         * (for any block below that value), i.e. "retry".
                         */
                        end_block = ~0ULL;
                        goto unlock;
                }
        }

        /* No (usable) buffers attached: allocate a fresh chain. */
        bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
        if (!bh)
                goto unlock;

        /*
         * Link the folio to the buffers and initialise them.  Take the
         * lock to be atomic wrt __find_get_block(), which does not
         * run under the folio lock.
         */
        spin_lock(&mapping->i_private_lock);
        link_dev_buffers(folio, bh);
        end_block = folio_init_buffers(folio, bdev, size);
        spin_unlock(&mapping->i_private_lock);
unlock:
        /* Drop the folio lock and the reference obtained by the lookup. */
        folio_unlock(folio);
        folio_put(folio);
        return block < end_block;
}

/*
 * Create buffers for the folio that holds block @block of @bdev.
 *
 * The byte position of the block is validated first: if block * size
 * overflows or exceeds MAX_LFS_FILESIZE an error is logged and false is
 * returned.  Otherwise the work is handed to grow_dev_folio() and its
 * result is returned.  Returns false if we've hit a permanent error.
 */
static bool grow_buffers(struct block_device *bdev, sector_t block,
                unsigned size, gfp_t gfp)
{
        loff_t pos;

        /*
         * Proceed only when the block does not lie outside our maximum
         * possible pagecache index.
         */
        if (!check_mul_overflow(block, (sector_t)size, &pos) &&
            pos <= MAX_LFS_FILESIZE) {
                /* Create a folio with the proper size buffers */
                return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp);
        }

        printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n",
                __func__, (unsigned long long)block,
                bdev);
        return false;
}

/*
 * Slow path for obtaining a buffer head: validate @size, then alternate
 * between looking the block up with __find_get_block() and creating the
 * missing buffers with grow_buffers() until the lookup succeeds.
 *
 * @size must be a multiple of the device's logical block size and lie in
 * the range [512, PAGE_SIZE]; otherwise the request is logged, a stack
 * trace is dumped and NULL is returned.
 *
 * Returns the buffer head found by __find_get_block(), or NULL if @size
 * is invalid or grow_buffers() reports failure.
 */
static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block,
             unsigned size, gfp_t gfp)
{
        /* Size must be multiple of hard sectorsize */
        if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
                        (size < 512 || size > PAGE_SIZE))) {
                /* Both values are unsigned, so print them with %u. */
                printk(KERN_ERR "getblk(): invalid block size %u requested\n",
                                        size);
                printk(KERN_ERR "logical block size: %u\n",
                                        bdev_logical_block_size(bdev));

                dump_stack();
                return NULL;
        }

        for (;;) {
                struct buffer_head *bh;

                bh = __find_get_block(bdev, block, size);
                if (bh)
                        return bh;

                /* Not cached yet: try to create the buffers, then look again. */
                if (!grow_buffers(bdev, block, size, gfp))
                        return NULL;
        }
}

/*
 * The relationship between dirty buffers and dirty pages:
 *
 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
 * the page is tagged dirty in the page cache.
 *
 * At all times, the dirtiness of the buffers represents the dirtiness of
 * subsections of the page.  If the page has buffers, the page dirty bit is
 * merely a hint about the true dirty state.
 *
 * When a page is set dirty in its entirety, all its buffers are marked dirty
 * (if the page has buffers).
 *
 * When a buffer is marked dirty, its page is dirtied, but the page's other
 * buffers are not.
 *
 * Also.  When blockdev buffers are explicitly read with bread(), they
 * individually become uptodate.  But their backing page remains not
 * uptodate - even if all of its buffers are uptodate.  A subsequent
 * block_read_full_folio() against that folio will discover all the uptodate
 * buffers, will set the folio uptodate and will perform no I/O.
 */

/**
 * mark_buffer_dirty - mark a buffer_head as needing writeout
 * @bh: the buffer_head to mark dirty
 *
 * mark_buffer_dirty() will set the dirty bit against the buffer, then set
 * its backing page dirty, then tag the page as dirty in the page cache
 * and then attach the address_space's inode to its superblock's dirty
 * inode list.
 *
 * mark_buffer_dirty() is atomic.  It takes bh->b_folio->mapping->i_private_lock,
 * i_pages lock and mapping->host->i_lock.
 */
void mark_buffer_dirty(struct buffer_head *bh)
{
        /* Dirtying a buffer that is not uptodate is flagged once as a warning. */
        WARN_ON_ONCE(!buffer_uptodate(bh));

        trace_block_dirty_buffer(bh);

        /*
         * Very *carefully* optimize the it-is-already-dirty case.
         *
         * Don't let the final "is it dirty" escape to before we
         * perhaps modified the buffer.
         */
        if (buffer_dirty(bh)) {
                smp_mb();
                if (buffer_dirty(bh))
                        return;
        }

        /*
         * Only the caller that actually flips the buffer from clean to
         * dirty goes on to propagate the state to the folio and inode.
         */
        if (!test_set_buffer_dirty(bh)) {
                struct folio *folio = bh->b_folio;
                struct address_space *mapping = NULL;

                folio_memcg_lock(folio);
                if (!folio_test_set_dirty(folio)) {
                        /* The folio was clean: dirty it in its mapping, if any. */
                        mapping = folio->mapping;
                        if (mapping)
                                __folio_mark_dirty(folio, mapping, 0);
                }
                folio_memcg_unlock(folio);
                /* mapping is non-NULL only if this call newly dirtied the folio. */
                if (mapping)
                        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
        }
}
EXPORT_SYMBOL(mark_buffer_dirty);

void mark_buffer_write_io_error(struct buffer_head *bh)
{
        set_buffer_write_io_error(bh);
        /* FIXME: do we need to set this in both places? */
        if (bh->b_folio && bh->b_folio->mapping)
                mapping_set_error(bh->b_folio->mapping, -EIO);
        if (bh->b_assoc_map) {
                mapping_set_error(bh->b_assoc_map, -EIO);
                errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO);
        }
}
EXPORT_SYMBOL(mark_buffer_write_io_error);

/**
 * __brelse - Release a buffer.
 * @bh: The buffer to release.
 *
 * This variant of brelse() can be called if @bh is guaranteed to not be NULL.
 * Drops one reference with put_bh(); if the reference count is already
 * zero a warning is emitted instead.
 */
void __brelse(struct buffer_head *bh)
{
        if (!atomic_read(&bh->b_count)) {
                WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
                return;
        }

        put_bh(bh);
}
EXPORT_SYMBOL(__brelse);

/**
 * __bforget - Discard any dirty data in a buffer.
 * @bh: The buffer to forget.
 *
 * This variant of bforget() can be called if @bh is guaranteed to not
 * be NULL.  The dirty bit is cleared, the buffer is unlinked from any
 * associated-buffers list it is on, and one reference is dropped.
 */
void __bforget(struct buffer_head *bh)
{
        struct address_space *buffer_mapping;

        clear_buffer_dirty(bh);
        if (!bh->b_assoc_map)
                goto release;

        /* The association list is manipulated under the folio mapping's lock. */
        buffer_mapping = bh->b_folio->mapping;
        spin_lock(&buffer_mapping->i_private_lock);
        list_del_init(&bh->b_assoc_buffers);
        bh->b_assoc_map = NULL;
        spin_unlock(&buffer_mapping->i_private_lock);

release:
        __brelse(bh);
}
EXPORT_SYMBOL(__bforget);

/*
 * Bring @bh uptodate by reading it synchronously.
 *
 * If the buffer turns out to be uptodate once locked, it is unlocked and
 * returned as is.  Otherwise a read is submitted with
 * end_buffer_read_sync as completion handler and waited for.
 *
 * Returns @bh if it is uptodate afterwards; on failure the buffer is
 * released with brelse() and NULL is returned.
 */
static struct buffer_head *__bread_slow(struct buffer_head *bh)
{
        lock_buffer(bh);
        if (buffer_uptodate(bh)) {
                unlock_buffer(bh);
                return bh;
        }

        /* Extra reference for the duration of the I/O. */
        get_bh(bh);
        bh->b_end_io = end_buffer_read_sync;
        submit_bh(REQ_OP_READ, bh);
        wait_on_buffer(bh);
        if (!buffer_uptodate(bh)) {
                brelse(bh);
                return NULL;
        }

        return bh;
}

/*
 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
 * refcount elevated by one when they're in an LRU.  A buffer can only appear
 * once in a particular CPU's LRU.  A single buffer can be present in multiple
 * CPUs' LRUs at the same time.
 *
 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
 * sb_find_get_block().
 *
 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
 * a local interrupt disable for that.
 */

/* Number of buffer_head slots in each CPU's LRU. */
#define BH_LRU_SIZE        16

/* One CPU's LRU: a fixed array of buffer_head pointers. */
struct bh_lru {
        struct buffer_head *bhs[BH_LRU_SIZE];
};

/* Per-CPU LRU instances; every slot starts out NULL. */
static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

/*
 * bh_lru_lock()/bh_lru_unlock() guard accesses to the current CPU's LRU:
 * by disabling local interrupts on SMP builds, and by disabling
 * preemption otherwise.
 */
#ifdef CONFIG_SMP
#define bh_lru_lock()        local_irq_disable()
#define bh_lru_unlock()        local_irq_enable()
#else
#define bh_lru_lock()        preempt_disable()
#define bh_lru_unlock()        preempt_enable()
#endif

/*
 * Sanity check used before taking the LRU lock: BUG if interrupts are
 * currently disabled.  The check is compiled in only when irqs_disabled
 * is available as a macro.
 */
static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
        BUG_ON(irqs_disabled());
#endif
}

/*
 * Install a buffer_head into this cpu's LRU.  If not already in the LRU, it is
 * inserted at the front, and the buffer_head at the back if any is evicted.
 * Or, if already in the LRU it is moved to the front.
 */
static void bh_lru_install(struct buffer_head *bh)
{
        /*
         * "evictee" is the entry currently being carried down the array;
         * it starts out as the buffer being installed.
         */
        struct buffer_head *evictee = bh;
        struct bh_lru *b;
        int i;

        check_irqs_on();
        bh_lru_lock();

        /*
         * the refcount of buffer_head in bh_lru prevents dropping the
         * attached page(i.e., try_to_free_buffers) so it could cause
         * failing page migration.
         * Skip putting upcoming bh into bh_lru until migration is done.
         */
        if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) {
                bh_lru_unlock();
                return;
        }

        b = this_cpu_ptr(&bh_lrus);
        for (i = 0; i < BH_LRU_SIZE; i++) {
                /*
                 * Put the carried entry into slot i and pick up the slot's
                 * previous occupant, shifting entries down by one.
                 */
                swap(evictee, b->bhs[i]);
                if (evictee == bh) {
                        /*
                         * bh was already in the LRU at slot i; it now sits
                         * at the front and nothing needs to be evicted.
                         */
                        bh_lru_unlock();
                        return;
                }
        }

        /*
         * bh is newly installed: take the LRU's reference on it, then drop
         * the reference of whatever fell off the end (possibly NULL if the
         * LRU was not full; brelse() is relied on to accept that).
         */
        get_bh(bh);
        bh_lru_unlock();
        brelse(evictee);
}

/*
 * Search this cpu's LRU for a buffer matching @bdev, @block and @size.
 * On a hit the buffer is moved to slot 0 (unless it is already there),
 * a reference is taken and the buffer is returned.  Returns NULL on a
 * miss, or without searching at all when the current CPU is isolated.
 */
static struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *found = NULL;
        unsigned int slot;

        check_irqs_on();
        bh_lru_lock();
        if (cpu_is_isolated(smp_processor_id()))
                goto unlock;

        for (slot = 0; slot < BH_LRU_SIZE; slot++) {
                struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[slot]);
                unsigned int pos;

                if (!bh || bh->b_blocknr != block || bh->b_bdev != bdev ||
                    bh->b_size != size)
                        continue;

                /* Hit: shift the entries in front of it down, put bh first. */
                if (slot) {
                        for (pos = slot; pos; pos--)
                                __this_cpu_write(bh_lrus.bhs[pos],
                                        __this_cpu_read(bh_lrus.bhs[pos - 1]));
                        __this_cpu_write(bh_lrus.bhs[0], bh);
                }
                get_bh(bh);
                found = bh;
                break;
        }
unlock:
        bh_lru_unlock();
        return found;
}

/*
 * Find the buffer for (@bdev, @block, @size).  The per-cpu LRU is tried
 * first; on a hit the buffer is touched and returned.  Otherwise a
 * pagecache lookup is performed and, if that finds the buffer, it is
 * installed in the LRU.  Returns NULL if the buffer is not present.
 */
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *bh = lookup_bh_lru(bdev, block, size);

        if (bh) {
                touch_buffer(bh);
                return bh;
        }

        /* __find_get_block_slow will mark the page accessed */
        bh = __find_get_block_slow(bdev, block);
        if (bh)
                bh_lru_install(bh);

        return bh;
}
EXPORT_SYMBOL(__find_get_block);

/**
 * bdev_getblk - Get a buffer_head in a block device's buffer cache.
 * @bdev: The block device.
 * @block: The block number.
 * @size: The size of buffer_heads for this @bdev.
 * @gfp: The memory allocation flags to use.
 *
 * Looks the buffer up with __find_get_block() and falls back to
 * __getblk_slow() when it is not cached yet.
 *
 * The returned buffer head has its reference count incremented, but is
 * not locked.  The caller should call brelse() when it has finished
 * with the buffer.  The buffer may not be uptodate.  If needed, the
 * caller can bring it uptodate either by reading it or overwriting it.
 *
 * Return: The buffer head, or NULL if memory could not be allocated.
 */
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
                unsigned size, gfp_t gfp)
{
        struct buffer_head *bh = __find_get_block(bdev, block, size);

        /* Annotate the potential allocation even when the lookup hit. */
        might_alloc(gfp);
        if (!bh)
                bh = __getblk_slow(bdev, block, size, gfp);

        return bh;
}
EXPORT_SYMBOL(bdev_getblk);

/*
 * Start asynchronous read-ahead on a block.  The buffer is obtained with
 * GFP_NOWAIT | __GFP_MOVABLE; if that fails the read-ahead is silently
 * skipped.  The reference obtained here is dropped again before returning.
 */
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *bh;

        bh = bdev_getblk(bdev, block, size, GFP_NOWAIT | __GFP_MOVABLE);
        if (unlikely(!bh))
                return;

        bh_readahead(bh, REQ_RAHEAD);
        brelse(bh);
}
EXPORT_SYMBOL(__breadahead);

/**
 * __bread_gfp() - Read a block.
 * @bdev: The block device to read from.
 * @block: Block number in units of block size.
 * @size: The block size of this device in bytes.
 * @gfp: Not page allocation flags; see below.
 *
 * You are not expected to call this function.  You should use one of
 * sb_bread(), sb_bread_unmovable() or __bread().
 *
 * Read a specified block, and return the buffer head that refers to it.
 * If @gfp is 0, the memory will be allocated using the block device's
 * default GFP flags.  If @gfp is __GFP_MOVABLE, the memory may be
 * allocated from a movable area.  Do not pass in a complete set of
 * GFP flags.
 *
 * The returned buffer head has its refcount increased.  The caller should
 * call brelse() when it has finished with the buffer.
 *
 * Context: May sleep waiting for I/O.
 * Return: NULL if the block was unreadable.
 */
struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block,
                unsigned size, gfp_t gfp)
{
        struct buffer_head *bh;

        /*
         * Combine the caller's flags with the mapping's constraint (minus
         * __GFP_FS).  __GFP_NOFAIL is added because we prefer looping in
         * the allocator rather than here, at least that code knows what
         * it's doing.
         */
        gfp |= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS) |
               __GFP_NOFAIL;

        bh = bdev_getblk(bdev, block, size, gfp);
        if (unlikely(!bh) || buffer_uptodate(bh))
                return bh;

        return __bread_slow(bh);
}
EXPORT_SYMBOL(__bread_gfp);

/*
 * Empty the LRU @b: release every slot's buffer with brelse() and reset
 * the slot to NULL.
 */
static void __invalidate_bh_lrus(struct bh_lru *b)
{
        struct buffer_head **slot = b->bhs;
        struct buffer_head **end = slot + BH_LRU_SIZE;

        for (; slot < end; slot++) {
                brelse(*slot);
                *slot = NULL;
        }
}
/*
 * invalidate_bh_lrus() is called rarely - but not only at unmount.
 * This doesn't race because it runs in each cpu either in irq
 * or with preempt disabled.
 */
static void invalidate_bh_lru(void *arg)
{
        struct bh_lru *b = &get_cpu_var(bh_lrus);

        __invalidate_bh_lrus(b);
        put_cpu_var(bh_lrus);
}

/*
 * Report whether the buffer LRU of CPU @cpu holds at least one buffer.
 * @dummy is unused.
 */
bool has_bh_in_lru(int cpu, void *dummy)
{
        struct bh_lru *lru = per_cpu_ptr(&bh_lrus, cpu);
        int slot = 0;

        while (slot < BH_LRU_SIZE) {
                if (lru->bhs[slot])
                        return true;
                slot++;
        }

        return false;
}

/*
 * Empty the buffer LRUs across CPUs: invalidate_bh_lru() is requested on
 * each CPU selected by the has_bh_in_lru() predicate.  The final argument
 * (1) is presumably the "wait for completion" flag of on_each_cpu_cond() -
 * confirm against that helper's definition.
 */
void invalidate_bh_lrus(void)
{
        on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);

/*
 * It's called from workqueue context so we need a bh_lru_lock to close
 * the race with preemption/irq.
 */
void invalidate_bh_lrus_cpu(void)
{
        struct bh_lru *b;

        bh_lru_lock();
        b = this_cpu_ptr(&bh_lrus);
        __invalidate_bh_lrus(b);
        bh_lru_unlock();
}

/*
 * Associate @bh with @folio at byte @offset within the folio.
 *
 * b_folio is set first; @offset must lie inside the folio (BUG otherwise).
 * For a non-highmem folio b_data is the folio's address plus @offset.  For
 * a highmem folio b_data only carries the offset itself.
 */
void folio_set_bh(struct buffer_head *bh, struct folio *folio,
                  unsigned long offset)
{
        bh->b_folio = folio;
        BUG_ON(offset >= folio_size(folio));

        if (!folio_test_highmem(folio)) {
                bh->b_data = folio_address(folio) + offset;
                return;
        }

        /*
         * This catches illegal uses and preserves the offset:
         */
        bh->b_data = (char *)(0 + offset);
}
EXPORT_SYMBOL(folio_set_bh);

/*
 * Called when truncating a buffer on a page completely.
 */

/* Bits that are cleared during an invalidate */
#define BUFFER_FLAGS_DISCARD \
        (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
         1 << BH_Delay | 1 << BH_Unwritten)

/*
 * Invalidate a single buffer: under the buffer lock, clear its dirty bit,
 * drop its block device pointer and atomically clear every flag listed in
 * BUFFER_FLAGS_DISCARD.
 */
static void discard_buffer(struct buffer_head * bh)
{
        unsigned long b_state;

        lock_buffer(bh);
        clear_buffer_dirty(bh);
        bh->b_bdev = NULL;
        b_state = READ_ONCE(bh->b_state);
        /*
         * Retry the compare-and-exchange until the flags are cleared
         * without losing concurrent updates to other bits.  The loop body
         * is empty because try_cmpxchg() is expected to refresh b_state
         * with the current value on failure.
         */
        do {
        } while (!try_cmpxchg(&bh->b_state, &b_state,
                              b_state & ~BUFFER_FLAGS_DISCARD));
        unlock_buffer(bh);
}

/**
 * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
 * @folio: The folio which is affected.
 * @offset: start of the range to invalidate
 * @length: length of the range to invalidate
 *
 * block_invalidate_folio() is called when all or part of the folio has been
 * invalidated by a truncate operation.
 *
 * block_invalidate_folio() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point.  Because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 *
 * The folio must be locked, and @offset + @length must neither overflow
 * nor exceed the folio size.
 */
void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
        struct buffer_head *first, *cur;
        size_t pos = 0;
        size_t stop = length + offset;

        BUG_ON(!folio_test_locked(folio));

        /*
         * Check for overflow
         */
        BUG_ON(stop > folio_size(folio) || stop < length);

        first = folio_buffers(folio);
        if (!first)
                return;

        cur = first;
        do {
                size_t end = pos + cur->b_size;
                struct buffer_head *following = cur->b_this_page;

                /*
                 * Stop as soon as a buffer reaches past the end of the
                 * range; no buffers are released in that case.
                 */
                if (end > stop)
                        return;

                /*
                 * Discard buffers that start at or after @offset, i.e.
                 * those that are fully invalidated.
                 */
                if (offset <= pos)
                        discard_buffer(cur);

                pos = end;
                cur = following;
        } while (cur != first);

        /*
         * We release buffers only if the entire folio is being invalidated.
         * The get_block cached value has been unconditionally invalidated,
         * so real IO is not possible anymore.
         */
        if (length == folio_size(folio))
                filemap_release_folio(folio, 0);
}
EXPORT_SYMBOL(block_invalidate_folio);

/*
 * Allocate buffers of @blocksize bytes for @folio, OR @b_state into each
 * buffer's state, link them into a ring and attach the ring to the folio.
 * Buffers additionally inherit the folio's dirty and uptodate state.
 *
 * We attach and possibly dirty the buffers atomically wrt
 * block_dirty_folio() via i_private_lock.  try_to_free_buffers
 * is already excluded via the folio lock.
 *
 * Returns the head of the new buffer ring.
 */
struct buffer_head *create_empty_buffers(struct folio *folio,
                unsigned long blocksize, unsigned long b_state)
{
        struct buffer_head *first, *cur, *last;
        gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;

        first = folio_alloc_buffers(folio, blocksize, gfp);

        /* Apply the initial state along the chain and find its last entry. */
        last = first;
        last->b_state |= b_state;
        while (last->b_this_page) {
                last = last->b_this_page;
                last->b_state |= b_state;
        }
        /* Close the chain into a ring. */
        last->b_this_page = first;

        spin_lock(&folio->mapping->i_private_lock);
        if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
                cur = first;
                do {
                        if (folio_test_dirty(folio))
                                set_buffer_dirty(cur);
                        if (folio_test_uptodate(folio))
                                set_buffer_uptodate(cur);
                        cur = cur->b_this_page;
                } while (cur != first);
        }
        folio_attach_private(folio, first);
        spin_unlock(&folio->mapping->i_private_lock);

        return first;
}
EXPORT_SYMBOL(create_empty_buffers);

/**
 * clean_bdev_aliases: clean a range of buffers in block device
 * @bdev: Block device to clean buffers in
 * @block: Start of a range of blocks to clean
 * @len: Number of blocks to clean
 *
 * We are taking a range of blocks for data and we don't want writeback of any
 * buffer-cache aliases starting from return from this function and until the
 * moment when something will explicitly mark the buffer dirty (hopefully that
 * will not happen until we will free that block ;-) We don't even need to mark
 * it not-uptodate - nobody can expect anything from a newly allocated buffer
 * anyway. We used to use unmap_buffer() for such invalidation, but that was
 * wrong. We definitely don't want to mark the alias unmapped, for example - it
 * would confuse anyone who might pick it with bread() afterwards...
 *
 * Also..  Note that bforget() doesn't lock the buffer.  So there can be
 * writeout I/O going on against recently-freed buffers.  We don't wait on that
 * I/O in bforget() - it's more efficient to wait on the I/O only if we really
 * need to.  That happens here.
 */
void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
{
        struct address_space *bd_mapping = bdev->bd_mapping;
        const int blkbits = bd_mapping->host->i_blkbits;
        struct folio_batch fbatch;
        /* Page-cache index of the folio holding the first block. */
        pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE;
        pgoff_t end;
        int i, count;
        struct buffer_head *bh;
        struct buffer_head *head;

        /* Page-cache index of the folio holding the last block. */
        end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE;
        folio_batch_init(&fbatch);
        /* Process the cached folios in [index, end] one batch at a time. */
        while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
                count = folio_batch_count(&fbatch);
                for (i = 0; i < count; i++) {
                        struct folio *folio = fbatch.folios[i];

                        /* Cheap unlocked check: nothing to do without buffers. */
                        if (!folio_buffers(folio))
                                continue;
                        /*
                         * We use folio lock instead of bd_mapping->i_private_lock
                         * to pin buffers here since we can afford to sleep and
                         * it scales better than a global spinlock lock.
                         */
                        folio_lock(folio);
                        /* Recheck when the folio is locked which pins bhs */
                        head = folio_buffers(folio);
                        if (!head)
                                goto unlock_page;
                        bh = head;
                        do {
                                /* Skip unmapped buffers and those before the range. */
                                if (!buffer_mapped(bh) || (bh->b_blocknr < block))
                                        goto next;
                                /* Past the end of the range: done with this folio. */
                                if (bh->b_blocknr >= block + len)
                                        break;
                                /*
                                 * In range: drop the dirty bit, wait for any
                                 * I/O on the buffer, then clear its req bit.
                                 */
                                clear_buffer_dirty(bh);
                                wait_on_buffer(bh);
                                clear_buffer_req(bh);
next:
                                bh = bh->b_this_page;
                        } while (bh != head);
unlock_page:
                        folio_unlock(folio);
                }
                folio_batch_release(&fbatch);
                cond_resched();
                /* End of range already reached? */
                if (index > end || !index)
                        break;
        }
}
EXPORT_SYMBOL(clean_bdev_aliases);

/*
 * Return the buffers attached to the locked @folio, creating them first
 * if there are none.  New buffers get the block size of @inode
 * (1 << i_blkbits) and the initial state @b_state.
 */
static struct buffer_head *folio_create_buffers(struct folio *folio,
                                                struct inode *inode,
                                                unsigned int b_state)
{
        struct buffer_head *head;

        BUG_ON(!folio_test_locked(folio));

        head = folio_buffers(folio);
        if (head)
                return head;

        return create_empty_buffers(folio,
                        1 << READ_ONCE(inode->i_blkbits), b_state);
}

/*
 * NOTE! All mapped/uptodate combinations are valid:
 *
 *        Mapped        Uptodate        Meaning
 *
 *        No        No                "unknown" - must do get_block()
 *        No        Yes                "hole" - zero-filled
 *        Yes        No                "allocated" - allocated on disk, not read in
 *        Yes        Yes                "valid" - allocated and up-to-date in memory.
 *
 * "Dirty" is valid only with the last case (mapped+uptodate).
 */

/*
 * While block_write_full_folio is writing back the dirty buffers under
 * the page lock, whoever dirtied the buffers may decide to clean them
 * again at any time.  We handle that by only looking at the buffer
 * state inside lock_buffer().
 *
 * If block_write_full_folio() is called for regular writeback
 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 * locked buffer.   This only can happen if someone has written the buffer
 * directly, with submit_bh().  At the address_space level PageWriteback
 * prevents this contention from occurring.
 *
 * If block_write_full_folio() is called with wbc->sync_mode ==
 * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
 * causes the writes to be flagged as synchronous writes.
 *
 * The function works in three passes over the folio's buffer ring:
 * map the dirty buffers with @get_block, lock the mapped buffers and mark
 * the dirty ones for async write, then submit the marked buffers.
 *
 * Returns 0 on success, or the error returned by @get_block; in the error
 * case the buffers that are mapped and dirty are still submitted (see the
 * "recover" path).
 */
int __block_write_full_folio(struct inode *inode, struct folio *folio,
                        get_block_t *get_block, struct writeback_control *wbc)
{
        int err;
        sector_t block;
        sector_t last_block;
        struct buffer_head *bh, *head;
        size_t blocksize;
        /* Number of buffers submitted for write by this call. */
        int nr_underway = 0;
        blk_opf_t write_flags = wbc_to_write_flags(wbc);

        /* Buffers created here start out dirty and uptodate. */
        head = folio_create_buffers(folio, inode,
                                    (1 << BH_Dirty) | (1 << BH_Uptodate));

        /*
         * Be very careful.  We have no exclusion from block_dirty_folio
         * here, and the (potentially unmapped) buffers may become dirty at
         * any time.  If a buffer becomes dirty here after we've inspected it
         * then we just miss that fact, and the folio stays dirty.
         *
         * Buffers outside i_size may be dirtied by block_dirty_folio;
         * handle that here by just cleaning them.
         */

        bh = head;
        blocksize = bh->b_size;

        /* First block of this folio and last block inside i_size. */
        block = div_u64(folio_pos(folio), blocksize);
        last_block = div_u64(i_size_read(inode) - 1, blocksize);

        /*
         * Get all the dirty buffers mapped to disk addresses and
         * handle any aliases from the underlying blockdev's mapping.
         */
        do {
                if (block > last_block) {
                        /*
                         * mapped buffers outside i_size will occur, because
                         * this folio can be outside i_size when there is a
                         * truncate in progress.
                         */
                        /*
                         * The buffer was zeroed by block_write_full_folio()
                         */
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
                } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
                           buffer_dirty(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, 1);
                        if (err)
                                goto recover;
                        clear_buffer_delay(bh);
                        if (buffer_new(bh)) {
                                /* blockdev mappings never come here */
                                clear_buffer_new(bh);
                                clean_bdev_bh_alias(bh);
                        }
                }
                bh = bh->b_this_page;
                block++;
        } while (bh != head);

        /*
         * Second pass; bh is back at head here.  Note that "continue" in
         * this do/while jumps to the loop condition, which is what advances
         * bh to the next buffer.
         */
        do {
                if (!buffer_mapped(bh))
                        continue;
                /*
                 * If it's a fully non-blocking write attempt and we cannot
                 * lock the buffer then redirty the folio.  Note that this can
                 * potentially cause a busy-wait loop from writeback threads
                 * and kswapd activity, but those code paths have their own
                 * higher-level throttling.
                 */
                if (wbc->sync_mode != WB_SYNC_NONE) {
                        lock_buffer(bh);
                } else if (!trylock_buffer(bh)) {
                        folio_redirty_for_writepage(wbc, folio);
                        continue;
                }
                /* Dirty buffers stay locked and are marked for async write. */
                if (test_clear_buffer_dirty(bh)) {
                        mark_buffer_async_write_endio(bh,
                                end_buffer_async_write);
                } else {
                        unlock_buffer(bh);
                }
        } while ((bh = bh->b_this_page) != head);

        /*
         * The folio and its buffers are protected by the writeback flag,
         * so we can drop the bh refcounts early.
         */
        BUG_ON(folio_test_writeback(folio));
        folio_start_writeback(folio);

        /* Third pass: submit every buffer marked for async write. */
        do {
                /* Fetch the successor before the buffer is submitted. */
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
                                      inode->i_write_hint, wbc);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        folio_unlock(folio);

        err = 0;
done:
        if (nr_underway == 0) {
                /*
                 * The folio was marked dirty, but the buffers were
                 * clean.  Someone wrote them back by hand with
                 * write_dirty_buffer/submit_bh.  A rare case.
                 */
                folio_end_writeback(folio);

                /*
                 * The folio and buffer_heads can be released at any time from
                 * here on.
                 */
        }
        return err;

recover:
        /*
         * ENOSPC, or some other error.  We may already have added some
         * blocks to the file, so we need to write these out to avoid
         * exposing stale data.
         * The folio is currently locked and not marked for writeback
         */
        bh = head;
        /* Recovery: lock and submit the mapped buffers */
        do {
                if (buffer_mapped(bh) && buffer_dirty(bh) &&
                    !buffer_delay(bh)) {
                        lock_buffer(bh);
                        mark_buffer_async_write_endio(bh,
                                end_buffer_async_write);
                } else {
                        /*
                         * The buffer may have been set dirty during
                         * attachment to a dirty folio.
                         */
                        clear_buffer_dirty(bh);
                }
        } while ((bh = bh->b_this_page) != head);
        /* Record the failure on both the folio and its mapping. */
        folio_set_error(folio);
        BUG_ON(folio_test_writeback(folio));
        mapping_set_error(folio->mapping, err);
        folio_start_writeback(folio);
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        clear_buffer_dirty(bh);
                        submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
                                      inode->i_write_hint, wbc);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        folio_unlock(folio);
        /* err still holds the get_block() failure and is returned via done. */
        goto done;
}
EXPORT_SYMBOL(__block_write_full_folio);

/*
 * For every buffer of the locked @folio that is marked new and overlaps
 * the byte range [@from, @to): if the folio is not uptodate, zero the
 * overlapping part and mark the buffer uptodate; then clear the new bit
 * and mark the buffer dirty so it'll be written out (in order to prevent
 * uninitialised block data from leaking).
 */
void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
{
        struct buffer_head *first, *cur;
        size_t buf_start = 0;

        BUG_ON(!folio_test_locked(folio));
        first = folio_buffers(folio);
        if (!first)
                return;

        cur = first;
        do {
                size_t buf_end = buf_start + cur->b_size;

                if (buffer_new(cur) && buf_end > from && buf_start < to) {
                        if (!folio_test_uptodate(folio)) {
                                /* Zero only the part inside [from, to). */
                                folio_zero_segment(folio,
                                                   max(from, buf_start),
                                                   min(to, buf_end));
                                set_buffer_uptodate(cur);
                        }

                        clear_buffer_new(cur);
                        mark_buffer_dirty(cur);
                }

                buf_start = buf_end;
                cur = cur->b_this_page;
        } while (cur != first);
}
EXPORT_SYMBOL(folio_zero_new_buffers);

/*
 * Translate the iomap extent @iomap into buffer_head state for file block
 * @block of @inode.
 *
 * Returns 0 when the buffer state was set up, or -EIO when the extent ends
 * at or before the block, the extent type is unknown, or a block-device
 * inode would need a "new" buffer.
 */
static int
iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
                const struct iomap *iomap)
{
        const loff_t pos = (loff_t)block << inode->i_blkbits;

        bh->b_bdev = iomap->bdev;

        /*
         * @pos is the file offset that needs mapping; the extent covers
         * [iomap->offset, iomap->offset + iomap->length).  If it stops
         * before this block, leave the buffer unmapped and let the caller
         * deal with it.
         */
        if (pos >= iomap->offset + iomap->length)
                return -EIO;

        switch (iomap->type) {
        case IOMAP_HOLE:
        case IOMAP_DELALLOC:
                /*
                 * A buffer that is not uptodate, or that lies at or beyond
                 * the current EOF, is marked new so that sub-block zeroing
                 * is performed if necessary.
                 */
                if (!buffer_uptodate(bh) || pos >= i_size_read(inode))
                        set_buffer_new(bh);

                /* Holes stay unmapped; delalloc blocks are mapped+delayed. */
                if (iomap->type == IOMAP_DELALLOC) {
                        set_buffer_uptodate(bh);
                        set_buffer_mapped(bh);
                        set_buffer_delay(bh);
                }
                return 0;
        case IOMAP_UNWRITTEN:
                /*
                 * The parts of an unwritten block that are not being
                 * written must always be zeroed, so unconditionally flag
                 * the buffer as new.
                 */
                set_buffer_new(bh);
                set_buffer_unwritten(bh);
                fallthrough;
        case IOMAP_MAPPED:
                if ((iomap->flags & IOMAP_F_NEW) ||
                    pos >= i_size_read(inode)) {
                        /*
                         * For a block device this can only be a race with
                         * truncation: i_size updates on block devices are
                         * not synchronized by i_rwsem against the check in
                         * the caller.
                         */
                        if (S_ISBLK(inode->i_mode))
                                return -EIO;
                        set_buffer_new(bh);
                }
                bh->b_blocknr = (iomap->addr + pos - iomap->offset) >>
                                inode->i_blkbits;
                set_buffer_mapped(bh);
                return 0;
        default:
                WARN_ON_ONCE(1);
                return -EIO;
        }
}

/*
 * Prepare the buffers of a locked folio for a write of @len bytes at @pos.
 *
 * Buffers inside the byte range [from, to) that are not yet mapped are
 * mapped through @get_block when it is non-NULL, otherwise through
 * iomap_to_bh() using @iomap.  Newly allocated buffers get the bytes outside
 * the write range zeroed, and buffers that are only partially covered and
 * not uptodate are read from disk.
 *
 * Returns 0 on success, otherwise the error from the mapping callback or
 * -EIO if one of the reads failed.  On error, new buffers inside the range
 * are scrubbed with folio_zero_new_buffers().
 */
int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
                get_block_t *get_block, const struct iomap *iomap)
{
        size_t from = offset_in_folio(folio, pos);
        size_t to = from + len;
        struct inode *inode = folio->mapping->host;
        size_t block_start, block_end;
        sector_t block;
        int err = 0;
        size_t blocksize;
        /*
         * wait[] holds the buffers with a read in flight.  Two slots are
         * enough: a read is only issued for a block that overlaps the range
         * but sticks out of it, which can only be the first or last block.
         */
        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

        BUG_ON(!folio_test_locked(folio));
        BUG_ON(to > folio_size(folio));
        BUG_ON(from > to);

        head = folio_create_buffers(folio, inode, 0);
        blocksize = head->b_size;
        /* File block number of the folio's first buffer. */
        block = div_u64(folio_pos(folio), blocksize);

        /*
         * Walk the circular buffer list once.  block_start is 0 only on the
         * first pass, which is what lets the loop enter while bh == head and
         * terminate when the walk wraps around to head again.
         */
        for (bh = head, block_start = 0; bh != head || !block_start;
            block++, block_start=block_end, bh = bh->b_this_page) {
                block_end = block_start + blocksize;
                /* Buffer entirely outside the write range. */
                if (block_end <= from || block_start >= to) {
                        if (folio_test_uptodate(folio)) {
                                if (!buffer_uptodate(bh))
                                        set_buffer_uptodate(bh);
                        }
                        continue;
                }
                /* Drop a stale new flag before (re)mapping the buffer. */
                if (buffer_new(bh))
                        clear_buffer_new(bh);
                if (!buffer_mapped(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        if (get_block)
                                err = get_block(inode, block, bh, 1);
                        else
                                err = iomap_to_bh(inode, block, bh, iomap);
                        if (err)
                                break;

                        if (buffer_new(bh)) {
                                clean_bdev_bh_alias(bh);
                                /*
                                 * The folio already holds valid data for a
                                 * new block: mark the buffer uptodate and
                                 * dirty so that data gets written out.
                                 */
                                if (folio_test_uptodate(folio)) {
                                        clear_buffer_new(bh);
                                        set_buffer_uptodate(bh);
                                        mark_buffer_dirty(bh);
                                        continue;
                                }
                                /*
                                 * Zero the head and tail of the block that
                                 * the write will not cover.
                                 */
                                if (block_end > to || block_start < from)
                                        folio_zero_segments(folio,
                                                to, block_end,
                                                block_start, from);
                                continue;
                        }
                }
                if (folio_test_uptodate(folio)) {
                        if (!buffer_uptodate(bh))
                                set_buffer_uptodate(bh);
                        continue; 
                }
                /*
                 * A partially overwritten buffer with on-disk contents has
                 * to be read first; delayed and unwritten buffers are
                 * excluded from the read.
                 */
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                    !buffer_unwritten(bh) &&
                     (block_start < from || block_end > to)) {
                        bh_read_nowait(bh, 0);
                        *wait_bh++=bh;
                }
        }
        /*
         * If we issued read requests - let them complete.
         */
        while(wait_bh > wait) {
                wait_on_buffer(*--wait_bh);
                if (!buffer_uptodate(*wait_bh))
                        err = -EIO;
        }
        if (unlikely(err))
                folio_zero_new_buffers(folio, from, to);
        return err;
}

/*
 * Page-based entry point: resolve @page to its folio and prepare its
 * buffers for a write via @get_block (no iomap is supplied).
 */
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                get_block_t *get_block)
{
        struct folio *folio = page_folio(page);

        return __block_write_begin_int(folio, pos, len, get_block, NULL);
}
EXPORT_SYMBOL(__block_write_begin);

/*
 * Commit a write into the byte range [from, to) of @folio.
 *
 * Buffers overlapping the range are marked uptodate and dirty; the "new"
 * flag is cleared on every buffer of the folio.  If no buffer outside the
 * range is left non-uptodate, the whole folio is marked uptodate.
 *
 * A folio with no buffers attached is left untouched.
 */
static void __block_commit_write(struct folio *folio, size_t from, size_t to)
{
        size_t block_start, block_end;
        bool partial = false;
        unsigned blocksize;
        struct buffer_head *bh, *head;

        bh = head = folio_buffers(folio);
        /*
         * Nothing to commit without buffers.  Guard against a NULL head the
         * same way the other buffer walkers in this file do, instead of
         * dereferencing it.
         */
        if (!bh)
                return;
        blocksize = bh->b_size;

        block_start = 0;
        do {
                block_end = block_start + blocksize;
                if (block_end <= from || block_start >= to) {
                        /* Untouched buffer: remember if it is stale. */
                        if (!buffer_uptodate(bh))
                                partial = true;
                } else {
                        set_buffer_uptodate(bh);
                        mark_buffer_dirty(bh);
                }
                if (buffer_new(bh))
                        clear_buffer_new(bh);

                block_start = block_end;
                bh = bh->b_this_page;
        } while (bh != head);

        /*
         * If this is a partial write which happened to make all buffers
         * uptodate then we can optimize away a bogus read_folio() for
         * the next read(). Here we 'discover' whether the folio went
         * uptodate as a result of this (potentially partial) write.
         */
        if (!partial)
                folio_mark_uptodate(folio);
}

/*
 * block_write_begin - grab the page cache page for a write and prepare it.
 *
 * Handles block allocation and brings partially written blocks uptodate.
 * On success the locked page is returned through @pagep.  On failure from
 * the preparation step the page is unlocked and released and *@pagep is set
 * to NULL; if no page could be obtained, -ENOMEM is returned and *@pagep is
 * not written.
 *
 * The filesystem needs to handle block truncation upon failure.
 */
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
                struct page **pagep, get_block_t *get_block)
{
        struct page *page;
        int status;

        page = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT);
        if (!page)
                return -ENOMEM;

        status = __block_write_begin(page, pos, len, get_block);
        if (likely(!status)) {
                *pagep = page;
                return 0;
        }

        /* Preparation failed: drop the page again. */
        unlock_page(page);
        put_page(page);
        *pagep = NULL;
        return status;
}
EXPORT_SYMBOL(block_write_begin);

/*
 * Finish a buffered write of @copied (out of @len requested) bytes at @pos
 * into @page and return the number of bytes actually committed.
 */
int block_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        struct folio *folio = page_folio(page);
        size_t base = pos - folio_pos(folio);

        if (unlikely(copied < len)) {
                /*
                 * Short write.  Fully written buffers become uptodate, so
                 * a later read_folio cannot clobber them.  A buffer that
                 * was only partly written stays non-uptodate though, and a
                 * read_folio could then overwrite the partial data.
                 *
                 * Keep it simple: on a folio that is not uptodate, treat
                 * the short write as zero-length and make the caller redo
                 * the whole thing.
                 */
                if (!folio_test_uptodate(folio))
                        copied = 0;

                folio_zero_new_buffers(folio, base + copied, base + len);
        }
        flush_dcache_folio(folio);

        /* May be a short, or even empty, commit. */
        __block_commit_write(folio, base, base + copied);

        return copied;
}
EXPORT_SYMBOL(block_write_end);

/*
 * Generic ->write_end: commit the copied data, extend i_size if the write
 * went past it, release the page and return the number of bytes committed.
 */
int generic_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        struct inode *inode = mapping->host;
        const loff_t size_before = inode->i_size;
        bool grew = false;
        loff_t write_end;

        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
        write_end = pos + copied;

        /*
         * i_size is read directly rather than through i_size_read(): it
         * cannot change under us because we hold i_rwsem.
         *
         * The update must happen before the page lock is dropped, or page
         * writeout could come in and zero beyond i_size.
         */
        if (write_end > inode->i_size) {
                i_size_write(inode, write_end);
                grew = true;
        }

        unlock_page(page);
        put_page(page);

        if (size_before < pos)
                pagecache_isize_extended(inode, size_before, pos);

        /*
         * The inode is dirtied only after the page lock is released.
         * Doing it under the lock would lengthen the lock hold time for no
         * reason, and would force a page-lock -> transaction-start ordering
         * on journaling filesystems.
         */
        if (grew)
                mark_inode_dirty(inode);
        return copied;
}
EXPORT_SYMBOL(generic_write_end);

/*
 * block_is_partially_uptodate - test whether part of a folio is uptodate.
 *
 * Returns true if every buffer overlapping @count bytes starting at @from
 * (clamped to the end of the folio) is uptodate.  Returns false if the
 * folio has no buffers, or if the range reaches from the first block into
 * the last block of the folio.
 */
bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
        struct buffer_head *first = folio_buffers(folio);
        struct buffer_head *cur;
        unsigned blk_size, blk_start, blk_end;
        unsigned to;

        if (!first)
                return false;

        blk_size = first->b_size;
        to = from + min_t(unsigned, folio_size(folio) - from, count);
        if (from < blk_size && to > folio_size(folio) - blk_size)
                return false;

        cur = first;
        blk_start = 0;
        do {
                blk_end = blk_start + blk_size;
                if (blk_end > from && blk_start < to) {
                        if (!buffer_uptodate(cur))
                                return false;
                        /* Reached the last block of the range. */
                        if (blk_end >= to)
                                return true;
                }
                blk_start = blk_end;
                cur = cur->b_this_page;
        } while (cur != first);

        return true;
}
EXPORT_SYMBOL(block_is_partially_uptodate);

/*
 * Generic "read_folio" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
 * Reads the folio asynchronously --- the unlock_buffer() and
 * set/clear_buffer_uptodate() functions propagate buffer state into the
 * folio once IO has completed.
 */
int block_read_full_folio(struct folio *folio, get_block_t *get_block)
{
        struct inode *inode = folio->mapping->host;
        sector_t iblock, lblock;
        /* arr[] collects the buffers that need read I/O; nr counts them. */
        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
        size_t blocksize;
        int nr, i;
        int fully_mapped = 1;
        bool page_error = false;
        loff_t limit = i_size_read(inode);

        /* This is needed for ext4. */
        if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
                limit = inode->i_sb->s_maxbytes;

        VM_BUG_ON_FOLIO(folio_test_large(folio), folio);

        head = folio_create_buffers(folio, inode, 0);
        blocksize = head->b_size;

        /*
         * iblock: file block of the folio's first buffer.
         * lblock: number of blocks needed to cover @limit bytes (rounded
         * up); blocks at or beyond it are not passed to get_block().
         */
        iblock = div_u64(folio_pos(folio), blocksize);
        lblock = div_u64(limit + blocksize - 1, blocksize);
        bh = head;
        nr = 0;
        i = 0;

        /*
         * Stage one: map the buffers and pick the ones that need I/O.
         * Note that "continue" in a do/while jumps to the loop condition,
         * so i, iblock and bh are still advanced.
         */
        do {
                if (buffer_uptodate(bh))
                        continue;

                if (!buffer_mapped(bh)) {
                        int err = 0;

                        fully_mapped = 0;
                        if (iblock < lblock) {
                                WARN_ON(bh->b_size != blocksize);
                                err = get_block(inode, iblock, bh, 0);
                                if (err) {
                                        folio_set_error(folio);
                                        page_error = true;
                                }
                        }
                        /*
                         * Still unmapped (a hole, beyond the limit, or a
                         * mapping error): zero the block, and treat it as
                         * valid data only if no error occurred.
                         */
                        if (!buffer_mapped(bh)) {
                                folio_zero_range(folio, i * blocksize,
                                                blocksize);
                                if (!err)
                                        set_buffer_uptodate(bh);
                                continue;
                        }
                        /*
                         * get_block() might have updated the buffer
                         * synchronously
                         */
                        if (buffer_uptodate(bh))
                                continue;
                }
                arr[nr++] = bh;
        } while (i++, iblock++, (bh = bh->b_this_page) != head);

        if (fully_mapped)
                folio_set_mappedtodisk(folio);

        if (!nr) {
                /*
                 * All buffers are uptodate or get_block() returned an
                 * error when trying to map them - we can finish the read.
                 */
                folio_end_read(folio, !page_error);
                return 0;
        }

        /* Stage two: lock the buffers */
        for (i = 0; i < nr; i++) {
                bh = arr[i];
                lock_buffer(bh);
                mark_buffer_async_read(bh);
        }

        /*
         * Stage 3: start the IO.  Check for uptodateness
         * inside the buffer lock in case another process reading
         * the underlying blockdev brought it uptodate (the sct fix).
         */
        for (i = 0; i < nr; i++) {
                bh = arr[i];
                if (buffer_uptodate(bh))
                        end_buffer_async_read(bh, 1);
                else
                        submit_bh(REQ_OP_READ, bh);
        }
        return 0;
}
EXPORT_SYMBOL(block_read_full_folio);

/*
 * Utility function for filesystems that need to do work on expanding
 * truncates.  It issues a zero-length pagecache write at @size through the
 * filesystem's own write_begin/write_end operations, which lets the
 * filesystem deal with the hole.
 *
 * Returns 0 on success or the error reported by inode_newsize_ok(),
 * write_begin or write_end.
 */
int generic_cont_expand_simple(struct inode *inode, loff_t size)
{
        struct address_space *mapping = inode->i_mapping;
        const struct address_space_operations *aops = mapping->a_ops;
        struct page *page;
        void *fsdata = NULL;
        int err;

        err = inode_newsize_ok(inode, size);
        if (err)
                return err;

        err = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata);
        if (err)
                return err;

        /* A zero-length write must never report copied bytes. */
        err = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
        BUG_ON(err > 0);

        return err;
}
EXPORT_SYMBOL(generic_cont_expand_simple);

/*
 * Zero-fill the file between *@bytes and @pos, one page at a time, using
 * the address space's write_begin/write_end operations.
 *
 * Whenever the position being zeroed from is not block aligned, *@bytes is
 * rounded up to the next block boundary before the write is issued.
 *
 * Returns 0 on success, a negative error from write_begin/write_end, or
 * -EINTR if a fatal signal is pending.
 *
 * NOTE(review): the first loop re-reads *@bytes on every pass and only
 * terminates once it reaches the page of @pos; presumably the filesystem's
 * write_end advances *@bytes - that is not visible from this function.
 */
static int cont_expand_zero(struct file *file, struct address_space *mapping,
                            loff_t pos, loff_t *bytes)
{
        struct inode *inode = mapping->host;
        const struct address_space_operations *aops = mapping->a_ops;
        unsigned int blocksize = i_blocksize(inode);
        struct page *page;
        void *fsdata = NULL;
        pgoff_t index, curidx;
        loff_t curpos;
        unsigned zerofrom, offset, len;
        int err = 0;

        /* Page index and in-page offset of the target position. */
        index = pos >> PAGE_SHIFT;
        offset = pos & ~PAGE_MASK;

        /* Zero whole page tails while *bytes lies on a page before @pos. */
        while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
                zerofrom = curpos & ~PAGE_MASK;
                /* Not block aligned: round *bytes up to the block end. */
                if (zerofrom & (blocksize-1)) {
                        *bytes |= (blocksize-1);
                        (*bytes)++;
                }
                len = PAGE_SIZE - zerofrom;

                err = aops->write_begin(file, mapping, curpos, len,
                                            &page, &fsdata);
                if (err)
                        goto out;
                zero_user(page, zerofrom, len);
                err = aops->write_end(file, mapping, curpos, len, len,
                                                page, fsdata);
                if (err < 0)
                        goto out;
                /* A short write of zeroes is not expected here. */
                BUG_ON(err != len);
                err = 0;

                balance_dirty_pages_ratelimited(mapping);

                if (fatal_signal_pending(current)) {
                        err = -EINTR;
                        goto out;
                }
        }

        /* page covers the boundary, find the boundary offset */
        if (index == curidx) {
                zerofrom = curpos & ~PAGE_MASK;
                /* if we will expand the thing last block will be filled */
                if (offset <= zerofrom) {
                        goto out;
                }
                if (zerofrom & (blocksize-1)) {
                        *bytes |= (blocksize-1);
                        (*bytes)++;
                }
                /* Zero only up to the in-page offset of @pos. */
                len = offset - zerofrom;

                err = aops->write_begin(file, mapping, curpos, len,
                                            &page, &fsdata);
                if (err)
                        goto out;
                zero_user(page, zerofrom, len);
                err = aops->write_end(file, mapping, curpos, len, len,
                                                page, fsdata);
                if (err < 0)
                        goto out;
                BUG_ON(err != len);
                err = 0;
        }
out:
        return err;
}

/*
 * write_begin helper for filesystems that cannot represent holes in a
 * file.  The gap between *@bytes and @pos is zero-filled first (which may
 * extend the file), then the regular block_write_begin() path is taken.
 */
int cont_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len,
                        struct page **pagep, void **fsdata,
                        get_block_t *get_block, loff_t *bytes)
{
        const unsigned int block_mask = i_blocksize(mapping->host) - 1;
        unsigned int tail;
        int err;

        err = cont_expand_zero(file, mapping, pos, bytes);
        if (err)
                return err;

        /*
         * If the write goes past *bytes and *bytes is not block aligned,
         * round *bytes up to the next block boundary.
         */
        tail = *bytes & ~PAGE_MASK;
        if ((tail & block_mask) && pos + len > *bytes) {
                *bytes |= block_mask;
                (*bytes)++;
        }

        return block_write_begin(mapping, pos, len, pagep, get_block);
}
EXPORT_SYMBOL(cont_write_begin);

/*
 * Page-based wrapper: commit the byte range [from, to) of the folio that
 * contains @page.
 */
void block_commit_write(struct page *page, unsigned from, unsigned to)
{
        __block_commit_write(page_folio(page), from, to);
}
EXPORT_SYMBOL(block_commit_write);

/*
 * block_page_mkwrite() is not allowed to change the file size as it gets
 * called from a page fault handler when a page is first dirtied. Hence we must
 * be careful to check for EOF conditions here. We set the page up correctly
 * for a written page which means we get ENOSPC checking when writing into
 * holes and correct delalloc and unwritten extent mapping on filesystems that
 * support these features.
 *
 * We are not allowed to take the i_mutex here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF.  Because
 * truncate writes the inode size before removing pages, once we have the
 * page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
 *
 * Direct callers of this function should protect against filesystem freezing
 * using sb_start_pagefault() - sb_end_pagefault() functions.
 */
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
                         get_block_t get_block)
{
        struct folio *folio = page_folio(vmf->page);
        struct inode *inode = file_inode(vma->vm_file);
        unsigned long end;
        loff_t isize, start;
        int ret;

        folio_lock(folio);
        isize = i_size_read(inode);
        start = folio_pos(folio);

        if (folio->mapping != inode->i_mapping || start >= isize) {
                /* EFAULT is overloaded to mean "the page got truncated". */
                ret = -EFAULT;
        } else {
                /* The folio lies wholly or partially inside EOF. */
                end = folio_size(folio);
                if (start + end > isize)
                        end = isize - start;

                ret = __block_write_begin_int(folio, 0, end, get_block, NULL);
        }

        if (unlikely(ret)) {
                folio_unlock(folio);
                return ret;
        }

        __block_commit_write(folio, 0, end);

        /* On success the folio is returned dirty and still locked. */
        folio_mark_dirty(folio);
        folio_wait_stable(folio);
        return 0;
}
EXPORT_SYMBOL(block_page_mkwrite);

/*
 * Zero the part of the block containing @from that lies at or after @from.
 *
 * If @from is block aligned there is nothing to do.  Otherwise the folio
 * covering @from is grabbed, the buffer containing @from is located and
 * mapped with @get_block if needed; if it is a hole nothing is zeroed.  A
 * mapped buffer is read in when it is not uptodate (unless it is delayed or
 * unwritten), the tail of the block is zeroed and the buffer marked dirty.
 *
 * Returns 0 on success (including the nothing-to-do cases), or a negative
 * error from filemap_grab_folio(), @get_block or the read.
 */
int block_truncate_page(struct address_space *mapping,
                        loff_t from, get_block_t *get_block)
{
        pgoff_t index = from >> PAGE_SHIFT;
        unsigned blocksize;
        sector_t iblock;
        size_t offset, length, pos;
        struct inode *inode = mapping->host;
        struct folio *folio;
        struct buffer_head *bh;
        int err = 0;

        blocksize = i_blocksize(inode);
        length = from & (blocksize - 1);

        /* Block boundary? Nothing to do */
        if (!length)
                return 0;

        /* Number of bytes from @from to the end of its block. */
        length = blocksize - length;

        folio = filemap_grab_folio(mapping, index);
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        bh = folio_buffers(folio);
        if (!bh)
                bh = create_empty_buffers(folio, blocksize, 0);

        /*
         * The walk below starts at the folio's head buffer and measures
         * the distance with offset_in_folio(), so the starting block number
         * must be that of the folio's first byte.  Deriving it from @index
         * would be wrong whenever the folio starts before @index (i.e. for
         * a multi-page folio); for a single-page folio both are the same.
         */
        iblock = folio_pos(folio) >> inode->i_blkbits;

        /* Find the buffer that contains "offset" */
        offset = offset_in_folio(folio, from);
        pos = blocksize;
        while (offset >= pos) {
                bh = bh->b_this_page;
                iblock++;
                pos += blocksize;
        }

        if (!buffer_mapped(bh)) {
                WARN_ON(bh->b_size != blocksize);
                err = get_block(inode, iblock, bh, 0);
                if (err)
                        goto unlock;
                /* unmapped? It's a hole - nothing to do */
                if (!buffer_mapped(bh))
                        goto unlock;
        }

        /* Ok, it's mapped. Make sure it's up-to-date */
        if (folio_test_uptodate(folio))
                set_buffer_uptodate(bh);

        if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
                err = bh_read(bh, 0);
                /* Uhhuh. Read error. Complain and punt. */
                if (err < 0)
                        goto unlock;
        }

        folio_zero_range(folio, offset, length);
        mark_buffer_dirty(bh);

unlock:
        folio_unlock(folio);
        folio_put(folio);

        return err;
}
EXPORT_SYMBOL(block_truncate_page);

/*
 * The generic ->writepage function for buffer-backed address_spaces.
 *
 * A folio entirely beyond i_size is just unlocked and 0 is returned.  A
 * folio straddling i_size has its part beyond i_size zeroed before being
 * written.  Everything else is written as is.
 */
int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
                void *get_block)
{
        struct inode * const inode = folio->mapping->host;
        loff_t i_size = i_size_read(inode);
        loff_t start = folio_pos(folio);

        /* Does the folio reach beyond i_size? */
        if (start + folio_size(folio) > i_size) {
                /* Entirely outside i_size (truncate in progress). */
                if (start >= i_size) {
                        folio_unlock(folio);
                        return 0; /* don't care */
                }

                /*
                 * The folio straddles i_size.  It must be zeroed out on
                 * each and every writepage invocation because it may be
                 * mmapped.  "A file is mapped in multiples of the page
                 * size.  For a file that is not a multiple of the page
                 * size, the remaining memory is zeroed when mapped, and
                 * writes to that region are not written out to the file."
                 */
                folio_zero_segment(folio, offset_in_folio(folio, i_size),
                                folio_size(folio));
        }

        return __block_write_full_folio(inode, folio, get_block, wbc);
}

/*
 * Look up the block number that file block @block maps to by running
 * @get_block (without the create flag) on a throwaway on-stack buffer_head.
 * The return value of @get_block is ignored; b_blocknr starts out as zero.
 */
sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
                            get_block_t *get_block)
{
        struct inode *host = mapping->host;
        struct buffer_head probe = {
                .b_size = i_blocksize(host),
        };

        get_block(host, block, &probe, 0);

        return probe.b_blocknr;
}
EXPORT_SYMBOL(generic_block_bmap);

/*
 * bio completion handler for buffer_head I/O: propagate BIO_QUIET to the
 * buffer, call the buffer's own completion routine with the I/O outcome,
 * and drop the bio.
 */
static void end_bio_bh_io_sync(struct bio *bio)
{
        struct buffer_head *bh = bio->bi_private;
        const int uptodate = !bio->bi_status;

        if (unlikely(bio_flagged(bio, BIO_QUIET)))
                set_bit(BH_Quiet, &bh->b_state);

        bh->b_end_io(bh, uptodate);
        bio_put(bio);
}

/*
 * Build a one-segment bio for @bh and submit it.
 *
 * @opf:        request operation and flags; REQ_META / REQ_PRIO are added
 *              when the buffer carries the corresponding flags.
 * @bh:         buffer to submit; must be locked, mapped, have b_end_io set
 *              and be neither delayed nor unwritten.
 * @write_hint: stored in the bio's bi_write_hint.
 * @wbc:        optional writeback control; when non-NULL the bio is
 *              initialised from it and the I/O is accounted to it.
 */
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
                          enum rw_hint write_hint,
                          struct writeback_control *wbc)
{
        const enum req_op op = opf & REQ_OP_MASK;
        struct bio *bio;

        BUG_ON(!buffer_locked(bh));
        BUG_ON(!buffer_mapped(bh));
        BUG_ON(!bh->b_end_io);
        BUG_ON(buffer_delay(bh));
        BUG_ON(buffer_unwritten(bh));

        /*
         * Only clear out a write error when rewriting
         */
        if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
                clear_buffer_write_io_error(bh);

        if (buffer_meta(bh))
                opf |= REQ_META;
        if (buffer_prio(bh))
                opf |= REQ_PRIO;

        bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);

        fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);

        /* Convert the block number into 512-byte sector units. */
        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio->bi_write_hint = write_hint;

        __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));

        /* Completion is routed back to the buffer via bi_private. */
        bio->bi_end_io = end_bio_bh_io_sync;
        bio->bi_private = bh;

        /* Take care of bh's that straddle the end of the device */
        guard_bio_eod(bio);

        if (wbc) {
                wbc_init_bio(wbc, bio);
                wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
        }

        submit_bio(bio);
}

/*
 * Submit @bh for I/O with operation/flags @opf, without a writeback
 * context and with no write lifetime hint set.
 */
void submit_bh(blk_opf_t opf, struct buffer_head *bh)
{
        struct writeback_control *no_wbc = NULL;

        submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, no_wbc);
}
EXPORT_SYMBOL(submit_bh);

/*
 * Start a write of @bh if it is dirty.  The buffer is locked first; a clean
 * buffer is simply unlocked again.  For a dirty buffer the dirty bit is
 * cleared, a reference is taken and the write is submitted with
 * end_buffer_write_sync as completion handler; this function does not wait
 * for the I/O.
 */
void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
{
        lock_buffer(bh);

        if (test_clear_buffer_dirty(bh)) {
                bh->b_end_io = end_buffer_write_sync;
                get_bh(bh);
                submit_bh(REQ_OP_WRITE | op_flags, bh);
                return;
        }

        unlock_buffer(bh);
}
EXPORT_SYMBOL(write_dirty_buffer);

/*
 * For a data-integrity writeout, we need to wait upon any in-progress I/O
 * and then start new I/O and then wait upon it.  The caller must have a ref on
 * the buffer_head.
 */
int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
{
        WARN_ON(atomic_read(&bh->b_count) < 1);
        lock_buffer(bh);

        /* Clean buffer: nothing to write. */
        if (!test_clear_buffer_dirty(bh)) {
                unlock_buffer(bh);
                return 0;
        }

        /*
         * A dirty buffer should be mapped, but may not be if the device
         * was hot-removed.  All we can do then is fail the I/O.
         */
        if (!buffer_mapped(bh)) {
                unlock_buffer(bh);
                return -EIO;
        }

        get_bh(bh);
        bh->b_end_io = end_buffer_write_sync;
        submit_bh(REQ_OP_WRITE | op_flags, bh);
        wait_on_buffer(bh);

        return buffer_uptodate(bh) ? 0 : -EIO;
}
EXPORT_SYMBOL(__sync_dirty_buffer);

int sync_dirty_buffer(struct buffer_head *bh)
{
        return __sync_dirty_buffer(bh, REQ_SYNC);
}
EXPORT_SYMBOL(sync_dirty_buffer);

/*
 * Non-zero if @bh is in use: it has references, or is dirty or locked.
 */
static inline int buffer_busy(struct buffer_head *bh)
{
        const unsigned long busy_bits = (1 << BH_Dirty) | (1 << BH_Lock);

        return atomic_read(&bh->b_count) | (bh->b_state & busy_bits);
}

/*
 * Detach all buffers from @folio if none of them is busy.
 *
 * On success the buffers are removed from their association queues, the
 * list head is handed back through @buffers_to_free for the caller to free,
 * and true is returned.  If any buffer is busy nothing is changed and false
 * is returned.
 */
static bool
drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
{
        struct buffer_head *first = folio_buffers(folio);
        struct buffer_head *cur = first;

        /* Pass one: bail out if any buffer is still in use. */
        do {
                if (buffer_busy(cur))
                        return false;
                cur = cur->b_this_page;
        } while (cur != first);

        /* Pass two: unhook each buffer from its association queue. */
        do {
                struct buffer_head *following = cur->b_this_page;

                if (cur->b_assoc_map)
                        __remove_assoc_queue(cur);
                cur = following;
        } while (cur != first);

        *buffers_to_free = first;
        folio_detach_private(folio);
        return true;
}

/**
 * try_to_free_buffers - Release buffers attached to this folio.
 * @folio: The folio.
 *
 * If any buffers are in use (dirty, under writeback, elevated refcount),
 * no buffers will be freed.
 *
 * If the folio is dirty but all the buffers are clean then we need to
 * be sure to mark the folio clean as well.  This is because the folio
 * may be against a block device, and a later reattachment of buffers
 * to a dirty folio will set *all* buffers dirty.  Which would corrupt
 * filesystem data on the same device.
 *
 * The same applies to regular filesystem folios: if all the buffers are
 * clean then we set the folio clean and proceed.  To do that, we require
 * total exclusion from block_dirty_folio().  That is obtained with
 * i_private_lock.
 *
 * Exclusion against try_to_free_buffers may be obtained by either
 * locking the folio or by holding its mapping's i_private_lock.
 *
 * Context: Process context.  @folio must be locked.  Will not sleep.
 * Return: true if all buffers attached to this folio were freed.
 */
bool try_to_free_buffers(struct folio *folio)
{
        struct address_space * const mapping = folio->mapping;
        struct buffer_head *ring = NULL;
        struct buffer_head *bh;
        bool freed;

        BUG_ON(!folio_test_locked(folio));
        if (folio_test_writeback(folio))
                return false;

        if (!mapping) {
                /* can this still happen? */
                freed = drop_buffers(folio, &ring);
        } else {
                spin_lock(&mapping->i_private_lock);
                freed = drop_buffers(folio, &ring);

                /*
                 * A filesystem that writes its buffers by hand (eg ext3)
                 * can leave clean buffers against a dirty folio.  Clean
                 * the folio here; otherwise the VM will never notice that
                 * the filesystem did any IO at all.
                 *
                 * Likewise, during truncate, discard_buffer will have
                 * marked all the folio's buffers clean.  That is
                 * discovered here and the folio is cleaned as well.
                 *
                 * i_private_lock must be held over this entire operation
                 * in order to synchronise against block_dirty_folio and
                 * prevent the dirty bit from being lost.
                 */
                if (freed)
                        folio_cancel_dirty(folio);
                spin_unlock(&mapping->i_private_lock);
        }

        /* Nothing was detached, so there is nothing to release. */
        if (!ring)
                return freed;

        /* Release the detached ring after the lock has been dropped. */
        bh = ring;
        do {
                struct buffer_head *following = bh->b_this_page;

                free_buffer_head(bh);
                bh = following;
        } while (bh != ring);

        return freed;
}
EXPORT_SYMBOL(try_to_free_buffers);

/*
 * Buffer-head allocation
 */
/* Slab cache used by alloc_buffer_head() and free_buffer_head(). */
static struct kmem_cache *bh_cachep __ro_after_init;

/*
 * Once the number of bh's in the machine exceeds this level, we start
 * stripping them in writeback.
 */
static unsigned long max_buffer_heads __ro_after_init;

/*
 * Set by recalc_bh_state(): nonzero when the summed per-CPU bh counts
 * exceeded max_buffer_heads at the last recount.
 */
int buffer_heads_over_limit;

/*
 * Per-CPU buffer_head bookkeeping.  @nr is adjusted by alloc_buffer_head()
 * and free_buffer_head(); @ratelimit is bumped on each recalc_bh_state()
 * call and reset when a recount is actually performed.
 */
struct bh_accounting {
        int nr;                        /* Number of live bh's */
        int ratelimit;                /* Limit cacheline bouncing */
};

/* One bh_accounting instance per CPU, starting out zeroed. */
static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};

static void recalc_bh_state(void)
{
        int i;
        int tot = 0;

        if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
                return;
        __this_cpu_write(bh_accounting.ratelimit, 0);
        for_each_online_cpu(i)
                tot += per_cpu(bh_accounting, i).nr;
        buffer_heads_over_limit = (tot > max_buffer_heads);
}

/*
 * Allocate a zeroed buffer_head from bh_cachep using @gfp_flags.
 *
 * Returns the new buffer_head with its b_assoc_buffers list and
 * b_uptodate_lock initialised, or NULL if the slab allocation failed.
 */
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
{
        struct buffer_head *bh = kmem_cache_zalloc(bh_cachep, gfp_flags);

        if (!bh)
                return NULL;

        INIT_LIST_HEAD(&bh->b_assoc_buffers);
        spin_lock_init(&bh->b_uptodate_lock);

        /* Per-CPU accounting is updated with preemption disabled. */
        preempt_disable();
        __this_cpu_inc(bh_accounting.nr);
        recalc_bh_state();
        preempt_enable();

        return bh;
}
EXPORT_SYMBOL(alloc_buffer_head);

/*
 * Return @bh to bh_cachep and decrement this CPU's live-bh count.
 *
 * It is a BUG for @bh to still be linked on a b_assoc_buffers list.
 */
void free_buffer_head(struct buffer_head *bh)
{
        BUG_ON(!list_empty(&bh->b_assoc_buffers));
        kmem_cache_free(bh_cachep, bh);
        /* Per-CPU accounting is updated with preemption disabled. */
        preempt_disable();
        __this_cpu_dec(bh_accounting.nr);
        recalc_bh_state();
        preempt_enable();
}
EXPORT_SYMBOL(free_buffer_head);

static int buffer_exit_cpu_dead(unsigned int cpu)
{
        int i;
        struct bh_lru *b = &per_cpu(bh_lrus, cpu);

        for (i = 0; i < BH_LRU_SIZE; i++) {
                brelse(b->bhs[i]);
                b->bhs[i] = NULL;
        }
        this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
        per_cpu(bh_accounting, cpu).nr = 0;
        return 0;
}

/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date and false,
 * with the buffer locked, if not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
        /* Fast path: already up-to-date, no locking needed. */
        if (buffer_uptodate(bh))
                return 1;

        lock_buffer(bh);

        /* Still stale under the lock: hand the locked buffer to the caller. */
        if (!buffer_uptodate(bh))
                return 0;

        /* It became up-to-date while we waited for the lock. */
        unlock_buffer(bh);
        return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);

/**
 * __bh_read - Submit read for a locked buffer
 * @bh: struct buffer_head
 * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
 * @wait: wait until the read finishes
 *
 * Returns zero on success or when @wait is false, and -EIO if the buffer is
 * not up-to-date after waiting.
 */
int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
{
        BUG_ON(!buffer_locked(bh));

        get_bh(bh);
        bh->b_end_io = end_buffer_read_sync;
        submit_bh(REQ_OP_READ | op_flags, bh);

        /* Fire-and-forget: the caller did not ask to wait for completion. */
        if (!wait)
                return 0;

        wait_on_buffer(bh);
        return buffer_uptodate(bh) ? 0 : -EIO;
}
EXPORT_SYMBOL(__bh_read);

/**
 * __bh_read_batch - Submit read for a batch of unlocked buffers
 * @nr: number of entries in the buffer batch
 * @bhs: a batch of struct buffer_head
 * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
 * @force_lock: force to get a lock on the buffer if set, otherwise drop any
 *              buffer that cannot be locked.
 *
 * Buffers that are already up-to-date are skipped.  This function does not
 * wait for the submitted reads to complete and returns no value.
 */
void __bh_read_batch(int nr, struct buffer_head *bhs[],
                     blk_opf_t op_flags, bool force_lock)
{
        int idx;

        for (idx = 0; idx < nr; idx++) {
                struct buffer_head *cur = bhs[idx];

                /* Nothing to read for a buffer that is already up-to-date. */
                if (buffer_uptodate(cur))
                        continue;

                if (!force_lock) {
                        /* Skip buffers that cannot be locked right now. */
                        if (!trylock_buffer(cur))
                                continue;
                } else {
                        lock_buffer(cur);
                }

                /* Re-check now that the lock is held. */
                if (buffer_uptodate(cur)) {
                        unlock_buffer(cur);
                        continue;
                }

                cur->b_end_io = end_buffer_read_sync;
                get_bh(cur);
                submit_bh(REQ_OP_READ | op_flags, cur);
        }
}
EXPORT_SYMBOL(__bh_read_batch);

/*
 * Boot-time setup: create the buffer_head slab cache, derive
 * max_buffer_heads, and register buffer_exit_cpu_dead() as the
 * CPUHP_FS_BUFF_DEAD hotplug callback.
 */
void __init buffer_init(void)
{
        unsigned long bh_per_page = PAGE_SIZE / sizeof(struct buffer_head);
        unsigned long budget_pages;
        int err;

        bh_cachep = KMEM_CACHE(buffer_head,
                                SLAB_RECLAIM_ACCOUNT|SLAB_PANIC);

        /*
         * Limit the bh occupancy to 10% of ZONE_NORMAL
         */
        budget_pages = (nr_free_buffer_pages() * 10) / 100;
        max_buffer_heads = budget_pages * bh_per_page;

        err = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
                                        NULL, buffer_exit_cpu_dead);
        WARN_ON(err < 0);
}


















































    1 





    1 





























    2 


    2 





    1 


    1 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
// SPDX-License-Identifier: GPL-2.0
/*
 * lib/minmax.c: windowed min/max tracker
 *
 * Kathleen Nichols' algorithm for tracking the minimum (or maximum)
 * value of a data stream over some fixed time interval.  (E.g.,
 * the minimum RTT over the past five minutes.) It uses constant
 * space and constant time per update yet almost always delivers
 * the same minimum as an implementation that has to keep all the
 * data in the window.
 *
 * The algorithm keeps track of the best, 2nd best & 3rd best min
 * values, maintaining an invariant that the measurement time of
 * the n'th best >= n-1'th best. It also makes sure that the three
 * values are widely separated in the time window since that bounds
 * the worst-case error when that data is monotonically increasing
 * over the window.
 *
 * Upon getting a new min, we can forget everything earlier because
 * it has no value - the new min is <= everything else in the window
 * by definition and it's the most recent. So we restart fresh on
 * every new min and overwrite the 2nd & 3rd choices. The same property
 * holds for 2nd & 3rd best.
 */
#include <linux/module.h>
#include <linux/win_minmax.h>

/*
 * As time advances, update the 1st, 2nd, and 3rd choices.
 *
 * @m:   tracker whose three samples s[0..2] are aged in place
 * @win: window length, in the same units as the sample timestamps
 * @val: the newest sample
 *
 * Returns the value of the best sample (m->s[0].v) after the update.
 */
static u32 minmax_subwin_update(struct minmax *m, u32 win,
                                const struct minmax_sample *val)
{
        /* Age of the current best sample relative to the new one. */
        u32 dt = val->t - m->s[0].t;

        if (unlikely(dt > win)) {
                /*
                 * Passed entire window without a new val so make 2nd
                 * choice the new val & 3rd choice the new 2nd choice.
                 * we may have to iterate this since our 2nd choice
                 * may also be outside the window (we checked on entry
                 * that the third choice was in the window).
                 */
                m->s[0] = m->s[1];
                m->s[1] = m->s[2];
                m->s[2] = *val;
                /* The promoted sample may be too old as well: shift again. */
                if (unlikely(val->t - m->s[0].t > win)) {
                        m->s[0] = m->s[1];
                        m->s[1] = m->s[2];
                        m->s[2] = *val;
                }
        } else if (unlikely(m->s[1].t == m->s[0].t) && dt > win/4) {
                /*
                 * We've passed a quarter of the window without a new val
                 * so take a 2nd choice from the 2nd quarter of the window.
                 */
                m->s[2] = m->s[1] = *val;
        } else if (unlikely(m->s[2].t == m->s[1].t) && dt > win/2) {
                /*
                 * We've passed half the window without finding a new val
                 * so take a 3rd choice from the last half of the window
                 */
                m->s[2] = *val;
        }
        return m->s[0].v;
}

/*
 * Feed measurement @meas taken at time @t into the windowed-max tracker @m
 * (window length @win) and return the resulting best value.
 */
u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas)
{
        const struct minmax_sample sample = { .t = t, .v = meas };

        /* Found a new max: forget earlier samples. */
        if (unlikely(sample.v >= m->s[0].v))
                return minmax_reset(m, t, meas);

        /* Nothing left in the window: forget earlier samples as well. */
        if (unlikely(sample.t - m->s[2].t > win))
                return minmax_reset(m, t, meas);

        if (unlikely(sample.v >= m->s[1].v)) {
                /* New 2nd choice, which also replaces the 3rd. */
                m->s[1] = sample;
                m->s[2] = sample;
        } else if (unlikely(sample.v >= m->s[2].v)) {
                m->s[2] = sample;
        }

        return minmax_subwin_update(m, win, &sample);
}
EXPORT_SYMBOL(minmax_running_max);

/*
 * Feed measurement @meas taken at time @t into the windowed-min tracker @m
 * (window length @win) and return the resulting best value.
 */
u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas)
{
        const struct minmax_sample sample = { .t = t, .v = meas };

        /* Found a new min: forget earlier samples. */
        if (unlikely(sample.v <= m->s[0].v))
                return minmax_reset(m, t, meas);

        /* Nothing left in the window: forget earlier samples as well. */
        if (unlikely(sample.t - m->s[2].t > win))
                return minmax_reset(m, t, meas);

        if (unlikely(sample.v <= m->s[1].v)) {
                /* New 2nd choice, which also replaces the 3rd. */
                m->s[1] = sample;
                m->s[2] = sample;
        } else if (unlikely(sample.v <= m->s[2].v)) {
                m->s[2] = sample;
        }

        return minmax_subwin_update(m, win, &sample);
}












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






    1 



    1 



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2011 Instituto Nokia de Tecnologia
 *
 * Authors:
 *    Lauro Ramos Venancio <lauro.venancio@openbossa.org>
 *    Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
 *
 * Vendor commands implementation based on net/wireless/nl80211.c
 * which is:
 *
 * Copyright 2006-2010        Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__

#include <net/genetlink.h>
#include <linux/nfc.h>
#include <linux/slab.h>

#include "nfc.h"
#include "llcp.h"

/*
 * Multicast groups of the NFC generic netlink family.  There is a single
 * group, named NFC_GENL_MCAST_EVENT_NAME.
 */
static const struct genl_multicast_group nfc_genl_mcgrps[] = {
        { .name = NFC_GENL_MCAST_EVENT_NAME, },
};

/* Declared here so that the functions below can reference the family. */
static struct genl_family nfc_genl_family;
/*
 * Netlink attribute policy for NFC_ATTR_* attributes, indexed by attribute
 * id.  Each entry gives the expected attribute type and, for the string
 * attributes, a maximum length.
 */
static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = {
        [NFC_ATTR_DEVICE_INDEX] = { .type = NLA_U32 },
        [NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING,
                                .len = NFC_DEVICE_NAME_MAXSIZE },
        [NFC_ATTR_PROTOCOLS] = { .type = NLA_U32 },
        [NFC_ATTR_TARGET_INDEX] = { .type = NLA_U32 },
        [NFC_ATTR_COMM_MODE] = { .type = NLA_U8 },
        [NFC_ATTR_RF_MODE] = { .type = NLA_U8 },
        [NFC_ATTR_DEVICE_POWERED] = { .type = NLA_U8 },
        [NFC_ATTR_IM_PROTOCOLS] = { .type = NLA_U32 },
        [NFC_ATTR_TM_PROTOCOLS] = { .type = NLA_U32 },
        [NFC_ATTR_LLC_PARAM_LTO] = { .type = NLA_U8 },
        [NFC_ATTR_LLC_PARAM_RW] = { .type = NLA_U8 },
        [NFC_ATTR_LLC_PARAM_MIUX] = { .type = NLA_U16 },
        [NFC_ATTR_LLC_SDP] = { .type = NLA_NESTED },
        [NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING,
                                     .len = NFC_FIRMWARE_NAME_MAXSIZE },
        [NFC_ATTR_SE_INDEX] = { .type = NLA_U32 },
        [NFC_ATTR_SE_APDU] = { .type = NLA_BINARY },
        [NFC_ATTR_VENDOR_ID] = { .type = NLA_U32 },
        [NFC_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 },
        [NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY },

};

/*
 * Netlink attribute policy for NFC_SDP_ATTR_* attributes.
 * NOTE(review): the URI limit of U8_MAX - 4 presumably leaves room for
 * per-TLV overhead -- confirm against the LLCP code.
 */
static const struct nla_policy nfc_sdp_genl_policy[NFC_SDP_ATTR_MAX + 1] = {
        [NFC_SDP_ATTR_URI] = { .type = NLA_STRING,
                               .len = U8_MAX - 4 },
        [NFC_SDP_ATTR_SAP] = { .type = NLA_U8 },
};

/*
 * Append one NFC_CMD_GET_TARGET message describing @target to the dump
 * buffer @msg.
 *
 * Returns 0 on success.  Returns -EMSGSIZE if the header or any attribute
 * could not be added; a message that was started is cancelled first.
 */
static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target,
                                struct netlink_callback *cb, int flags)
{
        void *hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid,
                                cb->nlh->nlmsg_seq, &nfc_genl_family, flags,
                                NFC_CMD_GET_TARGET);

        if (!hdr)
                return -EMSGSIZE;

        genl_dump_check_consistent(cb, hdr);

        /* Attributes emitted for every target. */
        if (nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target->idx))
                goto cancel;
        if (nla_put_u32(msg, NFC_ATTR_PROTOCOLS, target->supported_protocols))
                goto cancel;
        if (nla_put_u16(msg, NFC_ATTR_TARGET_SENS_RES, target->sens_res))
                goto cancel;
        if (nla_put_u8(msg, NFC_ATTR_TARGET_SEL_RES, target->sel_res))
                goto cancel;

        /* Variable-length fields are emitted only when non-empty. */
        if (target->nfcid1_len > 0) {
                if (nla_put(msg, NFC_ATTR_TARGET_NFCID1, target->nfcid1_len,
                            target->nfcid1))
                        goto cancel;
        }
        if (target->sensb_res_len > 0) {
                if (nla_put(msg, NFC_ATTR_TARGET_SENSB_RES,
                            target->sensb_res_len, target->sensb_res))
                        goto cancel;
        }
        if (target->sensf_res_len > 0) {
                if (nla_put(msg, NFC_ATTR_TARGET_SENSF_RES,
                            target->sensf_res_len, target->sensf_res))
                        goto cancel;
        }

        if (target->is_iso15693) {
                if (nla_put_u8(msg, NFC_ATTR_TARGET_ISO15693_DSFID,
                               target->iso15693_dsfid))
                        goto cancel;
                if (nla_put(msg, NFC_ATTR_TARGET_ISO15693_UID,
                            sizeof(target->iso15693_uid), target->iso15693_uid))
                        goto cancel;
        }

        genlmsg_end(msg, hdr);
        return 0;

cancel:
        genlmsg_cancel(msg, hdr);
        return -EMSGSIZE;
}

/*
 * Look up the NFC device named by the NFC_ATTR_DEVICE_INDEX attribute of
 * the dump request behind @cb.
 *
 * Returns the device obtained from nfc_get_device(), ERR_PTR(-EINVAL) if
 * the attribute is absent, or ERR_PTR(-ENODEV) if no device has that index.
 */
static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb)
{
        const struct genl_dumpit_info *info = genl_dumpit_info(cb);
        struct nfc_dev *found;

        if (!info->info.attrs[NFC_ATTR_DEVICE_INDEX])
                return ERR_PTR(-EINVAL);

        found = nfc_get_device(
                nla_get_u32(info->info.attrs[NFC_ATTR_DEVICE_INDEX]));
        if (found)
                return found;

        return ERR_PTR(-ENODEV);
}

/*
 * Dump callback that emits one message per target of a device.
 *
 * cb->args[0] holds the index of the next target to send and cb->args[1]
 * caches the device pointer between invocations; the device is looked up
 * on the first call.  Returns skb->len, or the lookup error code.
 */
static int nfc_genl_dump_targets(struct sk_buff *skb,
                                 struct netlink_callback *cb)
{
        struct nfc_dev *dev = (struct nfc_dev *) cb->args[1];
        int next = cb->args[0];

        if (!dev) {
                dev = __get_device_from_cb(cb);
                if (IS_ERR(dev))
                        return PTR_ERR(dev);

                cb->args[1] = (long) dev;
        }

        device_lock(&dev->dev);

        cb->seq = dev->targets_generation;

        /* Stop at the first target that fails to serialise into skb. */
        for (; next < dev->n_targets; next++) {
                if (nfc_genl_send_target(skb, &dev->targets[next], cb,
                                         NLM_F_MULTI) < 0)
                        break;
        }

        device_unlock(&dev->dev);

        cb->args[0] = next;

        return skb->len;
}

/*
 * End-of-dump callback: release the device pointer that
 * nfc_genl_dump_targets() cached in cb->args[1], if any.  Always returns 0.
 */
static int nfc_genl_dump_targets_done(struct netlink_callback *cb)
{
        struct nfc_dev *dev = (struct nfc_dev *) cb->args[1];

        if (dev)
                nfc_put_device(dev);

        return 0;
}

/*
 * Multicast an NFC_EVENT_TARGETS_FOUND message carrying @dev's index.
 *
 * Clears dev->genl_data.poll_req_portid first.  Returns the result of
 * genlmsg_multicast(), -ENOMEM if the message could not be allocated, or
 * -EMSGSIZE if it could not be built.
 */
int nfc_genl_targets_found(struct nfc_dev *dev)
{
        struct sk_buff *skb;
        void *head;

        dev->genl_data.poll_req_portid = 0;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
        if (!skb)
                return -ENOMEM;

        head = genlmsg_put(skb, 0, 0, &nfc_genl_family, 0,
                           NFC_EVENT_TARGETS_FOUND);
        if (!head)
                goto err_free;

        if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx))
                goto err_free;

        genlmsg_end(skb, head);

        return genlmsg_multicast(&nfc_genl_family, skb, 0, 0, GFP_ATOMIC);

err_free:
        nlmsg_free(skb);
        return -EMSGSIZE;
}

/*
 * Multicast an NFC_EVENT_TARGET_LOST message carrying @dev's name and
 * @target_idx.
 *
 * Returns 0 once the message has been handed to genlmsg_multicast() (whose
 * result is not checked), -ENOMEM if the message could not be allocated,
 * or -EMSGSIZE if it could not be built.
 */
int nfc_genl_target_lost(struct nfc_dev *dev, u32 target_idx)
{
        struct sk_buff *skb;
        void *head;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb)
                return -ENOMEM;

        head = genlmsg_put(skb, 0, 0, &nfc_genl_family, 0,
                           NFC_EVENT_TARGET_LOST);
        if (!head)
                goto err_free;

        if (nla_put_string(skb, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)))
                goto err_free;
        if (nla_put_u32(skb, NFC_ATTR_TARGET_INDEX, target_idx))
                goto err_free;

        genlmsg_end(skb, head);

        genlmsg_multicast(&nfc_genl_family, skb, 0, 0, GFP_KERNEL);

        return 0;

err_free:
        nlmsg_free(skb);
        return -EMSGSIZE;
}

/*
 * Multicast an NFC_EVENT_TM_ACTIVATED message carrying @dev's index and
 * @protocol (as NFC_ATTR_TM_PROTOCOLS).
 *
 * Returns 0 once the message has been handed to genlmsg_multicast() (whose
 * result is not checked), -ENOMEM if the message could not be allocated,
 * or -EMSGSIZE if it could not be built.
 */
int nfc_genl_tm_activated(struct nfc_dev *dev, u32 protocol)
{
        struct sk_buff *skb;
        void *head;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb)
                return -ENOMEM;

        head = genlmsg_put(skb, 0, 0, &nfc_genl_family, 0,
                           NFC_EVENT_TM_ACTIVATED);
        if (!head)
                goto err_free;

        if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
            nla_put_u32(skb, NFC_ATTR_TM_PROTOCOLS, protocol))
                goto err_free;

        genlmsg_end(skb, head);

        genlmsg_multicast(&nfc_genl_family, skb, 0, 0, GFP_KERNEL);

        return 0;

err_free:
        nlmsg_free(skb);
        return -EMSGSIZE;
}

/*
 * Multicast an NFC_EVENT_TM_DEACTIVATED message carrying @dev's index.
 *
 * Returns 0 once the message has been handed to genlmsg_multicast() (whose
 * result is not checked), -ENOMEM if the message could not be allocated,
 * or -EMSGSIZE if it could not be built.
 */
int nfc_genl_tm_deactivated(struct nfc_dev *dev)
{
        struct sk_buff *skb;
        void *head;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb)
                return -ENOMEM;

        head = genlmsg_put(skb, 0, 0, &nfc_genl_family, 0,
                           NFC_EVENT_TM_DEACTIVATED);
        if (!head)
                goto err_free;

        if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx))
                goto err_free;

        genlmsg_end(skb, head);

        genlmsg_multicast(&nfc_genl_family, skb, 0, 0, GFP_KERNEL);

        return 0;

err_free:
        nlmsg_free(skb);
        return -EMSGSIZE;
}

/*
 * Add @dev's name, index, supported protocols, powered state and RF mode
 * to @msg as attributes.
 *
 * Returns 0 on success and -1 as soon as one attribute cannot be added.
 */
static int nfc_genl_setup_device_added(struct nfc_dev *dev, struct sk_buff *msg)
{
        if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)))
                return -1;
        if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx))
                return -1;
        if (nla_put_u32(msg, NFC_ATTR_PROTOCOLS, dev->supported_protocols))
                return -1;
        if (nla_put_u8(msg, NFC_ATTR_DEVICE_POWERED, dev->dev_up))
                return -1;
        if (nla_put_u8(msg, NFC_ATTR_RF_MODE, dev->rf_mode))
                return -1;

        return 0;
}

/*
 * Multicast an NFC_EVENT_DEVICE_ADDED message filled in by
 * nfc_genl_setup_device_added().
 *
 * Returns 0 once the message has been handed to genlmsg_multicast() (whose
 * result is not checked), -ENOMEM if the message could not be allocated,
 * or -EMSGSIZE if it could not be built.
 */
int nfc_genl_device_added(struct nfc_dev *dev)
{
        struct sk_buff *skb;
        void *head;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb)
                return -ENOMEM;

        head = genlmsg_put(skb, 0, 0, &nfc_genl_family, 0,
                           NFC_EVENT_DEVICE_ADDED);
        if (!head)
                goto err_free;

        if (nfc_genl_setup_device_added(dev, skb))
                goto err_free;

        genlmsg_end(skb, head);

        genlmsg_multicast(&nfc_genl_family, skb, 0, 0, GFP_KERNEL);

        return 0;

err_free:
        nlmsg_free(skb);
        return -EMSGSIZE;
}

/*
 * Multicast an NFC_EVENT_DEVICE_REMOVED message carrying @dev's index.
 *
 * Returns 0 once the message has been handed to genlmsg_multicast() (whose
 * result is not checked), -ENOMEM if the message could not be allocated,
 * or -EMSGSIZE if it could not be built.
 */
int nfc_genl_device_removed(struct nfc_dev *dev)
{
        struct sk_buff *skb;
        void *head;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb)
                return -ENOMEM;

        head = genlmsg_put(skb, 0, 0, &nfc_genl_family, 0,
                           NFC_EVENT_DEVICE_REMOVED);
        if (!head)
                goto err_free;

        if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx))
                goto err_free;

        genlmsg_end(skb, head);

        genlmsg_multicast(&nfc_genl_family, skb, 0, 0, GFP_KERNEL);

        return 0;

err_free:
        nlmsg_free(skb);
        return -EMSGSIZE;
}

/*
 * Multicast an NFC_EVENT_LLC_SDRES message listing every entry of
 * @sdres_list as a nested (SAP, URI) pair under NFC_ATTR_LLC_SDP.
 *
 * The entries of @sdres_list are consumed on every path: each entry is
 * unlinked and freed once it has been serialised, and whatever is left on
 * the list is released with nfc_llcp_free_sdp_tlv_list() when the function
 * fails. That includes the message allocation failure, which previously
 * returned without releasing the list.
 *
 * Returns the result of genlmsg_multicast() on success, -ENOMEM when the
 * message or a nest cannot be started, and -EMSGSIZE when the header or an
 * attribute cannot be added.
 */
int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list)
{
        struct sk_buff *msg;
        struct nlattr *sdp_attr, *uri_attr;
        struct nfc_llcp_sdp_tlv *sdres;
        struct hlist_node *n;
        void *hdr;
        int rc = -EMSGSIZE;
        int i;

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg) {
                /* Still release the list so this path does not leak it. */
                rc = -ENOMEM;
                goto free_list;
        }

        hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
                          NFC_EVENT_LLC_SDRES);
        if (!hdr)
                goto free_msg;

        if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx))
                goto free_msg;

        sdp_attr = nla_nest_start_noflag(msg, NFC_ATTR_LLC_SDP);
        if (sdp_attr == NULL) {
                rc = -ENOMEM;
                goto free_msg;
        }

        /* Nested entries are numbered starting from 1. */
        i = 1;
        hlist_for_each_entry_safe(sdres, n, sdres_list, node) {
                pr_debug("uri: %s, sap: %d\n", sdres->uri, sdres->sap);

                uri_attr = nla_nest_start_noflag(msg, i++);
                if (uri_attr == NULL) {
                        rc = -ENOMEM;
                        goto free_msg;
                }

                if (nla_put_u8(msg, NFC_SDP_ATTR_SAP, sdres->sap))
                        goto free_msg;

                if (nla_put_string(msg, NFC_SDP_ATTR_URI, sdres->uri))
                        goto free_msg;

                nla_nest_end(msg, uri_attr);

                /* The entry is fully serialised; drop it from the list. */
                hlist_del(&sdres->node);

                nfc_llcp_free_sdp_tlv(sdres);
        }

        nla_nest_end(msg, sdp_attr);

        genlmsg_end(msg, hdr);

        return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);

free_msg:
        nlmsg_free(msg);
free_list:
        nfc_llcp_free_sdp_tlv_list(sdres_list);

        return rc;
}

/*
 * Multicast an NFC_EVENT_SE_ADDED message carrying the device index, the
 * secure element index @se_idx and the secure element type @type.
 *
 * Returns 0 once the message has been passed to genlmsg_multicast() (its
 * result is not checked), -ENOMEM when the message cannot be allocated and
 * -EMSGSIZE when the header or an attribute cannot be added.
 */
int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type)
{
        struct sk_buff *skb;
        void *genl_hdr;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb)
                return -ENOMEM;

        genl_hdr = genlmsg_put(skb, 0, 0, &nfc_genl_family, 0,
                               NFC_EVENT_SE_ADDED);
        if (!genl_hdr)
                goto drop;

        if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx))
                goto drop;
        if (nla_put_u32(skb, NFC_ATTR_SE_INDEX, se_idx))
                goto drop;
        if (nla_put_u8(skb, NFC_ATTR_SE_TYPE, type))
                goto drop;

        genlmsg_end(skb, genl_hdr);
        genlmsg_multicast(&nfc_genl_family, skb, 0, 0, GFP_KERNEL);

        return 0;

drop:
        nlmsg_free(skb);
        return -EMSGSIZE;
}

/*
 * Multicast an NFC_EVENT_SE_REMOVED message carrying the device index and
 * the secure element index @se_idx.
 *
 * Returns 0 once the message has been passed to genlmsg_multicast() (its
 * result is not checked), -ENOMEM when the message cannot be allocated and
 * -EMSGSIZE when the header or an attribute cannot be added.
 */
int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx)
{
        struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        void *genl_hdr;

        if (!skb)
                return -ENOMEM;

        genl_hdr = genlmsg_put(skb, 0, 0, &nfc_genl_family, 0,
                               NFC_EVENT_SE_REMOVED);
        if (!genl_hdr ||
            nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
            nla_put_u32(skb, NFC_ATTR_SE_INDEX, se_idx)) {
                nlmsg_free(skb);
                return -EMSGSIZE;
        }

        genlmsg_end(skb, genl_hdr);
        genlmsg_multicast(&nfc_genl_family, skb, 0, 0, GFP_KERNEL);

        return 0;
}

/*
 * Multicast an NFC_EVENT_SE_TRANSACTION message for secure element @se_idx
 * of @dev, carrying the AID and parameters held in @evt_transaction.
 *
 * @evt_transaction is released with devm_kfree() on every path, including
 * the message allocation failure, which previously returned without
 * releasing it. The caller must not touch it after this call.
 *
 * Returns 0 once the message has been passed to genlmsg_multicast() (its
 * result is not checked), -ENOMEM when the message cannot be allocated, and
 * -EMSGSIZE when the header or an attribute cannot be added. -EMSGSIZE is
 * also what is returned when nfc_find_se() finds no secure element.
 */
int nfc_genl_se_transaction(struct nfc_dev *dev, u8 se_idx,
                            struct nfc_evt_transaction *evt_transaction)
{
        struct nfc_se *se;
        struct sk_buff *msg;
        void *hdr;
        int rc = -EMSGSIZE;

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg) {
                /* Release evt_transaction here as on all other exits. */
                rc = -ENOMEM;
                goto free_evt;
        }

        hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
                          NFC_EVENT_SE_TRANSACTION);
        if (!hdr)
                goto free_msg;

        se = nfc_find_se(dev, se_idx);
        if (!se)
                goto free_msg;

        if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
            nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) ||
            nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type) ||
            nla_put(msg, NFC_ATTR_SE_AID, evt_transaction->aid_len,
                    evt_transaction->aid) ||
            nla_put(msg, NFC_ATTR_SE_PARAMS, evt_transaction->params_len,
                    evt_transaction->params))
                goto free_msg;

        /* Its contents were copied into msg; it is no longer needed. */
        devm_kfree(&dev->dev, evt_transaction);

        genlmsg_end(msg, hdr);

        genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);

        return 0;

free_msg:
        nlmsg_free(msg);
free_evt:
        devm_kfree(&dev->dev, evt_transaction);
        return rc;
}

/*
 * Multicast an NFC_EVENT_SE_CONNECTIVITY message for secure element
 * @se_idx of @dev, carrying the device index, the SE index and the SE type.
 *
 * Returns 0 once the message has been passed to genlmsg_multicast() (its
 * result is not checked), -ENOMEM when the message cannot be allocated, and
 * -EMSGSIZE when the header or an attribute cannot be added or when
 * nfc_find_se() finds no secure element.
 */
int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx)
{
        const struct nfc_se *se;
        struct sk_buff *skb;
        void *genl_hdr;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb)
                return -ENOMEM;

        genl_hdr = genlmsg_put(skb, 0, 0, &nfc_genl_family, 0,
                               NFC_EVENT_SE_CONNECTIVITY);
        if (!genl_hdr)
                goto drop;

        /* The SE is only looked up once the header is in place. */
        se = nfc_find_se(dev, se_idx);
        if (!se)
                goto drop;

        if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx))
                goto drop;
        if (nla_put_u32(skb, NFC_ATTR_SE_INDEX, se_idx))
                goto drop;
        if (nla_put_u8(skb, NFC_ATTR_SE_TYPE, se->type))
                goto drop;

        genlmsg_end(skb, genl_hdr);
        genlmsg_multicast(&nfc_genl_family, skb, 0, 0, GFP_KERNEL);

        return 0;

drop:
        nlmsg_free(skb);
        return -EMSGSIZE;
}

/*
 * Add one NFC_CMD_GET_DEVICE message describing @dev to @msg.
 *
 * @cb is optional: when it is non-NULL, genl_dump_check_consistent() is
 * called for the new header.
 *
 * Returns 0 on success. Returns -EMSGSIZE when the header cannot be added,
 * or when the device attributes cannot be added, in which case the
 * partially built message is cancelled.
 */
static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev,
                                u32 portid, u32 seq,
                                struct netlink_callback *cb,
                                int flags)
{
        void *genl_hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family,
                                     flags, NFC_CMD_GET_DEVICE);

        if (genl_hdr == NULL)
                return -EMSGSIZE;

        if (cb != NULL)
                genl_dump_check_consistent(cb, genl_hdr);

        if (nfc_genl_setup_device_added(dev, msg)) {
                genlmsg_cancel(msg, genl_hdr);
                return -EMSGSIZE;
        }

        genlmsg_end(msg, genl_hdr);

        return 0;
}

/*
 * Dump callback that emits one NFC_CMD_GET_DEVICE message per NFC device.
 *
 * State kept across invocations in @cb:
 *   cb->args[0] - the class_dev_iter, allocated on the first call
 *   cb->args[1] - the device to resume from on the next call
 *
 * Returns skb->len, or -ENOMEM when the iterator cannot be allocated.
 */
static int nfc_genl_dump_devices(struct sk_buff *skb,
                                 struct netlink_callback *cb)
{
        struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0];
        struct nfc_dev *dev = (struct nfc_dev *) cb->args[1];
        bool first_call = !iter;

        if (first_call) {
                iter = kmalloc(sizeof(*iter), GFP_KERNEL);
                if (!iter)
                        return -ENOMEM;
                cb->args[0] = (long) iter;
        }

        mutex_lock(&nfc_devlist_mutex);

        cb->seq = nfc_devlist_generation;

        if (first_call) {
                nfc_device_iter_init(iter);
                dev = nfc_device_iter_next(iter);
        }

        /* Stop at the first device that cannot be added; resume there. */
        for (; dev; dev = nfc_device_iter_next(iter)) {
                if (nfc_genl_send_device(skb, dev,
                                         NETLINK_CB(cb->skb).portid,
                                         cb->nlh->nlmsg_seq, cb,
                                         NLM_F_MULTI) < 0)
                        break;
        }

        mutex_unlock(&nfc_devlist_mutex);

        cb->args[1] = (long) dev;

        return skb->len;
}

/*
 * Release the class_dev_iter stored in cb->args[0], if there is one.
 * Always returns 0.
 */
static int nfc_genl_dump_devices_done(struct netlink_callback *cb)
{
        struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0];

        if (!iter)
                return 0;

        nfc_device_iter_exit(iter);
        kfree(iter);

        return 0;
}

/*
 * Multicast an NFC_CMD_DEP_LINK_UP message and mark @dev as having its DEP
 * link up.
 *
 * The target index attribute is only added when @rf_mode is
 * NFC_RF_INITIATOR. dev->dep_link_up is set only after the message has been
 * built successfully.
 *
 * Returns 0 once the message has been passed to genlmsg_multicast() (its
 * result is not checked), -ENOMEM when the message cannot be allocated and
 * -EMSGSIZE when the header or an attribute cannot be added.
 */
int nfc_genl_dep_link_up_event(struct nfc_dev *dev, u32 target_idx,
                               u8 comm_mode, u8 rf_mode)
{
        struct sk_buff *skb;
        void *genl_hdr;

        pr_debug("DEP link is up\n");

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
        if (!skb)
                return -ENOMEM;

        genl_hdr = genlmsg_put(skb, 0, 0, &nfc_genl_family, 0,
                               NFC_CMD_DEP_LINK_UP);
        if (!genl_hdr)
                goto drop;

        if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
            (rf_mode == NFC_RF_INITIATOR &&
             nla_put_u32(skb, NFC_ATTR_TARGET_INDEX, target_idx)) ||
            nla_put_u8(skb, NFC_ATTR_COMM_MODE, comm_mode) ||
            nla_put_u8(skb, NFC_ATTR_RF_MODE, rf_mode))
                goto drop;

        genlmsg_end(skb, genl_hdr);

        dev->dep_link_up = true;

        genlmsg_multicast(&nfc_genl_family, skb, 0, 0, GFP_ATOMIC);

        return 0;

drop:
        nlmsg_free(skb);
        return -EMSGSIZE;
}

int nfc_genl_dep_link_down_event(struct nfc_dev *dev)
{
        struct sk_buff *msg;
        void *hdr;

        pr_debug("DEP link is down\n");

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
        if (!msg)
                return -ENOMEM;

        hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
                          NFC_CMD_DEP_LINK_DOWN);
        if (!hdr)
                goto free_msg;

        if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx))
                goto nla_put_failure;

        genlmsg_end(msg, hdr);

        genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);

        return 0;

nla_put_failure:
free_msg:
        nlmsg_free(msg);
        return -EMSGSIZE;
}

/*
 * Handler that replies with a description of the device selected by
 * NFC_ATTR_DEVICE_INDEX.
 *
 * Returns -EINVAL when the index attribute is missing, -ENODEV when
 * nfc_get_device() finds no device, -ENOMEM when the reply cannot be
 * allocated, the error from nfc_genl_send_device() when the reply cannot be
 * built, and otherwise the result of genlmsg_reply().
 */
static int nfc_genl_get_device(struct sk_buff *skb, struct genl_info *info)
{
        struct nfc_dev *dev;
        struct sk_buff *reply;
        int rc;

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
                return -EINVAL;

        dev = nfc_get_device(nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]));
        if (!dev)
                return -ENODEV;

        reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!reply) {
                nfc_put_device(dev);
                return -ENOMEM;
        }

        rc = nfc_genl_send_device(reply, dev, info->snd_portid,
                                  info->snd_seq, NULL, 0);

        /* The device reference is dropped on both outcomes. */
        nfc_put_device(dev);

        if (rc < 0) {
                nlmsg_free(reply);
                return rc;
        }

        return genlmsg_reply(reply, info);
}

static int nfc_genl_dev_up(struct sk_buff *skb, struct genl_info *info)
{
        struct nfc_dev *dev;
        int rc;
        u32 idx;

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
                return -EINVAL;

        idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);

        dev = nfc_get_device(idx);
        if (!dev)
                return -ENODEV;

        rc = nfc_dev_up(dev);

        nfc_put_device(dev);
        return rc;
}

static int nfc_genl_dev_down(struct sk_buff *skb, struct genl_info *info)
{
        struct nfc_dev *dev;
        int rc;
        u32 idx;

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
                return -EINVAL;

        idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);

        dev = nfc_get_device(idx);
        if (!dev)
                return -ENODEV;

        rc = nfc_dev_down(dev);

        nfc_put_device(dev);
        return rc;
}

static int nfc_genl_start_poll(struct sk_buff *skb, struct genl_info *info)
{
        struct nfc_dev *dev;
        int rc;
        u32 idx;
        u32 im_protocols = 0, tm_protocols = 0;

        pr_debug("Poll start\n");

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
            ((!info->attrs[NFC_ATTR_IM_PROTOCOLS] &&
              !info->attrs[NFC_ATTR_PROTOCOLS]) &&
              !info->attrs[NFC_ATTR_TM_PROTOCOLS]))
                return -EINVAL;

        idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);

        if (info->attrs[NFC_ATTR_TM_PROTOCOLS])
                tm_protocols = nla_get_u32(info->attrs[NFC_ATTR_TM_PROTOCOLS]);

        if (info->attrs[NFC_ATTR_IM_PROTOCOLS])
                im_protocols = nla_get_u32(info->attrs[NFC_ATTR_IM_PROTOCOLS]);
        else if (info->attrs[NFC_ATTR_PROTOCOLS])
                im_protocols = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]);

        dev = nfc_get_device(idx);
        if (!dev)
                return -ENODEV;

        mutex_lock(&dev->genl_data.genl_data_mutex);

        rc = nfc_start_poll(dev, im_protocols, tm_protocols);
        if (!rc)
                dev->genl_data.poll_req_portid = info->snd_portid;

        mutex_unlock(&dev->genl_data.genl_data_mutex);

        nfc_put_device(dev);
        return rc;
}

static int nfc_genl_stop_poll(struct sk_buff *skb, struct genl_info *info)
{
        struct nfc_dev *dev;
        int rc;
        u32 idx;

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
                return -EINVAL;

        idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);

        dev = nfc_get_device(idx);
        if (!dev)
                return -ENODEV;

        device_lock(&dev->dev);

        if (!dev->polling) {
                device_unlock(&dev->dev);
                nfc_put_device(dev);
                return -EINVAL;
        }

        device_unlock(&dev->dev);

        mutex_lock(&dev->genl_data.genl_data_mutex);

        if (dev->genl_data.poll_req_portid != info->snd_portid) {
                rc = -EBUSY;
                goto out;
        }

        rc = nfc_stop_poll(dev);
        dev->genl_data.poll_req_portid = 0;

out:
        mutex_unlock(&dev->genl_data.genl_data_mutex);
        nfc_put_device(dev);
        return rc;
}

static int nfc_genl_activate_target(struct sk_buff *skb, struct genl_info *info)
{
        struct nfc_dev *dev;
        u32 device_idx, target_idx, protocol;
        int rc;

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
            !info->attrs[NFC_ATTR_TARGET_INDEX] ||
            !info->attrs[NFC_ATTR_PROTOCOLS])
                return -EINVAL;

        device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);

        dev = nfc_get_device(device_idx);
        if (!dev)
                return -ENODEV;

        target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]);
        protocol = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]);

        nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP);
        rc = nfc_activate_target(dev, target_idx, protocol);

        nfc_put_device(dev);
        return rc;
}

static int nfc_genl_deactivate_target(struct sk_buff *skb,
                                      struct genl_info *info)
{
        struct nfc_dev *dev;
        u32 device_idx, target_idx;
        int rc;

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
            !info->attrs[NFC_ATTR_TARGET_INDEX])
                return -EINVAL;

        device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);

        dev = nfc_get_device(device_idx);
        if (!dev)
                return -ENODEV;

        target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]);

        rc = nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP);

        nfc_put_device(dev);
        return rc;
}

static int nfc_genl_dep_link_up(struct sk_buff *skb, struct genl_info *info)
{
        struct nfc_dev *dev;
        int rc, tgt_idx;
        u32 idx;
        u8 comm;

        pr_debug("DEP link up\n");

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
            !info->attrs[NFC_ATTR_COMM_MODE])
                return -EINVAL;

        idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
        if (!info->attrs[NFC_ATTR_TARGET_INDEX])
                tgt_idx = NFC_TARGET_IDX_ANY;
        else
                tgt_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]);

        comm = nla_get_u8(info->attrs[NFC_ATTR_COMM_MODE]);

        if (comm != NFC_COMM_ACTIVE && comm != NFC_COMM_PASSIVE)
                return -EINVAL;

        dev = nfc_get_device(idx);
        if (!dev)
                return -ENODEV;

        rc = nfc_dep_link_up(dev, tgt_idx, comm);

        nfc_put_device(dev);

        return rc;
}

static int nfc_genl_dep_link_down(struct sk_buff *skb, struct genl_info *info)
{
        struct nfc_dev *dev;
        int rc;
        u32 idx;

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
                return -EINVAL;

        idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);

        dev = nfc_get_device(idx);
        if (!dev)
                return -ENODEV;

        rc = nfc_dep_link_down(dev);

        nfc_put_device(dev);
        return rc;
}

/*
 * Add one NFC_CMD_LLC_GET_PARAMS message to @msg, carrying the device index
 * and the lto, rw and miux fields of @local. miux is converted with
 * be16_to_cpu() before being added.
 *
 * Returns 0 on success. Returns -EMSGSIZE when the header cannot be added,
 * or when an attribute cannot be added, in which case the partially built
 * message is cancelled.
 */
static int nfc_genl_send_params(struct sk_buff *msg,
                                struct nfc_llcp_local *local,
                                u32 portid, u32 seq)
{
        void *genl_hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, 0,
                                     NFC_CMD_LLC_GET_PARAMS);

        if (genl_hdr == NULL)
                return -EMSGSIZE;

        if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, local->dev->idx) ||
            nla_put_u8(msg, NFC_ATTR_LLC_PARAM_LTO, local->lto) ||
            nla_put_u8(msg, NFC_ATTR_LLC_PARAM_RW, local->rw) ||
            nla_put_u16(msg, NFC_ATTR_LLC_PARAM_MIUX,
                        be16_to_cpu(local->miux))) {
                genlmsg_cancel(msg, genl_hdr);
                return -EMSGSIZE;
        }

        genlmsg_end(msg, genl_hdr);

        return 0;
}

/*
 * Handler that replies with the LLCP parameters of the device selected by
 * NFC_ATTR_DEVICE_INDEX.
 *
 * The LLCP local is looked up and the reply is built with the device lock
 * held. The reply is sent after the lock and the device reference have
 * been released.
 *
 * Returns -EINVAL when the index attribute is missing, -ENODEV when the
 * device or its LLCP local is not found, -ENOMEM when the reply cannot be
 * allocated, the error from nfc_genl_send_params() when the reply cannot be
 * built, and otherwise the result of genlmsg_reply().
 */
static int nfc_genl_llc_get_params(struct sk_buff *skb, struct genl_info *info)
{
        struct nfc_llcp_local *local;
        struct sk_buff *reply = NULL;
        struct nfc_dev *dev;
        int rc;

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
                return -EINVAL;

        dev = nfc_get_device(nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]));
        if (dev == NULL)
                return -ENODEV;

        device_lock(&dev->dev);

        local = nfc_llcp_find_local(dev);
        if (local) {
                reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
                if (reply)
                        rc = nfc_genl_send_params(reply, local,
                                                  info->snd_portid,
                                                  info->snd_seq);
                else
                        rc = -ENOMEM;

                nfc_llcp_local_put(local);
        } else {
                rc = -ENODEV;
        }

        device_unlock(&dev->dev);

        nfc_put_device(dev);

        if (rc >= 0)
                return genlmsg_reply(reply, info);

        if (reply)
                nlmsg_free(reply);

        return rc;
}

static int nfc_genl_llc_set_params(struct sk_buff *skb, struct genl_info *info)
{
        struct nfc_dev *dev;
        struct nfc_llcp_local *local;
        u8 rw = 0;
        u16 miux = 0;
        u32 idx;
        int rc = 0;

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
            (!info->attrs[NFC_ATTR_LLC_PARAM_LTO] &&
             !info->attrs[NFC_ATTR_LLC_PARAM_RW] &&
             !info->attrs[NFC_ATTR_LLC_PARAM_MIUX]))
                return -EINVAL;

        if (info->attrs[NFC_ATTR_LLC_PARAM_RW]) {
                rw = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_RW]);

                if (rw > LLCP_MAX_RW)
                        return -EINVAL;
        }

        if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX]) {
                miux = nla_get_u16(info->attrs[NFC_ATTR_LLC_PARAM_MIUX]);

                if (miux > LLCP_MAX_MIUX)
                        return -EINVAL;
        }

        idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);

        dev = nfc_get_device(idx);
        if (!dev)
                return -ENODEV;

        device_lock(&dev->dev);

        local = nfc_llcp_find_local(dev);
        if (!local) {
                rc = -ENODEV;
                goto exit;
        }

        if (info->attrs[NFC_ATTR_LLC_PARAM_LTO]) {
                if (dev->dep_link_up) {
                        rc = -EINPROGRESS;
                        goto put_local;
                }

                local->lto = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_LTO]);
        }

        if (info->attrs[NFC_ATTR_LLC_PARAM_RW])
                local->rw = rw;

        if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX])
                local->miux = cpu_to_be16(miux);

put_local:
        nfc_llcp_local_put(local);

exit:
        device_unlock(&dev->dev);

        nfc_put_device(dev);

        return rc;
}

/*
 * Handler that builds one service discovery request TLV for every nested
 * entry of NFC_ATTR_LLC_SDP that carries a non-empty URI, then passes the
 * resulting list to nfc_llcp_send_snl_sdreq().
 *
 * Entries without a URI attribute, with a zero-length URI, or whose URI
 * starts with a NUL byte are skipped.
 *
 * TLVs already queued on the local list are released with
 * nfc_llcp_free_sdp_tlv_list() when a later entry fails to parse or a later
 * TLV cannot be built. Previously those paths dropped the on-stack list
 * head and leaked the queued TLVs.
 *
 * Returns -EINVAL when a mandatory attribute is missing, a nested entry
 * fails to parse or no usable URI was supplied, -ENODEV when the device or
 * its LLCP local is not found, -ENOLINK when the DEP link is down, -ENOMEM
 * when a TLV cannot be built, otherwise the result of
 * nfc_llcp_send_snl_sdreq().
 */
static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info)
{
        struct nfc_dev *dev;
        struct nfc_llcp_local *local;
        struct nlattr *attr, *sdp_attrs[NFC_SDP_ATTR_MAX+1];
        u32 idx;
        u8 tid;
        char *uri;
        int rc = 0, rem;
        size_t uri_len, tlvs_len;
        struct hlist_head sdreq_list;
        struct nfc_llcp_sdp_tlv *sdreq;

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
            !info->attrs[NFC_ATTR_LLC_SDP])
                return -EINVAL;

        idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);

        dev = nfc_get_device(idx);
        if (!dev)
                return -ENODEV;

        device_lock(&dev->dev);

        if (dev->dep_link_up == false) {
                rc = -ENOLINK;
                goto exit;
        }

        local = nfc_llcp_find_local(dev);
        if (!local) {
                rc = -ENODEV;
                goto exit;
        }

        INIT_HLIST_HEAD(&sdreq_list);

        tlvs_len = 0;

        nla_for_each_nested(attr, info->attrs[NFC_ATTR_LLC_SDP], rem) {
                rc = nla_parse_nested_deprecated(sdp_attrs, NFC_SDP_ATTR_MAX,
                                                 attr, nfc_sdp_genl_policy,
                                                 info->extack);

                if (rc != 0) {
                        /* Drop the TLVs queued by earlier iterations. */
                        nfc_llcp_free_sdp_tlv_list(&sdreq_list);
                        rc = -EINVAL;
                        goto put_local;
                }

                if (!sdp_attrs[NFC_SDP_ATTR_URI])
                        continue;

                uri_len = nla_len(sdp_attrs[NFC_SDP_ATTR_URI]);
                if (uri_len == 0)
                        continue;

                uri = nla_data(sdp_attrs[NFC_SDP_ATTR_URI]);
                if (uri == NULL || *uri == 0)
                        continue;

                tid = local->sdreq_next_tid++;

                sdreq = nfc_llcp_build_sdreq_tlv(tid, uri, uri_len);
                if (sdreq == NULL) {
                        /* Drop the TLVs queued by earlier iterations. */
                        nfc_llcp_free_sdp_tlv_list(&sdreq_list);
                        rc = -ENOMEM;
                        goto put_local;
                }

                tlvs_len += sdreq->tlv_len;

                hlist_add_head(&sdreq->node, &sdreq_list);
        }

        if (hlist_empty(&sdreq_list)) {
                rc = -EINVAL;
                goto put_local;
        }

        /*
         * NOTE(review): whether nfc_llcp_send_snl_sdreq() releases the list
         * when it fails is not visible from here - confirm before adding a
         * cleanup on that path.
         */
        rc = nfc_llcp_send_snl_sdreq(local, &sdreq_list, tlvs_len);

put_local:
        nfc_llcp_local_put(local);

exit:
        device_unlock(&dev->dev);

        nfc_put_device(dev);

        return rc;
}

static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info)
{
        struct nfc_dev *dev;
        int rc;
        u32 idx;
        char firmware_name[NFC_FIRMWARE_NAME_MAXSIZE + 1];

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_FIRMWARE_NAME])
                return -EINVAL;

        idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);

        dev = nfc_get_device(idx);
        if (!dev)
                return -ENODEV;

        nla_strscpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME],
                    sizeof(firmware_name));

        rc = nfc_fw_download(dev, firmware_name);

        nfc_put_device(dev);
        return rc;
}

/*
 * Multicast an NFC_CMD_FW_DOWNLOAD message carrying @firmware_name, the
 * download status @result and the index of @dev.
 *
 * Returns 0 once the message has been passed to genlmsg_multicast() (its
 * result is not checked), -ENOMEM when the message cannot be allocated and
 * -EMSGSIZE when the header or an attribute cannot be added.
 */
int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
                              u32 result)
{
        struct sk_buff *skb;
        void *genl_hdr;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
        if (!skb)
                return -ENOMEM;

        genl_hdr = genlmsg_put(skb, 0, 0, &nfc_genl_family, 0,
                               NFC_CMD_FW_DOWNLOAD);
        if (!genl_hdr)
                goto drop;

        if (nla_put_string(skb, NFC_ATTR_FIRMWARE_NAME, firmware_name))
                goto drop;
        if (nla_put_u32(skb, NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS, result))
                goto drop;
        if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx))
                goto drop;

        genlmsg_end(skb, genl_hdr);
        genlmsg_multicast(&nfc_genl_family, skb, 0, 0, GFP_ATOMIC);

        return 0;

drop:
        nlmsg_free(skb);
        return -EMSGSIZE;
}

static int nfc_genl_enable_se(struct sk_buff *skb, struct genl_info *info)
{
        struct nfc_dev *dev;
        int rc;
        u32 idx, se_idx;

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
            !info->attrs[NFC_ATTR_SE_INDEX])
                return -EINVAL;

        idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
        se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]);

        dev = nfc_get_device(idx);
        if (!dev)
                return -ENODEV;

        rc = nfc_enable_se(dev, se_idx);

        nfc_put_device(dev);
        return rc;
}

static int nfc_genl_disable_se(struct sk_buff *skb, struct genl_info *info)
{
        struct nfc_dev *dev;
        int rc;
        u32 idx, se_idx;

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
            !info->attrs[NFC_ATTR_SE_INDEX])
                return -EINVAL;

        idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
        se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]);

        dev = nfc_get_device(idx);
        if (!dev)
                return -ENODEV;

        rc = nfc_disable_se(dev, se_idx);

        nfc_put_device(dev);
        return rc;
}

/*
 * Add one NFC_CMD_GET_SE message to @msg for every entry of
 * dev->secure_elements, each carrying the device index, the SE index and
 * the SE type.
 *
 * @cb is optional: when it is non-NULL, genl_dump_check_consistent() is
 * called for every new header.
 *
 * Returns 0 when all secure elements were added (including when the list is
 * empty). Returns -EMSGSIZE at the first header or attribute that cannot be
 * added, after calling genlmsg_cancel() for that message.
 */
static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev,
                                u32 portid, u32 seq,
                                struct netlink_callback *cb,
                                int flags)
{
        struct nfc_se *se, *tmp;
        void *genl_hdr;

        list_for_each_entry_safe(se, tmp, &dev->secure_elements, list) {
                genl_hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family,
                                       flags, NFC_CMD_GET_SE);

                if (genl_hdr && cb)
                        genl_dump_check_consistent(cb, genl_hdr);

                if (!genl_hdr ||
                    nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
                    nla_put_u32(msg, NFC_ATTR_SE_INDEX, se->idx) ||
                    nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) {
                        genlmsg_cancel(msg, genl_hdr);
                        return -EMSGSIZE;
                }

                genlmsg_end(msg, genl_hdr);
        }

        return 0;
}

/*
 * Dump callback that emits the secure elements of every NFC device.
 *
 * State kept across invocations in @cb:
 *   cb->args[0] - the class_dev_iter, allocated on the first call
 *   cb->args[1] - the device to resume from on the next call
 *
 * Returns skb->len, or -ENOMEM when the iterator cannot be allocated.
 */
static int nfc_genl_dump_ses(struct sk_buff *skb,
                                 struct netlink_callback *cb)
{
        struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0];
        struct nfc_dev *dev = (struct nfc_dev *) cb->args[1];
        bool first_call = !iter;

        if (first_call) {
                iter = kmalloc(sizeof(*iter), GFP_KERNEL);
                if (!iter)
                        return -ENOMEM;
                cb->args[0] = (long) iter;
        }

        mutex_lock(&nfc_devlist_mutex);

        cb->seq = nfc_devlist_generation;

        if (first_call) {
                nfc_device_iter_init(iter);
                dev = nfc_device_iter_next(iter);
        }

        /* Stop at the first device that cannot be added; resume there. */
        for (; dev; dev = nfc_device_iter_next(iter)) {
                if (nfc_genl_send_se(skb, dev, NETLINK_CB(cb->skb).portid,
                                     cb->nlh->nlmsg_seq, cb,
                                     NLM_F_MULTI) < 0)
                        break;
        }

        mutex_unlock(&nfc_devlist_mutex);

        cb->args[1] = (long) dev;

        return skb->len;
}

/*
 * Release the class_dev_iter stored in cb->args[0], if there is one.
 * Always returns 0.
 */
static int nfc_genl_dump_ses_done(struct netlink_callback *cb)
{
        struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0];

        if (!iter)
                return 0;

        nfc_device_iter_exit(iter);
        kfree(iter);

        return 0;
}

/*
 * Pass an APDU to the driver's se_io operation for secure element @se_idx,
 * with the device lock held.
 *
 * When a precondition fails and the driver is never called, @cb_context is
 * kfree()d here. Once dev->ops->se_io() has been called, @cb_context is not
 * freed by this function, whatever the driver returned.
 *
 * Returns -ENODEV when the device is not registered, is not up, or the
 * secure element is not in the NFC_SE_ENABLED state, -EOPNOTSUPP when the
 * driver has no se_io operation, -EINVAL when nfc_find_se() finds no secure
 * element, otherwise the result of dev->ops->se_io().
 */
static int nfc_se_io(struct nfc_dev *dev, u32 se_idx,
                     u8 *apdu, size_t apdu_length,
                     se_io_cb_t cb, void *cb_context)
{
        bool dispatched = false;
        struct nfc_se *se;
        int rc;

        pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx);

        device_lock(&dev->dev);

        if (!device_is_registered(&dev->dev)) {
                rc = -ENODEV;
        } else if (!dev->dev_up) {
                rc = -ENODEV;
        } else if (!dev->ops->se_io) {
                rc = -EOPNOTSUPP;
        } else {
                se = nfc_find_se(dev, se_idx);
                if (!se) {
                        rc = -EINVAL;
                } else if (se->state != NFC_SE_ENABLED) {
                        rc = -ENODEV;
                } else {
                        rc = dev->ops->se_io(dev, se_idx, apdu,
                                             apdu_length, cb, cb_context);
                        dispatched = true;
                }
        }

        device_unlock(&dev->dev);

        /* The context is only released here if the driver never saw it. */
        if (!dispatched)
                kfree(cb_context);

        return rc;
}

/*
 * Context passed through an SE I/O request to se_io_cb(), which copies both
 * fields into the NFC_CMD_SE_IO message it multicasts.
 */
struct se_io_ctx {
        u32 dev_idx;    /* sent as NFC_ATTR_DEVICE_INDEX */
        u32 se_idx;     /* sent as NFC_ATTR_SE_INDEX */
};

/*
 * Completion callback for an SE I/O request: multicast an NFC_CMD_SE_IO
 * message carrying the device index and SE index from @context together
 * with the @apdu bytes, then free @context.
 *
 * @context is freed on every path. @err is not used. Failures to allocate
 * or build the message are silent.
 */
static void se_io_cb(void *context, u8 *apdu, size_t apdu_len, int err)
{
        struct se_io_ctx *ctx = context;
        struct sk_buff *skb;
        void *genl_hdr;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb)
                goto free_ctx;

        genl_hdr = genlmsg_put(skb, 0, 0, &nfc_genl_family, 0,
                               NFC_CMD_SE_IO);
        if (!genl_hdr ||
            nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, ctx->dev_idx) ||
            nla_put_u32(skb, NFC_ATTR_SE_INDEX, ctx->se_idx) ||
            nla_put(skb, NFC_ATTR_SE_APDU, apdu_len, apdu)) {
                nlmsg_free(skb);
                goto free_ctx;
        }

        genlmsg_end(skb, genl_hdr);
        genlmsg_multicast(&nfc_genl_family, skb, 0, 0, GFP_KERNEL);

free_ctx:
        kfree(ctx);
}

static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info)
{
        struct nfc_dev *dev;
        struct se_io_ctx *ctx;
        u32 dev_idx, se_idx;
        u8 *apdu;
        size_t apdu_len;
        int rc;

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
            !info->attrs[NFC_ATTR_SE_INDEX] ||
            !info->attrs[NFC_ATTR_SE_APDU])
                return -EINVAL;

        dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
        se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]);

        dev = nfc_get_device(dev_idx);
        if (!dev)
                return -ENODEV;

        if (!dev->ops || !dev->ops->se_io) {
                rc = -EOPNOTSUPP;
                goto put_dev;
        }

        apdu_len = nla_len(info->attrs[NFC_ATTR_SE_APDU]);
        if (apdu_len == 0) {
                rc = -EINVAL;
                goto put_dev;
        }

        apdu = nla_data(info->attrs[NFC_ATTR_SE_APDU]);
        if (!apdu) {
                rc = -EINVAL;
                goto put_dev;
        }

        ctx = kzalloc(sizeof(struct se_io_ctx), GFP_KERNEL);
        if (!ctx) {
                rc = -ENOMEM;
                goto put_dev;
        }

        ctx->dev_idx = dev_idx;
        ctx->se_idx = se_idx;

        rc = nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx);

put_dev:
        nfc_put_device(dev);
        return rc;
}

/*
 * nfc_genl_vendor_cmd() - handler for NFC_CMD_VENDOR requests
 * @skb: request buffer (not used by this handler)
 * @info: generic netlink request; NFC_ATTR_DEVICE_INDEX, NFC_ATTR_VENDOR_ID
 *        and NFC_ATTR_VENDOR_SUBCMD are mandatory, NFC_ATTR_VENDOR_DATA is
 *        optional but must not be empty when present
 *
 * Searches the device's vendor command table for the first entry matching
 * both the vendor id and the sub-command and runs its ->doit handler.
 * dev->cur_cmd_info points at @info only for the duration of that call.
 *
 * Return: -EINVAL for a missing mandatory attribute or an empty data
 * attribute, -ENODEV when the device is not found or has no vendor
 * commands, -EOPNOTSUPP when no table entry matches, otherwise the value
 * returned by the matching ->doit handler.
 */
static int nfc_genl_vendor_cmd(struct sk_buff *skb,
                               struct genl_info *info)
{
        const struct nfc_vendor_cmd *cmd;
        struct nfc_dev *dev;
        u32 dev_idx, vid, subcmd;
        u8 *data = NULL;
        size_t data_len = 0;
        int err, i;

        if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
            !info->attrs[NFC_ATTR_VENDOR_ID] ||
            !info->attrs[NFC_ATTR_VENDOR_SUBCMD])
                return -EINVAL;

        dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
        vid = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_ID]);
        subcmd = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_SUBCMD]);

        dev = nfc_get_device(dev_idx);
        if (!dev)
                return -ENODEV;

        if (!dev->vendor_cmds || !dev->n_vendor_cmds) {
                err = -ENODEV;
                goto put_dev;
        }

        /* Without a payload attribute the handler gets (NULL, 0). */
        if (info->attrs[NFC_ATTR_VENDOR_DATA]) {
                data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]);
                data_len = nla_len(info->attrs[NFC_ATTR_VENDOR_DATA]);
                if (data_len == 0) {
                        err = -EINVAL;
                        goto put_dev;
                }
        }

        /* Stays -EOPNOTSUPP unless a matching table entry is found. */
        err = -EOPNOTSUPP;

        for (i = 0; i < dev->n_vendor_cmds; i++) {
                cmd = &dev->vendor_cmds[i];

                if (cmd->vendor_id == vid && cmd->subcmd == subcmd) {
                        /* Publish the request while the handler runs. */
                        dev->cur_cmd_info = info;
                        err = cmd->doit(dev, data, data_len);
                        dev->cur_cmd_info = NULL;
                        break;
                }
        }

put_dev:
        nfc_put_device(dev);
        return err;
}

/*
 * Message building helper: the NFC family has no private header, so only
 * the generic netlink header for nfc_genl_family is added to @skb.
 * Returns whatever genlmsg_put() returns.
 */
static inline void *nfc_hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
                                int flags, u8 cmd)
{
        void *hdr;

        hdr = genlmsg_put(skb, portid, seq, &nfc_genl_family, flags, cmd);

        return hdr;
}

/*
 * Allocate and pre-fill an NFC_CMD_VENDOR message.
 *
 * The message is sized for @approxlen plus 100 extra bytes and receives the
 * generic netlink header followed by the NFC_ATTR_DEVICE_INDEX,
 * NFC_ATTR_VENDOR_ID (@oui) and NFC_ATTR_VENDOR_SUBCMD (@subcmd)
 * attributes. @dev and the header pointer are parked in the first two
 * pointer slots of skb->cb, where nfc_vendor_cmd_reply() reads them back.
 * @attr is not used by this function.
 *
 * Returns the new skb, or NULL if the allocation or any put fails (the skb
 * is freed in that case).
 */
static struct sk_buff *
__nfc_alloc_vendor_cmd_skb(struct nfc_dev *dev, int approxlen,
                           u32 portid, u32 seq,
                           enum nfc_attrs attr,
                           u32 oui, u32 subcmd, gfp_t gfp)
{
        struct sk_buff *msg;
        void **cb;
        void *hdr;

        msg = nlmsg_new(approxlen + 100, gfp);
        if (!msg)
                return NULL;

        hdr = nfc_hdr_put(msg, portid, seq, 0, NFC_CMD_VENDOR);
        if (!hdr)
                goto free_msg;

        if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
            nla_put_u32(msg, NFC_ATTR_VENDOR_ID, oui) ||
            nla_put_u32(msg, NFC_ATTR_VENDOR_SUBCMD, subcmd))
                goto free_msg;

        /* Stash what the reply path needs to finish and send the message. */
        cb = (void **)msg->cb;
        cb[0] = dev;
        cb[1] = hdr;

        return msg;

free_msg:
        kfree_skb(msg);
        return NULL;
}

/*
 * Allocate a reply skb for the vendor command currently being processed.
 *
 * The reply is addressed with the port id and sequence number of
 * dev->cur_cmd_info, so this only works while a vendor command handler is
 * running; otherwise a warning is raised and NULL is returned. The
 * allocation uses GFP_KERNEL.
 *
 * Returns the pre-filled skb from __nfc_alloc_vendor_cmd_skb(), or NULL.
 */
struct sk_buff *__nfc_alloc_vendor_cmd_reply_skb(struct nfc_dev *dev,
                                                 enum nfc_attrs attr,
                                                 u32 oui, u32 subcmd,
                                                 int approxlen)
{
        struct genl_info *req = dev->cur_cmd_info;

        if (WARN_ON(!req))
                return NULL;

        return __nfc_alloc_vendor_cmd_skb(dev, approxlen, req->snd_portid,
                                          req->snd_seq, attr, oui, subcmd,
                                          GFP_KERNEL);
}
EXPORT_SYMBOL(__nfc_alloc_vendor_cmd_reply_skb);

/*
 * Finalize and send a vendor command reply.
 *
 * @skb must carry the device and header pointers in the first two pointer
 * slots of skb->cb, as stored by __nfc_alloc_vendor_cmd_skb(). If no
 * command is in flight on the device (dev->cur_cmd_info is NULL) a warning
 * is raised, the skb is freed and -EINVAL is returned; otherwise the
 * message is ended and the result of genlmsg_reply() is returned.
 */
int nfc_vendor_cmd_reply(struct sk_buff *skb)
{
        void **cb = (void **)skb->cb;
        struct nfc_dev *dev = cb[0];
        void *hdr = cb[1];

        /* clear CB data for netlink core to own from now on */
        memset(skb->cb, 0, sizeof(skb->cb));

        if (!WARN_ON(!dev->cur_cmd_info)) {
                genlmsg_end(skb, hdr);
                return genlmsg_reply(skb, dev->cur_cmd_info);
        }

        kfree_skb(skb);
        return -EINVAL;
}
EXPORT_SYMBOL(nfc_vendor_cmd_reply);

/*
 * Operation table of the NFC generic netlink family (referenced by
 * nfc_genl_family.ops).
 *
 * Every entry sets GENL_DONT_VALIDATE_STRICT together with
 * GENL_DONT_VALIDATE_DUMP (NFC_CMD_GET_TARGET uses
 * GENL_DONT_VALIDATE_DUMP_STRICT instead). All commands except
 * NFC_CMD_GET_DEVICE, NFC_CMD_GET_TARGET, NFC_CMD_LLC_GET_PARAMS and
 * NFC_CMD_GET_SE are flagged GENL_ADMIN_PERM.
 */
static const struct genl_ops nfc_genl_ops[] = {
        {
                .cmd = NFC_CMD_GET_DEVICE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_get_device,
                .dumpit = nfc_genl_dump_devices,
                .done = nfc_genl_dump_devices_done,
        },
        {
                .cmd = NFC_CMD_DEV_UP,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_dev_up,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = NFC_CMD_DEV_DOWN,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_dev_down,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = NFC_CMD_START_POLL,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_start_poll,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = NFC_CMD_STOP_POLL,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_stop_poll,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = NFC_CMD_DEP_LINK_UP,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_dep_link_up,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = NFC_CMD_DEP_LINK_DOWN,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_dep_link_down,
                .flags = GENL_ADMIN_PERM,
        },
        {
                /* dump-only command: no .doit handler */
                .cmd = NFC_CMD_GET_TARGET,
                .validate = GENL_DONT_VALIDATE_STRICT |
                            GENL_DONT_VALIDATE_DUMP_STRICT,
                .dumpit = nfc_genl_dump_targets,
                .done = nfc_genl_dump_targets_done,
        },
        {
                .cmd = NFC_CMD_LLC_GET_PARAMS,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_llc_get_params,
        },
        {
                .cmd = NFC_CMD_LLC_SET_PARAMS,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_llc_set_params,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = NFC_CMD_LLC_SDREQ,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_llc_sdreq,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = NFC_CMD_FW_DOWNLOAD,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_fw_download,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = NFC_CMD_ENABLE_SE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_enable_se,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = NFC_CMD_DISABLE_SE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_disable_se,
                .flags = GENL_ADMIN_PERM,
        },
        {
                /* dump-only command: no .doit handler */
                .cmd = NFC_CMD_GET_SE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .dumpit = nfc_genl_dump_ses,
                .done = nfc_genl_dump_ses_done,
        },
        {
                .cmd = NFC_CMD_SE_IO,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_se_io,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = NFC_CMD_ACTIVATE_TARGET,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_activate_target,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = NFC_CMD_VENDOR,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_vendor_cmd,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = NFC_CMD_DEACTIVATE_TARGET,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nfc_genl_deactivate_target,
                .flags = GENL_ADMIN_PERM,
        },
};

/*
 * The NFC generic netlink family: ties together the family name/version,
 * the attribute policy, the operation table and the multicast groups.
 * There is no family-private header (hdrsize is 0), and resv_start_op is
 * set to the command following NFC_CMD_DEACTIVATE_TARGET.
 */
static struct genl_family nfc_genl_family __ro_after_init = {
        .hdrsize = 0,
        .name = NFC_GENL_NAME,
        .version = NFC_GENL_VERSION,
        .maxattr = NFC_ATTR_MAX,
        .policy = nfc_genl_policy,
        .module = THIS_MODULE,
        .ops = nfc_genl_ops,
        .n_ops = ARRAY_SIZE(nfc_genl_ops),
        .resv_start_op = NFC_CMD_DEACTIVATE_TARGET + 1,
        .mcgrps = nfc_genl_mcgrps,
        .n_mcgrps = ARRAY_SIZE(nfc_genl_mcgrps),
};


/*
 * Deferred-work item created by nfc_genl_rcv_nl_event() for a
 * NETLINK_URELEASE notification and consumed (and freed) by
 * nfc_urelease_event_work().
 */
struct urelease_work {
        struct        work_struct w;	/* embedded work item, used with container_of() */
        u32        portid;		/* netlink port id taken from the notification */
};

/*
 * Work handler for a released netlink port.
 *
 * Walks all NFC devices under nfc_devlist_mutex and, for every device whose
 * recorded poll requester (genl_data.poll_req_portid) equals the released
 * port id, stops polling and clears the recorded port id. Each device is
 * examined with its genl_data_mutex held. The work item itself is freed at
 * the end, so it must have been allocated dynamically.
 */
static void nfc_urelease_event_work(struct work_struct *work)
{
        struct urelease_work *w = container_of(work, struct urelease_work, w);
        struct class_dev_iter iter;
        struct nfc_dev *dev;

        /* portid is a u32: print it unsigned so large ids are not negative */
        pr_debug("portid %u\n", w->portid);

        mutex_lock(&nfc_devlist_mutex);

        nfc_device_iter_init(&iter);
        dev = nfc_device_iter_next(&iter);

        while (dev) {
                mutex_lock(&dev->genl_data.genl_data_mutex);

                if (dev->genl_data.poll_req_portid == w->portid) {
                        nfc_stop_poll(dev);
                        dev->genl_data.poll_req_portid = 0;
                }

                mutex_unlock(&dev->genl_data.genl_data_mutex);

                dev = nfc_device_iter_next(&iter);
        }

        nfc_device_iter_exit(&iter);

        mutex_unlock(&nfc_devlist_mutex);

        kfree(w);
}

/*
 * Netlink notifier callback.
 *
 * Only NETLINK_URELEASE events for the NETLINK_GENERIC protocol are
 * handled: a urelease_work item carrying the released port id is allocated
 * with GFP_ATOMIC and scheduled, so the per-device cleanup runs later in
 * nfc_urelease_event_work(). If the allocation fails the event is dropped
 * without further action. Always returns NOTIFY_DONE.
 */
static int nfc_genl_rcv_nl_event(struct notifier_block *this,
                                 unsigned long event, void *ptr)
{
        struct netlink_notify *n = ptr;
        struct urelease_work *w;

        if (event != NETLINK_URELEASE || n->protocol != NETLINK_GENERIC)
                goto out;

        /* the port id is unsigned; %d would show large ids as negative */
        pr_debug("NETLINK_URELEASE event from id %u\n", n->portid);

        w = kmalloc(sizeof(*w), GFP_ATOMIC);
        if (w) {
                INIT_WORK(&w->w, nfc_urelease_event_work);
                w->portid = n->portid;
                schedule_work(&w->w);
        }

out:
        return NOTIFY_DONE;
}

/*
 * Prepare the per-device generic netlink state: initialise its mutex and
 * record that no port currently owns a poll request (portid 0).
 */
void nfc_genl_data_init(struct nfc_genl_data *genl_data)
{
        mutex_init(&genl_data->genl_data_mutex);
        genl_data->poll_req_portid = 0;
}

/*
 * Counterpart of nfc_genl_data_init(): destroys the mutex protecting the
 * per-device generic netlink state.
 */
void nfc_genl_data_exit(struct nfc_genl_data *genl_data)
{
        mutex_destroy(&genl_data->genl_data_mutex);
}

/*
 * Netlink notifier block routing events to nfc_genl_rcv_nl_event();
 * registered in nfc_genl_init() and unregistered in nfc_genl_exit().
 */
static struct notifier_block nl_notifier = {
        .notifier_call  = nfc_genl_rcv_nl_event,
};

/**
 * nfc_genl_init() - Initialize netlink interface
 *
 * This initialization function registers the nfc netlink family and the
 * netlink notifier used to clean up after released ports. If the notifier
 * cannot be registered, the family is unregistered again so that a failed
 * init leaves nothing behind.
 *
 * Return: 0 on success, otherwise the error code of the failing
 * registration.
 */
int __init nfc_genl_init(void)
{
        int rc;

        rc = genl_register_family(&nfc_genl_family);
        if (rc)
                return rc;

        rc = netlink_register_notifier(&nl_notifier);
        if (rc)
                genl_unregister_family(&nfc_genl_family);

        return rc;
}

/**
 * nfc_genl_exit() - Deinitialize netlink interface
 *
 * This exit function unregisters the netlink notifier first and then the
 * nfc netlink family, i.e. in the reverse order of nfc_genl_init().
 */
void nfc_genl_exit(void)
{
        netlink_unregister_notifier(&nl_notifier);
        genl_unregister_family(&nfc_genl_family);
}




































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Wireless configuration interface internals.
 *
 * Copyright 2006-2010        Johannes Berg <johannes@sipsolutions.net>
 * Copyright (C) 2018-2024 Intel Corporation
 */
#ifndef __NET_WIRELESS_CORE_H
#define __NET_WIRELESS_CORE_H
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/rbtree.h>
#include <linux/debugfs.h>
#include <linux/rfkill.h>
#include <linux/workqueue.h>
#include <linux/rtnetlink.h>
#include <net/genetlink.h>
#include <net/cfg80211.h>
#include "reg.h"


/* Value used to denote an invalid wiphy index. */
#define WIPHY_IDX_INVALID        -1

/*
 * cfg80211-internal per-device state.
 *
 * The public struct wiphy is embedded as the last member, so the internal
 * structure can be recovered from a wiphy pointer with container_of()
 * (see wiphy_to_rdev()).
 */
struct cfg80211_registered_device {
        const struct cfg80211_ops *ops;
        struct list_head list;

        /* rfkill support */
        struct rfkill_ops rfkill_ops;
        struct work_struct rfkill_block;

        /* ISO / IEC 3166 alpha2 for which this device is receiving
         * country IEs on, this can help disregard country IEs from APs
         * on the same alpha2 quickly. The alpha2 may differ from
         * cfg80211_regdomain's alpha2 when an intersection has occurred.
         * If the AP is reconfigured this can also be used to tell us if
         * the country on the country IE changed. */
        char country_ie_alpha2[2];

        /*
         * the driver requests the regulatory core to set this regulatory
         * domain as the wiphy's. Only used for %REGULATORY_WIPHY_SELF_MANAGED
         * devices using the regulatory_set_wiphy_regd() API
         */
        const struct ieee80211_regdomain *requested_regd;

        /* If a Country IE has been received this tells us the environment
         * which it's telling us it's in. This defaults to ENVIRON_ANY */
        enum environment_cap env;

        /* wiphy index, internal only */
        int wiphy_idx;

        /* protected by RTNL */
        int devlist_generation, wdev_id;
        int opencount;
        wait_queue_head_t dev_wait;

        struct list_head beacon_registrations;
        spinlock_t beacon_registrations_lock;

        /* protected by RTNL only */
        int num_running_ifaces;
        int num_running_monitor_ifaces;
        /* last cookie handed out, see cfg80211_assign_cookie() */
        u64 cookie_counter;

        /* BSSes/scanning */
        spinlock_t bss_lock;
        struct list_head bss_list;
        struct rb_root bss_tree;
        u32 bss_generation;
        u32 bss_entries;
        struct cfg80211_scan_request *scan_req; /* protected by RTNL */
        struct cfg80211_scan_request *int_scan_req;
        struct sk_buff *scan_msg;
        struct list_head sched_scan_req_list;
        time64_t suspend_at;
        struct wiphy_work scan_done_wk;

        struct genl_info *cur_cmd_info;

        struct work_struct conn_work;
        struct work_struct event_work;

        struct delayed_work dfs_update_channels_wk;

        struct wireless_dev *background_radar_wdev;
        struct cfg80211_chan_def background_radar_chandef;
        struct delayed_work background_cac_done_wk;
        struct work_struct background_cac_abort_wk;

        /* netlink port which started critical protocol (0 means not started) */
        u32 crit_proto_nlportid;

        struct cfg80211_coalesce *coalesce;

        struct work_struct destroy_work;
        struct wiphy_work sched_scan_stop_wk;
        struct work_struct sched_scan_res_wk;

        struct cfg80211_chan_def radar_chandef;
        struct work_struct propagate_radar_detect_wk;

        struct cfg80211_chan_def cac_done_chandef;
        struct work_struct propagate_cac_done_wk;

        struct work_struct mgmt_registrations_update_wk;
        /* lock for all wdev lists */
        spinlock_t mgmt_registrations_lock;

        struct work_struct wiphy_work;
        struct list_head wiphy_work_list;
        /* protects the list above */
        spinlock_t wiphy_work_lock;
        bool suspended;

        /* must be last because of the way we do wiphy_priv(),
         * and it should at least be aligned to NETDEV_ALIGN */
        struct wiphy wiphy __aligned(NETDEV_ALIGN);
};

/*
 * Map a public wiphy pointer back to the cfg80211_registered_device that
 * embeds it. A NULL @wiphy is treated as a fatal bug.
 */
static inline
struct cfg80211_registered_device *wiphy_to_rdev(struct wiphy *wiphy)
{
        struct cfg80211_registered_device *rdev;

        BUG_ON(!wiphy);

        rdev = container_of(wiphy, struct cfg80211_registered_device, wiphy);
        return rdev;
}

/*
 * Release the WoWLAN configuration attached to @rdev's wiphy, if any:
 * every pattern mask, the pattern array, the TCP socket (when one exists)
 * and TCP configuration, the nd_config and finally the configuration
 * itself. The wowlan_config pointer is not reset here. Compiles to an
 * empty function without CONFIG_PM.
 */
static inline void
cfg80211_rdev_free_wowlan(struct cfg80211_registered_device *rdev)
{
#ifdef CONFIG_PM
        struct cfg80211_wowlan *wowlan = rdev->wiphy.wowlan_config;
        int idx;

        if (!wowlan)
                return;

        for (idx = 0; idx < wowlan->n_patterns; idx++)
                kfree(wowlan->patterns[idx].mask);
        kfree(wowlan->patterns);

        if (wowlan->tcp && wowlan->tcp->sock)
                sock_release(wowlan->tcp->sock);
        kfree(wowlan->tcp);

        kfree(wowlan->nd_config);
        kfree(wowlan);
#endif
}

/*
 * Hand out the next cookie for @rdev by advancing its cookie counter.
 * Zero is never returned: if the counter wraps to 0 a warning is raised
 * and the counter is advanced once more.
 */
static inline u64 cfg80211_assign_cookie(struct cfg80211_registered_device *rdev)
{
        u64 cookie;

        rdev->cookie_counter++;
        cookie = rdev->cookie_counter;

        if (WARN_ON(!cookie)) {
                rdev->cookie_counter++;
                cookie = rdev->cookie_counter;
        }

        return cookie;
}

extern struct workqueue_struct *cfg80211_wq;
extern struct list_head cfg80211_rdev_list;
extern int cfg80211_rdev_list_generation;

/*
 * This is constructed like this so it can be used in if/else.
 *
 * for_each_rdev_check_rtnl() asserts that the RTNL is held and always
 * returns 0, so the empty "if" branch of for_each_rdev() is never taken
 * and the list walk over cfg80211_rdev_list always sits in the "else"
 * branch.
 */
static inline int for_each_rdev_check_rtnl(void)
{
        ASSERT_RTNL();
        return 0;
}
#define for_each_rdev(rdev)                                                \
        if (for_each_rdev_check_rtnl()) {} else                                \
                list_for_each_entry(rdev, &cfg80211_rdev_list, list)

/*
 * cfg80211-internal bookkeeping wrapped around the public
 * struct cfg80211_bss, which is embedded as the last member (see
 * bss_from_pub() for the reverse mapping).
 */
struct cfg80211_internal_bss {
        struct list_head list;
        struct list_head hidden_list;
        struct rb_node rbn;
        u64 ts_boottime;
        unsigned long ts;
        unsigned long refcount;
        /* hold counter, see cfg80211_hold_bss()/cfg80211_unhold_bss() */
        atomic_t hold;

        /* time at the start of the reception of the first octet of the
         * timestamp field of the last beacon/probe received for this BSS.
         * The time is the TSF of the BSS specified by %parent_bssid.
         */
        u64 parent_tsf;

        /* the BSS according to which %parent_tsf is set. This is set to
         * the BSS that the interface that requested the scan was connected to
         * when the beacon/probe was received.
         */
        u8 parent_bssid[ETH_ALEN] __aligned(2);

        /* must be last because of priv member */
        struct cfg80211_bss pub;
};

/* Map a public BSS pointer back to the internal structure embedding it. */
static inline struct cfg80211_internal_bss *bss_from_pub(struct cfg80211_bss *pub)
{
        struct cfg80211_internal_bss *ibss;

        ibss = container_of(pub, struct cfg80211_internal_bss, pub);
        return ibss;
}

/*
 * Take a hold on @bss and, when it references a transmitted BSS
 * (pub.transmitted_bss), on that BSS as well.
 */
static inline void cfg80211_hold_bss(struct cfg80211_internal_bss *bss)
{
        struct cfg80211_internal_bss *tx_bss;

        atomic_inc(&bss->hold);

        if (!bss->pub.transmitted_bss)
                return;

        tx_bss = container_of(bss->pub.transmitted_bss,
                              struct cfg80211_internal_bss, pub);
        atomic_inc(&tx_bss->hold);
}

/*
 * Drop a hold on @bss and, when it references a transmitted BSS
 * (pub.transmitted_bss), on that BSS as well. A hold counter that goes
 * negative triggers a warning.
 */
static inline void cfg80211_unhold_bss(struct cfg80211_internal_bss *bss)
{
        struct cfg80211_internal_bss *tx_bss;
        int remaining;

        remaining = atomic_dec_return(&bss->hold);
        WARN_ON(remaining < 0);

        if (!bss->pub.transmitted_bss)
                return;

        tx_bss = container_of(bss->pub.transmitted_bss,
                              struct cfg80211_internal_bss, pub);
        remaining = atomic_dec_return(&tx_bss->hold);
        WARN_ON(remaining < 0);
}


struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx);
int get_wiphy_idx(struct wiphy *wiphy);

struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx);

int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
                          struct net *net);

void cfg80211_init_wdev(struct wireless_dev *wdev);
void cfg80211_register_wdev(struct cfg80211_registered_device *rdev,
                            struct wireless_dev *wdev);

static inline bool cfg80211_has_monitors_only(struct cfg80211_registered_device *rdev)
{
        lockdep_assert_held(&rdev->wiphy.mtx);

        return rdev->num_running_ifaces == rdev->num_running_monitor_ifaces &&
               rdev->num_running_ifaces > 0;
}

/* Kinds of events that can be recorded in a struct cfg80211_event. */
enum cfg80211_event_type {
        EVENT_CONNECT_RESULT,
        EVENT_ROAMED,
        EVENT_DISCONNECTED,
        EVENT_IBSS_JOINED,
        EVENT_STOPPED,
        EVENT_PORT_AUTHORIZED,
};

/*
 * One recorded event: a list node, the event type and a type-specific
 * payload in the anonymous union.
 *
 * NOTE(review): judging by the member names, cr/rm/dc/ij/pa presumably
 * belong to EVENT_CONNECT_RESULT, EVENT_ROAMED, EVENT_DISCONNECTED,
 * EVENT_IBSS_JOINED and EVENT_PORT_AUTHORIZED respectively - confirm
 * against the code that fills and consumes these events.
 */
struct cfg80211_event {
        struct list_head list;
        enum cfg80211_event_type type;

        union {
                struct cfg80211_connect_resp_params cr;
                struct cfg80211_roam_info rm;
                struct {
                        const u8 *ie;
                        size_t ie_len;
                        u16 reason;
                        bool locally_generated;
                } dc;
                struct {
                        u8 bssid[ETH_ALEN];
                        struct ieee80211_channel *channel;
                } ij;
                struct {
                        u8 peer_addr[ETH_ALEN];
                        const u8 *td_bitmap;
                        u8 td_bitmap_len;
                } pa;
        };
};

/*
 * Storage for four keys: per-key parameters plus a data buffer of
 * WLAN_KEY_LEN_WEP104 bytes for each of them.
 */
struct cfg80211_cached_keys {
        struct key_params params[4];
        u8 data[4][WLAN_KEY_LEN_WEP104];
        int def;	/* presumably the default key index - TODO confirm */
};

/*
 * List entry recording a netlink port id; presumably linked into
 * cfg80211_registered_device.beacon_registrations - TODO confirm.
 */
struct cfg80211_beacon_registration {
        struct list_head list;
        u32 nlportid;
};

/*
 * Connection quality monitoring (RSSI) configuration.
 *
 * Ends in a flexible array of RSSI thresholds whose element count is
 * n_rssi_thresholds (annotated with __counted_by). The embedded rcu_head
 * suggests instances are released via RCU - confirm against the code that
 * frees them.
 */
struct cfg80211_cqm_config {
        struct rcu_head rcu_head;
        u32 rssi_hyst;
        s32 last_rssi_event_value;
        enum nl80211_cqm_rssi_threshold_event last_rssi_event_type;
        bool use_range_api;
        int n_rssi_thresholds;
        s32 rssi_thresholds[] __counted_by(n_rssi_thresholds);
};

void cfg80211_cqm_rssi_notify_work(struct wiphy *wiphy,
                                   struct wiphy_work *work);

void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev);

/* free object */
void cfg80211_dev_free(struct cfg80211_registered_device *rdev);

int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
                        char *newname);

void ieee80211_set_bitrate_flags(struct wiphy *wiphy);

void cfg80211_bss_expire(struct cfg80211_registered_device *rdev);
void cfg80211_bss_age(struct cfg80211_registered_device *rdev,
                      unsigned long age_secs);
void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev,
                                     unsigned int link,
                                     struct ieee80211_channel *channel);

/* IBSS */
int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
                         struct net_device *dev,
                         struct cfg80211_ibss_params *params,
                         struct cfg80211_cached_keys *connkeys);
void cfg80211_clear_ibss(struct net_device *dev, bool nowext);
int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
                        struct net_device *dev, bool nowext);
void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid,
                            struct ieee80211_channel *channel);
int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev,
                            struct wireless_dev *wdev);

/* mesh */
extern const struct mesh_config default_mesh_config;
extern const struct mesh_setup default_mesh_setup;
int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
                         struct net_device *dev,
                         struct mesh_setup *setup,
                         const struct mesh_config *conf);
int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
                        struct net_device *dev);
int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev,
                              struct wireless_dev *wdev,
                              struct cfg80211_chan_def *chandef);

/* OCB */
int cfg80211_join_ocb(struct cfg80211_registered_device *rdev,
                      struct net_device *dev,
                      struct ocb_setup *setup);
int cfg80211_leave_ocb(struct cfg80211_registered_device *rdev,
                       struct net_device *dev);

/* AP */
int cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
                     struct net_device *dev, int link,
                     bool notify);

/* MLME */
int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
                       struct net_device *dev,
                       struct cfg80211_auth_request *req);
int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
                        struct net_device *dev,
                        struct cfg80211_assoc_request *req,
                        struct netlink_ext_ack *extack);
int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
                         struct net_device *dev, const u8 *bssid,
                         const u8 *ie, int ie_len, u16 reason,
                         bool local_state_change);
int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
                           struct net_device *dev, const u8 *ap_addr,
                           const u8 *ie, int ie_len, u16 reason,
                           bool local_state_change);
void cfg80211_mlme_down(struct cfg80211_registered_device *rdev,
                        struct net_device *dev);
int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid,
                                u16 frame_type, const u8 *match_data,
                                int match_len, bool multicast_rx,
                                struct netlink_ext_ack *extack);
void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk);
void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid);
void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev);
int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
                          struct wireless_dev *wdev,
                          struct cfg80211_mgmt_tx_params *params,
                          u64 *cookie);
void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa,
                               const struct ieee80211_ht_cap *ht_capa_mask);
void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa,
                                const struct ieee80211_vht_cap *vht_capa_mask);

/* SME events */
int cfg80211_connect(struct cfg80211_registered_device *rdev,
                     struct net_device *dev,
                     struct cfg80211_connect_params *connect,
                     struct cfg80211_cached_keys *connkeys,
                     const u8 *prev_bssid);
void __cfg80211_connect_result(struct net_device *dev,
                               struct cfg80211_connect_resp_params *params,
                               bool wextev);
void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
                             size_t ie_len, u16 reason, bool from_ap);
int cfg80211_disconnect(struct cfg80211_registered_device *rdev,
                        struct net_device *dev, u16 reason,
                        bool wextev);
void __cfg80211_roamed(struct wireless_dev *wdev,
                       struct cfg80211_roam_info *info);
void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *peer_addr,
                                const u8 *td_bitmap, u8 td_bitmap_len);
int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
                              struct wireless_dev *wdev);
void cfg80211_autodisconnect_wk(struct work_struct *work);

/* SME implementation */
void cfg80211_conn_work(struct work_struct *work);
void cfg80211_sme_scan_done(struct net_device *dev);
bool cfg80211_sme_rx_assoc_resp(struct wireless_dev *wdev, u16 status);
void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len);
void cfg80211_sme_disassoc(struct wireless_dev *wdev);
void cfg80211_sme_deauth(struct wireless_dev *wdev);
void cfg80211_sme_auth_timeout(struct wireless_dev *wdev);
void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev);
void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev);

/* internal helpers */
bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher);
bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev,
                            int key_idx, bool pairwise);
int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
                                   struct key_params *params, int key_idx,
                                   bool pairwise, const u8 *mac_addr);
void __cfg80211_scan_done(struct wiphy *wiphy, struct wiphy_work *wk);
void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
                           bool send_message);
void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev,
                                 struct cfg80211_sched_scan_request *req);
int cfg80211_sched_scan_req_possible(struct cfg80211_registered_device *rdev,
                                     bool want_multi);
void cfg80211_sched_scan_results_wk(struct work_struct *work);
int cfg80211_stop_sched_scan_req(struct cfg80211_registered_device *rdev,
                                 struct cfg80211_sched_scan_request *req,
                                 bool driver_initiated);
int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev,
                               u64 reqid, bool driver_initiated);
void cfg80211_upload_connect_keys(struct wireless_dev *wdev);
int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
                          struct net_device *dev, enum nl80211_iftype ntype,
                          struct vif_params *params);
void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev);
void cfg80211_process_wiphy_works(struct cfg80211_registered_device *rdev,
                                  struct wiphy_work *end);
void cfg80211_process_wdev_events(struct wireless_dev *wdev);

bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
                                u32 center_freq_khz, u32 bw_khz);

int cfg80211_scan(struct cfg80211_registered_device *rdev);

extern struct work_struct cfg80211_disconnect_work;

#define NL80211_BSS_USE_FOR_ALL        (NL80211_BSS_USE_FOR_NORMAL | \
                                 NL80211_BSS_USE_FOR_MLD_LINK)

void cfg80211_set_dfs_state(struct wiphy *wiphy,
                            const struct cfg80211_chan_def *chandef,
                            enum nl80211_dfs_state dfs_state);

void cfg80211_dfs_channels_update_work(struct work_struct *work);

void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev);

int
cfg80211_start_background_radar_detection(struct cfg80211_registered_device *rdev,
                                          struct wireless_dev *wdev,
                                          struct cfg80211_chan_def *chandef);

void cfg80211_stop_background_radar_detection(struct wireless_dev *wdev);

void cfg80211_background_cac_done_wk(struct work_struct *work);

void cfg80211_background_cac_abort_wk(struct work_struct *work);

bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy,
                                  struct ieee80211_channel *chan);

bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev);

bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef,
                          struct ieee80211_channel *chan,
                          bool primary_only);
bool cfg80211_wdev_on_sub_chan(struct wireless_dev *wdev,
                               struct ieee80211_channel *chan,
                               bool primary_only);
bool _cfg80211_chandef_usable(struct wiphy *wiphy,
                              const struct cfg80211_chan_def *chandef,
                              u32 prohibited_flags, bool monitor);

/*
 * Milliseconds elapsed between the jiffies value @start and now.
 *
 * The difference is taken in unsigned long arithmetic, which is modular,
 * so a jiffies counter that has wrapped past @start still yields the
 * correct distance without a separate wrap-around branch.
 */
static inline unsigned int elapsed_jiffies_msecs(unsigned long start)
{
        unsigned long delta = jiffies - start;

        return jiffies_to_msecs(delta);
}

int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev,
                                 struct cfg80211_chan_def *chandef);

int ieee80211_get_ratemask(struct ieee80211_supported_band *sband,
                           const u8 *rates, unsigned int n_rates,
                           u32 *mask);

int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev,
                                 enum nl80211_iftype iftype, u32 beacon_int);

void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev,
                               enum nl80211_iftype iftype, int num);

void cfg80211_leave(struct cfg80211_registered_device *rdev,
                    struct wireless_dev *wdev);

void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
                              struct wireless_dev *wdev);

void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
                       struct wireless_dev *wdev);

struct cfg80211_internal_bss *
cfg80211_bss_update(struct cfg80211_registered_device *rdev,
                    struct cfg80211_internal_bss *tmp,
                    bool signal_valid, unsigned long ts);
/*
 * CFG80211_DEV_WARN_ON(cond) - developer-only warning.
 *
 * With CONFIG_CFG80211_DEVELOPER_WARNINGS it is plain WARN_ON(cond).
 * Otherwise it only evaluates @cond and yields its boolean value.
 */
#ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS
#define CFG80211_DEV_WARN_ON(cond)        WARN_ON(cond)
#else
/*
 * The statement expression keeps the macro usable as a condition
 * (its value is @cond converted to bool) and, because the value comes
 * from a local variable, avoids a warning when the result is not used.
 */
#define CFG80211_DEV_WARN_ON(cond)        ({bool __r = (cond); __r; })
#endif

void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid);
void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev);
void cfg80211_pmsr_free_wk(struct work_struct *work);

void cfg80211_remove_link(struct wireless_dev *wdev, unsigned int link_id);
void cfg80211_remove_links(struct wireless_dev *wdev);
int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev,
                                 struct wireless_dev *wdev);
void cfg80211_wdev_release_link_bsses(struct wireless_dev *wdev, u16 link_mask);

/**
 * struct cfg80211_colocated_ap - colocated AP information
 *
 * @list: linked list to all colocated APs
 * @bssid: BSSID of the reported AP
 * @ssid: SSID of the reported AP
 * @ssid_len: length of the ssid
 * @center_freq: frequency the reported AP is on
 * @unsolicited_probe: the reported AP is part of an ESS, where all the APs
 *        that operate in the same channel as the reported AP and that might be
 *        detected by a STA receiving this frame, are transmitting unsolicited
 *        Probe Response frames every 20 TUs
 * @oct_recommended: OCT is recommended to exchange MMPDUs with the reported AP
 * @same_ssid: the reported AP has the same SSID as the reporting AP
 * @multi_bss: the reported AP is part of a multiple BSSID set
 * @transmitted_bssid: the reported AP is the transmitting BSSID
 * @colocated_ess: all the APs that share the same ESS as the reported AP are
 *        colocated and can be discovered via legacy bands.
 * @short_ssid_valid: short_ssid is valid and can be used
 * @short_ssid: the short SSID for this SSID
 * @psd_20: The 20MHz PSD EIRP of the primary 20MHz channel for the reported AP
 */
struct cfg80211_colocated_ap {
        struct list_head list;
        u8 bssid[ETH_ALEN];
        u8 ssid[IEEE80211_MAX_SSID_LEN];
        size_t ssid_len;
        u32 short_ssid;
        u32 center_freq;
        /* boolean flags, each declared as a 1-bit field of a u8 */
        u8 unsolicited_probe:1,
           oct_recommended:1,
           same_ssid:1,
           multi_bss:1,
           transmitted_bssid:1,
           colocated_ess:1,
           short_ssid_valid:1;
        s8 psd_20;
};

/*
 * KUnit visibility helpers.
 *
 * When CONFIG_CFG80211_KUNIT_TEST is enabled, VISIBLE_IF_CFG80211_KUNIT
 * expands to nothing and EXPORT_SYMBOL_IF_CFG80211_KUNIT() forwards to
 * EXPORT_SYMBOL_IF_KUNIT(), and the helpers below get prototypes here.
 * Otherwise VISIBLE_IF_CFG80211_KUNIT expands to "static" and the export
 * macro expands to nothing, so no prototypes are declared.
 */
#if IS_ENABLED(CONFIG_CFG80211_KUNIT_TEST)
#define EXPORT_SYMBOL_IF_CFG80211_KUNIT(sym) EXPORT_SYMBOL_IF_KUNIT(sym)
#define VISIBLE_IF_CFG80211_KUNIT
void cfg80211_free_coloc_ap_list(struct list_head *coloc_ap_list);

int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies,
                                struct list_head *list);

size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen,
                           const u8 *subie, size_t subie_len,
                           u8 *new_ie, size_t new_ie_len);
#else
#define EXPORT_SYMBOL_IF_CFG80211_KUNIT(sym)
#define VISIBLE_IF_CFG80211_KUNIT static
#endif /* IS_ENABLED(CONFIG_CFG80211_KUNIT_TEST) */

#endif /* __NET_WIRELESS_CORE_H */















































































































































































































   25 













































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_JUMP_LABEL_H
#define _LINUX_JUMP_LABEL_H

/*
 * Jump label support
 *
 * Copyright (C) 2009-2012 Jason Baron <jbaron@redhat.com>
 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
 *
 * DEPRECATED API:
 *
 * The use of 'struct static_key' directly, is now DEPRECATED. In addition
 * static_key_{true,false}() is also DEPRECATED. IE DO NOT use the following:
 *
 * struct static_key false = STATIC_KEY_INIT_FALSE;
 * struct static_key true = STATIC_KEY_INIT_TRUE;
 * static_key_true()
 * static_key_false()
 *
 * The updated API replacements are:
 *
 * DEFINE_STATIC_KEY_TRUE(key);
 * DEFINE_STATIC_KEY_FALSE(key);
 * DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count);
 * DEFINE_STATIC_KEY_ARRAY_FALSE(keys, count);
 * static_branch_likely()
 * static_branch_unlikely()
 *
 * Jump labels provide an interface to generate dynamic branches using
 * self-modifying code. Assuming toolchain and architecture support, if we
 * define a "key" that is initially false via "DEFINE_STATIC_KEY_FALSE(key)",
 * an "if (static_branch_unlikely(&key))" statement is an unconditional branch
 * (which defaults to false - and the true block is placed out of line).
 * Similarly, we can define an initially true key via
 * "DEFINE_STATIC_KEY_TRUE(key)", and use it in the same
 * "if (static_branch_unlikely(&key))", in which case we will generate an
 * unconditional branch to the out-of-line true branch. Keys that are
 * initially true or false can be used in both static_branch_unlikely()
 * and static_branch_likely() statements.
 *
 * At runtime we can change the branch target by setting the key
 * to true via a call to static_branch_enable(), or false using
 * static_branch_disable(). If the direction of the branch is switched by
 * these calls then we run-time modify the branch target via a
 * no-op -> jump or jump -> no-op conversion. For example, for an
 * initially false key that is used in an "if (static_branch_unlikely(&key))"
 * statement, setting the key to true requires us to patch in a jump
 * to the out-of-line true branch.
 *
 * In addition to static_branch_{enable,disable}, we can also reference count
 * the key or branch direction via static_branch_{inc,dec}. Thus,
 * static_branch_inc() can be thought of as a 'make more true' and
 * static_branch_dec() as a 'make more false'.
 *
 * Since this relies on modifying code, the branch modifying functions
 * must be considered absolute slow paths (machine wide synchronization etc.).
 * OTOH, since the affected branches are unconditional, their runtime overhead
 * will be absolutely minimal, esp. in the default (off) case where the total
 * effect is a single NOP of appropriate size. The on case will patch in a jump
 * to the out-of-line block.
 *
 * When the control is directly exposed to userspace, it is prudent to delay the
 * decrement to avoid high frequency code modifications which can (and do)
 * cause significant performance degradation. Struct static_key_deferred and
 * static_key_slow_dec_deferred() provide for this.
 *
 * Lacking toolchain and/or architecture support, static keys fall back to a
 * simple conditional branch.
 *
 * Additional babbling in: Documentation/staging/static-keys.rst
 */

#ifndef __ASSEMBLY__

#include <linux/types.h>
#include <linux/compiler.h>

extern bool static_key_initialized;

/*
 * Emit a WARN, naming the calling function and the key, when a static key
 * is used while static_key_initialized is still false, i.e. before
 * jump_label_init() has run.
 */
#define STATIC_KEY_CHECK_USE(key) WARN(!static_key_initialized,                      \
                                    "%s(): static key '%pS' used before call to jump_label_init()", \
                                    __func__, (key))

/*
 * struct static_key - state behind one static branch key.
 *
 * @enabled is the counter reported by static_key_count(); the key counts
 * as enabled while it is greater than zero (see static_key_enabled()).
 *
 * With CONFIG_JUMP_LABEL an anonymous union follows; it holds either a
 * plain word (type) or a pointer (entries / next), with flag bits in the
 * two low bits as described in the note below.
 */
struct static_key {
        atomic_t enabled;
#ifdef CONFIG_JUMP_LABEL
/*
 * Note:
 *   To make anonymous unions work with old compilers, the static
 *   initialization of them requires brackets. This creates a dependency
 *   on the order of the struct with the initializers. If any fields
 *   are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need
 *   to be modified.
 *
 * bit 0 => 1 if key is initially true
 *            0 if initially false
 * bit 1 => 1 if points to struct static_key_mod
 *            0 if points to struct jump_entry
 */
        union {
                unsigned long type;
                struct jump_entry *entries;
                struct static_key_mod *next;
        };
#endif        /* CONFIG_JUMP_LABEL */
};

#endif /* __ASSEMBLY__ */

#ifdef CONFIG_JUMP_LABEL
#include <asm/jump_label.h>

#ifndef __ASSEMBLY__
#ifdef CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE

/*
 * Jump table entry for CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE.
 *
 * Each field stores an offset relative to the address of that field
 * itself; the jump_entry_code(), jump_entry_target() and jump_entry_key()
 * accessors add the field address back to recover the absolute value.
 * The two low bits of @key carry flags (bit 0: branch, bit 1: init) and
 * are masked off by jump_entry_key().
 */
struct jump_entry {
        s32 code;
        s32 target;
        long key;        // key may be far away from the core kernel under KASLR
};

/* Absolute code address: the address of the code field plus the offset stored in it. */
static inline unsigned long jump_entry_code(const struct jump_entry *entry)
{
        unsigned long base = (unsigned long)&entry->code;

        return base + entry->code;
}

/* Absolute target address: the address of the target field plus the offset stored in it. */
static inline unsigned long jump_entry_target(const struct jump_entry *entry)
{
        unsigned long base = (unsigned long)&entry->target;

        return base + entry->target;
}

/*
 * Recover the static_key pointer from the relative key field: clear the
 * two low flag bits of the stored offset, then add the field's address.
 */
static inline struct static_key *jump_entry_key(const struct jump_entry *entry)
{
        unsigned long base = (unsigned long)&entry->key;

        return (struct static_key *)(base + (entry->key & ~3L));
}

#else

/* Absolute-encoded entry: the code field already holds the address. */
static inline unsigned long jump_entry_code(const struct jump_entry *entry)
{
        unsigned long addr = entry->code;

        return addr;
}

/* Absolute-encoded entry: the target field already holds the address. */
static inline unsigned long jump_entry_target(const struct jump_entry *entry)
{
        unsigned long addr = entry->target;

        return addr;
}

/* Absolute-encoded entry: the key pointer is the key field with its two low flag bits cleared. */
static inline struct static_key *jump_entry_key(const struct jump_entry *entry)
{
        unsigned long raw = (unsigned long)entry->key;

        return (struct static_key *)(raw & ~3UL);
}

#endif

/* True when bit 0 of the entry's key field is set. */
static inline bool jump_entry_is_branch(const struct jump_entry *entry)
{
        unsigned long flags = (unsigned long)entry->key;

        return (flags & 1UL) != 0;
}

/* True when bit 1 (the "init" flag) of the entry's key field is set. */
static inline bool jump_entry_is_init(const struct jump_entry *entry)
{
        unsigned long flags = (unsigned long)entry->key;

        return (flags & 2UL) != 0;
}

/* Set (@set true) or clear (@set false) bit 1, the "init" flag, in the entry's key field. */
static inline void jump_entry_set_init(struct jump_entry *entry, bool set)
{
        if (!set) {
                entry->key &= ~2;
                return;
        }

        entry->key |= 2;
}

/*
 * Size of the instruction at a jump entry: the JUMP_LABEL_NOP_SIZE
 * constant when it is defined, otherwise whatever
 * arch_jump_entry_size() reports for @entry.
 */
static inline int jump_entry_size(struct jump_entry *entry)
{
        int size;

#ifndef JUMP_LABEL_NOP_SIZE
        size = arch_jump_entry_size(entry);
#else
        size = JUMP_LABEL_NOP_SIZE;
#endif
        return size;
}

#endif
#endif

#ifndef __ASSEMBLY__

/*
 * Which instruction a jump entry site should hold; passed as the @type
 * argument of arch_jump_label_transform() and
 * arch_jump_label_transform_queue().
 */
enum jump_label_type {
        JUMP_LABEL_NOP = 0,
        JUMP_LABEL_JMP,
};

struct module;

#ifdef CONFIG_JUMP_LABEL

/*
 * Values for the low bits of static_key::type (see the note in
 * struct static_key): bit 0 is the initial truth value, bit 1 marks a
 * pointer to struct static_key_mod; JUMP_TYPE_MASK covers both bits.
 */
#define JUMP_TYPE_FALSE                0UL
#define JUMP_TYPE_TRUE                1UL
#define JUMP_TYPE_LINKED        2UL
#define JUMP_TYPE_MASK                3UL

/*
 * DEPRECATED (see the header comment at the top of this file).
 * Thin wrapper: returns arch_static_branch(key, false).
 */
static __always_inline bool static_key_false(struct static_key *key)
{
        return arch_static_branch(key, false);
}

/*
 * DEPRECATED (see the header comment at the top of this file).
 * Returns the logical negation of arch_static_branch(key, true).
 */
static __always_inline bool static_key_true(struct static_key *key)
{
        return !arch_static_branch(key, true);
}

extern struct jump_entry __start___jump_table[];
extern struct jump_entry __stop___jump_table[];

extern void jump_label_init(void);
extern void jump_label_init_ro(void);
extern void jump_label_lock(void);
extern void jump_label_unlock(void);
extern void arch_jump_label_transform(struct jump_entry *entry,
                                      enum jump_label_type type);
extern bool arch_jump_label_transform_queue(struct jump_entry *entry,
                                            enum jump_label_type type);
extern void arch_jump_label_transform_apply(void);
extern int jump_label_text_reserved(void *start, void *end);
extern bool static_key_slow_inc(struct static_key *key);
extern bool static_key_fast_inc_not_disabled(struct static_key *key);
extern void static_key_slow_dec(struct static_key *key);
extern bool static_key_slow_inc_cpuslocked(struct static_key *key);
extern void static_key_slow_dec_cpuslocked(struct static_key *key);
extern int static_key_count(struct static_key *key);
extern void static_key_enable(struct static_key *key);
extern void static_key_disable(struct static_key *key);
extern void static_key_enable_cpuslocked(struct static_key *key);
extern void static_key_disable_cpuslocked(struct static_key *key);
extern enum jump_label_type jump_label_init_type(struct jump_entry *entry);

/*
 * We should be using ATOMIC_INIT() for initializing .enabled, but
 * the inclusion of atomic.h is problematic for inclusion of jump_label.h
 * in 'low-level' headers. Thus, we are initializing .enabled with a
 * raw value, but have added a BUILD_BUG_ON() to catch any issues in
 * jump_label_init() see: kernel/jump_label.c.
 */
#define STATIC_KEY_INIT_TRUE                                        \
        { .enabled = { 1 },                                        \
          { .type = JUMP_TYPE_TRUE } }
#define STATIC_KEY_INIT_FALSE                                        \
        { .enabled = { 0 },                                        \
          { .type = JUMP_TYPE_FALSE } }

#else  /* !CONFIG_JUMP_LABEL */

#include <linux/atomic.h>
#include <linux/bug.h>

/* !CONFIG_JUMP_LABEL: the key's count is simply the current value of key->enabled. */
static __always_inline int static_key_count(struct static_key *key)
{
        int count = raw_atomic_read(&key->enabled);

        return count;
}

/*
 * !CONFIG_JUMP_LABEL variant: only sets static_key_initialized, the flag
 * that STATIC_KEY_CHECK_USE() tests before warning.
 */
static __always_inline void jump_label_init(void)
{
        static_key_initialized = true;
}

/* !CONFIG_JUMP_LABEL variant: empty stub. */
static __always_inline void jump_label_init_ro(void) { }

/*
 * !CONFIG_JUMP_LABEL fallback: an ordinary conditional on the key's
 * count, hinted as unlikely to be true.
 */
static __always_inline bool static_key_false(struct static_key *key)
{
        return unlikely_notrace(static_key_count(key) > 0) ? true : false;
}

/*
 * !CONFIG_JUMP_LABEL fallback: an ordinary conditional on the key's
 * count, hinted as likely to be true.
 */
static __always_inline bool static_key_true(struct static_key *key)
{
        return likely_notrace(static_key_count(key) > 0) ? true : false;
}

/*
 * !CONFIG_JUMP_LABEL variant: atomically increment key->enabled.
 *
 * Returns false, leaving the counter untouched, when the current value is
 * negative or when the increment would make it negative; returns true once
 * the compare-and-exchange has stored the incremented value.
 */
static inline bool static_key_fast_inc_not_disabled(struct static_key *key)
{
        int cur;

        STATIC_KEY_CHECK_USE(key);
        /*
         * Prevent key->enabled getting negative to follow the same semantics
         * as for CONFIG_JUMP_LABEL=y, see kernel/jump_label.c comment.
         */
        cur = atomic_read(&key->enabled);
        for (;;) {
                if (cur < 0 || (cur + 1) < 0)
                        return false;
                /* on failure atomic_try_cmpxchg() refreshes cur; retry */
                if (likely(atomic_try_cmpxchg(&key->enabled, &cur, cur + 1)))
                        return true;
        }
}
#define static_key_slow_inc(key)        static_key_fast_inc_not_disabled(key)

/*
 * !CONFIG_JUMP_LABEL variant: warn if used before jump_label_init(), then
 * atomically decrement key->enabled. No lower bound is checked here.
 */
static inline void static_key_slow_dec(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);
        atomic_dec(&key->enabled);
}

#define static_key_slow_inc_cpuslocked(key) static_key_slow_inc(key)
#define static_key_slow_dec_cpuslocked(key) static_key_slow_dec(key)

/*
 * !CONFIG_JUMP_LABEL variant: always returns 0, whatever @start and
 * @end are; neither argument is examined.
 */
static inline int jump_label_text_reserved(void *start, void *end)
{
        (void)start;
        (void)end;

        return 0;
}

/* !CONFIG_JUMP_LABEL variants: empty stubs. */
static inline void jump_label_lock(void) {}
static inline void jump_label_unlock(void) {}

/*
 * !CONFIG_JUMP_LABEL variant: move key->enabled from 0 to 1.
 *
 * If the counter is already non-zero it is left alone, with a one-time
 * warning when its value is anything other than 1.
 */
static inline void static_key_enable(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);

        if (atomic_read(&key->enabled) == 0) {
                atomic_set(&key->enabled, 1);
                return;
        }

        WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
}

/*
 * !CONFIG_JUMP_LABEL variant: move key->enabled from 1 to 0.
 *
 * If the counter is not exactly 1 it is left alone, with a one-time
 * warning when its value is anything other than 0.
 */
static inline void static_key_disable(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);

        if (atomic_read(&key->enabled) == 1) {
                atomic_set(&key->enabled, 0);
                return;
        }

        WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
}

#define static_key_enable_cpuslocked(k)                static_key_enable((k))
#define static_key_disable_cpuslocked(k)        static_key_disable((k))

#define STATIC_KEY_INIT_TRUE        { .enabled = ATOMIC_INIT(1) }
#define STATIC_KEY_INIT_FALSE        { .enabled = ATOMIC_INIT(0) }

#endif        /* CONFIG_JUMP_LABEL */

#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
#define jump_label_enabled static_key_enabled

/* -------------------------------------------------------------------------- */

/*
 * Two type wrappers around static_key, such that we can use compile time
 * type differentiation to emit the right code.
 *
 * All the below code is macros in order to play type games.
 */

/* Wrapper type for keys initialized with STATIC_KEY_TRUE_INIT. */
struct static_key_true {
        struct static_key key;
};

/* Wrapper type for keys initialized with STATIC_KEY_FALSE_INIT. */
struct static_key_false {
        struct static_key key;
};

#define STATIC_KEY_TRUE_INIT  (struct static_key_true) { .key = STATIC_KEY_INIT_TRUE,  }
#define STATIC_KEY_FALSE_INIT (struct static_key_false){ .key = STATIC_KEY_INIT_FALSE, }

#define DEFINE_STATIC_KEY_TRUE(name)        \
        struct static_key_true name = STATIC_KEY_TRUE_INIT

#define DEFINE_STATIC_KEY_TRUE_RO(name)        \
        struct static_key_true name __ro_after_init = STATIC_KEY_TRUE_INIT

#define DECLARE_STATIC_KEY_TRUE(name)        \
        extern struct static_key_true name

#define DEFINE_STATIC_KEY_FALSE(name)        \
        struct static_key_false name = STATIC_KEY_FALSE_INIT

#define DEFINE_STATIC_KEY_FALSE_RO(name)        \
        struct static_key_false name __ro_after_init = STATIC_KEY_FALSE_INIT

#define DECLARE_STATIC_KEY_FALSE(name)        \
        extern struct static_key_false name

/*
 * Define an array of @count keys; the [0 ... (count) - 1] range
 * designator gives every element the same initializer.
 */
#define DEFINE_STATIC_KEY_ARRAY_TRUE(name, count)                \
        struct static_key_true name[count] = {                        \
                [0 ... (count) - 1] = STATIC_KEY_TRUE_INIT,        \
        }

#define DEFINE_STATIC_KEY_ARRAY_FALSE(name, count)                \
        struct static_key_false name[count] = {                        \
                [0 ... (count) - 1] = STATIC_KEY_FALSE_INIT,        \
        }

/*
 * *_STATIC_KEY_MAYBE(cfg, name): pick the TRUE or FALSE flavour from a
 * config option. IS_ENABLED(cfg) is pasted onto the helper prefix, so the
 * _1 helper (TRUE) or the _0 helper (FALSE) is selected; this relies on
 * IS_ENABLED() expanding to a literal 1 or 0.
 */
#define _DEFINE_STATIC_KEY_1(name)        DEFINE_STATIC_KEY_TRUE(name)
#define _DEFINE_STATIC_KEY_0(name)        DEFINE_STATIC_KEY_FALSE(name)
#define DEFINE_STATIC_KEY_MAYBE(cfg, name)                        \
        __PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name)

#define _DEFINE_STATIC_KEY_RO_1(name)        DEFINE_STATIC_KEY_TRUE_RO(name)
#define _DEFINE_STATIC_KEY_RO_0(name)        DEFINE_STATIC_KEY_FALSE_RO(name)
#define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name)                        \
        __PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name)

#define _DECLARE_STATIC_KEY_1(name)        DECLARE_STATIC_KEY_TRUE(name)
#define _DECLARE_STATIC_KEY_0(name)        DECLARE_STATIC_KEY_FALSE(name)
#define DECLARE_STATIC_KEY_MAYBE(cfg, name)                        \
        __PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name)

extern bool ____wrong_branch_error(void);

/*
 * static_key_enabled(x) - true when the key's count is greater than zero.
 *
 * @x must point to a struct static_key, static_key_true or
 * static_key_false; for any other pointee type the macro references
 * ____wrong_branch_error(), which is only declared in this header
 * (presumably left undefined so misuse fails the build - confirm).
 */
#define static_key_enabled(x)                                                        \
({                                                                                \
        if (!__builtin_types_compatible_p(typeof(*x), struct static_key) &&        \
            !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\
            !__builtin_types_compatible_p(typeof(*x), struct static_key_false))        \
                ____wrong_branch_error();                                        \
        static_key_count((struct static_key *)x) > 0;                                \
})

#ifdef CONFIG_JUMP_LABEL

/*
 * Combine the right initial value (type) with the right branch order
 * to generate the desired result.
 *
 *
 * type\branch|        likely (1)              |        unlikely (0)
 * -----------+-----------------------+------------------
 *            |                       |
 *  true (1)  |           ...                      |           ...
 *            |    NOP                      |           JMP L
 *            |    <br-stmts>              |        1: ...
 *            |        L: ...                      |
 *            |                              |
 *            |                              |        L: <br-stmts>
 *            |                              |           jmp 1b
 *            |                       |
 * -----------+-----------------------+------------------
 *            |                       |
 *  false (0) |           ...                      |           ...
 *            |    JMP L              |           NOP
 *            |    <br-stmts>              |        1: ...
 *            |        L: ...                      |
 *            |                              |
 *            |                              |        L: <br-stmts>
 *            |                              |           jmp 1b
 *            |                       |
 * -----------+-----------------------+------------------
 *
 * The initial value is encoded in the LSB of static_key::entries,
 * type: 0 = false, 1 = true.
 *
 * The branch type is encoded in the LSB of jump_entry::key,
 * branch: 0 = unlikely, 1 = likely.
 *
 * This gives the following logic table:
 *
 *        enabled        type        branch          instruction
 * -----------------------------+-----------
 *        0        0        0        | NOP
 *        0        0        1        | JMP
 *        0        1        0        | NOP
 *        0        1        1        | JMP
 *
 *        1        0        0        | JMP
 *        1        0        1        | NOP
 *        1        1        0        | JMP
 *        1        1        1        | NOP
 *
 * Which gives the following functions:
 *
 *   dynamic: instruction = enabled ^ branch
 *   static:  instruction = type ^ branch
 *
 * See jump_label_type() / jump_label_init_type().
 */

/*
 * static_branch_likely(x) - test a key, hinting that it is usually true.
 *
 * The helper is chosen from the compile-time type of *x:
 * arch_static_branch() for a struct static_key_true and
 * arch_static_branch_jump() for a struct static_key_false, both called
 * with branch == true and their result negated. Any other type
 * references ____wrong_branch_error().
 */
#define static_branch_likely(x)                                                        \
({                                                                                \
        bool branch;                                                                \
        if (__builtin_types_compatible_p(typeof(*x), struct static_key_true))        \
                branch = !arch_static_branch(&(x)->key, true);                        \
        else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \
                branch = !arch_static_branch_jump(&(x)->key, true);                \
        else                                                                        \
                branch = ____wrong_branch_error();                                \
        likely_notrace(branch);                                                                \
})

/*
 * static_branch_unlikely(x) - test a key, hinting that it is usually false.
 *
 * The helper is chosen from the compile-time type of *x:
 * arch_static_branch_jump() for a struct static_key_true and
 * arch_static_branch() for a struct static_key_false, both called with
 * branch == false and their result used as is. Any other type references
 * ____wrong_branch_error().
 */
#define static_branch_unlikely(x)                                                \
({                                                                                \
        bool branch;                                                                \
        if (__builtin_types_compatible_p(typeof(*x), struct static_key_true))        \
                branch = arch_static_branch_jump(&(x)->key, false);                \
        else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \
                branch = arch_static_branch(&(x)->key, false);                        \
        else                                                                        \
                branch = ____wrong_branch_error();                                \
        unlikely_notrace(branch);                                                        \
})

#else /* !CONFIG_JUMP_LABEL */

#define static_branch_likely(x)                likely_notrace(static_key_enabled(&(x)->key))
#define static_branch_unlikely(x)        unlikely_notrace(static_key_enabled(&(x)->key))

#endif /* CONFIG_JUMP_LABEL */

/*
 * Use static_branch_likely(x) when @config is enabled (per IS_ENABLED()),
 * static_branch_unlikely(x) otherwise.
 */
#define static_branch_maybe(config, x)                                        \
        (IS_ENABLED(config) ? static_branch_likely(x)                        \
                            : static_branch_unlikely(x))

/*
 * Advanced usage; refcount, branch is enabled when: count != 0
 */

#define static_branch_inc(x)                static_key_slow_inc(&(x)->key)
#define static_branch_dec(x)                static_key_slow_dec(&(x)->key)
#define static_branch_inc_cpuslocked(x)        static_key_slow_inc_cpuslocked(&(x)->key)
#define static_branch_dec_cpuslocked(x)        static_key_slow_dec_cpuslocked(&(x)->key)

/*
 * Normal usage; boolean enable/disable.
 */

#define static_branch_enable(x)                        static_key_enable(&(x)->key)
#define static_branch_disable(x)                static_key_disable(&(x)->key)
#define static_branch_enable_cpuslocked(x)        static_key_enable_cpuslocked(&(x)->key)
#define static_branch_disable_cpuslocked(x)        static_key_disable_cpuslocked(&(x)->key)

#endif /* __ASSEMBLY__ */

#endif        /* _LINUX_JUMP_LABEL_H */





















































    1 
















































































































    5 













    4 


    1 
    3 
    4 































































    6 




    5 




    5 
    6 


































































    1 










    1 










































































    1 


































































































    1 







    1 
    1 




    1 




    1 
    1 

    1 






























































































    1 
    1 

















































































































































































































































































































































































































































































































































































    3 










    2 










    5 








    4 



    2 








    2 
    2 





















    3 



    4 









   10 





    4 















    7 









    2 






    5 
































   11 

    8 




    2 





    6 

















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>
#include <linux/iov_iter.h>

/*
 * Step callback (passed to iterate_and_advance() by _copy_to_iter()) that
 * copies @len bytes from the kernel buffer @from + @progress to the user
 * segment @iter_to.
 *
 * Returns @len untouched when usercopy fault injection fires or the
 * destination range fails access_ok(); otherwise returns whatever
 * raw_copy_to_user() reports.  @priv2 is unused.
 */
static __always_inline
size_t copy_to_user_iter(void __user *iter_to, size_t progress,
                         size_t len, void *from, void *priv2)
{
        /* Injected failure or bad user range: report nothing copied. */
        if (should_fail_usercopy() || !access_ok(iter_to, len))
                return len;

        from += progress;
        instrument_copy_to_user(iter_to, from, len);
        return raw_copy_to_user(iter_to, from, len);
}

/*
 * Variant of the to-user step callback built on copy_to_user_nofault().
 *
 * Returns @len when fault injection fires or copy_to_user_nofault()
 * returns a negative value; otherwise returns that call's result.
 * @priv2 is unused.
 */
static __always_inline
size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress,
                                 size_t len, void *from, void *priv2)
{
        ssize_t rc;

        if (should_fail_usercopy())
                return len;

        rc = copy_to_user_nofault(iter_to, from + progress, len);
        if (rc < 0)
                return len;     /* error: treat the whole chunk as uncopied */
        return rc;
}

/*
 * Step callback that copies @len bytes from the user segment @iter_from
 * into the kernel buffer @to + @progress.
 *
 * Returns @len when fault injection fires or @iter_from fails access_ok();
 * otherwise returns the result of raw_copy_from_user().  The copy is
 * bracketed by the instrument_copy_from_user_{before,after}() hooks.
 * @priv2 is unused.
 */
static __always_inline
size_t copy_from_user_iter(void __user *iter_from, size_t progress,
                           size_t len, void *to, void *priv2)
{
        size_t left;

        if (should_fail_usercopy() || !access_ok(iter_from, len))
                return len;

        to += progress;
        instrument_copy_from_user_before(to, iter_from, len);
        left = raw_copy_from_user(to, iter_from, len);
        instrument_copy_from_user_after(to, iter_from, len, left);
        return left;
}

/*
 * Kernel-memory step callback: memcpy() @len bytes from @from + @progress
 * into the segment at @iter_to.  Always returns 0.  @priv2 is unused.
 */
static __always_inline
size_t memcpy_to_iter(void *iter_to, size_t progress,
                      size_t len, void *from, void *priv2)
{
        const void *src = from + progress;

        memcpy(iter_to, src, len);
        return 0;
}

/*
 * Kernel-memory step callback: memcpy() @len bytes from the segment at
 * @iter_from into @to + @progress.  Always returns 0.  @priv2 is unused.
 */
static __always_inline
size_t memcpy_from_iter(void *iter_from, size_t progress,
                        size_t len, void *to, void *priv2)
{
        void *dst = to + progress;

        memcpy(dst, iter_from, len);
        return 0;
}

/*
 * fault_in_iov_iter_readable - fault in iov iterator for reading
 * @i: iterator
 * @size: maximum length
 *
 * Walks the user memory described by @i (a single ubuf, or the iovec array
 * starting at the current offset) and calls fault_in_readable() on it, up
 * to @size bytes or the iterator's remaining count, whichever is smaller.
 * For an iovec iterator the walk stops at the first segment that could not
 * be faulted in completely; empty segments are skipped.
 *
 * Returns the number of bytes of @size that were not faulted in (like
 * copy_to_user() and copy_from_user()).
 *
 * Always returns 0 for non-userspace iterators.
 */
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
{
        const struct iovec *seg;
        size_t todo, offset;

        if (iter_is_ubuf(i)) {
                size_t want = min(size, iov_iter_count(i));
                size_t missed = fault_in_readable(i->ubuf + i->iov_offset, want);

                return size - (want - missed);
        }
        if (!iter_is_iovec(i))
                return 0;

        todo = min(size, iov_iter_count(i));
        /* From here on @size holds the part of the request beyond the iterator. */
        size -= todo;
        offset = i->iov_offset;
        for (seg = iter_iov(i); todo; seg++, offset = 0) {
                size_t chunk = min(todo, seg->iov_len - offset);
                size_t missed;

                if (unlikely(!chunk))
                        continue;
                missed = fault_in_readable(seg->iov_base + offset, chunk);
                todo -= chunk - missed;
                if (missed)
                        break;
        }
        return todo + size;
}
EXPORT_SYMBOL(fault_in_iov_iter_readable);

/*
 * fault_in_iov_iter_writeable - fault in iov iterator for writing
 * @i: iterator
 * @size: maximum length
 *
 * Faults in the iterator using get_user_pages(), i.e., without triggering
 * hardware page faults.  This is primarily useful when we already know that
 * some or all of the pages in @i aren't in memory.
 *
 * At most min(@size, remaining count) bytes are processed via
 * fault_in_safe_writeable(); for an iovec iterator the walk stops at the
 * first segment that could not be handled completely.
 *
 * Returns the number of bytes not faulted in, like copy_to_user() and
 * copy_from_user().
 *
 * Always returns 0 for non-user-space iterators.
 */
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
{
        const struct iovec *seg;
        size_t todo, offset;

        if (iter_is_ubuf(i)) {
                size_t want = min(size, iov_iter_count(i));
                size_t missed = fault_in_safe_writeable(i->ubuf + i->iov_offset,
                                                        want);

                return size - (want - missed);
        }
        if (!iter_is_iovec(i))
                return 0;

        todo = min(size, iov_iter_count(i));
        /* Remainder of the request that the iterator cannot cover at all. */
        size -= todo;
        offset = i->iov_offset;
        for (seg = iter_iov(i); todo; seg++, offset = 0) {
                size_t chunk = min(todo, seg->iov_len - offset);
                size_t missed;

                if (unlikely(!chunk))
                        continue;
                missed = fault_in_safe_writeable(seg->iov_base + offset, chunk);
                todo -= chunk - missed;
                if (missed)
                        break;
        }
        return todo + size;
}
EXPORT_SYMBOL(fault_in_iov_iter_writeable);

/*
 * Initialise @i as an ITER_IOVEC iterator over the @nr_segs entries of @iov,
 * covering @count bytes and starting at offset 0 of the first segment.
 * @direction is stored as the data_source flag; a warning is emitted if it
 * has bits other than READ/WRITE set.
 */
void iov_iter_init(struct iov_iter *i, unsigned int direction,
                        const struct iovec *iov, unsigned long nr_segs,
                        size_t count)
{
        struct iov_iter fresh = {
                .iter_type = ITER_IOVEC,
                .nofault = false,
                .data_source = direction,
                .__iov = iov,
                .nr_segs = nr_segs,
                .iov_offset = 0,
                .count = count
        };

        WARN_ON(direction & ~(READ | WRITE));
        *i = fresh;
}
EXPORT_SYMBOL(iov_iter_init);

/*
 * Copy @bytes from the kernel buffer @addr into iterator @i, advancing it.
 *
 * Warns once and returns 0 if @i is flagged as a data source.  For
 * user-backed iterators might_fault() is called first.  Returns the value
 * produced by iterate_and_advance().
 */
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        void *src = (void *)addr;

        if (WARN_ON_ONCE(i->data_source))
                return 0;

        if (user_backed_iter(i))
                might_fault();

        return iterate_and_advance(i, bytes, src,
                                   copy_to_user_iter, memcpy_to_iter);
}
EXPORT_SYMBOL(_copy_to_iter);

#ifdef CONFIG_ARCH_HAS_COPY_MC
/*
 * To-user step callback for _copy_mc_to_iter(): like the ordinary to-user
 * step but the copy itself is done by copy_mc_to_user().
 *
 * Returns @len if @iter_to fails access_ok(), otherwise the result of
 * copy_mc_to_user().  @priv2 is unused.
 */
static __always_inline
size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress,
                            size_t len, void *from, void *priv2)
{
        if (!access_ok(iter_to, len))
                return len;

        from += progress;
        instrument_copy_to_user(iter_to, from, len);
        return copy_mc_to_user(iter_to, from, len);
}

/*
 * Kernel-memory step callback for _copy_mc_to_iter(): copies @len bytes
 * from @from + @progress to @iter_to with copy_mc_to_kernel() and returns
 * that function's result.  @priv2 is unused.
 */
static __always_inline
size_t memcpy_to_iter_mc(void *iter_to, size_t progress,
                         size_t len, void *from, void *priv2)
{
        void *src = from + progress;

        return copy_mc_to_kernel(iter_to, src, len);
}

/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @i: destination iterator
 *
 * The pmem driver deploys this for the dax operation
 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes
 * successfully copied.
 *
 * The main differences between this and a typical _copy_to_iter() are:
 *
 * * Typical tail/residue handling after a fault retries the copy
 *   byte-by-byte until the fault happens again. Re-triggering machine
 *   checks is potentially fatal so the implementation uses source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * ITER_KVEC and ITER_BVEC can return short copies.  Compare to
 *   copy_to_iter() where only ITER_IOVEC attempts might return a short copy.
 *
 * Warns once and copies nothing if @i is flagged as a data source.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        void *src = (void *)addr;

        if (WARN_ON_ONCE(i->data_source))
                return 0;

        if (user_backed_iter(i))
                might_fault();

        return iterate_and_advance(i, bytes, src,
                                   copy_to_user_iter_mc, memcpy_to_iter_mc);
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */

/*
 * Unchecked core of _copy_from_iter(): pull @bytes out of @i into @addr via
 * iterate_and_advance(), using the plain user/kernel copy step callbacks.
 * Performs no direction check and no might_fault() annotation itself.
 */
static __always_inline
size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        size_t done;

        done = iterate_and_advance(i, bytes, addr,
                                   copy_from_user_iter, memcpy_from_iter);
        return done;
}

/*
 * Copy @bytes from iterator @i into the kernel buffer @addr, advancing @i.
 *
 * Warns once and returns 0 if @i is not flagged as a data source.  For
 * user-backed iterators might_fault() is called before copying.
 */
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        if (WARN_ON_ONCE(!i->data_source))
                return 0;

        if (user_backed_iter(i))
                might_fault();

        /* Same call the always-inline __copy_from_iter() helper expands to. */
        return iterate_and_advance(i, bytes, addr,
                                   copy_from_user_iter, memcpy_from_iter);
}
EXPORT_SYMBOL(_copy_from_iter);

/*
 * From-user step callback for _copy_from_iter_nocache(): hands the chunk to
 * __copy_from_user_inatomic_nocache() and returns its result.  No
 * access_ok() or fault-injection check is made here.  @priv2 is unused.
 */
static __always_inline
size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress,
                                   size_t len, void *to, void *priv2)
{
        void *dst = to + progress;

        return __copy_from_user_inatomic_nocache(dst, iter_from, len);
}

/*
 * Copy @bytes from iterator @i into @addr, using the nocache user-copy step
 * for user segments and plain memcpy() for kernel segments.
 *
 * Warns once and returns 0 if @i is not flagged as a data source.
 */
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        size_t done;

        if (WARN_ON_ONCE(!i->data_source))
                return 0;

        done = iterate_and_advance(i, bytes, addr,
                                   copy_from_user_iter_nocache,
                                   memcpy_from_iter);
        return done;
}
EXPORT_SYMBOL(_copy_from_iter_nocache);

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/*
 * From-user step callback for _copy_from_iter_flushcache(): forwards the
 * chunk to __copy_from_user_flushcache() and returns its result.
 * @priv2 is unused.
 */
static __always_inline
size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress,
                                      size_t len, void *to, void *priv2)
{
        void *dst = to + progress;

        return __copy_from_user_flushcache(dst, iter_from, len);
}

/*
 * Kernel-memory step callback for _copy_from_iter_flushcache(): copies the
 * chunk with memcpy_flushcache().  Always returns 0.  @priv2 is unused.
 */
static __always_inline
size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress,
                                   size_t len, void *to, void *priv2)
{
        void *dst = to + progress;

        memcpy_flushcache(dst, iter_from, len);
        return 0;
}

/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @i: source iterator
 *
 * The pmem driver arranges for filesystem-dax to use this facility via
 * dax_copy_from_iter() for ensuring that writes to persistent memory
 * are flushed through the CPU cache. It is differentiated from
 * _copy_from_iter_nocache() in that it guarantees all data is flushed for
 * all iterator types. The _copy_from_iter_nocache() only attempts to
 * bypass the cache for the ITER_IOVEC case, and on some archs may use
 * instructions that strand dirty-data in the cache.
 *
 * Warns once and copies nothing if @i is not flagged as a data source.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
        size_t done;

        if (WARN_ON_ONCE(!i->data_source))
                return 0;

        done = iterate_and_advance(i, bytes, addr,
                                   copy_from_user_iter_flushcache,
                                   memcpy_from_iter_flushcache);
        return done;
}
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
#endif

/*
 * Sanity-check that the byte range [@offset, @offset + @n) lies inside the
 * (possibly compound) page that @page belongs to.
 *
 * Returns true if the range is acceptable; otherwise emits a WARN and
 * returns false.  Also rejects ranges whose end wraps around.
 */
static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
{
        size_t end = n + offset;
        struct page *head;

        /*
         * Fast path: a non-wrapping range that fits in a single PAGE_SIZE is
         * valid for every page order, so the page order (and the possible
         * cache line miss to read it) is not needed.
         */
        if (end >= n && end <= PAGE_SIZE)
                return true;

        /* General case: measure from the head page of the compound page. */
        head = compound_head(page);
        end += (page - head) << PAGE_SHIFT;

        return !WARN_ON(n > end || end > page_size(head));
}

/*
 * Copy @bytes starting at @offset within @page into iterator @i.
 *
 * The range is validated with page_copy_sane(); @offset may exceed
 * PAGE_SIZE, in which case copying starts in the corresponding later
 * subpage.  Each subpage is mapped with kmap_local_page() for the duration
 * of its copy.  Returns the number of bytes copied, which is short if
 * _copy_to_iter() stops making progress; returns 0 if the range is rejected
 * or @i is flagged as a data source (the latter warns once).
 */
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        size_t total = 0;

        if (!page_copy_sane(page, offset, bytes))
                return 0;
        if (WARN_ON_ONCE(i->data_source))
                return 0;

        /* Move to the first subpage actually touched. */
        page += offset / PAGE_SIZE;
        offset %= PAGE_SIZE;

        for (;;) {
                void *map = kmap_local_page(page);
                size_t chunk = min(bytes, (size_t)PAGE_SIZE - offset);
                size_t done = _copy_to_iter(map + offset, chunk, i);

                kunmap_local(map);
                total += done;
                bytes -= done;
                if (!bytes || !done)
                        return total;

                offset += done;
                if (offset == PAGE_SIZE) {
                        offset = 0;
                        page++;
                }
        }
}
EXPORT_SYMBOL(copy_page_to_iter);

/*
 * Same page-walking copy as copy_page_to_iter(), but user segments are
 * written through the copy_to_user_iter_nofault() step callback.
 *
 * Returns the number of bytes copied; 0 if the range fails
 * page_copy_sane() or @i is flagged as a data source (warns once).
 */
size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes,
                                 struct iov_iter *i)
{
        size_t total = 0;

        if (!page_copy_sane(page, offset, bytes))
                return 0;
        if (WARN_ON_ONCE(i->data_source))
                return 0;

        /* Move to the first subpage actually touched. */
        page += offset / PAGE_SIZE;
        offset %= PAGE_SIZE;

        for (;;) {
                void *map = kmap_local_page(page);
                size_t chunk = min(bytes, (size_t)PAGE_SIZE - offset);
                size_t done;

                done = iterate_and_advance(i, chunk, map + offset,
                                           copy_to_user_iter_nofault,
                                           memcpy_to_iter);
                kunmap_local(map);
                total += done;
                bytes -= done;
                if (!bytes || !done)
                        return total;

                offset += done;
                if (offset == PAGE_SIZE) {
                        offset = 0;
                        page++;
                }
        }
}
EXPORT_SYMBOL(copy_page_to_iter_nofault);

/*
 * Copy @bytes from iterator @i into @page starting at @offset.
 *
 * The range is validated with page_copy_sane(); the copy proceeds one
 * subpage at a time under kmap_local_page(), using _copy_from_iter() for
 * each piece.  Returns the number of bytes copied, which is short if
 * _copy_from_iter() stops making progress.
 */
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        size_t total = 0;

        if (!page_copy_sane(page, offset, bytes))
                return 0;

        /* Move to the first subpage actually touched. */
        page += offset / PAGE_SIZE;
        offset %= PAGE_SIZE;

        for (;;) {
                void *map = kmap_local_page(page);
                size_t chunk = min(bytes, (size_t)PAGE_SIZE - offset);
                size_t done = _copy_from_iter(map + offset, chunk, i);

                kunmap_local(map);
                total += done;
                bytes -= done;
                if (!bytes || !done)
                        return total;

                offset += done;
                if (offset == PAGE_SIZE) {
                        offset = 0;
                        page++;
                }
        }
}
EXPORT_SYMBOL(copy_page_from_iter);

/*
 * User-segment step callback for iov_iter_zero(): clears @len bytes at
 * @iter_to with clear_user() and returns its result.  @progress, @priv and
 * @priv2 are unused.
 */
static __always_inline
size_t zero_to_user_iter(void __user *iter_to, size_t progress,
                         size_t len, void *priv, void *priv2)
{
        size_t left = clear_user(iter_to, len);

        return left;
}

/*
 * Kernel-segment step callback for iov_iter_zero(): zero-fills @len bytes
 * at @iter_to.  Always returns 0.  @progress, @priv and @priv2 are unused.
 */
static __always_inline
size_t zero_to_iter(void *iter_to, size_t progress,
                    size_t len, void *priv, void *priv2)
{
        const int fill = 0;

        memset(iter_to, fill, len);
        return 0;
}

/*
 * Write @bytes of zeroes into iterator @i, advancing it.  Returns the value
 * produced by iterate_and_advance().
 */
size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
        size_t done;

        /* The step callbacks take no private data, hence the NULL. */
        done = iterate_and_advance(i, bytes, NULL,
                                   zero_to_user_iter, zero_to_iter);
        return done;
}
EXPORT_SYMBOL(iov_iter_zero);

/*
 * Copy up to @bytes from iterator @i into @page at @offset, mapping the
 * destination with kmap_atomic().
 *
 * Returns the number of bytes copied.  Returns 0 if the range fails
 * page_copy_sane() or if @i is not flagged as a data source (warns once).
 *
 * Unlike _copy_from_iter(), this goes straight to __copy_from_iter() and so
 * does not call might_fault().
 */
size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
                size_t bytes, struct iov_iter *i)
{
        size_t n, copied = 0;

        if (!page_copy_sane(page, offset, bytes))
                return 0;
        if (WARN_ON_ONCE(!i->data_source))
                return 0;

        do {
                char *p;

                n = bytes - copied;
                /*
                 * Highmem pages are handled one PAGE_SIZE subpage at a time:
                 * step @page to the subpage containing @offset and clamp the
                 * chunk to what is left of that subpage.  Otherwise the whole
                 * remainder is copied in a single pass.
                 * NOTE(review): presumably this relies on non-highmem pages
                 * being contiguously addressable past PAGE_SIZE - confirm.
                 */
                if (PageHighMem(page)) {
                        page += offset / PAGE_SIZE;
                        offset %= PAGE_SIZE;
                        n = min_t(size_t, n, PAGE_SIZE - offset);
                }

                p = kmap_atomic(page) + offset;
                n = __copy_from_iter(p, n, i);
                kunmap_atomic(p);
                copied += n;
                offset += n;
                /* Loop again only for highmem, with data left and progress made. */
        } while (PageHighMem(page) && copied != bytes && n > 0);

        return copied;
}
EXPORT_SYMBOL(copy_page_from_iter_atomic);

/*
 * Advance a bvec-backed iterator by @size bytes: reduce the count, then skip
 * every bio_vec that is fully consumed and record the residual offset into
 * the segment where the walk stops.  Does nothing if the iterator is
 * already empty.  @size is not clamped here.
 */
static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
{
        const struct bio_vec *cur, *last;

        if (!i->count)
                return;
        i->count -= size;

        /* Measure from the beginning of the current segment. */
        size += i->iov_offset;

        cur = i->bvec;
        last = cur + i->nr_segs;
        while (cur < last) {
                if (likely(size < cur->bv_len))
                        break;
                size -= cur->bv_len;
                cur++;
        }

        i->iov_offset = size;
        i->nr_segs -= cur - i->bvec;
        i->bvec = cur;
}

/*
 * Advance an iovec-backed iterator by @size bytes: reduce the count, then
 * skip every iovec that is fully consumed and record the residual offset
 * into the segment where the walk stops.  Does nothing if the iterator is
 * already empty.  @size is not clamped here.
 */
static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
{
        const struct iovec *cur, *last;

        if (!i->count)
                return;
        i->count -= size;

        /* Measure from the beginning of the current segment. */
        size += i->iov_offset;

        cur = iter_iov(i);
        last = cur + i->nr_segs;
        while (cur < last) {
                if (likely(size < cur->iov_len))
                        break;
                size -= cur->iov_len;
                cur++;
        }

        i->iov_offset = size;
        i->nr_segs -= cur - iter_iov(i);
        i->__iov = cur;
}

/*
 * Advance iterator @i by @size bytes, clamped to the bytes remaining.
 *
 * ubuf and xarray iterators only adjust offset and count; iovec/kvec and
 * bvec iterators walk their segment arrays; discard iterators only reduce
 * the count.  Any other iterator type is left untouched.
 */
void iov_iter_advance(struct iov_iter *i, size_t size)
{
        if (unlikely(size > i->count))
                size = i->count;

        if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
                i->iov_offset += size;
                i->count -= size;
                return;
        }

        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
                /* iovec and kvec have identical layouts */
                iov_iter_iovec_advance(i, size);
                return;
        }

        if (iov_iter_is_bvec(i)) {
                iov_iter_bvec_advance(i, size);
                return;
        }

        if (iov_iter_is_discard(i))
                i->count -= size;
}
EXPORT_SYMBOL(iov_iter_advance);

/*
 * Move iterator @i backwards by @unroll bytes, undoing an earlier advance.
 *
 * A zero @unroll is a no-op; an @unroll larger than MAX_RW_COUNT triggers a
 * WARN and is ignored.  For discard iterators only the count is restored.
 *
 * NOTE(review): the segment walks below step backwards with no lower bound
 * check, so the caller presumably must not revert past the first segment
 * the iterator was set up with - confirm against callers.
 */
void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
        if (!unroll)
                return;
        if (WARN_ON(unroll > MAX_RW_COUNT))
                return;
        i->count += unroll;
        if (unlikely(iov_iter_is_discard(i)))
                return;
        /* Fast path: the revert stays inside the current segment. */
        if (unroll <= i->iov_offset) {
                i->iov_offset -= unroll;
                return;
        }
        /* Consume the current segment's offset; the rest spans earlier segments. */
        unroll -= i->iov_offset;
        if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
                BUG(); /* We should never go beyond the start of the specified
                        * range since we might then be straying into pages that
                        * aren't pinned.
                        */
        } else if (iov_iter_is_bvec(i)) {
                const struct bio_vec *bvec = i->bvec;
                /* Step back one bio_vec at a time until @unroll fits in one. */
                while (1) {
                        size_t n = (--bvec)->bv_len;
                        i->nr_segs++;
                        if (unroll <= n) {
                                i->bvec = bvec;
                                i->iov_offset = n - unroll;
                                return;
                        }
                        unroll -= n;
                }
        } else { /* same logics for iovec and kvec */
                const struct iovec *iov = iter_iov(i);
                /* Step back one iovec at a time until @unroll fits in one. */
                while (1) {
                        size_t n = (--iov)->iov_len;
                        i->nr_segs++;
                        if (unroll <= n) {
                                i->__iov = iov;
                                i->iov_offset = n - unroll;
                                return;
                        }
                        unroll -= n;
                }
        }
}
EXPORT_SYMBOL(iov_iter_revert);

/*
 * Return the number of bytes available in just the current segment of @i.
 *
 * When at most one segment remains, or the iterator is neither
 * iovec/kvec nor bvec, this is simply the remaining count.
 */
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
        if (i->nr_segs <= 1)
                return i->count;

        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                return min(i->count, iter_iov(i)->iov_len - i->iov_offset);

        if (iov_iter_is_bvec(i))
                return min(i->count, i->bvec->bv_len - i->iov_offset);

        return i->count;
}
EXPORT_SYMBOL(iov_iter_single_seg_count);

/*
 * Initialise @i as an ITER_KVEC iterator over the @nr_segs entries of @kvec,
 * covering @count bytes and starting at offset 0.  @direction is stored as
 * the data_source flag; a warning is emitted if it has bits other than
 * READ/WRITE set.
 */
void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
                        const struct kvec *kvec, unsigned long nr_segs,
                        size_t count)
{
        struct iov_iter fresh = {
                .iter_type = ITER_KVEC,
                .data_source = direction,
                .kvec = kvec,
                .nr_segs = nr_segs,
                .iov_offset = 0,
                .count = count
        };

        WARN_ON(direction & ~(READ | WRITE));
        *i = fresh;
}
EXPORT_SYMBOL(iov_iter_kvec);

/*
 * Initialise @i as an ITER_BVEC iterator over the @nr_segs entries of @bvec,
 * covering @count bytes and starting at offset 0.  @direction is stored as
 * the data_source flag; a warning is emitted if it has bits other than
 * READ/WRITE set.
 */
void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
                        const struct bio_vec *bvec, unsigned long nr_segs,
                        size_t count)
{
        struct iov_iter fresh = {
                .iter_type = ITER_BVEC,
                .data_source = direction,
                .bvec = bvec,
                .nr_segs = nr_segs,
                .iov_offset = 0,
                .count = count
        };

        WARN_ON(direction & ~(READ | WRITE));
        *i = fresh;
}
EXPORT_SYMBOL(iov_iter_bvec);

/**
 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @xarray: The xarray to access.
 * @start: The start file position.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator to either draw data out of the pages attached to an
 * inode or to inject data into those pages.  The pages *must* be prevented
 * from evaporation, either by taking a ref on them or locking them by the
 * caller.
 *
 * BUGs if @direction has any bit other than bit 0 set.
 */
void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
                     struct xarray *xarray, loff_t start, size_t count)
{
        struct iov_iter fresh = {
                .iter_type = ITER_XARRAY,
                .data_source = direction,
                .xarray = xarray,
                .xarray_start = start,
                .count = count,
                .iov_offset = 0
        };

        BUG_ON(direction & ~1);
        *i = fresh;
}
EXPORT_SYMBOL(iov_iter_xarray);

/**
 * iov_iter_discard - Initialise an I/O iterator that discards data
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator that just discards everything that's written to it.
 * It's only available as a READ iterator; any other @direction BUGs.
 */
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
{
        struct iov_iter sink = {
                .iter_type = ITER_DISCARD,
                .data_source = false,
                .count = count,
                .iov_offset = 0
        };

        BUG_ON(direction != READ);
        *i = sink;
}
EXPORT_SYMBOL(iov_iter_discard);

/*
 * Check every iovec segment still covered by @i: the usable length of each
 * segment must have no bits in common with @len_mask, and its start address
 * (base plus the offset into the first segment) none with @addr_mask.
 *
 * Returns false on the first violation, true otherwise.  The first segment
 * is inspected even when the remaining count is zero.
 */
static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
                                   unsigned len_mask)
{
        const struct iovec *seg = iter_iov(i);
        size_t left = i->count;
        size_t off = i->iov_offset;

        do {
                size_t span = seg->iov_len - off;

                /* Never look beyond what the iterator still covers. */
                if (span > left)
                        span = left;

                if ((span & len_mask) ||
                    ((unsigned long)(seg->iov_base + off) & addr_mask))
                        return false;

                left -= span;
                seg++;
                off = 0;
        } while (left);

        return true;
}

/*
 * Check every bio_vec segment still covered by @i against the alignment
 * masks.
 *
 * @i:         bvec-backed iterator to inspect
 * @addr_mask: bits that must be clear in each segment's start offset
 *             (bv_offset, plus the iterator offset for the first segment)
 * @len_mask:  bits that must be clear in each segment's usable length
 *
 * Returns false on the first segment that violates either mask, true
 * otherwise.  The first segment is inspected even when the remaining count
 * is zero.
 */
static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
                                  unsigned len_mask)
{
        const struct bio_vec *bvec = i->bvec;
        unsigned skip = i->iov_offset;
        size_t size = i->count;

        do {
                /*
                 * Only the part of the segment after @skip belongs to the
                 * iterator.  Using the full bv_len here would test the wrong
                 * length for a partially consumed first segment and would
                 * drain @size too early, leaving trailing segments unchecked.
                 * This matches iov_iter_aligned_iovec() and
                 * iov_iter_alignment_bvec().
                 */
                size_t len = bvec->bv_len - skip;

                if (len > size)
                        len = size;
                if (len & len_mask)
                        return false;
                if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
                        return false;

                bvec++;
                size -= len;
                skip = 0;       /* only the first segment is partially consumed */
        } while (size);

        return true;
}

/**
 * iov_iter_is_aligned() - Check if the addresses and lengths of each segment
 *         are aligned to the parameters.
 *
 * @i: &struct iov_iter to check
 * @addr_mask: bit mask to check against the iov element's addresses
 * @len_mask: bit mask to check against the iov element's lengths
 *
 * ubuf and xarray iterators are checked as a single range; iovec/kvec and
 * bvec iterators are checked segment by segment.  Any other iterator type
 * is reported as aligned.
 *
 * Return: false if any addresses or lengths intersect with the provided masks
 */
bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
                         unsigned len_mask)
{
        if (likely(iter_is_ubuf(i))) {
                unsigned long uaddr = (unsigned long)(i->ubuf + i->iov_offset);

                return !(i->count & len_mask) && !(uaddr & addr_mask);
        }

        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                return iov_iter_aligned_iovec(i, addr_mask, len_mask);

        if (iov_iter_is_bvec(i))
                return iov_iter_aligned_bvec(i, addr_mask, len_mask);

        if (!iov_iter_is_xarray(i))
                return true;

        if (i->count & len_mask)
                return false;
        return !((i->xarray_start + i->iov_offset) & addr_mask);
}
EXPORT_SYMBOL_GPL(iov_iter_is_aligned);

/*
 * OR together the start address and the in-iterator length of every
 * non-empty iovec segment covered by @i.  Bits left clear in the result are
 * clear in all of those addresses and lengths.
 */
static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
{
        const struct iovec *seg = iter_iov(i);
        unsigned long bits = 0;
        size_t remaining = i->count;
        size_t offset = i->iov_offset;

        for (;;) {
                size_t avail = seg->iov_len - offset;

                if (avail) {
                        size_t chunk = avail > remaining ? remaining : avail;

                        bits |= (unsigned long)seg->iov_base + offset;
                        bits |= chunk;
                        remaining -= chunk;
                }

                /* iov_offset only applies to the first segment */
                offset = 0;
                seg++;
                if (!remaining)
                        break;
        }
        return bits;
}

/*
 * OR together the starting offset and the in-iterator length of every
 * bio_vec segment covered by @i.
 */
static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
{
        const struct bio_vec *seg = i->bvec;
        unsigned bits = 0;
        size_t remaining = i->count;
        unsigned offset = i->iov_offset;

        for (;;) {
                size_t chunk = seg->bv_len - offset;

                bits |= (unsigned long)seg->bv_offset + offset;
                if (chunk > remaining)
                        chunk = remaining;
                bits |= chunk;

                remaining -= chunk;
                seg++;
                /* iov_offset only applies to the first segment */
                offset = 0;
                if (!remaining)
                        break;
        }

        return bits;
}

/*
 * Return the OR of the addresses (or offsets) and lengths of the memory
 * described by @i, dispatching on the iterator type.  Returns 0 for an empty
 * ITER_UBUF and for iterator types not handled here.
 */
unsigned long iov_iter_alignment(const struct iov_iter *i)
{
        if (likely(iter_is_ubuf(i))) {
                if (!i->count)
                        return 0;
                return ((unsigned long)i->ubuf + i->iov_offset) | i->count;
        }

        /* iovec and kvec have identical layouts, so they share a walker */
        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                return iov_iter_alignment_iovec(i);

        if (iov_iter_is_bvec(i))
                return iov_iter_alignment_bvec(i);

        if (!iov_iter_is_xarray(i))
                return 0;

        return (i->xarray_start + i->iov_offset) | i->count;
}
EXPORT_SYMBOL(iov_iter_alignment);

/*
 * OR together, for each pair of consecutive non-empty iovec segments, the
 * end address of the earlier one and the start address of the later one.
 *
 * Returns 0 for ITER_UBUF.  Any other non-ITER_IOVEC iterator triggers a
 * WARN_ON() and yields ~0U.
 */
unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
{
        unsigned long gaps = 0;
        unsigned long prev_end = 0;
        size_t remaining = i->count;
        unsigned k;

        if (iter_is_ubuf(i))
                return 0;

        if (WARN_ON(!iter_is_iovec(i)))
                return ~0U;

        for (k = 0; k < i->nr_segs; k++) {
                const struct iovec *seg = iter_iov(i) + k;
                unsigned long start;

                /* empty segments contribute nothing */
                if (!seg->iov_len)
                        continue;

                start = (unsigned long)seg->iov_base;
                /* prev_end is still 0 until a first segment has been seen */
                if (prev_end)
                        gaps |= start | prev_end;
                prev_end = start + seg->iov_len;

                /* stop once the iterator's byte count is covered */
                if (remaining <= seg->iov_len)
                        break;
                remaining -= seg->iov_len;
        }
        return gaps;
}
EXPORT_SYMBOL(iov_iter_gap_alignment);

/*
 * Work out how many page slots are needed for @size bytes beginning @start
 * bytes into the first page, capped at @maxpages, and make sure *@res points
 * at an array to hold them.
 *
 * If *@res is NULL an array of that many entries is allocated with
 * kvmalloc_array().  Returns the slot count, or 0 if the allocation failed.
 */
static int want_pages_array(struct page ***res, size_t size,
                            size_t start, unsigned int maxpages)
{
        unsigned int nr = DIV_ROUND_UP(size + start, PAGE_SIZE);

        if (nr > maxpages)
                nr = maxpages;
        WARN_ON(!nr);        // caller should've prevented that

        /* caller supplied the array: nothing to allocate */
        if (*res)
                return nr;

        *res = kvmalloc_array(nr, sizeof(struct page *), GFP_KERNEL);
        if (!*res)
                return 0;
        return nr;
}

/*
 * Fill @pages with up to @nr_pages pages found in @xa starting at @index,
 * taking a reference on each one with get_page().
 *
 * The walk runs under rcu_read_lock() and ends early when xas_load() or
 * xas_next() yields NULL.  Returns the number of entries stored in @pages.
 */
static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
                                          pgoff_t index, unsigned int nr_pages)
{
        XA_STATE(xas, xa, index);
        struct page *page;
        unsigned int ret = 0;

        rcu_read_lock();
        for (page = xas_load(&xas); page; page = xas_next(&xas)) {
                /* xas_retry() says this entry cannot be used as-is; move on */
                if (xas_retry(&xas, page))
                        continue;

                /* Has the page moved or been split? */
                if (unlikely(page != xas_reload(&xas))) {
                        xas_reset(&xas);
                        continue;
                }

                /* record the subpage for the current index and take a ref */
                pages[ret] = find_subpage(page, xas.xa_index);
                get_page(pages[ret]);
                if (++ret == nr_pages)
                        break;
        }
        rcu_read_unlock();
        return ret;
}

/*
 * Get references on the pages backing the front of an ITER_XARRAY iterator
 * and advance it past the bytes they cover.
 *
 * *@_start_offset receives the offset into the first page.  Returns the
 * number of bytes covered (at most @maxsize), 0 if no page was found, or
 * -ENOMEM if the page array could not be set up.
 */
static ssize_t iter_xarray_get_pages(struct iov_iter *i,
                                     struct page ***pages, size_t maxsize,
                                     unsigned maxpages, size_t *_start_offset)
{
        loff_t pos = i->xarray_start + i->iov_offset;
        pgoff_t index = pos >> PAGE_SHIFT;
        unsigned offset = pos & ~PAGE_MASK;
        unsigned want, got;

        *_start_offset = offset;

        want = want_pages_array(pages, maxsize, offset, maxpages);
        if (!want)
                return -ENOMEM;

        got = iter_xarray_populate_pages(*pages, i->xarray, index, want);
        if (!got)
                return 0;

        /* the pages found may cover less than was asked for */
        maxsize = min_t(size_t, got * PAGE_SIZE - offset, maxsize);
        i->count -= maxsize;
        i->iov_offset += maxsize;
        return maxsize;
}

/*
 * Return the user address at which the iterator currently stands.
 * Must be done on non-empty ITER_UBUF or ITER_IOVEC one.
 *
 * For ITER_IOVEC, empty segments are skipped and *@size is clamped to what
 * is left of the first non-empty segment; for ITER_UBUF *@size is untouched.
 */
static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
{
        size_t skip = i->iov_offset;
        long k = 0;

        if (iter_is_ubuf(i))
                return (unsigned long)i->ubuf + i->iov_offset;

        while (k < i->nr_segs) {
                const struct iovec *seg = iter_iov(i) + k;
                size_t avail = seg->iov_len - skip;

                if (unlikely(!avail)) {
                        /* iov_offset only applies to the first segment */
                        skip = 0;
                        k++;
                        continue;
                }

                if (*size > avail)
                        *size = avail;
                return (unsigned long)seg->iov_base + skip;
        }
        BUG(); // if it had been empty, we wouldn't get called
}

/*
 * Locate the page at which an ITER_BVEC iterator currently stands.
 * Must be done on non-empty ITER_BVEC one.
 *
 * *@size is clamped to what is left of the current bio_vec and *@start is
 * set to the offset within the returned page.
 */
static struct page *first_bvec_segment(const struct iov_iter *i,
                                       size_t *size, size_t *start)
{
        const struct bio_vec *bv = i->bvec;
        size_t avail = bv->bv_len - i->iov_offset;
        size_t pos = i->iov_offset + bv->bv_offset;

        if (*size > avail)
                *size = avail;

        *start = pos % PAGE_SIZE;
        return bv->bv_page + pos / PAGE_SIZE;
}

/*
 * Common worker for iov_iter_get_pages2() and iov_iter_get_pages_alloc2().
 *
 * Takes references on the pages backing the front of @i, stores them in
 * *@pages (allocated by want_pages_array() if *@pages is NULL), writes the
 * offset into the first page to *@start and advances the iterator past the
 * bytes covered.
 *
 * Returns the number of bytes covered, 0 if there is nothing to do, -ENOMEM
 * if want_pages_array() fails, -EFAULT for unsupported iterator types, or
 * whatever non-positive value get_user_pages_fast() returned.
 */
static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
                   struct page ***pages, size_t maxsize,
                   unsigned int maxpages, size_t *start)
{
        unsigned int n, gup_flags = 0;

        /* never hand out more than the iterator holds, nor MAX_RW_COUNT */
        if (maxsize > i->count)
                maxsize = i->count;
        if (!maxsize)
                return 0;
        if (maxsize > MAX_RW_COUNT)
                maxsize = MAX_RW_COUNT;

        if (likely(user_backed_iter(i))) {
                unsigned long addr;
                int res;

                /*
                 * NOTE(review): presumably a non-WRITE iterator is the
                 * destination of the transfer, hence writable pages are
                 * requested - confirm against iov_iter_rw().
                 */
                if (iov_iter_rw(i) != WRITE)
                        gup_flags |= FOLL_WRITE;
                if (i->nofault)
                        gup_flags |= FOLL_NOFAULT;

                /* may shrink maxsize to the rest of the first segment */
                addr = first_iovec_segment(i, &maxsize);
                *start = addr % PAGE_SIZE;
                addr &= PAGE_MASK;
                n = want_pages_array(pages, maxsize, *start, maxpages);
                if (!n)
                        return -ENOMEM;
                res = get_user_pages_fast(addr, n, gup_flags, *pages);
                if (unlikely(res <= 0))
                        return res;
                /* fewer pages than requested may have been obtained */
                maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start);
                iov_iter_advance(i, maxsize);
                return maxsize;
        }
        if (iov_iter_is_bvec(i)) {
                struct page **p;
                struct page *page;

                page = first_bvec_segment(i, &maxsize, start);
                n = want_pages_array(pages, maxsize, *start, maxpages);
                if (!n)
                        return -ENOMEM;
                p = *pages;
                /* consecutive struct pages starting at @page, one ref each */
                for (int k = 0; k < n; k++)
                        get_page(p[k] = page + k);
                maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start);
                /* advance by hand; step to the next bio_vec if this one is used up */
                i->count -= maxsize;
                i->iov_offset += maxsize;
                if (i->iov_offset == i->bvec->bv_len) {
                        i->iov_offset = 0;
                        i->bvec++;
                        i->nr_segs--;
                }
                return maxsize;
        }
        if (iov_iter_is_xarray(i))
                return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
        return -EFAULT;
}

/*
 * Get references on the pages backing the front of @i, storing them in the
 * caller-supplied @pages array of at most @maxpages entries.
 *
 * Returns 0 straight away when @maxpages is 0; otherwise @pages must not be
 * NULL (BUG_ON) and the result is that of __iov_iter_get_pages_alloc().
 */
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
                size_t maxsize, unsigned maxpages, size_t *start)
{
        if (!maxpages)
                return 0;
        BUG_ON(!pages);

        /* &pages is non-NULL here, so the worker will not allocate an array */
        return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start);
}
EXPORT_SYMBOL(iov_iter_get_pages2);

/*
 * Like iov_iter_get_pages2(), but the page array is allocated here and
 * returned through *@pages, with no cap on the number of pages (~0U).
 *
 * When the result is zero or negative, any array that was allocated is
 * freed again and *@pages is left NULL.
 */
ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
                struct page ***pages, size_t maxsize, size_t *start)
{
        ssize_t got;

        /* NULL asks the worker to allocate the array */
        *pages = NULL;

        got = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start);
        if (got > 0)
                return got;

        kvfree(*pages);
        *pages = NULL;
        return got;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc2);

/*
 * Count the pages touched by the iovec segments of @i, summing per segment.
 * Returns @maxpages as soon as the running total exceeds it.
 */
static int iov_npages(const struct iov_iter *i, int maxpages)
{
        const struct iovec *seg = iter_iov(i);
        size_t skip = i->iov_offset;
        size_t left = i->count;
        int total = 0;

        while (left) {
                unsigned offs = offset_in_page(seg->iov_base + skip);
                size_t len = min(seg->iov_len - skip, left);

                /* iov_offset only applies to the first segment */
                skip = 0;
                seg++;

                if (!len)
                        continue;

                left -= len;
                total += DIV_ROUND_UP(offs + len, PAGE_SIZE);
                if (unlikely(total > maxpages))
                        return maxpages;
        }
        return total;
}

/*
 * Count the pages touched by the bio_vec segments of @i, summing per
 * segment.  Returns @maxpages as soon as the running total exceeds it.
 */
static int bvec_npages(const struct iov_iter *i, int maxpages)
{
        const struct bio_vec *bv = i->bvec;
        size_t skip = i->iov_offset;
        size_t left = i->count;
        int total = 0;

        while (left) {
                unsigned offs = (bv->bv_offset + skip) % PAGE_SIZE;
                size_t len = min(bv->bv_len - skip, left);

                left -= len;
                total += DIV_ROUND_UP(offs + len, PAGE_SIZE);
                if (unlikely(total > maxpages))
                        return maxpages;

                /* iov_offset only applies to the first segment */
                skip = 0;
                bv++;
        }
        return total;
}

/*
 * Return the number of pages spanned by the data remaining in @i, capped at
 * @maxpages.  Returns 0 for an empty iterator and for iterator types not
 * handled here.
 */
int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
        unsigned offs;
        int npages;

        if (unlikely(!i->count))
                return 0;

        if (likely(iter_is_ubuf(i)))
                offs = offset_in_page(i->ubuf + i->iov_offset);
        /* iovec and kvec have identical layouts */
        else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                return iov_npages(i, maxpages);
        else if (iov_iter_is_bvec(i))
                return bvec_npages(i, maxpages);
        else if (iov_iter_is_xarray(i))
                offs = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
        else
                return 0;

        /* ubuf and xarray are one contiguous range starting at offs */
        npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE);
        return min(npages, maxpages);
}
EXPORT_SYMBOL(iov_iter_npages);

/*
 * Copy *@old into *@new and, for bvec, kvec and iovec iterators, point @new
 * at a kmemdup()ed copy of the remaining segment array.
 *
 * Returns the duplicated array (NULL if kmemdup() failed), or NULL for
 * iterator types that have no segment array to duplicate.
 */
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
        *new = *old;

        if (iov_iter_is_bvec(new)) {
                new->bvec = kmemdup(new->bvec,
                                    new->nr_segs * sizeof(struct bio_vec),
                                    flags);
                return new->bvec;
        }

        /* iovec and kvec have identical layout */
        if (iov_iter_is_kvec(new) || iter_is_iovec(new)) {
                new->__iov = kmemdup(new->__iov,
                                     new->nr_segs * sizeof(struct iovec),
                                     flags);
                return new->__iov;
        }

        return NULL;
}
EXPORT_SYMBOL(dup_iter);

/*
 * Read @nr_segs struct compat_iovec entries from userspace at @uvec and
 * store them in @iov, converting each base pointer with compat_ptr().
 *
 * Returns 0 on success, -EINVAL if an entry's length is negative, and
 * -EFAULT if user_access_begin() fails or one of the unsafe_get_user()
 * calls bails out to the uaccess_end label (ret starts out as -EFAULT).
 */
static __noclone int copy_compat_iovec_from_user(struct iovec *iov,
                const struct iovec __user *uvec, u32 nr_segs)
{
        const struct compat_iovec __user *uiov =
                (const struct compat_iovec __user *)uvec;
        int ret = -EFAULT;
        u32 i;

        if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
                return -EFAULT;

        for (i = 0; i < nr_segs; i++) {
                compat_uptr_t buf;
                compat_ssize_t len;

                unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
                unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);

                /* check for compat_size_t not fitting in compat_ssize_t .. */
                if (len < 0) {
                        ret = -EINVAL;
                        goto uaccess_end;
                }
                iov[i].iov_base = compat_ptr(buf);
                iov[i].iov_len = len;
        }

        ret = 0;
        /* every exit after user_access_begin() goes through user_access_end() */
uaccess_end:
        user_access_end();
        return ret;
}

/*
 * Read @nr_segs struct iovec entries from userspace at @uiov into @iov.
 *
 * The loop is a do/while that decrements @nr_segs, so it always processes
 * at least one entry; both callers visible in this file pass a non-zero
 * count.
 *
 * Returns 0 on success, -EINVAL if an entry's length is negative when read
 * as ssize_t, and -EFAULT if user_access_begin() fails or one of the
 * unsafe_get_user() calls bails out to the uaccess_end label (ret starts
 * out as -EFAULT).
 */
static __noclone int copy_iovec_from_user(struct iovec *iov,
                const struct iovec __user *uiov, unsigned long nr_segs)
{
        int ret = -EFAULT;

        if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
                return -EFAULT;

        do {
                void __user *buf;
                ssize_t len;

                unsafe_get_user(len, &uiov->iov_len, uaccess_end);
                unsafe_get_user(buf, &uiov->iov_base, uaccess_end);

                /* check for size_t not fitting in ssize_t .. */
                if (unlikely(len < 0)) {
                        ret = -EINVAL;
                        goto uaccess_end;
                }
                iov->iov_base = buf;
                iov->iov_len = len;

                uiov++; iov++;
        } while (--nr_segs);

        ret = 0;
        /* every exit after user_access_begin() goes through user_access_end() */
uaccess_end:
        user_access_end();
        return ret;
}

/*
 * Copy an array of @nr_segs iovecs from userspace into kernel memory.
 *
 * @fast_iov is used when it has room (@nr_segs <= @fast_segs); otherwise an
 * array is allocated with kmalloc_array().  Returns the array holding the
 * copy, @fast_iov unchanged when @nr_segs is 0, or an ERR_PTR():
 * -EINVAL if @nr_segs exceeds UIO_MAXIOV, -ENOMEM if allocation fails, or
 * the error from the copy helper.  An array allocated here is freed again
 * on a copy error.
 */
struct iovec *iovec_from_user(const struct iovec __user *uvec,
                unsigned long nr_segs, unsigned long fast_segs,
                struct iovec *fast_iov, bool compat)
{
        struct iovec *iov = fast_iov;
        int ret;

        /*
         * SuS says "The readv() function *may* fail if the iovcnt argument was
         * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
         * traditionally returned zero for zero segments, so...
         */
        if (nr_segs == 0)
                return iov;
        if (nr_segs > UIO_MAXIOV)
                return ERR_PTR(-EINVAL);

        if (nr_segs > fast_segs) {
                iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
                if (!iov)
                        return ERR_PTR(-ENOMEM);
        }

        ret = unlikely(compat) ?
                copy_compat_iovec_from_user(iov, uvec, nr_segs) :
                copy_iovec_from_user(iov, uvec, nr_segs);
        if (!ret)
                return iov;

        /* only free what was allocated here, never the caller's array */
        if (iov != fast_iov)
                kfree(iov);
        return ERR_PTR(ret);
}

/*
 * Single segment iovec supplied by the user, import it as ITER_UBUF.
 *
 * The one iovec is copied into the array *@iovp points at and then handed
 * to import_ubuf().  On success *@iovp is set to NULL and the iterator's
 * byte count is returned; on failure the helper's error code is returned.
 */
static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec,
                                   struct iovec **iovp, struct iov_iter *i,
                                   bool compat)
{
        struct iovec *seg = *iovp;
        ssize_t err;

        err = compat ? copy_compat_iovec_from_user(seg, uvec, 1) :
                       copy_iovec_from_user(seg, uvec, 1);
        if (unlikely(err))
                return err;

        err = import_ubuf(type, seg->iov_base, seg->iov_len, i);
        if (unlikely(err))
                return err;

        *iovp = NULL;
        return i->count;
}

/*
 * Copy a user iovec array into the kernel, validate it and initialise @i.
 *
 * A single segment is imported as ITER_UBUF.  Otherwise every segment is
 * checked with access_ok() and the lengths are trimmed so that the total
 * never exceeds MAX_RW_COUNT.
 *
 * On return *@iovp is NULL if the caller's array was used (or on error) and
 * points at the allocated array otherwise.  Returns the total length, or a
 * negative error code.
 */
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i, bool compat)
{
        struct iovec *iov, *cur;
        ssize_t total_len = 0;
        unsigned long seg;

        if (nr_segs == 1)
                return __import_iovec_ubuf(type, uvec, iovp, i, compat);

        iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
        if (IS_ERR(iov)) {
                *iovp = NULL;
                return PTR_ERR(iov);
        }

        /*
         * According to the Single Unix Specification we should return EINVAL if
         * an element length is < 0 when cast to ssize_t or if the total length
         * would overflow the ssize_t return value of the system call.
         *
         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
         * overflow case.
         */
        for (seg = 0, cur = iov; seg < nr_segs; seg++, cur++) {
                ssize_t len = (ssize_t)cur->iov_len;

                if (!access_ok(cur->iov_base, len))
                        goto fault;

                /* trim this segment so the running total stays within the cap */
                if (len > MAX_RW_COUNT - total_len) {
                        len = MAX_RW_COUNT - total_len;
                        cur->iov_len = len;
                }
                total_len += len;
        }

        iov_iter_init(i, type, iov, nr_segs, total_len);
        /* report an array back only if one was allocated */
        *iovp = (iov == *iovp) ? NULL : iov;
        return total_len;

fault:
        if (iov != *iovp)
                kfree(iov);
        *iovp = NULL;
        return -EFAULT;
}

/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in the array pointed to by *@iovp.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iovp on return. Otherwise, a new
 * array will be allocated and the result placed in *@iovp. This means that
 * the caller may call kfree() on *@iovp regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * The compat flag passed to __import_iovec() is taken from
 * in_compat_syscall().
 *
 * Return: Negative error code on error, bytes imported on success
 */
ssize_t import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs,
                 struct iovec **iovp, struct iov_iter *i)
{
        return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
                              in_compat_syscall());
}
EXPORT_SYMBOL(import_iovec);

/*
 * Initialise @i as an ITER_UBUF over the user buffer @buf.
 *
 * @len is capped at MAX_RW_COUNT before the range is checked with
 * access_ok().  Returns 0 on success or -EFAULT if that check fails.
 */
int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
{
        if (len > MAX_RW_COUNT)
                len = MAX_RW_COUNT;

        if (likely(access_ok(buf, len))) {
                iov_iter_ubuf(i, rw, buf, len);
                return 0;
        }

        return -EFAULT;
}
EXPORT_SYMBOL_GPL(import_ubuf);

/**
 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
 *     iov_iter_save_state() was called.
 *
 * @i: &struct iov_iter to restore
 * @state: state to restore from
 *
 * Used after iov_iter_save_state() to restore @i, if operations may
 * have advanced it.
 *
 * Note: only works on ITER_IOVEC, ITER_BVEC, ITER_KVEC and ITER_UBUF. Any
 * other iterator type triggers a one-time warning and is left untouched.
 */
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
        /*
         * All four type tests belong inside WARN_ON_ONCE(): with the kvec
         * test outside it, the warning fired for ITER_KVEC even though that
         * type is supported and restored below.
         */
        if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
                         !iter_is_ubuf(i) && !iov_iter_is_kvec(i)))
                return;
        i->iov_offset = state->iov_offset;
        i->count = state->count;
        /* a ubuf has no segment array to rewind */
        if (iter_is_ubuf(i))
                return;
        /*
         * For the *vec iters, nr_segs + iov is constant - if we increment
         * the vec, then we also decrement the nr_segs count. Hence we don't
         * need to track both of these, just one is enough and we can deduce
         * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
         * size, so we can just adjust the iov pointer as they are unionized.
         * ITER_BVEC _may_ be the same size on some archs, but on others it is
         * not. Be safe and handle it separately.
         */
        BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
        if (iov_iter_is_bvec(i))
                i->bvec -= state->nr_segs - i->nr_segs;
        else
                i->__iov -= state->nr_segs - i->nr_segs;
        i->nr_segs = state->nr_segs;
}

/*
 * Extract a list of contiguous pages from an ITER_XARRAY iterator.  This does not
 * get references on the pages, nor does it get a pin on them.
 *
 * *@offset0 receives the offset into the first page.  Returns the number of
 * bytes the iterator was advanced by, or -ENOMEM if want_pages_array()
 * fails.  @extraction_flags is not used by this function.
 */
static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i,
                                             struct page ***pages, size_t maxsize,
                                             unsigned int maxpages,
                                             iov_iter_extraction_t extraction_flags,
                                             size_t *offset0)
{
        struct page *page, **p;
        unsigned int nr = 0, offset;
        loff_t pos = i->xarray_start + i->iov_offset;
        pgoff_t index = pos >> PAGE_SHIFT;
        XA_STATE(xas, i->xarray, index);

        offset = pos & ~PAGE_MASK;
        *offset0 = offset;

        /* from here on maxpages is the number of slots available in *pages */
        maxpages = want_pages_array(pages, maxsize, offset, maxpages);
        if (!maxpages)
                return -ENOMEM;
        p = *pages;

        rcu_read_lock();
        for (page = xas_load(&xas); page; page = xas_next(&xas)) {
                /* xas_retry() says this entry cannot be used as-is; move on */
                if (xas_retry(&xas, page))
                        continue;

                /* Has the page moved or been split? */
                if (unlikely(page != xas_reload(&xas))) {
                        xas_reset(&xas);
                        continue;
                }

                /* list the subpage for the current index; no ref is taken */
                p[nr++] = find_subpage(page, xas.xa_index);
                if (nr == maxpages)
                        break;
        }
        rcu_read_unlock();

        /* the pages found may cover less than was asked for */
        maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
        iov_iter_advance(i, maxsize);
        return maxsize;
}

/*
 * Extract a list of contiguous pages from an ITER_BVEC iterator.  This does
 * not get references on the pages, nor does it get a pin on them.
 *
 * Only the current bio_vec is used, so at most the rest of that segment is
 * returned.  *@offset0 receives the offset into the first page.  Returns
 * the number of bytes the iterator was advanced by, 0 if no segments are
 * left, or -ENOMEM if want_pages_array() fails.  @extraction_flags is not
 * used by this function.
 */
static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
                                           struct page ***pages, size_t maxsize,
                                           unsigned int maxpages,
                                           iov_iter_extraction_t extraction_flags,
                                           size_t *offset0)
{
        struct page **p, *page;
        size_t skip = i->iov_offset, offset, size;
        int k;

        /* step over exhausted/empty segments, updating the iterator as we go */
        for (;;) {
                if (i->nr_segs == 0)
                        return 0;
                size = min(maxsize, i->bvec->bv_len - skip);
                if (size)
                        break;
                i->iov_offset = 0;
                i->nr_segs--;
                i->bvec++;
                skip = 0;
        }

        /* turn the position in the segment into a page and an in-page offset */
        skip += i->bvec->bv_offset;
        page = i->bvec->bv_page + skip / PAGE_SIZE;
        offset = skip % PAGE_SIZE;
        *offset0 = offset;

        /* from here on maxpages is the number of slots available in *pages */
        maxpages = want_pages_array(pages, size, offset, maxpages);
        if (!maxpages)
                return -ENOMEM;
        p = *pages;
        /* list consecutive struct pages starting at @page */
        for (k = 0; k < maxpages; k++)
                p[k] = page + k;

        size = min_t(size_t, size, maxpages * PAGE_SIZE - offset);
        iov_iter_advance(i, size);
        return size;
}

/*
 * Extract a list of virtually contiguous pages from an ITER_KVEC iterator.
 * This does not get references on the pages, nor does it get a pin on them.
 *
 * Only the current kvec is used, so at most the rest of that segment is
 * returned.  *@offset0 receives the offset into the first page.  Returns
 * the number of bytes the iterator was advanced by, 0 if no segments are
 * left, or -ENOMEM if want_pages_array() fails.  @extraction_flags is not
 * used by this function.
 */
static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i,
                                           struct page ***pages, size_t maxsize,
                                           unsigned int maxpages,
                                           iov_iter_extraction_t extraction_flags,
                                           size_t *offset0)
{
        struct page **p, *page;
        const void *kaddr;
        size_t skip = i->iov_offset, offset, size;
        int k;

        /* step over exhausted/empty segments, updating the iterator as we go */
        for (;;) {
                if (i->nr_segs == 0)
                        return 0;
                size = min(maxsize, i->kvec->iov_len - skip);
                if (size)
                        break;
                i->iov_offset = 0;
                i->nr_segs--;
                i->kvec++;
                skip = 0;
        }

        kaddr = i->kvec->iov_base + skip;
        offset = (unsigned long)kaddr & ~PAGE_MASK;
        *offset0 = offset;

        /* from here on maxpages is the number of slots available in *pages */
        maxpages = want_pages_array(pages, size, offset, maxpages);
        if (!maxpages)
                return -ENOMEM;
        p = *pages;

        /* walk the range one page at a time from its page-aligned start */
        kaddr -= offset;
        for (k = 0; k < maxpages; k++) {
                /* the address decides which lookup translates it to a page */
                if (is_vmalloc_or_module_addr(kaddr))
                        page = vmalloc_to_page(kaddr);
                else
                        page = virt_to_page(kaddr);

                p[k] = page;
                kaddr += PAGE_SIZE;
        }

        size = min_t(size_t, size, maxpages * PAGE_SIZE - offset);
        iov_iter_advance(i, size);
        return size;
}

/*
 * Extract a list of contiguous pages from a user iterator and get a pin on
 * each of them.  This should only be used if the iterator is user-backed
 * (IOVEC/UBUF).
 *
 * It does not get refs on the pages, but the pages must be unpinned by the
 * caller once the transfer is complete.
 *
 * This is safe to be used where background IO/DMA *is* going to be modifying
 * the buffer; using a pin rather than a ref forces fork() to give the
 * child a copy of the page.
 */
static ssize_t iov_iter_extract_user_pages(struct iov_iter *i,
                                           struct page ***pages,
                                           size_t maxsize,
                                           unsigned int maxpages,
                                           iov_iter_extraction_t extraction_flags,
                                           size_t *offset0)
{
        unsigned int gup_flags = 0;
        unsigned long start;
        size_t offset;
        int pinned;

        if (i->data_source == ITER_DEST)
                gup_flags |= FOLL_WRITE;
        if (extraction_flags & ITER_ALLOW_P2PDMA)
                gup_flags |= FOLL_PCI_P2PDMA;
        if (i->nofault)
                gup_flags |= FOLL_NOFAULT;

        /* may shrink maxsize to the rest of the first segment */
        start = first_iovec_segment(i, &maxsize);
        offset = start % PAGE_SIZE;
        *offset0 = offset;
        start &= PAGE_MASK;

        maxpages = want_pages_array(pages, maxsize, offset, maxpages);
        if (!maxpages)
                return -ENOMEM;

        pinned = pin_user_pages_fast(start, maxpages, gup_flags, *pages);
        if (unlikely(pinned <= 0))
                return pinned;

        /* fewer pages than requested may have been pinned */
        maxsize = min_t(size_t, maxsize, pinned * PAGE_SIZE - offset);
        iov_iter_advance(i, maxsize);
        return maxsize;
}

/**
 * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator
 * @i: The iterator to extract from
 * @pages: Where to return the list of pages
 * @maxsize: The maximum amount of iterator to extract
 * @maxpages: The maximum size of the list of pages
 * @extraction_flags: Flags to qualify request
 * @offset0: Where to return the starting offset into (*@pages)[0]
 *
 * Extract a list of contiguous pages from the current point of the iterator,
 * advancing the iterator.  The maximum number of pages and the maximum amount
 * of page contents can be set.  The amount extracted is further limited to
 * what the iterator holds and to MAX_RW_COUNT; if that leaves nothing, 0 is
 * returned.
 *
 * If *@pages is NULL, a page list will be allocated to the required size and
 * *@pages will be set to its base.  If *@pages is not NULL, it will be assumed
 * that the caller allocated a page list at least @maxpages in size and this
 * will be filled in.
 *
 * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
 * be allowed on the pages extracted.
 *
 * The iov_iter_extract_will_pin() function can be used to query how cleanup
 * should be performed.
 *
 * Extra refs or pins on the pages may be obtained as follows:
 *
 *  (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be
 *      added to the pages, but refs will not be taken.
 *      iov_iter_extract_will_pin() will return true.
 *
 *  (*) If the iterator is ITER_KVEC, ITER_BVEC or ITER_XARRAY, the pages are
 *      merely listed; no extra refs or pins are obtained.
 *      iov_iter_extract_will_pin() will return false.
 *
 * Note also:
 *
 *  (*) Use with ITER_DISCARD is not supported as that has no content.
 *
 * On success, the function sets *@pages to the new pagelist, if allocated, and
 * sets *offset0 to the offset into the first page.
 *
 * It may also return -ENOMEM and -EFAULT.
 */
ssize_t iov_iter_extract_pages(struct iov_iter *i,
                               struct page ***pages,
                               size_t maxsize,
                               unsigned int maxpages,
                               iov_iter_extraction_t extraction_flags,
                               size_t *offset0)
{
        /* clamp the request to the iterator's contents and to MAX_RW_COUNT */
        if (maxsize > i->count)
                maxsize = i->count;
        if (maxsize > MAX_RW_COUNT)
                maxsize = MAX_RW_COUNT;
        if (!maxsize)
                return 0;

        if (likely(user_backed_iter(i)))
                return iov_iter_extract_user_pages(i, pages, maxsize,
                                                   maxpages, extraction_flags,
                                                   offset0);

        if (iov_iter_is_kvec(i))
                return iov_iter_extract_kvec_pages(i, pages, maxsize,
                                                   maxpages, extraction_flags,
                                                   offset0);

        if (iov_iter_is_bvec(i))
                return iov_iter_extract_bvec_pages(i, pages, maxsize,
                                                   maxpages, extraction_flags,
                                                   offset0);

        if (!iov_iter_is_xarray(i))
                return -EFAULT;

        return iov_iter_extract_xarray_pages(i, pages, maxsize,
                                             maxpages, extraction_flags,
                                             offset0);
}
EXPORT_SYMBOL_GPL(iov_iter_extract_pages);



























    2 


















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
/* SPDX-License-Identifier: GPL-2.0 */
/* linux/net/inet/arp.h */
#ifndef _ARP_H
#define _ARP_H

#include <linux/if_arp.h>
#include <linux/hash.h>
#include <net/neighbour.h>


extern struct neigh_table arp_tbl;

/*
 * Hash a neighbour key: the 32-bit value read from @pkey is XORed with
 * hash32_ptr(@dev) and the result multiplied by the first word of @hash_rnd.
 */
static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 *hash_rnd)
{
        const u32 *key = pkey;

        return (*key ^ hash32_ptr(dev)) * hash_rnd[0];
}

/*
 * Look up the neighbour entry for @key on @dev in arp_tbl without taking a
 * reference on it.  Without CONFIG_INET the lookup is a stub that returns
 * NULL.
 */
#ifdef CONFIG_INET
static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
{
        /* loopback and point-to-point devices are looked up under INADDR_ANY */
        if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
                key = INADDR_ANY;

        return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev);
}
#else
static inline
struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
{
        return NULL;
}
#endif

/*
 * Look up the neighbour entry for @key on @dev and take a reference on it.
 *
 * The lookup runs under rcu_read_lock().  Returns NULL if no entry is found
 * or if refcount_inc_not_zero() could not take a reference on it.
 */
static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key)
{
        struct neighbour *found = NULL;
        struct neighbour *candidate;

        rcu_read_lock();
        candidate = __ipv4_neigh_lookup_noref(dev, key);
        if (candidate && refcount_inc_not_zero(&candidate->refcnt))
                found = candidate;
        rcu_read_unlock();

        return found;
}

/*
 * Pass the neighbour entry for @key on @dev (possibly NULL when nothing is
 * found) to neigh_confirm(), all within one RCU read-side section.
 */
static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key)
{
        rcu_read_lock();
        neigh_confirm(__ipv4_neigh_lookup_noref(dev, key));
        rcu_read_unlock();
}

/*
 * ARP entry points that are only declared in this header; their
 * definitions are not part of this file.
 */
void arp_init(void);
int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg);
void arp_send(int type, int ptype, __be32 dest_ip,
              struct net_device *dev, __be32 src_ip,
              const unsigned char *dest_hw,
              const unsigned char *src_hw, const unsigned char *th);
int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir);
void arp_ifdown(struct net_device *dev);
int arp_invalidate(struct net_device *dev, __be32 ip, bool force);

/*
 * arp_create() takes the same argument list as arp_send() but returns a
 * struct sk_buff pointer; arp_xmit() accepts such a pointer.
 * NOTE(review): presumably create-then-transmit halves of arp_send() --
 * confirm against the definitions.
 */
struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
                           struct net_device *dev, __be32 src_ip,
                           const unsigned char *dest_hw,
                           const unsigned char *src_hw,
                           const unsigned char *target_hw);
void arp_xmit(struct sk_buff *skb);

#endif        /* _ARP_H */







































































    1 





    1 
    1 





































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
// SPDX-License-Identifier: GPL-2.0
/*
 *  Implement mseal() syscall.
 *
 *  Copyright (c) 2023,2024 Google, Inc.
 *
 *  Author: Jeff Xu <jeffxu@chromium.org>
 */

#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"

/* Report whether @vma carries the VM_SEALED flag. */
static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
        return !!(vma->vm_flags & VM_SEALED);
}

/* Mark @vma as sealed by setting VM_SEALED through vm_flags_set(). */
static inline void set_vma_sealed(struct vm_area_struct *vma)
{
        vm_flags_set(vma, VM_SEALED);
}

/*
 * Tell whether @vma may be modified.
 * Returns true unless the vma is sealed; sealing is the unexpected case.
 */
static bool can_modify_vma(struct vm_area_struct *vma)
{
        return !unlikely(vma_is_sealed(vma));
}

/*
 * Report whether @behavior is one of the madvise() advice values this file
 * treats as discarding: MADV_FREE, MADV_DONTNEED, MADV_DONTNEED_LOCKED,
 * MADV_REMOVE, MADV_DONTFORK or MADV_WIPEONFORK.
 *
 * MADV_* advice values are plain enumerated integers, not bit flags, so
 * they have to be compared for equality. AND-ing @behavior against an OR
 * of them also matches unrelated advice (e.g. MADV_RANDOM or
 * MADV_WILLNEED) whose value happens to share a bit.
 */
static bool is_madv_discard(int behavior)
{
        switch (behavior) {
        case MADV_FREE:
        case MADV_DONTNEED:
        case MADV_DONTNEED_LOCKED:
        case MADV_REMOVE:
        case MADV_DONTFORK:
        case MADV_WIPEONFORK:
                return true;
        default:
                return false;
        }
}

/*
 * Report whether @vma is a private anonymous mapping that cannot be
 * written: either VM_WRITE is clear, or the architecture (e.g. PKRU) does
 * not permit a write access.
 */
static bool is_ro_anon(struct vm_area_struct *vma)
{
        /* Anonymous here means: no backing file and not shared. */
        bool anon_private = !vma->vm_file && !(vma->vm_flags & VM_SHARED);

        if (!anon_private)
                return false;

        return !(vma->vm_flags & VM_WRITE) ||
               !arch_vma_access_permitted(vma, true, false, false);
}

/*
 * Check if the vmas of a memory range are allowed to be modified.
 * the memory ranger can have a gap (unallocated memory).
 * return true, if it is allowed.
 */
bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
{
        struct vm_area_struct *vma;

        VMA_ITERATOR(vmi, mm, start);

        /* going through each vma to check. */
        for_each_vma_range(vmi, vma, end) {
                if (unlikely(!can_modify_vma(vma)))
                        return false;
        }

        /* Allow by default. */
        return true;
}

/*
 * Check if the vmas of a memory range are allowed to be modified by madvise.
 * the memory ranger can have a gap (unallocated memory).
 * return true, if it is allowed.
 */
bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
                int behavior)
{
        struct vm_area_struct *vma;

        VMA_ITERATOR(vmi, mm, start);

        if (!is_madv_discard(behavior))
                return true;

        /* going through each vma to check. */
        for_each_vma_range(vmi, vma, end)
                if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
                        return false;

        /* Allow by default. */
        return true;
}

/*
 * Seal the part of @vma covered by [start, end).
 *
 * When @newflags differs from the vma's current flags, the vma is passed
 * through vma_modify_flags() and the resulting vma gets VM_SEALED set.
 * *@prev is always updated to the vma value this function ends up with,
 * which is the error pointer when vma_modify_flags() failed.
 *
 * Returns 0 on success or the error from vma_modify_flags().
 */
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
                struct vm_area_struct **prev, unsigned long start,
                unsigned long end, vm_flags_t newflags)
{
        int err = 0;

        if (vma->vm_flags != newflags) {
                vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
                if (IS_ERR(vma))
                        err = PTR_ERR(vma);
                else
                        set_vma_sealed(vma);
        }

        *prev = vma;
        return err;
}

/*
 * Check for do_mseal:
 * 1> start is part of a valid vma.
 * 2> end is part of a valid vma.
 * 3> No gap (unallocated address) between start and end.
 * 4> map is sealable.
 */
static int check_mm_seal(unsigned long start, unsigned long end)
{
        struct vm_area_struct *vma;
        unsigned long nstart = start;

        VMA_ITERATOR(vmi, current->mm, start);

        /* going through each vma to check. */
        for_each_vma_range(vmi, vma, end) {
                if (vma->vm_start > nstart)
                        /* unallocated memory found. */
                        return -ENOMEM;

                if (vma->vm_end >= end)
                        return 0;

                nstart = vma->vm_end;
        }

        return -ENOMEM;
}

/*
 * Apply sealing to every vma overlapping [start, end) of current->mm.
 *
 * Each vma found by the iterator is handed to mseal_fixup() together with
 * the sub-range [nstart, min(vma->vm_end, end)) and its flags with
 * VM_SEALED added. Returns 0 on success, or the first error reported by
 * mseal_fixup().
 */
static int apply_mm_seal(unsigned long start, unsigned long end)
{
        unsigned long nstart;
        struct vm_area_struct *vma, *prev;

        VMA_ITERATOR(vmi, current->mm, start);

        vma = vma_iter_load(&vmi);
        /*
         * Note: check_mm_seal() should already have rejected the ENOMEM
         * cases, so vma is not expected to be NULL here (nor in the other
         * places where ENOMEM would otherwise be possible).
         */
        prev = vma_prev(&vmi);
        /* When start falls inside the first vma, that vma itself is "prev". */
        if (start > vma->vm_start)
                prev = vma;

        nstart = start;
        for_each_vma_range(vmi, vma, end) {
                int error;
                unsigned long tmp;
                vm_flags_t newflags;

                newflags = vma->vm_flags | VM_SEALED;
                /* Clamp this step to the end of the requested range. */
                tmp = vma->vm_end;
                if (tmp > end)
                        tmp = end;
                error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
                if (error)
                        return error;
                /* The next step begins where the iterator now ends. */
                nstart = vma_iter_end(&vmi);
        }

        return 0;
}

/*
 * mseal(2) seals the VM's metadata against selected syscalls.
 *
 * @start/@len_in: the address range to seal. It must satisfy:
 *   - start must be page aligned and lie inside a valid VMA;
 *   - end (start + len) must lie inside a valid VMA;
 *   - there must be no gap (unallocated memory) between start and end.
 *   len_in is rounded up to a page boundary implicitly.
 *
 * @flags: reserved.
 *
 * Once sealed, the following VMA operations are blocked:
 *   1> Unmapping, moving to another location, and shrinking the size via
 *      munmap() and mremap(): these can leave an empty space that could
 *      then be replaced by a VMA with a new set of attributes.
 *   2> Moving or expanding a different vma into the current location via
 *      mremap().
 *   3> Modifying a VMA via mmap(MAP_FIXED).
 *   4> Size expansion via mremap(). This does not appear to pose any
 *      specific risk to sealed VMAs; it is included anyway because the use
 *      case is unclear. Users can rely on merging to expand a sealed VMA.
 *   5> mprotect() and pkey_mprotect().
 *   6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED) on
 *      anonymous memory when the user has no write permission to it. Those
 *      behaviors alter region contents by discarding pages, effectively a
 *      memset(0) for anonymous memory.
 *
 * Return values:
 *  zero: success.
 *  -EINVAL:
 *   invalid input flags.
 *   start address is not page aligned.
 *   address range (start + len) overflows.
 *  -ENOMEM:
 *   addr is not a valid address (not allocated).
 *   end (start + len) is not a valid address.
 *   a gap (unallocated memory) lies between start and end.
 *  -EPERM:
 *   sealing is not supported on 32-bit architectures.
 *
 * Note:
 *  mseal(2) may be called multiple times; sealing memory that is already
 *  sealed is a no-op (no error).
 *
 *  unseal() is not supported.
 */
static int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        unsigned long end;
        size_t len;
        int ret;

        ret = can_do_mseal(flags);
        if (ret)
                return ret;

        start = untagged_addr(start);
        if (!PAGE_ALIGNED(start))
                return -EINVAL;

        /* A huge len_in can wrap around to zero when rounded up to a page. */
        len = PAGE_ALIGN(len_in);
        if (!len && len_in)
                return -EINVAL;

        end = start + len;
        if (end < start)
                return -EINVAL;

        /* Empty range: nothing to seal. */
        if (end == start)
                return 0;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        /*
         * Pass one validates the whole range first, so a bad range
         * (e.g. -ENOMEM) does not leave the mapping partially sealed.
         *
         * Pass two does the sealing. It should succeed unless
         * vma_modify_flags fails, e.g. on a merge/split error or when the
         * process hits the maximum number of VMAs; both should be rare.
         */
        ret = check_mm_seal(start, end);
        if (!ret)
                ret = apply_mm_seal(start, end);

        mmap_write_unlock(mm);
        return ret;
}

/* mseal() syscall entry point: forwards its three arguments to do_mseal(). */
SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
                flags)
{
        return do_mseal(start, len, flags);
}

































































    1 










    1 






    1 










    1 



    1 













    1 




    1 




















    1 






























































































    1 






    1 






    1 

































































































    1 




    1 
    1 
    1 



    1 








































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
// SPDX-License-Identifier: GPL-2.0
/*
 * Interface between ext4 and JBD
 */

#include "ext4_jbd2.h"

#include <trace/events/ext4.h>

int ext4_inode_journal_mode(struct inode *inode)
{
        if (EXT4_JOURNAL(inode) == NULL)
                return EXT4_INODE_WRITEBACK_DATA_MODE;        /* writeback */
        /* We do not support data journalling with delayed allocation */
        if (!S_ISREG(inode->i_mode) ||
            ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
            test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
            (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
            !test_opt(inode->i_sb, DELALLOC))) {
                /* We do not support data journalling for encrypted data */
                if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
                        return EXT4_INODE_ORDERED_DATA_MODE;  /* ordered */
                return EXT4_INODE_JOURNAL_DATA_MODE;        /* journal data */
        }
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
                return EXT4_INODE_ORDERED_DATA_MODE;        /* ordered */
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
                return EXT4_INODE_WRITEBACK_DATA_MODE;        /* writeback */
        BUG();
}

/* Just increment the non-pointer handle value */
static handle_t *ext4_get_nojournal(void)
{
        handle_t *handle = current->journal_info;
        unsigned long ref_cnt = (unsigned long)handle;

        BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);

        ref_cnt++;
        handle = (handle_t *)ref_cnt;

        current->journal_info = handle;
        return handle;
}


/*
 * Drop one level of the no-journal nesting count encoded in @handle and
 * store the result back into current->journal_info.
 */
static void ext4_put_nojournal(handle_t *handle)
{
        unsigned long depth = (unsigned long)handle;

        BUG_ON(!depth);

        current->journal_info = (handle_t *)(depth - 1);
}

/*
 * Wrappers for jbd2_journal_start/end.
 */
/*
 * Checks performed before a handle is started on @sb.
 *
 * Returns -EIO after a forced shutdown, -EROFS when the superblock is
 * read-only or the journal has been aborted, and 0 otherwise.
 */
static int ext4_journal_check_start(struct super_block *sb)
{
        journal_t *jnl;

        might_sleep();

        if (unlikely(ext4_forced_shutdown(sb)))
                return -EIO;

        if (WARN_ON_ONCE(sb_rdonly(sb)))
                return -EROFS;

        WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);

        jnl = EXT4_SB(sb)->s_journal;
        if (!jnl || !is_journal_aborted(jnl))
                return 0;

        /*
         * Special case: the journal was aborted behind our backs (e.g. EIO
         * in the commit thread), so the filesystem itself still has to be
         * taken read-only cleanly.
         */
        ext4_abort(sb, -jnl->j_errno, "Detected aborted journal");
        return -EROFS;
}

/*
 * Start a handle on @sb.
 *
 * Emits the inode or superblock flavour of the journal-start tracepoint
 * depending on whether @inode is given, then runs
 * ext4_journal_check_start(). Without a journal, or while
 * EXT4_FC_REPLAY is set in s_mount_state, a no-journal handle is returned;
 * otherwise the result of jbd2__journal_start() is. A failed start check
 * is returned as an ERR_PTR().
 */
handle_t *__ext4_journal_start_sb(struct inode *inode,
                                  struct super_block *sb, unsigned int line,
                                  int type, int blocks, int rsv_blocks,
                                  int revoke_creds)
{
        journal_t *jnl;
        int rc;

        if (!inode)
                trace_ext4_journal_start_sb(sb, blocks, rsv_blocks,
                                            revoke_creds, type, _RET_IP_);
        else
                trace_ext4_journal_start_inode(inode, blocks, rsv_blocks,
                                               revoke_creds, type, _RET_IP_);

        rc = ext4_journal_check_start(sb);
        if (rc < 0)
                return ERR_PTR(rc);

        jnl = EXT4_SB(sb)->s_journal;
        if (jnl && !(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
                return jbd2__journal_start(jnl, blocks, rsv_blocks,
                                           revoke_creds, GFP_NOFS, type, line);

        return ext4_get_nojournal();
}

/*
 * Stop @handle.
 *
 * A no-journal handle is simply released and 0 is returned. Otherwise the
 * handle is stopped with jbd2_journal_stop() and the return value is the
 * error already recorded in handle->h_err if there is one, else the
 * jbd2_journal_stop() result. When the handle still had a transaction
 * attached, a non-zero result is also reported via __ext4_std_error().
 */
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
{
        struct super_block *sb = NULL;
        bool had_transaction;
        int result;
        int stop_rc;

        if (!ext4_handle_valid(handle)) {
                ext4_put_nojournal(handle);
                return 0;
        }

        /* Everything needed from the handle is read before it is stopped. */
        result = handle->h_err;
        had_transaction = handle->h_transaction != NULL;
        if (had_transaction)
                sb = handle->h_transaction->t_journal->j_private;

        stop_rc = jbd2_journal_stop(handle);
        if (!result)
                result = stop_rc;

        if (had_transaction && result)
                __ext4_std_error(sb, where, line, result);
        return result;
}

/*
 * Start a previously reserved @handle.
 *
 * For a no-journal handle a fresh no-journal handle is returned. Otherwise
 * the start checks are run on the handle's superblock; if they fail the
 * reservation is freed and the error is returned as an ERR_PTR(). A
 * failure of jbd2_journal_start_reserved() is returned the same way.
 */
handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
                                        int type)
{
        struct super_block *sb;
        int rc;

        if (!ext4_handle_valid(handle))
                return ext4_get_nojournal();

        sb = handle->h_journal->j_private;
        trace_ext4_journal_start_reserved(sb,
                                jbd2_handle_buffer_credits(handle), _RET_IP_);

        rc = ext4_journal_check_start(sb);
        if (rc < 0) {
                jbd2_journal_free_reserved(handle);
                return ERR_PTR(rc);
        }

        rc = jbd2_journal_start_reserved(handle, type, line);
        return rc < 0 ? ERR_PTR(rc) : handle;
}

/*
 * Make sure @handle has at least @check_cred buffer credits and
 * @revoke_cred revoke credits.
 *
 * Returns 0 for a no-journal handle or when enough credits are already
 * available, -EROFS when the handle is aborted, and otherwise the result
 * of ext4_journal_extend() asked for the missing part of @extend_cred and
 * @revoke_cred (never less than zero each).
 */
int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
                                  int extend_cred, int revoke_cred)
{
        int have_blocks;
        int have_revokes;

        if (!ext4_handle_valid(handle))
                return 0;
        if (is_handle_aborted(handle))
                return -EROFS;

        have_blocks = jbd2_handle_buffer_credits(handle);
        have_revokes = handle->h_revoke_credits;
        if (have_blocks >= check_cred && have_revokes >= revoke_cred)
                return 0;

        extend_cred = max(0, extend_cred - have_blocks);
        revoke_cred = max(0, revoke_cred - have_revokes);
        return ext4_journal_extend(handle, extend_cred, revoke_cred);
}

/*
 * Record @err on @handle and abort it.
 *
 * @caller/@line identify the call site and @err_fn the function that
 * failed; they are only used for the log message. The first error stored
 * in handle->h_err is kept. A handle that is already aborted is neither
 * logged nor aborted again.
 */
static void ext4_journal_abort_handle(const char *caller, unsigned int line,
                                      const char *err_fn,
                                      struct buffer_head *bh,
                                      handle_t *handle, int err)
{
        char scratch[16];
        const char *msg = ext4_decode_error(NULL, err, scratch);

        BUG_ON(!ext4_handle_valid(handle));

        if (bh)
                BUFFER_TRACE(bh, "abort");

        if (handle->h_err == 0)
                handle->h_err = err;

        if (!is_handle_aborted(handle)) {
                printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
                       caller, line, msg, err_fn);

                jbd2_journal_abort_handle(handle);
        }
}

/*
 * Raise an ext4 error if the block device mapping has recorded a new
 * writeback error since sbi->s_bdev_wb_err was last advanced.
 *
 * If the block device has its write error flag set, it may have failed to
 * write out metadata buffers asynchronously in the background. We could
 * then read stale data from disk and write it out again, which may lead
 * to on-disk filesystem inconsistency.
 */
static void ext4_check_bdev_write_error(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct address_space *mapping = sb->s_bdev->bd_mapping;
        int err;

        /* Cheap unlocked peek first; take the lock only if something is new. */
        if (!errseq_check(&mapping->wb_err, READ_ONCE(sbi->s_bdev_wb_err)))
                return;

        spin_lock(&sbi->s_bdev_wb_lock);
        err = errseq_check_and_advance(&mapping->wb_err, &sbi->s_bdev_wb_err);
        spin_unlock(&sbi->s_bdev_wb_lock);

        if (!err)
                return;

        ext4_error_err(sb, -err,
                       "Error while async write back metadata");
}

/*
 * Get write access to @bh under @handle.
 *
 * With a valid handle, jbd2_journal_get_write_access() is called and a
 * failure aborts the handle and is returned. Without one, the block
 * device is checked for writeback errors instead. Finally, unless
 * @trigger_type is EXT4_JTR_NONE or the filesystem has no metadata
 * checksums, the matching journal triggers are installed on @bh.
 */
int __ext4_journal_get_write_access(const char *where, unsigned int line,
                                    handle_t *handle, struct super_block *sb,
                                    struct buffer_head *bh,
                                    enum ext4_journal_trigger_type trigger_type)
{
        int err;

        might_sleep();

        if (!ext4_handle_valid(handle)) {
                ext4_check_bdev_write_error(sb);
        } else {
                err = jbd2_journal_get_write_access(handle, bh);
                if (err) {
                        ext4_journal_abort_handle(where, line, __func__, bh,
                                                  handle, err);
                        return err;
                }
        }

        if (trigger_type != EXT4_JTR_NONE && ext4_has_metadata_csum(sb)) {
                BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
                jbd2_journal_set_triggers(bh,
                        &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers);
        }
        return 0;
}

/*
 * Forget a block that is being freed.
 *
 * A revoke must be issued when the freed data has been journaled, and
 * metadata (e.g. indirect blocks) must be revoked in all cases.
 *
 * @bh may be NULL: a metadata block may already have been dropped from
 * memory while a record of it still sits in the journal, and that record
 * still has to be revoked.
 */
int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
                  int is_metadata, struct inode *inode,
                  struct buffer_head *bh, ext4_fsblk_t blocknr)
{
        int err;

        might_sleep();

        trace_ext4_forget(inode, is_metadata, blocknr);
        BUFFER_TRACE(bh, "enter");

        ext4_debug("forgetting bh %p: is_metadata=%d, mode %o, data mode %x\n",
                  bh, is_metadata, inode->i_mode,
                  test_opt(inode->i_sb, DATA_FLAGS));

        /* Without a journal a plain bforget() is all that is needed. */
        if (!ext4_handle_valid(handle)) {
                bforget(bh);
                return 0;
        }

        /*
         * The revoke function is never used with full data journaling:
         * there is no need to, and a V1 superblock won't support it.
         * Apart from that, only un-journaled data blocks skip the revoke.
         */
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
            (!is_metadata && !ext4_should_journal_data(inode))) {
                if (!bh)
                        return 0;

                BUFFER_TRACE(bh, "call jbd2_journal_forget");
                err = jbd2_journal_forget(handle, bh);
                if (err)
                        ext4_journal_abort_handle(where, line, __func__,
                                                  bh, handle, err);
                return err;
        }

        /*
         * Here: data!=journal && (is_metadata || should_journal_data(inode))
         */
        BUFFER_TRACE(bh, "call jbd2_journal_revoke");
        err = jbd2_journal_revoke(handle, blocknr, bh);
        if (err) {
                ext4_journal_abort_handle(where, line, __func__,
                                          bh, handle, err);
                __ext4_error(inode->i_sb, where, line, true, -err, 0,
                             "error %d when attempting revoke", err);
        }
        BUFFER_TRACE(bh, "exit");
        return err;
}

/*
 * Get create access to @bh under @handle.
 *
 * Does nothing and returns 0 for a no-journal handle. A failure of
 * jbd2_journal_get_create_access() aborts the handle and is returned.
 * On success, unless @trigger_type is EXT4_JTR_NONE or the filesystem has
 * no metadata checksums, the matching journal triggers are installed on
 * @bh.
 */
int __ext4_journal_get_create_access(const char *where, unsigned int line,
                                handle_t *handle, struct super_block *sb,
                                struct buffer_head *bh,
                                enum ext4_journal_trigger_type trigger_type)
{
        int err;

        if (!ext4_handle_valid(handle))
                return 0;

        err = jbd2_journal_get_create_access(handle, bh);
        if (err) {
                ext4_journal_abort_handle(where, line, __func__, bh, handle,
                                          err);
                return err;
        }

        if (trigger_type != EXT4_JTR_NONE && ext4_has_metadata_csum(sb)) {
                BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
                jbd2_journal_set_triggers(bh,
                        &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers);
        }
        return 0;
}

/*
 * Mark the metadata buffer @bh dirty.
 *
 * @where/@line: call site, used in error reports.
 * @handle:      journal handle, or a no-journal handle.
 * @inode:       inode the buffer belongs to; may be NULL.
 * @bh:          buffer to dirty; it is flagged meta, prio and uptodate.
 *
 * With a valid handle the buffer is handed to
 * jbd2_journal_dirty_metadata(); an error on a handle that is not already
 * aborted triggers a one-time warning, aborts the handle and is reported
 * (through pr_err() when @inode is NULL, through ext4_error_inode()
 * otherwise).
 *
 * Without a journal the buffer is dirtied directly and, when @inode needs
 * synchronous updates, written out at once; a failed write is reported and
 * turned into -EIO.
 *
 * Returns 0 on success or a negative error code.
 */
int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
                                 handle_t *handle, struct inode *inode,
                                 struct buffer_head *bh)
{
        int err = 0;

        might_sleep();

        set_buffer_meta(bh);
        set_buffer_prio(bh);
        set_buffer_uptodate(bh);
        if (ext4_handle_valid(handle)) {
                err = jbd2_journal_dirty_metadata(handle, bh);
                /* Errors can only happen due to aborted journal or a nasty bug */
                if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) {
                        ext4_journal_abort_handle(where, line, __func__, bh,
                                                  handle, err);
                        if (inode == NULL) {
                                /*
                                 * Raw printk: terminate the record with a
                                 * newline, like the other raw printk in
                                 * this file does.
                                 */
                                pr_err("EXT4: jbd2_journal_dirty_metadata "
                                       "failed: handle type %u started at "
                                       "line %u, credits %u/%u, errcode %d\n",
                                       handle->h_type,
                                       handle->h_line_no,
                                       handle->h_requested_credits,
                                       jbd2_handle_buffer_credits(handle), err);
                                return err;
                        }
                        ext4_error_inode(inode, where, line,
                                         bh->b_blocknr,
                                         "journal_dirty_metadata failed: "
                                         "handle type %u started at line %u, "
                                         "credits %u/%u, errcode %d",
                                         handle->h_type,
                                         handle->h_line_no,
                                         handle->h_requested_credits,
                                         jbd2_handle_buffer_credits(handle),
                                         err);
                }
        } else {
                if (inode)
                        mark_buffer_dirty_inode(bh, inode);
                else
                        mark_buffer_dirty(bh);
                if (inode && inode_needs_sync(inode)) {
                        sync_dirty_buffer(bh);
                        /* Submitted for I/O but not uptodate: the write failed. */
                        if (buffer_req(bh) && !buffer_uptodate(bh)) {
                                ext4_error_inode_err(inode, where, line,
                                                     bh->b_blocknr, EIO,
                                        "IO error syncing itable block");
                                err = -EIO;
                        }
                }
        }
        return err;
}






























































































    3 


























































































    1 
    1 



































































































































































































































































































































































































































    4 
    5 






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 


    3 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
// SPDX-License-Identifier: GPL-2.0-only
/* Kernel thread helper functions.
 *   Copyright (C) 2004 IBM Corporation, Rusty Russell.
 *   Copyright (C) 2009 Red Hat, Inc.
 *
 * Creation is done via kthreadd, so that we get a clean environment
 * even if we're invoked from userspace (think modprobe, hotplug cpu,
 * etc.).
 */
#include <uapi/linux/sched/types.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/unistd.h>
#include <linux/file.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <linux/numa.h>
#include <linux/sched/isolation.h>
#include <trace/events/sched.h>


/* Serializes every access to kthread_create_list. */
static DEFINE_SPINLOCK(kthread_create_lock);
/* Pending struct kthread_create_info requests, drained by kthreadd(). */
static LIST_HEAD(kthread_create_list);
/*
 * Task that services kthread_create_list; it is woken by
 * __kthread_create_on_node() after a request has been queued.
 * NOTE(review): not assigned in the code visible here - presumably set up
 * by early init code; confirm.
 */
struct task_struct *kthreadd_task;

/*
 * One pending thread-creation request.  Allocated by
 * __kthread_create_on_node(), queued on kthread_create_list and consumed by
 * kthreadd() -> create_kthread() -> kthread().
 */
struct kthread_create_info
{
        /* Information passed to kthread() from kthreadd. */
        char *full_name;                /* kvasprintf()ed thread name */
        int (*threadfn)(void *data);    /* function the new thread runs */
        void *data;                     /* argument handed to @threadfn */
        int node;                       /* NUMA node hint for the fork */

        /* Result passed back to kthread_create() from kthreadd. */
        struct task_struct *result;     /* new task, or ERR_PTR() on failure */
        /*
         * Creator's on-stack completion.  A creator that is killed while
         * waiting xchg()es this to NULL and abandons the request; the
         * kthreadd/new-thread side then finds NULL and frees the structure
         * instead of completing it.
         */
        struct completion *done;

        struct list_head list;          /* link in kthread_create_list */
};

/*
 * Per-kthread bookkeeping, allocated by set_kthread_struct() and reachable
 * through task_struct::worker_private.
 */
struct kthread {
        unsigned long flags;            /* bitmap indexed by enum KTHREAD_BITS */
        unsigned int cpu;               /* CPU to rebind to in kthread_unpark() */
        int result;                     /* value stored by kthread_exit() */
        int (*threadfn)(void *);        /* thread function, see kthread_func() */
        void *data;                     /* its argument, see kthread_data() */
        struct completion parked;       /* completed by __kthread_parkme() */
        struct completion exited;       /* installed as the task's vfork_done */
#ifdef CONFIG_BLK_CGROUP
        /* free_kthread_struct() warns if this is still set. */
        struct cgroup_subsys_state *blkcg_css;
#endif
        /* To store the full name if task comm is truncated. */
        char *full_name;
};

/* Bit numbers used with set_bit()/clear_bit()/test_bit() on kthread::flags. */
enum KTHREAD_BITS {
        KTHREAD_IS_PER_CPU = 0, /* kthread::cpu is valid; rebind on unpark */
        KTHREAD_SHOULD_STOP,    /* set by kthread_stop() */
        KTHREAD_SHOULD_PARK,    /* set by kthread_park(), cleared by kthread_unpark() */
};

static inline struct kthread *to_kthread(struct task_struct *k)
{
        WARN_ON(!(k->flags & PF_KTHREAD));
        return k->worker_private;
}

/*
 * Variant of to_kthread() that doesn't assume @p is a kthread.
 *
 * Per construction; when:
 *
 *   (p->flags & PF_KTHREAD) && p->worker_private
 *
 * the task is both a kthread and struct kthread is persistent. However
 * PF_KTHREAD on its own is not, kernel_thread() can exec() (See umh.c and
 * begin_new_exec()).
 *
 * Returns the struct kthread, or NULL when either condition fails.
 */
static inline struct kthread *__to_kthread(struct task_struct *p)
{
        void *priv = p->worker_private;

        if (!priv)
                return NULL;

        return (p->flags & PF_KTHREAD) ? priv : NULL;
}

/*
 * Copy the name of kthread @tsk into @buf (at most @buf_size bytes).
 *
 * The untruncated name stored in struct kthread is preferred; when there is
 * no struct kthread or no stored name, fall back to the regular task comm.
 */
void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk)
{
        struct kthread *kthread = to_kthread(tsk);

        if (kthread && kthread->full_name) {
                strscpy_pad(buf, kthread->full_name, buf_size);
                return;
        }

        __get_task_comm(buf, buf_size, tsk);
}

bool set_kthread_struct(struct task_struct *p)
{
        struct kthread *kthread;

        if (WARN_ON_ONCE(to_kthread(p)))
                return false;

        kthread = kzalloc(sizeof(*kthread), GFP_KERNEL);
        if (!kthread)
                return false;

        init_completion(&kthread->exited);
        init_completion(&kthread->parked);
        p->vfork_done = &kthread->exited;

        p->worker_private = kthread;
        return true;
}

/*
 * Detach and free the struct kthread of @k together with its stored name.
 * Does nothing when no struct kthread is attached.
 */
void free_kthread_struct(struct task_struct *k)
{
        struct kthread *kthread = to_kthread(k);

        /* Nothing is attached if kzalloc() in set_kthread_struct() failed. */
        if (kthread) {
#ifdef CONFIG_BLK_CGROUP
                WARN_ON_ONCE(kthread->blkcg_css);
#endif
                k->worker_private = NULL;
                kfree(kthread->full_name);
                kfree(kthread);
        }
}

/**
 * kthread_should_stop - should this kthread return now?
 *
 * When someone calls kthread_stop() on your kthread, it will be woken
 * and this will return true.  You should then return, and your return
 * value will be passed through to kthread_stop().
 */
bool kthread_should_stop(void)
{
        return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
}
EXPORT_SYMBOL(kthread_should_stop);

static bool __kthread_should_park(struct task_struct *k)
{
        return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
}

/**
 * kthread_should_park - should this kthread park now?
 *
 * Returns true once kthread_park() has been called on the current kthread
 * (which also wakes it).  The thread should then do the necessary cleanup
 * and call kthread_parkme().
 *
 * Similar to kthread_should_stop(), but this keeps the thread alive
 * and in a park position. kthread_unpark() "restarts" the thread and
 * calls the thread function again.
 */
bool kthread_should_park(void)
{
        struct task_struct *self = current;

        return __kthread_should_park(self);
}
EXPORT_SYMBOL_GPL(kthread_should_park);

/*
 * Report whether the current task is a kthread with a stop or park request
 * pending.  Unlike kthread_should_stop()/kthread_should_park() this is safe
 * for tasks without a struct kthread: it simply returns false for them.
 */
bool kthread_should_stop_or_park(void)
{
        const unsigned long pending = BIT(KTHREAD_SHOULD_STOP) |
                                      BIT(KTHREAD_SHOULD_PARK);
        struct kthread *self = __to_kthread(current);

        return self && (self->flags & pending);
}

/**
 * kthread_freezable_should_stop - should this freezable kthread return now?
 * @was_frozen: optional out parameter, indicates whether %current was frozen
 *
 * kthread_should_stop() for freezable kthreads, which will enter
 * refrigerator if necessary.  This function is safe from kthread_stop() /
 * freezer deadlock and freezable kthreads should use this function instead
 * of calling try_to_freeze() directly.
 *
 * Returns the value of kthread_should_stop() sampled after any freeze.
 */
bool kthread_freezable_should_stop(bool *was_frozen)
{
        bool did_freeze;

        might_sleep();

        did_freeze = unlikely(freezing(current)) ? __refrigerator(true) : false;
        if (was_frozen)
                *was_frozen = did_freeze;

        return kthread_should_stop();
}
EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);

/**
 * kthread_func - return the function specified on kthread creation
 * @task: kthread task in question
 *
 * Returns the thread function recorded for @task, or NULL if the task is
 * not a kthread.
 */
void *kthread_func(struct task_struct *task)
{
        struct kthread *kthread = __to_kthread(task);

        if (!kthread)
                return NULL;

        return kthread->threadfn;
}
EXPORT_SYMBOL_GPL(kthread_func);

/**
 * kthread_data - return data value specified on kthread creation
 * @task: kthread task in question
 *
 * Return the data value specified when kthread @task was created.
 * The caller is responsible for ensuring the validity of @task when
 * calling this function.
 */
void *kthread_data(struct task_struct *task)
{
        return to_kthread(task)->data;
}
EXPORT_SYMBOL_GPL(kthread_data);

/**
 * kthread_probe_data - speculative version of kthread_data()
 * @task: possible kthread task in question
 *
 * @task could be a kthread task.  Return the data value specified when it
 * was created if accessible.  If @task isn't a kthread task or its data is
 * inaccessible for any reason, %NULL is returned.  This function requires
 * that @task itself is safe to dereference.
 */
void *kthread_probe_data(struct task_struct *task)
{
        struct kthread *kthread = __to_kthread(task);
        void *data = NULL;

        if (!kthread)
                return NULL;

        /* Read through the nofault helper; @data starts out NULL. */
        copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
        return data;
}

/*
 * Park the calling kthread for as long as KTHREAD_SHOULD_PARK stays set in
 * @self->flags.  Each pass signals @self->parked and then schedules away in
 * TASK_PARKED; the task is back in TASK_RUNNING when this returns.
 */
static void __kthread_parkme(struct kthread *self)
{
        for (;;) {
                /*
                 * TASK_PARKED is a special state; we must serialize against
                 * possible pending wakeups to avoid store-store collisions on
                 * task->state.
                 *
                 * Such a collision might possibly result in the task state
                 * changing from TASK_PARKED and us failing the
                 * wait_task_inactive() in kthread_park().
                 */
                set_special_state(TASK_PARKED);
                /* Re-check after setting the state so an unpark is not missed. */
                if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
                        break;

                /*
                 * Thread is going to call schedule(), do not preempt it,
                 * or the caller of kthread_park() may spend more time in
                 * wait_task_inactive().
                 */
                preempt_disable();
                complete(&self->parked);
                schedule_preempt_disabled();
                preempt_enable();
        }
        __set_current_state(TASK_RUNNING);
}

/*
 * Park the current kthread if a park request is pending; returns once the
 * request has been withdrawn (or immediately if there is none).
 */
void kthread_parkme(void)
{
        struct kthread *self = to_kthread(current);

        __kthread_parkme(self);
}
EXPORT_SYMBOL_GPL(kthread_parkme);

/**
 * kthread_exit - Make the current kthread exit and report @result.
 * @result: The integer value to return to kthread_stop().
 *
 * Records @result in the current thread's struct kthread, where
 * kthread_stop() picks it up, and then exits the task.
 *
 * While kthread_exit can be called directly, it exists so that
 * functions which do some additional work in non-modular code such as
 * module_put_and_kthread_exit can be implemented.
 *
 * Does not return.
 */
void __noreturn kthread_exit(long result)
{
        to_kthread(current)->result = result;
        do_exit(0);
}
EXPORT_SYMBOL(kthread_exit);

/**
 * kthread_complete_and_exit - Exit the current kthread.
 * @comp: Completion to complete, may be NULL
 * @code: The integer value to return to kthread_stop().
 *
 * If @comp is non-NULL, complete it first; then exit via kthread_exit()
 * with @code.
 *
 * A kernel thread whose module may be removed after the completion of
 * @comp can use this function to exit safely.
 *
 * Does not return.
 */
void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
{
        if (comp != NULL)
                complete(comp);

        kthread_exit(code);
}
EXPORT_SYMBOL(kthread_complete_and_exit);

/*
 * Entry point of every thread spawned by create_kthread().
 *
 * Publishes itself to the creator through @_create, sleeps until it is first
 * woken, and then either runs the requested thread function or, if
 * KTHREAD_SHOULD_STOP is already set, exits with -EINTR without calling it.
 * Every path ends in kthread_exit(); the function never returns normally.
 */
static int kthread(void *_create)
{
        static const struct sched_param param = { .sched_priority = 0 };
        /* Copy data: it's on kthread's stack */
        struct kthread_create_info *create = _create;
        int (*threadfn)(void *data) = create->threadfn;
        void *data = create->data;
        struct completion *done;
        struct kthread *self;
        int ret;

        self = to_kthread(current);

        /* Release the structure when caller killed by a fatal signal. */
        done = xchg(&create->done, NULL);
        if (!done) {
                kfree(create->full_name);
                kfree(create);
                kthread_exit(-EINTR);
        }

        /* The name string now belongs to struct kthread (freed with it). */
        self->full_name = create->full_name;
        self->threadfn = threadfn;
        self->data = data;

        /*
         * The new thread inherited kthreadd's priority and CPU mask. Reset
         * back to default in case they have been changed.
         */
        sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
        set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));

        /* OK, tell user we're spawned, wait for stop or wakeup */
        __set_current_state(TASK_UNINTERRUPTIBLE);
        create->result = current;
        /*
         * Thread is going to call schedule(), do not preempt it,
         * or the creator may spend more time in wait_task_inactive().
         *
         * The creator frees @create once it has been completed, so @create
         * must not be touched after complete(); threadfn and data were
         * copied to locals above for that reason.
         */
        preempt_disable();
        complete(done);
        schedule_preempt_disabled();
        preempt_enable();

        /* Default result when stopped before the thread function ever ran. */
        ret = -EINTR;
        if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
                cgroup_kthread_ready();
                /* Honour a park request issued before the first wakeup. */
                __kthread_parkme(self);
                ret = threadfn(data);
        }
        kthread_exit(ret);
}

/*
 * Called from kernel_clone() to get node information for the task that is
 * about to be created.  Only forks issued by kthreadd carry a preference
 * (stored by create_kthread()); everything else gets NUMA_NO_NODE.
 */
int tsk_fork_get_node(struct task_struct *tsk)
{
        int node = NUMA_NO_NODE;

#ifdef CONFIG_NUMA
        if (tsk == kthreadd_task)
                node = tsk->pref_node_fork;
#endif
        return node;
}

/*
 * Spawn the thread described by @create (runs in kthreadd context).
 *
 * On success nothing more is done here: the new thread reports back through
 * @create itself.  On failure the error is handed to the waiting creator, or
 * the request is freed if the creator has already given up.
 */
static void create_kthread(struct kthread_create_info *create)
{
        struct completion *done;
        int pid;

#ifdef CONFIG_NUMA
        current->pref_node_fork = create->node;
#endif
        /* We want our own signal handler (we take no signals by default). */
        pid = kernel_thread(kthread, create, create->full_name,
                            CLONE_FS | CLONE_FILES | SIGCHLD);
        if (pid >= 0)
                return;

        /* Release the structure when caller killed by a fatal signal. */
        done = xchg(&create->done, NULL);

        kfree(create->full_name);
        if (!done) {
                kfree(create);
                return;
        }

        create->result = ERR_PTR(pid);
        complete(done);
}

/*
 * Queue a creation request for kthreadd and wait for the outcome.
 *
 * @threadfn, @data: function and argument the new thread will run.
 * @node:            NUMA node hint stored in the request.
 * @namefmt, @args:  printf-style thread name.
 *
 * Returns the new task on success, ERR_PTR(-ENOMEM) if the request or its
 * name cannot be allocated, ERR_PTR(-EINTR) if the caller received a fatal
 * signal before anybody claimed the request, or the error that
 * create_kthread() stored in the request.
 */
static __printf(4, 0)
struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
                                                    void *data, int node,
                                                    const char namefmt[],
                                                    va_list args)
{
        DECLARE_COMPLETION_ONSTACK(done);
        struct task_struct *task;
        struct kthread_create_info *create = kmalloc(sizeof(*create),
                                                     GFP_KERNEL);

        if (!create)
                return ERR_PTR(-ENOMEM);
        create->threadfn = threadfn;
        create->data = data;
        create->node = node;
        create->done = &done;
        create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
        if (!create->full_name) {
                task = ERR_PTR(-ENOMEM);
                goto free_create;
        }

        /* Hand the request to kthreadd. */
        spin_lock(&kthread_create_lock);
        list_add_tail(&create->list, &kthread_create_list);
        spin_unlock(&kthread_create_lock);

        wake_up_process(kthreadd_task);
        /*
         * Wait for completion in killable state, for I might be chosen by
         * the OOM killer while kthreadd is trying to allocate memory for
         * new kernel thread.
         */
        if (unlikely(wait_for_completion_killable(&done))) {
                /*
                 * If I was killed by a fatal signal before kthreadd (or new
                 * kernel thread) calls complete(), leave the cleanup of this
                 * structure to that thread.
                 */
                if (xchg(&create->done, NULL))
                        return ERR_PTR(-EINTR);
                /*
                 * kthreadd (or new kernel thread) will call complete()
                 * shortly.
                 */
                wait_for_completion(&done);
        }
        task = create->result;
free_create:
        /* Only the request is freed here; full_name is not touched. */
        kfree(create);
        return task;
}

/**
 * kthread_create_on_node - create a kthread.
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @node: task and thread structures for the thread are allocated on this node
 * @namefmt: printf-style name for the thread.
 *
 * Description: This helper function creates and names a kernel
 * thread.  The thread will be stopped: use wake_up_process() to start
 * it.  See also kthread_run().  The new thread has SCHED_NORMAL policy and
 * its CPU affinity is reset to housekeeping_cpumask(HK_TYPE_KTHREAD).
 *
 * If thread is going to be bound on a particular cpu, give its node
 * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
 * When woken, the thread will run @threadfn() with @data as its
 * argument. @threadfn() can either return directly if it is a
 * standalone thread for which no one will call kthread_stop(), or
 * return when 'kthread_should_stop()' is true (which means
 * kthread_stop() has been called).  The return value should be zero
 * or a negative error number; it will be passed to kthread_stop().
 *
 * Returns a task_struct, ERR_PTR(-ENOMEM), ERR_PTR(-EINTR), or an ERR_PTR()
 * carrying the error reported when spawning the thread failed.
 */
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
                                           void *data, int node,
                                           const char namefmt[],
                                           ...)
{
        struct task_struct *task;
        va_list args;

        /* Thin varargs front end for __kthread_create_on_node(). */
        va_start(args, namefmt);
        task = __kthread_create_on_node(threadfn, data, node, namefmt, args);
        va_end(args);

        return task;
}
EXPORT_SYMBOL(kthread_create_on_node);

/*
 * Restrict @p to @mask and forbid later affinity changes.
 *
 * @p must be inactive in @state; if it is not, a warning is emitted and
 * nothing is changed.
 */
static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
{
        unsigned long irqflags;

        if (WARN_ON(!wait_task_inactive(p, state)))
                return;

        /* It's safe because the task is inactive. */
        raw_spin_lock_irqsave(&p->pi_lock, irqflags);
        do_set_cpus_allowed(p, mask);
        p->flags |= PF_NO_SETAFFINITY;
        raw_spin_unlock_irqrestore(&p->pi_lock, irqflags);
}

/* Single-CPU convenience wrapper around __kthread_bind_mask(). */
static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
{
        const struct cpumask *only_cpu = cpumask_of(cpu);

        __kthread_bind_mask(p, only_cpu, state);
}

/*
 * Bind kthread @p to the CPUs in @mask.
 *
 * Mask-based counterpart of kthread_bind(): @p is expected to be inactive
 * in TASK_UNINTERRUPTIBLE; otherwise __kthread_bind_mask() warns and leaves
 * the affinity untouched.
 */
void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
{
        __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
}

/**
 * kthread_bind - bind a just-created kthread to a cpu.
 * @p: thread created by kthread_create().
 * @cpu: cpu (might not be online, must be possible) for @p to run on.
 *
 * Description: This function is equivalent to set_cpus_allowed(),
 * except that @cpu doesn't need to be online, and the thread must be
 * stopped (i.e., just returned from kthread_create()).
 */
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
        /* A thread that was never woken sleeps in TASK_UNINTERRUPTIBLE. */
        __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(kthread_bind);

/**
 * kthread_create_on_cpu - Create a cpu bound kthread
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @cpu: The cpu on which the thread should be bound.
 * @namefmt: printf-style name for the thread. Format is restricted
 *             to "name.*%u". Code fills in cpu number.
 *
 * Description: This helper function creates and names a kernel thread,
 * binds it to @cpu and records @cpu in its struct kthread.
 *
 * Returns the new task, or the ERR_PTR() from kthread_create_on_node().
 */
struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
                                          void *data, unsigned int cpu,
                                          const char *namefmt)
{
        struct task_struct *p;

        p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
                                   cpu);
        if (!IS_ERR(p)) {
                kthread_bind(p, cpu);
                /* CPU hotplug need to bind once again when unparking the thread. */
                to_kthread(p)->cpu = cpu;
        }

        return p;
}
EXPORT_SYMBOL(kthread_create_on_cpu);

/*
 * Mark kthread @k as per-CPU on @cpu, or clear the per-CPU mark when @cpu is
 * negative.  Warns once if @k has not been bound (PF_NO_SETAFFINITY) yet.
 * Does nothing if @k has no struct kthread.
 */
void kthread_set_per_cpu(struct task_struct *k, int cpu)
{
        struct kthread *kthread = to_kthread(k);

        if (!kthread)
                return;

        WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));

        if (cpu >= 0) {
                kthread->cpu = cpu;
                set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
        } else {
                clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
        }
}

/*
 * Report whether @p is a kthread carrying the per-CPU mark set by
 * kthread_set_per_cpu().  Safe to call on tasks that are not kthreads.
 */
bool kthread_is_per_cpu(struct task_struct *p)
{
        struct kthread *kthread = __to_kthread(p);

        return kthread && test_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
}

/**
 * kthread_unpark - unpark a thread created by kthread_create().
 * @k:                thread created by kthread_create().
 *
 * Sets kthread_should_park() for @k to return false and wakes @k if it is
 * sitting in TASK_PARKED.  If the thread is marked per-CPU it is bound to
 * its CPU again first.  This function does not wait for @k to resume.
 *
 * Calling it on a thread with no park request pending is a no-op.
 */
void kthread_unpark(struct task_struct *k)
{
        struct kthread *kthread = to_kthread(k);

        /*
         * Nothing to undo if nobody asked the thread to park.  This matters
         * for per-CPU kthreads: the rebind below waits for @k to be inactive
         * in TASK_PARKED and warns when it is not, which is what happens
         * when kthread_stop() gets here for a thread that is running.
         */
        if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))
                return;

        /*
         * Newly created kthread was parked when the CPU was offline.
         * The binding was lost and we need to set it again.
         */
        if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
                __kthread_bind(k, kthread->cpu, TASK_PARKED);

        clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
        /*
         * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
         */
        wake_up_state(k, TASK_PARKED);
}
EXPORT_SYMBOL_GPL(kthread_unpark);

/**
 * kthread_park - park a thread created by kthread_create().
 * @k: thread created by kthread_create().
 *
 * Sets kthread_should_park() for @k to return true, wakes it, and
 * waits until it has parked itself and been scheduled out. This can also
 * be called after kthread_create() instead of calling wake_up_process():
 * the thread will park without calling threadfn().
 *
 * Returns 0 if the thread is parked, -ENOSYS if the thread is exiting, and
 * -EBUSY if a park request is already pending for it.
 * If called by the kthread itself just the park bit is set.
 */
int kthread_park(struct task_struct *k)
{
        struct kthread *kthread = to_kthread(k);

        if (WARN_ON(k->flags & PF_EXITING))
                return -ENOSYS;

        /* Parking twice without an unpark in between is a caller bug. */
        if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
                return -EBUSY;

        set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
        if (k != current) {
                wake_up_process(k);
                /*
                 * Wait for __kthread_parkme() to complete(), this means we
                 * _will_ have TASK_PARKED and are about to call schedule().
                 */
                wait_for_completion(&kthread->parked);
                /*
                 * Now wait for that schedule() to complete and the task to
                 * get scheduled out.
                 */
                WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
        }

        return 0;
}
EXPORT_SYMBOL_GPL(kthread_park);

/**
 * kthread_stop - stop a thread created by kthread_create().
 * @k: thread created by kthread_create().
 *
 * Sets kthread_should_stop() for @k to return true, wakes it, and
 * waits for it to exit. This can also be called after kthread_create()
 * instead of calling wake_up_process(): the thread will exit without
 * calling threadfn().
 *
 * If threadfn() may call kthread_exit() itself, the caller must ensure
 * task_struct can't go away.
 *
 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
 * was never called.
 */
int kthread_stop(struct task_struct *k)
{
        struct kthread *kthread;
        int ret;

        trace_sched_kthread_stop(k);

        /* Hold a task reference for the duration of the wait below. */
        get_task_struct(k);
        kthread = to_kthread(k);
        set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
        /* Let a parked thread run again so that it can see the stop request. */
        kthread_unpark(k);
        set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL);
        wake_up_process(k);
        /* kthread->exited was installed as vfork_done by set_kthread_struct(). */
        wait_for_completion(&kthread->exited);
        ret = kthread->result;
        put_task_struct(k);

        trace_sched_kthread_stop_ret(ret);
        return ret;
}
EXPORT_SYMBOL(kthread_stop);

/**
 * kthread_stop_put - stop a thread and put its task struct
 * @k: thread created by kthread_create().
 *
 * Stops a thread created by kthread_create() and then drops one reference
 * on its task_struct.  Only use when holding an extra task struct reference
 * obtained by calling get_task_struct().
 *
 * Returns whatever kthread_stop() returned.
 */
int kthread_stop_put(struct task_struct *k)
{
        int ret = kthread_stop(k);

        put_task_struct(k);
        return ret;
}
EXPORT_SYMBOL(kthread_stop_put);

/*
 * Main loop of the kthreadd task: sleep until creation requests show up on
 * kthread_create_list, then spawn one thread per request via
 * create_kthread().  The loop has no exit; the trailing return is never
 * reached.
 */
int kthreadd(void *unused)
{
        struct task_struct *tsk = current;

        /* Setup a clean context for our children to inherit. */
        set_task_comm(tsk, "kthreadd");
        ignore_signals(tsk);
        set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
        set_mems_allowed(node_states[N_MEMORY]);

        current->flags |= PF_NOFREEZE;
        cgroup_init_kthreadd();

        for (;;) {
                /*
                 * Set the sleeping state before checking the list so that a
                 * wake_up_process() issued after a request was queued is not
                 * lost.
                 */
                set_current_state(TASK_INTERRUPTIBLE);
                if (list_empty(&kthread_create_list))
                        schedule();
                __set_current_state(TASK_RUNNING);

                spin_lock(&kthread_create_lock);
                while (!list_empty(&kthread_create_list)) {
                        struct kthread_create_info *create;

                        create = list_entry(kthread_create_list.next,
                                            struct kthread_create_info, list);
                        list_del_init(&create->list);
                        /* create_kthread() runs without the list lock held. */
                        spin_unlock(&kthread_create_lock);

                        create_kthread(create);

                        spin_lock(&kthread_create_lock);
                }
                spin_unlock(&kthread_create_lock);
        }

        return 0;
}

/*
 * Reset @worker to an empty, idle state: zero every field, initialize its
 * lock (registering @key/@name with lockdep) and its two work lists.
 */
void __kthread_init_worker(struct kthread_worker *worker,
                                const char *name,
                                struct lock_class_key *key)
{
        memset(worker, 0, sizeof(*worker));

        raw_spin_lock_init(&worker->lock);
        lockdep_set_class_and_name(&worker->lock, key, name);

        INIT_LIST_HEAD(&worker->work_list);
        INIT_LIST_HEAD(&worker->delayed_work_list);
}
EXPORT_SYMBOL_GPL(__kthread_init_worker);

/**
 * kthread_worker_fn - kthread function to process kthread_worker
 * @worker_ptr: pointer to initialized kthread_worker
 *
 * This function implements the main cycle of kthread worker. It processes
 * work_list until it is stopped with kthread_stop(). It sleeps when the queue
 * is empty.
 *
 * The works are not allowed to keep any locks, disable preemption or interrupts
 * when they finish. There is defined a safe point for freezing when one work
 * finishes and before a new one is started.
 *
 * Also the works must not be handled by more than one worker at the same time,
 * see also kthread_queue_work().
 *
 * Returns 0 once kthread_should_stop() becomes true.
 */
int kthread_worker_fn(void *worker_ptr)
{
        struct kthread_worker *worker = worker_ptr;
        struct kthread_work *work;

        /*
         * FIXME: Update the check and remove the assignment when all kthread
         * worker users are created using kthread_create_worker*() functions.
         */
        WARN_ON(worker->task && worker->task != current);
        worker->task = current;

        if (worker->flags & KTW_FREEZABLE)
                set_freezable();

repeat:
        set_current_state(TASK_INTERRUPTIBLE);        /* mb paired w/ kthread_stop */

        if (kthread_should_stop()) {
                __set_current_state(TASK_RUNNING);
                raw_spin_lock_irq(&worker->lock);
                worker->task = NULL;
                raw_spin_unlock_irq(&worker->lock);
                return 0;
        }

        /* Dequeue the next work item, if any, under the worker lock. */
        work = NULL;
        raw_spin_lock_irq(&worker->lock);
        if (!list_empty(&worker->work_list)) {
                work = list_first_entry(&worker->work_list,
                                        struct kthread_work, node);
                list_del_init(&work->node);
        }
        worker->current_work = work;
        raw_spin_unlock_irq(&worker->lock);

        if (work) {
                kthread_work_func_t func = work->func;
                __set_current_state(TASK_RUNNING);
                trace_sched_kthread_work_execute_start(work);
                work->func(work);
                /*
                 * Avoid dereferencing work after this point.  The trace
                 * event only cares about the address.
                 */
                trace_sched_kthread_work_execute_end(work, func);
        } else if (!freezing(current)) {
                schedule();
        } else {
                /*
                 * Nothing to run and a freeze is pending: we neither ran a
                 * work item nor slept, so the task state is still
                 * TASK_INTERRUPTIBLE.  try_to_freeze() expects the caller
                 * to be TASK_RUNNING, so restore that first.
                 */
                __set_current_state(TASK_RUNNING);
        }

        try_to_freeze();
        cond_resched();
        goto repeat;
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);

/*
 * Allocate a kthread_worker, spawn the kthread that serves it and start it.
 *
 * @cpu:             CPU to bind the worker thread to, or negative for none.
 * @flags:           stored in worker->flags.
 * @namefmt, @args:  printf-style name for the worker thread.
 *
 * Returns the worker, ERR_PTR(-ENOMEM) if the worker cannot be allocated, or
 * the error from __kthread_create_on_node().
 */
static __printf(3, 0) struct kthread_worker *
__kthread_create_worker(int cpu, unsigned int flags,
                        const char namefmt[], va_list args)
{
        const bool bound = cpu >= 0;
        struct kthread_worker *worker;
        struct task_struct *task;

        worker = kzalloc(sizeof(*worker), GFP_KERNEL);
        if (!worker)
                return ERR_PTR(-ENOMEM);

        kthread_init_worker(worker);

        task = __kthread_create_on_node(kthread_worker_fn, worker,
                                        bound ? cpu_to_node(cpu) : NUMA_NO_NODE,
                                        namefmt, args);
        if (IS_ERR(task)) {
                kfree(worker);
                return ERR_CAST(task);
        }

        if (bound)
                kthread_bind(task, cpu);

        worker->flags = flags;
        worker->task = task;
        wake_up_process(task);
        return worker;
}

/**
 * kthread_create_worker - create a kthread worker
 * @flags: flags modifying the default behavior of the worker
 * @namefmt: printf-style name for the kthread worker (task).
 *
 * The worker thread is not bound to any CPU; it has already been woken when
 * this function returns.
 *
 * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
 * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
 * when the caller was killed by a fatal signal.
 */
struct kthread_worker *
kthread_create_worker(unsigned int flags, const char namefmt[], ...)
{
        struct kthread_worker *worker;
        va_list args;

        va_start(args, namefmt);
        /* A negative cpu means "no CPU binding". */
        worker = __kthread_create_worker(-1, flags, namefmt, args);
        va_end(args);

        return worker;
}
EXPORT_SYMBOL(kthread_create_worker);

/**
 * kthread_create_worker_on_cpu - create a kthread worker and bind it
 *        to a given CPU and the associated NUMA node.
 * @cpu: CPU number
 * @flags: flags modifying the default behavior of the worker
 * @namefmt: printf-style name for the kthread worker (task).
 *
 * Pass a valid CPU number to get the kthread worker bound to that CPU
 * and to the associated NUMA node.
 *
 * It is good practice to put the CPU number into the worker name too,
 * for example kthread_create_worker_on_cpu(cpu, 0, "helper/%d", cpu).
 *
 * CPU hotplug:
 * The kthread worker API is simple and generic. It only provides a way
 * to create, use, and destroy workers.
 *
 * Handling CPU hotplug is left to the API user. They have to decide
 * how to handle pending work items, how to prevent queuing new ones, and
 * how to restore the functionality when the CPU goes off and on. There
 * are a few catches:
 *
 *    - CPU affinity gets lost when the worker is scheduled on an offline
 *      CPU.
 *
 *    - The worker might not exist when the CPU was off at the time the
 *      user created the workers.
 *
 * Good practice is to implement two CPU hotplug callbacks and to
 * destroy/create the worker when the CPU goes down/up.
 *
 * Return:
 * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
 * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
 * when the caller was killed by a fatal signal.
 */
struct kthread_worker *
kthread_create_worker_on_cpu(int cpu, unsigned int flags,
                             const char namefmt[], ...)
{
        struct kthread_worker *created;
        va_list ap;

        va_start(ap, namefmt);
        created = __kthread_create_worker(cpu, flags, namefmt, ap);
        va_end(ap);

        return created;
}
EXPORT_SYMBOL(kthread_create_worker_on_cpu);

/*
 * Returns true when the work could not be queued at the moment.
 * It happens when it is already pending in a worker list
 * or when it is being cancelled.
 */
static inline bool queuing_blocked(struct kthread_worker *worker,
                                   struct kthread_work *work)
{
        lockdep_assert_held(&worker->lock);

        return !list_empty(&work->node) || work->canceling;
}

/*
 * Debug checks done before @work gets linked into a list of @worker.
 * Must be called with worker->lock held.  Warns once when @work is already
 * linked into a list, and once when it is associated with another worker.
 */
static void kthread_insert_work_sanity_check(struct kthread_worker *worker,
                                             struct kthread_work *work)
{
        lockdep_assert_held(&worker->lock);
        /* The work must not be linked anywhere yet. */
        WARN_ON_ONCE(!list_empty(&work->node));
        /* Do not use a work with >1 worker, see kthread_queue_work() */
        WARN_ON_ONCE(work->worker && work->worker != worker);
}

/*
 * Link @work into a list of @worker right before @pos and remember @worker
 * as the owner of the work.  The worker task, if there is one, is woken up
 * unless the worker is currently executing a work item.
 */
static void kthread_insert_work(struct kthread_worker *worker,
                                struct kthread_work *work,
                                struct list_head *pos)
{
        kthread_insert_work_sanity_check(worker, work);

        trace_sched_kthread_work_queue_work(worker, work);

        list_add_tail(&work->node, pos);
        work->worker = worker;

        /* No wakeup while the worker is in the middle of another work. */
        if (worker->current_work)
                return;

        if (likely(worker->task))
                wake_up_process(worker->task);
}

/**
 * kthread_queue_work - queue a kthread_work
 * @worker: target kthread_worker
 * @work: kthread_work to queue
 *
 * Queue @work at the tail of @worker's work list for async execution.
 * @worker is expected to be a set-up worker, e.g. one created with
 * kthread_create_worker().
 *
 * Reinitialize the work if it needs to be used by another worker.
 * For example, when the worker was stopped and started again.
 *
 * Return: %true if @work was successfully queued, %false if it was already
 * pending or is being canceled.
 */
bool kthread_queue_work(struct kthread_worker *worker,
                        struct kthread_work *work)
{
        unsigned long irq_flags;
        bool queued;

        raw_spin_lock_irqsave(&worker->lock, irq_flags);
        queued = !queuing_blocked(worker, work);
        if (queued)
                kthread_insert_work(worker, work, &worker->work_list);
        raw_spin_unlock_irqrestore(&worker->lock, irq_flags);

        return queued;
}
EXPORT_SYMBOL_GPL(kthread_queue_work);

/**
 * kthread_delayed_work_timer_fn - callback that queues the associated kthread
 *        delayed work when the timer expires.
 * @t: pointer to the expired timer
 *
 * The format of the function is defined by struct timer_list.
 * It is expected to be called from an irqsafe timer with IRQs already off.
 *
 * The work is unlinked from the list it is on and, unless it is being
 * canceled, appended to the work list of its worker.
 */
void kthread_delayed_work_timer_fn(struct timer_list *t)
{
        struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
        struct kthread_work *work = &dwork->work;
        struct kthread_worker *worker = work->worker;
        unsigned long flags;

        /*
         * This might happen when a pending work is reinitialized.
         * It means that the work is being used the wrong way.
         */
        if (WARN_ON_ONCE(!worker))
                return;

        raw_spin_lock_irqsave(&worker->lock, flags);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        /* Move the work from worker->delayed_work_list. */
        WARN_ON_ONCE(list_empty(&work->node));
        list_del_init(&work->node);
        /* Leave the work unqueued when it is being canceled in parallel. */
        if (!work->canceling)
                kthread_insert_work(worker, work, &worker->work_list);

        raw_spin_unlock_irqrestore(&worker->lock, flags);
}
EXPORT_SYMBOL(kthread_delayed_work_timer_fn);

/*
 * Queue @dwork on @worker after @delay jiffies, or right away when @delay
 * is zero.  The callers in this file invoke it with worker->lock held and
 * only after making sure that the work is not linked into any list.
 *
 * With a non-zero @delay the work is put on worker->delayed_work_list and
 * the timer of @dwork is armed to expire at jiffies + @delay.
 */
static void __kthread_queue_delayed_work(struct kthread_worker *worker,
                                         struct kthread_delayed_work *dwork,
                                         unsigned long delay)
{
        struct timer_list *timer = &dwork->timer;
        struct kthread_work *work = &dwork->work;

        /* The timer must have been set up with the expected callback. */
        WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);

        /*
         * If @delay is 0, queue @dwork->work immediately.  This is for
         * both optimization and correctness.  The earliest @timer can
         * expire is on the closest next tick and delayed_work users depend
         * on that there's no such delay when @delay is 0.
         */
        if (!delay) {
                kthread_insert_work(worker, work, &worker->work_list);
                return;
        }

        /* Be paranoid and try to detect possible races already now. */
        kthread_insert_work_sanity_check(worker, work);

        /* Park the work on the delayed list until the timer fires. */
        list_add(&work->node, &worker->delayed_work_list);
        work->worker = worker;
        timer->expires = jiffies + delay;
        add_timer(timer);
}

/**
 * kthread_queue_delayed_work - queue the associated kthread work
 *        after a delay.
 * @worker: target kthread_worker
 * @dwork: kthread_delayed_work to queue
 * @delay: number of jiffies to wait before queuing
 *
 * When the work is not pending yet, a timer is started that queues the
 * work once @delay has passed. A zero @delay queues the work immediately.
 *
 * Return: %false if the work of @dwork has already been pending, meaning
 * that either the timer was running or the work was queued, or if it is
 * being canceled. %true otherwise.
 */
bool kthread_queue_delayed_work(struct kthread_worker *worker,
                                struct kthread_delayed_work *dwork,
                                unsigned long delay)
{
        unsigned long irq_flags;
        bool queued;

        raw_spin_lock_irqsave(&worker->lock, irq_flags);
        queued = !queuing_blocked(worker, &dwork->work);
        if (queued)
                __kthread_queue_delayed_work(worker, dwork, delay);
        raw_spin_unlock_irqrestore(&worker->lock, irq_flags);

        return queued;
}
EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);

/*
 * Helper work item used by the flush functions: its handler,
 * kthread_flush_work_fn(), completes @done when the worker executes @work.
 */
struct kthread_flush_work {
        struct kthread_work        work;        /* item queued on the worker */
        struct completion        done;        /* completed by the handler */
};

/*
 * Work handler of struct kthread_flush_work: signal the completion of the
 * flush work that embeds @work.
 */
static void kthread_flush_work_fn(struct kthread_work *work)
{
        complete(&container_of(work, struct kthread_flush_work, work)->done);
}

/**
 * kthread_flush_work - flush a kthread_work
 * @work: work to flush
 *
 * If @work is queued or executing, wait for it to finish execution.
 */
void kthread_flush_work(struct kthread_work *work)
{
        struct kthread_flush_work fwork = {
                KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
                COMPLETION_INITIALIZER_ONSTACK(fwork.done),
        };
        struct kthread_worker *worker;
        bool noop = false;

        worker = work->worker;
        if (!worker)
                return;

        raw_spin_lock_irq(&worker->lock);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        if (!list_empty(&work->node))
                kthread_insert_work(worker, &fwork.work, work->node.next);
        else if (worker->current_work == work)
                kthread_insert_work(worker, &fwork.work,
                                    worker->work_list.next);
        else
                noop = true;

        raw_spin_unlock_irq(&worker->lock);

        if (!noop)
                wait_for_completion(&fwork.done);
}
EXPORT_SYMBOL_GPL(kthread_flush_work);

/*
 * Make sure that the timer is neither set nor running and could
 * not manipulate the work list_head any longer.
 *
 * The function is called under worker->lock. The lock is temporarily
 * released but the timer can't be set again in the meantime.
 *
 * @work must be embedded in a struct kthread_delayed_work.  @flags points
 * to the IRQ flags the caller saved when taking worker->lock; they are used
 * to drop the lock and get updated when the lock is taken again.
 */
static void kthread_cancel_delayed_work_timer(struct kthread_work *work,
                                              unsigned long *flags)
{
        struct kthread_delayed_work *dwork =
                container_of(work, struct kthread_delayed_work, work);
        struct kthread_worker *worker = work->worker;

        /*
         * del_timer_sync() must be called to make sure that the timer
         * callback is not running. The lock must be temporarily released
         * to avoid a deadlock with the callback. In the meantime,
         * any queuing is blocked by setting the canceling counter.
         */
        work->canceling++;
        raw_spin_unlock_irqrestore(&worker->lock, *flags);
        del_timer_sync(&dwork->timer);
        raw_spin_lock_irqsave(&worker->lock, *flags);
        work->canceling--;
}

/*
 * Remove @work from the worker list it is linked into, if any.
 *
 * Called under worker->lock. The caller has to make sure that the timer
 * used by a delayed work is not running, e.g. by calling
 * kthread_cancel_delayed_work_timer().
 *
 * The work might still be in use when this function returns because
 * the worker may be executing it as its current_work.
 *
 * Return: %true if @work was pending and successfully canceled,
 *        %false if @work was not pending
 */
static bool __kthread_cancel_work(struct kthread_work *work)
{
        /* Not linked into any list means that there is nothing to cancel. */
        if (list_empty(&work->node))
                return false;

        /*
         * The work is either on worker->work_list or on
         * worker->delayed_work_list; unlinking works the same for both.
         */
        list_del_init(&work->node);

        return true;
}

/**
 * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work
 * @worker: kthread worker to use
 * @dwork: kthread delayed work to queue
 * @delay: number of jiffies to wait before queuing
 *
 * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise,
 * modify @dwork's timer so that it expires after @delay. If @delay is zero,
 * @dwork is guaranteed to be queued immediately.
 *
 * Return: %false if @dwork was idle and queued, %true otherwise.
 *
 * A special case is when the work is being canceled in parallel.
 * It might be caused either by the real kthread_cancel_delayed_work_sync()
 * or yet another kthread_mod_delayed_work() call. We let the other command
 * win and return %true here. The return value can be used for reference
 * counting and the number of queued works stays the same. Anyway, the caller
 * is supposed to synchronize these operations a reasonable way.
 *
 * This function is safe to call from any context including IRQ handler.
 * See __kthread_cancel_work() and kthread_delayed_work_timer_fn()
 * for details.
 */
bool kthread_mod_delayed_work(struct kthread_worker *worker,
                              struct kthread_delayed_work *dwork,
                              unsigned long delay)
{
        struct kthread_work *work = &dwork->work;
        unsigned long flags;
        /* bool, to match the return type and the other queuing functions. */
        bool ret;

        raw_spin_lock_irqsave(&worker->lock, flags);

        /* Do not bother with canceling when never queued. */
        if (!work->worker) {
                ret = false;
                goto fast_queue;
        }

        /* Work must not be used with >1 worker, see kthread_queue_work() */
        WARN_ON_ONCE(work->worker != worker);

        /*
         * Temporarily cancel the work but do not fight with another command
         * that is canceling the work as well.
         *
         * It is a bit tricky because of possible races with another
         * mod_delayed_work() and cancel_delayed_work() callers.
         *
         * The timer must be canceled first because worker->lock is released
         * when doing so. But the work can be removed from the queue (list)
         * only when it can be queued again so that the return value can
         * be used for reference counting.
         */
        kthread_cancel_delayed_work_timer(work, &flags);
        if (work->canceling) {
                /* The number of works in the queue does not change. */
                ret = true;
                goto out;
        }
        ret = __kthread_cancel_work(work);

fast_queue:
        __kthread_queue_delayed_work(worker, dwork, delay);
out:
        raw_spin_unlock_irqrestore(&worker->lock, flags);
        return ret;
}
EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);

/*
 * Common implementation of kthread_cancel_work_sync() and
 * kthread_cancel_delayed_work_sync().
 *
 * Unlink @work from the worker list it is pending on and, when the worker
 * is executing @work at that moment, wait until the execution is finished.
 * For a delayed work (@is_dwork is true) the timer is stopped first.
 *
 * Return: %true if @work was pending, %false otherwise, including the case
 * that @work has no worker associated.
 */
static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
{
        struct kthread_worker *worker = work->worker;
        unsigned long flags;
        /* bool, to match the return type of this function. */
        bool ret = false;

        if (!worker)
                goto out;

        raw_spin_lock_irqsave(&worker->lock, flags);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        if (is_dwork)
                kthread_cancel_delayed_work_timer(work, &flags);

        ret = __kthread_cancel_work(work);

        /* Nothing to wait for unless the worker is executing the work. */
        if (worker->current_work != work)
                goto out_fast;

        /*
         * The work is in progress and we need to wait with the lock released.
         * In the meantime, block any queuing by setting the canceling counter.
         */
        work->canceling++;
        raw_spin_unlock_irqrestore(&worker->lock, flags);
        kthread_flush_work(work);
        raw_spin_lock_irqsave(&worker->lock, flags);
        work->canceling--;

out_fast:
        raw_spin_unlock_irqrestore(&worker->lock, flags);
out:
        return ret;
}

/**
 * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish
 * @work: the kthread work to cancel
 *
 * Cancel @work and wait for its execution to finish.  This function
 * can be used even if the work re-queues itself. On return from this
 * function, @work is guaranteed to be not pending or executing on any CPU.
 *
 * kthread_cancel_work_sync(&delayed_work->work) must not be used for
 * delayed_work's. Use kthread_cancel_delayed_work_sync() instead.
 *
 * The caller must ensure that the worker on which @work was last
 * queued can't be destroyed before this function returns.
 *
 * Return: %true if @work was pending, %false otherwise.
 */
bool kthread_cancel_work_sync(struct kthread_work *work)
{
        /* is_dwork == false: a plain work has no timer that must be stopped. */
        return __kthread_cancel_work_sync(work, false);
}
EXPORT_SYMBOL_GPL(kthread_cancel_work_sync);

/**
 * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and
 *        wait for it to finish.
 * @dwork: the kthread delayed work to cancel
 *
 * This is kthread_cancel_work_sync() for delayed works.
 *
 * Return: %true if @dwork was pending, %false otherwise.
 */
bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork)
{
        /* is_dwork == true: the helper stops the timer of @dwork first. */
        return __kthread_cancel_work_sync(&dwork->work, true);
}
EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync);

/**
 * kthread_flush_worker - flush all current works on a kthread_worker
 * @worker: worker to flush
 *
 * Wait until all currently executing or pending works on @worker are
 * finished.
 */
void kthread_flush_worker(struct kthread_worker *worker)
{
        /* On-stack flush work; its handler completes fwork.done. */
        struct kthread_flush_work fwork = {
                KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
                COMPLETION_INITIALIZER_ONSTACK(fwork.done),
        };

        /* Append the flush work to the tail of the worker's work list ... */
        kthread_queue_work(worker, &fwork.work);
        /* ... and sleep until its handler has signalled the completion. */
        wait_for_completion(&fwork.done);
}
EXPORT_SYMBOL_GPL(kthread_flush_worker);

/**
 * kthread_destroy_worker - destroy a kthread worker
 * @worker: worker to be destroyed
 *
 * Flush and destroy @worker.  The simple flush is enough because the kthread
 * worker API is used only in trivial scenarios.  There are no multi-step state
 * machines needed.
 *
 * Note that this function is not responsible for handling delayed work, so
 * the caller is responsible for queuing or canceling all delayed work items
 * before invoking this function.
 */
void kthread_destroy_worker(struct kthread_worker *worker)
{
        struct task_struct *task;

        /* A worker without a task is unexpected here; warn and bail out. */
        task = worker->task;
        if (WARN_ON(!task))
                return;

        /* Let the queued works finish, then stop the worker thread. */
        kthread_flush_worker(worker);
        kthread_stop(task);
        /* Both lists are expected to be empty by now. */
        WARN_ON(!list_empty(&worker->delayed_work_list));
        WARN_ON(!list_empty(&worker->work_list));
        kfree(worker);
}
EXPORT_SYMBOL(kthread_destroy_worker);

/**
 * kthread_use_mm - make the calling kthread operate on an address space
 * @mm: address space to operate on
 *
 * Takes a reference on @mm with mmgrab(), installs @mm as both the mm and
 * the active_mm of the current task, and drops the lazy-TLB reference on
 * the previous active_mm.
 */
void kthread_use_mm(struct mm_struct *mm)
{
        struct mm_struct *active_mm;
        struct task_struct *tsk = current;

        /* Expected caller: a kthread that has no mm of its own set yet. */
        WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
        WARN_ON_ONCE(tsk->mm);

        /*
         * It is possible for mm to be the same as tsk->active_mm, but
         * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm),
         * because these references are not equivalent.
         */
        mmgrab(mm);

        task_lock(tsk);
        /* Hold off tlb flush IPIs while switching mm's */
        local_irq_disable();
        active_mm = tsk->active_mm;
        tsk->active_mm = mm;
        tsk->mm = mm;
        membarrier_update_current_mm(mm);
        switch_mm_irqs_off(active_mm, mm, tsk);
        local_irq_enable();
        task_unlock(tsk);
#ifdef finish_arch_post_lock_switch
        finish_arch_post_lock_switch();
#endif

        /*
         * When a kthread starts operating on an address space, the loop
         * in membarrier_{private,global}_expedited() may not observe
         * that tsk->mm, and not issue an IPI. Membarrier requires a
         * memory barrier after storing to tsk->mm, before accessing
         * user-space memory. A full memory barrier for membarrier
         * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
         * mmdrop_lazy_tlb().
         */
        mmdrop_lazy_tlb(active_mm);
}
EXPORT_SYMBOL_GPL(kthread_use_mm);

/**
 * kthread_unuse_mm - reverse the effect of kthread_use_mm()
 * @mm: address space to operate on
 *
 * Clears the mm of the current task while @mm stays its active_mm in
 * lazy-TLB mode, and drops the @mm reference with mmdrop().
 */
void kthread_unuse_mm(struct mm_struct *mm)
{
        struct task_struct *tsk = current;

        /* Expected caller: a kthread that currently has an mm set. */
        WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
        WARN_ON_ONCE(!tsk->mm);

        task_lock(tsk);
        /*
         * When a kthread stops operating on an address space, the loop
         * in membarrier_{private,global}_expedited() may not observe
         * that tsk->mm, and not issue an IPI. Membarrier requires a
         * memory barrier after accessing user-space memory, before
         * clearing tsk->mm.
         */
        smp_mb__after_spinlock();
        local_irq_disable();
        tsk->mm = NULL;
        membarrier_update_current_mm(NULL);
        /* Take a lazy-TLB reference for the mm that stays as active_mm. */
        mmgrab_lazy_tlb(mm);
        /* active_mm is still 'mm' */
        enter_lazy_tlb(mm, tsk);
        local_irq_enable();
        task_unlock(tsk);

        /* Pairs with the mmgrab() done in kthread_use_mm(). */
        mmdrop(mm);
}

#ifdef CONFIG_BLK_CGROUP
/**
 * kthread_associate_blkcg - associate blkcg to current kthread
 * @css: the cgroup info
 *
 * Current thread must be a kthread. The thread is running jobs on behalf of
 * other threads. In some cases, the jobs are expected to carry the cgroup
 * info of the original threads instead of that of the current thread. This
 * function stores the original thread's cgroup info in the current kthread
 * context for later retrieval.
 *
 * A previously stored css is released first. Passing a NULL @css only
 * drops the previous association. Nothing is done when the current task
 * is not a kthread.
 */
void kthread_associate_blkcg(struct cgroup_subsys_state *css)
{
        struct kthread *self = NULL;

        if (current->flags & PF_KTHREAD)
                self = to_kthread(current);
        if (!self)
                return;

        /* Release whatever css was associated before. */
        if (self->blkcg_css) {
                css_put(self->blkcg_css);
                self->blkcg_css = NULL;
        }

        if (!css)
                return;

        /* Pin the new css for as long as it stays associated. */
        css_get(css);
        self->blkcg_css = css;
}
EXPORT_SYMBOL(kthread_associate_blkcg);

/**
 * kthread_blkcg - get associated blkcg css of current kthread
 *
 * Current thread must be a kthread.
 *
 * Return: the css stored by kthread_associate_blkcg(), or NULL when there
 * is none or when the current task is not a kthread.
 */
struct cgroup_subsys_state *kthread_blkcg(void)
{
        struct kthread *self;

        if (!(current->flags & PF_KTHREAD))
                return NULL;

        self = to_kthread(current);

        return self ? self->blkcg_css : NULL;
}
#endif
































































































































































































    1 














    1 





    1 

    1 









    1 








    1 








    1 








    1 








    1 













    1 






    1 


    1 






















    1 



    1 































































































































































































































































































































































































































































    1 
    1 
    1 






    1 













    1 


















    1 





























    1 

































    1 


















    1 








    1 



    1 






    1 














    1 























    1 



























    1 





    1 

































































    1 




    1 





















    1 








    1 
    1 


    1 














































































    1 


    1 
    1 
    1 



    1 

    1 


















    1 


















    1 




    1 



    1 
































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001-2003 Intel Corp.
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions implement the sctp_outq class.   The outqueue handles
 * bundling and queueing of outgoing SCTP chunks.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson          <karl@athena.chicago.il.us>
 *    Perry Melange         <pmelange@null.cc.uic.edu>
 *    Xingang Guo           <xingang.guo@intel.com>
 *    Hui Huang             <hui.huang@nokia.com>
 *    Sridhar Samudrala     <sri@us.ibm.com>
 *    Jon Grimm             <jgrimm@us.ibm.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/list.h>   /* For struct list_head */
#include <linux/socket.h>
#include <linux/ip.h>
#include <linux/slab.h>
#include <net/sock.h>          /* For skb_set_owner_w */

#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <net/sctp/stream_sched.h>
#include <trace/events/sctp.h>

/* Forward declarations of file-local (static) helpers, so that functions
 * earlier in this file can call them before their definitions appear.
 */
static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn);
static void sctp_check_transmitted(struct sctp_outq *q,
                                   struct list_head *transmitted_queue,
                                   struct sctp_transport *transport,
                                   union sctp_addr *saddr,
                                   struct sctp_sackhdr *sack,
                                   __u32 *highest_new_tsn);

static void sctp_mark_missing(struct sctp_outq *q,
                              struct list_head *transmitted_queue,
                              struct sctp_transport *transport,
                              __u32 highest_new_tsn,
                              int count_of_newacks);

/* Called by sctp_outq_tail(), sctp_retransmit() and sctp_outq_uncork(). */
static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp);

/* Push a data chunk onto the head of the outqueue.
 *
 * The chunk is linked at the front of the association-wide out_chunk_list
 * and at the front of the outq list of the stream it belongs to; its skb
 * length is added to q->out_qlen.
 */
static inline void sctp_outq_head_data(struct sctp_outq *q,
                                       struct sctp_chunk *ch)
{
        __u16 sid = sctp_chunk_stream_no(ch);
        struct sctp_stream_out_ext *ext = SCTP_SO(&q->asoc->stream, sid)->ext;

        q->out_qlen += ch->skb->len;
        list_add(&ch->list, &q->out_chunk_list);
        list_add(&ch->stream_list, &ext->outq);
}

/* Fetch the next data chunk to send.
 *
 * The choice is delegated to the dequeue hook of the scheduler attached to
 * the queue; whatever that hook returns is handed back unchanged.
 */
static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q)
{
        struct sctp_chunk *next = q->sched->dequeue(q);

        return next;
}

/* Append a data chunk to the tail of the outqueue.
 *
 * The chunk is linked at the end of the association-wide out_chunk_list
 * and at the end of the outq list of the stream it belongs to; its skb
 * length is added to q->out_qlen.
 */
static inline void sctp_outq_tail_data(struct sctp_outq *q,
                                       struct sctp_chunk *ch)
{
        __u16 sid = sctp_chunk_stream_no(ch);
        struct sctp_stream_out_ext *ext = SCTP_SO(&q->asoc->stream, sid)->ext;

        q->out_qlen += ch->skb->len;
        list_add_tail(&ch->list, &q->out_chunk_list);
        list_add_tail(&ch->stream_list, &ext->outq);
}

/*
 * SFR-CACC algorithm, rule D:
 * when count_of_newacks is at least 2 and TSN t was not sent to the
 * current primary, the sender MUST NOT increment the missing report
 * count for t.
 *
 * Returns 1 when the increment must be skipped, 0 otherwise.
 */
static inline int sctp_cacc_skip_3_1_d(struct sctp_transport *primary,
                                       struct sctp_transport *transport,
                                       int count_of_newacks)
{
        return count_of_newacks >= 2 && transport != primary;
}

/*
 * SFR-CACC algorithm, rule F:
 * when count_of_newacks is below 2, let d be the destination t was sent
 * to.  If cacc_saw_newack is 0 for d, the sender MUST NOT increment the
 * missing report count for t.
 *
 * A NULL transport never triggers the skip.  Returns 1 when the increment
 * must be skipped, 0 otherwise.
 */
static inline int sctp_cacc_skip_3_1_f(struct sctp_transport *transport,
                                       int count_of_newacks)
{
        if (count_of_newacks >= 2 || !transport)
                return 0;

        return !transport->cacc.cacc_saw_newack;
}

/*
 * SFR-CACC algorithm, step 3.1:
 * with CYCLING_CHANGEOVER equal to 0 the sender SHOULD run rules C, D
 * and F.  Rule C has been implemented in sctp_outq_sack, so only D and F
 * are evaluated here.
 *
 * Returns 1 when either rule says the missing report count must not be
 * incremented, 0 otherwise (including when cycling_changeover is set).
 */
static inline int sctp_cacc_skip_3_1(struct sctp_transport *primary,
                                     struct sctp_transport *transport,
                                     int count_of_newacks)
{
        if (primary->cacc.cycling_changeover)
                return 0;

        return sctp_cacc_skip_3_1_d(primary, transport, count_of_newacks) ||
               sctp_cacc_skip_3_1_f(transport, count_of_newacks);
}

/*
 * SFR-CACC algorithm, step 3.2:
 * with CYCLING_CHANGEOVER equal to 1, a TSN t that is less than the
 * current primary's next_tsn_at_change MUST NOT have its missing report
 * count incremented.
 *
 * Returns 1 when the increment must be skipped, 0 otherwise.
 */
static inline int sctp_cacc_skip_3_2(struct sctp_transport *primary, __u32 tsn)
{
        return primary->cacc.cycling_changeover &&
               TSN_lt(tsn, primary->cacc.next_tsn_at_change);
}

/*
 * SFR-CACC algorithm, step 3:
 * if the missing report count for TSN t would be incremented according
 * to [RFC2960] and [SCTP_STEWART-2002] while CHANGEOVER_ACTIVE is set,
 * steps 3.1 and 3.2 are consulted to decide whether the increment
 * SHOULD NOT happen after all.
 *
 * 3.3) When neither 3.1 nor 3.2 forbids it, the sender SHOULD increment
 * the missing report count for t as usual.
 *
 * Returns 1 when the increment must be skipped, 0 when it may proceed
 * (which is always the case while changeover_active is clear).
 */
static inline int sctp_cacc_skip(struct sctp_transport *primary,
                                 struct sctp_transport *transport,
                                 int count_of_newacks,
                                 __u32 tsn)
{
        if (!primary->cacc.changeover_active)
                return 0;

        if (sctp_cacc_skip_3_1(primary, transport, count_of_newacks))
                return 1;

        return sctp_cacc_skip_3_2(primary, tsn) ? 1 : 0;
}

/* Prepare an already allocated sctp_outq for use.
 *
 * The structure is zeroed, bound to its association, all of its chunk
 * lists are set up empty, and finally the socket's default stream
 * scheduler is selected for the association.
 */
void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q)
{
        memset(q, 0, sizeof(*q));
        q->asoc = asoc;

        /* Queues of chunks that have not been sent yet. */
        INIT_LIST_HEAD(&q->out_chunk_list);
        INIT_LIST_HEAD(&q->control_chunk_list);

        /* Queues tracking chunks that were already sent at least once. */
        INIT_LIST_HEAD(&q->retransmit);
        INIT_LIST_HEAD(&q->sacked);
        INIT_LIST_HEAD(&q->abandoned);

        sctp_sched_set_sched(asoc, sctp_sk(asoc->base.sk)->default_ss);
}

/* Free the outqueue structure and any related pending chunks.
 */
static void __sctp_outq_teardown(struct sctp_outq *q)
{
        struct sctp_transport *transport;
        struct list_head *lchunk, *temp;
        struct sctp_chunk *chunk, *tmp;

        /* Throw away unacknowledged chunks. */
        list_for_each_entry(transport, &q->asoc->peer.transport_addr_list,
                        transports) {
                while ((lchunk = sctp_list_dequeue(&transport->transmitted)) != NULL) {
                        chunk = list_entry(lchunk, struct sctp_chunk,
                                           transmitted_list);
                        /* Mark as part of a failed message. */
                        sctp_chunk_fail(chunk, q->error);
                        sctp_chunk_free(chunk);
                }
        }

        /* Throw away chunks that have been gap ACKed.  */
        list_for_each_safe(lchunk, temp, &q->sacked) {
                list_del_init(lchunk);
                chunk = list_entry(lchunk, struct sctp_chunk,
                                   transmitted_list);
                sctp_chunk_fail(chunk, q->error);
                sctp_chunk_free(chunk);
        }

        /* Throw away any chunks in the retransmit queue. */
        list_for_each_safe(lchunk, temp, &q->retransmit) {
                list_del_init(lchunk);
                chunk = list_entry(lchunk, struct sctp_chunk,
                                   transmitted_list);
                sctp_chunk_fail(chunk, q->error);
                sctp_chunk_free(chunk);
        }

        /* Throw away any chunks that are in the abandoned queue. */
        list_for_each_safe(lchunk, temp, &q->abandoned) {
                list_del_init(lchunk);
                chunk = list_entry(lchunk, struct sctp_chunk,
                                   transmitted_list);
                sctp_chunk_fail(chunk, q->error);
                sctp_chunk_free(chunk);
        }

        /* Throw away any leftover data chunks. */
        while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
                sctp_sched_dequeue_done(q, chunk);

                /* Mark as send failure. */
                sctp_chunk_fail(chunk, q->error);
                sctp_chunk_free(chunk);
        }

        /* Throw away any leftover control chunks. */
        list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
                list_del_init(&chunk->list);
                sctp_chunk_free(chunk);
        }
}

/* Drop all pending chunks and bring the outqueue back to its freshly
 * initialized state, still bound to the same association.
 */
void sctp_outq_teardown(struct sctp_outq *q)
{
        struct sctp_association *asoc = q->asoc;

        __sctp_outq_teardown(q);
        sctp_outq_init(asoc, q);
}

/* Release every chunk still pending on the outqueue.
 *
 * This only delegates to __sctp_outq_teardown(); unlike
 * sctp_outq_teardown() the queue is not re-initialized afterwards, and
 * the sctp_outq structure itself is not freed here.
 */
void sctp_outq_free(struct sctp_outq *q)
{
        /* Throw away leftover chunks. */
        __sctp_outq_teardown(q);
}

/* Queue a new chunk on an sctp_outq.
 *
 * Control chunks go to the tail of control_chunk_list.  Data chunks go to
 * the tail of the data queue and are accounted in the ordered/unordered
 * MIB counters; when the peer is PR-SCTP capable and the chunk uses the
 * PRIO policy, the association's sent_cnt_removable is bumped as well.
 *
 * Unless the queue is corked, it is flushed right away using 'gfp'.
 */
void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
{
        struct net *net = q->asoc->base.net;

        pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk,
                 chunk && chunk->chunk_hdr ?
                 sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
                 "illegal chunk");

        if (!sctp_chunk_is_data(chunk)) {
                /* Not data: line it up with the other control chunks. */
                list_add_tail(&chunk->list, &q->control_chunk_list);
                SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
        } else {
                pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n",
                         __func__, q, chunk, chunk && chunk->chunk_hdr ?
                         sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
                         "illegal chunk");

                sctp_outq_tail_data(q, chunk);

                if (chunk->asoc->peer.prsctp_capable &&
                    SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
                        chunk->asoc->sent_cnt_removable++;

                if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
                        SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS);
                else
                        SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS);
        }

        if (q->cork)
                return;

        sctp_outq_flush(q, 0, gfp);
}

/* Insert a chunk into the sorted list based on the TSNs.  The retransmit list
 * and the abandoned list are in ascending order.
 */
static void sctp_insert_list(struct list_head *head, struct list_head *new)
{
        struct list_head *pos;
        struct sctp_chunk *nchunk, *lchunk;
        __u32 ntsn, ltsn;
        int done = 0;

        nchunk = list_entry(new, struct sctp_chunk, transmitted_list);
        ntsn = ntohl(nchunk->subh.data_hdr->tsn);

        list_for_each(pos, head) {
                lchunk = list_entry(pos, struct sctp_chunk, transmitted_list);
                ltsn = ntohl(lchunk->subh.data_hdr->tsn);
                if (TSN_lt(ntsn, ltsn)) {
                        list_add(new, pos->prev);
                        done = 1;
                        break;
                }
        }
        if (!done)
                list_add_tail(new, head);
}

static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
                                  struct sctp_sndrcvinfo *sinfo,
                                  struct list_head *queue, int msg_len)
{
        struct sctp_chunk *chk, *temp;

        list_for_each_entry_safe(chk, temp, queue, transmitted_list) {
                struct sctp_stream_out *streamout;

                if (!chk->msg->abandoned &&
                    (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
                     chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive))
                        continue;

                chk->msg->abandoned = 1;
                list_del_init(&chk->transmitted_list);
                sctp_insert_list(&asoc->outqueue.abandoned,
                                 &chk->transmitted_list);

                streamout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream);
                asoc->sent_cnt_removable--;
                asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++;
                streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++;

                if (queue != &asoc->outqueue.retransmit &&
                    !chk->tsn_gap_acked) {
                        if (chk->transport)
                                chk->transport->flight_size -=
                                                sctp_data_size(chk);
                        asoc->outqueue.outstanding_bytes -= sctp_data_size(chk);
                }

                msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk);
                if (msg_len <= 0)
                        break;
        }

        return msg_len;
}

/* Abandon not-yet-sent data chunks to make room for a new message.
 *
 * All streams are unscheduled for the duration of the walk and scheduled
 * again afterwards.  A chunk is dropped when its message is already
 * marked abandoned, or when it is a first fragment using the PRIO policy
 * whose sinfo_timetolive is greater than the one in 'sinfo'.
 *
 * Each dropped chunk reduces msg_len by its skb truesize plus
 * sizeof(struct sctp_chunk); the walk stops once msg_len drops to zero or
 * below.  The remaining msg_len is returned.
 */
static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
                                    struct sctp_sndrcvinfo *sinfo, int msg_len)
{
        struct sctp_outq *outq = &asoc->outqueue;
        struct sctp_chunk *chk, *next;
        struct sctp_stream_out *sout;

        outq->sched->unsched_all(&asoc->stream);

        list_for_each_entry_safe(chk, next, &outq->out_chunk_list, list) {
                if (!chk->msg->abandoned) {
                        if (!(chk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG))
                                continue;
                        if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags))
                                continue;
                        if (chk->sinfo.sinfo_timetolive <=
                            sinfo->sinfo_timetolive)
                                continue;
                }

                chk->msg->abandoned = 1;
                sctp_sched_dequeue_common(outq, chk);
                asoc->sent_cnt_removable--;
                asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;

                sout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream);
                sout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;

                /* Once the last fragment of the message is pruned, the
                 * stream must no longer be remembered as the current one.
                 */
                if (asoc->stream.out_curr == sout &&
                    list_is_last(&chk->frag_list, &chk->msg->chunks))
                        asoc->stream.out_curr = NULL;

                /* Account for the chunk before it is released. */
                msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk);
                sctp_chunk_free(chk);
                if (msg_len <= 0)
                        break;
        }

        outq->sched->sched_all(&asoc->stream);

        return msg_len;
}

/* Abandon the chunks according their priorities */
void sctp_prsctp_prune(struct sctp_association *asoc,
                       struct sctp_sndrcvinfo *sinfo, int msg_len)
{
        struct sctp_transport *transport;

        if (!asoc->peer.prsctp_capable || !asoc->sent_cnt_removable)
                return;

        msg_len = sctp_prsctp_prune_sent(asoc, sinfo,
                                         &asoc->outqueue.retransmit,
                                         msg_len);
        if (msg_len <= 0)
                return;

        list_for_each_entry(transport, &asoc->peer.transport_addr_list,
                            transports) {
                msg_len = sctp_prsctp_prune_sent(asoc, sinfo,
                                                 &transport->transmitted,
                                                 msg_len);
                if (msg_len <= 0)
                        return;
        }

        sctp_prsctp_prune_unsent(asoc, sinfo, msg_len);
}

/* Mark all the eligible packets on a transport for retransmission.
 *
 * Walks transport->transmitted and sorts each chunk into one of three
 * outcomes:
 *   - abandoned chunks move to q->abandoned (kept in TSN order);
 *   - chunks selected by 'reason' move to q->retransmit (kept in TSN
 *     order), with rwnd, outstanding bytes and flight size adjusted;
 *   - everything else stays on the transmitted list.
 *
 * For SCTP_RTXR_FAST_RTX only chunks flagged SCTP_NEED_FRTX are selected;
 * for any other reason every chunk that is not gap ACKed is selected.
 */
void sctp_retransmit_mark(struct sctp_outq *q,
                          struct sctp_transport *transport,
                          __u8 reason)
{
        struct list_head *lchunk, *ltemp;
        struct sctp_chunk *chunk;

        /* Walk through the specified transmitted queue.  */
        list_for_each_safe(lchunk, ltemp, &transport->transmitted) {
                chunk = list_entry(lchunk, struct sctp_chunk,
                                   transmitted_list);

                /* If the chunk is abandoned, move it to abandoned list. */
                if (sctp_chunk_abandoned(chunk)) {
                        list_del_init(lchunk);
                        sctp_insert_list(&q->abandoned, lchunk);

                        /* If this chunk has not been previously acked,
                         * stop considering it 'outstanding'.  Our peer
                         * will most likely never see it since it will
                         * not be retransmitted
                         */
                        if (!chunk->tsn_gap_acked) {
                                if (chunk->transport)
                                        chunk->transport->flight_size -=
                                                        sctp_data_size(chunk);
                                q->outstanding_bytes -= sctp_data_size(chunk);
                                q->asoc->peer.rwnd += sctp_data_size(chunk);
                        }
                        continue;
                }

                /* If we are doing retransmission due to a timeout or pmtu
                 * discovery, only the chunks that are not yet acked should
                 * be added to the retransmit queue.
                 */
                if ((reason == SCTP_RTXR_FAST_RTX  &&
                            (chunk->fast_retransmit == SCTP_NEED_FRTX)) ||
                    (reason != SCTP_RTXR_FAST_RTX  && !chunk->tsn_gap_acked)) {
                        /* RFC 2960 6.2.1 Processing a Received SACK
                         *
                         * C) Any time a DATA chunk is marked for
                         * retransmission (via either T3-rtx timer expiration
                         * (Section 6.3.3) or via fast retransmit
                         * (Section 7.2.4)), add the data size of those
                         * chunks to the rwnd.
                         */
                        q->asoc->peer.rwnd += sctp_data_size(chunk);
                        q->outstanding_bytes -= sctp_data_size(chunk);
                        /* NOTE(review): the guard tests chunk->transport but
                         * the flight size of the 'transport' argument is
                         * reduced; presumably both are the same for chunks
                         * on this list - confirm.
                         */
                        if (chunk->transport)
                                transport->flight_size -= sctp_data_size(chunk);

                        /* sctpimpguide-05 Section 2.8.2
                         * M5) If a T3-rtx timer expires, the
                         * 'TSN.Missing.Report' of all affected TSNs is set
                         * to 0.
                         */
                        chunk->tsn_missing_report = 0;

                        /* If a chunk that is being used for RTT measurement
                         * has to be retransmitted, we cannot use this chunk
                         * anymore for RTT measurements. Reset rto_pending so
                         * that a new RTT measurement is started when a new
                         * data chunk is sent.
                         */
                        if (chunk->rtt_in_progress) {
                                chunk->rtt_in_progress = 0;
                                transport->rto_pending = 0;
                        }

                        /* Move the chunk to the retransmit queue. The chunks
                         * on the retransmit queue are always kept in order.
                         */
                        list_del_init(lchunk);
                        sctp_insert_list(&q->retransmit, lchunk);
                }
        }

        /* Log the transport's congestion state after the marking pass. */
        pr_debug("%s: transport:%p, reason:%d, cwnd:%d, ssthresh:%d, "
                 "flight_size:%d, pba:%d\n", __func__, transport, reason,
                 transport->cwnd, transport->ssthresh, transport->flight_size,
                 transport->partial_bytes_acked);
}

/* Mark all the eligible packets on a transport for retransmission and force
 * one packet out.
 */
void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
                     enum sctp_retransmit_reason reason)
{
        struct net *net = q->asoc->base.net;

        switch (reason) {
        case SCTP_RTXR_T3_RTX:
                SCTP_INC_STATS(net, SCTP_MIB_T3_RETRANSMITS);
                sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_T3_RTX);
                /* Update the retran path if the T3-rtx timer has expired for
                 * the current retran path.
                 */
                if (transport == transport->asoc->peer.retran_path)
                        sctp_assoc_update_retran_path(transport->asoc);
                transport->asoc->rtx_data_chunks +=
                        transport->asoc->unack_data;
                if (transport->pl.state == SCTP_PL_COMPLETE &&
                    transport->asoc->unack_data)
                        sctp_transport_reset_probe_timer(transport);
                break;
        case SCTP_RTXR_FAST_RTX:
                SCTP_INC_STATS(net, SCTP_MIB_FAST_RETRANSMITS);
                sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_FAST_RTX);
                q->fast_rtx = 1;
                break;
        case SCTP_RTXR_PMTUD:
                SCTP_INC_STATS(net, SCTP_MIB_PMTUD_RETRANSMITS);
                break;
        case SCTP_RTXR_T1_RTX:
                SCTP_INC_STATS(net, SCTP_MIB_T1_RETRANSMITS);
                transport->asoc->init_retries++;
                break;
        default:
                BUG();
        }

        sctp_retransmit_mark(q, transport, reason);

        /* PR-SCTP A5) Any time the T3-rtx timer expires, on any destination,
         * the sender SHOULD try to advance the "Advanced.Peer.Ack.Point" by
         * following the procedures outlined in C1 - C5.
         */
        if (reason == SCTP_RTXR_T3_RTX)
                q->asoc->stream.si->generate_ftsn(q, q->asoc->ctsn_ack_point);

        /* Flush the queues only on timeout, since fast_rtx is only
         * triggered during sack processing and the queue
         * will be flushed at the end.
         */
        if (reason != SCTP_RTXR_FAST_RTX)
                sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC);
}

/*
 * Transmit DATA chunks on the retransmit queue.  Upon return from
 * __sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which
 * need to be transmitted by the caller.
 * We assume that pkt->transport has already been set.
 *
 * 'rtx_timeout' is non-zero when the flush follows a retransmission
 * timeout; together with q->fast_rtx it limits the flush to one packet.
 * '*start_timer' is set to 1 when at least one pass through the append
 * step completed while 'error' was still 0, and to 0 otherwise.
 *
 * The return value is a normal kernel error return value: 0, or the
 * result of the last sctp_packet_transmit() call whose value was kept.
 */
static int __sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
                                 int rtx_timeout, int *start_timer, gfp_t gfp)
{
        struct sctp_transport *transport = pkt->transport;
        struct sctp_chunk *chunk, *chunk1;
        struct list_head *lqueue;
        enum sctp_xmit status;
        int error = 0;
        int timer = 0;
        int done = 0;
        int fast_rtx;

        lqueue = &q->retransmit;
        /* Snapshot the hint; q->fast_rtx itself is cleared at the end. */
        fast_rtx = q->fast_rtx;

        /* This loop handles time-out retransmissions, fast retransmissions,
         * and retransmissions due to opening of window.
         *
         * RFC 2960 6.3.3 Handle T3-rtx Expiration
         *
         * E3) Determine how many of the earliest (i.e., lowest TSN)
         * outstanding DATA chunks for the address for which the
         * T3-rtx has expired will fit into a single packet, subject
         * to the MTU constraint for the path corresponding to the
         * destination transport address to which the retransmission
         * is being sent (this may be different from the address for
         * which the timer expires [see Section 6.4]). Call this value
         * K. Bundle and retransmit those K DATA chunks in a single
         * packet to the destination endpoint.
         *
         * [Just to be painfully clear, if we are retransmitting
         * because a timeout just happened, we should send only ONE
         * packet of retransmitted data.]
         *
         * For fast retransmissions we also send only ONE packet.  However,
         * if we are just flushing the queue due to open window, we'll
         * try to send as much as possible.
         */
        list_for_each_entry_safe(chunk, chunk1, lqueue, transmitted_list) {
                /* If the chunk is abandoned, move it to abandoned list. */
                if (sctp_chunk_abandoned(chunk)) {
                        list_del_init(&chunk->transmitted_list);
                        sctp_insert_list(&q->abandoned,
                                         &chunk->transmitted_list);
                        continue;
                }

                /* Make sure that Gap Acked TSNs are not retransmitted.  A
                 * simple approach is just to move such TSNs out of the
                 * way and into a 'transmitted' queue and skip to the
                 * next chunk.
                 */
                if (chunk->tsn_gap_acked) {
                        list_move_tail(&chunk->transmitted_list,
                                       &transport->transmitted);
                        continue;
                }

                /* If we are doing fast retransmit, ignore non-fast_retransmit
                 * chunks
                 */
                if (fast_rtx && !chunk->fast_retransmit)
                        continue;

redo:
                /* Attempt to append this chunk to the packet. */
                status = sctp_packet_append_chunk(pkt, chunk);

                switch (status) {
                case SCTP_XMIT_PMTU_FULL:
                        if (!pkt->has_data && !pkt->has_cookie_echo) {
                                /* If this packet did not contain DATA then
                                 * retransmission did not happen, so do it
                                 * again.  We'll ignore the error here since
                                 * control chunks are already freed so there
                                 * is nothing we can do.
                                 */
                                sctp_packet_transmit(pkt, gfp);
                                goto redo;
                        }

                        /* Send this packet.  */
                        error = sctp_packet_transmit(pkt, gfp);

                        /* If we are retransmitting, we should only
                         * send a single packet.
                         * Otherwise, try appending this chunk again.
                         */
                        if (rtx_timeout || fast_rtx)
                                done = 1;
                        else
                                goto redo;

                        /* Bundle next chunk in the next round.  */
                        break;

                case SCTP_XMIT_RWND_FULL:
                        /* Send this packet. */
                        error = sctp_packet_transmit(pkt, gfp);

                        /* Stop sending DATA as there is no more room
                         * at the receiver.
                         */
                        done = 1;
                        break;

                case SCTP_XMIT_DELAY:
                        /* Send this packet. */
                        error = sctp_packet_transmit(pkt, gfp);

                        /* Stop sending DATA because of nagle delay. */
                        done = 1;
                        break;

                default:
                        /* The append was successful, so add this chunk to
                         * the transmitted list.
                         */
                        list_move_tail(&chunk->transmitted_list,
                                       &transport->transmitted);

                        /* Mark the chunk as ineligible for fast retransmit
                         * after it is retransmitted.
                         */
                        if (chunk->fast_retransmit == SCTP_NEED_FRTX)
                                chunk->fast_retransmit = SCTP_DONT_FRTX;

                        q->asoc->stats.rtxchunks++;
                        break;
                }

                /* Set the timer if there were no errors */
                if (!error && !timer)
                        timer = 1;

                if (done)
                        break;
        }

        /* If we are here due to a retransmit timeout or a fast
         * retransmit and if there are any chunks left in the retransmit
         * queue that could not fit in the PMTU sized packet, they need
         * to be marked as ineligible for a subsequent fast retransmit.
         */
        if (rtx_timeout || fast_rtx) {
                list_for_each_entry(chunk1, lqueue, transmitted_list) {
                        if (chunk1->fast_retransmit == SCTP_NEED_FRTX)
                                chunk1->fast_retransmit = SCTP_DONT_FRTX;
                }
        }

        /* Tell the caller whether the timer condition above was met. */
        *start_timer = timer;

        /* Clear fast retransmit hint */
        if (fast_rtx)
                q->fast_rtx = 0;

        return error;
}

/* Uncork the outqueue: clear the cork flag and flush whatever has been
 * queued, using 'gfp' for the flush.  The flush happens whether or not
 * the queue was corked on entry.
 */
void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
{
        q->cork = 0;
        sctp_outq_flush(q, 0, gfp);
}

/* Send a single chunk in a packet of its own on 'transport'.
 *
 * The packet is built on the stack from the association's bound port,
 * the peer's port and the peer's init tag.  When the chunk cannot be
 * appended it is unlinked from its list and freed, and -ENOMEM is
 * returned; otherwise the result of sctp_packet_transmit() is returned.
 */
static int sctp_packet_singleton(struct sctp_transport *transport,
                                 struct sctp_chunk *chunk, gfp_t gfp)
{
        const struct sctp_association *asoc = transport->asoc;
        struct sctp_packet pkt;

        sctp_packet_init(&pkt, transport, asoc->base.bind_addr.port,
                         asoc->peer.port);
        sctp_packet_config(&pkt, asoc->peer.i.init_tag, 0);

        if (sctp_packet_append_chunk(&pkt, chunk) == SCTP_XMIT_OK)
                return sctp_packet_transmit(&pkt, gfp);

        /* The chunk did not make it into the packet: dispose of it. */
        list_del_init(&chunk->list);
        sctp_chunk_free(chunk);
        return -ENOMEM;
}

/* Struct to hold the context during sctp outq flush.
 *
 * Bundles the state that the flush helpers (such as
 * sctp_outq_select_transport()) share through a single pointer.
 */
struct sctp_flush_ctx {
        /* Outqueue this context refers to (presumably the one being
         * flushed - confirm at the point where the context is set up).
         */
        struct sctp_outq *q;
        /* Current transport being used. It's NOT the same as curr active one */
        struct sctp_transport *transport;
        /* These transports have chunks to send. */
        struct list_head transport_list;
        /* Association consulted for peer address lookup and for the
         * active path when a chunk has no transport of its own.
         */
        struct sctp_association *asoc;
        /* Packet on the current transport above */
        struct sctp_packet *packet;
        /* NOTE(review): presumably the allocation flags used while
         * flushing - confirm against the users of this field.
         */
        gfp_t gfp;
};

/* transport: current transport */
static void sctp_outq_select_transport(struct sctp_flush_ctx *ctx,
                                       struct sctp_chunk *chunk)
{
        struct sctp_transport *new_transport = chunk->transport;

        if (!new_transport) {
                if (!sctp_chunk_is_data(chunk)) {
                        /* If we have a prior transport pointer, see if
                         * the destination address of the chunk
                         * matches the destination address of the
                         * current transport.  If not a match, then
                         * try to look up the transport with a given
                         * destination address.  We do this because
                         * after processing ASCONFs, we may have new
                         * transports created.
                         */
                        if (ctx->transport && sctp_cmp_addr_exact(&chunk->dest,
                                                        &ctx->transport->ipaddr))
                                new_transport = ctx->transport;
                        else
                                new_transport = sctp_assoc_lookup_paddr(ctx->asoc,
                                                                  &chunk->dest);
                }

                /* if we still don't have a new transport, then
                 * use the current active path.
                 */
                if (!new_transport)
                        new_transport = ctx->asoc->peer.active_path;
        } else {
                __u8 type;

                switch (new_transport->state) {
                case SCTP_INACTIVE:
                case SCTP_UNCONFIRMED:
                case SCTP_PF:
                        /* If the chunk is Heartbeat or Heartbeat Ack,
                         * send it to chunk->transport, even if it's
                         * inactive.
                         *
                         * 3.3.6 Heartbeat Acknowledgement:
                         * ...
                         * A HEARTBEAT ACK is always sent to the source IP
                         * address of the IP datagram containing the
                         * HEARTBEAT chunk to which this ack is responding.
                         * ...
                         *
                         * ASCONF_ACKs also must be sent to the source.
                         */
                        type = chunk->chunk_hdr->type;
                        if (type != SCTP_CID_HEARTBEAT &&
                            type != SCTP_CID_HEARTBEAT_ACK &&
                            type != SCTP_CID_ASCONF_ACK)
                                new_transport = ctx->asoc->peer.active_path;
                        break;
                default:
                        break;
                }
        }

        /* Are we switching transports? Take care of transport locks. */
        if (new_transport != ctx->transport) {
                ctx->transport = new_transport;
                ctx->packet = &ctx->transport->packet;

                if (list_empty(&ctx->transport->send_ready))
                        list_add_tail(&ctx->transport->send_ready,
                                      &ctx->transport_list);

                sctp_packet_config(ctx->packet,
                                   ctx->asoc->peer.i.init_tag,
                                   ctx->asoc->peer.ecn_capable);
                /* We've switched transports, so apply the
                 * Burst limit to the new transport.
                 */
                sctp_transport_burst_limited(ctx->transport);
        }
}

/* Flush the control chunk list of ctx->q.
 *
 * Each eligible chunk is unlinked from the list, a transport is selected
 * for it, and it is then either sent in a packet of its own
 * (sctp_packet_singleton()) or handed to sctp_packet_transmit_chunk() for
 * bundling into the current transport's packet.
 *
 * The walk stops early when sending an INIT, INIT ACK or SHUTDOWN COMPLETE
 * singleton fails; the (positive) error is then stored in the socket's
 * sk_err.
 */
static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx)
{
        struct sctp_chunk *chunk, *tmp;
        enum sctp_xmit status;
        int one_packet, error;

        list_for_each_entry_safe(chunk, tmp, &ctx->q->control_chunk_list, list) {
                /* Reset per chunk; set below for "response" chunk types. */
                one_packet = 0;

                /* RFC 5061, 5.3
                 * F1) This means that until such time as the ASCONF
                 * containing the add is acknowledged, the sender MUST
                 * NOT use the new IP address as a source for ANY SCTP
                 * packet except on carrying an ASCONF Chunk.
                 *
                 * Skipped chunks stay on the control chunk list.
                 */
                if (ctx->asoc->src_out_of_asoc_ok &&
                    chunk->chunk_hdr->type != SCTP_CID_ASCONF)
                        continue;

                list_del_init(&chunk->list);

                /* Pick the right transport to use. Should always be true for
                 * the first chunk as we don't have a transport by then.
                 */
                sctp_outq_select_transport(ctx, chunk);

                switch (chunk->chunk_hdr->type) {
                /* 6.10 Bundling
                 *   ...
                 *   An endpoint MUST NOT bundle INIT, INIT ACK or SHUTDOWN
                 *   COMPLETE with any other chunks.  [Send them immediately.]
                 */
                case SCTP_CID_INIT:
                case SCTP_CID_INIT_ACK:
                case SCTP_CID_SHUTDOWN_COMPLETE:
                        error = sctp_packet_singleton(ctx->transport, chunk,
                                                      ctx->gfp);
                        if (error < 0) {
                                /* Report the failure on the socket and give
                                 * up on the rest of the control chunks.
                                 */
                                ctx->asoc->base.sk->sk_err = -error;
                                return;
                        }
                        ctx->asoc->stats.octrlchunks++;
                        break;

                case SCTP_CID_ABORT:
                        /* With the T bit set, the packet carries our own
                         * verification tag instead of the peer's.
                         */
                        if (sctp_test_T_bit(chunk))
                                ctx->packet->vtag = ctx->asoc->c.my_vtag;
                        fallthrough;

                /* The following chunks are "response" chunks, i.e.
                 * they are generated in response to something we
                 * received.  If we are sending these, then we can
                 * send only 1 packet containing these chunks.
                 */
                case SCTP_CID_HEARTBEAT_ACK:
                case SCTP_CID_SHUTDOWN_ACK:
                case SCTP_CID_COOKIE_ACK:
                case SCTP_CID_COOKIE_ECHO:
                case SCTP_CID_ERROR:
                case SCTP_CID_ECN_CWR:
                case SCTP_CID_ASCONF_ACK:
                        one_packet = 1;
                        fallthrough;

                case SCTP_CID_HEARTBEAT:
                        /* Chunks flagged as PMTU probes are sent in a packet
                         * of their own; a failure here is not reported on
                         * the socket, the chunk is just not counted.
                         */
                        if (chunk->pmtu_probe) {
                                error = sctp_packet_singleton(ctx->transport,
                                                              chunk, ctx->gfp);
                                if (!error)
                                        ctx->asoc->stats.octrlchunks++;
                                break;
                        }
                        fallthrough;
                case SCTP_CID_SACK:
                case SCTP_CID_SHUTDOWN:
                case SCTP_CID_ECN_ECNE:
                case SCTP_CID_ASCONF:
                case SCTP_CID_FWD_TSN:
                case SCTP_CID_I_FWD_TSN:
                case SCTP_CID_RECONF:
                        status = sctp_packet_transmit_chunk(ctx->packet, chunk,
                                                            one_packet, ctx->gfp);
                        if (status != SCTP_XMIT_OK) {
                                /* put the chunk back (at the head of the
                                 * list; the safe iterator already holds the
                                 * next entry, so it is not revisited).
                                 */
                                list_add(&chunk->list, &ctx->q->control_chunk_list);
                                break;
                        }

                        ctx->asoc->stats.octrlchunks++;
                        /* PR-SCTP C5) If a FORWARD TSN is sent, the
                         * sender MUST assure that at least one T3-rtx
                         * timer is running.
                         */
                        if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN ||
                            chunk->chunk_hdr->type == SCTP_CID_I_FWD_TSN) {
                                sctp_transport_reset_t3_rtx(ctx->transport);
                                ctx->transport->last_time_sent = jiffies;
                        }

                        /* The association's pending stream-reset request
                         * just went out: (re)arm the reconf timer.
                         */
                        if (chunk == ctx->asoc->strreset_chunk)
                                sctp_transport_reset_reconf_timer(ctx->transport);

                        break;

                default:
                        /* We built a chunk with an illegal type! */
                        BUG();
                }
        }
}

/* Returns false if new data shouldn't be sent */
static bool sctp_outq_flush_rtx(struct sctp_flush_ctx *ctx,
                                int rtx_timeout)
{
        int error, start_timer = 0;

        if (ctx->asoc->peer.retran_path->state == SCTP_UNCONFIRMED)
                return false;

        if (ctx->transport != ctx->asoc->peer.retran_path) {
                /* Switch transports & prepare the packet.  */
                ctx->transport = ctx->asoc->peer.retran_path;
                ctx->packet = &ctx->transport->packet;

                if (list_empty(&ctx->transport->send_ready))
                        list_add_tail(&ctx->transport->send_ready,
                                      &ctx->transport_list);

                sctp_packet_config(ctx->packet, ctx->asoc->peer.i.init_tag,
                                   ctx->asoc->peer.ecn_capable);
        }

        error = __sctp_outq_flush_rtx(ctx->q, ctx->packet, rtx_timeout,
                                      &start_timer, ctx->gfp);
        if (error < 0)
                ctx->asoc->base.sk->sk_err = -error;

        if (start_timer) {
                sctp_transport_reset_t3_rtx(ctx->transport);
                ctx->transport->last_time_sent = jiffies;
        }

        /* This can happen on COOKIE-ECHO resend.  Only
         * one chunk can get bundled with a COOKIE-ECHO.
         */
        if (ctx->packet->has_cookie_echo)
                return false;

        /* Don't send new data if there is still data
         * waiting to retransmit.
         */
        if (!list_empty(&ctx->q->retransmit))
                return false;

        return true;
}

/* Flush DATA chunks of ctx->q: pending retransmissions first, then new data.
 *
 * Nothing is sent unless the association state allows data.  New chunks
 * are dequeued one at a time, a transport is selected for each and the
 * chunk is handed to sctp_packet_transmit_chunk().  The loop ends when the
 * queue is empty, a chunk's stream is closed, a chunk cannot be appended,
 * or one DATA chunk has been bundled with a COOKIE-ECHO.
 */
static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
                                 int rtx_timeout)
{
        struct sctp_chunk *chunk;
        enum sctp_xmit status;

        /* Is it OK to send data chunks?  */
        switch (ctx->asoc->state) {
        case SCTP_STATE_COOKIE_ECHOED:
                /* Only allow bundling when this packet has a COOKIE-ECHO
                 * chunk.
                 */
                if (!ctx->packet || !ctx->packet->has_cookie_echo)
                        return;

                fallthrough;
        case SCTP_STATE_ESTABLISHED:
        case SCTP_STATE_SHUTDOWN_PENDING:
        case SCTP_STATE_SHUTDOWN_RECEIVED:
                break;

        default:
                /* Do nothing. */
                return;
        }

        /* RFC 2960 6.1  Transmission of DATA Chunks
         *
         * C) When the time comes for the sender to transmit,
         * before sending new DATA chunks, the sender MUST
         * first transmit any outstanding DATA chunks which
         * are marked for retransmission (limited by the
         * current cwnd).
         */
        if (!list_empty(&ctx->q->retransmit) &&
            !sctp_outq_flush_rtx(ctx, rtx_timeout))
                return;

        /* Apply Max.Burst limitation to the current transport in
         * case it will be used for new data.  We are going to
         * reset it before we return, but we want to apply the limit
         * to the currently queued data.
         */
        if (ctx->transport)
                sctp_transport_burst_limited(ctx->transport);

        /* Finally, transmit new packets.  */
        while ((chunk = sctp_outq_dequeue_data(ctx->q)) != NULL) {
                __u32 sid = ntohs(chunk->subh.data_hdr->stream);
                __u8 stream_state = SCTP_SO(&ctx->asoc->stream, sid)->state;

                /* Has this chunk expired? */
                if (sctp_chunk_abandoned(chunk)) {
                        sctp_sched_dequeue_done(ctx->q, chunk);
                        sctp_chunk_fail(chunk, 0);
                        sctp_chunk_free(chunk);
                        continue;
                }

                /* The chunk's stream is closed: put the chunk back at the
                 * head of the queue and stop sending new data.
                 */
                if (stream_state == SCTP_STREAM_CLOSED) {
                        sctp_outq_head_data(ctx->q, chunk);
                        break;
                }

                sctp_outq_select_transport(ctx, chunk);

                pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p skb->users:%d\n",
                         __func__, ctx->q, chunk, chunk && chunk->chunk_hdr ?
                         sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
                         "illegal chunk", ntohl(chunk->subh.data_hdr->tsn),
                         chunk->skb ? chunk->skb->head : NULL, chunk->skb ?
                         refcount_read(&chunk->skb->users) : -1);

                /* Add the chunk to the packet.  */
                status = sctp_packet_transmit_chunk(ctx->packet, chunk, 0,
                                                    ctx->gfp);
                if (status != SCTP_XMIT_OK) {
                        /* We could not append this chunk, so put
                         * the chunk back on the output queue.
                         */
                        pr_debug("%s: could not transmit tsn:0x%x, status:%d\n",
                                 __func__, ntohl(chunk->subh.data_hdr->tsn),
                                 status);

                        sctp_outq_head_data(ctx->q, chunk);
                        break;
                }

                /* The sender is in the SHUTDOWN-PENDING state,
                 * The sender MAY set the I-bit in the DATA
                 * chunk header.
                 */
                if (ctx->asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
                        chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM;
                /* Account the chunk as unordered or ordered outbound data. */
                if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
                        ctx->asoc->stats.ouodchunks++;
                else
                        ctx->asoc->stats.oodchunks++;

                /* Only now it's safe to consider this
                 * chunk as sent, sched-wise.
                 */
                sctp_sched_dequeue_done(ctx->q, chunk);

                /* Track the chunk on the transport it was sent on. */
                list_add_tail(&chunk->transmitted_list,
                              &ctx->transport->transmitted);

                sctp_transport_reset_t3_rtx(ctx->transport);
                ctx->transport->last_time_sent = jiffies;

                /* Only let one DATA chunk get bundled with a
                 * COOKIE-ECHO chunk.
                 */
                if (ctx->packet->has_cookie_echo)
                        break;
        }
}

/* Drain ctx->transport_list: transmit the pending packet of every transport
 * that was marked as having something to send, then clear that transport's
 * burst limited state.
 *
 * A transmit failure is recorded (as a positive value) in the socket's
 * sk_err; the remaining transports are still processed.
 */
static void sctp_outq_flush_transports(struct sctp_flush_ctx *ctx)
{
        struct sock *sk = ctx->asoc->base.sk;
        struct list_head *entry;

        for (;;) {
                struct sctp_transport *t;
                int rc;

                entry = sctp_list_dequeue(&ctx->transport_list);
                if (!entry)
                        break;

                t = list_entry(entry, struct sctp_transport, send_ready);

                if (!sctp_packet_empty(&t->packet)) {
                        /* Refresh the socket's dst-derived capabilities
                         * when the transport's dst differs from the one
                         * currently attached to the socket.
                         */
                        rcu_read_lock();
                        if (t->dst && __sk_dst_get(sk) != t->dst) {
                                dst_hold(t->dst);
                                sk_setup_caps(sk, t->dst);
                        }
                        rcu_read_unlock();

                        rc = sctp_packet_transmit(&t->packet, ctx->gfp);
                        if (rc < 0)
                                ctx->q->asoc->base.sk->sk_err = -rc;
                }

                /* Clear the burst limited state, if any */
                sctp_transport_burst_reset(t);
        }
}

/* Try to flush an outqueue.
 *
 * Description: Send everything in q which we legally can, subject to
 * congestion limitations.
 *
 * Note: This function can be called from multiple contexts so appropriate
 * locking concerns must be made.  Today we use the sock lock to protect
 * this function.
 */
static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
{
        /* No transport or packet has been chosen yet, so both of those
         * members are left to start out as NULL.
         */
        struct sctp_flush_ctx ctx = {
                .q = q,
                .asoc = q->asoc,
                .gfp = gfp,
                .transport_list = LIST_HEAD_INIT(ctx.transport_list),
        };

        /* 6.10 Bundling
         *   ...
         *   When bundling control chunks with DATA chunks, an
         *   endpoint MUST place control chunks first in the outbound
         *   SCTP packet.  The transmitter MUST transmit DATA chunks
         *   within a SCTP packet in increasing order of TSN.
         *   ...
         */
        sctp_outq_flush_ctrl(&ctx);

        /* DATA is skipped entirely while src_out_of_asoc_ok is set. */
        if (!q->asoc->src_out_of_asoc_ok)
                sctp_outq_flush_data(&ctx, rtx_timeout);

        /* Transmit whatever has been gathered on each touched transport. */
        sctp_outq_flush_transports(&ctx);
}

/* Recompute assoc->unack_data from an incoming SACK.
 *
 * The starting value is the number of TSNs between the cumulative TSN ack
 * point and next_tsn; the length of every gap ack block carried in @sack
 * (which follows the fixed SACK header) is then subtracted from it.
 */
static void sctp_sack_update_unack_data(struct sctp_association *assoc,
                                        struct sctp_sackhdr *sack)
{
        union sctp_sack_variable *blocks =
                (union sctp_sack_variable *)(sack + 1);
        int nr_blocks = ntohs(sack->num_gap_ack_blocks);
        __u16 remaining = assoc->next_tsn - assoc->ctsn_ack_point - 1;
        int idx;

        for (idx = 0; idx < nr_blocks; idx++)
                remaining -= ntohs(blocks[idx].gab.end) -
                             ntohs(blocks[idx].gab.start) + 1;

        assoc->unack_data = remaining;
}

/* This is where we REALLY process a SACK.
 *
 * Process the SACK against the outqueue.  Mostly, this just frees
 * things off the transmitted queue.
 *
 * @q:     outqueue the SACK applies to
 * @chunk: the received SACK chunk (its header is chunk->subh.sack_hdr)
 *
 * Besides freeing acknowledged chunks this updates the SFR-CACC state,
 * asoc->highest_sacked, the cumulative TSN ack point, unack_data and the
 * peer's rwnd, and lets the stream interleave ops generate a FORWARD TSN.
 *
 * Returns the result of sctp_outq_is_empty(q) after processing.
 */
int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
{
        struct sctp_association *asoc = q->asoc;
        struct sctp_sackhdr *sack = chunk->subh.sack_hdr;
        struct sctp_transport *transport;
        struct sctp_chunk *tchunk = NULL;
        struct list_head *lchunk, *transport_list, *temp;
        __u32 sack_ctsn, ctsn, tsn;
        __u32 highest_tsn, highest_new_tsn;
        __u32 sack_a_rwnd;
        unsigned int outstanding;
        struct sctp_transport *primary = asoc->peer.primary_path;
        int count_of_newacks = 0;
        int gap_ack_blocks;
        u8 accum_moved = 0;

        /* Grab the association's destination address list. */
        transport_list = &asoc->peer.transport_addr_list;

        /* SCTP path tracepoint for congestion control debugging. */
        if (trace_sctp_probe_path_enabled()) {
                list_for_each_entry(transport, transport_list, transports)
                        trace_sctp_probe_path(transport, asoc);
        }

        sack_ctsn = ntohl(sack->cum_tsn_ack);
        gap_ack_blocks = ntohs(sack->num_gap_ack_blocks);
        asoc->stats.gapcnt += gap_ack_blocks;
        /*
         * SFR-CACC algorithm:
         * On receipt of a SACK the sender SHOULD execute the
         * following statements.
         *
         * 1) If the cumulative ack in the SACK passes next tsn_at_change
         * on the current primary, the CHANGEOVER_ACTIVE flag SHOULD be
         * cleared. The CYCLING_CHANGEOVER flag SHOULD also be cleared for
         * all destinations.
         * 2) If the SACK contains gap acks and the flag CHANGEOVER_ACTIVE
         * is set the receiver of the SACK MUST take the following actions:
         *
         * A) Initialize the cacc_saw_newack to 0 for all destination
         * addresses.
         *
         * Only bother if changeover_active is set. Otherwise, this is
         * totally suboptimal to do on every SACK.
         */
        if (primary->cacc.changeover_active) {
                u8 clear_cycling = 0;

                if (TSN_lte(primary->cacc.next_tsn_at_change, sack_ctsn)) {
                        primary->cacc.changeover_active = 0;
                        clear_cycling = 1;
                }

                if (clear_cycling || gap_ack_blocks) {
                        list_for_each_entry(transport, transport_list,
                                        transports) {
                                if (clear_cycling)
                                        transport->cacc.cycling_changeover = 0;
                                if (gap_ack_blocks)
                                        transport->cacc.cacc_saw_newack = 0;
                        }
                }
        }

        /* Get the highest TSN in the sack: the cumulative TSN plus the
         * end offset of the last gap ack block, when there is one.
         */
        highest_tsn = sack_ctsn;
        if (gap_ack_blocks) {
                union sctp_sack_variable *frags =
                        (union sctp_sack_variable *)(sack + 1);

                highest_tsn += ntohs(frags[gap_ack_blocks - 1].gab.end);
        }

        if (TSN_lt(asoc->highest_sacked, highest_tsn))
                asoc->highest_sacked = highest_tsn;

        highest_new_tsn = sack_ctsn;

        /* Run through the retransmit queue.  Credit bytes received
         * and free those chunks that we can.
         */
        sctp_check_transmitted(q, &q->retransmit, NULL, NULL, sack, &highest_new_tsn);

        /* Run through the transmitted queue.
         * Credit bytes received and free those chunks which we can.
         *
         * This is a MASSIVE candidate for optimization.
         */
        list_for_each_entry(transport, transport_list, transports) {
                sctp_check_transmitted(q, &transport->transmitted,
                                       transport, &chunk->source, sack,
                                       &highest_new_tsn);
                /*
                 * SFR-CACC algorithm:
                 * C) Let count_of_newacks be the number of
                 * destinations for which cacc_saw_newack is set.
                 */
                if (transport->cacc.cacc_saw_newack)
                        count_of_newacks++;
        }

        /* Move the Cumulative TSN Ack Point if appropriate.  */
        if (TSN_lt(asoc->ctsn_ack_point, sack_ctsn)) {
                asoc->ctsn_ack_point = sack_ctsn;
                accum_moved = 1;
        }

        if (gap_ack_blocks) {

                if (asoc->fast_recovery && accum_moved)
                        highest_new_tsn = highest_tsn;

                list_for_each_entry(transport, transport_list, transports)
                        sctp_mark_missing(q, &transport->transmitted, transport,
                                          highest_new_tsn, count_of_newacks);
        }

        /* Update unack_data field in the assoc. */
        sctp_sack_update_unack_data(asoc, sack);

        ctsn = asoc->ctsn_ack_point;

        /* Throw away stuff rotting on the sack queue.  */
        list_for_each_safe(lchunk, temp, &q->sacked) {
                tchunk = list_entry(lchunk, struct sctp_chunk,
                                    transmitted_list);
                tsn = ntohl(tchunk->subh.data_hdr->tsn);
                if (TSN_lte(tsn, ctsn)) {
                        list_del_init(&tchunk->transmitted_list);
                        /* The PR-SCTP priority flag must be read from the
                         * DATA chunk being released (tchunk), not from
                         * @chunk, which is the incoming SACK.
                         * NOTE(review): presumably this undoes an increment
                         * made when the DATA chunk was sent - confirm
                         * against the code that bumps sent_cnt_removable.
                         */
                        if (asoc->peer.prsctp_capable &&
                            SCTP_PR_PRIO_ENABLED(tchunk->sinfo.sinfo_flags))
                                asoc->sent_cnt_removable--;
                        sctp_chunk_free(tchunk);
                }
        }

        /* ii) Set rwnd equal to the newly received a_rwnd minus the
         *     number of bytes still outstanding after processing the
         *     Cumulative TSN Ack and the Gap Ack Blocks.
         */

        sack_a_rwnd = ntohl(sack->a_rwnd);
        asoc->peer.zero_window_announced = !sack_a_rwnd;
        outstanding = q->outstanding_bytes;

        /* Clamp at zero rather than letting the subtraction wrap. */
        if (outstanding < sack_a_rwnd)
                sack_a_rwnd -= outstanding;
        else
                sack_a_rwnd = 0;

        asoc->peer.rwnd = sack_a_rwnd;

        asoc->stream.si->generate_ftsn(q, sack_ctsn);

        pr_debug("%s: sack cumulative tsn ack:0x%x\n", __func__, sack_ctsn);
        pr_debug("%s: cumulative tsn ack of assoc:%p is 0x%x, "
                 "advertised peer ack point:0x%x\n", __func__, asoc, ctsn,
                 asoc->adv_peer_ack_point);

        return sctp_outq_is_empty(q);
}

/* Is the outqueue empty?
 *
 * The queue counts as empty only when there is no pending data
 * (out_qlen), no in-flight data (outstanding_bytes) and nothing waiting
 * on the retransmit list.  Returns 1 if so, 0 otherwise.
 */
int sctp_outq_is_empty(const struct sctp_outq *q)
{
        if (q->out_qlen != 0)
                return 0;

        if (q->outstanding_bytes != 0)
                return 0;

        return list_empty(&q->retransmit) ? 1 : 0;
}

/********************************************************************
 * 2nd Level Abstractions
 ********************************************************************/

/* Go through a transport's transmitted list or the association's retransmit
 * list and move chunks that are acked by the Cumulative TSN Ack to q->sacked.
 * The retransmit list will not have an associated transport.
 *
 * @q:                 outqueue whose sacked/abandoned lists and
 *                     outstanding_bytes are updated
 * @transmitted_queue: list to scan; every entry is dequeued, and the chunks
 *                     that are kept are spliced back onto it at the end
 * @transport:         transport the list belongs to, or NULL (see above);
 *                     all per-transport accounting is skipped when NULL
 * @saddr:             address compared against transport->ipaddr before an
 *                     inactive/unconfirmed transport is marked up
 * @sack:              SACK header being processed
 * @highest_new_tsn_in_sack: in/out; raised to the highest TSN that this SACK
 *                     acknowledges for the first time
 *
 * Each chunk ends up in exactly one place:
 *   - abandoned chunks go to q->abandoned;
 *   - chunks covered by the Cumulative TSN Ack go to q->sacked;
 *   - everything else (gap-acked or not acked) is collected on a temporary
 *     list and put back on @transmitted_queue.
 *
 * I added coherent debug information output.        --xguo
 *
 * Instead of printing 'sacked' or 'kept' for each TSN on the
 * transmitted_queue, we print a range: SACKED: TSN1-TSN2, TSN3, TSN4-TSN5.
 * KEPT TSN6-TSN7, etc.
 */
static void sctp_check_transmitted(struct sctp_outq *q,
                                   struct list_head *transmitted_queue,
                                   struct sctp_transport *transport,
                                   union sctp_addr *saddr,
                                   struct sctp_sackhdr *sack,
                                   __u32 *highest_new_tsn_in_sack)
{
        struct list_head *lchunk;
        struct sctp_chunk *tchunk;
        struct list_head tlist;
        __u32 tsn;
        __u32 sack_ctsn;
        __u32 rtt;
        __u8 restart_timer = 0;
        int bytes_acked = 0;
        int migrate_bytes = 0;
        bool forward_progress = false;

        sack_ctsn = ntohl(sack->cum_tsn_ack);

        /* tlist collects the chunks that must stay on transmitted_queue. */
        INIT_LIST_HEAD(&tlist);

        /* The while loop will skip empty transmitted queues. */
        while (NULL != (lchunk = sctp_list_dequeue(transmitted_queue))) {
                tchunk = list_entry(lchunk, struct sctp_chunk,
                                    transmitted_list);

                if (sctp_chunk_abandoned(tchunk)) {
                        /* Move the chunk to abandoned list. */
                        sctp_insert_list(&q->abandoned, lchunk);

                        /* If this chunk has not been acked, stop
                         * considering it as 'outstanding'.
                         */
                        if (transmitted_queue != &q->retransmit &&
                            !tchunk->tsn_gap_acked) {
                                if (tchunk->transport)
                                        tchunk->transport->flight_size -=
                                                        sctp_data_size(tchunk);
                                q->outstanding_bytes -= sctp_data_size(tchunk);
                        }
                        continue;
                }

                tsn = ntohl(tchunk->subh.data_hdr->tsn);
                if (sctp_acked(sack, tsn)) {
                        /* If this queue is the retransmit queue, the
                         * retransmit timer has already reclaimed
                         * the outstanding bytes for this chunk, so only
                         * count bytes associated with a transport.
                         */
                        if (transport && !tchunk->tsn_gap_acked) {
                                /* If this chunk is being used for RTT
                                 * measurement, calculate the RTT and update
                                 * the RTO using this value.
                                 *
                                 * 6.3.1 C5) Karn's algorithm: RTT measurements
                                 * MUST NOT be made using packets that were
                                 * retransmitted (and thus for which it is
                                 * ambiguous whether the reply was for the
                                 * first instance of the packet or a later
                                 * instance).
                                 */
                                if (!sctp_chunk_retransmitted(tchunk) &&
                                    tchunk->rtt_in_progress) {
                                        tchunk->rtt_in_progress = 0;
                                        rtt = jiffies - tchunk->sent_at;
                                        sctp_transport_update_rto(transport,
                                                                  rtt);
                                }

                                if (TSN_lte(tsn, sack_ctsn)) {
                                        /*
                                         * SFR-CACC algorithm:
                                         * 2) If the SACK contains gap acks
                                         * and the flag CHANGEOVER_ACTIVE is
                                         * set the receiver of the SACK MUST
                                         * take the following action:
                                         *
                                         * B) For each TSN t being acked that
                                         * has not been acked in any SACK so
                                         * far, set cacc_saw_newack to 1 for
                                         * the destination that the TSN was
                                         * sent to.
                                         */
                                        if (sack->num_gap_ack_blocks &&
                                            q->asoc->peer.primary_path->cacc.
                                            changeover_active)
                                                transport->cacc.cacc_saw_newack
                                                        = 1;
                                }
                        }

                        /* If the chunk hasn't been marked as ACKED,
                         * mark it and account bytes_acked if the
                         * chunk had a valid transport (it will not
                         * have a transport if ASCONF had deleted it
                         * while DATA was outstanding).
                         *
                         * Chunks without a transport are counted in both
                         * bytes_acked and migrate_bytes here; the
                         * difference is taken after the loop.
                         */
                        if (!tchunk->tsn_gap_acked) {
                                tchunk->tsn_gap_acked = 1;
                                if (TSN_lt(*highest_new_tsn_in_sack, tsn))
                                        *highest_new_tsn_in_sack = tsn;
                                bytes_acked += sctp_data_size(tchunk);
                                if (!tchunk->transport)
                                        migrate_bytes += sctp_data_size(tchunk);
                                forward_progress = true;
                        }

                        if (TSN_lte(tsn, sack_ctsn)) {
                                /* RFC 2960  6.3.2 Retransmission Timer Rules
                                 *
                                 * R3) Whenever a SACK is received
                                 * that acknowledges the DATA chunk
                                 * with the earliest outstanding TSN
                                 * for that address, restart T3-rtx
                                 * timer for that address with its
                                 * current RTO.
                                 */
                                restart_timer = 1;
                                forward_progress = true;

                                list_add_tail(&tchunk->transmitted_list,
                                              &q->sacked);
                        } else {
                                /* RFC2960 7.2.4, sctpimpguide-05 2.8.2
                                 * M2) Each time a SACK arrives reporting
                                 * 'Stray DATA chunk(s)' record the highest TSN
                                 * reported as newly acknowledged, call this
                                 * value 'HighestTSNinSack'. A newly
                                 * acknowledged DATA chunk is one not
                                 * previously acknowledged in a SACK.
                                 *
                                 * When the SCTP sender of data receives a SACK
                                 * chunk that acknowledges, for the first time,
                                 * the receipt of a DATA chunk, all the still
                                 * unacknowledged DATA chunks whose TSN is
                                 * older than that newly acknowledged DATA
                                 * chunk, are qualified as 'Stray DATA chunks'.
                                 *
                                 * Gap-acked only: keep the chunk on the
                                 * transmitted queue.
                                 */
                                list_add_tail(lchunk, &tlist);
                        }
                } else {
                        if (tchunk->tsn_gap_acked) {
                                pr_debug("%s: receiver reneged on data TSN:0x%x\n",
                                         __func__, tsn);

                                tchunk->tsn_gap_acked = 0;

                                /* Undo the credit given when this chunk was
                                 * gap-acked earlier.
                                 */
                                if (tchunk->transport)
                                        bytes_acked -= sctp_data_size(tchunk);

                                /* RFC 2960 6.3.2 Retransmission Timer Rules
                                 *
                                 * R4) Whenever a SACK is received missing a
                                 * TSN that was previously acknowledged via a
                                 * Gap Ack Block, start T3-rtx for the
                                 * destination address to which the DATA
                                 * chunk was originally
                                 * transmitted if it is not already running.
                                 */
                                restart_timer = 1;
                        }

                        /* Not acked by this SACK: keep the chunk. */
                        list_add_tail(lchunk, &tlist);
                }
        }

        if (transport) {
                if (bytes_acked) {
                        struct sctp_association *asoc = transport->asoc;

                        /* We may have counted DATA that was migrated
                         * to this transport due to DEL-IP operation.
                         * Subtract those bytes, since they were never
                         * sent on this transport and shouldn't be
                         * credited to this transport.
                         */
                        bytes_acked -= migrate_bytes;

                        /* 8.2. When an outstanding TSN is acknowledged,
                         * the endpoint shall clear the error counter of
                         * the destination transport address to which the
                         * DATA chunk was last sent.
                         * The association's overall error counter is
                         * also cleared.
                         */
                        transport->error_count = 0;
                        transport->asoc->overall_error_count = 0;
                        forward_progress = true;

                        /*
                         * While in SHUTDOWN PENDING, we may have started
                         * the T5 shutdown guard timer after reaching the
                         * retransmission limit. Stop that timer as soon
                         * as the receiver acknowledged any data.
                         */
                        if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING &&
                            del_timer(&asoc->timers
                                [SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]))
                                        sctp_association_put(asoc);

                        /* Mark the destination transport address as
                         * active if it is not so marked.
                         */
                        if ((transport->state == SCTP_INACTIVE ||
                             transport->state == SCTP_UNCONFIRMED) &&
                            sctp_cmp_addr_exact(&transport->ipaddr, saddr)) {
                                sctp_assoc_control_transport(
                                        transport->asoc,
                                        transport,
                                        SCTP_TRANSPORT_UP,
                                        SCTP_RECEIVED_SACK);
                        }

                        sctp_transport_raise_cwnd(transport, sack_ctsn,
                                                  bytes_acked);

                        /* The transport is debited only for its own bytes;
                         * the queue-wide counter also drops the migrated
                         * bytes that were subtracted above.
                         */
                        transport->flight_size -= bytes_acked;
                        if (transport->flight_size == 0)
                                transport->partial_bytes_acked = 0;
                        q->outstanding_bytes -= bytes_acked + migrate_bytes;
                } else {
                        /* RFC 2960 6.1, sctpimpguide-06 2.15.2
                         * When a sender is doing zero window probing, it
                         * should not timeout the association if it continues
                         * to receive new packets from the receiver. The
                         * reason is that the receiver MAY keep its window
                         * closed for an indefinite time.
                         * A sender is doing zero window probing when the
                         * receiver's advertised window is zero, and there is
                         * only one data chunk in flight to the receiver.
                         *
                         * Allow the association to timeout while in SHUTDOWN
                         * PENDING or SHUTDOWN RECEIVED in case the receiver
                         * stays in zero window mode forever.
                         */
                        if (!q->asoc->peer.rwnd &&
                            !list_empty(&tlist) &&
                            (sack_ctsn+2 == q->asoc->next_tsn) &&
                            q->asoc->state < SCTP_STATE_SHUTDOWN_PENDING) {
                                pr_debug("%s: sack received for zero window "
                                         "probe:%u\n", __func__, sack_ctsn);

                                q->asoc->overall_error_count = 0;
                                transport->error_count = 0;
                        }
                }

                /* RFC 2960 6.3.2 Retransmission Timer Rules
                 *
                 * R2) Whenever all outstanding data sent to an address have
                 * been acknowledged, turn off the T3-rtx timer of that
                 * address.
                 *
                 * The put/hold calls below keep the transport reference
                 * paired with the timer being pending.
                 */
                if (!transport->flight_size) {
                        if (del_timer(&transport->T3_rtx_timer))
                                sctp_transport_put(transport);
                } else if (restart_timer) {
                        if (!mod_timer(&transport->T3_rtx_timer,
                                       jiffies + transport->rto))
                                sctp_transport_hold(transport);
                }

                if (forward_progress) {
                        if (transport->dst)
                                sctp_transport_dst_confirm(transport);
                }
        }

        /* Put the chunks that were kept back on the caller's list. */
        list_splice(&tlist, transmitted_queue);
}

/* Mark chunks as missing; they may consequently get retransmitted.
 *
 * Walks @transmitted_queue, bumps tsn_missing_report on every chunk the
 * SACK skipped over, and flags chunks that reached three reports for
 * fast retransmit.  When @transport is non-NULL and at least one chunk
 * on the list is flagged, the fast retransmit is started on that
 * transport.
 */
static void sctp_mark_missing(struct sctp_outq *q,
                              struct list_head *transmitted_queue,
                              struct sctp_transport *transport,
                              __u32 highest_new_tsn_in_sack,
                              int count_of_newacks)
{
        struct sctp_transport *primary_path = q->asoc->peer.primary_path;
        struct sctp_chunk *ch;
        bool need_frtx = false;
        __u32 tsn;

        list_for_each_entry(ch, transmitted_queue, transmitted_list) {
                tsn = ntohl(ch->subh.data_hdr->tsn);

                /* RFC 2960 7.2.4, sctpimpguide-05 2.8.2 M3) An
                 * unacknowledged TSN below 'HighestTSNinSack' gets its
                 * 'TSN.Missing.Report' incremented, provided it has NOT
                 * been fast retransmitted or marked for it already, and
                 * provided SFR-CACC does not ask us to skip it (only
                 * consulted when there is a transport).
                 */
                if (ch->fast_retransmit == SCTP_CAN_FRTX &&
                    !ch->tsn_gap_acked &&
                    TSN_lt(tsn, highest_new_tsn_in_sack) &&
                    (!transport ||
                     !sctp_cacc_skip(primary_path, ch->transport,
                                     count_of_newacks, tsn))) {
                        ch->tsn_missing_report++;

                        pr_debug("%s: tsn:0x%x missing counter:%d\n",
                                 __func__, tsn, ch->tsn_missing_report);
                }

                /* M4) A chunk with 'TSN.Missing.Report' of 3 or more is
                 * marked for retransmission and triggers the fast
                 * retransmit procedure.
                 */
                if (ch->tsn_missing_report >= 3) {
                        ch->fast_retransmit = SCTP_NEED_FRTX;
                        need_frtx = true;
                }
        }

        /* Without a transport there is nothing to retransmit on. */
        if (!transport)
                return;

        if (need_frtx)
                sctp_retransmit(q, transport, SCTP_RTXR_FAST_RTX);

        pr_debug("%s: transport:%p, cwnd:%d, ssthresh:%d, "
                 "flight_size:%d, pba:%d\n",  __func__, transport,
                 transport->cwnd, transport->ssthresh,
                 transport->flight_size, transport->partial_bytes_acked);
}

/* Is the given TSN acked by this packet?
 *
 * @sack: SACK header; the Gap Ack Blocks are read from the memory
 *        immediately following it
 * @tsn:  TSN to look up, in host byte order
 *
 * Returns 1 when @tsn is covered by the Cumulative TSN Ack or by one of
 * the Gap Ack Blocks, 0 otherwise.
 */
static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
{
        __u32 ctsn = ntohl(sack->cum_tsn_ack);
        union sctp_sack_variable *frags;
        __u32 tsn_offset;
        __u16 blocks;
        int i;

        if (TSN_lte(tsn, ctsn))
                return 1;

        /* 3.3.4 Selective Acknowledgment (SACK) (3):
         *
         * Gap Ack Blocks:
         *  These fields contain the Gap Ack Blocks. They are repeated
         *  for each Gap Ack Block up to the number of Gap Ack Blocks
         *  defined in the Number of Gap Ack Blocks field. All DATA
         *  chunks with TSNs greater than or equal to (Cumulative TSN
         *  Ack + Gap Ack Block Start) and less than or equal to
         *  (Cumulative TSN Ack + Gap Ack Block End) of each Gap Ack
         *  Block are assumed to have been received correctly.
         */

        frags = (union sctp_sack_variable *)(sack + 1);
        blocks = ntohs(sack->num_gap_ack_blocks);

        /* Keep the full 32-bit distance.  The block bounds are 16-bit, so
         * a TSN more than 65535 ahead of the cumulative ack can never be
         * covered; truncating the distance to 16 bits would let such a
         * TSN alias into a block and be reported as acked.
         */
        tsn_offset = tsn - ctsn;
        for (i = 0; i < blocks; ++i) {
                if (tsn_offset >= ntohs(frags[i].gab.start) &&
                    tsn_offset <= ntohs(frags[i].gab.end))
                        return 1;
        }

        return 0;
}

/* Find the slot in @skiplist that already records @stream.
 *
 * Returns the index of the first of the @nskips entries whose stream
 * matches, or @nskips itself when none does (0 when @nskips <= 0).
 */
static inline int sctp_get_skip_pos(struct sctp_fwdtsn_skip *skiplist,
                                    int nskips, __be16 stream)
{
        int pos = 0;

        while (pos < nskips && skiplist[pos].stream != stream)
                pos++;

        return pos;
}

/* Create and add a fwdtsn chunk to the outq's control queue if needed.
 *
 * @q:    outqueue whose abandoned list is examined
 * @ctsn: Cumulative TSN Ack from the received SACK, host byte order
 *
 * Does nothing when the peer is not PR-SCTP capable.  Otherwise it
 * advances asoc->adv_peer_ack_point, frees abandoned chunks already
 * covered by @ctsn, and queues a FORWARD TSN chunk when the ack point
 * ends up ahead of @ctsn.
 */
void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
{
        struct sctp_association *asoc = q->asoc;
        struct sctp_chunk *ftsn_chunk = NULL;
        struct sctp_fwdtsn_skip ftsn_skip_arr[10];
        int nskips = 0;
        int skip_pos = 0;
        __u32 tsn;
        struct sctp_chunk *chunk;
        struct list_head *lchunk, *temp;

        if (!asoc->peer.prsctp_capable)
                return;

        /* PR-SCTP C1) Let SackCumAck be the Cumulative TSN ACK carried in the
         * received SACK.
         *
         * If (Advanced.Peer.Ack.Point < SackCumAck), then update
         * Advanced.Peer.Ack.Point to be equal to SackCumAck.
         */
        if (TSN_lt(asoc->adv_peer_ack_point, ctsn))
                asoc->adv_peer_ack_point = ctsn;

        /* PR-SCTP C2) Try to further advance the "Advanced.Peer.Ack.Point"
         * locally, that is, to move "Advanced.Peer.Ack.Point" up as long as
         * the chunk next in the out-queue space is marked as "abandoned" as
         * shown in the following example:
         *
         * Assuming that a SACK arrived with the Cumulative TSN ACK 102
         * and the Advanced.Peer.Ack.Point is updated to this value:
         *
         *   out-queue at the end of  ==>   out-queue after Adv.Ack.Point
         *   normal SACK processing           local advancement
         *                ...                           ...
         *   Adv.Ack.Pt-> 102 acked                     102 acked
         *                103 abandoned                 103 abandoned
         *                104 abandoned     Adv.Ack.P-> 104 abandoned
         *                105                           105
         *                106 acked                     106 acked
         *                ...                           ...
         *
         * In this example, the data sender successfully advanced the
         * "Advanced.Peer.Ack.Point" from 102 to 104 locally.
         */
        list_for_each_safe(lchunk, temp, &q->abandoned) {
                chunk = list_entry(lchunk, struct sctp_chunk,
                                        transmitted_list);
                tsn = ntohl(chunk->subh.data_hdr->tsn);

                /* Remove any chunks in the abandoned queue that are acked by
                 * the ctsn.
                 */
                if (TSN_lte(tsn, ctsn)) {
                        list_del_init(lchunk);
                        sctp_chunk_free(chunk);
                        continue;
                }

                /* Stop at the first abandoned chunk that does not directly
                 * follow the current ack point.
                 */
                if (!TSN_lte(tsn, asoc->adv_peer_ack_point + 1))
                        break;

                asoc->adv_peer_ack_point = tsn;

                /* Unordered chunks get no stream/ssn skip entry. */
                if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
                        continue;

                /* One entry per stream: reuse the slot already holding this
                 * stream so that the latest ssn overwrites the earlier one,
                 * otherwise take the next free slot.
                 */
                skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips,
                                             chunk->subh.data_hdr->stream);
                ftsn_skip_arr[skip_pos].stream = chunk->subh.data_hdr->stream;
                ftsn_skip_arr[skip_pos].ssn = chunk->subh.data_hdr->ssn;
                if (skip_pos == nskips)
                        nskips++;

                /* The skip array is full; do not advance any further. */
                if (nskips == 10)
                        break;
        }

        /* PR-SCTP C3) If, after step C1 and C2, the "Advanced.Peer.Ack.Point"
         * is greater than the Cumulative TSN ACK carried in the received
         * SACK, the data sender MUST send the data receiver a FORWARD TSN
         * chunk containing the latest value of the
         * "Advanced.Peer.Ack.Point".
         *
         * C4) For each "abandoned" TSN the sender of the FORWARD TSN SHOULD
         * list each stream and sequence number in the forwarded TSN. This
         * information will enable the receiver to easily find any
         * stranded TSN's waiting on stream reorder queues. Each stream
         * SHOULD only be reported once; this means that if multiple
         * abandoned messages occur in the same stream then only the
         * highest abandoned stream sequence number is reported. If the
         * total size of the FORWARD TSN does NOT fit in a single MTU then
         * the sender of the FORWARD TSN SHOULD lower the
         * Advanced.Peer.Ack.Point to the last TSN that will fit in a
         * single MTU.
         *
         * TSNs wrap at 32 bits, so "greater than" must be evaluated with
         * serial-number arithmetic like every other TSN comparison in this
         * function; a plain '>' would suppress the FORWARD TSN whenever the
         * ack point has wrapped past zero while ctsn has not.
         */
        if (TSN_lt(ctsn, asoc->adv_peer_ack_point))
                ftsn_chunk = sctp_make_fwdtsn(asoc, asoc->adv_peer_ack_point,
                                              nskips, &ftsn_skip_arr[0]);

        if (ftsn_chunk) {
                list_add_tail(&ftsn_chunk->list, &q->control_chunk_list);
                SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS);
        }
}




























    5 






    5 





    3 























    5 




    1 













    1 




































    1 

    1 





    1 
    1 































    4 

    4 




    2 
    2 















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#ifndef _LINUX_MMAP_LOCK_H
#define _LINUX_MMAP_LOCK_H

#include <linux/lockdep.h>
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/rwsem.h>
#include <linux/tracepoint-defs.h>
#include <linux/types.h>

/*
 * Designated-initializer fragment that statically initializes the mmap_lock
 * member of the object called @name.  The expansion ends with a comma, so it
 * is meant to be placed inside a struct initializer list.
 */
#define MMAP_LOCK_INITIALIZER(name) \
        .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),

/*
 * Tracepoints whose enabled state is tested with tracepoint_enabled() by the
 * __mmap_lock_trace_*() inline helpers in this header.
 */
DECLARE_TRACEPOINT(mmap_lock_start_locking);
DECLARE_TRACEPOINT(mmap_lock_acquire_returned);
DECLARE_TRACEPOINT(mmap_lock_released);

#ifdef CONFIG_TRACING

/*
 * Out-of-line helpers, defined outside this header.  The inline wrappers in
 * this header call them only after checking that the matching tracepoint is
 * enabled.
 */
void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write);
void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
                                           bool success);
void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write);

/*
 * Report that an attempt to take @mm's mmap_lock is starting.  @write is
 * true for a write lock and false for a read lock.  Nothing is done unless
 * the mmap_lock_start_locking tracepoint is enabled.
 */
static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
                                                   bool write)
{
        if (!tracepoint_enabled(mmap_lock_start_locking))
                return;

        __mmap_lock_do_trace_start_locking(mm, write);
}

/*
 * Report that an acquisition attempt on @mm's mmap_lock has returned.
 * @write tells which mode was requested and @success whether the lock was
 * obtained.  Nothing is done unless the mmap_lock_acquire_returned
 * tracepoint is enabled.
 */
static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
                                                      bool write, bool success)
{
        if (!tracepoint_enabled(mmap_lock_acquire_returned))
                return;

        __mmap_lock_do_trace_acquire_returned(mm, write, success);
}

/*
 * Report that @mm's mmap_lock is being released; @write is true for the
 * write side.  Nothing is done unless the mmap_lock_released tracepoint is
 * enabled.
 */
static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
        if (!tracepoint_enabled(mmap_lock_released))
                return;

        __mmap_lock_do_trace_released(mm, write);
}

#else /* !CONFIG_TRACING */

/* !CONFIG_TRACING: no-op stand-in so callers need no #ifdef. */
static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
                                                   bool write)
{
}

/* !CONFIG_TRACING: no-op stand-in so callers need no #ifdef. */
static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
                                                      bool write, bool success)
{
}

/* !CONFIG_TRACING: no-op stand-in so callers need no #ifdef. */
static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
}

#endif /* CONFIG_TRACING */

/* Assert, via rwsem_assert_held(), that @mm's mmap_lock is held. */
static inline void mmap_assert_locked(const struct mm_struct *mm)
{
        const struct rw_semaphore *lock = &mm->mmap_lock;

        rwsem_assert_held(lock);
}

/*
 * Assert, via rwsem_assert_held_write(), that @mm's mmap_lock is held for
 * writing.
 */
static inline void mmap_assert_write_locked(const struct mm_struct *mm)
{
        const struct rw_semaphore *lock = &mm->mmap_lock;

        rwsem_assert_held_write(lock);
}

#ifdef CONFIG_PER_VMA_LOCK
/*
 * Drop all currently-held per-VMA locks.
 * This is called from the mmap_lock implementation directly before releasing
 * a write-locked mmap_lock (or downgrading it to read-locked).
 * This should normally NOT be called manually from other places.
 * If you want to call this manually anyway, keep in mind that this will release
 * *all* VMA write locks, including ones from further up the stack.
 *
 * The caller must hold @mm's mmap_lock for writing; this is asserted on
 * entry.
 */
static inline void vma_end_write_all(struct mm_struct *mm)
{
        mmap_assert_write_locked(mm);
        /*
         * Nobody can concurrently modify mm->mm_lock_seq due to exclusive
         * mmap_lock being held.
         * We need RELEASE semantics here to ensure that preceding stores into
         * the VMA take effect before we unlock it with this store.
         * Pairs with ACQUIRE semantics in vma_start_read().
         */
        smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1);
}
#else
/* !CONFIG_PER_VMA_LOCK: there are no per-VMA locks to drop. */
static inline void vma_end_write_all(struct mm_struct *mm) {}
#endif

/*
 * Initialize @mm's mmap_lock at run time (the static counterpart is
 * MMAP_LOCK_INITIALIZER()).
 */
static inline void mmap_init_lock(struct mm_struct *mm)
{
        /*
         * NOTE(review): init_rwsem() is presumably a macro that may use the
         * text of its argument (e.g. as a lock name) -- confirm before
         * replacing this expression with a local variable.
         */
        init_rwsem(&mm->mmap_lock);
}

static inline void mmap_write_lock(struct mm_struct *mm)
{
        __mmap_lock_trace_start_locking(mm, true);
        down_write(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, true, true);
}

static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
        __mmap_lock_trace_start_locking(mm, true);
        down_write_nested(&mm->mmap_lock, subclass);
        __mmap_lock_trace_acquire_returned(mm, true, true);
}

/*
 * Take @mm's mmap_lock for writing with down_write_killable().
 *
 * Returns whatever down_write_killable() returned; a return of 0 is what
 * gets reported to the tracepoint as a successful acquisition.
 */
static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
        int err;

        __mmap_lock_trace_start_locking(mm, true);
        err = down_write_killable(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, true, !err);

        return err;
}

static inline void mmap_write_unlock(struct mm_struct *mm)
{
        __mmap_lock_trace_released(mm, true);
        vma_end_write_all(mm);
        up_write(&mm->mmap_lock);
}

static inline void mmap_write_downgrade(struct mm_struct *mm)
{
        __mmap_lock_trace_acquire_returned(mm, false, true);
        vma_end_write_all(mm);
        downgrade_write(&mm->mmap_lock);
}

static inline void mmap_read_lock(struct mm_struct *mm)
{
        __mmap_lock_trace_start_locking(mm, false);
        down_read(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, false, true);
}

/*
 * Take @mm's mmap_lock for reading with down_read_killable().
 *
 * Returns whatever down_read_killable() returned; a return of 0 is what
 * gets reported to the tracepoint as a successful acquisition.
 */
static inline int mmap_read_lock_killable(struct mm_struct *mm)
{
        int err;

        __mmap_lock_trace_start_locking(mm, false);
        err = down_read_killable(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, false, !err);

        return err;
}

/*
 * Try to take @mm's mmap_lock for reading via down_read_trylock().
 *
 * Returns true when down_read_trylock() returned non-zero, false otherwise;
 * the same value is reported to the acquire-returned tracepoint.
 */
static inline bool mmap_read_trylock(struct mm_struct *mm)
{
        bool locked;

        __mmap_lock_trace_start_locking(mm, false);
        locked = !!down_read_trylock(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, false, locked);

        return locked;
}

static inline void mmap_read_unlock(struct mm_struct *mm)
{
        __mmap_lock_trace_released(mm, false);
        up_read(&mm->mmap_lock);
}

static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
{
        __mmap_lock_trace_released(mm, false);
        up_read_non_owner(&mm->mmap_lock);
}

/* Return rwsem_is_contended()'s verdict for @mm's mmap_lock. */
static inline int mmap_lock_is_contended(struct mm_struct *mm)
{
        struct rw_semaphore *lock = &mm->mmap_lock;

        return rwsem_is_contended(lock);
}

#endif /* _LINUX_MMAP_LOCK_H */























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NETFILTER_NETDEV_H_
#define _NETFILTER_NETDEV_H_

#include <linux/netfilter.h>
#include <linux/netdevice.h>

#ifdef CONFIG_NETFILTER_INGRESS
static inline bool nf_hook_ingress_active(const struct sk_buff *skb)
{
#ifdef CONFIG_JUMP_LABEL
        if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS]))
                return false;
#endif
        return rcu_access_pointer(skb->dev->nf_hooks_ingress);
}

/* caller must hold rcu_read_lock */
static inline int nf_hook_ingress(struct sk_buff *skb)
{
        struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress);
        struct nf_hook_state state;
        int ret;

        /* Must recheck the ingress hook head, in the event it became NULL
         * after the check in nf_hook_ingress_active evaluated to true.
         */
        if (unlikely(!e))
                return 0;

        nf_hook_state_init(&state, NF_NETDEV_INGRESS,
                           NFPROTO_NETDEV, skb->dev, NULL, NULL,
                           dev_net(skb->dev), NULL);
        ret = nf_hook_slow(skb, &state, e, 0);
        if (ret == 0)
                return -1;

        return ret;
}

#else /* CONFIG_NETFILTER_INGRESS */
/*
 * Stub used when CONFIG_NETFILTER_INGRESS is disabled: the ingress hook is
 * never active.  The prototype mirrors the CONFIG_NETFILTER_INGRESS=y variant
 * (bool return, const @skb) so that callers see the same interface in both
 * configurations.
 */
static inline bool nf_hook_ingress_active(const struct sk_buff *skb)
{
        return false;
}

/*
 * Stub used when CONFIG_NETFILTER_INGRESS is disabled: @skb is not touched
 * and 0 is always returned.
 */
static inline int nf_hook_ingress(struct sk_buff *skb)
{
        const int no_hooks = 0;

        return no_hooks;
}
#endif /* CONFIG_NETFILTER_INGRESS */

#ifdef CONFIG_NETFILTER_EGRESS
/*
 * Tell whether the netdev egress hook may be in use.
 *
 * With jump labels this is the state of the NFPROTO_NETDEV/NF_NETDEV_EGRESS
 * static key; without them it is unconditionally true.
 */
static inline bool nf_hook_egress_active(void)
{
#ifdef CONFIG_JUMP_LABEL
        return static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_EGRESS]);
#else
        return true;
#endif
}

/**
 * nf_hook_egress - classify packets before transmission
 * @skb: packet to be classified
 * @rc: result code which shall be returned by __dev_queue_xmit() on failure
 * @dev: netdev whose egress hooks shall be applied to @skb
 *
 * Returns @skb on success or %NULL if the packet was consumed or filtered.
 * When %NULL is returned, @rc has been set to %NET_XMIT_DROP (negative result
 * from nf_hook_slow()) or %NET_XMIT_SUCCESS (any other non-accept result).
 * Caller must hold rcu_read_lock.
 *
 * Ordering relative to tc: on ingress, tc classifies first and netfilter
 * second; on egress the order is reversed for symmetry.  Conceptually,
 * netfilter is layered above tc, so a packet which tc redirects to another
 * interface stays on the tc layer and netfilter is not applied to it.
 *
 * Whether netfilter is applied on egress is controlled by the nf_skip_egress
 * flag, which __netif_receive_skb_core() and __dev_queue_xmit() update as the
 * packet passes through tc and netfilter.  Tunnel drivers such as vxlan may
 * call __dev_queue_xmit() recursively, therefore the flag is reverted to
 * false after sch_handle_egress().  This ensures that netfilter is applied
 * both on the overlay and underlying network.
 */
static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc,
                                             struct net_device *dev)
{
        struct nf_hook_entries *hooks;
        struct nf_hook_state state;
        int verdict;

#ifdef CONFIG_NETFILTER_SKIP_EGRESS
        /* Packet was flagged to bypass the egress hook */
        if (skb->nf_skip_egress)
                return skb;
#endif

        hooks = rcu_dereference_check(dev->nf_hooks_egress, rcu_read_lock_bh_held());
        if (!hooks)
                return skb;

        nf_hook_state_init(&state, NF_NETDEV_EGRESS,
                           NFPROTO_NETDEV, NULL, dev, NULL,
                           dev_net(dev), NULL);

        /* nf assumes rcu_read_lock, not just read_lock_bh */
        rcu_read_lock();
        verdict = nf_hook_slow(skb, &state, hooks, 0);
        rcu_read_unlock();

        /* Accepted: hand the packet back for transmission */
        if (verdict == 1)
                return skb;

        /* Negative result means dropped; everything else counts as success */
        *rc = (verdict < 0) ? NET_XMIT_DROP : NET_XMIT_SUCCESS;
        return NULL;
}
#else /* CONFIG_NETFILTER_EGRESS */
/*
 * Stub used when CONFIG_NETFILTER_EGRESS is disabled: the egress hook is
 * never active.
 */
static inline bool nf_hook_egress_active(void)
{
        const bool active = false;

        return active;
}

/*
 * Stub used when CONFIG_NETFILTER_EGRESS is disabled: @skb is handed back
 * unchanged; @rc and @dev are not used.
 */
static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc,
                                             struct net_device *dev)
{
        struct sk_buff *unfiltered = skb;

        return unfiltered;
}
#endif /* CONFIG_NETFILTER_EGRESS */

/*
 * Record in @skb whether the netfilter egress hook shall be skipped for it.
 * Compiles to nothing unless CONFIG_NETFILTER_SKIP_EGRESS is set.
 */
static inline void nf_skip_egress(struct sk_buff *skb, bool skip)
{
#if defined(CONFIG_NETFILTER_SKIP_EGRESS)
        skb->nf_skip_egress = skip;
#endif
}

/*
 * Initialise @dev's netfilter hook heads to NULL.  Each head is only
 * touched when the matching ingress/egress option is configured.
 */
static inline void nf_hook_netdev_init(struct net_device *dev)
{
#if defined(CONFIG_NETFILTER_INGRESS)
        RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL);
#endif
#if defined(CONFIG_NETFILTER_EGRESS)
        RCU_INIT_POINTER(dev->nf_hooks_egress, NULL);
#endif
}

#endif /* _NETFILTER_NETDEV_H_ */


























    3 









    2 
















































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the TCP protocol.
 *
 * Version:        @(#)tcp.h        1.0.2        04/28/93
 *
 * Author:        Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _LINUX_TCP_H
#define _LINUX_TCP_H


#include <linux/skbuff.h>
#include <linux/win_minmax.h>
#include <net/sock.h>
#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
#include <uapi/linux/tcp.h>

/* View @skb's transport header as a TCP header. */
static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb)
{
        void *transport = skb_transport_header(skb);

        return (struct tcphdr *)transport;
}

/* TCP header length in bytes: the doff field scaled by 4. */
static inline unsigned int __tcp_hdrlen(const struct tcphdr *th)
{
        unsigned int words = th->doff;

        return words * 4;
}

/* Length in bytes of the TCP header found at @skb's transport header. */
static inline unsigned int tcp_hdrlen(const struct sk_buff *skb)
{
        const struct tcphdr *th = tcp_hdr(skb);

        return __tcp_hdrlen(th);
}

/* View @skb's inner transport header as a TCP header. */
static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb)
{
        void *inner_transport = skb_inner_transport_header(skb);

        return (struct tcphdr *)inner_transport;
}

static inline unsigned int inner_tcp_hdrlen(const struct sk_buff *skb)
{
        return inner_tcp_hdr(skb)->doff * 4;
}

/**
 * skb_tcp_all_headers - Returns size of all headers for a TCP packet
 * @skb: buffer
 *
 * Sum of the transport header offset and the TCP header length.
 * Used in TX path, for a packet known to be a TCP one.
 *
 * if (skb_is_gso(skb)) {
 *         int hlen = skb_tcp_all_headers(skb);
 *         ...
 */
static inline int skb_tcp_all_headers(const struct sk_buff *skb)
{
        int hlen = skb_transport_offset(skb);

        hlen += tcp_hdrlen(skb);
        return hlen;
}

/**
 * skb_inner_tcp_all_headers - Returns size of all headers for an encap TCP packet
 * @skb: buffer
 *
 * Sum of the inner transport header offset and the inner TCP header length.
 * Used in TX path, for a packet known to be a TCP one.
 *
 * if (skb_is_gso(skb) && skb->encapsulation) {
 *         int hlen = skb_inner_tcp_all_headers(skb);
 *         ...
 */
static inline int skb_inner_tcp_all_headers(const struct sk_buff *skb)
{
        int hlen = skb_inner_transport_offset(skb);

        hlen += inner_tcp_hdrlen(skb);
        return hlen;
}

static inline unsigned int tcp_optlen(const struct sk_buff *skb)
{
        return (tcp_hdr(skb)->doff - 5) * 4;
}

/* TCP Fast Open cookie size limits */
#define TCP_FASTOPEN_COOKIE_MIN        4        /* Min Fast Open Cookie size in bytes */
#define TCP_FASTOPEN_COOKIE_MAX        16        /* Max Fast Open Cookie size in bytes */
#define TCP_FASTOPEN_COOKIE_SIZE 8        /* Cookie size in bytes employed by this implementation */

/* TCP Fast Open Cookie as stored in memory.
 * @val provides room for TCP_FASTOPEN_COOKIE_MAX bytes, rounded up to a
 * whole number of 64-bit words.
 */
struct tcp_fastopen_cookie {
        __le64        val[DIV_ROUND_UP(TCP_FASTOPEN_COOKIE_MAX, sizeof(u64))];
        s8        len;        /* signed cookie length; presumably in bytes - confirm */
        bool        exp;        /* In RFC6994 experimental option format */
};

/* This defines a selective acknowledgement block as carried in the TCP
 * option: both sequence numbers are big-endian (__be32).
 */
struct tcp_sack_block_wire {
        __be32        start_seq;
        __be32        end_seq;
};

/* Selective acknowledgement block with plain u32 sequence numbers
 * (counterpart of struct tcp_sack_block_wire, which uses __be32).
 */
struct tcp_sack_block {
        u32        start_seq;
        u32        end_seq;
};

/* These are used to set the sack_ok field in struct tcp_options_received */
#define TCP_SACK_SEEN     (1 << 0)   /* 1 = peer is SACK capable */
#define TCP_DSACK_SEEN    (1 << 2)   /* 1 = DSACK was received from peer */

/* TCP options state: values parsed from received segments together with
 * the MSS settings (user_mss, mss_clamp) kept alongside them.
 */
struct tcp_options_received {
/*        PAWS/RTTM data        */
        int        ts_recent_stamp;/* Time we stored ts_recent (for aging) */
        u32        ts_recent;        /* Time stamp to echo next                */
        u32        rcv_tsval;        /* Time stamp value                     */
        u32        rcv_tsecr;        /* Time stamp echo reply                */
        u16         saw_tstamp : 1,        /* Saw TIMESTAMP on last packet                */
                tstamp_ok : 1,        /* TIMESTAMP seen on SYN packet                */
                dsack : 1,        /* D-SACK is scheduled                        */
                wscale_ok : 1,        /* Wscale seen on SYN packet                */
                sack_ok : 3,        /* SACK seen on SYN packet; holds TCP_SACK_SEEN / TCP_DSACK_SEEN bits */
                smc_ok : 1,        /* SMC seen on SYN packet                */
                snd_wscale : 4,        /* Window scaling received from sender        */
                rcv_wscale : 4;        /* Window scaling to send to receiver        */
        u8        saw_unknown:1,        /* Received unknown option                */
                unused:7;
        u8        num_sacks;        /* Number of SACK blocks                */
        u16        user_mss;        /* mss requested by user in ioctl        */
        u16        mss_clamp;        /* Maximal mss, negotiated at connection setup */
};

/*
 * Reset the negotiated-option state in @rx_opt: timestamp, SACK and window
 * scaling flags plus snd_wscale are zeroed, and smc_ok as well when SMC is
 * enabled.  Other fields are left untouched.
 */
static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
{
        rx_opt->sack_ok = 0;
        rx_opt->tstamp_ok = 0;
        rx_opt->snd_wscale = 0;
        rx_opt->wscale_ok = 0;
#if IS_ENABLED(CONFIG_SMC)
        rx_opt->smc_ok = 0;
#endif
}

/* This is the max number of SACK blocks that we'll generate and process.
 * It's safe to increase this, although since:
 *   size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8)
 * only four blocks (4 + 4 * 8 = 36 bytes) will fit in the 40 bytes of option
 * space of a standard TCP header.
 */
#define TCP_NUM_SACKS 4

struct tcp_request_sock_ops;

/* TCP-specific request sock.  @req is the first member, and tcp_rsk()
 * obtains this structure by casting a struct request_sock pointer.
 */
struct tcp_request_sock {
        struct inet_request_sock         req;
        const struct tcp_request_sock_ops *af_specific;
        u64                                snt_synack; /* first SYNACK sent time */
        bool                                tfo_listener;
        bool                                is_mptcp;
        bool                                req_usec_ts;
#if IS_ENABLED(CONFIG_MPTCP)
        bool                                drop_req;
#endif
        u32                                txhash;
        u32                                rcv_isn;
        u32                                snt_isn;
        u32                                ts_off;
        u32                                last_oow_ack_time; /* last SYNACK */
        u32                                rcv_nxt; /* the ack # by SYNACK. For
                                                  * FastOpen it's the seq#
                                                  * after data-in-SYN.
                                                  */
        u8                                syn_tos;
#ifdef CONFIG_TCP_AO
        /* TCP-AO state, only present with CONFIG_TCP_AO */
        u8                                ao_keyid;
        u8                                ao_rcv_next;
        bool                                used_tcp_ao; /* read by tcp_rsk_used_ao() */
#endif
};

/* Cast a request_sock to its tcp_request_sock form (drops the const). */
static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
{
        struct tcp_request_sock *treq = (struct tcp_request_sock *)req;

        return treq;
}

/*
 * Report the used_tcp_ao flag of @req.  Always false when the kernel is
 * built without CONFIG_TCP_AO.
 */
static inline bool tcp_rsk_used_ao(const struct request_sock *req)
{
#ifdef CONFIG_TCP_AO
        return tcp_rsk(req)->used_tcp_ao;
#else
        return false;
#endif
}

/* NOTE(review): presumably the fixed-point shift that goes with
 * tcp_sock::scaling_ratio - confirm against tcp_win_from_space().
 */
#define TCP_RMEM_TO_WIN_SCALE 8

/* Per-socket TCP state.  It starts with an inet_connection_sock; the
 * hot-path fields that follow are gathered into named cacheline groups
 * (__cacheline_group_begin/__cacheline_group_end), split by TX/RX and by
 * read-mostly versus read-write use.
 */
struct tcp_sock {
        /* Cacheline organization can be found documented in
         * Documentation/networking/net_cachelines/tcp_sock.rst.
         * Please update the document when adding new fields.
         */

        /* inet_connection_sock has to be the first member of tcp_sock */
        struct inet_connection_sock        inet_conn;

        /* TX read-mostly hotpath cache lines */
        __cacheline_group_begin(tcp_sock_read_tx);
        u32        max_window;        /* Maximal window ever seen from peer        */
        u32        rcv_ssthresh;        /* Current window clamp                        */
        u32        reordering;        /* Packet reordering metric.                */
        u32        notsent_lowat;        /* TCP_NOTSENT_LOWAT */
        u16        gso_segs;        /* Max number of segs per GSO packet        */
        /* from STCP, retrans queue hinting */
        struct sk_buff *lost_skb_hint;
        struct sk_buff *retransmit_skb_hint;
        __cacheline_group_end(tcp_sock_read_tx);

        /* TXRX read-mostly hotpath cache lines */
        __cacheline_group_begin(tcp_sock_read_txrx);
        u32        tsoffset;        /* timestamp offset */
        u32        snd_wnd;        /* The window we expect to receive        */
        u32        mss_cache;        /* Cached effective mss, not including SACKS */
        u32        snd_cwnd;        /* Sending congestion window                */
        u32        prr_out;        /* Total number of pkts sent during Recovery. */
        u32        lost_out;        /* Lost packets                        */
        u32        sacked_out;        /* SACK'd packets                        */
        u16        tcp_header_len;        /* Bytes of tcp header to send                */
        u8        scaling_ratio;        /* see tcp_win_from_space() */
        u8        chrono_type : 2,        /* current chronograph type */
                repair      : 1,
                tcp_usec_ts : 1, /* TSval values in usec */
                is_sack_reneg:1,    /* in recovery from loss with SACK reneg? */
                is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
        __cacheline_group_end(tcp_sock_read_txrx);

        /* RX read-mostly hotpath cache lines */
        __cacheline_group_begin(tcp_sock_read_rx);
        u32        copied_seq;        /* Head of yet unread data */
        u32        rcv_tstamp;        /* timestamp of last received ACK (for keepalives) */
        u32        snd_wl1;        /* Sequence for window update                */
        u32        tlp_high_seq;        /* snd_nxt at the time of TLP */
        u32        rttvar_us;        /* smoothed mdev_max                        */
        u32        retrans_out;        /* Retransmitted packets out                */
        u16        advmss;                /* Advertised MSS                        */
        u16        urg_data;        /* Saved octet of OOB data and control flags */
        u32        lost;                /* Total data packets lost incl. rexmits */
        struct  minmax rtt_min;
        /* OOO segments go in this rbtree. Socket lock must be held. */
        struct rb_root        out_of_order_queue;
        u32        snd_ssthresh;        /* Slow start size threshold                */
        u8        recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */
        __cacheline_group_end(tcp_sock_read_rx);

        /* TX read-write hotpath cache lines */
        __cacheline_group_begin(tcp_sock_write_tx) ____cacheline_aligned;
        u32        segs_out;        /* RFC4898 tcpEStatsPerfSegsOut
                                 * The total number of segments sent.
                                 */
        u32        data_segs_out;        /* RFC4898 tcpEStatsPerfDataSegsOut
                                 * total number of data segments sent.
                                 */
        u64        bytes_sent;        /* RFC4898 tcpEStatsPerfHCDataOctetsOut
                                 * total number of data bytes sent.
                                 */
        u32        snd_sml;        /* Last byte of the most recently transmitted small packet */
        u32        chrono_start;        /* Start time in jiffies of a TCP chrono */
        u32        chrono_stat[3];        /* Time in jiffies for chrono_stat stats */
        u32        write_seq;        /* Tail(+1) of data held in tcp send buffer */
        u32        pushed_seq;        /* Last pushed seq, required to talk to windows */
        u32        lsndtime;        /* timestamp of last sent data packet (for restart window) */
        u32        mdev_us;        /* medium deviation                        */
        u32        rtt_seq;        /* sequence number to update rttvar        */
        u64        tcp_wstamp_ns;        /* departure time for next sent data packet */
        struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
        struct sk_buff *highest_sack;   /* skb just after the highest
                                         * skb with SACKed bit set
                                         * (validity guaranteed only if
                                         * sacked_out > 0)
                                         */
        u8        ecn_flags;        /* ECN status bits.                        */
        __cacheline_group_end(tcp_sock_write_tx);

        /* TXRX read-write hotpath cache lines */
        __cacheline_group_begin(tcp_sock_write_txrx);
/*
 *        Header prediction flags
 *        0x5?10 << 16 + snd_wnd in net byte order
 */
        __be32        pred_flags;
        u64        tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
        u64        tcp_mstamp;        /* most recent packet received/sent */
        u32        rcv_nxt;        /* What we want to receive next                */
        u32        snd_nxt;        /* Next sequence we send                */
        u32        snd_una;        /* First byte we want an ack for        */
        u32        window_clamp;        /* Maximal window to advertise                */
        u32        srtt_us;        /* smoothed round trip time << 3 in usecs */
        u32        packets_out;        /* Packets which are "in flight"        */
        u32        snd_up;                /* Urgent pointer                */
        u32        delivered;        /* Total data packets delivered incl. rexmits */
        u32        delivered_ce;        /* Like the above but only ECE marked packets */
        u32        app_limited;        /* limited until "delivered" reaches this val */
        u32        rcv_wnd;        /* Current receiver window                */
/*
 *      Options received (usually on last packet, some only on SYN packets).
 */
        struct tcp_options_received rx_opt;
        u8        nonagle     : 4,/* Disable Nagle algorithm?             */
                rate_app_limited:1;  /* rate_{delivered,interval_us} limited? */
        __cacheline_group_end(tcp_sock_write_txrx);

        /* RX read-write hotpath cache lines */
        __cacheline_group_begin(tcp_sock_write_rx) __aligned(8);
        u64        bytes_received;
                                /* RFC4898 tcpEStatsAppHCThruOctetsReceived
                                 * sum(delta(rcv_nxt)), or how many bytes
                                 * were acked.
                                 */
        u32        segs_in;        /* RFC4898 tcpEStatsPerfSegsIn
                                 * total number of segments in.
                                 */
        u32        data_segs_in;        /* RFC4898 tcpEStatsPerfDataSegsIn
                                 * total number of data segments in.
                                 */
        u32        rcv_wup;        /* rcv_nxt on last window update sent        */
        u32        max_packets_out;  /* max packets_out in last window */
        u32        cwnd_usage_seq;  /* right edge of cwnd usage tracking flight */
        u32        rate_delivered;    /* saved rate sample: packets delivered */
        u32        rate_interval_us;  /* saved rate sample: time elapsed */
        u32        rcv_rtt_last_tsecr;
        u64        first_tx_mstamp;  /* start of window send phase */
        u64        delivered_mstamp; /* time we reached "delivered" */
        u64        bytes_acked;        /* RFC4898 tcpEStatsAppHCThruOctetsAcked
                                 * sum(delta(snd_una)), or how many bytes
                                 * were acked.
                                 */
        struct {
                u32        rtt_us;
                u32        seq;
                u64        time;
        } rcv_rtt_est;
/* Receiver queue space */
        struct {
                u32        space;
                u32        seq;
                u64        time;
        } rcvq_space;
        __cacheline_group_end(tcp_sock_write_rx);
        /* End of Hot Path */

/*
 *        RFC793 variables by their proper names. This means you can
 *        read the code and the spec side by side (and laugh ...)
 *        See RFC793 and RFC1122. The RFC writes these in capitals.
 */
        u32        dsack_dups;        /* RFC4898 tcpEStatsStackDSACKDups
                                 * total number of DSACK blocks received
                                 */
        u32        compressed_ack_rcv_nxt;
        struct list_head tsq_node; /* anchor in tsq_tasklet.head list */

        /* Information of the most recently (s)acked skb */
        struct tcp_rack {
                u64 mstamp; /* (Re)sent time of the skb */
                u32 rtt_us;  /* Associated RTT */
                u32 end_seq; /* Ending TCP sequence of the skb */
                u32 last_delivered; /* tp->delivered at last reo_wnd adj */
                u8 reo_wnd_steps;   /* Allowed reordering window */
#define TCP_RACK_RECOVERY_THRESH 16
                u8 reo_wnd_persist:5, /* No. of recovery since last adj */
                   dsack_seen:1, /* Whether DSACK seen after last adj */
                   advanced:1;         /* mstamp advanced since last lost marking */
        } rack;
        u8        compressed_ack;
        u8        dup_ack_counter:2,
                tlp_retrans:1,        /* TLP is a retransmission */
                unused:5;
        u8        thin_lto    : 1,/* Use linear timeouts for thin streams */
                fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
                fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
                fastopen_client_fail:2, /* reason why fastopen failed */
                frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
        u8        repair_queue;
        u8        save_syn:2,        /* Save headers of SYN packet */
                syn_data:1,        /* SYN includes data */
                syn_fastopen:1,        /* SYN includes Fast Open option */
                syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
                syn_fastopen_ch:1, /* Active TFO re-enabling probe */
                syn_data_acked:1;/* data in SYN is acked by SYN-ACK */

        u8        keepalive_probes; /* num of allowed keep alive probes        */
        u32        tcp_tx_delay;        /* delay (in usec) added to TX packets */

/* RTT measurement */
        u32        mdev_max_us;        /* maximal mdev for the last rtt period        */

        u32        reord_seen;        /* number of data packet reordering events */

/*
 *        Slow start and congestion control (see also Nagle, and Karn & Partridge)
 */
        u32        snd_cwnd_cnt;        /* Linear increase counter                */
        u32        snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
        u32        snd_cwnd_used;
        u32        snd_cwnd_stamp;
        u32        prior_cwnd;        /* cwnd right before starting loss recovery */
        u32        prr_delivered;        /* Number of newly delivered packets to
                                 * receiver in Recovery. */
        u32        last_oow_ack_time;  /* timestamp of last out-of-window ACK */

        struct hrtimer        pacing_timer;
        struct hrtimer        compressed_ack_timer;

        struct sk_buff        *ooo_last_skb; /* cache rb_last(out_of_order_queue) */

        /* SACKs data, these 2 need to be together (see tcp_options_write) */
        struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
        struct tcp_sack_block selective_acks[4]; /* The SACKS themselves */

        struct tcp_sack_block recv_sack_cache[4];

        int     lost_cnt_hint;

        u32        prior_ssthresh; /* ssthresh saved at recovery start        */
        u32        high_seq;        /* snd_nxt at onset of congestion        */

        u32        retrans_stamp;        /* Timestamp of the last retransmit,
                                 * also used in SYN-SENT to remember stamp of
                                 * the first SYN. */
        u32        undo_marker;        /* snd_una upon a new recovery episode. */
        int        undo_retrans;        /* number of undoable retransmissions. */
        u64        bytes_retrans;        /* RFC4898 tcpEStatsPerfOctetsRetrans
                                 * Total data bytes retransmitted
                                 */
        u32        total_retrans;        /* Total retransmits for entire connection */
        u32        rto_stamp;        /* Start time (ms) of last CA_Loss recovery */
        u16        total_rto;        /* Total number of RTO timeouts, including
                                 * SYN/SYN-ACK and recurring timeouts.
                                 */
        u16        total_rto_recoveries;        /* Total number of RTO recoveries,
                                         * including any unfinished recovery.
                                         */
        u32        total_rto_time;        /* ms spent in (completed) RTO recoveries. */

        u32        urg_seq;        /* Seq of received urgent pointer */
        unsigned int                keepalive_time;          /* time before keep alive takes place */
        unsigned int                keepalive_intvl;  /* time interval between keep alive probes */

        int                        linger2;


/* Sock_ops bpf program related variables */
#ifdef CONFIG_BPF
        u8        bpf_sock_ops_cb_flags;  /* Control calling BPF programs
                                         * values defined in uapi/linux/tcp.h
                                         */
        u8        bpf_chg_cc_inprogress:1; /* In the middle of
                                          * bpf_setsockopt(TCP_CONGESTION),
                                          * it is to avoid the bpf_tcp_cc->init()
                                          * to recur itself by calling
                                          * bpf_setsockopt(TCP_CONGESTION, "itself").
                                          */
/* Test @ARG against the callback flags; evaluates to 0 without CONFIG_BPF */
#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
#else
#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
#endif

        u16 timeout_rehash;        /* Timeout-triggered rehash attempts */

        u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */

/* TCP-specific MTU probe information. */
        struct {
                u32                  probe_seq_start;
                u32                  probe_seq_end;
        } mtu_probe;
        u32     plb_rehash;     /* PLB-triggered rehash attempts */
        u32        mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG
                           * while socket was owned by user.
                           */
#if IS_ENABLED(CONFIG_MPTCP)
        bool        is_mptcp;
#endif
#if IS_ENABLED(CONFIG_SMC)
        bool        syn_smc;        /* SYN includes SMC */
        bool        (*smc_hs_congested)(const struct sock *sk);
#endif

#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/* TCP AF-Specific parts; only used by TCP-AO/MD5 Signature support so far */
        const struct tcp_sock_af_ops        *af_specific;

#ifdef CONFIG_TCP_MD5SIG
/* TCP MD5 Signature Option information */
        struct tcp_md5sig_info        __rcu *md5sig_info;
#endif
#ifdef CONFIG_TCP_AO
        struct tcp_ao_info        __rcu *ao_info;
#endif
#endif

/* TCP fastopen related information */
        struct tcp_fastopen_request *fastopen_req;
        /* fastopen_rsk points to request_sock that resulted in this big
         * socket. Used to retransmit SYNACKs etc.
         */
        struct request_sock __rcu *fastopen_rsk;
        struct saved_syn *saved_syn; /* see tcp_move_syn() / tcp_saved_syn_free() */
};

/* Bit numbers of the TSQ / deferred-work flags; enum tsq_flags holds the
 * corresponding BIT() masks.
 */
enum tsq_enum {
        TSQ_THROTTLED,
        TSQ_QUEUED,
        TCP_TSQ_DEFERRED,           /* tcp_tasklet_func() found socket was owned */
        TCP_WRITE_TIMER_DEFERRED,  /* tcp_write_timer() found socket was owned */
        TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */
        TCP_MTU_REDUCED_DEFERRED,  /* tcp_v{4|6}_err() could not call
                                    * tcp_v{4|6}_mtu_reduced()
                                    */
        TCP_ACK_DEFERRED,           /* TX pure ack is deferred */
};

/* Bit masks built with BIT() from the bit numbers in enum tsq_enum */
enum tsq_flags {
        TSQF_THROTTLED                        = BIT(TSQ_THROTTLED),
        TSQF_QUEUED                        = BIT(TSQ_QUEUED),
        TCPF_TSQ_DEFERRED                = BIT(TCP_TSQ_DEFERRED),
        TCPF_WRITE_TIMER_DEFERRED        = BIT(TCP_WRITE_TIMER_DEFERRED),
        TCPF_DELACK_TIMER_DEFERRED        = BIT(TCP_DELACK_TIMER_DEFERRED),
        TCPF_MTU_REDUCED_DEFERRED        = BIT(TCP_MTU_REDUCED_DEFERRED),
        TCPF_ACK_DEFERRED                = BIT(TCP_ACK_DEFERRED),
};

/* Map a struct sock pointer to the tcp_sock that embeds it (through
 * inet_conn.icsk_inet.sk), using container_of_const().
 */
#define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)

/* Variant of tcp_sk() upgrading a const sock to a read/write tcp socket.
 * Used in context of (lockless) tcp listeners.  Built on plain
 * container_of() instead of container_of_const().
 */
#define tcp_sk_rw(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)

/* TCP-specific timewait sock.  @tw_sk is the first member, and tcp_twsk()
 * obtains this structure by casting a struct sock pointer.
 */
struct tcp_timewait_sock {
        struct inet_timewait_sock tw_sk;
/* Shorthands for fields stored in tw_sk.__tw_common */
#define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt
#define tw_snd_nxt tw_sk.__tw_common.skc_tw_snd_nxt
        u32                          tw_rcv_wnd;
        u32                          tw_ts_offset;
        u32                          tw_ts_recent;

        /* The time we sent the last out-of-window ACK: */
        u32                          tw_last_oow_ack_time;

        int                          tw_ts_recent_stamp;
        u32                          tw_tx_delay;
#ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key          *tw_md5_key;
#endif
#ifdef CONFIG_TCP_AO
        struct tcp_ao_info        __rcu *ao_info;
#endif
};

/* Reinterpret @sk as a TCP timewait socket.
 *
 * This is a plain pointer cast: the address is returned unchanged and the
 * const qualifier of @sk is dropped. No check is made that @sk really is a
 * timewait socket.
 */
static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
{
        const void *raw = sk;

        return (struct tcp_timewait_sock *)raw;
}

/* True when @sk is in TCP_SYN_RECV and its fastopen_rsk pointer is non-NULL.
 *
 * The pointer is only tested through rcu_access_pointer(), never
 * dereferenced, and it is not looked at unless the state matches.
 */
static inline bool tcp_passive_fastopen(const struct sock *sk)
{
        const bool in_syn_recv = sk->sk_state == TCP_SYN_RECV;

        return in_syn_recv &&
               rcu_access_pointer(tcp_sk(sk)->fastopen_rsk) != NULL;
}

/* Set the fastopen queue limit of @sk's accept queue.
 *
 * The limit is the smaller of @backlog and the namespace's somaxconn
 * sysctl. The comparison is done via min_t(unsigned int, ...), i.e. both
 * operands are compared as unsigned int. The sysctl is read with
 * READ_ONCE() and the limit is stored with WRITE_ONCE().
 */
static inline void fastopen_queue_tune(struct sock *sk, int backlog)
{
        struct request_sock_queue *accept_q = &inet_csk(sk)->icsk_accept_queue;
        int somaxconn_limit = READ_ONCE(sock_net(sk)->core.sysctl_somaxconn);
        unsigned int new_max = min_t(unsigned int, backlog, somaxconn_limit);

        WRITE_ONCE(accept_q->fastopenq.max_qlen, new_max);
}

/* Hand the saved SYN over from request sock @req to TCP socket @tp.
 *
 * After the call @tp holds the pointer and @req->saved_syn is NULL.
 * Whatever @tp->saved_syn held before is overwritten without being freed.
 */
static inline void tcp_move_syn(struct tcp_sock *tp,
                                struct request_sock *req)
{
        struct saved_syn *syn = req->saved_syn;

        tp->saved_syn = syn;
        req->saved_syn = NULL;
}

/* Release the saved SYN attached to @tp, if any, and clear the pointer so
 * that the freed buffer cannot be reached through @tp afterwards.
 */
static inline void tcp_saved_syn_free(struct tcp_sock *tp)
{
        struct saved_syn **slot = &tp->saved_syn;

        kfree(*slot);
        *slot = NULL;
}

/* Total header length recorded in @saved_syn: the sum of its MAC, network
 * and TCP header lengths.
 */
static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn)
{
        u32 total = saved_syn->mac_hdrlen;

        total += saved_syn->network_hdrlen;
        total += saved_syn->tcp_hdrlen;

        return total;
}

/* Declaration only; the definition is not visible in this part of the
 * header. Takes the socket plus the original and the ACK skb (all const)
 * and returns an sk_buff pointer.
 * NOTE(review): presumably builds the statistics payload reported with TX
 * timestamps - confirm against the implementation.
 */
struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
                                               const struct sk_buff *orig_skb,
                                               const struct sk_buff *ack_skb);

/* Clamp @mss to the user-configured MSS of @tp.
 *
 * Returns the user MSS when it is set (non-zero) and smaller than @mss,
 * otherwise returns @mss unchanged.
 */
static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss)
{
        /* The socket might not be locked (this happens for listeners), so
         * take a single READ_ONCE() snapshot of the user setting.
         */
        u16 cap = READ_ONCE(tp->rx_opt.user_mss);

        if (!cap || cap >= mss)
                return mss;

        return cap;
}

/* Declaration only; the definition is not visible in this part of the
 * header.
 * NOTE(review): from the parameter names this presumably moves @shiftlen
 * bytes / @pcount packets from @from to @to - confirm the exact contract
 * and the meaning of the return value against the implementation.
 */
int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount,
                  int shiftlen);

/* Setter helpers for per-socket TCP options. Declarations only; the
 * definitions are not visible in this part of the header.
 * NOTE(review): the int-returning variants presumably report an error code
 * for rejected values, and the "__" / "_locked" variants presumably expect
 * the caller to already hold the socket lock - confirm both against the
 * implementations.
 */
void __tcp_sock_set_cork(struct sock *sk, bool on);
void tcp_sock_set_cork(struct sock *sk, bool on);
int tcp_sock_set_keepcnt(struct sock *sk, int val);
int tcp_sock_set_keepidle_locked(struct sock *sk, int val);
int tcp_sock_set_keepidle(struct sock *sk, int val);
int tcp_sock_set_keepintvl(struct sock *sk, int val);
void __tcp_sock_set_nodelay(struct sock *sk, bool on);
void tcp_sock_set_nodelay(struct sock *sk);
void tcp_sock_set_quickack(struct sock *sk, int val);
int tcp_sock_set_syncnt(struct sock *sk, int val);
int tcp_sock_set_user_timeout(struct sock *sk, int val);

/* True when dst_feature() reports RTAX_FEATURE_TCP_USEC_TS for @dst. */
static inline bool dst_tcp_usec_ts(const struct dst_entry *dst)
{
        return dst_feature(dst, RTAX_FEATURE_TCP_USEC_TS) != 0;
}

#endif        /* _LINUX_TCP_H */







































    3 







    1 











    1 











    2 

















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
// SPDX-License-Identifier: GPL-2.0-only
/*
 * IPv6 library code, needed by static components when full IPv6 support is
 * not configured or static.
 */

#include <linux/export.h>
#include <net/ipv6.h>
#include <net/ipv6_stubs.h>
#include <net/addrconf.h>
#include <net/ip.h>

/* Hook that the IPv6 module may register. When set, it is used by xfrm to
 * force all sockets to re-look-up their nodes. This is fairly expensive,
 * so use it with care.
 *
 * No initializer is given here, so the pointer starts out NULL.
 */
void (*__fib6_flush_trees)(struct net *);
EXPORT_SYMBOL(__fib6_flush_trees);

/* Place a scope value in bits 16 and up of an address-type word, so it can
 * be OR-ed together with the IPV6_ADDR_* type flags.
 */
#define IPV6_ADDR_SCOPE_TYPE(scope)        ((scope) << 16)

/* Convert a scope value into an address-type word.
 *
 * The result always carries IPV6_ADDR_SCOPE_TYPE(@scope). For the
 * node-local, link-local and site-local scopes the matching
 * IPV6_ADDR_LOOPBACK / IPV6_ADDR_LINKLOCAL / IPV6_ADDR_SITELOCAL flag is
 * OR-ed in as well; every other scope gets no extra flag.
 */
static inline unsigned int ipv6_addr_scope2type(unsigned int scope)
{
        unsigned int extra_flag = 0;

        if (scope == IPV6_ADDR_SCOPE_NODELOCAL)
                extra_flag = IPV6_ADDR_LOOPBACK;
        else if (scope == IPV6_ADDR_SCOPE_LINKLOCAL)
                extra_flag = IPV6_ADDR_LINKLOCAL;
        else if (scope == IPV6_ADDR_SCOPE_SITELOCAL)
                extra_flag = IPV6_ADDR_SITELOCAL;

        return IPV6_ADDR_SCOPE_TYPE(scope) | extra_flag;
}

/* Classify the IPv6 address @addr.
 *
 * Returns a word combining IPV6_ADDR_* type flags with the address scope
 * encoded through IPV6_ADDR_SCOPE_TYPE(). The all-zero address yields
 * IPV6_ADDR_ANY on its own. The checks are ordered: the first matching
 * rule wins.
 */
int __ipv6_addr_type(const struct in6_addr *addr)
{
        const int global_scope = IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL);
        const __be32 word0 = addr->s6_addr32[0];
        const __be32 top3 = word0 & htonl(0xE0000000);
        const __be32 top10 = word0 & htonl(0xFFC00000);
        __be32 word2, word3;

        /* Any address whose first three bits are neither 000 nor 111 is
         * considered unicast.
         */
        if (top3 != htonl(0x00000000) && top3 != htonl(0xE0000000))
                return IPV6_ADDR_UNICAST | global_scope;

        /* ff00::/8 is multicast; the scope comes from the address itself
         * (addr-select 3.1).
         */
        if ((word0 & htonl(0xFF000000)) == htonl(0xFF000000))
                return IPV6_ADDR_MULTICAST |
                       ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr));

        /* fe80::/10, addr-select 3.1 */
        if (top10 == htonl(0xFE800000))
                return IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST |
                       IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL);

        /* fec0::/10, addr-select 3.1 */
        if (top10 == htonl(0xFEC00000))
                return IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST |
                       IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL);

        /* fc00::/7, RFC 4193 */
        if ((word0 & htonl(0xFE000000)) == htonl(0xFC000000))
                return IPV6_ADDR_UNICAST | global_scope;

        /* The special cases below all need the first 64 bits to be zero;
         * anything else is plain global unicast (addr-select 3.4).
         */
        if ((word0 | addr->s6_addr32[1]) != 0)
                return IPV6_ADDR_UNICAST | global_scope;

        word2 = addr->s6_addr32[2];

        /* ::ffff:a.b.c.d, addr-select 3.3 */
        if (word2 == htonl(0x0000ffff))
                return IPV6_ADDR_MAPPED | global_scope;

        /* addr-select 3.4 */
        if (word2 != 0)
                return IPV6_ADDR_UNICAST | global_scope;

        word3 = addr->s6_addr32[3];

        /* :: */
        if (word3 == 0)
                return IPV6_ADDR_ANY;

        /* ::1, addr-select 3.4 */
        if (word3 == htonl(0x00000001))
                return IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST |
                       IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL);

        /* ::a.b.c.d, addr-select 3.3 */
        return IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST | global_scope;
}
EXPORT_SYMBOL(__ipv6_addr_type);

/* Two file-private notifier chains, reachable only through the exported
 * register / unregister / call wrappers in this file: an atomic chain
 * (inet6addr_chain) and a blocking chain (inet6addr_validator_chain).
 */
static ATOMIC_NOTIFIER_HEAD(inet6addr_chain);
static BLOCKING_NOTIFIER_HEAD(inet6addr_validator_chain);

/* Add @nb to the atomic inet6addr notifier chain.
 * Returns the result of atomic_notifier_chain_register() unchanged.
 */
int register_inet6addr_notifier(struct notifier_block *nb)
{
        return atomic_notifier_chain_register(&inet6addr_chain, nb);
}
EXPORT_SYMBOL(register_inet6addr_notifier);

/* Remove @nb from the atomic inet6addr notifier chain.
 * Returns the result of atomic_notifier_chain_unregister() unchanged.
 */
int unregister_inet6addr_notifier(struct notifier_block *nb)
{
        return atomic_notifier_chain_unregister(&inet6addr_chain, nb);
}
EXPORT_SYMBOL(unregister_inet6addr_notifier);

/* Run the atomic inet6addr notifier chain with event @val and payload @v.
 * Returns the result of atomic_notifier_call_chain() unchanged.
 */
int inet6addr_notifier_call_chain(unsigned long val, void *v)
{
        return atomic_notifier_call_chain(&inet6addr_chain, val, v);
}
EXPORT_SYMBOL(inet6addr_notifier_call_chain);

/* Add @nb to the blocking inet6addr validator notifier chain.
 * Returns the result of blocking_notifier_chain_register() unchanged.
 */
int register_inet6addr_validator_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&inet6addr_validator_chain, nb);
}
EXPORT_SYMBOL(register_inet6addr_validator_notifier);

/* Remove @nb from the blocking inet6addr validator notifier chain.
 * Returns the result of blocking_notifier_chain_unregister() unchanged.
 */
int unregister_inet6addr_validator_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&inet6addr_validator_chain,
                                                  nb);
}
EXPORT_SYMBOL(unregister_inet6addr_validator_notifier);

/* Run the blocking inet6addr validator notifier chain with event @val and
 * payload @v. Returns the result of blocking_notifier_call_chain()
 * unchanged.
 */
int inet6addr_validator_notifier_call_chain(unsigned long val, void *v)
{
        return blocking_notifier_call_chain(&inet6addr_validator_chain, val, v);
}
EXPORT_SYMBOL(inet6addr_validator_notifier_call_chain);

/* Default ipv6_stub entry for .ipv6_dst_lookup_flow: ignores all arguments
 * and returns ERR_PTR(-EAFNOSUPPORT).
 */
static struct dst_entry *eafnosupport_ipv6_dst_lookup_flow(struct net *net,
                                                           const struct sock *sk,
                                                           struct flowi6 *fl6,
                                                           const struct in6_addr *final_dst)
{
        return ERR_PTR(-EAFNOSUPPORT);
}

/* Default ipv6_stub entry for .ipv6_route_input: leaves @skb untouched and
 * returns -EAFNOSUPPORT.
 */
static int eafnosupport_ipv6_route_input(struct sk_buff *skb)
{
        return -EAFNOSUPPORT;
}

/* Default ipv6_stub entry for .fib6_get_table: there is never a table, so
 * it always returns NULL (not an ERR_PTR).
 */
static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
{
        return NULL;
}

/* Default ipv6_stub entry for .fib6_table_lookup: ignores all arguments,
 * does not write to @res, and returns -EAFNOSUPPORT.
 */
static int
eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table,
                               int oif, struct flowi6 *fl6,
                               struct fib6_result *res, int flags)
{
        return -EAFNOSUPPORT;
}

/* Default ipv6_stub entry for .fib6_lookup: ignores all arguments, does not
 * write to @res, and returns -EAFNOSUPPORT.
 */
static int
eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
                         struct fib6_result *res, int flags)
{
        return -EAFNOSUPPORT;
}

/* Default ipv6_stub entry for .fib6_select_path: intentionally a no-op;
 * @res is left exactly as the caller passed it.
 */
static void
eafnosupport_fib6_select_path(const struct net *net, struct fib6_result *res,
                              struct flowi6 *fl6, int oif, bool have_oif_match,
                              const struct sk_buff *skb, int strict)
{
}

/* Default ipv6_stub entry for .ip6_mtu_from_fib6: always reports an MTU
 * of 0 (the u32 return type cannot carry a negative error code).
 */
static u32
eafnosupport_ip6_mtu_from_fib6(const struct fib6_result *res,
                               const struct in6_addr *daddr,
                               const struct in6_addr *saddr)
{
        return 0;
}

/* Default ipv6_stub entry for .fib6_nh_init: records a human-readable
 * reason in @extack and returns -EAFNOSUPPORT. @fib6_nh is not touched.
 */
static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
                                     struct fib6_config *cfg, gfp_t gfp_flags,
                                     struct netlink_ext_ack *extack)
{
        NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
        return -EAFNOSUPPORT;
}

/* Default ipv6_stub entry for .ip6_del_rt: ignores all arguments and
 * returns -EAFNOSUPPORT; @rt is neither modified nor released here.
 */
static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt,
                                   bool skip_notify)
{
        return -EAFNOSUPPORT;
}

/* Default ipv6_stub entry for .ipv6_fragment: hands @skb to kfree_skb()
 * and returns -EAFNOSUPPORT, so the caller must not use @skb afterwards.
 * @output is never called.
 */
static int eafnosupport_ipv6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                                      int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        kfree_skb(skb);
        return -EAFNOSUPPORT;
}

/* Default ipv6_stub entry for .ipv6_dev_find: ignores all arguments and
 * returns ERR_PTR(-EAFNOSUPPORT), never NULL.
 */
static struct net_device *eafnosupport_ipv6_dev_find(struct net *net, const struct in6_addr *addr,
                                                     struct net_device *dev)
{
        return ERR_PTR(-EAFNOSUPPORT);
}

/* Exported pointer to the IPv6 operations table.
 *
 * It initially points at an anonymous table whose entries are the
 * eafnosupport_*() stubs defined in this file. Only the pointed-to table
 * is const; the pointer itself is writable.
 * NOTE(review): presumably the real IPv6 implementation repoints it when
 * it is available - confirm where ipv6_stub is assigned.
 */
const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
        .ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow,
        .ipv6_route_input  = eafnosupport_ipv6_route_input,
        .fib6_get_table    = eafnosupport_fib6_get_table,
        .fib6_table_lookup = eafnosupport_fib6_table_lookup,
        .fib6_lookup       = eafnosupport_fib6_lookup,
        .fib6_select_path  = eafnosupport_fib6_select_path,
        .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6,
        .fib6_nh_init           = eafnosupport_fib6_nh_init,
        .ip6_del_rt           = eafnosupport_ip6_del_rt,
        .ipv6_fragment           = eafnosupport_ipv6_fragment,
        .ipv6_dev_find     = eafnosupport_ipv6_dev_find,
};
EXPORT_SYMBOL_GPL(ipv6_stub);

/* Well-known IPv6 address constants.
 *
 * The wildcard (any) and loopback addresses are the ones defined by
 * RFC2553; the remaining entries are the all-nodes / all-routers
 * addresses named by their initializer macros. Each object is const,
 * aligned to BITS_PER_LONG/8 bytes and exported.
 */
const struct in6_addr in6addr_loopback __aligned(BITS_PER_LONG/8)
        = IN6ADDR_LOOPBACK_INIT;
EXPORT_SYMBOL(in6addr_loopback);
const struct in6_addr in6addr_any __aligned(BITS_PER_LONG/8)
        = IN6ADDR_ANY_INIT;
EXPORT_SYMBOL(in6addr_any);
const struct in6_addr in6addr_linklocal_allnodes __aligned(BITS_PER_LONG/8)
        = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
EXPORT_SYMBOL(in6addr_linklocal_allnodes);
const struct in6_addr in6addr_linklocal_allrouters __aligned(BITS_PER_LONG/8)
        = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
EXPORT_SYMBOL(in6addr_linklocal_allrouters);
const struct in6_addr in6addr_interfacelocal_allnodes __aligned(BITS_PER_LONG/8)
        = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT;
EXPORT_SYMBOL(in6addr_interfacelocal_allnodes);
const struct in6_addr in6addr_interfacelocal_allrouters __aligned(BITS_PER_LONG/8)
        = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT;
EXPORT_SYMBOL(in6addr_interfacelocal_allrouters);
const struct in6_addr in6addr_sitelocal_allrouters __aligned(BITS_PER_LONG/8)
        = IN6ADDR_SITELOCAL_ALLROUTERS_INIT;
EXPORT_SYMBOL(in6addr_sitelocal_allrouters);

/* Release the three statistics allocations hanging off @idev->stats:
 * icmpv6msgdev and icmpv6dev via kfree(), ipv6 via free_percpu().
 * The pointers are not reset, and @idev itself is not freed here.
 */
static void snmp6_free_dev(struct inet6_dev *idev)
{
        kfree(idev->stats.icmpv6msgdev);
        kfree(idev->stats.icmpv6dev);
        free_percpu(idev->stats.ipv6);
}

/* RCU callback queued by in6_dev_finish_destroy(): recovers the inet6_dev
 * from its embedded rcu head, frees its statistics, then frees the
 * structure itself.
 */
static void in6_dev_finish_destroy_rcu(struct rcu_head *head)
{
        struct inet6_dev *idev = container_of(head, struct inet6_dev, rcu);

        snmp6_free_dev(idev);
        kfree(idev);
}

/* Nobody refers to this device, we may destroy it.
 *
 * Sanity-checks that @idev has no addresses left, no multicast list and no
 * pending rs_timer (each violation only triggers a WARN_ON), then calls
 * netdev_put() on the underlying net_device and queues the actual free
 * through call_rcu().
 *
 * If @idev is not marked dead, a warning is printed and the function
 * returns without queueing the free; netdev_put() has already been called
 * at that point.
 */

void in6_dev_finish_destroy(struct inet6_dev *idev)
{
        struct net_device *dev = idev->dev;

        WARN_ON(!list_empty(&idev->addr_list));
        WARN_ON(rcu_access_pointer(idev->mc_list));
        WARN_ON(timer_pending(&idev->rs_timer));

#ifdef NET_REFCNT_DEBUG
        pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL");
#endif
        /* NOTE(review): presumably drops the reference tracked by
         * dev_tracker - confirm against netdev_put().
         */
        netdev_put(dev, &idev->dev_tracker);
        if (!idev->dead) {
                pr_warn("Freeing alive inet6 device %p\n", idev);
                return;
        }
        /* Freeing is deferred to in6_dev_finish_destroy_rcu(). */
        call_rcu(&idev->rcu, in6_dev_finish_destroy_rcu);
}
EXPORT_SYMBOL(in6_dev_finish_destroy);
























































    2 











    2 









    1 
    2 


    2 


















    3 






















    1 

    1 



    1 

    1 




































    1 
    1 







    1 







    3 









    3 














    1 






































    1 

























    2 







    2 

    2 











    2 













    2 













    1 














    1 







    1 































    2 





















































    2 




























    3 
    1 




























    1 





























































    3 










































































































    3 



    3 








    2 
    1 






    1 





    2 








    3 






    3 
    1 





    2 



















    3 










    1 






    1 





    2 

    3 








    1 
















































    1 






    1 
























    1 





    1 





    1 



    1 






    1 







    1 





    1 




    1 




















































































    3 







    3 









    2 











    2 



    1 







    3 














    3 








































































































    2 





    2 




    1 




    1 

    1 

    1 



    1 




    1 





















    2 









    3 








    3 




    2 





































    2 















    3 



















    2 

















    2 



    3 










    3 






    3 

    1 










    2 
    1 
















    3 












    3 


    3 
















    2 



    1 







    3 
    2 









    3 


















    3 

    3 



    1 
    3 

    2 
    1 




    3 




    3 










    2 

    3 



    3 



    2 
    3 
























    1 

    1 







    1 








    1 










































    1 











































































































































































































































    1 









    1 



























    1 




    1 







    1 











    2 

    2 








    2 
    2 


    2 
















































    2 

    1 






    2 





    2 



    2 










    1 
    1 








    1 














    1 
    2 














    1 






















    2 

    2 

    2 










    2 





    2 

    2 
















    1 












    1 



















































    1 



    1 




































    1 





    1 


    1 
















    1 


    1 
    1 


































    1 















    1 










































































































































































    1 






























    2 





















































































































    2 







































    2 


    2 







    2 












    2 

























    1 






    1 







    1 
    2 



    2 










    3 
    2 
    3 











    2 


    1 









    1 



    1 



















    2 










    1 

    2 








    2 



    2 








    2 



    2 
    1 







    1 
    2 



    2 





    2 





    1 





    1 
    1 



    2 
    1 













    2 








    2 







    1 

    2 


    1 
    2 


    2 




    2 


    1 



    3 
















    3 
    2 









    1 
    1 









    1 
    2 



    2 






























































































    2 







    2 

    1 






    1 




    1 























































    2 














    2 




    2 








    3 




    2 


    1 














    1 



    2 


































































    1 





























































































































































































































































































































































    3 



    2 



    1 

    1 








    1 









    1 





























    1 













































    1 

















    1 
    1 



    1 




    1 










































































































































































    1 















    1 









    1 




    1 


    1 
    1 

    1 


    1 
    1 




    1 





    1 


    1 




















    1 


















    1 
















    1 




    1 











































































    1 


    1 








    1 
























































    1 













    1 
    1 



    1 










    1 






    1 

















    1 




    1 






























    1 






    1 









    3 



























    3 













    2 

























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Mark Evans, <evansmp@uhura.aston.ac.uk>
 *                Corey Minyard <wf-rch!minyard@relay.EU.net>
 *                Florian La Roche, <flla@stud.uni-sb.de>
 *                Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *                Linus Torvalds, <torvalds@cs.helsinki.fi>
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *                Matthew Dillon, <dillon@apollo.west.oic.com>
 *                Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *                Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:        Pedro Roque        :        Retransmit queue handled by TCP.
 *                                :        Fragmentation on mtu decrease
 *                                :        Segment collapse on retransmit
 *                                :        AF independence
 *
 *                Linus Torvalds        :        send_delayed_ack
 *                David S. Miller        :        Charge memory using the right skb
 *                                        during syn/ack processing.
 *                David S. Miller :        Output engine completely rewritten.
 *                Andrea Arcangeli:        SYNACK carry ts_recent in tsecr.
 *                Cacophonix Gaul :        draft-minshall-nagle-01
 *                J Hadi Salim        :        ECN support
 *
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <net/tcp.h>
#include <net/mptcp.h>
#include <net/proto_memory.h>

#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/static_key.h>
#include <linux/skbuff_ref.h>

#include <trace/events/tcp.h>

/* Re-read the TCP clock and cache it on @tp.
 *
 * A single tcp_clock_ns() sample feeds both fields, so tcp_clock_cache
 * (nanoseconds) and tcp_mstamp (that same sample divided by
 * NSEC_PER_USEC) always describe the same instant.
 */
void tcp_mstamp_refresh(struct tcp_sock *tp)
{
        const u64 now_ns = tcp_clock_ns();

        tp->tcp_clock_cache = now_ns;
        tp->tcp_mstamp = div_u64(now_ns, NSEC_PER_USEC);
}

static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                           int push_one, gfp_t gfp);

/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned int prior_packets = tp->packets_out;

        WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);

        __skb_unlink(skb, &sk->sk_write_queue);
        tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);

        if (tp->highest_sack == NULL)
                tp->highest_sack = skb;

        tp->packets_out += tcp_skb_pcount(skb);
        if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                tcp_rearm_rto(sk);

        NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
                      tcp_skb_pcount(skb));
        tcp_check_space(sk);
}

/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
 * window scaling factor due to loss of precision.
 * If window has been shrunk, what should we make? It is not clear at all.
 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 * invalid. OK, let's make this for now:
 */
static inline __u32 tcp_acceptable_seq(const struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        const __u32 wnd_end = tcp_wnd_end(tp);

        /* SND.NXT is not beyond the end of the send window. */
        if (!before(wnd_end, tp->snd_nxt))
                return tp->snd_nxt;

        /* SND.NXT is beyond the window, but with window scaling in use
         * and by less than one scaling unit: still answer SND.NXT.
         */
        if (tp->rx_opt.wscale_ok &&
            (tp->snd_nxt - wnd_end) < (1 << tp->rx_opt.rcv_wscale))
                return tp->snd_nxt;

        /* Otherwise fall back to the end of the window. */
        return wnd_end;
}

/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MSS.
 * 4. We do not make 3, we advertise MSS, calculated from first
 *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
 *    This may be overridden via information stored in routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
static __u16 tcp_advertise_mss(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        const struct dst_entry *dst = __sk_dst_get(sk);
        unsigned int route_advmss;
        int mss = tp->advmss;

        /* No cached route: advertise the value already on the socket. */
        if (!dst)
                return (__u16)mss;

        /* A smaller route metric wins and is written back to tp->advmss. */
        route_advmss = dst_metric_advmss(dst);
        if (route_advmss < mss) {
                mss = route_advmss;
                tp->advmss = mss;
        }

        return (__u16)mss;
}

/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
 * This is the first part of cwnd validation mechanism.
 */
void tcp_cwnd_restart(struct sock *sk, s32 delta)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 floor_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
        u32 cwnd = tcp_snd_cwnd(tp);

        tcp_ca_event(sk, CA_EVENT_CWND_RESTART);

        tp->snd_ssthresh = tcp_current_ssthresh(sk);

        /* The restart window never exceeds the current cwnd. */
        if (cwnd < floor_cwnd)
                floor_cwnd = cwnd;

        /* Halve cwnd once for every icsk_rto that fits into @delta,
         * stopping as soon as cwnd is no longer above the restart window.
         */
        for (;;) {
                delta -= inet_csk(sk)->icsk_rto;
                if (delta <= 0 || cwnd <= floor_cwnd)
                        break;
                cwnd >>= 1;
        }

        tcp_snd_cwnd_set(tp, max(cwnd, floor_cwnd));
        tp->snd_cwnd_stamp = tcp_jiffies32;
        tp->snd_cwnd_used = 0;
}

/* Per-transmit accounting: raise CA_EVENT_TX_START when nothing is in
 * flight, stamp tp->lsndtime, and bump the pingpong counter when this
 * send happens less than icsk_ack.ato after the last receive time.
 */
static void tcp_event_data_sent(struct tcp_sock *tp,
                                struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        const u32 stamp = tcp_jiffies32;
        u32 since_last_rcv;

        if (!tcp_packets_in_flight(tp))
                tcp_ca_event(sk, CA_EVENT_TX_START);

        tp->lsndtime = stamp;

        /* Unsigned subtraction, same as the u32 cast it replaces. */
        since_last_rcv = stamp - icsk->icsk_ack.lrcvtime;
        if (since_last_rcv < icsk->icsk_ack.ato)
                inet_csk_inc_pingpong_cnt(sk);
}

/* Bookkeeping after an ACK has been sent.
 *
 * Any pending compressed-ACK count is added to
 * LINUX_MIB_TCPACKCOMPRESSED and reset, and its hrtimer is cancelled.
 * Quick-ack accounting and the delayed-ACK timer are only touched when
 * @rcv_nxt matches tp->rcv_nxt.
 */
static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (unlikely(tp->compressed_ack)) {
                NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
                              tp->compressed_ack);
                tp->compressed_ack = 0;
                /* A result of 1 is paired with a socket reference drop. */
                if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
                        __sock_put(sk);
        }

        /* A mismatch is the special ACK sent by DCTCP to reflect ECN;
         * it must leave the delayed-ACK state alone.
         */
        if (likely(rcv_nxt == tp->rcv_nxt)) {
                tcp_dec_quickack_mode(sk);
                inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
        }
}

/* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space
 * will be offered. The results are stored through @rcv_wnd,
 * @rcv_wscale and @__window_clamp.
 * NOTE: for smooth operation initial space offering should
 * be a multiple of mss if possible. We assume here that mss >= 1.
 * This MUST be enforced by all callers.
 *
 * @sk:             only used to reach the per-netns sysctls
 * @__space:        space to offer; negative values are treated as 0
 * @mss:            quantum the offered space is rounded down to when the
 *                  space is larger than it
 * @rcv_wnd:        out: initial window
 * @__window_clamp: in/out: read once on entry (0 means "no clamp"),
 *                  rewritten on exit so it never exceeds
 *                  U16_MAX << *@rcv_wscale
 * @wscale_ok:      non-zero to compute a window scale
 * @rcv_wscale:     out: window scale, 0 when @wscale_ok is 0
 * @init_rcv_wnd:   if non-zero, caps *@rcv_wnd at @init_rcv_wnd * @mss
 */
void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
                               __u32 *rcv_wnd, __u32 *__window_clamp,
                               int wscale_ok, __u8 *rcv_wscale,
                               __u32 init_rcv_wnd)
{
        unsigned int space = (__space < 0 ? 0 : __space);
        u32 window_clamp = READ_ONCE(*__window_clamp);

        /* If no clamp set the clamp to the max possible scaled window */
        if (window_clamp == 0)
                window_clamp = (U16_MAX << TCP_MAX_WSCALE);
        space = min(window_clamp, space);

        /* Quantize space offering to a multiple of mss if possible. */
        if (space > mss)
                space = rounddown(space, mss);

        /* NOTE: offering an initial window larger than 32767
         * will break some buggy TCP stacks. If the admin tells us
         * it is likely we could be speaking with such a buggy stack
         * we will truncate our initial window offering to 32K-1
         * unless the remote has sent us a window scaling option,
         * which we interpret as a sign the remote TCP is not
         * misinterpreting the window field as a signed quantity.
         */
        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
                (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
        else
                (*rcv_wnd) = space;

        /* Optional caller-supplied cap, expressed in segments. */
        if (init_rcv_wnd)
                *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);

        *rcv_wscale = 0;
        if (wscale_ok) {
                /* Set window scaling on max possible window */
                space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
                space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
                space = min_t(u32, space, window_clamp);
                /* Scale is ilog2(space) - 15, kept within 0..TCP_MAX_WSCALE. */
                *rcv_wscale = clamp_t(int, ilog2(space) - 15,
                                      0, TCP_MAX_WSCALE);
        }
        /* Set the clamp no higher than max representable value */
        WRITE_ONCE(*__window_clamp,
                   min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp));
}
EXPORT_SYMBOL(tcp_select_initial_window);

/* Choose a new window to advertise, update state in tcp_sock for the
 * socket, and return result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 *
 * When ICSK_ACK_NOMEM is pending, 0 is returned and neither rcv_wnd
 * nor rcv_wup is modified.
 */
static u16 tcp_select_window(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        u32 old_win = tp->rcv_wnd;
        u32 cur_win, new_win;

        /* Make the window 0 if we failed to queue the data because we
         * are out of memory. The window is temporary, so we don't store
         * it on the socket.
         */
        if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM))
                return 0;

        cur_win = tcp_receive_window(tp);
        new_win = __tcp_select_window(sk);
        if (new_win < cur_win) {
                /* Danger Will Robinson!
                 * Don't update rcv_wup/rcv_wnd here or else
                 * we will not be able to advertise a zero
                 * window in time.  --DaveM
                 *
                 * Relax Will Robinson.
                 */
                /* Shrinking is only permitted when the sysctl allows it
                 * and a non-zero receive window scale is in use.
                 */
                if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) || !tp->rx_opt.rcv_wscale) {
                        /* Never shrink the offered window */
                        if (new_win == 0)
                                NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV);
                        new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
                }
        }

        /* Stored before the clamp and shift below, i.e. unscaled. */
        tp->rcv_wnd = new_win;
        tp->rcv_wup = tp->rcv_nxt;

        /* Make sure we do not exceed the maximum possible
         * scaled window.
         */
        if (!tp->rx_opt.rcv_wscale &&
            READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows))
                new_win = min(new_win, MAX_TCP_WINDOW);
        else
                new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));

        /* RFC1323 scaling applied */
        new_win >>= tp->rx_opt.rcv_wscale;

        /* If we advertise zero window, disable fast path. */
        if (new_win == 0) {
                tp->pred_flags = 0;
                /* Count only the transition from a non-zero window. */
                if (old_win)
                        NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV);
        } else if (old_win == 0) {
                NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV);
        }

        return new_win;
}

/* ECN flag handling for an outgoing SYN-ACK skb.
 *
 * CWR is always cleared.  If the socket has not negotiated ECN
 * (TCP_ECN_OK unset) ECE is cleared too; otherwise INET_ECN_xmit() is
 * called when the congestion control, or the BPF hook, asks for ECN.
 */
static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;

        if (!(tp->ecn_flags & TCP_ECN_OK)) {
                TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
                return;
        }

        if (tcp_ca_needs_ecn(sk) || tcp_bpf_ca_needs_ecn(sk))
                INET_ECN_xmit(sk);
}

/* ECN flag handling for an outgoing SYN skb.
 *
 * ECN is requested when sysctl_tcp_ecn is 1, when the congestion
 * control or the BPF hook needs it, or failing all of those when the
 * cached route carries RTAX_FEATURE_ECN.  tp->ecn_flags is reset first
 * and set to TCP_ECN_OK only when ECN is requested.
 */
static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        const bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
        bool use_ecn;

        use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
                  tcp_ca_needs_ecn(sk) || bpf_needs_ecn;

        if (!use_ecn) {
                /* Last resort: a per-route ECN feature bit. */
                const struct dst_entry *dst = __sk_dst_get(sk);

                use_ecn = dst && dst_feature(dst, RTAX_FEATURE_ECN);
        }

        tp->ecn_flags = 0;

        if (!use_ecn)
                return;

        TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
        tp->ecn_flags = TCP_ECN_OK;

        if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
                INET_ECN_xmit(sk);
}

/* Strip ECE and CWR from @skb when sysctl_tcp_ecn_fallback is set. */
static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
{
        if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
                return;

        /* tp->ecn_flags are cleared at a later point in time when
         * SYN ACK is ultimately being received.
         */
        TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
}

/* Set ECE in the SYN-ACK header @th when the request has ecn_ok set;
 * @th is left untouched otherwise.
 */
static void
tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
{
        if (!inet_rsk(req)->ecn_ok)
                return;

        th->ece = 1;
}

/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
 * be sent.
 */
static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
                         struct tcphdr *th, int tcp_header_len)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (tp->ecn_flags & TCP_ECN_OK) {
                /* Not-retransmitted data segment: set ECT and inject CWR. */
                if (skb->len != tcp_header_len &&
                    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
                        INET_ECN_xmit(sk);
                        if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
                                tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
                                th->cwr = 1;
                                skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
                        }
                } else if (!tcp_ca_needs_ecn(sk)) {
                        /* ACK or retransmitted segment: clear ECT|CE */
                        INET_ECN_dontxmit(sk);
                }
                if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
                        th->ece = 1;
        }
}

/* Fill in the control block of an skb that carries no payload.
 *
 * The skb is marked CHECKSUM_PARTIAL with a packet count of one, and
 * its tcp_flags and seq are taken from the arguments.  end_seq is
 * @seq + 1 when SYN or FIN is among @flags, and @seq otherwise.
 */
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
        u32 end_seq = seq;

        if (flags & (TCPHDR_SYN | TCPHDR_FIN))
                end_seq++;

        skb->ip_summed = CHECKSUM_PARTIAL;
        tcp_skb_pcount_set(skb, 1);

        TCP_SKB_CB(skb)->tcp_flags = flags;
        TCP_SKB_CB(skb)->seq = seq;
        TCP_SKB_CB(skb)->end_seq = end_seq;
}

/* True while tp->snd_up and tp->snd_una differ. */
static inline bool tcp_urg_mode(const struct tcp_sock *tp)
{
        return !(tp->snd_una == tp->snd_up);
}

/* Bit flags kept in tcp_out_options.options.  The option writers test
 * these bits before emitting the matching TCP option (see the
 * OPTION_SMC and OPTION_MPTCP checks).  Bits 4-7 are not assigned in
 * this list.
 */
#define OPTION_SACK_ADVERTISE        BIT(0)
#define OPTION_TS                BIT(1)
#define OPTION_MD5                BIT(2)
#define OPTION_WSCALE                BIT(3)
#define OPTION_FAST_OPEN_COOKIE        BIT(8)
#define OPTION_SMC                BIT(9)
#define OPTION_MPTCP                BIT(10)
#define OPTION_AO                BIT(11)

/* Emit the SMC experimental option (two 32-bit words) at @ptr when
 * CONFIG_SMC is enabled, the tcp_have_smc static key is on and
 * OPTION_SMC is set in *@options.  @ptr is received by value, so the
 * caller's write position is not moved by this helper.
 */
static void smc_options_write(__be32 *ptr, u16 *options)
{
#if IS_ENABLED(CONFIG_SMC)
        if (static_branch_unlikely(&tcp_have_smc) &&
            unlikely(*options & OPTION_SMC)) {
                /* Word 0: NOP, NOP, EXP kind, option length. */
                ptr[0] = htonl((TCPOPT_NOP  << 24) |
                               (TCPOPT_NOP  << 16) |
                               (TCPOPT_EXP <<  8) |
                               (TCPOLEN_EXP_SMC_BASE));
                /* Word 1: the SMC magic value. */
                ptr[1] = htonl(TCPOPT_SMC_MAGIC);
        }
#endif
}

/* Option state collected for one outgoing TCP header.  @options is a
 * bit field of OPTION_* flags; the other members carry the values for
 * the individual options.
 */
struct tcp_out_options {
        u16 options;                /* bit field of OPTION_* */
        u16 mss;                /* 0 to disable */
        u8 ws;                        /* window scale, 0 to disable */
        u8 num_sack_blocks;        /* number of SACK blocks to include */
        u8 hash_size;                /* bytes in hash_location */
        u8 bpf_opt_len;                /* length of BPF hdr option */
        __u8 *hash_location;        /* temporary pointer, overloaded */
        __u32 tsval, tsecr;        /* need to include OPTION_TS */
        struct tcp_fastopen_cookie *fastopen_cookie;        /* Fast open cookie */
        struct mptcp_out_options mptcp;        /* handed to mptcp_write_options() */
};

/* Forward to mptcp_write_options() when CONFIG_MPTCP is enabled and
 * OPTION_MPTCP is set in @opts->options; a no-op otherwise.
 */
static void mptcp_options_write(struct tcphdr *th, __be32 *ptr,
                                struct tcp_sock *tp,
                                struct tcp_out_options *opts)
{
#if IS_ENABLED(CONFIG_MPTCP)
        if (likely(!(opts->options & OPTION_MPTCP)))
                return;

        mptcp_write_options(th, ptr, tp, &opts->mptcp);
#endif
}

#ifdef CONFIG_CGROUP_BPF
/* Pick args[0] for the BPF header-option callbacks:
 * BPF_WRITE_HDR_TCP_CURRENT_MSS when there is no skb,
 * BPF_WRITE_HDR_TCP_SYNACK_COOKIE for a cookie SYN-ACK, 0 otherwise.
 * The missing-skb case takes precedence.
 */
static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
                                        enum tcp_synack_type synack_type)
{
        int arg0 = 0;

        if (unlikely(!skb))
                arg0 = BPF_WRITE_HDR_TCP_CURRENT_MSS;
        else if (unlikely(synack_type == TCP_SYNACK_COOKIE))
                arg0 = BPF_WRITE_HDR_TCP_SYNACK_COOKIE;

        return arg0;
}

/* Ask the cgroup BPF sock_ops program how much header option space it
 * wants, and reserve that space.
 *
 * req, syn_skb and synack_type are used when writing synack.
 *
 * @sk:        socket; must have BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG set
 *             for anything to happen
 * @skb:       packet being built, or NULL (tcp_current_mss() case)
 * @opts:      opts->bpf_opt_len receives the reserved length
 * @remaining: in/out: option bytes still available; reduced by the
 *             reserved length
 *
 * Nothing is reserved when the flag is unset, *@remaining is 0, the
 * program returns an error, or the program leaves remaining_opt_len
 * unchanged.
 */
static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct sk_buff *syn_skb,
                                  enum tcp_synack_type synack_type,
                                  struct tcp_out_options *opts,
                                  unsigned int *remaining)
{
        struct bpf_sock_ops_kern sock_ops;
        int err;

        if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
                                           BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
            !*remaining)
                return;

        /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */

        /* init sock_ops */
        /* Only the members located before 'temp' are zeroed. */
        memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));

        sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;

        if (req) {
                /* The listen "sk" cannot be passed here because
                 * it is not locked.  It would not make too much
                 * sense to do bpf_setsockopt(listen_sk) based
                 * on individual connection request also.
                 *
                 * Thus, "req" is passed here and the cgroup-bpf-progs
                 * of the listen "sk" will be run.
                 *
                 * "req" is also used here for fastopen even the "sk" here is
                 * a fullsock "child" sk.  It is to keep the behavior
                 * consistent between fastopen and non-fastopen on
                 * the bpf programming side.
                 */
                sock_ops.sk = (struct sock *)req;
                sock_ops.syn_skb = syn_skb;
        } else {
                sock_owned_by_me(sk);

                sock_ops.is_fullsock = 1;
                sock_ops.sk = sk;
        }

        sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
        sock_ops.remaining_opt_len = *remaining;
        /* tcp_current_mss() does not pass a skb */
        if (skb)
                bpf_skops_init_skb(&sock_ops, skb, 0);

        err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);

        if (err || sock_ops.remaining_opt_len == *remaining)
                return;

        /* The amount requested is the drop in remaining_opt_len. */
        opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
        /* round up to 4 bytes */
        opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;

        *remaining -= opts->bpf_opt_len;
}

/* Let the cgroup BPF sock_ops program write its header options into the
 * space sized by opts->bpf_opt_len; a no-op when that length is 0.
 *
 * The program is given the last opts->bpf_opt_len bytes of the header
 * length reported by tcp_hdrlen(skb).  Whatever it does not fill is
 * padded with TCPOPT_NOP; on error the whole area is padded.
 *
 * req, syn_skb and synack_type are used when writing synack.
 */
static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
                                    struct request_sock *req,
                                    struct sk_buff *syn_skb,
                                    enum tcp_synack_type synack_type,
                                    struct tcp_out_options *opts)
{
        u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
        struct bpf_sock_ops_kern sock_ops;
        int err;

        if (likely(!max_opt_len))
                return;

        /* Only the members located before 'temp' are zeroed. */
        memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));

        sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;

        if (req) {
                /* Request sock stands in for the socket, as in the
                 * length callback.
                 */
                sock_ops.sk = (struct sock *)req;
                sock_ops.syn_skb = syn_skb;
        } else {
                sock_owned_by_me(sk);

                sock_ops.is_fullsock = 1;
                sock_ops.sk = sk;
        }

        sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
        sock_ops.remaining_opt_len = max_opt_len;
        /* Offset, from skb->data, of the first byte the program may use. */
        first_opt_off = tcp_hdrlen(skb) - max_opt_len;
        bpf_skops_init_skb(&sock_ops, skb, first_opt_off);

        err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);

        if (err)
                nr_written = 0;
        else
                nr_written = max_opt_len - sock_ops.remaining_opt_len;

        /* Pad the unused tail of the reserved area with NOPs. */
        if (nr_written < max_opt_len)
                memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
                       max_opt_len - nr_written);
}
#else
/* CONFIG_CGROUP_BPF is not set: no BPF header option space is reserved,
 * so @opts and *@remaining are left untouched.
 */
static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct sk_buff *syn_skb,
                                  enum tcp_synack_type synack_type,
                                  struct tcp_out_options *opts,
                                  unsigned int *remaining)
{
}

/* CONFIG_CGROUP_BPF is not set: there is no BPF program to write header
 * options, so @skb is left untouched.
 */
static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
                                    struct request_sock *req,
                                    struct sk_buff *syn_skb,
                                    enum tcp_synack_type synack_type,
                                    struct tcp_out_options *opts)
{
}
#endif

/* Emit the leading 32-bit word of the TCP-AO option at @ptr and reserve the
 * space that follows it for the MAC.
 *
 * @tp:     socket whose ao_info supplies rnext_key when @tcprsk is NULL
 * @tcprsk: request sock supplying ao_keyid/ao_rcv_next, or NULL
 * @opts:   opts->hash_location is pointed at the first MAC byte
 * @key:    key->ao_key provides the MAC length and the send id
 * @ptr:    current write position in the option area
 *
 * Returns the write position after the (word-padded) MAC area.  Without
 * CONFIG_TCP_AO, or when rnext_key is unexpectedly NULL, @ptr is returned
 * unchanged and nothing is written.
 */
static __be32 *process_tcp_ao_options(struct tcp_sock *tp,
                                      const struct tcp_request_sock *tcprsk,
                                      struct tcp_out_options *opts,
                                      struct tcp_key *key, __be32 *ptr)
{
#ifdef CONFIG_TCP_AO
        const u8 mac_bytes = tcp_ao_maclen(key->ao_key);
        u32 hdr_word;

        if (!tcprsk) {
                struct tcp_ao_info *info;
                struct tcp_ao_key *next_key;

                info = rcu_dereference_check(tp->ao_info,
                        lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk));
                next_key = READ_ONCE(info->rnext_key);
                if (WARN_ON_ONCE(!next_key))
                        return ptr;

                hdr_word = (TCPOPT_AO << 24) |
                           (tcp_ao_len(key->ao_key) << 16) |
                           (key->ao_key->sndid << 8) |
                           (next_key->rcvid);
        } else {
                const u8 opt_len = mac_bytes + sizeof(struct tcp_ao_hdr);

                hdr_word = (TCPOPT_AO << 24) | (opt_len << 16) |
                           (tcprsk->ao_keyid << 8) |
                           (tcprsk->ao_rcv_next);
        }
        *ptr++ = htonl(hdr_word);

        /* The MAC itself is filled in later through hash_location. */
        opts->hash_location = (__u8 *)ptr;
        ptr += mac_bytes / sizeof(*ptr);

        /* A MAC that is not a multiple of the word size spills into one
         * more word; pre-fill that word with NOPs.
         */
        if (unlikely(mac_bytes % sizeof(*ptr))) {
                memset(ptr, TCPOPT_NOP, sizeof(*ptr));
                ptr++;
        }
#endif
        return ptr;
}

/* Write previously computed TCP options to the packet.
 *
 * @th:     TCP header; options are written starting right after it
 * @tp:     socket supplying the SACK blocks; also handed to the AO and
 *          MPTCP writers
 * @tcprsk: forwarded to process_tcp_ao_options() for the AO case
 * @opts:   the computed options; opts->hash_location is set here when an
 *          MD5 or AO key is in use
 * @key:    signing key selecting the MD5 / AO option, if any
 *
 * Beware: Something in the Internet is very sensitive to the ordering of
 * TCP options, we learned this the hard way, so be careful here.
 * Luckily we can at least blame others for their non-compliance but from
 * inter-operability perspective it seems that we're somewhat stuck with
 * the ordering which we have been using if we want to keep working with
 * those broken things (not that it currently hurts anybody as there isn't
 * particular reason why the ordering would need to be changed).
 *
 * At least SACK_PERM as the first option is known to lead to a disaster
 * (but it may well be that other scenarios fail similarly).
 */
static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
                              const struct tcp_request_sock *tcprsk,
                              struct tcp_out_options *opts,
                              struct tcp_key *key)
{
        __be32 *ptr = (__be32 *)(th + 1);
        u16 options = opts->options;        /* mungable copy */

        if (tcp_key_is_md5(key)) {
                *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                               (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
                /* overload cookie hash location */
                opts->hash_location = (__u8 *)ptr;
                /* skip four 32-bit words: room left for the signature */
                ptr += 4;
        } else if (tcp_key_is_ao(key)) {
                ptr = process_tcp_ao_options(tp, tcprsk, opts, key, ptr);
        }
        if (unlikely(opts->mss)) {
                *ptr++ = htonl((TCPOPT_MSS << 24) |
                               (TCPOLEN_MSS << 16) |
                               opts->mss);
        }

        if (likely(OPTION_TS & options)) {
                if (unlikely(OPTION_SACK_ADVERTISE & options)) {
                        /* SACK_PERM takes the two bytes that would otherwise
                         * be NOP padding in front of the timestamp option.
                         */
                        *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
                                       (TCPOLEN_SACK_PERM << 16) |
                                       (TCPOPT_TIMESTAMP << 8) |
                                       TCPOLEN_TIMESTAMP);
                        /* cleared in the local copy only, so the standalone
                         * SACK_PERM block below is skipped
                         */
                        options &= ~OPTION_SACK_ADVERTISE;
                } else {
                        *ptr++ = htonl((TCPOPT_NOP << 24) |
                                       (TCPOPT_NOP << 16) |
                                       (TCPOPT_TIMESTAMP << 8) |
                                       TCPOLEN_TIMESTAMP);
                }
                *ptr++ = htonl(opts->tsval);
                *ptr++ = htonl(opts->tsecr);
        }

        /* SACK_PERM on its own word, only when not merged with TS above */
        if (unlikely(OPTION_SACK_ADVERTISE & options)) {
                *ptr++ = htonl((TCPOPT_NOP << 24) |
                               (TCPOPT_NOP << 16) |
                               (TCPOPT_SACK_PERM << 8) |
                               TCPOLEN_SACK_PERM);
        }

        if (unlikely(OPTION_WSCALE & options)) {
                *ptr++ = htonl((TCPOPT_NOP << 24) |
                               (TCPOPT_WINDOW << 16) |
                               (TCPOLEN_WINDOW << 8) |
                               opts->ws);
        }

        if (unlikely(opts->num_sack_blocks)) {
                /* block source depends on whether a D-SACK is pending */
                struct tcp_sack_block *sp = tp->rx_opt.dsack ?
                        tp->duplicate_sack : tp->selective_acks;
                int this_sack;

                *ptr++ = htonl((TCPOPT_NOP  << 24) |
                               (TCPOPT_NOP  << 16) |
                               (TCPOPT_SACK <<  8) |
                               (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
                                                     TCPOLEN_SACK_PERBLOCK)));

                /* each block is two words: start and end sequence number */
                for (this_sack = 0; this_sack < opts->num_sack_blocks;
                     ++this_sack) {
                        *ptr++ = htonl(sp[this_sack].start_seq);
                        *ptr++ = htonl(sp[this_sack].end_seq);
                }

                /* the dsack flag is reset once SACK blocks were written */
                tp->rx_opt.dsack = 0;
        }

        if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
                struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
                u8 *p = (u8 *)ptr;
                u32 len; /* Fast Open option length */

                if (foc->exp) {
                        /* experimental format: kind, length and magic are
                         * written as one word, cookie bytes follow the base
                         */
                        len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
                        *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
                                     TCPOPT_FASTOPEN_MAGIC);
                        p += TCPOLEN_EXP_FASTOPEN_BASE;
                } else {
                        /* regular format: one kind byte and one length byte */
                        len = TCPOLEN_FASTOPEN_BASE + foc->len;
                        *p++ = TCPOPT_FASTOPEN;
                        *p++ = len;
                }

                memcpy(p, foc->val, foc->len);
                /* two bytes short of a word boundary: pad with two NOPs */
                if ((len & 3) == 2) {
                        p[foc->len] = TCPOPT_NOP;
                        p[foc->len + 1] = TCPOPT_NOP;
                }
                /* advance by the option length rounded up to whole words */
                ptr += (len + 3) >> 2;
        }

        smc_options_write(ptr, &options);

        mptcp_options_write(th, ptr, tp, opts);
}

/* Request the SMC experimental option in @opts when SMC is active, the
 * socket has syn_smc set and *@remaining can hold the aligned option.
 * On success the option size is deducted from *@remaining; otherwise
 * nothing changes.  Empty without CONFIG_SMC.
 */
static void smc_set_option(const struct tcp_sock *tp,
                           struct tcp_out_options *opts,
                           unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
        if (!static_branch_unlikely(&tcp_have_smc))
                return;
        if (!tp->syn_smc)
                return;
        if (*remaining < TCPOLEN_EXP_SMC_BASE_ALIGNED)
                return;

        *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
        opts->options |= OPTION_SMC;
#endif
}

/* Same as smc_set_option(), with the extra condition that the request
 * sock @ireq has smc_ok set.  On success the aligned option size is
 * deducted from *@remaining; otherwise nothing changes.  Empty without
 * CONFIG_SMC.
 */
static void smc_set_option_cond(const struct tcp_sock *tp,
                                const struct inet_request_sock *ireq,
                                struct tcp_out_options *opts,
                                unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
        if (!static_branch_unlikely(&tcp_have_smc))
                return;
        if (!(tp->syn_smc && ireq->smc_ok))
                return;
        if (*remaining < TCPOLEN_EXP_SMC_BASE_ALIGNED)
                return;

        *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
        opts->options |= OPTION_SMC;
#endif
}

/* For an MPTCP request sock, ask mptcp_synack_options() for the option it
 * wants and flag OPTION_MPTCP in @opts only if the reported size fits in
 * *@remaining, which is then reduced by that size.
 */
static void mptcp_set_option_cond(const struct request_sock *req,
                                  struct tcp_out_options *opts,
                                  unsigned int *remaining)
{
        unsigned int size;

        if (!rsk_is_mptcp(req))
                return;
        if (!mptcp_synack_options(req, &size, &opts->mptcp))
                return;
        if (*remaining < size)
                return;

        opts->options |= OPTION_MPTCP;
        *remaining -= size;
}

/* Compute TCP options for SYN packets. This is not the final
 * network wire format yet.
 *
 * @sk:   socket sending the SYN
 * @skb:  the SYN being built; used for the timestamp value and passed on
 *        to the MPTCP and BPF hooks
 * @opts: filled in with the selected options and their values
 * @key:  signing key (MD5 / AO / none) in use for this segment
 *
 * Returns the number of option bytes the selected options occupy, i.e.
 * MAX_TCP_OPTION_SPACE minus whatever is left over.
 */
static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
                                struct tcp_out_options *opts,
                                struct tcp_key *key)
{
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned int remaining = MAX_TCP_OPTION_SPACE;
        struct tcp_fastopen_request *fastopen = tp->fastopen_req;
        bool timestamps;

        /* Better than switch (key.type) as it has static branches */
        if (tcp_key_is_md5(key)) {
                /* timestamps are not sent on an MD5-signed SYN */
                timestamps = false;
                opts->options |= OPTION_MD5;
                remaining -= TCPOLEN_MD5SIG_ALIGNED;
        } else {
                timestamps = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps);
                if (tcp_key_is_ao(key)) {
                        opts->options |= OPTION_AO;
                        remaining -= tcp_ao_len_aligned(key->ao_key);
                }
        }

        /* We always get an MSS option.  The option bytes which will be seen in
         * normal data packets should timestamps be used, must be in the MSS
         * advertised.  But we subtract them from tp->mss_cache so that
         * calculations in tcp_sendmsg are simpler etc.  So account for this
         * fact here if necessary.  If we don't do this correctly, as a
         * receiver we won't recognize data packets as being full sized when we
         * should, and thus we won't abide by the delayed ACK rules correctly.
         * SACKs don't matter, we never delay an ACK when we have any of those
         * going out.  */
        opts->mss = tcp_advertise_mss(sk);
        remaining -= TCPOLEN_MSS_ALIGNED;

        if (likely(timestamps)) {
                opts->options |= OPTION_TS;
                opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset;
                opts->tsecr = tp->rx_opt.ts_recent;
                remaining -= TCPOLEN_TSTAMP_ALIGNED;
        }
        if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) {
                opts->ws = tp->rx_opt.rcv_wscale;
                opts->options |= OPTION_WSCALE;
                remaining -= TCPOLEN_WSCALE_ALIGNED;
        }
        if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) {
                opts->options |= OPTION_SACK_ADVERTISE;
                /* SACK_PERM only costs space when there is no timestamp
                 * option to share a word with.
                 */
                if (unlikely(!(OPTION_TS & opts->options)))
                        remaining -= TCPOLEN_SACKPERM_ALIGNED;
        }

        if (fastopen && fastopen->cookie.len >= 0) {
                u32 need = fastopen->cookie.len;

                need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
                                               TCPOLEN_FASTOPEN_BASE;
                need = (need + 3) & ~3U;  /* Align to 32 bits */
                if (remaining >= need) {
                        opts->options |= OPTION_FAST_OPEN_COOKIE;
                        opts->fastopen_cookie = &fastopen->cookie;
                        remaining -= need;
                        tp->syn_fastopen = 1;
                        tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
                }
        }

        smc_set_option(tp, opts, &remaining);

        if (sk_is_mptcp(sk)) {
                unsigned int size;

                if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
                        /* 'remaining' is unsigned: subtracting an option
                         * that does not fit would wrap it around and
                         * corrupt the length returned below.  Leave the
                         * MPTCP option out when there is no room for it,
                         * as is done for every other optional option.
                         */
                        if (remaining >= size) {
                                opts->options |= OPTION_MPTCP;
                                remaining -= size;
                        }
                }
        }

        bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);

        return MAX_TCP_OPTION_SPACE - remaining;
}

/* Select the TCP options carried by a SYN-ACK and account for their size.
 *
 * @opts is filled in from the request sock @req, the fixed @mss, the
 * optional Fast Open cookie @foc and the signing @key.  Note that
 * ireq->tstamp_ok may be cleared here when a signing key is in use.
 *
 * Returns the number of option bytes consumed.
 */
static unsigned int tcp_synack_options(const struct sock *sk,
                                       struct request_sock *req,
                                       unsigned int mss, struct sk_buff *skb,
                                       struct tcp_out_options *opts,
                                       const struct tcp_key *key,
                                       struct tcp_fastopen_cookie *foc,
                                       enum tcp_synack_type synack_type,
                                       struct sk_buff *syn_skb)
{
        unsigned int room = MAX_TCP_OPTION_SPACE;
        struct inet_request_sock *ireq = inet_rsk(req);

        if (tcp_key_is_md5(key)) {
                opts->options |= OPTION_MD5;
                room -= TCPOLEN_MD5SIG_ALIGNED;

                /* MD5 + TS leaves no space for SACK blocks, so when SACK
                 * was negotiated the timestamp option is the one dropped.
                 * Dropping SACK instead, to suit old buggy kernels, was
                 * discussed and deemed unnecessary.
                 */
                if (synack_type != TCP_SYNACK_COOKIE)
                        ireq->tstamp_ok &= !ireq->sack_ok;
        } else if (tcp_key_is_ao(key)) {
                opts->options |= OPTION_AO;
                room -= tcp_ao_len_aligned(key->ao_key);
                ireq->tstamp_ok &= !ireq->sack_ok;
        }

        /* The MSS option is always present. */
        opts->mss = mss;
        room -= TCPOLEN_MSS_ALIGNED;

        if (likely(ireq->wscale_ok)) {
                opts->options |= OPTION_WSCALE;
                opts->ws = ireq->rcv_wscale;
                room -= TCPOLEN_WSCALE_ALIGNED;
        }

        if (likely(ireq->tstamp_ok)) {
                opts->options |= OPTION_TS;
                opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) +
                              tcp_rsk(req)->ts_off;
                opts->tsecr = READ_ONCE(req->ts_recent);
                room -= TCPOLEN_TSTAMP_ALIGNED;
        }

        if (likely(ireq->sack_ok)) {
                opts->options |= OPTION_SACK_ADVERTISE;
                /* SACK_PERM needs its own word only without timestamps */
                if (unlikely(!ireq->tstamp_ok))
                        room -= TCPOLEN_SACKPERM_ALIGNED;
        }

        if (foc && foc->len >= 0) {
                u32 need = foc->len;

                if (foc->exp)
                        need += TCPOLEN_EXP_FASTOPEN_BASE;
                else
                        need += TCPOLEN_FASTOPEN_BASE;
                need = (need + 3) & ~3U;  /* round up to a 32-bit boundary */

                if (room >= need) {
                        opts->fastopen_cookie = foc;
                        opts->options |= OPTION_FAST_OPEN_COOKIE;
                        room -= need;
                }
        }

        mptcp_set_option_cond(req, opts, &room);

        smc_set_option_cond(tcp_sk(sk), ireq, opts, &room);

        bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
                              synack_type, opts, &room);

        return MAX_TCP_OPTION_SPACE - room;
}

/* Select the TCP options for a segment on an ESTABLISHED socket; the
 * result is not yet in wire format.
 *
 * opts->options is reset first, then the signing option, timestamps,
 * MPTCP, SACK blocks and BPF-reserved space are accounted for in that
 * order.  @skb may be NULL, in which case tsval is set to 0.
 *
 * Returns the number of option bytes consumed.
 */
static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
                                        struct tcp_out_options *opts,
                                        struct tcp_key *key)
{
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned int used = 0;
        unsigned int sack_cnt;

        opts->options = 0;

        /* if/else chain rather than a switch on the key type: the
         * helpers use static branches
         */
        if (tcp_key_is_md5(key)) {
                opts->options |= OPTION_MD5;
                used += TCPOLEN_MD5SIG_ALIGNED;
        } else if (tcp_key_is_ao(key)) {
                opts->options |= OPTION_AO;
                used += tcp_ao_len_aligned(key->ao_key);
        }

        if (likely(tp->rx_opt.tstamp_ok)) {
                opts->options |= OPTION_TS;
                if (skb)
                        opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) +
                                      tp->tsoffset;
                else
                        opts->tsval = 0;
                opts->tsecr = tp->rx_opt.ts_recent;
                used += TCPOLEN_TSTAMP_ALIGNED;
        }

        /* MPTCP is served before SACK: a missing required multipath
         * option forces the connection to fall back to plain TCP, whereas
         * SACK can make do with whatever space remains afterwards.
         */
        if (sk_is_mptcp(sk)) {
                unsigned int mptcp_len = 0;
                unsigned int room = MAX_TCP_OPTION_SPACE - used;

                if (mptcp_established_options(sk, skb, &mptcp_len, room,
                                              &opts->mptcp)) {
                        opts->options |= OPTION_MPTCP;
                        used += mptcp_len;
                }
        }

        sack_cnt = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
        if (unlikely(sack_cnt)) {
                const unsigned int room = MAX_TCP_OPTION_SPACE - used;
                unsigned int fit;

                /* not even one SACK block fits: stop here */
                if (unlikely(room < TCPOLEN_SACK_BASE_ALIGNED +
                                    TCPOLEN_SACK_PERBLOCK))
                        return used;

                fit = (room - TCPOLEN_SACK_BASE_ALIGNED) /
                      TCPOLEN_SACK_PERBLOCK;
                opts->num_sack_blocks = min_t(unsigned int, sack_cnt, fit);

                used += TCPOLEN_SACK_BASE_ALIGNED +
                        opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
        }

        if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
                                            BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
                unsigned int room = MAX_TCP_OPTION_SPACE - used;

                bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &room);

                used = MAX_TCP_OPTION_SPACE - room;
        }

        return used;
}


/* TCP SMALL QUEUES (TSQ)
 *
 * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
 * to reduce RTT and bufferbloat.
 * We do this using a special skb destructor (tcp_wfree).
 *
 * It's important that tcp_wfree() can be replaced by sock_wfree() in the event skb
 * needs to be reallocated in a driver.
 * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
 *
 * Since transmit from skb destructor is forbidden, we use a tasklet
 * to process all sockets that eventually need to send more skbs.
 * We use one tasklet per cpu, with its own queue of sockets.
 */
/* Per-cpu TSQ state: a tasklet plus the list of sockets waiting for it. */
struct tsq_tasklet {
        struct tasklet_struct        tasklet; /* runs tcp_tasklet_func() */
        struct list_head        head; /* queue of tcp sockets */
};
/* One instance per cpu; set up by tcp_tasklet_init(). */
static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);

/* Try to push more data out for @sk.  Nothing is done unless the socket
 * is in one of the ESTABLISHED, FIN_WAIT1, CLOSING, CLOSE_WAIT or
 * LAST_ACK states.
 */
static void tcp_tsq_write(struct sock *sk)
{
        struct tcp_sock *tp;

        if (!((1 << sk->sk_state) &
              (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
               TCPF_CLOSE_WAIT  | TCPF_LAST_ACK)))
                return;

        tp = tcp_sk(sk);

        /* Retransmit first when some lost segments have not been resent
         * yet and the congestion window still has room.
         */
        if (tp->lost_out > tp->retrans_out &&
            tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) {
                tcp_mstamp_refresh(tp);
                tcp_xmit_retransmit_queue(sk);
        }

        tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
                       0, GFP_ATOMIC);
}

/* Transmit for @sk under the bh socket lock.  If the socket is currently
 * owned by a user context, set TCP_TSQ_DEFERRED instead and take a socket
 * reference when this call is the one that set the bit.
 */
static void tcp_tsq_handler(struct sock *sk)
{
        bh_lock_sock(sk);
        if (sock_owned_by_user(sk)) {
                if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
                        sock_hold(sk);
        } else {
                tcp_tsq_write(sk);
        }
        bh_unlock_sock(sk);
}
/*
 * One tasklet per cpu tries to send more skbs.
 * We run in tasklet context but need to disable irqs when
 * transferring tsq->head because tcp_wfree() might
 * interrupt us (non NAPI drivers)
 *
 * Each queued socket is unlinked, has its TSQ_QUEUED bit cleared, is
 * handed to tcp_tsq_handler() and finally has the reference kept by
 * tcp_wfree() dropped with sk_free().
 */
static void tcp_tasklet_func(struct tasklet_struct *t)
{
        struct tsq_tasklet *tsq = from_tasklet(tsq,  t, tasklet);
        LIST_HEAD(list);
        unsigned long flags;
        struct list_head *q, *n;
        struct tcp_sock *tp;
        struct sock *sk;

        /* Move the whole pending queue onto a private list so that it can
         * be walked with irqs enabled.
         */
        local_irq_save(flags);
        list_splice_init(&tsq->head, &list);
        local_irq_restore(flags);

        list_for_each_safe(q, n, &list) {
                tp = list_entry(q, struct tcp_sock, tsq_node);
                list_del(&tp->tsq_node);

                sk = (struct sock *)tp;
                /* barrier placed before the bit clear, which comes after
                 * the list_del() above
                 */
                smp_mb__before_atomic();
                clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);

                tcp_tsq_handler(sk);
                sk_free(sk);
        }
}

/* Every sk_tsq_flags bit that tcp_release_cb() consumes. */
#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |                \
                          TCPF_WRITE_TIMER_DEFERRED |        \
                          TCPF_DELACK_TIMER_DEFERRED |        \
                          TCPF_MTU_REDUCED_DEFERRED |        \
                          TCPF_ACK_DEFERRED)
/**
 * tcp_release_cb - tcp release_sock() callback
 * @sk: socket
 *
 * called from release_sock() to perform protocol dependent
 * actions before socket release.
 *
 * All TCP_DEFERRED_ALL bits are cleared from sk->sk_tsq_flags in a single
 * cmpxchg loop; the work for each bit that was found set is then run.
 */
void tcp_release_cb(struct sock *sk)
{
        unsigned long flags = smp_load_acquire(&sk->sk_tsq_flags);
        unsigned long nflags;

        /* perform an atomic operation only if at least one flag is set */
        do {
                if (!(flags & TCP_DEFERRED_ALL))
                        return;
                nflags = flags & ~TCP_DEFERRED_ALL;
        } while (!try_cmpxchg(&sk->sk_tsq_flags, &flags, nflags));

        /* 'flags' now holds the value seen just before the bits were
         * cleared, so it tells which deferred actions are owed.
         */
        if (flags & TCPF_TSQ_DEFERRED) {
                tcp_tsq_write(sk);
                /* pairs with the sock_hold() in tcp_tsq_handler() */
                __sock_put(sk);
        }

        /* NOTE(review): the __sock_put() calls below presumably pair with
         * a reference taken wherever the matching flag was set; those
         * setters are not in this part of the file - confirm.
         */
        if (flags & TCPF_WRITE_TIMER_DEFERRED) {
                tcp_write_timer_handler(sk);
                __sock_put(sk);
        }
        if (flags & TCPF_DELACK_TIMER_DEFERRED) {
                tcp_delack_timer_handler(sk);
                __sock_put(sk);
        }
        if (flags & TCPF_MTU_REDUCED_DEFERRED) {
                inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
                __sock_put(sk);
        }
        /* no reference is dropped for the deferred ACK case */
        if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
                tcp_send_ack(sk);
}
EXPORT_SYMBOL(tcp_release_cb);

/* Boot-time setup of the per-cpu TSQ state: give every possible cpu an
 * empty socket queue and a tasklet bound to tcp_tasklet_func().
 */
void __init tcp_tasklet_init(void)
{
        struct tsq_tasklet *tsq;
        int cpu;

        for_each_possible_cpu(cpu) {
                tsq = &per_cpu(tsq_tasklet, cpu);

                INIT_LIST_HEAD(&tsq->head);
                tasklet_setup(&tsq->tasklet, tcp_tasklet_func);
        }
}

/*
 * Write buffer destructor automatically called from kfree_skb.
 * We can't xmit new skbs from this context, as we might already
 * hold qdisc lock.
 *
 * Instead, a throttled socket is put on this cpu's TSQ list and the
 * tasklet is scheduled to do the transmit later.
 */
void tcp_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned long flags, nval, oval;
        struct tsq_tasklet *tsq;
        bool empty;

        /* Keep one reference on sk_wmem_alloc.
         * Will be released by sk_free() from here or tcp_tasklet_func()
         */
        WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));

        /* If this softirq is serviced by ksoftirqd, we are likely under stress.
         * Wait until our queues (qdisc + devices) are drained.
         * This gives :
         * - less callbacks to tcp_write_xmit(), reducing stress (batches)
         * - chance for incoming ACK (processed by another cpu maybe)
         *   to migrate this flow (skb->ooo_okay will be eventually set)
         */
        if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
                goto out;

        /* Atomically trade TSQF_THROTTLED for TSQF_QUEUED.  Bail out when
         * the socket is not throttled or is already queued.
         */
        oval = smp_load_acquire(&sk->sk_tsq_flags);
        do {
                if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
                        goto out;

                nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
        } while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval));

        /* queue this socket to tasklet queue */
        local_irq_save(flags);
        tsq = this_cpu_ptr(&tsq_tasklet);
        empty = list_empty(&tsq->head);
        list_add(&tp->tsq_node, &tsq->head);
        /* the tasklet is only scheduled when the list was empty before */
        if (empty)
                tasklet_schedule(&tsq->tasklet);
        local_irq_restore(flags);
        /* the kept reference is released by tcp_tasklet_func() */
        return;
out:
        sk_free(sk);
}

/* Pacing hrtimer callback.
 * Note: Called under soft irq, so the TCP stack can be entered right
 * away unless the socket is owned by user; tcp_tsq_handler() deals with
 * both cases.  One socket reference is dropped afterwards and the timer
 * is not restarted.
 */
enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
{
        struct sock *sk;

        sk = (struct sock *)container_of(timer, struct tcp_sock, pacing_timer);

        tcp_tsq_handler(sk);
        sock_put(sk);

        return HRTIMER_NORESTART;
}

/* Post-transmit bookkeeping for @skb: when pacing applies, advance
 * tp->tcp_wstamp_ns by the time @skb takes at the current pacing rate,
 * then move @skb to the tail of the time-sorted sent queue.
 *
 * @prior_wstamp: value tp->tcp_wstamp_ns had before this transmission
 */
static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
                                      u64 prior_wstamp)
{
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned long rate;
        u64 xmit_ns, credit;

        if (sk->sk_pacing_status == SK_PACING_NONE)
                goto requeue;

        rate = READ_ONCE(sk->sk_pacing_rate);

        /* Original sch_fq does not pace first 10 MSS.
         * tp->data_segs_out overflowing after 2^32 packets is only a
         * minor annoyance.
         */
        if (rate == ~0UL || !rate || tp->data_segs_out < 10)
                goto requeue;

        xmit_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
        credit = tp->tcp_wstamp_ns - prior_wstamp;

        /* allow for OS jitter: forgive up to half of the interval */
        xmit_ns -= min_t(u64, xmit_ns / 2, credit);
        tp->tcp_wstamp_ns += xmit_ns;

requeue:
        list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
}

/* Prototypes for functions named as INDIRECT_CALL_INET() candidates in
 * __tcp_transmit_skb() below.
 */
INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));

/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
                              int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct inet_sock *inet;
        struct tcp_sock *tp;
        struct tcp_skb_cb *tcb;
        struct tcp_out_options opts;
        unsigned int tcp_options_size, tcp_header_size;
        struct sk_buff *oskb = NULL;
        struct tcp_key key;
        struct tcphdr *th;
        u64 prior_wstamp;
        int err;

        BUG_ON(!skb || !tcp_skb_pcount(skb));
        tp = tcp_sk(sk);
        prior_wstamp = tp->tcp_wstamp_ns;
        tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
        skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC);
        if (clone_it) {
                oskb = skb;

                tcp_skb_tsorted_save(oskb) {
                        if (unlikely(skb_cloned(oskb)))
                                skb = pskb_copy(oskb, gfp_mask);
                        else
                                skb = skb_clone(oskb, gfp_mask);
                } tcp_skb_tsorted_restore(oskb);

                if (unlikely(!skb))
                        return -ENOBUFS;
                /* retransmit skbs might have a non zero value in skb->dev
                 * because skb->dev is aliased with skb->rbnode.rb_left
                 */
                skb->dev = NULL;
        }

        inet = inet_sk(sk);
        tcb = TCP_SKB_CB(skb);
        memset(&opts, 0, sizeof(opts));

        tcp_get_current_key(sk, &key);
        if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
                tcp_options_size = tcp_syn_options(sk, skb, &opts, &key);
        } else {
                tcp_options_size = tcp_established_options(sk, skb, &opts, &key);
                /* Force a PSH flag on all (GSO) packets to expedite GRO flush
                 * at receiver : This slightly improve GRO performance.
                 * Note that we do not force the PSH flag for non GSO packets,
                 * because they might be sent under high congestion events,
                 * and in this case it is better to delay the delivery of 1-MSS
                 * packets and thus the corresponding ACK packet that would
                 * release the following packet.
                 */
                if (tcp_skb_pcount(skb) > 1)
                        tcb->tcp_flags |= TCPHDR_PSH;
        }
        tcp_header_size = tcp_options_size + sizeof(struct tcphdr);

        /* We set skb->ooo_okay to one if this packet can select
         * a different TX queue than prior packets of this flow,
         * to avoid self inflicted reorders.
         * The 'other' queue decision is based on current cpu number
         * if XPS is enabled, or sk->sk_txhash otherwise.
         * We can switch to another (and better) queue if:
         * 1) No packet with payload is in qdisc/device queues.
         *    Delays in TX completion can defeat the test
         *    even if packets were already sent.
         * 2) Or rtx queue is empty.
         *    This mitigates above case if ACK packets for
         *    all prior packets were already processed.
         */
        skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) ||
                        tcp_rtx_queue_empty(sk);

        /* If we had to use memory reserve to allocate this skb,
         * this might cause drops if packet is looped back :
         * Other socket might not have SOCK_MEMALLOC.
         * Packets not looped back do not care about pfmemalloc.
         */
        skb->pfmemalloc = 0;

        skb_push(skb, tcp_header_size);
        skb_reset_transport_header(skb);

        skb_orphan(skb);
        skb->sk = sk;
        skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
        refcount_add(skb->truesize, &sk->sk_wmem_alloc);

        skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm));

        /* Build TCP header and checksum it. */
        th = (struct tcphdr *)skb->data;
        th->source                = inet->inet_sport;
        th->dest                = inet->inet_dport;
        th->seq                        = htonl(tcb->seq);
        th->ack_seq                = htonl(rcv_nxt);
        *(((__be16 *)th) + 6)        = htons(((tcp_header_size >> 2) << 12) |
                                        tcb->tcp_flags);

        th->check                = 0;
        th->urg_ptr                = 0;

        /* The urg_mode check is necessary during a below snd_una win probe */
        if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
                if (before(tp->snd_up, tcb->seq + 0x10000)) {
                        th->urg_ptr = htons(tp->snd_up - tcb->seq);
                        th->urg = 1;
                } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
                        th->urg_ptr = htons(0xFFFF);
                        th->urg = 1;
                }
        }

        skb_shinfo(skb)->gso_type = sk->sk_gso_type;
        if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
                th->window      = htons(tcp_select_window(sk));
                tcp_ecn_send(sk, skb, th, tcp_header_size);
        } else {
                /* RFC1323: The window in SYN & SYN/ACK segments
                 * is never scaled.
                 */
                th->window        = htons(min(tp->rcv_wnd, 65535U));
        }

        tcp_options_write(th, tp, NULL, &opts, &key);

        if (tcp_key_is_md5(&key)) {
#ifdef CONFIG_TCP_MD5SIG
                /* Calculate the MD5 hash, as we have all we need now */
                sk_gso_disable(sk);
                tp->af_specific->calc_md5_hash(opts.hash_location,
                                               key.md5_key, sk, skb);
#endif
        } else if (tcp_key_is_ao(&key)) {
                int err;

                err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th,
                                          opts.hash_location);
                if (err) {
                        kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
                        return -ENOMEM;
                }
        }

        /* BPF prog is the last one writing header option */
        bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);

        INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
                           tcp_v6_send_check, tcp_v4_send_check,
                           sk, skb);

        if (likely(tcb->tcp_flags & TCPHDR_ACK))
                tcp_event_ack_sent(sk, rcv_nxt);

        if (skb->len != tcp_header_size) {
                tcp_event_data_sent(tp, sk);
                tp->data_segs_out += tcp_skb_pcount(skb);
                tp->bytes_sent += skb->len - tcp_header_size;
        }

        if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
                TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
                              tcp_skb_pcount(skb));

        tp->segs_out += tcp_skb_pcount(skb);
        skb_set_hash_from_sk(skb, sk);
        /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
        skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
        skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);

        /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */

        /* Cleanup our debris for IP stacks */
        memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
                               sizeof(struct inet6_skb_parm)));

        tcp_add_tx_delay(skb, tp);

        err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
                                 inet6_csk_xmit, ip_queue_xmit,
                                 sk, skb, &inet->cork.fl);

        if (unlikely(err > 0)) {
                tcp_enter_cwr(sk);
                err = net_xmit_eval(err);
        }
        if (!err && oskb) {
                tcp_update_skb_after_send(sk, oskb, prior_wstamp);
                tcp_rate_skb_sent(sk, oskb);
        }
        return err;
}

/* Convenience wrapper around __tcp_transmit_skb() that supplies the
 * socket's current rcv_nxt as the final argument.
 *
 * Returns whatever __tcp_transmit_skb() returns.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                            gfp_t gfp_mask)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, tp->rcv_nxt);
}

/* This routine just queues the buffer for sending.
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Advance write_seq and place onto the write_queue. */
        WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
        __skb_header_release(skb);
        tcp_add_write_queue_tail(sk, skb);
        sk_wmem_queued_add(sk, skb->truesize);
        sk_mem_charge(sk, skb->truesize);
}

/* Set the TSO segment count and gso size of @skb for the given @mss_now.
 *
 * An skb no longer than one MSS gets a count of 1 and a gso size of 0,
 * which avoids the costly divide in the normal non-TSO case.
 *
 * Returns the segment count that was stored.
 */
static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
        unsigned int gso_size = 0;
        int pcount = 1;

        if (skb->len > mss_now) {
                gso_size = mss_now;
                pcount = DIV_ROUND_UP(skb->len, mss_now);
        }

        TCP_SKB_CB(skb)->tcp_gso_size = gso_size;
        tcp_skb_pcount_set(skb, pcount);
        return pcount;
}

/* Pcount in the middle of the write queue got changed, we need to do various
 * tweaks to fix counters
 */
static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
{
        struct tcp_sock *tp = tcp_sk(sk);

        tp->packets_out -= decr;

        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
                tp->sacked_out -= decr;
        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
                tp->retrans_out -= decr;
        if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
                tp->lost_out -= decr;

        /* Reno case is special. Sigh... */
        if (tcp_is_reno(tp) && decr > 0)
                tp->sacked_out -= min_t(u32, tp->sacked_out, decr);

        if (tp->lost_skb_hint &&
            before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
            (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
                tp->lost_cnt_hint -= decr;

        tcp_verify_left_out(tp);
}

static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
{
        return TCP_SKB_CB(skb)->txstamp_ack ||
                (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
}

/* After splitting @skb, hand its tx timestamp request over to @skb2 when
 * the timestamp key is not before the first sequence number of @skb2.
 */
static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
{
        struct skb_shared_info *shinfo = skb_shinfo(skb);
        struct skb_shared_info *shinfo2;
        u8 tsflags;

        if (likely(!tcp_has_tx_tstamp(skb)))
                return;

        /* The key still falls inside the first fragment: nothing to move. */
        if (before(shinfo->tskey, TCP_SKB_CB(skb2)->seq))
                return;

        shinfo2 = skb_shinfo(skb2);
        tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;

        /* Move the timestamp flags and swap the keys between fragments. */
        shinfo->tx_flags &= ~tsflags;
        shinfo2->tx_flags |= tsflags;
        swap(shinfo->tskey, shinfo2->tskey);

        TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
        TCP_SKB_CB(skb)->txstamp_ack = 0;
}

static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
{
        TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
        TCP_SKB_CB(skb)->eor = 0;
}

/* Insert @buff after @skb on the write or rtx queue of @sk, as selected
 * by @tcp_queue. For the rtx queue the insertion goes through the rbtree
 * helper and @skb is not used.
 */
static void tcp_insert_write_queue_after(struct sk_buff *skb,
                                         struct sk_buff *buff,
                                         struct sock *sk,
                                         enum tcp_queue tcp_queue)
{
        if (tcp_queue != TCP_FRAG_IN_WRITE_QUEUE) {
                tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
                return;
        }

        __skb_queue_after(&sk->sk_write_queue, skb, buff);
}

/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 *
 * @sk:        socket owning @skb
 * @tcp_queue: queue @skb sits on; selects how the new segment is linked in
 * @skb:       segment to shrink to @len bytes
 * @len:       number of bytes kept in @skb; must not exceed skb->len
 * @mss_now:   MSS used to recompute the segment count of both halves
 * @gfp:       allocation flags for uncloning @skb and allocating the new skb
 *
 * Returns 0 on success, -EINVAL if @len is larger than skb->len, or -ENOMEM
 * if the queued-memory limit check fails, @skb cannot be uncloned, or the
 * new skb cannot be allocated.
 */
int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
                 struct sk_buff *skb, u32 len,
                 unsigned int mss_now, gfp_t gfp)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *buff;
        int old_factor;
        long limit;
        int nlen;
        u8 flags;

        if (WARN_ON(len > skb->len))
                return -EINVAL;

        DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));

        /* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
         * We need some allowance to not penalize applications setting small
         * SO_SNDBUF values.
         * Also allow first and last skb in retransmit queue to be split.
         */
        limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE);
        if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
                     tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
                     skb != tcp_rtx_queue_head(sk) &&
                     skb != tcp_rtx_queue_tail(sk))) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
                return -ENOMEM;
        }

        if (skb_unclone_keeptruesize(skb, gfp))
                return -ENOMEM;

        /* Get a new skb... force flag on. */
        buff = tcp_stream_alloc_skb(sk, gfp, true);
        if (!buff)
                return -ENOMEM; /* We'll just try again later. */
        skb_copy_decrypted(buff, skb);
        mptcp_skb_ext_copy(buff, skb);

        /* Charge the new skb to the socket, then shift the truesize of the
         * nlen bytes that move from skb to buff.
         */
        sk_wmem_queued_add(sk, buff->truesize);
        sk_mem_charge(sk, buff->truesize);
        nlen = skb->len - len;
        buff->truesize += nlen;
        skb->truesize -= nlen;

        /* Correct the sequence numbers. */
        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
        TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

        /* PSH and FIN should only be set in the second packet. */
        flags = TCP_SKB_CB(skb)->tcp_flags;
        TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
        TCP_SKB_CB(buff)->tcp_flags = flags;
        /* The new segment starts with the same sacked flags as the original. */
        TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
        tcp_skb_fragment_eor(skb, buff);

        skb_split(skb, buff, len);

        /* buff takes over skb's tstamp; a pending tx timestamp request may
         * move to buff as well.
         */
        skb_set_delivery_time(buff, skb->tstamp, SKB_CLOCK_MONOTONIC);
        tcp_fragment_tstamp(skb, buff);

        /* Remember the segment count before it is recomputed below. */
        old_factor = tcp_skb_pcount(skb);

        /* Fix up tso_factor for both original and new SKB.  */
        tcp_set_skb_tso_segs(skb, mss_now);
        tcp_set_skb_tso_segs(buff, mss_now);

        /* Update delivered info for the new segment */
        TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;

        /* If this packet has been sent out already, we must
         * adjust the various packet counters.
         */
        if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
                /* diff is the old count minus the sum of the two new counts. */
                int diff = old_factor - tcp_skb_pcount(skb) -
                        tcp_skb_pcount(buff);

                if (diff)
                        tcp_adjust_pcount(sk, skb, diff);
        }

        /* Link BUFF into the send queue. */
        __skb_header_release(buff);
        tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
        /* On the rtx queue, buff is also added to the list at skb's
         * tcp_tsorted_anchor.
         */
        if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
                list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);

        return 0;
}

/* This is similar to __pskb_pull_tail(). The difference is that pulled
 * data is not copied, but immediately discarded.
 *
 * Drops the first @len bytes from the page fragments of @skb: fragments
 * that are entirely consumed are unreferenced, the first surviving one is
 * advanced past the remaining bytes, and the survivors are compacted to
 * the front of the frags array.
 *
 * Returns @len.
 */
static int __pskb_trim_head(struct sk_buff *skb, int len)
{
        struct skb_shared_info *shinfo;
        int remaining = len;
        int kept = 0;
        int idx;

        DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));
        shinfo = skb_shinfo(skb);

        for (idx = 0; idx < shinfo->nr_frags; idx++) {
                int size = skb_frag_size(&shinfo->frags[idx]);

                /* Fragment lies entirely inside the trimmed range. */
                if (size <= remaining) {
                        skb_frag_unref(skb, idx);
                        remaining -= size;
                        continue;
                }

                /* Surviving fragment: slide it down to the next free slot. */
                shinfo->frags[kept] = shinfo->frags[idx];
                if (remaining) {
                        skb_frag_off_add(&shinfo->frags[kept], remaining);
                        skb_frag_size_sub(&shinfo->frags[kept], remaining);
                        remaining = 0;
                }
                kept++;
        }
        shinfo->nr_frags = kept;

        skb->data_len -= len;
        skb->len = skb->data_len;
        return len;
}

/* Remove acked data from a packet in the transmit queue.
 *
 * Trims @len bytes from the head of @skb, advances its starting sequence
 * number and returns the trimmed bytes to the socket memory accounting.
 *
 * Returns 0 on success or -ENOMEM if @skb could not be uncloned.
 */
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
        u32 trimmed;

        if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
                return -ENOMEM;

        trimmed = __pskb_trim_head(skb, len);

        TCP_SKB_CB(skb)->seq += len;

        /* Release the accounting for the bytes that were dropped. */
        skb->truesize -= trimmed;
        sk_wmem_queued_add(sk, -trimmed);
        if (!skb_zcopy_pure(skb))
                sk_mem_uncharge(sk, trimmed);

        /* Any change of skb->len requires recalculation of tso factor. */
        if (tcp_skb_pcount(skb) > 1)
                tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));

        return 0;
}

/* Calculate the MSS for a given path MTU, not accounting for any TCP
 * options. The result is clamped to mss_clamp, reduced by the extension
 * header length, and floored at the tcp_min_snd_mss sysctl.
 */
static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        const struct tcp_sock *tp = tcp_sk(sk);
        int mss;

        /* Base mss without TCP options:
         * it is MMS_S - sizeof(tcphdr) of rfc1122.
         */
        mss = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);

        /* Clamp it (mss_clamp does not include tcp options) */
        if (mss > tp->rx_opt.mss_clamp)
                mss = tp->rx_opt.mss_clamp;

        /* Now subtract optional transport overhead */
        mss -= icsk->icsk_ext_hdr_len;

        /* Never return less than the configured minimum sending MSS. */
        return max(mss,
                   READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss));
}

/* Calculate MSS. Not accounting for SACKs here.  */
int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
        /* Subtract TCP options size, not including SACKs */
        return __tcp_mtu_to_mss(sk, pmtu) -
               (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
}
EXPORT_SYMBOL(tcp_mtu_to_mss);

/* Inverse of tcp_mtu_to_mss(): add the TCP header (with its options), the
 * extension header and the network header lengths back onto @mss.
 */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        const struct tcp_sock *tp = tcp_sk(sk);
        int mtu = mss;

        mtu += tp->tcp_header_len;
        mtu += icsk->icsk_ext_hdr_len;
        mtu += icsk->icsk_af_ops->net_header_len;

        return mtu;
}
EXPORT_SYMBOL(tcp_mss_to_mtu);

/* Per-socket MTU probing initialisation: set the enabled flag from the
 * tcp_mtu_probing sysctl, reset the probe size and establish the initial
 * search range.
 */
void tcp_mtup_init(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        int base_mss;

        icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1;
        icsk->icsk_mtup.probe_size = 0;

        /* Upper bound: mss_clamp plus the bare TCP and network headers. */
        icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
                                      sizeof(struct tcphdr) +
                                      icsk->icsk_af_ops->net_header_len;

        /* Lower bound: the MTU corresponding to the tcp_base_mss sysctl. */
        base_mss = READ_ONCE(net->ipv4.sysctl_tcp_base_mss);
        icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, base_mss);

        if (icsk->icsk_mtup.enabled)
                icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
}
EXPORT_SYMBOL(tcp_mtup_init);

/* Synchronize the sending MSS with the current pmtu/exthdr settings.
 *
 * tp->rx_opt.user_mss is the MSS set by the user via TCP_MAXSEG. It does
 * NOT account for TCP options; it covers only the bare TCP header.
 *
 * tp->rx_opt.mss_clamp is the MSS negotiated at connection setup: the
 * minimum of user_mss and the MSS received with the SYN. It does not
 * include TCP options either.
 *
 * inet_csk(sk)->icsk_pmtu_cookie is the last pmtu seen by this function.
 *
 * tp->mss_cache is the current effective sending MSS, including all TCP
 * options except SACKs. It is evaluated taking the current pmtu into
 * account, but never exceeds tp->rx_opt.mss_clamp.
 *
 * NOTE1: rfc1122 clearly states that the advertised MSS DOES NOT include
 * either TCP or IP options.
 *
 * NOTE2: inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache are READ ONLY
 * outside this function.                --ANK (980731)
 *
 * Returns the value stored in tp->mss_cache.
 */
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int mss;

        /* The probing upper bound is lowered to the new pmtu if needed. */
        if (icsk->icsk_mtup.search_high > pmtu)
                icsk->icsk_mtup.search_high = pmtu;

        mss = tcp_bound_to_half_wnd(tp, tcp_mtu_to_mss(sk, pmtu));

        /* And store cached results */
        icsk->icsk_pmtu_cookie = pmtu;
        if (icsk->icsk_mtup.enabled) {
                /* While probing, stay at or below the MSS of search_low. */
                int low_mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low);

                if (low_mss < mss)
                        mss = low_mss;
        }
        tp->mss_cache = mss;

        return mss;
}
EXPORT_SYMBOL(tcp_sync_mss);

/* Compute the current effective MSS, taking SACKs and IP options,
 * and even PMTU discovery events into account.
 *
 * Starts from tp->mss_cache, resynchronizes it when the route MTU differs
 * from the cached pmtu cookie, then corrects for the difference between
 * the actual header length and tp->tcp_header_len.
 */
unsigned int tcp_current_mss(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        const struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_out_options opts;
        unsigned int header_len;
        struct tcp_key key;
        u32 mss_now = tp->mss_cache;

        if (dst) {
                u32 mtu = dst_mtu(dst);

                if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
                        mss_now = tcp_sync_mss(sk, mtu);
        }

        tcp_get_current_key(sk, &key);
        header_len = tcp_established_options(sk, NULL, &opts, &key) +
                     sizeof(struct tcphdr);

        /* The mss_cache is sized based on tp->tcp_header_len, which assumes
         * some common options. If this is an odd packet (because we have SACK
         * blocks etc) then our calculated header_len will be different, and
         * we have to adjust mss_now correspondingly.
         */
        if (header_len == tp->tcp_header_len)
                return mss_now;

        return mss_now - ((int) header_len - tp->tcp_header_len);
}

/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
 * As additional protections, we do not touch cwnd in retransmission phases,
 * and if application hit its sndbuf limit recently.
 */
static void tcp_cwnd_application_limited(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
            sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
                /* Limited by application or receiver window. */
                u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
                u32 win_used = max(tp->snd_cwnd_used, init_win);
                if (win_used < tcp_snd_cwnd(tp)) {
                        tp->snd_ssthresh = tcp_current_ssthresh(sk);
                        tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1);
                }
                tp->snd_cwnd_used = 0;
        }
        tp->snd_cwnd_stamp = tcp_jiffies32;
}

/* Record how fully the congestion window is being used and, when it is
 * not fully used, apply the application-limited and sndbuf-limited
 * handling below.
 *
 * @sk:              socket being validated
 * @is_cwnd_limited: caller's verdict on whether sending was limited by cwnd
 */
static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
{
        const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
        struct tcp_sock *tp = tcp_sk(sk);

        /* Track the strongest available signal of the degree to which the cwnd
         * is fully utilized. If cwnd-limited then remember that fact for the
         * current window. If not cwnd-limited then track the maximum number of
         * outstanding packets in the current window. (If cwnd-limited then we
         * chose to not update tp->max_packets_out to avoid an extra else
         * clause with no functional impact.)
         */
        if (!before(tp->snd_una, tp->cwnd_usage_seq) ||
            is_cwnd_limited ||
            (!tp->is_cwnd_limited &&
             tp->packets_out > tp->max_packets_out)) {
                tp->is_cwnd_limited = is_cwnd_limited;
                tp->max_packets_out = tp->packets_out;
                tp->cwnd_usage_seq = tp->snd_nxt;
        }

        if (tcp_is_cwnd_limited(sk)) {
                /* Network is fed fully. */
                tp->snd_cwnd_used = 0;
                tp->snd_cwnd_stamp = tcp_jiffies32;
        } else {
                /* Network starves. */
                if (tp->packets_out > tp->snd_cwnd_used)
                        tp->snd_cwnd_used = tp->packets_out;

                /* With slow-start-after-idle enabled, shrink cwnd once at
                 * least icsk_rto has elapsed since snd_cwnd_stamp, unless the
                 * congestion control module provides cong_control.
                 */
                if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) &&
                    (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
                    !ca_ops->cong_control)
                        tcp_cwnd_application_limited(sk);

                /* The following conditions together indicate the starvation
                 * is caused by insufficient sender buffer:
                 * 1) just sent some data (see tcp_write_xmit)
                 * 2) not cwnd limited (this else condition)
                 * 3) no more data to send (tcp_write_queue_empty())
                 * 4) application is hitting buffer limit (SOCK_NOSPACE)
                 */
                if (tcp_write_queue_empty(sk) && sk->sk_socket &&
                    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
                    (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
                        tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
        }
}

/* Minshall's variant of the Nagle send check. */
static bool tcp_minshall_check(const struct tcp_sock *tp)
{
        return after(tp->snd_sml, tp->snd_una) &&
                !after(tp->snd_sml, tp->snd_nxt);
}

/* Update snd_sml if this skb is under mss.
 * Note that a TSO packet might end with a sub-mss segment.
 * The test is really:
 *      if ((skb->len % mss) != 0)
 *              tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
 * but the divide can be avoided because the segment count is already
 * known: the skb is shorter than pcount full-sized segments exactly when
 * its last segment is partial.
 */
static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
                                const struct sk_buff *skb)
{
        unsigned int full_len = tcp_skb_pcount(skb) * mss_now;

        if (skb->len >= full_len)
                return;

        tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
}

/* Return false if the packet can be sent now without violating Nagle's
 * rules:
 * 1. It is full sized. (provided by caller in %partial bool)
 * 2. Or it contains FIN. (already checked by caller)
 * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
 *    With Minshall's modification: all sent small packets are ACKed.
 */
static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
                            int nonagle)
{
        /* Full-sized packets are never held back. */
        if (!partial)
                return false;

        /* Corked: hold every partial packet. */
        if (nonagle & TCP_NAGLE_CORK)
                return true;

        return !nonagle && tp->packets_out && tcp_minshall_check(tp);
}

/* Return how many segs we'd like on a TSO packet,
 * depending on current pacing rate, and how close the peer is.
 *
 * Rationale is:
 * - For close peers, we rather send bigger packets to reduce
 *   cpu costs, because occasional losses will be repaired fast.
 * - For long distance/rtt flows, we would like to get ACK clocking
 *   with 1 ACK per ms.
 *
 * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
 * in bigger TSO bursts. We cut the RTT-based allowance in half
 * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
 * is below 1500 bytes after 6 * ~500 usec = 3ms.
 *
 * The result is never smaller than @min_tso_segs.
 */
static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
                            int min_tso_segs)
{
        unsigned long budget;
        u32 rtt_shift;
        u32 segs;

        /* Allowance derived from the pacing rate. */
        budget = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift);

        /* Allowance derived from min_rtt; skipped when the shift count
         * would reach the bit width of sk_gso_max_size.
         */
        rtt_shift = tcp_min_rtt(tcp_sk(sk)) >>
                    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log);
        if (rtt_shift < BITS_PER_TYPE(sk->sk_gso_max_size))
                budget += sk->sk_gso_max_size >> rtt_shift;

        budget = min_t(unsigned long, budget, sk->sk_gso_max_size);

        segs = budget / mss_now;
        return max_t(u32, segs, min_tso_segs);
}

/* Return the number of segments we want in the skb we are transmitting.
 * The minimum comes from the congestion control module if it provides
 * min_tso_segs(), otherwise from the tcp_min_tso_segs sysctl; the
 * autosized result is capped at sk_gso_max_segs.
 */
static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
{
        const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
        u32 floor_segs;
        u32 wanted;

        if (ca_ops->min_tso_segs)
                floor_segs = ca_ops->min_tso_segs(sk);
        else
                floor_segs = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);

        wanted = tcp_tso_autosize(sk, mss_now, floor_segs);
        return min_t(u32, wanted, sk->sk_gso_max_segs);
}

/* Returns the portion of skb which can be sent right away, bounded by
 * @max_segs full segments, the send window and the skb length.
 */
static unsigned int tcp_mss_split_point(const struct sock *sk,
                                        const struct sk_buff *skb,
                                        unsigned int mss_now,
                                        unsigned int max_segs,
                                        int nonagle)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        u32 window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
        u32 max_len = mss_now * max_segs;
        u32 sendable, tail_bytes;

        /* Common case: not the queue tail and the window is not the limit. */
        if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
                return max_len;

        sendable = min(skb->len, window);
        if (max_len <= sendable)
                return max_len;

        /* If last segment is not a full MSS, check if Nagle rules allow us
         * to include this last segment in this skb.
         * Otherwise, we'll split the skb at last MSS boundary
         */
        tail_bytes = sendable % mss_now;
        if (tcp_nagle_check(tail_bytes != 0, tp, nonagle))
                sendable -= tail_bytes;

        return sendable;
}

/* Can at least one segment of SKB be sent right now, according to the
 * congestion window rules?  If so, return how many segments are allowed;
 * otherwise return 0.
 */
static u32 tcp_cwnd_test(const struct tcp_sock *tp)
{
        u32 in_flight = tcp_packets_in_flight(tp);
        u32 cwnd = tcp_snd_cwnd(tp);
        u32 room, half;

        if (cwnd <= in_flight)
                return 0;

        room = cwnd - in_flight;

        /* For better scheduling, ensure we have at least
         * 2 GSO packets in flight.
         */
        half = max(cwnd >> 1, 1U);
        return min(half, room);
}

/* Initialize TSO state of a skb.
 * This must be invoked the first time we consider transmitting
 * SKB onto the wire.
 */
static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
        int tso_segs = tcp_skb_pcount(skb);

        if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now))
                return tcp_set_skb_tso_segs(skb, mss_now);

        return tso_segs;
}


/* Return true if the Nagle test allows this packet to be
 * sent now.
 */
static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
                                  unsigned int cur_mss, int nonagle)
{
        /* Nagle rule does not apply to frames, which sit in the middle of the
         * write_queue (they have no chances to get new data).
         *
         * This is implemented in the callers, where they modify the 'nonagle'
         * argument based upon the location of SKB in the send queue.
         */
        if (nonagle & TCP_NAGLE_PUSH)
                return true;

        /* Don't use the nagle rule for urgent data... */
        if (tcp_urg_mode(tp))
                return true;

        /* ...or for the final FIN. */
        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                return true;

        return !tcp_nagle_check(skb->len < cur_mss, tp, nonagle);
}

/* Does at least the first segment of SKB fit into the send window? */
static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
                             const struct sk_buff *skb,
                             unsigned int cur_mss)
{
        u32 end_seq = TCP_SKB_CB(skb)->end_seq;

        if (skb->len > cur_mss)
                end_seq = TCP_SKB_CB(skb)->seq + cur_mss;

        return !after(end_seq, tcp_wnd_end(tp));
}

/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
 * which is put after SKB on the list.  It is very much like
 * tcp_fragment() except that it may make several kinds of assumptions
 * in order to speed up the splitting operation.  In particular, we
 * know that all the data is in scatter-gather pages, and that the
 * packet has never been sent out before (and thus is not cloned).
 *
 * @sk:      socket owning @skb
 * @skb:     segment to trim to @len bytes
 * @len:     number of bytes kept in @skb
 * @mss_now: MSS used to recompute the segment count of both halves
 * @gfp:     allocation flags for the new skb
 *
 * Returns 0 on success or -ENOMEM if the new skb cannot be allocated.
 */
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
                        unsigned int mss_now, gfp_t gfp)
{
        /* Number of bytes that move from skb into the new buffer. */
        int nlen = skb->len - len;
        struct sk_buff *buff;
        u8 flags;

        /* All of a TSO frame must be composed of paged data.  */
        DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len);

        buff = tcp_stream_alloc_skb(sk, gfp, true);
        if (unlikely(!buff))
                return -ENOMEM;
        skb_copy_decrypted(buff, skb);
        mptcp_skb_ext_copy(buff, skb);

        /* Charge the new skb to the socket, then shift the truesize of the
         * moved bytes from skb to buff.
         */
        sk_wmem_queued_add(sk, buff->truesize);
        sk_mem_charge(sk, buff->truesize);
        buff->truesize += nlen;
        skb->truesize -= nlen;

        /* Correct the sequence numbers. */
        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
        TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

        /* PSH and FIN should only be set in the second packet. */
        flags = TCP_SKB_CB(skb)->tcp_flags;
        TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
        TCP_SKB_CB(buff)->tcp_flags = flags;

        tcp_skb_fragment_eor(skb, buff);

        skb_split(skb, buff, len);
        /* A pending tx timestamp request may move to the new buffer. */
        tcp_fragment_tstamp(skb, buff);

        /* Fix up tso_factor for both original and new SKB.  */
        tcp_set_skb_tso_segs(skb, mss_now);
        tcp_set_skb_tso_segs(buff, mss_now);

        /* Link BUFF into the send queue. */
        __skb_header_release(buff);
        tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);

        return 0;
}

/* Try to defer sending, if possible, in order to minimize the amount
 * of TSO splitting we do.  View it as a kind of TSO Nagle test.
 *
 * This algorithm is from John Heffner.
 *
 * @sk:              socket being transmitted on
 * @skb:             candidate skb; must hold more than one segment
 * @is_cwnd_limited: set to true when deferring because cwnd is the limit
 * @is_rwnd_limited: set to true when deferring because the send window is
 *                   the limit
 * @max_segs:        segment count of a full-sized TSO skb
 *
 * Returns true if sending should be deferred, false to send now.
 */
static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
                                 bool *is_cwnd_limited,
                                 bool *is_rwnd_limited,
                                 u32 max_segs)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        u32 send_win, cong_win, limit, in_flight;
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *head;
        int win_divisor;
        s64 delta;

        /* Never defer in Recovery or any higher congestion state. */
        if (icsk->icsk_ca_state >= TCP_CA_Recovery)
                goto send_now;

        /* Avoid bursty behavior by allowing defer
         * only if the last write was recent (1 ms).
         * Note that tp->tcp_wstamp_ns can be in the future if we have
         * packets waiting in a qdisc or device for EDT delivery.
         */
        delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
        if (delta > 0)
                goto send_now;

        in_flight = tcp_packets_in_flight(tp);

        BUG_ON(tcp_skb_pcount(skb) <= 1);
        BUG_ON(tcp_snd_cwnd(tp) <= in_flight);

        /* Bytes left in the send window, counted from the start of skb. */
        send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;

        /* From in_flight test above, we know that cwnd > in_flight.  */
        cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache;

        limit = min(send_win, cong_win);

        /* If a full-sized TSO skb can be sent, do it. */
        if (limit >= max_segs * tp->mss_cache)
                goto send_now;

        /* Middle in queue won't get any more data, full sendable already? */
        if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
                goto send_now;

        win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
        if (win_divisor) {
                u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache);

                /* If at least some fraction of a window is available,
                 * just use it.
                 */
                chunk /= win_divisor;
                if (limit >= chunk)
                        goto send_now;
        } else {
                /* Different approach, try not to defer past a single
                 * ACK.  Receiver should ACK every other full sized
                 * frame, so if we have space for more than 3 frames
                 * then send now.
                 */
                if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
                        goto send_now;
        }

        /* TODO : use tsorted_sent_queue ? */
        head = tcp_rtx_queue_head(sk);
        /* Empty rtx queue: there is no head skb to measure against. */
        if (!head)
                goto send_now;
        delta = tp->tcp_clock_cache - head->tstamp;
        /* If next ACK is likely to come too late (half srtt), do not defer */
        if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
                goto send_now;

        /* Ok, it looks like it is advisable to defer.
         * Three cases are tracked :
         * 1) We are cwnd-limited
         * 2) We are rwnd-limited
         * 3) We are application limited.
         */
        if (cong_win < send_win) {
                if (cong_win <= skb->len) {
                        *is_cwnd_limited = true;
                        return true;
                }
        } else {
                if (send_win <= skb->len) {
                        *is_rwnd_limited = true;
                        return true;
                }
        }

        /* If this packet won't get more data, do not wait. */
        if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
            TCP_SKB_CB(skb)->eor)
                goto send_now;

        /* Application limited: defer without setting either flag. */
        return true;

send_now:
        return false;
}

static inline void tcp_mtu_check_reprobe(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        u32 interval;
        s32 delta;

        interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval);
        delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
        if (unlikely(delta >= interval * HZ)) {
                int mss = tcp_current_mss(sk);

                /* Update current search range */
                icsk->icsk_mtup.probe_size = 0;
                icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
                        sizeof(struct tcphdr) +
                        icsk->icsk_af_ops->net_header_len;
                icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);

                /* Update probe time stamp */
                icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
        }
}

/* Check whether the first @len bytes at the send head may be merged into
 * a single skb.
 *
 * Walks the write queue from the send head.  Every skb that must be
 * consumed entirely (i.e. @len still exceeds its length) has to be free of
 * an EOR mark and of a TX timestamp request, and must pass
 * skb_pure_zcopy_same() against its successor.
 *
 * Returns true if no such obstacle was met, false otherwise.
 */
static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
{
        struct sk_buff *cur = tcp_send_head(sk);
        struct sk_buff *nxt;

        tcp_for_write_queue_from_safe(cur, nxt, sk) {
                /* The remaining bytes all fit within this skb: done. */
                if (len <= cur->len)
                        return true;

                if (unlikely(TCP_SKB_CB(cur)->eor))
                        return false;
                if (tcp_has_tx_tstamp(cur))
                        return false;
                if (!skb_pure_zcopy_same(cur, nxt))
                        return false;

                len -= cur->len;
        }

        return true;
}

/* Fill the frag array of @to with @probe_size bytes of payload taken from
 * the page frags of the skbs in sk->sk_write_queue.
 *
 * @sk:         socket whose write queue supplies the payload
 * @to:         destination skb; its frags, nr_frags, len, data_len and
 *              truesize are updated on success
 * @probe_size: number of payload bytes to gather
 *
 * Frag descriptors are duplicated with skb_frag_page_copy() /
 * skb_frag_off_copy(); skb_frag_ref() is called on each resulting frag of
 * @to only once the whole payload has been assembled.
 *
 * Returns 0 on success, -ENOMEM if sk_wmem_schedule() fails, -EINVAL if a
 * queued skb has a non-empty linear part, or -E2BIG if MAX_SKB_FRAGS
 * destination frags are not enough.
 */
static int tcp_clone_payload(struct sock *sk, struct sk_buff *to,
                             int probe_size)
{
        skb_frag_t *lastfrag = NULL, *fragto = skb_shinfo(to)->frags;
        int i, todo, len = 0, nr_frags = 0;
        const struct sk_buff *skb;

        if (!sk_wmem_schedule(sk, to->truesize + probe_size))
                return -ENOMEM;

        skb_queue_walk(&sk->sk_write_queue, skb) {
                const skb_frag_t *fragfrom = skb_shinfo(skb)->frags;

                /* Only skbs whose payload is entirely in frags are handled. */
                if (skb_headlen(skb))
                        return -EINVAL;

                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, fragfrom++) {
                        if (len >= probe_size)
                                goto commit;
                        /* Take the whole frag, or just what is still missing. */
                        todo = min_t(int, skb_frag_size(fragfrom),
                                     probe_size - len);
                        len += todo;
                        /* Source frag begins exactly where the previous
                         * destination frag ends, on the same page: grow
                         * that frag instead of consuming a new slot.
                         */
                        if (lastfrag &&
                            skb_frag_page(fragfrom) == skb_frag_page(lastfrag) &&
                            skb_frag_off(fragfrom) == skb_frag_off(lastfrag) +
                                                      skb_frag_size(lastfrag)) {
                                skb_frag_size_add(lastfrag, todo);
                                continue;
                        }
                        if (unlikely(nr_frags == MAX_SKB_FRAGS))
                                return -E2BIG;
                        skb_frag_page_copy(fragto, fragfrom);
                        skb_frag_off_copy(fragto, fragfrom);
                        skb_frag_size_set(fragto, todo);
                        nr_frags++;
                        lastfrag = fragto++;
                }
        }
commit:
        /* Callers are expected to have checked that enough data is queued. */
        WARN_ON_ONCE(len != probe_size);
        for (i = 0; i < nr_frags; i++)
                skb_frag_ref(to, i);

        skb_shinfo(to)->nr_frags = nr_frags;
        to->truesize += probe_size;
        to->len += probe_size;
        to->data_len += probe_size;
        __skb_header_release(to);
        return 0;
}

/* Absorb @src into @dst once all of @src's payload has been moved over.
 *
 * Used by tcp_mtu_probe() and tcp_grow_skb().  The TCP flags of @src are
 * OR-ed into @dst, @dst inherits the EOR mark and any TX timestamp request
 * of @src, and @src is then unlinked from the write queue and freed.
 */
static void tcp_eat_one_skb(struct sock *sk,
                            struct sk_buff *dst,
                            struct sk_buff *src)
{
        /* Carry over per-skb metadata before @src goes away. */
        tcp_skb_collapse_tstamp(dst, src);
        TCP_SKB_CB(dst)->eor = TCP_SKB_CB(src)->eor;
        TCP_SKB_CB(dst)->tcp_flags |= TCP_SKB_CB(src)->tcp_flags;

        /* Drop @src from the write queue and release it. */
        tcp_unlink_write_queue(src, sk);
        tcp_wmem_free_skb(sk, src);
}

/* Create a new MTU probe if we are ready.
 * MTU probe is regularly attempting to increase the path MTU by
 * deliberately sending larger packets.  This discovers routing
 * changes resulting in larger path MTUs.
 *
 * The probe is a new skb placed in front of the send head whose payload
 * is taken from the skbs at the head of the write queue; those skbs are
 * trimmed or removed accordingly.
 *
 * Returns 0 if we should wait to probe (the probe would not fit in the
 *           send window right now, or cwnd is full with packets in flight),
 *         1 if a probe was sent,
 *         -1 otherwise
 */
static int tcp_mtu_probe(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb, *nskb, *next;
        struct net *net = sock_net(sk);
        int probe_size;
        int size_needed;
        int copy, len;
        int mss_now;
        int interval;

        /* Not currently probing/verifying,
         * not in recovery,
         * have enough cwnd, and
         * not SACKing (the variable headers throw things off)
         */
        if (likely(!icsk->icsk_mtup.enabled ||
                   icsk->icsk_mtup.probe_size ||
                   inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
                   tcp_snd_cwnd(tp) < 11 ||
                   tp->rx_opt.num_sacks || tp->rx_opt.dsack))
                return -1;

        /* Use binary search for probe_size between tcp_mss_base,
         * and current mss_clamp. if (search_high - search_low)
         * smaller than a threshold, backoff from probing.
         */
        mss_now = tcp_current_mss(sk);
        probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
                                    icsk->icsk_mtup.search_low) >> 1);
        /* Besides the probe itself, require (reordering + 1) more segments
         * of mss_cache bytes to be available.
         */
        size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
        interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
        /* When misfortune happens, we are reprobing actively,
         * and then reprobe timer has expired. We stick with current
         * probing process by not resetting search range to its original.
         */
        if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
            interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) {
                /* Check whether enough time has elapsed for
                 * another round of probing.
                 */
                tcp_mtu_check_reprobe(sk);
                return -1;
        }

        /* Have enough data in the send queue to probe? */
        if (tp->write_seq - tp->snd_nxt < size_needed)
                return -1;

        /* The peer's window can never hold size_needed: give up.  If it
         * merely does not fit right now, wait.
         */
        if (tp->snd_wnd < size_needed)
                return -1;
        if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
                return 0;

        /* Do we need to wait to drain cwnd? With none in flight, don't stall */
        if (tcp_packets_in_flight(tp) + 2 > tcp_snd_cwnd(tp)) {
                if (!tcp_packets_in_flight(tp))
                        return -1;
                else
                        return 0;
        }

        if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
                return -1;

        /* We're allowed to probe.  Build it now. */
        nskb = tcp_stream_alloc_skb(sk, GFP_ATOMIC, false);
        if (!nskb)
                return -1;

        /* build the payload, and be prepared to abort if this fails. */
        if (tcp_clone_payload(sk, nskb, probe_size)) {
                tcp_skb_tsorted_anchor_cleanup(nskb);
                consume_skb(nskb);
                return -1;
        }
        sk_wmem_queued_add(sk, nskb->truesize);
        sk_mem_charge(sk, nskb->truesize);

        skb = tcp_send_head(sk);
        skb_copy_decrypted(nskb, skb);
        mptcp_skb_ext_copy(nskb, skb);

        /* The probe covers probe_size bytes starting at the send head. */
        TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
        TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
        TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;

        tcp_insert_write_queue_before(nskb, skb, sk);
        tcp_highest_sack_replace(sk, skb, nskb);

        /* Remove from the following skbs the probe_size bytes that now
         * live in nskb: fully covered skbs are eaten, the last one is
         * trimmed at its head.
         */
        len = 0;
        tcp_for_write_queue_from_safe(skb, next, sk) {
                copy = min_t(int, skb->len, probe_size - len);

                if (skb->len <= copy) {
                        tcp_eat_one_skb(sk, nskb, skb);
                } else {
                        /* Partially consumed skb keeps its own FIN/PSH. */
                        TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
                                                   ~(TCPHDR_FIN|TCPHDR_PSH);
                        __pskb_trim_head(skb, copy);
                        tcp_set_skb_tso_segs(skb, mss_now);
                        TCP_SKB_CB(skb)->seq += copy;
                }

                len += copy;

                if (len >= probe_size)
                        break;
        }
        tcp_init_tso_segs(nskb, nskb->len);

        /* We're ready to send.  If this fails, the probe will
         * be resegmented into mss-sized pieces by tcp_write_xmit().
         */
        if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
                /* Decrement cwnd here because we are sending
                 * effectively two packets. */
                tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1);
                tcp_event_new_data_sent(sk, nskb);

                /* Remember what is being probed so it can be matched later. */
                icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
                tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
                tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;

                return 1;
        }

        return -1;
}

/* Decide whether transmission must be postponed because of internal pacing.
 *
 * Returns false when the socket does not need internal pacing or when
 * tcp_wstamp_ns is not ahead of the cached clock.  Otherwise returns true,
 * after arming the pacing hrtimer for tcp_wstamp_ns (and taking a socket
 * reference) unless the timer is already queued.
 */
static bool tcp_pacing_check(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (!tcp_needs_internal_pacing(sk) ||
            tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
                return false;

        /* A queued timer already covers this wait. */
        if (hrtimer_is_queued(&tp->pacing_timer))
                return true;

        hrtimer_start(&tp->pacing_timer, ns_to_ktime(tp->tcp_wstamp_ns),
                      HRTIMER_MODE_ABS_PINNED_SOFT);
        sock_hold(sk);
        return true;
}

/* Return true if the retransmit rb-tree holds no skb, or exactly one
 * (a root node without any child).
 */
static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk)
{
        const struct rb_node *root = sk->tcp_rtx_queue.rb_node;

        return !root || (!root->rb_left && !root->rb_right);
}

/* TCP Small Queues :
 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
 * (These limits are doubled for retransmits)
 * This allows for :
 *  - better RTT estimation and ACK scheduling
 *  - faster recovery
 *  - high rates
 * Alas, some drivers / subsystems require a fair amount
 * of queued bytes to ensure line rate.
 * One example is wifi aggregation (802.11 AMPDU)
 *
 * @sk:     socket being checked
 * @skb:    skb about to be sent; its truesize sets the minimum limit
 * @factor: the computed limit is shifted left by this amount
 *
 * Returns true if the caller should not send now: sk_wmem_alloc exceeds
 * the limit, the rtx queue holds more than one skb, and TSQ_THROTTLED has
 * been set.  Returns false otherwise.
 */
static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
                                  unsigned int factor)
{
        unsigned long limit;

        /* At least two skbs worth of truesize, or the pacing rate scaled
         * down by sk_pacing_shift, whichever is larger.
         */
        limit = max_t(unsigned long,
                      2 * skb->truesize,
                      READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
        /* The sysctl cap only applies when pacing status is SK_PACING_NONE. */
        if (sk->sk_pacing_status == SK_PACING_NONE)
                limit = min_t(unsigned long, limit,
                              READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
        limit <<= factor;

        if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
            tcp_sk(sk)->tcp_tx_delay) {
                u64 extra_bytes = (u64)READ_ONCE(sk->sk_pacing_rate) *
                                  tcp_sk(sk)->tcp_tx_delay;

                /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
                 * approximate our needs assuming an ~100% skb->truesize overhead.
                 * USEC_PER_SEC is approximated by 2^20.
                 * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
                 */
                extra_bytes >>= (20 - 1);
                limit += extra_bytes;
        }
        if (refcount_read(&sk->sk_wmem_alloc) > limit) {
                /* Always send skb if rtx queue is empty or has one skb.
                 * No need to wait for TX completion to call us back,
                 * after softirq/tasklet schedule.
                 * This helps when TX completions are delayed too much.
                 */
                if (tcp_rtx_queue_empty_or_single_skb(sk))
                        return false;

                set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
                /* It is possible TX completion already happened
                 * before we set TSQ_THROTTLED, so we must
                 * test again the condition.
                 */
                smp_mb__after_atomic();
                if (refcount_read(&sk->sk_wmem_alloc) > limit)
                        return true;
        }
        return false;
}

static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
{
        const u32 now = tcp_jiffies32;
        enum tcp_chrono old = tp->chrono_type;

        if (old > TCP_CHRONO_UNSPEC)
                tp->chrono_stat[old - 1] += now - tp->chrono_start;
        tp->chrono_start = now;
        tp->chrono_type = new;
}

/* Start tracking chronograph @type on @sk.
 *
 * Several conditions may deserve tracking at once; the one with the
 * highest enum value wins.  A request only takes effect when @type
 * outranks the chronograph currently running, in which case the running
 * one is stopped and @type is started.
 */
void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Something at least as interesting is already being tracked. */
        if (type <= tp->chrono_type)
                return;

        tcp_chrono_set(tp, type);
}

/* Stop tracking chronograph @type on @sk.
 *
 * With both the rtx and write queues empty, tracking is switched off
 * entirely (TCP_CHRONO_UNSPEC).  Otherwise, the stop only matters if @type
 * is the chronograph currently running (see tcp_chrono_start() for the
 * priority rule); data is still pending, so tracking falls back to
 * TCP_CHRONO_BUSY.
 */
void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (tcp_rtx_and_write_queues_empty(sk)) {
                tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
                return;
        }

        if (type == tp->chrono_type)
                tcp_chrono_set(tp, TCP_CHRONO_BUSY);
}

/* The skb at the front of the write queue is shorter than the ideal packet
 * size: try to pull up to @amount bytes of payload into it from the skb
 * that follows.  If the following skb ends up empty, it is absorbed into
 * @skb.
 */
static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount)
{
        struct sk_buff *donor = skb->next;
        unsigned int moved;

        /* Nothing follows, or the two skbs must stay separate. */
        if (tcp_skb_is_last(sk, skb) || !tcp_skb_can_collapse(skb, donor))
                return;

        moved = min_t(u32, amount, donor->len);
        if (!moved)
                return;
        if (!skb_shift(skb, donor, moved))
                return;

        /* The boundary between the two skbs advanced by @moved bytes. */
        TCP_SKB_CB(donor)->seq += moved;
        TCP_SKB_CB(skb)->end_seq += moved;

        if (donor->len)
                return;

        /* In case FIN is set, we need to update end_seq */
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(donor)->end_seq;

        tcp_eat_one_skb(sk, skb, donor);
}

/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * LARGESEND note: !tcp_urg_mode is overkill, only frames between
 * snd_up-64k-mss .. snd_up cannot be large. However, taking into
 * account rare use of URG, this is not a big flaw.
 *
 * Send at most one packet when push_one > 0. Temporarily ignore
 * cwnd limit to force at most one packet out when push_one == 2.
 *
 * MTU probing is attempted first, but only when push_one == 0; if
 * tcp_mtu_probe() asks to wait (returns 0), nothing is sent and false
 * is returned.
 *
 * Returns true, if no segments are in flight and we have queued segments,
 * but cannot send anything now because of SWS or another problem.
 */
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                           int push_one, gfp_t gfp)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        unsigned int tso_segs, sent_pkts;
        u32 cwnd_quota, max_segs;
        int result;
        bool is_cwnd_limited = false, is_rwnd_limited = false;

        sent_pkts = 0;

        tcp_mstamp_refresh(tp);
        if (!push_one) {
                /* Do MTU probing. */
                result = tcp_mtu_probe(sk);
                if (!result) {
                        return false;
                } else if (result > 0) {
                        sent_pkts = 1;
                }
        }

        max_segs = tcp_tso_segs(sk, mss_now);
        while ((skb = tcp_send_head(sk))) {
                unsigned int limit;
                int missing_bytes;

                if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
                        /* "skb_mstamp_ns" is used as a start point for the retransmit timer */
                        tp->tcp_wstamp_ns = tp->tcp_clock_cache;
                        skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC);
                        list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
                        tcp_init_tso_segs(skb, mss_now);
                        goto repair; /* Skip network transmission */
                }

                if (tcp_pacing_check(sk))
                        break;

                cwnd_quota = tcp_cwnd_test(tp);
                if (!cwnd_quota) {
                        if (push_one == 2)
                                /* Force out a loss probe pkt. */
                                cwnd_quota = 1;
                        else
                                break;
                }
                cwnd_quota = min(cwnd_quota, max_segs);
                /* If the head skb is shorter than what the quota allows,
                 * try to top it up from the next skb in the queue.
                 */
                missing_bytes = cwnd_quota * mss_now - skb->len;
                if (missing_bytes > 0)
                        tcp_grow_skb(sk, skb, missing_bytes);

                tso_segs = tcp_set_skb_tso_segs(skb, mss_now);

                if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
                        is_rwnd_limited = true;
                        break;
                }

                /* Single-segment skbs go through the Nagle test (forced to
                 * TCP_NAGLE_PUSH unless this is the last queued skb);
                 * multi-segment skbs may be deferred instead.
                 */
                if (tso_segs == 1) {
                        if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
                                                     (tcp_skb_is_last(sk, skb) ?
                                                      nonagle : TCP_NAGLE_PUSH))))
                                break;
                } else {
                        if (!push_one &&
                            tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
                                                 &is_rwnd_limited, max_segs))
                                break;
                }

                limit = mss_now;
                if (tso_segs > 1 && !tcp_urg_mode(tp))
                        limit = tcp_mss_split_point(sk, skb, mss_now,
                                                    cwnd_quota,
                                                    nonagle);

                /* Split the skb if it exceeds what may be sent now. */
                if (skb->len > limit &&
                    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
                        break;

                if (tcp_small_queue_check(sk, skb, 0))
                        break;

                /* Argh, we hit an empty skb(), presumably a thread
                 * is sleeping in sendmsg()/sk_stream_wait_memory().
                 * We do not want to send a pure-ack packet and have
                 * a strange looking rtx queue with empty packet(s).
                 */
                if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
                        break;

                if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
                        break;

repair:
                /* Advance the send_head.  This one is sent out.
                 * This call will increment packets_out.
                 */
                tcp_event_new_data_sent(sk, skb);

                tcp_minshall_update(tp, mss_now, skb);
                sent_pkts += tcp_skb_pcount(skb);

                if (push_one)
                        break;
        }

        if (is_rwnd_limited)
                tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
        else
                tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);

        /* Also count as cwnd-limited when in-flight packets fill cwnd. */
        is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp));
        if (likely(sent_pkts || is_cwnd_limited))
                tcp_cwnd_validate(sk, is_cwnd_limited);

        if (likely(sent_pkts)) {
                if (tcp_in_cwnd_reduction(sk))
                        tp->prr_out += sent_pkts;

                /* Send one loss probe per tail loss episode. */
                if (push_one != 2)
                        tcp_schedule_loss_probe(sk, false);
                return false;
        }
        return !tp->packets_out && !tcp_write_queue_empty(sk);
}

/* Arm the loss probe timer (ICSK_TIME_LOSS_PROBE) if conditions allow.
 *
 * @sk:            socket to arm the timer on
 * @advancing_rto: if true, bound the probe timeout by the full icsk_rto;
 *                 otherwise bound it by tcp_rto_delta_us()
 *
 * Returns true if the timer was (re)armed, false if no probe is scheduled.
 */
bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        u32 timeout, timeout_us, rto_delta_us;
        int early_retrans;

        /* Don't do any loss probe on a Fast Open connection before 3WHS
         * finishes.
         */
        if (rcu_access_pointer(tp->fastopen_rsk))
                return false;

        /* Schedule a loss probe in 2*RTT for SACK capable connections
         * not in loss recovery, that are either limited by cwnd or application.
         */
        early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans);
        if (early_retrans != 3 && early_retrans != 4)
                return false;
        if (!tp->packets_out || !tcp_is_sack(tp))
                return false;
        if (icsk->icsk_ca_state != TCP_CA_Open &&
            icsk->icsk_ca_state != TCP_CA_CWR)
                return false;

        /* Probe timeout is 2*rtt. Add minimum RTO to account
         * for delayed ack when there's one outstanding packet. If no RTT
         * sample is available then probe after TCP_TIMEOUT_INIT.
         */
        timeout = TCP_TIMEOUT_INIT;
        if (tp->srtt_us) {
                timeout_us = tp->srtt_us >> 2;
                if (tp->packets_out == 1)
                        timeout_us += tcp_rto_min_us(sk);
                else
                        timeout_us += TCP_TIMEOUT_MIN_US;
                timeout = usecs_to_jiffies(timeout_us);
        }

        /* If the RTO formula yields an earlier time, then use that time. */
        if (advancing_rto)
                rto_delta_us = jiffies_to_usecs(icsk->icsk_rto);
        else
                rto_delta_us = tcp_rto_delta_us(sk);  /* How far in future is RTO? */
        if (rto_delta_us > 0)
                timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));

        tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX);
        return true;
}

/* Tell whether an earlier transmit of @skb is still sitting in a qdisc or
 * driver queue, which skb fast clones let us detect.  Retransmitting in
 * that situation would be pointless.
 *
 * When the clone looks busy, TSQ_THROTTLED is set and the check is
 * repeated after a memory barrier; only if it is still busy is the
 * spurious-retransmit counter bumped and true returned.
 */
static bool skb_still_in_host_queue(struct sock *sk,
                                    const struct sk_buff *skb)
{
        bool busy = false;

        if (unlikely(skb_fclone_busy(sk, skb))) {
                set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
                smp_mb__after_atomic();

                /* Look again now that the throttled flag is visible. */
                busy = skb_fclone_busy(sk, skb);
                if (busy)
                        NET_INC_STATS(sock_net(sk),
                                      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
        }

        return busy;
}

/* When probe timeout (PTO) fires, try send a new segment if possible, else
 * retransmit the last segment.
 *
 * Nothing is sent while a previous probe is outstanding (tlp_high_seq
 * set).  On every path except the empty-rtx-queue case the RTO timer is
 * re-armed through tcp_rearm_rto() before returning.
 */
void tcp_send_loss_probe(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        int pcount;
        int mss = tcp_current_mss(sk);

        /* At most one outstanding TLP */
        if (tp->tlp_high_seq)
                goto rearm_timer;

        tp->tlp_retrans = 0;
        /* First choice: push out new data, if the send window permits. */
        skb = tcp_send_head(sk);
        if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
                pcount = tp->packets_out;
                tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
                /* packets_out grew, so something was actually sent. */
                if (tp->packets_out > pcount)
                        goto probe_sent;
                goto rearm_timer;
        }
        /* Otherwise retransmit the tail of the rtx queue. */
        skb = skb_rb_last(&sk->tcp_rtx_queue);
        if (unlikely(!skb)) {
                WARN_ONCE(tp->packets_out,
                          "invalid inflight: %u state %u cwnd %u mss %d\n",
                          tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss);
                inet_csk(sk)->icsk_pending = 0;
                return;
        }

        if (skb_still_in_host_queue(sk, skb))
                goto rearm_timer;

        pcount = tcp_skb_pcount(skb);
        if (WARN_ON(!pcount))
                goto rearm_timer;

        /* For a multi-segment skb, split off everything past the first
         * (pcount - 1) segments and probe with that trailing piece.
         */
        if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
                if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
                                          (pcount - 1) * mss, mss,
                                          GFP_ATOMIC)))
                        goto rearm_timer;
                skb = skb_rb_next(skb);
        }

        if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
                goto rearm_timer;

        if (__tcp_retransmit_skb(sk, skb, 1))
                goto rearm_timer;

        /* The probe was a retransmission rather than new data. */
        tp->tlp_retrans = 1;

probe_sent:
        /* Record snd_nxt for loss detection. */
        tp->tlp_high_seq = tp->snd_nxt;

        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
        /* Reset s.t. tcp_rearm_rto will restart timer from now */
        inet_csk(sk)->icsk_pending = 0;
rearm_timer:
        tcp_rearm_rto(sk);
}

/* Flush frames that were held back, either by TCP_CORK or while trying to
 * coalesce tiny packets.
 * The caller must hold the socket lock.
 *
 * If tcp_write_xmit() reports that queued data could not be sent while
 * nothing is in flight, the probe timer is checked.
 */
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
                               int nonagle)
{
        /* On a closed socket the bytes simply stay queued; closedown will
         * eventually finish and empty the write queue.
         */
        if (unlikely(sk->sk_state == TCP_CLOSE))
                return;

        if (!tcp_write_xmit(sk, cur_mss, nonagle, 0,
                            sk_gfp_mask(sk, GFP_ATOMIC)))
                return;

        tcp_check_probe_timer(sk);
}

/* Transmit the _single_ skb found at the send head.
 *
 * The return value of tcp_write_xmit() is ignored here, so a real push of
 * pending frames is still required to set up the probe timer etc.
 * It is a bug to call this with no send head, or with a send head shorter
 * than @mss_now.
 */
void tcp_push_one(struct sock *sk, unsigned int mss_now)
{
        struct sk_buff *head = tcp_send_head(sk);

        BUG_ON(!head || head->len < mss_now);

        /* push_one == 1: send at most this one skb. */
        tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
}

/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria. The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *        If the free space is less than the 1/4 of the maximum
 *        space available and the free space is less than 1/2 mss,
 *        then set the window to 0.
 *        [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 *        Otherwise, just prevent the window from shrinking
 *        and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue. It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        /* MSS for the peer's data.  Previous versions used mss_clamp
         * here.  I don't know if the value based on our guesses
         * of peer's MSS is better for the performance.  It's more correct
         * but may be worse for the performance because of rcv_mss
         * fluctuations.  --SAW  1998/11/1
         */
        int mss = icsk->icsk_ack.rcv_mss;
        int free_space = tcp_space(sk);
        int allowed_space = tcp_full_space(sk);
        int full_space, window;

        /* MPTCP sockets get both space values from mptcp_space() instead. */
        if (sk_is_mptcp(sk))
                mptcp_space(sk, &free_space, &allowed_space);

        full_space = min_t(int, tp->window_clamp, allowed_space);

        /* The mss estimate may not exceed the largest window we could
         * ever offer; with no room at all, advertise a zero window.
         */
        if (unlikely(mss > full_space)) {
                mss = full_space;
                if (mss <= 0)
                        return 0;
        }

        /* Only allow window shrink if the sysctl is enabled and we have
         * a non-zero scaling factor in effect.
         */
        if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale)
                goto shrink_window_allowed;

        /* do not allow window to shrink */

        if (free_space < (full_space >> 1)) {
                icsk->icsk_ack.quick = 0;

                if (tcp_under_memory_pressure(sk))
                        tcp_adjust_rcv_ssthresh(sk);

                /* free_space might become our new window, make sure we don't
                 * increase it due to wscale.
                 */
                free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);

                /* if free space is less than mss estimate, or is below 1/16th
                 * of the maximum allowed, try to move to zero-window, else
                 * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
                 * new incoming data is dropped due to memory limits.
                 * With large window, mss test triggers way too late in order
                 * to announce zero window in time before rmem limit kicks in.
                 */
                if (free_space < (allowed_space >> 4) || free_space < mss)
                        return 0;
        }

        /* Never offer more than rcv_ssthresh. */
        if (free_space > tp->rcv_ssthresh)
                free_space = tp->rcv_ssthresh;

        /* Don't do rounding if we are using window scaling, since the
         * scaled window will not line up with the MSS boundary anyway.
         */
        if (tp->rx_opt.rcv_wscale) {
                window = free_space;

                /* Advertise enough space so that it won't get scaled away.
                 * Important case: prevent zero window announcement if
                 * 1<<rcv_wscale > mss.
                 */
                window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
        } else {
                window = tp->rcv_wnd;
                /* Get the largest window that is a nice multiple of mss.
                 * Window clamp already applied above.
                 * If our current window offering is within 1 mss of the
                 * free space we just keep it. This prevents the divide
                 * and multiply from happening most of the time.
                 * We also don't do any window rounding when the free space
                 * is too small.
                 */
                if (window <= free_space - mss || window > free_space)
                        window = rounddown(free_space, mss);
                else if (mss == full_space &&
                         free_space > window + (full_space >> 1))
                        window = free_space;
        }

        return window;

shrink_window_allowed:
        /* new window should always be an exact multiple of scaling factor */
        free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);

        if (free_space < (full_space >> 1)) {
                icsk->icsk_ack.quick = 0;

                if (tcp_under_memory_pressure(sk))
                        tcp_adjust_rcv_ssthresh(sk);

                /* if free space is too low, return a zero window */
                if (free_space < (allowed_space >> 4) || free_space < mss ||
                        free_space < (1 << tp->rx_opt.rcv_wscale))
                        return 0;
        }

        if (free_space > tp->rcv_ssthresh) {
                free_space = tp->rcv_ssthresh;
                /* new window should always be an exact multiple of scaling factor
                 *
                 * For this case, we ALIGN "up" (increase free_space) because
                 * we know free_space is not zero here, it has been reduced from
                 * the memory-based limit, and rcv_ssthresh is not a hard limit
                 * (unlike sk_rcvbuf).
                 */
                free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale));
        }

        return free_space;
}

/* Carry the TX timestamp request of @next_skb over to @skb, which is
 * absorbing it. Nothing is touched unless @next_skb has a TX timestamp
 * pending.
 */
void tcp_skb_collapse_tstamp(struct sk_buff *skb,
                             const struct sk_buff *next_skb)
{
        const struct skb_shared_info *from_shinfo;
        struct skb_shared_info *to_shinfo;

        if (likely(!tcp_has_tx_tstamp(next_skb)))
                return;

        from_shinfo = skb_shinfo(next_skb);
        to_shinfo = skb_shinfo(skb);

        /* Inherit the timestamp request bits together with their key. */
        to_shinfo->tx_flags |= from_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
        to_shinfo->tskey = from_shinfo->tskey;
        TCP_SKB_CB(skb)->txstamp_ack |= TCP_SKB_CB(next_skb)->txstamp_ack;
}

/* Collapses two adjacent SKB's during retransmission. */
static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *next_skb = skb_rb_next(skb);
        int next_skb_size;

        next_skb_size = next_skb->len;

        BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);

        if (next_skb_size && !tcp_skb_shift(skb, next_skb, 1, next_skb_size))
                return false;

        tcp_highest_sack_replace(sk, next_skb, skb);

        /* Update sequence range on original skb. */
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

        /* Merge over control information. This moves PSH/FIN etc. over */
        TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;

        /* All done, get rid of second SKB and account for it so
         * packet counting does not break.
         */
        TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
        TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;

        /* changed transmit queue under us so clear hints */
        tcp_clear_retrans_hints_partial(tp);
        if (next_skb == tp->retransmit_skb_hint)
                tp->retransmit_skb_hint = skb;

        tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));

        tcp_skb_collapse_tstamp(skb, next_skb);

        tcp_rtx_queue_unlink_and_free(next_skb, sk);
        return true;
}

/* An skb may be coalesced only if it holds at most one segment, is not
 * cloned and has not been SACKed.
 */
static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
{
        /* Some heuristics for collapsing over SACK'd could be invented */
        return tcp_skb_pcount(skb) <= 1 &&
               !skb_cloned(skb) &&
               !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
}

/* Collapse packets in the retransmit queue to make to create
 * less packets on the wire. This is only done on retransmission.
 */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
                                     int space)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb = to, *tmp;
        bool first = true;

        if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse))
                return;
        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
                return;

        skb_rbtree_walk_from_safe(skb, tmp) {
                if (!tcp_can_collapse(sk, skb))
                        break;

                if (!tcp_skb_can_collapse(to, skb))
                        break;

                space -= skb->len;

                if (first) {
                        first = false;
                        continue;
                }

                if (space < 0)
                        break;

                if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
                        break;

                if (!tcp_collapse_retrans(sk, to))
                        break;
        }
}

/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 *
 * @sk:   socket owning the retransmit queue
 * @skb:  skb to retransmit; it may be trimmed, fragmented or merged with
 *        its successors before being sent
 * @segs: upper bound on the amount to send, in units of the current MSS
 *
 * Error returns produced here: -EBUSY (skb still in a host queue),
 * -EINVAL (skb lies entirely before snd_una), -ENOMEM (trim, fragment or
 * unclone failed), -EHOSTUNREACH (header rebuild failed), -EAGAIN (skb is
 * outside the shrunk window), -ENOBUFS (copy for realignment failed);
 * otherwise whatever tcp_transmit_skb() returns.
 */
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned int cur_mss;
        int diff, len, err;
        int avail_wnd;

        /* Inconclusive MTU probe */
        if (icsk->icsk_mtup.probe_size)
                icsk->icsk_mtup.probe_size = 0;

        if (skb_still_in_host_queue(sk, skb))
                return -EBUSY;

start:
        /* The skb starts before snd_una: drop the part that precedes it. */
        if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
                if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
                        /* Strip the SYN and the sequence number it occupied,
                         * then re-evaluate from the top.
                         */
                        TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
                        TCP_SKB_CB(skb)->seq++;
                        goto start;
                }
                /* Whole skb is before snd_una: not expected, warn once. */
                if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
                        WARN_ON_ONCE(1);
                        return -EINVAL;
                }
                if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
                        return -ENOMEM;
        }

        if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
                return -EHOSTUNREACH; /* Routing failure or similar. */

        cur_mss = tcp_current_mss(sk);
        avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;

        /* If receiver has shrunk his window, and skb is out of
         * new window, do not retransmit it. The exception is the
         * case, when window is shrunk to zero. In this case
         * our retransmit of one segment serves as a zero window probe.
         */
        if (avail_wnd <= 0) {
                if (TCP_SKB_CB(skb)->seq != tp->snd_una)
                        return -EAGAIN;
                avail_wnd = cur_mss;
        }

        /* Send at most @segs segments, limited to a whole number of MSS
         * that fits in the window; if less than one MSS fits, send just
         * what the window allows.
         */
        len = cur_mss * segs;
        if (len > avail_wnd) {
                len = rounddown(avail_wnd, cur_mss);
                if (!len)
                        len = avail_wnd;
        }
        if (skb->len > len) {
                /* Too big: split so that @skb keeps only the first @len bytes. */
                if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
                                 cur_mss, GFP_ATOMIC))
                        return -ENOMEM; /* We'll try again later. */
        } else {
                if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
                        return -ENOMEM;

                /* Recompute the segment count for the current MSS and
                 * propagate any change to the packet counters.
                 */
                diff = tcp_skb_pcount(skb);
                tcp_set_skb_tso_segs(skb, cur_mss);
                diff -= tcp_skb_pcount(skb);
                if (diff)
                        tcp_adjust_pcount(sk, skb, diff);
                /* If the skb is shorter than min(window, MSS), try to fill
                 * the remainder by merging in the skbs that follow it.
                 */
                avail_wnd = min_t(int, avail_wnd, cur_mss);
                if (skb->len < avail_wnd)
                        tcp_retrans_try_collapse(sk, skb, avail_wnd);
        }

        /* RFC3168, section 6.1.1.1. ECN fallback */
        if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
                tcp_ecn_clear_syn(sk, skb);

        /* Update global and local TCP statistics. */
        segs = tcp_skb_pcount(skb);
        TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
        tp->total_retrans += segs;
        tp->bytes_retrans += skb->len;

        /* make sure skb->data is aligned on arches that require it
         * and check if ack-trimming & collapsing extended the headroom
         * beyond what csum_start can cover.
         */
        if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
                     skb_headroom(skb) >= 0xFFFF)) {
                struct sk_buff *nskb;

                /* Transmit a fresh copy instead of the queued skb itself. */
                tcp_skb_tsorted_save(skb) {
                        nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
                        if (nskb) {
                                nskb->dev = NULL;
                                err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
                        } else {
                                err = -ENOBUFS;
                        }
                } tcp_skb_tsorted_restore(skb);

                /* The copy went out: do the post-send bookkeeping on the
                 * original skb, which stays in the retransmit queue.
                 */
                if (!err) {
                        tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
                        tcp_rate_skb_sent(sk, skb);
                }
        } else {
                err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
        }

        /* Report the retransmit (and its outcome) to BPF if requested. */
        if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
                tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
                                  TCP_SKB_CB(skb)->seq, segs, err);

        /* -EBUSY is not counted as a retransmit failure. */
        if (likely(!err)) {
                trace_tcp_retransmit_skb(sk, skb);
        } else if (err != -EBUSY) {
                NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
        }

        /* To avoid taking spuriously low RTT samples based on a timestamp
         * for a transmit that never happened, always mark EVER_RETRANS
         */
        TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;

        return err;
}

/* Retransmit @skb through __tcp_retransmit_skb() and update the socket's
 * retransmit accounting. Returns the value __tcp_retransmit_skb() returned.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int ret;

        ret = __tcp_retransmit_skb(sk, skb, segs);
        if (!ret) {
#if FASTRETRANS_DEBUG > 0
                if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
                        net_dbg_ratelimited("retrans_out leaked\n");
#endif
                /* Sent: flag the skb and count it as an outstanding retransmit. */
                TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
                tp->retrans_out += tcp_skb_pcount(skb);
        }

        /* Save stamp of the first (attempted) retransmit. */
        if (!tp->retrans_stamp)
                tp->retrans_stamp = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb);

        /* A negative undo counter restarts from zero before being bumped. */
        if (tp->undo_retrans < 0)
                tp->undo_retrans = 0;
        tp->undo_retrans += tcp_skb_pcount(skb);

        return ret;
}

/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 *
 * The walk also stops early when pacing or the small-queue check says
 * to wait, when every lost packet has already been retransmitted, or
 * when a retransmit attempt fails.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct sk_buff *skb, *rtx_head, *hole = NULL;
        struct tcp_sock *tp = tcp_sk(sk);
        bool rearm_timer = false;
        u32 max_segs;
        int mib_idx;

        /* Nothing in flight, nothing to retransmit. */
        if (!tp->packets_out)
                return;

        /* Resume from the saved hint if there is one, else from the head. */
        rtx_head = tcp_rtx_queue_head(sk);
        skb = tp->retransmit_skb_hint ?: rtx_head;
        max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
        skb_rbtree_walk_from(skb) {
                __u8 sacked;
                int segs;

                if (tcp_pacing_check(sk))
                        break;

                /* we could do better than to assign each time */
                if (!hole)
                        tp->retransmit_skb_hint = skb;

                /* Congestion window room left, in packets. */
                segs = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp);
                if (segs <= 0)
                        break;
                sacked = TCP_SKB_CB(skb)->sacked;
                /* In case tcp_shift_skb_data() has aggregated large skbs,
                 * we need to make sure we do not send too big TSO packets
                 */
                segs = min_t(int, segs, max_segs);

                if (tp->retrans_out >= tp->lost_out) {
                        /* Every lost packet has already been retransmitted. */
                        break;
                } else if (!(sacked & TCPCB_LOST)) {
                        /* Not marked lost: skip it. Remember the first such
                         * skb that is neither retransmitted nor SACKed; once
                         * one is recorded the hint above stops advancing.
                         */
                        if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
                                hole = skb;
                        continue;

                } else {
                        /* Lost: pick the counter to bump if the send succeeds. */
                        if (icsk->icsk_ca_state != TCP_CA_Loss)
                                mib_idx = LINUX_MIB_TCPFASTRETRANS;
                        else
                                mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
                }

                /* Lost, but already SACKed or already retransmitted. */
                if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
                        continue;

                if (tcp_small_queue_check(sk, skb, 1))
                        break;

                if (tcp_retransmit_skb(sk, skb, segs))
                        break;

                NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));

                if (tcp_in_cwnd_reduction(sk))
                        tp->prr_out += tcp_skb_pcount(skb);

                /* The queue head was retransmitted: restart the RTO timer
                 * afterwards, unless a reordering timer is pending.
                 */
                if (skb == rtx_head &&
                    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
                        rearm_timer = true;

        }
        if (rearm_timer)
                tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                     inet_csk(sk)->icsk_rto,
                                     TCP_RTO_MAX);
}

/* Memory limits may be exceeded for FIN packets, to expedite connection
 * tear down and (memory) recovery. Otherwise tcp_send_fin() could be
 * tempted to either delay the FIN or even be forced to close the flow
 * without any FIN.
 * In general, we want to allow one skb per socket to avoid hangs
 * with edge trigger epoll()
 *
 * Forward-allocates, without any limit check, the whole pages needed so
 * that @size bytes are covered, and charges them to the socket's memcg
 * when there is one.
 */
void sk_forced_mem_schedule(struct sock *sk, int size)
{
        int shortfall = size - sk->sk_forward_alloc;
        int pages;

        /* The forward allocation already covers @size. */
        if (shortfall <= 0)
                return;

        pages = sk_mem_pages(shortfall);
        sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
        sk_memory_allocated_add(sk, pages);

        if (!mem_cgroup_sockets_enabled || !sk->sk_memcg)
                return;

        mem_cgroup_charge_skmem(sk->sk_memcg, pages,
                                gfp_memcg_charge() | __GFP_NOFAIL);
}

/* Send a FIN. The caller locks the socket for us.
 * We should try to send a FIN packet really hard, but eventually give up.
 */
void tcp_send_fin(struct sock *sk)
{
        struct sk_buff *tail = tcp_write_queue_tail(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *carrier = tail;
        struct sk_buff *fin;

        /* Optimization: piggyback the FIN on the last skb of the write
         * queue if there is one. Under memory pressure, fall back to the
         * last skb of the retransmit queue.
         * Note: in the latter case, the FIN will be sent after a timeout,
         * as the TCP stack thinks it has already been transmitted.
         */
        if (!carrier && tcp_under_memory_pressure(sk))
                carrier = skb_rb_last(&sk->tcp_rtx_queue);

        if (!carrier) {
                /* No skb to piggyback on: build a dedicated FIN skb. */
                fin = alloc_skb_fclone(MAX_TCP_HEADER,
                                       sk_gfp_mask(sk, GFP_ATOMIC |
                                                       __GFP_NOWARN));
                if (unlikely(!fin))
                        return;

                INIT_LIST_HEAD(&fin->tcp_tsorted_anchor);
                skb_reserve(fin, MAX_TCP_HEADER);
                sk_forced_mem_schedule(sk, fin->truesize);
                /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
                tcp_init_nondata_skb(fin, tp->write_seq,
                                     TCPHDR_ACK | TCPHDR_FIN);
                tcp_queue_skb(sk, fin);
        } else {
                TCP_SKB_CB(carrier)->tcp_flags |= TCPHDR_FIN;
                TCP_SKB_CB(carrier)->end_seq++;
                tp->write_seq++;
                if (!tail) {
                        /* The carrier came from the retransmit queue, so it
                         * was already sent. Pretend the FIN was part of that
                         * transmit: snd_nxt must take the value it would have
                         * had, because the retransmit path never changes it.
                         */
                        WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
                        return;
                }
        }

        __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}

/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by RFC 2525, section 2.17.  -DaveM
 *
 * @sk:       socket on which the RST is sent
 * @priority: allocation flags used for the RST skb and its transmission
 * @reason:   why the reset is being sent; reported through the
 *            tcp_send_reset tracepoint
 *
 * Allocation and transmit failures are only counted
 * (LINUX_MIB_TCPABORTFAILED); nothing is returned to the caller.
 */
void tcp_send_active_reset(struct sock *sk, gfp_t priority,
                           enum sk_rst_reason reason)
{
        struct sk_buff *skb;

        TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);

        /* NOTE: No TCP options attached and we never retransmit this. */
        skb = alloc_skb(MAX_TCP_HEADER, priority);
        if (!skb) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
                return;
        }

        /* Reserve space for headers and prepare control bits. */
        skb_reserve(skb, MAX_TCP_HEADER);
        tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
                             TCPHDR_ACK | TCPHDR_RST);
        tcp_mstamp_refresh(tcp_sk(sk));
        /* Send it off. */
        if (tcp_transmit_skb(sk, skb, 0, priority))
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);

        /* skb of trace_tcp_send_reset() keeps the skb that caused RST,
         * skb here is different to the troublesome skb, so use NULL.
         * Pass the caller's reason through instead of discarding it, so
         * the tracepoint shows why this active reset was sent.
         */
        trace_tcp_send_reset(sk, NULL, reason);
}

/* Send a crossed SYN-ACK during socket establishment.
 * WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called. If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 *
 * Returns -EFAULT if the head of the retransmit queue is not a SYN,
 * -ENOMEM if a private copy of a cloned SYN cannot be allocated,
 * otherwise the result of tcp_transmit_skb().
 */
int tcp_send_synack(struct sock *sk)
{
        struct sk_buff *skb;

        /* The SYN we sent earlier must be at the head of the rtx queue. */
        skb = tcp_rtx_queue_head(sk);
        if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
                pr_err("%s: wrong queue state\n", __func__);
                return -EFAULT;
        }
        /* Turn the queued SYN into a SYN-ACK, unless that was already done. */
        if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
                if (skb_cloned(skb)) {
                        struct sk_buff *nskb;

                        /* The queued skb is cloned: replace it in the rtx
                         * queue with a copy before modifying it.
                         */
                        tcp_skb_tsorted_save(skb) {
                                nskb = skb_copy(skb, GFP_ATOMIC);
                        } tcp_skb_tsorted_restore(skb);
                        if (!nskb)
                                return -ENOMEM;
                        INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
                        tcp_highest_sack_replace(sk, skb, nskb);
                        tcp_rtx_queue_unlink_and_free(skb, sk);
                        __skb_header_release(nskb);
                        tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
                        /* Charge the copy to the socket. */
                        sk_wmem_queued_add(sk, nskb->truesize);
                        sk_mem_charge(sk, nskb->truesize);
                        skb = nskb;
                }

                TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
                tcp_ecn_send_synack(sk, skb);
        }
        return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}

/**
 * tcp_make_synack - Allocate one skb and build a SYNACK packet.
 * @sk: listener socket
 * @dst: dst entry attached to the SYNACK. It is consumed and caller
 *       should not use it again.
 * @req: request_sock pointer
 * @foc: cookie for tcp fast open
 * @synack_type: Type of synack to prepare
 * @syn_skb: SYN packet just received.  It could be NULL for rtx case.
 *
 * Return: the SYNACK skb, or NULL if the skb cannot be allocated or,
 * when the request uses TCP-AO, no key matching the peer's keyid exists.
 */
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
                                struct request_sock *req,
                                struct tcp_fastopen_cookie *foc,
                                enum tcp_synack_type synack_type,
                                struct sk_buff *syn_skb)
{
        struct inet_request_sock *ireq = inet_rsk(req);
        const struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_out_options opts;
        struct tcp_key key = {};
        struct sk_buff *skb;
        int tcp_header_size;
        struct tcphdr *th;
        int mss;
        u64 now;

        skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
        if (unlikely(!skb)) {
                /* @dst is consumed even on failure. */
                dst_release(dst);
                return NULL;
        }
        /* Reserve space for headers. */
        skb_reserve(skb, MAX_TCP_HEADER);

        /* Decide which socket, if any, the skb is accounted to. */
        switch (synack_type) {
        case TCP_SYNACK_NORMAL:
                skb_set_owner_w(skb, req_to_sk(req));
                break;
        case TCP_SYNACK_COOKIE:
                /* Under synflood, we do not attach skb to a socket,
                 * to avoid false sharing.
                 */
                break;
        case TCP_SYNACK_FASTOPEN:
                /* sk is a const pointer, because we want to express multiple
                 * cpu might call us concurrently.
                 * sk->sk_wmem_alloc is an atomic, we can promote to rw.
                 */
                skb_set_owner_w(skb, (struct sock *)sk);
                break;
        }
        skb_dst_set(skb, dst);

        mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));

        memset(&opts, 0, sizeof(opts));
        now = tcp_clock_ns();
#ifdef CONFIG_SYN_COOKIES
        /* Cookie SYNACK with timestamps: use the cookie-derived timestamp. */
        if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
                skb_set_delivery_time(skb, cookie_init_timestamp(req, now),
                                      SKB_CLOCK_MONOTONIC);
        else
#endif
        {
                skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC);
                if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
                        tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
        }

        /* Held across key lookup and hashing; released on the no-key error
         * path and after the hash has been computed below.
         */
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
        rcu_read_lock();
#endif
        if (tcp_rsk_used_ao(req)) {
#ifdef CONFIG_TCP_AO
                struct tcp_ao_key *ao_key = NULL;
                u8 keyid = tcp_rsk(req)->ao_keyid;
                u8 rnext = tcp_rsk(req)->ao_rcv_next;

                ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req),
                                                            keyid, -1);
                /* If there is no matching key - avoid sending anything,
                 * especially unsigned segments. It could try harder and lookup
                 * for another peer-matching key, but the peer has requested
                 * ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here.
                 */
                if (unlikely(!ao_key)) {
                        trace_tcp_ao_synack_no_key(sk, keyid, rnext);
                        rcu_read_unlock();
                        kfree_skb(skb);
                        net_warn_ratelimited("TCP-AO: the keyid %u from SYN packet is not present - not sending SYNACK\n",
                                             keyid);
                        return NULL;
                }
                key.ao_key = ao_key;
                key.type = TCP_KEY_AO;
#endif
        } else {
#ifdef CONFIG_TCP_MD5SIG
                key.md5_key = tcp_rsk(req)->af_specific->req_md5_lookup(sk,
                                        req_to_sk(req));
                if (key.md5_key)
                        key.type = TCP_KEY_MD5;
#endif
        }
        skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), PKT_HASH_TYPE_L4);
        /* bpf program will be interested in the tcp_flags */
        TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
        /* Header size = fixed TCP header + the options selected here. */
        tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts,
                                             &key, foc, synack_type, syn_skb)
                                        + sizeof(*th);

        skb_push(skb, tcp_header_size);
        skb_reset_transport_header(skb);

        /* Fill in the fixed part of the TCP header. */
        th = (struct tcphdr *)skb->data;
        memset(th, 0, sizeof(struct tcphdr));
        th->syn = 1;
        th->ack = 1;
        tcp_ecn_make_synack(req, th);
        th->source = htons(ireq->ir_num);
        th->dest = ireq->ir_rmt_port;
        skb->mark = ireq->ir_mark;
        skb->ip_summed = CHECKSUM_PARTIAL;
        th->seq = htonl(tcp_rsk(req)->snt_isn);
        /* XXX data is queued and acked as is. No buffer/window check */
        th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);

        /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
        th->window = htons(min(req->rsk_rcv_wnd, 65535U));
        tcp_options_write(th, NULL, tcp_rsk(req), &opts, &key);
        /* Data offset is expressed in 32-bit words. */
        th->doff = (tcp_header_size >> 2);
        TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);

        /* Okay, we have all we need - do the md5 hash if needed */
        if (tcp_key_is_md5(&key)) {
#ifdef CONFIG_TCP_MD5SIG
                tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
                                        key.md5_key, req_to_sk(req), skb);
#endif
        } else if (tcp_key_is_ao(&key)) {
#ifdef CONFIG_TCP_AO
                tcp_rsk(req)->af_specific->ao_synack_hash(opts.hash_location,
                                        key.ao_key, req, skb,
                                        opts.hash_location - (u8 *)th, 0);
#endif
        }
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
        rcu_read_unlock();
#endif

        bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
                                synack_type, &opts);

        skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC);
        tcp_add_tx_delay(skb, tp);

        return skb;
}
EXPORT_SYMBOL(tcp_make_synack);

/* Switch the socket to the congestion control algorithm selected by the
 * route's RTAX_CC_ALGO metric, provided one is set, can be found and a
 * module reference on it can be taken.
 */
static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
{
        u32 key = dst_metric(dst, RTAX_CC_ALGO);
        const struct tcp_congestion_ops *new_ca;
        struct inet_connection_sock *icsk;

        /* The route does not request any particular algorithm. */
        if (key == TCP_CA_UNSPEC)
                return;

        icsk = inet_csk(sk);

        rcu_read_lock();
        new_ca = tcp_ca_find_key(key);
        if (likely(new_ca && bpf_try_module_get(new_ca, new_ca->owner))) {
                /* Release the previous ops before installing the new ones. */
                bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
                icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
                icsk->icsk_ca_ops = new_ca;
        }
        rcu_read_unlock();
}

/* Do all connect socket setups that can be done AF independent.
 *
 * Sets up header length, MSS, congestion control, the initial receive
 * window and window scale, then resets the send/receive sequence state,
 * the retransmission timeout and the retransmit counters.
 */
static void tcp_connect_init(struct sock *sk)
{
        const struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __u8 rcv_wscale;
        u32 rcv_wnd;

        /* We'll fix this up when we get a response from the other end.
         * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
         */
        tp->tcp_header_len = sizeof(struct tcphdr);
        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps))
                tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;

        tcp_ao_connect_init(sk);

        /* If user gave his TCP_MAXSEG, record it to clamp */
        if (tp->rx_opt.user_mss)
                tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
        tp->max_window = 0;
        tcp_mtup_init(sk);
        tcp_sync_mss(sk, dst_mtu(dst));

        tcp_ca_dst_init(sk, dst);

        /* No window clamp set on the socket: take the route's metric. */
        if (!tp->window_clamp)
                WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW));
        tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));

        tcp_initialize_rcv_mss(sk);

        /* limit the window selection if the user enforce a smaller rx buffer */
        if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
            (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
                WRITE_ONCE(tp->window_clamp, tcp_full_space(sk));

        /* Initial receive window: BPF-provided value first, route metric
         * as the fallback when BPF returns 0.
         */
        rcv_wnd = tcp_rwnd_init_bpf(sk);
        if (rcv_wnd == 0)
                rcv_wnd = dst_metric(dst, RTAX_INITRWND);

        /* The MSS passed in is advmss, reduced by the header bytes beyond
         * the base TCP header when ts_recent_stamp is set.
         */
        tcp_select_initial_window(sk, tcp_full_space(sk),
                                  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
                                  &tp->rcv_wnd,
                                  &tp->window_clamp,
                                  READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling),
                                  &rcv_wscale,
                                  rcv_wnd);

        tp->rx_opt.rcv_wscale = rcv_wscale;
        tp->rcv_ssthresh = tp->rcv_wnd;

        /* Clear any stale error and start all send-side sequence state
         * from write_seq, with an empty write queue.
         */
        WRITE_ONCE(sk->sk_err, 0);
        sock_reset_flag(sk, SOCK_DONE);
        tp->snd_wnd = 0;
        tcp_init_wl(tp, 0);
        tcp_write_queue_purge(sk);
        tp->snd_una = tp->write_seq;
        tp->snd_sml = tp->write_seq;
        tp->snd_up = tp->write_seq;
        WRITE_ONCE(tp->snd_nxt, tp->write_seq);

        /* In repair mode rcv_nxt is left untouched and only the receive
         * timestamp is refreshed; otherwise rcv_nxt restarts at 0.
         */
        if (likely(!tp->repair))
                tp->rcv_nxt = 0;
        else
                tp->rcv_tstamp = tcp_jiffies32;
        tp->rcv_wup = tp->rcv_nxt;
        WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);

        inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
        inet_csk(sk)->icsk_retransmits = 0;
        tcp_clear_retrans(tp);
}

static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

        tcb->end_seq += skb->len;
        __skb_header_release(skb);
        sk_wmem_queued_add(sk, skb->truesize);
        sk_mem_charge(sk, skb->truesize);
        WRITE_ONCE(tp->write_seq, tcb->end_seq);
        tp->packets_out += tcp_skb_pcount(skb);
}

/* Build and send a SYN with data and (cached) Fast Open cookie. However,
 * queue a data-only packet after the regular SYN, such that regular SYNs
 * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
 * only the SYN sequence, the data are retransmitted in the first ACK.
 * If cookie is not cached or other error occurs, falls back to send a
 * regular SYN with Fast Open cookie request option.
 *
 * @sk:  connecting socket; tp->fastopen_req describes the pending data
 * @syn: the already built data-less SYN skb
 *
 * Returns the result of the last tcp_transmit_skb() call (0 on success).
 */
static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_fastopen_request *fo = tp->fastopen_req;
        struct page_frag *pfrag = sk_page_frag(sk);
        struct sk_buff *syn_data;
        int space, err = 0;

        tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
        if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
                goto fallback;

        /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
         * user-MSS. Reserve maximum option space for middleboxes that add
         * private TCP options. The cost is reduced data space in SYN :(
         */
        tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
        /* Sync mss_cache after updating the mss_clamp */
        tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);

        space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) -
                MAX_TCP_OPTION_SPACE;

        /* Never more than the user asked to send. */
        space = min_t(size_t, space, fo->size);

        if (space &&
            !skb_page_frag_refill(min_t(size_t, space, PAGE_SIZE),
                                  pfrag, sk->sk_allocation))
                goto fallback;
        syn_data = tcp_stream_alloc_skb(sk, sk->sk_allocation, false);
        if (!syn_data)
                goto fallback;
        /* Start from the SYN's control block (sequence numbers, flags). */
        memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
        if (space) {
                /* Limit to what is left in the page frag and to what the
                 * write-memory scheduler grants.
                 */
                space = min_t(size_t, space, pfrag->size - pfrag->offset);
                space = tcp_wmem_schedule(sk, space);
        }
        if (space) {
                space = copy_page_from_iter(pfrag->page, pfrag->offset,
                                            space, &fo->data->msg_iter);
                /* Nothing could be copied: discard the skb and fall back. */
                if (unlikely(!space)) {
                        tcp_skb_tsorted_anchor_cleanup(syn_data);
                        kfree_skb(syn_data);
                        goto fallback;
                }
                /* Attach the copied bytes as the skb's first page fragment. */
                skb_fill_page_desc(syn_data, 0, pfrag->page,
                                   pfrag->offset, space);
                page_ref_inc(pfrag->page);
                pfrag->offset += space;
                skb_len_add(syn_data, space);
                skb_zcopy_set(syn_data, fo->uarg, NULL);
        }
        /* No more data pending in inet_wait_for_connect() */
        if (space == fo->size)
                fo->data = NULL;
        fo->copied = space;

        tcp_connect_queue_skb(sk, syn_data);
        if (syn_data->len)
                tcp_chrono_start(sk, TCP_CHRONO_BUSY);

        err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);

        /* Give the plain SYN the same send timestamp as the SYN+data. */
        skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, SKB_CLOCK_MONOTONIC);

        /* Now full SYN+DATA was cloned and sent (or not),
         * remove the SYN from the original skb (syn_data)
         * we keep in write queue in case of a retransmit, as we
         * also have the SYN packet (with no data) in the same queue.
         */
        TCP_SKB_CB(syn_data)->seq++;
        TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
        if (!err) {
                tp->syn_data = (fo->copied > 0);
                tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
                goto done;
        }

        /* data was not sent, put it in write_queue */
        __skb_queue_tail(&sk->sk_write_queue, syn_data);
        /* Undo the packets_out accounting done by tcp_connect_queue_skb(). */
        tp->packets_out -= tcp_skb_pcount(syn_data);

fallback:
        /* Send a regular SYN with Fast Open cookie request option */
        if (fo->cookie.len > 0)
                fo->cookie.len = 0;
        err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
        if (err)
                tp->syn_fastopen = 0;
done:
        fo->cookie.len = -1;  /* Exclude Fast Open option for SYN retries */
        return err;
}

/* Build a SYN and send it off.
 *
 * Steps, in order:
 *  - run the BPF_SOCK_OPS_TCP_CONNECT_CB socket-ops callback;
 *  - when both TCP-MD5 and TCP-AO are compiled in and the socket has MD5
 *    state, look up matching keys of each kind, refuse an ambiguous setup
 *    and drop the state of whichever scheme is not going to be used;
 *  - when TCP-AO is compiled in and the socket has AO state, require a
 *    matching AO key;
 *  - rebuild the header/route via icsk_af_ops->rebuild_header();
 *  - initialise connection state with tcp_connect_init();
 *  - in repair mode, finish the connection immediately without sending;
 *  - otherwise allocate the SYN skb, queue it, and transmit it (through
 *    tcp_send_syn_data() when a Fast Open request is attached);
 *  - arm the retransmit timer.
 *
 * Returns 0 on success (this includes repair mode and any transmit error
 * other than -ECONNREFUSED), -EKEYREJECTED for MD5/AO key problems,
 * -EHOSTUNREACH when rebuild_header() fails, -ENOBUFS when the SYN skb
 * cannot be allocated, or -ECONNREFUSED when the transmit reports it.
 */
int tcp_connect(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *buff;
        int err;

        tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);

#if defined(CONFIG_TCP_MD5SIG) && defined(CONFIG_TCP_AO)
        /* Has to be checked late, after setting daddr/saddr/ops.
         * Return error if the peer has both a md5 and a tcp-ao key
         * configured as this is ambiguous.
         */
        if (unlikely(rcu_dereference_protected(tp->md5sig_info,
                                               lockdep_sock_is_held(sk)))) {
                bool needs_ao = !!tp->af_specific->ao_lookup(sk, sk, -1, -1);
                bool needs_md5 = !!tp->af_specific->md5_lookup(sk, sk);
                struct tcp_ao_info *ao_info;

                ao_info = rcu_dereference_check(tp->ao_info,
                                                lockdep_sock_is_held(sk));
                if (ao_info) {
                        /* This is an extra check: tcp_ao_required() in
                         * tcp_v{4,6}_parse_md5_keys() should prevent adding
                         * md5 keys on ao_required socket.
                         */
                        needs_ao |= ao_info->ao_required;
                        WARN_ON_ONCE(ao_info->ao_required && needs_md5);
                }
                if (needs_md5 && needs_ao)
                        return -EKEYREJECTED;

                /* If we have a matching md5 key and no matching tcp-ao key
                 * then free up ao_info if allocated.
                 */
                if (needs_md5) {
                        tcp_ao_destroy_sock(sk, false);
                } else if (needs_ao) {
                        /* Mirror case: AO will be used, so drop the MD5
                         * key list and release md5sig_info.
                         */
                        tcp_clear_md5_list(sk);
                        kfree(rcu_replace_pointer(tp->md5sig_info, NULL,
                                                  lockdep_sock_is_held(sk)));
                }
        }
#endif
#ifdef CONFIG_TCP_AO
        if (unlikely(rcu_dereference_protected(tp->ao_info,
                                               lockdep_sock_is_held(sk)))) {
                /* Don't allow connecting if ao is configured but no
                 * matching key is found.
                 */
                if (!tp->af_specific->ao_lookup(sk, sk, -1, -1))
                        return -EKEYREJECTED;
        }
#endif

        if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
                return -EHOSTUNREACH; /* Routing failure or similar. */

        tcp_connect_init(sk);

        /* Repair mode: no SYN is sent, the connection is completed directly. */
        if (unlikely(tp->repair)) {
                tcp_finish_connect(sk, NULL);
                return 0;
        }

        buff = tcp_stream_alloc_skb(sk, sk->sk_allocation, true);
        if (unlikely(!buff))
                return -ENOBUFS;

        /* The SYN takes the current write_seq, which is then advanced by one. */
        tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
        tcp_mstamp_refresh(tp);
        tp->retrans_stamp = tcp_time_stamp_ts(tp);
        tcp_connect_queue_skb(sk, buff);
        tcp_ecn_send_syn(sk, buff);
        tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);

        /* Send off SYN; include data in Fast Open. */
        err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
              tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
        /* Only -ECONNREFUSED is reported to the caller; every other transmit
         * error falls through so the retransmit timer armed below retries.
         */
        if (err == -ECONNREFUSED)
                return err;

        /* We change tp->snd_nxt after the tcp_transmit_skb() call
         * in order to make this packet get counted in tcpOutSegs.
         */
        WRITE_ONCE(tp->snd_nxt, tp->write_seq);
        tp->pushed_seq = tp->write_seq;
        /* If an skb is still waiting at the send head, pull snd_nxt and
         * pushed_seq back to its starting sequence (presumably Fast Open
         * data that tcp_send_syn_data() left on the write queue).
         */
        buff = tcp_send_head(sk);
        if (unlikely(buff)) {
                WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
                tp->pushed_seq        = TCP_SKB_CB(buff)->seq;
        }
        TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

        /* Timer for repeating the SYN until an answer. */
        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
        return 0;
}
EXPORT_SYMBOL(tcp_connect);

/* Ceiling for the delayed-ACK timeout of @sk: the socket's icsk_delack_max,
 * additionally capped at one less than tcp_rto_min() (where that minimum is
 * first raised to at least 2, so the cap is never below 1).
 */
u32 tcp_delack_max(const struct sock *sk)
{
        u32 rto_min_floor = max(tcp_rto_min(sk), 2);
        u32 rto_based_cap = rto_min_floor - 1;

        return min(inet_csk(sk)->icsk_delack_max, rto_based_cap);
}

/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 *
 * Starting from icsk_ack.ato, the delay is bounded (by a fixed maximum, by
 * the smoothed RTT when one is known, and by tcp_delack_max()), and then
 * the delayed-ACK timer is (re)armed.  If a delayed-ACK timer is already
 * pending and close to firing, the ACK is sent immediately instead.
 */
void tcp_send_delayed_ack(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        int ato = icsk->icsk_ack.ato;
        unsigned long timeout;

        if (ato > TCP_DELACK_MIN) {
                const struct tcp_sock *tp = tcp_sk(sk);
                int max_ato = HZ / 2;

                /* Ping-pong mode or a pushed ACK selects TCP_DELACK_MAX
                 * as the bound instead of HZ / 2.
                 */
                if (inet_csk_in_pingpong_mode(sk) ||
                    (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
                        max_ato = TCP_DELACK_MAX;

                /* Slow path, intersegment interval is "high". */

                /* If some rtt estimate is known, use it to bound delayed ack.
                 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
                 * directly.
                 */
                if (tp->srtt_us) {
                        /* srtt_us is shifted right by 3 before conversion to
                         * jiffies; the result is never below TCP_DELACK_MIN.
                         */
                        int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
                                        TCP_DELACK_MIN);

                        if (rtt < max_ato)
                                max_ato = rtt;
                }

                ato = min(ato, max_ato);
        }

        /* Apply the per-socket ceiling regardless of the path taken above. */
        ato = min_t(u32, ato, tcp_delack_max(sk));

        /* Stay within the limit we were given */
        timeout = jiffies + ato;

        /* Use new timeout only if there wasn't a older one earlier. */
        if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
                /* If delack timer is about to expire, send ACK now. */
                if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
                        tcp_send_ack(sk);
                        return;
                }

                /* Keep the already-scheduled expiry unless the new one is
                 * strictly earlier.
                 */
                if (!time_before(timeout, icsk->icsk_ack.timeout))
                        timeout = icsk->icsk_ack.timeout;
        }
        icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
        icsk->icsk_ack.timeout = timeout;
        sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}

/* Build a pure ACK and hand it to __tcp_transmit_skb() together with
 * @rcv_nxt; sending it also updates the window.  Nothing is sent on a
 * closed socket.  If no skb can be allocated, the ACK is rescheduled
 * through the delayed-ACK timer with a growing delay.
 */
void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
{
        struct sk_buff *ack_skb;

        /* After a reset we are not allowed to transmit any more. */
        if (sk->sk_state == TCP_CLOSE)
                return;

        /* This skb never goes on the write queue, so tcp_transmit_skb()
         * is what assigns its ownership to this sock.
         */
        ack_skb = alloc_skb(MAX_TCP_HEADER,
                            sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
        if (unlikely(!ack_skb)) {
                struct inet_connection_sock *icsk = inet_csk(sk);
                unsigned long retry_delay;

                /* The delay doubles with every recorded retry; the retry
                 * counter stops growing once the delay reaches TCP_RTO_MAX.
                 */
                retry_delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
                if (retry_delay < TCP_RTO_MAX)
                        icsk->icsk_ack.retry++;
                inet_csk_schedule_ack(sk);
                icsk->icsk_ack.ato = TCP_ATO_MIN;
                inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, retry_delay,
                                          TCP_RTO_MAX);
                return;
        }

        /* Leave headroom for the headers, then fill in the control bits. */
        skb_reserve(ack_skb, MAX_TCP_HEADER);
        tcp_init_nondata_skb(ack_skb, tcp_acceptable_seq(sk), TCPHDR_ACK);

        /* Pure ACKs should not weigh much on TCP Small Queues or on
         * fq/pacing.
         * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
         */
        skb_set_tcp_pure_ack(ack_skb);

        /* Transmitting the ACK also clears the delayed-ACK state. */
        __tcp_transmit_skb(sk, ack_skb, 0, (__force gfp_t)0, rcv_nxt);
}
EXPORT_SYMBOL_GPL(__tcp_send_ack);

void tcp_send_ack(struct sock *sk)
{
        __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
}

/* This routine sends a packet with an out of date sequence
 * number. It assumes the other end will try to ack it.
 *
 * Question: what should we make while urgent mode?
 * 4.4BSD forces sending single byte of data. We cannot send
 * out of window data, because we have SND.NXT==SND.MAX...
 *
 * Current solution: to send TWO zero-length segments in urgent mode:
 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
 * out-of-date with SND.UNA-1 to probe window.
 */
static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;

        /* We don't queue it, tcp_transmit_skb() sets ownership. */
        skb = alloc_skb(MAX_TCP_HEADER,
                        sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
        if (!skb)
                return -1;

        /* Reserve space for headers and set control bits. */
        skb_reserve(skb, MAX_TCP_HEADER);
        /* Use a previous sequence.  This should cause the other
         * end to send an ack.  Don't queue or clone SKB, just
         * send it.
         */
        tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
        NET_INC_STATS(sock_net(sk), mib);
        return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
}

/* Called from setsockopt( ... TCP_REPAIR ) */
void tcp_send_window_probe(struct sock *sk)
{
        if (sk->sk_state == TCP_ESTABLISHED) {
                tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
                tcp_mstamp_refresh(tcp_sk(sk));
                tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
        }
}

/* Initiate keepalive or window probe from timer.
 *
 * If there is unsent data at the send head that starts inside the peer's
 * window, (part of) that skb is transmitted.  Otherwise zero-length probe
 * segments are sent via tcp_xmit_probe_skb(), counted under @mib.
 *
 * Returns -1 if the socket is closed or the head skb cannot be fragmented,
 * otherwise the result of tcp_transmit_skb() / tcp_xmit_probe_skb().
 */
int tcp_write_wakeup(struct sock *sk, int mib)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;

        if (sk->sk_state == TCP_CLOSE)
                return -1;

        skb = tcp_send_head(sk);
        if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
                int err;
                unsigned int mss = tcp_current_mss(sk);
                /* Bytes of window available from the start of this skb. */
                unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;

                if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
                        tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

                /* We are probing the opening of a window
                 * but the window size is != 0
                 * must have been a result SWS avoidance ( sender )
                 */
                if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
                    skb->len > mss) {
                        /* The skb does not fit the window or exceeds one
                         * MSS: split off at most min(window, mss) bytes.
                         */
                        seg_size = min(seg_size, mss);
                        TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
                        if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
                                         skb, seg_size, mss, GFP_ATOMIC))
                                return -1;
                } else if (!tcp_skb_pcount(skb))
                        tcp_set_skb_tso_segs(skb, mss);

                TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
                err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
                if (!err)
                        tcp_event_new_data_sent(sk, skb);
                return err;
        } else {
                /* Urgent pointer lies within snd_una + 1 .. snd_una + 0xFFFF:
                 * send the extra "urgent" probe first (its result is not
                 * checked), then the regular one.
                 */
                if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
                        tcp_xmit_probe_skb(sk, 1, mib);
                return tcp_xmit_probe_skb(sk, 0, mib);
        }
}

/* Handle expiry of the window probe timer: tcp_write_wakeup() sends a
 * partial packet when the window is not closed, or a zero probe otherwise.
 * Afterwards the probe timer is either cancelled (packets are in flight or
 * the write queue is empty) or re-armed with a new timeout.
 */
void tcp_send_probe0(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        unsigned long when;
        int probe_err;

        probe_err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);

        if (tp->packets_out || tcp_write_queue_empty(sk)) {
                /* Probing is no longer needed: reset the probe state. */
                icsk->icsk_probes_out = 0;
                icsk->icsk_backoff = 0;
                icsk->icsk_probes_tstamp = 0;
                return;
        }

        icsk->icsk_probes_out++;
        if (probe_err > 0) {
                /* The packet was not sent because of local congestion;
                 * let senders compete for local resources conservatively.
                 */
                when = TCP_RESOURCE_PROBE_INTERVAL;
        } else {
                /* Back off further, bounded by the tcp_retries2 sysctl. */
                if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
                        icsk->icsk_backoff++;
                when = tcp_probe0_when(sk, TCP_RTO_MAX);
        }

        when = tcp_clamp_probe0_to_user_timeout(sk, when);
        tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
}

int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
{
        const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
        struct flowi fl;
        int res;

        /* Paired with WRITE_ONCE() in sock_setsockopt() */
        if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
                WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash());
        res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
                                  NULL);
        if (!res) {
                TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
                if (unlikely(tcp_passive_fastopen(sk))) {
                        /* sk has const attribute because listeners are lockless.
                         * However in this case, we are dealing with a passive fastopen
                         * socket thus we can change total_retrans value.
                         */
                        tcp_sk_rw(sk)->total_retrans++;
                }
                trace_tcp_retransmit_synack(sk, req);
        }
        return res;
}
EXPORT_SYMBOL(tcp_rtx_synack);













































































































































































































































































































































    3 










































    1 
































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_DCACHE_H
#define __LINUX_DCACHE_H

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/math.h>
#include <linux/rculist.h>
#include <linux/rculist_bl.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>
#include <linux/cache.h>
#include <linux/rcupdate.h>
#include <linux/lockref.h>
#include <linux/stringhash.h>
#include <linux/wait.h>

struct path;
struct file;
struct vfsmount;

/*
 * linux/include/linux/dcache.h
 *
 * Dirent cache data structures
 *
 * (C) Copyright 1997 Thomas Schoebel-Theuer,
 * with heavy changes by Linus Torvalds
 */

/* True when dentry @x is its own parent, i.e. x->d_parent points back at x. */
#define IS_ROOT(x) ((x) == (x)->d_parent)

/* The hash is always the low bits of hash_len */
/*
 * HASH_LEN_DECLARE declares the two u32 members "hash" and "len" in the
 * order that makes the statement above hold for the host byte order.
 *
 * bytemask_from_count(cnt) yields an unsigned long with cnt*8 bits set:
 * the low-order bits on little endian, the high-order bits on big endian.
 * cnt must be smaller than sizeof(unsigned long): shifting by the full
 * width of the type is undefined in C.
 */
#ifdef __LITTLE_ENDIAN
 #define HASH_LEN_DECLARE u32 hash; u32 len
 #define bytemask_from_count(cnt)        (~(~0ul << (cnt)*8))
#else
 #define HASH_LEN_DECLARE u32 len; u32 hash
 #define bytemask_from_count(cnt)        (~(~0ul >> (cnt)*8))
#endif

/*
 * "quick string" -- eases parameter passing, but more importantly
 * saves "metadata" about the string (ie length and the hash).
 *
 * hash comes first so it snuggles against d_parent in the
 * dentry.
 *
 * The hash and len members can be accessed individually or together as
 * the single 64-bit value hash_len; both views share the same storage.
 */
struct qstr {
        union {
                struct {
                        /* u32 hash and u32 len, ordered by endianness */
                        HASH_LEN_DECLARE;
                };
                u64 hash_len;        /* hash and len combined */
        };
        const unsigned char *name;        /* the string itself */
};

/*
 * Initializer for a struct qstr with name @n and length @l.  Only .len and
 * .name are named, so the hash half is left zero-initialized.
 */
#define QSTR_INIT(n,l) { { { .len = l } }, .name = n }

extern const struct qstr empty_name;
extern const struct qstr slash_name;
extern const struct qstr dotdot_name;

/*
 * Try to keep struct dentry aligned on 64 byte cachelines (this will
 * give reasonable cacheline footprint with larger lines without the
 * large memory footprint increase).
 *
 * DNAME_INLINE_LEN is the size of the d_iname[] array in struct dentry.
 * The trailing comment on each value gives the size of the resulting
 * struct dentry as recorded by the original authors.
 */
#ifdef CONFIG_64BIT
# define DNAME_INLINE_LEN 40 /* 192 bytes */
#else
# ifdef CONFIG_SMP
#  define DNAME_INLINE_LEN 40 /* 128 bytes */
# else
#  define DNAME_INLINE_LEN 44 /* 128 bytes */
# endif
#endif

/* Lets dentry->d_lock refer to the spinlock embedded in d_lockref. */
#define d_lock        d_lockref.lock

/*
 * One entry of the dirent cache: a name (d_name) below a parent directory
 * (d_parent), bound to an inode (d_inode) or negative when d_inode is NULL.
 * The members are grouped by which lookup path touches them, as the
 * section comments below indicate.
 */
struct dentry {
        /* RCU lookup touched fields */
        unsigned int d_flags;                /* protected by d_lock */
        seqcount_spinlock_t d_seq;        /* per dentry seqlock */
        struct hlist_bl_node d_hash;        /* lookup hash list */
        struct dentry *d_parent;        /* parent directory */
        struct qstr d_name;
        struct inode *d_inode;                /* Where the name belongs to - NULL is
                                         * negative */
        unsigned char d_iname[DNAME_INLINE_LEN];        /* small names */

        /* Ref lookup also touches following */
        struct lockref d_lockref;        /* per-dentry lock and refcount */
        const struct dentry_operations *d_op;
        struct super_block *d_sb;        /* The root of the dentry tree */
        unsigned long d_time;                /* used by d_revalidate */
        void *d_fsdata;                        /* fs-specific data */

        /* d_lru and d_wait share storage */
        union {
                struct list_head d_lru;                /* LRU list */
                wait_queue_head_t *d_wait;        /* in-lookup ones only */
        };
        struct hlist_node d_sib;        /* child of parent list */
        struct hlist_head d_children;        /* our children */
        /*
         * d_alias and d_rcu can share memory
         */
        union {
                struct hlist_node d_alias;        /* inode alias list */
                struct hlist_bl_node d_in_lookup_hash;        /* only for in-lookup ones */
                 struct rcu_head d_rcu;
        } d_u;
};

/*
 * dentry->d_lock spinlock nesting subclasses:
 *
 * 0: normal
 * 1: nested
 *
 * The enumerators take the values 0 and 1 in declaration order, matching
 * the list above.
 */
enum dentry_d_lock_class
{
        DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */
        DENTRY_D_LOCK_NESTED
};

/*
 * Selector passed as the second argument of dentry_operations.d_real().
 * NOTE(review): presumably chooses between the dentry backing file data
 * and the one backing metadata - confirm against the d_real implementers.
 */
enum d_real_type {
        D_REAL_DATA,
        D_REAL_METADATA,
};

/*
 * Per-filesystem callbacks attached to a dentry through dentry->d_op.
 * The structure is cacheline aligned.  The comment that follows this
 * definition points to the documents describing the locking rules and
 * the meaning of each callback.
 */
struct dentry_operations {
        int (*d_revalidate)(struct dentry *, unsigned int);
        int (*d_weak_revalidate)(struct dentry *, unsigned int);
        int (*d_hash)(const struct dentry *, struct qstr *);
        int (*d_compare)(const struct dentry *,
                        unsigned int, const char *, const struct qstr *);
        int (*d_delete)(const struct dentry *);
        int (*d_init)(struct dentry *);
        void (*d_release)(struct dentry *);
        void (*d_prune)(struct dentry *);
        void (*d_iput)(struct dentry *, struct inode *);
        char *(*d_dname)(struct dentry *, char *, int);
        struct vfsmount *(*d_automount)(struct path *);
        int (*d_manage)(const struct path *, bool);
        struct dentry *(*d_real)(struct dentry *, enum d_real_type type);
} ____cacheline_aligned;

/*
 * Locking rules for dentry_operations callbacks are to be found in
 * Documentation/filesystems/locking.rst. Keep it updated!
 *
 * Further descriptions are found in Documentation/filesystems/vfs.rst.
 * Keep it updated too!
 */

/* d_flags entries */
#define DCACHE_OP_HASH                        BIT(0)
#define DCACHE_OP_COMPARE                BIT(1)
#define DCACHE_OP_REVALIDATE                BIT(2)
#define DCACHE_OP_DELETE                BIT(3)
#define DCACHE_OP_PRUNE                        BIT(4)

#define        DCACHE_DISCONNECTED                BIT(5)
     /* This dentry is possibly not currently connected to the dcache tree, in
      * which case its parent will either be itself, or will have this flag as
      * well.  nfsd will not use a dentry with this bit set, but will first
      * endeavour to clear the bit either by discovering that it is connected,
      * or by performing lookup operations.   Any filesystem which supports
      * nfsd_operations MUST have a lookup function which, if it finds a
      * directory inode with a DCACHE_DISCONNECTED dentry, will d_move that
      * dentry into place and return that dentry rather than the passed one,
      * typically using d_splice_alias. */

#define DCACHE_REFERENCED                BIT(6) /* Recently used, don't discard. */

#define DCACHE_DONTCACHE                BIT(7) /* Purge from memory on final dput() */

#define DCACHE_CANT_MOUNT                BIT(8)
#define DCACHE_GENOCIDE                        BIT(9)
#define DCACHE_SHRINK_LIST                BIT(10)

#define DCACHE_OP_WEAK_REVALIDATE        BIT(11)

#define DCACHE_NFSFS_RENAMED                BIT(12)
     /* this dentry has been "silly renamed" and has to be deleted on the last
      * dput() */
#define DCACHE_FSNOTIFY_PARENT_WATCHED        BIT(14)
     /* Parent inode is watched by some fsnotify listener */

#define DCACHE_DENTRY_KILLED                BIT(15)

#define DCACHE_MOUNTED                        BIT(16) /* is a mountpoint */
#define DCACHE_NEED_AUTOMOUNT                BIT(17) /* handle automount on this dir */
#define DCACHE_MANAGE_TRANSIT                BIT(18) /* manage transit from this dirent */
#define DCACHE_MANAGED_DENTRY \
        (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)

#define DCACHE_LRU_LIST                        BIT(19)

#define DCACHE_ENTRY_TYPE                (7 << 20) /* bits 20..22 are for storing type: */
#define DCACHE_MISS_TYPE                (0 << 20) /* Negative dentry */
#define DCACHE_WHITEOUT_TYPE                (1 << 20) /* Whiteout dentry (stop pathwalk) */
#define DCACHE_DIRECTORY_TYPE                (2 << 20) /* Normal directory */
#define DCACHE_AUTODIR_TYPE                (3 << 20) /* Lookupless directory (presumed automount) */
#define DCACHE_REGULAR_TYPE                (4 << 20) /* Regular file type */
#define DCACHE_SPECIAL_TYPE                (5 << 20) /* Other file type */
#define DCACHE_SYMLINK_TYPE                (6 << 20) /* Symlink */

#define DCACHE_NOKEY_NAME                BIT(25) /* Encrypted name encoded without key */
#define DCACHE_OP_REAL                        BIT(26)

#define DCACHE_PAR_LOOKUP                BIT(28) /* being looked up (with parent locked shared) */
#define DCACHE_DENTRY_CURSOR                BIT(29)
#define DCACHE_NORCU                        BIT(30) /* No RCU delay for freeing */

extern seqlock_t rename_lock;

/*
 * These are the low-level FS interfaces to the dcache..
 */
extern void d_instantiate(struct dentry *, struct inode *);
extern void d_instantiate_new(struct dentry *, struct inode *);
extern void __d_drop(struct dentry *dentry);
extern void d_drop(struct dentry *dentry);
extern void d_delete(struct dentry *);
extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op);

/* allocate/de-allocate */
extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
extern struct dentry * d_alloc_anon(struct super_block *);
extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
                                        wait_queue_head_t *);
extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
                        const struct qstr *name);
extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
extern struct dentry *d_find_any_alias(struct inode *inode);
extern struct dentry * d_obtain_alias(struct inode *);
extern struct dentry * d_obtain_root(struct inode *);
extern void shrink_dcache_sb(struct super_block *);
extern void shrink_dcache_parent(struct dentry *);
extern void d_invalidate(struct dentry *);

/* only used at mount-time */
extern struct dentry * d_make_root(struct inode *);

extern void d_mark_tmpfile(struct file *, struct inode *);
extern void d_tmpfile(struct file *, struct inode *);

extern struct dentry *d_find_alias(struct inode *);
extern void d_prune_aliases(struct inode *);

extern struct dentry *d_find_alias_rcu(struct inode *);

/* test whether we have any submounts in a subdir tree */
extern int path_has_submounts(const struct path *);

/*
 * This adds the entry to the hash queues.
 */
extern void d_rehash(struct dentry *);
 
extern void d_add(struct dentry *, struct inode *);

/* used for rename() and baskets */
extern void d_move(struct dentry *, struct dentry *);
extern void d_exchange(struct dentry *, struct dentry *);
extern struct dentry *d_ancestor(struct dentry *, struct dentry *);

extern struct dentry *d_lookup(const struct dentry *, const struct qstr *);
extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);

/* Current reference count of @dentry, as stored in its lockref. */
static inline unsigned d_count(const struct dentry *dentry)
{
        unsigned refs = dentry->d_lockref.count;

        return refs;
}

/*
 * helper function for dentry_operations.d_dname() members
 */
extern __printf(3, 4)
char *dynamic_dname(char *, int, const char *, ...);

extern char *__d_path(const struct path *, const struct path *, char *, int);
extern char *d_absolute_path(const struct path *, char *, int);
extern char *d_path(const struct path *, char *, int);
extern char *dentry_path_raw(const struct dentry *, char *, int);
extern char *dentry_path(const struct dentry *, char *, int);

/* Allocation counts.. */

/**
 * dget_dlock -        get a reference to a dentry
 * @dentry: dentry to get a reference to
 *
 * Bump the reference count of a live dentry and hand the dentry back.
 * The caller must already hold @dentry->d_lock and is responsible for
 * knowing that the dentry is alive.  Many conditions are sufficient to
 * guarantee that; e.g. anything with non-negative refcount is alive, and so
 * is anything hashed, anything positive, anyone's parent, etc.
 */
static inline struct dentry *dget_dlock(struct dentry *dentry)
{
        dentry->d_lockref.count += 1;

        return dentry;
}


/**
 * dget - get a reference to a dentry
 * @dentry: dentry to get a reference to
 *
 * Takes a dentry or a %NULL pointer; a non-NULL dentry has its reference
 * count incremented, and the pointer is returned either way.  A dentry
 * with references will not be destroyed, whereas one without references
 * can disappear for any number of reasons, starting with memory pressure.
 * This primitive is therefore meant for cloning a reference that already
 * exists; using it on something with zero refcount is a bug.
 *
 * NOTE: it will spin if @dentry->d_lock is held.  As far as deadlock
 * avoidance goes it is equivalent to spin_lock()/increment
 * refcount/spin_unlock(), so calling it under @dentry->d_lock is
 * always a bug, and so is calling it under ->d_lock on any of its
 * descendants.
 *
 */
static inline struct dentry *dget(struct dentry *dentry)
{
        if (!dentry)
                return dentry;

        lockref_get(&dentry->d_lockref);
        return dentry;
}

extern struct dentry *dget_parent(struct dentry *dentry);

/**
 * d_unhashed - is dentry hashed
 * @dentry: entry to check
 *
 * Returns true if the dentry passed is not currently on a hash list
 * (its d_hash node is unhashed).
 */
static inline int d_unhashed(const struct dentry *dentry)
{
        int unhashed = hlist_bl_unhashed(&dentry->d_hash);

        return unhashed;
}

/* Nonzero when @dentry is unhashed and is not its own parent (not a root). */
static inline int d_unlinked(const struct dentry *dentry)
{
        if (!d_unhashed(dentry))
                return 0;

        return !IS_ROOT(dentry);
}

/* Nonzero when DCACHE_CANT_MOUNT is set in @dentry's flags. */
static inline int cant_mount(const struct dentry *dentry)
{
        unsigned int flags = dentry->d_flags;

        return (flags & DCACHE_CANT_MOUNT);
}

/*
 * Set DCACHE_CANT_MOUNT on @dentry.  The flag word is updated while holding
 * the dentry's d_lock, which is taken and released here.
 */
static inline void dont_mount(struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        dentry->d_flags |= DCACHE_CANT_MOUNT;
        spin_unlock(&dentry->d_lock);
}

extern void __d_lookup_unhash_wake(struct dentry *dentry);

/* Nonzero while @dentry carries the DCACHE_PAR_LOOKUP flag. */
static inline int d_in_lookup(const struct dentry *dentry)
{
        unsigned int flags = dentry->d_flags;

        return flags & DCACHE_PAR_LOOKUP;
}

/*
 * If @dentry is still marked as in-lookup (the unlikely case), call
 * __d_lookup_unhash_wake() on it; otherwise do nothing.
 */
static inline void d_lookup_done(struct dentry *dentry)
{
        int in_lookup = d_in_lookup(dentry);

        if (unlikely(in_lookup))
                __d_lookup_unhash_wake(dentry);
}

extern void dput(struct dentry *);

/* True when any of the DCACHE_MANAGED_DENTRY bits is set on @dentry. */
static inline bool d_managed(const struct dentry *dentry)
{
        return (dentry->d_flags & DCACHE_MANAGED_DENTRY) != 0;
}

/* True when @dentry has DCACHE_MOUNTED set, i.e. is a mountpoint. */
static inline bool d_mountpoint(const struct dentry *dentry)
{
        return (dentry->d_flags & DCACHE_MOUNTED) != 0;
}

/*
 * Directory cache entry type accessor functions.
 */
/* Extract the type bits (DCACHE_ENTRY_TYPE mask) from @dentry's flags. */
static inline unsigned __d_entry_type(const struct dentry *dentry)
{
        unsigned type_bits = dentry->d_flags & DCACHE_ENTRY_TYPE;

        return type_bits;
}

/* True when the type bits of @dentry equal DCACHE_MISS_TYPE. */
static inline bool d_is_miss(const struct dentry *dentry)
{
        unsigned type = __d_entry_type(dentry);

        return type == DCACHE_MISS_TYPE;
}

/* True when the type bits of @dentry equal DCACHE_WHITEOUT_TYPE. */
static inline bool d_is_whiteout(const struct dentry *dentry)
{
        unsigned type = __d_entry_type(dentry);

        return type == DCACHE_WHITEOUT_TYPE;
}

/* True when the type bits of @dentry equal DCACHE_DIRECTORY_TYPE. */
static inline bool d_can_lookup(const struct dentry *dentry)
{
        unsigned type = __d_entry_type(dentry);

        return type == DCACHE_DIRECTORY_TYPE;
}

/* True when the type bits of @dentry equal DCACHE_AUTODIR_TYPE. */
static inline bool d_is_autodir(const struct dentry *dentry)
{
        unsigned type = __d_entry_type(dentry);

        return type == DCACHE_AUTODIR_TYPE;
}

/* True for either kind of directory: a normal one or an autodir. */
static inline bool d_is_dir(const struct dentry *dentry)
{
        if (d_can_lookup(dentry))
                return true;

        return d_is_autodir(dentry);
}

/* True when the type bits of @dentry equal DCACHE_SYMLINK_TYPE. */
static inline bool d_is_symlink(const struct dentry *dentry)
{
        unsigned type = __d_entry_type(dentry);

        return type == DCACHE_SYMLINK_TYPE;
}

/* True when the type bits of @dentry equal DCACHE_REGULAR_TYPE. */
static inline bool d_is_reg(const struct dentry *dentry)
{
        unsigned type = __d_entry_type(dentry);

        return type == DCACHE_REGULAR_TYPE;
}

/* True when the type bits of @dentry equal DCACHE_SPECIAL_TYPE. */
static inline bool d_is_special(const struct dentry *dentry)
{
        unsigned type = __d_entry_type(dentry);

        return type == DCACHE_SPECIAL_TYPE;
}

/* True for a regular file or any other ("special") file type. */
static inline bool d_is_file(const struct dentry *dentry)
{
        if (d_is_reg(dentry))
                return true;

        return d_is_special(dentry);
}

static inline bool d_is_negative(const struct dentry *dentry)
{
        // TODO: check d_is_whiteout(dentry) also.
        return d_is_miss(dentry);
}

/*
 * Same test as d_is_negative(), but applied to a raw flag word rather
 * than to a dentry.
 */
static inline bool d_flags_negative(unsigned flags)
{
        const unsigned type = flags & DCACHE_ENTRY_TYPE;

        return type == DCACHE_MISS_TYPE;
}

/* Logical inverse of d_is_negative(). */
static inline bool d_is_positive(const struct dentry *dentry)
{
        const bool negative = d_is_negative(dentry);

        return !negative;
}

/**
 * d_really_is_negative - Determine if a dentry is really negative (ignoring fallthroughs)
 * @dentry: The dentry in question
 *
 * Returns true if the dentry represents either an absent name or a name that
 * doesn't map to an inode (ie. ->d_inode is NULL).  The dentry could represent
 * a true miss, a whiteout that isn't represented by a 0,0 chardev or a
 * fallthrough marker in an opaque directory.
 *
 * Note!  (1) This should be used *only* by a filesystem to examine its own
 * dentries.  It should not be used to look at some other filesystem's
 * dentries.  (2) It should also be used in combination with d_inode() to get
 * the inode.  (3) The dentry may have something attached to ->d_lower and the
 * type field of the flags may be set to something other than miss or whiteout.
 */
static inline bool d_really_is_negative(const struct dentry *dentry)
{
        /* Negative here means only that no inode is attached. */
        return !dentry->d_inode;
}

/**
 * d_really_is_positive - Determine if a dentry is really positive (ignoring fallthroughs)
 * @dentry: The dentry in question
 *
 * Returns true if the dentry represents a name that maps to an inode
 * (ie. ->d_inode is not NULL).  The dentry might still represent a whiteout if
 * that is represented on medium as a 0,0 chardev.
 *
 * Note!  (1) This should be used *only* by a filesystem to examine its own
 * dentries.  It should not be used to look at some other filesystem's
 * dentries.  (2) It should also be used in combination with d_inode() to get
 * the inode.
 */
static inline bool d_really_is_positive(const struct dentry *dentry)
{
        const struct inode *inode = dentry->d_inode;

        /* Positive here means only that an inode is attached. */
        return inode != NULL;
}

/*
 * Returns 1 when the dentry has an inode attached (d_really_is_positive())
 * and d_unhashed() reports false for it; returns 0 otherwise.
 */
static inline int simple_positive(const struct dentry *dentry)
{
        if (!d_really_is_positive(dentry))
                return 0;

        return !d_unhashed(dentry);
}

extern int sysctl_vfs_cache_pressure;

/*
 * Scale @val by sysctl_vfs_cache_pressure / 100, using mult_frac() for the
 * multiply-then-divide.
 */
static inline unsigned long vfs_pressure_ratio(unsigned long val)
{
        const unsigned long scaled =
                mult_frac(val, sysctl_vfs_cache_pressure, 100);

        return scaled;
}

/**
 * d_inode - Get the actual inode of this dentry
 * @dentry: The dentry to query
 *
 * This is the helper normal filesystems should use to get at their own inodes
 * in their own dentries and ignore the layering superimposed upon them.
 */
static inline struct inode *d_inode(const struct dentry *dentry)
{
        /* Plain (non-READ_ONCE) load of the attached inode pointer. */
        struct inode *inode = dentry->d_inode;

        return inode;
}

/**
 * d_inode_rcu - Get the actual inode of this dentry with READ_ONCE()
 * @dentry: The dentry to query
 *
 * This is the helper normal filesystems should use to get at their own inodes
 * in their own dentries and ignore the layering superimposed upon them.
 */
static inline struct inode *d_inode_rcu(const struct dentry *dentry)
{
        /* Single READ_ONCE() load of the attached inode pointer. */
        struct inode *inode = READ_ONCE(dentry->d_inode);

        return inode;
}

/**
 * d_backing_inode - Get upper or lower inode we should be using
 * @upper: The upper layer
 *
 * This is the helper that should be used to get at the inode that will be used
 * if this dentry were to be opened as a file.  The inode may be on the upper
 * dentry or it may be on a lower dentry pinned by the upper.
 *
 * Normal filesystems should not use this to access their own inodes.
 */
static inline struct inode *d_backing_inode(const struct dentry *upper)
{
        /* As implemented here this is simply the inode attached to @upper. */
        return upper->d_inode;
}

/**
 * d_real - Return the real dentry
 * @dentry: the dentry to query
 * @type: the type of real dentry (data or metadata)
 *
 * If dentry is on a union/overlay, then return the underlying, real dentry.
 * Otherwise return the dentry itself.
 *
 * See also: Documentation/filesystems/vfs.rst
 */
static inline struct dentry *d_real(struct dentry *dentry, enum d_real_type type)
{
        /* Only dentries flagged DCACHE_OP_REAL delegate to their ->d_real() op. */
        if (unlikely(dentry->d_flags & DCACHE_OP_REAL))
                return dentry->d_op->d_real(dentry, type);

        return dentry;
}

/**
 * d_real_inode - Return the real inode hosting the data
 * @dentry: The dentry to query
 *
 * If dentry is on a union/overlay, then return the underlying, real inode.
 * Otherwise return d_inode().
 */
static inline struct inode *d_real_inode(const struct dentry *dentry)
{
        /* d_real() takes a non-const dentry, so the const is cast away here. */
        struct dentry *real = d_real((struct dentry *) dentry, D_REAL_DATA);

        return d_inode(real);
}

/*
 * Container handed to take_dentry_name_snapshot() and
 * release_dentry_name_snapshot().
 *
 * NOTE(review): inline_name is presumably the backing storage used when the
 * captured name is short enough to fit in DNAME_INLINE_LEN bytes -- confirm
 * against the implementation of take_dentry_name_snapshot().
 */
struct name_snapshot {
        struct qstr name;
        unsigned char inline_name[DNAME_INLINE_LEN];
};
void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *);
void release_dentry_name_snapshot(struct name_snapshot *);

/*
 * Return the dentry at the head of @dentry's d_children list (entries are
 * linked through their d_sib member), or NULL when the list is empty.
 */
static inline struct dentry *d_first_child(const struct dentry *dentry)
{
        struct dentry *child =
                hlist_entry_safe(dentry->d_children.first, struct dentry, d_sib);

        return child;
}

/*
 * Return the dentry that follows @dentry on the d_sib list, or NULL when
 * @dentry is the last entry.
 */
static inline struct dentry *d_next_sibling(const struct dentry *dentry)
{
        struct dentry *sibling =
                hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib);

        return sibling;
}

#endif        /* __LINUX_DCACHE_H */




























































































































    8 





















    1 


    1 


























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
// SPDX-License-Identifier: GPL-2.0
#include <linux/err.h>
#include <linux/bug.h>
#include <linux/atomic.h>
#include <linux/errseq.h>
#include <linux/log2.h>

/*
 * An errseq_t is a way of recording errors in one place, and allowing any
 * number of "subscribers" to tell whether it has changed since a previous
 * point where it was sampled.
 *
 * It's implemented as an unsigned 32-bit value. The low order bits are
 * designated to hold an error code (between 0 and -MAX_ERRNO). The upper bits
 * are used as a counter. This is done with atomics instead of locking so that
 * these functions can be called from any context.
 *
 * The general idea is for consumers to sample an errseq_t value. That value
 * can later be used to tell whether any new errors have occurred since that
 * sampling was done.
 *
 * Note that there is a risk of collisions if new errors are being recorded
 * frequently, since we have so few bits to use as a counter.
 *
 * To mitigate this, one bit is used as a flag to tell whether the value has
 * been sampled since a new value was recorded. That allows us to avoid bumping
 * the counter if no one has sampled it since the last time an error was
 * recorded.
 *
 * A new errseq_t should always be zeroed out.  An errseq_t value of all zeroes
 * is the special (but common) case where there has never been an error. An all
 * zero value thus serves as the "epoch" if one wishes to know whether there
 * has ever been an error set since it was first initialized.
 */

/* The low bits are designated for error code (max of MAX_ERRNO) */
#define ERRSEQ_SHIFT                ilog2(MAX_ERRNO + 1)

/* This bit is used as a flag to indicate whether the value has been seen */
#define ERRSEQ_SEEN                (1 << ERRSEQ_SHIFT)

/* The lowest bit of the counter */
#define ERRSEQ_CTR_INC                (1 << (ERRSEQ_SHIFT + 1))

/**
 * errseq_set - set an errseq_t for later reporting
 * @eseq: errseq_t field that should be set
 * @err: error to set (must be between -1 and -MAX_ERRNO)
 *
 * This function sets the error in @eseq, and increments the sequence counter
 * if the last sequence was sampled at some point in the past.
 *
 * Any error set will always overwrite an existing error.
 *
 * Return: The previous value, primarily for debugging purposes. The
 * return value should not be used as a previously sampled value in later
 * calls as it will not have the SEEN flag set.
 */
errseq_t errseq_set(errseq_t *eseq, int err)
{
        errseq_t cur, old;

        /* MAX_ERRNO must be able to serve as a mask */
        BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1);

        /*
         * Ensure the error code actually fits where we want it to go. If it
         * doesn't then just throw a warning and don't record anything. We
         * also don't accept zero here as that would effectively clear a
         * previous error.
         *
         * The current value is sampled first so that it can be returned
         * unchanged on the rejection path below.
         */
        old = READ_ONCE(*eseq);

        /*
         * The unsigned cast makes a positive @err fail the range check too:
         * -err is then negative and converts to a value above MAX_ERRNO.
         */
        if (WARN(unlikely(err == 0 || (unsigned int)-err > MAX_ERRNO),
                                "err = %d\n", err))
                return old;

        /* Lockless update: recompute and retry until the value settles. */
        for (;;) {
                errseq_t new;

                /*
                 * Clear out error bits and set new error. The SEEN flag is
                 * dropped as well, so the stored value reads as unsampled.
                 */
                new = (old & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err;

                /* Only increment if someone has looked at it */
                if (old & ERRSEQ_SEEN)
                        new += ERRSEQ_CTR_INC;

                /* If there would be no change, then call it done */
                if (new == old) {
                        cur = new;
                        break;
                }

                /* Try to swap the new value into place */
                cur = cmpxchg(eseq, old, new);

                /*
                 * Call it success if we did the swap or someone else beat us
                 * to it for the same value.
                 */
                if (likely(cur == old || cur == new))
                        break;

                /* Raced with an update, try again */
                old = cur;
        }
        return cur;
}
EXPORT_SYMBOL(errseq_set);

/**
 * errseq_sample() - Grab current errseq_t value.
 * @eseq: Pointer to errseq_t to be sampled.
 *
 * This function allows callers to initialise their errseq_t variable.
 * If the error has been "seen", new callers will not see an old error.
 * If there is an unseen error in @eseq, the caller of this function will
 * see it the next time it checks for an error.
 *
 * Context: Any context.
 * Return: The current errseq value.
 */
errseq_t errseq_sample(errseq_t *eseq)
{
        const errseq_t cur = READ_ONCE(*eseq);

        /*
         * Without the SEEN flag nobody has observed the stored error yet.
         * Hand back zero in that case so that this caller is the one to
         * notice it on its next check.
         */
        return (cur & ERRSEQ_SEEN) ? cur : 0;
}
EXPORT_SYMBOL(errseq_sample);

/**
 * errseq_check() - Has an error occurred since a particular sample point?
 * @eseq: Pointer to errseq_t value to be checked.
 * @since: Previously-sampled errseq_t from which to check.
 *
 * Grab the value that eseq points to, and see if it has changed @since
 * the given value was sampled. The @since value is not advanced, so there
 * is no need to mark the value as seen.
 *
 * Return: The latest error set in the errseq_t or 0 if it hasn't changed.
 */
int errseq_check(errseq_t *eseq, errseq_t since)
{
        errseq_t latest = READ_ONCE(*eseq);

        /* Unchanged since the sample point: nothing new to report. */
        if (likely(latest == since))
                return 0;

        /* Keep only the error-code bits and hand them back negated. */
        latest &= MAX_ERRNO;
        return -latest;
}
EXPORT_SYMBOL(errseq_check);

/**
 * errseq_check_and_advance() - Check an errseq_t and advance to current value.
 * @eseq: Pointer to value being checked and reported.
 * @since: Pointer to previously-sampled errseq_t to check against and advance.
 *
 * Grab the eseq value, and see whether it matches the value that @since
 * points to. If it does, then just return 0.
 *
 * If it doesn't, then the value has changed. Set the "seen" flag, and try to
 * swap it into place as the new eseq value. Then, set that value as the new
 * "since" value, and return whatever the error portion is set to.
 *
 * Note that no locking is provided here for concurrent updates to the "since"
 * value. The caller must provide that if necessary. Because of this, callers
 * may want to do a lockless errseq_check before taking the lock and calling
 * this.
 *
 * Return: Negative errno if one has been stored, or 0 if no new error has
 * occurred.
 */
int errseq_check_and_advance(errseq_t *eseq, errseq_t *since)
{
        errseq_t cur, seen;

        /*
         * Callers normally go through the inline wrapper first, so the
         * common no-error case is settled without taking whatever lock
         * protects the "since" value.
         */
        cur = READ_ONCE(*eseq);
        if (cur == *since)
                return 0;

        /*
         * The value moved.  Mark it as seen and try to publish that.
         *
         * The result of the swap is deliberately ignored: if it fails,
         * either a writer changed the value (bumped the counter or stored a
         * new error) or another reader set the "seen" flag first.  Both are
         * fine; "since" is advanced and the error reported from what was
         * read here.
         */
        seen = cur | ERRSEQ_SEEN;
        if (seen != cur)
                cmpxchg(eseq, cur, seen);

        *since = seen;
        return -(seen & MAX_ERRNO);
}
EXPORT_SYMBOL(errseq_check_and_advance);






























































































































































































































































































































































































































































































































































































































































































    1 



    1 




    1 






























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Mark Evans, <evansmp@uhura.aston.ac.uk>
 *                Corey Minyard <wf-rch!minyard@relay.EU.net>
 *                Florian La Roche, <flla@stud.uni-sb.de>
 *                Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *                Linus Torvalds, <torvalds@cs.helsinki.fi>
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *                Matthew Dillon, <dillon@apollo.west.oic.com>
 *                Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *                Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <linux/module.h>
#include <linux/gfp.h>
#include <net/tcp.h>
#include <net/rstreason.h>

/*
 * Limit the retransmission timer so that it does not fire later than the
 * socket's user timeout.  Without a user timeout icsk_rto is returned as is.
 */
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        const struct tcp_sock *tp = tcp_sk(sk);
        const u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
        s32 remaining;
        u32 elapsed;

        if (user_timeout == 0)
                return icsk->icsk_rto;

        /* Time since retrans_stamp, brought down to msec for usec clocks. */
        elapsed = tcp_time_stamp_ts(tp) - tp->retrans_stamp;
        if (tp->tcp_usec_ts)
                elapsed /= USEC_PER_MSEC;

        remaining = user_timeout - elapsed;
        if (remaining > 0)
                return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining));

        return 1; /* user timeout has passed; fire ASAP */
}

/*
 * Limit the zero-window probe delay @when (in jiffies) so that the probe
 * timer does not fire later than the socket's user timeout, measured from
 * icsk_probes_tstamp.
 *
 * Returns @when unchanged when no user timeout is set or probing has not
 * started (icsk_probes_tstamp is zero).  Otherwise returns the smaller of
 * @when and the time left until the user timeout, where the time left is
 * never taken to be less than TCP_TIMEOUT_MIN.
 */
u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        u32 remaining, timeout, user_timeout;
        s32 elapsed;

        user_timeout = READ_ONCE(icsk->icsk_user_timeout);
        if (!user_timeout || !icsk->icsk_probes_tstamp)
                return when;

        elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp;
        if (unlikely(elapsed < 0))
                elapsed = 0;

        /*
         * Once the user timeout has already passed, an unsigned subtraction
         * would wrap to a huge value and leave @when unclamped.  Treat that
         * case as "no time left" so the timer fires as soon as allowed.
         */
        timeout = msecs_to_jiffies(user_timeout);
        if ((u32)elapsed >= timeout)
                remaining = TCP_TIMEOUT_MIN;
        else
                remaining = max_t(u32, timeout - elapsed, TCP_TIMEOUT_MIN);

        return min_t(u32, remaining, when);
}

/**
 *  tcp_write_err() - close socket and save error info
 *  @sk:  The socket the error has appeared on.
 *
 *  Returns: Nothing (void)
 */

static void tcp_write_err(struct sock *sk)
{
        int err = READ_ONCE(sk->sk_err_soft);

        /* No soft error recorded: report a plain timeout instead. */
        if (!err)
                err = ETIMEDOUT;

        tcp_done_with_error(sk, err);
        __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
}

/**
 *  tcp_out_of_resources() - Close socket if out of resources
 *  @sk:        pointer to current socket
 *  @do_reset:  send a last packet with reset flag
 *
 *  Do not allow orphaned sockets to eat all our resources.
 *  This is direct violation of TCP specs, but it is required
 *  to prevent DoS attacks. It is called when a retransmission timeout
 *  or zero probe timeout occurs on orphaned socket.
 *
 *  Also close if our net namespace is exiting; in that case there is no
 *  hope of ever communicating again since all netns interfaces are already
 *  down (or about to be down), and we need to release our dst references,
 *  which have been moved to the netns loopback interface, so the namespace
 *  can finish exiting.  This condition is only possible if we are a kernel
 *  socket, as those do not hold references to the namespace.
 *
 *  Criteria is still not confirmed experimentally and may change.
 *  We kill the socket, if:
 *  1. If number of orphaned sockets exceeds an administratively configured
 *     limit.
 *  2. If we have strong memory pressure.
 *  3. If our net namespace is exiting.
 */
static int tcp_out_of_resources(struct sock *sk, bool do_reset)
{
        struct tcp_sock *tp = tcp_sk(sk);
        /* Penalty level handed to tcp_check_oom(); each test below may raise it. */
        int shift = 0;

        /* If peer does not open window for long time, or did not transmit
         * anything for long time, penalize it. */
        if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
                shift++;

        /* If some dubious ICMP arrived, penalize even more. */
        if (READ_ONCE(sk->sk_err_soft))
                shift++;

        if (tcp_check_oom(sk, shift)) {
                /* Catch exceptional cases, when connection requires reset.
                 *      1. Last segment was sent recently. */
                if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
                    /*  2. Window is closed. */
                    (!tp->snd_wnd && !tp->packets_out))
                        do_reset = true;
                if (do_reset)
                        tcp_send_active_reset(sk, GFP_ATOMIC,
                                              SK_RST_REASON_NOT_SPECIFIED);
                tcp_done(sk);
                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
                /* Socket has been closed; tell the caller to stop using it. */
                return 1;
        }

        if (!check_net(sock_net(sk))) {
                /* Not possible to send reset; just close */
                tcp_done(sk);
                return 1;
        }

        /* Socket left untouched. */
        return 0;
}

/**
 *  tcp_orphan_retries() - Returns maximal number of retries on an orphaned socket
 *  @sk:    Pointer to the current socket.
 *  @alive: bool, socket alive state
 */
static int tcp_orphan_retries(struct sock *sk, bool alive)
{
        /* Administrator-configured limit; may be zero. */
        int limit = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_orphan_retries);
        const bool soft_err = READ_ONCE(sk->sk_err_soft);

        /* An ICMP told us something is wrong and the socket is not alive. */
        if (soft_err && !alive)
                limit = 0;

        /* A socket that sent something recently still gets a safe number
         * of retries: 8 corresponds to >100 seconds with minimal RTO of
         * 200msec. */
        if (alive && limit == 0)
                limit = 8;

        return limit;
}

/*
 * Black hole detection step.  The first call enables MTU probing on the
 * socket; each later call halves the MSS derived from the current
 * search_low, bounds it by the relevant sysctls and stores the result back
 * as the new search_low.  The MSS is then resynchronised.  Nothing happens
 * when sysctl_tcp_mtu_probing is off.
 */
static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
{
        const struct net *net = sock_net(sk);

        if (!READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing))
                return;

        if (icsk->icsk_mtup.enabled) {
                int mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;

                /* Cap at the base MSS, then apply both lower bounds. */
                mss = min(READ_ONCE(net->ipv4.sysctl_tcp_base_mss), mss);
                mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_mtu_probe_floor));
                mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_min_snd_mss));
                icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
        } else {
                icsk->icsk_mtup.enabled = 1;
                icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
        }

        tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
}

/*
 * Model the total time, in msec, spanned by @boundary backed-off
 * retransmissions that start from @rto_base (jiffies).  The per-attempt
 * delay doubles until it would reach TCP_RTO_MAX; every attempt beyond that
 * point contributes a flat TCP_RTO_MAX.  @sk is not used.
 */
static unsigned int tcp_model_timeout(struct sock *sk,
                                      unsigned int boundary,
                                      unsigned int rto_base)
{
        /* Number of doublings before the delay saturates at TCP_RTO_MAX. */
        const unsigned int doublings = ilog2(TCP_RTO_MAX / rto_base);
        unsigned int total;

        if (boundary > doublings) {
                total = ((2 << doublings) - 1) * rto_base +
                        (boundary - doublings) * TCP_RTO_MAX;
        } else {
                total = ((2 << boundary) - 1) * rto_base;
        }

        return jiffies_to_msecs(total);
}
/**
 *  retransmits_timed_out() - returns true if this connection has timed out
 *  @sk:       The current socket
 *  @boundary: max number of retransmissions
 *  @timeout:  A custom timeout value.
 *             If set to 0 the default timeout is calculated and used.
 *             Using TCP_RTO_MIN and the number of unsuccessful retransmits.
 *
 * The default "timeout" value this function can calculate and use
 * is equivalent to the timeout of a TCP Connection
 * after "boundary" unsuccessful, exponentially backed-off
 * retransmissions with an initial RTO of TCP_RTO_MIN.
 */
static bool retransmits_timed_out(struct sock *sk,
                                  unsigned int boundary,
                                  unsigned int timeout)
{
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned int first_rtx_ts, elapsed;

        /* Nothing has been retransmitted, so nothing can have timed out. */
        if (!inet_csk(sk)->icsk_retransmits)
                return false;

        first_rtx_ts = tp->retrans_stamp;

        /* No custom timeout given: model one from @boundary. */
        if (likely(timeout == 0)) {
                unsigned int rto_base;

                if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
                        rto_base = tcp_timeout_init(sk);
                else
                        rto_base = TCP_RTO_MIN;

                timeout = tcp_model_timeout(sk, boundary, rto_base);
        }

        if (!tp->tcp_usec_ts)
                return (s32)(tcp_time_stamp_ts(tp) - first_rtx_ts - timeout) >= 0;

        /* The elapsed time may be off by up to a jiffy due to timer
         * granularity, so one jiffy's worth of usec is added.
         */
        elapsed = tp->tcp_mstamp - first_rtx_ts + jiffies_to_usecs(1);
        return (s32)(elapsed - timeout * USEC_PER_MSEC) >= 0;
}

/* A write timeout has occurred. Process the after effects. */
/*
 * Decide whether the connection has exhausted its retransmission budget.
 *
 * Returns 1 when the socket has been closed here (either by
 * tcp_out_of_resources() or, once expired, by tcp_write_err()); returns 0
 * when the caller may carry on with the retransmission.
 */
static int tcp_write_timeout(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        /* do_reset is only assigned and used in the SOCK_DEAD branch below. */
        bool expired = false, do_reset;
        int retry_until, max_retransmits;

        if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
                /* Handshake states: the limit is a plain retransmit count. */
                if (icsk->icsk_retransmits)
                        __dst_negative_advice(sk);
                /* Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */
                retry_until = READ_ONCE(icsk->icsk_syn_retries) ? :
                        READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);

                max_retransmits = retry_until;
                /* SYN_SENT is additionally granted the linear-timeout SYNs. */
                if (sk->sk_state == TCP_SYN_SENT)
                        max_retransmits += READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts);

                expired = icsk->icsk_retransmits >= max_retransmits;
        } else {
                /* Past the retries1 threshold: probe the path before giving up. */
                if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) {
                        /* Black hole detection */
                        tcp_mtu_probing(icsk, sk);

                        __dst_negative_advice(sk);
                }

                retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2);
                if (sock_flag(sk, SOCK_DEAD)) {
                        /* Orphaned socket: use the orphan retry limit instead. */
                        const bool alive = icsk->icsk_rto < TCP_RTO_MAX;

                        retry_until = tcp_orphan_retries(sk, alive);
                        do_reset = alive ||
                                !retransmits_timed_out(sk, retry_until, 0);

                        if (tcp_out_of_resources(sk, do_reset))
                                return 1;
                }
        }
        /*
         * Not expired by count: apply the time-based check, which uses the
         * user timeout when one is set and a modelled timeout otherwise.
         */
        if (!expired)
                expired = retransmits_timed_out(sk, retry_until,
                                                READ_ONCE(icsk->icsk_user_timeout));
        tcp_fastopen_active_detect_blackhole(sk, expired);

        /* Report the RTO event to BPF when the socket asked for it. */
        if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
                tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
                                  icsk->icsk_retransmits,
                                  icsk->icsk_rto, (int)expired);

        if (expired) {
                /* Has it gone just too far? */
                tcp_write_err(sk);
                return 1;
        }

        /* Still alive: count the event if the tx hash was re-picked. */
        if (sk_rethink_txhash(sk)) {
                tp->timeout_rehash++;
                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH);
        }

        return 0;
}

/*
 * Delayed-ACK timer work.  Called with BH disabled.
 *
 * Does nothing for sockets in CLOSE or LISTEN state.  Otherwise it sends a
 * pending compressed SACK, re-arms the timer if it fired early, or sends the
 * delayed ACK that was scheduled.
 */
void tcp_delack_timer_handler(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);

        if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
                return;

        /* Handling the sack compression case */
        if (tp->compressed_ack) {
                tcp_mstamp_refresh(tp);
                tcp_sack_compress_send_ack(sk);
                return;
        }

        /* No delayed-ACK timer pending: nothing to do. */
        if (!(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
                return;

        /* The deadline is still in the future: re-arm for it and leave. */
        if (time_after(icsk->icsk_ack.timeout, jiffies)) {
                sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
                return;
        }
        icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;

        if (inet_csk_ack_scheduled(sk)) {
                if (!inet_csk_in_pingpong_mode(sk)) {
                        /* Delayed ACK missed: inflate ATO (doubled, capped at icsk_rto). */
                        icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1, icsk->icsk_rto);
                } else {
                        /* Delayed ACK missed: leave pingpong mode and
                         * deflate ATO.
                         */
                        inet_csk_exit_pingpong_mode(sk);
                        icsk->icsk_ack.ato      = TCP_ATO_MIN;
                }
                tcp_mstamp_refresh(tp);
                tcp_send_ack(sk);
                __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
        }
}


/**
 *  tcp_delack_timer() - The TCP delayed ACK timeout handler
 *  @t:  Pointer to the timer. (gets casted to struct sock *)
 *
 *  This function gets (indirectly) called when the kernel timer for a TCP packet
 *  of this socket expires. Calls tcp_delack_timer_handler() to do the actual work.
 *
 *  Returns: Nothing (void)
 */
static void tcp_delack_timer(struct timer_list *t)
{
        /* Recover the connection sock that embeds this timer, then its sock. */
        struct inet_connection_sock *icsk =
                        from_timer(icsk, t, icsk_delack_timer);
        struct sock *sk = &icsk->icsk_inet.sk;

        bh_lock_sock(sk);
        if (!sock_owned_by_user(sk)) {
                /* Socket is not owned by a user context: do the work now. */
                tcp_delack_timer_handler(sk);
        } else {
                __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
                /*
                 * Delegate our work to tcp_release_cb().  A reference is
                 * taken only if the deferred flag was not already set.
                 */
                if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
                        sock_hold(sk);
        }
        bh_unlock_sock(sk);
        /* NOTE(review): presumably drops the reference held for the armed timer -- confirm. */
        sock_put(sk);
}

/* Window-probe timer work: either send another zero-window probe via
 * tcp_send_probe0() or give up on the connection via tcp_write_err().
 * Called from tcp_write_timer_handler() for ICSK_TIME_PROBE0.
 */
static void tcp_probe_timer(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct sk_buff *skb = tcp_send_head(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int max_probes;

        /* Nothing to probe for: packets are already outstanding or there
         * is no segment at the send head. Clear the probe bookkeeping.
         */
        if (tp->packets_out || !skb) {
                icsk->icsk_probes_out = 0;
                icsk->icsk_probes_tstamp = 0;
                return;
        }

        /* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as
         * long as the receiver continues to respond to probes. We support
         * this by default and reset icsk_probes_out with incoming ACKs. But
         * if the socket is orphaned or the user specifies TCP_USER_TIMEOUT,
         * we kill the socket when the retry count and the time exceed the
         * corresponding system limit. We also implement a similar policy
         * when we use RTO to probe the window in tcp_retransmit_timer().
         */
        if (!icsk->icsk_probes_tstamp) {
                /* First probe of this episode: remember when probing began. */
                icsk->icsk_probes_tstamp = tcp_jiffies32;
        } else {
                u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);

                /* A non-zero user timeout (converted from msecs to jiffies)
                 * bounds how long we keep probing since the first probe.
                 */
                if (user_timeout &&
                    (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
                     msecs_to_jiffies(user_timeout))
                        goto abort;
        }
        max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
        if (sock_flag(sk, SOCK_DEAD)) {
                /* "alive" means the backed-off RTO is still below TCP_RTO_MAX. */
                const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;

                /* Dead (orphaned) sockets use the orphan retry limit instead
                 * of the tcp_retries2 sysctl.
                 */
                max_probes = tcp_orphan_retries(sk, alive);
                if (!alive && icsk->icsk_backoff >= max_probes)
                        goto abort;
                if (tcp_out_of_resources(sk, true))
                        return;
        }

        /* Note: the "abort" label sits inside the if body so that the early
         * give-up paths above share the tcp_write_err() call.
         */
        if (icsk->icsk_probes_out >= max_probes) {
abort:                tcp_write_err(sk);
        } else {
                /* Only send another probe if we didn't close things up. */
                tcp_send_probe0(sk);
        }
}

/* Account for one retransmission timeout on @sk.
 *
 * When no retransmit was pending yet (icsk_retransmits is zero) this also
 * counts a new RTO recovery and stamps its start time in rto_stamp. The
 * per-connection retransmit counter and the total RTO counter are bumped
 * on every call.
 */
static void tcp_update_rto_stats(struct sock *sk)
{
        struct tcp_sock *tcp = tcp_sk(sk);
        struct inet_connection_sock *conn = inet_csk(sk);

        if (conn->icsk_retransmits == 0) {
                tcp->total_rto_recoveries += 1;
                tcp->rto_stamp = tcp_time_stamp_ms(tcp);
        }

        conn->icsk_retransmits += 1;
        tcp->total_rto += 1;
}

/*
 *        Timer for Fast Open socket to retransmit SYNACK. Note that the
 *        sk here is the child socket, not the parent (listener) socket.
 *
 *        @sk:  the Fast Open child socket whose retransmit timer fired
 *        @req: the request sock still attached to @sk (tp->fastopen_rsk)
 *
 *        Gives up with tcp_write_err() once req->num_timeout reaches the
 *        retry limit; otherwise retransmits the SYN-ACK, updates the RTO
 *        statistics and re-arms the retransmit timer.
 */
static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int max_retries;

        /* Let the request's ops account for the timeout first. */
        req->rsk_ops->syn_ack_timeout(req);

        /* Add one more retry for fastopen.
         * Paired with WRITE_ONCE() in tcp_sock_set_syncnt()
         *
         * Note the precedence: the "+ 1" belongs to the sysctl fallback
         * only; a non-zero per-socket icsk_syn_retries is used as is.
         */
        max_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
                READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1;

        if (req->num_timeout >= max_retries) {
                tcp_write_err(sk);
                return;
        }
        /* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */
        if (icsk->icsk_retransmits == 1)
                tcp_enter_loss(sk);
        /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
         * returned from rtx_syn_ack() to make it more persistent like
         * regular retransmit because if the child socket has been accepted
         * it's not good to give up too easily.
         */
        inet_rtx_syn_ack(sk, req);
        req->num_timeout++;
        tcp_update_rto_stats(sk);
        /* Record when retransmission started, unless already recorded. */
        if (!tp->retrans_stamp)
                tp->retrans_stamp = tcp_time_stamp_ts(tp);
        /* Exponential backoff: the base timeout is shifted by the number of
         * timeouts seen so far (already incremented above), with
         * TCP_RTO_MAX passed as the upper bound.
         */
        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                          req->timeout << req->num_timeout, TCP_RTO_MAX);
}

/* Decide whether a connection that is probing a zero window through RTO
 * retransmits has timed out.
 *
 * @sk:        socket being probed
 * @skb:       head of the retransmit queue (not used by this check)
 * @rtx_delta: how long retransmission has been going on, in msecs
 *
 * Returns true only when both the time since the peer was last heard from
 * and the retransmission duration exceed twice TCP_RTO_MAX.
 */
static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
                                     const struct sk_buff *skb,
                                     u32 rtx_delta)
{
        const int limit = TCP_RTO_MAX * 2;
        /* The timer interrupt may have been delayed by a jiffy or more and
         * tp->rcv_tstamp may have been refreshed very recently, so this
         * signed difference can legitimately be negative.
         */
        const s32 since_rcv = inet_csk(sk)->icsk_timeout - tcp_sk(sk)->rcv_tstamp;

        return since_rcv > limit && msecs_to_jiffies(rtx_delta) > limit;
}

/**
 *  tcp_retransmit_timer() - The TCP retransmit timeout handler
 *  @sk:  Pointer to the current socket.
 *
 *  This function gets called when the kernel timer for a TCP packet
 *  of this socket expires (see the ICSK_TIME_RETRANS case in
 *  tcp_write_timer_handler()).
 *
 *  It handles retransmission, timer adjustment and other necessary measures:
 *  a pending Fast Open request only gets its SYN-ACK retransmitted; a
 *  zero send window on a live socket turns the retransmit into a window
 *  probe; otherwise the head of the retransmit queue is resent, the RTO is
 *  backed off and the retransmit timer is re-armed.
 *
 *  Returns: Nothing (void)
 */
void tcp_retransmit_timer(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct net *net = sock_net(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct request_sock *req;
        struct sk_buff *skb;

        req = rcu_dereference_protected(tp->fastopen_rsk,
                                        lockdep_sock_is_held(sk));
        if (req) {
                WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
                             sk->sk_state != TCP_FIN_WAIT1);
                tcp_fastopen_synack_timer(sk, req);
                /* Before we receive ACK to our SYN-ACK don't retransmit
                 * anything else (e.g., data or FIN segments).
                 */
                return;
        }

        /* Nothing outstanding, so nothing to retransmit. */
        if (!tp->packets_out)
                return;

        /* packets_out is non-zero, so an empty retransmit queue would be
         * unexpected; warn once and bail out.
         */
        skb = tcp_rtx_queue_head(sk);
        if (WARN_ON_ONCE(!skb))
                return;

        tp->tlp_high_seq = 0;

        if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
            !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
                /* Receiver dastardly shrinks window. Our retransmits
                 * become zero probes, but we should not timeout this
                 * connection. If the socket is an orphan, time it out,
                 * we cannot allow such beasts to hang infinitely.
                 */
                struct inet_sock *inet = inet_sk(sk);
                u32 rtx_delta;

                /* Time spent retransmitting so far, measured from
                 * retrans_stamp when set, else from the head skb timestamp.
                 * Scaled to msecs below when usec timestamps are in use.
                 */
                rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: 
                                tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
                if (tp->tcp_usec_ts)
                        rtx_delta /= USEC_PER_MSEC;

                if (sk->sk_family == AF_INET) {
                        net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
                                &inet->inet_daddr, ntohs(inet->inet_dport),
                                inet->inet_num, tp->snd_una, tp->snd_nxt,
                                jiffies_to_msecs(jiffies - tp->rcv_tstamp),
                                rtx_delta);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (sk->sk_family == AF_INET6) {
                        net_dbg_ratelimited("Probing zero-window on %pI6:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
                                &sk->sk_v6_daddr, ntohs(inet->inet_dport),
                                inet->inet_num, tp->snd_una, tp->snd_nxt,
                                jiffies_to_msecs(jiffies - tp->rcv_tstamp),
                                rtx_delta);
                }
#endif
                /* Give up only when the probe0 timeout check says so. */
                if (tcp_rtx_probe0_timed_out(sk, skb, rtx_delta)) {
                        tcp_write_err(sk);
                        goto out;
                }
                tcp_enter_loss(sk);
                tcp_retransmit_skb(sk, skb, 1);
                __sk_dst_reset(sk);
                goto out_reset_timer;
        }

        __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
        if (tcp_write_timeout(sk))
                goto out;

        /* First timeout of this episode: pick the MIB counter matching the
         * congestion-avoidance state we were in when the timer fired.
         * mib_idx stays 0 (nothing counted) when no case applies.
         */
        if (icsk->icsk_retransmits == 0) {
                int mib_idx = 0;

                if (icsk->icsk_ca_state == TCP_CA_Recovery) {
                        if (tcp_is_sack(tp))
                                mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
                        else
                                mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
                } else if (icsk->icsk_ca_state == TCP_CA_Loss) {
                        mib_idx = LINUX_MIB_TCPLOSSFAILURES;
                } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
                           tp->sacked_out) {
                        if (tcp_is_sack(tp))
                                mib_idx = LINUX_MIB_TCPSACKFAILURES;
                        else
                                mib_idx = LINUX_MIB_TCPRENOFAILURES;
                }
                if (mib_idx)
                        __NET_INC_STATS(sock_net(sk), mib_idx);
        }

        tcp_enter_loss(sk);

        tcp_update_rto_stats(sk);
        if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
                /* Retransmission failed because of local congestion,
                 * Let senders fight for local resources conservatively.
                 */
                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                          TCP_RESOURCE_PROBE_INTERVAL,
                                          TCP_RTO_MAX);
                goto out;
        }

        /* Increase the timeout each time we retransmit.  Note that
         * we do not increase the rtt estimate.  rto is initialized
         * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
         * that doubling rto each time is the least we can get away with.
         * In KA9Q, Karn uses this for the first few times, and then
         * goes to quadratic.  netBSD doubles, but only goes up to *64,
         * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
         * defined in the protocol as the maximum possible RTT.  I guess
         * we'll have to use something other than TCP to talk to the
         * University of Mars.
         *
         * PAWS allows us longer timeouts and large windows, so once
         * implemented ftp to mars will work nicely. We will have to fix
         * the 120 second clamps though!
         */

out_reset_timer:
        /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
         * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
         * might be increased if the stream oscillates between thin and thick,
         * thus the old value might already be too high compared to the value
         * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
         * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
         * exponential backoff behaviour to avoid continuing to hammer
         * linear-timeout retransmissions into a black hole.
         */
        if (sk->sk_state == TCP_ESTABLISHED &&
            (tp->thin_lto || READ_ONCE(net->ipv4.sysctl_tcp_thin_linear_timeouts)) &&
            tcp_stream_is_thin(tp) &&
            icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
                icsk->icsk_backoff = 0;
                icsk->icsk_rto = clamp(__tcp_set_rto(tp),
                                       tcp_rto_min(sk),
                                       TCP_RTO_MAX);
        } else if (sk->sk_state != TCP_SYN_SENT ||
                   tp->total_rto >
                   READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) {
                /* Use normal (exponential) backoff unless linear timeouts are
                 * activated.
                 */
                icsk->icsk_backoff++;
                icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
        }
        /* Re-arm the retransmit timer with the (possibly backed-off) RTO,
         * limited by tcp_clamp_rto_to_user_timeout().
         */
        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                  tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX);
        /* After more than tcp_retries1 worth of timeouts, drop the cached
         * route so it gets looked up again.
         */
        if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0))
                __sk_dst_reset(sk);

out:;
}

/* Dispatch the pending retransmit-timer event of @sk.
 *
 * Called with bottom-half processing disabled.
 * Called by tcp_write_timer().
 *
 * Does nothing for CLOSE/LISTEN sockets or when no event is pending. If
 * the recorded timeout has not been reached yet, the timer is simply
 * re-armed for that time. Otherwise the handler matching icsk_pending
 * runs; for the RETRANS and PROBE0 events the pending marker is cleared
 * before their handler is called.
 */
void tcp_write_timer_handler(struct sock *sk)
{
        struct inet_connection_sock *csk = inet_csk(sk);
        int pending;

        if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
                return;
        if (!csk->icsk_pending)
                return;

        /* Fired early: wait for the real deadline. */
        if (time_after(csk->icsk_timeout, jiffies)) {
                sk_reset_timer(sk, &csk->icsk_retransmit_timer, csk->icsk_timeout);
                return;
        }

        tcp_mstamp_refresh(tcp_sk(sk));

        pending = csk->icsk_pending;
        if (pending == ICSK_TIME_REO_TIMEOUT) {
                tcp_rack_reo_timeout(sk);
        } else if (pending == ICSK_TIME_LOSS_PROBE) {
                tcp_send_loss_probe(sk);
        } else if (pending == ICSK_TIME_RETRANS) {
                csk->icsk_pending = 0;
                tcp_retransmit_timer(sk);
        } else if (pending == ICSK_TIME_PROBE0) {
                csk->icsk_pending = 0;
                tcp_probe_timer(sk);
        }
}

/* Timer callback for the retransmit timer (icsk_retransmit_timer).
 *
 * Recovers the socket from the embedded timer and takes the bottom-half
 * socket lock. When no user context owns the socket the event is handled
 * immediately by tcp_write_timer_handler(); otherwise it is flagged with
 * TCP_WRITE_TIMER_DEFERRED for tcp_release_cb() to pick up.
 */
static void tcp_write_timer(struct timer_list *t)
{
        struct inet_connection_sock *csk = from_timer(csk, t, icsk_retransmit_timer);
        struct sock *sk = &csk->icsk_inet.sk;

        bh_lock_sock(sk);
        if (sock_owned_by_user(sk)) {
                /* Delegate our work to tcp_release_cb(); take an extra
                 * reference only if we are the one setting the flag.
                 */
                if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
                        sock_hold(sk);
        } else {
                tcp_write_timer_handler(sk);
        }
        bh_unlock_sock(sk);

        /* NOTE(review): presumably drops the reference held on behalf of
         * the armed timer - confirm against sk_reset_timer().
         */
        sock_put(sk);
}

/* Count a SYN-ACK timeout for @req: bumps LINUX_MIB_TCPTIMEOUTS in the
 * network namespace the request belongs to.
 */
void tcp_syn_ack_timeout(const struct request_sock *req)
{
        __NET_INC_STATS(read_pnet(&inet_rsk(req)->ireq_net),
                        LINUX_MIB_TCPTIMEOUTS);
}
EXPORT_SYMBOL(tcp_syn_ack_timeout);

/* Start or stop the keepalive timer of @sk.
 *
 * @sk:  socket to configure
 * @val: non-zero to enable keepalive, zero to disable it
 *
 * Sockets in CLOSE or LISTEN state are left alone. Disabling deletes the
 * keepalive timer. Enabling arms it only when SOCK_KEEPOPEN is not set
 * on the socket yet.
 */
void tcp_set_keepalive(struct sock *sk, int val)
{
        if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
                return;

        if (!val) {
                inet_csk_delete_keepalive_timer(sk);
                return;
        }

        if (!sock_flag(sk, SOCK_KEEPOPEN))
                inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
}
EXPORT_SYMBOL_GPL(tcp_set_keepalive);


/* Timer callback for the per-socket sk_timer used for keepalive.
 *
 * Besides sending keepalive probes it also handles dead sockets sitting
 * in FIN_WAIT2: those are either moved to time-wait or reset and
 * finished with tcp_done(). Every exit path goes through the "out"
 * label, which releases the bottom-half socket lock and drops a socket
 * reference.
 */
static void tcp_keepalive_timer (struct timer_list *t)
{
        struct sock *sk = from_timer(sk, t, sk_timer);
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        u32 elapsed;

        /* Only process if socket is not in use. */
        bh_lock_sock(sk);
        if (sock_owned_by_user(sk)) {
                /* Try again later (HZ/20 jiffies from now). */
                inet_csk_reset_keepalive_timer (sk, HZ/20);
                goto out;
        }

        if (sk->sk_state == TCP_LISTEN) {
                pr_err("Hmm... keepalive on a LISTEN ???\n");
                goto out;
        }

        tcp_mstamp_refresh(tp);
        if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
                if (READ_ONCE(tp->linger2) >= 0) {
                        /* Part of the FIN timeout that exceeds
                         * TCP_TIMEWAIT_LEN; if any is left, spend it in
                         * time-wait instead of resetting.
                         */
                        const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;

                        if (tmo > 0) {
                                tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
                                goto out;
                        }
                }
                tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED);
                goto death;
        }

        /* Keepalive not enabled, or the state has nothing to keep alive. */
        if (!sock_flag(sk, SOCK_KEEPOPEN) ||
            ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)))
                goto out;

        /* Default re-arm delay, used by the "resched" jump just below. */
        elapsed = keepalive_time_when(tp);

        /* It is alive without keepalive 8) */
        if (tp->packets_out || !tcp_write_queue_empty(sk))
                goto resched;

        /* From here on "elapsed" first holds keepalive_time_elapsed() and
         * is then overwritten with the delay until the next timer run.
         */
        elapsed = keepalive_time_elapsed(tp);

        if (elapsed >= keepalive_time_when(tp)) {
                u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);

                /* If the TCP_USER_TIMEOUT option is enabled, use that
                 * to determine when to timeout instead.
                 */
                if ((user_timeout != 0 &&
                    elapsed >= msecs_to_jiffies(user_timeout) &&
                    icsk->icsk_probes_out > 0) ||
                    (user_timeout == 0 &&
                    icsk->icsk_probes_out >= keepalive_probes(tp))) {
                        tcp_send_active_reset(sk, GFP_ATOMIC,
                                              SK_RST_REASON_NOT_SPECIFIED);
                        tcp_write_err(sk);
                        goto out;
                }
                /* A result <= 0 from tcp_write_wakeup() counts as a probe
                 * sent; the next run is one keepalive interval away.
                 */
                if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
                        icsk->icsk_probes_out++;
                        elapsed = keepalive_intvl_when(tp);
                } else {
                        /* If keepalive was lost due to local congestion,
                         * try harder.
                         */
                        elapsed = TCP_RESOURCE_PROBE_INTERVAL;
                }
        } else {
                /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
                elapsed = keepalive_time_when(tp) - elapsed;
        }

resched:
        inet_csk_reset_keepalive_timer (sk, elapsed);
        goto out;

death:
        tcp_done(sk);

out:
        bh_unlock_sock(sk);
        sock_put(sk);
}

/* hrtimer callback for the compressed-ACK timer embedded in struct tcp_sock.
 *
 * If no user context owns the socket and compressed ACKs are pending, one
 * ACK is sent now. If the socket is owned, the work is deferred through
 * TCP_DELACK_TIMER_DEFERRED. The timer is never restarted from here.
 *
 * Returns: HRTIMER_NORESTART, always.
 */
static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
{
        struct tcp_sock *tp = container_of(timer, struct tcp_sock, compressed_ack_timer);
        struct sock *sk = (struct sock *)tp;

        bh_lock_sock(sk);
        if (sock_owned_by_user(sk)) {
                /* Take an extra reference only if we set the flag ourselves. */
                if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
                                      &sk->sk_tsq_flags))
                        sock_hold(sk);
        } else if (tp->compressed_ack) {
                /* Since we have to send one ack finally,
                 * subtract one from tp->compressed_ack to keep
                 * LINUX_MIB_TCPACKCOMPRESSED accurate.
                 */
                tp->compressed_ack--;
                tcp_send_ack(sk);
        }
        bh_unlock_sock(sk);

        sock_put(sk);

        return HRTIMER_NORESTART;
}

void tcp_init_xmit_timers(struct sock *sk)
{
        inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
                                  &tcp_keepalive_timer);
        hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
                     HRTIMER_MODE_ABS_PINNED_SOFT);
        tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;

        hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC,
                     HRTIMER_MODE_REL_PINNED_SOFT);
        tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick;
}


















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 








































































    1 





























    4 



































































































































































































    3 

    3 
























    6 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
// SPDX-License-Identifier: GPL-2.0

// Generated by scripts/atomic/gen-atomic-fallback.sh
// DO NOT MODIFY THIS FILE DIRECTLY

#ifndef _LINUX_ATOMIC_FALLBACK_H
#define _LINUX_ATOMIC_FALLBACK_H

#include <linux/compiler.h>

/*
 * raw_xchg() - select the implementation of the unsuffixed xchg operation.
 *
 * Selection order:
 *   1. arch_xchg, when the architecture defines it;
 *   2. else, when arch_xchg_relaxed is defined, __atomic_op_fence() is
 *      handed the base name arch_xchg. The helper is defined outside this
 *      file; presumably it appends _relaxed and brackets the call with
 *      full fences -- TODO confirm;
 *   3. else a call to raw_xchg_not_implemented(), which is only declared
 *      here. NOTE(review): presumably never defined, so that any use fails
 *      at link time -- confirm.
 */
#if defined(arch_xchg)
#define raw_xchg arch_xchg
#elif defined(arch_xchg_relaxed)
#define raw_xchg(...) \
        __atomic_op_fence(arch_xchg, __VA_ARGS__)
#else
extern void raw_xchg_not_implemented(void);
#define raw_xchg(...) raw_xchg_not_implemented()
#endif

/*
 * raw_xchg_acquire() - select the implementation of the _acquire xchg variant.
 *
 * Selection order:
 *   1. arch_xchg_acquire, when defined;
 *   2. else, when arch_xchg_relaxed is defined, __atomic_op_acquire() is
 *      handed the base name arch_xchg (helper defined outside this file;
 *      presumably the relaxed op followed by an acquire fence -- TODO
 *      confirm);
 *   3. else the unsuffixed arch_xchg, when defined;
 *   4. else a call to raw_xchg_acquire_not_implemented(), which is only
 *      declared here (presumably never defined, so use fails at link
 *      time -- confirm).
 */
#if defined(arch_xchg_acquire)
#define raw_xchg_acquire arch_xchg_acquire
#elif defined(arch_xchg_relaxed)
#define raw_xchg_acquire(...) \
        __atomic_op_acquire(arch_xchg, __VA_ARGS__)
#elif defined(arch_xchg)
#define raw_xchg_acquire arch_xchg
#else
extern void raw_xchg_acquire_not_implemented(void);
#define raw_xchg_acquire(...) raw_xchg_acquire_not_implemented()
#endif

/*
 * raw_xchg_release() - select the implementation of the _release xchg variant.
 *
 * Selection order:
 *   1. arch_xchg_release, when defined;
 *   2. else, when arch_xchg_relaxed is defined, __atomic_op_release() is
 *      handed the base name arch_xchg (helper defined outside this file;
 *      presumably a release fence followed by the relaxed op -- TODO
 *      confirm);
 *   3. else the unsuffixed arch_xchg, when defined;
 *   4. else a call to raw_xchg_release_not_implemented(), which is only
 *      declared here (presumably never defined, so use fails at link
 *      time -- confirm).
 */
#if defined(arch_xchg_release)
#define raw_xchg_release arch_xchg_release
#elif defined(arch_xchg_relaxed)
#define raw_xchg_release(...) \
        __atomic_op_release(arch_xchg, __VA_ARGS__)
#elif defined(arch_xchg)
#define raw_xchg_release arch_xchg
#else
extern void raw_xchg_release_not_implemented(void);
#define raw_xchg_release(...) raw_xchg_release_not_implemented()
#endif

/*
 * raw_xchg_relaxed() - select the implementation of the _relaxed xchg variant.
 *
 * Uses arch_xchg_relaxed when defined, else the unsuffixed arch_xchg, else a
 * call to raw_xchg_relaxed_not_implemented(), which is only declared here
 * (presumably never defined, so use fails at link time -- confirm).
 */
#if defined(arch_xchg_relaxed)
#define raw_xchg_relaxed arch_xchg_relaxed
#elif defined(arch_xchg)
#define raw_xchg_relaxed arch_xchg
#else
extern void raw_xchg_relaxed_not_implemented(void);
#define raw_xchg_relaxed(...) raw_xchg_relaxed_not_implemented()
#endif

/*
 * raw_cmpxchg() - select the implementation of the unsuffixed cmpxchg
 * operation.
 *
 * Selection order:
 *   1. arch_cmpxchg, when the architecture defines it;
 *   2. else, when arch_cmpxchg_relaxed is defined, __atomic_op_fence() is
 *      handed the base name arch_cmpxchg. The helper is defined outside
 *      this file; presumably it appends _relaxed and brackets the call
 *      with full fences -- TODO confirm;
 *   3. else a call to raw_cmpxchg_not_implemented(), which is only
 *      declared here. NOTE(review): presumably never defined, so that any
 *      use fails at link time -- confirm.
 */
#if defined(arch_cmpxchg)
#define raw_cmpxchg arch_cmpxchg
#elif defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg(...) \
        __atomic_op_fence(arch_cmpxchg, __VA_ARGS__)
#else
extern void raw_cmpxchg_not_implemented(void);
#define raw_cmpxchg(...) raw_cmpxchg_not_implemented()
#endif

/*
 * raw_cmpxchg_acquire() - select the implementation of the _acquire cmpxchg
 * variant.
 *
 * Selection order:
 *   1. arch_cmpxchg_acquire, when defined;
 *   2. else, when arch_cmpxchg_relaxed is defined, __atomic_op_acquire() is
 *      handed the base name arch_cmpxchg (helper defined outside this
 *      file; presumably the relaxed op followed by an acquire fence --
 *      TODO confirm);
 *   3. else the unsuffixed arch_cmpxchg, when defined;
 *   4. else a call to raw_cmpxchg_acquire_not_implemented(), which is only
 *      declared here (presumably never defined, so use fails at link
 *      time -- confirm).
 */
#if defined(arch_cmpxchg_acquire)
#define raw_cmpxchg_acquire arch_cmpxchg_acquire
#elif defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg_acquire(...) \
        __atomic_op_acquire(arch_cmpxchg, __VA_ARGS__)
#elif defined(arch_cmpxchg)
#define raw_cmpxchg_acquire arch_cmpxchg
#else
extern void raw_cmpxchg_acquire_not_implemented(void);
#define raw_cmpxchg_acquire(...) raw_cmpxchg_acquire_not_implemented()
#endif

/*
 * raw_cmpxchg_release() - select the implementation of the _release cmpxchg
 * variant.
 *
 * Selection order:
 *   1. arch_cmpxchg_release, when defined;
 *   2. else, when arch_cmpxchg_relaxed is defined, __atomic_op_release() is
 *      handed the base name arch_cmpxchg (helper defined outside this
 *      file; presumably a release fence followed by the relaxed op --
 *      TODO confirm);
 *   3. else the unsuffixed arch_cmpxchg, when defined;
 *   4. else a call to raw_cmpxchg_release_not_implemented(), which is only
 *      declared here (presumably never defined, so use fails at link
 *      time -- confirm).
 */
#if defined(arch_cmpxchg_release)
#define raw_cmpxchg_release arch_cmpxchg_release
#elif defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg_release(...) \
        __atomic_op_release(arch_cmpxchg, __VA_ARGS__)
#elif defined(arch_cmpxchg)
#define raw_cmpxchg_release arch_cmpxchg
#else
extern void raw_cmpxchg_release_not_implemented(void);
#define raw_cmpxchg_release(...) raw_cmpxchg_release_not_implemented()
#endif

/*
 * raw_cmpxchg_relaxed() - select the implementation of the _relaxed cmpxchg
 * variant.
 *
 * Uses arch_cmpxchg_relaxed when defined, else the unsuffixed arch_cmpxchg,
 * else a call to raw_cmpxchg_relaxed_not_implemented(), which is only
 * declared here (presumably never defined, so use fails at link time --
 * confirm).
 */
#if defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg_relaxed arch_cmpxchg_relaxed
#elif defined(arch_cmpxchg)
#define raw_cmpxchg_relaxed arch_cmpxchg
#else
extern void raw_cmpxchg_relaxed_not_implemented(void);
#define raw_cmpxchg_relaxed(...) raw_cmpxchg_relaxed_not_implemented()
#endif

/*
 * raw_cmpxchg64() - select the implementation of the unsuffixed cmpxchg64
 * operation.
 *
 * Selection order:
 *   1. arch_cmpxchg64, when the architecture defines it;
 *   2. else, when arch_cmpxchg64_relaxed is defined, __atomic_op_fence() is
 *      handed the base name arch_cmpxchg64. The helper is defined outside
 *      this file; presumably it appends _relaxed and brackets the call
 *      with full fences -- TODO confirm;
 *   3. else a call to raw_cmpxchg64_not_implemented(), which is only
 *      declared here. NOTE(review): presumably never defined, so that any
 *      use fails at link time -- confirm.
 */
#if defined(arch_cmpxchg64)
#define raw_cmpxchg64 arch_cmpxchg64
#elif defined(arch_cmpxchg64_relaxed)
#define raw_cmpxchg64(...) \
        __atomic_op_fence(arch_cmpxchg64, __VA_ARGS__)
#else
extern void raw_cmpxchg64_not_implemented(void);
#define raw_cmpxchg64(...) raw_cmpxchg64_not_implemented()
#endif

/*
 * raw_cmpxchg64_acquire() - select the implementation of the _acquire
 * cmpxchg64 variant.
 *
 * Selection order:
 *   1. arch_cmpxchg64_acquire, when defined;
 *   2. else, when arch_cmpxchg64_relaxed is defined, __atomic_op_acquire()
 *      is handed the base name arch_cmpxchg64 (helper defined outside this
 *      file; presumably the relaxed op followed by an acquire fence --
 *      TODO confirm);
 *   3. else the unsuffixed arch_cmpxchg64, when defined;
 *   4. else a call to raw_cmpxchg64_acquire_not_implemented(), which is
 *      only declared here (presumably never defined, so use fails at link
 *      time -- confirm).
 */
#if defined(arch_cmpxchg64_acquire)
#define raw_cmpxchg64_acquire arch_cmpxchg64_acquire
#elif defined(arch_cmpxchg64_relaxed)
#define raw_cmpxchg64_acquire(...) \
        __atomic_op_acquire(arch_cmpxchg64, __VA_ARGS__)
#elif defined(arch_cmpxchg64)
#define raw_cmpxchg64_acquire arch_cmpxchg64
#else
extern void raw_cmpxchg64_acquire_not_implemented(void);
#define raw_cmpxchg64_acquire(...) raw_cmpxchg64_acquire_not_implemented()
#endif

/*
 * raw_cmpxchg64_release() - select the implementation of the _release
 * cmpxchg64 variant.
 *
 * Selection order:
 *   1. arch_cmpxchg64_release, when defined;
 *   2. else, when arch_cmpxchg64_relaxed is defined, __atomic_op_release()
 *      is handed the base name arch_cmpxchg64 (helper defined outside this
 *      file; presumably a release fence followed by the relaxed op --
 *      TODO confirm);
 *   3. else the unsuffixed arch_cmpxchg64, when defined;
 *   4. else a call to raw_cmpxchg64_release_not_implemented(), which is
 *      only declared here (presumably never defined, so use fails at link
 *      time -- confirm).
 */
#if defined(arch_cmpxchg64_release)
#define raw_cmpxchg64_release arch_cmpxchg64_release
#elif defined(arch_cmpxchg64_relaxed)
#define raw_cmpxchg64_release(...) \
        __atomic_op_release(arch_cmpxchg64, __VA_ARGS__)
#elif defined(arch_cmpxchg64)
#define raw_cmpxchg64_release arch_cmpxchg64
#else
extern void raw_cmpxchg64_release_not_implemented(void);
#define raw_cmpxchg64_release(...) raw_cmpxchg64_release_not_implemented()
#endif

/*
 * raw_cmpxchg64_relaxed() - select the implementation of the _relaxed
 * cmpxchg64 variant.
 *
 * Uses arch_cmpxchg64_relaxed when defined, else the unsuffixed
 * arch_cmpxchg64, else a call to raw_cmpxchg64_relaxed_not_implemented(),
 * which is only declared here (presumably never defined, so use fails at
 * link time -- confirm).
 */
#if defined(arch_cmpxchg64_relaxed)
#define raw_cmpxchg64_relaxed arch_cmpxchg64_relaxed
#elif defined(arch_cmpxchg64)
#define raw_cmpxchg64_relaxed arch_cmpxchg64
#else
extern void raw_cmpxchg64_relaxed_not_implemented(void);
#define raw_cmpxchg64_relaxed(...) raw_cmpxchg64_relaxed_not_implemented()
#endif

/*
 * raw_cmpxchg128() - select the implementation of the unsuffixed cmpxchg128
 * operation.
 *
 * Selection order:
 *   1. arch_cmpxchg128, when the architecture defines it;
 *   2. else, when arch_cmpxchg128_relaxed is defined, __atomic_op_fence()
 *      is handed the base name arch_cmpxchg128. The helper is defined
 *      outside this file; presumably it appends _relaxed and brackets the
 *      call with full fences -- TODO confirm;
 *   3. else a call to raw_cmpxchg128_not_implemented(), which is only
 *      declared here. NOTE(review): presumably never defined, so that any
 *      use fails at link time -- confirm.
 */
#if defined(arch_cmpxchg128)
#define raw_cmpxchg128 arch_cmpxchg128
#elif defined(arch_cmpxchg128_relaxed)
#define raw_cmpxchg128(...) \
        __atomic_op_fence(arch_cmpxchg128, __VA_ARGS__)
#else
extern void raw_cmpxchg128_not_implemented(void);
#define raw_cmpxchg128(...) raw_cmpxchg128_not_implemented()
#endif

/*
 * raw_cmpxchg128_acquire() - select the implementation of the _acquire
 * cmpxchg128 variant.
 *
 * Selection order:
 *   1. arch_cmpxchg128_acquire, when defined;
 *   2. else, when arch_cmpxchg128_relaxed is defined, __atomic_op_acquire()
 *      is handed the base name arch_cmpxchg128 (helper defined outside
 *      this file; presumably the relaxed op followed by an acquire fence
 *      -- TODO confirm);
 *   3. else the unsuffixed arch_cmpxchg128, when defined;
 *   4. else a call to raw_cmpxchg128_acquire_not_implemented(), which is
 *      only declared here (presumably never defined, so use fails at link
 *      time -- confirm).
 */
#if defined(arch_cmpxchg128_acquire)
#define raw_cmpxchg128_acquire arch_cmpxchg128_acquire
#elif defined(arch_cmpxchg128_relaxed)
#define raw_cmpxchg128_acquire(...) \
        __atomic_op_acquire(arch_cmpxchg128, __VA_ARGS__)
#elif defined(arch_cmpxchg128)
#define raw_cmpxchg128_acquire arch_cmpxchg128
#else
extern void raw_cmpxchg128_acquire_not_implemented(void);
#define raw_cmpxchg128_acquire(...) raw_cmpxchg128_acquire_not_implemented()
#endif

/*
 * raw_cmpxchg128_release() - select the implementation of the _release
 * cmpxchg128 variant.
 *
 * Selection order:
 *   1. arch_cmpxchg128_release, when defined;
 *   2. else, when arch_cmpxchg128_relaxed is defined, __atomic_op_release()
 *      is handed the base name arch_cmpxchg128 (helper defined outside
 *      this file; presumably a release fence followed by the relaxed op
 *      -- TODO confirm);
 *   3. else the unsuffixed arch_cmpxchg128, when defined;
 *   4. else a call to raw_cmpxchg128_release_not_implemented(), which is
 *      only declared here (presumably never defined, so use fails at link
 *      time -- confirm).
 */
#if defined(arch_cmpxchg128_release)
#define raw_cmpxchg128_release arch_cmpxchg128_release
#elif defined(arch_cmpxchg128_relaxed)
#define raw_cmpxchg128_release(...) \
        __atomic_op_release(arch_cmpxchg128, __VA_ARGS__)
#elif defined(arch_cmpxchg128)
#define raw_cmpxchg128_release arch_cmpxchg128
#else
extern void raw_cmpxchg128_release_not_implemented(void);
#define raw_cmpxchg128_release(...) raw_cmpxchg128_release_not_implemented()
#endif

/*
 * raw_cmpxchg128_relaxed() - select the implementation of the _relaxed
 * cmpxchg128 variant.
 *
 * Uses arch_cmpxchg128_relaxed when defined, else the unsuffixed
 * arch_cmpxchg128, else a call to raw_cmpxchg128_relaxed_not_implemented(),
 * which is only declared here (presumably never defined, so use fails at
 * link time -- confirm).
 */
#if defined(arch_cmpxchg128_relaxed)
#define raw_cmpxchg128_relaxed arch_cmpxchg128_relaxed
#elif defined(arch_cmpxchg128)
#define raw_cmpxchg128_relaxed arch_cmpxchg128
#else
extern void raw_cmpxchg128_relaxed_not_implemented(void);
#define raw_cmpxchg128_relaxed(...) raw_cmpxchg128_relaxed_not_implemented()
#endif

/*
 * raw_try_cmpxchg() - select the implementation of the unsuffixed
 * try_cmpxchg operation.
 *
 * Selection order:
 *   1. arch_try_cmpxchg, when the architecture defines it;
 *   2. else, when arch_try_cmpxchg_relaxed is defined, __atomic_op_fence()
 *      is handed the base name arch_try_cmpxchg (helper defined outside
 *      this file; presumably appends _relaxed and adds full fences --
 *      TODO confirm);
 *   3. else the generic statement expression below, built on raw_cmpxchg():
 *      it copies *@_oldp into ___o, calls raw_cmpxchg(@_ptr, ___o, @_new),
 *      stores the returned value back through @_oldp when it differs from
 *      ___o, and evaluates to likely(___r == ___o).
 *
 * In the generic form @_oldp and @_new are each expanded exactly once;
 * @_ptr is expanded once as a value and otherwise only inside typeof().
 */
#if defined(arch_try_cmpxchg)
#define raw_try_cmpxchg arch_try_cmpxchg
#elif defined(arch_try_cmpxchg_relaxed)
#define raw_try_cmpxchg(...) \
        __atomic_op_fence(arch_try_cmpxchg, __VA_ARGS__)
#else
#define raw_try_cmpxchg(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg_acquire() - select the implementation of the _acquire
 * try_cmpxchg variant.
 *
 * Selection order:
 *   1. arch_try_cmpxchg_acquire, when defined;
 *   2. else, when arch_try_cmpxchg_relaxed is defined,
 *      __atomic_op_acquire() is handed the base name arch_try_cmpxchg
 *      (helper defined outside this file; presumably the relaxed op
 *      followed by an acquire fence -- TODO confirm);
 *   3. else the unsuffixed arch_try_cmpxchg, when defined;
 *   4. else the generic statement expression below: it copies *@_oldp into
 *      ___o, calls raw_cmpxchg_acquire(@_ptr, ___o, @_new), stores the
 *      returned value back through @_oldp when it differs from ___o, and
 *      evaluates to likely(___r == ___o).
 */
#if defined(arch_try_cmpxchg_acquire)
#define raw_try_cmpxchg_acquire arch_try_cmpxchg_acquire
#elif defined(arch_try_cmpxchg_relaxed)
#define raw_try_cmpxchg_acquire(...) \
        __atomic_op_acquire(arch_try_cmpxchg, __VA_ARGS__)
#elif defined(arch_try_cmpxchg)
#define raw_try_cmpxchg_acquire arch_try_cmpxchg
#else
#define raw_try_cmpxchg_acquire(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg_acquire((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg_release() - select the implementation of the _release
 * try_cmpxchg variant.
 *
 * Selection order:
 *   1. arch_try_cmpxchg_release, when defined;
 *   2. else, when arch_try_cmpxchg_relaxed is defined,
 *      __atomic_op_release() is handed the base name arch_try_cmpxchg
 *      (helper defined outside this file; presumably a release fence
 *      followed by the relaxed op -- TODO confirm);
 *   3. else the unsuffixed arch_try_cmpxchg, when defined;
 *   4. else the generic statement expression below: it copies *@_oldp into
 *      ___o, calls raw_cmpxchg_release(@_ptr, ___o, @_new), stores the
 *      returned value back through @_oldp when it differs from ___o, and
 *      evaluates to likely(___r == ___o).
 */
#if defined(arch_try_cmpxchg_release)
#define raw_try_cmpxchg_release arch_try_cmpxchg_release
#elif defined(arch_try_cmpxchg_relaxed)
#define raw_try_cmpxchg_release(...) \
        __atomic_op_release(arch_try_cmpxchg, __VA_ARGS__)
#elif defined(arch_try_cmpxchg)
#define raw_try_cmpxchg_release arch_try_cmpxchg
#else
#define raw_try_cmpxchg_release(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg_release((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg_relaxed() - select the implementation of the _relaxed
 * try_cmpxchg variant.
 *
 * Uses arch_try_cmpxchg_relaxed when defined, else the unsuffixed
 * arch_try_cmpxchg, else the generic statement expression below: it copies
 * *@_oldp into ___o, calls raw_cmpxchg_relaxed(@_ptr, ___o, @_new), stores
 * the returned value back through @_oldp when it differs from ___o, and
 * evaluates to likely(___r == ___o).
 */
#if defined(arch_try_cmpxchg_relaxed)
#define raw_try_cmpxchg_relaxed arch_try_cmpxchg_relaxed
#elif defined(arch_try_cmpxchg)
#define raw_try_cmpxchg_relaxed arch_try_cmpxchg
#else
#define raw_try_cmpxchg_relaxed(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg_relaxed((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg64() - select the implementation of the unsuffixed
 * try_cmpxchg64 operation.
 *
 * Selection order:
 *   1. arch_try_cmpxchg64, when the architecture defines it;
 *   2. else, when arch_try_cmpxchg64_relaxed is defined,
 *      __atomic_op_fence() is handed the base name arch_try_cmpxchg64
 *      (helper defined outside this file; presumably appends _relaxed and
 *      adds full fences -- TODO confirm);
 *   3. else the generic statement expression below: it copies *@_oldp into
 *      ___o, calls raw_cmpxchg64(@_ptr, ___o, @_new), stores the returned
 *      value back through @_oldp when it differs from ___o, and evaluates
 *      to likely(___r == ___o).
 */
#if defined(arch_try_cmpxchg64)
#define raw_try_cmpxchg64 arch_try_cmpxchg64
#elif defined(arch_try_cmpxchg64_relaxed)
#define raw_try_cmpxchg64(...) \
        __atomic_op_fence(arch_try_cmpxchg64, __VA_ARGS__)
#else
#define raw_try_cmpxchg64(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg64((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg64_acquire() - select the implementation of the _acquire
 * try_cmpxchg64 variant.
 *
 * Selection order:
 *   1. arch_try_cmpxchg64_acquire, when defined;
 *   2. else, when arch_try_cmpxchg64_relaxed is defined,
 *      __atomic_op_acquire() is handed the base name arch_try_cmpxchg64
 *      (helper defined outside this file; presumably the relaxed op
 *      followed by an acquire fence -- TODO confirm);
 *   3. else the unsuffixed arch_try_cmpxchg64, when defined;
 *   4. else the generic statement expression below: it copies *@_oldp into
 *      ___o, calls raw_cmpxchg64_acquire(@_ptr, ___o, @_new), stores the
 *      returned value back through @_oldp when it differs from ___o, and
 *      evaluates to likely(___r == ___o).
 */
#if defined(arch_try_cmpxchg64_acquire)
#define raw_try_cmpxchg64_acquire arch_try_cmpxchg64_acquire
#elif defined(arch_try_cmpxchg64_relaxed)
#define raw_try_cmpxchg64_acquire(...) \
        __atomic_op_acquire(arch_try_cmpxchg64, __VA_ARGS__)
#elif defined(arch_try_cmpxchg64)
#define raw_try_cmpxchg64_acquire arch_try_cmpxchg64
#else
#define raw_try_cmpxchg64_acquire(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg64_acquire((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg64_release() - select the implementation of the _release
 * try_cmpxchg64 variant.
 *
 * Selection order:
 *   1. arch_try_cmpxchg64_release, when defined;
 *   2. else, when arch_try_cmpxchg64_relaxed is defined,
 *      __atomic_op_release() is handed the base name arch_try_cmpxchg64
 *      (helper defined outside this file; presumably a release fence
 *      followed by the relaxed op -- TODO confirm);
 *   3. else the unsuffixed arch_try_cmpxchg64, when defined;
 *   4. else the generic statement expression below: it copies *@_oldp into
 *      ___o, calls raw_cmpxchg64_release(@_ptr, ___o, @_new), stores the
 *      returned value back through @_oldp when it differs from ___o, and
 *      evaluates to likely(___r == ___o).
 */
#if defined(arch_try_cmpxchg64_release)
#define raw_try_cmpxchg64_release arch_try_cmpxchg64_release
#elif defined(arch_try_cmpxchg64_relaxed)
#define raw_try_cmpxchg64_release(...) \
        __atomic_op_release(arch_try_cmpxchg64, __VA_ARGS__)
#elif defined(arch_try_cmpxchg64)
#define raw_try_cmpxchg64_release arch_try_cmpxchg64
#else
#define raw_try_cmpxchg64_release(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg64_release((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg64_relaxed() - select the implementation of the _relaxed
 * try_cmpxchg64 variant.
 *
 * Uses arch_try_cmpxchg64_relaxed when defined, else the unsuffixed
 * arch_try_cmpxchg64, else the generic statement expression below: it
 * copies *@_oldp into ___o, calls raw_cmpxchg64_relaxed(@_ptr, ___o, @_new),
 * stores the returned value back through @_oldp when it differs from ___o,
 * and evaluates to likely(___r == ___o).
 */
#if defined(arch_try_cmpxchg64_relaxed)
#define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64_relaxed
#elif defined(arch_try_cmpxchg64)
#define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64
#else
#define raw_try_cmpxchg64_relaxed(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg64_relaxed((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg128() - select the implementation of the unsuffixed
 * try_cmpxchg128 operation.
 *
 * Selection order:
 *   1. arch_try_cmpxchg128, when the architecture defines it;
 *   2. else, when arch_try_cmpxchg128_relaxed is defined,
 *      __atomic_op_fence() is handed the base name arch_try_cmpxchg128
 *      (helper defined outside this file; presumably appends _relaxed and
 *      adds full fences -- TODO confirm);
 *   3. else the generic statement expression below: it copies *@_oldp into
 *      ___o, calls raw_cmpxchg128(@_ptr, ___o, @_new), stores the returned
 *      value back through @_oldp when it differs from ___o, and evaluates
 *      to likely(___r == ___o).
 */
#if defined(arch_try_cmpxchg128)
#define raw_try_cmpxchg128 arch_try_cmpxchg128
#elif defined(arch_try_cmpxchg128_relaxed)
#define raw_try_cmpxchg128(...) \
        __atomic_op_fence(arch_try_cmpxchg128, __VA_ARGS__)
#else
#define raw_try_cmpxchg128(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg128((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg128_acquire() - select the implementation of the _acquire
 * try_cmpxchg128 variant.
 *
 * Selection order:
 *   1. arch_try_cmpxchg128_acquire, when defined;
 *   2. else, when arch_try_cmpxchg128_relaxed is defined,
 *      __atomic_op_acquire() is handed the base name arch_try_cmpxchg128
 *      (helper defined outside this file; presumably the relaxed op
 *      followed by an acquire fence -- TODO confirm);
 *   3. else the unsuffixed arch_try_cmpxchg128, when defined;
 *   4. else the generic statement expression below: it copies *@_oldp into
 *      ___o, calls raw_cmpxchg128_acquire(@_ptr, ___o, @_new), stores the
 *      returned value back through @_oldp when it differs from ___o, and
 *      evaluates to likely(___r == ___o).
 */
#if defined(arch_try_cmpxchg128_acquire)
#define raw_try_cmpxchg128_acquire arch_try_cmpxchg128_acquire
#elif defined(arch_try_cmpxchg128_relaxed)
#define raw_try_cmpxchg128_acquire(...) \
        __atomic_op_acquire(arch_try_cmpxchg128, __VA_ARGS__)
#elif defined(arch_try_cmpxchg128)
#define raw_try_cmpxchg128_acquire arch_try_cmpxchg128
#else
#define raw_try_cmpxchg128_acquire(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg128_acquire((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg128_release() - select the implementation of the _release
 * try_cmpxchg128 variant.
 *
 * Selection order:
 *   1. arch_try_cmpxchg128_release, when defined;
 *   2. else, when arch_try_cmpxchg128_relaxed is defined,
 *      __atomic_op_release() is handed the base name arch_try_cmpxchg128
 *      (helper defined outside this file; presumably a release fence
 *      followed by the relaxed op -- TODO confirm);
 *   3. else the unsuffixed arch_try_cmpxchg128, when defined;
 *   4. else the generic statement expression below: it copies *@_oldp into
 *      ___o, calls raw_cmpxchg128_release(@_ptr, ___o, @_new), stores the
 *      returned value back through @_oldp when it differs from ___o, and
 *      evaluates to likely(___r == ___o).
 */
#if defined(arch_try_cmpxchg128_release)
#define raw_try_cmpxchg128_release arch_try_cmpxchg128_release
#elif defined(arch_try_cmpxchg128_relaxed)
#define raw_try_cmpxchg128_release(...) \
        __atomic_op_release(arch_try_cmpxchg128, __VA_ARGS__)
#elif defined(arch_try_cmpxchg128)
#define raw_try_cmpxchg128_release arch_try_cmpxchg128
#else
#define raw_try_cmpxchg128_release(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg128_release((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg128_relaxed() - select the implementation of the _relaxed
 * try_cmpxchg128 variant.
 *
 * Uses arch_try_cmpxchg128_relaxed when defined, else the unsuffixed
 * arch_try_cmpxchg128, else the generic statement expression below: it
 * copies *@_oldp into ___o, calls raw_cmpxchg128_relaxed(@_ptr, ___o,
 * @_new), stores the returned value back through @_oldp when it differs
 * from ___o, and evaluates to likely(___r == ___o).
 */
#if defined(arch_try_cmpxchg128_relaxed)
#define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128_relaxed
#elif defined(arch_try_cmpxchg128)
#define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128
#else
#define raw_try_cmpxchg128_relaxed(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg128_relaxed((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_cmpxchg_local is an unconditional alias for arch_cmpxchg_local; unlike
 * the ordered variants in this file, no fallback is selected for it here.
 */
#define raw_cmpxchg_local arch_cmpxchg_local

/*
 * raw_try_cmpxchg_local() - use arch_try_cmpxchg_local when defined, else the
 * generic statement expression below: it copies *@_oldp into ___o, calls
 * raw_cmpxchg_local(@_ptr, ___o, @_new), stores the returned value back
 * through @_oldp when it differs from ___o, and evaluates to
 * likely(___r == ___o).
 */
#ifdef arch_try_cmpxchg_local
#define raw_try_cmpxchg_local arch_try_cmpxchg_local
#else
#define raw_try_cmpxchg_local(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg_local((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_cmpxchg64_local is an unconditional alias for arch_cmpxchg64_local; no
 * fallback is selected for it here.
 */
#define raw_cmpxchg64_local arch_cmpxchg64_local

/*
 * raw_try_cmpxchg64_local() - use arch_try_cmpxchg64_local when defined, else
 * the generic statement expression below: it copies *@_oldp into ___o, calls
 * raw_cmpxchg64_local(@_ptr, ___o, @_new), stores the returned value back
 * through @_oldp when it differs from ___o, and evaluates to
 * likely(___r == ___o).
 */
#ifdef arch_try_cmpxchg64_local
#define raw_try_cmpxchg64_local arch_try_cmpxchg64_local
#else
#define raw_try_cmpxchg64_local(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg64_local((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_cmpxchg128_local is an unconditional alias for arch_cmpxchg128_local;
 * no fallback is selected for it here.
 */
#define raw_cmpxchg128_local arch_cmpxchg128_local

/*
 * raw_try_cmpxchg128_local() - use arch_try_cmpxchg128_local when defined,
 * else the generic statement expression below: it copies *@_oldp into ___o,
 * calls raw_cmpxchg128_local(@_ptr, ___o, @_new), stores the returned value
 * back through @_oldp when it differs from ___o, and evaluates to
 * likely(___r == ___o).
 */
#ifdef arch_try_cmpxchg128_local
#define raw_try_cmpxchg128_local arch_try_cmpxchg128_local
#else
#define raw_try_cmpxchg128_local(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg128_local((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_sync_cmpxchg is an unconditional alias for arch_sync_cmpxchg; no
 * fallback is selected for it here.
 */
#define raw_sync_cmpxchg arch_sync_cmpxchg

/*
 * raw_sync_try_cmpxchg() - use arch_sync_try_cmpxchg when defined, else the
 * generic statement expression below: it copies *@_oldp into ___o, calls
 * raw_sync_cmpxchg(@_ptr, ___o, @_new), stores the returned value back
 * through @_oldp when it differs from ___o, and evaluates to
 * likely(___r == ___o).
 */
#ifdef arch_sync_try_cmpxchg
#define raw_sync_try_cmpxchg arch_sync_try_cmpxchg
#else
#define raw_sync_try_cmpxchg(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_sync_cmpxchg((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/**
 * raw_atomic_read() - relaxed atomic load
 * @v: pointer to atomic_t
 *
 * Loads the current value of @v atomically; only relaxed ordering is
 * provided.
 *
 * May be called from noinstr code. Everywhere else, atomic_read() is the
 * preferred interface.
 *
 * Return: The value read from @v.
 */
static __always_inline int raw_atomic_read(const atomic_t *v)
{
        int val = arch_atomic_read(v);

        return val;
}

/**
 * raw_atomic_read_acquire() - atomic load with acquire ordering
 * @v: pointer to atomic_t
 *
 * Loads the current value of @v atomically, with acquire ordering.
 *
 * May be called from noinstr code. Everywhere else, atomic_read_acquire()
 * is the preferred interface.
 *
 * Return: The value read from @v.
 */
static __always_inline int raw_atomic_read_acquire(const atomic_t *v)
{
#ifdef arch_atomic_read_acquire
        return arch_atomic_read_acquire(v);
#else
        int val;

        /* When __native_word(atomic_t) holds, load-acquire the counter. */
        if (__native_word(atomic_t))
                return smp_load_acquire(&(v)->counter);

        /* Otherwise: relaxed load first, acquire fence afterwards. */
        val = raw_atomic_read(v);
        __atomic_acquire_fence();
        return val;
#endif
}

/**
 * raw_atomic_set() - relaxed atomic store
 * @v: pointer to atomic_t
 * @i: int value to assign
 *
 * Stores @i into @v atomically; only relaxed ordering is provided.
 *
 * May be called from noinstr code. Everywhere else, atomic_set() is the
 * preferred interface.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_set(atomic_t *v, int i)
{
        /* Straight pass-through to the architecture's implementation. */
        arch_atomic_set(v, i);
}

/**
 * raw_atomic_set_release() - atomic store with release ordering
 * @v: pointer to atomic_t
 * @i: int value to assign
 *
 * Stores @i into @v atomically, with release ordering.
 *
 * May be called from noinstr code. Everywhere else, atomic_set_release()
 * is the preferred interface.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_set_release(atomic_t *v, int i)
{
#ifdef arch_atomic_set_release
        arch_atomic_set_release(v, i);
#else
        if (!__native_word(atomic_t)) {
                /* Release fence first, relaxed store afterwards. */
                __atomic_release_fence();
                raw_atomic_set(v, i);
                return;
        }

        /* When __native_word(atomic_t) holds, store-release the counter. */
        smp_store_release(&(v)->counter, i);
#endif
}

/**
 * raw_atomic_add() - relaxed atomic add
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v + @i) atomically; only relaxed ordering is provided.
 *
 * May be called from noinstr code. Everywhere else, atomic_add() is the
 * preferred interface.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_add(int i, atomic_t *v)
{
        /* Straight pass-through to the architecture's implementation. */
        arch_atomic_add(i, v);
}

/**
 * raw_atomic_add_return() - fully ordered atomic add returning the new value
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v + @i) atomically, with full ordering.
 *
 * May be called from noinstr code. Everywhere else, atomic_add_return() is
 * the preferred interface.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_add_return(int i, atomic_t *v)
{
#ifdef arch_atomic_add_return
        return arch_atomic_add_return(i, v);
#elif defined(arch_atomic_add_return_relaxed)
        int sum;

        /* Bracket the relaxed op with the pre/post full fences. */
        __atomic_pre_full_fence();
        sum = arch_atomic_add_return_relaxed(i, v);
        __atomic_post_full_fence();

        return sum;
#else
#error "Unable to define raw_atomic_add_return"
#endif
}

/**
 * raw_atomic_add_return_acquire() - atomic add returning the new value, with
 * acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v + @i) atomically, with acquire ordering.
 *
 * May be called from noinstr code. Everywhere else,
 * atomic_add_return_acquire() is the preferred interface.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_add_return_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_add_return_acquire
        return arch_atomic_add_return_acquire(i, v);
#elif defined(arch_atomic_add_return_relaxed)
        int sum;

        /* Relaxed op first, acquire fence afterwards. */
        sum = arch_atomic_add_return_relaxed(i, v);
        __atomic_acquire_fence();

        return sum;
#elif defined(arch_atomic_add_return)
        /* Last resort: the unsuffixed arch operation. */
        return arch_atomic_add_return(i, v);
#else
#error "Unable to define raw_atomic_add_return_acquire"
#endif
}

/**
 * raw_atomic_add_return_release() - atomic add with release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with release ordering.
 *
 * Preference order: the arch's release op, then __atomic_release_fence()
 * followed by the relaxed op, then arch_atomic_add_return(). The build stops
 * with an #error if the arch defines none of them.
 *
 * Usable from noinstr code; elsewhere atomic_add_return_release() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
raw_atomic_add_return_release(int i, atomic_t *v)
{
        int updated;

#ifdef arch_atomic_add_return_release
        updated = arch_atomic_add_return_release(i, v);
#elif defined(arch_atomic_add_return_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();
        updated = arch_atomic_add_return_relaxed(i, v);
#elif defined(arch_atomic_add_return)
        updated = arch_atomic_add_return(i, v);
#else
#error "Unable to define raw_atomic_add_return_release"
#endif
        return updated;
}

/**
 * raw_atomic_add_return_relaxed() - atomic add with relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with relaxed ordering.
 *
 * Uses the arch's relaxed op when defined, otherwise
 * arch_atomic_add_return(). The build stops with an #error if the arch
 * defines neither.
 *
 * Usable from noinstr code; elsewhere atomic_add_return_relaxed() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
raw_atomic_add_return_relaxed(int i, atomic_t *v)
{
        int updated;

#ifdef arch_atomic_add_return_relaxed
        updated = arch_atomic_add_return_relaxed(i, v);
#elif defined(arch_atomic_add_return)
        updated = arch_atomic_add_return(i, v);
#else
#error "Unable to define raw_atomic_add_return_relaxed"
#endif
        return updated;
}

/**
 * raw_atomic_fetch_add() - atomic add with full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with full ordering.
 *
 * Uses arch_atomic_fetch_add() when the arch defines it; otherwise the
 * relaxed arch op is bracketed by __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). The build stops with an #error if the arch
 * defines neither.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_add() is preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_add(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_add
        prev = arch_atomic_fetch_add(i, v);
#elif defined(arch_atomic_fetch_add_relaxed)
        /* Relaxed op with a full fence on each side. */
        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_add_relaxed(i, v);
        __atomic_post_full_fence();
#else
#error "Unable to define raw_atomic_fetch_add"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_add_acquire() - atomic add with acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with acquire ordering.
 *
 * Preference order: the arch's acquire op, then the relaxed op followed by
 * __atomic_acquire_fence(), then arch_atomic_fetch_add(). The build stops
 * with an #error if the arch defines none of them.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_add_acquire() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_add_acquire(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_add_acquire
        prev = arch_atomic_fetch_add_acquire(i, v);
#elif defined(arch_atomic_fetch_add_relaxed)
        /* Relaxed op first, acquire fence after it. */
        prev = arch_atomic_fetch_add_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_fetch_add)
        prev = arch_atomic_fetch_add(i, v);
#else
#error "Unable to define raw_atomic_fetch_add_acquire"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_add_release() - atomic add with release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with release ordering.
 *
 * Preference order: the arch's release op, then __atomic_release_fence()
 * followed by the relaxed op, then arch_atomic_fetch_add(). The build stops
 * with an #error if the arch defines none of them.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_add_release() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_add_release(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_add_release
        prev = arch_atomic_fetch_add_release(i, v);
#elif defined(arch_atomic_fetch_add_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();
        prev = arch_atomic_fetch_add_relaxed(i, v);
#elif defined(arch_atomic_fetch_add)
        prev = arch_atomic_fetch_add(i, v);
#else
#error "Unable to define raw_atomic_fetch_add_release"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_add_relaxed() - atomic add with relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with relaxed ordering.
 *
 * Uses the arch's relaxed op when defined, otherwise
 * arch_atomic_fetch_add(). The build stops with an #error if the arch
 * defines neither.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_add_relaxed() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_add_relaxed(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_add_relaxed
        prev = arch_atomic_fetch_add_relaxed(i, v);
#elif defined(arch_atomic_fetch_add)
        prev = arch_atomic_fetch_add(i, v);
#else
#error "Unable to define raw_atomic_fetch_add_relaxed"
#endif
        return prev;
}

/**
 * raw_atomic_sub() - atomic subtract with relaxed ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * This is an unconditional call to arch_atomic_sub(); there is no
 * preprocessor fallback in this wrapper.
 *
 * Safe to use in noinstr code; prefer atomic_sub() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_sub(int i, atomic_t *v)
{
        /* Straight pass-through to the arch op. */
        arch_atomic_sub(i, v);
}

/**
 * raw_atomic_sub_return() - atomic subtract with full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with full ordering.
 *
 * Uses arch_atomic_sub_return() when the arch defines it; otherwise the
 * relaxed arch op is bracketed by __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). The build stops with an #error if the arch
 * defines neither.
 *
 * Usable from noinstr code; elsewhere atomic_sub_return() is preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
raw_atomic_sub_return(int i, atomic_t *v)
{
        int updated;

#ifdef arch_atomic_sub_return
        updated = arch_atomic_sub_return(i, v);
#elif defined(arch_atomic_sub_return_relaxed)
        /* Relaxed op with a full fence on each side. */
        __atomic_pre_full_fence();
        updated = arch_atomic_sub_return_relaxed(i, v);
        __atomic_post_full_fence();
#else
#error "Unable to define raw_atomic_sub_return"
#endif
        return updated;
}

/**
 * raw_atomic_sub_return_acquire() - atomic subtract with acquire ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with acquire ordering.
 *
 * Preference order: the arch's acquire op, then the relaxed op followed by
 * __atomic_acquire_fence(), then arch_atomic_sub_return(). The build stops
 * with an #error if the arch defines none of them.
 *
 * Usable from noinstr code; elsewhere atomic_sub_return_acquire() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
raw_atomic_sub_return_acquire(int i, atomic_t *v)
{
        int updated;

#ifdef arch_atomic_sub_return_acquire
        updated = arch_atomic_sub_return_acquire(i, v);
#elif defined(arch_atomic_sub_return_relaxed)
        /* Relaxed op first, acquire fence after it. */
        updated = arch_atomic_sub_return_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_sub_return)
        updated = arch_atomic_sub_return(i, v);
#else
#error "Unable to define raw_atomic_sub_return_acquire"
#endif
        return updated;
}

/**
 * raw_atomic_sub_return_release() - atomic subtract with release ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with release ordering.
 *
 * Preference order: the arch's release op, then __atomic_release_fence()
 * followed by the relaxed op, then arch_atomic_sub_return(). The build stops
 * with an #error if the arch defines none of them.
 *
 * Usable from noinstr code; elsewhere atomic_sub_return_release() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
raw_atomic_sub_return_release(int i, atomic_t *v)
{
        int updated;

#ifdef arch_atomic_sub_return_release
        updated = arch_atomic_sub_return_release(i, v);
#elif defined(arch_atomic_sub_return_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();
        updated = arch_atomic_sub_return_relaxed(i, v);
#elif defined(arch_atomic_sub_return)
        updated = arch_atomic_sub_return(i, v);
#else
#error "Unable to define raw_atomic_sub_return_release"
#endif
        return updated;
}

/**
 * raw_atomic_sub_return_relaxed() - atomic subtract with relaxed ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with relaxed ordering.
 *
 * Uses the arch's relaxed op when defined, otherwise
 * arch_atomic_sub_return(). The build stops with an #error if the arch
 * defines neither.
 *
 * Usable from noinstr code; elsewhere atomic_sub_return_relaxed() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
raw_atomic_sub_return_relaxed(int i, atomic_t *v)
{
        int updated;

#ifdef arch_atomic_sub_return_relaxed
        updated = arch_atomic_sub_return_relaxed(i, v);
#elif defined(arch_atomic_sub_return)
        updated = arch_atomic_sub_return(i, v);
#else
#error "Unable to define raw_atomic_sub_return_relaxed"
#endif
        return updated;
}

/**
 * raw_atomic_fetch_sub() - atomic subtract with full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with full ordering.
 *
 * Uses arch_atomic_fetch_sub() when the arch defines it; otherwise the
 * relaxed arch op is bracketed by __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). The build stops with an #error if the arch
 * defines neither.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_sub() is preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_sub(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_sub
        prev = arch_atomic_fetch_sub(i, v);
#elif defined(arch_atomic_fetch_sub_relaxed)
        /* Relaxed op with a full fence on each side. */
        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_sub_relaxed(i, v);
        __atomic_post_full_fence();
#else
#error "Unable to define raw_atomic_fetch_sub"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_sub_acquire() - atomic subtract with acquire ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with acquire ordering.
 *
 * Preference order: the arch's acquire op, then the relaxed op followed by
 * __atomic_acquire_fence(), then arch_atomic_fetch_sub(). The build stops
 * with an #error if the arch defines none of them.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_sub_acquire() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_sub_acquire(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_sub_acquire
        prev = arch_atomic_fetch_sub_acquire(i, v);
#elif defined(arch_atomic_fetch_sub_relaxed)
        /* Relaxed op first, acquire fence after it. */
        prev = arch_atomic_fetch_sub_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_fetch_sub)
        prev = arch_atomic_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic_fetch_sub_acquire"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_sub_release() - atomic subtract with release ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with release ordering.
 *
 * Preference order: the arch's release op, then __atomic_release_fence()
 * followed by the relaxed op, then arch_atomic_fetch_sub(). The build stops
 * with an #error if the arch defines none of them.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_sub_release() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_sub_release(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_sub_release
        prev = arch_atomic_fetch_sub_release(i, v);
#elif defined(arch_atomic_fetch_sub_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();
        prev = arch_atomic_fetch_sub_relaxed(i, v);
#elif defined(arch_atomic_fetch_sub)
        prev = arch_atomic_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic_fetch_sub_release"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_sub_relaxed() - atomic subtract with relaxed ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with relaxed ordering.
 *
 * Uses the arch's relaxed op when defined, otherwise
 * arch_atomic_fetch_sub(). The build stops with an #error if the arch
 * defines neither.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_sub_relaxed() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_sub_relaxed(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_sub_relaxed
        prev = arch_atomic_fetch_sub_relaxed(i, v);
#elif defined(arch_atomic_fetch_sub)
        prev = arch_atomic_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic_fetch_sub_relaxed"
#endif
        return prev;
}

/**
 * raw_atomic_inc() - atomic increment with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with relaxed ordering.
 *
 * Calls arch_atomic_inc() when the arch defines it, otherwise
 * raw_atomic_add(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_inc() is preferred.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_inc(atomic_t *v)
{
#ifndef arch_atomic_inc
        /* No dedicated arch increment: add 1 instead. */
        raw_atomic_add(1, v);
#else
        arch_atomic_inc(v);
#endif
}

/**
 * raw_atomic_inc_return() - atomic increment with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with full ordering.
 *
 * Uses arch_atomic_inc_return() when the arch defines it; otherwise the
 * relaxed arch op is bracketed by __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). If the arch defines neither, this falls back
 * to raw_atomic_add_return(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_inc_return() is preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
raw_atomic_inc_return(atomic_t *v)
{
        int updated;

#ifdef arch_atomic_inc_return
        updated = arch_atomic_inc_return(v);
#elif defined(arch_atomic_inc_return_relaxed)
        /* Relaxed op with a full fence on each side. */
        __atomic_pre_full_fence();
        updated = arch_atomic_inc_return_relaxed(v);
        __atomic_post_full_fence();
#else
        updated = raw_atomic_add_return(1, v);
#endif
        return updated;
}

/**
 * raw_atomic_inc_return_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with acquire ordering.
 *
 * Preference order: the arch's acquire op, then the relaxed op followed by
 * __atomic_acquire_fence(), then arch_atomic_inc_return(), and finally
 * raw_atomic_add_return_acquire(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_inc_return_acquire() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
raw_atomic_inc_return_acquire(atomic_t *v)
{
        int updated;

#ifdef arch_atomic_inc_return_acquire
        updated = arch_atomic_inc_return_acquire(v);
#elif defined(arch_atomic_inc_return_relaxed)
        /* Relaxed op first, acquire fence after it. */
        updated = arch_atomic_inc_return_relaxed(v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_inc_return)
        updated = arch_atomic_inc_return(v);
#else
        updated = raw_atomic_add_return_acquire(1, v);
#endif
        return updated;
}

/**
 * raw_atomic_inc_return_release() - atomic increment with release ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with release ordering.
 *
 * Preference order: the arch's release op, then __atomic_release_fence()
 * followed by the relaxed op, then arch_atomic_inc_return(), and finally
 * raw_atomic_add_return_release(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_inc_return_release() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
raw_atomic_inc_return_release(atomic_t *v)
{
        int updated;

#ifdef arch_atomic_inc_return_release
        updated = arch_atomic_inc_return_release(v);
#elif defined(arch_atomic_inc_return_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();
        updated = arch_atomic_inc_return_relaxed(v);
#elif defined(arch_atomic_inc_return)
        updated = arch_atomic_inc_return(v);
#else
        updated = raw_atomic_add_return_release(1, v);
#endif
        return updated;
}

/**
 * raw_atomic_inc_return_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with relaxed ordering.
 *
 * Uses the arch's relaxed op when defined, then arch_atomic_inc_return(),
 * and finally raw_atomic_add_return_relaxed(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_inc_return_relaxed() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
raw_atomic_inc_return_relaxed(atomic_t *v)
{
        int updated;

#ifdef arch_atomic_inc_return_relaxed
        updated = arch_atomic_inc_return_relaxed(v);
#elif defined(arch_atomic_inc_return)
        updated = arch_atomic_inc_return(v);
#else
        updated = raw_atomic_add_return_relaxed(1, v);
#endif
        return updated;
}

/**
 * raw_atomic_fetch_inc() - atomic increment with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with full ordering.
 *
 * Uses arch_atomic_fetch_inc() when the arch defines it; otherwise the
 * relaxed arch op is bracketed by __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). If the arch defines neither, this falls back
 * to raw_atomic_fetch_add(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_fetch_inc() is preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_inc(atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_inc
        prev = arch_atomic_fetch_inc(v);
#elif defined(arch_atomic_fetch_inc_relaxed)
        /* Relaxed op with a full fence on each side. */
        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_inc_relaxed(v);
        __atomic_post_full_fence();
#else
        prev = raw_atomic_fetch_add(1, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_inc_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with acquire ordering.
 *
 * Preference order: the arch's acquire op, then the relaxed op followed by
 * __atomic_acquire_fence(), then arch_atomic_fetch_inc(), and finally
 * raw_atomic_fetch_add_acquire(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_fetch_inc_acquire() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_inc_acquire(atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_inc_acquire
        prev = arch_atomic_fetch_inc_acquire(v);
#elif defined(arch_atomic_fetch_inc_relaxed)
        /* Relaxed op first, acquire fence after it. */
        prev = arch_atomic_fetch_inc_relaxed(v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_fetch_inc)
        prev = arch_atomic_fetch_inc(v);
#else
        prev = raw_atomic_fetch_add_acquire(1, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_inc_release() - atomic increment with release ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with release ordering.
 *
 * Preference order: the arch's release op, then __atomic_release_fence()
 * followed by the relaxed op, then arch_atomic_fetch_inc(), and finally
 * raw_atomic_fetch_add_release(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_fetch_inc_release() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_inc_release(atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_inc_release
        prev = arch_atomic_fetch_inc_release(v);
#elif defined(arch_atomic_fetch_inc_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();
        prev = arch_atomic_fetch_inc_relaxed(v);
#elif defined(arch_atomic_fetch_inc)
        prev = arch_atomic_fetch_inc(v);
#else
        prev = raw_atomic_fetch_add_release(1, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_inc_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with relaxed ordering.
 *
 * Uses the arch's relaxed op when defined, then arch_atomic_fetch_inc(),
 * and finally raw_atomic_fetch_add_relaxed(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_fetch_inc_relaxed() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_inc_relaxed(atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_inc_relaxed
        prev = arch_atomic_fetch_inc_relaxed(v);
#elif defined(arch_atomic_fetch_inc)
        prev = arch_atomic_fetch_inc(v);
#else
        prev = raw_atomic_fetch_add_relaxed(1, v);
#endif
        return prev;
}

/**
 * raw_atomic_dec() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with relaxed ordering.
 *
 * Calls arch_atomic_dec() when the arch defines it, otherwise
 * raw_atomic_sub(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_dec() is preferred.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_dec(atomic_t *v)
{
#ifndef arch_atomic_dec
        /* No dedicated arch decrement: subtract 1 instead. */
        raw_atomic_sub(1, v);
#else
        arch_atomic_dec(v);
#endif
}

/**
 * raw_atomic_dec_return() - atomic decrement with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with full ordering.
 *
 * Uses arch_atomic_dec_return() when the arch defines it; otherwise the
 * relaxed arch op is bracketed by __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). If the arch defines neither, this falls back
 * to raw_atomic_sub_return(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_dec_return() is preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
raw_atomic_dec_return(atomic_t *v)
{
        int updated;

#ifdef arch_atomic_dec_return
        updated = arch_atomic_dec_return(v);
#elif defined(arch_atomic_dec_return_relaxed)
        /* Relaxed op with a full fence on each side. */
        __atomic_pre_full_fence();
        updated = arch_atomic_dec_return_relaxed(v);
        __atomic_post_full_fence();
#else
        updated = raw_atomic_sub_return(1, v);
#endif
        return updated;
}

/**
 * raw_atomic_dec_return_acquire() - atomic decrement with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with acquire ordering.
 *
 * Preference order: the arch's acquire op, then the relaxed op followed by
 * __atomic_acquire_fence(), then arch_atomic_dec_return(), and finally
 * raw_atomic_sub_return_acquire(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_dec_return_acquire() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
raw_atomic_dec_return_acquire(atomic_t *v)
{
        int updated;

#ifdef arch_atomic_dec_return_acquire
        updated = arch_atomic_dec_return_acquire(v);
#elif defined(arch_atomic_dec_return_relaxed)
        /* Relaxed op first, acquire fence after it. */
        updated = arch_atomic_dec_return_relaxed(v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_dec_return)
        updated = arch_atomic_dec_return(v);
#else
        updated = raw_atomic_sub_return_acquire(1, v);
#endif
        return updated;
}

/**
 * raw_atomic_dec_return_release() - atomic decrement with release ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with release ordering.
 *
 * Preference order: the arch's release op, then __atomic_release_fence()
 * followed by the relaxed op, then arch_atomic_dec_return(), and finally
 * raw_atomic_sub_return_release(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_dec_return_release() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
raw_atomic_dec_return_release(atomic_t *v)
{
        int updated;

#ifdef arch_atomic_dec_return_release
        updated = arch_atomic_dec_return_release(v);
#elif defined(arch_atomic_dec_return_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();
        updated = arch_atomic_dec_return_relaxed(v);
#elif defined(arch_atomic_dec_return)
        updated = arch_atomic_dec_return(v);
#else
        updated = raw_atomic_sub_return_release(1, v);
#endif
        return updated;
}

/**
 * raw_atomic_dec_return_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with relaxed ordering.
 *
 * Uses the arch's relaxed op when defined, then arch_atomic_dec_return(),
 * and finally raw_atomic_sub_return_relaxed(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_dec_return_relaxed() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline int
raw_atomic_dec_return_relaxed(atomic_t *v)
{
        int updated;

#ifdef arch_atomic_dec_return_relaxed
        updated = arch_atomic_dec_return_relaxed(v);
#elif defined(arch_atomic_dec_return)
        updated = arch_atomic_dec_return(v);
#else
        updated = raw_atomic_sub_return_relaxed(1, v);
#endif
        return updated;
}

/**
 * raw_atomic_fetch_dec() - atomic decrement with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with full ordering.
 *
 * Uses arch_atomic_fetch_dec() when the arch defines it; otherwise the
 * relaxed arch op is bracketed by __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). If the arch defines neither, this falls back
 * to raw_atomic_fetch_sub(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_fetch_dec() is preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_dec(atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_dec
        prev = arch_atomic_fetch_dec(v);
#elif defined(arch_atomic_fetch_dec_relaxed)
        /* Relaxed op with a full fence on each side. */
        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_dec_relaxed(v);
        __atomic_post_full_fence();
#else
        prev = raw_atomic_fetch_sub(1, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_dec_acquire() - atomic decrement with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with acquire ordering.
 *
 * Preference order: the arch's acquire op, then the relaxed op followed by
 * __atomic_acquire_fence(), then arch_atomic_fetch_dec(), and finally
 * raw_atomic_fetch_sub_acquire(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_fetch_dec_acquire() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_dec_acquire(atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_dec_acquire
        prev = arch_atomic_fetch_dec_acquire(v);
#elif defined(arch_atomic_fetch_dec_relaxed)
        /* Relaxed op first, acquire fence after it. */
        prev = arch_atomic_fetch_dec_relaxed(v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_fetch_dec)
        prev = arch_atomic_fetch_dec(v);
#else
        prev = raw_atomic_fetch_sub_acquire(1, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_dec_release() - atomic decrement with release ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with release ordering.
 *
 * Preference order: the arch's release op, then __atomic_release_fence()
 * followed by the relaxed op, then arch_atomic_fetch_dec(), and finally
 * raw_atomic_fetch_sub_release(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_fetch_dec_release() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_dec_release(atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_dec_release
        prev = arch_atomic_fetch_dec_release(v);
#elif defined(arch_atomic_fetch_dec_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();
        prev = arch_atomic_fetch_dec_relaxed(v);
#elif defined(arch_atomic_fetch_dec)
        prev = arch_atomic_fetch_dec(v);
#else
        prev = raw_atomic_fetch_sub_release(1, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_dec_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - 1) with relaxed ordering.
 *
 * Uses the arch's relaxed op when defined, then arch_atomic_fetch_dec(),
 * and finally raw_atomic_fetch_sub_relaxed(1, @v).
 *
 * Usable from noinstr code; elsewhere atomic_fetch_dec_relaxed() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_dec_relaxed(atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_dec_relaxed
        prev = arch_atomic_fetch_dec_relaxed(v);
#elif defined(arch_atomic_fetch_dec)
        prev = arch_atomic_fetch_dec(v);
#else
        prev = raw_atomic_fetch_sub_relaxed(1, v);
#endif
        return prev;
}

/**
 * raw_atomic_and() - atomic bitwise AND with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering.
 *
 * This is an unconditional call to arch_atomic_and(); there is no
 * preprocessor fallback in this wrapper.
 *
 * Safe to use in noinstr code; prefer atomic_and() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_and(int i, atomic_t *v)
{
        /* Straight pass-through to the arch op. */
        arch_atomic_and(i, v);
}

/**
 * raw_atomic_fetch_and() - atomic bitwise AND with full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & @i) with full ordering.
 *
 * Uses arch_atomic_fetch_and() when the arch defines it; otherwise the
 * relaxed arch op is bracketed by __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). The build stops with an #error if the arch
 * defines neither.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_and() is preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_and(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_and
        prev = arch_atomic_fetch_and(i, v);
#elif defined(arch_atomic_fetch_and_relaxed)
        /* Relaxed op with a full fence on each side. */
        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_and_relaxed(i, v);
        __atomic_post_full_fence();
#else
#error "Unable to define raw_atomic_fetch_and"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_and_acquire() - atomic bitwise AND with acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & @i) with acquire ordering.
 *
 * Preference order: the arch's acquire op, then the relaxed op followed by
 * __atomic_acquire_fence(), then arch_atomic_fetch_and(). The build stops
 * with an #error if the arch defines none of them.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_and_acquire() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_and_acquire(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_and_acquire
        prev = arch_atomic_fetch_and_acquire(i, v);
#elif defined(arch_atomic_fetch_and_relaxed)
        /* Relaxed op first, acquire fence after it. */
        prev = arch_atomic_fetch_and_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_fetch_and)
        prev = arch_atomic_fetch_and(i, v);
#else
#error "Unable to define raw_atomic_fetch_and_acquire"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_and_release() - atomic bitwise AND with release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & @i) with release ordering.
 *
 * Preference order: the arch's release op, then __atomic_release_fence()
 * followed by the relaxed op, then arch_atomic_fetch_and(). The build stops
 * with an #error if the arch defines none of them.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_and_release() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_and_release(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_and_release
        prev = arch_atomic_fetch_and_release(i, v);
#elif defined(arch_atomic_fetch_and_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();
        prev = arch_atomic_fetch_and_relaxed(i, v);
#elif defined(arch_atomic_fetch_and)
        prev = arch_atomic_fetch_and(i, v);
#else
#error "Unable to define raw_atomic_fetch_and_release"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & @i) with relaxed ordering.
 *
 * Uses the arch's relaxed op when defined, otherwise
 * arch_atomic_fetch_and(). The build stops with an #error if the arch
 * defines neither.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_and_relaxed() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_and_relaxed(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_and_relaxed
        prev = arch_atomic_fetch_and_relaxed(i, v);
#elif defined(arch_atomic_fetch_and)
        prev = arch_atomic_fetch_and(i, v);
#else
#error "Unable to define raw_atomic_fetch_and_relaxed"
#endif
        return prev;
}

/**
 * raw_atomic_andnot() - atomic bitwise AND NOT with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & ~@i) with relaxed ordering.
 *
 * Calls arch_atomic_andnot() when the arch defines it, otherwise
 * raw_atomic_and() with the complemented mask ~@i.
 *
 * Usable from noinstr code; elsewhere atomic_andnot() is preferred.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_andnot(int i, atomic_t *v)
{
#ifndef arch_atomic_andnot
        /* No dedicated arch op: AND with the complemented mask instead. */
        raw_atomic_and(~i, v);
#else
        arch_atomic_andnot(i, v);
#endif
}

/**
 * raw_atomic_fetch_andnot() - atomic bitwise AND NOT with full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & ~@i) with full ordering.
 *
 * Uses arch_atomic_fetch_andnot() when the arch defines it; otherwise the
 * relaxed arch op is bracketed by __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). If the arch defines neither, this falls back
 * to raw_atomic_fetch_and() with the complemented mask ~@i.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_andnot() is preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_andnot(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_andnot
        prev = arch_atomic_fetch_andnot(i, v);
#elif defined(arch_atomic_fetch_andnot_relaxed)
        /* Relaxed op with a full fence on each side. */
        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_andnot_relaxed(i, v);
        __atomic_post_full_fence();
#else
        prev = raw_atomic_fetch_and(~i, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & ~@i) with acquire ordering.
 *
 * Preference order: the arch's acquire op, then the relaxed op followed by
 * __atomic_acquire_fence(), then arch_atomic_fetch_andnot(), and finally
 * raw_atomic_fetch_and_acquire() with the complemented mask ~@i.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_andnot_acquire() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_andnot_acquire(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_andnot_acquire
        prev = arch_atomic_fetch_andnot_acquire(i, v);
#elif defined(arch_atomic_fetch_andnot_relaxed)
        /* Relaxed op first, acquire fence after it. */
        prev = arch_atomic_fetch_andnot_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_fetch_andnot)
        prev = arch_atomic_fetch_andnot(i, v);
#else
        prev = raw_atomic_fetch_and_acquire(~i, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_andnot_release() - atomic bitwise AND NOT with release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & ~@i) with release ordering.
 *
 * Preference order: the arch's release op, then __atomic_release_fence()
 * followed by the relaxed op, then arch_atomic_fetch_andnot(), and finally
 * raw_atomic_fetch_and_release() with the complemented mask ~@i.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_andnot_release() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_andnot_release(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_andnot_release
        prev = arch_atomic_fetch_andnot_release(i, v);
#elif defined(arch_atomic_fetch_andnot_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();
        prev = arch_atomic_fetch_andnot_relaxed(i, v);
#elif defined(arch_atomic_fetch_andnot)
        prev = arch_atomic_fetch_andnot(i, v);
#else
        prev = raw_atomic_fetch_and_release(~i, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v & ~@i) with relaxed ordering.
 *
 * Uses the arch's relaxed op when defined, then arch_atomic_fetch_andnot(),
 * and finally raw_atomic_fetch_and_relaxed() with the complemented mask ~@i.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_andnot_relaxed() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_andnot_relaxed(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_andnot_relaxed
        prev = arch_atomic_fetch_andnot_relaxed(i, v);
#elif defined(arch_atomic_fetch_andnot)
        prev = arch_atomic_fetch_andnot(i, v);
#else
        prev = raw_atomic_fetch_and_relaxed(~i, v);
#endif
        return prev;
}

/**
 * raw_atomic_or() - atomic bitwise OR with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering.
 *
 * This is an unconditional call to arch_atomic_or(); there is no
 * preprocessor fallback in this wrapper.
 *
 * Safe to use in noinstr code; prefer atomic_or() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_or(int i, atomic_t *v)
{
        /* Straight pass-through to the arch op. */
        arch_atomic_or(i, v);
}

/**
 * raw_atomic_fetch_or() - atomic bitwise OR with full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v | @i) with full ordering.
 *
 * Uses arch_atomic_fetch_or() when the arch defines it; otherwise the
 * relaxed arch op is bracketed by __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). The build stops with an #error if the arch
 * defines neither.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_or() is preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_or(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_or
        prev = arch_atomic_fetch_or(i, v);
#elif defined(arch_atomic_fetch_or_relaxed)
        /* Relaxed op with a full fence on each side. */
        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_or_relaxed(i, v);
        __atomic_post_full_fence();
#else
#error "Unable to define raw_atomic_fetch_or"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_or_acquire() - atomic bitwise OR with acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v | @i) with acquire ordering.
 *
 * Preference order: the arch's acquire op, then the relaxed op followed by
 * __atomic_acquire_fence(), then arch_atomic_fetch_or(). The build stops
 * with an #error if the arch defines none of them.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_or_acquire() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_or_acquire(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_or_acquire
        prev = arch_atomic_fetch_or_acquire(i, v);
#elif defined(arch_atomic_fetch_or_relaxed)
        /* Relaxed op first, acquire fence after it. */
        prev = arch_atomic_fetch_or_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_fetch_or)
        prev = arch_atomic_fetch_or(i, v);
#else
#error "Unable to define raw_atomic_fetch_or_acquire"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_or_release() - atomic bitwise OR with release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v | @i) with release ordering.
 *
 * Preference order: the arch's release op, then __atomic_release_fence()
 * followed by the relaxed op, then arch_atomic_fetch_or(). The build stops
 * with an #error if the arch defines none of them.
 *
 * Usable from noinstr code; elsewhere atomic_fetch_or_release() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
raw_atomic_fetch_or_release(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_or_release
        prev = arch_atomic_fetch_or_release(i, v);
#elif defined(arch_atomic_fetch_or_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();
        prev = arch_atomic_fetch_or_relaxed(i, v);
#elif defined(arch_atomic_fetch_or)
        prev = arch_atomic_fetch_or(i, v);
#else
#error "Unable to define raw_atomic_fetch_or_release"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically ORs @i into @v, so that @v becomes (@v | @i), with relaxed
 * ordering.
 *
 * The relaxed arch operation is used when it is defined; otherwise the fully
 * ordered arch operation stands in for it.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_or_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_or_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_or_relaxed
        return arch_atomic_fetch_or_relaxed(i, v);
#elif defined(arch_atomic_fetch_or)
        return arch_atomic_fetch_or(i, v);
#else
#error "Unable to define raw_atomic_fetch_or_relaxed"
#endif
}

/**
 * raw_atomic_xor() - atomic bitwise XOR with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * This is a direct wrapper around arch_atomic_xor(); no fallback is provided
 * here, so arch_atomic_xor() must be defined for this to build.
 *
 * Safe to use in noinstr code; prefer atomic_xor() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_xor(int i, atomic_t *v)
{
        arch_atomic_xor(i, v);
}

/**
 * raw_atomic_fetch_xor() - atomic bitwise XOR with full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically XORs @i into @v, so that @v becomes (@v ^ @i), with full
 * ordering.
 *
 * The fully ordered arch operation is used when it is defined; otherwise the
 * relaxed arch operation is bracketed by the pre/post full fences.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_xor() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_xor(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_xor
        return arch_atomic_fetch_xor(i, v);
#elif defined(arch_atomic_fetch_xor_relaxed)
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_xor_relaxed(i, v);
        __atomic_post_full_fence();

        return prev;
#else
#error "Unable to define raw_atomic_fetch_xor"
#endif
}

/**
 * raw_atomic_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically XORs @i into @v, so that @v becomes (@v ^ @i), with acquire
 * ordering.
 *
 * Selection order: the arch acquire operation, then the relaxed arch
 * operation followed by an acquire fence, then the fully ordered arch
 * operation.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_xor_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_xor_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_xor_acquire
        return arch_atomic_fetch_xor_acquire(i, v);
#elif defined(arch_atomic_fetch_xor_relaxed)
        int prev;

        prev = arch_atomic_fetch_xor_relaxed(i, v);
        /* The fence must follow the relaxed operation. */
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic_fetch_xor)
        return arch_atomic_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic_fetch_xor_acquire"
#endif
}

/**
 * raw_atomic_fetch_xor_release() - atomic bitwise XOR with release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically XORs @i into @v, so that @v becomes (@v ^ @i), with release
 * ordering.
 *
 * Selection order: the arch release operation, then a release fence followed
 * by the relaxed arch operation, then the fully ordered arch operation.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_xor_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_xor_release(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_xor_release
        return arch_atomic_fetch_xor_release(i, v);
#elif defined(arch_atomic_fetch_xor_relaxed)
        int prev;

        /* The fence must precede the relaxed operation. */
        __atomic_release_fence();
        prev = arch_atomic_fetch_xor_relaxed(i, v);

        return prev;
#elif defined(arch_atomic_fetch_xor)
        return arch_atomic_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic_fetch_xor_release"
#endif
}

/**
 * raw_atomic_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically XORs @i into @v, so that @v becomes (@v ^ @i), with relaxed
 * ordering.
 *
 * The relaxed arch operation is used when it is defined; otherwise the fully
 * ordered arch operation stands in for it.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_xor_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_xor_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_xor_relaxed
        return arch_atomic_fetch_xor_relaxed(i, v);
#elif defined(arch_atomic_fetch_xor)
        return arch_atomic_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic_fetch_xor_relaxed"
#endif
}

/**
 * raw_atomic_xchg() - atomic exchange with full ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically replaces the value of @v with @new, with full ordering.
 *
 * The fully ordered arch operation is used when it is defined; otherwise the
 * relaxed arch operation is bracketed by the pre/post full fences. If neither
 * is defined, raw_xchg() is applied to the counter field directly.
 *
 * Safe to use in noinstr code; prefer atomic_xchg() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_xchg(atomic_t *v, int new)
{
#ifdef arch_atomic_xchg
        return arch_atomic_xchg(v, new);
#elif defined(arch_atomic_xchg_relaxed)
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_xchg_relaxed(v, new);
        __atomic_post_full_fence();

        return prev;
#else
        return raw_xchg(&v->counter, new);
#endif
}

/**
 * raw_atomic_xchg_acquire() - atomic exchange with acquire ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically replaces the value of @v with @new, with acquire ordering.
 *
 * Selection order: the arch acquire operation, then the relaxed arch
 * operation followed by an acquire fence, then the fully ordered arch
 * operation, and finally raw_xchg_acquire() on the counter field.
 *
 * Safe to use in noinstr code; prefer atomic_xchg_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_xchg_acquire(atomic_t *v, int new)
{
#ifdef arch_atomic_xchg_acquire
        return arch_atomic_xchg_acquire(v, new);
#elif defined(arch_atomic_xchg_relaxed)
        int prev;

        prev = arch_atomic_xchg_relaxed(v, new);
        /* The fence must follow the relaxed operation. */
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic_xchg)
        return arch_atomic_xchg(v, new);
#else
        return raw_xchg_acquire(&v->counter, new);
#endif
}

/**
 * raw_atomic_xchg_release() - atomic exchange with release ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically replaces the value of @v with @new, with release ordering.
 *
 * Selection order: the arch release operation, then a release fence followed
 * by the relaxed arch operation, then the fully ordered arch operation, and
 * finally raw_xchg_release() on the counter field.
 *
 * Safe to use in noinstr code; prefer atomic_xchg_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_xchg_release(atomic_t *v, int new)
{
#ifdef arch_atomic_xchg_release
        return arch_atomic_xchg_release(v, new);
#elif defined(arch_atomic_xchg_relaxed)
        int prev;

        /* The fence must precede the relaxed operation. */
        __atomic_release_fence();
        prev = arch_atomic_xchg_relaxed(v, new);

        return prev;
#elif defined(arch_atomic_xchg)
        return arch_atomic_xchg(v, new);
#else
        return raw_xchg_release(&v->counter, new);
#endif
}

/**
 * raw_atomic_xchg_relaxed() - atomic exchange with relaxed ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically replaces the value of @v with @new, with relaxed ordering.
 *
 * The relaxed arch operation is used when it is defined, then the fully
 * ordered arch operation; failing both, raw_xchg_relaxed() is applied to the
 * counter field directly.
 *
 * Safe to use in noinstr code; prefer atomic_xchg_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_xchg_relaxed(atomic_t *v, int new)
{
#ifdef arch_atomic_xchg_relaxed
        return arch_atomic_xchg_relaxed(v, new);
#elif defined(arch_atomic_xchg)
        return arch_atomic_xchg(v, new);
#else
        return raw_xchg_relaxed(&v->counter, new);
#endif
}

/**
 * raw_atomic_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically stores @new in @v with full ordering.
 * Otherwise @v is left untouched and only relaxed ordering is provided.
 *
 * The fully ordered arch operation is used when it is defined; otherwise the
 * relaxed arch operation is bracketed by the pre/post full fences. If neither
 * is defined, raw_cmpxchg() is applied to the counter field directly.
 *
 * Safe to use in noinstr code; prefer atomic_cmpxchg() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_cmpxchg(atomic_t *v, int old, int new)
{
#ifdef arch_atomic_cmpxchg
        return arch_atomic_cmpxchg(v, old, new);
#elif defined(arch_atomic_cmpxchg_relaxed)
        int prev;

        __atomic_pre_full_fence();
        prev = arch_atomic_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();

        return prev;
#else
        return raw_cmpxchg(&v->counter, old, new);
#endif
}

/**
 * raw_atomic_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically stores @new in @v with acquire ordering.
 * Otherwise @v is left untouched and only relaxed ordering is provided.
 *
 * Selection order: the arch acquire operation, then the relaxed arch
 * operation followed by an acquire fence, then the fully ordered arch
 * operation, and finally raw_cmpxchg_acquire() on the counter field.
 *
 * Safe to use in noinstr code; prefer atomic_cmpxchg_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_cmpxchg_acquire(atomic_t *v, int old, int new)
{
#ifdef arch_atomic_cmpxchg_acquire
        return arch_atomic_cmpxchg_acquire(v, old, new);
#elif defined(arch_atomic_cmpxchg_relaxed)
        int prev;

        prev = arch_atomic_cmpxchg_relaxed(v, old, new);
        /* The fence must follow the relaxed operation. */
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic_cmpxchg)
        return arch_atomic_cmpxchg(v, old, new);
#else
        return raw_cmpxchg_acquire(&v->counter, old, new);
#endif
}

/**
 * raw_atomic_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically stores @new in @v with release ordering.
 * Otherwise @v is left untouched and only relaxed ordering is provided.
 *
 * Selection order: the arch release operation, then a release fence followed
 * by the relaxed arch operation, then the fully ordered arch operation, and
 * finally raw_cmpxchg_release() on the counter field.
 *
 * Safe to use in noinstr code; prefer atomic_cmpxchg_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_cmpxchg_release(atomic_t *v, int old, int new)
{
#ifdef arch_atomic_cmpxchg_release
        return arch_atomic_cmpxchg_release(v, old, new);
#elif defined(arch_atomic_cmpxchg_relaxed)
        int prev;

        /* The fence must precede the relaxed operation. */
        __atomic_release_fence();
        prev = arch_atomic_cmpxchg_relaxed(v, old, new);

        return prev;
#elif defined(arch_atomic_cmpxchg)
        return arch_atomic_cmpxchg(v, old, new);
#else
        return raw_cmpxchg_release(&v->counter, old, new);
#endif
}

/**
 * raw_atomic_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically stores @new in @v with relaxed ordering.
 * Otherwise @v is left untouched and relaxed ordering is provided.
 *
 * The relaxed arch operation is used when it is defined, then the fully
 * ordered arch operation; failing both, raw_cmpxchg_relaxed() is applied to
 * the counter field directly.
 *
 * Safe to use in noinstr code; prefer atomic_cmpxchg_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_cmpxchg_relaxed(atomic_t *v, int old, int new)
{
#ifdef arch_atomic_cmpxchg_relaxed
        return arch_atomic_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic_cmpxchg)
        return arch_atomic_cmpxchg(v, old, new);
#else
        return raw_cmpxchg_relaxed(&v->counter, old, new);
#endif
}

/**
 * raw_atomic_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically stores @new in @v with full ordering.
 * Otherwise @v is left untouched, @old is overwritten with the value found in
 * @v, and only relaxed ordering is provided.
 *
 * The fully ordered arch operation is used when it is defined; otherwise the
 * relaxed arch operation is bracketed by the pre/post full fences. If neither
 * is defined, the operation is built on top of raw_atomic_cmpxchg().
 *
 * Safe to use in noinstr code; prefer atomic_try_cmpxchg() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
#ifdef arch_atomic_try_cmpxchg
        return arch_atomic_try_cmpxchg(v, old, new);
#elif defined(arch_atomic_try_cmpxchg_relaxed)
        bool swapped;

        __atomic_pre_full_fence();
        swapped = arch_atomic_try_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();

        return swapped;
#else
        const int expected = *old;
        const int seen = raw_atomic_cmpxchg(v, expected, new);

        if (likely(seen == expected))
                return true;

        /* Report the value actually observed back to the caller. */
        *old = seen;
        return false;
#endif
}

/**
 * raw_atomic_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically stores @new in @v with acquire ordering.
 * Otherwise @v is left untouched, @old is overwritten with the value found in
 * @v, and only relaxed ordering is provided.
 *
 * Selection order: the arch acquire operation, then the relaxed arch
 * operation followed by an acquire fence, then the fully ordered arch
 * operation, and finally a version built on raw_atomic_cmpxchg_acquire().
 *
 * Safe to use in noinstr code; prefer atomic_try_cmpxchg_acquire() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
{
#ifdef arch_atomic_try_cmpxchg_acquire
        return arch_atomic_try_cmpxchg_acquire(v, old, new);
#elif defined(arch_atomic_try_cmpxchg_relaxed)
        bool swapped;

        swapped = arch_atomic_try_cmpxchg_relaxed(v, old, new);
        /* The fence must follow the relaxed operation. */
        __atomic_acquire_fence();

        return swapped;
#elif defined(arch_atomic_try_cmpxchg)
        return arch_atomic_try_cmpxchg(v, old, new);
#else
        const int expected = *old;
        const int seen = raw_atomic_cmpxchg_acquire(v, expected, new);

        if (likely(seen == expected))
                return true;

        /* Report the value actually observed back to the caller. */
        *old = seen;
        return false;
#endif
}

/**
 * raw_atomic_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically stores @new in @v with release ordering.
 * Otherwise @v is left untouched, @old is overwritten with the value found in
 * @v, and only relaxed ordering is provided.
 *
 * Selection order: the arch release operation, then a release fence followed
 * by the relaxed arch operation, then the fully ordered arch operation, and
 * finally a version built on raw_atomic_cmpxchg_release().
 *
 * Safe to use in noinstr code; prefer atomic_try_cmpxchg_release() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
{
#ifdef arch_atomic_try_cmpxchg_release
        return arch_atomic_try_cmpxchg_release(v, old, new);
#elif defined(arch_atomic_try_cmpxchg_relaxed)
        bool swapped;

        /* The fence must precede the relaxed operation. */
        __atomic_release_fence();
        swapped = arch_atomic_try_cmpxchg_relaxed(v, old, new);

        return swapped;
#elif defined(arch_atomic_try_cmpxchg)
        return arch_atomic_try_cmpxchg(v, old, new);
#else
        const int expected = *old;
        const int seen = raw_atomic_cmpxchg_release(v, expected, new);

        if (likely(seen == expected))
                return true;

        /* Report the value actually observed back to the caller. */
        *old = seen;
        return false;
#endif
}

/**
 * raw_atomic_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically stores @new in @v with relaxed ordering.
 * Otherwise @v is left untouched, @old is overwritten with the value found in
 * @v, and relaxed ordering is provided.
 *
 * The relaxed arch operation is used when it is defined, then the fully
 * ordered arch operation; failing both, the operation is built on top of
 * raw_atomic_cmpxchg_relaxed().
 *
 * Safe to use in noinstr code; prefer atomic_try_cmpxchg_relaxed() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
{
#ifdef arch_atomic_try_cmpxchg_relaxed
        return arch_atomic_try_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic_try_cmpxchg)
        return arch_atomic_try_cmpxchg(v, old, new);
#else
        const int expected = *old;
        const int seen = raw_atomic_cmpxchg_relaxed(v, expected, new);

        if (likely(seen == expected))
                return true;

        /* Report the value actually observed back to the caller. */
        *old = seen;
        return false;
#endif
}

/**
 * raw_atomic_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically subtracts @i from @v, so that @v becomes (@v - @i), with full
 * ordering.
 *
 * The arch operation is used when it is defined; otherwise the result of
 * raw_atomic_sub_return() is tested against zero.
 *
 * Safe to use in noinstr code; prefer atomic_sub_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_sub_and_test(int i, atomic_t *v)
{
#ifdef arch_atomic_sub_and_test
        return arch_atomic_sub_and_test(i, v);
#else
        return !raw_atomic_sub_return(i, v);
#endif
}

/**
 * raw_atomic_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically decrements @v, so that @v becomes (@v - 1), with full ordering.
 *
 * The arch operation is used when it is defined; otherwise the result of
 * raw_atomic_dec_return() is tested against zero.
 *
 * Safe to use in noinstr code; prefer atomic_dec_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_dec_and_test(atomic_t *v)
{
#ifdef arch_atomic_dec_and_test
        return arch_atomic_dec_and_test(v);
#else
        return !raw_atomic_dec_return(v);
#endif
}

/**
 * raw_atomic_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically increments @v, so that @v becomes (@v + 1), with full ordering.
 *
 * The arch operation is used when it is defined; otherwise the result of
 * raw_atomic_inc_return() is tested against zero.
 *
 * Safe to use in noinstr code; prefer atomic_inc_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_inc_and_test(atomic_t *v)
{
#ifdef arch_atomic_inc_and_test
        return arch_atomic_inc_and_test(v);
#else
        return !raw_atomic_inc_return(v);
#endif
}

/**
 * raw_atomic_add_negative() - atomic add and test if negative with full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically adds @i to @v, so that @v becomes (@v + @i), with full ordering.
 *
 * The fully ordered arch operation is used when it is defined; otherwise the
 * relaxed arch operation is bracketed by the pre/post full fences. If neither
 * is defined, the sign of raw_atomic_add_return() is tested.
 *
 * Safe to use in noinstr code; prefer atomic_add_negative() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_negative(int i, atomic_t *v)
{
#ifdef arch_atomic_add_negative
        return arch_atomic_add_negative(i, v);
#elif defined(arch_atomic_add_negative_relaxed)
        bool negative;

        __atomic_pre_full_fence();
        negative = arch_atomic_add_negative_relaxed(i, v);
        __atomic_post_full_fence();

        return negative;
#else
        const int sum = raw_atomic_add_return(i, v);

        return sum < 0;
#endif
}

/**
 * raw_atomic_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically adds @i to @v, so that @v becomes (@v + @i), with acquire
 * ordering.
 *
 * Selection order: the arch acquire operation, then the relaxed arch
 * operation followed by an acquire fence, then the fully ordered arch
 * operation, and finally a sign test on raw_atomic_add_return_acquire().
 *
 * Safe to use in noinstr code; prefer atomic_add_negative_acquire() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_negative_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_add_negative_acquire
        return arch_atomic_add_negative_acquire(i, v);
#elif defined(arch_atomic_add_negative_relaxed)
        bool negative;

        negative = arch_atomic_add_negative_relaxed(i, v);
        /* The fence must follow the relaxed operation. */
        __atomic_acquire_fence();

        return negative;
#elif defined(arch_atomic_add_negative)
        return arch_atomic_add_negative(i, v);
#else
        const int sum = raw_atomic_add_return_acquire(i, v);

        return sum < 0;
#endif
}

/**
 * raw_atomic_add_negative_release() - atomic add and test if negative with release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically adds @i to @v, so that @v becomes (@v + @i), with release
 * ordering.
 *
 * Selection order: the arch release operation, then a release fence followed
 * by the relaxed arch operation, then the fully ordered arch operation, and
 * finally a sign test on raw_atomic_add_return_release().
 *
 * Safe to use in noinstr code; prefer atomic_add_negative_release() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_negative_release(int i, atomic_t *v)
{
#ifdef arch_atomic_add_negative_release
        return arch_atomic_add_negative_release(i, v);
#elif defined(arch_atomic_add_negative_relaxed)
        bool negative;

        /* The fence must precede the relaxed operation. */
        __atomic_release_fence();
        negative = arch_atomic_add_negative_relaxed(i, v);

        return negative;
#elif defined(arch_atomic_add_negative)
        return arch_atomic_add_negative(i, v);
#else
        const int sum = raw_atomic_add_return_release(i, v);

        return sum < 0;
#endif
}

/**
 * raw_atomic_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically adds @i to @v, so that @v becomes (@v + @i), with relaxed
 * ordering.
 *
 * The relaxed arch operation is used when it is defined, then the fully
 * ordered arch operation; failing both, the sign of
 * raw_atomic_add_return_relaxed() is tested.
 *
 * Safe to use in noinstr code; prefer atomic_add_negative_relaxed() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_negative_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_add_negative_relaxed
        return arch_atomic_add_negative_relaxed(i, v);
#elif defined(arch_atomic_add_negative)
        return arch_atomic_add_negative(i, v);
#else
        const int sum = raw_atomic_add_return_relaxed(i, v);

        return sum < 0;
#endif
}

/**
 * raw_atomic_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_t
 * @a: int value to add
 * @u: int value to compare with
 *
 * If (@v != @u), atomically adds @a to @v, so that @v becomes (@v + @a), with
 * full ordering. Otherwise @v is left untouched and only relaxed ordering is
 * provided.
 *
 * The arch operation is used when it is defined; otherwise the update is
 * retried with raw_atomic_try_cmpxchg() until it succeeds or @v is seen to
 * equal @u.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_add_unless() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_add_unless(atomic_t *v, int a, int u)
{
#ifdef arch_atomic_fetch_add_unless
        return arch_atomic_fetch_add_unless(v, a, u);
#else
        int cur = raw_atomic_read(v);

        for (;;) {
                if (unlikely(cur == u))
                        break;
                /* On failure, cur is refreshed with the current value. */
                if (raw_atomic_try_cmpxchg(v, &cur, cur + a))
                        break;
        }

        return cur;
#endif
}

/**
 * raw_atomic_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_t
 * @a: int value to add
 * @u: int value to compare with
 *
 * If (@v != @u), atomically adds @a to @v, so that @v becomes (@v + @a), with
 * full ordering. Otherwise @v is left untouched and only relaxed ordering is
 * provided.
 *
 * The arch operation is used when it is defined; otherwise the value returned
 * by raw_atomic_fetch_add_unless() is compared against @u.
 *
 * Safe to use in noinstr code; prefer atomic_add_unless() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_unless(atomic_t *v, int a, int u)
{
#ifdef arch_atomic_add_unless
        return arch_atomic_add_unless(v, a, u);
#else
        const int prev = raw_atomic_fetch_add_unless(v, a, u);

        return prev != u;
#endif
}

/**
 * raw_atomic_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v != 0), atomically increments @v, so that @v becomes (@v + 1), with
 * full ordering. Otherwise @v is left untouched and only relaxed ordering is
 * provided.
 *
 * The arch operation is used when it is defined; otherwise this is
 * raw_atomic_add_unless() adding 1 unless the value is 0.
 *
 * Safe to use in noinstr code; prefer atomic_inc_not_zero() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_inc_not_zero(atomic_t *v)
{
#ifdef arch_atomic_inc_not_zero
        return arch_atomic_inc_not_zero(v);
#else
        return raw_atomic_add_unless(v, 1, 0);
#endif
}

/**
 * raw_atomic_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v >= 0), atomically increments @v, so that @v becomes (@v + 1), with
 * full ordering. Otherwise @v is left untouched and only relaxed ordering is
 * provided.
 *
 * The arch operation is used when it is defined; otherwise the update is
 * retried with raw_atomic_try_cmpxchg() until it succeeds or @v is seen to be
 * negative.
 *
 * Safe to use in noinstr code; prefer atomic_inc_unless_negative() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_inc_unless_negative(atomic_t *v)
{
#ifdef arch_atomic_inc_unless_negative
        return arch_atomic_inc_unless_negative(v);
#else
        int cur = raw_atomic_read(v);

        for (;;) {
                if (unlikely(cur < 0))
                        return false;
                /* On failure, cur is refreshed with the current value. */
                if (raw_atomic_try_cmpxchg(v, &cur, cur + 1))
                        return true;
        }
#endif
}

/**
 * raw_atomic_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v <= 0), atomically decrements @v, so that @v becomes (@v - 1), with
 * full ordering. Otherwise @v is left untouched and only relaxed ordering is
 * provided.
 *
 * The arch operation is used when it is defined; otherwise the update is
 * retried with raw_atomic_try_cmpxchg() until it succeeds or @v is seen to be
 * positive.
 *
 * Safe to use in noinstr code; prefer atomic_dec_unless_positive() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_dec_unless_positive(atomic_t *v)
{
#ifdef arch_atomic_dec_unless_positive
        return arch_atomic_dec_unless_positive(v);
#else
        int cur = raw_atomic_read(v);

        for (;;) {
                if (unlikely(cur > 0))
                        return false;
                /* On failure, cur is refreshed with the current value. */
                if (raw_atomic_try_cmpxchg(v, &cur, cur - 1))
                        return true;
        }
#endif
}

/**
 * raw_atomic_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v > 0), atomically decrements @v, so that @v becomes (@v - 1), with
 * full ordering. Otherwise @v is left untouched and only relaxed ordering is
 * provided.
 *
 * The arch operation is used when it is defined; otherwise the update is
 * retried with raw_atomic_try_cmpxchg() until it succeeds or the decremented
 * value would be negative.
 *
 * Safe to use in noinstr code; prefer atomic_dec_if_positive() elsewhere.
 *
 * Return: The old value of (@v - 1), regardless of whether @v was updated.
 */
static __always_inline int
raw_atomic_dec_if_positive(atomic_t *v)
{
#ifdef arch_atomic_dec_if_positive
        return arch_atomic_dec_if_positive(v);
#else
        int cur = raw_atomic_read(v);
        int next;

        for (;;) {
                next = cur - 1;
                if (unlikely(next < 0))
                        break;
                /* On failure, cur is refreshed with the current value. */
                if (raw_atomic_try_cmpxchg(v, &cur, next))
                        break;
        }

        return next;
#endif
}

#ifdef CONFIG_GENERIC_ATOMIC64
#include <asm-generic/atomic64.h>
#endif

/**
 * raw_atomic64_read() - atomic load with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically loads the value of @v with relaxed ordering.
 *
 * This is a direct wrapper around arch_atomic64_read(); no fallback is
 * provided here, so arch_atomic64_read() must be defined for this to build.
 *
 * Safe to use in noinstr code; prefer atomic64_read() elsewhere.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64
raw_atomic64_read(const atomic64_t *v)
{
        return arch_atomic64_read(v);
}

/**
 * raw_atomic64_read_acquire() - atomic load with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically loads the value of @v with acquire ordering.
 *
 * The arch operation is used when it is defined. Otherwise, if atomic64_t is
 * a native word, the counter is read with smp_load_acquire(); if it is not, a
 * relaxed raw_atomic64_read() is followed by an acquire fence.
 *
 * Safe to use in noinstr code; prefer atomic64_read_acquire() elsewhere.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64
raw_atomic64_read_acquire(const atomic64_t *v)
{
#ifdef arch_atomic64_read_acquire
        return arch_atomic64_read_acquire(v);
#else
        s64 val;

        if (!__native_word(atomic64_t)) {
                val = raw_atomic64_read(v);
                /* The fence must follow the relaxed load. */
                __atomic_acquire_fence();
        } else {
                val = smp_load_acquire(&v->counter);
        }

        return val;
#endif
}

/**
 * raw_atomic64_set() - atomic set with relaxed ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to assign
 *
 * Atomically sets @v to @i with relaxed ordering.
 *
 * This is a direct wrapper around arch_atomic64_set(); no fallback is
 * provided here, so arch_atomic64_set() must be defined for this to build.
 *
 * Safe to use in noinstr code; prefer atomic64_set() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_set(atomic64_t *v, s64 i)
{
        arch_atomic64_set(v, i);
}

/**
 * raw_atomic64_set_release() - atomic set with release ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to assign
 *
 * Atomically sets @v to @i with release ordering.
 *
 * The arch operation is used when it is defined. Otherwise, if atomic64_t is
 * a native word, the counter is written with smp_store_release(); if it is
 * not, a release fence is followed by a relaxed raw_atomic64_set().
 *
 * Safe to use in noinstr code; prefer atomic64_set_release() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_set_release(atomic64_t *v, s64 i)
{
#ifdef arch_atomic64_set_release
        arch_atomic64_set_release(v, i);
#else
        if (!__native_word(atomic64_t)) {
                /* The fence must precede the relaxed store. */
                __atomic_release_fence();
                raw_atomic64_set(v, i);
        } else {
                smp_store_release(&v->counter, i);
        }
#endif
}

/**
 * raw_atomic64_add() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * This is a direct wrapper around arch_atomic64_add(); no fallback is
 * provided here, so arch_atomic64_add() must be defined for this to build.
 *
 * Safe to use in noinstr code; prefer atomic64_add() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_add(s64 i, atomic64_t *v)
{
        arch_atomic64_add(i, v);
}

/**
 * raw_atomic64_add_return() - atomic add with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically adds @i to @v, so that @v becomes (@v + @i), with full ordering.
 *
 * The fully ordered arch operation is used when it is defined; otherwise the
 * relaxed arch operation is bracketed by the pre/post full fences.
 *
 * Safe to use in noinstr code; prefer atomic64_add_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_add_return(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_add_return
        return arch_atomic64_add_return(i, v);
#elif defined(arch_atomic64_add_return_relaxed)
        s64 sum;

        __atomic_pre_full_fence();
        sum = arch_atomic64_add_return_relaxed(i, v);
        __atomic_post_full_fence();

        return sum;
#else
#error "Unable to define raw_atomic64_add_return"
#endif
}

/**
 * raw_atomic64_add_return_acquire() - atomic add with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically adds @i to @v, so that @v becomes (@v + @i), with acquire
 * ordering.
 *
 * Selection order: the arch acquire operation, then the relaxed arch
 * operation followed by an acquire fence, then the fully ordered arch
 * operation.
 *
 * Safe to use in noinstr code; prefer atomic64_add_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_add_return_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_add_return_acquire
        return arch_atomic64_add_return_acquire(i, v);
#elif defined(arch_atomic64_add_return_relaxed)
        s64 sum;

        sum = arch_atomic64_add_return_relaxed(i, v);
        /* The fence must follow the relaxed operation. */
        __atomic_acquire_fence();

        return sum;
#elif defined(arch_atomic64_add_return)
        return arch_atomic64_add_return(i, v);
#else
#error "Unable to define raw_atomic64_add_return_acquire"
#endif
}

/**
 * raw_atomic64_add_return_release() - atomic add with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically adds @i to @v, so that @v becomes (@v + @i), with release
 * ordering.
 *
 * Selection order: the arch release operation, then a release fence followed
 * by the relaxed arch operation, then the fully ordered arch operation.
 *
 * Safe to use in noinstr code; prefer atomic64_add_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_add_return_release(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_add_return_release
        return arch_atomic64_add_return_release(i, v);
#elif defined(arch_atomic64_add_return_relaxed)
        s64 sum;

        /* The fence must precede the relaxed operation. */
        __atomic_release_fence();
        sum = arch_atomic64_add_return_relaxed(i, v);

        return sum;
#elif defined(arch_atomic64_add_return)
        return arch_atomic64_add_return(i, v);
#else
#error "Unable to define raw_atomic64_add_return_release"
#endif
}

/**
 * raw_atomic64_add_return_relaxed() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically adds @i to @v, so that @v becomes (@v + @i), with relaxed
 * ordering.
 *
 * The relaxed arch operation is used when it is defined; otherwise the fully
 * ordered arch operation stands in for it.
 *
 * Safe to use in noinstr code; prefer atomic64_add_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_add_return_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_add_return_relaxed
        return arch_atomic64_add_return_relaxed(i, v);
#elif defined(arch_atomic64_add_return)
        return arch_atomic64_add_return(i, v);
#else
#error "Unable to define raw_atomic64_add_return_relaxed"
#endif
}

/**
 * raw_atomic64_fetch_add() - atomic add with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically adds @i to @v, so that @v becomes (@v + @i), with full ordering.
 *
 * The fully ordered arch operation is used when it is defined; otherwise the
 * relaxed arch operation is bracketed by the pre/post full fences.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_add
        return arch_atomic64_fetch_add(i, v);
#elif defined(arch_atomic64_fetch_add_relaxed)
        s64 prev;

        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_add_relaxed(i, v);
        __atomic_post_full_fence();

        return prev;
#else
#error "Unable to define raw_atomic64_fetch_add"
#endif
}

/**
 * raw_atomic64_fetch_add_acquire() - atomic add with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically adds @i to @v, so that @v becomes (@v + @i), with acquire
 * ordering.
 *
 * Selection order: the arch acquire operation, then the relaxed arch
 * operation followed by an acquire fence, then the fully ordered arch
 * operation.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_add_acquire
        return arch_atomic64_fetch_add_acquire(i, v);
#elif defined(arch_atomic64_fetch_add_relaxed)
        s64 prev;

        prev = arch_atomic64_fetch_add_relaxed(i, v);
        /* The fence must follow the relaxed operation. */
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic64_fetch_add)
        return arch_atomic64_fetch_add(i, v);
#else
#error "Unable to define raw_atomic64_fetch_add_acquire"
#endif
}

/**
 * raw_atomic64_fetch_add_release() - atomic add with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically adds @i to @v, so that @v becomes (@v + @i), with release
 * ordering.
 *
 * Selection order: the arch release operation, then a release fence followed
 * by the relaxed arch operation, then the fully ordered arch operation.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add_release(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_add_release
        return arch_atomic64_fetch_add_release(i, v);
#elif defined(arch_atomic64_fetch_add_relaxed)
        s64 prev;

        /* The fence must precede the relaxed operation. */
        __atomic_release_fence();
        prev = arch_atomic64_fetch_add_relaxed(i, v);

        return prev;
#elif defined(arch_atomic64_fetch_add)
        return arch_atomic64_fetch_add(i, v);
#else
#error "Unable to define raw_atomic64_fetch_add_release"
#endif
}

/**
 * raw_atomic64_fetch_add_relaxed() - relaxed atomic add returning the old value
 * @i: s64 amount added to @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v + @i); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_add_relaxed() instead.
 *
 * Return: the value @v held before the addition.
 */
static __always_inline s64 raw_atomic64_fetch_add_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_add_relaxed
        /* Use the relaxed arch op when it is defined. */
        return arch_atomic64_fetch_add_relaxed(i, v);
#elif defined(arch_atomic64_fetch_add)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_add(i, v);
#else
        /* No usable arch op is defined: fail the build. */
#error "Unable to define raw_atomic64_fetch_add_relaxed"
#endif
}

/**
 * raw_atomic64_sub() - relaxed atomic subtract with no return value
 * @i: s64 amount subtracted from @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - @i); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call atomic64_sub()
 * instead.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic64_sub(s64 i, atomic64_t *v)
{
        /* Forwarded unconditionally to the arch op; there is no fallback. */
        arch_atomic64_sub(i, v);
}

/**
 * raw_atomic64_sub_return() - fully ordered atomic subtract returning the new value
 * @i: s64 amount subtracted from @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - @i); the operation has full
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_sub_return() instead.
 *
 * Return: the value of @v after the subtraction.
 */
static __always_inline s64 raw_atomic64_sub_return(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_sub_return
        /* Use the arch op as-is when it is defined. */
        return arch_atomic64_sub_return(i, v);
#elif defined(arch_atomic64_sub_return_relaxed)
        s64 updated;

        /* Bracket the relaxed op with the pre/post full fences. */
        __atomic_pre_full_fence();
        updated = arch_atomic64_sub_return_relaxed(i, v);
        __atomic_post_full_fence();

        return updated;
#else
        /* Neither variant is defined: fail the build. */
#error "Unable to define raw_atomic64_sub_return"
#endif
}

/**
 * raw_atomic64_sub_return_acquire() - acquire-ordered atomic subtract returning the new value
 * @i: s64 amount subtracted from @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - @i); the operation has acquire
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_sub_return_acquire() instead.
 *
 * Return: the value of @v after the subtraction.
 */
static __always_inline s64 raw_atomic64_sub_return_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_sub_return_acquire
        /* Use the acquire arch op when it is defined. */
        return arch_atomic64_sub_return_acquire(i, v);
#elif defined(arch_atomic64_sub_return_relaxed)
        s64 updated;

        /* Relaxed op first, acquire fence afterwards. */
        updated = arch_atomic64_sub_return_relaxed(i, v);
        __atomic_acquire_fence();

        return updated;
#elif defined(arch_atomic64_sub_return)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_sub_return(i, v);
#else
        /* No usable arch op is defined: fail the build. */
#error "Unable to define raw_atomic64_sub_return_acquire"
#endif
}

/**
 * raw_atomic64_sub_return_release() - release-ordered atomic subtract returning the new value
 * @i: s64 amount subtracted from @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - @i); the operation has release
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_sub_return_release() instead.
 *
 * Return: the value of @v after the subtraction.
 */
static __always_inline s64 raw_atomic64_sub_return_release(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_sub_return_release
        /* Use the release arch op when it is defined. */
        return arch_atomic64_sub_return_release(i, v);
#elif defined(arch_atomic64_sub_return_relaxed)
        /* Release fence first, relaxed op afterwards. */
        __atomic_release_fence();
        return arch_atomic64_sub_return_relaxed(i, v);
#elif defined(arch_atomic64_sub_return)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_sub_return(i, v);
#else
        /* No usable arch op is defined: fail the build. */
#error "Unable to define raw_atomic64_sub_return_release"
#endif
}

/**
 * raw_atomic64_sub_return_relaxed() - relaxed atomic subtract returning the new value
 * @i: s64 amount subtracted from @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - @i); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_sub_return_relaxed() instead.
 *
 * Return: the value of @v after the subtraction.
 */
static __always_inline s64 raw_atomic64_sub_return_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_sub_return_relaxed
        /* Use the relaxed arch op when it is defined. */
        return arch_atomic64_sub_return_relaxed(i, v);
#elif defined(arch_atomic64_sub_return)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_sub_return(i, v);
#else
        /* No usable arch op is defined: fail the build. */
#error "Unable to define raw_atomic64_sub_return_relaxed"
#endif
}

/**
 * raw_atomic64_fetch_sub() - fully ordered atomic subtract returning the old value
 * @i: s64 amount subtracted from @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - @i); the operation has full
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_sub() instead.
 *
 * Return: the value @v held before the subtraction.
 */
static __always_inline s64 raw_atomic64_fetch_sub(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_sub
        /* Use the arch op as-is when it is defined. */
        return arch_atomic64_fetch_sub(i, v);
#elif defined(arch_atomic64_fetch_sub_relaxed)
        s64 old;

        /* Bracket the relaxed op with the pre/post full fences. */
        __atomic_pre_full_fence();
        old = arch_atomic64_fetch_sub_relaxed(i, v);
        __atomic_post_full_fence();

        return old;
#else
        /* Neither variant is defined: fail the build. */
#error "Unable to define raw_atomic64_fetch_sub"
#endif
}

/**
 * raw_atomic64_fetch_sub_acquire() - acquire-ordered atomic subtract returning the old value
 * @i: s64 amount subtracted from @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - @i); the operation has acquire
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_sub_acquire() instead.
 *
 * Return: the value @v held before the subtraction.
 */
static __always_inline s64 raw_atomic64_fetch_sub_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_sub_acquire
        /* Use the acquire arch op when it is defined. */
        return arch_atomic64_fetch_sub_acquire(i, v);
#elif defined(arch_atomic64_fetch_sub_relaxed)
        s64 old;

        /* Relaxed op first, acquire fence afterwards. */
        old = arch_atomic64_fetch_sub_relaxed(i, v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic64_fetch_sub)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_sub(i, v);
#else
        /* No usable arch op is defined: fail the build. */
#error "Unable to define raw_atomic64_fetch_sub_acquire"
#endif
}

/**
 * raw_atomic64_fetch_sub_release() - release-ordered atomic subtract returning the old value
 * @i: s64 amount subtracted from @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - @i); the operation has release
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_sub_release() instead.
 *
 * Return: the value @v held before the subtraction.
 */
static __always_inline s64 raw_atomic64_fetch_sub_release(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_sub_release
        /* Use the release arch op when it is defined. */
        return arch_atomic64_fetch_sub_release(i, v);
#elif defined(arch_atomic64_fetch_sub_relaxed)
        /* Release fence first, relaxed op afterwards. */
        __atomic_release_fence();
        return arch_atomic64_fetch_sub_relaxed(i, v);
#elif defined(arch_atomic64_fetch_sub)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_sub(i, v);
#else
        /* No usable arch op is defined: fail the build. */
#error "Unable to define raw_atomic64_fetch_sub_release"
#endif
}

/**
 * raw_atomic64_fetch_sub_relaxed() - relaxed atomic subtract returning the old value
 * @i: s64 amount subtracted from @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - @i); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_sub_relaxed() instead.
 *
 * Return: the value @v held before the subtraction.
 */
static __always_inline s64 raw_atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_sub_relaxed
        /* Use the relaxed arch op when it is defined. */
        return arch_atomic64_fetch_sub_relaxed(i, v);
#elif defined(arch_atomic64_fetch_sub)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_sub(i, v);
#else
        /* No usable arch op is defined: fail the build. */
#error "Unable to define raw_atomic64_fetch_sub_relaxed"
#endif
}

/**
 * raw_atomic64_inc() - relaxed atomic increment with no return value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v + 1); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call atomic64_inc()
 * instead.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic64_inc(atomic64_t *v)
{
#ifdef arch_atomic64_inc
        /* Use the dedicated arch increment when it is defined. */
        arch_atomic64_inc(v);
#else
        /* No dedicated increment: express it as adding 1. */
        raw_atomic64_add(1, v);
#endif
}

/**
 * raw_atomic64_inc_return() - fully ordered atomic increment returning the new value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v + 1); the operation has full
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_inc_return() instead.
 *
 * Return: the value of @v after the increment.
 */
static __always_inline s64 raw_atomic64_inc_return(atomic64_t *v)
{
#ifdef arch_atomic64_inc_return
        /* Use the arch op as-is when it is defined. */
        return arch_atomic64_inc_return(v);
#elif defined(arch_atomic64_inc_return_relaxed)
        s64 updated;

        /* Bracket the relaxed op with the pre/post full fences. */
        __atomic_pre_full_fence();
        updated = arch_atomic64_inc_return_relaxed(v);
        __atomic_post_full_fence();

        return updated;
#else
        /* No dedicated increment: express it as adding 1. */
        return raw_atomic64_add_return(1, v);
#endif
}

/**
 * raw_atomic64_inc_return_acquire() - acquire-ordered atomic increment returning the new value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v + 1); the operation has acquire
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_inc_return_acquire() instead.
 *
 * Return: the value of @v after the increment.
 */
static __always_inline s64 raw_atomic64_inc_return_acquire(atomic64_t *v)
{
#ifdef arch_atomic64_inc_return_acquire
        /* Use the acquire arch op when it is defined. */
        return arch_atomic64_inc_return_acquire(v);
#elif defined(arch_atomic64_inc_return_relaxed)
        s64 updated;

        /* Relaxed op first, acquire fence afterwards. */
        updated = arch_atomic64_inc_return_relaxed(v);
        __atomic_acquire_fence();

        return updated;
#elif defined(arch_atomic64_inc_return)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_inc_return(v);
#else
        /* No dedicated increment: express it as adding 1. */
        return raw_atomic64_add_return_acquire(1, v);
#endif
}

/**
 * raw_atomic64_inc_return_release() - release-ordered atomic increment returning the new value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v + 1); the operation has release
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_inc_return_release() instead.
 *
 * Return: the value of @v after the increment.
 */
static __always_inline s64 raw_atomic64_inc_return_release(atomic64_t *v)
{
#ifdef arch_atomic64_inc_return_release
        /* Use the release arch op when it is defined. */
        return arch_atomic64_inc_return_release(v);
#elif defined(arch_atomic64_inc_return_relaxed)
        /* Release fence first, relaxed op afterwards. */
        __atomic_release_fence();
        return arch_atomic64_inc_return_relaxed(v);
#elif defined(arch_atomic64_inc_return)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_inc_return(v);
#else
        /* No dedicated increment: express it as adding 1. */
        return raw_atomic64_add_return_release(1, v);
#endif
}

/**
 * raw_atomic64_inc_return_relaxed() - relaxed atomic increment returning the new value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v + 1); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_inc_return_relaxed() instead.
 *
 * Return: the value of @v after the increment.
 */
static __always_inline s64 raw_atomic64_inc_return_relaxed(atomic64_t *v)
{
#ifdef arch_atomic64_inc_return_relaxed
        /* Use the relaxed arch op when it is defined. */
        return arch_atomic64_inc_return_relaxed(v);
#elif defined(arch_atomic64_inc_return)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_inc_return(v);
#else
        /* No dedicated increment: express it as adding 1. */
        return raw_atomic64_add_return_relaxed(1, v);
#endif
}

/**
 * raw_atomic64_fetch_inc() - fully ordered atomic increment returning the old value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v + 1); the operation has full
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_inc() instead.
 *
 * Return: the value @v held before the increment.
 */
static __always_inline s64 raw_atomic64_fetch_inc(atomic64_t *v)
{
#ifdef arch_atomic64_fetch_inc
        /* Use the arch op as-is when it is defined. */
        return arch_atomic64_fetch_inc(v);
#elif defined(arch_atomic64_fetch_inc_relaxed)
        s64 old;

        /* Bracket the relaxed op with the pre/post full fences. */
        __atomic_pre_full_fence();
        old = arch_atomic64_fetch_inc_relaxed(v);
        __atomic_post_full_fence();

        return old;
#else
        /* No dedicated increment: express it as adding 1. */
        return raw_atomic64_fetch_add(1, v);
#endif
}

/**
 * raw_atomic64_fetch_inc_acquire() - acquire-ordered atomic increment returning the old value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v + 1); the operation has acquire
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_inc_acquire() instead.
 *
 * Return: the value @v held before the increment.
 */
static __always_inline s64 raw_atomic64_fetch_inc_acquire(atomic64_t *v)
{
#ifdef arch_atomic64_fetch_inc_acquire
        /* Use the acquire arch op when it is defined. */
        return arch_atomic64_fetch_inc_acquire(v);
#elif defined(arch_atomic64_fetch_inc_relaxed)
        s64 old;

        /* Relaxed op first, acquire fence afterwards. */
        old = arch_atomic64_fetch_inc_relaxed(v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic64_fetch_inc)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_inc(v);
#else
        /* No dedicated increment: express it as adding 1. */
        return raw_atomic64_fetch_add_acquire(1, v);
#endif
}

/**
 * raw_atomic64_fetch_inc_release() - release-ordered atomic increment returning the old value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v + 1); the operation has release
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_inc_release() instead.
 *
 * Return: the value @v held before the increment.
 */
static __always_inline s64 raw_atomic64_fetch_inc_release(atomic64_t *v)
{
#ifdef arch_atomic64_fetch_inc_release
        /* Use the release arch op when it is defined. */
        return arch_atomic64_fetch_inc_release(v);
#elif defined(arch_atomic64_fetch_inc_relaxed)
        /* Release fence first, relaxed op afterwards. */
        __atomic_release_fence();
        return arch_atomic64_fetch_inc_relaxed(v);
#elif defined(arch_atomic64_fetch_inc)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_inc(v);
#else
        /* No dedicated increment: express it as adding 1. */
        return raw_atomic64_fetch_add_release(1, v);
#endif
}

/**
 * raw_atomic64_fetch_inc_relaxed() - relaxed atomic increment returning the old value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v + 1); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_inc_relaxed() instead.
 *
 * Return: the value @v held before the increment.
 */
static __always_inline s64 raw_atomic64_fetch_inc_relaxed(atomic64_t *v)
{
#ifdef arch_atomic64_fetch_inc_relaxed
        /* Use the relaxed arch op when it is defined. */
        return arch_atomic64_fetch_inc_relaxed(v);
#elif defined(arch_atomic64_fetch_inc)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_inc(v);
#else
        /* No dedicated increment: express it as adding 1. */
        return raw_atomic64_fetch_add_relaxed(1, v);
#endif
}

/**
 * raw_atomic64_dec() - relaxed atomic decrement with no return value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - 1); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call atomic64_dec()
 * instead.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic64_dec(atomic64_t *v)
{
#ifdef arch_atomic64_dec
        /* Use the dedicated arch decrement when it is defined. */
        arch_atomic64_dec(v);
#else
        /* No dedicated decrement: express it as subtracting 1. */
        raw_atomic64_sub(1, v);
#endif
}

/**
 * raw_atomic64_dec_return() - fully ordered atomic decrement returning the new value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - 1); the operation has full
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_dec_return() instead.
 *
 * Return: the value of @v after the decrement.
 */
static __always_inline s64 raw_atomic64_dec_return(atomic64_t *v)
{
#ifdef arch_atomic64_dec_return
        /* Use the arch op as-is when it is defined. */
        return arch_atomic64_dec_return(v);
#elif defined(arch_atomic64_dec_return_relaxed)
        s64 updated;

        /* Bracket the relaxed op with the pre/post full fences. */
        __atomic_pre_full_fence();
        updated = arch_atomic64_dec_return_relaxed(v);
        __atomic_post_full_fence();

        return updated;
#else
        /* No dedicated decrement: express it as subtracting 1. */
        return raw_atomic64_sub_return(1, v);
#endif
}

/**
 * raw_atomic64_dec_return_acquire() - acquire-ordered atomic decrement returning the new value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - 1); the operation has acquire
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_dec_return_acquire() instead.
 *
 * Return: the value of @v after the decrement.
 */
static __always_inline s64 raw_atomic64_dec_return_acquire(atomic64_t *v)
{
#ifdef arch_atomic64_dec_return_acquire
        /* Use the acquire arch op when it is defined. */
        return arch_atomic64_dec_return_acquire(v);
#elif defined(arch_atomic64_dec_return_relaxed)
        s64 updated;

        /* Relaxed op first, acquire fence afterwards. */
        updated = arch_atomic64_dec_return_relaxed(v);
        __atomic_acquire_fence();

        return updated;
#elif defined(arch_atomic64_dec_return)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_dec_return(v);
#else
        /* No dedicated decrement: express it as subtracting 1. */
        return raw_atomic64_sub_return_acquire(1, v);
#endif
}

/**
 * raw_atomic64_dec_return_release() - release-ordered atomic decrement returning the new value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - 1); the operation has release
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_dec_return_release() instead.
 *
 * Return: the value of @v after the decrement.
 */
static __always_inline s64 raw_atomic64_dec_return_release(atomic64_t *v)
{
#ifdef arch_atomic64_dec_return_release
        /* Use the release arch op when it is defined. */
        return arch_atomic64_dec_return_release(v);
#elif defined(arch_atomic64_dec_return_relaxed)
        /* Release fence first, relaxed op afterwards. */
        __atomic_release_fence();
        return arch_atomic64_dec_return_relaxed(v);
#elif defined(arch_atomic64_dec_return)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_dec_return(v);
#else
        /* No dedicated decrement: express it as subtracting 1. */
        return raw_atomic64_sub_return_release(1, v);
#endif
}

/**
 * raw_atomic64_dec_return_relaxed() - relaxed atomic decrement returning the new value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - 1); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_dec_return_relaxed() instead.
 *
 * Return: the value of @v after the decrement.
 */
static __always_inline s64 raw_atomic64_dec_return_relaxed(atomic64_t *v)
{
#ifdef arch_atomic64_dec_return_relaxed
        /* Use the relaxed arch op when it is defined. */
        return arch_atomic64_dec_return_relaxed(v);
#elif defined(arch_atomic64_dec_return)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_dec_return(v);
#else
        /* No dedicated decrement: express it as subtracting 1. */
        return raw_atomic64_sub_return_relaxed(1, v);
#endif
}

/**
 * raw_atomic64_fetch_dec() - fully ordered atomic decrement returning the old value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - 1); the operation has full
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_dec() instead.
 *
 * Return: the value @v held before the decrement.
 */
static __always_inline s64 raw_atomic64_fetch_dec(atomic64_t *v)
{
#ifdef arch_atomic64_fetch_dec
        /* Use the arch op as-is when it is defined. */
        return arch_atomic64_fetch_dec(v);
#elif defined(arch_atomic64_fetch_dec_relaxed)
        s64 old;

        /* Bracket the relaxed op with the pre/post full fences. */
        __atomic_pre_full_fence();
        old = arch_atomic64_fetch_dec_relaxed(v);
        __atomic_post_full_fence();

        return old;
#else
        /* No dedicated decrement: express it as subtracting 1. */
        return raw_atomic64_fetch_sub(1, v);
#endif
}

/**
 * raw_atomic64_fetch_dec_acquire() - acquire-ordered atomic decrement returning the old value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - 1); the operation has acquire
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_dec_acquire() instead.
 *
 * Return: the value @v held before the decrement.
 */
static __always_inline s64 raw_atomic64_fetch_dec_acquire(atomic64_t *v)
{
#ifdef arch_atomic64_fetch_dec_acquire
        /* Use the acquire arch op when it is defined. */
        return arch_atomic64_fetch_dec_acquire(v);
#elif defined(arch_atomic64_fetch_dec_relaxed)
        s64 old;

        /* Relaxed op first, acquire fence afterwards. */
        old = arch_atomic64_fetch_dec_relaxed(v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic64_fetch_dec)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_dec(v);
#else
        /* No dedicated decrement: express it as subtracting 1. */
        return raw_atomic64_fetch_sub_acquire(1, v);
#endif
}

/**
 * raw_atomic64_fetch_dec_release() - release-ordered atomic decrement returning the old value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - 1); the operation has release
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_dec_release() instead.
 *
 * Return: the value @v held before the decrement.
 */
static __always_inline s64 raw_atomic64_fetch_dec_release(atomic64_t *v)
{
#ifdef arch_atomic64_fetch_dec_release
        /* Use the release arch op when it is defined. */
        return arch_atomic64_fetch_dec_release(v);
#elif defined(arch_atomic64_fetch_dec_relaxed)
        /* Release fence first, relaxed op afterwards. */
        __atomic_release_fence();
        return arch_atomic64_fetch_dec_relaxed(v);
#elif defined(arch_atomic64_fetch_dec)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_dec(v);
#else
        /* No dedicated decrement: express it as subtracting 1. */
        return raw_atomic64_fetch_sub_release(1, v);
#endif
}

/**
 * raw_atomic64_fetch_dec_relaxed() - relaxed atomic decrement returning the old value
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v - 1); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_dec_relaxed() instead.
 *
 * Return: the value @v held before the decrement.
 */
static __always_inline s64 raw_atomic64_fetch_dec_relaxed(atomic64_t *v)
{
#ifdef arch_atomic64_fetch_dec_relaxed
        /* Use the relaxed arch op when it is defined. */
        return arch_atomic64_fetch_dec_relaxed(v);
#elif defined(arch_atomic64_fetch_dec)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_dec(v);
#else
        /* No dedicated decrement: express it as subtracting 1. */
        return raw_atomic64_fetch_sub_relaxed(1, v);
#endif
}

/**
 * raw_atomic64_and() - relaxed atomic bitwise AND with no return value
 * @i: s64 mask ANDed into @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v & @i); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call atomic64_and()
 * instead.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic64_and(s64 i, atomic64_t *v)
{
        /* Forwarded unconditionally to the arch op; there is no fallback. */
        arch_atomic64_and(i, v);
}

/**
 * raw_atomic64_fetch_and() - fully ordered atomic bitwise AND returning the old value
 * @i: s64 mask ANDed into @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v & @i); the operation has full
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_and() instead.
 *
 * Return: the value @v held before the AND.
 */
static __always_inline s64 raw_atomic64_fetch_and(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_and
        /* Use the arch op as-is when it is defined. */
        return arch_atomic64_fetch_and(i, v);
#elif defined(arch_atomic64_fetch_and_relaxed)
        s64 old;

        /* Bracket the relaxed op with the pre/post full fences. */
        __atomic_pre_full_fence();
        old = arch_atomic64_fetch_and_relaxed(i, v);
        __atomic_post_full_fence();

        return old;
#else
        /* Neither variant is defined: fail the build. */
#error "Unable to define raw_atomic64_fetch_and"
#endif
}

/**
 * raw_atomic64_fetch_and_acquire() - acquire-ordered atomic bitwise AND returning the old value
 * @i: s64 mask ANDed into @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v & @i); the operation has acquire
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_and_acquire() instead.
 *
 * Return: the value @v held before the AND.
 */
static __always_inline s64 raw_atomic64_fetch_and_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_and_acquire
        /* Use the acquire arch op when it is defined. */
        return arch_atomic64_fetch_and_acquire(i, v);
#elif defined(arch_atomic64_fetch_and_relaxed)
        s64 old;

        /* Relaxed op first, acquire fence afterwards. */
        old = arch_atomic64_fetch_and_relaxed(i, v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic64_fetch_and)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_and(i, v);
#else
        /* No usable arch op is defined: fail the build. */
#error "Unable to define raw_atomic64_fetch_and_acquire"
#endif
}

/**
 * raw_atomic64_fetch_and_release() - release-ordered atomic bitwise AND returning the old value
 * @i: s64 mask ANDed into @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v & @i); the operation has release
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_and_release() instead.
 *
 * Return: the value @v held before the AND.
 */
static __always_inline s64 raw_atomic64_fetch_and_release(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_and_release
        /* Use the release arch op when it is defined. */
        return arch_atomic64_fetch_and_release(i, v);
#elif defined(arch_atomic64_fetch_and_relaxed)
        /* Release fence first, relaxed op afterwards. */
        __atomic_release_fence();
        return arch_atomic64_fetch_and_relaxed(i, v);
#elif defined(arch_atomic64_fetch_and)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_and(i, v);
#else
        /* No usable arch op is defined: fail the build. */
#error "Unable to define raw_atomic64_fetch_and_release"
#endif
}

/**
 * raw_atomic64_fetch_and_relaxed() - relaxed atomic bitwise AND returning the old value
 * @i: s64 mask ANDed into @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v & @i); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_and_relaxed() instead.
 *
 * Return: the value @v held before the AND.
 */
static __always_inline s64 raw_atomic64_fetch_and_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_and_relaxed
        /* Use the relaxed arch op when it is defined. */
        return arch_atomic64_fetch_and_relaxed(i, v);
#elif defined(arch_atomic64_fetch_and)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_and(i, v);
#else
        /* No usable arch op is defined: fail the build. */
#error "Unable to define raw_atomic64_fetch_and_relaxed"
#endif
}

/**
 * raw_atomic64_andnot() - relaxed atomic bitwise AND NOT with no return value
 * @i: s64 mask whose set bits are cleared in @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v & ~@i); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call atomic64_andnot()
 * instead.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic64_andnot(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_andnot
        /* Use the dedicated arch AND NOT when it is defined. */
        arch_atomic64_andnot(i, v);
#else
        /* No dedicated AND NOT: AND with the complement of @i instead. */
        raw_atomic64_and(~i, v);
#endif
}

/**
 * raw_atomic64_fetch_andnot() - fully ordered atomic bitwise AND NOT returning the old value
 * @i: s64 mask whose set bits are cleared in @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v & ~@i); the operation has full
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_andnot() instead.
 *
 * Return: the value @v held before the AND NOT.
 */
static __always_inline s64 raw_atomic64_fetch_andnot(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_andnot
        /* Use the arch op as-is when it is defined. */
        return arch_atomic64_fetch_andnot(i, v);
#elif defined(arch_atomic64_fetch_andnot_relaxed)
        s64 old;

        /* Bracket the relaxed op with the pre/post full fences. */
        __atomic_pre_full_fence();
        old = arch_atomic64_fetch_andnot_relaxed(i, v);
        __atomic_post_full_fence();

        return old;
#else
        /* No dedicated AND NOT: AND with the complement of @i instead. */
        return raw_atomic64_fetch_and(~i, v);
#endif
}

/**
 * raw_atomic64_fetch_andnot_acquire() - acquire-ordered atomic bitwise AND NOT returning the old value
 * @i: s64 mask whose set bits are cleared in @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v & ~@i); the operation has acquire
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_andnot_acquire() instead.
 *
 * Return: the value @v held before the AND NOT.
 */
static __always_inline s64 raw_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_andnot_acquire
        /* Use the acquire arch op when it is defined. */
        return arch_atomic64_fetch_andnot_acquire(i, v);
#elif defined(arch_atomic64_fetch_andnot_relaxed)
        s64 old;

        /* Relaxed op first, acquire fence afterwards. */
        old = arch_atomic64_fetch_andnot_relaxed(i, v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic64_fetch_andnot)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_andnot(i, v);
#else
        /* No dedicated AND NOT: AND with the complement of @i instead. */
        return raw_atomic64_fetch_and_acquire(~i, v);
#endif
}

/**
 * raw_atomic64_fetch_andnot_release() - release-ordered atomic bitwise AND NOT returning the old value
 * @i: s64 mask whose set bits are cleared in @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v & ~@i); the operation has release
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_andnot_release() instead.
 *
 * Return: the value @v held before the AND NOT.
 */
static __always_inline s64 raw_atomic64_fetch_andnot_release(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_andnot_release
        /* Use the release arch op when it is defined. */
        return arch_atomic64_fetch_andnot_release(i, v);
#elif defined(arch_atomic64_fetch_andnot_relaxed)
        /* Release fence first, relaxed op afterwards. */
        __atomic_release_fence();
        return arch_atomic64_fetch_andnot_relaxed(i, v);
#elif defined(arch_atomic64_fetch_andnot)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_andnot(i, v);
#else
        /* No dedicated AND NOT: AND with the complement of @i instead. */
        return raw_atomic64_fetch_and_release(~i, v);
#endif
}

/**
 * raw_atomic64_fetch_andnot_relaxed() - relaxed atomic bitwise AND NOT returning the old value
 * @i: s64 mask whose set bits are cleared in @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v & ~@i); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_andnot_relaxed() instead.
 *
 * Return: the value @v held before the AND NOT.
 */
static __always_inline s64 raw_atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_andnot_relaxed
        /* Use the relaxed arch op when it is defined. */
        return arch_atomic64_fetch_andnot_relaxed(i, v);
#elif defined(arch_atomic64_fetch_andnot)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_andnot(i, v);
#else
        /* No dedicated AND NOT: AND with the complement of @i instead. */
        return raw_atomic64_fetch_and_relaxed(~i, v);
#endif
}

/**
 * raw_atomic64_or() - relaxed atomic bitwise OR with no return value
 * @i: s64 mask ORed into @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v | @i); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call atomic64_or()
 * instead.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic64_or(s64 i, atomic64_t *v)
{
        /* Forwarded unconditionally to the arch op; there is no fallback. */
        arch_atomic64_or(i, v);
}

/**
 * raw_atomic64_fetch_or() - fully ordered atomic bitwise OR returning the old value
 * @i: s64 mask ORed into @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v | @i); the operation has full
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_or() instead.
 *
 * Return: the value @v held before the OR.
 */
static __always_inline s64 raw_atomic64_fetch_or(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_or
        /* Use the arch op as-is when it is defined. */
        return arch_atomic64_fetch_or(i, v);
#elif defined(arch_atomic64_fetch_or_relaxed)
        s64 old;

        /* Bracket the relaxed op with the pre/post full fences. */
        __atomic_pre_full_fence();
        old = arch_atomic64_fetch_or_relaxed(i, v);
        __atomic_post_full_fence();

        return old;
#else
        /* Neither variant is defined: fail the build. */
#error "Unable to define raw_atomic64_fetch_or"
#endif
}

/**
 * raw_atomic64_fetch_or_acquire() - acquire-ordered atomic bitwise OR returning the old value
 * @i: s64 mask ORed into @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v | @i); the operation has acquire
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_or_acquire() instead.
 *
 * Return: the value @v held before the OR.
 */
static __always_inline s64 raw_atomic64_fetch_or_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_or_acquire
        /* Use the acquire arch op when it is defined. */
        return arch_atomic64_fetch_or_acquire(i, v);
#elif defined(arch_atomic64_fetch_or_relaxed)
        s64 old;

        /* Relaxed op first, acquire fence afterwards. */
        old = arch_atomic64_fetch_or_relaxed(i, v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic64_fetch_or)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_or(i, v);
#else
        /* No usable arch op is defined: fail the build. */
#error "Unable to define raw_atomic64_fetch_or_acquire"
#endif
}

/**
 * raw_atomic64_fetch_or_release() - release-ordered atomic bitwise OR returning the old value
 * @i: s64 mask ORed into @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v | @i); the operation has release
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_or_release() instead.
 *
 * Return: the value @v held before the OR.
 */
static __always_inline s64 raw_atomic64_fetch_or_release(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_or_release
        /* Use the release arch op when it is defined. */
        return arch_atomic64_fetch_or_release(i, v);
#elif defined(arch_atomic64_fetch_or_relaxed)
        /* Release fence first, relaxed op afterwards. */
        __atomic_release_fence();
        return arch_atomic64_fetch_or_relaxed(i, v);
#elif defined(arch_atomic64_fetch_or)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_or(i, v);
#else
        /* No usable arch op is defined: fail the build. */
#error "Unable to define raw_atomic64_fetch_or_release"
#endif
}

/**
 * raw_atomic64_fetch_or_relaxed() - relaxed atomic bitwise OR returning the old value
 * @i: s64 mask ORed into @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v | @i); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_or_relaxed() instead.
 *
 * Return: the value @v held before the OR.
 */
static __always_inline s64 raw_atomic64_fetch_or_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_or_relaxed
        /* Use the relaxed arch op when it is defined. */
        return arch_atomic64_fetch_or_relaxed(i, v);
#elif defined(arch_atomic64_fetch_or)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_or(i, v);
#else
        /* No usable arch op is defined: fail the build. */
#error "Unable to define raw_atomic64_fetch_or_relaxed"
#endif
}

/**
 * raw_atomic64_xor() - relaxed atomic bitwise XOR with no return value
 * @i: s64 mask XORed into @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v ^ @i); the operation has relaxed
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call atomic64_xor()
 * instead.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic64_xor(s64 i, atomic64_t *v)
{
        /* Forwarded unconditionally to the arch op; there is no fallback. */
        arch_atomic64_xor(i, v);
}

/**
 * raw_atomic64_fetch_xor() - fully ordered atomic bitwise XOR returning the old value
 * @i: s64 mask XORed into @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v ^ @i); the operation has full
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_xor() instead.
 *
 * Return: the value @v held before the XOR.
 */
static __always_inline s64 raw_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_xor
        /* Use the arch op as-is when it is defined. */
        return arch_atomic64_fetch_xor(i, v);
#elif defined(arch_atomic64_fetch_xor_relaxed)
        s64 old;

        /* Bracket the relaxed op with the pre/post full fences. */
        __atomic_pre_full_fence();
        old = arch_atomic64_fetch_xor_relaxed(i, v);
        __atomic_post_full_fence();

        return old;
#else
        /* Neither variant is defined: fail the build. */
#error "Unable to define raw_atomic64_fetch_xor"
#endif
}

/**
 * raw_atomic64_fetch_xor_acquire() - acquire-ordered atomic bitwise XOR returning the old value
 * @i: s64 mask XORed into @v
 * @v: the atomic64_t being modified
 *
 * As one atomic step, replaces @v with (@v ^ @i); the operation has acquire
 * ordering.
 *
 * Usable from noinstr code. Outside of noinstr code, call
 * atomic64_fetch_xor_acquire() instead.
 *
 * Return: the value @v held before the XOR.
 */
static __always_inline s64 raw_atomic64_fetch_xor_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_xor_acquire
        /* Use the acquire arch op when it is defined. */
        return arch_atomic64_fetch_xor_acquire(i, v);
#elif defined(arch_atomic64_fetch_xor_relaxed)
        s64 old;

        /* Relaxed op first, acquire fence afterwards. */
        old = arch_atomic64_fetch_xor_relaxed(i, v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic64_fetch_xor)
        /* Otherwise use the unsuffixed arch op. */
        return arch_atomic64_fetch_xor(i, v);
#else
        /* No usable arch op is defined: fail the build. */
#error "Unable to define raw_atomic64_fetch_xor_acquire"
#endif
}

/**
 * raw_atomic64_fetch_xor_release() - atomic bitwise XOR with release ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with release ordering.
 *
 * The implementation is selected by the preprocessor, in this order: the
 * _release arch op; a release fence followed by the _relaxed arch op; the
 * fully ordered arch op. The build fails if none of these macros is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_xor_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_xor_release(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_xor_release)
        return arch_atomic64_fetch_xor_release(i, v);
#elif defined(arch_atomic64_fetch_xor_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();
        return arch_atomic64_fetch_xor_relaxed(i, v);
#elif defined(arch_atomic64_fetch_xor)
        /* Last resort: the fully ordered op stands in for the release form. */
        return arch_atomic64_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic64_fetch_xor_release"
#endif
}

/**
 * raw_atomic64_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * The implementation is selected by the preprocessor: the _relaxed arch op if
 * that macro is defined, otherwise the fully ordered arch op. The build fails
 * if neither macro is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_xor_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_xor_relaxed)
        return arch_atomic64_fetch_xor_relaxed(i, v);
#elif defined(arch_atomic64_fetch_xor)
        /* No relaxed op available: use the fully ordered one instead. */
        return arch_atomic64_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic64_fetch_xor_relaxed"
#endif
}

/**
 * raw_atomic64_xchg() - atomic exchange with full ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically updates @v to @new with full ordering.
 *
 * The implementation is selected by the preprocessor: arch_atomic64_xchg() if
 * that macro is defined; otherwise arch_atomic64_xchg_relaxed() bracketed by
 * the pre/post full fences; otherwise raw_xchg() applied to @v->counter.
 *
 * Safe to use in noinstr code; prefer atomic64_xchg() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_xchg(atomic64_t *v, s64 new)
{
#if defined(arch_atomic64_xchg)
        return arch_atomic64_xchg(v, new);
#elif defined(arch_atomic64_xchg_relaxed)
        s64 ret;
        /* Statement order is significant: fence, relaxed op, fence. */
        __atomic_pre_full_fence();
        ret = arch_atomic64_xchg_relaxed(v, new);
        __atomic_post_full_fence();
        return ret;
#else
        /* No arch op at all: operate on the counter field directly. */
        return raw_xchg(&v->counter, new);
#endif
}

/**
 * raw_atomic64_xchg_acquire() - atomic exchange with acquire ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically updates @v to @new with acquire ordering.
 *
 * The implementation is selected by the preprocessor, in this order: the
 * _acquire arch op; the _relaxed arch op followed by an acquire fence; the
 * fully ordered arch op; raw_xchg_acquire() applied to @v->counter.
 *
 * Safe to use in noinstr code; prefer atomic64_xchg_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_xchg_acquire(atomic64_t *v, s64 new)
{
#if defined(arch_atomic64_xchg_acquire)
        return arch_atomic64_xchg_acquire(v, new);
#elif defined(arch_atomic64_xchg_relaxed)
        /* Relaxed op first, acquire fence after it. */
        s64 ret = arch_atomic64_xchg_relaxed(v, new);
        __atomic_acquire_fence();
        return ret;
#elif defined(arch_atomic64_xchg)
        /* The fully ordered op stands in for the acquire form. */
        return arch_atomic64_xchg(v, new);
#else
        /* No arch op at all: operate on the counter field directly. */
        return raw_xchg_acquire(&v->counter, new);
#endif
}

/**
 * raw_atomic64_xchg_release() - atomic exchange with release ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically updates @v to @new with release ordering.
 *
 * The implementation is selected by the preprocessor, in this order: the
 * _release arch op; a release fence followed by the _relaxed arch op; the
 * fully ordered arch op; raw_xchg_release() applied to @v->counter.
 *
 * Safe to use in noinstr code; prefer atomic64_xchg_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_xchg_release(atomic64_t *v, s64 new)
{
#if defined(arch_atomic64_xchg_release)
        return arch_atomic64_xchg_release(v, new);
#elif defined(arch_atomic64_xchg_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();
        return arch_atomic64_xchg_relaxed(v, new);
#elif defined(arch_atomic64_xchg)
        /* The fully ordered op stands in for the release form. */
        return arch_atomic64_xchg(v, new);
#else
        /* No arch op at all: operate on the counter field directly. */
        return raw_xchg_release(&v->counter, new);
#endif
}

/**
 * raw_atomic64_xchg_relaxed() - atomic exchange with relaxed ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically updates @v to @new with relaxed ordering.
 *
 * The implementation is selected by the preprocessor: the _relaxed arch op if
 * that macro is defined; otherwise the fully ordered arch op; otherwise
 * raw_xchg_relaxed() applied to @v->counter.
 *
 * Safe to use in noinstr code; prefer atomic64_xchg_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_xchg_relaxed(atomic64_t *v, s64 new)
{
#if defined(arch_atomic64_xchg_relaxed)
        return arch_atomic64_xchg_relaxed(v, new);
#elif defined(arch_atomic64_xchg)
        /* No relaxed op available: use the fully ordered one instead. */
        return arch_atomic64_xchg(v, new);
#else
        /* No arch op at all: operate on the counter field directly. */
        return raw_xchg_relaxed(&v->counter, new);
#endif
}

/**
 * raw_atomic64_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * The implementation is selected by the preprocessor: arch_atomic64_cmpxchg()
 * if that macro is defined; otherwise arch_atomic64_cmpxchg_relaxed()
 * bracketed by the pre/post full fences; otherwise raw_cmpxchg() applied to
 * @v->counter.
 *
 * Safe to use in noinstr code; prefer atomic64_cmpxchg() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
#if defined(arch_atomic64_cmpxchg)
        return arch_atomic64_cmpxchg(v, old, new);
#elif defined(arch_atomic64_cmpxchg_relaxed)
        s64 ret;
        /* Statement order is significant: fence, relaxed op, fence. */
        __atomic_pre_full_fence();
        ret = arch_atomic64_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();
        return ret;
#else
        /* No arch op at all: operate on the counter field directly. */
        return raw_cmpxchg(&v->counter, old, new);
#endif
}

/**
 * raw_atomic64_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * The implementation is selected by the preprocessor, in this order: the
 * _acquire arch op; the _relaxed arch op followed by an acquire fence; the
 * fully ordered arch op; raw_cmpxchg_acquire() applied to @v->counter.
 *
 * Safe to use in noinstr code; prefer atomic64_cmpxchg_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
{
#if defined(arch_atomic64_cmpxchg_acquire)
        return arch_atomic64_cmpxchg_acquire(v, old, new);
#elif defined(arch_atomic64_cmpxchg_relaxed)
        /* Relaxed op first, acquire fence after it. */
        s64 ret = arch_atomic64_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();
        return ret;
#elif defined(arch_atomic64_cmpxchg)
        /* The fully ordered op stands in for the acquire form. */
        return arch_atomic64_cmpxchg(v, old, new);
#else
        /* No arch op at all: operate on the counter field directly. */
        return raw_cmpxchg_acquire(&v->counter, old, new);
#endif
}

/**
 * raw_atomic64_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * The implementation is selected by the preprocessor, in this order: the
 * _release arch op; a release fence followed by the _relaxed arch op; the
 * fully ordered arch op; raw_cmpxchg_release() applied to @v->counter.
 *
 * Safe to use in noinstr code; prefer atomic64_cmpxchg_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new)
{
#if defined(arch_atomic64_cmpxchg_release)
        return arch_atomic64_cmpxchg_release(v, old, new);
#elif defined(arch_atomic64_cmpxchg_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();
        return arch_atomic64_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic64_cmpxchg)
        /* The fully ordered op stands in for the release form. */
        return arch_atomic64_cmpxchg(v, old, new);
#else
        /* No arch op at all: operate on the counter field directly. */
        return raw_cmpxchg_release(&v->counter, old, new);
#endif
}

/**
 * raw_atomic64_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * The implementation is selected by the preprocessor: the _relaxed arch op if
 * that macro is defined; otherwise the fully ordered arch op; otherwise
 * raw_cmpxchg_relaxed() applied to @v->counter.
 *
 * Safe to use in noinstr code; prefer atomic64_cmpxchg_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new)
{
#if defined(arch_atomic64_cmpxchg_relaxed)
        return arch_atomic64_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic64_cmpxchg)
        /* No relaxed op available: use the fully ordered one instead. */
        return arch_atomic64_cmpxchg(v, old, new);
#else
        /* No arch op at all: operate on the counter field directly. */
        return raw_cmpxchg_relaxed(&v->counter, old, new);
#endif
}

/**
 * raw_atomic64_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic64_t
 * @old: pointer to s64 value to compare with
 * @new: s64 value to assign
 *
 * Compares @v against the value stored in @old. On a match, @v is atomically
 * set to @new with full ordering. On a mismatch, @v is left untouched, the
 * value found in @v is written back through @old, and only relaxed ordering
 * is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_try_cmpxchg() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
#if defined(arch_atomic64_try_cmpxchg)
        return arch_atomic64_try_cmpxchg(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg_relaxed)
        bool exchanged;

        /* Full ordering = fence, relaxed op, fence; keep this sequence. */
        __atomic_pre_full_fence();
        exchanged = arch_atomic64_try_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();

        return exchanged;
#else
        /* Emulate try_cmpxchg on top of the value-returning cmpxchg. */
        const s64 expected = *old;
        const s64 found = raw_atomic64_cmpxchg(v, expected, new);

        if (likely(found == expected))
                return true;

        /* Lost the race: report what @v actually held. */
        *old = found;
        return false;
#endif
}

/**
 * raw_atomic64_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic64_t
 * @old: pointer to s64 value to compare with
 * @new: s64 value to assign
 *
 * Compares @v against the value stored in @old. On a match, @v is atomically
 * set to @new with acquire ordering. On a mismatch, @v is left untouched, the
 * value found in @v is written back through @old, and only relaxed ordering
 * is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_acquire() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
{
#if defined(arch_atomic64_try_cmpxchg_acquire)
        return arch_atomic64_try_cmpxchg_acquire(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg_relaxed)
        /* Relaxed op first, acquire fence after it. */
        bool exchanged = arch_atomic64_try_cmpxchg_relaxed(v, old, new);

        __atomic_acquire_fence();

        return exchanged;
#elif defined(arch_atomic64_try_cmpxchg)
        /* The fully ordered op stands in for the acquire form. */
        return arch_atomic64_try_cmpxchg(v, old, new);
#else
        /* Emulate try_cmpxchg on top of the value-returning cmpxchg. */
        const s64 expected = *old;
        const s64 found = raw_atomic64_cmpxchg_acquire(v, expected, new);

        if (likely(found == expected))
                return true;

        /* Lost the race: report what @v actually held. */
        *old = found;
        return false;
#endif
}

/**
 * raw_atomic64_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic64_t
 * @old: pointer to s64 value to compare with
 * @new: s64 value to assign
 *
 * Compares @v against the value stored in @old. On a match, @v is atomically
 * set to @new with release ordering. On a mismatch, @v is left untouched, the
 * value found in @v is written back through @old, and only relaxed ordering
 * is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_release() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
{
#if defined(arch_atomic64_try_cmpxchg_release)
        return arch_atomic64_try_cmpxchg_release(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();

        return arch_atomic64_try_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg)
        /* The fully ordered op stands in for the release form. */
        return arch_atomic64_try_cmpxchg(v, old, new);
#else
        /* Emulate try_cmpxchg on top of the value-returning cmpxchg. */
        const s64 expected = *old;
        const s64 found = raw_atomic64_cmpxchg_release(v, expected, new);

        if (likely(found == expected))
                return true;

        /* Lost the race: report what @v actually held. */
        *old = found;
        return false;
#endif
}

/**
 * raw_atomic64_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic64_t
 * @old: pointer to s64 value to compare with
 * @new: s64 value to assign
 *
 * Compares @v against the value stored in @old. On a match, @v is atomically
 * set to @new with relaxed ordering. On a mismatch, @v is left untouched, the
 * value found in @v is written back through @old, and relaxed ordering is
 * provided.
 *
 * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_relaxed() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
{
#if defined(arch_atomic64_try_cmpxchg_relaxed)
        return arch_atomic64_try_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg)
        /* No relaxed op available: use the fully ordered one instead. */
        return arch_atomic64_try_cmpxchg(v, old, new);
#else
        /* Emulate try_cmpxchg on top of the value-returning cmpxchg. */
        const s64 expected = *old;
        const s64 found = raw_atomic64_cmpxchg_relaxed(v, expected, new);

        if (likely(found == expected))
                return true;

        /* Lost the race: report what @v actually held. */
        *old = found;
        return false;
#endif
}

/**
 * raw_atomic64_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - @i), with full ordering, and reports
 * whether the new value is zero.
 *
 * Safe to use in noinstr code; prefer atomic64_sub_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic64_sub_and_test(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_sub_and_test
        return arch_atomic64_sub_and_test(i, v);
#else
        /* Generic form: subtract, then test the value that was stored. */
        const s64 result = raw_atomic64_sub_return(i, v);

        return !result;
#endif
}

/**
 * raw_atomic64_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), with full ordering, and reports
 * whether the new value is zero.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic64_dec_and_test(atomic64_t *v)
{
#ifdef arch_atomic64_dec_and_test
        return arch_atomic64_dec_and_test(v);
#else
        /* Generic form: decrement, then test the value that was stored. */
        const s64 result = raw_atomic64_dec_return(v);

        return !result;
#endif
}

/**
 * raw_atomic64_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + 1), with full ordering, and reports
 * whether the new value is zero.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic64_inc_and_test(atomic64_t *v)
{
#ifdef arch_atomic64_inc_and_test
        return arch_atomic64_inc_and_test(v);
#else
        /* Generic form: increment, then test the value that was stored. */
        const s64 result = raw_atomic64_inc_return(v);

        return !result;
#endif
}

/**
 * raw_atomic64_add_negative() - atomic add and test if negative with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * The implementation is selected by the preprocessor:
 * arch_atomic64_add_negative() if that macro is defined; otherwise
 * arch_atomic64_add_negative_relaxed() bracketed by the pre/post full fences;
 * otherwise a sign test on the result of raw_atomic64_add_return().
 *
 * Safe to use in noinstr code; prefer atomic64_add_negative() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_negative(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_add_negative)
        return arch_atomic64_add_negative(i, v);
#elif defined(arch_atomic64_add_negative_relaxed)
        bool ret;
        /* Statement order is significant: fence, relaxed op, fence. */
        __atomic_pre_full_fence();
        ret = arch_atomic64_add_negative_relaxed(i, v);
        __atomic_post_full_fence();
        return ret;
#else
        /* Generic form: add, then test the sign of the new value. */
        return raw_atomic64_add_return(i, v) < 0;
#endif
}

/**
 * raw_atomic64_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * The implementation is selected by the preprocessor, in this order: the
 * _acquire arch op; the _relaxed arch op followed by an acquire fence; the
 * fully ordered arch op; a sign test on the result of
 * raw_atomic64_add_return_acquire().
 *
 * Safe to use in noinstr code; prefer atomic64_add_negative_acquire() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_negative_acquire(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_add_negative_acquire)
        return arch_atomic64_add_negative_acquire(i, v);
#elif defined(arch_atomic64_add_negative_relaxed)
        /* Relaxed op first, acquire fence after it. */
        bool ret = arch_atomic64_add_negative_relaxed(i, v);
        __atomic_acquire_fence();
        return ret;
#elif defined(arch_atomic64_add_negative)
        /* The fully ordered op stands in for the acquire form. */
        return arch_atomic64_add_negative(i, v);
#else
        /* Generic form: add, then test the sign of the new value. */
        return raw_atomic64_add_return_acquire(i, v) < 0;
#endif
}

/**
 * raw_atomic64_add_negative_release() - atomic add and test if negative with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * The implementation is selected by the preprocessor, in this order: the
 * _release arch op; a release fence followed by the _relaxed arch op; the
 * fully ordered arch op; a sign test on the result of
 * raw_atomic64_add_return_release().
 *
 * Safe to use in noinstr code; prefer atomic64_add_negative_release() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_negative_release(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_add_negative_release)
        return arch_atomic64_add_negative_release(i, v);
#elif defined(arch_atomic64_add_negative_relaxed)
        /* Release fence first, relaxed op after it. */
        __atomic_release_fence();
        return arch_atomic64_add_negative_relaxed(i, v);
#elif defined(arch_atomic64_add_negative)
        /* The fully ordered op stands in for the release form. */
        return arch_atomic64_add_negative(i, v);
#else
        /* Generic form: add, then test the sign of the new value. */
        return raw_atomic64_add_return_release(i, v) < 0;
#endif
}

/**
 * raw_atomic64_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * The implementation is selected by the preprocessor: the _relaxed arch op if
 * that macro is defined; otherwise the fully ordered arch op; otherwise a
 * sign test on the result of raw_atomic64_add_return_relaxed().
 *
 * Safe to use in noinstr code; prefer atomic64_add_negative_relaxed() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_add_negative_relaxed)
        return arch_atomic64_add_negative_relaxed(i, v);
#elif defined(arch_atomic64_add_negative)
        /* No relaxed op available: use the fully ordered one instead. */
        return arch_atomic64_add_negative(i, v);
#else
        /* Generic form: add, then test the sign of the new value. */
        return raw_atomic64_add_return_relaxed(i, v) < 0;
#endif
}

/**
 * raw_atomic64_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic64_t
 * @a: s64 value to add
 * @u: s64 value to compare with
 *
 * As long as @v does not hold @u, atomically replaces @v with (@v + @a) with
 * full ordering. If @v holds @u, it is left unmodified and only relaxed
 * ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add_unless() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
{
#ifdef arch_atomic64_fetch_add_unless
        return arch_atomic64_fetch_add_unless(v, a, u);
#else
        s64 cur = raw_atomic64_read(v);

        /*
         * Retry until the add lands or the forbidden value is seen. A failed
         * try_cmpxchg refreshes @cur with the value currently in @v.
         */
        while (likely(cur != u)) {
                if (raw_atomic64_try_cmpxchg(v, &cur, cur + a))
                        break;
        }

        return cur;
#endif
}

/**
 * raw_atomic64_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic64_t
 * @a: s64 value to add
 * @u: s64 value to compare with
 *
 * As long as @v does not hold @u, atomically replaces @v with (@v + @a) with
 * full ordering. If @v holds @u, it is left unmodified and only relaxed
 * ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_add_unless() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
{
#ifdef arch_atomic64_add_unless
        return arch_atomic64_add_unless(v, a, u);
#else
        /* The add happened exactly when the value seen beforehand was not @u. */
        const s64 before = raw_atomic64_fetch_add_unless(v, a, u);

        return before != u;
#endif
}

/**
 * raw_atomic64_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic64_t
 *
 * If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Uses arch_atomic64_inc_not_zero() if that macro is defined; otherwise it is
 * expressed through raw_atomic64_add_unless().
 *
 * Safe to use in noinstr code; prefer atomic64_inc_not_zero() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic64_inc_not_zero(atomic64_t *v)
{
#if defined(arch_atomic64_inc_not_zero)
        return arch_atomic64_inc_not_zero(v);
#else
        /* "Increment unless zero" is add_unless() with a == 1 and u == 0. */
        return raw_atomic64_add_unless(v, 1, 0);
#endif
}

/**
 * raw_atomic64_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic64_t
 *
 * While @v is non-negative (@v >= 0), atomically replaces @v with (@v + 1)
 * with full ordering. If @v is negative, it is left unmodified and only
 * relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_unless_negative() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic64_inc_unless_negative(atomic64_t *v)
{
#ifdef arch_atomic64_inc_unless_negative
        return arch_atomic64_inc_unless_negative(v);
#else
        s64 cur = raw_atomic64_read(v);

        /* A failed try_cmpxchg refreshes @cur, so the sign is re-checked. */
        while (likely(cur >= 0)) {
                if (raw_atomic64_try_cmpxchg(v, &cur, cur + 1))
                        return true;
        }

        return false;
#endif
}

/**
 * raw_atomic64_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic64_t
 *
 * While @v is non-positive (@v <= 0), atomically replaces @v with (@v - 1)
 * with full ordering. If @v is positive, it is left unmodified and only
 * relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_unless_positive() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic64_dec_unless_positive(atomic64_t *v)
{
#ifdef arch_atomic64_dec_unless_positive
        return arch_atomic64_dec_unless_positive(v);
#else
        s64 cur = raw_atomic64_read(v);

        /* A failed try_cmpxchg refreshes @cur, so the sign is re-checked. */
        while (likely(cur <= 0)) {
                if (raw_atomic64_try_cmpxchg(v, &cur, cur - 1))
                        return true;
        }

        return false;
#endif
}

/**
 * raw_atomic64_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic64_t
 *
 * While @v is positive (@v > 0), atomically replaces @v with (@v - 1) with
 * full ordering. If @v is not positive, it is left unmodified and only
 * relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_if_positive() elsewhere.
 *
 * Return: The value of @v observed by the operation, minus one, whether or
 * not @v was updated.
 */
static __always_inline s64
raw_atomic64_dec_if_positive(atomic64_t *v)
{
#ifdef arch_atomic64_dec_if_positive
        return arch_atomic64_dec_if_positive(v);
#else
        s64 cur = raw_atomic64_read(v);
        s64 next;

        for (;;) {
                next = cur - 1;

                /* Decrementing would go below zero: give up, store nothing. */
                if (unlikely(next < 0))
                        break;

                /* On failure @cur is refreshed and the loop recomputes @next. */
                if (raw_atomic64_try_cmpxchg(v, &cur, next))
                        break;
        }

        return next;
#endif
}

#endif /* _LINUX_ATOMIC_FALLBACK_H */
// b565db590afeeff0d7c9485ccbca5bb6e155749f










































































































































































































































































































































































































    2 




















    1 
























    1 
    2 
    1 
    2 




    1 









































































































    2 




























































































































































    2 







    1 


























































































    1 



































































    1 











































































    2 





    2 










































    1 

    1 




    1 





    1 
    1 
    1 
    1 
    1 































    3 

    4 













    4 





    2 
    2 

    4 
    4 





















































































































































































































































































































































































































































































































































































    1 




















    1 
    1 













































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* memcontrol.h - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 */

#ifndef _LINUX_MEMCONTROL_H
#define _LINUX_MEMCONTROL_H
#include <linux/cgroup.h>
#include <linux/vm_event_item.h>
#include <linux/hardirq.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/page_counter.h>
#include <linux/vmpressure.h>
#include <linux/eventfd.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/writeback.h>
#include <linux/page-flags.h>
#include <linux/shrinker.h>

struct mem_cgroup;
struct obj_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;

/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
        /*
         * Numbering starts right after the last node_stat_item so that both
         * enums can be used as an index into the same counters.
         */
        MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
        MEMCG_SOCK,
        MEMCG_PERCPU_B,
        MEMCG_VMALLOC,
        MEMCG_KMEM,
        MEMCG_ZSWAP_B,
        MEMCG_ZSWAPPED,
        /* One past the last valid item; not a counter itself */
        MEMCG_NR_STAT,
};

/*
 * Events counted per memcg. The values index the memory_events[] and
 * memory_events_local[] arrays of struct mem_cgroup.
 */
enum memcg_memory_event {
        MEMCG_LOW,
        MEMCG_HIGH,
        MEMCG_MAX,
        MEMCG_OOM,
        MEMCG_OOM_KILL,
        MEMCG_OOM_GROUP_KILL,
        /* The three MEMCG_SWAP_* events are treated as swap events */
        MEMCG_SWAP_HIGH,
        MEMCG_SWAP_MAX,
        MEMCG_SWAP_FAIL,
        /* Number of events; sizes the per-memcg event arrays */
        MEMCG_NR_MEMORY_EVENTS,
};

/*
 * Optional iteration context handed to mem_cgroup_iter().
 * NOTE(review): presumably identifies the node being reclaimed and the
 * scan generation the caller is part of - confirm against mem_cgroup_iter().
 */
struct mem_cgroup_reclaim_cookie {
        pg_data_t *pgdat;
        unsigned int generation;
};

#ifdef CONFIG_MEMCG

#define MEM_CGROUP_ID_SHIFT        16

/* Private memcg identifier together with its own reference count. */
struct mem_cgroup_id {
        int id;                 /* value reported by mem_cgroup_id() */
        refcount_t ref;         /* references held on this ID */
};

/*
 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremented by the number of pages. This counter is used
 * to trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg event.
 */
enum mem_cgroup_events_target {
        MEM_CGROUP_TARGET_THRESH,
        MEM_CGROUP_TARGET_SOFTLIMIT,
        /* Number of targets; not a target itself */
        MEM_CGROUP_NTARGETS,
};

struct memcg_vmstats_percpu;
struct memcg_vmstats;
struct lruvec_stats_percpu;
struct lruvec_stats;

/* Iteration state embedded in each struct mem_cgroup_per_node (->iter). */
struct mem_cgroup_reclaim_iter {
        /* NOTE(review): presumably the memcg the last walk stopped at - confirm */
        struct mem_cgroup *position;
        /* scan generation, increased every round-trip */
        unsigned int generation;
};

/*
 * per-node information in memory controller.
 */
struct mem_cgroup_per_node {
        /*
         * The LRU vector handed out by mem_cgroup_lruvec(); lruvec_memcg()
         * uses container_of() on it to get back to this structure.
         */
        struct lruvec                lruvec;

        struct lruvec_stats_percpu __percpu        *lruvec_stats_percpu;
        struct lruvec_stats                        *lruvec_stats;

        /* Indexed [zone][lru]; read by mem_cgroup_get_zone_lru_size() */
        unsigned long                lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];

        struct mem_cgroup_reclaim_iter        iter;

        struct shrinker_info __rcu        *shrinker_info;

        struct rb_node                tree_node;        /* RB tree node */
        unsigned long                usage_in_excess;/* Set to the value by which */
                                                /* the soft limit is exceeded */
        bool                        on_tree;
        struct mem_cgroup        *memcg;                /* Back pointer, we cannot */
                                                /* use container_of */
};

/* One registered threshold: an eventfd context paired with a usage value. */
struct mem_cgroup_threshold {
        struct eventfd_ctx *eventfd;
        unsigned long threshold;
};

/* For threshold */
/* Variable-length array of thresholds; entries[] holds @size elements. */
struct mem_cgroup_threshold_ary {
        /* An array index points to threshold just below or equal to usage. */
        int current_threshold;
        /* Size of entries[] */
        unsigned int size;
        /* Array of thresholds */
        struct mem_cgroup_threshold entries[] __counted_by(size);
};

/*
 * A primary/spare pair of threshold arrays. struct mem_cgroup keeps one pair
 * for memory usage and one for mem+swap usage.
 */
struct mem_cgroup_thresholds {
        /* Primary thresholds array */
        struct mem_cgroup_threshold_ary *primary;
        /*
         * Spare threshold array.
         * This is needed to make mem_cgroup_unregister_event() "never fail".
         * It must be able to store at least primary->size - 1 entries.
         */
        struct mem_cgroup_threshold_ary *spare;
};

/*
 * Remember four most recent foreign writebacks with dirty pages in this
 * cgroup.  Inode sharing is expected to be uncommon and, even if we miss
 * one in a given round, we're likely to catch it later if it keeps
 * foreign-dirtying, so a fairly low count should be enough.
 *
 * See mem_cgroup_track_foreign_dirty_slowpath() for details.
 */
#define MEMCG_CGWB_FRN_CNT        4

/*
 * One remembered foreign writeback. struct mem_cgroup keeps
 * MEMCG_CGWB_FRN_CNT of these in cgwb_frn[].
 */
struct memcg_cgwb_frn {
        u64 bdi_id;                        /* bdi->id of the foreign inode */
        int memcg_id;                        /* memcg->css.id of foreign inode */
        u64 at;                                /* jiffies_64 at the time of dirtying */
        struct wb_completion done;        /* tracks in-flight foreign writebacks */
};

/*
 * Bucket for arbitrarily byte-sized objects charged to a memory
 * cgroup. The bucket can be reparented in one piece when the cgroup
 * is destroyed, without having to round up the individual references
 * of all live memory objects in the wild.
 */
struct obj_cgroup {
        /* Taken and dropped through obj_cgroup_get*()/obj_cgroup_put() */
        struct percpu_ref refcnt;
        /* Owning memcg; read with READ_ONCE() via obj_cgroup_memcg() */
        struct mem_cgroup *memcg;
        atomic_t nr_charged_bytes;
        union {
                struct list_head list; /* protected by objcg_lock */
                struct rcu_head rcu;
        };
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 */
struct mem_cgroup {
        /* Embedded css; mem_cgroup_from_css() maps it back with container_of() */
        struct cgroup_subsys_state css;

        /* Private memcg ID. Used to ID objects that outlive the cgroup */
        struct mem_cgroup_id id;

        /* Accounted resources */
        struct page_counter memory;                /* Both v1 & v2 */

        union {
                struct page_counter swap;        /* v2 only */
                struct page_counter memsw;        /* v1 only */
        };

        /* Legacy consumer-oriented counters */
        struct page_counter kmem;                /* v1 only */
        struct page_counter tcpmem;                /* v1 only */

        /* Range enforcement for interrupt charges */
        struct work_struct high_work;

#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
        unsigned long zswap_max;

        /*
         * Prevent pages from this memcg from being written back from zswap to
         * swap, and from being swapped out on zswap store failures.
         */
        bool zswap_writeback;
#endif

        unsigned long soft_limit;

        /* vmpressure notifications */
        struct vmpressure vmpressure;

        /*
         * If the OOM killer kills one task of this cgroup, should it kill
         * all tasks belonging to it?
         */
        bool oom_group;

        /* protected by memcg_oom_lock */
        bool                oom_lock;
        int                under_oom;

        int        swappiness;
        /* OOM-Killer disable */
        int                oom_kill_disable;

        /* memory.events and memory.events.local */
        struct cgroup_file events_file;
        struct cgroup_file events_local_file;

        /* handle for "memory.swap.events" */
        struct cgroup_file swap_events_file;

        /* protect arrays of thresholds */
        struct mutex thresholds_lock;

        /* thresholds for memory usage. RCU-protected */
        struct mem_cgroup_thresholds thresholds;

        /* thresholds for mem+swap usage. RCU-protected */
        struct mem_cgroup_thresholds memsw_thresholds;

        /* For oom notifier event fd */
        struct list_head oom_notify;

        /*
         * Should we move charges of a task when a task is moved into this
         * mem_cgroup ? And what type of charges should we move ?
         */
        unsigned long move_charge_at_immigrate;
        /* taken only while moving_account > 0 */
        spinlock_t                move_lock;
        unsigned long                move_lock_flags;

        CACHELINE_PADDING(_pad1_);

        /* memory.stat */
        struct memcg_vmstats        *vmstats;

        /* memory.events; both arrays are indexed by enum memcg_memory_event */
        atomic_long_t                memory_events[MEMCG_NR_MEMORY_EVENTS];
        atomic_long_t                memory_events_local[MEMCG_NR_MEMORY_EVENTS];

        /*
         * Hint of reclaim pressure for socket memory management. Note
         * that this indicator should NOT be used in legacy cgroup mode
         * where socket memory is accounted/charged separately.
         */
        unsigned long                socket_pressure;

        /* Legacy tcp memory accounting */
        bool                        tcpmem_active;
        int                        tcpmem_pressure;

#ifdef CONFIG_MEMCG_KMEM
        int kmemcg_id;
        /*
         * memcg->objcg is wiped out as a part of the objcg reparenting
         * process. memcg->orig_objcg preserves a pointer (and a reference)
         * to the original objcg until the end of life of the memcg.
         */
        struct obj_cgroup __rcu        *objcg;
        struct obj_cgroup        *orig_objcg;
        /* list of inherited objcgs, protected by objcg_lock */
        struct list_head objcg_list;
#endif

        CACHELINE_PADDING(_pad2_);

        /*
         * set > 0 if pages under this cgroup are moving to other cgroup.
         */
        atomic_t                moving_account;
        struct task_struct        *move_lock_task;

        struct memcg_vmstats_percpu __percpu *vmstats_percpu;

#ifdef CONFIG_CGROUP_WRITEBACK
        struct list_head cgwb_list;
        struct wb_domain cgwb_domain;
        struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
#endif

        /* List of events which userspace want to receive */
        struct list_head event_list;
        spinlock_t event_list_lock;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        struct deferred_split deferred_split_queue;
#endif

#ifdef CONFIG_LRU_GEN_WALKS_MMU
        /* per-memcg mm_struct list */
        struct lru_gen_mm_list mm_list;
#endif

        /* Per-node data, indexed by node id (see mem_cgroup_lruvec()) */
        struct mem_cgroup_per_node *nodeinfo[];
};

/*
 * Size of the first charge attempt.
 * TODO: it may be necessary to use a larger number on big machines, or to
 * size it dynamically based on the workload.
 */
#define MEMCG_CHARGE_BATCH 64U

extern struct mem_cgroup *root_mem_cgroup;

/* Flag bits kept in the low bits of page->memcg_data. */
enum page_memcg_data_flags {
        /* page->memcg_data is a pointer to an slabobj_ext vector */
        MEMCG_DATA_OBJEXTS = (1UL << 0),
        /* page has been accounted as a non-slab kernel page */
        MEMCG_DATA_KMEM = (1UL << 1),
        /* the next bit after the last actual flag */
        __NR_MEMCG_DATA_FLAGS  = (1UL << 2),
};

#define __FIRST_OBJEXT_FLAG        __NR_MEMCG_DATA_FLAGS

#else /* CONFIG_MEMCG */

#define __FIRST_OBJEXT_FLAG        (1UL << 0)

#endif /* CONFIG_MEMCG */

/*
 * Object-extension flag bits. They start at __FIRST_OBJEXT_FLAG, i.e. right
 * after the memcg_data flags when CONFIG_MEMCG is set and at bit 0 otherwise.
 */
enum objext_flags {
        /* slabobj_ext vector failed to allocate */
        OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG,
        /* the next bit after the last actual flag */
        __NR_OBJEXTS_FLAGS  = (__FIRST_OBJEXT_FLAG << 1),
};

/*
 * Mask of all flag bits below __NR_OBJEXTS_FLAGS. The accessors in this file
 * clear these bits from memcg_data to recover the stored pointer.
 */
#define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1)

#ifdef CONFIG_MEMCG

static inline bool folio_memcg_kmem(struct folio *folio);

/*
 * After the initialization objcg->memcg is always pointing at
 * a valid memcg, but can be atomically swapped to the parent memcg.
 *
 * The caller must ensure that the returned memcg won't be released:
 * e.g. acquire the rcu_read_lock or css_set_lock.
 */
static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
{
        /* Single marked load: ->memcg can be swapped to the parent at any time */
        struct mem_cgroup *memcg = READ_ONCE(objcg->memcg);

        return memcg;
}

/*
 * __folio_memcg - Get the memory cgroup associated with a non-kmem folio
 * @folio: Pointer to the folio.
 *
 * Returns a pointer to the memory cgroup associated with the folio,
 * or NULL. This function assumes that the folio is known to have a
 * proper memory cgroup pointer. It's not safe to call this function
 * against some type of folios, e.g. slab folios or ex-slab folios or
 * kmem folios.
 */
static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
{
        unsigned long memcg_data = folio->memcg_data;
        struct mem_cgroup *memcg;

        /* Slab, objext-vector and kmem folios do not hold a memcg pointer */
        VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
        VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
        VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);

        /* Drop the flag bits to recover the pointer */
        memcg = (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
        return memcg;
}

/*
 * __folio_objcg - get the object cgroup associated with a kmem folio.
 * @folio: Pointer to the folio.
 *
 * Returns a pointer to the object cgroup associated with the folio,
 * or NULL. This function assumes that the folio is known to have a
 * proper object cgroup pointer. It's not safe to call this function
 * against some type of folios, e.g. slab folios or ex-slab folios or
 * LRU folios.
 */
static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
{
        unsigned long memcg_data = folio->memcg_data;
        struct obj_cgroup *objcg;

        /* Only a kmem folio (no slab, no objext vector) holds an objcg pointer */
        VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
        VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
        VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);

        /* Drop the flag bits to recover the pointer */
        objcg = (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
        return objcg;
}

/*
 * folio_memcg - Get the memory cgroup associated with a folio.
 * @folio: Pointer to the folio.
 *
 * Returns a pointer to the memory cgroup associated with the folio,
 * or NULL. This function assumes that the folio is known to have a
 * proper memory cgroup pointer. It's not safe to call this function
 * against some type of folios, e.g. slab folios or ex-slab folios.
 *
 * For a non-kmem folio any of the following ensures folio and memcg binding
 * stability:
 *
 * - the folio lock
 * - LRU isolation
 * - folio_memcg_lock()
 * - exclusive reference
 * - mem_cgroup_trylock_pages()
 *
 * For a kmem folio a caller should hold an rcu read lock to protect memcg
 * associated with a kmem folio from being released.
 */
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
        /* Non-kmem folios store the memcg pointer directly */
        if (!folio_memcg_kmem(folio))
                return __folio_memcg(folio);

        /* kmem folios store an objcg; the memcg is reached through it */
        return obj_cgroup_memcg(__folio_objcg(folio));
}

/* Page flavour of folio_memcg(): look up the memcg of the folio holding @page. */
static inline struct mem_cgroup *page_memcg(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_memcg(folio);
}

/**
 * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio.
 * @folio: Pointer to the folio.
 *
 * This function assumes that the folio is known to have a
 * proper memory cgroup pointer. It's not safe to call this function
 * against some type of folios, e.g. slab folios or ex-slab folios.
 *
 * Return: A pointer to the memory cgroup associated with the folio,
 * or NULL.
 */
static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
{
        unsigned long memcg_data = READ_ONCE(folio->memcg_data);
        /* Pointer part of memcg_data, with the flag bits removed */
        unsigned long ptr = memcg_data & ~OBJEXTS_FLAGS_MASK;

        VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
        WARN_ON_ONCE(!rcu_read_lock_held());

        /* Without the kmem flag the pointer is the memcg itself */
        if (!(memcg_data & MEMCG_DATA_KMEM))
                return (struct mem_cgroup *)ptr;

        /* With the kmem flag it is an objcg, which leads to the memcg */
        return obj_cgroup_memcg((struct obj_cgroup *)ptr);
}

/*
 * folio_memcg_check - Get the memory cgroup associated with a folio.
 * @folio: Pointer to the folio.
 *
 * Returns a pointer to the memory cgroup associated with the folio,
 * or NULL. This function unlike folio_memcg() can take any folio
 * as an argument. It has to be used in cases when it's not known if a folio
 * has an associated memory cgroup pointer or an object cgroups vector or
 * an object cgroup.
 *
 * For a non-kmem folio any of the following ensures folio and memcg binding
 * stability:
 *
 * - the folio lock
 * - LRU isolation
 * - folio_memcg_lock()
 * - exclusive reference
 * - mem_cgroup_trylock_pages()
 *
 * For a kmem folio a caller should hold an rcu read lock to protect memcg
 * associated with a kmem folio from being released.
 */
static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
{
        /*
         * folio->memcg_data may change asynchronously for slabs, so take
         * one READ_ONCE() snapshot and decode only that.
         */
        unsigned long memcg_data = READ_ONCE(folio->memcg_data);
        unsigned long ptr = memcg_data & ~OBJEXTS_FLAGS_MASK;

        /* An slabobj_ext vector is not a memcg: nothing to report */
        if (memcg_data & MEMCG_DATA_OBJEXTS)
                return NULL;

        /* Plain folio: the pointer is the memcg */
        if (!(memcg_data & MEMCG_DATA_KMEM))
                return (struct mem_cgroup *)ptr;

        /* kmem folio: the pointer is an objcg, which leads to the memcg */
        return obj_cgroup_memcg((struct obj_cgroup *)ptr);
}

/*
 * Page flavour of folio_memcg_check(). Tail pages always yield NULL; any
 * other page is reinterpreted as its folio.
 */
static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
        return PageTail(page) ? NULL : folio_memcg_check((struct folio *)page);
}

/*
 * Return the memcg behind @objcg with a css reference held.
 *
 * objcg->memcg is re-read under rcu_read_lock() until css_tryget()
 * succeeds on the value that was read.
 */
static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
{
        struct mem_cgroup *memcg;

        rcu_read_lock();
        do {
                memcg = obj_cgroup_memcg(objcg);
        } while (unlikely(!css_tryget(&memcg->css)));
        rcu_read_unlock();

        return memcg;
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set.
 * @folio: Pointer to the folio.
 *
 * Checks if the folio has MemcgKmem flag set. The caller must ensure
 * that the folio has an associated memory cgroup. It's not safe to call
 * this function against some types of folios, e.g. slab folios.
 */
static inline bool folio_memcg_kmem(struct folio *folio)
{
        /* Must be a head page whose memcg_data is not an objext vector */
        VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page);
        VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio);

        return !!(folio->memcg_data & MEMCG_DATA_KMEM);
}


#else
/* Without CONFIG_MEMCG_KMEM no folio is ever reported as kmem-accounted. */
static inline bool folio_memcg_kmem(struct folio *folio)
{
        return false;
}

#endif

/* Page flavour of folio_memcg_kmem(), applied to the folio holding @page. */
static inline bool PageMemcgKmem(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_memcg_kmem(folio);
}

/* True when @memcg is the root memory cgroup. */
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
        return memcg == root_mem_cgroup;
}

static inline bool mem_cgroup_disabled(void)
{
        return !cgroup_subsys_enabled(memory_cgrp_subsys);
}

/*
 * Report the effective min/low protection of @memcg for a reclaim rooted at
 * @root. Both outputs are 0 when memcg is disabled or @memcg is the reclaim
 * root itself.
 */
static inline void mem_cgroup_protection(struct mem_cgroup *root,
                                         struct mem_cgroup *memcg,
                                         unsigned long *min,
                                         unsigned long *low)
{
        /* Start from "no protection"; only the final step below changes it */
        *low = 0;
        *min = 0;

        /*
         * There is no reclaim protection applied to a targeted reclaim.
         * We are special casing this specific case here because
         * mem_cgroup_calculate_protection is not robust enough to keep
         * the protection invariant for calculated effective values for
         * parallel reclaimers with different reclaim target. This is
         * especially a problem for tail memcgs (as they have pages on LRU)
         * which would want to have effective values 0 for targeted reclaim
         * but a different value for external reclaim.
         *
         * Example
         * Let's have global and A's reclaim in parallel:
         *  |
         *  A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G)
         *  |\
         *  | C (low = 1G, usage = 2.5G)
         *  B (low = 1G, usage = 0.5G)
         *
         * For the global reclaim
         * A.elow = A.low
         * B.elow = min(B.usage, B.low) because children_low_usage <= A.elow
         * C.elow = min(C.usage, C.low)
         *
         * With the effective values resetting we have A reclaim
         * A.elow = 0
         * B.elow = B.low
         * C.elow = C.low
         *
         * If the global reclaim races with A's reclaim then
         * B.elow = C.elow = 0 (because children_low_usage > A.elow)
         * is possible and reclaiming B would be violating the protection.
         */
        if (mem_cgroup_disabled() || root == memcg)
                return;

        *min = READ_ONCE(memcg->memory.emin);
        *low = READ_ONCE(memcg->memory.elow);
}

void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                                     struct mem_cgroup *memcg);

static inline bool mem_cgroup_unprotected(struct mem_cgroup *target,
                                          struct mem_cgroup *memcg)
{
        /*
         * The root memcg doesn't account charges, and doesn't support
         * protection. The target memcg's protection is ignored, see
         * mem_cgroup_calculate_protection() and mem_cgroup_protection()
         */
        return mem_cgroup_disabled() || mem_cgroup_is_root(memcg) ||
                memcg == target;
}

/*
 * True when @memcg is protected and its memory usage does not exceed its
 * effective low protection (memory.elow).
 */
static inline bool mem_cgroup_below_low(struct mem_cgroup *target,
                                        struct mem_cgroup *memcg)
{
        unsigned long elow, usage;

        if (mem_cgroup_unprotected(target, memcg))
                return false;

        elow = READ_ONCE(memcg->memory.elow);
        usage = page_counter_read(&memcg->memory);
        return usage <= elow;
}

/*
 * True when @memcg is protected and its memory usage does not exceed its
 * effective min protection (memory.emin).
 */
static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
                                        struct mem_cgroup *memcg)
{
        unsigned long emin, usage;

        if (mem_cgroup_unprotected(target, memcg))
                return false;

        emin = READ_ONCE(memcg->memory.emin);
        usage = page_counter_read(&memcg->memory);
        return usage <= emin;
}

void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg);

int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);

/**
 * mem_cgroup_charge - Charge a newly allocated folio to a cgroup.
 * @folio: Folio to charge.
 * @mm: mm context of the allocating task.
 * @gfp: Reclaim mode.
 *
 * Try to charge @folio to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp if necessary.  If @mm is NULL, try to
 * charge to the active memcg.
 *
 * Do not use this for folios allocated for swapin.
 *
 * Return: 0 on success. Otherwise, an error code is returned.
 */
static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
                                    gfp_t gfp)
{
        /* With memcg disabled there is nothing to charge: report success */
        return mem_cgroup_disabled() ? 0 : __mem_cgroup_charge(folio, mm, gfp);
}

int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
                long nr_pages);

int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
                                  gfp_t gfp, swp_entry_t entry);
void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);

void __mem_cgroup_uncharge(struct folio *folio);

/**
 * mem_cgroup_uncharge - Uncharge a folio.
 * @folio: Folio to uncharge.
 *
 * Uncharge a folio previously charged with mem_cgroup_charge().
 */
static inline void mem_cgroup_uncharge(struct folio *folio)
{
        /* No-op when the memory controller is disabled */
        if (!mem_cgroup_disabled())
                __mem_cgroup_uncharge(folio);
}

void __mem_cgroup_uncharge_folios(struct folio_batch *folios);
/* Batch flavour of mem_cgroup_uncharge(); a no-op when memcg is disabled. */
static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
{
        if (!mem_cgroup_disabled())
                __mem_cgroup_uncharge_folios(folios);
}

void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages);
void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
void mem_cgroup_migrate(struct folio *old, struct folio *new);

/**
 * mem_cgroup_lruvec - get the lru list vector for a memcg & node
 * @memcg: memcg of the wanted lruvec
 * @pgdat: pglist_data
 *
 * Returns the lru list vector holding pages for a given @memcg &
 * @pgdat combination. This can be the node lruvec, if the memory
 * controller is disabled.
 */
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
                                               struct pglist_data *pgdat)
{
        struct lruvec *lruvec;

        if (mem_cgroup_disabled()) {
                /* No memcg: use the node's own lruvec */
                lruvec = &pgdat->__lruvec;
        } else {
                struct mem_cgroup_per_node *mz;

                /* A NULL memcg stands for the root memcg */
                if (!memcg)
                        memcg = root_mem_cgroup;

                mz = memcg->nodeinfo[pgdat->node_id];
                lruvec = &mz->lruvec;
        }

        /*
         * Since a node can be onlined after the mem_cgroup was created,
         * we have to be prepared to initialize lruvec->pgdat here;
         * and if offlined then reonlined, we need to reinitialize it.
         */
        if (unlikely(lruvec->pgdat != pgdat))
                lruvec->pgdat = pgdat;
        return lruvec;
}

/**
 * folio_lruvec - return lruvec for isolating/putting an LRU folio
 * @folio: Pointer to the folio.
 *
 * This function relies on the folio's memcg binding (folio->memcg_data)
 * being stable.
 */
static inline struct lruvec *folio_lruvec(struct folio *folio)
{
        struct mem_cgroup *memcg = folio_memcg(folio);
        struct pglist_data *pgdat;

        /* With memcg enabled, the folio is expected to have a memcg here */
        VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled(), folio);

        pgdat = folio_pgdat(folio);
        return mem_cgroup_lruvec(memcg, pgdat);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);

struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);

struct mem_cgroup *get_mem_cgroup_from_current(void);

struct lruvec *folio_lruvec_lock(struct folio *folio);
struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
                                                unsigned long *flags);

#ifdef CONFIG_DEBUG_VM
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio);
#else
/* Without CONFIG_DEBUG_VM this check compiles down to nothing. */
static inline
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
}
#endif

/*
 * Map a css embedded in a struct mem_cgroup back to that mem_cgroup.
 * A NULL @css yields NULL.
 */
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
{
        if (!css)
                return NULL;
        return container_of(css, struct mem_cgroup, css);
}

/* Try to take a reference on @objcg; returns whether percpu_ref_tryget() succeeded. */
static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg)
{
        bool got = percpu_ref_tryget(&objcg->refcnt);

        return got;
}

/* Take one reference on @objcg. */
static inline void obj_cgroup_get(struct obj_cgroup *objcg)
{
        struct percpu_ref *ref = &objcg->refcnt;

        percpu_ref_get(ref);
}

/* Take @nr references on @objcg in one go. */
static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
                                       unsigned long nr)
{
        struct percpu_ref *ref = &objcg->refcnt;

        percpu_ref_get_many(ref, nr);
}

/* Drop one reference on @objcg; a NULL @objcg is ignored. */
static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
        if (!objcg)
                return;

        percpu_ref_put(&objcg->refcnt);
}

static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
{
        return !memcg || css_tryget(&memcg->css);
}

static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg)
{
        return !memcg || css_tryget_online(&memcg->css);
}

/* Drop a css reference on @memcg; a NULL @memcg is ignored. */
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
        if (!memcg)
                return;

        css_put(&memcg->css);
}

/*
 * Map a pointer to a counter embedded in struct mem_cgroup back to the
 * containing mem_cgroup; @member names the embedded field.
 */
#define mem_cgroup_from_counter(counter, member)        \
        container_of(counter, struct mem_cgroup, member)

struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
                                   struct mem_cgroup *,
                                   struct mem_cgroup_reclaim_cookie *);
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
                           int (*)(struct task_struct *, void *), void *arg);

/* Return the private ID of @memcg, or 0 when the memory controller is disabled. */
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
        return mem_cgroup_disabled() ? 0 : memcg->id.id;
}
struct mem_cgroup *mem_cgroup_from_id(unsigned short id);

#ifdef CONFIG_SHRINKER_DEBUG
/* Return the inode number of @memcg's cgroup, or 0 for a NULL @memcg. */
static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
{
        if (!memcg)
                return 0;
        return cgroup_ino(memcg->css.cgroup);
}

struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino);
#endif

/* Return the memcg whose css is associated with seq_file @m. */
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
{
        struct cgroup_subsys_state *css = seq_css(m);

        return mem_cgroup_from_css(css);
}

/*
 * Return the memcg owning @lruvec, or NULL when the memory controller is
 * disabled. @lruvec must be the one embedded in a mem_cgroup_per_node.
 */
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
{
        if (mem_cgroup_disabled())
                return NULL;

        return container_of(lruvec, struct mem_cgroup_per_node, lruvec)->memcg;
}

/**
 * parent_mem_cgroup - find the accounting parent of a memcg
 * @memcg: memcg whose parent to find
 *
 * Returns the parent memcg, or NULL if this is the root.
 */
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
        /* A NULL parent css makes mem_cgroup_from_css() return NULL */
        struct cgroup_subsys_state *parent_css = memcg->css.parent;

        return mem_cgroup_from_css(parent_css);
}

/*
 * True when @memcg is @root itself or its cgroup is a descendant of
 * @root's cgroup.
 */
static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
                              struct mem_cgroup *root)
{
        return root == memcg ||
                cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
}

/*
 * True when the memcg of @mm's owner task is @memcg or one of its
 * descendants. The lookup is done under rcu_read_lock().
 */
static inline bool mm_match_cgroup(struct mm_struct *mm,
                                   struct mem_cgroup *memcg)
{
        struct mem_cgroup *owner_memcg;
        bool match;

        rcu_read_lock();
        owner_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
        /* No memcg for the owner means no match */
        match = owner_memcg && mem_cgroup_is_descendant(owner_memcg, memcg);
        rcu_read_unlock();

        return match;
}

struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio);
ino_t page_cgroup_ino(struct page *page);

/*
 * True when @memcg's css has CSS_ONLINE set. Always true when the memory
 * controller is disabled.
 */
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
{
        return mem_cgroup_disabled() || (memcg->css.flags & CSS_ONLINE);
}

void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
                int zid, int nr_pages);

/*
 * Read the per-zone size of list @lru for the memcg lruvec @lruvec.
 * @lruvec must be the one embedded in a mem_cgroup_per_node.
 */
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
                enum lru_list lru, int zone_idx)
{
        struct mem_cgroup_per_node *mz =
                container_of(lruvec, struct mem_cgroup_per_node, lruvec);

        return READ_ONCE(mz->lru_zone_size[zone_idx][lru]);
}

void mem_cgroup_handle_over_high(gfp_t gfp_mask);

unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);

unsigned long mem_cgroup_size(struct mem_cgroup *memcg);

void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
                                struct task_struct *p);

void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg);

static inline void mem_cgroup_enter_user_fault(void)
{
        WARN_ON(current->in_user_fault);
        current->in_user_fault = 1;
}

static inline void mem_cgroup_exit_user_fault(void)
{
        WARN_ON(!current->in_user_fault);
        current->in_user_fault = 0;
}

/* True when @p->memcg_in_oom is set. */
static inline bool task_in_memcg_oom(struct task_struct *p)
{
        return !!p->memcg_in_oom;
}

bool mem_cgroup_oom_synchronize(bool wait);
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
                                            struct mem_cgroup *oom_domain);
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);

void folio_memcg_lock(struct folio *folio);
void folio_memcg_unlock(struct folio *folio);

void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
                       int val);

/*
 * Try to stabilize folio_memcg() for all the pages in a memcg.
 *
 * Returns true with rcu_read_lock() held when memcg is disabled or no
 * account moving is in progress for @memcg; the caller then releases it with
 * mem_cgroup_unlock_pages(). Returns false with the RCU read lock dropped.
 */
static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
{
        bool stable;

        rcu_read_lock();

        stable = mem_cgroup_disabled() || !atomic_read(&memcg->moving_account);
        if (!stable)
                rcu_read_unlock();

        return stable;
}

/* Drop the RCU read lock kept by a successful mem_cgroup_trylock_pages(). */
static inline void mem_cgroup_unlock_pages(void)
{
        rcu_read_unlock();
}

/*
 * Add @val to statistic @idx of @memcg with local interrupts disabled.
 * @idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
                                   enum memcg_stat_item idx, int val)
{
        unsigned long irq_flags;

        local_irq_save(irq_flags);
        __mod_memcg_state(memcg, idx, val);
        local_irq_restore(irq_flags);
}

/*
 * Add @val to statistic @idx of the memcg that @page belongs to. Does nothing
 * when memcg is disabled or the page has no memcg.
 */
static inline void mod_memcg_page_state(struct page *page,
                                        enum memcg_stat_item idx, int val)
{
        struct mem_cgroup *owner;

        if (mem_cgroup_disabled())
                return;

        /* The memcg lookup and its use both happen under rcu_read_lock() */
        rcu_read_lock();
        owner = page_memcg(page);
        if (owner)
                mod_memcg_state(owner, idx, val);
        rcu_read_unlock();
}

/* Stat readers; only declared here. */
unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
unsigned long lruvec_page_state_local(struct lruvec *lruvec,
                                      enum node_stat_item idx);

/* Stat flushing entry points; only declared here. */
void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);

/*
 * Raw kmem stat update keyed by an object pointer. mod_lruvec_kmem_state()
 * in this header wraps it with local interrupts disabled.
 */
void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);

/*
 * IRQ-safe wrapper around __mod_lruvec_kmem_state(): local interrupts are
 * disabled for the update and their previous state is restored afterwards.
 */
static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
                                         int val)
{
        unsigned long irq_state;

        local_irq_save(irq_state);
        __mod_lruvec_kmem_state(p, idx, val);
        local_irq_restore(irq_state);
}

/*
 * Raw memcg event counter update. count_memcg_events() in this header wraps
 * it with local interrupts disabled.
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
                          unsigned long count);

/*
 * IRQ-safe wrapper around __count_memcg_events(): adds @count occurrences of
 * event @idx for @memcg with local interrupts disabled, then restores the
 * previous interrupt state.
 */
static inline void count_memcg_events(struct mem_cgroup *memcg,
                                      enum vm_event_item idx,
                                      unsigned long count)
{
        unsigned long irq_state;

        local_irq_save(irq_state);
        __count_memcg_events(memcg, idx, count);
        local_irq_restore(irq_state);
}

/*
 * Count @nr occurrences of event @idx against the memcg returned by
 * folio_memcg() for @folio. A folio without a memcg is silently ignored.
 */
static inline void count_memcg_folio_events(struct folio *folio,
                enum vm_event_item idx, unsigned long nr)
{
        struct mem_cgroup *owner;

        owner = folio_memcg(folio);
        if (!owner)
                return;

        count_memcg_events(owner, idx, nr);
}

/*
 * Count one occurrence of event @idx against the memcg of @mm's owner task.
 * Does nothing when memcg is disabled or no memcg is found. The owner lookup
 * and the update happen under the RCU read lock.
 */
static inline void count_memcg_event_mm(struct mm_struct *mm,
                                        enum vm_event_item idx)
{
        struct mem_cgroup *owner_memcg;

        if (!mem_cgroup_disabled()) {
                rcu_read_lock();
                owner_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
                if (likely(owner_memcg))
                        count_memcg_events(owner_memcg, idx, 1);
                rcu_read_unlock();
        }
}

static inline void memcg_memory_event(struct mem_cgroup *memcg,
                                      enum memcg_memory_event event)
{
        bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
                          event == MEMCG_SWAP_FAIL;

        atomic_long_inc(&memcg->memory_events_local[event]);
        if (!swap_event)
                cgroup_file_notify(&memcg->events_local_file);

        do {
                atomic_long_inc(&memcg->memory_events[event]);
                if (swap_event)
                        cgroup_file_notify(&memcg->swap_events_file);
                else
                        cgroup_file_notify(&memcg->events_file);

                if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
                        break;
                if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
                        break;
        } while ((memcg = parent_mem_cgroup(memcg)) &&
                 !mem_cgroup_is_root(memcg));
}

/*
 * Record memory event @event against the memcg of @mm's owner task via
 * memcg_memory_event(). Does nothing when memcg is disabled or no memcg is
 * found. The owner lookup and the update happen under the RCU read lock.
 */
static inline void memcg_memory_event_mm(struct mm_struct *mm,
                                         enum memcg_memory_event event)
{
        struct mem_cgroup *owner_memcg;

        if (!mem_cgroup_disabled()) {
                rcu_read_lock();
                owner_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
                if (likely(owner_memcg))
                        memcg_memory_event(owner_memcg, event);
                rcu_read_unlock();
        }
}

/* Only declared here; the !CONFIG_MEMCG build gets an empty inline instead. */
void split_page_memcg(struct page *head, int old_order, int new_order);

/* Only declared here; the !CONFIG_MEMCG inline variant always returns 0. */
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                                gfp_t gfp_mask,
                                                unsigned long *total_scanned);

#else /* CONFIG_MEMCG */

/* Value used when CONFIG_MEMCG is disabled. */
#define MEM_CGROUP_ID_SHIFT        0

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *page_memcg(struct page *page)
{
        return NULL;
}

/*
 * CONFIG_MEMCG=n stub: warns once if the RCU read lock is not held, then
 * returns NULL.
 */
static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
{
        WARN_ON_ONCE(!rcu_read_lock_held());
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns false. */
static inline bool folio_memcg_kmem(struct folio *folio)
{
        return false;
}

/* CONFIG_MEMCG=n stub: always returns false. */
static inline bool PageMemcgKmem(struct page *page)
{
        return false;
}

/* CONFIG_MEMCG=n stub: always returns true, whatever @memcg is. */
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
        return true;
}

/* CONFIG_MEMCG=n stub: memcg is compiled out, so this is always true. */
static inline bool mem_cgroup_disabled(void)
{
        return true;
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void memcg_memory_event(struct mem_cgroup *memcg,
                                      enum memcg_memory_event event)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void memcg_memory_event_mm(struct mm_struct *mm,
                                         enum memcg_memory_event event)
{
}

/*
 * CONFIG_MEMCG=n variant: stores 0 through both @min and @low.
 * @root and @memcg are not used.
 */
static inline void mem_cgroup_protection(struct mem_cgroup *root,
                                         struct mem_cgroup *memcg,
                                         unsigned long *min,
                                         unsigned long *low)
{
        *low = 0;
        *min = 0;
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                                                   struct mem_cgroup *memcg)
{
}

/* CONFIG_MEMCG=n stub: always returns true. */
static inline bool mem_cgroup_unprotected(struct mem_cgroup *target,
                                          struct mem_cgroup *memcg)
{
        return true;
}
/* CONFIG_MEMCG=n stub: always returns false. */
static inline bool mem_cgroup_below_low(struct mem_cgroup *target,
                                        struct mem_cgroup *memcg)
{
        return false;
}

/* CONFIG_MEMCG=n stub: always returns false. */
static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
                                        struct mem_cgroup *memcg)
{
        return false;
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_commit_charge(struct folio *folio,
                struct mem_cgroup *memcg)
{
}

/* CONFIG_MEMCG=n stub: nothing is charged; always returns 0. */
static inline int mem_cgroup_charge(struct folio *folio,
                struct mm_struct *mm, gfp_t gfp)
{
        return 0;
}

/* CONFIG_MEMCG=n stub: nothing is charged; always returns 0. */
static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg,
                gfp_t gfp, long nr_pages)
{
        return 0;
}

/* CONFIG_MEMCG=n stub: nothing is charged; always returns 0. */
static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
                        struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
{
        return 0;
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_uncharge(struct folio *folio)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
                unsigned int nr_pages)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_replace_folio(struct folio *old,
                struct folio *new)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
}

/*
 * CONFIG_MEMCG=n variant: @memcg is ignored and the lruvec embedded in
 * @pgdat is returned.
 */
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
                                               struct pglist_data *pgdat)
{
        return &pgdat->__lruvec;
}

static inline struct lruvec *folio_lruvec(struct folio *folio)
{
        struct pglist_data *pgdat = folio_pgdat(folio);
        return &pgdat->__lruvec;
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns true. */
static inline bool mm_match_cgroup(struct mm_struct *mm,
                struct mem_cgroup *memcg)
{
        return true;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *get_mem_cgroup_from_current(void)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
}

/* CONFIG_MEMCG=n stub: takes no reference; always returns true. */
static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
{
        return true;
}

/* CONFIG_MEMCG=n stub: takes no reference; always returns true. */
static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg)
{
        return true;
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}

static inline struct lruvec *folio_lruvec_lock(struct folio *folio)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        spin_lock(&pgdat->__lruvec.lru_lock);
        return &pgdat->__lruvec;
}

static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        spin_lock_irq(&pgdat->__lruvec.lru_lock);
        return &pgdat->__lruvec;
}

static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
                unsigned long *flagsp)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
        return &pgdat->__lruvec;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root,
                struct mem_cgroup *prev,
                struct mem_cgroup_reclaim_cookie *reclaim)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
                                         struct mem_cgroup *prev)
{
}

/* CONFIG_MEMCG=n stub: does nothing; @fn is never called. */
static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
                int (*fn)(struct task_struct *, void *), void *arg)
{
}

/* CONFIG_MEMCG=n stub: always returns 0. */
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
        return 0;
}

/*
 * CONFIG_MEMCG=n stub: warns once when called with a non-zero @id and
 * always returns NULL.
 */
static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
        WARN_ON_ONCE(id);
        /* XXX: This should always return root_mem_cgroup */
        return NULL;
}

#ifdef CONFIG_SHRINKER_DEBUG
/* CONFIG_MEMCG=n stub (CONFIG_SHRINKER_DEBUG builds only): always returns 0. */
static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
{
        return 0;
}

/* CONFIG_MEMCG=n stub (CONFIG_SHRINKER_DEBUG builds only): always returns NULL. */
static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
{
        return NULL;
}
#endif

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns true. */
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
{
        return true;
}

/* CONFIG_MEMCG=n stub: always returns 0. */
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
                enum lru_list lru, int zone_idx)
{
        return 0;
}

/* CONFIG_MEMCG=n stub: always returns 0. */
static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
        return 0;
}

/* CONFIG_MEMCG=n stub: always returns 0. */
static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
{
        return 0;
}

/* CONFIG_MEMCG=n stub: prints nothing. */
static inline void
mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
}

/* CONFIG_MEMCG=n stub: prints nothing. */
static inline void
mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void folio_memcg_lock(struct folio *folio)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void folio_memcg_unlock(struct folio *folio)
{
}

/*
 * CONFIG_MEMCG=n variant: takes the RCU read lock and always returns true;
 * @memcg is not used.
 */
static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
{
        /* to match folio_memcg_rcu() */
        rcu_read_lock();
        return true;
}

/*
 * CONFIG_MEMCG=n variant: drops the RCU read lock that
 * mem_cgroup_trylock_pages() took.
 */
static inline void mem_cgroup_unlock_pages(void)
{
        rcu_read_unlock();
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_enter_user_fault(void)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_exit_user_fault(void)
{
}

/* CONFIG_MEMCG=n stub: always returns false. */
static inline bool task_in_memcg_oom(struct task_struct *p)
{
        return false;
}

/* CONFIG_MEMCG=n stub: always returns false. */
static inline bool mem_cgroup_oom_synchronize(bool wait)
{
        return false;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *mem_cgroup_get_oom_group(
        struct task_struct *victim, struct mem_cgroup *oom_domain)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: prints nothing. */
static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void __mod_memcg_state(struct mem_cgroup *memcg,
                                     enum memcg_stat_item idx,
                                     int nr)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
                                   enum memcg_stat_item idx,
                                   int nr)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mod_memcg_page_state(struct page *page,
                                        enum memcg_stat_item idx, int val)
{
}

/* CONFIG_MEMCG=n stub: always returns 0. */
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
        return 0;
}

/*
 * CONFIG_MEMCG=n variant: returns node_page_state() for @idx on the node
 * that lruvec_pgdat() reports for @lruvec.
 */
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                              enum node_stat_item idx)
{
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);

        return node_page_state(pgdat, idx);
}

/*
 * CONFIG_MEMCG=n variant: identical to lruvec_page_state() in this
 * configuration; returns node_page_state() for @idx on @lruvec's node.
 */
static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
                                                    enum node_stat_item idx)
{
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);

        return node_page_state(pgdat, idx);
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
{
}

/*
 * CONFIG_MEMCG=n variant: resolve the head page backing @p and apply @val
 * to stat @idx of that page's node via __mod_node_page_state().
 */
static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
                                           int val)
{
        __mod_node_page_state(page_pgdat(virt_to_head_page(p)), idx, val);
}

/*
 * CONFIG_MEMCG=n variant: resolve the head page backing @p and apply @val
 * to stat @idx of that page's node via mod_node_page_state().
 */
static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
                                         int val)
{
        mod_node_page_state(page_pgdat(virt_to_head_page(p)), idx, val);
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void count_memcg_events(struct mem_cgroup *memcg,
                                      enum vm_event_item idx,
                                      unsigned long count)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void __count_memcg_events(struct mem_cgroup *memcg,
                                        enum vm_event_item idx,
                                        unsigned long count)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void count_memcg_folio_events(struct folio *folio,
                enum vm_event_item idx, unsigned long nr)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void split_page_memcg(struct page *head, int old_order, int new_order)
{
}

/* CONFIG_MEMCG=n stub: always returns 0; *@total_scanned is not written. */
static inline
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                            gfp_t gfp_mask,
                                            unsigned long *total_scanned)
{
        return 0;
}
#endif /* CONFIG_MEMCG */

/*
 * Extended information for slab objects stored as an array in page->memcg_data
 * if MEMCG_DATA_OBJEXTS is set.
 *
 * Each member is present only when its config option is enabled:
 * objcg with CONFIG_MEMCG_KMEM, ref with CONFIG_MEM_ALLOC_PROFILING.
 * The structure is declared with 8-byte alignment.
 */
struct slabobj_ext {
#ifdef CONFIG_MEMCG_KMEM
        struct obj_cgroup *objcg;
#endif
#ifdef CONFIG_MEM_ALLOC_PROFILING
        union codetag_ref ref;
#endif
} __aligned(8);

/* Add 1 to stat @idx for object @p via __mod_lruvec_kmem_state(). */
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
{
        __mod_lruvec_kmem_state(p, idx, 1);
}

/* Subtract 1 from stat @idx for object @p via __mod_lruvec_kmem_state(). */
static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx)
{
        __mod_lruvec_kmem_state(p, idx, -1);
}

/*
 * Return the lruvec of @lruvec's parent memcg on the same node, or NULL when
 * @lruvec has no memcg or that memcg has no parent.
 */
static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
{
        struct mem_cgroup *self = lruvec_memcg(lruvec);
        struct mem_cgroup *parent = self ? parent_mem_cgroup(self) : NULL;

        return parent ? mem_cgroup_lruvec(parent, lruvec_pgdat(lruvec)) : NULL;
}

/* Release @lruvec->lru_lock with spin_unlock(). */
static inline void unlock_page_lruvec(struct lruvec *lruvec)
{
        spin_unlock(&lruvec->lru_lock);
}

/* Release @lruvec->lru_lock with spin_unlock_irq(). */
static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
{
        spin_unlock_irq(&lruvec->lru_lock);
}

/*
 * Release @lruvec->lru_lock with spin_unlock_irqrestore(), passing @flags
 * as the interrupt state to restore.
 */
static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
                unsigned long flags)
{
        spin_unlock_irqrestore(&lruvec->lru_lock, flags);
}

/* Test requires a stable page->memcg binding, see page_memcg() */
static inline bool folio_matches_lruvec(struct folio *folio,
                struct lruvec *lruvec)
{
        return lruvec_pgdat(lruvec) == folio_pgdat(folio) &&
               lruvec_memcg(lruvec) == folio_memcg(folio);
}

/*
 * Make sure the lruvec of @folio is locked (IRQ-disabling variant).
 *
 * If @locked_lruvec already matches @folio it is returned as is, without
 * locking again. Otherwise any currently held lruvec is unlocked first and
 * the folio's own lruvec is locked and returned.
 */
static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio,
                struct lruvec *locked_lruvec)
{
        if (!locked_lruvec)
                return folio_lruvec_lock_irq(folio);

        if (folio_matches_lruvec(folio, locked_lruvec))
                return locked_lruvec;

        unlock_page_lruvec_irq(locked_lruvec);
        return folio_lruvec_lock_irq(folio);
}

/*
 * Make sure the lruvec of @folio is locked (irqsave variant).
 *
 * If *@lruvecp already matches @folio nothing is done. Otherwise any lruvec
 * currently held in *@lruvecp is unlocked, restoring *@flags, and the folio's
 * own lruvec is locked; *@lruvecp and *@flags are updated accordingly.
 */
static inline void folio_lruvec_relock_irqsave(struct folio *folio,
                struct lruvec **lruvecp, unsigned long *flags)
{
        struct lruvec *held = *lruvecp;

        if (held && folio_matches_lruvec(folio, held))
                return;

        if (held)
                unlock_page_lruvec_irqrestore(held, *flags);

        *lruvecp = folio_lruvec_lock_irqsave(folio, flags);
}

#ifdef CONFIG_CGROUP_WRITEBACK

/* Writeback helpers, CONFIG_CGROUP_WRITEBACK builds; only declared here. */
struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
                         unsigned long *pheadroom, unsigned long *pdirty,
                         unsigned long *pwriteback);

/* Out-of-line slow path called by mem_cgroup_track_foreign_dirty(). */
void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
                                             struct bdi_writeback *wb);

/*
 * Hand @folio to the foreign-dirty slow path when it has a memcg whose css
 * differs from @wb->memcg_css. Does nothing when memcg is disabled, the folio
 * has no memcg, or the two css pointers match.
 */
static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
                                                  struct bdi_writeback *wb)
{
        struct mem_cgroup *owner;

        if (mem_cgroup_disabled())
                return;

        owner = folio_memcg(folio);
        if (likely(!owner || &owner->css == wb->memcg_css))
                return;

        mem_cgroup_track_foreign_dirty_slowpath(folio, wb);
}

/* Only declared here; the !CONFIG_CGROUP_WRITEBACK variant is an empty inline. */
void mem_cgroup_flush_foreign(struct bdi_writeback *wb);

#else        /* CONFIG_CGROUP_WRITEBACK */

/* CONFIG_CGROUP_WRITEBACK=n stub: always returns NULL. */
static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
{
        return NULL;
}

/* CONFIG_CGROUP_WRITEBACK=n stub: does nothing; no output is written. */
static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
                                       unsigned long *pfilepages,
                                       unsigned long *pheadroom,
                                       unsigned long *pdirty,
                                       unsigned long *pwriteback)
{
}

/* CONFIG_CGROUP_WRITEBACK=n stub: does nothing. */
static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
                                                  struct bdi_writeback *wb)
{
}

/* CONFIG_CGROUP_WRITEBACK=n stub: does nothing. */
static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
{
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

/* Forward declaration: only pointers to struct sock are used below. */
struct sock;
/* Socket-memory charge/uncharge; declared regardless of CONFIG_MEMCG. */
bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
                             gfp_t gfp_mask);
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
#ifdef CONFIG_MEMCG
/* Static key tested by mem_cgroup_sockets_enabled; defined elsewhere. */
extern struct static_key_false memcg_sockets_enabled_key;
/* Evaluates the static key as an unlikely branch. */
#define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)
/* Per-socket alloc/free hooks; only declared here. */
void mem_cgroup_sk_alloc(struct sock *sk);
void mem_cgroup_sk_free(struct sock *sk);
/*
 * Report whether @memcg is under socket memory pressure.
 *
 * When the memory controller is not on the default hierarchy, this is just
 * @memcg->tcpmem_pressure. Otherwise @memcg and each of its ancestors are
 * checked in turn, and true is returned as soon as one of them has a
 * socket_pressure timestamp that jiffies has not reached yet.
 */
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
        struct mem_cgroup *pos = memcg;

        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return memcg->tcpmem_pressure != 0;

        for (;;) {
                if (time_before(jiffies, READ_ONCE(pos->socket_pressure)))
                        return true;

                pos = parent_mem_cgroup(pos);
                if (!pos)
                        return false;
        }
}

/* Shrinker bookkeeping for a memcg; only declared here. */
int alloc_shrinker_info(struct mem_cgroup *memcg);
void free_shrinker_info(struct mem_cgroup *memcg);
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
void reparent_shrinker_deferred(struct mem_cgroup *memcg);
#else
/* CONFIG_MEMCG=n: socket accounting is compiled out, so this is constant 0. */
#define mem_cgroup_sockets_enabled 0
/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_sk_alloc(struct sock *sk) { }
/* CONFIG_MEMCG=n stub: does nothing. */
static inline void mem_cgroup_sk_free(struct sock *sk) { }
/* CONFIG_MEMCG=n stub: always returns false. */
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
        return false;
}

/* CONFIG_MEMCG=n stub: does nothing. */
static inline void set_shrinker_bit(struct mem_cgroup *memcg,
                                    int nid, int shrinker_id)
{
}
#endif

#ifdef CONFIG_MEMCG_KMEM
/* CONFIG_MEMCG_KMEM entry points; only declared here. */
bool mem_cgroup_kmem_disabled(void);
/*
 * Out-of-line page charge/uncharge. The memcg_kmem_charge_page() and
 * memcg_kmem_uncharge_page() inline wrappers in this header call them only
 * when memcg_kmem_online() is true.
 */
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
void __memcg_kmem_uncharge_page(struct page *page, int order);

/*
 * The returned objcg pointer is safe to use without additional
 * protection within a scope. The scope is defined either by
 * the current task (similar to the "current" global variable)
 * or by a set_active_memcg() pair.
 * Use obj_cgroup_get() to take a reference if the pointer
 * needs to be used outside of the local scope.
 */
struct obj_cgroup *current_obj_cgroup(void);
/* Only declared here; the !CONFIG_MEMCG_KMEM variant always returns NULL. */
struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio);

static inline struct obj_cgroup *get_obj_cgroup_from_current(void)
{
        struct obj_cgroup *objcg = current_obj_cgroup();

        if (objcg)
                obj_cgroup_get(objcg);

        return objcg;
}

/* Byte-sized charge/uncharge against an objcg; only declared here. */
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);

/* Static key backing memcg_bpf_enabled(); defined elsewhere. */
extern struct static_key_false memcg_bpf_enabled_key;
/* Test memcg_bpf_enabled_key as a likely-taken static branch. */
static inline bool memcg_bpf_enabled(void)
{
        return static_branch_likely(&memcg_bpf_enabled_key);
}

/* Static key backing memcg_kmem_online(); defined elsewhere. */
extern struct static_key_false memcg_kmem_online_key;

/* Test memcg_kmem_online_key as a likely-taken static branch. */
static inline bool memcg_kmem_online(void)
{
        return static_branch_likely(&memcg_kmem_online_key);
}

/*
 * Charge @page through __memcg_kmem_charge_page() when memcg_kmem_online()
 * is true and return its result; otherwise return 0 without charging.
 */
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                         int order)
{
        if (!memcg_kmem_online())
                return 0;

        return __memcg_kmem_charge_page(page, gfp, order);
}

/*
 * Uncharge @page through __memcg_kmem_uncharge_page() when
 * memcg_kmem_online() is true; otherwise do nothing.
 */
static inline void memcg_kmem_uncharge_page(struct page *page, int order)
{
        if (!memcg_kmem_online())
                return;

        __memcg_kmem_uncharge_page(page, order);
}

/*
 * A helper for accessing memcg's kmem_id, used for getting
 * corresponding LRU lists.
 *
 * Returns @memcg->kmemcg_id, or -1 when @memcg is NULL.
 */
static inline int memcg_kmem_id(struct mem_cgroup *memcg)
{
        if (!memcg)
                return -1;

        return memcg->kmemcg_id;
}

/* Object-pointer to memcg lookups; only declared here. */
struct mem_cgroup *mem_cgroup_from_obj(void *p);
struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);

/*
 * Count one occurrence of event @idx against the memcg that
 * obj_cgroup_memcg() reports for @objcg. Does nothing unless
 * memcg_kmem_online() is true. The lookup and the update happen under the
 * RCU read lock.
 */
static inline void count_objcg_event(struct obj_cgroup *objcg,
                                     enum vm_event_item idx)
{
        if (!memcg_kmem_online())
                return;

        rcu_read_lock();
        count_memcg_events(obj_cgroup_memcg(objcg), idx, 1);
        rcu_read_unlock();
}

#else
/* CONFIG_MEMCG_KMEM=n stub: always returns true. */
static inline bool mem_cgroup_kmem_disabled(void)
{
        return true;
}

/* CONFIG_MEMCG_KMEM=n stub: nothing is charged; always returns 0. */
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                         int order)
{
        return 0;
}

/* CONFIG_MEMCG_KMEM=n stub: does nothing. */
static inline void memcg_kmem_uncharge_page(struct page *page, int order)
{
}

/* CONFIG_MEMCG_KMEM=n stub: nothing is charged; always returns 0. */
static inline int __memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                           int order)
{
        return 0;
}

/* CONFIG_MEMCG_KMEM=n stub: does nothing. */
static inline void __memcg_kmem_uncharge_page(struct page *page, int order)
{
}

/* CONFIG_MEMCG_KMEM=n stub: always returns NULL. */
static inline struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
{
        return NULL;
}

/* CONFIG_MEMCG_KMEM=n stub: always returns false. */
static inline bool memcg_bpf_enabled(void)
{
        return false;
}

/* CONFIG_MEMCG_KMEM=n stub: always returns false. */
static inline bool memcg_kmem_online(void)
{
        return false;
}

/* CONFIG_MEMCG_KMEM=n stub: always returns -1. */
static inline int memcg_kmem_id(struct mem_cgroup *memcg)
{
        return -1;
}

/* CONFIG_MEMCG_KMEM=n stub: always returns NULL. */
static inline struct mem_cgroup *mem_cgroup_from_obj(void *p)
{
        return NULL;
}

/* CONFIG_MEMCG_KMEM=n stub: always returns NULL. */
static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
{
        return NULL;
}

/* CONFIG_MEMCG_KMEM=n stub: does nothing. */
static inline void count_objcg_event(struct obj_cgroup *objcg,
                                     enum vm_event_item idx)
{
}

#endif /* CONFIG_MEMCG_KMEM */

#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
/*
 * zswap accounting hooks, available when both CONFIG_MEMCG_KMEM and
 * CONFIG_ZSWAP are enabled; only declared here.
 */
bool obj_cgroup_may_zswap(struct obj_cgroup *objcg);
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size);
void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size);
bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg);
#else
/*
 * Fallback when CONFIG_MEMCG_KMEM and CONFIG_ZSWAP are not both enabled:
 * always returns true.
 */
static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
{
        return true;
}
/*
 * Fallback when CONFIG_MEMCG_KMEM and CONFIG_ZSWAP are not both enabled:
 * does nothing.
 */
static inline void obj_cgroup_charge_zswap(struct obj_cgroup *objcg,
                                           size_t size)
{
}
/*
 * Fallback when CONFIG_MEMCG_KMEM and CONFIG_ZSWAP are not both enabled:
 * does nothing.
 */
static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg,
                                             size_t size)
{
}
/*
 * Fallback when CONFIG_MEMCG_KMEM and CONFIG_ZSWAP are not both enabled:
 * always returns true.
 */
static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
{
        /* if zswap is disabled, do not block pages going to the swapping device */
        return true;
}
#endif

#endif /* _LINUX_MEMCONTROL_H */
























































































































































































































    1 




































































































































    1 

















































































































































































































































































































































































































































































































































































































































































































































    1 













    1 



















    1 






























































    1 











    1 







    1 















    1 

























    1 













    1 

    1 





































































    1 

















    1 























































    1 






















    1 







    1 

    1 


    1 










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



    1 






























































    1 




































































































































































































































































































































































































































































































































    1 




    1 

    1 




    1 






    1 












    1 




    1 

























































































































































































































































































































































































































































































































    1 








    1 





















































































































    1 





    1 







    1 









    1 
































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001-2003 Intel Corp.
 * Copyright (c) 2001-2002 Nokia, Inc.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions interface with the sockets layer to implement the
 * SCTP Extensions for the Sockets API.
 *
 * Note that the descriptions from the specification are USER level
 * functions--this file is the functions which populate the struct proto
 * for SCTP which is the BOTTOM of the sockets interface.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Narasimha Budihal     <narsi@refcode.org>
 *    Karl Knutson          <karl@athena.chicago.il.us>
 *    Jon Grimm             <jgrimm@us.ibm.com>
 *    Xingang Guo           <xingang.guo@intel.com>
 *    Daisy Chang           <daisyc@us.ibm.com>
 *    Sridhar Samudrala     <samudrala@us.ibm.com>
 *    Inaky Perez-Gonzalez  <inaky.gonzalez@intel.com>
 *    Ardelle Fan            <ardelle.fan@intel.com>
 *    Ryan Layer            <rmlayer@us.ibm.com>
 *    Anup Pemmaiah         <pemmaiah@cc.usu.edu>
 *    Kevin Gao             <kevin.gao@intel.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <crypto/hash.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/time.h>
#include <linux/sched/signal.h>
#include <linux/ip.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/rhashtable.h>

#include <net/ip.h>
#include <net/icmp.h>
#include <net/route.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/busy_poll.h>
#include <trace/events/sock.h>

#include <linux/socket.h> /* for sa_family_t */
#include <linux/export.h>
#include <net/sock.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <net/sctp/stream_sched.h>
#include <net/rps.h>

/* Forward declarations for internal helper functions.
 *
 * Every prototype here is static, so each helper is private to this file;
 * declaring them up front lets code call them ahead of their definitions.
 */
static bool sctp_writeable(const struct sock *sk);
static void sctp_wfree(struct sk_buff *skb);
static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
                                size_t msg_len);
static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p);
static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p);
static int sctp_wait_for_accept(struct sock *sk, long timeo);
static void sctp_wait_for_close(struct sock *sk, long timeo);
static void sctp_destruct_sock(struct sock *sk);
static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt,
                                        union sctp_addr *addr, int len);
static int sctp_bindx_add(struct sock *, struct sockaddr *, int);
static int sctp_bindx_rem(struct sock *, struct sockaddr *, int);
static int sctp_send_asconf_add_ip(struct sock *, struct sockaddr *, int);
static int sctp_send_asconf_del_ip(struct sock *, struct sockaddr *, int);
static int sctp_send_asconf(struct sctp_association *asoc,
                            struct sctp_chunk *chunk);
static int sctp_do_bind(struct sock *, union sctp_addr *, int);
static int sctp_autobind(struct sock *sk);
static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
                             struct sctp_association *assoc,
                             enum sctp_socket_type type);

/* File-scope memory accounting state.
 *
 * NOTE(review): these look like the backing objects for the memory
 * accounting fields of SCTP's struct proto -- confirm where the proto
 * structures are defined; that code is not in this part of the file.
 */
/* Set to 1 by sctp_enter_memory_pressure(). */
static unsigned long sctp_memory_pressure;
static atomic_long_t sctp_memory_allocated;
static DEFINE_PER_CPU(int, sctp_memory_per_cpu_fw_alloc);
/* Not static: this counter has external linkage. */
struct percpu_counter sctp_sockets_allocated;

/* Record that SCTP is under memory pressure by storing 1 into the
 * file-scope sctp_memory_pressure flag.  The store goes through
 * WRITE_ONCE().  @sk is accepted but not used by this function.
 */
static void sctp_enter_memory_pressure(struct sock *sk)
{
        WRITE_ONCE(sctp_memory_pressure, 1);
}


/* Report the send-buffer space currently available to @asoc.
 *
 * If the endpoint has a non-zero sndbuf_policy the figure is computed for
 * this association alone (the socket's sk_sndbuf minus the association's
 * sndbuf_used); otherwise the socket-wide sk_stream_wspace() value is used.
 */
static inline int sctp_wspace(struct sctp_association *asoc)
{
        struct sock *sk = asoc->base.sk;

        if (asoc->ep->sndbuf_policy)
                return sk->sk_sndbuf - asoc->sndbuf_used;

        return sk_stream_wspace(sk);
}

/* Increment the used sndbuf space count of the corresponding association by
 * the size of the outgoing data chunk.
 * Also, set the skb destructor for sndbuf accounting later.
 *
 * Since it is always 1-1 between chunk and skb, and also a new skb is always
 * allocated for chunk bundling in sctp_packet_transmit(), we can use the
 * destructor in the data chunk skb for the purpose of the sndbuf space
 * tracking.
 */
static inline void sctp_set_owner_w(struct sctp_chunk *chunk)
{
        struct sctp_association *asoc = chunk->asoc;
        struct sock *sk = asoc->base.sk;

        /* The sndbuf space is tracked per association.  */
        sctp_association_hold(asoc);

        if (chunk->shkey)
                sctp_auth_shkey_hold(chunk->shkey);

        skb_set_owner_w(chunk->skb, sk);

        chunk->skb->destructor = sctp_wfree;
        /* Save the chunk pointer in skb for sctp_wfree to use later.  */
        skb_shinfo(chunk->skb)->destructor_arg = chunk;

        refcount_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc);
        asoc->sndbuf_used += chunk->skb->truesize + sizeof(struct sctp_chunk);
        sk_wmem_queued_add(sk, chunk->skb->truesize + sizeof(struct sctp_chunk));
        sk_mem_charge(sk, chunk->skb->truesize);
}

/* Pass the chunk's skb to skb_orphan().
 *
 * The signature, void (*)(struct sctp_chunk *), matches the cb parameter of
 * sctp_for_each_tx_datachunk().
 */
static void sctp_clear_owner_w(struct sctp_chunk *chunk)
{
        skb_orphan(chunk->skb);
}

/* Loop body shared by every list walk in sctp_for_each_tx_datachunk().
 *
 * The macro is not hygienic: the expansion site must provide variables
 * named chunk, c, msg, prev_msg, asoc, clear and cb.
 *
 * For the datamsg that owns the current chunk, it walks msg->chunks and
 * calls cb() on each fragment whose skb->sk is the association's socket
 * (when clear is true) or is not the association's socket (when clear is
 * false).  A msg equal to prev_msg is skipped, so a run of consecutive
 * chunks that belong to one message is processed only once.
 *
 * The "continue" binds to the enclosing do { } while (0), not to the
 * caller's loop: it jumps to the while (0) test and so leaves the macro
 * body early, without updating prev_msg.
 */
#define traverse_and_process()        \
do {                                \
        msg = chunk->msg;        \
        if (msg == prev_msg)        \
                continue;        \
        list_for_each_entry(c, &msg->chunks, frag_list) {        \
                if ((clear && asoc->base.sk == c->skb->sk) ||        \
                    (!clear && asoc->base.sk != c->skb->sk))        \
                        cb(c);        \
        }                        \
        prev_msg = msg;                \
} while (0)

/* Invoke @cb on the transmit-side data chunks of @asoc.
 *
 * The walk visits, in order: the transmitted list of every transport in
 * asoc->peer.transport_addr_list, then the outqueue's retransmit, sacked,
 * abandoned and out_chunk_list lists.  Each chunk found is expanded, via
 * traverse_and_process(), to all fragments of its datamsg.
 *
 * @clear selects which fragments get the callback: when true, those whose
 * skb->sk equals asoc->base.sk; when false, those whose skb->sk differs.
 *
 * prev_msg is shared by all the loops, so the "same message as the previous
 * chunk" de-duplication also applies across list boundaries.
 */
static void sctp_for_each_tx_datachunk(struct sctp_association *asoc,
                                       bool clear,
                                       void (*cb)(struct sctp_chunk *))

{
        /* These names are referenced directly by traverse_and_process(). */
        struct sctp_datamsg *msg, *prev_msg = NULL;
        struct sctp_outq *q = &asoc->outqueue;
        struct sctp_chunk *chunk, *c;
        struct sctp_transport *t;

        list_for_each_entry(t, &asoc->peer.transport_addr_list, transports)
                list_for_each_entry(chunk, &t->transmitted, transmitted_list)
                        traverse_and_process();

        list_for_each_entry(chunk, &q->retransmit, transmitted_list)
                traverse_and_process();

        list_for_each_entry(chunk, &q->sacked, transmitted_list)
                traverse_and_process();

        list_for_each_entry(chunk, &q->abandoned, transmitted_list)
                traverse_and_process();

        /* out_chunk_list is linked through ->list, not ->transmitted_list. */
        list_for_each_entry(chunk, &q->out_chunk_list, list)
                traverse_and_process();
}

static void sctp_for_each_rx_skb(struct sctp_association *asoc, struct sock *sk,
                                 void (*cb)(struct sk_buff *, struct sock *))

{
        struct sk_buff *skb, *tmp;

        sctp_skb_for_each(skb, &asoc->ulpq.lobby, tmp)
                cb(skb, sk);

        sctp_skb_for_each(skb, &asoc->ulpq.reasm, tmp)
                cb(skb, sk);

        sctp_skb_for_each(skb, &asoc->ulpq.reasm_uo, tmp)
                cb(skb, sk);
}

/* Verify that this is a valid address. */
static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr,
                                   int len)
{
        struct sctp_af *af;

        /* Verify basic sockaddr. */
        af = sctp_sockaddr_af(sctp_sk(sk), addr, len);
        if (!af)
                return -EINVAL;

        /* Is this a valid SCTP address?  */
        if (!af->addr_valid(addr, sctp_sk(sk), NULL))
                return -EINVAL;

        if (!sctp_sk(sk)->pf->send_verify(sctp_sk(sk), (addr)))
                return -EINVAL;

        return 0;
}

/* Map an association id to its association on socket @sk.
 *
 * UDP-style sockets: ids less than or equal to SCTP_ALL_ASSOC yield NULL.
 * Otherwise the id is looked up in sctp_assocs_id under
 * sctp_assocs_id_lock, and the result is discarded if the association
 * belongs to a different socket or is marked dead.
 *
 * All other socket styles: @id is ignored.  The socket must be in the
 * ESTABLISHED or CLOSING state (otherwise it may be a TCP-style listening
 * socket, or one that has not connected yet) and the first association on
 * the endpoint's list, if any, is returned.
 *
 * Returns the association, or NULL if none qualifies.
 */
struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id)
{
        struct sctp_association *asoc;

        if (sctp_style(sk, UDP)) {
                if (id <= SCTP_ALL_ASSOC)
                        return NULL;

                spin_lock_bh(&sctp_assocs_id_lock);
                asoc = idr_find(&sctp_assocs_id, (int)id);
                if (asoc && (asoc->base.sk != sk || asoc->base.dead))
                        asoc = NULL;
                spin_unlock_bh(&sctp_assocs_id_lock);

                return asoc;
        }

        /* Not UDP-style: the id plays no part in the lookup. */
        if (!sctp_sstate(sk, ESTABLISHED) && !sctp_sstate(sk, CLOSING))
                return NULL;

        if (list_empty(&sctp_sk(sk)->ep->asocs))
                return NULL;

        /* The first, and only, association on the endpoint. */
        return list_entry(sctp_sk(sk)->ep->asocs.next,
                          struct sctp_association, asocs);
}

/* Find the transport for a peer address, optionally cross-checked against
 * an association id.
 *
 * The address must belong to a known address family, pass
 * sctp_verify_addr(), and match an association on this socket's endpoint.
 * If @id also resolves to an association through sctp_id2assoc(), it has to
 * be that same association.  An @id that resolves to nothing imposes no
 * extra constraint.
 *
 * Returns the matching transport, or NULL on any failure.
 */
static struct sctp_transport *sctp_addr_id2transport(struct sock *sk,
                                              struct sockaddr_storage *addr,
                                              sctp_assoc_t id)
{
        union sctp_addr *uaddr = (union sctp_addr *)addr;
        struct sctp_association *by_addr, *by_id;
        struct sctp_transport *found;
        struct sctp_af *af;

        af = sctp_get_af_specific(addr->ss_family);
        if (!af)
                return NULL;

        if (sctp_verify_addr(sk, uaddr, af->sockaddr_len))
                return NULL;

        by_addr = sctp_endpoint_lookup_assoc(sctp_sk(sk)->ep, uaddr, &found);
        if (!by_addr)
                return NULL;

        by_id = sctp_id2assoc(sk, id);
        if (by_id && by_id != by_addr)
                return NULL;

        /* Run the protocol family's addr_to_user() hook on the caller's
         * address buffer before handing back the transport.
         */
        sctp_get_pf_specific(sk->sk_family)->addr_to_user(sctp_sk(sk), uaddr);

        return found;
}

/* API 3.1.2 bind() - UDP Style Syntax
 *
 * User-level form:
 *
 *   ret = bind(int sd, struct sockaddr *addr, int addrlen);
 *
 *   sd       - the socket descriptor returned by socket().
 *   addr     - the address structure (struct sockaddr_in or struct
 *              sockaddr_in6 [RFC 2553]).
 *   addr_len - the size of the address structure.
 *
 * This handler takes the socket lock, refuses with -EINVAL if the endpoint
 * already has a bound port, and otherwise returns whatever sctp_do_bind()
 * returns.
 */
static int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len)
{
        int err;

        lock_sock(sk);

        pr_debug("%s: sk:%p, addr:%p, addr_len:%d\n", __func__, sk,
                 addr, addr_len);

        /* Binding twice is not allowed. */
        if (sctp_sk(sk)->ep->base.bind_addr.port)
                err = -EINVAL;
        else
                err = sctp_do_bind(sk, (union sctp_addr *)addr, addr_len);

        release_sock(sk);

        return err;
}

/* Forward declaration of a static helper; its definition is not in this
 * part of the file.
 */
static int sctp_get_port_local(struct sock *, union sctp_addr *);

/* Validate a sockaddr and return its address-family operations.
 *
 * The address is rejected (NULL is returned) when:
 *   - @len is smaller than a struct sockaddr,
 *   - the socket's protocol family does not support the address family,
 *   - it is AF_INET6 and shorter than SIN6_LEN_RFC2133,
 *   - it is a v4-mapped AF_INET6 address and AF_INET is not supported, or
 *   - @len is smaller than the family's own sockaddr_len.
 *
 * Otherwise the struct sctp_af for the address family is returned.
 */
static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt,
                                        union sctp_addr *addr, int len)
{
        struct sctp_af *family_ops;

        /* The family field may only be read once the minimum size is met. */
        if (len < sizeof (struct sockaddr))
                return NULL;

        if (!opt->pf->af_supported(addr->sa.sa_family, opt))
                return NULL;

        /* A v4-mapped address is really an AF_INET one, so AF_INET has to
         * be supported as well.
         */
        if (addr->sa.sa_family == AF_INET6 &&
            (len < SIN6_LEN_RFC2133 ||
             (ipv6_addr_v4mapped(&addr->v6.sin6_addr) &&
              !opt->pf->af_supported(AF_INET, opt))))
                return NULL;

        /* The family passed af_supported() above. */
        family_ops = sctp_get_af_specific(addr->sa.sa_family);

        return len < family_ops->sockaddr_len ? NULL : family_ops;
}

/* If the socket's network namespace has default_auto_asconf set, append
 * @sp to that namespace's auto_asconf_splist (under addr_wq_lock) and set
 * sp->do_auto_asconf.  Otherwise do nothing.
 */
static void sctp_auto_asconf_init(struct sctp_sock *sp)
{
        struct net *net = sock_net(&sp->inet.sk);

        if (!net->sctp.default_auto_asconf)
                return;

        spin_lock_bh(&net->sctp.addr_wq_lock);
        list_add_tail(&sp->auto_asconf_list, &net->sctp.auto_asconf_splist);
        spin_unlock_bh(&net->sctp.addr_wq_lock);

        sp->do_auto_asconf = 1;
}

/* Bind a local address either to an endpoint or to an association.
 *
 * @sk:   socket being bound
 * @addr: address to bind; its port field is rewritten with the port that
 *        is actually used before the port is reserved
 * @len:  length of @addr in bytes
 *
 * Returns 0 on success, or:
 *   -EINVAL        @addr fails sctp_sockaddr_af(), its port differs from
 *                  the port already bound, or the address is already in
 *                  the endpoint's bind address list
 *   -EADDRNOTAVAIL the protocol family's bind_verify() rejects @addr
 *   -EACCES        the port requires CAP_NET_BIND_SERVICE and the caller
 *                  lacks it in the namespace's user_ns
 *   -EADDRINUSE    sctp_get_port_local() could not reserve the port
 *   otherwise the error returned by sctp_add_bind_addr().
 */
static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
{
        struct net *net = sock_net(sk);
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_endpoint *ep = sp->ep;
        struct sctp_bind_addr *bp = &ep->base.bind_addr;
        struct sctp_af *af;
        unsigned short snum;
        int ret = 0;

        /* Common sockaddr verification. */
        af = sctp_sockaddr_af(sp, addr, len);
        if (!af) {
                pr_debug("%s: sk:%p, newaddr:%p, len:%d EINVAL\n",
                         __func__, sk, addr, len);
                return -EINVAL;
        }

        /* Requested port in host byte order; 0 means "no specific port". */
        snum = ntohs(addr->v4.sin_port);

        pr_debug("%s: sk:%p, new addr:%pISc, port:%d, new port:%d, len:%d\n",
                 __func__, sk, &addr->sa, bp->port, snum, len);

        /* PF specific bind() address verification. */
        if (!sp->pf->bind_verify(sp, addr))
                return -EADDRNOTAVAIL;

        /* We must either be unbound, or bind to the same port.
         * It's OK to allow 0 ports if we are already bound.
         * We'll just inherit an already bound port in this case.
         */
        if (bp->port) {
                if (!snum)
                        snum = bp->port;
                else if (snum != bp->port) {
                        pr_debug("%s: new port %d doesn't match existing port "
                                 "%d\n", __func__, snum, bp->port);
                        return -EINVAL;
                }
        }

        /* Ports that require the bind-service capability are refused to
         * callers without CAP_NET_BIND_SERVICE.
         */
        if (snum && inet_port_requires_bind_service(net, snum) &&
            !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
                return -EACCES;

        /* See if the address matches any of the addresses we may have
         * already bound before checking against other endpoints.
         */
        if (sctp_bind_addr_match(bp, addr, sp))
                return -EINVAL;

        /* Make sure we are allowed to bind here.
         * The function sctp_get_port_local() does duplicate address
         * detection.
         */
        addr->v4.sin_port = htons(snum);
        if (sctp_get_port_local(sk, addr))
                return -EADDRINUSE;

        /* Refresh ephemeral port.  This is the first bind on the endpoint,
         * so also run the auto-ASCONF setup for the socket.
         */
        if (!bp->port) {
                bp->port = inet_sk(sk)->inet_num;
                sctp_auto_asconf_init(sp);
        }

        /* Add the address to the bind address list.
         * Use GFP_ATOMIC since BHs will be disabled.
         */
        ret = sctp_add_bind_addr(bp, addr, af->sockaddr_len,
                                 SCTP_ADDR_SRC, GFP_ATOMIC);

        if (ret) {
                /* Undo the port reservation taken above. */
                sctp_put_port(sk);
                return ret;
        }
        /* Copy back into socket for getsockname() use. */
        inet_sk(sk)->inet_sport = htons(inet_sk(sk)->inet_num);
        sp->pf->to_sk_saddr(addr, sk);

        return ret;
}

 /* ADDIP Section 4.1.1 Congestion Control of ASCONF Chunks
 *
 * R1) One and only one ASCONF Chunk MAY be in transit and unacknowledged
 * at any one time.  If a sender, after sending an ASCONF chunk, decides
 * it needs to transfer another ASCONF Chunk, it MUST wait until the
 * ASCONF-ACK Chunk returns from the previous ASCONF Chunk before sending a
 * subsequent ASCONF. Note this restriction binds each side, so at any
 * time two ASCONF may be in-transit on any given association (one sent
 * from each endpoint).
 *
 * Returns 0 when @chunk was queued or handed to the ASCONF primitive, or
 * the primitive's error code, in which case @chunk is freed here.
 */
static int sctp_send_asconf(struct sctp_association *asoc,
                            struct sctp_chunk *chunk)
{
        int err;

        /* An ASCONF is still outstanding: park this one for later
         * transmission.
         */
        if (asoc->addip_last_asconf) {
                list_add_tail(&chunk->list, &asoc->addip_chunk_list);
                return 0;
        }

        /* Hold the chunk until an ASCONF_ACK is received. */
        sctp_chunk_hold(chunk);

        err = sctp_primitive_ASCONF(asoc->base.net, asoc, chunk);
        if (err) {
                sctp_chunk_free(chunk);
                return err;
        }

        /* Remember the chunk that is now in flight. */
        asoc->addip_last_asconf = chunk;
        return 0;
}

/* Bind every address of a packed address array to the local endpoint or
 * association.
 *
 * The addrs/addrcnt pair is walked entry by entry; the address family of
 * each entry decides its length, and each entry is passed to
 * sctp_do_bind().
 *
 * On the first failure, sctp_bindx_rem() is called on the entries that
 * precede the failing one and the error is returned.
 *
 * Only sctp_setsockopt_bindx() is supposed to call this function.
 */
static int sctp_bindx_add(struct sock *sk, struct sockaddr *addrs, int addrcnt)
{
        struct sockaddr *sa_addr;
        void *addr_buf = addrs;
        struct sctp_af *af;
        int retval = 0;
        int cnt;

        pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n", __func__, sk,
                 addrs, addrcnt);

        for (cnt = 0; cnt < addrcnt; cnt++) {
                /* Entries may be IPv4 or IPv6; the family gives the size
                 * needed to step to the next one.
                 */
                sa_addr = addr_buf;
                af = sctp_get_af_specific(sa_addr->sa_family);
                if (af) {
                        retval = sctp_do_bind(sk, (union sctp_addr *)sa_addr,
                                              af->sockaddr_len);
                        addr_buf += af->sockaddr_len;
                } else {
                        retval = -EINVAL;
                }

                if (retval < 0) {
                        /* Roll back the entries bound so far. */
                        if (cnt > 0)
                                sctp_bindx_rem(sk, addrs, cnt);
                        return retval;
                }
        }

        return retval;
}

/* Send an ASCONF chunk with Add IP address parameters to all the peers of the
 * associations that are part of the endpoint indicating that a list of local
 * addresses are added to the endpoint.
 *
 * If any of the addresses is already in the bind address list of the
 * association, we do not send the chunk for that association.  But it will not
 * affect other associations.
 *
 * @sk:      socket whose endpoint's associations are walked
 * @addrs:   packed array of sockaddr_in / sockaddr_in6 entries
 * @addrcnt: number of entries in @addrs
 *
 * Returns 0 immediately when ASCONF is not enabled on the endpoint,
 * -EINVAL when an entry has an unsupported address family, -ENOMEM when
 * the ASCONF chunk cannot be built, otherwise the result of
 * sctp_send_asconf() for the last association that was processed.
 *
 * Only sctp_setsockopt_bindx() is supposed to call this function.
 */
static int sctp_send_asconf_add_ip(struct sock                *sk,
                                   struct sockaddr        *addrs,
                                   int                         addrcnt)
{
        struct sctp_sock                *sp;
        struct sctp_endpoint                *ep;
        struct sctp_association                *asoc;
        struct sctp_bind_addr                *bp;
        struct sctp_chunk                *chunk;
        struct sctp_sockaddr_entry        *laddr;
        union sctp_addr                        *addr;
        union sctp_addr                        saveaddr;
        void                                *addr_buf;
        struct sctp_af                        *af;
        struct list_head                *p;
        int                                 i;
        int                                 retval = 0;

        sp = sctp_sk(sk);
        ep = sp->ep;

        /* Nothing to announce when ASCONF is disabled on this endpoint. */
        if (!ep->asconf_enable)
                return retval;

        pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n",
                 __func__, sk, addrs, addrcnt);

        list_for_each_entry(asoc, &ep->asocs, asocs) {
                /* Skip peers that are not ASCONF capable, that have
                 * Add-IP disabled, or that are not yet ESTABLISHED.
                 */
                if (!asoc->peer.asconf_capable)
                        continue;

                if (asoc->peer.addip_disabled_mask & SCTP_PARAM_ADD_IP)
                        continue;

                if (!sctp_state(asoc, ESTABLISHED))
                        continue;

                /* Check if any address in the packed array of addresses is
                 * in the bind address list of the association. If so,
                 * do not send the asconf chunk to its peer, but continue with
                 * other associations.
                 */
                addr_buf = addrs;
                for (i = 0; i < addrcnt; i++) {
                        addr = addr_buf;
                        af = sctp_get_af_specific(addr->v4.sin_family);
                        if (!af) {
                                retval = -EINVAL;
                                goto out;
                        }

                        if (sctp_assoc_lookup_laddr(asoc, addr))
                                break;

                        addr_buf += af->sockaddr_len;
                }
                /* The loop ended early: one address is already known. */
                if (i < addrcnt)
                        continue;

                /* Use the first valid address in bind addr list of
                 * association as Address Parameter of ASCONF CHUNK.
                 */
                bp = &asoc->base.bind_addr;
                p = bp->address_list.next;
                laddr = list_entry(p, struct sctp_sockaddr_entry, list);
                chunk = sctp_make_asconf_update_ip(asoc, &laddr->a, addrs,
                                                   addrcnt, SCTP_PARAM_ADD_IP);
                if (!chunk) {
                        retval = -ENOMEM;
                        goto out;
                }

                /* Add the new addresses to the bind address list with
                 * use_as_src set to 0.
                 *
                 * NOTE(review): the result of sctp_add_bind_addr() is
                 * overwritten by the next iteration and then by
                 * sctp_send_asconf() below, so a failure here is never
                 * reported to the caller.
                 * NOTE(review): only af->sockaddr_len bytes of saveaddr are
                 * written, but sizeof(saveaddr) is passed as the length -
                 * confirm the callee ignores the remaining bytes.
                 */
                addr_buf = addrs;
                for (i = 0; i < addrcnt; i++) {
                        addr = addr_buf;
                        af = sctp_get_af_specific(addr->v4.sin_family);
                        memcpy(&saveaddr, addr, af->sockaddr_len);
                        retval = sctp_add_bind_addr(bp, &saveaddr,
                                                    sizeof(saveaddr),
                                                    SCTP_ADDR_NEW, GFP_ATOMIC);
                        addr_buf += af->sockaddr_len;
                }
                /* src_out_of_asoc_ok is set: reset the congestion and RTO
                 * state of every transport and re-route it.
                 */
                if (asoc->src_out_of_asoc_ok) {
                        struct sctp_transport *trans;

                        list_for_each_entry(trans,
                            &asoc->peer.transport_addr_list, transports) {
                                trans->cwnd = min(4*asoc->pathmtu, max_t(__u32,
                                    2*asoc->pathmtu, 4380));
                                trans->ssthresh = asoc->peer.i.a_rwnd;
                                trans->rto = asoc->rto_initial;
                                sctp_max_rto(asoc, trans);
                                trans->rtt = trans->srtt = trans->rttvar = 0;
                                /* Clear the source and route cache */
                                sctp_transport_route(trans, NULL,
                                                     sctp_sk(asoc->base.sk));
                        }
                }
                /* Transmit now, or queue behind an outstanding ASCONF. */
                retval = sctp_send_asconf(asoc, chunk);
        }

out:
        return retval;
}

/* Remove a packed list of addresses from the endpoint's bind address list,
 * never removing the last address.
 *
 * Each entry of the addrs/addrcnt pair is checked (supported family, valid
 * address, port either zero or equal to the bound port) and then passed to
 * sctp_del_bind_addr().  An entry with a zero port has the bound port
 * written into it first.
 *
 * If the bind address list is empty or holds a single entry the call
 * fails with -EBUSY.  On any failure, sctp_bindx_add() is invoked on the
 * entries that precede the failing one to put them back.
 *
 * Only sctp_setsockopt_bindx() is supposed to call this function.
 */
static int sctp_bindx_rem(struct sock *sk, struct sockaddr *addrs, int addrcnt)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_bind_addr *bp = &sp->ep->base.bind_addr;
        union sctp_addr *sa_addr;
        void *addr_buf = addrs;
        struct sctp_af *af;
        int retval = 0;
        int cnt;

        pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n",
                 __func__, sk, addrs, addrcnt);

        for (cnt = 0; cnt < addrcnt; cnt++) {
                if (list_empty(&bp->address_list) ||
                    sctp_list_single_entry(&bp->address_list)) {
                        /* At least one bind address has to remain. */
                        retval = -EBUSY;
                } else {
                        sa_addr = addr_buf;
                        af = sctp_get_af_specific(sa_addr->sa.sa_family);

                        if (!af) {
                                retval = -EINVAL;
                        } else if (!af->addr_valid(sa_addr, sp, NULL)) {
                                retval = -EADDRNOTAVAIL;
                        } else if (sa_addr->v4.sin_port &&
                                   sa_addr->v4.sin_port != htons(bp->port)) {
                                retval = -EINVAL;
                        } else {
                                /* A zero port stands for the bound port. */
                                if (!sa_addr->v4.sin_port)
                                        sa_addr->v4.sin_port = htons(bp->port);

                                /* FIXME - There is probably a need to check
                                 * if sk->sk_saddr and sk->sk_rcv_addr are
                                 * currently set to one of the addresses to
                                 * be removed. This is something which needs
                                 * to be looked into when we are fixing the
                                 * outstanding issues with multi-homing
                                 * socket routing and failover schemes. Refer
                                 * to comments in sctp_do_bind(). -daisy
                                 */
                                retval = sctp_del_bind_addr(bp, sa_addr);

                                addr_buf += af->sockaddr_len;
                        }
                }

                if (retval < 0) {
                        /* Failed: add back the ones already removed. */
                        if (cnt > 0)
                                sctp_bindx_add(sk, addrs, cnt);
                        return retval;
                }
        }

        return retval;
}

/* Send an ASCONF chunk with Delete IP address parameters to all the peers of
 * the associations that are part of the endpoint indicating that a list of
 * local addresses are removed from the endpoint.
 *
 * If any of the addresses is NOT present in the bind address list of an
 * association, we do not send the chunk for that association.  But it will
 * not affect other associations.
 *
 * When a single address is being deleted and the association has no other
 * bind address left to use as the ASCONF Address Parameter, no chunk is
 * built: the address is saved in asoc->asconf_addr_del_pending and
 * asoc->src_out_of_asoc_ok is set instead.
 *
 * @sk:      socket whose endpoint's associations are walked
 * @addrs:   packed array of sockaddr_in / sockaddr_in6 entries
 * @addrcnt: number of entries in @addrs
 *
 * Returns 0 immediately when ASCONF is not enabled on the endpoint,
 * -EINVAL for an unsupported address family or when every bind address
 * would be deleted with @addrcnt > 1, -ENOMEM on allocation failure,
 * otherwise the result of the last sctp_send_asconf() call.
 *
 * Only sctp_setsockopt_bindx() is supposed to call this function.
 */
static int sctp_send_asconf_del_ip(struct sock                *sk,
                                   struct sockaddr        *addrs,
                                   int                        addrcnt)
{
        struct sctp_sock        *sp;
        struct sctp_endpoint        *ep;
        struct sctp_association        *asoc;
        struct sctp_transport        *transport;
        struct sctp_bind_addr        *bp;
        struct sctp_chunk        *chunk;
        union sctp_addr                *laddr;
        void                        *addr_buf;
        struct sctp_af                *af;
        struct sctp_sockaddr_entry *saddr;
        int                         i;
        int                         retval = 0;
        int                        stored;

        sp = sctp_sk(sk);
        ep = sp->ep;

        if (!ep->asconf_enable)
                return retval;

        pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n",
                 __func__, sk, addrs, addrcnt);

        list_for_each_entry(asoc, &ep->asocs, asocs) {
                /* "stored" and "chunk" describe this association only.
                 * They must be reset here: if "stored" carried over from
                 * an earlier association, a chunk built below would be
                 * neither sent nor freed.
                 */
                stored = 0;
                chunk = NULL;

                if (!asoc->peer.asconf_capable)
                        continue;

                if (asoc->peer.addip_disabled_mask & SCTP_PARAM_DEL_IP)
                        continue;

                if (!sctp_state(asoc, ESTABLISHED))
                        continue;

                /* Check if any address in the packed array of addresses is
                 * not present in the bind address list of the association.
                 * If so, do not send the asconf chunk to its peer, but
                 * continue with other associations.
                 */
                addr_buf = addrs;
                for (i = 0; i < addrcnt; i++) {
                        laddr = addr_buf;
                        af = sctp_get_af_specific(laddr->v4.sin_family);
                        if (!af) {
                                retval = -EINVAL;
                                goto out;
                        }

                        if (!sctp_assoc_lookup_laddr(asoc, laddr))
                                break;

                        addr_buf += af->sockaddr_len;
                }
                if (i < addrcnt)
                        continue;

                /* Find one address in the association's bind address list
                 * that is not in the packed array of addresses. This is to
                 * make sure that we do not delete all the addresses in the
                 * association.
                 */
                bp = &asoc->base.bind_addr;
                laddr = sctp_find_unmatch_addr(bp, (union sctp_addr *)addrs,
                                               addrcnt, sp);
                if ((laddr == NULL) && (addrcnt == 1)) {
                        /* A deletion is already pending for this asoc. */
                        if (asoc->asconf_addr_del_pending)
                                continue;
                        asoc->asconf_addr_del_pending =
                            kzalloc(sizeof(union sctp_addr), GFP_ATOMIC);
                        if (asoc->asconf_addr_del_pending == NULL) {
                                retval = -ENOMEM;
                                goto out;
                        }
                        asoc->asconf_addr_del_pending->sa.sa_family =
                                    addrs->sa_family;
                        asoc->asconf_addr_del_pending->v4.sin_port =
                                    htons(bp->port);
                        if (addrs->sa_family == AF_INET) {
                                struct sockaddr_in *sin;

                                sin = (struct sockaddr_in *)addrs;
                                asoc->asconf_addr_del_pending->v4.sin_addr.s_addr = sin->sin_addr.s_addr;
                        } else if (addrs->sa_family == AF_INET6) {
                                struct sockaddr_in6 *sin6;

                                sin6 = (struct sockaddr_in6 *)addrs;
                                asoc->asconf_addr_del_pending->v6.sin6_addr = sin6->sin6_addr;
                        }

                        pr_debug("%s: keep the last address asoc:%p %pISc at %p\n",
                                 __func__, asoc, &asoc->asconf_addr_del_pending->sa,
                                 asoc->asconf_addr_del_pending);

                        asoc->src_out_of_asoc_ok = 1;
                        stored = 1;
                        goto skip_mkasconf;
                }

                if (laddr == NULL)
                        return -EINVAL;

                /* We do not need RCU protection throughout this loop
                 * because this is done under a socket lock from the
                 * setsockopt call.
                 */
                chunk = sctp_make_asconf_update_ip(asoc, laddr, addrs, addrcnt,
                                                   SCTP_PARAM_DEL_IP);
                if (!chunk) {
                        retval = -ENOMEM;
                        goto out;
                }

skip_mkasconf:
                /* Reset use_as_src flag for the addresses in the bind address
                 * list that are to be deleted.
                 */
                addr_buf = addrs;
                for (i = 0; i < addrcnt; i++) {
                        laddr = addr_buf;
                        af = sctp_get_af_specific(laddr->v4.sin_family);
                        list_for_each_entry(saddr, &bp->address_list, list) {
                                if (sctp_cmp_addr_exact(&saddr->a, laddr))
                                        saddr->state = SCTP_ADDR_DEL;
                        }
                        addr_buf += af->sockaddr_len;
                }

                /* Update the route and saddr entries for all the transports
                 * as some of the addresses in the bind address list are
                 * about to be deleted and cannot be used as source addresses.
                 */
                list_for_each_entry(transport, &asoc->peer.transport_addr_list,
                                        transports) {
                        sctp_transport_route(transport, NULL,
                                             sctp_sk(asoc->base.sk));
                }

                if (stored)
                        /* We don't need to transmit ASCONF */
                        continue;
                retval = sctp_send_asconf(asoc, chunk);
        }
out:
        return retval;
}

/* Propagate an address event to the associations of the endpoint.
 * ep and addr_wq must be locked.
 *
 * The endpoint's bound port is written into @addrw->a before the address
 * is verified.  An entry in state SCTP_ADDR_NEW triggers an Add-IP ASCONF;
 * any other state triggers a Delete-IP ASCONF.  Returns -EINVAL when the
 * address family is unsupported or the address fails verification.
 */
int sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw)
{
        union sctp_addr *addr = &addrw->a;
        struct sock *sk = sctp_opt2sk(sp);
        struct sctp_af *af;

        /* It is safe to write port space in caller. */
        addr->v4.sin_port = htons(sp->ep->base.bind_addr.port);

        af = sctp_get_af_specific(addr->sa.sa_family);
        if (!af || sctp_verify_addr(sk, addr, af->sockaddr_len))
                return -EINVAL;

        if (addrw->state != SCTP_ADDR_NEW)
                return sctp_send_asconf_del_ip(sk, (struct sockaddr *)addr, 1);

        return sctp_send_asconf_add_ip(sk, (struct sockaddr *)addr, 1);
}

/* Helper for tunneling sctp_bindx() requests through sctp_setsockopt()
 *
 * API 8.1
 * int sctp_bindx(int sd, struct sockaddr *addrs, int addrcnt,
 *                int flags);
 *
 * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses.
 * If the sd is an IPv6 socket, the addresses passed can either be IPv4
 * or IPv6 addresses.
 *
 * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see
 * Section 3.1.2 for this usage.
 *
 * addrs points to a "packed array" of one or more socket addresses: each
 * entry is a struct sockaddr_in or struct sockaddr_in6, and the address
 * family of an entry is what determines its length.  At the API level the
 * caller passes the number of entries as addrcnt.
 *
 * On success, sctp_bindx() returns 0. On failure, sctp_bindx() returns
 * -1, and sets errno to the appropriate error code.
 *
 * For SCTP, the port given in each socket address must be the same, or
 * sctp_bindx() will fail, setting errno to EINVAL.
 *
 * The flags parameter is one of:
 *
 * SCTP_BINDX_ADD_ADDR - add the given addresses to the association
 * SCTP_BINDX_REM_ADDR - remove the given addresses from the association
 *
 * The two flags are mutually exclusive; if both are given, sctp_bindx()
 * will fail with EINVAL.  A caller may not remove all addresses from an
 * association; the API text says such an attempt is rejected with EINVAL.
 *
 * An application can use sctp_bindx(SCTP_BINDX_ADD_ADDR) to associate
 * additional addresses with an endpoint after calling bind(), or
 * sctp_bindx(SCTP_BINDX_REM_ADDR) to remove some addresses a listening
 * socket is associated with, so that no new association accepted will be
 * associated with those addresses.  If the endpoint supports dynamic
 * address reconfiguration, either operation may cause the endpoint to send
 * the appropriate message to the peer to change the peer's address lists.
 *
 * Adding and removing addresses from a connected association is
 * optional functionality. Implementations that do not support this
 * functionality should return EOPNOTSUPP.
 *
 * This function counts the addresses in the buffer, then invokes either
 * sctp_bindx_add() or sctp_bindx_rem() on the sk, followed by the matching
 * sctp_send_asconf_*_ip() call.
 *
 * On exit there is no need to do sockfd_put(), sys_setsockopt() does
 * it.
 *
 * sk         The sk of the socket
 * addrs      The pointer to the addresses
 * addrs_size Size of the addrs buffer in bytes
 * op         Operation to perform (add or remove, see the flags of
 *            sctp_bindx)
 *
 * Returns 0 if ok, <0 errno code on error.
 */
static int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *addrs,
                                 int addrs_size, int op)
{
        struct sockaddr *sa_addr;
        struct sctp_af *af;
        int walk_size;
        int addrcnt = 0;
        int err;

        pr_debug("%s: sk:%p addrs:%p addrs_size:%d opt:%d\n",
                 __func__, sk, addrs, addrs_size, op);

        if (unlikely(addrs_size <= 0))
                return -EINVAL;

        /* Count the addresses, stepping by each entry's family-specific
         * length.
         */
        for (walk_size = 0; walk_size < addrs_size;
             walk_size += af->sockaddr_len) {
                /* The family field itself must fit in what is left. */
                if (walk_size + sizeof(sa_family_t) > addrs_size)
                        return -EINVAL;

                sa_addr = (void *)addrs + walk_size;
                af = sctp_get_af_specific(sa_addr->sa_family);

                /* Reject an unsupported family, or an entry that would
                 * run past the end of the buffer.
                 */
                if (!af || (walk_size + af->sockaddr_len) > addrs_size)
                        return -EINVAL;

                addrcnt++;
        }

        /* Do the work. */
        if (op == SCTP_BINDX_ADD_ADDR) {
                /* Allow security module to validate bindx addresses. */
                err = security_sctp_bind_connect(sk, SCTP_SOCKOPT_BINDX_ADD,
                                                 addrs, addrs_size);
                if (!err)
                        err = sctp_bindx_add(sk, addrs, addrcnt);
                if (err)
                        return err;

                return sctp_send_asconf_add_ip(sk, addrs, addrcnt);
        }

        if (op == SCTP_BINDX_REM_ADDR) {
                err = sctp_bindx_rem(sk, addrs, addrcnt);
                if (err)
                        return err;

                return sctp_send_asconf_del_ip(sk, addrs, addrcnt);
        }

        return -EINVAL;
}

/* Run a SCTP_BINDX_ADD_ADDR request on @sk with the socket lock held.
 * @addrs is a packed address array of @addrlen bytes.
 */
static int sctp_bind_add(struct sock *sk, struct sockaddr *addrs,
                int addrlen)
{
        int ret;

        lock_sock(sk);
        ret = sctp_setsockopt_bindx(sk, addrs, addrlen, SCTP_BINDX_ADD_ADDR);
        release_sock(sk);

        return ret;
}

static int sctp_connect_new_asoc(struct sctp_endpoint *ep,
                                 const union sctp_addr *daddr,
                                 const struct sctp_initmsg *init,
                                 struct sctp_transport **tp)
{
        struct sctp_association *asoc;
        struct sock *sk = ep->base.sk;
        struct net *net = sock_net(sk);
        enum sctp_scope scope;
        int err;

        if (sctp_endpoint_is_peeled_off(ep, daddr))
                return -EADDRNOTAVAIL;

        if (!ep->base.bind_addr.port) {
                if (sctp_autobind(sk))
                        return -EAGAIN;
        } else {
                if (inet_port_requires_bind_service(net, ep->base.bind_addr.port) &&
                    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
                        return -EACCES;
        }

        scope = sctp_scope(daddr);
        asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
        if (!asoc)
                return -ENOMEM;

        err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL);
        if (err < 0)
                goto free;

        *tp = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN);
        if (!*tp) {
                err = -ENOMEM;
                goto free;
        }

        if (!init)
                return 0;

        if (init->sinit_num_ostreams) {
                __u16 outcnt = init->sinit_num_ostreams;

                asoc->c.sinit_num_ostreams = outcnt;
                /* outcnt has been changed, need to re-init stream */
                err = sctp_stream_init(&asoc->stream, outcnt, 0, GFP_KERNEL);
                if (err)
                        goto free;
        }

        if (init->sinit_max_instreams)
                asoc->c.sinit_max_instreams = init->sinit_max_instreams;

        if (init->sinit_max_attempts)
                asoc->max_init_attempts = init->sinit_max_attempts;

        if (init->sinit_max_init_timeo)
                asoc->max_init_timeo =
                        msecs_to_jiffies(init->sinit_max_init_timeo);

        return 0;
free:
        sctp_association_free(asoc);
        return err;
}

/* Add @daddr as a further peer address of @asoc.
 *
 * Returns 0 on success, the error from sctp_verify_addr(), -EISCONN or
 * -EALREADY when a different association on the endpoint already uses
 * @daddr, -EADDRNOTAVAIL when sctp_endpoint_is_peeled_off() reports a
 * match for @daddr, or -ENOMEM when the peer cannot be added.
 */
static int sctp_connect_add_peer(struct sctp_association *asoc,
                                 union sctp_addr *daddr, int addr_len)
{
        struct sctp_endpoint *ep = asoc->ep;
        struct sctp_association *other;
        struct sctp_transport *t;
        int err;

        err = sctp_verify_addr(ep->base.sk, daddr, addr_len);
        if (err)
                return err;

        /* The address must not belong to another association. */
        other = sctp_endpoint_lookup_assoc(ep, daddr, &t);
        if (other && other != asoc) {
                if (other->state >= SCTP_STATE_ESTABLISHED)
                        return -EISCONN;
                return -EALREADY;
        }

        if (sctp_endpoint_is_peeled_off(ep, daddr))
                return -EADDRNOTAVAIL;

        t = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN);

        return t ? 0 : -ENOMEM;
}

/* __sctp_connect - common back end of connect() and sctp_connectx()
 *
 * @kaddrs holds @addrs_size bytes of packed socket addresses; connect()
 * comes in with just a single address.  The first address is used to
 * create the association; every further address is added to it as an
 * extra peer and must carry the association's peer port.  When @assoc_id
 * is non-NULL an association id is assigned before the ASSOCIATE
 * primitive is issued and is stored through the pointer once that
 * primitive has succeeded.
 *
 * Returns the result of sctp_wait_for_connect(), or a negative errno if
 * something failed earlier (a freshly created association is freed then).
 */
static int __sctp_connect(struct sock *sk, struct sockaddr *kaddrs,
                          int addrs_size, int flags, sctp_assoc_t *assoc_id)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_endpoint *ep = sp->ep;
        struct sctp_transport *transport;
        struct sctp_association *asoc;
        union sctp_addr *daddr;
        void *cursor = kaddrs;
        struct sctp_af *af;
        int consumed, err;
        long timeo;

        if (sctp_sstate(sk, ESTABLISHED) || sctp_sstate(sk, CLOSING))
                return -EISCONN;
        if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
                return -EISCONN;

        /* The first address decides whether a new association is needed. */
        daddr = cursor;
        af = sctp_get_af_specific(daddr->sa.sa_family);
        if (!af || af->sockaddr_len > addrs_size)
                return -EINVAL;

        err = sctp_verify_addr(sk, daddr, af->sockaddr_len);
        if (err)
                return err;

        asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport);
        if (asoc) {
                if (asoc->state >= SCTP_STATE_ESTABLISHED)
                        return -EISCONN;
                return -EALREADY;
        }

        err = sctp_connect_new_asoc(ep, daddr, NULL, &transport);
        if (err)
                return err;
        asoc = transport->asoc;

        /* Walk the remaining packed addresses, one sockaddr at a time. */
        consumed = af->sockaddr_len;
        cursor += af->sockaddr_len;
        while (consumed < addrs_size) {
                /* Malformed input below is reported as -EINVAL. */
                err = -EINVAL;

                /* The family field itself has to fit into what is left. */
                if (consumed + sizeof(sa_family_t) > addrs_size)
                        goto out_free;

                daddr = cursor;
                af = sctp_get_af_specific(daddr->sa.sa_family);
                if (!af || af->sockaddr_len + consumed > addrs_size)
                        goto out_free;

                /* All addresses must name the association's peer port. */
                if (asoc->peer.port != ntohs(daddr->v4.sin_port))
                        goto out_free;

                err = sctp_connect_add_peer(asoc, daddr, af->sockaddr_len);
                if (err)
                        goto out_free;

                cursor += af->sockaddr_len;
                consumed += af->sockaddr_len;
        }

        /* A sctp_connectx() caller that wants the association id back
         * needs one to be assigned before the handshake starts.
         */
        if (assoc_id) {
                err = sctp_assoc_set_id(asoc, GFP_KERNEL);
                if (err < 0)
                        goto out_free;
        }

        err = sctp_primitive_ASSOCIATE(sock_net(sk), asoc, NULL);
        if (err < 0)
                goto out_free;

        /* Record dport and daddr on the socket for getpeername(); daddr
         * is the last address processed above.
         */
        inet_sk(sk)->inet_dport = htons(asoc->peer.port);
        sp->pf->to_sk_daddr(daddr, sk);
        sk->sk_err = 0;

        if (assoc_id)
                *assoc_id = asoc->assoc_id;

        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
        return sctp_wait_for_connect(asoc, &timeo);

out_free:
        pr_debug("%s: took out_free path with asoc:%p kaddrs:%p err:%d\n",
                 __func__, asoc, kaddrs, err);
        sctp_association_free(asoc);
        return err;
}

/* Helper for tunneling sctp_connectx() requests through sctp_setsockopt()
 *
 * API 8.9
 * int sctp_connectx(int sd, struct sockaddr *addrs, int addrcnt,
 *                         sctp_assoc_t *asoc);
 *
 * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses.
 * If the sd is an IPv6 socket, the addresses passed can either be IPv4
 * or IPv6 addresses.
 *
 * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see
 * Section 3.1.2 for this usage.
 *
 * addrs is a pointer to an array of one or more socket addresses. Each
 * address is contained in its appropriate structure (i.e. struct
 * sockaddr_in or struct sockaddr_in6) the family of the address type
 * must be used to distinguish the address length (note that this
 * representation is termed a "packed array" of addresses). The caller
 * specifies the number of addresses in the array with addrcnt.
 *
 * On success, sctp_connectx() returns 0. It also sets the assoc_id to
 * the association id of the new association.  On failure, sctp_connectx()
 * returns -1, and sets errno to the appropriate error code.  The assoc_id
 * is not touched by the kernel.
 *
 * For SCTP, the port given in each socket address must be the same, or
 * sctp_connectx() will fail, setting errno to EINVAL.
 *
 * An application can use sctp_connectx to initiate an association with
 * an endpoint that is multi-homed.  Much like sctp_bindx() this call
 * allows a caller to specify multiple addresses at which a peer can be
 * reached.  The way the SCTP stack uses the list of addresses to set up
 * the association is implementation dependent.  This function only
 * specifies that the stack will try to make use of all the addresses in
 * the list when needed.
 *
 * Note that the list of addresses passed in is only used for setting up
 * the association.  It does not necessarily equal the set of addresses
 * the peer uses for the resulting association.  If the caller wants to
 * find out the set of peer addresses, it must use sctp_getpaddrs() to
 * retrieve them after the association has been set up.
 *
 * Basically does nothing but copy the addresses from user to kernel
 * land and invoke sctp_connectx(). This is used for tunneling
 * the sctp_connectx() request through sctp_setsockopt() from userspace.
 *
 * On exit there is no need to do sockfd_put(), sys_setsockopt() does
 * it.
 *
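 * sk         The sk of the socket
 * kaddrs     The pointer to the addresses (already in kernel memory)
 * addrs_size Size of the kaddrs buffer in bytes
 * assoc_id   Where to store the new association id; may be NULL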
 *
 * Returns >=0 if ok, <0 errno code on error.
 */
static int __sctp_setsockopt_connectx(struct sock *sk, struct sockaddr *kaddrs,
                                      int addrs_size, sctp_assoc_t *assoc_id)
{
        int flags;
        int rc;

        pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n",
                 __func__, sk, kaddrs, addrs_size);

        /* The first address's sa_family is read later on, so it must fit. */
        if (unlikely(addrs_size < sizeof(sa_family_t)))
                return -EINVAL;

        /* Give the security module a chance to veto the address list. */
        rc = security_sctp_bind_connect(sk, SCTP_SOCKOPT_CONNECTX, kaddrs,
                                        addrs_size);
        if (rc)
                return rc;

        /* In-kernel sockets that were only set up via sock_create_kern()
         * generally have no file attached; use no file flags for them.
         */
        flags = 0;
        if (sk->sk_socket->file)
                flags = sk->sk_socket->file->f_flags;

        return __sctp_connect(sk, kaddrs, addrs_size, flags, assoc_id);
}

/*
 * Legacy connectx socket option, retained for backward compatibility.
 * This variant does not provide an association id, so none is requested
 * from the common helper; the helper's result is returned unchanged.
 */
static int sctp_setsockopt_connectx_old(struct sock *sk,
                                        struct sockaddr *kaddrs,
                                        int addrs_size)
{
        int rc = __sctp_setsockopt_connectx(sk, kaddrs, addrs_size, NULL);

        return rc;
}

/*
 * Newer interface for the API.  Since the API is implemented with a
 * socket option, the association id is fed back as the return value of
 * the call to keep things simple.  Error is always negative and
 * association id is always positive.
 */
static int sctp_setsockopt_connectx(struct sock *sk,
                                    struct sockaddr *kaddrs,
                                    int addrs_size)
{
        sctp_assoc_t new_id = 0;
        int rc;

        rc = __sctp_setsockopt_connectx(sk, kaddrs, addrs_size, &new_id);
        if (!rc)
                return new_id;

        return rc;
}

/*
 * New (hopefully final) interface for the API.
 * We use the sctp_getaddrs_old structure so that user-space library
 * can avoid any unnecessary allocations. The only different part
 * is that we store the actual length of the address buffer into the
 * addrs_num structure member. That way we can re-use the existing
 * code.
 */
#ifdef CONFIG_COMPAT
/* Request layout read from user space when the caller is in a compat
 * syscall; sctp_getsockopt_connectx3() converts it field by field into a
 * struct sctp_getaddrs_old, turning addrs into a pointer via compat_ptr().
 */
struct compat_sctp_getaddrs_old {
        sctp_assoc_t        assoc_id;
        s32                addr_num;
        compat_uptr_t        addrs;                /* struct sockaddr * */
};
#endif

/* getsockopt()-based variant of sctp_connectx().
 *
 * Reads a struct sctp_getaddrs_old (or its compat layout) from optval;
 * its addr_num member is used as the byte length of the user buffer that
 * addrs points to.  The addresses are duplicated into kernel memory and
 * passed to __sctp_setsockopt_connectx().  If that returns 0 or
 * -EINPROGRESS, the association id is written to optval and its size to
 * optlen.
 *
 * Returns the helper's result, -EINVAL if len is too small, -EFAULT on a
 * failed user copy, or the error from memdup_user().
 */
static int sctp_getsockopt_connectx3(struct sock *sk, int len,
                                     char __user *optval,
                                     int __user *optlen)
{
        struct sctp_getaddrs_old param;
        sctp_assoc_t assoc_id = 0;
        struct sockaddr *kaddrs;
        int rc;

#ifdef CONFIG_COMPAT
        if (in_compat_syscall()) {
                struct compat_sctp_getaddrs_old compat;

                if (len < sizeof(compat))
                        return -EINVAL;
                if (copy_from_user(&compat, optval, sizeof(compat)))
                        return -EFAULT;

                param.assoc_id = compat.assoc_id;
                param.addr_num = compat.addr_num;
                param.addrs = compat_ptr(compat.addrs);
        } else
#endif
        {
                if (len < sizeof(param))
                        return -EINVAL;
                if (copy_from_user(&param, optval, sizeof(param)))
                        return -EFAULT;
        }

        kaddrs = memdup_user(param.addrs, param.addr_num);
        if (IS_ERR(kaddrs))
                return PTR_ERR(kaddrs);

        rc = __sctp_setsockopt_connectx(sk, kaddrs, param.addr_num, &assoc_id);
        kfree(kaddrs);

        /* Only a started (or completed) connect reports the id back. */
        if (rc != 0 && rc != -EINPROGRESS)
                return rc;

        if (copy_to_user(optval, &assoc_id, sizeof(assoc_id)))
                return -EFAULT;
        if (put_user(sizeof(assoc_id), optlen))
                return -EFAULT;

        return rc;
}

/* API 3.1.4 close() - UDP Style Syntax
 * Applications use close() to perform graceful shutdown (as described in
 * Section 10.1 of [SCTP]) on ALL the associations currently represented
 * by a UDP-style socket.
 *
 * The syntax is
 *
 *   ret = close(int sd);
 *
 *   sd      - the socket descriptor of the associations to be closed.
 *
 * To gracefully shutdown a specific association represented by the
 * UDP-style socket, an application should use the sendmsg() call,
 * passing no user data, but including the appropriate flag in the
 * ancillary data (see Section xxxx).
 *
 * If sd in the close() call is a branched-off socket representing only
 * one association, the shutdown is performed on that association only.
 *
 * 4.1.6 close() - TCP Style Syntax
 *
 * Applications use close() to gracefully close down an association.
 *
 * The syntax is:
 *
 *    int close(int sd);
 *
 *      sd      - the socket descriptor of the association to be closed.
 *
 * After an application calls close() on a socket descriptor, no further
 * socket operations will succeed on that descriptor.
 *
 * API 7.1.4 SO_LINGER
 *
 * An application using the TCP-style socket can use this option to
 * perform the SCTP ABORT primitive.  The linger option structure is:
 *
 *  struct  linger {
 *     int     l_onoff;                // option on/off
 *     int     l_linger;               // linger time
 * };
 *
 * To enable the option, set l_onoff to 1.  If the l_linger value is set
 * to 0, calling close() is the same as the ABORT primitive.  If the
 * value is set to a negative value, the setsockopt() call will return
 * an error.  If the value is set to a positive value linger_time, the
 * close() can be blocked for at most linger_time ms.  If the graceful
 * shutdown phase does not finish during this period, close() will
 * return but the graceful shutdown phase continues in the system.
 */
/* Implementation summary: with the socket lock held, mark the socket as
 * fully shut down and CLOSING, purge queued receive events, then ABORT or
 * SHUTDOWN every association on the endpoint.  After optionally waiting
 * (TCP style, non-zero timeout) the socket lock is dropped and the socket
 * is released under addr_wq_lock plus the bottom-half socket lock.
 */
static void sctp_close(struct sock *sk, long timeout)
{
        struct net *net = sock_net(sk);
        struct sctp_endpoint *ep;
        struct sctp_association *asoc;
        struct list_head *pos, *temp;
        unsigned int data_was_unread;

        pr_debug("%s: sk:%p, timeout:%ld\n", __func__, sk, timeout);

        lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
        sk->sk_shutdown = SHUTDOWN_MASK;
        inet_sk_set_state(sk, SCTP_SS_CLOSING);

        ep = sctp_sk(sk)->ep;

        /* Clean up any skbs sitting on the receive queue.  */
        /* A non-zero total from the two purges forces ABORT below. */
        data_was_unread = sctp_queue_purge_ulpevents(&sk->sk_receive_queue);
        data_was_unread += sctp_queue_purge_ulpevents(&sctp_sk(sk)->pd_lobby);

        /* Walk all associations on an endpoint.  */
        /* The _safe iterator is needed: entries may be freed in the loop. */
        list_for_each_safe(pos, temp, &ep->asocs) {
                asoc = list_entry(pos, struct sctp_association, asocs);

                if (sctp_style(sk, TCP)) {
                        /* A closed association can still be in the list if
                         * it belongs to a TCP-style listening socket that is
                         * not yet accepted. If so, free it. If not, send an
                         * ABORT or SHUTDOWN based on the linger options.
                         */
                        if (sctp_state(asoc, CLOSED)) {
                                sctp_association_free(asoc);
                                continue;
                        }
                }

                /* ABORT when data was left unread, when any of the
                 * association's ulpq queues still holds skbs, or when
                 * SO_LINGER is on with a zero linger time; otherwise do a
                 * graceful SHUTDOWN.
                 */
                if (data_was_unread || !skb_queue_empty(&asoc->ulpq.lobby) ||
                    !skb_queue_empty(&asoc->ulpq.reasm) ||
                    !skb_queue_empty(&asoc->ulpq.reasm_uo) ||
                    (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) {
                        struct sctp_chunk *chunk;

                        /* NOTE(review): chunk is passed on without a NULL
                         * check - presumably the ABORT primitive tolerates
                         * a NULL chunk; confirm.
                         */
                        chunk = sctp_make_abort_user(asoc, NULL, 0);
                        sctp_primitive_ABORT(net, asoc, chunk);
                } else
                        sctp_primitive_SHUTDOWN(net, asoc, NULL);
        }

        /* On a TCP-style socket, block for at most linger_time if set. */
        if (sctp_style(sk, TCP) && timeout)
                sctp_wait_for_close(sk, timeout);

        /* This will run the backlog queue.  */
        release_sock(sk);

        /* Supposedly, no process has access to the socket, but
         * the net layers still may.
         * Also, sctp_destroy_sock() needs to be called with addr_wq_lock
         * held and that should be grabbed before socket lock.
         */
        spin_lock_bh(&net->sctp.addr_wq_lock);
        bh_lock_sock_nested(sk);

        /* Hold the sock, since sk_common_release() will put sock_put()
         * and we have just a little more cleanup.
         */
        sock_hold(sk);
        sk_common_release(sk);

        bh_unlock_sock(sk);
        spin_unlock_bh(&net->sctp.addr_wq_lock);

        /* Drop the extra reference taken by sock_hold() above. */
        sock_put(sk);

        SCTP_DBG_OBJCNT_DEC(sock);
}

/* Post-process a send error code.
 *
 * Anything other than -EPIPE is returned untouched.  For -EPIPE, a
 * pending socket error reported by sock_error() takes precedence; if the
 * result is still -EPIPE and MSG_NOSIGNAL is not set in @flags, SIGPIPE
 * is sent to the current task.
 */
static int sctp_error(struct sock *sk, int flags, int err)
{
        int pending;

        if (err != -EPIPE)
                return err;

        pending = sock_error(sk);
        if (pending)
                err = pending;

        if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
                send_sig(SIGPIPE, current, 0);

        return err;
}

/* API 3.1.3 sendmsg() - UDP Style Syntax
 *
 * An application uses sendmsg() and recvmsg() calls to transmit data to
 * and receive data from its peer.
 *
 *  ssize_t sendmsg(int socket, const struct msghdr *message,
 *                  int flags);
 *
 *  socket  - the socket descriptor of the endpoint.
 *  message - pointer to the msghdr structure which contains a single
 *            user message and possibly some ancillary data.
 *
 *            See Section 5 for complete description of the data
 *            structures.
 *
 *  flags   - flags sent or received with the user message, see Section
 *            5 for complete description of the flags.
 *
 * Note:  This function could use a rewrite especially when explicit
 * connect support comes in.
 */
/* BUG:  We do not implement the equivalent of sk_stream_wait_memory(). */

/* Forward declaration: sctp_sendmsg_parse() below calls this helper on a
 * zeroed *cmsgs before reading its members, so it presumably fills them
 * in from the message's ancillary data.  The definition is not in this
 * part of the file.
 */
static int sctp_msghdr_parse(const struct msghdr *msg,
                             struct sctp_cmsgs *cmsgs);

/* Gather the send parameters for one sendmsg() call.
 *
 * *cmsgs is cleared and filled by sctp_msghdr_parse(); *srinfo is cleared
 * and then populated from the parsed srinfo, sinfo and prinfo entries, in
 * that order, so later entries override earlier ones.
 *
 * Returns 0 if the request is acceptable, otherwise:
 *   -EPIPE    TCP-style socket in the LISTENING state
 *   -EMSGSIZE msg_len exceeds sk->sk_sndbuf
 *   -EINVAL   inconsistent flags / length / destination combination
 * or the error from sctp_msghdr_parse().
 */
static int sctp_sendmsg_parse(struct sock *sk, struct sctp_cmsgs *cmsgs,
                              struct sctp_sndrcvinfo *srinfo,
                              const struct msghdr *msg, size_t msg_len)
{
        bool teardown;
        __u16 sflags;
        int rc;

        if (sctp_sstate(sk, LISTENING) && sctp_style(sk, TCP))
                return -EPIPE;

        if (msg_len > sk->sk_sndbuf)
                return -EMSGSIZE;

        memset(cmsgs, 0, sizeof(*cmsgs));
        rc = sctp_msghdr_parse(msg, cmsgs);
        if (rc) {
                pr_debug("%s: msghdr parse err:%x\n", __func__, rc);
                return rc;
        }

        memset(srinfo, 0, sizeof(*srinfo));

        /* SCTP_SNDRCV-style info: every field is taken over. */
        if (cmsgs->srinfo) {
                srinfo->sinfo_assoc_id = cmsgs->srinfo->sinfo_assoc_id;
                srinfo->sinfo_stream = cmsgs->srinfo->sinfo_stream;
                srinfo->sinfo_flags = cmsgs->srinfo->sinfo_flags;
                srinfo->sinfo_ppid = cmsgs->srinfo->sinfo_ppid;
                srinfo->sinfo_context = cmsgs->srinfo->sinfo_context;
                srinfo->sinfo_timetolive = cmsgs->srinfo->sinfo_timetolive;
        }

        /* sinfo entry: same fields except the time-to-live. */
        if (cmsgs->sinfo) {
                srinfo->sinfo_assoc_id = cmsgs->sinfo->snd_assoc_id;
                srinfo->sinfo_stream = cmsgs->sinfo->snd_sid;
                srinfo->sinfo_flags = cmsgs->sinfo->snd_flags;
                srinfo->sinfo_ppid = cmsgs->sinfo->snd_ppid;
                srinfo->sinfo_context = cmsgs->sinfo->snd_context;
        }

        /* prinfo entry: lifetime value plus policy bits in the flags. */
        if (cmsgs->prinfo) {
                srinfo->sinfo_timetolive = cmsgs->prinfo->pr_value;
                SCTP_PR_SET_POLICY(srinfo->sinfo_flags,
                                   cmsgs->prinfo->pr_policy);
        }

        sflags = srinfo->sinfo_flags;

        /* Plain data with no flags needs no further validation. */
        if (!sflags && msg_len)
                return 0;

        teardown = (sflags & (SCTP_EOF | SCTP_ABORT)) != 0;

        /* EOF/ABORT via sendmsg() is rejected on TCP-style sockets. */
        if (sctp_style(sk, TCP) && teardown)
                return -EINVAL;

        /* EOF must not carry data ... */
        if ((sflags & SCTP_EOF) && msg_len > 0)
                return -EINVAL;

        /* ... and an empty message is only valid with EOF or ABORT. */
        if (!teardown && msg_len == 0)
                return -EINVAL;

        /* Overriding the address requires one to be supplied. */
        if ((sflags & SCTP_ADDR_OVER) && !msg->msg_name)
                return -EINVAL;

        return 0;
}

/* Create the association for a sendmsg() to a not yet known peer.
 *
 * *tp is reset to NULL first and, on success, points at the transport
 * set up for @daddr by sctp_connect_new_asoc().  If the control data
 * carried destination addresses (cmsgs->addrs_msg), each SCTP_DSTADDRV4 /
 * SCTP_DSTADDRV6 entry is added to the association as another peer using
 * the association's peer port.
 *
 * Returns 0 on success or a negative errno; if adding an extra address
 * fails, the new association is freed again.
 */
static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
                                 struct sctp_cmsgs *cmsgs,
                                 union sctp_addr *daddr,
                                 struct sctp_transport **tp)
{
        struct sctp_endpoint *ep = sctp_sk(sk)->ep;
        struct sctp_association *asoc;
        struct cmsghdr *cmsg;
        __be32 flowinfo = 0;
        struct sctp_af *af;
        int err;

        *tp = NULL;

        if (sflags & (SCTP_EOF | SCTP_ABORT))
                return -EINVAL;

        if (sctp_style(sk, TCP) &&
            (sctp_sstate(sk, ESTABLISHED) || sctp_sstate(sk, CLOSING)))
                return -EADDRNOTAVAIL;

        /* Label connection socket for first association 1-to-many
         * style for client sequence socket()->sendmsg(). This
         * needs to be done before sctp_assoc_add_peer() as that will
         * set up the initial packet that needs to account for any
         * security ip options (CIPSO/CALIPSO) added to the packet.
         */
        af = sctp_get_af_specific(daddr->sa.sa_family);
        if (!af)
                return -EINVAL;

        err = security_sctp_bind_connect(sk, SCTP_SENDMSG_CONNECT,
                                         (struct sockaddr *)daddr,
                                         af->sockaddr_len);
        if (err < 0)
                return err;

        err = sctp_connect_new_asoc(ep, daddr, cmsgs->init, tp);
        if (err)
                return err;
        asoc = (*tp)->asoc;

        /* Nothing more to do without a destination address list. */
        if (!cmsgs->addrs_msg)
                return 0;

        /* IPv6 entries from the list reuse the primary flow info. */
        if (daddr->sa.sa_family == AF_INET6)
                flowinfo = daddr->v6.sin6_flowinfo;

        /* sendv addr list parse */
        for_each_cmsghdr(cmsg, cmsgs->addrs_msg) {
                union sctp_addr extra;
                int dlen;

                if (cmsg->cmsg_level != IPPROTO_SCTP)
                        continue;
                if (cmsg->cmsg_type != SCTP_DSTADDRV4 &&
                    cmsg->cmsg_type != SCTP_DSTADDRV6)
                        continue;

                memset(&extra, 0, sizeof(extra));
                dlen = cmsg->cmsg_len - sizeof(struct cmsghdr);

                if (cmsg->cmsg_type == SCTP_DSTADDRV4) {
                        if (dlen < sizeof(struct in_addr)) {
                                err = -EINVAL;
                                goto free;
                        }

                        extra.v4.sin_family = AF_INET;
                        extra.v4.sin_port = htons(asoc->peer.port);
                        memcpy(&extra.v4.sin_addr, CMSG_DATA(cmsg),
                               sizeof(struct in_addr));
                } else {
                        if (dlen < sizeof(struct in6_addr)) {
                                err = -EINVAL;
                                goto free;
                        }

                        extra.v6.sin6_flowinfo = flowinfo;
                        extra.v6.sin6_family = AF_INET6;
                        extra.v6.sin6_port = htons(asoc->peer.port);
                        memcpy(&extra.v6.sin6_addr, CMSG_DATA(cmsg),
                               sizeof(struct in6_addr));
                }

                err = sctp_connect_add_peer(asoc, &extra, sizeof(extra));
                if (err)
                        goto free;
        }

        return 0;

free:
        sctp_association_free(asoc);
        return err;
}

/* Act on the control flags of a sendmsg() aimed at an existing
 * association.
 *
 * Returns:
 *   1        nothing special requested, the caller should send the data
 *   0        request fully handled here (SHUTDOWN or ABORT issued), or
 *            the association is skipped for SCTP_SENDALL on a UDP-style
 *            socket because it is not ESTABLISHED
 *   -EPIPE   CLOSED association on a TCP-style socket
 *   -ENOMEM  the ABORT chunk could not be built
 */
static int sctp_sendmsg_check_sflags(struct sctp_association *asoc,
                                     __u16 sflags, struct msghdr *msg,
                                     size_t msg_len)
{
        struct sock *sk = asoc->base.sk;
        struct net *net = sock_net(sk);
        struct sctp_chunk *abort_chunk;

        if (sctp_state(asoc, CLOSED) && sctp_style(sk, TCP))
                return -EPIPE;

        if ((sflags & SCTP_SENDALL) && sctp_style(sk, UDP) &&
            !sctp_state(asoc, ESTABLISHED))
                return 0;

        if (sflags & SCTP_EOF) {
                pr_debug("%s: shutting down association:%p\n", __func__, asoc);
                sctp_primitive_SHUTDOWN(net, asoc, NULL);

                return 0;
        }

        if (!(sflags & SCTP_ABORT))
                return 1;

        abort_chunk = sctp_make_abort_user(asoc, msg, msg_len);
        if (!abort_chunk)
                return -ENOMEM;

        pr_debug("%s: aborting association:%p\n", __func__, asoc);
        sctp_primitive_ABORT(net, asoc, abort_chunk);

        /* Rewind the iterator by the message length after the ABORT. */
        iov_iter_revert(&msg->msg_iter, msg_len);

        return 0;
}

/* Queue one user message on an association.
 *
 * asoc      - association to send on
 * msg       - user message; msg->msg_iter supplies the payload and
 *             msg->msg_flags is checked for MSG_DONTWAIT and MSG_MORE
 * msg_len   - payload length in bytes
 * transport - stored in every chunk's ->transport; callers in this file
 *             also pass NULL
 * sinfo     - send parameters; sinfo_stream selects the output stream
 *
 * Returns msg_len on success or a negative errno.
 */
static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
                                struct msghdr *msg, size_t msg_len,
                                struct sctp_transport *transport,
                                struct sctp_sndrcvinfo *sinfo)
{
        struct sock *sk = asoc->base.sk;
        struct sctp_sock *sp = sctp_sk(sk);
        struct net *net = sock_net(sk);
        struct sctp_datamsg *datamsg;
        bool wait_connect = false;
        struct sctp_chunk *chunk;
        long timeo;
        int err;

        /* The requested stream must exist on this association. */
        if (sinfo->sinfo_stream >= asoc->stream.outcnt) {
                err = -EINVAL;
                goto err;
        }

        /* Set up the stream's ->ext on first use. */
        if (unlikely(!SCTP_SO(&asoc->stream, sinfo->sinfo_stream)->ext)) {
                err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream);
                if (err)
                        goto err;
        }

        /* With fragmentation disabled the message must fit in one fragment. */
        if (sp->disable_fragments && msg_len > asoc->frag_point) {
                err = -EMSGSIZE;
                goto err;
        }

        /* A pending PMTU change is applied only if PMTU discovery is
         * enabled on the socket; the pending flag is cleared either way.
         */
        if (asoc->pmtu_pending) {
                if (sp->param_flags & SPP_PMTUD_ENABLE)
                        sctp_assoc_sync_pmtu(asoc);
                asoc->pmtu_pending = 0;
        }

        /* Not enough write space: ask sctp_prsctp_prune() for the shortfall. */
        if (sctp_wspace(asoc) < (int)msg_len)
                sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));

        /* Still no room, or the memory cannot be scheduled: wait for send
         * buffer space, honouring MSG_DONTWAIT.
         */
        if (sctp_wspace(asoc) <= 0 || !sk_wmem_schedule(sk, msg_len)) {
                timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
                err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
                if (err)
                        goto err;
                /* Re-validate the stream number after the wait; presumably
                 * the number of output streams can change meanwhile.
                 */
                if (unlikely(sinfo->sinfo_stream >= asoc->stream.outcnt)) {
                        err = -EINVAL;
                        goto err;
                }
        }

        /* A CLOSED association is started implicitly by the first send. */
        if (sctp_state(asoc, CLOSED)) {
                err = sctp_primitive_ASSOCIATE(net, asoc, NULL);
                if (err)
                        goto err;

                /* With intl_enable set on the endpoint the connect is
                 * awaited before any data is built (failure is reported
                 * as -ESRCH); otherwise the wait is deferred until after
                 * the data has been handed to the SEND primitive.
                 */
                if (asoc->ep->intl_enable) {
                        timeo = sock_sndtimeo(sk, 0);
                        err = sctp_wait_for_connect(asoc, &timeo);
                        if (err) {
                                err = -ESRCH;
                                goto err;
                        }
                } else {
                        wait_connect = true;
                }

                pr_debug("%s: we associated primitively\n", __func__);
        }

        /* Turn the user payload into a datamsg with its list of chunks. */
        datamsg = sctp_datamsg_from_user(asoc, sinfo, &msg->msg_iter);
        if (IS_ERR(datamsg)) {
                err = PTR_ERR(datamsg);
                goto err;
        }

        asoc->force_delay = !!(msg->msg_flags & MSG_MORE);

        /* Take a reference on each chunk, set its write owner and record
         * the transport passed in by the caller.
         */
        list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
                sctp_chunk_hold(chunk);
                sctp_set_owner_w(chunk);
                chunk->transport = transport;
        }

        err = sctp_primitive_SEND(net, asoc, datamsg);
        if (err) {
                sctp_datamsg_free(datamsg);
                goto err;
        }

        pr_debug("%s: we sent primitively\n", __func__);

        sctp_datamsg_put(datamsg);

        /* Deferred connect wait; its result does not affect the return
         * value of this function.
         */
        if (unlikely(wait_connect)) {
                timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
                sctp_wait_for_connect(asoc, &timeo);
        }

        err = msg_len;

err:
        return err;
}

/* Pick the destination address out of a sendmsg() request.
 *
 * Returns NULL when the socket is of the UDP_HIGH_BANDWIDTH style or the
 * message carries no name.  Otherwise msg->msg_name is verified with
 * sctp_verify_addr(), using a length capped at sizeof(union sctp_addr),
 * and returned; a verification failure is returned as an ERR_PTR().
 * @cmsgs is not used.
 */
static union sctp_addr *sctp_sendmsg_get_daddr(struct sock *sk,
                                               const struct msghdr *msg,
                                               struct sctp_cmsgs *cmsgs)
{
        union sctp_addr *daddr;
        int len, err;

        if (sctp_style(sk, UDP_HIGH_BANDWIDTH) || !msg->msg_name)
                return NULL;

        len = msg->msg_namelen;
        if (len > sizeof(*daddr))
                len = sizeof(*daddr);

        daddr = (union sctp_addr *)msg->msg_name;

        err = sctp_verify_addr(sk, daddr, len);
        if (err)
                return ERR_PTR(err);

        return daddr;
}

static void sctp_sendmsg_update_sinfo(struct sctp_association *asoc,
                                      struct sctp_sndrcvinfo *sinfo,
                                      struct sctp_cmsgs *cmsgs)
{
        if (!cmsgs->srinfo && !cmsgs->sinfo) {
                sinfo->sinfo_stream = asoc->default_stream;
                sinfo->sinfo_ppid = asoc->default_ppid;
                sinfo->sinfo_context = asoc->default_context;
                sinfo->sinfo_assoc_id = sctp_assoc2id(asoc);

                if (!cmsgs->prinfo)
                        sinfo->sinfo_flags = asoc->default_flags;
        }

        if (!cmsgs->srinfo && !cmsgs->prinfo)
                sinfo->sinfo_timetolive = asoc->default_timetolive;

        if (cmsgs->authinfo) {
                /* Reuse sinfo_tsn to indicate that authinfo was set and
                 * sinfo_ssn to save the keyid on tx path.
                 */
                sinfo->sinfo_tsn = 1;
                sinfo->sinfo_ssn = cmsgs->authinfo->auth_keynumber;
        }
}

/* sendmsg() entry point for SCTP sockets.
 *
 * Parses the control data, determines the destination, selects (or
 * creates) the association and hands the message to
 * sctp_sendmsg_to_asoc().  With SCTP_SENDALL on a UDP-style socket the
 * message is sent to every association of the endpoint instead.
 *
 * The final error code is passed through sctp_error() before it is
 * returned.
 */
static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
{
        struct sctp_endpoint *ep = sctp_sk(sk)->ep;
        struct sctp_transport *transport = NULL;
        struct sctp_sndrcvinfo _sinfo, *sinfo;
        struct sctp_association *asoc, *tmp;
        struct sctp_cmsgs cmsgs;
        union sctp_addr *daddr;
        bool new = false;
        __u16 sflags;
        int err;

        /* Parse and get snd_info */
        err = sctp_sendmsg_parse(sk, &cmsgs, &_sinfo, msg, msg_len);
        if (err)
                goto out;

        sinfo  = &_sinfo;
        sflags = sinfo->sinfo_flags;

        /* Get daddr from msg */
        daddr = sctp_sendmsg_get_daddr(sk, msg, &cmsgs);
        if (IS_ERR(daddr)) {
                err = PTR_ERR(daddr);
                goto out;
        }

        lock_sock(sk);

        /* SCTP_SENDALL process */
        if ((sflags & SCTP_SENDALL) && sctp_style(sk, UDP)) {
                list_for_each_entry_safe(asoc, tmp, &ep->asocs, asocs) {
                        /* 0: handled or skipped for this association,
                         * <0: error, >0: go on and send the data.
                         */
                        err = sctp_sendmsg_check_sflags(asoc, sflags, msg,
                                                        msg_len);
                        if (err == 0)
                                continue;
                        if (err < 0)
                                goto out_unlock;

                        sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs);

                        err = sctp_sendmsg_to_asoc(asoc, msg, msg_len,
                                                   NULL, sinfo);
                        if (err < 0)
                                goto out_unlock;

                        /* Rewind the iterator by the bytes just sent so
                         * the next association gets the same payload.
                         */
                        iov_iter_revert(&msg->msg_iter, err);
                }

                goto out_unlock;
        }

        /* Get and check or create asoc */
        if (daddr) {
                asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport);
                if (asoc) {
                        err = sctp_sendmsg_check_sflags(asoc, sflags, msg,
                                                        msg_len);
                        if (err <= 0)
                                goto out_unlock;
                } else {
                        err = sctp_sendmsg_new_asoc(sk, sflags, &cmsgs, daddr,
                                                    &transport);
                        if (err)
                                goto out_unlock;

                        asoc = transport->asoc;
                        new = true;
                }

                /* The looked-up transport is only kept on TCP-style
                 * sockets or when SCTP_ADDR_OVER was requested.
                 */
                if (!sctp_style(sk, TCP) && !(sflags & SCTP_ADDR_OVER))
                        transport = NULL;
        } else {
                /* No address given: locate the association by its id. */
                asoc = sctp_id2assoc(sk, sinfo->sinfo_assoc_id);
                if (!asoc) {
                        err = -EPIPE;
                        goto out_unlock;
                }

                err = sctp_sendmsg_check_sflags(asoc, sflags, msg, msg_len);
                if (err <= 0)
                        goto out_unlock;
        }

        /* Update snd_info with the asoc */
        sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs);

        /* Send msg to the asoc */
        err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, transport, sinfo);
        /* An association created by this call is freed again on failure,
         * except for -ESRCH, which sctp_sendmsg_to_asoc() reports when
         * its early wait for the connect fails.
         * NOTE(review): presumably the association is already gone or
         * owned elsewhere in that case - confirm.
         */
        if (err < 0 && err != -ESRCH && new)
                sctp_association_free(asoc);

out_unlock:
        release_sock(sk);
out:
        return sctp_error(sk, msg->msg_flags, err);
}

/* Extended skb_pull(): strips @len bytes from the front of @skb even when
 * the data continues in the skbs chained on its frag_list.
 *
 * The linear head is consumed first; whatever is still owed is pulled
 * recursively from each fragment in turn, and the parent's len and
 * data_len are reduced by the amount each fragment gave up.
 *
 * Returns 0 when all @len bytes were removed, otherwise the number of
 * bytes that could not be removed because the skb was too short.
 */
static int sctp_skb_pull(struct sk_buff *skb, int len)
{
        int head = skb_headlen(skb);
        struct sk_buff *frag;
        int left, pulled;

        /* Everything requested sits in the linear part. */
        if (len <= head) {
                __skb_pull(skb, len);
                return 0;
        }

        __skb_pull(skb, head);
        len -= head;

        skb_walk_frags(skb, frag) {
                left = sctp_skb_pull(frag, len);
                pulled = len - left;

                skb->len -= pulled;
                skb->data_len -= pulled;

                if (!left)
                        return 0;

                len = left;
        }

        return len;
}

/* API 3.1.3  recvmsg() - UDP Style Syntax
 *
 *  ssize_t recvmsg(int socket, struct msghdr *message,
 *                    int flags);
 *
 *  socket  - the socket descriptor of the endpoint.
 *  message - pointer to the msghdr structure which contains a single
 *            user message and possibly some ancillary data.
 *
 *            See Section 5 for complete description of the data
 *            structures.
 *
 *  flags   - flags sent or received with the user message, see Section
 *            5 for complete description of the flags.
 *
 * Implementation notes:
 *
 *  - MSG_ERRQUEUE requests are handed to inet_recv_error() without taking
 *    the socket lock here.
 *  - On success the return value is the number of bytes copied into 'msg',
 *    i.e. min(skb->len, len).  On failure it is a negative errno:
 *    -ENOTCONN from the state check below, whatever
 *    sctp_skb_recv_datagram() stored in 'err', or the error returned by
 *    skb_copy_datagram_msg().
 *  - When the user buffer is smaller than the message, the unread remainder
 *    is put back at the head of sk_receive_queue (unless MSG_PEEK is set)
 *    and MSG_EOR is cleared in msg->msg_flags.
 */
static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
                        int flags, int *addr_len)
{
        struct sctp_ulpevent *event = NULL;
        struct sctp_sock *sp = sctp_sk(sk);
        struct sk_buff *skb, *head_skb;
        int copied;
        int err = 0;
        int skb_len;

        pr_debug("%s: sk:%p, msghdr:%p, len:%zd, flags:0x%x, addr_len:%p)\n",
                 __func__, sk, msg, len, flags, addr_len);

        if (unlikely(flags & MSG_ERRQUEUE))
                return inet_recv_error(sk, msg, len, addr_len);

        /* Busy-poll before taking the lock, but only if the receive queue
         * currently looks empty.
         */
        if (sk_can_busy_loop(sk) &&
            skb_queue_empty_lockless(&sk->sk_receive_queue))
                sk_busy_loop(sk, flags & MSG_DONTWAIT);

        lock_sock(sk);

        /* A TCP-style socket may only receive while it is in the
         * ESTABLISHED, CLOSING or CLOSED state.
         */
        if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED) &&
            !sctp_sstate(sk, CLOSING) && !sctp_sstate(sk, CLOSED)) {
                err = -ENOTCONN;
                goto out;
        }

        /* On failure 'err' has been filled in by the callee. */
        skb = sctp_skb_recv_datagram(sk, flags, &err);
        if (!skb)
                goto out;

        /* Get the total length of the skb including any skb's in the
         * frag_list.
         */
        skb_len = skb->len;

        /* Never copy more than the caller's buffer can hold. */
        copied = skb_len;
        if (copied > len)
                copied = len;

        err = skb_copy_datagram_msg(skb, 0, msg, copied);

        /* Look up the event before testing 'err': the out_free path needs
         * it to release the skb when MSG_PEEK is not set.
         */
        event = sctp_skb2event(skb);

        if (err)
                goto out_free;

        /* Prefer the chunk's head_skb, when there is one, as the source for
         * the control messages and the peer address.
         */
        if (event->chunk && event->chunk->head_skb)
                head_skb = event->chunk->head_skb;
        else
                head_skb = skb;
        sock_recv_cmsgs(msg, sk, head_skb);
        if (sctp_ulpevent_is_notification(event)) {
                msg->msg_flags |= MSG_NOTIFICATION;
                sp->pf->event_msgname(event, msg->msg_name, addr_len);
        } else {
                sp->pf->skb_msgname(head_skb, msg->msg_name, addr_len);
        }

        /* Check if we allow SCTP_NXTINFO. */
        if (sp->recvnxtinfo)
                sctp_ulpevent_read_nxtinfo(event, msg, sk);
        /* Check if we allow SCTP_RCVINFO. */
        if (sp->recvrcvinfo)
                sctp_ulpevent_read_rcvinfo(event, msg);
        /* Check if we allow SCTP_SNDRCVINFO. */
        if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_DATA_IO_EVENT))
                sctp_ulpevent_read_sndrcvinfo(event, msg);

        /* From here on 'err' carries the success return value. */
        err = copied;

        /* If skb's length exceeds the user's buffer, update the skb and
         * push it back to the receive_queue so that the next call to
         * recvmsg() will return the remaining data. Don't set MSG_EOR.
         */
        if (skb_len > copied) {
                msg->msg_flags &= ~MSG_EOR;
                /* A peek must leave the queued skb untouched; just drop the
                 * extra reference via out_free.
                 */
                if (flags & MSG_PEEK)
                        goto out_free;
                sctp_skb_pull(skb, copied);
                skb_queue_head(&sk->sk_receive_queue, skb);

                /* When only partial message is copied to the user, increase
                 * rwnd by that amount. If all the data in the skb is read,
                 * rwnd is updated when the event is freed.
                 */
                if (!sctp_ulpevent_is_notification(event))
                        sctp_assoc_rwnd_increase(event->asoc, copied);
                /* The skb is back on the queue, so it must not be freed. */
                goto out;
        } else if ((event->msg_flags & MSG_NOTIFICATION) ||
                   (event->msg_flags & MSG_EOR))
                msg->msg_flags |= MSG_EOR;
        else
                msg->msg_flags &= ~MSG_EOR;

out_free:
        if (flags & MSG_PEEK) {
                /* Release the skb reference acquired after peeking the skb in
                 * sctp_skb_recv_datagram().
                 */
                kfree_skb(skb);
        } else {
                /* Free the event which includes releasing the reference to
                 * the owner of the skb, freeing the skb and updating the
                 * rwnd.
                 */
                sctp_ulpevent_free(event);
        }
out:
        release_sock(sk);
        return err;
}

/* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS)
 *
 * This option is a on/off flag.  If enabled no SCTP message
 * fragmentation will be performed.  Instead if a message being sent
 * exceeds the current PMTU size, the message will NOT be sent and
 * instead a error will be indicated to the user.
 */
static int sctp_setsockopt_disable_fragments(struct sock *sk, int *val,
                                             unsigned int optlen)
{
        if (optlen < sizeof(int))
                return -EINVAL;
        sctp_sk(sk)->disable_fragments = (*val == 0) ? 0 : 1;
        return 0;
}

/* Update the socket's event subscriptions from a user supplied byte array.
 *
 * @sn_type: 'optlen' on/off bytes; byte i controls the event type
 *           SCTP_SN_TYPE_BASE + i.
 * @optlen:  number of bytes in @sn_type, at most
 *           sizeof(struct sctp_event_subscribe).
 *
 * The resulting subscription is copied to every association of the
 * endpoint.  Returns 0, -EINVAL for an oversized request, or -ENOMEM when
 * the sender-dry notification cannot be allocated (the subscription itself
 * has already been updated at that point).
 */
static int sctp_setsockopt_events(struct sock *sk, __u8 *sn_type,
                                  unsigned int optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_association *asoc;
        struct sctp_ulpevent *dry;
        int idx;

        if (optlen > sizeof(struct sctp_event_subscribe))
                return -EINVAL;

        for (idx = 0; idx < optlen; idx++)
                sctp_ulpevent_type_set(&sp->subscribe,
                                       SCTP_SN_TYPE_BASE + idx,
                                       sn_type[idx]);

        /* Propagate the new socket-wide setting to existing associations. */
        list_for_each_entry(asoc, &sp->ep->asocs, asocs)
                asoc->subscribe = sp->subscribe;

        /* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT,
         * if there is no data to be sent or retransmit, the stack will
         * immediately send up this notification.
         */
        if (!sctp_ulpevent_type_enabled(sp->subscribe, SCTP_SENDER_DRY_EVENT))
                return 0;

        asoc = sctp_id2assoc(sk, 0);
        if (!asoc || !sctp_outq_is_empty(&asoc->outqueue))
                return 0;

        dry = sctp_ulpevent_make_sender_dry_event(asoc,
                                                  GFP_USER | __GFP_NOWARN);
        if (!dry)
                return -ENOMEM;

        asoc->stream.si->enqueue_event(&asoc->ulpq, dry);

        return 0;
}

/* 7.1.8 Automatic Close of associations (SCTP_AUTOCLOSE)
 *
 * This socket option is applicable to the UDP-style socket only.  When
 * set, associations that stay idle for longer than the given number of
 * seconds are closed automatically.  An association is idle when it has
 * NOT sent or received user data.  The special value '0' means that no
 * association is closed automatically.  The option takes an integer:
 * the idle time in seconds before an association is closed.
 *
 * Values above the per-netns max_autoclose limit are clamped to that
 * limit.  Returns 0, -EOPNOTSUPP on a TCP-style socket, or -EINVAL when
 * optlen is not the size of an int.
 */
static int sctp_setsockopt_autoclose(struct sock *sk, u32 *optval,
                                     unsigned int optlen)
{
        struct sctp_sock *sp;
        struct net *net;

        /* Applicable to UDP-style socket only */
        if (sctp_style(sk, TCP))
                return -EOPNOTSUPP;

        if (optlen != sizeof(int))
                return -EINVAL;

        sp = sctp_sk(sk);
        net = sock_net(sk);

        /* Store first, then clamp to the namespace-wide upper bound. */
        sp->autoclose = *optval;
        if (sp->autoclose > net->sctp.max_autoclose)
                sp->autoclose = net->sctp.max_autoclose;

        return 0;
}

/* 7.1.13 Peer Address Parameters (SCTP_PEER_ADDR_PARAMS)
 *
 * Applications can enable or disable heartbeats for any peer address of
 * an association, modify an address's heartbeat interval, force a
 * heartbeat to be sent immediately, and adjust the address's maximum
 * number of retransmissions sent before an address is considered
 * unreachable.  The following structure is used to access and modify an
 * address's parameters:
 *
 *  struct sctp_paddrparams {
 *     sctp_assoc_t            spp_assoc_id;
 *     struct sockaddr_storage spp_address;
 *     uint32_t                spp_hbinterval;
 *     uint16_t                spp_pathmaxrxt;
 *     uint32_t                spp_pathmtu;
 *     uint32_t                spp_sackdelay;
 *     uint32_t                spp_flags;
 *     uint32_t                spp_ipv6_flowlabel;
 *     uint8_t                 spp_dscp;
 * };
 *
 *   spp_assoc_id    - (one-to-many style socket) This is filled in the
 *                     application, and identifies the association for
 *                     this query.
 *   spp_address     - This specifies which address is of interest.
 *   spp_hbinterval  - This contains the value of the heartbeat interval,
 *                     in milliseconds.  If a  value of zero
 *                     is present in this field then no changes are to
 *                     be made to this parameter.
 *   spp_pathmaxrxt  - This contains the maximum number of
 *                     retransmissions before this address shall be
 *                     considered unreachable. If a  value of zero
 *                     is present in this field then no changes are to
 *                     be made to this parameter.
 *   spp_pathmtu     - When Path MTU discovery is disabled the value
 *                     specified here will be the "fixed" path mtu.
 *                     Note that if the spp_address field is empty
 *                     then all associations on this address will
 *                     have this fixed path mtu set upon them.
 *
 *   spp_sackdelay   - When delayed sack is enabled, this value specifies
 *                     the number of milliseconds that sacks will be delayed
 *                     for. This value will apply to all addresses of an
 *                     association if the spp_address field is empty. Note
 *                     also, that if delayed sack is enabled and this
 *                     value is set to 0, no change is made to the last
 *                     recorded delayed sack timer value.
 *
 *   spp_flags       - These flags are used to control various features
 *                     on an association. The flag field may contain
 *                     zero or more of the following options.
 *
 *                     SPP_HB_ENABLE  - Enable heartbeats on the
 *                     specified address. Note that if the address
 *                     field is empty all addresses for the association
 *                     have heartbeats enabled upon them.
 *
 *                     SPP_HB_DISABLE - Disable heartbeats on the
 *                     specified address. Note that if the address
 *                     field is empty all addresses for the association
 *                     will have their heartbeats disabled. Note also
 *                     that SPP_HB_ENABLE and SPP_HB_DISABLE are
 *                     mutually exclusive, only one of these two should
 *                     be specified. Enabling both fields will have
 *                     undetermined results.
 *
 *                     SPP_HB_DEMAND - Request a user initiated heartbeat
 *                     to be made immediately.
 *
 *                     SPP_HB_TIME_IS_ZERO - Specifies that the time for
 *                     heartbeat delay is to be set to the value of 0
 *                     milliseconds.
 *
 *                     SPP_PMTUD_ENABLE - This field will enable PMTU
 *                     discovery upon the specified address. Note that
 *                     if the address field is empty then all addresses
 *                     on the association are affected.
 *
 *                     SPP_PMTUD_DISABLE - This field will disable PMTU
 *                     discovery upon the specified address. Note that
 *                     if the address field is empty then all addresses
 *                     on the association are affected. Note also that
 *                     SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
 *                     exclusive. Enabling both will have undetermined
 *                     results.
 *
 *                     SPP_SACKDELAY_ENABLE - Setting this flag turns
 *                     on delayed sack. The time specified in spp_sackdelay
 *                     is used to specify the sack delay for this address. Note
 *                     that if spp_address is empty then all addresses will
 *                     enable delayed sack and take on the sack delay
 *                     value specified in spp_sackdelay.
 *                     SPP_SACKDELAY_DISABLE - Setting this flag turns
 *                     off delayed sack. If the spp_address field is blank then
 *                     delayed sack is disabled for the entire association. Note
 *                     also that this field is mutually exclusive to
 *                     SPP_SACKDELAY_ENABLE, setting both will have undefined
 *                     results.
 *
 *                     SPP_IPV6_FLOWLABEL:  Setting this flag enables the
 *                     setting of the IPV6 flow label value.  The value is
 *                     contained in the spp_ipv6_flowlabel field.
 *                     Upon retrieval, this flag will be set to indicate that
 *                     the spp_ipv6_flowlabel field has a valid value returned.
 *                     If a specific destination address is set (in the
 *                     spp_address field), then the value returned is that of
 *                     the address.  If just an association is specified (and
 *                     no address), then the association's default flow label
 *                     is returned.  If neither an association nor a destination
 *                     is specified, then the socket's default flow label is
 *                     returned.  For non-IPv6 sockets, this flag will be left
 *                     cleared.
 *
 *                     SPP_DSCP:  Setting this flag enables the setting of the
 *                     Differentiated Services Code Point (DSCP) value
 *                     associated with either the association or a specific
 *                     address.  The value is obtained in the spp_dscp field.
 *                     Upon retrieval, this flag will be set to indicate that
 *                     the spp_dscp field has a valid value returned.  If a
 *                     specific destination address is set when called (in the
 *                     spp_address field), then that specific destination
 *                     address's DSCP value is returned.  If just an association
 *                     is specified, then the association's default DSCP is
 *                     returned.  If neither an association nor a destination is
 *                     specified, then the socket's default DSCP is returned.
 *
 *   spp_ipv6_flowlabel
 *                   - This field is used in conjunction with the
 *                     SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label.
 *                     The 20 least significant bits are used for the flow
 *                     label.  This setting has precedence over any IPv6-layer
 *                     setting.
 *
 *   spp_dscp        - This field is used in conjunction with the SPP_DSCP flag
 *                     and contains the DSCP.  The 6 most significant bits are
 *                     used for the DSCP.  This setting has precedence over any
 *                     IPv4- or IPv6- layer setting.
 */
/* Apply one SCTP_PEER_ADDR_PARAMS request to a single scope.
 *
 * Each setting is written to exactly one place, chosen with a fixed
 * precedence: @trans when it is non-NULL, otherwise @asoc when it is
 * non-NULL, otherwise the socket-wide defaults in @sp.
 *
 * @params:           the request.  Note that spp_hbinterval is overwritten
 *                    with 0 here when SPP_HB_ENABLE and SPP_HB_TIME_IS_ZERO
 *                    are both set.
 * @trans:            transport to modify, or NULL
 * @asoc:             association to modify, or NULL
 * @sp:               socket whose defaults are modified when both @trans
 *                    and @asoc are NULL
 * @hb_change:        replacement for the SPP_HB bits of param_flags;
 *                    0 leaves them untouched
 * @pmtud_change:     replacement for the SPP_PMTUD bits of param_flags;
 *                    0 leaves them untouched
 * @sackdelay_change: replacement for the SPP_SACKDELAY bits of param_flags;
 *                    0 leaves them untouched
 *
 * Returns 0, or the error from sctp_primitive_REQUESTHEARTBEAT(), in which
 * case nothing else has been applied.
 */
static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
                                       struct sctp_transport   *trans,
                                       struct sctp_association *asoc,
                                       struct sctp_sock        *sp,
                                       int                      hb_change,
                                       int                      pmtud_change,
                                       int                      sackdelay_change)
{
        int error;

        /* An on-demand heartbeat is only issued for a specific transport. */
        if (params->spp_flags & SPP_HB_DEMAND && trans) {
                error = sctp_primitive_REQUESTHEARTBEAT(trans->asoc->base.net,
                                                        trans->asoc, trans);
                if (error)
                        return error;
        }

        /* Note that unless the spp_flag is set to SPP_HB_ENABLE the value of
         * this field is ignored.  Note also that a value of zero indicates
         * the current setting should be left unchanged.
         */
        if (params->spp_flags & SPP_HB_ENABLE) {

                /* Re-zero the interval if the SPP_HB_TIME_IS_ZERO is
                 * set.  This lets us use 0 value when this flag
                 * is set.
                 */
                if (params->spp_flags & SPP_HB_TIME_IS_ZERO)
                        params->spp_hbinterval = 0;

                if (params->spp_hbinterval ||
                    (params->spp_flags & SPP_HB_TIME_IS_ZERO)) {
                        /* Transport and association intervals are stored in
                         * jiffies; the socket default keeps the raw
                         * millisecond value.
                         */
                        if (trans) {
                                trans->hbinterval =
                                    msecs_to_jiffies(params->spp_hbinterval);
                                sctp_transport_reset_hb_timer(trans);
                        } else if (asoc) {
                                asoc->hbinterval =
                                    msecs_to_jiffies(params->spp_hbinterval);
                        } else {
                                sp->hbinterval = params->spp_hbinterval;
                        }
                }
        }

        /* Replace the heartbeat enable/disable bits. */
        if (hb_change) {
                if (trans) {
                        trans->param_flags =
                                (trans->param_flags & ~SPP_HB) | hb_change;
                } else if (asoc) {
                        asoc->param_flags =
                                (asoc->param_flags & ~SPP_HB) | hb_change;
                } else {
                        sp->param_flags =
                                (sp->param_flags & ~SPP_HB) | hb_change;
                }
        }

        /* When Path MTU discovery is disabled the value specified here will
         * be the "fixed" path mtu (i.e. the value of the spp_flags field must
         * include the flag SPP_PMTUD_DISABLE for this field to have any
         * effect).
         */
        if ((params->spp_flags & SPP_PMTUD_DISABLE) && params->spp_pathmtu) {
                if (trans) {
                        trans->pathmtu = params->spp_pathmtu;
                        sctp_assoc_sync_pmtu(asoc);
                } else if (asoc) {
                        sctp_assoc_set_pmtu(asoc, params->spp_pathmtu);
                } else {
                        sp->pathmtu = params->spp_pathmtu;
                }
        }

        /* Replace the PMTU discovery enable/disable bits. */
        if (pmtud_change) {
                if (trans) {
                        /* Must be evaluated before param_flags is rewritten
                         * below: it detects a disabled -> enabled switch.
                         */
                        int update = (trans->param_flags & SPP_PMTUD_DISABLE) &&
                                (params->spp_flags & SPP_PMTUD_ENABLE);
                        trans->param_flags =
                                (trans->param_flags & ~SPP_PMTUD) | pmtud_change;
                        if (update) {
                                sctp_transport_pmtu(trans, sctp_opt2sk(sp));
                                sctp_assoc_sync_pmtu(asoc);
                        }
                        sctp_transport_pl_reset(trans);
                } else if (asoc) {
                        asoc->param_flags =
                                (asoc->param_flags & ~SPP_PMTUD) | pmtud_change;
                } else {
                        sp->param_flags =
                                (sp->param_flags & ~SPP_PMTUD) | pmtud_change;
                }
        }

        /* Note that unless the spp_flag is set to SPP_SACKDELAY_ENABLE the
         * value of this field is ignored.  Note also that a value of zero
         * indicates the current setting should be left unchanged.
         */
        if ((params->spp_flags & SPP_SACKDELAY_ENABLE) && params->spp_sackdelay) {
                /* As with hbinterval: jiffies for transport/association,
                 * raw milliseconds for the socket default.
                 */
                if (trans) {
                        trans->sackdelay =
                                msecs_to_jiffies(params->spp_sackdelay);
                } else if (asoc) {
                        asoc->sackdelay =
                                msecs_to_jiffies(params->spp_sackdelay);
                } else {
                        sp->sackdelay = params->spp_sackdelay;
                }
        }

        /* Replace the delayed-SACK enable/disable bits. */
        if (sackdelay_change) {
                if (trans) {
                        trans->param_flags =
                                (trans->param_flags & ~SPP_SACKDELAY) |
                                sackdelay_change;
                } else if (asoc) {
                        asoc->param_flags =
                                (asoc->param_flags & ~SPP_SACKDELAY) |
                                sackdelay_change;
                } else {
                        sp->param_flags =
                                (sp->param_flags & ~SPP_SACKDELAY) |
                                sackdelay_change;
                }
        }

        /* Note that a value of zero indicates the current setting should be
           left unchanged.
         */
        if (params->spp_pathmaxrxt) {
                if (trans) {
                        trans->pathmaxrxt = params->spp_pathmaxrxt;
                } else if (asoc) {
                        asoc->pathmaxrxt = params->spp_pathmaxrxt;
                } else {
                        sp->pathmaxrxt = params->spp_pathmaxrxt;
                }
        }

        /* The flow label is only stored for IPv6: on an IPv6 transport, on
         * every IPv6 transport of the association plus the association
         * default, or on the socket default of an AF_INET6 socket.  The
         * SET_MASK bit is stored together with the masked value.
         */
        if (params->spp_flags & SPP_IPV6_FLOWLABEL) {
                if (trans) {
                        if (trans->ipaddr.sa.sa_family == AF_INET6) {
                                trans->flowlabel = params->spp_ipv6_flowlabel &
                                                   SCTP_FLOWLABEL_VAL_MASK;
                                trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
                        }
                } else if (asoc) {
                        struct sctp_transport *t;

                        list_for_each_entry(t, &asoc->peer.transport_addr_list,
                                            transports) {
                                if (t->ipaddr.sa.sa_family != AF_INET6)
                                        continue;
                                t->flowlabel = params->spp_ipv6_flowlabel &
                                               SCTP_FLOWLABEL_VAL_MASK;
                                t->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
                        }
                        asoc->flowlabel = params->spp_ipv6_flowlabel &
                                          SCTP_FLOWLABEL_VAL_MASK;
                        asoc->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
                } else if (sctp_opt2sk(sp)->sk_family == AF_INET6) {
                        sp->flowlabel = params->spp_ipv6_flowlabel &
                                        SCTP_FLOWLABEL_VAL_MASK;
                        sp->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
                }
        }

        /* DSCP has no address family restriction; at association scope it
         * is written to every transport as well as the association default.
         */
        if (params->spp_flags & SPP_DSCP) {
                if (trans) {
                        trans->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
                        trans->dscp |= SCTP_DSCP_SET_MASK;
                } else if (asoc) {
                        struct sctp_transport *t;

                        list_for_each_entry(t, &asoc->peer.transport_addr_list,
                                            transports) {
                                t->dscp = params->spp_dscp &
                                          SCTP_DSCP_VAL_MASK;
                                t->dscp |= SCTP_DSCP_SET_MASK;
                        }
                        asoc->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
                        asoc->dscp |= SCTP_DSCP_SET_MASK;
                } else {
                        sp->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
                        sp->dscp |= SCTP_DSCP_SET_MASK;
                }
        }

        return 0;
}

/* Handle setsockopt(SCTP_PEER_ADDR_PARAMS): validate the request, resolve
 * the transport and/or association it refers to, and apply it.
 *
 * Two structure sizes are accepted: the full struct sctp_paddrparams, and
 * the shorter layout that ends before spp_ipv6_flowlabel (in which case the
 * SPP_DSCP and SPP_IPV6_FLOWLABEL flags are rejected).
 *
 * Returns 0 on success, -EINVAL for any invalid argument, or the error
 * reported by sctp_apply_peer_addr_params() for the primary target.
 */
static int sctp_setsockopt_peer_addr_params(struct sock *sk,
                                            struct sctp_paddrparams *params,
                                            unsigned int optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_transport *trans = NULL;
        struct sctp_association *asoc;
        struct sctp_transport *t;
        int hb_sel, pmtud_sel, sack_sel;
        int err;

        if (optlen == ALIGN(offsetof(struct sctp_paddrparams,
                                     spp_ipv6_flowlabel), 4)) {
                if (params->spp_flags & (SPP_DSCP | SPP_IPV6_FLOWLABEL))
                        return -EINVAL;
        } else if (optlen != sizeof(*params)) {
                return -EINVAL;
        }

        /* Each enable/disable pair is mutually exclusive. */
        hb_sel = params->spp_flags & SPP_HB;
        if (hb_sel == SPP_HB)
                return -EINVAL;

        pmtud_sel = params->spp_flags & SPP_PMTUD;
        if (pmtud_sel == SPP_PMTUD)
                return -EINVAL;

        sack_sel = params->spp_flags & SPP_SACKDELAY;
        if (sack_sel == SPP_SACKDELAY)
                return -EINVAL;

        /* Range checks on the numeric fields. */
        if (params->spp_sackdelay > 500)
                return -EINVAL;
        if (params->spp_pathmtu &&
            params->spp_pathmtu < SCTP_DEFAULT_MINSEGMENT)
                return -EINVAL;

        /* If an address other than INADDR_ANY is specified, and
         * no transport is found, then the request is invalid.
         */
        if (!sctp_is_any(sk, (union sctp_addr *)&params->spp_address)) {
                trans = sctp_addr_id2transport(sk, &params->spp_address,
                                               params->spp_assoc_id);
                if (!trans)
                        return -EINVAL;
        }

        /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the
         * socket is a one to many style socket, and an association
         * was not found, then the id was invalid.
         */
        asoc = sctp_id2assoc(sk, params->spp_assoc_id);
        if (!asoc && params->spp_assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        /* Heartbeat demand can only be sent on a transport or
         * association, but not a socket.
         */
        if (params->spp_flags & SPP_HB_DEMAND && !trans && !asoc)
                return -EINVAL;

        /* Apply to the primary target first. */
        err = sctp_apply_peer_addr_params(params, trans, asoc, sp,
                                          hb_sel, pmtud_sel, sack_sel);
        if (err)
                return err;

        /* Only an association-wide request fans out any further. */
        if (trans || !asoc)
                return 0;

        /* Also apply the parameters to each transport of the association;
         * per-transport results are not checked.
         */
        list_for_each_entry(t, &asoc->peer.transport_addr_list, transports)
                sctp_apply_peer_addr_params(params, t, asoc, sp,
                                            hb_sel, pmtud_sel, sack_sel);

        return 0;
}

/* Return @param_flags with the delayed-SACK bit pair set to "enabled". */
static inline __u32 sctp_spp_sackdelay_enable(__u32 param_flags)
{
        param_flags &= ~SPP_SACKDELAY;
        param_flags |= SPP_SACKDELAY_ENABLE;

        return param_flags;
}

/* Return @param_flags with the delayed-SACK bit pair set to "disabled". */
static inline __u32 sctp_spp_sackdelay_disable(__u32 param_flags)
{
        param_flags &= ~SPP_SACKDELAY;
        param_flags |= SPP_SACKDELAY_DISABLE;

        return param_flags;
}

/* Apply a delayed-SACK request to @asoc and to each of its transports.
 *
 * A non-zero sack_delay sets the delay (converted to jiffies) and enables
 * delayed SACK.  A sack_freq of 1 disables delayed SACK; a sack_freq above
 * 1 sets the frequency and enables it.  Zero in either field leaves the
 * corresponding setting alone.  The frequency rule is evaluated after the
 * delay rule.
 */
static void sctp_apply_asoc_delayed_ack(struct sctp_sack_info *params,
                                        struct sctp_association *asoc)
{
        unsigned int delay_ms = params->sack_delay;
        unsigned int freq = params->sack_freq;
        struct sctp_transport *t;

        /* Association-level values. */
        if (delay_ms) {
                asoc->sackdelay = msecs_to_jiffies(delay_ms);
                asoc->param_flags =
                        sctp_spp_sackdelay_enable(asoc->param_flags);
        }
        if (freq > 1) {
                asoc->sackfreq = freq;
                asoc->param_flags =
                        sctp_spp_sackdelay_enable(asoc->param_flags);
        } else if (freq == 1) {
                asoc->param_flags =
                        sctp_spp_sackdelay_disable(asoc->param_flags);
        }

        /* Same rules for every peer transport. */
        list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) {
                if (delay_ms) {
                        t->sackdelay = msecs_to_jiffies(delay_ms);
                        t->param_flags =
                                sctp_spp_sackdelay_enable(t->param_flags);
                }
                if (freq > 1) {
                        t->sackfreq = freq;
                        t->param_flags =
                                sctp_spp_sackdelay_enable(t->param_flags);
                } else if (freq == 1) {
                        t->param_flags =
                                sctp_spp_sackdelay_disable(t->param_flags);
                }
        }
}

/*
 * 7.1.23.  Get or set delayed ack timer (SCTP_DELAYED_SACK)
 *
 * This option affects the way delayed acks are performed.  This
 * option allows you to get or set the delayed ack time, in
 * milliseconds.  It also allows changing the delayed ack frequency.
 * Changing the frequency to 1 disables the delayed sack algorithm.  If
 * the assoc_id is 0, then this sets or gets the endpoint's default
 * values.  If the assoc_id field is non-zero, then the set or get
 * affects the specified association for the one to many model (the
 * assoc_id field is ignored by the one to one model).  Note that if
 * sack_delay or sack_freq are 0 when setting this option, then the
 * current values will remain unchanged.
 *
 * struct sctp_sack_info {
 *     sctp_assoc_t            sack_assoc_id;
 *     uint32_t                sack_delay;
 *     uint32_t                sack_freq;
 * };
 *
 * sack_assoc_id -  This parameter indicates which association the user
 *    is performing an action upon.  Note that if this field's value is
 *    zero then the endpoint's default value is changed (affecting future
 *    associations only).
 *
 * sack_delay -  This parameter contains the number of milliseconds that
 *    the user is requesting the delayed ACK timer be set to.  Note that
 *    this value is defined in the standard to be between 200 and 500
 *    milliseconds.
 *
 * sack_freq -  This parameter contains the number of packets that must
 *    be received before a sack is sent without waiting for the delay
 *    timer to expire.  The default value for this is 2, setting this
 *    value to 1 will disable the delayed sack algorithm.
 */
static int __sctp_setsockopt_delayed_ack(struct sock *sk,
                                         struct sctp_sack_info *params)
{
        struct sctp_association *asoc;
        struct sctp_sock *sp;

        /* Delay values above 500 (milliseconds, per the option text)
         * are rejected outright.
         */
        if (params->sack_delay > 500)
                return -EINVAL;

        asoc = sctp_id2assoc(sk, params->sack_assoc_id);
        if (asoc) {
                /* The id resolved to an association: update only it. */
                sctp_apply_asoc_delayed_ack(params, asoc);
                return 0;
        }

        /* Nothing was found.  On a one-to-many (UDP-style) socket an id
         * above SCTP_ALL_ASSOC is therefore invalid.
         */
        if (params->sack_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP))
                return -EINVAL;

        /* One-to-one (TCP-style) sockets ignore the supplied id; the
         * request is redirected to the socket defaults.
         */
        if (sctp_style(sk, TCP))
                params->sack_assoc_id = SCTP_FUTURE_ASSOC;

        sp = sctp_sk(sk);

        if (params->sack_assoc_id == SCTP_FUTURE_ASSOC ||
            params->sack_assoc_id == SCTP_ALL_ASSOC) {
                /* Update the socket-level defaults.  A zero delay or
                 * frequency leaves the stored value untouched.
                 */
                if (params->sack_delay) {
                        sp->sackdelay = params->sack_delay;
                        sp->param_flags =
                                sctp_spp_sackdelay_enable(sp->param_flags);
                }

                if (params->sack_freq > 1) {
                        sp->sackfreq = params->sack_freq;
                        sp->param_flags =
                                sctp_spp_sackdelay_enable(sp->param_flags);
                } else if (params->sack_freq == 1) {
                        /* A frequency of 1 switches delayed SACK off. */
                        sp->param_flags =
                                sctp_spp_sackdelay_disable(sp->param_flags);
                }
        }

        if (params->sack_assoc_id != SCTP_CURRENT_ASSOC &&
            params->sack_assoc_id != SCTP_ALL_ASSOC)
                return 0;

        /* Apply the settings to every association on the endpoint. */
        list_for_each_entry(asoc, &sp->ep->asocs, asocs)
                sctp_apply_asoc_delayed_ack(params, asoc);

        return 0;
}

static int sctp_setsockopt_delayed_ack(struct sock *sk,
                                       struct sctp_sack_info *params,
                                       unsigned int optlen)
{
        if (optlen == sizeof(struct sctp_assoc_value)) {
                struct sctp_assoc_value *v = (struct sctp_assoc_value *)params;
                struct sctp_sack_info p;

                pr_warn_ratelimited(DEPRECATED
                                    "%s (pid %d) "
                                    "Use of struct sctp_assoc_value in delayed_ack socket option.\n"
                                    "Use struct sctp_sack_info instead\n",
                                    current->comm, task_pid_nr(current));

                p.sack_assoc_id = v->assoc_id;
                p.sack_delay = v->assoc_value;
                p.sack_freq = v->assoc_value ? 0 : 1;
                return __sctp_setsockopt_delayed_ack(sk, &p);
        }

        if (optlen != sizeof(struct sctp_sack_info))
                return -EINVAL;
        if (params->sack_delay == 0 && params->sack_freq == 0)
                return 0;
        return __sctp_setsockopt_delayed_ack(sk, params);
}

/* 7.1.3 Initialization Parameters (SCTP_INITMSG)
 *
 * Applications can specify protocol parameters for the default association
 * initialization.  The option name argument to setsockopt() and getsockopt()
 * is SCTP_INITMSG.
 *
 * Setting initialization parameters is effective only on an unconnected
 * socket (for UDP-style sockets only future associations are affected
 * by the change).  With TCP-style sockets, this option is inherited by
 * sockets derived from a listener socket.
 */
static int sctp_setsockopt_initmsg(struct sock *sk, struct sctp_initmsg *sinit,
                                   unsigned int optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);

        if (optlen != sizeof(struct sctp_initmsg))
                return -EINVAL;

        if (sinit->sinit_num_ostreams)
                sp->initmsg.sinit_num_ostreams = sinit->sinit_num_ostreams;
        if (sinit->sinit_max_instreams)
                sp->initmsg.sinit_max_instreams = sinit->sinit_max_instreams;
        if (sinit->sinit_max_attempts)
                sp->initmsg.sinit_max_attempts = sinit->sinit_max_attempts;
        if (sinit->sinit_max_init_timeo)
                sp->initmsg.sinit_max_init_timeo = sinit->sinit_max_init_timeo;

        return 0;
}

/*
 * 7.1.14 Set default send parameters (SCTP_DEFAULT_SEND_PARAM)
 *
 *   Applications that wish to use the sendto() system call may wish to
 *   specify a default set of parameters that would normally be supplied
 *   through the inclusion of ancillary data.  This socket option allows
 *   such an application to set the default sctp_sndrcvinfo structure.
 *   The application that wishes to use this socket option simply passes
 *   in to this call the sctp_sndrcvinfo structure defined in Section
 *   5.2.2.  The input parameters accepted by this call include
 *   sinfo_stream, sinfo_flags, sinfo_ppid, sinfo_context,
 *   sinfo_timetolive.  The user must provide the sinfo_assoc_id field in
 *   to this call if the caller is using the UDP model.
 */
static int sctp_setsockopt_default_send_param(struct sock *sk,
                                              struct sctp_sndrcvinfo *info,
                                              unsigned int optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_association *asoc;

        if (optlen != sizeof(*info))
                return -EINVAL;
        if (info->sinfo_flags &
            ~(SCTP_UNORDERED | SCTP_ADDR_OVER |
              SCTP_ABORT | SCTP_EOF))
                return -EINVAL;

        asoc = sctp_id2assoc(sk, info->sinfo_assoc_id);
        if (!asoc && info->sinfo_assoc_id > SCTP_ALL_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        if (asoc) {
                asoc->default_stream = info->sinfo_stream;
                asoc->default_flags = info->sinfo_flags;
                asoc->default_ppid = info->sinfo_ppid;
                asoc->default_context = info->sinfo_context;
                asoc->default_timetolive = info->sinfo_timetolive;

                return 0;
        }

        if (sctp_style(sk, TCP))
                info->sinfo_assoc_id = SCTP_FUTURE_ASSOC;

        if (info->sinfo_assoc_id == SCTP_FUTURE_ASSOC ||
            info->sinfo_assoc_id == SCTP_ALL_ASSOC) {
                sp->default_stream = info->sinfo_stream;
                sp->default_flags = info->sinfo_flags;
                sp->default_ppid = info->sinfo_ppid;
                sp->default_context = info->sinfo_context;
                sp->default_timetolive = info->sinfo_timetolive;
        }

        if (info->sinfo_assoc_id == SCTP_CURRENT_ASSOC ||
            info->sinfo_assoc_id == SCTP_ALL_ASSOC) {
                list_for_each_entry(asoc, &sp->ep->asocs, asocs) {
                        asoc->default_stream = info->sinfo_stream;
                        asoc->default_flags = info->sinfo_flags;
                        asoc->default_ppid = info->sinfo_ppid;
                        asoc->default_context = info->sinfo_context;
                        asoc->default_timetolive = info->sinfo_timetolive;
                }
        }

        return 0;
}

/* RFC6458, Section 8.1.31. Set/get Default Send Parameters
 * (SCTP_DEFAULT_SNDINFO)
 */
static int sctp_setsockopt_default_sndinfo(struct sock *sk,
                                           struct sctp_sndinfo *info,
                                           unsigned int optlen)
{
        struct sctp_association *asoc;
        struct sctp_sock *sp;

        if (optlen != sizeof(*info))
                return -EINVAL;

        /* Any flag outside this set is rejected. */
        if (info->snd_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER |
                                SCTP_ABORT | SCTP_EOF))
                return -EINVAL;

        asoc = sctp_id2assoc(sk, info->snd_assoc_id);
        if (asoc) {
                /* A specific association was named: update only it. */
                asoc->default_stream = info->snd_sid;
                asoc->default_flags = info->snd_flags;
                asoc->default_ppid = info->snd_ppid;
                asoc->default_context = info->snd_context;

                return 0;
        }

        /* No association matched; on a UDP-style socket an id above
         * SCTP_ALL_ASSOC is an error.
         */
        if (info->snd_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP))
                return -EINVAL;

        /* TCP-style sockets always target the socket defaults. */
        if (sctp_style(sk, TCP))
                info->snd_assoc_id = SCTP_FUTURE_ASSOC;

        sp = sctp_sk(sk);

        if (info->snd_assoc_id == SCTP_FUTURE_ASSOC ||
            info->snd_assoc_id == SCTP_ALL_ASSOC) {
                sp->default_stream = info->snd_sid;
                sp->default_flags = info->snd_flags;
                sp->default_ppid = info->snd_ppid;
                sp->default_context = info->snd_context;
        }

        if (info->snd_assoc_id != SCTP_CURRENT_ASSOC &&
            info->snd_assoc_id != SCTP_ALL_ASSOC)
                return 0;

        /* Copy the defaults into every association on the endpoint. */
        list_for_each_entry(asoc, &sp->ep->asocs, asocs) {
                asoc->default_stream = info->snd_sid;
                asoc->default_flags = info->snd_flags;
                asoc->default_ppid = info->snd_ppid;
                asoc->default_context = info->snd_context;
        }

        return 0;
}

/* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR)
 *
 * Requests that the local SCTP stack use the enclosed peer address as
 * the association primary.  The enclosed address must be one of the
 * association peer's addresses.
 */
static int sctp_setsockopt_primary_addr(struct sock *sk, struct sctp_prim *prim,
                                        unsigned int optlen)
{
        struct sctp_transport *trans;
        struct sctp_af *af;
        int err;

        if (optlen != sizeof(struct sctp_prim))
                return -EINVAL;

        /* The per-family ops supply the sockaddr length that the security
         * hook below needs; a NULL lookup result is rejected.
         */
        af = sctp_get_af_specific(prim->ssp_addr.ss_family);
        if (!af)
                return -EINVAL;

        /* Give the security module a chance to veto the address. */
        err = security_sctp_bind_connect(sk, SCTP_PRIMARY_ADDR,
                                         (struct sockaddr *)&prim->ssp_addr,
                                         af->sockaddr_len);
        if (err)
                return err;

        /* Look up the transport for this address/association pair and
         * make it the association's primary.
         */
        trans = sctp_addr_id2transport(sk, &prim->ssp_addr, prim->ssp_assoc_id);
        if (trans) {
                sctp_assoc_set_primary(trans->asoc, trans);
                return 0;
        }

        return -EINVAL;
}

/*
 * 7.1.5 SCTP_NODELAY
 *
 * Turn on/off any Nagle-like algorithm.  This means that packets are
 * generally sent as soon as possible and no unnecessary delays are
 * introduced, at the cost of more packets in the network.  Expects an
 *  integer boolean flag.
 */
static int sctp_setsockopt_nodelay(struct sock *sk, int *val,
                                   unsigned int optlen)
{
        if (optlen >= sizeof(int)) {
                /* Any non-zero value is stored as 1. */
                sctp_sk(sk)->nodelay = !!*val;
                return 0;
        }

        return -EINVAL;
}

/*
 *
 * 7.1.1 SCTP_RTOINFO
 *
 * The protocol parameters used to initialize and bound retransmission
 * timeout (RTO) are tunable. sctp_rtoinfo structure is used to access
 * and modify these parameters.
 * All parameters are time values, in milliseconds.  A value of 0, when
 * modifying the parameters, indicates that the current value should not
 * be changed.
 *
 */
static int sctp_setsockopt_rtoinfo(struct sock *sk,
                                   struct sctp_rtoinfo *rtoinfo,
                                   unsigned int optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        unsigned long rto_min, rto_max;
        struct sctp_association *asoc;

        if (optlen != sizeof (struct sctp_rtoinfo))
                return -EINVAL;

        asoc = sctp_id2assoc(sk, rtoinfo->srto_assoc_id);

        /* On a UDP-style socket a non-zero id must name an association. */
        if (!asoc && rtoinfo->srto_assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        /* Work out the effective bounds.  Start from the values currently
         * stored on the target and override each one the caller supplied
         * as non-zero.  Association bounds go through msecs_to_jiffies();
         * socket bounds are stored exactly as supplied.
         */
        if (asoc) {
                rto_max = asoc->rto_max;
                rto_min = asoc->rto_min;

                if (rtoinfo->srto_max)
                        rto_max = msecs_to_jiffies(rtoinfo->srto_max);
                if (rtoinfo->srto_min)
                        rto_min = msecs_to_jiffies(rtoinfo->srto_min);
        } else {
                rto_max = sp->rtoinfo.srto_max;
                rto_min = sp->rtoinfo.srto_min;

                if (rtoinfo->srto_max)
                        rto_max = rtoinfo->srto_max;
                if (rtoinfo->srto_min)
                        rto_min = rtoinfo->srto_min;
        }

        /* The resulting minimum may not exceed the resulting maximum. */
        if (rto_min > rto_max)
                return -EINVAL;

        if (!asoc) {
                /* No association: the values go to the socket defaults. */
                if (rtoinfo->srto_initial != 0)
                        sp->rtoinfo.srto_initial = rtoinfo->srto_initial;
                sp->rtoinfo.srto_max = rto_max;
                sp->rtoinfo.srto_min = rto_min;

                return 0;
        }

        if (rtoinfo->srto_initial != 0)
                asoc->rto_initial = msecs_to_jiffies(rtoinfo->srto_initial);
        asoc->rto_max = rto_max;
        asoc->rto_min = rto_min;

        return 0;
}

/*
 *
 * 7.1.2 SCTP_ASSOCINFO
 *
 * This option is used to tune the maximum retransmission attempts
 * of the association.
 * Returns an error if the new association retransmission value is
 * greater than the sum of the retransmission value of the peer.
 * See [SCTP] for more information.
 *
 */
static int sctp_setsockopt_associnfo(struct sock *sk,
                                     struct sctp_assocparams *assocparams,
                                     unsigned int optlen)
{
        struct sctp_association *asoc;

        if (optlen != sizeof(struct sctp_assocparams))
                return -EINVAL;

        asoc = sctp_id2assoc(sk, assocparams->sasoc_assoc_id);

        if (!asoc) {
                struct sctp_sock *sp = sctp_sk(sk);

                /* On a UDP-style socket a non-zero id must name an
                 * existing association.
                 */
                if (assocparams->sasoc_assoc_id != SCTP_FUTURE_ASSOC &&
                    sctp_style(sk, UDP))
                        return -EINVAL;

                /* Store the non-zero values as socket defaults. */
                if (assocparams->sasoc_asocmaxrxt != 0)
                        sp->assocparams.sasoc_asocmaxrxt =
                                                assocparams->sasoc_asocmaxrxt;
                if (assocparams->sasoc_cookie_life != 0)
                        sp->assocparams.sasoc_cookie_life =
                                                assocparams->sasoc_cookie_life;

                return 0;
        }

        /* From here on the values apply to the association itself. */
        if (assocparams->sasoc_asocmaxrxt != 0) {
                struct sctp_transport *peer_addr;
                __u32 path_sum = 0;
                int paths = 0;

                /* Count the peer transports and total their per-path
                 * retransmission limits.
                 */
                list_for_each_entry(peer_addr, &asoc->peer.transport_addr_list,
                                    transports) {
                        paths++;
                        path_sum += peer_addr->pathmaxrxt;
                }

                /* The new limit is validated against the path total only
                 * when there is more than one path/transport, because
                 * path retransmissions are only counted in that case.
                 */
                if (paths > 1 && assocparams->sasoc_asocmaxrxt > path_sum)
                        return -EINVAL;

                asoc->max_retrans = assocparams->sasoc_asocmaxrxt;
        }

        if (assocparams->sasoc_cookie_life != 0)
                asoc->cookie_life = ms_to_ktime(assocparams->sasoc_cookie_life);

        return 0;
}

/*
 * 7.1.16 Set/clear IPv4 mapped addresses (SCTP_I_WANT_MAPPED_V4_ADDR)
 *
 * This socket option is a boolean flag which turns on or off mapped V4
 * addresses.  If this option is turned on and the socket is type
 * PF_INET6, then IPv4 addresses will be mapped to V6 representation.
 * If this option is turned off, then no mapping will be done of V4
 * addresses and a user will receive both PF_INET6 and PF_INET type
 * addresses on the socket.
 */
static int sctp_setsockopt_mappedv4(struct sock *sk, int *val,
                                    unsigned int optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);

        if (optlen < sizeof(int))
                return -EINVAL;
        if (*val)
                sp->v4mapped = 1;
        else
                sp->v4mapped = 0;

        return 0;
}

/*
 * 8.1.16.  Get or Set the Maximum Fragmentation Size (SCTP_MAXSEG)
 * This option will get or set the maximum size to put in any outgoing
 * SCTP DATA chunk.  If a message is larger than this size it will be
 * fragmented by SCTP into the specified size.  Note that the underlying
 * SCTP implementation may fragment into smaller sized chunks when the
 * PMTU of the underlying association is smaller than the value set by
 * the user.  The default value for this option is '0' which indicates
 * the user is NOT limiting fragmentation and only the PMTU will affect
 * SCTP's choice of DATA chunk size.  Note also that values set larger
 * than the maximum size of an IP datagram will effectively let SCTP
 * control fragmentation (i.e. the same as setting this option to 0).
 *
 * The following structure is used to access and modify this parameter:
 *
 * struct sctp_assoc_value {
 *   sctp_assoc_t assoc_id;
 *   uint32_t assoc_value;
 * };
 *
 * assoc_id:  This parameter is ignored for one-to-one style sockets.
 *    For one-to-many style sockets this parameter indicates which
 *    association the user is performing an action upon.  Note that if
 *    this field's value is zero then the endpoint's default value is
 *    changed (affecting future associations only).
 * assoc_value:  This parameter specifies the maximum size in bytes.
 */
static int sctp_setsockopt_maxseg(struct sock *sk,
                                  struct sctp_assoc_value *params,
                                  unsigned int optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_association *asoc;
        sctp_assoc_t assoc_id;
        int val;

        if (optlen == sizeof(int)) {
                /* Deprecated form: a bare int that always targets the
                 * socket default.
                 */
                pr_warn_ratelimited(DEPRECATED
                                    "%s (pid %d) "
                                    "Use of int in maxseg socket option.\n"
                                    "Use struct sctp_assoc_value instead\n",
                                    current->comm, task_pid_nr(current));
                val = *(int *)params;
                assoc_id = SCTP_FUTURE_ASSOC;
        } else if (optlen == sizeof(struct sctp_assoc_value)) {
                val = params->assoc_value;
                assoc_id = params->assoc_id;
        } else {
                return -EINVAL;
        }

        asoc = sctp_id2assoc(sk, assoc_id);
        if (!asoc && assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP))
                return -EINVAL;

        /* Zero means the user is not limiting fragmentation, so only a
         * non-zero value is range-checked.
         */
        if (val != 0) {
                int min_len, max_len;
                __u16 datasize;

                /* Use the association's DATA chunk header length when an
                 * association is targeted, else the plain header size.
                 */
                if (asoc)
                        datasize = sctp_datachk_len(&asoc->stream);
                else
                        datasize = sizeof(struct sctp_data_chunk);

                min_len = sctp_min_frag_point(sp, datasize);
                max_len = SCTP_MAX_CHUNK_LEN - datasize;

                if (val < min_len || val > max_len)
                        return -EINVAL;
        }

        if (!asoc) {
                sp->user_frag = val;
                return 0;
        }

        asoc->user_frag = val;
        sctp_assoc_update_frag_point(asoc);

        return 0;
}


/*
 *  7.1.9 Set Peer Primary Address (SCTP_SET_PEER_PRIMARY_ADDR)
 *
 *   Requests that the peer mark the enclosed address as the association
 *   primary. The enclosed address must be one of the association's
 *   locally bound addresses. The following structure is used to make a
 *   set primary request:
 */
static int sctp_setsockopt_peer_primary_addr(struct sock *sk,
                                             struct sctp_setpeerprim *prim,
                                             unsigned int optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_association *asoc;
        struct sctp_chunk *chunk;
        union sctp_addr *addr;
        struct sctp_af *af;
        int err;

        /* The endpoint must have ASCONF enabled. */
        if (!sp->ep->asconf_enable)
                return -EPERM;

        if (optlen != sizeof(struct sctp_setpeerprim))
                return -EINVAL;

        asoc = sctp_id2assoc(sk, prim->sspp_assoc_id);
        if (!asoc)
                return -EINVAL;

        /* The peer must be ASCONF capable and must not have SET_PRIMARY
         * in its disabled mask.
         */
        if (!asoc->peer.asconf_capable ||
            (asoc->peer.addip_disabled_mask & SCTP_PARAM_SET_PRIMARY))
                return -EPERM;

        if (!sctp_state(asoc, ESTABLISHED))
                return -ENOTCONN;

        af = sctp_get_af_specific(prim->sspp_addr.ss_family);
        if (!af)
                return -EINVAL;

        addr = (union sctp_addr *)&prim->sspp_addr;

        /* The address has to pass the family's validity check and be found
         * by the association's local-address lookup.
         */
        if (!af->addr_valid(addr, sp, NULL))
                return -EADDRNOTAVAIL;

        if (!sctp_assoc_lookup_laddr(asoc, addr))
                return -EADDRNOTAVAIL;

        /* Allow security module to validate address. */
        err = security_sctp_bind_connect(sk, SCTP_SET_PEER_PRIMARY_ADDR,
                                         (struct sockaddr *)&prim->sspp_addr,
                                         af->sockaddr_len);
        if (err)
                return err;

        /* Build an ASCONF chunk with the SET_PRIMARY parameter and send it. */
        chunk = sctp_make_asconf_set_prim(asoc, addr);
        if (!chunk)
                return -ENOMEM;

        err = sctp_send_asconf(asoc, chunk);

        pr_debug("%s: we set peer primary addr primitively\n", __func__);

        return err;
}

static int sctp_setsockopt_adaptation_layer(struct sock *sk,
                                            struct sctp_setadaptation *adapt,
                                            unsigned int optlen)
{
        /* Only an exactly-sized sctp_setadaptation is accepted; its
         * indication value is stored on the socket.
         */
        if (optlen == sizeof(struct sctp_setadaptation)) {
                sctp_sk(sk)->adaptation_ind = adapt->ssb_adaptation_ind;
                return 0;
        }

        return -EINVAL;
}

/*
 * 7.1.29.  Set or Get the default context (SCTP_CONTEXT)
 *
 * The context field in the sctp_sndrcvinfo structure is normally only
 * used when a failed message is retrieved holding the value that was
 * sent down on the actual send call.  This option allows the setting of
 * a default context on an association basis that will be received on
 * reading messages from the peer.  This is especially helpful in the
 * one-2-many model for an application to keep some reference to an
 * internal state machine that is processing messages on the
 * association.  Note that the setting of this value only affects
 * received messages from the peer and does not affect the value that is
 * saved with outbound messages.
 */
static int sctp_setsockopt_context(struct sock *sk,
                                   struct sctp_assoc_value *params,
                                   unsigned int optlen)
{
        struct sctp_association *asoc;
        struct sctp_sock *sp;

        if (optlen != sizeof(struct sctp_assoc_value))
                return -EINVAL;

        asoc = sctp_id2assoc(sk, params->assoc_id);
        if (asoc) {
                /* A specific association was named: update only it. */
                asoc->default_rcv_context = params->assoc_value;
                return 0;
        }

        /* No association matched; on a UDP-style socket an id above
         * SCTP_ALL_ASSOC is an error.
         */
        if (params->assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP))
                return -EINVAL;

        /* TCP-style sockets always target the socket default. */
        if (sctp_style(sk, TCP))
                params->assoc_id = SCTP_FUTURE_ASSOC;

        sp = sctp_sk(sk);

        if (params->assoc_id == SCTP_FUTURE_ASSOC ||
            params->assoc_id == SCTP_ALL_ASSOC)
                sp->default_rcv_context = params->assoc_value;

        if (params->assoc_id != SCTP_CURRENT_ASSOC &&
            params->assoc_id != SCTP_ALL_ASSOC)
                return 0;

        /* Apply the context to every association on the endpoint. */
        list_for_each_entry(asoc, &sp->ep->asocs, asocs)
                asoc->default_rcv_context = params->assoc_value;

        return 0;
}

/*
 * 7.1.24.  Get or set fragmented interleave (SCTP_FRAGMENT_INTERLEAVE)
 *
 * This option will at a minimum specify if the implementation is doing
 * fragmented interleave.  Fragmented interleave, for a one to many
 * socket, is when subsequent calls to receive a message may return
 * parts of messages from different associations.  Some implementations
 * may allow you to turn this value on or off.  If so, when turned off,
 * no fragment interleave will occur (which will cause a head of line
 * blocking amongst multiple associations sharing the same one to many
 * socket).  When this option is turned on, then each receive call may
 * come from a different association (thus the user must receive data
 * with the extended calls (e.g. sctp_recvmsg) to keep track of which
 * association each receive belongs to).
 *
 * This option takes a boolean value.  A non-zero value indicates that
 * fragmented interleave is on.  A value of zero indicates that
 * fragmented interleave is off.
 *
 * Note that it is important that an implementation that allows this
 * option to be turned on, have it off by default.  Otherwise an unaware
 * application using the one to many model may become confused and act
 * incorrectly.
 */
static int sctp_setsockopt_fragment_interleave(struct sock *sk, int *val,
                                               unsigned int optlen)
{
        if (optlen != sizeof(int))
                return -EINVAL;

        sctp_sk(sk)->frag_interleave = !!*val;

        if (!sctp_sk(sk)->frag_interleave)
                sctp_sk(sk)->ep->intl_enable = 0;

        return 0;
}

/*
 * 8.1.21.  Set or Get the SCTP Partial Delivery Point
 *       (SCTP_PARTIAL_DELIVERY_POINT)
 *
 * This option will set or get the SCTP partial delivery point.  This
 * point is the size of a message where the partial delivery API will be
 * invoked to help free up rwnd space for the peer.  Setting this to a
 * lower value will cause partial deliveries to happen more often.  The
 * call's argument is an integer that sets or gets the partial delivery
 * point.  Note also that the call will fail if the user attempts to set
 * this value larger than the socket receive buffer size.
 *
 * Note that any single message having a length smaller than or equal to
 * the SCTP partial delivery point will be delivered in one single read
 * call as long as the user provided buffer is large enough to hold the
 * message.
 */
static int sctp_setsockopt_partial_delivery_point(struct sock *sk, u32 *val,
                                                  unsigned int optlen)
{
        u32 limit;

        if (optlen != sizeof(u32))
                return -EINVAL;

        /* The receive buffer is kept at double what the user asked for
         * (and the initial rwnd is based on rcvbuf/2), so the delivery
         * point is capped at half of sk_rcvbuf.
         */
        limit = sk->sk_rcvbuf >> 1;
        if (*val > limit)
                return -EINVAL;

        sctp_sk(sk)->pd_point = *val;

        return 0;
}

/*
 * 7.1.28.  Set or Get the maximum burst (SCTP_MAX_BURST)
 *
 * This option will allow a user to change the maximum burst of packets
 * that can be emitted by this association.  Note that the default value
 * is 4, and some implementations may restrict this setting so that it
 * can only be lowered.
 *
 * NOTE: This text doesn't seem right.  Do this on a socket basis with
 * future associations inheriting the socket value.
 */
static int sctp_setsockopt_maxburst(struct sock *sk,
                                    struct sctp_assoc_value *params,
                                    unsigned int optlen)
{
        struct sctp_association *asoc;
        struct sctp_sock *sp;
        sctp_assoc_t assoc_id;
        u32 assoc_value;

        if (optlen == sizeof(int)) {
                /* Deprecated form: a bare int that always targets the
                 * socket default.
                 */
                pr_warn_ratelimited(DEPRECATED
                                    "%s (pid %d) "
                                    "Use of int in max_burst socket option deprecated.\n"
                                    "Use struct sctp_assoc_value instead\n",
                                    current->comm, task_pid_nr(current));
                assoc_value = *((int *)params);
                assoc_id = SCTP_FUTURE_ASSOC;
        } else if (optlen == sizeof(struct sctp_assoc_value)) {
                assoc_value = params->assoc_value;
                assoc_id = params->assoc_id;
        } else {
                return -EINVAL;
        }

        asoc = sctp_id2assoc(sk, assoc_id);
        if (asoc) {
                /* A specific association was named: update only it. */
                asoc->max_burst = assoc_value;
                return 0;
        }

        /* No association matched; on a UDP-style socket an id above
         * SCTP_ALL_ASSOC is an error.
         */
        if (assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP))
                return -EINVAL;

        /* TCP-style sockets always target the socket default. */
        if (sctp_style(sk, TCP))
                assoc_id = SCTP_FUTURE_ASSOC;

        sp = sctp_sk(sk);

        if (assoc_id == SCTP_FUTURE_ASSOC || assoc_id == SCTP_ALL_ASSOC)
                sp->max_burst = assoc_value;

        if (assoc_id != SCTP_CURRENT_ASSOC && assoc_id != SCTP_ALL_ASSOC)
                return 0;

        /* Apply the burst limit to every association on the endpoint. */
        list_for_each_entry(asoc, &sp->ep->asocs, asocs)
                asoc->max_burst = assoc_value;

        return 0;
}

/*
 * 7.1.18.  Add a chunk that must be authenticated (SCTP_AUTH_CHUNK)
 *
 * This set option adds a chunk type that the user is requesting to be
 * received only in an authenticated way.  Changes to the list of chunks
 * will only affect future associations on the socket.
 */
static int sctp_setsockopt_auth_chunk(struct sock *sk,
                                      struct sctp_authchunk *val,
                                      unsigned int optlen)
{
        struct sctp_endpoint *ep = sctp_sk(sk)->ep;

        /* The option is refused unless AUTH is enabled on the endpoint. */
        if (!ep->auth_enable)
                return -EACCES;

        if (optlen != sizeof(struct sctp_authchunk))
                return -EINVAL;

        /* These four chunk types are never accepted for the list. */
        if (val->sauth_chunk == SCTP_CID_INIT ||
            val->sauth_chunk == SCTP_CID_INIT_ACK ||
            val->sauth_chunk == SCTP_CID_SHUTDOWN_COMPLETE ||
            val->sauth_chunk == SCTP_CID_AUTH)
                return -EINVAL;

        /* Record the chunk id on the endpoint. */
        return sctp_auth_ep_add_chunkid(ep, val->sauth_chunk);
}

/*
 * 7.1.19.  Get or set the list of supported HMAC Identifiers (SCTP_HMAC_IDENT)
 *
 * This option gets or sets the list of HMAC algorithms that the local
 * endpoint requires the peer to use.
 */
static int sctp_setsockopt_hmac_ident(struct sock *sk,
                                      struct sctp_hmacalgo *hmacs,
                                      unsigned int optlen)
{
        struct sctp_endpoint *ep = sctp_sk(sk)->ep;
        u32 idents;

        /* The option is refused unless AUTH is enabled on the endpoint. */
        if (!ep->auth_enable)
                return -EACCES;

        if (optlen < sizeof(struct sctp_hmacalgo))
                return -EINVAL;

        /* Cap the length at the header plus SCTP_AUTH_NUM_HMACS
         * identifiers; anything beyond that is ignored.
         */
        optlen = min_t(unsigned int, optlen, sizeof(struct sctp_hmacalgo) +
                                             SCTP_AUTH_NUM_HMACS * sizeof(u16));

        idents = hmacs->shmac_num_idents;

        /* The count must be non-zero and within the supported maximum. */
        if (idents == 0 || idents > SCTP_AUTH_NUM_HMACS)
                return -EINVAL;

        /* The claimed identifiers must fit in the bytes that follow the
         * header.
         */
        if ((idents * sizeof(u16)) > (optlen - sizeof(struct sctp_hmacalgo)))
                return -EINVAL;

        return sctp_auth_ep_set_hmacs(ep, hmacs);
}

/*
 * 7.1.20.  Set a shared key (SCTP_AUTH_KEY)
 *
 * This option will set a shared secret key which is used to build an
 * association shared key.
 */
static int sctp_setsockopt_auth_key(struct sock *sk,
                                    struct sctp_authkey *authkey,
                                    unsigned int optlen)
{
        struct sctp_endpoint *ep = sctp_sk(sk)->ep;
        struct sctp_association *asoc;
        int ret = -EINVAL;

        /* The buffer must hold the header plus at least one more byte.
         * This early exit is the only path that skips the wipe at 'out'.
         */
        if (optlen <= sizeof(struct sctp_authkey))
                return -EINVAL;
        /* authkey->sca_keylength is u16, so optlen can't be bigger than
         * this.
         */
        optlen = min_t(unsigned int, optlen, USHRT_MAX + sizeof(*authkey));

        /* The declared key length must fit in the bytes after the header. */
        if (authkey->sca_keylength > optlen - sizeof(*authkey))
                goto out;

        /* With no matching association, a UDP-style socket only accepts
         * ids up to SCTP_ALL_ASSOC; anything larger fails with -EINVAL.
         */
        asoc = sctp_id2assoc(sk, authkey->sca_assoc_id);
        if (!asoc && authkey->sca_assoc_id > SCTP_ALL_ASSOC &&
            sctp_style(sk, UDP))
                goto out;

        /* A concrete association was found: set the key on it only. */
        if (asoc) {
                ret = sctp_auth_set_key(ep, asoc, authkey);
                goto out;
        }

        /* TCP-style sockets without an association are treated as a
         * request for future associations.
         */
        if (sctp_style(sk, TCP))
                authkey->sca_assoc_id = SCTP_FUTURE_ASSOC;

        /* asoc is NULL here, so this call targets the endpoint itself.
         * A failure aborts before any current association is touched.
         */
        if (authkey->sca_assoc_id == SCTP_FUTURE_ASSOC ||
            authkey->sca_assoc_id == SCTP_ALL_ASSOC) {
                ret = sctp_auth_set_key(ep, asoc, authkey);
                if (ret)
                        goto out;
        }

        ret = 0;

        /* Apply to every association on the endpoint; keep going after a
         * failure and report the first error seen.
         */
        if (authkey->sca_assoc_id == SCTP_CURRENT_ASSOC ||
            authkey->sca_assoc_id == SCTP_ALL_ASSOC) {
                list_for_each_entry(asoc, &ep->asocs, asocs) {
                        int res = sctp_auth_set_key(ep, asoc, authkey);

                        if (res && !ret)
                                ret = res;
                }
        }

out:
        /* Wipe the (possibly clamped) optlen bytes of the caller's buffer,
         * which contain the key material, on success and failure alike.
         */
        memzero_explicit(authkey, optlen);
        return ret;
}

/*
 * 7.1.21.  Get or set the active shared key (SCTP_AUTH_ACTIVE_KEY)
 *
 * This option will get or set the active shared key to be used to build
 * the association shared key.
 */
static int sctp_setsockopt_active_key(struct sock *sk,
                                      struct sctp_authkeyid *val,
                                      unsigned int optlen)
{
        struct sctp_endpoint *ep = sctp_sk(sk)->ep;
        struct sctp_association *asoc;
        int ret = 0;

        if (optlen != sizeof(struct sctp_authkeyid))
                return -EINVAL;

        asoc = sctp_id2assoc(sk, val->scact_assoc_id);
        if (!asoc && val->scact_assoc_id > SCTP_ALL_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        if (asoc)
                return sctp_auth_set_active_key(ep, asoc, val->scact_keynumber);

        if (sctp_style(sk, TCP))
                val->scact_assoc_id = SCTP_FUTURE_ASSOC;

        if (val->scact_assoc_id == SCTP_FUTURE_ASSOC ||
            val->scact_assoc_id == SCTP_ALL_ASSOC) {
                ret = sctp_auth_set_active_key(ep, asoc, val->scact_keynumber);
                if (ret)
                        return ret;
        }

        if (val->scact_assoc_id == SCTP_CURRENT_ASSOC ||
            val->scact_assoc_id == SCTP_ALL_ASSOC) {
                list_for_each_entry(asoc, &ep->asocs, asocs) {
                        int res = sctp_auth_set_active_key(ep, asoc,
                                                           val->scact_keynumber);

                        if (res && !ret)
                                ret = res;
                }
        }

        return ret;
}

/*
 * 7.1.22.  Delete a shared key (SCTP_AUTH_DELETE_KEY)
 *
 * This set option will delete a shared secret key from use.
 */
static int sctp_setsockopt_del_key(struct sock *sk,
                                   struct sctp_authkeyid *val,
                                   unsigned int optlen)
{
        struct sctp_endpoint *ep = sctp_sk(sk)->ep;
        struct sctp_association *asoc;
        int ret = 0;

        if (optlen != sizeof(struct sctp_authkeyid))
                return -EINVAL;

        asoc = sctp_id2assoc(sk, val->scact_assoc_id);
        if (!asoc && val->scact_assoc_id > SCTP_ALL_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        if (asoc)
                return sctp_auth_del_key_id(ep, asoc, val->scact_keynumber);

        if (sctp_style(sk, TCP))
                val->scact_assoc_id = SCTP_FUTURE_ASSOC;

        if (val->scact_assoc_id == SCTP_FUTURE_ASSOC ||
            val->scact_assoc_id == SCTP_ALL_ASSOC) {
                ret = sctp_auth_del_key_id(ep, asoc, val->scact_keynumber);
                if (ret)
                        return ret;
        }

        if (val->scact_assoc_id == SCTP_CURRENT_ASSOC ||
            val->scact_assoc_id == SCTP_ALL_ASSOC) {
                list_for_each_entry(asoc, &ep->asocs, asocs) {
                        int res = sctp_auth_del_key_id(ep, asoc,
                                                       val->scact_keynumber);

                        if (res && !ret)
                                ret = res;
                }
        }

        return ret;
}

/*
 * 8.3.4  Deactivate a Shared Key (SCTP_AUTH_DEACTIVATE_KEY)
 *
 * This set option will deactivate a shared secret key.
 */
static int sctp_setsockopt_deactivate_key(struct sock *sk,
                                          struct sctp_authkeyid *val,
                                          unsigned int optlen)
{
        struct sctp_endpoint *ep = sctp_sk(sk)->ep;
        struct sctp_association *asoc;
        int ret = 0;

        if (optlen != sizeof(struct sctp_authkeyid))
                return -EINVAL;

        asoc = sctp_id2assoc(sk, val->scact_assoc_id);
        if (!asoc && val->scact_assoc_id > SCTP_ALL_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        if (asoc)
                return sctp_auth_deact_key_id(ep, asoc, val->scact_keynumber);

        if (sctp_style(sk, TCP))
                val->scact_assoc_id = SCTP_FUTURE_ASSOC;

        if (val->scact_assoc_id == SCTP_FUTURE_ASSOC ||
            val->scact_assoc_id == SCTP_ALL_ASSOC) {
                ret = sctp_auth_deact_key_id(ep, asoc, val->scact_keynumber);
                if (ret)
                        return ret;
        }

        if (val->scact_assoc_id == SCTP_CURRENT_ASSOC ||
            val->scact_assoc_id == SCTP_ALL_ASSOC) {
                list_for_each_entry(asoc, &ep->asocs, asocs) {
                        int res = sctp_auth_deact_key_id(ep, asoc,
                                                         val->scact_keynumber);

                        if (res && !ret)
                                ret = res;
                }
        }

        return ret;
}

/*
 * 8.1.23 SCTP_AUTO_ASCONF
 *
 * This option will enable or disable the use of the automatic generation of
 * ASCONF chunks to add and delete addresses to an existing association.  Note
 * that this option has two caveats namely: a) it only affects sockets that
 * are bound to all addresses available to the SCTP stack, and b) the system
 * administrator may have an overriding control that turns the ASCONF feature
 * off no matter what setting the socket option may have.
 * This option expects an integer boolean flag, where a non-zero value turns on
 * the option, and a zero value turns off the option.
 * Note. In this implementation, socket operation overrides default parameter
 * being set by sysctl as well as FreeBSD implementation
 */
static int sctp_setsockopt_auto_asconf(struct sock *sk, int *val,
                                        unsigned int optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);

        if (optlen < sizeof(int))
                return -EINVAL;
        if (!sctp_is_ep_boundall(sk) && *val)
                return -EINVAL;
        if ((*val && sp->do_auto_asconf) || (!*val && !sp->do_auto_asconf))
                return 0;

        spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock);
        if (*val == 0 && sp->do_auto_asconf) {
                list_del(&sp->auto_asconf_list);
                sp->do_auto_asconf = 0;
        } else if (*val && !sp->do_auto_asconf) {
                list_add_tail(&sp->auto_asconf_list,
                    &sock_net(sk)->sctp.auto_asconf_splist);
                sp->do_auto_asconf = 1;
        }
        spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock);
        return 0;
}

/*
 * SCTP_PEER_ADDR_THLDS
 *
 * This option allows us to alter the partially failed threshold for one or all
 * transports in an association.  See Section 6.1 of:
 * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
 */
static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
                                            struct sctp_paddrthlds_v2 *val,
                                            unsigned int optlen, bool v2)
{
        struct sctp_transport *trans;
        struct sctp_association *asoc;
        int len;

        /* The v1 layout (struct sctp_paddrthlds) is the shorter variant;
         * spt_pathcpthld is only read when v2 is set.
         */
        len = v2 ? sizeof(*val) : sizeof(struct sctp_paddrthlds);
        if (optlen < len)
                return -EINVAL;

        /* For v2 the PF threshold may not exceed spt_pathcpthld. */
        if (v2 && val->spt_pathpfthld > val->spt_pathcpthld)
                return -EINVAL;

        /* A specific (non-wildcard) address selects a single transport. */
        if (!sctp_is_any(sk, (const union sctp_addr *)&val->spt_address)) {
                trans = sctp_addr_id2transport(sk, &val->spt_address,
                                               val->spt_assoc_id);
                if (!trans)
                        return -ENOENT;

                /* A zero spt_pathmaxrxt leaves pathmaxrxt untouched;
                 * pf_retrans is always overwritten.
                 */
                if (val->spt_pathmaxrxt)
                        trans->pathmaxrxt = val->spt_pathmaxrxt;
                if (v2)
                        trans->ps_retrans = val->spt_pathcpthld;
                trans->pf_retrans = val->spt_pathpfthld;

                return 0;
        }

        /* Wildcard address: with no association found, a UDP-style socket
         * only accepts SCTP_FUTURE_ASSOC.
         */
        asoc = sctp_id2assoc(sk, val->spt_assoc_id);
        if (!asoc && val->spt_assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        if (asoc) {
                /* Update every peer transport, then the association's own
                 * copies of the values.
                 */
                list_for_each_entry(trans, &asoc->peer.transport_addr_list,
                                    transports) {
                        if (val->spt_pathmaxrxt)
                                trans->pathmaxrxt = val->spt_pathmaxrxt;
                        if (v2)
                                trans->ps_retrans = val->spt_pathcpthld;
                        trans->pf_retrans = val->spt_pathpfthld;
                }

                if (val->spt_pathmaxrxt)
                        asoc->pathmaxrxt = val->spt_pathmaxrxt;
                if (v2)
                        asoc->ps_retrans = val->spt_pathcpthld;
                asoc->pf_retrans = val->spt_pathpfthld;
        } else {
                /* No association: store the values on the socket. */
                struct sctp_sock *sp = sctp_sk(sk);

                if (val->spt_pathmaxrxt)
                        sp->pathmaxrxt = val->spt_pathmaxrxt;
                if (v2)
                        sp->ps_retrans = val->spt_pathcpthld;
                sp->pf_retrans = val->spt_pathpfthld;
        }

        return 0;
}

static int sctp_setsockopt_recvrcvinfo(struct sock *sk, int *val,
                                       unsigned int optlen)
{
        if (optlen < sizeof(int))
                return -EINVAL;

        sctp_sk(sk)->recvrcvinfo = (*val == 0) ? 0 : 1;

        return 0;
}

static int sctp_setsockopt_recvnxtinfo(struct sock *sk, int *val,
                                       unsigned int optlen)
{
        if (optlen < sizeof(int))
                return -EINVAL;

        sctp_sk(sk)->recvnxtinfo = (*val == 0) ? 0 : 1;

        return 0;
}

static int sctp_setsockopt_pr_supported(struct sock *sk,
                                        struct sctp_assoc_value *params,
                                        unsigned int optlen)
{
        if (optlen != sizeof(*params))
                return -EINVAL;

        /* The lookup only validates the id; the flag itself lives on the
         * endpoint.  With no match, a UDP-style socket accepts only
         * SCTP_FUTURE_ASSOC.
         */
        if (!sctp_id2assoc(sk, params->assoc_id) &&
            params->assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        sctp_sk(sk)->ep->prsctp_enable = params->assoc_value ? 1 : 0;

        return 0;
}

static int sctp_setsockopt_default_prinfo(struct sock *sk,
                                          struct sctp_default_prinfo *info,
                                          unsigned int optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_association *asoc;

        if (optlen != sizeof(*info))
                return -EINVAL;

        /* Reject policy bits outside SCTP_PR_SCTP_MASK. */
        if (info->pr_policy & ~SCTP_PR_SCTP_MASK)
                return -EINVAL;

        /* The value is forced to zero when the policy is NONE. */
        if (info->pr_policy == SCTP_PR_SCTP_NONE)
                info->pr_value = 0;

        /* A concrete association is updated on its own. */
        asoc = sctp_id2assoc(sk, info->pr_assoc_id);
        if (asoc) {
                SCTP_PR_SET_POLICY(asoc->default_flags, info->pr_policy);
                asoc->default_timetolive = info->pr_value;
                return 0;
        }

        /* No association matched: a UDP-style socket accepts only ids up
         * to SCTP_ALL_ASSOC from here on.
         */
        if (info->pr_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP))
                return -EINVAL;

        if (sctp_style(sk, TCP))
                info->pr_assoc_id = SCTP_FUTURE_ASSOC;

        /* Socket-level defaults. */
        if (info->pr_assoc_id == SCTP_FUTURE_ASSOC ||
            info->pr_assoc_id == SCTP_ALL_ASSOC) {
                SCTP_PR_SET_POLICY(sp->default_flags, info->pr_policy);
                sp->default_timetolive = info->pr_value;
        }

        /* Every association currently on the endpoint. */
        if (info->pr_assoc_id == SCTP_CURRENT_ASSOC ||
            info->pr_assoc_id == SCTP_ALL_ASSOC) {
                list_for_each_entry(asoc, &sp->ep->asocs, asocs) {
                        SCTP_PR_SET_POLICY(asoc->default_flags,
                                           info->pr_policy);
                        asoc->default_timetolive = info->pr_value;
                }
        }

        return 0;
}

static int sctp_setsockopt_reconfig_supported(struct sock *sk,
                                              struct sctp_assoc_value *params,
                                              unsigned int optlen)
{
        if (optlen != sizeof(*params))
                return -EINVAL;

        /* The lookup only validates the id; the flag itself lives on the
         * endpoint.  With no match, a UDP-style socket accepts only
         * SCTP_FUTURE_ASSOC.
         */
        if (!sctp_id2assoc(sk, params->assoc_id) &&
            params->assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        sctp_sk(sk)->ep->reconf_enable = params->assoc_value ? 1 : 0;

        return 0;
}

static int sctp_setsockopt_enable_strreset(struct sock *sk,
                                           struct sctp_assoc_value *params,
                                           unsigned int optlen)
{
        struct sctp_endpoint *ep = sctp_sk(sk)->ep;
        struct sctp_association *asoc;
        int retval = -EINVAL;

        if (optlen != sizeof(*params))
                goto out;

        if (params->assoc_value & (~SCTP_ENABLE_STRRESET_MASK))
                goto out;

        asoc = sctp_id2assoc(sk, params->assoc_id);
        if (!asoc && params->assoc_id > SCTP_ALL_ASSOC &&
            sctp_style(sk, UDP))
                goto out;

        retval = 0;

        if (asoc) {
                asoc->strreset_enable = params->assoc_value;
                goto out;
        }

        if (sctp_style(sk, TCP))
                params->assoc_id = SCTP_FUTURE_ASSOC;

        if (params->assoc_id == SCTP_FUTURE_ASSOC ||
            params->assoc_id == SCTP_ALL_ASSOC)
                ep->strreset_enable = params->assoc_value;

        if (params->assoc_id == SCTP_CURRENT_ASSOC ||
            params->assoc_id == SCTP_ALL_ASSOC)
                list_for_each_entry(asoc, &ep->asocs, asocs)
                        asoc->strreset_enable = params->assoc_value;

out:
        return retval;
}

static int sctp_setsockopt_reset_streams(struct sock *sk,
                                         struct sctp_reset_streams *params,
                                         unsigned int optlen)
{
        struct sctp_association *asoc;

        /* The fixed header must be present in full. */
        if (optlen < sizeof(*params))
                return -EINVAL;
        /* srs_number_streams is u16, so optlen can't be bigger than this. */
        /* NOTE(review): the cap adds USHRT_MAX to
         * sizeof(__u16) * sizeof(*params); a bound for a u16 count of __u16
         * entries would presumably be
         * USHRT_MAX * sizeof(__u16) + sizeof(*params) -- confirm intent.
         */
        optlen = min_t(unsigned int, optlen, USHRT_MAX +
                                             sizeof(__u16) * sizeof(*params));

        /* The claimed stream list must fit in the bytes after the header. */
        if (params->srs_number_streams * sizeof(__u16) >
            optlen - sizeof(*params))
                return -EINVAL;

        /* Only an existing association can be reset. */
        asoc = sctp_id2assoc(sk, params->srs_assoc_id);
        if (!asoc)
                return -EINVAL;

        return sctp_send_reset_streams(asoc, params);
}

static int sctp_setsockopt_reset_assoc(struct sock *sk, sctp_assoc_t *associd,
                                       unsigned int optlen)
{
        struct sctp_association *asoc = NULL;

        /* The lookup is only attempted for a correctly sized argument. */
        if (optlen == sizeof(*associd))
                asoc = sctp_id2assoc(sk, *associd);

        /* A wrong length and an unknown association both yield -EINVAL. */
        if (!asoc)
                return -EINVAL;

        return sctp_send_reset_assoc(asoc);
}

static int sctp_setsockopt_add_streams(struct sock *sk,
                                       struct sctp_add_streams *params,
                                       unsigned int optlen)
{
        struct sctp_association *asoc;

        if (optlen != sizeof(*params))
                return -EINVAL;

        asoc = sctp_id2assoc(sk, params->sas_assoc_id);
        if (!asoc)
                return -EINVAL;

        return sctp_send_add_streams(asoc, params);
}

static int sctp_setsockopt_scheduler(struct sock *sk,
                                     struct sctp_assoc_value *params,
                                     unsigned int optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_association *asoc;
        int first_err = 0;

        if (optlen < sizeof(*params))
                return -EINVAL;

        /* Scheduler ids above SCTP_SS_MAX are refused. */
        if (params->assoc_value > SCTP_SS_MAX)
                return -EINVAL;

        /* A concrete association is switched on its own. */
        asoc = sctp_id2assoc(sk, params->assoc_id);
        if (asoc)
                return sctp_sched_set_sched(asoc, params->assoc_value);

        /* No association matched: a UDP-style socket accepts only ids up
         * to SCTP_ALL_ASSOC from here on.
         */
        if (params->assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP))
                return -EINVAL;

        if (sctp_style(sk, TCP))
                params->assoc_id = SCTP_FUTURE_ASSOC;

        /* Socket-level default scheduler. */
        if (params->assoc_id == SCTP_FUTURE_ASSOC ||
            params->assoc_id == SCTP_ALL_ASSOC)
                sp->default_ss = params->assoc_value;

        if (params->assoc_id != SCTP_CURRENT_ASSOC &&
            params->assoc_id != SCTP_ALL_ASSOC)
                return 0;

        /* Visit every association, remembering only the first failure. */
        list_for_each_entry(asoc, &sp->ep->asocs, asocs) {
                int err = sctp_sched_set_sched(asoc, params->assoc_value);

                if (err && !first_err)
                        first_err = err;
        }

        return first_err;
}

static int sctp_setsockopt_scheduler_value(struct sock *sk,
                                           struct sctp_stream_value *params,
                                           unsigned int optlen)
{
        struct sctp_association *asoc;
        int first_err = 0;

        if (optlen < sizeof(*params))
                return -EINVAL;

        /* A concrete association is handled on its own. */
        asoc = sctp_id2assoc(sk, params->assoc_id);
        if (asoc)
                return sctp_sched_set_value(asoc, params->stream_id,
                                            params->stream_value, GFP_KERNEL);

        /* No association matched: a UDP-style socket accepts only
         * SCTP_CURRENT_ASSOC here.
         */
        if (params->assoc_id != SCTP_CURRENT_ASSOC && sctp_style(sk, UDP))
                return -EINVAL;

        /* Visit every association on the endpoint and report the first
         * error encountered, if any.
         */
        list_for_each_entry(asoc, &sctp_sk(sk)->ep->asocs, asocs) {
                int err = sctp_sched_set_value(asoc, params->stream_id,
                                               params->stream_value,
                                               GFP_KERNEL);

                if (err && !first_err)
                        first_err = err;
        }

        return first_err;
}

static int sctp_setsockopt_interleaving_supported(struct sock *sk,
                                                  struct sctp_assoc_value *p,
                                                  unsigned int optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);

        if (optlen < sizeof(*p))
                return -EINVAL;

        /* The lookup only validates the id.  With no match, a UDP-style
         * socket accepts only SCTP_FUTURE_ASSOC.
         */
        if (!sctp_id2assoc(sk, p->assoc_id) &&
            p->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP))
                return -EINVAL;

        /* Both the per-net intl_enable setting and the socket's
         * frag_interleave must be set.
         */
        if (!(sock_net(sk)->sctp.intl_enable && sp->frag_interleave))
                return -EPERM;

        sp->ep->intl_enable = p->assoc_value ? 1 : 0;

        return 0;
}

static int sctp_setsockopt_reuse_port(struct sock *sk, int *val,
                                      unsigned int optlen)
{
        if (!sctp_style(sk, TCP))
                return -EOPNOTSUPP;

        if (sctp_sk(sk)->ep->base.bind_addr.port)
                return -EFAULT;

        if (optlen < sizeof(int))
                return -EINVAL;

        sctp_sk(sk)->reuse = !!*val;

        return 0;
}

static int sctp_assoc_ulpevent_type_set(struct sctp_event *param,
                                        struct sctp_association *asoc)
{
        struct sctp_ulpevent *event;

        /* Record the subscription change on the association first. */
        sctp_ulpevent_type_set(&asoc->subscribe, param->se_type, param->se_on);

        /* Only switching SCTP_SENDER_DRY_EVENT on needs further work. */
        if (param->se_type != SCTP_SENDER_DRY_EVENT || !param->se_on)
                return 0;

        /* Nothing is queued unless the out queue is empty right now. */
        if (!sctp_outq_is_empty(&asoc->outqueue))
                return 0;

        event = sctp_ulpevent_make_sender_dry_event(asoc,
                                                    GFP_USER | __GFP_NOWARN);
        if (!event)
                return -ENOMEM;

        asoc->stream.si->enqueue_event(&asoc->ulpq, event);

        return 0;
}

static int sctp_setsockopt_event(struct sock *sk, struct sctp_event *param,
                                 unsigned int optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_association *asoc;
        int first_err = 0;

        if (optlen < sizeof(*param))
                return -EINVAL;

        /* The event type must lie in [SCTP_SN_TYPE_BASE, SCTP_SN_TYPE_MAX]. */
        if (param->se_type < SCTP_SN_TYPE_BASE)
                return -EINVAL;
        if (param->se_type > SCTP_SN_TYPE_MAX)
                return -EINVAL;

        /* A concrete association is handled on its own. */
        asoc = sctp_id2assoc(sk, param->se_assoc_id);
        if (asoc)
                return sctp_assoc_ulpevent_type_set(param, asoc);

        /* No association matched: a UDP-style socket accepts only ids up
         * to SCTP_ALL_ASSOC from here on.
         */
        if (param->se_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP))
                return -EINVAL;

        if (sctp_style(sk, TCP))
                param->se_assoc_id = SCTP_FUTURE_ASSOC;

        /* Socket-level subscription. */
        if (param->se_assoc_id == SCTP_FUTURE_ASSOC ||
            param->se_assoc_id == SCTP_ALL_ASSOC)
                sctp_ulpevent_type_set(&sp->subscribe,
                                       param->se_type, param->se_on);

        if (param->se_assoc_id != SCTP_CURRENT_ASSOC &&
            param->se_assoc_id != SCTP_ALL_ASSOC)
                return 0;

        /* Visit every association, remembering only the first failure. */
        list_for_each_entry(asoc, &sp->ep->asocs, asocs) {
                int err = sctp_assoc_ulpevent_type_set(param, asoc);

                if (err && !first_err)
                        first_err = err;
        }

        return first_err;
}

static int sctp_setsockopt_asconf_supported(struct sock *sk,
                                            struct sctp_assoc_value *params,
                                            unsigned int optlen)
{
        struct sctp_endpoint *ep = sctp_sk(sk)->ep;

        if (optlen != sizeof(*params))
                return -EINVAL;

        /* The lookup only validates the id; the flag itself lives on the
         * endpoint.  With no match, a UDP-style socket accepts only
         * SCTP_FUTURE_ASSOC.
         */
        if (!sctp_id2assoc(sk, params->assoc_id) &&
            params->assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        ep->asconf_enable = !!params->assoc_value;

        /* Unless both ASCONF and AUTH are now enabled, there is nothing
         * more to do.
         */
        if (!ep->asconf_enable || !ep->auth_enable)
                return 0;

        /* Add the ASCONF chunk types to the endpoint's auth chunk list;
         * the return values are not checked.
         */
        sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF);
        sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK);

        return 0;
}

static int sctp_setsockopt_auth_supported(struct sock *sk,
                                          struct sctp_assoc_value *params,
                                          unsigned int optlen)
{
        struct sctp_endpoint *ep = sctp_sk(sk)->ep;
        int err;

        if (optlen != sizeof(*params))
                return -EINVAL;

        /* The lookup only validates the id; the flag itself lives on the
         * endpoint.  With no match, a UDP-style socket accepts only
         * SCTP_FUTURE_ASSOC.
         */
        if (!sctp_id2assoc(sk, params->assoc_id) &&
            params->assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        /* Disabling just clears the flag. */
        if (!params->assoc_value) {
                ep->auth_enable = 0;
                return 0;
        }

        /* Enabling: sctp_auth_init() must succeed before the flag is set. */
        err = sctp_auth_init(ep, GFP_KERNEL);
        if (err)
                return err;

        /* With ASCONF enabled, its chunk types are added to the endpoint's
         * auth chunk list; the return values are not checked.
         */
        if (ep->asconf_enable) {
                sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF);
                sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK);
        }

        ep->auth_enable = 1;

        return 0;
}

static int sctp_setsockopt_ecn_supported(struct sock *sk,
                                         struct sctp_assoc_value *params,
                                         unsigned int optlen)
{
        if (optlen != sizeof(*params))
                return -EINVAL;

        /* The lookup only validates the id; the flag itself lives on the
         * endpoint.  With no match, a UDP-style socket accepts only
         * SCTP_FUTURE_ASSOC.
         */
        if (!sctp_id2assoc(sk, params->assoc_id) &&
            params->assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        sctp_sk(sk)->ep->ecn_enable = params->assoc_value ? 1 : 0;

        return 0;
}

static int sctp_setsockopt_pf_expose(struct sock *sk,
                                     struct sctp_assoc_value *params,
                                     unsigned int optlen)
{
        struct sctp_association *asoc;

        if (optlen != sizeof(*params))
                return -EINVAL;

        /* Values above SCTP_PF_EXPOSE_MAX are refused. */
        if (params->assoc_value > SCTP_PF_EXPOSE_MAX)
                return -EINVAL;

        /* A concrete association gets the value directly. */
        asoc = sctp_id2assoc(sk, params->assoc_id);
        if (asoc) {
                asoc->pf_expose = params->assoc_value;
                return 0;
        }

        /* No association matched: a UDP-style socket accepts only
         * SCTP_FUTURE_ASSOC; the value is then stored on the socket.
         */
        if (params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP))
                return -EINVAL;

        sctp_sk(sk)->pf_expose = params->assoc_value;

        return 0;
}

static int sctp_setsockopt_encap_port(struct sock *sk,
                                      struct sctp_udpencaps *encap,
                                      unsigned int optlen)
{
        struct sctp_association *asoc;
        struct sctp_transport *t;
        __be16 encap_port;

        if (optlen != sizeof(*encap))
                return -EINVAL;

        /* If an address other than INADDR_ANY is specified, and
         * no transport is found, then the request is invalid.
         */
        encap_port = (__force __be16)encap->sue_port;
        if (!sctp_is_any(sk, (union sctp_addr *)&encap->sue_address)) {
                t = sctp_addr_id2transport(sk, &encap->sue_address,
                                           encap->sue_assoc_id);
                if (!t)
                        return -EINVAL;

                t->encap_port = encap_port;
                return 0;
        }

        /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the
         * socket is a one to many style socket, and an association
         * was not found, then the id was invalid.
         */
        asoc = sctp_id2assoc(sk, encap->sue_assoc_id);
        if (!asoc && encap->sue_assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        /* If changes are for association, also apply encap_port to
         * each transport.
         */
        if (asoc) {
                list_for_each_entry(t, &asoc->peer.transport_addr_list,
                                    transports)
                        t->encap_port = encap_port;

                asoc->encap_port = encap_port;
                return 0;
        }

        sctp_sk(sk)->encap_port = encap_port;
        return 0;
}

static int sctp_setsockopt_probe_interval(struct sock *sk,
                                          struct sctp_probeinterval *params,
                                          unsigned int optlen)
{
        struct sctp_association *asoc;
        struct sctp_transport *t;
        __u32 probe_interval;

        if (optlen != sizeof(*params))
                return -EINVAL;

        /* Zero is accepted as-is; any non-zero interval must be at least
         * SCTP_PROBE_TIMER_MIN.
         */
        probe_interval = params->spi_interval;
        if (probe_interval && probe_interval < SCTP_PROBE_TIMER_MIN)
                return -EINVAL;

        /* If an address other than INADDR_ANY is specified, and
         * no transport is found, then the request is invalid.
         */
        if (!sctp_is_any(sk, (union sctp_addr *)&params->spi_address)) {
                t = sctp_addr_id2transport(sk, &params->spi_address,
                                           params->spi_assoc_id);
                if (!t)
                        return -EINVAL;

                /* Transports hold the interval converted to jiffies. */
                t->probe_interval = msecs_to_jiffies(probe_interval);
                sctp_transport_pl_reset(t);
                return 0;
        }

        /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the
         * socket is a one to many style socket, and an association
         * was not found, then the id was invalid.
         */
        asoc = sctp_id2assoc(sk, params->spi_assoc_id);
        if (!asoc && params->spi_assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        /* If changes are for association, also apply probe_interval to
         * each transport.
         */
        if (asoc) {
                list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) {
                        t->probe_interval = msecs_to_jiffies(probe_interval);
                        sctp_transport_pl_reset(t);
                }

                asoc->probe_interval = msecs_to_jiffies(probe_interval);
                return 0;
        }

        /* The socket-level value is stored as supplied, without the
         * msecs_to_jiffies() conversion applied above.
         */
        sctp_sk(sk)->probe_interval = probe_interval;
        return 0;
}

/* API 6.2 setsockopt(), getsockopt()
 *
 * Applications use setsockopt() and getsockopt() to set or retrieve
 * socket options.  Socket options are used to change the default
 * behavior of sockets calls.  They are described in Section 7.
 *
 * The syntax is:
 *
 *   ret = getsockopt(int sd, int level, int optname, void __user *optval,
 *                    int __user *optlen);
 *   ret = setsockopt(int sd, int level, int optname, const void __user *optval,
 *                    int optlen);
 *
 *   sd      - the socket descriptor.
 *   level   - set to IPPROTO_SCTP for all SCTP options.
 *   optname - the option name.
 *   optval  - the buffer to store the value of the option.
 *   optlen  - the size of the buffer.
 */
static int sctp_setsockopt(struct sock *sk, int level, int optname,
                           sockptr_t optval, unsigned int optlen)
{
        void *kopt = NULL;
        int retval = 0;

        pr_debug("%s: sk:%p, optname:%d\n", __func__, sk, optname);

        /* I can hardly begin to describe how wrong this is.  This is
         * so broken as to be worse than useless.  The API draft
         * REALLY is NOT helpful here...  I am not convinced that the
         * semantics of setsockopt() with a level OTHER THAN SOL_SCTP
         * are at all well-founded.
         */
        if (level != SOL_SCTP) {
                struct sctp_af *af = sctp_sk(sk)->pf->af;

                return af->setsockopt(sk, level, optname, optval, optlen);
        }

        if (optlen > 0) {
                /* Trim it to the biggest size sctp sockopt may need if necessary */
                optlen = min_t(unsigned int, optlen,
                               PAGE_ALIGN(USHRT_MAX +
                                          sizeof(__u16) * sizeof(struct sctp_reset_streams)));
                kopt = memdup_sockptr(optval, optlen);
                if (IS_ERR(kopt))
                        return PTR_ERR(kopt);
        }

        lock_sock(sk);

        switch (optname) {
        case SCTP_SOCKOPT_BINDX_ADD:
                /* 'optlen' is the size of the addresses buffer. */
                retval = sctp_setsockopt_bindx(sk, kopt, optlen,
                                               SCTP_BINDX_ADD_ADDR);
                break;

        case SCTP_SOCKOPT_BINDX_REM:
                /* 'optlen' is the size of the addresses buffer. */
                retval = sctp_setsockopt_bindx(sk, kopt, optlen,
                                               SCTP_BINDX_REM_ADDR);
                break;

        case SCTP_SOCKOPT_CONNECTX_OLD:
                /* 'optlen' is the size of the addresses buffer. */
                retval = sctp_setsockopt_connectx_old(sk, kopt, optlen);
                break;

        case SCTP_SOCKOPT_CONNECTX:
                /* 'optlen' is the size of the addresses buffer. */
                retval = sctp_setsockopt_connectx(sk, kopt, optlen);
                break;

        case SCTP_DISABLE_FRAGMENTS:
                retval = sctp_setsockopt_disable_fragments(sk, kopt, optlen);
                break;

        case SCTP_EVENTS:
                retval = sctp_setsockopt_events(sk, kopt, optlen);
                break;

        case SCTP_AUTOCLOSE:
                retval = sctp_setsockopt_autoclose(sk, kopt, optlen);
                break;

        case SCTP_PEER_ADDR_PARAMS:
                retval = sctp_setsockopt_peer_addr_params(sk, kopt, optlen);
                break;

        case SCTP_DELAYED_SACK:
                retval = sctp_setsockopt_delayed_ack(sk, kopt, optlen);
                break;
        case SCTP_PARTIAL_DELIVERY_POINT:
                retval = sctp_setsockopt_partial_delivery_point(sk, kopt, optlen);
                break;

        case SCTP_INITMSG:
                retval = sctp_setsockopt_initmsg(sk, kopt, optlen);
                break;
        case SCTP_DEFAULT_SEND_PARAM:
                retval = sctp_setsockopt_default_send_param(sk, kopt, optlen);
                break;
        case SCTP_DEFAULT_SNDINFO:
                retval = sctp_setsockopt_default_sndinfo(sk, kopt, optlen);
                break;
        case SCTP_PRIMARY_ADDR:
                retval = sctp_setsockopt_primary_addr(sk, kopt, optlen);
                break;
        case SCTP_SET_PEER_PRIMARY_ADDR:
                retval = sctp_setsockopt_peer_primary_addr(sk, kopt, optlen);
                break;
        case SCTP_NODELAY:
                retval = sctp_setsockopt_nodelay(sk, kopt, optlen);
                break;
        case SCTP_RTOINFO:
                retval = sctp_setsockopt_rtoinfo(sk, kopt, optlen);
                break;
        case SCTP_ASSOCINFO:
                retval = sctp_setsockopt_associnfo(sk, kopt, optlen);
                break;
        case SCTP_I_WANT_MAPPED_V4_ADDR:
                retval = sctp_setsockopt_mappedv4(sk, kopt, optlen);
                break;
        case SCTP_MAXSEG:
                retval = sctp_setsockopt_maxseg(sk, kopt, optlen);
                break;
        case SCTP_ADAPTATION_LAYER:
                retval = sctp_setsockopt_adaptation_layer(sk, kopt, optlen);
                break;
        case SCTP_CONTEXT:
                retval = sctp_setsockopt_context(sk, kopt, optlen);
                break;
        case SCTP_FRAGMENT_INTERLEAVE:
                retval = sctp_setsockopt_fragment_interleave(sk, kopt, optlen);
                break;
        case SCTP_MAX_BURST:
                retval = sctp_setsockopt_maxburst(sk, kopt, optlen);
                break;
        case SCTP_AUTH_CHUNK:
                retval = sctp_setsockopt_auth_chunk(sk, kopt, optlen);
                break;
        case SCTP_HMAC_IDENT:
                retval = sctp_setsockopt_hmac_ident(sk, kopt, optlen);
                break;
        case SCTP_AUTH_KEY:
                retval = sctp_setsockopt_auth_key(sk, kopt, optlen);
                break;
        case SCTP_AUTH_ACTIVE_KEY:
                retval = sctp_setsockopt_active_key(sk, kopt, optlen);
                break;
        case SCTP_AUTH_DELETE_KEY:
                retval = sctp_setsockopt_del_key(sk, kopt, optlen);
                break;
        case SCTP_AUTH_DEACTIVATE_KEY:
                retval = sctp_setsockopt_deactivate_key(sk, kopt, optlen);
                break;
        case SCTP_AUTO_ASCONF:
                retval = sctp_setsockopt_auto_asconf(sk, kopt, optlen);
                break;
        case SCTP_PEER_ADDR_THLDS:
                retval = sctp_setsockopt_paddr_thresholds(sk, kopt, optlen,
                                                          false);
                break;
        case SCTP_PEER_ADDR_THLDS_V2:
                retval = sctp_setsockopt_paddr_thresholds(sk, kopt, optlen,
                                                          true);
                break;
        case SCTP_RECVRCVINFO:
                retval = sctp_setsockopt_recvrcvinfo(sk, kopt, optlen);
                break;
        case SCTP_RECVNXTINFO:
                retval = sctp_setsockopt_recvnxtinfo(sk, kopt, optlen);
                break;
        case SCTP_PR_SUPPORTED:
                retval = sctp_setsockopt_pr_supported(sk, kopt, optlen);
                break;
        case SCTP_DEFAULT_PRINFO:
                retval = sctp_setsockopt_default_prinfo(sk, kopt, optlen);
                break;
        case SCTP_RECONFIG_SUPPORTED:
                retval = sctp_setsockopt_reconfig_supported(sk, kopt, optlen);
                break;
        case SCTP_ENABLE_STREAM_RESET:
                retval = sctp_setsockopt_enable_strreset(sk, kopt, optlen);
                break;
        case SCTP_RESET_STREAMS:
                retval = sctp_setsockopt_reset_streams(sk, kopt, optlen);
                break;
        case SCTP_RESET_ASSOC:
                retval = sctp_setsockopt_reset_assoc(sk, kopt, optlen);
                break;
        case SCTP_ADD_STREAMS:
                retval = sctp_setsockopt_add_streams(sk, kopt, optlen);
                break;
        case SCTP_STREAM_SCHEDULER:
                retval = sctp_setsockopt_scheduler(sk, kopt, optlen);
                break;
        case SCTP_STREAM_SCHEDULER_VALUE:
                retval = sctp_setsockopt_scheduler_value(sk, kopt, optlen);
                break;
        case SCTP_INTERLEAVING_SUPPORTED:
                retval = sctp_setsockopt_interleaving_supported(sk, kopt,
                                                                optlen);
                break;
        case SCTP_REUSE_PORT:
                retval = sctp_setsockopt_reuse_port(sk, kopt, optlen);
                break;
        case SCTP_EVENT:
                retval = sctp_setsockopt_event(sk, kopt, optlen);
                break;
        case SCTP_ASCONF_SUPPORTED:
                retval = sctp_setsockopt_asconf_supported(sk, kopt, optlen);
                break;
        case SCTP_AUTH_SUPPORTED:
                retval = sctp_setsockopt_auth_supported(sk, kopt, optlen);
                break;
        case SCTP_ECN_SUPPORTED:
                retval = sctp_setsockopt_ecn_supported(sk, kopt, optlen);
                break;
        case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE:
                retval = sctp_setsockopt_pf_expose(sk, kopt, optlen);
                break;
        case SCTP_REMOTE_UDP_ENCAPS_PORT:
                retval = sctp_setsockopt_encap_port(sk, kopt, optlen);
                break;
        case SCTP_PLPMTUD_PROBE_INTERVAL:
                retval = sctp_setsockopt_probe_interval(sk, kopt, optlen);
                break;
        default:
                retval = -ENOPROTOOPT;
                break;
        }

        release_sock(sk);
        kfree(kopt);
        return retval;
}

/* API 3.1.6 connect() - UDP Style Syntax
 *
 * In the UDP model an application may call connect() to set up an
 * association without sending any data.
 *
 * The syntax is:
 *
 * ret = connect(int sd, const struct sockaddr *nam, socklen_t len);
 *
 * sd: the socket descriptor to have a new association added to.
 *
 * nam: the address structure (either struct sockaddr_in or struct
 *    sockaddr_in6 defined in RFC2553 [7]).
 *
 * len: the size of the address.
 *
 * Runs with the socket locked.  Returns -EINVAL when the address family
 * is not known to SCTP or @addr_len is shorter than that family's
 * sockaddr length; otherwise returns the result of __sctp_connect().
 */
static int sctp_connect(struct sock *sk, struct sockaddr *addr,
                        int addr_len, int flags)
{
        struct sctp_af *family_ops;
        int rc;

        lock_sock(sk);
        pr_debug("%s: sk:%p, sockaddr:%p, addr_len:%d\n", __func__, sk,
                 addr, addr_len);

        /* Only hand the address to the common connect/connectx code
         * once its length has been validated against the family; the
         * family's exact sockaddr length is what gets passed down.
         */
        family_ops = sctp_get_af_specific(addr->sa_family);
        if (!family_ops || addr_len < family_ops->sockaddr_len)
                rc = -EINVAL;
        else
                rc = __sctp_connect(sk, addr, family_ops->sockaddr_len,
                                    flags, NULL);

        release_sock(sk);
        return rc;
}

/* Socket-level connect entry point.
 *
 * Rejects addresses too short to contain sa_family (-EINVAL) and
 * AF_UNSPEC addresses (-EOPNOTSUPP); everything else is passed on to
 * sctp_connect() for the underlying sock.
 */
int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr,
                      int addr_len, int flags)
{
        int rc;

        if (addr_len < sizeof(uaddr->sa_family))
                rc = -EINVAL;
        else if (uaddr->sa_family == AF_UNSPEC)
                rc = -EOPNOTSUPP;
        else
                rc = sctp_connect(sock->sk, uaddr, addr_len, flags);

        return rc;
}

/* Disconnect handler stub: the operation is not implemented, so every
 * call fails with -EOPNOTSUPP regardless of @sk and @flags.
 */
static int sctp_disconnect(struct sock *sk, int flags)
{
        /* STUB */
        return -EOPNOTSUPP;
}

/* 4.1.4 accept() - TCP Style Syntax
 *
 * accept() removes an established SCTP association from the endpoint's
 * accept queue and returns a new socket that represents it.
 *
 * Runs with the listening socket locked.  The outcome is reported twice:
 * the new sock (or NULL) is the return value and the error code (0 on
 * success) is stored in arg->err.
 *
 *   -EOPNOTSUPP  @sk is not a TCP-style socket
 *   -EINVAL      @sk is not in the LISTENING state
 *   -ENOMEM      the new sock could not be created
 *   other        propagated from sctp_wait_for_accept() or
 *                sctp_sock_migrate()
 */
static struct sock *sctp_accept(struct sock *sk, struct proto_accept_arg *arg)
{
        struct sctp_association *first_asoc;
        struct sock *child = NULL;
        struct sctp_endpoint *ep;
        struct sctp_sock *sp;
        long timeout;
        int err = 0;

        lock_sock(sk);

        sp = sctp_sk(sk);
        ep = sp->ep;

        if (!sctp_style(sk, TCP)) {
                err = -EOPNOTSUPP;
                goto unlock;
        }

        if (!sctp_sstate(sk, LISTENING)) {
                err = -EINVAL;
                goto unlock;
        }

        timeout = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);

        err = sctp_wait_for_accept(sk, timeout);
        if (err)
                goto unlock;

        /* The endpoint's association list doubles as the accept queue;
         * take whatever sits at its head.
         */
        first_asoc = list_entry(ep->asocs.next, struct sctp_association,
                                asocs);

        child = sp->pf->create_accept_sk(sk, first_asoc, arg->kern);
        if (!child) {
                err = -ENOMEM;
                goto unlock;
        }

        /* Copy the listener's settings into the new sock and move the
         * association over to it; drop the new sock if that fails.
         */
        err = sctp_sock_migrate(sk, child, first_asoc, SCTP_SOCKET_TCP);
        if (err) {
                sk_common_release(child);
                child = NULL;
        }

unlock:
        release_sock(sk);
        arg->err = err;
        return child;
}

/* The SCTP ioctl handler.
 *
 * Runs with the socket locked.  Only SIOCINQ is handled: *karg receives
 * the length of the skb at the head of the receive queue, or 0 when the
 * queue is empty.
 *
 * Returns 0 for SIOCINQ, -ENOIOCTLCMD for any other command, and
 * -ENOTCONN for a TCP-style socket in the LISTENING state.
 */
static int sctp_ioctl(struct sock *sk, int cmd, int *karg)
{
        struct sk_buff *head;
        int ret;

        lock_sock(sk);

        /* A listening SEQPACKET-style socket is a valid target for
         * SCTP, so only listening TCP-style sockets are turned away.
         */
        if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) {
                ret = -ENOTCONN;
        } else if (cmd == SIOCINQ) {
                /* Report only the first queued packet, since that is
                 * all a single read will return.
                 */
                head = skb_peek(&sk->sk_receive_queue);
                *karg = head ? head->len : 0;
                ret = 0;
        } else {
                ret = -ENOIOCTLCMD;
        }

        release_sock(sk);
        return ret;
}

/* This is the function which gets called during socket creation to
 * initialize the SCTP-specific portion of the sock.
 * The sock structure should already be zero-filled memory.
 *
 * Per-socket defaults are taken from the socket's network namespace
 * (net->sctp.*) or set to fixed values, and the per-socket endpoint is
 * allocated last.
 *
 * Returns 0 on success, -ESOCKTNOSUPPORT when sk->sk_type is neither
 * SOCK_SEQPACKET nor SOCK_STREAM, and -ENOMEM when the endpoint cannot
 * be allocated.
 */
static int sctp_init_sock(struct sock *sk)
{
        struct net *net = sock_net(sk);
        struct sctp_sock *sp;

        pr_debug("%s: sk:%p\n", __func__, sk);

        sp = sctp_sk(sk);

        /* Initialize the SCTP per socket area.  */
        switch (sk->sk_type) {
        case SOCK_SEQPACKET:
                /* SEQPACKET sockets use the UDP (one-to-many) style. */
                sp->type = SCTP_SOCKET_UDP;
                break;
        case SOCK_STREAM:
                /* STREAM sockets use the TCP (one-to-one) style. */
                sp->type = SCTP_SOCKET_TCP;
                break;
        default:
                return -ESOCKTNOSUPPORT;
        }

        sk->sk_gso_type = SKB_GSO_SCTP;

        /* Initialize default send parameters. These parameters can be
         * modified with the SCTP_DEFAULT_SEND_PARAM socket option.
         */
        sp->default_stream = 0;
        sp->default_ppid = 0;
        sp->default_flags = 0;
        sp->default_context = 0;
        sp->default_timetolive = 0;

        sp->default_rcv_context = 0;
        sp->max_burst = net->sctp.max_burst;

        sp->sctp_hmac_alg = net->sctp.sctp_hmac_alg;

        /* Initialize default setup parameters. These parameters
         * can be modified with the SCTP_INITMSG socket option or
         * overridden by the SCTP_INIT CMSG.
         */
        sp->initmsg.sinit_num_ostreams   = sctp_max_outstreams;
        sp->initmsg.sinit_max_instreams  = sctp_max_instreams;
        sp->initmsg.sinit_max_attempts   = net->sctp.max_retrans_init;
        sp->initmsg.sinit_max_init_timeo = net->sctp.rto_max;

        /* Initialize default RTO related parameters.  These parameters can
         * be modified with the SCTP_RTOINFO socket option.
         */
        sp->rtoinfo.srto_initial = net->sctp.rto_initial;
        sp->rtoinfo.srto_max     = net->sctp.rto_max;
        sp->rtoinfo.srto_min     = net->sctp.rto_min;

        /* Initialize default association related parameters. These parameters
         * can be modified with the SCTP_ASSOCINFO socket option.
         */
        sp->assocparams.sasoc_asocmaxrxt = net->sctp.max_retrans_association;
        sp->assocparams.sasoc_number_peer_destinations = 0;
        sp->assocparams.sasoc_peer_rwnd = 0;
        sp->assocparams.sasoc_local_rwnd = 0;
        sp->assocparams.sasoc_cookie_life = net->sctp.valid_cookie_life;

        /* Initialize default event subscriptions. By default, all the
         * options are off.
         */
        sp->subscribe = 0;

        /* Default Peer Address Parameters.  These defaults can
         * be modified via SCTP_PEER_ADDR_PARAMS
         */
        sp->hbinterval  = net->sctp.hb_interval;
        sp->udp_port    = htons(net->sctp.udp_port);
        sp->encap_port  = htons(net->sctp.encap_port);
        sp->pathmaxrxt  = net->sctp.max_retrans_path;
        sp->pf_retrans  = net->sctp.pf_retrans;
        sp->ps_retrans  = net->sctp.ps_retrans;
        sp->pf_expose   = net->sctp.pf_expose;
        sp->pathmtu     = 0; /* allow default discovery */
        sp->sackdelay   = net->sctp.sack_timeout;
        sp->sackfreq        = 2;
        sp->param_flags = SPP_HB_ENABLE |
                          SPP_PMTUD_ENABLE |
                          SPP_SACKDELAY_ENABLE;
        sp->default_ss = SCTP_SS_DEFAULT;

        /* If enabled no SCTP message fragmentation will be performed.
         * Configure through SCTP_DISABLE_FRAGMENTS socket option.
         */
        sp->disable_fragments = 0;

        /* Enable Nagle algorithm by default.  */
        sp->nodelay           = 0;

        sp->recvrcvinfo = 0;
        sp->recvnxtinfo = 0;

        /* Enable by default. */
        sp->v4mapped          = 1;

        /* Auto-close idle associations after the configured
         * number of seconds.  A value of 0 disables this
         * feature.  Configure through the SCTP_AUTOCLOSE socket option,
         * for UDP-style sockets only.
         */
        sp->autoclose         = 0;

        /* User specified fragmentation limit. */
        sp->user_frag         = 0;

        sp->adaptation_ind = 0;

        /* Protocol-family specific operations, looked up by sk_family. */
        sp->pf = sctp_get_pf_specific(sk->sk_family);

        /* Control variables for partial data delivery. */
        atomic_set(&sp->pd_mode, 0);
        skb_queue_head_init(&sp->pd_lobby);
        sp->frag_interleave = 0;
        sp->probe_interval = net->sctp.probe_interval;

        /* Create a per socket endpoint structure.  Even if we
         * change the data structure relationships, this may still
         * be useful for storing pre-connect address information.
         */
        sp->ep = sctp_endpoint_new(sk, GFP_KERNEL);
        if (!sp->ep)
                return -ENOMEM;

        sp->hmac = NULL;

        /* Installed only after the endpoint exists. */
        sk->sk_destruct = sctp_destruct_sock;

        SCTP_DBG_OBJCNT_INC(sock);

        /* Socket accounting; sctp_destroy_sock() reverses both counters. */
        sk_sockets_allocated_inc(sk);
        sock_prot_inuse_add(net, sk->sk_prot, 1);

        return 0;
}

/* Release the SCTP per-socket resources.  The caller must hold
 * sock_net(sk)->sctp.addr_wq_lock if sp->do_auto_asconf is true.
 *
 * Does nothing when the socket has no endpoint.
 */
static void sctp_destroy_sock(struct sock *sk)
{
        struct sctp_sock *sp = sctp_sk(sk);

        pr_debug("%s: sk:%p\n", __func__, sk);

        /* A socket whose initialization did not get as far as creating
         * the endpoint has nothing below set up either, so stop here.
         */
        if (!sp->ep)
                return;

        if (sp->do_auto_asconf) {
                sp->do_auto_asconf = 0;
                list_del(&sp->auto_asconf_list);
        }

        /* Drop our hold on the endpoint, then undo the accounting. */
        sctp_endpoint_free(sp->ep);
        sk_sockets_allocated_dec(sk);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}

/* Triggered when there are no references on the socket anymore */
static void sctp_destruct_common(struct sock *sk)
{
        struct sctp_sock *sp = sctp_sk(sk);

        /* Free up the HMAC transform. */
        crypto_free_shash(sp->hmac);
}

/* Socket destructor: performs the SCTP-specific teardown first and then
 * hands the sock to the generic inet destructor.
 */
static void sctp_destruct_sock(struct sock *sk)
{
        /* SCTP-private state must go before the inet-level teardown. */
        sctp_destruct_common(sk);

        inet_sock_destruct(sk);
}

/* API 4.1.7 shutdown() - TCP Style Syntax
 *     int shutdown(int socket, int how);
 *
 *     sd      - the socket descriptor of the association to be closed.
 *     how     - Specifies the type of shutdown.  The  values  are
 *               as follows:
 *               SHUT_RD
 *                     Disables further receive operations. No SCTP
 *                     protocol action is taken.
 *               SHUT_WR
 *                     Disables further send operations, and initiates
 *                     the SCTP shutdown sequence.
 *               SHUT_RDWR
 *                     Disables further send  and  receive  operations
 *                     and initiates the SCTP shutdown sequence.
 *
 * Nothing happens unless @sk is TCP-style, @how has SEND_SHUTDOWN set
 * and the endpoint has at least one association.  In that case the
 * socket moves to SCTP_SS_CLOSING and the SHUTDOWN primitive is issued
 * on the first association of the endpoint.
 */
static void sctp_shutdown(struct sock *sk, int how)
{
        struct net *net = sock_net(sk);
        struct sctp_association *asoc;
        struct sctp_endpoint *ep;

        if (!sctp_style(sk, TCP))
                return;

        ep = sctp_sk(sk)->ep;
        if (!(how & SEND_SHUTDOWN) || list_empty(&ep->asocs))
                return;

        inet_sk_set_state(sk, SCTP_SS_CLOSING);
        asoc = list_entry(ep->asocs.next, struct sctp_association, asocs);
        sctp_primitive_SHUTDOWN(net, asoc, NULL);
}

/* Fill @info with a snapshot of SCTP state.
 *
 * @info is zeroed first.  With a NULL @asoc only the socket-level
 * fields (sctpi_s_*) are filled from @sk.  Otherwise the association
 * fields, the peer capability and SACK bitmasks, the association
 * statistics and the primary path fields (sctpi_p_*) are filled from
 * @asoc, and the socket-level fields stay zero.
 *
 * Always returns 0.
 */
int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
                       struct sctp_info *info)
{
        struct sctp_transport *prim;
        struct list_head *pos;
        int mask;

        memset(info, 0, sizeof(*info));
        if (!asoc) {
                struct sctp_sock *sp = sctp_sk(sk);

                info->sctpi_s_autoclose = sp->autoclose;
                info->sctpi_s_adaptation_ind = sp->adaptation_ind;
                info->sctpi_s_pd_point = sp->pd_point;
                info->sctpi_s_nodelay = sp->nodelay;
                info->sctpi_s_disable_fragments = sp->disable_fragments;
                info->sctpi_s_v4mapped = sp->v4mapped;
                info->sctpi_s_frag_interleave = sp->frag_interleave;
                info->sctpi_s_type = sp->type;

                return 0;
        }

        info->sctpi_tag = asoc->c.my_vtag;
        info->sctpi_state = asoc->state;
        info->sctpi_rwnd = asoc->a_rwnd;
        info->sctpi_unackdata = asoc->unack_data;
        info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
        info->sctpi_instrms = asoc->stream.incnt;
        info->sctpi_outstrms = asoc->stream.outcnt;
        /* Queue depths are obtained by walking and counting list entries. */
        list_for_each(pos, &asoc->base.inqueue.in_chunk_list)
                info->sctpi_inqueue++;
        list_for_each(pos, &asoc->outqueue.out_chunk_list)
                info->sctpi_outqueue++;
        info->sctpi_overall_error = asoc->overall_error_count;
        info->sctpi_max_burst = asoc->max_burst;
        info->sctpi_maxseg = asoc->frag_point;
        info->sctpi_peer_rwnd = asoc->peer.rwnd;
        info->sctpi_peer_tag = asoc->c.peer_vtag;

        /* Pack the peer capability flags by shift-and-or, first flag
         * ending up most significant.  Assuming each flag is a single
         * bit (TODO confirm against the struct definition), the layout
         * is: bit 7 intl, 6 ecn, 5 ipv4, 4 ipv6, 3 reconf, 2 asconf,
         * 1 prsctp, 0 auth.
         */
        mask = asoc->peer.intl_capable << 1;
        mask = (mask | asoc->peer.ecn_capable) << 1;
        mask = (mask | asoc->peer.ipv4_address) << 1;
        mask = (mask | asoc->peer.ipv6_address) << 1;
        mask = (mask | asoc->peer.reconf_capable) << 1;
        mask = (mask | asoc->peer.asconf_capable) << 1;
        mask = (mask | asoc->peer.prsctp_capable) << 1;
        mask = (mask | asoc->peer.auth_capable);
        info->sctpi_peer_capable = mask;
        /* Same packing for the SACK state: sack_needed, then
         * sack_generation, then zero_window_announced in bit 0.
         */
        mask = asoc->peer.sack_needed << 1;
        mask = (mask | asoc->peer.sack_generation) << 1;
        mask = (mask | asoc->peer.zero_window_announced);
        info->sctpi_peer_sack = mask;

        info->sctpi_isacks = asoc->stats.isacks;
        info->sctpi_osacks = asoc->stats.osacks;
        info->sctpi_opackets = asoc->stats.opackets;
        info->sctpi_ipackets = asoc->stats.ipackets;
        info->sctpi_rtxchunks = asoc->stats.rtxchunks;
        info->sctpi_outofseqtsns = asoc->stats.outofseqtsns;
        info->sctpi_idupchunks = asoc->stats.idupchunks;
        info->sctpi_gapcnt = asoc->stats.gapcnt;
        info->sctpi_ouodchunks = asoc->stats.ouodchunks;
        info->sctpi_iuodchunks = asoc->stats.iuodchunks;
        info->sctpi_oodchunks = asoc->stats.oodchunks;
        info->sctpi_iodchunks = asoc->stats.iodchunks;
        info->sctpi_octrlchunks = asoc->stats.octrlchunks;
        info->sctpi_ictrlchunks = asoc->stats.ictrlchunks;

        /* Primary path details; rto and sackdelay are converted from
         * jiffies to milliseconds, the other fields are copied as stored.
         */
        prim = asoc->peer.primary_path;
        memcpy(&info->sctpi_p_address, &prim->ipaddr, sizeof(prim->ipaddr));
        info->sctpi_p_state = prim->state;
        info->sctpi_p_cwnd = prim->cwnd;
        info->sctpi_p_srtt = prim->srtt;
        info->sctpi_p_rto = jiffies_to_msecs(prim->rto);
        info->sctpi_p_hbinterval = prim->hbinterval;
        info->sctpi_p_pathmaxrxt = prim->pathmaxrxt;
        info->sctpi_p_sackdelay = jiffies_to_msecs(prim->sackdelay);
        info->sctpi_p_ssthresh = prim->ssthresh;
        info->sctpi_p_partial_bytes_acked = prim->partial_bytes_acked;
        info->sctpi_p_flight_size = prim->flight_size;
        info->sctpi_p_error = prim->error_count;

        return 0;
}
EXPORT_SYMBOL_GPL(sctp_get_sctp_info);

/* Begin a walk over the SCTP transport hashtable.  Provided as a
 * function so that the core hashtable structure itself does not have
 * to be exported.  Pair with sctp_transport_walk_stop().
 */
void sctp_transport_walk_start(struct rhashtable_iter *iter) __acquires(RCU)
{
        /* Bind the iterator to the table, then open the walk. */
        rhltable_walk_enter(&sctp_transport_hashtable, iter);
        rhashtable_walk_start(iter);
}

/* End a walk begun with sctp_transport_walk_start(): close the walk and
 * then detach the iterator from the table.
 */
void sctp_transport_walk_stop(struct rhashtable_iter *iter) __releases(RCU)
{
        rhashtable_walk_stop(iter);

        rhashtable_walk_exit(iter);
}

struct sctp_transport *sctp_transport_get_next(struct net *net,
                                               struct rhashtable_iter *iter)
{
        struct sctp_transport *t;

        t = rhashtable_walk_next(iter);
        for (; t; t = rhashtable_walk_next(iter)) {
                if (IS_ERR(t)) {
                        if (PTR_ERR(t) == -EAGAIN)
                                continue;
                        break;
                }

                if (!sctp_transport_hold(t))
                        continue;

                if (net_eq(t->asoc->base.net, net) &&
                    t->asoc->peer.primary_path == t)
                        break;

                sctp_transport_put(t);
        }

        return t;
}

/* Return the @pos-th (1-based) matching transport of the walk, as
 * produced by sctp_transport_get_next().
 *
 * A @pos of 0 yields SEQ_START_TOKEN.  Otherwise the result is the
 * transport with a reference held, or the NULL / ERR_PTR value on which
 * the walk ended before reaching @pos.  References on the transports
 * that were skipped over are dropped.
 */
struct sctp_transport *sctp_transport_get_idx(struct net *net,
                                              struct rhashtable_iter *iter,
                                              int pos)
{
        struct sctp_transport *t;

        if (pos == 0)
                return SEQ_START_TOKEN;

        for (;;) {
                t = sctp_transport_get_next(net, iter);
                if (IS_ERR_OR_NULL(t))
                        break;

                if (--pos == 0)
                        break;

                sctp_transport_put(t);
        }

        return t;
}

/* Invoke @cb(ep, @p) on the endpoints of every bucket of the endpoint
 * hashtable.  Each bucket is walked under its read lock with bottom
 * halves disabled.
 *
 * A non-zero result from @cb ends the walk of the current bucket only;
 * the remaining buckets are still visited.  The value returned is the
 * result of the last @cb invocation (0 if @cb was never called).
 *
 * NOTE(review): an error from one bucket can therefore be overwritten
 * by a later bucket - confirm that callers rely on this.
 */
int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *),
                           void *p) {
        struct sctp_hashbucket *bucket = sctp_ep_hashtable;
        struct sctp_endpoint *ep;
        int rc = 0;
        int idx;

        for (idx = 0; idx < sctp_ep_hashsize; idx++, bucket++) {
                read_lock_bh(&bucket->lock);
                sctp_for_each_hentry(ep, &bucket->chain) {
                        rc = cb(ep, p);
                        if (rc)
                                break;
                }
                read_unlock_bh(&bucket->lock);
        }

        return rc;
}
EXPORT_SYMBOL_GPL(sctp_for_each_endpoint);

/* Look up the transport for the (@laddr, @paddr) pair in @net and run
 * @cb(ep, transport, @p) on it.
 *
 * The lookup and the attempt to take an endpoint reference happen under
 * rcu_read_lock(); @cb itself runs outside the RCU section, with both
 * the endpoint and the transport references held, and both are dropped
 * afterwards.
 *
 * Returns -ENOENT when no transport is found or the endpoint reference
 * cannot be taken, otherwise the value returned by @cb.
 */
int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net,
                                  const union sctp_addr *laddr,
                                  const union sctp_addr *paddr, void *p, int dif)
{
        struct sctp_transport *t;
        struct sctp_endpoint *ep;
        int rc = -ENOENT;

        rcu_read_lock();
        t = sctp_addrs_lookup_transport(net, laddr, paddr, dif, dif);
        if (!t)
                goto unlock;

        ep = t->asoc->ep;
        if (!sctp_endpoint_hold(ep)) { /* asoc can be peeled off */
                sctp_transport_put(t);
                goto unlock;
        }
        rcu_read_unlock();

        rc = cb(ep, t, p);
        sctp_endpoint_put(ep);
        sctp_transport_put(t);
        return rc;

unlock:
        rcu_read_unlock();
        return rc;
}
EXPORT_SYMBOL_GPL(sctp_transport_lookup_process);

/* Walk the transports of @net, starting after position *@pos, and run
 * @cb(ep, tsp, @p) on each one whose endpoint reference can be taken.
 *
 * *@pos is advanced for every transport passed over, so the walk can be
 * resumed.  When @cb returns non-zero the walk is stopped and, if
 * @cb_done is given, @cb_done(ep, tsp, @p) is called outside the walk:
 * a zero result from it advances *@pos past that transport and restarts
 * the walk; otherwise (or without @cb_done) the non-zero @cb result is
 * returned.  Returns 0 when the walk completes.
 */
int sctp_transport_traverse_process(sctp_callback_t cb, sctp_callback_t cb_done,
                                    struct net *net, int *pos, void *p)
{
        struct rhashtable_iter hti;
        struct sctp_transport *tsp;
        struct sctp_endpoint *ep;
        int ret;

again:
        ret = 0;
        sctp_transport_walk_start(&hti);

        /* Resume at the entry following the last processed position. */
        tsp = sctp_transport_get_idx(net, &hti, *pos + 1);
        for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) {
                ep = tsp->asoc->ep;
                if (sctp_endpoint_hold(ep)) { /* asoc can be peeled off */
                        ret = cb(ep, tsp, p);
                        /* On break, the ep and tsp references stay held;
                         * they are released in the 'if (ret)' block below.
                         */
                        if (ret)
                                break;
                        sctp_endpoint_put(ep);
                }
                (*pos)++;
                sctp_transport_put(tsp);
        }
        sctp_transport_walk_stop(&hti);

        if (ret) {
                /* ep and tsp are the pair on which cb reported non-zero. */
                if (cb_done && !cb_done(ep, tsp, p)) {
                        /* Step past this transport and walk again. */
                        (*pos)++;
                        sctp_endpoint_put(ep);
                        sctp_transport_put(tsp);
                        goto again;
                }
                sctp_endpoint_put(ep);
                sctp_transport_put(tsp);
        }

        return ret;
}
EXPORT_SYMBOL_GPL(sctp_transport_traverse_process);

/* 7.2.1 Association Status (SCTP_STATUS)
 *
 * Lets an application read the current status of an association:
 * association state, peer receiver window size, number of unacked data
 * chunks and number of data chunks pending receipt.  This information
 * is read-only.
 *
 * The caller's buffer supplies the association id on input and receives
 * the filled struct sctp_status on output; *optlen is set to
 * sizeof(struct sctp_status).
 *
 * Returns 0 on success, -EINVAL if @len is too small or the association
 * id does not resolve, -EFAULT on a failed user-space copy.
 */
static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
                                       char __user *optval,
                                       int __user *optlen)
{
        struct sctp_association *asoc;
        struct sctp_transport *primary;
        struct sctp_status status;

        if (len < sizeof(status))
                return -EINVAL;

        len = sizeof(status);
        if (copy_from_user(&status, optval, len))
                return -EFAULT;

        asoc = sctp_id2assoc(sk, status.sstat_assoc_id);
        if (!asoc)
                return -EINVAL;

        primary = asoc->peer.primary_path;

        /* Association-wide fields. */
        status.sstat_assoc_id = sctp_assoc2id(asoc);
        status.sstat_state = sctp_assoc_to_state(asoc);
        status.sstat_rwnd = asoc->peer.rwnd;
        status.sstat_unackdata = asoc->unack_data;
        status.sstat_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map);
        status.sstat_instrms = asoc->stream.incnt;
        status.sstat_outstrms = asoc->stream.outcnt;
        status.sstat_fragmentation_point = asoc->frag_point;

        /* Primary path fields. */
        status.sstat_primary.spinfo_assoc_id = sctp_assoc2id(primary->asoc);
        memcpy(&status.sstat_primary.spinfo_address, &primary->ipaddr,
               primary->af_specific->sockaddr_len);
        /* Map ipv4 address into v4-mapped-on-v6 address.  */
        sctp_get_pf_specific(sk->sk_family)->addr_to_user(sctp_sk(sk),
                (union sctp_addr *)&status.sstat_primary.spinfo_address);
        status.sstat_primary.spinfo_state = primary->state;
        status.sstat_primary.spinfo_cwnd = primary->cwnd;
        status.sstat_primary.spinfo_srtt = primary->srtt;
        status.sstat_primary.spinfo_rto = jiffies_to_msecs(primary->rto);
        status.sstat_primary.spinfo_mtu = primary->pathmtu;

        /* An SCTP_UNKNOWN path is reported to user space as SCTP_ACTIVE. */
        if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN)
                status.sstat_primary.spinfo_state = SCTP_ACTIVE;

        if (put_user(len, optlen))
                return -EFAULT;

        pr_debug("%s: len:%d, state:%d, rwnd:%d, assoc_id:%d\n",
                 __func__, len, status.sstat_state, status.sstat_rwnd,
                 status.sstat_assoc_id);

        if (copy_to_user(optval, &status, len))
                return -EFAULT;

        return 0;
}


/* 7.2.2 Peer Address Information (SCTP_GET_PEER_ADDR_INFO)
 *
 * Lets an application read information about one specific peer address
 * of an association: its reachability state, congestion window and
 * retransmission timer values.  This information is read-only.
 *
 * The caller's buffer supplies the peer address and association id on
 * input and receives the filled struct sctp_paddrinfo on output;
 * *optlen is set to sizeof(struct sctp_paddrinfo).
 *
 * Returns 0 on success, -EINVAL if @len is too small or no transport
 * matches, -EACCES if the transport is in the SCTP_PF state while the
 * association has pf_expose set to SCTP_PF_EXPOSE_DISABLE, and -EFAULT
 * on a failed user-space copy.
 */
static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
                                          char __user *optval,
                                          int __user *optlen)
{
        struct sctp_transport *peer;
        struct sctp_paddrinfo pinfo;

        if (len < sizeof(pinfo))
                return -EINVAL;

        len = sizeof(pinfo);
        if (copy_from_user(&pinfo, optval, len))
                return -EFAULT;

        peer = sctp_addr_id2transport(sk, &pinfo.spinfo_address,
                                      pinfo.spinfo_assoc_id);
        if (!peer)
                return -EINVAL;

        if (peer->state == SCTP_PF &&
            peer->asoc->pf_expose == SCTP_PF_EXPOSE_DISABLE)
                return -EACCES;

        pinfo.spinfo_assoc_id = sctp_assoc2id(peer->asoc);
        pinfo.spinfo_state = peer->state;
        pinfo.spinfo_cwnd = peer->cwnd;
        pinfo.spinfo_srtt = peer->srtt;
        pinfo.spinfo_rto = jiffies_to_msecs(peer->rto);
        pinfo.spinfo_mtu = peer->pathmtu;

        /* An SCTP_UNKNOWN path is reported to user space as SCTP_ACTIVE. */
        if (pinfo.spinfo_state == SCTP_UNKNOWN)
                pinfo.spinfo_state = SCTP_ACTIVE;

        if (put_user(len, optlen))
                return -EFAULT;

        if (copy_to_user(optval, &pinfo, len))
                return -EFAULT;

        return 0;
}

/* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS)
 *
 * This option is an on/off flag.  If enabled no SCTP message
 * fragmentation will be performed.  Instead if a message being sent
 * exceeds the current PMTU size, the message will NOT be sent and
 * instead an error will be indicated to the user.
 *
 * Writes an int to optval: 1 when the socket's disable_fragments field
 * equals 1, otherwise 0.  Returns 0 on success, -EINVAL when the buffer
 * is smaller than an int, -EFAULT when a user-space copy fails.
 */
static int sctp_getsockopt_disable_fragments(struct sock *sk, int len,
                                        char __user *optval, int __user *optlen)
{
        int flag;

        if (len < sizeof(flag))
                return -EINVAL;

        len = sizeof(flag);
        flag = (sctp_sk(sk)->disable_fragments == 1);

        if (put_user(len, optlen) || copy_to_user(optval, &flag, len))
                return -EFAULT;

        return 0;
}

/* 7.1.15 Set notification and ancillary events (SCTP_EVENTS)
 *
 * This socket option is used to specify various notifications and
 * ancillary data the user wishes to receive.
 *
 * On retrieval, byte i of the returned struct sctp_event_subscribe is
 * the result of sctp_ulpevent_type_enabled() for event type
 * SCTP_SN_TYPE_BASE + i.  At most sizeof(struct sctp_event_subscribe)
 * bytes are produced; a shorter user buffer receives a truncated view.
 *
 * Returns 0 on success, -EINVAL for a zero-length buffer, -EFAULT when
 * a user-space copy fails.
 */
static int sctp_getsockopt_events(struct sock *sk, int len, char __user *optval,
                                  int __user *optlen)
{
        struct sctp_event_subscribe subscribe;
        struct sctp_sock *sp = sctp_sk(sk);
        __u8 *flag = (__u8 *)&subscribe;
        int idx = 0;

        if (!len)
                return -EINVAL;
        if (len > sizeof(subscribe))
                len = sizeof(subscribe);
        if (put_user(len, optlen))
                return -EFAULT;

        /* Only the first len bytes are filled in and copied out. */
        while (idx < len) {
                flag[idx] = sctp_ulpevent_type_enabled(sp->subscribe,
                                                       SCTP_SN_TYPE_BASE + idx);
                idx++;
        }

        if (copy_to_user(optval, &subscribe, len))
                return -EFAULT;

        return 0;
}

/* 7.1.8 Automatic Close of associations (SCTP_AUTOCLOSE)
 *
 * This socket option is applicable to the UDP-style socket only.  When
 * set it will cause associations that are idle for more than the
 * specified number of seconds to automatically close.  An association
 * being idle is defined as an association that has NOT sent or received
 * user data.  The special value of '0' indicates that no automatic
 * close of any associations should be performed.  The option expects an
 * integer defining the number of seconds of idle time before an
 * association is closed.
 *
 * Returns 0 on success, -EOPNOTSUPP on a TCP-style socket, -EINVAL when
 * the buffer is smaller than an int, -EFAULT when a user-space write
 * fails.
 */
static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optval, int __user *optlen)
{
        int __user *uval = (int __user *)optval;

        /* Applicable to UDP-style socket only */
        if (sctp_style(sk, TCP))
                return -EOPNOTSUPP;

        if (len < sizeof(int))
                return -EINVAL;

        len = sizeof(int);
        if (put_user(len, optlen) ||
            put_user(sctp_sk(sk)->autoclose, uval))
                return -EFAULT;

        return 0;
}

/* Helper routine to branch off an association to a new socket.
 *
 * @sk:    socket currently owning the association
 * @id:    identifier of the association to peel off
 * @sockp: on a successful migration receives the new socket; set to
 *         NULL if migrating the association fails.  It is left
 *         untouched when an earlier check or the socket creation fails.
 *
 * Returns 0 on success, -EINVAL when the caller's network namespace
 * differs from the socket's, the association is not found, or @sk is
 * not a UDP-style socket; otherwise the error from sock_create() or
 * sctp_sock_migrate().
 */
int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp)
{
        struct sctp_association *asoc = sctp_id2assoc(sk, id);
        struct sctp_sock *sp = sctp_sk(sk);
        struct socket *newsock;
        int rc;

        /* Do not peel off from one netns to another one. */
        if (!net_eq(current->nsproxy->net_ns, sock_net(sk)))
                return -EINVAL;

        if (!asoc)
                return -EINVAL;

        /* An association cannot be branched off from an already peeled-off
         * socket, nor is this supported for tcp style sockets.
         */
        if (!sctp_style(sk, UDP))
                return -EINVAL;

        /* Create a new socket.  */
        rc = sock_create(sk->sk_family, SOCK_SEQPACKET, IPPROTO_SCTP,
                         &newsock);
        if (rc < 0)
                return rc;

        sctp_copy_sock(newsock->sk, sk, asoc);

        /* Make peeled-off sockets more like 1-1 accepted sockets.
         * Set the daddr and initialize id to something more random and also
         * copy over any ip options.
         */
        sp->pf->to_sk_daddr(&asoc->peer.primary_addr, newsock->sk);
        sp->pf->copy_ip_options(sk, newsock->sk);

        /* Populate the fields of the newsk from the oldsk and migrate the
         * asoc to the newsk.
         */
        rc = sctp_sock_migrate(sk, newsock->sk, asoc,
                               SCTP_SOCKET_UDP_HIGH_BANDWIDTH);
        if (rc) {
                /* Migration failed: drop the new socket, hand back NULL. */
                sock_release(newsock);
                *sockp = NULL;
                return rc;
        }

        *sockp = newsock;

        return 0;
}
EXPORT_SYMBOL(sctp_do_peeloff);

static int sctp_getsockopt_peeloff_common(struct sock *sk, sctp_peeloff_arg_t *peeloff,
                                          struct file **newfile, unsigned flags)
{
        /* Peels off peeloff->associd into a new socket, reserves a file
         * descriptor for it and allocates the backing struct file.  The
         * descriptor is stored in peeloff->sd and returned, but it is NOT
         * installed here: the caller must fd_install() it, or undo it
         * with fput()/put_unused_fd().  Returns a negative errno on
         * failure.
         */
        struct socket *newsock;
        struct file *filp;
        int err;
        int fd;

        err = sctp_do_peeloff(sk, peeloff->associd, &newsock);
        if (err < 0)
                return err;

        /* Map the socket to an unused fd that can be returned to the user.  */
        fd = get_unused_fd_flags(flags & SOCK_CLOEXEC);
        if (fd < 0) {
                sock_release(newsock);
                return fd;
        }

        filp = sock_alloc_file(newsock, 0, NULL);
        if (IS_ERR(filp)) {
                put_unused_fd(fd);
                *newfile = NULL;
                return PTR_ERR(filp);
        }
        *newfile = filp;

        pr_debug("%s: sk:%p, newsk:%p, sd:%d\n", __func__, sk, newsock->sk,
                 fd);

        peeloff->sd = fd;

        if (flags & SOCK_NONBLOCK)
                filp->f_flags |= O_NONBLOCK;

        return fd;
}

static int sctp_getsockopt_peeloff(struct sock *sk, int len, char __user *optval, int __user *optlen)
{
        /* Reads a sctp_peeloff_arg_t from user space, peels off the named
         * association and writes the argument (with its sd field filled
         * in) back.  Returns the new descriptor on success, -EINVAL for a
         * short buffer, -EFAULT for a failed user copy, or the error from
         * the common peel-off helper.
         */
        struct file *filp = NULL;
        sctp_peeloff_arg_t arg;
        int fd;

        if (len < sizeof(arg))
                return -EINVAL;

        len = sizeof(arg);
        if (copy_from_user(&arg, optval, len))
                return -EFAULT;

        fd = sctp_getsockopt_peeloff_common(sk, &arg, &filp, 0);
        if (fd < 0)
                return fd;

        /* Return the fd mapped to the new socket.  */
        if (put_user(len, optlen) || copy_to_user(optval, &arg, len)) {
                /* The fd was never installed, so undo both halves. */
                fput(filp);
                put_unused_fd(fd);
                return -EFAULT;
        }

        fd_install(fd, filp);

        return fd;
}

static int sctp_getsockopt_peeloff_flags(struct sock *sk, int len,
                                         char __user *optval, int __user *optlen)
{
        /* Same as the plain peel-off option, but takes a
         * sctp_peeloff_flags_arg_t whose flags member is forwarded to the
         * common helper.  Returns the new descriptor on success, -EINVAL
         * for a short buffer, -EFAULT for a failed user copy, or the
         * error from the common peel-off helper.
         */
        sctp_peeloff_flags_arg_t arg;
        struct file *filp = NULL;
        int fd;

        if (len < sizeof(arg))
                return -EINVAL;

        len = sizeof(arg);
        if (copy_from_user(&arg, optval, len))
                return -EFAULT;

        fd = sctp_getsockopt_peeloff_common(sk, &arg.p_arg, &filp,
                                            arg.flags);
        if (fd < 0)
                return fd;

        /* Return the fd mapped to the new socket.  */
        if (put_user(len, optlen) || copy_to_user(optval, &arg, len)) {
                /* The fd was never installed, so undo both halves. */
                fput(filp);
                put_unused_fd(fd);
                return -EFAULT;
        }

        fd_install(fd, filp);

        return fd;
}

/* 7.1.13 Peer Address Parameters (SCTP_PEER_ADDR_PARAMS)
 *
 * Applications can enable or disable heartbeats for any peer address of
 * an association, modify an address's heartbeat interval, force a
 * heartbeat to be sent immediately, and adjust the address's maximum
 * number of retransmissions sent before an address is considered
 * unreachable.  The following structure is used to access and modify an
 * address's parameters:
 *
 *  struct sctp_paddrparams {
 *     sctp_assoc_t            spp_assoc_id;
 *     struct sockaddr_storage spp_address;
 *     uint32_t                spp_hbinterval;
 *     uint16_t                spp_pathmaxrxt;
 *     uint32_t                spp_pathmtu;
 *     uint32_t                spp_sackdelay;
 *     uint32_t                spp_flags;
 * };
 *
 *   spp_assoc_id    - (one-to-many style socket) This is filled in by
 *                     the application, and identifies the association
 *                     for this query.
 *   spp_address     - This specifies which address is of interest.
 *   spp_hbinterval  - This contains the value of the heartbeat interval,
 *                     in milliseconds.  If a  value of zero
 *                     is present in this field then no changes are to
 *                     be made to this parameter.
 *   spp_pathmaxrxt  - This contains the maximum number of
 *                     retransmissions before this address shall be
 *                     considered unreachable. If a  value of zero
 *                     is present in this field then no changes are to
 *                     be made to this parameter.
 *   spp_pathmtu     - When Path MTU discovery is disabled the value
 *                     specified here will be the "fixed" path mtu.
 *                     Note that if the spp_address field is empty
 *                     then all associations on this address will
 *                     have this fixed path mtu set upon them.
 *
 *   spp_sackdelay   - When delayed sack is enabled, this value specifies
 *                     the number of milliseconds that sacks will be delayed
 *                     for. This value will apply to all addresses of an
 *                     association if the spp_address field is empty. Note
 *                     also, that if delayed sack is enabled and this
 *                     value is set to 0, no change is made to the last
 *                     recorded delayed sack timer value.
 *
 *   spp_flags       - These flags are used to control various features
 *                     on an association. The flag field may contain
 *                     zero or more of the following options.
 *
 *                     SPP_HB_ENABLE  - Enable heartbeats on the
 *                     specified address. Note that if the address
 *                     field is empty all addresses for the association
 *                     have heartbeats enabled upon them.
 *
 *                     SPP_HB_DISABLE - Disable heartbeats on the
 *                     specified address. Note that if the address
 *                     field is empty all addresses for the association
 *                     will have their heartbeats disabled. Note also
 *                     that SPP_HB_ENABLE and SPP_HB_DISABLE are
 *                     mutually exclusive, only one of these two should
 *                     be specified. Enabling both fields will have
 *                     undetermined results.
 *
 *                     SPP_HB_DEMAND - Request a user initiated heartbeat
 *                     to be made immediately.
 *
 *                     SPP_PMTUD_ENABLE - This field will enable PMTU
 *                     discovery upon the specified address. Note that
 *                     if the address field is empty then all addresses
 *                     on the association are affected.
 *
 *                     SPP_PMTUD_DISABLE - This field will disable PMTU
 *                     discovery upon the specified address. Note that
 *                     if the address field is empty then all addresses
 *                     on the association are affected. Note also that
 *                     SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
 *                     exclusive. Enabling both will have undetermined
 *                     results.
 *
 *                     SPP_SACKDELAY_ENABLE - Setting this flag turns
 *                     on delayed sack. The time specified in spp_sackdelay
 *                     is used to specify the sack delay for this address. Note
 *                     that if spp_address is empty then all addresses will
 *                     enable delayed sack and take on the sack delay
 *                     value specified in spp_sackdelay.
 *                     SPP_SACKDELAY_DISABLE - Setting this flag turns
 *                     off delayed sack. If the spp_address field is blank then
 *                     delayed sack is disabled for the entire association. Note
 *                     also that this field is mutually exclusive to
 *                     SPP_SACKDELAY_ENABLE, setting both will have undefined
 *                     results.
 *
 *                     SPP_IPV6_FLOWLABEL:  Setting this flag enables the
 *                     setting of the IPV6 flow label value.  The value is
 *                     contained in the spp_ipv6_flowlabel field.
 *                     Upon retrieval, this flag will be set to indicate that
 *                     the spp_ipv6_flowlabel field has a valid value returned.
 *                     If a specific destination address is set (in the
 *                     spp_address field), then the value returned is that of
 *                     the address.  If just an association is specified (and
 *                     no address), then the association's default flow label
 *                     is returned.  If neither an association nor a destination
 *                     is specified, then the socket's default flow label is
 *                     returned.  For non-IPv6 sockets, this flag will be left
 *                     cleared.
 *
 *                     SPP_DSCP:  Setting this flag enables the setting of the
 *                     Differentiated Services Code Point (DSCP) value
 *                     associated with either the association or a specific
 *                     address.  The value is obtained in the spp_dscp field.
 *                     Upon retrieval, this flag will be set to indicate that
 *                     the spp_dscp field has a valid value returned.  If a
 *                     specific destination address is set when called (in the
 *                     spp_address field), then that specific destination
 *                     address's DSCP value is returned.  If just an association
 *                     is specified, then the association's default DSCP is
 *                     returned.  If neither an association nor a destination is
 *                     specified, then the socket's default DSCP is returned.
 *
 *   spp_ipv6_flowlabel
 *                   - This field is used in conjunction with the
 *                     SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label.
 *                     The 20 least significant bits are used for the flow
 *                     label.  This setting has precedence over any IPv6-layer
 *                     setting.
 *
 *   spp_dscp        - This field is used in conjunction with the SPP_DSCP flag
 *                     and contains the DSCP.  The 6 most significant bits are
 *                     used for the DSCP.  This setting has precedence over any
 *                     IPv4- or IPv6- layer setting.
 */
static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
                                            char __user *optval, int __user *optlen)
{
        /* Length of the structure up to, but not including,
         * spp_ipv6_flowlabel, rounded up to 4 bytes.  A buffer of at
         * least this size is accepted and handled at this shorter length.
         */
        const size_t short_len = ALIGN(offsetof(struct sctp_paddrparams,
                                                spp_ipv6_flowlabel), 4);
        struct sctp_sock        *sp = sctp_sk(sk);
        struct sctp_association *asoc;
        struct sctp_transport   *trans = NULL;
        struct sctp_paddrparams  params;
        __u32 flowlabel;
        __u32 dscp;

        if (len >= sizeof(params))
                len = sizeof(params);
        else if (len >= short_len)
                len = short_len;
        else
                return -EINVAL;

        if (copy_from_user(&params, optval, len))
                return -EFAULT;

        /* If an address other than INADDR_ANY is specified, and
         * no transport is found, then the request is invalid.
         */
        if (!sctp_is_any(sk, (union sctp_addr *)&params.spp_address)) {
                trans = sctp_addr_id2transport(sk, &params.spp_address,
                                               params.spp_assoc_id);
                if (!trans) {
                        pr_debug("%s: failed no transport\n", __func__);
                        return -EINVAL;
                }
        }

        /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the
         * socket is a one to many style socket, and an association
         * was not found, then the id was invalid.
         */
        asoc = sctp_id2assoc(sk, params.spp_assoc_id);
        if (!asoc && params.spp_assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP)) {
                pr_debug("%s: failed no association\n", __func__);
                return -EINVAL;
        }

        /* Pick the most specific source available: transport, then
         * association, then the socket itself.  draft-11 doesn't say
         * what to return in spp_flags, so the stored param_flags are
         * reported.
         */
        if (trans) {
                /* Fetch transport values. */
                params.spp_hbinterval = jiffies_to_msecs(trans->hbinterval);
                params.spp_pathmtu    = trans->pathmtu;
                params.spp_pathmaxrxt = trans->pathmaxrxt;
                params.spp_sackdelay  = jiffies_to_msecs(trans->sackdelay);
                params.spp_flags      = trans->param_flags;
                flowlabel             = trans->flowlabel;
                dscp                  = trans->dscp;
        } else if (asoc) {
                /* Fetch association values. */
                params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval);
                params.spp_pathmtu    = asoc->pathmtu;
                params.spp_pathmaxrxt = asoc->pathmaxrxt;
                params.spp_sackdelay  = jiffies_to_msecs(asoc->sackdelay);
                params.spp_flags      = asoc->param_flags;
                flowlabel             = asoc->flowlabel;
                dscp                  = asoc->dscp;
        } else {
                /* Fetch socket values (copied without jiffies conversion). */
                params.spp_hbinterval = sp->hbinterval;
                params.spp_pathmtu    = sp->pathmtu;
                params.spp_sackdelay  = sp->sackdelay;
                params.spp_pathmaxrxt = sp->pathmaxrxt;
                params.spp_flags      = sp->param_flags;
                flowlabel             = sp->flowlabel;
                dscp                  = sp->dscp;
        }

        /* Report flow label / DSCP only when the "set" marker bit is on. */
        if (flowlabel & SCTP_FLOWLABEL_SET_MASK) {
                params.spp_ipv6_flowlabel = flowlabel &
                                            SCTP_FLOWLABEL_VAL_MASK;
                params.spp_flags |= SPP_IPV6_FLOWLABEL;
        }
        if (dscp & SCTP_DSCP_SET_MASK) {
                params.spp_dscp = dscp & SCTP_DSCP_VAL_MASK;
                params.spp_flags |= SPP_DSCP;
        }

        if (copy_to_user(optval, &params, len))
                return -EFAULT;

        if (put_user(len, optlen))
                return -EFAULT;

        return 0;
}

/*
 * 7.1.23.  Get or set delayed ack timer (SCTP_DELAYED_SACK)
 *
 * This option will affect the way delayed acks are performed.  This
 * option allows you to get or set the delayed ack time, in
 * milliseconds.  It also allows changing the delayed ack frequency.
 * Changing the frequency to 1 disables the delayed sack algorithm.  If
 * the assoc_id is 0, then this sets or gets the endpoints default
 * values.  If the assoc_id field is non-zero, then the set or get
 * affects the specified association for the one to many model (the
 * assoc_id field is ignored by the one to one model).  Note that if
 * sack_delay or sack_freq are 0 when setting this option, then the
 * current values will remain unchanged.
 *
 * struct sctp_sack_info {
 *     sctp_assoc_t            sack_assoc_id;
 *     uint32_t                sack_delay;
 *     uint32_t                sack_freq;
 * };
 *
 * sack_assoc_id -  This parameter indicates which association the user
 *    is performing an action upon.  Note that if this field's value is
 *    zero then the endpoints default value is changed (affecting future
 *    associations only).
 *
 * sack_delay -  This parameter contains the number of milliseconds that
 *    the user is requesting the delayed ACK timer be set to.  Note that
 *    this value is defined in the standard to be between 200 and 500
 *    milliseconds.
 *
 * sack_freq -  This parameter contains the number of packets that must
 *    be received before a sack is sent without waiting for the delay
 *    timer to expire.  The default value for this is 2, setting this
 *    value to 1 will disable the delayed sack algorithm.
 */
static int sctp_getsockopt_delayed_ack(struct sock *sk, int len,
                                            char __user *optval,
                                            int __user *optlen)
{
        struct sctp_sock        *sp = sctp_sk(sk);
        struct sctp_association *asoc;
        struct sctp_sack_info    params;

        /* Accept either a full struct sctp_sack_info or, with a
         * deprecation warning, exactly a struct sctp_assoc_value.  In
         * the latter case only that many bytes are read and written back.
         */
        if (len >= sizeof(params)) {
                len = sizeof(params);
        } else if (len == sizeof(struct sctp_assoc_value)) {
                pr_warn_ratelimited(DEPRECATED
                                    "%s (pid %d) "
                                    "Use of struct sctp_assoc_value in delayed_ack socket option.\n"
                                    "Use struct sctp_sack_info instead\n",
                                    current->comm, task_pid_nr(current));
        } else {
                return -EINVAL;
        }

        if (copy_from_user(&params, optval, len))
                return -EFAULT;

        /* Get association, if sack_assoc_id != SCTP_FUTURE_ASSOC and the
         * socket is a one to many style socket, and an association
         * was not found, then the id was invalid.
         */
        asoc = sctp_id2assoc(sk, params.sack_assoc_id);
        if (!asoc && params.sack_assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        /* Values reported when delayed sack is not enabled. */
        params.sack_delay = 0;
        params.sack_freq = 1;

        if (asoc) {
                /* Fetch association values. */
                if (asoc->param_flags & SPP_SACKDELAY_ENABLE) {
                        params.sack_delay = jiffies_to_msecs(asoc->sackdelay);
                        params.sack_freq = asoc->sackfreq;
                }
        } else if (sp->param_flags & SPP_SACKDELAY_ENABLE) {
                /* Fetch socket values. */
                params.sack_delay = sp->sackdelay;
                params.sack_freq = sp->sackfreq;
        }

        if (copy_to_user(optval, &params, len))
                return -EFAULT;

        if (put_user(len, optlen))
                return -EFAULT;

        return 0;
}

/* 7.1.3 Initialization Parameters (SCTP_INITMSG)
 *
 * Applications can specify protocol parameters for the default association
 * initialization.  The option name argument to setsockopt() and getsockopt()
 * is SCTP_INITMSG.
 *
 * Setting initialization parameters is effective only on an unconnected
 * socket (for UDP-style sockets only future associations are affected
 * by the change).  With TCP-style sockets, this option is inherited by
 * sockets derived from a listener socket.
 *
 * The getter copies the socket's stored initmsg to user space.  Returns
 * 0 on success, -EINVAL when the buffer is smaller than
 * struct sctp_initmsg, -EFAULT when a user-space copy fails.
 */
static int sctp_getsockopt_initmsg(struct sock *sk, int len, char __user *optval, int __user *optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);

        if (len < sizeof(struct sctp_initmsg))
                return -EINVAL;

        len = sizeof(struct sctp_initmsg);
        if (put_user(len, optlen) ||
            copy_to_user(optval, &sp->initmsg, len))
                return -EFAULT;

        return 0;
}


/* Copy the peer transport addresses of an association to user space.
 *
 * The user buffer starts with a struct sctp_getaddrs whose assoc_id
 * selects the association.  The addresses are written back to back
 * starting at the addrs member, addr_num receives how many were
 * written, and *optlen receives the total number of bytes used
 * (header offset included).
 *
 * Returns 0 on success, -EINVAL for a short buffer or unknown
 * association, -ENOMEM when the buffer cannot hold the next address,
 * -EFAULT when a user-space copy fails.
 */
static int sctp_getsockopt_peer_addrs(struct sock *sk, int len,
                                      char __user *optval, int __user *optlen)
{
        const size_t hdr_len = offsetof(struct sctp_getaddrs, addrs);
        struct sctp_getaddrs __user *ureq =
                (struct sctp_getaddrs __user *)optval;
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_association *asoc;
        struct sctp_transport *peer;
        struct sctp_getaddrs req;
        union sctp_addr addr;
        void __user *dst;
        size_t room;
        int naddrs = 0;
        int alen;
        int used;

        if (len < sizeof(struct sctp_getaddrs))
                return -EINVAL;

        if (copy_from_user(&req, optval, sizeof(struct sctp_getaddrs)))
                return -EFAULT;

        /* For UDP-style sockets, id specifies the association to query.  */
        asoc = sctp_id2assoc(sk, req.assoc_id);
        if (!asoc)
                return -EINVAL;

        dst = optval + hdr_len;
        room = len - hdr_len;

        list_for_each_entry(peer, &asoc->peer.transport_addr_list,
                            transports) {
                /* Convert a private copy; addr_to_user() returns the
                 * length of the user-visible form.
                 */
                memcpy(&addr, &peer->ipaddr, sizeof(addr));
                alen = sctp_get_pf_specific(sk->sk_family)
                              ->addr_to_user(sp, &addr);
                if (room < alen)
                        return -ENOMEM;
                if (copy_to_user(dst, &addr, alen))
                        return -EFAULT;
                dst += alen;
                room -= alen;
                naddrs++;
        }

        if (put_user(naddrs, &ureq->addr_num))
                return -EFAULT;

        used = ((char __user *)dst) - optval;
        if (put_user(used, optlen))
                return -EFAULT;

        return 0;
}

/* Copy the valid entries of the namespace's SCTP local address list
 * into the kernel buffer @to, converted with addr_to_user().
 *
 * @sk:           socket whose family decides which entries are skipped
 * @port:         host-order port stored into entries whose port is zero
 * @to:           destination buffer
 * @space_left:   bytes available in @to
 * @bytes_copied: incremented by the number of bytes written
 *
 * Returns the number of addresses copied, or -ENOMEM when @to cannot
 * hold the next address (*bytes_copied still counts what was written
 * before that point).
 */
static int sctp_copy_laddrs(struct sock *sk, __u16 port, void *to,
                            size_t space_left, int *bytes_copied)
{
        struct net *net = sock_net(sk);
        struct sctp_sockaddr_entry *entry;
        union sctp_addr tmp;
        int copied = 0;
        int alen;

        rcu_read_lock();
        list_for_each_entry_rcu(entry, &net->sctp.local_addr_list, list) {
                if (!entry->valid)
                        continue;

                /* An IPv4 socket never reports IPv6 addresses. */
                if (sk->sk_family == PF_INET &&
                    entry->a.sa.sa_family == AF_INET6)
                        continue;

                /* A v6-only IPv6 socket never reports IPv4 addresses. */
                if (sk->sk_family == PF_INET6 &&
                    inet_v6_ipv6only(sk) &&
                    entry->a.sa.sa_family == AF_INET)
                        continue;

                memcpy(&tmp, &entry->a, sizeof(tmp));
                if (!tmp.v4.sin_port)
                        tmp.v4.sin_port = htons(port);

                alen = sctp_get_pf_specific(sk->sk_family)
                              ->addr_to_user(sctp_sk(sk), &tmp);

                if (space_left < alen) {
                        copied = -ENOMEM;
                        break;
                }

                memcpy(to, &tmp, alen);
                to += alen;
                space_left -= alen;
                *bytes_copied += alen;
                copied++;
        }
        rcu_read_unlock();

        return copied;
}


/* getsockopt handler that returns local addresses.
 *
 * Reads a struct sctp_getaddrs header from @optval, collects the addresses
 * in a temporary kernel buffer and copies them into the addrs[] area of the
 * user's structure.  On success addr_num is set to the number of addresses
 * and *@optlen to the number of address bytes written.
 *
 * Returns 0 on success; -EINVAL when @len is smaller than the header or the
 * association id is unknown; -ENOMEM when the kernel buffer cannot be
 * allocated or the user buffer cannot hold all addresses; -EFAULT on a
 * failed user-space access.
 */
static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
                                       char __user *optval, int __user *optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_sockaddr_entry *entry;
        struct sctp_association *asoc;
        struct sctp_getaddrs getaddrs;
        struct sctp_bind_addr *bp;
        union sctp_addr ua;
        void __user *uaddrs;
        size_t room;
        void *kbuf;
        void *pos;
        int total = 0;
        int count = 0;
        int ulen;
        int err = 0;

        if (len < sizeof(getaddrs))
                return -EINVAL;

        if (copy_from_user(&getaddrs, optval, sizeof(getaddrs)))
                return -EFAULT;

        /*
         *  For UDP-style sockets, id specifies the association to query.
         *  An id of '0' selects the endpoint's own bound addresses,
         *  without regard to any particular association.
         */
        if (getaddrs.assoc_id == 0) {
                bp = &sp->ep->base.bind_addr;
        } else {
                asoc = sctp_id2assoc(sk, getaddrs.assoc_id);
                if (!asoc)
                        return -EINVAL;
                bp = &asoc->base.bind_addr;
        }

        /* Addresses go right after the fixed part of the user structure. */
        uaddrs = optval + offsetof(struct sctp_getaddrs, addrs);
        room = len - offsetof(struct sctp_getaddrs, addrs);

        kbuf = kmalloc(room, GFP_USER | __GFP_NOWARN);
        if (!kbuf)
                return -ENOMEM;

        /* If the endpoint is bound to 0.0.0.0 or ::0, get the valid
         * addresses from the global local address list.
         */
        if (sctp_list_single_entry(&bp->address_list)) {
                entry = list_entry(bp->address_list.next,
                                   struct sctp_sockaddr_entry, list);
                if (sctp_is_any(sk, &entry->a)) {
                        count = sctp_copy_laddrs(sk, bp->port, kbuf,
                                                 room, &total);
                        if (count < 0) {
                                err = count;
                                goto free_buf;
                        }
                        goto to_user;
                }
        }

        pos = kbuf;
        /* Protection on the bound address list is not needed since
         * in the socket option context we hold a socket lock and
         * thus the bound address list can't change.
         */
        list_for_each_entry(entry, &bp->address_list, list) {
                memcpy(&ua, &entry->a, sizeof(ua));
                ulen = sctp_get_pf_specific(sk->sk_family)
                                ->addr_to_user(sp, &ua);
                if (room < ulen) {
                        err = -ENOMEM; /*fixme: right error?*/
                        goto free_buf;
                }
                memcpy(pos, &ua, ulen);
                pos += ulen;
                room -= ulen;
                total += ulen;
                count++;
        }

to_user:
        if (copy_to_user(uaddrs, kbuf, total)) {
                err = -EFAULT;
                goto free_buf;
        }
        if (put_user(count,
                     &((struct sctp_getaddrs __user *)optval)->addr_num)) {
                err = -EFAULT;
                goto free_buf;
        }
        /* XXX: We should have accounted for sizeof(struct sctp_getaddrs) too,
         * but we can't change it anymore.
         */
        if (put_user(total, optlen))
                err = -EFAULT;
free_buf:
        kfree(kbuf);
        return err;
}

/* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR)
 *
 * Requests that the local SCTP stack use the enclosed peer address as
 * the association primary.  The enclosed address must be one of the
 * association peer's addresses.
 */
static int sctp_getsockopt_primary_addr(struct sock *sk, int len,
                                        char __user *optval, int __user *optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_association *asoc;
        struct sctp_prim prim;

        /* The user buffer must hold a full struct sctp_prim; anything
         * beyond that size is ignored.
         */
        if (len < sizeof(prim))
                return -EINVAL;
        len = sizeof(prim);

        /* The association id is taken from the caller's structure. */
        if (copy_from_user(&prim, optval, len))
                return -EFAULT;

        asoc = sctp_id2assoc(sk, prim.ssp_assoc_id);
        if (!asoc)
                return -EINVAL;

        /* No primary path recorded for this association. */
        if (!asoc->peer.primary_path)
                return -ENOTCONN;

        memcpy(&prim.ssp_addr, &asoc->peer.primary_path->ipaddr,
               asoc->peer.primary_path->af_specific->sockaddr_len);

        /* Convert the address in place to the form handed to user space. */
        sctp_get_pf_specific(sk->sk_family)
                ->addr_to_user(sp, (union sctp_addr *)&prim.ssp_addr);

        if (put_user(len, optlen) || copy_to_user(optval, &prim, len))
                return -EFAULT;

        return 0;
}

/*
 * 7.1.11  Set Adaptation Layer Indicator (SCTP_ADAPTATION_LAYER)
 *
 * Requests that the local endpoint set the specified Adaptation Layer
 * Indication parameter for all future INIT and INIT-ACK exchanges.
 */
static int sctp_getsockopt_adaptation_layer(struct sock *sk, int len,
                                  char __user *optval, int __user *optlen)
{
        struct sctp_setadaptation out;

        /* The caller's buffer must hold the whole structure. */
        if (len < sizeof(out))
                return -EINVAL;
        len = sizeof(out);

        out.ssb_adaptation_ind = sctp_sk(sk)->adaptation_ind;

        /* Report the size written first, then the value itself. */
        if (put_user(len, optlen) || copy_to_user(optval, &out, len))
                return -EFAULT;

        return 0;
}

/*
 *
 * 7.1.14 Set default send parameters (SCTP_DEFAULT_SEND_PARAM)
 *
 *   Applications that wish to use the sendto() system call may wish to
 *   specify a default set of parameters that would normally be supplied
 *   through the inclusion of ancillary data.  This socket option allows
 *   such an application to set the default sctp_sndrcvinfo structure.


 *   The application that wishes to use this socket option simply passes
 *   in to this call the sctp_sndrcvinfo structure defined in Section
 *   5.2.2) The input parameters accepted by this call include
 *   sinfo_stream, sinfo_flags, sinfo_ppid, sinfo_context,
 *   sinfo_timetolive.  The user must provide the sinfo_assoc_id field in
 *   to this call if the caller is using the UDP model.
 *
 *   For getsockopt, it gets the default sctp_sndrcvinfo structure.
 */
static int sctp_getsockopt_default_send_param(struct sock *sk,
                                        int len, char __user *optval,
                                        int __user *optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_association *asoc;
        struct sctp_sndrcvinfo info;

        if (len < sizeof(info))
                return -EINVAL;
        len = sizeof(info);

        /* Fetch the caller's structure; it carries the association id. */
        if (copy_from_user(&info, optval, len))
                return -EFAULT;

        /* On a UDP-style socket a failed lookup is only acceptable when
         * the caller asked for SCTP_FUTURE_ASSOC.
         */
        asoc = sctp_id2assoc(sk, info.sinfo_assoc_id);
        if (!asoc) {
                if (info.sinfo_assoc_id != SCTP_FUTURE_ASSOC &&
                    sctp_style(sk, UDP))
                        return -EINVAL;
        }

        /* Without an association, report the socket-level defaults. */
        if (!asoc) {
                info.sinfo_stream = sp->default_stream;
                info.sinfo_flags = sp->default_flags;
                info.sinfo_ppid = sp->default_ppid;
                info.sinfo_context = sp->default_context;
                info.sinfo_timetolive = sp->default_timetolive;
        } else {
                info.sinfo_stream = asoc->default_stream;
                info.sinfo_flags = asoc->default_flags;
                info.sinfo_ppid = asoc->default_ppid;
                info.sinfo_context = asoc->default_context;
                info.sinfo_timetolive = asoc->default_timetolive;
        }

        if (put_user(len, optlen) || copy_to_user(optval, &info, len))
                return -EFAULT;

        return 0;
}

/* RFC6458, Section 8.1.31. Set/get Default Send Parameters
 * (SCTP_DEFAULT_SNDINFO)
 */
static int sctp_getsockopt_default_sndinfo(struct sock *sk, int len,
                                           char __user *optval,
                                           int __user *optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_association *asoc;
        struct sctp_sndinfo info;

        if (len < sizeof(info))
                return -EINVAL;
        len = sizeof(info);

        /* The association id comes from the caller's structure. */
        if (copy_from_user(&info, optval, len))
                return -EFAULT;

        /* On a UDP-style socket a failed lookup is only acceptable when
         * the caller asked for SCTP_FUTURE_ASSOC.
         */
        asoc = sctp_id2assoc(sk, info.snd_assoc_id);
        if (!asoc) {
                if (info.snd_assoc_id != SCTP_FUTURE_ASSOC &&
                    sctp_style(sk, UDP))
                        return -EINVAL;
        }

        /* Without an association, report the socket-level defaults. */
        if (!asoc) {
                info.snd_sid = sp->default_stream;
                info.snd_flags = sp->default_flags;
                info.snd_ppid = sp->default_ppid;
                info.snd_context = sp->default_context;
        } else {
                info.snd_sid = asoc->default_stream;
                info.snd_flags = asoc->default_flags;
                info.snd_ppid = asoc->default_ppid;
                info.snd_context = asoc->default_context;
        }

        if (put_user(len, optlen) || copy_to_user(optval, &info, len))
                return -EFAULT;

        return 0;
}

/*
 *
 * 7.1.5 SCTP_NODELAY
 *
 * Turn on/off any Nagle-like algorithm.  This means that packets are
 * generally sent as soon as possible and no unnecessary delays are
 * introduced, at the cost of more packets in the network.  Expects an
 * integer boolean flag.
 */

static int sctp_getsockopt_nodelay(struct sock *sk, int len,
                                   char __user *optval, int __user *optlen)
{
        int flag;

        if (len < sizeof(flag))
                return -EINVAL;
        len = sizeof(flag);

        /* Report 1 only when the socket's nodelay field is exactly 1. */
        flag = (sctp_sk(sk)->nodelay == 1) ? 1 : 0;

        if (put_user(len, optlen) || copy_to_user(optval, &flag, len))
                return -EFAULT;

        return 0;
}

/*
 *
 * 7.1.1 SCTP_RTOINFO
 *
 * The protocol parameters used to initialize and bound retransmission
 * timeout (RTO) are tunable. sctp_rtoinfo structure is used to access
 * and modify these parameters.
 * All parameters are time values, in milliseconds.  A value of 0, when
 * modifying the parameters, indicates that the current value should not
 * be changed.
 *
 */
static int sctp_getsockopt_rtoinfo(struct sock *sk, int len,
                                char __user *optval,
                                int __user *optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_association *asoc;
        struct sctp_rtoinfo rtoinfo;

        if (len < sizeof(rtoinfo))
                return -EINVAL;
        len = sizeof(rtoinfo);

        /* The association id comes from the caller's structure. */
        if (copy_from_user(&rtoinfo, optval, len))
                return -EFAULT;

        /* On a UDP-style socket a failed lookup is only acceptable when
         * the caller asked for SCTP_FUTURE_ASSOC.
         */
        asoc = sctp_id2assoc(sk, rtoinfo.srto_assoc_id);
        if (!asoc) {
                if (rtoinfo.srto_assoc_id != SCTP_FUTURE_ASSOC &&
                    sctp_style(sk, UDP))
                        return -EINVAL;
        }

        if (!asoc) {
                /* Endpoint values are copied as stored. */
                rtoinfo.srto_initial = sp->rtoinfo.srto_initial;
                rtoinfo.srto_max = sp->rtoinfo.srto_max;
                rtoinfo.srto_min = sp->rtoinfo.srto_min;
        } else {
                /* Association values are kept in jiffies; convert to ms. */
                rtoinfo.srto_initial = jiffies_to_msecs(asoc->rto_initial);
                rtoinfo.srto_max = jiffies_to_msecs(asoc->rto_max);
                rtoinfo.srto_min = jiffies_to_msecs(asoc->rto_min);
        }

        if (put_user(len, optlen) || copy_to_user(optval, &rtoinfo, len))
                return -EFAULT;

        return 0;
}

/*
 *
 * 7.1.2 SCTP_ASSOCINFO
 *
 * This option is used to tune the maximum retransmission attempts
 * of the association.
 * Returns an error if the new association retransmission value is
 * greater than the sum of the retransmission value  of the peer.
 * See [SCTP] for more information.
 *
 */
static int sctp_getsockopt_associnfo(struct sock *sk, int len,
                                     char __user *optval,
                                     int __user *optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_assocparams assocparams;
        struct sctp_association *asoc;
        struct list_head *node;
        int npeers = 0;

        if (len < sizeof(assocparams))
                return -EINVAL;
        len = sizeof(assocparams);

        /* The association id comes from the caller's structure. */
        if (copy_from_user(&assocparams, optval, len))
                return -EFAULT;

        /* On a UDP-style socket a failed lookup is only acceptable when
         * the caller asked for SCTP_FUTURE_ASSOC.
         */
        asoc = sctp_id2assoc(sk, assocparams.sasoc_assoc_id);
        if (!asoc) {
                if (assocparams.sasoc_assoc_id != SCTP_FUTURE_ASSOC &&
                    sctp_style(sk, UDP))
                        return -EINVAL;
        }

        if (!asoc) {
                /* Values corresponding to the endpoint */
                assocparams.sasoc_asocmaxrxt =
                                sp->assocparams.sasoc_asocmaxrxt;
                assocparams.sasoc_peer_rwnd =
                                sp->assocparams.sasoc_peer_rwnd;
                assocparams.sasoc_local_rwnd =
                                sp->assocparams.sasoc_local_rwnd;
                assocparams.sasoc_cookie_life =
                                sp->assocparams.sasoc_cookie_life;
                assocparams.sasoc_number_peer_destinations =
                                sp->assocparams.sasoc_number_peer_destinations;
        } else {
                /* Values corresponding to the specific association */
                assocparams.sasoc_asocmaxrxt = asoc->max_retrans;
                assocparams.sasoc_peer_rwnd = asoc->peer.rwnd;
                assocparams.sasoc_local_rwnd = asoc->a_rwnd;
                assocparams.sasoc_cookie_life = ktime_to_ms(asoc->cookie_life);

                /* One peer destination per entry on the transport list. */
                list_for_each(node, &asoc->peer.transport_addr_list)
                        npeers++;

                assocparams.sasoc_number_peer_destinations = npeers;
        }

        if (put_user(len, optlen) || copy_to_user(optval, &assocparams, len))
                return -EFAULT;

        return 0;
}

/*
 * 7.1.16 Set/clear IPv4 mapped addresses (SCTP_I_WANT_MAPPED_V4_ADDR)
 *
 * This socket option is a boolean flag which turns on or off mapped V4
 * addresses.  If this option is turned on and the socket is type
 * PF_INET6, then IPv4 addresses will be mapped to V6 representation.
 * If this option is turned off, then no mapping will be done of V4
 * addresses and a user will receive both PF_INET6 and PF_INET type
 * addresses on the socket.
 */
static int sctp_getsockopt_mappedv4(struct sock *sk, int len,
                                    char __user *optval, int __user *optlen)
{
        int mapped;

        if (len < sizeof(mapped))
                return -EINVAL;
        len = sizeof(mapped);

        /* Hand back the socket's v4mapped setting as an int. */
        mapped = sctp_sk(sk)->v4mapped;

        if (put_user(len, optlen) || copy_to_user(optval, &mapped, len))
                return -EFAULT;

        return 0;
}

/*
 * 7.1.29.  Set or Get the default context (SCTP_CONTEXT)
 * (chapter and verse is quoted at sctp_setsockopt_context())
 */
static int sctp_getsockopt_context(struct sock *sk, int len,
                                   char __user *optval, int __user *optlen)
{
        struct sctp_association *asoc;
        struct sctp_assoc_value params;

        if (len < sizeof(params))
                return -EINVAL;
        len = sizeof(params);

        /* The association id comes from the caller's structure. */
        if (copy_from_user(&params, optval, len))
                return -EFAULT;

        /* On a UDP-style socket a failed lookup is only acceptable when
         * the caller asked for SCTP_FUTURE_ASSOC.
         */
        asoc = sctp_id2assoc(sk, params.assoc_id);
        if (!asoc) {
                if (params.assoc_id != SCTP_FUTURE_ASSOC &&
                    sctp_style(sk, UDP))
                        return -EINVAL;
        }

        /* Prefer the association's receive context, else the socket's. */
        if (asoc)
                params.assoc_value = asoc->default_rcv_context;
        else
                params.assoc_value = sctp_sk(sk)->default_rcv_context;

        if (put_user(len, optlen) || copy_to_user(optval, &params, len))
                return -EFAULT;

        return 0;
}

/*
 * 8.1.16.  Get or Set the Maximum Fragmentation Size (SCTP_MAXSEG)
 * This option will get or set the maximum size to put in any outgoing
 * SCTP DATA chunk.  If a message is larger than this size it will be
 * fragmented by SCTP into the specified size.  Note that the underlying
 * SCTP implementation may fragment into smaller sized chunks when the
 * PMTU of the underlying association is smaller than the value set by
 * the user.  The default value for this option is '0' which indicates
 * the user is NOT limiting fragmentation and only the PMTU will effect
 * SCTP's choice of DATA chunk size.  Note also that values set larger
 * than the maximum size of an IP datagram will effectively let SCTP
 * control fragmentation (i.e. the same as setting this option to 0).
 *
 * The following structure is used to access and modify this parameter:
 *
 * struct sctp_assoc_value {
 *   sctp_assoc_t assoc_id;
 *   uint32_t assoc_value;
 * };
 *
 * assoc_id:  This parameter is ignored for one-to-one style sockets.
 *    For one-to-many style sockets this parameter indicates which
 *    association the user is performing an action upon.  Note that if
 *    this field's value is zero then the endpoints default value is
 *    changed (effecting future associations only).
 * assoc_value:  This parameter specifies the maximum size in bytes.
 */
static int sctp_getsockopt_maxseg(struct sock *sk, int len,
                                  char __user *optval, int __user *optlen)
{
        struct sctp_association *asoc;
        struct sctp_assoc_value params;
        const void *src;

        /* Only two layouts are accepted: a bare int (deprecated) or a
         * buffer of at least sizeof(struct sctp_assoc_value).
         */
        if (len != sizeof(int) && len < sizeof(struct sctp_assoc_value))
                return -EINVAL;

        if (len == sizeof(int)) {
                pr_warn_ratelimited(DEPRECATED
                                    "%s (pid %d) "
                                    "Use of int in maxseg socket option.\n"
                                    "Use struct sctp_assoc_value instead\n",
                                    current->comm, task_pid_nr(current));
                /* The int form carries no association id. */
                params.assoc_id = SCTP_FUTURE_ASSOC;
        } else {
                len = sizeof(struct sctp_assoc_value);
                if (copy_from_user(&params, optval, len))
                        return -EFAULT;
        }

        /* On a UDP-style socket a failed lookup is only acceptable when
         * the id is SCTP_FUTURE_ASSOC.
         */
        asoc = sctp_id2assoc(sk, params.assoc_id);
        if (!asoc) {
                if (params.assoc_id != SCTP_FUTURE_ASSOC &&
                    sctp_style(sk, UDP))
                        return -EINVAL;
        }

        params.assoc_value = asoc ? asoc->frag_point
                                  : sctp_sk(sk)->user_frag;

        if (put_user(len, optlen))
                return -EFAULT;

        /* The int form returns just the value, the struct form all of it. */
        if (len == sizeof(int))
                src = &params.assoc_value;
        else
                src = &params;

        if (copy_to_user(optval, src, len))
                return -EFAULT;

        return 0;
}

/*
 * 7.1.24.  Get or set fragmented interleave (SCTP_FRAGMENT_INTERLEAVE)
 * (chapter and verse is quoted at sctp_setsockopt_fragment_interleave())
 */
static int sctp_getsockopt_fragment_interleave(struct sock *sk, int len,
                                               char __user *optval, int __user *optlen)
{
        int level;

        if (len < sizeof(level))
                return -EINVAL;
        len = sizeof(level);

        /* Hand back the socket's frag_interleave setting as an int. */
        level = sctp_sk(sk)->frag_interleave;

        if (put_user(len, optlen) || copy_to_user(optval, &level, len))
                return -EFAULT;

        return 0;
}

/*
 * 7.1.25.  Set or Get the sctp partial delivery point
 * (chapter and verse is quoted at sctp_setsockopt_partial_delivery_point())
 */
static int sctp_getsockopt_partial_delivery_point(struct sock *sk, int len,
                                                  char __user *optval,
                                                  int __user *optlen)
{
        u32 point;

        if (len < sizeof(point))
                return -EINVAL;
        len = sizeof(point);

        /* Hand back the socket's pd_point as a u32. */
        point = sctp_sk(sk)->pd_point;

        if (put_user(len, optlen) || copy_to_user(optval, &point, len))
                return -EFAULT;

        return 0;
}

/*
 * 7.1.28.  Set or Get the maximum burst (SCTP_MAX_BURST)
 * (chapter and verse is quoted at sctp_setsockopt_maxburst())
 */
static int sctp_getsockopt_maxburst(struct sock *sk, int len,
                                    char __user *optval,
                                    int __user *optlen)
{
        struct sctp_assoc_value params;
        struct sctp_association *asoc;

        /* Two layouts are accepted: a bare int (deprecated, no association
         * id) or a buffer of at least sizeof(struct sctp_assoc_value).
         */
        if (len == sizeof(int)) {
                pr_warn_ratelimited(DEPRECATED
                                    "%s (pid %d) "
                                    "Use of int in max_burst socket option.\n"
                                    "Use struct sctp_assoc_value instead\n",
                                    current->comm, task_pid_nr(current));
                params.assoc_id = SCTP_FUTURE_ASSOC;
        } else if (len >= sizeof(struct sctp_assoc_value)) {
                len = sizeof(struct sctp_assoc_value);
                if (copy_from_user(&params, optval, len))
                        return -EFAULT;
        } else
                return -EINVAL;

        /* On a UDP-style socket a failed lookup is only acceptable when
         * the id is SCTP_FUTURE_ASSOC.
         */
        asoc = sctp_id2assoc(sk, params.assoc_id);
        if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        params.assoc_value = asoc ? asoc->max_burst : sctp_sk(sk)->max_burst;

        /* Tell the caller how many bytes are returned.  Without this an
         * oversized buffer would leave *optlen at the caller's original
         * value, unlike the sibling handler sctp_getsockopt_maxseg().
         */
        if (put_user(len, optlen))
                return -EFAULT;

        /* The int form returns just the value, the struct form all of it. */
        if (len == sizeof(int)) {
                if (copy_to_user(optval, &params.assoc_value, len))
                        return -EFAULT;
        } else {
                if (copy_to_user(optval, &params, len))
                        return -EFAULT;
        }

        return 0;
}

/* Return the endpoint's HMAC identifier list as a struct sctp_hmacalgo.
 *
 * Returns -EACCES when ep->auth_enable is clear, -EINVAL when @len cannot
 * hold the header plus every identifier, and -EFAULT on a failed user-space
 * write.  On success *@optlen is set to the number of bytes returned.
 */
static int sctp_getsockopt_hmac_ident(struct sock *sk, int len,
                                    char __user *optval, int __user *optlen)
{
        struct sctp_hmacalgo __user *uhmacs = (void __user *)optval;
        struct sctp_endpoint *ep = sctp_sk(sk)->ep;
        struct sctp_hmac_algo_param *hmacs;
        __u16 ids_len;
        u32 nr_ids;
        u32 idx;

        if (!ep->auth_enable)
                return -EACCES;

        /* Payload length is the parameter length minus its header. */
        hmacs = ep->auth_hmacs_list;
        ids_len = ntohs(hmacs->param_hdr.length) -
                  sizeof(struct sctp_paramhdr);

        if (len < sizeof(struct sctp_hmacalgo) + ids_len)
                return -EINVAL;

        len = sizeof(struct sctp_hmacalgo) + ids_len;
        nr_ids = ids_len / sizeof(u16);

        if (put_user(len, optlen) ||
            put_user(nr_ids, &uhmacs->shmac_num_idents))
                return -EFAULT;

        /* Identifiers are stored in network byte order; convert each one
         * to host order before handing it to user space.
         */
        for (idx = 0; idx < nr_ids; idx++) {
                __u16 id = ntohs(hmacs->hmac_ids[idx]);

                if (copy_to_user(&uhmacs->shmac_idents[idx], &id, sizeof(id)))
                        return -EFAULT;
        }

        return 0;
}

/* Return the active key number of an association or of the endpoint.
 *
 * Returns -EINVAL for a short buffer or an unknown non-zero association id
 * on a UDP-style socket, -EACCES when the association's peer is not
 * auth-capable (or, without an association, when ep->auth_enable is
 * clear), and -EFAULT on a failed user-space access.
 */
static int sctp_getsockopt_active_key(struct sock *sk, int len,
                                    char __user *optval, int __user *optlen)
{
        struct sctp_endpoint *ep = sctp_sk(sk)->ep;
        struct sctp_association *asoc;
        struct sctp_authkeyid key;

        if (len < sizeof(key))
                return -EINVAL;
        len = sizeof(key);

        /* The association id comes from the caller's structure. */
        if (copy_from_user(&key, optval, len))
                return -EFAULT;

        asoc = sctp_id2assoc(sk, key.scact_assoc_id);
        if (!asoc) {
                if (key.scact_assoc_id && sctp_style(sk, UDP))
                        return -EINVAL;
        }

        if (!asoc) {
                if (!ep->auth_enable)
                        return -EACCES;
                key.scact_keynumber = ep->active_key_id;
        } else {
                if (!asoc->peer.auth_capable)
                        return -EACCES;
                key.scact_keynumber = asoc->active_key_id;
        }

        if (put_user(len, optlen) || copy_to_user(optval, &key, len))
                return -EFAULT;

        return 0;
}

/* Return the contents of the association's peer.peer_chunks list in a
 * struct sctp_authchunks.
 *
 * The chunk bytes are written to gauth_chunks[], which follows the fixed
 * header of the user's structure, so @len must cover the header plus all
 * chunk bytes.
 *
 * Returns -EINVAL for a short buffer or unknown association, -EACCES when
 * the peer is not auth-capable, and -EFAULT on a failed user-space access.
 * On success *@optlen is set to header size plus the number of chunk bytes
 * and gauth_number_of_chunks to the number of chunk bytes.
 */
static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len,
                                    char __user *optval, int __user *optlen)
{
        struct sctp_authchunks __user *p = (void __user *)optval;
        struct sctp_authchunks val;
        struct sctp_association *asoc;
        struct sctp_chunks_param *ch;
        u32    num_chunks = 0;
        char __user *to;

        if (len < sizeof(struct sctp_authchunks))
                return -EINVAL;

        if (copy_from_user(&val, optval, sizeof(val)))
                return -EFAULT;

        to = p->gauth_chunks;
        asoc = sctp_id2assoc(sk, val.gauth_assoc_id);
        if (!asoc)
                return -EINVAL;

        if (!asoc->peer.auth_capable)
                return -EACCES;

        /* No chunk list: report zero chunks. */
        ch = asoc->peer.peer_chunks;
        if (!ch)
                goto num;

        /* See if the user provided enough room for all the data.  The
         * chunk bytes land after the structure header, so the header size
         * must be part of the check; comparing against num_chunks alone
         * would allow a write past the end of the caller's buffer.
         */
        num_chunks = ntohs(ch->param_hdr.length) - sizeof(struct sctp_paramhdr);
        if (len < sizeof(struct sctp_authchunks) + num_chunks)
                return -EINVAL;

        if (copy_to_user(to, ch->chunks, num_chunks))
                return -EFAULT;
num:
        len = sizeof(struct sctp_authchunks) + num_chunks;
        if (put_user(len, optlen))
                return -EFAULT;
        if (put_user(num_chunks, &p->gauth_number_of_chunks))
                return -EFAULT;
        return 0;
}

/* Return the local auth chunk list of an association (asoc->c.auth_chunks)
 * or, when no association is selected, of the endpoint
 * (ep->auth_chunk_list), in a struct sctp_authchunks.
 *
 * Returns -EINVAL for a short buffer or an unknown association id on a
 * UDP-style socket, -EACCES when authentication is unavailable for the
 * selected object, and -EFAULT on a failed user-space access.
 */
static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len,
                                    char __user *optval, int __user *optlen)
{
        struct sctp_authchunks __user *uchunks = (void __user *)optval;
        struct sctp_endpoint *ep = sctp_sk(sk)->ep;
        struct sctp_association *asoc;
        struct sctp_chunks_param *list;
        struct sctp_authchunks req;
        u32 nbytes = 0;

        if (len < sizeof(struct sctp_authchunks))
                return -EINVAL;

        if (copy_from_user(&req, optval, sizeof(req)))
                return -EFAULT;

        /* On a UDP-style socket a failed lookup is only acceptable when
         * the caller asked for SCTP_FUTURE_ASSOC.
         */
        asoc = sctp_id2assoc(sk, req.gauth_assoc_id);
        if (!asoc) {
                if (req.gauth_assoc_id != SCTP_FUTURE_ASSOC &&
                    sctp_style(sk, UDP))
                        return -EINVAL;
        }

        if (!asoc) {
                if (!ep->auth_enable)
                        return -EACCES;
                list = ep->auth_chunk_list;
        } else {
                if (!asoc->peer.auth_capable)
                        return -EACCES;
                list = (struct sctp_chunks_param *)asoc->c.auth_chunks;
        }

        if (list) {
                /* Payload length is the parameter length minus its header;
                 * the user buffer must hold the struct header plus payload.
                 */
                nbytes = ntohs(list->param_hdr.length) -
                         sizeof(struct sctp_paramhdr);
                if (len < sizeof(struct sctp_authchunks) + nbytes)
                        return -EINVAL;

                if (copy_to_user(uchunks->gauth_chunks, list->chunks, nbytes))
                        return -EFAULT;
        }

        len = sizeof(struct sctp_authchunks) + nbytes;
        if (put_user(len, optlen) ||
            put_user(nbytes, &uchunks->gauth_number_of_chunks))
                return -EFAULT;

        return 0;
}

/*
 * 8.2.5.  Get the Current Number of Associations (SCTP_GET_ASSOC_NUMBER)
 * This option gets the current number of associations that are attached
 * to a one-to-many style socket.  The option value is an uint32_t.
 */
static int sctp_getsockopt_assoc_number(struct sock *sk, int len,
                                    char __user *optval, int __user *optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_association *asoc;
        u32 count = 0;

        /* Not offered on TCP-style sockets. */
        if (sctp_style(sk, TCP))
                return -EOPNOTSUPP;

        if (len < sizeof(count))
                return -EINVAL;
        len = sizeof(count);

        /* Count the entries on the endpoint's association list. */
        list_for_each_entry(asoc, &(sp->ep->asocs), asocs)
                count++;

        if (put_user(len, optlen) || copy_to_user(optval, &count, len))
                return -EFAULT;

        return 0;
}

/*
 * 8.1.23 SCTP_AUTO_ASCONF
 * See the corresponding setsockopt entry as description
 */
static int sctp_getsockopt_auto_asconf(struct sock *sk, int len,
                                   char __user *optval, int __user *optlen)
{
        int enabled;

        if (len < sizeof(enabled))
                return -EINVAL;
        len = sizeof(enabled);

        /* Reported as on only when do_auto_asconf is set and
         * sctp_is_ep_boundall() is true for this socket.
         */
        if (sctp_sk(sk)->do_auto_asconf && sctp_is_ep_boundall(sk))
                enabled = 1;
        else
                enabled = 0;

        if (put_user(len, optlen) || copy_to_user(optval, &enabled, len))
                return -EFAULT;

        return 0;
}

/*
 * 8.2.6. Get the Current Identifiers of Associations
 *        (SCTP_GET_ASSOC_ID_LIST)
 *
 * This option gets the current list of SCTP association identifiers of
 * the SCTP associations handled by a one-to-many style socket.
 */
static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
                                    char __user *optval, int __user *optlen)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_association *asoc;
        struct sctp_assoc_ids *ids;
        size_t need;
        u32 total = 0;
        u32 slot = 0;
        int err = 0;

        /* Not offered on TCP-style sockets. */
        if (sctp_style(sk, TCP))
                return -EOPNOTSUPP;

        if (len < sizeof(struct sctp_assoc_ids))
                return -EINVAL;

        /* First pass: count associations to size the reply. */
        list_for_each_entry(asoc, &(sp->ep->asocs), asocs)
                total++;

        need = struct_size(ids, gaids_assoc_id, total);
        if (len < need)
                return -EINVAL;
        len = need;

        ids = kmalloc(len, GFP_USER | __GFP_NOWARN);
        if (unlikely(!ids))
                return -ENOMEM;

        /* Second pass: record each association id. */
        ids->gaids_number_of_ids = total;
        list_for_each_entry(asoc, &(sp->ep->asocs), asocs)
                ids->gaids_assoc_id[slot++] = asoc->assoc_id;

        if (put_user(len, optlen) || copy_to_user(optval, ids, len))
                err = -EFAULT;

        kfree(ids);
        return err;
}

/*
 * SCTP_PEER_ADDR_THLDS
 *
 * This option allows us to fetch the partially failed threshold for one or all
 * transports in an association.  See Section 6.1 of:
 * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
 */
static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
                                            char __user *optval, int len,
                                            int __user *optlen, bool v2)
{
        struct sctp_paddrthlds_v2 val;
        struct sctp_association *asoc;
        struct sctp_transport *trans;
        int want;

        /* The v2 request uses the larger structure; both layouts are
         * read into and written from the same v2-sized local.
         */
        if (v2)
                want = sizeof(val);
        else
                want = sizeof(struct sctp_paddrthlds);

        if (len < want)
                return -EINVAL;
        len = want;

        if (copy_from_user(&val, optval, len))
                return -EFAULT;

        if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
                /* A specific address was given: report that transport. */
                trans = sctp_addr_id2transport(sk, &val.spt_address,
                                               val.spt_assoc_id);
                if (!trans)
                        return -ENOENT;

                val.spt_pathmaxrxt = trans->pathmaxrxt;
                val.spt_pathpfthld = trans->pf_retrans;
                val.spt_pathcpthld = trans->ps_retrans;
        } else {
                /* Wildcard address: report the association's values or,
                 * without an association, the socket's.
                 */
                asoc = sctp_id2assoc(sk, val.spt_assoc_id);
                if (!asoc && val.spt_assoc_id != SCTP_FUTURE_ASSOC &&
                    sctp_style(sk, UDP))
                        return -EINVAL;

                if (!asoc) {
                        struct sctp_sock *sp = sctp_sk(sk);

                        val.spt_pathpfthld = sp->pf_retrans;
                        val.spt_pathmaxrxt = sp->pathmaxrxt;
                        val.spt_pathcpthld = sp->ps_retrans;
                } else {
                        val.spt_pathpfthld = asoc->pf_retrans;
                        val.spt_pathmaxrxt = asoc->pathmaxrxt;
                        val.spt_pathcpthld = asoc->ps_retrans;
                }
        }

        if (put_user(len, optlen) || copy_to_user(optval, &val, len))
                return -EFAULT;

        return 0;
}

/*
 * SCTP_GET_ASSOC_STATS
 *
 * This option retrieves local per endpoint statistics. It is modeled
 * after OpenSolaris' implementation
 */
static int sctp_getsockopt_assoc_stats(struct sock *sk, int len,
                                       char __user *optval,
                                       int __user *optlen)
{
        struct sctp_association *asoc;
        struct sctp_assoc_stats out;

        /* The caller has to hand in at least the association id. */
        if (len < sizeof(sctp_assoc_t))
                return -EINVAL;

        /* Never exchange more than the structure holds; a shorter user
         * buffer simply receives a truncated copy.
         */
        if (len > sizeof(out))
                len = sizeof(out);

        if (copy_from_user(&out, optval, len))
                return -EFAULT;

        asoc = sctp_id2assoc(sk, out.sas_assoc_id);
        if (!asoc)
                return -EINVAL;

        /* Packet counters. */
        out.sas_opackets = asoc->stats.opackets;
        out.sas_ipackets = asoc->stats.ipackets;

        /* Chunk counters. */
        out.sas_rtxchunks = asoc->stats.rtxchunks;
        out.sas_octrlchunks = asoc->stats.octrlchunks;
        out.sas_ictrlchunks = asoc->stats.ictrlchunks;
        out.sas_oodchunks = asoc->stats.oodchunks;
        out.sas_iodchunks = asoc->stats.iodchunks;
        out.sas_ouodchunks = asoc->stats.ouodchunks;
        out.sas_iuodchunks = asoc->stats.iuodchunks;
        out.sas_idupchunks = asoc->stats.idupchunks;

        /* SACK / TSN related counters. */
        out.sas_gapcnt = asoc->stats.gapcnt;
        out.sas_outofseqtsns = asoc->stats.outofseqtsns;
        out.sas_osacks = asoc->stats.osacks;
        out.sas_isacks = asoc->stats.isacks;

        /* Highest RTO seen in the current observation period.  It is 0
         * when no RTO update happened at all, in which case the address
         * copied below carries no meaning.
         */
        out.sas_maxrto = asoc->stats.max_obs_rto;
        memcpy(&out.sas_obs_rto_ipaddr, &asoc->stats.obs_rto_ipaddr,
               sizeof(struct sockaddr_storage));

        /* Reading the value starts a new observation period. */
        asoc->stats.max_obs_rto = asoc->rto_min;

        if (put_user(len, optlen))
                return -EFAULT;

        pr_debug("%s: len:%d, assoc_id:%d\n", __func__, len, out.sas_assoc_id);

        if (copy_to_user(optval, &out, len))
                return -EFAULT;

        return 0;
}

/* SCTP_RECVRCVINFO: report the socket's recvrcvinfo flag as an int (0 or 1).
 *
 * Returns 0 on success, -EINVAL if the user buffer is shorter than an int
 * and -EFAULT if writing to user space fails.
 */
static int sctp_getsockopt_recvrcvinfo(struct sock *sk, int len,
                                       char __user *optval,
                                       int __user *optlen)
{
        int val;

        if (len < sizeof(int))
                return -EINVAL;

        val = sctp_sk(sk)->recvrcvinfo ? 1 : 0;
        len = sizeof(int);

        if (put_user(len, optlen) || copy_to_user(optval, &val, len))
                return -EFAULT;

        return 0;
}

/* SCTP_RECVNXTINFO: report the socket's recvnxtinfo flag as an int (0 or 1).
 *
 * Returns 0 on success, -EINVAL if the user buffer is shorter than an int
 * and -EFAULT if writing to user space fails.
 */
static int sctp_getsockopt_recvnxtinfo(struct sock *sk, int len,
                                       char __user *optval,
                                       int __user *optlen)
{
        int val;

        if (len < sizeof(int))
                return -EINVAL;

        val = sctp_sk(sk)->recvnxtinfo ? 1 : 0;
        len = sizeof(int);

        if (put_user(len, optlen) || copy_to_user(optval, &val, len))
                return -EFAULT;

        return 0;
}

/* SCTP_PR_SUPPORTED: report the peer's prsctp_capable flag for an existing
 * association, or the endpoint's prsctp_enable setting when no association
 * was found.
 *
 * Returns 0 on success, -EINVAL for a short buffer or an unresolvable
 * association id on a UDP-style socket, -EFAULT on a failed user copy.
 */
static int sctp_getsockopt_pr_supported(struct sock *sk, int len,
                                        char __user *optval,
                                        int __user *optlen)
{
        struct sctp_association *asoc;
        struct sctp_assoc_value params;

        if (len < sizeof(params))
                return -EINVAL;

        len = sizeof(params);
        if (copy_from_user(&params, optval, len))
                return -EFAULT;

        asoc = sctp_id2assoc(sk, params.assoc_id);
        if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        if (asoc)
                params.assoc_value = asoc->peer.prsctp_capable;
        else
                params.assoc_value = sctp_sk(sk)->ep->prsctp_enable;

        if (put_user(len, optlen) || copy_to_user(optval, &params, len))
                return -EFAULT;

        return 0;
}

/* SCTP_DEFAULT_PRINFO: report the default PR policy and value, taken from
 * the association when one is found and from the socket otherwise.
 *
 * Returns 0 on success, -EINVAL for a short buffer or an unresolvable
 * association id on a UDP-style socket, -EFAULT on a failed user copy.
 */
static int sctp_getsockopt_default_prinfo(struct sock *sk, int len,
                                          char __user *optval,
                                          int __user *optlen)
{
        struct sctp_association *asoc;
        struct sctp_default_prinfo info;

        if (len < sizeof(info))
                return -EINVAL;

        len = sizeof(info);
        if (copy_from_user(&info, optval, len))
                return -EFAULT;

        asoc = sctp_id2assoc(sk, info.pr_assoc_id);
        if (!asoc && info.pr_assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        if (!asoc) {
                struct sctp_sock *sp = sctp_sk(sk);

                info.pr_value = sp->default_timetolive;
                info.pr_policy = SCTP_PR_POLICY(sp->default_flags);
        } else {
                info.pr_value = asoc->default_timetolive;
                info.pr_policy = SCTP_PR_POLICY(asoc->default_flags);
        }

        if (put_user(len, optlen) || copy_to_user(optval, &info, len))
                return -EFAULT;

        return 0;
}

/* SCTP_PR_ASSOC_STATUS: report the association's abandoned_unsent and
 * abandoned_sent counters, either for a single PR policy or summed over
 * all policies when SCTP_PR_SCTP_ALL is requested.
 *
 * Returns 0 on success, -EINVAL for a short buffer, an invalid policy or
 * an unknown association, -EFAULT on a failed user copy.
 */
static int sctp_getsockopt_pr_assocstatus(struct sock *sk, int len,
                                          char __user *optval,
                                          int __user *optlen)
{
        struct sctp_association *asoc;
        struct sctp_prstatus params;
        int policy;

        if (len < sizeof(params))
                return -EINVAL;

        len = sizeof(params);
        if (copy_from_user(&params, optval, len))
                return -EFAULT;

        policy = params.sprstat_policy;

        /* A policy must be given and may only use known bits ... */
        if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)))
                return -EINVAL;
        /* ... and "all" cannot be combined with a specific policy. */
        if ((policy & SCTP_PR_SCTP_ALL) && (policy & SCTP_PR_SCTP_MASK))
                return -EINVAL;

        asoc = sctp_id2assoc(sk, params.sprstat_assoc_id);
        if (!asoc)
                return -EINVAL;

        if (policy != SCTP_PR_SCTP_ALL) {
                int idx = __SCTP_PR_INDEX(policy);

                params.sprstat_abandoned_unsent = asoc->abandoned_unsent[idx];
                params.sprstat_abandoned_sent = asoc->abandoned_sent[idx];
        } else {
                int idx;

                params.sprstat_abandoned_unsent = 0;
                params.sprstat_abandoned_sent = 0;
                for (idx = 0; idx <= SCTP_PR_INDEX(MAX); idx++) {
                        params.sprstat_abandoned_unsent +=
                                asoc->abandoned_unsent[idx];
                        params.sprstat_abandoned_sent +=
                                asoc->abandoned_sent[idx];
                }
        }

        if (put_user(len, optlen) || copy_to_user(optval, &params, len))
                return -EFAULT;

        return 0;
}

/* SCTP_PR_STREAM_STATUS: report the abandoned_unsent and abandoned_sent
 * counters of one outgoing stream, either for a single PR policy or summed
 * over all policies when SCTP_PR_SCTP_ALL is requested.
 *
 * A stream whose extension data has not been allocated yet reports zero
 * for both counters; that result is copied back to user space like any
 * other.
 *
 * Returns 0 on success, -EINVAL for a short buffer, an invalid policy, an
 * unknown association or a stream id outside the outgoing stream range,
 * and -EFAULT on a failed user copy.
 */
static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len,
                                           char __user *optval,
                                           int __user *optlen)
{
        struct sctp_stream_out_ext *streamoute;
        struct sctp_association *asoc;
        struct sctp_prstatus params;
        int retval = -EINVAL;
        int policy;

        if (len < sizeof(params))
                goto out;

        len = sizeof(params);
        if (copy_from_user(&params, optval, len)) {
                retval = -EFAULT;
                goto out;
        }

        /* A policy must be given, may only use known bits, and "all"
         * cannot be combined with a specific policy.
         */
        policy = params.sprstat_policy;
        if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)) ||
            ((policy & SCTP_PR_SCTP_ALL) && (policy & SCTP_PR_SCTP_MASK)))
                goto out;

        asoc = sctp_id2assoc(sk, params.sprstat_assoc_id);
        if (!asoc || params.sprstat_sid >= asoc->stream.outcnt)
                goto out;

        streamoute = SCTP_SO(&asoc->stream, params.sprstat_sid)->ext;
        if (!streamoute) {
                /* Not allocated yet, means all stats are 0.  Fall through
                 * to the copy-out below so the caller actually sees them.
                 */
                params.sprstat_abandoned_unsent = 0;
                params.sprstat_abandoned_sent = 0;
        } else if (policy == SCTP_PR_SCTP_ALL) {
                params.sprstat_abandoned_unsent = 0;
                params.sprstat_abandoned_sent = 0;
                for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) {
                        params.sprstat_abandoned_unsent +=
                                streamoute->abandoned_unsent[policy];
                        params.sprstat_abandoned_sent +=
                                streamoute->abandoned_sent[policy];
                }
        } else {
                params.sprstat_abandoned_unsent =
                        streamoute->abandoned_unsent[__SCTP_PR_INDEX(policy)];
                params.sprstat_abandoned_sent =
                        streamoute->abandoned_sent[__SCTP_PR_INDEX(policy)];
        }

        if (put_user(len, optlen) || copy_to_user(optval, &params, len)) {
                retval = -EFAULT;
                goto out;
        }

        retval = 0;

out:
        return retval;
}

/* SCTP_RECONFIG_SUPPORTED: report the peer's reconf_capable flag for an
 * existing association, or the endpoint's reconf_enable setting when no
 * association was found.
 *
 * Returns 0 on success, -EINVAL for a short buffer or an unresolvable
 * association id on a UDP-style socket, -EFAULT on a failed user copy.
 */
static int sctp_getsockopt_reconfig_supported(struct sock *sk, int len,
                                              char __user *optval,
                                              int __user *optlen)
{
        struct sctp_association *asoc;
        struct sctp_assoc_value params;

        if (len < sizeof(params))
                return -EINVAL;

        len = sizeof(params);
        if (copy_from_user(&params, optval, len))
                return -EFAULT;

        asoc = sctp_id2assoc(sk, params.assoc_id);
        if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        if (asoc)
                params.assoc_value = asoc->peer.reconf_capable;
        else
                params.assoc_value = sctp_sk(sk)->ep->reconf_enable;

        if (put_user(len, optlen) || copy_to_user(optval, &params, len))
                return -EFAULT;

        return 0;
}

/* SCTP_ENABLE_STREAM_RESET: report strreset_enable from the association
 * when one is found, otherwise from the socket's endpoint.
 *
 * Returns 0 on success, -EINVAL for a short buffer or an unresolvable
 * association id on a UDP-style socket, -EFAULT on a failed user copy.
 */
static int sctp_getsockopt_enable_strreset(struct sock *sk, int len,
                                           char __user *optval,
                                           int __user *optlen)
{
        struct sctp_association *asoc;
        struct sctp_assoc_value params;

        if (len < sizeof(params))
                return -EINVAL;

        len = sizeof(params);
        if (copy_from_user(&params, optval, len))
                return -EFAULT;

        asoc = sctp_id2assoc(sk, params.assoc_id);
        if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        if (asoc)
                params.assoc_value = asoc->strreset_enable;
        else
                params.assoc_value = sctp_sk(sk)->ep->strreset_enable;

        if (put_user(len, optlen) || copy_to_user(optval, &params, len))
                return -EFAULT;

        return 0;
}

/* SCTP_STREAM_SCHEDULER: report the value of sctp_sched_get_sched() for an
 * existing association, or the socket's default_ss when no association was
 * found.
 *
 * Returns 0 on success, -EINVAL for a short buffer or an unresolvable
 * association id on a UDP-style socket, -EFAULT on a failed user copy.
 */
static int sctp_getsockopt_scheduler(struct sock *sk, int len,
                                     char __user *optval,
                                     int __user *optlen)
{
        struct sctp_association *asoc;
        struct sctp_assoc_value params;

        if (len < sizeof(params))
                return -EINVAL;

        len = sizeof(params);
        if (copy_from_user(&params, optval, len))
                return -EFAULT;

        asoc = sctp_id2assoc(sk, params.assoc_id);
        if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        if (asoc)
                params.assoc_value = sctp_sched_get_sched(asoc);
        else
                params.assoc_value = sctp_sk(sk)->default_ss;

        if (put_user(len, optlen) || copy_to_user(optval, &params, len))
                return -EFAULT;

        return 0;
}

/* SCTP_STREAM_SCHEDULER_VALUE: look up the association and let
 * sctp_sched_get_value() fill in the value for the requested stream.
 *
 * Returns 0 on success, -EINVAL for a short buffer or an unknown
 * association, the error from sctp_sched_get_value() if it fails, and
 * -EFAULT on a failed user copy.
 */
static int sctp_getsockopt_scheduler_value(struct sock *sk, int len,
                                           char __user *optval,
                                           int __user *optlen)
{
        struct sctp_association *asoc;
        struct sctp_stream_value params;
        int err;

        if (len < sizeof(params))
                return -EINVAL;

        len = sizeof(params);
        if (copy_from_user(&params, optval, len))
                return -EFAULT;

        asoc = sctp_id2assoc(sk, params.assoc_id);
        if (!asoc)
                return -EINVAL;

        err = sctp_sched_get_value(asoc, params.stream_id,
                                   &params.stream_value);
        if (err)
                return err;

        if (put_user(len, optlen) || copy_to_user(optval, &params, len))
                return -EFAULT;

        return 0;
}

/* SCTP_INTERLEAVING_SUPPORTED: report the peer's intl_capable flag for an
 * existing association, or the endpoint's intl_enable setting when no
 * association was found.
 *
 * Returns 0 on success, -EINVAL for a short buffer or an unresolvable
 * association id on a UDP-style socket, -EFAULT on a failed user copy.
 */
static int sctp_getsockopt_interleaving_supported(struct sock *sk, int len,
                                                  char __user *optval,
                                                  int __user *optlen)
{
        struct sctp_association *asoc;
        struct sctp_assoc_value params;

        if (len < sizeof(params))
                return -EINVAL;

        len = sizeof(params);
        if (copy_from_user(&params, optval, len))
                return -EFAULT;

        asoc = sctp_id2assoc(sk, params.assoc_id);
        if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        if (asoc)
                params.assoc_value = asoc->peer.intl_capable;
        else
                params.assoc_value = sctp_sk(sk)->ep->intl_enable;

        if (put_user(len, optlen) || copy_to_user(optval, &params, len))
                return -EFAULT;

        return 0;
}

/* SCTP_REUSE_PORT: report the socket's reuse setting as an int.
 *
 * Returns 0 on success, -EINVAL if the user buffer is shorter than an int
 * and -EFAULT if writing to user space fails.
 */
static int sctp_getsockopt_reuse_port(struct sock *sk, int len,
                                      char __user *optval,
                                      int __user *optlen)
{
        int val = sctp_sk(sk)->reuse;

        if (len < sizeof(int))
                return -EINVAL;

        len = sizeof(int);
        if (put_user(len, optlen) || copy_to_user(optval, &val, len))
                return -EFAULT;

        return 0;
}

/* SCTP_EVENT: report whether the event type named in the request is
 * enabled in the subscription mask of the association, or of the socket
 * when no association was found.
 *
 * Returns 0 on success, -EINVAL for a short buffer, an event type outside
 * [SCTP_SN_TYPE_BASE, SCTP_SN_TYPE_MAX] or an unresolvable association id
 * on a UDP-style socket, and -EFAULT on a failed user copy.
 */
static int sctp_getsockopt_event(struct sock *sk, int len, char __user *optval,
                                 int __user *optlen)
{
        struct sctp_event param;
        struct sctp_association *asoc;
        __u16 subscribe;

        if (len < sizeof(param))
                return -EINVAL;

        len = sizeof(param);
        if (copy_from_user(&param, optval, len))
                return -EFAULT;

        if (param.se_type < SCTP_SN_TYPE_BASE)
                return -EINVAL;
        if (param.se_type > SCTP_SN_TYPE_MAX)
                return -EINVAL;

        asoc = sctp_id2assoc(sk, param.se_assoc_id);
        if (!asoc && param.se_assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        if (asoc)
                subscribe = asoc->subscribe;
        else
                subscribe = sctp_sk(sk)->subscribe;

        param.se_on = sctp_ulpevent_type_enabled(subscribe, param.se_type);

        if (put_user(len, optlen) || copy_to_user(optval, &param, len))
                return -EFAULT;

        return 0;
}

/* SCTP_ASCONF_SUPPORTED: report the peer's asconf_capable flag for an
 * existing association, or the endpoint's asconf_enable setting when no
 * association was found.
 *
 * Returns 0 on success, -EINVAL for a short buffer or an unresolvable
 * association id on a UDP-style socket, -EFAULT on a failed user copy.
 */
static int sctp_getsockopt_asconf_supported(struct sock *sk, int len,
                                            char __user *optval,
                                            int __user *optlen)
{
        struct sctp_association *asoc;
        struct sctp_assoc_value params;

        if (len < sizeof(params))
                return -EINVAL;

        len = sizeof(params);
        if (copy_from_user(&params, optval, len))
                return -EFAULT;

        asoc = sctp_id2assoc(sk, params.assoc_id);
        if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        if (asoc)
                params.assoc_value = asoc->peer.asconf_capable;
        else
                params.assoc_value = sctp_sk(sk)->ep->asconf_enable;

        if (put_user(len, optlen) || copy_to_user(optval, &params, len))
                return -EFAULT;

        return 0;
}

/* SCTP_AUTH_SUPPORTED: report the peer's auth_capable flag for an existing
 * association, or the endpoint's auth_enable setting when no association
 * was found.
 *
 * Returns 0 on success, -EINVAL for a short buffer or an unresolvable
 * association id on a UDP-style socket, -EFAULT on a failed user copy.
 */
static int sctp_getsockopt_auth_supported(struct sock *sk, int len,
                                          char __user *optval,
                                          int __user *optlen)
{
        struct sctp_association *asoc;
        struct sctp_assoc_value params;

        if (len < sizeof(params))
                return -EINVAL;

        len = sizeof(params);
        if (copy_from_user(&params, optval, len))
                return -EFAULT;

        asoc = sctp_id2assoc(sk, params.assoc_id);
        if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        if (asoc)
                params.assoc_value = asoc->peer.auth_capable;
        else
                params.assoc_value = sctp_sk(sk)->ep->auth_enable;

        if (put_user(len, optlen) || copy_to_user(optval, &params, len))
                return -EFAULT;

        return 0;
}

/* SCTP_ECN_SUPPORTED: report the peer's ecn_capable flag for an existing
 * association, or the endpoint's ecn_enable setting when no association
 * was found.
 *
 * Returns 0 on success, -EINVAL for a short buffer or an unresolvable
 * association id on a UDP-style socket, -EFAULT on a failed user copy.
 */
static int sctp_getsockopt_ecn_supported(struct sock *sk, int len,
                                         char __user *optval,
                                         int __user *optlen)
{
        struct sctp_association *asoc;
        struct sctp_assoc_value params;

        if (len < sizeof(params))
                return -EINVAL;

        len = sizeof(params);
        if (copy_from_user(&params, optval, len))
                return -EFAULT;

        asoc = sctp_id2assoc(sk, params.assoc_id);
        if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        if (asoc)
                params.assoc_value = asoc->peer.ecn_capable;
        else
                params.assoc_value = sctp_sk(sk)->ep->ecn_enable;

        if (put_user(len, optlen) || copy_to_user(optval, &params, len))
                return -EFAULT;

        return 0;
}

/* SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: report pf_expose from the
 * association when one is found, otherwise from the socket.
 *
 * Returns 0 on success, -EINVAL for a short buffer or an unresolvable
 * association id on a UDP-style socket, -EFAULT on a failed user copy.
 */
static int sctp_getsockopt_pf_expose(struct sock *sk, int len,
                                     char __user *optval,
                                     int __user *optlen)
{
        struct sctp_association *asoc;
        struct sctp_assoc_value params;

        if (len < sizeof(params))
                return -EINVAL;

        len = sizeof(params);
        if (copy_from_user(&params, optval, len))
                return -EFAULT;

        asoc = sctp_id2assoc(sk, params.assoc_id);
        if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
            sctp_style(sk, UDP))
                return -EINVAL;

        if (asoc)
                params.assoc_value = asoc->pf_expose;
        else
                params.assoc_value = sctp_sk(sk)->pf_expose;

        if (put_user(len, optlen) || copy_to_user(optval, &params, len))
                return -EFAULT;

        return 0;
}

/* SCTP_REMOTE_UDP_ENCAPS_PORT: report the encap_port of a transport, an
 * association or the socket, in that order of preference.
 *
 * Returns 0 on success, -EINVAL for a short buffer, an address that matches
 * no transport or an unresolvable association id on a one-to-many socket,
 * and -EFAULT on a failed user copy.
 */
static int sctp_getsockopt_encap_port(struct sock *sk, int len,
                                      char __user *optval, int __user *optlen)
{
        struct sctp_udpencaps encap;
        struct sctp_association *asoc;
        struct sctp_transport *t;
        __be16 encap_port;

        if (len < sizeof(encap))
                return -EINVAL;

        len = sizeof(encap);
        if (copy_from_user(&encap, optval, len))
                return -EFAULT;

        if (!sctp_is_any(sk, (union sctp_addr *)&encap.sue_address)) {
                /* A concrete (non-wildcard) address must resolve to a
                 * transport, otherwise the request is invalid.
                 */
                t = sctp_addr_id2transport(sk, &encap.sue_address,
                                           encap.sue_assoc_id);
                if (!t) {
                        pr_debug("%s: failed no transport\n", __func__);
                        return -EINVAL;
                }

                encap_port = t->encap_port;
        } else {
                /* On a one-to-many socket an id other than
                 * SCTP_FUTURE_ASSOC that matches no association is
                 * invalid.
                 */
                asoc = sctp_id2assoc(sk, encap.sue_assoc_id);
                if (!asoc && encap.sue_assoc_id != SCTP_FUTURE_ASSOC &&
                    sctp_style(sk, UDP)) {
                        pr_debug("%s: failed no association\n", __func__);
                        return -EINVAL;
                }

                if (asoc)
                        encap_port = asoc->encap_port;
                else
                        encap_port = sctp_sk(sk)->encap_port;
        }

        encap.sue_port = (__force uint16_t)encap_port;

        /* The data is written before the length here. */
        if (copy_to_user(optval, &encap, len) || put_user(len, optlen))
                return -EFAULT;

        return 0;
}

/* SCTP_PLPMTUD_PROBE_INTERVAL: report the probe interval of a transport, an
 * association or the socket, in that order of preference.  Transport and
 * association values go through jiffies_to_msecs(); the socket value is
 * returned as stored.
 *
 * Returns 0 on success, -EINVAL for a short buffer, an address that matches
 * no transport or an unresolvable association id on a one-to-many socket,
 * and -EFAULT on a failed user copy.
 */
static int sctp_getsockopt_probe_interval(struct sock *sk, int len,
                                          char __user *optval,
                                          int __user *optlen)
{
        struct sctp_probeinterval params;
        struct sctp_association *asoc;
        struct sctp_transport *t;
        __u32 probe_interval;

        if (len < sizeof(params))
                return -EINVAL;

        len = sizeof(params);
        if (copy_from_user(&params, optval, len))
                return -EFAULT;

        if (!sctp_is_any(sk, (union sctp_addr *)&params.spi_address)) {
                /* A concrete (non-wildcard) address must resolve to a
                 * transport, otherwise the request is invalid.
                 */
                t = sctp_addr_id2transport(sk, &params.spi_address,
                                           params.spi_assoc_id);
                if (!t) {
                        pr_debug("%s: failed no transport\n", __func__);
                        return -EINVAL;
                }

                probe_interval = jiffies_to_msecs(t->probe_interval);
        } else {
                /* On a one-to-many socket an id other than
                 * SCTP_FUTURE_ASSOC that matches no association is
                 * invalid.
                 */
                asoc = sctp_id2assoc(sk, params.spi_assoc_id);
                if (!asoc && params.spi_assoc_id != SCTP_FUTURE_ASSOC &&
                    sctp_style(sk, UDP)) {
                        pr_debug("%s: failed no association\n", __func__);
                        return -EINVAL;
                }

                if (asoc)
                        probe_interval =
                                jiffies_to_msecs(asoc->probe_interval);
                else
                        probe_interval = sctp_sk(sk)->probe_interval;
        }

        params.spi_interval = probe_interval;

        /* The data is written before the length here. */
        if (copy_to_user(optval, &params, len) || put_user(len, optlen))
                return -EFAULT;

        return 0;
}

/* getsockopt() entry point for SCTP sockets.
 *
 * Requests at a level other than SOL_SCTP are handed to the getsockopt
 * hook of the socket's address family.  For SOL_SCTP the user-supplied
 * length is read from optlen (negative values are rejected with -EINVAL)
 * and the matching per-option handler runs with the socket locked.
 *
 * Returns the handler's result, -EFAULT if optlen cannot be read,
 * -EOPNOTSUPP for the auth key options that cannot be read back, and
 * -ENOPROTOOPT for an unknown option name.
 */
static int sctp_getsockopt(struct sock *sk, int level, int optname,
                           char __user *optval, int __user *optlen)
{
        int retval = 0;
        int len;

        pr_debug("%s: sk:%p, optname:%d\n", __func__, sk, optname);

        /* I can hardly begin to describe how wrong this is.  This is
         * so broken as to be worse than useless.  The API draft
         * REALLY is NOT helpful here...  I am not convinced that the
         * semantics of getsockopt() with a level OTHER THAN SOL_SCTP
         * are at all well-founded.
         */
        if (level != SOL_SCTP) {
                struct sctp_af *af = sctp_sk(sk)->pf->af;

                retval = af->getsockopt(sk, level, optname, optval, optlen);
                return retval;
        }

        /* Fetch the caller's buffer length before taking the socket lock. */
        if (get_user(len, optlen))
                return -EFAULT;

        if (len < 0)
                return -EINVAL;

        /* Every handler below runs with the socket lock held. */
        lock_sock(sk);

        switch (optname) {
        case SCTP_STATUS:
                retval = sctp_getsockopt_sctp_status(sk, len, optval, optlen);
                break;
        case SCTP_DISABLE_FRAGMENTS:
                retval = sctp_getsockopt_disable_fragments(sk, len, optval,
                                                           optlen);
                break;
        case SCTP_EVENTS:
                retval = sctp_getsockopt_events(sk, len, optval, optlen);
                break;
        case SCTP_AUTOCLOSE:
                retval = sctp_getsockopt_autoclose(sk, len, optval, optlen);
                break;
        case SCTP_SOCKOPT_PEELOFF:
                retval = sctp_getsockopt_peeloff(sk, len, optval, optlen);
                break;
        case SCTP_SOCKOPT_PEELOFF_FLAGS:
                retval = sctp_getsockopt_peeloff_flags(sk, len, optval, optlen);
                break;
        case SCTP_PEER_ADDR_PARAMS:
                retval = sctp_getsockopt_peer_addr_params(sk, len, optval,
                                                          optlen);
                break;
        case SCTP_DELAYED_SACK:
                retval = sctp_getsockopt_delayed_ack(sk, len, optval,
                                                          optlen);
                break;
        case SCTP_INITMSG:
                retval = sctp_getsockopt_initmsg(sk, len, optval, optlen);
                break;
        case SCTP_GET_PEER_ADDRS:
                retval = sctp_getsockopt_peer_addrs(sk, len, optval,
                                                    optlen);
                break;
        case SCTP_GET_LOCAL_ADDRS:
                retval = sctp_getsockopt_local_addrs(sk, len, optval,
                                                     optlen);
                break;
        case SCTP_SOCKOPT_CONNECTX3:
                retval = sctp_getsockopt_connectx3(sk, len, optval, optlen);
                break;
        case SCTP_DEFAULT_SEND_PARAM:
                retval = sctp_getsockopt_default_send_param(sk, len,
                                                            optval, optlen);
                break;
        case SCTP_DEFAULT_SNDINFO:
                retval = sctp_getsockopt_default_sndinfo(sk, len,
                                                         optval, optlen);
                break;
        case SCTP_PRIMARY_ADDR:
                retval = sctp_getsockopt_primary_addr(sk, len, optval, optlen);
                break;
        case SCTP_NODELAY:
                retval = sctp_getsockopt_nodelay(sk, len, optval, optlen);
                break;
        case SCTP_RTOINFO:
                retval = sctp_getsockopt_rtoinfo(sk, len, optval, optlen);
                break;
        case SCTP_ASSOCINFO:
                retval = sctp_getsockopt_associnfo(sk, len, optval, optlen);
                break;
        case SCTP_I_WANT_MAPPED_V4_ADDR:
                retval = sctp_getsockopt_mappedv4(sk, len, optval, optlen);
                break;
        case SCTP_MAXSEG:
                retval = sctp_getsockopt_maxseg(sk, len, optval, optlen);
                break;
        case SCTP_GET_PEER_ADDR_INFO:
                retval = sctp_getsockopt_peer_addr_info(sk, len, optval,
                                                        optlen);
                break;
        case SCTP_ADAPTATION_LAYER:
                retval = sctp_getsockopt_adaptation_layer(sk, len, optval,
                                                        optlen);
                break;
        case SCTP_CONTEXT:
                retval = sctp_getsockopt_context(sk, len, optval, optlen);
                break;
        case SCTP_FRAGMENT_INTERLEAVE:
                retval = sctp_getsockopt_fragment_interleave(sk, len, optval,
                                                             optlen);
                break;
        case SCTP_PARTIAL_DELIVERY_POINT:
                retval = sctp_getsockopt_partial_delivery_point(sk, len, optval,
                                                                optlen);
                break;
        case SCTP_MAX_BURST:
                retval = sctp_getsockopt_maxburst(sk, len, optval, optlen);
                break;
        /* These auth options have no getsockopt handler here. */
        case SCTP_AUTH_KEY:
        case SCTP_AUTH_CHUNK:
        case SCTP_AUTH_DELETE_KEY:
        case SCTP_AUTH_DEACTIVATE_KEY:
                retval = -EOPNOTSUPP;
                break;
        case SCTP_HMAC_IDENT:
                retval = sctp_getsockopt_hmac_ident(sk, len, optval, optlen);
                break;
        case SCTP_AUTH_ACTIVE_KEY:
                retval = sctp_getsockopt_active_key(sk, len, optval, optlen);
                break;
        case SCTP_PEER_AUTH_CHUNKS:
                retval = sctp_getsockopt_peer_auth_chunks(sk, len, optval,
                                                        optlen);
                break;
        case SCTP_LOCAL_AUTH_CHUNKS:
                retval = sctp_getsockopt_local_auth_chunks(sk, len, optval,
                                                        optlen);
                break;
        case SCTP_GET_ASSOC_NUMBER:
                retval = sctp_getsockopt_assoc_number(sk, len, optval, optlen);
                break;
        case SCTP_GET_ASSOC_ID_LIST:
                retval = sctp_getsockopt_assoc_ids(sk, len, optval, optlen);
                break;
        case SCTP_AUTO_ASCONF:
                retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
                break;
        /* Note the different argument order (optval before len) and the
         * trailing flag selecting the v2 structure layout.
         */
        case SCTP_PEER_ADDR_THLDS:
                retval = sctp_getsockopt_paddr_thresholds(sk, optval, len,
                                                          optlen, false);
                break;
        case SCTP_PEER_ADDR_THLDS_V2:
                retval = sctp_getsockopt_paddr_thresholds(sk, optval, len,
                                                          optlen, true);
                break;
        case SCTP_GET_ASSOC_STATS:
                retval = sctp_getsockopt_assoc_stats(sk, len, optval, optlen);
                break;
        case SCTP_RECVRCVINFO:
                retval = sctp_getsockopt_recvrcvinfo(sk, len, optval, optlen);
                break;
        case SCTP_RECVNXTINFO:
                retval = sctp_getsockopt_recvnxtinfo(sk, len, optval, optlen);
                break;
        case SCTP_PR_SUPPORTED:
                retval = sctp_getsockopt_pr_supported(sk, len, optval, optlen);
                break;
        case SCTP_DEFAULT_PRINFO:
                retval = sctp_getsockopt_default_prinfo(sk, len, optval,
                                                        optlen);
                break;
        case SCTP_PR_ASSOC_STATUS:
                retval = sctp_getsockopt_pr_assocstatus(sk, len, optval,
                                                        optlen);
                break;
        case SCTP_PR_STREAM_STATUS:
                retval = sctp_getsockopt_pr_streamstatus(sk, len, optval,
                                                         optlen);
                break;
        case SCTP_RECONFIG_SUPPORTED:
                retval = sctp_getsockopt_reconfig_supported(sk, len, optval,
                                                            optlen);
                break;
        case SCTP_ENABLE_STREAM_RESET:
                retval = sctp_getsockopt_enable_strreset(sk, len, optval,
                                                         optlen);
                break;
        case SCTP_STREAM_SCHEDULER:
                retval = sctp_getsockopt_scheduler(sk, len, optval,
                                                   optlen);
                break;
        case SCTP_STREAM_SCHEDULER_VALUE:
                retval = sctp_getsockopt_scheduler_value(sk, len, optval,
                                                         optlen);
                break;
        case SCTP_INTERLEAVING_SUPPORTED:
                retval = sctp_getsockopt_interleaving_supported(sk, len, optval,
                                                                optlen);
                break;
        case SCTP_REUSE_PORT:
                retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen);
                break;
        case SCTP_EVENT:
                retval = sctp_getsockopt_event(sk, len, optval, optlen);
                break;
        case SCTP_ASCONF_SUPPORTED:
                retval = sctp_getsockopt_asconf_supported(sk, len, optval,
                                                          optlen);
                break;
        case SCTP_AUTH_SUPPORTED:
                retval = sctp_getsockopt_auth_supported(sk, len, optval,
                                                        optlen);
                break;
        case SCTP_ECN_SUPPORTED:
                retval = sctp_getsockopt_ecn_supported(sk, len, optval, optlen);
                break;
        case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE:
                retval = sctp_getsockopt_pf_expose(sk, len, optval, optlen);
                break;
        case SCTP_REMOTE_UDP_ENCAPS_PORT:
                retval = sctp_getsockopt_encap_port(sk, len, optval, optlen);
                break;
        case SCTP_PLPMTUD_PROBE_INTERVAL:
                retval = sctp_getsockopt_probe_interval(sk, len, optval, optlen);
                break;
        default:
                retval = -ENOPROTOOPT;
                break;
        }

        release_sock(sk);
        return retval;
}

/* Report whether (level, optname) names one of the SOL_SCTP socket options
 * enumerated below.  Any other level, or any other option at SOL_SCTP,
 * yields false.
 */
static bool sctp_bpf_bypass_getsockopt(int level, int optname)
{
        if (level != SOL_SCTP)
                return false;

        return optname == SCTP_SOCKOPT_PEELOFF ||
               optname == SCTP_SOCKOPT_PEELOFF_FLAGS ||
               optname == SCTP_SOCKOPT_CONNECTX3;
}

/* Stub: performs no work on @sk and always returns 0. */
static int sctp_hash(struct sock *sk)
{
        /* STUB */
        return 0;
}

/* Stub: performs no work on @sk. */
static void sctp_unhash(struct sock *sk)
{
        /* STUB */
}

/* Check if port is acceptable.  Possibly find first available port.
 *
 * The port hash table is contained in the 'global' SCTP protocol storage
 * returned by struct sctp_protocol *sctp_get_protocol().  The hash
 * table is an array of 4096 lists (sctp_bind_hashbucket).  The list
 * number is the port number hashed out, so, as you would expect from a
 * hash function, all the ports in a given list hash to the same list
 * number.  Each list therefore holds a set of ports, each with a
 * link to the socket (struct sock) that uses it, the port number and
 * a fastreuse flag (FIXME: NPI ipg).
 */
/* Forward declaration: sctp_get_port_local() calls sctp_bucket_create()
 * before its definition appears later in the file.
 */
static struct sctp_bind_bucket *sctp_bucket_create(
        struct sctp_bind_hashbucket *head, struct net *, unsigned short snum);

/* Claim a local port for @sk in the SCTP port hash table.
 *
 * The requested port is read from addr->v4.sin_port.  When it is 0, the
 * local port range is scanned (starting from a random position) for a
 * port that has no bucket in this network namespace.  Otherwise the
 * bucket for the requested port is looked up and, if other sockets
 * already own it, the reuse rules and bound addresses are checked for a
 * conflict.
 *
 * On success the socket is linked into the bucket (unless it already has
 * a bind_hash) and inet_num is set to the chosen port.
 *
 * Returns 0 on success and 1 on failure (port range exhausted, address
 * conflict, or bucket allocation failure).
 */
static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
{
        struct sctp_sock *sp = sctp_sk(sk);
        bool reuse = (sk->sk_reuse || sp->reuse);
        struct sctp_bind_hashbucket *head; /* hash list */
        struct net *net = sock_net(sk);
        kuid_t uid = sock_i_uid(sk);
        struct sctp_bind_bucket *pp;
        unsigned short snum;
        int ret;

        snum = ntohs(addr->v4.sin_port);

        pr_debug("%s: begins, snum:%d\n", __func__, snum);

        if (snum == 0) {
                /* Search for an available port. */
                int low, high, remaining, index;
                unsigned int rover;

                inet_sk_get_local_port_range(sk, &low, &high);
                remaining = (high - low) + 1;
                rover = get_random_u32_below(remaining) + low;

                do {
                        rover++;
                        /* Wrap around to the start of the range. */
                        if ((rover < low) || (rover > high))
                                rover = low;
                        if (inet_is_local_reserved_port(net, rover))
                                continue;
                        index = sctp_phashfn(net, rover);
                        head = &sctp_port_hashtable[index];
                        spin_lock_bh(&head->lock);
                        sctp_for_each_hentry(pp, &head->chain)
                                if ((pp->port == rover) &&
                                    net_eq(net, pp->net))
                                        goto next;
                        /* No bucket uses this port: leave the loop with
                         * head->lock still held.
                         */
                        break;
                next:
                        spin_unlock_bh(&head->lock);
                        cond_resched();
                } while (--remaining > 0);

                /* Exhausted local port range during search? */
                ret = 1;
                if (remaining <= 0)
                        return ret;

                /* OK, here is the one we will use.  HEAD (the port
                 * hash table list entry) is non-NULL and we hold its
                 * spinlock.
                 */
                snum = rover;
        } else {
                /* We are given a specific port number; we verify
                 * that it is not being used. If it is used, we will
                 * exhaust the search in the hash list corresponding
                 * to the port number (snum) - we detect that with the
                 * port iterator, pp being NULL.
                 */
                head = &sctp_port_hashtable[sctp_phashfn(net, snum)];
                spin_lock_bh(&head->lock);
                sctp_for_each_hentry(pp, &head->chain) {
                        if ((pp->port == snum) && net_eq(pp->net, net))
                                goto pp_found;
                }
        }
        /* No existing bucket for this port (both paths hold head->lock). */
        pp = NULL;
        goto pp_not_found;
pp_found:
        if (!hlist_empty(&pp->owner)) {
                /* We had a port hash table hit - there is an
                 * available port (pp != NULL) and it is being
                 * used by other socket (pp->owner not empty); that other
                 * socket is going to be sk2.
                 */
                struct sock *sk2;

                pr_debug("%s: found a possible match\n", __func__);

                /* Fast path: the bucket's cached reuse flags already
                 * permit sharing with this socket, so skip the
                 * per-owner scan.
                 */
                if ((pp->fastreuse && reuse &&
                     sk->sk_state != SCTP_SS_LISTENING) ||
                    (pp->fastreuseport && sk->sk_reuseport &&
                     uid_eq(pp->fastuid, uid)))
                        goto success;

                /* Run through the list of sockets bound to the port
                 * (pp->port) [via the pointers bind_next and
                 * bind_pprev in the struct sock *sk2 (pp->sk)]. On each one,
                 * we get the endpoint they describe and run through
                 * the endpoint's list of IP (v4 or v6) addresses,
                 * comparing each of the addresses with the address of
                 * the socket sk. If we find a match, then that means
                 * that this port/socket (sk) combination are already
                 * in an endpoint.
                 */
                sk_for_each_bound(sk2, &pp->owner) {
                        int bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if);
                        struct sctp_sock *sp2 = sctp_sk(sk2);
                        struct sctp_endpoint *ep2 = sp2->ep;

                        /* Skip ourselves and owners we may share with. */
                        if (sk == sk2 ||
                            (reuse && (sk2->sk_reuse || sp2->reuse) &&
                             sk2->sk_state != SCTP_SS_LISTENING) ||
                            (sk->sk_reuseport && sk2->sk_reuseport &&
                             uid_eq(uid, sock_i_uid(sk2))))
                                continue;

                        /* Only sockets that can share a device (either is
                         * unbound, or both are bound to the same one) are
                         * checked for an address conflict.
                         */
                        if ((!sk->sk_bound_dev_if || !bound_dev_if2 ||
                             sk->sk_bound_dev_if == bound_dev_if2) &&
                            sctp_bind_addr_conflict(&ep2->base.bind_addr,
                                                    addr, sp2, sp)) {
                                ret = 1;
                                goto fail_unlock;
                        }
                }

                pr_debug("%s: found a match\n", __func__);
        }
pp_not_found:
        /* If there was a hash table miss, create a new port.  */
        ret = 1;
        if (!pp && !(pp = sctp_bucket_create(head, net, snum)))
                goto fail_unlock;

        /* In either case (hit or miss), make sure fastreuse is 1 only
         * if sk->sk_reuse is too (that is, if the caller requested
         * SO_REUSEADDR on this socket -sk-).
         */
        if (hlist_empty(&pp->owner)) {
                /* First owner: the flags simply mirror this socket. */
                if (reuse && sk->sk_state != SCTP_SS_LISTENING)
                        pp->fastreuse = 1;
                else
                        pp->fastreuse = 0;

                if (sk->sk_reuseport) {
                        pp->fastreuseport = 1;
                        pp->fastuid = uid;
                } else {
                        pp->fastreuseport = 0;
                }
        } else {
                /* Additional owner: the flags can only be cleared. */
                if (pp->fastreuse &&
                    (!reuse || sk->sk_state == SCTP_SS_LISTENING))
                        pp->fastreuse = 0;

                if (pp->fastreuseport &&
                    (!sk->sk_reuseport || !uid_eq(pp->fastuid, uid)))
                        pp->fastreuseport = 0;
        }

        /* We are set, so fill up all the data in the hash table
         * entry, tie the socket list information with the rest of the
         * sockets FIXME: Blurry, NPI (ipg).
         */
success:
        /* A socket that already has a bucket is left linked as it is. */
        if (!sp->bind_hash) {
                inet_sk(sk)->inet_num = snum;
                sk_add_bind_node(sk, &pp->owner);
                sp->bind_hash = pp;
        }
        ret = 0;

fail_unlock:
        spin_unlock_bh(&head->lock);
        return ret;
}

/* Bind the socket to local port 'snum'.  Passing 0 requests an ephemeral
 * port.  The result of sctp_get_port_local() is returned unchanged.
 */
static int sctp_get_port(struct sock *sk, unsigned short snum)
{
        struct sctp_af *af = sctp_sk(sk)->pf->af;
        union sctp_addr addr;

        /* Start from the socket's own address, then override the port. */
        af->from_sk(&addr, sk);
        addr.v4.sin_port = htons(snum);

        /* sctp_get_port_local() stores the port it settles on in the
         * socket's inet_num when it links the socket to a bucket.
         */
        return sctp_get_port_local(sk, &addr);
}

/*
 *  Move a socket to LISTENING state.
 *
 *  Allocates the cookie HMAC transform on first use, switches the socket
 *  to SCTP_SS_LISTENING, binds it (auto-binding when the endpoint has no
 *  port yet), records the backlog and hashes the endpoint.
 *
 *  Returns 0 on success, or a negative errno:
 *    -ENOSYS      the HMAC transform could not be allocated
 *    -EAGAIN      sctp_autobind() failed
 *    -EADDRINUSE  sctp_get_port() could not claim the port
 *    other        whatever sctp_hash_endpoint() reported
 *
 *  Every failure that happens after the state switch puts the socket back
 *  into SCTP_SS_CLOSED, so a failed listen() never leaves it LISTENING.
 */
static int sctp_listen_start(struct sock *sk, int backlog)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_endpoint *ep = sp->ep;
        struct crypto_shash *tfm = NULL;
        char alg[32];
        int err;

        /* Allocate HMAC for generating cookie. */
        if (!sp->hmac && sp->sctp_hmac_alg) {
                /* Bounded write: an over-long algorithm name is truncated
                 * instead of overrunning alg[].
                 */
                snprintf(alg, sizeof(alg), "hmac(%s)", sp->sctp_hmac_alg);
                tfm = crypto_alloc_shash(alg, 0, 0);
                if (IS_ERR(tfm)) {
                        net_info_ratelimited("failed to load transform for %s: %ld\n",
                                             sp->sctp_hmac_alg, PTR_ERR(tfm));
                        return -ENOSYS;
                }
                sctp_sk(sk)->hmac = tfm;
        }

        /*
         * If a bind() or sctp_bindx() is not called prior to a listen()
         * call that allows new associations to be accepted, the system
         * picks an ephemeral port and will choose an address set equivalent
         * to binding with a wildcard address.
         *
         * This is not currently spelled out in the SCTP sockets
         * extensions draft, but follows the practice as seen in TCP
         * sockets.
         *
         */
        inet_sk_set_state(sk, SCTP_SS_LISTENING);
        if (!ep->base.bind_addr.port) {
                if (sctp_autobind(sk)) {
                        err = -EAGAIN;
                        goto err_closed;
                }
        } else {
                if (sctp_get_port(sk, inet_sk(sk)->inet_num)) {
                        err = -EADDRINUSE;
                        goto err_closed;
                }
        }

        WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
        err = sctp_hash_endpoint(ep);
        if (err)
                goto err_closed;

        return 0;

err_closed:
        /* Undo the state switch made above. */
        inet_sk_set_state(sk, SCTP_SS_CLOSED);
        return err;
}

/*
 * 4.1.3 / 5.1.3 listen()
 *
 *   By default, new associations are not accepted for UDP style sockets.
 *   An application uses listen() to mark a socket as being able to
 *   accept new associations.
 *
 *   On TCP style sockets, applications use listen() to ready the SCTP
 *   endpoint for accepting inbound associations.
 *
 *   On both types of endpoints a backlog of '0' disables listening.
 *
 *  Move a socket to LISTENING state.
 */
int sctp_inet_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;
        struct sctp_endpoint *ep = sctp_sk(sk)->ep;
        int err = -EINVAL;

        /* A negative backlog is rejected before the socket lock is taken. */
        if (unlikely(backlog < 0))
                return err;

        lock_sock(sk);

        /* Refuse peeled-off sockets, sockets that are not unconnected, and
         * sockets that are neither LISTENING nor CLOSED.
         */
        if (sctp_style(sk, UDP_HIGH_BANDWIDTH) ||
            sock->state != SS_UNCONNECTED ||
            (!sctp_sstate(sk, LISTENING) && !sctp_sstate(sk, CLOSED)))
                goto out;

        if (backlog == 0) {
                /* A zero backlog turns listening off; that is an error
                 * (-EINVAL) on a socket that is already CLOSED.
                 */
                if (sctp_sstate(sk, CLOSED))
                        goto out;

                sctp_unhash_endpoint(ep);
                sk->sk_state = SCTP_SS_CLOSED;
                if (sk->sk_reuse || sctp_sk(sk)->reuse)
                        sctp_sk(sk)->bind_hash->fastreuse = 1;
                err = 0;
                goto out;
        }

        if (sctp_sstate(sk, LISTENING)) {
                /* Already listening: only the backlog changes. */
                WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
                err = 0;
        } else {
                /* 0 on success, otherwise the error is passed through. */
                err = sctp_listen_start(sk, backlog);
        }

out:
        release_sock(sk);
        return err;
}

/*
 * This function is done by modeling the current datagram_poll() and the
 * tcp_poll().  Note that, based on these implementations, we don't
 * lock the socket in this function, even though it seems that,
 * ideally, locking or some other mechanisms can be used to ensure
 * the integrity of the counters (sndbuf and wmem_alloc) used
 * in this place.  We assume that we don't need locks either until proven
 * otherwise.
 *
 * Another thing to note is that we include the Async I/O support
 * here, again, by modeling the current TCP/UDP code.  We don't have
 * a good way to test with it yet.
 */
__poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
        struct sock *sk = sock->sk;
        struct sctp_sock *sp = sctp_sk(sk);
        __poll_t mask = 0;

        poll_wait(file, sk_sleep(sk), wait);

        sock_rps_record_flow(sk);

        /* For a listening TCP-style socket the only event reported is
         * readability, which means the endpoint has at least one
         * association queued.
         */
        if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) {
                if (list_empty(&sp->ep->asocs))
                        return 0;
                return EPOLLIN | EPOLLRDNORM;
        }

        /* Pending error, or something on the error queue. */
        if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) {
                mask |= EPOLLERR;
                if (sock_flag(sk, SOCK_SELECT_ERR_QUEUE))
                        mask |= EPOLLPRI;
        }

        /* Shutdown state. */
        if (sk->sk_shutdown & RCV_SHUTDOWN)
                mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
        if (sk->sk_shutdown == SHUTDOWN_MASK)
                mask |= EPOLLHUP;

        /* Readable when the receive queue holds data. */
        if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
                mask |= EPOLLIN | EPOLLRDNORM;

        /* A non-UDP-style socket in CLOSED state is never reported as
         * writable: the association is either gone or not ready.
         */
        if (!sctp_style(sk, UDP) && sctp_sstate(sk, CLOSED))
                return mask;

        if (!sctp_writeable(sk)) {
                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                /* The socket is not locked, so space may have been freed
                 * between the check above and setting the bit, which
                 * could lose an I/O signal.  Check once more, mirroring
                 * the race breaker in tcp_poll().
                 */
                if (!sctp_writeable(sk))
                        return mask;
        }

        return mask | EPOLLOUT | EPOLLWRNORM;
}

/********************************************************************
 * 2nd Level Abstractions
 ********************************************************************/

static struct sctp_bind_bucket *sctp_bucket_create(
        struct sctp_bind_hashbucket *head, struct net *net, unsigned short snum)
{
        struct sctp_bind_bucket *pp;

        pp = kmem_cache_alloc(sctp_bucket_cachep, GFP_ATOMIC);
        if (pp) {
                SCTP_DBG_OBJCNT_INC(bind_bucket);
                pp->port = snum;
                pp->fastreuse = 0;
                INIT_HLIST_HEAD(&pp->owner);
                pp->net = net;
                hlist_add_head(&pp->node, &head->chain);
        }
        return pp;
}

/* Unlink and free a bind bucket once no socket owns it any more.  A NULL
 * bucket, or one that still has owners, is left untouched.
 *
 * Caller must hold the hashbucket lock for this bucket with local BH
 * disabled.
 */
static void sctp_bucket_destroy(struct sctp_bind_bucket *pp)
{
        if (!pp || !hlist_empty(&pp->owner))
                return;

        __hlist_del(&pp->node);
        kmem_cache_free(sctp_bucket_cachep, pp);
        SCTP_DBG_OBJCNT_DEC(bind_bucket);
}

/* Drop this socket's hold on its local port: detach it from the bind
 * bucket, clear its bind_hash and inet_num, and free the bucket if that
 * left it without owners.  Takes the hash bucket lock with plain
 * spin_lock(), so bottom halves are not disabled here.
 */
static inline void __sctp_put_port(struct sock *sk)
{
        struct sctp_bind_hashbucket *head;
        struct sctp_bind_bucket *bucket;

        head = &sctp_port_hashtable[sctp_phashfn(sock_net(sk),
                                                 inet_sk(sk)->inet_num)];

        spin_lock(&head->lock);

        bucket = sctp_sk(sk)->bind_hash;
        __sk_del_bind_node(sk);
        sctp_sk(sk)->bind_hash = NULL;
        inet_sk(sk)->inet_num = 0;
        sctp_bucket_destroy(bucket);

        spin_unlock(&head->lock);
}

/* Release the socket's local port with bottom halves disabled around the
 * call to __sctp_put_port().
 */
void sctp_put_port(struct sock *sk)
{
        local_bh_disable();
        __sctp_put_port(sk);
        local_bh_enable();
}

/*
 * Bind the socket to the wildcard address of its address family, which
 * lets the system choose the address set (and an ephemeral port when the
 * socket has none).  One of those addresses becomes the primary address
 * for the association, so SCTP multihoming is enabled automatically.
 *
 * Returns the result of sctp_do_bind().
 */
static int sctp_autobind(struct sock *sk)
{
        struct sctp_af *af = sctp_sk(sk)->pf->af;
        union sctp_addr autoaddr;

        /* Wildcard address carrying the socket's current port number. */
        af->inaddr_any(&autoaddr, htons(inet_sk(sk)->inet_num));

        return sctp_do_bind(sk, &autoaddr, af->sockaddr_len);
}

/* Parse out IPPROTO_SCTP CMSG headers.  Perform only minimal validation.
 *
 * From RFC 2292
 * 4.2 The cmsghdr Structure *
 *
 * When ancillary data is sent or received, any number of ancillary data
 * objects can be specified by the msg_control and msg_controllen members of
 * the msghdr structure, because each object is preceded by
 * a cmsghdr structure defining the object's length (the cmsg_len member).
 * Historically Berkeley-derived implementations have passed only one object
 * at a time, but this API allows multiple objects to be
 * passed in a single call to sendmsg() or recvmsg(). The following example
 * shows two ancillary data objects in a control buffer.
 *
 *   |<--------------------------- msg_controllen -------------------------->|
 *   |                                                                       |
 *
 *   |<----- ancillary data object ----->|<----- ancillary data object ----->|
 *
 *   |<---------- CMSG_SPACE() --------->|<---------- CMSG_SPACE() --------->|
 *   |                                   |                                   |
 *
 *   |<---------- cmsg_len ---------->|  |<--------- cmsg_len ----------->|  |
 *
 *   |<--------- CMSG_LEN() --------->|  |<-------- CMSG_LEN() ---------->|  |
 *   |                                |  |                                |  |
 *
 *   +-----+-----+-----+--+-----------+--+-----+-----+-----+--+-----------+--+
 *   |cmsg_|cmsg_|cmsg_|XX|           |XX|cmsg_|cmsg_|cmsg_|XX|           |XX|
 *
 *   |len  |level|type |XX|cmsg_data[]|XX|len  |level|type |XX|cmsg_data[]|XX|
 *
 *   +-----+-----+-----+--+-----------+--+-----+-----+-----+--+-----------+--+
 *    ^
 *    |
 *
 * msg_control
 * points here
 */
static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs)
{
        struct msghdr *my_msg = (struct msghdr *)msg;
        struct cmsghdr *cmsg;

        for_each_cmsghdr(cmsg, my_msg) {
                if (!CMSG_OK(my_msg, cmsg))
                        return -EINVAL;

                /* Should we parse this header or ignore?  */
                if (cmsg->cmsg_level != IPPROTO_SCTP)
                        continue;

                /* Strictly check lengths following example in SCM code.  */
                switch (cmsg->cmsg_type) {
                case SCTP_INIT:
                        /* SCTP Socket API Extension
                         * 5.3.1 SCTP Initiation Structure (SCTP_INIT)
                         *
                         * This cmsghdr structure provides information for
                         * initializing new SCTP associations with sendmsg().
                         * The SCTP_INITMSG socket option uses this same data
                         * structure.  This structure is not used for
                         * recvmsg().
                         *
                         * cmsg_level    cmsg_type      cmsg_data[]
                         * ------------  ------------   ----------------------
                         * IPPROTO_SCTP  SCTP_INIT      struct sctp_initmsg
                         */
                        if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_initmsg)))
                                return -EINVAL;

                        cmsgs->init = CMSG_DATA(cmsg);
                        break;

                case SCTP_SNDRCV:
                        /* SCTP Socket API Extension
                         * 5.3.2 SCTP Header Information Structure(SCTP_SNDRCV)
                         *
                         * This cmsghdr structure specifies SCTP options for
                         * sendmsg() and describes SCTP header information
                         * about a received message through recvmsg().
                         *
                         * cmsg_level    cmsg_type      cmsg_data[]
                         * ------------  ------------   ----------------------
                         * IPPROTO_SCTP  SCTP_SNDRCV    struct sctp_sndrcvinfo
                         */
                        if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_sndrcvinfo)))
                                return -EINVAL;

                        cmsgs->srinfo = CMSG_DATA(cmsg);

                        if (cmsgs->srinfo->sinfo_flags &
                            ~(SCTP_UNORDERED | SCTP_ADDR_OVER |
                              SCTP_SACK_IMMEDIATELY | SCTP_SENDALL |
                              SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF))
                                return -EINVAL;
                        break;

                case SCTP_SNDINFO:
                        /* SCTP Socket API Extension
                         * 5.3.4 SCTP Send Information Structure (SCTP_SNDINFO)
                         *
                         * This cmsghdr structure specifies SCTP options for
                         * sendmsg(). This structure and SCTP_RCVINFO replaces
                         * SCTP_SNDRCV which has been deprecated.
                         *
                         * cmsg_level    cmsg_type      cmsg_data[]
                         * ------------  ------------   ---------------------
                         * IPPROTO_SCTP  SCTP_SNDINFO    struct sctp_sndinfo
                         */
                        if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_sndinfo)))
                                return -EINVAL;

                        cmsgs->sinfo = CMSG_DATA(cmsg);

                        if (cmsgs->sinfo->snd_flags &
                            ~(SCTP_UNORDERED | SCTP_ADDR_OVER |
                              SCTP_SACK_IMMEDIATELY | SCTP_SENDALL |
                              SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF))
                                return -EINVAL;
                        break;
                case SCTP_PRINFO:
                        /* SCTP Socket API Extension
                         * 5.3.7 SCTP PR-SCTP Information Structure (SCTP_PRINFO)
                         *
                         * This cmsghdr structure specifies SCTP options for sendmsg().
                         *
                         * cmsg_level    cmsg_type      cmsg_data[]
                         * ------------  ------------   ---------------------
                         * IPPROTO_SCTP  SCTP_PRINFO    struct sctp_prinfo
                         */
                        if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_prinfo)))
                                return -EINVAL;

                        cmsgs->prinfo = CMSG_DATA(cmsg);
                        if (cmsgs->prinfo->pr_policy & ~SCTP_PR_SCTP_MASK)
                                return -EINVAL;

                        if (cmsgs->prinfo->pr_policy == SCTP_PR_SCTP_NONE)
                                cmsgs->prinfo->pr_value = 0;
                        break;
                case SCTP_AUTHINFO:
                        /* SCTP Socket API Extension
                         * 5.3.8 SCTP AUTH Information Structure (SCTP_AUTHINFO)
                         *
                         * This cmsghdr structure specifies SCTP options for sendmsg().
                         *
                         * cmsg_level    cmsg_type      cmsg_data[]
                         * ------------  ------------   ---------------------
                         * IPPROTO_SCTP  SCTP_AUTHINFO  struct sctp_authinfo
                         */
                        if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_authinfo)))
                                return -EINVAL;

                        cmsgs->authinfo = CMSG_DATA(cmsg);
                        break;
                case SCTP_DSTADDRV4:
                case SCTP_DSTADDRV6:
                        /* SCTP Socket API Extension
                         * 5.3.9/10 SCTP Destination IPv4/6 Address Structure (SCTP_DSTADDRV4/6)
                         *
                         * This cmsghdr structure specifies SCTP options for sendmsg().
                         *
                         * cmsg_level    cmsg_type         cmsg_data[]
                         * ------------  ------------   ---------------------
                         * IPPROTO_SCTP  SCTP_DSTADDRV4 struct in_addr
                         * ------------  ------------   ---------------------
                         * IPPROTO_SCTP  SCTP_DSTADDRV6 struct in6_addr
                         */
                        cmsgs->addrs_msg = my_msg;
                        break;
                default:
                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * Wait for a packet..
 * Note: This function is the same function as in core/datagram.c
 * with a few modifications to make lksctp work.
 *
 * @sk:      socket to wait on
 * @err:     written with the value of 'error' on the 'out' path only
 * @timeo_p: remaining timeout; updated with what schedule_timeout() returns
 *
 * Returns 0 when the receive queue is non-empty or after having slept,
 * i.e. when the caller should look at the queue again.  Otherwise returns
 * the value also stored in *err.  Note that this value is 0 when the
 * 'out' path is reached through the RCV_SHUTDOWN check, because
 * sock_error() returned 0 just before it.
 *
 * The socket lock is released while sleeping and re-taken before
 * returning (presumably the caller holds it on entry - confirm against
 * callers).
 */
static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p)
{
        int error;
        DEFINE_WAIT(wait);

        /* Queue ourselves on the socket's wait queue before testing any
         * condition below.
         */
        prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

        /* Socket errors? */
        error = sock_error(sk);
        if (error)
                goto out;

        if (!skb_queue_empty(&sk->sk_receive_queue))
                goto ready;

        /* Socket shut down?  */
        if (sk->sk_shutdown & RCV_SHUTDOWN)
                goto out;

        /* Sequenced packets can come disconnected.  If so we report the
         * problem.
         */
        error = -ENOTCONN;

        /* Is there a good reason to think that we may receive some data?  */
        if (list_empty(&sctp_sk(sk)->ep->asocs) && !sctp_sstate(sk, LISTENING))
                goto out;

        /* Handle signals.  */
        if (signal_pending(current))
                goto interrupted;

        /* Let another process have a go.  Since we are going to sleep
         * anyway.  Note: This may cause odd behaviors if the message
         * does not fit in the user's buffer, but this seems to be the
         * only way to honor MSG_DONTWAIT realistically.
         */
        release_sock(sk);
        *timeo_p = schedule_timeout(*timeo_p);
        lock_sock(sk);

ready:
        finish_wait(sk_sleep(sk), &wait);
        return 0;

interrupted:
        /* Turn the pending signal into an errno based on the timeout. */
        error = sock_intr_errno(*timeo_p);

out:
        finish_wait(sk_sleep(sk), &wait);
        *err = error;
        return error;
}

/* Receive a datagram.
 * Note: This is pretty much the same routine as in core/datagram.c
 * with a few changes to make lksctp work.
 *
 * Returns the next skb from the socket's receive queue.  With MSG_PEEK
 * the skb stays queued and an extra reference is taken on it; otherwise
 * it is dequeued.  Returns NULL when nothing could be obtained; in that
 * case *err is set either here (socket error, or -EAGAIN when the timeout
 * is zero) or by sctp_wait_for_packet().  It is left untouched when the
 * receive side is found shut down.
 */
struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, int *err)
{
        struct sk_buff *skb;
        long timeo;
        int error;

        timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

        pr_debug("%s: timeo:%ld, max:%ld\n", __func__, timeo,
                 MAX_SCHEDULE_TIMEOUT);

        for (;;) {
                /* Only user level code calls this function, so nothing at
                 * interrupt level will suddenly eat the receive_queue.
                 */
                if (flags & MSG_PEEK) {
                        skb = skb_peek(&sk->sk_receive_queue);
                        if (skb)
                                refcount_inc(&skb->users);
                } else {
                        skb = __skb_dequeue(&sk->sk_receive_queue);
                }

                if (skb)
                        return skb;

                /* Caller is allowed not to check sk->sk_err before calling. */
                error = sock_error(sk);
                if (error)
                        break;

                if (sk->sk_shutdown & RCV_SHUTDOWN)
                        return NULL;

                /* User doesn't want to wait.  */
                if (!timeo) {
                        error = -EAGAIN;
                        break;
                }

                /* A non-zero result means the wait gave up. */
                if (sctp_wait_for_packet(sk, err, &timeo) != 0)
                        return NULL;
        }

        *err = error;
        return NULL;
}

/* If sndbuf has changed, wake up per association sndbuf waiters.
 *
 * Nothing happens unless the association has send space available.  When
 * it does, sleepers on the association's own wait queue are woken; if the
 * socket as a whole is writeable, sleepers on the socket wait queue are
 * woken too and an async "space available" notification is sent unless
 * sending has been shut down.
 */
static void __sctp_write_space(struct sctp_association *asoc)
{
        struct sock *sk = asoc->base.sk;
        struct socket_wq *wq;

        if (sctp_wspace(asoc) <= 0)
                return;

        if (waitqueue_active(&asoc->wait))
                wake_up_interruptible(&asoc->wait);

        if (!sctp_writeable(sk))
                return;

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (wq) {
                if (waitqueue_active(&wq->wait))
                        wake_up_interruptible(&wq->wait);

                /* Async I/O support modeled on the TCP/UDP code; it has
                 * not been tested yet.
                 */
                if (!(sk->sk_shutdown & SEND_SHUTDOWN))
                        sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
        }
        rcu_read_unlock();
}

/* Wake up senders waiting for sndbuf space after 'asoc' freed some.
 *
 * With per-association accounting only 'asoc' itself is woken.  For a
 * dead association the ordinary socket-level notification is used.
 * Otherwise every association on the socket is given a turn, starting
 * with the one after 'asoc' and finishing with 'asoc'.
 */
static void sctp_wake_up_waiters(struct sock *sk,
                                 struct sctp_association *asoc)
{
        struct sctp_association *cur = asoc;

        /* We do accounting for the sndbuf space per association,
         * so we only need to wake our own association.
         */
        if (asoc->ep->sndbuf_policy) {
                __sctp_write_space(asoc);
                return;
        }

        /* If association goes down and is just flushing its
         * outq, then just normally notify others.
         */
        if (asoc->base.dead) {
                sctp_write_space(sk);
                return;
        }

        /* Accounting is per socket, so the other associations need waking
         * as well; to be fair they go first.  We only get here when an
         * association frees queued chunks, thus we are under lock and the
         * socket's list of associations cannot change while we walk it.
         */
        do {
                cur = list_next_entry(cur, asocs);

                /* The list head is not an association: step over it. */
                if (&cur->asocs != &((sctp_sk(sk))->ep->asocs))
                        __sctp_write_space(cur);
        } while (cur != asoc);
}

/* Do accounting for the sndbuf space.
 * Decrement the used sndbuf space of the corresponding association by the
 * data size which was just transmitted(freed).
 *
 * The chunk is taken from the skb's destructor_arg.  Besides the
 * accounting, this releases the chunk's shared key (if any), calls
 * sock_wfree(), wakes up senders waiting for space and drops a reference
 * on the association.
 */
static void sctp_wfree(struct sk_buff *skb)
{
        struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg;
        struct sctp_association *asoc = chunk->asoc;
        struct sock *sk = asoc->base.sk;

        /* Give back what was charged for this skb: the memory charge, the
         * queued-bytes counter and the association's own usage.  The last
         * two also cover a struct sctp_chunk worth of overhead.
         */
        sk_mem_uncharge(sk, skb->truesize);
        sk_wmem_queued_add(sk, -(skb->truesize + sizeof(struct sctp_chunk)));
        asoc->sndbuf_used -= skb->truesize + sizeof(struct sctp_chunk);
        /* Removing the chunk overhead is not expected to bring
         * sk_wmem_alloc down to zero; warn if it does.
         */
        WARN_ON(refcount_sub_and_test(sizeof(struct sctp_chunk),
                                      &sk->sk_wmem_alloc));

        if (chunk->shkey) {
                struct sctp_shared_key *shkey = chunk->shkey;

                /* refcnt == 2 and !list_empty mean after this release, it's
                 * not being used anywhere, and it's time to notify userland
                 * that this shkey can be freed if it's been deactivated.
                 */
                if (shkey->deactivated && !list_empty(&shkey->key_list) &&
                    refcount_read(&shkey->refcnt) == 2) {
                        struct sctp_ulpevent *ev;

                        /* NOTE(review): GFP_KERNEL allocations may sleep;
                         * confirm this destructor is never invoked from
                         * atomic context.
                         */
                        ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id,
                                                        SCTP_AUTH_FREE_KEY,
                                                        GFP_KERNEL);
                        /* A failed allocation just skips the notification. */
                        if (ev)
                                asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
                }
                sctp_auth_shkey_release(chunk->shkey);
        }

        sock_wfree(skb);
        sctp_wake_up_waiters(sk, asoc);

        sctp_association_put(asoc);
}

/* Receive-side destructor for cloned data skbs.
 *
 * Undoes the socket-level receive accounting for @skb, using the rmem_len
 * recorded in the ulpevent carried by the skb.  The per-association part
 * of the accounting is handled in ulpevent.c.  Installing this as the skb
 * destructor makes the accounting happen at the correct time.
 */
void sctp_sock_rfree(struct sk_buff *skb)
{
        struct sctp_ulpevent *ev = sctp_skb2event(skb);
        struct sock *owner = skb->sk;

        /* Give the bytes back to the socket's receive allocation ... */
        atomic_sub(ev->rmem_len, &owner->sk_rmem_alloc);

        /* ... and uncharge them, mimicking what sock_rfree does. */
        sk_mem_uncharge(owner, ev->rmem_len);
}


/* Helper function to wait for space in the sndbuf.
 *
 * Sleeps interruptibly on @asoc's wait queue until @msg_len bytes fit into
 * the association's write space and can be scheduled against the socket.
 * @timeo_p holds the remaining timeout; it is updated after every sleep.
 *
 * The socket lock is released around each sleep and re-taken afterwards,
 * so the caller presumably holds it on entry (not visible from here).
 *
 * Returns 0 once the space is available, otherwise:
 *   -ESRCH   the association is marked dead,
 *   -EAGAIN  the timeout is (or has become) zero,
 *   -EPIPE   the socket has an error, the association is in
 *            SHUTDOWN_PENDING or a later state, or the association no
 *            longer belongs to this socket after the sleep,
 *   sock_intr_errno(*timeo_p) when a signal is pending.
 */
static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
                                size_t msg_len)
{
        struct sock *sk = asoc->base.sk;
        long current_timeo = *timeo_p;
        DEFINE_WAIT(wait);
        int err = 0;

        pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc,
                 *timeo_p, msg_len);

        /* Increment the association's refcnt.  */
        sctp_association_hold(asoc);

        /* Wait on the association specific sndbuf space. */
        for (;;) {
                prepare_to_wait_exclusive(&asoc->wait, &wait,
                                          TASK_INTERRUPTIBLE);
                /* The exit conditions are tested in a fixed priority
                 * order: dead association, expired timeout, error or
                 * shutdown, pending signal, and only then success.
                 */
                if (asoc->base.dead)
                        goto do_dead;
                if (!*timeo_p)
                        goto do_nonblock;
                if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING)
                        goto do_error;
                if (signal_pending(current))
                        goto do_interrupted;
                if ((int)msg_len <= sctp_wspace(asoc) &&
                    sk_wmem_schedule(sk, msg_len))
                        break;

                /* Let another process have a go.  Since we are going
                 * to sleep anyway.
                 */
                release_sock(sk);
                current_timeo = schedule_timeout(current_timeo);
                lock_sock(sk);
                /* The association may have been moved to a different
                 * socket while the lock was dropped; sk would then be
                 * the wrong socket to keep waiting on.
                 */
                if (sk != asoc->base.sk)
                        goto do_error;

                *timeo_p = current_timeo;
        }

out:
        /* Common exit: leave the wait queue and drop our reference. */
        finish_wait(&asoc->wait, &wait);

        /* Release the association's refcnt.  */
        sctp_association_put(asoc);

        return err;

do_dead:
        err = -ESRCH;
        goto out;

do_error:
        err = -EPIPE;
        goto out;

do_interrupted:
        err = sock_intr_errno(*timeo_p);
        goto out;

do_nonblock:
        err = -EAGAIN;
        goto out;
}

/* Signal that @sk has data to read.
 *
 * Emits the sk_data_ready tracepoint and then, inside an RCU read-side
 * section, wakes pollers sleeping on the socket wait queue (if there are
 * any) and sends the async POLL_IN notification.
 */
void sctp_data_ready(struct sock *sk)
{
        struct socket_wq *waitq;

        trace_sk_data_ready(sk);

        rcu_read_lock();

        waitq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(waitq)) {
                wake_up_interruptible_sync_poll(&waitq->wait,
                                                EPOLLIN | EPOLLRDNORM |
                                                EPOLLRDBAND);
        }
        sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);

        rcu_read_unlock();
}

/* If socket sndbuf has changed, wake up all per association waiters.  */
void sctp_write_space(struct sock *sk)
{
        struct sctp_association *asoc;

        /* Wake up the tasks in each wait queue.  */
        list_for_each_entry(asoc, &((sctp_sk(sk))->ep->asocs), asocs) {
                __sctp_write_space(asoc);
        }
}

/* Is there any sndbuf space available on the socket?
 *
 * Note that sk_wmem_alloc is the sum of the send buffers on all of the
 * associations on the same socket.  For a UDP-style socket with
 * multiple associations, it is possible for it to be "unwriteable"
 * prematurely.  I assume that this is acceptable because
 * a premature "unwriteable" is better than an accidental "writeable" which
 * would cause an unwanted block under certain circumstances.  For the 1-1
 * UDP-style sockets or TCP-style sockets, this code should work.
 *  - Daisy
 */
static bool sctp_writeable(const struct sock *sk)
{
        /* Writeable as long as the queued write memory is still strictly
         * below the send buffer size; both fields are sampled with
         * READ_ONCE().
         */
        return READ_ONCE(sk->sk_wmem_queued) < READ_ONCE(sk->sk_sndbuf);
}

/* Wait for an association to go into ESTABLISHED state. If timeout is 0,
 * returns immediately with EINPROGRESS.
 *
 * @timeo_p holds the remaining timeout and is updated after every sleep.
 * The socket lock is released around each sleep and re-taken afterwards,
 * so the caller presumably holds it on entry (not visible from here).
 *
 * Returns 0 when the association is ESTABLISHED, and also when the socket
 * has RCV_SHUTDOWN set.  Otherwise:
 *   -EINPROGRESS  the timeout is (or has become) zero,
 *   -ETIMEDOUT    error/shutdown/dead association and the init error
 *                 counter has passed max_init_attempts,
 *   -ECONNREFUSED error/shutdown/dead association otherwise,
 *   sock_intr_errno(*timeo_p) when a signal is pending.
 */
static int sctp_wait_for_connect(struct sctp_association *asoc, long *timeo_p)
{
        struct sock *sk = asoc->base.sk;
        int err = 0;
        long current_timeo = *timeo_p;
        DEFINE_WAIT(wait);

        pr_debug("%s: asoc:%p, timeo:%ld\n", __func__, asoc, *timeo_p);

        /* Increment the association's refcnt.  */
        sctp_association_hold(asoc);

        for (;;) {
                prepare_to_wait_exclusive(&asoc->wait, &wait,
                                          TASK_INTERRUPTIBLE);
                if (!*timeo_p)
                        goto do_nonblock;
                /* A receive shutdown ends the wait with err still 0. */
                if (sk->sk_shutdown & RCV_SHUTDOWN)
                        break;
                if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING ||
                    asoc->base.dead)
                        goto do_error;
                if (signal_pending(current))
                        goto do_interrupted;

                if (sctp_state(asoc, ESTABLISHED))
                        break;

                /* Let another process have a go.  Since we are going
                 * to sleep anyway.
                 */
                release_sock(sk);
                current_timeo = schedule_timeout(current_timeo);
                lock_sock(sk);

                *timeo_p = current_timeo;
        }

out:
        /* Common exit: leave the wait queue and drop our reference. */
        finish_wait(&asoc->wait, &wait);

        /* Release the association's refcnt.  */
        sctp_association_put(asoc);

        return err;

do_error:
        /* Distinguish "ran out of INIT attempts" from other failures. */
        if (asoc->init_err_counter + 1 > asoc->max_init_attempts)
                err = -ETIMEDOUT;
        else
                err = -ECONNREFUSED;
        goto out;

do_interrupted:
        err = sock_intr_errno(*timeo_p);
        goto out;

do_nonblock:
        err = -EINPROGRESS;
        goto out;
}

/* Wait until the endpoint of listening socket @sk has at least one
 * association, for at most @timeo.
 *
 * The socket lock is released around each sleep and re-taken afterwards,
 * so the caller presumably holds it on entry (not visible from here).
 *
 * Returns 0 when an association is present, otherwise:
 *   -EINVAL  the socket is not (or no longer) in the LISTENING state,
 *   -EAGAIN  the timeout has run out,
 *   sock_intr_errno(timeo) when a signal is pending.
 */
static int sctp_wait_for_accept(struct sock *sk, long timeo)
{
        struct sctp_endpoint *ep;
        int err = 0;
        DEFINE_WAIT(wait);

        ep = sctp_sk(sk)->ep;


        for (;;) {
                prepare_to_wait_exclusive(sk_sleep(sk), &wait,
                                          TASK_INTERRUPTIBLE);

                /* Only sleep while there is nothing to accept. */
                if (list_empty(&ep->asocs)) {
                        release_sock(sk);
                        timeo = schedule_timeout(timeo);
                        lock_sock(sk);
                }

                /* err is preset before each test so that a break leaves
                 * the matching result in place.  The order of the tests
                 * therefore decides which result wins.
                 */
                err = -EINVAL;
                if (!sctp_sstate(sk, LISTENING))
                        break;

                err = 0;
                if (!list_empty(&ep->asocs))
                        break;

                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        break;

                err = -EAGAIN;
                if (!timeo)
                        break;
        }

        finish_wait(sk_sleep(sk), &wait);

        return err;
}

/* Sleep interruptibly until the endpoint of @sk has no associations left,
 * a signal is pending, or @timeout has run out.  The socket lock is
 * released for the duration of each sleep and re-taken afterwards.
 */
static void sctp_wait_for_close(struct sock *sk, long timeout)
{
        DEFINE_WAIT(wait);

        for (;;) {
                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

                /* All associations are gone: nothing left to wait for. */
                if (list_empty(&sctp_sk(sk)->ep->asocs))
                        break;

                release_sock(sk);
                timeout = schedule_timeout(timeout);
                lock_sock(sk);

                /* Give up on a pending signal or an exhausted timeout. */
                if (signal_pending(current) || !timeout)
                        break;
        }

        finish_wait(sk_sleep(sk), &wait);
}

/* Make @sk the receive owner of @skb and of every skb reachable through
 * its fragment list.  Fragments are handled recursively, and before the
 * skb that carries them.
 */
static void sctp_skb_set_owner_r_frag(struct sk_buff *skb, struct sock *sk)
{
        struct sk_buff *child;

        /* The fragment list is only walked when data_len is non-zero. */
        if (skb->data_len) {
                skb_walk_frags(skb, child)
                        sctp_skb_set_owner_r_frag(child, sk);
        }

        sctp_skb_set_owner_r(skb, sk);
}

void sctp_copy_sock(struct sock *newsk, struct sock *sk,
                    struct sctp_association *asoc)
{
        struct inet_sock *inet = inet_sk(sk);
        struct inet_sock *newinet;
        struct sctp_sock *sp = sctp_sk(sk);

        newsk->sk_type = sk->sk_type;
        newsk->sk_bound_dev_if = sk->sk_bound_dev_if;
        newsk->sk_flags = sk->sk_flags;
        newsk->sk_tsflags = sk->sk_tsflags;
        newsk->sk_no_check_tx = sk->sk_no_check_tx;
        newsk->sk_no_check_rx = sk->sk_no_check_rx;
        newsk->sk_reuse = sk->sk_reuse;
        sctp_sk(newsk)->reuse = sp->reuse;

        newsk->sk_shutdown = sk->sk_shutdown;
        newsk->sk_destruct = sk->sk_destruct;
        newsk->sk_family = sk->sk_family;
        newsk->sk_protocol = IPPROTO_SCTP;
        newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
        newsk->sk_sndbuf = sk->sk_sndbuf;
        newsk->sk_rcvbuf = sk->sk_rcvbuf;
        newsk->sk_lingertime = sk->sk_lingertime;
        newsk->sk_rcvtimeo = sk->sk_rcvtimeo;
        newsk->sk_sndtimeo = sk->sk_sndtimeo;
        newsk->sk_rxhash = sk->sk_rxhash;

        newinet = inet_sk(newsk);

        /* Initialize sk's sport, dport, rcv_saddr and daddr for
         * getsockname() and getpeername()
         */
        newinet->inet_sport = inet->inet_sport;
        newinet->inet_saddr = inet->inet_saddr;
        newinet->inet_rcv_saddr = inet->inet_rcv_saddr;
        newinet->inet_dport = htons(asoc->peer.port);
        newinet->pmtudisc = inet->pmtudisc;
        atomic_set(&newinet->inet_id, get_random_u16());

        newinet->uc_ttl = inet->uc_ttl;
        inet_set_bit(MC_LOOP, newsk);
        newinet->mc_ttl = 1;
        newinet->mc_index = 0;
        newinet->mc_list = NULL;

        if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
                net_enable_timestamp();

        /* Set newsk security attributes from original sk and connection
         * security attribute from asoc.
         */
        security_sctp_sk_clone(asoc, sk, newsk);
}

/* Forward to __inet_sk_copy_descendant() to copy from @sk_from to @sk_to.
 *
 * The size handed over is sizeof(struct inet_sock) plus the protocol's
 * object size, less the offset of pd_lobby within struct sctp_sock.
 */
static inline void sctp_copy_descendant(struct sock *sk_to,
                                        const struct sock *sk_from)
{
        const size_t ancestor_size = sizeof(struct inet_sock) +
                                     sk_from->sk_prot->obj_size -
                                     offsetof(struct sctp_sock, pd_lobby);

        __inet_sk_copy_descendant(sk_to, sk_from, ancestor_size);
}

/* Populate the fields of the newsk from the oldsk and migrate the assoc
 * and its messages to the newsk.
 *
 * @type is stored as the new socket's SCTP socket type.
 *
 * Returns 0 on success, or the error returned by sctp_bind_addr_dup() or
 * sctp_auth_init_hmacs().  On those error paths newsk has already been
 * added to the bind hash; nothing is undone here, so cleanup is presumably
 * left to the caller (not visible from here).
 */
static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
                             struct sctp_association *assoc,
                             enum sctp_socket_type type)
{
        struct sctp_sock *oldsp = sctp_sk(oldsk);
        struct sctp_sock *newsp = sctp_sk(newsk);
        struct sctp_bind_bucket *pp; /* hash list port iterator */
        struct sctp_endpoint *newep = newsp->ep;
        struct sk_buff *skb, *tmp;
        struct sctp_ulpevent *event;
        struct sctp_bind_hashbucket *head;
        int err;

        /* Migrate socket buffer sizes and all the socket level options to the
         * new socket.
         */
        newsk->sk_sndbuf = oldsk->sk_sndbuf;
        newsk->sk_rcvbuf = oldsk->sk_rcvbuf;
        /* Brute force copy old sctp opt. */
        sctp_copy_descendant(newsk, oldsk);

        /* Restore the ep value that was overwritten with the above structure
         * copy.
         */
        newsp->ep = newep;
        /* The hmac pointer is not inherited from the old socket. */
        newsp->hmac = NULL;

        /* Hook this new socket in to the bind_hash list. */
        head = &sctp_port_hashtable[sctp_phashfn(sock_net(oldsk),
                                                 inet_sk(oldsk)->inet_num)];
        spin_lock_bh(&head->lock);
        pp = sctp_sk(oldsk)->bind_hash;
        sk_add_bind_node(newsk, &pp->owner);
        sctp_sk(newsk)->bind_hash = pp;
        inet_sk(newsk)->inet_num = inet_sk(oldsk)->inet_num;
        spin_unlock_bh(&head->lock);

        /* Copy the bind_addr list from the original endpoint to the new
         * endpoint so that we can handle restarts properly
         */
        err = sctp_bind_addr_dup(&newsp->ep->base.bind_addr,
                                 &oldsp->ep->base.bind_addr, GFP_KERNEL);
        if (err)
                return err;

        /* New ep's auth_hmacs should be set if old ep's is set, in case
         * that net->sctp.auth_enable has been changed to 0 by users and
         * new ep's auth_hmacs couldn't be set in sctp_endpoint_init().
         */
        if (oldsp->ep->auth_hmacs) {
                err = sctp_auth_init_hmacs(newsp->ep, GFP_KERNEL);
                if (err)
                        return err;
        }

        sctp_auto_asconf_init(newsp);

        /* Move any messages in the old socket's receive queue that are for the
         * peeled off association to the new socket's receive queue.
         */
        sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) {
                event = sctp_skb2event(skb);
                if (event->asoc == assoc) {
                        __skb_unlink(skb, &oldsk->sk_receive_queue);
                        __skb_queue_tail(&newsk->sk_receive_queue, skb);
                        sctp_skb_set_owner_r_frag(skb, newsk);
                }
        }

        /* Clean up any messages pending delivery due to partial
         * delivery.   Three cases:
         * 1) No partial deliver;  no work.
         * 2) Peeling off partial delivery; keep pd_lobby in new pd_lobby.
         * 3) Peeling off non-partial delivery; move pd_lobby to receive_queue.
         */
        atomic_set(&sctp_sk(newsk)->pd_mode, assoc->ulpq.pd_mode);

        if (atomic_read(&sctp_sk(oldsk)->pd_mode)) {
                struct sk_buff_head *queue;

                /* Decide which queue to move pd_lobby skbs to. */
                if (assoc->ulpq.pd_mode) {
                        queue = &newsp->pd_lobby;
                } else
                        queue = &newsk->sk_receive_queue;

                /* Walk through the pd_lobby, looking for skbs that
                 * need to be moved to the new socket.
                 */
                sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) {
                        event = sctp_skb2event(skb);
                        if (event->asoc == assoc) {
                                __skb_unlink(skb, &oldsp->pd_lobby);
                                __skb_queue_tail(queue, skb);
                                sctp_skb_set_owner_r_frag(skb, newsk);
                        }
                }

                /* Clear up any skbs waiting for the partial
                 * delivery to finish.
                 */
                if (assoc->ulpq.pd_mode)
                        sctp_clear_pd(oldsk, NULL);

        }

        /* Re-own the association's remaining receive-side skbs as well. */
        sctp_for_each_rx_skb(assoc, newsk, sctp_skb_set_owner_r_frag);

        /* Set the type of socket to indicate that it is peeled off from the
         * original UDP-style socket or created with the accept() call on a
         * TCP-style socket.
         */
        newsp->type = type;

        /* Mark the new socket "in-use" by the user so that any packets
         * that may arrive on the association after we've moved it are
         * queued to the backlog.  This prevents a potential race between
         * backlog processing on the old socket and new-packet processing
         * on the new socket.
         *
         * The caller has just allocated newsk so we can guarantee that other
         * paths won't try to lock it and then oldsk.
         */
        lock_sock_nested(newsk, SINGLE_DEPTH_NESTING);
        /* Write ownership of the queued data chunks is cleared before the
         * association changes socket and set again afterwards.
         */
        sctp_for_each_tx_datachunk(assoc, true, sctp_clear_owner_w);
        sctp_assoc_migrate(assoc, newsk);
        sctp_for_each_tx_datachunk(assoc, false, sctp_set_owner_w);

        /* If the association on the newsk is already closed before accept()
         * is called, set RCV_SHUTDOWN flag.
         */
        if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP)) {
                inet_sk_set_state(newsk, SCTP_SS_CLOSED);
                newsk->sk_shutdown |= RCV_SHUTDOWN;
        } else {
                inet_sk_set_state(newsk, SCTP_SS_ESTABLISHED);
        }

        release_sock(newsk);

        return 0;
}


/* This proto struct describes the ULP interface for SCTP.
 *
 * It wires the generic socket operations to their sctp_* handlers and
 * supplies the object size, user-copy window, sysctl limits and memory
 * accounting counters for SCTP sockets based on struct sctp_sock.
 */
struct proto sctp_prot = {
        .name        =        "SCTP",
        .owner       =        THIS_MODULE,
        .close       =        sctp_close,
        .disconnect  =        sctp_disconnect,
        .accept      =        sctp_accept,
        .ioctl       =        sctp_ioctl,
        .init        =        sctp_init_sock,
        .destroy     =        sctp_destroy_sock,
        .shutdown    =        sctp_shutdown,
        .setsockopt  =        sctp_setsockopt,
        .getsockopt  =        sctp_getsockopt,
        .bpf_bypass_getsockopt        = sctp_bpf_bypass_getsockopt,
        .sendmsg     =        sctp_sendmsg,
        .recvmsg     =        sctp_recvmsg,
        .bind        =        sctp_bind,
        .bind_add    =  sctp_bind_add,
        .backlog_rcv =        sctp_backlog_rcv,
        .hash        =        sctp_hash,
        .unhash      =        sctp_unhash,
        .no_autobind =        true,
        .obj_size    =  sizeof(struct sctp_sock),
        /* The user-copy window starts at the subscribe member and runs
         * through the end of the initmsg member of struct sctp_sock.
         */
        .useroffset  =  offsetof(struct sctp_sock, subscribe),
        .usersize    =  offsetof(struct sctp_sock, initmsg) -
                                offsetof(struct sctp_sock, subscribe) +
                                sizeof_field(struct sctp_sock, initmsg),
        .sysctl_mem  =  sysctl_sctp_mem,
        .sysctl_rmem =  sysctl_sctp_rmem,
        .sysctl_wmem =  sysctl_sctp_wmem,
        .memory_pressure = &sctp_memory_pressure,
        .enter_memory_pressure = sctp_enter_memory_pressure,

        .memory_allocated = &sctp_memory_allocated,
        .per_cpu_fw_alloc = &sctp_memory_per_cpu_fw_alloc,

        .sockets_allocated = &sctp_sockets_allocated,
};

#if IS_ENABLED(CONFIG_IPV6)

/* Socket destructor for SCTP over IPv6, installed as sk_destruct by
 * sctp_v6_init_sock(): runs the common SCTP teardown first and then the
 * generic inet6 socket destructor.
 */
static void sctp_v6_destruct_sock(struct sock *sk)
{
        sctp_destruct_common(sk);
        inet6_sock_destruct(sk);
}

/* Init hook for SCTP over IPv6 sockets.
 *
 * Runs the common sctp_init_sock() and, only when that succeeds, installs
 * the IPv6-aware destructor.  Returns whatever sctp_init_sock() returned.
 */
static int sctp_v6_init_sock(struct sock *sk)
{
        int err = sctp_init_sock(sk);

        if (err)
                return err;

        sk->sk_destruct = sctp_v6_destruct_sock;
        return 0;
}

/* Proto description for SCTP over IPv6.
 *
 * Uses the same sctp_* handlers as sctp_prot except for .init, which is
 * sctp_v6_init_sock.  Sizes and offsets are taken from struct sctp6_sock,
 * whose SCTP part is reached through its sctp member.
 */
struct proto sctpv6_prot = {
        .name                = "SCTPv6",
        .owner                = THIS_MODULE,
        .close                = sctp_close,
        .disconnect        = sctp_disconnect,
        .accept                = sctp_accept,
        .ioctl                = sctp_ioctl,
        .init                = sctp_v6_init_sock,
        .destroy        = sctp_destroy_sock,
        .shutdown        = sctp_shutdown,
        .setsockopt        = sctp_setsockopt,
        .getsockopt        = sctp_getsockopt,
        .bpf_bypass_getsockopt        = sctp_bpf_bypass_getsockopt,
        .sendmsg        = sctp_sendmsg,
        .recvmsg        = sctp_recvmsg,
        .bind                = sctp_bind,
        .bind_add        = sctp_bind_add,
        .backlog_rcv        = sctp_backlog_rcv,
        .hash                = sctp_hash,
        .unhash                = sctp_unhash,
        .no_autobind        = true,
        .obj_size        = sizeof(struct sctp6_sock),
        .ipv6_pinfo_offset = offsetof(struct sctp6_sock, inet6),
        /* The user-copy window starts at sctp.subscribe and runs through
         * the end of sctp.initmsg.
         */
        .useroffset        = offsetof(struct sctp6_sock, sctp.subscribe),
        .usersize        = offsetof(struct sctp6_sock, sctp.initmsg) -
                                offsetof(struct sctp6_sock, sctp.subscribe) +
                                sizeof_field(struct sctp6_sock, sctp.initmsg),
        .sysctl_mem        = sysctl_sctp_mem,
        .sysctl_rmem        = sysctl_sctp_rmem,
        .sysctl_wmem        = sysctl_sctp_wmem,
        .memory_pressure = &sctp_memory_pressure,
        .enter_memory_pressure = sctp_enter_memory_pressure,

        .memory_allocated = &sctp_memory_allocated,
        .per_cpu_fw_alloc = &sctp_memory_per_cpu_fw_alloc,

        .sockets_allocated = &sctp_sockets_allocated,
};
#endif /* IS_ENABLED(CONFIG_IPV6) */



















































    1 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_ERR_H
#define _LINUX_ERR_H

#include <linux/compiler.h>
#include <linux/types.h>

#include <asm/errno.h>

/*
 * Kernel pointers have redundant information, so we can use a
 * scheme where we can return either an error code or a normal
 * pointer with the same return value.
 *
 * This should be a per-architecture thing, to allow different
 * error and pointer decisions.
 */
#define MAX_ERRNO        4095

#ifndef __ASSEMBLY__

/**
 * IS_ERR_VALUE - Detect an error pointer.
 * @x: The pointer to check.
 *
 * Like IS_ERR(), but does not generate a compiler warning if result is unused.
 *
 * The test is true when @x, converted to unsigned long, is greater than or
 * equal to (unsigned long)-MAX_ERRNO, i.e. when it falls within the topmost
 * MAX_ERRNO values of the unsigned long range.  The result is wrapped in
 * unlikely().
 */
#define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO)

/**
 * ERR_PTR - Create an error pointer.
 * @error: A negative error code.
 *
 * Encodes @error into a pointer value. Users should consider the result
 * opaque and not assume anything about how the error is encoded.
 *
 * No range check is applied to @error here; the value is converted as is.
 *
 * Return: A pointer with @error encoded within its value.
 */
static inline void * __must_check ERR_PTR(long error)
{
        return (void *) error;
}

/**
 * PTR_ERR - Extract the error code from an error pointer.
 * @ptr: An error pointer.
 *
 * This is a plain conversion; it does not verify that @ptr actually is an
 * error pointer, so callers should test with IS_ERR() first.
 *
 * Return: The error code within @ptr.
 */
static inline long __must_check PTR_ERR(__force const void *ptr)
{
        return (long) ptr;
}

/**
 * IS_ERR - Detect an error pointer.
 * @ptr: The pointer to check.
 *
 * Applies IS_ERR_VALUE() to the pointer's value.  A NULL pointer is not
 * reported as an error; use IS_ERR_OR_NULL() when NULL must be caught too.
 *
 * Return: true if @ptr is an error pointer, false otherwise.
 */
static inline bool __must_check IS_ERR(__force const void *ptr)
{
        return IS_ERR_VALUE((unsigned long)ptr);
}

/**
 * IS_ERR_OR_NULL - Detect an error pointer or a null pointer.
 * @ptr: The pointer to check.
 *
 * Same test as IS_ERR(), except that a null @ptr is reported as well.
 *
 * Return: true if @ptr is null or an error pointer, false otherwise.
 */
static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr)
{
        /* The null case is checked first and hinted as unlikely. */
        if (unlikely(!ptr))
                return true;

        return IS_ERR_VALUE((unsigned long)ptr);
}

/**
 * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type
 * @ptr: The pointer to cast.
 *
 * Explicitly cast an error-valued pointer to another pointer type in such a
 * way as to make it clear that's what's going on.
 *
 * The pointer value itself is returned unchanged; only the const qualifier
 * is dropped.  No check is made that @ptr really holds an error value.
 *
 * Return: @ptr as a non-const void pointer.
 */
static inline void * __must_check ERR_CAST(__force const void *ptr)
{
        /* cast away the const */
        return (void *) ptr;
}

/**
 * PTR_ERR_OR_ZERO - Extract the error code from a pointer if it has one.
 * @ptr: A potential error pointer.
 *
 * Helper for functions that return an error code and need to pass on a
 * failure they received in the form of an error pointer.  Writing
 * ``return PTR_ERR_OR_ZERO(ptr);`` is shorthand for:
 *
 * .. code-block:: c
 *
 *        if (IS_ERR(ptr))
 *                return PTR_ERR(ptr);
 *        else
 *                return 0;
 *
 * Return: The error code within @ptr if it is an error pointer; 0 otherwise.
 */
static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
{
        /* A pointer that is not an error pointer carries no error code. */
        if (!IS_ERR(ptr))
                return 0;

        return PTR_ERR(ptr);
}

#endif

#endif /* _LINUX_ERR_H */

























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * AppArmor security module
 *
 * This file contains AppArmor network mediation definitions.
 *
 * Copyright (C) 1998-2008 Novell/SUSE
 * Copyright 2009-2017 Canonical Ltd.
 */

#ifndef __AA_NET_H
#define __AA_NET_H

#include <net/sock.h>
#include <linux/path.h>

#include "apparmorfs.h"
#include "label.h"
#include "perms.h"
#include "policy.h"

/*
 * Network permission names.  Send, receive, shutdown and connect are
 * aliases for existing AA_MAY_* permissions (write, read, delete, open);
 * accept, bind, listen, setopt and getopt are given bit values of their
 * own below.
 */
#define AA_MAY_SEND                AA_MAY_WRITE
#define AA_MAY_RECEIVE                AA_MAY_READ

#define AA_MAY_SHUTDOWN                AA_MAY_DELETE

#define AA_MAY_CONNECT                AA_MAY_OPEN
#define AA_MAY_ACCEPT                0x00100000

#define AA_MAY_BIND                0x00200000
#define AA_MAY_LISTEN                0x00400000

#define AA_MAY_SETOPT                0x01000000
#define AA_MAY_GETOPT                0x02000000

/* Union of the network permissions named above plus create, setattr and
 * getattr.
 */
#define NET_PERMS_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CREATE |    \
                        AA_MAY_SHUTDOWN | AA_MAY_BIND | AA_MAY_LISTEN |          \
                        AA_MAY_CONNECT | AA_MAY_ACCEPT | AA_MAY_SETATTR | \
                        AA_MAY_GETATTR | AA_MAY_SETOPT | AA_MAY_GETOPT)

/* Permission set that mixes send/receive/create/shutdown/connect with
 * file-style permissions (rename, setattr, getattr, chmod, chown, chgrp,
 * lock, mprot).  Presumably meant for sockets that live in the filesystem;
 * confirm against the users of this mask.
 */
#define NET_FS_PERMS (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CREATE |        \
                      AA_MAY_SHUTDOWN | AA_MAY_CONNECT | AA_MAY_RENAME |\
                      AA_MAY_SETATTR | AA_MAY_GETATTR | AA_MAY_CHMOD |        \
                      AA_MAY_CHOWN | AA_MAY_CHGRP | AA_MAY_LOCK |        \
                      AA_MAY_MPROT)

/* The four permissions that involve a peer: send, receive, connect and
 * accept.
 */
#define NET_PEER_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CONNECT |        \
                       AA_MAY_ACCEPT)
/* Per-socket AppArmor context; aa_sock() returns it from sk->sk_security. */
struct aa_sk_ctx {
        struct aa_label *label;        /* presumably the socket's own label */
        struct aa_label *peer;        /* presumably the peer's label */
};

/* Raw accessor for the security blob pointer of socket X. */
#define SK_CTX(X) ((X)->sk_security)

/* Typed accessor: returns @sk's sk_security pointer as a struct aa_sk_ctx
 * pointer.  No NULL check is performed on @sk or on the stored pointer.
 */
static inline struct aa_sk_ctx *aa_sock(const struct sock *sk)
{
        return sk->sk_security;
}

/*
 * DEFINE_AUDIT_NET - declare and fill audit data for a network operation.
 *
 * Expands to a declaration of a struct lsm_network_audit named NAME_net
 * (initialised with SK and F), a DEFINE_AUDIT_DATA() for NAME, and three
 * assignments that link NAME to NAME_net and store T and P as the type and
 * protocol.  The audit data type is LSM_AUDIT_DATA_NET when SK is non-NULL
 * and F is not AF_UNIX, and LSM_AUDIT_DATA_NONE otherwise.
 *
 * Because the expansion contains declarations and several statements it
 * has to be used at block scope, and the caller supplies the final
 * semicolon.  SK and F appear more than once in the expansion, so the
 * arguments should be free of side effects.
 */
#define DEFINE_AUDIT_NET(NAME, OP, SK, F, T, P)                                  \
        struct lsm_network_audit NAME ## _net = { .sk = (SK),                  \
                                                  .family = (F)};          \
        DEFINE_AUDIT_DATA(NAME,                                                  \
                          ((SK) && (F) != AF_UNIX) ? LSM_AUDIT_DATA_NET : \
                                                     LSM_AUDIT_DATA_NONE, \
                                                     AA_CLASS_NET,        \
                          OP);                                                  \
        NAME.common.u.net = &(NAME ## _net);                                  \
        NAME.net.type = (T);                                                  \
        NAME.net.protocol = (P)

/* Same as DEFINE_AUDIT_NET(), with family, type and protocol read from SK
 * itself.  SK is dereferenced here, so it must not be NULL.
 */
#define DEFINE_AUDIT_SK(NAME, OP, SK)                                        \
        DEFINE_AUDIT_NET(NAME, OP, SK, (SK)->sk_family, (SK)->sk_type,        \
                         (SK)->sk_protocol)


/*
 * af_select - dispatch on address family.
 *
 * As written here the switch only has a default case, so the statement
 * expression always evaluates DEF_FN and yields its value as an int.
 * FAMILY is evaluated once as the switch operand; FN is not referenced in
 * the expansion at all.
 */
#define af_select(FAMILY, FN, DEF_FN)                \
({                                                \
        int __e;                                \
        switch ((FAMILY)) {                        \
        default:                                \
                __e = DEF_FN;                        \
        }                                        \
        __e;                                        \
})

/* One secmark entry.  NOTE(review): the field meanings below are inferred
 * from the names only; confirm against the code that fills this struct.
 */
struct aa_secmark {
        u8 audit;        /* presumably an audit flag for this entry */
        u8 deny;        /* presumably a deny flag for this entry */
        u32 secid;        /* presumably the secid matching @label */
        char *label;        /* presumably the label name */
};

extern struct aa_sfs_entry aa_sfs_entry_network[];

void audit_net_cb(struct audit_buffer *ab, void *va);
int aa_profile_af_perm(struct aa_profile *profile,
                       struct apparmor_audit_data *ad,
                       u32 request, u16 family, int type);
int aa_af_perm(const struct cred *subj_cred, struct aa_label *label,
               const char *op, u32 request, u16 family,
               int type, int protocol);
/* Convenience wrapper around aa_profile_af_perm() that takes the address
 * family and socket type from @sk.  @sk is dereferenced unconditionally.
 * Returns whatever aa_profile_af_perm() returns.
 */
static inline int aa_profile_af_sk_perm(struct aa_profile *profile,
                                        struct apparmor_audit_data *ad,
                                        u32 request,
                                        struct sock *sk)
{
        return aa_profile_af_perm(profile, ad, request, sk->sk_family,
                                  sk->sk_type);
}
int aa_sk_perm(const char *op, u32 request, struct sock *sk);

int aa_sock_file_perm(const struct cred *subj_cred, struct aa_label *label,
                      const char *op, u32 request,
                      struct socket *sock);

int apparmor_secmark_check(struct aa_label *label, char *op, u32 request,
                           u32 secid, const struct sock *sk);

#endif /* __AA_NET_H */














































    2 
























    5 














































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * Copyright (c) 2021, Google LLC.
 * Pasha Tatashin <pasha.tatashin@soleen.com>
 */
#ifndef __LINUX_PAGE_TABLE_CHECK_H
#define __LINUX_PAGE_TABLE_CHECK_H

#ifdef CONFIG_PAGE_TABLE_CHECK
#include <linux/jump_label.h>

extern struct static_key_true page_table_check_disabled;
extern struct page_ext_operations page_table_check_ops;

void __page_table_check_zero(struct page *page, unsigned int order);
void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte);
void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd);
void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud);
void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte,
                unsigned int nr);
void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd);
void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud);
void __page_table_check_pte_clear_range(struct mm_struct *mm,
                                        unsigned long addr,
                                        pmd_t pmd);

/* Allocation hook: forwards @page and @order to __page_table_check_zero()
 * unless the page_table_check_disabled static branch is taken.
 */
static inline void page_table_check_alloc(struct page *page, unsigned int order)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_zero(page, order);
}

/*
 * page_table_check_free - inline gate in front of __page_table_check_zero()
 * @page:  passed through unchanged
 * @order: passed through unchanged
 *
 * Same worker as page_table_check_alloc(): nothing happens while the
 * page_table_check_disabled static branch evaluates true, otherwise both
 * arguments are forwarded to __page_table_check_zero().
 */
static inline void page_table_check_free(struct page *page, unsigned int order)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_zero(page, order);
}

/*
 * page_table_check_pte_clear - inline gate for __page_table_check_pte_clear()
 * @mm:  passed through unchanged
 * @pte: passed through unchanged (by value)
 *
 * Skipped entirely while the page_table_check_disabled static branch
 * evaluates true.
 */
static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pte_clear(mm, pte);
}

/*
 * page_table_check_pmd_clear - inline gate for __page_table_check_pmd_clear()
 * @mm:  passed through unchanged
 * @pmd: passed through unchanged (by value)
 *
 * Skipped entirely while the page_table_check_disabled static branch
 * evaluates true.
 */
static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pmd_clear(mm, pmd);
}

/*
 * page_table_check_pud_clear - inline gate for __page_table_check_pud_clear()
 * @mm:  passed through unchanged
 * @pud: passed through unchanged (by value)
 *
 * Skipped entirely while the page_table_check_disabled static branch
 * evaluates true.
 */
static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pud_clear(mm, pud);
}

/*
 * page_table_check_ptes_set - inline gate for __page_table_check_ptes_set()
 * @mm:   passed through unchanged
 * @ptep: passed through unchanged
 * @pte:  passed through unchanged (by value)
 * @nr:   passed through unchanged
 *
 * Skipped entirely while the page_table_check_disabled static branch
 * evaluates true.
 */
static inline void page_table_check_ptes_set(struct mm_struct *mm,
                pte_t *ptep, pte_t pte, unsigned int nr)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_ptes_set(mm, ptep, pte, nr);
}

/*
 * page_table_check_pmd_set - inline gate for __page_table_check_pmd_set()
 * @mm:   passed through unchanged
 * @pmdp: passed through unchanged
 * @pmd:  passed through unchanged (by value)
 *
 * Skipped entirely while the page_table_check_disabled static branch
 * evaluates true.
 */
static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp,
                                            pmd_t pmd)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pmd_set(mm, pmdp, pmd);
}

/*
 * page_table_check_pud_set - inline gate for __page_table_check_pud_set()
 * @mm:   passed through unchanged
 * @pudp: passed through unchanged
 * @pud:  passed through unchanged (by value)
 *
 * Skipped entirely while the page_table_check_disabled static branch
 * evaluates true.
 */
static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp,
                                            pud_t pud)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pud_set(mm, pudp, pud);
}

/*
 * page_table_check_pte_clear_range - inline gate for
 * __page_table_check_pte_clear_range()
 * @mm:   passed through unchanged
 * @addr: passed through unchanged
 * @pmd:  passed through unchanged (by value)
 *
 * Skipped entirely while the page_table_check_disabled static branch
 * evaluates true.
 */
static inline void page_table_check_pte_clear_range(struct mm_struct *mm,
                                                    unsigned long addr,
                                                    pmd_t pmd)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pte_clear_range(mm, addr, pmd);
}

#else

/* No-op stub compiled when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_alloc(struct page *page, unsigned int order)
{
}

/* No-op stub compiled when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_free(struct page *page, unsigned int order)
{
}

/* No-op stub compiled when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
{
}

/* No-op stub compiled when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
{
}

/* No-op stub compiled when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
{
}

/* No-op stub compiled when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_ptes_set(struct mm_struct *mm,
                pte_t *ptep, pte_t pte, unsigned int nr)
{
}

/* No-op stub compiled when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp,
                                            pmd_t pmd)
{
}

/* No-op stub compiled when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp,
                                            pud_t pud)
{
}

/* No-op stub compiled when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_pte_clear_range(struct mm_struct *mm,
                                                    unsigned long addr,
                                                    pmd_t pmd)
{
}

#endif /* CONFIG_PAGE_TABLE_CHECK */
#endif /* __LINUX_PAGE_TABLE_CHECK_H */














































































    3 










    3 


    3 








    3 









    3 










    3 





















































    3 
    3 





































































































































































































































    3 












    3 
    3 

    3 



    3 








    3 











    3 




















































































































































































































































































    3 














    3 


    3 

    3 















    3 
    3 











    3 

































































































































    3 






























































    3 












    3 











    3 



    3 








    3 





























    3 





























    3 





    3 




    3 





    3 





    3 






    3 



















    3 






    3 























    3 





















    3 
    3 


    3 





















    2 






















    3 









    3 



    1 
    1 
    1 



    1 





    1 




















































































    3 








    3 
    3 










    3 




    3 

























































































































































































































































































































    1 


































































    2 










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 





    3 













    3 









    3 

















































    3 































    3 
    3 






    3 






    3 










    3 


































































































































    3 


    3 

    3 




    3 



    3 

































































































































    3 






    3 





















































    2 


    3 
































































































































































































































    3 



    3 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/signal.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-11-02  Modified for POSIX.1b signals by Richard Henderson
 *
 *  2003-06-02  Jim Houston - Concurrent Computer Corp.
 *                Changes to use preallocated sigqueue structures
 *                to allow signals to be sent reliably.
 */

#include <linux/slab.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/user.h>
#include <linux/sched/debug.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
#include <linux/coredump.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ptrace.h>
#include <linux/signal.h>
#include <linux/signalfd.h>
#include <linux/ratelimit.h>
#include <linux/task_work.h>
#include <linux/capability.h>
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
#include <linux/uprobes.h>
#include <linux/compat.h>
#include <linux/cn_proc.h>
#include <linux/compiler.h>
#include <linux/posix-timers.h>
#include <linux/cgroup.h>
#include <linux/audit.h>
#include <linux/sysctl.h>
#include <uapi/linux/pidfd.h>

#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>

#include <asm/param.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/siginfo.h>
#include <asm/cacheflush.h>
#include <asm/syscall.h>        /* for syscall_get_* */

/*
 * SLAB cache for signal queue entries: __sigqueue_alloc() allocates
 * from it and __sigqueue_free() returns entries to it.
 */

static struct kmem_cache *sigqueue_cachep;

/*
 * Consulted by print_dropped_signal(): when zero, no message is logged
 * for signals dropped at RLIMIT_SIGPENDING.
 */
int print_fatal_signals __read_mostly;

/*
 * Look up the handler currently installed for @sig in @t's action table.
 * Signal numbers are 1-based while the table is 0-based.
 */
static void __user *sig_handler(struct task_struct *t, int sig)
{
        const int idx = sig - 1;

        return t->sighand->action[idx].sa.sa_handler;
}

/*
 * Return true when @handler causes @sig to be discarded: either the
 * handler is SIG_IGN, or it is SIG_DFL and sig_kernel_ignore() holds
 * for @sig.
 */
static inline bool sig_handler_ignored(void __user *handler, int sig)
{
        /* Explicitly ignored. */
        if (handler == SIG_IGN)
                return true;

        /* Implicitly ignored through the default action. */
        return handler == SIG_DFL && sig_kernel_ignore(sig);
}

/*
 * Decide whether @sig is ignored by task @t, based on the installed
 * handler and the task's special properties (global init, unkillable,
 * kernel thread).  @force relaxes the unkillable and kthread checks.
 */
static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
{
        void __user *handler = sig_handler(t, sig);

        /* SIGKILL and SIGSTOP may not be sent to the global init */
        if (unlikely(is_global_init(t) && sig_kernel_only(sig)))
                return true;

        /*
         * An unkillable task with the default handler ignores the signal,
         * unless it is a forced SIGKILL/SIGSTOP.
         */
        if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE)) {
                if (handler == SIG_DFL && !(force && sig_kernel_only(sig)))
                        return true;
        }

        /* Only allow kernel generated signals to this kthread */
        if (unlikely((t->flags & PF_KTHREAD) &&
                     handler == SIG_KTHREAD_KERNEL && !force))
                return true;

        return sig_handler_ignored(handler, sig);
}

/*
 * Return true if @sig can be dropped for @t at generation time.
 * Blocked signals and (non-SIGKILL) signals to a ptraced task are never
 * considered ignored; everything else is decided by sig_task_ignored().
 */
static bool sig_ignored(struct task_struct *t, int sig, bool force)
{
        bool is_blocked;

        /*
         * Blocked signals are never ignored, since the
         * signal handler may change by the time it is
         * unblocked.
         */
        is_blocked = sigismember(&t->blocked, sig) ||
                     sigismember(&t->real_blocked, sig);
        if (is_blocked)
                return false;

        /*
         * Tracers may want to know about even ignored signal unless it
         * is SIGKILL which can't be reported anyway but can be ignored
         * by SIGNAL_UNKILLABLE task.
         */
        if (sig != SIGKILL && t->ptrace)
                return false;

        return sig_task_ignored(t, sig, force);
}

/*
 * Return true if any bit set in @signal is not also set in @blocked,
 * i.e. at least one signal of the set is deliverable.  The common word
 * counts are spelled out so the compile-time constant _NSIG_WORDS picks
 * a straight-line variant.
 */
static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked)
{
        unsigned long unblocked = 0;
        long w;

        switch (_NSIG_WORDS) {
        case 1:
                unblocked = signal->sig[0] & ~blocked->sig[0];
                break;

        case 2:
                unblocked  = signal->sig[0] & ~blocked->sig[0];
                unblocked |= signal->sig[1] & ~blocked->sig[1];
                break;

        case 4:
                unblocked  = signal->sig[0] & ~blocked->sig[0];
                unblocked |= signal->sig[1] & ~blocked->sig[1];
                unblocked |= signal->sig[2] & ~blocked->sig[2];
                unblocked |= signal->sig[3] & ~blocked->sig[3];
                break;

        default:
                for (w = 0; w < _NSIG_WORDS; w++)
                        unblocked |= signal->sig[w] & ~blocked->sig[w];
                break;
        }

        return unblocked != 0;
}

#define PENDING(p,b) has_pending_signals(&(p)->signal, (b))

/*
 * Set TIF_SIGPENDING on @t if it has any reason to enter signal
 * handling: pending jobctl work, an unblocked private or shared signal,
 * or a frozen cgroup.  Returns true if the flag was set.  The flag is
 * never cleared here.
 */
static bool recalc_sigpending_tsk(struct task_struct *t)
{
        const bool work_pending =
                (t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) ||
                PENDING(&t->pending, &t->blocked) ||
                PENDING(&t->signal->shared_pending, &t->blocked) ||
                cgroup_task_frozen(t);

        /*
         * We must never clear the flag in another thread, or in current
         * when it's possible the current syscall is returning -ERESTART*.
         * So we don't clear it here, and only callers who know they should do.
         */
        if (!work_pending)
                return false;

        set_tsk_thread_flag(t, TIF_SIGPENDING);
        return true;
}

/*
 * Recompute TIF_SIGPENDING for current.  The flag is cleared only when
 * recalc_sigpending_tsk() found nothing pending and current is not
 * freezing.
 */
void recalc_sigpending(void)
{
        if (recalc_sigpending_tsk(current))
                return;

        if (freezing(current))
                return;

        clear_thread_flag(TIF_SIGPENDING);
}
EXPORT_SYMBOL(recalc_sigpending);

/*
 * Re-evaluate TIF_SIGPENDING for current while holding its siglock.
 * The flag is first set unconditionally and recalc_sigpending() then
 * clears it again if nothing turns out to be pending.
 */
void calculate_sigpending(void)
{
        /* Have any signals or users of TIF_SIGPENDING been delayed
         * until after fork?
         */
        spin_lock_irq(&current->sighand->siglock);
        set_tsk_thread_flag(current, TIF_SIGPENDING);
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);
}

/* Given the mask, find the first available signal that should be serviced. */

#define SYNCHRONOUS_MASK \
        (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
         sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))

/*
 * Return the number of the lowest pending signal in @pending that is
 * not set in @mask, or 0 if there is none.  Within the first word the
 * signals of SYNCHRONOUS_MASK take precedence over all others.
 */
int next_signal(struct sigpending *pending, sigset_t *mask)
{
        unsigned long *s = pending->signal.sig;
        unsigned long *m = mask->sig;
        unsigned long x, i;

        /*
         * Handle the first word specially: it contains the
         * synchronous signals that need to be dequeued first.
         */
        x = s[0] & ~m[0];
        if (x) {
                if (x & SYNCHRONOUS_MASK)
                        x &= SYNCHRONOUS_MASK;
                return ffz(~x) + 1;
        }

        switch (_NSIG_WORDS) {
        case 1:
                /* Nothing to do */
                break;

        case 2:
                x = s[1] & ~m[1];
                if (x)
                        return ffz(~x) + _NSIG_BPW + 1;
                break;

        default:
                for (i = 1; i < _NSIG_WORDS; i++) {
                        x = s[i] & ~m[i];
                        if (x)
                                return ffz(~x) + i * _NSIG_BPW + 1;
                }
                break;
        }

        return 0;
}

/*
 * Log that @sig was dropped for current because RLIMIT_SIGPENDING was
 * reached.  Silent unless print_fatal_signals is set; output is
 * rate-limited.
 */
static inline void print_dropped_signal(int sig)
{
        static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);

        if (print_fatal_signals && __ratelimit(&ratelimit_state))
                pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
                        current->comm, current->pid, sig);
}

/**
 * task_set_jobctl_pending - set jobctl pending bits
 * @task: target task
 * @mask: pending bits to set
 *
 * Set @mask in @task->jobctl.  @mask must be subset of
 * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
 * %JOBCTL_TRAPPING, and %JOBCTL_TRAPPING may only be passed together
 * with at least one %JOBCTL_PENDING_MASK bit.  If stop signo is being
 * set, the existing signo is cleared first.  If @task is already being
 * killed or exiting, this function becomes noop.
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 *
 * RETURNS:
 * %true if @mask is set, %false if made noop because @task was dying.
 */
bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
{
        const unsigned long allowed = JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
                                      JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING;
        bool dying;

        BUG_ON(mask & ~allowed);
        BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));

        dying = fatal_signal_pending(task) || (task->flags & PF_EXITING);
        if (unlikely(dying))
                return false;

        /* A new stop signo replaces the old one instead of being OR-ed in. */
        if (mask & JOBCTL_STOP_SIGMASK)
                task->jobctl &= ~JOBCTL_STOP_SIGMASK;

        task->jobctl |= mask;
        return true;
}

/**
 * task_clear_jobctl_trapping - clear jobctl trapping bit
 * @task: target task
 *
 * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED.
 * Clear it and wake up the ptracer.  Note that we don't need any further
 * locking.  @task->siglock guarantees that @task->parent points to the
 * ptracer.
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 */
void task_clear_jobctl_trapping(struct task_struct *task)
{
        /* Common case: nobody is waiting on the trapping bit. */
        if (likely(!(task->jobctl & JOBCTL_TRAPPING)))
                return;

        task->jobctl &= ~JOBCTL_TRAPPING;
        smp_mb();        /* advised by wake_up_bit() */
        wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
}

/**
 * task_clear_jobctl_pending - clear jobctl pending bits
 * @task: target task
 * @mask: pending bits to clear
 *
 * Clear @mask from @task->jobctl.  @mask must be subset of
 * %JOBCTL_PENDING_MASK.  If %JOBCTL_STOP_PENDING is being cleared,
 * %JOBCTL_STOP_CONSUME and %JOBCTL_STOP_DEQUEUED are cleared together
 * with it.
 *
 * If clearing of @mask leaves no stop or trap pending, this function calls
 * task_clear_jobctl_trapping().
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 */
void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
{
        unsigned long to_clear = mask;

        BUG_ON(mask & ~JOBCTL_PENDING_MASK);

        if (to_clear & JOBCTL_STOP_PENDING)
                to_clear |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;

        task->jobctl &= ~to_clear;

        /* Something is still pending: keep the trapping state as it is. */
        if (task->jobctl & JOBCTL_PENDING_MASK)
                return;

        task_clear_jobctl_trapping(task);
}

/**
 * task_participate_group_stop - participate in a group stop
 * @task: task participating in a group stop
 *
 * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
 * Group stop states are cleared and the group stop count is consumed if
 * %JOBCTL_STOP_CONSUME was set.  If the consumption completes the group
 * stop, %SIGNAL_STOP_STOPPED is set on the signal struct.
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 *
 * RETURNS:
 * %true if group stop completion should be notified to the parent, %false
 * otherwise.
 */
static bool task_participate_group_stop(struct task_struct *task)
{
        struct signal_struct *sig = task->signal;
        /* Sample before clearing: STOP_PENDING takes STOP_CONSUME with it. */
        bool consume = task->jobctl & JOBCTL_STOP_CONSUME;

        WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));

        task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);

        if (!consume)
                return false;

        if (!WARN_ON_ONCE(sig->group_stop_count == 0))
                sig->group_stop_count--;

        /*
         * Tell the caller to notify completion iff we are entering into a
         * fresh group stop.  Read comment in do_signal_stop() for details.
         */
        if (sig->group_stop_count || (sig->flags & SIGNAL_STOP_STOPPED))
                return false;

        signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED);
        return true;
}

/*
 * Make @task take part in the group stop state of current's thread
 * group.  Does nothing when no group stop is in progress and the group
 * is not already stopped.  The stop signo is inherited from current.
 */
void task_join_group_stop(struct task_struct *task)
{
        struct signal_struct *sig = current->signal;
        unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK;

        if (!sig->group_stop_count) {
                /* No stop in progress and not stopped: nothing to join. */
                if (!(sig->flags & SIGNAL_STOP_STOPPED))
                        return;
        } else {
                /* A stop is in progress: the new thread must be counted. */
                sig->group_stop_count++;
                mask |= JOBCTL_STOP_CONSUME;
        }

        /* Have the new thread join an on-going signal group stop */
        task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING);
}

/*
 * allocate a new signal queue record
 * - this may be called without locks if and only if t == current, otherwise an
 *   appropriate lock must be held to stop the target task from exiting
 *
 * Returns the initialised entry, or NULL if the pending-signal count
 * could not be taken, the rlimit was exceeded (and not overridden), or
 * the allocation failed.
 */
static struct sigqueue *
__sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
                 int override_rlimit, const unsigned int sigqueue_flags)
{
        struct sigqueue *q = NULL;
        struct ucounts *ucounts;
        long sigpending;

        /*
         * Protect access to @t credentials. This can go away when all
         * callers hold rcu read lock.
         *
         * NOTE! A pending signal will hold on to the user refcount,
         * and we get/put the refcount only when the sigpending count
         * changes from/to zero.
         */
        rcu_read_lock();
        ucounts = task_ucounts(t);
        sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
        rcu_read_unlock();
        if (!sigpending)
                return NULL;

        if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING)))
                q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
        else
                print_dropped_signal(sig);

        /* Over the limit or out of memory: give the count back. */
        if (unlikely(!q)) {
                dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
                return NULL;
        }

        INIT_LIST_HEAD(&q->list);
        q->flags = sigqueue_flags;
        q->ucounts = ucounts;
        return q;
}

/*
 * Release a queue entry: drop its pending-signal count (if it holds
 * one) and return it to the sigqueue cache.  Entries flagged
 * SIGQUEUE_PREALLOC are left untouched.
 */
static void __sigqueue_free(struct sigqueue *q)
{
        struct ucounts *uc;

        if (q->flags & SIGQUEUE_PREALLOC)
                return;

        uc = q->ucounts;
        if (uc) {
                dec_rlimit_put_ucounts(uc, UCOUNT_RLIMIT_SIGPENDING);
                q->ucounts = NULL;
        }

        kmem_cache_free(sigqueue_cachep, q);
}

/*
 * Empty @queue: clear its pending bitmap, then unlink every queued
 * entry and hand it to __sigqueue_free().
 */
void flush_sigqueue(struct sigpending *queue)
{
        sigemptyset(&queue->signal);

        for (;;) {
                struct sigqueue *entry;

                if (list_empty(&queue->list))
                        break;

                entry = list_entry(queue->list.next, struct sigqueue, list);
                list_del_init(&entry->list);
                __sigqueue_free(entry);
        }
}

/*
 * Flush all pending signals for this kthread.
 *
 * Under @t's siglock (irq-safe), clears TIF_SIGPENDING on @t and empties
 * both its private queue and the shared queue of its thread group.
 */
void flush_signals(struct task_struct *t)
{
        unsigned long flags;

        spin_lock_irqsave(&t->sighand->siglock, flags);
        clear_tsk_thread_flag(t, TIF_SIGPENDING);
        flush_sigqueue(&t->pending);
        flush_sigqueue(&t->signal->shared_pending);
        spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
EXPORT_SYMBOL(flush_signals);

#ifdef CONFIG_POSIX_TIMERS
/*
 * Drop every queued entry of @pending whose si_code is SI_TIMER and
 * rebuild the pending bitmap.  A signal's bit stays set if a non-timer
 * entry for it remains queued, or if no timer entry for it was removed.
 */
static void __flush_itimer_signals(struct sigpending *pending)
{
        struct sigqueue *entry, *tmp;
        sigset_t remaining = pending->signal;
        sigset_t retain;

        sigemptyset(&retain);

        list_for_each_entry_safe(entry, tmp, &pending->list, list) {
                int sig = entry->info.si_signo;

                if (unlikely(entry->info.si_code == SI_TIMER)) {
                        sigdelset(&remaining, sig);
                        list_del_init(&entry->list);
                        __sigqueue_free(entry);
                        continue;
                }

                /* Non-timer entry: its signal must stay pending. */
                sigaddset(&retain, sig);
        }

        sigorsets(&pending->signal, &remaining, &retain);
}

/*
 * Remove SI_TIMER entries from both the private and the shared pending
 * queue of current, holding current's siglock (irq-safe) throughout.
 */
void flush_itimer_signals(void)
{
        struct task_struct *tsk = current;
        unsigned long flags;

        spin_lock_irqsave(&tsk->sighand->siglock, flags);
        __flush_itimer_signals(&tsk->pending);
        __flush_itimer_signals(&tsk->signal->shared_pending);
        spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
}
#endif

/*
 * Install SIG_IGN for every signal of @t's action table and then flush
 * whatever is already pending for @t.
 */
void ignore_signals(struct task_struct *t)
{
        int i = 0;

        while (i < _NSIG) {
                t->sighand->action[i].sa.sa_handler = SIG_IGN;
                i++;
        }

        flush_signals(t);
}

/*
 * Flush all handlers for a task.
 *
 * Every action gets its flags, mask and (where the architecture has
 * one) restorer cleared.  The handler is reset to SIG_DFL, except that
 * SIG_IGN is preserved unless @force_default is non-zero.
 */

void
flush_signal_handlers(struct task_struct *t, int force_default)
{
        struct k_sigaction *ka;
        int i;

        for (i = 0; i < _NSIG; i++) {
                ka = &t->sighand->action[i];

                if (force_default || ka->sa.sa_handler != SIG_IGN)
                        ka->sa.sa_handler = SIG_DFL;
                ka->sa.sa_flags = 0;
#ifdef __ARCH_HAS_SA_RESTORER
                ka->sa.sa_restorer = NULL;
#endif
                sigemptyset(&ka->sa.sa_mask);
        }
}

/*
 * Return true if @sig is not handled by @tsk: the task is the global
 * init, or it has no user handler installed (SIG_IGN/SIG_DFL), is not
 * already dying and is not being ptraced.
 */
bool unhandled_signal(struct task_struct *tsk, int sig)
{
        void __user *handler = tsk->sighand->action[sig - 1].sa.sa_handler;
        bool has_user_handler;

        if (is_global_init(tsk))
                return true;

        has_user_handler = !(handler == SIG_IGN || handler == SIG_DFL);
        if (has_user_handler)
                return false;

        /* If dying, we handle all new signals by ignoring them */
        if (fatal_signal_pending(tsk))
                return false;

        /* if ptraced, let the tracer determine */
        return !tsk->ptrace;
}

/*
 * Take one instance of @sig off @list and describe it in @info.
 *
 * The first queued entry for @sig is unlinked, copied to @info and
 * freed.  The pending bit is cleared unless a second entry for the same
 * signal is queued.  If no entry is queued at all, @info is synthesised
 * as an SI_USER signal and *@resched_timer is left unmodified.
 */
static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *info,
                           bool *resched_timer)
{
        struct sigqueue *q, *first = NULL;
        bool more_queued = false;

        /*
         * Collect the siginfo appropriate to this signal.  Check if
         * there is another siginfo for the same signal.
         */
        list_for_each_entry(q, &list->list, list) {
                if (q->info.si_signo != sig)
                        continue;
                if (first) {
                        more_queued = true;
                        break;
                }
                first = q;
        }

        /* Only the last queued instance takes the pending bit with it. */
        if (!more_queued)
                sigdelset(&list->signal, sig);

        if (!first) {
                /*
                 * Ok, it wasn't in the queue.  This must be
                 * a fast-pathed signal or we must have been
                 * out of queue space.  So zero out the info.
                 */
                clear_siginfo(info);
                info->si_signo = sig;
                info->si_errno = 0;
                info->si_code = SI_USER;
                info->si_pid = 0;
                info->si_uid = 0;
                return;
        }

        list_del_init(&first->list);
        copy_siginfo(info, &first->info);

        *resched_timer = (first->flags & SIGQUEUE_PREALLOC) &&
                         (info->si_code == SI_TIMER) &&
                         (info->si_sys_private);

        __sigqueue_free(first);
}

/*
 * Pick the next signal of @pending that is not in @mask and collect its
 * siginfo into @info.  Returns the signal number, or 0 if nothing is
 * deliverable (in which case @info and @resched_timer are untouched).
 */
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
                        kernel_siginfo_t *info, bool *resched_timer)
{
        int sig = next_signal(pending, mask);

        if (!sig)
                return 0;

        collect_signal(sig, pending, info, resched_timer);
        return sig;
}

/*
 * Dequeue the next signal for @tsk that is not in @mask and describe it
 * in @info.  The private queue is tried first, then the shared queue of
 * the thread group; *@type is PIDTYPE_PID when the signal came from the
 * private queue and PIDTYPE_TGID otherwise.
 *
 * Returns the signal number, or 0 if no signal was dequeued.
 *
 * All callers have to hold the siglock.  Note that the siglock may be
 * dropped and re-taken internally when a posix timer has to be rearmed.
 */
int dequeue_signal(struct task_struct *tsk, sigset_t *mask,
                   kernel_siginfo_t *info, enum pid_type *type)
{
        bool resched_timer = false;
        int signr;

        /* We only dequeue private signals from ourselves, we don't let
         * signalfd steal them
         */
        *type = PIDTYPE_PID;
        signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
        if (!signr) {
                *type = PIDTYPE_TGID;
                signr = __dequeue_signal(&tsk->signal->shared_pending,
                                         mask, info, &resched_timer);
#ifdef CONFIG_POSIX_TIMERS
                /*
                 * itimer signal ?
                 *
                 * itimers are process shared and we restart periodic
                 * itimers in the signal delivery path to prevent DoS
                 * attacks in the high resolution timer case. This is
                 * compliant with the old way of self-restarting
                 * itimers, as the SIGALRM is a legacy signal and only
                 * queued once. Changing the restart behaviour to
                 * restart the timer in the signal dequeue path is
                 * reducing the timer noise on heavy loaded !highres
                 * systems too.
                 */
                if (unlikely(signr == SIGALRM)) {
                        struct hrtimer *tmr = &tsk->signal->real_timer;

                        /* Only a periodic timer that is not queued is restarted. */
                        if (!hrtimer_is_queued(tmr) &&
                            tsk->signal->it_real_incr != 0) {
                                hrtimer_forward(tmr, tmr->base->get_time(),
                                                tsk->signal->it_real_incr);
                                hrtimer_restart(tmr);
                        }
                }
#endif
        }

        /* Operates on current, which is not necessarily @tsk. */
        recalc_sigpending();
        if (!signr)
                return 0;

        if (unlikely(sig_kernel_stop(signr))) {
                /*
                 * Set a marker that we have dequeued a stop signal.  Our
                 * caller might release the siglock and then the pending
                 * stop signal it is about to process is no longer in the
                 * pending bitmasks, but must still be cleared by a SIGCONT
                 * (and overruled by a SIGKILL).  So those cases clear this
                 * shared flag after we've set it.  Note that this flag may
                 * remain set after the signal we return is ignored or
                 * handled.  That doesn't matter because its only purpose
                 * is to alert stop-signal processing code when another
                 * processor has come along and cleared the flag.
                 */
                current->jobctl |= JOBCTL_STOP_DEQUEUED;
        }
#ifdef CONFIG_POSIX_TIMERS
        if (resched_timer) {
                /*
                 * Release the siglock to ensure proper locking order
                 * of timer locks outside of siglocks.  Note, we leave
                 * irqs disabled here, since the posix-timers code is
                 * about to disable them again anyway.
                 */
                spin_unlock(&tsk->sighand->siglock);
                posixtimer_rearm(info);
                spin_lock(&tsk->sighand->siglock);

                /* Don't expose the si_sys_private value to userspace */
                info->si_sys_private = 0;
        }
#endif
        return signr;
}
EXPORT_SYMBOL_GPL(dequeue_signal);

/*
 * Dequeue the first synchronous signal from current's private queue.
 *
 * A queued entry qualifies when its si_code is greater than SI_USER and
 * its signal is part of SYNCHRONOUS_MASK.  The entry is unlinked, copied
 * to @info and freed.  The pending bit is cleared only if no later
 * entry for the same signal is queued.
 *
 * Returns the signal number, or 0 if no such entry exists.
 */
static int dequeue_synchronous_signal(kernel_siginfo_t *info)
{
        struct task_struct *tsk = current;
        struct sigpending *pending = &tsk->pending;
        struct sigqueue *q, *sync = NULL;

        /*
         * Might a synchronous signal be in the queue?
         */
        if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK))
                return 0;

        /*
         * Return the first synchronous signal in the queue.
         */
        list_for_each_entry(q, &pending->list, list) {
                /* Synchronous signals have a positive si_code */
                if ((q->info.si_code > SI_USER) &&
                    (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) {
                        sync = q;
                        goto next;
                }
        }
        return 0;
next:
        /*
         * Check if there is another siginfo for the same signal.
         * The walk continues after @sync, so only later entries count.
         */
        list_for_each_entry_continue(q, &pending->list, list) {
                if (q->info.si_signo == sync->info.si_signo)
                        goto still_pending;
        }

        /* Last instance: drop the pending bit and refresh TIF_SIGPENDING. */
        sigdelset(&pending->signal, sync->info.si_signo);
        recalc_sigpending();
still_pending:
        list_del_init(&sync->list);
        copy_siginfo(info, &sync->info);
        __sigqueue_free(sync);
        return info->si_signo;
}

/*
 * Tell a process that it has a new active signal.
 *
 * @t:     task to notify
 * @state: additional task states to wake from; TASK_INTERRUPTIBLE is
 *         always included
 *
 * NOTE! we rely on the previous spin_lock to
 * lock interrupts for us! We can only be called with
 * "siglock" held, and the local interrupt must
 * have been disabled when that got acquired!
 *
 * No need to set need_resched since signal event passing
 * goes through ->blocked
 */
void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
        lockdep_assert_held(&t->sighand->siglock);

        set_tsk_thread_flag(t, TIF_SIGPENDING);

        /*
         * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
         * case. We don't check t->state here because there is a race with it
         * executing on another processor and just now entering stopped state.
         * By using wake_up_state, we ensure the process will wake up and
         * handle its death signal.
         *
         * If no wakeup happened, kick_process() is called on @t instead.
         */
        if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
                kick_process(t);
}

/*
 * Remove signals in mask from the pending set and queue.
 * Nothing is touched when none of them is currently pending.
 *
 * All callers must be holding the siglock.
 */
static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
{
        struct sigqueue *entry, *tmp;
        sigset_t hit;

        /* Bail out early when no signal of @mask is pending. */
        sigandsets(&hit, mask, &s->signal);
        if (sigisemptyset(&hit))
                return;

        sigandnsets(&s->signal, &s->signal, mask);
        list_for_each_entry_safe(entry, tmp, &s->list, list) {
                if (!sigismember(mask, entry->info.si_signo))
                        continue;

                list_del_init(&entry->list);
                __sigqueue_free(entry);
        }
}

/*
 * Non-zero if @info is one of the special marker pointers, i.e. it
 * compares less than or equal to SEND_SIG_PRIV, rather than a pointer
 * to a real siginfo.
 */
static inline int is_si_special(const struct kernel_siginfo *info)
{
        if (info <= SEND_SIG_PRIV)
                return 1;

        return 0;
}

static inline bool si_fromuser(const struct kernel_siginfo *info)
{
        return info == SEND_SIG_NOINFO ||
                (!is_si_special(info) && SI_FROMUSER(info));
}

/*
 * called with RCU read lock from check_kill_permission()
 */
static bool kill_ok_by_cred(struct task_struct *t)
{
        const struct cred *cred = current_cred();
        const struct cred *tcred = __task_cred(t);

        return uid_eq(cred->euid, tcred->suid) ||
               uid_eq(cred->euid, tcred->uid) ||
               uid_eq(cred->uid, tcred->suid) ||
               uid_eq(cred->uid, tcred->uid) ||
               ns_capable(tcred->user_ns, CAP_KILL);
}

/*
 * Bad permissions for sending the signal
 * - the caller must hold the RCU read lock
 *
 * Returns 0 if current may send @sig to @t, -EINVAL for an invalid
 * signal number, -EPERM if the credentials do not allow it, or the
 * error returned by the audit or security hooks.
 */
static int check_kill_permission(int sig, struct kernel_siginfo *info,
                                 struct task_struct *t)
{
        struct pid *sid;
        int error;

        if (!valid_signal(sig))
                return -EINVAL;

        /* Signals not originating from user space skip all checks below. */
        if (!si_fromuser(info))
                return 0;

        error = audit_signal_info(sig, t); /* Let audit system see the signal */
        if (error)
                return error;

        if (!same_thread_group(current, t) && !kill_ok_by_cred(t)) {
                /* Without credentials only SIGCONT within a session may pass. */
                if (sig != SIGCONT)
                        return -EPERM;

                sid = task_session(t);
                /*
                 * We don't return the error if sid == NULL. The
                 * task was unhashed, the caller must notice this.
                 */
                if (sid && sid != task_session(current))
                        return -EPERM;
        }

        return security_task_kill(t, info, sig, NULL);
}

/**
 * ptrace_trap_notify - schedule trap to notify ptracer
 * @t: tracee wanting to notify tracer
 *
 * This function schedules sticky ptrace trap which is cleared on the next
 * TRAP_STOP to notify ptracer of an event.  @t must have been seized by
 * ptracer.
 *
 * If @t is running, STOP trap will be taken.  If trapped for STOP and
 * ptracer is listening for events, tracee is woken up so that it can
 * re-trap for the new event.  If trapped otherwise, STOP trap will be
 * eventually taken without returning to userland after the existing traps
 * are finished by PTRACE_CONT.
 *
 * CONTEXT:
 * Must be called with @t->sighand->siglock held.
 */
static void ptrace_trap_notify(struct task_struct *t)
{
        WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
        lockdep_assert_held(&t->sighand->siglock);

        /* Record the pending notification, then wake the tracee. */
        task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
        ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
}

/*
 * Handle magic process-wide effects of stop/continue signals. Unlike
 * the signal actions, these happen immediately at signal-generation
 * time regardless of blocking, ignoring, or handling.  This does the
 * actual continuing for SIGCONT, but not the actual stopping for stop
 * signals. The process stop is done as a signal action for SIG_DFL.
 *
 * @sig:   signal being generated
 * @p:     target task; the effects apply to its whole thread group
 * @force: passed through to sig_ignored()
 *
 * Returns true if the signal should be actually delivered, otherwise
 * it should be dropped.
 */
static bool prepare_signal(int sig, struct task_struct *p, bool force)
{
        struct signal_struct *signal = p->signal;
        struct task_struct *t;
        sigset_t flush;

        if (signal->flags & SIGNAL_GROUP_EXIT) {
                /* With core_state set, only SIGKILL is let through. */
                if (signal->core_state)
                        return sig == SIGKILL;
                /*
                 * The process is in the middle of dying, drop the signal.
                 */
                return false;
        } else if (sig_kernel_stop(sig)) {
                /*
                 * This is a stop signal.  Remove SIGCONT from all queues.
                 */
                siginitset(&flush, sigmask(SIGCONT));
                flush_sigqueue_mask(&flush, &signal->shared_pending);
                for_each_thread(p, t)
                        flush_sigqueue_mask(&flush, &t->pending);
        } else if (sig == SIGCONT) {
                unsigned int why;
                /*
                 * Remove all stop signals from all queues, wake all threads.
                 */
                siginitset(&flush, SIG_KERNEL_STOP_MASK);
                flush_sigqueue_mask(&flush, &signal->shared_pending);
                for_each_thread(p, t) {
                        flush_sigqueue_mask(&flush, &t->pending);
                        task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
                        /* Seized tracees get a trap notification instead of a wakeup. */
                        if (likely(!(t->ptrace & PT_SEIZED))) {
                                t->jobctl &= ~JOBCTL_STOPPED;
                                wake_up_state(t, __TASK_STOPPED);
                        } else
                                ptrace_trap_notify(t);
                }

                /*
                 * Notify the parent with CLD_CONTINUED if we were stopped.
                 *
                 * If we were in the middle of a group stop, we pretend it
                 * was already finished, and then continued. Since SIGCHLD
                 * doesn't queue we report only CLD_STOPPED, as if the next
                 * CLD_CONTINUED was dropped.
                 */
                why = 0;
                if (signal->flags & SIGNAL_STOP_STOPPED)
                        why |= SIGNAL_CLD_CONTINUED;
                else if (signal->group_stop_count)
                        why |= SIGNAL_CLD_STOPPED;

                if (why) {
                        /*
                         * The first thread which returns from do_signal_stop()
                         * will take ->siglock, notice SIGNAL_CLD_MASK, and
                         * notify its parent. See get_signal().
                         */
                        signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED);
                        signal->group_stop_count = 0;
                        signal->group_exit_code = 0;
                }
        }

        return !sig_ignored(p, sig, force);
}

/*
 * Test if P wants to take SIG.  After we've checked all threads with this,
 * it's equivalent to finding no threads not blocking SIG.  Any threads not
 * blocking SIG were ruled out because they are not running and already
 * have pending signals.  Such threads will dequeue from the shared queue
 * as soon as they're available, so putting the signal on the shared queue
 * will be equivalent to sending it to one such thread.
 */
static inline bool wants_signal(int sig, struct task_struct *p)
{
        /* A thread that blocks SIG or is exiting never takes it. */
        if (sigismember(&p->blocked, sig) || (p->flags & PF_EXITING))
                return false;

        /* SIGKILL is taken regardless of the remaining checks. */
        if (sig == SIGKILL)
                return true;

        if (task_is_stopped_or_traced(p))
                return false;

        /* Prefer a thread that is running or has nothing pending yet. */
        return task_curr(p) || !task_sigpending(p);
}

/*
 * Choose a thread to wake up for @sig, which the caller has already marked
 * pending, and start a group exit straight away when the signal is going to
 * be fatal to the whole thread group.
 *
 * @sig:  signal number that was just made pending
 * @p:    task the signal was sent to; it is tried first as the wake-up target
 * @type: PIDTYPE_PID restricts the choice to @p itself; any other value
 *        allows falling back to another thread of the group
 *
 * Returns without waking anybody when no thread wants the signal (see
 * wants_signal()).
 */
static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
{
        struct signal_struct *signal = p->signal;
        struct task_struct *t;

        /*
         * Now find a thread we can wake up to take the signal off the queue.
         *
         * Try the suggested task first (may or may not be the main thread).
         */
        if (wants_signal(sig, p))
                t = p;
        else if ((type == PIDTYPE_PID) || thread_group_empty(p))
                /*
                 * There is just one thread and it does not need to be woken.
                 * It will dequeue unblocked signals before it runs again.
                 */
                return;
        else {
                /*
                 * Otherwise try to find a suitable thread.
                 */
                t = signal->curr_target;
                while (!wants_signal(sig, t)) {
                        t = next_thread(t);
                        if (t == signal->curr_target)
                                /*
                                 * No thread needs to be woken.
                                 * Any eligible threads will see
                                 * the signal in the queue soon.
                                 */
                                return;
                }
                /* Remember the pick: the next search starts from here. */
                signal->curr_target = t;
        }

        /*
         * Found a killable thread.  If the signal will be fatal,
         * then start taking the whole group down immediately.
         */
        if (sig_fatal(p, sig) &&
            (signal->core_state || !(signal->flags & SIGNAL_GROUP_EXIT)) &&
            !sigismember(&t->real_blocked, sig) &&
            (sig == SIGKILL || !p->ptrace)) {
                /*
                 * This signal will be fatal to the whole group.
                 *
                 * Signals for which sig_kernel_coredump() is true are not
                 * handled here; they fall through to the ordinary wake-up
                 * at the end of the function.
                 */
                if (!sig_kernel_coredump(sig)) {
                        /*
                         * Start a group exit and wake everybody up.
                         * This way we don't have other threads
                         * running and doing things after a slower
                         * thread has the fatal signal pending.
                         */
                        signal->flags = SIGNAL_GROUP_EXIT;
                        signal->group_exit_code = sig;
                        signal->group_stop_count = 0;
                        /*
                         * t is reused as the iterator: the chosen target is
                         * not needed any more because we return right after.
                         */
                        __for_each_thread(signal, t) {
                                task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
                                sigaddset(&t->pending.signal, SIGKILL);
                                signal_wake_up(t, 1);
                        }
                        return;
                }
        }

        /*
         * The signal is already in the shared-pending queue.
         * Tell the chosen thread to wake up and dequeue it.
         */
        signal_wake_up(t, sig == SIGKILL);
        return;
}

static inline bool legacy_queue(struct sigpending *signals, int sig)
{
        return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}

/*
 * Make @sig pending for @t and wake up a thread to handle it.  The caller
 * must hold t->sighand->siglock.
 *
 * @sig:   signal number
 * @info:  SEND_SIG_NOINFO, SEND_SIG_PRIV, or a pointer to the siginfo that
 *         is copied into the queued entry
 * @t:     target task
 * @type:  PIDTYPE_PID queues on t->pending, any other value queues on
 *         t->signal->shared_pending
 * @force: handed to prepare_signal()
 *
 * Returns 0, or -EAGAIN when no sigqueue entry could be allocated for a
 * signal >= SIGRTMIN that carries a real siginfo whose si_code is not
 * SI_USER.  Note that 0 is also returned when prepare_signal() rejects the
 * signal or when a legacy signal is already pending; the outcome is only
 * reported through trace_signal_generate().
 */
static int __send_signal_locked(int sig, struct kernel_siginfo *info,
                                struct task_struct *t, enum pid_type type, bool force)
{
        struct sigpending *pending;
        struct sigqueue *q;
        int override_rlimit;
        int ret = 0, result;

        lockdep_assert_held(&t->sighand->siglock);

        /* result is set before each exit so that the tracepoint sees why */
        result = TRACE_SIGNAL_IGNORED;
        if (!prepare_signal(sig, t, force))
                goto ret;

        pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
        /*
         * Short-circuit ignored signals and support queuing
         * exactly one non-rt signal, so that we can get more
         * detailed information about the cause of the signal.
         */
        result = TRACE_SIGNAL_ALREADY_PENDING;
        if (legacy_queue(pending, sig))
                goto ret;

        result = TRACE_SIGNAL_DELIVERED;
        /*
         * Skip useless siginfo allocation for SIGKILL and kernel threads.
         */
        if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
                goto out_set;

        /*
         * Real-time signals must be queued if sent by sigqueue, or
         * some other real-time mechanism.  It is implementation
         * defined whether kill() does so.  We attempt to do so, on
         * the principle of least surprise, but since kill is not
         * allowed to fail with EAGAIN when low on memory we just
         * make sure at least one signal gets delivered and don't
         * pass on the info struct.
         */
        if (sig < SIGRTMIN)
                override_rlimit = (is_si_special(info) || info->si_code >= 0);
        else
                override_rlimit = 0;

        q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit, 0);

        if (q) {
                list_add_tail(&q->list, &pending->list);
                /* Fill in the queued siginfo according to what the caller passed */
                switch ((unsigned long) info) {
                case (unsigned long) SEND_SIG_NOINFO:
                        /* No siginfo supplied: describe current as the sender */
                        clear_siginfo(&q->info);
                        q->info.si_signo = sig;
                        q->info.si_errno = 0;
                        q->info.si_code = SI_USER;
                        q->info.si_pid = task_tgid_nr_ns(current,
                                                        task_active_pid_ns(t));
                        rcu_read_lock();
                        q->info.si_uid =
                                from_kuid_munged(task_cred_xxx(t, user_ns),
                                                 current_uid());
                        rcu_read_unlock();
                        break;
                case (unsigned long) SEND_SIG_PRIV:
                        /* Kernel-originated: SI_KERNEL with pid and uid 0 */
                        clear_siginfo(&q->info);
                        q->info.si_signo = sig;
                        q->info.si_errno = 0;
                        q->info.si_code = SI_KERNEL;
                        q->info.si_pid = 0;
                        q->info.si_uid = 0;
                        break;
                default:
                        copy_siginfo(&q->info, info);
                        break;
                }
        } else if (!is_si_special(info) &&
                   sig >= SIGRTMIN && info->si_code != SI_USER) {
                /*
                 * Queue overflow, abort.  We may abort if the
                 * signal was rt and sent by user using something
                 * other than kill().
                 */
                result = TRACE_SIGNAL_OVERFLOW_FAIL;
                ret = -EAGAIN;
                goto ret;
        } else {
                /*
                 * This is a silent loss of information.  We still
                 * send the signal, but the *info bits are lost.
                 */
                result = TRACE_SIGNAL_LOSE_INFO;
        }

out_set:
        signalfd_notify(t, sig);
        sigaddset(&pending->signal, sig);

        /* Let multiprocess signals appear after on-going forks */
        if (type > PIDTYPE_TGID) {
                struct multiprocess_signals *delayed;
                hlist_for_each_entry(delayed, &t->signal->multiprocess, node) {
                        sigset_t *signal = &delayed->signal;
                        /* Can't queue both a stop and a continue signal */
                        if (sig == SIGCONT)
                                sigdelsetmask(signal, SIG_KERNEL_STOP_MASK);
                        else if (sig_kernel_stop(sig))
                                sigdelset(signal, SIGCONT);
                        sigaddset(signal, sig);
                }
        }

        complete_signal(sig, t, type);
ret:
        /* The fourth argument tells the tracepoint whether the shared queue was used */
        trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result);
        return ret;
}

/*
 * Report whether the siginfo layout selected by @info's si_signo and si_code
 * is one of SIL_KILL, SIL_CHLD or SIL_RT.  send_signal_locked() uses this to
 * decide whether it may touch si_pid and si_uid.
 *
 * Every layout is listed explicitly and there is deliberately no default
 * label, matching the original switch.
 */
static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
{
        switch (siginfo_layout(info->si_signo, info->si_code)) {
        case SIL_KILL:
        case SIL_CHLD:
        case SIL_RT:
                return true;
        case SIL_TIMER:
        case SIL_POLL:
        case SIL_FAULT:
        case SIL_FAULT_TRAPNO:
        case SIL_FAULT_MCEERR:
        case SIL_FAULT_BNDERR:
        case SIL_FAULT_PKUERR:
        case SIL_FAULT_PERF_EVENT:
        case SIL_SYS:
                return false;
        }

        /* Any value not listed above */
        return false;
}

/*
 * Work out whether @sig must be forced, fix up the sender identity stored in
 * @info for the target's namespaces, and hand over to __send_signal_locked().
 *
 * @sig:  signal number
 * @info: SEND_SIG_NOINFO, SEND_SIG_PRIV, or a siginfo; a real siginfo may
 *        have its si_uid and si_pid rewritten in place
 * @t:    target task
 * @type: passed through to __send_signal_locked()
 *
 * Returns the result of __send_signal_locked().
 */
int send_signal_locked(int sig, struct kernel_siginfo *info,
                       struct task_struct *t, enum pid_type type)
{
        /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */
        bool force = false;

        if (info == SEND_SIG_PRIV) {
                /* Don't ignore kernel generated signals */
                force = true;
        } else if (info == SEND_SIG_NOINFO) {
                /* Force if sent from an ancestor pid namespace */
                force = !task_pid_nr_ns(current, task_active_pid_ns(t));
        } else if (has_si_pid_and_uid(info)) {
                /* This layout carries si_pid and si_uid */
                struct user_namespace *t_user_ns;

                /* Express si_uid in the user namespace of the target */
                rcu_read_lock();
                t_user_ns = task_cred_xxx(t, user_ns);
                if (current_user_ns() != t_user_ns) {
                        kuid_t uid = make_kuid(current_user_ns(), info->si_uid);

                        info->si_uid = from_kuid_munged(t_user_ns, uid);
                }
                rcu_read_unlock();

                if (!task_pid_nr_ns(current, task_active_pid_ns(t))) {
                        /* From an ancestor pid namespace: clear the sender pid */
                        info->si_pid = 0;
                        force = true;
                } else {
                        /* Otherwise force only kernel generated signals */
                        force = (info->si_code == SI_KERNEL);
                }
        }

        return __send_signal_locked(sig, info, t, type, force);
}

/*
 * Log that current received the fatal signal @signr and dump its registers.
 *
 * The message is prefixed with the executable's file when
 * get_task_exe_file() provides one.  On i386 (but not UML) up to 16 bytes of
 * code at regs->ip are printed as well; the dump stops at the first byte
 * that get_user() cannot read.
 */
static void print_fatal_signal(int signr)
{
        struct pt_regs *regs = task_pt_regs(current);
        struct file *exe_file;

        exe_file = get_task_exe_file(current);
        if (!exe_file) {
                pr_info("%s: potentially unexpected fatal signal %d.\n",
                        current->comm, signr);
        } else {
                pr_info("%pD: %s: potentially unexpected fatal signal %d.\n",
                        exe_file, current->comm, signr);
                /* Drop the reference obtained from get_task_exe_file() */
                fput(exe_file);
        }

#if defined(__i386__) && !defined(__arch_um__)
        pr_info("code at %08lx: ", regs->ip);
        {
                int i;

                for (i = 0; i < 16; i++) {
                        unsigned char insn;

                        if (get_user(insn, (unsigned char *)(regs->ip + i)))
                                break;
                        pr_cont("%02x ", insn);
                }
        }
        pr_cont("\n");
#endif
        /* show_regs() is called with preemption disabled */
        preempt_disable();
        show_regs(regs);
        preempt_enable();
}

/*
 * Handler for the "print-fatal-signals=" boot parameter: lets get_option()
 * parse the value from @str into print_fatal_signals.  The result of
 * get_option() is not checked; the handler always returns 1.
 */
static int __init setup_print_fatal_signals(char *str)
{
        get_option(&str, &print_fatal_signals);

        return 1;
}

__setup("print-fatal-signals=", setup_print_fatal_signals);

/*
 * Take @p's sighand lock and send @sig through send_signal_locked().
 *
 * Returns -ESRCH when lock_task_sighand() fails, otherwise whatever
 * send_signal_locked() returned.
 */
int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p,
                        enum pid_type type)
{
        unsigned long flags;
        int ret;

        if (!lock_task_sighand(p, &flags))
                return -ESRCH;

        ret = send_signal_locked(sig, info, p, type);
        unlock_task_sighand(p, &flags);

        return ret;
}

/*
 * Selects how force_sig_info_to_task() treats the handler of the signal it
 * forces.  Any value other than HANDLER_CURRENT makes it reset the handler
 * to SIG_DFL; HANDLER_EXIT additionally sets SA_IMMUTABLE on the action.
 */
enum sig_handler {
        HANDLER_CURRENT, /* If reachable use the current handler */
        HANDLER_SIG_DFL, /* Always use SIG_DFL handler semantics */
        HANDLER_EXIT,         /* Only visible as the process exit code */
};

/*
 * Force a signal that the process can't ignore: if necessary
 * we unblock the signal and change any SIG_IGN to SIG_DFL.
 *
 * Note: If we unblock the signal, we always reset it to SIG_DFL,
 * since we do not want to have a signal handler that was blocked
 * be invoked when user space had explicitly blocked it.
 *
 * We don't want to have recursive SIGSEGV's etc, for example,
 * that is why we also clear SIGNAL_UNKILLABLE.
 *
 * @info:    siginfo to deliver; the signal number is taken from si_signo
 * @t:       target task
 * @handler: see enum sig_handler
 *
 * Takes t->sighand->siglock itself.  Returns the result of
 * send_signal_locked().
 */
static int
force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
        enum sig_handler handler)
{
        unsigned long int flags;
        int ret, blocked, ignored;
        struct k_sigaction *action;
        int sig = info->si_signo;

        spin_lock_irqsave(&t->sighand->siglock, flags);
        /* The action table is indexed by signal number minus one */
        action = &t->sighand->action[sig-1];
        ignored = action->sa.sa_handler == SIG_IGN;
        blocked = sigismember(&t->blocked, sig);
        if (blocked || ignored || (handler != HANDLER_CURRENT)) {
                action->sa.sa_handler = SIG_DFL;
                if (handler == HANDLER_EXIT)
                        action->sa.sa_flags |= SA_IMMUTABLE;
                if (blocked)
                        sigdelset(&t->blocked, sig);
        }
        /*
         * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
         * debugging to leave init killable. But HANDLER_EXIT is always fatal.
         */
        if (action->sa.sa_handler == SIG_DFL &&
            (!t->ptrace || (handler == HANDLER_EXIT)))
                t->signal->flags &= ~SIGNAL_UNKILLABLE;
        /* PIDTYPE_PID: the signal goes to this thread's private queue */
        ret = send_signal_locked(sig, info, t, PIDTYPE_PID);
        /* This can happen if the signal was already pending and blocked */
        if (!task_sigpending(t))
                signal_wake_up(t, 0);
        spin_unlock_irqrestore(&t->sighand->siglock, flags);

        return ret;
}

/*
 * Force the signal described by @info on current, keeping the currently
 * installed handler where force_sig_info_to_task() allows it
 * (HANDLER_CURRENT).  Returns the result of force_sig_info_to_task().
 */
int force_sig_info(struct kernel_siginfo *info)
{
        return force_sig_info_to_task(info, current, HANDLER_CURRENT);
}

/*
 * Nuke all other threads in the group.
 *
 * Resets the group stop count, clears the pending job control state of every
 * thread other than @p, and marks SIGKILL pending on and wakes up each of
 * those threads that has not exited yet.
 *
 * Returns the number of other threads visited, already dead ones included.
 */
int zap_other_threads(struct task_struct *p)
{
        struct task_struct *t;
        int count = 0;

        p->signal->group_stop_count = 0;

        for_other_threads(p, t) {
                task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
                count++;

                /* Only threads that are still alive need the SIGKILL */
                if (!t->exit_state) {
                        sigaddset(&t->pending.signal, SIGKILL);
                        signal_wake_up(t, 1);
                }
        }

        return count;
}

/*
 * Lock the siglock of @tsk's sighand with spin_lock_irqsave(), coping with
 * ->sighand changing or going away while we try.
 *
 * @tsk:   task whose sighand is to be locked
 * @flags: receives the saved interrupt state for the matching
 *         spin_unlock_irqrestore()
 *
 * Returns the locked sighand_struct, or NULL (with nothing locked) when
 * tsk->sighand is NULL.
 */
struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
                                           unsigned long *flags)
{
        struct sighand_struct *sighand;

        rcu_read_lock();
        for (;;) {
                sighand = rcu_dereference(tsk->sighand);
                if (unlikely(sighand == NULL))
                        break;

                /*
                 * This sighand can be already freed and even reused, but
                 * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which
                 * initializes ->siglock: this slab can't go away, it has
                 * the same object type, ->siglock can't be reinitialized.
                 *
                 * We need to ensure that tsk->sighand is still the same
                 * after we take the lock, we can race with de_thread() or
                 * __exit_signal(). In the latter case the next iteration
                 * must see ->sighand == NULL.
                 */
                spin_lock_irqsave(&sighand->siglock, *flags);
                if (likely(sighand == rcu_access_pointer(tsk->sighand)))
                        break;
                /* ->sighand changed under us: drop this lock and retry */
                spin_unlock_irqrestore(&sighand->siglock, *flags);
        }
        rcu_read_unlock();

        return sighand;
}

#ifdef CONFIG_LOCKDEP
/*
 * Lockdep check that the siglock of @task's sighand is held.  Warns once if
 * the task has no sighand at all.
 */
void lockdep_assert_task_sighand_held(struct task_struct *task)
{
        struct sighand_struct *sighand;

        rcu_read_lock();
        sighand = rcu_dereference(task->sighand);
        if (!sighand)
                WARN_ON_ONCE(1);
        else
                lockdep_assert_held(&sighand->siglock);
        rcu_read_unlock();
}
#endif

/*
 * Check that the sender may signal @p and, if so, send signal info to all
 * the members of its thread group, or to the individual thread if
 * type == PIDTYPE_PID.
 *
 * A @sig of 0 performs the permission check only.  Returns the error from
 * check_kill_permission() if it is non-zero, otherwise the result of
 * do_send_sig_info() (or 0 when @sig is 0).
 */
int group_send_sig_info(int sig, struct kernel_siginfo *info,
                        struct task_struct *p, enum pid_type type)
{
        int ret;

        rcu_read_lock();
        ret = check_kill_permission(sig, info, p);
        rcu_read_unlock();

        /* Permission denied, or nothing to deliver for signal 0 */
        if (ret || !sig)
                return ret;

        return do_send_sig_info(sig, info, p, type);
}

/*
 * __kill_pgrp_info() sends a signal to every task of a process group: this
 * is what the tty control characters do (^C, ^Z etc)
 * - the caller must hold at least a readlock on tasklist_lock
 *
 * Returns 0 once delivery to at least one task has succeeded, otherwise the
 * error from the last task tried, or -ESRCH for an empty process group.
 */
int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
{
        struct task_struct *p = NULL;
        int ret = -ESRCH;

        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
                int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID);

                /*
                 * ret sticks at 0 after the first success; until then it
                 * tracks the most recent result.
                 */
                if (ret != 0)
                        ret = err;
        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);

        return ret;
}

static int kill_pid_info_type(int sig, struct kernel_siginfo *info,
                                struct pid *pid, enum pid_type type)
{
        int error = -ESRCH;
        struct task_struct *p;

        for (;;) {
                rcu_read_lock();
                p = pid_task(pid, PIDTYPE_PID);
                if (p)
                        error = group_send_sig_info(sig, info, p, type);
                rcu_read_unlock();
                if (likely(!p || error != -ESRCH))
                        return error;
                /*
                 * The task was unhashed in between, try again.  If it
                 * is dead, pid_task() will return NULL, if we race with
                 * de_thread() it will find the new leader.
                 */
        }
}

/*
 * Send @sig with @info to the task attached to @pid, addressing the whole
 * thread group (PIDTYPE_TGID).  Returns the result of kill_pid_info_type().
 */
int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
{
        return kill_pid_info_type(sig, info, pid, PIDTYPE_TGID);
}

/*
 * Resolve the numeric @pid with find_vpid() and send @sig to it through
 * kill_pid_info().  The lookup and the send both happen inside one RCU
 * read-side section.
 */
static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
{
        int ret;

        rcu_read_lock();
        ret = kill_pid_info(sig, info, find_vpid(pid));
        rcu_read_unlock();

        return ret;
}

static inline bool kill_as_cred_perm(const struct cred *cred,
                                     struct task_struct *target)
{
        const struct cred *pcred = __task_cred(target);

        return uid_eq(cred->euid, pcred->suid) ||
               uid_eq(cred->euid, pcred->uid) ||
               uid_eq(cred->uid, pcred->suid) ||
               uid_eq(cred->uid, pcred->uid);
}

/*
 * The usb asyncio usage of siginfo is wrong.  The glibc support
 * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT.
 * AKA after the generic fields:
 *        kernel_pid_t        si_pid;
 *        kernel_uid32_t        si_uid;
 *        sigval_t        si_value;
 *
 * Unfortunately when usb generates SI_ASYNCIO it assumes the layout
 * after the generic fields is:
 *        void __user         *si_addr;
 *
 * This is a practical problem when there is a 64bit big endian kernel
 * and a 32bit userspace, as the 32bit address will be encoded in the low
 * 32bits of the pointer.  Those low 32bits will be stored at a higher
 * address than where they appear in a 32 bit pointer.  So userspace will
 * not see the address it was expecting for its completions.
 *
 * There is nothing in the encoding that can allow
 * copy_siginfo_to_user32 to detect this confusion of formats, so
 * handle this by requiring the caller of kill_pid_usb_asyncio to
 * notice when this situation takes place and to store the 32bit
 * pointer in sival_int, instead of sival_addr of the sigval_t addr
 * parameter.
 */
int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr,
                         struct pid *pid, const struct cred *cred)
{
        struct kernel_siginfo info;
        struct task_struct *p;
        unsigned long flags;
        int ret = -EINVAL;

        /* -EINVAL for an invalid signal number */
        if (!valid_signal(sig))
                return ret;

        clear_siginfo(&info);
        info.si_signo = sig;
        info.si_errno = errno;
        info.si_code = SI_ASYNCIO;
        /* Overlay @addr onto the siginfo, starting where si_pid lives */
        *((sigval_t *)&info.si_pid) = addr;

        rcu_read_lock();
        p = pid_task(pid, PIDTYPE_PID);
        if (!p) {
                ret = -ESRCH;
                goto out_unlock;
        }
        /* Permission is checked against the supplied @cred */
        if (!kill_as_cred_perm(cred, p)) {
                ret = -EPERM;
                goto out_unlock;
        }
        ret = security_task_kill(p, &info, sig, cred);
        if (ret)
                goto out_unlock;

        /* With sig == 0 only the checks above are performed */
        if (sig) {
                if (lock_task_sighand(p, &flags)) {
                        /*
                         * Calls __send_signal_locked() directly with force
                         * set to false, so the si_pid/si_uid fix-ups done by
                         * send_signal_locked() are not applied to this
                         * siginfo.
                         */
                        ret = __send_signal_locked(sig, &info, p, PIDTYPE_TGID, false);
                        unlock_task_sighand(p, &flags);
                } else
                        ret = -ESRCH;
        }
out_unlock:
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio);

/*
 * kill_something_info() interprets pid in interesting ways just like kill(2).
 *
 * POSIX specifies that kill(-1,sig) is unspecified, but what we have
 * is probably wrong.  Should make it like BSD or SYSV.
 *
 * How @pid is interpreted:
 *   pid > 0        - the single process with that pid (kill_proc_info())
 *   pid == 0       - the process group of current
 *   pid == -1      - every process whose task_pid_vnr() is greater than 1
 *                    and that is not in current's thread group
 *   pid == INT_MIN - rejected with -ESRCH
 *   other pid < 0  - the process group -pid
 *
 * For pid == -1 the return value is -ESRCH if no process was tried,
 * otherwise the most recent result that was not -EPERM (0 if every attempt
 * returned -EPERM).
 */

static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid)
{
        int ret;

        if (pid > 0)
                return kill_proc_info(sig, info, pid);

        /* -INT_MIN is undefined.  Exclude this case to avoid a UBSAN warning */
        if (pid == INT_MIN)
                return -ESRCH;

        /* The process group and all-processes walks run under tasklist_lock */
        read_lock(&tasklist_lock);
        if (pid != -1) {
                ret = __kill_pgrp_info(sig, info,
                                pid ? find_vpid(-pid) : task_pgrp(current));
        } else {
                int retval = 0, count = 0;
                struct task_struct * p;

                for_each_process(p) {
                        if (task_pid_vnr(p) > 1 &&
                                        !same_thread_group(p, current)) {
                                int err = group_send_sig_info(sig, info, p,
                                                              PIDTYPE_MAX);
                                ++count;
                                /* -EPERM from one process does not become the result */
                                if (err != -EPERM)
                                        retval = err;
                        }
                }
                ret = count ? retval : -ESRCH;
        }
        read_unlock(&tasklist_lock);

        return ret;
}

/*
 * These are for backward compatibility with the rest of the kernel source.
 */

/*
 * Send @sig with @info to the single task @p (PIDTYPE_PID).
 *
 * Returns -EINVAL for an invalid signal number, otherwise the result of
 * do_send_sig_info().
 */
int send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p)
{
        /*
         * Legacy kernel users get their signal number validated here;
         * normal paths check this in check_kill_permission.
         */
        if (valid_signal(sig))
                return do_send_sig_info(sig, info, p, PIDTYPE_PID);

        return -EINVAL;
}
EXPORT_SYMBOL(send_sig_info);

/*
 * Map a "privileged" flag onto the special siginfo values: non-zero selects
 * SEND_SIG_PRIV, zero selects SEND_SIG_NOINFO.
 */
#define __si_special(priv) \
        ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO)

/*
 * Send @sig to task @p without a caller-supplied siginfo.  A non-zero @priv
 * sends it as SEND_SIG_PRIV, zero as SEND_SIG_NOINFO (see __si_special()).
 * Returns the result of send_sig_info().
 */
int
send_sig(int sig, struct task_struct *p, int priv)
{
        return send_sig_info(sig, __si_special(priv), p);
}
EXPORT_SYMBOL(send_sig);

/*
 * Force @sig on current as a kernel generated signal (SI_KERNEL, pid and
 * uid 0), using the installed handler where force_sig_info() allows it.
 */
void force_sig(int sig)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_signo = sig;
        info.si_code = SI_KERNEL;
        info.si_errno = 0;
        info.si_pid = 0;
        info.si_uid = 0;

        force_sig_info(&info);
}
EXPORT_SYMBOL(force_sig);

/*
 * Force @sig on current as a kernel generated signal (SI_KERNEL, pid and
 * uid 0) with HANDLER_SIG_DFL, i.e. the handler is reset to SIG_DFL first.
 */
void force_fatal_sig(int sig)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_signo = sig;
        info.si_code = SI_KERNEL;
        info.si_errno = 0;
        info.si_pid = 0;
        info.si_uid = 0;

        force_sig_info_to_task(&info, current, HANDLER_SIG_DFL);
}

/*
 * Force @sig on current as a kernel generated signal (SI_KERNEL, pid and
 * uid 0) with HANDLER_EXIT: the handler is reset to SIG_DFL and the action
 * is marked SA_IMMUTABLE by force_sig_info_to_task().
 */
void force_exit_sig(int sig)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_signo = sig;
        info.si_code = SI_KERNEL;
        info.si_errno = 0;
        info.si_pid = 0;
        info.si_uid = 0;

        force_sig_info_to_task(&info, current, HANDLER_EXIT);
}

/*
 * Force a SIGSEGV on current after signal handling went wrong for @sig.
 *
 * If the signal that caused the problem was itself a SIGSEGV we do not even
 * try to run a handler again: the SIGSEGV is forced with default handler
 * semantics through force_fatal_sig().  For any other signal a regular
 * forced SIGSEGV is sent.
 */
void force_sigsegv(int sig)
{
        if (sig != SIGSEGV) {
                force_sig(SIGSEGV);
                return;
        }

        force_fatal_sig(SIGSEGV);
}

/*
 * Force fault signal @sig with si_code @code and fault address @addr on task
 * @t, keeping the current handler where force_sig_info_to_task() allows it.
 * Returns the result of force_sig_info_to_task().
 */
int force_sig_fault_to_task(int sig, int code, void __user *addr,
                            struct task_struct *t)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_signo = sig;
        info.si_code = code;
        info.si_errno = 0;
        info.si_addr = addr;

        return force_sig_info_to_task(&info, t, HANDLER_CURRENT);
}

/*
 * Force fault signal @sig with si_code @code and fault address @addr on
 * current.  Returns the result of force_sig_fault_to_task().
 */
int force_sig_fault(int sig, int code, void __user *addr)
{
        return force_sig_fault_to_task(sig, code, addr, current);
}

/*
 * Send (rather than force) fault signal @sig with si_code @code and fault
 * address @addr to task @t.  Returns the result of send_sig_info().
 */
int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_signo = sig;
        info.si_code = code;
        info.si_errno = 0;
        info.si_addr = addr;

        return send_sig_info(sig, &info, t);
}

/*
 * Force a SIGBUS on current with si_code @code, address @addr and
 * si_addr_lsb set to @lsb.  Warns when @code is neither BUS_MCEERR_AO nor
 * BUS_MCEERR_AR but sends the signal regardless.
 * Returns the result of force_sig_info().
 */
int force_sig_mceerr(int code, void __user *addr, short lsb)
{
        struct kernel_siginfo info;

        WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));

        clear_siginfo(&info);
        info.si_signo = SIGBUS;
        info.si_code = code;
        info.si_errno = 0;
        info.si_addr = addr;
        info.si_addr_lsb = lsb;

        return force_sig_info(&info);
}

/*
 * Send (rather than force) a SIGBUS to task @t with si_code @code, address
 * @addr and si_addr_lsb set to @lsb.  Warns when @code is neither
 * BUS_MCEERR_AO nor BUS_MCEERR_AR but sends the signal regardless.
 * Returns the result of send_sig_info().
 */
int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
{
        struct kernel_siginfo info;

        WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));

        clear_siginfo(&info);
        info.si_signo = SIGBUS;
        info.si_code = code;
        info.si_errno = 0;
        info.si_addr = addr;
        info.si_addr_lsb = lsb;

        return send_sig_info(info.si_signo, &info, t);
}
EXPORT_SYMBOL(send_sig_mceerr);

/*
 * Force a SIGSEGV with si_code SEGV_BNDERR on current, reporting the
 * faulting address @addr together with the @lower and @upper bounds.
 * Returns the result of force_sig_info().
 */
int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_signo = SIGSEGV;
        info.si_code = SEGV_BNDERR;
        info.si_errno = 0;
        info.si_addr = addr;
        info.si_lower = lower;
        info.si_upper = upper;

        return force_sig_info(&info);
}

#ifdef SEGV_PKUERR
/*
 * Force a SIGSEGV with si_code SEGV_PKUERR on current, reporting the
 * faulting address @addr and the protection key @pkey.
 * Returns the result of force_sig_info().
 */
int force_sig_pkuerr(void __user *addr, u32 pkey)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_signo = SIGSEGV;
        info.si_code = SEGV_PKUERR;
        info.si_errno = 0;
        info.si_addr = addr;
        info.si_pkey = pkey;

        return force_sig_info(&info);
}
#endif

/*
 * Send a SIGTRAP with si_code TRAP_PERF to current on behalf of a perf
 * event, carrying @addr, @type (si_perf_type) and @sig_data (si_perf_data).
 * Returns the result of send_sig_info().
 */
int send_sig_perf(void __user *addr, u32 type, u64 sig_data)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_signo = SIGTRAP;
        info.si_code = TRAP_PERF;
        info.si_errno = 0;
        info.si_addr = addr;
        info.si_perf_data = sig_data;
        info.si_perf_type = type;

        /*
         * The signal is sent, not forced: a perf event should not terminate
         * the whole process just because SIGTRAP is blocked, yet delivering
         * the signal asynchronously is better than not delivering it at
         * all.  Flag the asynchronous case so user space can clearly tell
         * it apart from normal synchronous delivery.
         */
        if (sigismember(&current->blocked, info.si_signo))
                info.si_perf_flags = TRAP_PERF_FLAG_ASYNC;
        else
                info.si_perf_flags = 0;

        return send_sig_info(info.si_signo, &info, current);
}

/**
 * force_sig_seccomp - signals the task to allow in-process syscall emulation
 * @syscall: syscall number to send to userland
 * @reason: filter-supplied reason code to send to userland (via si_errno)
 * @force_coredump: true to trigger a coredump
 *
 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info on
 * current.  With @force_coredump the signal is forced with HANDLER_EXIT,
 * otherwise with HANDLER_CURRENT.
 *
 * Return: the result of force_sig_info_to_task().
 */
int force_sig_seccomp(int syscall, int reason, bool force_coredump)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_signo = SIGSYS;
        info.si_code = SYS_SECCOMP;
        info.si_call_addr = (void __user *)KSTK_EIP(current);
        info.si_errno = reason;
        info.si_arch = syscall_get_arch(current);
        info.si_syscall = syscall;

        if (force_coredump)
                return force_sig_info_to_task(&info, current, HANDLER_EXIT);

        return force_sig_info_to_task(&info, current, HANDLER_CURRENT);
}

/*
 * For the crazy architectures that include trap information in
 * the errno field, instead of an actual errno value.
 *
 * Forces a SIGTRAP with si_code TRAP_HWBKPT on current, with @errno stored
 * in si_errno and @addr in si_addr.  Returns the result of force_sig_info().
 */
int force_sig_ptrace_errno_trap(int errno, void __user *addr)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_signo = SIGTRAP;
        info.si_code = TRAP_HWBKPT;
        info.si_errno = errno;
        info.si_addr = addr;

        return force_sig_info(&info);
}

/*
 * For the rare architectures that include trap information using
 * si_trapno.
 *
 * Forces fault signal @sig with si_code @code on current, reporting @addr
 * and @trapno.  Returns the result of force_sig_info().
 */
int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_signo = sig;
        info.si_code = code;
        info.si_errno = 0;
        info.si_addr = addr;
        info.si_trapno = trapno;

        return force_sig_info(&info);
}

/*
 * For the rare architectures that include trap information using
 * si_trapno.
 *
 * Sends (rather than forces) fault signal @sig with si_code @code to task
 * @t, reporting @addr and @trapno.  Returns the result of send_sig_info().
 */
int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
                          struct task_struct *t)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_signo = sig;
        info.si_code = code;
        info.si_errno = 0;
        info.si_addr = addr;
        info.si_trapno = trapno;

        return send_sig_info(sig, &info, t);
}

/*
 * Locked wrapper around __kill_pgrp_info(): takes tasklist_lock for reading
 * while the process group @pgrp is signalled.
 */
static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
{
        int retval;

        read_lock(&tasklist_lock);
        retval = __kill_pgrp_info(sig, info, pgrp);
        read_unlock(&tasklist_lock);

        return retval;
}

/*
 * Send @sig to every task in the process group @pid.  A non-zero @priv
 * sends it as SEND_SIG_PRIV, zero as SEND_SIG_NOINFO (see __si_special()).
 * Returns the result of kill_pgrp_info().
 */
int kill_pgrp(struct pid *pid, int sig, int priv)
{
        return kill_pgrp_info(sig, __si_special(priv), pid);
}
EXPORT_SYMBOL(kill_pgrp);

/*
 * Send @sig to the process attached to @pid.  A non-zero @priv sends it as
 * SEND_SIG_PRIV, zero as SEND_SIG_NOINFO (see __si_special()).
 * Returns the result of kill_pid_info().
 */
int kill_pid(struct pid *pid, int sig, int priv)
{
        return kill_pid_info(sig, __si_special(priv), pid);
}
EXPORT_SYMBOL(kill_pid);

/*
 * These functions support sending signals using preallocated sigqueue
 * structures.  This is needed "because realtime applications cannot
 * afford to lose notifications of asynchronous events, like timer
 * expirations or I/O completions".  In the case of POSIX Timers
 * we allocate the sigqueue structure from the timer_create.  If this
 * allocation fails we are able to report the failure to the application
 * with an EAGAIN error.
 *
 * sigqueue_alloc() allocates such an entry on behalf of current with
 * GFP_KERNEL and marks it SIGQUEUE_PREALLOC.  It returns whatever
 * __sigqueue_alloc() returns.
 * NOTE(review): presumably NULL on allocation failure - confirm against
 * __sigqueue_alloc().
 */
struct sigqueue *sigqueue_alloc(void)
{
        return __sigqueue_alloc(-1, current, GFP_KERNEL, 0, SIGQUEUE_PREALLOC);
}

/*
 * Release a preallocated sigqueue entry.
 *
 * The SIGQUEUE_PREALLOC flag is cleared under current's siglock.  If the
 * entry is still on a pending list at that point it is left alone and gets
 * freed when it is dequeued; otherwise it is freed here.
 *
 * It is a BUG to pass an entry that is not marked SIGQUEUE_PREALLOC.
 */
void sigqueue_free(struct sigqueue *q)
{
        unsigned long flags;
        spinlock_t *lock = &current->sighand->siglock;

        BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
        /*
         * We must hold ->siglock while testing q->list
         * to serialize with collect_signal() or with
         * __exit_signal()->flush_sigqueue().
         */
        spin_lock_irqsave(lock, flags);
        q->flags &= ~SIGQUEUE_PREALLOC;
        /*
         * If it is queued it will be freed when dequeued,
         * like the "regular" sigqueue.
         */
        if (!list_empty(&q->list))
                q = NULL;
        spin_unlock_irqrestore(lock, flags);

        /* q was set to NULL above when the dequeue path owns the entry */
        if (q)
                __sigqueue_free(q);
}

/*
 * Queue the preallocated sigqueue entry @q, as signal q->info.si_signo,
 * for the task that pid_task(@pid, @type) resolves to.
 *
 * Return: 0 if the entry was queued, or was already queued and only had
 * its overrun count incremented; 1 if prepare_signal() reported that the
 * signal is ignored; -1 if no task was found or its sighand could not be
 * locked.
 *
 * It is a BUG to pass an entry that does not carry SIGQUEUE_PREALLOC.
 */
int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
{
        int sig = q->info.si_signo;
        struct sigpending *pending;
        struct task_struct *t;
        unsigned long flags;
        int ret, result;

        BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));

        /* Failure value for the two early exits below. */
        ret = -1;
        rcu_read_lock();

        /*
         * This function is used by POSIX timers to deliver a timer signal.
         * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID
         * set), the signal must be delivered to the specific thread (queues
         * into t->pending).
         *
         * Where type is not PIDTYPE_PID, signals must be delivered to the
         * process. In this case, prefer to deliver to current if it is in
         * the same thread group as the target process, which avoids
         * unnecessarily waking up a potentially idle task.
         */
        t = pid_task(pid, type);
        if (!t)
                goto ret;
        if (type != PIDTYPE_PID && same_thread_group(t, current))
                t = current;
        if (!likely(lock_task_sighand(t, &flags)))
                goto ret;

        ret = 1; /* the signal is ignored */
        result = TRACE_SIGNAL_IGNORED;
        if (!prepare_signal(sig, t, false))
                goto out;

        ret = 0;
        if (unlikely(!list_empty(&q->list))) {
                /*
                 * If an SI_TIMER entry is already queued just increment
                 * the overrun count.
                 */
                BUG_ON(q->info.si_code != SI_TIMER);
                q->info.si_overrun++;
                result = TRACE_SIGNAL_ALREADY_PENDING;
                goto out;
        }
        q->info.si_overrun = 0;

        signalfd_notify(t, sig);
        /*
         * Thread-directed signals go on the task's private pending queue,
         * everything else on the shared queue of its signal struct.
         */
        pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
        list_add_tail(&q->list, &pending->list);
        sigaddset(&pending->signal, sig);
        complete_signal(sig, t, type);
        result = TRACE_SIGNAL_DELIVERED;
out:
        /* Reached with the sighand lock held; trace the outcome, then unlock. */
        trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result);
        unlock_task_sighand(t, &flags);
ret:
        rcu_read_unlock();
        return ret;
}

/*
 * Wake the waiters sleeping on the wait_pidfd queue of @task's struct pid,
 * using an EPOLLIN | EPOLLRDNORM poll key.
 *
 * Warns if @task->exit_state is still 0 when this is called.
 */
void do_notify_pidfd(struct task_struct *task)
{
        struct pid *pid;

        WARN_ON(task->exit_state == 0);

        pid = task_pid(task);
        __wake_up(&pid->wait_pidfd, TASK_NORMAL, 0,
                  poll_to_key(EPOLLIN | EPOLLRDNORM));
}

/*
 * Let a parent know about the death of a child.
 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
 *
 * @tsk: the task whose ->parent is notified
 * @sig: signal to send to the parent; replaced by SIGCHLD when
 *       tsk->parent_exec_id no longer matches the parent's self_exec_id
 *
 * Returns true if our parent ignored us and so we've switched to
 * self-reaping.
 */
bool do_notify_parent(struct task_struct *tsk, int sig)
{
        struct kernel_siginfo info;
        unsigned long flags;
        struct sighand_struct *psig;
        bool autoreap = false;
        u64 utime, stime;

        WARN_ON_ONCE(sig == -1);

        /* do_notify_parent_cldstop should have been called instead.  */
        WARN_ON_ONCE(task_is_stopped_or_traced(tsk));

        WARN_ON_ONCE(!tsk->ptrace &&
               (tsk->group_leader != tsk || !thread_group_empty(tsk)));
        /*
         * tsk is a group leader and has no threads, wake up the
         * non-PIDFD_THREAD waiters.
         */
        if (thread_group_empty(tsk))
                do_notify_pidfd(tsk);

        if (sig != SIGCHLD) {
                /*
                 * This is only possible if parent == real_parent.
                 * Check if it has changed security domain.
                 */
                if (tsk->parent_exec_id != READ_ONCE(tsk->parent->self_exec_id))
                        sig = SIGCHLD;
        }

        clear_siginfo(&info);
        info.si_signo = sig;
        info.si_errno = 0;
        /*
         * We are under tasklist_lock here so our parent is tied to
         * us and cannot change.
         *
         * task_active_pid_ns will always return the same pid namespace
         * until a task passes through release_task.
         *
         * write_lock() currently calls preempt_disable() which is the
         * same as rcu_read_lock(), but according to Oleg, it is not
         * correct to rely on this.
         */
        rcu_read_lock();
        info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent));
        info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns),
                                       task_uid(tsk));
        rcu_read_unlock();

        /* Report the task's own CPU time plus what tsk->signal has accumulated. */
        task_cputime(tsk, &utime, &stime);
        info.si_utime = nsec_to_clock_t(utime + tsk->signal->utime);
        info.si_stime = nsec_to_clock_t(stime + tsk->signal->stime);

        /*
         * Decode exit_code: bit 0x80 set means CLD_DUMPED, otherwise a
         * non-zero low 7 bits means CLD_KILLED; in both cases si_status
         * is the low 7 bits.  With neither set this is CLD_EXITED and
         * si_status is taken from bits 8 and up.
         */
        info.si_status = tsk->exit_code & 0x7f;
        if (tsk->exit_code & 0x80)
                info.si_code = CLD_DUMPED;
        else if (tsk->exit_code & 0x7f)
                info.si_code = CLD_KILLED;
        else {
                info.si_code = CLD_EXITED;
                info.si_status = tsk->exit_code >> 8;
        }

        psig = tsk->parent->sighand;
        spin_lock_irqsave(&psig->siglock, flags);
        if (!tsk->ptrace && sig == SIGCHLD &&
            (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
             (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
                /*
                 * We are exiting and our parent doesn't care.  POSIX.1
                 * defines special semantics for setting SIGCHLD to SIG_IGN
                 * or setting the SA_NOCLDWAIT flag: we should be reaped
                 * automatically and not left for our parent's wait4 call.
                 * Rather than having the parent do it as a magic kind of
                 * signal handler, we just set this to tell do_exit that we
                 * can be cleaned up without becoming a zombie.  Note that
                 * we still call __wake_up_parent in this case, because a
                 * blocked sys_wait4 might now return -ECHILD.
                 *
                 * Whether we send SIGCHLD or not for SA_NOCLDWAIT
                 * is implementation-defined: we do (if you don't want
                 * it, just use SIG_IGN instead).
                 */
                autoreap = true;
                if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
                        sig = 0;
        }
        /*
         * Send with __send_signal_locked as si_pid and si_uid are in the
         * parent's namespaces.  Nothing is sent when sig was zeroed above
         * because the parent ignores SIGCHLD.
         */
        if (valid_signal(sig) && sig)
                __send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false);
        __wake_up_parent(tsk, tsk->parent);
        spin_unlock_irqrestore(&psig->siglock, flags);

        return autoreap;
}

/**
 * do_notify_parent_cldstop - notify parent of stopped/continued state change
 * @tsk: task reporting the state change
 * @for_ptracer: the notification is for ptracer
 * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
 *
 * Tell @tsk's parent that the stopped/continued state has changed.  With
 * @for_ptracer %true, @tsk itself reports to @tsk->parent, which should be
 * the ptracer.  With %false, the report is made by @tsk's group leader to
 * the leader's real parent.
 *
 * SIGCHLD is only generated when the recipient neither ignores SIGCHLD nor
 * has SA_NOCLDSTOP set; the recipient's waiters are woken in every case.
 * Any @why outside the three values above is a BUG.
 *
 * CONTEXT:
 * Must be called with tasklist_lock at least read locked.
 */
static void do_notify_parent_cldstop(struct task_struct *tsk,
                                     bool for_ptracer, int why)
{
        struct kernel_siginfo info;
        struct sighand_struct *sighand;
        struct task_struct *parent;
        unsigned long flags;
        u64 utime, stime;
        bool want_sigchld;

        if (!for_ptracer) {
                tsk = tsk->group_leader;
                parent = tsk->real_parent;
        } else {
                parent = tsk->parent;
        }

        clear_siginfo(&info);
        info.si_signo = SIGCHLD;
        info.si_errno = 0;
        info.si_code = why;

        /*
         * see comment in do_notify_parent() about the pid/uid lookup
         * done under rcu_read_lock() here
         */
        rcu_read_lock();
        info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
        info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
        rcu_read_unlock();

        task_cputime(tsk, &utime, &stime);
        info.si_utime = nsec_to_clock_t(utime);
        info.si_stime = nsec_to_clock_t(stime);

        /* si_status depends on which kind of state change is reported. */
        if (why == CLD_CONTINUED)
                info.si_status = SIGCONT;
        else if (why == CLD_STOPPED)
                info.si_status = tsk->signal->group_exit_code & 0x7f;
        else if (why == CLD_TRAPPED)
                info.si_status = tsk->exit_code & 0x7f;
        else
                BUG();

        sighand = parent->sighand;
        spin_lock_irqsave(&sighand->siglock, flags);
        want_sigchld = sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
                       !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP);
        if (want_sigchld)
                send_signal_locked(SIGCHLD, &info, parent, PIDTYPE_TGID);
        /*
         * wait4 callers have to be woken up whether or not SIGCHLD
         * was generated.
         */
        __wake_up_parent(tsk, parent);
        spin_unlock_irqrestore(&sighand->siglock, flags);
}

/*
 * This must be called with current->sighand->siglock held.
 *
 * This should be the path for all ptrace stops.
 * We always set current->last_siginfo while stopped here.
 * That makes it a way to test a stopped process for
 * being ptrace-stopped vs being job-control-stopped.
 *
 * @exit_code: stored in current->exit_code for the duration of the stop
 * @why: CLD_* code passed on to do_notify_parent_cldstop()
 * @message: stored in current->ptrace_message for the duration of the stop
 * @info: stored in current->last_siginfo for the duration of the stop;
 *        may be NULL
 *
 * Returns the signal the ptracer requested the code resume
 * with.  If the code did not stop because the tracer is gone or a
 * fatal signal is pending, @exit_code is returned unchanged.
 */
static int ptrace_stop(int exit_code, int why, unsigned long message,
                       kernel_siginfo_t *info)
        __releases(&current->sighand->siglock)
        __acquires(&current->sighand->siglock)
{
        bool gstop_done = false;

        if (arch_ptrace_stop_needed()) {
                /*
                 * The arch code has something special to do before a
                 * ptrace stop.  This is allowed to block, e.g. for faults
                 * on user stack pages.  We can't keep the siglock while
                 * calling arch_ptrace_stop, so we must release it now.
                 * To preserve proper semantics, we must do this before
                 * any signal bookkeeping like checking group_stop_count.
                 */
                spin_unlock_irq(&current->sighand->siglock);
                arch_ptrace_stop();
                spin_lock_irq(&current->sighand->siglock);
        }

        /*
         * After this point ptrace_signal_wake_up or signal_wake_up
         * will clear TASK_TRACED if ptrace_unlink happens or a fatal
         * signal comes in.  Handle previous ptrace_unlinks and fatal
         * signals here to prevent ptrace_stop sleeping in schedule.
         */
        if (!current->ptrace || __fatal_signal_pending(current))
                return exit_code;

        set_special_state(TASK_TRACED);
        current->jobctl |= JOBCTL_TRACED;

        /*
         * We're committing to trapping.  TRACED should be visible before
         * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
         * Also, transition to TRACED and updates to ->jobctl should be
         * atomic with respect to siglock and should be done after the arch
         * hook as siglock is released and regrabbed across it.
         *
         *     TRACER                                    TRACEE
         *
         *     ptrace_attach()
         * [L]   wait_on_bit(JOBCTL_TRAPPING)        [S] set_special_state(TRACED)
         *     do_wait()
         *       set_current_state()                smp_wmb();
         *       ptrace_do_wait()
         *         wait_task_stopped()
         *           task_stopped_code()
         * [L]         task_is_traced()                [S] task_clear_jobctl_trapping();
         */
        smp_wmb();

        current->ptrace_message = message;
        current->last_siginfo = info;
        current->exit_code = exit_code;

        /*
         * If @why is CLD_STOPPED, we're trapping to participate in a group
         * stop.  Do the bookkeeping.  Note that if SIGCONT was delivered
         * across siglock relocks since INTERRUPT was scheduled, PENDING
         * could be clear now.  We act as if SIGCONT is received after
         * TASK_TRACED is entered - ignore it.
         */
        if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
                gstop_done = task_participate_group_stop(current);

        /* any trap clears pending STOP trap, STOP trap clears NOTIFY */
        task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
        if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
                task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);

        /* entering a trap, clear TRAPPING */
        task_clear_jobctl_trapping(current);

        spin_unlock_irq(&current->sighand->siglock);
        read_lock(&tasklist_lock);
        /*
         * Notify parents of the stop.
         *
         * While ptraced, there are two parents - the ptracer and
         * the real_parent of the group_leader.  The ptracer should
         * know about every stop while the real parent is only
         * interested in the completion of group stop.  The states
         * for the two don't interact with each other.  Notify
         * separately unless they're gonna be duplicates.
         */
        if (current->ptrace)
                do_notify_parent_cldstop(current, true, why);
        if (gstop_done && (!current->ptrace || ptrace_reparented(current)))
                do_notify_parent_cldstop(current, false, why);

        /*
         * The previous do_notify_parent_cldstop() invocation woke ptracer.
         * On a PREEMPTION kernel this can result in preemption requirement
         * which will be fulfilled after read_unlock() and the ptracer will be
         * put on the CPU.
         * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for
         * this task wait in schedule(). If this task gets preempted then it
         * remains enqueued on the runqueue. The ptracer will observe this and
         * then sleep for a delay of one HZ tick. In the meantime this task
         * gets scheduled, enters schedule() and will wait for the ptracer.
         *
         * This preemption point is not bad from a correctness point of
         * view but extends the runtime by one HZ tick time due to the
         * ptracer's sleep.  The preempt-disable section ensures that there
         * will be no preemption between unlock and schedule() and so
         * improving the performance since the ptracer will observe that
         * the tracee is scheduled out once it gets on the CPU.
         *
         * On PREEMPT_RT locking tasklist_lock does not disable preemption.
         * Therefore the task can be preempted after do_notify_parent_cldstop()
         * before unlocking tasklist_lock so there is no benefit in doing this.
         *
         * In fact disabling preemption is harmful on PREEMPT_RT because
         * the spinlock_t in cgroup_enter_frozen() must not be acquired
         * with preemption disabled due to the 'sleeping' spinlock
         * substitution of RT.
         */
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        read_unlock(&tasklist_lock);
        cgroup_enter_frozen();
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable_no_resched();
        schedule();
        cgroup_leave_frozen(true);

        /*
         * We are back.  Now reacquire the siglock before touching
         * last_siginfo, so that we are sure to have synchronized with
         * any signal-sending on another CPU that wants to examine it.
         */
        spin_lock_irq(&current->sighand->siglock);
        /*
         * Whatever is in current->exit_code now becomes the return value;
         * the per-stop fields set above are then reset.
         */
        exit_code = current->exit_code;
        current->last_siginfo = NULL;
        current->ptrace_message = 0;
        current->exit_code = 0;

        /* LISTENING can be set only during STOP traps, clear it */
        current->jobctl &= ~(JOBCTL_LISTENING | JOBCTL_PTRACE_FROZEN);

        /*
         * Queued signals ignored us while we were stopped for tracing.
         * So check for any that we should take before resuming user mode.
         * This sets TIF_SIGPENDING, but never clears it.
         */
        recalc_sigpending_tsk(current);
        return exit_code;
}

/*
 * Build a siginfo describing current (si_signo = @signr, si_code =
 * @exit_code, si_pid/si_uid taken from current) and enter ptrace_stop()
 * with it.
 *
 * Returns whatever ptrace_stop() returns.
 */
static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long message)
{
        kernel_siginfo_t info;
        int resume_sig;

        clear_siginfo(&info);
        info.si_pid = task_pid_vnr(current);
        info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
        info.si_signo = signr;
        info.si_code = exit_code;

        /* Hand control to the debugger. */
        resume_sig = ptrace_stop(exit_code, why, message, &info);

        return resume_sig;
}

/*
 * Report a CLD_TRAPPED SIGTRAP stop to the tracer.
 *
 * @exit_code must have SIGTRAP in its low 7 bits and nothing set above
 * bit 15; anything else is a BUG.  Pending task work is run first, then
 * the stop is performed under current's siglock.
 *
 * Returns the value produced by ptrace_do_notify().
 */
int ptrace_notify(int exit_code, unsigned long message)
{
        int resume_sig;

        BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);

        if (unlikely(task_work_pending(current)))
                task_work_run();

        spin_lock_irq(&current->sighand->siglock);
        resume_sig = ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message);
        spin_unlock_irq(&current->sighand->siglock);

        return resume_sig;
}

/**
 * do_signal_stop - handle group stop for SIGSTOP and other stop signals
 * @signr: signr causing group stop if initiating
 *
 * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
 * and participate in it.  If already set, participate in the existing
 * group stop.  If participated in a group stop (and thus slept), %true is
 * returned with siglock released.
 *
 * If ptraced, this function doesn't handle stop itself.  Instead,
 * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
 * untouched.  The caller must ensure that INTERRUPT trap handling takes
 * places afterwards.
 *
 * CONTEXT:
 * Must be called with @current->sighand->siglock held, which is released
 * on %true return.
 *
 * RETURNS:
 * %false if group stop is already cancelled or ptrace trap is scheduled.
 * %true if participated in group stop.
 */
static bool do_signal_stop(int signr)
        __releases(&current->sighand->siglock)
{
        struct signal_struct *sig = current->signal;

        if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
                unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
                struct task_struct *t;

                /* signr will be recorded in task->jobctl for retries */
                WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);

                /*
                 * Do not start a stop if the dequeue marker is gone, the
                 * group is exiting, or group_exec_task is set.
                 */
                if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
                    unlikely(sig->flags & SIGNAL_GROUP_EXIT) ||
                    unlikely(sig->group_exec_task))
                        return false;
                /*
                 * There is no group stop already in progress.  We must
                 * initiate one now.
                 *
                 * While ptraced, a task may be resumed while group stop is
                 * still in effect and then receive a stop signal and
                 * initiate another group stop.  This deviates from the
                 * usual behavior as two consecutive stop signals can't
                 * cause two group stops when !ptraced.  That is why we
                 * also check !task_is_stopped(t) below.
                 *
                 * The condition can be distinguished by testing whether
                 * SIGNAL_STOP_STOPPED is already set.  Don't generate
                 * group_exit_code in such case.
                 *
                 * This is not necessary for SIGNAL_STOP_CONTINUED because
                 * an intervening stop signal is required to cause two
                 * continued events regardless of ptrace.
                 */
                if (!(sig->flags & SIGNAL_STOP_STOPPED))
                        sig->group_exit_code = signr;

                /*
                 * Rebuild the count from scratch: one for each thread,
                 * current included, that accepts the pending stop below.
                 */
                sig->group_stop_count = 0;
                if (task_set_jobctl_pending(current, signr | gstop))
                        sig->group_stop_count++;

                for_other_threads(current, t) {
                        /*
                         * Setting state to TASK_STOPPED for a group
                         * stop is always done with the siglock held,
                         * so this check has no races.
                         */
                        if (!task_is_stopped(t) &&
                            task_set_jobctl_pending(t, signr | gstop)) {
                                sig->group_stop_count++;
                                /*
                                 * Threads that are not PT_SEIZED get a
                                 * plain wakeup, seized ones a trap
                                 * notification.
                                 */
                                if (likely(!(t->ptrace & PT_SEIZED)))
                                        signal_wake_up(t, 0);
                                else
                                        ptrace_trap_notify(t);
                        }
                }
        }

        if (likely(!current->ptrace)) {
                /* 0 means "no notification"; otherwise the CLD_* code to report */
                int notify = 0;

                /*
                 * If there are no other threads in the group, or if there
                 * is a group stop in progress and we are the last to stop,
                 * report to the parent.
                 */
                if (task_participate_group_stop(current))
                        notify = CLD_STOPPED;

                current->jobctl |= JOBCTL_STOPPED;
                set_special_state(TASK_STOPPED);
                spin_unlock_irq(&current->sighand->siglock);

                /*
                 * Notify the parent of the group stop completion.  Because
                 * we're not holding either the siglock or tasklist_lock
                 * here, ptracer may attach inbetween; however, this is for
                 * group stop and should always be delivered to the real
                 * parent of the group leader.  The new ptracer will get
                 * its notification when this task transitions into
                 * TASK_TRACED.
                 */
                if (notify) {
                        read_lock(&tasklist_lock);
                        do_notify_parent_cldstop(current, false, notify);
                        read_unlock(&tasklist_lock);
                }

                /* Now we don't run again until woken by SIGCONT or SIGKILL */
                cgroup_enter_frozen();
                schedule();
                return true;
        } else {
                /*
                 * While ptraced, group stop is handled by STOP trap.
                 * Schedule it and let the caller deal with it.
                 */
                task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
                return false;
        }
}

/**
 * do_jobctl_trap - take care of ptrace jobctl traps
 *
 * When PT_SEIZED, it's used for both group stop and explicit
 * SEIZE/INTERRUPT traps.  Both generate PTRACE_EVENT_STOP trap with
 * accompanying siginfo.  If stopped, lower eight bits of exit_code contain
 * the stop signal; otherwise, %SIGTRAP.
 *
 * When !PT_SEIZED, it's used only for group stop trap with stop signal
 * number as exit_code and no siginfo.
 *
 * CONTEXT:
 * Must be called with @current->sighand->siglock held, which may be
 * released and re-acquired before returning with intervening sleep.
 */
static void do_jobctl_trap(void)
{
        struct signal_struct *signal = current->signal;
        int signr = current->jobctl & JOBCTL_STOP_SIGMASK;

        if (current->ptrace & PT_SEIZED) {
                if (!signal->group_stop_count &&
                    !(signal->flags & SIGNAL_STOP_STOPPED))
                        signr = SIGTRAP;
                WARN_ON_ONCE(!signr);
                ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
                                 CLD_STOPPED, 0);
        } else {
                WARN_ON_ONCE(!signr);
                ptrace_stop(signr, CLD_STOPPED, 0, NULL);
        }
}

/**
 * do_freezer_trap - handle the freezer jobctl trap
 *
 * Puts the task to sleep in the frozen state, but only when
 * JOBCTL_TRAP_FREEZE is the sole pending/trap bit set in ->jobctl.
 * Otherwise it just drops the siglock and returns so the caller can
 * deal with the other bits first.
 *
 * CONTEXT:
 * Must be called with @current->sighand->siglock held,
 * which is always released before returning.
 */
static void do_freezer_trap(void)
        __releases(&current->sighand->siglock)
{
        unsigned long trap_bits;

        /*
         * Anything pending besides JOBCTL_TRAP_FREEZE gets a chance to
         * be handled first: bail out and let the caller loop once more.
         * We will end up back here either way.
         */
        trap_bits = current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE);
        if (trap_bits != JOBCTL_TRAP_FREEZE) {
                spin_unlock_irq(&current->sighand->siglock);
                return;
        }

        /*
         * At this point no fatal signal and no other trap is pending.
         * TIF_SIGPENDING is cleared so that a pending non-fatal signal
         * does not make schedule() return immediately, and then the
         * task goes to sleep.
         */
        __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
        clear_thread_flag(TIF_SIGPENDING);
        spin_unlock_irq(&current->sighand->siglock);

        cgroup_enter_frozen();
        schedule();
}

/*
 * Report signal @signr to the tracer through a CLD_TRAPPED ptrace stop and
 * work out which signal, if any, the task should go on to handle.
 *
 * Returns 0 if the stop handed back signal 0, or if the resulting signal
 * had to be requeued because it is blocked or a fatal signal is pending.
 * Otherwise returns the resulting signal number; when that differs from
 * @info->si_signo, *@info is rebuilt as an SI_USER siginfo whose pid and
 * uid are taken from current->parent.
 */
static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
{
        /*
         * The marker is set without looking at sig_kernel_stop(signr)
         * because the debugger may well change signr.  It only matters
         * if we end up stopping after ptrace_stop() returns: then
         * do_signal_stop() checks it, and we must stop only if SIGCONT
         * did not clear it while we slept.  See also the comment in
         * dequeue_signal().
         */
        current->jobctl |= JOBCTL_STOP_DEQUEUED;
        signr = ptrace_stop(signr, CLD_TRAPPED, 0, info);

        /* Back from the stop: the debugger may have cancelled the signal. */
        if (!signr)
                return 0;

        /*
         * A changed signal number gets a fresh siginfo.  A debugger that
         * wants specific siginfo contents is expected to have set *info
         * itself via PTRACE_SETSIGINFO.
         */
        if (info->si_signo != signr) {
                clear_siginfo(info);
                info->si_signo = signr;
                info->si_errno = 0;
                info->si_code = SI_USER;
                rcu_read_lock();
                info->si_pid = task_pid_vnr(current->parent);
                info->si_uid = from_kuid_munged(current_user_ns(),
                                                task_uid(current->parent));
                rcu_read_unlock();
        }

        /* Deliverable right now: hand it back to the caller. */
        if (!sigismember(&current->blocked, signr) &&
            !fatal_signal_pending(current))
                return signr;

        /* The (new) signal is blocked, or we are dying: requeue it. */
        send_signal_locked(signr, info, current, type);
        return 0;
}

static void hide_si_addr_tag_bits(struct ksignal *ksig)
{
        switch (siginfo_layout(ksig->sig, ksig->info.si_code)) {
        case SIL_FAULT:
        case SIL_FAULT_TRAPNO:
        case SIL_FAULT_MCEERR:
        case SIL_FAULT_BNDERR:
        case SIL_FAULT_PKUERR:
        case SIL_FAULT_PERF_EVENT:
                ksig->info.si_addr = arch_untagged_si_addr(
                        ksig->info.si_addr, ksig->sig, ksig->info.si_code);
                break;
        case SIL_KILL:
        case SIL_TIMER:
        case SIL_POLL:
        case SIL_CHLD:
        case SIL_RT:
        case SIL_SYS:
                break;
        }
}

/**
 * get_signal - fetch the next signal the current task has to act on
 * @ksig: receives the dequeued siginfo in @ksig->info, a copy of the
 *        handler's action in @ksig->ka when a handler has to be run, and
 *        the signal number in @ksig->sig
 *
 * Clears the notify-signal state and runs pending task work first.  If a
 * signal is pending (and uprobe_deny_signal() does not veto it), the
 * function then loops under siglock: it handles pending job-control stops
 * and traps, dequeues a signal (synchronous ones first), gives the tracer
 * a chance to intercept it, and applies the SIG_IGN and default
 * dispositions itself.  The loop ends either when nothing is left to
 * dequeue or when a signal with a user handler was found.  Signals whose
 * default action is fatal end in do_group_exit(), except for
 * PF_USER_WORKER tasks, which get control back.
 *
 * Return: true if the final signal number is greater than zero, false
 * otherwise.  On the PF_USER_WORKER fatal path the return is true but
 * @ksig is not set up.
 */
bool get_signal(struct ksignal *ksig)
{
        struct sighand_struct *sighand = current->sighand;
        struct signal_struct *signal = current->signal;
        int signr;

        clear_notify_signal();
        if (unlikely(task_work_pending(current)))
                task_work_run();

        if (!task_sigpending(current))
                return false;

        if (unlikely(uprobe_deny_signal()))
                return false;

        /*
         * Do this once, we can't return to user-mode if freezing() == T.
         * do_signal_stop() and ptrace_stop() do freezable_schedule() and
         * thus do not need another check after return.
         */
        try_to_freeze();

relock:
        spin_lock_irq(&sighand->siglock);

        /*
         * Every stopped thread goes here after wakeup. Check to see if
         * we should notify the parent, prepare_signal(SIGCONT) encodes
         * the CLD_ si_code into SIGNAL_CLD_MASK bits.
         */
        if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
                int why;

                if (signal->flags & SIGNAL_CLD_CONTINUED)
                        why = CLD_CONTINUED;
                else
                        why = CLD_STOPPED;

                signal->flags &= ~SIGNAL_CLD_MASK;

                spin_unlock_irq(&sighand->siglock);

                /*
                 * Notify the parent that we're continuing.  This event is
                 * always per-process and doesn't make whole lot of sense
                 * for ptracers, who shouldn't consume the state via
                 * wait(2) either, but, for backward compatibility, notify
                 * the ptracer of the group leader too unless it's gonna be
                 * a duplicate.
                 */
                read_lock(&tasklist_lock);
                do_notify_parent_cldstop(current, false, why);

                if (ptrace_reparented(current->group_leader))
                        do_notify_parent_cldstop(current->group_leader,
                                                true, why);
                read_unlock(&tasklist_lock);

                goto relock;
        }

        for (;;) {
                struct k_sigaction *ka;
                enum pid_type type;

                /* Has this task already been marked for death? */
                if ((signal->flags & SIGNAL_GROUP_EXIT) ||
                     signal->group_exec_task) {
                        signr = SIGKILL;
                        sigdelset(&current->pending.signal, SIGKILL);
                        trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
                                             &sighand->action[SIGKILL-1]);
                        recalc_sigpending();
                        /*
                         * implies do_group_exit() or return to PF_USER_WORKER,
                         * no need to initialize ksig->info/etc.
                         */
                        goto fatal;
                }

                if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
                    do_signal_stop(0))
                        goto relock;

                if (unlikely(current->jobctl &
                             (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) {
                        if (current->jobctl & JOBCTL_TRAP_MASK) {
                                do_jobctl_trap();
                                spin_unlock_irq(&sighand->siglock);
                        } else if (current->jobctl & JOBCTL_TRAP_FREEZE)
                                do_freezer_trap();

                        goto relock;
                }

                /*
                 * If the task is leaving the frozen state, let's update
                 * cgroup counters and reset the frozen bit.
                 */
                if (unlikely(cgroup_task_frozen(current))) {
                        spin_unlock_irq(&sighand->siglock);
                        cgroup_leave_frozen(false);
                        goto relock;
                }

                /*
                 * Signals generated by the execution of an instruction
                 * need to be delivered before any other pending signals
                 * so that the instruction pointer in the signal stack
                 * frame points to the faulting instruction.
                 */
                type = PIDTYPE_PID;
                signr = dequeue_synchronous_signal(&ksig->info);
                if (!signr)
                        signr = dequeue_signal(current, &current->blocked,
                                               &ksig->info, &type);

                if (!signr)
                        break; /* will return 0 */

                if (unlikely(current->ptrace) && (signr != SIGKILL) &&
                    !(sighand->action[signr -1].sa.sa_flags & SA_IMMUTABLE)) {
                        signr = ptrace_signal(signr, &ksig->info, type);
                        if (!signr)
                                continue;
                }

                ka = &sighand->action[signr-1];

                /* Trace actually delivered signals. */
                trace_signal_deliver(signr, &ksig->info, ka);

                if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
                        continue;
                if (ka->sa.sa_handler != SIG_DFL) {
                        /* Run the handler.  */
                        ksig->ka = *ka;

                        if (ka->sa.sa_flags & SA_ONESHOT)
                                ka->sa.sa_handler = SIG_DFL;

                        break; /* will return non-zero "signr" value */
                }

                /*
                 * Now we are doing the default action for this signal.
                 */
                if (sig_kernel_ignore(signr)) /* Default is nothing. */
                        continue;

                /*
                 * Global init gets no signals it doesn't want.
                 * Container-init gets no signals it doesn't want from same
                 * container.
                 *
                 * Note that if global/container-init sees a sig_kernel_only()
                 * signal here, the signal must have been generated internally
                 * or must have come from an ancestor namespace. In either
                 * case, the signal cannot be dropped.
                 */
                if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
                                !sig_kernel_only(signr))
                        continue;

                if (sig_kernel_stop(signr)) {
                        /*
                         * The default action is to stop all threads in
                         * the thread group.  The job control signals
                         * do nothing in an orphaned pgrp, but SIGSTOP
                         * always works.  Note that siglock needs to be
                         * dropped during the call to is_orphaned_pgrp()
                         * because of lock ordering with tasklist_lock.
                         * This allows an intervening SIGCONT to be posted.
                         * We need to check for that and bail out if necessary.
                         */
                        if (signr != SIGSTOP) {
                                spin_unlock_irq(&sighand->siglock);

                                /* signals can be posted during this window */

                                if (is_current_pgrp_orphaned())
                                        goto relock;

                                spin_lock_irq(&sighand->siglock);
                        }

                        if (likely(do_signal_stop(signr))) {
                                /* It released the siglock.  */
                                goto relock;
                        }

                        /*
                         * We didn't actually stop, due to a race
                         * with SIGCONT or something like that.
                         */
                        continue;
                }

        fatal:
                spin_unlock_irq(&sighand->siglock);
                if (unlikely(cgroup_task_frozen(current)))
                        cgroup_leave_frozen(true);

                /*
                 * Anything else is fatal, maybe with a core dump.
                 */
                current->flags |= PF_SIGNALED;

                if (sig_kernel_coredump(signr)) {
                        if (print_fatal_signals)
                                print_fatal_signal(signr);
                        proc_coredump_connector(current);
                        /*
                         * If it was able to dump core, this kills all
                         * other threads in the group and synchronizes with
                         * their demise.  If we lost the race with another
                         * thread getting here, it set group_exit_code
                         * first and our do_group_exit call below will use
                         * that value and ignore the one we pass it.
                         */
                        do_coredump(&ksig->info);
                }

                /*
                 * PF_USER_WORKER threads will catch and exit on fatal signals
                 * themselves. They have cleanup that must be performed, so we
                 * cannot call do_exit() on their behalf. Note that ksig won't
                 * be properly initialized, PF_USER_WORKER's shouldn't use it.
                 */
                if (current->flags & PF_USER_WORKER)
                        goto out;

                /*
                 * Death signals, no core dump.
                 */
                do_group_exit(signr);
                /* NOTREACHED */
        }
        spin_unlock_irq(&sighand->siglock);

        ksig->sig = signr;

        if (signr && !(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS))
                hide_si_addr_tag_bits(ksig);
        /*
         * The PF_USER_WORKER fatal path jumps straight here with siglock
         * already released, skipping the ksig->sig store and the tag-bit
         * hiding above.
         */
out:
        return signr > 0;
}

/**
 * signal_delivered - called after signal delivery to update blocked signals
 * @ksig:                kernel signal struct
 * @stepping:                nonzero if debugger single-step or block-step in use
 *
 * This function should be called when a signal has successfully been
 * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask
 * is always blocked), and the signal itself is blocked unless %SA_NODEFER
 * is set in @ksig->ka.sa.sa_flags.  Tracing is notified.
 */
static void signal_delivered(struct ksignal *ksig, int stepping)
{
        sigset_t blocked;

        /* A signal was successfully delivered, and the
           saved sigmask was stored on the signal frame,
           and will be restored by sigreturn.  So we can
           simply clear the restore sigmask flag.  */
        clear_restore_sigmask();

        sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask);
        if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
                sigaddset(&blocked, ksig->sig);
        set_current_blocked(&blocked);
        if (current->sas_ss_flags & SS_AUTODISARM)
                sas_ss_reset(current);
        if (stepping)
                ptrace_notify(SIGTRAP, 0);
}

/*
 * Finish a signal-frame setup attempt: on success (@failed == 0) complete
 * the delivery bookkeeping via signal_delivered(); otherwise force a
 * SIGSEGV for the signal that could not be set up.
 */
void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
{
        if (!failed) {
                signal_delivered(ksig, stepping);
                return;
        }

        force_sigsegv(ksig->sig);
}

/*
 * @tsk may have been the thread complete_signal() chose to notify about a
 * group-wide signal.  Since @tsk is not going to take the shared pending
 * signals in @which, wake up other threads that are able to.
 */
static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
{
        struct task_struct *other;
        sigset_t todo;

        /* Only shared-pending signals that are also in @which matter. */
        sigandsets(&todo, &tsk->signal->shared_pending.signal, which);
        if (sigisemptyset(&todo))
                return;

        for_other_threads(tsk, other) {
                /* Skip exiting threads and threads that block all of it. */
                if ((other->flags & PF_EXITING) ||
                    !has_pending_signals(&todo, &other->blocked))
                        continue;

                /* Keep only what @other blocks; it can take the rest. */
                sigandsets(&todo, &todo, &other->blocked);

                if (!task_sigpending(other))
                        signal_wake_up(other, 0);

                if (sigisemptyset(&todo))
                        break;
        }
}

/*
 * Mark @tsk as PF_EXITING and, when it is part of a live multi-threaded
 * group, hand its share of pending group-wide signals to the other threads
 * and account for a pending group stop.
 */
void exit_signals(struct task_struct *tsk)
{
        sigset_t not_blocked;
        int stop_why = 0;

        /*
         * @tsk is about to have PF_EXITING set - lock out users which
         * expect stable threadgroup.
         */
        cgroup_threadgroup_change_begin(tsk);

        /* Single thread or group already exiting: nothing to retarget. */
        if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
                sched_mm_cid_exit_signals(tsk);
                tsk->flags |= PF_EXITING;
                cgroup_threadgroup_change_end(tsk);
                return;
        }

        spin_lock_irq(&tsk->sighand->siglock);
        /*
         * From now this task is not visible for group-wide signals,
         * see wants_signal(), do_signal_stop().
         */
        sched_mm_cid_exit_signals(tsk);
        tsk->flags |= PF_EXITING;

        cgroup_threadgroup_change_end(tsk);

        if (task_sigpending(tsk)) {
                /* Everything @tsk did not block is up for retargeting. */
                not_blocked = tsk->blocked;
                signotset(&not_blocked);
                retarget_shared_pending(tsk, &not_blocked);

                if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
                    task_participate_group_stop(tsk))
                        stop_why = CLD_STOPPED;
        }
        spin_unlock_irq(&tsk->sighand->siglock);

        /*
         * If group stop has completed, deliver the notification.  This
         * should always go to the real parent of the group leader.
         */
        if (likely(!stop_why))
                return;

        read_lock(&tasklist_lock);
        do_notify_parent_cldstop(tsk, false, stop_why);
        read_unlock(&tasklist_lock);
}

/*
 * System call entry points.
 */

/**
 *  sys_restart_syscall - restart a system call
 *
 *  Invokes the ->fn callback stored in the current task's restart_block,
 *  passing it that same restart_block, and returns its result.
 */
SYSCALL_DEFINE0(restart_syscall)
{
        struct restart_block *rb = &current->restart_block;

        return rb->fn(rb);
}

/*
 * Restart callback that never restarts anything: it ignores @param and
 * always reports -EINTR.
 */
long do_no_restart_syscall(struct restart_block *param)
{
        return -EINTR;
}

/*
 * Install @newset as @tsk's blocked mask.
 *
 * If @tsk has a signal pending and is not the only thread in its group,
 * the signals that are blocked in @newset but were not blocked before are
 * first offered to the other threads via retarget_shared_pending().
 *
 * The visible caller, __set_current_blocked(), holds @tsk's siglock around
 * this call and passes current.
 */
static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
{
        if (task_sigpending(tsk) && !thread_group_empty(tsk)) {
                sigset_t newblocked;
                /*
                 * A set of now blocked but previously unblocked signals.
                 * Derive it from @tsk's own mask rather than current's so
                 * the helper consistently acts on the task it was given.
                 */
                sigandnsets(&newblocked, newset, &tsk->blocked);
                retarget_shared_pending(tsk, &newblocked);
        }
        tsk->blocked = *newset;
        /*
         * NOTE(review): recalc_sigpending() takes no task argument, so it
         * presumably acts on current - callers are expected to pass
         * tsk == current; confirm before adding other callers.
         */
        recalc_sigpending();
}

/**
 * set_current_blocked - change current->blocked mask
 * @newset: new mask; SIGKILL and SIGSTOP are removed from it in place
 *
 * ->blocked must not be changed directly.  Going through this helper makes
 * sure the process can't miss a shared signal we are about to block.
 */
void set_current_blocked(sigset_t *newset)
{
        /* These two can never be blocked, whatever the caller asked for. */
        sigdelsetmask(newset, sigmask(SIGSTOP) | sigmask(SIGKILL));

        __set_current_blocked(newset);
}

/*
 * Install @newset as the current task's blocked mask under siglock.
 * Unlike set_current_blocked() this does not strip SIGKILL/SIGSTOP.
 */
void __set_current_blocked(const sigset_t *newset)
{
        struct task_struct *me = current;

        /*
         * Only take the lock when the mask actually changes.  No other
         * task is supposed to modify current->blocked, so an unchanged
         * mask means there is nothing to do.
         */
        if (!sigequalsets(&me->blocked, newset)) {
                spin_lock_irq(&me->sighand->siglock);
                __set_task_blocked(me, newset);
                spin_unlock_irq(&me->sighand->siglock);
        }
}

/*
 * In-kernel sigprocmask: combine @set with the current blocked mask
 * according to @how (SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK) and install
 * the result.  Also handy for kernel threads that want to block certain
 * signals temporarily or permanently.
 *
 * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel
 * interface happily blocks "unblockable" signals like SIGKILL
 * and friends.
 *
 * When @oldset is non-NULL it receives the previous mask, even if @how
 * then turns out to be invalid.  Returns 0 on success or -EINVAL for an
 * unknown @how.
 */
int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
{
        struct task_struct *me = current;
        sigset_t result;

        /* Lockless, only current can change ->blocked, never from irq */
        if (oldset != NULL)
                *oldset = me->blocked;

        if (how == SIG_BLOCK)
                sigorsets(&result, &me->blocked, set);
        else if (how == SIG_UNBLOCK)
                sigandnsets(&result, &me->blocked, set);
        else if (how == SIG_SETMASK)
                result = *set;
        else
                return -EINVAL;

        __set_current_blocked(&result);
        return 0;
}
EXPORT_SYMBOL(sigprocmask);

/*
 * Helper for installing a signal mask supplied by the application.
 *
 * Intended for syscalls such as ppoll, pselect, io_pgetevents and
 * epoll_pwait, which take a new sigmask from userland.
 *
 * set_restore_sigmask() is called up front, so every use must be paired
 * with restore_saved_sigmask_unless() before the syscall returns.
 *
 * Returns 0 on success (including a NULL @umask, which is a no-op),
 * -EINVAL if @sigsetsize is not sizeof(sigset_t), or -EFAULT if the mask
 * cannot be copied in.
 */
int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
{
        sigset_t requested;

        if (umask == NULL)
                return 0;

        if (sigsetsize != sizeof(requested))
                return -EINVAL;

        if (copy_from_user(&requested, umask, sizeof(requested)))
                return -EFAULT;

        /* Remember the old mask before switching to the requested one. */
        set_restore_sigmask();
        current->saved_sigmask = current->blocked;
        set_current_blocked(&requested);

        return 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat flavour of set_user_sigmask(): the mask comes in as a
 * compat_sigset_t and is converted with get_compat_sigset().
 *
 * Returns 0 on success (a NULL @umask is a no-op), -EINVAL if @sigsetsize
 * is not sizeof(compat_sigset_t), or -EFAULT if the mask cannot be read.
 */
int set_compat_user_sigmask(const compat_sigset_t __user *umask,
                            size_t sigsetsize)
{
        sigset_t requested;

        if (umask == NULL)
                return 0;

        if (sigsetsize != sizeof(compat_sigset_t))
                return -EINVAL;

        if (get_compat_sigset(&requested, umask))
                return -EFAULT;

        /* Remember the old mask before switching to the requested one. */
        set_restore_sigmask();
        current->saved_sigmask = current->blocked;
        set_current_blocked(&requested);

        return 0;
}
#endif

/**
 *  sys_rt_sigprocmask - change the list of currently blocked signals
 *  @how: whether to add, remove, or set signals
 *  @nset: new signal set to apply according to @how, may be null
 *  @oset: previous value of signal mask if non-null
 *  @sigsetsize: size of sigset_t type
 *
 *  SIGKILL and SIGSTOP are silently dropped from @nset.  The value stored
 *  through @oset is the mask as it was before @nset was applied.
 */
SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
                sigset_t __user *, oset, size_t, sigsetsize)
{
        sigset_t prev_mask, req_mask;
        int ret;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        /* Snapshot the mask before it is possibly changed below. */
        prev_mask = current->blocked;

        if (nset) {
                if (copy_from_user(&req_mask, nset, sizeof(req_mask)))
                        return -EFAULT;
                sigdelsetmask(&req_mask, sigmask(SIGKILL)|sigmask(SIGSTOP));

                ret = sigprocmask(how, &req_mask, NULL);
                if (ret)
                        return ret;
        }

        if (oset && copy_to_user(oset, &prev_mask, sizeof(prev_mask)))
                return -EFAULT;

        return 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat entry point for rt_sigprocmask.  Same behaviour as the native
 * syscall, with the user masks converted through get_compat_sigset() and
 * put_compat_sigset().
 */
COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
                compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
{
        /* Snapshot of the mask before any change is applied. */
        sigset_t prev_mask = current->blocked;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (nset) {
                sigset_t req_mask;
                int ret;

                if (get_compat_sigset(&req_mask, nset))
                        return -EFAULT;
                sigdelsetmask(&req_mask, sigmask(SIGKILL)|sigmask(SIGSTOP));

                ret = sigprocmask(how, &req_mask, NULL);
                if (ret)
                        return ret;
        }

        if (!oset)
                return 0;

        return put_compat_sigset(oset, &prev_mask, sizeof(*oset));
}
#endif

/*
 * Store in @set the signals that are pending for the current task, either
 * privately or shared with its group, and that it currently blocks.
 */
static void do_sigpending(sigset_t *set)
{
        struct task_struct *me = current;

        /* Both pending sets are read together under siglock. */
        spin_lock_irq(&me->sighand->siglock);
        sigorsets(set, &me->pending.signal,
                  &me->signal->shared_pending.signal);
        spin_unlock_irq(&me->sighand->siglock);

        /* Outside the lock because only this thread touches it.  */
        sigandsets(set, &me->blocked, set);
}

/**
 *  sys_rt_sigpending - examine a pending signal that has been raised
 *                        while blocked
 *  @uset: stores pending signals
 *  @sigsetsize: number of bytes to store; must not exceed the size of
 *               sigset_t
 *
 *  Returns 0 on success, -EINVAL if @sigsetsize is too large, or -EFAULT
 *  if @uset cannot be written.
 */
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
{
        sigset_t pending;

        if (sigsetsize > sizeof(*uset))
                return -EINVAL;

        do_sigpending(&pending);

        /* Only the first @sigsetsize bytes are handed back. */
        return copy_to_user(uset, &pending, sigsetsize) ? -EFAULT : 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat entry point for rt_sigpending.  @sigsetsize must not exceed
 * sizeof(compat_sigset_t); the result is written via put_compat_sigset().
 */
COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
                compat_size_t, sigsetsize)
{
        sigset_t pending;

        if (sigsetsize > sizeof(*uset))
                return -EINVAL;

        do_sigpending(&pending);

        return put_compat_sigset(uset, &pending, sigsetsize);
}
#endif

/*
 * Per-signal si_code table, indexed by signal number.
 *
 * @limit:  highest signal-specific si_code recognised for the signal;
 *          users of the table compare with "si_code <= limit".
 * @layout: the enum siginfo_layout value siginfo_layout() starts from for
 *          such an si_code (it then applies a few per-code exceptions).
 *
 * Signals without an entry are zero-initialised, i.e. have a limit of 0.
 */
static const struct {
        unsigned char limit, layout;
} sig_sicodes[] = {
        [SIGILL]  = { NSIGILL,  SIL_FAULT },
        [SIGFPE]  = { NSIGFPE,  SIL_FAULT },
        [SIGSEGV] = { NSIGSEGV, SIL_FAULT },
        [SIGBUS]  = { NSIGBUS,  SIL_FAULT },
        [SIGTRAP] = { NSIGTRAP, SIL_FAULT },
#if defined(SIGEMT)
        [SIGEMT]  = { NSIGEMT,  SIL_FAULT },
#endif
        [SIGCHLD] = { NSIGCHLD, SIL_CHLD },
        [SIGPOLL] = { NSIGPOLL, SIL_POLL },
        [SIGSYS]  = { NSIGSYS,  SIL_SYS },
};

/*
 * Tell whether the (@sig, @si_code) pair is one whose siginfo layout is
 * known:
 *  - SI_KERNEL is always known;
 *  - codes above SI_USER are known up to the per-signal limit for signals
 *    with specific si_codes, and up to NSIGPOLL for all other signals;
 *  - remaining codes are known if they are at least SI_DETHREAD or equal
 *    to SI_ASYNCNL.
 */
static bool known_siginfo_layout(unsigned sig, int si_code)
{
        if (si_code == SI_KERNEL)
                return true;

        if (si_code > SI_USER) {
                if (sig_specific_sicodes(sig))
                        return si_code <= sig_sicodes[sig].limit;

                return si_code <= NSIGPOLL;
        }

        return (si_code >= SI_DETHREAD) || (si_code == SI_ASYNCNL);
}

/*
 * Map a (@sig, @si_code) pair to the siginfo layout that describes which
 * union members of the siginfo are meaningful.  SIL_KILL is the fallback
 * when nothing more specific applies.
 */
enum siginfo_layout siginfo_layout(unsigned sig, int si_code)
{
        /* Codes that are not strictly between SI_USER and SI_KERNEL. */
        if ((si_code <= SI_USER) || (si_code >= SI_KERNEL)) {
                if (si_code == SI_TIMER)
                        return SIL_TIMER;
                if (si_code == SI_SIGIO)
                        return SIL_POLL;
                if (si_code < 0)
                        return SIL_RT;
                return SIL_KILL;
        }

        /* No table entry covering this code: generic handling. */
        if ((sig >= ARRAY_SIZE(sig_sicodes)) ||
            (si_code > sig_sicodes[sig].limit))
                return (si_code <= NSIGPOLL) ? SIL_POLL : SIL_KILL;

        /* Handle the exceptions to the per-signal table layout. */
        if ((sig == SIGBUS) &&
            (si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO))
                return SIL_FAULT_MCEERR;
        if ((sig == SIGSEGV) && (si_code == SEGV_BNDERR))
                return SIL_FAULT_BNDERR;
#ifdef SEGV_PKUERR
        if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR))
                return SIL_FAULT_PKUERR;
#endif
        if ((sig == SIGTRAP) && (si_code == TRAP_PERF))
                return SIL_FAULT_PERF_EVENT;
        if (IS_ENABLED(CONFIG_SPARC) &&
            (sig == SIGILL) && (si_code == ILL_ILLTRP))
                return SIL_FAULT_TRAPNO;
        if (IS_ENABLED(CONFIG_ALPHA) &&
            ((sig == SIGFPE) ||
             ((sig == SIGTRAP) && (si_code == TRAP_UNK))))
                return SIL_FAULT_TRAPNO;

        /* The common case: whatever the table says for this signal. */
        return sig_sicodes[sig].layout;
}

/*
 * Return the user address just past the first sizeof(struct kernel_siginfo)
 * bytes of the user siginfo at @info.
 */
static inline char __user *si_expansion(const siginfo_t __user *info)
{
        char __user *base = (char __user *)info;

        return base + sizeof(struct kernel_siginfo);
}

/*
 * Copy the kernel siginfo @from to the user siginfo @to and zero the
 * SI_EXPANSION_SIZE bytes that follow it in the user structure.
 * Returns 0 on success or -EFAULT if either user access fails.
 */
int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from)
{
        char __user *tail = si_expansion(to);

        if (copy_to_user(to, from, sizeof(struct kernel_siginfo)) ||
            clear_user(tail, SI_EXPANSION_SIZE))
                return -EFAULT;

        return 0;
}

/*
 * Validate a siginfo that has just been copied in from @from into @info.
 *
 * For a known layout nothing more is needed.  An unknown si_code might
 * need more than sizeof(struct kernel_siginfo) bytes, so in that case all
 * of the extra user bytes must be 0.  This guarantees copy_siginfo_to_user
 * will return this data to userspace exactly.
 *
 * Returns 0 if acceptable, -EFAULT if the extra bytes cannot be read, or
 * -E2BIG if any of them is nonzero.
 */
static int post_copy_siginfo_from_user(kernel_siginfo_t *info,
                                       const siginfo_t __user *from)
{
        char tail[SI_EXPANSION_SIZE];
        int idx;

        if (likely(known_siginfo_layout(info->si_signo, info->si_code)))
                return 0;

        if (copy_from_user(tail, si_expansion(from), SI_EXPANSION_SIZE))
                return -EFAULT;

        for (idx = 0; idx < SI_EXPANSION_SIZE; idx++) {
                if (tail[idx])
                        return -E2BIG;
        }

        return 0;
}

/*
 * Copy a siginfo in from user space, overriding its si_signo with @signo
 * before the copied data is validated.  Returns 0 or a negative errno.
 */
static int __copy_siginfo_from_user(int signo, kernel_siginfo_t *to,
                                    const siginfo_t __user *from)
{
        if (copy_from_user(to, from, sizeof(struct kernel_siginfo)) != 0)
                return -EFAULT;

        /* The caller-supplied signal number wins over the user's value. */
        to->si_signo = signo;

        return post_copy_siginfo_from_user(to, from);
}

/*
 * Copy a siginfo in from user space as is and validate it.
 * Returns 0 on success, -EFAULT if the copy fails, or whatever
 * post_copy_siginfo_from_user() reports.
 */
int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from)
{
        if (copy_from_user(to, from, sizeof(struct kernel_siginfo)) != 0)
                return -EFAULT;

        return post_copy_siginfo_from_user(to, from);
}

#ifdef CONFIG_COMPAT
/**
 * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo
 * @to: compat siginfo destination
 * @from: kernel siginfo source
 *
 * Note: This function does not work properly for the SIGCHLD on x32, but
 * fortunately it doesn't have to.  The only valid callers for this function are
 * copy_siginfo_to_user32, which is overriden for x32 and the coredump code.
 * The latter does not care because SIGCHLD will never cause a coredump.
 */
void copy_siginfo_to_external32(struct compat_siginfo *to,
                const struct kernel_siginfo *from)
{
        memset(to, 0, sizeof(*to));

        to->si_signo = from->si_signo;
        to->si_errno = from->si_errno;
        to->si_code  = from->si_code;
        switch(siginfo_layout(from->si_signo, from->si_code)) {
        case SIL_KILL:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                break;
        case SIL_TIMER:
                to->si_tid     = from->si_tid;
                to->si_overrun = from->si_overrun;
                to->si_int     = from->si_int;
                break;
        case SIL_POLL:
                to->si_band = from->si_band;
                to->si_fd   = from->si_fd;
                break;
        case SIL_FAULT:
                to->si_addr = ptr_to_compat(from->si_addr);
                break;
        case SIL_FAULT_TRAPNO:
                to->si_addr = ptr_to_compat(from->si_addr);
                to->si_trapno = from->si_trapno;
                break;
        case SIL_FAULT_MCEERR:
                to->si_addr = ptr_to_compat(from->si_addr);
                to->si_addr_lsb = from->si_addr_lsb;
                break;
        case SIL_FAULT_BNDERR:
                to->si_addr = ptr_to_compat(from->si_addr);
                to->si_lower = ptr_to_compat(from->si_lower);
                to->si_upper = ptr_to_compat(from->si_upper);
                break;
        case SIL_FAULT_PKUERR:
                to->si_addr = ptr_to_compat(from->si_addr);
                to->si_pkey = from->si_pkey;
                break;
        case SIL_FAULT_PERF_EVENT:
                to->si_addr = ptr_to_compat(from->si_addr);
                to->si_perf_data = from->si_perf_data;
                to->si_perf_type = from->si_perf_type;
                to->si_perf_flags = from->si_perf_flags;
                break;
        case SIL_CHLD:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                to->si_status = from->si_status;
                to->si_utime = from->si_utime;
                to->si_stime = from->si_stime;
                break;
        case SIL_RT:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                to->si_int = from->si_int;
                break;
        case SIL_SYS:
                to->si_call_addr = ptr_to_compat(from->si_call_addr);
                to->si_syscall   = from->si_syscall;
                to->si_arch      = from->si_arch;
                break;
        }
}

/*
 * Convert @from to the compat layout in a local buffer and copy that
 * buffer out to the user pointer @to.  Returns 0 or -EFAULT.
 */
int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
                           const struct kernel_siginfo *from)
{
        struct compat_siginfo ext;

        copy_siginfo_to_external32(&ext, from);

        return copy_to_user(to, &ext, sizeof(ext)) ? -EFAULT : 0;
}

/*
 * Convert the compat siginfo @from (already copied into kernel memory)
 * into the kernel siginfo @to.  @to is cleared first; then the three
 * common header fields and the fields belonging to the layout reported by
 * siginfo_layout() are filled in.  Always returns 0.
 */
static int post_copy_siginfo_from_user32(kernel_siginfo_t *to,
                                         const struct compat_siginfo *from)
{
        clear_siginfo(to);

        /* Header fields shared by every layout. */
        to->si_signo = from->si_signo;
        to->si_errno = from->si_errno;
        to->si_code  = from->si_code;

        switch (siginfo_layout(from->si_signo, from->si_code)) {
        /* Fault layouts: an address plus optional extra data. */
        case SIL_FAULT:
                to->si_addr = compat_ptr(from->si_addr);
                break;
        case SIL_FAULT_TRAPNO:
                to->si_addr = compat_ptr(from->si_addr);
                to->si_trapno = from->si_trapno;
                break;
        case SIL_FAULT_MCEERR:
                to->si_addr = compat_ptr(from->si_addr);
                to->si_addr_lsb = from->si_addr_lsb;
                break;
        case SIL_FAULT_BNDERR:
                to->si_addr = compat_ptr(from->si_addr);
                to->si_lower = compat_ptr(from->si_lower);
                to->si_upper = compat_ptr(from->si_upper);
                break;
        case SIL_FAULT_PKUERR:
                to->si_addr = compat_ptr(from->si_addr);
                to->si_pkey = from->si_pkey;
                break;
        case SIL_FAULT_PERF_EVENT:
                to->si_addr = compat_ptr(from->si_addr);
                to->si_perf_data = from->si_perf_data;
                to->si_perf_type = from->si_perf_type;
                to->si_perf_flags = from->si_perf_flags;
                break;

        /* Layouts that identify a sending or child process. */
        case SIL_KILL:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                break;
        case SIL_RT:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                to->si_int = from->si_int;
                break;
        case SIL_CHLD:
                to->si_pid    = from->si_pid;
                to->si_uid    = from->si_uid;
                to->si_status = from->si_status;
                /* In an x32 syscall the times come from the x32 members. */
#ifdef CONFIG_X86_X32_ABI
                if (in_x32_syscall()) {
                        to->si_utime = from->_sifields._sigchld_x32._utime;
                        to->si_stime = from->_sifields._sigchld_x32._stime;
                } else
#endif
                {
                        to->si_utime = from->si_utime;
                        to->si_stime = from->si_stime;
                }
                break;

        /* Remaining layouts. */
        case SIL_TIMER:
                to->si_tid     = from->si_tid;
                to->si_overrun = from->si_overrun;
                to->si_int     = from->si_int;
                break;
        case SIL_POLL:
                to->si_band = from->si_band;
                to->si_fd   = from->si_fd;
                break;
        case SIL_SYS:
                to->si_call_addr = compat_ptr(from->si_call_addr);
                to->si_syscall   = from->si_syscall;
                to->si_arch      = from->si_arch;
                break;
        }

        return 0;
}

/*
 * Read a compat siginfo from @ufrom, overwrite its signal number with
 * @signo and convert the result into @to.
 *
 * Returns -EFAULT if the user copy fails, otherwise the result of
 * post_copy_siginfo_from_user32().
 */
static int __copy_siginfo_from_user32(int signo, struct kernel_siginfo *to,
                                      const struct compat_siginfo __user *ufrom)
{
        struct compat_siginfo tmp;
        unsigned long uncopied;

        uncopied = copy_from_user(&tmp, ufrom, sizeof(tmp));
        if (uncopied)
                return -EFAULT;

        /* The caller's signal number replaces whatever userspace stored. */
        tmp.si_signo = signo;

        return post_copy_siginfo_from_user32(to, &tmp);
}

/*
 * Read a compat siginfo from @ufrom and convert it into @to, keeping the
 * signal number that userspace supplied.
 *
 * Returns -EFAULT if the user copy fails, otherwise the result of
 * post_copy_siginfo_from_user32().
 */
int copy_siginfo_from_user32(struct kernel_siginfo *to,
                             const struct compat_siginfo __user *ufrom)
{
        struct compat_siginfo tmp;
        unsigned long uncopied;

        uncopied = copy_from_user(&tmp, ufrom, sizeof(tmp));
        if (uncopied)
                return -EFAULT;

        return post_copy_siginfo_from_user32(to, &tmp);
}
#endif /* CONFIG_COMPAT */

/**
 *  do_sigtimedwait - wait for queued signals specified in @which
 *  @which: queued signals to wait for
 *  @info: if non-null, the signal's siginfo is returned here
 *  @ts: upper bound on process time suspension; NULL means no upper bound
 *       is passed to the sleep, a zero timeout means "poll, do not sleep"
 *
 *  Return: the number of the dequeued signal on success; -EINVAL if @ts
 *  fails timespec64_valid(); if no signal was dequeued, -EINTR when
 *  schedule_hrtimeout_range() returned non-zero and -EAGAIN otherwise
 *  (which includes the zero-timeout case where no sleep happens).
 */
static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info,
                    const struct timespec64 *ts)
{
        ktime_t *to = NULL, timeout = KTIME_MAX;
        struct task_struct *tsk = current;
        sigset_t mask = *which;
        enum pid_type type;
        int sig, ret = 0;

        if (ts) {
                if (!timespec64_valid(ts))
                        return -EINVAL;
                timeout = timespec64_to_ktime(*ts);
                to = &timeout;
        }

        /*
         * Invert the set of allowed signals to get those we want to block.
         * SIGKILL and SIGSTOP are removed from the wanted set first.
         */
        sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
        signotset(&mask);

        spin_lock_irq(&tsk->sighand->siglock);
        sig = dequeue_signal(tsk, &mask, info, &type);
        if (!sig && timeout) {
                /*
                 * None ready. Temporarily unblock the signals we are
                 * interested in while we sleep, so that we are woken up
                 * when one of them arrives. Unblocking is always fine, so
                 * we can avoid set_current_blocked().
                 */
                tsk->real_blocked = tsk->blocked;
                sigandsets(&tsk->blocked, &tsk->blocked, &mask);
                recalc_sigpending();
                spin_unlock_irq(&tsk->sighand->siglock);

                __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
                ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns,
                                               HRTIMER_MODE_REL);
                /* Restore the saved blocked mask and retry the dequeue. */
                spin_lock_irq(&tsk->sighand->siglock);
                __set_task_blocked(tsk, &tsk->real_blocked);
                sigemptyset(&tsk->real_blocked);
                sig = dequeue_signal(tsk, &mask, info, &type);
        }
        spin_unlock_irq(&tsk->sighand->siglock);

        if (sig)
                return sig;
        return ret ? -EINTR : -EAGAIN;
}

/**
 *  sys_rt_sigtimedwait - synchronously wait for queued signals specified
 *                        in @uthese
 *  @uthese: queued signals to wait for
 *  @uinfo: if non-null, the signal's siginfo is returned here
 *  @uts: upper bound on process time suspension, or NULL for none
 *  @sigsetsize: size of sigset_t type
 *
 *  Return: the result of do_sigtimedwait(), -EINVAL for a wrong
 *  @sigsetsize, or -EFAULT if a user copy fails.
 */
SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
                siginfo_t __user *, uinfo,
                const struct __kernel_timespec __user *, uts,
                size_t, sigsetsize)
{
        const struct timespec64 *limit = NULL;
        struct timespec64 timeout;
        kernel_siginfo_t kinfo;
        sigset_t wanted;
        int sig;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (copy_from_user(&wanted, uthese, sizeof(wanted)))
                return -EFAULT;

        if (uts) {
                if (get_timespec64(&timeout, uts))
                        return -EFAULT;
                limit = &timeout;
        }

        sig = do_sigtimedwait(&wanted, &kinfo, limit);
        if (sig <= 0 || !uinfo)
                return sig;

        /* A signal was dequeued and the caller wants its siginfo. */
        if (copy_siginfo_to_user(uinfo, &kinfo))
                return -EFAULT;

        return sig;
}

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * Variant of rt_sigtimedwait() whose timeout is passed as a
 * struct old_timespec32.
 */
SYSCALL_DEFINE4(rt_sigtimedwait_time32, const sigset_t __user *, uthese,
                siginfo_t __user *, uinfo,
                const struct old_timespec32 __user *, uts,
                size_t, sigsetsize)
{
        const struct timespec64 *limit = NULL;
        struct timespec64 timeout;
        kernel_siginfo_t kinfo;
        sigset_t wanted;
        int sig;

        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (copy_from_user(&wanted, uthese, sizeof(wanted)))
                return -EFAULT;

        if (uts) {
                if (get_old_timespec32(&timeout, uts))
                        return -EFAULT;
                limit = &timeout;
        }

        sig = do_sigtimedwait(&wanted, &kinfo, limit);
        if (sig <= 0 || !uinfo)
                return sig;

        /* A signal was dequeued and the caller wants its siginfo. */
        if (copy_siginfo_to_user(uinfo, &kinfo))
                return -EFAULT;

        return sig;
}
#endif

#ifdef CONFIG_COMPAT
/*
 * Compat entry point for rt_sigtimedwait() with a 64-bit timespec: the
 * signal set is read with get_compat_sigset() and the siginfo is written
 * back in compat layout.
 */
COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese,
                struct compat_siginfo __user *, uinfo,
                struct __kernel_timespec __user *, uts, compat_size_t, sigsetsize)
{
        const struct timespec64 *limit = NULL;
        struct timespec64 timeout;
        kernel_siginfo_t kinfo;
        sigset_t wanted;
        long sig;

        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (get_compat_sigset(&wanted, uthese))
                return -EFAULT;

        if (uts) {
                if (get_timespec64(&timeout, uts))
                        return -EFAULT;
                limit = &timeout;
        }

        sig = do_sigtimedwait(&wanted, &kinfo, limit);
        if (sig <= 0 || !uinfo)
                return sig;

        /* A signal was dequeued and the caller wants its siginfo. */
        if (copy_siginfo_to_user32(uinfo, &kinfo))
                return -EFAULT;

        return sig;
}

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * Compat entry point for rt_sigtimedwait() with a struct old_timespec32
 * timeout: the signal set is read with get_compat_sigset() and the siginfo
 * is written back in compat layout.
 */
COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese,
                struct compat_siginfo __user *, uinfo,
                struct old_timespec32 __user *, uts, compat_size_t, sigsetsize)
{
        const struct timespec64 *limit = NULL;
        struct timespec64 timeout;
        kernel_siginfo_t kinfo;
        sigset_t wanted;
        long sig;

        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (get_compat_sigset(&wanted, uthese))
                return -EFAULT;

        if (uts) {
                if (get_old_timespec32(&timeout, uts))
                        return -EFAULT;
                limit = &timeout;
        }

        sig = do_sigtimedwait(&wanted, &kinfo, limit);
        if (sig <= 0 || !uinfo)
                return sig;

        /* A signal was dequeued and the caller wants its siginfo. */
        if (copy_siginfo_to_user32(uinfo, &kinfo))
                return -EFAULT;

        return sig;
}
#endif
#endif

/*
 * Fill @info for a signal @sig sent by the current task: si_code is
 * SI_TKILL when @type is PIDTYPE_PID and SI_USER otherwise; si_pid and
 * si_uid describe the sender.
 */
static void prepare_kill_siginfo(int sig, struct kernel_siginfo *info,
                                 enum pid_type type)
{
        int code = SI_USER;

        if (type == PIDTYPE_PID)
                code = SI_TKILL;

        clear_siginfo(info);
        info->si_signo = sig;
        info->si_errno = 0;
        info->si_code = code;
        info->si_pid = task_tgid_vnr(current);
        info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
}

/**
 *  sys_kill - send a signal to a process
 *  @pid: the PID of the process
 *  @sig: signal to be sent
 *
 *  Return: the result of kill_something_info().
 */
SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
        struct kernel_siginfo kinfo;

        /* PIDTYPE_TGID makes prepare_kill_siginfo() use SI_USER. */
        prepare_kill_siginfo(sig, &kinfo, PIDTYPE_TGID);
        return kill_something_info(sig, &kinfo, pid);
}

/*
 * Verify that the signaler and signalee either are in the same pid namespace
 * or that the signaler's pid namespace is an ancestor of the signalee's pid
 * namespace.
 *
 * Walks from the namespace of @pid up through its parents and returns true
 * as soon as the current task's active pid namespace is found.
 */
static bool access_pidfd_pidns(struct pid *pid)
{
        struct pid_namespace *active = task_active_pid_ns(current);
        struct pid_namespace *ns;

        for (ns = ns_of_pid(pid); ns; ns = ns->parent) {
                if (ns == active)
                        return true;
        }

        return false;
}

/*
 * Copy a siginfo from userspace into @kinfo, using the compat layout when
 * called from a compat syscall and the native layout otherwise.
 */
static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo,
                siginfo_t __user *info)
{
#ifdef CONFIG_COMPAT
        /*
         * Avoid hooking up compat syscalls and instead handle necessary
         * conversions here. Note, this is a stop-gap measure and should not be
         * considered a generic solution.
         */
        if (in_compat_syscall()) {
                struct compat_siginfo __user *info32;

                info32 = (struct compat_siginfo __user *)info;
                return copy_siginfo_from_user32(kinfo, info32);
        }
#endif
        return copy_siginfo_from_user(kinfo, info);
}

/*
 * Resolve @file to a struct pid: try pidfd_pid() first and fall back to
 * tgid_pidfd_to_pid() if that yields an error pointer.
 */
static struct pid *pidfd_to_pid(const struct file *file)
{
        struct pid *found = pidfd_pid(file);

        if (IS_ERR(found))
                return tgid_pidfd_to_pid(file);

        return found;
}

/*
 * Every flag bit pidfd_send_signal() accepts. Each one selects the scope
 * of the signal; the syscall rejects any other bit with -EINVAL.
 */
#define PIDFD_SEND_SIGNAL_FLAGS                            \
        (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \
         PIDFD_SIGNAL_PROCESS_GROUP)

/**
 * sys_pidfd_send_signal - Signal a process through a pidfd
 * @pidfd:  file descriptor of the process
 * @sig:    signal to send
 * @info:   signal info, or NULL to have the kernel fill in sender details
 * @flags:  0, or exactly one of PIDFD_SIGNAL_THREAD,
 *          PIDFD_SIGNAL_THREAD_GROUP or PIDFD_SIGNAL_PROCESS_GROUP
 *
 * With @flags == 0 the signal is sent to the thread group or to the
 * individual thread depending on whether the pidfd has PIDFD_THREAD set.
 * A non-zero @flags overrides that default scope and selects a single
 * thread, the thread group, or the process group.
 *
 * Return: 0 on success, negative errno on failure
 */
SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
                siginfo_t __user *, info, unsigned int, flags)
{
        int ret;
        struct fd f;
        struct pid *pid;
        kernel_siginfo_t kinfo;
        enum pid_type type;

        /* Reject any flag bits outside PIDFD_SEND_SIGNAL_FLAGS. */
        if (flags & ~PIDFD_SEND_SIGNAL_FLAGS)
                return -EINVAL;

        /* Ensure that only a single signal scope determining flag is set. */
        if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1)
                return -EINVAL;

        f = fdget(pidfd);
        if (!f.file)
                return -EBADF;

        /* Is this a pidfd? */
        pid = pidfd_to_pid(f.file);
        if (IS_ERR(pid)) {
                ret = PTR_ERR(pid);
                goto err;
        }

        ret = -EINVAL;
        if (!access_pidfd_pidns(pid))
                goto err;

        /*
         * The two checks above leave flags as 0 or exactly one known flag,
         * so one of these cases always assigns type.
         */
        switch (flags) {
        case 0:
                /* Infer scope from the type of pidfd. */
                if (f.file->f_flags & PIDFD_THREAD)
                        type = PIDTYPE_PID;
                else
                        type = PIDTYPE_TGID;
                break;
        case PIDFD_SIGNAL_THREAD:
                type = PIDTYPE_PID;
                break;
        case PIDFD_SIGNAL_THREAD_GROUP:
                type = PIDTYPE_TGID;
                break;
        case PIDFD_SIGNAL_PROCESS_GROUP:
                type = PIDTYPE_PGID;
                break;
        }

        if (info) {
                ret = copy_siginfo_from_user_any(&kinfo, info);
                if (unlikely(ret))
                        goto err;

                /* The siginfo must describe the same signal as @sig. */
                ret = -EINVAL;
                if (unlikely(sig != kinfo.si_signo))
                        goto err;

                /* Only allow sending arbitrary signals to yourself. */
                ret = -EPERM;
                if ((task_pid(current) != pid || type > PIDTYPE_TGID) &&
                    (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
                        goto err;
        } else {
                prepare_kill_siginfo(sig, &kinfo, type);
        }

        if (type == PIDTYPE_PGID)
                ret = kill_pgrp_info(sig, &kinfo, pid);
        else
                ret = kill_pid_info_type(sig, &kinfo, pid, type);
err:
        /* Reached on success too: drop the reference taken by fdget(). */
        fdput(f);
        return ret;
}

/*
 * Send @sig with @info to the single task whose pid is @pid. If @tgid is
 * positive the task must also belong to that thread group.
 *
 * Returns -ESRCH if no matching task is found, the error from
 * check_kill_permission(), or the result of do_send_sig_info() with -ESRCH
 * mapped to 0.
 */
static int
do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
{
        struct task_struct *target;
        int error = -ESRCH;

        rcu_read_lock();

        target = find_task_by_vpid(pid);
        if (!target)
                goto unlock;
        if (tgid > 0 && task_tgid_vnr(target) != tgid)
                goto unlock;

        error = check_kill_permission(sig, info, target);
        /*
         * The null signal is a permissions and process existence
         * probe.  No signal is actually delivered.
         */
        if (error || !sig)
                goto unlock;

        error = do_send_sig_info(sig, info, target, PIDTYPE_PID);
        /*
         * If lock_task_sighand() failed we pretend the task
         * dies after receiving the signal. The window is tiny,
         * and the signal is private anyway.
         */
        if (unlikely(error == -ESRCH))
                error = 0;

unlock:
        rcu_read_unlock();
        return error;
}

/*
 * Common helper for tkill() and tgkill(): build a sender siginfo for a
 * thread-directed signal and deliver it through do_send_specific().
 */
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
        struct kernel_siginfo kinfo;

        /* PIDTYPE_PID makes prepare_kill_siginfo() use SI_TKILL. */
        prepare_kill_siginfo(sig, &kinfo, PIDTYPE_PID);
        return do_send_specific(tgid, pid, sig, &kinfo);
}

/**
 *  sys_tgkill - send signal to one specific thread
 *  @tgid: the thread group ID of the thread
 *  @pid: the PID of the thread
 *  @sig: signal to be sent
 *
 *  This syscall also checks the @tgid and returns -ESRCH even if the PID
 *  exists but it's not belonging to the target process anymore. This
 *  method solves the problem of threads exiting and PIDs getting reused.
 *
 *  Return: -EINVAL unless both @pid and @tgid are positive, otherwise the
 *  result of do_tkill().
 */
SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
{
        /* This is only valid for single tasks */
        if (pid > 0 && tgid > 0)
                return do_tkill(tgid, pid, sig);

        return -EINVAL;
}

/**
 *  sys_tkill - send signal to one specific task
 *  @pid: the PID of the task
 *  @sig: signal to be sent
 *
 *  Send a signal to only one task, even if it's a CLONE_THREAD task.
 *
 *  Return: -EINVAL unless @pid is positive, otherwise the result of
 *  do_tkill() called with a tgid of 0.
 */
SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
{
        /* This is only valid for single tasks */
        if (pid > 0)
                return do_tkill(0, pid, sig);

        return -EINVAL;
}

/*
 * Queue @sig with caller-supplied @info to process @pid.
 *
 * Returns -EPERM if @info carries a kernel-style or SI_TKILL si_code and
 * the target is not the caller itself, otherwise the result of
 * kill_proc_info().
 */
static int do_rt_sigqueueinfo(pid_t pid, int sig, kernel_siginfo_t *info)
{
        int code = info->si_code;

        /* Not even root can pretend to send signals from the kernel.
         * Nor can they impersonate a kill()/tgkill(), which adds source info.
         */
        if (task_pid_vnr(current) != pid &&
            (code >= 0 || code == SI_TKILL))
                return -EPERM;

        /* POSIX.1b doesn't mention process groups.  */
        return kill_proc_info(sig, info, pid);
}

/**
 *  sys_rt_sigqueueinfo - send a signal together with signal information
 *  @pid: the PID of the target
 *  @sig: signal to be sent
 *  @uinfo: signal info to be sent
 *
 *  Return: the error from copying @uinfo, otherwise the result of
 *  do_rt_sigqueueinfo().
 */
SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
                siginfo_t __user *, uinfo)
{
        kernel_siginfo_t kinfo;
        int err;

        err = __copy_siginfo_from_user(sig, &kinfo, uinfo);
        if (likely(!err))
                err = do_rt_sigqueueinfo(pid, sig, &kinfo);

        return err;
}

#ifdef CONFIG_COMPAT
/*
 * Compat entry point for rt_sigqueueinfo(): the siginfo is read in compat
 * layout, everything else is shared with the native syscall.
 */
COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
                        compat_pid_t, pid,
                        int, sig,
                        struct compat_siginfo __user *, uinfo)
{
        kernel_siginfo_t kinfo;
        int err;

        err = __copy_siginfo_from_user32(sig, &kinfo, uinfo);
        if (likely(!err))
                err = do_rt_sigqueueinfo(pid, sig, &kinfo);

        return err;
}
#endif

/*
 * Queue @sig with caller-supplied @info to thread @pid of thread group
 * @tgid.
 *
 * Returns -EINVAL unless both ids are positive, -EPERM if @info carries a
 * kernel-style or SI_TKILL si_code and the target is not the caller itself,
 * otherwise the result of do_send_specific().
 */
static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, kernel_siginfo_t *info)
{
        int code;

        /* This is only valid for single tasks */
        if (!(pid > 0 && tgid > 0))
                return -EINVAL;

        /* Not even root can pretend to send signals from the kernel.
         * Nor can they impersonate a kill()/tgkill(), which adds source info.
         */
        code = info->si_code;
        if (task_pid_vnr(current) != pid &&
            (code >= 0 || code == SI_TKILL))
                return -EPERM;

        return do_send_specific(tgid, pid, sig, info);
}

/*
 * Queue @sig with the siginfo at @uinfo to thread @pid of thread group
 * @tgid. Returns the error from copying @uinfo, otherwise the result of
 * do_rt_tgsigqueueinfo().
 */
SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
                siginfo_t __user *, uinfo)
{
        kernel_siginfo_t kinfo;
        int err;

        err = __copy_siginfo_from_user(sig, &kinfo, uinfo);
        if (likely(!err))
                err = do_rt_tgsigqueueinfo(tgid, pid, sig, &kinfo);

        return err;
}

#ifdef CONFIG_COMPAT
/*
 * Compat entry point for rt_tgsigqueueinfo(): the siginfo is read in
 * compat layout, everything else is shared with the native syscall.
 */
COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
                        compat_pid_t, tgid,
                        compat_pid_t, pid,
                        int, sig,
                        struct compat_siginfo __user *, uinfo)
{
        kernel_siginfo_t kinfo;
        int err;

        err = __copy_siginfo_from_user32(sig, &kinfo, uinfo);
        if (likely(!err))
                err = do_rt_tgsigqueueinfo(tgid, pid, sig, &kinfo);

        return err;
}
#endif

/*
 * For kthreads only, must not be used if cloned with CLONE_SIGHAND
 *
 * Installs @action as the handler for @sig on the current task. When the
 * new handler is SIG_IGN, already queued instances of @sig are flushed
 * from both the shared and the private pending queues.
 */
void kernel_sigaction(int sig, __sighandler_t action)
{
        sigset_t discard;

        spin_lock_irq(&current->sighand->siglock);
        current->sighand->action[sig - 1].sa.sa_handler = action;
        if (action != SIG_IGN)
                goto unlock;

        sigemptyset(&discard);
        sigaddset(&discard, sig);

        flush_sigqueue_mask(&discard, &current->signal->shared_pending);
        flush_sigqueue_mask(&discard, &current->pending);
        recalc_sigpending();
unlock:
        spin_unlock_irq(&current->sighand->siglock);
}
EXPORT_SYMBOL(kernel_sigaction);

/*
 * Default no-op definition. Being __weak, it can be overridden by another
 * definition of sigaction_compat_abi(). do_sigaction() calls it with the
 * new and old actions after their sa_flags have been masked.
 */
void __weak sigaction_compat_abi(struct k_sigaction *act,
                struct k_sigaction *oact)
{
}

/*
 * do_sigaction - fetch and/or replace the action for @sig of the current task
 * @sig:  signal number; must pass valid_signal() and be at least 1
 * @act:  new action, or NULL to leave the action unchanged. Its sa_flags
 *        and sa_mask are sanitised in place before being installed.
 * @oact: if non-NULL, receives the previous action
 *
 * Returns 0 on success. Returns -EINVAL for an invalid @sig, for an attempt
 * to change a sig_kernel_only() signal, or when the currently installed
 * action has SA_IMMUTABLE set.
 */
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
        struct task_struct *p = current, *t;
        struct k_sigaction *k;
        sigset_t mask;

        if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
                return -EINVAL;

        k = &p->sighand->action[sig-1];

        spin_lock_irq(&p->sighand->siglock);
        /* An immutable action can be neither read back nor replaced here. */
        if (k->sa.sa_flags & SA_IMMUTABLE) {
                spin_unlock_irq(&p->sighand->siglock);
                return -EINVAL;
        }
        if (oact)
                *oact = *k;

        /*
         * Make sure that we never accidentally claim to support SA_UNSUPPORTED,
         * e.g. by having an architecture use the bit in their uapi.
         */
        BUILD_BUG_ON(UAPI_SA_FLAGS & SA_UNSUPPORTED);

        /*
         * Clear unknown flag bits in order to allow userspace to detect missing
         * support for flag bits and to allow the kernel to use non-uapi bits
         * internally.
         */
        if (act)
                act->sa.sa_flags &= UAPI_SA_FLAGS;
        if (oact)
                oact->sa.sa_flags &= UAPI_SA_FLAGS;

        sigaction_compat_abi(act, oact);

        if (act) {
                /* SIGKILL and SIGSTOP are stripped from the handler's mask. */
                sigdelsetmask(&act->sa.sa_mask,
                              sigmask(SIGKILL) | sigmask(SIGSTOP));
                *k = *act;
                /*
                 * POSIX 3.3.1.3:
                 *  "Setting a signal action to SIG_IGN for a signal that is
                 *   pending shall cause the pending signal to be discarded,
                 *   whether or not it is blocked."
                 *
                 *  "Setting a signal action to SIG_DFL for a signal that is
                 *   pending and whose default action is to ignore the signal
                 *   (for example, SIGCHLD), shall cause the pending signal to
                 *   be discarded, whether or not it is blocked"
                 */
                if (sig_handler_ignored(sig_handler(p, sig), sig)) {
                        sigemptyset(&mask);
                        sigaddset(&mask, sig);
                        /* Flush the shared queue and every thread's own queue. */
                        flush_sigqueue_mask(&mask, &p->signal->shared_pending);
                        for_each_thread(p, t)
                                flush_sigqueue_mask(&mask, &t->pending);
                }
        }

        spin_unlock_irq(&p->sighand->siglock);
        return 0;
}

/*
 * Locking helpers for do_sigaltstack(). With CONFIG_DYNAMIC_SIGFRAME they
 * take and release the current task's siglock with interrupts disabled;
 * without it they compile to nothing.
 */
#ifdef CONFIG_DYNAMIC_SIGFRAME
static inline void sigaltstack_lock(void)
        __acquires(&current->sighand->siglock)
{
        spin_lock_irq(&current->sighand->siglock);
}

static inline void sigaltstack_unlock(void)
        __releases(&current->sighand->siglock)
{
        spin_unlock_irq(&current->sighand->siglock);
}
#else
static inline void sigaltstack_lock(void) { }
static inline void sigaltstack_unlock(void) { }
#endif

/*
 * do_sigaltstack - query and/or change the current task's alternate stack
 * @ss:          new settings, or NULL to leave them unchanged
 * @oss:         if non-NULL, filled with the settings in effect on entry
 * @sp:          user stack pointer used for the "on signal stack" checks
 * @min_ss_size: smallest size accepted for a stack that is not disabled
 *
 * Returns 0 on success, -EPERM if on_sig_stack(@sp) is true, -EINVAL if
 * the mode bits of ss_flags are not SS_DISABLE, SS_ONSTACK or 0, and
 * -ENOMEM if the size is below @min_ss_size or rejected by
 * sigaltstack_size_valid().
 */
static int
do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp,
                size_t min_ss_size)
{
        struct task_struct *t = current;
        int ret = 0;

        /* Report the old settings before anything is modified. */
        if (oss) {
                memset(oss, 0, sizeof(stack_t));
                oss->ss_sp = (void __user *) t->sas_ss_sp;
                oss->ss_size = t->sas_ss_size;
                oss->ss_flags = sas_ss_flags(sp) |
                        (current->sas_ss_flags & SS_FLAG_BITS);
        }

        if (ss) {
                void __user *ss_sp = ss->ss_sp;
                size_t ss_size = ss->ss_size;
                unsigned ss_flags = ss->ss_flags;
                int ss_mode;

                if (unlikely(on_sig_stack(sp)))
                        return -EPERM;

                /* The mode is what remains once the SS_FLAG_BITS are masked off. */
                ss_mode = ss_flags & ~SS_FLAG_BITS;
                if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
                                ss_mode != 0))
                        return -EINVAL;

                /*
                 * Return before taking any locks if no actual
                 * sigaltstack changes were requested.
                 */
                if (t->sas_ss_sp == (unsigned long)ss_sp &&
                    t->sas_ss_size == ss_size &&
                    t->sas_ss_flags == ss_flags)
                        return 0;

                sigaltstack_lock();
                if (ss_mode == SS_DISABLE) {
                        /* A disabled stack is stored with no address and no size. */
                        ss_size = 0;
                        ss_sp = NULL;
                } else {
                        if (unlikely(ss_size < min_ss_size))
                                ret = -ENOMEM;
                        if (!sigaltstack_size_valid(ss_size))
                                ret = -ENOMEM;
                }
                /* Commit only if both size checks passed. */
                if (!ret) {
                        t->sas_ss_sp = (unsigned long) ss_sp;
                        t->sas_ss_size = ss_size;
                        t->sas_ss_flags = ss_flags;
                }
                sigaltstack_unlock();
        }
        return ret;
}

/*
 * sigaltstack() system call: optionally install the stack described by
 * @uss and optionally report the previous settings through @uoss.
 *
 * Returns the result of do_sigaltstack(), or -EFAULT if a user copy fails.
 */
SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
{
        stack_t new_ss, old_ss;
        stack_t *newp = NULL, *oldp = NULL;
        int err;

        if (uss) {
                if (copy_from_user(&new_ss, uss, sizeof(stack_t)))
                        return -EFAULT;
                newp = &new_ss;
        }
        if (uoss)
                oldp = &old_ss;

        err = do_sigaltstack(newp, oldp, current_user_stack_pointer(),
                             MINSIGSTKSZ);
        if (err || !uoss)
                return err;

        if (copy_to_user(uoss, &old_ss, sizeof(stack_t)))
                return -EFAULT;

        return 0;
}

/*
 * Re-install the alternate stack settings stored at @uss.
 *
 * Returns -EFAULT if @uss cannot be read and 0 otherwise; any error from
 * do_sigaltstack() is ignored.
 */
int restore_altstack(const stack_t __user *uss)
{
        stack_t saved;

        if (copy_from_user(&saved, uss, sizeof(saved)))
                return -EFAULT;

        /* squash all but EFAULT for now */
        (void)do_sigaltstack(&saved, NULL, current_user_stack_pointer(),
                             MINSIGSTKSZ);
        return 0;
}

/*
 * Store the current task's alternate stack pointer, flags and size into
 * the user stack_t at @uss. @sp is not used.
 *
 * Returns the OR of the three __put_user() results, so it is 0 only if
 * every store succeeded.
 */
int __save_altstack(stack_t __user *uss, unsigned long sp)
{
        struct task_struct *t = current;
        int err;

        err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp);
        err |= __put_user(t->sas_ss_flags, &uss->ss_flags);
        err |= __put_user(t->sas_ss_size, &uss->ss_size);

        return err;
}

#ifdef CONFIG_COMPAT
/*
 * Compat implementation of sigaltstack(): convert the compat_stack_t at
 * @uss_ptr (if any) to a native stack_t, call do_sigaltstack(), and write
 * the previous settings back in compat layout when @uoss_ptr is non-NULL.
 *
 * Returns the result of do_sigaltstack(), or -EFAULT if a user copy fails.
 */
static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr,
                                 compat_stack_t __user *uoss_ptr)
{
        stack_t new_ss, old_ss;
        const stack_t *newp = NULL;
        compat_stack_t out32;
        int ret;

        if (uss_ptr) {
                compat_stack_t in32;

                if (copy_from_user(&in32, uss_ptr, sizeof(compat_stack_t)))
                        return -EFAULT;
                new_ss.ss_sp = compat_ptr(in32.ss_sp);
                new_ss.ss_flags = in32.ss_flags;
                new_ss.ss_size = in32.ss_size;
                newp = &new_ss;
        }

        /* The old settings are always fetched, even if they are not reported. */
        ret = do_sigaltstack(newp, &old_ss, compat_user_stack_pointer(),
                             COMPAT_MINSIGSTKSZ);
        if (ret < 0 || !uoss_ptr)
                return ret;

        memset(&out32, 0, sizeof(out32));
        out32.ss_sp = ptr_to_compat(old_ss.ss_sp);
        out32.ss_flags = old_ss.ss_flags;
        out32.ss_size = old_ss.ss_size;
        if (copy_to_user(uoss_ptr, &out32, sizeof(compat_stack_t)))
                ret = -EFAULT;

        return ret;
}

/* Compat sigaltstack() system call; all work is done by do_compat_sigaltstack(). */
COMPAT_SYSCALL_DEFINE2(sigaltstack,
                        const compat_stack_t __user *, uss_ptr,
                        compat_stack_t __user *, uoss_ptr)
{
        int ret = do_compat_sigaltstack(uss_ptr, uoss_ptr);

        return ret;
}

/*
 * Re-install the compat alternate stack settings stored at @uss.
 *
 * Returns -EFAULT if do_compat_sigaltstack() reports it and 0 for any
 * other outcome.
 */
int compat_restore_altstack(const compat_stack_t __user *uss)
{
        /* squash all but -EFAULT for now */
        if (do_compat_sigaltstack(uss, NULL) == -EFAULT)
                return -EFAULT;

        return 0;
}

/*
 * Store the current task's alternate stack pointer, flags and size into
 * the compat stack_t at @uss. @sp is not used.
 *
 * Returns the OR of the three __put_user() results, so it is 0 only if
 * every store succeeded.
 */
int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
{
        struct task_struct *t = current;
        int err;

        err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp),
                         &uss->ss_sp);
        err |= __put_user(t->sas_ss_flags, &uss->ss_flags);
        err |= __put_user(t->sas_ss_size, &uss->ss_size);

        return err;
}
#endif

#ifdef __ARCH_WANT_SYS_SIGPENDING

/**
 *  sys_sigpending - examine pending signals
 *  @uset: where mask of pending signal is returned
 *
 *  Only the first sizeof(old_sigset_t) bytes of the pending set are
 *  copied out.
 *
 *  Return: 0 on success, -EFAULT if @uset cannot be written.
 */
SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset)
{
        sigset_t pending;

        if (sizeof(old_sigset_t) > sizeof(*uset))
                return -EINVAL;

        do_sigpending(&pending);

        return copy_to_user(uset, &pending, sizeof(old_sigset_t)) ? -EFAULT : 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat sigpending(): store the first word of the pending signal set at
 * @set32 and return the put_user() result.
 */
COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
{
        sigset_t pending;

        do_sigpending(&pending);

        return put_user(pending.sig[0], set32);
}
#endif

#endif

#ifdef __ARCH_WANT_SYS_SIGPROCMASK
/**
 *  sys_sigprocmask - examine and change blocked signals
 *  @how: whether to add, remove, or set signals
 *  @nset: signals to add or remove (if non-null)
 *  @oset: previous value of signal mask if non-null
 *
 * Some platforms have their own version with special arguments;
 * others support only sys_rt_sigprocmask.
 *
 * Only the first word of the blocked mask is examined or changed.
 *
 * Return: 0 on success, -EINVAL for an unknown @how, -EFAULT if a user
 * copy fails.
 */

SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
                old_sigset_t __user *, oset)
{
        old_sigset_t previous, requested;
        sigset_t blocked;

        /* Sample the old mask before it can be modified below. */
        previous = current->blocked.sig[0];

        if (nset) {
                if (copy_from_user(&requested, nset, sizeof(*nset)))
                        return -EFAULT;

                blocked = current->blocked;

                if (how == SIG_BLOCK)
                        sigaddsetmask(&blocked, requested);
                else if (how == SIG_UNBLOCK)
                        sigdelsetmask(&blocked, requested);
                else if (how == SIG_SETMASK)
                        blocked.sig[0] = requested;
                else
                        return -EINVAL;

                set_current_blocked(&blocked);
        }

        if (oset && copy_to_user(oset, &previous, sizeof(*oset)))
                return -EFAULT;

        return 0;
}
#endif /* __ARCH_WANT_SYS_SIGPROCMASK */

#ifndef CONFIG_ODD_RT_SIGACTION
/**
 *  sys_rt_sigaction - alter an action taken by a process
 *  @sig: signal to be sent
 *  @act: new sigaction
 *  @oact: used to save the previous sigaction
 *  @sigsetsize: size of sigset_t type
 *
 *  Return: 0 on success, -EINVAL for a wrong @sigsetsize, -EFAULT if a
 *  user copy fails, otherwise the error from do_sigaction().
 */
SYSCALL_DEFINE4(rt_sigaction, int, sig,
                const struct sigaction __user *, act,
                struct sigaction __user *, oact,
                size_t, sigsetsize)
{
        struct k_sigaction new_sa, old_sa;
        struct k_sigaction *newp = NULL, *oldp = NULL;
        int ret;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (act) {
                if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
                        return -EFAULT;
                newp = &new_sa;
        }
        if (oact)
                oldp = &old_sa;

        ret = do_sigaction(sig, newp, oldp);
        if (ret)
                return ret;
        if (!oact)
                return 0;

        return copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)) ? -EFAULT : 0;
}
#ifdef CONFIG_COMPAT
/*
 * Compat rt_sigaction(): read the new action field by field from the
 * compat layout, call do_sigaction(), and write the old action back field
 * by field.
 *
 * Returns -EINVAL for a wrong @sigsetsize, -EFAULT if reading @act fails,
 * the error from do_sigaction(), or the OR of the user-store results when
 * writing @oact.
 */
COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
                const struct compat_sigaction __user *, act,
                struct compat_sigaction __user *, oact,
                compat_size_t, sigsetsize)
{
        struct k_sigaction new_ka, old_ka;
        struct k_sigaction *newp = NULL, *oldp = NULL;
#ifdef __ARCH_HAS_SA_RESTORER
        compat_uptr_t restorer;
#endif
        int ret;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(compat_sigset_t))
                return -EINVAL;

        if (act) {
                compat_uptr_t handler;

                /* Every field is fetched; the failures are accumulated in ret. */
                ret = get_user(handler, &act->sa_handler);
                new_ka.sa.sa_handler = compat_ptr(handler);
#ifdef __ARCH_HAS_SA_RESTORER
                ret |= get_user(restorer, &act->sa_restorer);
                new_ka.sa.sa_restorer = compat_ptr(restorer);
#endif
                ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask);
                ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
                if (ret)
                        return -EFAULT;
                newp = &new_ka;
        }
        if (oact)
                oldp = &old_ka;

        ret = do_sigaction(sig, newp, oldp);
        if (ret || !oact)
                return ret;

        ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
                       &oact->sa_handler);
        ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask,
                                 sizeof(oact->sa_mask));
        ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
#ifdef __ARCH_HAS_SA_RESTORER
        ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
                        &oact->sa_restorer);
#endif
        return ret;
}
#endif
#endif /* !CONFIG_ODD_RT_SIGACTION */

#ifdef CONFIG_OLD_SIGACTION
/*
 * Legacy sigaction() using struct old_sigaction, whose signal mask is a
 * single old_sigset_t word.
 *
 * Returns -EFAULT if a user access fails, otherwise the result of
 * do_sigaction().
 */
SYSCALL_DEFINE3(sigaction, int, sig,
                const struct old_sigaction __user *, act,
                struct old_sigaction __user *, oact)
{
        struct k_sigaction new_ka, old_ka;
        struct k_sigaction *newp = NULL, *oldp = NULL;
        int ret;

        if (act) {
                old_sigset_t mask;

                if (!access_ok(act, sizeof(*act)))
                        return -EFAULT;
                if (__get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
                    __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
                    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
                    __get_user(mask, &act->sa_mask))
                        return -EFAULT;
#ifdef __ARCH_HAS_KA_RESTORER
                new_ka.ka_restorer = NULL;
#endif
                siginitset(&new_ka.sa.sa_mask, mask);
                newp = &new_ka;
        }
        if (oact)
                oldp = &old_ka;

        ret = do_sigaction(sig, newp, oldp);
        if (ret || !oact)
                return ret;

        if (!access_ok(oact, sizeof(*oact)))
                return -EFAULT;
        /* Only the first word of the old mask fits the legacy layout. */
        if (__put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
            __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
            __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
            __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
                return -EFAULT;

        return 0;
}
#endif
#ifdef CONFIG_COMPAT_OLD_SIGACTION
/*
 * Compat flavour of the legacy sigaction() entry point, operating on
 * struct compat_old_sigaction.
 *
 * User-space handler/restorer values arrive as compat_uptr_t and are
 * converted with compat_ptr() on the way in and ptr_to_compat() on the way
 * out.  Only word 0 of the kernel mask is exported through @oact.
 *
 * Returns do_sigaction()'s result, or -EFAULT if a user access fails.
 */
COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
                const struct compat_old_sigaction __user *, act,
                struct compat_old_sigaction __user *, oact)
{
        struct k_sigaction new_ka, old_ka;
        struct k_sigaction *newp = NULL;
        struct k_sigaction *oldp = NULL;
        compat_uptr_t handler, restorer;
        compat_old_sigset_t mask;
        int ret;

        if (act) {
                if (!access_ok(act, sizeof(*act)))
                        return -EFAULT;
                if (__get_user(handler, &act->sa_handler))
                        return -EFAULT;
                if (__get_user(restorer, &act->sa_restorer))
                        return -EFAULT;
                if (__get_user(new_ka.sa.sa_flags, &act->sa_flags))
                        return -EFAULT;
                if (__get_user(mask, &act->sa_mask))
                        return -EFAULT;

#ifdef __ARCH_HAS_KA_RESTORER
                new_ka.ka_restorer = NULL;
#endif
                new_ka.sa.sa_handler = compat_ptr(handler);
                new_ka.sa.sa_restorer = compat_ptr(restorer);
                siginitset(&new_ka.sa.sa_mask, mask);
                newp = &new_ka;
        }
        if (oact)
                oldp = &old_ka;

        ret = do_sigaction(sig, newp, oldp);
        if (ret || !oact)
                return ret;

        /* Report the previous action in the compat layout. */
        if (!access_ok(oact, sizeof(*oact)))
                return -EFAULT;
        if (__put_user(ptr_to_compat(old_ka.sa.sa_handler),
                       &oact->sa_handler))
                return -EFAULT;
        if (__put_user(ptr_to_compat(old_ka.sa.sa_restorer),
                       &oact->sa_restorer))
                return -EFAULT;
        if (__put_user(old_ka.sa.sa_flags, &oact->sa_flags))
                return -EFAULT;
        if (__put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
                return -EFAULT;

        return ret;
}
#endif

#ifdef CONFIG_SGETMASK_SYSCALL

/*
 * For backwards compatibility.  Functionality superseded by sigprocmask.
 *
 * sgetmask() returns word 0 of the calling task's blocked signal set.
 */
SYSCALL_DEFINE0(sgetmask)
{
        /* SMP safe */
        return current->blocked.sig[0];
}

/*
 * ssetmask() installs @newmask as the calling task's blocked set (built
 * with siginitset()) and returns the previous value of word 0 of that set.
 */
SYSCALL_DEFINE1(ssetmask, int, newmask)
{
        sigset_t blocked;
        int prev = current->blocked.sig[0];

        siginitset(&blocked, newmask);
        set_current_blocked(&blocked);

        return prev;
}
#endif /* CONFIG_SGETMASK_SYSCALL */

#ifdef __ARCH_WANT_SYS_SIGNAL
/*
 * For backwards compatibility.  Functionality superseded by sigaction.
 *
 * Installs @handler for @sig with SA_ONESHOT | SA_NOMASK and an empty
 * signal mask.  Returns the previously installed handler on success, or
 * the error code from do_sigaction().
 */
SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
{
        struct k_sigaction new_sa, old_sa;
        int ret;

        new_sa.sa.sa_handler = handler;
        new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
        sigemptyset(&new_sa.sa.sa_mask);
        /*
         * new_sa lives on the stack: clear the restorer fields explicitly,
         * as the sigaction entry points in this file do, so that no
         * uninitialised stack contents are handed to do_sigaction().
         */
#ifdef __ARCH_HAS_SA_RESTORER
        new_sa.sa.sa_restorer = NULL;
#endif
#ifdef __ARCH_HAS_KA_RESTORER
        new_sa.ka_restorer = NULL;
#endif

        ret = do_sigaction(sig, &new_sa, &old_sa);

        return ret ? ret : (unsigned long)old_sa.sa.sa_handler;
}
#endif /* __ARCH_WANT_SYS_SIGNAL */

#ifdef __ARCH_WANT_SYS_PAUSE

/*
 * pause() sleeps interruptibly until a signal is pending for the caller,
 * then returns -ERESTARTNOHAND.
 */
SYSCALL_DEFINE0(pause)
{
        for (;;) {
                if (signal_pending(current))
                        break;
                __set_current_state(TASK_INTERRUPTIBLE);
                schedule();
        }

        return -ERESTARTNOHAND;
}

#endif

/*
 * Common sigsuspend implementation: remember the current blocked set in
 * current->saved_sigmask, switch to @set, sleep interruptibly until a
 * signal is pending, then request mask restoration via
 * set_restore_sigmask().  Always returns -ERESTARTNOHAND.
 */
static int sigsuspend(sigset_t *set)
{
        current->saved_sigmask = current->blocked;
        set_current_blocked(set);

        for (;;) {
                if (signal_pending(current))
                        break;
                __set_current_state(TASK_INTERRUPTIBLE);
                schedule();
        }

        set_restore_sigmask();
        return -ERESTARTNOHAND;
}

/**
 *  sys_rt_sigsuspend - temporarily replace the signal mask with the
 *        @unewset value until a signal is received
 *  @unewset: new signal mask value
 *  @sigsetsize: size of sigset_t type
 *
 *  Returns -EINVAL if @sigsetsize does not match sizeof(sigset_t),
 *  -EFAULT if @unewset cannot be copied in, otherwise the result of
 *  sigsuspend().
 */
SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
{
        sigset_t mask;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(mask))
                return -EINVAL;
        if (copy_from_user(&mask, unewset, sizeof(mask)))
                return -EFAULT;

        return sigsuspend(&mask);
}
 
#ifdef CONFIG_COMPAT
/*
 * Compat rt_sigsuspend: same contract as the native call, but the mask is
 * read from a compat_sigset_t with get_compat_sigset().  @sigsetsize is
 * still compared against the native sizeof(sigset_t).
 */
COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
{
        sigset_t mask;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(mask))
                return -EINVAL;
        if (get_compat_sigset(&mask, unewset))
                return -EFAULT;

        return sigsuspend(&mask);
}
#endif

#ifdef CONFIG_OLD_SIGSUSPEND
/*
 * Old-style sigsuspend taking the mask by value: expand @mask into a
 * sigset_t and hand it to the common sigsuspend() helper.
 */
SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask)
{
        sigset_t newset;

        siginitset(&newset, mask);

        return sigsuspend(&newset);
}
#endif
#ifdef CONFIG_OLD_SIGSUSPEND3
/*
 * Three-argument old-style sigsuspend: @unused1 and @unused2 are ignored;
 * @mask is expanded into a sigset_t and passed to sigsuspend().
 */
SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
{
        sigset_t newset;

        siginitset(&newset, mask);

        return sigsuspend(&newset);
}
#endif

/*
 * Weak default: reports no name (NULL) for any @vma.  Being __weak, it can
 * be replaced by a non-weak definition elsewhere.
 */
__weak const char *arch_vma_name(struct vm_area_struct *vma)
{
        return NULL;
}

/*
 * Compile-time sanity checks on the siginfo layouts.  Every statement is a
 * BUILD_BUG_ON(), so this function only constrains the build:
 *  - struct siginfo must be exactly SI_MAX_SIZE bytes;
 *  - each listed field must sit at the same offset in siginfo_t and
 *    kernel_siginfo_t (fields are grouped per signal family, so some are
 *    checked more than once);
 *  - si_pid/si_uid must overlay si_addr in the way the "usb asyncio"
 *    checks below spell out, for both native and compat siginfo.
 */
static inline void siginfo_buildtime_checks(void)
{
        BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE);

        /* Verify the offsets in the two siginfos match */
#define CHECK_OFFSET(field) \
        BUILD_BUG_ON(offsetof(siginfo_t, field) != offsetof(kernel_siginfo_t, field))

        /* kill */
        CHECK_OFFSET(si_pid);
        CHECK_OFFSET(si_uid);

        /* timer */
        CHECK_OFFSET(si_tid);
        CHECK_OFFSET(si_overrun);
        CHECK_OFFSET(si_value);

        /* rt */
        CHECK_OFFSET(si_pid);
        CHECK_OFFSET(si_uid);
        CHECK_OFFSET(si_value);

        /* sigchld */
        CHECK_OFFSET(si_pid);
        CHECK_OFFSET(si_uid);
        CHECK_OFFSET(si_status);
        CHECK_OFFSET(si_utime);
        CHECK_OFFSET(si_stime);

        /* sigfault */
        CHECK_OFFSET(si_addr);
        CHECK_OFFSET(si_trapno);
        CHECK_OFFSET(si_addr_lsb);
        CHECK_OFFSET(si_lower);
        CHECK_OFFSET(si_upper);
        CHECK_OFFSET(si_pkey);
        CHECK_OFFSET(si_perf_data);
        CHECK_OFFSET(si_perf_type);
        CHECK_OFFSET(si_perf_flags);

        /* sigpoll */
        CHECK_OFFSET(si_band);
        CHECK_OFFSET(si_fd);

        /* sigsys */
        CHECK_OFFSET(si_call_addr);
        CHECK_OFFSET(si_syscall);
        CHECK_OFFSET(si_arch);
#undef CHECK_OFFSET

        /* usb asyncio */
        BUILD_BUG_ON(offsetof(struct siginfo, si_pid) !=
                     offsetof(struct siginfo, si_addr));
        if (sizeof(int) == sizeof(void __user *)) {
                /* Pointer-sized int: si_pid alone must cover si_addr. */
                BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) !=
                             sizeof(void __user *));
        } else {
                /* Otherwise si_pid + si_uid, laid out back to back, must. */
                BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) +
                              sizeof_field(struct siginfo, si_uid)) !=
                             sizeof(void __user *));
                BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) !=
                             offsetof(struct siginfo, si_uid));
        }
#ifdef CONFIG_COMPAT
        BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) !=
                     offsetof(struct compat_siginfo, si_addr));
        BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
                     sizeof(compat_uptr_t));
        BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
                     sizeof_field(struct siginfo, si_pid));
#endif
}

#if defined(CONFIG_SYSCTL)
/*
 * sysctl entries registered under "debug" by init_signal_sysctls().
 * The only entry, "exception-trace", exposes show_unhandled_signals as an
 * int (mode 0644) and exists only with CONFIG_SYSCTL_EXCEPTION_TRACE;
 * without it the table has no entries.
 */
static struct ctl_table signal_debug_table[] = {
#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
        {
                .procname        = "exception-trace",
                .data                = &show_unhandled_signals,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec
        },
#endif
};

/*
 * Register signal_debug_table under the "debug" sysctl directory.
 * Hooked up as an early_initcall; always returns 0.
 */
static int __init init_signal_sysctls(void)
{
        register_sysctl_init("debug", signal_debug_table);
        return 0;
}
early_initcall(init_signal_sysctls);
#endif /* CONFIG_SYSCTL */

/*
 * Boot-time setup for the signal code: references the build-time siginfo
 * layout checks and creates the sigqueue slab cache (SLAB_PANIC |
 * SLAB_ACCOUNT).
 */
void __init signals_init(void)
{
        siginfo_buildtime_checks();

        sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT);
}

#ifdef CONFIG_KGDB_KDB
#include <linux/kdb.h>
/*
 * kdb_send_sig - Allows kdb to send signals without exposing
 * signal internals.
 *
 * The siglock of @t is only try-locked, so kdb never blocks on it; if the
 * lock is busy the request is refused.  A request against a task that is
 * not running is refused the first time it is made for that task and only
 * carried out when it is repeated for the same task.
 */
void kdb_send_sig(struct task_struct *t, int sig)
{
        static struct task_struct *kdb_prev_t;
        int first_request, err;

        if (!spin_trylock(&t->sighand->siglock)) {
                kdb_printf("Can't do kill command now.\n"
                           "The sigmask lock is held somewhere else in "
                           "kernel, try again later\n");
                return;
        }

        /* Remember the target so a repeated request can be recognised. */
        first_request = (kdb_prev_t != t);
        kdb_prev_t = t;

        if (!task_is_running(t) && first_request) {
                spin_unlock(&t->sighand->siglock);
                kdb_printf("Process is not RUNNING, sending a signal from "
                           "kdb risks deadlock\n"
                           "on the run queue locks. "
                           "The signal has _not_ been sent.\n"
                           "Reissue the kill command if you want to risk "
                           "the deadlock.\n");
                return;
        }

        err = send_signal_locked(sig, SEND_SIG_PRIV, t, PIDTYPE_PID);
        spin_unlock(&t->sighand->siglock);

        if (!err) {
                kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid);
                return;
        }
        kdb_printf("Fail to deliver Signal %d to process %d.\n",
                   sig, t->pid);
}
#endif        /* CONFIG_KGDB_KDB */












































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#ifndef _LINUX_SCHED_ISOLATION_H
#define _LINUX_SCHED_ISOLATION_H

#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/init.h>
#include <linux/tick.h>

/*
 * Housekeeping categories, passed as the @type argument of the
 * housekeeping_*() helpers in this header.  HK_TYPE_MAX is the last
 * enumerator and is not itself a category.
 */
enum hk_type {
        HK_TYPE_TIMER,
        HK_TYPE_RCU,
        HK_TYPE_MISC,
        HK_TYPE_SCHED,
        HK_TYPE_TICK,
        HK_TYPE_DOMAIN,
        HK_TYPE_WQ,
        HK_TYPE_MANAGED_IRQ,
        HK_TYPE_KTHREAD,
        HK_TYPE_MAX
};

#ifdef CONFIG_CPU_ISOLATION
/*
 * Out-of-line housekeeping API, declared only when CONFIG_CPU_ISOLATION is
 * set.  The housekeeping_overridden static key is tested by
 * housekeeping_cpu() before it consults housekeeping_test_cpu().
 */
DECLARE_STATIC_KEY_FALSE(housekeeping_overridden);
extern int housekeeping_any_cpu(enum hk_type type);
extern const struct cpumask *housekeeping_cpumask(enum hk_type type);
extern bool housekeeping_enabled(enum hk_type type);
extern void housekeeping_affine(struct task_struct *t, enum hk_type type);
extern bool housekeeping_test_cpu(int cpu, enum hk_type type);
extern void __init housekeeping_init(void);

#else

/* !CONFIG_CPU_ISOLATION stub: ignores @type and returns the current CPU. */
static inline int housekeeping_any_cpu(enum hk_type type)
{
        return smp_processor_id();
}

/* !CONFIG_CPU_ISOLATION stub: every @type maps to cpu_possible_mask. */
static inline const struct cpumask *housekeeping_cpumask(enum hk_type type)
{
        return cpu_possible_mask;
}

/* !CONFIG_CPU_ISOLATION stub: housekeeping is never enabled for any @type. */
static inline bool housekeeping_enabled(enum hk_type type)
{
        return false;
}

/* !CONFIG_CPU_ISOLATION stub: no-op, @t is left untouched. */
static inline void housekeeping_affine(struct task_struct *t,
                                       enum hk_type type) { }

/* !CONFIG_CPU_ISOLATION stub: every @cpu passes the test for every @type. */
static inline bool housekeeping_test_cpu(int cpu, enum hk_type type)
{
        return true;
}

/* !CONFIG_CPU_ISOLATION stub: nothing to initialise. */
static inline void housekeeping_init(void) { }
#endif /* CONFIG_CPU_ISOLATION */

/*
 * housekeeping_cpu - fast-path wrapper around housekeeping_test_cpu().
 *
 * With CONFIG_CPU_ISOLATION, housekeeping_test_cpu() is consulted only
 * when the housekeeping_overridden static key is enabled; in every other
 * case (including !CONFIG_CPU_ISOLATION) the answer is true.
 */
static inline bool housekeeping_cpu(int cpu, enum hk_type type)
{
#ifdef CONFIG_CPU_ISOLATION
        if (static_branch_unlikely(&housekeeping_overridden))
                return housekeeping_test_cpu(cpu, type);
#endif
        return true;
}

static inline bool cpu_is_isolated(int cpu)
{
        return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
               !housekeeping_test_cpu(cpu, HK_TYPE_TICK) ||
               cpuset_cpu_is_isolated(cpu);
}

#endif /* _LINUX_SCHED_ISOLATION_H */


















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_SCHED_GENERIC_H
#define __NET_SCHED_GENERIC_H

#include <linux/netdevice.h>
#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/pkt_sched.h>
#include <linux/pkt_cls.h>
#include <linux/percpu.h>
#include <linux/dynamic_queue_limits.h>
#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
#include <linux/hashtable.h>
#include <net/gen_stats.h>
#include <net/rtnetlink.h>
#include <net/flow_offload.h>
#include <linux/xarray.h>

struct Qdisc_ops;
struct qdisc_walker;
struct tcf_walker;
struct module;
struct bpf_flow_keys;

/*
 * Rate table: a tc_ratespec together with a 256-entry u32 table.  Tables
 * are chained through @next and carry a plain integer @refcnt.
 */
struct qdisc_rate_table {
        struct tc_ratespec rate;
        u32                data[256];
        struct qdisc_rate_table *next;
        int                refcnt;
};

/*
 * Bit numbers within Qdisc::state.  The MISSED and DRAINING bits are also
 * available as masks via the QDISC_STATE_* macros.
 */
enum qdisc_state_t {
        __QDISC_STATE_SCHED,
        __QDISC_STATE_DEACTIVATED,
        __QDISC_STATE_MISSED,
        __QDISC_STATE_DRAINING,
};

/* Bit numbers within Qdisc::state2. */
enum qdisc_state2_t {
        /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly.
         * Use qdisc_run_begin/end() or qdisc_is_running() instead.
         */
        __QDISC_STATE2_RUNNING,
};

#define QDISC_STATE_MISSED        BIT(__QDISC_STATE_MISSED)
#define QDISC_STATE_DRAINING        BIT(__QDISC_STATE_DRAINING)

#define QDISC_STATE_NON_EMPTY        (QDISC_STATE_MISSED | \
                                        QDISC_STATE_DRAINING)

/*
 * Size table: tc_sizespec options followed by a flexible array of u16
 * entries.  Carries an rcu_head, a list linkage and an integer refcount.
 */
struct qdisc_size_table {
        struct rcu_head                rcu;
        struct list_head        list;
        struct tc_sizespec        szopts;
        int                        refcnt;
        u16                        data[];
};

/* similar to sk_buff_head, but skb->prev pointer is undefined.
 * Holds the head and tail skb, the queue length and a spinlock.
 */
struct qdisc_skb_head {
        struct sk_buff        *head;
        struct sk_buff        *tail;
        __u32                qlen;
        spinlock_t        lock;
};

/*
 * struct Qdisc - a queueing discipline instance.
 *
 * The first part holds the enqueue/dequeue hooks, the TCQ_F_* @flags,
 * identification (@handle, @parent, @ops) and statistics pointers.  The
 * frequently written fields start at @gso_skb, which is cacheline-aligned
 * on SMP, and @busylock starts another aligned group.  Per-qdisc private
 * data follows the structure in @privdata.
 */
struct Qdisc {
        int                         (*enqueue)(struct sk_buff *skb,
                                           struct Qdisc *sch,
                                           struct sk_buff **to_free);
        struct sk_buff *        (*dequeue)(struct Qdisc *sch);
        unsigned int                flags;
#define TCQ_F_BUILTIN                1
#define TCQ_F_INGRESS                2
#define TCQ_F_CAN_BYPASS        4
#define TCQ_F_MQROOT                8
#define TCQ_F_ONETXQUEUE        0x10 /* dequeue_skb() can assume all skbs are for
                                      * q->dev_queue : It can test
                                      * netif_xmit_frozen_or_stopped() before
                                      * dequeueing next packet.
                                      * Its true for MQ/MQPRIO slaves, or non
                                      * multiqueue device.
                                      */
#define TCQ_F_WARN_NONWC        (1 << 16)
#define TCQ_F_CPUSTATS                0x20 /* run using percpu statistics */
#define TCQ_F_NOPARENT                0x40 /* root of its hierarchy :
                                      * qdisc_tree_decrease_qlen() should stop.
                                      */
#define TCQ_F_INVISIBLE                0x80 /* invisible by default in dump */
#define TCQ_F_NOLOCK                0x100 /* qdisc does not require locking */
#define TCQ_F_OFFLOADED                0x200 /* qdisc is offloaded to HW */
        u32                        limit;
        const struct Qdisc_ops        *ops;
        struct qdisc_size_table        __rcu *stab;
        struct hlist_node       hash;
        u32                        handle;
        u32                        parent;

        struct netdev_queue        *dev_queue;

        struct net_rate_estimator __rcu *rate_est;
        struct gnet_stats_basic_sync __percpu *cpu_bstats;
        struct gnet_stats_queue        __percpu *cpu_qstats;
        int                        pad;
        refcount_t                refcnt;

        /*
         * For performance sake on SMP, we put highly modified fields at the end
         */
        struct sk_buff_head        gso_skb ____cacheline_aligned_in_smp;
        struct qdisc_skb_head        q;
        struct gnet_stats_basic_sync bstats;
        struct gnet_stats_queue        qstats;
        int                     owner;
        unsigned long                state;
        unsigned long                state2; /* must be written under qdisc spinlock */
        struct Qdisc            *next_sched;
        struct sk_buff_head        skb_bad_txq;

        spinlock_t                busylock ____cacheline_aligned_in_smp;
        spinlock_t                seqlock;

        struct rcu_head                rcu;
        netdevice_tracker        dev_tracker;
        struct lock_class_key        root_lock_key;
        /* private data */
        long privdata[] ____cacheline_aligned;
};

/*
 * Take a reference on @qdisc.  Qdiscs flagged TCQ_F_BUILTIN are not
 * reference counted, so the call is a no-op for them.
 */
static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
{
        if (!(qdisc->flags & TCQ_F_BUILTIN))
                refcount_inc(&qdisc->refcnt);
}

/*
 * Drop the reference on @qdisc only if it is the last one.  Returns the
 * result of refcount_dec_if_one(), or true without touching the counter
 * for TCQ_F_BUILTIN qdiscs.
 */
static inline bool qdisc_refcount_dec_if_one(struct Qdisc *qdisc)
{
        if (!(qdisc->flags & TCQ_F_BUILTIN))
                return refcount_dec_if_one(&qdisc->refcnt);

        return true;
}

/* Intended to be used by unlocked users, when concurrent qdisc release is
 * possible.
 *
 * Returns @qdisc when a reference was obtained (or none is needed because
 * the qdisc is TCQ_F_BUILTIN), NULL when the refcount was already zero.
 */
static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc)
{
        if ((qdisc->flags & TCQ_F_BUILTIN) ||
            refcount_inc_not_zero(&qdisc->refcnt))
                return qdisc;

        return NULL;
}

/* For !TCQ_F_NOLOCK qdisc: callers must either call this within a qdisc
 * root_lock section, or provide their own memory barriers -- ordering
 * against qdisc_run_begin/end() atomic bit operations.
 *
 * A TCQ_F_NOLOCK qdisc is running while its seqlock is held; any other
 * qdisc is running while __QDISC_STATE2_RUNNING is set in state2.
 */
static inline bool qdisc_is_running(struct Qdisc *qdisc)
{
        if (!(qdisc->flags & TCQ_F_NOLOCK))
                return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);

        return spin_is_locked(&qdisc->seqlock);
}

/*
 * Emptiness test based on the state word: the qdisc is reported empty
 * when neither bit of QDISC_STATE_NON_EMPTY is set.
 */
static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc)
{
        unsigned long state = READ_ONCE(qdisc->state);

        return (state & QDISC_STATE_NON_EMPTY) == 0;
}

/* True when @q has TCQ_F_CPUSTATS set, i.e. runs using percpu statistics. */
static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
{
        return (q->flags & TCQ_F_CPUSTATS) != 0;
}

/*
 * Emptiness test for @qdisc: percpu-stats qdiscs are judged by their state
 * bits via nolock_qdisc_is_empty(), all others by q.qlen being zero.
 */
static inline bool qdisc_is_empty(const struct Qdisc *qdisc)
{
        return qdisc_is_percpu_stats(qdisc) ?
                nolock_qdisc_is_empty(qdisc) :
                !READ_ONCE(qdisc->q.qlen);
}

/* For !TCQ_F_NOLOCK qdisc, qdisc_run_begin/end() must be invoked with
 * the qdisc root lock acquired.
 *
 * Try to become the runner of @qdisc.  Returns true when the caller now
 * owns the run (seqlock taken for TCQ_F_NOLOCK qdiscs, RUNNING bit newly
 * set otherwise) and false when it does not.
 */
static inline bool qdisc_run_begin(struct Qdisc *qdisc)
{
        if (qdisc->flags & TCQ_F_NOLOCK) {
                if (spin_trylock(&qdisc->seqlock))
                        return true;

                /* No need to insist if the MISSED flag was already set.
                 * Note that test_and_set_bit() also gives us memory ordering
                 * guarantees wrt potential earlier enqueue() and below
                 * spin_trylock(), both of which are necessary to prevent races
                 */
                if (test_and_set_bit(__QDISC_STATE_MISSED, &qdisc->state))
                        return false;

                /* Try to take the lock again to make sure that we will either
                 * grab it or the CPU that still has it will see MISSED set
                 * when testing it in qdisc_run_end()
                 */
                return spin_trylock(&qdisc->seqlock);
        }
        /* Non-atomic variant; see the locking requirement stated above. */
        return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
}

/*
 * Counterpart of qdisc_run_begin(): give up the run of @qdisc.
 *
 * TCQ_F_NOLOCK qdiscs release the seqlock and, if the MISSED bit is set
 * afterwards, reschedule the qdisc with __netif_schedule().  Other qdiscs
 * simply clear the RUNNING bit in state2.
 */
static inline void qdisc_run_end(struct Qdisc *qdisc)
{
        if (qdisc->flags & TCQ_F_NOLOCK) {
                spin_unlock(&qdisc->seqlock);

                /* spin_unlock() only has store-release semantic. The unlock
                 * and test_bit() ordering is a store-load ordering, so a full
                 * memory barrier is needed here.
                 */
                smp_mb();

                if (unlikely(test_bit(__QDISC_STATE_MISSED,
                                      &qdisc->state)))
                        __netif_schedule(qdisc);
        } else {
                __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
        }
}

/* True when @qdisc has TCQ_F_ONETXQUEUE set. */
static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
{
        return (qdisc->flags & TCQ_F_ONETXQUEUE) != 0;
}

/* Bulk limit for @txq: whatever netdev_queue_dql_avail() reports for it. */
static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq)
{
        return netdev_queue_dql_avail(txq);
}

/*
 * Class-level operations of a qdisc, grouped into child-qdisc handling,
 * class manipulation, filter handling and rtnetlink dumping.  @flags takes
 * values from enum qdisc_class_ops_flags.  In these callbacks a class is
 * identified by an unsigned long value, such as the one @find returns.
 */
struct Qdisc_class_ops {
        unsigned int                flags;
        /* Child qdisc manipulation */
        struct netdev_queue *        (*select_queue)(struct Qdisc *, struct tcmsg *);
        int                        (*graft)(struct Qdisc *, unsigned long cl,
                                        struct Qdisc *, struct Qdisc **,
                                        struct netlink_ext_ack *extack);
        struct Qdisc *                (*leaf)(struct Qdisc *, unsigned long cl);
        void                        (*qlen_notify)(struct Qdisc *, unsigned long);

        /* Class manipulation routines */
        unsigned long                (*find)(struct Qdisc *, u32 classid);
        int                        (*change)(struct Qdisc *, u32, u32,
                                        struct nlattr **, unsigned long *,
                                        struct netlink_ext_ack *);
        int                        (*delete)(struct Qdisc *, unsigned long,
                                          struct netlink_ext_ack *);
        void                        (*walk)(struct Qdisc *, struct qdisc_walker * arg);

        /* Filter manipulation */
        struct tcf_block *        (*tcf_block)(struct Qdisc *sch,
                                             unsigned long arg,
                                             struct netlink_ext_ack *extack);
        unsigned long                (*bind_tcf)(struct Qdisc *, unsigned long,
                                        u32 classid);
        void                        (*unbind_tcf)(struct Qdisc *, unsigned long);

        /* rtnetlink specific */
        int                        (*dump)(struct Qdisc *, unsigned long,
                                        struct sk_buff *skb, struct tcmsg*);
        int                        (*dump_stats)(struct Qdisc *, unsigned long,
                                        struct gnet_dump *);
};

/* Qdisc_class_ops flag values (stored in Qdisc_class_ops::flags) */

/* Implements API that doesn't require rtnl lock */
enum qdisc_class_ops_flags {
        QDISC_CLASS_OPS_DOIT_UNLOCKED = 1,
};

/*
 * Operations table describing one qdisc type: its identifier (@id), size
 * of per-instance private data (@priv_size), optional class operations
 * (@cl_ops), the packet path hooks (enqueue/dequeue/peek), lifecycle and
 * configuration hooks, dump hooks, block index accessors and the owning
 * module.  Tables are chained through @next.
 */
struct Qdisc_ops {
        struct Qdisc_ops        *next;
        const struct Qdisc_class_ops        *cl_ops;
        char                        id[IFNAMSIZ];
        int                        priv_size;
        unsigned int                static_flags;

        int                         (*enqueue)(struct sk_buff *skb,
                                           struct Qdisc *sch,
                                           struct sk_buff **to_free);
        struct sk_buff *        (*dequeue)(struct Qdisc *);
        struct sk_buff *        (*peek)(struct Qdisc *);

        int                        (*init)(struct Qdisc *sch, struct nlattr *arg,
                                        struct netlink_ext_ack *extack);
        void                        (*reset)(struct Qdisc *);
        void                        (*destroy)(struct Qdisc *);
        int                        (*change)(struct Qdisc *sch,
                                          struct nlattr *arg,
                                          struct netlink_ext_ack *extack);
        void                        (*attach)(struct Qdisc *sch);
        int                        (*change_tx_queue_len)(struct Qdisc *, unsigned int);
        void                        (*change_real_num_tx)(struct Qdisc *sch,
                                                      unsigned int new_real_tx);

        int                        (*dump)(struct Qdisc *, struct sk_buff *);
        int                        (*dump_stats)(struct Qdisc *, struct gnet_dump *);

        void                        (*ingress_block_set)(struct Qdisc *sch,
                                                     u32 block_index);
        void                        (*egress_block_set)(struct Qdisc *sch,
                                                    u32 block_index);
        u32                        (*ingress_block_get)(struct Qdisc *sch);
        u32                        (*egress_block_get)(struct Qdisc *sch);

        struct module                *owner;
};

/*
 * Classification result.  The union holds either a (class, classid) pair
 * or a tcf_proto pointer in @goto_tp; the two views share storage.
 */
struct tcf_result {
        union {
                struct {
                        unsigned long        class;
                        u32                classid;
                };
                const struct tcf_proto *goto_tp;
        };
};

struct tcf_chain;

/* Operations table for one classifier kind. Instances are linked through
 * @head and identified by the @kind string; the remaining members are
 * callbacks plus the owning module and a flags word.
 */
struct tcf_proto_ops {
        struct list_head        head;
        char                        kind[IFNAMSIZ];

        /* Same signature as the classify pointer kept in struct tcf_proto. */
        int                        (*classify)(struct sk_buff *,
                                            const struct tcf_proto *,
                                            struct tcf_result *);
        int                        (*init)(struct tcf_proto*);
        void                        (*destroy)(struct tcf_proto *tp, bool rtnl_held,
                                           struct netlink_ext_ack *extack);

        /* Per-filter handle management. */
        void*                        (*get)(struct tcf_proto*, u32 handle);
        void                        (*put)(struct tcf_proto *tp, void *f);
        int                        (*change)(struct net *net, struct sk_buff *,
                                        struct tcf_proto*, unsigned long,
                                        u32 handle, struct nlattr **,
                                        void **, u32,
                                        struct netlink_ext_ack *);
        int                        (*delete)(struct tcf_proto *tp, void *arg,
                                          bool *last, bool rtnl_held,
                                          struct netlink_ext_ack *);
        bool                        (*delete_empty)(struct tcf_proto *tp);
        void                        (*walk)(struct tcf_proto *tp,
                                        struct tcf_walker *arg, bool rtnl_held);
        int                        (*reoffload)(struct tcf_proto *tp, bool add,
                                             flow_setup_cb_t *cb, void *cb_priv,
                                             struct netlink_ext_ack *extack);
        void                        (*hw_add)(struct tcf_proto *tp,
                                          void *type_data);
        void                        (*hw_del)(struct tcf_proto *tp,
                                          void *type_data);
        void                        (*bind_class)(void *, u32, unsigned long,
                                              void *, unsigned long);
        /* Chain template hooks; they operate on an opaque tmplt_priv. */
        void *                        (*tmplt_create)(struct net *net,
                                                struct tcf_chain *chain,
                                                struct nlattr **tca,
                                                struct netlink_ext_ack *extack);
        void                        (*tmplt_destroy)(void *tmplt_priv);
        void                        (*tmplt_reoffload)(struct tcf_chain *chain,
                                                   bool add,
                                                   flow_setup_cb_t *cb,
                                                   void *cb_priv);
        struct tcf_exts *        (*get_exts)(const struct tcf_proto *tp,
                                            u32 handle);

        /* rtnetlink specific */
        int                        (*dump)(struct net*, struct tcf_proto*, void *,
                                        struct sk_buff *skb, struct tcmsg*,
                                        bool);
        int                        (*terse_dump)(struct net *net,
                                              struct tcf_proto *tp, void *fh,
                                              struct sk_buff *skb,
                                              struct tcmsg *t, bool rtnl_held);
        int                        (*tmplt_dump)(struct sk_buff *skb,
                                              struct net *net,
                                              void *tmplt_priv);

        struct module                *owner;
        /* Bitmask of enum tcf_proto_ops_flags values. */
        int                        flags;
};

/* Classifiers setting TCF_PROTO_OPS_DOIT_UNLOCKED in tcf_proto_ops->flags
 * are expected to implement tcf_proto_ops->delete_empty(); otherwise race
 * conditions can occur when filters are inserted and deleted concurrently.
 */
enum tcf_proto_ops_flags {
        TCF_PROTO_OPS_DOIT_UNLOCKED = 1,
};

/* One classifier instance. Instances are chained through the RCU-protected
 * @next pointer and belong to a tcf_chain (@chain).
 */
struct tcf_proto {
        /* Fast access part */
        struct tcf_proto __rcu        *next;
        void __rcu                *root;

        /* called under RCU BH lock */
        int                        (*classify)(struct sk_buff *,
                                            const struct tcf_proto *,
                                            struct tcf_result *);
        __be16                        protocol;

        /* All the rest */
        u32                        prio;
        void                        *data;
        const struct tcf_proto_ops        *ops;
        struct tcf_chain        *chain;
        /* Lock protects tcf_proto shared state and can be used by unlocked
         * classifiers to protect their private data.
         */
        spinlock_t                lock;
        bool                        deleting;
        bool                        counted;
        refcount_t                refcnt;
        struct rcu_head                rcu;
        struct hlist_node        destroy_ht_node;
};

/* Layout that the qdisc layer overlays on skb->cb (see qdisc_skb_cb()):
 * a small fixed header followed by QDISC_CB_PRIV_LEN bytes of private data.
 */
struct qdisc_skb_cb {
        struct {
                unsigned int                pkt_len;
                u16                        slave_dev_queue_mapping;
                u16                        tc_classid;
        };
/* Size in bytes of the private area below. */
#define QDISC_CB_PRIV_LEN 20
        unsigned char                data[QDISC_CB_PRIV_LEN];
};

/* Callback type taking a chain head filter and an opaque @priv cookie. */
typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv);

/* A filter chain inside a tcf_block: an RCU-protected list of tcf_proto
 * instances headed by @filter_chain, linked into the block through @list.
 */
struct tcf_chain {
        /* Protects filter_chain. */
        struct mutex filter_chain_lock;
        struct tcf_proto __rcu *filter_chain;
        struct list_head list;
        struct tcf_block *block;
        u32 index; /* chain index */
        unsigned int refcnt;
        unsigned int action_refcnt;
        bool explicitly_created;
        bool flushing;
        /* Template ops and their private data (see tcf_proto_ops tmplt_*). */
        const struct tcf_proto_ops *tmplt_ops;
        void *tmplt_priv;
        struct rcu_head rcu;
};

/* A block of filter chains, together with offload bookkeeping and a hash
 * table (with its own lock) for tcf_proto destruction.
 */
struct tcf_block {
        struct xarray ports; /* datapath accessible */
        /* Lock protects tcf_block and lifetime-management data of chains
         * attached to the block (refcnt, action_refcnt, explicitly_created).
         */
        struct mutex lock;
        struct list_head chain_list;
        u32 index; /* block index for shared blocks */
        u32 classid; /* which class this block belongs to */
        refcount_t refcnt;
        struct net *net;
        struct Qdisc *q;
        struct rw_semaphore cb_lock; /* protects cb_list and offload counters */
        struct flow_block flow_block;
        struct list_head owner_list;
        bool keep_dst;
        bool bypass_wanted;
        atomic_t filtercnt; /* Number of filters */
        atomic_t skipswcnt; /* Number of skip_sw filters */
        atomic_t offloadcnt; /* Number of offloaded filters */
        unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */
        unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */
        struct {
                struct tcf_chain *chain;
                struct list_head filter_chain_list;
        } chain0;
        struct rcu_head rcu;
        DECLARE_HASHTABLE(proto_destroy_ht, 7);
        struct mutex proto_destroy_lock; /* Lock for proto_destroy hashtable. */
};

struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index);

/* Lockdep predicate: reports whether @chain's filter_chain_lock is held. */
static inline bool lockdep_tcf_chain_is_locked(struct tcf_chain *chain)
{
        return lockdep_is_held(&chain->filter_chain_lock);
}

/* Lockdep predicate: reports whether @tp's spinlock is held. */
static inline bool lockdep_tcf_proto_is_locked(struct tcf_proto *tp)
{
        return lockdep_is_held(&tp->lock);
}

/* Dereference RCU pointer @p on the update side; the lockdep condition is
 * that @chain's filter_chain_lock is held.
 */
#define tcf_chain_dereference(p, chain)                                        \
        rcu_dereference_protected(p, lockdep_tcf_chain_is_locked(chain))

/* Dereference RCU pointer @p on the update side; the lockdep condition is
 * that @tp's lock is held.
 */
#define tcf_proto_dereference(p, tp)                                        \
        rcu_dereference_protected(p, lockdep_tcf_proto_is_locked(tp))

/* Build-time check only: struct qdisc_skb_cb must fit in skb->cb, and a
 * private area of @sz bytes must fit in its data[] member.
 */
static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
{
        struct qdisc_skb_cb *layout;

        BUILD_BUG_ON(sizeof(*layout) > sizeof(skb->cb));
        BUILD_BUG_ON(sz > sizeof(layout->data));
}

static inline int qdisc_qlen(const struct Qdisc *q)
{
        return q->q.qlen;
}

static inline int qdisc_qlen_sum(const struct Qdisc *q)
{
        __u32 qlen = q->qstats.qlen;
        int i;

        if (qdisc_is_percpu_stats(q)) {
                for_each_possible_cpu(i)
                        qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen;
        } else {
                qlen += q->q.qlen;
        }

        return qlen;
}

/* View the start of @skb's control buffer as a struct qdisc_skb_cb. */
static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb)
{
        return (struct qdisc_skb_cb *)&skb->cb[0];
}

static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc)
{
        return &qdisc->q.lock;
}

/* Return the qdisc currently set on @qdisc's device queue
 * (dereferenced with rcu_dereference_rtnl()).
 */
static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc)
{
        return rcu_dereference_rtnl(qdisc->dev_queue->qdisc);
}

/* Same lookup as qdisc_root(), but using rcu_dereference_bh(). */
static inline struct Qdisc *qdisc_root_bh(const struct Qdisc *qdisc)
{
        struct Qdisc *root = rcu_dereference_bh(qdisc->dev_queue->qdisc);

        return root;
}

/* Return the qdisc_sleeping pointer of @qdisc's device queue
 * (dereferenced with rcu_dereference_rtnl()).
 */
static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc)
{
        struct Qdisc *sleeping;

        sleeping = rcu_dereference_rtnl(qdisc->dev_queue->qdisc_sleeping);
        return sleeping;
}

/* Return the lock of the sleeping root qdisc; asserts RTNL before
 * handing the lock out.
 */
static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
{
        struct Qdisc *sleeping_root;

        sleeping_root = qdisc_root_sleeping(qdisc);
        ASSERT_RTNL();

        return qdisc_lock(sleeping_root);
}

static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc)
{
        return qdisc->dev_queue->dev;
}

static inline void sch_tree_lock(struct Qdisc *q)
{
        if (q->flags & TCQ_F_MQROOT)
                spin_lock_bh(qdisc_lock(q));
        else
                spin_lock_bh(qdisc_root_sleeping_lock(q));
}

static inline void sch_tree_unlock(struct Qdisc *q)
{
        if (q->flags & TCQ_F_MQROOT)
                spin_unlock_bh(qdisc_lock(q));
        else
                spin_unlock_bh(qdisc_root_sleeping_lock(q));
}

extern struct Qdisc noop_qdisc;
extern struct Qdisc_ops noop_qdisc_ops;
extern struct Qdisc_ops pfifo_fast_ops;
extern const u8 sch_default_prio2band[TC_PRIO_MAX + 1];
extern struct Qdisc_ops mq_qdisc_ops;
extern struct Qdisc_ops noqueue_qdisc_ops;
extern const struct Qdisc_ops *default_qdisc_ops;
/* Pick the ops for TX queue @ntx: default_qdisc_ops while @ntx is below
 * dev->real_num_tx_queues, pfifo_fast_ops otherwise.
 */
static inline const struct Qdisc_ops *
get_default_qdisc_ops(const struct net_device *dev, int ntx)
{
        if (ntx < dev->real_num_tx_queues)
                return default_qdisc_ops;

        return &pfifo_fast_ops;
}

/* Common part of a qdisc class: its id, a filter counter (see
 * qdisc_class_get()/qdisc_class_put()) and the hash-list linkage.
 */
struct Qdisc_class_common {
        u32                        classid;
        unsigned int                filter_cnt;
        struct hlist_node        hnode;
};

/* Hash table of classes keyed by classid. Buckets are selected with
 * qdisc_class_hash(id, hashmask) in qdisc_class_find().
 */
struct Qdisc_class_hash {
        struct hlist_head        *hash;
        unsigned int                hashsize;
        unsigned int                hashmask;
        unsigned int                hashelems;
};

/* Fold @id with two xor-shift steps (by 8, then by 4) and mask the result
 * with @mask to obtain a bucket index.
 */
static inline unsigned int qdisc_class_hash(u32 id, u32 mask)
{
        u32 folded = id ^ (id >> 8);

        folded ^= folded >> 4;

        return folded & mask;
}

/* Look up the class with classid @id in @hash.
 * Returns NULL when @id is 0 or no entry in the bucket matches.
 */
static inline struct Qdisc_class_common *
qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id)
{
        struct Qdisc_class_common *entry;
        unsigned int bucket;

        if (id == 0)
                return NULL;

        bucket = qdisc_class_hash(id, hash->hashmask);
        hlist_for_each_entry(entry, &hash->hash[bucket], hnode) {
                if (entry->classid != id)
                        continue;
                return entry;
        }

        return NULL;
}

/* True when @cl's filter counter is non-zero. */
static inline bool qdisc_class_in_use(const struct Qdisc_class_common *cl)
{
        return cl->filter_cnt != 0;
}

/* Increment @cl's filter counter. An overflow triggers a WARN; the
 * (wrapped) result is stored either way.
 */
static inline void qdisc_class_get(struct Qdisc_class_common *cl)
{
        unsigned int next;

        WARN(check_add_overflow(cl->filter_cnt, 1, &next),
             "Qdisc class overflow");

        cl->filter_cnt = next;
}

/* Decrement @cl's filter counter. An underflow triggers a WARN; the
 * (wrapped) result is stored either way.
 */
static inline void qdisc_class_put(struct Qdisc_class_common *cl)
{
        unsigned int next;

        WARN(check_sub_overflow(cl->filter_cnt, 1, &next),
             "Qdisc class underflow");

        cl->filter_cnt = next;
}

/* Map @classid to a traffic-class index: the minor part of the classid
 * minus TC_H_MIN_PRIORITY. Returns -EINVAL when the index is not below
 * netdev_get_num_tc(@dev).
 */
static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid)
{
        u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY;

        /* Unsigned compare, so a wrapped-around hwtc is rejected too. */
        if (hwtc < netdev_get_num_tc(dev))
                return hwtc;

        return -EINVAL;
}

int qdisc_class_hash_init(struct Qdisc_class_hash *);
void qdisc_class_hash_insert(struct Qdisc_class_hash *,
                             struct Qdisc_class_common *);
void qdisc_class_hash_remove(struct Qdisc_class_hash *,
                             struct Qdisc_class_common *);
void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *);
void qdisc_class_hash_destroy(struct Qdisc_class_hash *);

int dev_qdisc_change_tx_queue_len(struct net_device *dev);
void dev_qdisc_change_real_num_tx(struct net_device *dev,
                                  unsigned int new_real_tx);
void dev_init_scheduler(struct net_device *dev);
void dev_shutdown(struct net_device *dev);
void dev_activate(struct net_device *dev);
void dev_deactivate(struct net_device *dev);
void dev_deactivate_many(struct list_head *head);
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
                              struct Qdisc *qdisc);
void qdisc_reset(struct Qdisc *qdisc);
void qdisc_destroy(struct Qdisc *qdisc);
void qdisc_put(struct Qdisc *qdisc);
void qdisc_put_unlocked(struct Qdisc *qdisc);
void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len);
#ifdef CONFIG_NET_SCHED
int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
                              void *type_data);
void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
                                struct Qdisc *new, struct Qdisc *old,
                                enum tc_setup_type type, void *type_data,
                                struct netlink_ext_ack *extack);
#else
/* Variant used when CONFIG_NET_SCHED is not set: only clears
 * TCQ_F_OFFLOADED on @q and reports success.
 */
static inline int
qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
                          void *type_data)
{
        q->flags = q->flags & ~TCQ_F_OFFLOADED;

        return 0;
}

/* Variant used when CONFIG_NET_SCHED is not set: intentionally does nothing. */
static inline void
qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
                           struct Qdisc *new, struct Qdisc *old,
                           enum tc_setup_type type, void *type_data,
                           struct netlink_ext_ack *extack)
{
}
#endif
void qdisc_offload_query_caps(struct net_device *dev,
                              enum tc_setup_type type,
                              void *caps, size_t caps_len);
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
                          const struct Qdisc_ops *ops,
                          struct netlink_ext_ack *extack);
void qdisc_free(struct Qdisc *qdisc);
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
                                const struct Qdisc_ops *ops, u32 parentid,
                                struct netlink_ext_ack *extack);
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
                               const struct qdisc_size_table *stab);
int skb_do_redirect(struct sk_buff *);

/* Report @skb's tc_at_ingress bit; always false without CONFIG_NET_XGRESS. */
static inline bool skb_at_tc_ingress(const struct sk_buff *skb)
{
        bool at_ingress = false;

#ifdef CONFIG_NET_XGRESS
        at_ingress = skb->tc_at_ingress;
#endif
        return at_ingress;
}

/* Test-and-clear of @skb's tc_skip_classify bit: returns true (and clears
 * the bit) when it was set. Always false without CONFIG_NET_CLS_ACT.
 */
static inline bool skb_skip_tc_classify(struct sk_buff *skb)
{
#ifdef CONFIG_NET_CLS_ACT
        if (!skb->tc_skip_classify)
                return false;

        skb->tc_skip_classify = 0;
        return true;
#else
        return false;
#endif
}

/* Reset all TX qdiscs greater than index of a device.  */
static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
{
        struct Qdisc *qdisc;

        for (; i < dev->num_tx_queues; i++) {
                qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
                if (qdisc) {
                        spin_lock_bh(qdisc_lock(qdisc));
                        qdisc_reset(qdisc);
                        spin_unlock_bh(qdisc_lock(qdisc));
                }
        }
}

/* Return true when qdisc_is_empty() holds for the qdisc of every TX queue
 * of @dev. The scan runs inside an RCU read-side section.
 */
static inline bool qdisc_all_tx_empty(const struct net_device *dev)
{
        bool all_empty = true;
        unsigned int idx;

        rcu_read_lock();
        for (idx = 0; idx < dev->num_tx_queues; idx++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, idx);
                const struct Qdisc *q = rcu_dereference(txq->qdisc);

                if (qdisc_is_empty(q))
                        continue;

                all_empty = false;
                break;
        }
        rcu_read_unlock();

        return all_empty;
}

/* Return true when, for at least one TX queue of @dev, the qdisc pointer
 * differs from the qdisc_sleeping pointer.
 */
static inline bool qdisc_tx_changing(const struct net_device *dev)
{
        unsigned int idx;

        for (idx = 0; idx < dev->num_tx_queues; idx++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, idx);
                const struct Qdisc *active = rcu_access_pointer(txq->qdisc);
                const struct Qdisc *sleeping =
                        rcu_access_pointer(txq->qdisc_sleeping);

                if (active != sleeping)
                        return true;
        }

        return false;
}

/* Return true when every TX queue of @dev points at noop_qdisc. */
static inline bool qdisc_tx_is_noop(const struct net_device *dev)
{
        unsigned int idx;

        for (idx = 0; idx < dev->num_tx_queues; idx++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, idx);

                if (rcu_access_pointer(txq->qdisc) == &noop_qdisc)
                        continue;

                return false;
        }

        return true;
}

static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb)
{
        return qdisc_skb_cb(skb)->pkt_len;
}

/* Additional qdisc xmit flags (see NET_XMIT_MASK in linux/netdevice.h). */
enum net_xmit_qdisc_t {
        __NET_XMIT_STOLEN = 0x00010000,
        __NET_XMIT_BYPASS = 0x00020000,
};

/* Number of drops to account for xmit result @e: with CONFIG_NET_CLS_ACT
 * a result carrying __NET_XMIT_STOLEN counts as 0, anything else as 1;
 * without it the count is always 1.
 */
#ifdef CONFIG_NET_CLS_ACT
#define net_xmit_drop_count(e)        ((e) & __NET_XMIT_STOLEN ? 0 : 1)
#else
#define net_xmit_drop_count(e)        (1)
#endif

/* If @sch has a size table, hand @skb and the table to
 * __qdisc_calculate_pkt_len(). Compiles to nothing without
 * CONFIG_NET_SCHED.
 */
static inline void qdisc_calculate_pkt_len(struct sk_buff *skb,
                                           const struct Qdisc *sch)
{
#ifdef CONFIG_NET_SCHED
        struct qdisc_size_table *size_table = rcu_dereference_bh(sch->stab);

        if (!size_table)
                return;

        __qdisc_calculate_pkt_len(skb, size_table);
#endif
}

/* Run the packet-length calculation for @skb, then pass it to @sch's
 * enqueue callback and return that callback's result.
 */
static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                                struct sk_buff **to_free)
{
        int ret;

        qdisc_calculate_pkt_len(skb, sch);
        ret = sch->enqueue(skb, sch, to_free);

        return ret;
}

/* Add @bytes and @packets to @bstats inside a u64_stats update section. */
static inline void _bstats_update(struct gnet_stats_basic_sync *bstats,
                                  __u64 bytes, __u32 packets)
{
        struct u64_stats_sync *sync = &bstats->syncp;

        u64_stats_update_begin(sync);
        u64_stats_add(&bstats->bytes, bytes);
        u64_stats_add(&bstats->packets, packets);
        u64_stats_update_end(sync);
}

static inline void bstats_update(struct gnet_stats_basic_sync *bstats,
                                 const struct sk_buff *skb)
{
        _bstats_update(bstats,
                       qdisc_pkt_len(skb),
                       skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1);
}

/* Account @skb in this CPU's instance of @sch's per-cpu bstats. */
static inline void qdisc_bstats_cpu_update(struct Qdisc *sch,
                                           const struct sk_buff *skb)
{
        struct gnet_stats_basic_sync *cpu_stats = this_cpu_ptr(sch->cpu_bstats);

        bstats_update(cpu_stats, skb);
}

/* Account @skb in @sch's embedded (non per-cpu) bstats. */
static inline void qdisc_bstats_update(struct Qdisc *sch,
                                       const struct sk_buff *skb)
{
        struct gnet_stats_basic_sync *stats = &sch->bstats;

        bstats_update(stats, skb);
}

/* Subtract @skb's qdisc packet length from @sch's backlog counter. */
static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch,
                                            const struct sk_buff *skb)
{
        unsigned int len = qdisc_pkt_len(skb);

        sch->qstats.backlog -= len;
}

/* Subtract @skb's qdisc packet length from this CPU's backlog counter. */
static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch,
                                                const struct sk_buff *skb)
{
        unsigned int len = qdisc_pkt_len(skb);

        this_cpu_sub(sch->cpu_qstats->backlog, len);
}

/* Add @skb's qdisc packet length to @sch's backlog counter. */
static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch,
                                            const struct sk_buff *skb)
{
        unsigned int len = qdisc_pkt_len(skb);

        sch->qstats.backlog += len;
}

/* Add @skb's qdisc packet length to this CPU's backlog counter. */
static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch,
                                                const struct sk_buff *skb)
{
        unsigned int len = qdisc_pkt_len(skb);

        this_cpu_add(sch->cpu_qstats->backlog, len);
}

/* Increment this CPU's qlen counter of @sch. */
static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch)
{
        this_cpu_inc(sch->cpu_qstats->qlen);
}

/* Decrement this CPU's qlen counter of @sch. */
static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch)
{
        this_cpu_dec(sch->cpu_qstats->qlen);
}

/* Increment this CPU's requeues counter of @sch. */
static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch)
{
        this_cpu_inc(sch->cpu_qstats->requeues);
}

static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count)
{
        sch->qstats.drops += count;
}

/* Bump the drops counter of @qstats by one. */
static inline void qstats_drop_inc(struct gnet_stats_queue *qstats)
{
        qstats->drops += 1;
}

/* Bump the overlimits counter of @qstats by one. */
static inline void qstats_overlimit_inc(struct gnet_stats_queue *qstats)
{
        qstats->overlimits += 1;
}

/* Count one drop in @sch's embedded queue stats. */
static inline void qdisc_qstats_drop(struct Qdisc *sch)
{
        struct gnet_stats_queue *stats = &sch->qstats;

        qstats_drop_inc(stats);
}

/* Increment this CPU's drops counter of @sch. */
static inline void qdisc_qstats_cpu_drop(struct Qdisc *sch)
{
        this_cpu_inc(sch->cpu_qstats->drops);
}

/* Count one overlimit event in @sch's embedded queue stats. */
static inline void qdisc_qstats_overlimit(struct Qdisc *sch)
{
        qstats_overlimit_inc(&sch->qstats);
}

/* Copy @sch's queue stats into dump @d, passing the summed queue length
 * from qdisc_qlen_sum(). Returns gnet_stats_copy_queue()'s result.
 */
static inline int qdisc_qstats_copy(struct gnet_dump *d, struct Qdisc *sch)
{
        __u32 total_qlen;

        total_qlen = qdisc_qlen_sum(sch);

        return gnet_stats_copy_queue(d, sch->cpu_qstats, &sch->qstats,
                                     total_qlen);
}

/* Snapshot @sch's queue length and backlog. The stats are accumulated
 * into a zeroed local via gnet_stats_add_queue(); *@qlen additionally
 * includes qdisc_qlen(@sch).
 */
static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch,  __u32 *qlen,
                                             __u32 *backlog)
{
        struct gnet_stats_queue snapshot = { 0 };

        gnet_stats_add_queue(&snapshot, sch->cpu_qstats, &sch->qstats);

        *qlen = snapshot.qlen + qdisc_qlen(sch);
        *backlog = snapshot.backlog;
}

/* Pass @sch's current queue length and backlog to
 * qdisc_tree_reduce_backlog().
 */
static inline void qdisc_tree_flush_backlog(struct Qdisc *sch)
{
        __u32 pkts, bytes;

        qdisc_qstats_qlen_backlog(sch, &pkts, &bytes);
        qdisc_tree_reduce_backlog(sch, pkts, bytes);
}

/* Reset @sch and pass the queue length and backlog it held to
 * qdisc_tree_reduce_backlog().
 */
static inline void qdisc_purge_queue(struct Qdisc *sch)
{
        __u32 pkts, bytes;

        /* Snapshot first: the values are taken before qdisc_reset() runs. */
        qdisc_qstats_qlen_backlog(sch, &pkts, &bytes);
        qdisc_reset(sch);
        qdisc_tree_reduce_backlog(sch, pkts, bytes);
}

/* Append @skb to the singly linked list in @qh and bump qlen.
 * skb->next is only cleared when the list already had a tail.
 */
static inline void __qdisc_enqueue_tail(struct sk_buff *skb,
                                        struct qdisc_skb_head *qh)
{
        struct sk_buff *old_tail = qh->tail;

        if (!old_tail) {
                /* Empty list: skb becomes the head as well. */
                qh->head = skb;
        } else {
                skb->next = NULL;
                old_tail->next = skb;
        }
        qh->tail = skb;
        qh->qlen++;
}

/* Append @skb to @sch's queue, add its length to the backlog and return
 * NET_XMIT_SUCCESS.
 */
static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch)
{
        struct qdisc_skb_head *head = &sch->q;

        __qdisc_enqueue_tail(skb, head);
        qdisc_qstats_backlog_inc(sch, skb);

        return NET_XMIT_SUCCESS;
}

static inline void __qdisc_enqueue_head(struct sk_buff *skb,
                                        struct qdisc_skb_head *qh)
{
        skb->next = qh->head;

        if (!qh->head)
                qh->tail = skb;
        qh->head = skb;
        qh->qlen++;
}

/* Unlink and return the first skb of @qh, or NULL when the list is empty.
 * The returned skb has its next pointer cleared.
 */
static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh)
{
        struct sk_buff *first = qh->head;

        if (unlikely(first == NULL))
                return NULL;

        qh->head = first->next;
        qh->qlen--;
        if (!qh->head)
                qh->tail = NULL;
        first->next = NULL;

        return first;
}

/* Dequeue the first skb of @sch's queue. When one is returned, the
 * backlog is reduced and the byte/packet stats are updated for it.
 */
static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch)
{
        struct sk_buff *first = __qdisc_dequeue_head(&sch->q);

        if (unlikely(first == NULL))
                return NULL;

        qdisc_qstats_backlog_dec(sch, first);
        qdisc_bstats_update(sch, first);

        return first;
}

/* Control-block layout overlaid on skb->cb by tc_skb_cb(): the qdisc
 * control block first, followed by tc-specific fields.
 */
struct tc_skb_cb {
        struct qdisc_skb_cb qdisc_cb;
        u32 drop_reason;

        u16 zone; /* Only valid if post_ct = true */
        u16 mru;
        u8 post_ct:1;
        u8 post_ct_snat:1;
        u8 post_ct_dnat:1;
};

/* View @skb's control buffer as a struct tc_skb_cb; the build fails if
 * that struct does not fit in sk_buff::cb.
 */
static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb)
{
        BUILD_BUG_ON(sizeof(struct tc_skb_cb) >
                     sizeof_field(struct sk_buff, cb));

        return (struct tc_skb_cb *)skb->cb;
}

static inline enum skb_drop_reason
tcf_get_drop_reason(const struct sk_buff *skb)
{
        return tc_skb_cb(skb)->drop_reason;
}

static inline void tcf_set_drop_reason(const struct sk_buff *skb,
                                       enum skb_drop_reason reason)
{
        tc_skb_cb(skb)->drop_reason = reason;
}

/* Rather than calling kfree_skb() while the root qdisc lock is held,
 * push @skb onto the *@to_free list so it can be freed later, at the end
 * of __dev_xmit_skb().
 */
static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free)
{
        struct sk_buff *pending = *to_free;

        skb->next = pending;
        *to_free = skb;
}

/* Push @skb onto the *@to_free list. The old list is linked behind
 * skb->prev when that pointer is set, otherwise directly behind @skb.
 */
static inline void __qdisc_drop_all(struct sk_buff *skb,
                                    struct sk_buff **to_free)
{
        struct sk_buff **link;

        link = skb->prev ? &skb->prev->next : &skb->next;
        *link = *to_free;
        *to_free = skb;
}

/* Drop the first skb of @qh: reduce @sch's backlog, queue the skb on
 * *@to_free and return its packet length. Returns 0 on an empty list.
 */
static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch,
                                                   struct qdisc_skb_head *qh,
                                                   struct sk_buff **to_free)
{
        struct sk_buff *victim = __qdisc_dequeue_head(qh);
        unsigned int dropped_len;

        if (unlikely(victim == NULL))
                return 0;

        dropped_len = qdisc_pkt_len(victim);
        qdisc_qstats_backlog_dec(sch, victim);
        __qdisc_drop(victim, to_free);

        return dropped_len;
}

/* Return the first skb of @sch's queue without removing it. */
static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch)
{
        return sch->q.head;
}

/* generic pseudo peek method for non-work-conserving qdisc */
static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch)
{
        struct sk_buff *skb = skb_peek(&sch->gso_skb);

        /* we can reuse ->gso_skb because peek isn't called for root qdiscs */
        if (!skb) {
                skb = sch->dequeue(sch);

                if (skb) {
                        __skb_queue_head(&sch->gso_skb, skb);
                        /* it's still part of the queue */
                        qdisc_qstats_backlog_inc(sch, skb);
                        sch->q.qlen++;
                }
        }

        return skb;
}

/* Dequeue-side accounting for @skb: backlog down, bstats updated, qlen
 * down - on the per-cpu counters when @sch uses per-cpu stats, on the
 * embedded ones otherwise.
 */
static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch,
                                                 struct sk_buff *skb)
{
        if (!qdisc_is_percpu_stats(sch)) {
                qdisc_qstats_backlog_dec(sch, skb);
                qdisc_bstats_update(sch, skb);
                sch->q.qlen--;
                return;
        }

        qdisc_qstats_cpu_backlog_dec(sch, skb);
        qdisc_bstats_cpu_update(sch, skb);
        qdisc_qstats_cpu_qlen_dec(sch);
}

/* Enqueue-side accounting for a packet of @pkt_len bytes: qlen up by one
 * and backlog up by @pkt_len, on per-cpu or embedded counters depending
 * on qdisc_is_percpu_stats().
 */
static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch,
                                                 unsigned int pkt_len)
{
        if (!qdisc_is_percpu_stats(sch)) {
                sch->qstats.backlog += pkt_len;
                sch->q.qlen++;
                return;
        }

        qdisc_qstats_cpu_qlen_inc(sch);
        this_cpu_add(sch->cpu_qstats->backlog, pkt_len);
}

/* Use instead of qdisc->dequeue() for all qdiscs queried with ->peek():
 * returns the skb parked on ->gso_skb if there is one (undoing its
 * backlog/qlen accounting), otherwise falls through to ->dequeue().
 */
static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
{
        struct sk_buff *parked;

        if (!skb_peek(&sch->gso_skb))
                return sch->dequeue(sch);

        parked = __skb_dequeue(&sch->gso_skb);
        if (!qdisc_is_percpu_stats(sch)) {
                qdisc_qstats_backlog_dec(sch, parked);
                sch->q.qlen--;
        } else {
                qdisc_qstats_cpu_backlog_dec(sch, parked);
                qdisc_qstats_cpu_qlen_dec(sch);
        }

        return parked;
}

/* Free every skb on @qh via rtnl_kfree_skbs() and leave the list empty.
 * Asserts RTNL.
 */
static inline void __qdisc_reset_queue(struct qdisc_skb_head *qh)
{
        /*
         * The byte backlog of this list is unknown here; correcting it
         * is left to the caller.
         */
        ASSERT_RTNL();
        if (!qh->qlen)
                return;

        rtnl_kfree_skbs(qh->head, qh->tail);

        qh->head = NULL;
        qh->tail = NULL;
        qh->qlen = 0;
}

/* Empty @sch's embedded skb list (see __qdisc_reset_queue()). */
static inline void qdisc_reset_queue(struct Qdisc *sch)
{
        struct qdisc_skb_head *head = &sch->q;

        __qdisc_reset_queue(head);
}

/* Under the tree lock, store @new in *@pold and purge the queue of the
 * qdisc that was there before (if any). Returns the previous qdisc.
 */
static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new,
                                          struct Qdisc **pold)
{
        struct Qdisc *previous;

        sch_tree_lock(sch);
        previous = *pold;
        *pold = new;
        if (previous)
                qdisc_purge_queue(previous);
        sch_tree_unlock(sch);

        return previous;
}

static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
{
        rtnl_kfree_skbs(skb, skb);
        qdisc_qstats_drop(sch);
}

static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch,
                                 struct sk_buff **to_free)
{
        __qdisc_drop(skb, to_free);
        qdisc_qstats_cpu_drop(sch);

        return NET_XMIT_DROP;
}

static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
                             struct sk_buff **to_free)
{
        __qdisc_drop(skb, to_free);
        qdisc_qstats_drop(sch);

        return NET_XMIT_DROP;
}

static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch,
                                 struct sk_buff **to_free)
{
        __qdisc_drop_all(skb, to_free);
        qdisc_qstats_drop(sch);

        return NET_XMIT_DROP;
}

/* Precomputed byte-rate configuration. psched_l2t_ns() converts a length
 * to time as (len * mult) >> shift after applying @overhead, @mpu and the
 * @linklayer adjustment.
 */
struct psched_ratecfg {
        u64        rate_bytes_ps; /* bytes per second */
        u32        mult;
        u16        overhead;
        u16        mpu;
        u8        linklayer;
        u8        shift;
};

/* Convert a packet length to a time value using @r: add the overhead,
 * raise the result to at least mpu, for TC_LINKLAYER_ATM round up to
 * whole 48-byte payloads charged at 53 bytes each, then scale by
 * mult >> shift.
 */
static inline u64 psched_l2t_ns(const struct psched_ratecfg *r,
                                unsigned int len)
{
        u64 wire_len;

        len += r->overhead;
        if (len < r->mpu)
                len = r->mpu;

        if (unlikely(r->linklayer == TC_LINKLAYER_ATM))
                wire_len = DIV_ROUND_UP(len, 48) * 53;
        else
                wire_len = len;

        return (wire_len * r->mult) >> r->shift;
}

void psched_ratecfg_precompute(struct psched_ratecfg *r,
                               const struct tc_ratespec *conf,
                               u64 rate64);

/* Fill the legacy struct tc_ratespec @res from @r. @res is zeroed first;
 * the rate is clamped to what fits in 32 bits.
 */
static inline void psched_ratecfg_getrate(struct tc_ratespec *res,
                                          const struct psched_ratecfg *r)
{
        memset(res, 0, sizeof(*res));

        res->overhead = r->overhead;
        res->mpu = r->mpu;
        res->linklayer = (r->linklayer & TC_LINKLAYER_MASK);

        /* The legacy struct tc_ratespec only has a 32bit @rate field.
         * Qdiscs using a 64bit rate should add new attributes to stay
         * compatible.
         */
        res->rate = min_t(u64, r->rate_bytes_ps, ~0U);
}

/* Precomputed packet-rate configuration. psched_pkt2t_ns() converts a
 * packet count to time as (pkt_num * mult) >> shift.
 */
struct psched_pktrate {
        u64        rate_pkts_ps; /* packets per second */
        u32        mult;
        u8        shift;
};

static inline u64 psched_pkt2t_ns(const struct psched_pktrate *r,
                                  unsigned int pkt_num)
{
        return ((u64)pkt_num * r->mult) >> r->shift;
}

void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64);

/* Mini Qdisc serves the specific needs of the ingress/clsact Qdisc.
 * The fast path only needs to access the filter list and to update stats.
 */
struct mini_Qdisc {
        struct tcf_proto *filter_list;
        struct tcf_block *block;
        struct gnet_stats_basic_sync __percpu *cpu_bstats;
        struct gnet_stats_queue        __percpu *cpu_qstats;
        unsigned long rcu_state;
};

/* Account @skb in this CPU's instance of @miniq's per-cpu bstats. */
static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq,
                                                const struct sk_buff *skb)
{
        struct gnet_stats_basic_sync *cpu_stats;

        cpu_stats = this_cpu_ptr(miniq->cpu_bstats);
        bstats_update(cpu_stats, skb);
}

/* Increment this CPU's drops counter of @miniq. */
static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq)
{
        this_cpu_inc(miniq->cpu_qstats->drops);
}

/*
 * Two mini_Qdisc instances plus the location of the RCU-managed pointer
 * that refers to a mini_Qdisc.
 * NOTE(review): presumably one instance is published through *p_miniq
 * while the other is prepared, see mini_qdisc_pair_swap() - confirm.
 */
struct mini_Qdisc_pair {
        struct mini_Qdisc miniq1;
        struct mini_Qdisc miniq2;
        struct mini_Qdisc __rcu **p_miniq;
};

void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
                          struct tcf_proto *tp_head);
void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
                          struct mini_Qdisc __rcu **p_miniq);
void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
                                struct tcf_block *block);

void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx);

int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb));

/*
 * Wait until @q is no longer in SCHED state: poll the
 * __QDISC_STATE_SCHED bit, sleeping 1ms between checks.
 * Sleeps, so it must not be called from atomic context.
 */
static inline void qdisc_synchronize(const struct Qdisc *q)
{
        for (;;) {
                if (!test_bit(__QDISC_STATE_SCHED, &q->state))
                        break;
                msleep(1);
        }
}

#endif




























































































































































































































































































    1 










































































































    2 


















































































    1 



































































































































































































    2 





    1 

    1 



































    2 

    2 






    1 








    2 


























    2 


    1 
    1 








    1 

































































































































































































































































    3 





































































































    2 




















    1 





































































































































































































































































    1 



















    1 





















    1 


































































































































    1 




    1 


    1 









    1 


    1 

    1 








    1 






    1 


    1 


















    1 


    1 







    1 






    1 
    1 






    1 


    1 




    1 


















    1 















    1 


    1 












































































































    2 





    2 





    2 















    2 
    2 





    1 






    2 
    1 

























    1 
    1 

















    2 























































    2 

    1 



















    2 








    2 
    2 



    1 

































































































































































    2 












    2 

































    1 




    1 




    1 





    1 













    1 









    1 
























    1 















    1 




























    1 



    1 
























    1 





    1 





























    1 





    1 

    1 










    1 




    1 












    2 
































    2 


    2 



    1 





    1 









    1 












    1 




    1 






    1 

















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  fs/eventpoll.c (Efficient event retrieval implementation)
 *  Copyright (C) 2001,...,2009         Davide Libenzi
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 */

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
#include <linux/capability.h>
#include <net/busy_poll.h>

/*
 * LOCKING:
 * There are three levels of locking required by epoll:
 *
 * 1) epnested_mutex (mutex)
 * 2) ep->mtx (mutex)
 * 3) ep->lock (rwlock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 * We need a rwlock (ep->lock) because we manipulate objects
 * from inside the poll callback, that might be triggered from
 * a wake_up() that in turn might be called from IRQ context.
 * So we can't sleep inside the poll callback and hence we need
 * a spinlock. During the event transfer loop (from kernel to
 * user space) we could end up sleeping due to a copy_to_user(), so
 * we need a lock that will allow us to sleep. This lock is a
 * mutex (ep->mtx). It is acquired during the event transfer loop,
 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
 * The epnested_mutex is acquired when inserting an epoll fd onto another
 * epoll fd. We do this so that we walk the epoll tree and ensure that this
 * insertion does not create a cycle of epoll file descriptors, which
 * could lead to deadlock. We need a global mutex to prevent two
 * simultaneous inserts (A into B and B into A) from racing and
 * constructing a cycle without either insert observing that it is
 * going to.
 * It is necessary to acquire multiple "ep->mtx"es at once in the
 * case when one epoll fd is added to another. In this case, we
 * always acquire the locks in the order of nesting (i.e. after
 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
 * before e2->mtx). Since we disallow cycles of epoll file
 * descriptors, this ensures that the mutexes are well-ordered. In
 * order to communicate this nesting to lockdep, when walking a tree
 * of epoll file descriptors, we use the current recursion depth as
 * the lockdep subkey.
 * It is possible to drop the "ep->mtx" and to use the global
 * mutex "epnested_mutex" (together with "ep->lock") to have it working,
 * but having "ep->mtx" will make the interface more scalable.
 * Events that require holding "epnested_mutex" are very rare, while for
 * normal operations the epoll private "ep->mtx" will guarantee
 * a better scalability.
 */

/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)

#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)

#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
                                EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)

/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4

#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

#define EP_UNACTIVE_PTR ((void *) -1L)

#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))

/*
 * (file, fd) pair identifying a watched descriptor. It is used as the
 * key of the eventpoll RB tree (see ep_set_ffd() and ep_cmp_ffd()).
 * Packed, so the struct carries no tail padding.
 */
struct epoll_filefd {
        struct file *file;
        int fd;
} __packed;

/* Wait structure used by the poll hooks */
struct eppoll_entry {
        /*
         * Link to the next entry; used to chain this structure to the
         * "struct epitem" (see epitem->pwqlist).
         */
        struct eppoll_entry *next;

        /* The "base" pointer is set to the container "struct epitem" */
        struct epitem *base;

        /*
         * Wait queue item that will be linked to the target file wait
         * queue head.
         */
        wait_queue_entry_t wait;

        /* The wait queue head that the "wait" wait queue item is linked to */
        wait_queue_head_t *whead;
};

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
        union {
                /* RB tree node links this structure to the eventpoll RB tree */
                struct rb_node rbn;
                /* Used to free the struct epitem */
                struct rcu_head rcu;
        };

        /* List header used to link this structure to the eventpoll ready list */
        struct list_head rdllink;

        /*
         * Works together with "struct eventpoll"->ovflist in keeping the
         * single linked chain of items.
         */
        struct epitem *next;

        /* The file descriptor information this item refers to */
        struct epoll_filefd ffd;

        /*
         * Protected by file->f_lock, true for to-be-released epitem already
         * removed from the "struct file" items list; together with
         * eventpoll->refcount orchestrates "struct eventpoll" disposal
         */
        bool dying;

        /* List containing poll wait queues */
        struct eppoll_entry *pwqlist;

        /* The "container" of this item */
        struct eventpoll *ep;

        /* List header used to link this item to the "struct file" items list */
        struct hlist_node fllink;

        /* wakeup_source used when EPOLLWAKEUP is set */
        struct wakeup_source __rcu *ws;

        /* The structure that describes the interested events and the source fd */
        struct epoll_event event;
};

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
        /*
         * This mutex is used to ensure that files are not removed
         * while epoll is using them. This is held during the event
         * collection loop, the file cleanup path, the epoll file exit
         * code and the ctl operations.
         */
        struct mutex mtx;

        /* Wait queue used by sys_epoll_wait() */
        wait_queue_head_t wq;

        /* Wait queue used by file->poll() */
        wait_queue_head_t poll_wait;

        /* List of ready file descriptors */
        struct list_head rdllist;

        /* Lock which protects rdllist and ovflist */
        rwlock_t lock;

        /* RB tree root used to store monitored fd structs */
        struct rb_root_cached rbr;

        /*
         * This is a single linked list that chains all the "struct epitem" that
         * happened while transferring ready events to userspace w/out
         * holding ->lock. It holds EP_UNACTIVE_PTR when no such chain is
         * active (see ep_events_available()).
         */
        struct epitem *ovflist;

        /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */
        struct wakeup_source *ws;

        /* The user that created the eventpoll descriptor */
        struct user_struct *user;

        /* NOTE(review): presumably the struct file backing this eventpoll - confirm */
        struct file *file;

        /* used to optimize loop detection check */
        u64 gen;
        /* NOTE(review): users of this list are outside this chunk - confirm purpose */
        struct hlist_head refs;

        /*
         * usage count, used together with epitem->dying to
         * orchestrate the disposal of this struct
         */
        refcount_t refcount;

#ifdef CONFIG_NET_RX_BUSY_POLL
        /* used to track busy poll napi_id */
        unsigned int napi_id;
        /* busy poll timeout */
        u32 busy_poll_usecs;
        /* busy poll packet budget */
        u16 busy_poll_budget;
        /* forwarded to napi_busy_loop(); set through the EPIOCSPARAMS ioctl */
        bool prefer_busy_poll;
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* tracks wakeup nests for lockdep validation */
        u8 nests;
#endif
};

/*
 * Wrapper struct used by poll queueing: bundles the poll_table with the
 * epitem it is queued on behalf of.
 * NOTE(review): presumably the poll callback recovers @epi from @pt via
 * container_of() - confirm at the use site.
 */
struct ep_pqueue {
        poll_table pt;
        struct epitem *epi;
};

/*
 * Configuration options available inside /proc/sys/fs/epoll/
 */
/*
 * Maximum number of epoll watched descriptors, per user.
 * Exposed as the "max_user_watches" sysctl entry.
 */
static long max_user_watches __read_mostly;

/* Used for cycle detection */
static DEFINE_MUTEX(epnested_mutex);

/*
 * NOTE(review): presumably the global generation counter that
 * eventpoll->gen is compared against during loop checks - confirm.
 */
static u64 loop_check_gen = 0;

/* Used to check for epoll file descriptor inclusion loops */
static struct eventpoll *inserting_into;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __ro_after_init;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __ro_after_init;

/*
 * List of files with newly added links, where we may need to limit the number
 * of emanating paths. Protected by the epnested_mutex.
 *
 * The list is singly linked through ->next. A head that is not on the
 * list has ->next == NULL (see list_file() and unlist_file()); the empty
 * list is represented by EP_UNACTIVE_PTR.
 */
struct epitems_head {
        struct hlist_head epitems;
        struct epitems_head *next;
};
static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;

/* Slab cache that "struct epitems_head" objects are freed back to */
static struct kmem_cache *ephead_cache __ro_after_init;

/* Return @head to ephead_cache; a NULL @head is silently ignored. */
static inline void free_ephead(struct epitems_head *head)
{
        if (!head)
                return;

        kmem_cache_free(ephead_cache, head);
}

/*
 * Push the epitems_head that embeds @file's ->f_ep list onto
 * tfile_check_list, unless it is already linked there.
 */
static void list_file(struct file *file)
{
        struct epitems_head *entry =
                container_of(file->f_ep, struct epitems_head, epitems);

        /* A non-NULL ->next means the head is already on the check list */
        if (entry->next)
                return;

        entry->next = tfile_check_list;
        tfile_check_list = entry;
}

static void unlist_file(struct epitems_head *head)
{
        struct epitems_head *to_free = head;
        struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
        if (p) {
                struct epitem *epi= container_of(p, struct epitem, fllink);
                spin_lock(&epi->ffd.file->f_lock);
                if (!hlist_empty(&head->epitems))
                        to_free = NULL;
                head->next = NULL;
                spin_unlock(&epi->ffd.file->f_lock);
        }
        free_ephead(to_free);
}

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

static long long_zero;
static long long_max = LONG_MAX;

static struct ctl_table epoll_table[] = {
        {
                .procname        = "max_user_watches",
                .data                = &max_user_watches,
                .maxlen                = sizeof(max_user_watches),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
                .extra1                = &long_zero,
                .extra2                = &long_max,
        },
};

static void __init epoll_sysctls_init(void)
{
        register_sysctl("fs/epoll", epoll_table);
}
#else
#define epoll_sysctls_init() do { } while (0)
#endif /* CONFIG_SYSCTL */

static const struct file_operations eventpoll_fops;

/* Non-zero when @f is an eventpoll file, i.e. its f_op is eventpoll_fops */
static inline int is_file_epoll(struct file *f)
{
        const struct file_operations *fops = f->f_op;

        return fops == &eventpoll_fops;
}

/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
                              struct file *file, int fd)
{
        ffd->file = file;
        ffd->fd = fd;
}

/*
 * Compare RB tree keys: order by file pointer first, then by fd.
 * Returns a positive value if @p1 sorts after @p2, a negative value if
 * it sorts before, and zero when both keys are equal.
 */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
                             struct epoll_filefd *p2)
{
        if (p1->file > p2->file)
                return +1;
        if (p1->file < p2->file)
                return -1;

        /* Same file: the fd decides */
        return p1->fd - p2->fd;
}

/* Tells us if the item is currently linked, i.e. its rdllink is not empty */
static inline int ep_is_linked(struct epitem *epi)
{
        if (list_empty(&epi->rdllink))
                return 0;

        return 1;
}

/* Get the "struct eppoll_entry" that embeds the wait queue entry @p */
static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
{
        struct eppoll_entry *pwq = container_of(p, struct eppoll_entry, wait);

        return pwq;
}

/*
 * Get the "struct epitem" from a wait queue pointer: map @p back to its
 * eppoll_entry and follow the ->base pointer.
 */
static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
        return ep_pwq_from_wait(p)->base;
}

/**
 * ep_events_available - Checks if ready events might be available.
 *
 * @ep: Pointer to the eventpoll context.
 *
 * Lockless check: events may be pending either on the ready list or on
 * the overflow list (->ovflist set to something else than
 * EP_UNACTIVE_PTR).
 *
 * Return: a value different than %zero if ready events are available,
 *          or %zero otherwise.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
        if (!list_empty_careful(&ep->rdllist))
                return 1;

        return READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}

#ifdef CONFIG_NET_RX_BUSY_POLL
/**
 * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value
 * from the epoll instance ep is preferred, but if it is not set fallback to
 * the system-wide global via busy_loop_timeout.
 *
 * @start_time: The start time used to compute the remaining time until timeout.
 * @ep: Pointer to the eventpoll context.
 *
 * Return: true if the timeout has expired, false otherwise.
 */
static bool busy_loop_ep_timeout(unsigned long start_time,
                                 struct eventpoll *ep)
{
        unsigned long usecs = READ_ONCE(ep->busy_poll_usecs);
        unsigned long deadline, now;

        /* No per-instance timeout configured: use the global one */
        if (!usecs)
                return busy_loop_timeout(start_time);

        deadline = start_time + usecs;
        now = busy_loop_current_time();

        return time_after(now, deadline);
}

/*
 * Tell whether busy polling applies to this epoll instance: true when a
 * per-instance busy poll timeout is set, or when net_busy_loop_on()
 * returns true.
 *
 * ->busy_poll_usecs is written with WRITE_ONCE() by the EPIOCSPARAMS
 * ioctl and may change concurrently, so read it with READ_ONCE() like
 * the other readers of this field do.
 */
static bool ep_busy_loop_on(struct eventpoll *ep)
{
        return !!READ_ONCE(ep->busy_poll_usecs) || net_busy_loop_on();
}

static bool ep_busy_loop_end(void *p, unsigned long start_time)
{
        struct eventpoll *ep = p;

        return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep);
}

/*
 * Busy poll if globally on and supporting sockets found && no events,
 * busy loop will return if need_resched or ep_events_available.
 *
 * we must do our busy polling with irqs enabled
 */
static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
        unsigned int napi_id = READ_ONCE(ep->napi_id);
        u16 budget = READ_ONCE(ep->busy_poll_budget);
        bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);

        if (!budget)
                budget = BUSY_POLL_BUDGET;

        if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) {
                napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end,
                               ep, prefer_busy_poll, budget);
                if (ep_events_available(ep))
                        return true;
                /*
                 * Busy poll timed out.  Drop NAPI ID for now, we can add
                 * it back in when we have moved a socket with a valid NAPI
                 * ID onto the ready list.
                 */
                ep->napi_id = 0;
                return false;
        }
        return false;
}

/*
 * Set epoll busy poll NAPI ID from sk.
 */
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
        struct eventpoll *ep = epi->ep;
        unsigned int napi_id;
        struct socket *sock;
        struct sock *sk;

        if (!ep_busy_loop_on(ep))
                return;

        sock = sock_from_file(epi->ffd.file);
        if (!sock)
                return;

        sk = sock->sk;
        if (!sk)
                return;

        napi_id = READ_ONCE(sk->sk_napi_id);

        /* Non-NAPI IDs can be rejected
         *        or
         * Nothing to do if we already have this ID
         */
        if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
                return;

        /* record NAPI ID for use in next busy poll */
        ep->napi_id = napi_id;
}

/*
 * Handle the busy-poll parameter ioctls on an epoll file.
 *
 * EPIOCSPARAMS copies a struct epoll_params from user space, validates it
 * and stores the three busy-poll settings in the eventpoll.
 * EPIOCGPARAMS copies the current settings back to user space.
 *
 * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL on invalid
 * parameters, -EPERM when the requested budget exceeds NAPI_POLL_WEIGHT
 * without CAP_NET_ADMIN, and -ENOIOCTLCMD for any other command.
 */
static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
                                  unsigned long arg)
{
        struct eventpoll *ep = file->private_data;
        void __user *uarg = (void __user *)arg;
        struct epoll_params epoll_params;

        if (cmd == EPIOCSPARAMS) {
                if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params)))
                        return -EFAULT;

                /*
                 * The pad byte must be zero, the timeout must fit in an
                 * s32 and prefer_busy_poll is a 0/1 flag.
                 */
                if (epoll_params.__pad ||
                    epoll_params.busy_poll_usecs > S32_MAX ||
                    epoll_params.prefer_busy_poll > 1)
                        return -EINVAL;

                /* Budgets above NAPI_POLL_WEIGHT are privileged. */
                if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT &&
                    !capable(CAP_NET_ADMIN))
                        return -EPERM;

                WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs);
                WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget);
                WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll);
                return 0;
        }

        if (cmd == EPIOCGPARAMS) {
                /* Zero first so padding is never leaked to user space. */
                memset(&epoll_params, 0, sizeof(epoll_params));
                epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs);
                epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget);
                epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
                if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params)))
                        return -EFAULT;
                return 0;
        }

        return -ENOIOCTLCMD;
}

#else

/*
 * Stub used when CONFIG_NET_RX_BUSY_POLL is not set: no busy polling is
 * performed and no events are reported as found.
 */
static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
        return false;
}

/*
 * Stub used when CONFIG_NET_RX_BUSY_POLL is not set: there is no NAPI ID
 * to record, so this is a no-op.
 */
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
}

/*
 * Stub used when CONFIG_NET_RX_BUSY_POLL is not set: the busy-poll
 * parameter ioctls are rejected with -EOPNOTSUPP.
 */
static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
                                  unsigned long arg)
{
        return -EOPNOTSUPP;
}

#endif /* CONFIG_NET_RX_BUSY_POLL */

/*
 * As described in commit 0ccf831cb lockdep: annotate epoll
 * the use of wait queues used by epoll is done in a very controlled
 * manner. Wake ups can nest inside each other, but are never done
 * with the same locking. For example:
 *
 *   dfd = socket(...);
 *   efd1 = epoll_create();
 *   efd2 = epoll_create();
 *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
 *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
 *
 * When a packet arrives to the device underneath "dfd", the net code will
 * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
 * callback wakeup entry on that queue, and the wake_up() performed by the
 * "dfd" net code will end up in ep_poll_callback(). At this point epoll
 * (efd1) notices that it may have some event ready, so it needs to wake up
 * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
 * that ends up in another wake_up(), after having checked the recursion
 * constraints. That is, no more than EP_MAX_NESTS nested wakeups are
 * allowed, to avoid blowing the stack.
 *
 * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
 * this special case of epoll.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC

static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
                             unsigned pollflags)
{
        struct eventpoll *ep_src;
        unsigned long flags;
        u8 nests = 0;

        /*
         * To set the subclass or nesting level for spin_lock_irqsave_nested()
         * it might be natural to create a per-cpu nest count. However, since
         * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
         * schedule() in the -rt kernel, the per-cpu variable are no longer
         * protected. Thus, we are introducing a per eventpoll nest field.
         * If we are not being call from ep_poll_callback(), epi is NULL and
         * we are at the first level of nesting, 0. Otherwise, we are being
         * called from ep_poll_callback() and if a previous wakeup source is
         * not an epoll file itself, we are at depth 1 since the wakeup source
         * is depth 0. If the wakeup source is a previous epoll file in the
         * wakeup chain then we use its nests value and record ours as
         * nests + 1. The previous epoll file nests value is stable since its
         * already holding its own poll_wait.lock.
         */
        if (epi) {
                if ((is_file_epoll(epi->ffd.file))) {
                        ep_src = epi->ffd.file->private_data;
                        nests = ep_src->nests;
                } else {
                        nests = 1;
                }
        }
        spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
        ep->nests = nests + 1;
        wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
        ep->nests = 0;
        spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}

#else

/*
 * Variant used when CONFIG_DEBUG_LOCK_ALLOC is not set: no lockdep nesting
 * annotation is needed, so simply wake the ->poll() waiters of @ep with
 * EPOLLIN plus the caller-supplied @pollflags. @epi is unused here.
 */
static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
                             __poll_t pollflags)
{
        wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
}

#endif

/*
 * Detach @pwq's wait entry from the wait queue head it was registered on,
 * unless the recorded head pointer has already been cleared.
 */
static void ep_remove_wait_queue(struct eppoll_entry *pwq)
{
        wait_queue_head_t *whead;

        rcu_read_lock();
        /*
         * If it is cleared by POLLFREE, it should be rcu-safe.
         * If we read NULL we need a barrier paired with
         * smp_store_release() in ep_poll_callback(), otherwise
         * we rely on whead->lock.
         */
        whead = smp_load_acquire(&pwq->whead);
        /* A NULL head means there is no queue left to remove ourselves from. */
        if (whead)
                remove_wait_queue(whead, &pwq->wait);
        rcu_read_unlock();
}

/*
 * Tear down every poll wait-queue hook that @epi installed on its target
 * file: each entry is popped off epi->pwqlist, removed from its wait queue
 * and returned to the pwq cache.  Must be called with "mtx" held.
 */
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
        for (;;) {
                struct eppoll_entry *entry = epi->pwqlist;

                if (entry == NULL)
                        break;

                /* Unlink from the per-item list before releasing the entry. */
                epi->pwqlist = entry->next;
                ep_remove_wait_queue(entry);
                kmem_cache_free(pwq_cache, entry);
        }
}

/*
 * Return the wakeup source attached to @epi (may be NULL).
 * Call only when ep->mtx is held: that lock is what the RCU dereference
 * check below asserts.
 */
static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
{
        return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
}

/*
 * Mark @epi's wakeup source, if it has one, as active.
 * Call only when ep->mtx is held.
 */
static inline void ep_pm_stay_awake(struct epitem *epi)
{
        struct wakeup_source *src = ep_wakeup_source(epi);

        /* Items without a wakeup source have nothing to keep awake. */
        if (!src)
                return;

        __pm_stay_awake(src);
}

static inline bool ep_has_wakeup_source(struct epitem *epi)
{
        return rcu_access_pointer(epi->ws) ? true : false;
}

/* call when ep->mtx cannot be held (ep_poll_callback) */
static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
{
        struct wakeup_source *ws;

        rcu_read_lock();
        ws = rcu_dereference(epi->ws);
        if (ws)
                __pm_stay_awake(ws);
        rcu_read_unlock();
}


/*
 * Begin a scan of the ready list: everything currently on ep->rdllist is
 * moved onto @txlist and ep->ovflist is switched to NULL.
 *
 * ep->mtx needs to be held because we could be hit by
 * eventpoll_release_file() and epoll_ctl().
 */
static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
{
        /*
         * Steal the ready list, and re-init the original one to the
         * empty list. Also, set ep->ovflist to NULL so that events
         * happening while looping without locks are not lost. We cannot
         * have the poll callback queue directly on ep->rdllist,
         * because we want the "sproc" callback to be able to do it
         * in a lockless way.
         */
        lockdep_assert_irqs_enabled();
        write_lock_irq(&ep->lock);
        list_splice_init(&ep->rdllist, txlist);
        WRITE_ONCE(ep->ovflist, NULL);
        write_unlock_irq(&ep->lock);
}

/*
 * Finish a scan started with ep_start_scan(): items chained on ep->ovflist
 * in the meantime are moved to the ready list, ep->ovflist is deactivated
 * again, whatever is left on @txlist is put back, and waiters on ep->wq are
 * woken if the ready list is not empty.
 */
static void ep_done_scan(struct eventpoll *ep,
                         struct list_head *txlist)
{
        struct epitem *epi;

        write_lock_irq(&ep->lock);

        /*
         * While the scan was running, the poll callback may have chained
         * further events on ->ovflist; fold them into the main ready list.
         */
        epi = READ_ONCE(ep->ovflist);
        while (epi != NULL) {
                struct epitem *following;

                /*
                 * An item chained on ->ovflist may also still sit on
                 * "txlist"; such items are handled by the list_splice()
                 * below, so only add the ones that are not linked yet.
                 * ->ovflist is LIFO, hence list_add() (head insertion) to
                 * restore FIFO order.
                 */
                if (!ep_is_linked(epi)) {
                        list_add(&epi->rdllink, &ep->rdllist);
                        ep_pm_stay_awake(epi);
                }

                /* Advance, marking the visited item as unchained. */
                following = epi->next;
                epi->next = EP_UNACTIVE_PTR;
                epi = following;
        }

        /*
         * Put ep->ovflist back to EP_UNACTIVE_PTR so that, once the lock is
         * dropped, events are queued on ep->rdllist in the normal way.
         */
        WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);

        /* Quickly re-inject items left on "txlist". */
        list_splice(txlist, &ep->rdllist);
        __pm_relax(ep->ws);

        if (!list_empty(&ep->rdllist) && waitqueue_active(&ep->wq))
                wake_up(&ep->wq);

        write_unlock_irq(&ep->lock);
}

/* Take an additional reference on @ep. */
static void ep_get(struct eventpoll *ep)
{
        refcount_inc(&ep->refcount);
}

/*
 * Drop one reference on @ep.
 * Returns true if that was the last reference, i.e. the event poll can be
 * disposed; in that case the item tree is expected to be empty already.
 */
static bool ep_refcount_dec_and_test(struct eventpoll *ep)
{
        bool last_ref = refcount_dec_and_test(&ep->refcount);

        /* Warn if items are still in the tree when the last ref goes. */
        if (last_ref)
                WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root));

        return last_ref;
}

/*
 * Release an eventpoll structure: destroy its mutex, drop the user
 * reference taken at allocation time, unregister its wakeup source and free
 * the memory.
 */
static void ep_free(struct eventpoll *ep)
{
        mutex_destroy(&ep->mtx);
        free_uid(ep->user);
        wakeup_source_unregister(ep->ws);
        kfree(ep);
}

/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources. Must be called with "mtx" held.
 * If the dying flag is set, do the removal only if force is true.
 * This prevents ep_clear_and_put() from dropping all the ep references
 * while running concurrently with eventpoll_release_file().
 * Returns true if the eventpoll can be disposed.
 */
static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
{
        struct file *file = epi->ffd.file;
        struct epitems_head *to_free;
        struct hlist_head *head;

        lockdep_assert_irqs_enabled();

        /*
         * Removes poll wait queue hooks.
         */
        ep_unregister_pollwait(ep, epi);

        /* Remove the current item from the list of epoll hooks */
        spin_lock(&file->f_lock);
        /*
         * A dying item is left for eventpoll_release_file(), which calls us
         * with force == true.
         */
        if (epi->dying && !force) {
                spin_unlock(&file->f_lock);
                return false;
        }

        to_free = NULL;
        head = file->f_ep;
        /* Is this item the only one hooked on the file? */
        if (head->first == &epi->fllink && !epi->fllink.next) {
                file->f_ep = NULL;
                if (!is_file_epoll(file)) {
                        struct epitems_head *v;
                        v = container_of(head, struct epitems_head, epitems);
                        if (!smp_load_acquire(&v->next))
                                to_free = v;
                }
        }
        hlist_del_rcu(&epi->fllink);
        spin_unlock(&file->f_lock);
        free_ephead(to_free);

        rb_erase_cached(&epi->rbn, &ep->rbr);

        /* Take the item off the ready list if it is queued there. */
        write_lock_irq(&ep->lock);
        if (ep_is_linked(epi))
                list_del_init(&epi->rdllink);
        write_unlock_irq(&ep->lock);

        wakeup_source_unregister(ep_wakeup_source(epi));
        /*
         * At this point it is safe to free the eventpoll item. Use the union
         * field epi->rcu, since we are trying to minimize the size of
         * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
         * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
         * use of the rbn field.
         */
        kfree_rcu(epi, rcu);

        percpu_counter_dec(&ep->user->epoll_watches);
        /* Drop the eventpoll reference accounted to the removed item. */
        return ep_refcount_dec_and_test(ep);
}

/*
 * ep_remove variant for callers owing an additional reference to the ep
 */
static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
{
        WARN_ON_ONCE(__ep_remove(ep, epi, false));
}

/*
 * Empty an eventpoll and drop the caller's reference to it: wake any
 * ->poll() waiters, unregister all poll callbacks, try to remove every
 * item, and free the eventpoll if the dropped reference was the last one.
 */
static void ep_clear_and_put(struct eventpoll *ep)
{
        struct rb_node *node, *following;
        bool dispose;

        /* We need to release all tasks waiting for this file */
        if (waitqueue_active(&ep->poll_wait))
                ep_poll_safewake(ep, NULL, 0);

        mutex_lock(&ep->mtx);

        /*
         * First pass: walk the whole tree unregistering poll callbacks.
         */
        node = rb_first_cached(&ep->rbr);
        while (node) {
                ep_unregister_pollwait(ep, rb_entry(node, struct epitem, rbn));
                cond_resched();
                node = rb_next(node);
        }

        /*
         * Second pass: walk the whole tree again and try to free each
         * "struct epitem". ep_remove_safe() will not remove an epitem that
         * a racing eventpoll_release_file() has claimed; the latter will do
         * that removal. At this point we are sure no poll callbacks will be
         * lingering around. Since we still own a reference to the eventpoll
         * struct, the loop can't dispose it.
         */
        node = rb_first_cached(&ep->rbr);
        while (node) {
                /* Fetch the successor first: the current node may go away. */
                following = rb_next(node);
                ep_remove_safe(ep, rb_entry(node, struct epitem, rbn));
                cond_resched();
                node = following;
        }

        dispose = ep_refcount_dec_and_test(ep);
        mutex_unlock(&ep->mtx);

        if (dispose)
                ep_free(ep);
}

/*
 * ioctl entry point for epoll files.  Only the busy-poll parameter commands
 * (EPIOCSPARAMS / EPIOCGPARAMS) are recognised; they are forwarded to
 * ep_eventpoll_bp_ioctl().  Anything else, including a non-epoll file,
 * yields -EINVAL.
 */
static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,
                               unsigned long arg)
{
        int ret = -EINVAL;

        if (!is_file_epoll(file))
                return -EINVAL;

        if (cmd == EPIOCSPARAMS || cmd == EPIOCGPARAMS)
                ret = ep_eventpoll_bp_ioctl(file, cmd, arg);

        return ret;
}

/*
 * ->release() handler of the epoll file: clear out the eventpoll attached
 * to the file (if any) and drop the file's reference to it.  Always
 * returns 0.
 */
static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
        struct eventpoll *ep = file->private_data;

        if (!ep)
                return 0;

        ep_clear_and_put(ep);
        return 0;
}

static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);

/*
 * Poll an epoll file itself.  The caller is queued on ep->poll_wait, then
 * the ready list is scanned under ep->mtx (taken at lockdep subclass
 * @depth): the first item that really has requested events pending makes
 * the epoll file readable; items found not ready are dropped from the list.
 *
 * Returns EPOLLIN | EPOLLRDNORM if at least one item is ready, 0 otherwise.
 */
static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
{
        struct eventpoll *ep = file->private_data;
        struct epitem *epi, *next;
        __poll_t mask = 0;
        poll_table pt;
        LIST_HEAD(txlist);

        init_poll_funcptr(&pt, NULL);

        /* Insert inside our poll wait queue */
        poll_wait(file, &ep->poll_wait, wait);

        /*
         * Now find out whether the wanted events are really available on
         * the ready list.
         */
        mutex_lock_nested(&ep->mtx, depth);
        ep_start_scan(ep, &txlist);
        list_for_each_entry_safe(epi, next, &txlist, rdllink) {
                if (ep_item_poll(epi, &pt, depth + 1)) {
                        mask = EPOLLIN | EPOLLRDNORM;
                        break;
                }

                /*
                 * The poll callback put this item on the ready list, but
                 * as far as the requested events go it is not actually
                 * ready, so take it off again.
                 */
                __pm_relax(ep_wakeup_source(epi));
                list_del_init(&epi->rdllink);
        }
        ep_done_scan(ep, &txlist);
        mutex_unlock(&ep->mtx);

        return mask;
}

/*
 * The ffd.file pointer may be in the process of being torn down due to
 * being closed, but we may not have finished eventpoll_release() yet.
 *
 * Normally, even with the atomic_long_inc_not_zero, the file may have
 * been free'd and then gotten re-allocated to something else (since
 * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
 *
 * But for epoll, users hold the ep->mtx mutex, and as such any file in
 * the process of being free'd will block in eventpoll_release_file()
 * and thus the underlying file allocation will not be free'd, and the
 * file re-use cannot happen.
 *
 * For the same reason we can avoid a rcu_read_lock() around the
 * operation - 'ffd.file' cannot go away even if the refcount has
 * reached zero (but we must still not call out to ->poll() functions
 * etc).
 */
static struct file *epi_fget(const struct epitem *epi)
{
        struct file *target = epi->ffd.file;

        /*
         * Only hand the file back if a reference could be taken, i.e. its
         * count had not already dropped to zero.
         */
        if (atomic_long_inc_not_zero(&target->f_count))
                return target;

        return NULL;
}

/*
 * Differs from ep_eventpoll_poll() in that internal callers already have
 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
 * is correctly annotated.
 */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
                             int depth)
{
        struct file *file = epi_fget(epi);
        __poll_t revents;

        /*
         * No reference could be taken: rather than reporting something
         * like EPOLLERR | EPOLLHUP, behave as if the file does not exist
         * and the poll never happened.
         */
        if (!file)
                return 0;

        pt->_key = epi->event.events;

        /* Nested epoll files are polled via our own poll implementation. */
        if (is_file_epoll(file))
                revents = __ep_eventpoll_poll(file, pt, depth);
        else
                revents = vfs_poll(file, pt);

        fput(file);

        /* Report only the events the item asked for. */
        return revents & epi->event.events;
}

/*
 * ->poll() handler of the epoll file: external callers start the nested
 * poll at depth 0.
 */
static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
{
        return __ep_eventpoll_poll(file, wait, 0);
}

#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo() handler: print one line per watched item (target fd,
 * event mask, user data, file position, inode number and device of the
 * target file) into @m.  The item tree is walked under ep->mtx and the
 * walk stops early once the seq_file buffer has overflowed.
 */
static void ep_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct eventpoll *ep = f->private_data;
        struct rb_node *node;

        mutex_lock(&ep->mtx);
        node = rb_first_cached(&ep->rbr);
        while (node) {
                struct epitem *epi = rb_entry(node, struct epitem, rbn);
                struct file *tfile = epi->ffd.file;
                struct inode *inode = file_inode(tfile);

                seq_printf(m, "tfd: %8d events: %8x data: %16llx "
                           " pos:%lli ino:%lx sdev:%x\n",
                           epi->ffd.fd, epi->event.events,
                           (long long)epi->event.data,
                           (long long)tfile->f_pos,
                           inode->i_ino, inode->i_sb->s_dev);

                /* No point in formatting more once the buffer is full. */
                if (seq_has_overflowed(m))
                        break;

                node = rb_next(node);
        }
        mutex_unlock(&ep->mtx);
}
#endif

/*
 * File callbacks that implement the eventpoll file behaviour.
 * The fdinfo hook is only present when procfs support is built in
 * (CONFIG_PROC_FS).
 */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo        = ep_show_fdinfo,
#endif
        .release        = ep_eventpoll_release,
        .poll                = ep_eventpoll_poll,
        .llseek                = noop_llseek,
        .unlocked_ioctl        = ep_eventpoll_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
};

/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need this facility to correctly clean up files that are
 * closed without being removed from the eventpoll interface.
 *
 * Every epitem still hooked on @file is removed in turn; an eventpoll whose
 * last reference is dropped by such a removal is freed here.
 */
void eventpoll_release_file(struct file *file)
{
        struct eventpoll *ep;
        struct epitem *epi;
        bool dispose;

        /*
         * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from
         * touching the epitems list before eventpoll_release_file() can access
         * the ep->mtx.
         */
again:
        spin_lock(&file->f_lock);
        if (file->f_ep && file->f_ep->first) {
                epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
                epi->dying = true;
                spin_unlock(&file->f_lock);

                /*
                 * ep access is safe as we still own a reference to the ep
                 * struct
                 */
                ep = epi->ep;
                mutex_lock(&ep->mtx);
                /* force == true: remove the item even though it is dying. */
                dispose = __ep_remove(ep, epi, true);
                mutex_unlock(&ep->mtx);

                if (dispose)
                        ep_free(ep);
                /* Re-check under f_lock for further items on this file. */
                goto again;
        }
        spin_unlock(&file->f_lock);
}

/*
 * Allocate and initialise a new, empty eventpoll structure holding a single
 * reference and charged to the current user.
 *
 * Returns 0 and stores the new structure in *@pep on success, or -ENOMEM
 * (leaving *@pep untouched) if the allocation fails.
 */
static int ep_alloc(struct eventpoll **pep)
{
        struct eventpoll *new_ep = kzalloc(sizeof(*new_ep), GFP_KERNEL);

        if (unlikely(!new_ep))
                return -ENOMEM;

        /* Locks and wait queues. */
        mutex_init(&new_ep->mtx);
        rwlock_init(&new_ep->lock);
        init_waitqueue_head(&new_ep->wq);
        init_waitqueue_head(&new_ep->poll_wait);

        /* Empty ready list and item tree; overflow list inactive. */
        INIT_LIST_HEAD(&new_ep->rdllist);
        new_ep->rbr = RB_ROOT_CACHED;
        new_ep->ovflist = EP_UNACTIVE_PTR;

        new_ep->user = get_current_user();
        refcount_set(&new_ep->refcount, 1);

        *pep = new_ep;
        return 0;
}

/*
 * Search the file inside the eventpoll tree. The RB tree operations
 * are protected by the "mtx" mutex, and ep_find() must be called with
 * "mtx" held.
 */
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
        int kcmp;
        struct rb_node *rbp;
        struct epitem *epi, *epir = NULL;
        struct epoll_filefd ffd;

        ep_set_ffd(&ffd, file, fd);
        for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
                epi = rb_entry(rbp, struct epitem, rbn);
                kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
                if (kcmp > 0)
                        rbp = rbp->rb_right;
                else if (kcmp < 0)
                        rbp = rbp->rb_left;
                else {
                        epir = epi;
                        break;
                }
        }

        return epir;
}

#ifdef CONFIG_KCMP
/*
 * Walk the item tree in order and return the item registered for target fd
 * @tfd after skipping @toff earlier matches, or NULL if there are not that
 * many items with this fd.
 */
static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
{
        struct rb_node *node = rb_first_cached(&ep->rbr);

        while (node) {
                struct epitem *epi = rb_entry(node, struct epitem, rbn);

                /* Each match consumes one unit of the offset. */
                if (epi->ffd.fd == tfd && toff-- == 0)
                        return epi;

                cond_resched();
                node = rb_next(node);
        }

        return NULL;
}

/*
 * Return the raw file pointer of the item that epoll file @file has
 * registered for target fd @tfd at match offset @toff.  No reference is
 * taken on the returned file.
 *
 * Returns ERR_PTR(-EINVAL) if @file is not an epoll file and
 * ERR_PTR(-ENOENT) if no such item exists.
 */
struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
                                     unsigned long toff)
{
        struct file *file_raw;
        struct eventpoll *ep;
        struct epitem *epi;

        if (!is_file_epoll(file))
                return ERR_PTR(-EINVAL);

        ep = file->private_data;

        /* The item tree may only be walked with ep->mtx held. */
        mutex_lock(&ep->mtx);
        epi = ep_find_tfd(ep, tfd, toff);
        file_raw = epi ? epi->ffd.file : ERR_PTR(-ENOENT);
        mutex_unlock(&ep->mtx);

        return file_raw;
}
#endif /* CONFIG_KCMP */

/*
 * Adds a new entry to the tail of the list in a lockless way, i.e.
 * multiple CPUs are allowed to call this function concurrently.
 *
 * Beware: it is necessary to prevent any other modifications of the
 *         existing list until all changes are completed, in other words
 *         concurrent list_add_tail_lockless() calls should be protected
 *         with a read lock, where write lock acts as a barrier which
 *         makes sure all list_add_tail_lockless() calls are fully
 *         completed.
 *
 *        Also an element can be locklessly added to the list only in one
 *        direction i.e. either to the tail or to the head, otherwise
 *        concurrent access will corrupt the list.
 *
 * Return: %false if element has been already added to the list, %true
 * otherwise.
 */
static inline bool list_add_tail_lockless(struct list_head *new,
                                          struct list_head *head)
{
        struct list_head *prev;

        /*
         * This is a simple 'new->next = head' operation, but cmpxchg()
         * is used in order to detect that the same element has just been
         * added to the list from another CPU: the winner observes
         * new->next == new (i.e. an unlinked, self-pointing entry), the
         * loser sees a different value and bails out.
         */
        if (!try_cmpxchg(&new->next, &new, head))
                return false;

        /*
         * Initially ->next of a new element must be updated with the head
         * (we are inserting to the tail) and only then pointers are atomically
         * exchanged.  XCHG guarantees memory ordering, thus ->next should be
         * updated before pointers are actually swapped and pointers are
         * swapped before prev->next is updated.
         */

        prev = xchg(&head->prev, new);

        /*
         * It is safe to modify prev->next and new->prev, because a new element
         * is added only to the tail and new->next is updated before XCHG.
         */

        prev->next = new;
        new->prev = prev;

        return true;
}

/*
 * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
 * i.e. multiple CPUs are allowed to call this function concurrently.
 *
 * An item whose ->next is EP_UNACTIVE_PTR is not chained; any other value
 * means it already is (or is just being) chained.
 *
 * Return: %false if epi element has been already chained, %true otherwise.
 */
static inline bool chain_epi_lockless(struct epitem *epi)
{
        struct eventpoll *ep = epi->ep;

        /* Fast preliminary check */
        if (epi->next != EP_UNACTIVE_PTR)
                return false;

        /* Check that the same epi has not been just chained from another CPU */
        if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
                return false;

        /*
         * Atomically exchange tail: ep->ovflist now points at this item and
         * the item links to whatever ep->ovflist pointed at before.
         */
        epi->next = xchg(&ep->ovflist, epi);

        return true;
}

/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 *
 * This callback takes a read lock in order not to contend with concurrent
 * events from another file descriptor, thus all modifications to ->rdllist
 * or ->ovflist are lockless.  Read lock is paired with the write lock from
 * ep_start/done_scan(), which stops all list modifications and guarantees
 * that lists state is seen correctly.
 *
 * Another thing worth to mention is that ep_poll_callback() can be called
 * concurrently for the same @epi from different CPUs if poll table was inited
 * with several wait queues entries.  Plural wakeup from different CPUs of a
 * single wait queue is serialized by wq.lock, but the case when multiple wait
 * queues are used should be detected accordingly.  This is detected using
 * cmpxchg() operation.
 */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
        int pwake = 0;  /* set when ep->poll_wait waiters must be woken */
        struct epitem *epi = ep_item_from_wait(wait);
        struct eventpoll *ep = epi->ep;
        __poll_t pollflags = key_to_poll(key);
        unsigned long flags;
        int ewake = 0;  /* value returned to the wait queue core */

        read_lock_irqsave(&ep->lock, flags);

        ep_set_busy_poll_napi_id(epi);

        /*
         * If the event mask does not contain any poll(2) event, we consider the
         * descriptor to be disabled. This condition is likely the effect of the
         * EPOLLONESHOT bit that disables the descriptor when an event is received,
         * until the next EPOLL_CTL_MOD will be issued.
         */
        if (!(epi->event.events & ~EP_PRIVATE_BITS))
                goto out_unlock;

        /*
         * Check the events coming with the callback. At this stage, not
         * every device reports the events in the "key" parameter of the
         * callback. We need to be able to handle both cases here, hence the
         * test for "key" != NULL before the event match test.
         */
        if (pollflags && !(pollflags & epi->event.events))
                goto out_unlock;

        /*
         * If we are transferring events to userspace, we can hold no locks
         * (because we're accessing user memory, and because of linux f_op->poll()
         * semantics). All the events that happen during that period of time are
         * chained in ep->ovflist and requeued later on.
         */
        if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
                if (chain_epi_lockless(epi))
                        ep_pm_stay_awake_rcu(epi);
        } else if (!ep_is_linked(epi)) {
                /* In the usual case, add event to ready list. */
                if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
                        ep_pm_stay_awake_rcu(epi);
        }

        /*
         * Wake up ( if active ) both the eventpoll wait list and the ->poll()
         * wait list.
         */
        if (waitqueue_active(&ep->wq)) {
                /*
                 * For an EPOLLEXCLUSIVE item (and no POLLFREE), report the
                 * wakeup as consumed only if the reported IN/OUT bits match
                 * what the item asked for, or if no IN/OUT bit was reported.
                 */
                if ((epi->event.events & EPOLLEXCLUSIVE) &&
                                        !(pollflags & POLLFREE)) {
                        switch (pollflags & EPOLLINOUT_BITS) {
                        case EPOLLIN:
                                if (epi->event.events & EPOLLIN)
                                        ewake = 1;
                                break;
                        case EPOLLOUT:
                                if (epi->event.events & EPOLLOUT)
                                        ewake = 1;
                                break;
                        case 0:
                                ewake = 1;
                                break;
                        }
                }
                wake_up(&ep->wq);
        }
        if (waitqueue_active(&ep->poll_wait))
                pwake++;

out_unlock:
        read_unlock_irqrestore(&ep->lock, flags);

        /* We have to call this outside the lock */
        if (pwake)
                ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);

        /* Non-exclusive items always return 1. */
        if (!(epi->event.events & EPOLLEXCLUSIVE))
                ewake = 1;

        if (pollflags & POLLFREE) {
                /*
                 * If we race with ep_remove_wait_queue() it can miss
                 * ->whead = NULL and do another remove_wait_queue() after
                 * us, so we can't use __remove_wait_queue().
                 */
                list_del_init(&wait->entry);
                /*
                 * ->whead != NULL protects us from the race with
                 * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue()
                 * takes whead->lock held by the caller. Once we nullify it,
                 * nothing protects ep/epi or even wait.
                 */
                smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
        }

        return ewake;
}

/*
 * poll_table queueing callback: hooks a new wait entry for the item being
 * polled onto the target file's wait queue head @whead and records it on
 * the item's pwqlist.  If the entry cannot be allocated, epq->epi is set to
 * NULL, which also turns any later invocation for the same poll table into
 * a no-op.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt)
{
        struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
        struct epitem *epi = epq->epi;
        struct eppoll_entry *entry;

        /* An earlier allocation has already failed: nothing more to do. */
        if (unlikely(!epi))
                return;

        entry = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
        if (unlikely(!entry)) {
                /* Flag the failure through the poll queue. */
                epq->epi = NULL;
                return;
        }

        init_waitqueue_func_entry(&entry->wait, ep_poll_callback);
        entry->whead = whead;
        entry->base = epi;

        /* EPOLLEXCLUSIVE items are queued as exclusive waiters. */
        if (!(epi->event.events & EPOLLEXCLUSIVE))
                add_wait_queue(whead, &entry->wait);
        else
                add_wait_queue_exclusive(whead, &entry->wait);

        /* Push the new entry on the front of the item's hook list. */
        entry->next = epi->pwqlist;
        epi->pwqlist = entry;
}

/*
 * Link @epi into the cached red-black tree of @ep. Items are ordered by
 * ep_cmp_ffd() on their ->ffd keys; the "leftmost" flag is tracked during
 * the descent and handed to rb_insert_color_cached().
 */
static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
{
        struct rb_node **link = &ep->rbr.rb_root.rb_node;
        struct rb_node *parent = NULL;
        bool leftmost = true;

        while (*link) {
                struct epitem *cur;

                parent = *link;
                cur = rb_entry(parent, struct epitem, rbn);

                /* Keys comparing less than or equal descend to the left. */
                if (ep_cmp_ffd(&epi->ffd, &cur->ffd) <= 0) {
                        link = &parent->rb_left;
                } else {
                        link = &parent->rb_right;
                        leftmost = false;
                }
        }

        rb_link_node(&epi->rbn, parent, link);
        rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
}



#define PATH_ARR_SIZE 5
/*
 * Per-depth caps on the number of wakeup paths that may emanate from a
 * single file of interest. The table lists the limits for paths of length
 * 1 to 5 (1000, 500, 100, 50 and 10 respectively). Limiting the potential
 * wakeup paths avoids massive uncontrolled wakeup storms. The common use
 * case is a single ep connected to n file sources, where each file source
 * has exactly one path of length 1, so these numbers should be more than
 * sufficient.
 *
 * Note that path_count_inc() returns early for nests == 0, so the first
 * table entry is not actually enforced: depth 1 paths are unlimited.
 *
 * The limits are enforced during an EPOLL_CTL_ADD operation, since a modify
 * and delete can't add additional paths. Protected by the epnested_mutex.
 */
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int path_count[PATH_ARR_SIZE];

/*
 * Account one more path at nesting level @nests.
 *
 * Returns 0 while the level is within its limit (always for nests == 0),
 * or -1 once the counter for that level exceeds path_limits[nests].
 */
static int path_count_inc(int nests)
{
        /* An arbitrary number of depth 1 paths is allowed and not counted. */
        if (nests != 0) {
                path_count[nests] += 1;
                if (path_count[nests] > path_limits[nests])
                        return -1;
        }
        return 0;
}

/* Reset every per-depth path counter to zero. */
static void path_count_init(void)
{
        int i = PATH_ARR_SIZE;

        while (i-- > 0)
                path_count[i] = 0;
}

/*
 * Walk upwards from the epitems linked on @refs through the eventpolls that
 * own them, accounting one path via path_count_inc() whenever an eventpoll
 * with an empty ->refs list is reached.
 *
 * Returns 0 on success, or -1 if @depth exceeds EP_MAX_NESTS or a nested
 * step reports an error.
 */
static int reverse_path_check_proc(struct hlist_head *refs, int depth)
{
        struct epitem *epi;
        int ret = 0;

        /* Nesting is too deep. */
        if (depth > EP_MAX_NESTS)
                return -1;

        /* CTL_DEL can remove links here, but that can't increase our count */
        hlist_for_each_entry_rcu(epi, refs, fllink) {
                struct hlist_head *watchers = &epi->ep->refs;

                /* An empty list ends the path here; otherwise go one up. */
                ret = hlist_empty(watchers) ?
                        path_count_inc(depth) :
                        reverse_path_check_proc(watchers, depth + 1);
                if (ret)
                        break;
        }
        return ret;
}

/**
 * reverse_path_check - The tfile_check_list is a list of epitems_head, which
 *                      have links that are proposed to be newly added. We
 *                      need to make sure that those added links don't add
 *                      too many paths such that we will spend all our time
 *                      waking up eventpoll objects.
 *
 * The path counters are reset before each list entry is examined, and each
 * entry is walked inside an RCU read-side critical section.
 *
 * Return: %zero if the proposed links don't create too many paths,
 *            %-1 otherwise.
 */
static int reverse_path_check(void)
{
        struct epitems_head *head = tfile_check_list;
        int ret = 0;

        while (head != EP_UNACTIVE_PTR) {
                path_count_init();

                rcu_read_lock();
                ret = reverse_path_check_proc(&head->epitems, 0);
                rcu_read_unlock();

                if (ret)
                        break;
                head = head->next;
        }
        return ret;
}

static int ep_create_wakeup_source(struct epitem *epi)
{
        struct name_snapshot n;
        struct wakeup_source *ws;

        if (!epi->ep->ws) {
                epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
                if (!epi->ep->ws)
                        return -ENOMEM;
        }

        take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
        ws = wakeup_source_register(NULL, n.name.name);
        release_dentry_name_snapshot(&n);

        if (!ws)
                return -ENOMEM;
        rcu_assign_pointer(epi->ws, ws);

        return 0;
}

/*
 * Detach and unregister the wakeup source of @epi.
 *
 * Rare code path, only used when EPOLL_CTL_MOD removes a wakeup source.
 */
static noinline void ep_destroy_wakeup_source(struct epitem *epi)
{
        struct wakeup_source *old_ws;

        old_ws = ep_wakeup_source(epi);
        RCU_INIT_POINTER(epi->ws, NULL);

        /*
         * Wait for ep_pm_stay_awake_rcu to finish. call_rcu cannot be used
         * here: wakeup_source_remove (called by wakeup_source_unregister)
         * uses synchronize_rcu internally, too.
         */
        synchronize_rcu();
        wakeup_source_unregister(old_ws);
}

/*
 * Link @epi (through ->fllink) onto the list of epitems hanging off @file's
 * ->f_ep, installing a list head first if the file does not have one yet.
 *
 * For an epoll file the head is the eventpoll's own ->refs; for any other
 * file an epitems_head is allocated from ephead_cache.
 *
 * Returns 0 on success or -ENOMEM if the list head cannot be allocated.
 */
static int attach_epitem(struct file *file, struct epitem *epi)
{
        struct epitems_head *to_free = NULL;
        struct hlist_head *head = NULL;
        struct eventpoll *ep = NULL;

        if (is_file_epoll(file))
                ep = file->private_data;

        if (ep) {
                head = &ep->refs;
        } else if (!READ_ONCE(file->f_ep)) {
                /*
                 * Unlocked peek: no head seems to be installed, so allocate
                 * one before taking f_lock. The label is also the retry
                 * target for the locked check below.
                 */
allocate:
                to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
                if (!to_free)
                        return -ENOMEM;
                head = &to_free->epitems;
        }
        spin_lock(&file->f_lock);
        if (!file->f_ep) {
                /*
                 * The unlocked peek saw a head, so none was allocated, but
                 * under the lock ->f_ep is NULL: drop the lock and go
                 * allocate one.
                 */
                if (unlikely(!head)) {
                        spin_unlock(&file->f_lock);
                        goto allocate;
                }
                /* Our head gets installed; it must not be freed below. */
                file->f_ep = head;
                to_free = NULL;
        }
        hlist_add_head_rcu(&epi->fllink, file->f_ep);
        spin_unlock(&file->f_lock);
        /*
         * Release an allocated head that ended up unused because ->f_ep was
         * already set. to_free is NULL in the other cases.
         * NOTE(review): presumably free_ephead() tolerates NULL - confirm.
         */
        free_ephead(to_free);
        return 0;
}

/*
 * Add a new item watching @tfile/@fd with the interest set in @event to the
 * eventpoll @ep. When @full_check is set, reverse_path_check() is run as
 * well.
 *
 * Returns 0 on success, -ENOSPC when the user's watch count has reached
 * max_user_watches, -ENOMEM on allocation failure, -EINVAL when
 * reverse_path_check() fails, or the error from ep_create_wakeup_source().
 *
 * Must be called with "mtx" held.
 */
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
                     struct file *tfile, int fd, int full_check)
{
        int error, pwake = 0;
        __poll_t revents;
        struct epitem *epi;
        struct ep_pqueue epq;
        struct eventpoll *tep = NULL;

        /* tep is set only when the target file is itself an epoll file */
        if (is_file_epoll(tfile))
                tep = tfile->private_data;

        lockdep_assert_irqs_enabled();

        /* Account the new watch; undone on the early error paths below */
        if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
                                            max_user_watches) >= 0))
                return -ENOSPC;
        percpu_counter_inc(&ep->user->epoll_watches);

        if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
                percpu_counter_dec(&ep->user->epoll_watches);
                return -ENOMEM;
        }

        /* Item initialization follows here ... */
        INIT_LIST_HEAD(&epi->rdllink);
        epi->ep = ep;
        ep_set_ffd(&epi->ffd, tfile, fd);
        epi->event = *event;
        epi->next = EP_UNACTIVE_PTR;

        /* The target epoll's mtx is held until the rbtree insertion below */
        if (tep)
                mutex_lock_nested(&tep->mtx, 1);
        /* Add the current item to the list of active epoll hook for this file */
        if (unlikely(attach_epitem(tfile, epi) < 0)) {
                if (tep)
                        mutex_unlock(&tep->mtx);
                kmem_cache_free(epi_cache, epi);
                percpu_counter_dec(&ep->user->epoll_watches);
                return -ENOMEM;
        }

        if (full_check && !tep)
                list_file(tfile);

        /*
         * Add the current item to the RB tree. All RB tree operations are
         * protected by "mtx", and ep_insert() is called with "mtx" held.
         */
        ep_rbtree_insert(ep, epi);
        if (tep)
                mutex_unlock(&tep->mtx);

        /*
         * ep_remove_safe() calls in the later error paths can't lead to
         * ep_free() as the ep file itself still holds an ep reference.
         */
        ep_get(ep);

        /* now check if we've created too many backpaths */
        if (unlikely(full_check && reverse_path_check())) {
                ep_remove_safe(ep, epi);
                return -EINVAL;
        }

        if (epi->event.events & EPOLLWAKEUP) {
                error = ep_create_wakeup_source(epi);
                if (error) {
                        ep_remove_safe(ep, epi);
                        return error;
                }
        }

        /* Initialize the poll table using the queue callback */
        epq.epi = epi;
        init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

        /*
         * Attach the item to the poll hooks and get current event bits.
         * We can safely use the file* here because its usage count has
         * been increased by the caller of this function. Note that after
         * this operation completes, the poll callback can start hitting
         * the new item.
         */
        revents = ep_item_poll(epi, &epq.pt, 1);

        /*
         * We have to check if something went wrong during the poll wait queue
         * install process. Namely, an allocation for a wait queue failed due
         * to high memory pressure (ep_ptable_queue_proc() then clears
         * epq.epi).
         */
        if (unlikely(!epq.epi)) {
                ep_remove_safe(ep, epi);
                return -ENOMEM;
        }

        /* We have to drop the new item inside our item list to keep track of it */
        write_lock_irq(&ep->lock);

        /* record NAPI ID of new item if present */
        ep_set_busy_poll_napi_id(epi);

        /* If the file is already "ready" we drop it inside the ready list */
        if (revents && !ep_is_linked(epi)) {
                list_add_tail(&epi->rdllink, &ep->rdllist);
                ep_pm_stay_awake(epi);

                /* Notify waiting tasks that events are available */
                if (waitqueue_active(&ep->wq))
                        wake_up(&ep->wq);
                if (waitqueue_active(&ep->poll_wait))
                        pwake++;
        }

        write_unlock_irq(&ep->lock);

        /* We have to call this outside the lock */
        if (pwake)
                ep_poll_safewake(ep, NULL, 0);

        return 0;
}

/*
 * Modify the interest event mask by dropping an event if the new mask
 * has a match in the current file status. Must be called with "mtx" held.
 *
 * Replaces the event mask and user data of @epi with those from @event,
 * creates or destroys the item's wakeup source to match EPOLLWAKEUP, then
 * re-polls the item and queues it on the ready list if it is ready.
 * Always returns 0.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi,
                     const struct epoll_event *event)
{
        int pwake = 0;
        poll_table pt;

        lockdep_assert_irqs_enabled();

        init_poll_funcptr(&pt, NULL);

        /*
         * Set the new event interest mask before calling f_op->poll();
         * otherwise we might miss an event that happens between the
         * f_op->poll() call and the new event set registering.
         */
        epi->event.events = event->events; /* need barrier below */
        epi->event.data = event->data; /* protected by mtx */
        if (epi->event.events & EPOLLWAKEUP) {
                /*
                 * The return value is not checked here: if registration
                 * fails, the item is left without a wakeup source.
                 */
                if (!ep_has_wakeup_source(epi))
                        ep_create_wakeup_source(epi);
        } else if (ep_has_wakeup_source(epi)) {
                ep_destroy_wakeup_source(epi);
        }

        /*
         * The following barrier has two effects:
         *
         * 1) Flush epi changes above to other CPUs.  This ensures
         *    we do not miss events from ep_poll_callback if an
         *    event occurs immediately after we call f_op->poll().
         *    We need this because we did not take ep->lock while
         *    changing epi above (but ep_poll_callback does take
         *    ep->lock).
         *
         * 2) We also need to ensure we do not miss _past_ events
         *    when calling f_op->poll().  This barrier also
         *    pairs with the barrier in wq_has_sleeper (see
         *    comments for wq_has_sleeper).
         *
         * This barrier will now guarantee ep_poll_callback or f_op->poll
         * (or both) will notice the readiness of an item.
         */
        smp_mb();

        /*
         * Get current event bits. We can safely use the file* here because
         * its usage count has been increased by the caller of this function.
         * If the item is "hot" and it is not registered inside the ready
         * list, push it inside.
         */
        if (ep_item_poll(epi, &pt, 1)) {
                write_lock_irq(&ep->lock);
                if (!ep_is_linked(epi)) {
                        list_add_tail(&epi->rdllink, &ep->rdllist);
                        ep_pm_stay_awake(epi);

                        /* Notify waiting tasks that events are available */
                        if (waitqueue_active(&ep->wq))
                                wake_up(&ep->wq);
                        if (waitqueue_active(&ep->poll_wait))
                                pwake++;
                }
                write_unlock_irq(&ep->lock);
        }

        /* We have to call this outside the lock */
        if (pwake)
                ep_poll_safewake(ep, NULL, 0);

        return 0;
}

/*
 * Transfer up to @maxevents ready events from @ep to the userspace buffer
 * @events.
 *
 * Returns the number of events copied out (possibly 0), -EINTR if a fatal
 * signal is pending for the current task, or -EFAULT if the very first copy
 * to userspace fails. If a later copy fails, the count of events already
 * delivered is returned instead.
 */
static int ep_send_events(struct eventpoll *ep,
                          struct epoll_event __user *events, int maxevents)
{
        struct epitem *epi, *tmp;
        LIST_HEAD(txlist);
        poll_table pt;
        int res = 0;

        /*
         * Always short-circuit for fatal signals to allow threads to make a
         * timely exit without the chance of finding more events available and
         * fetching repeatedly.
         */
        if (fatal_signal_pending(current))
                return -EINTR;

        init_poll_funcptr(&pt, NULL);

        mutex_lock(&ep->mtx);
        ep_start_scan(ep, &txlist);

        /*
         * We can loop without lock because we are passed a task private list.
         * Items cannot vanish during the loop we are holding ep->mtx.
         */
        list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
                struct wakeup_source *ws;
                __poll_t revents;

                if (res >= maxevents)
                        break;

                /*
                 * Activate ep->ws before deactivating epi->ws to prevent
                 * triggering auto-suspend here (in case we reactivate epi->ws
                 * below).
                 *
                 * This could be rearranged to delay the deactivation of epi->ws
                 * instead, but then epi->ws would temporarily be out of sync
                 * with ep_is_linked().
                 */
                ws = ep_wakeup_source(epi);
                if (ws) {
                        if (ws->active)
                                __pm_stay_awake(ep->ws);
                        __pm_relax(ws);
                }

                list_del_init(&epi->rdllink);

                /*
                 * If the event mask intersects the caller-requested one,
                 * deliver the event to userspace. Again, we are holding ep->mtx,
                 * so no operations coming from userspace can change the item.
                 */
                revents = ep_item_poll(epi, &pt, 1);
                if (!revents)
                        continue;

                events = epoll_put_uevent(revents, epi->event.data, events);
                if (!events) {
                        /*
                         * The copy to userspace failed: put the item back at
                         * the head of txlist and stop. -EFAULT is reported
                         * only if nothing has been delivered so far.
                         */
                        list_add(&epi->rdllink, &txlist);
                        ep_pm_stay_awake(epi);
                        if (!res)
                                res = -EFAULT;
                        break;
                }
                res++;
                /* One-shot items keep only the EP_PRIVATE_BITS of their mask */
                if (epi->event.events & EPOLLONESHOT)
                        epi->event.events &= EP_PRIVATE_BITS;
                else if (!(epi->event.events & EPOLLET)) {
                        /*
                         * If this file has been added with Level
                         * Trigger mode, we need to insert back inside
                         * the ready list, so that the next call to
                         * epoll_wait() will check again the events
                         * availability. At this point, no one can insert
                         * into ep->rdllist besides us. The epoll_ctl()
                         * callers are locked out by
                         * ep_send_events() holding "mtx" and the
                         * poll callback will queue them in ep->ovflist.
                         */
                        list_add_tail(&epi->rdllink, &ep->rdllist);
                        ep_pm_stay_awake(epi);
                }
        }
        ep_done_scan(ep, &txlist);
        mutex_unlock(&ep->mtx);

        return res;
}

/*
 * Convert a timeout of @ms milliseconds into a timespec stored in @to.
 *
 * Returns NULL when @ms is negative. For @ms == 0, @to is set to zero and
 * returned. Otherwise @to receives the time read by ktime_get_ts64() plus
 * the timeout, added with timespec64_add_safe(), and is returned.
 */
static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
{
        struct timespec64 now;

        if (ms < 0)
                return NULL;

        /* Split into seconds and nanoseconds; both are zero for ms == 0. */
        to->tv_sec = ms / MSEC_PER_SEC;
        to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
        if (ms == 0)
                return to;

        ktime_get_ts64(&now);
        *to = timespec64_add_safe(now, *to);
        return to;
}

/*
 * Like autoremove_wake_function, except that the entry is removed even when
 * the wakeup fails. default_wake_function/ttwu only fails if the thread is
 * already woken, and in that case the ep_poll loop removes the entry anyway
 * rather than trying to reuse it.
 *
 * Returns whatever default_wake_function() returned.
 */
static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
                                       unsigned int mode, int sync, void *key)
{
        const int woken = default_wake_function(wq_entry, mode, sync, key);

        /*
         * Pairs with list_empty_careful in ep_poll, and ensures future loop
         * iterations see the cause of this wakeup.
         */
        list_del_init_careful(&wq_entry->entry);

        return woken;
}

/**
 * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           timespec. If the timeout is zero, the function will not block,
 *           while if the @timeout ptr is NULL, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * Return: the number of ready events which have been fetched, %zero if the
 *          timeout expired without any event being delivered, or an error
 *          code in case of error (-EINTR when a signal is pending).
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                   int maxevents, struct timespec64 *timeout)
{
        int res, eavail, timed_out = 0;
        u64 slack = 0;
        wait_queue_entry_t wait;
        ktime_t expires, *to = NULL;

        lockdep_assert_irqs_enabled();

        if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
                /* Non-zero timeout: sleep with an expiry time and slack */
                slack = select_estimate_accuracy(timeout);
                to = &expires;
                *to = timespec64_to_ktime(*timeout);
        } else if (timeout) {
                /*
                 * Avoid the unnecessary trip to the wait queue loop, if the
                 * caller specified a non blocking operation.
                 */
                timed_out = 1;
        }

        /*
         * This call is racy: We may or may not see events that are being added
         * to the ready list under the lock (e.g., in IRQ callbacks). For cases
         * with a non-zero timeout, this thread will check the ready list under
         * lock and will add to the wait queue.  For cases with a zero
         * timeout, the user by definition should not care and will have to
         * recheck again.
         */
        eavail = ep_events_available(ep);

        while (1) {
                if (eavail) {
                        /*
                         * Try to transfer events to user space. In case we get
                         * 0 events and there's still timeout left over, we go
                         * trying again in search of more luck.
                         */
                        res = ep_send_events(ep, events, maxevents);
                        if (res)
                                return res;
                }

                if (timed_out)
                        return 0;

                eavail = ep_busy_loop(ep, timed_out);
                if (eavail)
                        continue;

                if (signal_pending(current))
                        return -EINTR;

                /*
                 * Internally init_wait() uses autoremove_wake_function(),
                 * thus wait entry is removed from the wait queue on each
                 * wakeup. Why is this important? In case of several waiters
                 * each new wakeup will hit the next waiter, giving it the
                 * chance to harvest new event. Otherwise wakeup can be
                 * lost. This is also good performance-wise, because on
                 * normal wakeup path no need to call __remove_wait_queue()
                 * explicitly, thus ep->lock is not taken, which halts the
                 * event delivery.
                 *
                 * In fact, we now use an even more aggressive function that
                 * unconditionally removes, because we don't reuse the wait
                 * entry between loop iterations. This lets us also avoid the
                 * performance issue if a process is killed, causing all of its
                 * threads to wake up without being removed normally.
                 */
                init_wait(&wait);
                wait.func = ep_autoremove_wake_function;

                write_lock_irq(&ep->lock);
                /*
                 * Barrierless variant, waitqueue_active() is called under
                 * the same lock on wakeup ep_poll_callback() side, so it
                 * is safe to avoid an explicit barrier.
                 */
                __set_current_state(TASK_INTERRUPTIBLE);

                /*
                 * Do the final check under the lock. ep_start/done_scan()
                 * plays with two lists (->rdllist and ->ovflist) and there
                 * is always a race when both lists are empty for short
                 * period of time although events are pending, so lock is
                 * important.
                 */
                eavail = ep_events_available(ep);
                if (!eavail)
                        __add_wait_queue_exclusive(&ep->wq, &wait);

                write_unlock_irq(&ep->lock);

                /* Sleep only if nothing was found by the check under the lock */
                if (!eavail)
                        timed_out = !schedule_hrtimeout_range(to, slack,
                                                              HRTIMER_MODE_ABS);
                __set_current_state(TASK_RUNNING);

                /*
                 * We were woken up, thus go and try to harvest some events.
                 * If timed out and still on the wait queue, recheck eavail
                 * carefully under lock, below.
                 */
                eavail = 1;

                if (!list_empty_careful(&wait.entry)) {
                        write_lock_irq(&ep->lock);
                        /*
                         * If the thread timed out and is not on the wait queue,
                         * it means that the thread was woken up after its
                         * timeout expired before it could reacquire the lock.
                         * Thus, when wait.entry is empty, it needs to harvest
                         * events.
                         */
                        if (timed_out)
                                eavail = list_empty(&wait.entry);
                        __remove_wait_queue(&ep->wq, &wait);
                        write_unlock_irq(&ep->lock);
                }
        }
}

/**
 * ep_loop_check_proc - verify that adding an epoll file inside another
 *                      epoll structure does not violate the constraints, in
 *                      terms of closed loops, or too deep chains (which can
 *                      result in excessive stack usage).
 *
 * @ep: the &struct eventpoll to be currently checked.
 * @depth: Current depth of the path being checked.
 *
 * Return: %zero if adding the epoll @file inside current epoll
 *          structure @ep does not violate the constraints, or %-1 otherwise.
 */
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
        int error = 0;
        struct rb_node *rbp;
        struct epitem *epi;

        mutex_lock_nested(&ep->mtx, depth + 1);
        ep->gen = loop_check_gen;
        for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
                epi = rb_entry(rbp, struct epitem, rbn);
                if (unlikely(is_file_epoll(epi->ffd.file))) {
                        struct eventpoll *ep_tovisit;
                        ep_tovisit = epi->ffd.file->private_data;
                        if (ep_tovisit->gen == loop_check_gen)
                                continue;
                        if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
                                error = -1;
                        else
                                error = ep_loop_check_proc(ep_tovisit, depth + 1);
                        if (error != 0)
                                break;
                } else {
                        /*
                         * If we've reached a file that is not associated with
                         * an ep, then we need to check if the newly added
                         * links are going to add too many wakeup paths. We do
                         * this by adding it to the tfile_check_list, if it's
                         * not already there, and calling reverse_path_check()
                         * during ep_insert().
                         */
                        list_file(epi->ffd.file);
                }
        }
        mutex_unlock(&ep->mtx);

        return error;
}

/**
 * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
 *                 into another epoll file (represented by @ep) does not create
 *                 closed loops or too deep chains.
 *
 * @ep: Pointer to the epoll we are inserting into.
 * @to: Pointer to the epoll to be inserted.
 *
 * Records @ep in the file-level inserting_into variable and then starts
 * ep_loop_check_proc() on @to at depth 0.
 *
 * Return: %zero if adding the epoll @to inside the epoll @ep
 * does not violate the constraints, or %-1 otherwise.
 */
static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
{
        inserting_into = ep;
        return ep_loop_check_proc(to, 0);
}

/*
 * Empty tfile_check_list: pop each epitems_head off the front and hand it
 * to unlist_file(), all inside one RCU read-side critical section.
 */
static void clear_tfile_check_list(void)
{
        struct epitems_head *head;

        rcu_read_lock();
        for (head = tfile_check_list; head != EP_UNACTIVE_PTR;
             head = tfile_check_list) {
                /* Advance the list before the entry is unlisted. */
                tfile_check_list = head->next;
                unlist_file(head);
        }
        rcu_read_unlock();
}

/*
 * Open an eventpoll file descriptor.
 *
 * @flags may only contain EPOLL_CLOEXEC. Returns the new file descriptor on
 * success, -EINVAL for any other flag bit, or the error reported by
 * ep_alloc(), get_unused_fd_flags() or anon_inode_getfile().
 */
static int do_epoll_create(int flags)
{
        struct eventpoll *ep = NULL;
        struct file *file;
        int open_flags;
        int fd, ret;

        /* Check the EPOLL_* constant for consistency.  */
        BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

        if (flags & ~EPOLL_CLOEXEC)
                return -EINVAL;
        open_flags = O_RDWR | (flags & O_CLOEXEC);

        /* Create the internal data structure ("struct eventpoll"). */
        ret = ep_alloc(&ep);
        if (ret < 0)
                return ret;

        /*
         * Create all the items needed to set up an eventpoll file. That is,
         * a free file descriptor and a file structure.
         */
        fd = get_unused_fd_flags(open_flags);
        if (fd < 0) {
                ret = fd;
                goto err_put_ep;
        }

        file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
                                  open_flags);
        if (IS_ERR(file)) {
                ret = PTR_ERR(file);
                goto err_put_fd;
        }

#ifdef CONFIG_NET_RX_BUSY_POLL
        ep->busy_poll_usecs = 0;
        ep->busy_poll_budget = 0;
        ep->prefer_busy_poll = false;
#endif
        ep->file = file;

        /* Bind the descriptor to the file last, once setup is complete. */
        fd_install(fd, file);
        return fd;

err_put_fd:
        put_unused_fd(fd);
err_put_ep:
        ep_clear_and_put(ep);
        return ret;
}

/*
 * epoll_create1 system call: forwards @flags unchanged to do_epoll_create(),
 * which rejects anything other than EPOLL_CLOEXEC with -EINVAL, and returns
 * its result.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
        return do_epoll_create(flags);
}

/*
 * epoll_create system call: @size is only checked for being positive
 * (-EINVAL otherwise); the eventpoll itself is created with no flags.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
        if (size > 0)
                return do_epoll_create(0);

        return -EINVAL;
}

/*
 * Strip EPOLLWAKEUP from @epev when it may not be used.
 *
 * With CONFIG_PM_SLEEP the flag is kept only if the caller has
 * CAP_BLOCK_SUSPEND; capable() is consulted only when the flag is actually
 * set. Without CONFIG_PM_SLEEP the flag is always cleared.
 */
static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
{
#ifdef CONFIG_PM_SLEEP
        if (!(epev->events & EPOLLWAKEUP) || capable(CAP_BLOCK_SUSPEND))
                return;
#endif
        epev->events &= ~EPOLLWAKEUP;
}

/*
 * Acquire @mutex, either blocking or not.
 *
 * When @nonblock is false the mutex is taken with mutex_lock_nested() using
 * @depth as the nesting subclass and 0 is returned. When @nonblock is true
 * only a trylock is attempted: 0 on success, -EAGAIN if the mutex is busy.
 */
static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
                                   bool nonblock)
{
        if (nonblock)
                return mutex_trylock(mutex) ? 0 : -EAGAIN;

        mutex_lock_nested(mutex, depth);
        return 0;
}

/*
 * Apply one control operation (add / delete / modify) to an epoll instance.
 *
 * epfd:     descriptor of the epoll instance to operate on.
 * op:       EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD; any other value
 *           falls through the switch below and yields -EINVAL.
 * fd:       descriptor of the target file.
 * epds:     event description for the operation. Its events mask is adjusted
 *           in place: EPOLLWAKEUP may be cleared, and EPOLLERR | EPOLLHUP are
 *           forced on for ADD and MOD.
 * nonblock: when true, mutexes are only try-locked and -EAGAIN is returned
 *           on contention instead of sleeping.
 *
 * Returns the result of ep_insert()/ep_modify() for ADD/MOD, 0 for a
 * successful DEL, or a negative error code set on the failure paths below
 * (-EBADF, -EPERM, -EINVAL, -ELOOP, -EEXIST, -ENOENT, -EAGAIN).
 */
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
                 bool nonblock)
{
        int error;
        int full_check = 0;     /* set once epnested_mutex is held */
        struct fd f, tf;
        struct eventpoll *ep;
        struct epitem *epi;
        struct eventpoll *tep = NULL;

        error = -EBADF;
        f = fdget(epfd);
        if (!f.file)
                goto error_return;

        /* Get the "struct file *" for the target file */
        tf = fdget(fd);
        if (!tf.file)
                goto error_fput;

        /* The target file descriptor must support poll */
        error = -EPERM;
        if (!file_can_poll(tf.file))
                goto error_tgt_fput;

        /* Check if EPOLLWAKEUP is allowed */
        if (ep_op_has_event(op))
                ep_take_care_of_epollwakeup(epds);

        /*
         * We have to check that the file structure underneath the file descriptor
         * the user passed to us _is_ an eventpoll file. And also we do not permit
         * adding an epoll file descriptor inside itself.
         */
        error = -EINVAL;
        if (f.file == tf.file || !is_file_epoll(f.file))
                goto error_tgt_fput;

        /*
         * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
         * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
         * Also, we do not currently support nested exclusive wakeups.
         */
        if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
                if (op == EPOLL_CTL_MOD)
                        goto error_tgt_fput;
                if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
                                (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
                        goto error_tgt_fput;
        }

        /*
         * At this point it is safe to assume that the "private_data" contains
         * our own data structure.
         */
        ep = f.file->private_data;

        /*
         * When we insert an epoll file descriptor inside another epoll file
         * descriptor, there is the chance of creating closed loops, which are
         * better handled here than in more critical paths. While we are
         * checking for loops we also determine the list of files reachable
         * and hang them on the tfile_check_list, so we can check that we
         * haven't created too many possible wakeup paths.
         *
         * We do not need to take the global 'epnested_mutex' on EPOLL_CTL_ADD
         * when the epoll file descriptor is attaching directly to a wakeup
         * source, unless the epoll file descriptor is nested. The purpose of
         * taking the 'epnested_mutex' on add is to prevent complex topologies
         * such as loops and deep wakeup paths from forming in parallel through
         * multiple EPOLL_CTL_ADD operations.
         */
        error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
        if (error)
                goto error_tgt_fput;
        if (op == EPOLL_CTL_ADD) {
                if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
                    is_file_epoll(tf.file)) {
                        /*
                         * Drop ep->mtx before taking epnested_mutex; it is
                         * re-acquired below once the loop check has passed.
                         */
                        mutex_unlock(&ep->mtx);
                        error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
                        if (error)
                                goto error_tgt_fput;
                        loop_check_gen++;
                        /* From here on the exit path must release epnested_mutex. */
                        full_check = 1;
                        if (is_file_epoll(tf.file)) {
                                tep = tf.file->private_data;
                                error = -ELOOP;
                                if (ep_loop_check(ep, tep) != 0)
                                        goto error_tgt_fput;
                        }
                        error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
                        if (error)
                                goto error_tgt_fput;
                }
        }

        /*
         * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
         * above, we can be sure to be able to use the item looked up by
         * ep_find() till we release the mutex.
         */
        epi = ep_find(ep, tf.file, fd);

        /* Default result for an unknown op or a rejected EPOLL_CTL_MOD. */
        error = -EINVAL;
        switch (op) {
        case EPOLL_CTL_ADD:
                if (!epi) {
                        /* EPOLLERR and EPOLLHUP are always part of the mask. */
                        epds->events |= EPOLLERR | EPOLLHUP;
                        error = ep_insert(ep, epds, tf.file, fd, full_check);
                } else
                        error = -EEXIST;
                break;
        case EPOLL_CTL_DEL:
                if (epi) {
                        /*
                         * The eventpoll itself is still alive: the refcount
                         * can't go to zero here.
                         */
                        ep_remove_safe(ep, epi);
                        error = 0;
                } else {
                        error = -ENOENT;
                }
                break;
        case EPOLL_CTL_MOD:
                if (epi) {
                        /*
                         * An item registered with EPOLLEXCLUSIVE cannot be
                         * modified; error stays at -EINVAL in that case.
                         */
                        if (!(epi->event.events & EPOLLEXCLUSIVE)) {
                                epds->events |= EPOLLERR | EPOLLHUP;
                                error = ep_modify(ep, epi, epds);
                        }
                } else
                        error = -ENOENT;
                break;
        }
        mutex_unlock(&ep->mtx);

error_tgt_fput:
        /*
         * Every path that jumps here does so without holding ep->mtx.
         * epnested_mutex is held if and only if full_check was set.
         */
        if (full_check) {
                clear_tfile_check_list();
                loop_check_gen++;
                mutex_unlock(&epnested_mutex);
        }

        fdput(tf);
error_fput:
        fdput(f);
error_return:

        return error;
}

/*
 * epoll_ctl(2): the control interface of an eventpoll file, used to insert,
 * remove or change file descriptors in its interest set. The user-supplied
 * event is copied in only for operations that carry one, and the request
 * is then passed to do_epoll_ctl() in blocking mode.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
                struct epoll_event __user *, event)
{
        struct epoll_event epds;

        if (ep_op_has_event(op)) {
                if (copy_from_user(&epds, event, sizeof(epds)))
                        return -EFAULT;
        }

        return do_epoll_ctl(epfd, op, fd, &epds, false);
}

/*
 * Common back end of the epoll wait system calls.
 *
 * Validates @maxevents and the user buffer, resolves @epfd to an eventpoll
 * file and then lets ep_poll() collect events, with @to passed through as
 * the timeout. Returns ep_poll()'s result, or -EINVAL / -EFAULT / -EBADF
 * when one of the checks fails.
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
                         int maxevents, struct timespec64 *to)
{
        struct fd f;
        int error;

        /* At least one event slot, and no more than EP_MAX_EVENTS. */
        if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
                return -EINVAL;

        /* The whole result array must be a valid user range. */
        if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
                return -EFAULT;

        f = fdget(epfd);
        if (!f.file)
                return -EBADF;

        /*
         * Only an eventpoll file carries a "struct eventpoll" in its
         * private_data; anything else is rejected.
         */
        if (is_file_epoll(f.file))
                error = ep_poll(f.file->private_data, events, maxevents, to);
        else
                error = -EINVAL;

        fdput(f);
        return error;
}

/*
 * epoll_wait(2): convert the integer timeout with ep_timeout_to_timespec()
 * and wait via do_epoll_wait().
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
                int, maxevents, int, timeout)
{
        struct timespec64 ts;
        struct timespec64 *deadline = ep_timeout_to_timespec(&ts, timeout);

        return do_epoll_wait(epfd, events, maxevents, deadline);
}

/*
 * Common back end of epoll_pwait(2) and epoll_pwait2(2).
 *
 * Installs the caller-supplied signal mask for the duration of the wait,
 * runs do_epoll_wait(), and afterwards restores the saved mask unless the
 * wait ended with -EINTR. A failure to set the mask is returned as is.
 */
static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
                          int maxevents, struct timespec64 *to,
                          const sigset_t __user *sigmask, size_t sigsetsize)
{
        int ret = set_user_sigmask(sigmask, sigsetsize);

        if (ret)
                return ret;

        ret = do_epoll_wait(epfd, events, maxevents, to);
        restore_saved_sigmask_unless(ret == -EINTR);

        return ret;
}

/*
 * epoll_pwait(2): like epoll_wait(2) but with a temporary signal mask; the
 * integer timeout is converted with ep_timeout_to_timespec().
 */
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
                int, maxevents, int, timeout, const sigset_t __user *, sigmask,
                size_t, sigsetsize)
{
        struct timespec64 ts;
        struct timespec64 *deadline = ep_timeout_to_timespec(&ts, timeout);

        return do_epoll_pwait(epfd, events, maxevents, deadline,
                              sigmask, sigsetsize);
}

/*
 * epoll_pwait2(2): variant of epoll_pwait(2) whose timeout is a user-space
 * struct __kernel_timespec. A NULL timeout pointer is passed on as a NULL
 * timespec; otherwise the value is copied in (-EFAULT on failure) and run
 * through poll_select_set_timeout() (-EINVAL on failure).
 */
SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
                int, maxevents, const struct __kernel_timespec __user *, timeout,
                const sigset_t __user *, sigmask, size_t, sigsetsize)
{
        struct timespec64 ts;
        struct timespec64 *deadline = NULL;

        if (timeout) {
                if (get_timespec64(&ts, timeout))
                        return -EFAULT;
                if (poll_select_set_timeout(&ts, ts.tv_sec, ts.tv_nsec))
                        return -EINVAL;
                deadline = &ts;
        }

        return do_epoll_pwait(epfd, events, maxevents, deadline,
                              sigmask, sigsetsize);
}

#ifdef CONFIG_COMPAT
/*
 * Compat counterpart of do_epoll_pwait(): the signal mask arrives as a
 * compat_sigset_t. The mask is applied for the wait and the saved one is
 * restored afterwards unless do_epoll_wait() returned -EINTR.
 */
static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
                                 int maxevents, struct timespec64 *timeout,
                                 const compat_sigset_t __user *sigmask,
                                 compat_size_t sigsetsize)
{
        long ret = set_compat_user_sigmask(sigmask, sigsetsize);

        if (ret)
                return ret;

        ret = do_epoll_wait(epfd, events, maxevents, timeout);
        restore_saved_sigmask_unless(ret == -EINTR);

        return ret;
}

/*
 * Compat epoll_pwait(2): convert the integer timeout with
 * ep_timeout_to_timespec() and wait through do_compat_epoll_pwait().
 */
COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
                       struct epoll_event __user *, events,
                       int, maxevents, int, timeout,
                       const compat_sigset_t __user *, sigmask,
                       compat_size_t, sigsetsize)
{
        struct timespec64 ts;
        struct timespec64 *deadline = ep_timeout_to_timespec(&ts, timeout);

        return do_compat_epoll_pwait(epfd, events, maxevents, deadline,
                                     sigmask, sigsetsize);
}

/*
 * Compat epoll_pwait2(2): a NULL timeout pointer is passed on as a NULL
 * timespec; otherwise the user timespec is copied in (-EFAULT on failure)
 * and run through poll_select_set_timeout() (-EINVAL on failure) before
 * waiting through do_compat_epoll_pwait().
 */
COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
                       struct epoll_event __user *, events,
                       int, maxevents,
                       const struct __kernel_timespec __user *, timeout,
                       const compat_sigset_t __user *, sigmask,
                       compat_size_t, sigsetsize)
{
        struct timespec64 ts;
        struct timespec64 *deadline = NULL;

        if (timeout) {
                if (get_timespec64(&ts, timeout))
                        return -EFAULT;
                if (poll_select_set_timeout(&ts, ts.tv_sec, ts.tv_nsec))
                        return -EINVAL;
                deadline = &ts;
        }

        return do_compat_epoll_pwait(epfd, events, maxevents, deadline,
                                     sigmask, sigsetsize);
}

#endif

/*
 * Boot-time setup for eventpoll, registered through fs_initcall(): computes
 * the per-user watch limit, creates the slab caches and registers the epoll
 * sysctls. Always returns 0.
 */
static int __init eventpoll_init(void)
{
        struct sysinfo si;

        /*
         * Compile-time check: with pointers of 8 bytes or less, a
         * "struct epitem" must not grow beyond 128 bytes. There can be many
         * thousands of them, so an extra cache line each would be costly.
         */
        BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);

        /*
         * Let each user spend at most 1/25th (4%) of non-high memory on
         * epoll watches, each watch being charged EP_ITEM_COST.
         */
        si_meminfo(&si);
        max_user_watches =
                (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
                EP_ITEM_COST;
        BUG_ON(max_user_watches < 0);

        /*
         * All caches below are created with SLAB_PANIC, and their results
         * are not checked here.
         */

        /* Cache backing "struct epitem" allocations. */
        epi_cache = kmem_cache_create("eventpoll_epi",
                                      sizeof(struct epitem), 0,
                                      SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
                                      NULL);

        /* Cache backing "struct eppoll_entry" allocations. */
        pwq_cache = kmem_cache_create("eventpoll_pwq",
                                      sizeof(struct eppoll_entry), 0,
                                      SLAB_PANIC|SLAB_ACCOUNT, NULL);
        epoll_sysctls_init();

        /* Cache backing "struct epitems_head" allocations. */
        ephead_cache = kmem_cache_create("ep_head",
                                         sizeof(struct epitems_head), 0,
                                         SLAB_PANIC|SLAB_ACCOUNT, NULL);

        return 0;
}
fs_initcall(eventpoll_init);










































































































































































































































































































































































































































































































































































































    5 




    2 

    2 

    1 

    1 



    1 























































    3 

    4 



    4 






    2 

















































































































































































































    9 





    2 














    6 




















    6 
    9 
    6 




    4 


    2 

    4 
    2 
    2 











    3 
    2 


















    2 









    3 





    3 






    4 









    4 


















    4 
















    4 












    4 















    2 












    4 







    2 



















    2 

    2 













    2 

















    3 


    2 
    5 


    4 

    2 
















    1 












    5 
    2 



























































    4 

























    1 












































































































































    1 

















    1 

    2 
    1 






















    1 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 













    4 
































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
// SPDX-License-Identifier: GPL-2.0
/*
 *  Kernel internal timers
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
 *
 *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
 *              "A Kernel Model for Precision Timekeeping" by Dave Mills
 *  1998-12-24  Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
 *              serialize accesses to xtime/lost_ticks).
 *                              Copyright (C) 1998  Andrea Arcangeli
 *  1999-03-10  Improved NTP compatibility by Ulrich Windl
 *  2002-05-31        Move sys_sysinfo here and make its locking sane, Robert Love
 *  2000-10-05  Implemented scalable SMP per-CPU timer handling.
 *                              Copyright (C) 2000, 2001, 2002  Ingo Molnar
 *              Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
 */

#include <linux/kernel_stat.h>
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pid_namespace.h>
#include <linux/notifier.h>
#include <linux/thread_info.h>
#include <linux/time.h>
#include <linux/jiffies.h>
#include <linux/posix-timers.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
#include <linux/irq_work.h>
#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/random.h>
#include <linux/sysctl.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
#include <asm/io.h>

#include "tick-internal.h"
#include "timer_migration.h"

#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>

/*
 * The 64-bit jiffies counter. It starts at INITIAL_JIFFIES rather than
 * zero and is exported for use outside this file.
 */
__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;

EXPORT_SYMBOL(jiffies_64);

/*
 * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
 * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
 * level has a different granularity.
 *
 * The level granularity is:                LVL_CLK_DIV ^ level
 * The level clock frequency is:        HZ / (LVL_CLK_DIV ^ level)
 *
 * The array level of a newly armed timer depends on the relative expiry
 * time. The farther away the expiry time is, the higher the array level and
 * therefore the coarser the granularity becomes.
 *
 * Contrary to the original timer wheel implementation, which aims for 'exact'
 * expiry of the timers, this implementation removes the need for recascading
 * the timers into the lower array levels. The previous 'classic' timer wheel
 * implementation of the kernel already violated the 'exact' expiry by adding
 * slack to the expiry time to provide batched expiration. The granularity
 * levels provide implicit batching.
 *
 * This is an optimization of the original timer wheel implementation for the
 * majority of the timer wheel use cases: timeouts. The vast majority of
 * timeout timers (networking, disk I/O ...) are canceled before expiry. If
 * the timeout expires it indicates that normal operation is disturbed, so it
 * does not matter much whether the timeout comes with a slight delay.
 *
 * The only exception to this are networking timers with a small expiry
 * time. They rely on the granularity. Those fit into the first wheel level,
 * which has HZ granularity.
 *
 * We don't have cascading anymore. Timers with an expiry time above the
 * capacity of the last wheel level are force expired at the maximum timeout
 * value of the last wheel level. From data sampling we know that the maximum
 * value observed is 5 days (network connection tracking), so this should not
 * be an issue.
 *
 * The currently chosen array constants values are a good compromise between
 * array size and granularity.
 *
 * This results in the following granularity and range levels:
 *
 * HZ 1000 steps
 * Level Offset  Granularity            Range
 *  0      0         1 ms                0 ms -         63 ms
 *  1     64         8 ms               64 ms -        511 ms
 *  2    128        64 ms              512 ms -       4095 ms (512ms - ~4s)
 *  3    192       512 ms             4096 ms -      32767 ms (~4s - ~32s)
 *  4    256      4096 ms (~4s)      32768 ms -     262143 ms (~32s - ~4m)
 *  5    320     32768 ms (~32s)    262144 ms -    2097151 ms (~4m - ~34m)
 *  6    384    262144 ms (~4m)    2097152 ms -   16777215 ms (~34m - ~4h)
 *  7    448   2097152 ms (~34m)  16777216 ms -  134217727 ms (~4h - ~1d)
 *  8    512  16777216 ms (~4h)  134217728 ms - 1073741822 ms (~1d - ~12d)
 *
 * HZ  300
 * Level Offset  Granularity            Range
 *  0           0         3 ms                0 ms -        210 ms
 *  1          64        26 ms              213 ms -       1703 ms (213ms - ~1s)
 *  2         128       213 ms             1706 ms -      13650 ms (~1s - ~13s)
 *  3         192      1706 ms (~1s)      13653 ms -     109223 ms (~13s - ~1m)
 *  4         256     13653 ms (~13s)    109226 ms -     873810 ms (~1m - ~14m)
 *  5         320    109226 ms (~1m)     873813 ms -    6990503 ms (~14m - ~1h)
 *  6         384    873813 ms (~14m)   6990506 ms -   55924050 ms (~1h - ~15h)
 *  7         448   6990506 ms (~1h)   55924053 ms -  447392423 ms (~15h - ~5d)
 *  8    512  55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
 *
 * HZ  250
 * Level Offset  Granularity            Range
 *  0           0         4 ms                0 ms -        255 ms
 *  1          64        32 ms              256 ms -       2047 ms (256ms - ~2s)
 *  2         128       256 ms             2048 ms -      16383 ms (~2s - ~16s)
 *  3         192      2048 ms (~2s)      16384 ms -     131071 ms (~16s - ~2m)
 *  4         256     16384 ms (~16s)    131072 ms -    1048575 ms (~2m - ~17m)
 *  5         320    131072 ms (~2m)    1048576 ms -    8388607 ms (~17m - ~2h)
 *  6         384   1048576 ms (~17m)   8388608 ms -   67108863 ms (~2h - ~18h)
 *  7         448   8388608 ms (~2h)   67108864 ms -  536870911 ms (~18h - ~6d)
 *  8    512  67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
 *
 * HZ  100
 * Level Offset  Granularity            Range
 *  0           0         10 ms               0 ms -        630 ms
 *  1          64         80 ms             640 ms -       5110 ms (640ms - ~5s)
 *  2         128        640 ms            5120 ms -      40950 ms (~5s - ~40s)
 *  3         192       5120 ms (~5s)     40960 ms -     327670 ms (~40s - ~5m)
 *  4         256      40960 ms (~40s)   327680 ms -    2621430 ms (~5m - ~43m)
 *  5         320     327680 ms (~5m)   2621440 ms -   20971510 ms (~43m - ~5h)
 *  6         384    2621440 ms (~43m) 20971520 ms -  167772150 ms (~5h - ~1d)
 *  7         448   20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
 */

/*
 * Clock divisor for the next level: every level is clocked LVL_CLK_DIV
 * (1 << LVL_CLK_SHIFT) times slower than the level below it.
 */
#define LVL_CLK_SHIFT        3
#define LVL_CLK_DIV        (1UL << LVL_CLK_SHIFT)
#define LVL_CLK_MASK        (LVL_CLK_DIV - 1)
/* Right-shift that converts jiffies into the clock units of level n */
#define LVL_SHIFT(n)        ((n) * LVL_CLK_SHIFT)
/* Granularity of level n, expressed in jiffies */
#define LVL_GRAN(n)        (1UL << LVL_SHIFT(n))

/*
 * The time start value for each level to select the bucket at enqueue
 * time. We start from the last possible delta of the previous level
 * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
 */
#define LVL_START(n)        ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))

/* Size of each clock level: LVL_SIZE buckets, indexed with LVL_MASK */
#define LVL_BITS        6
#define LVL_SIZE        (1UL << LVL_BITS)
#define LVL_MASK        (LVL_SIZE - 1)
/* Index of the first bucket of level n within the flat wheel array */
#define LVL_OFFS(n)        ((n) * LVL_SIZE)

/* Level depth: one level fewer is used when HZ is 100 or below */
#if HZ > 100
# define LVL_DEPTH        9
# else
# define LVL_DEPTH        8
#endif

/*
 * The cutoff (max. capacity of the wheel). Deltas at or beyond
 * WHEEL_TIMEOUT_CUTOFF are clamped to WHEEL_TIMEOUT_MAX by
 * calc_wheel_index().
 */
#define WHEEL_TIMEOUT_CUTOFF        (LVL_START(LVL_DEPTH))
#define WHEEL_TIMEOUT_MAX        (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))

/*
 * The resulting size of a single wheel. One wheel is allocated per timer
 * base: if NOHZ is configured there are NR_BASES == 3 of them per CPU
 * (local, global and a separate storage for the deferrable timers),
 * otherwise a single base is used for everything.
 */
#define WHEEL_SIZE        (LVL_SIZE * LVL_DEPTH)

#ifdef CONFIG_NO_HZ_COMMON
/*
 * If multiple bases need to be locked, use the base ordering for lock
 * nesting, i.e. lowest number first.
 */
# define NR_BASES        3
# define BASE_LOCAL        0
# define BASE_GLOBAL        1
# define BASE_DEF        2
#else
/* Without NOHZ all base identifiers alias the one and only base. */
# define NR_BASES        1
# define BASE_LOCAL        0
# define BASE_GLOBAL        0
# define BASE_DEF        0
#endif

/**
 * struct timer_base - Per CPU timer base (number of bases depends on config)
 * @lock:                Lock protecting the timer_base
 * @running_timer:        When expiring timers, the lock is dropped. To make
 *                        sure not to race against deleting/modifying a
 *                        currently running timer, the pointer is set to the
 *                        timer, which expires at the moment. If no timer is
 *                        running, the pointer is NULL.
 * @expiry_lock:        PREEMPT_RT only: Lock is taken in softirq around
 *                        timer expiry callback execution and when trying to
 *                        delete a running timer and it wasn't successful at
 *                        first glance. It prevents priority inversion
 *                        when callback was preempted on a remote CPU and a
 *                        caller tries to delete the running timer. It also
 *                        prevents a livelock, when the task which tries to
 *                        delete a timer preempted the softirq thread which
 *                        is running the timer callback function.
 * @timer_waiters:        PREEMPT_RT only: Tells, if there is a waiter
 *                        waiting for the end of the timer callback function
 *                        execution.
 * @clk:                clock of the timer base; is updated before enqueue
 *                        of a timer; during expiry, it is 1 offset ahead of
 *                        jiffies to avoid endless requeuing to current
 *                        jiffies
 * @next_expiry:        expiry value of the first timer; it is updated when
 *                        finding the next timer and during enqueue; the
 *                        value is not valid, when next_expiry_recalc is set
 * @cpu:                Number of CPU the timer base belongs to
 * @next_expiry_recalc: States, whether a recalculation of next_expiry is
 *                        required. Value is set true, when a timer was
 *                        deleted.
 * @is_idle:                Is set, when timer_base is idle. It is triggered by NOHZ
 *                        code. This state is only used in standard
 *                        base. Deferrable timers, which are enqueued remotely
 *                        never wake up an idle CPU. So there is no need to
 *                        support it for this base.
 * @timers_pending:        Is set, when a timer is pending in the base. It is only
 *                        reliable when next_expiry_recalc is not set.
 * @pending_map:        bitmap of the timer wheel; each bit reflects a
 *                        bucket of the wheel. When a bit is set, at least a
 *                        single timer is enqueued in the related bucket.
 * @vectors:                Array of lists; Each array member reflects a bucket
 *                        of the timer wheel. The list contains all timers
 *                        which are enqueued into a specific bucket.
 */
struct timer_base {
        raw_spinlock_t                lock;
        struct timer_list        *running_timer;
#ifdef CONFIG_PREEMPT_RT
        spinlock_t                expiry_lock;
        atomic_t                timer_waiters;
#endif
        unsigned long                clk;
        unsigned long                next_expiry;
        unsigned int                cpu;
        bool                        next_expiry_recalc;
        bool                        is_idle;
        bool                        timers_pending;
        DECLARE_BITMAP(pending_map, WHEEL_SIZE);
        struct hlist_head        vectors[WHEEL_SIZE];
} ____cacheline_aligned;

/* Every CPU owns NR_BASES timer bases, indexed by the BASE_* constants. */
static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);

#ifdef CONFIG_NO_HZ_COMMON

/* Tested by is_timers_nohz_active(); switched on in timer_update_keys(). */
static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
/* Serializes the static key updates done in this file. */
static DEFINE_MUTEX(timer_keys_mutex);

/* Work item which timers_update_nohz() schedules to run timer_update_keys(). */
static void timer_update_keys(struct work_struct *work);
static DECLARE_WORK(timer_update_work, timer_update_keys);

#ifdef CONFIG_SMP
/*
 * Value behind the "timer_migration" sysctl entry (when CONFIG_SYSCTL is
 * set); defaults to enabled.
 */
static unsigned int sysctl_timer_migration = 1;

/* Kept in sync with the sysctl and tick_nohz_active by timers_update_migration(). */
DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);

/*
 * Bring the timers_migration_enabled static key in line with the current
 * state: migration is on only while both the sysctl knob and
 * tick_nohz_active are set.
 */
static void timers_update_migration(void)
{
        if (!sysctl_timer_migration || !tick_nohz_active) {
                static_branch_disable(&timers_migration_enabled);
                return;
        }

        static_branch_enable(&timers_migration_enabled);
}

#ifdef CONFIG_SYSCTL
/*
 * sysctl handler for "timer_migration". The integer is parsed by
 * proc_dointvec_minmax(); after a successful write the migration static
 * key is re-evaluated. Everything runs under timer_keys_mutex.
 *
 * Returns whatever proc_dointvec_minmax() returned.
 */
static int timer_migration_handler(struct ctl_table *table, int write,
                            void *buffer, size_t *lenp, loff_t *ppos)
{
        int err;

        mutex_lock(&timer_keys_mutex);

        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        /* Reads and failed writes leave the static key untouched. */
        if (write && err == 0)
                timers_update_migration();

        mutex_unlock(&timer_keys_mutex);

        return err;
}

/*
 * sysctl table with the single "timer_migration" entry. Writes go through
 * timer_migration_handler() and are bounded to the range 0..1 via
 * extra1/extra2.
 */
static struct ctl_table timer_sysctl[] = {
        {
                .procname        = "timer_migration",
                .data                = &sysctl_timer_migration,
                .maxlen                = sizeof(unsigned int),
                .mode                = 0644,
                .proc_handler        = timer_migration_handler,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
};

/*
 * Register timer_sysctl below "kernel" at device_initcall time. The
 * result of register_sysctl() is not checked; 0 is returned regardless.
 */
static int __init timer_sysctl_init(void)
{
        register_sysctl("kernel", timer_sysctl);
        return 0;
}
device_initcall(timer_sysctl_init);
#endif /* CONFIG_SYSCTL */
#else /* CONFIG_SMP */
/* !CONFIG_SMP: there is no migration static key to update. */
static inline void timers_update_migration(void) { }
#endif /* !CONFIG_SMP */

/*
 * Work function behind timer_update_work: under timer_keys_mutex,
 * re-evaluate the migration key and switch on timers_nohz_active.
 * @work is not used.
 */
static void timer_update_keys(struct work_struct *work)
{
        mutex_lock(&timer_keys_mutex);
        timers_update_migration();
        static_branch_enable(&timers_nohz_active);
        mutex_unlock(&timer_keys_mutex);
}

/*
 * Request an update of the timer static keys. The actual update is
 * deferred to timer_update_keys() via the timer_update_work work item.
 */
void timers_update_nohz(void)
{
        schedule_work(&timer_update_work);
}

/* Test the timers_nohz_active static key; the branch is hinted as unlikely. */
static inline bool is_timers_nohz_active(void)
{
        return static_branch_unlikely(&timers_nohz_active);
}
#else
/* !CONFIG_NO_HZ_COMMON: NOHZ can never be active. */
static inline bool is_timers_nohz_active(void) { return false; }
#endif /* NO_HZ_COMMON */

/*
 * Common worker for the round_jiffies*() family.
 *
 * @j:        absolute time in jiffies to round
 * @cpu:      CPU number used to skew the rounding point
 * @force_up: if true, never round down
 *
 * Returns the rounded value, or @j unchanged when the rounded value would
 * no longer lie in the future.
 */
static unsigned long round_jiffies_common(unsigned long j, int cpu,
                bool force_up)
{
        const unsigned long unrounded = j;
        int rem;

        /*
         * Spread the CPUs out: every CPU gets its own rounding point, 3
         * jiffies apart from its neighbour, so that not all of them fire
         * their timers at once and fight over the same locks or
         * cachelines. (The value 3 was inherited from mm/ code which
         * already did this.) The skew is added here and removed again
         * after rounding.
         */
        j += cpu * 3;

        /* Distance past the last whole second; strip it unconditionally. */
        rem = j % HZ;
        j -= rem;

        /*
         * A target which is only slightly past a whole second (timer irq
         * delays, long irq off times etc.) is left on that second, with
         * 1/4th second as an extreme upper bound for "slightly". All other
         * targets, and every target when @force_up is set, move on to the
         * next whole second.
         */
        if (force_up || rem >= HZ/4)
                j += HZ;

        /* Undo the per-CPU skew. */
        j -= cpu * 3;

        /* Only hand out the rounded value if it is still in the future. */
        if (time_is_after_jiffies(j))
                return j;

        return unrounded;
}

/**
 * __round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The exact rounding is skewed for each processor to avoid all
 * processors firing at the exact same time, which could lead
 * to lock contention or spurious cache line bouncing.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long __round_jiffies(unsigned long j, int cpu)
{
        /* force_up == false: rounding down is permitted */
        return round_jiffies_common(j, cpu, false);
}
EXPORT_SYMBOL_GPL(__round_jiffies);

/**
 * __round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The exact rounding is skewed for each processor to avoid all
 * processors firing at the exact same time, which could lead
 * to lock contention or spurious cache line bouncing.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
        /*
         * Sample jiffies exactly once. The same snapshot converts the delta
         * to an absolute time and back, so a tick that happens while we run
         * cannot skew the result.
         */
        const unsigned long now = jiffies;
        unsigned long rounded;

        rounded = round_jiffies_common(now + j, cpu, false);

        return rounded - now;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);

/**
 * round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long round_jiffies(unsigned long j)
{
        /* The rounding skew is taken from the CPU we are running on. */
        const int cpu = raw_smp_processor_id();

        return round_jiffies_common(j, cpu, false);
}
EXPORT_SYMBOL_GPL(round_jiffies);

/**
 * round_jiffies_relative - function to round jiffies to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * round_jiffies_relative() rounds a time delta in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The return value is the rounded version of the @j parameter.
 */
unsigned long round_jiffies_relative(unsigned long j)
{
        /* The rounding skew is taken from the CPU we are running on. */
        const int cpu = raw_smp_processor_id();

        return __round_jiffies_relative(j, cpu);
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);

/**
 * __round_jiffies_up - function to round jiffies up to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * This is the same as __round_jiffies() except that it will never
 * round down.  This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long __round_jiffies_up(unsigned long j, int cpu)
{
        /* force_up == true: never round down */
        return round_jiffies_common(j, cpu, true);
}
EXPORT_SYMBOL_GPL(__round_jiffies_up);

/**
 * __round_jiffies_up_relative - function to round jiffies up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * This is the same as __round_jiffies_relative() except that it will never
 * round down.  This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
{
        /*
         * Sample jiffies exactly once. The same snapshot converts the delta
         * to an absolute time and back, so a tick that happens while we run
         * cannot skew the result.
         */
        const unsigned long now = jiffies;
        unsigned long rounded;

        rounded = round_jiffies_common(now + j, cpu, true);

        return rounded - now;
}
EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);

/**
 * round_jiffies_up - function to round jiffies up to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * This is the same as round_jiffies() except that it will never
 * round down.  This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long round_jiffies_up(unsigned long j)
{
        /* The rounding skew is taken from the CPU we are running on. */
        const int cpu = raw_smp_processor_id();

        return round_jiffies_common(j, cpu, true);
}
EXPORT_SYMBOL_GPL(round_jiffies_up);

/**
 * round_jiffies_up_relative - function to round jiffies up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * This is the same as round_jiffies_relative() except that it will never
 * round down.  This is useful for timeouts for which the exact time
 * of firing does not matter too much, as long as they don't fire too
 * early.
 */
unsigned long round_jiffies_up_relative(unsigned long j)
{
        /* The rounding skew is taken from the CPU we are running on. */
        const int cpu = raw_smp_processor_id();

        return __round_jiffies_up_relative(j, cpu);
}
EXPORT_SYMBOL_GPL(round_jiffies_up_relative);


/* Extract the wheel bucket index which is stored in the timer's flags word. */
static inline unsigned int timer_get_idx(struct timer_list *timer)
{
        unsigned int idx_bits = timer->flags & TIMER_ARRAYMASK;

        return idx_bits >> TIMER_ARRAYSHIFT;
}

/*
 * Store wheel bucket index @idx in the timer's flags word. All flag bits
 * outside of TIMER_ARRAYMASK are preserved.
 */
static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
{
        unsigned int other_bits = timer->flags & ~TIMER_ARRAYMASK;

        /* Write the combined word in one assignment. */
        timer->flags = other_bits | (idx << TIMER_ARRAYSHIFT);
}

/*
 * Compute the wheel array index for expiry time @expires on level @lvl.
 * The expiry time at which the selected bucket fires is stored in
 * *@bucket_expiry.
 */
static inline unsigned calc_index(unsigned long expires, unsigned lvl,
                                  unsigned long *bucket_expiry)
{
        const unsigned int shift = LVL_SHIFT(lvl);
        unsigned long slot;

        /*
         * A timer must never fire early. That could happen because
         * - the timer is armed at the edge of a tick, or
         * - the expiry time gets truncated in the outer wheel levels.
         *
         * Always moving on to the next slot of this level's granularity
         * rules both out.
         */
        slot = (expires >> shift) + 1;

        *bucket_expiry = slot << shift;

        return LVL_OFFS(lvl) + (slot & LVL_MASK);
}

/*
 * Select the wheel level for a timer from the distance between @expires
 * and the base clock @clk, and return the array index of its bucket. The
 * effective expiry time of that bucket is stored in *@bucket_expiry.
 */
static int calc_wheel_index(unsigned long expires, unsigned long clk,
                            unsigned long *bucket_expiry)
{
        const unsigned long delta = expires - clk;

        /* Try the levels from finest to coarsest; the first fit wins. */
        if (delta < LVL_START(1))
                return calc_index(expires, 0, bucket_expiry);
        if (delta < LVL_START(2))
                return calc_index(expires, 1, bucket_expiry);
        if (delta < LVL_START(3))
                return calc_index(expires, 2, bucket_expiry);
        if (delta < LVL_START(4))
                return calc_index(expires, 3, bucket_expiry);
        if (delta < LVL_START(5))
                return calc_index(expires, 4, bucket_expiry);
        if (delta < LVL_START(6))
                return calc_index(expires, 5, bucket_expiry);
        if (delta < LVL_START(7))
                return calc_index(expires, 6, bucket_expiry);
        /* Level 7 is an inner level only when a ninth level exists. */
        if (LVL_DEPTH > 8 && delta < LVL_START(8))
                return calc_index(expires, 7, bucket_expiry);

        /* @expires is before @clk: use the level 0 bucket of @clk itself. */
        if ((long) delta < 0) {
                *bucket_expiry = clk;
                return clk & LVL_MASK;
        }

        /*
         * Obscenely large timeouts are forced to expire at the capacity
         * limit of the wheel.
         */
        if (delta >= WHEEL_TIMEOUT_CUTOFF)
                expires = clk + WHEEL_TIMEOUT_MAX;

        return calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
}

/*
 * Wake up the CPU owning @base if the base is idle and @timer must be
 * noticed by it. Nothing is done while NOHZ is inactive or for deferrable
 * timers.
 */
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
        if (!is_timers_nohz_active())
                return;

        /*
         * Deferrable timers do not keep a CPU out of dynticks and are not
         * considered on the idle/nohz_full path. An IPI for a newly
         * enqueued deferrable timer would wake the remote CPU, but the
         * deferrable base would not be looked at. So skip the remote IPI
         * for deferrable timers completely.
         */
        if (timer->flags & TIMER_DEFERRABLE)
                return;

        /*
         * The remote CPU may need an IPI if the base is idle and the
         * timer is pinned. A non pinned timer is only queued on the remote
         * CPU when it was running during queueing, in which case the
         * remote CPU handles everything anyway. A CPU which is on its way
         * to idle cannot set base->is_idle because we hold the base lock.
         */
        if (!base->is_idle)
                return;

        WARN_ON_ONCE(!(timer->flags & TIMER_PINNED ||
                       tick_nohz_full_cpu(base->cpu)));
        wake_up_nohz_cpu(base->cpu);
}

/*
 * Put @timer into bucket @idx of @base: link it into the bucket list, set
 * the bucket's bit in the pending bitmap and remember the index in the
 * timer flags. If the timer becomes the first one to expire, update the
 * base and wake up the target CPU if needed.
 */
static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
                          unsigned int idx, unsigned long bucket_expiry)
{
        struct hlist_head *bucket = &base->vectors[idx];

        hlist_add_head(&timer->entry, bucket);
        __set_bit(idx, base->pending_map);
        timer_set_idx(timer, idx);

        trace_timer_start(timer, bucket_expiry);

        /*
         * Is this the new first expiring timer? The comparison has to use
         * the effective expiry time (bucket_expiry), not timer->expires.
         */
        if (!time_before(bucket_expiry, base->next_expiry))
                return;

        /*
         * Record the new next expiry time and kick the CPU so that it can
         * reevaluate the wheel.
         */
        base->next_expiry = bucket_expiry;
        base->timers_pending = true;
        base->next_expiry_recalc = false;
        trigger_dyntick_cpu(base, timer);
}

/*
 * Add @timer to @base: work out the bucket from timer->expires relative
 * to the base clock, then enqueue the timer there.
 */
static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
{
        unsigned long effective_expiry;
        unsigned int bucket;

        bucket = calc_wheel_index(timer->expires, base->clk, &effective_expiry);
        enqueue_timer(base, timer, bucket, effective_expiry);
}

#ifdef CONFIG_DEBUG_OBJECTS_TIMERS

/* Forward declaration; the descriptor is defined after the fixup callbacks. */
static const struct debug_obj_descr timer_debug_descr;

/*
 * Maps a generic timer callback to the place where a more telling function
 * pointer is stored.
 * @function: timer callback to match
 * @offset:   byte offset from the timer_list to that function pointer
 *            inside the object which contains the timer
 */
struct timer_hint {
        void        (*function)(struct timer_list *t);
        long        offset;
};

/*
 * Build a timer_hint for callback @fn: the offset is the distance between
 * member @hintfn and the timer member @timr of type @container.
 */
#define TIMER_HINT(fn, container, timr, hintfn)                        \
        {                                                        \
                .function = fn,                                        \
                .offset          = offsetof(container, hintfn) -        \
                            offsetof(container, timr)                \
        }

/* Hints for the delayed work timers: report the work function instead. */
static const struct timer_hint timer_hints[] = {
        TIMER_HINT(delayed_work_timer_fn,
                   struct delayed_work, timer, work.func),
        TIMER_HINT(kthread_delayed_work_timer_fn,
                   struct kthread_delayed_work, timer, work.func),
};

static void *timer_debug_hint(void *addr)
{
        struct timer_list *timer = addr;
        int i;

        for (i = 0; i < ARRAY_SIZE(timer_hints); i++) {
                if (timer_hints[i].function == timer->function) {
                        void (**fn)(void) = addr + timer_hints[i].offset;

                        return *fn;
                }
        }

        return timer->function;
}

static bool timer_is_static_object(void *addr)
{
        struct timer_list *timer = addr;

        return (timer->entry.pprev == NULL &&
                timer->entry.next == TIMER_ENTRY_STATIC);
}

/*
 * debugobjects fixup for initialization, called when an active object is
 * initialized. Returns true if the problem was fixed up.
 */
static bool timer_fixup_init(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        /* Take the timer down first, then redo the object initialization. */
        del_timer_sync(timer);
        debug_object_init(timer, &timer_debug_descr);

        return true;
}

/*
 * Stub timer callback for improperly used timers. It is installed by the
 * fixup handlers in this file and only emits a warning when it runs.
 */
static void stub_timer(struct timer_list *unused)
{
        WARN_ON(1);
}

/*
 * debugobjects fixup for activation, called when
 * - an active object is activated, or
 * - an unknown non-static object is activated.
 *
 * Returns true if the problem was fixed up.
 */
static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state == ODEBUG_STATE_NOTAVAILABLE) {
                /* Unknown object: give it a callback which only warns. */
                timer_setup(timer, stub_timer, 0);
                return true;
        }

        /* Activating an already active timer: warn, but do not fix up. */
        if (state == ODEBUG_STATE_ACTIVE)
                WARN_ON(1);

        return false;
}

/*
 * debugobjects fixup for free, called when an active object is freed.
 * Returns true if the problem was fixed up.
 */
static bool timer_fixup_free(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        /* Take the timer down first, then redo the free of the object. */
        del_timer_sync(timer);
        debug_object_free(timer, &timer_debug_descr);

        return true;
}

/*
 * debugobjects fixup for the init assertion, called when an untracked or
 * uninitialized object is found. Returns true if the problem was fixed up.
 */
static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_NOTAVAILABLE)
                return false;

        /* Set the timer up with a callback which only warns. */
        timer_setup(timer, stub_timer, 0);

        return true;
}

/* debugobjects descriptor which wires up the timer_list callbacks above. */
static const struct debug_obj_descr timer_debug_descr = {
        .name                        = "timer_list",
        .debug_hint                = timer_debug_hint,
        .is_static_object        = timer_is_static_object,
        .fixup_init                = timer_fixup_init,
        .fixup_activate                = timer_fixup_activate,
        .fixup_free                = timer_fixup_free,
        .fixup_assert_init        = timer_fixup_assert_init,
};

/* Report initialization of @timer to debugobjects. */
static inline void debug_timer_init(struct timer_list *timer)
{
        debug_object_init(timer, &timer_debug_descr);
}

/* Report activation of @timer to debugobjects. */
static inline void debug_timer_activate(struct timer_list *timer)
{
        debug_object_activate(timer, &timer_debug_descr);
}

/* Report deactivation of @timer to debugobjects. */
static inline void debug_timer_deactivate(struct timer_list *timer)
{
        debug_object_deactivate(timer, &timer_debug_descr);
}

/* Have debugobjects assert that @timer has been initialized. */
static inline void debug_timer_assert_init(struct timer_list *timer)
{
        debug_object_assert_init(timer, &timer_debug_descr);
}

/* Forward declaration: init_timer_on_stack_key() below needs it. */
static void do_init_timer(struct timer_list *timer,
                          void (*func)(struct timer_list *),
                          unsigned int flags,
                          const char *name, struct lock_class_key *key);

/*
 * Variant of init_timer_key() which registers the timer with debugobjects
 * via debug_object_init_on_stack() before doing the common initialization.
 * The arguments are passed on to do_init_timer() unchanged.
 */
void init_timer_on_stack_key(struct timer_list *timer,
                             void (*func)(struct timer_list *),
                             unsigned int flags,
                             const char *name, struct lock_class_key *key)
{
        debug_object_init_on_stack(timer, &timer_debug_descr);
        do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL_GPL(init_timer_on_stack_key);

/* Remove @timer from debugobjects tracking; nothing else is touched here. */
void destroy_timer_on_stack(struct timer_list *timer)
{
        debug_object_free(timer, &timer_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_timer_on_stack);

#else
/* !CONFIG_DEBUG_OBJECTS_TIMERS: the debugobjects hooks are no-ops. */
static inline void debug_timer_init(struct timer_list *timer) { }
static inline void debug_timer_activate(struct timer_list *timer) { }
static inline void debug_timer_deactivate(struct timer_list *timer) { }
static inline void debug_timer_assert_init(struct timer_list *timer) { }
#endif

/* Timer initialization hook: notify debugobjects, then emit the tracepoint. */
static inline void debug_init(struct timer_list *timer)
{
        debug_timer_init(timer);
        trace_timer_init(timer);
}

/* Timer deactivation hook: notify debugobjects, then emit the cancel tracepoint. */
static inline void debug_deactivate(struct timer_list *timer)
{
        debug_timer_deactivate(timer);
        trace_timer_cancel(timer);
}

/* Assert through debugobjects that @timer was initialized; no tracepoint. */
static inline void debug_assert_init(struct timer_list *timer)
{
        debug_timer_assert_init(timer);
}

/*
 * Field setup shared by init_timer_key() and init_timer_on_stack_key().
 *
 * Clears entry.pprev, installs @func as the callback, stores @flags
 * combined with the id of the executing CPU and initializes the lockdep
 * map from @name and @key. Bits in @flags which are not part of
 * TIMER_INIT_FLAGS trigger a one-time warning and are dropped.
 */
static void do_init_timer(struct timer_list *timer,
                          void (*func)(struct timer_list *),
                          unsigned int flags,
                          const char *name, struct lock_class_key *key)
{
        timer->entry.pprev = NULL;
        timer->function = func;

        /*
         * Complain once about unsupported bits. Masking is a NOP when
         * there are none, so it can be done unconditionally.
         */
        WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS);
        flags &= TIMER_INIT_FLAGS;

        timer->flags = flags | raw_smp_processor_id();

        lockdep_init_map(&timer->lockdep_map, name, key, 0);
}

/**
 * init_timer_key - initialize a timer
 * @timer: the timer to be initialized
 * @func: timer callback function
 * @flags: timer flags. Bits outside of TIMER_INIT_FLAGS cause a one-time
 *         warning and are dropped by do_init_timer()
 * @name: name of the timer
 * @key: lockdep class key of the fake lock used for tracking timer
 *       sync lock dependencies
 *
 * init_timer_key() must be done to a timer prior to calling *any* of the
 * other timer functions.
 *
 * The debug and trace hooks in debug_init() run first, then the timer
 * fields are set up by do_init_timer().
 */
void init_timer_key(struct timer_list *timer,
                    void (*func)(struct timer_list *), unsigned int flags,
                    const char *name, struct lock_class_key *key)
{
        debug_init(timer);
        do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL(init_timer_key);

/*
 * Unlink @timer from the hlist it is queued on.
 *
 * The debug/trace deactivation hooks run first. After the unlink
 * entry->next is always poisoned with LIST_POISON2. entry->pprev is only
 * reset to NULL when @clear_pending is true; otherwise it keeps its old
 * value (__mod_timer() passes false because it requeues the timer right
 * away).
 */
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
        struct hlist_node *entry = &timer->entry;

        debug_deactivate(timer);

        __hlist_del(entry);
        if (clear_pending)
                entry->pprev = NULL;
        entry->next = LIST_POISON2;
}

/*
 * Dequeue @timer from @base if it is currently pending.
 *
 * When @timer is the sole entry of its wheel bucket, the bucket's bit in
 * base->pending_map is cleared and base->next_expiry_recalc is set before
 * the timer is unlinked. @clear_pending is handed on to detach_timer().
 *
 * Returns 1 if the timer was pending and has been detached, 0 otherwise.
 */
static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
                             bool clear_pending)
{
        unsigned int bucket = timer_get_idx(timer);
        struct hlist_head *head;

        if (!timer_pending(timer))
                return 0;

        head = &base->vectors[bucket];
        if (hlist_is_singular_node(&timer->entry, head)) {
                /* The bucket becomes empty: update the bookkeeping */
                __clear_bit(bucket, base->pending_map);
                base->next_expiry_recalc = true;
        }

        detach_timer(timer, clear_pending);

        return 1;
}

/*
 * Look up the timer base of @cpu which matches the timer flags @tflags.
 *
 * Pinned timers use BASE_LOCAL, all others BASE_GLOBAL. If the timer is
 * deferrable and NO_HZ_COMMON is enabled, the deferrable base BASE_DEF
 * takes precedence over both.
 */
static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
{
        int index;

        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
                index = BASE_DEF;
        else if (tflags & TIMER_PINNED)
                index = BASE_LOCAL;
        else
                index = BASE_GLOBAL;

        return per_cpu_ptr(&timer_bases[index], cpu);
}

/*
 * Look up the timer base of the executing CPU which matches the timer
 * flags @tflags.
 *
 * Pinned timers use BASE_LOCAL, all others BASE_GLOBAL. If the timer is
 * deferrable and NO_HZ_COMMON is enabled, the deferrable base BASE_DEF
 * takes precedence over both.
 */
static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
{
        int index;

        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
                index = BASE_DEF;
        else if (tflags & TIMER_PINNED)
                index = BASE_LOCAL;
        else
                index = BASE_GLOBAL;

        return this_cpu_ptr(&timer_bases[index]);
}

/*
 * Resolve the timer base purely from the timer flags @tflags: the CPU
 * number is taken from the TIMER_CPUMASK bits of @tflags and handed to
 * get_timer_cpu_base() along with the flags.
 */
static inline struct timer_base *get_timer_base(u32 tflags)
{
        return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
}

/*
 * Advance base->clk towards @basej without ever moving it backwards and
 * without skipping over the next pending expiry.
 */
static inline void __forward_timer_base(struct timer_base *base,
                                        unsigned long basej)
{
        /*
         * Forwarding is only possible when @basej is past base->clk,
         * otherwise base->clk might be rewound.
         */
        if (time_before_eq(basej, base->clk))
                return;

        /* Next expiry is after @basej: fast forward all the way to @basej */
        if (time_after(base->next_expiry, basej)) {
                base->clk = basej;
                return;
        }

        /*
         * Otherwise forward to the next expiry value only. A next expiry
         * which lies before the current clock is unexpected: warn once
         * and leave the clock alone.
         */
        if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
                return;

        base->clk = base->next_expiry;
}

/*
 * Forward @base using a single READ_ONCE() snapshot of jiffies as the
 * target time. See __forward_timer_base() for the forwarding rules.
 */
static inline void forward_timer_base(struct timer_base *base)
{
        __forward_timer_base(base, READ_ONCE(jiffies));
}

/*
 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
 * that all timers which are tied to this base are locked, and the base itself
 * is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found in the base->vectors array.
 *
 * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
 * to wait until the migration is done.
 *
 * @timer: the timer whose base is to be locked
 * @flags: storage for the interrupt state saved by raw_spin_lock_irqsave()
 *
 * Returns the base derived from the timer flags with base->lock held. The
 * caller releases it with raw_spin_unlock_irqrestore(&base->lock, *flags).
 */
static struct timer_base *lock_timer_base(struct timer_list *timer,
                                          unsigned long *flags)
        __acquires(timer->base->lock)
{
        for (;;) {
                struct timer_base *base;
                u32 tf;

                /*
                 * We need to use READ_ONCE() here, otherwise the compiler
                 * might re-read @tf between the check for TIMER_MIGRATING
                 * and spin_lock().
                 */
                tf = READ_ONCE(timer->flags);

                if (!(tf & TIMER_MIGRATING)) {
                        base = get_timer_base(tf);
                        raw_spin_lock_irqsave(&base->lock, *flags);
                        /*
                         * The flags may have changed before the lock was
                         * taken. Only an unchanged value proves that the
                         * locked base is still the base of this timer.
                         */
                        if (timer->flags == tf)
                                return base;
                        raw_spin_unlock_irqrestore(&base->lock, *flags);
                }
                /* Migration in progress or lost the race: try again */
                cpu_relax();
        }
}

/*
 * Option bits for the @options argument of __mod_timer():
 *
 * MOD_TIMER_PENDING_ONLY - only modify a timer which is pending; an
 *                          inactive timer is left alone
 * MOD_TIMER_REDUCE       - only modify a pending timer if that makes it
 *                          expire earlier
 * MOD_TIMER_NOTPENDING   - skip the fast path for pending timers; used by
 *                          the add_timer*() variants which have already
 *                          checked that the timer is not pending
 */
#define MOD_TIMER_PENDING_ONLY                0x01
#define MOD_TIMER_REDUCE                0x02
#define MOD_TIMER_NOTPENDING                0x04

/*
 * __mod_timer - common implementation behind mod_timer(), mod_timer_pending(),
 *               timer_reduce() and the add_timer*() variants
 * @timer:   the timer to be (re)armed
 * @expires: new absolute timeout in jiffies
 * @options: combination of the MOD_TIMER_* bits defined above
 *
 * Returns 1 if the timer was pending (it was either requeued, updated in
 * place or deliberately left untouched), 0 if it was not pending. 0 is
 * also returned when the timer has been shut down (timer->function == NULL)
 * and the operation was therefore discarded.
 */
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
{
        unsigned long clk = 0, flags, bucket_expiry;
        struct timer_base *base, *new_base;
        unsigned int idx = UINT_MAX;
        int ret = 0;

        debug_assert_init(timer);

        /*
         * This is a common optimization triggered by the networking code - if
         * the timer is re-modified to have the same timeout or ends up in the
         * same array bucket then just return:
         */
        if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
                /*
                 * The downside of this optimization is that it can result in
                 * larger granularity than you would get from adding a new
                 * timer with this expiry.
                 */
                long diff = timer->expires - expires;

                if (!diff)
                        return 1;
                if (options & MOD_TIMER_REDUCE && diff <= 0)
                        return 1;

                /*
                 * We lock timer base and calculate the bucket index right
                 * here. If the timer ends up in the same bucket, then we
                 * just update the expiry time and avoid the whole
                 * dequeue/enqueue dance.
                 */
                base = lock_timer_base(timer, &flags);
                /*
                 * Has @timer been shutdown? This needs to be evaluated
                 * while holding base lock to prevent a race against the
                 * shutdown code.
                 */
                if (!timer->function)
                        goto out_unlock;

                forward_timer_base(base);

                /*
                 * Recheck the reduce condition under the lock; the checks
                 * above were done locklessly.
                 */
                if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
                    time_before_eq(timer->expires, expires)) {
                        ret = 1;
                        goto out_unlock;
                }

                clk = base->clk;
                idx = calc_wheel_index(expires, clk, &bucket_expiry);

                /*
                 * Retrieve and compare the array index of the pending
                 * timer. If it matches set the expiry to the new value so a
                 * subsequent call will exit in the expires check above.
                 */
                if (idx == timer_get_idx(timer)) {
                        if (!(options & MOD_TIMER_REDUCE))
                                timer->expires = expires;
                        else if (time_after(timer->expires, expires))
                                timer->expires = expires;
                        ret = 1;
                        goto out_unlock;
                }
        } else {
                base = lock_timer_base(timer, &flags);
                /*
                 * Has @timer been shutdown? This needs to be evaluated
                 * while holding base lock to prevent a race against the
                 * shutdown code.
                 */
                if (!timer->function)
                        goto out_unlock;

                forward_timer_base(base);
        }

        /*
         * Dequeue without clearing the pending state (clear_pending ==
         * false): the timer is requeued below unless this bails out.
         */
        ret = detach_if_pending(timer, base, false);
        if (!ret && (options & MOD_TIMER_PENDING_ONLY))
                goto out_unlock;

        new_base = get_timer_this_cpu_base(timer->flags);

        if (base != new_base) {
                /*
                 * We are trying to schedule the timer on the new base.
                 * However we can't change timer's base while it is running,
                 * otherwise timer_delete_sync() can't detect that the timer's
                 * handler has not yet finished. This also guarantees that the
                 * timer is serialized wrt itself.
                 */
                if (likely(base->running_timer != timer)) {
                        /* See the comment in lock_timer_base() */
                        timer->flags |= TIMER_MIGRATING;

                        raw_spin_unlock(&base->lock);
                        base = new_base;
                        raw_spin_lock(&base->lock);
                        /*
                         * Store the new base's CPU; this write also ends
                         * the TIMER_MIGRATING state set above.
                         */
                        WRITE_ONCE(timer->flags,
                                   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
                        forward_timer_base(base);
                }
        }

        debug_timer_activate(timer);

        timer->expires = expires;
        /*
         * If 'idx' was calculated above and the base time did not advance
         * between calculating 'idx' and possibly switching the base, only
         * enqueue_timer() is required. Otherwise we need to (re)calculate
         * the wheel index via internal_add_timer().
         */
        if (idx != UINT_MAX && clk == base->clk)
                enqueue_timer(base, timer, idx, bucket_expiry);
        else
                internal_add_timer(base, timer);

out_unlock:
        raw_spin_unlock_irqrestore(&base->lock, flags);

        return ret;
}

/**
 * mod_timer_pending - Modify a pending timer's timeout
 * @timer:        The pending timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * mod_timer_pending() is the same for pending timers as mod_timer(), but
 * will not activate inactive timers. It is implemented as __mod_timer()
 * with the MOD_TIMER_PENDING_ONLY option.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * Return:
 * * %0 - The timer was inactive and not modified or was in
 *          shutdown state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires
 */
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
        return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
}
EXPORT_SYMBOL(mod_timer_pending);

/**
 * mod_timer - Modify a timer's timeout
 * @timer:        The timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * mod_timer(timer, expires) is equivalent to:
 *
 *     timer_delete(timer); timer->expires = expires; add_timer(timer);
 *
 * mod_timer() is more efficient than the above open coded sequence. In
 * case that the timer is inactive, the timer_delete() part is a NOP. The
 * timer is in any case activated with the new expiry time @expires.
 *
 * Note that if there are multiple unserialized concurrent users of the
 * same timer, then mod_timer() is the only safe way to modify the timeout,
 * since add_timer() cannot modify an already running timer.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded. In this case the return value is 0 and meaningless.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *          state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *          the timer was active and not modified because @expires did
 *          not change the effective expiry time
 */
int mod_timer(struct timer_list *timer, unsigned long expires)
{
        /* No MOD_TIMER_* option bits: plain (re)arm semantics */
        return __mod_timer(timer, expires, 0);
}
EXPORT_SYMBOL(mod_timer);

/**
 * timer_reduce - Modify a timer's timeout if it would reduce the timeout
 * @timer:        The timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * timer_reduce() is very similar to mod_timer(), except that it will only
 * modify an enqueued timer if that would reduce the expiration time. If
 * @timer is not enqueued it starts the timer. It is implemented as
 * __mod_timer() with the MOD_TIMER_REDUCE option.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *          state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *          the timer was active and not modified because @expires
 *          did not change the effective expiry time such that the
 *          timer would expire earlier than already scheduled
 */
int timer_reduce(struct timer_list *timer, unsigned long expires)
{
        return __mod_timer(timer, expires, MOD_TIMER_REDUCE);
}
EXPORT_SYMBOL(timer_reduce);

/**
 * add_timer - Start a timer
 * @timer:        The timer to be started
 *
 * Arm @timer so that it expires at @timer->expires, which is an absolute
 * time measured in 'jiffies'. On expiry timer->function(timer) is invoked
 * from soft interrupt context.
 *
 * Both @timer->expires and @timer->function have to be set up before this
 * function is called.
 *
 * A timer with @timer->function == NULL is not started; the request is
 * silently discarded.
 *
 * A timer whose @timer->expires already lies in the past is queued to
 * expire at the next timer tick.
 *
 * Only inactive timers may be passed in. An attempt to add an active
 * timer is rejected with a warning.
 */
void add_timer(struct timer_list *timer)
{
        if (!WARN_ON_ONCE(timer_pending(timer)))
                __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer);

/**
 * add_timer_local() - Start a timer on the local CPU
 * @timer:        The timer to be started
 *
 * Behaves like add_timer(), but sets the TIMER_PINNED flag of @timer
 * before the timer is started.
 *
 * See add_timer() for further details.
 */
void add_timer_local(struct timer_list *timer)
{
        if (!WARN_ON_ONCE(timer_pending(timer))) {
                timer->flags |= TIMER_PINNED;
                __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
        }
}
EXPORT_SYMBOL(add_timer_local);

/**
 * add_timer_global() - Start a timer without TIMER_PINNED flag set
 * @timer:        The timer to be started
 *
 * Behaves like add_timer(), but clears the TIMER_PINNED flag of @timer
 * before the timer is started.
 *
 * See add_timer() for further details.
 */
void add_timer_global(struct timer_list *timer)
{
        if (!WARN_ON_ONCE(timer_pending(timer))) {
                timer->flags &= ~TIMER_PINNED;
                __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
        }
}
EXPORT_SYMBOL(add_timer_global);

/**
 * add_timer_on - Start a timer on a particular CPU
 * @timer:        The timer to be started
 * @cpu:        The CPU to start it on
 *
 * Same as add_timer() except that it starts the timer on the given CPU and
 * the TIMER_PINNED flag is set. When timer shouldn't be a pinned timer in
 * the next round, add_timer_global() should be used instead as it unsets
 * the TIMER_PINNED flag.
 *
 * If @timer->function == NULL the timer is not started.
 *
 * See add_timer() for further details.
 */
void add_timer_on(struct timer_list *timer, int cpu)
{
        struct timer_base *new_base, *base;
        unsigned long flags;

        debug_assert_init(timer);

        if (WARN_ON_ONCE(timer_pending(timer)))
                return;

        /* Make sure timer flags have TIMER_PINNED flag set */
        timer->flags |= TIMER_PINNED;

        new_base = get_timer_cpu_base(timer->flags, cpu);

        /*
         * If @timer was on a different CPU, it should be migrated with the
         * old base locked to prevent other operations proceeding with the
         * wrong base locked.  See lock_timer_base().
         */
        base = lock_timer_base(timer, &flags);
        /*
         * Has @timer been shutdown? This needs to be evaluated while
         * holding base lock to prevent a race against the shutdown code.
         */
        if (!timer->function)
                goto out_unlock;

        if (base != new_base) {
                timer->flags |= TIMER_MIGRATING;

                raw_spin_unlock(&base->lock);
                base = new_base;
                raw_spin_lock(&base->lock);
                /*
                 * Store the target CPU; this write also ends the
                 * TIMER_MIGRATING state set above.
                 */
                WRITE_ONCE(timer->flags,
                           (timer->flags & ~TIMER_BASEMASK) | cpu);
        }
        forward_timer_base(base);

        debug_timer_activate(timer);
        internal_add_timer(base, timer);
out_unlock:
        raw_spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);

/**
 * __timer_delete - Internal function: Deactivate a timer
 * @timer:        The timer to be deactivated
 * @shutdown:        When true, the timer is about to be shut down for good
 *
 * With @shutdown set, @timer->function is cleared to NULL while the timer
 * base lock is held, which prevents further rearming of the timer. Any
 * attempt to rearm @timer after this function has returned is silently
 * ignored in that case.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
static int __timer_delete(struct timer_list *timer, bool shutdown)
{
        struct timer_base *base;
        unsigned long irqflags;
        int was_pending;

        debug_assert_init(timer);

        /*
         * Without @shutdown a timer which is not pending needs no work
         * and the lock can be avoided.
         *
         * With @shutdown the lock has to be taken regardless of the
         * pending state. Otherwise a concurrent rearm could slip in
         * between the lockless pending check and the lock acquisition.
         * Taking the lock ensures that such a freshly enqueued timer is
         * dequeued again and cannot reach the expiry code with
         * timer->function == NULL.
         *
         * If timer->function is executing right now, this also ensures
         * that the callback cannot requeue the timer.
         */
        if (!timer_pending(timer) && !shutdown)
                return 0;

        base = lock_timer_base(timer, &irqflags);
        was_pending = detach_if_pending(timer, base, true);
        if (shutdown)
                timer->function = NULL;
        raw_spin_unlock_irqrestore(&base->lock, irqflags);

        return was_pending;
}

/**
 * timer_delete - Deactivate a timer
 * @timer:        The timer to be deactivated
 *
 * The function only deactivates a pending timer, but contrary to
 * timer_delete_sync() it does not take into account whether the timer's
 * callback function is concurrently executed on a different CPU or not.
 * Nor does it prevent rearming of the timer.  If @timer can be rearmed
 * concurrently then the return value of this function is meaningless.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
int timer_delete(struct timer_list *timer)
{
        return __timer_delete(timer, false);
}
EXPORT_SYMBOL(timer_delete);

/**
 * timer_shutdown - Deactivate a timer and prevent rearming
 * @timer:        The timer to be deactivated
 *
 * The function does not wait for an eventually running timer callback on a
 * different CPU but it prevents rearming of the timer. Any attempt to arm
 * @timer after this function returns will be silently ignored.
 *
 * Rearming is prevented by __timer_delete() setting @timer->function to
 * NULL under the timer base lock.
 *
 * This function is useful for teardown code and should only be used when
 * timer_shutdown_sync() cannot be invoked due to locking or context constraints.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending
 */
int timer_shutdown(struct timer_list *timer)
{
        return __timer_delete(timer, true);
}
EXPORT_SYMBOL_GPL(timer_shutdown);

/**
 * __try_to_del_timer_sync - Internal function: Try to deactivate a timer
 * @timer:        Timer to deactivate
 * @shutdown:        When true, the timer is about to be shut down for good
 *
 * With @shutdown set, @timer->function is cleared to NULL while the timer
 * base lock is held, which prevents further rearming of the timer. Any
 * attempt to rearm @timer after this function has returned is silently
 * ignored.
 *
 * Without @shutdown there is no guarantee that the timer is not rearmed
 * right after the base lock has been dropped. If that matters, the
 * calling code has to prevent it.
 *
 * Return:
 * * %0  - The timer was not pending
 * * %1  - The timer was pending and deactivated
 * * %-1 - The timer callback function is running on a different CPU
 */
static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
{
        struct timer_base *base;
        unsigned long irqflags;
        int ret;

        debug_assert_init(timer);

        base = lock_timer_base(timer, &irqflags);

        /* The timer is not dequeued while its callback is being executed */
        if (base->running_timer == timer)
                ret = -1;
        else
                ret = detach_if_pending(timer, base, true);

        if (shutdown)
                timer->function = NULL;

        raw_spin_unlock_irqrestore(&base->lock, irqflags);

        return ret;
}

/**
 * try_to_del_timer_sync - Try to deactivate a timer
 * @timer:        Timer to deactivate
 *
 * This function tries to deactivate a timer. On success the timer is not
 * queued and the timer callback function is not running on any CPU.
 *
 * This function does not guarantee that the timer cannot be rearmed right
 * after dropping the base lock. That needs to be prevented by the calling
 * code if necessary.
 *
 * This is the non-shutdown variant of __try_to_del_timer_sync(), i.e.
 * @timer->function is left untouched.
 *
 * Return:
 * * %0  - The timer was not pending
 * * %1  - The timer was pending and deactivated
 * * %-1 - The timer callback function is running on a different CPU
 */
int try_to_del_timer_sync(struct timer_list *timer)
{
        return __try_to_del_timer_sync(timer, false);
}
EXPORT_SYMBOL(try_to_del_timer_sync);

#ifdef CONFIG_PREEMPT_RT
/* PREEMPT_RT only: initialize the expiry lock of @base. */
static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
        spin_lock_init(&base->expiry_lock);
}

/*
 * PREEMPT_RT only: acquire the expiry lock of @base. Paired with
 * timer_base_unlock_expiry().
 */
static inline void timer_base_lock_expiry(struct timer_base *base)
{
        spin_lock(&base->expiry_lock);
}

/*
 * PREEMPT_RT only: release the expiry lock of @base taken by
 * timer_base_lock_expiry().
 */
static inline void timer_base_unlock_expiry(struct timer_base *base)
{
        spin_unlock(&base->expiry_lock);
}

/*
 * The counterpart to del_timer_wait_running().
 *
 * If there is a waiter for base->expiry_lock, then it was waiting for the
 * timer callback to finish. Drop expiry_lock and reacquire it. That allows
 * the waiter to acquire the lock and make progress.
 *
 * Invoked by expire_timers() after a non-irqsafe callback has returned,
 * with base->lock held. base->lock is dropped before expiry_lock is cycled
 * and is held again on return.
 */
static void timer_sync_wait_running(struct timer_base *base)
{
        if (atomic_read(&base->timer_waiters)) {
                raw_spin_unlock_irq(&base->lock);
                spin_unlock(&base->expiry_lock);
                spin_lock(&base->expiry_lock);
                raw_spin_lock_irq(&base->lock);
        }
}

/*
 * This function is called on PREEMPT_RT kernels when the fast path
 * deletion of a timer failed because the timer callback function was
 * running.
 *
 * This prevents priority inversion, if the softirq thread on a remote CPU
 * got preempted, and it prevents a livelock when the task which tries to
 * delete a timer preempted the softirq thread running the timer callback
 * function.
 *
 * Nothing is done for timers which have TIMER_MIGRATING or TIMER_IRQSAFE
 * set in their flags.
 */
static void del_timer_wait_running(struct timer_list *timer)
{
        u32 tf;

        tf = READ_ONCE(timer->flags);
        if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) {
                struct timer_base *base = get_timer_base(tf);

                /*
                 * Mark the base as contended and grab the expiry lock,
                 * which is held by the softirq across the timer
                 * callback. Drop the lock immediately so the softirq can
                 * expire the next timer. In theory the timer could already
                 * be running again, but that's more than unlikely and just
                 * causes another wait loop.
                 */
                atomic_inc(&base->timer_waiters);
                spin_lock_bh(&base->expiry_lock);
                atomic_dec(&base->timer_waiters);
                spin_unlock_bh(&base->expiry_lock);
        }
}
#else
/*
 * !PREEMPT_RT: the expiry lock helpers and the wait-for-running-callback
 * helpers are empty.
 */
static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
static inline void timer_base_lock_expiry(struct timer_base *base) { }
static inline void timer_base_unlock_expiry(struct timer_base *base) { }
static inline void timer_sync_wait_running(struct timer_base *base) { }
static inline void del_timer_wait_running(struct timer_list *timer) { }
#endif

/**
 * __timer_delete_sync - Internal function: Deactivate a timer and wait
 *                         for the handler to finish.
 * @timer:        The timer to be deactivated
 * @shutdown:        When true, @timer->function is set to NULL under the
 *                timer base lock, which prevents rearming of @timer
 *
 * Without @shutdown the timer can be rearmed afterwards. If a concurrent
 * rearm is possible, i.e. one that happens after the base lock has been
 * dropped, then the return value is meaningless.
 *
 * With @shutdown the timer cannot be rearmed anymore because
 * @timer->function is cleared under the timer base lock. Attempts to
 * rearm a shutdown timer are silently ignored.
 *
 * A timer which is to be reused after shutdown has to be initialized
 * again.
 *
 * Return:
 * * %0        - The timer was not pending
 * * %1        - The timer was pending and deactivated
 */
static int __timer_delete_sync(struct timer_list *timer, bool shutdown)
{
        int ret;

#ifdef CONFIG_LOCKDEP
        unsigned long flags;

        /*
         * A lockdep backtrace from here points to a violation of the
         * synchronization rules documented at timer_delete_sync().
         */
        local_irq_save(flags);
        lock_map_acquire(&timer->lockdep_map);
        lock_map_release(&timer->lockdep_map);
        local_irq_restore(flags);
#endif
        /*
         * Waiting for a non-irqsafe timer in hardirq context could lead
         * to a deadlock, so warn about it.
         */
        WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE));

        /*
         * On PREEMPT_RT the slowpath in del_timer_wait_running() requires
         * that the caller is able to sleep.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
                lockdep_assert_preemption_enabled();

        /* Retry for as long as the callback is reported as running */
        for (;;) {
                ret = __try_to_del_timer_sync(timer, shutdown);
                if (likely(ret >= 0))
                        break;

                del_timer_wait_running(timer);
                cpu_relax();
        }

        return ret;
}

/**
 * timer_delete_sync - Deactivate a timer and wait for the handler to finish.
 * @timer:        The timer to be deactivated
 *
 * Synchronization rules: Callers must prevent restarting of the timer,
 * otherwise this function is meaningless. It must not be called from
 * interrupt contexts unless the timer is an irqsafe one. The caller must
 * not hold locks which would prevent completion of the timer's callback
 * function. The timer's handler must not call add_timer_on(). Upon exit
 * the timer is not queued and the handler is not running on any CPU.
 *
 * For !irqsafe timers, the caller must not hold locks that are held in
 * interrupt context. Even if the lock has nothing to do with the timer in
 * question.  Here's why::
 *
 *    CPU0                             CPU1
 *    ----                             ----
 *                                     <SOFTIRQ>
 *                                       call_timer_fn();
 *                                       base->running_timer = mytimer;
 *    spin_lock_irq(somelock);
 *                                     <IRQ>
 *                                        spin_lock(somelock);
 *    timer_delete_sync(mytimer);
 *    while (base->running_timer == mytimer);
 *
 * Now timer_delete_sync() will never return and never release somelock.
 * The interrupt on the other CPU is waiting to grab somelock but it has
 * interrupted the softirq that CPU0 is waiting to finish.
 *
 * On PREEMPT_RT kernels the caller of this function must additionally be
 * preemptible for !irqsafe timers, because waiting for a running callback
 * can sleep there.
 *
 * This function cannot guarantee that the timer is not rearmed again by
 * some concurrent or preempting code, right after it dropped the base
 * lock. If there is the possibility of a concurrent rearm then the return
 * value of the function is meaningless.
 *
 * If such a guarantee is needed, e.g. for teardown situations then use
 * timer_shutdown_sync() instead.
 *
 * Return:
 * * %0        - The timer was not pending
 * * %1        - The timer was pending and deactivated
 */
int timer_delete_sync(struct timer_list *timer)
{
        return __timer_delete_sync(timer, false);
}
EXPORT_SYMBOL(timer_delete_sync);

/**
 * timer_shutdown_sync - Shutdown a timer and prevent rearming
 * @timer: The timer to be shutdown
 *
 * When the function returns it is guaranteed that:
 *   - @timer is not queued
 *   - The callback function of @timer is not running
 *   - @timer cannot be enqueued again. Any attempt to rearm
 *     @timer is silently ignored.
 *
 * See timer_delete_sync() for synchronization rules.
 *
 * This function is useful for final teardown of an infrastructure where
 * the timer is subject to a circular dependency problem.
 *
 * A common pattern for this is a timer and a workqueue where the timer can
 * schedule work and work can arm the timer. On shutdown the workqueue must
 * be destroyed and the timer must be prevented from rearming. Unless the
 * code has conditionals like 'if (mything->in_shutdown)' to prevent that
 * there is no way to get this correct with timer_delete_sync().
 *
 * timer_shutdown_sync() solves this problem. The correct ordering of
 * calls in this case is:
 *
 *        timer_shutdown_sync(&mything->timer);
 *        workqueue_destroy(&mything->workqueue);
 *
 * After this 'mything' can be safely freed.
 *
 * This obviously implies that the timer is not required to be functional
 * for the rest of the shutdown operation. If the timer is to be used
 * again afterwards, it has to be initialized again.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending
 */
int timer_shutdown_sync(struct timer_list *timer)
{
        return __timer_delete_sync(timer, true);
}
EXPORT_SYMBOL_GPL(timer_shutdown_sync);

/*
 * call_timer_fn - invoke a timer callback
 * @timer:   the expired timer, passed to @fn as its argument
 * @fn:      the callback to invoke
 * @baseclk: clock value handed to the timer_expire_entry trace event
 *
 * Wraps the call of @fn with the expire entry/exit trace events and the
 * lockdep map acquire/release. After the callback returned, the preempt
 * count is compared against the value sampled before the call. A mismatch
 * is reported once and the old value is restored.
 */
static void call_timer_fn(struct timer_list *timer,
                          void (*fn)(struct timer_list *),
                          unsigned long baseclk)
{
        int count = preempt_count();

#ifdef CONFIG_LOCKDEP
        /*
         * It is permissible to free the timer from inside the
         * function that is called from it, which we need to take into
         * account for lockdep too. To avoid bogus "held lock freed"
         * warnings as well as problems when looking into
         * timer->lockdep_map, make a copy and use that here.
         */
        struct lockdep_map lockdep_map;

        lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
#endif
        /*
         * Couple the lock chain with the lock chain at
         * timer_delete_sync() by acquiring the lock_map around the fn()
         * call here and in timer_delete_sync().
         */
        lock_map_acquire(&lockdep_map);

        trace_timer_expire_entry(timer, baseclk);
        fn(timer);
        trace_timer_expire_exit(timer);

        lock_map_release(&lockdep_map);

        if (count != preempt_count()) {
                WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
                          fn, count, preempt_count());
                /*
                 * Restore the preempt count. That gives us a decent
                 * chance to survive and extract information. If the
                 * callback kept a lock held, bad luck, but not worse
                 * than the BUG() we had.
                 */
                preempt_count_set(count);
        }
}

static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
        /*
         * This value is required only for tracing. base->clk was
         * incremented directly before expire_timers was called. But expiry
         * is related to the old base->clk value.
         */
        unsigned long baseclk = base->clk - 1;

        while (!hlist_empty(head)) {
                struct timer_list *timer;
                void (*fn)(struct timer_list *);

                timer = hlist_entry(head->first, struct timer_list, entry);

                base->running_timer = timer;
                detach_timer(timer, true);

                fn = timer->function;

                if (WARN_ON_ONCE(!fn)) {
                        /* Should never happen. Emphasis on should! */
                        base->running_timer = NULL;
                        continue;
                }

                if (timer->flags & TIMER_IRQSAFE) {
                        raw_spin_unlock(&base->lock);
                        call_timer_fn(timer, fn, baseclk);
                        raw_spin_lock(&base->lock);
                        base->running_timer = NULL;
                } else {
                        raw_spin_unlock_irq(&base->lock);
                        call_timer_fn(timer, fn, baseclk);
                        raw_spin_lock_irq(&base->lock);
                        base->running_timer = NULL;
                        timer_sync_wait_running(base);
                }
        }
}

/*
 * Advance base->clk to base->next_expiry and move every pending bucket which
 * matches that clock value onto consecutive entries of @heads.
 *
 * Returns the number of @heads entries which were filled.
 */
static int collect_expired_timers(struct timer_base *base,
                                  struct hlist_head *heads)
{
        unsigned long clk = base->next_expiry;
        unsigned int idx;
        int lvl, filled = 0;

        base->clk = clk;

        for (lvl = 0; lvl < LVL_DEPTH; lvl++) {
                idx = (clk & LVL_MASK) + lvl * LVL_SIZE;

                if (__test_and_clear_bit(idx, base->pending_map)) {
                        hlist_move_list(base->vectors + idx, heads++);
                        filled++;
                }
                /* Is it time to look at the next level? */
                if (clk & LVL_CLK_MASK)
                        break;
                /* Shift clock for the next level granularity */
                clk >>= LVL_CLK_SHIFT;
        }
        return filled;
}

/*
 * Find the next pending bucket of a level. The search starts at the level
 * start (@offset) + @clk and runs upwards to the end of the level. If nothing
 * is found there, it wraps and searches from the start of the level (@offset)
 * up to @offset + @clk.
 *
 * Returns the distance in buckets from @offset + @clk to the pending bucket,
 * or -1 when no bucket of the level is pending.
 */
static int next_pending_bucket(struct timer_base *base, unsigned offset,
                               unsigned clk)
{
        unsigned first = offset + clk;
        unsigned limit = offset + LVL_SIZE;
        unsigned bit;

        /* Upper part of the level: [first, limit) */
        bit = find_next_bit(base->pending_map, limit, first);
        if (bit < limit)
                return bit - first;

        /* Wrapped part of the level: [offset, first) */
        bit = find_next_bit(base->pending_map, first, offset);
        if (bit >= first)
                return -1;

        return bit + LVL_SIZE - first;
}

/*
 * Search the first expiring timer in the various clock levels. Caller must
 * hold base->lock.
 *
 * Store next expiry time in base->next_expiry, clear
 * base->next_expiry_recalc and update base->timers_pending accordingly.
 */
static void next_expiry_recalc(struct timer_base *base)
{
        unsigned long clk, next, adj;
        unsigned lvl, offset = 0;

        /*
         * Start with the "nothing pending" value. base->timers_pending is
         * derived at the end by comparing @next against this value.
         */
        next = base->clk + NEXT_TIMER_MAX_DELTA;
        clk = base->clk;
        for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
                int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
                unsigned long lvl_clk = clk & LVL_CLK_MASK;

                /* A negative @pos means no bucket is pending in this level */
                if (pos >= 0) {
                        unsigned long tmp = clk + (unsigned long) pos;

                        /*
                         * Presumably scales the level clock back to
                         * jiffies granularity - see LVL_SHIFT().
                         */
                        tmp <<= LVL_SHIFT(lvl);
                        if (time_before(tmp, next))
                                next = tmp;

                        /*
                         * If the next expiration happens before we reach
                         * the next level, no need to check further.
                         */
                        if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
                                break;
                }
                /*
                 * Clock for the next level. If the current level clock lower
                 * bits are zero, we look at the next level as is. If not we
                 * need to advance it by one because that's going to be the
                 * next expiring bucket in that level. base->clk is the next
                 * expiring jiffie. So in case of:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    0    0
                 *
                 * we have to look at all levels @index 0. With
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    0    2
                 *
                 * LVL0 has the next expiring bucket @index 2. The upper
                 * levels have the next expiring bucket @index 1.
                 *
                 * In case that the propagation wraps the next level the same
                 * rules apply:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    F    2
                 *
                 * So after looking at LVL0 we get:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1
                 *  0    0    0    1    0
                 *
                 * So no propagation from LVL1 to LVL2 because that happened
                 * with the add already, but then we need to propagate further
                 * from LVL2 to LVL3.
                 *
                 * So the simple check whether the lower bits of the current
                 * level are 0 or not is sufficient for all cases.
                 */
                adj = lvl_clk ? 1 : 0;
                clk >>= LVL_CLK_SHIFT;
                clk += adj;
        }

        base->next_expiry = next;
        base->next_expiry_recalc = false;
        /* Pending only if some level produced an earlier expiry */
        base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
}

#ifdef CONFIG_NO_HZ_COMMON
/*
 * Pick the earlier of the next timer wheel event (@expires) and the next
 * hrtimer event. An hrtimer event which is not after @basem is clamped to
 * @basem; otherwise it is rounded up to a multiple of TICK_NSEC.
 */
static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
{
        u64 hrt_next = hrtimer_get_next_event();

        /*
         * The timer wheel event is not later than the hrtimer event. That
         * is always the case when high resolution timers are enabled,
         * because hrtimer_get_next_event() returns KTIME_MAX then.
         */
        if (hrt_next >= expires)
                return expires;

        /*
         * The next hrtimer is already expired: return the tick base time
         * so the tick is fired immediately.
         */
        if (hrt_next <= basem)
                return basem;

        /*
         * Round up to the next jiffie. High resolution timers are off, so
         * the hrtimers are expired in the tick and this tick really has to
         * expire the timer, otherwise the nohz stop code would ping pong.
         *
         * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
         */
        return DIV_ROUND_UP_ULL(hrt_next, TICK_NSEC) * TICK_NSEC;
}

/*
 * Return the next expiry of @base, recalculating it first when it is marked
 * for recalculation. @basej is the base time in jiffies.
 */
static unsigned long next_timer_interrupt(struct timer_base *base,
                                          unsigned long basej)
{
        if (base->next_expiry_recalc)
                next_expiry_recalc(base);

        if (base->timers_pending)
                return base->next_expiry;

        /*
         * The base is empty: push next_expiry into the future. This avoids
         * an unnecessary raise of the timer softirq when the stale
         * next_expiry value would be reached although no timer is pending.
         *
         * The update is also required to keep timer_base::next_expiry
         * values easily comparable when looking for the base which holds
         * the first pending timer.
         */
        base->next_expiry = basej + NEXT_TIMER_MAX_DELTA;

        return base->next_expiry;
}

/*
 * Evaluate the next expiry of @base_local and @base_global and store the
 * resulting clock monotonic expiry times in @tevt. @basej / @basem are the
 * base time in jiffies / clock monotonic.
 *
 * Returns the earlier of both expiries in jiffies, clamped to @basej when it
 * is already in the past.
 */
static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem,
                                                struct timer_base *base_local,
                                                struct timer_base *base_global,
                                                struct timer_events *tevt)
{
        unsigned long exp_local, exp_global, first;
        bool local_first;

        exp_local = next_timer_interrupt(base_local, basej);
        exp_global = next_timer_interrupt(base_global, basej);

        /* On a tie the local base is considered to be first */
        local_first = time_before_eq(exp_local, exp_global);
        if (local_first)
                first = exp_local;
        else
                first = exp_global;

        if (!time_before_eq(first, basej + 1)) {
                /*
                 * The first event is more than one tick away. Update the
                 * tevt.* values:
                 *
                 * If the local queue expires first, then the global event
                 * can be ignored. If the global queue is empty, nothing to
                 * do either.
                 */
                if (!local_first && base_global->timers_pending)
                        tevt->global = basem + (u64)(exp_global - basej) * TICK_NSEC;

                if (base_local->timers_pending)
                        tevt->local = basem + (u64)(exp_local - basej) * TICK_NSEC;

                return first;
        }

        /*
         * The first event is at max. one tick away: use it and store it in
         * the local expiry value. The next global event is irrelevant in
         * this case and can be left as KTIME_MAX.
         */

        /* If we missed a tick already, force 0 delta */
        if (time_before(first, basej))
                first = basej;
        tevt->local = basem + (u64)(first - basej) * TICK_NSEC;

        /*
         * This is required for the remote check only but it doesn't hurt,
         * when it is done for both call sites:
         *
         * * The remote callers will only take care of the global timers as
         *   local timers will be handled by CPU itself. When not updating
         *   tevt->global with the already missed first global timer, it is
         *   possible that it will be missed completely.
         *
         * * The local callers will ignore the tevt->global anyway, when the
         *   first event is max. one tick away.
         */
        if (!local_first)
                tevt->global = tevt->local;

        return first;
}

# ifdef CONFIG_SMP
/**
 * fetch_next_timer_interrupt_remote() - Store next timers into @tevt
 * @basej:        base time jiffies
 * @basem:        base time clock monotonic
 * @tevt:        Pointer to the storage for the expiry values
 * @cpu:        Remote CPU
 *
 * Stores the next pending local and global timer expiry values of @cpu in
 * the struct pointed to by @tevt. The field of an empty queue is set to
 * KTIME_MAX. The global event is set to KTIME_MAX as well when the local
 * event expires before the global event.
 *
 * Caller needs to make sure timer base locks are held (use
 * timer_lock_remote_bases() for this purpose).
 */
void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem,
                                       struct timer_events *tevt,
                                       unsigned int cpu)
{
        struct timer_base *lbase = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
        struct timer_base *gbase = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

        lockdep_assert_held(&lbase->lock);
        lockdep_assert_held(&gbase->lock);

        /* Preset local / global events */
        tevt->global = KTIME_MAX;
        tevt->local = KTIME_MAX;

        fetch_next_timer_interrupt(basej, basem, lbase, gbase, tevt);
}

/**
 * timer_unlock_remote_bases - unlock timer bases of cpu
 * @cpu:        Remote CPU
 *
 * Unlocks the remote timer bases: the global base first, then the local
 * base.
 */
void timer_unlock_remote_bases(unsigned int cpu)
        __releases(timer_bases[BASE_LOCAL]->lock)
        __releases(timer_bases[BASE_GLOBAL]->lock)
{
        struct timer_base *lbase = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
        struct timer_base *gbase = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

        raw_spin_unlock(&gbase->lock);
        raw_spin_unlock(&lbase->lock);
}

/**
 * timer_lock_remote_bases - lock timer bases of cpu
 * @cpu:        Remote CPU
 *
 * Locks the remote timer bases: the local base first, then the global base
 * (nested). Interrupts have to be disabled by the caller.
 */
void timer_lock_remote_bases(unsigned int cpu)
        __acquires(timer_bases[BASE_LOCAL]->lock)
        __acquires(timer_bases[BASE_GLOBAL]->lock)
{
        struct timer_base *lbase = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
        struct timer_base *gbase = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

        lockdep_assert_irqs_disabled();

        raw_spin_lock(&lbase->lock);
        raw_spin_lock_nested(&gbase->lock, SINGLE_DEPTH_NESTING);
}

/**
 * timer_base_is_idle() - Return whether timer base is set idle
 *
 * Return: the is_idle value of the local timer base of this CPU.
 */
bool timer_base_is_idle(void)
{
        bool idle = __this_cpu_read(timer_bases[BASE_LOCAL].is_idle);

        return idle;
}

static void __run_timer_base(struct timer_base *base);

/**
 * timer_expire_remote() - expire global timers of cpu
 * @cpu:        Remote CPU
 *
 * Expire timers of global base of remote CPU.
 */
void timer_expire_remote(unsigned int cpu)
{
        struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

        __run_timer_base(base);
}

/*
 * Hand the first global event (tevt->global) to the timer migration
 * hierarchy and, if the hierarchy reports an event before tevt->local, make
 * that event the local one and update *@nextevt accordingly.
 *
 * The hierarchy function depends on the call site: an idle timer base uses
 * tmigr_cpu_new_timer(), a non-NULL @tick_stop_path selects
 * tmigr_cpu_deactivate() and everything else tmigr_quick_check().
 */
static void timer_use_tmigr(unsigned long basej, u64 basem,
                            unsigned long *nextevt, bool *tick_stop_path,
                            bool timer_base_idle, struct timer_events *tevt)
{
        u64 tmigr_exp, ticks;

        if (timer_base_idle)
                tmigr_exp = tmigr_cpu_new_timer(tevt->global);
        else if (tick_stop_path)
                tmigr_exp = tmigr_cpu_deactivate(tevt->global);
        else
                tmigr_exp = tmigr_quick_check(tevt->global);

        /*
         * If the CPU is the last going idle in timer migration hierarchy,
         * make sure the CPU will wake up in time to handle remote timers.
         * The hierarchy returns KTIME_MAX if other CPUs are still active,
         * in which case nothing has to be done here.
         */
        if (tmigr_exp >= tevt->local)
                return;

        /* If we missed a tick already, force 0 delta */
        if (tmigr_exp < basem)
                tmigr_exp = basem;

        ticks = div_u64(tmigr_exp - basem, TICK_NSEC);

        *nextevt = basej + (unsigned long)ticks;
        tevt->local = tmigr_exp;
}
# else
/*
 * !SMP variant: there is no timer migration hierarchy, only @tevt is used.
 */
static void timer_use_tmigr(unsigned long basej, u64 basem,
                            unsigned long *nextevt, bool *tick_stop_path,
                            bool timer_base_idle, struct timer_events *tevt)
{
        u64 first;

        /*
         * Make sure first event is written into tevt->local to not miss a
         * timer on !SMP systems.
         */
        first = min_t(u64, tevt->local, tevt->global);
        tevt->local = first;
}
# endif /* CONFIG_SMP */

/*
 * Common worker of get_next_timer_interrupt() and
 * timer_base_try_to_set_idle().
 *
 * @basej:      base time jiffies
 * @basem:      base time clock monotonic
 * @idle:       NULL, or a pointer which receives base_local->is_idle
 *
 * Evaluates the local and the global timer base of this CPU with both base
 * locks held, consults the timer migration hierarchy when the next event is
 * more than one tick away and forwards both bases. The is_idle state of the
 * bases is only updated when @idle is not NULL.
 *
 * Returns the result of cmp_next_hrtimer_event() for the first local event.
 * On an offline CPU a warning is issued and KTIME_MAX is returned.
 */
static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
                                             bool *idle)
{
        struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX };
        struct timer_base *base_local, *base_global;
        unsigned long nextevt;
        bool idle_is_possible;

        /*
         * When the CPU is offline, the tick is cancelled and nothing is supposed
         * to try to stop it.
         */
        if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) {
                if (idle)
                        *idle = true;
                return tevt.local;
        }

        base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
        base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]);

        /* Lock order: local base first, then the global base (nested) */
        raw_spin_lock(&base_local->lock);
        raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);

        nextevt = fetch_next_timer_interrupt(basej, basem, base_local,
                                             base_global, &tevt);

        /*
         * If the next event is only one jiffie ahead there is no need to call
         * timer migration hierarchy related functions. The value for the next
         * global timer in @tevt struct equals then KTIME_MAX. This is also
         * true, when the timer base is idle.
         *
         * The proper timer migration hierarchy function depends on the callsite
         * and whether timer base is idle or not. @nextevt will be updated when
         * this CPU needs to handle the first timer migration hierarchy
         * event. See timer_use_tmigr() for detailed information.
         */
        idle_is_possible = time_after(nextevt, basej + 1);
        if (idle_is_possible)
                timer_use_tmigr(basej, basem, &nextevt, idle,
                                base_local->is_idle, &tevt);

        /*
         * We have a fresh next event. Check whether we can forward the
         * base.
         */
        __forward_timer_base(base_local, basej);
        __forward_timer_base(base_global, basej);

        /*
         * Set base->is_idle only when caller is timer_base_try_to_set_idle()
         */
        if (idle) {
                /*
                 * Bases are idle if the next event is more than a tick
                 * away. Caution: @nextevt could have changed by enqueueing a
                 * global timer into timer migration hierarchy. Therefore a new
                 * check is required here.
                 *
                 * If the base is marked idle then any timer add operation must
                 * forward the base clk itself to keep granularity small. This
                 * idle logic is only maintained for the BASE_LOCAL and
                 * BASE_GLOBAL base, deferrable timers may still see large
                 * granularity skew (by design).
                 */
                if (!base_local->is_idle && time_after(nextevt, basej + 1)) {
                        base_local->is_idle = true;
                        /*
                         * Global timers queued locally while running in a task
                         * in nohz_full mode need a self-IPI to kick reprogramming
                         * in IRQ tail.
                         */
                        if (tick_nohz_full_cpu(base_local->cpu))
                                base_global->is_idle = true;
                        trace_timer_base_idle(true, base_local->cpu);
                }
                *idle = base_local->is_idle;

                /*
                 * When timer base is not set idle, undo the effect of
                 * tmigr_cpu_deactivate() to prevent inconsistent states - active
                 * timer base but inactive timer migration hierarchy.
                 *
                 * When timer base was already marked idle, nothing will be
                 * changed here.
                 */
                if (!base_local->is_idle && idle_is_possible)
                        tmigr_cpu_activate();
        }

        /* Unlock in reverse order of locking */
        raw_spin_unlock(&base_global->lock);
        raw_spin_unlock(&base_local->lock);

        return cmp_next_hrtimer_event(basem, tevt.local);
}

/**
 * get_next_timer_interrupt() - return the time (clock mono) of the next timer
 * @basej:        base time jiffies
 * @basem:        base time clock monotonic
 *
 * Returns the tick aligned clock monotonic time of the next pending timer or
 * KTIME_MAX if no timer is pending. If timer of global base was queued into
 * timer migration hierarchy, first global timer is not taken into account. If
 * it was the last CPU of timer migration hierarchy going idle, first global
 * event is taken into account.
 */
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
        u64 next;

        /* A NULL idle pointer: the is_idle state of the bases is not set */
        next = __get_next_timer_interrupt(basej, basem, NULL);

        return next;
}

/**
 * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases
 * @basej:        base time jiffies
 * @basem:        base time clock monotonic
 * @idle:        pointer to store the value of timer_base->is_idle on return;
 *                *idle contains the information whether tick was already stopped
 *
 * Returns the tick aligned clock monotonic time of the next pending timer or
 * KTIME_MAX if no timer is pending. When tick was already stopped KTIME_MAX is
 * returned as well.
 */
u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle)
{
        /* Only evaluate the bases as long as the tick is not yet stopped */
        if (!*idle)
                return __get_next_timer_interrupt(basej, basem, idle);

        return KTIME_MAX;
}

/**
 * timer_clear_idle - Clear the idle state of the timer base
 *
 * Clears is_idle of the local base, and of the global base as well on a
 * nohz_full CPU, then activates the CPU in the timer migration hierarchy.
 *
 * Called with interrupts disabled
 */
void timer_clear_idle(void)
{
        int cpu = smp_processor_id();

        /*
         * This is done without holding the base lock. The worst outcome is
         * a remote pinned timer enqueue sending a pointless IPI. Taking the
         * lock would only shrink the window for sending that IPI by a few
         * instructions, at the price of taking the lock in the exit from
         * idle path. Required for BASE_LOCAL only.
         */
        __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
        if (tick_nohz_full_cpu(cpu))
                __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
        trace_timer_base_idle(false, cpu);

        /* Activate without holding the timer_base->lock */
        tmigr_cpu_activate();
}
#endif

/**
 * __run_timers - run all expired timers (if any) of a timer base.
 * @base: the timer vector to be processed.
 *
 * Must be called with base->lock held. Returns right away when the base is
 * already marked as running a timer.
 */
static inline void __run_timers(struct timer_base *base)
{
        struct hlist_head heads[LVL_DEPTH];
        int levels, lvl;

        lockdep_assert_held(&base->lock);

        if (base->running_timer)
                return;

        for (;;) {
                /* Stop once jiffies is before base->clk or next_expiry */
                if (!time_after_eq(jiffies, base->clk) ||
                    !time_after_eq(jiffies, base->next_expiry))
                        break;

                levels = collect_expired_timers(base, heads);
                /*
                 * Finding no expired timer at this clk has two legitimate
                 * reasons: all matching timers have been dequeued, or no
                 * timer has been queued since base::next_expiry was set to
                 * base::clk + NEXT_TIMER_MAX_DELTA.
                 */
                WARN_ON_ONCE(!levels && !base->next_expiry_recalc
                             && base->timers_pending);
                /*
                 * While executing timers, base->clk is set 1 offset ahead of
                 * jiffies to avoid endless requeuing to current jiffies.
                 */
                base->clk++;
                next_expiry_recalc(base);

                /* Expire the collected lists, last collected level first */
                for (lvl = levels - 1; lvl >= 0; lvl--)
                        expire_timers(base, heads + lvl);
        }
}

/*
 * Run the expired timers of @base. Nothing is locked and nothing is done
 * when jiffies is still before base->next_expiry.
 */
static void __run_timer_base(struct timer_base *base)
{
        /* Unlocked check of next_expiry */
        if (!time_before(jiffies, base->next_expiry)) {
                timer_base_lock_expiry(base);
                raw_spin_lock_irq(&base->lock);
                __run_timers(base);
                raw_spin_unlock_irq(&base->lock);
                timer_base_unlock_expiry(base);
        }
}

static void run_timer_base(int index)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[index]);

        __run_timer_base(base);
}

/*
 * TIMER_SOFTIRQ handler: runs the timers of this CPU in bottom half context.
 * The global and the deferrable base as well as the remote handling of the
 * timer migration hierarchy only exist with CONFIG_NO_HZ_COMMON.
 */
static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
        run_timer_base(BASE_LOCAL);

        if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
                return;

        run_timer_base(BASE_GLOBAL);
        run_timer_base(BASE_DEF);

        if (is_timers_nohz_active())
                tmigr_handle_remote();
}

/*
 * Called by the local, per-CPU timer interrupt on SMP.
 */
static void run_local_timers(void)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);

        hrtimer_run_queues();

        for (int i = 0; i < NR_BASES; i++, base++) {
                /* Raise the softirq only if required. */
                if (time_after_eq(jiffies, base->next_expiry) ||
                    (i == BASE_DEF && tmigr_requires_handle_remote())) {
                        raise_softirq(TIMER_SOFTIRQ);
                        return;
                }
        }
}

/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process. @user_tick is 1 if the tick is user time, 0 for system.
 *
 * Besides the accounting this runs the local timers, the RCU and scheduler
 * tick hooks and, depending on the configuration, the irq work tick and the
 * posix CPU timers.
 */
void update_process_times(int user_tick)
{
        /* Note: this timer irq context must be accounted for as well. */
        account_process_tick(current, user_tick);
        run_local_timers();
        rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
        if (in_irq())
                irq_work_tick();
#endif
        sched_tick();
        if (IS_ENABLED(CONFIG_POSIX_TIMERS))
                run_posix_cpu_timers();
}

/*
 * Since schedule_timeout()'s timer is defined on the stack, it must store
 * the target task on the stack as well.
 */
struct process_timer {
        struct timer_list timer;        /* timer armed by schedule_timeout() */
        struct task_struct *task;       /* task woken up by process_timeout() */
};

static void process_timeout(struct timer_list *t)
{
        struct process_timer *timeout = from_timer(timeout, t, timer);

        wake_up_process(timeout->task);
}

/**
 * schedule_timeout - sleep until timeout
 * @timeout: timeout value in jiffies
 *
 * Make the current task sleep until @timeout jiffies have elapsed.
 * The function behavior depends on the current task state
 * (see also set_current_state() description):
 *
 * %TASK_RUNNING - the scheduler is called, but the task does not sleep
 * at all. That happens because sched_submit_work() does nothing for
 * tasks in %TASK_RUNNING state.
 *
 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
 * pass before the routine returns unless the current task is explicitly
 * woken up, (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be %TASK_RUNNING when this
 * routine returns.
 *
 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
 * the CPU away without a bound on the timeout. In this case the return
 * value will be %MAX_SCHEDULE_TIMEOUT.
 *
 * Returns 0 when the timer has expired otherwise the remaining time in
 * jiffies will be returned. In all cases the return value is guaranteed
 * to be non-negative.
 */
signed long __sched schedule_timeout(signed long timeout)
{
        struct process_timer timer;
        unsigned long expire;

        if (timeout == MAX_SCHEDULE_TIMEOUT) {
                /*
                 * This special case only exists for the convenience of
                 * the caller. MAX_SCHEDULE_TIMEOUT could have been one of
                 * the negative values, but a valid offset (>= 0) is
                 * returned so that the caller can do whatever it wants
                 * with the return value.
                 */
                schedule();
                return timeout;
        }

        if (timeout < 0) {
                /*
                 * Another bit of paranoia. The return value is 0 because
                 * no piece of the kernel is supposed to check for a
                 * negative return value of schedule_timeout() (it should
                 * never happen anyway). The printk() tells you that
                 * something has gone wrong and where.
                 */
                printk(KERN_ERR "schedule_timeout: wrong timeout "
                        "value %lx\n", timeout);
                dump_stack();
                __set_current_state(TASK_RUNNING);
                return 0;
        }

        expire = timeout + jiffies;

        /* Arm an on-stack timer which wakes up this task on expiry */
        timer.task = current;
        timer_setup_on_stack(&timer.timer, process_timeout, 0);
        __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
        schedule();
        del_timer_sync(&timer.timer);

        /* Remove the timer from the object tracker */
        destroy_timer_on_stack(&timer.timer);

        /* Remaining time; a negative remainder is reported as 0 */
        timeout = expire - jiffies;

        return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);

/*
 * schedule_timeout() in TASK_INTERRUPTIBLE state.
 *
 * We can use __set_current_state() here because schedule_timeout() calls
 * schedule() unconditionally.
 */
signed long __sched schedule_timeout_interruptible(signed long timeout)
{
        signed long remaining;

        __set_current_state(TASK_INTERRUPTIBLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_interruptible);

/* schedule_timeout() in TASK_KILLABLE state. */
signed long __sched schedule_timeout_killable(signed long timeout)
{
        signed long remaining;

        __set_current_state(TASK_KILLABLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_killable);

/* schedule_timeout() in TASK_UNINTERRUPTIBLE state. */
signed long __sched schedule_timeout_uninterruptible(signed long timeout)
{
        signed long remaining;

        __set_current_state(TASK_UNINTERRUPTIBLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);

/*
 * schedule_timeout() in TASK_IDLE state: like
 * schedule_timeout_uninterruptible(), except this task will not contribute
 * to load average.
 */
signed long __sched schedule_timeout_idle(signed long timeout)
{
        signed long remaining;

        __set_current_state(TASK_IDLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_idle);

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Move every timer queued on @head over to @new_base: detach it, replace the
 * TIMER_BASEMASK bits of its flags with the CPU of @new_base and enqueue it
 * there.
 */
static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
{
        int target_cpu = new_base->cpu;

        while (!hlist_empty(head)) {
                struct timer_list *t;

                t = hlist_entry(head->first, struct timer_list, entry);
                detach_timer(t, false);
                /* One single store: swap the old base bits for the new CPU */
                t->flags = (t->flags & ~TIMER_BASEMASK) | target_cpu;
                internal_add_timer(new_base, t);
        }
}

/*
 * Reset all timer bases of @cpu: the clock is set to the current jiffies,
 * next_expiry to the "nothing pending" value and the recalc, pending and
 * idle state are cleared. Always returns 0.
 */
int timers_prepare_cpu(unsigned int cpu)
{
        int b;

        for (b = 0; b < NR_BASES; b++) {
                struct timer_base *base = per_cpu_ptr(&timer_bases[b], cpu);
                unsigned long now = jiffies;

                base->clk = now;
                base->next_expiry = now + NEXT_TIMER_MAX_DELTA;
                base->next_expiry_recalc = false;
                base->timers_pending = false;
                base->is_idle = false;
        }

        return 0;
}

/*
 * Move all timers which are still queued on the bases of @cpu over to the
 * corresponding bases of the CPU this function runs on. Always returns 0.
 */
int timers_dead_cpu(unsigned int cpu)
{
        int b;

        for (b = 0; b < NR_BASES; b++) {
                struct timer_base *dead = per_cpu_ptr(&timer_bases[b], cpu);
                struct timer_base *target = get_cpu_ptr(&timer_bases[b]);
                int bucket;

                /*
                 * The caller is globally serialized and nobody else
                 * takes two locks at once, deadlock is not possible.
                 */
                raw_spin_lock_irq(&target->lock);
                raw_spin_lock_nested(&dead->lock, SINGLE_DEPTH_NESTING);

                /*
                 * The base clock of the current CPU might be stale. Bring
                 * it up to date before the timers are moved over.
                 */
                forward_timer_base(target);

                WARN_ON_ONCE(dead->running_timer);
                dead->running_timer = NULL;

                for (bucket = 0; bucket < WHEEL_SIZE; bucket++)
                        migrate_timer_list(target, dead->vectors + bucket);

                raw_spin_unlock(&dead->lock);
                raw_spin_unlock_irq(&target->lock);
                put_cpu_ptr(&timer_bases);
        }

        return 0;
}

#endif /* CONFIG_HOTPLUG_CPU */

/*
 * Initialise all timer bases of @cpu: owning CPU, base lock, clock, next
 * expiry and the expiry lock.
 */
static void __init init_timer_cpu(int cpu)
{
        int idx;

        for (idx = 0; idx < NR_BASES; idx++) {
                struct timer_base *base = per_cpu_ptr(&timer_bases[idx], cpu);

                base->cpu = cpu;
                raw_spin_lock_init(&base->lock);
                base->clk = jiffies;
                base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
                timer_base_init_expiry_lock(base);
        }
}

/* Run init_timer_cpu() once for every possible CPU. */
static void __init init_timer_cpus(void)
{
        int possible;

        for_each_possible_cpu(possible) {
                init_timer_cpu(possible);
        }
}

/*
 * init_timers - boot-time setup of the timer wheel
 *
 * Initializes the timer bases of all possible CPUs, calls
 * posix_cputimers_init_work() and installs run_timer_softirq() as the
 * handler for TIMER_SOFTIRQ.
 */
void __init init_timers(void)
{
        init_timer_cpus();
        posix_cputimers_init_work();
        open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}

/**
 * msleep - sleep safely even with waitqueue interruptions
 * @msecs: Time in milliseconds to sleep for
 *
 * The request is converted to jiffies plus one extra jiffy. The task then
 * keeps re-entering uninterruptible sleep until
 * schedule_timeout_uninterruptible() reports that no time is left.
 */
void msleep(unsigned int msecs)
{
        unsigned long remaining;

        for (remaining = msecs_to_jiffies(msecs) + 1; remaining; )
                remaining = schedule_timeout_uninterruptible(remaining);
}

EXPORT_SYMBOL(msleep);

/**
 * msleep_interruptible - sleep waiting for signals
 * @msecs: Time in milliseconds to sleep for
 *
 * Sleeps interruptibly until the requested time (plus one jiffy) has passed
 * or a signal is pending for the current task, whichever happens first.
 *
 * Return: the time that was still left to sleep, in milliseconds.
 */
unsigned long msleep_interruptible(unsigned int msecs)
{
        unsigned long remaining = msecs_to_jiffies(msecs) + 1;

        for (;;) {
                /* Done sleeping, or a signal wants our attention */
                if (!remaining || signal_pending(current))
                        break;

                remaining = schedule_timeout_interruptible(remaining);
        }

        return jiffies_to_msecs(remaining);
}

EXPORT_SYMBOL(msleep_interruptible);

/**
 * usleep_range_state - Sleep for an approximate time in a given state
 * @min:        Minimum time in usecs to sleep
 * @max:        Maximum time in usecs to sleep
 * @state:        Task state the current task is put into while sleeping
 *
 * In non-atomic context where the exact wakeup time is flexible, use
 * usleep_range_state() instead of udelay().  Sleeping avoids the CPU-hogging
 * busy-wait of udelay() and so improves responsiveness, while the range
 * lets hrtimers reuse an already-scheduled interrupt instead of programming
 * a new one just for this sleep, which reduces power usage.
 */
void __sched usleep_range_state(unsigned long min, unsigned long max,
                                unsigned int state)
{
        /* Absolute expiry time: now + @min microseconds */
        ktime_t deadline = ktime_add_us(ktime_get(), min);
        /* Width of the allowed wakeup window, in nanoseconds */
        u64 slack_ns = (u64)(max - min) * NSEC_PER_USEC;

        /* Do not return before the requested sleep time has elapsed */
        do {
                __set_current_state(state);
        } while (schedule_hrtimeout_range(&deadline, slack_ns, HRTIMER_MODE_ABS));
}
EXPORT_SYMBOL(usleep_range_state);





















































































































































































































































































































    1 





















































































    1 
















































    1 







    1 











    1 
    1 









    1 




























































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/* af_can.c - Protocol family CAN core module
 *            (used by different CAN protocol modules)
 *
 * Copyright (c) 2002-2017 Volkswagen Group Electronic Research
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of Volkswagen nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * Alternatively, provided that this notice is retained in full, this
 * software may be distributed under the terms of the GNU General
 * Public License ("GPL") version 2, in which case the provisions of the
 * GPL apply INSTEAD OF those given above.
 *
 * The provided data structures and external interfaces from this code
 * are not restricted to be used by modules with a GPL compatible license.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 */

#include <linux/module.h>
#include <linux/stddef.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/if_ether.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <linux/can.h>
#include <linux/can/core.h>
#include <linux/can/skb.h>
#include <linux/can/can-ml.h>
#include <linux/ratelimit.h>
#include <net/net_namespace.h>
#include <net/sock.h>

#include "af_can.h"

MODULE_DESCRIPTION("Controller Area Network PF_CAN core");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>, "
              "Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");

MODULE_ALIAS_NETPROTO(PF_CAN);

static int stats_timer __read_mostly = 1;
module_param(stats_timer, int, 0444);
MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)");

static struct kmem_cache *rcv_cache __read_mostly;

/* table of registered CAN protocols */
static const struct can_proto __rcu *proto_tab[CAN_NPROTO] __read_mostly;
static DEFINE_MUTEX(proto_tab_lock);

static atomic_t skbcounter = ATOMIC_INIT(0);

/* af_can socket functions */

/* Socket destructor: drop every skb still queued on the receive and error queues of @sk. */
void can_sock_destruct(struct sock *sk)
{
        struct sk_buff_head *rx_queue = &sk->sk_receive_queue;
        struct sk_buff_head *err_queue = &sk->sk_error_queue;

        skb_queue_purge(rx_queue);
        skb_queue_purge(err_queue);
}
EXPORT_SYMBOL(can_sock_destruct);

/*
 * Look up the registered CAN protocol for @protocol and take a reference on
 * its owning module. Returns NULL when no protocol is registered in that
 * slot or the module reference could not be taken. The caller must have
 * range-checked @protocol; a successful lookup is undone by can_put_proto().
 */
static const struct can_proto *can_get_proto(int protocol)
{
        const struct can_proto *found;

        rcu_read_lock();

        found = rcu_dereference(proto_tab[protocol]);
        if (found) {
                if (!try_module_get(found->prot->owner))
                        found = NULL;
        }

        rcu_read_unlock();

        return found;
}

/* Release the module reference held on the owner of @cp's proto. */
static inline void can_put_proto(const struct can_proto *cp)
{
        struct module *owner = cp->prot->owner;

        module_put(owner);
}

static int can_create(struct net *net, struct socket *sock, int protocol,
                      int kern)
{
        struct sock *sk;
        const struct can_proto *cp;
        int err = 0;

        sock->state = SS_UNCONNECTED;

        if (protocol < 0 || protocol >= CAN_NPROTO)
                return -EINVAL;

        cp = can_get_proto(protocol);

#ifdef CONFIG_MODULES
        if (!cp) {
                /* try to load protocol module if kernel is modular */

                err = request_module("can-proto-%d", protocol);

                /* In case of error we only print a message but don't
                 * return the error code immediately.  Below we will
                 * return -EPROTONOSUPPORT
                 */
                if (err)
                        pr_err_ratelimited("can: request_module (can-proto-%d) failed.\n",
                                           protocol);

                cp = can_get_proto(protocol);
        }
#endif

        /* check for available protocol and correct usage */

        if (!cp)
                return -EPROTONOSUPPORT;

        if (cp->type != sock->type) {
                err = -EPROTOTYPE;
                goto errout;
        }

        sock->ops = cp->ops;

        sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot, kern);
        if (!sk) {
                err = -ENOMEM;
                goto errout;
        }

        sock_init_data(sock, sk);
        sk->sk_destruct = can_sock_destruct;

        if (sk->sk_prot->init)
                err = sk->sk_prot->init(sk);

        if (err) {
                /* release sk on errors */
                sock_orphan(sk);
                sock_put(sk);
        }

 errout:
        can_put_proto(cp);
        return err;
}

/* af_can tx path */

/**
 * can_send - transmit a CAN frame (optional with local loopback)
 * @skb: pointer to socket buffer with CAN frame in data section
 * @loop: loopback for listeners on local CAN sockets (recommended default!)
 *
 * Due to the loopback this routine must not be called from hardirq context.
 *
 * The skb is never handed back to the caller: it is freed here on every
 * validation or clone failure and passed to dev_queue_xmit() otherwise.
 *
 * Return:
 *  0 on success
 *  -ENETDOWN when the selected interface is down
 *  -ENOBUFS on full driver queue (see net_xmit_errno())
 *  -ENOMEM when local loopback failed at calling skb_clone()
 *  -EPERM when trying to send on a non-CAN interface
 *  -EMSGSIZE CAN frame size is bigger than CAN interface MTU
 *  -EINVAL when the skb->data does not contain a valid CAN frame
 */
int can_send(struct sk_buff *skb, int loop)
{
        struct sk_buff *newskb = NULL;
        struct can_pkg_stats *pkg_stats = dev_net(skb->dev)->can.pkg_stats;
        int err = -EINVAL;

        /* classify the frame and set the matching ethertype; anything that
         * is neither CAN XL, Classical CAN nor CAN FD is rejected (-EINVAL)
         */
        if (can_is_canxl_skb(skb)) {
                skb->protocol = htons(ETH_P_CANXL);
        } else if (can_is_can_skb(skb)) {
                skb->protocol = htons(ETH_P_CAN);
        } else if (can_is_canfd_skb(skb)) {
                struct canfd_frame *cfd = (struct canfd_frame *)skb->data;

                skb->protocol = htons(ETH_P_CANFD);

                /* set CAN FD flag for CAN FD frames by default */
                cfd->flags |= CANFD_FDF;
        } else {
                goto inval_skb;
        }

        /* Make sure the CAN frame can pass the selected CAN netdevice. */
        if (unlikely(skb->len > skb->dev->mtu)) {
                err = -EMSGSIZE;
                goto inval_skb;
        }

        if (unlikely(skb->dev->type != ARPHRD_CAN)) {
                err = -EPERM;
                goto inval_skb;
        }

        if (unlikely(!(skb->dev->flags & IFF_UP))) {
                err = -ENETDOWN;
                goto inval_skb;
        }

        skb->ip_summed = CHECKSUM_UNNECESSARY;

        skb_reset_mac_header(skb);
        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);

        if (loop) {
                /* local loopback of sent CAN frames */

                /* indication for the CAN driver: do loopback */
                skb->pkt_type = PACKET_LOOPBACK;

                /* The reference to the originating sock may be required
                 * by the receiving socket to check whether the frame is
                 * its own. Example: can_raw sockopt CAN_RAW_RECV_OWN_MSGS
                 * Therefore we have to ensure that skb->sk remains the
                 * reference to the originating sock by restoring skb->sk
                 * after each skb_clone() or skb_orphan() usage.
                 */

                if (!(skb->dev->flags & IFF_ECHO)) {
                        /* If the interface is not capable to do loopback
                         * itself, we do it here.
                         */
                        newskb = skb_clone(skb, GFP_ATOMIC);
                        if (!newskb) {
                                kfree_skb(skb);
                                return -ENOMEM;
                        }

                        can_skb_set_owner(newskb, skb->sk);
                        newskb->ip_summed = CHECKSUM_UNNECESSARY;
                        newskb->pkt_type = PACKET_BROADCAST;
                }
        } else {
                /* indication for the CAN driver: no loopback required */
                skb->pkt_type = PACKET_HOST;
        }

        /* send to netdevice; skb must not be touched from here on */
        err = dev_queue_xmit(skb);
        if (err > 0)
                err = net_xmit_errno(err);

        if (err) {
                /* transmit failed: drop the loopback clone (may be NULL) */
                kfree_skb(newskb);
                return err;
        }

        /* feed the clone into the local rx path for software loopback */
        if (newskb)
                netif_rx(newskb);

        /* update statistics */
        pkg_stats->tx_frames++;
        pkg_stats->tx_frames_delta++;

        return 0;

inval_skb:
        kfree_skb(skb);
        return err;
}
EXPORT_SYMBOL(can_send);

/* af_can rx path */

/*
 * Pick the receive lists to work on: the per-namespace 'all devices' lists
 * when @dev is NULL, otherwise the lists embedded in the device's CAN
 * mid-layer private data. Callers pass only devices for which
 * can_get_ml_priv() is expected to be non-NULL.
 */
static struct can_dev_rcv_lists *can_dev_rcv_lists_find(struct net *net,
                                                        struct net_device *dev)
{
        struct can_ml_priv *ml_priv;

        if (!dev)
                return net->can.rx_alldev_list;

        ml_priv = can_get_ml_priv(dev);
        return &ml_priv->dev_rcv_lists;
}

/**
 * effhash - hash function for 29 bit CAN identifier reduction
 * @can_id: 29 bit CAN identifier
 *
 * Description:
 *  Subscriptions to a _single_ EFF CAN identifier are spread over an array
 *  of lists to shorten the linear list traversal. The identifier is folded
 *  by XOR-ing it with itself shifted right by one and by two times
 *  CAN_EFF_RCV_HASH_BITS, then cut down to CAN_EFF_RCV_HASH_BITS bits.
 *
 * Return:
 *  Hash value in the range 0 .. (1 << CAN_EFF_RCV_HASH_BITS) - 1
 */
static unsigned int effhash(canid_t can_id)
{
        const unsigned int hash_mask = (1 << CAN_EFF_RCV_HASH_BITS) - 1;
        unsigned int folded = can_id ^
                              (can_id >> CAN_EFF_RCV_HASH_BITS) ^
                              (can_id >> (2 * CAN_EFF_RCV_HASH_BITS));

        return folded & hash_mask;
}

/**
 * can_rcv_list_find - determine optimal filterlist inside device filter struct
 * @can_id: pointer to CAN identifier of a given can_filter (updated in place)
 * @mask: pointer to CAN mask of a given can_filter (updated in place)
 * @dev_rcv_lists: pointer to the device filter struct
 *
 * Description:
 *  Returns the optimal filterlist to reduce the filter handling in the
 *  receive path. This function is called by service functions that need
 *  to register or unregister a can_filter in the filter lists.
 *
 *  A filter matches in general, when
 *
 *          <received_can_id> & mask == can_id & mask
 *
 *  so every bit set in the mask (even CAN_EFF_FLAG, CAN_RTR_FLAG) describes
 *  relevant bits for the filter.
 *
 *  The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can
 *  filter for error messages (CAN_ERR_FLAG bit set in mask). For error msg
 *  frames there is a special filterlist and a special rx path filter handling.
 *
 * Return:
 *  Pointer to optimal filterlist for the given can_id/mask pair.
 *  In addition the values behind the pointers are rewritten:
 *  *mask holds the consistency checked mask and, except for error message
 *  filters, *can_id holds the can_id reduced by that mask so it can be used
 *  as preprocessed filter compare value.
 */
static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask,
                                            struct can_dev_rcv_lists *dev_rcv_lists)
{
        canid_t inv = *can_id & CAN_INV_FILTER; /* save flag before masking */

        /* filter for error message frames in extra filterlist */
        if (*mask & CAN_ERR_FLAG) {
                /* clear CAN_ERR_FLAG in filter entry */
                *mask &= CAN_ERR_MASK;
                /* note: *can_id is left untouched on this path */
                return &dev_rcv_lists->rx[RX_ERR];
        }

        /* with cleared CAN_ERR_FLAG we have a simple mask/value filterpair */

#define CAN_EFF_RTR_FLAGS (CAN_EFF_FLAG | CAN_RTR_FLAG)

        /* ensure valid values in can_mask for 'SFF only' frame filtering */
        if ((*mask & CAN_EFF_FLAG) && !(*can_id & CAN_EFF_FLAG))
                *mask &= (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS);

        /* reduce condition testing at receive time */
        *can_id &= *mask;

        /* inverse can_id/can_mask filter */
        if (inv)
                return &dev_rcv_lists->rx[RX_INV];

        /* mask == 0 => no condition testing at receive time */
        if (!(*mask))
                return &dev_rcv_lists->rx[RX_ALL];

        /* extra filterlists for the subscription of a single non-RTR can_id */
        if (((*mask & CAN_EFF_RTR_FLAGS) == CAN_EFF_RTR_FLAGS) &&
            !(*can_id & CAN_RTR_FLAG)) {
                if (*can_id & CAN_EFF_FLAG) {
                        /* all 29 id bits relevant: hashed EFF list */
                        if (*mask == (CAN_EFF_MASK | CAN_EFF_RTR_FLAGS))
                                return &dev_rcv_lists->rx_eff[effhash(*can_id)];
                } else {
                        /* all 11 id bits relevant: list indexed by the id */
                        if (*mask == (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS))
                                return &dev_rcv_lists->rx_sff[*can_id];
                }
        }

        /* default: filter via can_id/can_mask */
        return &dev_rcv_lists->rx[RX_FIL];
}

/**
 * can_rx_register - subscribe CAN frames from a specific interface
 * @net: the applicable net namespace
 * @dev: pointer to netdevice (NULL => subscribe from 'all' CAN devices list)
 * @can_id: CAN identifier (see description)
 * @mask: CAN mask (see description)
 * @func: callback function on filter match
 * @data: returned parameter for callback function
 * @ident: string for calling module identification
 * @sk: socket pointer (might be NULL)
 *
 * Description:
 *  Invokes the callback function with the received sk_buff and the given
 *  parameter 'data' on a matching receive filter. A filter matches, when
 *
 *          <received_can_id> & mask == can_id & mask
 *
 *  The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can
 *  filter for error message frames (CAN_ERR_FLAG bit set in mask).
 *
 *  The provided pointer to the sk_buff is guaranteed to be valid as long as
 *  the callback function is running. The callback function must *not* free
 *  the given sk_buff while processing its task. When the given sk_buff is
 *  needed after the end of the callback function it must be cloned inside
 *  the callback function with skb_clone().
 *
 * Return:
 *  0 on success
 *  -ENOMEM on missing cache mem to create subscription entry
 *  -ENODEV unknown device
 */
int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id,
                    canid_t mask, void (*func)(struct sk_buff *, void *),
                    void *data, char *ident, struct sock *sk)
{
        struct can_rcv_lists_stats *stats = net->can.rcv_lists_stats;
        struct can_dev_rcv_lists *lists;
        struct hlist_head *head;
        struct receiver *entry;

        /* a given device must be a fully set up CAN device of this netns */
        if (dev) {
                if (dev->type != ARPHRD_CAN || !can_get_ml_priv(dev))
                        return -ENODEV;

                if (!net_eq(net, dev_net(dev)))
                        return -ENODEV;
        }

        entry = kmem_cache_alloc(rcv_cache, GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        spin_lock_bh(&net->can.rcvlists_lock);

        /* may rewrite can_id and mask to their normalized form */
        lists = can_dev_rcv_lists_find(net, dev);
        head = can_rcv_list_find(&can_id, &mask, lists);

        /* new receiver: (dev,canid,mask) -> (func,data) */
        entry->can_id = can_id;
        entry->mask = mask;
        entry->matches = 0;
        entry->func = func;
        entry->data = data;
        entry->ident = ident;
        entry->sk = sk;

        /* publish the fully initialized entry */
        hlist_add_head_rcu(&entry->list, head);
        lists->entries++;

        /* keep track of the current and the peak number of subscriptions */
        stats->rcv_entries++;
        if (stats->rcv_entries > stats->rcv_entries_max)
                stats->rcv_entries_max = stats->rcv_entries;

        spin_unlock_bh(&net->can.rcvlists_lock);

        return 0;
}
EXPORT_SYMBOL(can_rx_register);

/* can_rx_delete_receiver - rcu callback that frees a single receiver entry
 * and drops the sock reference taken for it in can_rx_unregister()
 */
static void can_rx_delete_receiver(struct rcu_head *rp)
{
        struct receiver *entry = container_of(rp, struct receiver, rcu);
        /* fetch the sock before the entry goes back to the cache */
        struct sock *owner = entry->sk;

        kmem_cache_free(rcv_cache, entry);

        if (!owner)
                return;

        sock_put(owner);
}

/**
 * can_rx_unregister - unsubscribe CAN frames from a specific interface
 * @net: the applicable net namespace
 * @dev: pointer to netdevice (NULL => unsubscribe from 'all' CAN devices list)
 * @can_id: CAN identifier
 * @mask: CAN mask
 * @func: callback function on filter match
 * @data: returned parameter for callback function
 *
 * Description:
 *  Removes the subscription entry that was registered with exactly these
 *  values. The entry itself is freed from an RCU callback.
 */
void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id,
                       canid_t mask, void (*func)(struct sk_buff *, void *),
                       void *data)
{
        struct can_rcv_lists_stats *stats = net->can.rcv_lists_stats;
        struct can_dev_rcv_lists *lists;
        struct hlist_head *head;
        struct receiver *rcv = NULL;

        if (dev) {
                if (dev->type != ARPHRD_CAN)
                        return;

                if (!net_eq(net, dev_net(dev)))
                        return;
        }

        spin_lock_bh(&net->can.rcvlists_lock);

        lists = can_dev_rcv_lists_find(net, dev);
        head = can_rcv_list_find(&can_id, &mask, lists);

        /* Look up the entry to delete. It is expected to be there, as only
         * previously registered receivers may be unregistered. The iterator
         * ends up NULL when the list holds no matching entry.
         */
        hlist_for_each_entry_rcu(rcv, head, list) {
                if (rcv->can_id != can_id || rcv->mask != mask)
                        continue;

                if (rcv->func == func && rcv->data == data)
                        break;
        }

        /* A missing entry hints at a bug in a CAN protocol implementation
         * using af_can.c. It may however also show up when a socket is
         * closed while the notifier for removing the CAN netdev is running,
         * so this is only worth a warning.
         */
        if (!rcv) {
                pr_warn("can: receive list entry not found for dev %s, id %03X, mask %03X\n",
                        DNAME(dev), can_id, mask);
                spin_unlock_bh(&net->can.rcvlists_lock);
                return;
        }

        hlist_del_rcu(&rcv->list);
        lists->entries--;

        if (stats->rcv_entries > 0)
                stats->rcv_entries--;

        spin_unlock_bh(&net->can.rcvlists_lock);

        /* schedule the receiver item for deletion; the sock reference taken
         * here is dropped again by the rcu callback
         */
        if (rcv->sk)
                sock_hold(rcv->sk);
        call_rcu(&rcv->rcu, can_rx_delete_receiver);
}
EXPORT_SYMBOL(can_rx_unregister);

/* Hand @skb to the receiver's callback and count the match. */
static inline void deliver(struct sk_buff *skb, struct receiver *rcv)
{
        void (*callback)(struct sk_buff *, void *) = rcv->func;

        callback(skb, rcv->data);
        rcv->matches++;
}

static int can_rcv_filter(struct can_dev_rcv_lists *dev_rcv_lists, struct sk_buff *skb)
{
        struct receiver *rcv;
        int matches = 0;
        struct can_frame *cf = (struct can_frame *)skb->data;
        canid_t can_id = cf->can_id;

        if (dev_rcv_lists->entries == 0)
                return 0;

        if (can_id & CAN_ERR_FLAG) {
                /* check for error message frame entries only */
                hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ERR], list) {
                        if (can_id & rcv->mask) {
                                deliver(skb, rcv);
                                matches++;
                        }
                }
                return matches;
        }

        /* check for unfiltered entries */
        hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ALL], list) {
                deliver(skb, rcv);
                matches++;
        }

        /* check for can_id/mask entries */
        hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_FIL], list) {
                if ((can_id & rcv->mask) == rcv->can_id) {
                        deliver(skb, rcv);
                        matches++;
                }
        }

        /* check for inverted can_id/mask entries */
        hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_INV], list) {
                if ((can_id & rcv->mask) != rcv->can_id) {
                        deliver(skb, rcv);
                        matches++;
                }
        }

        /* check filterlists for single non-RTR can_ids */
        if (can_id & CAN_RTR_FLAG)
                return matches;

        if (can_id & CAN_EFF_FLAG) {
                hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_eff[effhash(can_id)], list) {
                        if (rcv->can_id == can_id) {
                                deliver(skb, rcv);
                                matches++;
                        }
                }
        } else {
                can_id &= CAN_SFF_MASK;
                hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_sff[can_id], list) {
                        deliver(skb, rcv);
                        matches++;
                }
        }

        return matches;
}

/*
 * Common receive path for all CAN frame types: account the frame, make sure
 * the skb carries a non-zero identifier, deliver it to the matching
 * receivers of the 'all devices' lists and of @dev, then release the skb.
 */
static void can_receive(struct sk_buff *skb, struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct can_pkg_stats *stats = net->can.pkg_stats;
        int hits;

        /* update statistics */
        stats->rx_frames++;
        stats->rx_frames_delta++;

        /* create non-zero unique skb identifier together with *skb;
         * retry in case the counter just handed out zero
         */
        while (can_skb_prv(skb)->skbcnt == 0)
                can_skb_prv(skb)->skbcnt = atomic_inc_return(&skbcounter);

        rcu_read_lock();

        /* sockets listening on all devices first ... */
        hits = can_rcv_filter(net->can.rx_alldev_list, skb);

        /* ... then the receive lists of this very device */
        hits += can_rcv_filter(can_dev_rcv_lists_find(net, dev), skb);

        rcu_read_unlock();

        /* consume the skbuff allocated by the netdevice driver */
        consume_skb(skb);

        if (hits <= 0)
                return;

        stats->matches++;
        stats->matches_delta++;
}

static int can_rcv(struct sk_buff *skb, struct net_device *dev,
                   struct packet_type *pt, struct net_device *orig_dev)
{
        if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_can_skb(skb))) {
                pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n",
                             dev->type, skb->len);

                kfree_skb(skb);
                return NET_RX_DROP;
        }

        can_receive(skb, dev);
        return NET_RX_SUCCESS;
}

static int canfd_rcv(struct sk_buff *skb, struct net_device *dev,
                     struct packet_type *pt, struct net_device *orig_dev)
{
        if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canfd_skb(skb))) {
                pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n",
                             dev->type, skb->len);

                kfree_skb(skb);
                return NET_RX_DROP;
        }

        can_receive(skb, dev);
        return NET_RX_SUCCESS;
}

static int canxl_rcv(struct sk_buff *skb, struct net_device *dev,
                     struct packet_type *pt, struct net_device *orig_dev)
{
        if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canxl_skb(skb))) {
                pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n",
                             dev->type, skb->len);

                kfree_skb(skb);
                return NET_RX_DROP;
        }

        can_receive(skb, dev);
        return NET_RX_SUCCESS;
}

/* af_can protocol functions */

/**
 * can_proto_register - register CAN transport protocol
 * @cp: pointer to CAN protocol structure
 *
 * Registers @cp->prot via proto_register() and then publishes @cp in the
 * proto_tab[] slot selected by @cp->protocol. If the slot is already
 * occupied, the proto_register() step is undone again.
 *
 * Return:
 *  0 on success
 *  -EINVAL invalid (out of range) protocol number
 *  -EBUSY  protocol already in use
 *  negative error code propagated from proto_register() if that fails
 */
int can_proto_register(const struct can_proto *cp)
{
        const int idx = cp->protocol;
        int ret;

        if (idx < 0 || idx >= CAN_NPROTO) {
                pr_err("can: protocol number %d out of range\n", idx);
                return -EINVAL;
        }

        ret = proto_register(cp->prot, 0);
        if (ret < 0)
                return ret;

        /* Slot check and slot update happen under proto_tab_lock. */
        mutex_lock(&proto_tab_lock);

        if (!rcu_access_pointer(proto_tab[idx])) {
                RCU_INIT_POINTER(proto_tab[idx], cp);
                mutex_unlock(&proto_tab_lock);
                return ret;
        }

        pr_err("can: protocol %d already registered\n", idx);
        mutex_unlock(&proto_tab_lock);

        /* Slot was taken: roll back the proto_register() from above. */
        proto_unregister(cp->prot);

        return -EBUSY;
}
EXPORT_SYMBOL(can_proto_register);

/**
 * can_proto_unregister - unregister CAN transport protocol
 * @cp: pointer to CAN protocol structure
 *
 * Clears the proto_tab[] slot selected by @cp->protocol, waits for an RCU
 * grace period and then calls proto_unregister() on @cp->prot. The slot
 * must currently hold @cp, otherwise BUG_ON() fires.
 */
void can_proto_unregister(const struct can_proto *cp)
{
        const int idx = cp->protocol;

        /* Slot check and slot update happen under proto_tab_lock. */
        mutex_lock(&proto_tab_lock);
        BUG_ON(rcu_access_pointer(proto_tab[idx]) != cp);
        RCU_INIT_POINTER(proto_tab[idx], NULL);
        mutex_unlock(&proto_tab_lock);

        /* Wait for a grace period before the proto itself is unregistered. */
        synchronize_rcu();

        proto_unregister(cp->prot);
}
EXPORT_SYMBOL(can_proto_unregister);

/*
 * Per-namespace init callback (see can_pernet_ops).
 *
 * Initialises net->can.rcvlists_lock and allocates the zeroed
 * rx_alldev_list, pkg_stats and rcv_lists_stats objects of @net. When
 * CONFIG_PROC_FS is enabled it also arms the statistics timer (only if
 * stats_timer is set), records the init time and calls can_init_proc().
 *
 * Returns 0 on success or -ENOMEM if an allocation fails; allocations made
 * before the failing one are freed again.
 */
static int can_pernet_init(struct net *net)
{
        spin_lock_init(&net->can.rcvlists_lock);

        net->can.rx_alldev_list = kzalloc(sizeof(*net->can.rx_alldev_list),
                                          GFP_KERNEL);
        if (!net->can.rx_alldev_list)
                return -ENOMEM;

        net->can.pkg_stats = kzalloc(sizeof(*net->can.pkg_stats), GFP_KERNEL);
        if (!net->can.pkg_stats)
                goto err_free_rx_alldev_list;

        net->can.rcv_lists_stats = kzalloc(sizeof(*net->can.rcv_lists_stats),
                                           GFP_KERNEL);
        if (!net->can.rcv_lists_stats)
                goto err_free_pkg_stats;

        /* Everything below is procfs/statistics related only. */
        if (!IS_ENABLED(CONFIG_PROC_FS))
                return 0;

        if (stats_timer) {
                /* the statistics are updated every second (timer triggered) */
                timer_setup(&net->can.stattimer, can_stat_update, 0);
                mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ));
        }
        net->can.pkg_stats->jiffies_init = jiffies;
        can_init_proc(net);

        return 0;

 err_free_pkg_stats:
        kfree(net->can.pkg_stats);
 err_free_rx_alldev_list:
        kfree(net->can.rx_alldev_list);
        return -ENOMEM;
}

/*
 * Per-namespace exit callback (see can_pernet_ops).
 *
 * With CONFIG_PROC_FS enabled, removes the proc entries and, if
 * stats_timer is set, synchronously deletes the statistics timer. Then
 * frees the three objects allocated by can_pernet_init().
 */
static void can_pernet_exit(struct net *net)
{
        if (IS_ENABLED(CONFIG_PROC_FS)) {
                can_remove_proc(net);

                if (stats_timer)
                        del_timer_sync(&net->can.stattimer);
        }

        /* Release in reverse order of allocation. */
        kfree(net->can.rcv_lists_stats);
        kfree(net->can.pkg_stats);
        kfree(net->can.rx_alldev_list);
}

/* af_can module init/exit functions */

/* Packet handler for ETH_P_CAN frames; added via dev_add_pack() in can_init(). */
static struct packet_type can_packet __read_mostly = {
        .type = cpu_to_be16(ETH_P_CAN),
        .func = can_rcv,
};

/* Packet handler for ETH_P_CANFD frames; added via dev_add_pack() in can_init(). */
static struct packet_type canfd_packet __read_mostly = {
        .type = cpu_to_be16(ETH_P_CANFD),
        .func = canfd_rcv,
};

/* Packet handler for ETH_P_CANXL frames; added via dev_add_pack() in can_init(). */
static struct packet_type canxl_packet __read_mostly = {
        .type = cpu_to_be16(ETH_P_CANXL),
        .func = canxl_rcv,
};

/* PF_CAN socket family descriptor; passed to sock_register() in can_init(). */
static const struct net_proto_family can_family_ops = {
        .family = PF_CAN,
        .create = can_create,
        .owner  = THIS_MODULE,
};

/*
 * Per-network-namespace init/exit callbacks; passed to
 * register_pernet_subsys() in can_init().
 */
static struct pernet_operations can_pernet_ops __read_mostly = {
        .init = can_pernet_init,
        .exit = can_pernet_exit,
};

/*
 * Module init: create the receiver slab cache, register the per-namespace
 * operations and the PF_CAN socket family, then add the three packet
 * handlers.
 *
 * Returns 0 on success, -ENOMEM if the slab cache cannot be created, or
 * the error returned by register_pernet_subsys() / sock_register(). On
 * failure the steps already completed are undone in reverse order.
 */
static __init int can_init(void)
{
        int ret;

        /* check for correct padding to be able to use the structs similarly */
        BUILD_BUG_ON(offsetof(struct can_frame, len) !=
                     offsetof(struct canfd_frame, len));
        BUILD_BUG_ON(offsetof(struct can_frame, len) !=
                     offsetof(struct canxl_frame, flags));
        BUILD_BUG_ON(offsetof(struct can_frame, data) !=
                     offsetof(struct canfd_frame, data));

        pr_info("can: controller area network core\n");

        rcv_cache = kmem_cache_create("can_receiver", sizeof(struct receiver),
                                      0, 0, NULL);
        if (!rcv_cache)
                return -ENOMEM;

        ret = register_pernet_subsys(&can_pernet_ops);
        if (ret)
                goto err_destroy_cache;

        /* protocol register */
        ret = sock_register(&can_family_ops);
        if (ret)
                goto err_unregister_pernet;

        dev_add_pack(&can_packet);
        dev_add_pack(&canfd_packet);
        dev_add_pack(&canxl_packet);

        return 0;

err_unregister_pernet:
        unregister_pernet_subsys(&can_pernet_ops);
err_destroy_cache:
        kmem_cache_destroy(rcv_cache);

        return ret;
}

/*
 * Module exit: undo can_init() in reverse order. The packet handlers are
 * removed first, then the PF_CAN socket family and the per-namespace
 * operations are unregistered, and finally the receiver slab cache is
 * destroyed once rcu_barrier() has returned.
 */
static __exit void can_exit(void)
{
        /* protocol unregister */
        dev_remove_pack(&canxl_packet);
        dev_remove_pack(&canfd_packet);
        dev_remove_pack(&can_packet);
        sock_unregister(PF_CAN);

        unregister_pernet_subsys(&can_pernet_ops);

        rcu_barrier(); /* Wait for completion of call_rcu()'s */

        kmem_cache_destroy(rcv_cache);
}

/* Hook can_init()/can_exit() up as the module's init and exit routines. */
module_init(can_init);
module_exit(can_exit);









































































































    2 
































































































































    1 










































    1 
















































































    2 




    3 
    4 
    3 








    1 

    2 









    2 



    2 
    2 



    3 

    2 


























    2 
    2 


    2 

    2 










    1 














































    1 









    1 


    1 
    1 




    4 









































    2 

    1 





















    1 






    4 












    2 
    1 

    3 

    1 


    4 

    3 

    1 

    4 




    2 




    3 




    3 
    4 


















    3 





    4 





































    4 






















    2 






    1 






























    3 


















    4 
































    4 


































    3 
    1 


    3 

    2 





    3 
















































































    2 










































































































































































































































































































































































































































































































































































































































































    1 










    1 





























    9 
















    1 


    8 






































   10 
























    1 









    2 







    1 






























    7 






    8 
















    9 
    9 
   10 






    1 

   10 

    9 

    9 
    2 
    9 

    8 
    1 
    9 
    1 
    9 


















    2 




    3 
    1 



    7 


    1 



    9 





    8 
    9 
    8 





    9 

    8 



    9 



















    8 


    8 


















































































































































































































    1 

















































































    2 


    2 


























    2 


























    2 



















    2 






    1 

    2 




















    2 

    2 
























    2 

    2 










    1 









    2 






































































    2 
    2 


    2 
    2 


















    1 
    1 






    1 


























    1 



    1 







    1 








































    1 







































    1 



    1 
    1 














    1 



    1 




    1 


    1 
















    2 




    1 
    1 



    1 
    2 
    2 



    2 


    1 
    1 

    1 
    2 




















































































































































































































































































































































    2 





    1 




    1 
    2 





















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dcache.c
 *
 * Complete reimplementation
 * (C) 1997 Thomas Schoebel-Theuer,
 * with heavy changes by Linus Torvalds
 */

/*
 * Notes on the allocation strategy:
 *
 * The dcache is a master of the icache - whenever a dcache entry
 * exists, the inode will always exist. "iput()" is done either when
 * the dcache entry is deleted or garbage collected.
 */

#include <linux/ratelimit.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/cache.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/seqlock.h>
#include <linux/memblock.h>
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
#include <linux/list_lru.h>
#include "internal.h"
#include "mount.h"

/*
 * Usage:
 * dcache->d_inode->i_lock protects:
 *   - i_dentry, d_u.d_alias, d_inode of aliases
 * dcache_hash_bucket lock protects:
 *   - the dcache hash table
 * s_roots bl list spinlock protects:
 *   - the s_roots list (see __d_drop)
 * dentry->d_sb->s_dentry_lru_lock protects:
 *   - the dcache lru lists and counters
 * d_lock protects:
 *   - d_flags
 *   - d_name
 *   - d_lru
 *   - d_count
 *   - d_unhashed()
 *   - d_parent and d_children
 *   - childrens' d_sib and d_parent
 *   - d_u.d_alias, d_inode
 *
 * Ordering:
 * dentry->d_inode->i_lock
 *   dentry->d_lock
 *     dentry->d_sb->s_dentry_lru_lock
 *     dcache_hash_bucket lock
 *     s_roots lock
 *
 * If there is an ancestor relationship:
 * dentry->d_parent->...->d_parent->d_lock
 *   ...
 *     dentry->d_parent->d_lock
 *       dentry->d_lock
 *
 * If no ancestor relationship:
 * arbitrary, since it's serialized on rename_lock
 */
/*
 * Defaults to 100 and is exported to GPL modules.
 * NOTE(review): presumably the backing variable of a "vfs_cache_pressure"
 * sysctl; neither the sysctl table entry nor the consumers are visible
 * here - confirm before relying on it.
 */
int sysctl_vfs_cache_pressure __read_mostly = 100;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);

/*
 * Seqlock named in the lock-ordering notes above: dentries without an
 * ancestor relationship are serialized on it.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);

EXPORT_SYMBOL(rename_lock);

/* Slab cache that dentries are returned to by __d_free()/__d_free_external(). */
static struct kmem_cache *dentry_cache __ro_after_init;

/* Exported constant qstrs for the names "", "/" and ".." (with their lengths). */
const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
const struct qstr slash_name = QSTR_INIT("/", 1);
EXPORT_SYMBOL(slash_name);
const struct qstr dotdot_name = QSTR_INIT("..", 2);
EXPORT_SYMBOL(dotdot_name);

/*
 * This is the single most critical data structure when it comes
 * to the dcache: the hashtable for lookups. Somebody should try
 * to make this good - I've just made it work.
 *
 * This hash-function tries to avoid losing too many bits of hash
 * information, yet avoid using a prime hash-size or similar.
 */

static unsigned int d_hash_shift __ro_after_init;

static struct hlist_bl_head *dentry_hashtable __ro_after_init;

/*
 * Map a name hash to its bucket in dentry_hashtable: the bucket index is
 * the hash shifted right by d_hash_shift.
 */
static inline struct hlist_bl_head *d_hash(unsigned int hash)
{
        const unsigned int bucket = hash >> d_hash_shift;

        return &dentry_hashtable[bucket];
}

#define IN_LOOKUP_SHIFT 10
static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT];

/*
 * Pick the in_lookup_hashtable bucket for a (parent, name hash) pair.
 * The parent's address, divided by L1_CACHE_BYTES, is added to the name
 * hash and the sum is reduced to IN_LOOKUP_SHIFT bits with hash_32().
 */
static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent,
                                        unsigned int hash)
{
        const unsigned long parent_bits = (unsigned long) parent / L1_CACHE_BYTES;

        hash += parent_bits;

        return &in_lookup_hashtable[hash_32(hash, IN_LOOKUP_SHIFT)];
}

/*
 * Dentry statistics record. One instance (dentry_stat) is exposed through
 * the "dentry-state" sysctl as six consecutive longs, so the number and
 * order of the fields is part of that output format. nr_dentry, nr_unused
 * and nr_negative are refreshed by proc_nr_dentry() on each access.
 */
struct dentry_stat_t {
        long nr_dentry;
        long nr_unused;
        long age_limit;                /* age in seconds */
        long want_pages;        /* pages requested by system */
        long nr_negative;        /* # of unused negative dentries */
        long dummy;                /* Reserved for future use */
};

/*
 * Per-CPU dentry counters. They are summed over all possible CPUs by
 * get_nr_dentry(), get_nr_dentry_unused() and get_nr_dentry_negative().
 */
static DEFINE_PER_CPU(long, nr_dentry);
static DEFINE_PER_CPU(long, nr_dentry_unused);
static DEFINE_PER_CPU(long, nr_dentry_negative);

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
/*
 * Statistics gathering.
 *
 * Backing storage for the "dentry-state" sysctl. Only age_limit has a
 * non-zero initial value; the nr_* fields are filled in by
 * proc_nr_dentry() when the sysctl is accessed.
 */
static struct dentry_stat_t dentry_stat = {
        .age_limit = 45,
};

/*
 * Here we resort to our own counters instead of using generic per-cpu counters
 * for consistency with what the vfs inode code does. We are expected to harvest
 * better code and performance by having our own specialized counters.
 *
 * Please note that the loop is done over all possible CPUs, not over all online
 * CPUs. The reason for this is that we don't want to play games with CPUs going
 * on and off. If one of them goes off, we will just keep their counters.
 *
 * glommer: See cffbc8a for details, and if you ever intend to change this,
 * please update all vfs counters to match.
 */
static long get_nr_dentry(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_dentry, i);
        return sum < 0 ? 0 : sum;
}

static long get_nr_dentry_unused(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_dentry_unused, i);
        return sum < 0 ? 0 : sum;
}

static long get_nr_dentry_negative(void)
{
        int i;
        long sum = 0;

        for_each_possible_cpu(i)
                sum += per_cpu(nr_dentry_negative, i);
        return sum < 0 ? 0 : sum;
}

/*
 * proc handler of the "dentry-state" sysctl.
 *
 * Refreshes the nr_dentry, nr_unused and nr_negative fields of dentry_stat
 * from the per-CPU counters, then hands all arguments unchanged to
 * proc_doulongvec_minmax() and returns its result.
 */
static int proc_nr_dentry(struct ctl_table *table, int write, void *buffer,
                          size_t *lenp, loff_t *ppos)
{
        dentry_stat.nr_dentry = get_nr_dentry();
        dentry_stat.nr_unused = get_nr_dentry_unused();
        dentry_stat.nr_negative = get_nr_dentry_negative();
        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

/*
 * Sysctl table registered under "fs" by init_fs_dcache_sysctls(). Its
 * single entry, "dentry-state", has mode 0444 and covers dentry_stat as
 * six longs, served through proc_nr_dentry().
 */
static struct ctl_table fs_dcache_sysctls[] = {
        {
                .procname        = "dentry-state",
                .data                = &dentry_stat,
                .maxlen                = 6*sizeof(long),
                .mode                = 0444,
                .proc_handler        = proc_nr_dentry,
        },
};

/*
 * Register fs_dcache_sysctls under the "fs" sysctl directory. Run as an
 * fs_initcall; always returns 0.
 */
static int __init init_fs_dcache_sysctls(void)
{
        register_sysctl_init("fs", fs_dcache_sysctls);
        return 0;
}
fs_initcall(init_fs_dcache_sysctls);
#endif

/*
 * Compare 2 name strings, return 0 if they match, otherwise non-zero.
 * The strings are both count bytes long, and count is non-zero.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

#include <asm/word-at-a-time.h>
/*
 * NOTE! 'cs' and 'scount' come from a dentry, so it has an
 * aligned allocation for this particular component. We don't
 * strictly need the load_unaligned_zeropad() safety, but it
 * doesn't hurt either.
 *
 * In contrast, 'ct' and 'tcount' can be from a pathname, and do
 * need the careful unaligned handling.
 */
/*
 * Word-at-a-time comparison of the first @tcount bytes of @cs and @ct.
 * Returns 0 if they are equal and 1 otherwise.
 */
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
        unsigned long a,b,mask;

        for (;;) {
                /*
                 * Both words are loaded before the remaining length is
                 * checked, so a and b are valid for the tail comparison
                 * after the break below.
                 */
                a = read_word_at_a_time(cs);
                b = load_unaligned_zeropad(ct);
                /* Less than a full word left: compare it under a mask. */
                if (tcount < sizeof(unsigned long))
                        break;
                if (unlikely(a != b))
                        return 1;
                cs += sizeof(unsigned long);
                ct += sizeof(unsigned long);
                tcount -= sizeof(unsigned long);
                /* Length was a whole number of words and all matched. */
                if (!tcount)
                        return 0;
        }
        /* Only the remaining tcount bytes of the last word take part. */
        mask = bytemask_from_count(tcount);
        return unlikely(!!((a ^ b) & mask));
}

#else

/*
 * Byte-wise comparison of the first @tcount bytes of @cs and @ct.
 * Returns 0 if they are equal and 1 at the first difference.
 * @tcount must be non-zero: the first byte pair is always compared.
 */
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
        for (;;) {
                if (*cs++ != *ct++)
                        return 1;
                if (--tcount == 0)
                        return 0;
        }
}

#endif

/*
 * Compare the name of @dentry against the @tcount bytes at @ct.
 * Returns 0 on match, non-zero otherwise.
 */
static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
{
        /*
         * An RCU walk can race with rename, so the name pointer is
         * fetched exactly once with READ_ONCE().
         *
         * The length may not have been loaded atomically with respect
         * to that pointer, but this is harmless: the RCU walk validates
         * the sequence count later and will notice.  The buffer cannot
         * be overrun either, since the pointer itself was read
         * atomically and a dentry name is always NUL-terminated.
         *
         * So with a stale 'len' the comparison simply fails early:
         * the ct/tcount data never contains a NUL.
         */
        return dentry_string_cmp(READ_ONCE(dentry->d_name.name), ct, tcount);
}

/*
 * Header placed in front of a dentry name that is not stored in the
 * dentry's inline ->d_iname buffer.  ->d_name.name points at name[], and
 * external_name() maps that pointer back to this structure.
 */
struct external_name {
        union {
                atomic_t count;                /* reference count, dropped with atomic_dec_and_test() */
                struct rcu_head head;        /* passed to kfree_rcu() by release_dentry_name_snapshot() */
        } u;
        unsigned char name[];                /* the name bytes (flexible array member) */
};

/*
 * Map @dentry's name pointer back to the struct external_name embedding it.
 * Only meaningful when dname_external(@dentry) is true; the callers visible
 * here check that first.
 */
static inline struct external_name *external_name(struct dentry *dentry)
{
        return container_of(dentry->d_name.name, struct external_name, name[0]);
}

/* RCU callback: hand the dentry that embeds @head back to dentry_cache. */
static void __d_free(struct rcu_head *head)
{
        kmem_cache_free(dentry_cache,
                        container_of(head, struct dentry, d_u.d_rcu));
}

/*
 * RCU callback for a dentry with an out-of-line name: free the external
 * name first, then hand the dentry back to dentry_cache.
 */
static void __d_free_external(struct rcu_head *head)
{
        struct dentry *victim = container_of(head, struct dentry, d_u.d_rcu);
        struct external_name *ext = external_name(victim);

        kfree(ext);
        kmem_cache_free(dentry_cache, victim);
}

/* Non-zero when @dentry's name lives outside its inline ->d_iname buffer. */
static inline int dname_external(const struct dentry *dentry)
{
        return dentry->d_name.name != dentry->d_iname;
}

/*
 * Capture @dentry's current name into @name under ->d_lock.
 *
 * An inline name is copied (including the terminating NUL) into
 * @name->inline_name; an external name is shared by bumping its reference
 * count.  Pair with release_dentry_name_snapshot().
 */
void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        name->name = dentry->d_name;
        if (likely(!dname_external(dentry))) {
                /* short name: take a private copy and point at it */
                memcpy(name->inline_name, dentry->d_iname,
                       dentry->d_name.len + 1);
                name->name.name = name->inline_name;
        } else {
                /* long name: pin the shared external buffer */
                atomic_inc(&external_name(dentry)->u.count);
        }
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(take_dentry_name_snapshot);

/*
 * Drop a snapshot taken by take_dentry_name_snapshot().  Nothing to do for
 * an inline copy; for an external name the reference is dropped and the
 * buffer is freed via RCU once the last reference is gone.
 */
void release_dentry_name_snapshot(struct name_snapshot *name)
{
        struct external_name *ext;

        if (likely(name->name.name == name->inline_name))
                return;

        ext = container_of(name->name.name, struct external_name, name[0]);
        if (unlikely(atomic_dec_and_test(&ext->u.count)))
                kfree_rcu(ext, u.head);
}
EXPORT_SYMBOL(release_dentry_name_snapshot);

/*
 * Attach @inode to @dentry and replace the entry-type bits of ->d_flags
 * with @type_flags.  ->d_inode is written first; the flags are then
 * published with a release store so the inode pointer is visible before
 * the new type bits.
 */
static inline void __d_set_inode_and_type(struct dentry *dentry,
                                          struct inode *inode,
                                          unsigned type_flags)
{
        unsigned updated;

        dentry->d_inode = inode;
        updated = (READ_ONCE(dentry->d_flags) & ~DCACHE_ENTRY_TYPE) | type_flags;
        smp_store_release(&dentry->d_flags, updated);
}

static inline void __d_clear_type_and_inode(struct dentry *dentry)
{
        unsigned flags = READ_ONCE(dentry->d_flags);

        flags &= ~DCACHE_ENTRY_TYPE;
        WRITE_ONCE(dentry->d_flags, flags);
        dentry->d_inode = NULL;
        if (flags & DCACHE_LRU_LIST)
                this_cpu_inc(nr_dentry_negative);
}

/*
 * Release the memory of @dentry.  Warns if the dentry is still on an
 * inode's alias list.
 *
 * When the name is external and ours was the last reference to it, name
 * and dentry are freed together after a grace period.  Otherwise only the
 * dentry is freed: immediately if it is flagged DCACHE_NORCU, via
 * call_rcu() if not.
 */
static void dentry_free(struct dentry *dentry)
{
        WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));

        if (unlikely(dname_external(dentry))) {
                struct external_name *ext = external_name(dentry);

                if (likely(atomic_dec_and_test(&ext->u.count))) {
                        call_rcu(&dentry->d_u.d_rcu, __d_free_external);
                        return;
                }
        }

        if (!(dentry->d_flags & DCACHE_NORCU)) {
                call_rcu(&dentry->d_u.d_rcu, __d_free);
                return;
        }
        /* dentry was never visible to RCU, so an immediate free is OK */
        __d_free(&dentry->d_u.d_rcu);
}

/*
 * Release the dentry's inode, using the filesystem
 * d_iput() operation if defined.
 */
static void dentry_unlink_inode(struct dentry * dentry)
        __releases(dentry->d_lock)
        __releases(dentry->d_inode->i_lock)
{
        struct inode *inode = dentry->d_inode;

        /*
         * Turn the dentry negative and take it off the inode's alias list
         * inside a ->d_seq write section.
         */
        raw_write_seqcount_begin(&dentry->d_seq);
        __d_clear_type_and_inode(dentry);
        hlist_del_init(&dentry->d_u.d_alias);
        raw_write_seqcount_end(&dentry->d_seq);
        /* Both locks named in the __releases() annotations are dropped here. */
        spin_unlock(&dentry->d_lock);
        spin_unlock(&inode->i_lock);
        /* No links left on the inode: notify fsnotify before the inode is put. */
        if (!inode->i_nlink)
                fsnotify_inoderemove(inode);
        /* A filesystem-supplied ->d_iput() is called instead of plain iput(). */
        if (dentry->d_op && dentry->d_op->d_iput)
                dentry->d_op->d_iput(dentry, inode);
        else
                iput(inode);
}

/*
 * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
 * is in use - which includes both the "real" per-superblock
 * LRU list _and_ the DCACHE_SHRINK_LIST use.
 *
 * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
 * on the shrink list (ie not on the superblock LRU list).
 *
 * The per-cpu "nr_dentry_unused" counters are updated with
 * the DCACHE_LRU_LIST bit.
 *
 * The per-cpu "nr_dentry_negative" counters are only updated
 * when deleted from or added to the per-superblock LRU list, not
 * from/to the shrink list. That is to avoid an unneeded dec/inc
 * pair when moving from LRU to shrink list in select_collect().
 *
 * These helper functions make sure we always follow the
 * rules. d_lock must be held by the caller.
 */
/* Warn (once) when the LRU/shrink-list bits of (dentry)->d_flags are not exactly (x). */
#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
/*
 * Put @dentry on its superblock's dentry LRU.  The dentry must be on
 * neither the LRU nor a shrink list.  Updates the unused counter, and the
 * negative counter if the dentry is negative; warns if the list insertion
 * reports failure.
 */
static void d_lru_add(struct dentry *dentry)
{
        struct list_lru *lru = &dentry->d_sb->s_dentry_lru;

        D_FLAG_VERIFY(dentry, 0);
        dentry->d_flags |= DCACHE_LRU_LIST;
        this_cpu_inc(nr_dentry_unused);
        if (d_is_negative(dentry))
                this_cpu_inc(nr_dentry_negative);
        WARN_ON_ONCE(!list_lru_add_obj(lru, &dentry->d_lru));
}

/*
 * Take @dentry off its superblock's dentry LRU.  The dentry must be on the
 * LRU and not on a shrink list.  Updates the unused counter, and the
 * negative counter if the dentry is negative; warns if the list removal
 * reports failure.
 */
static void d_lru_del(struct dentry *dentry)
{
        struct list_lru *lru = &dentry->d_sb->s_dentry_lru;

        D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
        dentry->d_flags &= ~DCACHE_LRU_LIST;
        this_cpu_dec(nr_dentry_unused);
        if (d_is_negative(dentry))
                this_cpu_dec(nr_dentry_negative);
        WARN_ON_ONCE(!list_lru_del_obj(lru, &dentry->d_lru));
}

/*
 * Take @dentry off a shrink list: unlink ->d_lru, clear both list flags
 * and drop it from the nr_dentry_unused count.  nr_dentry_negative is not
 * touched here.
 */
static void d_shrink_del(struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
        list_del_init(&dentry->d_lru);
        dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
        this_cpu_dec(nr_dentry_unused);
}

/*
 * Put @dentry, which must currently be on no list, onto the shrink list
 * @list: link ->d_lru, set both list flags and count it in
 * nr_dentry_unused.  nr_dentry_negative is not touched here.
 */
static void d_shrink_add(struct dentry *dentry, struct list_head *list)
{
        D_FLAG_VERIFY(dentry, 0);
        list_add(&dentry->d_lru, list);
        dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
        this_cpu_inc(nr_dentry_unused);
}

/*
 * These can only be called under the global LRU lock, ie during the
 * callback for freeing the LRU list. "isolate" removes it from the
 * LRU lists entirely, while shrink_move moves it to the indicated
 * private list.
 */
static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
{
        /* Must be on the LRU proper, not on a shrink list. */
        D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
        dentry->d_flags &= ~DCACHE_LRU_LIST;
        /* Same counter updates as d_lru_del(). */
        this_cpu_dec(nr_dentry_unused);
        if (d_is_negative(dentry))
                this_cpu_dec(nr_dentry_negative);
        /* Unlink through the list_lru helper for the per-node list we were handed. */
        list_lru_isolate(lru, &dentry->d_lru);
}

/*
 * Move @dentry from the LRU @lru to the private shrink list @list.
 * DCACHE_LRU_LIST stays set and nr_dentry_unused is unchanged (->d_lru is
 * still in use); only DCACHE_SHRINK_LIST is added, and a negative dentry
 * stops being counted in nr_dentry_negative.
 */
static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
                              struct list_head *list)
{
        D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
        dentry->d_flags |= DCACHE_SHRINK_LIST;
        if (d_is_negative(dentry))
                this_cpu_dec(nr_dentry_negative);
        list_lru_isolate_move(lru, &dentry->d_lru, list);
}

/*
 * Unlink @dentry from its hash chain under that chain's bit lock.
 * ->d_hash.pprev is not reset here, so the dentry does not yet look
 * "unhashed"; __d_drop() takes care of that.
 */
static void ___d_drop(struct dentry *dentry)
{
        struct hlist_bl_head *chain;

        /*
         * Hashed dentries normally live on the dentry hashtable; the
         * exception is those newly allocated by d_obtain_root, which
         * are always IS_ROOT and sit on the superblock's ->s_roots.
         */
        chain = unlikely(IS_ROOT(dentry)) ? &dentry->d_sb->s_roots
                                          : d_hash(dentry->d_name.hash);

        hlist_bl_lock(chain);
        __hlist_bl_del(&dentry->d_hash);
        hlist_bl_unlock(chain);
}

/*
 * Unhash @dentry if it is currently hashed: remove it from its hash chain,
 * mark it unhashed by clearing ->d_hash.pprev and invalidate ->d_seq.
 * Does nothing for an already unhashed dentry.
 */
void __d_drop(struct dentry *dentry)
{
        if (d_unhashed(dentry))
                return;

        ___d_drop(dentry);
        dentry->d_hash.pprev = NULL;
        write_seqcount_invalidate(&dentry->d_seq);
}
EXPORT_SYMBOL(__d_drop);

/**
 * d_drop - drop a dentry
 * @dentry: dentry to drop
 *
 * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
 * be found through a VFS lookup any more. Note that this is different from
 * deleting the dentry - d_delete will try to mark the dentry negative if
 * possible, giving a successful _negative_ lookup, while d_drop will
 * just make the cache lookup fail.
 *
 * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
 * reason (NFS timeouts or autofs deletes).
 *
 * __d_drop requires dentry->d_lock
 *
 * ___d_drop doesn't mark dentry as "unhashed"
 * (dentry->d_hash.pprev will be LIST_POISON2, not NULL).
 */
void d_drop(struct dentry *dentry)
{
        /* __d_drop() requires ->d_lock; take it around the call. */
        spin_lock(&dentry->d_lock);
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(d_drop);

/*
 * Flag @dentry as killed and, if it is on a sibling list, unlink it while
 * leaving ->d_sib.next pointing at the next non-cursor sibling (or NULL).
 */
static inline void dentry_unlist(struct dentry *dentry)
{
        struct dentry *next;
        /*
         * Inform d_walk() and shrink_dentry_list() that we are no longer
         * attached to the dentry tree
         */
        dentry->d_flags |= DCACHE_DENTRY_KILLED;
        /* Not on any sibling list: nothing to unlink. */
        if (unlikely(hlist_unhashed(&dentry->d_sib)))
                return;
        __hlist_del(&dentry->d_sib);
        /*
         * Cursors can move around the list of children.  While we'd been
         * a normal list member, it didn't matter - ->d_sib.next would've
         * been updated.  However, from now on it won't be and for the
         * things like d_walk() it might end up with a nasty surprise.
         * Normally d_walk() doesn't care about cursors moving around -
         * ->d_lock on parent prevents that and since a cursor has no children
         * of its own, we get through it without ever unlocking the parent.
         * There is one exception, though - if we ascend from a child that
         * gets killed as soon as we unlock it, the next sibling is found
         * using the value left in its ->d_sib.next.  And if _that_
         * pointed to a cursor, and cursor got moved (e.g. by lseek())
         * before d_walk() regains parent->d_lock, we'll end up skipping
         * everything the cursor had been moved past.
         *
         * Solution: make sure that the pointer left behind in ->d_sib.next
         * points to something that won't be moving around.  I.e. skip the
         * cursors.
         */
        while (dentry->d_sib.next) {
                next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib);
                if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR)))
                        break;
                dentry->d_sib.next = next->d_sib.next;
        }
}

/*
 * Tear down @dentry.  Entered with ->d_lock held (and, for a positive
 * dentry, the inode's ->i_lock, which dentry_unlink_inode() releases).
 *
 * Returns the parent with its ->d_lock held when decrementing the parent's
 * refcount brought it to zero; returns NULL when there is no parent or the
 * parent is still referenced (its lock is dropped in that case).
 */
static struct dentry *__dentry_kill(struct dentry *dentry)
{
        struct dentry *parent = NULL;
        bool can_free = true;

        /*
         * The dentry is now unrecoverably dead to the world.
         */
        lockref_mark_dead(&dentry->d_lockref);

        /*
         * inform the fs via d_prune that this dentry is about to be
         * unhashed and destroyed.
         */
        if (dentry->d_flags & DCACHE_OP_PRUNE)
                dentry->d_op->d_prune(dentry);

        /*
         * Take it off the superblock LRU; a dentry on a shrink list is
         * left where it is (see the DCACHE_SHRINK_LIST check below).
         */
        if (dentry->d_flags & DCACHE_LRU_LIST) {
                if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
                        d_lru_del(dentry);
        }
        /* if it was on the hash then remove it */
        __d_drop(dentry);
        /* Either branch releases ->d_lock. */
        if (dentry->d_inode)
                dentry_unlink_inode(dentry);
        else
                spin_unlock(&dentry->d_lock);
        this_cpu_dec(nr_dentry);
        if (dentry->d_op && dentry->d_op->d_release)
                dentry->d_op->d_release(dentry);

        cond_resched();
        /* now that it's negative, ->d_parent is stable */
        if (!IS_ROOT(dentry)) {
                parent = dentry->d_parent;
                spin_lock(&parent->d_lock);
        }
        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
        dentry_unlist(dentry);
        /*
         * Still on a shrink list: don't free it here.  shrink_dentry_list()
         * frees it when it finds DCACHE_DENTRY_KILLED set.
         */
        if (dentry->d_flags & DCACHE_SHRINK_LIST)
                can_free = false;
        spin_unlock(&dentry->d_lock);
        if (likely(can_free))
                dentry_free(dentry);
        /* Parent still has other references: unlock it and stop here. */
        if (parent && --parent->d_lockref.count) {
                spin_unlock(&parent->d_lock);
                return NULL;
        }
        return parent;
}

/*
 * Lock a dentry for feeding it to __dentry_kill().
 * Called under rcu_read_lock() and dentry->d_lock; the former
 * guarantees that nothing we access will be freed under us.
 * Note that dentry is *not* protected from concurrent dentry_kill(),
 * d_delete(), etc.
 *
 * Return false if dentry is busy.  Otherwise, return true and have
 * that dentry's inode locked.
 */

static bool lock_for_kill(struct dentry *dentry)
{
        struct inode *inode = dentry->d_inode;

        /* Non-zero count: dentry is busy (or already dead), leave it alone. */
        if (unlikely(dentry->d_lockref.count))
                return false;

        /* Negative dentry, or ->i_lock obtained without dropping ->d_lock. */
        if (!inode || likely(spin_trylock(&inode->i_lock)))
                return true;

        /*
         * Trylock failed: drop ->d_lock, take ->i_lock first and ->d_lock
         * after it, then check that ->d_inode did not change while
         * ->d_lock was not held.  Repeat with the new inode if it did;
         * stop if the dentry has become negative.
         */
        do {
                spin_unlock(&dentry->d_lock);
                spin_lock(&inode->i_lock);
                spin_lock(&dentry->d_lock);
                if (likely(inode == dentry->d_inode))
                        break;
                spin_unlock(&inode->i_lock);
                inode = dentry->d_inode;
        } while (inode);
        /* ->d_lock was dropped above, so the refcount has to be rechecked. */
        if (likely(!dentry->d_lockref.count))
                return true;
        if (inode)
                spin_unlock(&inode->i_lock);
        return false;
}

/*
 * Decide if dentry is worth retaining.  Usually this is called with dentry
 * locked; if not locked, we are more limited and might not be able to tell
 * without a lock.  False in this case means "punt to locked path and recheck".
 *
 * In case we aren't locked, these predicates are not "stable". However, it is
 * sufficient that at some point after we dropped the reference the dentry was
 * hashed and the flags had the proper value. Other dentry users may have
 * re-gotten a reference to the dentry and change that, but our work is done -
 * we can leave the dentry around with a zero refcount.
 */
static inline bool retain_dentry(struct dentry *dentry, bool locked)
{
        unsigned int d_flags;

        /*
         * @locked tells us whether the caller holds ->d_lock.  Returns true
         * when the dentry should simply be left in the cache.
         *
         * NOTE(review): the write side this barrier pairs with is not
         * visible in this part of the file.
         */
        smp_rmb();
        d_flags = READ_ONCE(dentry->d_flags);

        // Unreachable? Nobody would be able to look it up, no point retaining
        if (unlikely(d_unhashed(dentry)))
                return false;

        // Same if it's disconnected
        if (unlikely(d_flags & DCACHE_DISCONNECTED))
                return false;

        // ->d_delete() might tell us not to bother, but that requires
        // ->d_lock; can't decide without it
        if (unlikely(d_flags & DCACHE_OP_DELETE)) {
                if (!locked || dentry->d_op->d_delete(dentry))
                        return false;
        }

        // Explicitly told not to bother
        if (unlikely(d_flags & DCACHE_DONTCACHE))
                return false;

        // At this point it looks like we ought to keep it.  We also might
        // need to do something - put it on LRU if it wasn't there already
        // and mark it referenced if it was on LRU, but not marked yet.
        // Unfortunately, both actions require ->d_lock, so in lockless
        // case we'd have to punt rather than doing those.
        if (unlikely(!(d_flags & DCACHE_LRU_LIST))) {
                if (!locked)
                        return false;
                d_lru_add(dentry);
        } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) {
                if (!locked)
                        return false;
                dentry->d_flags |= DCACHE_REFERENCED;
        }
        return true;
}

/*
 * Flag every dentry currently aliasing @inode with DCACHE_DONTCACHE and
 * set I_DONTCACHE on the inode itself.  The alias list is walked under
 * ->i_lock; each dentry's flags are changed under its own ->d_lock.
 */
void d_mark_dontcache(struct inode *inode)
{
        struct dentry *alias;

        spin_lock(&inode->i_lock);

        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                spin_lock(&alias->d_lock);
                alias->d_flags |= DCACHE_DONTCACHE;
                spin_unlock(&alias->d_lock);
        }

        inode->i_state |= I_DONTCACHE;

        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_mark_dontcache);

/*
 * Try to do a lockless dput(), and return whether that was successful.
 *
 * If unsuccessful, we return false, having already taken the dentry lock.
 * In that case refcount is guaranteed to be zero and we have already
 * decided that it's not worth keeping around.
 *
 * The caller needs to hold the RCU read lock, so that the dentry is
 * guaranteed to stay around even if the refcount goes down to zero!
 */
static inline bool fast_dput(struct dentry *dentry)
{
        int ret;

        /*
         * try to decrement the lockref optimistically.
         */
        ret = lockref_put_return(&dentry->d_lockref);

        /*
         * If the lockref_put_return() failed due to the lock being held
         * by somebody else, the fast path has failed. We will need to
         * get the lock, and then check the count again.
         */
        if (unlikely(ret < 0)) {
                spin_lock(&dentry->d_lock);
                /*
                 * A non-positive count here means there was no reference
                 * to drop: warn once and report "done" without touching
                 * the count.
                 */
                if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
                        spin_unlock(&dentry->d_lock);
                        return true;
                }
                dentry->d_lockref.count--;
                goto locked;
        }

        /*
         * If we weren't the last ref, we're done.
         */
        if (ret)
                return true;

        /*
         * Can we decide that decrement of refcount is all we needed without
         * taking the lock?  There's a very common case when it's all we need -
         * dentry looks like it ought to be retained and there's nothing else
         * to do.
         */
        if (retain_dentry(dentry, false))
                return true;

        /*
         * Either not worth retaining or we can't tell without the lock.
         * Get the lock, then.  We've already decremented the refcount to 0,
         * but we'll need to re-check the situation after getting the lock.
         */
        spin_lock(&dentry->d_lock);

        /*
         * Did somebody else grab a reference to it in the meantime, and
         * we're no longer the last user after all? Alternatively, somebody
         * else could have killed it and marked it dead. Either way, we
         * don't need to do anything else.
         */
locked:
        if (dentry->d_lockref.count || retain_dentry(dentry, true)) {
                spin_unlock(&dentry->d_lock);
                return true;
        }
        /* Returning false with ->d_lock still held: the caller takes over. */
        return false;
}


/* 
 * This is dput
 *
 * This is complicated by the fact that we do not want to put
 * dentries that are no longer on any hash chain on the unused
 * list: we'd much rather just get rid of them immediately.
 *
 * However, that implies that we have to traverse the dentry
 * tree upwards to the parents which might _also_ now be
 * scheduled for deletion (it may have been only waiting for
 * its last child to go away).
 *
 * This tail recursion is done by hand as we don't want to depend
 * on the compiler to always get this right (gcc generally doesn't).
 * Real recursion would eat up our stack space.
 */

/*
 * dput - release a dentry
 * @dentry: dentry to release 
 *
 * Release a dentry. This will drop the usage count and if appropriate
 * call the dentry unlink method as well as removing it from the queues and
 * releasing its resources. If the parent dentries were scheduled for release
 * they too may now get deleted.
 */
void dput(struct dentry *dentry)
{
        /* A NULL dentry is accepted and ignored. */
        if (!dentry)
                return;
        might_sleep();
        rcu_read_lock();
        /* Common case: fast_dput() dealt with everything and holds no lock. */
        if (likely(fast_dput(dentry))) {
                rcu_read_unlock();
                return;
        }
        /*
         * Slow path: fast_dput() returned with ->d_lock held.  Kill the
         * dentry, then continue with whatever parent __dentry_kill() hands
         * back (locked) until one is worth retaining, none is returned, or
         * lock_for_kill() refuses.
         */
        while (lock_for_kill(dentry)) {
                rcu_read_unlock();
                dentry = __dentry_kill(dentry);
                if (!dentry)
                        return;
                if (retain_dentry(dentry, true)) {
                        spin_unlock(&dentry->d_lock);
                        return;
                }
                rcu_read_lock();
        }
        /* lock_for_kill() refused: leave the dentry alone and just unlock. */
        rcu_read_unlock();
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(dput);

/*
 * Make sure @dentry is on a shrink list.  If it already is on one, leave
 * it there; otherwise take it off the LRU (when it is on it) and add it
 * to @list.
 */
static void to_shrink_list(struct dentry *dentry, struct list_head *list)
__must_hold(&dentry->d_lock)
{
        if (dentry->d_flags & DCACHE_SHRINK_LIST)
                return;

        if (dentry->d_flags & DCACHE_LRU_LIST)
                d_lru_del(dentry);
        d_shrink_add(dentry, list);
}

/*
 * Drop a reference to @dentry.  If fast_dput() could not finish the job
 * (it then returns with ->d_lock held), queue the dentry on the shrink
 * list @list instead of killing it here, and release the lock.
 */
void dput_to_list(struct dentry *dentry, struct list_head *list)
{
        int done;

        rcu_read_lock();
        done = fast_dput(dentry);
        rcu_read_unlock();

        if (likely(done))
                return;

        to_shrink_list(dentry, list);
        spin_unlock(&dentry->d_lock);
}

/*
 * Return @dentry's parent with a reference taken.
 *
 * A lockless attempt is made first: sample ->d_seq, read ->d_parent, try
 * to bump its refcount, and accept the result if ->d_seq did not change.
 * Otherwise fall back to taking the parent's ->d_lock and rechecking
 * ->d_parent under it.
 */
struct dentry *dget_parent(struct dentry *dentry)
{
        int gotref;
        struct dentry *ret;
        unsigned seq;

        /*
         * Do optimistic parent lookup without any
         * locking.
         */
        rcu_read_lock();
        seq = raw_seqcount_begin(&dentry->d_seq);
        ret = READ_ONCE(dentry->d_parent);
        gotref = lockref_get_not_zero(&ret->d_lockref);
        rcu_read_unlock();
        if (likely(gotref)) {
                if (!read_seqcount_retry(&dentry->d_seq, seq))
                        return ret;
                /* ->d_seq changed under us: drop the reference and use the locked path. */
                dput(ret);
        }

repeat:
        /*
         * Don't need rcu_dereference because we re-check it was correct under
         * the lock.
         */
        rcu_read_lock();
        ret = dentry->d_parent;
        spin_lock(&ret->d_lock);
        if (unlikely(ret != dentry->d_parent)) {
                spin_unlock(&ret->d_lock);
                rcu_read_unlock();
                goto repeat;
        }
        rcu_read_unlock();
        /* A parent with a zero refcount here is treated as a fatal bug. */
        BUG_ON(!ret->d_lockref.count);
        ret->d_lockref.count++;
        spin_unlock(&ret->d_lock);
        return ret;
}
EXPORT_SYMBOL(dget_parent);

/*
 * Grab a reference to the first dentry on @inode's alias list, whatever
 * its state.  Returns NULL when the list is empty.  The caller visible
 * here, d_find_any_alias(), holds ->i_lock around the call.
 */
static struct dentry * __d_find_any_alias(struct inode *inode)
{
        struct dentry *alias = NULL;

        if (!hlist_empty(&inode->i_dentry)) {
                alias = hlist_entry(inode->i_dentry.first,
                                    struct dentry, d_u.d_alias);
                lockref_get(&alias->d_lockref);
        }
        return alias;
}

/**
 * d_find_any_alias - find any alias for a given inode
 * @inode: inode to find an alias for
 *
 * If any aliases exist for the given inode, take and return a
 * reference for one of them.  If no aliases exist, return %NULL.
 */
struct dentry *d_find_any_alias(struct inode *inode)
{
        struct dentry *de;

        /* The alias list is examined under ->i_lock. */
        spin_lock(&inode->i_lock);
        de = __d_find_any_alias(inode);
        spin_unlock(&inode->i_lock);
        return de;
}
EXPORT_SYMBOL(d_find_any_alias);

/*
 * Find an alias of @inode and take a reference to it.  For a directory
 * any alias will do; for everything else only a hashed alias is accepted,
 * checked under that alias's ->d_lock.  Returns NULL if nothing suitable
 * is found.
 */
static struct dentry *__d_find_alias(struct inode *inode)
{
        struct dentry *alias;

        if (S_ISDIR(inode->i_mode))
                return __d_find_any_alias(inode);

        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                int hashed;

                spin_lock(&alias->d_lock);
                hashed = !d_unhashed(alias);
                if (hashed)
                        dget_dlock(alias);
                spin_unlock(&alias->d_lock);

                if (hashed)
                        return alias;
        }
        return NULL;
}

/**
 * d_find_alias - grab a hashed alias of inode
 * @inode: inode in question
 *
 * If inode has a hashed alias, or is a directory and has any alias,
 * acquire the reference to alias and return it. Otherwise return NULL.
 * Notice that if inode is a directory there can be only one alias and
 * it can be unhashed only if it has no children, or if it is the root
 * of a filesystem, or if the directory was renamed and d_revalidate
 * was the first vfs operation to notice.
 *
 * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
 * any other hashed alias over that one.
 */
struct dentry *d_find_alias(struct inode *inode)
{
        struct dentry *alias;

        /* Unlocked peek: with no aliases at all there is nothing to find. */
        if (hlist_empty(&inode->i_dentry))
                return NULL;

        spin_lock(&inode->i_lock);
        alias = __d_find_alias(inode);
        spin_unlock(&inode->i_lock);

        return alias;
}
EXPORT_SYMBOL(d_find_alias);

/*
 *  Caller MUST be holding rcu_read_lock() and be guaranteed
 *  that inode won't get freed until rcu_read_unlock().
 */
struct dentry *d_find_alias_rcu(struct inode *inode)
{
        struct hlist_head *l = &inode->i_dentry;
        struct dentry *de = NULL;

        spin_lock(&inode->i_lock);
        // ->i_dentry and ->i_rcu are colocated, but the latter won't be
        // used without having I_FREEING set, which means no aliases left
        if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) {
                if (S_ISDIR(inode->i_mode)) {
                        // directory: take the first alias as is
                        de = hlist_entry(l->first, struct dentry, d_u.d_alias);
                } else {
                        // otherwise the first hashed alias; de ends up NULL
                        // if the loop runs off the end of the list
                        hlist_for_each_entry(de, l, d_u.d_alias)
                                if (!d_unhashed(de))
                                        break;
                }
        }
        spin_unlock(&inode->i_lock);
        // No reference is taken on the returned dentry.
        return de;
}

/*
 *        Try to kill dentries associated with this inode.
 * WARNING: you must own a reference to inode.
 */
void d_prune_aliases(struct inode *inode)
{
        LIST_HEAD(dispose);
        struct dentry *dentry;

        /*
         * Under ->i_lock, collect every alias whose refcount is zero onto
         * a private shrink list ...
         */
        spin_lock(&inode->i_lock);
        hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
                spin_lock(&dentry->d_lock);
                if (!dentry->d_lockref.count)
                        to_shrink_list(dentry, &dispose);
                spin_unlock(&dentry->d_lock);
        }
        spin_unlock(&inode->i_lock);
        /* ... and dispose of them once ->i_lock has been dropped. */
        shrink_dentry_list(&dispose);
}
EXPORT_SYMBOL(d_prune_aliases);

/*
 * Kill @victim and keep going up through the parents that __dentry_kill()
 * hands back, for as long as lock_for_kill() succeeds on them.
 *
 * Entered under rcu_read_lock() with @victim locked for kill; the RCU read
 * lock has been dropped by the time this returns.
 */
static inline void shrink_kill(struct dentry *victim)
{
        do {
                rcu_read_unlock();
                victim = __dentry_kill(victim);
                rcu_read_lock();
        } while (victim && lock_for_kill(victim));
        rcu_read_unlock();
        /* A parent we could not lock for kill is still ->d_lock'ed: release it. */
        if (victim)
                spin_unlock(&victim->d_lock);
}

/*
 * Dispose of every dentry on the shrink list @list (linked through
 * ->d_lru), taking entries from the tail until the list is empty.
 */
void shrink_dentry_list(struct list_head *list)
{
        while (!list_empty(list)) {
                struct dentry *dentry;

                dentry = list_entry(list->prev, struct dentry, d_lru);
                spin_lock(&dentry->d_lock);
                rcu_read_lock();
                if (!lock_for_kill(dentry)) {
                        bool can_free;
                        /*
                         * Not killable by us: just take it off the list.
                         * If __dentry_kill() already ran on it and left the
                         * freeing to the shrink list owner
                         * (DCACHE_DENTRY_KILLED is set), free it now.
                         */
                        rcu_read_unlock();
                        d_shrink_del(dentry);
                        can_free = dentry->d_flags & DCACHE_DENTRY_KILLED;
                        spin_unlock(&dentry->d_lock);
                        if (can_free)
                                dentry_free(dentry);
                        continue;
                }
                /* Locked for kill: unlist it, then kill it and any dying ancestors. */
                d_shrink_del(dentry);
                shrink_kill(dentry);
        }
}

/*
 * LRU walk callback used by prune_dcache_sb().  @arg is the list head that
 * collects dentries to be disposed of.  Returns LRU_SKIP when ->d_lock
 * cannot be taken, LRU_ROTATE for a dentry that was marked referenced, and
 * LRU_REMOVED when the dentry left the LRU (either dropped from it because
 * it is in use, or moved to the dispose list).
 */
static enum lru_status dentry_lru_isolate(struct list_head *item,
                struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
        struct list_head *freeable = arg;
        struct dentry        *dentry = container_of(item, struct dentry, d_lru);


        /*
         * we are inverting the lru lock/dentry->d_lock here,
         * so use a trylock. If we fail to get the lock, just skip
         * it
         */
        if (!spin_trylock(&dentry->d_lock))
                return LRU_SKIP;

        /*
         * Referenced dentries are still in use. If they have active
         * counts, just remove them from the LRU. Otherwise give them
         * another pass through the LRU.
         */
        if (dentry->d_lockref.count) {
                d_lru_isolate(lru, dentry);
                spin_unlock(&dentry->d_lock);
                return LRU_REMOVED;
        }

        if (dentry->d_flags & DCACHE_REFERENCED) {
                dentry->d_flags &= ~DCACHE_REFERENCED;
                spin_unlock(&dentry->d_lock);

                /*
                 * The list move itself will be made by the common LRU code. At
                 * this point, we've dropped the dentry->d_lock but keep the
                 * lru lock. This is safe to do, since every list movement is
                 * protected by the lru lock even if both locks are held.
                 *
                 * This is guaranteed by the fact that all LRU management
                 * functions are intermediated by the LRU API calls like
                 * list_lru_add_obj and list_lru_del_obj. List movement in this
                 * file only ever occurs through these functions or through
                 * callbacks like this one, that are called from the LRU API.
                 *
                 * The only exceptions to this are functions like
                 * shrink_dentry_list, and code that first checks for the
                 * DCACHE_SHRINK_LIST flag.  Those are guaranteed to be
                 * operating only with stack provided lists after they are
                 * properly isolated from the main list.  It is thus, always a
                 * local access.
                 */
                return LRU_ROTATE;
        }

        /* Unused and not recently referenced: hand it to the dispose list. */
        d_lru_shrink_move(lru, dentry, freeable);
        spin_unlock(&dentry->d_lock);

        return LRU_REMOVED;
}

/**
 * prune_dcache_sb - shrink the dcache
 * @sb: superblock
 * @sc: shrink control, passed to list_lru_shrink_walk()
 *
 * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
 * is done when we need more memory and called from the superblock shrinker
 * function.
 *
 * This function may fail to free any resources if all the dentries are in
 * use.
 */
long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
{
        LIST_HEAD(dispose);
        long freed;

        /* Let dentry_lru_isolate() pick victims off the LRU into 'dispose' ... */
        freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
                                     dentry_lru_isolate, &dispose);
        /* ... then kill them outside the LRU walk. */
        shrink_dentry_list(&dispose);
        /* The value reported is whatever list_lru_shrink_walk() returned. */
        return freed;
}

/*
 * LRU walk callback used by shrink_dcache_sb().  Unlike
 * dentry_lru_isolate() it looks at neither the refcount nor
 * DCACHE_REFERENCED: every dentry whose ->d_lock can be taken is moved to
 * the dispose list passed in @arg.
 */
static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
                struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
        struct list_head *freeable = arg;
        struct dentry        *dentry = container_of(item, struct dentry, d_lru);

        /*
         * we are inverting the lru lock/dentry->d_lock here,
         * so use a trylock. If we fail to get the lock, just skip
         * it
         */
        if (!spin_trylock(&dentry->d_lock))
                return LRU_SKIP;

        d_lru_shrink_move(lru, dentry, freeable);
        spin_unlock(&dentry->d_lock);

        return LRU_REMOVED;
}


/**
 * shrink_dcache_sb - shrink dcache for a superblock
 * @sb: superblock
 *
 * Shrink the dcache for the specified super block. This is used to free
 * the dcache before unmounting a file system.
 */
void shrink_dcache_sb(struct super_block *sb)
{
        /*
         * Work in batches: move dentries from the LRU to a private list,
         * dispose of that list, and repeat until the LRU reports empty.
         */
        do {
                LIST_HEAD(dispose);

                /* NOTE(review): 1024 is presumably the per-pass walk budget of list_lru_walk() - confirm */
                list_lru_walk(&sb->s_dentry_lru,
                        dentry_lru_isolate_shrink, &dispose, 1024);
                shrink_dentry_list(&dispose);
        } while (list_lru_count(&sb->s_dentry_lru) > 0);
}
EXPORT_SYMBOL(shrink_dcache_sb);

/**
 * enum d_walk_ret - action to take during tree walk
 * @D_WALK_CONTINUE:        continue walk
 * @D_WALK_QUIT:        quit walk
 * @D_WALK_NORETRY:        quit when retry is needed
 * @D_WALK_SKIP:        skip this dentry and its children
 */
enum d_walk_ret {
        D_WALK_CONTINUE,
        D_WALK_QUIT,
        D_WALK_NORETRY,
        D_WALK_SKIP,
};

/**
 * d_walk - walk the dentry tree
 * @parent:        start of walk
 * @data:        data passed to @enter()
 * @enter:        callback when first entering the dentry
 *
 * The @enter() callbacks are called with d_lock held.
 *
 * @enter() is invoked first on @parent itself and then on every descendant
 * reached by the walk; children flagged DCACHE_DENTRY_CURSOR are never
 * passed to it.  Its return value steers the walk:
 *
 * - D_WALK_CONTINUE: keep going, descending into the dentry's children.
 * - D_WALK_QUIT:     stop the walk immediately.
 * - D_WALK_NORETRY:  keep going, but if the walk later turns out to have
 *                    raced with a rename, give up instead of restarting.
 * - D_WALK_SKIP:     do not descend into this dentry; when returned for
 *                    @parent itself this ends the walk.
 */
static void d_walk(struct dentry *parent, void *data,
                   enum d_walk_ret (*enter)(void *, struct dentry *))
{
        struct dentry *this_parent, *dentry;
        unsigned seq = 0;
        enum d_walk_ret ret;
        bool retry = true;

again:
        /* seq is 0 on the first pass; rename_retry sets it to 1 before looping. */
        read_seqbegin_or_lock(&rename_lock, &seq);
        this_parent = parent;
        spin_lock(&this_parent->d_lock);

        ret = enter(data, this_parent);
        switch (ret) {
        case D_WALK_CONTINUE:
                break;
        case D_WALK_QUIT:
        case D_WALK_SKIP:
                /* skipping the start of the walk leaves nothing to visit */
                goto out_unlock;
        case D_WALK_NORETRY:
                retry = false;
                break;
        }
repeat:
        dentry = d_first_child(this_parent);
resume:
        hlist_for_each_entry_from(dentry, d_sib) {
                if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
                        continue;

                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);

                ret = enter(data, dentry);
                switch (ret) {
                case D_WALK_CONTINUE:
                        break;
                case D_WALK_QUIT:
                        spin_unlock(&dentry->d_lock);
                        goto out_unlock;
                case D_WALK_NORETRY:
                        retry = false;
                        break;
                case D_WALK_SKIP:
                        spin_unlock(&dentry->d_lock);
                        continue;
                }

                if (!hlist_empty(&dentry->d_children)) {
                        /*
                         * Descend: drop the old parent's lock but keep the
                         * child's d_lock held, as the child now becomes
                         * this_parent.  The spin_release()/spin_acquire()
                         * pair only touches the lock's dep_map - presumably
                         * to re-annotate the nesting level for lockdep.
                         */
                        spin_unlock(&this_parent->d_lock);
                        spin_release(&dentry->d_lock.dep_map, _RET_IP_);
                        this_parent = dentry;
                        spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
                        goto repeat;
                }
                spin_unlock(&dentry->d_lock);
        }
        /*
         * All done at this level ... ascend and resume the search.
         */
        rcu_read_lock();
ascend:
        if (this_parent != parent) {
                dentry = this_parent;
                this_parent = dentry->d_parent;

                /* child lock is dropped before the parent lock is taken */
                spin_unlock(&dentry->d_lock);
                spin_lock(&this_parent->d_lock);

                /* might go back up the wrong parent if we have had a rename. */
                if (need_seqretry(&rename_lock, seq))
                        goto rename_retry;
                /* go into the first sibling still alive */
                hlist_for_each_entry_continue(dentry, d_sib) {
                        if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) {
                                rcu_read_unlock();
                                goto resume;
                        }
                }
                /* no live sibling left at this level: climb one more level */
                goto ascend;
        }
        if (need_seqretry(&rename_lock, seq))
                goto rename_retry;
        rcu_read_unlock();

out_unlock:
        spin_unlock(&this_parent->d_lock);
        done_seqretry(&rename_lock, seq);
        return;

rename_retry:
        spin_unlock(&this_parent->d_lock);
        rcu_read_unlock();
        /* a retry is only expected while seq is still even */
        BUG_ON(seq & 1);
        if (!retry)
                return;
        seq = 1;
        goto again;
}

/*
 * Walk state for path_has_submounts(): @mnt is the vfsmount taken from the
 * path being checked, @mounted is set to 1 by path_check_mount() once a
 * mountpoint has been found.
 */
struct check_mount {
        struct vfsmount *mnt;
        unsigned int mounted;
};

/*
 * d_walk() callback for path_has_submounts().
 *
 * @data points to a struct check_mount.  When @dentry passes both the
 * d_mountpoint() test and __path_is_mountpoint() for the pair
 * (info->mnt, @dentry), record the hit and stop the walk; otherwise keep
 * walking.
 */
static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry)
{
        struct check_mount *info = data;
        struct path path = { .mnt = info->mnt, .dentry = dentry };

        /* d_mountpoint() is tested first; the path check runs only if it hits */
        if (unlikely(d_mountpoint(dentry)) && __path_is_mountpoint(&path)) {
                info->mounted = 1;
                return D_WALK_QUIT;
        }

        return D_WALK_CONTINUE;
}

/**
 * path_has_submounts - check for mounts over a dentry in the
 *                      current namespace.
 * @parent: path to check.
 *
 * Return true if the parent or its subdirectories contain
 * a mount point in the current namespace.
 */
int path_has_submounts(const struct path *parent)
{
        struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };

        read_seqlock_excl(&mount_lock);
        d_walk(parent->dentry, &data, path_check_mount);
        read_sequnlock_excl(&mount_lock);

        return data.mounted;
}
EXPORT_SYMBOL(path_has_submounts);

/*
 * Called by mount code to set a mountpoint and check if the mountpoint is
 * reachable (e.g. NFS can unhash a directory dentry and then the complete
 * subtree can become unreachable).
 *
 * Only one of d_invalidate() and d_set_mounted() must succeed.  For
 * this reason take rename_lock and d_lock on dentry and ancestors.
 */
int d_set_mounted(struct dentry *dentry)
{
        struct dentry *p;
        int ret = -ENOENT;
        write_seqlock(&rename_lock);
        for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
                /* Need exclusion wrt. d_invalidate() */
                spin_lock(&p->d_lock);
                if (unlikely(d_unhashed(p))) {
                        spin_unlock(&p->d_lock);
                        goto out;
                }
                spin_unlock(&p->d_lock);
        }
        spin_lock(&dentry->d_lock);
        if (!d_unlinked(dentry)) {
                ret = -EBUSY;
                if (!d_mountpoint(dentry)) {
                        dentry->d_flags |= DCACHE_MOUNTED;
                        ret = 0;
                }
        }
         spin_unlock(&dentry->d_lock);
out:
        write_sequnlock(&rename_lock);
        return ret;
}

/*
 * State shared between shrink_dcache_parent() and its d_walk() callbacks,
 * which search the tree below @start for unused dentries and collect them
 * on @dispose.
 *
 * @start:   dentry the walk starts from; the callbacks never collect it.
 * @found:   used by select_collect(): number of dentries seen that were on
 *           a shrink list already, had a zero refcount (and were moved to
 *           @dispose), or had a negative refcount.  This may not be the
 *           total number, because the callback can end the walk early
 *           once something has been collected and a reschedule is needed.
 * @victim:  used by select_collect2(): an unreferenced dentry that was found
 *           on a shrink list already.
 * @dispose: list of dentries collected with to_shrink_list().
 *
 * @found and @victim share storage, which is why shrink_dcache_parent()
 * resets @victim to NULL before the second walk.
 */

struct select_data {
        struct dentry *start;
        union {
                long found;
                struct dentry *victim;
        };
        struct list_head dispose;
};

/*
 * d_walk() callback for the first pass of shrink_dcache_parent().
 *
 * Counts, in sel->found, every dentry other than the starting one that is
 * already on a shrink list, has a zero refcount, or has a negative
 * refcount.  Zero-refcount dentries not yet on a shrink list are moved to
 * sel->dispose.
 */
static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
{
        struct select_data *sel = _data;

        /* the dentry the walk started from is never a candidate */
        if (dentry == sel->start)
                return D_WALK_CONTINUE;

        if (dentry->d_flags & DCACHE_SHRINK_LIST) {
                sel->found++;
        } else if (dentry->d_lockref.count == 0) {
                to_shrink_list(dentry, &sel->dispose);
                sel->found++;
        } else if (dentry->d_lockref.count < 0) {
                sel->found++;
        }

        /*
         * Once something has been collected we may return to the caller
         * (this guarantees forward progress); it will come back for the
         * rest.
         */
        if (list_empty(&sel->dispose))
                return D_WALK_CONTINUE;

        return need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
}

/*
 * d_walk() callback for the second pass of shrink_dcache_parent().
 *
 * Zero-refcount dentries are moved to sel->dispose, unless they are already
 * on a shrink list: the first such dentry is reported through sel->victim
 * and ends the walk.  In that case rcu_read_lock() is taken here and is
 * still held when the walk returns; the caller is responsible for it.
 */
static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
{
        struct select_data *sel = _data;

        /* the dentry the walk started from is never a candidate */
        if (dentry == sel->start)
                return D_WALK_CONTINUE;

        if (dentry->d_lockref.count == 0) {
                if (dentry->d_flags & DCACHE_SHRINK_LIST) {
                        rcu_read_lock();
                        sel->victim = dentry;
                        return D_WALK_QUIT;
                }
                to_shrink_list(dentry, &sel->dispose);
        }

        /*
         * Once something has been collected we may return to the caller
         * (this guarantees forward progress); it will come back for the
         * rest.
         */
        if (list_empty(&sel->dispose))
                return D_WALK_CONTINUE;

        return need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
}

/**
 * shrink_dcache_parent - prune dcache
 * @parent: parent of entries to prune
 *
 * Prune the dcache to remove unused children of the parent dentry.
 */
void shrink_dcache_parent(struct dentry *parent)
{
        for (;;) {
                struct select_data data = {.start = parent};

                INIT_LIST_HEAD(&data.dispose);
                d_walk(parent, &data, select_collect);

                if (!list_empty(&data.dispose)) {
                        shrink_dentry_list(&data.dispose);
                        continue;
                }

                cond_resched();
                if (!data.found)
                        break;
                data.victim = NULL;
                d_walk(parent, &data, select_collect2);
                if (data.victim) {
                        spin_lock(&data.victim->d_lock);
                        if (!lock_for_kill(data.victim)) {
                                spin_unlock(&data.victim->d_lock);
                                rcu_read_unlock();
                        } else {
                                shrink_kill(data.victim);
                        }
                }
                if (!list_empty(&data.dispose))
                        shrink_dentry_list(&data.dispose);
        }
}
EXPORT_SYMBOL(shrink_dcache_parent);

/*
 * d_walk() callback used by do_one_tree(): warn about every childless
 * dentry that is still around, except for the tree root (passed as @_data)
 * when its refcount is exactly 1.  The walk always continues.
 */
static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
{
        /* it has busy descendents; complain about those instead */
        if (!hlist_empty(&dentry->d_children))
                return D_WALK_CONTINUE;

        /* anything but the root with refcount 1 is reported */
        if (dentry != _data || dentry->d_lockref.count != 1) {
                WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} "
                                " still in use (%d) [unmount of %s %s]\n",
                               dentry,
                               dentry->d_inode ?
                               dentry->d_inode->i_ino : 0UL,
                               dentry,
                               dentry->d_lockref.count,
                               dentry->d_sb->s_type->name,
                               dentry->d_sb->s_id);
        }

        return D_WALK_CONTINUE;
}

/*
 * Tear down the dentry tree rooted at @dentry: prune its unused children,
 * let umount_check() warn about whatever is still left, then drop the root
 * with d_drop() and release a reference on it with dput().
 */
static void do_one_tree(struct dentry *dentry)
{
        shrink_dcache_parent(dentry);
        /* the root itself is passed as callback data so it can be told apart */
        d_walk(dentry, dentry, umount_check);
        d_drop(dentry);
        dput(dentry);
}

/*
 * Destroy the dentries attached to a superblock on unmounting.
 *
 * The main tree hanging off sb->s_root is torn down first (s_root is
 * cleared before that), followed by every tree whose root is still listed
 * on sb->s_roots.
 */
void shrink_dcache_for_umount(struct super_block *sb)
{
        struct dentry *root;

        /* warns if s_umount can still be read-locked at this point */
        WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked");

        root = sb->s_root;
        sb->s_root = NULL;
        do_one_tree(root);

        while (!hlist_bl_empty(&sb->s_roots)) {
                struct dentry *anon;

                anon = hlist_bl_entry(hlist_bl_first(&sb->s_roots),
                                      struct dentry, d_hash);
                /* do_one_tree() ends with a dput(), so grab a reference */
                do_one_tree(dget(anon));
        }
}

/*
 * d_walk() callback for d_invalidate(): stop at the first dentry for which
 * d_mountpoint() is true and hand it back, with a reference taken via
 * dget_dlock(), through the struct dentry ** passed as @_data.
 */
static enum d_walk_ret find_submount(void *_data, struct dentry *dentry)
{
        struct dentry **out = _data;

        if (!d_mountpoint(dentry))
                return D_WALK_CONTINUE;

        *out = dget_dlock(dentry);
        return D_WALK_QUIT;
}

/**
 * d_invalidate - detach submounts, prune dcache, and drop
 * @dentry: dentry to invalidate (aka detach, prune and drop)
 */
void d_invalidate(struct dentry *dentry)
{
        bool had_submounts = false;
        spin_lock(&dentry->d_lock);
        if (d_unhashed(dentry)) {
                spin_unlock(&dentry->d_lock);
                return;
        }
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);

        /* Negative dentries can be dropped without further checks */
        if (!dentry->d_inode)
                return;

        shrink_dcache_parent(dentry);
        for (;;) {
                struct dentry *victim = NULL;
                d_walk(dentry, &victim, find_submount);
                if (!victim) {
                        if (had_submounts)
                                shrink_dcache_parent(dentry);
                        return;
                }
                had_submounts = true;
                detach_mounts(victim);
                dput(victim);
        }
}
EXPORT_SYMBOL(d_invalidate);

/**
 * __d_alloc        -        allocate a dcache entry
 * @sb: filesystem it will belong to
 * @name: qstr of the name, or %NULL to use slash_name
 *
 * Allocates a dentry. It returns %NULL if there is insufficient memory
 * available or if the filesystem's ->d_init() hook reports an error. On
 * success the dentry is returned. The name passed in is copied, so the
 * caller's copy may be reused after this call.
 *
 * The new dentry starts out with a reference count of 1, no inode, and
 * d_parent pointing to itself.
 */
 
static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
        struct dentry *dentry;
        char *dname;
        int err;

        dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru,
                                      GFP_KERNEL);
        if (!dentry)
                return NULL;

        /*
         * We guarantee that the inline name is always NUL-terminated.
         * This way the memcpy() done by the name switching in rename
         * will still always have a NUL at the end, even if we might
         * be overwriting an internal NUL character
         */
        dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
        if (unlikely(!name)) {
                name = &slash_name;
                dname = dentry->d_iname;
        } else if (name->len > DNAME_INLINE_LEN-1) {
                /*
                 * Name does not fit inline: allocate an external name.
                 * offsetof(..., name[1]) already covers one byte of name[],
                 * which leaves room for the NUL stored at dname[name->len].
                 */
                size_t size = offsetof(struct external_name, name[1]);
                struct external_name *p = kmalloc(size + name->len,
                                                  GFP_KERNEL_ACCOUNT |
                                                  __GFP_RECLAIMABLE);
                if (!p) {
                        kmem_cache_free(dentry_cache, dentry); 
                        return NULL;
                }
                atomic_set(&p->u.count, 1);
                dname = p->name;
        } else  {
                dname = dentry->d_iname;
        }        

        dentry->d_name.len = name->len;
        dentry->d_name.hash = name->hash;
        memcpy(dname, name->name, name->len);
        dname[name->len] = 0;

        /* Make sure we always see the terminating NUL character */
        smp_store_release(&dentry->d_name.name, dname); /* ^^^ */

        dentry->d_lockref.count = 1;
        dentry->d_flags = 0;
        spin_lock_init(&dentry->d_lock);
        seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock);
        dentry->d_inode = NULL;
        /* not attached to anything yet: the dentry is its own parent */
        dentry->d_parent = dentry;
        dentry->d_sb = sb;
        /* d_op is set to NULL here and then set from the superblock default below */
        dentry->d_op = NULL;
        dentry->d_fsdata = NULL;
        INIT_HLIST_BL_NODE(&dentry->d_hash);
        INIT_LIST_HEAD(&dentry->d_lru);
        INIT_HLIST_HEAD(&dentry->d_children);
        INIT_HLIST_NODE(&dentry->d_u.d_alias);
        INIT_HLIST_NODE(&dentry->d_sib);
        d_set_d_op(dentry, dentry->d_sb->s_d_op);

        if (dentry->d_op && dentry->d_op->d_init) {
                err = dentry->d_op->d_init(dentry);
                if (err) {
                        /* undo both allocations; the error code itself is not reported */
                        if (dname_external(dentry))
                                kfree(external_name(dentry));
                        kmem_cache_free(dentry_cache, dentry);
                        return NULL;
                }
        }

        /* per-CPU count of allocated dentries */
        this_cpu_inc(nr_dentry);

        return dentry;
}

/**
 * d_alloc        -        allocate a dcache entry
 * @parent: parent of entry to allocate
 * @name: qstr of the name
 *
 * Allocates a dentry on @parent's superblock and links it into @parent's
 * list of children, taking a reference on @parent.  Returns %NULL if the
 * allocation fails.  The name passed in is copied, so the caller's copy may
 * be reused after this call.
 */
struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
{
        struct dentry *child = __d_alloc(parent->d_sb, name);

        if (child) {
                spin_lock(&parent->d_lock);
                /*
                 * The child's own lock is not needed: nobody else can see
                 * the new dentry yet.
                 */
                child->d_parent = dget_dlock(parent);
                hlist_add_head(&child->d_sib, &parent->d_children);
                spin_unlock(&parent->d_lock);
        }

        return child;
}
EXPORT_SYMBOL(d_alloc);

/*
 * Allocate a dentry on @sb without a caller-supplied name: __d_alloc() is
 * called with a NULL name.  Returns NULL when __d_alloc() fails.
 */
struct dentry *d_alloc_anon(struct super_block *sb)
{
        return __d_alloc(sb, NULL);
}
EXPORT_SYMBOL(d_alloc_anon);

struct dentry *d_alloc_cursor(struct dentry * parent)
{
        struct dentry *dentry = d_alloc_anon(parent->d_sb);
        if (dentry) {
                dentry->d_flags |= DCACHE_DENTRY_CURSOR;
                dentry->d_parent = dget(parent);
        }
        return dentry;
}

/**
 * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
 * @sb: the superblock
 * @name: qstr of the name
 *
 * For a filesystem that just pins its dentries in memory and never
 * performs lookups at all, return an unhashed IS_ROOT dentry.
 * This is used for pipes, sockets et.al. - the stuff that should
 * never be anyone's children or parents.  Unlike all other
 * dentries, these will not have RCU delay between dropping the
 * last reference and freeing them.
 *
 * When @sb has no default dentry operations, the dentry gets a private
 * set of operations that only provides ->d_dname (simple_dname).
 *
 * The only user is alloc_file_pseudo() and that's what should
 * be considered a public interface.  Don't use directly.
 */
struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
{
        static const struct dentry_operations anon_ops = {
                .d_dname = simple_dname
        };
        struct dentry *pseudo = __d_alloc(sb, name);

        if (unlikely(!pseudo))
                return pseudo;

        pseudo->d_flags |= DCACHE_NORCU;
        if (!sb->s_d_op)
                d_set_d_op(pseudo, &anon_ops);

        return pseudo;
}

struct dentry *d_alloc_name(struct dentry *parent, const char *name)
{
        struct qstr q;

        q.name = name;
        q.hash_len = hashlen_string(parent, name);
        return d_alloc(parent, &q);
}
EXPORT_SYMBOL(d_alloc_name);

/*
 * Install @op as the dentry operations of @dentry and mirror the set of
 * implemented methods into the DCACHE_OP_* bits of d_flags.
 *
 * Warns (once) if the dentry already has operations or already carries
 * one of the checked DCACHE_OP_* bits.  A NULL @op only clears d_op.
 *
 * NOTE(review): DCACHE_OP_PRUNE is set below but is not part of the mask
 * checked by the second WARN_ON_ONCE() - confirm whether that is intended.
 */
void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
{
        unsigned int op_flags = 0;

        WARN_ON_ONCE(dentry->d_op);
        WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH        |
                                DCACHE_OP_COMPARE        |
                                DCACHE_OP_REVALIDATE        |
                                DCACHE_OP_WEAK_REVALIDATE        |
                                DCACHE_OP_DELETE        |
                                DCACHE_OP_REAL));
        dentry->d_op = op;
        if (!op)
                return;

        if (op->d_hash)
                op_flags |= DCACHE_OP_HASH;
        if (op->d_compare)
                op_flags |= DCACHE_OP_COMPARE;
        if (op->d_revalidate)
                op_flags |= DCACHE_OP_REVALIDATE;
        if (op->d_weak_revalidate)
                op_flags |= DCACHE_OP_WEAK_REVALIDATE;
        if (op->d_delete)
                op_flags |= DCACHE_OP_DELETE;
        if (op->d_prune)
                op_flags |= DCACHE_OP_PRUNE;
        if (op->d_real)
                op_flags |= DCACHE_OP_REAL;

        /* leave d_flags untouched when no method is implemented */
        if (op_flags)
                dentry->d_flags |= op_flags;
}
EXPORT_SYMBOL(d_set_d_op);

static unsigned d_flags_for_inode(struct inode *inode)
{
        unsigned add_flags = DCACHE_REGULAR_TYPE;

        if (!inode)
                return DCACHE_MISS_TYPE;

        if (S_ISDIR(inode->i_mode)) {
                add_flags = DCACHE_DIRECTORY_TYPE;
                if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) {
                        if (unlikely(!inode->i_op->lookup))
                                add_flags = DCACHE_AUTODIR_TYPE;
                        else
                                inode->i_opflags |= IOP_LOOKUP;
                }
                goto type_determined;
        }

        if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
                if (unlikely(inode->i_op->get_link)) {
                        add_flags = DCACHE_SYMLINK_TYPE;
                        goto type_determined;
                }
                inode->i_opflags |= IOP_NOFOLLOW;
        }

        if (unlikely(!S_ISREG(inode->i_mode)))
                add_flags = DCACHE_SPECIAL_TYPE;

type_determined:
        if (unlikely(IS_AUTOMOUNT(inode)))
                add_flags |= DCACHE_NEED_AUTOMOUNT;
        return add_flags;
}

/*
 * Attach @inode to @dentry: link the dentry into the inode's alias list
 * and set the dentry's inode and type flags inside a d_seq write section,
 * all under dentry->d_lock.
 *
 * @inode is dereferenced unconditionally, so it must not be NULL.  The
 * callers visible in this file hold inode->i_lock around the call.
 */
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
        unsigned add_flags = d_flags_for_inode(inode);
        /* a dentry still in lookup is not expected here */
        WARN_ON(d_in_lookup(dentry));

        spin_lock(&dentry->d_lock);
        /*
         * Decrement negative dentry count if it was in the LRU list.
         */
        if (dentry->d_flags & DCACHE_LRU_LIST)
                this_cpu_dec(nr_dentry_negative);
        hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
        /* the inode/type update is bracketed by a d_seq write section */
        raw_write_seqcount_begin(&dentry->d_seq);
        __d_set_inode_and_type(dentry, inode, add_flags);
        raw_write_seqcount_end(&dentry->d_seq);
        fsnotify_update_flags(dentry);
        spin_unlock(&dentry->d_lock);
}

/**
 * d_instantiate - fill in inode information for a dentry
 * @entry: dentry to complete
 * @inode: inode to attach to this dentry
 *
 * Fill in inode information in the entry, turning a negative dentry into
 * a positive one.  A %NULL @inode leaves the dentry untouched.
 *
 * It is a bug to call this on a dentry that is already on an alias list.
 *
 * NOTE! This assumes that the inode count has been incremented
 * (or otherwise set) by the caller to indicate that it is now
 * in use by the dcache.
 */
void d_instantiate(struct dentry *entry, struct inode * inode)
{
        BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
        if (!inode)
                return;

        security_d_instantiate(entry, inode);

        spin_lock(&inode->i_lock);
        __d_instantiate(entry, inode);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_instantiate);

/*
 * This should be equivalent to d_instantiate() + unlock_new_inode(),
 * with lockdep-related part of unlock_new_inode() done before
 * anything else.  Use that instead of open-coding d_instantiate()/
 * unlock_new_inode() combinations.
 *
 * Unlike d_instantiate(), a NULL @inode is a bug here.  A warning is
 * issued if @inode does not have I_NEW set.
 */
void d_instantiate_new(struct dentry *entry, struct inode *inode)
{
        BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
        BUG_ON(!inode);
        lockdep_annotate_inode_mutex_key(inode);
        security_d_instantiate(entry, inode);
        spin_lock(&inode->i_lock);
        __d_instantiate(entry, inode);
        WARN_ON(!(inode->i_state & I_NEW));
        /* clear both I_NEW and I_CREATING in one store */
        inode->i_state &= ~I_NEW & ~I_CREATING;
        /*
         * Barrier between the i_state update and the wakeup - presumably
         * pairs with the waiter's check of the bit; confirm against
         * wake_up_bit() requirements.
         */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_NEW);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_instantiate_new);

/*
 * Allocate an anonymous dentry on @root_inode's superblock and attach
 * @root_inode to it.
 *
 * Returns the new dentry, or NULL when @root_inode is NULL or the
 * allocation fails.  On allocation failure the inode is released with
 * iput().
 */
struct dentry *d_make_root(struct inode *root_inode)
{
        struct dentry *root;

        if (!root_inode)
                return NULL;

        root = d_alloc_anon(root_inode->i_sb);
        if (!root) {
                iput(root_inode);
                return NULL;
        }

        d_instantiate(root, root_inode);
        return root;
}
EXPORT_SYMBOL(d_make_root);

/*
 * Common implementation of d_obtain_alias() (@disconnected == true) and
 * d_obtain_root() (@disconnected == false).
 *
 * Returns an existing alias of @inode if there is one, otherwise a newly
 * allocated anonymous dentry with @inode attached.  A NULL @inode yields
 * ERR_PTR(-ESTALE), an IS_ERR @inode is passed through as the error, and
 * allocation failure yields ERR_PTR(-ENOMEM).
 *
 * The caller's inode reference is dropped with iput() on every path that
 * goes through the out label, i.e. whenever it was not handed over to the
 * new dentry.
 */
static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected)
{
        struct super_block *sb;
        struct dentry *new, *res;

        if (!inode)
                return ERR_PTR(-ESTALE);
        if (IS_ERR(inode))
                return ERR_CAST(inode);

        sb = inode->i_sb;

        res = d_find_any_alias(inode); /* existing alias? */
        if (res)
                goto out;

        new = d_alloc_anon(sb);
        if (!new) {
                res = ERR_PTR(-ENOMEM);
                goto out;
        }

        security_d_instantiate(new, inode);
        spin_lock(&inode->i_lock);
        res = __d_find_any_alias(inode); /* recheck under lock */
        if (likely(!res)) { /* still no alias, attach a disconnected dentry */
                unsigned add_flags = d_flags_for_inode(inode);

                if (disconnected)
                        add_flags |= DCACHE_DISCONNECTED;

                spin_lock(&new->d_lock);
                __d_set_inode_and_type(new, inode, add_flags);
                hlist_add_head(&new->d_u.d_alias, &inode->i_dentry);
                if (!disconnected) {
                        /* root dentries are additionally put on sb->s_roots */
                        hlist_bl_lock(&sb->s_roots);
                        hlist_bl_add_head(&new->d_hash, &sb->s_roots);
                        hlist_bl_unlock(&sb->s_roots);
                }
                spin_unlock(&new->d_lock);
                spin_unlock(&inode->i_lock);
                inode = NULL; /* consumed by new->d_inode */
                res = new;
        } else {
                /* an alias appeared meanwhile: return it, discard ours */
                spin_unlock(&inode->i_lock);
                dput(new);
        }

 out:
        /* inode is NULL here when the reference was consumed above */
        iput(inode);
        return res;
}

/**
 * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
 * @inode: inode to allocate the dentry for
 *
 * Obtain a dentry for an inode resulting from NFS filehandle conversion or
 * similar open by handle operations.  The returned dentry may be anonymous,
 * or may have a full name (if the inode was already in the cache).
 *
 * When called on a directory inode, we must ensure that the inode only ever
 * has one dentry.  If a dentry is found, that is returned instead of
 * allocating a new one.
 *
 * On successful return, the reference to the inode has been transferred
 * to the dentry.  In case of an error the reference on the inode is released.
 * To make it easier to use in export operations a %NULL or IS_ERR inode may
 * be passed in and the error will be propagated to the return value,
 * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
 */
struct dentry *d_obtain_alias(struct inode *inode)
{
        /* true: a newly allocated dentry is flagged DCACHE_DISCONNECTED */
        return __d_obtain_alias(inode, true);
}
EXPORT_SYMBOL(d_obtain_alias);

/**
 * d_obtain_root - find or allocate a dentry for a given inode
 * @inode: inode to allocate the dentry for
 *
 * Obtain an IS_ROOT dentry for the root of a filesystem.
 *
 * We must ensure that directory inodes only ever have one dentry.  If a
 * dentry is found, that is returned instead of allocating a new one.
 *
 * On successful return, the reference to the inode has been transferred
 * to the dentry.  In case of an error the reference on the inode is
 * released.  A %NULL or IS_ERR inode may be passed in and the error will
 * be propagated to the return value, with a %NULL @inode replaced by
 * ERR_PTR(-ESTALE).
 */
struct dentry *d_obtain_root(struct inode *inode)
{
        /* false: a newly allocated dentry is added to sb->s_roots instead */
        return __d_obtain_alias(inode, false);
}
EXPORT_SYMBOL(d_obtain_root);

/**
 * d_add_ci - lookup or allocate new dentry with case-exact name
 * @inode:  the inode case-insensitive lookup has found
 * @dentry: the negative dentry that was passed to the parent's lookup func
 * @name:   the case-exact name to be associated with the returned dentry
 *
 * This is to avoid filling the dcache with case-insensitive names to the
 * same inode, only the actual correct case is stored in the dcache for
 * case-insensitive filesystems.
 *
 * For a case-insensitive lookup match and if the case-exact dentry
 * already exists in the dcache, use it and return it.
 *
 * If no entry exists with the exact case name, allocate new dentry with
 * the exact case, and return the spliced entry.
 *
 * On every path that does not reach d_splice_alias() the reference on
 * @inode is dropped with iput().
 */
struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
                        struct qstr *name)
{
        struct dentry *target, *spliced;

        /*
         * Anything non-NULL coming back from the lookup is returned to the
         * caller as is; only a miss makes us create the dentry.
         */
        target = d_hash_and_lookup(dentry->d_parent, name);
        if (target)
                goto drop_inode;

        if (d_in_lookup(dentry)) {
                target = d_alloc_parallel(dentry->d_parent, name,
                                        dentry->d_wait);
                /* an error, or a dentry that is not in lookup, ends it here */
                if (IS_ERR(target) || !d_in_lookup(target))
                        goto drop_inode;
        } else {
                target = d_alloc(dentry->d_parent, name);
                if (!target) {
                        target = ERR_PTR(-ENOMEM);
                        goto drop_inode;
                }
        }

        spliced = d_splice_alias(inode, target);
        if (!spliced)
                return target;

        /* d_splice_alias() handed back something else: give ours up */
        d_lookup_done(target);
        dput(target);
        return spliced;

drop_inode:
        iput(inode);
        return target;
}
EXPORT_SYMBOL(d_add_ci);

/**
 * d_same_name - check whether a dentry's name matches a given name
 * @dentry: dentry whose name is compared
 * @parent: parent dentry; its DCACHE_OP_COMPARE flag decides whether its
 *          ->d_compare() method is used for the comparison
 * @name:   the name to compare against
 *
 * Without DCACHE_OP_COMPARE on @parent the names match when the lengths
 * are equal and dentry_cmp() reports no difference.  With it, the verdict
 * of @parent's ->d_compare() is used.
 *
 * Return: true if names are same, or false
 */
bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
                 const struct qstr *name)
{
        if (unlikely(parent->d_flags & DCACHE_OP_COMPARE))
                return parent->d_op->d_compare(dentry,
                                               dentry->d_name.len,
                                               dentry->d_name.name,
                                               name) == 0;

        return dentry->d_name.len == name->len &&
               dentry_cmp(dentry, name->name, name->len) == 0;
}
EXPORT_SYMBOL_GPL(d_same_name);

/*
 * This is __d_lookup_rcu() when the parent dentry has
 * DCACHE_OP_COMPARE, which makes things much nastier.
 *
 * Walks the hash chain selected by @name's hash and returns the first
 * hashed child of @parent whose name hash matches and for which
 * @parent's ->d_compare() reports a match.  The d_seq value sampled for
 * that dentry is stored in *@seqp.  Returns NULL (leaving *@seqp alone)
 * when nothing matches.
 */
static noinline struct dentry *__d_lookup_rcu_op_compare(
        const struct dentry *parent,
        const struct qstr *name,
        unsigned *seqp)
{
        u64 hashlen = name->hash_len;
        struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
        struct hlist_bl_node *node;
        struct dentry *dentry;

        hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
                int tlen;
                const char *tname;
                unsigned seq;

seqretry:
                /* re-entered for the same dentry when d_seq changed below */
                seq = raw_seqcount_begin(&dentry->d_seq);
                if (dentry->d_parent != parent)
                        continue;
                if (d_unhashed(dentry))
                        continue;
                if (dentry->d_name.hash != hashlen_hash(hashlen))
                        continue;
                tlen = dentry->d_name.len;
                tname = dentry->d_name.name;
                /* we want a consistent (name,len) pair */
                if (read_seqcount_retry(&dentry->d_seq, seq)) {
                        cpu_relax();
                        goto seqretry;
                }
                /* compare using the snapshot taken above, not the live fields */
                if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0)
                        continue;
                *seqp = seq;
                return dentry;
        }
        return NULL;
}

/**
 * __d_lookup_rcu - search for a dentry (racy, store-free)
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * @seqp: returns d_seq value at the point where the dentry was found
 * Returns: dentry, or NULL
 *
 * __d_lookup_rcu is the dcache lookup function for rcu-walk name
 * resolution (store-free path walking) design described in
 * Documentation/filesystems/path-lookup.txt.
 *
 * This is not to be used outside core vfs.
 *
 * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
 * held, and rcu_read_lock held. The returned dentry must not be stored into
 * without taking d_lock and checking d_seq sequence count against @seq
 * returned here.
 *
 * A refcount may be taken on the found dentry with the d_rcu_to_refcount
 * function.
 *
 * Alternatively, __d_lookup_rcu may be called again to look up the child of
 * the returned dentry, so long as its parent's seqlock is checked after the
 * child is looked up. Thus, an interlocking stepping of sequence lock checks
 * is formed, giving integrity down the path walk.
 *
 * NOTE! The caller *has* to check the resulting dentry against the sequence
 * number we've returned before using any of the resulting dentry state!
 */
struct dentry *__d_lookup_rcu(const struct dentry *parent,
                                const struct qstr *name,
                                unsigned *seqp)
{
        u64 hashlen = name->hash_len;
        const unsigned char *str = name->name;
        struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
        struct hlist_bl_node *node;
        struct dentry *dentry;

        /*
         * Note: There is significant duplication with __d_lookup which is
         * required to prevent single threaded performance regressions
         * especially on architectures where smp_rmb (in seqcounts) are costly.
         * Keep the two functions in sync.
         */

        /* Parents with a custom ->d_compare take the separate slow path. */
        if (unlikely(parent->d_flags & DCACHE_OP_COMPARE))
                return __d_lookup_rcu_op_compare(parent, name, seqp);

        /*
         * The hash list is protected using RCU.
         *
         * Carefully use d_seq when comparing a candidate dentry, to avoid
         * races with d_move().
         *
         * It is possible that concurrent renames can mess up our list
         * walk here and result in missing our dentry, resulting in the
         * false-negative result. d_lookup() protects against concurrent
         * renames using rename_lock seqlock.
         *
         * See Documentation/filesystems/path-lookup.txt for more details.
         */
        hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
                unsigned seq;

                /*
                 * The dentry sequence count protects us from concurrent
                 * renames, and thus protects parent and name fields.
                 *
                 * The caller must perform a seqcount check in order
                 * to do anything useful with the returned dentry.
                 *
                 * NOTE! We do a "raw" seqcount_begin here. That means that
                 * we don't wait for the sequence count to stabilize if it
                 * is in the middle of a sequence change. If we do the slow
                 * dentry compare, we will do seqretries until it is stable,
                 * and if we end up with a successful lookup, we actually
                 * want to exit RCU lookup anyway.
                 *
                 * Note that raw_seqcount_begin still *does* smp_rmb(), so
                 * we are still guaranteed NUL-termination of ->d_name.name.
                 */
                seq = raw_seqcount_begin(&dentry->d_seq);
                if (dentry->d_parent != parent)
                        continue;
                if (d_unhashed(dentry))
                        continue;
                /* hash and length are compared together as one 64-bit value */
                if (dentry->d_name.hash_len != hashlen)
                        continue;
                if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
                        continue;
                /* hand the sampled d_seq back so the caller can validate it */
                *seqp = seq;
                return dentry;
        }
        return NULL;
}

/**
 * d_lookup - search for a dentry
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * Returns: dentry, or NULL
 *
 * d_lookup searches the children of the parent dentry for the name in
 * question. If the dentry is found its reference count is incremented and the
 * dentry is returned. The caller must use dput to free the entry when it has
 * finished using it. %NULL is returned if the dentry does not exist.
 */
struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
{
        struct dentry *dentry;
        unsigned seq;

        do {
                seq = read_seqbegin(&rename_lock);
                dentry = __d_lookup(parent, name);
                if (dentry)
                        break;
        } while (read_seqretry(&rename_lock, seq));
        return dentry;
}
EXPORT_SYMBOL(d_lookup);

/**
 * __d_lookup - search for a dentry (racy)
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * Returns: dentry, or NULL
 *
 * __d_lookup is like d_lookup, however it may (rarely) return a
 * false-negative result due to unrelated rename activity.
 *
 * __d_lookup is slightly faster by avoiding rename_lock read seqlock,
 * however it must be used carefully, eg. with a following d_lookup in
 * the case of failure.
 *
 * __d_lookup callers must be commented.
 */
struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
{
        unsigned int hash = name->hash;
        struct hlist_bl_head *bucket = d_hash(hash);
        struct hlist_bl_node *pos;
        struct dentry *candidate;
        struct dentry *found = NULL;

        /*
         * Note: There is significant duplication with __d_lookup_rcu which is
         * required to prevent single threaded performance regressions
         * especially on architectures where smp_rmb (in seqcounts) are costly.
         * Keep the two functions in sync.
         */

        /*
         * The hash chain is walked under RCU; each candidate is examined
         * with its d_lock held so that d_move() cannot change the parent or
         * name while we compare.
         *
         * A concurrent rename can still disturb the chain walk itself and
         * make us miss the dentry (false negative).  d_lookup() covers that
         * case by retrying under the rename_lock seqlock.
         *
         * See Documentation/filesystems/path-lookup.txt for more details.
         */
        rcu_read_lock();

        hlist_bl_for_each_entry_rcu(candidate, pos, bucket, d_hash) {
                /* Cheap unlocked filter before paying for ->d_lock. */
                if (candidate->d_name.hash != hash)
                        continue;

                spin_lock(&candidate->d_lock);
                if (candidate->d_parent == parent &&
                    !d_unhashed(candidate) &&
                    d_same_name(candidate, parent, name)) {
                        /* Match: take a reference while still locked. */
                        candidate->d_lockref.count++;
                        found = candidate;
                }
                spin_unlock(&candidate->d_lock);

                if (found)
                        break;
        }
        rcu_read_unlock();

        return found;
}

/**
 * d_hash_and_lookup - hash the qstr then search for a dentry
 * @dir: Directory to search in
 * @name: qstr of name we wish to find
 *
 * On lookup failure NULL is returned; on bad name - ERR_PTR(-error)
 */
struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
{
        int err;

        /*
         * The standard hash is always computed first: a filesystem's
         * d_op->d_hash() is allowed to leave name->hash untouched.
         */
        name->hash = full_name_hash(dir, name->name, name->len);

        /* No fs-specific hash function: the standard hash is final. */
        if (!(dir->d_flags & DCACHE_OP_HASH))
                return d_lookup(dir, name);

        err = dir->d_op->d_hash(dir, name);
        if (unlikely(err < 0))
                return ERR_PTR(err);

        return d_lookup(dir, name);
}
EXPORT_SYMBOL(d_hash_and_lookup);

/*
 * When a file is deleted, we have two options:
 * - turn this dentry into a negative dentry
 * - unhash this dentry and free it.
 *
 * Usually, we want to just turn this into
 * a negative dentry, but if anybody else is
 * currently using the dentry or the inode
 * we can't do that and we fall back on removing
 * it from the hash queues and waiting for
 * it to be deleted later when it has no users
 */
 
/**
 * d_delete - delete a dentry
 * @dentry: The dentry to delete
 *
 * Turn the dentry into a negative dentry if possible, otherwise
 * remove it from the hash queues so it can be deleted later
 */
 
void d_delete(struct dentry * dentry)
{
        struct inode *inode = dentry->d_inode;

        spin_lock(&inode->i_lock);
        spin_lock(&dentry->d_lock);

        if (dentry->d_lockref.count != 1) {
                /* Somebody else holds a reference: just unhash it. */
                __d_drop(dentry);
                spin_unlock(&dentry->d_lock);
                spin_unlock(&inode->i_lock);
                return;
        }

        /*
         * We are the only user, so detach the inode.  Neither lock is
         * released here afterwards; presumably dentry_unlink_inode() drops
         * both ->d_lock and ->i_lock itself -- confirm at its definition.
         */
        dentry->d_flags &= ~DCACHE_CANT_MOUNT;
        dentry_unlink_inode(dentry);
}
EXPORT_SYMBOL(d_delete);

/*
 * Insert @entry at the head of the dentry hash chain selected by its name
 * hash.  The chain's own lock is held around the RCU-safe insertion.
 */
static void __d_rehash(struct dentry *entry)
{
        struct hlist_bl_head *bucket = d_hash(entry->d_name.hash);

        hlist_bl_lock(bucket);
        hlist_bl_add_head_rcu(&entry->d_hash, bucket);
        hlist_bl_unlock(bucket);
}

/**
 * d_rehash        - add an entry back to the hash
 * @entry: dentry to add to the hash
 *
 * Adds a dentry to the hash according to its name.
 */
 
void d_rehash(struct dentry * entry)
{
        /* Hold ->d_lock across the hash insertion done by __d_rehash(). */
        spin_lock(&entry->d_lock);
        __d_rehash(entry);
        spin_unlock(&entry->d_lock);
}
EXPORT_SYMBOL(d_rehash);

/*
 * Begin an update section on @dir->i_dir_seq.
 *
 * Spins until the counter is observed even and a cmpxchg() manages to bump
 * it to the next (odd) value.  Returns the even value that was replaced;
 * callers pass it on to end_dir_add().  preempt_disable_nested() is called
 * on entry and is not undone here.
 */
static inline unsigned start_dir_add(struct inode *dir)
{
        preempt_disable_nested();
        for (;;) {
                unsigned n = dir->i_dir_seq;
                /* only an even value may be claimed; odd means someone else is mid-update */
                if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
                        return n;
                cpu_relax();
        }
}

/*
 * Finish an update section started by start_dir_add().
 *
 * @n is the value start_dir_add() returned; storing n + 2 with release
 * semantics makes ->i_dir_seq even again.  Afterwards the nested preemption
 * disable is undone and every waiter on @d_wait is woken.
 */
static inline void end_dir_add(struct inode *dir, unsigned int n,
                               wait_queue_head_t *d_wait)
{
        smp_store_release(&dir->i_dir_seq, n + 2);
        preempt_enable_nested();
        wake_up_all(d_wait);
}

/*
 * Sleep (uninterruptibly) until @dentry is no longer in-lookup.
 *
 * ->d_lock is dropped around each schedule() and retaken before
 * d_in_lookup() is rechecked, so the function expects to be entered with
 * ->d_lock held and returns with it held.
 *
 * NOTE(review): the on-stack wait entry is added to dentry->d_wait but never
 * removed in this function; presumably the queue head is not used again
 * once the waiters have been woken -- confirm against the wake-up side.
 */
static void d_wait_lookup(struct dentry *dentry)
{
        if (d_in_lookup(dentry)) {
                DECLARE_WAITQUEUE(wait, current);
                add_wait_queue(dentry->d_wait, &wait);
                do {
                        /* set the task state before unlocking so a wake-up in between is not lost */
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        spin_unlock(&dentry->d_lock);
                        schedule();
                        spin_lock(&dentry->d_lock);
                } while (d_in_lookup(dentry));
        }
}

/*
 * d_alloc_parallel - find or create a dentry for a lookup in progress
 * @parent: directory the name lives in
 * @name: name being looked up
 * @wq: wait queue head recorded in the new dentry if it becomes in-lookup
 *
 * A new dentry is allocated up front.  Then, in a retry loop:
 *  - if __d_lookup_rcu() finds a hashed match, a reference is taken on it,
 *    the new dentry is dropped and the match is returned;
 *  - otherwise the in-lookup hash chain is searched; on a match we wait for
 *    that lookup to finish and return the dentry if it is still a hashed
 *    match for (@parent, @name), else start over;
 *  - if nothing matches, the new dentry is flagged DCACHE_PAR_LOOKUP, added
 *    to the in-lookup chain and returned.
 *
 * Returns ERR_PTR(-ENOMEM) if the initial allocation fails.
 */
struct dentry *d_alloc_parallel(struct dentry *parent,
                                const struct qstr *name,
                                wait_queue_head_t *wq)
{
        unsigned int hash = name->hash;
        struct hlist_bl_head *b = in_lookup_hash(parent, hash);
        struct hlist_bl_node *node;
        struct dentry *new = d_alloc(parent, name);
        struct dentry *dentry;
        unsigned seq, r_seq, d_seq;

        if (unlikely(!new))
                return ERR_PTR(-ENOMEM);

retry:
        rcu_read_lock();
        /* sample the directory and rename counters before the dcache lookup */
        seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
        r_seq = read_seqbegin(&rename_lock);
        dentry = __d_lookup_rcu(parent, name, &d_seq);
        if (unlikely(dentry)) {
                if (!lockref_get_not_dead(&dentry->d_lockref)) {
                        rcu_read_unlock();
                        goto retry;
                }
                /* the dentry changed since the lookup sampled d_seq: drop it and retry */
                if (read_seqcount_retry(&dentry->d_seq, d_seq)) {
                        rcu_read_unlock();
                        dput(dentry);
                        goto retry;
                }
                rcu_read_unlock();
                dput(new);
                return dentry;
        }
        /* a miss is only trusted if no rename ran during the lookup */
        if (unlikely(read_seqretry(&rename_lock, r_seq))) {
                rcu_read_unlock();
                goto retry;
        }

        /* odd i_dir_seq: an addition to the directory was in progress */
        if (unlikely(seq & 1)) {
                rcu_read_unlock();
                goto retry;
        }

        hlist_bl_lock(b);
        if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
                hlist_bl_unlock(b);
                rcu_read_unlock();
                goto retry;
        }
        /*
         * No changes for the parent since the beginning of d_lookup().
         * Since all removals from the chain happen with hlist_bl_lock(),
         * any potential in-lookup matches are going to stay here until
         * we unlock the chain.  All fields are stable in everything
         * we encounter.
         */
        hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) {
                if (dentry->d_name.hash != hash)
                        continue;
                if (dentry->d_parent != parent)
                        continue;
                if (!d_same_name(dentry, parent, name))
                        continue;
                hlist_bl_unlock(b);
                /* now we can try to grab a reference */
                if (!lockref_get_not_dead(&dentry->d_lockref)) {
                        rcu_read_unlock();
                        goto retry;
                }

                rcu_read_unlock();
                /*
                 * somebody is likely to be still doing lookup for it;
                 * wait for them to finish
                 */
                spin_lock(&dentry->d_lock);
                d_wait_lookup(dentry);
                /*
                 * it's not in-lookup anymore; in principle we should repeat
                 * everything from dcache lookup, but it's likely to be what
                 * d_lookup() would've found anyway.  If it is, just return it;
                 * otherwise we really have to repeat the whole thing.
                 */
                if (unlikely(dentry->d_name.hash != hash))
                        goto mismatch;
                if (unlikely(dentry->d_parent != parent))
                        goto mismatch;
                if (unlikely(d_unhashed(dentry)))
                        goto mismatch;
                if (unlikely(!d_same_name(dentry, parent, name)))
                        goto mismatch;
                /* OK, it *is* a hashed match; return it */
                spin_unlock(&dentry->d_lock);
                dput(new);
                return dentry;
        }
        rcu_read_unlock();
        /* we can't take ->d_lock here; it's OK, though. */
        new->d_flags |= DCACHE_PAR_LOOKUP;
        new->d_wait = wq;
        hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b);
        hlist_bl_unlock(b);
        return new;
mismatch:
        /* reached with dentry->d_lock held and a reference on dentry */
        spin_unlock(&dentry->d_lock);
        dput(dentry);
        goto retry;
}
EXPORT_SYMBOL(d_alloc_parallel);

/*
 * - Unhash the dentry
 * - Retrieve and clear the waitqueue head in dentry
 * - Return the waitqueue head
 */
static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry)
{
        wait_queue_head_t *d_wait;
        struct hlist_bl_head *b;

        /* caller must hold dentry->d_lock */
        lockdep_assert_held(&dentry->d_lock);

        b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash);
        /* flag clearing, chain removal and d_wait hand-off all happen under the chain lock */
        hlist_bl_lock(b);
        dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
        __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
        d_wait = dentry->d_wait;
        dentry->d_wait = NULL;
        hlist_bl_unlock(b);
        /* re-initialise d_u.d_alias and d_lru now that the in-lookup linkage is gone */
        INIT_HLIST_NODE(&dentry->d_u.d_alias);
        INIT_LIST_HEAD(&dentry->d_lru);
        return d_wait;
}

/*
 * Take @dentry out of the in-lookup state under ->d_lock and wake everybody
 * waiting on the wait queue head it was carrying.
 */
void __d_lookup_unhash_wake(struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        wake_up_all(__d_lookup_unhash(dentry));
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(__d_lookup_unhash_wake);

/* inode->i_lock held if inode is non-NULL */

static inline void __d_add(struct dentry *dentry, struct inode *inode)
{
        wait_queue_head_t *d_wait;
        struct inode *dir = NULL;
        unsigned n;
        /* n and d_wait are only assigned, and only read, when dir is non-NULL */
        spin_lock(&dentry->d_lock);
        if (unlikely(d_in_lookup(dentry))) {
                dir = dentry->d_parent->d_inode;
                n = start_dir_add(dir);
                d_wait = __d_lookup_unhash(dentry);
        }
        if (inode) {
                unsigned add_flags = d_flags_for_inode(inode);
                hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
                /* inode and type are updated inside a d_seq write section */
                raw_write_seqcount_begin(&dentry->d_seq);
                __d_set_inode_and_type(dentry, inode, add_flags);
                raw_write_seqcount_end(&dentry->d_seq);
                fsnotify_update_flags(dentry);
        }
        __d_rehash(dentry);
        if (dir)
                end_dir_add(dir, n, d_wait);
        spin_unlock(&dentry->d_lock);
        /* drop the i_lock the caller took on our behalf */
        if (inode)
                spin_unlock(&inode->i_lock);
}

/**
 * d_add - add dentry to hash queues
 * @entry: dentry to add
 * @inode: The inode to attach to this dentry
 *
 * This adds the entry to the hash queues and initializes @inode.
 * The entry was actually filled in earlier during d_alloc().
 */

void d_add(struct dentry *entry, struct inode *inode)
{
        if (inode) {
                security_d_instantiate(entry, inode);
                /* taken here, released inside __d_add() */
                spin_lock(&inode->i_lock);
        }
        __d_add(entry, inode);
}
EXPORT_SYMBOL(d_add);

/**
 * d_exact_alias - find and hash an exact unhashed alias
 * @entry: dentry to add
 * @inode: The inode to go with this dentry
 *
 * If an unhashed dentry with the same name/parent and desired
 * inode already exists, hash and return it.  Otherwise, return
 * NULL.
 *
 * Parent directory should be locked.
 */
struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
{
        unsigned int hash = entry->d_name.hash;
        struct dentry *alias;

        spin_lock(&inode->i_lock);
        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                /*
                 * alias->d_lock is not needed for these checks: an alias
                 * whose d_parent equals entry->d_parent cannot have its name
                 * or parent changed, because the parent inode i_mutex is
                 * held.
                 */
                if (alias->d_name.hash != hash ||
                    alias->d_parent != entry->d_parent ||
                    !d_same_name(alias, entry->d_parent, &entry->d_name))
                        continue;

                /* First exact match decides the outcome. */
                spin_lock(&alias->d_lock);
                if (d_unhashed(alias)) {
                        /* Unhashed: grab a reference and hash it again. */
                        dget_dlock(alias);
                        __d_rehash(alias);
                        spin_unlock(&alias->d_lock);
                } else {
                        /* Already hashed: report "nothing found". */
                        spin_unlock(&alias->d_lock);
                        alias = NULL;
                }
                spin_unlock(&inode->i_lock);
                return alias;
        }
        spin_unlock(&inode->i_lock);
        return NULL;
}
EXPORT_SYMBOL(d_exact_alias);

/*
 * Exchange the names of @dentry and @target.
 *
 * Each name is either external (per dname_external()) or stored in the
 * dentry's own d_iname buffer; the four combinations are handled
 * separately below.  hash_len is swapped at the end in every case.
 */
static void swap_names(struct dentry *dentry, struct dentry *target)
{
        if (unlikely(dname_external(target))) {
                if (unlikely(dname_external(dentry))) {
                        /*
                         * Both external: swap the pointers
                         */
                        swap(target->d_name.name, dentry->d_name.name);
                } else {
                        /*
                         * dentry:internal, target:external.  Steal target's
                         * storage and make target internal.
                         */
                        memcpy(target->d_iname, dentry->d_name.name,
                                        dentry->d_name.len + 1);
                        dentry->d_name.name = target->d_name.name;
                        target->d_name.name = target->d_iname;
                }
        } else {
                if (unlikely(dname_external(dentry))) {
                        /*
                         * dentry:external, target:internal.  Give dentry's
                         * storage to target and make dentry internal
                         */
                        memcpy(dentry->d_iname, target->d_name.name,
                                        target->d_name.len + 1);
                        target->d_name.name = dentry->d_name.name;
                        dentry->d_name.name = dentry->d_iname;
                } else {
                        /*
                         * Both are internal: swap the inline buffers one
                         * long at a time.  The BUILD_BUG_ON guarantees the
                         * buffer size is a multiple of sizeof(long).
                         */
                        unsigned int i;
                        BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
                        for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
                                swap(((long *) &dentry->d_iname)[i],
                                     ((long *) &target->d_iname)[i]);
                        }
                }
        }
        swap(dentry->d_name.hash_len, target->d_name.hash_len);
}

/*
 * Make @dentry carry the same name as @target.
 *
 * If @target's name is external, its reference count is bumped and the
 * whole d_name is shared; otherwise the bytes are copied into @dentry's
 * inline buffer.  Any external name @dentry had before loses one reference
 * and is freed via kfree_rcu() when that was the last one.
 */
static void copy_name(struct dentry *dentry, struct dentry *target)
{
        struct external_name *old_name = NULL;
        /* remember the old external name before d_name is overwritten */
        if (unlikely(dname_external(dentry)))
                old_name = external_name(dentry);
        if (unlikely(dname_external(target))) {
                atomic_inc(&external_name(target)->u.count);
                dentry->d_name = target->d_name;
        } else {
                /* len + 1: presumably includes the terminating NUL */
                memcpy(dentry->d_iname, target->d_name.name,
                                target->d_name.len + 1);
                dentry->d_name.name = dentry->d_iname;
                dentry->d_name.hash_len = target->d_name.hash_len;
        }
        if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
                kfree_rcu(old_name, u.head);
}

/*
 * __d_move - move a dentry
 * @dentry: entry to move
 * @target: new dentry
 * @exchange: exchange the two dentries
 *
 * Update the dcache to reflect the move of a file name. Negative
 * dcache entries should not be moved in this way. Caller must hold
 * rename_lock, the i_mutex of the source and target directories,
 * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
 */
static void __d_move(struct dentry *dentry, struct dentry *target,
                     bool exchange)
{
        struct dentry *old_parent, *p;
        wait_queue_head_t *d_wait;
        struct inode *dir = NULL;
        unsigned n;
        /* n and d_wait are only assigned, and only read, when dir is non-NULL */

        WARN_ON(!dentry->d_inode);
        if (WARN_ON(dentry == target))
                return;

        BUG_ON(d_ancestor(target, dentry));
        old_parent = dentry->d_parent;
        p = d_ancestor(old_parent, target);
        /*
         * Lock the parent(s) first.  Which one goes first depends on
         * whether target sits below dentry's old parent (p != NULL).
         */
        if (IS_ROOT(dentry)) {
                BUG_ON(p);
                spin_lock(&target->d_parent->d_lock);
        } else if (!p) {
                /* target is not a descendent of dentry->d_parent */
                spin_lock(&target->d_parent->d_lock);
                spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED);
        } else {
                BUG_ON(p == dentry);
                spin_lock(&old_parent->d_lock);
                /* p == target means target's parent is old_parent, already locked */
                if (p != target)
                        spin_lock_nested(&target->d_parent->d_lock,
                                        DENTRY_D_LOCK_NESTED);
        }
        /* then the two dentries themselves, at deeper lockdep nesting levels */
        spin_lock_nested(&dentry->d_lock, 2);
        spin_lock_nested(&target->d_lock, 3);

        if (unlikely(d_in_lookup(target))) {
                dir = target->d_parent->d_inode;
                n = start_dir_add(dir);
                d_wait = __d_lookup_unhash(target);
        }

        /* all parent/name changes below happen inside both d_seq write sections */
        write_seqcount_begin(&dentry->d_seq);
        write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);

        /* unhash both */
        if (!d_unhashed(dentry))
                ___d_drop(dentry);
        if (!d_unhashed(target))
                ___d_drop(target);

        /* ... and switch them in the tree */
        dentry->d_parent = target->d_parent;
        if (!exchange) {
                copy_name(dentry, target);
                target->d_hash.pprev = NULL;
                /* the new parent gains a reference, the old one loses it */
                dentry->d_parent->d_lockref.count++;
                if (dentry != old_parent) /* wasn't IS_ROOT */
                        WARN_ON(!--old_parent->d_lockref.count);
        } else {
                target->d_parent = old_parent;
                swap_names(dentry, target);
                if (!hlist_unhashed(&target->d_sib))
                        __hlist_del(&target->d_sib);
                hlist_add_head(&target->d_sib, &target->d_parent->d_children);
                __d_rehash(target);
                fsnotify_update_flags(target);
        }
        /* move dentry onto its new parent's child list and hash it again */
        if (!hlist_unhashed(&dentry->d_sib))
                __hlist_del(&dentry->d_sib);
        hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children);
        __d_rehash(dentry);
        fsnotify_update_flags(dentry);
        fscrypt_handle_d_move(dentry);

        write_seqcount_end(&target->d_seq);
        write_seqcount_end(&dentry->d_seq);

        if (dir)
                end_dir_add(dir, n, d_wait);

        /* unlock the new parent only if it is a different dentry from old_parent */
        if (dentry->d_parent != old_parent)
                spin_unlock(&dentry->d_parent->d_lock);
        /* old_parent == dentry means dentry was IS_ROOT; its lock is dropped below */
        if (dentry != old_parent)
                spin_unlock(&old_parent->d_lock);
        spin_unlock(&target->d_lock);
        spin_unlock(&dentry->d_lock);
}

/*
 * d_move - move a dentry
 * @dentry: entry to move
 * @target: new dentry
 *
 * Update the dcache to reflect the move of a file name. Negative
 * dcache entries should not be moved in this way. See the locking
 * requirements for __d_move.
 */
void d_move(struct dentry *dentry, struct dentry *target)
{
        /* rename_lock is held for write across the whole move */
        write_seqlock(&rename_lock);
        __d_move(dentry, target, false);
        write_sequnlock(&rename_lock);
}
EXPORT_SYMBOL(d_move);

/*
 * d_exchange - exchange two dentries
 * @dentry1: first dentry
 * @dentry2: second dentry
 */
void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
{
        write_seqlock(&rename_lock);

        /* both dentries are expected to be positive and non-root; only warn otherwise */
        WARN_ON(!dentry1->d_inode);
        WARN_ON(!dentry2->d_inode);
        WARN_ON(IS_ROOT(dentry1));
        WARN_ON(IS_ROOT(dentry2));

        /* exchange == true: the two dentries trade parents and names */
        __d_move(dentry1, dentry2, true);

        write_sequnlock(&rename_lock);
}

/**
 * d_ancestor - search for an ancestor
 * @p1: ancestor dentry
 * @p2: child dentry
 *
 * Returns the ancestor dentry of p2 which is a child of p1, if p1 is
 * an ancestor of p2, else NULL.
 */
struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
{
        struct dentry *cur = p2;

        /* Climb from p2 towards the root, stopping at a direct child of p1. */
        while (!IS_ROOT(cur)) {
                if (cur->d_parent == p1)
                        return cur;
                cur = cur->d_parent;
        }

        /* Hit the root without ever seeing p1 as a parent. */
        return NULL;
}

/*
 * This helper attempts to cope with remotely renamed directories
 *
 * It assumes that the caller is already holding
 * dentry->d_parent->d_inode->i_mutex, and rename_lock
 *
 * Note: If ever the locking in lock_rename() changes, then please
 * remember to update this too...
 */
static int __d_unalias(struct dentry *dentry, struct dentry *alias)
{
        /* m1/m2 become non-NULL only once the corresponding trylock succeeded */
        struct mutex *m1 = NULL;
        struct rw_semaphore *m2 = NULL;
        /* stays -ESTALE if either trylock below fails */
        int ret = -ESTALE;

        /* If alias and dentry share a parent, then no extra locks required */
        if (alias->d_parent == dentry->d_parent)
                goto out_unalias;

        /* See lock_rename() */
        if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
                goto out_err;
        m1 = &dentry->d_sb->s_vfs_rename_mutex;
        if (!inode_trylock_shared(alias->d_parent->d_inode))
                goto out_err;
        m2 = &alias->d_parent->d_inode->i_rwsem;
out_unalias:
        __d_move(alias, dentry, false);
        ret = 0;
out_err:
        /* release only what was actually acquired, in reverse order */
        if (m2)
                up_read(m2);
        if (m1)
                mutex_unlock(m1);
        return ret;
}

/**
 * d_splice_alias - splice a disconnected dentry into the tree if one exists
 * @inode:  the inode which may have a disconnected dentry
 * @dentry: a negative dentry which we want to point to the inode.
 *
 * If inode is a directory and has an IS_ROOT alias, then d_move that in
 * place of the given dentry and return it, else simply d_add the inode
 * to the dentry and return NULL.
 *
 * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
 * we should error out: directories can't have multiple aliases.
 *
 * This is needed in the lookup routine of any filesystem that is exportable
 * (via knfsd) so that we can build dcache paths to directories effectively.
 *
 * If a dentry was found and moved, then it is returned.  Otherwise NULL
 * is returned.  This matches the expected return value of ->lookup.
 *
 * Cluster filesystems may call this function with a negative, hashed dentry.
 * In that case, we know that the inode will be a regular file, and also this
 * will only occur during atomic_open. So we need to check for the dentry
 * being already hashed only in the final case.
 */
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
        /* an error-encoded inode is passed straight back as an error dentry */
        if (IS_ERR(inode))
                return ERR_CAST(inode);

        BUG_ON(!d_unhashed(dentry));

        /* NULL inode: just add @dentry as a negative entry */
        if (!inode)
                goto out;

        security_d_instantiate(dentry, inode);
        spin_lock(&inode->i_lock);
        if (S_ISDIR(inode->i_mode)) {
                struct dentry *new = __d_find_any_alias(inode);
                if (unlikely(new)) {
                        /* The reference to new ensures it remains an alias */
                        spin_unlock(&inode->i_lock);
                        write_seqlock(&rename_lock);
                        if (unlikely(d_ancestor(new, dentry))) {
                                /* the existing alias is an ancestor of @dentry: refuse */
                                write_sequnlock(&rename_lock);
                                dput(new);
                                new = ERR_PTR(-ELOOP);
                                pr_warn_ratelimited(
                                        "VFS: Lookup of '%s' in %s %s"
                                        " would have caused loop\n",
                                        dentry->d_name.name,
                                        inode->i_sb->s_type->name,
                                        inode->i_sb->s_id);
                        } else if (!IS_ROOT(new)) {
                                /* pin the alias's current parent across the move */
                                struct dentry *old_parent = dget(new->d_parent);
                                int err = __d_unalias(dentry, new);
                                write_sequnlock(&rename_lock);
                                if (err) {
                                        dput(new);
                                        new = ERR_PTR(err);
                                }
                                dput(old_parent);
                        } else {
                                /* IS_ROOT alias: move it into @dentry's place */
                                __d_move(new, dentry, false);
                                write_sequnlock(&rename_lock);
                        }
                        /*
                         * NOTE(review): presumably drops the inode reference
                         * passed in by the caller, since the inode is not
                         * attached to @dentry on this path -- confirm.
                         */
                        iput(inode);
                        return new;
                }
        }
out:
        /* __d_add() releases inode->i_lock when inode is non-NULL */
        __d_add(dentry, inode);
        return NULL;
}
EXPORT_SYMBOL(d_splice_alias);

/*
 * Test whether new_dentry is a subdirectory of old_dentry.
 *
 * Trivially implemented using the dcache structure
 */

/**
 * is_subdir - is new dentry a subdirectory of old_dentry
 * @new_dentry: new dentry
 * @old_dentry: old dentry
 *
 * Returns true if new_dentry is old_dentry itself or a descendant of it (at
 * any depth).  Returns false otherwise.
 * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
 */
  
bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
{
        bool result;
        unsigned seq;

        if (new_dentry == old_dentry)
                return true;

        do {
                /* for restarting inner loop in case of seq retry */
                seq = read_seqbegin(&rename_lock);
                /*
                 * Need rcu_readlock to protect against the d_parent trashing
                 * due to d_move
                 */
                rcu_read_lock();
                if (d_ancestor(old_dentry, new_dentry))
                        result = true;
                else
                        result = false;
                rcu_read_unlock();
        } while (read_seqretry(&rename_lock, seq));

        return result;
}
EXPORT_SYMBOL(is_subdir);

/*
 * d_walk() callback used by d_genocide().  @data is the root of the walk,
 * which is itself left untouched.  Unhashed or negative dentries are skipped;
 * every other dentry has DCACHE_GENOCIDE set and its reference count
 * decremented, at most once per dentry.
 */
static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
{
        struct dentry *root = data;

        if (dentry == root)
                return D_WALK_CONTINUE;

        if (d_unhashed(dentry) || !dentry->d_inode)
                return D_WALK_SKIP;

        /* Already marked on an earlier visit: don't decrement twice. */
        if (dentry->d_flags & DCACHE_GENOCIDE)
                return D_WALK_CONTINUE;

        dentry->d_flags |= DCACHE_GENOCIDE;
        dentry->d_lockref.count--;
        return D_WALK_CONTINUE;
}

/*
 * Run d_genocide_kill() over the tree rooted at @parent via d_walk().
 * @parent is also passed as the callback data, so the callback leaves the
 * root itself alone.
 */
void d_genocide(struct dentry *parent)
{
        d_walk(parent, parent, d_genocide_kill);
}

/*
 * Rename the dentry behind @file to "#<inode number>".
 *
 * The dentry must use its inline name buffer, have no inode alias linked
 * and be unlinked; anything else is a BUG.  The new name is written straight
 * into d_iname under the parent's and the dentry's d_lock, and d_name.len is
 * set from sprintf()'s return value.  The name hash is not touched here.
 */
void d_mark_tmpfile(struct file *file, struct inode *inode)
{
        struct dentry *dentry = file->f_path.dentry;

        BUG_ON(dentry->d_name.name != dentry->d_iname ||
                !hlist_unhashed(&dentry->d_u.d_alias) ||
                !d_unlinked(dentry));
        spin_lock(&dentry->d_parent->d_lock);
        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
        dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
                                (unsigned long long)inode->i_ino);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&dentry->d_parent->d_lock);
}
EXPORT_SYMBOL(d_mark_tmpfile);

/*
 * Set up the dentry behind @file as a tmpfile for @inode: drop one link
 * from @inode with inode_dec_link_count(), rename the dentry through
 * d_mark_tmpfile(), then attach @inode to it with d_instantiate().
 */
void d_tmpfile(struct file *file, struct inode *inode)
{
        struct dentry *dentry = file->f_path.dentry;

        inode_dec_link_count(inode);
        d_mark_tmpfile(file, inode);
        d_instantiate(dentry, inode);
}
EXPORT_SYMBOL(d_tmpfile);

/*
 * Value of the "dhash_entries=" boot parameter.  It is passed to
 * alloc_large_system_hash() when the dentry hash table is allocated.
 */
static __initdata unsigned long dhash_entries;
/*
 * __setup() handler for "dhash_entries=".  Returns 0 when no string was
 * supplied; otherwise stores the number parsed by simple_strtoul() (base
 * argument 0) in dhash_entries and returns 1.
 */
static int __init set_dhash_entries(char *str)
{
        if (!str)
                return 0;
        /* The end pointer written back into @str is not used afterwards. */
        dhash_entries = simple_strtoul(str, &str, 0);
        return 1;
}
__setup("dhash_entries=", set_dhash_entries);

/*
 * Early allocation of the dentry hash table.  Does nothing when hashdist
 * is set; dcache_init() performs the allocation in that case.
 */
static void __init dcache_init_early(void)
{
        /*
         * If hashes are distributed across NUMA nodes, hash allocation
         * is deferred until vmalloc space is available.
         */
        if (!hashdist) {
                dentry_hashtable =
                        alloc_large_system_hash("Dentry cache",
                                                sizeof(struct hlist_bl_head),
                                                dhash_entries,
                                                13,
                                                HASH_EARLY | HASH_ZERO,
                                                &d_hash_shift,
                                                NULL,
                                                0,
                                                0);
                /* Keep 32 minus the value reported by the allocator. */
                d_hash_shift = 32 - d_hash_shift;
        }
}

/*
 * Create the dentry slab cache and, when hashdist is set, allocate the
 * dentry hash table that dcache_init_early() left for later.
 */
static void __init dcache_init(void)
{
        /*
         * A constructor could be added for stable state like the lists,
         * but it is probably not worth it because of the cache nature
         * of the dcache.
         */
        dentry_cache = KMEM_CACHE_USERCOPY(dentry,
                SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
                d_iname);

        /*
         * Without hashdist the hash was already set up in
         * dcache_init_early(); only allocate it here otherwise.
         */
        if (hashdist) {
                dentry_hashtable =
                        alloc_large_system_hash("Dentry cache",
                                                sizeof(struct hlist_bl_head),
                                                dhash_entries,
                                                13,
                                                HASH_ZERO,
                                                &d_hash_shift,
                                                NULL,
                                                0,
                                                0);
                /* Keep 32 minus the value reported by the allocator. */
                d_hash_shift = 32 - d_hash_shift;
        }
}

/*
 * SLAB cache for __getname() consumers.  Created in vfs_caches_init()
 * under the name "names_cache" with PATH_MAX-sized objects.
 */
struct kmem_cache *names_cachep __ro_after_init;
EXPORT_SYMBOL(names_cachep);

/*
 * Early VFS cache setup: initialise every bucket head of
 * in_lookup_hashtable, then run the early dcache and inode initialisers.
 */
void __init vfs_caches_init_early(void)
{
        int bucket = 0;

        while (bucket < ARRAY_SIZE(in_lookup_hashtable)) {
                INIT_HLIST_BL_HEAD(&in_lookup_hashtable[bucket]);
                bucket++;
        }

        dcache_init_early();
        inode_init_early();
}

/*
 * Main VFS cache setup: create the names_cachep slab cache, then call the
 * dcache, inode, files, mount, block-device and char-device initialisers
 * in the order listed below.
 */
void __init vfs_caches_init(void)
{
        /*
         * PATH_MAX-sized objects.  NOTE(review): the trailing
         * "0, PATH_MAX" arguments look like the usercopy offset and size,
         * i.e. the whole object - confirm against
         * kmem_cache_create_usercopy().
         */
        names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL);

        dcache_init();
        inode_init();
        files_init();
        files_maxfiles_init();
        mnt_init();
        bdev_cache_init();
        chrdev_init();
}























































































    1 

































    1 


    1 

    1 



    1 






































































































































    1 



    1 




















    1 







































    1 

    1 













    1 

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 











    3 

    2 




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 







    1 















    1 


    1 
    1 


























    1 










    1 

















    1 














    1 












    1 







    1 








































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002        Andrew Morton
 *                Split out of fs/inode.c
 *                Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size, expressed in pages: 4096 is the size in
 * KiB and the shift by (PAGE_SHIFT - 10) divides it by the page size in
 * KiB.
 */
#define MIN_WRITEBACK_PAGES        (4096UL >> (PAGE_SHIFT - 10))

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control.
 * Items are queued with wb_queue_work() and completed through
 * finish_writeback_work().
 */
struct wb_writeback_work {
        long nr_pages;
        struct super_block *sb;
        enum writeback_sync_modes sync_mode;
        unsigned int tagged_writepages:1;
        unsigned int for_kupdate:1;
        unsigned int range_cyclic:1;
        unsigned int for_background:1;
        unsigned int for_sync:1;        /* sync(2) WB_SYNC_ALL writeback */
        unsigned int auto_free:1;        /* kfree()d by finish_writeback_work() */
        enum wb_reason reason;                /* why was writeback initiated? */

        struct list_head list;                /* entry on wb->work_list */
        struct wb_completion *done;        /* set if the caller waits */
};

/*
 * If an inode is constantly having its pages dirtied, but then the
 * updates stop dirtytime_expire_interval seconds in the past, it's
 * possible for the worst case time between when an inode has its
 * timestamps updated and when they finally get written out to be two
 * dirtytime_expire_intervals.  We set the default to 12 hours (in
 * seconds), which means most of the time inodes will have their
 * timestamps written to disk after 12 hours, but in the worst case a
 * few inodes might not have their timestamps updated for 24 hours.
 */
unsigned int dirtytime_expire_interval = 12 * 60 * 60;

/* Map an i_io_list list node back to the struct inode that embeds it. */
static inline struct inode *wb_inode(struct list_head *head)
{
        struct inode *owner = list_entry(head, struct inode, i_io_list);

        return owner;
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);

/*
 * Note that @wb now has dirty IO.  Returns false if wb_has_dirty_io() was
 * already true.  Otherwise sets WB_has_dirty_io, adds @wb's
 * avg_write_bandwidth to the bdi-wide tot_write_bandwidth and returns true.
 */
static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
        if (wb_has_dirty_io(wb))
                return false;

        set_bit(WB_has_dirty_io, &wb->state);
        /* A zero average bandwidth is unexpected here; warn once. */
        WARN_ON_ONCE(!wb->avg_write_bandwidth);
        atomic_long_add(wb->avg_write_bandwidth,
                        &wb->bdi->tot_write_bandwidth);
        return true;
}

/*
 * Counterpart of wb_io_lists_populated().  Once b_dirty, b_io and
 * b_more_io are all empty, clear WB_has_dirty_io and subtract @wb's
 * avg_write_bandwidth from the bdi-wide total again.
 */
static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
        if (!wb_has_dirty_io(wb))
                return;

        /* Any occupant left on one of the three lists keeps the bit set. */
        if (!list_empty(&wb->b_dirty) || !list_empty(&wb->b_io) ||
            !list_empty(&wb->b_more_io))
                return;

        clear_bit(WB_has_dirty_io, &wb->state);
        /* The bdi-wide total should never go negative; warn once if so. */
        WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
                                &wb->bdi->tot_write_bandwidth) < 0);
}

/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Both @wb->list_lock and @inode->i_lock must be held; a warning is issued
 * once if @inode is marked %I_FREEING.  @inode->i_io_list is moved to
 * @head.  For every list other than @wb->b_dirty_time, %WB_has_dirty_io is
 * set and the return value is %true if @inode is the first occupant of the
 * !dirty_time IO lists.  A move to @wb->b_dirty_time always returns %false.
 */
static bool inode_io_list_move_locked(struct inode *inode,
                                      struct bdi_writeback *wb,
                                      struct list_head *head)
{
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
        WARN_ON_ONCE(inode->i_state & I_FREEING);

        list_move(&inode->i_io_list, head);

        /* dirty_time doesn't count as dirty_io until expiration */
        if (head == &wb->b_dirty_time) {
                wb_io_lists_depopulated(wb);
                return false;
        }

        return wb_io_lists_populated(wb);
}

/*
 * Make @wb's delayed work run right away (delay 0) on bdi_wq, but only
 * while WB_registered is set.  The test and the requeue both happen under
 * wb->work_lock, taken with spin_lock_irq().
 */
static void wb_wakeup(struct bdi_writeback *wb)
{
        spin_lock_irq(&wb->work_lock);
        if (test_bit(WB_registered, &wb->state))
                mod_delayed_work(bdi_wq, &wb->dwork, 0);
        spin_unlock_irq(&wb->work_lock);
}

/*
 * Called when the first inode for this wb is marked dirty.  It wakes up the
 * corresponding bdi thread, which should then take care of the periodic
 * background write-out of dirty inodes.  Since the write-out would start
 * only 'dirty_writeback_interval' centisecs from now anyway, we just set up
 * a timer which wakes the bdi thread up later.
 *
 * Note, we wouldn't bother setting up the timer, but this function is on the
 * fast-path (used by '__mark_inode_dirty()'), so we save a few context
 * switches by delaying the wake-up.
 *
 * We have to be careful not to postpone flush work if it is scheduled for
 * earlier. Thus we use queue_delayed_work().
 */
static void wb_wakeup_delayed(struct bdi_writeback *wb)
{
        /* dirty_writeback_interval is in centisecs; times 10 gives msecs. */
        unsigned long delay =
                msecs_to_jiffies(dirty_writeback_interval * 10);

        spin_lock_irq(&wb->work_lock);
        if (test_bit(WB_registered, &wb->state))
                queue_delayed_work(bdi_wq, &wb->dwork, delay);
        spin_unlock_irq(&wb->work_lock);
}

/*
 * Complete @work: free it when it was marked auto_free and, if a
 * wb_completion is attached, drop one count from it, waking every waiter
 * on its waitqueue when the count reaches zero.
 */
static void finish_writeback_work(struct wb_writeback_work *work)
{
        /* Fetched up front because @work may be freed just below. */
        struct wb_completion *done = work->done;

        if (work->auto_free)
                kfree(work);
        if (done) {
                /* Saved before the decrement for the reason given below. */
                wait_queue_head_t *waitq = done->waitq;

                /* @done can't be accessed after the following dec */
                if (atomic_dec_and_test(&done->cnt))
                        wake_up_all(waitq);
        }
}

/*
 * Hand @work to @wb.  If the work carries a wb_completion, its count is
 * raised first.  Under wb->work_lock the work is then either appended to
 * wb->work_list with the delayed work kicked to run immediately, or, when
 * @wb is not WB_registered, completed on the spot through
 * finish_writeback_work().
 */
static void wb_queue_work(struct bdi_writeback *wb,
                          struct wb_writeback_work *work)
{
        trace_writeback_queue(wb, work);

        if (work->done)
                atomic_inc(&work->done->cnt);

        spin_lock_irq(&wb->work_lock);

        if (!test_bit(WB_registered, &wb->state)) {
                finish_writeback_work(work);
        } else {
                list_add_tail(&work->list, &wb->work_list);
                mod_delayed_work(bdi_wq, &wb->dwork, 0);
        }

        spin_unlock_irq(&wb->work_lock);
}

/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @done: target wb_completion
 *
 * Wait for one or more work items whose ->done field is set to @done,
 * which should have been initialized with DEFINE_WB_COMPLETION().  This
 * function returns after all such work items are completed.  Work items
 * which are waited upon aren't freed automatically on completion.
 */
void wb_wait_for_completion(struct wb_completion *done)
{
        atomic_dec(&done->cnt);                /* put down the initial count */
        /* Sleep on @done's waitqueue until the count has dropped to zero. */
        wait_event(*done->waitq, !atomic_read(&done->cnt));
}

#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Parameters for foreign inode detection, see wbc_detach_inode() to see
 * how they're used.
 *
 * These parameters are inherently heuristic as the detection target
 * itself is fuzzy.  All we want to do is detach an inode from the
 * current owner if it's being written to by some other cgroups too much.
 *
 * The current cgroup writeback is built on the assumption that multiple
 * cgroups writing to the same inode concurrently is very rare and a mode
 * of operation which isn't well supported.  As such, the goal is not
 * taking too long when a different cgroup takes over an inode while
 * avoiding too aggressive flip-flops from occasional foreign writes.
 *
 * We record, very roughly, 2s worth of IO time history and if more than
 * half of that is foreign, trigger the switch.  The recording is quantized
 * to 16 slots.  To avoid tiny writes from swinging the decision too much,
 * writes smaller than 1/8 of avg size are ignored.
 */
#define WB_FRN_TIME_SHIFT        13        /* 1s = 2^13, up to 8 secs w/ 16bit */
#define WB_FRN_TIME_AVG_SHIFT        3        /* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV        8        /* ignore rounds < avg / 8 */
#define WB_FRN_TIME_PERIOD        (2 * (1 << WB_FRN_TIME_SHIFT))        /* 2s */

#define WB_FRN_HIST_SLOTS        16        /* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT        (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
                                        /* each slot's duration is 2s / 16 */
#define WB_FRN_HIST_THR_SLOTS        (WB_FRN_HIST_SLOTS / 2)
                                        /* if foreign slots >= 8, switch */
#define WB_FRN_HIST_MAX_SLOTS        (WB_FRN_HIST_THR_SLOTS / 2 + 1)
                                        /* one round can affect up to 5 slots */
#define WB_FRN_MAX_IN_FLIGHT        1024        /* don't queue too many concurrently */

/*
 * Maximum inodes per isw.  A specific value has been chosen to make
 * struct inode_switch_wbs_context fit into 1024 bytes kmalloc.
 */
#define WB_MAX_INODES_PER_ISW  ((1024UL - sizeof(struct inode_switch_wbs_context)) \
                                / sizeof(struct inode *))

static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;

/*
 * Associate @inode with a bdi_writeback if it has none yet.  When cgroup
 * writeback is enabled for @inode, the wb is looked up (or created) from
 * @folio's memcg css, or from the current task's memory css when @folio is
 * NULL.  If that is disabled or yields nothing, the bdi's embedded wb is
 * used instead.
 */
void __inode_attach_wb(struct inode *inode, struct folio *folio)
{
        struct backing_dev_info *bdi = inode_to_bdi(inode);
        struct bdi_writeback *wb = NULL;

        if (inode_cgwb_enabled(inode)) {
                struct cgroup_subsys_state *memcg_css;

                /*
                 * The task-derived css must be pinned across the lookup,
                 * see wb_get_create(); the folio-derived one is used as is.
                 */
                if (folio)
                        memcg_css = mem_cgroup_css_from_folio(folio);
                else
                        memcg_css = task_get_css(current, memory_cgrp_id);

                wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);

                /* Drop the pin taken by task_get_css() above. */
                if (!folio)
                        css_put(memcg_css);
        }

        if (!wb)
                wb = &bdi->wb;

        /*
         * There may be multiple instances of this function racing to
         * update the same inode.  Use cmpxchg() to tell the winner;
         * a loser puts the wb it was about to install.
         */
        if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
                wb_put(wb);
}
EXPORT_SYMBOL_GPL(__inode_attach_wb);

/**
 * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
 * @inode: inode of interest with i_lock held
 * @wb: target bdi_writeback
 *
 * Take the inode off wb's io lists and, if necessary, put it onto the
 * b_attached list.  Only inodes attached to cgwb's are kept on this list.
 */
static void inode_cgwb_move_to_attached(struct inode *inode,
                                        struct bdi_writeback *wb)
{
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
        WARN_ON_ONCE(inode->i_state & I_FREEING);

        inode->i_state &= ~I_SYNC_QUEUED;

        /* For the bdi's embedded wb the inode is simply unlinked. */
        if (wb == &wb->bdi->wb)
                list_del_init(&inode->i_io_list);
        else
                list_move(&inode->i_io_list, &wb->b_attached);

        wb_io_lists_depopulated(wb);
}

/**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 * @inode: inode of interest with i_lock held
 *
 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 * held on entry and is released on return.  The returned wb is guaranteed
 * to stay @inode's associated wb until its list_lock is released.
 */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
        __releases(&inode->i_lock)
        __acquires(&wb->list_lock)
{
        while (true) {
                struct bdi_writeback *wb = inode_to_wb(inode);

                /*
                 * inode_to_wb() association is protected by both
                 * @inode->i_lock and @wb->list_lock but list_lock nests
                 * outside i_lock.  Drop i_lock and verify that the
                 * association hasn't changed after acquiring list_lock.
                 */
                /* Temporary reference; put again on both paths below. */
                wb_get(wb);
                spin_unlock(&inode->i_lock);
                spin_lock(&wb->list_lock);

                /* i_wb may have changed in between, can't use inode_to_wb() */
                if (likely(wb == inode->i_wb)) {
                        wb_put(wb);        /* @inode already has ref */
                        return wb;
                }

                /*
                 * The association changed while no lock was held: undo
                 * everything, retake i_lock and start over.
                 */
                spin_unlock(&wb->list_lock);
                wb_put(wb);
                cpu_relax();
                spin_lock(&inode->i_lock);
        }
}

/**
 * inode_to_wb_and_lock_list - determine an inode's wb and lock it
 * @inode: inode of interest
 *
 * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
 * on entry.  Returns @inode's wb with its list_lock held.
 */
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
        __acquires(&wb->list_lock)
{
        /* Taken here; the callee releases it before returning. */
        spin_lock(&inode->i_lock);
        return locked_inode_to_wb_and_lock_list(inode);
}

/*
 * Context describing one asynchronous wb switch request.  It is allocated
 * by inode_switch_wbs() or cleanup_offline_cgwb() and, once queued, freed by
 * inode_switch_wbs_work_fn() after the switch has been carried out.
 */
struct inode_switch_wbs_context {
        /* RCU work item which runs inode_switch_wbs_work_fn() */
        struct rcu_work                work;

        /* wb the inodes are switched to; put by whoever frees the context */
        struct bdi_writeback        *new_wb;
        /*
         * Multiple inodes can be switched at once.  The switching procedure
         * consists of two parts, separated by a RCU grace period.  To make
         * sure that the second part is executed for each inode gone through
         * the first part, all inode pointers are placed into a NULL-terminated
         * array embedded into struct inode_switch_wbs_context.  Otherwise
         * an inode could be left in a non-consistent state.
         */
        struct inode                *inodes[];
};

/*
 * Take @bdi->wb_switch_rwsem for writing.  inode_switch_wbs_work_fn() holds
 * the same rwsem for reading while it switches inodes, so holding it for
 * writing keeps wb switches on @bdi from running concurrently.
 */
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
        down_write(&bdi->wb_switch_rwsem);
}

/*
 * Release @bdi->wb_switch_rwsem previously taken for writing with
 * bdi_down_write_wb_switch_rwsem().
 */
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
        up_write(&bdi->wb_switch_rwsem);
}

/*
 * inode_do_switch_wbs - switch a single inode from @old_wb to @new_wb
 * @inode: inode to switch; I_WB_SWITCH was set on it beforehand
 * @old_wb: wb @inode is currently associated with
 * @new_wb: wb @inode should become associated with
 *
 * Called from inode_switch_wbs_work_fn() with both @old_wb->list_lock and
 * @new_wb->list_lock held.  @inode->i_lock and the i_pages lock are taken
 * and released here.
 *
 * Transfers the dirty/writeback page statistics and the IO list membership
 * of @inode to @new_wb and points @inode->i_wb at it.  A reference on
 * @new_wb is taken for each switched inode; dropping the matching reference
 * on @old_wb is left to the caller.
 *
 * I_WB_SWITCH is cleared on @inode whether or not the switch happened.
 *
 * Returns %true if @inode was switched, %false if it was skipped because it
 * is being freed.
 */
static bool inode_do_switch_wbs(struct inode *inode,
                                struct bdi_writeback *old_wb,
                                struct bdi_writeback *new_wb)
{
        struct address_space *mapping = inode->i_mapping;
        XA_STATE(xas, &mapping->i_pages, 0);
        struct folio *folio;
        bool switched = false;

        spin_lock(&inode->i_lock);
        xa_lock_irq(&mapping->i_pages);

        /*
         * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction
         * path owns the inode and we shouldn't modify ->i_io_list.
         */
        if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE)))
                goto skip_switch;

        trace_inode_switch_wbs(inode, old_wb, new_wb);

        /*
         * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
         * to possibly dirty folios while PAGECACHE_TAG_WRITEBACK points to
         * folios actually under writeback.
         */
        xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
                if (folio_test_dirty(folio)) {
                        long nr = folio_nr_pages(folio);
                        wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr);
                        wb_stat_mod(new_wb, WB_RECLAIMABLE, nr);
                }
        }

        /* restart the walk from index 0 for the writeback-tagged pass */
        xas_set(&xas, 0);
        xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
                long nr = folio_nr_pages(folio);
                WARN_ON_ONCE(!folio_test_writeback(folio));
                wb_stat_mod(old_wb, WB_WRITEBACK, -nr);
                wb_stat_mod(new_wb, WB_WRITEBACK, nr);
        }

        /* an inode with folios under writeback counts against its wb */
        if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
                atomic_dec(&old_wb->writeback_inodes);
                atomic_inc(&new_wb->writeback_inodes);
        }

        /*
         * Reference for the new association.  The reference on @old_wb is
         * dropped by the caller once all inodes have been processed.
         */
        wb_get(new_wb);

        /*
         * Transfer to @new_wb's IO list if necessary.  If the @inode is dirty,
         * the specific list @inode was on is ignored and the @inode is put on
         * ->b_dirty which is always correct including from ->b_dirty_time.
         * The transfer preserves @inode->dirtied_when ordering.  If the @inode
         * was clean, it means it was on the b_attached list, so move it onto
         * the b_attached list of @new_wb.
         */
        if (!list_empty(&inode->i_io_list)) {
                inode->i_wb = new_wb;

                if (inode->i_state & I_DIRTY_ALL) {
                        struct inode *pos;

                        /*
                         * Stop at the first entry whose dirtied_when is not
                         * after @inode's; @inode is then moved using that
                         * entry's predecessor as the list position.  If the
                         * loop runs to completion, @pos refers to the list
                         * head and its predecessor is the last entry.
                         */
                        list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
                                if (time_after_eq(inode->dirtied_when,
                                                  pos->dirtied_when))
                                        break;
                        inode_io_list_move_locked(inode, new_wb,
                                                  pos->i_io_list.prev);
                } else {
                        inode_cgwb_move_to_attached(inode, new_wb);
                }
        } else {
                inode->i_wb = new_wb;
        }

        /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
        inode->i_wb_frn_winner = 0;
        inode->i_wb_frn_avg_time = 0;
        inode->i_wb_frn_history = 0;
        switched = true;
skip_switch:
        /*
         * Paired with load_acquire in unlocked_inode_to_wb_begin() and
         * ensures that the new wb is visible if they see !I_WB_SWITCH.
         */
        smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);

        xa_unlock_irq(&mapping->i_pages);
        spin_unlock(&inode->i_lock);

        return switched;
}

/*
 * inode_switch_wbs_work_fn - second half of a wb switch
 * @work: the work_struct embedded (via rcu_work) in an
 *        inode_switch_wbs_context
 *
 * Queued with queue_rcu_work() by inode_switch_wbs() and
 * cleanup_offline_cgwb().  Switches every inode in the context's
 * NULL-terminated array from its current wb to the context's new_wb, then
 * releases everything the first half pinned: the inode references, the
 * reference on new_wb, the context itself and the isw_nr_in_flight count.
 *
 * All inodes in one context are expected to share the same old wb; it is
 * read from the first inode.
 */
static void inode_switch_wbs_work_fn(struct work_struct *work)
{
        struct inode_switch_wbs_context *isw =
                container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
        struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
        struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
        struct bdi_writeback *new_wb = isw->new_wb;
        unsigned long nr_switched = 0;
        struct inode **inodep;

        /*
         * If @inode switches cgwb membership while sync_inodes_sb() is
         * being issued, sync_inodes_sb() might miss it.  Synchronize.
         */
        down_read(&bdi->wb_switch_rwsem);

        /*
         * By the time control reaches here, RCU grace period has passed
         * since I_WB_SWITCH assertion and all wb stat update transactions
         * between unlocked_inode_to_wb_begin/end() are guaranteed to be
         * synchronizing against the i_pages lock.
         *
         * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
         * gives us exclusion against all wb related operations on @inode
         * including IO list manipulations and stat updates.
         *
         * The two list_locks are taken in address order so that the
         * acquisition order is the same regardless of switch direction.
         */
        if (old_wb < new_wb) {
                spin_lock(&old_wb->list_lock);
                spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
        } else {
                spin_lock(&new_wb->list_lock);
                spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
        }

        for (inodep = isw->inodes; *inodep; inodep++) {
                WARN_ON_ONCE((*inodep)->i_wb != old_wb);
                if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
                        nr_switched++;
        }

        spin_unlock(&new_wb->list_lock);
        spin_unlock(&old_wb->list_lock);

        up_read(&bdi->wb_switch_rwsem);

        if (nr_switched) {
                wb_wakeup(new_wb);
                /* one old_wb reference per inode that actually moved away */
                wb_put_many(old_wb, nr_switched);
        }

        /* drop the inode references taken in inode_prepare_wbs_switch() */
        for (inodep = isw->inodes; *inodep; inodep++)
                iput(*inodep);
        /* drop the context's own reference on the new wb */
        wb_put(new_wb);
        kfree(isw);
        /* pairs with the atomic_inc() done when the context was set up */
        atomic_dec(&isw_nr_in_flight);
}

/*
 * inode_prepare_wbs_switch - claim an inode for switching to @new_wb
 * @inode: candidate inode
 * @new_wb: wb the inode would be switched to
 *
 * On success I_WB_SWITCH is set on @inode and an inode reference is taken
 * with __iget(); both are expected to be undone by the switch work.
 *
 * Returns %true if @inode was claimed, %false if it must be left alone
 * (DAX inode, inactive superblock, switch already pending, inode being
 * freed, or already associated with @new_wb).
 */
static bool inode_prepare_wbs_switch(struct inode *inode,
                                     struct bdi_writeback *new_wb)
{
        bool claimed = false;

        /*
         * Pairs with the smp_mb() in cgroup_writeback_umount().  The
         * increment of isw_nr_in_flight has to be ordered before the
         * SB_ACTIVE test and the inode grab below; otherwise
         * cgroup_writeback_umount() may read isw_nr_in_flight as 0 and
         * skip flushing isw_wq.
         */
        smp_mb();

        if (IS_DAX(inode))
                return false;

        /* I_WB_SWITCH, once set, keeps everyone else off the association */
        spin_lock(&inode->i_lock);
        if ((inode->i_sb->s_flags & SB_ACTIVE) &&
            !(inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE)) &&
            inode_to_wb(inode) != new_wb) {
                inode->i_state |= I_WB_SWITCH;
                __iget(inode);
                claimed = true;
        }
        spin_unlock(&inode->i_lock);

        return claimed;
}

/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb
 *
 * Kick off an asynchronous switch of @inode to the wb identified by
 * @new_wb_id.  Nothing is reported back to the caller; the request may be
 * dropped silently at any of the steps below.
 */
static void inode_switch_wbs(struct inode *inode, int new_wb_id)
{
        struct backing_dev_info *bdi = inode_to_bdi(inode);
        struct inode_switch_wbs_context *ctx;
        struct cgroup_subsys_state *css;

        /*
         * Nothing to do if a switch already seems to be in progress, and
         * don't pile up more requests when too many are in flight.
         */
        if ((inode->i_state & I_WB_SWITCH) ||
            atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
                return;

        /* room for one inode plus the terminating NULL */
        ctx = kzalloc(struct_size(ctx, inodes, 2), GFP_ATOMIC);
        if (!ctx)
                return;

        atomic_inc(&isw_nr_in_flight);

        /* look up the memcg css and pin it */
        rcu_read_lock();
        css = css_from_id(new_wb_id, &memory_cgrp_subsys);
        if (css && !css_tryget(css))
                css = NULL;
        rcu_read_unlock();
        if (!css)
                goto abort;

        /* the css is only needed long enough to find/create the wb */
        ctx->new_wb = wb_get_create(bdi, css, GFP_ATOMIC);
        css_put(css);
        if (!ctx->new_wb || !inode_prepare_wbs_switch(inode, ctx->new_wb))
                goto abort;

        ctx->inodes[0] = inode;

        /*
         * Besides serializing switchers, I_WB_SWITCH makes the RCU
         * protected stat update paths take the i_pages lock so that the
         * stat transfer can synchronize against them.  The rest of the
         * switch therefore runs as RCU work, i.e. only after I_WB_SWITCH
         * is guaranteed to be visible.
         */
        INIT_RCU_WORK(&ctx->work, inode_switch_wbs_work_fn);
        queue_rcu_work(isw_wq, &ctx->work);
        return;

abort:
        atomic_dec(&isw_nr_in_flight);
        if (ctx->new_wb)
                wb_put(ctx->new_wb);
        kfree(ctx);
}

/*
 * isw_prepare_wbs_switch - collect switchable inodes from an IO list
 * @isw: switch context being filled; @isw->new_wb is the target wb
 * @list: IO list (linked through i_io_list) to walk
 * @nr: in/out count of inodes already stored in @isw->inodes
 *
 * Every inode on @list that inode_prepare_wbs_switch() accepts is appended
 * to @isw->inodes.  Returns %true as soon as the array is full (the last
 * slot is kept free for the NULL terminator), %false if the whole list was
 * walked.
 */
static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
                                   struct list_head *list, int *nr)
{
        struct inode *inode;

        list_for_each_entry(inode, list, i_io_list) {
                if (inode_prepare_wbs_switch(inode, isw->new_wb)) {
                        isw->inodes[(*nr)++] = inode;

                        if (*nr >= WB_MAX_INODES_PER_ISW - 1)
                                return true;
                }
        }

        return false;
}

/**
 * cleanup_offline_cgwb - detach associated inodes
 * @wb: target wb
 *
 * Hand the inodes attached to @wb over to the wb of the nearest living
 * ancestor so that the dying @wb can eventually be released.  Returns %true
 * if not all inodes could be queued in one go and the function has to be
 * called again.
 */
bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
        struct inode_switch_wbs_context *isw;
        struct cgroup_subsys_state *ancestor;
        bool more;
        int count = 0;

        isw = kzalloc(struct_size(isw, inodes, WB_MAX_INODES_PER_ISW),
                      GFP_KERNEL);
        if (!isw)
                return false;

        atomic_inc(&isw_nr_in_flight);

        /* walk up the memcg hierarchy until some ancestor yields a wb */
        ancestor = wb->memcg_css->parent;
        while (ancestor) {
                isw->new_wb = wb_get_create(wb->bdi, ancestor, GFP_KERNEL);
                if (isw->new_wb)
                        break;
                ancestor = ancestor->parent;
        }
        /* fall back to the bdi's own wb; wb_get() is a noop for it */
        if (unlikely(!isw->new_wb))
                isw->new_wb = &wb->bdi->wb;

        spin_lock(&wb->list_lock);
        /*
         * Besides the inodes whose writeback has completed, inodes carrying
         * only dirty timestamps are switched as well.  With lazytime those
         * may not get written back for a long time and would keep pinning
         * the dying cgwb.  Bandwidth restrictions are not violated by this
         * since writeback of inode metadata is not accounted for.
         */
        more = isw_prepare_wbs_switch(isw, &wb->b_attached, &count) ||
               isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &count);
        spin_unlock(&wb->list_lock);

        if (count) {
                /*
                 * Besides serializing switchers, I_WB_SWITCH makes the RCU
                 * protected stat update paths take the i_pages lock so that
                 * the stat transfer can synchronize against them.  Carry on
                 * from RCU work, i.e. once I_WB_SWITCH is guaranteed to be
                 * visible.
                 */
                INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
                queue_rcu_work(isw_wq, &isw->work);
                return more;
        }

        /* no inode was claimed: undo the setup */
        atomic_dec(&isw_nr_in_flight);
        wb_put(isw->new_wb);
        kfree(isw);
        return more;
}

/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context.
 */
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
                                 struct inode *inode)
{
        if (!inode_cgwb_enabled(inode)) {
                spin_unlock(&inode->i_lock);
                return;
        }

        wbc->wb = inode_to_wb(inode);
        wbc->inode = inode;

        wbc->wb_id = wbc->wb->memcg_css->id;
        wbc->wb_lcand_id = inode->i_wb_frn_winner;
        wbc->wb_tcand_id = 0;
        wbc->wb_bytes = 0;
        wbc->wb_lcand_bytes = 0;
        wbc->wb_tcand_bytes = 0;

        wb_get(wbc->wb);
        spin_unlock(&inode->i_lock);

        /*
         * A dying wb indicates that either the blkcg associated with the
         * memcg changed or the associated memcg is dying.  In the first
         * case, a replacement wb should already be available and we should
         * refresh the wb immediately.  In the second case, trying to
         * refresh will keep failing.
         */
        if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
                inode_switch_wbs(inode, wbc->wb_id);
}
EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);

/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * As concurrent write sharing of an inode is expected to be very rare and
 * memcg only tracks page ownership on first-use basis severely confining
 * the usefulness of such sharing, cgroup writeback tracks ownership
 * per-inode.  While the support for concurrent write sharing of an inode
 * is deemed unnecessary, an inode being written to by different cgroups at
 * different points in time is a lot more common, and, more importantly,
 * charging only by first-use can too readily lead to grossly incorrect
 * behaviors (single foreign page can lead to gigabytes of writeback to be
 * incorrectly attributed).
 *
 * To resolve this issue, cgroup writeback detects the majority dirtier of
 * an inode and transfers the ownership to it.  To avoid unnecessary
 * oscillation, the detection mechanism keeps track of history and gives
 * out the switch verdict only if the foreign usage pattern is stable over
 * a certain amount of time and/or writeback attempts.
 *
 * On each writeback attempt, @wbc tries to detect the majority writer
 * using Boyer-Moore majority vote algorithm.  In addition to the byte
 * count from the majority voting, it also counts the bytes written for the
 * current wb and the last round's winner wb (max of last round's current
 * wb, the winner from two rounds ago, and the last round's majority
 * candidate).  Keeping track of the historical winner helps the algorithm
 * to semi-reliably detect the most active writer even when it's not the
 * absolute majority.
 *
 * Once the winner of the round is determined, whether the winner is
 * foreign or not and how much IO time the round consumed is recorded in
 * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
 * over a certain threshold, the switch verdict is given.
 */
void wbc_detach_inode(struct writeback_control *wbc)
{
        struct bdi_writeback *wb = wbc->wb;
        struct inode *inode = wbc->inode;
        unsigned long avg_time, max_bytes, max_time;
        u16 history;
        int max_id;

        /* nothing to undo if no wb was recorded in @wbc */
        if (!wb)
                return;

        history = inode->i_wb_frn_history;
        avg_time = inode->i_wb_frn_avg_time;

        /*
         * pick the winner of this round: the current wb wins ties, then
         * the last round's winner, then the majority-vote candidate
         */
        if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
            wbc->wb_bytes >= wbc->wb_tcand_bytes) {
                max_id = wbc->wb_id;
                max_bytes = wbc->wb_bytes;
        } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
                max_id = wbc->wb_lcand_id;
                max_bytes = wbc->wb_lcand_bytes;
        } else {
                max_id = wbc->wb_tcand_id;
                max_bytes = wbc->wb_tcand_bytes;
        }

        /*
         * Calculate the amount of IO time the winner consumed and fold it
         * into the running average kept per inode.  If the consumed IO
         * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for
         * deciding whether to switch or not.  This is to prevent one-off
         * small dirtiers from skewing the verdict.
         */
        max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
                                wb->avg_write_bandwidth);
        if (avg_time)
                avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
                            (avg_time >> WB_FRN_TIME_AVG_SHIFT);
        else
                avg_time = max_time;        /* immediate catch up on first run */

        if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
                int slots;

                /*
                 * The switch verdict is reached if foreign wb's consume
                 * more than a certain proportion of IO time in a
                 * WB_FRN_TIME_PERIOD.  This is loosely tracked by 16 slot
                 * history mask where each bit represents one sixteenth of
                 * the period.  Determine the number of slots to shift into
                 * history from @max_time.
                 */
                slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
                            (unsigned long)WB_FRN_HIST_MAX_SLOTS);
                history <<= slots;
                /* a foreign winner fills the newly shifted-in slots with 1s */
                if (wbc->wb_id != max_id)
                        history |= (1U << slots) - 1;

                if (history)
                        trace_inode_foreign_history(inode, wbc, history);

                /*
                 * Switch if the current wb isn't the consistent winner.
                 * If there are multiple closely competing dirtiers, the
                 * inode may switch across them repeatedly over time, which
                 * is okay.  The main goal is avoiding keeping an inode on
                 * the wrong wb for an extended period of time.
                 */
                if (hweight16(history) > WB_FRN_HIST_THR_SLOTS)
                        inode_switch_wbs(inode, max_id);
        }

        /*
         * Multiple instances of this function may race to update the
         * following fields but we don't mind occasional inaccuracies.
         */
        inode->i_wb_frn_winner = max_id;
        /* the average is clamped to U16_MAX before being stored */
        inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
        inode->i_wb_frn_history = history;

        /* drop the reference taken in wbc_attach_and_unlock_inode() */
        wb_put(wbc->wb);
        wbc->wb = NULL;
}
EXPORT_SYMBOL_GPL(wbc_detach_inode);

/**
 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are about to be written out during the writeback
 * controlled by @wbc.  Do the bookkeeping needed for foreign inode
 * detection.  See wbc_detach_inode().
 */
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
                              size_t bytes)
{
        struct cgroup_subsys_state *owner;
        int owner_id;

        /*
         * The pageout() path doesn't attach @wbc to the inode being written
         * out.  That is on purpose: the function shouldn't block behind a
         * slow cgroup.  Ultimately, pageout() should kick off regular
         * writeback rather than write things out itself.
         */
        if (!wbc->wb || wbc->no_cgroup_owner)
                return;

        owner = mem_cgroup_css_from_folio(page_folio(page));

        /* dead cgroups get no say in inode ownership arbitration */
        if (!(owner->flags & CSS_ONLINE))
                return;

        owner_id = owner->id;

        /* bytes dirtied by the wb's own cgroup */
        if (owner_id == wbc->wb_id) {
                wbc->wb_bytes += bytes;
                return;
        }

        /* bytes dirtied by the last round's winner */
        if (owner_id == wbc->wb_lcand_id)
                wbc->wb_lcand_bytes += bytes;

        /* Boyer-Moore majority vote algorithm */
        if (!wbc->wb_tcand_bytes)
                wbc->wb_tcand_id = owner_id;

        if (owner_id != wbc->wb_tcand_id)
                wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
        else
                wbc->wb_tcand_bytes += bytes;
}
EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);

/**
 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 * @wb: target bdi_writeback to split @nr_pages to
 * @nr_pages: number of pages to write for the whole bdi
 *
 * Compute @wb's share of @nr_pages from the ratio of @wb's write bandwidth
 * to the total write bandwidth of all wb's w/ dirty inodes on @wb->bdi.
 * %LONG_MAX is passed through unchanged.
 */
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
        unsigned long this_bw = wb->avg_write_bandwidth;
        unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);

        if (nr_pages == LONG_MAX)
                return LONG_MAX;

        /* only split when a proportional share is meaningful */
        if (tot_bw && this_bw < tot_bw)
                return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);

        /*
         * This may be reached for clean wb's where a proportional split
         * makes no sense; hand back @nr_pages untouched.  In general, erring
         * on the side of writing more is preferred.
         */
        return nr_pages;
}

/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
 * distributed to the busy wbs according to each wb's proportion in the
 * total active write bandwidth of @bdi.
 *
 * May sleep: when a per-wb work item can't be allocated, the work is issued
 * from an on-stack item and waited for synchronously.
 */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                                  struct wb_writeback_work *base_work,
                                  bool skip_if_busy)
{
        /* wb pinned across a synchronous fallback; put on the next pass */
        struct bdi_writeback *last_wb = NULL;
        /*
         * Start from a pseudo entry built around the list head so that the
         * "continue" iterator below begins with the first wb on the list.
         */
        struct bdi_writeback *wb = list_entry(&bdi->wb_list,
                                              struct bdi_writeback, bdi_node);

        might_sleep();
restart:
        rcu_read_lock();
        list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
                DEFINE_WB_COMPLETION(fallback_work_done, bdi);
                struct wb_writeback_work fallback_work;
                struct wb_writeback_work *work;
                long nr_pages;

                /* iteration has moved past the pinned wb, release it */
                if (last_wb) {
                        wb_put(last_wb);
                        last_wb = NULL;
                }

                /* SYNC_ALL writes out I_DIRTY_TIME too */
                if (!wb_has_dirty_io(wb) &&
                    (base_work->sync_mode == WB_SYNC_NONE ||
                     list_empty(&wb->b_dirty_time)))
                        continue;
                if (skip_if_busy && writeback_in_progress(wb))
                        continue;

                nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);

                /* common case: hand an auto-freed copy of the work to @wb */
                work = kmalloc(sizeof(*work), GFP_ATOMIC);
                if (work) {
                        *work = *base_work;
                        work->nr_pages = nr_pages;
                        work->auto_free = 1;
                        wb_queue_work(wb, work);
                        continue;
                }

                /*
                 * If wb_tryget fails, the wb has been shutdown, skip it.
                 *
                 * Pin @wb so that it stays on @bdi->wb_list.  This allows
                 * continuing iteration from @wb after dropping and
                 * regrabbing rcu read lock.
                 */
                if (!wb_tryget(wb))
                        continue;

                /* alloc failed, execute synchronously using on-stack fallback */
                work = &fallback_work;
                *work = *base_work;
                work->nr_pages = nr_pages;
                work->auto_free = 0;
                work->done = &fallback_work_done;

                wb_queue_work(wb, work);
                last_wb = wb;

                /* waiting may sleep, so leave the RCU read section first */
                rcu_read_unlock();
                wb_wait_for_completion(&fallback_work_done);
                goto restart;
        }
        rcu_read_unlock();

        if (last_wb)
                wb_put(last_wb);
}

/**
 * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
 * @bdi_id: target bdi id
 * @memcg_id: target memcg css id
 * @reason: reason why some writeback work initiated
 * @done: target wb_completion
 *
 * Start flushing the bdi_writeback identified by @bdi_id and @memcg_id
 * with the specified parameters.
 *
 * Returns 0 if the work was queued, -ENOENT if the bdi, the memcg or the
 * wb could not be found, and -ENOMEM if the work item could not be
 * allocated.
 */
int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
                           enum wb_reason reason, struct wb_completion *done)
{
        struct backing_dev_info *bdi;
        struct cgroup_subsys_state *css;
        struct bdi_writeback *wb;
        struct wb_writeback_work *work;
        unsigned long nr_dirty;
        int ret = -ENOENT;

        /* resolve the bdi first, then the memcg */
        bdi = bdi_get_by_id(bdi_id);
        if (!bdi)
                return ret;

        rcu_read_lock();
        css = css_from_id(memcg_id, &memory_cgrp_subsys);
        if (css && !css_tryget(css))
                css = NULL;
        rcu_read_unlock();
        if (!css)
                goto put_bdi;

        /*
         * Look up the matching wb.  A missing wb means there is nothing to
         * flush, so one is deliberately not created here.
         */
        wb = wb_get_lookup(bdi, css);
        if (!wb)
                goto put_css;

        /*
         * The caller wants most of the currently dirty pages written out.
         * Take the current dirty page count and inflate it by 25%, which
         * should be enough to flush most dirty pages without getting
         * livelocked by concurrent dirtiers.
         *
         * The memcg stats are only flushed periodically and this is a
         * best-effort estimate, so some error is acceptable.
         */
        nr_dirty = memcg_page_state(mem_cgroup_from_css(css), NR_FILE_DIRTY);
        nr_dirty = nr_dirty * 10 / 8;

        /* build and queue the writeback work */
        work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
        if (!work) {
                ret = -ENOMEM;
        } else {
                work->nr_pages = nr_dirty;
                work->sync_mode = WB_SYNC_NONE;
                work->range_cyclic = 1;
                work->reason = reason;
                work->done = done;
                work->auto_free = 1;
                wb_queue_work(wb, work);
                ret = 0;
        }

        wb_put(wb);
put_css:
        css_put(css);
put_bdi:
        bdi_put(bdi);
        return ret;
}

/**
 * cgroup_writeback_umount - flush inode wb switches for umount
 *
 * Called when a super_block is about to be destroyed to flush in-flight
 * inode wb switches.  A wb switch passes through RCU and then a workqueue,
 * so both have to be flushed, in that order, to make sure every previously
 * scheduled switch has finished.  Because wb switches are rare and waiting
 * on RCU can take a while, the flush is only done when switches are
 * actually in flight.
 */
void cgroup_writeback_umount(void)
{
        /*
         * The clearing of SB_ACTIVE must be reliably ordered before the
         * isw_nr_in_flight check, see generic_shutdown_super().
         */
        smp_mb();

        if (!atomic_read(&isw_nr_in_flight))
                return;

        /*
         * rcu_barrier() waits for all pending callbacks, which ensures
         * that every in-flight wb switch has reached the workqueue.
         */
        rcu_barrier();
        flush_workqueue(isw_wq);
}

/*
 * Create the workqueue used for inode wb switches.  Returns 0 on success
 * and -ENOMEM if the workqueue could not be allocated.
 */
static int __init cgroup_writeback_init(void)
{
        isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);

        return isw_wq ? 0 : -ENOMEM;
}
fs_initcall(cgroup_writeback_init);

#else        /* CONFIG_CGROUP_WRITEBACK */

/* !CONFIG_CGROUP_WRITEBACK stub: does nothing */
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
/* !CONFIG_CGROUP_WRITEBACK stub: does nothing */
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }

/*
 * !CONFIG_CGROUP_WRITEBACK variant of inode_cgwb_move_to_attached().
 *
 * Clears I_SYNC_QUEUED and takes @inode off whatever IO list it is on; in
 * this variant the inode is never moved onto a b_attached list.  The caller
 * must hold both @wb->list_lock and @inode->i_lock, and @inode is not
 * expected to have I_FREEING set.
 */
static void inode_cgwb_move_to_attached(struct inode *inode,
                                        struct bdi_writeback *wb)
{
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
        WARN_ON_ONCE(inode->i_state & I_FREEING);

        inode->i_state &= ~I_SYNC_QUEUED;
        list_del_init(&inode->i_io_list);
        /* NOTE(review): presumably refreshes @wb's dirty-IO state - helper not shown here */
        wb_io_lists_depopulated(wb);
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: @inode->i_lock is held on entry and
 * released here; @inode's wb is returned with its list_lock held.  Unlike
 * the cgroup writeback variant, the association is not re-checked after
 * list_lock has been taken.
 */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
        __releases(&inode->i_lock)
        __acquires(&wb->list_lock)
{
        struct bdi_writeback *wb = inode_to_wb(inode);

        spin_unlock(&inode->i_lock);
        spin_lock(&wb->list_lock);
        return wb;
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: look up @inode's wb and return it with
 * its list_lock held.  @inode->i_lock is not taken in this variant.
 */
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
        __acquires(&wb->list_lock)
{
        struct bdi_writeback *wb = inode_to_wb(inode);

        spin_lock(&wb->list_lock);
        return wb;
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: no splitting is done, @nr_pages is
 * returned as is and @wb is unused.
 */
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
        return nr_pages;
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: there is only @bdi's embedded wb to
 * issue work to, so @base_work itself is queued on it, with auto_free
 * cleared.  With @skip_if_busy set, nothing is queued while that wb
 * already has writeback in progress.
 */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                                  struct wb_writeback_work *base_work,
                                  bool skip_if_busy)
{
        might_sleep();

        if (skip_if_busy && writeback_in_progress(&bdi->wb))
                return;

        base_work->auto_free = 0;
        wb_queue_work(&bdi->wb, base_work);
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

/*
 * Number of dirty file pages plus the number of potentially dirty inodes.
 * The inodes are added in because each inode write can dirty pagecache in
 * the underlying blockdev.
 */
static unsigned long get_nr_dirty_pages(void)
{
        unsigned long nr = global_node_page_state(NR_FILE_DIRTY);

        nr += get_nr_dirty_inodes();

        return nr;
}

/*
 * Ask @wb's flusher to start writing back everything that is dirty, and
 * record @reason for it.  Does nothing when @wb has no dirty IO or when a
 * start-all request is already flagged in wb->state.
 */
static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
{
        if (!wb_has_dirty_io(wb))
                return;

        /*
         * Every caller wants writeback of all dirty pages, and some (e.g.
         * vmscan) call in at a very high rate.  Rather than piling up work
         * items that only keep the flusher busy fetching them, allow a
         * single start-all request to be pending/inflight at a time.
         *
         * Plain test first; the test-and-set is only attempted when the
         * bit looked clear.
         */
        if (test_bit(WB_start_all, &wb->state))
                return;
        if (test_and_set_bit(WB_start_all, &wb->state))
                return;

        wb->start_all_reason = reason;
        wb_wakeup(wb);
}

/**
 * wb_start_background_writeback - start background writeback
 * @wb: bdi_writeback to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens. When
 *   this function returns, it is only guaranteed that for given wb
 *   some IO is happening if we are over background dirty threshold.
 *   Caller need not hold sb s_umount semaphore.
 */
void wb_start_background_writeback(struct bdi_writeback *wb)
{
        /*
         * We just wake up the flusher thread. It will perform background
         * writeback as soon as there is no other work to do.
         */
        trace_writeback_wake_background(wb);
        wb_wakeup(wb);
}

/*
 * Remove the inode from the writeback list it is on.
 *
 * Takes the list_lock of the inode's wb and then inode->i_lock, clears
 * I_SYNC_QUEUED, unlinks i_io_list and calls wb_io_lists_depopulated() for
 * that wb.  Both locks are dropped again before returning.
 */
void inode_io_list_del(struct inode *inode)
{
        struct bdi_writeback *wb;

        /* Lock order used here: wb->list_lock first, then inode->i_lock. */
        wb = inode_to_wb_and_lock_list(inode);
        spin_lock(&inode->i_lock);

        inode->i_state &= ~I_SYNC_QUEUED;
        list_del_init(&inode->i_io_list);
        wb_io_lists_depopulated(wb);

        spin_unlock(&inode->i_lock);
        spin_unlock(&wb->list_lock);
}
EXPORT_SYMBOL(inode_io_list_del);

/*
 * Mark an inode as under writeback on its sb: link it onto the tail of
 * sb->s_inodes_wb unless it is already on a list via i_wb_list.
 */
void sb_mark_inode_writeback(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        unsigned long flags;

        /* Unlocked peek: already linked, nothing to do. */
        if (!list_empty(&inode->i_wb_list))
                return;

        spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
        /* Check again now that s_inode_wblist_lock is held. */
        if (list_empty(&inode->i_wb_list)) {
                list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
                trace_sb_mark_inode_writeback(inode);
        }
        spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
}

/*
 * Clear an inode as under writeback on its sb: unlink i_wb_list if the
 * inode is currently linked.
 */
void sb_clear_inode_writeback(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        unsigned long flags;

        /* Unlocked peek: not linked, nothing to remove. */
        if (list_empty(&inode->i_wb_list))
                return;

        spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
        /* Check again now that s_inode_wblist_lock is held. */
        if (!list_empty(&inode->i_wb_list)) {
                list_del_init(&inode->i_wb_list);
                trace_sb_clear_inode_writeback(inode);
        }
        spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
}

/*
 * Redirty an inode: put it back on @wb's b_dirty list, refreshing its
 * ->dirtied_when stamp when needed.
 *
 * The stamp is only reset to jiffies when it is older than that of the
 * inode currently at the front of b_dirty (wb->b_dirty.next).  Otherwise
 * the inode is already the most recently dirtied one, which means it was
 * redirtied while being written out, and its dirtied_when is kept.
 *
 * Must be called with inode->i_lock held (asserted).
 */
static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
{
        struct inode *newest;

        assert_spin_locked(&inode->i_lock);

        inode->i_state &= ~I_SYNC_QUEUED;

        /*
         * An inode that is being freed gets no dirty list tracking at all:
         * the flush worker ignores it anyway, and moving it would trip the
         * assertions in inode_io_list_move_locked().  Just unlink it.
         */
        if (inode->i_state & I_FREEING) {
                list_del_init(&inode->i_io_list);
                wb_io_lists_depopulated(wb);
                return;
        }

        if (!list_empty(&wb->b_dirty)) {
                newest = wb_inode(wb->b_dirty.next);
                if (time_before(inode->dirtied_when, newest->dirtied_when))
                        inode->dirtied_when = jiffies;
        }

        inode_io_list_move_locked(inode, wb, &wb->b_dirty);
}

/*
 * Wrapper around redirty_tail_locked() for callers that do not already hold
 * inode->i_lock: takes the lock, redirties @inode on @wb, drops the lock.
 */
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
        spin_lock(&inode->i_lock);
        redirty_tail_locked(inode, wb);
        spin_unlock(&inode->i_lock);
}

/*
 * Requeue @inode for re-scanning after the wb's b_io list is exhausted, by
 * moving it onto wb->b_more_io.
 */
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
        inode_io_list_move_locked(inode, wb, &wb->b_more_io);
}

/*
 * Finish a writeback pass on @inode: clear I_SYNC and wake anyone waiting
 * on that bit.  The callers visible in this file invoke it with
 * inode->i_lock held.
 */
static void inode_sync_complete(struct inode *inode)
{
        inode->i_state &= ~I_SYNC;
        /* If inode is clean and unused, put it into LRU now... */
        inode_add_lru(inode);
        /* Waiters must see I_SYNC cleared before being woken up */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_SYNC);
}

static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
        bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
        /*
         * For inodes being constantly redirtied, dirtied_when can get stuck.
         * It _appears_ to be in the future, but is actually in distant past.
         * This test is necessary to prevent such wrapped-around relative times
         * from permanently stopping the whole bdi writeback.
         */
        ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
        return ret;
}

/*
 * Move expired (dirtied before dirtied_before) dirty inodes from
 * @delaying_queue to @dispatch_queue.
 *
 * Each moved inode gets I_SYNC_QUEUED set under its i_lock.  If the moved
 * inodes belong to more than one superblock (block device superblocks are
 * not counted for this decision), they are spliced onto @dispatch_queue
 * grouped by superblock.
 *
 * Returns the number of inodes moved.
 */
static int move_expired_inodes(struct list_head *delaying_queue,
                               struct list_head *dispatch_queue,
                               unsigned long dirtied_before)
{
        LIST_HEAD(tmp);
        struct list_head *pos, *node;
        struct super_block *sb = NULL;
        struct inode *inode;
        int do_sb_sort = 0;
        int moved = 0;

        /*
         * Pull inodes off the ->prev end of @delaying_queue onto the private
         * 'tmp' list, stopping at the first one dirtied after the cutoff.
         */
        while (!list_empty(delaying_queue)) {
                inode = wb_inode(delaying_queue->prev);
                if (inode_dirtied_after(inode, dirtied_before))
                        break;
                spin_lock(&inode->i_lock);
                list_move(&inode->i_io_list, &tmp);
                moved++;
                inode->i_state |= I_SYNC_QUEUED;
                spin_unlock(&inode->i_lock);
                /* blkdev sb inodes do not influence the need to sort. */
                if (sb_is_blkdev_sb(inode->i_sb))
                        continue;
                if (sb && sb != inode->i_sb)
                        do_sb_sort = 1;
                sb = inode->i_sb;
        }

        /* just one sb in list, splice to dispatch_queue and we're done */
        if (!do_sb_sort) {
                list_splice(&tmp, dispatch_queue);
                goto out;
        }

        /*
         * Although inode's i_io_list is moved from 'tmp' to 'dispatch_queue',
         * we don't take inode->i_lock here because it is just a pointless overhead.
         * Inode is already marked as I_SYNC_QUEUED so writeback list handling is
         * fully under our control.
         */
        while (!list_empty(&tmp)) {
                /* Pick the sb of the last entry and move all of its inodes. */
                sb = wb_inode(tmp.prev)->i_sb;
                list_for_each_prev_safe(pos, node, &tmp) {
                        inode = wb_inode(pos);
                        if (inode->i_sb == sb)
                                list_move(&inode->i_io_list, dispatch_queue);
                }
        }
out:
        return moved;
}

/*
 * Queue all expired dirty inodes for io, eldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 *
 * @dirtied_before is the expiry cutoff applied to b_dirty.  For b_dirty_time
 * the same cutoff is used when @work is for_sync; otherwise the cutoff is
 * jiffies - dirtytime_expire_interval * HZ.
 *
 * Must be called with wb->list_lock held (asserted).
 */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
                     unsigned long dirtied_before)
{
        int moved;
        unsigned long time_expire_jif = dirtied_before;

        assert_spin_locked(&wb->list_lock);
        /* Everything parked on b_more_io goes back onto b_io first. */
        list_splice_init(&wb->b_more_io, &wb->b_io);
        moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);
        if (!work->for_sync)
                time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
        moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
                                     time_expire_jif);
        if (moved)
                wb_io_lists_populated(wb);
        trace_writeback_queue_io(wb, work, dirtied_before, moved);
}

/*
 * Call the superblock's ->write_inode() for @inode, bracketed by the
 * writeback_write_inode tracepoints.
 *
 * Returns 0 without calling anything when the filesystem provides no
 * ->write_inode or the inode is bad; otherwise returns the method's result.
 */
static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
        int err;

        if (!inode->i_sb->s_op->write_inode || is_bad_inode(inode))
                return 0;

        trace_writeback_write_inode_start(inode, wbc);
        err = inode->i_sb->s_op->write_inode(inode, wbc);
        trace_writeback_write_inode(inode, wbc);

        return err;
}

/*
 * Wait for writeback on an inode to complete. Called with i_lock held.
 * Caller must make sure inode cannot go away when we drop i_lock.
 *
 * Returns with i_lock held again and I_SYNC observed clear.
 */
static void __inode_wait_for_writeback(struct inode *inode)
        __releases(inode->i_lock)
        __acquires(inode->i_lock)
{
        DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
        wait_queue_head_t *wqh;

        wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
        /* i_lock is dropped while waiting and re-taken before each re-test. */
        while (inode->i_state & I_SYNC) {
                spin_unlock(&inode->i_lock);
                __wait_on_bit(wqh, &wq, bit_wait,
                              TASK_UNINTERRUPTIBLE);
                spin_lock(&inode->i_lock);
        }
}

/*
 * Wait for writeback on an inode to complete. Caller must have inode pinned.
 *
 * Takes and releases inode->i_lock itself around
 * __inode_wait_for_writeback().
 */
void inode_wait_for_writeback(struct inode *inode)
{
        spin_lock(&inode->i_lock);
        __inode_wait_for_writeback(inode);
        spin_unlock(&inode->i_lock);
}

/*
 * Sleep until I_SYNC is cleared. This function must be called with i_lock
 * held and drops it. It is aimed for callers not holding any inode reference
 * so once i_lock is dropped, inode can go away.
 */
static void inode_sleep_on_writeback(struct inode *inode)
        __releases(inode->i_lock)
{
        DEFINE_WAIT(wait);
        wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
        int sleep;

        /*
         * Get onto the waitqueue and sample I_SYNC while i_lock is still
         * held; @inode is not dereferenced again after the unlock below.
         */
        prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
        sleep = inode->i_state & I_SYNC;
        spin_unlock(&inode->i_lock);
        if (sleep)
                schedule();
        finish_wait(wqh, &wait);
}

/*
 * Find proper writeback list for the inode depending on its current state and
 * possibly also change of its state while we were doing writeback.  Here we
 * handle things such as livelock prevention or fairness of writeback among
 * inodes. This function can be called only by flusher thread - no one else
 * processes all inodes in writeback lists and requeueing inodes behind flusher
 * thread's back can have unexpected consequences.
 *
 * Inodes with I_FREEING set are left untouched.  The helpers called from
 * here assert that inode->i_lock (and, for the clean case, wb->list_lock)
 * is held.
 */
static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
                          struct writeback_control *wbc,
                          unsigned long dirtied_before)
{
        if (inode->i_state & I_FREEING)
                return;

        /*
         * Sync livelock prevention. Each inode is tagged and synced in one
         * shot. If still dirty, it will be redirty_tail()'ed below.  Update
         * the dirty time to prevent enqueue and sync it again.
         */
        if ((inode->i_state & I_DIRTY) &&
            (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
                inode->dirtied_when = jiffies;

        if (wbc->pages_skipped) {
                /*
                 * Writeback is not making progress due to locked buffers.
                 * Skip this inode for now. Although having skipped pages
                 * is odd for clean inodes, it can happen for some
                 * filesystems so handle that gracefully.
                 */
                if (inode->i_state & I_DIRTY_ALL)
                        redirty_tail_locked(inode, wb);
                else
                        inode_cgwb_move_to_attached(inode, wb);
                return;
        }

        if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
                /*
                 * We didn't write back all the pages.  nfs_writepages()
                 * sometimes bails out without doing anything.
                 */
                if (wbc->nr_to_write <= 0 &&
                    !inode_dirtied_after(inode, dirtied_before)) {
                        /* Slice used up. Queue for next turn. */
                        requeue_io(inode, wb);
                } else {
                        /*
                         * Writeback blocked by something other than
                         * congestion. Delay the inode for some time to
                         * avoid spinning on the CPU (100% iowait)
                         * retrying writeback of the dirty page/inode
                         * that cannot be performed immediately.
                         */
                        redirty_tail_locked(inode, wb);
                }
        } else if (inode->i_state & I_DIRTY) {
                /*
                 * Filesystems can dirty the inode during writeback operations,
                 * such as delayed allocation during submission or metadata
                 * updates after data IO completion.
                 */
                redirty_tail_locked(inode, wb);
        } else if (inode->i_state & I_DIRTY_TIME) {
                /* Only timestamps are dirty: park the inode on b_dirty_time. */
                inode->dirtied_when = jiffies;
                inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
                inode->i_state &= ~I_SYNC_QUEUED;
        } else {
                /* The inode is clean. Remove from writeback lists. */
                inode_cgwb_move_to_attached(inode, wb);
        }
}

/*
 * Write out an inode and its dirty pages (or some of its dirty pages, depending
 * on @wbc->nr_to_write), and clear the relevant dirty flags from i_state.
 *
 * This doesn't remove the inode from the writeback list it is on, except
 * potentially to move it from b_dirty_time to b_dirty due to timestamp
 * expiration.  The caller is otherwise responsible for writeback list handling.
 *
 * The caller is also responsible for setting the I_SYNC flag beforehand and
 * calling inode_sync_complete() to clear it afterwards.
 *
 * Returns 0 or the first error reported by do_writepages(),
 * filemap_fdatawait() or write_inode(), in that order of precedence.
 */
static int
__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
        struct address_space *mapping = inode->i_mapping;
        /* Snapshot of the initial budget, only used for the tracepoints. */
        long nr_to_write = wbc->nr_to_write;
        unsigned dirty;
        int ret;

        WARN_ON(!(inode->i_state & I_SYNC));

        trace_writeback_single_inode_start(inode, wbc, nr_to_write);

        ret = do_writepages(mapping, wbc);

        /*
         * Make sure to wait on the data before writing out the metadata.
         * This is important for filesystems that modify metadata on data
         * I/O completion. We don't do it for sync(2) writeback because it has a
         * separate, external IO completion path and ->sync_fs for guaranteeing
         * inode metadata is written back correctly.
         */
        if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
                int err = filemap_fdatawait(mapping);
                if (ret == 0)
                        ret = err;
        }

        /*
         * If the inode has dirty timestamps and we need to write them, call
         * mark_inode_dirty_sync() to notify the filesystem about it and to
         * change I_DIRTY_TIME into I_DIRTY_SYNC.
         */
        if ((inode->i_state & I_DIRTY_TIME) &&
            (wbc->sync_mode == WB_SYNC_ALL ||
             time_after(jiffies, inode->dirtied_time_when +
                        dirtytime_expire_interval * HZ))) {
                trace_writeback_lazytime(inode);
                mark_inode_dirty_sync(inode);
        }

        /*
         * Get and clear the dirty flags from i_state.  This needs to be done
         * after calling writepages because some filesystems may redirty the
         * inode during writepages due to delalloc.  It also needs to be done
         * after handling timestamp expiration, as that may dirty the inode too.
         */
        spin_lock(&inode->i_lock);
        dirty = inode->i_state & I_DIRTY;
        inode->i_state &= ~dirty;

        /*
         * Paired with smp_mb() in __mark_inode_dirty().  This allows
         * __mark_inode_dirty() to test i_state without grabbing i_lock -
         * either they see the I_DIRTY bits cleared or we see the dirtied
         * inode.
         *
         * I_DIRTY_PAGES is always cleared together above even if @mapping
         * still has dirty pages.  The flag is reinstated after smp_mb() if
         * necessary.  This guarantees that either __mark_inode_dirty()
         * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
         */
        smp_mb();

        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                inode->i_state |= I_DIRTY_PAGES;
        else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) {
                /*
                 * No dirty pages remain: drop the pin flag and force a
                 * write_inode() call below so it sees unpinned_netfs_wb.
                 */
                if (!(inode->i_state & I_DIRTY_PAGES)) {
                        inode->i_state &= ~I_PINNING_NETFS_WB;
                        wbc->unpinned_netfs_wb = true;
                        dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */
                }
        }

        spin_unlock(&inode->i_lock);

        /* Don't write the inode if only I_DIRTY_PAGES was set */
        if (dirty & ~I_DIRTY_PAGES) {
                int err = write_inode(inode, wbc);
                if (ret == 0)
                        ret = err;
        }
        /* Always reset the flag before handing @wbc back to the caller. */
        wbc->unpinned_netfs_wb = false;
        trace_writeback_single_inode(inode, wbc, nr_to_write);
        return ret;
}

/*
 * Write out an inode's dirty data and metadata on-demand, i.e. separately from
 * the regular batched writeback done by the flusher threads in
 * writeback_sb_inodes().  @wbc controls various aspects of the write, such as
 * whether it is a data-integrity sync (%WB_SYNC_ALL) or not (%WB_SYNC_NONE).
 *
 * To prevent the inode from going away, either the caller must have a reference
 * to the inode, or the inode must have I_WILL_FREE or I_FREEING set.
 *
 * Returns the result of __writeback_single_inode(), or 0 when no writeback
 * was attempted.
 */
static int writeback_single_inode(struct inode *inode,
                                  struct writeback_control *wbc)
{
        struct bdi_writeback *wb;
        int ret = 0;

        spin_lock(&inode->i_lock);
        /* Sanity-check the pinning rule stated above. */
        if (!atomic_read(&inode->i_count))
                WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
        else
                WARN_ON(inode->i_state & I_WILL_FREE);

        if (inode->i_state & I_SYNC) {
                /*
                 * Writeback is already running on the inode.  For WB_SYNC_NONE,
                 * that's enough and we can just return.  For WB_SYNC_ALL, we
                 * must wait for the existing writeback to complete, then do
                 * writeback again if there's anything left.
                 */
                if (wbc->sync_mode != WB_SYNC_ALL)
                        goto out;
                __inode_wait_for_writeback(inode);
        }
        WARN_ON(inode->i_state & I_SYNC);
        /*
         * If the inode is already fully clean, then there's nothing to do.
         *
         * For data-integrity syncs we also need to check whether any pages are
         * still under writeback, e.g. due to prior WB_SYNC_NONE writeback.  If
         * there are any such pages, we'll need to wait for them.
         */
        if (!(inode->i_state & I_DIRTY_ALL) &&
            (wbc->sync_mode != WB_SYNC_ALL ||
             !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
                goto out;
        inode->i_state |= I_SYNC;
        wbc_attach_and_unlock_inode(wbc, inode);

        ret = __writeback_single_inode(inode, wbc);

        wbc_detach_inode(wbc);

        /* Lock order: wb->list_lock first, then inode->i_lock. */
        wb = inode_to_wb_and_lock_list(inode);
        spin_lock(&inode->i_lock);
        /*
         * If the inode is freeing, its i_io_list shouldn't be updated
         * as it can be finally deleted at this moment.
         */
        if (!(inode->i_state & I_FREEING)) {
                /*
                 * If the inode is now fully clean, then it can be safely
                 * removed from its writeback list (if any). Otherwise the
                 * flusher threads are responsible for the writeback lists.
                 */
                if (!(inode->i_state & I_DIRTY_ALL))
                        inode_cgwb_move_to_attached(inode, wb);
                else if (!(inode->i_state & I_SYNC_QUEUED)) {
                        if ((inode->i_state & I_DIRTY))
                                redirty_tail_locked(inode, wb);
                        else if (inode->i_state & I_DIRTY_TIME) {
                                inode->dirtied_when = jiffies;
                                inode_io_list_move_locked(inode,
                                                          wb,
                                                          &wb->b_dirty_time);
                        }
                }
        }

        spin_unlock(&wb->list_lock);
        inode_sync_complete(inode);
out:
        /* Every path reaching this label holds inode->i_lock. */
        spin_unlock(&inode->i_lock);
        return ret;
}

/*
 * Compute how many pages to ask for in one __writeback_single_inode() call
 * made on behalf of @work against @wb.
 */
static long writeback_chunk_size(struct bdi_writeback *wb,
                                 struct wb_writeback_work *work)
{
        long pages;

        /*
         * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
         * inodes/pages in one big loop. Handing out LONG_MAX here avoids
         * calling into writeback_inodes_wb() more than once.
         *
         * The intended call sequence for WB_SYNC_ALL writeback is:
         *
         *      wb_writeback()
         *          writeback_sb_inodes()       <== called only once
         *              write_cache_pages()     <== called once for each inode
         *                   (quickly) tag currently dirty pages
         *                   (maybe slowly) sync all tagged pages
         */
        if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
                return LONG_MAX;

        /*
         * Otherwise: the smaller of half the wb's average write bandwidth
         * and a DIRTY_SCOPE fraction of the global dirty limit, capped by
         * the work's remaining page budget, then bumped by
         * MIN_WRITEBACK_PAGES and rounded down to a multiple of it.
         */
        pages = min(wb->avg_write_bandwidth / 2,
                    global_wb_domain.dirty_limit / DIRTY_SCOPE);
        pages = min(pages, work->nr_pages);

        return round_down(pages + MIN_WRITEBACK_PAGES,
                          MIN_WRITEBACK_PAGES);
}

/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * Return the number of pages and/or inodes written.
 *
 * NOTE! This is called with wb->list_lock held, and will
 * unlock and relock that for each inode it ends up doing
 * IO for.
 */
static long writeback_sb_inodes(struct super_block *sb,
                                struct bdi_writeback *wb,
                                struct wb_writeback_work *work)
{
        struct writeback_control wbc = {
                .sync_mode                = work->sync_mode,
                .tagged_writepages        = work->tagged_writepages,
                .for_kupdate                = work->for_kupdate,
                .for_background                = work->for_background,
                .for_sync                = work->for_sync,
                .range_cyclic                = work->range_cyclic,
                .range_start                = 0,
                .range_end                = LLONG_MAX,
        };
        unsigned long start_time = jiffies;
        long write_chunk;
        long total_wrote = 0;  /* count both pages and inodes */
        unsigned long dirtied_before = jiffies;

        /* kupdate work only considers inodes older than the expire interval. */
        if (work->for_kupdate)
                dirtied_before = jiffies -
                        msecs_to_jiffies(dirty_expire_interval * 10);

        while (!list_empty(&wb->b_io)) {
                struct inode *inode = wb_inode(wb->b_io.prev);
                struct bdi_writeback *tmp_wb;
                long wrote;

                if (inode->i_sb != sb) {
                        if (work->sb) {
                                /*
                                 * We only want to write back data for this
                                 * superblock, move all inodes not belonging
                                 * to it back onto the dirty list.
                                 */
                                redirty_tail(inode, wb);
                                continue;
                        }

                        /*
                         * The inode belongs to a different superblock.
                         * Bounce back to the caller to unpin this and
                         * pin the next superblock.
                         */
                        break;
                }

                /*
                 * Don't bother with new inodes or inodes being freed, first
                 * kind does not need periodic writeout yet, and for the latter
                 * kind writeout is handled by the freer.
                 */
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        redirty_tail_locked(inode, wb);
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
                        /*
                         * If this inode is locked for writeback and we are not
                         * doing writeback-for-data-integrity, move it to
                         * b_more_io so that writeback can proceed with the
                         * other inodes on b_io.
                         *
                         * We'll have another go at writing back this inode
                         * when we completed a full scan of b_io.
                         */
                        requeue_io(inode, wb);
                        spin_unlock(&inode->i_lock);
                        trace_writeback_sb_inodes_requeue(inode);
                        continue;
                }
                spin_unlock(&wb->list_lock);

                /*
                 * We already requeued the inode if it had I_SYNC set and we
                 * are doing WB_SYNC_NONE writeback. So this catches only the
                 * WB_SYNC_ALL case.
                 */
                if (inode->i_state & I_SYNC) {
                        /* Wait for I_SYNC. This function drops i_lock... */
                        inode_sleep_on_writeback(inode);
                        /* Inode may be gone, start again */
                        spin_lock(&wb->list_lock);
                        continue;
                }
                inode->i_state |= I_SYNC;
                wbc_attach_and_unlock_inode(&wbc, inode);

                write_chunk = writeback_chunk_size(wb, work);
                wbc.nr_to_write = write_chunk;
                wbc.pages_skipped = 0;

                /*
                 * We use I_SYNC to pin the inode in memory. While it is set
                 * evict_inode() will wait so the inode cannot be freed.
                 */
                __writeback_single_inode(inode, &wbc);

                wbc_detach_inode(&wbc);
                /*
                 * Charge the consumed budget to the work item; skipped pages
                 * are excluded from the progress count, which is clamped at
                 * zero.
                 */
                work->nr_pages -= write_chunk - wbc.nr_to_write;
                wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
                wrote = wrote < 0 ? 0 : wrote;
                total_wrote += wrote;

                if (need_resched()) {
                        /*
                         * We're trying to balance between building up a nice
                         * long list of IOs to improve our merge rate, and
                         * getting those IOs out quickly for anyone throttling
                         * in balance_dirty_pages().  cond_resched() doesn't
                         * unplug, so get our IOs out the door before we
                         * give up the CPU.
                         */
                        blk_flush_plug(current->plug, false);
                        cond_resched();
                }

                /*
                 * Requeue @inode if still dirty.  Be careful as @inode may
                 * have been switched to another wb in the meantime.
                 */
                tmp_wb = inode_to_wb_and_lock_list(inode);
                spin_lock(&inode->i_lock);
                /* A fully cleaned inode counts as one unit of progress. */
                if (!(inode->i_state & I_DIRTY_ALL))
                        total_wrote++;
                requeue_inode(inode, tmp_wb, &wbc, dirtied_before);
                inode_sync_complete(inode);
                spin_unlock(&inode->i_lock);

                /* Get back to holding @wb's list_lock if the inode moved. */
                if (unlikely(tmp_wb != wb)) {
                        spin_unlock(&tmp_wb->list_lock);
                        spin_lock(&wb->list_lock);
                }

                /*
                 * bail out to wb_writeback() often enough to check
                 * background threshold and other termination conditions.
                 */
                if (total_wrote) {
                        if (time_is_before_jiffies(start_time + HZ / 10UL))
                                break;
                        if (work->nr_pages <= 0)
                                break;
                }
        }
        return total_wrote;
}

/*
 * Work through @wb's b_io list one superblock at a time, handing each
 * superblock whose s_umount could be read-locked to writeback_sb_inodes().
 *
 * Returns the accumulated writeback_sb_inodes() results.  Inodes that were
 * not written are left on b_io.
 */
static long __writeback_inodes_wb(struct bdi_writeback *wb,
                                  struct wb_writeback_work *work)
{
        unsigned long start_time = jiffies;
        long wrote = 0;

        for (;;) {
                struct inode *inode;
                struct super_block *sb;

                if (list_empty(&wb->b_io))
                        break;

                inode = wb_inode(wb->b_io.prev);
                sb = inode->i_sb;

                if (!super_trylock_shared(sb)) {
                        /*
                         * The trylock can keep failing while someone else
                         * holds s_umount.  Redirty rather than requeue_io()
                         * so we do not busy-retry this inode/sb.
                         */
                        redirty_tail(inode, wb);
                        continue;
                }
                wrote += writeback_sb_inodes(sb, wb, work);
                up_read(&sb->s_umount);

                /* Same bail-out tests as at the end of writeback_sb_inodes */
                if (!wrote)
                        continue;
                if (time_is_before_jiffies(start_time + HZ / 10UL) ||
                    work->nr_pages <= 0)
                        break;
        }

        return wrote;
}

/*
 * Run one WB_SYNC_NONE, range-cyclic writeback pass over @wb with a budget
 * of @nr_pages, tagged with @reason.
 *
 * Returns how much of the page budget was consumed.
 */
static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
                                enum wb_reason reason)
{
        struct wb_writeback_work work = {
                .nr_pages        = nr_pages,
                .sync_mode        = WB_SYNC_NONE,
                .range_cyclic        = 1,
                .reason                = reason,
        };
        struct blk_plug plug;

        blk_start_plug(&plug);
        spin_lock(&wb->list_lock);
        /* Only refill b_io once the previous batch has been fully consumed. */
        if (list_empty(&wb->b_io))
                queue_io(wb, &work, jiffies);
        __writeback_inodes_wb(wb, &work);
        spin_unlock(&wb->list_lock);
        blk_finish_plug(&plug);

        return nr_pages - work.nr_pages;
}

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * dirtied_before takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
static long wb_writeback(struct bdi_writeback *wb,
                         struct wb_writeback_work *work)
{
        /* Initial page budget; the return value is this minus what is left. */
        long nr_pages = work->nr_pages;
        unsigned long dirtied_before = jiffies;
        struct inode *inode;
        long progress;
        struct blk_plug plug;
        /* Becomes true once this call has invoked queue_io() itself. */
        bool queued = false;

        blk_start_plug(&plug);
        for (;;) {
                /*
                 * Stop writeback when nr_pages has been consumed
                 */
                if (work->nr_pages <= 0)
                        break;

                /*
                 * Background writeout and kupdate-style writeback may
                 * run forever. Stop them if there is other work to do
                 * so that e.g. sync can proceed. They'll be restarted
                 * after the other works are all done.
                 */
                if ((work->for_background || work->for_kupdate) &&
                    !list_empty(&wb->work_list))
                        break;

                /*
                 * For background writeout, stop when we are below the
                 * background dirty threshold
                 */
                if (work->for_background && !wb_over_bg_thresh(wb))
                        break;


                /*
                 * list_lock is held from here until one of the unlock sites
                 * below; every path out of this iteration drops it.
                 */
                spin_lock(&wb->list_lock);

                trace_writeback_start(wb, work);
                if (list_empty(&wb->b_io)) {
                        /*
                         * Kupdate and background works are special and we want
                         * to include all inodes that need writing. Livelock
                         * avoidance is handled by these works yielding to any
                         * other work so we are safe.
                         */
                        if (work->for_kupdate) {
                                /*
                                 * dirty_expire_interval is scaled by 10 before
                                 * msecs_to_jiffies(), i.e. it is treated as
                                 * being in units of 10ms.
                                 */
                                dirtied_before = jiffies -
                                        msecs_to_jiffies(dirty_expire_interval *
                                                         10);
                        } else if (work->for_background)
                                dirtied_before = jiffies;

                        /*
                         * For all other work types dirtied_before keeps the
                         * jiffies value sampled at function entry.
                         */
                        queue_io(wb, work, dirtied_before);
                        queued = true;
                }
                /* A work item bound to one superblock only touches that sb. */
                if (work->sb)
                        progress = writeback_sb_inodes(work->sb, wb, work);
                else
                        progress = __writeback_inodes_wb(wb, work);
                trace_writeback_written(wb, work);

                /*
                 * Did we write something? Try for more
                 *
                 * Dirty inodes are moved to b_io for writeback in batches.
                 * The completion of the current batch does not necessarily
                 * mean the overall work is done. So we keep looping as long
                 * as made some progress on cleaning pages or inodes.
                 *
                 * We also loop again when this call has not yet invoked
                 * queue_io() (!queued): b_io was already populated on entry,
                 * so a lack of progress on it does not end the work.
                 */
                if (progress || !queued) {
                        spin_unlock(&wb->list_lock);
                        continue;
                }

                /*
                 * No more inodes for IO, bail
                 */
                if (list_empty(&wb->b_more_io)) {
                        spin_unlock(&wb->list_lock);
                        break;
                }

                /*
                 * Nothing written. Wait for some inode to
                 * become available for writeback. Otherwise
                 * we'll just busyloop.
                 */
                trace_writeback_wait(wb, work);
                /* Pick the entry at the tail (->prev of the head) of b_more_io. */
                inode = wb_inode(wb->b_more_io.prev);
                /* Take i_lock before dropping list_lock (lock hand-off). */
                spin_lock(&inode->i_lock);
                spin_unlock(&wb->list_lock);
                /* This function drops i_lock... */
                inode_sleep_on_writeback(inode);
        }
        blk_finish_plug(&plug);

        return nr_pages - work->nr_pages;
}

/*
 * Return the next wb_writeback_work struct that hasn't been processed yet.
 */
static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
{
        struct wb_writeback_work *next;

        /* work_list is manipulated under work_lock with interrupts disabled. */
        spin_lock_irq(&wb->work_lock);
        if (list_empty(&wb->work_list)) {
                next = NULL;
        } else {
                /* Detach the head entry; list_del_init() leaves it self-linked. */
                next = list_entry(wb->work_list.next,
                                  struct wb_writeback_work, list);
                list_del_init(&next->list);
        }
        spin_unlock_irq(&wb->work_lock);

        return next;
}

/*
 * Start background writeback on @wb when wb_over_bg_thresh() says it is over
 * the background threshold.
 *
 * Returns the result of wb_writeback(), or 0 when no writeback was started.
 */
static long wb_check_background_flush(struct bdi_writeback *wb)
{
        struct wb_writeback_work work = {
                .reason         = WB_REASON_BACKGROUND,
                .range_cyclic   = 1,
                .for_background = 1,
                .sync_mode      = WB_SYNC_NONE,
                /*
                 * Effectively unlimited page budget; wb_writeback() has its
                 * own stop conditions for for_background work.
                 */
                .nr_pages       = LONG_MAX,
        };

        if (!wb_over_bg_thresh(wb))
                return 0;

        return wb_writeback(wb, &work);
}

/*
 * Periodic (for_kupdate) writeback check for @wb.
 *
 * Does nothing when dirty_writeback_interval is zero or when the interval has
 * not yet elapsed since wb->last_old_flush.  Otherwise records the current
 * jiffies in wb->last_old_flush and, if get_nr_dirty_pages() is non-zero,
 * runs a WB_REASON_PERIODIC work item with that many pages as its budget.
 *
 * Returns the result of wb_writeback(), or 0 when no writeback was started.
 */
static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
        struct wb_writeback_work work = {
                .reason         = WB_REASON_PERIODIC,
                .range_cyclic   = 1,
                .for_kupdate    = 1,
                .sync_mode      = WB_SYNC_NONE,
        };
        unsigned long next_flush;
        long dirty;

        /*
         * When set to zero, disable periodic writeback
         */
        if (!dirty_writeback_interval)
                return 0;

        /* The interval is scaled by 10 to get milliseconds. */
        next_flush = wb->last_old_flush +
                        msecs_to_jiffies(dirty_writeback_interval * 10);
        if (time_before(jiffies, next_flush))
                return 0;

        /* The timestamp is refreshed even if there turns out to be no work. */
        wb->last_old_flush = jiffies;

        dirty = get_nr_dirty_pages();
        if (!dirty)
                return 0;

        work.nr_pages = dirty;
        return wb_writeback(wb, &work);
}

/*
 * Service a pending WB_start_all request on @wb.
 *
 * When the WB_start_all bit is set and get_nr_dirty_pages() is non-zero, runs
 * a WB_SYNC_NONE work item whose budget is wb_split_bdi_pages() of that count
 * and whose reason is wb->start_all_reason.  The bit is cleared afterwards,
 * whether or not any writeback ran.
 *
 * Returns the result of wb_writeback(), or 0 when no writeback was started.
 */
static long wb_check_start_all(struct bdi_writeback *wb)
{
        long written = 0;
        long dirty;

        if (!test_bit(WB_start_all, &wb->state))
                return 0;

        dirty = get_nr_dirty_pages();
        if (dirty) {
                struct wb_writeback_work work = {
                        .reason         = wb->start_all_reason,
                        .range_cyclic   = 1,
                        .sync_mode      = WB_SYNC_NONE,
                        .nr_pages       = wb_split_bdi_pages(wb, dirty),
                };

                written = wb_writeback(wb, &work);
        }

        /* Only drop the request flag once the flush attempt is over. */
        clear_bit(WB_start_all, &wb->state);
        return written;
}


/*
 * Retrieve work items and do the writeback they describe
 */
static long wb_do_writeback(struct bdi_writeback *wb)
{
        struct wb_writeback_work *item;
        long total = 0;

        /* WB_writeback_running brackets the whole pass. */
        set_bit(WB_writeback_running, &wb->state);

        /* Drain every explicitly queued work item first. */
        for (item = get_next_work_item(wb); item != NULL;
             item = get_next_work_item(wb)) {
                trace_writeback_exec(wb, item);
                total += wb_writeback(wb, item);
                /* @item must not be touched after this call. */
                finish_writeback_work(item);
        }

        /*
         * Then the implicit work, in this order: a pending flush-everything
         * request, periodic (kupdated() style) writeback, and finally
         * background writeback.
         */
        total += wb_check_start_all(wb);
        total += wb_check_old_data_flush(wb);
        total += wb_check_background_flush(wb);

        clear_bit(WB_writeback_running, &wb->state);
        return total;
}

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * reschedules periodically and does kupdated style flushing.
 */
void wb_workfn(struct work_struct *work)
{
        struct bdi_writeback *wb = container_of(to_delayed_work(work),
                                                struct bdi_writeback, dwork);
        long nr_written;

        set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));

        if (likely(!current_is_workqueue_rescuer() ||
                   !test_bit(WB_registered, &wb->state))) {
                /*
                 * Regular path: run wb_do_writeback() until work_list is
                 * observed empty.  A rescuer also ends up here when
                 * WB_registered is clear, because work_list still has to be
                 * drained while @wb is shutting down.
                 */
                do {
                        nr_written = wb_do_writeback(wb);
                        trace_writeback_pages_written(nr_written);
                } while (!list_empty(&wb->work_list));
        } else {
                /*
                 * Running off the emergency (rescuer) worker because bdi_wq
                 * could not get enough workers.  Do one bounded pass of 1024
                 * pages so the rescuer is not hogged.
                 */
                nr_written = writeback_inodes_wb(wb, 1024,
                                                 WB_REASON_FORKER_THREAD);
                trace_writeback_pages_written(nr_written);
        }

        /* More queued work: reschedule right away. */
        if (!list_empty(&wb->work_list)) {
                wb_wakeup(wb);
                return;
        }

        /* Otherwise arm the delayed wakeup, if periodic writeback is enabled. */
        if (wb_has_dirty_io(wb) && dirty_writeback_interval)
                wb_wakeup_delayed(wb);
}

/*
 * Start writeback of all dirty pages on this bdi.
 */
static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
                                         enum wb_reason reason)
{
        struct bdi_writeback *wb;

        if (!bdi_has_dirty_io(bdi))
                return;

        list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
                wb_start_writeback(wb, reason);
}

/*
 * Start writeback on every wb of @bdi, tagged with @reason.
 *
 * Wrapper around __wakeup_flusher_threads_bdi() that supplies the RCU
 * read-side critical section needed for its wb_list walk.
 */
void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
                                enum wb_reason reason)
{
        rcu_read_lock();
        __wakeup_flusher_threads_bdi(bdi, reason);
        rcu_read_unlock();
}

/*
 * Wakeup the flusher threads to start writeback of all currently dirty pages
 */
void wakeup_flusher_threads(enum wb_reason reason)
{
        struct backing_dev_info *bdi;

        /*
         * If we are expecting writeback progress we must submit plugged IO.
         */
        blk_flush_plug(current->plug, true);

        /* Walk the global bdi_list under RCU and kick each bdi with @reason. */
        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
                __wakeup_flusher_threads_bdi(bdi, reason);
        rcu_read_unlock();
}

/*
 * Wake up bdi's periodically to make sure dirtytime inodes gets
 * written back periodically.  We deliberately do *not* check the
 * b_dirtytime list in wb_has_dirty_io(), since this would cause the
 * kernel to be constantly waking up once there are any dirtytime
 * inodes on the system.  So instead we define a separate delayed work
 * function which gets called much more rarely.  (By default, only
 * once every 12 hours.)
 *
 * If there is any other write activity going on in the file system,
 * this function won't be necessary.  But if the only thing that has
 * happened on the file system is a dirtytime inode caused by an atime
 * update, we need this infrastructure below to make sure that inode
 * eventually gets pushed out to disk.
 */
static void wakeup_dirtytime_writeback(struct work_struct *w);
static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);

static void wakeup_dirtytime_writeback(struct work_struct *w)
{
        struct backing_dev_info *bdi;
        struct bdi_writeback *wb;

        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
                        /* Only wbs with entries on b_dirty_time get a kick. */
                        if (list_empty(&wb->b_dirty_time))
                                continue;
                        wb_wakeup(wb);
                }
        }
        rcu_read_unlock();

        /* Re-arm ourselves for the next dirtytime_expire_interval period. */
        schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
}

/*
 * Init hook (registered through __initcall() below): schedule the first run
 * of dirtytime_work, dirtytime_expire_interval * HZ jiffies from now.
 * Always returns 0.
 */
static int __init start_dirtytime_writeback(void)
{
        schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
        return 0;
}
__initcall(start_dirtytime_writeback);

/*
 * sysctl-style handler built on proc_dointvec_minmax().
 *
 * After a successful write, dirtytime_work is re-queued on system_wq with
 * zero delay so that it runs right away.
 *
 * Returns whatever proc_dointvec_minmax() returned.
 */
int dirtytime_interval_handler(struct ctl_table *table, int write,
                               void *buffer, size_t *lenp, loff_t *ppos)
{
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        /* Reads and failed writes leave the pending work untouched. */
        if (ret != 0 || !write)
                return ret;

        mod_delayed_work(system_wq, &dirtytime_work, 0);
        return ret;
}

/**
 * __mark_inode_dirty -        internal function to mark an inode dirty
 *
 * @inode: inode to mark
 * @flags: what kind of dirty, e.g. I_DIRTY_SYNC.  This can be a combination of
 *           multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined
 *           with I_DIRTY_PAGES.
 *
 * Mark an inode as dirty.  We notify the filesystem, then update the inode's
 * dirty flags.  Then, if needed we add the inode to the appropriate dirty list.
 *
 * Most callers should use mark_inode_dirty() or mark_inode_dirty_sync()
 * instead of calling this directly.
 *
 * CAREFUL!  We only add the inode to the dirty list if it is hashed or if it
 * refers to a blockdev.  Unhashed inodes will never be added to the dirty list
 * even if they are later hashed, as they will have been marked dirty already.
 *
 * In short, ensure you hash any inodes _before_ you start marking them dirty.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
        struct super_block *sb = inode->i_sb;
        /* Non-zero only for a pure I_DIRTY_TIME request (see else branch). */
        int dirtytime = 0;
        /* Set only when wb->list_lock has been taken further down. */
        struct bdi_writeback *wb = NULL;

        trace_writeback_mark_inode_dirty(inode, flags);

        if (flags & I_DIRTY_INODE) {
                /*
                 * Inode timestamp update will piggyback on this dirtying.
                 * We tell ->dirty_inode callback that timestamps need to
                 * be updated by setting I_DIRTY_TIME in flags.
                 */
                /* Unlocked peek first; the bit is re-tested under i_lock. */
                if (inode->i_state & I_DIRTY_TIME) {
                        spin_lock(&inode->i_lock);
                        if (inode->i_state & I_DIRTY_TIME) {
                                inode->i_state &= ~I_DIRTY_TIME;
                                flags |= I_DIRTY_TIME;
                        }
                        spin_unlock(&inode->i_lock);
                }

                /*
                 * Notify the filesystem about the inode being dirtied, so that
                 * (if needed) it can update on-disk fields and journal the
                 * inode.  This is only needed when the inode itself is being
                 * dirtied now.  I.e. it's only needed for I_DIRTY_INODE, not
                 * for just I_DIRTY_PAGES or I_DIRTY_TIME.
                 */
                trace_writeback_dirty_inode_start(inode, flags);
                /* The callback is optional and is invoked without i_lock held. */
                if (sb->s_op->dirty_inode)
                        sb->s_op->dirty_inode(inode,
                                flags & (I_DIRTY_INODE | I_DIRTY_TIME));
                trace_writeback_dirty_inode(inode, flags);

                /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
                flags &= ~I_DIRTY_TIME;
        } else {
                /*
                 * Else it's either I_DIRTY_PAGES, I_DIRTY_TIME, or nothing.
                 * (We don't support setting both I_DIRTY_PAGES and I_DIRTY_TIME
                 * in one call to __mark_inode_dirty().)
                 */
                dirtytime = flags & I_DIRTY_TIME;
                WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME);
        }

        /*
         * Paired with smp_mb() in __writeback_single_inode() for the
         * following lockless i_state test.  See there for details.
         */
        smp_mb();

        /* Fast path: every requested bit is already set, nothing to do. */
        if ((inode->i_state & flags) == flags)
                return;

        spin_lock(&inode->i_lock);
        /* Re-check under i_lock; the test above was done without it. */
        if ((inode->i_state & flags) != flags) {
                /* Snapshot taken before the new bits are OR-ed in below. */
                const int was_dirty = inode->i_state & I_DIRTY;

                inode_attach_wb(inode, NULL);

                inode->i_state |= flags;

                /*
                 * Grab inode's wb early because it requires dropping i_lock and we
                 * need to make sure following checks happen atomically with dirty
                 * list handling so that we don't move inodes under flush worker's
                 * hands.
                 */
                if (!was_dirty) {
                        wb = locked_inode_to_wb_and_lock_list(inode);
                        /* i_lock was dropped by the helper; take it again. */
                        spin_lock(&inode->i_lock);
                }

                /*
                 * If the inode is queued for writeback by flush worker, just
                 * update its dirty state. Once the flush worker is done with
                 * the inode it will place it on the appropriate superblock
                 * list, based upon its state.
                 */
                if (inode->i_state & I_SYNC_QUEUED)
                        goto out_unlock;

                /*
                 * Only add valid (hashed) inodes to the superblock's
                 * dirty list.  Add blockdev inodes as well.
                 */
                if (!S_ISBLK(inode->i_mode)) {
                        if (inode_unhashed(inode))
                                goto out_unlock;
                }
                if (inode->i_state & I_FREEING)
                        goto out_unlock;

                /*
                 * If the inode was already on b_dirty/b_io/b_more_io, don't
                 * reposition it (that would break b_dirty time-ordering).
                 */
                if (!was_dirty) {
                        struct list_head *dirty_list;
                        bool wakeup_bdi = false;

                        inode->dirtied_when = jiffies;
                        if (dirtytime)
                                inode->dirtied_time_when = jiffies;

                        /*
                         * Any I_DIRTY bit puts the inode on b_dirty; with
                         * none of them set it goes on b_dirty_time instead.
                         */
                        if (inode->i_state & I_DIRTY)
                                dirty_list = &wb->b_dirty;
                        else
                                dirty_list = &wb->b_dirty_time;

                        wakeup_bdi = inode_io_list_move_locked(inode, wb,
                                                               dirty_list);

                        /* Both locks are released here; this path returns directly. */
                        spin_unlock(&wb->list_lock);
                        spin_unlock(&inode->i_lock);
                        trace_writeback_dirty_inode_enqueue(inode);

                        /*
                         * If this is the first dirty inode for this bdi,
                         * we have to wake-up the corresponding bdi thread
                         * to make sure background write-back happens
                         * later.
                         */
                        if (wakeup_bdi &&
                            (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
                                wb_wakeup_delayed(wb);
                        return;
                }
        }
out_unlock:
        /* wb is non-NULL exactly when list_lock was taken above. */
        if (wb)
                spin_unlock(&wb->list_lock);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);

/*
 * The @s_sync_lock is used to serialise concurrent sync operations
 * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
 * Concurrent callers will block on the s_sync_lock rather than doing contending
 * walks. The queueing maintains sync(2) required behaviour as all the IO that
 * has been issued up to the time this function is entered is guaranteed to be
 * completed by the time we have gained the lock and waited for all IO that is
 * in progress regardless of the order callers are granted the lock.
 */
static void wait_sb_inodes(struct super_block *sb)
{
        /* Private list the current s_inodes_wb contents are spliced onto. */
        LIST_HEAD(sync_list);

        /*
         * We need to be protected against the filesystem going from
         * r/o to r/w or vice versa.
         */
        WARN_ON(!rwsem_is_locked(&sb->s_umount));

        mutex_lock(&sb->s_sync_lock);

        /*
         * Splice the writeback list onto a temporary list to avoid waiting on
         * inodes that have started writeback after this point.
         *
         * Use rcu_read_lock() to keep the inodes around until we have a
         * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as
         * the local list because inodes can be dropped from either by writeback
         * completion.
         */
        rcu_read_lock();
        spin_lock_irq(&sb->s_inode_wblist_lock);
        list_splice_init(&sb->s_inodes_wb, &sync_list);

        /*
         * Data integrity sync. Must wait for all pages under writeback, because
         * there may have been pages dirtied before our sync call, but which had
         * writeout started before we write it out.  In which case, the inode
         * may not be on the dirty list, but we still have to wait for that
         * writeout.
         */
        /*
         * Loop invariant: rcu_read_lock() and s_inode_wblist_lock are both
         * held whenever the loop condition is evaluated.
         */
        while (!list_empty(&sync_list)) {
                struct inode *inode = list_first_entry(&sync_list, struct inode,
                                                       i_wb_list);
                struct address_space *mapping = inode->i_mapping;

                /*
                 * Move each inode back to the wb list before we drop the lock
                 * to preserve consistency between i_wb_list and the mapping
                 * writeback tag. Writeback completion is responsible to remove
                 * the inode from either list once the writeback tag is cleared.
                 */
                list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);

                /*
                 * The mapping can appear untagged while still on-list since we
                 * do not have the mapping lock. Skip it here, wb completion
                 * will remove it.
                 */
                if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
                        continue;

                spin_unlock_irq(&sb->s_inode_wblist_lock);

                spin_lock(&inode->i_lock);
                /*
                 * Inodes flagged I_FREEING, I_WILL_FREE or I_NEW are skipped
                 * without taking a reference; retake the list lock so the
                 * loop invariant holds again.
                 */
                if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
                        spin_unlock(&inode->i_lock);

                        spin_lock_irq(&sb->s_inode_wblist_lock);
                        continue;
                }
                /*
                 * Take a reference so the RCU read section can be left while
                 * waiting below.
                 */
                __iget(inode);
                spin_unlock(&inode->i_lock);
                rcu_read_unlock();

                /*
                 * We keep the error status of individual mapping so that
                 * applications can catch the writeback error using fsync(2).
                 * See filemap_fdatawait_keep_errors() for details.
                 */
                filemap_fdatawait_keep_errors(mapping);

                cond_resched();

                /* Drop the reference taken by __iget() above. */
                iput(inode);

                /* Re-establish the loop invariant before the next test. */
                rcu_read_lock();
                spin_lock_irq(&sb->s_inode_wblist_lock);
        }
        spin_unlock_irq(&sb->s_inode_wblist_lock);
        rcu_read_unlock();
        mutex_unlock(&sb->s_sync_lock);
}

/*
 * Queue WB_SYNC_NONE writeback of up to @nr pages for @sb on its bdi and
 * wait until the queued work has completed.
 *
 * Does nothing when the bdi has no dirty IO or is noop_backing_dev_info.
 * @skip_if_busy is handed through unchanged to bdi_split_work_to_wbs().
 * Warns if sb->s_umount is not held.
 */
static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
                                     enum wb_reason reason, bool skip_if_busy)
{
        struct backing_dev_info *bdi = sb->s_bdi;
        DEFINE_WB_COMPLETION(done, bdi);
        struct wb_writeback_work work = {
                .reason                 = reason,
                .nr_pages               = nr,
                .done                   = &done,
                .tagged_writepages      = 1,
                .sync_mode              = WB_SYNC_NONE,
                .sb                     = sb,
        };

        if (!bdi_has_dirty_io(bdi))
                return;
        if (bdi == &noop_backing_dev_info)
                return;

        WARN_ON(!rwsem_is_locked(&sb->s_umount));

        bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
        /* @work and @done live on this stack frame, so wait before returning. */
        wb_wait_for_completion(&done);
}

/**
 * writeback_inodes_sb_nr -        writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb_nr(struct super_block *sb,
                            unsigned long nr,
                            enum wb_reason reason)
{
        /* skip_if_busy == false: see try_to_writeback_inodes_sb() for true. */
        __writeback_inodes_sb_nr(sb, nr, reason, false);
}
EXPORT_SYMBOL(writeback_inodes_sb_nr);

/**
 * writeback_inodes_sb        -        writeback dirty inodes from given super_block
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
        /* Page budget is whatever get_nr_dirty_pages() reports right now. */
        writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * try_to_writeback_inodes_sb - try to start writeback if none underway
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Invoke __writeback_inodes_sb_nr if no writeback is currently underway.
 */
void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
        /*
         * Non-blocking attempt at s_umount: if the read lock cannot be taken
         * immediately, give up without starting any writeback.
         */
        if (down_read_trylock(&sb->s_umount)) {
                /* skip_if_busy == true for this variant. */
                __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason,
                                         true);
                up_read(&sb->s_umount);
        }
}
EXPORT_SYMBOL(try_to_writeback_inodes_sb);

/**
 * sync_inodes_sb        -        sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
void sync_inodes_sb(struct super_block *sb)
{
        struct backing_dev_info *bdi = sb->s_bdi;
        DEFINE_WB_COMPLETION(done, bdi);
        struct wb_writeback_work work = {
                .reason         = WB_REASON_SYNC,
                .for_sync       = 1,
                .sync_mode      = WB_SYNC_ALL,
                .range_cyclic   = 0,
                .nr_pages       = LONG_MAX,
                .done           = &done,
                .sb             = sb,
        };

        /*
         * There is deliberately no !bdi_has_dirty_io() shortcut here: inodes
         * that are under writeback but no longer dirty must still be waited
         * for, and I_DIRTY_TIME inodes, which that check ignores, must be
         * written out as well.
         */
        if (bdi == &noop_backing_dev_info)
                return;

        WARN_ON(!rwsem_is_locked(&sb->s_umount));

        /*
         * Hold the wb switch rwsem for write across queueing and completion
         * to protect against inode wb switching; see
         * inode_switch_wbs_work_fn().
         */
        bdi_down_write_wb_switch_rwsem(bdi);
        bdi_split_work_to_wbs(bdi, &work, false);
        wb_wait_for_completion(&done);
        bdi_up_write_wb_switch_rwsem(bdi);

        /* Finally wait for the writeback that is still in flight on @sb. */
        wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);

/**
 * write_inode_now        -        write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
int write_inode_now(struct inode *inode, int sync)
{
        struct writeback_control wbc = {
                .range_start    = 0,
                .range_end      = LLONG_MAX,
                .nr_to_write    = LONG_MAX,
                .sync_mode      = WB_SYNC_NONE,
        };

        /* Any non-zero @sync selects the WB_SYNC_ALL mode. */
        if (sync)
                wbc.sync_mode = WB_SYNC_ALL;

        /* A mapping that cannot do writeback gets a zero page budget. */
        if (!mapping_can_writeback(inode->i_mapping))
                wbc.nr_to_write = 0;

        might_sleep();
        return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
 * @wait: wait for I/O to complete.
 *
 * Write an inode to disk and adjust its dirty state after completion.
 *
 * Note: only writes the actual inode, no associated data or other metadata.
 */
int sync_inode_metadata(struct inode *inode, int wait)
{
        struct writeback_control wbc = {
                /* Zero page budget: metadata-only. */
                .nr_to_write    = 0,
                .sync_mode      = WB_SYNC_NONE,
        };

        /* Any non-zero @wait selects the WB_SYNC_ALL mode. */
        if (wait)
                wbc.sync_mode = WB_SYNC_ALL;

        return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(sync_inode_metadata);








































































    2 














    3 










    2 

    2 











































































    2 



    2 
























    2 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NET_SCM_H
#define __LINUX_NET_SCM_H

#include <linux/limits.h>
#include <linux/net.h>
#include <linux/cred.h>
#include <linux/file.h>
#include <linux/security.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/sched/signal.h>
#include <net/compat.h>

/* Well, we should have at least one descriptor open
 * to accept passed FDs 8)
 */
#define SCM_MAX_FD        253

/* Numeric credentials carried in a struct scm_cookie (its 'creds' member). */
struct scm_creds {
        /* Numeric pid; scm_set_cred() fills it from pid_vnr(). */
        u32        pid;
        /* scm_send() presets these to INVALID_UID / INVALID_GID. */
        kuid_t        uid;
        kgid_t        gid;
};

#ifdef CONFIG_UNIX
struct unix_edge;
#endif

/*
 * File list referenced from struct scm_cookie ('fp', the passed files).
 *
 * NOTE(review): the per-field notes below are inferred from names and types
 * only; the code that maintains these fields is not part of this header
 * section -- confirm against it.
 */
struct scm_fp_list {
        /* presumably the number of fp[] slots in use -- TODO confirm */
        short                        count;
        /* presumably how many of those are AF_UNIX sockets -- TODO confirm */
        short                        count_unix;
        /* presumably the capacity limit for count -- TODO confirm */
        short                        max;
#ifdef CONFIG_UNIX
        /* State only present when CONFIG_UNIX is enabled. */
        bool                        inflight;
        bool                        dead;
        struct list_head        vertices;
        struct unix_edge        *edges;
#endif
        struct user_struct        *user;
        /* Fixed-size array of SCM_MAX_FD file pointers. */
        struct file                *fp[SCM_MAX_FD];
};

/*
 * Per-message state: initialised by scm_send(), released by scm_destroy().
 */
struct scm_cookie {
        /* Taken with get_pid() in scm_set_cred(), put in scm_destroy_cred(). */
        struct pid                *pid;                /* Referenced struct pid */
        struct scm_fp_list        *fp;                /* Passed files                */
        struct scm_creds        creds;                /* Skb credentials        */
#ifdef CONFIG_SECURITY_NETWORK
        u32                        secid;                /* Passed security ID         */
#endif
};

void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm);
void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm);
int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm);
void __scm_destroy(struct scm_cookie *scm);
struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl);

#ifdef CONFIG_SECURITY_NETWORK
/*
 * Store the result of security_socket_getpeersec_dgram() for @sock in
 * scm->secid.  The call is made with a NULL second argument and its return
 * value is not checked.
 */
static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm)
{
        security_socket_getpeersec_dgram(sock, NULL, &scm->secid);
}
#else
/* Without CONFIG_SECURITY_NETWORK struct scm_cookie has no secid: no-op. */
static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm)
{ }
#endif /* CONFIG_SECURITY_NETWORK */

/*
 * Fill in the credential part of @scm.
 *
 * Stores the result of get_pid(@pid) in scm->pid (released again by
 * scm_destroy_cred()) and records pid_vnr(@pid), @uid and @gid in
 * scm->creds.
 */
static __inline__ void scm_set_cred(struct scm_cookie *scm,
                                    struct pid *pid, kuid_t uid, kgid_t gid)
{
        scm->pid  = get_pid(pid);
        scm->creds.pid = pid_vnr(pid);
        scm->creds.uid = uid;
        scm->creds.gid = gid;
}

/*
 * Release scm->pid with put_pid() and clear the pointer.  scm->creds is left
 * untouched.
 */
static __inline__ void scm_destroy_cred(struct scm_cookie *scm)
{
        put_pid(scm->pid);
        /* Clearing the pointer means a repeated call passes NULL to put_pid(). */
        scm->pid  = NULL;
}

/*
 * Tear down @scm: release its pid via scm_destroy_cred(), then call
 * __scm_destroy() if a file list is attached.
 */
static __inline__ void scm_destroy(struct scm_cookie *scm)
{
        scm_destroy_cred(scm);
        /* __scm_destroy() is only called when scm->fp is set. */
        if (scm->fp)
                __scm_destroy(scm);
}

/*
 * Initialise @scm for an outgoing message.
 *
 * The cookie is zeroed, its uid/gid are set to the INVALID_* markers and,
 * when @forcecreds is true, the current task's tgid/uid/gid are recorded via
 * scm_set_cred().  unix_get_peersec_dgram() is then run.  __scm_send() is
 * invoked only when the message carries control data.
 *
 * Returns 0 when there is no control data, otherwise the result of
 * __scm_send().
 */
static __inline__ int scm_send(struct socket *sock, struct msghdr *msg,
                               struct scm_cookie *scm, bool forcecreds)
{
        memset(scm, 0, sizeof(*scm));
        scm->creds.gid = INVALID_GID;
        scm->creds.uid = INVALID_UID;

        if (forcecreds)
                scm_set_cred(scm, task_tgid(current), current_uid(), current_gid());

        unix_get_peersec_dgram(sock, scm);

        if (msg->msg_controllen > 0)
                return __scm_send(sock, msg, scm);

        return 0;
}

#ifdef CONFIG_SECURITY_NETWORK
/*
 * If SOCK_PASSSEC is set on @sock, convert scm->secid to a security context
 * and emit it as an SCM_SECURITY control message.  Nothing is emitted when
 * the conversion reports an error.
 */
static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm)
{
        char *secdata;
        u32 seclen;

        if (!test_bit(SOCK_PASSSEC, &sock->flags))
                return;

        if (security_secid_to_secctx(scm->secid, &secdata, &seclen))
                return;

        put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, seclen, secdata);
        security_release_secctx(secdata, seclen);
}

/* True when SOCK_PASSSEC is set in sock->flags. */
static inline bool scm_has_secdata(struct socket *sock)
{
        return test_bit(SOCK_PASSSEC, &sock->flags);
}
#else
/* No CONFIG_SECURITY_NETWORK: there is no security ID to pass on. */
static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm)
{ }

/* No CONFIG_SECURITY_NETWORK: never any security data. */
static inline bool scm_has_secdata(struct socket *sock)
{
        return false;
}
#endif /* CONFIG_SECURITY_NETWORK */

/*
 * Deliver an SCM_PIDFD control message for scm->pid.
 *
 * MSG_CTRUNC is set and nothing is delivered when the control buffer cannot
 * hold a cmsg header plus one int.  Nothing is delivered either when the
 * cookie has no pid.  The fd number from pidfd_prepare() is installed only
 * after put_cmsg() succeeded; on put_cmsg() failure the reserved fd and the
 * file are released again.
 */
static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm)
{
        struct file *file = NULL;
        int need, pidfd;

        /* put_cmsg() doesn't return an error if CMSG is truncated,
         * that's why we need to opencode these checks here.
         */
        need = (msg->msg_flags & MSG_CMSG_COMPAT) ?
                        sizeof(struct compat_cmsghdr) : sizeof(struct cmsghdr);
        need += sizeof(int);

        if (msg->msg_controllen < need) {
                msg->msg_flags |= MSG_CTRUNC;
                return;
        }

        if (!scm->pid)
                return;

        pidfd = pidfd_prepare(scm->pid, 0, &file);

        if (!put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
                /* Message delivered: make the fd live if one was prepared. */
                if (file)
                        fd_install(pidfd, file);
                return;
        }

        /* put_cmsg() failed: undo the preparation. */
        if (file) {
                put_unused_fd(pidfd);
                fput(file);
        }
}

static inline bool __scm_recv_common(struct socket *sock, struct msghdr *msg,
                                     struct scm_cookie *scm, int flags)
{
        if (!msg->msg_control) {
                if (test_bit(SOCK_PASSCRED, &sock->flags) ||
                    test_bit(SOCK_PASSPIDFD, &sock->flags) ||
                    scm->fp || scm_has_secdata(sock))
                        msg->msg_flags |= MSG_CTRUNC;
                scm_destroy(scm);
                return false;
        }

        if (test_bit(SOCK_PASSCRED, &sock->flags)) {
                struct user_namespace *current_ns = current_user_ns();
                struct ucred ucreds = {
                        .pid = scm->creds.pid,
                        .uid = from_kuid_munged(current_ns, scm->creds.uid),
                        .gid = from_kgid_munged(current_ns, scm->creds.gid),
                };
                put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds);
        }

        scm_passec(sock, msg, scm);

        if (scm->fp)
                scm_detach_fds(msg, scm);

        return true;
}

/*
 * Deliver the control messages held in @scm and release its pid reference.
 * When __scm_recv_common() returns false it has already destroyed the cookie,
 * so nothing more is done.
 */
static inline void scm_recv(struct socket *sock, struct msghdr *msg,
                            struct scm_cookie *scm, int flags)
{
        if (__scm_recv_common(sock, msg, scm, flags))
                scm_destroy_cred(scm);
}

/*
 * Like scm_recv(), but additionally emits an SCM_PIDFD message when
 * SOCK_PASSPIDFD is set on @sock.  The pidfd is produced before the pid
 * reference is released.
 */
static inline void scm_recv_unix(struct socket *sock, struct msghdr *msg,
                                 struct scm_cookie *scm, int flags)
{
        if (__scm_recv_common(sock, msg, scm, flags)) {
                if (test_bit(SOCK_PASSPIDFD, &sock->flags))
                        scm_pidfd_recv(msg, scm);

                scm_destroy_cred(scm);
        }
}

/*
 * Pass @f to receive_fd() together with the user pointer @ufd and @flags.
 * Returns -EFAULT without calling receive_fd() when @ufd is NULL, otherwise
 * whatever receive_fd() returns.
 */
static inline int scm_recv_one_fd(struct file *f, int __user *ufd,
                                  unsigned int flags)
{
        return ufd ? receive_fd(f, ufd, flags) : -EFAULT;
}

#endif /* __LINUX_NET_SCM_H */








































































































































    2 













































    1 
    2 






































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_UACCESS_H__
#define __LINUX_UACCESS_H__

#include <linux/fault-inject-usercopy.h>
#include <linux/instrumented.h>
#include <linux/minmax.h>
#include <linux/sched.h>
#include <linux/thread_info.h>

#include <asm/uaccess.h>

/*
 * Architectures that support memory tagging (assigning tags to memory regions,
 * embedding these tags into addresses that point to these memory regions, and
 * checking that the memory and the pointer tags match on memory accesses)
 * redefine this macro to strip tags from pointers.
 *
 * Passing down mm_struct allows to define untagging rules on per-process
 * basis.
 *
 * It's defined as noop for architectures that don't support memory tagging.
 */
#ifndef untagged_addr
/* Generic fallback: the address is passed through unchanged. */
#define untagged_addr(addr) (addr)
#endif

#ifndef untagged_addr_remote
/*
 * Generic fallback for untagging an address on behalf of @mm: calls
 * mmap_assert_locked(mm) and then evaluates to untagged_addr(addr).
 */
#define untagged_addr_remote(mm, addr)        ({                \
        mmap_assert_locked(mm);                                \
        untagged_addr(addr);                                \
})
#endif

/*
 * Architectures should provide two primitives (raw_copy_{to,from}_user())
 * and get rid of their private instances of copy_{to,from}_user() and
 * __copy_{to,from}_user{,_inatomic}().
 *
 * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and
 * return the amount left to copy.  They should assume that access_ok() has
 * already been checked (and succeeded); they should *not* zero-pad anything.
 * No KASAN or object size checks either - those belong here.
 *
 * Both of these functions should attempt to copy size bytes starting at from
 * into the area starting at to.  They must not fetch or store anything
 * outside of those areas.  Return value must be between 0 (everything
 * copied successfully) and size (nothing copied).
 *
 * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting
 * at to must become equal to the bytes fetched from the corresponding area
 * starting at from.  All data past to + size - N must be left unmodified.
 *
 * If copying succeeds, the return value must be 0.  If some data cannot be
 * fetched, it is permitted to copy less than had been fetched; the only
 * hard requirement is that not storing anything at all (i.e. returning size)
 * should happen only when nothing could be copied.  In other words, you don't
 * have to squeeze as much as possible - it is allowed, but not necessary.
 *
 * For raw_copy_from_user(), 'to' always points to kernel memory and no faults
 * on store should happen.  Interpretation of 'from' is affected by set_fs().
 * For raw_copy_to_user() it's the other way round.
 *
 * Both can be inlined - it's up to each architecture whether it wants to
 * bother with that.  They should not be used directly; they are used to
 * implement the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user(),
 * __copy_{to,from}_user_inatomic()) that are used instead.  Out of those,
 * the __... ones are inlined.  Plain
 * copy_{to,from}_user() might or might not be inlined.  If you want them
 * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER.
 *
 * NOTE: only copy_from_user() zero-pads the destination in case of short copy.
 * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything
 * at all; their callers absolutely must check the return value.
 *
 * Biarch ones should also provide raw_copy_in_user() - similar to the above,
 * but both source and destination are __user pointers (affected by set_fs()
 * as usual) and both source and destination can trigger faults.
 */

/*
 * __copy_from_user_inatomic - copy @n bytes from user space into @to.
 *
 * Unlike __copy_from_user() this variant calls neither might_fault() nor
 * should_fail_usercopy(), and like it performs no access_ok() check.  The
 * destination is never zero-padded, so callers must check the return value.
 *
 * Returns the value of raw_copy_from_user(): the number of bytes left
 * uncopied, 0 on success.
 */
static __always_inline __must_check unsigned long
__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
{
        unsigned long res;

        /* The instrumentation hooks bracket the raw copy. */
        instrument_copy_from_user_before(to, from, n);
        check_object_size(to, n, false);
        res = raw_copy_from_user(to, from, n);
        instrument_copy_from_user_after(to, from, n, res);
        return res;
}

/*
 * __copy_from_user - copy @n bytes from user space into @to, without an
 * access_ok() check in this function.
 *
 * Calls might_fault() first.  When should_fail_usercopy() fires, @n is
 * returned and nothing is copied.  The destination is never zero-padded, so
 * callers must check the return value.
 *
 * Returns the number of bytes left uncopied, 0 on success.
 */
static __always_inline __must_check unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
        unsigned long res;

        might_fault();
        instrument_copy_from_user_before(to, from, n);
        /* Injected failure: report everything as uncopied. */
        if (should_fail_usercopy())
                return n;
        check_object_size(to, n, false);
        res = raw_copy_from_user(to, from, n);
        instrument_copy_from_user_after(to, from, n, res);
        return res;
}

/**
 * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking.
 * @to:   Destination address, in user space.
 * @from: Source address, in kernel space.
 * @n:    Number of bytes to copy.
 *
 * Context: User context only.
 *
 * Copy data from kernel space to user space.  Caller must check
 * the specified block with access_ok() before calling this function.
 * The caller should also make sure the user space address is pinned
 * so that the copy does not take a page fault and sleep.
 *
 * Return: the number of bytes that could not be copied, 0 on success.  @n is
 * returned without copying anything when should_fail_usercopy() fires.
 */
static __always_inline __must_check unsigned long
__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
{
        if (should_fail_usercopy())
                return n;
        instrument_copy_to_user(to, from, n);
        check_object_size(from, n, true);
        return raw_copy_to_user(to, from, n);
}

/*
 * __copy_to_user - copy @n bytes from kernel memory @from to user space @to,
 * without an access_ok() check in this function.
 *
 * Same as __copy_to_user_inatomic() except that might_fault() is called
 * first.  Returns the number of bytes left uncopied, 0 on success; @n when
 * should_fail_usercopy() fires.
 */
static __always_inline __must_check unsigned long
__copy_to_user(void __user *to, const void *from, unsigned long n)
{
        might_fault();
        if (should_fail_usercopy())
                return n;
        instrument_copy_to_user(to, from, n);
        check_object_size(from, n, true);
        return raw_copy_to_user(to, from, n);
}

#ifdef INLINE_COPY_FROM_USER
/*
 * Inline body of copy_from_user(), used when the architecture defines
 * INLINE_COPY_FROM_USER.
 *
 * The copy is attempted only if fault injection does not fire and
 * access_ok() accepts the source range.  Whatever part of @to was not
 * filled is zeroed before returning.
 *
 * Returns the number of bytes that were not copied, 0 on success.
 */
static inline __must_check unsigned long
_copy_from_user(void *to, const void __user *from, unsigned long n)
{
        unsigned long left = n;

        might_fault();
        if (!should_fail_usercopy() && likely(access_ok(from, n))) {
                instrument_copy_from_user_before(to, from, n);
                left = raw_copy_from_user(to, from, n);
                instrument_copy_from_user_after(to, from, n, left);
        }

        if (likely(!left))
                return 0;

        /* Short copy: zero the tail that was not written. */
        memset(to + (n - left), 0, left);
        return left;
}
#else
/* Out-of-line version; defined elsewhere. */
extern __must_check unsigned long
_copy_from_user(void *, const void __user *, unsigned long);
#endif

#ifdef INLINE_COPY_TO_USER
/*
 * Inline body of copy_to_user(), used when the architecture defines
 * INLINE_COPY_TO_USER.
 *
 * Returns @n untouched when fault injection fires or access_ok() rejects
 * the destination range; otherwise the number of bytes raw_copy_to_user()
 * left uncopied.
 */
static inline __must_check unsigned long
_copy_to_user(void __user *to, const void *from, unsigned long n)
{
        might_fault();

        /* Evaluation order is kept: fault injection first, then access_ok(). */
        if (should_fail_usercopy() || !access_ok(to, n))
                return n;

        instrument_copy_to_user(to, from, n);
        return raw_copy_to_user(to, from, n);
}
#else
/* Out-of-line version; defined elsewhere. */
extern __must_check unsigned long
_copy_to_user(void __user *, const void *, unsigned long);
#endif

/*
 * copy_from_user - copy @n bytes from user space @from into kernel memory @to.
 *
 * The copy is delegated to _copy_from_user() once check_copy_size() has
 * accepted the destination/size pair; if it does not, @n is returned and
 * nothing is copied.
 *
 * Returns the number of bytes that were not copied, 0 on success.
 */
static __always_inline unsigned long __must_check
copy_from_user(void *to, const void __user *from, unsigned long n)
{
        if (!check_copy_size(to, n, false))
                return n;

        return _copy_from_user(to, from, n);
}

/*
 * copy_to_user - copy @n bytes from kernel memory @from to user space @to.
 *
 * The copy is delegated to _copy_to_user() once check_copy_size() has
 * accepted the source/size pair; if it does not, @n is returned and nothing
 * is copied.
 *
 * Returns the number of bytes that were not copied, 0 on success.
 */
static __always_inline unsigned long __must_check
copy_to_user(void __user *to, const void *from, unsigned long n)
{
        if (!check_copy_size(from, n, true))
                return n;

        return _copy_to_user(to, from, n);
}

#ifndef copy_mc_to_kernel
/*
 * Without arch opt-in this generic copy_mc_to_kernel() will not handle
 * #MC (or arch equivalent) during source read.
 *
 * It is a plain memcpy() of @cnt bytes from @src to @dst and always
 * returns 0.
 */
static inline unsigned long __must_check
copy_mc_to_kernel(void *dst, const void *src, size_t cnt)
{
        memcpy(dst, src, cnt);
        return 0;
}
#endif

/* Increment the current task's pagefault_disabled counter; no barrier is issued here. */
static __always_inline void pagefault_disabled_inc(void)
{
        current->pagefault_disabled++;
}

/* Decrement the current task's pagefault_disabled counter; no barrier is issued here. */
static __always_inline void pagefault_disabled_dec(void)
{
        current->pagefault_disabled--;
}

/*
 * These routines enable/disable the pagefault handler. If disabled, it will
 * not take any locks and go straight to the fixup table.
 *
 * User access methods will not sleep when called from a pagefault_disabled()
 * environment.
 */
/*
 * Disable the pagefault handler for the current task by bumping its
 * pagefault_disabled counter.  Calls nest; each one needs a matching
 * pagefault_enable().
 */
static inline void pagefault_disable(void)
{
        pagefault_disabled_inc();
        /*
         * make sure to have issued the store before a pagefault
         * can hit.
         */
        barrier();
}

/*
 * Undo one pagefault_disable(): issue a compiler barrier, then decrement the
 * current task's pagefault_disabled counter.
 */
static inline void pagefault_enable(void)
{
        /*
         * make sure to issue those last loads/stores before enabling
         * the pagefault handler again.
         */
        barrier();
        pagefault_disabled_dec();
}

/*
 * Report whether the pagefault handler is currently disabled for this task,
 * i.e. whether its pagefault_disabled counter is non-zero.  While it is,
 * user access methods will not sleep.
 */
static inline bool pagefault_disabled(void)
{
        return !!current->pagefault_disabled;
}

/*
 * The pagefault handler is in general disabled by pagefault_disable() or
 * when in irq context (via in_atomic()).
 *
 * This macro should only be used by the fault handlers. Other users should
 * stick to pagefault_disabled().
 * Please NEVER use preempt_disable() to disable the fault handler. With
 * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled.
 * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT.
 *
 * Evaluates to true when either pagefault_disabled() or in_atomic() is true.
 */
#define faulthandler_disabled() (pagefault_disabled() || in_atomic())

#ifndef CONFIG_ARCH_HAS_SUBPAGE_FAULTS

/**
 * probe_subpage_writeable: probe the user range for write faults at sub-page
 *                            granularity (e.g. arm64 MTE)
 * @uaddr: start of address range
 * @size: size of address range
 *
 * Returns 0 on success, the number of bytes not probed on fault.
 *
 * It is expected that the caller checked for the write permission of each
 * page in the range either by put_user() or GUP. The architecture port can
 * implement a more efficient get_user() probing if the same sub-page faults
 * are triggered by either a read or a write.
 *
 * This is the fallback used when CONFIG_ARCH_HAS_SUBPAGE_FAULTS is not set:
 * it probes nothing and always returns 0.
 */
static inline size_t probe_subpage_writeable(char __user *uaddr, size_t size)
{
        return 0;
}

#endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */

#ifndef ARCH_HAS_NOCACHE_UACCESS

/*
 * Fallback used when the architecture does not define
 * ARCH_HAS_NOCACHE_UACCESS: simply forwards to __copy_from_user_inatomic()
 * and returns its result (bytes left uncopied).
 */
static inline __must_check unsigned long
__copy_from_user_inatomic_nocache(void *to, const void __user *from,
                                  unsigned long n)
{
        return __copy_from_user_inatomic(to, from, n);
}

#endif                /* ARCH_HAS_NOCACHE_UACCESS */

/*
 * Defined out of line.  As consumed by copy_struct_from_user() below: a
 * negative return is passed on as an error code, 0 is treated as "a non-zero
 * byte was found", and a positive value as "the range is all zeroes".
 */
extern __must_check int check_zeroed_user(const void __user *from, size_t size);

/**
 * copy_struct_from_user: copy a struct from userspace
 * @dst:   Destination address, in kernel space. This buffer must be @ksize
 *         bytes long.
 * @ksize: Size of @dst struct.
 * @src:   Source address, in userspace.
 * @usize: (Alleged) size of @src struct.
 *
 * Copies a struct from userspace to kernel space, in a way that guarantees
 * backwards-compatibility for struct syscall arguments (as long as future
 * struct extensions are made such that all new fields are *appended* to the
 * old struct, and zeroed-out new fields have the same meaning as the old
 * struct).
 *
 * @ksize is just sizeof(*dst), and @usize should've been passed by userspace.
 * The recommended usage is something like the following:
 *
 *   SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize)
 *   {
 *      int err;
 *      struct foo karg = {};
 *
 *      if (usize > PAGE_SIZE)
 *        return -E2BIG;
 *      if (usize < FOO_SIZE_VER0)
 *        return -EINVAL;
 *
 *      err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
 *      if (err)
 *        return err;
 *
 *      // ...
 *   }
 *
 * There are three cases to consider:
 *  * If @usize == @ksize, then it's copied verbatim.
 *  * If @usize < @ksize, then the userspace has passed an old struct to a
 *    newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize)
 *    are to be zero-filled.
 *  * If @usize > @ksize, then the userspace has passed a new struct to an
 *    older kernel. The trailing bytes unknown to the kernel (@usize - @ksize)
 *    are checked to ensure they are zeroed, otherwise -E2BIG is returned.
 *
 * Returns (in all cases, some data may have been copied):
 *  * -E2BIG:  (@usize > @ksize) and there are non-zero trailing bytes in @src.
 *  * -EFAULT: access to userspace failed.
 */
static __always_inline __must_check int
copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
                      size_t usize)
{
        /* Number of leading bytes that both kernel and userspace know about. */
        size_t common = min(ksize, usize);

        /* Double check if ksize is larger than a known object size. */
        if (WARN_ON_ONCE(ksize > __builtin_object_size(dst, 1)))
                return -E2BIG;

        if (usize < ksize) {
                /* Older userspace: zero the kernel-only tail of @dst. */
                memset(dst + usize, 0, ksize - usize);
        } else if (usize > ksize) {
                /* Newer userspace: the tail unknown to us must be all zeroes. */
                int zeroed = check_zeroed_user(src + ksize, usize - ksize);

                if (zeroed < 0)
                        return zeroed;
                if (!zeroed)
                        return -E2BIG;
        }

        /* Copy the interoperable parts of the struct. */
        if (copy_from_user(dst, src, common))
                return -EFAULT;

        return 0;
}

/*
 * Out-of-line "nofault" accessors; only their prototypes are provided here.
 * copy_from_kernel_nofault() is what the get_kernel_nofault() macro below
 * expands to.
 */
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size);

long copy_from_kernel_nofault(void *dst, const void *src, size_t size);
long notrace copy_to_kernel_nofault(void *dst, const void *src, size_t size);

long copy_from_user_nofault(void *dst, const void __user *src, size_t size);
long notrace copy_to_user_nofault(void __user *dst, const void *src,
                size_t size);

long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr,
                long count);

long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
                long count);
long strnlen_user_nofault(const void __user *unsafe_addr, long count);

#ifndef __get_kernel_nofault
/*
 * Generic fallbacks, used when the architecture does not provide its own
 * __get_kernel_nofault()/__put_kernel_nofault().  They move one object of
 * @type between @src and @dst through __get_user()/__put_user() and jump to
 * @label when that helper reports a non-zero result.
 *
 * @dst and @src are parenthesized wherever they are expanded so that
 * compound expressions (e.g. "buf + off") are cast as a whole, and the
 * temporaries carry prefixed names so they cannot capture an identifier
 * used in the caller's arguments.
 */
#define __get_kernel_nofault(dst, src, type, label)        \
do {                                                        \
        type __user *__gkn_p = (type __force __user *)(src);        \
        type __gkn_data;                                \
        if (__get_user(__gkn_data, __gkn_p))                \
                goto label;                                \
        *(type *)(dst) = __gkn_data;                        \
} while (0)

#define __put_kernel_nofault(dst, src, type, label)        \
do {                                                        \
        type __user *__pkn_p = (type __force __user *)(dst);        \
        type __pkn_data = *(type *)(src);                \
        if (__put_user(__pkn_data, __pkn_p))                \
                goto label;                                \
} while (0)
#endif

/**
 * get_kernel_nofault(): safely attempt to read from a location
 * @val: read into this variable
 * @ptr: address to read from
 *
 * @ptr is first assigned to a "const typeof(val) *" temporary, so it is
 * expanded only once and the compiler can check its type against @val.
 * sizeof(val) bytes are then read with copy_from_kernel_nofault().
 *
 * Returns 0 on success, or -EFAULT.
 */
#define get_kernel_nofault(val, ptr) ({                                \
        const typeof(val) *__gk_ptr = (ptr);                        \
        copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\
})

#ifndef user_access_begin
/*
 * Generic fallbacks used when the architecture does not define
 * user_access_begin(): it reduces to access_ok() and user_access_end() to an
 * empty statement.  The unsafe_*() operations wrap __get_user(),
 * __put_user(), __copy_to_user() and __copy_from_user() and jump to the
 * given error label when the wrapped operation returns non-zero.
 * user_access_save() returns 0 and user_access_restore() ignores its
 * argument.
 */
#define user_access_begin(ptr,len) access_ok(ptr, len)
#define user_access_end() do { } while (0)
#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0)
#define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e)
#define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e)
#define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e)
#define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e)
static inline unsigned long user_access_save(void) { return 0UL; }
static inline void user_access_restore(unsigned long flags) { }
#endif
/*
 * Unless the architecture provides direction-specific variants, the
 * write-only and read-only begin/end helpers are aliases of
 * user_access_begin()/user_access_end().
 */
#ifndef user_write_access_begin
#define user_write_access_begin user_access_begin
#define user_write_access_end user_access_end
#endif
#ifndef user_read_access_begin
#define user_read_access_begin user_access_begin
#define user_read_access_end user_access_end
#endif

#ifdef CONFIG_HARDENED_USERCOPY
/*
 * Declared only when CONFIG_HARDENED_USERCOPY is enabled; defined elsewhere.
 * Marked __noreturn, so control does not come back to the caller.
 */
void __noreturn usercopy_abort(const char *name, const char *detail,
                               bool to_user, unsigned long offset,
                               unsigned long len);
#endif

#endif                /* __LINUX_UACCESS_H__ */


















































































































































































































    1 




















    1 
    1 
    1 
    1 

    1 




    1 




    1 




















    1 






























































































































































































































    1 
































    1 
    1 




    1 


    1 











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
















    1 



















    1 























































































































































































































































































































































































































































































































    1 


    1 


















    1 










    1 










    1 









    1 













































    1 


    1 
















    1 





















    1 
    1 





    1 











    1 








    1 




    1 


    1 




    1 

















    1 



    1 



    1 







    1 


    1 













    1 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 











    2 
    1 










































    1 
































































    2 
    1 





    1 




    1 

































    2 







    1 









    1 





















































































































    1 







    1 













    1 












    1 












    1 
    1 


















































































































































    1 













    1 
    1 




    1 




    1 












    1 





















    1 









    1 






    1 


























    1 














































































    1 

    1 



















































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
// SPDX-License-Identifier: GPL-2.0-only
/*
 * xfrm_policy.c
 *
 * Changes:
 *        Mitsuru KANDA @USAGI
 *         Kazunori MIYAZAWA @USAGI
 *         Kunihiro Ishiguro <kunihiro@ipinfusion.com>
 *                 IPv6 support
 *         Kazunori MIYAZAWA @USAGI
 *         YOSHIFUJI Hideaki
 *                 Split up af-specific portion
 *        Derek Atkins <derek@ihtfp.com>                Add the post_input processor
 *
 */

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/cpu.h>
#include <linux/audit.h>
#include <linux/rhashtable.h>
#include <linux/if_tunnel.h>
#include <linux/icmp.h>
#include <net/dst.h>
#include <net/flow.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/gre.h>
#if IS_ENABLED(CONFIG_IPV6_MIP6)
#include <net/mip6.h>
#endif
#ifdef CONFIG_XFRM_STATISTICS
#include <net/snmp.h>
#endif
#ifdef CONFIG_XFRM_ESPINTCP
#include <net/espintcp.h>
#endif

#include "xfrm_hash.h"

/* Lower and upper bounds of a queue timeout, expressed in jiffies: HZ/10 is
 * a tenth of a second, 60*HZ is one minute.  The users of these limits are
 * outside this part of the file - presumably the policy hold-queue timer
 * (polq.hold_timer); confirm at the use sites.
 */
#define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ/10))
#define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ))
/* Presumably the maximum number of packets kept on a policy hold queue -
 * confirm at the use site.
 */
#define XFRM_MAX_QUEUE_LEN        100

/* Pairs a dst entry with a set of flag bits.  The consumers of this struct
 * are not visible in this part of the file; judging by the field name,
 * dst_orig presumably is the original (pre-transformation) route - confirm.
 */
struct xfrm_flo {
        struct dst_entry *dst_orig;
        u8 flags;
};

/* Per-family prefix-length thresholds: prefixes shorter than this are stored
 * in lists, not trees.  (Presumably measured in bits of the address prefix -
 * confirm at the use sites.)
 */
#define INEXACT_PREFIXLEN_IPV4        16
#define INEXACT_PREFIXLEN_IPV6        48

/* One node of an inexact-policy search tree (see the tree diagram below in
 * this file).  A node is keyed by an address plus prefix length and carries
 * both a nested tree and a list of the policies that match it.
 */
struct xfrm_pol_inexact_node {
        /* linkage into the rb-tree this node lives in */
        struct rb_node node;
        union {
                /* address this node is keyed on */
                xfrm_address_t addr;
                /* shares storage with addr; presumably only used once the
                 * node is being freed via RCU and addr is no longer needed -
                 * confirm against the free path.
                 */
                struct rcu_head rcu;
        };
        /* prefix length that goes with addr */
        u8 prefixlen;

        /* nested tree; per the diagram it is sorted by saddr/prefix for
         * nodes of the daddr tree and unused otherwise
         */
        struct rb_root root;

        /* the policies matching this node, can be empty list */
        struct hlist_head hhead;
};

/* xfrm inexact policy search tree:
 * xfrm_pol_inexact_bin = hash(dir,type,family,if_id);
 *  |
 * +---- root_d: sorted by daddr:prefix
 * |                 |
 * |        xfrm_pol_inexact_node
 * |                 |
 * |                 +- root: sorted by saddr/prefix
 * |                 |              |
 * |                 |         xfrm_pol_inexact_node
 * |                 |              |
 * |                 |              + root: unused
 * |                 |              |
 * |                 |              + hhead: saddr:daddr policies
 * |                 |
 * |                 +- coarse policies and all any:daddr policies
 * |
 * +---- root_s: sorted by saddr:prefix
 * |                 |
 * |        xfrm_pol_inexact_node
 * |                 |
 * |                 + root: unused
 * |                 |
 * |                 + hhead: saddr:any policies
 * |
 * +---- coarse policies and all any:any policies
 *
 * Lookups return four candidate lists:
 * 1. any:any list from top-level xfrm_pol_inexact_bin
 * 2. any:daddr list from daddr tree
 * 3. saddr:daddr list from 2nd level daddr tree
 * 4. saddr:any list from saddr tree
 *
 * This result set then needs to be searched for the policy with
 * the lowest priority.  If two results have same prio, youngest one wins.
 */

/* Key that identifies one xfrm_pol_inexact_bin.  The search-tree description
 * above states that bins are selected by hash(dir,type,family,if_id); the key
 * additionally records the network namespace.
 */
struct xfrm_pol_inexact_key {
        /* network namespace the bin belongs to */
        possible_net_t net;
        /* interface id part of the key */
        u32 if_id;
        /* address family part of the key */
        u16 family;
        /* policy direction and policy type parts of the key */
        u8 dir, type;
};

/* Top-level container of the inexact-policy search structure for one key
 * (see the tree diagram above): an any:any policy list plus one tree sorted
 * by destination and one sorted by source address.
 */
struct xfrm_pol_inexact_bin {
        /* identifies this bin */
        struct xfrm_pol_inexact_key k;
        /* rhashtable linkage; presumably into xfrm_policy_inexact_table -
         * confirm against the insert path
         */
        struct rhash_head head;
        /* list containing '*:*' policies */
        struct hlist_head hhead;

        /* sequence counter; presumably lets lockless readers detect
         * concurrent changes to the trees below - confirm against users
         */
        seqcount_spinlock_t count;
        /* tree sorted by daddr/prefix */
        struct rb_root root_d;

        /* tree sorted by saddr/prefix */
        struct rb_root root_s;

        /* slow path below */
        /* linkage into a list of bins (list head not visible here) */
        struct list_head inexact_bins;
        /* for deferred freeing via RCU */
        struct rcu_head rcu;
};

/* Index of each candidate list in struct xfrm_pol_inexact_candidates.
 * Going by the names, these appear to correspond to the four candidate lists
 * described in the search-tree comment above - confirm against
 * xfrm_policy_find_inexact_candidates().
 */
enum xfrm_pol_inexact_candidate_type {
        XFRM_POL_CAND_BOTH,     /* presumably the saddr:daddr list */
        XFRM_POL_CAND_SADDR,    /* presumably the saddr:any list */
        XFRM_POL_CAND_DADDR,    /* presumably the any:daddr list */
        XFRM_POL_CAND_ANY,      /* presumably the any:any list */

        /* number of candidate lists; used as the array size below */
        XFRM_POL_CAND_MAX,
};

/* Result set of an inexact lookup: one list-head pointer per
 * enum xfrm_pol_inexact_candidate_type value.
 */
struct xfrm_pol_inexact_candidates {
        struct hlist_head *res[XFRM_POL_CAND_MAX];
};

/* Collection of flow-dissector keys: basic and control information, IPv4 or
 * IPv6 addresses, IP header fields, ICMP, ports and a GRE key id.
 * Presumably the target container filled in through xfrm_session_dissector -
 * confirm where the dissector is initialised.
 */
struct xfrm_flow_keys {
        struct flow_dissector_key_basic basic;
        struct flow_dissector_key_control control;
        /* only one address family is present at a time */
        union {
                struct flow_dissector_key_ipv4_addrs ipv4;
                struct flow_dissector_key_ipv6_addrs ipv6;
        } addrs;
        struct flow_dissector_key_ip ip;
        struct flow_dissector_key_icmp icmp;
        struct flow_dissector_key_ports ports;
        struct flow_dissector_key_keyid gre;
};

/* Flow dissector instance; set up elsewhere in this file (not visible here). */
static struct flow_dissector xfrm_session_dissector __ro_after_init;

/* RCU-managed callback pointer, read via xfrm_if_get_cb().  The spinlock
 * presumably serialises writers - confirm at the (un)register functions.
 */
static DEFINE_SPINLOCK(xfrm_if_cb_lock);
static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;

/* Per-address-family operations, indexed by family (AF_INET6 is the highest
 * valid index) and read under RCU by xfrm_policy_get_afinfo().  The spinlock
 * presumably serialises writers - confirm at the (un)register functions.
 */
static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
                                                __read_mostly;

/* Slab cache; judging by the name presumably for struct xfrm_dst - confirm. */
static struct kmem_cache *xfrm_dst_cache __ro_after_init;

/* Hash table for the inexact-policy bins and its parameters.  The parameter
 * object is only declared here; its initialiser is expected further down in
 * this file.
 */
static struct rhashtable xfrm_policy_inexact_table;
static const struct rhashtable_params xfrm_pol_inexact_params;

/* Forward declarations of file-local helpers that are used before their
 * definitions further down in this file.
 */

/* bundle handling */
static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr);
static int stale_bundle(struct dst_entry *dst);
static int xfrm_bundle_ok(struct xfrm_dst *xdst);
/* timer callback installed on polq.hold_timer by xfrm_policy_alloc() */
static void xfrm_policy_queue_process(struct timer_list *t);

/* policy list linkage */
static void __xfrm_policy_link(struct xfrm_policy *pol, int dir);
static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
                                                int dir);

/* inexact-policy bin lookup, insertion and candidate search */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir,
                           u32 if_id);

static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup_rcu(struct net *net,
                               u8 type, u16 family, u8 dir, u32 if_id);
static struct xfrm_policy *
xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy,
                        bool excl);
static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
                                            struct xfrm_policy *policy);

static bool
xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
                                    struct xfrm_pol_inexact_bin *b,
                                    const xfrm_address_t *saddr,
                                    const xfrm_address_t *daddr);

/* Try to take a reference on @policy.
 *
 * Returns true when the reference count was non-zero and has been
 * incremented, false when it had already dropped to zero (in which case no
 * reference is taken).
 */
static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy)
{
        bool acquired = refcount_inc_not_zero(&policy->refcnt);

        return acquired;
}

static inline bool
__xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
{
        const struct flowi4 *fl4 = &fl->u.ip4;

        return  addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d) &&
                addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s) &&
                !((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) &&
                !((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) &&
                (fl4->flowi4_proto == sel->proto || !sel->proto) &&
                (fl4->flowi4_oif == sel->ifindex || !sel->ifindex);
}

static inline bool
__xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
{
        const struct flowi6 *fl6 = &fl->u.ip6;

        return  addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d) &&
                addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s) &&
                !((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask) &&
                !((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask) &&
                (fl6->flowi6_proto == sel->proto || !sel->proto) &&
                (fl6->flowi6_oif == sel->ifindex || !sel->ifindex);
}

/* Match flow @fl against selector @sel for address family @family.
 *
 * Dispatches to the IPv4 or IPv6 matcher; any other family never matches.
 */
bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl,
                         unsigned short family)
{
        if (family == AF_INET)
                return __xfrm4_selector_match(sel, fl);

        if (family == AF_INET6)
                return __xfrm6_selector_match(sel, fl);

        return false;
}

/* Look up the per-family policy operations for @family.
 *
 * Returns NULL if @family is out of range or nothing is registered for it.
 * On success the function returns with the RCU read lock still held; the
 * caller has to call rcu_read_unlock() once it is done with the result.
 * On failure the RCU read lock is not held on return.
 */
static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
{
        const struct xfrm_policy_afinfo *ops;

        if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
                return NULL;

        rcu_read_lock();
        ops = rcu_dereference(xfrm_policy_afinfo[family]);
        if (likely(ops))
                return ops;     /* RCU read lock intentionally kept */

        rcu_read_unlock();
        return NULL;
}

/* Fetch the current xfrm_if_cb pointer.
 *
 * Must be called with rcu_read_lock() held.  The result is whatever pointer
 * is currently published in xfrm_if_cb, which may be NULL.
 */
static const struct xfrm_if_cb *xfrm_if_get_cb(void)
{
        const struct xfrm_if_cb *cb = rcu_dereference(xfrm_if_cb);

        return cb;
}

/* Perform a route lookup through the dst_lookup callback of @family.
 *
 * Returns ERR_PTR(-EAFNOSUPPORT) when no operations are available for
 * @family; otherwise returns whatever the family's dst_lookup callback
 * returned.  The RCU read lock taken by xfrm_policy_get_afinfo() is released
 * before returning.
 */
struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif,
                                    const xfrm_address_t *saddr,
                                    const xfrm_address_t *daddr,
                                    int family, u32 mark)
{
        const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        struct dst_entry *route;

        if (unlikely(!afinfo))
                return ERR_PTR(-EAFNOSUPPORT);

        route = afinfo->dst_lookup(net, tos, oif, saddr, daddr, mark);
        rcu_read_unlock();

        return route;
}
EXPORT_SYMBOL(__xfrm_dst_lookup);

/* Route lookup for state @x.
 *
 * By default the lookup uses the state's own source (props.saddr) and
 * destination (id.daddr) addresses.  For state types flagged
 * XFRM_TYPE_LOCAL_COADDR the source becomes x->coaddr and the destination
 * becomes *@prev_daddr; for XFRM_TYPE_REMOTE_COADDR the source becomes
 * *@prev_saddr and the destination x->coaddr.  The remote check runs last,
 * so it wins if both flags are set.
 *
 * On a successful lookup the addresses actually used are copied back into
 * @prev_saddr / @prev_daddr (unless they already are those buffers).  On
 * failure the ERR_PTR is returned and the buffers are left untouched.
 */
static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x,
                                                int tos, int oif,
                                                xfrm_address_t *prev_saddr,
                                                xfrm_address_t *prev_daddr,
                                                int family, u32 mark)
{
        xfrm_address_t *src = &x->props.saddr;
        xfrm_address_t *dst_addr = &x->id.daddr;
        struct dst_entry *route;

        if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) {
                src = x->coaddr;
                dst_addr = prev_daddr;
        }
        if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) {
                src = prev_saddr;
                dst_addr = x->coaddr;
        }

        route = __xfrm_dst_lookup(xs_net(x), tos, oif, src, dst_addr,
                                  family, mark);
        if (IS_ERR(route))
                return route;

        /* remember the addresses that were used for this hop */
        if (prev_saddr != src)
                memcpy(prev_saddr, src, sizeof(*prev_saddr));
        if (prev_daddr != dst_addr)
                memcpy(prev_daddr, dst_addr, sizeof(*prev_daddr));

        return route;
}

/* Convert a number of seconds into jiffies, saturating at
 * MAX_SCHEDULE_TIMEOUT-1 when the multiplication by HZ would reach or exceed
 * that limit.
 */
static inline unsigned long make_jiffies(long secs)
{
        return (secs >= (MAX_SCHEDULE_TIMEOUT - 1) / HZ) ?
                MAX_SCHEDULE_TIMEOUT - 1 : secs * HZ;
}

/* Lifetime timer callback of a policy (installed by xfrm_policy_alloc()).
 *
 * Evaluates the four configured lifetimes (hard/soft x add/use, in seconds):
 *  - a hard lifetime that has run out deletes the policy and reports the
 *    expiry via km_policy_expired(..., 1, 0);
 *  - a soft lifetime that has run out only reports via
 *    km_policy_expired(..., 0, 0) and asks to be re-checked after
 *    XFRM_KM_TIMEOUT;
 *  - otherwise the timer is re-armed for the nearest pending deadline.
 *
 * Every exit path ends with xfrm_pol_put(); presumably this drops the
 * reference that was held on behalf of the timer that just fired - confirm
 * against the code that arms the timer.
 */
static void xfrm_policy_timer(struct timer_list *t)
{
        struct xfrm_policy *xp = from_timer(xp, t, timer);
        time64_t now = ktime_get_real_seconds();
        time64_t next = TIME64_MAX;     /* seconds until the nearest deadline */
        int warn = 0;                   /* set when a soft lifetime has run out */
        int dir;

        read_lock(&xp->lock);

        /* policy is already being torn down: nothing to evaluate */
        if (unlikely(xp->walk.dead))
                goto out;

        dir = xfrm_policy_id2dir(xp->index);

        /* hard limit counted from the time the policy was added */
        if (xp->lft.hard_add_expires_seconds) {
                time64_t tmo = xp->lft.hard_add_expires_seconds +
                        xp->curlft.add_time - now;
                if (tmo <= 0)
                        goto expired;
                if (tmo < next)
                        next = tmo;
        }
        /* hard limit counted from use_time, or from add_time while
         * use_time is still zero
         */
        if (xp->lft.hard_use_expires_seconds) {
                time64_t tmo = xp->lft.hard_use_expires_seconds +
                        (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now;
                if (tmo <= 0)
                        goto expired;
                if (tmo < next)
                        next = tmo;
        }
        /* soft limit counted from add time: only warn, then re-check
         * after XFRM_KM_TIMEOUT
         */
        if (xp->lft.soft_add_expires_seconds) {
                time64_t tmo = xp->lft.soft_add_expires_seconds +
                        xp->curlft.add_time - now;
                if (tmo <= 0) {
                        warn = 1;
                        tmo = XFRM_KM_TIMEOUT;
                }
                if (tmo < next)
                        next = tmo;
        }
        /* soft limit counted from use_time (or add_time if use_time is zero) */
        if (xp->lft.soft_use_expires_seconds) {
                time64_t tmo = xp->lft.soft_use_expires_seconds +
                        (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now;
                if (tmo <= 0) {
                        warn = 1;
                        tmo = XFRM_KM_TIMEOUT;
                }
                if (tmo < next)
                        next = tmo;
        }

        if (warn)
                km_policy_expired(xp, dir, 0, 0);
        /* Re-arm for the nearest deadline.  mod_timer() returning 0 means the
         * timer was not pending before, so the newly armed timer gets a
         * reference of its own.
         */
        if (next != TIME64_MAX &&
            !mod_timer(&xp->timer, jiffies + make_jiffies(next)))
                xfrm_pol_hold(xp);

out:
        read_unlock(&xp->lock);
        xfrm_pol_put(xp);
        return;

expired:
        /* the lock is dropped before the policy is deleted */
        read_unlock(&xp->lock);
        /* report the expiry only if the delete call returned 0 */
        if (!xfrm_policy_delete(xp, dir))
                km_policy_expired(xp, dir, 1, 0);
        xfrm_pol_put(xp);
}

/* Allocate and initialise a zeroed xfrm_policy for namespace @net, using
 * allocation flags @gfp.
 *
 * The new policy starts with a reference count of 1, empty list linkage, an
 * empty hold queue, and both timers set up (but not armed):
 * xfrm_policy_timer for the lifetime timer and xfrm_policy_queue_process for
 * the hold-queue timer.  Returns NULL if the allocation fails.
 *
 * As the original note says: not used here, it is supposed to be used by
 * pfkeyv2 SPD calls.
 */
struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
{
        struct xfrm_policy *xp = kzalloc(sizeof(struct xfrm_policy), gfp);

        if (!xp)
                return NULL;

        write_pnet(&xp->xp_net, net);
        INIT_LIST_HEAD(&xp->walk.all);
        INIT_HLIST_NODE(&xp->bydst_inexact_list);
        INIT_HLIST_NODE(&xp->bydst);
        INIT_HLIST_NODE(&xp->byidx);
        rwlock_init(&xp->lock);
        refcount_set(&xp->refcnt, 1);
        skb_queue_head_init(&xp->polq.hold_queue);
        timer_setup(&xp->timer, xfrm_policy_timer, 0);
        timer_setup(&xp->polq.hold_timer, xfrm_policy_queue_process, 0);

        return xp;
}
EXPORT_SYMBOL(xfrm_policy_alloc);

/* RCU callback queued by xfrm_policy_destroy(): releases the policy's
 * security context and then the policy memory itself.
 */
static void xfrm_policy_destroy_rcu(struct rcu_head *head)
{
        struct xfrm_policy *xp;

        xp = container_of(head, struct xfrm_policy, rcu);
        security_xfrm_policy_free(xp->security);
        kfree(xp);
}

/* Destroy xfrm_policy: descendant resources must have been released by this
 * point.
 *
 * The policy must already be marked dead (walk.dead) and neither of its
 * timers may still be pending; a violation of either condition is treated as
 * a fatal bug.  The memory is not freed directly but from an RCU callback
 * (xfrm_policy_destroy_rcu).
 */

void xfrm_policy_destroy(struct xfrm_policy *policy)
{
        BUG_ON(!policy->walk.dead);

        /* del_timer() returns nonzero only for a timer that was still
         * pending, which must not happen here
         */
        if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer))
                BUG();

        xfrm_dev_policy_free(policy);
        call_rcu(&policy->rcu, xfrm_policy_destroy_rcu);
}
EXPORT_SYMBOL(xfrm_policy_destroy);

/* Release descendant resources and announce the entry dead. The rule must
 * already be unlinked from the lists by the time this is called.
 *
 * NOTE(review): the historical comment also said "Rule must be locked";
 * this function takes policy->lock itself around the dead flag, so confirm
 * which outer lock (if any) callers are expected to hold.
 */

static void xfrm_policy_kill(struct xfrm_policy *policy)
{
        /* Flag the policy as dead under its own write lock. */
        write_lock_bh(&policy->lock);
        policy->walk.dead = 1;
        write_unlock_bh(&policy->lock);

        atomic_inc(&policy->genid);

        /* If the hold timer was still pending, drop one reference for it
         * (presumably the one taken when it was armed -- confirm), then
         * discard whatever is still sitting on the hold queue.
         */
        if (del_timer(&policy->polq.hold_timer))
                xfrm_pol_put(policy);
        skb_queue_purge(&policy->polq.hold_queue);

        /* Likewise for the lifetime timer: xfrm_policy_timer() takes a
         * reference when its mod_timer() call arms an idle timer.
         */
        if (del_timer(&policy->timer))
                xfrm_pol_put(policy);

        /* Finally drop one more reference on behalf of the caller. */
        xfrm_pol_put(policy);
}

/* Upper bound, in buckets, for the policy hash tables: the *_should_resize()
 * helpers only ask for growth while (hmask + 1) is below this value.
 */
static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;

/* Map policy @index to its bucket number in @net's byidx hash table. */
static inline unsigned int idx_hash(struct net *net, u32 index)
{
        unsigned int hmask = net->xfrm.policy_idx_hmask;

        return __idx_hash(index, hmask);
}

/* Fetch the destination/source prefix-length hash thresholds configured for
 * @family in direction @dir.  Families other than AF_INET and AF_INET6
 * yield 0 for both.
 */
static void __get_hash_thresh(struct net *net,
                              unsigned short family, int dir,
                              u8 *dbits, u8 *sbits)
{
        if (family == AF_INET) {
                *dbits = net->xfrm.policy_bydst[dir].dbits4;
                *sbits = net->xfrm.policy_bydst[dir].sbits4;
        } else if (family == AF_INET6) {
                *dbits = net->xfrm.policy_bydst[dir].dbits6;
                *sbits = net->xfrm.policy_bydst[dir].sbits6;
        } else {
                *dbits = 0;
                *sbits = 0;
        }
}

static struct hlist_head *policy_hash_bysel(struct net *net,
                                            const struct xfrm_selector *sel,
                                            unsigned short family, int dir)
{
        unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
        unsigned int hash;
        u8 dbits;
        u8 sbits;

        __get_hash_thresh(net, family, dir, &dbits, &sbits);
        hash = __sel_hash(sel, family, hmask, dbits, sbits);

        if (hash == hmask + 1)
                return NULL;

        return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
                     lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
}

static struct hlist_head *policy_hash_direct(struct net *net,
                                             const xfrm_address_t *daddr,
                                             const xfrm_address_t *saddr,
                                             unsigned short family, int dir)
{
        unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
        unsigned int hash;
        u8 dbits;
        u8 sbits;

        __get_hash_thresh(net, family, dir, &dbits, &sbits);
        hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits);

        return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
                     lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
}

/* Move every policy on the old bydst chain @list into the new table
 * @ndsttable, rehashing with @nhashmask and the thresholds of @dir.
 *
 * The chain is drained in passes.  Within a pass the first policy taken
 * (and any policy whose offload type is XFRM_DEV_OFFLOAD_PACKET) is put at
 * the head of its new bucket; following policies that hash to the same
 * bucket are linked right behind the previously moved one.  Policies that
 * hash elsewhere stay on @list and are handled by a later pass.
 */
static void xfrm_dst_hash_transfer(struct net *net,
                                   struct hlist_head *list,
                                   struct hlist_head *ndsttable,
                                   unsigned int nhashmask,
                                   int dir)
{
        /* entry0: last node moved in the current pass; h0: its new bucket */
        struct hlist_node *tmp, *entry0 = NULL;
        struct xfrm_policy *pol;
        unsigned int h0 = 0;
        u8 dbits;
        u8 sbits;

redo:
        hlist_for_each_entry_safe(pol, tmp, list, bydst) {
                unsigned int h;

                __get_hash_thresh(net, pol->family, dir, &dbits, &sbits);
                h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
                                pol->family, nhashmask, dbits, sbits);
                if (!entry0 || pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) {
                        /* start (or restart) a run at the head of bucket h */
                        hlist_del_rcu(&pol->bydst);
                        hlist_add_head_rcu(&pol->bydst, ndsttable + h);
                        h0 = h;
                } else {
                        /* different bucket: leave it for a later pass */
                        if (h != h0)
                                continue;
                        hlist_del_rcu(&pol->bydst);
                        hlist_add_behind_rcu(&pol->bydst, entry0);
                }
                entry0 = &pol->bydst;
        }
        /* anything skipped above is still on the old chain: go again */
        if (!hlist_empty(list)) {
                entry0 = NULL;
                goto redo;
        }
}

/* Re-link every policy on the old byidx chain @list into the bucket of
 * @nidxtable selected by @nhashmask.  Entries are not unlinked from the old
 * chain first; xfrm_byidx_resize() frees the old table afterwards.
 */
static void xfrm_idx_hash_transfer(struct hlist_head *list,
                                   struct hlist_head *nidxtable,
                                   unsigned int nhashmask)
{
        struct xfrm_policy *pol;
        struct hlist_node *next;

        hlist_for_each_entry_safe(pol, next, list, byidx) {
                struct hlist_head *bucket;

                bucket = nidxtable + __idx_hash(pol->index, nhashmask);
                hlist_add_head(&pol->byidx, bucket);
        }
}

/* Given the mask of a table with (old_hmask + 1) buckets, return the mask
 * for a table with twice as many buckets.
 */
static unsigned long xfrm_new_hash_mask(unsigned int old_hmask)
{
        unsigned int old_size = old_hmask + 1;
        unsigned int new_size = old_size << 1;

        return new_size - 1;
}

/* Double the bydst hash table of @net for direction @dir.
 *
 * The new table is allocated up front; if that fails the function returns
 * and the table keeps its current size.  The transfer and the pointer/mask
 * switch happen under xfrm_policy_lock inside a write section of
 * xfrm_policy_hash_generation.  The old table is freed only after
 * synchronize_rcu() has returned.
 */
static void xfrm_bydst_resize(struct net *net, int dir)
{
        unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
        unsigned int nhashmask = xfrm_new_hash_mask(hmask);
        unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
        struct hlist_head *ndst = xfrm_hash_alloc(nsize);
        struct hlist_head *odst;
        int i;

        if (!ndst)
                return;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);

        odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
                                lockdep_is_held(&net->xfrm.xfrm_policy_lock));

        /* rehash every old bucket into the new table */
        for (i = hmask; i >= 0; i--)
                xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir);

        /* publish the new table, then the mask that goes with it */
        rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst);
        net->xfrm.policy_bydst[dir].hmask = nhashmask;

        write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        /* wait for a grace period before releasing the old table */
        synchronize_rcu();

        xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
}

/* Double the byidx hash table of @net.
 *
 * Does nothing if the new table cannot be allocated.  Otherwise all chains
 * are transferred and the table pointer and mask are switched under
 * xfrm_policy_lock, after which the old table is freed.
 */
static void xfrm_byidx_resize(struct net *net)
{
        unsigned int old_mask = net->xfrm.policy_idx_hmask;
        unsigned int new_mask = xfrm_new_hash_mask(old_mask);
        unsigned int new_bytes = (new_mask + 1) * sizeof(struct hlist_head);
        struct hlist_head *old_tbl = net->xfrm.policy_byidx;
        struct hlist_head *new_tbl = xfrm_hash_alloc(new_bytes);
        int bucket;

        if (!new_tbl)
                return;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);

        bucket = old_mask;
        while (bucket >= 0) {
                xfrm_idx_hash_transfer(old_tbl + bucket, new_tbl, new_mask);
                bucket--;
        }

        net->xfrm.policy_byidx = new_tbl;
        net->xfrm.policy_idx_hmask = new_mask;

        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        xfrm_hash_free(old_tbl, (old_mask + 1) * sizeof(struct hlist_head));
}

/* Return 1 if the bydst table for @dir should grow: it is still below
 * xfrm_policy_hashmax buckets and holds more policies than its mask value.
 * When @total is non-NULL the policy count for @dir is added to it.
 */
static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total)
{
        unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
        unsigned int cnt = net->xfrm.policy_count[dir];

        if (total)
                *total += cnt;

        return (hmask + 1) < xfrm_policy_hashmax && cnt > hmask;
}

/* Return 1 if the byidx table should grow: it is still below
 * xfrm_policy_hashmax buckets and @total exceeds its mask value.
 */
static inline int xfrm_byidx_should_resize(struct net *net, int total)
{
        unsigned int hmask = net->xfrm.policy_idx_hmask;

        return (hmask + 1) < xfrm_policy_hashmax && total > hmask;
}

/* Fill @si with the per-direction policy counters of @net, the current
 * byidx hash mask and the hash size limit.
 */
void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
{
        const unsigned int *count = net->xfrm.policy_count;

        /* counters indexed directly by direction */
        si->incnt = count[XFRM_POLICY_IN];
        si->outcnt = count[XFRM_POLICY_OUT];
        si->fwdcnt = count[XFRM_POLICY_FWD];

        /* counters stored at direction + XFRM_POLICY_MAX */
        si->inscnt = count[XFRM_POLICY_IN + XFRM_POLICY_MAX];
        si->outscnt = count[XFRM_POLICY_OUT + XFRM_POLICY_MAX];
        si->fwdscnt = count[XFRM_POLICY_FWD + XFRM_POLICY_MAX];

        /* hash table state */
        si->spdhcnt = net->xfrm.policy_idx_hmask;
        si->spdhmcnt = xfrm_policy_hashmax;
}
EXPORT_SYMBOL(xfrm_spd_getinfo);

/* Held by the hash resize and hash rebuild work handlers while they run. */
static DEFINE_MUTEX(hash_resize_mutex);

/* Work handler: grow each per-direction bydst table that needs it, then the
 * byidx table if the summed policy count calls for it.
 */
static void xfrm_hash_resize(struct work_struct *work)
{
        struct net *net = container_of(work, struct net, xfrm.policy_hash_work);
        int total = 0;
        int dir;

        mutex_lock(&hash_resize_mutex);

        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
                if (!xfrm_bydst_should_resize(net, dir, &total))
                        continue;
                xfrm_bydst_resize(net, dir);
        }

        if (xfrm_byidx_should_resize(net, total))
                xfrm_byidx_resize(net);

        mutex_unlock(&hash_resize_mutex);
}

/* Make sure *pol can be inserted into fastbin.
 * Useful to check that later insert requests will be successful
 * (provided xfrm_policy_lock is held throughout).
 *
 * Looks up the bin keyed by (net, family, type, dir, if_id) and creates it
 * if it does not exist yet.  Returns the bin, or NULL when allocation or
 * hash table insertion fails.
 */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir)
{
        struct net *net = xp_net(pol);
        struct xfrm_pol_inexact_key key = {
                .family = pol->family,
                .type = pol->type,
                .dir = dir,
                .if_id = pol->if_id,
        };
        struct xfrm_pol_inexact_bin *bin, *existing;

        lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

        write_pnet(&key.net, net);

        bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &key,
                                     xfrm_pol_inexact_params);
        if (bin)
                return bin;

        bin = kzalloc(sizeof(*bin), GFP_ATOMIC);
        if (!bin)
                return NULL;

        bin->k = key;
        INIT_HLIST_HEAD(&bin->hhead);
        bin->root_d = RB_ROOT;
        bin->root_s = RB_ROOT;
        seqcount_spinlock_init(&bin->count, &net->xfrm.xfrm_policy_lock);

        existing = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table,
                                                    &bin->k, &bin->head,
                                                    xfrm_pol_inexact_params);
        if (existing) {
                /* not inserted: either an entry was already there or the
                 * insert failed with an error pointer
                 */
                kfree(bin);
                return IS_ERR(existing) ? NULL : existing;
        }

        list_add(&bin->inexact_bins, &net->xfrm.inexact_bins);
        return bin;
}

/* Return true if @addr is the any-address for @family, or if @prefixlen is
 * below the per-family INEXACT_PREFIXLEN_* limit.
 */
static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr,
                                               int family, u8 prefixlen)
{
        if (xfrm_addr_any(addr, family))
                return true;

        switch (family) {
        case AF_INET6:
                return prefixlen < INEXACT_PREFIXLEN_IPV6;
        case AF_INET:
                return prefixlen < INEXACT_PREFIXLEN_IPV4;
        default:
                return false;
        }
}

static bool
xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy)
{
        const xfrm_address_t *addr;
        bool saddr_any, daddr_any;
        u8 prefixlen;

        addr = &policy->selector.saddr;
        prefixlen = policy->selector.prefixlen_s;

        saddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
                                                       policy->family,
                                                       prefixlen);
        addr = &policy->selector.daddr;
        prefixlen = policy->selector.prefixlen_d;
        daddr_any = xfrm_pol_inexact_addr_use_any_list(addr,
                                                       policy->family,
                                                       prefixlen);
        return saddr_any && daddr_any;
}

/* Set the address prefix (@addr / @prefixlen) that @node stands for. */
static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node,
                                       const xfrm_address_t *addr, u8 prefixlen)
{
        node->prefixlen = prefixlen;
        node->addr = *addr;
}

/* Allocate a zeroed node (GFP_ATOMIC) for @addr / @prefixlen.
 * Returns NULL if the allocation fails.
 */
static struct xfrm_pol_inexact_node *
xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen)
{
        struct xfrm_pol_inexact_node *node = kzalloc(sizeof(*node), GFP_ATOMIC);

        if (!node)
                return NULL;

        xfrm_pol_inexact_node_init(node, addr, prefixlen);
        return node;
}

/* Compare the first @prefixlen bits of addresses @a and @b.
 *
 * Returns 0 if they are equal (or if @family is neither AF_INET nor
 * AF_INET6), a negative value if @a sorts before @b and a positive value
 * otherwise.  For AF_INET6 the whole 32-bit words are compared with
 * memcmp() and only the remaining bits of the next word are masked.
 */
static int xfrm_policy_addr_delta(const xfrm_address_t *a,
                                  const xfrm_address_t *b,
                                  u8 prefixlen, u16 family)
{
        unsigned int full_words, rem_bits;
        u32 mask, lhs, rhs;
        int cmp;

        if (family == AF_INET) {
                if (prefixlen == 0)
                        return 0;

                mask = ~0U << (32 - prefixlen);
                lhs = ntohl(a->a4) & mask;
                rhs = ntohl(b->a4) & mask;
                if (lhs == rhs)
                        return 0;
                return lhs < rhs ? -1 : 1;
        }

        if (family != AF_INET6)
                return 0;

        full_words = prefixlen >> 5;
        rem_bits = prefixlen & 0x1f;

        if (full_words) {
                cmp = memcmp(a->a6, b->a6, full_words << 2);
                if (cmp)
                        return cmp;
        }

        if (!rem_bits)
                return 0;

        mask = ~0U << (32 - rem_bits);
        lhs = ntohl(a->a6[full_words]) & mask;
        rhs = ntohl(b->a6[full_words]) & mask;
        if (lhs == rhs)
                return 0;
        return lhs < rhs ? -1 : 1;
}

/* Link every live policy flagged bydst_reinsert onto node @n's policy list.
 *
 * net->xfrm.policy_all is walked in reverse.  Each flagged policy has the
 * flag cleared and is placed behind the last list entry with a lower
 * priority, or with equal priority and a lower pos; packet offload policies
 * and policies without such a predecessor go to the head of the list.
 * The remainder of the loop body only performs sanity checks.
 */
static void xfrm_policy_inexact_list_reinsert(struct net *net,
                                              struct xfrm_pol_inexact_node *n,
                                              u16 family)
{
        unsigned int matched_s, matched_d;
        struct xfrm_policy *policy, *p;

        matched_s = 0;
        matched_d = 0;

        list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
                struct hlist_node *newpos = NULL;
                bool matches_s, matches_d;

                if (policy->walk.dead || !policy->bydst_reinsert)
                        continue;

                WARN_ON_ONCE(policy->family != family);

                policy->bydst_reinsert = false;
                /* find the last entry this policy has to be placed behind */
                hlist_for_each_entry(p, &n->hhead, bydst) {
                        if (policy->priority > p->priority)
                                newpos = &p->bydst;
                        else if (policy->priority == p->priority &&
                                 policy->pos > p->pos)
                                newpos = &p->bydst;
                        else
                                break;
                }

                if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET)
                        hlist_add_behind_rcu(&policy->bydst, newpos);
                else
                        hlist_add_head_rcu(&policy->bydst, &n->hhead);

                /* paranoia checks follow.
                 * Check that the reinserted policy matches at least
                 * saddr or daddr for current node prefix.
                 *
                 * Matching both is fine, matching saddr in one policy
                 * (but not daddr) and then matching only daddr in another
                 * is a bug.
                 */
                matches_s = xfrm_policy_addr_delta(&policy->selector.saddr,
                                                   &n->addr,
                                                   n->prefixlen,
                                                   family) == 0;
                matches_d = xfrm_policy_addr_delta(&policy->selector.daddr,
                                                   &n->addr,
                                                   n->prefixlen,
                                                   family) == 0;
                if (matches_s && matches_d)
                        continue;

                WARN_ON_ONCE(!matches_s && !matches_d);
                if (matches_s)
                        matched_s++;
                if (matches_d)
                        matched_d++;
                WARN_ON_ONCE(matched_s && matched_d);
        }
}

/* Insert node @n into the rb-tree @new.
 *
 * Nodes are compared on the shorter of the two prefix lengths.  If an
 * existing node compares equal, @n's policies are moved onto that node,
 * the node takes the shorter prefix length and @n is freed.  When the two
 * prefix lengths differed, the existing node is additionally erased from
 * the tree and inserted again from the root.
 */
static void xfrm_policy_inexact_node_reinsert(struct net *net,
                                              struct xfrm_pol_inexact_node *n,
                                              struct rb_root *new,
                                              u16 family)
{
        struct xfrm_pol_inexact_node *node;
        struct rb_node **p, *parent;

        /* we should not have another subtree here */
        WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root));
restart:
        parent = NULL;
        p = &new->rb_node;
        while (*p) {
                u8 prefixlen;
                int delta;

                parent = *p;
                node = rb_entry(*p, struct xfrm_pol_inexact_node, node);

                prefixlen = min(node->prefixlen, n->prefixlen);

                delta = xfrm_policy_addr_delta(&n->addr, &node->addr,
                                               prefixlen, family);
                if (delta < 0) {
                        p = &parent->rb_left;
                } else if (delta > 0) {
                        p = &parent->rb_right;
                } else {
                        bool same_prefixlen = node->prefixlen == n->prefixlen;
                        struct xfrm_policy *tmp;

                        /* flag and unlink n's policies so that the list
                         * reinsert below picks them up
                         */
                        hlist_for_each_entry(tmp, &n->hhead, bydst) {
                                tmp->bydst_reinsert = true;
                                hlist_del_rcu(&tmp->bydst);
                        }

                        node->prefixlen = prefixlen;

                        xfrm_policy_inexact_list_reinsert(net, node, family);

                        if (same_prefixlen) {
                                kfree_rcu(n, rcu);
                                return;
                        }

                        /* the existing node's prefix changed: take it out
                         * of the tree and insert it afresh in place of n
                         */
                        rb_erase(*p, new);
                        kfree_rcu(n, rcu);
                        n = node;
                        goto restart;
                }
        }

        rb_link_node_rcu(&n->node, parent, p);
        rb_insert_color(&n->node, new);
}

/* Merge node @v into node @n: every node of @v's subtree is moved into
 * @n's subtree and every policy on @v's list is re-linked onto @n's list.
 * @v itself is not freed here.
 */
static void xfrm_policy_inexact_node_merge(struct net *net,
                                           struct xfrm_pol_inexact_node *v,
                                           struct xfrm_pol_inexact_node *n,
                                           u16 family)
{
        struct xfrm_pol_inexact_node *child;
        struct xfrm_policy *pol;
        struct rb_node *first;

        /* The to-be-merged node v may have a subtree: dismantle it one
         * node at a time and insert each node into n->root.
         */
        for (first = rb_first(&v->root); first; first = rb_first(&v->root)) {
                child = rb_entry(first, struct xfrm_pol_inexact_node, node);
                rb_erase(&child->node, &v->root);
                xfrm_policy_inexact_node_reinsert(net, child, &n->root,
                                                  family);
        }

        /* flag and unlink v's policies; the call below re-links them */
        hlist_for_each_entry(pol, &v->hhead, bydst) {
                pol->bydst_reinsert = true;
                hlist_del_rcu(&pol->bydst);
        }

        xfrm_policy_inexact_list_reinsert(net, n, family);
}

/* Find or create the node in @root that covers @addr / @prefixlen.
 *
 * An existing node is returned as-is when @addr matches it under the
 * node's own prefix length and @prefixlen is at least that long.  Existing
 * nodes that fall inside the new, shorter prefix are erased from the tree:
 * the first one is re-initialised with the new prefix and reused, later
 * ones are merged into it and freed.  If no node could be reused a new one
 * is allocated.
 *
 * Returns the node, or NULL if a new node was needed and allocation failed.
 * @dir is not referenced by this function.
 */
static struct xfrm_pol_inexact_node *
xfrm_policy_inexact_insert_node(struct net *net,
                                struct rb_root *root,
                                xfrm_address_t *addr,
                                u16 family, u8 prefixlen, u8 dir)
{
        /* cached: erased node being reused for the new, shorter prefix */
        struct xfrm_pol_inexact_node *cached = NULL;
        struct rb_node **p, *parent = NULL;
        struct xfrm_pol_inexact_node *node;

        p = &root->rb_node;
        while (*p) {
                int delta;

                parent = *p;
                node = rb_entry(*p, struct xfrm_pol_inexact_node, node);

                delta = xfrm_policy_addr_delta(addr, &node->addr,
                                               node->prefixlen,
                                               family);
                if (delta == 0 && prefixlen >= node->prefixlen) {
                        WARN_ON_ONCE(cached); /* ipsec policies got lost */
                        return node;
                }

                if (delta < 0)
                        p = &parent->rb_left;
                else
                        p = &parent->rb_right;

                if (prefixlen < node->prefixlen) {
                        delta = xfrm_policy_addr_delta(addr, &node->addr,
                                                       prefixlen,
                                                       family);
                        if (delta)
                                continue;

                        /* This node is a subnet of the new prefix. It needs
                         * to be removed and re-inserted with the smaller
                         * prefix and all nodes that are now also covered
                         * by the reduced prefixlen.
                         */
                        rb_erase(&node->node, root);

                        if (!cached) {
                                xfrm_pol_inexact_node_init(node, addr,
                                                           prefixlen);
                                cached = node;
                        } else {
                                /* This node also falls within the new
                                 * prefixlen. Merge the to-be-reinserted
                                 * node and this one.
                                 */
                                xfrm_policy_inexact_node_merge(net, node,
                                                               cached, family);
                                kfree_rcu(node, rcu);
                        }

                        /* restart */
                        p = &root->rb_node;
                        parent = NULL;
                }
        }

        /* reuse the erased node if there is one, else allocate */
        node = cached;
        if (!node) {
                node = xfrm_pol_inexact_node_alloc(addr, prefixlen);
                if (!node)
                        return NULL;
        }

        rb_link_node_rcu(&node->node, parent, p);
        rb_insert_color(&node->node, root);

        return node;
}

/* Recursively remove and free every node of tree @r that has neither
 * policies on its list nor a non-empty subtree.  With @rm set, finding a
 * node that is still in use triggers a one-time warning.
 */
static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm)
{
        struct xfrm_pol_inexact_node *cur;
        struct rb_node *rn;

        for (rn = rb_first(r); rn; ) {
                cur = rb_entry(rn, struct xfrm_pol_inexact_node, node);

                /* clean the subtree first, and step to the successor
                 * before the current node may be erased
                 */
                xfrm_policy_inexact_gc_tree(&cur->root, rm);
                rn = rb_next(rn);

                if (hlist_empty(&cur->hhead) && RB_EMPTY_ROOT(&cur->root)) {
                        rb_erase(&cur->node, r);
                        kfree_rcu(cur, rcu);
                        continue;
                }

                WARN_ON_ONCE(rm);
        }
}

/* Garbage-collect both trees of bin @b and, if the bin is then completely
 * empty, remove it from the hash table and the per-netns list and free it
 * via RCU.  With @net_exit set, a bin that is still in use triggers a
 * one-time warning.
 */
static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit)
{
        bool in_use;

        write_seqcount_begin(&b->count);
        xfrm_policy_inexact_gc_tree(&b->root_d, net_exit);
        xfrm_policy_inexact_gc_tree(&b->root_s, net_exit);
        write_seqcount_end(&b->count);

        in_use = !RB_EMPTY_ROOT(&b->root_d) ||
                 !RB_EMPTY_ROOT(&b->root_s) ||
                 !hlist_empty(&b->hhead);
        if (in_use) {
                WARN_ON_ONCE(net_exit);
                return;
        }

        if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head,
                                   xfrm_pol_inexact_params) != 0)
                return;

        list_del(&b->inexact_bins);
        kfree_rcu(b, rcu);
}

/* Prune bin @b while holding the xfrm_policy_lock of the netns it belongs to. */
static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b)
{
        struct net *bin_net;

        bin_net = read_pnet(&b->k.net);

        spin_lock_bh(&bin_net->xfrm.xfrm_policy_lock);
        __xfrm_policy_inexact_prune_bin(b, false);
        spin_unlock_bh(&bin_net->xfrm.xfrm_policy_lock);
}

static void __xfrm_policy_inexact_flush(struct net *net)
{
        struct xfrm_pol_inexact_bin *bin, *t;

        lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

        list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins)
                __xfrm_policy_inexact_prune_bin(bin, false);
}

/* Pick (creating tree nodes as needed) the list inside @bin that @policy
 * belongs on:
 *   - both addresses wildcard:  the bin's own list;
 *   - only daddr wildcard:      a node in the saddr tree;
 *   - only saddr wildcard:      a node in the daddr tree;
 *   - neither wildcard:         a saddr node below the daddr node.
 *
 * Tree updates are bracketed by the bin's seqcount.  Returns NULL if a node
 * could not be allocated.  The caller must hold xfrm_policy_lock.
 */
static struct hlist_head *
xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin,
                                struct xfrm_policy *policy, u8 dir)
{
        struct xfrm_selector *sel = &policy->selector;
        struct net *net = xp_net(policy);
        struct xfrm_pol_inexact_node *n;
        bool saddr_any, daddr_any;

        lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

        saddr_any = xfrm_pol_inexact_addr_use_any_list(&sel->saddr,
                                                       policy->family,
                                                       sel->prefixlen_s);
        daddr_any = xfrm_pol_inexact_addr_use_any_list(&sel->daddr,
                                                       policy->family,
                                                       sel->prefixlen_d);

        if (saddr_any && daddr_any)
                return &bin->hhead;

        if (daddr_any) {
                /* only saddr is fixed */
                write_seqcount_begin(&bin->count);
                n = xfrm_policy_inexact_insert_node(net, &bin->root_s,
                                                    &sel->saddr,
                                                    policy->family,
                                                    sel->prefixlen_s, dir);
                write_seqcount_end(&bin->count);

                return n ? &n->hhead : NULL;
        }

        /* daddr is fixed */
        write_seqcount_begin(&bin->count);
        n = xfrm_policy_inexact_insert_node(net, &bin->root_d,
                                            &sel->daddr,
                                            policy->family,
                                            sel->prefixlen_d, dir);
        write_seqcount_end(&bin->count);
        if (!n)
                return NULL;

        /* saddr is wildcard */
        if (saddr_any)
                return &n->hhead;

        /* both fixed: descend into the daddr node's own tree */
        write_seqcount_begin(&bin->count);
        n = xfrm_policy_inexact_insert_node(net, &n->root,
                                            &sel->saddr,
                                            policy->family,
                                            sel->prefixlen_s, dir);
        write_seqcount_end(&bin->count);

        return n ? &n->hhead : NULL;
}

/* Insert @policy into the inexact structures for direction @dir.
 *
 * Returns whatever xfrm_policy_insert_list() returned (possibly NULL) on
 * success, ERR_PTR(-ENOMEM) if the bin or the chain could not be set up,
 * and ERR_PTR(-EEXIST) if @excl is set and xfrm_policy_insert_list()
 * returned non-NULL.  The bin is pruned on the chain-allocation failure
 * path and whenever xfrm_policy_insert_list() returned non-NULL.
 */
static struct xfrm_policy *
xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl)
{
        struct xfrm_pol_inexact_bin *bin;
        struct xfrm_policy *replaced;
        struct hlist_head *bin_chain;
        struct net *net;

        bin = xfrm_policy_inexact_alloc_bin(policy, dir);
        if (!bin)
                return ERR_PTR(-ENOMEM);

        net = xp_net(policy);
        lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

        bin_chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir);
        if (!bin_chain) {
                __xfrm_policy_inexact_prune_bin(bin, false);
                return ERR_PTR(-ENOMEM);
        }

        replaced = xfrm_policy_insert_list(bin_chain, policy, excl);
        if (replaced && excl) {
                __xfrm_policy_inexact_prune_bin(bin, false);
                return ERR_PTR(-EEXIST);
        }

        /* also link the policy on the per-direction inexact list */
        xfrm_policy_insert_inexact_list(&net->xfrm.policy_inexact[dir],
                                        policy);

        if (replaced)
                __xfrm_policy_inexact_prune_bin(bin, false);

        return replaced;
}

/* Work handler: rebuild the bydst hash tables and the inexact structures of
 * a netns after the selector prefix-length thresholds changed.
 *
 * Runs under hash_resize_mutex.  The new thresholds are sampled under the
 * policy_hthresh seqlock; everything after that happens with
 * xfrm_policy_lock held, inside a write section of
 * xfrm_policy_hash_generation:
 *   1. pre-allocate inexact bins/chains so that later inserts cannot fail;
 *   2. unlink every policy from the bydst tables and the inexact lists and
 *      store the new per-direction thresholds;
 *   3. re-insert all non-socket policies in order of creation.
 * If step 1 fails nothing has been unlinked yet and the rebuild is skipped.
 */
static void xfrm_hash_rebuild(struct work_struct *work)
{
        struct net *net = container_of(work, struct net,
                                       xfrm.policy_hthresh.work);
        unsigned int hmask;
        struct xfrm_policy *pol;
        struct xfrm_policy *policy;
        struct hlist_head *chain;
        struct hlist_head *odst;
        struct hlist_node *newpos;
        int i;
        int dir;
        unsigned seq;
        u8 lbits4, rbits4, lbits6, rbits6;

        mutex_lock(&hash_resize_mutex);

        /* read selector prefixlen thresholds */
        do {
                seq = read_seqbegin(&net->xfrm.policy_hthresh.lock);

                lbits4 = net->xfrm.policy_hthresh.lbits4;
                rbits4 = net->xfrm.policy_hthresh.rbits4;
                lbits6 = net->xfrm.policy_hthresh.lbits6;
                rbits6 = net->xfrm.policy_hthresh.rbits6;
        } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);

        /* make sure that we can insert the indirect policies again before
         * we start with destructive action.
         *
         * NOTE(review): policies whose prefix lengths are below the new
         * thresholds are skipped here and bins are pre-allocated for the
         * rest.  Confirm this is the intended set, given that the re-insert
         * loop below takes the inexact path whenever policy_hash_bysel()
         * returns NULL.
         */
        list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) {
                struct xfrm_pol_inexact_bin *bin;
                u8 dbits, sbits;

                if (policy->walk.dead)
                        continue;

                dir = xfrm_policy_id2dir(policy->index);
                if (dir >= XFRM_POLICY_MAX)
                        continue;

                if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
                        if (policy->family == AF_INET) {
                                dbits = rbits4;
                                sbits = lbits4;
                        } else {
                                dbits = rbits6;
                                sbits = lbits6;
                        }
                } else {
                        if (policy->family == AF_INET) {
                                dbits = lbits4;
                                sbits = rbits4;
                        } else {
                                dbits = lbits6;
                                sbits = rbits6;
                        }
                }

                if (policy->selector.prefixlen_d < dbits ||
                    policy->selector.prefixlen_s < sbits)
                        continue;

                bin = xfrm_policy_inexact_alloc_bin(policy, dir);
                if (!bin)
                        goto out_unlock;

                if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir))
                        goto out_unlock;
        }

        /* reset the bydst and inexact table in all directions */
        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
                struct hlist_node *n;

                hlist_for_each_entry_safe(policy, n,
                                          &net->xfrm.policy_inexact[dir],
                                          bydst_inexact_list) {
                        hlist_del_rcu(&policy->bydst);
                        hlist_del_init(&policy->bydst_inexact_list);
                }

                hmask = net->xfrm.policy_bydst[dir].hmask;
                /* The table pointer is RCU-managed; xfrm_policy_lock is
                 * held here, so use the update-side accessor as
                 * xfrm_bydst_resize() does.
                 */
                odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
                                lockdep_is_held(&net->xfrm.xfrm_policy_lock));
                for (i = hmask; i >= 0; i--) {
                        hlist_for_each_entry_safe(policy, n, odst + i, bydst)
                                hlist_del_rcu(&policy->bydst);
                }
                if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
                        /* dir out => dst = remote, src = local */
                        net->xfrm.policy_bydst[dir].dbits4 = rbits4;
                        net->xfrm.policy_bydst[dir].sbits4 = lbits4;
                        net->xfrm.policy_bydst[dir].dbits6 = rbits6;
                        net->xfrm.policy_bydst[dir].sbits6 = lbits6;
                } else {
                        /* dir in/fwd => dst = local, src = remote */
                        net->xfrm.policy_bydst[dir].dbits4 = lbits4;
                        net->xfrm.policy_bydst[dir].sbits4 = rbits4;
                        net->xfrm.policy_bydst[dir].dbits6 = lbits6;
                        net->xfrm.policy_bydst[dir].sbits6 = rbits6;
                }
        }

        /* re-insert all policies by order of creation */
        list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
                if (policy->walk.dead)
                        continue;
                dir = xfrm_policy_id2dir(policy->index);
                if (dir >= XFRM_POLICY_MAX) {
                        /* skip socket policies */
                        continue;
                }
                newpos = NULL;
                chain = policy_hash_bysel(net, &policy->selector,
                                          policy->family, dir);

                if (!chain) {
                        /* no exact hash chain: take the inexact path */
                        void *p = xfrm_policy_inexact_insert(policy, dir, 0);

                        WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p));
                        continue;
                }

                /* place behind the last entry whose priority is not higher */
                hlist_for_each_entry(pol, chain, bydst) {
                        if (policy->priority >= pol->priority)
                                newpos = &pol->bydst;
                        else
                                break;
                }
                if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET)
                        hlist_add_behind_rcu(&policy->bydst, newpos);
                else
                        hlist_add_head_rcu(&policy->bydst, chain);
        }

out_unlock:
        /* release any bins that ended up unused */
        __xfrm_policy_inexact_flush(net);
        write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        mutex_unlock(&hash_resize_mutex);
}

/* Hand net->xfrm.policy_hthresh.work to schedule_work().
 * The return value of schedule_work() is not inspected.
 */
void xfrm_policy_hash_rebuild(struct net *net)
{
        struct work_struct *rebuild_work = &net->xfrm.policy_hthresh.work;

        schedule_work(rebuild_work);
}
EXPORT_SYMBOL(xfrm_policy_hash_rebuild);

/* Generate a policy index that is not yet present in the byidx hash.
 *
 * A non-zero @index is tried first, exactly once.  After that (or when
 * @index is zero) candidates are taken from net->xfrm.idx_generator,
 * OR-ed with @dir; the generator advances by 8 per attempt.  A candidate
 * of 0 is replaced by 8.  The loop only ends once an unused value is found.
 *
 * Historical note: KAME seems to generate them ordered by cost of an
 * absolute unpredictability of ordering of rules. This will not pass.
 */
static u32 xfrm_gen_index(struct net *net, int dir, u32 index)
{
        struct hlist_head *bucket;
        struct xfrm_policy *pol;
        u32 candidate;
        bool in_use;

        do {
                if (index) {
                        /* caller-supplied value: consume it so it is tried once */
                        candidate = index;
                        index = 0;
                } else {
                        candidate = (net->xfrm.idx_generator | dir);
                        net->xfrm.idx_generator += 8;
                }

                if (!candidate)
                        candidate = 8;

                bucket = net->xfrm.policy_byidx + idx_hash(net, candidate);
                in_use = false;
                hlist_for_each_entry(pol, bucket, byidx) {
                        if (pol->index == candidate) {
                                in_use = true;
                                break;
                        }
                }
        } while (in_use);

        return candidate;
}

static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2)
{
        u32 *p1 = (u32 *) s1;
        u32 *p2 = (u32 *) s2;
        int len = sizeof(struct xfrm_selector) / sizeof(u32);
        int i;

        for (i = 0; i < len; i++) {
                if (p1[i] != p2[i])
                        return 1;
        }

        return 0;
}

static void xfrm_policy_requeue(struct xfrm_policy *old,
                                struct xfrm_policy *new)
{
        struct xfrm_policy_queue *pq = &old->polq;
        struct sk_buff_head list;

        if (skb_queue_empty(&pq->hold_queue))
                return;

        __skb_queue_head_init(&list);

        spin_lock_bh(&pq->hold_queue.lock);
        skb_queue_splice_init(&pq->hold_queue, &list);
        if (del_timer(&pq->hold_timer))
                xfrm_pol_put(old);
        spin_unlock_bh(&pq->hold_queue.lock);

        pq = &new->polq;

        spin_lock_bh(&pq->hold_queue.lock);
        skb_queue_splice(&list, &pq->hold_queue);
        pq->timeout = XFRM_QUEUE_TMO_MIN;
        if (!mod_timer(&pq->hold_timer, jiffies))
                xfrm_pol_hold(new);
        spin_unlock_bh(&pq->hold_queue.lock);
}

static inline bool xfrm_policy_mark_match(const struct xfrm_mark *mark,
                                          struct xfrm_policy *pol)
{
        return mark->v == pol->mark.v && mark->m == pol->mark.m;
}

/* Hash a struct xfrm_pol_inexact_key (passed as @data) for the rhashtable.
 *
 * type, dir and family are packed into one word; that word, if_id and the
 * per-netns hash mix are fed to jhash_3words() with @seed.  @len is unused.
 */
static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed)
{
        const struct xfrm_pol_inexact_key *key = data;
        u32 packed = key->type << 24 | key->dir << 16 | key->family;
        u32 ns_mix = net_hash_mix(read_pnet(&key->net));

        return jhash_3words(packed, key->if_id, ns_mix, seed);
}

/* Hash a stored struct xfrm_pol_inexact_bin (passed as @data) by hashing
 * its embedded key with xfrm_pol_bin_key().  @len is unused.
 */
static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed)
{
        const struct xfrm_pol_inexact_bin *bin = data;
        const struct xfrm_pol_inexact_key *key = &bin->k;

        return xfrm_pol_bin_key(key, 0, seed);
}

/* rhashtable compare callback: does bin @ptr carry the key in @arg->key?
 *
 * Returns 0 when netns, dir, type, family and if_id all match.  A netns
 * mismatch yields -1; any other mismatch yields the (non-zero) XOR of the
 * first differing field, checked in the order dir, type, family, if_id.
 */
static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg,
                            const void *ptr)
{
        const struct xfrm_pol_inexact_key *want = arg->key;
        const struct xfrm_pol_inexact_bin *bin = ptr;
        int diff;

        if (!net_eq(read_pnet(&bin->k.net), read_pnet(&want->net)))
                return -1;

        diff = bin->k.dir ^ want->dir;
        if (!diff)
                diff = bin->k.type ^ want->type;
        if (!diff)
                diff = bin->k.family ^ want->family;
        if (!diff)
                diff = bin->k.if_id ^ want->if_id;

        return diff;
}

/* rhashtable parameters for the inexact-policy bins: lookup keys are hashed
 * by xfrm_pol_bin_key(), stored bins by xfrm_pol_bin_obj() (through their
 * embedded key), and xfrm_pol_bin_cmp() decides whether a bin matches.
 */
static const struct rhashtable_params xfrm_pol_inexact_params = {
        .hashfn = xfrm_pol_bin_key,
        .obj_hashfn = xfrm_pol_bin_obj,
        .obj_cmpfn = xfrm_pol_bin_cmp,
        .head_offset = offsetof(struct xfrm_pol_inexact_bin, head),
        .automatic_shrinking = true,
};

/* Link @policy into @chain through its bydst_inexact_list node, then
 * renumber ->pos of every entry on the chain (0 for the head, counting up).
 *
 * While scanning, an entry with the same type, if_id, selector, mark and
 * security context as @policy is remembered in delpol.  In this function
 * delpol only influences where the scan stops; it is neither returned nor
 * unlinked here.
 */
static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
                                            struct xfrm_policy *policy)
{
        struct xfrm_policy *pol, *delpol = NULL;
        struct hlist_node *newpos = NULL;
        int i = 0;

        hlist_for_each_entry(pol, chain, bydst_inexact_list) {
                /* The last term warns if a second equivalent entry is found;
                 * in that case the entry is treated as a non-duplicate.
                 */
                if (pol->type == policy->type &&
                    pol->if_id == policy->if_id &&
                    !selector_cmp(&pol->selector, &policy->selector) &&
                    xfrm_policy_mark_match(&policy->mark, pol) &&
                    xfrm_sec_ctx_match(pol->security, policy->security) &&
                    !WARN_ON(delpol)) {
                        delpol = pol;
                        if (policy->priority > pol->priority)
                                continue;
                } else if (policy->priority >= pol->priority) {
                        /* candidate insertion point: insert after this entry */
                        newpos = &pol->bydst_inexact_list;
                        continue;
                }
                /* stop early only once the equivalent entry has been seen */
                if (delpol)
                        break;
        }

        /* XFRM_DEV_OFFLOAD_PACKET policies always go to the head of the chain */
        if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET)
                hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos);
        else
                hlist_add_head_rcu(&policy->bydst_inexact_list, chain);

        /* recompute the position of every entry after the insertion */
        hlist_for_each_entry(pol, chain, bydst_inexact_list) {
                pol->pos = i;
                i++;
        }
}

/* Link @policy into @chain through its bydst node.
 *
 * Returns the existing entry that has the same type, if_id, selector, mark
 * and security context as @policy (NULL if there is none); this function
 * does not unlink it.  When @excl is set and such an entry exists,
 * ERR_PTR(-EEXIST) is returned and nothing is inserted.
 */
static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
                                                   struct xfrm_policy *policy,
                                                   bool excl)
{
        struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL;

        hlist_for_each_entry(pol, chain, bydst) {
                /* The last term warns if a second equivalent entry is found;
                 * in that case the entry is treated as a non-duplicate.
                 */
                if (pol->type == policy->type &&
                    pol->if_id == policy->if_id &&
                    !selector_cmp(&pol->selector, &policy->selector) &&
                    xfrm_policy_mark_match(&policy->mark, pol) &&
                    xfrm_sec_ctx_match(pol->security, policy->security) &&
                    !WARN_ON(delpol)) {
                        if (excl)
                                return ERR_PTR(-EEXIST);
                        delpol = pol;
                        if (policy->priority > pol->priority)
                                continue;
                } else if (policy->priority >= pol->priority) {
                        /* candidate insertion point: insert after this entry */
                        newpos = pol;
                        continue;
                }
                /* stop early only once the equivalent entry has been seen */
                if (delpol)
                        break;
        }

        if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET)
                hlist_add_behind_rcu(&policy->bydst, &newpos->bydst);
        else
                /* Packet offload policies enter to the head
                 * to speed-up lookups.
                 */
                hlist_add_head_rcu(&policy->bydst, chain);

        return delpol;
}

/* Insert @policy for direction @dir into its network namespace.
 *
 * The policy goes either into the hash chain selected by
 * policy_hash_bysel() or, when that returns no chain, through
 * xfrm_policy_inexact_insert().  An equivalent policy reported by the
 * insertion helper (delpol) is replaced: its held packets are requeued to
 * @policy, it is unlinked, its index is reused, and it is killed after the
 * policy lock has been released.
 *
 * Returns 0 on success, or the error encoded by the insertion helper
 * (PTR_ERR of its result), in which case nothing else is changed here.
 */
int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
{
        struct net *net = xp_net(policy);
        struct xfrm_policy *delpol;
        struct hlist_head *chain;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
        if (chain)
                delpol = xfrm_policy_insert_list(chain, policy, excl);
        else
                delpol = xfrm_policy_inexact_insert(policy, dir, excl);

        if (IS_ERR(delpol)) {
                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                return PTR_ERR(delpol);
        }

        __xfrm_policy_link(policy, dir);

        /* After previous checking, family can either be AF_INET or AF_INET6 */
        if (policy->family == AF_INET)
                rt_genid_bump_ipv4(net);
        else
                rt_genid_bump_ipv6(net);

        if (delpol) {
                xfrm_policy_requeue(delpol, policy);
                __xfrm_policy_unlink(delpol, dir);
        }
        /* a replaced policy hands its index over; otherwise generate one */
        policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index);
        hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index));
        policy->curlft.add_time = ktime_get_real_seconds();
        policy->curlft.use_time = 0;
        /* arm the policy timer one second (HZ jiffies) from now; a zero
         * mod_timer() result is paired with taking a reference
         */
        if (!mod_timer(&policy->timer, jiffies + HZ))
                xfrm_pol_hold(policy);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        /* outside the lock: dispose of the replaced policy, or, if nothing
         * was replaced, check whether the bydst hash should be resized
         */
        if (delpol)
                xfrm_policy_kill(delpol);
        else if (xfrm_bydst_should_resize(net, dir, NULL))
                schedule_work(&net->xfrm.policy_hash_work);

        return 0;
}
EXPORT_SYMBOL(xfrm_policy_insert);

/* Scan @chain (linked through ->bydst) for the first policy whose type,
 * if_id, mark, selector and security context all match the arguments.
 *
 * Returns that policy, or NULL when @chain is NULL or nothing matches.
 * No reference is taken here.  @dir is not used by this helper.
 */
static struct xfrm_policy *
__xfrm_policy_bysel_ctx(struct hlist_head *chain, const struct xfrm_mark *mark,
                        u32 if_id, u8 type, int dir, struct xfrm_selector *sel,
                        struct xfrm_sec_ctx *ctx)
{
        struct xfrm_policy *cur;

        if (!chain)
                return NULL;

        hlist_for_each_entry(cur, chain, bydst) {
                if (cur->type != type || cur->if_id != if_id)
                        continue;
                if (!xfrm_policy_mark_match(mark, cur))
                        continue;
                if (selector_cmp(sel, &cur->selector))
                        continue;
                if (!xfrm_sec_ctx_match(ctx, cur->security))
                        continue;

                return cur;
        }

        return NULL;
}

/* Look up a policy by selector, mark, if_id, type and security context,
 * optionally deleting it.
 *
 * *@err is set to 0 on entry.  The lookup uses the chain returned by
 * policy_hash_bysel(); when there is none, the inexact bin for
 * (type, family, dir, if_id) is consulted and, among the candidate lists,
 * the match with the smallest ->pos wins.
 *
 * Returns the policy with a reference taken (xfrm_pol_hold), or NULL when
 * nothing matches.  With @delete set, security_xfrm_policy_delete() is
 * asked first: if it fails, *@err carries its result and the policy is
 * returned still linked (and still holding the reference); otherwise the
 * policy is unlinked and killed before being returned.
 */
struct xfrm_policy *
xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id,
                      u8 type, int dir, struct xfrm_selector *sel,
                      struct xfrm_sec_ctx *ctx, int delete, int *err)
{
        struct xfrm_pol_inexact_bin *bin = NULL;
        struct xfrm_policy *pol, *ret = NULL;
        struct hlist_head *chain;

        *err = 0;
        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        chain = policy_hash_bysel(net, sel, sel->family, dir);
        if (!chain) {
                /* no hash chain for this selector: search the inexact lists */
                struct xfrm_pol_inexact_candidates cand;
                int i;

                bin = xfrm_policy_inexact_lookup(net, type,
                                                 sel->family, dir, if_id);
                if (!bin) {
                        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                        return NULL;
                }

                if (!xfrm_policy_find_inexact_candidates(&cand, bin,
                                                         &sel->saddr,
                                                         &sel->daddr)) {
                        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                        return NULL;
                }

                pol = NULL;
                for (i = 0; i < ARRAY_SIZE(cand.res); i++) {
                        struct xfrm_policy *tmp;

                        tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark,
                                                      if_id, type, dir,
                                                      sel, ctx);
                        if (!tmp)
                                continue;

                        /* keep the match with the lowest position */
                        if (!pol || tmp->pos < pol->pos)
                                pol = tmp;
                }
        } else {
                pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir,
                                              sel, ctx);
        }

        if (pol) {
                xfrm_pol_hold(pol);
                if (delete) {
                        *err = security_xfrm_policy_delete(pol->security);
                        if (*err) {
                                /* deletion refused: return the held policy, still linked */
                                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                                return pol;
                        }
                        __xfrm_policy_unlink(pol, dir);
                }
                ret = pol;
        }
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        /* the remaining teardown runs without the policy lock held */
        if (ret && delete)
                xfrm_policy_kill(ret);
        if (bin && delete)
                xfrm_policy_inexact_prune_bin(bin);
        return ret;
}
EXPORT_SYMBOL(xfrm_policy_bysel_ctx);

/* Look up a policy by index @id, optionally deleting it.
 *
 * When the direction encoded in @id differs from @dir, *@err is set to
 * -ENOENT and NULL is returned.  Otherwise *@err starts at 0 and the byidx
 * hash chain is searched for an entry matching @type, @id, @if_id and
 * @mark.
 *
 * Returns the policy with a reference taken (xfrm_pol_hold), or NULL when
 * nothing matches.  With @delete set, security_xfrm_policy_delete() is
 * asked first: if it fails, *@err carries its result and the policy is
 * returned still linked (and still holding the reference); otherwise the
 * policy is unlinked and killed before being returned.
 */
struct xfrm_policy *
xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id,
                 u8 type, int dir, u32 id, int delete, int *err)
{
        struct xfrm_policy *cur, *found = NULL;
        struct hlist_head *bucket;

        if (xfrm_policy_id2dir(id) != dir) {
                *err = -ENOENT;
                return NULL;
        }
        *err = 0;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        bucket = net->xfrm.policy_byidx + idx_hash(net, id);
        hlist_for_each_entry(cur, bucket, byidx) {
                if (cur->type != type || cur->index != id ||
                    cur->if_id != if_id || !xfrm_policy_mark_match(mark, cur))
                        continue;

                xfrm_pol_hold(cur);
                if (delete) {
                        *err = security_xfrm_policy_delete(cur->security);
                        if (*err) {
                                /* deletion refused: return the held policy, still linked */
                                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                                return cur;
                        }
                        __xfrm_policy_unlink(cur, dir);
                }
                found = cur;
                break;
        }
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        if (found && delete)
                xfrm_policy_kill(found);
        return found;
}
EXPORT_SYMBOL(xfrm_policy_byid);

#ifdef CONFIG_SECURITY_NETWORK_XFRM
/* Ask security_xfrm_policy_delete() about every live, non-socket policy
 * of @type on net->xfrm.policy_all.
 *
 * Returns 0 when every check passes.  On the first failure an audit record
 * is emitted through xfrm_audit_policy_delete(pol, 0, task_valid) and the
 * error is returned.
 */
static inline int
xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
{
        struct xfrm_policy *pol;
        int rc;

        list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
                if (pol->walk.dead)
                        continue;
                if (xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX)
                        continue;
                if (pol->type != type)
                        continue;

                rc = security_xfrm_policy_delete(pol->security);
                if (rc) {
                        xfrm_audit_policy_delete(pol, 0, task_valid);
                        return rc;
                }
        }

        return 0;
}

/* Ask security_xfrm_policy_delete() about every live, non-socket policy
 * on net->xfrm.policy_all whose xdo.dev is @dev.
 *
 * Returns 0 when every check passes.  On the first failure an audit record
 * is emitted through xfrm_audit_policy_delete(pol, 0, task_valid) and the
 * error is returned.
 */
static inline int xfrm_dev_policy_flush_secctx_check(struct net *net,
                                                     struct net_device *dev,
                                                     bool task_valid)
{
        struct xfrm_policy *pol;
        int rc;

        list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
                if (pol->walk.dead)
                        continue;
                if (xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX)
                        continue;
                if (pol->xdo.dev != dev)
                        continue;

                rc = security_xfrm_policy_delete(pol->security);
                if (rc) {
                        xfrm_audit_policy_delete(pol, 0, task_valid);
                        return rc;
                }
        }

        return 0;
}
#else
/* Variant built when CONFIG_SECURITY_NETWORK_XFRM is not defined:
 * performs no check and always returns 0.
 */
static inline int
xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid)
{
        return 0;
}

/* Variant built when CONFIG_SECURITY_NETWORK_XFRM is not defined:
 * performs no check and always returns 0.
 */
static inline int xfrm_dev_policy_flush_secctx_check(struct net *net,
                                                     struct net_device *dev,
                                                     bool task_valid)
{
        return 0;
}
#endif

/* Unlink and kill every live, non-socket policy of @type in @net.
 *
 * xfrm_policy_flush_secctx_check() runs first, under the policy lock; a
 * non-zero result aborts the flush and is returned.  Returns -ESRCH when
 * no policy was flushed, 0 otherwise.
 */
int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
{
        int dir, err = 0, cnt = 0;
        struct xfrm_policy *pol;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);

        err = xfrm_policy_flush_secctx_check(net, type, task_valid);
        if (err)
                goto out;

again:
        list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
                if (pol->walk.dead)
                        continue;

                dir = xfrm_policy_id2dir(pol->index);
                /* entries with dir >= XFRM_POLICY_MAX or another type are left alone */
                if (dir >= XFRM_POLICY_MAX ||
                    pol->type != type)
                        continue;

                __xfrm_policy_unlink(pol, dir);
                /* The lock is dropped for the per-policy teardown, so the
                 * walk restarts from the list head afterwards instead of
                 * continuing from this entry.
                 */
                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                xfrm_dev_policy_delete(pol);
                cnt++;
                xfrm_audit_policy_delete(pol, 1, task_valid);
                xfrm_policy_kill(pol);
                spin_lock_bh(&net->xfrm.xfrm_policy_lock);
                goto again;
        }
        if (cnt)
                __xfrm_policy_inexact_flush(net);
        else
                err = -ESRCH;
out:
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
        return err;
}
EXPORT_SYMBOL(xfrm_policy_flush);

/* Unlink and kill every live, non-socket policy in @net whose xdo.dev
 * is @dev.
 *
 * xfrm_dev_policy_flush_secctx_check() runs first, under the policy lock;
 * a non-zero result aborts the flush and is returned.  Returns -ESRCH when
 * no policy was flushed, 0 otherwise.
 */
int xfrm_dev_policy_flush(struct net *net, struct net_device *dev,
                          bool task_valid)
{
        int dir, err = 0, cnt = 0;
        struct xfrm_policy *pol;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);

        err = xfrm_dev_policy_flush_secctx_check(net, dev, task_valid);
        if (err)
                goto out;

again:
        list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) {
                if (pol->walk.dead)
                        continue;

                dir = xfrm_policy_id2dir(pol->index);
                /* entries with dir >= XFRM_POLICY_MAX or another device are left alone */
                if (dir >= XFRM_POLICY_MAX ||
                    pol->xdo.dev != dev)
                        continue;

                __xfrm_policy_unlink(pol, dir);
                /* The lock is dropped for the per-policy teardown, so the
                 * walk restarts from the list head afterwards instead of
                 * continuing from this entry.
                 */
                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
                xfrm_dev_policy_delete(pol);
                cnt++;
                xfrm_audit_policy_delete(pol, 1, task_valid);
                xfrm_policy_kill(pol);
                spin_lock_bh(&net->xfrm.xfrm_policy_lock);
                goto again;
        }
        if (cnt)
                __xfrm_policy_inexact_flush(net);
        else
                err = -ESRCH;
out:
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
        return err;
}
EXPORT_SYMBOL(xfrm_dev_policy_flush);

/* Call @func for policies on net->xfrm.policy_all, resumably.
 *
 * @walk carries the iteration state: the type filter, a running sequence
 * number, and a list node (walk->walk.all) that is parked inside
 * policy_all when @func stops the iteration, so a later call continues
 * from that point.
 *
 * Returns -EINVAL for an invalid walk->type, the non-zero value returned
 * by @func when it stops the walk, -ENOENT when the walk finishes with
 * walk->seq still 0, and 0 otherwise (including the early exit taken when
 * the walker node is unlinked but walk->seq is already non-zero).
 */
int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
                     int (*func)(struct xfrm_policy *, int, int, void*),
                     void *data)
{
        struct xfrm_policy *pol;
        struct xfrm_policy_walk_entry *x;
        int error = 0;

        if (walk->type >= XFRM_POLICY_TYPE_MAX &&
            walk->type != XFRM_POLICY_TYPE_ANY)
                return -EINVAL;

        /* walker not parked but entries were already reported: nothing to do */
        if (list_empty(&walk->walk.all) && walk->seq != 0)
                return 0;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        /* start at the head of policy_all, or at the entry that follows
         * the parked walker node
         */
        if (list_empty(&walk->walk.all))
                x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all);
        else
                x = list_first_entry(&walk->walk.all,
                                     struct xfrm_policy_walk_entry, all);

        list_for_each_entry_from(x, &net->xfrm.policy_all, all) {
                /* dead entries are skipped; xfrm_policy_walk_init() marks
                 * walker nodes dead
                 */
                if (x->dead)
                        continue;
                pol = container_of(x, struct xfrm_policy, walk);
                if (walk->type != XFRM_POLICY_TYPE_ANY &&
                    walk->type != pol->type)
                        continue;
                error = func(pol, xfrm_policy_id2dir(pol->index),
                             walk->seq, data);
                if (error) {
                        /* park the walker node right before the entry that
                         * was not consumed, so it is visited again next time
                         */
                        list_move_tail(&walk->walk.all, &x->all);
                        goto out;
                }
                walk->seq++;
        }
        if (walk->seq == 0) {
                error = -ENOENT;
                goto out;
        }
        /* walk completed: take the walker node out of policy_all */
        list_del_init(&walk->walk.all);
out:
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
        return error;
}
EXPORT_SYMBOL(xfrm_policy_walk);

/* Prepare @walk for a new iteration over policies of @type.
 *
 * The sequence counter starts at 0 and the embedded list node starts out
 * unlinked.  The node is marked dead so that list walkers testing ->dead
 * (as xfrm_policy_walk() does) skip it when it sits in the policy list.
 */
void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type)
{
        walk->seq = 0;
        walk->type = type;
        walk->walk.dead = 1;
        INIT_LIST_HEAD(&walk->walk.all);
}
EXPORT_SYMBOL(xfrm_policy_walk_init);

/* Finish a walk: if the walker's list node is still linked, remove it
 * under @net's policy lock.  An unlinked node needs no work and the lock
 * is not taken.
 */
void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net)
{
        if (!list_empty(&walk->walk.all)) {
                spin_lock_bh(&net->xfrm.xfrm_policy_lock);
                list_del(&walk->walk.all);
                spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
        }
}
EXPORT_SYMBOL(xfrm_policy_walk_done);

/*
 * Check whether policy @pol applies to flow @fl.
 *
 * Returns -ESRCH when family, if_id, mark, type or the selector do not
 * match.  Otherwise returns the result of security_xfrm_policy_lookup()
 * for the policy's security context and the flow's secid (0 meaning the
 * policy matched).
 */
static int xfrm_policy_match(const struct xfrm_policy *pol,
                             const struct flowi *fl,
                             u8 type, u16 family, u32 if_id)
{
        if (pol->family != family || pol->if_id != if_id)
                return -ESRCH;

        /* the flow mark, masked by the policy mask, must equal the policy value */
        if ((fl->flowi_mark & pol->mark.m) != pol->mark.v)
                return -ESRCH;

        if (pol->type != type)
                return -ESRCH;

        if (!xfrm_selector_match(&pol->selector, fl, family))
                return -ESRCH;

        return security_xfrm_policy_lookup(pol->security, fl->flowi_secid);
}

/* Search the rb-tree rooted at @r for the node whose address/prefix
 * covers @addr, as decided by xfrm_policy_addr_delta() returning 0.
 *
 * Tree pointers are read with rcu_dereference_raw().  A hit is returned
 * immediately; on a miss the search is repeated if @count changed while
 * the tree was being descended.  Returns NULL when no node matches.
 */
static struct xfrm_pol_inexact_node *
xfrm_policy_lookup_inexact_addr(const struct rb_root *r,
                                seqcount_spinlock_t *count,
                                const xfrm_address_t *addr, u16 family)
{
        const struct rb_node *parent;
        int seq;

again:
        seq = read_seqcount_begin(count);

        parent = rcu_dereference_raw(r->rb_node);
        while (parent) {
                struct xfrm_pol_inexact_node *node;
                int delta;

                node = rb_entry(parent, struct xfrm_pol_inexact_node, node);

                delta = xfrm_policy_addr_delta(addr, &node->addr,
                                               node->prefixlen, family);
                /* negative: descend left, positive: descend right, zero: found */
                if (delta < 0) {
                        parent = rcu_dereference_raw(parent->rb_left);
                        continue;
                } else if (delta > 0) {
                        parent = rcu_dereference_raw(parent->rb_right);
                        continue;
                }

                return node;
        }

        /* only a miss is re-validated against the sequence counter */
        if (read_seqcount_retry(count, seq))
                goto again;

        return NULL;
}

/* Fill @cand with the candidate policy lists of bin @b for the address
 * pair (@saddr, @daddr).
 *
 * @cand is zeroed first.  The ANY slot always points at the bin's own
 * list.  The DADDR slot is set when @daddr is found in b->root_d, and the
 * BOTH slot when @saddr is additionally found in that node's sub-tree.
 * The SADDR slot is set when @saddr is found in b->root_s.
 *
 * Returns false (leaving @cand untouched) when @b is NULL, true otherwise.
 */
static bool
xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
                                    struct xfrm_pol_inexact_bin *b,
                                    const xfrm_address_t *saddr,
                                    const xfrm_address_t *daddr)
{
        struct xfrm_pol_inexact_node *by_dst, *by_both, *by_src;
        u16 af;

        if (!b)
                return false;

        af = b->k.family;
        memset(cand, 0, sizeof(*cand));
        cand->res[XFRM_POL_CAND_ANY] = &b->hhead;

        by_dst = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr,
                                                 af);
        if (by_dst) {
                cand->res[XFRM_POL_CAND_DADDR] = &by_dst->hhead;
                by_both = xfrm_policy_lookup_inexact_addr(&by_dst->root,
                                                          &b->count, saddr,
                                                          af);
                if (by_both)
                        cand->res[XFRM_POL_CAND_BOTH] = &by_both->hhead;
        }

        by_src = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr,
                                                 af);
        if (by_src)
                cand->res[XFRM_POL_CAND_SADDR] = &by_src->hhead;

        return true;
}

/* Look up the inexact bin for (@net, @type, @family, @dir, @if_id) in
 * xfrm_policy_inexact_table via rhashtable_lookup().
 *
 * Returns whatever rhashtable_lookup() yields for the constructed key.
 */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family,
                               u8 dir, u32 if_id)
{
        struct xfrm_pol_inexact_key key = {
                .if_id = if_id,
                .dir = dir,
                .type = type,
                .family = family,
        };

        write_pnet(&key.net, net);

        return rhashtable_lookup(&xfrm_policy_inexact_table, &key,
                                 xfrm_pol_inexact_params);
}

/* Locked-context wrapper around xfrm_policy_inexact_lookup_rcu().
 *
 * Asserts (lockdep) that the caller holds net->xfrm.xfrm_policy_lock and
 * wraps the lookup in an RCU read-side section.  Returns the lookup result.
 */
static struct xfrm_pol_inexact_bin *
xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family,
                           u8 dir, u32 if_id)
{
        struct xfrm_pol_inexact_bin *found;

        lockdep_assert_held(&net->xfrm.xfrm_policy_lock);

        rcu_read_lock();
        found = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
        rcu_read_unlock();

        return found;
}

/* Evaluate one candidate list against flow @fl.
 *
 * Entries are visited in list order until one has a priority value above
 * the limit (@prefer's priority, or ~0u without @prefer).  The first entry
 * that matches decides the outcome: @prefer is returned instead when both
 * have the same priority and @prefer has the lower ->pos; otherwise the
 * entry itself is returned.
 *
 * Returns NULL when @chain is NULL or nothing matches, and ERR_PTR(err)
 * when xfrm_policy_match() fails with anything other than -ESRCH.
 */
static struct xfrm_policy *
__xfrm_policy_eval_candidates(struct hlist_head *chain,
                              struct xfrm_policy *prefer,
                              const struct flowi *fl,
                              u8 type, u16 family, u32 if_id)
{
        struct xfrm_policy *cur;
        u32 limit = ~0u;

        if (prefer)
                limit = prefer->priority;

        if (!chain)
                return NULL;

        hlist_for_each_entry_rcu(cur, chain, bydst) {
                int rc;

                if (cur->priority > limit)
                        break;

                rc = xfrm_policy_match(cur, fl, type, family, if_id);
                if (rc == -ESRCH)
                        continue;
                if (rc)
                        return ERR_PTR(rc);

                /* matches.  Is it older than *prefer? */
                if (prefer && cur->priority == limit &&
                    prefer->pos < cur->pos)
                        return prefer;

                return cur;
        }

        return NULL;
}

/* Run __xfrm_policy_eval_candidates() over every list in @cand->res,
 * feeding each non-NULL result back in as the preferred policy for the
 * next list.
 *
 * Returns the final preferred policy (possibly the @prefer passed in, or
 * NULL), or the first ERR_PTR produced by a list evaluation.
 */
static struct xfrm_policy *
xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand,
                            struct xfrm_policy *prefer,
                            const struct flowi *fl,
                            u8 type, u16 family, u32 if_id)
{
        struct xfrm_policy *best;
        int slot;

        for (slot = 0; slot < ARRAY_SIZE(cand->res); slot++) {
                best = __xfrm_policy_eval_candidates(cand->res[slot],
                                                     prefer,
                                                     fl, type, family, if_id);
                if (IS_ERR(best))
                        return best;
                if (best)
                        prefer = best;
        }

        return prefer;
}

/* Find the policy of @type that applies to flow @fl in direction @dir.
 *
 * The direct hash chain for the flow's addresses is searched first; unless
 * that already produced a packet-offload policy, the inexact candidate
 * lists are evaluated as well, with the direct hit as the preferred entry.
 * The whole lookup runs under rcu_read_lock() and is repeated when
 * xfrm_policy_hash_generation changed meanwhile, or when a reference on
 * the result could not be obtained.
 *
 * Returns the policy with a reference taken via xfrm_pol_hold_rcu(), NULL
 * when nothing matches or the flow has no source/destination address, or
 * an ERR_PTR when matching failed with an error other than -ESRCH.
 */
static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
                                                     const struct flowi *fl,
                                                     u16 family, u8 dir,
                                                     u32 if_id)
{
        struct xfrm_pol_inexact_candidates cand;
        const xfrm_address_t *daddr, *saddr;
        struct xfrm_pol_inexact_bin *bin;
        struct xfrm_policy *pol, *ret;
        struct hlist_head *chain;
        unsigned int sequence;
        int err;

        daddr = xfrm_flowi_daddr(fl, family);
        saddr = xfrm_flowi_saddr(fl, family);
        if (unlikely(!daddr || !saddr))
                return NULL;

        rcu_read_lock();
 retry:
        /* pick the hash chain under a stable hash generation */
        do {
                sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);
                chain = policy_hash_direct(net, daddr, saddr, family, dir);
        } while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence));

        ret = NULL;
        hlist_for_each_entry_rcu(pol, chain, bydst) {
                err = xfrm_policy_match(pol, fl, type, family, if_id);
                if (err) {
                        /* -ESRCH means "does not match"; anything else aborts */
                        if (err == -ESRCH)
                                continue;
                        else {
                                ret = ERR_PTR(err);
                                goto fail;
                        }
                } else {
                        ret = pol;
                        break;
                }
        }
        /* a packet-offload hit is final: the inexact lists are not consulted */
        if (ret && ret->xdo.type == XFRM_DEV_OFFLOAD_PACKET)
                goto skip_inexact;

        bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id);
        if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr,
                                                         daddr))
                goto skip_inexact;

        pol = xfrm_policy_eval_candidates(&cand, ret, fl, type,
                                          family, if_id);
        if (pol) {
                ret = pol;
                if (IS_ERR(pol))
                        goto fail;
        }

skip_inexact:
        /* the hash generation changed during the lookup: start over */
        if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence))
                goto retry;

        /* could not take a reference on the result: start over */
        if (ret && !xfrm_pol_hold_rcu(ret))
                goto retry;
fail:
        rcu_read_unlock();

        return ret;
}

/* Look up the policy for flow @fl.
 *
 * With CONFIG_XFRM_SUB_POLICY, XFRM_POLICY_TYPE_SUB is tried first and any
 * non-NULL result (including an ERR_PTR) is returned as is.  Otherwise, or
 * when that lookup yields NULL, the XFRM_POLICY_TYPE_MAIN result is
 * returned.
 */
static struct xfrm_policy *xfrm_policy_lookup(struct net *net,
                                              const struct flowi *fl,
                                              u16 family, u8 dir, u32 if_id)
{
#ifdef CONFIG_XFRM_SUB_POLICY
        struct xfrm_policy *sub;

        sub = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family,
                                        dir, if_id);
        if (sub)
                return sub;
#endif
        return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family,
                                         dir, if_id);
}

/* Return the per-socket policy sk->sk_policy[dir] if it applies to @fl.
 *
 * Runs under rcu_read_lock().  Returns the policy with a reference taken
 * via xfrm_pol_hold_rcu(); NULL when the socket has no policy for @dir or
 * when family, selector, mark or if_id do not match, or when the security
 * lookup reports -ESRCH; ERR_PTR(err) for any other security lookup error.
 * If the reference cannot be taken, the socket's policy pointer is re-read
 * and the checks are repeated.
 */
static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
                                                 const struct flowi *fl,
                                                 u16 family, u32 if_id)
{
        struct xfrm_policy *pol;

        rcu_read_lock();
 again:
        pol = rcu_dereference(sk->sk_policy[dir]);
        if (pol != NULL) {
                bool match;
                int err = 0;

                if (pol->family != family) {
                        pol = NULL;
                        goto out;
                }

                match = xfrm_selector_match(&pol->selector, fl, family);
                if (match) {
                        /* here the socket's mark is tested, not the flow's mark */
                        if ((READ_ONCE(sk->sk_mark) & pol->mark.m) != pol->mark.v ||
                            pol->if_id != if_id) {
                                pol = NULL;
                                goto out;
                        }
                        err = security_xfrm_policy_lookup(pol->security,
                                                      fl->flowi_secid);
                        if (!err) {
                                /* no reference obtained: re-read the pointer and retry */
                                if (!xfrm_pol_hold_rcu(pol))
                                        goto again;
                        } else if (err == -ESRCH) {
                                pol = NULL;
                        } else {
                                pol = ERR_PTR(err);
                        }
                } else
                        pol = NULL;
        }
out:
        rcu_read_unlock();
        return pol;
}

/* Add @pol to its namespace's policy_all list, bump policy_count[@dir]
 * and take a reference on the policy.
 */
static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
{
        struct net *ns = xp_net(pol);
        struct list_head *all_policies = &ns->xfrm.policy_all;

        list_add(&pol->walk.all, all_policies);
        ns->xfrm.policy_count[dir] += 1;
        xfrm_pol_hold(pol);
}

static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
                                                int dir)
{
        struct net *net = xp_net(pol);

        if (list_empty(&pol->walk.all))
                return NULL;

        /* Socket policies are not hashed. */
        if (!hlist_unhashed(&pol->bydst)) {
                hlist_del_rcu(&pol->bydst);
                hlist_del_init(&pol->bydst_inexact_list);
                hlist_del(&pol->byidx);
        }

        list_del_init(&pol->walk.all);
        net->xfrm.policy_count[dir]--;

        return pol;
}

/* Link a socket policy: same as __xfrm_policy_link(), with the direction
 * offset by XFRM_POLICY_MAX.
 */
static void xfrm_sk_policy_link(struct xfrm_policy *pol, int dir)
{
        int sk_dir = XFRM_POLICY_MAX + dir;

        __xfrm_policy_link(pol, sk_dir);
}

/* Unlink a socket policy: same as __xfrm_policy_unlink(), with the
 * direction offset by XFRM_POLICY_MAX.  The result is not inspected.
 */
static void xfrm_sk_policy_unlink(struct xfrm_policy *pol, int dir)
{
        int sk_dir = XFRM_POLICY_MAX + dir;

        __xfrm_policy_unlink(pol, sk_dir);
}

/* Unlink @pol under the policy lock and, if it was linked, tear it down
 * with xfrm_dev_policy_delete() and xfrm_policy_kill() after the lock has
 * been released.
 *
 * Returns 0 on success, -ENOENT when the policy was not linked.
 */
int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
{
        struct net *ns = xp_net(pol);
        struct xfrm_policy *unlinked;

        spin_lock_bh(&ns->xfrm.xfrm_policy_lock);
        unlinked = __xfrm_policy_unlink(pol, dir);
        spin_unlock_bh(&ns->xfrm.xfrm_policy_lock);

        if (!unlinked)
                return -ENOENT;

        xfrm_dev_policy_delete(unlinked);
        xfrm_policy_kill(unlinked);
        return 0;
}
EXPORT_SYMBOL(xfrm_policy_delete);

/* Install @pol as the policy of socket @sk for direction @dir, replacing
 * and killing whatever policy was installed there before.  Passing a NULL
 * @pol only removes the current policy.
 *
 * Returns 0, or -EINVAL (only with CONFIG_XFRM_SUB_POLICY) when @pol is
 * not of type XFRM_POLICY_TYPE_MAIN.
 */
int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
{
        struct net *net = sock_net(sk);
        struct xfrm_policy *prev;

#ifdef CONFIG_XFRM_SUB_POLICY
        if (pol && pol->type != XFRM_POLICY_TYPE_MAIN)
                return -EINVAL;
#endif

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);

        prev = rcu_dereference_protected(sk->sk_policy[dir],
                                lockdep_is_held(&net->xfrm.xfrm_policy_lock));

        if (pol) {
                pol->curlft.add_time = ktime_get_real_seconds();
                pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX + dir, 0);
                xfrm_sk_policy_link(pol, dir);
        }

        /* The new pointer is published before the old policy is unlinked. */
        rcu_assign_pointer(sk->sk_policy[dir], pol);

        if (prev && pol)
                xfrm_policy_requeue(prev, pol);

        /* Unlinking succeeds always. This is the only function
         * allowed to delete or replace socket policy.
         */
        if (prev)
                xfrm_sk_policy_unlink(prev, dir);

        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        /* The old policy is killed only after the lock has been dropped. */
        if (prev)
                xfrm_policy_kill(prev);

        return 0;
}

/* Duplicate socket policy @old and link the copy as a socket policy for
 * direction @dir.
 *
 * Returns the new policy, or NULL when either the allocation or the
 * clone of the security context failed.
 */
static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
{
        struct net *net = xp_net(old);
        struct xfrm_policy *copy = xfrm_policy_alloc(net, GFP_ATOMIC);

        if (!copy)
                return NULL;

        copy->selector = old->selector;
        if (security_xfrm_policy_clone(old->security, &copy->security)) {
                kfree(copy);
                return NULL;  /* ENOMEM */
        }

        copy->lft = old->lft;
        copy->curlft = old->curlft;
        copy->mark = old->mark;
        copy->if_id = old->if_id;
        copy->action = old->action;
        copy->flags = old->flags;
        copy->xfrm_nr = old->xfrm_nr;
        copy->index = old->index;
        copy->type = old->type;
        copy->family = old->family;
        memcpy(copy->xfrm_vec, old->xfrm_vec,
               copy->xfrm_nr * sizeof(struct xfrm_tmpl));

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        xfrm_sk_policy_link(copy, dir);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        /* Linking took a reference of its own, so one reference is dropped
         * here (presumably the one the allocation returned with).
         */
        xfrm_pol_put(copy);

        return copy;
}

/* Give @sk its own copies of the policies installed on @osk.
 *
 * Both sk_policy[] slots of @osk are read under rcu_read_lock(); each
 * non-NULL policy is cloned and published in the same slot of @sk.
 *
 * Returns 0 on success or -ENOMEM as soon as one clone fails.  Slots that
 * were already filled before the failure are left in place.
 */
int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
{
        int err = 0;
        int dir;

        rcu_read_lock();
        for (dir = 0; dir < 2; dir++) {
                const struct xfrm_policy *src;
                struct xfrm_policy *dup;

                src = rcu_dereference(osk->sk_policy[dir]);
                if (!src)
                        continue;

                dup = clone_policy(src, dir);
                if (unlikely(!dup)) {
                        err = -ENOMEM;
                        break;
                }
                rcu_assign_pointer(sk->sk_policy[dir], dup);
        }
        rcu_read_unlock();

        return err;
}

/* Ask the per-family policy operations to choose a source address.
 *
 * Forwards @net, @oif, @local, @remote and @mark to the get_saddr hook of
 * the afinfo registered for @family.
 *
 * Returns the hook's result, or -EINVAL when no afinfo exists for @family.
 *
 * NOTE(review): the rcu_read_unlock() below has no matching lock in this
 * function; presumably xfrm_policy_get_afinfo() returns with the RCU read
 * lock held when it succeeds - confirm against its definition.
 */
static int
xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local,
               xfrm_address_t *remote, unsigned short family, u32 mark)
{
        const struct xfrm_policy_afinfo *ops = xfrm_policy_get_afinfo(family);
        int rc;

        if (unlikely(!ops))
                return -EINVAL;

        rc = ops->get_saddr(net, oif, local, remote, mark);
        rcu_read_unlock();

        return rc;
}

/* Resolve the list of templates of one policy into xfrm states for a flow.
 *
 * Walks @policy's xfrm_nr templates in order and looks up a state for each
 * one with xfrm_state_find().  States found in XFRM_STATE_VALID are stored
 * into @xfrm[] in template order.  A template that cannot be resolved is
 * skipped when it is marked optional and fails the whole call otherwise.
 *
 * Returns the number of states stored in @xfrm[] (possibly 0), or a
 * negative errno.  On failure every state already stored in @xfrm[] has
 * been released again.
 */

static int
xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
                      struct xfrm_state **xfrm, unsigned short family)
{
        struct net *net = xp_net(policy);
        int nx;
        int i, error;
        /* Endpoints used for the next lookup; they start as the flow's
         * addresses and follow the endpoints of each state that is accepted.
         */
        xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
        xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);
        xfrm_address_t tmp;

        for (nx = 0, i = 0; i < policy->xfrm_nr; i++) {
                struct xfrm_state *x;
                xfrm_address_t *remote = daddr;
                xfrm_address_t *local  = saddr;
                struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];

                /* Tunnel and BEET templates name their own endpoints. */
                if (tmpl->mode == XFRM_MODE_TUNNEL ||
                    tmpl->mode == XFRM_MODE_BEET) {
                        remote = &tmpl->id.daddr;
                        local = &tmpl->saddr;
                        /* If xfrm_addr_any() reports the template's source
                         * as unset, have one selected for this remote.
                         */
                        if (xfrm_addr_any(local, tmpl->encap_family)) {
                                error = xfrm_get_saddr(net, fl->flowi_oif,
                                                       &tmp, remote,
                                                       tmpl->encap_family, 0);
                                if (error)
                                        goto fail;
                                local = &tmp;
                        }
                }

                /* xfrm_state_find() reports its reason through &error when
                 * it does not return a state (see the -ESRCH check below).
                 */
                x = xfrm_state_find(remote, local, fl, tmpl, policy, &error,
                                    family, policy->if_id);
                /* A state that carries a direction must be an outbound one;
                 * otherwise the resolution fails even for optional templates.
                 */
                if (x && x->dir && x->dir != XFRM_SA_DIR_OUT) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEDIRERROR);
                        xfrm_state_put(x);
                        error = -EINVAL;
                        goto fail;
                }

                if (x && x->km.state == XFRM_STATE_VALID) {
                        xfrm[nx++] = x;
                        /* Later templates resolve against this state's
                         * endpoints.
                         */
                        daddr = remote;
                        saddr = local;
                        continue;
                }
                if (x) {
                        /* Found, but not usable: drop it and record why. */
                        error = (x->km.state == XFRM_STATE_ERROR ?
                                 -EINVAL : -EAGAIN);
                        xfrm_state_put(x);
                } else if (error == -ESRCH) {
                        error = -EAGAIN;
                }

                /* An optional template is simply skipped. */
                if (!tmpl->optional)
                        goto fail;
        }
        return nx;

fail:
        /* Release the states collected so far, last one first. */
        for (nx--; nx >= 0; nx--)
                xfrm_state_put(xfrm[nx]);
        return error;
}

/* Resolve the templates of all @npols policies in @pols into states.
 *
 * With a single policy the states are written straight into @xfrm.  With
 * more than one they are first gathered in a local scratch array and then
 * handed to xfrm_state_sort(), which fills @xfrm.
 *
 * Returns the total number of states, -ENOBUFS when the running total
 * plus a policy's template count reaches XFRM_MAX_DEPTH, or the negative
 * errno from xfrm_tmpl_resolve_one().  On failure the states gathered so
 * far have been released.
 */
static int
xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
                  struct xfrm_state **xfrm, unsigned short family)
{
        struct xfrm_state *scratch[XFRM_MAX_DEPTH];
        struct xfrm_state **out;
        int total = 0;
        int err;
        int idx;

        if (npols > 1)
                out = scratch;
        else
                out = xfrm;

        for (idx = 0; idx < npols; idx++) {
                int got;

                if (total + pols[idx]->xfrm_nr >= XFRM_MAX_DEPTH) {
                        err = -ENOBUFS;
                        goto release;
                }

                got = xfrm_tmpl_resolve_one(pols[idx], fl, &out[total], family);
                if (got < 0) {
                        err = got;
                        goto release;
                }
                total += got;
        }

        /* found states are sorted for outbound processing */
        if (npols > 1)
                xfrm_state_sort(xfrm, out, total, family);

        return total;

release:
        /* Drop everything gathered by the policies that did resolve. */
        while (--total >= 0)
                xfrm_state_put(out[total]);
        return err;
}

/* Return the flow's IPv4 TOS masked with IPTOS_RT_MASK; every family
 * other than AF_INET yields 0.
 */
static int xfrm_get_tos(const struct flowi *fl, int family)
{
        return family == AF_INET ? (IPTOS_RT_MASK & fl->u.ip4.flowi4_tos) : 0;
}

/* Allocate an xfrm_dst from the per-namespace dst_ops of @family and zero
 * everything in it that follows the embedded u.dst member.
 *
 * Returns the new entry, ERR_PTR(-EINVAL) when no policy afinfo exists
 * for @family, or ERR_PTR(-ENOBUFS) when dst_alloc() fails.  A family
 * other than AF_INET (or AF_INET6 when IPv6 is enabled) that nevertheless
 * has an afinfo hits BUG().
 *
 * NOTE(review): the afinfo is only used as an existence check here; the
 * rcu_read_unlock() presumably pairs with a lock taken inside
 * xfrm_policy_get_afinfo() - confirm against its definition.
 */
static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
{
        const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
        struct dst_ops *ops;
        struct xfrm_dst *xdst;

        if (!afinfo)
                return ERR_PTR(-EINVAL);

        if (family == AF_INET)
                ops = &net->xfrm.xfrm4_dst_ops;
#if IS_ENABLED(CONFIG_IPV6)
        else if (family == AF_INET6)
                ops = &net->xfrm.xfrm6_dst_ops;
#endif
        else
                BUG();

        xdst = dst_alloc(ops, NULL, DST_OBSOLETE_NONE, 0);
        if (unlikely(!xdst)) {
                xdst = ERR_PTR(-ENOBUFS);
        } else {
                memset_after(xdst, 0, u.dst);
        }

        rcu_read_unlock();

        return xdst;
}

/* Record IPv6-specific path data in @path for route @dst: the route's
 * cookie and the given @nfheader_len.  Routes of any other family leave
 * @path untouched.
 */
static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
                           int nfheader_len)
{
        if (dst->ops->family != AF_INET6)
                return;

        path->path_cookie = rt6_get_cookie(dst_rt6_info(dst));
        path->u.rt6.rt6i_nfheader_len = nfheader_len;
}

/* Run the fill_dst hook of the policy afinfo that matches @xdst's own
 * address family, passing @xdst, @dev and @fl through.
 *
 * Returns the hook's result, or -EINVAL when that family has no afinfo.
 *
 * NOTE(review): the rcu_read_unlock() presumably pairs with a lock taken
 * inside xfrm_policy_get_afinfo() on success - confirm.
 */
static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
                                const struct flowi *fl)
{
        const struct xfrm_policy_afinfo *ops;
        int rc;

        ops = xfrm_policy_get_afinfo(xdst->u.dst.ops->family);
        if (!ops)
                return -EINVAL;

        rc = ops->fill_dst(xdst, dev, fl);
        rcu_read_unlock();

        return rc;
}


/* Allocate a chain of dst entries, attach the known xfrm states and
 * calculate all the metrics; in short, build a bundle.
 *
 * One xfrm_dst is allocated per state in @xfrm, recorded in @bundle[] and
 * made the child of the previously allocated one.  The route that ends
 * the chain becomes the child of the last entry and the path of the first.
 * Afterwards every entry gets header/trailer lengths that cover itself and
 * all entries after it.
 *
 * @policy: supplies the network namespace and the initial address family
 * @xfrm:   the @nx states to attach, one per bundle entry
 * @bundle: receives the xfrm_dst allocated for each state
 * @nx:     number of states; the caller visible in this file only passes
 *          values greater than zero
 * @fl:     the flow the bundle is built for
 * @dst:    starting route; a reference is taken on it on entry
 *
 * Returns the dst of the first bundle entry, or an ERR_PTR() on failure.
 */

static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
                                            struct xfrm_state **xfrm,
                                            struct xfrm_dst **bundle,
                                            int nx,
                                            const struct flowi *fl,
                                            struct dst_entry *dst)
{
        const struct xfrm_state_afinfo *afinfo;
        const struct xfrm_mode *inner_mode;
        struct net *net = xp_net(policy);
        unsigned long now = jiffies;
        struct net_device *dev;
        struct xfrm_dst *xdst_prev = NULL;
        struct xfrm_dst *xdst0 = NULL;
        int i = 0;
        int err;
        int header_len = 0;
        int nfheader_len = 0;
        int trailer_len = 0;
        int tos;
        int family = policy->selector.family;
        xfrm_address_t saddr, daddr;

        xfrm_flowi_addr_get(fl, &saddr, &daddr, family);

        tos = xfrm_get_tos(fl, family);

        dst_hold(dst);

        for (; i < nx; i++) {
                struct xfrm_dst *xdst = xfrm_alloc_dst(net, family);
                struct dst_entry *dst1 = &xdst->u.dst;

                err = PTR_ERR(xdst);
                if (IS_ERR(xdst)) {
                        /* dst has not been stored in any xdst->route at this
                         * point, so its reference is released explicitly.
                         */
                        dst_release(dst);
                        goto put_states;
                }

                bundle[i] = xdst;
                if (!xdst_prev)
                        xdst0 = xdst;
                else
                        /* Ref count is taken during xfrm_alloc_dst()
                         * No need to do dst_clone() on dst1
                         */
                        xfrm_dst_set_child(xdst_prev, &xdst->u.dst);

                /* A state whose selector has no family of its own gets its
                 * inner mode derived from the current family.
                 */
                if (xfrm[i]->sel.family == AF_UNSPEC) {
                        inner_mode = xfrm_ip2inner_mode(xfrm[i],
                                                        xfrm_af2proto(family));
                        if (!inner_mode) {
                                err = -EAFNOSUPPORT;
                                dst_release(dst);
                                goto put_states;
                        }
                } else
                        inner_mode = &xfrm[i]->inner_mode;

                /* From here on the current dst is referenced by this entry. */
                xdst->route = dst;
                dst_copy_metrics(dst1, dst);

                if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
                        __u32 mark = 0;
                        int oif;

                        if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m)
                                mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);

                        /* The state's own family is used for the lookup and
                         * for all following entries, except for states with
                         * packet offload.
                         */
                        if (xfrm[i]->xso.type != XFRM_DEV_OFFLOAD_PACKET)
                                family = xfrm[i]->props.family;

                        oif = fl->flowi_oif ? : fl->flowi_l3mdev;
                        /* Non-transport states get a route of their own; dst
                         * refers to that new route from now on.
                         */
                        dst = xfrm_dst_lookup(xfrm[i], tos, oif,
                                              &saddr, &daddr, family, mark);
                        err = PTR_ERR(dst);
                        if (IS_ERR(dst))
                                goto put_states;
                } else
                        /* Transport mode keeps the same route; take another
                         * reference for its next user.
                         */
                        dst_hold(dst);

                dst1->xfrm = xfrm[i];
                xdst->xfrm_genid = xfrm[i]->genid;

                dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
                dst1->lastuse = now;

                dst1->input = dst_discard;

                /* Output handler comes from the state afinfo of the inner
                 * mode's family; without one, packets are discarded.
                 */
                rcu_read_lock();
                afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
                if (likely(afinfo))
                        dst1->output = afinfo->output;
                else
                        dst1->output = dst_discard_out;
                rcu_read_unlock();

                xdst_prev = xdst;

                header_len += xfrm[i]->props.header_len;
                if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT)
                        nfheader_len += xfrm[i]->props.header_len;
                trailer_len += xfrm[i]->props.trailer_len;
        }

        /* Terminate the chain with the final route. */
        xfrm_dst_set_child(xdst_prev, dst);
        xdst0->path = dst;

        err = -ENODEV;
        dev = dst->dev;
        if (!dev)
                goto free_dst;

        xfrm_init_path(xdst0, dst, nfheader_len);
        xfrm_init_pmtu(bundle, nx);

        /* Walk the chain from the first entry up to the final route.
         * header_len/trailer_len start as the totals over all states and
         * shrink by each entry's own contribution along the way.
         */
        for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst;
             xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) {
                err = xfrm_fill_dst(xdst_prev, dev, fl);
                if (err)
                        goto free_dst;

                xdst_prev->u.dst.header_len = header_len;
                xdst_prev->u.dst.trailer_len = trailer_len;
                header_len -= xdst_prev->u.dst.xfrm->props.header_len;
                trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len;
        }

        return &xdst0->u.dst;

put_states:
        /* Release the states from index i onwards, i.e. those not yet
         * stored in a dst1->xfrm.  Earlier ones are presumably released
         * together with their dst entry - confirm in the dst destroy path.
         */
        for (; i < nx; i++)
                xfrm_state_put(xfrm[i]);
free_dst:
        if (xdst0)
                dst_release_immediate(&xdst0->u.dst);

        return ERR_PTR(err);
}

/* Validate the policy in pols[0] and work out how many policies and
 * transformations apply to the flow.
 *
 * On entry *@num_pols says whether pols[0] was filled in.  On return:
 *  - *@num_pols is the number of valid entries in @pols (0 when there is
 *    no policy or an error occurred);
 *  - *@num_xfrms is the summed template count of those policies, or -1
 *    when any of them has an action other than XFRM_POLICY_ALLOW.
 *
 * With CONFIG_XFRM_SUB_POLICY, an allowing non-main pols[0] triggers an
 * additional lookup of the main-type policy, which is stored in pols[1].
 *
 * Returns 0, or the error encoded in pols[0] / pols[1].  When the second
 * lookup fails, the policies counted so far are put before returning.
 */
static int xfrm_expand_policies(const struct flowi *fl, u16 family,
                                struct xfrm_policy **pols,
                                int *num_pols, int *num_xfrms)
{
        int idx;

        if (*num_pols == 0 || !pols[0]) {
                *num_pols = 0;
                *num_xfrms = 0;
                return 0;
        }

        if (IS_ERR(pols[0])) {
                *num_pols = 0;
                return PTR_ERR(pols[0]);
        }

        *num_xfrms = pols[0]->xfrm_nr;

#ifdef CONFIG_XFRM_SUB_POLICY
        if (pols[0]->action == XFRM_POLICY_ALLOW &&
            pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
                struct xfrm_policy *main_pol;

                main_pol = xfrm_policy_lookup_bytype(xp_net(pols[0]),
                                                     XFRM_POLICY_TYPE_MAIN,
                                                     fl, family,
                                                     XFRM_POLICY_OUT,
                                                     pols[0]->if_id);
                /* The slot is written whatever the lookup produced. */
                pols[1] = main_pol;

                if (IS_ERR(main_pol)) {
                        xfrm_pols_put(pols, *num_pols);
                        *num_pols = 0;
                        return PTR_ERR(main_pol);
                }
                if (main_pol) {
                        (*num_pols)++;
                        *num_xfrms += main_pol->xfrm_nr;
                }
        }
#endif

        /* A single non-allowing policy marks the whole set as blocking. */
        for (idx = 0; idx < *num_pols; idx++) {
                if (pols[idx]->action == XFRM_POLICY_ALLOW)
                        continue;

                *num_xfrms = -1;
                break;
        }

        return 0;
}

/* Resolve the templates of @pols into states and build a bundle on top of
 * @dst_orig.
 *
 * Returns:
 *  - the new bundle, with its policy array, policy/state counts and
 *    policy genid filled in;
 *  - NULL when template resolution yielded no state at all;
 *  - an ERR_PTR() when resolution or bundle creation failed.  Resolution
 *    errors other than -EAGAIN bump XFRMOUTPOLERROR, bundle creation
 *    errors bump XFRMOUTBUNDLEGENERROR.
 */
static struct xfrm_dst *
xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
                               const struct flowi *fl, u16 family,
                               struct dst_entry *dst_orig)
{
        struct net *net = xp_net(pols[0]);
        struct xfrm_state *states[XFRM_MAX_DEPTH];
        struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
        struct xfrm_dst *xdst;
        struct dst_entry *dst;
        int nx;

        /* Try to instantiate a bundle */
        nx = xfrm_tmpl_resolve(pols, num_pols, fl, states, family);
        if (nx == 0)
                return NULL;
        if (nx < 0) {
                if (nx != -EAGAIN)
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
                return ERR_PTR(nx);
        }

        dst = xfrm_bundle_create(pols[0], states, bundle, nx, fl, dst_orig);
        if (IS_ERR(dst)) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
                return ERR_CAST(dst);
        }

        xdst = (struct xfrm_dst *)dst;
        xdst->num_xfrms = nx;
        xdst->num_pols = num_pols;
        memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
        xdst->policy_genid = atomic_read(&pols[0]->genid);

        return xdst;
}

/* Timer callback for a policy's hold queue.
 *
 * Retries the lookup for the first queued packet.  If the result is still
 * a queueing bundle (DST_XFRM_QUEUE), the timer is re-armed with a doubled
 * timeout, or the queue is purged once the timeout has reached
 * XFRM_QUEUE_TMO_MAX.  If the lookup fails, the queue is purged as well.
 * Otherwise every queued packet gets a lookup of its own and is sent out
 * through dst_output(); packets whose lookup fails are freed.
 *
 * One policy reference is dropped on every exit path; elsewhere in this
 * file a reference is taken whenever mod_timer() reports that the timer
 * was not already pending.
 */
static void xfrm_policy_queue_process(struct timer_list *t)
{
        struct sk_buff *skb;
        struct sock *sk;
        struct dst_entry *dst;
        struct xfrm_policy *pol = from_timer(pol, t, polq.hold_timer);
        struct net *net = xp_net(pol);
        struct xfrm_policy_queue *pq = &pol->polq;
        struct flowi fl;
        struct sk_buff_head list;
        __u32 skb_mark;

        /* Peek at the head packet under the queue lock; it is left on the
         * queue and only used to probe whether a lookup succeeds now.
         */
        spin_lock(&pq->hold_queue.lock);
        skb = skb_peek(&pq->hold_queue);
        if (!skb) {
                spin_unlock(&pq->hold_queue.lock);
                goto out;
        }
        dst = skb_dst(skb);
        sk = skb->sk;

        /* Fixup the mark to support VTI. */
        skb_mark = skb->mark;
        skb->mark = pol->mark.v;
        xfrm_decode_session(net, skb, &fl, dst->ops->family);
        skb->mark = skb_mark;
        spin_unlock(&pq->hold_queue.lock);

        /* Take a reference on the path before handing it to xfrm_lookup()
         * as dst_orig, which releases dst_orig on several of its paths.
         */
        dst_hold(xfrm_dst_path(dst));
        dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, XFRM_LOOKUP_QUEUE);
        if (IS_ERR(dst))
                goto purge_queue;

        /* Still only a queueing bundle: back off and try again later. */
        if (dst->flags & DST_XFRM_QUEUE) {
                dst_release(dst);

                if (pq->timeout >= XFRM_QUEUE_TMO_MAX)
                        goto purge_queue;

                pq->timeout = pq->timeout << 1;
                /* Take a reference when the timer was not already pending;
                 * the put at "out" below still runs.
                 */
                if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout))
                        xfrm_pol_hold(pol);
                goto out;
        }

        /* The probe result itself is not used for transmission. */
        dst_release(dst);

        __skb_queue_head_init(&list);

        /* Move the whole hold queue onto a private list so the packets can
         * be processed without holding the queue lock.
         */
        spin_lock(&pq->hold_queue.lock);
        pq->timeout = 0;
        skb_queue_splice_init(&pq->hold_queue, &list);
        spin_unlock(&pq->hold_queue.lock);

        while (!skb_queue_empty(&list)) {
                skb = __skb_dequeue(&list);

                /* Fixup the mark to support VTI. */
                skb_mark = skb->mark;
                skb->mark = pol->mark.v;
                xfrm_decode_session(net, skb, &fl, skb_dst(skb)->ops->family);
                skb->mark = skb_mark;

                dst_hold(xfrm_dst_path(skb_dst(skb)));
                dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0);
                if (IS_ERR(dst)) {
                        kfree_skb(skb);
                        continue;
                }

                /* Replace the packet's queueing dst with the real one and
                 * send it.
                 */
                nf_reset_ct(skb);
                skb_dst_drop(skb);
                skb_dst_set(skb, dst);

                dst_output(net, skb->sk, skb);
        }

out:
        xfrm_pol_put(pol);
        return;

purge_queue:
        /* Give up on everything still queued. */
        pq->timeout = 0;
        skb_queue_purge(&pq->hold_queue);
        xfrm_pol_put(pol);
}

/* Output handler installed on queueing dummy bundles: parks @skb on the
 * hold queue of the bundle's first policy and makes sure the hold timer
 * is armed.
 *
 * Returns 0 when the packet was queued, or freed because
 * skb_fclone_busy() reported it busy; returns -EAGAIN when the queue
 * length exceeds XFRM_MAX_QUEUE_LEN and the packet was dropped.
 */
static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb);
        struct xfrm_policy *pol = xdst->pols[0];
        struct xfrm_policy_queue *pq = &pol->polq;
        unsigned long when;

        if (unlikely(skb_fclone_busy(sk, skb))) {
                kfree_skb(skb);
                return 0;
        }

        /* The queue length is checked here without the queue lock. */
        if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) {
                kfree_skb(skb);
                return -EAGAIN;
        }

        skb_dst_force(skb);

        spin_lock_bh(&pq->hold_queue.lock);

        if (pq->timeout == 0)
                pq->timeout = XFRM_QUEUE_TMO_MIN;

        when = jiffies + pq->timeout;

        /* A timer that was pending keeps its expiry if that is earlier; the
         * policy reference is dropped here and a new one is taken below
         * when mod_timer() reports the timer was not already pending.
         */
        if (del_timer(&pq->hold_timer)) {
                unsigned long pending = pq->hold_timer.expires;

                if (time_before(pending, when))
                        when = pending;
                xfrm_pol_put(pol);
        }

        __skb_queue_tail(&pq->hold_queue, skb);
        if (!mod_timer(&pq->hold_timer, when))
                xfrm_pol_hold(pol);

        spin_unlock_bh(&pq->hold_queue.lock);

        return 0;
}

/* Create a bundle that carries policies but no transformations.
 *
 * The plain variant is just a freshly allocated xfrm_dst.  It is returned
 * as is unless all of these hold: XFRM_LOOKUP_QUEUE is set in @xflo,
 * sysctl_larval_drop is off and @num_xfrms is positive.  In that case the
 * entry is turned into a queueing bundle on top of xflo->dst_orig: it is
 * flagged DST_XFRM_QUEUE and its output handler is xdst_queue_output().
 *
 * Returns the bundle or an ERR_PTR() (-ENODEV when the original route has
 * no device, otherwise the error from allocation or xfrm_fill_dst()).
 */
static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net,
                                                 struct xfrm_flo *xflo,
                                                 const struct flowi *fl,
                                                 int num_xfrms,
                                                 u16 family)
{
        struct dst_entry *orig;
        struct dst_entry *self;
        struct net_device *dev;
        struct xfrm_dst *xdst;
        int err;

        xdst = xfrm_alloc_dst(net, family);
        if (IS_ERR(xdst))
                return xdst;

        if (!(xflo->flags & XFRM_LOOKUP_QUEUE) ||
            net->xfrm.sysctl_larval_drop ||
            num_xfrms <= 0)
                return xdst;

        orig = xflo->dst_orig;
        self = &xdst->u.dst;

        /* First reference on the original route: held through ->route. */
        dst_hold(orig);
        xdst->route = orig;

        dst_copy_metrics(self, orig);

        self->obsolete = DST_OBSOLETE_FORCE_CHK;
        self->flags |= DST_XFRM_QUEUE;
        self->lastuse = jiffies;

        self->input = dst_discard;
        self->output = xdst_queue_output;

        /* Second reference: the same route is also the child and path. */
        dst_hold(orig);
        xfrm_dst_set_child(xdst, orig);
        xdst->path = orig;

        xfrm_init_path((struct xfrm_dst *)self, orig, 0);

        err = -ENODEV;
        dev = orig->dev;
        if (!dev)
                goto free_dst;

        err = xfrm_fill_dst(xdst, dev, fl);
        if (err)
                goto free_dst;

        return xdst;

free_dst:
        dst_release(self);
        return ERR_PTR(err);
}

/* Look up the policies for a flow and produce a bundle for them.
 *
 * Returns:
 *  - NULL when no policy matches, or when bundle creation reported
 *    -EREMOTE (the policies are put in that case);
 *  - a real bundle when the templates could be resolved;
 *  - a dummy bundle when the policies block, carry no transformations, or
 *    the templates could not be resolved yet (-EAGAIN);
 *  - an ERR_PTR() otherwise.  Errors from policy expansion bump
 *    XFRMOUTPOLERROR.
 */
static struct xfrm_dst *xfrm_bundle_lookup(struct net *net,
                                           const struct flowi *fl,
                                           u16 family, u8 dir,
                                           struct xfrm_flo *xflo, u32 if_id)
{
        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
        int num_pols = 1, num_xfrms = 0;
        struct xfrm_dst *xdst;
        int err;

        /* Resolve policies to use if we couldn't get them from
         * previous cache entry */
        pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id);
        err = xfrm_expand_policies(fl, family, pols,
                                   &num_pols, &num_xfrms);
        if (err < 0) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
                goto put_pols;
        }
        if (num_pols == 0)
                return NULL;

        if (num_xfrms > 0) {
                xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl,
                                                      family, xflo->dst_orig);
                if (xdst && !IS_ERR(xdst))
                        return xdst;

                if (!xdst) {
                        /* Nothing resolved: fall back to a dummy bundle
                         * without transformations.
                         */
                        num_xfrms = 0;
                } else {
                        err = PTR_ERR(xdst);
                        if (err == -EREMOTE) {
                                xfrm_pols_put(pols, num_pols);
                                return NULL;
                        }
                        if (err != -EAGAIN)
                                goto put_pols;
                }
        }

        /* We found policies, but there's no bundles to instantiate:
         * either because the policy blocks, has no transformations or
         * we could not build template (no xfrm_states).*/
        xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family);
        if (IS_ERR(xdst)) {
                xfrm_pols_put(pols, num_pols);
                return ERR_CAST(xdst);
        }
        xdst->num_pols = num_pols;
        xdst->num_xfrms = num_xfrms;
        memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);

        return xdst;

put_pols:
        xfrm_pols_put(pols, num_pols);
        return ERR_PTR(err);
}

/* Ask the policy afinfo of @family for a blackhole route derived from
 * @dst_orig.
 *
 * Returns whatever the blackhole_route hook returns.  When @family has no
 * afinfo, @dst_orig is released and ERR_PTR(-EINVAL) is returned.
 *
 * NOTE(review): the rcu_read_unlock() presumably pairs with a lock taken
 * inside xfrm_policy_get_afinfo() on success - confirm.
 */
static struct dst_entry *make_blackhole(struct net *net, u16 family,
                                        struct dst_entry *dst_orig)
{
        const struct xfrm_policy_afinfo *ops = xfrm_policy_get_afinfo(family);
        struct dst_entry *blackhole;

        if (!ops) {
                dst_release(dst_orig);
                return ERR_PTR(-EINVAL);
        }

        blackhole = ops->blackhole_route(net, dst_orig);
        rcu_read_unlock();

        return blackhole;
}

/* Finds/creates a bundle for given flow and if_id
 *
 * At the moment we eat a raw IP route. Mostly to speed up lookups
 * on interfaces with disabled IPsec.
 *
 * xfrm_lookup uses an if_id of 0 by default, and is provided for
 * compatibility
 *
 * Lookup order: when @sk has an outbound socket policy it is tried first;
 * if that yields no bundle, the namespace-wide policies are consulted via
 * xfrm_bundle_lookup().
 *
 * Returns:
 *  - the bundle's dst when the flow is to be transformed; the reference on
 *    @dst_orig is released in that case;
 *  - @dst_orig itself when no policy applies or the policy requires no
 *    transformation;
 *  - an ERR_PTR(): -EPERM when a policy blocks the flow or no policy
 *    matches and the namespace default for outbound is to block (routes
 *    over a loopback device are exempt from the default); -ENOENT for
 *    XFRM_LOOKUP_ICMP lookups without a policy carrying XFRM_POLICY_ICMP;
 *    -EREMOTE or -EAGAIN when the states for the policy do not exist yet
 *    (depending on sysctl_larval_drop); or an error from the helpers.
 *
 * On error the reference on @dst_orig is released unless
 * XFRM_LOOKUP_KEEP_DST_REF is set in @flags.
 */
struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
                                        struct dst_entry *dst_orig,
                                        const struct flowi *fl,
                                        const struct sock *sk,
                                        int flags, u32 if_id)
{
        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
        struct xfrm_dst *xdst;
        struct dst_entry *dst, *route;
        u16 family = dst_orig->ops->family;
        u8 dir = XFRM_POLICY_OUT;
        /* drop_pols: number of entries in pols[] this function has to put
         * itself.  It is only set on the socket-policy paths that end
         * without a bundle.
         */
        int i, err, num_pols, num_xfrms = 0, drop_pols = 0;

        dst = NULL;
        xdst = NULL;
        route = NULL;

        /* Step 1: the socket's own outbound policy, if there is one. */
        sk = sk_const_to_full_sk(sk);
        if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
                num_pols = 1;
                pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family,
                                                if_id);
                err = xfrm_expand_policies(fl, family, pols,
                                           &num_pols, &num_xfrms);
                if (err < 0)
                        goto dropdst;

                if (num_pols) {
                        /* Blocking policy or no transformations at all. */
                        if (num_xfrms <= 0) {
                                drop_pols = num_pols;
                                goto no_transform;
                        }

                        xdst = xfrm_resolve_and_create_bundle(
                                        pols, num_pols, fl,
                                        family, dst_orig);

                        if (IS_ERR(xdst)) {
                                xfrm_pols_put(pols, num_pols);
                                err = PTR_ERR(xdst);
                                if (err == -EREMOTE)
                                        goto nopol;

                                goto dropdst;
                        } else if (xdst == NULL) {
                                num_xfrms = 0;
                                drop_pols = num_pols;
                                goto no_transform;
                        }

                        route = xdst->route;
                }
        }

        /* Step 2: no bundle from a socket policy, use the global ones. */
        if (xdst == NULL) {
                struct xfrm_flo xflo;

                xflo.dst_orig = dst_orig;
                xflo.flags = flags;

                /* To accelerate a bit...  */
                if (!if_id && ((dst_orig->flags & DST_NOXFRM) ||
                               !net->xfrm.policy_count[XFRM_POLICY_OUT]))
                        goto nopol;

                xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id);
                if (xdst == NULL)
                        goto nopol;
                if (IS_ERR(xdst)) {
                        err = PTR_ERR(xdst);
                        goto dropdst;
                }

                /* The bundle carries the policies it was built from. */
                num_pols = xdst->num_pols;
                num_xfrms = xdst->num_xfrms;
                memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols);
                route = xdst->route;
        }

        dst = &xdst->u.dst;
        if (route == NULL && num_xfrms > 0) {
                /* The only case when xfrm_bundle_lookup() returns a
                 * bundle with null route, is when the template could
                 * not be resolved. It means policies are there, but
                 * bundle could not be created, since we don't yet
                 * have the xfrm_state's. We need to wait for KM to
                 * negotiate new SA's or bail out with error.*/
                if (net->xfrm.sysctl_larval_drop) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
                        err = -EREMOTE;
                        goto error;
                }

                err = -EAGAIN;

                XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
                goto error;
        }

no_transform:
        if (num_pols == 0)
                goto nopol;

        /* ICMP lookups are only honoured by policies flagged for ICMP. */
        if ((flags & XFRM_LOOKUP_ICMP) &&
            !(pols[0]->flags & XFRM_POLICY_ICMP)) {
                err = -ENOENT;
                goto error;
        }

        /* Record that the policies were just used. */
        for (i = 0; i < num_pols; i++)
                WRITE_ONCE(pols[i]->curlft.use_time, ktime_get_real_seconds());

        if (num_xfrms < 0) {
                /* Prohibit the flow */
                XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
                err = -EPERM;
                goto error;
        } else if (num_xfrms > 0) {
                /* Flow transformed */
                dst_release(dst_orig);
        } else {
                /* Flow passes untransformed */
                dst_release(dst);
                dst = dst_orig;
        }
ok:
        xfrm_pols_put(pols, drop_pols);
        /* Flag results whose state runs in tunnel mode. */
        if (dst && dst->xfrm &&
            dst->xfrm->props.mode == XFRM_MODE_TUNNEL)
                dst->flags |= DST_XFRM_TUNNEL;
        return dst;

nopol:
        /* No policy applies: block when that is the namespace default for
         * this direction, unless the route's device is a loopback device.
         */
        if ((!dst_orig->dev || !(dst_orig->dev->flags & IFF_LOOPBACK)) &&
            net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) {
                err = -EPERM;
                goto error;
        }
        if (!(flags & XFRM_LOOKUP_ICMP)) {
                dst = dst_orig;
                goto ok;
        }
        err = -ENOENT;
error:
        dst_release(dst);
dropdst:
        if (!(flags & XFRM_LOOKUP_KEEP_DST_REF))
                dst_release(dst_orig);
        xfrm_pols_put(pols, drop_pols);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(xfrm_lookup_with_ifid);

/* Main function: finds/creates a bundle for given flow.
 *
 * At the moment we eat a raw IP route. Mostly to speed up lookups
 * on interfaces with disabled IPsec.
 */
struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
                              const struct flowi *fl, const struct sock *sk,
                              int flags)
{
        return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0);
}
EXPORT_SYMBOL(xfrm_lookup);

/* Callers of xfrm_lookup_route() must ensure a call to dst_output().
 * Otherwise we may send out blackholed packets.
 *
 * Runs xfrm_lookup() with XFRM_LOOKUP_QUEUE and XFRM_LOOKUP_KEEP_DST_REF
 * added to @flags.  An -EREMOTE result is replaced by a blackhole route
 * built from @dst_orig; for any other error the reference on @dst_orig is
 * released here, since the lookup was told to keep it.
 */
struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
                                    const struct flowi *fl,
                                    const struct sock *sk, int flags)
{
        int lookup_flags = flags | XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_KEEP_DST_REF;
        struct dst_entry *dst;

        dst = xfrm_lookup(net, dst_orig, fl, sk, lookup_flags);
        if (!IS_ERR(dst))
                return dst;

        if (PTR_ERR(dst) == -EREMOTE)
                return make_blackhole(net, dst_orig->ops->family, dst_orig);

        dst_release(dst_orig);
        return dst;
}
EXPORT_SYMBOL(xfrm_lookup_route);

/* Invoke the ->reject() hook of the secpath state at position @idx.
 *
 * Returns 0 when the skb has no secpath, when @idx is outside the
 * secpath, or when the state's type has no reject hook; otherwise
 * returns whatever the hook returns.
 */
static inline int
xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl)
{
        struct sec_path *path = skb_sec_path(skb);
        struct xfrm_state *state;

        if (!path)
                return 0;
        if (idx < 0 || idx >= path->len)
                return 0;

        state = path->xvec[idx];
        return state->type->reject ? state->type->reject(state, skb, fl) : 0;
}

/* When skb is transformed back to its "native" form, we have to
 * check policy restrictions. At the moment we make this in maximally
 * stupid way. Shame on me. :-) Of course, connected sockets must
 * have policy cached at them.
 */

/* Decide whether state @x satisfies template @tmpl.
 *
 * Returns 1 on a match and 0 otherwise.  For states flagged by
 * xfrm_state_kern() only optional templates can match, and only the
 * addresses (compared in the template's encap_family) are checked.
 * For all other states every criterion below must hold; a zero spi,
 * reqid or @if_id acts as a wildcard.
 */
static inline int
xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x,
              unsigned short family, u32 if_id)
{
        if (xfrm_state_kern(x)) {
                if (!tmpl->optional)
                        return 0;
                return !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family);
        }

        if (x->id.proto != tmpl->id.proto)
                return 0;
        if (tmpl->id.spi && x->id.spi != tmpl->id.spi)
                return 0;
        if (tmpl->reqid && x->props.reqid != tmpl->reqid)
                return 0;
        if (x->props.mode != tmpl->mode)
                return 0;

        /* The algorithm mask only applies to IPsec protocols. */
        if (!tmpl->allalgs &&
            !(tmpl->aalgos & (1<<x->props.aalgo)) &&
            xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))
                return 0;

        /* Addresses are only compared for non-transport states. */
        if (x->props.mode != XFRM_MODE_TRANSPORT &&
            xfrm_state_addr_cmp(tmpl, x, family))
                return 0;

        if (if_id != 0 && if_id != x->if_id)
                return 0;

        return 1;
}

/*
 * Check one policy template against the secpath, scanning from index @start.
 *
 * 0 or more than 0 is returned when validation succeeds (either a bypass
 * because of an optional transport mode template, or the next index after
 * the secpath state that matched the template).
 * -1 is returned when no matching template is found.
 * Otherwise "-2 - errored_index" is returned.
 */
static inline int
xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start,
               unsigned short family, u32 if_id)
{
        int idx = start;

        if (tmpl->optional) {
                /* An optional transport-mode template needs no matching state. */
                if (tmpl->mode == XFRM_MODE_TRANSPORT)
                        return start;
        } else
                /* Mandatory template: from here on "start" carries the failure code. */
                start = -1;
        for (; idx < sp->len; idx++) {
                if (xfrm_state_ok(tmpl, sp->xvec[idx], family, if_id))
                        return ++idx;
                /* Non-matching transport-mode states are skipped over. */
                if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) {
                        if (idx < sp->verified_cnt) {
                                /* Secpath entry previously verified, consider optional and
                                 * continue searching
                                 */
                                continue;
                        }

                        /* Record the offending index only for a mandatory template. */
                        if (start == -1)
                                start = -2-idx;
                        break;
                }
        }
        return start;
}

/* Populate the IPv4 part of @fl from the dissected keys in @flkeys.
 *
 * The flowi4 is zeroed first.  When @reverse is set the source and
 * destination addresses and ports are swapped.  The port fields are
 * written before the GRE key / ICMP type+code, exactly as before.
 */
static void
decode_session4(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reverse)
{
        struct flowi4 *fl4 = &fl->u.ip4;

        memset(fl4, 0, sizeof(*fl4));

        fl4->saddr = reverse ? flkeys->addrs.ipv4.dst : flkeys->addrs.ipv4.src;
        fl4->daddr = reverse ? flkeys->addrs.ipv4.src : flkeys->addrs.ipv4.dst;
        fl4->fl4_sport = reverse ? flkeys->ports.dst : flkeys->ports.src;
        fl4->fl4_dport = reverse ? flkeys->ports.src : flkeys->ports.dst;

        if (flkeys->basic.ip_proto == IPPROTO_GRE) {
                fl4->fl4_gre_key = flkeys->gre.keyid;
        } else if (flkeys->basic.ip_proto == IPPROTO_ICMP) {
                fl4->fl4_icmp_type = flkeys->icmp.type;
                fl4->fl4_icmp_code = flkeys->icmp.code;
        }

        fl4->flowi4_proto = flkeys->basic.ip_proto;
        /* The ECN bits are not part of the flow's TOS. */
        fl4->flowi4_tos = flkeys->ip.tos & ~INET_ECN_MASK;
}

#if IS_ENABLED(CONFIG_IPV6)
/* Populate the IPv6 part of @fl from the dissected keys in @flkeys.
 *
 * The flowi6 is zeroed first.  When @reverse is set the source and
 * destination addresses and ports are swapped.
 */
static void
decode_session6(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reverse)
{
        struct flowi6 *fl6 = &fl->u.ip6;

        memset(fl6, 0, sizeof(*fl6));

        if (!reverse) {
                fl6->saddr = flkeys->addrs.ipv6.src;
                fl6->daddr = flkeys->addrs.ipv6.dst;
                fl6->fl6_sport = flkeys->ports.src;
                fl6->fl6_dport = flkeys->ports.dst;
        } else {
                fl6->saddr = flkeys->addrs.ipv6.dst;
                fl6->daddr = flkeys->addrs.ipv6.src;
                fl6->fl6_sport = flkeys->ports.dst;
                fl6->fl6_dport = flkeys->ports.src;
        }

        if (flkeys->basic.ip_proto == IPPROTO_GRE) {
                fl6->fl6_gre_key = flkeys->gre.keyid;
        } else if (flkeys->basic.ip_proto == IPPROTO_ICMPV6) {
                fl6->fl6_icmp_type = flkeys->icmp.type;
                fl6->fl6_icmp_code = flkeys->icmp.code;
        }

        fl6->flowi6_proto = flkeys->basic.ip_proto;
}
#endif

/* Build the flow description @fl for @skb.
 *
 * The packet is dissected with xfrm_session_dissector (stopping at
 * encapsulation) and the result is translated by the per-family helper.
 * Afterwards the mark and output interface are filled in: for a reverse
 * decode the oif is the skb's input interface, otherwise it is the
 * ifindex of the skb's dst device (0 if there is none).
 *
 * Returns -EAFNOSUPPORT for an unhandled @family, otherwise the result
 * of security_xfrm_decode_session().
 */
int __xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl,
                          unsigned int family, int reverse)
{
        struct xfrm_flow_keys flkeys;

        memset(&flkeys, 0, sizeof(flkeys));
        __skb_flow_dissect(net, skb, &xfrm_session_dissector, &flkeys,
                           NULL, 0, 0, 0, FLOW_DISSECTOR_F_STOP_AT_ENCAP);

        switch (family) {
        case AF_INET:
                decode_session4(&flkeys, fl, reverse);
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                decode_session6(&flkeys, fl, reverse);
                break;
#endif
        default:
                return -EAFNOSUPPORT;
        }

        fl->flowi_mark = skb->mark;

        if (reverse)
                fl->flowi_oif = skb->skb_iif;
        else if (skb_dst(skb) && skb_dst(skb)->dev)
                fl->flowi_oif = skb_dst(skb)->dev->ifindex;
        else
                fl->flowi_oif = 0;

        return security_xfrm_decode_session(skb, &fl->flowi_secid);
}
EXPORT_SYMBOL(__xfrm_decode_session);

/* Search secpath @sp from index @k for the first state whose mode is not
 * XFRM_MODE_TRANSPORT.
 *
 * Returns 1 and stores the index in *@idxp when one is found; returns 0
 * (leaving *@idxp untouched) otherwise.
 */
static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp)
{
        int i;

        for (i = k; i < sp->len; i++) {
                if (sp->xvec[i]->props.mode == XFRM_MODE_TRANSPORT)
                        continue;

                *idxp = i;
                return 1;
        }

        return 0;
}

static bool icmp_err_packet(const struct flowi *fl, unsigned short family)
{
        const struct flowi4 *fl4 = &fl->u.ip4;

        if (family == AF_INET &&
            fl4->flowi4_proto == IPPROTO_ICMP &&
            (fl4->fl4_icmp_type == ICMP_DEST_UNREACH ||
             fl4->fl4_icmp_type == ICMP_TIME_EXCEEDED))
                return true;

#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6) {
                const struct flowi6 *fl6 = &fl->u.ip6;

                if (fl6->flowi6_proto == IPPROTO_ICMPV6 &&
                    (fl6->fl6_icmp_type == ICMPV6_DEST_UNREACH ||
                    fl6->fl6_icmp_type == ICMPV6_PKT_TOOBIG ||
                    fl6->fl6_icmp_type == ICMPV6_TIME_EXCEED))
                        return true;
        }
#endif
        return false;
}

static bool xfrm_icmp_flow_decode(struct sk_buff *skb, unsigned short family,
                                  const struct flowi *fl, struct flowi *fl1)
{
        bool ret = true;
        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
        int hl = family == AF_INET ? (sizeof(struct iphdr) +  sizeof(struct icmphdr)) :
                 (sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr));

        if (!newskb)
                return true;

        if (!pskb_pull(newskb, hl))
                goto out;

        skb_reset_network_header(newskb);

        if (xfrm_decode_session_reverse(dev_net(skb->dev), newskb, fl1, family) < 0)
                goto out;

        fl1->flowi_oif = fl->flowi_oif;
        fl1->flowi_mark = fl->flowi_mark;
        fl1->flowi_tos = fl->flowi_tos;
        nf_nat_decode_session(newskb, fl1, family);
        ret = false;

out:
        consume_skb(newskb);
        return ret;
}

static bool xfrm_selector_inner_icmp_match(struct sk_buff *skb, unsigned short family,
                                           const struct xfrm_selector *sel,
                                           const struct flowi *fl)
{
        bool ret = false;

        if (icmp_err_packet(fl, family)) {
                struct flowi fl1;

                if (xfrm_icmp_flow_decode(skb, family, fl, &fl1))
                        return ret;

                ret = xfrm_selector_match(sel, &fl1, family);
        }

        return ret;
}

/* For an ICMP error packet, look up the forward policy that applies to
 * the flow embedded in the error.
 *
 * Returns NULL when @fl is not an ICMP error, when the embedded flow
 * cannot be decoded, or when the policy lookup yields an error pointer.
 */
static inline struct
xfrm_policy *xfrm_in_fwd_icmp(struct sk_buff *skb,
                              const struct flowi *fl, unsigned short family,
                              u32 if_id)
{
        struct xfrm_policy *found;
        struct flowi inner_fl;

        if (!icmp_err_packet(fl, family))
                return NULL;

        if (xfrm_icmp_flow_decode(skb, family, fl, &inner_fl))
                return NULL;

        found = xfrm_policy_lookup(dev_net(skb->dev), &inner_fl, family,
                                   XFRM_POLICY_FWD, if_id);

        return IS_ERR(found) ? NULL : found;
}

static inline struct
dst_entry *xfrm_out_fwd_icmp(struct sk_buff *skb, struct flowi *fl,
                             unsigned short family, struct dst_entry *dst)
{
        if (icmp_err_packet(fl, family)) {
                struct net *net = dev_net(skb->dev);
                struct dst_entry *dst2;
                struct flowi fl1;

                if (xfrm_icmp_flow_decode(skb, family, fl, &fl1))
                        return dst;

                dst_hold(dst);

                dst2 = xfrm_lookup(net, dst, &fl1, NULL, (XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_ICMP));

                if (IS_ERR(dst2))
                        return dst;

                if (dst2->xfrm) {
                        dst_release(dst);
                        dst = dst2;
                } else {
                        dst_release(dst2);
                }
        }

        return dst;
}

/* Check an skb against the xfrm policy for direction @dir.
 *
 * @sk:     socket whose per-socket policy is consulted first (may be NULL)
 * @dir:    policy direction in the XFRM_POLICY_MASK bits; any bit outside
 *          the mask requests a reverse flow decode
 * @skb:    packet being checked
 * @family: address family used to decode the flow
 *
 * Returns 1 when the packet is accepted and 0 when it is rejected.  On
 * each rejection path an XFRM MIB counter is incremented.
 */
int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
                        unsigned short family)
{
        struct net *net = dev_net(skb->dev);
        struct xfrm_policy *pol;
        struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
        int npols = 0;
        int xfrm_nr;
        int pi;
        int reverse;
        struct flowi fl;
        int xerr_idx = -1;
        const struct xfrm_if_cb *ifcb;
        struct sec_path *sp;
        u32 if_id = 0;

        /* A registered xfrm interface callback may override if_id and net. */
        rcu_read_lock();
        ifcb = xfrm_if_get_cb();

        if (ifcb) {
                struct xfrm_if_decode_session_result r;

                if (ifcb->decode_session(skb, family, &r)) {
                        if_id = r.if_id;
                        net = r.net;
                }
        }
        rcu_read_unlock();

        /* Split @dir into the reverse-decode request and the real direction. */
        reverse = dir & ~XFRM_POLICY_MASK;
        dir &= XFRM_POLICY_MASK;

        if (__xfrm_decode_session(net, skb, &fl, family, reverse) < 0) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
                return 0;
        }

        nf_nat_decode_session(skb, &fl, family);

        /* First, check used SA against their selectors. */
        sp = skb_sec_path(skb);
        if (sp) {
                int i;

                for (i = sp->len - 1; i >= 0; i--) {
                        struct xfrm_state *x = sp->xvec[i];
                        int ret = 0;

                        if (!xfrm_selector_match(&x->sel, &fl, family)) {
                                ret = 1;
                                /* A mismatch is tolerated only for states flagged
                                 * XFRM_STATE_ICMP whose selector matches the flow
                                 * embedded in an ICMP error.
                                 */
                                if (x->props.flags & XFRM_STATE_ICMP &&
                                    xfrm_selector_inner_icmp_match(skb, family, &x->sel, &fl))
                                        ret = 0;
                                if (ret) {
                                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH);
                                        return 0;
                                }
                        }
                }
        }

        /* Policy lookup order: socket policy, then the per-net lookup. */
        pol = NULL;
        sk = sk_to_full_sk(sk);
        if (sk && sk->sk_policy[dir]) {
                pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id);
                if (IS_ERR(pol)) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
                        return 0;
                }
        }

        if (!pol)
                pol = xfrm_policy_lookup(net, &fl, family, dir, if_id);

        if (IS_ERR(pol)) {
                XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
                return 0;
        }

        /* Forwarded ICMP errors get one more try using the embedded flow. */
        if (!pol && dir == XFRM_POLICY_FWD)
                pol = xfrm_in_fwd_icmp(skb, &fl, family, if_id);

        if (!pol) {
                /* No policy at all: the per-net default decides, and a secpath
                 * containing any non-transport state is still rejected.
                 */
                if (net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
                        return 0;
                }

                if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) {
                        xfrm_secpath_reject(xerr_idx, skb, &fl);
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS);
                        return 0;
                }
                return 1;
        }

        /* This lockless write can happen from different cpus. */
        WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds());

        pols[0] = pol;
        npols++;
#ifdef CONFIG_XFRM_SUB_POLICY
        /* A non-main policy is paired with the matching main policy, if any. */
        if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
                pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN,
                                                    &fl, family,
                                                    XFRM_POLICY_IN, if_id);
                if (pols[1]) {
                        if (IS_ERR(pols[1])) {
                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
                                xfrm_pol_put(pols[0]);
                                return 0;
                        }
                        /* This write can happen from different cpus. */
                        WRITE_ONCE(pols[1]->curlft.use_time,
                                   ktime_get_real_seconds());
                        npols++;
                }
        }
#endif

        if (pol->action == XFRM_POLICY_ALLOW) {
                /* Stand-in used when the skb carries no secpath. */
                static struct sec_path dummy;
                struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
                struct xfrm_tmpl *stp[XFRM_MAX_DEPTH];
                struct xfrm_tmpl **tpp = tp;
                int ti = 0;
                int i, k;

                sp = skb_sec_path(skb);
                if (!sp)
                        sp = &dummy;

                /* Gather the templates of all matched policies into tpp[]. */
                for (pi = 0; pi < npols; pi++) {
                        if (pols[pi] != pol &&
                            pols[pi]->action != XFRM_POLICY_ALLOW) {
                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);
                                goto reject;
                        }
                        /* Refuse once the combined count would reach XFRM_MAX_DEPTH. */
                        if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) {
                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
                                goto reject_error;
                        }
                        for (i = 0; i < pols[pi]->xfrm_nr; i++)
                                tpp[ti++] = &pols[pi]->xfrm_vec[i];
                }
                xfrm_nr = ti;

                /* Templates from more than one policy are sorted into stp[]. */
                if (npols > 1) {
                        xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
                        tpp = stp;
                }

                /* For each tunnel xfrm, find the first matching tmpl.
                 * For each tmpl before that, find corresponding xfrm.
                 * Order is _important_. Later we will implement
                 * some barriers, but at the moment barriers
                 * are implied between each two transformations.
                 * Upon success, marks secpath entries as having been
                 * verified to allow them to be skipped in future policy
                 * checks (e.g. nested tunnels).
                 */
                for (i = xfrm_nr-1, k = 0; i >= 0; i--) {
                        k = xfrm_policy_ok(tpp[i], sp, k, family, if_id);
                        if (k < 0) {
                                if (k < -1)
                                        /* "-2 - errored_index" returned */
                                        xerr_idx = -(2+k);
                                XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
                                goto reject;
                        }
                }

                /* Any non-transport state left over after the last match is an error. */
                if (secpath_has_nontransport(sp, k, &xerr_idx)) {
                        XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH);
                        goto reject;
                }

                xfrm_pols_put(pols, npols);
                sp->verified_cnt = k;

                return 1;
        }
        XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK);

reject:
        /* Give the state at xerr_idx (if any) a chance to run its reject hook. */
        xfrm_secpath_reject(xerr_idx, skb, &fl);
reject_error:
        xfrm_pols_put(pols, npols);
        return 0;
}
EXPORT_SYMBOL(__xfrm_policy_check);

/* Replace the dst of a forwarded skb with the result of an xfrm lookup.
 *
 * Returns 1 on success.  Returns 0 when the flow cannot be decoded, when
 * the skb has no dst after skb_dst_force(), or when the lookup fails; in
 * the last case the skb's dst is set to NULL.  When the lookup yields a
 * dst without a transform, xfrm_out_fwd_icmp() gets a chance to supply
 * one based on an embedded ICMP error flow.
 */
int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
        struct net *net = dev_net(skb->dev);
        struct dst_entry *dst;
        struct flowi fl;

        if (xfrm_decode_session(net, skb, &fl, family) < 0)
                goto hdr_error;

        skb_dst_force(skb);
        if (!skb_dst(skb))
                goto hdr_error;

        dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE);
        if (IS_ERR(dst)) {
                skb_dst_set(skb, NULL);
                return 0;
        }

        if (dst && !dst->xfrm)
                dst = xfrm_out_fwd_icmp(skb, &fl, family, dst);

        skb_dst_set(skb, dst);
        return 1;

hdr_error:
        XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
        return 0;
}
EXPORT_SYMBOL(__xfrm_route_forward);

/* Optimize later using cookies and generation ids. */

/* dst_ops->check hook for xfrm dsts: returns @dst while it is still
 * usable and NULL once it must be discarded.  @cookie is not used.
 */
static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
{
        /* Bundle creation code (such as __xfrm4_bundle_create()) sets
         * dst->obsolete to DST_OBSOLETE_FORCE_CHK so that every XFRM
         * destination is validated through dst_ops->check on each use.
         * The reason: when an ordinary route referenced by an XFRM dst
         * becomes obsolete, nobody walks all the XFRM dsts that point at
         * it to invalidate them - that would be too much work - so the
         * validation happens here instead.  For example:
         *
         *        XFRM dst A --> IPv4 dst X
         *
         * X is the "xdst->route" of A (and, in this example, also the
         * "dst->path" of A).  If X is marked obsolete, A will not notice
         * by itself; the stale_bundle() check below is what catches it.
         *
         * When a dst is removed from the fib tree it is marked
         * DST_OBSOLETE_DEAD, which makes stale_bundle() fail for every
         * xdst bundle that has this dst linked in.
         */
        if (dst->obsolete >= 0)
                return NULL;

        if (stale_bundle(dst))
                return NULL;

        return dst;
}

/* Returns 1 when the bundle headed by @dst fails xfrm_bundle_ok(),
 * 0 when it is still valid.
 */
static int stale_bundle(struct dst_entry *dst)
{
        struct xfrm_dst *xdst = (struct xfrm_dst *)dst;

        return xfrm_bundle_ok(xdst) ? 0 : 1;
}

/* Detach the children of @dst from device @dev.
 *
 * Walks the child chain starting at the first child of @dst and, for as
 * long as each child carries a transform and still points at @dev,
 * re-targets it at blackhole_netdev, moving the device reference from
 * @dev to the blackhole device.
 */
void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
{
        for (dst = xfrm_dst_child(dst);
             dst && dst->xfrm && dst->dev == dev;
             dst = xfrm_dst_child(dst)) {
                dst->dev = blackhole_netdev;
                dev_hold(dst->dev);
                dev_put(dev);
        }
}
EXPORT_SYMBOL(xfrm_dst_ifdown);

/* Default link_failure hook, installed into an address family's dst_ops
 * by xfrm_policy_register_afinfo() when none is set.  Intentionally a
 * no-op.
 */
static void xfrm_link_failure(struct sk_buff *skb)
{
        /* Impossible. Such dst must be popped before reaches point of failure. */
}

/* negative_advice hook: drop the socket's cached dst, but only when
 * @dst has been marked obsolete.
 */
static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst)
{
        if (!dst->obsolete)
                return;

        sk_dst_reset(sk);
}

/* Seed the cached MTUs and the RTAX_MTU metric of every entry in
 * @bundle, processing the array from its last element (index @nr - 1)
 * down to the first.
 *
 * For each entry the MTU is derived from its child's MTU via
 * xfrm_state_mtu() and then capped at the MTU of the entry's route.
 */
static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr)
{
        while (nr) {
                struct xfrm_dst *xdst = bundle[--nr];
                struct dst_entry *dst = &xdst->u.dst;
                u32 child_mtu, route_mtu, mtu;

                child_mtu = dst_mtu(xfrm_dst_child(dst));
                xdst->child_mtu_cached = child_mtu;

                mtu = xfrm_state_mtu(dst->xfrm, child_mtu);

                route_mtu = dst_mtu(xdst->route);
                xdst->route_mtu_cached = route_mtu;

                dst_metric_set(dst, RTAX_MTU,
                               mtu > route_mtu ? route_mtu : mtu);
        }
}

/* Check that the bundle accepts the flow and its components are
 * still valid.
 *
 * Returns 1 when the bundle headed by @first is still usable and 0 when
 * it is stale.  As a side effect, cached child/route MTUs that changed
 * are refreshed and the RTAX_MTU metric of the affected entries is
 * recomputed.
 */

static int xfrm_bundle_ok(struct xfrm_dst *first)
{
        struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
        struct dst_entry *dst = &first->u.dst;
        struct xfrm_dst *xdst;
        int start_from, nr;
        u32 mtu;

        /* Stale if the path dst fails dst_check() or the device is not running. */
        if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) ||
            (dst->dev && !netif_running(dst->dev)))
                return 0;

        /* Bundles flagged DST_XFRM_QUEUE skip the per-entry checks below. */
        if (dst->flags & DST_XFRM_QUEUE)
                return 1;

        /* Walk the bundle from the head down the child chain.  start_from
         * ends up as the 1-based position of the deepest entry whose cached
         * child or route MTU changed (0 if none did).
         */
        start_from = nr = 0;
        do {
                /* NOTE: shadows the function-scope xdst for the loop body. */
                struct xfrm_dst *xdst = (struct xfrm_dst *)dst;

                if (dst->xfrm->km.state != XFRM_STATE_VALID)
                        return 0;
                if (xdst->xfrm_genid != dst->xfrm->genid)
                        return 0;
                if (xdst->num_pols > 0 &&
                    xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
                        return 0;

                bundle[nr++] = xdst;

                mtu = dst_mtu(xfrm_dst_child(dst));
                if (xdst->child_mtu_cached != mtu) {
                        start_from = nr;
                        xdst->child_mtu_cached = mtu;
                }

                if (!dst_check(xdst->route, xdst->route_cookie))
                        return 0;
                mtu = dst_mtu(xdst->route);
                if (xdst->route_mtu_cached != mtu) {
                        start_from = nr;
                        xdst->route_mtu_cached = mtu;
                }

                dst = xfrm_dst_child(dst);
        } while (dst->xfrm);

        /* Nothing changed: no MTU needs recomputing. */
        if (likely(!start_from))
                return 1;

        /* Recompute MTUs from the deepest changed entry back up to the head,
         * feeding each entry's new MTU into its parent's cached child MTU.
         */
        xdst = bundle[start_from - 1];
        mtu = xdst->child_mtu_cached;
        while (start_from--) {
                dst = &xdst->u.dst;

                mtu = xfrm_state_mtu(dst->xfrm, mtu);
                if (mtu > xdst->route_mtu_cached)
                        mtu = xdst->route_mtu_cached;
                dst_metric_set(dst, RTAX_MTU, mtu);
                if (!start_from)
                        break;

                xdst = bundle[start_from - 1];
                xdst->child_mtu_cached = mtu;
        }

        return 1;
}

/* default_advmss hook: report the advmss metric of the bundle's path dst. */
static unsigned int xfrm_default_advmss(const struct dst_entry *dst)
{
        const struct dst_entry *path = xfrm_dst_path(dst);

        return dst_metric_advmss(path);
}

/* mtu hook: use the dst's own raw RTAX_MTU metric when it is non-zero,
 * otherwise fall back to the MTU of the bundle's path dst.
 */
static unsigned int xfrm_mtu(const struct dst_entry *dst)
{
        unsigned int own_mtu = dst_metric_raw(dst, RTAX_MTU);

        if (own_mtu)
                return own_mtu;

        return dst_mtu(xfrm_dst_path(dst));
}

/* Work out the next-hop address to use for a bundle.
 *
 * Walks down the child chain while entries carry a transform.  Transport
 * mode states leave @daddr alone; for the others the address becomes the
 * state's care-of address (XFRM_TYPE_REMOTE_COADDR), stays unchanged
 * (XFRM_TYPE_LOCAL_COADDR), or becomes the state's id.daddr.  The value
 * set by the last such state is returned.
 */
static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst,
                                        const void *daddr)
{
        while (dst->xfrm) {
                const struct xfrm_state *x = dst->xfrm;

                dst = xfrm_dst_child(dst);

                if (x->props.mode != XFRM_MODE_TRANSPORT) {
                        if (x->type->flags & XFRM_TYPE_REMOTE_COADDR)
                                daddr = x->coaddr;
                        else if (!(x->type->flags & XFRM_TYPE_LOCAL_COADDR))
                                daddr = &x->id.daddr;
                }
        }

        return daddr;
}

/* neigh_lookup hook: delegate to the path dst's neigh_lookup.  When no
 * skb is supplied the address is first translated through
 * xfrm_get_dst_nexthop(); with an skb, @daddr is passed through as is.
 */
static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct dst_entry *path = xfrm_dst_path(dst);
        const void *key = skb ? daddr : xfrm_get_dst_nexthop(dst, daddr);

        return path->ops->neigh_lookup(path, skb, key);
}

/* confirm_neigh hook: translate @daddr through xfrm_get_dst_nexthop()
 * and forward the confirmation to the path dst's confirm_neigh.
 */
static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct dst_entry *path = xfrm_dst_path(dst);
        const void *nexthop = xfrm_get_dst_nexthop(dst, daddr);

        path->ops->confirm_neigh(path, nexthop);
}

/* Register the per-address-family policy operations @afinfo for @family.
 *
 * Every dst_ops callback the family left unset is filled in with the
 * generic xfrm implementation before the afinfo pointer is published.
 *
 * Returns 0 on success, -EAFNOSUPPORT (with a warning) when @family is
 * out of range, or -EEXIST when the slot is already taken.
 */
int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family)
{
        struct dst_ops *dst_ops;
        int err = 0;

        if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo)))
                return -EAFNOSUPPORT;

        spin_lock(&xfrm_policy_afinfo_lock);

        if (unlikely(xfrm_policy_afinfo[family] != NULL)) {
                err = -EEXIST;
                goto unlock;
        }

        dst_ops = afinfo->dst_ops;

        if (likely(!dst_ops->kmem_cachep))
                dst_ops->kmem_cachep = xfrm_dst_cache;
        if (likely(!dst_ops->check))
                dst_ops->check = xfrm_dst_check;
        if (likely(!dst_ops->default_advmss))
                dst_ops->default_advmss = xfrm_default_advmss;
        if (likely(!dst_ops->mtu))
                dst_ops->mtu = xfrm_mtu;
        if (likely(!dst_ops->negative_advice))
                dst_ops->negative_advice = xfrm_negative_advice;
        if (likely(!dst_ops->link_failure))
                dst_ops->link_failure = xfrm_link_failure;
        if (likely(!dst_ops->neigh_lookup))
                dst_ops->neigh_lookup = xfrm_neigh_lookup;
        if (likely(!dst_ops->confirm_neigh))
                dst_ops->confirm_neigh = xfrm_confirm_neigh;

        rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo);

unlock:
        spin_unlock(&xfrm_policy_afinfo_lock);

        return err;
}
EXPORT_SYMBOL(xfrm_policy_register_afinfo);

/* Unregister @afinfo.
 *
 * Clears the first table slot that points at @afinfo, waits for an RCU
 * grace period, and then resets the kmem_cachep, check, negative_advice
 * and link_failure members of the family's dst_ops.
 */
void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo)
{
        struct dst_ops *ops = afinfo->dst_ops;
        int slot;

        for (slot = 0; slot < ARRAY_SIZE(xfrm_policy_afinfo); slot++) {
                if (xfrm_policy_afinfo[slot] == afinfo) {
                        RCU_INIT_POINTER(xfrm_policy_afinfo[slot], NULL);
                        break;
                }
        }

        synchronize_rcu();

        ops->kmem_cachep = NULL;
        ops->check = NULL;
        ops->negative_advice = NULL;
        ops->link_failure = NULL;
}
EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);

/* Install @ifcb as the global xfrm interface callback table.  The
 * pointer is published with rcu_assign_pointer() while holding
 * xfrm_if_cb_lock.
 */
void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb)
{
        spin_lock(&xfrm_if_cb_lock);
        rcu_assign_pointer(xfrm_if_cb, ifcb);
        spin_unlock(&xfrm_if_cb_lock);
}
EXPORT_SYMBOL(xfrm_if_register_cb);

/* Clear the global xfrm interface callback pointer, then wait for an
 * RCU grace period before returning.
 */
void xfrm_if_unregister_cb(void)
{
        RCU_INIT_POINTER(xfrm_if_cb, NULL);
        synchronize_rcu();
}
EXPORT_SYMBOL(xfrm_if_unregister_cb);

#ifdef CONFIG_XFRM_STATISTICS
/* Allocate the per-cpu xfrm MIB for @net and set up its proc entries.
 *
 * Returns -ENOMEM when the per-cpu allocation fails; otherwise returns
 * the result of xfrm_proc_init(), freeing the MIB again if that result
 * is negative.
 */
static int __net_init xfrm_statistics_init(struct net *net)
{
        int err;

        net->mib.xfrm_statistics = alloc_percpu(struct linux_xfrm_mib);
        if (!net->mib.xfrm_statistics)
                return -ENOMEM;

        err = xfrm_proc_init(net);
        if (err >= 0)
                return err;

        free_percpu(net->mib.xfrm_statistics);
        return err;
}

/* Tear down the proc entries and release the per-cpu xfrm MIB of @net. */
static void xfrm_statistics_fini(struct net *net)
{
        xfrm_proc_fini(net);
        free_percpu(net->mib.xfrm_statistics);
}
#else
/* Statistics support compiled out: nothing to set up. */
static int __net_init xfrm_statistics_init(struct net *net)
{
        return 0;
}

/* Statistics support compiled out: nothing to tear down. */
static void xfrm_statistics_fini(struct net *net)
{
}
#endif

/* Set up the policy tables of network namespace @net.
 *
 * For init_net only, the xfrm_dst slab cache and the inexact-policy
 * rhashtable are created first.  Then the by-index hash and one
 * by-destination hash per direction are allocated with 8 buckets each,
 * and the hash thresholds, lists and work items are initialised.
 *
 * Returns 0 on success or -ENOMEM, in which case every hash table
 * allocated so far has been freed again.
 */
static int __net_init xfrm_policy_init(struct net *net)
{
        const unsigned int hmask = 8 - 1;
        const unsigned int sz = (hmask + 1) * sizeof(struct hlist_head);
        int dir, err;

        if (net_eq(net, &init_net)) {
                xfrm_dst_cache = KMEM_CACHE(xfrm_dst, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
                err = rhashtable_init(&xfrm_policy_inexact_table,
                                      &xfrm_pol_inexact_params);
                BUG_ON(err);
        }

        net->xfrm.policy_byidx = xfrm_hash_alloc(sz);
        if (!net->xfrm.policy_byidx)
                return -ENOMEM;
        net->xfrm.policy_idx_hmask = hmask;

        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
                struct xfrm_policy_hash *htab = &net->xfrm.policy_bydst[dir];

                net->xfrm.policy_count[dir] = 0;
                net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0;
                INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]);

                htab->table = xfrm_hash_alloc(sz);
                if (!htab->table)
                        goto free_tables;

                htab->hmask = hmask;
                htab->dbits4 = 32;
                htab->sbits4 = 32;
                htab->dbits6 = 128;
                htab->sbits6 = 128;
        }

        net->xfrm.policy_hthresh.lbits4 = 32;
        net->xfrm.policy_hthresh.rbits4 = 32;
        net->xfrm.policy_hthresh.lbits6 = 128;
        net->xfrm.policy_hthresh.rbits6 = 128;

        seqlock_init(&net->xfrm.policy_hthresh.lock);

        INIT_LIST_HEAD(&net->xfrm.policy_all);
        INIT_LIST_HEAD(&net->xfrm.inexact_bins);
        INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
        INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild);
        return 0;

free_tables:
        /* Only the directions before the failing one own a table. */
        while (--dir >= 0)
                xfrm_hash_free(net->xfrm.policy_bydst[dir].table, sz);
        xfrm_hash_free(net->xfrm.policy_byidx, sz);
        return -ENOMEM;
}

/* Tear down the policy tables of network namespace @net.
 *
 * Waits for the pending hash-resize work, flushes all policies, frees
 * the per-direction and by-index hash tables (warning if any is still
 * populated), and finally prunes the inexact bins under the policy lock.
 */
static void xfrm_policy_fini(struct net *net)
{
        struct xfrm_pol_inexact_bin *b, *t;
        unsigned int sz;
        int dir;

        flush_work(&net->xfrm.policy_hash_work);
#ifdef CONFIG_XFRM_SUB_POLICY
        xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false);
#endif
        xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false);

        /* After the flushes no policy should remain on the global list. */
        WARN_ON(!list_empty(&net->xfrm.policy_all));

        for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
                struct xfrm_policy_hash *htab;

                WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir]));

                htab = &net->xfrm.policy_bydst[dir];
                /* Size is recomputed from the table's current hmask. */
                sz = (htab->hmask + 1) * sizeof(struct hlist_head);
                WARN_ON(!hlist_empty(htab->table));
                xfrm_hash_free(htab->table, sz);
        }

        sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head);
        WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
        xfrm_hash_free(net->xfrm.policy_byidx, sz);

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins)
                __xfrm_policy_inexact_prune_bin(b, true);
        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
}

/* Per-namespace constructor for the xfrm subsystem.
 *
 * Initialises the per-net locks and default policies (ACCEPT for all
 * three directions), then brings up statistics, state, policy and sysctl
 * in that order.  On failure the steps already completed are undone in
 * reverse order and the negative error code is returned; returns 0 on
 * success.
 */
static int __net_init xfrm_net_init(struct net *net)
{
        int rv;

        /* Initialize the per-net locks here */
        spin_lock_init(&net->xfrm.xfrm_state_lock);
        spin_lock_init(&net->xfrm.xfrm_policy_lock);
        seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock);
        mutex_init(&net->xfrm.xfrm_cfg_mutex);
        net->xfrm.policy_default[XFRM_POLICY_IN] = XFRM_USERPOLICY_ACCEPT;
        net->xfrm.policy_default[XFRM_POLICY_FWD] = XFRM_USERPOLICY_ACCEPT;
        net->xfrm.policy_default[XFRM_POLICY_OUT] = XFRM_USERPOLICY_ACCEPT;

        rv = xfrm_statistics_init(net);
        if (rv < 0)
                goto out_statistics;
        rv = xfrm_state_init(net);
        if (rv < 0)
                goto out_state;
        rv = xfrm_policy_init(net);
        if (rv < 0)
                goto out_policy;
        rv = xfrm_sysctl_init(net);
        if (rv < 0)
                goto out_sysctl;

        return 0;

        /* Each label undoes the steps that succeeded before the failing one. */
out_sysctl:
        xfrm_policy_fini(net);
out_policy:
        xfrm_state_fini(net);
out_state:
        xfrm_statistics_fini(net);
out_statistics:
        return rv;
}

/*
 * Per-network-namespace destructor for xfrm.
 *
 * Tears down sysctl, policy, state and statistics support, i.e. the reverse
 * of the order in which xfrm_net_init() sets them up.
 */
static void __net_exit xfrm_net_exit(struct net *net)
{
        xfrm_sysctl_fini(net);
        xfrm_policy_fini(net);
        xfrm_state_fini(net);
        xfrm_statistics_fini(net);
}

/* Per-net init/exit callbacks; registered from xfrm_init(). */
static struct pernet_operations __net_initdata xfrm_net_ops = {
        .init = xfrm_net_init,
        .exit = xfrm_net_exit,
};

/*
 * Key table handed to skb_flow_dissector_init() for xfrm_session_dissector.
 * Each entry pairs a flow dissector key id with the offset of the matching
 * member inside struct xfrm_flow_keys.
 */
static const struct flow_dissector_key xfrm_flow_dissector_keys[] = {
        {
                .key_id = FLOW_DISSECTOR_KEY_CONTROL,
                .offset = offsetof(struct xfrm_flow_keys, control),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_BASIC,
                .offset = offsetof(struct xfrm_flow_keys, basic),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                .offset = offsetof(struct xfrm_flow_keys, addrs.ipv4),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                .offset = offsetof(struct xfrm_flow_keys, addrs.ipv6),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_PORTS,
                .offset = offsetof(struct xfrm_flow_keys, ports),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
                .offset = offsetof(struct xfrm_flow_keys, gre),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IP,
                .offset = offsetof(struct xfrm_flow_keys, ip),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_ICMP,
                .offset = offsetof(struct xfrm_flow_keys, icmp),
        },
};

/*
 * Boot-time initialization of the xfrm policy layer.
 *
 * Initializes the session flow dissector from xfrm_flow_dissector_keys,
 * registers the per-net operations, then initializes the device and input
 * parts, espintcp (only when CONFIG_XFRM_ESPINTCP is set) and finally calls
 * register_xfrm_state_bpf().
 */
void __init xfrm_init(void)
{
        skb_flow_dissector_init(&xfrm_session_dissector,
                                xfrm_flow_dissector_keys,
                                ARRAY_SIZE(xfrm_flow_dissector_keys));

        /* NOTE(review): the return value of register_pernet_subsys() is not checked. */
        register_pernet_subsys(&xfrm_net_ops);
        xfrm_dev_init();
        xfrm_input_init();

#ifdef CONFIG_XFRM_ESPINTCP
        espintcp_init();
#endif

        register_xfrm_state_bpf();
}

#ifdef CONFIG_AUDITSYSCALL
static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp,
                                         struct audit_buffer *audit_buf)
{
        struct xfrm_sec_ctx *ctx = xp->security;
        struct xfrm_selector *sel = &xp->selector;

        if (ctx)
                audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s",
                                 ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str);

        switch (sel->family) {
        case AF_INET:
                audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4);
                if (sel->prefixlen_s != 32)
                        audit_log_format(audit_buf, " src_prefixlen=%d",
                                         sel->prefixlen_s);
                audit_log_format(audit_buf, " dst=%pI4", &sel->daddr.a4);
                if (sel->prefixlen_d != 32)
                        audit_log_format(audit_buf, " dst_prefixlen=%d",
                                         sel->prefixlen_d);
                break;
        case AF_INET6:
                audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6);
                if (sel->prefixlen_s != 128)
                        audit_log_format(audit_buf, " src_prefixlen=%d",
                                         sel->prefixlen_s);
                audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6);
                if (sel->prefixlen_d != 128)
                        audit_log_format(audit_buf, " dst_prefixlen=%d",
                                         sel->prefixlen_d);
                break;
        }
}

/*
 * Write an "SPD-add" audit record for policy @xp.
 *
 * @result is logged as the "res=" field and @task_valid is forwarded to
 * xfrm_audit_helper_usrinfo().  Nothing is logged when no audit buffer
 * could be started.
 */
void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid)
{
        struct audit_buffer *ab = xfrm_audit_start("SPD-add");

        if (!ab)
                return;

        xfrm_audit_helper_usrinfo(task_valid, ab);
        audit_log_format(ab, " res=%u", result);
        xfrm_audit_common_policyinfo(xp, ab);
        audit_log_end(ab);
}
EXPORT_SYMBOL_GPL(xfrm_audit_policy_add);

/*
 * Write an "SPD-delete" audit record for policy @xp.
 *
 * @result is logged as the "res=" field and @task_valid is forwarded to
 * xfrm_audit_helper_usrinfo().  Nothing is logged when no audit buffer
 * could be started.
 */
void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result,
                              bool task_valid)
{
        struct audit_buffer *ab = xfrm_audit_start("SPD-delete");

        if (!ab)
                return;

        xfrm_audit_helper_usrinfo(task_valid, ab);
        audit_log_format(ab, " res=%u", result);
        xfrm_audit_common_policyinfo(xp, ab);
        audit_log_end(ab);
}
EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete);
#endif

#ifdef CONFIG_XFRM_MIGRATE
/*
 * Decide whether the policy selector @sel_tgt matches the selector
 * @sel_cmp given in a migrate request.
 *
 * When @sel_cmp->proto is IPSEC_ULPROTO_ANY only the family, both
 * addresses and both prefix lengths have to agree.  For any other
 * protocol the two selectors must be byte-for-byte identical.
 */
static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp,
                                        const struct xfrm_selector *sel_tgt)
{
        /* Specific upper-layer protocol: require an exact selector match. */
        if (sel_cmp->proto != IPSEC_ULPROTO_ANY)
                return memcmp(sel_tgt, sel_cmp, sizeof(*sel_tgt)) == 0;

        /* Wildcard protocol: compare only the addressing part. */
        if (sel_tgt->family != sel_cmp->family)
                return false;
        if (!xfrm_addr_equal(&sel_tgt->daddr, &sel_cmp->daddr, sel_cmp->family))
                return false;
        if (!xfrm_addr_equal(&sel_tgt->saddr, &sel_cmp->saddr, sel_cmp->family))
                return false;

        return sel_tgt->prefixlen_d == sel_cmp->prefixlen_d &&
               sel_tgt->prefixlen_s == sel_cmp->prefixlen_s;
}

/*
 * Find the policy a migrate request refers to.
 *
 * A policy qualifies when its selector matches @sel (see
 * xfrm_migrate_selector_match()), its type equals @type and, if @if_id is
 * non-zero, its if_id equals @if_id.  The direct hash chain for the
 * selector's addresses is searched first, then the inexact list for @dir.
 * The whole lookup runs under net->xfrm.xfrm_policy_lock.
 *
 * Returns the policy with xfrm_pol_hold() applied to it, or NULL when no
 * policy qualifies.
 */
static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel,
                                                    u8 dir, u8 type, struct net *net, u32 if_id)
{
        struct xfrm_policy *pol, *ret = NULL;
        struct hlist_head *chain;
        /* Priority of the direct-chain hit; stays at the maximum if there is none. */
        u32 priority = ~0U;

        spin_lock_bh(&net->xfrm.xfrm_policy_lock);
        chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir);
        hlist_for_each_entry(pol, chain, bydst) {
                if ((if_id == 0 || pol->if_id == if_id) &&
                    xfrm_migrate_selector_match(sel, &pol->selector) &&
                    pol->type == type) {
                        ret = pol;
                        priority = ret->priority;
                        break;
                }
        }
        chain = &net->xfrm.policy_inexact[dir];
        hlist_for_each_entry(pol, chain, bydst_inexact_list) {
                /*
                 * With a direct-chain hit in hand, stop at the first inexact
                 * entry whose priority value is not lower than the hit's.
                 * NOTE(review): this presumably relies on the inexact list
                 * being ordered by priority - confirm.
                 */
                if ((pol->priority >= priority) && ret)
                        break;

                if ((if_id == 0 || pol->if_id == if_id) &&
                    xfrm_migrate_selector_match(sel, &pol->selector) &&
                    pol->type == type) {
                        ret = pol;
                        break;
                }
        }

        /*
         * Hold the result before the lock is released.  ret may still be
         * NULL here; NOTE(review): xfrm_pol_hold() is presumably
         * NULL-tolerant - confirm.
         */
        xfrm_pol_hold(ret);

        spin_unlock_bh(&net->xfrm.xfrm_policy_lock);

        return ret;
}

/*
 * Check whether template @t is covered by migrate entry @m.
 *
 * Mode and protocol must be equal, and the reqid must be equal unless
 * @m->reqid is 0.  For tunnel and BEET mode the template's endpoint
 * addresses must additionally equal the entry's old addresses.
 *
 * Returns 1 on a match, 0 otherwise.
 */
static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t)
{
        if (t->mode != m->mode || t->id.proto != m->proto)
                return 0;

        /* A reqid of 0 in the migrate entry acts as a wildcard. */
        if (m->reqid != 0 && t->reqid != m->reqid)
                return 0;

        switch (t->mode) {
        case XFRM_MODE_TUNNEL:
        case XFRM_MODE_BEET:
                return xfrm_addr_equal(&t->id.daddr, &m->old_daddr,
                                       m->old_family) &&
                       xfrm_addr_equal(&t->saddr, &m->old_saddr,
                                       m->old_family);
        case XFRM_MODE_TRANSPORT:
                /*
                 * A transport mode template does not store any IP
                 * addresses, so mode and protocol are all there is to
                 * compare.
                 */
                return 1;
        default:
                return 0;
        }
}

/*
 * Update endpoint address(es) of template(s).
 *
 * Every template of @pol is compared against each of the @num_migrate
 * entries in @m (see migrate_tmpl_match()).  For matching tunnel or BEET
 * mode templates the endpoint addresses and encap_family are replaced by
 * the entry's new values and pol->genid is incremented.  Matching templates
 * of other modes are only counted.  All of this happens under pol->lock
 * held for writing.
 *
 * Returns 0 if at least one template matched, -ENOENT (with an extack
 * message) if the policy is already marked dead, and -ENODATA if nothing
 * matched.
 */
static int xfrm_policy_migrate(struct xfrm_policy *pol,
                               struct xfrm_migrate *m, int num_migrate,
                               struct netlink_ext_ack *extack)
{
        struct xfrm_migrate *mp;
        /* n counts (template, migrate entry) pairs that matched. */
        int i, j, n = 0;

        write_lock_bh(&pol->lock);
        if (unlikely(pol->walk.dead)) {
                /* target policy has been deleted */
                NL_SET_ERR_MSG(extack, "Target policy not found");
                write_unlock_bh(&pol->lock);
                return -ENOENT;
        }

        for (i = 0; i < pol->xfrm_nr; i++) {
                for (j = 0, mp = m; j < num_migrate; j++, mp++) {
                        if (!migrate_tmpl_match(mp, &pol->xfrm_vec[i]))
                                continue;
                        n++;
                        /* Only tunnel and BEET templates have addresses to rewrite. */
                        if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL &&
                            pol->xfrm_vec[i].mode != XFRM_MODE_BEET)
                                continue;
                        /* update endpoints */
                        memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr,
                               sizeof(pol->xfrm_vec[i].id.daddr));
                        memcpy(&pol->xfrm_vec[i].saddr, &mp->new_saddr,
                               sizeof(pol->xfrm_vec[i].saddr));
                        pol->xfrm_vec[i].encap_family = mp->new_family;
                        /* flush bundles */
                        atomic_inc(&pol->genid);
                }
        }

        write_unlock_bh(&pol->lock);

        if (!n)
                return -ENODATA;

        return 0;
}

static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate,
                              struct netlink_ext_ack *extack)
{
        int i, j;

        if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH) {
                NL_SET_ERR_MSG(extack, "Invalid number of SAs to migrate, must be 0 < num <= XFRM_MAX_DEPTH (6)");
                return -EINVAL;
        }

        for (i = 0; i < num_migrate; i++) {
                if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) ||
                    xfrm_addr_any(&m[i].new_saddr, m[i].new_family)) {
                        NL_SET_ERR_MSG(extack, "Addresses in the MIGRATE attribute's list cannot be null");
                        return -EINVAL;
                }

                /* check if there is any duplicated entry */
                for (j = i + 1; j < num_migrate; j++) {
                        if (!memcmp(&m[i].old_daddr, &m[j].old_daddr,
                                    sizeof(m[i].old_daddr)) &&
                            !memcmp(&m[i].old_saddr, &m[j].old_saddr,
                                    sizeof(m[i].old_saddr)) &&
                            m[i].proto == m[j].proto &&
                            m[i].mode == m[j].mode &&
                            m[i].reqid == m[j].reqid &&
                            m[i].old_family == m[j].old_family) {
                                NL_SET_ERR_MSG(extack, "Entries in the MIGRATE attribute's list must be unique");
                                return -EINVAL;
                        }
                }
        }

        return 0;
}

/*
 * Migrate the endpoints of a policy and of the states its templates use.
 *
 * @sel, @dir, @type, @if_id: identify the policy to migrate.
 * @m, @num_migrate:          the list of old/new endpoint descriptions.
 * @k, @encap:                passed through to km_migrate(); @encap is also
 *                            passed to xfrm_state_migrate().
 * @extack:                   receives an error message on some failures.
 *
 * The work is done in stages: validate the request, look up the policy,
 * create a migrated copy of every state found for the entries in @m, update
 * the policy templates, delete the old states and finally announce the
 * migration through km_migrate().
 *
 * Returns 0 on success.  On failure returns the negative value from
 * xfrm_migrate_check() or xfrm_policy_migrate(), -EINVAL for a bad @dir,
 * -ENOENT when no policy is found, or -ENODATA when xfrm_state_migrate()
 * fails for a state.
 */
int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
                 struct xfrm_migrate *m, int num_migrate,
                 struct xfrm_kmaddress *k, struct net *net,
                 struct xfrm_encap_tmpl *encap, u32 if_id,
                 struct netlink_ext_ack *extack)
{
        int i, err, nx_cur = 0, nx_new = 0;
        struct xfrm_policy *pol = NULL;
        struct xfrm_state *x, *xc;
        /* States returned by xfrm_migrate_state_find() (the existing ones). */
        struct xfrm_state *x_cur[XFRM_MAX_DEPTH];
        /* States returned by xfrm_state_migrate() (the new ones). */
        struct xfrm_state *x_new[XFRM_MAX_DEPTH];
        struct xfrm_migrate *mp;

        /* Stage 0 - sanity checks */
        err = xfrm_migrate_check(m, num_migrate, extack);
        if (err < 0)
                goto out;

        if (dir >= XFRM_POLICY_MAX) {
                NL_SET_ERR_MSG(extack, "Invalid policy direction");
                err = -EINVAL;
                goto out;
        }

        /* Stage 1 - find policy */
        pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id);
        if (!pol) {
                NL_SET_ERR_MSG(extack, "Target policy not found");
                err = -ENOENT;
                goto out;
        }

        /* Stage 2 - find and update state(s) */
        for (i = 0, mp = m; i < num_migrate; i++, mp++) {
                /* Entries for which no state is found are silently skipped. */
                if ((x = xfrm_migrate_state_find(mp, net, if_id))) {
                        x_cur[nx_cur] = x;
                        nx_cur++;
                        xc = xfrm_state_migrate(x, mp, encap);
                        if (xc) {
                                x_new[nx_new] = xc;
                                nx_new++;
                        } else {
                                err = -ENODATA;
                                goto restore_state;
                        }
                }
        }

        /* Stage 3 - update policy */
        err = xfrm_policy_migrate(pol, m, num_migrate, extack);
        if (err < 0)
                goto restore_state;

        /* Stage 4 - delete old state(s) */
        if (nx_cur) {
                xfrm_states_put(x_cur, nx_cur);
                xfrm_states_delete(x_cur, nx_cur);
        }

        /* Stage 5 - announce */
        km_migrate(sel, dir, type, m, num_migrate, k, encap);

        xfrm_pol_put(pol);

        return 0;
out:
        /* Reached only while no policy is held (pol is still NULL). */
        return err;

restore_state:
        /*
         * Roll back: put the policy and the existing states collected so
         * far, and delete the new states that were already created.
         */
        if (pol)
                xfrm_pol_put(pol);
        if (nx_cur)
                xfrm_states_put(x_cur, nx_cur);
        if (nx_new)
                xfrm_states_delete(x_new, nx_new);

        return err;
}
EXPORT_SYMBOL(xfrm_migrate);
#endif
















































































































































































































































































































































































































































































































































































































































































































    1 


























































































    9 







    3 


















    1 






























    2 
    2 



    2 










    1 















    1 





    1 

































































































    1 



    1 











    6 






















































































































    3 


    4 





















    4 
    2 






































    1 
    1 
















































































































































































    1 


    1 
    2 





    1 





    2 




    1 

















































    4 





























































































    1 










    2 







    4 















    4 

    4 
    1 










    3 

    1 




    4 


    4 























    4 






























































































    4 












    3 










   11 




































































































































































    5 























































    1 
















    1 































































































    4 








































































    2 
    2 




    3 



    2 
    2 






























    1 

























    1 






    1 



















































































    3 
    3 














    2 
    3 
















    2 



    2 






































































    3 


























    2 








    1 







    2 
    2 








    4 













































    2 


























    4 




    6 



























































































































    1 


















































    1 












































    5 



















    5 

    5 






























































































































    1 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the AF_INET socket handler.
 *
 * Version:        @(#)sock.h        1.0.4        05/13/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Corey Minyard <wf-rch!minyard@relay.EU.net>
 *                Florian La Roche <flla@stud.uni-sb.de>
 *
 * Fixes:
 *                Alan Cox        :        Volatiles in skbuff pointers. See
 *                                        skbuff comments. May be overdone,
 *                                        better to prove they can be removed
 *                                        than the reverse.
 *                Alan Cox        :        Added a zapped field for tcp to note
 *                                        a socket is reset and must stay shut up
 *                Alan Cox        :        New fields for options
 *        Pauline Middelink        :        identd support
 *                Alan Cox        :        Eliminate low level recv/recvfrom
 *                David S. Miller        :        New socket lookup architecture.
 *              Steve Whitehouse:       Default routines for sock_ops
 *              Arnaldo C. Melo :        removed net_pinfo, tp_pinfo and made
 *                                      protinfo be just a void pointer, as the
 *                                      protocol specific parts were moved to
 *                                      respective headers and ipv4/v6, etc now
 *                                      use private slabcaches for its socks
 *              Pedro Hortas        :        New flags field for socket options
 */
#ifndef _SOCK_H
#define _SOCK_H

#include <linux/hardirq.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/list_nulls.h>
#include <linux/timer.h>
#include <linux/cache.h>
#include <linux/bitops.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>        /* struct sk_buff */
#include <linux/mm.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/static_key.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/cgroup-defs.h>
#include <linux/rbtree.h>
#include <linux/rculist_nulls.h>
#include <linux/poll.h>
#include <linux/sockptr.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/llist.h>
#include <net/dst.h>
#include <net/checksum.h>
#include <net/tcp_states.h>
#include <linux/net_tstamp.h>
#include <net/l3mdev.h>
#include <uapi/linux/socket.h>

/*
 * This structure really needs to be cleaned up.
 * Most of it is for TCP, and not used by any of
 * the other protocols.
 */

/* This is the per-socket lock.  The spinlock provides a synchronization
 * between user contexts and software interrupt processing, whereas the
 * mini-semaphore synchronizes multiple users amongst themselves.
 */
typedef struct {
        /* Spinlock part of the per-socket lock. */
        spinlock_t                slock;
        /* NOTE(review): presumably non-zero while a user context owns the
         * socket (the "mini-semaphore" half of the lock) - confirm against
         * the lock_sock()/release_sock() implementation.
         */
        int                        owned;
        /* NOTE(review): presumably where contenders for ownership wait -
         * confirm against the lock_sock() implementation.
         */
        wait_queue_head_t        wq;
        /*
         * We express the mutex-alike socket_lock semantics
         * to the lock validator by explicitly managing
         * the slock as a lock variant (in addition to
         * the slock itself):
         */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map dep_map;
#endif
} socket_lock_t;

struct sock;
struct proto;
struct net;

/* 32-bit bitwise-annotated type; used for a destination-port/local-port pair. */
typedef __u32 __bitwise __portpair;
/* 64-bit bitwise-annotated type; used for a foreign/local IPv4 address pair. */
typedef __u64 __bitwise __addrpair;

/**
 *        struct sock_common - minimal network layer representation of sockets
 *        @skc_daddr: Foreign IPv4 addr
 *        @skc_rcv_saddr: Bound local IPv4 addr
 *        @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr
 *        @skc_hash: hash value used with various protocol lookup tables
 *        @skc_u16hashes: two u16 hash values used by UDP lookup tables
 *        @skc_dport: placeholder for inet_dport/tw_dport
 *        @skc_num: placeholder for inet_num/tw_num
 *        @skc_portpair: __u32 union of @skc_dport & @skc_num
 *        @skc_family: network address family
 *        @skc_state: Connection state
 *        @skc_reuse: %SO_REUSEADDR setting
 *        @skc_reuseport: %SO_REUSEPORT setting
 *        @skc_ipv6only: socket is IPV6 only
 *        @skc_net_refcnt: socket is using net ref counting
 *        @skc_bound_dev_if: bound device index if != 0
 *        @skc_bind_node: bind hash linkage for various protocol lookup tables
 *        @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
 *        @skc_prot: protocol handlers inside a network family
 *        @skc_net: reference to the network namespace of this socket
 *        @skc_v6_daddr: IPV6 destination address
 *        @skc_v6_rcv_saddr: IPV6 source address
 *        @skc_cookie: socket's cookie value
 *        @skc_node: main hash linkage for various protocol lookup tables
 *        @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
 *        @skc_tx_queue_mapping: tx queue number for this connection
 *        @skc_rx_queue_mapping: rx queue number for this connection
 *        @skc_flags: place holder for sk_flags
 *                %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
 *                %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
 *        @skc_listener: connection request listener socket (aka rsk_listener)
 *                [union with @skc_flags]
 *        @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row
 *                [union with @skc_flags]
 *        @skc_incoming_cpu: record/match cpu processing incoming packets
 *        @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled)
 *                [union with @skc_incoming_cpu]
 *        @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
 *                [union with @skc_incoming_cpu]
 *        @skc_refcnt: reference count
 *
 *        This is the minimal network layer representation of sockets, the header
 *        for struct sock and struct inet_timewait_sock.
 */
struct sock_common {
        /* IPv4 address pair: skc_addrpair overlays skc_daddr + skc_rcv_saddr */
        union {
                __addrpair        skc_addrpair;
                struct {
                        __be32        skc_daddr;
                        __be32        skc_rcv_saddr;
                };
        };
        /* One 32-bit hash, or the same storage viewed as two 16-bit hashes */
        union  {
                unsigned int        skc_hash;
                __u16                skc_u16hashes[2];
        };
        /* skc_dport && skc_num must be grouped as well */
        union {
                __portpair        skc_portpair;
                struct {
                        __be16        skc_dport;
                        __u16        skc_num;
                };
        };

        unsigned short                skc_family;
        volatile unsigned char        skc_state;
        /* Bitfields below use 4 + 1 + 1 + 1 bits */
        unsigned char                skc_reuse:4;
        unsigned char                skc_reuseport:1;
        unsigned char                skc_ipv6only:1;
        unsigned char                skc_net_refcnt:1;
        int                        skc_bound_dev_if;
        /* The two hash linkages share storage */
        union {
                struct hlist_node        skc_bind_node;
                struct hlist_node        skc_portaddr_node;
        };
        struct proto                *skc_prot;
        possible_net_t                skc_net;

        /* IPv6 addresses are only present when CONFIG_IPV6 is enabled */
#if IS_ENABLED(CONFIG_IPV6)
        struct in6_addr                skc_v6_daddr;
        struct in6_addr                skc_v6_rcv_saddr;
#endif

        atomic64_t                skc_cookie;

        /* following fields are padding to force
         * offset(struct sock, sk_refcnt) == 128 on 64bit arches
         * assuming IPV6 is enabled. We use this padding differently
         * for different kinds of 'sockets'
         */
        union {
                unsigned long        skc_flags;
                struct sock        *skc_listener; /* request_sock */
                struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */
        };
        /*
         * fields between dontcopy_begin/dontcopy_end
         * are not copied in sock_copy()
         */
        /* private: */
        int                        skc_dontcopy_begin[0];
        /* public: */
        /* Main hash linkage: plain hlist node or nulls node, same storage */
        union {
                struct hlist_node        skc_node;
                struct hlist_nulls_node skc_nulls_node;
        };
        unsigned short                skc_tx_queue_mapping;
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        unsigned short                skc_rx_queue_mapping;
#endif
        /* One 32-bit slot with three different uses */
        union {
                int                skc_incoming_cpu;
                u32                skc_rcv_wnd;
                u32                skc_tw_rcv_nxt; /* struct tcp_timewait_sock  */
        };

        refcount_t                skc_refcnt;
        /* private: */
        int                     skc_dontcopy_end[0];
        /* One 32-bit slot with three different uses */
        union {
                u32                skc_rxhash;
                u32                skc_window_clamp;
                u32                skc_tw_snd_nxt; /* struct tcp_timewait_sock */
        };
        /* public: */
};

struct bpf_local_storage;
struct sk_filter;

/**
  *        struct sock - network layer representation of sockets
  *        @__sk_common: shared layout with inet_timewait_sock
  *        @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
  *        @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
  *        @sk_lock:        synchronizer
  *        @sk_kern_sock: True if sock is using kernel lock classes
  *        @sk_rcvbuf: size of receive buffer in bytes
  *        @sk_wq: sock wait queue and async head
  *        @sk_rx_dst: receive input route used by early demux
  *        @sk_rx_dst_ifindex: ifindex for @sk_rx_dst
  *        @sk_rx_dst_cookie: cookie for @sk_rx_dst
  *        @sk_dst_cache: destination cache
  *        @sk_dst_pending_confirm: need to confirm neighbour
  *        @sk_policy: flow policy
  *        @sk_receive_queue: incoming packets
  *        @sk_wmem_alloc: transmit queue bytes committed
  *        @sk_tsq_flags: TCP Small Queues flags
  *        @sk_write_queue: Packet sending queue
  *        @sk_omem_alloc: "o" is "option" or "other"
  *        @sk_wmem_queued: persistent queue size
  *        @sk_forward_alloc: space allocated forward
  *        @sk_reserved_mem: space reserved and non-reclaimable for the socket
  *        @sk_napi_id: id of the last napi context to receive data for sk
  *        @sk_ll_usec: usecs to busypoll when there is no data
  *        @sk_allocation: allocation mode
  *        @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
  *        @sk_pacing_status: Pacing status (requested, handled by sch_fq)
  *        @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
  *        @sk_sndbuf: size of send buffer in bytes
  *        @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
  *        @sk_no_check_rx: allow zero checksum in RX packets
  *        @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
  *        @sk_gso_disabled: if set, NETIF_F_GSO_MASK is forbidden.
  *        @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
  *        @sk_gso_max_size: Maximum GSO segment size to build
  *        @sk_gso_max_segs: Maximum number of GSO segments
  *        @sk_pacing_shift: scaling factor for TCP Small Queues
  *        @sk_lingertime: %SO_LINGER l_linger setting
  *        @sk_backlog: always used with the per-socket spinlock held
  *        @sk_callback_lock: used with the callbacks in the end of this struct
  *        @sk_error_queue: rarely used
  *        @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
  *                          IPV6_ADDRFORM for instance)
  *        @sk_err: last error
  *        @sk_err_soft: errors that don't cause failure but are the cause of a
  *                      persistent failure not just 'timed out'
  *        @sk_drops: raw/udp drops counter
  *        @sk_ack_backlog: current listen backlog
  *        @sk_max_ack_backlog: listen backlog set in listen()
  *        @sk_uid: user id of owner
  *        @sk_prefer_busy_poll: prefer busypolling over softirq processing
  *        @sk_busy_poll_budget: napi processing budget when busypolling
  *        @sk_priority: %SO_PRIORITY setting
  *        @sk_type: socket type (%SOCK_STREAM, etc)
  *        @sk_protocol: which protocol this socket belongs in this network family
  *        @sk_peer_lock: lock protecting @sk_peer_pid and @sk_peer_cred
  *        @sk_peer_pid: &struct pid for this socket's peer
  *        @sk_peer_cred: %SO_PEERCRED setting
  *        @sk_rcvlowat: %SO_RCVLOWAT setting
  *        @sk_rcvtimeo: %SO_RCVTIMEO setting
  *        @sk_sndtimeo: %SO_SNDTIMEO setting
  *        @sk_txhash: computed flow hash for use on transmit
  *        @sk_txrehash: enable TX hash rethink
  *        @sk_filter: socket filtering instructions
  *        @sk_timer: sock cleanup timer
  *        @sk_stamp: time stamp of last packet received
  *        @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
  *        @sk_tsflags: SO_TIMESTAMPING flags
  *        @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
  *                           Sockets that can be used under memory reclaim should
  *                           set this to false.
  *        @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
  *                      for timestamping
  *        @sk_tskey: counter to disambiguate concurrent tstamp requests
  *        @sk_zckey: counter to order MSG_ZEROCOPY notifications
  *        @sk_socket: Identd and reporting IO signals
  *        @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock.
  *        @sk_frag: cached page frag
  *        @sk_peek_off: current peek_offset value
  *        @sk_send_head: front of stuff to transmit
  *        @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head]
  *        @sk_security: used by security modules
  *        @sk_mark: generic packet mark
  *        @sk_cgrp_data: cgroup data for this cgroup
  *        @sk_memcg: this socket's memory cgroup association
  *        @sk_write_pending: a write to stream socket waits to start
  *        @sk_disconnects: number of disconnect operations performed on this sock
  *        @sk_state_change: callback to indicate change in the state of the sock
  *        @sk_data_ready: callback to indicate there is data to be processed
  *        @sk_write_space: callback to indicate there is buffer sending space available
  *        @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
  *        @sk_backlog_rcv: callback to process the backlog
  *        @sk_validate_xmit_skb: ptr to an optional validate function
  *        @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
  *        @sk_reuseport_cb: reuseport group container
  *        @sk_bpf_storage: ptr to cache and control for bpf_sk_storage
  *        @sk_rcu: used during RCU grace period
  *        @sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
  *        @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
  *        @sk_txtime_report_errors: set report errors mode for SO_TXTIME
  *        @sk_txtime_unused: unused txtime flags
  *        @ns_tracker: tracker for netns reference
  */
struct sock {
        /*
         * Now struct inet_timewait_sock also uses sock_common, so please just
         * don't add nothing before this first member (__sk_common) --acme
         */
        struct sock_common        __sk_common;
/* Shorthand names: each sk_xxx below expands to a member of __sk_common. */
#define sk_node                        __sk_common.skc_node
#define sk_nulls_node                __sk_common.skc_nulls_node
#define sk_refcnt                __sk_common.skc_refcnt
#define sk_tx_queue_mapping        __sk_common.skc_tx_queue_mapping
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
#define sk_rx_queue_mapping        __sk_common.skc_rx_queue_mapping
#endif

#define sk_dontcopy_begin        __sk_common.skc_dontcopy_begin
#define sk_dontcopy_end                __sk_common.skc_dontcopy_end
#define sk_hash                        __sk_common.skc_hash
#define sk_portpair                __sk_common.skc_portpair
#define sk_num                        __sk_common.skc_num
#define sk_dport                __sk_common.skc_dport
#define sk_addrpair                __sk_common.skc_addrpair
#define sk_daddr                __sk_common.skc_daddr
#define sk_rcv_saddr                __sk_common.skc_rcv_saddr
#define sk_family                __sk_common.skc_family
#define sk_state                __sk_common.skc_state
#define sk_reuse                __sk_common.skc_reuse
#define sk_reuseport                __sk_common.skc_reuseport
#define sk_ipv6only                __sk_common.skc_ipv6only
#define sk_net_refcnt                __sk_common.skc_net_refcnt
#define sk_bound_dev_if                __sk_common.skc_bound_dev_if
#define sk_bind_node                __sk_common.skc_bind_node
#define sk_prot                        __sk_common.skc_prot
#define sk_net                        __sk_common.skc_net
#define sk_v6_daddr                __sk_common.skc_v6_daddr
#define sk_v6_rcv_saddr        __sk_common.skc_v6_rcv_saddr
#define sk_cookie                __sk_common.skc_cookie
#define sk_incoming_cpu                __sk_common.skc_incoming_cpu
#define sk_flags                __sk_common.skc_flags
#define sk_rxhash                __sk_common.skc_rxhash

        /*
         * The __cacheline_group_begin()/__cacheline_group_end() pairs below
         * bracket named groups of members.
         * NOTE(review): the group names (sock_write_rx, sock_read_rx, ...)
         * suggest members are clustered by access pattern - confirm against
         * the macro definitions.
         */
        __cacheline_group_begin(sock_write_rx);

        atomic_t                sk_drops;
        __s32                        sk_peek_off;
        struct sk_buff_head        sk_error_queue;
        struct sk_buff_head        sk_receive_queue;
        /*
         * The backlog queue is special, it is always used with
         * the per-socket spinlock held and requires low latency
         * access. Therefore we special case its implementation.
         * Note : rmem_alloc is in this structure to fill a hole
         * on 64bit arches, not because it's logically part of
         * backlog.
         */
        struct {
                atomic_t        rmem_alloc;
                int                len;
                struct sk_buff        *head;
                struct sk_buff        *tail;
        } sk_backlog;
/* sk_rmem_alloc is an alias for the rmem_alloc member stored inside sk_backlog */
#define sk_rmem_alloc sk_backlog.rmem_alloc

        __cacheline_group_end(sock_write_rx);

        __cacheline_group_begin(sock_read_rx);
        /* early demux fields */
        struct dst_entry __rcu        *sk_rx_dst;
        int                        sk_rx_dst_ifindex;
        u32                        sk_rx_dst_cookie;

#ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned int                sk_ll_usec;
        unsigned int                sk_napi_id;
        u16                        sk_busy_poll_budget;
        u8                        sk_prefer_busy_poll;
#endif
        u8                        sk_userlocks;
        int                        sk_rcvbuf;

        struct sk_filter __rcu        *sk_filter;
        /* sk_wq and sk_wq_raw are the same pointer, with and without __rcu */
        union {
                struct socket_wq __rcu        *sk_wq;
                /* private: */
                struct socket_wq        *sk_wq_raw;
                /* public: */
        };

        void                        (*sk_data_ready)(struct sock *sk);
        long                        sk_rcvtimeo;
        int                        sk_rcvlowat;
        __cacheline_group_end(sock_read_rx);

        __cacheline_group_begin(sock_read_rxtx);
        int                        sk_err;
        struct socket                *sk_socket;
        struct mem_cgroup        *sk_memcg;
#ifdef CONFIG_XFRM
        struct xfrm_policy __rcu *sk_policy[2];
#endif
        __cacheline_group_end(sock_read_rxtx);

        __cacheline_group_begin(sock_write_rxtx);
        socket_lock_t                sk_lock;
        u32                        sk_reserved_mem;
        int                        sk_forward_alloc;
        u32                        sk_tsflags;
        __cacheline_group_end(sock_write_rxtx);

        __cacheline_group_begin(sock_write_tx);
        int                        sk_write_pending;
        atomic_t                sk_omem_alloc;
        int                        sk_sndbuf;

        int                        sk_wmem_queued;
        refcount_t                sk_wmem_alloc;
        unsigned long                sk_tsq_flags;
        /* sk_send_head and tcp_rtx_queue share storage */
        union {
                struct sk_buff        *sk_send_head;
                struct rb_root        tcp_rtx_queue;
        };
        struct sk_buff_head        sk_write_queue;
        u32                        sk_dst_pending_confirm;
        u32                        sk_pacing_status; /* see enum sk_pacing */
        struct page_frag        sk_frag;
        struct timer_list        sk_timer;

        unsigned long                sk_pacing_rate; /* bytes per second */
        atomic_t                sk_zckey;
        atomic_t                sk_tskey;
        __cacheline_group_end(sock_write_tx);

        __cacheline_group_begin(sock_read_tx);
        unsigned long                sk_max_pacing_rate;
        long                        sk_sndtimeo;
        u32                        sk_priority;
        u32                        sk_mark;
        struct dst_entry __rcu        *sk_dst_cache;
        netdev_features_t        sk_route_caps;
#ifdef CONFIG_SOCK_VALIDATE_XMIT
        struct sk_buff*                (*sk_validate_xmit_skb)(struct sock *sk,
                                                        struct net_device *dev,
                                                        struct sk_buff *skb);
#endif
        u16                        sk_gso_type;
        u16                        sk_gso_max_segs;
        unsigned int                sk_gso_max_size;
        gfp_t                        sk_allocation;
        u32                        sk_txhash;
        u8                        sk_pacing_shift;
        bool                        sk_use_task_frag;
        __cacheline_group_end(sock_read_tx);

        /*
         * Because of non atomicity rules, all
         * changes are protected by socket lock.
         */
        u8                        sk_gso_disabled : 1,
                                sk_kern_sock : 1,
                                sk_no_check_tx : 1,
                                sk_no_check_rx : 1;
        u8                        sk_shutdown;
        u16                        sk_type;
        u16                        sk_protocol;
        unsigned long                sk_lingertime;
        struct proto                *sk_prot_creator;
        rwlock_t                sk_callback_lock;
        int                        sk_err_soft;
        u32                        sk_ack_backlog;
        u32                        sk_max_ack_backlog;
        kuid_t                        sk_uid;
        spinlock_t                sk_peer_lock;
        int                        sk_bind_phc;
        struct pid                *sk_peer_pid;
        const struct cred        *sk_peer_cred;

        ktime_t                        sk_stamp;
        /* Only 32-bit builds carry a seqlock for sk_stamp */
#if BITS_PER_LONG==32
        seqlock_t                sk_stamp_seq;
#endif
        int                        sk_disconnects;

        u8                        sk_txrehash;
        u8                        sk_clockid;
        u8                        sk_txtime_deadline_mode : 1,
                                sk_txtime_report_errors : 1,
                                sk_txtime_unused : 6;

        /* Low bits of sk_user_data carry the SK_USER_DATA_* flags */
        void                        *sk_user_data;
#ifdef CONFIG_SECURITY
        void                        *sk_security;
#endif
        struct sock_cgroup_data        sk_cgrp_data;
        void                        (*sk_state_change)(struct sock *sk);
        void                        (*sk_write_space)(struct sock *sk);
        void                        (*sk_error_report)(struct sock *sk);
        int                        (*sk_backlog_rcv)(struct sock *sk,
                                                  struct sk_buff *skb);
        void                    (*sk_destruct)(struct sock *sk);
        struct sock_reuseport __rcu        *sk_reuseport_cb;
#ifdef CONFIG_BPF_SYSCALL
        struct bpf_local_storage __rcu        *sk_bpf_storage;
#endif
        struct rcu_head                sk_rcu;
        netns_tracker                ns_tracker;
};

/*
 * Pairs a socket pointer with a local lock.
 * NOTE(review): the users of this structure are not visible in this part of
 * the header - confirm the intended usage at the definition sites.
 */
struct sock_bh_locked {
        struct sock *sock;
        local_lock_t bh_lock;
};

/*
 * Values stored in sock::sk_pacing_status.
 * NOTE(review): per the sk_pacing_status kernel-doc ("requested, handled by
 * sch_fq"), NEEDED presumably means pacing was requested and FQ that the
 * sch_fq scheduler handles it - confirm against the users.
 */
enum sk_pacing {
        SK_PACING_NONE                = 0,
        SK_PACING_NEEDED        = 1,
        SK_PACING_FQ                = 2,
};

/* flag bits in sk_user_data
 *
 * - SK_USER_DATA_NOCOPY:      Pointer stored in sk_user_data might
 *   not be suitable for copying when cloning the socket. For instance,
 *   it can point to a reference counted object. sk_user_data bottom
 *   bit is set if pointer must not be copied.
 *
 * - SK_USER_DATA_BPF:         Mark whether sk_user_data field is
 *   managed/owned by a BPF reuseport array. This bit should be set
 *   when sk_user_data's sk is added to the bpf's reuseport_array.
 *
 * - SK_USER_DATA_PSOCK:       Mark whether pointer stored in
 *   sk_user_data points to psock type. This bit should be set
 *   when sk_user_data is assigned to a psock object.
 */
/* Flag bits kept in the three lowest bits of sk_user_data (bits 0, 1, 2). */
#define SK_USER_DATA_NOCOPY        1UL
#define SK_USER_DATA_BPF        2UL
#define SK_USER_DATA_PSOCK        4UL
/* Complement of all flag bits: AND with it to recover the pointer value. */
#define SK_USER_DATA_PTRMASK        ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\
                                  SK_USER_DATA_PSOCK)

/**
 * sk_user_data_is_nocopy - check the SK_USER_DATA_NOCOPY flag of sk_user_data
 * @sk: socket
 *
 * Return: true when the SK_USER_DATA_NOCOPY bit is set in the raw
 * sk_user_data value, false otherwise.
 */
static inline bool sk_user_data_is_nocopy(const struct sock *sk)
{
        uintptr_t raw = (uintptr_t)sk->sk_user_data;

        return (raw & SK_USER_DATA_NOCOPY) != 0;
}

/* Lvalue view of (sk)->sk_user_data typed as an __rcu-annotated void pointer. */
#define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))

/**
 * __locked_read_sk_user_data_with_flags - read sk_user_data, filtered by flags
 *
 * @sk: socket
 * @flags: flag bits that must all be set in sk_user_data
 *
 * The caller must be holding sk->sk_callback_lock.
 *
 * Warns once if @flags contains bits that belong to the pointer part.
 *
 * Return: sk_user_data with the flag bits masked off when every bit of
 * @flags is set in it, NULL otherwise.
 */
static inline void *
__locked_read_sk_user_data_with_flags(const struct sock *sk,
                                      uintptr_t flags)
{
        uintptr_t raw =
                (uintptr_t)rcu_dereference_check(__sk_user_data(sk),
                                                 lockdep_is_held(&sk->sk_callback_lock));

        WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);

        /* Any requested flag missing: report no pointer. */
        if ((raw & flags) != flags)
                return NULL;

        return (void *)(raw & SK_USER_DATA_PTRMASK);
}

/**
 * __rcu_dereference_sk_user_data_with_flags - RCU read of sk_user_data,
 * filtered by flags
 *
 * @sk: socket
 * @flags: flag bits that must all be set in sk_user_data
 *
 * Warns once if @flags contains bits that belong to the pointer part.
 *
 * Return: sk_user_data with the flag bits masked off when every bit of
 * @flags is set in it, NULL otherwise.
 */
static inline void *
__rcu_dereference_sk_user_data_with_flags(const struct sock *sk,
                                          uintptr_t flags)
{
        uintptr_t raw = (uintptr_t)rcu_dereference(__sk_user_data(sk));

        WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);

        /* Any requested flag missing: report no pointer. */
        if ((raw & flags) != flags)
                return NULL;

        return (void *)(raw & SK_USER_DATA_PTRMASK);
}

/*
 * Read sk_user_data under RCU with no required flags: the flag test always
 * passes, so the pointer is returned with its flag bits masked off.
 */
#define rcu_dereference_sk_user_data(sk)                                \
        __rcu_dereference_sk_user_data_with_flags(sk, 0)
/*
 * Publish (ptr | flags) into sk_user_data with rcu_assign_pointer().
 * Warns once if ptr has any of the flag bits set, and once if flags has
 * any bit outside the flag bits. Both arguments are evaluated exactly once.
 */
#define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags)                \
({                                                                        \
        uintptr_t __tmp1 = (uintptr_t)(ptr),                                \
                  __tmp2 = (uintptr_t)(flags);                                \
        WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK);                        \
        WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK);                        \
        rcu_assign_pointer(__sk_user_data((sk)),                        \
                           __tmp1 | __tmp2);                                \
})
/* Publish ptr into sk_user_data with no flag bits set. */
#define rcu_assign_sk_user_data(sk, ptr)                                \
        __rcu_assign_sk_user_data_with_flags(sk, ptr, 0)

/**
 * sock_net - get the network namespace of a socket
 * @sk: socket
 *
 * Return: the value read from sk->sk_net through read_pnet().
 */
static inline struct net *sock_net(const struct sock *sk)
{
        struct net *net = read_pnet(&sk->sk_net);

        return net;
}

/**
 * sock_net_set - set the network namespace of a socket
 * @sk: socket to update
 * @net: namespace to store
 *
 * Stores @net into sk->sk_net through write_pnet().
 */
static inline
void sock_net_set(struct sock *sk, struct net *net)
{
        write_pnet(&sk->sk_net, net);
}

/*
 * Values for sk_reuse.
 *
 * SK_CAN_REUSE and SK_NO_REUSE on a socket indicate whether or not the
 * socket is OK with its port being reused by someone else. SK_FORCE_REUSE
 * on a socket means that the socket will reuse everybody else's port
 * without looking at the other's sk_reuse value.
 */

#define SK_NO_REUSE        0
#define SK_CAN_REUSE        1
#define SK_FORCE_REUSE        2

int sk_set_peek_off(struct sock *sk, int val);

/**
 * sk_peek_offset - peek offset to apply for a receive call
 * @sk: socket
 * @flags: message flags of the receive call
 *
 * Return: the current sk_peek_off (read once) when MSG_PEEK is set in
 * @flags, 0 otherwise.
 */
static inline int sk_peek_offset(const struct sock *sk, int flags)
{
        return unlikely(flags & MSG_PEEK) ? READ_ONCE(sk->sk_peek_off) : 0;
}

/**
 * sk_peek_offset_bwd - move the peek offset backwards
 * @sk: socket
 * @val: amount to subtract from the peek offset
 *
 * Does nothing while sk_peek_off is negative. Otherwise the offset is
 * reduced by @val and clamped so that it never drops below 0.
 */
static inline void sk_peek_offset_bwd(struct sock *sk, int val)
{
        s32 cur = READ_ONCE(sk->sk_peek_off);

        if (unlikely(cur >= 0))
                WRITE_ONCE(sk->sk_peek_off, max_t(s32, cur - val, 0));
}

/**
 * sk_peek_offset_fwd - move the peek offset forwards
 * @sk: socket
 * @val: amount to add to the peek offset
 *
 * Implemented as sk_peek_offset_bwd() with the negated amount, so the same
 * rules apply: a negative sk_peek_off is left untouched and the result is
 * clamped at 0.
 */
static inline void sk_peek_offset_fwd(struct sock *sk, int val)
{
        sk_peek_offset_bwd(sk, -val);
}

/*
 * Hashed lists helper routines
 */
/* Map an sk_node list node back to the struct sock that embeds it. */
static inline struct sock *sk_entry(const struct hlist_node *node)
{
        struct sock *sk = hlist_entry(node, struct sock, sk_node);

        return sk;
}

/*
 * Socket embedding the first node of @head. No emptiness check is made
 * here; sk_head() is the checked variant.
 */
static inline struct sock *__sk_head(const struct hlist_head *head)
{
        struct sock *sk = hlist_entry(head->first, struct sock, sk_node);

        return sk;
}

/* First socket on @head, or NULL when hlist_empty() reports the list empty. */
static inline struct sock *sk_head(const struct hlist_head *head)
{
        if (hlist_empty(head))
                return NULL;

        return __sk_head(head);
}

/*
 * Socket embedding the first node of the nulls list @head. No emptiness
 * check is made here; sk_nulls_head() is the checked variant.
 */
static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head)
{
        struct sock *sk = hlist_nulls_entry(head->first, struct sock,
                                            sk_nulls_node);

        return sk;
}

/*
 * First socket on the nulls list @head, or NULL when hlist_nulls_empty()
 * reports the list empty.
 */
static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head)
{
        if (hlist_nulls_empty(head))
                return NULL;

        return __sk_nulls_head(head);
}

/*
 * Socket following @sk on its sk_node list, as computed by
 * hlist_entry_safe() from sk->sk_node.next.
 */
static inline struct sock *sk_next(const struct sock *sk)
{
        struct sock *next = hlist_entry_safe(sk->sk_node.next, struct sock,
                                             sk_node);

        return next;
}

/*
 * Socket following @sk on its nulls list, or NULL when the next pointer
 * is a nulls marker.
 */
static inline struct sock *sk_nulls_next(const struct sock *sk)
{
        if (is_a_nulls(sk->sk_nulls_node.next))
                return NULL;

        return hlist_nulls_entry(sk->sk_nulls_node.next,
                                 struct sock, sk_nulls_node);
}

/* True when hlist_unhashed() reports that @sk's sk_node is not on a list. */
static inline bool sk_unhashed(const struct sock *sk)
{
        bool unhashed = hlist_unhashed(&sk->sk_node);

        return unhashed;
}

/* Logical negation of sk_unhashed(). */
static inline bool sk_hashed(const struct sock *sk)
{
        if (sk_unhashed(sk))
                return false;

        return true;
}

/*
 * Reset @node by clearing its pprev pointer.
 * NOTE(review): presumably this is the state hlist_unhashed() tests for -
 * confirm in <linux/list.h>.
 */
static inline void sk_node_init(struct hlist_node *node)
{
        node->pprev = NULL;
}

/*
 * Unlink @sk's sk_node with __hlist_del(). The node is not re-initialised
 * and no reference is dropped here.
 */
static inline void __sk_del_node(struct sock *sk)
{
        __hlist_del(&sk->sk_node);
}

/*
 * Unlink @sk from its sk_node list and re-initialise the node, but only if
 * the socket is currently hashed. No reference is dropped here.
 *
 * NB: equivalent to hlist_del_init_rcu
 *
 * Returns true when the socket was hashed and has been removed, false when
 * there was nothing to do.
 */
static inline bool __sk_del_node_init(struct sock *sk)
{
        if (!sk_hashed(sk))
                return false;

        __sk_del_node(sk);
        sk_node_init(&sk->sk_node);
        return true;
}

/* Grab a socket reference: increments sk_refcnt with refcount_inc().
   This operation is valid only when sk is ALREADY grabbed, e.g. it was
   found in a hash table or a list and the lookup was made under a lock
   preventing hash table modifications.
 */

static __always_inline void sock_hold(struct sock *sk)
{
        refcount_inc(&sk->sk_refcnt);
}

/* Ungrab a socket reference: decrements sk_refcnt with refcount_dec().
   Only for contexts in which the socket refcnt cannot hit zero, e.g. this
   is true in the context of any socketcall.
 */
static __always_inline void __sock_put(struct sock *sk)
{
        refcount_dec(&sk->sk_refcnt);
}

/*
 * Remove @sk from its sk_node list (if hashed) and drop one reference.
 *
 * Returns true when the socket was hashed and has been removed, false when
 * it was not on a list (in which case the refcount is left alone).
 */
static inline bool sk_del_node_init(struct sock *sk)
{
        if (!__sk_del_node_init(sk))
                return false;

        /* Sanity check: complain if only one reference is left before the drop. */
        WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
        __sock_put(sk);
        return true;
}
/* The _rcu spelling is a plain alias for sk_del_node_init(). */
#define sk_del_node_init_rcu(sk)        sk_del_node_init(sk)

/*
 * Remove @sk from its nulls list with hlist_nulls_del_init_rcu(), but only
 * if the socket is currently hashed. No reference is dropped here.
 *
 * Returns true when the socket was hashed and has been removed, false
 * otherwise.
 */
static inline bool __sk_nulls_del_node_init_rcu(struct sock *sk)
{
        if (!sk_hashed(sk))
                return false;

        hlist_nulls_del_init_rcu(&sk->sk_nulls_node);
        return true;
}

/*
 * Remove @sk from its nulls list (if hashed) and drop one reference.
 *
 * Returns true when the socket was hashed and has been removed, false when
 * it was not on a list (in which case the refcount is left alone).
 */
static inline bool sk_nulls_del_node_init_rcu(struct sock *sk)
{
        if (!__sk_nulls_del_node_init_rcu(sk))
                return false;

        /* Sanity check: complain if only one reference is left before the drop. */
        WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
        __sock_put(sk);
        return true;
}

/* Insert @sk's sk_node into @list with hlist_add_head(); takes no reference. */
static inline void __sk_add_node(struct sock *sk, struct hlist_head *list)
{
        hlist_add_head(&sk->sk_node, list);
}

/* Take a reference on @sk, then insert it into @list via __sk_add_node(). */
static inline void sk_add_node(struct sock *sk, struct hlist_head *list)
{
        sock_hold(sk);
        __sk_add_node(sk, list);
}

/*
 * Take a reference on @sk and insert it into @list with the RCU list
 * primitives. AF_INET6 sockets with sk_reuseport set are added with
 * hlist_add_tail_rcu() when CONFIG_IPV6 is enabled; every other socket is
 * added with hlist_add_head_rcu().
 */
static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
{
        sock_hold(sk);

        if (!IS_ENABLED(CONFIG_IPV6) || !sk->sk_reuseport ||
            sk->sk_family != AF_INET6) {
                hlist_add_head_rcu(&sk->sk_node, list);
                return;
        }

        hlist_add_tail_rcu(&sk->sk_node, list);
}

/* Take a reference on @sk, then insert it into @list with hlist_add_tail_rcu(). */
static inline void sk_add_node_tail_rcu(struct sock *sk, struct hlist_head *list)
{
        sock_hold(sk);
        hlist_add_tail_rcu(&sk->sk_node, list);
}

/*
 * Insert @sk's sk_nulls_node into @list with hlist_nulls_add_head_rcu();
 * takes no reference.
 */
static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
        hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list);
}

/*
 * Insert @sk's sk_nulls_node into @list with hlist_nulls_add_tail_rcu();
 * takes no reference.
 */
static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
        hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
}

static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
        sock_hold(sk);
        __sk_nulls_add_node_rcu(sk, list);
}

/* Unlink sk->sk_bind_node with __hlist_del(). */
static inline void __sk_del_bind_node(struct sock *sk)
{
        __hlist_del(&sk->sk_bind_node);
}

/* Link sk->sk_bind_node at the head of @list; no reference is taken here. */
static inline void sk_add_bind_node(struct sock *sk,
                                        struct hlist_head *list)
{
        hlist_add_head(&sk->sk_bind_node, list);
}

/*
 * Socket list iterators.  Each macro wraps the matching hlist or
 * hlist_nulls iterator and fixes the struct sock member used as the link:
 * sk_node, sk_nulls_node or sk_bind_node.
 */
#define sk_for_each(__sk, list) \
        hlist_for_each_entry(__sk, list, sk_node)
#define sk_for_each_rcu(__sk, list) \
        hlist_for_each_entry_rcu(__sk, list, sk_node)
#define sk_nulls_for_each(__sk, node, list) \
        hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node)
#define sk_nulls_for_each_rcu(__sk, node, list) \
        hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node)
/* Continue iterating from __sk (inclusive of whatever the wrapped iterator does). */
#define sk_for_each_from(__sk) \
        hlist_for_each_entry_from(__sk, sk_node)
/*
 * The leading "if" skips the loop entirely when __sk is NULL and otherwise
 * seeds @node from __sk before iterating.  Because the expansion starts with
 * a bare "if", take care when using it directly before an "else".
 */
#define sk_nulls_for_each_from(__sk, node) \
        if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \
                hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node)
#define sk_for_each_safe(__sk, tmp, list) \
        hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
#define sk_for_each_bound(__sk, list) \
        hlist_for_each_entry(__sk, list, sk_bind_node)

/**
 * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_node to use as a loop cursor.
 * @head:        the head for your list.
 * @offset:        offset of hlist_node within the struct.
 *
 * On every iteration @tpos is set to the address @offset bytes before @pos.
 * The loop ends when @pos becomes NULL.
 *
 * Macro parameters are parenthesized wherever they appear inside an
 * expression, so a compound argument (for example an @offset written as
 * "a + b") keeps its intended grouping after expansion.
 */
#define sk_for_each_entry_offset_rcu(tpos, pos, head, offset)                       \
        for ((pos) = rcu_dereference(hlist_first_rcu(head));                       \
             (pos) != NULL &&                                                       \
                ({ (tpos) = (typeof(*(tpos)) *)((void *)(pos) - (offset)); 1;}); \
             (pos) = rcu_dereference(hlist_next_rcu(pos)))

static inline struct user_namespace *sk_user_ns(const struct sock *sk)
{
        /* Careful only use this in a context where these parameters
         * can not change and must all be valid, such as recvmsg from
         * userspace.
         */
        return sk->sk_socket->file->f_cred->user_ns;
}

/*
 * Sock flags.
 *
 * Each enumerator is a bit number within sk->sk_flags: sock_set_flag(),
 * sock_reset_flag() and sock_flag() pass it straight to the bit helpers.
 */
enum sock_flags {
        SOCK_DEAD,
        SOCK_DONE,
        SOCK_URGINLINE,
        SOCK_KEEPOPEN,
        SOCK_LINGER,
        SOCK_DESTROY,
        SOCK_BROADCAST,
        SOCK_TIMESTAMP,
        SOCK_ZAPPED,
        SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */
        SOCK_DBG, /* %SO_DEBUG setting */
        SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
        SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
        SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
        SOCK_MEMALLOC, /* VM depends on this socket for swapping */
        SOCK_TIMESTAMPING_RX_SOFTWARE,  /* %SOF_TIMESTAMPING_RX_SOFTWARE */
        SOCK_FASYNC, /* fasync() active */
        SOCK_RXQ_OVFL,
        SOCK_ZEROCOPY, /* buffers from userspace */
        SOCK_WIFI_STATUS, /* push wifi status to userspace */
        SOCK_NOFCS, /* Tell NIC not to do the Ethernet FCS.
                     * Will use last 4 bytes of packet sent from
                     * user-space instead.
                     */
        SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */
        SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
        SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
        SOCK_TXTIME,
        SOCK_XDP, /* XDP is attached */
        SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
        SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */
};

/* Bitmask of the two timestamp-related flag bits in sk_flags. */
#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))

/* Overwrite @nsk's whole sk_flags word with @osk's. */
static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk)
{
        nsk->sk_flags = osk->sk_flags;
}

/* Set bit @flag in sk->sk_flags using the non-atomic __set_bit(). */
static inline void sock_set_flag(struct sock *sk, enum sock_flags flag)
{
        __set_bit(flag, &sk->sk_flags);
}

/* Clear bit @flag in sk->sk_flags using the non-atomic __clear_bit(). */
static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag)
{
        __clear_bit(flag, &sk->sk_flags);
}

/* Set flag @bit on @sk when @valbool is non-zero, clear it otherwise. */
static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
                                     int valbool)
{
        if (!valbool) {
                sock_reset_flag(sk, bit);
                return;
        }

        sock_set_flag(sk, bit);
}

/* Return whether bit @flag is set in sk->sk_flags. */
static inline bool sock_flag(const struct sock *sk, enum sock_flags flag)
{
        return test_bit(flag, &sk->sk_flags);
}

/*
 * With CONFIG_NET, sk_memalloc_socks() tests the memalloc_socks_key static
 * branch and __receive_sock() is provided out of line.  Without CONFIG_NET
 * both collapse to stubs: the former always returns 0, the latter is empty.
 */
#ifdef CONFIG_NET
DECLARE_STATIC_KEY_FALSE(memalloc_socks_key);
static inline int sk_memalloc_socks(void)
{
        return static_branch_unlikely(&memalloc_socks_key);
}

void __receive_sock(struct file *file);
#else

static inline int sk_memalloc_socks(void)
{
        return 0;
}

static inline void __receive_sock(struct file *file)
{ }
#endif

/*
 * Return @gfp_mask with __GFP_MEMALLOC added when, and only when, that bit
 * is set in sk->sk_allocation.
 */
static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask)
{
        gfp_t memalloc_bit = sk->sk_allocation & __GFP_MEMALLOC;

        return gfp_mask | memalloc_bit;
}

/* Decrement sk->sk_ack_backlog by one; the store uses WRITE_ONCE(). */
static inline void sk_acceptq_removed(struct sock *sk)
{
        WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1);
}

/* Increment sk->sk_ack_backlog by one; the store uses WRITE_ONCE(). */
static inline void sk_acceptq_added(struct sock *sk)
{
        WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1);
}

/* Return true once sk_ack_backlog strictly exceeds sk_max_ack_backlog.
 *
 * Note: If you think the test should be:
 *        return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog);
 * then please take a look at commit 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes.")
 */
static inline bool sk_acceptq_is_full(const struct sock *sk)
{
        return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog);
}

/*
 * Compute minimal free write space needed to queue new packets:
 * half of the currently queued write memory (sk_wmem_queued >> 1).
 */
static inline int sk_stream_min_wspace(const struct sock *sk)
{
        return READ_ONCE(sk->sk_wmem_queued) >> 1;
}

/*
 * Return the remaining send-buffer room: sk_sndbuf minus sk_wmem_queued.
 * Both fields are sampled with READ_ONCE(); the result may be negative.
 */
static inline int sk_stream_wspace(const struct sock *sk)
{
        int sndbuf = READ_ONCE(sk->sk_sndbuf);
        int queued = READ_ONCE(sk->sk_wmem_queued);

        return sndbuf - queued;
}

/* Add @val (may be negative) to sk->sk_wmem_queued; the store uses WRITE_ONCE(). */
static inline void sk_wmem_queued_add(struct sock *sk, int val)
{
        WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val);
}

/* Add @val (may be negative) to sk->sk_forward_alloc. */
static inline void sk_forward_alloc_add(struct sock *sk, int val)
{
        /* Paired with lockless reads of sk->sk_forward_alloc */
        WRITE_ONCE(sk->sk_forward_alloc, sk->sk_forward_alloc + val);
}

/* Out-of-line helper; its definition is not part of this header section. */
void sk_stream_write_space(struct sock *sk);

/* OOB backlog add.
 *
 * Append @skb to the tail of sk->sk_backlog, a singly linked list chained
 * through skb->next.  sk_backlog.len is not touched here; sk_add_backlog()
 * accounts for it.
 */
static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
        /* Force a refcounted dst on the skb: we are going to leave the RCU lock */
        skb_dst_force(skb);

        /* Empty queue: @skb becomes the head; otherwise chain after the old tail */
        if (!sk->sk_backlog.tail)
                WRITE_ONCE(sk->sk_backlog.head, skb);
        else
                sk->sk_backlog.tail->next = skb;

        WRITE_ONCE(sk->sk_backlog.tail, skb);
        skb->next = NULL;
}

/*
 * Take into account size of receive queue and backlog queue
 * Do not take into account this skb truesize,
 * to allow even a single big packet to come.
 *
 * Returns true when sk_backlog.len + sk_rmem_alloc is strictly above @limit.
 */
static inline bool sk_rcvqueues_full(const struct sock *sk, unsigned int limit)
{
        unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);

        return qsize > limit;
}

/* The per-socket spinlock must be held here. */
static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb,
                                              unsigned int limit)
{
        if (sk_rcvqueues_full(sk, limit))
                return -ENOBUFS;

        /*
         * If the skb was allocated from pfmemalloc reserves, only
         * allow SOCK_MEMALLOC sockets to use it as this socket is
         * helping free memory
         */
        if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
                return -ENOMEM;

        __sk_add_backlog(sk, skb);
        sk->sk_backlog.len += skb->truesize;
        return 0;
}

/* Out-of-line receive path; sk_backlog_rcv() uses it for pfmemalloc skbs. */
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);

/* Candidate targets named in sk_backlog_rcv()'s INDIRECT_CALL_INET(). */
INDIRECT_CALLABLE_DECLARE(int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb));
INDIRECT_CALLABLE_DECLARE(int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb));

/*
 * Hand @skb to the socket's backlog receive handler.
 *
 * The common case calls sk->sk_backlog_rcv through INDIRECT_CALL_INET().
 * Only when memalloc sockets exist and @skb came from pfmemalloc reserves
 * is the out-of-line __sk_backlog_rcv() used instead.
 */
static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
        if (!sk_memalloc_socks() || !skb_pfmemalloc(skb))
                return INDIRECT_CALL_INET(sk->sk_backlog_rcv,
                                          tcp_v6_do_rcv,
                                          tcp_v4_do_rcv,
                                          sk, skb);

        return __sk_backlog_rcv(sk, skb);
}

/*
 * Record the current CPU in sk->sk_incoming_cpu.  The field is written only
 * when the value actually changes.
 */
static inline void sk_incoming_cpu_update(struct sock *sk)
{
        int this_cpu = raw_smp_processor_id();

        if (likely(READ_ONCE(sk->sk_incoming_cpu) == this_cpu))
                return;

        WRITE_ONCE(sk->sk_incoming_cpu, this_cpu);
}


/*
 * Copy skb->hash into sk->sk_rxhash when the two differ.
 * Compiles to nothing without CONFIG_RPS.
 */
static inline void sock_rps_save_rxhash(struct sock *sk,
                                        const struct sk_buff *skb)
{
#ifdef CONFIG_RPS
        /* The following WRITE_ONCE() is paired with the READ_ONCE()
         * here, and another one in sock_rps_record_flow().
         */
        if (unlikely(READ_ONCE(sk->sk_rxhash) != skb->hash))
                WRITE_ONCE(sk->sk_rxhash, skb->hash);
#endif
}

/* Reset sk->sk_rxhash to 0.  Compiles to nothing without CONFIG_RPS. */
static inline void sock_rps_reset_rxhash(struct sock *sk)
{
#ifdef CONFIG_RPS
        /* Paired with READ_ONCE() in sock_rps_record_flow() */
        WRITE_ONCE(sk->sk_rxhash, 0);
#endif
}

/*
 * sk_wait_event - release the socket lock, wait for a condition, relock
 * @__sk:         socket; release_sock() is called on it first and
 *                lock_sock() before the macro finishes
 * @__timeo:      pointer to the remaining timeout, updated from wait_woken()
 * @__condition:  expression tested once after unlocking and, unless the
 *                socket was disconnected meanwhile, once more after relocking
 * @__wait:       wait entry handed to wait_woken()
 *
 * Sleeps in wait_woken() (TASK_INTERRUPTIBLE) only if @__condition is false
 * right after the lock is released.
 *
 * Evaluates to -EPIPE when sk_disconnects changed across the unlocked
 * window, otherwise to the value of @__condition re-evaluated under the lock.
 *
 * Every macro parameter is parenthesized at its point of use so that
 * arguments such as "&obj->sk" expand with the intended grouping.
 */
#define sk_wait_event(__sk, __timeo, __condition, __wait)                \
        ({        int __rc, __dis = (__sk)->sk_disconnects;                \
                release_sock(__sk);                                        \
                __rc = (__condition);                                        \
                if (!__rc) {                                                \
                        *(__timeo) = wait_woken((__wait),                \
                                                TASK_INTERRUPTIBLE,        \
                                                *(__timeo));                \
                }                                                        \
                sched_annotate_sleep();                                        \
                lock_sock(__sk);                                        \
                __rc = __dis == (__sk)->sk_disconnects ?                \
                        (__condition) : -EPIPE;                                \
                __rc;                                                        \
        })

/* Stream and memalloc helpers implemented out of line. */
int sk_stream_wait_connect(struct sock *sk, long *timeo_p);
int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
void sk_stream_wait_close(struct sock *sk, long timeo_p);
int sk_stream_error(struct sock *sk, int flags, int err);
void sk_stream_kill_queues(struct sock *sk);
void sk_set_memalloc(struct sock *sk);
void sk_clear_memalloc(struct sock *sk);

/* Slow path behind sk_flush_backlog(). */
void __sk_flush_backlog(struct sock *sk);

/*
 * Flush @sk's backlog if it currently has a tail.
 *
 * Returns true when __sk_flush_backlog() was called, false when the backlog
 * tail pointer was NULL and nothing was done.
 */
static inline bool sk_flush_backlog(struct sock *sk)
{
        if (likely(!READ_ONCE(sk->sk_backlog.tail)))
                return false;

        __sk_flush_backlog(sk);
        return true;
}

/* Out-of-line helper; its definition is not part of this header section. */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb);

/* Forward declarations: struct proto below only stores or passes pointers to these. */
struct request_sock_ops;
struct timewait_sock_ops;
struct inet_hashinfo;
struct raw_hashinfo;
struct smc_hashinfo;
struct module;
struct sk_psock;

/*
 * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes
 * un-modified. Special care is taken when initializing object to zero.
 *
 * Zeroes the first @size bytes of @sk except sk_node.next: the bytes
 * before it (if any) and everything from sk_node.pprev onward.
 */
static inline void sk_prot_clear_nulls(struct sock *sk, int size)
{
        /* Clear whatever precedes sk_node.next, when it is not the first member */
        if (offsetof(struct sock, sk_node.next) != 0)
                memset(sk, 0, offsetof(struct sock, sk_node.next));
        /* Resume at sk_node.pprev, leaving sk_node.next untouched */
        memset(&sk->sk_node.pprev, 0,
               size - offsetof(struct sock, sk_node.pprev));
}

/*
 * Argument bundle taken by the ->accept() hook of struct proto and by
 * sock_no_accept().  NOTE(review): individual field semantics are defined by
 * the callers and implementations, which are not visible here.
 */
struct proto_accept_arg {
        int flags;
        int err;
        int is_empty;
        bool kern;
};

/* Networking protocol blocks we attach to sockets.
 * socket layer -> transport layer interface
 *
 * Mostly a table of per-protocol callbacks, followed by memory accounting
 * state, slab cache parameters and hash table pointers.  Several members
 * exist only under specific config options (see the #ifdef blocks), and the
 * layout is randomized (__randomize_layout), so do not rely on member order.
 */
struct proto {
        void                        (*close)(struct sock *sk,
                                        long timeout);
        int                        (*pre_connect)(struct sock *sk,
                                        struct sockaddr *uaddr,
                                        int addr_len);
        int                        (*connect)(struct sock *sk,
                                        struct sockaddr *uaddr,
                                        int addr_len);
        int                        (*disconnect)(struct sock *sk, int flags);

        struct sock *                (*accept)(struct sock *sk,
                                          struct proto_accept_arg *arg);

        int                        (*ioctl)(struct sock *sk, int cmd,
                                         int *karg);
        int                        (*init)(struct sock *sk);
        void                        (*destroy)(struct sock *sk);
        void                        (*shutdown)(struct sock *sk, int how);
        int                        (*setsockopt)(struct sock *sk, int level,
                                        int optname, sockptr_t optval,
                                        unsigned int optlen);
        int                        (*getsockopt)(struct sock *sk, int level,
                                        int optname, char __user *optval,
                                        int __user *option);
        void                        (*keepalive)(struct sock *sk, int valbool);
#ifdef CONFIG_COMPAT
        int                        (*compat_ioctl)(struct sock *sk,
                                        unsigned int cmd, unsigned long arg);
#endif
        int                        (*sendmsg)(struct sock *sk, struct msghdr *msg,
                                           size_t len);
        int                        (*recvmsg)(struct sock *sk, struct msghdr *msg,
                                           size_t len, int flags, int *addr_len);
        void                        (*splice_eof)(struct socket *sock);
        int                        (*bind)(struct sock *sk,
                                        struct sockaddr *addr, int addr_len);
        int                        (*bind_add)(struct sock *sk,
                                        struct sockaddr *addr, int addr_len);

        int                        (*backlog_rcv) (struct sock *sk,
                                                struct sk_buff *skb);
        bool                        (*bpf_bypass_getsockopt)(int level,
                                                         int optname);

        void                (*release_cb)(struct sock *sk);

        /* Keeping track of sk's, looking them up, and port selection methods. */
        int                        (*hash)(struct sock *sk);
        void                        (*unhash)(struct sock *sk);
        void                        (*rehash)(struct sock *sk);
        int                        (*get_port)(struct sock *sk, unsigned short snum);
        void                        (*put_port)(struct sock *sk);
#ifdef CONFIG_BPF_SYSCALL
        int                        (*psock_update_sk_prot)(struct sock *sk,
                                                        struct sk_psock *psock,
                                                        bool restore);
#endif

        /* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
        unsigned int                inuse_idx;
#endif

#if IS_ENABLED(CONFIG_MPTCP)
        /* Optional override consulted by sk_forward_alloc_get() */
        int                        (*forward_alloc_get)(const struct sock *sk);
#endif

        /* Optional; a NULL hook is treated as "memory is free" by __sk_stream_memory_free() */
        bool                        (*stream_memory_free)(const struct sock *sk, int wake);
        bool                        (*sock_is_readable)(struct sock *sk);
        /* Memory pressure */
        void                        (*enter_memory_pressure)(struct sock *sk);
        void                        (*leave_memory_pressure)(struct sock *sk);
        atomic_long_t                *memory_allocated;        /* Current allocated memory. */
        int  __percpu                *per_cpu_fw_alloc;
        struct percpu_counter        *sockets_allocated;        /* Current number of sockets. */

        /*
         * Pressure flag: try to collapse.
         * Technical note: it is used by multiple contexts non atomically.
         * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes.
         * All the __sk_mem_schedule() is of this nature: accounting
         * is strict, actions are advisory and have some latency.
         */
        unsigned long                *memory_pressure;
        long                        *sysctl_mem;

        int                        *sysctl_wmem;
        int                        *sysctl_rmem;
        u32                        sysctl_wmem_offset;
        u32                        sysctl_rmem_offset;

        int                        max_header;
        bool                        no_autobind;

        struct kmem_cache        *slab;
        unsigned int                obj_size;
        unsigned int                ipv6_pinfo_offset;
        slab_flags_t                slab_flags;
        unsigned int                useroffset;        /* Usercopy region offset */
        unsigned int                usersize;        /* Usercopy region size */

        unsigned int __percpu        *orphan_count;

        struct request_sock_ops        *rsk_prot;
        struct timewait_sock_ops *twsk_prot;

        /* Protocol-specific hash table; only one member is meaningful per protocol */
        union {
                struct inet_hashinfo        *hashinfo;
                struct udp_table        *udp_table;
                struct raw_hashinfo        *raw_hash;
                struct smc_hashinfo        *smc_hash;
        } h;

        struct module                *owner;

        char                        name[32];

        struct list_head        node;
        int                        (*diag_destroy)(struct sock *sk, int err);
} __randomize_layout;

/* Protocol registration and diag-module loading, implemented out of line. */
int proto_register(struct proto *prot, int alloc_slab);
void proto_unregister(struct proto *prot);
int sock_load_diag_module(int family, int protocol);

/* Candidate target named in __sk_stream_memory_free()'s INDIRECT_CALL_INET_1(). */
INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));

/*
 * Return the socket's forward-allocated memory.  With CONFIG_MPTCP enabled,
 * a protocol may override the value through its ->forward_alloc_get() hook;
 * otherwise sk->sk_forward_alloc is read with READ_ONCE().
 */
static inline int sk_forward_alloc_get(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP)
        if (sk->sk_prot->forward_alloc_get)
                return sk->sk_prot->forward_alloc_get(sk);
#endif
        return READ_ONCE(sk->sk_forward_alloc);
}

/*
 * Decide whether the stream socket may queue more data.
 *
 * Returns false as soon as sk_wmem_queued has reached sk_sndbuf.  Otherwise
 * the protocol's optional ->stream_memory_free() hook decides; a protocol
 * without that hook is always considered free.
 */
static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
        if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
                return false;

        if (!sk->sk_prot->stream_memory_free)
                return true;

        return INDIRECT_CALL_INET_1(sk->sk_prot->stream_memory_free,
                                    tcp_stream_memory_free, sk, wake);
}

/* __sk_stream_memory_free() with @wake fixed to 0. */
static inline bool sk_stream_memory_free(const struct sock *sk)
{
        return __sk_stream_memory_free(sk, 0);
}

static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake)
{
        return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
               __sk_stream_memory_free(sk, wake);
}

/* __sk_stream_is_writeable() with @wake fixed to 0. */
static inline bool sk_stream_is_writeable(const struct sock *sk)
{
        return __sk_stream_is_writeable(sk, 0);
}

/*
 * Report whether @sk's cgroup is a descendant of @ancestor, as decided by
 * cgroup_is_descendant().  Without CONFIG_SOCK_CGROUP_DATA the answer is
 * always -ENOTSUPP.
 */
static inline int sk_under_cgroup_hierarchy(struct sock *sk,
                                            struct cgroup *ancestor)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
        return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data),
                                    ancestor);
#else
        return -ENOTSUPP;
#endif
}

/* Batch size passed to percpu_counter_add_batch() for sockets_allocated. */
#define SK_ALLOC_PERCPU_COUNTER_BATCH 16

/* Subtract one from the protocol's sockets_allocated per-CPU counter. */
static inline void sk_sockets_allocated_dec(struct sock *sk)
{
        percpu_counter_add_batch(sk->sk_prot->sockets_allocated, -1,
                                 SK_ALLOC_PERCPU_COUNTER_BATCH);
}

/* Add one to the protocol's sockets_allocated per-CPU counter. */
static inline void sk_sockets_allocated_inc(struct sock *sk)
{
        percpu_counter_add_batch(sk->sk_prot->sockets_allocated, 1,
                                 SK_ALLOC_PERCPU_COUNTER_BATCH);
}

/* Return percpu_counter_read_positive() of the protocol's sockets_allocated. */
static inline u64
sk_sockets_allocated_read_positive(struct sock *sk)
{
        return percpu_counter_read_positive(sk->sk_prot->sockets_allocated);
}

/*
 * Return percpu_counter_sum_positive() of @prot's sockets_allocated.
 * Note the result is narrowed to int by the return type.
 */
static inline int
proto_sockets_allocated_sum_positive(struct proto *prot)
{
        return percpu_counter_sum_positive(prot->sockets_allocated);
}

/*
 * Per-netns "sockets in use" counters.  With CONFIG_PROC_FS the adders
 * update per-CPU counters in net->core.prot_inuse; without it they are
 * empty stubs and the getters are not declared at all.
 */
#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR        64        /* should be enough for the first time */
struct prot_inuse {
        int all;                        /* updated by sock_inuse_add() */
        int val[PROTO_INUSE_NR];        /* indexed by proto->inuse_idx */
};

/* Add @val to this CPU's counter for @prot in @net. */
static inline void sock_prot_inuse_add(const struct net *net,
                                       const struct proto *prot, int val)
{
        this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}

/* Add @val to this CPU's protocol-independent counter in @net. */
static inline void sock_inuse_add(const struct net *net, int val)
{
        this_cpu_add(net->core.prot_inuse->all, val);
}

int sock_prot_inuse_get(struct net *net, struct proto *proto);
int sock_inuse_get(struct net *net);
#else
static inline void sock_prot_inuse_add(const struct net *net,
                                       const struct proto *prot, int val)
{
}

static inline void sock_inuse_add(const struct net *net, int val)
{
}
#endif


/* With per-bucket locks this operation is not-atomic, so that
 * this version is not worse.
 *
 * Calls the protocol's ->unhash() followed by ->hash() and returns the
 * result of ->hash().
 */
static inline int __sk_prot_rehash(struct sock *sk)
{
        sk->sk_prot->unhash(sk);
        return sk->sk_prot->hash(sk);
}

/* About 10 seconds */
#define SOCK_DESTROY_TIME (10*HZ)

/* Sockets 0-1023 can't be bound to unless you are superuser */
#define PROT_SOCK        1024

/* Shutdown bits; SHUTDOWN_MASK equals RCV_SHUTDOWN | SEND_SHUTDOWN */
#define SHUTDOWN_MASK        3
#define RCV_SHUTDOWN        1
#define SEND_SHUTDOWN        2

/* NOTE(review): presumably bits of a per-socket lock mask; users are not visible here */
#define SOCK_BINDADDR_LOCK        4
#define SOCK_BINDPORT_LOCK        8

/*
 * A socket and its VFS inode embedded in one object; SOCKET_I() and
 * SOCK_INODE() convert between the two members with container_of().
 */
struct socket_alloc {
        struct socket socket;
        struct inode vfs_inode;
};

static inline struct socket *SOCKET_I(struct inode *inode)
{
        return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}

static inline struct inode *SOCK_INODE(struct socket *socket)
{
        return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
}

/*
 * Functions for memory accounting
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind);
int __sk_mem_schedule(struct sock *sk, int size, int kind);
void __sk_mem_reduce_allocated(struct sock *sk, int amount);
void __sk_mem_reclaim(struct sock *sk, int amount);

#define SK_MEM_SEND        0
#define SK_MEM_RECV        1

/* sysctl_mem values are in pages.
 *
 * Return entry @index of the protocol's sysctl_mem[] array, read with
 * READ_ONCE().  @index is not range-checked here.
 */
static inline long sk_prot_mem_limits(const struct sock *sk, int index)
{
        return READ_ONCE(sk->sk_prot->sysctl_mem[index]);
}

/* Convert a byte count to a page count, rounding up to a whole page. */
static inline int sk_mem_pages(int amt)
{
        return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

static inline bool sk_has_account(struct sock *sk)
{
        /* return true if protocol supports memory accounting */
        return !!sk->sk_prot->memory_allocated;
}

/*
 * Make sure @size bytes of send memory are available to @sk.
 *
 * Succeeds immediately when the protocol does no accounting or when
 * sk_forward_alloc already covers @size; otherwise asks
 * __sk_mem_schedule() for the missing amount (SK_MEM_SEND).
 */
static inline bool sk_wmem_schedule(struct sock *sk, int size)
{
        int shortfall;

        if (!sk_has_account(sk))
                return true;

        shortfall = size - sk->sk_forward_alloc;
        if (shortfall <= 0)
                return true;

        return __sk_mem_schedule(sk, shortfall, SK_MEM_SEND);
}

/*
 * Make sure @size bytes of receive memory are available to @sk for @skb.
 *
 * Succeeds immediately when the protocol does no accounting or when
 * sk_forward_alloc already covers @size.  Otherwise the missing amount is
 * requested from __sk_mem_schedule() (SK_MEM_RECV); if that fails, the
 * request is still granted for skbs allocated from pfmemalloc reserves.
 */
static inline bool
sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
{
        int shortfall;

        if (!sk_has_account(sk))
                return true;

        shortfall = size - sk->sk_forward_alloc;
        if (shortfall <= 0)
                return true;

        if (__sk_mem_schedule(sk, shortfall, SK_MEM_RECV))
                return true;

        return skb_pfmemalloc(skb);
}

/*
 * Return how much of sk_reserved_mem is not yet consumed by queued write
 * memory plus allocated receive memory, clamped at zero.  Sockets without
 * a reservation report 0.
 */
static inline int sk_unused_reserved_mem(const struct sock *sk)
{
        int spare;

        if (likely(!sk->sk_reserved_mem))
                return 0;

        spare = sk->sk_reserved_mem - sk->sk_wmem_queued -
                        atomic_read(&sk->sk_rmem_alloc);
        if (spare <= 0)
                return 0;

        return spare;
}

/*
 * Give back forward-allocated memory that @sk does not need.
 *
 * The reclaimable amount is sk_forward_alloc minus the unused part of the
 * reservation; __sk_mem_reclaim() is called only when that amount reaches
 * at least one page.  No-op for protocols without memory accounting.
 */
static inline void sk_mem_reclaim(struct sock *sk)
{
        int surplus;

        if (!sk_has_account(sk))
                return;

        surplus = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
        if (surplus < (int)PAGE_SIZE)
                return;

        __sk_mem_reclaim(sk, surplus);
}

/* Drop the memory reservation (sk_reserved_mem = 0), then reclaim as usual. */
static inline void sk_mem_reclaim_final(struct sock *sk)
{
        sk->sk_reserved_mem = 0;
        sk_mem_reclaim(sk);
}

/*
 * Charge @size bytes to @sk by taking them out of sk_forward_alloc.
 * No-op for protocols without memory accounting.
 */
static inline void sk_mem_charge(struct sock *sk, int size)
{
        if (sk_has_account(sk))
                sk_forward_alloc_add(sk, -size);
}

/*
 * Return @size bytes to @sk's sk_forward_alloc and then let
 * sk_mem_reclaim() release any surplus.  No-op for protocols without
 * memory accounting.
 */
static inline void sk_mem_uncharge(struct sock *sk, int size)
{
        if (sk_has_account(sk)) {
                sk_forward_alloc_add(sk, size);
                sk_mem_reclaim(sk);
        }
}

/*
 * Macro so as to not evaluate some arguments when
 * lockdep is not enabled.
 *
 * Mark both the sk_lock and the sk_lock.slock as a
 * per-address-family lock class.
 *
 * Resets sk_lock.owned, initializes the wait queue and the spinlock, then
 * assigns (skey, sname) to the spinlock and (key, name) to the sk_lock
 * lockdep map.  @sk is parenthesized at every use so that any pointer
 * expression can be passed.
 */
#define sock_lock_init_class_and_name(sk, sname, skey, name, key)        \
do {                                                                        \
        (sk)->sk_lock.owned = 0;                                        \
        init_waitqueue_head(&(sk)->sk_lock.wq);                                \
        spin_lock_init(&(sk)->sk_lock.slock);                                \
        debug_check_no_locks_freed((void *)&(sk)->sk_lock,                \
                        sizeof((sk)->sk_lock));                                \
        lockdep_set_class_and_name(&(sk)->sk_lock.slock,                \
                                (skey), (sname));                                \
        lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0);        \
} while (0)

static inline bool lockdep_sock_is_held(const struct sock *sk)
{
        return lockdep_is_held(&sk->sk_lock) ||
               lockdep_is_held(&sk->sk_lock.slock);
}

/* Out-of-line socket lock; lock_sock() calls it with subclass 0. */
void lock_sock_nested(struct sock *sk, int subclass);

/* Lock the socket: lock_sock_nested() with subclass 0. */
static inline void lock_sock(struct sock *sk)
{
        lock_sock_nested(sk, 0);
}

/* Out-of-line socket lock primitives; definitions are not part of this section. */
void __lock_sock(struct sock *sk);
void __release_sock(struct sock *sk);
void release_sock(struct sock *sk);

/* BH context may only use the following locking interface.
 * All three operate directly on the sk_lock.slock spinlock.
 */
#define bh_lock_sock(__sk)        spin_lock(&((__sk)->sk_lock.slock))
#define bh_lock_sock_nested(__sk) \
                                spin_lock_nested(&((__sk)->sk_lock.slock), \
                                SINGLE_DEPTH_NESTING)
#define bh_unlock_sock(__sk)        spin_unlock(&((__sk)->sk_lock.slock))

/* Out-of-line worker behind lock_sock_fast() and lock_sock_fast_nested(). */
bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock);

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Returns false if the fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * Returns true if the slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 *
 * Pass the return value to unlock_sock_fast() when unlocking.
 */
static inline bool lock_sock_fast(struct sock *sk)
{
        /* The sk_lock has mutex_lock() semantics here. */
        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);

        return __lock_sock_fast(sk);
}

/* fast socket lock variant for caller already holding a [different] socket lock.
 * Same as lock_sock_fast() except that the lockdep annotation uses
 * SINGLE_DEPTH_NESTING as the subclass.
 */
static inline bool lock_sock_fast_nested(struct sock *sk)
{
        mutex_acquire(&sk->sk_lock.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_);

        return __lock_sock_fast(sk);
}

/**
 * unlock_sock_fast - complement of lock_sock_fast
 * @sk: socket
 * @slow: slow mode, i.e. the value returned by lock_sock_fast()
 *
 * Fast unlock of the socket for user context.
 * In fast mode the lockdep annotation is dropped and the spinlock is
 * released with BH re-enabled; in slow mode the regular release_sock()
 * is called.
 */
static inline void unlock_sock_fast(struct sock *sk, bool slow)
        __releases(&sk->sk_lock.slock)
{
        if (!slow) {
                mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
                spin_unlock_bh(&sk->sk_lock.slock);
                return;
        }

        release_sock(sk);
        /* Balances the __releases() annotation on this path */
        __release(&sk->sk_lock.slock);
}

/* sockopt-specific lock and capability helpers, implemented out of line. */
void sockopt_lock_sock(struct sock *sk);
void sockopt_release_sock(struct sock *sk);
bool sockopt_ns_capable(struct user_namespace *ns, int cap);
bool sockopt_capable(int cap);

/* Used by processes to "lock" a socket state, so that
 * interrupts and bottom half handlers won't change it
 * from under us. It essentially blocks any incoming
 * packets, so that we won't get any new data or any
 * packets that change the state of the socket.
 *
 * While locked, BH processing will add new packets to
 * the backlog queue.  This queue is processed by the
 * owner of the socket lock right before it is released.
 *
 * Since ~2.3.5 it is also an exclusive sleeping lock serializing
 * accesses from user process context.
 */

/*
 * Debug assertion: warn once if lockdep does not see the socket lock held.
 * Compiles to nothing without CONFIG_LOCKDEP.
 */
static inline void sock_owned_by_me(const struct sock *sk)
{
#ifdef CONFIG_LOCKDEP
        WARN_ON_ONCE(!lockdep_sock_is_held(sk) && debug_locks);
#endif
}

/*
 * Debug assertion: warn once if lockdep sees the socket lock held.
 * Compiles to nothing without CONFIG_LOCKDEP.
 */
static inline void sock_not_owned_by_me(const struct sock *sk)
{
#ifdef CONFIG_LOCKDEP
        WARN_ON_ONCE(lockdep_sock_is_held(sk) && debug_locks);
#endif
}

/*
 * Return sk_lock.owned, after asserting (under lockdep) that the caller
 * holds the socket lock.
 */
static inline bool sock_owned_by_user(const struct sock *sk)
{
        sock_owned_by_me(sk);
        return sk->sk_lock.owned;
}

/* Return sk_lock.owned without the lockdep assertion of sock_owned_by_user(). */
static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
{
        return sk->sk_lock.owned;
}

/*
 * Clear sk_lock.owned and release the lockdep annotation for sk_lock.
 * Under DEBUG_NET, warns once if the socket was not marked owned.
 */
static inline void sock_release_ownership(struct sock *sk)
{
        DEBUG_NET_WARN_ON_ONCE(!sock_owned_by_user_nocheck(sk));
        sk->sk_lock.owned = 0;

        /* The sk_lock has mutex_unlock() semantics: */
        mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
}

/* no reclassification while locks are held */
static inline bool sock_allow_reclassification(const struct sock *csk)
{
        struct sock *sk = (struct sock *)csk;

        return !sock_owned_by_user_nocheck(sk) &&
                !spin_is_locked(&sk->sk_lock.slock);
}

/* Socket allocation, cloning and destruction, implemented out of line. */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot, int kern);
void sk_free(struct sock *sk);
void sk_destruct(struct sock *sk);
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
void sk_free_unlock_clone(struct sock *sk);

/* skb allocation helpers and skb free callbacks, implemented out of line. */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority);
void __sock_wfree(struct sk_buff *skb);
void sock_wfree(struct sk_buff *skb);
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
                             gfp_t priority);
void skb_orphan_partial(struct sk_buff *skb);
void sock_rfree(struct sk_buff *skb);
void sock_efree(struct sk_buff *skb);
/* Without CONFIG_INET, sock_edemux aliases sock_efree and sock_pfree is absent */
#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb);
void sock_pfree(struct sk_buff *skb);
#else
#define sock_edemux sock_efree
#endif

/* Socket option, timestamp and send-skb allocation entry points, implemented out of line. */
int sk_setsockopt(struct sock *sk, int level, int optname,
                  sockptr_t optval, unsigned int optlen);
int sock_setsockopt(struct socket *sock, int level, int op,
                    sockptr_t optval, unsigned int optlen);
int do_sock_setsockopt(struct socket *sock, bool compat, int level,
                       int optname, sockptr_t optval, int optlen);
int do_sock_getsockopt(struct socket *sock, bool compat, int level,
                       int optname, sockptr_t optval, sockptr_t optlen);

int sk_getsockopt(struct sock *sk, int level, int optname,
                  sockptr_t optval, sockptr_t optlen);
int sock_gettstamp(struct socket *sock, void __user *userstamp,
                   bool timeval, bool time32);
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                                     unsigned long data_len, int noblock,
                                     int *errcode, int max_page_order);

/*
 * Convenience wrapper around sock_alloc_send_pskb(): @size is passed as the
 * header length, with a data length of 0 and a max page order of 0.
 */
static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk,
                                                  unsigned long size,
                                                  int noblock, int *errcode)
{
        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}

/* Per-socket kmalloc/kfree helpers and SIGURG delivery, implemented out of line. */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority);
void sock_kfree_s(struct sock *sk, void *mem, int size);
void sock_kzfree_s(struct sock *sk, void *mem, int size);
void sk_send_sigurg(struct sock *sk);

/*
 * Switch @sk to protocol @proto.  If the sock is attached to a struct
 * socket, SOCK_SUPPORT_ZC is cleared in that socket's flags first; the new
 * sk_prot pointer is then stored with WRITE_ONCE().
 */
static inline void sock_replace_proto(struct sock *sk, struct proto *proto)
{
        if (sk->sk_socket)
                clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
        WRITE_ONCE(sk->sk_prot, proto);
}

/* Cookie handed to sock_cmsg_send()/__sock_cmsg_send().
 * sockcm_init() zeroes every field except tsflags.
 */
struct sockcm_cookie {
        u64 transmit_time;
        u32 mark;
        u32 tsflags;        /* seeded from sk->sk_tsflags by sockcm_init() */
};

static inline void sockcm_init(struct sockcm_cookie *sockc,
                               const struct sock *sk)
{
        *sockc = (struct sockcm_cookie) {
                .tsflags = READ_ONCE(sk->sk_tsflags)
        };
}

int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
                     struct sockcm_cookie *sockc);
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
                   struct sockcm_cookie *sockc);

/*
 * Functions to fill in entries in struct proto_ops when a protocol
 * does not implement a particular function.
 */
int sock_no_bind(struct socket *, struct sockaddr *, int);
int sock_no_connect(struct socket *, struct sockaddr *, int, int);
int sock_no_socketpair(struct socket *, struct socket *);
int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *);
int sock_no_getname(struct socket *, struct sockaddr *, int);
int sock_no_ioctl(struct socket *, unsigned int, unsigned long);
int sock_no_listen(struct socket *, int);
int sock_no_shutdown(struct socket *, int);
int sock_no_sendmsg(struct socket *, struct msghdr *, size_t);
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len);
int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int);
int sock_no_mmap(struct file *file, struct socket *sock,
                 struct vm_area_struct *vma);

/*
 * Functions to fill in entries in struct proto_ops when a protocol
 * uses the inet style.
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, int __user *optlen);
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                        int flags);
int sock_common_setsockopt(struct socket *sock, int level, int optname,
                           sockptr_t optval, unsigned int optlen);

void sk_common_release(struct sock *sk);

/*
 *        Default socket callbacks and setup code
 */

/* Initialise core socket variables using an explicit uid. */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid);

/* Initialise core socket variables.
 * Assumes struct socket *sock is embedded in a struct socket_alloc.
 */
void sock_init_data(struct socket *sock, struct sock *sk);

/*
 * Socket reference counting postulates.
 *
 * * Each user of socket SHOULD hold a reference count.
 * * Each access point to socket (a hash table bucket, reference from a list,
 *   running timer, skb in flight) MUST hold a reference count.
 * * When reference count hits 0, it means it will never increase back.
 * * When reference count hits 0, it means that no references from
 *   outside exist to this socket and current process on current CPU
 *   is last user and may/should destroy this socket.
 * * sk_free is called from any context: process, BH, IRQ. When
 *   it is called, socket has no references from outside -> sk_free
 *   may release descendant resources allocated by the socket, but
 *   to the time when it is called, socket is NOT referenced by any
 *   hash tables, lists etc.
 * * Packets, delivered from outside (from network or from another process)
 *   and enqueued on receive/error queues SHOULD NOT grab reference count,
 *   when they sit in queue. Otherwise, packets will leak to hole, when
 *   socket is looked up by one cpu and unhashing is made by another CPU.
 *   It is true for udp/raw, netlink (leak to receive and error queues), tcp
 *   (leak to backlog). Packet socket does all the processing inside
 *   BR_NETPROTO_LOCK, so that it has not this race condition. UNIX sockets
 *   use separate SMP lock, so that they are prone too.
 */

/* Drop one reference on @sk; the holder of the last reference frees it. */
static inline void sock_put(struct sock *sk)
{
        if (!refcount_dec_and_test(&sk->sk_refcnt))
                return;

        sk_free(sk);
}
/* Generic version of sock_put(), dealing with all sockets
 * (TCP_TIMEWAIT, TCP_NEW_SYN_RECV, ESTABLISHED...)
 */
void sock_gen_put(struct sock *sk);

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested,
                     unsigned int trim_cap, bool refcounted);
static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb,
                                 const int nested)
{
        return __sk_receive_skb(sk, skb, nested, 1, true);
}

/* Record @tx_queue as the socket's TX queue mapping.
 * A value whose low 16 bits equal USHRT_MAX triggers a one-time warning and
 * is ignored, leaving the mapping untouched.
 */
static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
{
        /* sk_tx_queue_mapping accepts only up to a 16-bit value */
        if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX))
                return;
        /* Paired with READ_ONCE() in sk_tx_queue_get() and
         * other WRITE_ONCE() because socket lock might be not held.
         */
        WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
}

/* Sentinel stored in sk_tx_queue_mapping / sk_rx_queue_mapping when no queue
 * is recorded; the sk_{tx,rx}_queue_get() helpers report it as -1.
 */
#define NO_QUEUE_MAPPING        USHRT_MAX

/* Forget the recorded TX queue by storing NO_QUEUE_MAPPING. */
static inline void sk_tx_queue_clear(struct sock *sk)
{
        /* Paired with READ_ONCE() in sk_tx_queue_get() and
         * other WRITE_ONCE() because socket lock might be not held.
         */
        WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING);
}

/* Return the recorded TX queue index, or -1 when @sk is NULL or no queue
 * is recorded.
 */
static inline int sk_tx_queue_get(const struct sock *sk)
{
        int mapping;

        if (!sk)
                return -1;

        /* Paired with WRITE_ONCE() in sk_tx_queue_clear()
         * and sk_tx_queue_set().
         */
        mapping = READ_ONCE(sk->sk_tx_queue_mapping);

        return mapping == NO_QUEUE_MAPPING ? -1 : mapping;
}

/* Copy the RX queue recorded in @skb into sk->sk_rx_queue_mapping.
 * With @force_set the store is unconditional; otherwise it only happens
 * when the stored value differs. Compiles to nothing without
 * CONFIG_SOCK_RX_QUEUE_MAPPING.
 */
static inline void __sk_rx_queue_set(struct sock *sk,
                                     const struct sk_buff *skb,
                                     bool force_set)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        u16 queue;

        if (!skb_rx_queue_recorded(skb))
                return;

        queue = skb_get_rx_queue(skb);

        /* Skip the store when the mapping is already up to date. */
        if (!force_set &&
            likely(READ_ONCE(sk->sk_rx_queue_mapping) == queue))
                return;

        WRITE_ONCE(sk->sk_rx_queue_mapping, queue);
#endif
}

static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
{
        __sk_rx_queue_set(sk, skb, true);
}

static inline void sk_rx_queue_update(struct sock *sk, const struct sk_buff *skb)
{
        __sk_rx_queue_set(sk, skb, false);
}

/* Forget the recorded RX queue by storing NO_QUEUE_MAPPING.
 * No-op when CONFIG_SOCK_RX_QUEUE_MAPPING is not set.
 */
static inline void sk_rx_queue_clear(struct sock *sk)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        WRITE_ONCE(sk->sk_rx_queue_mapping, NO_QUEUE_MAPPING);
#endif
}

/* Return the recorded RX queue index, or -1 when @sk is NULL, no queue is
 * recorded, or CONFIG_SOCK_RX_QUEUE_MAPPING is disabled.
 */
static inline int sk_rx_queue_get(const struct sock *sk)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        int mapping;

        if (!sk)
                return -1;

        mapping = READ_ONCE(sk->sk_rx_queue_mapping);
        if (mapping != NO_QUEUE_MAPPING)
                return mapping;
#endif

        return -1;
}

/* Point sk->sk_socket at @sock (may be NULL, as done by sock_orphan()).
 * Plain store: no locking or barrier is applied here.
 */
static inline void sk_set_socket(struct sock *sk, struct socket *sock)
{
        sk->sk_socket = sock;
}

static inline wait_queue_head_t *sk_sleep(struct sock *sk)
{
        BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0);
        return &rcu_dereference_raw(sk->sk_wq)->wait;
}
/* Detach socket from process context.
 * Announce socket dead, detach it from wait queue and inode.
 * Note that the parent inode held a reference count on this struct sock;
 * we do not release it in this function, because the protocol
 * probably wants some additional cleanups or even to continue
 * working with this socket (TCP).
 *
 * All three updates happen under sk_callback_lock (write side, BH disabled).
 */
static inline void sock_orphan(struct sock *sk)
{
        write_lock_bh(&sk->sk_callback_lock);
        sock_set_flag(sk, SOCK_DEAD);
        sk_set_socket(sk, NULL);
        sk->sk_wq  = NULL;
        write_unlock_bh(&sk->sk_callback_lock);
}

/* Attach @sk to the struct socket @parent.
 * Under sk_callback_lock (write side, BH disabled) this publishes the
 * parent's wait queue in sk->sk_wq, links sk and parent in both directions,
 * copies the uid of the parent's inode into sk->sk_uid and calls the
 * security_sock_graft() hook. Warns if @parent already has a sock.
 */
static inline void sock_graft(struct sock *sk, struct socket *parent)
{
        WARN_ON(parent->sk);
        write_lock_bh(&sk->sk_callback_lock);
        rcu_assign_pointer(sk->sk_wq, &parent->wq);
        parent->sk = sk;
        sk_set_socket(sk, parent);
        sk->sk_uid = SOCK_INODE(parent)->i_uid;
        security_sock_graft(sk, parent);
        write_unlock_bh(&sk->sk_callback_lock);
}

kuid_t sock_i_uid(struct sock *sk);
unsigned long __sock_i_ino(struct sock *sk);
unsigned long sock_i_ino(struct sock *sk);

/* Return sk->sk_uid, or kuid 0 mapped through @net's user namespace when
 * @sk is NULL.
 */
static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
{
        if (sk)
                return sk->sk_uid;

        return make_kuid(net->user_ns, 0);
}

/* Return a random 32-bit hash that is never zero (0 is remapped to 1). */
static inline u32 net_tx_rndhash(void)
{
        u32 hash = get_random_u32();

        if (!hash)
                hash = 1;

        return hash;
}

static inline void sk_set_txhash(struct sock *sk)
{
        /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */
        WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
}

/* Re-roll sk_txhash when the socket already has one and rehashing is
 * enabled (sk_txrehash == SOCK_TXREHASH_ENABLED).
 *
 * Return: true if a new hash was installed, false otherwise.
 */
static inline bool sk_rethink_txhash(struct sock *sk)
{
        if (!sk->sk_txhash)
                return false;

        if (sk->sk_txrehash != SOCK_TXREHASH_ENABLED)
                return false;

        sk_set_txhash(sk);
        return true;
}

/* Fetch the cached dst without taking a reference. The dereference is
 * checked against lockdep_sock_is_held(): valid under RCU or with the
 * socket lock held.
 */
static inline struct dst_entry *
__sk_dst_get(const struct sock *sk)
{
        struct dst_entry *dst;

        dst = rcu_dereference_check(sk->sk_dst_cache,
                                    lockdep_sock_is_held(sk));
        return dst;
}

/* Return the cached dst with a reference taken via rcuref_get(), or NULL
 * when nothing is cached or the reference could not be obtained.
 * The lookup runs inside its own RCU read-side critical section.
 */
static inline struct dst_entry *
sk_dst_get(const struct sock *sk)
{
        struct dst_entry *dst;

        rcu_read_lock();
        dst = rcu_dereference(sk->sk_dst_cache);
        /* A failed rcuref_get() is treated the same as an empty cache. */
        if (dst && !rcuref_get(&dst->__rcuref))
                dst = NULL;
        rcu_read_unlock();
        return dst;
}

/* Invoke the cached dst's negative_advice() op, if there is a cached dst
 * and its ops provide one.
 */
static inline void __dst_negative_advice(struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_get(sk);

        if (!dst || !dst->ops->negative_advice)
                return;

        dst->ops->negative_advice(sk, dst);
}

/* Possibly re-roll the TX hash (result ignored), then pass negative advice
 * to the cached dst.
 */
static inline void dst_negative_advice(struct sock *sk)
{
        sk_rethink_txhash(sk);
        __dst_negative_advice(sk);
}

/* Replace the cached dst with @dst (may be NULL) and release the old one.
 * Also clears the TX queue mapping and the pending-confirm flag.
 * The old pointer is read with rcu_dereference_protected() against
 * lockdep_sock_is_held(), i.e. the caller is expected to hold the socket lock.
 */
static inline void
__sk_dst_set(struct sock *sk, struct dst_entry *dst)
{
        struct dst_entry *old_dst;

        sk_tx_queue_clear(sk);
        WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
        old_dst = rcu_dereference_protected(sk->sk_dst_cache,
                                            lockdep_sock_is_held(sk));
        rcu_assign_pointer(sk->sk_dst_cache, dst);
        dst_release(old_dst);
}

/* Replace the cached dst with @dst (may be NULL) and release the old one.
 * Also clears the TX queue mapping and the pending-confirm flag.
 * Unlike __sk_dst_set(), the pointer is swapped with xchg() and no lockdep
 * condition is checked.
 * NOTE(review): presumably meant for callers not holding the socket lock -
 * confirm against callers.
 */
static inline void
sk_dst_set(struct sock *sk, struct dst_entry *dst)
{
        struct dst_entry *old_dst;

        sk_tx_queue_clear(sk);
        WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
        old_dst = unrcu_pointer(xchg(&sk->sk_dst_cache, RCU_INITIALIZER(dst)));
        dst_release(old_dst);
}

/* Drop the cached dst via __sk_dst_set(sk, NULL). */
static inline void
__sk_dst_reset(struct sock *sk)
{
        __sk_dst_set(sk, NULL);
}

/* Drop the cached dst via sk_dst_set(sk, NULL). */
static inline void
sk_dst_reset(struct sock *sk)
{
        sk_dst_set(sk, NULL);
}

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);

/* Flag that the socket's dst has a pending confirmation. The store is
 * skipped when the flag is already set.
 */
static inline void sk_dst_confirm(struct sock *sk)
{
        if (READ_ONCE(sk->sk_dst_pending_confirm))
                return;

        WRITE_ONCE(sk->sk_dst_pending_confirm, 1);
}

/* If @skb carries a pending dst confirmation, clear the owning socket's
 * pending flag (when there is a socket and the flag is set) and confirm
 * neighbour @n.
 */
static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
{
        struct sock *sk;

        if (!skb_get_dst_pending_confirm(skb))
                return;

        sk = skb->sk;
        if (sk && READ_ONCE(sk->sk_dst_pending_confirm))
                WRITE_ONCE(sk->sk_dst_pending_confirm, 0);

        neigh_confirm(n);
}

bool sk_mc_loop(const struct sock *sk);

/* Return the result of net_gso_ok() for the socket's route capabilities
 * (sk_route_caps) and GSO type (sk_gso_type).
 */
static inline bool sk_can_gso(const struct sock *sk)
{
        return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type);
}

void sk_setup_caps(struct sock *sk, struct dst_entry *dst);

/* Mark GSO as disabled on @sk and strip every NETIF_F_GSO_MASK bit from
 * its current route capabilities.
 */
static inline void sk_gso_disable(struct sock *sk)
{
        sk->sk_gso_disabled = 1;
        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
}

static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
                                           struct iov_iter *from, char *to,
                                           int copy, int offset)
{
        if (skb->ip_summed == CHECKSUM_NONE) {
                __wsum csum = 0;
                if (!csum_and_copy_from_iter_full(to, copy, &csum, from))
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, offset);
        } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
                if (!copy_from_iter_full_nocache(to, copy, from))
                        return -EFAULT;
        } else if (!copy_from_iter_full(to, copy, from))
                return -EFAULT;

        return 0;
}

/* Append @copy bytes from @from to the linear area of @skb.
 * On failure the skb is trimmed back to its previous length.
 *
 * Return: 0 on success or the error from skb_do_copy_data_nocache().
 */
static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb,
                                       struct iov_iter *from, int copy)
{
        const int start = skb->len;        /* length before skb_put() grows it */
        char *dst = skb_put(skb, copy);
        int err;

        err = skb_do_copy_data_nocache(sk, skb, from, dst, copy, start);
        if (err)
                __skb_trim(skb, start);

        return err;
}

/* Copy @copy bytes from @from into @page at byte offset @off, then account
 * them to @skb (length) and @sk (queued write memory and memory charge).
 * Nothing is accounted if the copy fails.
 *
 * Return: 0 on success or the error from skb_do_copy_data_nocache().
 */
static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *from,
                                           struct sk_buff *skb,
                                           struct page *page,
                                           int off, int copy)
{
        char *dst = page_address(page) + off;
        int err;

        err = skb_do_copy_data_nocache(sk, skb, from, dst, copy, skb->len);
        if (!err) {
                skb_len_add(skb, copy);
                sk_wmem_queued_add(sk, copy);
                sk_mem_charge(sk, copy);
        }

        return err;
}

/**
 * sk_wmem_alloc_get - returns write allocations
 * @sk: socket
 *
 * Return: the value of sk_wmem_alloc with its initial offset of one removed
 */
static inline int sk_wmem_alloc_get(const struct sock *sk)
{
        unsigned int raw = refcount_read(&sk->sk_wmem_alloc);

        return raw - 1;
}

/**
 * sk_rmem_alloc_get - returns read allocations
 * @sk: socket
 *
 * Return: the current value of sk_rmem_alloc, read with atomic_read()
 */
static inline int sk_rmem_alloc_get(const struct sock *sk)
{
        return atomic_read(&sk->sk_rmem_alloc);
}

/**
 * sk_has_allocations - check if allocations are outstanding
 * @sk: socket
 *
 * Return: true if socket has write or read allocations
 */
static inline bool sk_has_allocations(const struct sock *sk)
{
        return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk);
}

/**
 * skwq_has_sleeper - check if there are any waiting processes
 * @wq: struct socket_wq
 *
 * Return: true if socket_wq has waiting processes
 *
 * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory
 * barrier call. They were added due to the race found within the tcp code.
 *
 * Consider following tcp code paths::
 *
 *   CPU1                CPU2
 *   sys_select          receive packet
 *   ...                 ...
 *   __add_wait_queue    update tp->rcv_nxt
 *   ...                 ...
 *   tp->rcv_nxt check   sock_def_readable
 *   ...                 {
 *   schedule               rcu_read_lock();
 *                          wq = rcu_dereference(sk->sk_wq);
 *                          if (wq && waitqueue_active(&wq->wait))
 *                              wake_up_interruptible(&wq->wait)
 *                          ...
 *                       }
 *
 * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay
 * in its cache, and so does the tp->rcv_nxt update on CPU2 side.  The CPU1
 * could then end up calling schedule and sleep forever if there is no more
 * data on the socket.
 *
 */
static inline bool skwq_has_sleeper(struct socket_wq *wq)
{
        return wq && wq_has_sleeper(&wq->wait);
}

/**
 * sock_poll_wait - place memory barrier behind the poll_wait call.
 * @filp:           file
 * @sock:           socket to wait on
 * @p:              poll_table
 *
 * See the comments in the wq_has_sleeper function.
 */
static inline void sock_poll_wait(struct file *filp, struct socket *sock,
                                  poll_table *p)
{
        /* Nothing to queue and no barrier needed if poll will not wait. */
        if (poll_does_not_wait(p))
                return;

        poll_wait(filp, &sock->wq.wait, p);

        /* Stay in sync with socket flag modifications: this full barrier
         * is paired with the one in wq_has_sleeper().
         */
        smp_mb();
}

/* Stamp @skb with the socket's TX hash as an L4 hash. A zero sk_txhash
 * leaves the skb untouched.
 */
static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk)
{
        /* This pairs with WRITE_ONCE() in sk_set_txhash() */
        u32 hash = READ_ONCE(sk->sk_txhash);

        if (!hash)
                return;

        skb->l4_hash = 1;
        skb->hash = hash;
}

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk);

/*
 *        Queue a received datagram if it will fit. Stream and sequenced
 *        protocols can't normally use this as they need to fit buffers in
 *        and play with them.
 *
 *        Inlined as it's very short and called for pretty much every
 *        packet ever received.
 */
/* Make @sk the receive-side owner of @skb.
 * skb_orphan() is called first, then sock_rfree is installed as the
 * destructor, and skb->truesize is added to sk_rmem_alloc and charged to
 * the socket's memory accounting.
 */
static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
        skb_orphan(skb);
        skb->sk = sk;
        skb->destructor = sock_rfree;
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
        sk_mem_charge(sk, skb->truesize);
}

/* Try to attach @skb to @sk while taking a socket reference.
 * Fails, leaving @skb untouched, when @sk is NULL or its refcount has
 * already reached zero. On success the skb is orphaned first and sock_efree
 * becomes its destructor.
 *
 * Return: true if ownership was set, false otherwise.
 */
static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk)
{
        if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
                return false;

        skb_orphan(skb);
        skb->destructor = sock_efree;
        skb->sk = sk;

        return true;
}

/* Clone @skb (GFP_ATOMIC filtered through sk_gfp_mask()) and charge the
 * clone to @sk's receive memory.
 *
 * Return: the clone, owned by @sk, or NULL if cloning or receive-memory
 * scheduling failed (the clone is freed in the latter case).
 */
static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk)
{
        struct sk_buff *clone;

        clone = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC));
        if (!clone)
                return NULL;

        if (!sk_rmem_schedule(sk, clone, clone->truesize)) {
                __kfree_skb(clone);
                return NULL;
        }

        skb_set_owner_r(clone, sk);
        return clone;
}

/* Prepare @skb for GRO: skbs whose destructor is sock_wfree keep their
 * owner and are flagged slow_gro; every other skb is orphaned.
 */
static inline void skb_prepare_for_gro(struct sk_buff *skb)
{
        if (skb->destructor == sock_wfree)
                skb->slow_gro = 1;
        else
                skb_orphan(skb);
}

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
                    unsigned long expires);

void sk_stop_timer(struct sock *sk, struct timer_list *timer);

void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer);

int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
                        struct sk_buff *skb, unsigned int flags,
                        void (*destructor)(struct sock *sk,
                                           struct sk_buff *skb));
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);

int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
                              enum skb_drop_reason *reason);

/* sock_queue_rcv_skb_reason() for callers that do not need the drop
 * reason (NULL is passed for @reason).
 */
static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        return sock_queue_rcv_skb_reason(sk, skb, NULL);
}

int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb);
struct sk_buff *sock_dequeue_err_skb(struct sock *sk);

/*
 *        Recover an error report and clear atomically
 *
 *        Returns 0 when sk_err is clear; otherwise sk_err is swapped with 0
 *        and the negated previous value is returned.
 */

static inline int sock_error(struct sock *sk)
{
        int err;

        /* Avoid an atomic operation for the common case.
         * This is racy since another cpu/thread can change sk_err under us.
         */
        if (likely(data_race(!sk->sk_err)))
                return 0;

        /* Fetch and clear in one step so the error is reported only once. */
        err = xchg(&sk->sk_err, 0);
        return -err;
}

void sk_error_report(struct sock *sk);

/* Return the remaining send-buffer space: sk_sndbuf minus sk_wmem_alloc,
 * clamped at zero. Always 0 once SEND_SHUTDOWN is set.
 */
static inline unsigned long sock_wspace(struct sock *sk)
{
        int room;

        if (sk->sk_shutdown & SEND_SHUTDOWN)
                return 0;

        room = sk->sk_sndbuf - refcount_read(&sk->sk_wmem_alloc);

        return room < 0 ? 0 : room;
}

/* Note:
 *  We use sk->sk_wq_raw, from contexts knowing this
 *  pointer is not NULL and cannot disappear/change.
 */
static inline void sk_set_bit(int nr, struct sock *sk)
{
        if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) &&
            !sock_flag(sk, SOCK_FASYNC))
                return;

        set_bit(nr, &sk->sk_wq_raw->flags);
}

static inline void sk_clear_bit(int nr, struct sock *sk)
{
        if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) &&
            !sock_flag(sk, SOCK_FASYNC))
                return;

        clear_bit(nr, &sk->sk_wq_raw->flags);
}

/* Deliver an async wakeup for sockets with SOCK_FASYNC. sk_wq is
 * dereferenced inside a local RCU read-side critical section.
 */
static inline void sk_wake_async(const struct sock *sk, int how, int band)
{
        if (!sock_flag(sk, SOCK_FASYNC))
                return;

        rcu_read_lock();
        sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
        rcu_read_unlock();
}

/* Like sk_wake_async(), but takes no RCU read lock itself: sk_wq is
 * dereferenced with a bare rcu_dereference().
 */
static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band)
{
        if (likely(!sock_flag(sk, SOCK_FASYNC)))
                return;

        sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
}

/* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might
 * need sizeof(sk_buff) + MTU + padding, unless the net driver performs
 * copybreak.
 * Note: for send buffers, TCP works better if we can build two skbs at
 * minimum, hence SOCK_MIN_SNDBUF is twice TCP_SKB_MIN_TRUESIZE while
 * SOCK_MIN_RCVBUF is a single one.
 */
#define TCP_SKB_MIN_TRUESIZE        (2048 + SKB_DATA_ALIGN(sizeof(struct sk_buff)))

#define SOCK_MIN_SNDBUF                (TCP_SKB_MIN_TRUESIZE * 2)
#define SOCK_MIN_RCVBUF                 TCP_SKB_MIN_TRUESIZE

/* Shrink sk_sndbuf towards half of the currently queued write memory.
 * The result is never below the unused reserved memory nor below
 * SOCK_MIN_SNDBUF. Does nothing when SOCK_SNDBUF_LOCK is set in
 * sk_userlocks.
 */
static inline void sk_stream_moderate_sndbuf(struct sock *sk)
{
        u32 target;

        if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
                return;

        target = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
        target = max_t(u32, target, sk_unused_reserved_mem(sk));
        target = max_t(u32, target, SOCK_MIN_SNDBUF);

        WRITE_ONCE(sk->sk_sndbuf, target);
}

/**
 * sk_page_frag - return an appropriate page_frag
 * @sk: socket
 *
 * Use the per task page_frag instead of the per socket one for
 * optimization when we know that we're in process context and own
 * everything that's associated with %current.
 *
 * Both direct reclaim and page faults can nest inside other
 * socket operations and end up recursing into sk_page_frag()
 * while it's already in use: explicitly avoid task page_frag
 * when users disable sk_use_task_frag.
 *
 * Return: a per task page_frag if context allows that,
 * otherwise a per socket one.
 */
static inline struct page_frag *sk_page_frag(struct sock *sk)
{
        /* sk_use_task_frag selects the per-task frag of %current over the
         * per-socket one.
         */
        return sk->sk_use_task_frag ? &current->task_frag : &sk->sk_frag;
}

bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);

/*
 *        Default write policy as shown to user space via poll/select/SIGIO:
 *        writeable while sk_wmem_alloc is below half of sk_sndbuf.
 */
static inline bool sock_writeable(const struct sock *sk)
{
        unsigned int queued = refcount_read(&sk->sk_wmem_alloc);
        int half_sndbuf = READ_ONCE(sk->sk_sndbuf) >> 1;

        return queued < half_sndbuf;
}

/* Pick allocation flags for the current context: GFP_ATOMIC when
 * in_softirq() is true, GFP_KERNEL otherwise.
 */
static inline gfp_t gfp_any(void)
{
        if (in_softirq())
                return GFP_ATOMIC;

        return GFP_KERNEL;
}

/* Allocation flags used for memcg charging: GFP_ATOMIC when in_softirq()
 * is true, GFP_KERNEL otherwise.
 */
static inline gfp_t gfp_memcg_charge(void)
{
        if (in_softirq())
                return GFP_ATOMIC;

        return GFP_KERNEL;
}

/* Receive timeout to use: 0 for non-blocking callers, else sk_rcvtimeo. */
static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
{
        if (noblock)
                return 0;

        return sk->sk_rcvtimeo;
}

/* Send timeout to use: 0 for non-blocking callers, else sk_sndtimeo. */
static inline long sock_sndtimeo(const struct sock *sk, bool noblock)
{
        if (noblock)
                return 0;

        return sk->sk_sndtimeo;
}

/* Compute the receive low-water mark for a read of @len bytes: @len itself
 * when @waitall is set, otherwise the smaller of sk_rcvlowat and @len.
 * A result of 0 is raised to 1.
 */
static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len)
{
        int target;

        if (waitall)
                target = len;
        else
                target = min_t(int, READ_ONCE(sk->sk_rcvlowat), len);

        return target ? target : 1;
}

/* Alas, with timeout socket operations are not restartable.
 * Compare this to poll().
 *
 * Only an infinite timeout (MAX_SCHEDULE_TIMEOUT) maps to -ERESTARTSYS;
 * any other value maps to -EINTR.
 */
static inline int sock_intr_errno(long timeo)
{
        if (timeo == MAX_SCHEDULE_TIMEOUT)
                return -ERESTARTSYS;

        return -EINTR;
}

/* Socket-layer private data kept in the tail of skb->cb[] (see SOCK_SKB_CB). */
struct sock_skb_cb {
        u32 dropcount;        /* filled in by sock_skb_set_dropcount() */
};

/* Store sock_skb_cb at the end of skb->cb[] so protocol families
 * using skb->cb[] would keep using it directly and utilize its
 * alignment guarantee.
 *
 * SOCK_SKB_CB_OFFSET is the byte offset of that tail area inside cb[],
 * SOCK_SKB_CB() returns a pointer to it, and sock_skb_cb_check_size()
 * is a build-time check that a structure of @size bytes does not
 * extend past SOCK_SKB_CB_OFFSET.
 */
#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \
                            sizeof(struct sock_skb_cb)))

#define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \
                            SOCK_SKB_CB_OFFSET))

#define sock_skb_cb_check_size(size) \
        BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET)

static inline void
sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
{
        SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
                                                atomic_read(&sk->sk_drops) : 0;
}

static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
{
        int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);

        atomic_add(segs, &sk->sk_drops);
}

/* Read sk->sk_stamp.
 * When BITS_PER_LONG is 32 the value is read inside a sk_stamp_seq
 * read-side retry loop; otherwise a single READ_ONCE() is used.
 */
static inline ktime_t sock_read_timestamp(struct sock *sk)
{
#if BITS_PER_LONG==32
        unsigned int seq;
        ktime_t kt;

        /* Retry until no writer raced with the read. */
        do {
                seq = read_seqbegin(&sk->sk_stamp_seq);
                kt = sk->sk_stamp;
        } while (read_seqretry(&sk->sk_stamp_seq, seq));

        return kt;
#else
        return READ_ONCE(sk->sk_stamp);
#endif
}

/* Store @kt in sk->sk_stamp; the write-side counterpart of
 * sock_read_timestamp(). When BITS_PER_LONG is 32 the store happens under
 * the sk_stamp_seq write lock; otherwise a single WRITE_ONCE() is used.
 */
static inline void sock_write_timestamp(struct sock *sk, ktime_t kt)
{
#if BITS_PER_LONG==32
        write_seqlock(&sk->sk_stamp_seq);
        sk->sk_stamp = kt;
        write_sequnlock(&sk->sk_stamp_seq);
#else
        WRITE_ONCE(sk->sk_stamp, kt);
#endif
}

void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
                           struct sk_buff *skb);
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
                             struct sk_buff *skb);

static inline void
sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
{
        struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb);
        u32 tsflags = READ_ONCE(sk->sk_tsflags);
        ktime_t kt = skb->tstamp;
        /*
         * generate control messages if
         * - receive time stamping in software requested
         * - software time stamp available and wanted
         * - hardware time stamps available and wanted
         */
        if (sock_flag(sk, SOCK_RCVTSTAMP) ||
            (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
            (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
            (hwtstamps->hwtstamp &&
             (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
                __sock_recv_timestamp(msg, sk, skb);
        else
                sock_write_timestamp(sk, kt);

        if (sock_flag(sk, SOCK_WIFI_STATUS) && skb_wifi_acked_valid(skb))
                __sock_recv_wifi_status(msg, sk, skb);
}

void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
                       struct sk_buff *skb);

/* Value compared against sk_stamp in sock_recv_cmsgs(); a socket still
 * holding it gets its stamp reset to 0 there.
 */
#define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC)
/* Decide how receive control messages / timestamps are handled for @skb:
 * - any of the FLAGS_RECV_CMSGS socket flags or TSFLAGS_ANY timestamping
 *   flags set: defer to __sock_recv_cmsgs();
 * - else, with SOCK_TIMESTAMP: remember the skb timestamp on the socket;
 * - else, if the socket stamp is still SK_DEFAULT_STAMP: reset it to 0.
 */
static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
                                   struct sk_buff *skb)
{
/* Note: these macros are defined inside the function body but, being
 * preprocessor definitions, stay visible for the rest of the file.
 */
#define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL)                        | \
                           (1UL << SOCK_RCVTSTAMP)                        | \
                           (1UL << SOCK_RCVMARK))
#define TSFLAGS_ANY          (SOF_TIMESTAMPING_SOFTWARE                        | \
                           SOF_TIMESTAMPING_RAW_HARDWARE)

        if (sk->sk_flags & FLAGS_RECV_CMSGS ||
            READ_ONCE(sk->sk_tsflags) & TSFLAGS_ANY)
                __sock_recv_cmsgs(msg, sk, skb);
        else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
                sock_write_timestamp(sk, skb->tstamp);
        else if (unlikely(sock_read_timestamp(sk) == SK_DEFAULT_STAMP))
                sock_write_timestamp(sk, 0);
}

void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags);

/**
 * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
 * @sk:                socket sending this packet
 * @tsflags:        timestamping flags to use
 * @tx_flags:        completed with instructions for time stamping
 * @tskey:      filled in with next sk_tskey (not for TCP, which uses seqno)
 *
 * Note: callers should take care of initial ``*tx_flags`` value (usually 0)
 */
static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags,
                                      __u8 *tx_flags, __u32 *tskey)
{
        if (unlikely(tsflags)) {
                __sock_tx_timestamp(tsflags, tx_flags);

                /* Hand out the next key only when the caller asked for one,
                 * OPT_ID is set and at least one TX record flag is requested.
                 */
                if (tskey &&
                    (tsflags & SOF_TIMESTAMPING_OPT_ID) &&
                    (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK))
                        *tskey = atomic_inc_return(&sk->sk_tskey) - 1;
        }

        /* The wifi status request is independent of tsflags. */
        if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS)))
                *tx_flags |= SKBTX_WIFI_STATUS;
}

/* _sock_tx_timestamp() for callers that do not want a timestamp key
 * (NULL is passed for @tskey).
 */
static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags,
                                     __u8 *tx_flags)
{
        _sock_tx_timestamp(sk, tsflags, tx_flags, NULL);
}

/* Apply TX timestamping for @skb's socket, writing the resulting tx_flags
 * and tskey into the skb's shared info.
 */
static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
{
        struct skb_shared_info *shinfo = skb_shinfo(skb);

        _sock_tx_timestamp(skb->sk, tsflags, &shinfo->tx_flags,
                           &shinfo->tskey);
}

static inline bool sk_is_inet(const struct sock *sk)
{
        int family = READ_ONCE(sk->sk_family);

        return family == AF_INET || family == AF_INET6;
}

static inline bool sk_is_tcp(const struct sock *sk)
{
        return sk_is_inet(sk) &&
               sk->sk_type == SOCK_STREAM &&
               sk->sk_protocol == IPPROTO_TCP;
}

static inline bool sk_is_udp(const struct sock *sk)
{
        return sk_is_inet(sk) &&
               sk->sk_type == SOCK_DGRAM &&
               sk->sk_protocol == IPPROTO_UDP;
}

static inline bool sk_is_stream_unix(const struct sock *sk)
{
        return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM;
}

/**
 * sk_eat_skb - Release a skb if it is no longer needed
 * @sk: socket to eat this skb from
 * @skb: socket buffer to eat
 *
 * Unlinks @skb from @sk's receive queue and frees it.
 *
 * This routine must be called with interrupts disabled or with the socket
 * locked so that the sk_buff queue operation is ok.
 */
static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
{
        __skb_unlink(skb, &sk->sk_receive_queue);
        __kfree_skb(skb);
}

/* Report whether @skb's destructor is sock_pfree.
 * Always false when CONFIG_INET is not set.
 */
static inline bool
skb_sk_is_prefetched(struct sk_buff *skb)
{
#ifdef CONFIG_INET
        return skb->destructor == sock_pfree;
#else
        return false;
#endif /* CONFIG_INET */
}

/* This helper checks if a socket is a full socket,
 * i.e. its state is neither TIME_WAIT nor NEW_SYN_RECV
 * (_not_ a timewait or request socket).
 */
static inline bool sk_fullsock(const struct sock *sk)
{
        const int mini_states = TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV;

        return ((1 << sk->sk_state) & ~mini_states) != 0;
}

static inline bool
sk_is_refcounted(struct sock *sk)
{
        /* Only full sockets have sk->sk_flags. */
        return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE);
}

/* Checks if this SKB belongs to an HW offloaded socket
 * and whether any SW fallbacks are required based on dev.
 * Check decrypted mark in case skb_orphan() cleared socket.
 *
 * Returns the skb to transmit (possibly replaced by the socket's
 * sk_validate_xmit_skb hook) or NULL when the skb was dropped here.
 * Without CONFIG_SOCK_VALIDATE_XMIT the skb is returned unchanged.
 */
static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
                                                   struct net_device *dev)
{
#ifdef CONFIG_SOCK_VALIDATE_XMIT
        struct sock *sk = skb->sk;

        if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) {
                skb = sk->sk_validate_xmit_skb(sk, dev, skb);
        } else if (unlikely(skb_is_decrypted(skb))) {
                /* Decrypted data with no usable socket hook: warn and drop. */
                pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
                kfree_skb(skb);
                skb = NULL;
        }
#endif

        return skb;
}

/* This helper checks if a socket is in LISTEN or NEW_SYN_RECV state.
 * SYNACK messages can be attached to either one (depending on SYNCOOKIE).
 */
static inline bool sk_listener(const struct sock *sk)
{
        const int listener_states = TCPF_LISTEN | TCPF_NEW_SYN_RECV;

        return ((1 << sk->sk_state) & listener_states) != 0;
}

void sock_enable_timestamp(struct sock *sk, enum sock_flags flag);
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level,
                       int type);

bool sk_ns_capable(const struct sock *sk,
                   struct user_namespace *user_ns, int cap);
bool sk_capable(const struct sock *sk, int cap);
bool sk_net_capable(const struct sock *sk, int cap);

void sk_get_meminfo(const struct sock *sk, u32 *meminfo);

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 *
 * Both limits are _SK_MEM_PACKETS times SKB_TRUESIZE(256).
 */
#define _SK_MEM_PACKETS                256
#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
#define SK_WMEM_MAX                (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX                (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;

extern int sysctl_tstamp_allow_data;

extern __u32 sysctl_wmem_default;
extern __u32 sysctl_rmem_default;

#define SKB_FRAG_PAGE_ORDER        get_order(32768)
DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);

/* Read the protocol's first sysctl_wmem value for this socket.
 *
 * When the protocol declares a non-zero sysctl_wmem_offset, the value is
 * read at that byte offset inside the socket's struct net (per-netns
 * setting); otherwise the protocol-wide sysctl_wmem pointer is used.
 */
static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto)
{
        int *wmem0;

        if (proto->sysctl_wmem_offset)
                wmem0 = (void *)sock_net(sk) + proto->sysctl_wmem_offset;
        else
                wmem0 = proto->sysctl_wmem;

        return READ_ONCE(*wmem0);
}

/* Read the protocol's first sysctl_rmem value for this socket.
 *
 * When the protocol declares a non-zero sysctl_rmem_offset, the value is
 * read at that byte offset inside the socket's struct net (per-netns
 * setting); otherwise the protocol-wide sysctl_rmem pointer is used.
 */
static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto)
{
        int *rmem0;

        if (proto->sysctl_rmem_offset)
                rmem0 = (void *)sock_net(sk) + proto->sysctl_rmem_offset;
        else
                rmem0 = proto->sysctl_rmem;

        return READ_ONCE(*rmem0);
}

/* Default TCP Small queue budget is ~1 ms of data (1sec >> 10).
 * Some wifi drivers need to tweak it to get more chunks; they can call
 * this helper from their ndo_start_xmit().
 *
 * The store is skipped for a NULL or non-full socket, and when the
 * socket already carries the requested shift.
 */
static inline void sk_pacing_shift_update(struct sock *sk, int val)
{
        if (sk && sk_fullsock(sk) && READ_ONCE(sk->sk_pacing_shift) != val)
                WRITE_ONCE(sk->sk_pacing_shift, val);
}

/* Check whether device index @dif is acceptable for socket @sk.
 *
 * Returns true when the socket is not bound to a device, when it is
 * bound to @dif itself, or when it is bound to an L3 master device and
 * @dif is enslaved to that same L3 master.
 */
static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
{
        const int sk_ifindex = READ_ONCE(sk->sk_bound_dev_if);
        int master_ifindex;

        if (sk_ifindex == 0 || sk_ifindex == dif)
                return true;

        /* Fall back to comparing against the L3 master of @dif, if any. */
        master_ifindex = l3mdev_master_ifindex_by_index(sock_net(sk), dif);

        return master_ifindex != 0 && master_ifindex == sk_ifindex;
}

void sock_def_readable(struct sock *sk);

int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
int sock_set_timestamping(struct sock *sk, int optname,
                          struct so_timestamping timestamping);

void sock_enable_timestamps(struct sock *sk);
void sock_no_linger(struct sock *sk);
void sock_set_keepalive(struct sock *sk);
void sock_set_priority(struct sock *sk, u32 priority);
void sock_set_rcvbuf(struct sock *sk, int val);
void sock_set_mark(struct sock *sk, u32 val);
void sock_set_reuseaddr(struct sock *sk);
void sock_set_reuseport(struct sock *sk);
void sock_set_sndtimeo(struct sock *sk, s64 secs);

int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len);

int sock_get_timeout(long timeo, void *optval, bool old_timeval);
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
                           sockptr_t optval, int optlen, bool old_timeval);

int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
                     void __user *arg, void *karg, size_t size);
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
/* Ask the socket's protocol whether the socket is readable.
 * Protocols without a sock_is_readable callback report false.
 */
static inline bool sk_is_readable(struct sock *sk)
{
        if (!sk->sk_prot->sock_is_readable)
                return false;

        return sk->sk_prot->sock_is_readable(sk);
}
#endif        /* _SOCK_H */



























































    2 

















    2 

    2 
    2 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
// SPDX-License-Identifier: GPL-2.0-only
/*
 * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
 *
 * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
 * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
 */
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/route.h>
#include <linux/ip.h>
#include <net/ip.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("iptables mangle table");

/* Bitmask with one bit per netfilter hook number used by the mangle table:
 * PRE_ROUTING, LOCAL_IN, FORWARD, LOCAL_OUT and POST_ROUTING.
 */
#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
                            (1 << NF_INET_LOCAL_IN) | \
                            (1 << NF_INET_FORWARD) | \
                            (1 << NF_INET_LOCAL_OUT) | \
                            (1 << NF_INET_POST_ROUTING))

/* Template description of the IPv4 "mangle" table: its name, the hooks it
 * is valid on, the owning module, the address family and the hook priority.
 */
static const struct xt_table packet_mangler = {
        .name                = "mangle",
        .valid_hooks        = MANGLE_VALID_HOOKS,
        .me                = THIS_MODULE,
        .af                = NFPROTO_IPV4,
        .priority        = NF_IP_PRI_MANGLE,
};

/* LOCAL_OUT handler for the mangle table.
 *
 * Runs the table and, unless the packet was dropped or stolen, re-routes
 * the packet when the rules changed anything that could affect routing
 * (source/destination address, skb mark or TOS).  A re-routing failure
 * turns the verdict into NF_DROP_ERR(err).
 */
static unsigned int
ipt_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
        /* Snapshot everything which could affect the route. */
        const u32 old_mark = skb->mark;
        const struct iphdr *iph = ip_hdr(skb);
        const __be32 old_saddr = iph->saddr;
        const __be32 old_daddr = iph->daddr;
        const u8 old_tos = iph->tos;
        unsigned int ret, verdict;
        int err;

        ret = ipt_do_table(priv, skb, state);
        verdict = ret & NF_VERDICT_MASK;
        if (verdict == NF_DROP || verdict == NF_STOLEN)
                return ret;

        /* The header is looked up again after the table has run. */
        iph = ip_hdr(skb);
        if (iph->saddr == old_saddr &&
            iph->daddr == old_daddr &&
            skb->mark == old_mark &&
            iph->tos == old_tos)
                return ret;

        /* Reroute for ANY change. */
        err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
        if (err < 0)
                ret = NF_DROP_ERR(err);

        return ret;
}

/* Netfilter entry point for the mangle table.
 * LOCAL_OUT traffic goes through ipt_mangle_out(); every other hook runs
 * the table directly.
 */
static unsigned int
iptable_mangle_hook(void *priv,
                     struct sk_buff *skb,
                     const struct nf_hook_state *state)
{
        return state->hook == NF_INET_LOCAL_OUT ?
               ipt_mangle_out(priv, skb, state) :
               ipt_do_table(priv, skb, state);
}

static struct nf_hook_ops *mangle_ops __read_mostly;
/* Instantiate the mangle table in @net.
 *
 * Allocates the initial ruleset for packet_mangler, registers the table
 * with mangle_ops and frees the temporary ruleset again, whatever the
 * registration result.  Returns -ENOMEM if the ruleset cannot be
 * allocated, otherwise the result of ipt_register_table().
 */
static int iptable_mangle_table_init(struct net *net)
{
        struct ipt_replace *initial = ipt_alloc_initial_table(&packet_mangler);
        int err;

        if (!initial)
                return -ENOMEM;

        err = ipt_register_table(net, &packet_mangler, initial, mangle_ops);
        kfree(initial);

        return err;
}

/* Per-netns pre_exit callback: runs the pre-exit unregistration step for
 * the "mangle" table of @net.
 */
static void __net_exit iptable_mangle_net_pre_exit(struct net *net)
{
        ipt_unregister_table_pre_exit(net, "mangle");
}

/* Per-netns exit callback: runs the exit unregistration step for the
 * "mangle" table of @net.
 */
static void __net_exit iptable_mangle_net_exit(struct net *net)
{
        ipt_unregister_table_exit(net, "mangle");
}

/* Per-network-namespace operations; only the teardown callbacks are set. */
static struct pernet_operations iptable_mangle_net_ops = {
        .pre_exit = iptable_mangle_net_pre_exit,
        .exit = iptable_mangle_net_exit,
};

/* Module init: register the table template, allocate the hook ops and
 * register the per-netns operations.  Each failure undoes the steps that
 * already succeeded and returns the error code.
 */
static int __init iptable_mangle_init(void)
{
        int err;

        err = xt_register_template(&packet_mangler, iptable_mangle_table_init);
        if (err < 0)
                return err;

        mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook);
        if (IS_ERR(mangle_ops)) {
                xt_unregister_template(&packet_mangler);
                return PTR_ERR(mangle_ops);
        }

        err = register_pernet_subsys(&iptable_mangle_net_ops);
        if (err < 0) {
                /* Roll back: drop the template, then the hook ops. */
                xt_unregister_template(&packet_mangler);
                kfree(mangle_ops);
        }

        return err;
}

/* Module exit: unregister the per-netns operations and the table
 * template, then free the hook ops allocated in iptable_mangle_init().
 */
static void __exit iptable_mangle_fini(void)
{
        unregister_pernet_subsys(&iptable_mangle_net_ops);
        xt_unregister_template(&packet_mangler);
        kfree(mangle_ops);
}

module_init(iptable_mangle_init);
module_exit(iptable_mangle_fini);

















































































































































































































    1 






















































    1 






















































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM tcp

#if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TCP_H

#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/tracepoint.h>
#include <net/ipv6.h>
#include <net/tcp.h>
#include <linux/sock_diag.h>
#include <net/rstreason.h>

/*
 * tcp event with arguments sk and skb
 *
 * Note: this class requires a valid sk pointer, while the skb pointer may
 *       be NULL: skb is only stored as an address, whereas sk is
 *       dereferenced to read the state, ports, family and addresses.
 */
DECLARE_EVENT_CLASS(tcp_event_sk_skb,

        TP_PROTO(const struct sock *sk, const struct sk_buff *skb),

        TP_ARGS(sk, skb),

        TP_STRUCT__entry(
                __field(const void *, skbaddr)
                __field(const void *, skaddr)
                __field(int, state)
                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u16, family)
                __array(__u8, saddr, 4)
                __array(__u8, daddr, 4)
                __array(__u8, saddr_v6, 16)
                __array(__u8, daddr_v6, 16)
        ),

        TP_fast_assign(
                const struct inet_sock *inet = inet_sk(sk);
                __be32 *p32;

                __entry->skbaddr = skb;
                __entry->skaddr = sk;
                __entry->state = sk->sk_state;

                /* Ports are converted with ntohs() before being recorded. */
                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);
                __entry->family = sk->sk_family;

                /* Store the IPv4 addresses as one 32-bit word each. */
                p32 = (__be32 *) __entry->saddr;
                *p32 = inet->inet_saddr;

                p32 = (__be32 *) __entry->daddr;
                *p32 =  inet->inet_daddr;

                /* NOTE(review): TP_STORE_ADDRS is defined outside this chunk;
                 * presumably it fills saddr_v6/daddr_v6 - confirm.
                 */
                TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
                              sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
        ),

        TP_printk("skbaddr=%p skaddr=%p family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s",
                  __entry->skbaddr, __entry->skaddr,
                  show_family_name(__entry->family),
                  __entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
                  __entry->saddr_v6, __entry->daddr_v6,
                  show_tcp_state_name(__entry->state))
);

/* tcp_retransmit_skb: instance of the tcp_event_sk_skb class. */
DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,

        TP_PROTO(const struct sock *sk, const struct sk_buff *skb),

        TP_ARGS(sk, skb)
);

/* First expansion of DEFINE_RST_REASON: emit one TRACE_DEFINE_ENUM() per
 * SK_RST_REASON_* value.
 */
#undef FN
#define FN(reason)        TRACE_DEFINE_ENUM(SK_RST_REASON_##reason);
DEFINE_RST_REASON(FN, FN)

/* Redefine FN/FNe so that a later DEFINE_RST_REASON(FN, FNe) expands to
 * { value, "name" } pairs; FNe is the variant without a trailing comma.
 */
#undef FN
#undef FNe
#define FN(reason)        { SK_RST_REASON_##reason, #reason },
#define FNe(reason)        { SK_RST_REASON_##reason, #reason }

/*
 * Tracepoint describing a TCP reset that is about to be sent.
 *
 * skb of trace_tcp_send_reset is the skb that caused RST. In case of
 * active reset, skb should be NULL.
 *
 * sk may be NULL too: the state is then recorded as 0 and printed as
 * "UNKNOWN". Both address buffers are zeroed first, so they stay all-zero
 * when neither a full socket nor an skb is available.
 */
TRACE_EVENT(tcp_send_reset,

        TP_PROTO(const struct sock *sk,
                 const struct sk_buff *skb,
                 const enum sk_rst_reason reason),

        TP_ARGS(sk, skb, reason),

        TP_STRUCT__entry(
                __field(const void *, skbaddr)
                __field(const void *, skaddr)
                __field(int, state)
                __field(enum sk_rst_reason, reason)
                __array(__u8, saddr, sizeof(struct sockaddr_in6))
                __array(__u8, daddr, sizeof(struct sockaddr_in6))
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->skaddr = sk;
                /* Zero means unknown state. */
                __entry->state = sk ? sk->sk_state : 0;

                memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
                memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));

                if (sk && sk_fullsock(sk)) {
                        const struct inet_sock *inet = inet_sk(sk);

                        TP_STORE_ADDR_PORTS(__entry, inet, sk);
                } else if (skb) {
                        const struct tcphdr *th = (const struct tcphdr *)skb->data;
                        /*
                         * We should reverse the 4-tuple of skb, so later
                         * it can print the right flow direction of rst:
                         * daddr is passed where the source buffer is
                         * expected and saddr where the destination is.
                         * Use the __entry accessor like every other event
                         * in this file.
                         */
                        TP_STORE_ADDR_PORTS_SKB(skb, th, __entry->daddr,
                                                __entry->saddr);
                }
                __entry->reason = reason;
        ),

        TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s reason=%s",
                  __entry->skbaddr, __entry->skaddr,
                  __entry->saddr, __entry->daddr,
                  __entry->state ? show_tcp_state_name(__entry->state) : "UNKNOWN",
                  __print_symbolic(__entry->reason, DEFINE_RST_REASON(FN, FNe)))
);

#undef FN
#undef FNe

/*
 * tcp event with arguments sk
 *
 * Note: this class requires a valid sk pointer; it is dereferenced to read
 *       the ports, family and addresses and passed to sock_gen_cookie().
 */
DECLARE_EVENT_CLASS(tcp_event_sk,

        TP_PROTO(struct sock *sk),

        TP_ARGS(sk),

        TP_STRUCT__entry(
                __field(const void *, skaddr)
                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u16, family)
                __array(__u8, saddr, 4)
                __array(__u8, daddr, 4)
                __array(__u8, saddr_v6, 16)
                __array(__u8, daddr_v6, 16)
                __field(__u64, sock_cookie)
        ),

        TP_fast_assign(
                struct inet_sock *inet = inet_sk(sk);
                __be32 *p32;

                __entry->skaddr = sk;

                /* Ports are converted with ntohs() before being recorded. */
                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);
                __entry->family = sk->sk_family;

                /* Store the IPv4 addresses as one 32-bit word each. */
                p32 = (__be32 *) __entry->saddr;
                *p32 = inet->inet_saddr;

                p32 = (__be32 *) __entry->daddr;
                *p32 =  inet->inet_daddr;

                /* NOTE(review): TP_STORE_ADDRS is defined outside this chunk;
                 * presumably it fills saddr_v6/daddr_v6 - confirm.
                 */
                TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
                               sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);

                __entry->sock_cookie = sock_gen_cookie(sk);
        ),

        TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c sock_cookie=%llx",
                  show_family_name(__entry->family),
                  __entry->sport, __entry->dport,
                  __entry->saddr, __entry->daddr,
                  __entry->saddr_v6, __entry->daddr_v6,
                  __entry->sock_cookie)
);

/* tcp_receive_reset: instance of the tcp_event_sk class. */
DEFINE_EVENT(tcp_event_sk, tcp_receive_reset,

        TP_PROTO(struct sock *sk),

        TP_ARGS(sk)
);

/* tcp_destroy_sock: instance of the tcp_event_sk class. */
DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock,

        TP_PROTO(struct sock *sk),

        TP_ARGS(sk)
);

/* tcp_rcv_space_adjust: instance of the tcp_event_sk class. */
DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust,

        TP_PROTO(struct sock *sk),

        TP_ARGS(sk)
);

/*
 * Tracepoint taking a socket and a request socket.
 *
 * Ports and addresses are read from the request socket (via inet_rsk());
 * only the address family is taken from sk. Both pointers are
 * dereferenced, so both must be valid.
 */
TRACE_EVENT(tcp_retransmit_synack,

        TP_PROTO(const struct sock *sk, const struct request_sock *req),

        TP_ARGS(sk, req),

        TP_STRUCT__entry(
                __field(const void *, skaddr)
                __field(const void *, req)
                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u16, family)
                __array(__u8, saddr, 4)
                __array(__u8, daddr, 4)
                __array(__u8, saddr_v6, 16)
                __array(__u8, daddr_v6, 16)
        ),

        TP_fast_assign(
                struct inet_request_sock *ireq = inet_rsk(req);
                __be32 *p32;

                __entry->skaddr = sk;
                __entry->req = req;

                /* ir_num is stored as is; only ir_rmt_port goes through ntohs(). */
                __entry->sport = ireq->ir_num;
                __entry->dport = ntohs(ireq->ir_rmt_port);
                __entry->family = sk->sk_family;

                /* Store the IPv4 addresses as one 32-bit word each. */
                p32 = (__be32 *) __entry->saddr;
                *p32 = ireq->ir_loc_addr;

                p32 = (__be32 *) __entry->daddr;
                *p32 = ireq->ir_rmt_addr;

                /* NOTE(review): TP_STORE_ADDRS is defined outside this chunk;
                 * presumably it fills saddr_v6/daddr_v6 - confirm.
                 */
                TP_STORE_ADDRS(__entry, ireq->ir_loc_addr, ireq->ir_rmt_addr,
                              ireq->ir_v6_loc_addr, ireq->ir_v6_rmt_addr);
        ),

        TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
                  show_family_name(__entry->family),
                  __entry->sport, __entry->dport,
                  __entry->saddr, __entry->daddr,
                  __entry->saddr_v6, __entry->daddr_v6)
);

#include <trace/events/net_probe_common.h>

/*
 * Tracepoint recording a snapshot of a TCP socket's send/receive state
 * together with the skb being processed.
 *
 * Both sk and skb are dereferenced, so both must be valid. The TCP header
 * is taken from skb->data.
 */
TRACE_EVENT(tcp_probe,

        TP_PROTO(struct sock *sk, struct sk_buff *skb),

        TP_ARGS(sk, skb),

        TP_STRUCT__entry(
                /* sockaddr_in6 is always bigger than sockaddr_in */
                __array(__u8, saddr, sizeof(struct sockaddr_in6))
                __array(__u8, daddr, sizeof(struct sockaddr_in6))
                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u16, family)
                __field(__u32, mark)
                __field(__u16, data_len)
                __field(__u32, snd_nxt)
                __field(__u32, snd_una)
                __field(__u32, snd_cwnd)
                __field(__u32, ssthresh)
                __field(__u32, snd_wnd)
                __field(__u32, srtt)
                __field(__u32, rcv_wnd)
                __field(__u64, sock_cookie)
                __field(const void *, skbaddr)
                __field(const void *, skaddr)
        ),

        TP_fast_assign(
                const struct tcphdr *th = (const struct tcphdr *)skb->data;
                const struct inet_sock *inet = inet_sk(sk);
                const struct tcp_sock *tp = tcp_sk(sk);

                /* Start from zeroed address buffers. */
                memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
                memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));

                TP_STORE_ADDR_PORTS(__entry, inet, sk);

                /* For filtering use */
                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);
                __entry->mark = skb->mark;
                __entry->family = sk->sk_family;

                /* skb length minus the TCP header length.
                 * NOTE(review): stored in a __u16 field - confirm the
                 * difference always fits.
                 */
                __entry->data_len = skb->len - __tcp_hdrlen(th);
                __entry->snd_nxt = tp->snd_nxt;
                __entry->snd_una = tp->snd_una;
                __entry->snd_cwnd = tcp_snd_cwnd(tp);
                __entry->snd_wnd = tp->snd_wnd;
                __entry->rcv_wnd = tp->rcv_wnd;
                __entry->ssthresh = tcp_current_ssthresh(sk);
                /* NOTE(review): presumably srtt_us is kept scaled by 8,
                 * hence the shift - confirm.
                 */
                __entry->srtt = tp->srtt_us >> 3;
                __entry->sock_cookie = sock_gen_cookie(sk);

                __entry->skbaddr = skb;
                __entry->skaddr = sk;
        ),

        TP_printk("family=%s src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx skbaddr=%p skaddr=%p",
                  show_family_name(__entry->family),
                  __entry->saddr, __entry->daddr, __entry->mark,
                  __entry->data_len, __entry->snd_nxt, __entry->snd_una,
                  __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
                  __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie,
                  __entry->skbaddr, __entry->skaddr)
);

/*
 * tcp event with only skb
 *
 * The skb must be valid: skb->data is interpreted as the TCP header and
 * handed to TP_STORE_ADDR_PORTS_SKB() together with the zeroed source and
 * destination buffers.
 */
DECLARE_EVENT_CLASS(tcp_event_skb,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb),

        TP_STRUCT__entry(
                __field(const void *, skbaddr)
                __array(__u8, saddr, sizeof(struct sockaddr_in6))
                __array(__u8, daddr, sizeof(struct sockaddr_in6))
        ),

        TP_fast_assign(
                const struct tcphdr *th = (const struct tcphdr *)skb->data;
                __entry->skbaddr = skb;

                /* Start from zeroed address buffers. */
                memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
                memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));

                TP_STORE_ADDR_PORTS_SKB(skb, th, __entry->saddr, __entry->daddr);
        ),

        TP_printk("skbaddr=%p src=%pISpc dest=%pISpc",
                  __entry->skbaddr, __entry->saddr, __entry->daddr)
);

/* tcp_bad_csum: instance of the tcp_event_skb class. */
DEFINE_EVENT(tcp_event_skb, tcp_bad_csum,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * Tracepoint taking a socket and a congestion-control state value.
 *
 * Records the socket's ports, family and addresses plus the ca_state
 * argument as given. sk is dereferenced and must be valid.
 */
TRACE_EVENT(tcp_cong_state_set,

        TP_PROTO(struct sock *sk, const u8 ca_state),

        TP_ARGS(sk, ca_state),

        TP_STRUCT__entry(
                __field(const void *, skaddr)
                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u16, family)
                __array(__u8, saddr, 4)
                __array(__u8, daddr, 4)
                __array(__u8, saddr_v6, 16)
                __array(__u8, daddr_v6, 16)
                __field(__u8, cong_state)
        ),

        TP_fast_assign(
                struct inet_sock *inet = inet_sk(sk);
                __be32 *p32;

                __entry->skaddr = sk;

                /* Ports are converted with ntohs() before being recorded. */
                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);
                __entry->family = sk->sk_family;

                /* Store the IPv4 addresses as one 32-bit word each. */
                p32 = (__be32 *) __entry->saddr;
                *p32 = inet->inet_saddr;

                p32 = (__be32 *) __entry->daddr;
                *p32 =  inet->inet_daddr;

                /* NOTE(review): TP_STORE_ADDRS is defined outside this chunk;
                 * presumably it fills saddr_v6/daddr_v6 - confirm.
                 */
                TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
                           sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);

                __entry->cong_state = ca_state;
        ),

        TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u",
                  show_family_name(__entry->family),
                  __entry->sport, __entry->dport,
                  __entry->saddr, __entry->daddr,
                  __entry->saddr_v6, __entry->daddr_v6,
                  __entry->cong_state)
);

/*
 * Event class taking a socket and an skb.
 *
 * Addresses, ports and TCP flags are taken from the skb (skb->data is
 * interpreted as the TCP header); the net cookie, state and family come
 * from sk. Both pointers are dereferenced and must be valid.
 */
DECLARE_EVENT_CLASS(tcp_hash_event,

        TP_PROTO(const struct sock *sk, const struct sk_buff *skb),

        TP_ARGS(sk, skb),

        TP_STRUCT__entry(
                __field(__u64, net_cookie)
                __field(const void *, skbaddr)
                __field(const void *, skaddr)
                __field(int, state)

                /* sockaddr_in6 is always bigger than sockaddr_in */
                __array(__u8, saddr, sizeof(struct sockaddr_in6))
                __array(__u8, daddr, sizeof(struct sockaddr_in6))
                __field(int, l3index)

                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u16, family)

                __field(bool, fin)
                __field(bool, syn)
                __field(bool, rst)
                __field(bool, psh)
                __field(bool, ack)
        ),

        TP_fast_assign(
                const struct tcphdr *th = (const struct tcphdr *)skb->data;

                __entry->net_cookie = sock_net(sk)->net_cookie;
                __entry->skbaddr = skb;
                __entry->skaddr = sk;
                __entry->state = sk->sk_state;

                /* Start from zeroed address buffers. */
                memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
                memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
                TP_STORE_ADDR_PORTS_SKB(skb, th, __entry->saddr, __entry->daddr);
                /* inet_iif(skb) when inet_sdif(skb) is non-zero, else 0. */
                __entry->l3index = inet_sdif(skb) ? inet_iif(skb) : 0;

                /* For filtering use */
                __entry->sport = ntohs(th->source);
                __entry->dport = ntohs(th->dest);
                __entry->family = sk->sk_family;

                __entry->fin = th->fin;
                __entry->syn = th->syn;
                __entry->rst = th->rst;
                __entry->psh = th->psh;
                __entry->ack = th->ack;
        ),

        /* Each flag prints as its letter ('.' for ACK) or a blank. */
        TP_printk("net=%llu state=%s family=%s src=%pISpc dest=%pISpc L3index=%d [%c%c%c%c%c]",
                  __entry->net_cookie,
                  show_tcp_state_name(__entry->state),
                  show_family_name(__entry->family),
                  __entry->saddr, __entry->daddr,
                  __entry->l3index,
                  __entry->fin ? 'F' : ' ',
                  __entry->syn ? 'S' : ' ',
                  __entry->rst ? 'R' : ' ',
                  __entry->psh ? 'P' : ' ',
                  __entry->ack ? '.' : ' ')
);

/* tcp_hash_bad_header: instance of the tcp_hash_event class. */
DEFINE_EVENT(tcp_hash_event, tcp_hash_bad_header,

        TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
        TP_ARGS(sk, skb)
);

/* tcp_hash_md5_required: instance of the tcp_hash_event class. */
DEFINE_EVENT(tcp_hash_event, tcp_hash_md5_required,

        TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
        TP_ARGS(sk, skb)
);

/* tcp_hash_md5_unexpected: instance of the tcp_hash_event class. */
DEFINE_EVENT(tcp_hash_event, tcp_hash_md5_unexpected,

        TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
        TP_ARGS(sk, skb)
);

/* tcp_hash_md5_mismatch: instance of the tcp_hash_event class. */
DEFINE_EVENT(tcp_hash_event, tcp_hash_md5_mismatch,

        TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
        TP_ARGS(sk, skb)
);

/* tcp_hash_ao_required: instance of the tcp_hash_event class. */
DEFINE_EVENT(tcp_hash_event, tcp_hash_ao_required,

        TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
        TP_ARGS(sk, skb)
);

/*
 * Event class taking a socket, an skb and three byte-sized values
 * (keyid, rnext, maclen).
 *
 * Records the same socket/skb information as tcp_hash_event and stores the
 * three extra arguments unchanged. Both sk and skb are dereferenced and
 * must be valid.
 */
DECLARE_EVENT_CLASS(tcp_ao_event,

        TP_PROTO(const struct sock *sk, const struct sk_buff *skb,
                 const __u8 keyid, const __u8 rnext, const __u8 maclen),

        TP_ARGS(sk, skb, keyid, rnext, maclen),

        TP_STRUCT__entry(
                __field(__u64, net_cookie)
                __field(const void *, skbaddr)
                __field(const void *, skaddr)
                __field(int, state)

                /* sockaddr_in6 is always bigger than sockaddr_in */
                __array(__u8, saddr, sizeof(struct sockaddr_in6))
                __array(__u8, daddr, sizeof(struct sockaddr_in6))
                __field(int, l3index)

                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u16, family)

                __field(bool, fin)
                __field(bool, syn)
                __field(bool, rst)
                __field(bool, psh)
                __field(bool, ack)

                __field(__u8, keyid)
                __field(__u8, rnext)
                __field(__u8, maclen)
        ),

        TP_fast_assign(
                const struct tcphdr *th = (const struct tcphdr *)skb->data;

                __entry->net_cookie = sock_net(sk)->net_cookie;
                __entry->skbaddr = skb;
                __entry->skaddr = sk;
                __entry->state = sk->sk_state;

                /* Start from zeroed address buffers. */
                memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
                memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
                TP_STORE_ADDR_PORTS_SKB(skb, th, __entry->saddr, __entry->daddr);
                /* inet_iif(skb) when inet_sdif(skb) is non-zero, else 0. */
                __entry->l3index = inet_sdif(skb) ? inet_iif(skb) : 0;

                /* For filtering use */
                __entry->sport = ntohs(th->source);
                __entry->dport = ntohs(th->dest);
                __entry->family = sk->sk_family;

                __entry->fin = th->fin;
                __entry->syn = th->syn;
                __entry->rst = th->rst;
                __entry->psh = th->psh;
                __entry->ack = th->ack;

                __entry->keyid = keyid;
                __entry->rnext = rnext;
                __entry->maclen = maclen;
        ),

        /* Each flag prints as its letter ('.' for ACK) or a blank. */
        TP_printk("net=%llu state=%s family=%s src=%pISpc dest=%pISpc L3index=%d [%c%c%c%c%c] keyid=%u rnext=%u maclen=%u",
                  __entry->net_cookie,
                  show_tcp_state_name(__entry->state),
                  show_family_name(__entry->family),
                  __entry->saddr, __entry->daddr,
                  __entry->l3index,
                  __entry->fin ? 'F' : ' ',
                  __entry->syn ? 'S' : ' ',
                  __entry->rst ? 'R' : ' ',
                  __entry->psh ? 'P' : ' ',
                  __entry->ack ? '.' : ' ',
                  __entry->keyid, __entry->rnext, __entry->maclen)
);

DEFINE_EVENT(tcp_ao_event, tcp_ao_handshake_failure,
        TP_PROTO(const struct sock *sk, const struct sk_buff *skb,
                 const __u8 keyid, const __u8 rnext, const __u8 maclen),
        TP_ARGS(sk, skb, keyid, rnext, maclen)
);

/*
 * tcp_ao_wrong_maclen - tracepoint instance of the tcp_ao_event class.
 * Same arguments as the class (sk, skb, keyid, rnext, maclen). Presumably
 * emitted when a segment carries an unexpected MAC length; the call sites
 * are outside this header, so confirm there.
 */
DEFINE_EVENT(tcp_ao_event, tcp_ao_wrong_maclen,
        TP_PROTO(const struct sock *sk, const struct sk_buff *skb,
                 const __u8 keyid, const __u8 rnext, const __u8 maclen),
        TP_ARGS(sk, skb, keyid, rnext, maclen)
);

/*
 * tcp_ao_mismatch - tracepoint instance of the tcp_ao_event class.
 * Same arguments as the class (sk, skb, keyid, rnext, maclen). Presumably
 * emitted on a TCP-AO verification mismatch; the call sites are outside
 * this header, so confirm there.
 */
DEFINE_EVENT(tcp_ao_event, tcp_ao_mismatch,
        TP_PROTO(const struct sock *sk, const struct sk_buff *skb,
                 const __u8 keyid, const __u8 rnext, const __u8 maclen),
        TP_ARGS(sk, skb, keyid, rnext, maclen)
);

/*
 * tcp_ao_key_not_found - tracepoint instance of the tcp_ao_event class.
 * Same arguments as the class (sk, skb, keyid, rnext, maclen). Presumably
 * emitted when no key matches the received keyid; the call sites are
 * outside this header, so confirm there.
 */
DEFINE_EVENT(tcp_ao_event, tcp_ao_key_not_found,
        TP_PROTO(const struct sock *sk, const struct sk_buff *skb,
                 const __u8 keyid, const __u8 rnext, const __u8 maclen),
        TP_ARGS(sk, skb, keyid, rnext, maclen)
);

/*
 * tcp_ao_rnext_request - tracepoint instance of the tcp_ao_event class.
 * Same arguments as the class (sk, skb, keyid, rnext, maclen). Presumably
 * emitted when the peer requests a different rnext key; the call sites are
 * outside this header, so confirm there.
 */
DEFINE_EVENT(tcp_ao_event, tcp_ao_rnext_request,
        TP_PROTO(const struct sock *sk, const struct sk_buff *skb,
                 const __u8 keyid, const __u8 rnext, const __u8 maclen),
        TP_ARGS(sk, skb, keyid, rnext, maclen)
);

/*
 * tcp_ao_event_sk - event class for TCP-AO tracepoints that take only a
 * socket (no skb).
 *
 * Each record stores the netns cookie, the socket pointer and state, the
 * source/destination address buffers, the host-order ports, the address
 * family, and the keyid/rnext values passed in by the caller.
 */
DECLARE_EVENT_CLASS(tcp_ao_event_sk,

        TP_PROTO(const struct sock *sk, const __u8 keyid, const __u8 rnext),

        TP_ARGS(sk, keyid, rnext),

        TP_STRUCT__entry(
                __field(__u64, net_cookie)
                __field(const void *, skaddr)
                __field(int, state)

                /* sockaddr_in6 is always bigger than sockaddr_in */
                __array(__u8, saddr, sizeof(struct sockaddr_in6))
                __array(__u8, daddr, sizeof(struct sockaddr_in6))

                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u16, family)

                __field(__u8, keyid)
                __field(__u8, rnext)
        ),

        TP_fast_assign(
                const struct inet_sock *inet = inet_sk(sk);

                __entry->net_cookie = sock_net(sk)->net_cookie;
                __entry->skaddr = sk;
                __entry->state = sk->sk_state;

                /*
                 * Clear both address buffers first; TP_STORE_ADDR_PORTS()
                 * (defined elsewhere) presumably fills them in -- confirm.
                 */
                memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
                memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
                TP_STORE_ADDR_PORTS(__entry, inet, sk);

                /* For filtering use; ports are stored in host byte order */
                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);
                __entry->family = sk->sk_family;

                __entry->keyid = keyid;
                __entry->rnext = rnext;
        ),

        /* saddr/daddr are printed through %pISpc, i.e. as socket addresses */
        TP_printk("net=%llu state=%s family=%s src=%pISpc dest=%pISpc keyid=%u rnext=%u",
                  __entry->net_cookie,
                  show_tcp_state_name(__entry->state),
                  show_family_name(__entry->family),
                  __entry->saddr, __entry->daddr,
                  __entry->keyid, __entry->rnext)
);

/*
 * tcp_ao_synack_no_key - tracepoint instance of the tcp_ao_event_sk class.
 * Takes the socket plus keyid and rnext. Presumably emitted when no key is
 * available for a SYN-ACK; the call sites are outside this header, so
 * confirm there.
 */
DEFINE_EVENT(tcp_ao_event_sk, tcp_ao_synack_no_key,
        TP_PROTO(const struct sock *sk, const __u8 keyid, const __u8 rnext),
        TP_ARGS(sk, keyid, rnext)
);

/*
 * tcp_ao_event_sne - event class for TCP-AO tracepoints that report a new
 * SNE value for a socket.
 *
 * Each record stores the netns cookie, the socket pointer and state, the
 * source/destination address buffers, the host-order ports, the address
 * family, and the 32-bit new_sne value passed in by the caller.
 * NOTE(review): "sne" presumably stands for Sequence Number Extension --
 * confirm against the TCP-AO code.
 */
DECLARE_EVENT_CLASS(tcp_ao_event_sne,

        TP_PROTO(const struct sock *sk, __u32 new_sne),

        TP_ARGS(sk, new_sne),

        TP_STRUCT__entry(
                __field(__u64, net_cookie)
                __field(const void *, skaddr)
                __field(int, state)

                /* sockaddr_in6 is always bigger than sockaddr_in */
                __array(__u8, saddr, sizeof(struct sockaddr_in6))
                __array(__u8, daddr, sizeof(struct sockaddr_in6))

                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u16, family)

                __field(__u32, new_sne)
        ),

        TP_fast_assign(
                const struct inet_sock *inet = inet_sk(sk);

                __entry->net_cookie = sock_net(sk)->net_cookie;
                __entry->skaddr = sk;
                __entry->state = sk->sk_state;

                /*
                 * Clear both address buffers first; TP_STORE_ADDR_PORTS()
                 * (defined elsewhere) presumably fills them in -- confirm.
                 */
                memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
                memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
                TP_STORE_ADDR_PORTS(__entry, inet, sk);

                /* For filtering use; ports are stored in host byte order */
                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);
                __entry->family = sk->sk_family;

                __entry->new_sne = new_sne;
        ),

        /* saddr/daddr are printed through %pISpc, i.e. as socket addresses */
        TP_printk("net=%llu state=%s family=%s src=%pISpc dest=%pISpc sne=%u",
                  __entry->net_cookie,
                  show_tcp_state_name(__entry->state),
                  show_family_name(__entry->family),
                  __entry->saddr, __entry->daddr,
                  __entry->new_sne)
);

/*
 * tcp_ao_snd_sne_update - tracepoint instance of the tcp_ao_event_sne class.
 * Takes the socket and the new SNE value; by its name it covers the send
 * direction (confirm against the callers).
 */
DEFINE_EVENT(tcp_ao_event_sne, tcp_ao_snd_sne_update,
        TP_PROTO(const struct sock *sk, __u32 new_sne),
        TP_ARGS(sk, new_sne)
);

/*
 * tcp_ao_rcv_sne_update - tracepoint instance of the tcp_ao_event_sne class.
 * Takes the socket and the new SNE value; by its name it covers the receive
 * direction (confirm against the callers).
 */
DEFINE_EVENT(tcp_ao_event_sne, tcp_ao_rcv_sne_update,
        TP_PROTO(const struct sock *sk, __u32 new_sne),
        TP_ARGS(sk, new_sne)
);

#endif /* _TRACE_TCP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>











































































    1 















    1 





















































    1 
    1 








































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
// SPDX-License-Identifier: GPL-2.0
/*
 * Out-of-line refcount functions.
 */

#include <linux/mutex.h>
#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/bug.h>

/*
 * Emit a one-time warning whose text is @str wrapped in the common
 * "refcount_t: " prefix and a trailing ".\n". @str has to be a string
 * literal, since the pieces are joined by literal concatenation.
 */
#define REFCOUNT_WARN(str)        WARN_ONCE(1, "refcount_t: " str ".\n")

/**
 * refcount_warn_saturate - saturate a refcount and warn about the cause
 * @r: the refcount
 * @t: the kind of saturation event being reported
 *
 * Unconditionally sets @r to REFCOUNT_SATURATED and then emits the warning
 * that corresponds to @t. An unrecognised @t gets a generic warning.
 */
void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t)
{
        /* Pin the counter at the saturation value before reporting. */
        refcount_set(r, REFCOUNT_SATURATED);

        /*
         * Every case is its own WARN_ONCE() site, including the two
         * overflow cases that share the same message text.
         */
        switch (t) {
        case REFCOUNT_ADD_NOT_ZERO_OVF:
                REFCOUNT_WARN("saturated; leaking memory");
                break;
        case REFCOUNT_ADD_OVF:
                REFCOUNT_WARN("saturated; leaking memory");
                break;
        case REFCOUNT_ADD_UAF:
                REFCOUNT_WARN("addition on 0; use-after-free");
                break;
        case REFCOUNT_SUB_UAF:
                REFCOUNT_WARN("underflow; use-after-free");
                break;
        case REFCOUNT_DEC_LEAK:
                REFCOUNT_WARN("decrement hit 0; leaking memory");
                break;
        default:
                REFCOUNT_WARN("unknown saturation event!?");
        }
}
EXPORT_SYMBOL(refcount_warn_saturate);

/**
 * refcount_dec_if_one - drop the reference only if it is the last one
 * @r: the refcount
 *
 * Attempts the single transition 1 -> 0 and reports whether it happened.
 * There is no atomic_t equivalent.
 *
 * As with the other decrement operations, release memory ordering and a
 * control dependency are provided.
 *
 * This serves as a try-delete primitive. It exists as a dedicated helper
 * rather than as a generic cmpxchg, because the latter would make it
 * possible to build unsafe operations.
 *
 * Return: true if the refcount went from 1 to 0, false otherwise
 */
bool refcount_dec_if_one(refcount_t *r)
{
        /* Value the counter must currently hold for the swap to succeed. */
        int expected = 1;

        return atomic_try_cmpxchg_release(&r->refs, &expected, 0);
}
EXPORT_SYMBOL(refcount_dec_if_one);

/**
 * refcount_dec_not_one - drop a reference unless it is the last one
 * @r: the refcount
 *
 * Decrements @r by one unless its value is 1, in which case nothing is
 * changed and false is returned. There is no atomic_t equivalent; the
 * historical idiom was atomic_add_unless(&var, -1, 1).
 *
 * A saturated refcount is left alone, and a decrement that would wrap
 * below zero is refused with a one-time warning; both cases report true.
 *
 * Return: false if the refcount was 1 (and was left untouched), true
 *         otherwise
 */
bool refcount_dec_not_one(refcount_t *r)
{
        unsigned int old = atomic_read(&r->refs);
        unsigned int next;

        for (;;) {
                /* Saturated counters never move again. */
                if (unlikely(old == REFCOUNT_SATURATED))
                        return true;

                /* The last reference is the caller's to handle. */
                if (old == 1)
                        return false;

                next = old - 1;
                if (next > old) {
                        /* old was 0: the subtraction wrapped around. */
                        WARN_ONCE(next > old, "refcount_t: underflow; use-after-free.\n");
                        return true;
                }

                /* On failure 'old' is refreshed and the checks are redone. */
                if (atomic_try_cmpxchg_release(&r->refs, &old, next))
                        return true;
        }
}
EXPORT_SYMBOL(refcount_dec_not_one);

/**
 * refcount_dec_and_mutex_lock - drop a reference and, if it was the last
 *                               one, return with the mutex held
 * @r: the refcount
 * @lock: the mutex to be locked
 *
 * Counterpart of atomic_dec_and_mutex_lock(): warns on underflow and does
 * not decrement a refcount that is saturated at REFCOUNT_SATURATED.
 *
 * Release memory ordering is provided, so prior loads and stores are done
 * before the decrement, together with a control dependency so that free()
 * must come after it.
 *
 * Return: true, with @lock held, if the refcount was decremented to 0;
 *         false, with @lock not held, otherwise
 */
bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
{
        bool reached_zero;

        /* Lockless path: anything but the last reference needs no mutex. */
        if (refcount_dec_not_one(r))
                return false;

        /* Possibly the last reference: do the final decrement under @lock. */
        mutex_lock(lock);
        reached_zero = refcount_dec_and_test(r);
        if (!reached_zero)
                mutex_unlock(lock);

        return reached_zero;
}
EXPORT_SYMBOL(refcount_dec_and_mutex_lock);

/**
 * refcount_dec_and_lock - drop a reference and, if it was the last one,
 *                         return with the spinlock held
 * @r: the refcount
 * @lock: the spinlock to be locked
 *
 * Counterpart of atomic_dec_and_lock(): warns on underflow and does not
 * decrement a refcount that is saturated at REFCOUNT_SATURATED.
 *
 * Release memory ordering is provided, so prior loads and stores are done
 * before the decrement, together with a control dependency so that free()
 * must come after it.
 *
 * Return: true, with @lock held, if the refcount was decremented to 0;
 *         false, with @lock not held, otherwise
 */
bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
{
        bool reached_zero;

        /* Lockless path: anything but the last reference needs no lock. */
        if (refcount_dec_not_one(r))
                return false;

        /* Possibly the last reference: do the final decrement under @lock. */
        spin_lock(lock);
        reached_zero = refcount_dec_and_test(r);
        if (!reached_zero)
                spin_unlock(lock);

        return reached_zero;
}
EXPORT_SYMBOL(refcount_dec_and_lock);

/**
 * refcount_dec_and_lock_irqsave - drop a reference and, if it was the last
 *                                 one, return with the spinlock held and
 *                                 interrupts disabled
 * @r: the refcount
 * @lock: the spinlock to be locked
 * @flags: saved IRQ-flags if the lock is acquired
 *
 * Behaves like refcount_dec_and_lock(), except that the spinlock is taken
 * with interrupts disabled.
 *
 * Return: true, with @lock held, if the refcount was decremented to 0;
 *         false, with @lock not held, otherwise
 */
bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock,
                                   unsigned long *flags)
{
        bool reached_zero;

        /* Lockless path: anything but the last reference needs no lock. */
        if (refcount_dec_not_one(r))
                return false;

        /* Possibly the last reference: do the final decrement under @lock. */
        spin_lock_irqsave(lock, *flags);
        reached_zero = refcount_dec_and_test(r);
        if (!reached_zero)
                spin_unlock_irqrestore(lock, *flags);

        return reached_zero;
}
EXPORT_SYMBOL(refcount_dec_and_lock_irqsave);
















    1 






































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2006, Johannes Berg <johannes@sipsolutions.net>
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/leds.h>
#include "ieee80211_i.h"

/*
 * Delay, in milliseconds, handed to led_trigger_blink_oneshot() as both of
 * its delay arguments by the RX and TX LED helpers in this header.
 */
#define MAC80211_BLINK_DELAY 50 /* ms */

/*
 * Fire a one-shot blink on the RX LED trigger, provided rx_led_active is
 * non-zero. Without CONFIG_MAC80211_LEDS the body is empty.
 */
static inline void ieee80211_led_rx(struct ieee80211_local *local)
{
#ifdef CONFIG_MAC80211_LEDS
        if (atomic_read(&local->rx_led_active))
                led_trigger_blink_oneshot(&local->rx_led,
                                          MAC80211_BLINK_DELAY,
                                          MAC80211_BLINK_DELAY, 0);
#endif
}

/*
 * Fire a one-shot blink on the TX LED trigger, provided tx_led_active is
 * non-zero. Without CONFIG_MAC80211_LEDS the body is empty.
 */
static inline void ieee80211_led_tx(struct ieee80211_local *local)
{
#ifdef CONFIG_MAC80211_LEDS
        if (atomic_read(&local->tx_led_active))
                led_trigger_blink_oneshot(&local->tx_led,
                                          MAC80211_BLINK_DELAY,
                                          MAC80211_BLINK_DELAY, 0);
#endif
}

/*
 * LED helper API. With CONFIG_MAC80211_LEDS the functions are only declared
 * here (their definitions are not in this header); without it they are
 * empty inline stubs, so callers need no #ifdef of their own.
 */
#ifdef CONFIG_MAC80211_LEDS
void ieee80211_led_assoc(struct ieee80211_local *local,
                         bool associated);
void ieee80211_led_radio(struct ieee80211_local *local,
                         bool enabled);
void ieee80211_alloc_led_names(struct ieee80211_local *local);
void ieee80211_free_led_names(struct ieee80211_local *local);
void ieee80211_led_init(struct ieee80211_local *local);
void ieee80211_led_exit(struct ieee80211_local *local);
void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local,
                                unsigned int types_on, unsigned int types_off);
#else
/* No LED support configured: every helper below is a no-op. */
static inline void ieee80211_led_assoc(struct ieee80211_local *local,
                                       bool associated)
{
}
static inline void ieee80211_led_radio(struct ieee80211_local *local,
                                       bool enabled)
{
}
static inline void ieee80211_alloc_led_names(struct ieee80211_local *local)
{
}
static inline void ieee80211_free_led_names(struct ieee80211_local *local)
{
}
static inline void ieee80211_led_init(struct ieee80211_local *local)
{
}
static inline void ieee80211_led_exit(struct ieee80211_local *local)
{
}
static inline void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local,
                                              unsigned int types_on,
                                              unsigned int types_off)
{
}
#endif

/*
 * Account @bytes of transmitted data to the throughput LED trigger's
 * tx_bytes counter, but only while tpt_led_active is non-zero. Without
 * CONFIG_MAC80211_LEDS the body is empty.
 */
static inline void
ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, int bytes)
{
#ifdef CONFIG_MAC80211_LEDS
        if (!atomic_read(&local->tpt_led_active))
                return;

        local->tpt_led_trigger->tx_bytes += bytes;
#endif
}

/*
 * Account @bytes of received data to the throughput LED trigger's
 * rx_bytes counter, but only while tpt_led_active is non-zero. Without
 * CONFIG_MAC80211_LEDS the body is empty.
 */
static inline void
ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, int bytes)
{
#ifdef CONFIG_MAC80211_LEDS
        if (!atomic_read(&local->tpt_led_active))
                return;

        local->tpt_led_trigger->rx_bytes += bytes;
#endif
}


































































































































































































































































































































































































































































































































































































































    1 



    1 













    1 

















    1 















    1 










    1 











    1 










    1 

    1 
    1 





    1 























    1 








































































































    1 



    2 








    1 


    1 


    1 























    1 





    1 




    1 





    1 













    1 







    1 








    2 
























































































































































































































































































































































































































































    2 






    1 











    1 
    1 











    1 




    1 












    1 



    1 













    1 




















    1 




    1 



    1 







































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                PF_INET protocol family socket handler.
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Florian La Roche, <flla@stud.uni-sb.de>
 *                Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Changes (see also sock.c)
 *
 *                piggy,
 *                Karl Knutson        :        Socket protocol table
 *                A.N.Kuznetsov        :        Socket death error in accept().
 *                John Richardson :        Fix non blocking error in connect()
 *                                        so sockets that fail to connect
 *                                        don't return -EINPROGRESS.
 *                Alan Cox        :        Asynchronous I/O support
 *                Alan Cox        :        Keep correct socket pointer on sock
 *                                        structures
 *                                        when accept() ed
 *                Alan Cox        :        Semantics of SO_LINGER aren't state
 *                                        moved to close when you look carefully.
 *                                        With this fixed and the accept bug fixed
 *                                        some RPC stuff seems happier.
 *                Niibe Yutaka        :        4.4BSD style write async I/O
 *                Alan Cox,
 *                Tony Gale         :        Fixed reuse semantics.
 *                Alan Cox        :        bind() shouldn't abort existing but dead
 *                                        sockets. Stops FTP netin:.. I hope.
 *                Alan Cox        :        bind() works correctly for RAW sockets.
 *                                        Note that FreeBSD at least was broken
 *                                        in this respect so be careful with
 *                                        compatibility tests...
 *                Alan Cox        :        routing cache support
 *                Alan Cox        :        memzero the socket structure for
 *                                        compactness.
 *                Matt Day        :        nonblock connect error handler
 *                Alan Cox        :        Allow large numbers of pending sockets
 *                                        (eg for big web sites), but only if
 *                                        specifically application requested.
 *                Alan Cox        :        New buffering throughout IP. Used
 *                                        dumbly.
 *                Alan Cox        :        New buffering now used smartly.
 *                Alan Cox        :        BSD rather than common sense
 *                                        interpretation of listen.
 *                Germano Caronni        :        Assorted small races.
 *                Alan Cox        :        sendmsg/recvmsg basic support.
 *                Alan Cox        :        Only sendmsg/recvmsg now supported.
 *                Alan Cox        :        Locked down bind (see security list).
 *                Alan Cox        :        Loosened bind a little.
 *                Mike McLagan        :        ADD/DEL DLCI Ioctls
 *        Willy Konynenberg        :        Transparent proxying support.
 *                David S. Miller        :        New socket lookup architecture.
 *                                        Some other random speedups.
 *                Cyrus Durgin        :        Cleaned up file for kmod hacks.
 *                Andi Kleen        :        Fix inet_stream_connect TCP race.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/stat.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/slab.h>

#include <linux/uaccess.h>

#include <linux/inet.h>
#include <linux/igmp.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/arp.h>
#include <net/route.h>
#include <net/ip_fib.h>
#include <net/inet_connection_sock.h>
#include <net/gro.h>
#include <net/gso.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <net/ping.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/raw.h>
#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/ip_tunnels.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/secure_seq.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
#include <net/l3mdev.h>
#include <net/compat.h>
#include <net/rps.h>

#include <trace/events/sock.h>

/* The inetsw table contains everything that inet_create needs to
 * build a new socket.
 *
 * It is an array of SOCK_MAX list heads, presumably indexed by socket
 * type (confirm at the users).
 */
static struct list_head inetsw[SOCK_MAX];
/* NOTE(review): presumably serializes updates to inetsw -- confirm at the users. */
static DEFINE_SPINLOCK(inetsw_lock);

/*
 * Destruction routine for inet sockets.
 *
 * Purges the receive and error queues and reclaims the socket's memory
 * accounting. If the socket is in a releasable state it then frees the IP
 * options and drops the cached dst references; otherwise it logs an error
 * and returns without doing so.
 */

void inet_sock_destruct(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);

        /* Drop anything still queued on the socket. */
        __skb_queue_purge(&sk->sk_receive_queue);
        __skb_queue_purge(&sk->sk_error_queue);

        sk_mem_reclaim_final(sk);

        /*
         * A stream socket must have reached TCP_CLOSE. If not, complain and
         * bail out before the options and dst entries are released.
         */
        if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
                pr_err("Attempt to release TCP socket in state %d %p\n",
                       sk->sk_state, sk);
                return;
        }
        /* Likewise refuse to tear down a socket not flagged SOCK_DEAD. */
        if (!sock_flag(sk, SOCK_DEAD)) {
                pr_err("Attempt to release alive inet socket %p\n", sk);
                return;
        }

        /* All memory accounting is expected to be zero at this point. */
        WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc));
        WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
        WARN_ON_ONCE(sk->sk_wmem_queued);
        WARN_ON_ONCE(sk_forward_alloc_get(sk));

        /*
         * The constant 1 passed as the rcu_dereference_protected()
         * condition means no lock is asserted for these accesses.
         */
        kfree(rcu_dereference_protected(inet->inet_opt, 1));
        dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
        dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1));
}
EXPORT_SYMBOL(inet_sock_destruct);

/*
 *        The routines beyond this point handle the behaviour of an AF_INET
 *        socket object. Mostly it punts to the subprotocols of IP to do
 *        the work.
 */

/*
 *        Automatically bind an unbound socket.
 *
 *        Takes the socket lock.  If the socket has no local port yet, asks
 *        the protocol's get_port() for one (port 0 is passed) and mirrors
 *        the result into inet_sport.
 *        Returns 0 on success, -EAGAIN if get_port() failed.
 */

static int inet_autobind(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        int ret = 0;

        lock_sock(sk);

        /* Already bound: nothing to do. */
        if (inet->inet_num)
                goto unlock;

        if (sk->sk_prot->get_port(sk, 0)) {
                ret = -EAGAIN;
                goto unlock;
        }
        inet->inet_sport = htons(inet->inet_num);

unlock:
        release_sock(sk);
        return ret;
}

/*
 *        Put @sk into the listening state, or just update the backlog of a
 *        socket that is already listening.  inet_listen() calls this with
 *        the socket lock held.
 *
 *        Returns 0 on success, -EINVAL if the socket is neither closed nor
 *        listening, or the error reported by inet_csk_listen_start().
 */
int __inet_listen_sk(struct sock *sk, int backlog)
{
        unsigned char prev_state = sk->sk_state;
        int tfo_flags;
        int rc;

        if (!((1 << prev_state) & (TCPF_CLOSE | TCPF_LISTEN)))
                return -EINVAL;

        WRITE_ONCE(sk->sk_max_ack_backlog, backlog);

        /* Really, if the socket is already in listen state
         * we can only allow the backlog to be adjusted.
         */
        if (prev_state == TCP_LISTEN)
                return 0;

        /* Enable TFO w/o requiring TCP_FASTOPEN socket option.
         * Note that only TCP sockets (SOCK_STREAM) will reach here.
         * Also the fastopen backlog may already have been set via the
         * option because the socket was in TCP_LISTEN state previously
         * but was shutdown() rather than close().
         */
        tfo_flags = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen);
        if ((tfo_flags & TFO_SERVER_WO_SOCKOPT1) &&
            (tfo_flags & TFO_SERVER_ENABLE) &&
            !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
                fastopen_queue_tune(sk, backlog);
                tcp_fastopen_init_key_once(sock_net(sk));
        }

        rc = inet_csk_listen_start(sk);
        if (rc)
                return rc;

        tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
        return 0;
}

/*
 *        Move a socket into listening state.
 *
 *        Only an unconnected SOCK_STREAM socket qualifies; anything else
 *        yields -EINVAL.  The work is done by __inet_listen_sk() under the
 *        socket lock, and its result is returned.
 */
int inet_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;
        int err;

        lock_sock(sk);

        if (sock->state == SS_UNCONNECTED && sock->type == SOCK_STREAM)
                err = __inet_listen_sk(sk, backlog);
        else
                err = -EINVAL;

        release_sock(sk);
        return err;
}
EXPORT_SYMBOL(inet_listen);

/*
 *        Create an inet socket.
 */

static int inet_create(struct net *net, struct socket *sock, int protocol,
                       int kern)
{
        struct sock *sk;
        struct inet_protosw *answer;
        struct inet_sock *inet;
        struct proto *answer_prot;
        unsigned char answer_flags;
        int try_loading_module = 0;
        int err;

        if (protocol < 0 || protocol >= IPPROTO_MAX)
                return -EINVAL;

        sock->state = SS_UNCONNECTED;

        /* Look for the requested type/protocol pair. */
lookup_protocol:
        err = -ESOCKTNOSUPPORT;
        rcu_read_lock();
        list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

                err = 0;
                /* Check the non-wild match. */
                if (protocol == answer->protocol) {
                        if (protocol != IPPROTO_IP)
                                break;
                } else {
                        /* Check for the two wild cases. */
                        if (IPPROTO_IP == protocol) {
                                protocol = answer->protocol;
                                break;
                        }
                        if (IPPROTO_IP == answer->protocol)
                                break;
                }
                err = -EPROTONOSUPPORT;
        }

        if (unlikely(err)) {
                if (try_loading_module < 2) {
                        rcu_read_unlock();
                        /*
                         * Be more specific, e.g. net-pf-2-proto-132-type-1
                         * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
                         */
                        if (++try_loading_module == 1)
                                request_module("net-pf-%d-proto-%d-type-%d",
                                               PF_INET, protocol, sock->type);
                        /*
                         * Fall back to generic, e.g. net-pf-2-proto-132
                         * (net-pf-PF_INET-proto-IPPROTO_SCTP)
                         */
                        else
                                request_module("net-pf-%d-proto-%d",
                                               PF_INET, protocol);
                        goto lookup_protocol;
                } else
                        goto out_rcu_unlock;
        }

        err = -EPERM;
        if (sock->type == SOCK_RAW && !kern &&
            !ns_capable(net->user_ns, CAP_NET_RAW))
                goto out_rcu_unlock;

        sock->ops = answer->ops;
        answer_prot = answer->prot;
        answer_flags = answer->flags;
        rcu_read_unlock();

        WARN_ON(!answer_prot->slab);

        err = -ENOMEM;
        sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
        if (!sk)
                goto out;

        err = 0;
        if (INET_PROTOSW_REUSE & answer_flags)
                sk->sk_reuse = SK_CAN_REUSE;

        if (INET_PROTOSW_ICSK & answer_flags)
                inet_init_csk_locks(sk);

        inet = inet_sk(sk);
        inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags);

        inet_clear_bit(NODEFRAG, sk);

        if (SOCK_RAW == sock->type) {
                inet->inet_num = protocol;
                if (IPPROTO_RAW == protocol)
                        inet_set_bit(HDRINCL, sk);
        }

        if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc))
                inet->pmtudisc = IP_PMTUDISC_DONT;
        else
                inet->pmtudisc = IP_PMTUDISC_WANT;

        atomic_set(&inet->inet_id, 0);

        sock_init_data(sock, sk);

        sk->sk_destruct           = inet_sock_destruct;
        sk->sk_protocol           = protocol;
        sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
        sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash);

        inet->uc_ttl        = -1;
        inet_set_bit(MC_LOOP, sk);
        inet->mc_ttl        = 1;
        inet_set_bit(MC_ALL, sk);
        inet->mc_index        = 0;
        inet->mc_list        = NULL;
        inet->rcv_tos        = 0;

        if (inet->inet_num) {
                /* It assumes that any protocol which allows
                 * the user to assign a number at socket
                 * creation time automatically
                 * shares.
                 */
                inet->inet_sport = htons(inet->inet_num);
                /* Add to protocol hash chains. */
                err = sk->sk_prot->hash(sk);
                if (err) {
                        sk_common_release(sk);
                        goto out;
                }
        }

        if (sk->sk_prot->init) {
                err = sk->sk_prot->init(sk);
                if (err) {
                        sk_common_release(sk);
                        goto out;
                }
        }

        if (!kern) {
                err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
                if (err) {
                        sk_common_release(sk);
                        goto out;
                }
        }
out:
        return err;
out_rcu_unlock:
        rcu_read_unlock();
        goto out;
}


/*
 *        Release an AF_INET socket.  When we call this function we are
 *        destroying the object and from then on nobody should refer to it.
 *
 *        Runs the cgroup release hook for non-kernel sockets, drops any
 *        multicast memberships, hands the sock to the protocol's close()
 *        and detaches it from @sock.  Always returns 0.
 */
int inet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        long timeout = 0;

        if (!sk)
                return 0;

        if (!sk->sk_kern_sock)
                BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk);

        /* Applications forget to leave groups before exiting */
        ip_mc_drop_socket(sk);

        /* If linger is set, we don't return until the close is
         * complete; otherwise we return immediately.  The actual
         * closing is done the same either way.
         *
         * If the close is due to the process exiting, we never linger.
         */
        if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING))
                timeout = sk->sk_lingertime;

        sk->sk_prot->close(sk, timeout);
        sock->sk = NULL;
        return 0;
}
EXPORT_SYMBOL(inet_release);

/*
 *        Bind @sk to the address in @uaddr.
 *
 *        A protocol that supplies its own bind handler gets the request
 *        verbatim.  Otherwise the length is validated, the cgroup bind
 *        program is run and __inet_bind() does the work, taking the
 *        socket lock itself (BIND_WITH_LOCK).
 */
int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        u32 flags = BIND_WITH_LOCK;
        int rc;

        /* If the socket has its own bind function then use it. (RAW) */
        if (sk->sk_prot->bind)
                return sk->sk_prot->bind(sk, uaddr, addr_len);

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        /* BPF prog is run before any checks are done so that if the prog
         * changes context in a wrong way it will be caught.
         */
        rc = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
                                                CGROUP_INET4_BIND, &flags);

        return rc ? rc : __inet_bind(sk, uaddr, addr_len, flags);
}

/* Socket-level bind entry point: forwards to inet_bind_sk() on sock->sk. */
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sock *sk = sock->sk;

        return inet_bind_sk(sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet_bind);

/*
 *        Bind @sk to the IPv4 address and port carried in @uaddr.
 *
 *        @addr_len is not examined here; inet_bind_sk() validates it before
 *        calling.  NOTE(review): other callers presumably do the same -
 *        confirm.
 *
 *        @flags is a mask of BIND_* bits consulted below:
 *          BIND_WITH_LOCK               - take and release the socket lock here
 *          BIND_NO_CAP_NET_BIND_SERVICE - skip the CAP_NET_BIND_SERVICE check
 *          BIND_FORCE_ADDRESS_NO_PORT   - with port 0, do not call get_port()
 *          BIND_FROM_BPF                - skip the cgroup post-bind hook
 *
 *        Returns 0 on success or a negative errno: -EAFNOSUPPORT,
 *        -EADDRNOTAVAIL, -EACCES, -EINVAL (socket not closed or already
 *        bound), or whatever get_port() or the post-bind hook reports.
 */
int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
                u32 flags)
{
        struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct net *net = sock_net(sk);
        unsigned short snum;
        int chk_addr_ret;
        u32 tb_id = RT_TABLE_LOCAL;
        int err;

        if (addr->sin_family != AF_INET) {
                /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
                 * only if s_addr is INADDR_ANY.
                 */
                err = -EAFNOSUPPORT;
                if (addr->sin_family != AF_UNSPEC ||
                    addr->sin_addr.s_addr != htonl(INADDR_ANY))
                        goto out;
        }

        /* Use the l3mdev table of the bound device if there is one,
         * otherwise keep RT_TABLE_LOCAL.
         */
        tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
        chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);

        /* Not specified by any standard per-se, however it breaks too
         * many applications when removed.  It is unfortunate since
         * allowing applications to make a non-local bind solves
         * several problems with systems using dynamic addressing.
         * (ie. your servers still start up even if your ISDN link
         *  is temporarily down)
         */
        err = -EADDRNOTAVAIL;
        if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
                                         chk_addr_ret))
                goto out;

        snum = ntohs(addr->sin_port);
        /* Ports that require it need CAP_NET_BIND_SERVICE, unless the
         * caller waived the check through @flags.
         */
        err = -EACCES;
        if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
            snum && inet_port_requires_bind_service(net, snum) &&
            !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
                goto out;

        /*      We keep a pair of addresses. rcv_saddr is the one
         *      used by hash lookups, and saddr is used for transmit.
         *
         *      In the BSD API these are the same except where it
         *      would be illegal to use them (multicast/broadcast) in
         *      which case the sending device address is used.
         */
        if (flags & BIND_WITH_LOCK)
                lock_sock(sk);

        /* Check these errors (active socket, double bind). */
        err = -EINVAL;
        if (sk->sk_state != TCP_CLOSE || inet->inet_num)
                goto out_release_sock;

        inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
        if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
                inet->inet_saddr = 0;  /* Use device */

        /* Make sure we are allowed to bind here. */
        if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) ||
                      (flags & BIND_FORCE_ADDRESS_NO_PORT))) {
                err = sk->sk_prot->get_port(sk, snum);
                if (err) {
                        /* Roll back the addresses set above. */
                        inet->inet_saddr = inet->inet_rcv_saddr = 0;
                        goto out_release_sock;
                }
                if (!(flags & BIND_FROM_BPF)) {
                        err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
                        if (err) {
                                /* Roll back both the addresses and the port. */
                                inet->inet_saddr = inet->inet_rcv_saddr = 0;
                                if (sk->sk_prot->put_port)
                                        sk->sk_prot->put_port(sk);
                                goto out_release_sock;
                        }
                }
        }

        /* Record which parts of the local endpoint were fixed by the bind. */
        if (inet->inet_rcv_saddr)
                sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
        if (snum)
                sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
        inet->inet_sport = htons(inet->inet_num);
        inet->inet_daddr = 0;
        inet->inet_dport = 0;
        sk_dst_reset(sk);
        err = 0;
out_release_sock:
        if (flags & BIND_WITH_LOCK)
                release_sock(sk);
out:
        return err;
}

/*
 *        Connect a datagram socket.
 *
 *        An AF_UNSPEC address disconnects the socket instead.  Otherwise
 *        the optional cgroup pre-connect hook is run, the socket is
 *        auto-bound if it has no local port yet, and the protocol's
 *        connect() is called.
 *
 *        Returns -EINVAL if @addr_len is too short to hold the address
 *        family, -EAGAIN if auto-binding failed, otherwise the result of
 *        the protocol handler that was invoked.
 */
int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
                       int addr_len, int flags)
{
        struct sock *sk = sock->sk;
        const struct proto *ops;
        int rc = 0;

        if (addr_len < sizeof(uaddr->sa_family))
                return -EINVAL;

        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
        ops = READ_ONCE(sk->sk_prot);

        if (uaddr->sa_family == AF_UNSPEC)
                return ops->disconnect(sk, flags);

        if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk))
                rc = ops->pre_connect(sk, uaddr, addr_len);
        if (rc)
                return rc;

        if (data_race(!inet_sk(sk)->inet_num)) {
                if (inet_autobind(sk))
                        return -EAGAIN;
        }

        return ops->connect(sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet_dgram_connect);

/*
 *        Sleep until @sk leaves the SYN_SENT/SYN_RECV states, the timeout
 *        @timeo runs out or a signal is pending.  The socket lock is held
 *        on entry and on return but is dropped while sleeping.
 *        sk_write_pending is raised by @writebias for the duration.
 *
 *        Returns the remaining timeout.
 */
static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);

        add_wait_queue(sk_sleep(sk), &wait);
        sk->sk_write_pending += writebias;

        /* Basic assumption: if someone sets sk->sk_err, he _must_
         * change state of the socket from TCP_SYN_*.
         * Connect() does not allow to get error notifications
         * without closing the socket.
         */
        for (;;) {
                if (!((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)))
                        break;

                release_sock(sk);
                timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
                lock_sock(sk);

                if (signal_pending(current) || !timeo)
                        break;
        }

        remove_wait_queue(sk_sleep(sk), &wait);
        sk->sk_write_pending -= writebias;

        return timeo;
}

/*
 *        Connect to a remote host. There is regrettably still a little
 *        TCP 'magic' in here.
 *
 *        inet_stream_connect() calls this with the socket lock held; the
 *        lock is dropped and re-taken while waiting in
 *        inet_wait_for_connect().
 *
 *        @sock:       socket to connect
 *        @uaddr:      destination address, may be NULL (see below)
 *        @addr_len:   length of @uaddr
 *        @flags:      file flags; O_NONBLOCK selects a zero wait timeout
 *        @is_sendmsg: non-zero when called from the sendmsg path; only
 *                     changes the error reported for a deferred connect
 *
 *        Returns 0 once the socket is connected, otherwise a negative
 *        errno such as -EINVAL, -EISCONN, -EALREADY, -EINPROGRESS, -EPIPE,
 *        a signal-related error from sock_intr_errno(), the pending socket
 *        error, or -ECONNABORTED.
 */
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                          int addr_len, int flags, int is_sendmsg)
{
        struct sock *sk = sock->sk;
        int err;
        long timeo;

        /*
         * uaddr can be NULL and addr_len can be 0 if:
         * sk is a TCP fastopen active socket and
         * TCP_FASTOPEN_CONNECT sockopt is set and
         * we already have a valid cookie for this socket.
         * In this case, user can call write() after connect().
         * write() will invoke tcp_sendmsg_fastopen() which calls
         * __inet_stream_connect().
         */
        if (uaddr) {
                if (addr_len < sizeof(uaddr->sa_family))
                        return -EINVAL;

                /* Connecting to AF_UNSPEC means "disconnect". */
                if (uaddr->sa_family == AF_UNSPEC) {
                        sk->sk_disconnects++;
                        err = sk->sk_prot->disconnect(sk, flags);
                        sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
                        goto out;
                }
        }

        switch (sock->state) {
        default:
                err = -EINVAL;
                goto out;
        case SS_CONNECTED:
                err = -EISCONN;
                goto out;
        case SS_CONNECTING:
                if (inet_test_bit(DEFER_CONNECT, sk))
                        err = is_sendmsg ? -EINPROGRESS : -EISCONN;
                else
                        err = -EALREADY;
                /* Fall out of switch with err, set for this state */
                break;
        case SS_UNCONNECTED:
                err = -EISCONN;
                if (sk->sk_state != TCP_CLOSE)
                        goto out;

                if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) {
                        err = sk->sk_prot->pre_connect(sk, uaddr, addr_len);
                        if (err)
                                goto out;
                }

                err = sk->sk_prot->connect(sk, uaddr, addr_len);
                if (err < 0)
                        goto out;

                sock->state = SS_CONNECTING;

                /* Deferred connect: report success now, nothing to wait for. */
                if (!err && inet_test_bit(DEFER_CONNECT, sk))
                        goto out;

                /* Just entered SS_CONNECTING state; the only
                 * difference is that return value in non-blocking
                 * case is EINPROGRESS, rather than EALREADY.
                 */
                err = -EINPROGRESS;
                break;
        }

        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

        if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
                /* 1 only for a TCP socket with fastopen request data. */
                int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
                                tcp_sk(sk)->fastopen_req &&
                                tcp_sk(sk)->fastopen_req->data ? 1 : 0;
                /* Snapshot, to detect a disconnect while we sleep. */
                int dis = sk->sk_disconnects;

                /* Error code is set above */
                if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
                        goto out;

                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        goto out;

                /* The socket was disconnected while we were waiting. */
                if (dis != sk->sk_disconnects) {
                        err = -EPIPE;
                        goto out;
                }
        }

        /* Connection was closed by RST, timeout, ICMP error
         * or another process disconnected us.
         */
        if (sk->sk_state == TCP_CLOSE)
                goto sock_error;

        /* sk->sk_err may be not zero now, if RECVERR was ordered by user
         * and error was received after socket entered established state.
         * Hence, it is handled normally after connect() return successfully.
         */

        sock->state = SS_CONNECTED;
        err = 0;
out:
        return err;

sock_error:
        /* Report the pending socket error, or -ECONNABORTED if none. */
        err = sock_error(sk) ? : -ECONNABORTED;
        sock->state = SS_UNCONNECTED;
        sk->sk_disconnects++;
        if (sk->sk_prot->disconnect(sk, flags))
                sock->state = SS_DISCONNECTING;
        goto out;
}
EXPORT_SYMBOL(__inet_stream_connect);

/*
 *        Locked wrapper around __inet_stream_connect(): takes the socket
 *        lock, performs the connect with is_sendmsg == 0 and returns its
 *        result.
 */
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                        int addr_len, int flags)
{
        struct sock *sk = sock->sk;
        int rc;

        lock_sock(sk);
        rc = __inet_stream_connect(sock, uaddr, addr_len, flags, 0);
        release_sock(sk);

        return rc;
}
EXPORT_SYMBOL(inet_stream_connect);

/*
 *        Attach the freshly accepted sock @newsk to @newsock.
 *
 *        Warns if @newsk is in a state an accepted socket is not expected
 *        to be in, carries the SOCK_SUPPORT_ZC flag over from the listening
 *        socket and marks @newsock as connected.  inet_accept() calls this
 *        with @newsk locked.
 */
void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk)
{
        const int expected_states = TCPF_ESTABLISHED | TCPF_SYN_RECV |
                                    TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 |
                                    TCPF_CLOSING | TCPF_CLOSE_WAIT |
                                    TCPF_CLOSE;

        sock_rps_record_flow(newsk);
        WARN_ON(!((1 << newsk->sk_state) & expected_states));

        if (test_bit(SOCK_SUPPORT_ZC, &sock->flags))
                set_bit(SOCK_SUPPORT_ZC, &newsock->flags);

        sock_graft(newsk, newsock);
        newsock->state = SS_CONNECTED;
}

/*
 *        Accept a pending connection. The TCP layer now gives BSD semantics.
 *
 *        Asks the protocol's accept() for a new sock; when it delivers one,
 *        the sock is grafted onto @newsock under its own lock and 0 is
 *        returned.  Otherwise arg->err is returned; it is preset to -EINVAL
 *        before the protocol is called.
 */

int inet_accept(struct socket *sock, struct socket *newsock,
                struct proto_accept_arg *arg)
{
        struct sock *listener = sock->sk;
        struct sock *child;

        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
        arg->err = -EINVAL;
        child = READ_ONCE(listener->sk_prot)->accept(listener, arg);
        if (child) {
                lock_sock(child);
                __inet_accept(sock, newsock, child);
                release_sock(child);
                return 0;
        }

        return arg->err;
}
EXPORT_SYMBOL(inet_accept);

/*
 *        This does both peername and sockname.
 *
 *        Fills @uaddr (treated as a struct sockaddr_in) with the remote
 *        endpoint when @peer is non-zero, otherwise with the local one,
 *        and lets the matching cgroup BPF program see the result.
 *
 *        Returns the address length, or -ENOTCONN when the peer is asked
 *        for but there is no destination port, or when @peer == 1 and the
 *        socket is in the CLOSE or SYN_SENT state.
 */
int inet_getname(struct socket *sock, struct sockaddr *uaddr,
                 int peer)
{
        struct sock *sk = sock->sk;
        struct inet_sock *inet = inet_sk(sk);
        DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
        int sin_addr_len = sizeof(*sin);

        sin->sin_family = AF_INET;
        lock_sock(sk);

        if (!peer) {
                /* Local name: prefer the receive address, else the
                 * transmit address.
                 */
                __be32 addr = inet->inet_rcv_saddr;

                if (!addr)
                        addr = inet->inet_saddr;
                sin->sin_port = inet->inet_sport;
                sin->sin_addr.s_addr = addr;
                BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
                                       CGROUP_INET4_GETSOCKNAME);
        } else {
                if (!inet->inet_dport ||
                    (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
                     peer == 1)) {
                        release_sock(sk);
                        return -ENOTCONN;
                }
                sin->sin_port = inet->inet_dport;
                sin->sin_addr.s_addr = inet->inet_daddr;
                BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
                                       CGROUP_INET4_GETPEERNAME);
        }

        release_sock(sk);
        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));

        return sin_addr_len;
}
EXPORT_SYMBOL(inet_getname);

/*
 *        Common preparation before sending on @sk: records the RPS flow
 *        and auto-binds the socket if it has no local port yet, unless the
 *        protocol sets no_autobind.
 *
 *        Returns 0 on success, -EAGAIN if auto-binding failed.
 */
int inet_send_prepare(struct sock *sk)
{
        sock_rps_record_flow(sk);

        /* Already bound to a local port. */
        if (data_race(inet_sk(sk)->inet_num))
                return 0;

        /* The protocol does not want us to bind on its behalf. */
        if (sk->sk_prot->no_autobind)
                return 0;

        /* We may need to bind the socket. */
        return inet_autobind(sk) ? -EAGAIN : 0;
}
EXPORT_SYMBOL_GPL(inet_send_prepare);

/*
 *        Send a message on an AF_INET socket: prepare the socket (see
 *        inet_send_prepare()) and hand off to the protocol's sendmsg().
 *        Returns -EAGAIN if the preparation failed, otherwise the
 *        protocol's result.
 */
int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
        struct sock *sk = sock->sk;
        int ret;

        if (unlikely(inet_send_prepare(sk))) {
                ret = -EAGAIN;
        } else {
                ret = INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg,
                                      udp_sendmsg, sk, msg, size);
        }

        return ret;
}
EXPORT_SYMBOL(inet_sendmsg);

/*
 *        Forward a splice end-of-file notification to the protocol, if it
 *        implements splice_eof().  Does nothing when the send preparation
 *        fails.
 */
void inet_splice_eof(struct socket *sock)
{
        struct sock *sk = sock->sk;
        const struct proto *prot;

        if (unlikely(inet_send_prepare(sk)))
                return;

        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
        prot = READ_ONCE(sk->sk_prot);
        if (!prot->splice_eof)
                return;

        prot->splice_eof(sock);
}
EXPORT_SYMBOL_GPL(inet_splice_eof);

INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *,
                                          size_t, int, int *));
/*
 *        Receive a message on an AF_INET socket through the protocol's
 *        recvmsg().  The RPS flow is recorded unless the error queue is
 *        being read.  On success the address length reported by the
 *        protocol is stored in msg->msg_namelen.
 *
 *        Returns the protocol's result (negative errno on failure).
 */
int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                 int flags)
{
        struct sock *sk = sock->sk;
        int addr_len = 0;
        int ret;

        if (likely(!(flags & MSG_ERRQUEUE)))
                sock_rps_record_flow(sk);

        ret = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg,
                              sk, msg, size, flags, &addr_len);
        if (ret < 0)
                return ret;

        msg->msg_namelen = addr_len;
        return ret;
}
EXPORT_SYMBOL(inet_recvmsg);

/*
 *        Shut down part or all of a connection.
 *
 *        @how is the shutdown(2) argument (0, 1 or 2); it is incremented
 *        so that it can be used as a bit mask and checked against
 *        SHUTDOWN_MASK.
 *
 *        Returns 0 on success, -EINVAL for a bad @how, -ENOTCONN when the
 *        socket is in the TCP_CLOSE state, or the error from the protocol's
 *        disconnect() for listening / SYN_SENT sockets.
 */
int inet_shutdown(struct socket *sock, int how)
{
        struct sock *sk = sock->sk;
        int err = 0;

        /* This should really check to make sure
         * the socket is a TCP socket. (WHY AC...)
         */
        how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
                       1->2 bit 2 snds.
                       2->3 */
        if ((how & ~SHUTDOWN_MASK) || !how)        /* MAXINT->0 */
                return -EINVAL;

        lock_sock(sk);
        /* Resolve a pending connect into a definite socket state first. */
        if (sock->state == SS_CONNECTING) {
                if ((1 << sk->sk_state) &
                    (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
                        sock->state = SS_DISCONNECTING;
                else
                        sock->state = SS_CONNECTED;
        }

        switch (sk->sk_state) {
        case TCP_CLOSE:
                err = -ENOTCONN;
                /* Hack to wake up other listeners, who can poll for
                   EPOLLHUP, even on eg. unconnected UDP sockets -- RR */
                fallthrough;
        default:
                /* Record the shutdown bits, then let the protocol act. */
                WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | how);
                if (sk->sk_prot->shutdown)
                        sk->sk_prot->shutdown(sk, how);
                break;

        /* Remaining two branches are temporary solution for missing
         * close() in multithreaded environment. It is _not_ a good idea,
         * but we have no choice until close() is repaired at VFS level.
         */
        case TCP_LISTEN:
                /* A listener is only torn down by a receive shutdown. */
                if (!(how & RCV_SHUTDOWN))
                        break;
                fallthrough;
        case TCP_SYN_SENT:
                err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
                sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
                break;
        }

        /* Wake up anyone sleeping in poll. */
        sk->sk_state_change(sk);
        release_sock(sk);
        return err;
}
EXPORT_SYMBOL(inet_shutdown);

/*
 *        ioctl() calls you can issue on an INET socket. Most of these are
 *        device configuration and stuff and very rarely used. Some ioctls
 *        pass on to the socket itself.
 *
 *        Routing requests go to ip_rt_ioctl(), ARP requests to arp_ioctl()
 *        and interface requests to devinet_ioctl(); everything else is
 *        offered to the protocol, or answered with -ENOIOCTLCMD if the
 *        protocol has no ioctl handler.
 *
 *        NOTE: I like the idea of a module for the config stuff. ie ifconfig
 *        loads the devconfigure module does its configuring and unloads it.
 *        There's a good 20K of config code hanging around the kernel.
 */

int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        void __user *p = (void __user *)arg;
        struct ifreq ifr;
        struct rtentry rt;
        int err;

        switch (cmd) {
        /* Routing table manipulation. */
        case SIOCADDRT:
        case SIOCDELRT:
                if (copy_from_user(&rt, p, sizeof(struct rtentry)))
                        return -EFAULT;
                return ip_rt_ioctl(net, cmd, &rt);

        case SIOCRTMSG:
                return -EINVAL;

        /* ARP table manipulation. */
        case SIOCDARP:
        case SIOCGARP:
        case SIOCSARP:
                return arp_ioctl(net, cmd, p);

        /* Interface queries: the result is copied back to user space. */
        case SIOCGIFADDR:
        case SIOCGIFBRDADDR:
        case SIOCGIFNETMASK:
        case SIOCGIFDSTADDR:
        case SIOCGIFPFLAGS:
                if (get_user_ifreq(&ifr, NULL, p))
                        return -EFAULT;
                err = devinet_ioctl(net, cmd, &ifr);
                if (err)
                        return err;
                return put_user_ifreq(&ifr, p) ? -EFAULT : 0;

        /* Interface updates: nothing is copied back. */
        case SIOCSIFADDR:
        case SIOCSIFBRDADDR:
        case SIOCSIFNETMASK:
        case SIOCSIFDSTADDR:
        case SIOCSIFPFLAGS:
        case SIOCSIFFLAGS:
                if (get_user_ifreq(&ifr, NULL, p))
                        return -EFAULT;
                return devinet_ioctl(net, cmd, &ifr);

        default:
                if (!sk->sk_prot->ioctl)
                        return -ENOIOCTLCMD;
                return sk_ioctl(sk, cmd, p);
        }
}
EXPORT_SYMBOL(inet_ioctl);

#ifdef CONFIG_COMPAT
/*
 *        Compat version of the SIOCADDRT/SIOCDELRT ioctls: rebuilds a
 *        native struct rtentry from the compat layout in @ur and passes
 *        it to ip_rt_ioctl().  Returns -EFAULT if any user access fails.
 */
static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd,
                struct compat_rtentry __user *ur)
{
        struct rtentry rt;
        compat_uptr_t rtdev;

        /* Three consecutive sockaddrs starting at rt_dst are copied at once.
         * NOTE(review): presumably rt_dst, rt_gateway and rt_genmask -
         * confirm against the struct layout.
         */
        if (copy_from_user(&rt.rt_dst, &ur->rt_dst,
                           3 * sizeof(struct sockaddr)))
                return -EFAULT;

        /* The scalar fields are fetched one by one. */
        if (get_user(rt.rt_flags, &ur->rt_flags) ||
            get_user(rt.rt_metric, &ur->rt_metric) ||
            get_user(rt.rt_mtu, &ur->rt_mtu) ||
            get_user(rt.rt_window, &ur->rt_window) ||
            get_user(rt.rt_irtt, &ur->rt_irtt))
                return -EFAULT;

        /* The device pointer needs widening from its compat form. */
        if (get_user(rtdev, &ur->rt_dev))
                return -EFAULT;
        rt.rt_dev = compat_ptr(rtdev);

        return ip_rt_ioctl(sock_net(sk), cmd, &rt);
}

/*
 *        Compat ioctl entry point.  Routing requests are translated by
 *        inet_compat_routing_ioctl(); everything else goes to the
 *        protocol's compat_ioctl(), or yields -ENOIOCTLCMD if there is
 *        none.
 */
static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        struct sock *sk = sock->sk;

        if (cmd == SIOCADDRT || cmd == SIOCDELRT)
                return inet_compat_routing_ioctl(sk, cmd, compat_ptr(arg));

        if (sk->sk_prot->compat_ioctl)
                return sk->sk_prot->compat_ioctl(sk, cmd, arg);

        return -ENOIOCTLCMD;
}
#endif /* CONFIG_COMPAT */

/*
 * Socket-layer operations for PF_INET sockets served by the TCP handlers
 * (tcp_poll, tcp_mmap, tcp_splice_read, tcp_read_sock, ...).  This table is
 * the .ops of the SOCK_STREAM/IPPROTO_TCP entry in inetsw_array[].
 * socketpair is the only operation wired to a sock_no_* stub.
 */
const struct proto_ops inet_stream_ops = {
        .family                   = PF_INET,
        .owner                   = THIS_MODULE,
        .release           = inet_release,
        .bind                   = inet_bind,
        .connect           = inet_stream_connect,
        .socketpair           = sock_no_socketpair,
        .accept                   = inet_accept,
        .getname           = inet_getname,
        .poll                   = tcp_poll,
        .ioctl                   = inet_ioctl,
        .gettstamp           = sock_gettstamp,
        .listen                   = inet_listen,
        .shutdown           = inet_shutdown,
        .setsockopt           = sock_common_setsockopt,
        .getsockopt           = sock_common_getsockopt,
        .sendmsg           = inet_sendmsg,
        .recvmsg           = inet_recvmsg,
        /* mmap is only provided when the kernel is built with an MMU */
#ifdef CONFIG_MMU
        .mmap                   = tcp_mmap,
#endif
        .splice_eof           = inet_splice_eof,
        .splice_read           = tcp_splice_read,
        .set_peek_off      = sk_set_peek_off,
        .read_sock           = tcp_read_sock,
        .read_skb           = tcp_read_skb,
        .sendmsg_locked    = tcp_sendmsg_locked,
        .peek_len           = tcp_peek_len,
#ifdef CONFIG_COMPAT
        .compat_ioctl           = inet_compat_ioctl,
#endif
        .set_rcvlowat           = tcp_set_rcvlowat,
};
EXPORT_SYMBOL(inet_stream_ops);

/*
 * Socket-layer operations for PF_INET datagram sockets using the UDP
 * handlers (udp_poll, udp_read_skb, udp_set_peek_off).  This table is the
 * .ops of the SOCK_DGRAM/IPPROTO_UDP entry in inetsw_array[].
 * socketpair, accept, listen and mmap are wired to the sock_no_* stubs.
 */
const struct proto_ops inet_dgram_ops = {
        .family                   = PF_INET,
        .owner                   = THIS_MODULE,
        .release           = inet_release,
        .bind                   = inet_bind,
        .connect           = inet_dgram_connect,
        .socketpair           = sock_no_socketpair,
        .accept                   = sock_no_accept,
        .getname           = inet_getname,
        .poll                   = udp_poll,
        .ioctl                   = inet_ioctl,
        .gettstamp           = sock_gettstamp,
        .listen                   = sock_no_listen,
        .shutdown           = inet_shutdown,
        .setsockopt           = sock_common_setsockopt,
        .getsockopt           = sock_common_getsockopt,
        .sendmsg           = inet_sendmsg,
        .read_skb           = udp_read_skb,
        .recvmsg           = inet_recvmsg,
        .mmap                   = sock_no_mmap,
        .splice_eof           = inet_splice_eof,
        .set_peek_off           = udp_set_peek_off,
#ifdef CONFIG_COMPAT
        .compat_ioctl           = inet_compat_ioctl,
#endif
};
EXPORT_SYMBOL(inet_dgram_ops);

/*
 * For SOCK_RAW sockets; also used by the SOCK_DGRAM/IPPROTO_ICMP (ping)
 * entry in inetsw_array[].  Mirrors inet_dgram_ops except that .poll is the
 * generic datagram_poll instead of udp_poll, and the UDP-specific .read_skb
 * and .set_peek_off hooks are left unset.
 */
static const struct proto_ops inet_sockraw_ops = {
        .family                   = PF_INET,
        .owner                   = THIS_MODULE,
        .release           = inet_release,
        .bind                   = inet_bind,
        .connect           = inet_dgram_connect,
        .socketpair           = sock_no_socketpair,
        .accept                   = sock_no_accept,
        .getname           = inet_getname,
        .poll                   = datagram_poll,
        .ioctl                   = inet_ioctl,
        .gettstamp           = sock_gettstamp,
        .listen                   = sock_no_listen,
        .shutdown           = inet_shutdown,
        .setsockopt           = sock_common_setsockopt,
        .getsockopt           = sock_common_getsockopt,
        .sendmsg           = inet_sendmsg,
        .recvmsg           = inet_recvmsg,
        .mmap                   = sock_no_mmap,
        .splice_eof           = inet_splice_eof,
#ifdef CONFIG_COMPAT
        .compat_ioctl           = inet_compat_ioctl,
#endif
};

/*
 * Address-family descriptor for PF_INET: names inet_create() as the
 * socket-creation callback.  Handed to sock_register() from inet_init().
 */
static const struct net_proto_family inet_family_ops = {
        .family = PF_INET,
        .create = inet_create,
        .owner        = THIS_MODULE,
};

/* Built-in (type, protocol) -> (struct proto, proto_ops) bindings.
 *
 * Upon startup inet_init() passes every element of inetsw_array[] to
 * inet_register_protosw(), which links it into the inetsw[] list selected
 * by the entry's .type.
 */
static struct inet_protosw inetsw_array[] =
{
        /* TCP: permanent entry (cannot be overridden or unregistered) */
        {
                .type =       SOCK_STREAM,
                .protocol =   IPPROTO_TCP,
                .prot =       &tcp_prot,
                .ops =        &inet_stream_ops,
                .flags =      INET_PROTOSW_PERMANENT |
                              INET_PROTOSW_ICSK,
        },

        /* UDP: permanent entry */
        {
                .type =       SOCK_DGRAM,
                .protocol =   IPPROTO_UDP,
                .prot =       &udp_prot,
                .ops =        &inet_dgram_ops,
                .flags =      INET_PROTOSW_PERMANENT,
       },

       /* ICMP datagram (ping) sockets: share the raw-socket proto_ops */
       {
                .type =       SOCK_DGRAM,
                .protocol =   IPPROTO_ICMP,
                .prot =       &ping_prot,
                .ops =        &inet_sockraw_ops,
                .flags =      INET_PROTOSW_REUSE,
       },

       /* Raw sockets: IPPROTO_IP acts as the wild-card protocol */
       {
               .type =       SOCK_RAW,
               .protocol =   IPPROTO_IP,        /* wild card */
               .prot =       &raw_prot,
               .ops =        &inet_sockraw_ops,
               .flags =      INET_PROTOSW_REUSE,
       }
};

/* Number of built-in entries in inetsw_array[] */
#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)

void inet_register_protosw(struct inet_protosw *p)
{
        struct list_head *lh;
        struct inet_protosw *answer;
        int protocol = p->protocol;
        struct list_head *last_perm;

        spin_lock_bh(&inetsw_lock);

        if (p->type >= SOCK_MAX)
                goto out_illegal;

        /* If we are trying to override a permanent protocol, bail. */
        last_perm = &inetsw[p->type];
        list_for_each(lh, &inetsw[p->type]) {
                answer = list_entry(lh, struct inet_protosw, list);
                /* Check only the non-wild match. */
                if ((INET_PROTOSW_PERMANENT & answer->flags) == 0)
                        break;
                if (protocol == answer->protocol)
                        goto out_permanent;
                last_perm = lh;
        }

        /* Add the new entry after the last permanent entry if any, so that
         * the new entry does not override a permanent entry when matched with
         * a wild-card protocol. But it is allowed to override any existing
         * non-permanent entry.  This means that when we remove this entry, the
         * system automatically returns to the old behavior.
         */
        list_add_rcu(&p->list, last_perm);
out:
        spin_unlock_bh(&inetsw_lock);

        return;

out_permanent:
        pr_err("Attempt to override permanent protocol %d\n", protocol);
        goto out;

out_illegal:
        pr_err("Ignoring attempt to register invalid socket type %d\n",
               p->type);
        goto out;
}
EXPORT_SYMBOL(inet_register_protosw);

/*
 * Remove @p from its inetsw[] list.
 *
 * Entries flagged INET_PROTOSW_PERMANENT are never removed; the attempt is
 * only logged.  Otherwise the entry is unlinked under inetsw_lock and
 * synchronize_net() is called before returning.
 */
void inet_unregister_protosw(struct inet_protosw *p)
{
        if (p->flags & INET_PROTOSW_PERMANENT) {
                pr_err("Attempt to unregister permanent protocol %d\n",
                       p->protocol);
                return;
        }

        spin_lock_bh(&inetsw_lock);
        list_del_rcu(&p->list);
        spin_unlock_bh(&inetsw_lock);

        synchronize_net();
}
EXPORT_SYMBOL(inet_unregister_protosw);

/*
 * Redo the connect-time route lookup for @sk and, if it selects a different
 * source address, switch the socket over to it.
 *
 * The inet_opt dereference is annotated with lockdep_sock_is_held(), so the
 * caller is expected to hold the socket lock.
 *
 * Returns 0 on success, or a negative errno from ip_route_connect(),
 * inet_bhash2_update_saddr() or __sk_prot_rehash().
 */
static int inet_sk_reselect_saddr(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        __be32 old_saddr = inet->inet_saddr;
        __be32 daddr = inet->inet_daddr;
        struct flowi4 *fl4;
        struct rtable *rt;
        __be32 new_saddr;
        struct ip_options_rcu *inet_opt;
        int err;

        inet_opt = rcu_dereference_protected(inet->inet_opt,
                                             lockdep_sock_is_held(sk));
        /* With a source-route option set, look up opt.faddr instead of daddr */
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;

        /* Query new route. */
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, daddr, 0, sk->sk_bound_dev_if,
                              sk->sk_protocol, inet->inet_sport,
                              inet->inet_dport, sk);
        if (IS_ERR(rt))
                return PTR_ERR(rt);

        /* Source address the lookup left in the flow key */
        new_saddr = fl4->saddr;

        /* Same address as before: just install the new route on the socket */
        if (new_saddr == old_saddr) {
                sk_setup_caps(sk, &rt->dst);
                return 0;
        }

        err = inet_bhash2_update_saddr(sk, &new_saddr, AF_INET);
        if (err) {
                /* The route was not attached to the socket; drop it here */
                ip_rt_put(rt);
                return err;
        }

        sk_setup_caps(sk, &rt->dst);

        /* Only log the address change when ip_dynaddr is set above 1 */
        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) {
                pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
                        __func__, &old_saddr, &new_saddr);
        }

        /*
         * XXX The only one ugly spot where we need to
         * XXX really change the sockets identity after
         * XXX it has entered the hashes. -DaveM
         *
         * Besides that, it does not check for connection
         * uniqueness. Wait for troubles.
         */
        return __sk_prot_rehash(sk);
}

/*
 * Make sure @sk has a usable route.
 *
 * If __sk_dst_check() still returns a route, nothing is done.  Otherwise a
 * new output route is looked up from the socket's addresses and ports and
 * installed with sk_setup_caps().
 *
 * Returns 0 on success or a negative errno.  On lookup failure
 * sk_route_caps is cleared; see the comment in the failure branch for when
 * a source-address reselect is attempted and when sk_err_soft is written.
 */
int inet_sk_rebuild_header(struct sock *sk)
{
        struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0));
        struct inet_sock *inet = inet_sk(sk);
        __be32 daddr;
        struct ip_options_rcu *inet_opt;
        struct flowi4 *fl4;
        int err;

        /* Route is OK, nothing to do. */
        if (rt)
                return 0;

        /* Reroute. */
        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        daddr = inet->inet_daddr;
        /* With a source-route option set, look up opt.faddr instead of daddr */
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        rcu_read_unlock();
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
                                   inet->inet_dport, inet->inet_sport,
                                   sk->sk_protocol, ip_sock_rt_tos(sk),
                                   sk->sk_bound_dev_if);
        if (!IS_ERR(rt)) {
                err = 0;
                sk_setup_caps(sk, &rt->dst);
        } else {
                err = PTR_ERR(rt);

                /* Routing failed... */
                sk->sk_route_caps = 0;
                /*
                 * Other protocols have to map its equivalent state to TCP_SYN_SENT.
                 * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
                 */
                /*
                 * The chain short-circuits: inet_sk_reselect_saddr() is only
                 * called when ip_dynaddr is non-zero, the socket is in
                 * TCP_SYN_SENT and the bind address is not locked.  Its
                 * result replaces err; sk_err_soft is written in every case
                 * except a successful reselect.
                 */
                if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) ||
                    sk->sk_state != TCP_SYN_SENT ||
                    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
                    (err = inet_sk_reselect_saddr(sk)) != 0)
                        WRITE_ONCE(sk->sk_err_soft, -err);
        }

        return err;
}
EXPORT_SYMBOL(inet_sk_rebuild_header);

/*
 * Change sk->sk_state to @state with a plain store, emitting the
 * inet_sock_set_state tracepoint (old state, new state) first.
 */
void inet_sk_set_state(struct sock *sk, int state)
{
        trace_inet_sock_set_state(sk, sk->sk_state, state);
        sk->sk_state = state;
}
EXPORT_SYMBOL(inet_sk_set_state);

/*
 * Like inet_sk_set_state(), but the new state is published with
 * smp_store_release() instead of a plain store.  The tracepoint is emitted
 * before the store.
 */
void inet_sk_state_store(struct sock *sk, int newstate)
{
        trace_inet_sock_set_state(sk, sk->sk_state, newstate);
        smp_store_release(&sk->sk_state, newstate);
}

/*
 * GSO callback for IPv4: segment @skb through the inner protocol's
 * gso_segment hook and then fix up the IPv4 header of every segment.
 *
 * Returns the segment list from the inner handler (possibly NULL), or an
 * ERR_PTR: -EINVAL for a malformed header or a fixed-ID request without DF,
 * -EPROTONOSUPPORT when the protocol has no gso_segment handler.
 */
struct sk_buff *inet_gso_segment(struct sk_buff *skb,
                                 netdev_features_t features)
{
        bool udpfrag = false, fixedid = false, gso_partial, encap;
        struct sk_buff *segs = ERR_PTR(-EINVAL);
        const struct net_offload *ops;
        unsigned int offset = 0;
        struct iphdr *iph;
        int proto, tot_len;
        int nhoff;
        int ihl;
        int id;

        skb_reset_network_header(skb);
        /* Offset of the IPv4 header from the MAC header; reused per segment */
        nhoff = skb_network_header(skb) - skb_mac_header(skb);
        if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
                goto out;

        iph = ip_hdr(skb);
        /* Header length in bytes; reject anything shorter than struct iphdr */
        ihl = iph->ihl * 4;
        if (ihl < sizeof(*iph))
                goto out;

        /* Copy what we need now, the header pointer may go stale below */
        id = ntohs(iph->id);
        proto = iph->protocol;

        /* Warning: after this point, iph might be no longer valid */
        if (unlikely(!pskb_may_pull(skb, ihl)))
                goto out;
        __skb_pull(skb, ihl);

        /* A non-zero encap_level means an outer layer was already processed */
        encap = SKB_GSO_CB(skb)->encap_level > 0;
        if (encap)
                features &= skb->dev->hw_enc_features;
        SKB_GSO_CB(skb)->encap_level += ihl;

        skb_reset_transport_header(skb);

        segs = ERR_PTR(-EPROTONOSUPPORT);

        if (!skb->encapsulation || encap) {
                udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
                fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);

                /* fixed ID is invalid if DF bit is not set */
                if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF)))
                        goto out;
        }

        ops = rcu_dereference(inet_offloads[proto]);
        if (likely(ops && ops->callbacks.gso_segment)) {
                segs = ops->callbacks.gso_segment(skb, features);
                /* NULL result: point network_header back at the IPv4 header */
                if (!segs)
                        skb->network_header = skb_mac_header(skb) + nhoff - skb->head;
        }

        if (IS_ERR_OR_NULL(segs))
                goto out;

        gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);

        /* Rewrite the IPv4 header of each segment in the returned list */
        skb = segs;
        do {
                iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
                if (udpfrag) {
                        /* Emit IP fragments: offset in 8-byte units, MF on all but last */
                        iph->frag_off = htons(offset >> 3);
                        if (skb->next)
                                iph->frag_off |= htons(IP_MF);
                        offset += skb->len - nhoff - ihl;
                        tot_len = skb->len - nhoff;
                } else if (skb_is_gso(skb)) {
                        /* Segment is itself still GSO: advance id by its gso_segs */
                        if (!fixedid) {
                                iph->id = htons(id);
                                id += skb_shinfo(skb)->gso_segs;
                        }

                        if (gso_partial)
                                tot_len = skb_shinfo(skb)->gso_size +
                                          SKB_GSO_CB(skb)->data_offset +
                                          skb->head - (unsigned char *)iph;
                        else
                                tot_len = skb->len - nhoff;
                } else {
                        /* Plain segment: consecutive IP IDs unless the ID is fixed */
                        if (!fixedid)
                                iph->id = htons(id++);
                        tot_len = skb->len - nhoff;
                }
                iph->tot_len = htons(tot_len);
                ip_send_check(iph);
                if (encap)
                        skb_reset_inner_headers(skb);
                skb->network_header = (u8 *)iph - skb->head;
                skb_reset_mac_len(skb);
        } while ((skb = skb->next));

out:
        return segs;
}

/*
 * GSO callback registered for IPPROTO_IPIP: hand the packet to
 * inet_gso_segment() only when its gso_type carries SKB_GSO_IPXIP4,
 * otherwise fail with -EINVAL.
 */
static struct sk_buff *ipip_gso_segment(struct sk_buff *skb,
                                        netdev_features_t features)
{
        if (skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4)
                return inet_gso_segment(skb, features);

        return ERR_PTR(-EINVAL);
}

/*
 * GRO receive callback for IPv4.
 *
 * Validates the IPv4 header at the current GRO offset, clears same_flow on
 * held packets in @head whose protocol or addresses differ, then pulls the
 * header and hands the packet to the transport protocol's gro_receive hook.
 *
 * Returns whatever the transport hook returns, or NULL when the packet was
 * not passed down (in which case flush stays set to 1).
 */
struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
{
        const struct net_offload *ops;
        struct sk_buff *pp = NULL;
        const struct iphdr *iph;
        struct sk_buff *p;
        unsigned int hlen;
        unsigned int off;
        int flush = 1;
        int proto;

        off = skb_gro_offset(skb);
        hlen = off + sizeof(*iph);
        iph = skb_gro_header(skb, hlen, off);
        if (unlikely(!iph))
                goto out;

        proto = iph->protocol;

        ops = rcu_dereference(inet_offloads[proto]);
        if (!ops || !ops->callbacks.gro_receive)
                goto out;

        /* First byte must be 0x45: version 4, 5-word header (no options) */
        if (*(u8 *)iph != 0x45)
                goto out;

        if (ip_is_fragment(iph))
                goto out;

        /* Header checksum over the 5 header words must verify */
        if (unlikely(ip_fast_csum((u8 *)iph, 5)))
                goto out;

        NAPI_GRO_CB(skb)->proto = proto;
        /*
         * Low 16 bits of the first header word hold tot_len, low 16 bits of
         * the word at ->id hold frag_off.  flush becomes non-zero when
         * tot_len differs from the GRO length or any frag_off bit other
         * than DF is set.
         */
        flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (ntohl(*(__be32 *)&iph->id) & ~IP_DF));

        list_for_each_entry(p, head, list) {
                struct iphdr *iph2;

                if (!NAPI_GRO_CB(p)->same_flow)
                        continue;

                iph2 = (struct iphdr *)(p->data + off);
                /* The above works because, with the exception of the top
                 * (inner most) layer, we only aggregate pkts with the same
                 * hdr length so all the hdrs we'll need to verify will start
                 * at the same offset.
                 */
                if ((iph->protocol ^ iph2->protocol) |
                    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
                    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
                        NAPI_GRO_CB(p)->same_flow = 0;
                        continue;
                }
        }

        NAPI_GRO_CB(skb)->flush |= flush;
        /* Record this header's offset in the slot selected by encap_mark */
        NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off;

        /* Note : No need to call skb_gro_postpull_rcsum() here,
         * as we already checked checksum over ipv4 header was 0
         */
        skb_gro_pull(skb, sizeof(*iph));
        skb_set_transport_header(skb, skb_gro_offset(skb));

        pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive,
                                       ops->callbacks.gro_receive, head, skb);

out:
        skb_gro_flush_final(skb, pp, flush);

        return pp;
}

/*
 * GRO receive callback registered for IPPROTO_IPIP.
 *
 * If encap_mark is already set on @skb the packet is marked for flushing
 * and NULL is returned; otherwise encap_mark is set and the packet goes
 * through inet_gro_receive().
 */
static struct sk_buff *ipip_gro_receive(struct list_head *head,
                                        struct sk_buff *skb)
{
        if (!NAPI_GRO_CB(skb)->encap_mark) {
                NAPI_GRO_CB(skb)->encap_mark = 1;
                return inet_gro_receive(head, skb);
        }

        NAPI_GRO_CB(skb)->flush = 1;
        return NULL;
}

#define SECONDS_PER_DAY        86400

/* inet_current_timestamp - Return IP network timestamp
 *
 * Return milliseconds since midnight in network byte order.
 */
__be32 inet_current_timestamp(void)
{
        u32 secs;
        u32 msecs;
        struct timespec64 ts;

        ktime_get_real_ts64(&ts);

        /* Get secs since midnight. */
        (void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs);
        /* Convert to msecs. */
        msecs = secs * MSEC_PER_SEC;
        /* Convert nsec to msec. */
        msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC;

        /* Convert to network byte order. */
        return htonl(msecs);
}
EXPORT_SYMBOL(inet_current_timestamp);

/*
 * Dispatch an error-queue receive according to the socket's address family:
 * AF_INET goes to ip_recv_error(); AF_INET6 (only when IPv6 is enabled)
 * goes through pingv6_ops.ipv6_recv_error().  Any other family yields
 * -EINVAL.
 */
int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
{
        switch (READ_ONCE(sk->sk_family)) {
        case AF_INET:
                return ip_recv_error(sk, msg, len, addr_len);
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len);
#endif
        default:
                return -EINVAL;
        }
}
EXPORT_SYMBOL(inet_recv_error);

/*
 * GRO completion callback for IPv4.
 *
 * Rewrites the total length of the IPv4 header at @nhoff to cover the whole
 * merged packet, patches the header checksum for that change, and then runs
 * the transport protocol's gro_complete hook on the header that follows.
 *
 * Returns the hook's result, or -ENOSYS (with a warning) when the protocol
 * has no gro_complete handler.
 */
int inet_gro_complete(struct sk_buff *skb, int nhoff)
{
        struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
        const __be16 old_totlen = iph->tot_len;
        const int proto = iph->protocol;
        const struct net_offload *ops;

        if (skb->encapsulation) {
                skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP));
                skb_set_inner_network_header(skb, nhoff);
        }

        /* New length first, then fold the old->new change into the checksum */
        iph_set_totlen(iph, skb->len - nhoff);
        csum_replace2(&iph->check, old_totlen, iph->tot_len);

        ops = rcu_dereference(inet_offloads[proto]);
        if (WARN_ON(!ops || !ops->callbacks.gro_complete))
                return -ENOSYS;

        /* Only need to add sizeof(*iph) to get to the next hdr below
         * because any hdr with option will have been flushed in
         * inet_gro_receive().
         */
        return INDIRECT_CALL_2(ops->callbacks.gro_complete,
                               tcp4_gro_complete, udp4_gro_complete,
                               skb, nhoff + sizeof(*iph));
}

/*
 * GRO completion callback registered for IPPROTO_IPIP: mark @skb as
 * encapsulated, add SKB_GSO_IPXIP4 to its gso_type, then finish through
 * inet_gro_complete().  The encapsulation flag must be set first because
 * inet_gro_complete() tests it.
 */
static int ipip_gro_complete(struct sk_buff *skb, int nhoff)
{
        skb->encapsulation = 1;
        skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
        return inet_gro_complete(skb, nhoff);
}

/*
 * Create a kernel socket via sock_create_kern() and store its struct sock
 * in *@sk.
 *
 * On success the sock is switched to GFP_ATOMIC allocations, has
 * sk_use_task_frag cleared and is unhashed.  Returns 0 on success; on
 * failure the sock_create_kern() error is returned and *@sk is untouched.
 */
int inet_ctl_sock_create(struct sock **sk, unsigned short family,
                         unsigned short type, unsigned char protocol,
                         struct net *net)
{
        struct socket *sock;
        struct sock *ctl;
        int rc;

        rc = sock_create_kern(net, family, type, protocol, &sock);
        if (rc != 0)
                return rc;

        ctl = sock->sk;
        *sk = ctl;
        ctl->sk_allocation = GFP_ATOMIC;
        ctl->sk_use_task_frag = false;
        /*
         * Unhash it so that IP input processing does not even see it,
         * we do not wish this socket to see incoming packets.
         */
        ctl->sk_prot->unhash(ctl);

        return 0;
}
EXPORT_SYMBOL_GPL(inet_ctl_sock_create);

/*
 * Sum the counter at index @offt of the per-cpu MIB @mib over every
 * possible CPU and return the total.
 */
unsigned long snmp_fold_field(void __percpu *mib, int offt)
{
        unsigned long total = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                total += snmp_get_cpu_field(mib, cpu, offt);
        }

        return total;
}
EXPORT_SYMBOL_GPL(snmp_fold_field);

#if BITS_PER_LONG==32

/*
 * Read the 64-bit counter at index @offt from @cpu's copy of the per-cpu
 * MIB @mib.  The u64_stats_sync object located @syncp_offset bytes into the
 * per-cpu block guards the read: it is repeated until
 * u64_stats_fetch_retry() reports no retry is needed.
 */
u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt,
                         size_t syncp_offset)
{
        void *base = per_cpu_ptr(mib, cpu);
        struct u64_stats_sync *sync = (struct u64_stats_sync *)(base + syncp_offset);
        u64 *counters = (u64 *)base;
        unsigned int seq;
        u64 val;

        do {
                seq = u64_stats_fetch_begin(sync);
                val = counters[offt];
        } while (u64_stats_fetch_retry(sync, seq));

        return val;
}
EXPORT_SYMBOL_GPL(snmp_get_cpu_field64);

/*
 * 64-bit counterpart of snmp_fold_field(): add up the counter at index
 * @offt across every possible CPU, reading each per-cpu value through
 * snmp_get_cpu_field64().
 */
u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset)
{
        u64 total = 0;
        int c;

        for_each_possible_cpu(c)
                total += snmp_get_cpu_field64(mib, c, offt, syncp_offset);

        return total;
}
EXPORT_SYMBOL_GPL(snmp_fold_field64);
#endif

#ifdef CONFIG_IP_MULTICAST
/* IGMP receive hook; registered for IPPROTO_IGMP in inet_init(). */
static const struct net_protocol igmp_protocol = {
        .handler =        igmp_rcv,
};
#endif

/*
 * ICMP receive and error hooks; registered for IPPROTO_ICMP in inet_init().
 * no_policy is set for this protocol.
 */
static const struct net_protocol icmp_protocol = {
        .handler =        icmp_rcv,
        .err_handler =        icmp_err,
        .no_policy =        1,
};

/*
 * Per-namespace init: allocate the IPv4 statistics (MIB) blocks of @net.
 *
 * Every block is per-cpu except icmpmsg_statistics, which is a single
 * zeroed allocation.  If any allocation fails, everything allocated so far
 * is freed in reverse order and -ENOMEM is returned.  Returns 0 on success.
 */
static __net_init int ipv4_mib_init_net(struct net *net)
{
        int i;

        net->mib.tcp_statistics = alloc_percpu(struct tcp_mib);
        if (!net->mib.tcp_statistics)
                goto err_tcp_mib;
        net->mib.ip_statistics = alloc_percpu(struct ipstats_mib);
        if (!net->mib.ip_statistics)
                goto err_ip_mib;

        /* Initialise the u64_stats sync object in each CPU's IP MIB copy */
        for_each_possible_cpu(i) {
                struct ipstats_mib *af_inet_stats;
                af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i);
                u64_stats_init(&af_inet_stats->syncp);
        }

        net->mib.net_statistics = alloc_percpu(struct linux_mib);
        if (!net->mib.net_statistics)
                goto err_net_mib;
        net->mib.udp_statistics = alloc_percpu(struct udp_mib);
        if (!net->mib.udp_statistics)
                goto err_udp_mib;
        net->mib.udplite_statistics = alloc_percpu(struct udp_mib);
        if (!net->mib.udplite_statistics)
                goto err_udplite_mib;
        net->mib.icmp_statistics = alloc_percpu(struct icmp_mib);
        if (!net->mib.icmp_statistics)
                goto err_icmp_mib;
        net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
                                              GFP_KERNEL);
        if (!net->mib.icmpmsg_statistics)
                goto err_icmpmsg_mib;

        tcp_mib_init(net);
        return 0;

        /* Unwind ladder: each label frees the block allocated just before
         * the allocation that failed, then falls through to earlier ones.
         */
err_icmpmsg_mib:
        free_percpu(net->mib.icmp_statistics);
err_icmp_mib:
        free_percpu(net->mib.udplite_statistics);
err_udplite_mib:
        free_percpu(net->mib.udp_statistics);
err_udp_mib:
        free_percpu(net->mib.net_statistics);
err_net_mib:
        free_percpu(net->mib.ip_statistics);
err_ip_mib:
        free_percpu(net->mib.tcp_statistics);
err_tcp_mib:
        return -ENOMEM;
}

/*
 * Per-namespace exit: free every statistics block set up by
 * ipv4_mib_init_net(), in reverse allocation order, plus the MPTCP block
 * when CONFIG_MPTCP is enabled.
 */
static __net_exit void ipv4_mib_exit_net(struct net *net)
{
        kfree(net->mib.icmpmsg_statistics);
        free_percpu(net->mib.icmp_statistics);
        free_percpu(net->mib.udplite_statistics);
        free_percpu(net->mib.udp_statistics);
        free_percpu(net->mib.net_statistics);
        free_percpu(net->mib.ip_statistics);
        free_percpu(net->mib.tcp_statistics);
#ifdef CONFIG_MPTCP
        /* allocated on demand, see mptcp_init_sock() */
        free_percpu(net->mib.mptcp_statistics);
#endif
}

/* Per-namespace hooks that allocate and free the IPv4 MIB blocks. */
static __net_initdata struct pernet_operations ipv4_mib_ops = {
        .init = ipv4_mib_init_net,
        .exit = ipv4_mib_exit_net,
};

/*
 * Register ipv4_mib_ops as a pernet subsystem.  Returns the result of
 * register_pernet_subsys(); inet_init() panics on a non-zero value.
 */
static int __init init_ipv4_mibs(void)
{
        return register_pernet_subsys(&ipv4_mib_ops);
}

/*
 * Per-namespace init: set the default values of the IPv4 tunables kept in
 * net->ipv4.  Always returns 0.
 */
static __net_init int inet_init_net(struct net *net)
{
        /*
         * Set defaults for local port range
         */
        /* Packed into one word: 60999 in the upper 16 bits, 32768 in the lower */
        net->ipv4.ip_local_ports.range = 60999u << 16 | 32768u;

        seqlock_init(&net->ipv4.ping_group_range.lock);
        /*
         * Sane defaults - nobody may create ping sockets.
         * Boot scripts should set this to distro-specific group.
         */
        /* range[0] = gid 1 is above range[1] = gid 0 */
        net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1);
        net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0);

        /* Default values for sysctl-controlled parameters.
         * We set them here, in case sysctl is not compiled.
         */
        net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
        net->ipv4.sysctl_ip_fwd_update_priority = 1;
        net->ipv4.sysctl_ip_dynaddr = 0;
        net->ipv4.sysctl_ip_early_demux = 1;
        net->ipv4.sysctl_udp_early_demux = 1;
        net->ipv4.sysctl_tcp_early_demux = 1;
        net->ipv4.sysctl_nexthop_compat_mode = 1;
#ifdef CONFIG_SYSCTL
        net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
#endif

        /* Some igmp sysctl, whose values are always used */
        net->ipv4.sysctl_igmp_max_memberships = 20;
        net->ipv4.sysctl_igmp_max_msf = 10;
        /* IGMP reports for link-local multicast groups are enabled by default */
        net->ipv4.sysctl_igmp_llm_reports = 1;
        net->ipv4.sysctl_igmp_qrv = 2;

        net->ipv4.sysctl_fib_notify_on_flag_change = 0;

        return 0;
}

/* Per-namespace hook applying the IPv4 defaults; it has no exit handler. */
static __net_initdata struct pernet_operations af_inet_ops = {
        .init = inet_init_net,
};

/*
 * Register af_inet_ops as a pernet subsystem and return the result of
 * register_pernet_subsys().
 */
static int __init init_inet_pernet_ops(void)
{
        return register_pernet_subsys(&af_inet_ops);
}

static int ipv4_proc_init(void);

/*
 *        IP protocol layer initialiser
 */


/* GSO/GRO callbacks for IPPROTO_IPIP; installed by ipip_offload_init(). */
static const struct net_offload ipip_offload = {
        .callbacks = {
                .gso_segment        = ipip_gso_segment,
                .gro_receive        = ipip_gro_receive,
                .gro_complete        = ipip_gro_complete,
        },
};

/*
 * Install ipip_offload for IPPROTO_IPIP and return the result of
 * inet_add_offload() (the caller treats a negative value as failure).
 */
static int __init ipip_offload_init(void)
{
        return inet_add_offload(&ipip_offload, IPPROTO_IPIP);
}

/*
 * Initcall that sets up the IPv4 offload handlers: the per-protocol
 * UDP, TCP and IPIP offloads, then the ETH_P_IP packet offload whose
 * callbacks are the inet_gso_segment/inet_gro_* functions of this file.
 *
 * A failing per-protocol registration is only logged; the function always
 * returns 0.
 */
static int __init ipv4_offload_init(void)
{
        /*
         * Add offloads
         */
        if (udpv4_offload_init() < 0)
                pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
        if (tcpv4_offload_init() < 0)
                pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
        if (ipip_offload_init() < 0)
                pr_crit("%s: Cannot add IPIP protocol offload\n", __func__);

        /* The packet offload object lives in net_hotdata and is filled in here */
        net_hotdata.ip_packet_offload = (struct packet_offload) {
                .type = cpu_to_be16(ETH_P_IP),
                .callbacks = {
                        .gso_segment = inet_gso_segment,
                        .gro_receive = inet_gro_receive,
                        .gro_complete = inet_gro_complete,
                },
        };
        dev_add_offload(&net_hotdata.ip_packet_offload);
        return 0;
}

fs_initcall(ipv4_offload_init);

/*
 * Packet handler for ETH_P_IP frames: ip_rcv() as the per-packet hook and
 * ip_list_rcv() as the list hook.  Added with dev_add_pack() near the end
 * of inet_init().
 */
static struct packet_type ip_packet_type __read_mostly = {
        .type = cpu_to_be16(ETH_P_IP),
        .func = ip_rcv,
        .list_func = ip_list_rcv,
};

/*
 * Initcall that brings up the IPv4 stack.
 *
 * Registers the TCP, UDP, raw and ping struct protos, the PF_INET socket
 * family, the base L4 protocol handlers and the inetsw[] tables, then
 * initialises the individual subsystems (ARP, IP, MIBs, TCP, UDP, UDP-Lite,
 * raw, ping, ICMP, multicast routing, per-net defaults, procfs, fragments)
 * and finally adds the ETH_P_IP packet handler.
 *
 * Only the four proto_register() calls can make this function fail; their
 * error is returned after unregistering the protos registered so far.
 * Later failures are either logged with pr_crit() or cause a panic().
 */
static int __init inet_init(void)
{
        struct inet_protosw *q;
        struct list_head *r;
        int rc;

        sock_skb_cb_check_size(sizeof(struct inet_skb_parm));

        raw_hashinfo_init(&raw_v4_hashinfo);

        rc = proto_register(&tcp_prot, 1);
        if (rc)
                goto out;

        rc = proto_register(&udp_prot, 1);
        if (rc)
                goto out_unregister_tcp_proto;

        rc = proto_register(&raw_prot, 1);
        if (rc)
                goto out_unregister_udp_proto;

        rc = proto_register(&ping_prot, 1);
        if (rc)
                goto out_unregister_raw_proto;

        /*
         *        Tell SOCKET that we are alive...
         */

        /* The return value is deliberately discarded */
        (void)sock_register(&inet_family_ops);

#ifdef CONFIG_SYSCTL
        ip_static_sysctl_init();
#endif

        /*
         *        Add all the base protocols.
         */

        if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
                pr_crit("%s: Cannot add ICMP protocol\n", __func__);

        /* The UDP and TCP handler objects live in net_hotdata; fill them in here */
        net_hotdata.udp_protocol = (struct net_protocol) {
                .handler =        udp_rcv,
                .err_handler =        udp_err,
                .no_policy =        1,
        };
        if (inet_add_protocol(&net_hotdata.udp_protocol, IPPROTO_UDP) < 0)
                pr_crit("%s: Cannot add UDP protocol\n", __func__);

        net_hotdata.tcp_protocol = (struct net_protocol) {
                .handler        =        tcp_v4_rcv,
                .err_handler        =        tcp_v4_err,
                .no_policy        =        1,
                .icmp_strict_tag_validation = 1,
        };
        if (inet_add_protocol(&net_hotdata.tcp_protocol, IPPROTO_TCP) < 0)
                pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
        if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
                pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif

        /* Register the socket-side information for inet_create. */
        /* The list heads must be initialised before anything is linked in */
        for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
                INIT_LIST_HEAD(r);

        for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
                inet_register_protosw(q);

        /*
         *        Set the ARP module up
         */

        arp_init();

        /*
         *        Set the IP module up
         */

        ip_init();

        /* Initialise per-cpu ipv4 mibs */
        if (init_ipv4_mibs())
                panic("%s: Cannot init ipv4 mibs\n", __func__);

        /* Setup TCP slab cache for open requests. */
        tcp_init();

        /* Setup UDP memory threshold */
        udp_init();

        /* Add UDP-Lite (RFC 3828) */
        udplite4_register();

        raw_init();

        ping_init();

        /*
         *        Set the ICMP layer up
         */

        if (icmp_init() < 0)
                panic("Failed to create the ICMP control socket.\n");

        /*
         *        Initialise the multicast router
         */
#if defined(CONFIG_IP_MROUTE)
        if (ip_mr_init())
                pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif

        if (init_inet_pernet_ops())
                pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);

        /* The procfs setup result is not checked */
        ipv4_proc_init();

        ipfrag_init();

        dev_add_pack(&ip_packet_type);

        ip_tunnel_core_init();

        rc = 0;
out:
        return rc;
        /* Unwind for proto_register() failures, in reverse registration order */
out_unregister_raw_proto:
        proto_unregister(&raw_prot);
out_unregister_udp_proto:
        proto_unregister(&udp_prot);
out_unregister_tcp_proto:
        proto_unregister(&tcp_prot);
        goto out;
}

fs_initcall(inet_init);

/* ------------------------------------------------------------------------ */

#ifdef CONFIG_PROC_FS
/*
 * Create the IPv4 procfs entries: raw, TCP, UDP, ping and the misc set, in
 * that order.  If one step fails, the steps already completed are undone in
 * reverse order and -ENOMEM is returned.  Returns 0 when all succeed.
 */
static int __init ipv4_proc_init(void)
{
        if (raw_proc_init())
                goto err_raw;
        if (tcp4_proc_init())
                goto err_tcp;
        if (udp4_proc_init())
                goto err_udp;
        if (ping_proc_init())
                goto err_ping;
        if (ip_misc_proc_init())
                goto err_misc;

        return 0;

err_misc:
        ping_proc_exit();
err_ping:
        udp4_proc_exit();
err_udp:
        tcp4_proc_exit();
err_tcp:
        raw_proc_exit();
err_raw:
        return -ENOMEM;
}

#else /* CONFIG_PROC_FS */
/* Without CONFIG_PROC_FS there is nothing to create; report success. */
static int __init ipv4_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */


















































    1 

    1 
    1 











    1 




    1 



    1 


    1 









































































    1 











    1 

    1 




    1 




















    1 

    1 





















    1 












    1 








    1 





























    1 






    1 




    1 











    1 












    1 





    1 






















































    1 











    1 


    1 

    1 




    1 














    1 






    1 
    1 








    1 


    1 








    1 











































    1 




    1 
    1 
    1 

    1 













































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        IPv6 input
 *        Linux INET6 implementation
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 *        Ian P. Morris                <I.P.Morris@soton.ac.uk>
 *
 *        Based on linux/net/ipv4/ip_input.c
 */
/* Changes
 *
 *        Mitsuru KANDA @USAGI and
 *        YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs().
 */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/icmpv6.h>
#include <linux/mroute6.h>
#include <linux/slab.h>
#include <linux/indirect_call_wrapper.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>
#include <net/udp.h>

#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/transp_v6.h>
#include <net/rawv6.h>
#include <net/ndisc.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/xfrm.h>
#include <net/inet_ecn.h>
#include <net/dst_metadata.h>

/*
 * Per-packet core shared by ip6_rcv_finish() and ip6_list_rcv_finish().
 *
 * When sysctl_ip_early_demux is set and the packet has neither a dst nor a
 * socket attached, TCP and UDP packets are offered to their early-demux
 * routine, each gated by its own sysctl (all three are read from net->ipv4).
 * Afterwards ip6_route_input() is called unless the packet already carries a
 * valid dst.
 *
 * @sk is not used here.
 */
static void ip6_rcv_finish_core(struct net *net, struct sock *sk,
                                struct sk_buff *skb)
{
        if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
            !skb_dst(skb) && !skb->sk) {
                const int proto = ipv6_hdr(skb)->nexthdr;

                if (proto == IPPROTO_TCP) {
                        if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux))
                                tcp_v6_early_demux(skb);
                } else if (proto == IPPROTO_UDP) {
                        if (READ_ONCE(net->ipv4.sysctl_udp_early_demux))
                                udp_v6_early_demux(skb);
                }
        }

        /* Early demux may have attached a dst; only route if still needed. */
        if (skb_valid_dst(skb))
                return;

        ip6_route_input(skb);
}

/*
 * Finish receiving one packet: let an L3 master device handler see it,
 * run the common early-demux/route step, then pass it to dst_input().
 *
 * Returns NET_RX_SUCCESS when l3mdev_ip6_rcv() hands back no skb, otherwise
 * whatever dst_input() returns.
 */
int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        /* An ingress device enslaved to an L3 master device has the skb
         * processed by that master's handler first.
         */
        struct sk_buff *l3_skb = l3mdev_ip6_rcv(skb);

        if (l3_skb) {
                ip6_rcv_finish_core(net, sk, l3_skb);
                return dst_input(l3_skb);
        }

        return NET_RX_SUCCESS;
}

/*
 * Unlink every packet queued on @head and pass it to dst_input().
 * The value returned by dst_input() is not looked at.
 */
static void ip6_sublist_rcv_finish(struct list_head *head)
{
        struct sk_buff *pkt, *tmp;

        list_for_each_entry_safe(pkt, tmp, head, list) {
                skb_list_del_init(pkt);
                dst_input(pkt);
        }
}

/*
 * Decide whether the dst of @hint may be reused for @skb.
 *
 * That is the case when a hint exists, @skb has no dst yet, and both packets
 * carry the same IPv6 destination address.
 */
static bool ip6_can_use_hint(const struct sk_buff *skb,
                             const struct sk_buff *hint)
{
        bool usable = hint && !skb_dst(skb);

        /* Only look at the headers once we know @hint is non-NULL. */
        if (usable)
                usable = ipv6_addr_equal(&ipv6_hdr(hint)->daddr,
                                         &ipv6_hdr(skb)->daddr);

        return usable;
}

/*
 * Return @skb if it may serve as a route hint for later packets, or NULL.
 *
 * No hint is given when routes in @net require the source address, when
 * custom FIB rules are in use, or when the packet is flagged
 * IP6SKB_MULTIPATH.
 */
static struct sk_buff *ip6_extract_route_hint(const struct net *net,
                                              struct sk_buff *skb)
{
        if (fib6_routes_require_src(net))
                return NULL;
        if (fib6_has_custom_rules(net))
                return NULL;
        if (IP6CB(skb)->flags & IP6SKB_MULTIPATH)
                return NULL;

        return skb;
}

static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
                                struct list_head *head)
{
        struct sk_buff *skb, *next, *hint = NULL;
        struct dst_entry *curr_dst = NULL;
        struct list_head sublist;

        INIT_LIST_HEAD(&sublist);
        list_for_each_entry_safe(skb, next, head, list) {
                struct dst_entry *dst;

                skb_list_del_init(skb);
                /* if ingress device is enslaved to an L3 master device pass the
                 * skb to its handler for processing
                 */
                skb = l3mdev_ip6_rcv(skb);
                if (!skb)
                        continue;

                if (ip6_can_use_hint(skb, hint))
                        skb_dst_copy(skb, hint);
                else
                        ip6_rcv_finish_core(net, sk, skb);
                dst = skb_dst(skb);
                if (curr_dst != dst) {
                        hint = ip6_extract_route_hint(net, skb);

                        /* dispatch old sublist */
                        if (!list_empty(&sublist))
                                ip6_sublist_rcv_finish(&sublist);
                        /* start new sublist */
                        INIT_LIST_HEAD(&sublist);
                        curr_dst = dst;
                }
                list_add_tail(&skb->list, &sublist);
        }
        /* dispatch final sublist */
        ip6_sublist_rcv_finish(&sublist);
}

/*
 * Validate a received IPv6 packet before it is handed to the PRE_ROUTING hook
 * (see the callers ipv6_rcv() and ipv6_list_rcv()).
 *
 * @skb: the received packet
 * @dev: receiving device; its flags and ifindex are consulted
 * @net: network namespace used for the SNMP counters
 *
 * Returns the skb (possibly a different pointer, see skb_share_check()) when
 * every check passed, or NULL otherwise.  On the failure paths the skb is
 * released here with kfree_skb_reason(), except for the hop-by-hop parse
 * failure, which returns NULL without freeing.
 */
static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
                                    struct net *net)
{
        enum skb_drop_reason reason;
        const struct ipv6hdr *hdr;
        u32 pkt_len;
        struct inet6_dev *idev;

        /* Packets addressed to another host are counted and dropped. */
        if (skb->pkt_type == PACKET_OTHERHOST) {
                dev_core_stats_rx_otherhost_dropped_inc(skb->dev);
                kfree_skb_reason(skb, SKB_DROP_REASON_OTHERHOST);
                return NULL;
        }

        /* The RCU read-side section spans every use of idev below. */
        rcu_read_lock();

        idev = __in6_dev_get(skb->dev);

        __IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_IN, skb->len);

        SKB_DR_SET(reason, NOT_SPECIFIED);
        /* Drop when no skb comes back from skb_share_check(), when the device
         * has no inet6_dev, or when IPv6 is disabled on it.
         */
        if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL ||
            !idev || unlikely(READ_ONCE(idev->cnf.disable_ipv6))) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                if (idev && unlikely(READ_ONCE(idev->cnf.disable_ipv6)))
                        SKB_DR_SET(reason, IPV6DISABLED);
                goto drop;
        }

        memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));

        /*
         * Store incoming device index. When the packet will
         * be queued, we cannot refer to skb->dev anymore.
         *
         * BTW, when we send a packet for our own local address on a
         * non-loopback interface (e.g. ethX), it is being delivered
         * via the loopback interface (lo) here; skb->dev = loopback_dev.
         * It, however, should be considered as if it is being
         * arrived via the sending interface (ethX), because of the
         * nature of scoping architecture. --yoshfuji
         */
        IP6CB(skb)->iif = skb_valid_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex;

        /* The fixed IPv6 header must be accessible before it is read. */
        if (unlikely(!pskb_may_pull(skb, sizeof(*hdr))))
                goto err;

        hdr = ipv6_hdr(skb);

        if (hdr->version != 6) {
                SKB_DR_SET(reason, UNHANDLED_PROTO);
                goto err;
        }

        /* Account the packet under the counter selected by its ECN bits;
         * at least 1 is added, gso_segs when that is larger.
         */
        __IP6_ADD_STATS(net, idev,
                        IPSTATS_MIB_NOECTPKTS +
                                (ipv6_get_dsfield(hdr) & INET_ECN_MASK),
                        max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
        /*
         * RFC4291 2.5.3
         * The loopback address must not be used as the source address in IPv6
         * packets that are sent outside of a single node. [..]
         * A packet received on an interface with a destination address
         * of loopback must be dropped.
         */
        if ((ipv6_addr_loopback(&hdr->saddr) ||
             ipv6_addr_loopback(&hdr->daddr)) &&
            !(dev->flags & IFF_LOOPBACK) &&
            !netif_is_l3_master(dev))
                goto err;

        /* RFC4291 Errata ID: 3480
         * Interface-Local scope spans only a single interface on a
         * node and is useful only for loopback transmission of
         * multicast.  Packets with interface-local scope received
         * from another node must be discarded.
         */
        if (!(skb->pkt_type == PACKET_LOOPBACK ||
              dev->flags & IFF_LOOPBACK) &&
            ipv6_addr_is_multicast(&hdr->daddr) &&
            IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1)
                goto err;

        /* If enabled, drop unicast packets that were encapsulated in link-layer
         * multicast or broadcast to protect against the so-called "hole-196"
         * attack in 802.11 wireless.
         */
        if (!ipv6_addr_is_multicast(&hdr->daddr) &&
            (skb->pkt_type == PACKET_BROADCAST ||
             skb->pkt_type == PACKET_MULTICAST) &&
            READ_ONCE(idev->cnf.drop_unicast_in_l2_multicast)) {
                SKB_DR_SET(reason, UNICAST_IN_L2_MULTICAST);
                goto err;
        }

        /* RFC4291 2.7
         * Nodes must not originate a packet to a multicast address whose scope
         * field contains the reserved value 0; if such a packet is received, it
         * must be silently dropped.
         */
        if (ipv6_addr_is_multicast(&hdr->daddr) &&
            IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 0)
                goto err;

        /*
         * RFC4291 2.7
         * Multicast addresses must not be used as source addresses in IPv6
         * packets or appear in any Routing header.
         */
        if (ipv6_addr_is_multicast(&hdr->saddr))
                goto err;

        /* Transport header starts right after the fixed header for now;
         * nhoff records where the "next header" byte lives.
         */
        skb->transport_header = skb->network_header + sizeof(*hdr);
        IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);

        pkt_len = ntohs(hdr->payload_len);

        /* pkt_len may be zero if Jumbo payload option is present */
        if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
                /* The header claims more payload than the skb holds. */
                if (pkt_len + sizeof(struct ipv6hdr) > skb->len) {
                        __IP6_INC_STATS(net,
                                        idev, IPSTATS_MIB_INTRUNCATEDPKTS);
                        SKB_DR_SET(reason, PKT_TOO_SMALL);
                        goto drop;
                }
                /* Trim the skb to the length given by the header. */
                if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
                        goto err;
                /* hdr is re-read after the trim -- presumably because the
                 * header data may have moved; TODO confirm.
                 */
                hdr = ipv6_hdr(skb);
        }

        if (hdr->nexthdr == NEXTHDR_HOP) {
                if (ipv6_parse_hopopts(skb) < 0) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
                        rcu_read_unlock();
                        /* NOTE(review): the skb is not freed on this path --
                         * presumably ipv6_parse_hopopts() disposed of it;
                         * confirm against that helper.
                         */
                        return NULL;
                }
        }

        rcu_read_unlock();

        /* Must drop socket now because of tproxy. */
        if (!skb_sk_is_prefetched(skb))
                skb_orphan(skb);

        return skb;
err:
        /* Header errors: count them and make sure a drop reason is set. */
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
        SKB_DR_OR(reason, IP_INHDR);
drop:
        rcu_read_unlock();
        kfree_skb_reason(skb, reason);
        return NULL;
}

/*
 * Receive a single IPv6 packet.
 *
 * The packet is validated by ip6_rcv_core(); if that yields no skb,
 * NET_RX_DROP is returned.  Otherwise the packet goes through the
 * NF_INET_PRE_ROUTING hook with ip6_rcv_finish() as continuation and the
 * hook's result is returned.  @pt and @orig_dev are not used here.
 */
int ipv6_rcv(struct sk_buff *skb, struct net_device *dev,
             struct packet_type *pt, struct net_device *orig_dev)
{
        struct net *net = dev_net(skb->dev);
        struct sk_buff *checked = ip6_rcv_core(skb, dev, net);

        if (!checked)
                return NET_RX_DROP;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, net, NULL, checked,
                       dev, NULL, ip6_rcv_finish);
}

/*
 * Process a sublist of packets that share @dev and @net: run the
 * NF_INET_PRE_ROUTING hook over the list, then call ip6_list_rcv_finish()
 * on @head.
 */
static void ip6_sublist_rcv(struct list_head *head, struct net_device *dev,
                            struct net *net)
{
        /* NOTE(review): presumably @head holds only the packets the hook
         * let through once NF_HOOK_LIST() returns -- confirm in netfilter.h.
         */
        NF_HOOK_LIST(NFPROTO_IPV6, NF_INET_PRE_ROUTING, net, NULL,
                     head, dev, NULL, ip6_rcv_finish);
        ip6_list_rcv_finish(net, NULL, head);
}

/* Receive a list of IPv6 packets */
void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
                   struct net_device *orig_dev)
{
        struct net_device *curr_dev = NULL;
        struct net *curr_net = NULL;
        struct sk_buff *skb, *next;
        struct list_head sublist;

        INIT_LIST_HEAD(&sublist);
        list_for_each_entry_safe(skb, next, head, list) {
                struct net_device *dev = skb->dev;
                struct net *net = dev_net(dev);

                skb_list_del_init(skb);
                skb = ip6_rcv_core(skb, dev, net);
                if (skb == NULL)
                        continue;

                if (curr_dev != dev || curr_net != net) {
                        /* dispatch old sublist */
                        if (!list_empty(&sublist))
                                ip6_sublist_rcv(&sublist, curr_dev, curr_net);
                        /* start new sublist */
                        INIT_LIST_HEAD(&sublist);
                        curr_dev = dev;
                        curr_net = net;
                }
                list_add_tail(&skb->list, &sublist);
        }
        /* dispatch final sublist */
        if (!list_empty(&sublist))
                ip6_sublist_rcv(&sublist, curr_dev, curr_net);
}

/* tcp_v6_rcv() is named in the INDIRECT_CALL_2() in ip6_protocol_deliver_rcu(). */
INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *));

/*
 * Deliver the packet to the host.
 *
 * @net:        network namespace used for the SNMP counters and device lookup
 * @skb:        packet to deliver
 * @nexthdr:    protocol number to start with; only used as given when
 *              @have_final is true, otherwise it is re-read from the packet
 * @have_final: true when a protocol flagged INET6_PROTO_FINAL has already
 *              been seen for this packet
 *
 * For each protocol number in turn, raw6_local_deliver() is called and the
 * handler registered in inet6_protos[] is invoked.  A positive handler
 * return value makes the function loop: for a final protocol the value is
 * taken as the next protocol number, otherwise the next header is re-read
 * at IP6CB(skb)->nhoff.  When no handler is registered, the skb is either
 * consumed (raw6_local_deliver() returned true) or freed; in the freed case
 * an ICMPv6 "unknown next header" error is sent if the xfrm policy check
 * passes.
 *
 * inet6_protos[] is read with rcu_dereference(), so the caller is expected to
 * be inside an RCU read-side section (ip6_input_finish() takes
 * rcu_read_lock() around the call).
 */
void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
                              bool have_final)
{
        const struct inet6_protocol *ipprot;
        struct inet6_dev *idev;
        unsigned int nhoff;
        SKB_DR(reason);
        bool raw;

        /*
         *        Parse extension headers
         */

resubmit:
        idev = ip6_dst_idev(skb_dst(skb));
        nhoff = IP6CB(skb)->nhoff;
        if (!have_final) {
                /* Pull up to the transport header, then fetch the protocol
                 * number stored at offset nhoff of the network header.
                 */
                if (!pskb_pull(skb, skb_transport_offset(skb)))
                        goto discard;
                nexthdr = skb_network_header(skb)[nhoff];
        }

resubmit_final:
        raw = raw6_local_deliver(skb, nexthdr);
        ipprot = rcu_dereference(inet6_protos[nexthdr]);
        if (ipprot) {
                int ret;

                if (have_final) {
                        if (!(ipprot->flags & INET6_PROTO_FINAL)) {
                                /* Once we've seen a final protocol don't
                                 * allow encapsulation on any non-final
                                 * ones. This allows foo in UDP encapsulation
                                 * to work.
                                 */
                                goto discard;
                        }
                } else if (ipprot->flags & INET6_PROTO_FINAL) {
                        const struct ipv6hdr *hdr;
                        int sdif = inet6_sdif(skb);
                        struct net_device *dev;

                        /* Only do this once for first final protocol */
                        have_final = true;


                        /* Checksum adjustment for the network header bytes. */
                        skb_postpull_rcsum(skb, skb_network_header(skb),
                                           skb_network_header_len(skb));
                        hdr = ipv6_hdr(skb);

                        /* skb->dev passed may be master dev for vrfs. */
                        if (sdif) {
                                dev = dev_get_by_index_rcu(net, sdif);
                                if (!dev)
                                        goto discard;
                        } else {
                                dev = skb->dev;
                        }

                        /* A multicast destination must be one that dev
                         * accepts for this source, unless the packet is MLD.
                         */
                        if (ipv6_addr_is_multicast(&hdr->daddr) &&
                            !ipv6_chk_mcast_addr(dev, &hdr->daddr,
                                                 &hdr->saddr) &&
                            !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) {
                                SKB_DR_SET(reason, IP_INADDRERRORS);
                                goto discard;
                        }
                }
                /* Protocols without INET6_PROTO_NOPOLICY get an inbound xfrm
                 * policy check before their handler runs.
                 */
                if (!(ipprot->flags & INET6_PROTO_NOPOLICY)) {
                        if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                                SKB_DR_SET(reason, XFRM_POLICY);
                                goto discard;
                        }
                        nf_reset_ct(skb);
                }

                ret = INDIRECT_CALL_2(ipprot->handler, tcp_v6_rcv, udpv6_rcv,
                                      skb);
                if (ret > 0) {
                        if (ipprot->flags & INET6_PROTO_FINAL) {
                                /* Not an extension header, most likely UDP
                                 * encapsulation. Use return value as nexthdr
                                 * protocol not nhoff (which presumably is
                                 * not set by handler).
                                 */
                                nexthdr = ret;
                                goto resubmit_final;
                        } else {
                                goto resubmit;
                        }
                } else if (ret == 0) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS);
                }
                /* A negative return is not acted on here. */
        } else {
                if (!raw) {
                        /* No handler and not delivered to a raw socket either. */
                        if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                                __IP6_INC_STATS(net, idev,
                                                IPSTATS_MIB_INUNKNOWNPROTOS);
                                icmpv6_send(skb, ICMPV6_PARAMPROB,
                                            ICMPV6_UNK_NEXTHDR, nhoff);
                                SKB_DR_SET(reason, IP_NOPROTO);
                        } else {
                                SKB_DR_SET(reason, XFRM_POLICY);
                        }
                        kfree_skb_reason(skb, reason);
                } else {
                        /* Delivered through raw6_local_deliver() only. */
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS);
                        consume_skb(skb);
                }
        }
        return;

discard:
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
        kfree_skb_reason(skb, reason);
}

/*
 * Continuation that ip6_input() passes to the NF_INET_LOCAL_IN hook.
 *
 * Clears the skb's delivery time, then runs ip6_protocol_deliver_rcu()
 * inside an RCU read-side section, starting with nexthdr 0 and have_final
 * false.  @sk is not used.  Always returns 0.
 */
static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        skb_clear_delivery_time(skb);
        rcu_read_lock();
        ip6_protocol_deliver_rcu(net, skb, 0, false);
        rcu_read_unlock();

        return 0;
}


/*
 * Pass @skb through the NF_INET_LOCAL_IN hook, with skb->dev as input
 * device and ip6_input_finish() as continuation.  Returns the hook's result.
 */
int ip6_input(struct sk_buff *skb)
{
        struct net_device *in_dev = skb->dev;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, dev_net(in_dev), NULL,
                       skb, in_dev, NULL, ip6_input_finish);
}
EXPORT_SYMBOL_GPL(ip6_input);

/*
 * Handle a received packet on the multicast input path.
 *
 * Bumps the INMCAST counters, asks ipv6_chk_mcast_addr() whether the
 * destination should be delivered locally and, when CONFIG_IPV6_MROUTE is
 * enabled and multicast forwarding is on, hands the packet (or a clone of
 * it) to ip6_mr_input().
 *
 * Returns 0, or -ENODEV when inet6_sdif() names a device that cannot be
 * found; the skb is freed in that case.
 */
int ip6_mc_input(struct sk_buff *skb)
{
        int sdif = inet6_sdif(skb);
        const struct ipv6hdr *hdr;
        struct net_device *dev;
        bool deliver;

        __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
                         __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
                         skb->len);

        /* skb->dev passed may be master dev for vrfs. */
        if (sdif) {
                /* Held until dev has been used by ipv6_chk_mcast_addr(). */
                rcu_read_lock();
                dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif);
                if (!dev) {
                        rcu_read_unlock();
                        kfree_skb(skb);
                        return -ENODEV;
                }
        } else {
                dev = skb->dev;
        }

        hdr = ipv6_hdr(skb);
        deliver = ipv6_chk_mcast_addr(dev, &hdr->daddr, NULL);
        if (sdif)
                rcu_read_unlock();

#ifdef CONFIG_IPV6_MROUTE
        /*
         *      IPv6 multicast router mode is now supported ;)
         */
        /* Forward only when mc_forwarding is enabled, the destination is
         * neither loopback nor link-local, and the packet has not been
         * flagged as forwarded already.
         */
        if (atomic_read(&dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding) &&
            !(ipv6_addr_type(&hdr->daddr) &
              (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) &&
            likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) {
                /*
                 * Okay, we try to forward - split and duplicate
                 * packets.
                 */
                struct sk_buff *skb2;
                struct inet6_skb_parm *opt = IP6CB(skb);

                /* Check for MLD */
                if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                        /* Check if this is a mld message */
                        u8 nexthdr = hdr->nexthdr;
                        __be16 frag_off;
                        int offset;

                        /* Check if the value of Router Alert
                         * is for MLD (0x0000).
                         */
                        if (opt->ra == htons(IPV6_OPT_ROUTERALERT_MLD)) {
                                /* From here on the packet is never forwarded;
                                 * it is delivered locally only if it turns
                                 * out to be MLD.
                                 */
                                deliver = false;

                                if (!ipv6_ext_hdr(nexthdr)) {
                                        /* BUG */
                                        goto out;
                                }
                                offset = ipv6_skip_exthdr(skb, sizeof(*hdr),
                                                          &nexthdr, &frag_off);
                                if (offset < 0)
                                        goto out;

                                if (ipv6_is_mld(skb, nexthdr, offset))
                                        deliver = true;

                                goto out;
                        }
                        /* unknown RA - process it normally */
                }

                /* Local delivery still needs the original, so forward a
                 * clone; otherwise the original itself is forwarded and skb
                 * becomes NULL for the kfree_skb() at out:.
                 */
                if (deliver)
                        skb2 = skb_clone(skb, GFP_ATOMIC);
                else {
                        skb2 = skb;
                        skb = NULL;
                }

                /* skb2 is NULL only if the clone failed; forwarding is then
                 * skipped.
                 */
                if (skb2) {
                        ip6_mr_input(skb2);
                }
        }
out:
#endif
        if (likely(deliver))
                ip6_input(skb);
        else {
                /* discard */
                kfree_skb(skb);
        }

        return 0;
}



























    7 








    6 















    1 








    1 



















    5 


    4 











    4 























    3 
    4 



    9 
















    8 

    3 


    6 

















    1 





    1 



















































































































































































    1 

    1 




















    3 







    4 












    1 










    3 






















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_NOTIFY_H
#define _LINUX_FS_NOTIFY_H

/*
 * include/linux/fsnotify.h - generic hooks for filesystem notification, to
 * reduce in-source duplication from both dnotify and inotify.
 *
 * We don't compile any of this away in some complicated menagerie of ifdefs.
 * Instead, we rely on the code inside to optimize away as needed.
 *
 * (C) Copyright 2005 Robert Love
 */

#include <linux/fsnotify_backend.h>
#include <linux/audit.h>
#include <linux/slab.h>
#include <linux/bug.h>

/*
 * Are there any inode/mount/sb objects watched with priority prio or above?
 *
 * Returns false when the sb has no fsnotify_sb_info at all (no mark was ever
 * added to any object on this sb), otherwise whether the
 * watched_objects[prio] counter is non-zero.
 */
static inline bool fsnotify_sb_has_priority_watchers(struct super_block *sb,
                                                     int prio)
{
        struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);

        return sbinfo && atomic_long_read(&sbinfo->watched_objects[prio]) != 0;
}

/* Are there any inode/mount/sb objects that are being watched at all? */
static inline bool fsnotify_sb_has_watchers(struct super_block *sb)
{
        /* prio 0 -- presumably the lowest priority, so "0 or above" covers
         * every watcher; TODO confirm against the priority definitions.
         */
        return fsnotify_sb_has_priority_watchers(sb, 0);
}

/*
 * Tell the directory inode @dir that one of its child entries changed.
 * The entry may have become positive or negative, or its inode may have been
 * replaced (i.e. renamed over).
 *
 * In contrast to fsnotify_parent(), the event is reported whether or not the
 * parent inode has FS_EVENT_ON_CHILD in its mask, and it is not reported when
 * only the child, and not the parent, is interested.
 *
 * Returns 0 without calling fsnotify() when nothing on the sb of @dir is
 * watched, otherwise the result of fsnotify().
 */
static inline int fsnotify_name(__u32 mask, const void *data, int data_type,
                                struct inode *dir, const struct qstr *name,
                                u32 cookie)
{
        if (fsnotify_sb_has_watchers(dir->i_sb))
                return fsnotify(mask, data, data_type, dir, name, NULL,
                                cookie);

        return 0;
}

/*
 * Report @mask to @dir for the entry @dentry: calls fsnotify_name() with the
 * dentry as event data, dentry->d_name as the name and cookie 0.  The result
 * of fsnotify_name() is ignored.
 */
static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry,
                                   __u32 mask)
{
        fsnotify_name(mask, dentry, FSNOTIFY_EVENT_DENTRY, dir, &dentry->d_name, 0);
}

/*
 * Report @mask on @inode itself, with no parent directory and no name.
 * FS_ISDIR is added to the mask when the inode is a directory.  Nothing is
 * done when no object on the inode's sb is watched.
 */
static inline void fsnotify_inode(struct inode *inode, __u32 mask)
{
        if (fsnotify_sb_has_watchers(inode->i_sb)) {
                const __u32 dir_flag = S_ISDIR(inode->i_mode) ? FS_ISDIR : 0;

                fsnotify(mask | dir_flag, inode, FSNOTIFY_EVENT_INODE, NULL,
                         NULL, inode, 0);
        }
}

/*
 * Notify this dentry's parent about a child's events.
 *
 * Returns 0 when nothing on the sb is watched.  The event goes to the child
 * inode alone, through fsnotify(), when the dentry is a directory without
 * DCACHE_FSNOTIFY_PARENT_WATCHED or when it is a root (disconnected) dentry;
 * in every other case __fsnotify_parent() is called.  FS_ISDIR is added to
 * the mask for directories.
 */
static inline int fsnotify_parent(struct dentry *dentry, __u32 mask,
                                  const void *data, int data_type)
{
        struct inode *inode = d_inode(dentry);

        if (!fsnotify_sb_has_watchers(inode->i_sb))
                return 0;

        if (S_ISDIR(inode->i_mode)) {
                mask |= FS_ISDIR;

                /* sb/mount marks are not interested in name of directory */
                if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
                        return fsnotify(mask, data, data_type, NULL, NULL,
                                        inode, 0);
        }

        /* only a connected dentry can notify its parent */
        if (!IS_ROOT(dentry))
                return __fsnotify_parent(dentry, mask, data, data_type);

        return fsnotify(mask, data, data_type, NULL, NULL, inode, 0);
}

/*
 * Simple wrappers to consolidate calls to fsnotify_parent() when an event
 * is on a file/dentry.
 */
/*
 * Report @mask for @dentry via fsnotify_parent(), with the dentry itself as
 * the event data.  The return value of fsnotify_parent() is ignored.
 */
static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask)
{
        fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY);
}

/*
 * Report @mask for an open file via fsnotify_parent(), with the file's path
 * as the event data.
 *
 * Returns 0 without reporting when the file has FMODE_NONOTIFY set, or when
 * @mask contains a permission event and the sb has no watchers of priority
 * FSNOTIFY_PRIO_CONTENT or above.  Otherwise returns what fsnotify_parent()
 * returns.
 */
static inline int fsnotify_file(struct file *file, __u32 mask)
{
        const struct path *path = &file->f_path;

        if (file->f_mode & FMODE_NONOTIFY)
                return 0;

        /* Permission events require group prio >= FSNOTIFY_PRIO_CONTENT */
        if ((mask & ALL_FSNOTIFY_PERM_EVENTS) &&
            !fsnotify_sb_has_priority_watchers(path->dentry->d_sb,
                                               FSNOTIFY_PRIO_CONTENT))
                return 0;

        return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
}

#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
/*
 * fsnotify_file_area_perm - permission hook before access to file range
 *
 * Reports FS_ACCESS_PERM through fsnotify_file() when @perm_mask contains
 * MAY_READ and returns its result; returns 0 otherwise.  @ppos and @count
 * are not used by this implementation.
 */
static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
                                          const loff_t *ppos, size_t count)
{
        /*
         * filesystem may be modified in the context of permission events
         * (e.g. by HSM filling a file on access), so sb freeze protection
         * must not be held.
         */
        lockdep_assert_once(file_write_not_started(file));

        if (perm_mask & MAY_READ)
                return fsnotify_file(file, FS_ACCESS_PERM);

        return 0;
}

/*
 * fsnotify_file_perm - permission hook before file access
 *
 * Same as fsnotify_file_area_perm() called with a NULL position and a
 * count of 0.
 */
static inline int fsnotify_file_perm(struct file *file, int perm_mask)
{
        return fsnotify_file_area_perm(file, perm_mask, NULL, 0);
}

/*
 * fsnotify_open_perm - permission hook before file open
 *
 * When the file is opened with __FMODE_EXEC, FS_OPEN_EXEC_PERM is reported
 * first and a non-zero result is returned right away.  Otherwise (or when
 * that report returned 0) FS_OPEN_PERM is reported and its result returned.
 */
static inline int fsnotify_open_perm(struct file *file)
{
        if (file->f_flags & __FMODE_EXEC) {
                int exec_ret = fsnotify_file(file, FS_OPEN_EXEC_PERM);

                if (exec_ret)
                        return exec_ret;
        }

        return fsnotify_file(file, FS_OPEN_PERM);
}

#else
/* Without CONFIG_FANOTIFY_ACCESS_PERMISSIONS: no-op, always returns 0. */
static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
                                          const loff_t *ppos, size_t count)
{
        return 0;
}

/* Without CONFIG_FANOTIFY_ACCESS_PERMISSIONS: no-op, always returns 0. */
static inline int fsnotify_file_perm(struct file *file, int perm_mask)
{
        return 0;
}

/* Without CONFIG_FANOTIFY_ACCESS_PERMISSIONS: no-op, always returns 0. */
static inline int fsnotify_open_perm(struct file *file)
{
        return 0;
}
#endif

/*
 * fsnotify_link_count - inode's link count changed
 *
 * Reported as an FS_ATTRIB event on the inode via fsnotify_inode().
 */
static inline void fsnotify_link_count(struct inode *inode)
{
        fsnotify_inode(inode, FS_ATTRIB);
}

/*
 * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
 *
 * Emits, in this order: FS_RENAME on @old_dir (carrying the moved dentry),
 * FS_MOVED_FROM on @old_dir and FS_MOVED_TO on @new_dir (both carrying the
 * same freshly obtained cookie), an FS_ATTRIB link-count event on @target if
 * there is one, and FS_MOVE_SELF on the moved inode.  The three
 * directory-entry events get FS_ISDIR added when @isdir is non-zero.
 * Finally audit_inode_child() is called for the new location.
 */
static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
                                 const struct qstr *old_name,
                                 int isdir, struct inode *target,
                                 struct dentry *moved)
{
        struct inode *source = moved->d_inode;
        u32 fs_cookie = fsnotify_get_cookie();
        const struct qstr *new_name = &moved->d_name;
        const __u32 dir_flag = isdir ? FS_ISDIR : 0;

        /* Event with information about both old and new parent+name */
        fsnotify_name(FS_RENAME | dir_flag, moved, FSNOTIFY_EVENT_DENTRY,
                      old_dir, old_name, 0);

        /* The shared cookie ties the "from" and "to" halves together. */
        fsnotify_name(FS_MOVED_FROM | dir_flag, source, FSNOTIFY_EVENT_INODE,
                      old_dir, old_name, fs_cookie);
        fsnotify_name(FS_MOVED_TO | dir_flag, source, FSNOTIFY_EVENT_INODE,
                      new_dir, new_name, fs_cookie);

        if (target)
                fsnotify_link_count(target);

        fsnotify_inode(source, FS_MOVE_SELF);
        audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE);
}

/*
 * fsnotify_inode_delete - an inode is being evicted from cache, clean up is
 * needed
 *
 * Thin wrapper around __fsnotify_inode_delete().
 */
static inline void fsnotify_inode_delete(struct inode *inode)
{
        __fsnotify_inode_delete(inode);
}

/*
 * fsnotify_vfsmount_delete - a vfsmount is being destroyed, clean up is needed
 *
 * Thin wrapper around __fsnotify_vfsmount_delete().
 */
static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt)
{
        __fsnotify_vfsmount_delete(mnt);
}

/*
 * fsnotify_inoderemove - an inode is going away
 *
 * Sends FS_DELETE_SELF on @inode first, then performs the same clean up as
 * fsnotify_inode_delete().
 */
static inline void fsnotify_inoderemove(struct inode *inode)
{
        fsnotify_inode(inode, FS_DELETE_SELF);
        __fsnotify_inode_delete(inode);
}

/*
 * fsnotify_create - 'name' was linked in
 *
 * Records the new child of @dir with audit (AUDIT_TYPE_CHILD_CREATE) and then
 * sends an FS_CREATE directory entry event for @dentry.
 *
 * Caller must make sure that dentry->d_name is stable.
 * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate
 * ->d_inode later
 */
static inline void fsnotify_create(struct inode *dir, struct dentry *dentry)
{
        audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE);

        fsnotify_dirent(dir, dentry, FS_CREATE);
}

/*
 * fsnotify_link - new hardlink to 'inode' was created in directory 'dir'
 *
 * Caller must make sure that new_dentry->d_name is stable.
 * Note: We have to pass also the linked inode ptr as some filesystems leave
 *   new_dentry->d_inode NULL and instantiate inode pointer later
 */
static inline void fsnotify_link(struct inode *dir, struct inode *inode,
                                 struct dentry *new_dentry)
{
        /* The linked inode gained a link: report that as FS_ATTRIB on it */
        fsnotify_link_count(inode);
        audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE);

        /* FS_CREATE for the new name in @dir, using the explicitly passed inode */
        fsnotify_name(FS_CREATE, inode, FSNOTIFY_EVENT_INODE,
                      dir, &new_dentry->d_name, 0);
}

/*
 * fsnotify_delete - @dentry was unlinked and unhashed
 *
 * Sends FS_DELETE for the name @dentry->d_name in @dir; FS_ISDIR is added
 * when @inode is a directory.
 *
 * Caller must make sure that dentry->d_name is stable.
 *
 * Note: unlike fsnotify_unlink(), we have to pass also the unlinked inode
 * as this may be called after d_delete() and old_dentry may be negative.
 */
static inline void fsnotify_delete(struct inode *dir, struct inode *inode,
                                   struct dentry *dentry)
{
        const __u32 mask = S_ISDIR(inode->i_mode) ? FS_DELETE | FS_ISDIR :
                                                    FS_DELETE;

        fsnotify_name(mask, inode, FSNOTIFY_EVENT_INODE, dir, &dentry->d_name,
                      0);
}

/**
 * d_delete_notify - delete a dentry and call fsnotify_delete()
 * @dir: The directory inode passed on to fsnotify_delete()
 * @dentry: The dentry to delete
 *
 * This helper is used to guarantee that the unlinked inode cannot be found
 * by lookup of this name after fsnotify_delete() event has been delivered.
 */
static inline void d_delete_notify(struct inode *dir, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        /*
         * Hold our own inode reference across d_delete() so that the inode
         * can still be handed to fsnotify_delete() afterwards; it is dropped
         * again once the event has been sent.
         */
        ihold(inode);
        d_delete(dentry);
        fsnotify_delete(dir, inode, dentry);
        iput(inode);
}

/*
 * fsnotify_unlink - 'name' was unlinked
 *
 * Warns once and sends nothing if @dentry is negative; otherwise forwards
 * to fsnotify_delete() with the dentry's inode.
 *
 * Caller must make sure that dentry->d_name is stable.
 */
static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry)
{
        if (WARN_ON_ONCE(d_is_negative(dentry)))
                return;

        fsnotify_delete(dir, d_inode(dentry), dentry);
}

/*
 * fsnotify_mkdir - directory 'name' was created
 *
 * Same as fsnotify_create(), except that the event mask also carries
 * FS_ISDIR.
 *
 * Caller must make sure that dentry->d_name is stable.
 * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate
 * ->d_inode later
 */
static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry)
{
        audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE);

        fsnotify_dirent(dir, dentry, FS_CREATE | FS_ISDIR);
}

/*
 * fsnotify_rmdir - directory 'name' was removed
 *
 * Warns once and sends nothing if @dentry is negative; otherwise forwards
 * to fsnotify_delete() with the dentry's inode.
 *
 * Caller must make sure that dentry->d_name is stable.
 */
static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry)
{
        if (WARN_ON_ONCE(d_is_negative(dentry)))
                return;

        fsnotify_delete(dir, d_inode(dentry), dentry);
}

/*
 * fsnotify_access - file was read
 *
 * Sends an FS_ACCESS event for @file; the result of fsnotify_file() is
 * ignored.
 */
static inline void fsnotify_access(struct file *file)
{
        fsnotify_file(file, FS_ACCESS);
}

/*
 * fsnotify_modify - file was modified
 *
 * Sends an FS_MODIFY event for @file; the result of fsnotify_file() is
 * ignored.
 */
static inline void fsnotify_modify(struct file *file)
{
        fsnotify_file(file, FS_MODIFY);
}

/*
 * fsnotify_open - file was opened
 *
 * Sends FS_OPEN for @file, combined with FS_OPEN_EXEC when the file was
 * opened with __FMODE_EXEC.
 */
static inline void fsnotify_open(struct file *file)
{
        const __u32 mask = (file->f_flags & __FMODE_EXEC) ?
                           FS_OPEN | FS_OPEN_EXEC : FS_OPEN;

        fsnotify_file(file, mask);
}

/*
 * fsnotify_close - file was closed
 *
 * Sends FS_CLOSE_WRITE if @file was open for writing (FMODE_WRITE),
 * FS_CLOSE_NOWRITE otherwise.
 */
static inline void fsnotify_close(struct file *file)
{
        if (file->f_mode & FMODE_WRITE)
                fsnotify_file(file, FS_CLOSE_WRITE);
        else
                fsnotify_file(file, FS_CLOSE_NOWRITE);
}

/*
 * fsnotify_xattr - extended attributes were changed
 *
 * Reported to listeners as an FS_ATTRIB event on @dentry.
 */
static inline void fsnotify_xattr(struct dentry *dentry)
{
        fsnotify_dentry(dentry, FS_ATTRIB);
}

/*
 * fsnotify_change - notify_change event.  file was modified and/or metadata
 * was changed.
 *
 * Translates the ATTR_* bits in @ia_valid into an event mask:
 *   ATTR_UID, ATTR_GID, ATTR_MODE     -> FS_ATTRIB
 *   ATTR_SIZE                         -> FS_MODIFY
 *   ATTR_ATIME and ATTR_MTIME both    -> FS_ATTRIB
 *   ATTR_ATIME only                   -> FS_ACCESS
 *   ATTR_MTIME only                   -> FS_MODIFY
 * No event is sent when the resulting mask is empty.
 */
static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
{
        const unsigned int times = ia_valid & (ATTR_ATIME | ATTR_MTIME);
        __u32 mask = 0;

        if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE))
                mask |= FS_ATTRIB;
        if (ia_valid & ATTR_SIZE)
                mask |= FS_MODIFY;

        /* both times implies a utime(s) call */
        if (times == (ATTR_ATIME | ATTR_MTIME))
                mask |= FS_ATTRIB;
        else if (times == ATTR_ATIME)
                mask |= FS_ACCESS;
        else if (times == ATTR_MTIME)
                mask |= FS_MODIFY;

        if (!mask)
                return;

        fsnotify_dentry(dentry, mask);
}

/*
 * fsnotify_sb_error - report an error on a superblock
 *
 * Packs @error, @inode and @sb into a struct fs_error_report and sends it as
 * an FS_ERROR event (data type FSNOTIFY_EVENT_ERROR).  Returns the result of
 * fsnotify().
 */
static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode,
                                    int error)
{
        struct fs_error_report report = {
                .error = error,
                .inode = inode,
                .sb = sb,
        };

        return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR,
                        NULL, NULL, NULL, 0);
}

#endif        /* _LINUX_FS_NOTIFY_H */





























































    1 



    3 


    2 
    1 


































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * This header is used to share core functionality between the
 * standalone connection tracking module, and the compatibility layer's use
 * of connection tracking.
 *
 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
 *        - generalize L3 protocol dependent part.
 *
 * Derived from include/linux/netfilter_ipv4/ip_conntrack_core.h
 */

#ifndef _NF_CONNTRACK_CORE_H
#define _NF_CONNTRACK_CORE_H

#include <linux/netfilter.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_l4proto.h>

/* This header is used to share core functionality between the
   standalone connection tracking module, and the compatibility layer's use
   of connection tracking. */

unsigned int nf_conntrack_in(struct sk_buff *skb,
                             const struct nf_hook_state *state);

int nf_conntrack_init_net(struct net *net);
void nf_conntrack_cleanup_net(struct net *net);
void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list);

void nf_conntrack_proto_pernet_init(struct net *net);

int nf_conntrack_proto_init(void);
void nf_conntrack_proto_fini(void);

int nf_conntrack_init_start(void);
void nf_conntrack_cleanup_start(void);

void nf_conntrack_init_end(void);
void nf_conntrack_cleanup_end(void);

bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
                        const struct nf_conntrack_tuple *orig);

/* Find a connection corresponding to a tuple. */
struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net,
                      const struct nf_conntrack_zone *zone,
                      const struct nf_conntrack_tuple *tuple);

int __nf_conntrack_confirm(struct sk_buff *skb);

/*
 * Confirm a connection: returns NF_DROP if packet must be dropped.
 *
 * An skb without an attached conntrack is accepted as is.  An unconfirmed
 * conntrack is confirmed through __nf_conntrack_confirm(); any verdict other
 * than NF_ACCEPT is returned to the caller unchanged.  On acceptance, cached
 * events are delivered if the conntrack has an event cache.
 */
static inline int nf_conntrack_confirm(struct sk_buff *skb)
{
        struct nf_conn *ct = (struct nf_conn *)skb_nfct(skb);

        if (!ct)
                return NF_ACCEPT;

        if (!nf_ct_is_confirmed(ct)) {
                int verdict = __nf_conntrack_confirm(skb);

                if (verdict != NF_ACCEPT)
                        return verdict;

                /* Re-fetch: use whatever conntrack the skb carries now */
                ct = (struct nf_conn *)skb_nfct(skb);
        }

        if (nf_ct_ecache_exist(ct))
                nf_ct_deliver_cached_events(ct);

        return NF_ACCEPT;
}

unsigned int nf_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state);

void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
                 const struct nf_conntrack_l4proto *proto);

/* Number of entries in the nf_conntrack_locks[] array */
#define CONNTRACK_LOCKS 1024

extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
/* NOTE(review): presumably the required way to take one of the locks above - confirm */
void nf_conntrack_lock(spinlock_t *lock);

extern spinlock_t nf_conntrack_expect_lock;

/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */

/*
 * Set the timeout of @ct.  @timeout is capped at INT_MAX.
 *
 * For a confirmed conntrack the stored value is nfct_time_stamp + @timeout
 * and is published with WRITE_ONCE(); for an unconfirmed one the capped
 * value is stored as is with a plain assignment.
 */
static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout)
{
        const u32 capped = timeout > INT_MAX ? INT_MAX : (u32)timeout;

        if (!nf_ct_is_confirmed(ct)) {
                ct->timeout = capped;
                return;
        }

        WRITE_ONCE(ct->timeout, nfct_time_stamp + capped);
}

int __nf_ct_change_timeout(struct nf_conn *ct, u64 cta_timeout);
void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off);
int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status);

#endif /* _NF_CONNTRACK_CORE_H */




























    2 




    2 






































































    1 


    1 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Lock-less NULL terminated single linked list
 *
 * The basic atomic operation of this list is cmpxchg on long.  On
 * architectures that don't have NMI-safe cmpxchg implementation, the
 * list can NOT be used in NMI handlers.  So code that uses the list in
 * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
 *
 * Copyright 2010,2011 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 */
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/llist.h>


/**
 * llist_add_batch - add several linked entries in batch
 * @new_first:        first entry in batch to be added
 * @new_last:        last entry in batch to be added
 * @head:        the head for your lock-less list
 *
 * The batch is inserted at the front of the list: @new_first becomes the new
 * head and @new_last->next points at the previous head.
 *
 * Return whether list is empty before adding.
 */
bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
                     struct llist_head *head)
{
        struct llist_node *first = READ_ONCE(head->first);

        /*
         * Link the tail of the batch to the head we last observed, then try
         * to publish the batch.  The tail is re-linked on every retry;
         * try_cmpxchg() is expected to refresh @first with the current head
         * when it fails.
         */
        do {
                new_last->next = first;
        } while (!try_cmpxchg(&head->first, &first, new_first));

        /* @first is the head that was replaced; NULL means the list was empty */
        return !first;
}
EXPORT_SYMBOL_GPL(llist_add_batch);

/**
 * llist_del_first - delete the first entry of lock-less list
 * @head:        the head for your lock-less list
 *
 * If list is empty, return NULL, otherwise, return the first entry
 * deleted, this is the newest added one.
 *
 * Only one llist_del_first user can be used simultaneously with
 * multiple llist_add users without lock.  Because otherwise
 * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add,
 * llist_add) sequence in another user may change @head->first->next,
 * but keep @head->first.  If multiple consumers are needed, please
 * use llist_del_all or use lock between consumers.
 */
struct llist_node *llist_del_first(struct llist_head *head)
{
        struct llist_node *entry, *next;

        entry = smp_load_acquire(&head->first);
        /*
         * Try to replace the head with its successor; each retry re-checks
         * for an empty list and re-reads ->next of the entry then in @entry.
         */
        do {
                if (entry == NULL)
                        return NULL;
                next = READ_ONCE(entry->next);
        } while (!try_cmpxchg(&head->first, &entry, next));

        return entry;
}
EXPORT_SYMBOL_GPL(llist_del_first);

/**
 * llist_del_first_this - delete given entry of lock-less list if it is first
 * @head:        the head for your lock-less list
 * @this:        a list entry.
 *
 * If head of the list is given entry, delete and return %true else
 * return %false.
 *
 * Multiple callers can safely call this concurrently with multiple
 * llist_add() callers, providing all the callers offer a different @this.
 */
bool llist_del_first_this(struct llist_head *head,
                          struct llist_node *this)
{
        struct llist_node *entry, *next;

        /* acquire ensures ordering wrt try_cmpxchg() in llist_del_first() */
        entry = smp_load_acquire(&head->first);
        /* Give up as soon as the head is seen to be anything other than @this */
        do {
                if (entry != this)
                        return false;
                next = READ_ONCE(entry->next);
        } while (!try_cmpxchg(&head->first, &entry, next));

        return true;
}
EXPORT_SYMBOL_GPL(llist_del_first_this);

/**
 * llist_reverse_order - reverse order of a llist chain
 * @head:        first item of the list to be reversed
 *
 * Walks the chain once, re-pointing every ->next at the entry that preceded
 * it, and returns the new first entry (the old last one).  A NULL @head
 * yields NULL.
 */
struct llist_node *llist_reverse_order(struct llist_node *head)
{
        struct llist_node *reversed = NULL;
        struct llist_node *next;

        for (; head; head = next) {
                next = head->next;
                head->next = reversed;
                reversed = head;
        }

        return reversed;
}
EXPORT_SYMBOL_GPL(llist_reverse_order);

















































































































    1 



    1 



    1 

































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Sally Floyd's High Speed TCP (RFC 3649) congestion control
 *
 * See https://www.icir.org/floyd/hstcp.html
 *
 * John Heffner <jheffner@psc.edu>
 */

#include <linux/module.h>
#include <net/tcp.h>

/* From AIMD tables from RFC 3649 appendix B,
 * with fixed-point MD scaled <<8.
 *
 * Rows are ordered by increasing cwnd.  The row index is what struct hstcp
 * keeps in ->ai: the additive increase applied per ACK is index + 1, and the
 * multiplicative decrease applied on loss is (cwnd * md) >> 8.
 */
static const struct hstcp_aimd_val {
        unsigned int cwnd;        /* largest cwnd this row applies to */
        unsigned int md;        /* decrease factor, scaled by 256 */
} hstcp_aimd_vals[] = {
        {     38,  128, /*  0.50 */ },
        {    118,  112, /*  0.44 */ },
        {    221,  104, /*  0.41 */ },
        {    347,   98, /*  0.38 */ },
        {    495,   93, /*  0.37 */ },
        {    663,   89, /*  0.35 */ },
        {    851,   86, /*  0.34 */ },
        {   1058,   83, /*  0.33 */ },
        {   1284,   81, /*  0.32 */ },
        {   1529,   78, /*  0.31 */ },
        {   1793,   76, /*  0.30 */ },
        {   2076,   74, /*  0.29 */ },
        {   2378,   72, /*  0.28 */ },
        {   2699,   71, /*  0.28 */ },
        {   3039,   69, /*  0.27 */ },
        {   3399,   68, /*  0.27 */ },
        {   3778,   66, /*  0.26 */ },
        {   4177,   65, /*  0.26 */ },
        {   4596,   64, /*  0.25 */ },
        {   5036,   62, /*  0.25 */ },
        {   5497,   61, /*  0.24 */ },
        {   5979,   60, /*  0.24 */ },
        {   6483,   59, /*  0.23 */ },
        {   7009,   58, /*  0.23 */ },
        {   7558,   57, /*  0.22 */ },
        {   8130,   56, /*  0.22 */ },
        {   8726,   55, /*  0.22 */ },
        {   9346,   54, /*  0.21 */ },
        {   9991,   53, /*  0.21 */ },
        {  10661,   52, /*  0.21 */ },
        {  11358,   52, /*  0.20 */ },
        {  12082,   51, /*  0.20 */ },
        {  12834,   50, /*  0.20 */ },
        {  13614,   49, /*  0.19 */ },
        {  14424,   48, /*  0.19 */ },
        {  15265,   48, /*  0.19 */ },
        {  16137,   47, /*  0.19 */ },
        {  17042,   46, /*  0.18 */ },
        {  17981,   45, /*  0.18 */ },
        {  18955,   45, /*  0.18 */ },
        {  19965,   44, /*  0.17 */ },
        {  21013,   43, /*  0.17 */ },
        {  22101,   43, /*  0.17 */ },
        {  23230,   42, /*  0.17 */ },
        {  24402,   41, /*  0.16 */ },
        {  25618,   41, /*  0.16 */ },
        {  26881,   40, /*  0.16 */ },
        {  28193,   39, /*  0.16 */ },
        {  29557,   39, /*  0.15 */ },
        {  30975,   38, /*  0.15 */ },
        {  32450,   38, /*  0.15 */ },
        {  33986,   37, /*  0.15 */ },
        {  35586,   36, /*  0.14 */ },
        {  37253,   36, /*  0.14 */ },
        {  38992,   35, /*  0.14 */ },
        {  40808,   35, /*  0.14 */ },
        {  42707,   34, /*  0.13 */ },
        {  44694,   33, /*  0.13 */ },
        {  46776,   33, /*  0.13 */ },
        {  48961,   32, /*  0.13 */ },
        {  51258,   32, /*  0.13 */ },
        {  53677,   31, /*  0.12 */ },
        {  56230,   30, /*  0.12 */ },
        {  58932,   30, /*  0.12 */ },
        {  61799,   29, /*  0.12 */ },
        {  64851,   28, /*  0.11 */ },
        {  68113,   28, /*  0.11 */ },
        {  71617,   27, /*  0.11 */ },
        {  75401,   26, /*  0.10 */ },
        {  79517,   26, /*  0.10 */ },
        {  84035,   25, /*  0.10 */ },
        {  89053,   24, /*  0.10 */ },
};

/* Number of rows in hstcp_aimd_vals[] */
#define HSTCP_AIMD_MAX        ARRAY_SIZE(hstcp_aimd_vals)

/* Per-socket High Speed TCP state */
struct hstcp {
        u32        ai;        /* current index into hstcp_aimd_vals[] */
};

/* Start at the first AIMD table row and bound the congestion window clamp. */
static void hstcp_init(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct hstcp *ca = inet_csk_ca(sk);

        ca->ai = 0;

        /* Ensure the MD arithmetic works.  This is somewhat pedantic,
         * since I don't think we will see a cwnd this large. :)
         *
         * The largest md in the table is 128, so limiting cwnd to
         * 0xffffffff/128 keeps cwnd * md within 32 bits.
         */
        tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}

/*
 * Congestion avoidance hook.  Does nothing unless the connection is cwnd
 * limited.  In slow start it defers to tcp_slow_start(); otherwise it first
 * moves ca->ai to the table row matching the current cwnd and then applies
 * the additive increase of that row.
 */
static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct hstcp *ca = inet_csk_ca(sk);
        u32 cwnd;

        if (!tcp_is_cwnd_limited(sk))
                return;

        if (tcp_in_slow_start(tp)) {
                tcp_slow_start(tp, acked);
                return;
        }

        cwnd = tcp_snd_cwnd(tp);

        /* Update AIMD parameters.
         *
         * We want to guarantee that:
         *     hstcp_aimd_vals[ca->ai-1].cwnd <
         *     snd_cwnd <=
         *     hstcp_aimd_vals[ca->ai].cwnd
         */
        if (cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
                /* Walk up, stopping at the last row of the table */
                while (cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
                       ca->ai < HSTCP_AIMD_MAX - 1)
                        ca->ai++;
        } else {
                /* Walk down, stopping at the first row of the table */
                while (ca->ai && cwnd <= hstcp_aimd_vals[ca->ai - 1].cwnd)
                        ca->ai--;
        }

        /* Do additive increase, unless the clamp has been reached */
        if (cwnd >= tp->snd_cwnd_clamp)
                return;

        /* cwnd = cwnd + a(w) / cwnd */
        tp->snd_cwnd_cnt += ca->ai + 1;
        if (tp->snd_cwnd_cnt >= cwnd) {
                tp->snd_cwnd_cnt -= cwnd;
                tcp_snd_cwnd_set(tp, cwnd + 1);
        }
}

/*
 * Slow start threshold: the current cwnd reduced by the fraction md/256 of
 * the active table row, but never less than 2.
 */
static u32 hstcp_ssthresh(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct hstcp *ca = inet_csk_ca(sk);
        const u32 cwnd = tcp_snd_cwnd(tp);
        /* Do multiplicative decrease */
        const u32 decrease = (cwnd * hstcp_aimd_vals[ca->ai].md) >> 8;

        return max(cwnd - decrease, 2U);
}

/*
 * Congestion control operations registered under the name "highspeed".
 * Only .undo_cwnd is borrowed (tcp_reno_undo_cwnd); the other hooks are
 * implemented in this file.
 */
static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
        .init                = hstcp_init,
        .ssthresh        = hstcp_ssthresh,
        .undo_cwnd        = tcp_reno_undo_cwnd,
        .cong_avoid        = hstcp_cong_avoid,

        .owner                = THIS_MODULE,
        .name                = "highspeed"
};

/* Module init: register the algorithm and return the registration result. */
static int __init hstcp_register(void)
{
        /* Build fails if struct hstcp is larger than ICSK_CA_PRIV_SIZE */
        BUILD_BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE);
        return tcp_register_congestion_control(&tcp_highspeed);
}

/* Module exit: unregister the algorithm again. */
static void __exit hstcp_unregister(void)
{
        tcp_unregister_congestion_control(&tcp_highspeed);
}

module_init(hstcp_register);
module_exit(hstcp_unregister);

MODULE_AUTHOR("John Heffner");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("High Speed TCP");






















































































































    1 








    1 










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_TASK_H
#define _LINUX_SCHED_TASK_H

/*
 * Interface between the scheduler and various task lifetime (fork()/exit())
 * functionality:
 */

#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/uaccess.h>

struct task_struct;
struct rusage;
union thread_union;
struct css_set;

/* All the bits taken by the old clone syscall. */
#define CLONE_LEGACY_FLAGS 0xffffffffULL

/*
 * Arguments describing a task to be created.  Taken by kernel_clone(),
 * copy_process(), copy_thread() and sched_cgroup_fork() declared in this
 * header.
 *
 * NOTE(review): the per-field notes below are inferred from the field names
 * only - confirm against the users of this structure.
 */
struct kernel_clone_args {
        u64 flags;
        int __user *pidfd;
        int __user *child_tid;
        int __user *parent_tid;
        const char *name;
        int exit_signal;
        /* One-bit flags; presumably they select the kind of task to create */
        u32 kthread:1;
        u32 io_thread:1;
        u32 user_worker:1;
        u32 no_files:1;
        unsigned long stack;
        unsigned long stack_size;
        unsigned long tls;
        pid_t *set_tid;
        /* Number of elements in *set_tid */
        size_t set_tid_size;
        int cgroup;
        int idle;
        /* Presumably the function the new task runs, and its argument */
        int (*fn)(void *);
        void *fn_arg;
        struct cgroup *cgrp;
        struct css_set *cset;
};

/*
 * This serializes "schedule()" and also protects
 * the run-queue from deletions/modifications (but
 * _adding_ to the beginning of the run-queue has
 * a separate lock).
 */
extern rwlock_t tasklist_lock;
extern spinlock_t mmlist_lock;

extern union thread_union init_thread_union;
extern struct task_struct init_task;

extern int lockdep_tasklist_lock_is_held(void);

extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);

extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
extern void sched_post_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);

void __noreturn do_task_dead(void);
void __noreturn make_task_dead(int signr);

extern void mm_cache_init(void);
extern void proc_caches_init(void);

extern void fork_init(void);

extern void release_task(struct task_struct * p);

extern int copy_thread(struct task_struct *, const struct kernel_clone_args *);

extern void flush_thread(void);

/*
 * exit_thread() is provided elsewhere when CONFIG_HAVE_EXIT_THREAD is set;
 * otherwise it is an empty inline stub.
 */
#ifdef CONFIG_HAVE_EXIT_THREAD
extern void exit_thread(struct task_struct *tsk);
#else
static inline void exit_thread(struct task_struct *tsk)
{
}
#endif
extern __noreturn void do_group_exit(int);

extern void exit_files(struct task_struct *);
extern void exit_itimers(struct task_struct *);

extern pid_t kernel_clone(struct kernel_clone_args *kargs);
struct task_struct *copy_process(struct pid *pid, int trace, int node,
                                 struct kernel_clone_args *args);
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
struct task_struct *fork_idle(int);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
                            unsigned long flags);
extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
int kernel_wait(pid_t pid, int *stat);

extern void free_task(struct task_struct *tsk);

/*
 * sched_exec is called by processes performing an exec.
 *
 * Without CONFIG_SMP it is an empty inline function rather than a `{}`
 * macro, so that `sched_exec();` is a single ordinary statement and remains
 * valid in any statement context (e.g. as the body of an unbraced if/else).
 */
#ifdef CONFIG_SMP
extern void sched_exec(void);
#else
static inline void sched_exec(void)
{
}
#endif

/*
 * Take a reference on @t by incrementing t->usage.  Returns @t so that the
 * call can be used inline in an expression.
 */
static inline struct task_struct *get_task_struct(struct task_struct *t)
{
        refcount_inc(&t->usage);
        return t;
}

extern void __put_task_struct(struct task_struct *t);
extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);

/*
 * Drop a reference on @t.  When the last reference goes away the task is
 * released, either directly through __put_task_struct() or, under PREEMPT_RT
 * in a non-preemptible context, deferred through call_rcu().
 */
static inline void put_task_struct(struct task_struct *t)
{
        if (!refcount_dec_and_test(&t->usage))
                return;

        /*
         * In !RT, it is always safe to call __put_task_struct().
         * Under RT, we can only call it in preemptible context.
         */
        if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
                static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP);

                lock_map_acquire_try(&put_task_map);
                __put_task_struct(t);
                lock_map_release(&put_task_map);
                return;
        }

        /*
         * under PREEMPT_RT, we can't call put_task_struct
         * in atomic context because it will indirectly
         * acquire sleeping locks.
         *
         * call_rcu() will schedule __put_task_struct_rcu_cb()
         * to be called in process context.
         *
         * __put_task_struct() is called when
         * refcount_dec_and_test(&t->usage) succeeds.
         *
         * This means that it can't "conflict" with
         * put_task_struct_rcu_user() which abuses ->rcu the same
         * way; rcu_users has a reference so task->usage can't be
         * zero after rcu_users 1 -> 0 transition.
         *
         * delayed_free_task() also uses ->rcu, but it is only called
         * when it fails to fork a process. Therefore, there is no
         * way it can conflict with put_task_struct().
         */
        call_rcu(&t->rcu, __put_task_struct_rcu_cb);
}

DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T))

/*
 * Drop @nr references on @t at once and release the task if that brings
 * t->usage to zero.
 *
 * NOTE(review): unlike put_task_struct(), this calls __put_task_struct()
 * directly, without the PREEMPT_RT/preemptible() check - confirm that
 * callers only use it where that is safe.
 */
static inline void put_task_struct_many(struct task_struct *t, int nr)
{
        if (refcount_sub_and_test(nr, &t->usage))
                __put_task_struct(t);
}

void put_task_struct_rcu_user(struct task_struct *task);

/* Free all architecture-specific resources held by a thread. */
void release_thread(struct task_struct *dead_task);

#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
extern int arch_task_struct_size __read_mostly;
#else
# define arch_task_struct_size (sizeof(struct task_struct))
#endif

#ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST
/*
 * If an architecture has not declared a thread_struct whitelist we
 * must assume something there may need to be copied to userspace.
 *
 * @offset: set to 0
 * @size:   set to the number of bytes from the start of the ->thread member
 *          to the end of the task structure (arch_task_struct_size)
 */
static inline void arch_thread_struct_whitelist(unsigned long *offset,
                                                unsigned long *size)
{
        *offset = 0;
        /* Handle dynamically sized thread_struct. */
        *size = arch_task_struct_size - offsetof(struct task_struct, thread);
}
#endif

/*
 * task_stack_vm_area - return the vm_struct recorded for a task's stack
 *
 * With CONFIG_VMAP_STACK this is t->stack_vm_area; without it there is no
 * such area and NULL is returned.
 */
#ifdef CONFIG_VMAP_STACK
static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
{
        return t->stack_vm_area;
}
#else
static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
{
        return NULL;
}
#endif

/*
 * task_lock - take @p->alloc_lock (a spinlock)
 *
 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
 * subscriptions and synchronises with wait4().  Also used in procfs.  Also
 * pins the final release of task.io_context.  Also protects ->cpuset and
 * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
 *
 * Nests both inside and outside of read_lock(&tasklist_lock).
 * It must not be nested with write_lock_irq(&tasklist_lock),
 * neither inside nor outside.
 */
static inline void task_lock(struct task_struct *p)
{
        spin_lock(&p->alloc_lock);
}

/* task_unlock - release @p->alloc_lock taken by task_lock() */
static inline void task_unlock(struct task_struct *p)
{
        spin_unlock(&p->alloc_lock);
}

DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T))

#endif /* _LINUX_SCHED_TASK_H */




















































    7 




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2005-2010 IBM Corporation
 *
 * Authors:
 * Mimi Zohar <zohar@us.ibm.com>
 * Kylene Hall <kjhall@us.ibm.com>
 *
 * File: evm.h
 */

#ifndef __INTEGRITY_EVM_H
#define __INTEGRITY_EVM_H

#include <linux/xattr.h>
#include <linux/security.h>

#include "../integrity.h"

#define EVM_INIT_HMAC        0x0001
#define EVM_INIT_X509        0x0002
#define EVM_ALLOW_METADATA_WRITES        0x0004
#define EVM_SETUP_COMPLETE 0x80000000 /* userland has signaled key load */

#define EVM_KEY_MASK (EVM_INIT_HMAC | EVM_INIT_X509)
#define EVM_INIT_MASK (EVM_INIT_HMAC | EVM_INIT_X509 | EVM_SETUP_COMPLETE | \
                       EVM_ALLOW_METADATA_WRITES)

/*
 * One named xattr, linkable into a list_head based list.
 * NOTE(review): presumably the element type of evm_config_xattrnames (the
 * list of EVM protected security xattrs declared in this header) -- confirm.
 */
struct xattr_list {
        struct list_head list;  /* linkage into the owning list */
        char *name;             /* xattr name string */
        bool enabled;           /* NOTE(review): presumably whether this entry is currently in effect -- confirm */
};

#define EVM_NEW_FILE                        0x00000001
#define EVM_IMMUTABLE_DIGSIG                0x00000002

/* EVM integrity metadata associated with an inode */
struct evm_iint_cache {
        /* NOTE(review): presumably a mask of EVM_NEW_FILE / EVM_IMMUTABLE_DIGSIG -- confirm */
        unsigned long flags;
        /* enum integrity_status value, stored in a 4-bit bitfield */
        enum integrity_status evm_status:4;
        /* NOTE(review): presumably a snapshot of the inode attributes of the metadata inode -- confirm */
        struct integrity_inode_attributes metadata_inode;
};

extern struct lsm_blob_sizes evm_blob_sizes;

/*
 * evm_iint_inode - locate EVM's slice of an inode's security blob
 * @inode: inode to look at
 *
 * Returns the address at offset evm_blob_sizes.lbs_inode inside
 * @inode->i_security, or NULL when the inode carries no security blob.
 */
static inline struct evm_iint_cache *evm_iint_inode(const struct inode *inode)
{
        void *blob = inode->i_security;

        if (unlikely(!blob))
                return NULL;

        return blob + evm_blob_sizes.lbs_inode;
}

extern int evm_initialized;

#define EVM_ATTR_FSUUID                0x0001

extern int evm_hmac_attrs;

/* List of EVM protected security xattrs */
extern struct list_head evm_config_xattrnames;

/*
 * Digest container: an ima_digest_data_hdr immediately followed by a buffer
 * of IMA_MAX_DIGEST_SIZE bytes for the digest value.  Declared __packed, so
 * no padding is inserted between the header and the digest bytes.
 */
struct evm_digest {
        struct ima_digest_data_hdr hdr;
        char digest[IMA_MAX_DIGEST_SIZE];
} __packed;

int evm_protected_xattr(const char *req_xattr_name);

int evm_init_key(void);
int evm_update_evmxattr(struct dentry *dentry,
                        const char *req_xattr_name,
                        const char *req_xattr_value,
                        size_t req_xattr_value_len);
int evm_calc_hmac(struct dentry *dentry, const char *req_xattr_name,
                  const char *req_xattr_value,
                  size_t req_xattr_value_len, struct evm_digest *data,
                  struct evm_iint_cache *iint);
int evm_calc_hash(struct dentry *dentry, const char *req_xattr_name,
                  const char *req_xattr_value,
                  size_t req_xattr_value_len, char type,
                  struct evm_digest *data, struct evm_iint_cache *iint);
int evm_init_hmac(struct inode *inode, const struct xattr *xattrs,
                  char *hmac_val);
int evm_init_secfs(void);

#endif










































    1 












    1 



    1 




    1 




    1 







    1 






































































































    1 













    1 




    1 




    1 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 Nokia, Inc.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 *
 * This abstraction carries sctp events to the ULP (sockets).
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    Jon Grimm             <jgrimm@us.ibm.com>
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Sridhar Samudrala     <sri@us.ibm.com>
 */

#include <linux/slab.h>
#include <linux/types.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <net/sctp/structs.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

/* Forward declarations for internal helpers.  */
static struct sctp_ulpevent *sctp_ulpq_reasm(struct sctp_ulpq *ulpq,
                                              struct sctp_ulpevent *);
static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *,
                                              struct sctp_ulpevent *);
static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq);

/* 1st Level Abstractions */

/* Prepare a caller-provided ULP queue for use: wipe it, set up its three
 * (initially empty) skb queues and bind it to @asoc.  The queue starts
 * outside partial delivery mode.
 */
void sctp_ulpq_init(struct sctp_ulpq *ulpq, struct sctp_association *asoc)
{
        memset(ulpq, 0, sizeof(*ulpq));

        skb_queue_head_init(&ulpq->lobby);
        skb_queue_head_init(&ulpq->reasm_uo);
        skb_queue_head_init(&ulpq->reasm);

        ulpq->pd_mode = 0;
        ulpq->asoc = asoc;
}


/* Empty the lobby (ordering) queue and both reassembly queues, freeing
 * the ulpevent attached to every skb found on them.
 */
void sctp_ulpq_flush(struct sctp_ulpq *ulpq)
{
        struct sk_buff *skb;

        while ((skb = __skb_dequeue(&ulpq->lobby)))
                sctp_ulpevent_free(sctp_skb2event(skb));

        while ((skb = __skb_dequeue(&ulpq->reasm)))
                sctp_ulpevent_free(sctp_skb2event(skb));

        while ((skb = __skb_dequeue(&ulpq->reasm_uo)))
                sctp_ulpevent_free(sctp_skb2event(skb));
}

/* Dispose of a ulpqueue.
 *
 * This only flushes the events still queued on @ulpq; the sctp_ulpq
 * structure itself is not freed here.
 */
void sctp_ulpq_free(struct sctp_ulpq *ulpq)
{
        sctp_ulpq_flush(ulpq);
}

/* Process an incoming DATA chunk.
 *
 * The chunk is wrapped in a receive event, run through reassembly and,
 * once it forms a complete message (MSG_EOR), through ordering.  Whatever
 * comes out is handed to sctp_ulpq_tail_event().
 *
 * Returns -ENOMEM if no event could be created, 1 if an event flagged
 * MSG_EOR was passed on to the ULP, and 0 otherwise.
 */
int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
                        gfp_t gfp)
{
        struct sctp_ulpevent *event;
        struct sk_buff_head temp;
        int eor;

        event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp);
        if (!event)
                return -ENOMEM;

        event->ssn = ntohs(chunk->subh.data_hdr->ssn);
        event->ppid = chunk->subh.data_hdr->ppid;

        /* Reassembly may keep the fragment and hand nothing back. */
        event = sctp_ulpq_reasm(ulpq, event);
        if (!event)
                return 0;

        /* Temporary list on which the deliverable chunks are collected. */
        skb_queue_head_init(&temp);
        __skb_queue_tail(&temp, sctp_event2skb(event));

        /* Only complete messages are subject to ordering. */
        if (event->msg_flags & MSG_EOR) {
                event = sctp_ulpq_order(ulpq, event);
                if (!event)
                        return 0;
        }

        /* 'event' is the sctp_ulpevent for the very first SKB on 'temp'. */
        eor = (event->msg_flags & MSG_EOR) ? 1 : 0;
        sctp_ulpq_tail_event(ulpq, &temp);

        return eor;
}

/* Add a new event for propagation to the ULP: see sctp_ulpq_tail_event() below.  */
/* Clear the partial delivery mode for this socket.   Note: This
 * assumes that no association is currently in partial delivery mode.
 *
 * One PD reference is dropped from the socket.  If it was the last one the
 * whole PD lobby is moved to the receive queue; otherwise only the lobby
 * events that belong to @asoc (which may be NULL) are moved.
 *
 * Returns 1 when the entire lobby was spliced onto sk_receive_queue,
 * 0 in every other case.
 */
int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc)
{
        struct sctp_sock *sp = sctp_sk(sk);
        struct sk_buff *skb, *tmp;

        if (atomic_dec_and_test(&sp->pd_mode)) {
                /* No other association is in PD, so the lobby can be
                 * cleared out in one shot.
                 */
                if (skb_queue_empty(&sp->pd_lobby))
                        return 0;

                skb_queue_splice_tail_init(&sp->pd_lobby,
                                           &sk->sk_receive_queue);
                return 1;
        }

        /* Other associations are still in PD: only pull out of the lobby
         * what belongs to the association that is exiting PD (all of its
         * notifications are posted here).
         */
        if (skb_queue_empty(&sp->pd_lobby) || !asoc)
                return 0;

        sctp_skb_for_each(skb, &sp->pd_lobby, tmp) {
                if (sctp_skb2event(skb)->asoc != asoc)
                        continue;

                __skb_unlink(skb, &sp->pd_lobby);
                __skb_queue_tail(&sk->sk_receive_queue, skb);
        }

        return 0;
}

/* Set the pd_mode on the socket and ulpq.
 *
 * The socket-level pd_mode is an atomic counter and is incremented; the
 * per-ulpq pd_mode is a plain flag and is set to 1.
 */
static void sctp_ulpq_set_pd(struct sctp_ulpq *ulpq)
{
        struct sctp_sock *sp = sctp_sk(ulpq->asoc->base.sk);

        atomic_inc(&sp->pd_mode);
        ulpq->pd_mode = 1;
}

/* Clear the pd_mode and restart any pending messages waiting for delivery.
 *
 * The ulpq flag is cleared first so that draining the reassembly queue runs
 * in non-PD mode; only afterwards is the socket-level PD state released.
 * Returns the result of sctp_clear_pd().
 */
static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq)
{
        ulpq->pd_mode = 0;
        sctp_ulpq_reasm_drain(ulpq);
        return sctp_clear_pd(ulpq->asoc->base.sk, ulpq->asoc);
}

/* Add a new event for propagation to the ULP.
 *
 * The event examined is the one attached to the first skb on @skb_list.
 * Returns 1 once @skb_list has been spliced onto either the socket receive
 * queue or the partial-delivery lobby, and 0 when the list was purged
 * instead (socket shut down for receive, or the user has not enabled this
 * event).  Either way @skb_list is consumed.
 */
int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list)
{
        struct sock *sk = ulpq->asoc->base.sk;
        struct sctp_sock *sp = sctp_sk(sk);
        struct sctp_ulpevent *event;
        struct sk_buff_head *queue;
        struct sk_buff *skb;
        int clear_pd = 0;

        skb = __skb_peek(skb_list);
        event = sctp_skb2event(skb);

        /* If the socket is just going to throw this away, do not
         * even try to deliver it.
         */
        if (sk->sk_shutdown & RCV_SHUTDOWN &&
            (sk->sk_shutdown & SEND_SHUTDOWN ||
             !sctp_ulpevent_is_notification(event)))
                goto out_free;

        /* Only data events (not notifications) update the NAPI id and
         * incoming-CPU bookkeeping of the socket.
         */
        if (!sctp_ulpevent_is_notification(event)) {
                sk_mark_napi_id(sk, skb);
                sk_incoming_cpu_update(sk);
        }
        /* Check if the user wishes to receive this event.  */
        if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe))
                goto out_free;

        /* If we are in partial delivery mode, post to the lobby until
         * partial delivery is cleared, unless, of course _this_ is
         * the association the cause of the partial delivery.
         */

        if (atomic_read(&sp->pd_mode) == 0) {
                queue = &sk->sk_receive_queue;
        } else {
                if (ulpq->pd_mode) {
                        /* If the association is in partial delivery, we
                         * need to finish delivering the partially processed
                         * packet before passing any other data.  This is
                         * because we don't truly support stream interleaving.
                         */
                        if ((event->msg_flags & MSG_NOTIFICATION) ||
                            (SCTP_DATA_NOT_FRAG ==
                                    (event->msg_flags & SCTP_DATA_FRAG_MASK)))
                                queue = &sp->pd_lobby;
                        else {
                                /* A fragment carrying MSG_EOR completes the
                                 * partially delivered message.
                                 */
                                clear_pd = event->msg_flags & MSG_EOR;
                                queue = &sk->sk_receive_queue;
                        }
                } else {
                        /*
                         * If fragment interleave is enabled, we
                         * can queue this to the receive queue instead
                         * of the lobby.
                         */
                        if (sp->frag_interleave)
                                queue = &sk->sk_receive_queue;
                        else
                                queue = &sp->pd_lobby;
                }
        }

        skb_queue_splice_tail_init(skb_list, queue);

        /* Did we just complete partial delivery and need to get
         * rolling again?  Move pending data to the receive
         * queue.
         */
        if (clear_pd)
                sctp_ulpq_clear_pd(ulpq);

        /* Wake the reader only for receive-queue deliveries, and only if a
         * wakeup has not already been signalled.  The flag is left unset
         * while the socket is owned by the user.
         */
        if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) {
                if (!sock_owned_by_user(sk))
                        sp->data_ready_signalled = 1;
                sk->sk_data_ready(sk);
        }
        return 1;

out_free:
        sctp_queue_purge_ulpevents(skb_list);

        return 0;
}

/* 2nd Level Abstractions */

/* Helper function to store chunks that need to be reassembled.  */
static void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq,
                                         struct sctp_ulpevent *event)
{
        struct sk_buff *pos;
        struct sctp_ulpevent *cevent;
        __u32 tsn, ctsn;

        tsn = event->tsn;

        /* See if it belongs at the end. */
        pos = skb_peek_tail(&ulpq->reasm);
        if (!pos) {
                __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
                return;
        }

        /* Short circuit just dropping it at the end. */
        cevent = sctp_skb2event(pos);
        ctsn = cevent->tsn;
        if (TSN_lt(ctsn, tsn)) {
                __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
                return;
        }

        /* Find the right place in this list. We store them by TSN.  */
        skb_queue_walk(&ulpq->reasm, pos) {
                cevent = sctp_skb2event(pos);
                ctsn = cevent->tsn;

                if (TSN_lt(tsn, ctsn))
                        break;
        }

        /* Insert before pos. */
        __skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event));

}

/* Helper function to return an event corresponding to the reassembled
 * datagram.
 * This routine creates a re-assembled skb given the first and last skb's
 * as stored in the reassembly queue. The skb's may be non-linear if the sctp
 * payload was fragmented on the way and ip had to reassemble them.
 * We add the rest of skb's to the first skb's fraglist.
 *
 * Every skb from @f_frag through @l_frag is unlinked from @queue.  Returns
 * the event of the (possibly copied) first skb, or NULL when a cloned first
 * skb could not be copied; in that case @queue is left unmodified.
 */
struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net,
                                                  struct sk_buff_head *queue,
                                                  struct sk_buff *f_frag,
                                                  struct sk_buff *l_frag)
{
        struct sk_buff *pos;
        struct sk_buff *new = NULL;
        struct sctp_ulpevent *event;
        struct sk_buff *pnext, *last;
        struct sk_buff *list = skb_shinfo(f_frag)->frag_list;

        /* Store the pointer to the 2nd skb */
        if (f_frag == l_frag)
                pos = NULL;
        else
                pos = f_frag->next;

        /* Get the last skb in the f_frag's frag_list if present. */
        for (last = list; list; last = list, list = list->next)
                ;

        /* Add the list of remaining fragments to the first fragments
         * frag_list.
         */
        if (last)
                last->next = pos;
        else {
                if (skb_cloned(f_frag)) {
                        /* This is a cloned skb, we can't just modify
                         * the frag_list.  We need a new skb to do that.
                         * Instead of calling skb_unshare(), we'll do it
                         * ourselves since we need to delay the free.
                         */
                        new = skb_copy(f_frag, GFP_ATOMIC);
                        if (!new)
                                return NULL;        /* try again later */

                        sctp_skb_set_owner_r(new, f_frag->sk);

                        skb_shinfo(new)->frag_list = pos;
                } else
                        skb_shinfo(f_frag)->frag_list = pos;
        }

        /* Remove the first fragment from the reassembly queue.  */
        __skb_unlink(f_frag, queue);

        /* if we did unshare, then free the old skb and re-assign */
        if (new) {
                kfree_skb(f_frag);
                f_frag = new;
        }

        while (pos) {

                /* Remember the successor before pos leaves the queue. */
                pnext = pos->next;

                /* Update the len and data_len fields of the first fragment. */
                f_frag->len += pos->len;
                f_frag->data_len += pos->len;

                /* Remove the fragment from the reassembly queue.  */
                __skb_unlink(pos, queue);

                /* Break if we have reached the last fragment.  */
                if (pos == l_frag)
                        break;
                /* Re-chain pos to its successor so the frag_list stays
                 * linked (presumably __skb_unlink() reset pos->next).
                 */
                pos->next = pnext;
                pos = pnext;
        }

        event = sctp_skb2event(f_frag);
        SCTP_INC_STATS(net, SCTP_MIB_REASMUSRMSGS);

        return event;
}


/* Helper function to check if an incoming chunk has filled up the last
 * missing fragment in a SCTP datagram and return the corresponding event.
 */
static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ulpq)
{
        struct sk_buff *pos;
        struct sctp_ulpevent *cevent;
        struct sk_buff *first_frag = NULL;
        __u32 ctsn, next_tsn;
        struct sctp_ulpevent *retval = NULL;
        struct sk_buff *pd_first = NULL;
        struct sk_buff *pd_last = NULL;
        size_t pd_len = 0;
        struct sctp_association *asoc;
        u32 pd_point;

        /* Initialized to 0 just to avoid compiler warning message.  Will
         * never be used with this value. It is referenced only after it
         * is set when we find the first fragment of a message.
         */
        next_tsn = 0;

        /* The chunks are held in the reasm queue sorted by TSN.
         * Walk through the queue sequentially and look for a sequence of
         * fragmented chunks that complete a datagram.
         * 'first_frag' and next_tsn are reset when we find a chunk which
         * is the first fragment of a datagram. Once these 2 fields are set
         * we expect to find the remaining middle fragments and the last
         * fragment in order. If not, first_frag is reset to NULL and we
         * start the next pass when we find another first fragment.
         *
         * There is a potential to do partial delivery if user sets
         * SCTP_PARTIAL_DELIVERY_POINT option. Lets count some things here
         * to see if can do PD.
         */
        skb_queue_walk(&ulpq->reasm, pos) {
                cevent = sctp_skb2event(pos);
                ctsn = cevent->tsn;

                switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
                case SCTP_DATA_FIRST_FRAG:
                        /* If this "FIRST_FRAG" is the first
                         * element in the queue, then count it towards
                         * possible PD.
                         */
                        if (skb_queue_is_first(&ulpq->reasm, pos)) {
                            pd_first = pos;
                            pd_last = pos;
                            pd_len = pos->len;
                        } else {
                            pd_first = NULL;
                            pd_last = NULL;
                            pd_len = 0;
                        }

                        first_frag = pos;
                        next_tsn = ctsn + 1;
                        break;

                case SCTP_DATA_MIDDLE_FRAG:
                        if ((first_frag) && (ctsn == next_tsn)) {
                                next_tsn++;
                                if (pd_first) {
                                    pd_last = pos;
                                    pd_len += pos->len;
                                }
                        } else
                                first_frag = NULL;
                        break;

                case SCTP_DATA_LAST_FRAG:
                        if (first_frag && (ctsn == next_tsn))
                                goto found;
                        else
                                first_frag = NULL;
                        break;
                }
        }

        asoc = ulpq->asoc;
        if (pd_first) {
                /* Make sure we can enter partial deliver.
                 * We can trigger partial delivery only if framgent
                 * interleave is set, or the socket is not already
                 * in  partial delivery.
                 */
                if (!sctp_sk(asoc->base.sk)->frag_interleave &&
                    atomic_read(&sctp_sk(asoc->base.sk)->pd_mode))
                        goto done;

                cevent = sctp_skb2event(pd_first);
                pd_point = sctp_sk(asoc->base.sk)->pd_point;
                if (pd_point && pd_point <= pd_len) {
                        retval = sctp_make_reassembled_event(asoc->base.net,
                                                             &ulpq->reasm,
                                                             pd_first, pd_last);
                        if (retval)
                                sctp_ulpq_set_pd(ulpq);
                }
        }
done:
        return retval;
found:
        retval = sctp_make_reassembled_event(ulpq->asoc->base.net,
                                             &ulpq->reasm, first_frag, pos);
        if (retval)
                retval->msg_flags |= MSG_EOR;
        goto done;
}

/* Retrieve the next set of fragments of a partial message.
 *
 * Collects the run of MIDDLE/LAST fragments with consecutive TSNs that
 * starts at the head of the reassembly queue and returns it as one event.
 * MSG_EOR is set on the result when the run ends with a LAST fragment.
 * Returns NULL if the queue is empty, if its head is a FIRST fragment or an
 * unfragmented chunk, or if the reassembled event could not be built.
 */
static struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq *ulpq)
{
        struct sk_buff *pos, *last_frag, *first_frag;
        struct sctp_ulpevent *cevent;
        __u32 ctsn, next_tsn;
        int is_last;
        struct sctp_ulpevent *retval;

        /* The chunks are held in the reasm queue sorted by TSN.
         * Walk through the queue sequentially and look for the first
         * sequence of fragmented chunks.
         */

        if (skb_queue_empty(&ulpq->reasm))
                return NULL;

        last_frag = first_frag = NULL;
        retval = NULL;
        next_tsn = 0;
        is_last = 0;

        skb_queue_walk(&ulpq->reasm, pos) {
                cevent = sctp_skb2event(pos);
                ctsn = cevent->tsn;

                switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
                case SCTP_DATA_FIRST_FRAG:
                        /* A new message starts here: nothing to continue
                         * if no run was begun, otherwise the run ends.
                         */
                        if (!first_frag)
                                return NULL;
                        goto done;
                case SCTP_DATA_MIDDLE_FRAG:
                        if (!first_frag) {
                                /* Start the run at the queue head. */
                                first_frag = pos;
                                next_tsn = ctsn + 1;
                                last_frag = pos;
                        } else if (next_tsn == ctsn) {
                                next_tsn++;
                                last_frag = pos;
                        } else
                                /* TSN gap: deliver what we have so far. */
                                goto done;
                        break;
                case SCTP_DATA_LAST_FRAG:
                        if (!first_frag)
                                first_frag = pos;
                        else if (ctsn != next_tsn)
                                /* TSN gap: this LAST fragment is not
                                 * included in the run.
                                 */
                                goto done;
                        last_frag = pos;
                        is_last = 1;
                        goto done;
                default:
                        return NULL;
                }
        }

        /* We have the reassembled event. There is no need to look
         * further.
         */
done:
        retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm,
                                             first_frag, last_frag);
        if (retval && is_last)
                retval->msg_flags |= MSG_EOR;

        return retval;
}


/* Reassembly entry point for a freshly received data event.
 *
 * An unfragmented chunk is a complete message by itself: it is flagged
 * MSG_EOR and returned straight away.  A fragment is stored on the reasm
 * queue, after which we try to pull a deliverable event out of it.
 * Returns NULL when nothing can be delivered yet.
 */
static struct sctp_ulpevent *sctp_ulpq_reasm(struct sctp_ulpq *ulpq,
                                                struct sctp_ulpevent *event)
{
        __u32 tsn, cum_tsn;

        if ((event->msg_flags & SCTP_DATA_FRAG_MASK) == SCTP_DATA_NOT_FRAG) {
                event->msg_flags |= MSG_EOR;
                return event;
        }

        sctp_ulpq_store_reasm(ulpq, event);

        if (!ulpq->pd_mode)
                return sctp_ulpq_retrieve_reassembled(ulpq);

        /* In partial delivery, do not even bother unless this is the
         * next tsn to be delivered.
         */
        tsn = event->tsn;
        cum_tsn = sctp_tsnmap_get_ctsn(&ulpq->asoc->peer.tsn_map);
        if (!TSN_lte(tsn, cum_tsn))
                return NULL;

        return sctp_ulpq_retrieve_partial(ulpq);
}

/* Retrieve the first part (sequential fragments) for partial delivery.
 *
 * Collects the FIRST fragment at the head of the reassembly queue together
 * with the MIDDLE fragments that follow it with consecutive TSNs, and
 * returns them as one event.  A LAST fragment is never included.  Returns
 * NULL if the queue is empty, if its head is not a FIRST fragment, or if
 * the reassembled event could not be built.
 */
static struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *ulpq)
{
        struct sk_buff *pos, *last_frag, *first_frag;
        struct sctp_ulpevent *cevent;
        __u32 ctsn, next_tsn;
        struct sctp_ulpevent *retval;

        /* The chunks are held in the reasm queue sorted by TSN.
         * Walk through the queue sequentially and look for a sequence of
         * fragmented chunks that start a datagram.
         */

        if (skb_queue_empty(&ulpq->reasm))
                return NULL;

        last_frag = first_frag = NULL;
        retval = NULL;
        next_tsn = 0;

        skb_queue_walk(&ulpq->reasm, pos) {
                cevent = sctp_skb2event(pos);
                ctsn = cevent->tsn;

                switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
                case SCTP_DATA_FIRST_FRAG:
                        if (!first_frag) {
                                first_frag = pos;
                                next_tsn = ctsn + 1;
                                last_frag = pos;
                        } else
                                /* A second message starts: stop here. */
                                goto done;
                        break;

                case SCTP_DATA_MIDDLE_FRAG:
                        if (!first_frag)
                                return NULL;
                        if (ctsn == next_tsn) {
                                next_tsn++;
                                last_frag = pos;
                        } else
                                /* TSN gap: deliver what we have so far. */
                                goto done;
                        break;

                case SCTP_DATA_LAST_FRAG:
                        if (!first_frag)
                                return NULL;
                        else
                                goto done;
                        /* Not reached: both branches above leave the loop. */
                        break;

                default:
                        return NULL;
                }
        }

        /* We have the reassembled event. There is no need to look
         * further.
         */
done:
        retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm,
                                             first_frag, last_frag);
        return retval;
}

/*
 * Flush out stale fragments from the reassembly queue when processing
 * a Forward TSN.
 *
 * RFC 3758, Section 3.6
 *
 * After receiving and processing a FORWARD TSN, the data receiver MUST
 * take cautions in updating its re-assembly queue.  The receiver MUST
 * remove any partially reassembled message, which is still missing one
 * or more TSNs earlier than or equal to the new cumulative TSN point.
 * In the event that the receiver has invoked the partial delivery API,
 * a notification SHOULD also be generated to inform the upper layer API
 * that the message being partially delivered will NOT be completed.
 */
void sctp_ulpq_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 fwd_tsn)
{
        struct sk_buff *skb, *next;
        struct sctp_ulpevent *frag;
        __u32 frag_tsn;

        /* Nothing queued for reassembly means nothing to flush. */
        if (skb_queue_empty(&ulpq->reasm))
                return;

        skb_queue_walk_safe(&ulpq->reasm, skb, next) {
                frag = sctp_skb2event(skb);
                frag_tsn = frag->tsn;

                /* The walk ends at the first fragment whose TSN lies
                 * beyond fwd_tsn; later entries are not examined.
                 */
                if (!TSN_lte(frag_tsn, fwd_tsn))
                        break;

                /* Since the entire message must be abandoned by the
                 * sender (item A3 in Section 3.5, RFC 3758), every
                 * fragment on the list with a TSN less than or equal
                 * to fwd_tsn can be unlinked and freed.
                 */
                __skb_unlink(skb, &ulpq->reasm);
                sctp_ulpevent_free(frag);
        }
}

/*
 * Drain the reassembly queue.  If we just cleared partial delivery, it
 * is possible that the reassembly queue will contain already reassembled
 * messages.  Retrieve any such messages and give them to the user.
 *
 * Each message returned by sctp_ulpq_retrieve_reassembled() is placed on
 * a private one-entry list, run through sctp_ulpq_order() when it carries
 * MSG_EOR, and handed to sctp_ulpq_tail_event() unless ordering returned
 * NULL for it.
 */
static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq)
{
        struct sctp_ulpevent *event;

        if (skb_queue_empty(&ulpq->reasm))
                return;

        for (;;) {
                struct sk_buff_head temp;

                event = sctp_ulpq_retrieve_reassembled(ulpq);
                if (event == NULL)
                        break;

                skb_queue_head_init(&temp);
                __skb_queue_tail(&temp, sctp_event2skb(event));

                /* Only complete messages go through ordering. */
                if (event->msg_flags & MSG_EOR)
                        event = sctp_ulpq_order(ulpq, event);

                /* Ordering returned no event to deliver right now. */
                if (!event)
                        continue;

                /* Send the event to the ULP.  'event' is the
                 * sctp_ulpevent for the very first SKB on the 'temp' list.
                 */
                sctp_ulpq_tail_event(ulpq, &temp);
        }
}


/* Helper function to gather skbs that have possibly become
 * ordered by an incoming chunk.
 *
 * Walks ulpq->lobby and, for entries of the event's stream, moves every
 * skb whose SSN equals the stream's next expected inbound SSN onto the
 * list that the event's skb is linked on, advancing the expected SSN for
 * each one moved.
 */
static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
                                              struct sctp_ulpevent *event)
{
        struct sctp_stream *stream = &ulpq->asoc->stream;
        struct sk_buff_head *event_list;
        struct sctp_ulpevent *held;
        struct sk_buff *skb, *next;
        __u16 sid = event->stream;
        __u16 held_sid, held_ssn;

        /* The skb's ->prev is used as the list head to append to. */
        event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev;

        /* We are holding the chunks by stream, by SSN.  */
        sctp_skb_for_each(skb, &ulpq->lobby, next) {
                held = (struct sctp_ulpevent *) skb->cb;
                held_sid = held->stream;
                held_ssn = held->ssn;

                /* Still in a lower stream: keep looking. */
                if (held_sid < sid)
                        continue;

                /* Past our stream: nothing more to find. */
                if (held_sid > sid)
                        break;

                /* A gap in the SSN sequence ends the gathering. */
                if (held_ssn != sctp_ssn_peek(stream, in, sid))
                        break;

                /* This is the expected SSN, so mark it in the stream
                 * and move the skb from the lobby to the event's list.
                 */
                sctp_ssn_next(stream, in, sid);
                __skb_unlink(skb, &ulpq->lobby);
                __skb_queue_tail(event_list, skb);
        }
}

/* Helper function to store chunks needing ordering.
 *
 * Links the event's skb into ulpq->lobby, which is kept sorted by
 * stream ID and, within one stream, by SSN (compared with SSN_lt()).
 */
static void sctp_ulpq_store_ordered(struct sctp_ulpq *ulpq,
                                           struct sctp_ulpevent *event)
{
        struct sctp_ulpevent *held;
        struct sk_buff *cur;
        __u16 sid, ssn;
        __u16 held_sid, held_ssn;

        /* An empty lobby needs no searching. */
        cur = skb_peek_tail(&ulpq->lobby);
        if (!cur) {
                __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
                return;
        }

        sid = event->stream;
        ssn = event->ssn;

        /* Check the tail first: an event that belongs after the current
         * last entry is simply appended.
         */
        held = (struct sctp_ulpevent *) cur->cb;
        held_sid = held->stream;
        held_ssn = held->ssn;
        if (sid > held_sid || (sid == held_sid && SSN_lt(held_ssn, ssn))) {
                __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
                return;
        }

        /* Otherwise search from the front for the first entry that must
         * come after the new event: one from a higher stream, or one from
         * the same stream with a later SSN.
         */
        skb_queue_walk(&ulpq->lobby, cur) {
                held = (struct sctp_ulpevent *) cur->cb;
                held_sid = held->stream;
                held_ssn = held->ssn;

                if (held_sid > sid ||
                    (held_sid == sid && SSN_lt(ssn, held_ssn)))
                        break;
        }

        /* Insert in front of the entry the search stopped on. */
        __skb_queue_before(&ulpq->lobby, cur, sctp_event2skb(event));
}

/* Apply stream ordering to an event.
 *
 * Returns the event when it may be delivered: either it is flagged
 * SCTP_DATA_UNORDERED, or its SSN is the next one expected on its
 * stream (in which case any lobby entries that follow in sequence are
 * gathered behind it).  Returns NULL when the event was stored in the
 * lobby to wait for earlier SSNs.
 */
static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
                                             struct sctp_ulpevent *event)
{
        struct sctp_stream *stream;
        __u16 sid, ssn;

        /* Unordered messages bypass the lobby entirely. */
        if (event->msg_flags & SCTP_DATA_UNORDERED)
                return event;

        /* Note: The stream ID must be verified before this routine.  */
        stream = &ulpq->asoc->stream;
        sid = event->stream;
        ssn = event->ssn;

        if (ssn == sctp_ssn_peek(stream, in, sid)) {
                /* This is the expected SSN: consume it, then pull in
                 * any chunks that were waiting behind it.
                 */
                sctp_ssn_next(stream, in, sid);
                sctp_ulpq_retrieve_ordered(ulpq, event);
                return event;
        }

        /* We've received something out of order, so park it in the
         * lobby, which is ordered by stream and then by SSN.
         */
        sctp_ulpq_store_ordered(ulpq, event);
        return NULL;
}

/* Helper function to gather skbs that have possibly become
 * ordered by forward tsn skipping their dependencies.
 *
 * For stream 'sid', every lobby entry whose SSN is before the stream's
 * next expected inbound SSN is moved to a temporary list.  If none was
 * found, the entry the walk stopped on is taken instead when it belongs
 * to 'sid' and carries exactly the expected SSN.  Whatever was collected
 * is extended via sctp_ulpq_retrieve_ordered() and then passed to
 * sctp_ulpq_tail_event().
 */
static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
{
        struct sk_buff *pos, *tmp;
        struct sctp_ulpevent *cevent;
        struct sctp_ulpevent *event;
        struct sctp_stream *stream;
        struct sk_buff_head temp;
        struct sk_buff_head *lobby = &ulpq->lobby;
        __u16 csid, cssn;

        stream = &ulpq->asoc->stream;

        /* We are holding the chunks by stream, by SSN.  */
        skb_queue_head_init(&temp);
        event = NULL;
        sctp_skb_for_each(pos, lobby, tmp) {
                cevent = (struct sctp_ulpevent *) pos->cb;
                csid = cevent->stream;
                cssn = cevent->ssn;

                /* Have we gone too far?  */
                if (csid > sid)
                        break;

                /* Have we not gone far enough?  */
                if (csid < sid)
                        continue;

                /* See if this SSN has been marked by skipping: stop at
                 * the first entry that is not before the expected SSN.
                 */
                if (!SSN_lt(cssn, sctp_ssn_peek(stream, in, csid)))
                        break;

                __skb_unlink(pos, lobby);
                if (!event)
                        /* Remember the first reaped skb's event; it
                         * heads the temporary list.
                         */
                        event = sctp_skb2event(pos);

                /* Attach all gathered skbs to the event.  */
                __skb_queue_tail(&temp, pos);
        }

        /* If we didn't reap any data, see if the next expected SSN
         * is next on the queue and if so, use that.  The comparison of
         * 'pos' against the list head presumably distinguishes a walk
         * that stopped on an entry from one that ran off the end.
         */
        if (event == NULL && pos != (struct sk_buff *)lobby) {
                cevent = (struct sctp_ulpevent *) pos->cb;
                csid = cevent->stream;
                cssn = cevent->ssn;

                if (csid == sid && cssn == sctp_ssn_peek(stream, in, csid)) {
                        sctp_ssn_next(stream, in, csid);
                        __skb_unlink(pos, lobby);
                        __skb_queue_tail(&temp, pos);
                        event = sctp_skb2event(pos);
                }
        }

        /* Send event to the ULP.  'event' is the sctp_ulpevent for
         * very first SKB on the 'temp' list.
         */
        if (event) {
                /* see if we have more ordered that we can deliver */
                sctp_ulpq_retrieve_ordered(ulpq, event);
                sctp_ulpq_tail_event(ulpq, &temp);
        }
}

/* Skip over an SSN. This is used during the processing of a
 * Forward TSN chunk to skip over the abandoned ordered data.
 *
 * An SSN that is already before the stream's next expected inbound SSN
 * is ignored.
 */
void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn)
{
        /* Note: The stream ID must be verified before this routine.  */
        struct sctp_stream *stream = &ulpq->asoc->stream;

        /* Only act on an SSN that is not already behind us. */
        if (!SSN_lt(ssn, sctp_ssn_peek(stream, in, sid))) {
                /* Mark that we are no longer expecting this SSN or
                 * lower.
                 */
                sctp_ssn_skip(stream, in, sid, ssn);

                /* Go find any other chunks that were waiting for
                 * ordering and deliver them if needed.
                 */
                sctp_ulpq_reap_ordered(ulpq, sid);
        }
}

/* Renege events from the tail of 'list' until at least 'needed' bytes
 * have been counted as freed, the list is empty, or the tail event's TSN
 * is at or below the Cumulative TSN ACK Point.
 *
 * Returns the number of bytes counted (skb_headlen() of each freed skb
 * and of the skbs on its frag_list), accumulated in a __u16.
 */
__u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq, struct sk_buff_head *list,
                            __u16 needed)
{
        struct sctp_tsnmap *tsnmap = &ulpq->asoc->peer.tsn_map;
        struct sk_buff *skb, *flist, *last;
        struct sctp_ulpevent *event;
        __u32 tsn, last_tsn;
        __u16 freed = 0;

        for (;;) {
                skb = skb_peek_tail(list);
                if (skb == NULL)
                        break;

                event = sctp_skb2event(skb);
                tsn = event->tsn;

                /* Don't renege below the Cumulative TSN ACK Point. */
                if (TSN_lte(tsn, sctp_tsnmap_get_ctsn(tsnmap)))
                        break;

                /* Events in ordering queue may have multiple fragments
                 * corresponding to additional TSNs.  Sum the total
                 * freed space; find the last TSN.
                 */
                freed += skb_headlen(skb);
                last = skb_shinfo(skb)->frag_list;
                for (flist = last; flist; flist = flist->next) {
                        last = flist;
                        freed += skb_headlen(flist);
                }

                /* Without a frag_list the event covers a single TSN. */
                last_tsn = last ? sctp_skb2event(last)->tsn : tsn;

                /* Unlink the event, then renege all applicable TSNs. */
                __skb_unlink(skb, list);
                sctp_ulpevent_free(event);
                for (; TSN_lte(tsn, last_tsn); tsn++)
                        sctp_tsnmap_renege(tsnmap, tsn);

                /* Enough has been released; stop reneging. */
                if (freed >= needed)
                        break;
        }

        return freed;
}

/* Renege 'needed' bytes from the ordering queue.
 *
 * Thin wrapper: runs sctp_ulpq_renege_list() on ulpq->lobby and returns
 * its result unchanged.
 */
static __u16 sctp_ulpq_renege_order(struct sctp_ulpq *ulpq, __u16 needed)
{
        return sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed);
}

/* Renege 'needed' bytes from the reassembly queue.
 *
 * Thin wrapper: runs sctp_ulpq_renege_list() on ulpq->reasm and returns
 * its result unchanged.
 */
static __u16 sctp_ulpq_renege_frags(struct sctp_ulpq *ulpq, __u16 needed)
{
        return sctp_ulpq_renege_list(ulpq, &ulpq->reasm, needed);
}

/* Partial deliver the first message as there is pressure on rwnd.
 *
 * Does nothing when the queue is already in partial delivery mode, when
 * the first fragment on the reassembly queue is above the Cumulative TSN
 * ACK Point, when the socket is already in partial delivery mode without
 * fragment interleave enabled, or when sctp_ulpq_retrieve_first() finds
 * nothing to deliver.  The 'gfp' argument is not used here.
 */
void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
                                gfp_t gfp)
{
        struct sctp_association *asoc = ulpq->asoc;
        struct sctp_sock *sp = sctp_sk(asoc->base.sk);
        struct sctp_ulpevent *event;
        struct sk_buff_head temp;
        struct sk_buff *skb;
        __u32 ctsn;

        /* If the association is already in Partial Delivery mode
         * we have nothing to do.
         */
        if (ulpq->pd_mode)
                return;

        /* Data must be at or below the Cumulative TSN ACK Point to
         * start partial delivery.
         */
        skb = skb_peek(&asoc->ulpq.reasm);
        if (skb != NULL) {
                ctsn = sctp_skb2event(skb)->tsn;
                if (!TSN_lte(ctsn, sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map)))
                        return;
        }

        /* If the user enabled fragment interleave socket option,
         * multiple associations can enter partial delivery.
         * Otherwise, we can only enter partial delivery if the
         * socket is not in partial deliver mode.
         */
        if (!sp->frag_interleave && atomic_read(&sp->pd_mode) != 0)
                return;

        /* Is partial delivery possible?  */
        event = sctp_ulpq_retrieve_first(ulpq);
        if (!event)
                return;

        /* Send the event to the ULP, then mark partial delivery mode. */
        skb_queue_head_init(&temp);
        __skb_queue_tail(&temp, sctp_event2skb(event));
        sctp_ulpq_tail_event(ulpq, &temp);
        sctp_ulpq_set_pd(ulpq);
}

/* Renege some packets to make room for an incoming chunk.
 *
 * 'needed' is the chunk length from its header minus
 * sizeof(struct sctp_data_chunk).  Reneging is only attempted while the
 * socket receive queue is empty: first from the ordering queue, then
 * from the reassembly queue for whatever is still missing.  The chunk is
 * accepted only if sk_rmem_schedule() succeeds and enough was freed.
 */
void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
                      gfp_t gfp)
{
        struct sctp_association *asoc = ulpq->asoc;
        __u16 needed = ntohs(chunk->chunk_hdr->length) -
                       sizeof(struct sctp_data_chunk);
        __u32 freed = 0;
        int retval;

        if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) {
                freed = sctp_ulpq_renege_order(ulpq, needed);
                if (freed < needed)
                        freed += sctp_ulpq_renege_frags(ulpq, needed - freed);
        }

        /* Not able to free enough room: the chunk is not accepted.
         * sk_rmem_schedule() is deliberately evaluated first.
         */
        if (!sk_rmem_schedule(asoc->base.sk, chunk->skb, needed) ||
            freed < needed)
                return;

        retval = sctp_ulpq_tail_data(ulpq, chunk, gfp);

        /*
         * Enter partial delivery if chunk has not been
         * delivered; otherwise, drain the reassembly queue.
         */
        if (retval <= 0)
                sctp_ulpq_partial_delivery(ulpq, gfp);
        else if (retval == 1)
                sctp_ulpq_reasm_drain(ulpq);
}

/* Notify the application if an association is aborted and in
 * partial delivery mode.  Send up any pending received messages.
 *
 * When the SCTP_PARTIAL_DELIVERY_EVENT subscription is enabled, a
 * SCTP_PARTIAL_DELIVERY_ABORTED event is allocated with 'gfp' and queued
 * on the socket receive queue.  Partial delivery mode is then cleared and
 * the socket's data-ready callback is invoked at most once.
 */
void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
{
        struct sctp_ulpevent *ev = NULL;
        struct sctp_sock *sp;
        struct sock *sk;
        int pending;

        /* Nothing to abort outside of partial delivery mode. */
        if (!ulpq->pd_mode)
                return;

        sk = ulpq->asoc->base.sk;
        sp = sctp_sk(sk);

        if (sctp_ulpevent_type_enabled(ulpq->asoc->subscribe,
                                       SCTP_PARTIAL_DELIVERY_EVENT)) {
                ev = sctp_ulpevent_make_pdapi(ulpq->asoc,
                                              SCTP_PARTIAL_DELIVERY_ABORTED,
                                              0, 0, 0, gfp);
                /* The allocation may fail; queue only a real event. */
                if (ev)
                        __skb_queue_tail(&sk->sk_receive_queue,
                                         sctp_event2skb(ev));
        }

        /* Partial delivery mode is always cleared here; its result and
         * the queued notification both count as data waiting.
         */
        pending = sctp_ulpq_clear_pd(ulpq) || ev;
        if (!pending || sp->data_ready_signalled)
                return;

        /* If there is data waiting, send it up the socket now. */
        sp->data_ready_signalled = 1;
        sk->sk_data_ready(sk);
}







































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Because linux/module.h has tracepoints in the header, and ftrace.h
 * used to include this file, define_trace.h includes linux/module.h
 * But we do not want the module.h to override the TRACE_SYSTEM macro
 * variable that define_trace.h is processing, so we only set it
 * when module events are being processed, which would happen when
 * CREATE_TRACE_POINTS is defined.
 */
#ifdef CREATE_TRACE_POINTS
#undef TRACE_SYSTEM
#define TRACE_SYSTEM module
#endif

#if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MODULE_H

#include <linux/tracepoint.h>

#ifdef CONFIG_MODULES

struct module;

/*
 * Format a taint bitmask for trace output via __print_flags() with an
 * empty delimiter string.  Each listed TAINT_* bit maps to one letter:
 * P (proprietary), O (out-of-tree), F (forced), C (crap), E (unsigned).
 */
#define show_module_flags(flags) __print_flags(flags, "",        \
        { (1UL << TAINT_PROPRIETARY_MODULE),        "P" },                \
        { (1UL << TAINT_OOT_MODULE),                "O" },                \
        { (1UL << TAINT_FORCED_MODULE),                "F" },                \
        { (1UL << TAINT_CRAP),                        "C" },                \
        { (1UL << TAINT_UNSIGNED_MODULE),        "E" })

/*
 * module_load tracepoint: takes a struct module pointer and records the
 * module's taints value and its name.  Printed as "<name> <flags>", with
 * the flags rendered by show_module_flags().
 */
TRACE_EVENT(module_load,

        TP_PROTO(struct module *mod),

        TP_ARGS(mod),

        TP_STRUCT__entry(
                __field(        unsigned int,        taints                )
                __string(        name,                mod->name        )
        ),

        TP_fast_assign(
                __entry->taints = mod->taints;
                __assign_str(name);
        ),

        TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints))
);

/*
 * module_free tracepoint: takes a struct module pointer and records only
 * the module's name, which is also all that is printed.
 */
TRACE_EVENT(module_free,

        TP_PROTO(struct module *mod),

        TP_ARGS(mod),

        TP_STRUCT__entry(
                __string(        name,                mod->name        )
        ),

        TP_fast_assign(
                __assign_str(name);
        ),

        TP_printk("%s", __get_str(name))
);

#ifdef CONFIG_MODULE_UNLOAD
/* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */

/*
 * Event class shared by the module_get and module_put events.  Records
 * the 'ip' argument, the module's refcnt as read with atomic_read() at
 * the time of the event, and the module name.  'ip' is printed with %ps
 * under the label call_site.
 */
DECLARE_EVENT_CLASS(module_refcnt,

        TP_PROTO(struct module *mod, unsigned long ip),

        TP_ARGS(mod, ip),

        TP_STRUCT__entry(
                __field(        unsigned long,        ip                )
                __field(        int,                refcnt                )
                __string(        name,                mod->name        )
        ),

        TP_fast_assign(
                __entry->ip        = ip;
                __entry->refcnt        = atomic_read(&mod->refcnt);
                __assign_str(name);
        ),

        TP_printk("%s call_site=%ps refcnt=%d",
                  __get_str(name), (void *)__entry->ip, __entry->refcnt)
);

/* module_get event: an instance of the module_refcnt event class. */
DEFINE_EVENT(module_refcnt, module_get,

        TP_PROTO(struct module *mod, unsigned long ip),

        TP_ARGS(mod, ip)
);

/* module_put event: an instance of the module_refcnt event class. */
DEFINE_EVENT(module_refcnt, module_put,

        TP_PROTO(struct module *mod, unsigned long ip),

        TP_ARGS(mod, ip)
);
#endif /* CONFIG_MODULE_UNLOAD */

/*
 * module_request tracepoint: takes a name string, a 'wait' flag and an
 * 'ip' value, and records all three.  Printed as
 * "<name> wait=<0|1> call_site=<symbol>", with 'ip' formatted via %ps.
 */
TRACE_EVENT(module_request,

        TP_PROTO(char *name, bool wait, unsigned long ip),

        TP_ARGS(name, wait, ip),

        TP_STRUCT__entry(
                __field(        unsigned long,        ip                )
                __field(        bool,                wait                )
                __string(        name,                name                )
        ),

        TP_fast_assign(
                __entry->ip        = ip;
                __entry->wait        = wait;
                __assign_str(name);
        ),

        TP_printk("%s wait=%d call_site=%ps",
                  __get_str(name), (int)__entry->wait, (void *)__entry->ip)
);

#endif /* CONFIG_MODULES */

#endif /* _TRACE_MODULE_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



































    1 





    1 


















    1 





    1 












































    1 



























    1 




    1 

















    1 



    1 







    1 






















    1 







    1 












































    1 















    1 




    1 



    1 




























    1 



    1 

    1 




    1 



































    1 





    1 

    1 

    1 













    1 

    1 























































































































































































































































































































    1 


    1 





    1 





    1 
    1 

































































































































































































    1 






























    1 
















    1 

































































































































































    1 







    1 















    1 









































    1 







    1 



    1 







    1 





    1 







    1 
    1 
    1 





    1 













    1 



    1 

















    1 




    1 
























    1 













































































































    1 




































    1 




    1 









    1 

















    1 




    1 




    1 









































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
// SPDX-License-Identifier: GPL-2.0
/*
 * NETLINK      Generic Netlink Family
 *
 *                 Authors:        Jamal Hadi Salim
 *                                 Thomas Graf <tgraf@suug.ch>
 *                                Johannes Berg <johannes@sipsolutions.net>
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/string_helpers.h>
#include <linux/skbuff.h>
#include <linux/mutex.h>
#include <linux/bitmap.h>
#include <linux/rwsem.h>
#include <linux/idr.h>
#include <net/sock.h>
#include <net/genetlink.h>

#include "genetlink.h"

static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */
/* Taken for writing by genl_lock_all() before genl_mutex is acquired. */
static DECLARE_RWSEM(cb_lock);

atomic_t genl_sk_destructing_cnt = ATOMIC_INIT(0);
DECLARE_WAIT_QUEUE_HEAD(genl_sk_destructing_waitq);

/* Acquire genl_mutex, the mutex that serializes message processing.
 * Exported so code outside this file can take it.
 */
void genl_lock(void)
{
        mutex_lock(&genl_mutex);
}
EXPORT_SYMBOL(genl_lock);

/* Release genl_mutex; the counterpart of genl_lock().  Exported. */
void genl_unlock(void)
{
        mutex_unlock(&genl_mutex);
}
EXPORT_SYMBOL(genl_unlock);

/* Take both locks: cb_lock for writing first, then genl_mutex.
 * genl_unlock_all() releases them in the reverse order.
 */
static void genl_lock_all(void)
{
        down_write(&cb_lock);
        genl_lock();
}

/*
 * Release both global locks in the reverse order of genl_lock_all():
 * genl_mutex first, then the write side of cb_lock.
 */
static void genl_unlock_all(void)
{
        genl_unlock();
        up_write(&cb_lock);
}

/*
 * Take genl_mutex on behalf of @family before running one of its ops.
 * Families that declare parallel_ops run without the global mutex.
 */
static void genl_op_lock(const struct genl_family *family)
{
        if (family->parallel_ops)
                return;

        genl_lock();
}

/*
 * Counterpart of genl_op_lock(): drop genl_mutex again unless @family
 * declares parallel_ops (in which case it was never taken).
 */
static void genl_op_unlock(const struct genl_family *family)
{
        if (family->parallel_ops)
                return;

        genl_unlock();
}

/* All registered families, keyed by their family ID. */
static DEFINE_IDR(genl_fam_idr);

/*
 * Bitmap of multicast groups that are currently in use.
 *
 * To avoid an allocation at boot of just one unsigned long,
 * declare it global instead.
 * Bit 0 is marked as already used since group 0 is invalid.
 * Bit 1 is marked as already used since the drop-monitor code
 * abuses the API and thinks it can statically use group 1.
 * That group will typically conflict with other groups that
 * any proper users use.
 * Bit 16 is marked as used since it's used for generic netlink
 * and the code no longer marks pre-reserved IDs as used.
 * Bit 17 is marked as already used since the VFS quota code
 * also abused this API and relied on family == group ID, we
 * cater to that by giving it a static family and group ID.
 * Bit 18 is marked as already used since the PMCRAID driver
 * did the same thing as the VFS quota code (maybe copied?)
 */
static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) |
                                      BIT(GENL_ID_VFS_DQUOT) |
                                      BIT(GENL_ID_PMCRAID);
/*
 * The bitmap in use. It points at mc_group_start until
 * genl_allocate_reserve_groups() has to grow it into heap memory.
 */
static unsigned long *mc_groups = &mc_group_start;
/* Length of the mc_groups bitmap, counted in unsigned longs. */
static unsigned long mc_groups_longs = 1;

/*
 * Policy that rejects attribute types 0 and 1. It is installed, with
 * maxattr = 1, by the genl_op_fill_in_reject_policy*() helpers for ops that
 * come without a policy of their own.
 *
 * We need the last attribute with non-zero ID therefore a 2-entry array
 */
static struct nla_policy genl_policy_reject_all[] = {
        { .type = NLA_REJECT },
        { .type = NLA_REJECT },
};

/*
 * Forward declaration: the registration and unregistration paths below call
 * this to announce family and multicast group changes (CTRL_CMD_* events).
 */
static int genl_ctrl_event(int event, const struct genl_family *family,
                           const struct genl_multicast_group *grp,
                           int grp_id);

/*
 * Give a legacy op the reject-all policy when it has no policy and its
 * command number is at or above the family's resv_start_op. Ops below
 * resv_start_op, or ops that already have a policy, are left untouched.
 */
static void
genl_op_fill_in_reject_policy(const struct genl_family *family,
                              struct genl_ops *op)
{
        /* maxattr below must be the highest index of the reject-all policy */
        BUILD_BUG_ON(ARRAY_SIZE(genl_policy_reject_all) - 1 != 1);

        if (!op->policy && op->cmd >= family->resv_start_op) {
                op->policy = genl_policy_reject_all;
                op->maxattr = 1;
        }
}

/*
 * Split-op variant of the reject-all fallback: an op without a policy
 * gets genl_policy_reject_all with maxattr 1. @family is not consulted.
 */
static void
genl_op_fill_in_reject_policy_split(const struct genl_family *family,
                                    struct genl_split_ops *op)
{
        if (!op->policy) {
                op->policy = genl_policy_reject_all;
                op->maxattr = 1;
        }
}

/* Look up the family stored under @id in genl_fam_idr (result of idr_find()). */
static const struct genl_family *genl_family_find_byid(unsigned int id)
{
        return idr_find(&genl_fam_idr, id);
}

static const struct genl_family *genl_family_find_byname(char *name)
{
        const struct genl_family *family;
        unsigned int id;

        idr_for_each_entry(&genl_fam_idr, family, id)
                if (strcmp(family->name, name) == 0)
                        return family;

        return NULL;
}

/*
 * Iterator over the commands of a family. genl_op_iter_next() walks the
 * family's ops, small_ops and split_ops arrays in that order and describes
 * the current command through doit/dumpit, cmd and flags.
 */
struct genl_op_iter {
        /* family being iterated */
        const struct genl_family *family;
        /* do half of the current command; all-zero if it has none */
        struct genl_split_ops doit;
        /* dump half of the current command; all-zero if it has none */
        struct genl_split_ops dumpit;
        /* number of commands returned so far */
        int cmd_idx;
        /* position within the concatenated ops/small_ops/split_ops entries */
        int entry_idx;
        /* doit.cmd | dumpit.cmd of the current command */
        u32 cmd;
        /* doit.flags | dumpit.flags of the current command */
        u8 flags;
};

static void genl_op_from_full(const struct genl_family *family,
                              unsigned int i, struct genl_ops *op)
{
        *op = family->ops[i];

        if (!op->maxattr)
                op->maxattr = family->maxattr;
        if (!op->policy)
                op->policy = family->policy;

        genl_op_fill_in_reject_policy(family, op);
}

/*
 * Search the family's full ops for command @cmd. On a match the op is
 * expanded into @op and 0 is returned; otherwise -ENOENT and @op is left
 * untouched.
 */
static int genl_get_cmd_full(u32 cmd, const struct genl_family *family,
                             struct genl_ops *op)
{
        int idx;

        for (idx = 0; idx < family->n_ops; idx++) {
                if (family->ops[idx].cmd != cmd)
                        continue;

                genl_op_from_full(family, idx, op);
                return 0;
        }

        return -ENOENT;
}

static void genl_op_from_small(const struct genl_family *family,
                               unsigned int i, struct genl_ops *op)
{
        memset(op, 0, sizeof(*op));
        op->doit        = family->small_ops[i].doit;
        op->dumpit        = family->small_ops[i].dumpit;
        op->cmd                = family->small_ops[i].cmd;
        op->internal_flags = family->small_ops[i].internal_flags;
        op->flags        = family->small_ops[i].flags;
        op->validate        = family->small_ops[i].validate;

        op->maxattr = family->maxattr;
        op->policy = family->policy;

        genl_op_fill_in_reject_policy(family, op);
}

/*
 * Search the family's small ops for command @cmd. On a match the op is
 * expanded into @op and 0 is returned; otherwise -ENOENT and @op is left
 * untouched.
 */
static int genl_get_cmd_small(u32 cmd, const struct genl_family *family,
                              struct genl_ops *op)
{
        int idx;

        for (idx = 0; idx < family->n_small_ops; idx++) {
                if (family->small_ops[idx].cmd != cmd)
                        continue;

                genl_op_from_small(family, idx, op);
                return 0;
        }

        return -ENOENT;
}

/*
 * Load the next command from the family's split_ops array into @iter.
 *
 * iter->entry_idx counts entries across ops, small_ops and split_ops, so the
 * full and small op counts are subtracted to get the split_ops index. The
 * caller, genl_op_iter_next(), only gets here while entry_idx lies inside
 * the split_ops range.
 *
 * A command takes one or two consecutive entries: an optional DO entry,
 * optionally followed by a DUMP entry. A DUMP entry right after a DO entry
 * is only consumed if it has the same cmd. The half that is not present is
 * zeroed, and entry_idx is advanced by the number of entries consumed.
 */
static void genl_op_from_split(struct genl_op_iter *iter)
{
        const struct genl_family *family = iter->family;
        int i, cnt = 0;

        i = iter->entry_idx - family->n_ops - family->n_small_ops;

        /* First entry: the DO half, if this entry is one. */
        if (family->split_ops[i + cnt].flags & GENL_CMD_CAP_DO) {
                iter->doit = family->split_ops[i + cnt];
                genl_op_fill_in_reject_policy_split(family, &iter->doit);
                cnt++;
        } else {
                memset(&iter->doit, 0, sizeof(iter->doit));
        }

        /*
         * Next entry (bounds-checked, since a DO may have been the last
         * entry): the DUMP half, which must match the DO's cmd if a DO was
         * just consumed.
         */
        if (i + cnt < family->n_split_ops &&
            family->split_ops[i + cnt].flags & GENL_CMD_CAP_DUMP &&
            (!cnt || family->split_ops[i + cnt].cmd == iter->doit.cmd)) {
                iter->dumpit = family->split_ops[i + cnt];
                genl_op_fill_in_reject_policy_split(family, &iter->dumpit);
                cnt++;
        } else {
                memset(&iter->dumpit, 0, sizeof(iter->dumpit));
        }

        /* An entry that is neither DO nor DUMP would not advance the iterator. */
        WARN_ON(!cnt);
        iter->entry_idx += cnt;
}

/*
 * Find the split op of @family that handles @cmd and has at least one of
 * the bits in @flag set. The entry is copied to @op verbatim and 0 is
 * returned; without a match the result is -ENOENT and @op is not written.
 */
static int
genl_get_cmd_split(u32 cmd, u8 flag, const struct genl_family *family,
                   struct genl_split_ops *op)
{
        int idx;

        for (idx = 0; idx < family->n_split_ops; idx++) {
                if (family->split_ops[idx].cmd != cmd)
                        continue;
                if (!(family->split_ops[idx].flags & flag))
                        continue;

                *op = family->split_ops[idx];
                return 0;
        }

        return -ENOENT;
}

/*
 * Convert one half of a legacy op into split form. @flags selects the half
 * (GENL_CMD_CAP_DO or GENL_CMD_CAP_DUMP).
 *
 * Returns -ENOENT, with @op zeroed, when @full lacks the requested handler.
 * Otherwise @op is filled in and 0 is returned.
 */
static int
genl_cmd_full_to_split(struct genl_split_ops *op,
                       const struct genl_family *family,
                       const struct genl_ops *full, u8 flags)
{
        const u8 for_dump = flags & GENL_CMD_CAP_DUMP;
        const u8 for_do = flags & GENL_CMD_CAP_DO;

        if ((for_do && !full->doit) || (for_dump && !full->dumpit)) {
                memset(op, 0, sizeof(*op));
                return -ENOENT;
        }

        /* Fields common to both halves. */
        op->cmd = full->cmd;
        op->internal_flags = full->internal_flags;
        op->validate = full->validate;
        /* Make sure flags include the GENL_CMD_CAP_DO / GENL_CMD_CAP_DUMP */
        op->flags = full->flags | flags;

        if (!for_dump) {
                /* pre/post hooks are per family, not per op */
                op->pre_doit = family->pre_doit;
                op->doit = full->doit;
                op->post_doit = family->post_doit;

                op->policy = full->policy;
                op->maxattr = full->maxattr;
                return 0;
        }

        op->start = full->start;
        op->dumpit = full->dumpit;
        op->done = full->done;

        if (full->validate & GENL_DONT_VALIDATE_DUMP) {
                /* dump requests are not validated, so carry no policy */
                op->policy = NULL;
                op->maxattr = 0;
        } else {
                op->policy = full->policy;
                op->maxattr = full->maxattr;
        }

        return 0;
}

/*
 * Look up the do or dump handler (per @flags) of @cmd in @family, trying
 * the full ops, then the small ops, then the split ops.
 *
 * Must make sure that op is initialized to 0 on failure
 */
static int
genl_get_cmd(u32 cmd, u8 flags, const struct genl_family *family,
             struct genl_split_ops *op)
{
        struct genl_ops legacy;
        int ret;

        ret = genl_get_cmd_full(cmd, family, &legacy);
        if (ret == -ENOENT)
                ret = genl_get_cmd_small(cmd, family, &legacy);

        /* One of the legacy forms matched; the conversion zeroes op on failure. */
        if (!ret)
                return genl_cmd_full_to_split(op, family, &legacy, flags);

        ret = genl_get_cmd_split(cmd, flags, family, op);
        if (ret)
                memset(op, 0, sizeof(*op));

        return ret;
}

/* Policy dumping helper: fetch the do and the dump op of @cmd in one go.
 * Succeeds if at least one of them exists. A missing half is left zeroed
 * by genl_get_cmd(); -ENOENT is returned only when both are missing.
 */
static int
genl_get_cmd_both(u32 cmd, const struct genl_family *family,
                  struct genl_split_ops *doit, struct genl_split_ops *dumpit)
{
        int do_err = genl_get_cmd(cmd, GENL_CMD_CAP_DO, family, doit);
        int dump_err = genl_get_cmd(cmd, GENL_CMD_CAP_DUMP, family, dumpit);

        if (do_err && dump_err)
                return -ENOENT;

        return 0;
}

static bool
genl_op_iter_init(const struct genl_family *family, struct genl_op_iter *iter)
{
        iter->family = family;
        iter->cmd_idx = 0;
        iter->entry_idx = 0;

        iter->flags = 0;

        return iter->family->n_ops +
                iter->family->n_small_ops +
                iter->family->n_split_ops;
}

static bool genl_op_iter_next(struct genl_op_iter *iter)
{
        const struct genl_family *family = iter->family;
        bool legacy_op = true;
        struct genl_ops op;

        if (iter->entry_idx < family->n_ops) {
                genl_op_from_full(family, iter->entry_idx, &op);
        } else if (iter->entry_idx < family->n_ops + family->n_small_ops) {
                genl_op_from_small(family, iter->entry_idx - family->n_ops,
                                   &op);
        } else if (iter->entry_idx <
                   family->n_ops + family->n_small_ops + family->n_split_ops) {
                legacy_op = false;
                /* updates entry_idx */
                genl_op_from_split(iter);
        } else {
                return false;
        }

        iter->cmd_idx++;

        if (legacy_op) {
                iter->entry_idx++;

                genl_cmd_full_to_split(&iter->doit, family,
                                       &op, GENL_CMD_CAP_DO);
                genl_cmd_full_to_split(&iter->dumpit, family,
                                       &op, GENL_CMD_CAP_DUMP);
        }

        iter->cmd = iter->doit.cmd | iter->dumpit.cmd;
        iter->flags = iter->doit.flags | iter->dumpit.flags;

        return true;
}

/* Duplicate the complete iterator state of @src into @dst (struct copy). */
static void
genl_op_iter_copy(struct genl_op_iter *dst, struct genl_op_iter *src)
{
        *dst = *src;
}

/*
 * Return iter->cmd_idx, i.e. how many commands genl_op_iter_next() has
 * produced on this iterator so far.
 */
static unsigned int genl_op_iter_idx(struct genl_op_iter *iter)
{
        return iter->cmd_idx;
}

/*
 * Find and reserve @n_groups consecutive free bits in the mc_groups bitmap.
 *
 * On success the bits are set, the first group ID is stored in @first_id
 * and 0 is returned. The bitmap is grown by BITS_TO_LONGS(@n_groups) longs
 * whenever the candidate range would run past its end. -ENOMEM is returned
 * if that allocation fails.
 */
static int genl_allocate_reserve_groups(int n_groups, int *first_id)
{
        unsigned long *new_groups;
        int start = 0;
        int i;
        int id;
        bool fits;

        do {
                /* Candidate start: first free bit at or after 'start'. */
                if (start == 0)
                        id = find_first_zero_bit(mc_groups,
                                                 mc_groups_longs *
                                                 BITS_PER_LONG);
                else
                        id = find_next_zero_bit(mc_groups,
                                                mc_groups_longs * BITS_PER_LONG,
                                                start);

                /*
                 * Check the part of the candidate range that lies inside
                 * the current bitmap. If a bit is taken, the search resumes
                 * from that bit.
                 */
                fits = true;
                for (i = id;
                     i < min_t(int, id + n_groups,
                               mc_groups_longs * BITS_PER_LONG);
                     i++) {
                        if (test_bit(i, mc_groups)) {
                                start = i;
                                fits = false;
                                break;
                        }
                }

                /* Candidate range extends past the bitmap: grow it. */
                if (id + n_groups > mc_groups_longs * BITS_PER_LONG) {
                        unsigned long new_longs = mc_groups_longs +
                                                  BITS_TO_LONGS(n_groups);
                        size_t nlen = new_longs * sizeof(unsigned long);

                        if (mc_groups == &mc_group_start) {
                                /*
                                 * First growth: move from the static long
                                 * to a zeroed heap bitmap and carry over
                                 * the bits of the static long.
                                 */
                                new_groups = kzalloc(nlen, GFP_KERNEL);
                                if (!new_groups)
                                        return -ENOMEM;
                                mc_groups = new_groups;
                                *mc_groups = mc_group_start;
                        } else {
                                /*
                                 * mc_groups is only replaced once krealloc()
                                 * succeeded. The added longs are cleared by
                                 * hand.
                                 */
                                new_groups = krealloc(mc_groups, nlen,
                                                      GFP_KERNEL);
                                if (!new_groups)
                                        return -ENOMEM;
                                mc_groups = new_groups;
                                for (i = 0; i < BITS_TO_LONGS(n_groups); i++)
                                        mc_groups[mc_groups_longs + i] = 0;
                        }
                        mc_groups_longs = new_longs;
                }
        } while (!fits);

        /* Mark the whole range as used and report its first ID. */
        for (i = id; i < id + n_groups; i++)
                set_bit(i, mc_groups);
        *first_id = id;
        return 0;
}

/*
 * Forward declaration of the genl_ctrl family object; the code below
 * compares family pointers against it to special-case that family.
 */
static struct genl_family genl_ctrl;

/*
 * Validate the multicast group names of @family and assign its group IDs.
 *
 * Every group name must be non-empty and NUL-terminated within GENL_NAMSIZ
 * bytes. A handful of families get fixed group IDs (see below). All others
 * get a freshly reserved range from genl_allocate_reserve_groups(). The
 * first ID is stored in family->mcgrp_offset. Once init_net's genetlink
 * socket exists, the netlink group count of the affected socket(s) is
 * raised to cover the whole bitmap.
 *
 * Returns 0 on success (including families without groups) or a negative
 * error code. A dynamically reserved range is released again if resizing
 * the netlink groups fails.
 */
static int genl_validate_assign_mc_groups(struct genl_family *family)
{
        int first_id;
        int n_groups = family->n_mcgrps;
        int err = 0, i;
        bool groups_allocated = false;

        if (!n_groups)
                return 0;

        for (i = 0; i < n_groups; i++) {
                const struct genl_multicast_group *grp = &family->mcgrps[i];

                if (WARN_ON(grp->name[0] == '\0'))
                        return -EINVAL;
                if (WARN_ON(!string_is_terminated(grp->name, GENL_NAMSIZ)))
                        return -EINVAL;
        }

        /* special-case our own group and hacks */
        if (family == &genl_ctrl) {
                first_id = GENL_ID_CTRL;
                BUG_ON(n_groups != 1);
        } else if (strcmp(family->name, "NET_DM") == 0) {
                first_id = 1;
                BUG_ON(n_groups != 1);
        } else if (family->id == GENL_ID_VFS_DQUOT) {
                first_id = GENL_ID_VFS_DQUOT;
                BUG_ON(n_groups != 1);
        } else if (family->id == GENL_ID_PMCRAID) {
                first_id = GENL_ID_PMCRAID;
                BUG_ON(n_groups != 1);
        } else {
                /* regular family: reserve a free range in the bitmap */
                groups_allocated = true;
                err = genl_allocate_reserve_groups(n_groups, &first_id);
                if (err)
                        return err;
        }

        family->mcgrp_offset = first_id;

        /* if still initializing, can't and don't need to realloc bitmaps */
        if (!init_net.genl_sock)
                return 0;

        if (family->netnsok) {
                struct net *net;

                /* netns-aware family: resize the socket of every namespace */
                netlink_table_grab();
                rcu_read_lock();
                for_each_net_rcu(net) {
                        err = __netlink_change_ngroups(net->genl_sock,
                                        mc_groups_longs * BITS_PER_LONG);
                        if (err) {
                                /*
                                 * No need to roll back, can only fail if
                                 * memory allocation fails and then the
                                 * number of _possible_ groups has been
                                 * increased on some sockets which is ok.
                                 */
                                break;
                        }
                }
                rcu_read_unlock();
                netlink_table_ungrab();
        } else {
                /* otherwise only init_net's socket is resized */
                err = netlink_change_ngroups(init_net.genl_sock,
                                             mc_groups_longs * BITS_PER_LONG);
        }

        /* Give back the bits reserved above; fixed IDs stay marked as used. */
        if (groups_allocated && err) {
                for (i = 0; i < family->n_mcgrps; i++)
                        clear_bit(family->mcgrp_offset + i, mc_groups);
        }

        return err;
}

/*
 * Tear down all multicast groups of @family: clear their users on the
 * genetlink socket of every network namespace, then release the group IDs
 * in the bitmap and announce each removal with CTRL_CMD_DELMCAST_GRP.
 */
static void genl_unregister_mc_groups(const struct genl_family *family)
{
        struct net *net;
        int idx;

        netlink_table_grab();
        rcu_read_lock();
        for_each_net_rcu(net) {
                for (idx = 0; idx < family->n_mcgrps; idx++) {
                        __netlink_clear_multicast_users(net->genl_sock,
                                        family->mcgrp_offset + idx);
                }
        }
        rcu_read_unlock();
        netlink_table_ungrab();

        for (idx = 0; idx < family->n_mcgrps; idx++) {
                int group = family->mcgrp_offset + idx;

                /* group 1 is never returned to the allocator */
                if (group != 1)
                        clear_bit(group, mc_groups);

                genl_ctrl_event(CTRL_CMD_DELMCAST_GRP, family,
                                &family->mcgrps[idx], group);
        }
}

/*
 * A split op must carry exactly one of GENL_CMD_CAP_DO and
 * GENL_CMD_CAP_DUMP. Warns and returns true when @op violates that,
 * returns false when the op is fine.
 */
static bool genl_split_op_check(const struct genl_split_ops *op)
{
        unsigned int caps = op->flags & (GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP);

        return WARN_ON(hweight8(caps) != 1);
}

/*
 * Sanity-check the operations of @family before it is registered.
 *
 * Checks that every non-zero op count comes with an array, that each
 * command carries at least one of the GENL_CMD_CAP_DO / GENL_CMD_CAP_DUMP
 * flags, that commands at or above resv_start_op set no validate flags, and
 * that no command value appears twice.
 *
 * For split ops it additionally checks that each entry has exactly one of
 * the DO/DUMP flags, that entries are sorted by cmd, and that two entries
 * with the same cmd are a DO followed by a DUMP whose internal_flags and
 * remaining flags are identical.
 *
 * Returns 0 if everything is consistent, -EINVAL otherwise.
 */
static int genl_validate_ops(const struct genl_family *family)
{
        struct genl_op_iter i, j;
        unsigned int s;

        if (WARN_ON(family->n_ops && !family->ops) ||
            WARN_ON(family->n_small_ops && !family->small_ops) ||
            WARN_ON(family->n_split_ops && !family->split_ops))
                return -EINVAL;

        for (genl_op_iter_init(family, &i); genl_op_iter_next(&i); ) {
                if (!(i.flags & (GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP)))
                        return -EINVAL;

                if (WARN_ON(i.cmd >= family->resv_start_op &&
                            (i.doit.validate || i.dumpit.validate)))
                        return -EINVAL;

                /* Compare against every later command to catch duplicates. */
                genl_op_iter_copy(&j, &i);
                while (genl_op_iter_next(&j)) {
                        if (i.cmd == j.cmd)
                                return -EINVAL;
                }
        }

        /* The loop below only checks entries 1..n-1, so do entry 0 here. */
        if (family->n_split_ops) {
                if (genl_split_op_check(&family->split_ops[0]))
                        return -EINVAL;
        }

        /* Pairwise check of neighbouring split ops: a = previous, b = current. */
        for (s = 1; s < family->n_split_ops; s++) {
                const struct genl_split_ops *a, *b;

                a = &family->split_ops[s - 1];
                b = &family->split_ops[s];

                if (genl_split_op_check(b))
                        return -EINVAL;

                /* Check sort order */
                if (a->cmd < b->cmd) {
                        continue;
                } else if (a->cmd > b->cmd) {
                        WARN_ON(1);
                        return -EINVAL;
                }

                /* Same cmd: apart from DO/DUMP the two entries must agree. */
                if (a->internal_flags != b->internal_flags ||
                    ((a->flags ^ b->flags) & ~(GENL_CMD_CAP_DO |
                                               GENL_CMD_CAP_DUMP))) {
                        WARN_ON(1);
                        return -EINVAL;
                }

                /* Same cmd: the only valid pairing is DO followed by DUMP. */
                if ((a->flags & GENL_CMD_CAP_DO) &&
                    (b->flags & GENL_CMD_CAP_DUMP))
                        continue;

                WARN_ON(1);
                return -EINVAL;
        }

        return 0;
}

static void *genl_sk_priv_alloc(struct genl_family *family)
{
        void *priv;

        priv = kzalloc(family->sock_priv_size, GFP_KERNEL);
        if (!priv)
                return ERR_PTR(-ENOMEM);

        if (family->sock_priv_init)
                family->sock_priv_init(priv);

        return priv;
}

/*
 * Release one per-socket private area: run the family's sock_priv_destroy
 * hook on it first, if the family has one, then free the memory.
 */
static void genl_sk_priv_free(const struct genl_family *family, void *priv)
{
        if (family->sock_priv_destroy)
                family->sock_priv_destroy(priv);
        kfree(priv);
}

/*
 * Set up the xarray that tracks per-socket private data for @family.
 * Families with sock_priv_size == 0 do not use it and nothing is allocated.
 *
 * Returns 0 on success, -ENOMEM if the xarray could not be allocated.
 */
static int genl_sk_privs_alloc(struct genl_family *family)
{
        if (family->sock_priv_size) {
                family->sock_privs = kzalloc(sizeof(*family->sock_privs),
                                             GFP_KERNEL);
                if (!family->sock_privs)
                        return -ENOMEM;

                xa_init(family->sock_privs);
        }

        return 0;
}

/*
 * Release every per-socket private area still stored for @family, then the
 * xarray itself. Does nothing for families with sock_priv_size == 0.
 */
static void genl_sk_privs_free(const struct genl_family *family)
{
        unsigned long index;
        void *entry;

        if (family->sock_priv_size) {
                xa_for_each(family->sock_privs, index, entry)
                        genl_sk_priv_free(family, entry);

                xa_destroy(family->sock_privs);
                kfree(family->sock_privs);
        }
}

/*
 * Remove and free the private area that @family keeps for socket @sk, if
 * any. Entries are keyed by the socket pointer value.
 */
static void genl_sk_priv_free_by_sock(struct genl_family *family,
                                      struct sock *sk)
{
        void *priv;

        if (!family->sock_priv_size)
                return;

        priv = xa_erase(family->sock_privs, (unsigned long) sk);
        if (priv)
                genl_sk_priv_free(family, priv);
}

static void genl_release(struct sock *sk, unsigned long *groups)
{
        struct genl_family *family;
        unsigned int id;

        down_read(&cb_lock);

        idr_for_each_entry(&genl_fam_idr, family, id)
                genl_sk_priv_free_by_sock(family, sk);

        up_read(&cb_lock);
}

/**
 * __genl_sk_priv_get - Get family private pointer for socket, if exists
 *
 * @family: family
 * @sk: socket
 *
 * Lookup a private memory for a Generic netlink family and specified socket.
 * Unlike genl_sk_priv_get(), nothing is allocated if no entry exists yet.
 *
 * Caller should make sure this is called in RCU read locked section.
 *
 * Return: valid pointer on success, otherwise negative error value
 * encoded by ERR_PTR(), NULL in case priv does not exist.
 */
void *__genl_sk_priv_get(struct genl_family *family, struct sock *sk)
{
        /* No sock_privs xarray: warn once and report -EINVAL. */
        if (WARN_ON_ONCE(!family->sock_privs))
                return ERR_PTR(-EINVAL);
        /* Entries are keyed by the socket pointer value. */
        return xa_load(family->sock_privs, (unsigned long) sk);
}

/**
 * genl_sk_priv_get - Get family private pointer for socket
 *
 * @family: family
 * @sk: socket
 *
 * Return the private memory a Generic netlink family keeps for the given
 * socket, creating it on first use. If another context inserts an entry
 * for the same socket concurrently, that entry is returned and the one
 * allocated here is freed.
 *
 * Return: valid pointer on success, otherwise negative error value
 * encoded by ERR_PTR().
 */
void *genl_sk_priv_get(struct genl_family *family, struct sock *sk)
{
        void *fresh, *existing;

        /* An existing entry, or an ERR_PTR from the lookup, is handed back as is. */
        existing = __genl_sk_priv_get(family, sk);
        if (existing)
                return existing;

        /* Nothing stored for this socket yet, so create it. */
        fresh = genl_sk_priv_alloc(family);
        if (IS_ERR(fresh))
                return ERR_CAST(fresh);

        existing = xa_cmpxchg(family->sock_privs, (unsigned long) sk, NULL,
                              fresh, GFP_KERNEL);
        if (!existing)
                return fresh;

        /* Our allocation was not stored: store failed or we lost a race. */
        genl_sk_priv_free(family, fresh);
        if (xa_is_err(existing))
                return ERR_PTR(xa_err(existing));

        /* Race happened, priv for the socket was already inserted. */
        return existing;
}

/**
 * genl_register_family - register a generic netlink family
 * @family: generic netlink family
 *
 * Registers the specified family after validating it first. Only one
 * family may be registered with the same family name or identifier.
 *
 * The family's ops, multicast groups and module pointer must already
 * be assigned.
 *
 * On success, CTRL_CMD_NEWFAMILY and one CTRL_CMD_NEWMCAST_GRP per multicast
 * group are sent after the locks have been dropped.
 *
 * Return: 0 on success or a negative error code (-EEXIST if a family with
 * the same name is already registered).
 */
int genl_register_family(struct genl_family *family)
{
        int err, i;
        int start = GENL_START_ALLOC, end = GENL_MAX_ID;

        /* Op validation needs no locks. */
        err = genl_validate_ops(family);
        if (err)
                return err;

        genl_lock_all();

        if (genl_family_find_byname(family->name)) {
                err = -EEXIST;
                goto errout_locked;
        }

        err = genl_sk_privs_alloc(family);
        if (err)
                goto errout_locked;

        /*
         * Sadly, a few cases need to be special-cased
         * due to them having previously abused the API
         * and having used their family ID also as their
         * multicast group ID, so we use reserved IDs
         * for both to be sure we can do that mapping.
         */
        if (family == &genl_ctrl) {
                /* and this needs to be special for initial family lookups */
                start = end = GENL_ID_CTRL;
        } else if (strcmp(family->name, "pmcraid") == 0) {
                start = end = GENL_ID_PMCRAID;
        } else if (strcmp(family->name, "VFS_DQUOT") == 0) {
                start = end = GENL_ID_VFS_DQUOT;
        }

        /* Upper bound passed to idr_alloc_cyclic() is exclusive, hence end + 1. */
        family->id = idr_alloc_cyclic(&genl_fam_idr, family,
                                      start, end + 1, GFP_KERNEL);
        if (family->id < 0) {
                err = family->id;
                goto errout_sk_privs_free;
        }

        err = genl_validate_assign_mc_groups(family);
        if (err)
                goto errout_remove;

        genl_unlock_all();

        /* send all events */
        genl_ctrl_event(CTRL_CMD_NEWFAMILY, family, NULL, 0);
        for (i = 0; i < family->n_mcgrps; i++)
                genl_ctrl_event(CTRL_CMD_NEWMCAST_GRP, family,
                                &family->mcgrps[i], family->mcgrp_offset + i);

        return 0;

        /* Error unwinding, in reverse order of setup. */
errout_remove:
        idr_remove(&genl_fam_idr, family->id);
errout_sk_privs_free:
        genl_sk_privs_free(family);
errout_locked:
        genl_unlock_all();
        return err;
}
EXPORT_SYMBOL(genl_register_family);

/**
 * genl_unregister_family - unregister generic netlink family
 * @family: generic netlink family
 *
 * Unregisters the specified family: its multicast groups are torn down, it
 * is removed from the family IDR, its per-socket private data is freed and
 * CTRL_CMD_DELFAMILY is sent.
 *
 * Return: 0 on success, -ENOENT if no family is registered under
 * @family->id.
 */
int genl_unregister_family(const struct genl_family *family)
{
        genl_lock_all();

        if (!genl_family_find_byid(family->id)) {
                genl_unlock_all();
                return -ENOENT;
        }

        genl_unregister_mc_groups(family);

        idr_remove(&genl_fam_idr, family->id);

        /*
         * Drop cb_lock but keep genl_mutex while waiting for
         * genl_sk_destructing_cnt to reach zero.
         * NOTE(review): presumably this lets in-flight socket destruction
         * finish before the per-socket privs are freed below - confirm.
         */
        up_write(&cb_lock);
        wait_event(genl_sk_destructing_waitq,
                   atomic_read(&genl_sk_destructing_cnt) == 0);

        genl_sk_privs_free(family);

        genl_unlock();

        /* The event is sent with no locks held. */
        genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0);

        return 0;
}
EXPORT_SYMBOL(genl_unregister_family);

/**
 * genlmsg_put - Add generic netlink header to netlink message
 * @skb: socket buffer holding the message
 * @portid: netlink portid the message is addressed to
 * @seq: sequence number (usually the one of the sender)
 * @family: generic netlink family
 * @flags: netlink message flags
 * @cmd: generic netlink command
 *
 * Adds a netlink header with room for the generic netlink header plus the
 * family's own header (@family->hdrsize), then fills in the generic
 * netlink header.
 *
 * Return: pointer to the user specific header, or NULL if nlmsg_put()
 * failed.
 */
void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
                  const struct genl_family *family, int flags, u8 cmd)
{
        int payload = GENL_HDRLEN + family->hdrsize;
        struct genlmsghdr *ghdr;
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(skb, portid, seq, family->id, payload, flags);
        if (!nlh)
                return NULL;

        ghdr = nlmsg_data(nlh);
        ghdr->cmd = cmd;
        ghdr->version = family->version;
        ghdr->reserved = 0;

        /* the family's own header starts right after the genl header */
        return (char *) ghdr + GENL_HDRLEN;
}
EXPORT_SYMBOL(genlmsg_put);

/*
 * Allocate a genl_dumpit_info with kmalloc(), so the contents are not
 * zeroed and the caller has to fill in every field it relies on.
 * The result of kmalloc() is returned as is.
 */
static struct genl_dumpit_info *genl_dumpit_info_alloc(void)
{
        return kmalloc(sizeof(struct genl_dumpit_info), GFP_KERNEL);
}

/* Free a genl_dumpit_info obtained from genl_dumpit_info_alloc(). */
static void genl_dumpit_info_free(const struct genl_dumpit_info *info)
{
        kfree(info);
}

/*
 * Parse the attributes of @nlh against the policy of @ops.
 *
 * Validation is strict unless @ops->validate has the @no_strict_flag bit
 * set, in which case liberal validation is used. @family is not consulted.
 *
 * Returns NULL when the op accepts no attributes (maxattr == 0), an
 * ERR_PTR() on allocation or parse failure, and otherwise a kmalloc'ed
 * array of maxattr + 1 attribute pointers that the caller must release
 * with genl_family_rcv_msg_attrs_free().
 */
static struct nlattr **
genl_family_rcv_msg_attrs_parse(const struct genl_family *family,
                                struct nlmsghdr *nlh,
                                struct netlink_ext_ack *extack,
                                const struct genl_split_ops *ops,
                                int hdrlen,
                                enum genl_validate_flags no_strict_flag)
{
        enum netlink_validation validate;
        struct nlattr **attrbuf;
        int err;

        if (ops->validate & no_strict_flag)
                validate = NL_VALIDATE_LIBERAL;
        else
                validate = NL_VALIDATE_STRICT;

        if (!ops->maxattr)
                return NULL;

        attrbuf = kmalloc_array(ops->maxattr + 1,
                                sizeof(struct nlattr *), GFP_KERNEL);
        if (!attrbuf)
                return ERR_PTR(-ENOMEM);

        err = __nlmsg_parse(nlh, hdrlen, attrbuf, ops->maxattr, ops->policy,
                            validate, extack);
        if (!err)
                return attrbuf;

        kfree(attrbuf);
        return ERR_PTR(err);
}

/*
 * Free an attribute array returned by genl_family_rcv_msg_attrs_parse().
 * Callers in this file also pass NULL, which is the value the parser
 * returns for ops that take no attributes.
 */
static void genl_family_rcv_msg_attrs_free(struct nlattr **attrbuf)
{
        kfree(attrbuf);
}

/*
 * Arguments that genl_family_rcv_msg_dumpit() hands to genl_start() through
 * the netlink dump control's data pointer.
 */
struct genl_start_context {
        /* family the dump request is for */
        const struct genl_family *family;
        /* netlink header of the request */
        struct nlmsghdr *nlh;
        /* extended ack used when parsing the request's attributes */
        struct netlink_ext_ack *extack;
        /* dump op handling the request */
        const struct genl_split_ops *ops;
        /* header length used for the length check and attribute parsing */
        int hdrlen;
};

/*
 * Dump ->start callback installed by genl_family_rcv_msg_dumpit().
 *
 * On entry cb->data points at the genl_start_context. The message length is
 * checked (unless the op sets GENL_DONT_VALIDATE_DUMP), the attributes are
 * parsed, and a genl_dumpit_info is allocated and filled in. cb->data is
 * then switched over to that info before the op's own ->start, if any, runs
 * under the family's op lock.
 *
 * Returns 0 on success or a negative error code. If the op's ->start fails,
 * the attributes and the dumpit info are freed and cb->data is set to NULL.
 */
static int genl_start(struct netlink_callback *cb)
{
        struct genl_start_context *ctx = cb->data;
        const struct genl_split_ops *ops;
        struct genl_dumpit_info *info;
        struct nlattr **attrs = NULL;
        int rc = 0;

        ops = ctx->ops;
        /* Message must be large enough to hold the expected headers. */
        if (!(ops->validate & GENL_DONT_VALIDATE_DUMP) &&
            ctx->nlh->nlmsg_len < nlmsg_msg_size(ctx->hdrlen))
                return -EINVAL;

        attrs = genl_family_rcv_msg_attrs_parse(ctx->family, ctx->nlh, ctx->extack,
                                                ops, ctx->hdrlen,
                                                GENL_DONT_VALIDATE_DUMP_STRICT);
        if (IS_ERR(attrs))
                return PTR_ERR(attrs);

        info = genl_dumpit_info_alloc();
        if (!info) {
                genl_family_rcv_msg_attrs_free(attrs);
                return -ENOMEM;
        }
        /* The op is copied by value; ctx is not referenced after this function. */
        info->op = *ops;
        info->info.family        = ctx->family;
        info->info.snd_seq        = cb->nlh->nlmsg_seq;
        info->info.snd_portid        = NETLINK_CB(cb->skb).portid;
        info->info.nlhdr        = cb->nlh;
        info->info.genlhdr        = nlmsg_data(cb->nlh);
        info->info.attrs        = attrs;
        genl_info_net_set(&info->info, sock_net(cb->skb->sk));
        info->info.extack        = cb->extack;
        memset(&info->info.user_ptr, 0, sizeof(info->info.user_ptr));

        /* From here on cb->data is the dumpit info, also for ops->start(). */
        cb->data = info;
        if (ops->start) {
                genl_op_lock(ctx->family);
                rc = ops->start(cb);
                genl_op_unlock(ctx->family);
        }

        if (rc) {
                genl_family_rcv_msg_attrs_free(info->info.attrs);
                genl_dumpit_info_free(info);
                cb->data = NULL;
        }
        return rc;
}

/*
 * Dump ->dump callback: refresh the extack pointer in the stored genl_info
 * and run the op's dumpit handler under the family's op lock.
 * Returns whatever the handler returns.
 */
static int genl_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct genl_dumpit_info *dump_info = cb->data;
        struct genl_info *info = &dump_info->info;
        int ret;

        info->extack = cb->extack;

        genl_op_lock(info->family);
        ret = dump_info->op.dumpit(skb, cb);
        genl_op_unlock(info->family);

        return ret;
}

/*
 * Dump ->done callback: run the op's done handler (if any) under the
 * family's op lock, then free the parsed attributes and the dumpit info
 * that genl_start() set up.
 *
 * Returns the handler's result, or 0 when the op has no done handler.
 */
static int genl_done(struct netlink_callback *cb)
{
        struct genl_dumpit_info *dump_info = cb->data;
        struct genl_info *info = &dump_info->info;
        int ret = 0;

        info->extack = cb->extack;

        if (dump_info->op.done) {
                genl_op_lock(info->family);
                ret = dump_info->op.done(cb);
                genl_op_unlock(info->family);
        }

        genl_family_rcv_msg_attrs_free(info->attrs);
        genl_dumpit_info_free(dump_info);

        return ret;
}

static int genl_family_rcv_msg_dumpit(const struct genl_family *family,
                                      struct sk_buff *skb,
                                      struct nlmsghdr *nlh,
                                      struct netlink_ext_ack *extack,
                                      const struct genl_split_ops *ops,
                                      int hdrlen, struct net *net)
{
        struct genl_start_context ctx;
        struct netlink_dump_control c = {
                .module = family->module,
                .data = &ctx,
                .start = genl_start,
                .dump = genl_dumpit,
                .done = genl_done,
                .extack = extack,
        };
        int err;

        ctx.family = family;
        ctx.nlh = nlh;
        ctx.extack = extack;
        ctx.ops = ops;
        ctx.hdrlen = hdrlen;

        genl_op_unlock(family);
        err = __netlink_dump_start(net->genl_sock, skb, nlh, &c);
        genl_op_lock(family);

        return err;
}

/*
 * Handle a non-dump generic netlink request.
 *
 * Parses the message attributes, fills in an on-stack genl_info and runs
 * the op's handlers: optional ->pre_doit(), then ->doit(), then optional
 * ->post_doit().  A failing ->pre_doit() skips both ->doit() and
 * ->post_doit().  The parsed attribute table is always released.
 *
 * Returns the attribute parsing error, the ->pre_doit() error, or the
 * ->doit() result.
 */
static int genl_family_rcv_msg_doit(const struct genl_family *family,
                                    struct sk_buff *skb,
                                    struct nlmsghdr *nlh,
                                    struct netlink_ext_ack *extack,
                                    const struct genl_split_ops *ops,
                                    int hdrlen, struct net *net)
{
        struct genl_info info;
        struct nlattr **tb;
        int ret = 0;

        tb = genl_family_rcv_msg_attrs_parse(family, nlh, extack, ops, hdrlen,
                                             GENL_DONT_VALIDATE_STRICT);
        if (IS_ERR(tb))
                return PTR_ERR(tb);

        info.family = family;
        info.nlhdr = nlh;
        info.genlhdr = nlmsg_data(nlh);
        info.snd_seq = nlh->nlmsg_seq;
        info.snd_portid = NETLINK_CB(skb).portid;
        info.attrs = tb;
        info.extack = extack;
        genl_info_net_set(&info, net);
        memset(&info.user_ptr, 0, sizeof(info.user_ptr));

        if (ops->pre_doit)
                ret = ops->pre_doit(ops, skb, &info);

        if (!ret) {
                ret = ops->doit(skb, &info);

                /* post_doit runs regardless of what doit returned */
                if (ops->post_doit)
                        ops->post_doit(ops, skb, &info);
        }

        genl_family_rcv_msg_attrs_free(tb);

        return ret;
}

/*
 * Strict validation of the netlink and generic netlink headers.
 *
 * Commands below family->resv_start_op are exempt.  For all others the
 * genlmsghdr reserved field must be zero and nlmsg_flags may only carry
 * REQUEST, ACK, ECHO and the complete two-bit DUMP combination.
 *
 * Returns 0 when the headers are acceptable, -EINVAL (with an extack
 * message) otherwise.
 */
static int genl_header_check(const struct genl_family *family,
                             struct nlmsghdr *nlh, struct genlmsghdr *hdr,
                             struct netlink_ext_ack *extack)
{
        u16 leftover;

        /* Only for commands added after we started validating */
        if (hdr->cmd < family->resv_start_op)
                return 0;

        if (hdr->reserved) {
                NL_SET_ERR_MSG(extack, "genlmsghdr.reserved field is not 0");
                return -EINVAL;
        }

        /* Old netlink flags have pretty loose semantics, allow only the flags
         * consumed by the core where we can enforce the meaning.
         */
        leftover = nlh->nlmsg_flags;

        /* DUMP is 2 bits: only strip it when both are set */
        if ((leftover & NLM_F_DUMP) == NLM_F_DUMP)
                leftover &= ~NLM_F_DUMP;

        leftover &= ~(NLM_F_REQUEST | NLM_F_ACK | NLM_F_ECHO);
        if (leftover) {
                NL_SET_ERR_MSG(extack,
                               "ambiguous or reserved bits set in nlmsg_flags");
                return -EINVAL;
        }

        return 0;
}

static int genl_family_rcv_msg(const struct genl_family *family,
                               struct sk_buff *skb,
                               struct nlmsghdr *nlh,
                               struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct genlmsghdr *hdr = nlmsg_data(nlh);
        struct genl_split_ops op;
        int hdrlen;
        u8 flags;

        /* this family doesn't exist in this netns */
        if (!family->netnsok && !net_eq(net, &init_net))
                return -ENOENT;

        hdrlen = GENL_HDRLEN + family->hdrsize;
        if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
                return -EINVAL;

        if (genl_header_check(family, nlh, hdr, extack))
                return -EINVAL;

        flags = (nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP ?
                GENL_CMD_CAP_DUMP : GENL_CMD_CAP_DO;
        if (genl_get_cmd(hdr->cmd, flags, family, &op))
                return -EOPNOTSUPP;

        if ((op.flags & GENL_ADMIN_PERM) &&
            !netlink_capable(skb, CAP_NET_ADMIN))
                return -EPERM;

        if ((op.flags & GENL_UNS_ADMIN_PERM) &&
            !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        if (flags & GENL_CMD_CAP_DUMP)
                return genl_family_rcv_msg_dumpit(family, skb, nlh, extack,
                                                  &op, hdrlen, net);
        else
                return genl_family_rcv_msg_doit(family, skb, nlh, extack,
                                                &op, hdrlen, net);
}

/*
 * Per-message receive handler: resolve the family from nlmsg_type and
 * process the message with that family's op lock held.
 *
 * Returns -ENOENT for an unknown family id, otherwise the result of
 * genl_family_rcv_msg().
 */
static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        const struct genl_family *family = genl_family_find_byid(nlh->nlmsg_type);
        int ret;

        if (!family)
                return -ENOENT;

        genl_op_lock(family);
        ret = genl_family_rcv_msg(family, skb, nlh, extack);
        genl_op_unlock(family);

        return ret;
}

/*
 * Socket input handler: process every message in @skb via genl_rcv_msg()
 * while holding cb_lock for reading.
 */
static void genl_rcv(struct sk_buff *skb)
{
        down_read(&cb_lock);
        netlink_rcv_skb(skb, genl_rcv_msg);
        up_read(&cb_lock);
}

/**************************************************************************
 * Controller
 **************************************************************************/

static struct genl_family genl_ctrl;

/*
 * Append a controller message describing @family to @skb.
 *
 * The message carries the family name, id, version, header size and
 * maxattr, followed by a nested CTRL_ATTR_OPS list (one entry per op, with
 * GENL_CMD_CAP_HASPOL added to the flags when the op has a doit or dumpit
 * policy) and a nested CTRL_ATTR_MCAST_GROUPS list when the family has
 * multicast groups.
 *
 * Returns 0 on success or -EMSGSIZE when the message does not fit; in the
 * latter case the partially built message is cancelled.
 */
static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq,
                          u32 flags, struct sk_buff *skb, u8 cmd)
{
        struct genl_op_iter iter;
        void *msg_hdr;

        msg_hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd);
        if (!msg_hdr)
                return -EMSGSIZE;

        if (nla_put_string(skb, CTRL_ATTR_FAMILY_NAME, family->name))
                goto nla_put_failure;
        if (nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, family->id))
                goto nla_put_failure;
        if (nla_put_u32(skb, CTRL_ATTR_VERSION, family->version))
                goto nla_put_failure;
        if (nla_put_u32(skb, CTRL_ATTR_HDRSIZE, family->hdrsize))
                goto nla_put_failure;
        if (nla_put_u32(skb, CTRL_ATTR_MAXATTR, family->maxattr))
                goto nla_put_failure;

        if (genl_op_iter_init(family, &iter)) {
                struct nlattr *ops_nest;

                ops_nest = nla_nest_start_noflag(skb, CTRL_ATTR_OPS);
                if (!ops_nest)
                        goto nla_put_failure;

                while (genl_op_iter_next(&iter)) {
                        u32 op_flags = iter.flags;
                        struct nlattr *op_nest;

                        if (iter.doit.policy || iter.dumpit.policy)
                                op_flags |= GENL_CMD_CAP_HASPOL;

                        op_nest = nla_nest_start_noflag(skb,
                                                        genl_op_iter_idx(&iter));
                        if (!op_nest)
                                goto nla_put_failure;

                        if (nla_put_u32(skb, CTRL_ATTR_OP_ID, iter.cmd))
                                goto nla_put_failure;
                        if (nla_put_u32(skb, CTRL_ATTR_OP_FLAGS, op_flags))
                                goto nla_put_failure;

                        nla_nest_end(skb, op_nest);
                }

                nla_nest_end(skb, ops_nest);
        }

        if (family->n_mcgrps) {
                struct nlattr *grps_nest;
                int g;

                grps_nest = nla_nest_start_noflag(skb, CTRL_ATTR_MCAST_GROUPS);
                if (!grps_nest)
                        goto nla_put_failure;

                for (g = 0; g < family->n_mcgrps; g++) {
                        const struct genl_multicast_group *grp = &family->mcgrps[g];
                        struct nlattr *grp_nest;

                        /* nested entries are numbered starting from 1 */
                        grp_nest = nla_nest_start_noflag(skb, g + 1);
                        if (!grp_nest)
                                goto nla_put_failure;

                        if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID,
                                        family->mcgrp_offset + g))
                                goto nla_put_failure;
                        if (nla_put_string(skb, CTRL_ATTR_MCAST_GRP_NAME,
                                           grp->name))
                                goto nla_put_failure;

                        nla_nest_end(skb, grp_nest);
                }

                nla_nest_end(skb, grps_nest);
        }

        genlmsg_end(skb, msg_hdr);
        return 0;

nla_put_failure:
        genlmsg_cancel(skb, msg_hdr);
        return -EMSGSIZE;
}

/*
 * Append a controller message describing a single multicast group of
 * @family to @skb.
 *
 * The message carries the family name and id plus a nested
 * CTRL_ATTR_MCAST_GROUPS list holding exactly one entry (index 1) with the
 * group's id (@grp_id) and name.
 *
 * Returns 0 on success or -EMSGSIZE when the message does not fit; a
 * partially built message is cancelled.
 */
static int ctrl_fill_mcgrp_info(const struct genl_family *family,
                                const struct genl_multicast_group *grp,
                                int grp_id, u32 portid, u32 seq, u32 flags,
                                struct sk_buff *skb, u8 cmd)
{
        void *hdr;
        struct nlattr *nla_grps;
        struct nlattr *nest;

        hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd);
        if (hdr == NULL) {
                /*
                 * Return a real errno: the caller wraps this value in
                 * ERR_PTR(), so a bare -1 would be reported as -EPERM.
                 * -EMSGSIZE matches ctrl_fill_info() and the failure
                 * path below.
                 */
                return -EMSGSIZE;
        }

        if (nla_put_string(skb, CTRL_ATTR_FAMILY_NAME, family->name) ||
            nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, family->id))
                goto nla_put_failure;

        nla_grps = nla_nest_start_noflag(skb, CTRL_ATTR_MCAST_GROUPS);
        if (nla_grps == NULL)
                goto nla_put_failure;

        nest = nla_nest_start_noflag(skb, 1);
        if (nest == NULL)
                goto nla_put_failure;

        if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID, grp_id) ||
            nla_put_string(skb, CTRL_ATTR_MCAST_GRP_NAME,
                           grp->name))
                goto nla_put_failure;

        nla_nest_end(skb, nest);
        nla_nest_end(skb, nla_grps);

        genlmsg_end(skb, hdr);
        return 0;

nla_put_failure:
        genlmsg_cancel(skb, hdr);
        return -EMSGSIZE;
}

/*
 * CTRL_CMD_GETFAMILY dump handler: emit one CTRL_CMD_NEWFAMILY message per
 * family visible from the requesting socket's netns.
 *
 * cb->args[0] holds the number of visible families already dumped, so a
 * continued dump resumes after them.  A family that does not fit into
 * @skb is not counted and ends this pass.
 *
 * Returns 0 or the error from ctrl_fill_info().
 */
static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        int already_sent = cb->args[0];
        struct genl_family *fam;
        unsigned int id;
        int visited = 0;
        int ret = 0;

        idr_for_each_entry(&genl_fam_idr, fam, id) {
                if (!fam->netnsok && !net_eq(net, &init_net))
                        continue;

                /* skip what an earlier pass already delivered */
                if (visited < already_sent) {
                        visited++;
                        continue;
                }

                ret = ctrl_fill_info(fam, NETLINK_CB(cb->skb).portid,
                                     cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                     skb, CTRL_CMD_NEWFAMILY);
                if (ret)
                        break;

                visited++;
        }

        cb->args[0] = visited;
        return ret;
}

/*
 * Allocate a new message and fill it with the description of @family.
 *
 * Returns the new skb, ERR_PTR(-ENOBUFS) when allocation fails, or an
 * ERR_PTR() carrying the ctrl_fill_info() error (the skb is freed then).
 */
static struct sk_buff *ctrl_build_family_msg(const struct genl_family *family,
                                             u32 portid, int seq, u8 cmd)
{
        struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        int ret;

        if (!msg)
                return ERR_PTR(-ENOBUFS);

        ret = ctrl_fill_info(family, portid, seq, 0, msg, cmd);
        if (ret >= 0)
                return msg;

        nlmsg_free(msg);
        return ERR_PTR(ret);
}

/*
 * Allocate a new message and fill it with the description of one multicast
 * group of @family.
 *
 * Returns the new skb, ERR_PTR(-ENOBUFS) when allocation fails, or an
 * ERR_PTR() carrying the ctrl_fill_mcgrp_info() error (the skb is freed
 * then).
 */
static struct sk_buff *
ctrl_build_mcgrp_msg(const struct genl_family *family,
                     const struct genl_multicast_group *grp,
                     int grp_id, u32 portid, int seq, u8 cmd)
{
        struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        int ret;

        if (!msg)
                return ERR_PTR(-ENOBUFS);

        ret = ctrl_fill_mcgrp_info(family, grp, grp_id, portid, seq, 0,
                                   msg, cmd);
        if (ret >= 0)
                return msg;

        nlmsg_free(msg);
        return ERR_PTR(ret);
}

/* Attribute policy for CTRL_CMD_GETFAMILY requests (used by genl_ctrl_ops). */
static const struct nla_policy ctrl_policy_family[] = {
        [CTRL_ATTR_FAMILY_ID] = {
                .type = NLA_U16,
        },
        [CTRL_ATTR_FAMILY_NAME] = {
                .type = NLA_NUL_STRING,
                .len = GENL_NAMSIZ - 1,
        },
};

/*
 * CTRL_CMD_GETFAMILY doit handler: look up a single family by id and/or
 * name and reply to the requester with a CTRL_CMD_NEWFAMILY message.
 *
 * Returns -EINVAL when neither CTRL_ATTR_FAMILY_ID nor
 * CTRL_ATTR_FAMILY_NAME is present, -ENOENT when no family matches or the
 * family is not available in the requester's netns, the error from
 * ctrl_build_family_msg(), or the result of genlmsg_reply().
 */
static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info)
{
        struct sk_buff *msg;
        const struct genl_family *res = NULL;
        int err = -EINVAL;

        if (info->attrs[CTRL_ATTR_FAMILY_ID]) {
                u16 id = nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]);
                res = genl_family_find_byid(id);
                err = -ENOENT;
        }

        /* when both attributes are given, the name lookup result wins */
        if (info->attrs[CTRL_ATTR_FAMILY_NAME]) {
                char *name;

                name = nla_data(info->attrs[CTRL_ATTR_FAMILY_NAME]);
                res = genl_family_find_byname(name);
#ifdef CONFIG_MODULES
                if (res == NULL) {
                        /*
                         * Unknown name: try to load a module by alias and
                         * look again.  Both the genl lock and cb_lock are
                         * dropped around request_module() and re-taken in
                         * the reverse order afterwards.
                         */
                        genl_unlock();
                        up_read(&cb_lock);
                        request_module("net-pf-%d-proto-%d-family-%s",
                                       PF_NETLINK, NETLINK_GENERIC, name);
                        down_read(&cb_lock);
                        genl_lock();
                        res = genl_family_find_byname(name);
                }
#endif
                err = -ENOENT;
        }

        if (res == NULL)
                return err;

        if (!res->netnsok && !net_eq(genl_info_net(info), &init_net)) {
                /* family doesn't exist here */
                return -ENOENT;
        }

        msg = ctrl_build_family_msg(res, info->snd_portid, info->snd_seq,
                                    CTRL_CMD_NEWFAMILY);
        if (IS_ERR(msg))
                return PTR_ERR(msg);

        return genlmsg_reply(msg, info);
}

/*
 * Multicast a controller notification about a family or multicast group
 * event.
 *
 * Family events (NEWFAMILY/DELFAMILY) expect @grp to be NULL; group events
 * (NEWMCAST_GRP/DELMCAST_GRP) require it.  Families without netns support
 * are announced in init_net only, all others in every netns.
 *
 * Returns 0 when nothing needs to be sent yet or after the message was
 * handed to the multicast helpers (their result is not propagated),
 * -EINVAL for an unknown @event, or the message-building error.
 */
static int genl_ctrl_event(int event, const struct genl_family *family,
                           const struct genl_multicast_group *grp,
                           int grp_id)
{
        struct sk_buff *msg;

        /* genl is still initialising */
        if (!init_net.genl_sock)
                return 0;

        if (event == CTRL_CMD_NEWFAMILY || event == CTRL_CMD_DELFAMILY) {
                WARN_ON(grp);
                msg = ctrl_build_family_msg(family, 0, 0, event);
        } else if (event == CTRL_CMD_NEWMCAST_GRP ||
                   event == CTRL_CMD_DELMCAST_GRP) {
                BUG_ON(!grp);
                msg = ctrl_build_mcgrp_msg(family, grp, grp_id, 0, 0, event);
        } else {
                return -EINVAL;
        }

        if (IS_ERR(msg))
                return PTR_ERR(msg);

        if (family->netnsok) {
                /* the all-netns walk runs under RCU, hence GFP_ATOMIC */
                rcu_read_lock();
                genlmsg_multicast_allns(&genl_ctrl, msg, 0,
                                        0, GFP_ATOMIC);
                rcu_read_unlock();
        } else {
                genlmsg_multicast_netns(&genl_ctrl, &init_net, msg, 0,
                                        0, GFP_KERNEL);
        }

        return 0;
}

/*
 * Per-dump state of CTRL_CMD_GETPOLICY, stored in netlink_callback.ctx
 * (ctrl_dumppolicy_start() build-checks that it fits).
 */
struct ctrl_dump_policy_ctx {
        /* policy dump state built up in ctrl_dumppolicy_start() */
        struct netlink_policy_dump_state *state;
        /* family whose policies are being dumped */
        const struct genl_family *rt;
        /* kmalloc'ed op iterator; only allocated when dumping all ops */
        struct genl_op_iter *op_iter;
        /* requested op (CTRL_ATTR_OP), valid when single_op is set */
        u32 op;
        /* id of the family, echoed in every reply message */
        u16 fam_id;
        /* dump_map: per-op policy index messages are still to be emitted */
        /* single_op: the request named one specific op via CTRL_ATTR_OP */
        u8 dump_map:1,
           single_op:1;
};

/* Attribute policy for CTRL_CMD_GETPOLICY requests (used by genl_ctrl_ops). */
static const struct nla_policy ctrl_policy_policy[] = {
        [CTRL_ATTR_FAMILY_ID] = {
                .type = NLA_U16,
        },
        [CTRL_ATTR_FAMILY_NAME] = {
                .type = NLA_NUL_STRING,
                .len = GENL_NAMSIZ - 1,
        },
        [CTRL_ATTR_OP] = {
                .type = NLA_U32,
        },
};

/*
 * ->start() of the CTRL_CMD_GETPOLICY dump: resolve the family, record
 * what to dump in the per-dump context and register every relevant policy
 * with the policy dump state.
 *
 * With CTRL_ATTR_OP only that op's doit/dump policies are added; otherwise
 * the policies of all ops of the family are added and an op iterator is
 * allocated for the later per-op index messages.
 *
 * Returns 0 on success, -EINVAL when neither family id nor name is given,
 * -ENOENT for an unknown family, -ENOMEM when the iterator cannot be
 * allocated, -ENODATA when no policy was found, or the error from
 * genl_get_cmd_both() / netlink_policy_dump_add_policy().
 */
static int ctrl_dumppolicy_start(struct netlink_callback *cb)
{
        const struct genl_dumpit_info *info = genl_dumpit_info(cb);
        struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx;
        struct nlattr **tb = info->info.attrs;
        const struct genl_family *rt;
        struct genl_op_iter i;
        int err;

        BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));

        if (!tb[CTRL_ATTR_FAMILY_ID] && !tb[CTRL_ATTR_FAMILY_NAME])
                return -EINVAL;

        /* an explicit family id takes precedence over the name */
        if (tb[CTRL_ATTR_FAMILY_ID]) {
                ctx->fam_id = nla_get_u16(tb[CTRL_ATTR_FAMILY_ID]);
        } else {
                rt = genl_family_find_byname(
                        nla_data(tb[CTRL_ATTR_FAMILY_NAME]));
                if (!rt)
                        return -ENOENT;
                ctx->fam_id = rt->id;
        }

        rt = genl_family_find_byid(ctx->fam_id);
        if (!rt)
                return -ENOENT;

        ctx->rt = rt;

        if (tb[CTRL_ATTR_OP]) {
                struct genl_split_ops doit, dump;

                ctx->single_op = true;
                ctx->op = nla_get_u32(tb[CTRL_ATTR_OP]);

                err = genl_get_cmd_both(ctx->op, rt, &doit, &dump);
                if (err) {
                        NL_SET_BAD_ATTR(cb->extack, tb[CTRL_ATTR_OP]);
                        return err;
                }

                if (doit.policy) {
                        err = netlink_policy_dump_add_policy(&ctx->state,
                                                             doit.policy,
                                                             doit.maxattr);
                        if (err)
                                goto err_free_state;
                }
                if (dump.policy) {
                        err = netlink_policy_dump_add_policy(&ctx->state,
                                                             dump.policy,
                                                             dump.maxattr);
                        if (err)
                                goto err_free_state;
                }

                if (!ctx->state)
                        return -ENODATA;

                ctx->dump_map = 1;
                return 0;
        }

        ctx->op_iter = kmalloc(sizeof(*ctx->op_iter), GFP_KERNEL);
        if (!ctx->op_iter)
                return -ENOMEM;

        /*
         * Position the stored iterator on the first op; dump_map records
         * whether there is any op to emit an index message for.
         */
        genl_op_iter_init(rt, ctx->op_iter);
        ctx->dump_map = genl_op_iter_next(ctx->op_iter);

        /* a separate on-stack iterator collects all policies up front */
        for (genl_op_iter_init(rt, &i); genl_op_iter_next(&i); ) {
                if (i.doit.policy) {
                        err = netlink_policy_dump_add_policy(&ctx->state,
                                                             i.doit.policy,
                                                             i.doit.maxattr);
                        if (err)
                                goto err_free_state;
                }
                if (i.dumpit.policy) {
                        err = netlink_policy_dump_add_policy(&ctx->state,
                                                             i.dumpit.policy,
                                                             i.dumpit.maxattr);
                        if (err)
                                goto err_free_state;
                }
        }

        if (!ctx->state) {
                err = -ENODATA;
                goto err_free_op_iter;
        }
        return 0;

        /*
         * Error unwinding; err_free_state deliberately falls through so the
         * iterator is released as well.  The single-op path also lands here
         * without having allocated op_iter.
         * NOTE(review): that relies on cb->ctx starting out zeroed so that
         * kfree() sees NULL - confirm against the netlink dump core.
         */
err_free_state:
        netlink_policy_dump_free(ctx->state);
err_free_op_iter:
        kfree(ctx->op_iter);
        return err;
}

/*
 * Start a CTRL_CMD_GETPOLICY reply message in @skb and add the family id
 * attribute every such message begins with.
 *
 * Returns the message header cookie to pass to genlmsg_end() or
 * genlmsg_cancel(), or NULL when the header or the attribute does not fit.
 * On NULL nothing is left behind in @skb.
 */
static void *ctrl_dumppolicy_prep(struct sk_buff *skb,
                                  struct netlink_callback *cb)
{
        struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx;
        void *hdr;

        hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
                          cb->nlh->nlmsg_seq, &genl_ctrl,
                          NLM_F_MULTI, CTRL_CMD_GETPOLICY);
        if (!hdr)
                return NULL;

        if (nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, ctx->fam_id)) {
                /*
                 * Callers only see NULL and therefore cannot cancel the
                 * message themselves; drop the already written header here
                 * so no truncated message stays in the dump skb.
                 */
                genlmsg_cancel(skb, hdr);
                return NULL;
        }

        return hdr;
}

/*
 * Emit one message mapping an op to the indices of its doit and/or dump
 * policies within the policy dump.
 *
 * The message nests CTRL_ATTR_OP_POLICY -> <op cmd> -> CTRL_ATTR_POLICY_DO
 * and/or CTRL_ATTR_POLICY_DUMP.  Ops without any policy produce no message.
 *
 * Returns 0 on success (or when there was nothing to emit) and -ENOBUFS
 * when the message does not fit into @skb.
 */
static int ctrl_dumppolicy_put_op(struct sk_buff *skb,
                                  struct netlink_callback *cb,
                                  struct genl_split_ops *doit,
                                  struct genl_split_ops *dumpit)
{
        struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx;
        struct nlattr *pol_nest, *op_nest;
        void *msg_hdr;
        int pol_idx;

        /* skip if we have nothing to show */
        if (!(doit->policy || dumpit->policy))
                return 0;

        msg_hdr = ctrl_dumppolicy_prep(skb, cb);
        if (!msg_hdr)
                return -ENOBUFS;

        pol_nest = nla_nest_start(skb, CTRL_ATTR_OP_POLICY);
        if (!pol_nest)
                goto cancel;

        op_nest = nla_nest_start(skb, doit->cmd);
        if (!op_nest)
                goto cancel;

        if (doit->policy) {
                pol_idx = netlink_policy_dump_get_policy_idx(ctx->state,
                                                             doit->policy,
                                                             doit->maxattr);
                if (nla_put_u32(skb, CTRL_ATTR_POLICY_DO, pol_idx))
                        goto cancel;
        }

        if (dumpit->policy) {
                pol_idx = netlink_policy_dump_get_policy_idx(ctx->state,
                                                             dumpit->policy,
                                                             dumpit->maxattr);
                if (nla_put_u32(skb, CTRL_ATTR_POLICY_DUMP, pol_idx))
                        goto cancel;
        }

        nla_nest_end(skb, op_nest);
        nla_nest_end(skb, pol_nest);
        genlmsg_end(skb, msg_hdr);

        return 0;

cancel:
        genlmsg_cancel(skb, msg_hdr);
        return -ENOBUFS;
}

/*
 * ->dumpit() of CTRL_CMD_GETPOLICY.
 *
 * Phase one (while ctx->dump_map is set) emits the op -> policy index
 * messages, either for the single requested op or for every op via the
 * stored iterator.  Phase two writes the policies themselves, one
 * CTRL_ATTR_POLICY nest per message.
 *
 * Whenever @skb runs full the function returns skb->len with the progress
 * kept in the context so a later call can continue from there.
 * Returns -ENOENT if the single op can no longer be looked up.
 */
static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx;
        void *hdr;

        if (ctx->dump_map) {
                if (ctx->single_op) {
                        struct genl_split_ops doit, dumpit;

                        /* the same lookup already succeeded in ->start() */
                        if (WARN_ON(genl_get_cmd_both(ctx->op, ctx->rt,
                                                      &doit, &dumpit)))
                                return -ENOENT;

                        if (ctrl_dumppolicy_put_op(skb, cb, &doit, &dumpit))
                                return skb->len;

                        /* done with the per-op policy index list */
                        ctx->dump_map = 0;
                }

                /* all-ops case: dump_map stays set while ops remain */
                while (ctx->dump_map) {
                        if (ctrl_dumppolicy_put_op(skb, cb,
                                                   &ctx->op_iter->doit,
                                                   &ctx->op_iter->dumpit))
                                return skb->len;

                        ctx->dump_map = genl_op_iter_next(ctx->op_iter);
                }
        }

        while (netlink_policy_dump_loop(ctx->state)) {
                struct nlattr *nest;

                hdr = ctrl_dumppolicy_prep(skb, cb);
                if (!hdr)
                        goto nla_put_failure;

                nest = nla_nest_start(skb, CTRL_ATTR_POLICY);
                if (!nest)
                        goto nla_put_failure;

                if (netlink_policy_dump_write(skb, ctx->state))
                        goto nla_put_failure;

                nla_nest_end(skb, nest);

                genlmsg_end(skb, hdr);
        }

        return skb->len;

nla_put_failure:
        /* hdr may be NULL here when ctrl_dumppolicy_prep() itself failed */
        genlmsg_cancel(skb, hdr);
        return skb->len;
}

/*
 * ->done() of CTRL_CMD_GETPOLICY: release the policy dump state and the op
 * iterator held in the per-dump context.  Always returns 0.
 */
static int ctrl_dumppolicy_done(struct netlink_callback *cb)
{
        struct ctrl_dump_policy_ctx *dump_ctx = (void *)cb->ctx;

        netlink_policy_dump_free(dump_ctx->state);
        kfree(dump_ctx->op_iter);

        return 0;
}

/*
 * Operations of the controller family: GETFAMILY as do and as dump, and
 * GETPOLICY as dump only.
 */
static const struct genl_split_ops genl_ctrl_ops[] = {
        {
                .cmd            = CTRL_CMD_GETFAMILY,
                .flags          = GENL_CMD_CAP_DO,
                .doit           = ctrl_getfamily,
                .policy         = ctrl_policy_family,
                .maxattr        = ARRAY_SIZE(ctrl_policy_family) - 1,
                .validate       = GENL_DONT_VALIDATE_STRICT,
        },
        {
                .cmd            = CTRL_CMD_GETFAMILY,
                .flags          = GENL_CMD_CAP_DUMP,
                .dumpit         = ctrl_dumpfamily,
                .policy         = ctrl_policy_family,
                .maxattr        = ARRAY_SIZE(ctrl_policy_family) - 1,
                .validate       = GENL_DONT_VALIDATE_DUMP,
        },
        {
                .cmd            = CTRL_CMD_GETPOLICY,
                .flags          = GENL_CMD_CAP_DUMP,
                .start          = ctrl_dumppolicy_start,
                .dumpit         = ctrl_dumppolicy,
                .done           = ctrl_dumppolicy_done,
                .policy         = ctrl_policy_policy,
                .maxattr        = ARRAY_SIZE(ctrl_policy_policy) - 1,
        },
};

/*
 * Multicast groups of the controller family.  The single "notify" group
 * sits at index 0, which is the group genl_ctrl_event() multicasts to.
 */
static const struct genl_multicast_group genl_ctrl_groups[] = {
        { .name = "notify", },
};

/*
 * The generic netlink controller family ("nlctrl"), registered with the
 * fixed id GENL_ID_CTRL by genl_init() and available in every netns.
 */
static struct genl_family genl_ctrl __ro_after_init = {
        .name           = "nlctrl",
        .id             = GENL_ID_CTRL,
        .version        = 0x2,
        .netnsok        = true,
        .module         = THIS_MODULE,
        .split_ops      = genl_ctrl_ops,
        .n_split_ops    = ARRAY_SIZE(genl_ctrl_ops),
        .resv_start_op  = CTRL_CMD_GETPOLICY + 1,
        .mcgrps         = genl_ctrl_groups,
        .n_mcgrps       = ARRAY_SIZE(genl_ctrl_groups),
};

/*
 * Netlink bind callback: called when a socket in @net joins multicast
 * group @group.
 *
 * Finds the family owning @group, enforces the capability requirements
 * flagged on the group and, if the bind is permitted, notifies the family
 * through its optional ->bind() callback (which receives the group index
 * relative to the family).
 *
 * Returns 0 when the bind is allowed (also when no family owns @group) and
 * -EPERM when a required capability is missing.
 */
static int genl_bind(struct net *net, int group)
{
        const struct genl_family *family;
        unsigned int id;
        int ret = 0;

        down_read(&cb_lock);

        idr_for_each_entry(&genl_fam_idr, family, id) {
                const struct genl_multicast_group *grp;
                int i;

                if (family->n_mcgrps == 0)
                        continue;

                /* does @group fall into this family's group range? */
                i = group - family->mcgrp_offset;
                if (i < 0 || i >= family->n_mcgrps)
                        continue;

                grp = &family->mcgrps[i];
                if ((grp->flags & GENL_MCAST_CAP_NET_ADMIN) &&
                    !ns_capable(net->user_ns, CAP_NET_ADMIN))
                        ret = -EPERM;
                if ((grp->flags & GENL_MCAST_CAP_SYS_ADMIN) &&
                    !ns_capable(net->user_ns, CAP_SYS_ADMIN))
                        ret = -EPERM;

                /*
                 * Do not tell the family about a bind we are rejecting:
                 * a failed bind is not followed by an unbind for this
                 * group, so the family would otherwise see an unbalanced
                 * ->bind().
                 */
                if (ret)
                        break;

                if (family->bind)
                        family->bind(i);

                break;
        }

        up_read(&cb_lock);
        return ret;
}

/*
 * Netlink unbind callback: called when a socket in @net leaves multicast
 * group @group.  Finds the family owning the group and invokes its
 * optional ->unbind() callback with the family-relative group index.
 */
static void genl_unbind(struct net *net, int group)
{
        const struct genl_family *family;
        unsigned int id;

        down_read(&cb_lock);

        idr_for_each_entry(&genl_fam_idr, family, id) {
                int idx;

                if (!family->n_mcgrps)
                        continue;

                idx = group - family->mcgrp_offset;
                if (idx >= 0 && idx < family->n_mcgrps) {
                        if (family->unbind)
                                family->unbind(idx);
                        break;
                }
        }

        up_read(&cb_lock);
}

/*
 * Per-netns init: create the kernel-side NETLINK_GENERIC socket for @net.
 *
 * Failing to create the socket for init_net is fatal (panic); for any
 * other netns -ENOMEM is returned.  Returns 0 on success.
 */
static int __net_init genl_pernet_init(struct net *net)
{
        struct netlink_kernel_cfg cfg = {
                .flags          = NL_CFG_F_NONROOT_RECV,
                .input          = genl_rcv,
                .bind           = genl_bind,
                .unbind         = genl_unbind,
                .release        = genl_release,
        };

        /* we'll bump the group number right afterwards */
        net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, &cfg);
        if (net->genl_sock)
                return 0;

        if (net_eq(net, &init_net))
                panic("GENL: Cannot initialize generic netlink\n");

        return -ENOMEM;
}

/* Per-netns exit: release the netns' generic netlink socket and clear it. */
static void __net_exit genl_pernet_exit(struct net *net)
{
        struct sock *sk = net->genl_sock;

        netlink_kernel_release(sk);
        net->genl_sock = NULL;
}

/* Per-netns hooks creating and releasing the generic netlink socket. */
static struct pernet_operations genl_pernet_ops = {
        .init   = genl_pernet_init,
        .exit   = genl_pernet_exit,
};

/*
 * Boot-time initialisation: register the controller family, then the
 * per-netns operations.  Either failure is fatal and panics with the
 * error code.  Returns 0 on success.
 */
static int __init genl_init(void)
{
        int err = genl_register_family(&genl_ctrl);

        if (err >= 0) {
                err = register_pernet_subsys(&genl_pernet_ops);
                if (!err)
                        return 0;
        }

        panic("GENL: Cannot register controller: %d\n", err);
}

core_initcall(genl_init);

/*
 * Multicast @skb to @group on the generic netlink socket of every network
 * namespace.
 *
 * The loop runs one namespace behind: each namespace except the last one
 * visited receives a clone, and the original @skb is sent to the last one.
 * -ESRCH from an individual send is not treated as a failure.
 *
 * Returns 0 when at least one send succeeded, -ESRCH when none did,
 * -ENOMEM when cloning fails, or the first other send error.  On the
 * clone/send error path inside the loop the original @skb is freed here.
 *
 * NOTE(review): iterates with for_each_net_rcu(); the caller visible in
 * this file wraps the call in rcu_read_lock() - presumably required of
 * every caller, confirm.
 * NOTE(review): prev would be NULL after the loop if no namespace were
 * visited - presumably impossible because init_net always exists; verify.
 */
static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group,
                         gfp_t flags)
{
        struct sk_buff *tmp;
        struct net *net, *prev = NULL;
        bool delivered = false;
        int err;

        for_each_net_rcu(net) {
                if (prev) {
                        /* send a clone to the previously seen namespace */
                        tmp = skb_clone(skb, flags);
                        if (!tmp) {
                                err = -ENOMEM;
                                goto error;
                        }
                        err = nlmsg_multicast(prev->genl_sock, tmp,
                                              portid, group, flags);
                        if (!err)
                                delivered = true;
                        else if (err != -ESRCH)
                                goto error;
                }

                prev = net;
        }

        /* the last namespace gets the original skb, no clone needed */
        err = nlmsg_multicast(prev->genl_sock, skb, portid, group, flags);
        if (!err)
                delivered = true;
        else if (err != -ESRCH)
                return err;
        return delivered ? 0 : -ESRCH;
 error:
        kfree_skb(skb);
        return err;
}

/*
 * Multicast @skb to a family's multicast group in all network namespaces.
 *
 * @group is the index of the group within @family; it is translated to
 * the global group number via the family's mcgrp_offset.
 *
 * Returns -EINVAL (with a one-time warning) when @group is out of range
 * for the family, otherwise the result of genlmsg_mcast().
 */
int genlmsg_multicast_allns(const struct genl_family *family,
                            struct sk_buff *skb, u32 portid,
                            unsigned int group, gfp_t flags)
{
        unsigned int global_group;

        if (WARN_ON_ONCE(group >= family->n_mcgrps))
                return -EINVAL;

        global_group = family->mcgrp_offset + group;

        return genlmsg_mcast(skb, portid, global_group, flags);
}
EXPORT_SYMBOL(genlmsg_multicast_allns);

/*
 * Send a notification for the request described by @info to a family's
 * multicast group in the request's network namespace.
 *
 * @group is the index of the group within @family.  An out-of-range index
 * triggers a one-time warning and nothing is sent.
 */
void genl_notify(const struct genl_family *family, struct sk_buff *skb,
                 struct genl_info *info, u32 group, gfp_t flags)
{
        struct sock *genl_sk = genl_info_net(info)->genl_sock;

        if (WARN_ON_ONCE(group >= family->n_mcgrps))
                return;

        /* translate the family-relative index to the global group number */
        group += family->mcgrp_offset;

        nlmsg_notify(genl_sk, skb, info->snd_portid, group,
                     nlmsg_report(info->nlhdr), flags);
}
EXPORT_SYMBOL(genl_notify);


















































































































































































   17 












    2 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CPUFEATURE_H
#define _ASM_X86_CPUFEATURE_H

#include <asm/processor.h>

#if defined(__KERNEL__) && !defined(__ASSEMBLY__)

#include <asm/asm.h>
#include <linux/bitops.h>
#include <asm/alternative.h>

/*
 * Indices of the 32-bit capability words.  Numbering starts at 0 and the
 * final enumerator, NR_CPUID_WORDS, therefore equals the number of words
 * listed (22).
 *
 * NOTE(review): judging by the names, CPUID_<leaf>[_<subleaf>]_<reg>
 * entries mirror a CPUID output register and CPUID_LNX_<n> entries are
 * Linux-defined words - confirm against the feature definitions.
 */
enum cpuid_leafs
{
        CPUID_1_EDX                = 0,
        CPUID_8000_0001_EDX,
        CPUID_8086_0001_EDX,
        CPUID_LNX_1,
        CPUID_1_ECX,
        CPUID_C000_0001_EDX,
        CPUID_8000_0001_ECX,
        CPUID_LNX_2,
        CPUID_LNX_3,
        CPUID_7_0_EBX,
        CPUID_D_1_EAX,
        CPUID_LNX_4,
        CPUID_7_1_EAX,
        CPUID_8000_0008_EBX,
        CPUID_6_EAX,
        CPUID_8000_000A_EDX,
        CPUID_7_ECX,
        CPUID_8000_0007_EBX,
        CPUID_7_EDX,
        CPUID_8000_001F_EAX,
        CPUID_8000_0021_EAX,
        CPUID_LNX_5,
        NR_CPUID_WORDS,
};

/*
 * Numeric rendering of a feature flag: x86_cap_flag_num() expands to two
 * comma-separated arguments (word index, bit index within the word) that
 * match the two %d conversions of X86_CAP_FMT_NUM.
 */
#define X86_CAP_FMT_NUM "%d:%d"
#define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31)

/*
 * Name tables with one slot per bit.  X86_CAP_FMT / x86_cap_flag() are the
 * string rendering of a flag: the argument is the x86_cap_flags[] entry
 * indexed by the flag number.
 */
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
#define X86_CAP_FMT "%s"
#define x86_cap_flag(flag) x86_cap_flags[flag]

/*
 * In order to save room, this array is indexed by
 * X86_BUG_<name> - NCAPINTS*32, i.e. relative to the first bug bit.
 */
extern const char * const x86_bug_flags[NBUGINTS*32];

/*
 * Test feature @bit in the x86_capability words of the structure pointed to
 * by @c.  The request is handed straight to arch_test_bit(); no
 * compile-time mask shortcut is applied here.
 */
#define test_cpu_cap(c, bit)                                                \
         arch_test_bit(bit, (unsigned long *)((c)->x86_capability))

/*
 * There are 32 bits/features in each mask word.  The high bits
 * (selected with (bit)>>5) give us the word number and the low 5
 * bits give us the bit/feature number inside the word.
 * 1UL<<((bit)&31) gives us a mask for the feature_bit so we can
 * see if it is set in the mask word.
 *
 * @maskname is token-pasted with @word to form the name of the mask
 * constant (e.g. REQUIRED_MASK ## 0), so @word must be a literal number.
 */
#define CHECK_BIT_IN_MASK_WORD(maskname, word, bit)        \
        (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word ))

/*
 * REQUIRED_MASK_BIT_SET(feature_bit) is non-zero when @feature_bit is set in
 * the REQUIRED_MASK<n> constant of the word it belongs to.  Only the
 * CHECK_BIT_IN_MASK_WORD() term whose word number equals (feature_bit)>>5
 * can match; the other terms are false.
 *
 * {REQUIRED,DISABLED}_MASK_CHECK below may seem duplicated with the
 * following BUILD_BUG_ON_ZERO() check but when NCAPINTS gets changed, all
 * header macros which use NCAPINTS need to be changed. The duplicated macro
 * use causes the compiler to issue errors for all headers so that all usage
 * sites can be corrected.
 *
 * When adding a capability word, add a term here and update the NCAPINTS
 * value in the BUILD_BUG_ON_ZERO() check.
 */
#define REQUIRED_MASK_BIT_SET(feature_bit)                \
         ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  0, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  1, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  2, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  3, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  4, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  5, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  6, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  7, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  8, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  9, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) ||        \
           REQUIRED_MASK_CHECK                                          ||        \
           BUILD_BUG_ON_ZERO(NCAPINTS != 22))

/*
 * DISABLED_MASK_BIT_SET(feature_bit) is non-zero when @feature_bit is set in
 * the DISABLED_MASK<n> constant of the word it belongs to.  There is one
 * CHECK_BIT_IN_MASK_WORD() term per capability word (0..21); the trailing
 * DISABLED_MASK_CHECK and BUILD_BUG_ON_ZERO() terms exist for build-time
 * checking of NCAPINTS and must be kept in step with the number of terms.
 */
#define DISABLED_MASK_BIT_SET(feature_bit)                                \
         ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  0, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  1, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  2, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  3, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  4, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  5, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  6, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  7, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  8, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  9, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) ||        \
           DISABLED_MASK_CHECK                                          ||        \
           BUILD_BUG_ON_ZERO(NCAPINTS != 22))

/*
 * Does the CPU described by @c have feature @bit?  If @bit is a
 * compile-time constant that is set in the REQUIRED_MASK words, the result
 * is the constant 1; otherwise the capability bitmap of @c is tested.
 */
#define cpu_has(c, bit)                                                        \
        (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :        \
         test_cpu_cap(c, bit))

/*
 * Feature test against cpu_info.x86_capability through
 * x86_this_cpu_test_bit().  A compile-time constant @bit that is set in the
 * REQUIRED_MASK words short-circuits to the constant 1.
 * NOTE(review): cpu_info is presumably the per-CPU cpuinfo, making this a
 * test of the CPU the caller is running on - confirm at its definition.
 */
#define this_cpu_has(bit)                                                \
        (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :        \
         x86_this_cpu_test_bit(bit, cpu_info.x86_capability))

/*
 * This macro is for detection of features which need kernel
 * infrastructure to be used.  It may *not* directly test the CPU
 * itself.  Use the cpu_has() family if you want true runtime
 * testing of CPU features, like in hypervisor code where you are
 * supporting a possible guest feature where host support for it
 * is not relevant.
 *
 * A compile-time constant @bit that is set in the DISABLED_MASK words
 * yields the constant 0; every other case is answered by static_cpu_has().
 */
#define cpu_feature_enabled(bit)        \
        (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit))

/* cpu_has() applied to boot_cpu_data. */
#define boot_cpu_has(bit)        cpu_has(&boot_cpu_data, bit)

/* Set feature @bit in the x86_capability words of the structure @c points to. */
#define set_cpu_cap(c, bit)        set_bit(bit, (unsigned long *)((c)->x86_capability))

/*
 * Counterparts that clear a feature bit; implemented out of line.
 * NOTE(review): setup_clear_cpu_cap() presumably clears the bit for the
 * boot CPU and records it so it stays cleared - confirm at the definition.
 */
extern void setup_clear_cpu_cap(unsigned int bit);
extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);

/*
 * Force feature @bit on: it is set both in boot_cpu_data and in the
 * cpu_caps_set bitmap.  If the boot CPU did not already have the bit,
 * WARN_ON() fires when alternatives_patched is non-zero.
 * NOTE(review): presumably because code patched earlier would not observe
 * the newly forced bit - confirm.
 *
 * @bit is evaluated more than once, so it must be free of side effects.
 */
#define setup_force_cpu_cap(bit) do {                        \
                                                        \
        if (!boot_cpu_has(bit))                                \
                WARN_ON(alternatives_patched);                \
                                                        \
        set_cpu_cap(&boot_cpu_data, bit);                \
        set_bit(bit, (unsigned long *)cpu_caps_set);        \
} while (0)

/* Bug bits are forced exactly like capability bits. */
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)

/*
 * Static testing of CPU features. Used the same way as boot_cpu_has(). It
 * statically patches the target code for additional performance. Use
 * static_cpu_has() only in fast paths, where every cycle counts. Which
 * means that the boot_cpu_has() variant is already fast enough for the
 * majority of cases and you should stick to using it as it is generally
 * only two instructions: a RIP-relative MOV and a TEST.
 *
 * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know
 * that this is only used on a fallback path and will sometimes cause
 * it to manifest the address of boot_cpu_data in a register, fouling
 * the mainline (post-initialization) code.
 *
 * @bit must be usable as an immediate ("i" constraint) operand, i.e. a
 * compile-time constant after inlining.
 */
static __always_inline bool _static_cpu_has(u16 bit)
{
        /*
         * The code at local label 6, emitted into .altinstr_aux, is the
         * dynamic fallback: it tests the feature's bit (1 << (bit & 7)) in
         * byte (bit >> 3) of boot_cpu_data.x86_capability and jumps to
         * t_yes or t_no accordingly.
         *
         * NOTE(review): ALTERNATIVE_TERNARY() is defined elsewhere;
         * presumably "jmp 6f" is the instruction in place before patching,
         * replaced by nothing (falling through to t_yes) when the feature
         * is present and by "jmp t_no" when it is not - confirm in
         * <asm/alternative.h>.
         */
        asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]")
                ".pushsection .altinstr_aux,\"ax\"\n"
                "6:\n"
                " testb %[bitnum], %a[cap_byte]\n"
                " jnz %l[t_yes]\n"
                " jmp %l[t_no]\n"
                ".popsection\n"
                 : : [feature]  "i" (bit),
                     [bitnum]   "i" (1 << (bit & 7)),
                     [cap_byte] "i" (&((const char *)boot_cpu_data.x86_capability)[bit >> 3])
                 : : t_yes, t_no);
t_yes:
        return true;
t_no:
        return false;
}

/*
 * Front end for _static_cpu_has().  When boot_cpu_has(bit) folds to a
 * compile-time constant (the required-feature shortcut in cpu_has()), that
 * constant is used directly; otherwise the patched test is used.
 */
#define static_cpu_has(bit)                                        \
(                                                                \
        __builtin_constant_p(boot_cpu_has(bit)) ?                \
                boot_cpu_has(bit) :                                \
                _static_cpu_has(bit)                                \
)

/*
 * Bug bits are handled with the capability helpers; each *_bug() macro
 * simply forwards to the corresponding capability macro/function.
 */
#define cpu_has_bug(c, bit)                cpu_has(c, (bit))
#define set_cpu_bug(c, bit)                set_cpu_cap(c, (bit))
#define clear_cpu_bug(c, bit)                clear_cpu_cap(c, (bit))

/* Bug-bit variants operating on the boot CPU. */
#define static_cpu_has_bug(bit)                static_cpu_has((bit))
#define boot_cpu_has_bug(bit)                cpu_has_bug(&boot_cpu_data, (bit))
#define boot_cpu_set_bug(bit)                set_cpu_cap(&boot_cpu_data, (bit))

/*
 * Total number of capability bits, and an alias of boot_cpu_has().
 * NOTE(review): these names look like the hooks expected by generic
 * <linux/cpufeature.h> code - confirm.
 */
#define MAX_CPU_FEATURES                (NCAPINTS * 32)
#define cpu_have_feature                boot_cpu_has

/*
 * Format string and its three matching arguments (vendor, family, model of
 * the boot CPU), each printed as four upper-case hex digits.
 */
#define CPU_FEATURE_TYPEFMT                "x86,ven%04Xfam%04Xmod%04X"
#define CPU_FEATURE_TYPEVAL                boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
                                        boot_cpu_data.x86_model

#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
#endif /* _ASM_X86_CPUFEATURE_H */











































































































    8 






































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_ATOMIC_H
#define _ASM_X86_ATOMIC_H

#include <linux/compiler.h>
#include <linux/types.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>
#include <asm/rmwcc.h>
#include <asm/barrier.h>

/*
 * Atomic operations that C can't guarantee us.  Useful for
 * resource counting etc..
 */

/* Return the current value of @v->counter, loaded once via __READ_ONCE(). */
static __always_inline int arch_atomic_read(const atomic_t *v)
{
        /*
         * KASAN: READ_ONCE_NOCHECK() is deliberately avoided here; it is a
         * non-inlined function that increases binary size and stack usage.
         */
        const int snapshot = __READ_ONCE(v->counter);

        return snapshot;
}

/* Store @i into @v->counter with a single __WRITE_ONCE() access. */
static __always_inline void arch_atomic_set(atomic_t *v, int i)
{
        __WRITE_ONCE(v->counter, i);
}

/*
 * Add @i to @v->counter with one addl issued under LOCK_PREFIX.  Nothing is
 * returned; the "memory" clobber makes the asm a compiler barrier.
 */
static __always_inline void arch_atomic_add(int i, atomic_t *v)
{
        asm volatile(LOCK_PREFIX "addl %[delta],%[ctr]"
                     : [ctr] "+m" (v->counter)
                     : [delta] "ir" (i)
                     : "memory");
}

/*
 * Subtract @i from @v->counter with one subl issued under LOCK_PREFIX.
 * Nothing is returned; the "memory" clobber makes the asm a compiler barrier.
 */
static __always_inline void arch_atomic_sub(int i, atomic_t *v)
{
        asm volatile(LOCK_PREFIX "subl %[delta],%[ctr]"
                     : [ctr] "+m" (v->counter)
                     : [delta] "ir" (i)
                     : "memory");
}

/*
 * Subtract @i from @v->counter (subl under LOCK_PREFIX) and report the
 * outcome of condition code "e".
 * NOTE(review): GEN_BINARY_RMWcc() is defined in <asm/rmwcc.h>; presumably
 * the result is true exactly when the counter became zero - confirm.
 */
static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i);
}
#define arch_atomic_sub_and_test arch_atomic_sub_and_test

/* Increment @v->counter by one with a single incl issued under LOCK_PREFIX. */
static __always_inline void arch_atomic_inc(atomic_t *v)
{
        asm volatile(LOCK_PREFIX "incl %[ctr]"
                     : [ctr] "+m" (v->counter)
                     :
                     : "memory");
}
#define arch_atomic_inc arch_atomic_inc

/* Decrement @v->counter by one with a single decl issued under LOCK_PREFIX. */
static __always_inline void arch_atomic_dec(atomic_t *v)
{
        asm volatile(LOCK_PREFIX "decl %[ctr]"
                     : [ctr] "+m" (v->counter)
                     :
                     : "memory");
}
#define arch_atomic_dec arch_atomic_dec

/*
 * Decrement @v->counter (decl under LOCK_PREFIX) and report the outcome of
 * condition code "e".
 * NOTE(review): presumably true exactly when the counter reached zero -
 * confirm against GEN_UNARY_RMWcc() in <asm/rmwcc.h>.
 */
static __always_inline bool arch_atomic_dec_and_test(atomic_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e);
}
#define arch_atomic_dec_and_test arch_atomic_dec_and_test

/*
 * Increment @v->counter (incl under LOCK_PREFIX) and report the outcome of
 * condition code "e".
 * NOTE(review): presumably true exactly when the counter became zero -
 * confirm against GEN_UNARY_RMWcc() in <asm/rmwcc.h>.
 */
static __always_inline bool arch_atomic_inc_and_test(atomic_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e);
}
#define arch_atomic_inc_and_test arch_atomic_inc_and_test

/*
 * Add @i to @v->counter (addl under LOCK_PREFIX) and report the outcome of
 * condition code "s".
 * NOTE(review): presumably true exactly when the sum is negative (sign flag
 * set) - confirm against GEN_BINARY_RMWcc() in <asm/rmwcc.h>.
 */
static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i);
}
#define arch_atomic_add_negative arch_atomic_add_negative

/*
 * Add @i to @v->counter with xadd() and return the value xadd() hands back
 * plus @i.
 * NOTE(review): xadd() presumably yields the counter's value from before the
 * addition, making the result the updated counter value - confirm in
 * <asm/cmpxchg.h>.
 */
static __always_inline int arch_atomic_add_return(int i, atomic_t *v)
{
        const int before = xadd(&v->counter, i);

        return before + i;
}
#define arch_atomic_add_return arch_atomic_add_return

/* Subtract-and-return is add-and-return of the negated operand. */
#define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v)

/*
 * Add @i to @v->counter with xadd() and return what xadd() hands back.
 * NOTE(review): presumably the counter's value from before the addition -
 * confirm in <asm/cmpxchg.h>.
 */
static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
{
        const int before = xadd(&v->counter, i);

        return before;
}
#define arch_atomic_fetch_add arch_atomic_fetch_add

/* Fetch-and-subtract is fetch-and-add of the negated operand. */
#define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v)

/*
 * Compare-and-exchange on @v->counter through arch_cmpxchg(), passing @old
 * as the expected value and @new as the replacement.
 * NOTE(review): the return value is presumably the value found in the
 * counter (equal to @old on success) - confirm in <asm/cmpxchg.h>.
 */
static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
{
        return arch_cmpxchg(&v->counter, old, new);
}
#define arch_atomic_cmpxchg arch_atomic_cmpxchg

/*
 * Compare-and-exchange on @v->counter through arch_try_cmpxchg(); the
 * boolean result reports whether the exchange happened.  @old is passed by
 * pointer: the arch_atomic_fetch_{and,or,xor}() retry loops in this file
 * depend on *@old being refreshed with the observed value on failure.
 */
static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
        return arch_try_cmpxchg(&v->counter, old, new);
}
#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg

/*
 * Exchange @v->counter with @new through arch_xchg() and return its result.
 * NOTE(review): presumably the value the counter held before the exchange -
 * confirm in <asm/cmpxchg.h>.
 */
static __always_inline int arch_atomic_xchg(atomic_t *v, int new)
{
        return arch_xchg(&v->counter, new);
}
#define arch_atomic_xchg arch_atomic_xchg

/*
 * AND @i into @v->counter with one andl issued under LOCK_PREFIX.  Nothing
 * is returned.
 */
static __always_inline void arch_atomic_and(int i, atomic_t *v)
{
        asm volatile(LOCK_PREFIX "andl %[mask],%[ctr]"
                     : [ctr] "+m" (v->counter)
                     : [mask] "ir" (i)
                     : "memory");
}

/*
 * AND @i into @v->counter and return the value the counter held when the
 * update succeeded.  Implemented as a compare-and-exchange retry loop.
 */
static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v)
{
        int seen = arch_atomic_read(v);

        /* Retry until the exchange based on @seen goes through. */
        while (!arch_atomic_try_cmpxchg(v, &seen, seen & i))
                ;

        return seen;
}
#define arch_atomic_fetch_and arch_atomic_fetch_and

/*
 * OR @i into @v->counter with one orl issued under LOCK_PREFIX.  Nothing is
 * returned.
 */
static __always_inline void arch_atomic_or(int i, atomic_t *v)
{
        asm volatile(LOCK_PREFIX "orl %[mask],%[ctr]"
                     : [ctr] "+m" (v->counter)
                     : [mask] "ir" (i)
                     : "memory");
}

/*
 * OR @i into @v->counter and return the value the counter held when the
 * update succeeded.  Implemented as a compare-and-exchange retry loop.
 */
static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v)
{
        int seen = arch_atomic_read(v);

        /* Retry until the exchange based on @seen goes through. */
        while (!arch_atomic_try_cmpxchg(v, &seen, seen | i))
                ;

        return seen;
}
#define arch_atomic_fetch_or arch_atomic_fetch_or

/*
 * XOR @i into @v->counter with one xorl issued under LOCK_PREFIX.  Nothing
 * is returned.
 */
static __always_inline void arch_atomic_xor(int i, atomic_t *v)
{
        asm volatile(LOCK_PREFIX "xorl %[mask],%[ctr]"
                     : [ctr] "+m" (v->counter)
                     : [mask] "ir" (i)
                     : "memory");
}

/*
 * XOR @i into @v->counter and return the value the counter held when the
 * update succeeded.  Implemented as a compare-and-exchange retry loop.
 */
static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v)
{
        int seen = arch_atomic_read(v);

        /* Retry until the exchange based on @seen goes through. */
        while (!arch_atomic_try_cmpxchg(v, &seen, seen ^ i))
                ;

        return seen;
}
#define arch_atomic_fetch_xor arch_atomic_fetch_xor

#ifdef CONFIG_X86_32
# include <asm/atomic64_32.h>
#else
# include <asm/atomic64_64.h>
#endif

#endif /* _ASM_X86_ATOMIC_H */






























































































   11 





   11 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_COMPAT_H
#define _ASM_X86_COMPAT_H

/*
 * Architecture specific compatibility types
 */
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <asm/processor.h>
#include <asm/user32.h>
#include <asm/unistd.h>

/*
 * x86-specific widths for compat types.  Each type is paired with a
 * same-named #define ahead of the <asm-generic/compat.h> include.
 * NOTE(review): presumably the generic header checks these macros and skips
 * its own default definition - confirm there.
 */
#define compat_mode_t        compat_mode_t
typedef u16                compat_mode_t;

#define __compat_uid_t        __compat_uid_t
typedef u16                __compat_uid_t;
typedef u16                __compat_gid_t;

#define compat_dev_t        compat_dev_t
typedef u16                compat_dev_t;

#define compat_ipc_pid_t compat_ipc_pid_t
typedef u16                 compat_ipc_pid_t;

/* struct compat_statfs is defined further down in this header. */
#define compat_statfs        compat_statfs

#include <asm-generic/compat.h>

/*
 * Machine name for compat tasks: "i686" followed by two explicit NUL bytes
 * in addition to the literal's own terminator.
 * NOTE(review): the padding presumably keeps the literal the same length as
 * the native machine string - confirm against the user of this macro.
 */
#define COMPAT_UTS_MACHINE        "i686\0\0"

/* Link count as stored in struct compat_stat (16 bits). */
typedef u16                compat_nlink_t;

/*
 * stat result layout for compat tasks.
 * NOTE(review): presumably mirrors the 32-bit userspace struct stat, in
 * which case field order and widths are ABI and must not change - confirm.
 */
struct compat_stat {
        u32                st_dev;
        compat_ino_t        st_ino;
        compat_mode_t        st_mode;
        compat_nlink_t        st_nlink;
        __compat_uid_t        st_uid;
        __compat_gid_t        st_gid;
        u32                st_rdev;
        u32                st_size;
        u32                st_blksize;
        u32                st_blocks;
        /* Each timestamp is a (seconds, nanoseconds) pair of 32-bit fields. */
        u32                st_atime;
        u32                st_atime_nsec;
        u32                st_mtime;
        u32                st_mtime_nsec;
        u32                st_ctime;
        u32                st_ctime_nsec;
        u32                __unused4;
        u32                __unused5;
};

/*
 * IA32 uses 4-byte alignment for 64-bit quantities, so the compat flock64
 * structure needs to be packed; defining this macro requests that.
 */
#define __ARCH_NEED_COMPAT_FLOCK64_PACKED

/*
 * statfs result layout for compat tasks; all counters are plain int.
 * NOTE(review): presumably mirrors the 32-bit userspace struct statfs, so
 * field order and widths should be treated as ABI - confirm.
 */
struct compat_statfs {
        int                f_type;
        int                f_bsize;
        int                f_blocks;
        int                f_bfree;
        int                f_bavail;
        int                f_files;
        int                f_ffree;
        compat_fsid_t        f_fsid;
        int                f_namelen;        /* SunOS ignores this field. */
        int                f_frsize;
        int                f_flags;
        int                f_spare[4];
};

/*
 * Only defined with CONFIG_X86_X32_ABI: evaluates to 1 when the current
 * task's saved orig_ax has __X32_SYSCALL_BIT set, and to 0 otherwise.
 */
#ifdef CONFIG_X86_X32_ABI
#define COMPAT_USE_64BIT_TIME \
        (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
#endif

/*
 * Returns true when the current task's saved orig_ax has __X32_SYSCALL_BIT
 * set.  Without CONFIG_X86_X32_ABI the answer is always false.
 */
static inline bool in_x32_syscall(void)
{
#ifdef CONFIG_X86_X32_ABI
        return !!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT);
#else
        return false;
#endif
}

/*
 * Returns true when either in_ia32_syscall() or in_x32_syscall() reports
 * true; the x32 check is only made when the ia32 check is false.
 */
static inline bool in_32bit_syscall(void)
{
        if (in_ia32_syscall())
                return true;

        return in_x32_syscall();
}

#ifdef CONFIG_COMPAT
/*
 * x86 definition of in_compat_syscall(): true for any 32-bit syscall as
 * determined by in_32bit_syscall().
 */
static inline bool in_compat_syscall(void)
{
        return in_32bit_syscall();
}
#define in_compat_syscall in_compat_syscall        /* override the generic impl */
/* The 64-bit alignment fixup predicate is in_ia32_syscall() only, not the x32 check. */
#define compat_need_64bit_alignment_fixup in_ia32_syscall
#endif

/* Forward declaration; only pointers to it are used in this header. */
struct compat_siginfo;

#ifdef CONFIG_X86_X32_ABI
/*
 * Arch-provided copy_siginfo_to_user32(), declared only with
 * CONFIG_X86_X32_ABI.  The same-named #define marks the override.
 * NOTE(review): presumably returns 0 on success and a negative errno on
 * fault - confirm at the definition.
 */
int copy_siginfo_to_user32(struct compat_siginfo __user *to,
                const kernel_siginfo_t *from);
#define copy_siginfo_to_user32 copy_siginfo_to_user32
#endif /* CONFIG_X86_X32_ABI */

#endif /* _ASM_X86_COMPAT_H */



















































































































































































































































































































    1 



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Events declared in this header are grouped under the "ext4" trace system.
 * The include guard below is bypassed whenever TRACE_HEADER_MULTI_READ is
 * defined, so the body of the header can be processed again in that case.
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ext4

#if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EXT4_H

#include <linux/writeback.h>
#include <linux/tracepoint.h>

/*
 * Forward declarations of the structure types named by the tracepoint
 * prototypes in this header.
 * NOTE(review): events that dereference these types (e.g. ac->ac_sb,
 * pa->pa_pstart, map->m_lblk) presumably rely on the including file
 * providing the full definitions first -- confirm against the includer.
 */
struct ext4_allocation_context;
struct ext4_allocation_request;
struct ext4_extent;
struct ext4_prealloc_space;
struct ext4_inode_info;
struct mpage_da_data;
struct ext4_map_blocks;
struct extent_status;
struct ext4_fsmap;
struct partial_cluster;

/*
 * EXT4_I - map a VFS inode pointer to its enclosing struct ext4_inode_info,
 * using container_of() on the embedded vfs_inode member.
 */
#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))

/*
 * show_mballoc_flags - decode a mask of EXT4_MB_* flags into short names.
 *
 * Expands to __print_flags() with "|" as the delimiter argument and one
 * { mask, name } pair per EXT4_MB_* flag listed below. The order of the
 * pairs is part of the table handed to __print_flags(); do not reorder.
 */
#define show_mballoc_flags(flags) __print_flags(flags, "|",        \
        { EXT4_MB_HINT_MERGE,                "HINT_MERGE" },                \
        { EXT4_MB_HINT_RESERVED,        "HINT_RESV" },                \
        { EXT4_MB_HINT_METADATA,        "HINT_MDATA" },                \
        { EXT4_MB_HINT_FIRST,                "HINT_FIRST" },                \
        { EXT4_MB_HINT_BEST,                "HINT_BEST" },                \
        { EXT4_MB_HINT_DATA,                "HINT_DATA" },                \
        { EXT4_MB_HINT_NOPREALLOC,        "HINT_NOPREALLOC" },        \
        { EXT4_MB_HINT_GROUP_ALLOC,        "HINT_GRP_ALLOC" },        \
        { EXT4_MB_HINT_GOAL_ONLY,        "HINT_GOAL_ONLY" },        \
        { EXT4_MB_HINT_TRY_GOAL,        "HINT_TRY_GOAL" },        \
        { EXT4_MB_DELALLOC_RESERVED,        "DELALLOC_RESV" },        \
        { EXT4_MB_STREAM_ALLOC,                "STREAM_ALLOC" },        \
        { EXT4_MB_USE_ROOT_BLOCKS,        "USE_ROOT_BLKS" },        \
        { EXT4_MB_USE_RESERVED,                "USE_RESV" },                \
        { EXT4_MB_STRICT_CHECK,                "STRICT_CHECK" })

/*
 * show_map_flags - decode a mask of block-mapping request flags into names.
 *
 * Expands to __print_flags() with "|" as the delimiter argument. The table
 * covers the EXT4_GET_BLOCKS_* flags listed below plus EXT4_EX_NOCACHE.
 */
#define show_map_flags(flags) __print_flags(flags, "|",                        \
        { EXT4_GET_BLOCKS_CREATE,                "CREATE" },                \
        { EXT4_GET_BLOCKS_UNWRIT_EXT,                "UNWRIT" },                \
        { EXT4_GET_BLOCKS_DELALLOC_RESERVE,        "DELALLOC" },                \
        { EXT4_GET_BLOCKS_PRE_IO,                "PRE_IO" },                \
        { EXT4_GET_BLOCKS_CONVERT,                "CONVERT" },                \
        { EXT4_GET_BLOCKS_METADATA_NOFAIL,        "METADATA_NOFAIL" },        \
        { EXT4_GET_BLOCKS_NO_NORMALIZE,                "NO_NORMALIZE" },        \
        { EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,        "CONVERT_UNWRITTEN" },  \
        { EXT4_GET_BLOCKS_ZERO,                        "ZERO" },                \
        { EXT4_GET_BLOCKS_IO_SUBMIT,                "IO_SUBMIT" },                \
        { EXT4_EX_NOCACHE,                        "EX_NOCACHE" })

/*
 * __print_flags() requires that all enum values be wrapped in the
 * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
 * ring buffer.
 *
 * NOTE(review): the BH_* values exported here are presumably the bit numbers
 * behind the EXT4_MAP_* masks decoded by show_mflags() -- confirm against the
 * EXT4_MAP_* definitions, which are not visible in this header.
 */
TRACE_DEFINE_ENUM(BH_New);
TRACE_DEFINE_ENUM(BH_Mapped);
TRACE_DEFINE_ENUM(BH_Unwritten);
TRACE_DEFINE_ENUM(BH_Boundary);

/*
 * show_mflags - decode a mask of EXT4_MAP_* flags into one letter per flag.
 *
 * Expands to __print_flags() with an empty delimiter string, so the letters
 * are emitted back to back: N (NEW), M (MAPPED), U (UNWRITTEN), B (BOUNDARY).
 */
#define show_mflags(flags) __print_flags(flags, "",        \
        { EXT4_MAP_NEW,                "N" },                        \
        { EXT4_MAP_MAPPED,        "M" },                        \
        { EXT4_MAP_UNWRITTEN,        "U" },                        \
        { EXT4_MAP_BOUNDARY,        "B" })

/*
 * show_free_flags - decode a mask of EXT4_FREE_BLOCKS_* flags into names.
 *
 * Expands to __print_flags() with "|" as the delimiter argument and one
 * { mask, name } pair per EXT4_FREE_BLOCKS_* flag listed below.
 */
#define show_free_flags(flags) __print_flags(flags, "|",        \
        { EXT4_FREE_BLOCKS_METADATA,                "METADATA" },        \
        { EXT4_FREE_BLOCKS_FORGET,                "FORGET" },        \
        { EXT4_FREE_BLOCKS_VALIDATED,                "VALIDATED" },        \
        { EXT4_FREE_BLOCKS_NO_QUOT_UPDATE,        "NO_QUOTA" },        \
        { EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER,"1ST_CLUSTER" },\
        { EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER,        "LAST_CLUSTER" })

/*
 * Export the ES_*_B enum values through TRACE_DEFINE_ENUM().
 * NOTE(review): these are presumably the bit positions behind the
 * EXTENT_STATUS_* masks decoded by show_extent_status() -- confirm against
 * the extent-status definitions, which are not visible in this header.
 */
TRACE_DEFINE_ENUM(ES_WRITTEN_B);
TRACE_DEFINE_ENUM(ES_UNWRITTEN_B);
TRACE_DEFINE_ENUM(ES_DELAYED_B);
TRACE_DEFINE_ENUM(ES_HOLE_B);
TRACE_DEFINE_ENUM(ES_REFERENCED_B);

/*
 * show_extent_status - decode an EXTENT_STATUS_* mask into one letter per
 * flag.
 *
 * Expands to __print_flags() with an empty delimiter string, so the letters
 * are emitted back to back: W (WRITTEN), U (UNWRITTEN), D (DELAYED),
 * H (HOLE), R (REFERENCED).
 */
#define show_extent_status(status) __print_flags(status, "",        \
        { EXTENT_STATUS_WRITTEN,        "W" },                        \
        { EXTENT_STATUS_UNWRITTEN,        "U" },                        \
        { EXTENT_STATUS_DELAYED,        "D" },                        \
        { EXTENT_STATUS_HOLE,                "H" },                        \
        { EXTENT_STATUS_REFERENCED,        "R" })

/*
 * show_falloc_mode - decode an fallocate() mode mask into flag names.
 *
 * Expands to __print_flags() with "|" as the delimiter argument and one
 * { mask, name } pair per FALLOC_FL_* flag. FALLOC_FL_INSERT_RANGE is
 * included so that an insert-range request is reported by name rather than
 * being left out of the decoded names. New entries are appended at the end
 * so the order of the existing ones is unchanged.
 */
#define show_falloc_mode(mode) __print_flags(mode, "|",                \
        { FALLOC_FL_KEEP_SIZE,                "KEEP_SIZE"},                \
        { FALLOC_FL_PUNCH_HOLE,                "PUNCH_HOLE"},                \
        { FALLOC_FL_NO_HIDE_STALE,        "NO_HIDE_STALE"},        \
        { FALLOC_FL_COLLAPSE_RANGE,        "COLLAPSE_RANGE"},        \
        { FALLOC_FL_ZERO_RANGE,                "ZERO_RANGE"},                \
        { FALLOC_FL_INSERT_RANGE,        "INSERT_RANGE"})

/*
 * Export every EXT4_FC_REASON_* enum value, including the EXT4_FC_REASON_MAX
 * bound, through TRACE_DEFINE_ENUM(). show_fc_reason() names each of these
 * except EXT4_FC_REASON_MAX.
 */
TRACE_DEFINE_ENUM(EXT4_FC_REASON_XATTR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_CROSS_RENAME);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_NOMEM);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_SWAP_BOOT);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_RESIZE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_RENAME_DIR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);

/*
 * show_fc_reason - translate a single EXT4_FC_REASON_* value into its name.
 *
 * Uses __print_symbolic(), i.e. the argument is matched as one value rather
 * than decoded as a bitmask. EXT4_FC_REASON_MAX has no entry in the table.
 */
#define show_fc_reason(reason)                                                \
        __print_symbolic(reason,                                        \
                { EXT4_FC_REASON_XATTR,                "XATTR"},                \
                { EXT4_FC_REASON_CROSS_RENAME,        "CROSS_RENAME"},        \
                { EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, "JOURNAL_FLAG_CHANGE"}, \
                { EXT4_FC_REASON_NOMEM,        "NO_MEM"},                        \
                { EXT4_FC_REASON_SWAP_BOOT,        "SWAP_BOOT"},                \
                { EXT4_FC_REASON_RESIZE,        "RESIZE"},                \
                { EXT4_FC_REASON_RENAME_DIR,        "RENAME_DIR"},                \
                { EXT4_FC_REASON_FALLOC_RANGE,        "FALLOC_RANGE"},        \
                { EXT4_FC_REASON_INODE_JOURNAL_DATA,        "INODE_JOURNAL_DATA"}, \
                { EXT4_FC_REASON_ENCRYPTED_FILENAME,        "ENCRYPTED_FILENAME"})

/*
 * Export the CR_* enum values through TRACE_DEFINE_ENUM(); these are the
 * same five values that show_criteria() maps to names.
 */
TRACE_DEFINE_ENUM(CR_POWER2_ALIGNED);
TRACE_DEFINE_ENUM(CR_GOAL_LEN_FAST);
TRACE_DEFINE_ENUM(CR_BEST_AVAIL_LEN);
TRACE_DEFINE_ENUM(CR_GOAL_LEN_SLOW);
TRACE_DEFINE_ENUM(CR_ANY_FREE);

/*
 * show_criteria - translate a single CR_* value into its name.
 *
 * Uses __print_symbolic(); each printed name is the enumerator's own
 * identifier spelled out as a string.
 */
#define show_criteria(cr)                                               \
        __print_symbolic(cr,                                            \
                         { CR_POWER2_ALIGNED, "CR_POWER2_ALIGNED" },        \
                         { CR_GOAL_LEN_FAST, "CR_GOAL_LEN_FAST" },      \
                         { CR_BEST_AVAIL_LEN, "CR_BEST_AVAIL_LEN" },    \
                         { CR_GOAL_LEN_SLOW, "CR_GOAL_LEN_SLOW" },      \
                         { CR_ANY_FREE, "CR_ANY_FREE" })

/*
 * ext4_other_inode_update_time - event taking an inode and a second,
 * caller-supplied inode number (orig_ino).
 *
 * Records the device (from inode->i_sb), the inode number, owner uid/gid
 * (via i_uid_read()/i_gid_read()) and mode of @inode, together with
 * @orig_ino. Inode numbers are cast to unsigned long for printing and the
 * mode is printed in octal.
 */
TRACE_EVENT(ext4_other_inode_update_time,
        TP_PROTO(struct inode *inode, ino_t orig_ino),

        TP_ARGS(inode, orig_ino),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        ino_t,        orig_ino                )
                __field(        uid_t,        uid                        )
                __field(        gid_t,        gid                        )
                __field(        __u16, mode                        )
        ),

        TP_fast_assign(
                __entry->orig_ino = orig_ino;
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->uid        = i_uid_read(inode);
                __entry->gid        = i_gid_read(inode);
                __entry->mode        = inode->i_mode;
        ),

        TP_printk("dev %d,%d orig_ino %lu ino %lu mode 0%o uid %u gid %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->orig_ino,
                  (unsigned long) __entry->ino, __entry->mode,
                  __entry->uid, __entry->gid)
);

/*
 * ext4_free_inode - event taking the inode being reported.
 *
 * Records the device, inode number, owner uid/gid, i_blocks (as a 64-bit
 * value) and mode of @inode. The mode is printed in octal and the block
 * count with %llu.
 */
TRACE_EVENT(ext4_free_inode,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        uid_t,        uid                        )
                __field(        gid_t,        gid                        )
                __field(        __u64, blocks                        )
                __field(        __u16, mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->uid        = i_uid_read(inode);
                __entry->gid        = i_gid_read(inode);
                __entry->blocks        = inode->i_blocks;
                __entry->mode        = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->mode,
                  __entry->uid, __entry->gid, __entry->blocks)
);

/*
 * ext4_request_inode - event taking a directory inode and a mode.
 *
 * Records the device and inode number of @dir plus @mode. @mode arrives as
 * an int but is stored in a 16-bit field, so only its low 16 bits are kept;
 * it is printed in octal.
 */
TRACE_EVENT(ext4_request_inode,
        TP_PROTO(struct inode *dir, int mode),

        TP_ARGS(dir, mode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        dir                        )
                __field(        __u16, mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = dir->i_sb->s_dev;
                __entry->dir        = dir->i_ino;
                __entry->mode        = mode;
        ),

        TP_printk("dev %d,%d dir %lu mode 0%o",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->dir, __entry->mode)
);

/*
 * ext4_allocate_inode - event taking an inode, a directory inode and a mode.
 *
 * Records the inode numbers of @inode and @dir plus @mode. The device is
 * taken from @inode's superblock, not from @dir. @mode arrives as an int
 * and is stored in a 16-bit field; it is printed in octal.
 */
TRACE_EVENT(ext4_allocate_inode,
        TP_PROTO(struct inode *inode, struct inode *dir, int mode),

        TP_ARGS(inode, dir, mode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        ino_t,        dir                        )
                __field(        __u16,        mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->dir        = dir->i_ino;
                __entry->mode        = mode;
        ),

        TP_printk("dev %d,%d ino %lu dir %lu mode 0%o",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->dir, __entry->mode)
);

/*
 * ext4_evict_inode - event taking the inode being reported.
 *
 * Records the device, the inode number and the inode's i_nlink, which is
 * stored as an int and printed with %d.
 */
TRACE_EVENT(ext4_evict_inode,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        int,        nlink                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->nlink        = inode->i_nlink;
        ),

        TP_printk("dev %d,%d ino %lu nlink %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->nlink)
);

/*
 * ext4_drop_inode - event taking an inode and a caller-supplied integer.
 *
 * Records the device, the inode number and @drop exactly as passed in.
 */
TRACE_EVENT(ext4_drop_inode,
        TP_PROTO(struct inode *inode, int drop),

        TP_ARGS(inode, drop),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        int,        drop                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->drop        = drop;
        ),

        TP_printk("dev %d,%d ino %lu drop %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->drop)
);

/*
 * ext4_nfs_commit_metadata - event taking the inode being reported.
 *
 * Records only the device and the inode number.
 */
TRACE_EVENT(ext4_nfs_commit_metadata,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
        ),

        TP_printk("dev %d,%d ino %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino)
);

/*
 * ext4_mark_inode_dirty - event taking an inode and an instruction address.
 *
 * Records the device, the inode number and @IP. The address is stored as an
 * unsigned long and printed as "caller" through the %pS pointer format.
 */
TRACE_EVENT(ext4_mark_inode_dirty,
        TP_PROTO(struct inode *inode, unsigned long IP),

        TP_ARGS(inode, IP),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(unsigned long,        ip                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->ip        = IP;
        ),

        TP_printk("dev %d,%d ino %lu caller %pS",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, (void *)__entry->ip)
);

/*
 * ext4_begin_ordered_truncate - event taking an inode and a new size.
 *
 * Records the device, the inode number and @new_size, which is kept as a
 * loff_t and printed as a signed 64-bit value (%lld).
 */
TRACE_EVENT(ext4_begin_ordered_truncate,
        TP_PROTO(struct inode *inode, loff_t new_size),

        TP_ARGS(inode, new_size),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        new_size                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->new_size        = new_size;
        ),

        TP_printk("dev %d,%d ino %lu new_size %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->new_size)
);

/*
 * ext4__write_begin - event class taking an inode, a file position and a
 * length.
 *
 * Records the device, the inode number, @pos (loff_t, printed with %lld)
 * and @len (unsigned int). The events ext4_write_begin and
 * ext4_da_write_begin are defined from this class.
 */
DECLARE_EVENT_CLASS(ext4__write_begin,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len),

        TP_ARGS(inode, pos, len),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        pos                        )
                __field(        unsigned int, len                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->pos        = pos;
                __entry->len        = len;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld len %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos, __entry->len)
);

/*
 * ext4_write_begin - event defined from the ext4__write_begin class, with
 * the same prototype (inode, pos, len).
 */
DEFINE_EVENT(ext4__write_begin, ext4_write_begin,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len),

        TP_ARGS(inode, pos, len)
);

/*
 * ext4_da_write_begin - event defined from the ext4__write_begin class,
 * with the same prototype (inode, pos, len).
 */
DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len),

        TP_ARGS(inode, pos, len)
);

/*
 * ext4__write_end - event class taking an inode, a file position, a length
 * and a copied count.
 *
 * Records the device, the inode number, @pos (loff_t, printed with %lld),
 * @len and @copied (both unsigned int). The events ext4_write_end,
 * ext4_journalled_write_end and ext4_da_write_end are defined from this
 * class.
 */
DECLARE_EVENT_CLASS(ext4__write_end,
        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                        unsigned int copied),

        TP_ARGS(inode, pos, len, copied),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        pos                        )
                __field(        unsigned int, len                )
                __field(        unsigned int, copied                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->pos        = pos;
                __entry->len        = len;
                __entry->copied        = copied;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos, __entry->len, __entry->copied)
);

/*
 * ext4_write_end - event defined from the ext4__write_end class, with the
 * same prototype (inode, pos, len, copied).
 */
DEFINE_EVENT(ext4__write_end, ext4_write_end,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),

        TP_ARGS(inode, pos, len, copied)
);

/*
 * ext4_journalled_write_end - event defined from the ext4__write_end class,
 * with the same prototype (inode, pos, len, copied).
 */
DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),

        TP_ARGS(inode, pos, len, copied)
);

/*
 * ext4_da_write_end - event defined from the ext4__write_end class, with
 * the same prototype (inode, pos, len, copied).
 */
DEFINE_EVENT(ext4__write_end, ext4_da_write_end,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),

        TP_ARGS(inode, pos, len, copied)
);

/*
 * ext4_writepages - event taking an inode and a writeback_control.
 *
 * Snapshots nr_to_write, pages_skipped, range_start, range_end, sync_mode,
 * for_kupdate and range_cyclic from @wbc, plus writeback_index from
 * inode->i_mapping. for_kupdate and range_cyclic are stored as char and
 * printed with %d. The printed order differs from the field declaration
 * order: writeback_index is printed last.
 */
TRACE_EVENT(ext4_writepages,
        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        long,        nr_to_write                )
                __field(        long,        pages_skipped                )
                __field(        loff_t,        range_start                )
                __field(        loff_t,        range_end                )
                __field(       pgoff_t,        writeback_index                )
                __field(        int,        sync_mode                )
                __field(        char,        for_kupdate                )
                __field(        char,        range_cyclic                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->nr_to_write        = wbc->nr_to_write;
                __entry->pages_skipped        = wbc->pages_skipped;
                __entry->range_start        = wbc->range_start;
                __entry->range_end        = wbc->range_end;
                __entry->writeback_index = inode->i_mapping->writeback_index;
                __entry->sync_mode        = wbc->sync_mode;
                __entry->for_kupdate        = wbc->for_kupdate;
                __entry->range_cyclic        = wbc->range_cyclic;
        ),

        TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
                  "range_start %lld range_end %lld sync_mode %d "
                  "for_kupdate %d range_cyclic %d writeback_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->nr_to_write,
                  __entry->pages_skipped, __entry->range_start,
                  __entry->range_end, __entry->sync_mode,
                  __entry->for_kupdate, __entry->range_cyclic,
                  (unsigned long) __entry->writeback_index)
);

/*
 * ext4_da_write_pages - event taking an inode, a first page index and a
 * writeback_control.
 *
 * Records the device, the inode number, @first_page, and nr_to_write and
 * sync_mode read from @wbc. Unlike the other pgoff_t fields in this header,
 * first_page is passed to %lu without an explicit unsigned long cast.
 */
TRACE_EVENT(ext4_da_write_pages,
        TP_PROTO(struct inode *inode, pgoff_t first_page,
                 struct writeback_control *wbc),

        TP_ARGS(inode, first_page, wbc),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(      pgoff_t,        first_page                )
                __field(         long,        nr_to_write                )
                __field(          int,        sync_mode                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->first_page        = first_page;
                __entry->nr_to_write        = wbc->nr_to_write;
                __entry->sync_mode        = wbc->sync_mode;
        ),

        TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
                  "sync_mode %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->first_page,
                  __entry->nr_to_write, __entry->sync_mode)
);

/*
 * ext4_da_write_pages_extent - event taking an inode and an ext4_map_blocks.
 *
 * Records the device, the inode number, and m_lblk, m_len and m_flags read
 * from @map. The logical block is widened to 64 bits for storage; the flags
 * are decoded with show_mflags() when printed.
 */
TRACE_EVENT(ext4_da_write_pages_extent,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map),

        TP_ARGS(inode, map),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        lblk                        )
                __field(        __u32,        len                        )
                __field(        __u32,        flags                        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->lblk                = map->m_lblk;
                __entry->len                = map->m_len;
                __entry->flags                = map->m_flags;
        ),

        TP_printk("dev %d,%d ino %lu lblk %llu len %u flags %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->lblk, __entry->len,
                  show_mflags(__entry->flags))
);

/*
 * ext4_writepages_result - event taking an inode, a writeback_control, a
 * return code and a count of pages written.
 *
 * Records @ret and @pages_written as passed in, pages_skipped and sync_mode
 * read from @wbc, and writeback_index read from inode->i_mapping. The
 * printed order differs from the field declaration order: writeback_index
 * is printed last.
 */
TRACE_EVENT(ext4_writepages_result,
        TP_PROTO(struct inode *inode, struct writeback_control *wbc,
                        int ret, int pages_written),

        TP_ARGS(inode, wbc, ret, pages_written),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        int,        ret                        )
                __field(        int,        pages_written                )
                __field(        long,        pages_skipped                )
                __field(       pgoff_t,        writeback_index                )
                __field(        int,        sync_mode                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->ret                = ret;
                __entry->pages_written        = pages_written;
                __entry->pages_skipped        = wbc->pages_skipped;
                __entry->writeback_index = inode->i_mapping->writeback_index;
                __entry->sync_mode        = wbc->sync_mode;
        ),

        TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
                  "sync_mode %d writeback_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->ret,
                  __entry->pages_written, __entry->pages_skipped,
                  __entry->sync_mode,
                  (unsigned long) __entry->writeback_index)
);

/*
 * ext4__folio_op - event class taking an inode and a folio.
 *
 * Records the device and inode number from @inode and the index from
 * @folio; the index is printed as "folio_index". The events ext4_read_folio
 * and ext4_release_folio are defined from this class.
 */
DECLARE_EVENT_CLASS(ext4__folio_op,
        TP_PROTO(struct inode *inode, struct folio *folio),

        TP_ARGS(inode, folio),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        pgoff_t, index                        )

        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->index        = folio->index;
        ),

        TP_printk("dev %d,%d ino %lu folio_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->index)
);

/*
 * ext4_read_folio - event defined from the ext4__folio_op class, with the
 * same prototype (inode, folio).
 */
DEFINE_EVENT(ext4__folio_op, ext4_read_folio,

        TP_PROTO(struct inode *inode, struct folio *folio),

        TP_ARGS(inode, folio)
);

/*
 * ext4_release_folio - event defined from the ext4__folio_op class, with
 * the same prototype (inode, folio).
 */
DEFINE_EVENT(ext4__folio_op, ext4_release_folio,

        TP_PROTO(struct inode *inode, struct folio *folio),

        TP_ARGS(inode, folio)
);

/*
 * ext4_invalidate_folio_op - event class taking a folio, an offset and a
 * length.
 *
 * No inode is passed in: the device and inode number are reached through
 * folio->mapping->host, which is dereferenced unconditionally, so @folio
 * must have a valid mapping when the event fires. @offset and @length are
 * stored as size_t and printed with %zu. The events ext4_invalidate_folio
 * and ext4_journalled_invalidate_folio are defined from this class.
 */
DECLARE_EVENT_CLASS(ext4_invalidate_folio_op,
        TP_PROTO(struct folio *folio, size_t offset, size_t length),

        TP_ARGS(folio, offset, length),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        pgoff_t, index                        )
                __field(        size_t, offset                        )
                __field(        size_t, length                        )
        ),

        TP_fast_assign(
                __entry->dev        = folio->mapping->host->i_sb->s_dev;
                __entry->ino        = folio->mapping->host->i_ino;
                __entry->index        = folio->index;
                __entry->offset        = offset;
                __entry->length        = length;
        ),

        TP_printk("dev %d,%d ino %lu folio_index %lu offset %zu length %zu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->index,
                  __entry->offset, __entry->length)
);

/*
 * ext4_invalidate_folio - event defined from the ext4_invalidate_folio_op
 * class, with the same prototype (folio, offset, length).
 */
DEFINE_EVENT(ext4_invalidate_folio_op, ext4_invalidate_folio,
        TP_PROTO(struct folio *folio, size_t offset, size_t length),

        TP_ARGS(folio, offset, length)
);

/*
 * ext4_journalled_invalidate_folio - event defined from the
 * ext4_invalidate_folio_op class, with the same prototype
 * (folio, offset, length).
 */
DEFINE_EVENT(ext4_invalidate_folio_op, ext4_journalled_invalidate_folio,
        TP_PROTO(struct folio *folio, size_t offset, size_t length),

        TP_ARGS(folio, offset, length)
);

/*
 * ext4_discard_blocks - event taking a superblock, a block number and a
 * count.
 *
 * Records the device from @sb and stores @blk and @count as 64-bit values,
 * printed with %llu. No inode is associated with this event.
 */
TRACE_EVENT(ext4_discard_blocks,
        TP_PROTO(struct super_block *sb, unsigned long long blk,
                        unsigned long long count),

        TP_ARGS(sb, blk, count),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u64,        blk                        )
                __field(        __u64,        count                        )

        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->blk        = blk;
                __entry->count        = count;
        ),

        TP_printk("dev %d,%d blk %llu count %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->blk, __entry->count)
);

/*
 * ext4__mb_new_pa - event class taking an allocation context and a
 * preallocation space.
 *
 * Records the device from ac->ac_sb and the inode number from ac->ac_inode
 * (dereferenced unconditionally), plus pa_pstart, pa_lstart and pa_len read
 * from @pa. The printed order is pstart, len, lstart, which differs from
 * the field declaration order. The events ext4_mb_new_inode_pa and
 * ext4_mb_new_group_pa are defined from this class.
 */
DECLARE_EVENT_CLASS(ext4__mb_new_pa,
        TP_PROTO(struct ext4_allocation_context *ac,
                 struct ext4_prealloc_space *pa),

        TP_ARGS(ac, pa),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        pa_pstart                )
                __field(        __u64,        pa_lstart                )
                __field(        __u32,        pa_len                        )

        ),

        TP_fast_assign(
                __entry->dev                = ac->ac_sb->s_dev;
                __entry->ino                = ac->ac_inode->i_ino;
                __entry->pa_pstart        = pa->pa_pstart;
                __entry->pa_lstart        = pa->pa_lstart;
                __entry->pa_len                = pa->pa_len;
        ),

        TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart)
);

/* ext4_mb_new_inode_pa: event instance of the ext4__mb_new_pa class. */
DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa,
        TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa),
        TP_ARGS(ac, pa)
);

/* ext4_mb_new_group_pa: event instance of the ext4__mb_new_pa class. */
DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa,
        TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa),
        TP_ARGS(ac, pa)
);

/*
 * ext4_mb_release_inode_pa: records a block number and count alongside the
 * device and inode number of the inode referenced by @pa (pa->pa_inode).
 */
TRACE_EVENT(ext4_mb_release_inode_pa,
        TP_PROTO(struct ext4_prealloc_space *pa,
                 unsigned long long block,
                 unsigned int count),

        TP_ARGS(pa, block, count),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, block)
                __field(__u32, count)
        ),

        TP_fast_assign(
                __entry->dev = pa->pa_inode->i_sb->s_dev;
                __entry->ino = pa->pa_inode->i_ino;
                __entry->block = block;
                __entry->count = count;
        ),

        TP_printk("dev %d,%d ino %lu block %llu count %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->block,
                  __entry->count)
);

/*
 * ext4_mb_release_group_pa: records the device of @sb plus the pa_pstart
 * and pa_len values of the preallocation space @pa.
 */
TRACE_EVENT(ext4_mb_release_group_pa,
        TP_PROTO(struct super_block *sb,
                 struct ext4_prealloc_space *pa),

        TP_ARGS(sb, pa),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(__u64, pa_pstart)
                __field(__u32, pa_len)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->pa_pstart = pa->pa_pstart;
                __entry->pa_len = pa->pa_len;
        ),

        TP_printk("dev %d,%d pstart %llu len %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->pa_pstart,
                  __entry->pa_len)
);

/*
 * ext4_discard_preallocations: records the device and inode number of
 * @inode together with the caller-supplied @len.
 */
TRACE_EVENT(ext4_discard_preallocations,
        TP_PROTO(struct inode *inode,
                 unsigned int len),

        TP_ARGS(inode, len),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(unsigned int, len)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->len = len;
        ),

        TP_printk("dev %d,%d ino %lu len: %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->len)
);

/*
 * ext4_mb_discard_preallocations: records the device of @sb and the
 * caller-supplied @needed value.
 */
TRACE_EVENT(ext4_mb_discard_preallocations,
        TP_PROTO(struct super_block *sb,
                 int needed),

        TP_ARGS(sb, needed),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, needed)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->needed = needed;
        ),

        TP_printk("dev %d,%d needed %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->needed)
);

/*
 * ext4_request_blocks: snapshot of an allocation request.
 *
 * Copies len, logical, lleft, lright, goal, pleft, pright and flags out of
 * @ar, plus the device and inode number of ar->inode.  The flags are
 * rendered through show_mballoc_flags() when printed.
 */
TRACE_EVENT(ext4_request_blocks,
        TP_PROTO(struct ext4_allocation_request *ar),

        TP_ARGS(ar),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(unsigned int, len)
                __field(__u32, logical)
                __field(__u32, lleft)
                __field(__u32, lright)
                __field(__u64, goal)
                __field(__u64, pleft)
                __field(__u64, pright)
                __field(unsigned int, flags)
        ),

        /* Assignments follow the field declaration order above. */
        TP_fast_assign(
                __entry->dev = ar->inode->i_sb->s_dev;
                __entry->ino = ar->inode->i_ino;
                __entry->len = ar->len;
                __entry->logical = ar->logical;
                __entry->lleft = ar->lleft;
                __entry->lright = ar->lright;
                __entry->goal = ar->goal;
                __entry->pleft = ar->pleft;
                __entry->pright = ar->pright;
                __entry->flags = ar->flags;
        ),

        TP_printk("dev %d,%d ino %lu flags %s len %u lblk %u goal %llu "
                  "lleft %u lright %u pleft %llu pright %llu ",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  show_mballoc_flags(__entry->flags),
                  __entry->len,
                  __entry->logical,
                  __entry->goal,
                  __entry->lleft,
                  __entry->lright,
                  __entry->pleft,
                  __entry->pright)
);

/*
 * ext4_allocate_blocks: snapshot of an allocation request together with a
 * block number supplied by the caller.
 *
 * Copies len, logical, lleft, lright, goal, pleft, pright and flags out of
 * @ar, plus the device and inode number of ar->inode, and stores @block as
 * a 64-bit value.  The flags are rendered through show_mballoc_flags()
 * when printed.
 */
TRACE_EVENT(ext4_allocate_blocks,
        TP_PROTO(struct ext4_allocation_request *ar,
                 unsigned long long block),

        TP_ARGS(ar, block),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, block)
                __field(unsigned int, len)
                __field(__u32, logical)
                __field(__u32, lleft)
                __field(__u32, lright)
                __field(__u64, goal)
                __field(__u64, pleft)
                __field(__u64, pright)
                __field(unsigned int, flags)
        ),

        /* Assignments follow the field declaration order above. */
        TP_fast_assign(
                __entry->dev = ar->inode->i_sb->s_dev;
                __entry->ino = ar->inode->i_ino;
                __entry->block = block;
                __entry->len = ar->len;
                __entry->logical = ar->logical;
                __entry->lleft = ar->lleft;
                __entry->lright = ar->lright;
                __entry->goal = ar->goal;
                __entry->pleft = ar->pleft;
                __entry->pright = ar->pright;
                __entry->flags = ar->flags;
        ),

        TP_printk("dev %d,%d ino %lu flags %s len %u block %llu lblk %u "
                  "goal %llu lleft %u lright %u pleft %llu pright %llu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  show_mballoc_flags(__entry->flags),
                  __entry->len,
                  __entry->block,
                  __entry->logical,
                  __entry->goal,
                  __entry->lleft,
                  __entry->lright,
                  __entry->pleft,
                  __entry->pright)
);

/*
 * ext4_free_blocks: records a block number, a count and a flags word along
 * with the device, inode number and i_mode of @inode.  The flags are
 * rendered through show_free_flags() and the mode in octal when printed.
 */
TRACE_EVENT(ext4_free_blocks,
        TP_PROTO(struct inode *inode,
                 __u64 block,
                 unsigned long count,
                 int flags),

        TP_ARGS(inode, block, count, flags),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, block)
                __field(unsigned long, count)
                __field(int, flags)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->block = block;
                __entry->count = count;
                __entry->flags = flags;
                __entry->mode = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %s",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode,
                  __entry->block,
                  __entry->count,
                  show_free_flags(__entry->flags))
);

/*
 * ext4_sync_file_enter: records the device and inode number behind @file,
 * the inode number of the parent dentry, and the @datasync argument.
 *
 * The dentry is obtained from file->f_path.dentry; the parent inode number
 * is read through dentry->d_parent.
 */
TRACE_EVENT(ext4_sync_file_enter,
        TP_PROTO(struct file *file,
                 int datasync),

        TP_ARGS(file, datasync),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ino_t, parent)
                __field(int, datasync)
        ),

        TP_fast_assign(
                struct dentry *dentry = file->f_path.dentry;

                __entry->dev = dentry->d_sb->s_dev;
                __entry->ino = d_inode(dentry)->i_ino;
                __entry->parent = d_inode(dentry->d_parent)->i_ino;
                __entry->datasync = datasync;
        ),

        TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->parent,
                  __entry->datasync)
);

/*
 * ext4_sync_file_exit: records the device and inode number of @inode
 * together with the @ret value passed by the caller.
 */
TRACE_EVENT(ext4_sync_file_exit,
        TP_PROTO(struct inode *inode,
                 int ret),

        TP_ARGS(inode, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->ret = ret;
        ),

        TP_printk("dev %d,%d ino %lu ret %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->ret)
);

/*
 * ext4_sync_fs: records the device of @sb and the caller-supplied @wait
 * value.
 */
TRACE_EVENT(ext4_sync_fs,
        TP_PROTO(struct super_block *sb,
                 int wait),

        TP_ARGS(sb, wait),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, wait)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->wait = wait;
        ),

        TP_printk("dev %d,%d wait %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->wait)
);

/*
 * ext4_alloc_da_blocks: records the device and inode number of @inode and
 * the i_reserved_data_blocks value read through EXT4_I(inode).
 */
TRACE_EVENT(ext4_alloc_da_blocks,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(unsigned int, data_blocks)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
        ),

        TP_printk("dev %d,%d ino %lu reserved_data_blocks %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->data_blocks)
);

/*
 * ext4_mballoc_alloc: snapshot of an allocation context.
 *
 * Three extents are captured from @ac, each as logical/start/group/len:
 *   orig   <- ac->ac_o_ex
 *   goal   <- ac->ac_g_ex
 *   result <- ac->ac_f_ex
 * followed by the ac_found, ac_groups_scanned, ac_buddy, ac_flags, ac_tail
 * and ac_criteria values.
 *
 * Each extent is printed as "group/start/len@logical".  The final
 * "broken" value is 1 << buddy when buddy is non-zero and 0 otherwise.
 */
TRACE_EVENT(ext4_mballoc_alloc,
        TP_PROTO(struct ext4_allocation_context *ac),

        TP_ARGS(ac),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u32, orig_logical)
                __field(int, orig_start)
                __field(__u32, orig_group)
                __field(int, orig_len)
                __field(__u32, goal_logical)
                __field(int, goal_start)
                __field(__u32, goal_group)
                __field(int, goal_len)
                __field(__u32, result_logical)
                __field(int, result_start)
                __field(__u32, result_group)
                __field(int, result_len)
                __field(__u16, found)
                __field(__u16, groups)
                __field(__u16, buddy)
                __field(__u16, flags)
                __field(__u16, tail)
                __field(__u8, cr)
        ),

        TP_fast_assign(
                __entry->dev = ac->ac_inode->i_sb->s_dev;
                __entry->ino = ac->ac_inode->i_ino;

                /* Extent taken from ac_o_ex. */
                __entry->orig_logical = ac->ac_o_ex.fe_logical;
                __entry->orig_start = ac->ac_o_ex.fe_start;
                __entry->orig_group = ac->ac_o_ex.fe_group;
                __entry->orig_len = ac->ac_o_ex.fe_len;

                /* Extent taken from ac_g_ex. */
                __entry->goal_logical = ac->ac_g_ex.fe_logical;
                __entry->goal_start = ac->ac_g_ex.fe_start;
                __entry->goal_group = ac->ac_g_ex.fe_group;
                __entry->goal_len = ac->ac_g_ex.fe_len;

                /* Extent taken from ac_f_ex. */
                __entry->result_logical = ac->ac_f_ex.fe_logical;
                __entry->result_start = ac->ac_f_ex.fe_start;
                __entry->result_group = ac->ac_f_ex.fe_group;
                __entry->result_len = ac->ac_f_ex.fe_len;

                /* Scalar values, in field declaration order. */
                __entry->found = ac->ac_found;
                __entry->groups = ac->ac_groups_scanned;
                __entry->buddy = ac->ac_buddy;
                __entry->flags = ac->ac_flags;
                __entry->tail = ac->ac_tail;
                __entry->cr = ac->ac_criteria;
        ),

        TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
                  "result %u/%d/%u@%u blks %u grps %u cr %s flags %s "
                  "tail %u broken %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->orig_group,
                  __entry->orig_start,
                  __entry->orig_len,
                  __entry->orig_logical,
                  __entry->goal_group,
                  __entry->goal_start,
                  __entry->goal_len,
                  __entry->goal_logical,
                  __entry->result_group,
                  __entry->result_start,
                  __entry->result_len,
                  __entry->result_logical,
                  __entry->found,
                  __entry->groups,
                  show_criteria(__entry->cr),
                  show_mballoc_flags(__entry->flags),
                  __entry->tail,
                  __entry->buddy ? 1 << __entry->buddy : 0)
);

/*
 * ext4_mballoc_prealloc: records two extents from an allocation context.
 *
 *   orig   <- ac->ac_o_ex
 *   result <- ac->ac_b_ex
 *
 * Each extent is stored as logical/start/group/len and printed as
 * "group/start/len@logical".
 */
TRACE_EVENT(ext4_mballoc_prealloc,
        TP_PROTO(struct ext4_allocation_context *ac),

        TP_ARGS(ac),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u32, orig_logical)
                __field(int, orig_start)
                __field(__u32, orig_group)
                __field(int, orig_len)
                __field(__u32, result_logical)
                __field(int, result_start)
                __field(__u32, result_group)
                __field(int, result_len)
        ),

        TP_fast_assign(
                __entry->dev = ac->ac_inode->i_sb->s_dev;
                __entry->ino = ac->ac_inode->i_ino;

                /* Extent taken from ac_o_ex. */
                __entry->orig_logical = ac->ac_o_ex.fe_logical;
                __entry->orig_start = ac->ac_o_ex.fe_start;
                __entry->orig_group = ac->ac_o_ex.fe_group;
                __entry->orig_len = ac->ac_o_ex.fe_len;

                /* Extent taken from ac_b_ex. */
                __entry->result_logical = ac->ac_b_ex.fe_logical;
                __entry->result_start = ac->ac_b_ex.fe_start;
                __entry->result_group = ac->ac_b_ex.fe_group;
                __entry->result_len = ac->ac_b_ex.fe_len;
        ),

        TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->orig_group,
                  __entry->orig_start,
                  __entry->orig_len,
                  __entry->orig_logical,
                  __entry->result_group,
                  __entry->result_start,
                  __entry->result_len,
                  __entry->result_logical)
);

/*
 * ext4__mballoc: event class recording an extent as group/start/len on the
 * device of @sb.
 *
 * @inode may be NULL, in which case the inode number is recorded as 0.
 */
DECLARE_EVENT_CLASS(ext4__mballoc,
        TP_PROTO(struct super_block *sb, struct inode *inode,
                 ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len),

        TP_ARGS(sb, inode, group, start, len),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(int, result_start)
                __field(__u32, result_group)
                __field(int, result_len)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                /* Tolerate a missing inode: fall back to inode number 0. */
                __entry->ino = inode ? inode->i_ino : 0;
                __entry->result_start = start;
                __entry->result_group = group;
                __entry->result_len = len;
        ),

        TP_printk("dev %d,%d inode %lu extent %u/%d/%d ",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->result_group,
                  __entry->result_start,
                  __entry->result_len)
);

/* ext4_mballoc_discard: event instance of the ext4__mballoc class. */
DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard,
        TP_PROTO(struct super_block *sb, struct inode *inode,
                 ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len),
        TP_ARGS(sb, inode, group, start, len)
);

/* ext4_mballoc_free: event instance of the ext4__mballoc class. */
DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free,
        TP_PROTO(struct super_block *sb, struct inode *inode,
                 ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len),
        TP_ARGS(sb, inode, group, start, len)
);

/*
 * ext4_forget: records a block number and an is_metadata indicator along
 * with the device, inode number and i_mode of @inode.  The mode is printed
 * in octal.
 */
TRACE_EVENT(ext4_forget,
        TP_PROTO(struct inode *inode,
                 int is_metadata,
                 __u64 block),

        TP_ARGS(inode, is_metadata, block),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, block)
                __field(int, is_metadata)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->block = block;
                __entry->is_metadata = is_metadata;
                __entry->mode = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode,
                  __entry->is_metadata,
                  __entry->block)
);

/*
 * ext4_da_update_reserve_space: records the caller-supplied @used_blocks
 * and @quota_claim values next to the inode's device, inode number,
 * i_mode, i_blocks and the i_reserved_data_blocks value read through
 * EXT4_I(inode).
 */
TRACE_EVENT(ext4_da_update_reserve_space,
        TP_PROTO(struct inode *inode,
                 int used_blocks,
                 int quota_claim),

        TP_ARGS(inode, used_blocks, quota_claim),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, i_blocks)
                __field(int, used_blocks)
                __field(int, reserved_data_blocks)
                __field(int, quota_claim)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->i_blocks = inode->i_blocks;
                __entry->used_blocks = used_blocks;
                __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
                __entry->quota_claim = quota_claim;
                __entry->mode = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d "
                  "reserved_data_blocks %d quota_claim %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode,
                  __entry->i_blocks,
                  __entry->used_blocks,
                  __entry->reserved_data_blocks,
                  __entry->quota_claim)
);

/*
 * ext4_da_reserve_space: records the inode's device, inode number, i_mode,
 * i_blocks and the i_reserved_data_blocks value read through
 * EXT4_I(inode).
 */
TRACE_EVENT(ext4_da_reserve_space,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, i_blocks)
                __field(int, reserved_data_blocks)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->i_blocks = inode->i_blocks;
                __entry->reserved_data_blocks =
                        EXT4_I(inode)->i_reserved_data_blocks;
                __entry->mode = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu "
                  "reserved_data_blocks %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode,
                  __entry->i_blocks,
                  __entry->reserved_data_blocks)
);

/*
 * ext4_da_release_space: records the caller-supplied @freed_blocks value
 * next to the inode's device, inode number, i_mode, i_blocks and the
 * i_reserved_data_blocks value read through EXT4_I(inode).
 */
TRACE_EVENT(ext4_da_release_space,
        TP_PROTO(struct inode *inode,
                 int freed_blocks),

        TP_ARGS(inode, freed_blocks),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, i_blocks)
                __field(int, freed_blocks)
                __field(int, reserved_data_blocks)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->i_blocks = inode->i_blocks;
                __entry->freed_blocks = freed_blocks;
                __entry->reserved_data_blocks =
                        EXT4_I(inode)->i_reserved_data_blocks;
                __entry->mode = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d "
                  "reserved_data_blocks %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode,
                  __entry->i_blocks,
                  __entry->freed_blocks,
                  __entry->reserved_data_blocks)
);

/*
 * ext4__bitmap_load: event class recording a device and a group number.
 *
 * @group arrives as unsigned long but is stored in a __u32 field.
 */
DECLARE_EVENT_CLASS(ext4__bitmap_load,
        TP_PROTO(struct super_block *sb,
                 unsigned long group),

        TP_ARGS(sb, group),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(__u32, group)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->group = group;
        ),

        TP_printk("dev %d,%d group %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->group)
);

/* ext4_mb_bitmap_load: event instance of the ext4__bitmap_load class. */
DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load,
        TP_PROTO(struct super_block *sb,
                 unsigned long group),
        TP_ARGS(sb, group)
);

/* ext4_mb_buddy_bitmap_load: event instance of the ext4__bitmap_load class. */
DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load,
        TP_PROTO(struct super_block *sb,
                 unsigned long group),
        TP_ARGS(sb, group)
);

/* ext4_load_inode_bitmap: event instance of the ext4__bitmap_load class. */
DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap,
        TP_PROTO(struct super_block *sb,
                 unsigned long group),
        TP_ARGS(sb, group)
);

/*
 * ext4_read_block_bitmap_load: records a device, a group number and a
 * prefetch flag.
 *
 * @group arrives as unsigned long but is stored in a __u32 field; the
 * boolean @prefetch is printed with %d.
 */
TRACE_EVENT(ext4_read_block_bitmap_load,
        TP_PROTO(struct super_block *sb,
                 unsigned long group,
                 bool prefetch),

        TP_ARGS(sb, group, prefetch),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(__u32, group)
                __field(bool, prefetch)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->group = group;
                __entry->prefetch = prefetch;
        ),

        TP_printk("dev %d,%d group %u prefetch %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->group,
                  __entry->prefetch)
);

/*
 * ext4__fallocate_mode: event class recording an offset, a length and a
 * mode word for @inode, along with its device and inode number.  The mode
 * is rendered through show_falloc_mode() when printed.
 */
DECLARE_EVENT_CLASS(ext4__fallocate_mode,
        TP_PROTO(struct inode *inode,
                 loff_t offset,
                 loff_t len,
                 int mode),

        TP_ARGS(inode, offset, len, mode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(loff_t, offset)
                __field(loff_t, len)
                __field(int, mode)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->offset = offset;
                __entry->len = len;
                __entry->mode = mode;
        ),

        TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->offset,
                  __entry->len,
                  show_falloc_mode(__entry->mode))
);

/* ext4_fallocate_enter: event instance of the ext4__fallocate_mode class. */
DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter,
        TP_PROTO(struct inode *inode,
                 loff_t offset,
                 loff_t len,
                 int mode),
        TP_ARGS(inode, offset, len, mode)
);

/* ext4_punch_hole: event instance of the ext4__fallocate_mode class. */
DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole,
        TP_PROTO(struct inode *inode,
                 loff_t offset,
                 loff_t len,
                 int mode),
        TP_ARGS(inode, offset, len, mode)
);

/* ext4_zero_range: event instance of the ext4__fallocate_mode class. */
DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range,
        TP_PROTO(struct inode *inode,
                 loff_t offset,
                 loff_t len,
                 int mode),
        TP_ARGS(inode, offset, len, mode)
);

/*
 * ext4_fallocate_exit: records the device and inode number of @inode plus
 * three caller-supplied values: @offset (stored as "pos"), @max_blocks
 * (stored as "blocks") and @ret.
 */
TRACE_EVENT(ext4_fallocate_exit,
        TP_PROTO(struct inode *inode,
                 loff_t offset,
                 unsigned int max_blocks,
                 int ret),

        TP_ARGS(inode, offset, max_blocks, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(loff_t, pos)
                __field(unsigned int, blocks)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->pos = offset;
                __entry->blocks = max_blocks;
                __entry->ret = ret;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos,
                  __entry->blocks,
                  __entry->ret)
);

/*
 * ext4_unlink_enter: records the device of @dentry, the inode number and
 * i_size of d_inode(dentry), and the inode number of @parent.
 */
TRACE_EVENT(ext4_unlink_enter,
        TP_PROTO(struct inode *parent,
                 struct dentry *dentry),

        TP_ARGS(parent, dentry),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ino_t, parent)
                __field(loff_t, size)
        ),

        TP_fast_assign(
                __entry->dev = dentry->d_sb->s_dev;
                __entry->ino = d_inode(dentry)->i_ino;
                __entry->parent = parent->i_ino;
                __entry->size = d_inode(dentry)->i_size;
        ),

        TP_printk("dev %d,%d ino %lu size %lld parent %lu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->size,
                  (unsigned long) __entry->parent)
);

/*
 * ext4_unlink_exit: records the device of @dentry, the inode number of
 * d_inode(dentry) and the caller-supplied @ret value.
 */
TRACE_EVENT(ext4_unlink_exit,
        TP_PROTO(struct dentry *dentry,
                 int ret),

        TP_ARGS(dentry, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->dev = dentry->d_sb->s_dev;
                __entry->ino = d_inode(dentry)->i_ino;
                __entry->ret = ret;
        ),

        TP_printk("dev %d,%d ino %lu ret %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->ret)
);

/*
 * ext4__truncate: event class recording the device, inode number and
 * i_blocks value of @inode.
 */
DECLARE_EVENT_CLASS(ext4__truncate,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, blocks)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->blocks = inode->i_blocks;
        ),

        TP_printk("dev %d,%d ino %lu blocks %llu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->blocks)
);

/*
 * ext4_truncate_enter - instance of the ext4__truncate class; it records
 * and prints exactly the fields that class defines.
 */
DEFINE_EVENT(ext4__truncate, ext4_truncate_enter,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * ext4_truncate_exit - instance of the ext4__truncate class; it records
 * and prints exactly the fields that class defines.
 */
DEFINE_EVENT(ext4__truncate, ext4_truncate_exit,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * ext4_ext_convert_to_initialized_enter - record a mapping request together
 * with the extent 'ux'.
 *
 * @inode: inode whose device and inode number are captured
 * @map:   mapping request; m_lblk and m_len are captured
 * @ux:    the unwritten extent; its logical start (ee_block converted with
 *         le32_to_cpu()), the length returned by ext4_ext_get_actual_len()
 *         and the block returned by ext4_ext_pblock() are captured
 */
TRACE_EVENT(ext4_ext_convert_to_initialized_enter,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
                 struct ext4_extent *ux),

        TP_ARGS(inode, map, ux),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        m_lblk        )
                __field(        unsigned,        m_len        )
                __field(        ext4_lblk_t,        u_lblk        )
                __field(        unsigned,        u_len        )
                __field(        ext4_fsblk_t,        u_pblk        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->m_lblk                = map->m_lblk;
                __entry->m_len                = map->m_len;
                __entry->u_lblk                = le32_to_cpu(ux->ee_block);
                __entry->u_len                = ext4_ext_get_actual_len(ux);
                __entry->u_pblk                = ext4_ext_pblock(ux);
        ),

        TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u "
                  "u_pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->m_lblk, __entry->m_len,
                  __entry->u_lblk, __entry->u_len, __entry->u_pblk)
);

/*
 * ext4_ext_convert_to_initialized_fastpath - record a mapping request
 * together with two extents.
 *
 * 'ux' is the unwritten extent.
 * 'ix' is the initialized extent to which blocks are transferred.
 *
 * @inode: inode whose device and inode number are captured
 * @map:   mapping request; m_lblk and m_len are captured
 * @ux:    unwritten extent; logical start, actual length and physical
 *         block are captured as u_lblk / u_len / u_pblk
 * @ix:    initialized extent; the same three values are captured as
 *         i_lblk / i_len / i_pblk
 *
 * The format string deliberately has no trailing blank so the emitted
 * trace line does not end in whitespace.
 */
TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
                 struct ext4_extent *ux, struct ext4_extent *ix),

        TP_ARGS(inode, map, ux, ix),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        m_lblk        )
                __field(        unsigned,        m_len        )
                __field(        ext4_lblk_t,        u_lblk        )
                __field(        unsigned,        u_len        )
                __field(        ext4_fsblk_t,        u_pblk        )
                __field(        ext4_lblk_t,        i_lblk        )
                __field(        unsigned,        i_len        )
                __field(        ext4_fsblk_t,        i_pblk        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->m_lblk                = map->m_lblk;
                __entry->m_len                = map->m_len;
                __entry->u_lblk                = le32_to_cpu(ux->ee_block);
                __entry->u_len                = ext4_ext_get_actual_len(ux);
                __entry->u_pblk                = ext4_ext_pblock(ux);
                __entry->i_lblk                = le32_to_cpu(ix->ee_block);
                __entry->i_len                = ext4_ext_get_actual_len(ix);
                __entry->i_pblk                = ext4_ext_pblock(ix);
        ),

        TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u "
                  "u_lblk %u u_len %u u_pblk %llu "
                  "i_lblk %u i_len %u i_pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->m_lblk, __entry->m_len,
                  __entry->u_lblk, __entry->u_len, __entry->u_pblk,
                  __entry->i_lblk, __entry->i_len, __entry->i_pblk)
);

/*
 * ext4__map_blocks_enter - event class for the map_blocks "enter" events.
 *
 * @inode: inode whose device and inode number are captured
 * @lblk:  logical block stored verbatim
 * @len:   length stored verbatim
 * @flags: flags stored verbatim; rendered through show_map_flags() when
 *         the event is printed
 */
DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
                 unsigned int len, unsigned int flags),

        TP_ARGS(inode, lblk, len, flags),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        unsigned int,        len                )
                __field(        unsigned int,        flags                )
        ),

        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
                __entry->lblk        = lblk;
                __entry->len        = len;
                __entry->flags        = flags;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u len %u flags %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->len, show_map_flags(__entry->flags))
);

/*
 * ext4_ext_map_blocks_enter - instance of the ext4__map_blocks_enter class;
 * it records and prints exactly the fields that class defines.
 */
DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
                 unsigned len, unsigned flags),

        TP_ARGS(inode, lblk, len, flags)
);

/*
 * ext4_ind_map_blocks_enter - instance of the ext4__map_blocks_enter class;
 * it records and prints exactly the fields that class defines.
 */
DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
                 unsigned len, unsigned flags),

        TP_ARGS(inode, lblk, len, flags)
);

/*
 * ext4__map_blocks_exit - event class for the map_blocks "exit" events.
 *
 * @inode: inode whose device and inode number are captured
 * @flags: caller flags stored verbatim; printed via show_map_flags()
 * @map:   mapping; m_pblk, m_lblk, m_len and m_flags are captured
 *         (m_flags is stored as "mflags" and printed via show_mflags())
 * @ret:   integer stored verbatim and printed as "ret %d"
 */
DECLARE_EVENT_CLASS(ext4__map_blocks_exit,
        TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map,
                 int ret),

        TP_ARGS(inode, flags, map, ret),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        unsigned int,        flags                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        unsigned int,        len                )
                __field(        unsigned int,        mflags                )
                __field(        int,                ret                )
        ),

        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
                __entry->flags        = flags;
                __entry->pblk        = map->m_pblk;
                __entry->lblk        = map->m_lblk;
                __entry->len        = map->m_len;
                __entry->mflags        = map->m_flags;
                __entry->ret        = ret;
        ),

        TP_printk("dev %d,%d ino %lu flags %s lblk %u pblk %llu len %u "
                  "mflags %s ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  show_map_flags(__entry->flags), __entry->lblk, __entry->pblk,
                  __entry->len, show_mflags(__entry->mflags), __entry->ret)
);

/*
 * ext4_ext_map_blocks_exit - instance of the ext4__map_blocks_exit class;
 * it records and prints exactly the fields that class defines.
 */
DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit,
        TP_PROTO(struct inode *inode, unsigned flags,
                 struct ext4_map_blocks *map, int ret),

        TP_ARGS(inode, flags, map, ret)
);

/*
 * ext4_ind_map_blocks_exit - instance of the ext4__map_blocks_exit class;
 * it records and prints exactly the fields that class defines.
 */
DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit,
        TP_PROTO(struct inode *inode, unsigned flags,
                 struct ext4_map_blocks *map, int ret),

        TP_ARGS(inode, flags, map, ret)
);

/*
 * ext4_ext_load_extent - record a logical/physical block pair for an inode.
 *
 * @inode: inode whose device and inode number are captured
 * @lblk:  logical block stored verbatim
 * @pblk:  physical block stored verbatim
 *
 * The record stores pblk before lblk, while the printed line shows lblk
 * first.
 */
TRACE_EVENT(ext4_ext_load_extent,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk),

        TP_ARGS(inode, lblk, pblk),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        ext4_lblk_t,        lblk                )
        ),

        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
                __entry->pblk        = pblk;
                __entry->lblk        = lblk;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->pblk)
);

/*
 * ext4_load_inode - record a device and an inode number supplied by the
 * caller.
 *
 * @sb:  superblock whose s_dev is captured
 * @ino: inode number stored verbatim
 *
 * The inode number is cast to unsigned long for printing, so it is
 * formatted with %lu (matching every other "ino" field in this file)
 * rather than the signed %ld.
 */
TRACE_EVENT(ext4_load_inode,
        TP_PROTO(struct super_block *sb, unsigned long ino),

        TP_ARGS(sb, ino),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                )
                __field(        ino_t,        ino                )
        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                __entry->ino                = ino;
        ),

        TP_printk("dev %d,%d ino %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino)
);

/*
 * ext4_journal_start_sb - record the parameters of a journal start request
 * identified by superblock.
 *
 * @sb:           superblock whose s_dev is captured
 * @blocks:       stored verbatim, printed as "blocks %d"
 * @rsv_blocks:   stored verbatim, printed as "rsv_blocks %d"
 * @revoke_creds: stored verbatim, printed as "revoke_creds %d"
 * @type:         stored verbatim, printed as "type %d"
 * @IP:           address stored in "ip" and printed symbolically with %pS
 *                as the caller
 */
TRACE_EVENT(ext4_journal_start_sb,
        TP_PROTO(struct super_block *sb, int blocks, int rsv_blocks,
                 int revoke_creds, int type, unsigned long IP),

        TP_ARGS(sb, blocks, rsv_blocks, revoke_creds, type, IP),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        unsigned long,        ip                )
                __field(        int,                blocks                )
                __field(        int,                rsv_blocks        )
                __field(        int,                revoke_creds        )
                __field(        int,                type                )
        ),

        TP_fast_assign(
                __entry->dev                 = sb->s_dev;
                __entry->ip                 = IP;
                __entry->blocks                 = blocks;
                __entry->rsv_blocks         = rsv_blocks;
                __entry->revoke_creds         = revoke_creds;
                __entry->type                 = type;
        ),

        TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d,"
                  " type %d, caller %pS", MAJOR(__entry->dev),
                  MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks,
                  __entry->revoke_creds, __entry->type, (void *)__entry->ip)
);

/*
 * ext4_journal_start_inode - record the parameters of a journal start
 * request identified by inode.
 *
 * @inode:        inode whose device (i_sb->s_dev) and inode number are
 *                captured
 * @blocks:       stored verbatim, printed as "blocks %d"
 * @rsv_blocks:   stored verbatim, printed as "rsv_blocks %d"
 * @revoke_creds: stored verbatim, printed as "revoke_creds %d"
 * @type:         stored verbatim, printed as "type %d"
 * @IP:           address stored in "ip" and printed symbolically with %pS
 *                as the caller
 *
 * Unlike most events here, "ino" is declared as unsigned long (not ino_t)
 * and is therefore printed without a cast.
 */
TRACE_EVENT(ext4_journal_start_inode,
        TP_PROTO(struct inode *inode, int blocks, int rsv_blocks,
                 int revoke_creds, int type, unsigned long IP),

        TP_ARGS(inode, blocks, rsv_blocks, revoke_creds, type, IP),

        TP_STRUCT__entry(
                __field(        unsigned long,        ino                )
                __field(        dev_t,                dev                )
                __field(        unsigned long,        ip                )
                __field(        int,                blocks                )
                __field(        int,                rsv_blocks        )
                __field(        int,                revoke_creds        )
                __field(        int,                type                )
        ),

        TP_fast_assign(
                __entry->dev                 = inode->i_sb->s_dev;
                __entry->ip                 = IP;
                __entry->blocks                 = blocks;
                __entry->rsv_blocks         = rsv_blocks;
                __entry->revoke_creds         = revoke_creds;
                __entry->type                 = type;
                __entry->ino                 = inode->i_ino;
        ),

        TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d,"
                  " type %d, ino %lu, caller %pS", MAJOR(__entry->dev),
                  MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks,
                  __entry->revoke_creds, __entry->type, __entry->ino,
                  (void *)__entry->ip)
);

/*
 * ext4_journal_start_reserved - record a block count and caller address
 * for a superblock.
 *
 * @sb:     superblock whose s_dev is captured
 * @blocks: stored verbatim, printed as "blocks %d"
 * @IP:     address stored in "ip" and printed symbolically with %pS as
 *          the caller
 *
 * The output uses "blocks <n>, caller <sym>" so the comma follows the
 * value, as in ext4_journal_start_sb and ext4_journal_start_inode.
 */
TRACE_EVENT(ext4_journal_start_reserved,
        TP_PROTO(struct super_block *sb, int blocks, unsigned long IP),

        TP_ARGS(sb, blocks, IP),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(unsigned long,        ip                        )
                __field(          int,        blocks                        )
        ),

        TP_fast_assign(
                __entry->dev                 = sb->s_dev;
                __entry->ip                 = IP;
                __entry->blocks                 = blocks;
        ),

        TP_printk("dev %d,%d blocks %d, caller %pS",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->blocks, (void *)__entry->ip)
);

/*
 * ext4__trim - event class shared by the trim events.
 *
 * @sb:    superblock; MAJOR() and MINOR() of s_dev are computed at record
 *         time and stored as two separate ints (other events in this file
 *         store the raw dev_t and split it when printing)
 * @group: group number stored as __u32
 * @start: stored as int
 * @len:   stored as int
 */
DECLARE_EVENT_CLASS(ext4__trim,
        TP_PROTO(struct super_block *sb,
                 ext4_group_t group,
                 ext4_grpblk_t start,
                 ext4_grpblk_t len),

        TP_ARGS(sb, group, start, len),

        TP_STRUCT__entry(
                __field(        int,        dev_major                )
                __field(        int,        dev_minor                )
                __field(        __u32,         group                        )
                __field(        int,        start                        )
                __field(        int,        len                        )
        ),

        TP_fast_assign(
                __entry->dev_major        = MAJOR(sb->s_dev);
                __entry->dev_minor        = MINOR(sb->s_dev);
                __entry->group                = group;
                __entry->start                = start;
                __entry->len                = len;
        ),

        TP_printk("dev %d,%d group %u, start %d, len %d",
                  __entry->dev_major, __entry->dev_minor,
                  __entry->group, __entry->start, __entry->len)
);

/*
 * ext4_trim_extent - instance of the ext4__trim class; it records and
 * prints exactly the fields that class defines.
 */
DEFINE_EVENT(ext4__trim, ext4_trim_extent,

        TP_PROTO(struct super_block *sb,
                 ext4_group_t group,
                 ext4_grpblk_t start,
                 ext4_grpblk_t len),

        TP_ARGS(sb, group, start, len)
);

/*
 * ext4_trim_all_free - instance of the ext4__trim class; it records and
 * prints exactly the fields that class defines.
 */
DEFINE_EVENT(ext4__trim, ext4_trim_all_free,

        TP_PROTO(struct super_block *sb,
                 ext4_group_t group,
                 ext4_grpblk_t start,
                 ext4_grpblk_t len),

        TP_ARGS(sb, group, start, len)
);

/*
 * ext4_ext_handle_unwritten_extents - record a mapping plus allocation
 * details for an inode.
 *
 * @inode:     inode whose device and inode number are captured
 * @map:       mapping; m_lblk, m_pblk and m_len are captured
 * @flags:     stored verbatim; printed via show_map_flags()
 * @allocated: unsigned count stored verbatim
 * @newblock:  block number stored verbatim as "newblk"
 *
 * "allocated" is an unsigned int and is passed to the formatter as one,
 * so it is printed with %u rather than %d.
 */
TRACE_EVENT(ext4_ext_handle_unwritten_extents,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags,
                 unsigned int allocated, ext4_fsblk_t newblock),

        TP_ARGS(inode, map, flags, allocated, newblock),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        int,                flags                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        unsigned int,        len                )
                __field(        unsigned int,        allocated        )
                __field(        ext4_fsblk_t,        newblk                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->flags                = flags;
                __entry->lblk                = map->m_lblk;
                __entry->pblk                = map->m_pblk;
                __entry->len                = map->m_len;
                __entry->allocated        = allocated;
                __entry->newblk                = newblock;
        ),

        TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %s "
                  "allocated %u newblock %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->lblk, (unsigned long long) __entry->pblk,
                  __entry->len, show_map_flags(__entry->flags),
                  (unsigned int) __entry->allocated,
                  (unsigned long long) __entry->newblk)
);

/*
 * ext4_get_implied_cluster_alloc_exit - record a mapping and a return
 * value for a superblock.
 *
 * @sb:  superblock whose s_dev is captured
 * @map: mapping; m_flags, m_lblk, m_pblk and m_len are captured
 *       (m_flags is printed via show_mflags())
 * @ret: integer stored verbatim and printed as "ret %d"
 */
TRACE_EVENT(ext4_get_implied_cluster_alloc_exit,
        TP_PROTO(struct super_block *sb, struct ext4_map_blocks *map, int ret),

        TP_ARGS(sb, map, ret),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        unsigned int,        flags        )
                __field(        ext4_lblk_t,        lblk        )
                __field(        ext4_fsblk_t,        pblk        )
                __field(        unsigned int,        len        )
                __field(        int,                ret        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->flags        = map->m_flags;
                __entry->lblk        = map->m_lblk;
                __entry->pblk        = map->m_pblk;
                __entry->len        = map->m_len;
                __entry->ret        = ret;
        ),

        TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %s ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->lblk, (unsigned long long) __entry->pblk,
                  __entry->len, show_mflags(__entry->flags), __entry->ret)
);

/*
 * ext4_ext_show_extent - record one extent (logical start, physical start,
 * length) for an inode.
 *
 * @inode: inode whose device and inode number are captured
 * @lblk:  logical block stored verbatim
 * @pblk:  physical block stored verbatim
 * @len:   length stored verbatim as unsigned short
 */
TRACE_EVENT(ext4_ext_show_extent,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
                 unsigned short len),

        TP_ARGS(inode, lblk, pblk, len),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_fsblk_t,        pblk        )
                __field(        ext4_lblk_t,        lblk        )
                __field(        unsigned short,        len        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->pblk        = pblk;
                __entry->lblk        = lblk;
                __entry->len        = len;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->lblk,
                  (unsigned long long) __entry->pblk,
                  (unsigned short) __entry->len)
);

/*
 * ext4_remove_blocks - record a block range, the extent it relates to and
 * the partial-cluster state.
 *
 * @inode: inode whose device and inode number are captured
 * @ex:    extent; its physical block (ext4_ext_pblock()), logical start
 *         (ee_block via le32_to_cpu()) and actual length
 *         (ext4_ext_get_actual_len()) are captured
 * @from:  start of the range, stored verbatim
 * @to:    end of the range, stored in an ext4_lblk_t field
 * @pc:    partial cluster; pclu, lblk and state are captured
 *
 * NOTE(review): @to is declared ext4_fsblk_t in the prototype but stored in
 * an ext4_lblk_t field and printed with %u; the prototype looks like it
 * should be ext4_lblk_t - confirm against the callers before changing it.
 *
 * The two halves of the format string are separate literals, so the second
 * one starts with a blank to keep "]" and "from" apart in the output.
 */
TRACE_EVENT(ext4_remove_blocks,
        TP_PROTO(struct inode *inode, struct ext4_extent *ex,
                 ext4_lblk_t from, ext4_fsblk_t to,
                 struct partial_cluster *pc),

        TP_ARGS(inode, ex, from, to, pc),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        from        )
                __field(        ext4_lblk_t,        to        )
                __field(        ext4_fsblk_t,        ee_pblk        )
                __field(        ext4_lblk_t,        ee_lblk        )
                __field(        unsigned short,        ee_len        )
                __field(        ext4_fsblk_t,        pc_pclu        )
                __field(        ext4_lblk_t,        pc_lblk        )
                __field(        int,                pc_state)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->from                = from;
                __entry->to                = to;
                __entry->ee_pblk        = ext4_ext_pblock(ex);
                __entry->ee_lblk        = le32_to_cpu(ex->ee_block);
                __entry->ee_len                = ext4_ext_get_actual_len(ex);
                __entry->pc_pclu        = pc->pclu;
                __entry->pc_lblk        = pc->lblk;
                __entry->pc_state        = pc->state;
        ),

        TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]"
                  " from %u to %u partial [pclu %lld lblk %u state %d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->ee_lblk,
                  (unsigned long long) __entry->ee_pblk,
                  (unsigned short) __entry->ee_len,
                  (unsigned) __entry->from,
                  (unsigned) __entry->to,
                  (long long) __entry->pc_pclu,
                  (unsigned int) __entry->pc_lblk,
                  (int) __entry->pc_state)
);

/*
 * ext4_ext_rm_leaf - record a start block, an extent and the
 * partial-cluster state.
 *
 * @inode: inode whose device and inode number are captured
 * @start: logical block stored verbatim, printed as "start_lblk"
 * @ex:    extent, printed as "last_extent"; its logical start (ee_block via
 *         le32_to_cpu()), physical block (ext4_ext_pblock()) and actual
 *         length (ext4_ext_get_actual_len()) are captured
 * @pc:    partial cluster; pclu, lblk and state are captured
 *
 * ee_len is stored as unsigned short, matching ext4_remove_blocks and the
 * (unsigned short) cast used when printing, so the raw field and the
 * printed value agree.
 *
 * The two halves of the format string are separate literals, so the second
 * one starts with a blank to keep "]" and "partial" apart in the output.
 */
TRACE_EVENT(ext4_ext_rm_leaf,
        TP_PROTO(struct inode *inode, ext4_lblk_t start,
                 struct ext4_extent *ex,
                 struct partial_cluster *pc),

        TP_ARGS(inode, start, ex, pc),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        start        )
                __field(        ext4_lblk_t,        ee_lblk        )
                __field(        ext4_fsblk_t,        ee_pblk        )
                __field(        unsigned short,        ee_len        )
                __field(        ext4_fsblk_t,        pc_pclu        )
                __field(        ext4_lblk_t,        pc_lblk        )
                __field(        int,                pc_state)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->start                = start;
                __entry->ee_lblk        = le32_to_cpu(ex->ee_block);
                __entry->ee_pblk        = ext4_ext_pblock(ex);
                __entry->ee_len                = ext4_ext_get_actual_len(ex);
                __entry->pc_pclu        = pc->pclu;
                __entry->pc_lblk        = pc->lblk;
                __entry->pc_state        = pc->state;
        ),

        TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]"
                  " partial [pclu %lld lblk %u state %d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->start,
                  (unsigned) __entry->ee_lblk,
                  (unsigned long long) __entry->ee_pblk,
                  (unsigned short) __entry->ee_len,
                  (long long) __entry->pc_pclu,
                  (unsigned int) __entry->pc_lblk,
                  (int) __entry->pc_state)
);

/*
 * ext4_ext_rm_idx - record a physical block number for an inode.
 *
 * @inode: inode whose device and inode number are captured
 * @pblk:  block number stored verbatim and printed as "index_pblk"
 */
TRACE_EVENT(ext4_ext_rm_idx,
        TP_PROTO(struct inode *inode, ext4_fsblk_t pblk),

        TP_ARGS(inode, pblk),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_fsblk_t,        pblk        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->pblk        = pblk;
        ),

        TP_printk("dev %d,%d ino %lu index_pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long long) __entry->pblk)
);

/*
 * ext4_ext_remove_space - record a logical block range and a depth for an
 * inode.
 *
 * @inode: inode whose device and inode number are captured
 * @start: stored verbatim; printed under the label "since"
 * @end:   stored verbatim; printed as "end"
 * @depth: stored verbatim; printed as "depth %d"
 */
TRACE_EVENT(ext4_ext_remove_space,
        TP_PROTO(struct inode *inode, ext4_lblk_t start,
                 ext4_lblk_t end, int depth),

        TP_ARGS(inode, start, end, depth),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        start        )
                __field(        ext4_lblk_t,        end        )
                __field(        int,                depth        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->start        = start;
                __entry->end        = end;
                __entry->depth        = depth;
        ),

        TP_printk("dev %d,%d ino %lu since %u end %u depth %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->start,
                  (unsigned) __entry->end,
                  __entry->depth)
);

/*
 * ext4_ext_remove_space_done - record a logical block range, depth,
 * partial-cluster state and an entry count for an inode.
 *
 * @inode:      inode whose device and inode number are captured
 * @start:      stored verbatim; printed under the label "since"
 * @end:        stored verbatim; printed as "end"
 * @depth:      stored verbatim; printed as "depth %d"
 * @pc:         partial cluster; pclu, lblk and state are captured
 * @eh_entries: little-endian 16-bit value; converted with le16_to_cpu()
 *              before being stored and printed as "remaining_entries"
 */
TRACE_EVENT(ext4_ext_remove_space_done,
        TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end,
                 int depth, struct partial_cluster *pc, __le16 eh_entries),

        TP_ARGS(inode, start, end, depth, pc, eh_entries),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        start                )
                __field(        ext4_lblk_t,        end                )
                __field(        int,                depth                )
                __field(        ext4_fsblk_t,        pc_pclu                )
                __field(        ext4_lblk_t,        pc_lblk                )
                __field(        int,                pc_state        )
                __field(        unsigned short,        eh_entries        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->start                = start;
                __entry->end                = end;
                __entry->depth                = depth;
                __entry->pc_pclu        = pc->pclu;
                __entry->pc_lblk        = pc->lblk;
                __entry->pc_state        = pc->state;
                __entry->eh_entries        = le16_to_cpu(eh_entries);
        ),

        TP_printk("dev %d,%d ino %lu since %u end %u depth %d "
                  "partial [pclu %lld lblk %u state %d] "
                  "remaining_entries %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->start,
                  (unsigned) __entry->end,
                  __entry->depth,
                  (long long) __entry->pc_pclu,
                  (unsigned int) __entry->pc_lblk,
                  (int) __entry->pc_state,
                  (unsigned short) __entry->eh_entries)
);

/*
 * ext4__es_extent - event class describing one extent_status entry.
 *
 * @inode: inode whose device and inode number are captured
 * @es:    extent status entry; es_lblk and es_len are stored verbatim,
 *         "pblk" is the value returned by ext4_es_show_pblock(es) and
 *         "status" is the value returned by ext4_es_status(es), stored in
 *         a char and rendered through show_extent_status() when printed
 *
 * Output: "dev <major>,<minor> ino <ino> es [<lblk>/<len>) mapped <pblk>
 * status <flags>".
 */
DECLARE_EVENT_CLASS(ext4__es_extent,
        TP_PROTO(struct inode *inode, struct extent_status *es),

        TP_ARGS(inode, es),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        ext4_lblk_t,        len                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        char, status        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->lblk        = es->es_lblk;
                __entry->len        = es->es_len;
                __entry->pblk        = ext4_es_show_pblock(es);
                __entry->status        = ext4_es_status(es);
        ),

        TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->len,
                  __entry->pblk, show_extent_status(__entry->status))
);

/*
 * ext4_es_insert_extent - instance of the ext4__es_extent class; it records
 * and prints exactly the fields that class defines.
 */
DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent,
        TP_PROTO(struct inode *inode, struct extent_status *es),

        TP_ARGS(inode, es)
);

/*
 * ext4_es_cache_extent - instance of the ext4__es_extent class; it records
 * and prints exactly the fields that class defines.
 */
DEFINE_EVENT(ext4__es_extent, ext4_es_cache_extent,
        TP_PROTO(struct inode *inode, struct extent_status *es),

        TP_ARGS(inode, es)
);

/*
 * ext4_es_remove_extent - record a logical block range for an inode.
 *
 * @inode: inode whose device and inode number are captured
 * @lblk:  start of the range
 * @len:   length of the range
 *
 * Both arguments are ext4_lblk_t but are stored in loff_t fields and
 * printed with %lld, unlike the other extent-status events in this file,
 * which keep them as ext4_lblk_t and print them with %u.
 */
TRACE_EVENT(ext4_es_remove_extent,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len),

        TP_ARGS(inode, lblk, len),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        lblk                        )
                __field(        loff_t,        len                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->lblk        = lblk;
                __entry->len        = len;
        ),

        TP_printk("dev %d,%d ino %lu es [%lld/%lld)",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->len)
);

/*
 * ext4_es_find_extent_range_enter - record the logical block passed in for
 * an inode.
 *
 * @inode: inode whose device and inode number are captured
 * @lblk:  logical block stored verbatim
 */
TRACE_EVENT(ext4_es_find_extent_range_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk),

        TP_ARGS(inode, lblk),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->lblk        = lblk;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->lblk)
);

/*
 * ext4_es_find_extent_range_exit - record the extent_status entry passed in
 * for an inode.
 *
 * @inode: inode whose device and inode number are captured
 * @es:    extent status entry whose lblk, len, pblk and status are captured
 *
 * Defined as an instance of the ext4__es_extent class, which has the same
 * prototype, record fields, assignments and print format that this event
 * previously spelled out on its own; the emitted record and text are
 * unchanged.
 */
DEFINE_EVENT(ext4__es_extent, ext4_es_find_extent_range_exit,
        TP_PROTO(struct inode *inode, struct extent_status *es),

        TP_ARGS(inode, es)
);

/*
 * ext4_es_lookup_extent_enter: records the device, the inode number and the
 * logical block handed to the tracepoint.
 */
TRACE_EVENT(ext4_es_lookup_extent_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk),

        TP_ARGS(inode, lblk),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
        ),

        TP_fast_assign(
                __entry->lblk = lblk;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk)
);

/*
 * ext4_es_lookup_extent_exit: snapshots the extent status entry @es and the
 * @found flag. The physical block and the status are always captured, but
 * they are printed as 0 when @found is zero.
 */
TRACE_EVENT(ext4_es_lookup_extent_exit,
        TP_PROTO(struct inode *inode, struct extent_status *es,
                 int found),

        TP_ARGS(inode, es, found),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
                __field(ext4_lblk_t, len)
                __field(ext4_fsblk_t, pblk)
                __field(char, status)
                __field(int, found)
        ),

        TP_fast_assign(
                __entry->found = found;
                /* inode identity */
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
                /* extent description */
                __entry->lblk = es->es_lblk;
                __entry->len = es->es_len;
                __entry->pblk = ext4_es_show_pblock(es);
                __entry->status = ext4_es_status(es);
        ),

        TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %s",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->found,
                  __entry->lblk,
                  __entry->len,
                  __entry->found ? __entry->pblk : 0,
                  show_extent_status(__entry->found ? __entry->status : 0))
);

/*
 * ext4__es_shrink_enter: event class that records the device of @sb plus the
 * nr_to_scan and cache_cnt values handed to the tracepoint.
 */
DECLARE_EVENT_CLASS(ext4__es_shrink_enter,
        TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt),

        TP_ARGS(sb, nr_to_scan, cache_cnt),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, nr_to_scan)
                __field(int, cache_cnt)
        ),

        TP_fast_assign(
                __entry->cache_cnt = cache_cnt;
                __entry->nr_to_scan = nr_to_scan;
                __entry->dev = sb->s_dev;
        ),

        TP_printk("dev %d,%d nr_to_scan %d cache_cnt %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->nr_to_scan,
                  __entry->cache_cnt)
);

/* ext4_es_shrink_count: instance of the ext4__es_shrink_enter event class. */
DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_count,
        TP_PROTO(struct super_block *sb,
                 int nr_to_scan,
                 int cache_cnt),

        TP_ARGS(sb, nr_to_scan, cache_cnt)
);

/* ext4_es_shrink_scan_enter: instance of the ext4__es_shrink_enter class. */
DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_scan_enter,
        TP_PROTO(struct super_block *sb,
                 int nr_to_scan,
                 int cache_cnt),

        TP_ARGS(sb, nr_to_scan, cache_cnt)
);

/*
 * ext4_es_shrink_scan_exit: records the device of @sb plus the nr_shrunk and
 * cache_cnt values handed to the tracepoint.
 */
TRACE_EVENT(ext4_es_shrink_scan_exit,
        TP_PROTO(struct super_block *sb, int nr_shrunk, int cache_cnt),

        TP_ARGS(sb, nr_shrunk, cache_cnt),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, nr_shrunk)
                __field(int, cache_cnt)
        ),

        TP_fast_assign(
                __entry->cache_cnt = cache_cnt;
                __entry->nr_shrunk = nr_shrunk;
                __entry->dev = sb->s_dev;
        ),

        TP_printk("dev %d,%d nr_shrunk %d cache_cnt %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->nr_shrunk,
                  __entry->cache_cnt)
);

/*
 * ext4_collapse_range: records the inode identity together with the byte
 * offset and length handed to the tracepoint.
 */
TRACE_EVENT(ext4_collapse_range,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len),

        TP_ARGS(inode, offset, len),

        TP_STRUCT__entry(
                __field(        dev_t,        dev        )
                __field(        ino_t,        ino        )
                __field(        loff_t,       offset     )
                __field(        loff_t,       len        )
        ),

        TP_fast_assign(
                /* requested range */
                __entry->len = len;
                __entry->offset = offset;
                /* inode identity */
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu offset %lld len %lld",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->offset,
                  __entry->len)
);

/*
 * ext4_insert_range: records the inode identity together with the byte
 * offset and length handed to the tracepoint.
 */
TRACE_EVENT(ext4_insert_range,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len),

        TP_ARGS(inode, offset, len),

        TP_STRUCT__entry(
                __field(        dev_t,        dev        )
                __field(        ino_t,        ino        )
                __field(        loff_t,       offset     )
                __field(        loff_t,       len        )
        ),

        TP_fast_assign(
                /* requested range */
                __entry->len = len;
                __entry->offset = offset;
                /* inode identity */
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu offset %lld len %lld",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->offset,
                  __entry->len)
);

/*
 * ext4_es_shrink: records the outcome of one shrink pass on @sb. The
 * scan_time argument is divided by 1000 before it is stored.
 */
TRACE_EVENT(ext4_es_shrink,
        TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time,
                 int nr_skipped, int retried),

        TP_ARGS(sb, nr_shrunk, scan_time, nr_skipped, retried),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, nr_shrunk)
                __field(unsigned long long, scan_time)
                __field(int, nr_skipped)
                __field(int, retried)
        ),

        TP_fast_assign(
                __entry->retried = retried;
                __entry->nr_skipped = nr_skipped;
                __entry->scan_time = div_u64(scan_time, 1000);
                __entry->nr_shrunk = nr_shrunk;
                __entry->dev = sb->s_dev;
        ),

        TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu "
                  "nr_skipped %d retried %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->nr_shrunk,
                  __entry->scan_time,
                  __entry->nr_skipped,
                  __entry->retried)
);

/*
 * ext4_es_insert_delayed_block: snapshots the extent status entry @es (start
 * block, length, physical block, status) for @inode, plus the @allocated
 * flag handed to the tracepoint.
 */
TRACE_EVENT(ext4_es_insert_delayed_block,
        TP_PROTO(struct inode *inode, struct extent_status *es,
                 bool allocated),

        TP_ARGS(inode, es, allocated),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ext4_lblk_t, lblk)
                __field(ext4_lblk_t, len)
                __field(ext4_fsblk_t, pblk)
                __field(char, status)
                __field(bool, allocated)
        ),

        TP_fast_assign(
                __entry->allocated = allocated;
                /* inode identity */
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
                /* extent description */
                __entry->lblk = es->es_lblk;
                __entry->len = es->es_len;
                __entry->pblk = ext4_es_show_pblock(es);
                __entry->status = ext4_es_status(es);
        ),

        TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
                  "allocated %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk,
                  __entry->len,
                  __entry->pblk,
                  show_extent_status(__entry->status),
                  __entry->allocated)
);

/* fsmap traces */
/*
 * ext4_fsmap_class: event class for the fsmap key/mapping tracepoints.
 *
 * Records the device of the superblock's block device, the key device
 * (decoded from its u32 form with new_decode_dev()), and the agno, bno, len
 * and owner values handed to the tracepoint.
 *
 * The format string deliberately has no trailing newline, matching every
 * other event in this file.
 */
DECLARE_EVENT_CLASS(ext4_fsmap_class,
        TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len,
                 u64 owner),
        TP_ARGS(sb, keydev, agno, bno, len, owner),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(dev_t, keydev)
                __field(u32, agno)
                __field(u64, bno)
                __field(u64, len)
                __field(u64, owner)
        ),
        TP_fast_assign(
                __entry->dev = sb->s_bdev->bd_dev;
                __entry->keydev = new_decode_dev(keydev);
                __entry->agno = agno;
                __entry->bno = bno;
                __entry->len = len;
                __entry->owner = owner;
        ),
        TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  MAJOR(__entry->keydev), MINOR(__entry->keydev),
                  __entry->agno,
                  __entry->bno,
                  __entry->len,
                  __entry->owner)
);
/*
 * DEFINE_FSMAP_EVENT(name): declares tracepoint @name as an instance of the
 * ext4_fsmap_class event class.
 */
#define DEFINE_FSMAP_EVENT(name)                                        \
DEFINE_EVENT(ext4_fsmap_class, name,                                    \
        TP_PROTO(struct super_block *sb, u32 keydev, u32 agno,          \
                 u64 bno, u64 len, u64 owner),                          \
        TP_ARGS(sb, keydev, agno, bno, len, owner))

DEFINE_FSMAP_EVENT(ext4_fsmap_low_key);
DEFINE_FSMAP_EVENT(ext4_fsmap_high_key);
DEFINE_FSMAP_EVENT(ext4_fsmap_mapping);

/*
 * ext4_getfsmap_class: event class for the getfsmap key/mapping tracepoints.
 *
 * Records the device of the superblock's block device and, from @fsmap, the
 * device (decoded with new_decode_dev()), physical block, length, owner and
 * flags.
 *
 * The format string deliberately has no trailing newline, matching every
 * other event in this file.
 */
DECLARE_EVENT_CLASS(ext4_getfsmap_class,
        TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap),
        TP_ARGS(sb, fsmap),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(dev_t, keydev)
                __field(u64, block)
                __field(u64, len)
                __field(u64, owner)
                __field(u64, flags)
        ),
        TP_fast_assign(
                __entry->dev = sb->s_bdev->bd_dev;
                __entry->keydev = new_decode_dev(fsmap->fmr_device);
                __entry->block = fsmap->fmr_physical;
                __entry->len = fsmap->fmr_length;
                __entry->owner = fsmap->fmr_owner;
                __entry->flags = fsmap->fmr_flags;
        ),
        TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld flags 0x%llx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  MAJOR(__entry->keydev), MINOR(__entry->keydev),
                  __entry->block,
                  __entry->len,
                  __entry->owner,
                  __entry->flags)
);
/*
 * DEFINE_GETFSMAP_EVENT(name): declares tracepoint @name as an instance of
 * the ext4_getfsmap_class event class.
 */
#define DEFINE_GETFSMAP_EVENT(name)                                     \
DEFINE_EVENT(ext4_getfsmap_class, name,                                 \
        TP_PROTO(struct super_block *sb,                                \
                 struct ext4_fsmap *fsmap),                             \
        TP_ARGS(sb, fsmap))

DEFINE_GETFSMAP_EVENT(ext4_getfsmap_low_key);
DEFINE_GETFSMAP_EVENT(ext4_getfsmap_high_key);
DEFINE_GETFSMAP_EVENT(ext4_getfsmap_mapping);

/*
 * ext4_shutdown: records the device of @sb and the flags value handed to the
 * tracepoint. The argument is an unsigned long but the stored field is a
 * plain unsigned, so the value is narrowed on assignment.
 */
TRACE_EVENT(ext4_shutdown,
        TP_PROTO(struct super_block *sb, unsigned long flags),

        TP_ARGS(sb, flags),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(unsigned, flags)
        ),

        TP_fast_assign(
                __entry->flags = flags;
                __entry->dev = sb->s_dev;
        ),

        TP_printk("dev %d,%d flags %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->flags)
);

/*
 * ext4_error: records the device of @sb plus the function name and line
 * number handed to the tracepoint. Only the pointer to the function name is
 * stored; the string itself is not copied into the record.
 */
TRACE_EVENT(ext4_error,
        TP_PROTO(struct super_block *sb, const char *function,
                 unsigned int line),

        TP_ARGS(sb, function, line),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(const char *, function)
                __field(unsigned, line)
        ),

        TP_fast_assign(
                __entry->line = line;
                __entry->function = function;
                __entry->dev = sb->s_dev;
        ),

        TP_printk("dev %d,%d function %s line %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->function,
                  __entry->line)
);

/*
 * ext4_prefetch_bitmaps: records the device of @sb, the group and next
 * values, and the prefetch_ios count (stored in the "ios" field).
 */
TRACE_EVENT(ext4_prefetch_bitmaps,
        TP_PROTO(struct super_block *sb, ext4_group_t group,
                 ext4_group_t next, unsigned int prefetch_ios),

        TP_ARGS(sb, group, next, prefetch_ios),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(__u32, group)
                __field(__u32, next)
                __field(__u32, ios)
        ),

        TP_fast_assign(
                __entry->ios = prefetch_ios;
                __entry->next = next;
                __entry->group = group;
                __entry->dev = sb->s_dev;
        ),

        TP_printk("dev %d,%d group %u next %u ios %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->group,
                  __entry->next,
                  __entry->ios)
);

/*
 * ext4_lazy_itable_init: records the device of @sb and the group number
 * handed to the tracepoint.
 */
TRACE_EVENT(ext4_lazy_itable_init,
        TP_PROTO(struct super_block *sb, ext4_group_t group),

        TP_ARGS(sb, group),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(__u32, group)
        ),

        TP_fast_assign(
                __entry->group = group;
                __entry->dev = sb->s_dev;
        ),

        TP_printk("dev %d,%d group %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->group)
);

/*
 * ext4_fc_replay_scan: records the device of @sb plus the error and off
 * values handed to the tracepoint.
 */
TRACE_EVENT(ext4_fc_replay_scan,
        TP_PROTO(struct super_block *sb, int error, int off),

        TP_ARGS(sb, error, off),

        TP_STRUCT__entry(
                __field(        dev_t,        dev        )
                __field(        int,          error      )
                __field(        int,          off        )
        ),

        TP_fast_assign(
                __entry->off = off;
                __entry->error = error;
                __entry->dev = sb->s_dev;
        ),

        TP_printk("dev %d,%d error %d, off %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->error,
                  __entry->off)
);

/*
 * ext4_fc_replay: records the device of @sb plus the tag, ino, priv1 and
 * priv2 values handed to the tracepoint. priv1/priv2 are printed under the
 * labels "data1"/"data2".
 */
TRACE_EVENT(ext4_fc_replay,
        TP_PROTO(struct super_block *sb, int tag, int ino,
                 int priv1, int priv2),

        TP_ARGS(sb, tag, ino, priv1, priv2),

        TP_STRUCT__entry(
                __field(        dev_t,        dev        )
                __field(        int,          tag        )
                __field(        int,          ino        )
                __field(        int,          priv1      )
                __field(        int,          priv2      )
        ),

        TP_fast_assign(
                __entry->priv2 = priv2;
                __entry->priv1 = priv1;
                __entry->ino = ino;
                __entry->tag = tag;
                __entry->dev = sb->s_dev;
        ),

        TP_printk("dev %d,%d: tag %d, ino %d, data1 %d, data2 %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->tag,
                  __entry->ino,
                  __entry->priv1,
                  __entry->priv2)
);

/*
 * ext4_fc_commit_start: records the device of @sb and the commit
 * transaction id handed to the tracepoint.
 */
TRACE_EVENT(ext4_fc_commit_start,
        TP_PROTO(struct super_block *sb, tid_t commit_tid),

        TP_ARGS(sb, commit_tid),

        TP_STRUCT__entry(
                __field(        dev_t,        dev        )
                __field(        tid_t,        tid        )
        ),

        TP_fast_assign(
                __entry->tid = commit_tid;
                __entry->dev = sb->s_dev;
        ),

        TP_printk("dev %d,%d tid %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->tid)
);

/*
 * ext4_fc_commit_stop: records the per-commit values handed to the
 * tracepoint (nblks, reason, commit tid) together with three running
 * counters read from EXT4_SB(sb)->s_fc_stats at the time of the event.
 */
TRACE_EVENT(ext4_fc_commit_stop,
        TP_PROTO(struct super_block *sb, int nblks, int reason,
                 tid_t commit_tid),

        TP_ARGS(sb, nblks, reason, commit_tid),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                )
                __field(        int,          nblks              )
                __field(        int,          reason             )
                __field(        int,          num_fc             )
                __field(        int,          num_fc_ineligible  )
                __field(        int,          nblks_agg          )
                __field(        tid_t,        tid                )
        ),

        TP_fast_assign(
                /* values supplied by the caller */
                __entry->dev = sb->s_dev;
                __entry->tid = commit_tid;
                __entry->reason = reason;
                __entry->nblks = nblks;
                /* aggregate counters from the superblock statistics */
                __entry->nblks_agg = EXT4_SB(sb)->s_fc_stats.fc_numblks;
                __entry->num_fc_ineligible =
                        EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits;
                __entry->num_fc = EXT4_SB(sb)->s_fc_stats.fc_num_commits;
        ),

        TP_printk("dev %d,%d nblks %d, reason %d, fc = %d, ineligible = %d, agg_nblks %d, tid %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->nblks,
                  __entry->reason,
                  __entry->num_fc,
                  __entry->num_fc_ineligible,
                  __entry->nblks_agg,
                  __entry->tid)
);

/*
 * Expands to the two TP_printk() arguments for one ineligibility reason:
 * its name (via show_fc_reason()) and the counter captured for it.
 */
#define FC_REASON_NAME_STAT(reason)                                        \
        show_fc_reason(reason),                                                \
        __entry->fc_ineligible_rc[reason]

/*
 * ext4_fc_stats: snapshots the fast-commit statistics kept in
 * EXT4_SB(sb)->s_fc_stats: one counter per ineligibility reason plus the
 * commit, ineligible-commit and block totals.
 *
 * The format string names ten reasons explicitly.
 * NOTE(review): it presumably has to be kept in sync with
 * EXT4_FC_REASON_MAX by hand - confirm when adding a reason.
 */
TRACE_EVENT(ext4_fc_stats,
        TP_PROTO(struct super_block *sb),

        TP_ARGS(sb),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __array(unsigned int, fc_ineligible_rc, EXT4_FC_REASON_MAX)
                __field(unsigned long, fc_commits)
                __field(unsigned long, fc_ineligible_commits)
                __field(unsigned long, fc_numblks)
        ),

        TP_fast_assign(
                int i;

                __entry->dev = sb->s_dev;
                for (i = 0; i < EXT4_FC_REASON_MAX; i++) {
                        __entry->fc_ineligible_rc[i] =
                                EXT4_SB(sb)->s_fc_stats.fc_ineligible_reason_count[i];
                }
                __entry->fc_commits = EXT4_SB(sb)->s_fc_stats.fc_num_commits;
                __entry->fc_ineligible_commits =
                        EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits;
                __entry->fc_numblks = EXT4_SB(sb)->s_fc_stats.fc_numblks;
        ),

        /*
         * Adjacent string literals are concatenated, so the reason list must
         * end with its own separator; otherwise the last counter runs
         * straight into "num_commits:".
         */
        TP_printk("dev %d,%d fc ineligible reasons:\n"
                  "%s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, "
                  "num_commits:%lu, ineligible: %lu, numblks: %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_CROSS_RENAME),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_NOMEM),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_SWAP_BOOT),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_ENCRYPTED_FILENAME),
                  __entry->fc_commits, __entry->fc_ineligible_commits,
                  __entry->fc_numblks)
);

/*
 * ext4_fc_track_dentry: event class that records the transaction id of the
 * handle's transaction, the inode number, the inode's i_sync_tid (read via
 * EXT4_I()) and the result code. @dentry is part of the prototype but is not
 * recorded.
 */
DECLARE_EVENT_CLASS(ext4_fc_track_dentry,

        TP_PROTO(handle_t *handle, struct inode *inode,
                 struct dentry *dentry, int ret),

        TP_ARGS(handle, inode, dentry, ret),

        TP_STRUCT__entry(
                __field(        dev_t,        dev         )
                __field(        tid_t,        t_tid       )
                __field(        ino_t,        i_ino       )
                __field(        tid_t,        i_sync_tid  )
                __field(        int,          error       )
        ),

        TP_fast_assign(
                __entry->error = ret;
                __entry->t_tid = handle->h_transaction->t_tid;
                __entry->dev = inode->i_sb->s_dev;
                __entry->i_ino = inode->i_ino;
                __entry->i_sync_tid = EXT4_I(inode)->i_sync_tid;
        ),

        TP_printk("dev %d,%d, t_tid %u, ino %lu, i_sync_tid %u, error %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->t_tid,
                  __entry->i_ino,
                  __entry->i_sync_tid,
                  __entry->error)
);

/*
 * DEFINE_EVENT_CLASS_DENTRY(__type): declares the tracepoint
 * ext4_fc_track_<__type> as an instance of the ext4_fc_track_dentry class.
 */
#define DEFINE_EVENT_CLASS_DENTRY(__type)                               \
DEFINE_EVENT(ext4_fc_track_dentry, ext4_fc_track_##__type,              \
        TP_PROTO(handle_t *handle,                                      \
                 struct inode *inode,                                   \
                 struct dentry *dentry,                                 \
                 int ret),                                              \
        TP_ARGS(handle, inode, dentry, ret)                             \
)

DEFINE_EVENT_CLASS_DENTRY(create);
DEFINE_EVENT_CLASS_DENTRY(link);
DEFINE_EVENT_CLASS_DENTRY(unlink);

/*
 * ext4_fc_track_inode: records the transaction id of the handle's
 * transaction, the inode number, the inode's i_sync_tid (read via EXT4_I())
 * and the result code.
 */
TRACE_EVENT(ext4_fc_track_inode,
        TP_PROTO(handle_t *handle, struct inode *inode, int ret),

        TP_ARGS(handle, inode, ret),

        TP_STRUCT__entry(
                __field(        dev_t,        dev         )
                __field(        tid_t,        t_tid       )
                __field(        ino_t,        i_ino       )
                __field(        tid_t,        i_sync_tid  )
                __field(        int,          error       )
        ),

        TP_fast_assign(
                __entry->error = ret;
                __entry->t_tid = handle->h_transaction->t_tid;
                __entry->dev = inode->i_sb->s_dev;
                __entry->i_ino = inode->i_ino;
                __entry->i_sync_tid = EXT4_I(inode)->i_sync_tid;
        ),

        TP_printk("dev %d:%d, t_tid %u, inode %lu, i_sync_tid %u, error %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->t_tid,
                  __entry->i_ino,
                  __entry->i_sync_tid,
                  __entry->error)
);

/*
 * ext4_fc_track_range: records the transaction id of the handle's
 * transaction, the inode number, the inode's i_sync_tid (read via EXT4_I()),
 * the start/end values handed to the tracepoint and the result code.
 */
TRACE_EVENT(ext4_fc_track_range,
        TP_PROTO(handle_t *handle, struct inode *inode,
                 long start, long end, int ret),

        TP_ARGS(handle, inode, start, end, ret),

        TP_STRUCT__entry(
                __field(        dev_t,        dev         )
                __field(        tid_t,        t_tid       )
                __field(        ino_t,        i_ino       )
                __field(        tid_t,        i_sync_tid  )
                __field(        long,         start       )
                __field(        long,         end         )
                __field(        int,          error       )
        ),

        TP_fast_assign(
                __entry->error = ret;
                __entry->end = end;
                __entry->start = start;
                __entry->t_tid = handle->h_transaction->t_tid;
                __entry->dev = inode->i_sb->s_dev;
                __entry->i_ino = inode->i_ino;
                __entry->i_sync_tid = EXT4_I(inode)->i_sync_tid;
        ),

        TP_printk("dev %d:%d, t_tid %u, inode %lu, i_sync_tid %u, error %d, start %ld, end %ld",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->t_tid,
                  __entry->i_ino,
                  __entry->i_sync_tid,
                  __entry->error,
                  __entry->start,
                  __entry->end)
);

/*
 * ext4_fc_cleanup: records the device of the superblock stored in
 * journal->j_private, the journal's j_fc_off, and the full/tid values handed
 * to the tracepoint.
 */
TRACE_EVENT(ext4_fc_cleanup,
        TP_PROTO(journal_t *journal, int full, tid_t tid),

        TP_ARGS(journal, full, tid),

        TP_STRUCT__entry(
                __field(        dev_t,        dev        )
                __field(        int,          j_fc_off   )
                __field(        int,          full       )
                __field(        tid_t,        tid        )
        ),

        TP_fast_assign(
                struct super_block *owner = journal->j_private;

                __entry->tid = tid;
                __entry->full = full;
                __entry->j_fc_off = journal->j_fc_off;
                __entry->dev = owner->s_dev;
        ),

        TP_printk("dev %d,%d, j_fc_off %d, full %d, tid %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->j_fc_off,
                  __entry->full,
                  __entry->tid)
);

/*
 * ext4_update_sb: records the device of @sb plus the filesystem block number
 * and flags handed to the tracepoint.
 */
TRACE_EVENT(ext4_update_sb,
        TP_PROTO(struct super_block *sb, ext4_fsblk_t fsblk,
                 unsigned int flags),

        TP_ARGS(sb, fsblk, flags),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ext4_fsblk_t, fsblk)
                __field(unsigned int, flags)
        ),

        TP_fast_assign(
                __entry->flags = flags;
                __entry->fsblk = fsblk;
                __entry->dev = sb->s_dev;
        ),

        TP_printk("dev %d,%d fsblk %llu flags %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->fsblk,
                  __entry->flags)
);

#endif /* _TRACE_EXT4_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



































































    2 






    2 
    2 






















    2 


































    2 



    2 



















    2 

















    2 







    2 






    2 












































































































































    1 




    1 







    1 


    1 









    1 































    1 



    1 



    1 









    1 










    1 


    1 














































    2 








    2 













    2 
























































































































    1 


    1 




























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
// SPDX-License-Identifier: GPL-2.0-only
/*
 * File: socket.c
 *
 * Phonet sockets
 *
 * Copyright (C) 2008 Nokia Corporation.
 *
 * Authors: Sakari Ailus <sakari.ailus@nokia.com>
 *          Rémi Denis-Courmont
 */

#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/net.h>
#include <linux/poll.h>
#include <linux/sched/signal.h>

#include <net/sock.h>
#include <net/tcp_states.h>

#include <linux/phonet.h>
#include <linux/export.h>
#include <net/phonet/phonet.h>
#include <net/phonet/pep.h>
#include <net/phonet/pn_dev.h>

/*
 * Release a Phonet socket: detach the struct sock from @sock and hand it to
 * the protocol's close() handler. Does nothing if no sock is attached.
 * Always returns 0.
 */
static int pn_socket_release(struct socket *sock)
{
        struct sock *sk = sock->sk;

        if (!sk)
                return 0;

        /* Detach first so the socket no longer points at the closing sock. */
        sock->sk = NULL;
        sk->sk_prot->close(sk, 0);

        return 0;
}

/* Number of buckets in the socket hash table (a power of two). */
#define PN_HASHSIZE        16
/* Bit mask that reduces an object id to a bucket index. */
#define PN_HASHMASK        (PN_HASHSIZE-1)


/*
 * Hash table of Phonet sockets, indexed through pn_hash_list().
 * The lock is taken around list insertion and removal in pn_sock_hash()
 * and pn_sock_unhash(); lookups walk the buckets under rcu_read_lock().
 */
static struct  {
        struct hlist_head hlist[PN_HASHSIZE];
        struct mutex lock;
} pnsocks;

/* Initialise every bucket of the socket hash table, then its mutex. */
void __init pn_sock_init(void)
{
        unsigned int bucket = 0;

        while (bucket < PN_HASHSIZE) {
                INIT_HLIST_HEAD(&pnsocks.hlist[bucket]);
                bucket++;
        }

        mutex_init(&pnsocks.lock);
}

/* Return the hash bucket for object id @obj (its low bits pick the slot). */
static struct hlist_head *pn_hash_list(u16 obj)
{
        return &pnsocks.hlist[obj & PN_HASHMASK];
}

/*
 * Look up a socket from a Phonet socket address, matching only some fields:
 *  - the socket must belong to @net;
 *  - if the address carries a non-zero port, the port must match, otherwise
 *    the socket's resource must equal the address' resource;
 *  - a socket bound to a non-zero device address must also match on it.
 *
 * Returns the first matching socket with a reference held (the caller must
 * sock_put() it later), or NULL if there is none.
 */
struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *spn)
{
        u16 obj = pn_sockaddr_get_object(spn);
        u8 res = spn->spn_resource;
        struct hlist_head *bucket = pn_hash_list(obj);
        struct sock *found = NULL;
        struct sock *cur;

        rcu_read_lock();
        sk_for_each_rcu(cur, bucket) {
                struct pn_sock *pn = pn_sk(cur);

                /* Hashed sockets are expected to be bound. */
                BUG_ON(!pn->sobject);

                if (!net_eq(sock_net(cur), net))
                        continue;

                /* Non-zero port: compare ports; zero port: compare resource. */
                if (pn_port(obj) ? pn_port(pn->sobject) != pn_port(obj)
                                 : pn->resource != res)
                        continue;

                if (pn_addr(pn->sobject) &&
                    pn_addr(pn->sobject) != pn_addr(obj))
                        continue;

                sock_hold(cur);
                found = cur;
                break;
        }
        rcu_read_unlock();

        return found;
}

/*
 * Deliver a broadcast packet (only in bottom-half).
 *
 * Walks every hash bucket and hands a clone of @skb to each socket that
 * belongs to @net and has SOCK_BROADCAST set. @skb itself is only cloned
 * here, never queued. Sockets for which the clone cannot be allocated are
 * skipped.
 */
void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb)
{
        struct hlist_head *hlist = pnsocks.hlist;
        unsigned int h;

        rcu_read_lock();
        for (h = 0; h < PN_HASHSIZE; h++) {
                struct sock *sknode;

                /*
                 * The buckets are modified with the _rcu list helpers in
                 * pn_sock_hash()/pn_sock_unhash(), and this reader holds only
                 * rcu_read_lock(), so it must use the RCU-aware iterator just
                 * like pn_find_sock_by_sa() does.
                 */
                sk_for_each_rcu(sknode, hlist) {
                        struct sk_buff *clone;

                        if (!net_eq(sock_net(sknode), net))
                                continue;
                        if (!sock_flag(sknode, SOCK_BROADCAST))
                                continue;

                        clone = skb_clone(skb, GFP_ATOMIC);
                        if (clone) {
                                /*
                                 * NOTE(review): the extra reference is
                                 * presumably dropped by sk_receive_skb() -
                                 * confirm against its implementation.
                                 */
                                sock_hold(sknode);
                                sk_receive_skb(sknode, clone, 0);
                        }
                }
                hlist++;
        }
        rcu_read_unlock();
}

int pn_sock_hash(struct sock *sk)
{
        struct hlist_head *hlist = pn_hash_list(pn_sk(sk)->sobject);

        mutex_lock(&pnsocks.lock);
        sk_add_node_rcu(sk, hlist);
        mutex_unlock(&pnsocks.lock);

        return 0;
}
EXPORT_SYMBOL(pn_sock_hash);

/*
 * Remove a socket from the bound-socket table and drop every resource
 * binding it still holds.
 *
 * The trailing synchronize_rcu() waits for a grace period before returning,
 * so lockless walkers of the table and of the resource array are done with
 * the socket by then. It is also the RCU sync that
 * pn_sock_unbind_all_res() leaves to its caller.
 */
void pn_sock_unhash(struct sock *sk)
{
        mutex_lock(&pnsocks.lock);
        sk_del_node_init_rcu(sk);
        mutex_unlock(&pnsocks.lock);
        pn_sock_unbind_all_res(sk);
        synchronize_rcu();
}
EXPORT_SYMBOL(pn_sock_unhash);

/*
 * Serialises port allocation: pn_socket_bind() holds it across the
 * protocol's get_port() and hash() calls, and pn_sock_get_port() warns if it
 * is entered without it.
 */
static DEFINE_MUTEX(port_mutex);

/*
 * bind() for Phonet sockets.
 *
 * If the protocol provides its own bind operation the call is delegated to
 * it unchanged. Otherwise the address is validated, a port is obtained via
 * the protocol's get_port(), the socket's object and resource are set from
 * the address, and the socket is hashed so it can receive.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL        address too short, or the socket is not in TCP_CLOSE state
 *                  or already has a port (rebind attempt)
 *   -EAFNOSUPPORT  family is not AF_PHONET
 *   -EADDRNOTAVAIL a non-zero device address was given and
 *                  phonet_address_lookup() returned non-zero for it
 *   otherwise whatever get_port() or hash() returned.
 */
static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len)
{
        struct sock *sk = sock->sk;
        struct pn_sock *pn = pn_sk(sk);
        struct sockaddr_pn *spn = (struct sockaddr_pn *)addr;
        int err;
        u16 handle;
        u8 saddr;

        /* Protocol-specific bind takes precedence over the generic one */
        if (sk->sk_prot->bind)
                return sk->sk_prot->bind(sk, addr, len);

        if (len < sizeof(struct sockaddr_pn))
                return -EINVAL;
        if (spn->spn_family != AF_PHONET)
                return -EAFNOSUPPORT;

        handle = pn_sockaddr_get_object((struct sockaddr_pn *)addr);
        saddr = pn_addr(handle);
        /* A zero device address means "any" and skips the lookup */
        if (saddr && phonet_address_lookup(sock_net(sk), saddr))
                return -EADDRNOTAVAIL;

        lock_sock(sk);
        if (sk->sk_state != TCP_CLOSE || pn_port(pn->sobject)) {
                err = -EINVAL; /* attempt to rebind */
                goto out;
        }
        WARN_ON(sk_hashed(sk));
        /* Lock order: socket lock first, then port_mutex */
        mutex_lock(&port_mutex);
        err = sk->sk_prot->get_port(sk, pn_port(handle));
        if (err)
                goto out_port;

        /* get_port() sets the port, bind() sets the address if applicable */
        pn->sobject = pn_object(saddr, pn_port(pn->sobject));
        pn->resource = spn->spn_resource;

        /* Enable RX on the socket */
        err = sk->sk_prot->hash(sk);
out_port:
        mutex_unlock(&port_mutex);
out:
        release_sock(sk);
        return err;
}

static int pn_socket_autobind(struct socket *sock)
{
        struct sockaddr_pn sa;
        int err;

        memset(&sa, 0, sizeof(sa));
        sa.spn_family = AF_PHONET;
        err = pn_socket_bind(sock, (struct sockaddr *)&sa,
                                sizeof(struct sockaddr_pn));
        if (err != -EINVAL)
                return err;
        BUG_ON(!pn_port(pn_sk(sock->sk)->sobject));
        return 0; /* socket was already bound */
}

/*
 * connect() for Phonet stream sockets.
 *
 * Auto-binds the socket if needed, records the destination object and
 * resource from @addr, asks the protocol to start the connection and then
 * sleeps until the socket leaves TCP_SYN_SENT, the timeout runs out or a
 * signal arrives.
 *
 * Returns 0 once the socket is in TCP_SYN_RECV or TCP_ESTABLISHED, or a
 * negative errno:
 *   -ENOBUFS       auto-bind failed
 *   -EINVAL        address too short
 *   -EAFNOSUPPORT  family is not AF_PHONET
 *   -EISCONN       socket is already connected / not in TCP_CLOSE
 *   -EALREADY      a connect is already in progress
 *   -EINPROGRESS   no time left to wait while still in TCP_SYN_SENT
 *   -ECONNRESET    socket ended up in TCP_CLOSE_WAIT
 *   -ECONNREFUSED  socket ended up in any other state
 *   or the error from the protocol's connect() / sock_intr_errno().
 */
static int pn_socket_connect(struct socket *sock, struct sockaddr *addr,
                int len, int flags)
{
        struct sock *sk = sock->sk;
        struct pn_sock *pn = pn_sk(sk);
        struct sockaddr_pn *spn = (struct sockaddr_pn *)addr;
        struct task_struct *tsk = current;
        /* NOTE(review): presumably zero right away for O_NONBLOCK - confirm */
        long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
        int err;

        if (pn_socket_autobind(sock))
                return -ENOBUFS;
        if (len < sizeof(struct sockaddr_pn))
                return -EINVAL;
        if (spn->spn_family != AF_PHONET)
                return -EAFNOSUPPORT;

        lock_sock(sk);

        switch (sock->state) {
        case SS_UNCONNECTED:
                if (sk->sk_state != TCP_CLOSE) {
                        err = -EISCONN;
                        goto out;
                }
                break;
        case SS_CONNECTING:
                err = -EALREADY;
                goto out;
        default:
                err = -EISCONN;
                goto out;
        }

        pn->dobject = pn_sockaddr_get_object(spn);
        pn->resource = pn_sockaddr_get_resource(spn);
        sock->state = SS_CONNECTING;

        err = sk->sk_prot->connect(sk, addr, len);
        if (err) {
                /* Roll back the state and destination; resource is kept */
                sock->state = SS_UNCONNECTED;
                pn->dobject = 0;
                goto out;
        }

        while (sk->sk_state == TCP_SYN_SENT) {
                DEFINE_WAIT(wait);

                /* Leaves sock->state at SS_CONNECTING on these two exits */
                if (!timeo) {
                        err = -EINPROGRESS;
                        goto out;
                }
                if (signal_pending(tsk)) {
                        err = sock_intr_errno(timeo);
                        goto out;
                }

                /* Queue on the wait list before dropping the socket lock */
                prepare_to_wait_exclusive(sk_sleep(sk), &wait,
                                                TASK_INTERRUPTIBLE);
                release_sock(sk);
                timeo = schedule_timeout(timeo);
                lock_sock(sk);
                finish_wait(sk_sleep(sk), &wait);
        }

        if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED))
                err = 0;
        else if (sk->sk_state == TCP_CLOSE_WAIT)
                err = -ECONNRESET;
        else
                err = -ECONNREFUSED;
        sock->state = err ? SS_UNCONNECTED : SS_CONNECTED;
out:
        release_sock(sk);
        return err;
}

static int pn_socket_accept(struct socket *sock, struct socket *newsock,
                            struct proto_accept_arg *arg)
{
        struct sock *sk = sock->sk;
        struct sock *newsk;

        if (unlikely(sk->sk_state != TCP_LISTEN))
                return -EINVAL;

        newsk = sk->sk_prot->accept(sk, arg);
        if (!newsk)
                return arg->err;

        lock_sock(newsk);
        sock_graft(newsk, newsock);
        newsock->state = SS_CONNECTED;
        release_sock(newsk);
        return 0;
}

/*
 * getsockname()/getpeername() for Phonet sockets.
 *
 * Fills @addr with a zeroed AF_PHONET address. For the local name (@peer
 * is zero) the socket's own object is stored in it; for the peer name the
 * address is left all-zero apart from the family. Returns the size of
 * struct sockaddr_pn.
 */
static int pn_socket_getname(struct socket *sock, struct sockaddr *addr,
                                int peer)
{
        struct sockaddr_pn *spn = (struct sockaddr_pn *)addr;

        memset(addr, 0, sizeof(*spn));
        addr->sa_family = AF_PHONET;

        if (peer)
                return sizeof(*spn);

        /* Race with bind() here is userland's problem. */
        pn_sockaddr_set_object(spn, pn_sk(sock->sk)->sobject);
        return sizeof(*spn);
}

/*
 * poll() for Phonet stream sockets.
 *
 * Reports, without taking the socket lock:
 *   EPOLLERR                       socket is in TCP_CLOSE
 *   EPOLLIN | EPOLLRDNORM          receive queue is not empty
 *   EPOLLPRI                       control-request queue is not empty
 *   EPOLLHUP                       nothing readable and state TCP_CLOSE_WAIT
 *   EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND
 *                                  established, send buffer has room and
 *                                  TX credits are available
 */
static __poll_t pn_socket_poll(struct file *file, struct socket *sock,
                                        poll_table *wait)
{
        struct sock *sk = sock->sk;
        struct pep_sock *pep = pep_sk(sk);
        __poll_t events = 0;

        poll_wait(file, sk_sleep(sk), wait);

        if (sk->sk_state == TCP_CLOSE)
                return EPOLLERR;

        if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
                events |= EPOLLIN | EPOLLRDNORM;
        if (!skb_queue_empty_lockless(&pep->ctrlreq_queue))
                events |= EPOLLPRI;

        if (!events && sk->sk_state == TCP_CLOSE_WAIT)
                return EPOLLHUP;

        /* Writable only when all three conditions below hold */
        if (sk->sk_state != TCP_ESTABLISHED)
                return events;
        if (refcount_read(&sk->sk_wmem_alloc) >= sk->sk_sndbuf)
                return events;
        if (!atomic_read(&pep->tx_credits))
                return events;

        return events | EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
}

/*
 * ioctl() for Phonet sockets.
 *
 * SIOCPNGETOBJECT: reads a 16-bit handle from user space, picks a device
 * (the one the socket is bound to, else the one phonet_device_get()
 * returns), obtains a source address for the handle's address part from
 * that device and writes back the object made of that address and this
 * socket's port. Returns -EFAULT if the handle cannot be read,
 * -EHOSTUNREACH if there is no usable device or address, else the result
 * of put_user().
 *
 * Every other command is forwarded to sk_ioctl().
 */
static int pn_socket_ioctl(struct socket *sock, unsigned int cmd,
                                unsigned long arg)
{
        struct sock *sk = sock->sk;
        struct pn_sock *pn = pn_sk(sk);

        if (cmd == SIOCPNGETOBJECT) {
                struct net_device *dev;
                u16 handle;
                u8 saddr;

                if (get_user(handle, (__u16 __user *)arg))
                        return -EFAULT;

                lock_sock(sk);
                if (sk->sk_bound_dev_if)
                        dev = dev_get_by_index(sock_net(sk),
                                                sk->sk_bound_dev_if);
                else
                        dev = phonet_device_get(sock_net(sk));
                /* Only a device that exists and is up can supply an address */
                if (dev && (dev->flags & IFF_UP))
                        saddr = phonet_address_get(dev, pn_addr(handle));
                else
                        saddr = PN_NO_ADDR;
                release_sock(sk);

                /*
                 * dev may be NULL here.
                 * NOTE(review): relies on dev_put() tolerating NULL - confirm
                 * for the target kernel version.
                 */
                dev_put(dev);
                if (saddr == PN_NO_ADDR)
                        return -EHOSTUNREACH;

                handle = pn_object(saddr, pn_port(pn->sobject));
                return put_user(handle, (__u16 __user *)arg);
        }

        return sk_ioctl(sk, cmd, (void __user *)arg);
}

/*
 * listen() for Phonet stream sockets.
 *
 * Auto-binds the socket (-ENOBUFS on failure), then, provided the socket is
 * SS_UNCONNECTED (-EINVAL otherwise), moves it to TCP_LISTEN with an empty
 * accept backlog if it was not listening yet and records the new backlog
 * limit. Calling it again on a listening socket only updates the limit.
 */
static int pn_socket_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;
        int err;

        if (pn_socket_autobind(sock))
                return -ENOBUFS;

        lock_sock(sk);
        if (sock->state == SS_UNCONNECTED) {
                if (sk->sk_state != TCP_LISTEN) {
                        sk->sk_state = TCP_LISTEN;
                        sk->sk_ack_backlog = 0;
                }
                sk->sk_max_ack_backlog = backlog;
                err = 0;
        } else {
                err = -EINVAL;
        }
        release_sock(sk);

        return err;
}

/*
 * sendmsg() for Phonet sockets: make sure the socket is bound (-EAGAIN if
 * auto-binding fails), then hand the message to the protocol's sendmsg().
 */
static int pn_socket_sendmsg(struct socket *sock, struct msghdr *m,
                             size_t total_len)
{
        struct sock *sk = sock->sk;
        int bind_err = pn_socket_autobind(sock);

        if (bind_err)
                return -EAGAIN;

        return sk->sk_prot->sendmsg(sk, m, total_len);
}

/*
 * Socket operations for AF_PHONET sockets using the "dgram" table.
 * Connection-oriented calls (connect, accept, listen), socketpair, shutdown
 * and mmap are wired to the generic sock_no_*() stubs; polling uses the
 * generic datagram_poll(). No setsockopt/getsockopt handlers are set here.
 */
const struct proto_ops phonet_dgram_ops = {
        .family                = AF_PHONET,
        .owner                = THIS_MODULE,
        .release        = pn_socket_release,
        .bind                = pn_socket_bind,
        .connect        = sock_no_connect,
        .socketpair        = sock_no_socketpair,
        .accept                = sock_no_accept,
        .getname        = pn_socket_getname,
        .poll                = datagram_poll,
        .ioctl                = pn_socket_ioctl,
        .listen                = sock_no_listen,
        .shutdown        = sock_no_shutdown,
        .sendmsg        = pn_socket_sendmsg,
        .recvmsg        = sock_common_recvmsg,
        .mmap                = sock_no_mmap,
};

/*
 * Socket operations for AF_PHONET sockets using the "stream" table.
 * Unlike phonet_dgram_ops this one provides real connect, accept, listen and
 * poll handlers plus the common setsockopt/getsockopt entry points;
 * socketpair, shutdown and mmap remain sock_no_*() stubs.
 * Exported for use outside this file.
 */
const struct proto_ops phonet_stream_ops = {
        .family                = AF_PHONET,
        .owner                = THIS_MODULE,
        .release        = pn_socket_release,
        .bind                = pn_socket_bind,
        .connect        = pn_socket_connect,
        .socketpair        = sock_no_socketpair,
        .accept                = pn_socket_accept,
        .getname        = pn_socket_getname,
        .poll                = pn_socket_poll,
        .ioctl                = pn_socket_ioctl,
        .listen                = pn_socket_listen,
        .shutdown        = sock_no_shutdown,
        .setsockopt        = sock_common_setsockopt,
        .getsockopt        = sock_common_getsockopt,
        .sendmsg        = pn_socket_sendmsg,
        .recvmsg        = sock_common_recvmsg,
        .mmap                = sock_no_mmap,
};
EXPORT_SYMBOL(phonet_stream_ops);

/* allocate port for a socket */
int pn_sock_get_port(struct sock *sk, unsigned short sport)
{
        static int port_cur;
        struct net *net = sock_net(sk);
        struct pn_sock *pn = pn_sk(sk);
        struct sockaddr_pn try_sa;
        struct sock *tmpsk;

        memset(&try_sa, 0, sizeof(struct sockaddr_pn));
        try_sa.spn_family = AF_PHONET;
        WARN_ON(!mutex_is_locked(&port_mutex));
        if (!sport) {
                /* search free port */
                int port, pmin, pmax;

                phonet_get_local_port_range(&pmin, &pmax);
                for (port = pmin; port <= pmax; port++) {
                        port_cur++;
                        if (port_cur < pmin || port_cur > pmax)
                                port_cur = pmin;

                        pn_sockaddr_set_port(&try_sa, port_cur);
                        tmpsk = pn_find_sock_by_sa(net, &try_sa);
                        if (tmpsk == NULL) {
                                sport = port_cur;
                                goto found;
                        } else
                                sock_put(tmpsk);
                }
        } else {
                /* try to find specific port */
                pn_sockaddr_set_port(&try_sa, sport);
                tmpsk = pn_find_sock_by_sa(net, &try_sa);
                if (tmpsk == NULL)
                        /* No sock there! We can use that port... */
                        goto found;
                else
                        sock_put(tmpsk);
        }
        /* the port must be in use already */
        return -EADDRINUSE;

found:
        pn->sobject = pn_object(pn_addr(pn->sobject), sport);
        return 0;
}
EXPORT_SYMBOL(pn_sock_get_port);

#ifdef CONFIG_PROC_FS
/*
 * Return the @pos-th (zero-based) bound socket belonging to the seq file's
 * network namespace, walking the buckets in order, or NULL if there are
 * fewer sockets than that. No reference is taken on the returned socket.
 */
static struct sock *pn_sock_get_idx(struct seq_file *seq, loff_t pos)
{
        struct net *net = seq_file_net(seq);
        struct sock *cur;
        unsigned int b;

        for (b = 0; b < PN_HASHSIZE; b++) {
                sk_for_each_rcu(cur, &pnsocks.hlist[b]) {
                        if (!net_eq(net, sock_net(cur)))
                                continue;
                        if (pos-- == 0)
                                return cur;
                }
        }
        return NULL;
}

/*
 * Advance from @sk to the next socket reachable through sk_next() that
 * belongs to the seq file's network namespace; NULL when there is none.
 */
static struct sock *pn_sock_get_next(struct seq_file *seq, struct sock *sk)
{
        struct net *net = seq_file_net(seq);

        for (sk = sk_next(sk); sk; sk = sk_next(sk)) {
                if (net_eq(net, sock_net(sk)))
                        break;
        }

        return sk;
}

/*
 * seq_file start: enter the RCU read side (left again in the stop callback)
 * and position the iterator. Position 0 is the header token; position N > 0
 * is socket number N - 1.
 */
static void *pn_sock_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(rcu)
{
        rcu_read_lock();

        if (!*pos)
                return SEQ_START_TOKEN;
        return pn_sock_get_idx(seq, *pos - 1);
}

/*
 * seq_file next: after the header token comes the first socket, after a
 * socket comes its successor. The position is always advanced by one.
 */
static void *pn_sock_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct sock *next = (v == SEQ_START_TOKEN) ?
                                pn_sock_get_idx(seq, 0) :
                                pn_sock_get_next(seq, v);

        ++*pos;
        return next;
}

/* seq_file stop: leave the RCU read side entered by pn_sock_seq_start(). */
static void pn_sock_seq_stop(struct seq_file *seq, void *v)
        __releases(rcu)
{
        rcu_read_unlock();
}

/*
 * seq_file show: emit either the column header (for the start token) or one
 * line describing a bound socket: protocol, local object, remote object,
 * resource, state, TX/RX memory in use, owner uid, inode, reference count,
 * socket pointer and drop counter. Every line is padded to 127 characters
 * and newline-terminated. Always returns 0.
 *
 * The uid is an unsigned value and is printed with %u (as pn_res_seq_show()
 * does), so that large uids are not shown as negative numbers.
 */
static int pn_sock_seq_show(struct seq_file *seq, void *v)
{
        seq_setwidth(seq, 127);
        if (v == SEQ_START_TOKEN)
                seq_puts(seq, "pt  loc  rem rs st tx_queue rx_queue "
                        "  uid inode ref pointer drops");
        else {
                struct sock *sk = v;
                struct pn_sock *pn = pn_sk(sk);

                seq_printf(seq, "%2d %04X:%04X:%02X %02X %08X:%08X %5u %lu "
                        "%d %pK %u",
                        sk->sk_protocol, pn->sobject, pn->dobject,
                        pn->resource, sk->sk_state,
                        sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk),
                        from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
                        sock_i_ino(sk),
                        refcount_read(&sk->sk_refcnt), sk,
                        atomic_read(&sk->sk_drops));
        }
        seq_pad(seq, '\n');
        return 0;
}

/*
 * seq_file iterator listing the bound Phonet sockets of a network namespace;
 * the walk runs between rcu_read_lock() in start and rcu_read_unlock() in
 * stop.
 */
const struct seq_operations pn_sock_seq_ops = {
        .start = pn_sock_seq_start,
        .next = pn_sock_seq_next,
        .stop = pn_sock_seq_stop,
        .show = pn_sock_seq_show,
};
#endif

/*
 * Resource-to-socket table, indexed by the 8-bit resource number. A non-NULL
 * slot holds a reference on its socket (taken in pn_sock_bind_res()).
 * Slots are changed under resource_mutex and read under rcu_read_lock() by
 * pn_find_sock_by_res().
 */
static struct  {
        struct sock *sk[256];
} pnres;

/*
 * Find the socket bound to resource @res and take a reference on it.
 *
 * Resource bindings only exist in the initial network namespace, so NULL is
 * returned for any other @net, as well as when nothing is bound to @res.
 * The caller must sock_put() a non-NULL result.
 */
struct sock *pn_find_sock_by_res(struct net *net, u8 res)
{
        struct sock *sk = NULL;

        if (net_eq(net, &init_net)) {
                rcu_read_lock();
                sk = rcu_dereference(pnres.sk[res]);
                if (sk)
                        sock_hold(sk);
                rcu_read_unlock();
        }
        return sk;
}

/* Protects updates of pnres.sk[]; also held while the table is dumped. */
static DEFINE_MUTEX(resource_mutex);

/*
 * Bind resource @res to socket @sk.
 *
 * Returns 0 on success (the table then holds a reference on @sk), or
 *   -ENOIOCTLCMD  socket is not in the initial network namespace
 *   -EPERM        caller lacks CAP_SYS_ADMIN
 *   -EAGAIN       the socket could not be auto-bound
 *   -EADDRINUSE   the resource is already bound to a socket
 */
int pn_sock_bind_res(struct sock *sk, u8 res)
{
        int ret;

        if (!net_eq(sock_net(sk), &init_net))
                return -ENOIOCTLCMD;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (pn_socket_autobind(sk->sk_socket))
                return -EAGAIN;

        mutex_lock(&resource_mutex);
        if (pnres.sk[res] != NULL) {
                ret = -EADDRINUSE;
        } else {
                sock_hold(sk);
                rcu_assign_pointer(pnres.sk[res], sk);
                ret = 0;
        }
        mutex_unlock(&resource_mutex);

        return ret;
}

/*
 * Release resource @res if it is currently bound to @sk.
 *
 * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, or -ENOENT when @res
 * is not bound to @sk. On success the table's reference on @sk is dropped
 * only after an RCU grace period, so concurrent lockless lookups are done
 * with the socket first.
 */
int pn_sock_unbind_res(struct sock *sk, u8 res)
{
        bool owned;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        mutex_lock(&resource_mutex);
        owned = (pnres.sk[res] == sk);
        if (owned)
                RCU_INIT_POINTER(pnres.sk[res], NULL);
        mutex_unlock(&resource_mutex);

        if (!owned)
                return -ENOENT;

        synchronize_rcu();
        sock_put(sk);
        return 0;
}

/*
 * Release every resource bound to @sk and drop one socket reference per
 * released resource, using __sock_put().
 *
 * No RCU synchronisation happens here.
 */
void pn_sock_unbind_all_res(struct sock *sk)
{
        unsigned int released = 0;
        unsigned int res;

        mutex_lock(&resource_mutex);
        for (res = 0; res < 256; res++) {
                if (pnres.sk[res] != sk)
                        continue;
                RCU_INIT_POINTER(pnres.sk[res], NULL);
                released++;
        }
        mutex_unlock(&resource_mutex);

        for (; released > 0; released--)
                __sock_put(sk);
        /* Caller is responsible for RCU sync before final sock_put() */
}

#ifdef CONFIG_PROC_FS
/*
 * Return a pointer to the @pos-th (zero-based) occupied slot of the resource
 * table, or NULL if there are fewer bound resources than that or the seq
 * file does not belong to the initial network namespace.
 */
static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos)
{
        struct sock **slot;

        if (!net_eq(seq_file_net(seq), &init_net))
                return NULL;

        for (slot = pnres.sk; slot < pnres.sk + 256; slot++) {
                if (*slot == NULL)
                        continue;
                if (pos-- == 0)
                        return slot;
        }
        return NULL;
}

/*
 * Return the next occupied slot of the resource table after @sk, or NULL
 * when @sk was the last one. Must only be reached for the initial network
 * namespace (enforced with BUG_ON()).
 */
static struct sock **pn_res_get_next(struct seq_file *seq, struct sock **sk)
{
        struct net *net = seq_file_net(seq);
        struct sock **slot;

        BUG_ON(!net_eq(net, &init_net));

        for (slot = sk + 1; slot < pnres.sk + 256; slot++) {
                if (*slot)
                        return slot;
        }
        return NULL;
}

/*
 * seq_file start: take resource_mutex (released in the stop callback) and
 * position the iterator. Position 0 is the header token; position N > 0 is
 * bound resource number N - 1.
 */
static void *pn_res_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(resource_mutex)
{
        mutex_lock(&resource_mutex);

        if (!*pos)
                return SEQ_START_TOKEN;
        return pn_res_get_idx(seq, *pos - 1);
}

/*
 * seq_file next: after the header token comes the first occupied slot, after
 * a slot comes the next occupied one. The position is always advanced.
 */
static void *pn_res_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct sock **next = (v == SEQ_START_TOKEN) ?
                                pn_res_get_idx(seq, 0) :
                                pn_res_get_next(seq, v);

        ++*pos;
        return next;
}

/* seq_file stop: drop resource_mutex taken by pn_res_seq_start(). */
static void pn_res_seq_stop(struct seq_file *seq, void *v)
        __releases(resource_mutex)
{
        mutex_unlock(&resource_mutex);
}

static int pn_res_seq_show(struct seq_file *seq, void *v)
{
        seq_setwidth(seq, 63);
        if (v == SEQ_START_TOKEN)
                seq_puts(seq, "rs   uid inode");
        else {
                struct sock **psk = v;
                struct sock *sk = *psk;

                seq_printf(seq, "%02X %5u %lu",
                           (int) (psk - pnres.sk),
                           from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
                           sock_i_ino(sk));
        }
        seq_pad(seq, '\n');
        return 0;
}

/*
 * seq_file iterator listing the bound Phonet resources; the whole walk runs
 * with resource_mutex held (taken in start, released in stop).
 */
const struct seq_operations pn_res_seq_ops = {
        .start = pn_res_seq_start,
        .next = pn_res_seq_next,
        .stop = pn_res_seq_stop,
        .show = pn_res_seq_show,
};
#endif








































































































































































































































































































































































































































    1 
























































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2002-2005, Devicescape Software, Inc.
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright(c) 2015-2017 Intel Deutschland GmbH
 * Copyright(c) 2020-2024 Intel Corporation
 */

#ifndef STA_INFO_H
#define STA_INFO_H

#include <linux/list.h>
#include <linux/types.h>
#include <linux/if_ether.h>
#include <linux/workqueue.h>
#include <linux/average.h>
#include <linux/bitfield.h>
#include <linux/etherdevice.h>
#include <linux/rhashtable.h>
#include <linux/u64_stats_sync.h>
#include "key.h"

/**
 * enum ieee80211_sta_info_flags - Stations flags
 *
 * These flags are used with &struct sta_info's @flags member, but
 * only indirectly with set_sta_flag() and friends.
 *
 * @WLAN_STA_AUTH: Station is authenticated.
 * @WLAN_STA_ASSOC: Station is associated.
 * @WLAN_STA_PS_STA: Station is in power-save mode
 * @WLAN_STA_AUTHORIZED: Station is authorized to send/receive traffic.
 *        This bit is always checked so needs to be enabled for all stations
 *        when virtual port control is not in use.
 * @WLAN_STA_SHORT_PREAMBLE: Station is capable of receiving short-preamble
 *        frames.
 * @WLAN_STA_WDS: Station is one of our WDS peers.
 * @WLAN_STA_CLEAR_PS_FILT: Clear PS filter in hardware (using the
 *        IEEE80211_TX_CTL_CLEAR_PS_FILT control flag) when the next
 *        frame to this station is transmitted.
 * @WLAN_STA_MFP: Management frame protection is used with this STA.
 * @WLAN_STA_BLOCK_BA: Used to deny ADDBA requests (both TX and RX)
 *        during suspend/resume and station removal.
 * @WLAN_STA_PS_DRIVER: driver requires keeping this station in
 *        power-save mode logically to flush frames that might still
 *        be in the queues
 * @WLAN_STA_PSPOLL: Station sent PS-poll while driver was keeping
 *        station in power-save mode, reply when the driver unblocks.
 * @WLAN_STA_TDLS_PEER: Station is a TDLS peer.
 * @WLAN_STA_TDLS_PEER_AUTH: This TDLS peer is authorized to send direct
 *        packets. This means the link is enabled.
 * @WLAN_STA_TDLS_INITIATOR: We are the initiator of the TDLS link with this
 *        station.
 * @WLAN_STA_TDLS_CHAN_SWITCH: This TDLS peer supports TDLS channel-switching
 * @WLAN_STA_TDLS_OFF_CHANNEL: The local STA is currently off-channel with this
 *        TDLS peer
 * @WLAN_STA_TDLS_WIDER_BW: This TDLS peer supports working on a wider bw on
 *        the BSS base channel.
 * @WLAN_STA_UAPSD: Station requested unscheduled SP while driver was
 *        keeping station in power-save mode, reply when the driver
 *        unblocks the station.
 * @WLAN_STA_SP: Station is in a service period, so don't try to
 *        reply to other uAPSD trigger frames or PS-Poll.
 * @WLAN_STA_4ADDR_EVENT: 4-addr event was already sent for this frame.
 * @WLAN_STA_INSERTED: This station is inserted into the hash table.
 * @WLAN_STA_RATE_CONTROL: rate control was initialized for this station.
 * @WLAN_STA_TOFFSET_KNOWN: toffset calculated for this station is valid.
 * @WLAN_STA_MPSP_OWNER: local STA is owner of a mesh Peer Service Period.
 * @WLAN_STA_MPSP_RECIPIENT: local STA is recipient of a MPSP.
 * @WLAN_STA_PS_DELIVER: station woke up, but we're still blocking TX
 *        until pending frames are delivered
 * @WLAN_STA_USES_ENCRYPTION: This station was configured for encryption,
 *        so drop all packets without a key later.
 * @WLAN_STA_DECAP_OFFLOAD: This station uses rx decap offload
 *
 * @NUM_WLAN_STA_FLAGS: number of defined flags
 */
enum ieee80211_sta_info_flags {
        WLAN_STA_AUTH,
        WLAN_STA_ASSOC,
        WLAN_STA_PS_STA,
        WLAN_STA_AUTHORIZED,
        WLAN_STA_SHORT_PREAMBLE,
        WLAN_STA_WDS,
        WLAN_STA_CLEAR_PS_FILT,
        WLAN_STA_MFP,
        WLAN_STA_BLOCK_BA,
        WLAN_STA_PS_DRIVER,
        WLAN_STA_PSPOLL,
        WLAN_STA_TDLS_PEER,
        WLAN_STA_TDLS_PEER_AUTH,
        WLAN_STA_TDLS_INITIATOR,
        WLAN_STA_TDLS_CHAN_SWITCH,
        WLAN_STA_TDLS_OFF_CHANNEL,
        WLAN_STA_TDLS_WIDER_BW,
        WLAN_STA_UAPSD,
        WLAN_STA_SP,
        WLAN_STA_4ADDR_EVENT,
        WLAN_STA_INSERTED,
        WLAN_STA_RATE_CONTROL,
        WLAN_STA_TOFFSET_KNOWN,
        WLAN_STA_MPSP_OWNER,
        WLAN_STA_MPSP_RECIPIENT,
        WLAN_STA_PS_DELIVER,
        WLAN_STA_USES_ENCRYPTION,
        WLAN_STA_DECAP_OFFLOAD,

        /* Keep last: its value is the number of flags defined above */
        NUM_WLAN_STA_FLAGS,
};

/*
 * Timing and retry limits for aggregation session setup.
 * NOTE(review): the HZ-based values are presumably in jiffies, and
 * ADDBA_RESP_INTERVAL presumably is how long to wait for the peer's ADDBA
 * response - confirm against the users of these macros.
 */
#define ADDBA_RESP_INTERVAL HZ
#define HT_AGG_MAX_RETRIES                15
#define HT_AGG_BURST_RETRIES                3
#define HT_AGG_RETRIES_PERIOD                (15 * HZ)

/*
 * Values for the state field of struct tid_ampdu_tx. That struct's
 * kernel-doc speaks of the state "having the flag" HT_AGG_STATE_OPERATIONAL
 * set, so these look like bit numbers within the field rather than masks.
 * NOTE(review): confirm against the code that sets and tests them.
 */
#define HT_AGG_STATE_DRV_READY                0
#define HT_AGG_STATE_RESPONSE_RECEIVED        1
#define HT_AGG_STATE_OPERATIONAL        2
#define HT_AGG_STATE_STOPPING                3
#define HT_AGG_STATE_WANT_START                4
#define HT_AGG_STATE_WANT_STOP                5
#define HT_AGG_STATE_START_CB                6
#define HT_AGG_STATE_STOP_CB                7
#define HT_AGG_STATE_SENT_ADDBA                8

/*
 * NOTE(review): instantiates the moving-average helpers from
 * <linux/average.h> under the name avg_signal; the meaning of the two
 * numeric parameters is defined by that header - confirm there.
 */
DECLARE_EWMA(avg_signal, 10, 8)
/*
 * Reasons for stopping an aggregation session, as the names indicate.
 * NOTE(review): the exact semantics of each value are defined by the code
 * using them, which is not visible from this header.
 */
enum ieee80211_agg_stop_reason {
        AGG_STOP_DECLINED,
        AGG_STOP_LOCAL_REQUEST,
        AGG_STOP_PEER_REQUEST,
        AGG_STOP_DESTROY_STA,
};

/* Debugfs flags to enable/disable use of RX/TX airtime in scheduler */
#define AIRTIME_USE_TX                BIT(0)
#define AIRTIME_USE_RX                BIT(1)

/*
 * Airtime accounting state: accumulated RX and TX airtime, a last-activity
 * timestamp, a signed deficit counter, and the estimated airtime of frames
 * still pending together with a low and a high limit for it.
 * NOTE(review): units of the fields and which object embeds this struct are
 * not visible in this part of the header - confirm before relying on them.
 */
struct airtime_info {
        u64 rx_airtime;
        u64 tx_airtime;
        unsigned long last_active;
        s32 deficit;
        atomic_t aql_tx_pending; /* Estimated airtime for frames pending */
        u32 aql_limit_low;
        u32 aql_limit_high;
};

void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local,
                                          struct sta_info *sta, u8 ac,
                                          u16 tx_airtime, bool tx_completed);

struct sta_info;

/**
 * struct tid_ampdu_tx - TID aggregation information (Tx).
 *
 * @rcu_head: rcu head for freeing structure
 * @session_timer: check if we keep Tx-ing on the TID (by timeout value)
 * @addba_resp_timer: timer for peer's response to addba request
 * @pending: pending frames queue -- use sta's spinlock to protect
 * @sta: station we are attached to
 * @dialog_token: dialog token for aggregation session
 * @timeout: session timeout value to be filled in ADDBA requests
 * @tid: TID number
 * @state: session state (see above)
 * @last_tx: jiffies of last tx activity
 * @stop_initiator: initiator of a session stop
 * @tx_stop: TX DelBA frame when stopping
 * @buf_size: reorder buffer size at receiver
 * @failed_bar_ssn: ssn of the last failed BAR tx attempt
 * @bar_pending: BAR needs to be re-sent
 * @amsdu: support A-MSDU within A-MPDU
 * @ssn: starting sequence number of the session
 *
 * This structure's lifetime is managed by RCU, assignments to
 * the array holding it must hold the aggregation mutex.
 *
 * The TX path can access it under RCU lock-free if, and
 * only if, the state has the flag %HT_AGG_STATE_OPERATIONAL
 * set. Otherwise, the TX path must also acquire the spinlock
 * and re-check the state, see comments in the tx code
 * touching it.
 */
struct tid_ampdu_tx {
        struct rcu_head rcu_head;
        struct timer_list session_timer;
        struct timer_list addba_resp_timer;
        struct sk_buff_head pending;
        struct sta_info *sta;
        unsigned long state;
        unsigned long last_tx;
        u16 timeout;
        u8 dialog_token;
        u8 stop_initiator;
        bool tx_stop;
        u16 buf_size;
        u16 ssn;

        u16 failed_bar_ssn;
        bool bar_pending;
        bool amsdu;
        u8 tid;
};

/**
 * struct tid_ampdu_rx - TID aggregation information (Rx).
 *
 * @reorder_buf: buffer to reorder incoming aggregated MPDUs. An MPDU may be an
 *        A-MSDU with individually reported subframes.
 * @reorder_buf_filtered: bitmap indicating where there are filtered frames in
 *        the reorder buffer that should be ignored when releasing frames
 * @reorder_time: jiffies when skb was added
 * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value)
 * @reorder_timer: releases expired frames from the reorder buffer.
 * @sta: station we are attached to
 * @last_rx: jiffies of last rx activity
 * @head_seq_num: head sequence number in reordering buffer.
 * @stored_mpdu_num: number of MPDUs in reordering buffer
 * @ssn: Starting Sequence Number expected to be aggregated.
 * @buf_size: buffer size for incoming A-MPDUs
 * @timeout: reset timer value (in TUs).
 * @tid: TID number
 * @rcu_head: RCU head used for freeing this struct
 * @reorder_lock: serializes access to reorder buffer, see below.
 * @auto_seq: used for offloaded BA sessions to automatically pick
 *        @head_seq_num and @ssn.
 * @removed: this session is removed (but might have been found due to RCU)
 * @started: this session has started (head ssn or higher was received)
 *
 * This structure's lifetime is managed by RCU, assignments to
 * the array holding it must hold the aggregation mutex.
 *
 * The @reorder_lock is used to protect the members of this
 * struct, except for @timeout, @buf_size and @dialog_token,
 * which are constant across the lifetime of the struct (the
 * dialog token being used only for debugging).
 *
 * NOTE(review): this struct has no dialog_token member; the RX dialog
 * tokens visible in this header are &sta_ampdu_mlme.tid_rx_token.
 */
struct tid_ampdu_rx {
        struct rcu_head rcu_head;
        spinlock_t reorder_lock;
        u64 reorder_buf_filtered;
        struct sk_buff_head *reorder_buf;
        unsigned long *reorder_time;
        struct sta_info *sta;
        struct timer_list session_timer;
        struct timer_list reorder_timer;
        unsigned long last_rx;
        u16 head_seq_num;
        u16 stored_mpdu_num;
        u16 ssn;
        u16 buf_size;
        u16 timeout;
        u8 tid;
        u8 auto_seq:1,
           removed:1,
           started:1;
};

/**
 * struct sta_ampdu_mlme - STA aggregation information.
 *
 * @tid_rx: aggregation info for Rx per TID -- RCU protected
 * @tid_rx_token: dialog tokens for valid aggregation sessions
 * @tid_rx_timer_expired: bitmap indicating on which TIDs the
 *        RX timer expired until the work for it runs
 * @tid_rx_stop_requested:  bitmap indicating which BA sessions per TID the
 *        driver requested to close until the work for it runs
 * @tid_rx_manage_offl: bitmap indicating which BA sessions were requested
 *        to be treated as started/stopped due to offloading
 * @agg_session_valid: bitmap indicating which TIDs have an RX BA session open
 * @unexpected_agg: bitmap indicating which TID already sent a delBA due to
 *        unexpected aggregation related frames outside a session
 * @work: work struct for starting/stopping aggregation
 * @tid_tx: aggregation info for Tx per TID
 * @tid_start_tx: sessions where start was requested, not just protected
 *        by wiphy mutex but also sta->lock
 * @last_addba_req_time: timestamp of the last addBA request.
 * @addba_req_num: number of times addBA request has been sent.
 * @dialog_token_allocator: dialog token enumerator for each new session.
 */
struct sta_ampdu_mlme {
        /* rx */
        struct tid_ampdu_rx __rcu *tid_rx[IEEE80211_NUM_TIDS];
        u8 tid_rx_token[IEEE80211_NUM_TIDS];
        unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
        unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
        /*
         * Twice as many bits as there are TIDs -- presumably one range for
         * "started" and one for "stopped" requests; confirm against users.
         */
        unsigned long tid_rx_manage_offl[BITS_TO_LONGS(2 * IEEE80211_NUM_TIDS)];
        unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
        unsigned long unexpected_agg[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
        /* tx */
        struct wiphy_work work;
        struct tid_ampdu_tx __rcu *tid_tx[IEEE80211_NUM_TIDS];
        struct tid_ampdu_tx *tid_start_tx[IEEE80211_NUM_TIDS];
        unsigned long last_addba_req_time[IEEE80211_NUM_TIDS];
        u8 addba_req_num[IEEE80211_NUM_TIDS];
        u8 dialog_token_allocator;
};


/* Value to indicate no TID reservation */
#define IEEE80211_TID_UNRESERVED        0xff

/* Largest IV the TX fastpath header template reserves room for, in bytes */
#define IEEE80211_FAST_XMIT_MAX_IV        18

/**
 * struct ieee80211_fast_tx - TX fastpath information
 * @key: key to use for hw crypto
 * @hdr: the 802.11 header to put with the frame
 * @hdr_len: actual 802.11 header length
 * @sa_offs: offset of the SA
 * @da_offs: offset of the DA
 * @pn_offs: offset where to put PN for crypto (or 0 if not needed)
 * @band: band this will be transmitted on, for tx_info
 * @rcu_head: RCU head to free this struct
 *
 * This struct is small enough so that the common case (maximum crypto
 * header length of 8 like for CCMP/GCMP) fits into a single 64-byte
 * cache line.
 */
struct ieee80211_fast_tx {
        struct ieee80211_key *key;
        u8 hdr_len;
        u8 sa_offs, da_offs, pn_offs;
        u8 band;
        /*
         * Sized for the worst case: 30 + 2 bytes (presumably a 4-address
         * 802.11 header plus QoS control -- confirm), the largest IV and
         * the RFC 1042 SNAP header.
         */
        u8 hdr[30 + 2 + IEEE80211_FAST_XMIT_MAX_IV +
               sizeof(rfc1042_header)] __aligned(2);

        struct rcu_head rcu_head;
};

/**
 * struct ieee80211_fast_rx - RX fastpath information
 * @dev: netdevice for reporting the SKB
 * @vif_type: (P2P-less) interface type of the original sdata (sdata->vif.type)
 * @vif_addr: interface address
 * @rfc1042_hdr: copy of the RFC 1042 SNAP header (to have in cache)
 * @control_port_protocol: control port protocol copied from sdata
 * @expected_ds_bits: from/to DS bits expected
 * @icv_len: length of the MIC if present
 * @key: bool indicating encryption is expected (key is set)
 * @internal_forward: forward frames internally on AP/VLAN type interfaces
 * @uses_rss: copy of USES_RSS hw flag
 * @da_offs: offset of the DA in the header (for header conversion)
 * @sa_offs: offset of the SA in the header (for header conversion)
 * @rcu_head: RCU head for freeing this structure
 */
struct ieee80211_fast_rx {
        struct net_device *dev;
        enum nl80211_iftype vif_type;
        u8 vif_addr[ETH_ALEN] __aligned(2);
        u8 rfc1042_hdr[6] __aligned(2);
        __be16 control_port_protocol;
        __le16 expected_ds_bits;
        u8 icv_len;
        u8 key:1,
           internal_forward:1,
           uses_rss:1;
        u8 da_offs, sa_offs;

        struct rcu_head rcu_head;
};

/* we use only values in the range 0-100, so pick a large precision */
DECLARE_EWMA(mesh_fail_avg, 20, 8)
DECLARE_EWMA(mesh_tx_rate_avg, 8, 16)

/**
 * struct mesh_sta - mesh STA information
 * @plink_lock: serialize access to plink fields
 * @llid: Local link ID
 * @plid: Peer link ID
 * @aid: local aid supplied by peer
 * @reason: Cancel reason on PLINK_HOLDING state
 * @plink_retries: Retries in establishment
 * @plink_state: peer link state
 * @plink_timeout: timeout of peer link
 * @plink_timer: peer link watch timer
 * @plink_sta: peer link watch timer's sta_info
 * @t_offset: timing offset relative to this host
 * @t_offset_setpoint: reference timing offset of this sta to be used when
 *         calculating clockdrift
 * @local_pm: local link-specific power save mode
 * @peer_pm: peer-specific power save mode towards local STA
 * @nonpeer_pm: STA power save mode towards non-peer neighbors
 * @processed_beacon: set to true after peer rates and capabilities are
 *        processed
 * @connected_to_gate: true if mesh STA has a path to a mesh gate
 * @connected_to_as: true if mesh STA has a path to an authentication server
 * @fail_avg: moving percentage of failed MSDUs
 * @tx_rate_avg: moving average of tx bitrate
 */
struct mesh_sta {
        struct timer_list plink_timer;
        struct sta_info *plink_sta;

        s64 t_offset;
        s64 t_offset_setpoint;

        spinlock_t plink_lock;
        u16 llid;
        u16 plid;
        u16 aid;
        u16 reason;
        u8 plink_retries;

        bool processed_beacon;
        bool connected_to_gate;
        bool connected_to_as;

        enum nl80211_plink_state plink_state;
        u32 plink_timeout;

        /* mesh power save */
        enum nl80211_mesh_power_mode local_pm;
        enum nl80211_mesh_power_mode peer_pm;
        enum nl80211_mesh_power_mode nonpeer_pm;

        /* moving percentage of failed MSDUs */
        struct ewma_mesh_fail_avg fail_avg;
        /* moving average of tx bitrate */
        struct ewma_mesh_tx_rate_avg tx_rate_avg;
};

/* Generates struct ewma_signal, used for the averaged RX signal values */
DECLARE_EWMA(signal, 10, 8)

/**
 * struct ieee80211_sta_rx_stats - RX statistics of a (link) station
 *
 * Embedded in &struct link_sta_info as @rx_stats and additionally pointed
 * to, per CPU, by @pcpu_rx_stats.
 *
 * NOTE(review): the member descriptions below are derived from the member
 * names and types only -- confirm against the RX path that updates them.
 *
 * @packets: number of received packets
 * @last_rx: timestamp of the last reception (presumably jiffies)
 * @num_duplicates: number of duplicate frames seen
 * @fragments: number of received fragments
 * @dropped: number of dropped frames
 * @last_signal: signal of the last received frame
 * @chains: chain information for @chain_signal_last (presumably a bitmask
 *        of valid chains)
 * @chain_signal_last: last per-chain signal, one entry per chain
 * @last_rate: last RX rate (presumably in the encoding produced by
 *        sta_stats_encode_rate())
 * @syncp: u64 statistics synchronisation (presumably protects @bytes
 *        and @msdu)
 * @bytes: number of received bytes
 * @msdu: received MSDU counters, %IEEE80211_NUM_TIDS + 1 entries
 */
struct ieee80211_sta_rx_stats {
        unsigned long packets;
        unsigned long last_rx;
        unsigned long num_duplicates;
        unsigned long fragments;
        unsigned long dropped;
        int last_signal;
        u8 chains;
        s8 chain_signal_last[IEEE80211_MAX_CHAINS];
        u32 last_rate;
        struct u64_stats_sync syncp;
        u64 bytes;
        u64 msdu[IEEE80211_NUM_TIDS + 1];
};

/*
 * IEEE 802.11-2016 (10.6 "Defragmentation") recommends support for "concurrent
 * reception of at least one MSDU per access category per associated STA"
 * on APs, or "at least one MSDU per access category" on other interface types.
 *
 * This limit can be increased by changing this define, at the cost of slower
 * frame reassembly and increased memory use while fragments are pending.
 */
#define IEEE80211_FRAGMENT_MAX 4

/*
 * One frame that is currently being reassembled from fragments.
 *
 * NOTE(review): apart from the two inline comments below, the roles of the
 * members are only suggested by their names (fragment queue, arrival time
 * of the first fragment, sequence number, last fragment number, RX queue,
 * key identity) -- confirm against the defragmentation code.
 */
struct ieee80211_fragment_entry {
        struct sk_buff_head skb_list;
        unsigned long first_frag_time;
        u16 seq;
        u16 extra_len;
        u16 last_frag;
        u8 rx_queue;
        u8 check_sequential_pn:1, /* needed for CCMP/GCMP */
           is_protected:1;
        u8 last_pn[6]; /* PN of the last fragment if CCMP was used */
        unsigned int key_color;
};

/*
 * Fixed pool of IEEE80211_FRAGMENT_MAX reassembly entries.
 * NOTE(review): @next is presumably the index of the entry to use next --
 * confirm against the defragmentation code.
 */
struct ieee80211_fragment_cache {
        struct ieee80211_fragment_entry        entries[IEEE80211_FRAGMENT_MAX];
        unsigned int next;
};

/*
 * The bandwidth threshold below which the per-station CoDel parameters will be
 * scaled to be more lenient (to prevent starvation of slow stations). This
 * value will be scaled by the number of active stations when it is being
 * applied.
 */
#define STA_SLOW_THRESHOLD 6000 /* 6 Mbps */

/**
 * struct link_sta_info - Link STA information
 * All link specific sta info are stored here for reference. This can be
 * a single entry for non-MLD STA or multiple entries for MLD STA
 * @addr: Link MAC address - Can be same as MLD STA mac address and is always
 *        same for non-MLD STA. This is used as key for searching link STA
 * @link_id: Link ID uniquely identifying the link STA. This is 0 for non-MLD
 *        and set to the corresponding vif LinkId for MLD STA
 * @op_mode_nss: NSS limit as set by operating mode notification, or 0
 * @capa_nss: NSS limit as determined by local and peer capabilities
 * @link_hash_node: hash node for rhashtable
 * @sta: Points to the STA info
 * @gtk: group keys negotiated with this station, if any
 * @tx_stats: TX statistics
 * @tx_stats.packets: # of packets transmitted
 * @tx_stats.bytes: # of bytes in all packets transmitted
 * @tx_stats.last_rate: last TX rate
 * @tx_stats.last_rate_info: last TX rate, as a &struct rate_info
 * @tx_stats.msdu: # of transmitted MSDUs per TID
 * @rx_stats: RX statistics
 * @rx_stats_avg: averaged RX statistics
 * @rx_stats_avg.signal: averaged signal
 * @rx_stats_avg.chain_signal: averaged per-chain signal
 * @pcpu_rx_stats: per-CPU RX statistics, assigned only if the driver needs
 *        this (by advertising the USES_RSS hw flag)
 * @status_stats: TX status statistics
 * @status_stats.filtered: # of filtered frames
 * @status_stats.retry_failed: # of frames that failed after retry
 * @status_stats.retry_count: # of retries attempted
 * @status_stats.lost_packets: # of lost packets
 * @status_stats.last_pkt_time: timestamp of last ACKed packet
 * @status_stats.msdu_retries: # of MSDU retries
 * @status_stats.msdu_failed: # of failed MSDUs
 * @status_stats.last_ack: last ack timestamp (jiffies)
 * @status_stats.last_ack_signal: last ACK signal
 * @status_stats.ack_signal_filled: last ACK signal validity
 * @status_stats.avg_ack_signal: average ACK signal
 * @cur_max_bandwidth: maximum bandwidth to use for TX to the station,
 *        taken from HT/VHT capabilities or VHT operating mode notification
 * @debugfs_dir: debug filesystem directory dentry
 * @pub: public (driver visible) link STA data
 * TODO Move other link params from sta_info as required for MLD operation
 */
struct link_sta_info {
        u8 addr[ETH_ALEN];
        u8 link_id;

        u8 op_mode_nss, capa_nss;

        struct rhlist_head link_hash_node;

        /* back-pointer to the owning station */
        struct sta_info *sta;
        /* one slot per default, default-management and default-beacon key */
        struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS +
                                        NUM_DEFAULT_MGMT_KEYS +
                                        NUM_DEFAULT_BEACON_KEYS];
        struct ieee80211_sta_rx_stats __percpu *pcpu_rx_stats;

        /* Updated from RX path only, no locking requirements */
        struct ieee80211_sta_rx_stats rx_stats;
        struct {
                struct ewma_signal signal;
                struct ewma_signal chain_signal[IEEE80211_MAX_CHAINS];
        } rx_stats_avg;

        /* Updated from TX status path only, no locking requirements */
        struct {
                unsigned long filtered;
                unsigned long retry_failed, retry_count;
                unsigned int lost_packets;
                unsigned long last_pkt_time;
                u64 msdu_retries[IEEE80211_NUM_TIDS + 1];
                u64 msdu_failed[IEEE80211_NUM_TIDS + 1];
                unsigned long last_ack;
                s8 last_ack_signal;
                bool ack_signal_filled;
                struct ewma_avg_signal avg_ack_signal;
        } status_stats;

        /* Updated from TX path only, no locking requirements */
        struct {
                u64 packets[IEEE80211_NUM_ACS];
                u64 bytes[IEEE80211_NUM_ACS];
                struct ieee80211_tx_rate last_rate;
                struct rate_info last_rate_info;
                u64 msdu[IEEE80211_NUM_TIDS + 1];
        } tx_stats;

        enum ieee80211_sta_rx_bandwidth cur_max_bandwidth;

        /* the debugfs dentry only exists in debugfs-enabled builds */
#ifdef CONFIG_MAC80211_DEBUGFS
        struct dentry *debugfs_dir;
#endif

        /* driver-visible counterpart of this link STA */
        struct ieee80211_link_sta *pub;
};

/**
 * struct sta_info - STA information
 *
 * This structure collects information about a station that
 * mac80211 is communicating with.
 *
 * @list: global linked list entry
 * @free_list: list entry for keeping track of stations to free
 * @hash_node: hash node for rhashtable
 * @addr: station's MAC address - duplicated from public part to
 *        let the hash table work with just a single cacheline
 * @local: pointer to the global information
 * @sdata: virtual interface this station belongs to
 * @ptk: peer keys negotiated with this station, if any
 * @ptk_idx: last installed peer key index
 * @rate_ctrl: rate control algorithm reference
 * @rate_ctrl_lock: spinlock used to protect rate control data
 *        (data inside the algorithm, so serializes calls there)
 * @rate_ctrl_priv: rate control private per-STA pointer
 * @lock: used for locking all fields that require locking, see comments
 *        in the header file.
 * @drv_deliver_wk: used for delivering frames after driver PS unblocking
 * @listen_interval: listen interval of this station, when we're acting as AP
 * @_flags: STA flags, see &enum ieee80211_sta_info_flags, do not use directly
 * @ps_lock: used for powersave (when mac80211 is the AP) related locking
 * @ps_tx_buf: buffers (per AC) of frames to transmit to this station
 *        when it leaves power saving state or polls
 * @tx_filtered: buffers (per AC) of frames we already tried to
 *        transmit but were filtered by hardware due to STA having
 *        entered power saving state, these are also delivered to
 *        the station when it leaves powersave or polls for frames
 * @driver_buffered_tids: bitmap of TIDs the driver has data buffered on
 * @txq_buffered_tids: bitmap of TIDs that mac80211 has txq data buffered on
 * @assoc_at: clock boottime (in ns) of last association
 * @last_connected: time (in seconds) when a station got connected
 * @last_seq_ctrl: last received seq/frag number from this STA (per TID
 *        plus one for non-QoS frames)
 * @tid_seq: per-TID sequence numbers for sending to this STA
 * @airtime: per-AC struct airtime_info describing airtime statistics for this
 *        station
 * @airtime_weight: station weight for airtime fairness calculation purposes
 * @ampdu_mlme: A-MPDU state machine state
 * @mesh: mesh STA information
 * @debugfs_dir: debug filesystem directory dentry
 * @dead: set to true when sta is unlinked
 * @removed: set to true when sta is being removed from sta_list
 * @uploaded: set to true when sta is uploaded to the driver
 * @sta: station information we share with the driver
 * @sta_state: duplicates information about station state (for debug)
 * @rcu_head: RCU head used for freeing this station struct
 * @cparams: CoDel parameters for this station.
 * @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED)
 * @amsdu_mesh_control: track the mesh A-MSDU format used by the peer:
 *
 *          * -1: not yet known
 *          * 0: non-mesh A-MSDU length field
 *          * 1: big-endian mesh A-MSDU length field
 *          * 2: little-endian mesh A-MSDU length field
 *
 * @fast_tx: TX fastpath information
 * @fast_rx: RX fastpath information
 * @tdls_chandef: a TDLS peer can have a wider chandef that is compatible to
 *        the BSS one.
 * @frags: fragment cache
 * @cur: storage for aggregation data
 *        &struct ieee80211_sta points either here or to deflink.agg.
 * @deflink: This is the default link STA information, for non MLO STA all link
 *        specific STA information is accessed through @deflink or through
 *        link[0] which points to address of @deflink. For MLO Link STA
 *        the first added link STA will point to deflink.
 * @link: reference to Link Sta entries. For Non MLO STA, except 1st link,
 *        i.e link[0] all links would be assigned to NULL by default and
 *        would access link information via @deflink or link[0]. For MLO
 *        STA, first link STA being added will point its link pointer to
 *        @deflink address and remaining would be allocated and the address
 *        would be assigned to link[link_id] where link_id is the id assigned
 *        by the AP.
 */
struct sta_info {
        /* General information, mostly static */
        struct list_head list, free_list;
        struct rcu_head rcu_head;
        struct rhlist_head hash_node;
        u8 addr[ETH_ALEN];
        struct ieee80211_local *local;
        struct ieee80211_sub_if_data *sdata;
        struct ieee80211_key __rcu *ptk[NUM_DEFAULT_KEYS];
        u8 ptk_idx;
        struct rate_control_ref *rate_ctrl;
        void *rate_ctrl_priv;
        spinlock_t rate_ctrl_lock;
        spinlock_t lock;

        /* fastpath state, published and looked up via RCU */
        struct ieee80211_fast_tx __rcu *fast_tx;
        struct ieee80211_fast_rx __rcu *fast_rx;

        /* mesh state is only compiled in for mesh-enabled builds */
#ifdef CONFIG_MAC80211_MESH
        struct mesh_sta *mesh;
#endif

        struct work_struct drv_deliver_wk;

        u16 listen_interval;

        bool dead;
        bool removed;

        bool uploaded;

        enum ieee80211_sta_state sta_state;

        /* use the accessors defined below */
        unsigned long _flags;

        /* STA powersave lock and frame queues */
        spinlock_t ps_lock;
        struct sk_buff_head ps_tx_buf[IEEE80211_NUM_ACS];
        struct sk_buff_head tx_filtered[IEEE80211_NUM_ACS];
        unsigned long driver_buffered_tids;
        unsigned long txq_buffered_tids;

        u64 assoc_at;
        long last_connected;

        /* Plus 1 for non-QoS frames */
        __le16 last_seq_ctrl[IEEE80211_NUM_TIDS + 1];

        /* one counter per value the QoS control TID mask can take */
        u16 tid_seq[IEEE80211_QOS_CTL_TID_MASK + 1];

        struct airtime_info airtime[IEEE80211_NUM_ACS];
        u16 airtime_weight;

        /*
         * Aggregation information, locked with lock.
         */
        struct sta_ampdu_mlme ampdu_mlme;

#ifdef CONFIG_MAC80211_DEBUGFS
        struct dentry *debugfs_dir;
#endif

        struct codel_params cparams;

        u8 reserved_tid;
        s8 amsdu_mesh_control;

        struct cfg80211_chan_def tdls_chandef;

        struct ieee80211_fragment_cache frags;

        struct ieee80211_sta_aggregates cur;
        struct link_sta_info deflink;
        struct link_sta_info __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS];

        /*
         * keep last!
         * NOTE(review): presumably because driver-private data is laid out
         * directly behind the public struct -- confirm before reordering.
         */
        struct ieee80211_sta sta;
};

/*
 * Report the mesh peer link state of @sta.
 *
 * With CONFIG_MAC80211_MESH the value is read from sta->mesh, which is
 * dereferenced unconditionally.  Without it the result is always
 * NL80211_PLINK_LISTEN.
 */
static inline enum nl80211_plink_state sta_plink_state(struct sta_info *sta)
{
#ifdef CONFIG_MAC80211_MESH
        return sta->mesh->plink_state;
#else
        return NL80211_PLINK_LISTEN;
#endif
}

static inline void set_sta_flag(struct sta_info *sta,
                                enum ieee80211_sta_info_flags flag)
{
        WARN_ON(flag == WLAN_STA_AUTH ||
                flag == WLAN_STA_ASSOC ||
                flag == WLAN_STA_AUTHORIZED);
        set_bit(flag, &sta->_flags);
}

static inline void clear_sta_flag(struct sta_info *sta,
                                  enum ieee80211_sta_info_flags flag)
{
        WARN_ON(flag == WLAN_STA_AUTH ||
                flag == WLAN_STA_ASSOC ||
                flag == WLAN_STA_AUTHORIZED);
        clear_bit(flag, &sta->_flags);
}

/* Return the test_bit() result for @flag in sta->_flags. */
static inline int test_sta_flag(struct sta_info *sta,
                                enum ieee80211_sta_info_flags flag)
{
        const unsigned long *flags = &sta->_flags;

        return test_bit(flag, flags);
}

static inline int test_and_clear_sta_flag(struct sta_info *sta,
                                          enum ieee80211_sta_info_flags flag)
{
        WARN_ON(flag == WLAN_STA_AUTH ||
                flag == WLAN_STA_ASSOC ||
                flag == WLAN_STA_AUTHORIZED);
        return test_and_clear_bit(flag, &sta->_flags);
}

static inline int test_and_set_sta_flag(struct sta_info *sta,
                                        enum ieee80211_sta_info_flags flag)
{
        WARN_ON(flag == WLAN_STA_AUTH ||
                flag == WLAN_STA_ASSOC ||
                flag == WLAN_STA_AUTHORIZED);
        return test_and_set_bit(flag, &sta->_flags);
}

int sta_info_move_state(struct sta_info *sta,
                        enum ieee80211_sta_state new_state);

/*
 * Move @sta to @new_state through sta_info_move_state().
 *
 * Warns once if WLAN_STA_INSERTED is already set on @sta, and warns once
 * if the state change returns non-zero; the return value is otherwise
 * discarded.
 */
static inline void sta_info_pre_move_state(struct sta_info *sta,
                                           enum ieee80211_sta_state new_state)
{
        int err;

        WARN_ON_ONCE(test_sta_flag(sta, WLAN_STA_INSERTED));

        err = sta_info_move_state(sta, new_state);
        WARN_ON_ONCE(err);
}


void ieee80211_assign_tid_tx(struct sta_info *sta, int tid,
                             struct tid_ampdu_tx *tid_tx);

/*
 * Fetch sta->ampdu_mlme.tid_tx[tid] via rcu_dereference_protected(); the
 * lockdep condition accepts either sta->lock or the wiphy mutex being held.
 *
 * Expands to a plain expression with no trailing semicolon, so it can be
 * used inside conditions and larger expressions as well as in ordinary
 * "x = ...;" statements.
 */
#define rcu_dereference_protected_tid_tx(sta, tid)                        \
        rcu_dereference_protected((sta)->ampdu_mlme.tid_tx[tid],        \
                                  lockdep_is_held(&(sta)->lock) ||        \
                                  lockdep_is_held(&(sta)->local->hw.wiphy->mtx))

/* Maximum number of frames to buffer per power saving station per AC */
#define STA_MAX_TX_BUFFER        64

/* Minimum buffered frame expiry time. If STA uses listen interval that is
 * smaller than this value, the minimum value here is used instead. */
#define STA_TX_BUFFER_EXPIRE (10 * HZ)

/* How often station data is cleaned up (e.g., expiration of buffered frames)
 */
#define STA_INFO_CLEANUP_INTERVAL (10 * HZ)

struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local,
                                         const u8 *addr);

/*
 * Get a STA info, must be under RCU read lock.
 */
struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
                              const u8 *addr);

struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
                                  const u8 *addr);

/* user must hold wiphy mutex or be in RCU critical section */
struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local,
                                       const u8 *sta_addr, const u8 *vif_addr);

/*
 * Iterate @_sta over the entries that sta_info_hash_lookup() returns for
 * @_addr, linked through their hash_node member; @_tmp is the list cursor.
 * Built on rhl_for_each_entry_rcu() -- presumably requires the RCU read
 * lock (or equivalent protection), confirm at the call site.
 */
#define for_each_sta_info(local, _addr, _sta, _tmp)                        \
        rhl_for_each_entry_rcu(_sta, _tmp,                                \
                               sta_info_hash_lookup(local, _addr), hash_node)

struct rhlist_head *link_sta_info_hash_lookup(struct ieee80211_local *local,
                                              const u8 *addr);

/*
 * Iterate @_sta over the entries that link_sta_info_hash_lookup() returns
 * for @_addr, linked through their link_hash_node member; @_tmp is the
 * list cursor.  Built on rhl_for_each_entry_rcu() -- presumably requires
 * the RCU read lock (or equivalent protection), confirm at the call site.
 */
#define for_each_link_sta_info(local, _addr, _sta, _tmp)                \
        rhl_for_each_entry_rcu(_sta, _tmp,                                \
                               link_sta_info_hash_lookup(local, _addr),        \
                               link_hash_node)

struct link_sta_info *
link_sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr);

/*
 * Get STA info by index, BROKEN!
 */
struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata,
                                     int idx);
/*
 * Create a new STA info, caller owns returned structure
 * until sta_info_insert().
 */
struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
                                const u8 *addr, gfp_t gfp);
struct sta_info *sta_info_alloc_with_link(struct ieee80211_sub_if_data *sdata,
                                          const u8 *mld_addr,
                                          unsigned int link_id,
                                          const u8 *link_addr,
                                          gfp_t gfp);

void sta_info_free(struct ieee80211_local *local, struct sta_info *sta);

/*
 * Insert STA info into hash table/list, returns zero or
 * -EEXIST (if the same MAC address is already present).
 *
 * Calling the non-rcu version makes the caller relinquish ownership
 * of the STA info; the _rcu version calls rcu_read_lock() and must
 * be called without it held.
 */
int sta_info_insert(struct sta_info *sta);
int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU);

int __must_check __sta_info_destroy(struct sta_info *sta);
int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata,
                          const u8 *addr);
int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata,
                              const u8 *addr);

void sta_info_recalc_tim(struct sta_info *sta);

int sta_info_init(struct ieee80211_local *local);
void sta_info_stop(struct ieee80211_local *local);

/**
 * __sta_info_flush - flush matching STA entries from the STA table
 *
 * Return: the number of removed STA entries.
 *
 * @sdata: sdata to remove all stations from
 * @vlans: if the given interface is an AP interface, also flush VLANs
 * @link_id: if given (>=0), all those STA entries using @link_id only
 *             will be removed. If -1 is passed, all STA entries will be
 *             removed.
 */
int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans,
                     int link_id);

/**
 * sta_info_flush - flush matching STA entries from the STA table
 *
 * Return: the number of removed STA entries.
 *
 * @sdata: sdata to remove all stations from
 * @link_id: if given (>=0), all those STA entries using @link_id only
 *             will be removed. If -1 is passed, all STA entries will be
 *             removed.
 */
static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata,
                                 int link_id)
{
        return __sta_info_flush(sdata, false, link_id);
}

void sta_set_rate_info_tx(struct sta_info *sta,
                          const struct ieee80211_tx_rate *rate,
                          struct rate_info *rinfo);
void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
                   bool tidstats);

u32 sta_get_expected_throughput(struct sta_info *sta);

void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
                          unsigned long exp_time);

int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id);
void ieee80211_sta_free_link(struct sta_info *sta, unsigned int link_id);
int ieee80211_sta_activate_link(struct sta_info *sta, unsigned int link_id);
void ieee80211_sta_remove_link(struct sta_info *sta, unsigned int link_id);

void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta);
void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta);
void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta);

unsigned long ieee80211_sta_last_active(struct sta_info *sta);

void ieee80211_sta_set_max_amsdu_subframes(struct sta_info *sta,
                                           const u8 *ext_capab,
                                           unsigned int ext_capab_len);

void __ieee80211_sta_recalc_aggregates(struct sta_info *sta, u16 active_links);

/*
 * Rate type tag placed in the STA_STATS_FIELD_TYPE bits of an encoded rate.
 * The numeric values are spelled out because they end up in that bit field.
 */
enum sta_stats_type {
        STA_STATS_RATE_TYPE_INVALID     = 0,
        STA_STATS_RATE_TYPE_LEGACY      = 1,
        STA_STATS_RATE_TYPE_HT          = 2,
        STA_STATS_RATE_TYPE_VHT         = 3,
        STA_STATS_RATE_TYPE_HE          = 4,
        STA_STATS_RATE_TYPE_S1G         = 5,
        STA_STATS_RATE_TYPE_EHT         = 6,
};

/*
 * Bit layout of an encoded rate word.
 *
 * Common to all rate types:
 *   bits 12:8   BW
 *   bit  13     SGI
 *   bits 16:14  TYPE (enum sta_stats_type)
 *
 * Bits 7:0 depend on TYPE:
 *   HT           MCS in 7:0
 *   legacy       rate index in 3:0, band in 7:4
 *   VHT/HE/EHT   MCS in 3:0, NSS in 7:4
 *
 * Bits 22:17 depend on TYPE as well and overlap between HE and EHT:
 *   HE    RU in 19:17, GI in 21:20, DCM in bit 22
 *   EHT   RU in 20:17, GI in 22:21
 */
#define STA_STATS_FIELD_HT_MCS                GENMASK( 7,  0)
#define STA_STATS_FIELD_LEGACY_IDX        GENMASK( 3,  0)
#define STA_STATS_FIELD_LEGACY_BAND        GENMASK( 7,  4)
#define STA_STATS_FIELD_VHT_MCS                GENMASK( 3,  0)
#define STA_STATS_FIELD_VHT_NSS                GENMASK( 7,  4)
#define STA_STATS_FIELD_HE_MCS                GENMASK( 3,  0)
#define STA_STATS_FIELD_HE_NSS                GENMASK( 7,  4)
#define STA_STATS_FIELD_EHT_MCS                GENMASK( 3,  0)
#define STA_STATS_FIELD_EHT_NSS                GENMASK( 7,  4)
#define STA_STATS_FIELD_BW                GENMASK(12,  8)
#define STA_STATS_FIELD_SGI                GENMASK(13, 13)
#define STA_STATS_FIELD_TYPE                GENMASK(16, 14)
#define STA_STATS_FIELD_HE_RU                GENMASK(19, 17)
#define STA_STATS_FIELD_HE_GI                GENMASK(21, 20)
#define STA_STATS_FIELD_HE_DCM                GENMASK(22, 22)
#define STA_STATS_FIELD_EHT_RU                GENMASK(20, 17)
#define STA_STATS_FIELD_EHT_GI                GENMASK(22, 21)

/* Place value _v into field _n / extract field _n from encoded word _v */
#define STA_STATS_FIELD(_n, _v)                FIELD_PREP(STA_STATS_FIELD_ ## _n, _v)
#define STA_STATS_GET(_n, _v)                FIELD_GET(STA_STATS_FIELD_ ## _n, _v)

/* Encoded word returned when the rate cannot be encoded */
#define STA_STATS_RATE_INVALID                0

/*
 * Pack the rate information of RX status @s into a single u32 using the
 * STA_STATS_FIELD_* layout.
 *
 * The bandwidth and short-GI bits are always filled in; the remaining
 * fields depend on s->encoding.  An unknown encoding triggers a warning
 * and yields STA_STATS_RATE_INVALID.
 */
static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s)
{
        u32 encoded = STA_STATS_FIELD(BW, s->bw);

        if (s->enc_flags & RX_ENC_FLAG_SHORT_GI)
                encoded |= STA_STATS_FIELD(SGI, 1);

        switch (s->encoding) {
        case RX_ENC_LEGACY:
                encoded |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_LEGACY) |
                           STA_STATS_FIELD(LEGACY_BAND, s->band) |
                           STA_STATS_FIELD(LEGACY_IDX, s->rate_idx);
                break;
        case RX_ENC_HT:
                encoded |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_HT) |
                           STA_STATS_FIELD(HT_MCS, s->rate_idx);
                break;
        case RX_ENC_VHT:
                encoded |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_VHT) |
                           STA_STATS_FIELD(VHT_NSS, s->nss) |
                           STA_STATS_FIELD(VHT_MCS, s->rate_idx);
                break;
        case RX_ENC_HE:
                encoded |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_HE) |
                           STA_STATS_FIELD(HE_NSS, s->nss) |
                           STA_STATS_FIELD(HE_MCS, s->rate_idx) |
                           STA_STATS_FIELD(HE_GI, s->he_gi) |
                           STA_STATS_FIELD(HE_RU, s->he_ru) |
                           STA_STATS_FIELD(HE_DCM, s->he_dcm);
                break;
        case RX_ENC_EHT:
                encoded |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_EHT) |
                           STA_STATS_FIELD(EHT_NSS, s->nss) |
                           STA_STATS_FIELD(EHT_MCS, s->rate_idx) |
                           STA_STATS_FIELD(EHT_GI, s->eht.gi) |
                           STA_STATS_FIELD(EHT_RU, s->eht.ru);
                break;
        default:
                WARN_ON(1);
                return STA_STATS_RATE_INVALID;
        }

        return encoded;
}

#endif /* STA_INFO_H */































    1 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_STRUCT_H
#define _LINUX_FS_STRUCT_H

#include <linux/path.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>

/*
 * Holds a root path and a pwd path together with the lock that
 * get_fs_root() / get_fs_pwd() take while copying them out.
 */
struct fs_struct {
        /* NOTE(review): presumably the number of users sharing this struct — confirm */
        int users;
        /* Held by get_fs_root()/get_fs_pwd() while root/pwd are read. */
        spinlock_t lock;
        /* NOTE(review): readers/writers of this seqcount are not visible in this header */
        seqcount_spinlock_t seq;
        int umask;
        int in_exec;
        /* root path and pwd (presumably current working directory) path */
        struct path root, pwd;
} __randomize_layout;

extern struct kmem_cache *fs_cachep;

extern void exit_fs(struct task_struct *);
extern void set_fs_root(struct fs_struct *, const struct path *);
extern void set_fs_pwd(struct fs_struct *, const struct path *);
extern struct fs_struct *copy_fs_struct(struct fs_struct *);
extern void free_fs_struct(struct fs_struct *);
extern int unshare_fs_struct(void);

/*
 * Copy fs->root into *root and call path_get() on the copy, holding
 * fs->lock across both steps.
 * NOTE(review): path_get() presumably takes a reference that the caller
 * must later release — confirm.
 */
static inline void get_fs_root(struct fs_struct *fs, struct path *root)
{
        spin_lock(&fs->lock);
        *root = fs->root;
        path_get(root);
        spin_unlock(&fs->lock);
}

/*
 * Copy fs->pwd into *pwd and call path_get() on the copy, holding
 * fs->lock across both steps.
 * NOTE(review): path_get() presumably takes a reference that the caller
 * must later release — confirm.
 */
static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
{
        spin_lock(&fs->lock);
        *pwd = fs->pwd;
        path_get(pwd);
        spin_unlock(&fs->lock);
}

extern bool current_chrooted(void);

#endif /* _LINUX_FS_STRUCT_H */






































   11 





    9 




































































































    8 






























    8 





    1 









    8 



    7 
    1 

    1 














   13 
    2 










    7 
























    8 


    8 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
// SPDX-License-Identifier: GPL-2.0-only
/*
 * This implements the various checks for CONFIG_HARDENED_USERCOPY*,
 * which are designed to protect kernel memory from needless exposure
 * and overwrite under many unintended conditions. This code is based
 * on PAX_USERCOPY, which is:
 *
 * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source
 * Security Inc.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/kstrtox.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/thread_info.h>
#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
#include <asm/sections.h>
#include "slab.h"

/*
 * Classify the range [obj, obj + len) against the current task's stack.
 *
 * Returns:
 *        NOT_STACK: the range does not touch the stack at all
 *        GOOD_FRAME: the range lies fully inside a valid stack frame
 *        GOOD_STACK: the range lies inside the stack (no exact frame check)
 *        BAD_STACK: the range straddles a stack boundary, fails the frame
 *                   check, or lies on the wrong side of the stack pointer
 */
static noinline int check_stack_object(const void *obj, unsigned long len)
{
        const void * const stack_lo = task_stack_page(current);
        const void * const stack_hi = stack_lo + THREAD_SIZE;
        const void * const obj_end = obj + len;
        int frame_state;

        /* Entirely below or entirely above the stack: not our business. */
        if (obj_end <= stack_lo || obj >= stack_hi)
                return NOT_STACK;

        /*
         * At least one end is inside the stack at this point, so any end
         * that falls outside means the object only partially overlaps it.
         */
        if (obj < stack_lo || obj_end > stack_hi)
                return BAD_STACK;

        /* A non-zero answer from the arch frame walker is final. */
        frame_state = arch_within_stack_frames(stack_lo, stack_hi, obj, len);
        if (frame_state != 0)
                return frame_state;

        /* Last resort: compare against the stack pointer when available. */
#ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER
        if (IS_ENABLED(CONFIG_STACK_GROWSUP)) {
                if (obj_end > (void *)current_stack_pointer)
                        return BAD_STACK;
        } else if (obj < (void *)current_stack_pointer) {
                return BAD_STACK;
        }
#endif

        return GOOD_STACK;
}

/*
 * Called when CONFIG_HARDENED_USERCOPY detects an unexpected state during a
 * copy_from_user() or copy_to_user() call.  __check_object_size() runs
 * several checks on the buffer: ordinary stack buffers should never trip
 * them, while kernel text addresses always will.  For cache objects only
 * the whitelisted byte range of the cache (its usersize and useroffset
 * fields) may be accessed; to change that range, create the cache with the
 * usercopy-aware kmem_cache_create_usercopy() and audit the range carefully.
 *
 * @name:    what kind of object was hit ("unknown?!" is printed when NULL)
 * @detail:  optional extra description, printed in single quotes when set
 * @to_user: true for a copy to user space ("exposure"), false for a copy
 *           from user space ("overwrite")
 * @offset:  offset reported in the message
 * @len:     size reported in the message
 *
 * Logs at emergency level and then calls BUG(); it does not return.
 */
void __noreturn usercopy_abort(const char *name, const char *detail,
                               bool to_user, unsigned long offset,
                               unsigned long len)
{
        const char *kind = to_user ? "exposure" : "overwrite";
        const char *direction = to_user ? "from" : "to";
        const char *what = name ? name : "unknown?!";
        const char *open_quote = detail ? " '" : "";
        const char *detail_text = detail ? detail : "";
        const char *close_quote = detail ? "'" : "";

        pr_emerg("Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n",
                 kind, direction, what, open_quote, detail_text, close_quote,
                 offset, len);

        /*
         * do_group_exit() would be more forceful, but BUG() is what hooks
         * the lock-breaking and per-arch Oops code, so it is used instead.
         */
        BUG();
}

/*
 * Returns true if any part of [ptr, ptr + n) overlaps [low, high).
 * Two half-open ranges overlap exactly when each one starts before the
 * other one ends.
 */
static bool overlaps(const unsigned long ptr, unsigned long n,
                     unsigned long low, unsigned long high)
{
        const unsigned long first = ptr;
        const unsigned long past_end = first + n;

        return first < high && past_end > low;
}

/*
 * Abort the copy if [ptr, ptr + n) touches the kernel text, either through
 * its primary mapping (_stext.._etext) or through its lm_alias() mapping.
 */
static inline void check_kernel_text_object(const unsigned long ptr,
                                            unsigned long n, bool to_user)
{
        unsigned long text_start = (unsigned long)_stext;
        unsigned long text_end = (unsigned long)_etext;
        unsigned long alias_start, alias_end;

        if (overlaps(ptr, n, text_start, text_end))
                usercopy_abort("kernel text", NULL, to_user,
                               ptr - text_start, n);

        /*
         * Some architectures map the kernel image a second time, so more
         * than one virtual kernel address points at the text.  This is
         * usually the case when there is a separate linear mapping of
         * physical memory, i.e. __pa() is not simply the reverse of __va().
         * lm_alias() returning a different address reveals such a mapping;
         * when it returns the same address there is nothing more to check.
         */
        alias_start = (unsigned long)lm_alias(text_start);
        if (alias_start != text_start) {
                alias_end = (unsigned long)lm_alias(text_end);
                if (overlaps(ptr, n, alias_start, alias_end))
                        usercopy_abort("linear kernel text", NULL, to_user,
                                       ptr - alias_start, n);
        }
}

/*
 * Abort the copy when the address itself is nonsensical: the object's last
 * byte wraps around the end of the address space, or ptr satisfies
 * ZERO_OR_NULL_PTR().  The wrap test runs first, as it decides which
 * message is reported when both conditions hold.
 */
static inline void check_bogus_address(const unsigned long ptr, unsigned long n,
                                       bool to_user)
{
        const unsigned long last_byte = ptr + (n - 1);

        /* The last byte ending up below the first means the range wrapped. */
        if (last_byte < ptr)
                usercopy_abort("wrapped address", NULL, to_user, 0, ptr + n);

        /* NULL or a zero-size allocation marker. */
        if (ZERO_OR_NULL_PTR(ptr))
                usercopy_abort("null address", NULL, to_user, ptr, n);
}

/*
 * Validate [ptr, ptr + n) against the allocation that contains ptr.
 *
 * Handled in this order:
 *  - kmap addresses: the copy must stay within the page;
 *  - vmalloc addresses (only when page faults are not disabled): the copy
 *    must stay within the vmap area found for the address;
 *  - addresses for which virt_addr_valid() is false: not checked;
 *  - slab folios: delegated to __check_heap_object();
 *  - large folios: the copy must stay within the folio.
 * Anything else is accepted without further checks.
 */
static inline void check_heap_object(const void *ptr, unsigned long n,
                                     bool to_user)
{
        unsigned long addr = (unsigned long)ptr;
        unsigned long offset;
        struct folio *folio;

        if (is_kmap_addr(ptr)) {
                offset = offset_in_page(ptr);
                if (n > PAGE_SIZE - offset)
                        usercopy_abort("kmap", NULL, to_user, offset, n);
                return;
        }

        if (is_vmalloc_addr(ptr) && !pagefault_disabled()) {
                struct vmap_area *area = find_vmap_area(addr);

                if (!area)
                        usercopy_abort("vmalloc", "no area", to_user, 0, n);

                /* Report the offset relative to the start of the area. */
                if (n > area->va_end - addr)
                        usercopy_abort("vmalloc", NULL, to_user,
                                       addr - area->va_start, n);
                return;
        }

        if (!virt_addr_valid(ptr))
                return;

        folio = virt_to_folio(ptr);

        if (folio_test_slab(folio)) {
                /* The slab allocator checks flags and size itself. */
                __check_heap_object(ptr, n, folio_slab(folio), to_user);
                return;
        }

        if (!folio_test_large(folio))
                return;

        offset = ptr - folio_address(folio);
        if (n > folio_size(folio) - offset)
                usercopy_abort("page alloc", NULL, to_user, offset, n);
}

/*
 * While this static key is enabled, __check_object_size() returns before
 * running any check.  It is defined false and is switched on only by
 * set_hardened_usercopy() when enable_checks is false.
 */
static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);

/*
 * Validate a kernel buffer that is about to be copied to or from user space.
 * The object must be:
 * - not a bogus address
 * - fully contained by the stack (or stack frame, when available)
 * - fully within a SLAB object (or object whitelist area, when available)
 * - not in kernel text
 *
 * Any violation ends in usercopy_abort().  Nothing is checked when the
 * bypass static key is enabled or when n is zero.
 */
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
{
        int stack_state;

        if (static_branch_unlikely(&bypass_usercopy_checks))
                return;

        /* A zero-sized copy cannot touch anything. */
        if (n == 0)
                return;

        /* Reject wrapped and NULL-ish addresses first. */
        check_bogus_address((const unsigned long)ptr, n, to_user);

        stack_state = check_stack_object(ptr, n);

        /*
         * The object is either in the correct frame (when that can be
         * checked) or just generally on the process stack (when frame
         * checking is not available): no further checks are needed.
         */
        if (stack_state == GOOD_FRAME || stack_state == GOOD_STACK)
                return;

        /* Anything other than "not touching the process stack" is fatal. */
        if (stack_state != NOT_STACK) {
                unsigned long offset = 0;

#ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER
                /* Report the distance between the object and the stack pointer. */
                if (IS_ENABLED(CONFIG_STACK_GROWSUP))
                        offset = ptr - (void *)current_stack_pointer;
                else
                        offset = (void *)current_stack_pointer - ptr;
#endif
                usercopy_abort("process stack", NULL, to_user, offset, n);
        }

        /* Not on the stack: check for a bad heap object. */
        check_heap_object(ptr, n, to_user);

        /* Finally make sure kernel text is neither exposed nor overwritten. */
        check_kernel_text_object((const unsigned long)ptr, n, to_user);
}
EXPORT_SYMBOL(__check_object_size);

/* Boot-time switch for the checks; consulted by set_hardened_usercopy(). */
static bool enable_checks __initdata = true;

/*
 * Handler for the "hardened_usercopy=" boot parameter: parse the value as a
 * boolean into enable_checks and warn when it cannot be parsed.  Always
 * returns 1, including for an invalid value.
 */
static int __init parse_hardened_usercopy(char *str)
{
        const int err = kstrtobool(str, &enable_checks);

        if (err != 0)
                pr_warn("Invalid option string for hardened_usercopy: '%s'\n",
                        str);

        return 1;
}

__setup("hardened_usercopy=", parse_hardened_usercopy);

/*
 * Late initcall: when the checks were disabled (enable_checks is false),
 * enable the bypass static key so that __check_object_size() returns
 * immediately.
 *
 * Returns 0.  An initcall reports success with 0 (it is a __setup() handler
 * that returns 1), and there is no failure path here.
 */
static int __init set_hardened_usercopy(void)
{
        if (!enable_checks)
                static_branch_enable(&bypass_usercopy_checks);

        return 0;
}

late_initcall(set_hardened_usercopy);






























































































































































































































































































































































































































































































































































































































































































    2 












































    2 




























    2 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2002 Andi Kleen, SuSE Labs.
 * Thanks to Ben LaHaise for precious feedback.
 */
#include <linux/highmem.h>
#include <linux/memblock.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/debugfs.h>
#include <linux/pfn.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/pci.h>
#include <linux/vmalloc.h>
#include <linux/libnvdimm.h>
#include <linux/vmstat.h>
#include <linux/kernel.h>
#include <linux/cc_platform.h>
#include <linux/set_memory.h>
#include <linux/memregion.h>

#include <asm/e820/api.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <linux/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/memtype.h>
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>

#include "../mm_internal.h"

/*
 * The current flushing context - we pass it instead of 5 arguments:
 *
 * NOTE(review): the code that fills in and consumes these fields is outside
 * this part of the file; the per-field notes below are inferred from names
 * and types only and must be confirmed against the users.
 */
struct cpa_data {
        /* presumably the virtual address (or array of addresses) being changed — confirm */
        unsigned long        *vaddr;
        pgd_t                *pgd;
        /* presumably the protection bits to set and to clear — confirm */
        pgprot_t        mask_set;
        pgprot_t        mask_clr;
        unsigned long        numpages;
        unsigned long        curpage;
        unsigned long        pfn;
        /* presumably a mask of the CPA_* flag bits defined in this file — confirm */
        unsigned int        flags;
        /* three single-bit flags packed into one unsigned int */
        unsigned int        force_split                : 1,
                        force_static_prot        : 1,
                        force_flush_all                : 1;
        struct page        **pages;
};

/*
 * Categories of CPA warnings.
 * NOTE(review): how these values are compared against cpa_warn_level is not
 * visible in this part of the file.
 */
enum cpa_warn {
        CPA_CONFLICT,
        CPA_PROTECT,
        CPA_DETECT,
};

/* Compile-time constant warning level, fixed to CPA_PROTECT. */
static const int cpa_warn_level = CPA_PROTECT;

/*
 * Serialize cpa() (for !DEBUG_PAGEALLOC, which uses large identity mappings)
 * using cpa_lock, so that no other CPU holding stale large TLB entries can
 * change a page attribute in parallel with a CPU that is splitting a large
 * page entry while changing the attribute.
 */
static DEFINE_SPINLOCK(cpa_lock);

/*
 * CPA flag bits; each is a distinct power of two so they can be OR-ed.
 * NOTE(review): presumably stored in cpa_data.flags — confirm.
 */
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */

/*
 * Convert a page cache mode into a pgprot_t by wrapping the value returned
 * by cachemode2protval() with __pgprot().
 */
static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
{
        return __pgprot(cachemode2protval(pcm));
}

#ifdef CONFIG_PROC_FS
/* Per page-table-level counters, reported by arch_report_meminfo(). */
static unsigned long direct_pages_count[PG_LEVEL_NUM];

/*
 * Add @pages to the counter for @level while holding pgd_lock.
 * @level is used directly as an index into direct_pages_count and is not
 * range-checked here.
 */
void update_page_count(int level, unsigned long pages)
{
        /* Protect against CPA */
        spin_lock(&pgd_lock);
        direct_pages_count[level] += pages;
        spin_unlock(&pgd_lock);
}

/*
 * Account for one page at @level having been split: that level loses one
 * page and the level below gains PTRS_PER_PTE pages.  Nothing happens when
 * the counter for @level is already zero.  While the system is in the
 * SYSTEM_RUNNING state, a 2M or 1G split is also counted as a vm event.
 */
static void split_page_count(int level)
{
        if (!direct_pages_count[level])
                return;

        direct_pages_count[level]--;
        direct_pages_count[level - 1] += PTRS_PER_PTE;

        if (system_state != SYSTEM_RUNNING)
                return;

        switch (level) {
        case PG_LEVEL_2M:
                count_vm_event(DIRECT_MAP_LEVEL2_SPLIT);
                break;
        case PG_LEVEL_1G:
                count_vm_event(DIRECT_MAP_LEVEL3_SPLIT);
                break;
        default:
                break;
        }
}

/*
 * Print the direct-map page counters to @m, converted to kB: a 4K page is
 * 4 kB (<< 2), a 2M page 2048 kB (<< 11), a 4M page 4096 kB (<< 12) and a
 * 1G page 2^20 kB (<< 20).  The middle level is labelled 2M on 64-bit and
 * PAE builds and 4M otherwise; the 1G line is printed only when
 * direct_gbpages is set.
 */
void arch_report_meminfo(struct seq_file *m)
{
        const unsigned long *count = direct_pages_count;

        seq_printf(m, "DirectMap4k:    %8lu kB\n", count[PG_LEVEL_4K] << 2);
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
        seq_printf(m, "DirectMap2M:    %8lu kB\n", count[PG_LEVEL_2M] << 11);
#else
        seq_printf(m, "DirectMap4M:    %8lu kB\n", count[PG_LEVEL_2M] << 12);
#endif
        if (!direct_gbpages)
                return;

        seq_printf(m, "DirectMap1G:    %8lu kB\n", count[PG_LEVEL_1G] << 20);
}
#else
/* No counters without CONFIG_PROC_FS: splitting a page needs no accounting. */
static inline void split_page_count(int level) { }
#endif

#ifdef CONFIG_X86_CPA_STATISTICS

/*
 * CPA statistics counters, bumped by the cpa_inc_*() helpers below and
 * printed by cpastats_show().
 */
static unsigned long cpa_1g_checked;
static unsigned long cpa_1g_sameprot;
static unsigned long cpa_1g_preserved;
static unsigned long cpa_2m_checked;
static unsigned long cpa_2m_sameprot;
static unsigned long cpa_2m_preserved;
static unsigned long cpa_4k_install;

/* Count one more 1G page as checked. */
static inline void cpa_inc_1g_checked(void)
{
        cpa_1g_checked++;
}

/* Count one more 2M page as checked. */
static inline void cpa_inc_2m_checked(void)
{
        cpa_2m_checked++;
}

/*
 * Count one more 4K install.  The increment is wrapped in data_race().
 * NOTE(review): presumably because this counter can be bumped without a
 * lock and the race is considered benign — confirm against the callers.
 */
static inline void cpa_inc_4k_install(void)
{
        data_race(cpa_4k_install++);
}

/*
 * Bump the "sameprot" counter for a large page: the 1G counter when @level
 * is PG_LEVEL_1G, the 2M counter for any other level.
 */
static inline void cpa_inc_lp_sameprot(int level)
{
        unsigned long *counter = (level == PG_LEVEL_1G) ? &cpa_1g_sameprot
                                                        : &cpa_2m_sameprot;

        (*counter)++;
}

/*
 * Bump the "preserved" counter for a large page: the 1G counter when @level
 * is PG_LEVEL_1G, the 2M counter for any other level.
 */
static inline void cpa_inc_lp_preserved(int level)
{
        unsigned long *counter = (level == PG_LEVEL_1G) ? &cpa_1g_preserved
                                                        : &cpa_2m_preserved;

        (*counter)++;
}

/*
 * seq_file show callback: print the seven CPA statistics counters, one per
 * line.  @p is unused.  Always returns 0.
 */
static int cpastats_show(struct seq_file *m, void *p)
{
        seq_printf(m, "1G pages checked:     %16lu\n", cpa_1g_checked);
        seq_printf(m, "1G pages sameprot:    %16lu\n", cpa_1g_sameprot);
        seq_printf(m, "1G pages preserved:   %16lu\n", cpa_1g_preserved);
        seq_printf(m, "2M pages checked:     %16lu\n", cpa_2m_checked);
        seq_printf(m, "2M pages sameprot:    %16lu\n", cpa_2m_sameprot);
        seq_printf(m, "2M pages preserved:   %16lu\n", cpa_2m_preserved);
        seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
        return 0;
}

/*
 * open() handler for the statistics file: hooks cpastats_show() up through
 * single_open() with no private data. Returns whatever single_open() returns.
 */
static int cpastats_open(struct inode *inode, struct file *file)
{
        return single_open(file, cpastats_show, NULL);
}

/*
 * File operations for the "cpa_stats" debugfs file. Reading and seeking go
 * through the generic seq_file helpers; release pairs with the single_open()
 * done in cpastats_open().
 */
static const struct file_operations cpastats_fops = {
        .open                = cpastats_open,
        .read                = seq_read,
        .llseek                = seq_lseek,
        .release        = single_release,
};

/*
 * Register the owner-readable (S_IRUSR) "cpa_stats" file under
 * arch_debugfs_dir. The result of debugfs_create_file() is not checked;
 * the initcall always reports success.
 */
static int __init cpa_stats_init(void)
{
        debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL,
                            &cpastats_fops);
        return 0;
}
late_initcall(cpa_stats_init);
#else
/* CONFIG_X86_CPA_STATISTICS is off: the accounting helpers compile to nothing. */
static inline void cpa_inc_1g_checked(void) { }
static inline void cpa_inc_2m_checked(void) { }
static inline void cpa_inc_4k_install(void) { }
static inline void cpa_inc_lp_sameprot(int level) { }
static inline void cpa_inc_lp_preserved(int level) { }
#endif


/*
 * Half-open range test: returns non-zero when @addr lies in [@start, @end),
 * i.e. @end itself is excluded.
 */
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
        return !(addr < start || addr >= end);
}

/*
 * Closed range test: returns 1 when @addr lies in [@start, @end] (both
 * bounds included), 0 otherwise.
 */
static inline int
within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
{
        if (addr < start || addr > end)
                return 0;

        return 1;
}

#ifdef CONFIG_X86_64

/*
 * The kernel image is mapped into two places in the virtual address space
 * (addresses without KASLR, of course):
 *
 * 1. The kernel direct map (0xffff880000000000)
 * 2. The "high kernel map" (0xffffffff81000000)
 *
 * We actually execute out of #2. If we get the address of a kernel symbol, it
 * points to #2, but almost all physical-to-virtual translations point to #1.
 *
 * This is so that we can have both a directmap of all physical memory *and*
 * take full advantage of the limited (s32) immediate addressing range (2G)
 * of x86_64.
 *
 * See Documentation/arch/x86/x86_64/mm.rst for more detail.
 */

/* PFN of the page holding _text, i.e. the first PFN backing the highmap. */
static inline unsigned long highmap_start_pfn(void)
{
        return PFN_DOWN(__pa_symbol(_text));
}

/*
 * Last PFN backing the highmap: _brk_end rounded up to a PMD_SIZE boundary,
 * minus one byte so the translated address stays inside the kernel mapping.
 */
static inline unsigned long highmap_end_pfn(void)
{
        /* Do not reference physical address outside the kernel. */
        unsigned long last_byte = roundup(_brk_end, PMD_SIZE) - 1;

        return PFN_DOWN(__pa_symbol(last_byte));
}

/*
 * Kernel text has an alias mapping at a high address, known here as
 * "highmap". Report whether @pfn falls inside the PFN range backing it
 * (both ends inclusive).
 */
static bool __cpa_pfn_in_highmap(unsigned long pfn)
{
        unsigned long first = highmap_start_pfn();
        unsigned long last = highmap_end_pfn();

        return within_inclusive(pfn, first, last);
}

#else

/* Variant for builds without CONFIG_X86_64: no PFN is ever in a highmap. */
static bool __cpa_pfn_in_highmap(unsigned long pfn)
{
        /* There is no highmap on 32-bit */
        return false;
}

#endif

/*
 * See set_mce_nospec().
 *
 * Machine check recovery code needs to change cache mode of poisoned pages to
 * UC to avoid speculative access logging another error. But passing the
 * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a
 * speculative access. So we cheat and flip the top bit of the address. This
 * works fine for the code that updates the page tables. But at the end of the
 * process we need to flush the TLB and cache and the non-canonical address
 * causes a #GP fault when used by the INVLPG and CLFLUSH instructions.
 *
 * But in the common case we already have a canonical address. This code
 * will fix the top bit if needed and is a no-op otherwise.
 */
static inline unsigned long fix_addr(unsigned long addr)
{
#ifdef CONFIG_X86_64
        /*
         * Shift the top bit out, then shift back as a signed value so that
         * bit 62 is replicated into bit 63.
         */
        return (long)(addr << 1) >> 1;
#else
        /* Other builds return the address unchanged. */
        return addr;
#endif
}

/*
 * Return the virtual address of entry @idx of the request described by @cpa.
 *
 * Three request layouts are handled, checked in this order:
 *  - CPA_PAGES_ARRAY: cpa->pages[] holds struct page pointers; a highmem
 *    page yields 0, otherwise page_address() of the page is returned.
 *  - CPA_ARRAY: cpa->vaddr[] holds one address per entry.
 *  - otherwise: *cpa->vaddr is the start of a contiguous range and @idx is
 *    a page offset into it.
 */
static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx)
{
        struct page *pg;

        if (!(cpa->flags & CPA_PAGES_ARRAY)) {
                if (cpa->flags & CPA_ARRAY)
                        return cpa->vaddr[idx];

                return *cpa->vaddr + idx * PAGE_SIZE;
        }

        pg = cpa->pages[idx];
        if (unlikely(PageHighMem(pg)))
                return 0;

        return (unsigned long)page_address(pg);
}

/*
 * Flushing functions
 */

/*
 * Issue clflushopt for every cache line that intersects
 * [@vaddr, @vaddr + @size). The start is rounded down to a multiple of
 * boot_cpu_data.x86_clflush_size. No fencing is done here; a zero @size on
 * an aligned @vaddr flushes nothing.
 */
static void clflush_cache_range_opt(void *vaddr, unsigned int size)
{
        const unsigned long line = boot_cpu_data.x86_clflush_size;
        void *cur = (void *)((unsigned long)vaddr & ~(line - 1));
        void *limit = vaddr + size;

        while (cur < limit) {
                clflushopt(cur);
                cur += line;
        }
}

/**
 * clflush_cache_range - flush a cache range with clflush
 * @vaddr:        virtual start address
 * @size:        number of bytes to flush
 *
 * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or
 * SFENCE to avoid ordering issues.
 */
void clflush_cache_range(void *vaddr, unsigned int size)
{
        /* Fence on both sides of the unordered flush loop, see above. */
        mb();
        clflush_cache_range_opt(vaddr, size);
        mb();
}
EXPORT_SYMBOL_GPL(clflush_cache_range);

#ifdef CONFIG_ARCH_HAS_PMEM_API
/*
 * CONFIG_ARCH_HAS_PMEM_API hook: implemented as a fenced cache line flush of
 * [@addr, @addr + @size) via clflush_cache_range().
 * NOTE(review): clflush_cache_range() takes an unsigned int size, so a
 * size_t @size is narrowed at the call - confirm callers stay below 4G.
 */
void arch_invalidate_pmem(void *addr, size_t size)
{
        clflush_cache_range(addr, size);
}
EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
#endif

#ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
/*
 * Report whether cpu_cache_invalidate_memregion() may be used: true unless
 * X86_FEATURE_HYPERVISOR is enabled.
 */
bool cpu_cache_has_invalidate_memregion(void)
{
        return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR);
}
EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, DEVMEM);

/*
 * Write back and invalidate the caches of all CPUs.
 *
 * @res_desc is not used by this implementation. Returns 0 on success, or
 * -ENXIO (after a one-time warning) when
 * cpu_cache_has_invalidate_memregion() says the operation is unavailable.
 */
int cpu_cache_invalidate_memregion(int res_desc)
{
        if (!WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) {
                wbinvd_on_all_cpus();
                return 0;
        }

        return -ENXIO;
}
EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, DEVMEM);
#endif

/*
 * Per-CPU callback of cpa_flush_all(). @arg carries the "cache" flag as an
 * integer smuggled through the pointer argument.
 */
static void __cpa_flush_all(void *arg)
{
        const unsigned long want_cache_flush = (unsigned long)arg;

        /*
         * Flush all to work around Errata in early athlons regarding
         * large page flushing.
         */
        __flush_tlb_all();

        if (!want_cache_flush)
                return;

        /* wbinvd is only issued for CPU family 4 and later. */
        if (boot_cpu_data.x86 < 4)
                return;

        wbinvd();
}

/*
 * Run __cpa_flush_all() on every CPU and wait for completion. @cache is
 * forwarded through the callback's pointer argument; non-zero additionally
 * requests a cache write-back/invalidate.
 *
 * Must not be called with interrupts disabled, except during early boot.
 */
static void cpa_flush_all(unsigned long cache)
{
        BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);

        on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}

/*
 * Per-CPU callback of cpa_flush(): flush the kernel TLB entry of every page
 * in the request @data (a struct cpa_data), one address at a time. Each
 * address is passed through fix_addr() first.
 */
static void __cpa_flush_tlb(void *data)
{
        struct cpa_data *cpa = data;
        unsigned int idx = 0;

        while (idx < cpa->numpages) {
                unsigned long va = fix_addr(__cpa_addr(cpa, idx));

                flush_tlb_one_kernel(va);
                idx++;
        }
}

/*
 * Flush TLBs, and optionally caches, for the pages of the request @data.
 *
 * @cache: non-zero when the cache lines of the affected pages must be
 *         flushed in addition to the TLB.
 *
 * Without CLFLUSH support a cache flush falls back to cpa_flush_all().
 * Otherwise the TLB is flushed either completely or page by page, and the
 * cache lines of each present page are flushed with clflushopt between two
 * memory barriers.
 *
 * Must not be called with interrupts disabled, except during early boot.
 */
static void cpa_flush(struct cpa_data *data, int cache)
{
        struct cpa_data *cpa = data;
        unsigned int idx;

        BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);

        if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
                cpa_flush_all(cache);
                return;
        }

        /* Small requests are flushed per page, everything else globally. */
        if (!cpa->force_flush_all && cpa->numpages <= tlb_single_page_flush_ceiling)
                on_each_cpu(__cpa_flush_tlb, cpa, 1);
        else
                flush_tlb_all();

        if (!cache)
                return;

        mb();
        for (idx = 0; idx < cpa->numpages; idx++) {
                unsigned long va = __cpa_addr(cpa, idx);
                unsigned int lvl;
                pte_t *ptep = lookup_address(va, &lvl);

                /* Only flush present addresses. */
                if (!ptep || !(pte_val(*ptep) & _PAGE_PRESENT))
                        continue;

                clflush_cache_range_opt((void *)fix_addr(va), PAGE_SIZE);
        }
        mb();
}

/*
 * Report whether the inclusive ranges [@r1_start, @r1_end] and
 * [@r2_start, @r2_end] share at least one value. Two closed ranges
 * intersect exactly when each one starts no later than the other one ends.
 */
static bool overlaps(unsigned long r1_start, unsigned long r1_end,
                     unsigned long r2_start, unsigned long r2_end)
{
        if (r1_start > r2_end)
                return false;

        return r2_start <= r1_end;
}

#ifdef CONFIG_PCI_BIOS
/*
 * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
 * based config access (CONFIG_PCI_GOBIOS) support.
 *
 * BIOS_PFN is the first PFN of that area and BIOS_PFN_END the last one
 * (inclusive, hence BIOS_END - 1), matching the inclusive ranges that
 * overlaps() expects.
 */
#define BIOS_PFN        PFN_DOWN(BIOS_BEGIN)
#define BIOS_PFN_END        PFN_DOWN(BIOS_END - 1)

/*
 * Return the protection bits that must not be set for the inclusive PFN
 * range [@spfn, @epfn]: _PAGE_NX when PCI BIOS access is enabled and the
 * range touches the BIOS area, 0 otherwise.
 */
static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
{
        if (!pcibios_enabled)
                return 0;

        return overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END) ? _PAGE_NX : 0;
}
#else
/* Without CONFIG_PCI_BIOS no PFN range needs BIOS protection: forbid nothing. */
static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
{
        return 0;
}
#endif

/*
 * The .rodata section needs to be read-only. Using the pfn catches all
 * aliases.  This also includes __ro_after_init, so do not enforce until
 * kernel_set_to_readonly is true.
 *
 * Returns _PAGE_RW as the forbidden bit when the inclusive PFN range
 * [@spfn, @epfn] touches .rodata and enforcement is active, 0 otherwise.
 */
static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn)
{
        const unsigned long first_ro = PFN_DOWN(__pa_symbol(__start_rodata));
        /*
         * __end_rodata is page aligned and exclusive, so subtract 1 to get
         * the last enforced PFN in the rodata area.
         */
        const unsigned long last_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;

        if (!kernel_set_to_readonly)
                return 0;

        return overlaps(spfn, epfn, first_ro, last_ro) ? _PAGE_RW : 0;
}

/*
 * Protect kernel text against becoming non executable by forbidding
 * _PAGE_NX.  This protects only the high kernel mapping (_text -> _etext)
 * out of which the kernel actually executes.  Do not protect the low
 * mapping.
 *
 * This does not cover __inittext since that is gone after boot.
 *
 * @start and @end are inclusive virtual addresses. Returns _PAGE_NX when the
 * range touches [_text, _etext - 1], 0 otherwise.
 */
static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
{
        const unsigned long text_first = (unsigned long)_text;
        const unsigned long text_last = (unsigned long)_etext - 1;

        return overlaps(start, end, text_first, text_last) ? _PAGE_NX : 0;
}

#if defined(CONFIG_X86_64)
/*
 * Once the kernel maps the text as RO (kernel_set_to_readonly is set), the
 * kernel text mappings for the large page aligned text and rodata sections
 * are always read-only. The kernel identity mappings covering the holes
 * caused by this alignment can be whatever the user asks for.
 *
 * This preserves the large page mappings for kernel text/data at no extra
 * cost.
 *
 * @start and @end are inclusive virtual addresses. Returns _PAGE_RW as the
 * forbidden bit, or 0 when nothing has to be enforced.
 */
static pgprotval_t protect_kernel_text_ro(unsigned long start,
                                          unsigned long end)
{
        const unsigned long first = (unsigned long)_text;
        const unsigned long last = (unsigned long)__end_rodata_hpage_align - 1;
        unsigned int level;

        if (!kernel_set_to_readonly)
                return 0;

        if (!overlaps(start, end, first, last))
                return 0;

        /*
         * Don't enforce the !RW mapping for the kernel text mapping, if
         * the current mapping is already using small page mapping.  No
         * need to work hard to preserve large page mappings in this case.
         *
         * This also fixes the Linux Xen paravirt guest boot failure caused
         * by unexpected read-only mappings for kernel identity
         * mappings. In this paravirt guest case, the kernel text mapping
         * and the kernel identity mapping share the same page-table pages,
         * so the protections for kernel text and identity mappings have to
         * be the same.
         */
        if (!lookup_address(start, &level))
                return 0;

        return level == PG_LEVEL_4K ? 0 : _PAGE_RW;
}
#else
/* Builds without CONFIG_X86_64 enforce no read-only text rule: forbid nothing. */
static pgprotval_t protect_kernel_text_ro(unsigned long start,
                                          unsigned long end)
{
        return 0;
}
#endif

/*
 * True when @prot has at least one of the bits in @val set, i.e. when
 * clearing @val from @prot would change it.
 */
static inline bool conflicts(pgprot_t prot, pgprotval_t val)
{
        return (pgprot_val(prot) & val) != 0;
}

/*
 * Emit a warning when the requested protection @prot contains bits from the
 * forbidden set @val.
 *
 * @warnlvl:    CPA_CONFLICT, CPA_PROTECT or CPA_DETECT; nothing is printed
 *              when it is above cpa_warn_level
 * @start/@end: virtual range being changed (only printed)
 * @pfn:        first PFN of the range (only printed)
 * @txt:        short name of the rule that was hit
 */
static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val,
                                  unsigned long start, unsigned long end,
                                  unsigned long pfn, const char *txt)
{
        static const char *level_names[] = {
                [CPA_CONFLICT]  = "conflict",
                [CPA_PROTECT]   = "protect",
                [CPA_DETECT]    = "detect",
        };

        if (warnlvl > cpa_warn_level)
                return;

        if (!conflicts(prot, val))
                return;

        pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n",
                level_names[warnlvl], txt, start, end, pfn,
                (unsigned long long)pgprot_val(prot),
                (unsigned long long)val);
}

/*
 * Certain areas of memory on x86 require very specific protection flags,
 * for example the BIOS area or kernel text. Callers don't always get this
 * right (again, ioremap() on BIOS memory is not uncommon) so this function
 * checks and fixes these known static required protection bits.
 *
 * @prot:    requested protection
 * @start:   first virtual address of the range
 * @pfn:     first PFN of the range
 * @npg:     number of 4K pages in the range
 * @lpsize:  size of the large page being examined, or 0 to always apply the
 *           read-only text rule
 * @warnlvl: level handed to check_conflict() for each rule
 *
 * Returns @prot with every forbidden bit cleared. A !PRESENT @prot is
 * returned unchanged.
 */
static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
                                          unsigned long pfn, unsigned long npg,
                                          unsigned long lpsize, int warnlvl)
{
        const unsigned long last_pfn = pfn + npg - 1;
        pgprotval_t deny, hit;
        unsigned long end;
        bool whole_lp;

        /*
         * There is no point in checking RW/NX conflicts when the requested
         * mapping is setting the page !PRESENT.
         */
        if (!(pgprot_val(prot) & _PAGE_PRESENT))
                return prot;

        /* The text rules operate on the (inclusive) virtual range. */
        end = start + npg * PAGE_SIZE - 1;

        hit = protect_kernel_text(start, end);
        check_conflict(warnlvl, prot, hit, start, end, pfn, "Text NX");
        deny = hit;

        /*
         * Special case to preserve a large page. If the change spans the
         * full large page mapping then there is no point to split it
         * up. Happens with ftrace and is going to be removed once ftrace
         * switched to text_poke().
         */
        whole_lp = lpsize == (npg * PAGE_SIZE) && !(start & (lpsize - 1));
        if (!whole_lp) {
                hit = protect_kernel_text_ro(start, end);
                check_conflict(warnlvl, prot, hit, start, end, pfn, "Text RO");
                deny |= hit;
        }

        /* The remaining rules are checked against the PFN range directly. */
        hit = protect_pci_bios(pfn, last_pfn);
        check_conflict(warnlvl, prot, hit, start, end, pfn, "PCIBIOS NX");
        deny |= hit;

        hit = protect_rodata(pfn, last_pfn);
        check_conflict(warnlvl, prot, hit, start, end, pfn, "Rodata RO");
        deny |= hit;

        return __pgprot(pgprot_val(prot) & ~deny);
}

/*
 * Validate strict W^X semantics.
 *
 * @old/@new: protection before and after the change
 * @start:    first virtual address of the range (only printed)
 * @pfn:      first PFN of the range (only printed)
 * @npg:      number of 4K pages in the range
 * @nx/@rw:   effective NX and RW state of the upper page table levels
 *
 * Warns once when the change would produce a writable and executable
 * mapping. Currently always returns @new.
 */
static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start,
                                  unsigned long pfn, unsigned long npg,
                                  bool nx, bool rw)
{
        const pgprotval_t old_val = pgprot_val(old);
        const pgprotval_t new_val = pgprot_val(new);
        unsigned long last;

        /*
         * 32-bit has some unfixable W+X issues, like EFI code
         * and writeable data being in the same page.  Disable
         * detection and enforcement there.
         */
        if (IS_ENABLED(CONFIG_X86_32))
                return new;

        /* Only verify when NX is supported: */
        if (!(__supported_pte_mask & _PAGE_NX))
                return new;

        /* Neither RW nor NX changes: nothing new to complain about. */
        if (((old_val ^ new_val) & (_PAGE_RW | _PAGE_NX)) == 0)
                return new;

        /* Only RW set together with NX clear is a violation. */
        if ((new_val & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW)
                return new;

        /* Non-leaf translation entries can disable writing or execution. */
        if (nx || !rw)
                return new;

        last = start + npg * PAGE_SIZE - 1;
        WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n",
                  (unsigned long long)old_val,
                  (unsigned long long)new_val,
                  start, last, pfn);

        /*
         * For now, allow all permission change attempts by returning the
         * attempted permissions.  This can 'return old' to actively
         * refuse the permission change at a later time.
         */
        return new;
}

/*
 * Lookup the page table entry for a virtual address in a specific pgd.
 * Return a pointer to the entry, the level of the mapping, and the effective
 * NX and RW bits of all page table levels.
 *
 * @pgd:     pgd entry covering @address
 * @address: virtual address to look up
 * @level:   out: PG_LEVEL_* of the returned entry (PG_LEVEL_NONE when the
 *           pgd entry is empty)
 * @nx:      out: true if any non-leaf level walked so far has _PAGE_NX set
 * @rw:      out: true only if every non-leaf level walked so far has
 *           _PAGE_RW set
 *
 * Returns NULL when an entry on the way down is none. An upper level entry
 * is returned (cast to pte_t *) when it is a leaf or not present. The flags
 * of the returned entry itself are not folded into @nx/@rw.
 */
pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
                                  unsigned int *level, bool *nx, bool *rw)
{
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        *level = PG_LEVEL_NONE;
        *nx = false;
        *rw = true;

        if (pgd_none(*pgd))
                return NULL;

        /*
         * The masked flag values must be normalized to 0/1 before they are
         * combined with the bool accumulators: _PAGE_RW is not bit 0, so
         * "*rw &= flags & _PAGE_RW" would AND 1 with the raw mask bit and
         * always yield false.
         */
        *nx |= !!(pgd_flags(*pgd) & _PAGE_NX);
        *rw &= !!(pgd_flags(*pgd) & _PAGE_RW);

        p4d = p4d_offset(pgd, address);
        if (p4d_none(*p4d))
                return NULL;

        *level = PG_LEVEL_512G;
        if (p4d_leaf(*p4d) || !p4d_present(*p4d))
                return (pte_t *)p4d;

        *nx |= !!(p4d_flags(*p4d) & _PAGE_NX);
        *rw &= !!(p4d_flags(*p4d) & _PAGE_RW);

        pud = pud_offset(p4d, address);
        if (pud_none(*pud))
                return NULL;

        *level = PG_LEVEL_1G;
        if (pud_leaf(*pud) || !pud_present(*pud))
                return (pte_t *)pud;

        *nx |= !!(pud_flags(*pud) & _PAGE_NX);
        *rw &= !!(pud_flags(*pud) & _PAGE_RW);

        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd))
                return NULL;

        *level = PG_LEVEL_2M;
        if (pmd_leaf(*pmd) || !pmd_present(*pmd))
                return (pte_t *)pmd;

        *nx |= !!(pmd_flags(*pmd) & _PAGE_NX);
        *rw &= !!(pmd_flags(*pmd) & _PAGE_RW);

        *level = PG_LEVEL_4K;

        return pte_offset_kernel(pmd, address);
}

/*
 * Lookup the page table entry for a virtual address in a specific pgd.
 * Return a pointer to the entry and the level of the mapping.
 *
 * Same walk as lookup_address_in_pgd_attr(); the NX/RW results are dropped.
 */
pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
                             unsigned int *level)
{
        bool nx_ignored;
        bool rw_ignored;

        return lookup_address_in_pgd_attr(pgd, address, level,
                                          &nx_ignored, &rw_ignored);
}

/*
 * Lookup the page table entry for a virtual address. Return a pointer
 * to the entry and the level of the mapping.
 *
 * The walk starts at the kernel pgd entry for @address (pgd_offset_k()).
 *
 * Note: We return pud and pmd either when the entry is marked large
 * or when the present bit is not set. Otherwise we would return a
 * pointer to a nonexisting mapping.
 */
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
        return lookup_address_in_pgd(pgd_offset_k(address), address, level);
}
EXPORT_SYMBOL_GPL(lookup_address);

/*
 * Page table lookup for a CPA request: walk the page table given in
 * cpa->pgd when one is set, the kernel page table otherwise. @level, @nx and
 * @rw are filled in by lookup_address_in_pgd_attr().
 */
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
                                  unsigned int *level, bool *nx, bool *rw)
{
        pgd_t *top = cpa->pgd ? cpa->pgd + pgd_index(address)
                              : pgd_offset_k(address);

        return lookup_address_in_pgd_attr(top, address, level, nx, rw);
}

/*
 * Lookup the PMD entry for a virtual address in the kernel page table.
 * Return a pointer to the entry, or NULL if an upper level entry is none,
 * a leaf, or not present. The PMD entry itself is not inspected.
 */
pmd_t *lookup_pmd_address(unsigned long address)
{
        pgd_t *pgd = pgd_offset_k(address);
        p4d_t *p4d;
        pud_t *pud;

        if (pgd_none(*pgd))
                return NULL;

        p4d = p4d_offset(pgd, address);
        if (p4d_none(*p4d))
                return NULL;
        if (p4d_leaf(*p4d))
                return NULL;
        if (!p4d_present(*p4d))
                return NULL;

        pud = pud_offset(p4d, address);
        if (pud_none(*pud))
                return NULL;
        if (pud_leaf(*pud))
                return NULL;
        if (!pud_present(*pud))
                return NULL;

        return pmd_offset(pud, address);
}

/*
 * This is necessary because __pa() does not work on some
 * kinds of memory, like vmalloc() or the alloc_remap()
 * areas on 32-bit NUMA systems.  The percpu areas can
 * end up in this kind of memory, for instance.
 *
 * Note that as long as the PTEs are well-formed with correct PFNs, this
 * works without checking the PRESENT bit in the leaf PTE.  This is unlike
 * the similar vmalloc_to_page() and derivatives.  Callers may depend on
 * this behavior.
 *
 * This could be optimized, but it is only used in paths that are not perf
 * sensitive, and keeping it unoptimized should increase the testing coverage
 * for the more obscure platforms.
 */
phys_addr_t slow_virt_to_phys(void *__virt_addr)
{
        unsigned long virt_addr = (unsigned long)__virt_addr;
        phys_addr_t phys_addr;
        unsigned long offset;
        enum pg_level level;
        pte_t *pte;

        pte = lookup_address(virt_addr, &level);
        BUG_ON(!pte);

        /*
         * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t
         * before being left-shifted PAGE_SHIFT bits -- this trick is to
         * make 32-PAE kernel work correctly.
         */
        switch (level) {
        case PG_LEVEL_1G:
                phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
                offset = virt_addr & ~PUD_MASK;
                break;
        case PG_LEVEL_2M:
                phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
                offset = virt_addr & ~PMD_MASK;
                break;
        default:
                phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
                offset = virt_addr & ~PAGE_MASK;
        }

        return (phys_addr_t)(phys_addr | offset);
}
EXPORT_SYMBOL_GPL(slow_virt_to_phys);

/*
 * Set the new pmd in all the pgds we know about:
 *
 * @kpte:    entry in the reference page table to overwrite
 * @address: virtual address covered by the entry
 * @pte:     new value
 *
 * The entry is written with set_pte_atomic(). On 32-bit builds without a
 * shared kernel PMD the same value is also written into the PMD slot for
 * @address in every page table on pgd_list.
 */
static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
{
        /* change init_mm */
        set_pte_atomic(kpte, pte);
#ifdef CONFIG_X86_32
        if (!SHARED_KERNEL_PMD) {
                struct page *page;

                /* Walk every pgd page on pgd_list and patch its copy. */
                list_for_each_entry(page, &pgd_list, lru) {
                        pgd_t *pgd;
                        p4d_t *p4d;
                        pud_t *pud;
                        pmd_t *pmd;

                        pgd = (pgd_t *)page_address(page) + pgd_index(address);
                        p4d = p4d_offset(pgd, address);
                        pud = pud_offset(p4d, address);
                        pmd = pmd_offset(pud, address);
                        set_pte_atomic((pte_t *)pmd, pte);
                }
        }
#endif
}

/*
 * Return @prot with _PAGE_GLOBAL cleared when _PAGE_PRESENT is not set;
 * present protections are returned unchanged.
 */
static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
{
        if (pgprot_val(prot) & _PAGE_PRESENT)
                return prot;

        /*
         * _PAGE_GLOBAL means "global page" for present PTEs.
         * But, it is also used to indicate _PAGE_PROTNONE
         * for non-present PTEs.
         *
         * Dropping the bit here ensures that a _PAGE_GLOBAL PTE going
         * from present to non-present is not confused as _PAGE_PROTNONE.
         */
        pgprot_val(prot) &= ~_PAGE_GLOBAL;

        return prot;
}

/*
 * Decide whether the large mapping at @kpte, which covers @address, has to
 * be split in order to apply the attribute change described by @cpa.
 *
 * Returns:
 *   1       - the caller should go on to split: either the entry found for
 *             @address is no longer @kpte, the existing mapping violates the
 *             static protections, the request covers only part of the large
 *             page, or the requested protection conflicts with the static
 *             protections
 *   0       - no split needed: the protection is already as requested, or
 *             the large entry was updated in place
 *   -EINVAL - the entry is neither a 2M nor a 1G mapping
 *
 * Side effects on @cpa: numpages may be clamped to the part of the request
 * inside this large page, pfn is set, and force_static_prot or CPA_FLUSHTLB
 * may be set.
 *
 * The caller visible in this file, should_split_large_page(), holds pgd_lock
 * around this function.
 */
static int __should_split_large_page(pte_t *kpte, unsigned long address,
                                     struct cpa_data *cpa)
{
        unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
        pgprot_t old_prot, new_prot, req_prot, chk_prot;
        pte_t new_pte, *tmp;
        enum pg_level level;
        bool nx, rw;

        /*
         * Check for races, another CPU might have split this page
         * up already:
         */
        tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
        if (tmp != kpte)
                return 1;

        /* Fetch the current protection and base PFN of the large entry. */
        switch (level) {
        case PG_LEVEL_2M:
                old_prot = pmd_pgprot(*(pmd_t *)kpte);
                old_pfn = pmd_pfn(*(pmd_t *)kpte);
                cpa_inc_2m_checked();
                break;
        case PG_LEVEL_1G:
                old_prot = pud_pgprot(*(pud_t *)kpte);
                old_pfn = pud_pfn(*(pud_t *)kpte);
                cpa_inc_1g_checked();
                break;
        default:
                return -EINVAL;
        }

        psize = page_level_size(level);
        pmask = page_level_mask(level);

        /*
         * Calculate the number of pages, which fit into this large
         * page starting at address:
         */
        lpaddr = (address + psize) & pmask;
        numpages = (lpaddr - address) >> PAGE_SHIFT;
        if (numpages < cpa->numpages)
                cpa->numpages = numpages;

        /*
         * We are safe now. Check whether the new pgprot is the same:
         * Convert protection attributes to 4k-format, as cpa->mask* are set
         * up accordingly.
         */

        /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */
        req_prot = pgprot_large_2_4k(old_prot);

        pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
        pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);

        /*
         * req_prot is in format of 4k pages. It must be converted to large
         * page format: the caching mode includes the PAT bit located at
         * different bit positions in the two formats.
         */
        req_prot = pgprot_4k_2_large(req_prot);
        req_prot = pgprot_clear_protnone_bits(req_prot);
        if (pgprot_val(req_prot) & _PAGE_PRESENT)
                pgprot_val(req_prot) |= _PAGE_PSE;

        /*
         * old_pfn points to the large page base pfn. So we need to add the
         * offset of the virtual address:
         */
        pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
        cpa->pfn = pfn;

        /*
         * Calculate the large page base address and the number of 4K pages
         * in the large page
         */
        lpaddr = address & pmask;
        numpages = psize >> PAGE_SHIFT;

        /*
         * Sanity check that the existing mapping is correct versus the static
         * protections. static_protections() guards against !PRESENT, so no
         * extra conditional required here.
         */
        chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
                                      psize, CPA_CONFLICT);

        if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) {
                /*
                 * Split the large page and tell the split code to
                 * enforce static protections.
                 */
                cpa->force_static_prot = 1;
                return 1;
        }

        /*
         * Optimization: If the requested pgprot is the same as the current
         * pgprot, then the large page can be preserved and no updates are
         * required independent of alignment and length of the requested
         * range. The above already established that the current pgprot is
         * correct, which in consequence makes the requested pgprot correct
         * as well if it is the same. The static protection scan below will
         * not come to a different conclusion.
         */
        if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
                cpa_inc_lp_sameprot(level);
                return 0;
        }

        /*
         * If the requested range does not cover the full page, split it up
         */
        if (address != lpaddr || cpa->numpages != numpages)
                return 1;

        /*
         * Check whether the requested pgprot is conflicting with a static
         * protection requirement in the large page.
         */
        new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
                                      psize, CPA_DETECT);

        /* nx/rw come from the upper page table levels walked above. */
        new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages,
                              nx, rw);

        /*
         * If there is a conflict, split the large page.
         *
         * There used to be a 4k wise evaluation trying really hard to
         * preserve the large pages, but experimentation has shown, that this
         * does not help at all. There might be corner cases which would
         * preserve one large page occasionally, but it's really not worth the
         * extra code and cycles for the common case.
         */
        if (pgprot_val(req_prot) != pgprot_val(new_prot))
                return 1;

        /* All checks passed. Update the large page mapping. */
        new_pte = pfn_pte(old_pfn, new_prot);
        __set_pmd_pte(kpte, address, new_pte);
        cpa->flags |= CPA_FLUSHTLB;
        cpa_inc_lp_preserved(level);
        return 0;
}

/*
 * Locked wrapper around __should_split_large_page().
 *
 * Returns 1 straight away when cpa->force_split is set; otherwise the
 * result of __should_split_large_page(), evaluated under pgd_lock.
 */
static int should_split_large_page(pte_t *kpte, unsigned long address,
                                   struct cpa_data *cpa)
{
        int ret = 1;

        if (!cpa->force_split) {
                spin_lock(&pgd_lock);
                ret = __should_split_large_page(kpte, address, cpa);
                spin_unlock(&pgd_lock);
        }

        return ret;
}

/*
 * Write one entry of a freshly split page table.
 *
 * @cpa:      request being processed; only force_static_prot is consulted
 * @pte:      slot to fill
 * @pfn:      PFN the entry maps
 * @ref_prot: protection inherited from the large mapping
 * @address:  virtual address the entry maps
 * @size:     bytes covered by the entry (PAGE_SIZE for a PMD split)
 */
static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
                          pgprot_t ref_prot, unsigned long address,
                          unsigned long size)
{
        unsigned int npg = PFN_DOWN(size);

        /*
         * If should_split_large_page() discovered an inconsistent mapping,
         * remove the invalid protection in the split mapping.
         */
        if (cpa->force_static_prot) {
                /* Hand in lpsize = 0 to enforce the protection mechanism */
                pgprot_t fixed = static_protections(ref_prot, address, pfn,
                                                    npg, 0, CPA_PROTECT);

                if (pgprot_val(fixed) != pgprot_val(ref_prot)) {
                        /*
                         * If this is splitting a PMD, fix it up. PUD splits
                         * cannot be fixed trivially as that would require to
                         * rescan the newly installed PMD mappings after
                         * returning from split_large_page() so an eventual
                         * further split can allocate the necessary PTE
                         * pages. Warn for now and revisit it in case this
                         * actually happens.
                         */
                        if (size == PAGE_SIZE)
                                ref_prot = fixed;
                        else
                                pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
                }
        }

        set_pte(pte, pfn_pte(pfn, ref_prot));
}

/*
 * Replace the large mapping at @kpte with a lower level page table built in
 * the page @base.
 *
 * @cpa:     request being processed
 * @kpte:    the large (2M or 1G) entry covering @address
 * @address: virtual address inside the large mapping
 * @base:    freshly allocated page that becomes the new page table
 *
 * Returns 0 when the new table was installed. Returns 1 when nothing was
 * installed because the entry for @address is no longer @kpte or its level
 * is neither 2M nor 1G; split_large_page() frees @base in that case.
 *
 * pgd_lock is taken and released inside this function.
 */
static int
__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
                   struct page *base)
{
        unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1;
        pte_t *pbase = (pte_t *)page_address(base);
        unsigned int i, level;
        pgprot_t ref_prot;
        bool nx, rw;
        pte_t *tmp;

        spin_lock(&pgd_lock);
        /*
         * Check for races, another CPU might have split this page
         * up for us already:
         */
        tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
        if (tmp != kpte) {
                spin_unlock(&pgd_lock);
                return 1;
        }

        paravirt_alloc_pte(&init_mm, page_to_pfn(base));

        /*
         * Work out the reference protection, the base PFN and the stride of
         * the entries that will populate the new table.
         */
        switch (level) {
        case PG_LEVEL_2M:
                ref_prot = pmd_pgprot(*(pmd_t *)kpte);
                /*
                 * Clear PSE (aka _PAGE_PAT) and move
                 * PAT bit to correct position.
                 */
                ref_prot = pgprot_large_2_4k(ref_prot);
                ref_pfn = pmd_pfn(*(pmd_t *)kpte);
                lpaddr = address & PMD_MASK;
                lpinc = PAGE_SIZE;
                break;

        case PG_LEVEL_1G:
                ref_prot = pud_pgprot(*(pud_t *)kpte);
                ref_pfn = pud_pfn(*(pud_t *)kpte);
                pfninc = PMD_SIZE >> PAGE_SHIFT;
                lpaddr = address & PUD_MASK;
                lpinc = PMD_SIZE;
                /*
                 * Clear the PSE flags if the PRESENT flag is not set
                 * otherwise pmd_present/pmd_huge will return true
                 * even on a non present pmd.
                 */
                if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
                        pgprot_val(ref_prot) &= ~_PAGE_PSE;
                break;

        default:
                spin_unlock(&pgd_lock);
                return 1;
        }

        ref_prot = pgprot_clear_protnone_bits(ref_prot);

        /*
         * Get the target pfn from the original entry:
         */
        pfn = ref_pfn;
        /* Fill every slot of the new table, advancing PFN and address in step. */
        for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
                split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);

        if (virt_addr_valid(address)) {
                /* NOTE: this block-local pfn shadows the loop cursor above. */
                unsigned long pfn = PFN_DOWN(__pa(address));

                if (pfn_range_is_mapped(pfn, pfn + 1))
                        split_page_count(level);
        }

        /*
         * Install the new, split up pagetable.
         *
         * We use the standard kernel pagetable protections for the new
         * pagetable protections, the actual ptes set above control the
         * primary protection behavior:
         */
        __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));

        /*
         * Do a global flush tlb after splitting the large page
         * and before we do the actual change page attribute in the PTE.
         *
         * Without this, we violate the TLB application note, that says:
         * "The TLBs may contain both ordinary and large-page
         *  translations for a 4-KByte range of linear addresses. This
         *  may occur if software modifies the paging structures so that
         *  the page size used for the address range changes. If the two
         *  translations differ with respect to page frame or attributes
         *  (e.g., permissions), processor behavior is undefined and may
         *  be implementation-specific."
         *
         * We do this global tlb flush inside the cpa_lock, so that we
         * don't allow any other cpu, with stale tlb entries change the
         * page attribute in parallel, that also falls into the
         * just split large page entry.
         */
        flush_tlb_all();
        spin_unlock(&pgd_lock);

        return 0;
}

/*
 * Allocate a page for the new page table and hand it to
 * __split_large_page() to split the large mapping behind @kpte.
 *
 * cpa_lock is released for the duration of the allocation and re-taken
 * afterwards, but only when debug_pagealloc is disabled (the same condition
 * under which __change_page_attr_set_clr() takes the lock).
 *
 * Returns -ENOMEM when no page could be allocated, 0 otherwise. A non-zero
 * return from __split_large_page() only causes the unused page to be freed;
 * it is still reported as 0, so the caller repeats its lookup.
 */
static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
                            unsigned long address)
{
        struct page *new_table;
        int unused;

        if (!debug_pagealloc_enabled())
                spin_unlock(&cpa_lock);

        new_table = alloc_pages(GFP_KERNEL, 0);

        if (!debug_pagealloc_enabled())
                spin_lock(&cpa_lock);

        if (!new_table)
                return -ENOMEM;

        /* Non-zero: the page was not installed, so give it back. */
        unused = __split_large_page(cpa, kpte, address, new_table);
        if (unused)
                __free_page(new_table);

        return 0;
}

/*
 * Free the page holding the PTE table @pte, but only if every one of its
 * PTRS_PER_PTE entries is none.
 *
 * Returns true when the page was freed, false (and nothing is freed) as soon
 * as a populated entry is found.
 */
static bool try_to_free_pte_page(pte_t *pte)
{
        int idx = 0;

        while (idx < PTRS_PER_PTE) {
                if (!pte_none(pte[idx]))
                        return false;
                idx++;
        }

        free_page((unsigned long)pte);
        return true;
}

/*
 * Free the page holding the PMD table @pmd, but only if every one of its
 * PTRS_PER_PMD entries is none.
 *
 * Returns true when the page was freed, false (and nothing is freed) as soon
 * as a populated entry is found.
 */
static bool try_to_free_pmd_page(pmd_t *pmd)
{
        int idx = 0;

        while (idx < PTRS_PER_PMD) {
                if (!pmd_none(pmd[idx]))
                        return false;
                idx++;
        }

        free_page((unsigned long)pmd);
        return true;
}

/*
 * Zero the PTEs that cover [start, end) below @pmd, one entry per PAGE_SIZE
 * step. Afterwards, if the PTE page referenced by @pmd has become completely
 * empty, it is freed and @pmd is cleared.
 *
 * Returns true when the PTE page was freed, false otherwise.
 */
static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
{
        pte_t *slot = pte_offset_kernel(pmd, start);
        unsigned long va;

        for (va = start; va < end; va += PAGE_SIZE, slot++)
                set_pte(slot, __pte(0));

        if (!try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd)))
                return false;

        pmd_clear(pmd);
        return true;
}

/*
 * Unmap the 4K range [start, end) below @pmd. If that released the PTE page,
 * also try to release the PMD page referenced by @pud, clearing @pud when
 * the PMD page was freed.
 */
static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
                              unsigned long start, unsigned long end)
{
        /* PTE page still in use: the PMD page cannot be empty either. */
        if (!unmap_pte_range(pmd, start, end))
                return;

        if (try_to_free_pmd_page(pud_pgtable(*pud)))
                pud_clear(pud);
}

/*
 * Tear down the mappings for [start, end) below @pud in three steps:
 *   1. an unaligned head, up to the next 2M boundary, via 4K PTEs;
 *   2. whole 2M chunks, either by clearing a leaf PMD or by unmapping the
 *      PTE range beneath it;
 *   3. a trailing remainder smaller than 2M, again via 4K PTEs.
 *
 * When there is no trailing remainder, one more attempt is made to free the
 * PMD page if @pud is still populated.
 */
static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
{
        pmd_t *pmd = pmd_offset(pud, start);

        /* Head: not on a 2MB page boundary? */
        if (start & (PMD_SIZE - 1)) {
                unsigned long boundary = (start + PMD_SIZE) & PMD_MASK;
                unsigned long head_end = min_t(unsigned long, end, boundary);

                __unmap_pmd_range(pud, pmd, start, head_end);

                start = head_end;
                pmd++;
        }

        /* Body: try to unmap in 2M chunks. */
        for (; end - start >= PMD_SIZE; start += PMD_SIZE, pmd++) {
                if (pmd_leaf(*pmd))
                        pmd_clear(pmd);
                else
                        __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
        }

        /* Tail: 4K leftovers? They already try to free the PMD page. */
        if (start < end) {
                __unmap_pmd_range(pud, pmd, start, end);
                return;
        }

        /* Try again to free the PMD page if that has not succeeded above. */
        if (pud_none(*pud))
                return;

        if (try_to_free_pmd_page(pud_pgtable(*pud)))
                pud_clear(pud);
}

/*
 * Tear down the mappings for [start, end) below @p4d in three steps:
 *   1. an unaligned head, up to the next 1G boundary, via unmap_pmd_range();
 *   2. whole 1G chunks, either by clearing a leaf PUD or by unmapping the
 *      PMD range beneath it;
 *   3. a trailing remainder smaller than 1G, via unmap_pmd_range().
 *
 * The PUD page itself is never freed here; populate_pgd()'s error path,
 * which calls this function, leaves the PUD page in place.
 */
static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
{
        pud_t *pud = pud_offset(p4d, start);

        /* Head: not on a GB page boundary? */
        if (start & (PUD_SIZE - 1)) {
                unsigned long boundary = (start + PUD_SIZE) & PUD_MASK;
                unsigned long head_end = min_t(unsigned long, end, boundary);

                unmap_pmd_range(pud, start, head_end);

                start = head_end;
                pud++;
        }

        /* Body: try to unmap in 1G chunks. */
        for (; end - start >= PUD_SIZE; start += PUD_SIZE, pud++) {
                if (pud_leaf(*pud))
                        pud_clear(pud);
                else
                        unmap_pmd_range(pud, start, start + PUD_SIZE);
        }

        /* Tail: 2M leftovers? */
        if (start < end)
                unmap_pmd_range(pud, start, end);
}

/*
 * Allocate a zeroed page and install it in @pmd as a page table, using the
 * _KERNPG_TABLE bits.
 *
 * Returns 0 on success, -1 when the allocation failed (@pmd is untouched).
 */
static int alloc_pte_page(pmd_t *pmd)
{
        pte_t *table = (pte_t *)get_zeroed_page(GFP_KERNEL);

        if (table) {
                set_pmd(pmd, __pmd(__pa(table) | _KERNPG_TABLE));
                return 0;
        }

        return -1;
}

/*
 * Allocate a zeroed page and install it in @pud as a page table, using the
 * _KERNPG_TABLE bits.
 *
 * Returns 0 on success, -1 when the allocation failed (@pud is untouched).
 */
static int alloc_pmd_page(pud_t *pud)
{
        pmd_t *table = (pmd_t *)get_zeroed_page(GFP_KERNEL);

        if (table) {
                set_pud(pud, __pud(__pa(table) | _KERNPG_TABLE));
                return 0;
        }

        return -1;
}

/*
 * Fill in 4K PTEs below @pmd, beginning at @start. At most @num_pages
 * entries are written and writing stops once @end is reached.
 *
 * Each entry maps cpa->pfn with @pgprot (after pgprot_clear_protnone_bits()),
 * and cpa->pfn is advanced by one for every entry written.
 */
static void populate_pte(struct cpa_data *cpa,
                         unsigned long start, unsigned long end,
                         unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
{
        pte_t *slot = pte_offset_kernel(pmd, start);
        unsigned long va;
        unsigned left;

        pgprot = pgprot_clear_protnone_bits(pgprot);

        for (va = start, left = num_pages; left && va < end;
             left--, va += PAGE_SIZE, slot++) {
                set_pte(slot, pfn_pte(cpa->pfn, pgprot));
                cpa->pfn++;
        }
}

/*
 * Populate the mappings for [start, end) below @pud, covering at most
 * @num_pages 4K pages and consuming physical frames from cpa->pfn:
 *   1. an unaligned head, up to the next 2M boundary, with 4K PTEs;
 *   2. whole 2M chunks with large PMD entries;
 *   3. a trailing remainder with 4K PTEs.
 *
 * Page-table pages are allocated on demand.
 *
 * Returns -1 when a page-table allocation failed. Otherwise returns the
 * number of head pages if the head already covered all of @num_pages, or
 * @num_pages.
 */
static long populate_pmd(struct cpa_data *cpa,
                         unsigned long start, unsigned long end,
                         unsigned num_pages, pud_t *pud, pgprot_t pgprot)
{
        long done = 0;
        pmd_t *pmd;
        pgprot_t large_prot;

        /* Head: not on a 2M boundary? */
        if (start & (PMD_SIZE - 1)) {
                unsigned long boundary = (start + PMD_SIZE) & PMD_MASK;
                unsigned long head_end = start + (num_pages << PAGE_SHIFT);

                head_end = min_t(unsigned long, head_end, boundary);
                done = (head_end - start) >> PAGE_SHIFT;
                done = min_t(unsigned int, num_pages, done);

                /* Need a PTE page? */
                pmd = pmd_offset(pud, start);
                if (pmd_none(*pmd) && alloc_pte_page(pmd))
                        return -1;

                populate_pte(cpa, start, head_end, done, pmd, pgprot);

                start = head_end;
        }

        /* We mapped them all? */
        if (num_pages == done)
                return done;

        large_prot = pgprot_4k_2_large(pgprot);

        /* Body: one large PMD entry per full 2M chunk. */
        for (; end - start >= PMD_SIZE; start += PMD_SIZE) {
                /* We cannot use a 1G page so allocate a PMD page if needed. */
                if (pud_none(*pud) && alloc_pmd_page(pud))
                        return -1;

                pmd = pmd_offset(pud, start);

                set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
                                                canon_pgprot(large_prot))));

                cpa->pfn += PMD_SIZE >> PAGE_SHIFT;
                done += PMD_SIZE >> PAGE_SHIFT;
        }

        /* Tail: map trailing 4K pages. */
        if (start < end) {
                pmd = pmd_offset(pud, start);
                if (pmd_none(*pmd) && alloc_pte_page(pmd))
                        return -1;

                populate_pte(cpa, start, end, num_pages - done, pmd, pgprot);
        }

        return num_pages;
}

/*
 * Populate the mappings for cpa->numpages pages starting at @start below
 * @p4d, consuming physical frames from cpa->pfn:
 *   1. an unaligned head, up to the next 1G boundary, via populate_pmd();
 *   2. whole 1G chunks with large PUD entries, only when the CPU has
 *      X86_FEATURE_GBPAGES;
 *   3. whatever is left, via populate_pmd().
 *
 * PMD pages are allocated on demand.
 *
 * Returns the number of pages accounted as mapped, or a negative value when
 * an allocation failed in the head or when a PMD page could not be allocated
 * (but see the NOTE on the trailing populate_pmd() call below).
 */
static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
                        pgprot_t pgprot)
{
        pud_t *pud;
        unsigned long end;
        long cur_pages = 0;
        pgprot_t pud_pgprot;

        end = start + (cpa->numpages << PAGE_SHIFT);

        /*
         * Not on a Gb page boundary? => map everything up to it with
         * smaller pages.
         */
        if (start & (PUD_SIZE - 1)) {
                unsigned long pre_end;
                unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;

                pre_end   = min_t(unsigned long, end, next_page);
                cur_pages = (pre_end - start) >> PAGE_SHIFT;
                cur_pages = min_t(int, (int)cpa->numpages, cur_pages);

                pud = pud_offset(p4d, start);

                /*
                 * Need a PMD page?
                 */
                if (pud_none(*pud))
                        if (alloc_pmd_page(pud))
                                return -1;

                /* cur_pages becomes what populate_pmd() reports as mapped. */
                cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
                                         pud, pgprot);
                if (cur_pages < 0)
                        return cur_pages;

                start = pre_end;
        }

        /* We mapped them all? */
        if (cpa->numpages == cur_pages)
                return cur_pages;

        pud = pud_offset(p4d, start);
        pud_pgprot = pgprot_4k_2_large(pgprot);

        /*
         * Map everything starting from the Gb boundary, possibly with 1G pages
         */
        while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
                set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
                                   canon_pgprot(pud_pgprot))));

                start          += PUD_SIZE;
                cpa->pfn  += PUD_SIZE >> PAGE_SHIFT;
                cur_pages += PUD_SIZE >> PAGE_SHIFT;
                pud++;
        }

        /* Map trailing leftover */
        if (start < end) {
                long tmp;

                pud = pud_offset(p4d, start);
                if (pud_none(*pud))
                        if (alloc_pmd_page(pud))
                                return -1;

                tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
                                   pud, pgprot);
                /*
                 * NOTE(review): unlike the other failure paths in this
                 * function, a failing trailing populate_pmd() returns the
                 * non-negative cur_pages, so populate_pgd() treats it as
                 * success and stores it in cpa->numpages. If cur_pages is 0
                 * here, the BUG_ON(!cpa->numpages) in
                 * __change_page_attr_set_clr() would presumably trigger -
                 * confirm whether this partial-success return is intended.
                 */
                if (tmp < 0)
                        return cur_pages;

                cur_pages += tmp;
        }
        return cur_pages;
}

/*
 * Build the mappings for cpa->numpages pages at @addr in the page table
 * rooted at cpa->pgd, allocating the P4D and PUD pages when the respective
 * entries are still empty. The protection starts from _KERNPG_TABLE, with
 * cpa->mask_clr removed and cpa->mask_set added.
 *
 * Restrictions for kernel page table do not necessarily apply when mapping
 * in an alternate PGD.
 *
 * Returns 0 on success, with cpa->numpages set to the page count reported
 * by populate_pud(). Returns -1 when a P4D/PUD page allocation failed, or
 * the negative populate_pud() result after unmapping the requested range.
 */
static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
{
        pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
        pgd_t *pgdp = cpa->pgd + pgd_index(addr);
        p4d_t *p4d;
        long mapped;

        if (pgd_none(*pgdp)) {
                p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
                if (!p4d)
                        return -1;

                set_pgd(pgdp, __pgd(__pa(p4d) | _KERNPG_TABLE));
        }

        /*
         * Allocate a PUD page and hand it down for mapping.
         */
        p4d = p4d_offset(pgdp, addr);
        if (p4d_none(*p4d)) {
                pud_t *pud = (pud_t *)get_zeroed_page(GFP_KERNEL);

                if (!pud)
                        return -1;

                set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
        }

        pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
        pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set);

        mapped = populate_pud(cpa, addr, p4d, pgprot);
        if (mapped >= 0) {
                cpa->numpages = mapped;
                return 0;
        }

        /*
         * Leave the PUD page in place in case some other CPU or thread
         * already found it, but remove any useless entries we just
         * added to it.
         */
        unmap_pud_range(p4d, addr, addr + (cpa->numpages << PAGE_SHIFT));
        return mapped;
}

/*
 * Handle a missing or empty page table entry found for @vaddr while
 * changing page attributes.
 *
 *  - With an alternate page table (cpa->pgd set) the mapping is created via
 *    populate_pgd().
 *  - Non-primary (alias) requests are skipped: one page is accounted as
 *    processed and 0 is returned.
 *  - A primary request inside the kernel identity mapping is skipped the
 *    same way, with cpa->pfn set from @vaddr.
 *  - Anything else yields -EFAULT; a warning is emitted unless cpa->pfn lies
 *    in the highmap.
 */
static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
                               int primary)
{
        /*
         * Right now, we only execute this code path when mapping
         * the EFI virtual memory map regions, no other users
         * provide a ->pgd value. This may change in the future.
         */
        if (cpa->pgd)
                return populate_pgd(cpa, vaddr);

        /*
         * Ignore all non primary paths.
         */
        if (!primary) {
                cpa->numpages = 1;
                return 0;
        }

        /*
         * Ignore the NULL PTE for kernel identity mapping, as it is expected
         * to have holes.
         * Also set numpages to '1' indicating that we processed cpa req for
         * one virtual address page and its pfn. TBD: numpages can be set based
         * on the initial value and the level returned by lookup_address().
         */
        if (within(vaddr, PAGE_OFFSET,
                   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
                cpa->numpages = 1;
                cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
                return 0;
        }

        /* Faults in the highmap are OK, so only warn outside of it: */
        if (!__cpa_pfn_in_highmap(cpa->pfn)) {
                WARN(1, KERN_WARNING "CPA: called for zero pte. "
                        "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
                        *cpa->vaddr);
        }

        return -EFAULT;
}

/*
 * Apply the cpa->mask_set / cpa->mask_clr change to the mapping of the
 * address selected by cpa->curpage.
 *
 * The entry is looked up with _lookup_address_cpa(); a missing or empty
 * entry is handed to __cpa_process_fault(). A 4K mapping is updated in
 * place. For a larger mapping should_split_large_page() decides: a result
 * <= 0 is returned as is, otherwise the large page is split and the lookup
 * is repeated.
 *
 * On the 4K path cpa->numpages is set to 1 and cpa->pfn to the pfn of the
 * existing PTE; CPA_FLUSHTLB is set in cpa->flags when the PTE changed.
 *
 * Returns 0 on success, otherwise the value produced by
 * __cpa_process_fault(), should_split_large_page() or split_large_page().
 */
static int __change_page_attr(struct cpa_data *cpa, int primary)
{
        unsigned long address;
        int do_split, err;
        unsigned int level;
        pte_t *kpte, old_pte;
        bool nx, rw;

        address = __cpa_addr(cpa, cpa->curpage);
repeat:
        /* Re-entered after a successful split so the new level is seen. */
        kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
        if (!kpte)
                return __cpa_process_fault(cpa, address, primary);

        old_pte = *kpte;
        if (pte_none(old_pte))
                return __cpa_process_fault(cpa, address, primary);

        if (level == PG_LEVEL_4K) {
                pte_t new_pte;
                pgprot_t old_prot = pte_pgprot(old_pte);
                pgprot_t new_prot = pte_pgprot(old_pte);
                unsigned long pfn = pte_pfn(old_pte);

                /* Start from the current protection and apply the masks. */
                pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
                pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);

                cpa_inc_4k_install();
                /* Hand in lpsize = 0 to enforce the protection mechanism */
                new_prot = static_protections(new_prot, address, pfn, 1, 0,
                                              CPA_PROTECT);

                new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1,
                                      nx, rw);

                new_prot = pgprot_clear_protnone_bits(new_prot);

                /*
                 * We need to keep the pfn from the existing PTE,
                 * after all we're only going to change its attributes
                 * not the memory it points to
                 */
                new_pte = pfn_pte(pfn, new_prot);
                cpa->pfn = pfn;
                /*
                 * Do we really change anything ?
                 */
                if (pte_val(old_pte) != pte_val(new_pte)) {
                        set_pte_atomic(kpte, new_pte);
                        cpa->flags |= CPA_FLUSHTLB;
                }
                cpa->numpages = 1;
                return 0;
        }

        /*
         * Check, whether we can keep the large page intact
         * and just change the pte:
         */
        do_split = should_split_large_page(kpte, address, cpa);
        /*
         * When the range fits into the existing large page,
         * return. cpa->numpages and cpa->tlbflush have been updated in
         * try_large_page:
         */
        if (do_split <= 0)
                return do_split;

        /*
         * We have to split the large page:
         */
        err = split_large_page(cpa, kpte, address);
        if (!err)
                goto repeat;

        return err;
}

static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary);

/*
 * Check the directmap and "high kernel map" 'aliases'.
 *
 * After the primary change for the page at cpa->curpage, apply the same
 * attribute change (with _PAGE_NX removed from both masks when NX is
 * supported) to the other mappings of cpa->pfn:
 *   - the direct mapping, unless the primary address already lies in it;
 *   - on CONFIG_X86_64, the high kernel mapping, when the pfn is in the
 *     highmap and the primary address is outside [_text, _brk_end).
 *
 * Each alias is processed through a copy of @cpa with a single address and
 * the array flags cleared, as a non-primary request. cpa->force_flush_all
 * is set whenever an alias is processed.
 *
 * Returns 0, or the error from the direct mapping alias change. Errors from
 * the high mapping alias are ignored.
 */
static int cpa_process_alias(struct cpa_data *cpa)
{
        struct cpa_data alias_cpa;
        /* Direct-map virtual address of the pfn that was just processed. */
        unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
        unsigned long vaddr;
        int ret;

        /* Nothing to do when the pfn is not in a mapped pfn range. */
        if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
                return 0;

        /*
         * No need to redo, when the primary call touched the direct
         * mapping already:
         */
        vaddr = __cpa_addr(cpa, cpa->curpage);
        if (!(within(vaddr, PAGE_OFFSET,
                    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {

                /* Same request, but for the single direct-map address. */
                alias_cpa = *cpa;
                alias_cpa.vaddr = &laddr;
                alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
                alias_cpa.curpage = 0;

                /* Directmap always has NX set, do not modify. */
                if (__supported_pte_mask & _PAGE_NX) {
                        alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
                        alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
                }

                cpa->force_flush_all = 1;

                ret = __change_page_attr_set_clr(&alias_cpa, 0);
                if (ret)
                        return ret;
        }

#ifdef CONFIG_X86_64
        /*
         * If the primary call didn't touch the high mapping already
         * and the physical address is inside the kernel map, we need
         * to touch the high mapped kernel as well:
         */
        if (!within(vaddr, (unsigned long)_text, _brk_end) &&
            __cpa_pfn_in_highmap(cpa->pfn)) {
                /* High-map virtual address of the same pfn. */
                unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
                                               __START_KERNEL_map - phys_base;
                alias_cpa = *cpa;
                alias_cpa.vaddr = &temp_cpa_vaddr;
                alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
                alias_cpa.curpage = 0;

                /*
                 * [_text, _brk_end) also covers data, do not modify NX except
                 * in cases where the highmap is the primary target.
                 */
                if (__supported_pte_mask & _PAGE_NX) {
                        alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
                        alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
                }

                cpa->force_flush_all = 1;
                /*
                 * The high mapping range is imprecise, so ignore the
                 * return value.
                 */
                __change_page_attr_set_clr(&alias_cpa, 0);
        }
#endif

        return 0;
}

/*
 * Walk the cpa->numpages pages described by @cpa and apply the attribute
 * change to each of them via __change_page_attr(). For primary requests the
 * aliases are processed too, unless CPA_NO_CHECK_ALIAS is set.
 *
 * Each step is done under cpa_lock, except when debug_pagealloc is enabled.
 * A step may cover more than one page when a large page was preserved;
 * array requests always advance one page at a time.
 *
 * cpa->curpage is advanced as pages are processed; cpa->numpages is restored
 * to its original value before returning.
 *
 * Returns 0 on success or the first error encountered.
 */
static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary)
{
        unsigned long total = cpa->numpages;
        unsigned long left = total;
        int err = 0;

        /*
         * No changes, easy!
         */
        if (!cpa->force_split &&
            !(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)))
                return err;

        while (left) {
                /*
                 * Store the remaining nr of pages for the large page
                 * preservation check; for array changes, we can't use
                 * large page, so process a single page.
                 */
                if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
                        cpa->numpages = 1;
                else
                        cpa->numpages = left;

                if (!debug_pagealloc_enabled())
                        spin_lock(&cpa_lock);
                err = __change_page_attr(cpa, primary);
                if (!debug_pagealloc_enabled())
                        spin_unlock(&cpa_lock);
                if (err)
                        break;

                if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) {
                        err = cpa_process_alias(cpa);
                        if (err)
                                break;
                }

                /*
                 * Adjust the number of pages with the result of the
                 * CPA operation. Either a large page has been
                 * preserved or a single page update happened.
                 */
                BUG_ON(cpa->numpages > left || !cpa->numpages);
                left -= cpa->numpages;
                cpa->curpage += cpa->numpages;
        }

        /* Restore the original numpages */
        cpa->numpages = total;
        return err;
}

/*
 * Common entry point for changing page attributes: set the bits in
 * @mask_set and clear the bits in @mask_clr for @numpages pages.
 *
 * @addr:        single start address, or an array of addresses when
 *               @in_flag contains CPA_ARRAY; unused with CPA_PAGES_ARRAY.
 * @force_split: process the request even when both masks are empty.
 * @in_flag:     CPA_* flags stored in the request.
 * @pages:       page array, stored in the request as cpa.pages.
 *
 * Unaligned addresses are rounded down to a page boundary in place, with a
 * one-time warning. After the change a flush is performed only if
 * CPA_FLUSHTLB was set in the request's flags: everything is flushed on
 * error, otherwise cpa_flush() is used.
 *
 * Returns 0 on success (or when there is nothing to do), otherwise the error
 * from __change_page_attr_set_clr().
 */
static int change_page_attr_set_clr(unsigned long *addr, int numpages,
                                    pgprot_t mask_set, pgprot_t mask_clr,
                                    int force_split, int in_flag,
                                    struct page **pages)
{
        struct cpa_data cpa;
        int ret, cache;

        memset(&cpa, 0, sizeof(cpa));

        /*
         * Check, if we are requested to set a not supported
         * feature.  Clearing non-supported features is OK.
         */
        mask_set = canon_pgprot(mask_set);

        if (!force_split && !pgprot_val(mask_set) && !pgprot_val(mask_clr))
                return 0;

        /* Ensure we are PAGE_SIZE aligned */
        if (in_flag & CPA_ARRAY) {
                int idx;

                for (idx = 0; idx < numpages; idx++) {
                        if (!(addr[idx] & ~PAGE_MASK))
                                continue;

                        addr[idx] &= PAGE_MASK;
                        WARN_ON_ONCE(1);
                }
        } else if (!(in_flag & CPA_PAGES_ARRAY)) {
                /*
                 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
                 * No need to check in that case
                 */
                if (*addr & ~PAGE_MASK) {
                        *addr &= PAGE_MASK;
                        /*
                         * People should not be passing in unaligned addresses:
                         */
                        WARN_ON_ONCE(1);
                }
        }

        /* Must avoid aliasing mappings in the highmem code */
        kmap_flush_unused();

        vm_unmap_aliases();

        /* Describe the request. */
        cpa.vaddr = addr;
        cpa.pages = pages;
        cpa.numpages = numpages;
        cpa.curpage = 0;
        cpa.flags = in_flag;
        cpa.force_split = force_split;
        cpa.mask_set = mask_set;
        cpa.mask_clr = mask_clr;

        ret = __change_page_attr_set_clr(&cpa, 1);

        /*
         * Check whether we really changed something:
         */
        if (!(cpa.flags & CPA_FLUSHTLB))
                return ret;

        /*
         * No need to flush caches, when we did not set any of the caching
         * attributes:
         */
        cache = !!pgprot2cachemode(mask_set);

        /*
         * On error; flush everything to be sure.
         */
        if (ret)
                cpa_flush_all(cache);
        else
                cpa_flush(&cpa, cache);

        return ret;
}

/*
 * Set the bits in @mask for @numpages pages; nothing is cleared.
 * A non-zero @array marks @addr as an array of addresses (CPA_ARRAY).
 * Returns the result of change_page_attr_set_clr().
 */
static inline int change_page_attr_set(unsigned long *addr, int numpages,
                                       pgprot_t mask, int array)
{
        int flags = array ? CPA_ARRAY : 0;

        return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
                                        flags, NULL);
}

/*
 * Clear the bits in @mask for @numpages pages; nothing is set.
 * A non-zero @array marks @addr as an array of addresses (CPA_ARRAY).
 * Returns the result of change_page_attr_set_clr().
 */
static inline int change_page_attr_clear(unsigned long *addr, int numpages,
                                         pgprot_t mask, int array)
{
        int flags = array ? CPA_ARRAY : 0;

        return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
                                        flags, NULL);
}

/*
 * Set the bits in @mask for the @numpages pages listed in @pages
 * (CPA_PAGES_ARRAY request, no address is passed); nothing is cleared.
 * Returns the result of change_page_attr_set_clr().
 */
static inline int cpa_set_pages_array(struct page **pages, int numpages,
                                       pgprot_t mask)
{
        pgprot_t none = __pgprot(0);

        return change_page_attr_set_clr(NULL, numpages, mask, none, 0,
                                        CPA_PAGES_ARRAY, pages);
}

/*
 * Clear the bits in @mask for the @numpages pages listed in @pages
 * (CPA_PAGES_ARRAY request, no address is passed); nothing is set.
 * Returns the result of change_page_attr_set_clr().
 */
static inline int cpa_clear_pages_array(struct page **pages, int numpages,
                                         pgprot_t mask)
{
        pgprot_t none = __pgprot(0);

        return change_page_attr_set_clr(NULL, numpages, none, mask, 0,
                                        CPA_PAGES_ARRAY, pages);
}

/*
 * __set_memory_prot is an internal helper for callers that have been passed
 * a pgprot_t value from upper layers and for which a reservation has already
 * been taken. To change an individual page attribute, use the
 * set_memory_xx() functions instead.
 *
 * The bits in @prot are requested to be set and every bit not in @prot is
 * requested to be cleared (the clear mask is the complement of @prot).
 * Returns the result of change_page_attr_set_clr().
 */
int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
{
        pgprot_t everything_else = __pgprot(~pgprot_val(prot));

        return change_page_attr_set_clr(&addr, numpages, prot,
                                        everything_else, 0, 0, NULL);
}

/*
 * Set the cache attribute bits for _PAGE_CACHE_MODE_UC_MINUS on @numpages
 * pages at @addr, without taking a memtype reservation.
 *
 * For now this is UC MINUS, see the comments in ioremap(). If you really
 * need strong UC use ioremap_uc(), but note that you cannot override IO
 * areas with set_memory_*() as these helpers cannot work with IO memory.
 *
 * Returns the result of change_page_attr_set().
 */
int _set_memory_uc(unsigned long addr, int numpages)
{
        int ret;

        ret = change_page_attr_set(&addr, numpages,
                                   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
                                   0);
        return ret;
}

/*
 * Reserve the physical range behind @addr as _PAGE_CACHE_MODE_UC_MINUS and
 * then switch the mapping via _set_memory_uc() (for now UC MINUS, see the
 * comments in ioremap()).
 *
 * The reservation is released again when changing the mapping fails.
 * Returns 0 on success, otherwise the error from memtype_reserve() or
 * _set_memory_uc().
 */
int set_memory_uc(unsigned long addr, int numpages)
{
        int ret;

        ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
                              _PAGE_CACHE_MODE_UC_MINUS, NULL);
        if (ret)
                return ret;

        ret = _set_memory_uc(addr, numpages);
        if (!ret)
                return 0;

        /* Undo the reservation taken above. */
        memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
        return ret;
}
EXPORT_SYMBOL(set_memory_uc);

/*
 * Switch @numpages pages at @addr to write-combining in two steps, without
 * taking a memtype reservation: first set the _PAGE_CACHE_MODE_UC_MINUS
 * bits, then, if that succeeded, set the _PAGE_CACHE_MODE_WC bits while
 * clearing _PAGE_CACHE_MASK.
 *
 * Returns 0 on success or the error of the step that failed.
 */
int _set_memory_wc(unsigned long addr, int numpages)
{
        int ret = change_page_attr_set(&addr, numpages,
                                       cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
                                       0);

        if (ret)
                return ret;

        return change_page_attr_set_clr(&addr, numpages,
                                        cachemode2pgprot(_PAGE_CACHE_MODE_WC),
                                        __pgprot(_PAGE_CACHE_MASK),
                                        0, 0, NULL);
}

/*
 * Reserve the physical range behind @addr as _PAGE_CACHE_MODE_WC and then
 * switch the mapping via _set_memory_wc().
 *
 * The reservation is released again when changing the mapping fails.
 * Returns 0 on success, otherwise the error from memtype_reserve() or
 * _set_memory_wc().
 */
int set_memory_wc(unsigned long addr, int numpages)
{
        int ret = memtype_reserve(__pa(addr),
                                  __pa(addr) + numpages * PAGE_SIZE,
                                  _PAGE_CACHE_MODE_WC, NULL);

        if (!ret) {
                ret = _set_memory_wc(addr, numpages);
                /* Undo the reservation when the mapping change failed. */
                if (ret)
                        memtype_free(__pa(addr),
                                     __pa(addr) + numpages * PAGE_SIZE);
        }

        return ret;
}
EXPORT_SYMBOL(set_memory_wc);

/*
 * Set the cache attribute bits for _PAGE_CACHE_MODE_WT on @numpages pages
 * at @addr, without taking a memtype reservation.
 * Returns the result of change_page_attr_set().
 */
int _set_memory_wt(unsigned long addr, int numpages)
{
        int ret;

        ret = change_page_attr_set(&addr, numpages,
                                   cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
        return ret;
}

/*
 * Switch @numpages pages at @addr back to write-back by clearing every bit
 * in _PAGE_CACHE_MASK; WB cache mode is hard wired to all cache attribute
 * bits being 0. No memtype reservation is touched.
 * Returns the result of change_page_attr_clear().
 */
int _set_memory_wb(unsigned long addr, int numpages)
{
        int ret;

        ret = change_page_attr_clear(&addr, numpages,
                                     __pgprot(_PAGE_CACHE_MASK), 0);
        return ret;
}

/*
 * Switch @numpages pages at @addr back to write-back via _set_memory_wb()
 * and, only when that succeeded, release the memtype reservation for the
 * physical range.
 *
 * Returns 0 on success or the error from _set_memory_wb().
 */
int set_memory_wb(unsigned long addr, int numpages)
{
        int ret = _set_memory_wb(addr, numpages);

        if (!ret)
                memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);

        return ret;
}
EXPORT_SYMBOL(set_memory_wb);

/* Prevent speculative access to a page by marking it not-present */
#ifdef CONFIG_X86_64
/*
 * Mark the 1:1 mapping of @pfn not-present so it can no longer be accessed
 * speculatively.
 *
 * Returns 0 on success (or when @pfn is a platform page, which is skipped),
 * otherwise the error from set_memory_np(), after logging a warning.
 */
int set_mce_nospec(unsigned long pfn)
{
        unsigned long decoy;
        int err;

        /* SGX pages are not in the 1:1 map */
        if (arch_is_platform_page(pfn << PAGE_SHIFT))
                return 0;

        /*
         * We would like to just call:
         *      set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
         * but doing that would radically increase the odds of a
         * speculative access to the poison page because we'd have
         * the virtual address of the kernel 1:1 mapping sitting
         * around in registers.
         * Instead we get tricky.  We create a non-canonical address
         * that looks just like the one we want, but has bit 63 flipped.
         * This relies on set_memory_XX() properly sanitizing any __pa()
         * results with __PHYSICAL_MASK or PTE_PFN_MASK.
         */
        decoy = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));

        err = set_memory_np(decoy, 1);
        if (!err)
                return err;

        pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
        return err;
}

/*
 * Restore full speculative operation to the pfn: mark the kernel address of
 * @pfn (as given by pfn_to_kaddr()) present again.
 * Returns the result of set_memory_p().
 */
int clear_mce_nospec(unsigned long pfn)
{
        return set_memory_p((unsigned long) pfn_to_kaddr(pfn), 1);
}
EXPORT_SYMBOL_GPL(clear_mce_nospec);
#endif /* CONFIG_X86_64 */

/*
 * Make @numpages pages at @addr executable by clearing _PAGE_NX.
 * This is a no-op returning 0 when _PAGE_NX is not part of
 * __supported_pte_mask; otherwise returns the result of
 * change_page_attr_clear().
 */
int set_memory_x(unsigned long addr, int numpages)
{
        if (__supported_pte_mask & _PAGE_NX)
                return change_page_attr_clear(&addr, numpages,
                                              __pgprot(_PAGE_NX), 0);

        return 0;
}

/*
 * Make @numpages pages at @addr non-executable by setting _PAGE_NX.
 * This is a no-op returning 0 when _PAGE_NX is not part of
 * __supported_pte_mask; otherwise returns the result of
 * change_page_attr_set().
 */
int set_memory_nx(unsigned long addr, int numpages)
{
        if (__supported_pte_mask & _PAGE_NX)
                return change_page_attr_set(&addr, numpages,
                                            __pgprot(_PAGE_NX), 0);

        return 0;
}

/*
 * Make @numpages pages at @addr read-only by clearing _PAGE_RW; _PAGE_DIRTY
 * is cleared along with it.
 * Returns the result of change_page_attr_clear().
 */
int set_memory_ro(unsigned long addr, int numpages)
{
        int ret;

        ret = change_page_attr_clear(&addr, numpages,
                                     __pgprot(_PAGE_RW | _PAGE_DIRTY), 0);
        return ret;
}

/*
 * Make @numpages pages at @addr read-only and executable in one operation:
 * _PAGE_RW and _PAGE_DIRTY are cleared, and so is _PAGE_NX when it is part
 * of __supported_pte_mask.
 * Returns the result of change_page_attr_clear().
 */
int set_memory_rox(unsigned long addr, int numpages)
{
        pgprot_t drop = __pgprot(_PAGE_RW | _PAGE_DIRTY);

        if (__supported_pte_mask & _PAGE_NX)
                drop.pgprot = drop.pgprot | _PAGE_NX;

        return change_page_attr_clear(&addr, numpages, drop, 0);
}

/*
 * Make @numpages pages at @addr writable by setting _PAGE_RW.
 * Returns the result of change_page_attr_set().
 */
int set_memory_rw(unsigned long addr, int numpages)
{
        int ret;

        ret = change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
        return ret;
}

/*
 * Mark @numpages pages at @addr not-present by clearing _PAGE_PRESENT.
 * Returns the result of change_page_attr_clear().
 */
int set_memory_np(unsigned long addr, int numpages)
{
        int ret;

        ret = change_page_attr_clear(&addr, numpages,
                                     __pgprot(_PAGE_PRESENT), 0);
        return ret;
}

/*
 * Mark @numpages pages at @addr not-present by clearing _PAGE_PRESENT, with
 * CPA_NO_CHECK_ALIAS set so that alias mappings are not processed.
 * Returns the result of change_page_attr_set_clr().
 */
int set_memory_np_noalias(unsigned long addr, int numpages)
{
        int ret;

        ret = change_page_attr_set_clr(&addr, numpages, __pgprot(0),
                                       __pgprot(_PAGE_PRESENT), 0,
                                       CPA_NO_CHECK_ALIAS, NULL);
        return ret;
}

/*
 * Mark @numpages pages at @addr present by setting _PAGE_PRESENT.
 * Returns the result of change_page_attr_set().
 */
int set_memory_p(unsigned long addr, int numpages)
{
        int ret;

        ret = change_page_attr_set(&addr, numpages,
                                   __pgprot(_PAGE_PRESENT), 0);
        return ret;
}

/*
 * Issue a change request for @numpages pages at @addr with both masks empty
 * and force_split set, so no attribute bit is set or cleared and only the
 * forced split is requested.
 * Returns the result of change_page_attr_set_clr().
 */
int set_memory_4k(unsigned long addr, int numpages)
{
        int ret;

        ret = change_page_attr_set_clr(&addr, numpages, __pgprot(0),
                                       __pgprot(0), 1, 0, NULL);
        return ret;
}

/*
 * Clear _PAGE_GLOBAL on @numpages pages at @addr.
 * Returns the result of change_page_attr_clear().
 */
int set_memory_nonglobal(unsigned long addr, int numpages)
{
        int ret;

        ret = change_page_attr_clear(&addr, numpages,
                                     __pgprot(_PAGE_GLOBAL), 0);
        return ret;
}

/*
 * Set _PAGE_GLOBAL on @numpages pages at @addr.
 * Returns the result of change_page_attr_set().
 */
int set_memory_global(unsigned long addr, int numpages)
{
        int ret;

        ret = change_page_attr_set(&addr, numpages,
                                   __pgprot(_PAGE_GLOBAL), 0);
        return ret;
}

/*
 * __set_memory_enc_pgtable() is used for the hypervisors that get
 * informed about "encryption" status via page tables.
 *
 * Switches @numpages pages at @addr (rounded down to a page boundary, with
 * a one-time warning, if misaligned) to encrypted (@enc true) or decrypted
 * (@enc false) in init_mm's page tables. The sequence is: optional flush,
 * notify the hypervisor (prepare), change the page attributes, flush TLBs,
 * notify the hypervisor (finish).
 *
 * Returns 0 on success, the error from __change_page_attr_set_clr(), or
 * -EIO (with a one-time warning) when either hypervisor notification
 * reports failure.
 */
static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
{
        pgprot_t empty = __pgprot(0);
        struct cpa_data cpa;
        int ret;

        /* Should not be working on unaligned addresses */
        if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
                addr &= PAGE_MASK;

        memset(&cpa, 0, sizeof(cpa));
        cpa.vaddr = &addr;
        cpa.numpages = numpages;
        /* The set and clear masks are swapped depending on the direction. */
        cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty);
        cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty);
        cpa.pgd = init_mm.pgd;

        /* Must avoid aliasing mappings in the highmem code */
        kmap_flush_unused();
        vm_unmap_aliases();

        /* Flush the caches as needed before changing the encryption attribute. */
        if (x86_platform.guest.enc_tlb_flush_required(enc))
                cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required());

        /* Notify hypervisor that we are about to set/clr encryption attribute. */
        if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc))
                goto vmm_fail;

        ret = __change_page_attr_set_clr(&cpa, 1);

        /*
         * After changing the encryption attribute, we need to flush TLBs again
         * in case any speculative TLB caching occurred (but no need to flush
         * caches again).  We could just use cpa_flush_all(), but in case TLB
         * flushing gets optimized in the cpa_flush() path use the same logic
         * as above.
         */
        cpa_flush(&cpa, 0);

        /* The flush above happens even when the attribute change failed. */
        if (ret)
                return ret;

        /* Notify hypervisor that we have successfully set/clr encryption attribute. */
        if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
                goto vmm_fail;

        return 0;

vmm_fail:
        WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s.\n",
                  (void *)addr, numpages, enc ? "private" : "shared");

        return -EIO;
}

/*
 * Common entry point for set_memory_encrypted()/set_memory_decrypted().
 *
 * When the platform does not report CC_ATTR_MEM_ENCRYPT there is nothing
 * to change and 0 is returned; otherwise the request is handed to
 * __set_memory_enc_pgtable() and its result is returned.
 */
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
        /* No memory encryption on this platform: succeed without touching anything */
        if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
                return 0;

        return __set_memory_enc_pgtable(addr, numpages, enc);
}

/*
 * Request the encrypted attribute for @numpages pages starting at @addr.
 * Forwards to __set_memory_enc_dec() with enc == true and returns its result.
 */
int set_memory_encrypted(unsigned long addr, int numpages)
{
        return __set_memory_enc_dec(addr, numpages, true);
}
EXPORT_SYMBOL_GPL(set_memory_encrypted);

/*
 * Request the decrypted attribute for @numpages pages starting at @addr.
 * Forwards to __set_memory_enc_dec() with enc == false and returns its result.
 */
int set_memory_decrypted(unsigned long addr, int numpages)
{
        return __set_memory_enc_dec(addr, numpages, false);
}
EXPORT_SYMBOL_GPL(set_memory_decrypted);

/*
 * struct page flavour of set_memory_uc(): translate @page to its
 * page_address() and apply set_memory_uc() to @numpages pages from there.
 * Returns the result of set_memory_uc().
 */
int set_pages_uc(struct page *page, int numpages)
{
        return set_memory_uc((unsigned long)page_address(page), numpages);
}
EXPORT_SYMBOL(set_pages_uc);

static int _set_pages_array(struct page **pages, int numpages,
                enum page_cache_mode new_type)
{
        unsigned long start;
        unsigned long end;
        enum page_cache_mode set_type;
        int i;
        int free_idx;
        int ret;

        for (i = 0; i < numpages; i++) {
                if (PageHighMem(pages[i]))
                        continue;
                start = page_to_pfn(pages[i]) << PAGE_SHIFT;
                end = start + PAGE_SIZE;
                if (memtype_reserve(start, end, new_type, NULL))
                        goto err_out;
        }

        /* If WC, set to UC- first and then WC */
        set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
                                _PAGE_CACHE_MODE_UC_MINUS : new_type;

        ret = cpa_set_pages_array(pages, numpages,
                                  cachemode2pgprot(set_type));
        if (!ret && new_type == _PAGE_CACHE_MODE_WC)
                ret = change_page_attr_set_clr(NULL, numpages,
                                               cachemode2pgprot(
                                                _PAGE_CACHE_MODE_WC),
                                               __pgprot(_PAGE_CACHE_MASK),
                                               0, CPA_PAGES_ARRAY, pages);
        if (ret)
                goto err_out;
        return 0; /* Success */
err_out:
        free_idx = i;
        for (i = 0; i < free_idx; i++) {
                if (PageHighMem(pages[i]))
                        continue;
                start = page_to_pfn(pages[i]) << PAGE_SHIFT;
                end = start + PAGE_SIZE;
                memtype_free(start, end);
        }
        return -EINVAL;
}

/*
 * Switch the @numpages pages listed in @pages to _PAGE_CACHE_MODE_UC_MINUS
 * via _set_pages_array(). Returns 0 on success, -EINVAL on failure.
 */
int set_pages_array_uc(struct page **pages, int numpages)
{
        return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS);
}
EXPORT_SYMBOL(set_pages_array_uc);

/*
 * Switch the @numpages pages listed in @pages to _PAGE_CACHE_MODE_WC
 * via _set_pages_array(), which goes through UC- before applying WC.
 * Returns 0 on success, -EINVAL on failure.
 */
int set_pages_array_wc(struct page **pages, int numpages)
{
        return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC);
}
EXPORT_SYMBOL(set_pages_array_wc);

/*
 * struct page flavour of set_memory_wb(): translate @page to its
 * page_address() and apply set_memory_wb() to @numpages pages from there.
 * Returns the result of set_memory_wb().
 */
int set_pages_wb(struct page *page, int numpages)
{
        return set_memory_wb((unsigned long)page_address(page), numpages);
}
EXPORT_SYMBOL(set_pages_wb);

/*
 * Put the @numpages pages listed in @pages back to WB by clearing all
 * cache attribute bits, then release the memtype reservation of every
 * non-highmem page.
 *
 * Returns 0 on success, or the error from cpa_clear_pages_array(), in
 * which case no reservation is released.
 */
int set_pages_array_wb(struct page **pages, int numpages)
{
        unsigned long pa;
        int idx;
        int err;

        /* WB cache mode is hard wired to all cache attribute bits being 0 */
        err = cpa_clear_pages_array(pages, numpages,
                                    __pgprot(_PAGE_CACHE_MASK));
        if (err)
                return err;

        for (idx = 0; idx < numpages; idx++) {
                if (!PageHighMem(pages[idx])) {
                        pa = page_to_pfn(pages[idx]) << PAGE_SHIFT;
                        memtype_free(pa, pa + PAGE_SIZE);
                }
        }

        return 0;
}
EXPORT_SYMBOL(set_pages_array_wb);

/*
 * struct page flavour of set_memory_ro(): translate @page to its
 * page_address() and apply set_memory_ro() to @numpages pages from there.
 * Returns the result of set_memory_ro().
 */
int set_pages_ro(struct page *page, int numpages)
{
        return set_memory_ro((unsigned long)page_address(page), numpages);
}

/*
 * struct page flavour of set_memory_rw(): translate @page to its
 * page_address() and apply set_memory_rw() to @numpages pages from there.
 * Returns the result of set_memory_rw().
 */
int set_pages_rw(struct page *page, int numpages)
{
        return set_memory_rw((unsigned long)page_address(page), numpages);
}

/*
 * Set _PAGE_PRESENT and _PAGE_RW on @numpages pages starting at the
 * page_address() of @page. Returns the result of
 * __change_page_attr_set_clr().
 */
static int __set_pages_p(struct page *page, int numpages)
{
        unsigned long vaddr = (unsigned long) page_address(page);
        struct cpa_data cpa = {
                .vaddr = &vaddr,
                .pgd = NULL,
                .numpages = numpages,
                .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
                .mask_clr = __pgprot(0),
                .flags = CPA_NO_CHECK_ALIAS,
        };

        /*
         * Alias checking is deliberately skipped when setting the present
         * flag: with it we might have to break large pages for the 64-bit
         * kernel text mappings, which adds complexity (especially from
         * atomic context). Keep it simple.
         */
        return __change_page_attr_set_clr(&cpa, 1);
}

/*
 * Clear _PAGE_PRESENT and _PAGE_RW on @numpages pages starting at the
 * page_address() of @page. Returns the result of
 * __change_page_attr_set_clr().
 */
static int __set_pages_np(struct page *page, int numpages)
{
        unsigned long vaddr = (unsigned long) page_address(page);
        struct cpa_data cpa = {
                .vaddr = &vaddr,
                .pgd = NULL,
                .numpages = numpages,
                .mask_set = __pgprot(0),
                .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
                .flags = CPA_NO_CHECK_ALIAS,
        };

        /*
         * Alias checking is deliberately skipped when clearing the present
         * flag: with it we might have to break large pages for the 64-bit
         * kernel text mappings, which adds complexity (especially from
         * atomic context). Keep it simple.
         */
        return __change_page_attr_set_clr(&cpa, 1);
}

/*
 * Clear _PAGE_PRESENT and _PAGE_RW for the single page @page at its
 * page_address() mapping, via __set_pages_np(). No TLB flush is issued
 * here; presumably the caller is responsible for it (confirm against
 * callers). Returns the result of __set_pages_np().
 */
int set_direct_map_invalid_noflush(struct page *page)
{
        return __set_pages_np(page, 1);
}

/*
 * Set _PAGE_PRESENT and _PAGE_RW for the single page @page at its
 * page_address() mapping, via __set_pages_p(). No TLB flush is issued
 * here; presumably the caller is responsible for it (confirm against
 * callers). Returns the result of __set_pages_p().
 */
int set_direct_map_default_noflush(struct page *page)
{
        return __set_pages_p(page, 1);
}

#ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * Map (@enable != 0) or unmap (@enable == 0) @numpages pages starting at
 * @page in the kernel mapping, then flush the TLB of the current CPU only.
 * Highmem pages are ignored. Before unmapping, the range is checked for
 * locks that are still held.
 */
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
        if (PageHighMem(page))
                return;

        /*
         * The return value is ignored as the calls cannot fail.
         * Large pages for identity mappings are not used at boot time
         * and hence no memory allocations during large page split.
         */
        if (enable) {
                __set_pages_p(page, numpages);
        } else {
                debug_check_no_locks_freed(page_address(page),
                                           numpages * PAGE_SIZE);
                __set_pages_np(page, numpages);
        }

        /*
         * We should perform an IPI and flush all tlbs,
         * but that can deadlock->flush only current cpu.
         * Preemption needs to be disabled around __flush_tlb_all() due to
         * CR3 reload in __native_flush_tlb().
         */
        preempt_disable();
        __flush_tlb_all();
        preempt_enable();

        arch_flush_lazy_mmu_mode();
}
#endif /* CONFIG_DEBUG_PAGEALLOC */

/*
 * Report whether @page is mapped present at its page_address() in the
 * kernel page tables.
 *
 * Returns false for highmem pages, false when no page table entry can be
 * found for the address, and otherwise whether _PAGE_PRESENT is set in the
 * entry that lookup_address() returned.
 */
bool kernel_page_present(struct page *page)
{
        unsigned int level;
        pte_t *pte;

        if (PageHighMem(page))
                return false;

        pte = lookup_address((unsigned long)page_address(page), &level);

        /*
         * lookup_address() can come back with NULL when no entry exists for
         * the address; treat that as "not present" instead of dereferencing.
         */
        if (!pte)
                return false;

        return (pte_val(*pte) & _PAGE_PRESENT);
}

/*
 * Install a mapping of @numpages pages at virtual @address, targeting page
 * frame @pfn, in the page table rooted at @pgd, with _PAGE_PRESENT plus
 * @page_flags set. NX and RW are cleared when absent from @page_flags, and
 * the encryption bits are cleared as well when _PAGE_ENC is absent.
 *
 * Only the local TLB is flushed, hence the warning when more than one CPU
 * is online.
 *
 * Returns -EINVAL when _PAGE_NX is not in __supported_pte_mask, otherwise
 * the result of __change_page_attr_set_clr().
 */
int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
                                   unsigned numpages, unsigned long page_flags)
{
        struct cpa_data cpa = {
                .vaddr = &address,
                .pfn = pfn,
                .pgd = pgd,
                .numpages = numpages,
                .mask_set = __pgprot(_PAGE_PRESENT | page_flags),
                .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)),
                .flags = CPA_NO_CHECK_ALIAS,
        };
        int ret;

        WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");

        /* Refuse when the supported PTE mask lacks NX; nothing is changed */
        if (!(__supported_pte_mask & _PAGE_NX))
                return -EINVAL;

        /* Caller did not ask for an encrypted mapping: clear those bits too */
        if (!(page_flags & _PAGE_ENC))
                cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);

        ret = __change_page_attr_set_clr(&cpa, 1);
        __flush_tlb_all();

        return ret;
}

/*
 * __flush_tlb_all() flushes mappings only on current CPU and hence this
 * function shouldn't be used in an SMP environment. Presently, it's used only
 * during boot (way before smp_init()) by EFI subsystem and hence is ok.
 */
int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
                                     unsigned long numpages)
{
        /*
         * Unmapping normally means finding the pte with
         * lookup_address_in_pgd() (which ideally never returns NULL, since
         * the address is already mapped) and changing its protections.
         * The pfn is the *target* of a mapping, so it carries no
         * information when unmapping and is left at 0.
         */
        struct cpa_data cpa = {
                .vaddr = &address,
                .pfn = 0,
                .pgd = pgd,
                .numpages = numpages,
                .mask_set = __pgprot(0),
                .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
                .flags = CPA_NO_CHECK_ALIAS,
        };
        int ret;

        WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");

        /* Clear present/RW, then flush the local TLB regardless of the result */
        ret = __change_page_attr_set_clr(&cpa, 1);
        __flush_tlb_all();

        return ret;
}

/*
 * The testcases use internal knowledge of the implementation that shouldn't
 * be exposed to the rest of the kernel. Include these directly here.
 */
#ifdef CONFIG_CPA_DEBUG
#include "cpa-test.c"
#endif




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 



    2 











































































































    1 


































































































































































































































































































































































    1 


    1 



































































































































































































































































































































































































    1 





    1 






    1 






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 







    1 
















    1 
    1 







































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
11112
11113
11114
11115
11116
11117
11118
11119
11120
11121
11122
11123
11124
11125
11126
11127
11128
11129
11130
11131
11132
11133
11134
11135
11136
11137
11138
11139
11140
11141
11142
11143
11144
11145
11146
11147
11148
11149
11150
11151
11152
11153
11154
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165
11166
11167
11168
11169
11170
11171
11172
11173
11174
11175
11176
11177
11178
11179
11180
11181
11182
11183
11184
11185
11186
11187
11188
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
11202
11203
11204
11205
11206
11207
11208
11209
11210
11211
11212
11213
11214
11215
11216
11217
11218
11219
11220
11221
11222
11223
11224
11225
11226
11227
11228
11229
11230
11231
11232
11233
11234
11235
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245
11246
11247
11248
11249
11250
11251
11252
11253
11254
11255
11256
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343
11344
11345
11346
11347
11348
11349
11350
11351
11352
11353
11354
11355
11356
11357
11358
11359
11360
11361
11362
11363
11364
11365
11366
11367
11368
11369
11370
11371
11372
11373
11374
11375
11376
11377
11378
11379
11380
11381
11382
11383
11384
11385
11386
11387
11388
11389
11390
11391
11392
11393
11394
11395
11396
11397
11398
11399
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409
11410
11411
11412
11413
11414
11415
11416
11417
11418
11419
11420
11421
11422
11423
11424
11425
11426
11427
11428
11429
11430
11431
11432
11433
11434
11435
11436
11437
11438
11439
11440
11441
11442
11443
11444
11445
11446
11447
11448
11449
11450
11451
11452
11453
11454
11455
11456
11457
11458
11459
11460
11461
11462
11463
11464
11465
11466
11467
11468
11469
11470
11471
11472
11473
11474
11475
11476
11477
11478
11479
11480
11481
11482
11483
11484
11485
11486
11487
11488
11489
11490
11491
11492
11493
11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510
11511
11512
11513
11514
11515
11516
11517
11518
11519
11520
11521
11522
11523
11524
11525
11526
11527
11528
11529
11530
11531
11532
11533
11534
11535
11536
11537
11538
11539
11540
11541
11542
11543
11544
11545
11546
11547
11548
11549
11550
11551
11552
11553
11554
11555
11556
11557
11558
11559
11560
11561
11562
11563
11564
11565
11566
11567
11568
11569
11570
11571
11572
11573
11574
11575
11576
11577
11578
11579
11580
11581
11582
11583
11584
11585
11586
11587
11588
11589
11590
11591
11592
11593
11594
11595
11596
11597
11598
11599
11600
11601
11602
11603
11604
11605
11606
11607
11608
11609
11610
11611
11612
11613
11614
11615
11616
11617
11618
11619
11620
11621
11622
11623
11624
11625
11626
11627
11628
11629
11630
11631
11632
11633
11634
11635
11636
11637
11638
11639
11640
11641
11642
11643
11644
11645
11646
11647
11648
11649
11650
11651
11652
11653
11654
11655
11656
11657
11658
11659
11660
11661
11662
11663
11664
11665
11666
11667
11668
11669
11670
11671
11672
11673
11674
11675
11676
11677
11678
11679
11680
11681
11682
11683
11684
11685
11686
11687
11688
11689
11690
11691
11692
11693
11694
11695
11696
11697
11698
11699
11700
11701
11702
11703
11704
11705
11706
11707
11708
11709
11710
11711
11712
11713
11714
11715
11716
11717
11718
11719
11720
11721
11722
11723
11724
11725
11726
11727
11728
11729
11730
11731
11732
11733
11734
11735
11736
11737
11738
11739
11740
11741
11742
11743
11744
11745
11746
11747
11748
11749
11750
11751
11752
11753
11754
11755
11756
11757
11758
11759
11760
11761
11762
11763
11764
11765
11766
11767
11768
11769
11770
11771
11772
11773
11774
11775
11776
11777
11778
11779
11780
11781
11782
11783
11784
11785
11786
11787
11788
11789
11790
11791
11792
11793
11794
11795
11796
11797
11798
11799
11800
11801
11802
11803
11804
11805
11806
11807
11808
11809
11810
11811
11812
11813
11814
11815
11816
11817
11818
11819
11820
11821
11822
11823
11824
11825
11826
11827
11828
11829
11830
11831
11832
11833
11834
11835
11836
11837
11838
11839
11840
11841
11842
11843
11844
11845
11846
11847
11848
11849
11850
11851
11852
11853
11854
11855
11856
11857
11858
11859
11860
11861
11862
11863
11864
11865
11866
11867
11868
11869
11870
11871
11872
11873
11874
11875
11876
11877
11878
11879
11880
11881
11882
11883
11884
11885
11886
11887
11888
11889
11890
11891
11892
11893
11894
11895
11896
11897
11898
11899
11900
11901
11902
11903
11904
11905
11906
11907
11908
11909
11910
11911
11912
11913
11914
11915
11916
11917
11918
11919
11920
11921
11922
11923
11924
11925
11926
11927
11928
11929
11930
11931
11932
11933
11934
11935
11936
11937
11938
11939
11940
11941
11942
11943
11944
11945
11946
11947
11948
11949
11950
11951
11952
11953
11954
11955
11956
11957
11958
11959
11960
11961
11962
11963
11964
11965
11966
11967
11968
11969
11970
11971
11972
11973
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985
11986
11987
11988
11989
11990
11991
11992
11993
11994
11995
11996
11997
11998
11999
12000
12001
12002
12003
12004
12005
12006
12007
12008
12009
12010
12011
12012
12013
12014
12015
12016
12017
12018
12019
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031
12032
12033
12034
12035
12036
12037
12038
12039
12040
12041
12042
12043
12044
12045
12046
12047
12048
12049
12050
12051
12052
12053
12054
12055
12056
12057
12058
12059
12060
12061
12062
12063
12064
12065
12066
12067
12068
12069
12070
12071
12072
12073
12074
12075
12076
12077
12078
12079
12080
12081
12082
12083
12084
12085
12086
12087
12088
12089
12090
12091
12092
12093
12094
12095
12096
12097
12098
12099
12100
12101
12102
12103
12104
12105
12106
12107
12108
12109
12110
12111
12112
12113
12114
12115
12116
12117
12118
12119
12120
12121
12122
12123
12124
12125
12126
12127
12128
12129
12130
12131
12132
12133
12134
12135
12136
12137
12138
12139
12140
12141
12142
12143
12144
12145
12146
12147
12148
12149
12150
12151
12152
12153
12154
12155
12156
12157
12158
12159
12160
12161
12162
12163
12164
12165
12166
12167
12168
12169
12170
12171
12172
12173
12174
12175
12176
12177
12178
12179
12180
12181
12182
12183
12184
12185
12186
12187
12188
12189
12190
12191
12192
12193
12194
12195
12196
12197
12198
12199
12200
12201
12202
12203
12204
12205
12206
12207
12208
12209
12210
12211
12212
12213
12214
12215
12216
12217
12218
12219
12220
12221
12222
12223
12224
12225
12226
12227
12228
12229
12230
12231
12232
12233
12234
12235
12236
12237
12238
12239
12240
12241
12242
12243
12244
12245
12246
12247
12248
12249
12250
12251
12252
12253
12254
12255
12256
12257
12258
12259
12260
12261
12262
12263
12264
12265
12266
12267
12268
12269
12270
12271
12272
12273
12274
12275
12276
12277
12278
12279
12280
12281
12282
12283
12284
12285
12286
12287
12288
12289
12290
12291
12292
12293
12294
12295
12296
12297
12298
12299
12300
12301
12302
12303
12304
12305
12306
12307
12308
12309
12310
12311
12312
12313
12314
12315
12316
12317
12318
12319
12320
12321
12322
12323
12324
12325
12326
12327
12328
12329
12330
12331
12332
12333
12334
12335
12336
12337
12338
12339
12340
12341
12342
12343
12344
12345
12346
12347
12348
12349
12350
12351
12352
12353
12354
12355
12356
12357
12358
12359
12360
12361
12362
12363
12364
12365
12366
12367
12368
12369
12370
12371
12372
12373
12374
12375
12376
12377
12378
12379
12380
12381
12382
12383
12384
12385
12386
12387
12388
12389
12390
12391
12392
12393
12394
12395
12396
12397
12398
12399
12400
12401
12402
12403
12404
12405
12406
12407
12408
12409
12410
12411
12412
12413
12414
12415
12416
12417
12418
12419
12420
12421
12422
12423
12424
12425
12426
12427
12428
12429
12430
12431
12432
12433
12434
12435
12436
12437
12438
12439
12440
12441
12442
12443
12444
12445
12446
12447
12448
12449
12450
12451
12452
12453
12454
12455
12456
12457
12458
12459
12460
12461
12462
12463
12464
12465
12466
12467
12468
12469
12470
12471
12472
12473
12474
12475
12476
12477
12478
12479
12480
12481
12482
12483
12484
12485
12486
12487
12488
12489
12490
12491
12492
12493
12494
12495
12496
12497
12498
12499
12500
12501
12502
12503
12504
12505
12506
12507
12508
12509
12510
12511
12512
12513
12514
12515
12516
12517
12518
12519
12520
12521
12522
12523
12524
12525
12526
12527
12528
12529
12530
12531
12532
12533
12534
12535
12536
12537
12538
12539
12540
12541
12542
12543
12544
12545
12546
12547
12548
12549
12550
12551
12552
12553
12554
12555
12556
12557
12558
12559
12560
12561
12562
12563
12564
12565
12566
12567
12568
12569
12570
12571
12572
12573
12574
12575
12576
12577
12578
12579
12580
12581
12582
12583
12584
12585
12586
12587
12588
12589
12590
12591
12592
12593
12594
12595
12596
12597
12598
12599
12600
12601
12602
12603
12604
12605
12606
12607
12608
12609
12610
12611
12612
12613
12614
12615
12616
12617
12618
12619
12620
12621
12622
12623
12624
12625
12626
12627
12628
12629
12630
12631
12632
12633
12634
12635
12636
12637
12638
12639
12640
12641
12642
12643
12644
12645
12646
12647
12648
12649
12650
12651
12652
12653
12654
12655
12656
12657
12658
12659
12660
12661
12662
12663
12664
12665
12666
12667
12668
12669
12670
12671
12672
12673
12674
12675
12676
12677
12678
12679
12680
12681
12682
12683
12684
12685
12686
12687
12688
12689
12690
12691
12692
12693
12694
12695
12696
12697
12698
12699
12700
12701
12702
12703
12704
12705
12706
12707
12708
12709
12710
12711
12712
12713
12714
12715
12716
12717
12718
12719
12720
12721
12722
12723
12724
12725
12726
12727
12728
12729
12730
12731
12732
12733
12734
12735
12736
12737
12738
12739
12740
12741
12742
12743
12744
12745
12746
12747
12748
12749
12750
12751
12752
12753
12754
12755
12756
12757
12758
12759
12760
12761
12762
12763
12764
12765
12766
12767
12768
12769
12770
12771
12772
12773
12774
12775
12776
12777
12778
12779
12780
12781
12782
12783
12784
12785
12786
12787
12788
12789
12790
12791
12792
12793
12794
12795
12796
12797
12798
12799
12800
12801
12802
12803
12804
12805
12806
12807
12808
12809
12810
12811
12812
12813
12814
12815
12816
12817
12818
12819
12820
12821
12822
12823
12824
12825
12826
12827
12828
12829
12830
12831
12832
12833
12834
12835
12836
12837
12838
12839
12840
12841
12842
12843
12844
12845
12846
12847
12848
12849
12850
12851
12852
12853
12854
12855
12856
12857
12858
12859
12860
12861
12862
12863
12864
12865
12866
12867
12868
12869
12870
12871
12872
12873
12874
12875
12876
12877
12878
12879
12880
12881
12882
12883
12884
12885
12886
12887
12888
12889
12890
12891
12892
12893
12894
12895
12896
12897
12898
12899
12900
12901
12902
12903
12904
12905
12906
12907
12908
12909
12910
12911
12912
12913
12914
12915
12916
12917
12918
12919
12920
12921
12922
12923
12924
12925
12926
12927
12928
12929
12930
12931
12932
12933
12934
12935
12936
12937
12938
12939
12940
12941
12942
12943
12944
12945
12946
12947
12948
12949
12950
12951
12952
12953
12954
12955
12956
12957
12958
12959
12960
12961
12962
12963
12964
12965
12966
12967
12968
12969
12970
12971
12972
12973
12974
12975
12976
12977
12978
12979
12980
12981
12982
12983
12984
12985
12986
12987
12988
12989
12990
12991
12992
12993
12994
12995
12996
12997
12998
12999
13000
13001
13002
13003
13004
13005
13006
13007
13008
13009
13010
13011
13012
13013
13014
13015
13016
13017
13018
13019
13020
13021
13022
13023
13024
13025
13026
13027
13028
13029
13030
13031
13032
13033
13034
13035
13036
13037
13038
13039
13040
13041
13042
13043
13044
13045
13046
13047
13048
13049
13050
13051
13052
13053
13054
13055
13056
13057
13058
13059
13060
13061
13062
13063
13064
13065
13066
13067
13068
13069
13070
13071
13072
13073
13074
13075
13076
13077
13078
13079
13080
13081
13082
13083
13084
13085
13086
13087
13088
13089
13090
13091
13092
13093
13094
13095
13096
13097
13098
13099
13100
13101
13102
13103
13104
13105
13106
13107
13108
13109
13110
13111
13112
13113
13114
13115
13116
13117
13118
13119
13120
13121
13122
13123
13124
13125
13126
13127
13128
13129
13130
13131
13132
13133
13134
13135
13136
13137
13138
13139
13140
13141
13142
13143
13144
13145
13146
13147
13148
13149
13150
13151
13152
13153
13154
13155
13156
13157
13158
13159
13160
13161
13162
13163
13164
13165
13166
13167
13168
13169
13170
13171
13172
13173
13174
13175
13176
13177
13178
13179
13180
13181
13182
13183
13184
13185
13186
13187
13188
13189
13190
13191
13192
13193
13194
13195
13196
13197
13198
13199
13200
13201
13202
13203
13204
13205
13206
13207
13208
13209
13210
13211
13212
13213
13214
13215
13216
13217
13218
13219
13220
13221
13222
13223
13224
13225
13226
13227
13228
13229
13230
13231
13232
13233
13234
13235
13236
13237
13238
13239
13240
13241
13242
13243
13244
13245
13246
13247
13248
13249
13250
13251
13252
13253
13254
13255
13256
13257
13258
13259
13260
13261
13262
13263
13264
13265
13266
13267
13268
13269
13270
13271
13272
13273
13274
13275
13276
13277
13278
13279
13280
13281
13282
13283
13284
13285
13286
13287
13288
13289
13290
13291
13292
13293
13294
13295
13296
13297
13298
13299
13300
13301
13302
13303
13304
13305
13306
13307
13308
13309
13310
13311
13312
13313
13314
13315
13316
13317
13318
13319
13320
13321
13322
13323
13324
13325
13326
13327
13328
13329
13330
13331
13332
13333
13334
13335
13336
13337
13338
13339
13340
13341
13342
13343
13344
13345
13346
13347
13348
13349
13350
13351
13352
13353
13354
13355
13356
13357
13358
13359
13360
13361
13362
13363
13364
13365
13366
13367
13368
13369
13370
13371
13372
13373
13374
13375
13376
13377
13378
13379
13380
13381
13382
13383
13384
13385
13386
13387
13388
13389
13390
13391
13392
13393
13394
13395
13396
13397
13398
13399
13400
13401
13402
13403
13404
13405
13406
13407
13408
13409
13410
13411
13412
13413
13414
13415
13416
13417
13418
13419
13420
13421
13422
13423
13424
13425
13426
13427
13428
13429
13430
13431
13432
13433
13434
13435
13436
13437
13438
13439
13440
13441
13442
13443
13444
13445
13446
13447
13448
13449
13450
13451
13452
13453
13454
13455
13456
13457
13458
13459
13460
13461
13462
13463
13464
13465
13466
13467
13468
13469
13470
13471
13472
13473
13474
13475
13476
13477
13478
13479
13480
13481
13482
13483
13484
13485
13486
13487
13488
13489
13490
13491
13492
13493
13494
13495
13496
13497
13498
13499
13500
13501
13502
13503
13504
13505
13506
13507
13508
13509
13510
13511
13512
13513
13514
13515
13516
13517
13518
13519
13520
13521
13522
13523
13524
13525
13526
13527
13528
13529
13530
13531
13532
13533
13534
13535
13536
13537
13538
13539
13540
13541
13542
13543
13544
13545
13546
13547
13548
13549
13550
13551
13552
13553
13554
13555
13556
13557
13558
13559
13560
13561
13562
13563
13564
13565
13566
13567
13568
13569
13570
13571
13572
13573
13574
13575
13576
13577
13578
13579
13580
13581
13582
13583
13584
13585
13586
13587
13588
13589
13590
13591
13592
13593
13594
13595
13596
13597
13598
13599
13600
13601
13602
13603
13604
13605
13606
13607
13608
13609
13610
13611
13612
13613
13614
13615
13616
13617
13618
13619
13620
13621
13622
13623
13624
13625
13626
13627
13628
13629
13630
13631
13632
13633
13634
13635
13636
13637
13638
13639
13640
13641
13642
13643
13644
13645
13646
13647
13648
13649
13650
13651
13652
13653
13654
13655
13656
13657
13658
13659
13660
13661
13662
13663
13664
13665
13666
13667
13668
13669
13670
13671
13672
13673
13674
13675
13676
13677
13678
13679
13680
13681
13682
13683
13684
13685
13686
13687
13688
13689
13690
13691
13692
13693
13694
13695
13696
13697
13698
13699
13700
13701
13702
13703
13704
13705
13706
13707
13708
13709
13710
13711
13712
13713
13714
13715
13716
13717
13718
13719
13720
13721
13722
13723
13724
13725
13726
13727
13728
13729
13730
13731
13732
13733
13734
13735
13736
13737
13738
13739
13740
13741
13742
13743
13744
13745
13746
13747
13748
13749
13750
13751
13752
13753
13754
13755
13756
13757
13758
13759
13760
13761
13762
13763
13764
13765
13766
13767
13768
13769
13770
13771
13772
13773
13774
13775
13776
13777
13778
13779
13780
13781
13782
13783
13784
13785
13786
13787
13788
13789
13790
13791
13792
13793
13794
13795
13796
13797
13798
13799
13800
13801
13802
13803
13804
13805
13806
13807
13808
13809
13810
13811
13812
13813
13814
13815
13816
13817
13818
13819
13820
13821
13822
13823
13824
13825
13826
13827
13828
13829
13830
13831
13832
13833
13834
13835
13836
13837
13838
13839
13840
13841
13842
13843
13844
13845
13846
13847
13848
13849
13850
13851
13852
13853
13854
13855
13856
13857
13858
13859
13860
13861
13862
13863
13864
13865
13866
13867
13868
13869
13870
13871
13872
13873
13874
13875
13876
13877
13878
13879
13880
13881
13882
13883
13884
13885
13886
13887
13888
13889
13890
13891
13892
13893
13894
13895
13896
13897
13898
13899
13900
13901
13902
13903
13904
13905
13906
13907
13908
13909
13910
13911
13912
13913
13914
13915
// SPDX-License-Identifier: GPL-2.0
/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>
#include <linux/highmem.h>
#include <linux/pgtable.h>
#include <linux/buildid.h>
#include <linux/task_work.h>

#include "internal.h"

#include <asm/irq_regs.h>

/* Signature of a callback that remote_function() invokes on the target CPU. */
typedef int (*remote_function_f)(void *);

/*
 * Argument bundle passed to remote_function() through
 * smp_call_function_single().
 */
struct remote_function_call {
        /* Task that must be current on the executing CPU; NULL skips the check. */
        struct task_struct        *p;
        /* Callback to run. */
        remote_function_f        func;
        /* Opaque argument handed to @func. */
        void                        *info;
        /*
         * Result slot: preset by the caller, overwritten with -ESRCH or with
         * @func's return value by remote_function().
         */
        int                        ret;
};

/*
 * Cross-call target: run call->func(call->info) on this CPU and store its
 * result in call->ret. When a target task is given, the callback only runs
 * if that task is the one currently executing here.
 */
static void remote_function(void *data)
{
        struct remote_function_call *call = data;
        struct task_struct *target = call->p;

        if (target) {
                /*
                 * The task is not on this CPU: leave ->ret at the value the
                 * caller preset (-EAGAIN for task_function_call()).
                 */
                if (task_cpu(target) != smp_processor_id())
                        return;

                /*
                 * Now that we're on right CPU with IRQs disabled, we can test
                 * if we hit the right task without races.
                 */
                if (target != current) {
                        call->ret = -ESRCH; /* No such (running) process */
                        return;
                }
        }

        call->ret = call->func(call->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:                the task to evaluate
 * @func:        the function to be called
 * @info:        the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly.  The call
 * is repeated for as long as the outcome is -EAGAIN, i.e. while
 * remote_function() finds that the task is not on the CPU that was targeted.
 *
 * returns @func return value or -ESRCH or -ENXIO when the process isn't running
 */
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
        struct remote_function_call data = {
                .ret        = -EAGAIN,
                .info        = info,
                .func        = func,
                .p        = p,
        };
        int ret;

        do {
                ret = smp_call_function_single(task_cpu(p), remote_function,
                                               &data, 1);
                /* The IPI itself succeeded: report what the callback saw. */
                if (ret == 0)
                        ret = data.ret;

                /* Give others a chance to run before trying again. */
                if (ret == -EAGAIN)
                        cond_resched();
        } while (ret == -EAGAIN);

        return ret;
}

/**
 * cpu_function_call - call a function on the cpu
 * @cpu:        target cpu to queue this function
 * @func:        the function to be called
 * @info:        the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
        /*
         * No task filter (.p is NULL). ->ret keeps -ENXIO ("No such CPU")
         * unless remote_function() actually runs and overwrites it.
         */
        struct remote_function_call call = {
                .ret        = -ENXIO,
                .info        = info,
                .func        = func,
                .p        = NULL,
        };

        smp_call_function_single(cpu, remote_function, &call, 1);

        return call.ret;
}

/*
 * Take the CPU context lock and then, when @ctx is non-NULL, @ctx's lock.
 */
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
                          struct perf_event_context *ctx)
{
        raw_spin_lock(&cpuctx->ctx.lock);
        if (!ctx)
                return;

        raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
                            struct perf_event_context *ctx)
{
        if (ctx)
                raw_spin_unlock(&ctx->lock);
        raw_spin_unlock(&cpuctx->ctx.lock);
}

/* Sentinel pointer value that ctx->task and event->owner are compared against. */
#define TASK_TOMBSTONE ((void *)-1L)

/* Report whether @event's owner field holds the TASK_TOMBSTONE sentinel. */
static bool is_kernel_event(struct perf_event *event)
{
        void *owner = READ_ONCE(event->owner);

        return owner == TASK_TOMBSTONE;
}

static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

struct perf_event_context *perf_cpu_task_ctx(void)
{
        lockdep_assert_irqs_disabled();
        return this_cpu_ptr(&perf_cpu_context)->task_ctx;
}

/*
 * On task ctx scheduling...
 *
 * When !ctx->nr_events a task context will not be scheduled. This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in two special cases:
 *
 *  - removing the last event from a task ctx; this is relatively straight
 *    forward and is done in __perf_remove_from_context.
 *
 *  - adding the first event to a task ctx; this is tricky because we cannot
 *    rely on ctx->is_active and therefore cannot use event_function_call().
 *    See perf_install_in_context().
 *
 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 */

/*
 * Operation applied to an event: receives the event, the CPU context (NULL
 * when event_function_call() runs it directly on an inactive context), the
 * event's context and an opaque data pointer.
 */
typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
                        struct perf_event_context *, void *);

/* Argument bundle that event_function_call() passes to event_function(). */
struct event_function_struct {
        struct perf_event *event;
        event_f func;
        void *data;
};

/*
 * Cross-call trampoline used by event_function_call().
 *
 * Runs efs->func on the current CPU while holding the CPU context lock and,
 * if this CPU has a task context, that context's lock as well.
 *
 * Returns 0 after calling efs->func, or -ESRCH when the event's context
 * belongs to a task that is not current on this CPU (efs->func is then not
 * called).
 */
static int event_function(void *info)
{
        struct event_function_struct *efs = info;
        struct perf_event *event = efs->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        int ret = 0;

        lockdep_assert_irqs_disabled();

        perf_ctx_lock(cpuctx, task_ctx);
        /*
         * Since we do the IPI call without holding ctx->lock things can have
         * changed, double check we hit the task we set out to hit.
         */
        if (ctx->task) {
                if (ctx->task != current) {
                        ret = -ESRCH;
                        goto unlock;
                }

                /*
                 * We only use event_function_call() on established contexts,
                 * and event_function() is only ever called when active (or
                 * rather, we'll have bailed in task_function_call() or the
                 * above ctx->task != current test), therefore we must have
                 * ctx->is_active here.
                 */
                WARN_ON_ONCE(!ctx->is_active);
                /*
                 * And since we have ctx->is_active, cpuctx->task_ctx must
                 * match.
                 */
                WARN_ON_ONCE(task_ctx != ctx);
        } else {
                /* CPU event: its context must be this CPU's own context. */
                WARN_ON_ONCE(&cpuctx->ctx != ctx);
        }

        efs->func(event, cpuctx, ctx, efs->data);
unlock:
        perf_ctx_unlock(cpuctx, task_ctx);

        return ret;
}

/*
 * Apply @func to @event with the appropriate context locking.
 *
 *  - CPU context (ctx->task == NULL): @func is run on event->cpu through
 *    cpu_function_call().
 *  - TASK_TOMBSTONE context: nothing is done.
 *  - Task context: @func is run where the task is executing through
 *    task_function_call(). If that fails and the context is found inactive
 *    under ctx->lock, @func is called directly with a NULL cpuctx; if the
 *    context is active the cross-call is retried.
 */
static void event_function_call(struct perf_event *event, event_f func, void *data)
{
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
        struct event_function_struct efs = {
                .event = event,
                .func = func,
                .data = data,
        };

        if (!event->parent) {
                /*
                 * If this is a !child event, we must hold ctx::mutex to
                 * stabilize the event->ctx relation. See
                 * perf_event_ctx_lock().
                 */
                lockdep_assert_held(&ctx->mutex);
        }

        if (!task) {
                cpu_function_call(event->cpu, event_function, &efs);
                return;
        }

        if (task == TASK_TOMBSTONE)
                return;

again:
        /* A zero return means event_function() ran @func on the task's CPU. */
        if (!task_function_call(task, event_function, &efs))
                return;

        raw_spin_lock_irq(&ctx->lock);
        /*
         * Reload the task pointer, it might have been changed by
         * a concurrent perf_event_context_sched_out().
         */
        task = ctx->task;
        if (task == TASK_TOMBSTONE) {
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
        if (ctx->is_active) {
                raw_spin_unlock_irq(&ctx->lock);
                goto again;
        }
        /* Context is inactive: call @func here, under ctx->lock only. */
        func(event, NULL, ctx, data);
        raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Similar to event_function_call() + event_function(), but hard assumes IRQs
 * are already disabled and we're on the right CPU.
 *
 * @func is not called when the context's task is TASK_TOMBSTONE, or when an
 * active task context does not belong to current / is not this CPU's task
 * context (both of which also trigger a one-time warning).
 */
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct task_struct *task = READ_ONCE(ctx->task);
        struct perf_event_context *task_ctx = NULL;

        lockdep_assert_irqs_disabled();

        if (task) {
                if (task == TASK_TOMBSTONE)
                        return;

                /* Task context: its lock is taken in addition to the CPU one. */
                task_ctx = ctx;
        }

        perf_ctx_lock(cpuctx, task_ctx);

        /* Re-check ctx->task now that the locks are held. */
        task = ctx->task;
        if (task == TASK_TOMBSTONE)
                goto unlock;

        if (task) {
                /*
                 * We must be either inactive or active and the right task,
                 * otherwise we're screwed, since we cannot IPI to somewhere
                 * else.
                 */
                if (ctx->is_active) {
                        if (WARN_ON_ONCE(task != current))
                                goto unlock;

                        if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
                                goto unlock;
                }
        } else {
                WARN_ON_ONCE(&cpuctx->ctx != ctx);
        }

        func(event, cpuctx, ctx, data);
unlock:
        perf_ctx_unlock(cpuctx, task_ctx);
}

/*
 * Union of the four PERF_FLAG_* bits listed here.
 * NOTE(review): presumably the set of flags accepted from user space;
 * confirm at the point of use.
 */
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
                       PERF_FLAG_FD_OUTPUT  |\
                       PERF_FLAG_PID_CGROUP |\
                       PERF_FLAG_FD_CLOEXEC)

/*
 * branch priv levels that need permission checks
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
        (PERF_SAMPLE_BRANCH_KERNEL |\
         PERF_SAMPLE_BRANCH_HV)

/*
 * Bit flags that are OR-ed together and passed as the event_type argument of
 * ctx_sched_out() / ctx_sched_in().
 */
enum event_type_t {
        EVENT_FLEXIBLE = 0x1,
        EVENT_PINNED = 0x2,
        EVENT_TIME = 0x4,
        /* see ctx_resched() for details */
        EVENT_CPU = 0x8,
        EVENT_CGROUP = 0x10,
        /* Both event classes (flexible and pinned); excludes the modifier bits. */
        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

/*
 * perf_sched_events : >0 events exist
 */

static void perf_sched_delayed(struct work_struct *work);
DEFINE_STATIC_KEY_FALSE(perf_sched_events);
/* Delayed work item whose handler is perf_sched_delayed(). */
static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;

static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);

/*
 * NOTE(review): going by their names, these count the events that asked for
 * each kind of side-band record; confirm against the code that updates them.
 */
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
static atomic_t nr_cgroup_events __read_mostly;
static atomic_t nr_text_poke_events __read_mostly;
static atomic_t nr_build_id_events __read_mostly;

/*
 * NOTE(review): presumably the list of registered PMUs together with the
 * mutex and SRCU domain that guard it; confirm at the registration code.
 */
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;
static struct kmem_cache *perf_event_cache;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 2;

/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */

/*
 * max perf event sample rate
 */
#define DEFAULT_MAX_SAMPLE_RATE                100000
/* Nanoseconds between two samples at the default maximum rate. */
#define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
/* Default value of sysctl_perf_cpu_time_max_percent. */
#define DEFAULT_CPU_TIME_MAX_PERCENT        25

int sysctl_perf_event_sample_rate __read_mostly        = DEFAULT_MAX_SAMPLE_RATE;

/* Sample rate expressed per tick (rate / HZ, rounded up). */
static int max_samples_per_tick __read_mostly        = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly        = DEFAULT_SAMPLE_PERIOD_NS;

/*
 * Average time one sample may take before perf_sample_event_took() lowers
 * the sample rate: the CPU-time percentage applied to one sample period.
 * A value of 0 turns that check off.
 */
static int perf_sample_allowed_ns __read_mostly =
        DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

static void update_perf_cpu_limits(void)
{
        u64 tmp = perf_sample_period_ns;

        tmp *= sysctl_perf_cpu_time_max_percent;
        tmp = div_u64(tmp, 100);
        if (!tmp)
                tmp = 1;

        WRITE_ONCE(perf_sample_allowed_ns, tmp);
}

static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);

/*
 * sysctl handler for the maximum sample rate.
 *
 * Writes are refused with -EINVAL while sysctl_perf_cpu_time_max_percent is
 * 0 or 100, i.e. while throttling is disabled. After a successful write the
 * per-tick sample limit and the sample period are re-derived from the new
 * rate and the allowed sample length is refreshed.
 *
 * Returns 0 on success or the error from proc_dointvec_minmax().
 */
int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
                                       void *buffer, size_t *lenp, loff_t *ppos)
{
        int pct = sysctl_perf_cpu_time_max_percent;
        int err;

        /* If throttling is disabled don't allow the write. */
        if (write) {
                if (pct == 0 || pct == 100)
                        return -EINVAL;
        }

        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (err)
                return err;

        /* Reads need no further work. */
        if (!write)
                return 0;

        max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
        update_perf_cpu_limits();

        return 0;
}

int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;

/*
 * sysctl handler for the CPU-time percentage.
 *
 * After a successful write, a value of 0 or 100 switches dynamic throttling
 * off (perf_sample_allowed_ns becomes 0 and a warning is logged); any other
 * value recomputes the allowed sample length.
 *
 * Returns 0 on success or the error from proc_dointvec_minmax().
 */
int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int err;

        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (err)
                return err;
        if (!write)
                return 0;

        if (sysctl_perf_cpu_time_max_percent != 100 &&
            sysctl_perf_cpu_time_max_percent != 0) {
                update_perf_cpu_limits();
                return 0;
        }

        printk(KERN_WARNING
               "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
        WRITE_ONCE(perf_sample_allowed_ns, 0);

        return 0;
}

/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
/* Divisor used to decay and to average running_sample_length. */
#define NR_ACCUMULATED_SAMPLES 128
/* Per-CPU decaying sum of sample durations, kept by perf_sample_event_took(). */
static DEFINE_PER_CPU(u64, running_sample_length);

/*
 * Values latched by perf_sample_event_took() for the message printed by
 * perf_duration_warn().
 */
static u64 __report_avg;
static u64 __report_allowed;

/*
 * irq_work callback: log (rate limited) the averaged sample length, the limit
 * it exceeded and the sample rate that perf_sample_event_took() has set.
 */
static void perf_duration_warn(struct irq_work *w)
{
        printk_ratelimited(KERN_INFO
                "perf: interrupt took too long (%lld > %lld), lowering "
                "kernel.perf_event_max_sample_rate to %d\n",
                __report_avg, __report_allowed,
                sysctl_perf_event_sample_rate);
}

/* irq_work item that perf_sample_event_took() queues to emit the warning. */
static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);

/*
 * Account the duration of one sample and lower the maximum sample rate when
 * the running average exceeds perf_sample_allowed_ns.
 *
 * @sample_len_ns: time the sample took, in nanoseconds.
 *
 * Does nothing while perf_sample_allowed_ns is 0.
 */
void perf_sample_event_took(u64 sample_len_ns)
{
        u64 max_len = READ_ONCE(perf_sample_allowed_ns);
        u64 running_len;
        u64 avg_len;
        u32 max;

        if (max_len == 0)
                return;

        /* Decay the counter by 1 average sample. */
        running_len = __this_cpu_read(running_sample_length);
        running_len -= running_len/NR_ACCUMULATED_SAMPLES;
        running_len += sample_len_ns;
        __this_cpu_write(running_sample_length, running_len);

        /*
         * Note: this will be biased artificially low until we have
         * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
         * from having to maintain a count.
         */
        avg_len = running_len/NR_ACCUMULATED_SAMPLES;
        if (avg_len <= max_len)
                return;

        /* Latch the values for the warning message before they are modified. */
        __report_avg = avg_len;
        __report_allowed = max_len;

        /*
         * Set the new allowed sample length 25% above the measured average,
         * then derive how many samples of that length fit into the
         * permitted share of one tick (at least one).
         */
        avg_len += avg_len / 4;
        max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
        if (avg_len < max)
                max /= (u32)avg_len;
        else
                max = 1;

        WRITE_ONCE(perf_sample_allowed_ns, avg_len);
        WRITE_ONCE(max_samples_per_tick, max);

        sysctl_perf_event_sample_rate = max * HZ;
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;

        /* Defer the warning to irq_work; print directly if queueing failed. */
        if (!irq_work_queue(&perf_duration_work)) {
                early_printk("perf: interrupt took too long (%lld > %lld), lowering "
                             "kernel.perf_event_max_sample_rate to %d\n",
                             __report_avg, __report_allowed,
                             sysctl_perf_event_sample_rate);
        }
}

/* NOTE(review): presumably the source of unique event ids; confirm at the user. */
static atomic64_t perf_event_id;

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

/* Empty default; declared __weak so another definition can override it. */
void __weak perf_event_print_debug(void)        { }

/* Time base used by this file: the value of local_clock(). */
static inline u64 perf_clock(void)
{
        return local_clock();
}

/* Read the clock through the event's own ->clock callback. */
static inline u64 perf_event_clock(struct perf_event *event)
{
        return event->clock();
}

/*
 * State based event timekeeping...
 *
 * The basic idea is to use event->state to determine which (if any) time
 * fields to increment with the current delta. This means we only need to
 * update timestamps when we change state or when they are explicitly requested
 * (read).
 *
 * Event groups make things a little more complicated, but not terribly so. The
 * rules for a group are that if the group leader is OFF the entire group is
 * OFF, irrespective of what the group member states are. This results in
 * __perf_effective_state().
 *
 * A further ramification is that when a group leader flips between OFF and
 * !OFF, we need to update all group member times.
 *
 *
 * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
 * need to make sure the relevant context time is updated before we try and
 * update our timestamps.
 */

/*
 * State that counts for time accounting: a group leader at or below
 * PERF_EVENT_STATE_OFF imposes its state on the event, otherwise the event's
 * own state is used.
 */
static __always_inline enum perf_event_state
__perf_effective_state(struct perf_event *event)
{
        struct perf_event *leader = event->group_leader;

        return (leader->state > PERF_EVENT_STATE_OFF) ? event->state
                                                      : leader->state;
}

/*
 * Compute the event's enabled and running totals as of @now.
 *
 * The time elapsed since event->tstamp is added to the enabled total when the
 * effective state is at least INACTIVE, and to the running total when it is
 * at least ACTIVE. Results are stored through @enabled and @running; the
 * event's tstamp is not modified here.
 */
static __always_inline void
__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
{
        enum perf_event_state state = __perf_effective_state(event);
        u64 delta = now - event->tstamp;
        u64 enabled_total = event->total_time_enabled;
        u64 running_total = event->total_time_running;

        if (state >= PERF_EVENT_STATE_INACTIVE)
                enabled_total += delta;

        if (state >= PERF_EVENT_STATE_ACTIVE)
                running_total += delta;

        *enabled = enabled_total;
        *running = running_total;
}

/*
 * Fold the time elapsed since event->tstamp into the event's enabled/running
 * totals and move tstamp forward to the current perf_event_time().
 */
static void perf_event_update_time(struct perf_event *event)
{
        u64 now = perf_event_time(event);

        __perf_update_times(event, now, &event->total_time_enabled,
                                        &event->total_time_running);
        event->tstamp = now;
}

/* Apply perf_event_update_time() to every sibling of @leader. */
static void perf_event_update_sibling_time(struct perf_event *leader)
{
        struct perf_event *sibling;

        for_each_sibling_event(sibling, leader)
                perf_event_update_time(sibling);
}

/*
 * Move @event to @state, bringing its time accounting up to date first.
 * Nothing happens when the event is already in @state.
 */
static void
perf_event_set_state(struct perf_event *event, enum perf_event_state state)
{
        if (event->state != state) {
                /* Account the time spent in the old state before leaving it. */
                perf_event_update_time(event);

                /*
                 * If a group leader gets enabled/disabled all its siblings
                 * are affected too: that is the case when the old and the
                 * new state lie on different sides of zero.
                 */
                if ((event->state < 0) != (state < 0))
                        perf_event_update_sibling_time(event);

                WRITE_ONCE(event->state, state);
        }
}

/*
 * UP store-release, load-acquire
 *
 * Only a compiler barrier() is paired with the access; no CPU memory barrier
 * is issued by these macros.
 */

/* Compiler barrier first, then store @val to *@ptr with WRITE_ONCE(). */
#define __store_release(ptr, val)                                        \
do {                                                                        \
        barrier();                                                        \
        WRITE_ONCE(*(ptr), (val));                                        \
} while (0)

/* Load *@ptr with READ_ONCE(), then a compiler barrier; yields the value. */
#define __load_acquire(ptr)                                                \
({                                                                        \
        __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr));        \
        barrier();                                                        \
        ___p;                                                                \
})

/*
 * Call perf_pmu_disable() for the PMU of every pmu context on @ctx. With
 * @cgroup set, pmu contexts whose nr_cgroups is zero are left alone.
 */
static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
        struct perf_event_pmu_context *pmu_ctx;

        list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                if (!cgroup || pmu_ctx->nr_cgroups)
                        perf_pmu_disable(pmu_ctx->pmu);
        }
}

/*
 * Call perf_pmu_enable() for the PMU of every pmu context on @ctx. With
 * @cgroup set, pmu contexts whose nr_cgroups is zero are left alone.
 */
static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
        struct perf_event_pmu_context *pmu_ctx;

        list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                if (!cgroup || pmu_ctx->nr_cgroups)
                        perf_pmu_enable(pmu_ctx->pmu);
        }
}

static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);

#ifdef CONFIG_CGROUP_PERF

/*
 * Decide whether @event may count on this CPU given the cgroup currently
 * associated with the CPU context.
 */
static inline bool
perf_cgroup_match(struct perf_event *event)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);

        /* @event doesn't care about cgroup */
        if (!event->cgrp)
                return true;

        /*
         * @event wants a specific cgroup scope, so @cpuctx must be associated
         * with one as well.
         *
         * Cgroup scoping is recursive.  An event enabled for a cgroup is
         * also enabled for all its descendant cgroups.  If @cpuctx's
         * cgroup is a descendant of @event's (the test covers identity
         * case), it's a match.
         */
        return cpuctx->cgrp &&
               cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
                                    event->cgrp->css.cgroup);
}

/*
 * Drop the css reference held through event->cgrp and clear the pointer.
 * Dereferences event->cgrp, so it must be non-NULL on entry.
 */
static inline void perf_detach_cgroup(struct perf_event *event)
{
        css_put(&event->cgrp->css);
        event->cgrp = NULL;
}

/* Non-zero when @event has a cgroup attached (event->cgrp is set). */
static inline int is_cgroup_event(struct perf_event *event)
{
        return event->cgrp != NULL;
}

/*
 * Return the cgroup time last recorded for @event's cgroup on the event's
 * CPU (event->cpu), without advancing it.
 */
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
        return per_cpu_ptr(event->cgrp->info, event->cpu)->time;
}

/*
 * Cgroup time for @event as of clock value @now.
 *
 * While the cgroup's per-CPU info is marked active the result is
 * @now + timeoffset; otherwise the last recorded time is returned as is.
 */
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
        struct perf_cgroup_info *info;

        info = per_cpu_ptr(event->cgrp->info, event->cpu);
        if (__load_acquire(&info->active))
                return now + READ_ONCE(info->timeoffset);

        return info->time;
}

/*
 * Move @info's timestamp to @now. When @adv is set, the interval since the
 * previous timestamp is first added to info->time.
 *
 * timeoffset is then republished as time - timestamp, which is what lets
 * perf_cgroup_event_time_now() compute the current time as now + timeoffset.
 */
static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
{
        if (adv)
                info->time += now - info->timestamp;
        info->timestamp = now;
        /*
         * see update_context_time()
         */
        WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
}

/*
 * Advance the cgroup time of cpuctx->cgrp and of each of its ancestors to the
 * current perf_clock(). With @final set, every one of them is additionally
 * marked inactive. No-op when the CPU context has no cgroup.
 */
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
{
        struct perf_cgroup *cgrp = cpuctx->cgrp;
        struct cgroup_subsys_state *css;
        struct perf_cgroup_info *info;
        u64 now;

        if (!cgrp)
                return;

        /* One clock reading is shared by the whole ancestor chain. */
        now = perf_clock();

        css = &cgrp->css;
        while (css) {
                cgrp = container_of(css, struct perf_cgroup, css);
                info = this_cpu_ptr(cgrp->info);

                __update_cgrp_time(info, now, true);
                if (final)
                        __store_release(&info->active, 0);

                css = css->parent;
        }
}

/*
 * Advance the cgroup time of @event's cgroup on this CPU to the current
 * perf_clock(), provided the event has a cgroup and that cgroup is active.
 */
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
        struct perf_cgroup_info *info;

        /*
         * ensure we access cgroup data only when needed and
         * when we know the cgroup is pinned (css_get)
         */
        if (is_cgroup_event(event)) {
                info = this_cpu_ptr(event->cgrp->info);

                /* Do not update time when cgroup is not active */
                if (info->active)
                        __update_cgrp_time(info, perf_clock(), true);
        }
}

/*
 * Restart cgroup time accounting for cpuctx->cgrp and each of its ancestors
 * at ctx->timestamp: the timestamp is moved without adding elapsed time, and
 * each per-CPU info is marked active. No-op when the CPU context has no
 * cgroup.
 */
static inline void
perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
        struct perf_event_context *ctx = &cpuctx->ctx;
        struct perf_cgroup *cgrp = cpuctx->cgrp;
        struct perf_cgroup_info *info;
        struct cgroup_subsys_state *css;

        /*
         * ctx->lock held by caller
         * ensure we do not access cgroup data
         * unless we have the cgroup pinned (css_get)
         */
        if (!cgrp)
                return;

        WARN_ON_ONCE(!ctx->nr_cgroups);

        for (css = &cgrp->css; css; css = css->parent) {
                cgrp = container_of(css, struct perf_cgroup, css);
                info = this_cpu_ptr(cgrp->info);
                /* adv == false: only reset the timestamp, keep info->time. */
                __update_cgrp_time(info, ctx->timestamp, false);
                __store_release(&info->active, 1);
        }
}

/*
 * reschedule events based on the cgroup constraint of task.
 *
 * Returns early when this CPU has no cgroup events (cpuctx->cgrp is NULL) or
 * when @task's perf cgroup is the one already installed in cpuctx->cgrp.
 */
static void perf_cgroup_switch(struct task_struct *task)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_cgroup *cgrp;

        /*
         * cpuctx->cgrp is set when the first cgroup event enabled,
         * and is cleared when the last cgroup event disabled.
         */
        if (READ_ONCE(cpuctx->cgrp) == NULL)
                return;

        WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

        cgrp = perf_cgroup_from_task(task, NULL);
        if (READ_ONCE(cpuctx->cgrp) == cgrp)
                return;

        /*
         * Take the context locks and disable only the PMUs that have cgroup
         * events (cgroup == true) while the events are switched.
         */
        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_ctx_disable(&cpuctx->ctx, true);

        ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
        /*
         * must not be done before ctxswout due
         * to update_cgrp_time_from_cpuctx() in
         * ctx_sched_out()
         */
        cpuctx->cgrp = cgrp;
        /*
         * set cgrp before ctxsw in to allow
         * perf_cgroup_set_timestamp() in ctx_sched_in()
         * to not have to pass task around
         */
        ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);

        perf_ctx_enable(&cpuctx->ctx, true);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

/*
 * Make sure every possible CPU's cpuctx->heap has room for one entry per
 * cgroup nesting level of @css plus one.
 *
 * Returns 0 on success or -ENOMEM when an allocation fails; CPUs handled
 * before the failure keep their enlarged storage. @event is not used by the
 * body.
 */
static int perf_cgroup_ensure_storage(struct perf_event *event,
                                struct cgroup_subsys_state *css)
{
        struct perf_cpu_context *cpuctx;
        struct perf_event **storage;
        int cpu, heap_size, ret = 0;

        /*
         * Allow storage to have sufficient space for an iterator for each
         * possibly nested cgroup plus an iterator for events with no cgroup.
         */
        for (heap_size = 1; css; css = css->parent)
                heap_size++;

        for_each_possible_cpu(cpu) {
                cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
                /* Unlocked check; repeated under the lock below. */
                if (heap_size <= cpuctx->heap_size)
                        continue;

                storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
                                       GFP_KERNEL, cpu_to_node(cpu));
                if (!storage) {
                        ret = -ENOMEM;
                        break;
                }

                raw_spin_lock_irq(&cpuctx->ctx.lock);
                if (cpuctx->heap_size < heap_size) {
                        /*
                         * Install the new array; @storage now refers to the
                         * previous one. heap_default must never reach
                         * kfree(), so it is dropped from @storage.
                         */
                        swap(cpuctx->heap, storage);
                        if (storage == cpuctx->heap_default)
                                storage = NULL;
                        cpuctx->heap_size = heap_size;
                }
                raw_spin_unlock_irq(&cpuctx->ctx.lock);

                /*
                 * Frees the replaced array, or the fresh allocation when the
                 * locked re-check found the heap already large enough.
                 */
                kfree(storage);
        }

        return ret;
}

/*
 * Attach @event to the perf cgroup identified by the cgroup directory @fd.
 *
 * @fd:           file descriptor of the cgroup directory
 * @event:        event to attach; event->cgrp is set on success
 * @attr:         not used by the body
 * @group_leader: optional group leader, which must monitor the same cgroup
 *
 * Returns 0 on success, -EBADF when @fd is not an open file, the error from
 * css_tryget_online_from_dir() or perf_cgroup_ensure_storage(), or -EINVAL
 * when @group_leader monitors a different cgroup. On success the css
 * reference is kept in event->cgrp; on failure no reference is kept.
 */
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
{
        struct perf_cgroup *cgrp;
        struct cgroup_subsys_state *css;
        struct fd f = fdget(fd);
        int ret = 0;

        if (!f.file)
                return -EBADF;

        css = css_tryget_online_from_dir(f.file->f_path.dentry,
                                         &perf_event_cgrp_subsys);
        if (IS_ERR(css)) {
                ret = PTR_ERR(css);
                goto out;
        }

        /*
         * Hand the css reference to the event right away, so that every
         * failure below - including perf_cgroup_ensure_storage() - releases
         * it through perf_detach_cgroup() instead of leaking it.
         */
        cgrp = container_of(css, struct perf_cgroup, css);
        event->cgrp = cgrp;

        ret = perf_cgroup_ensure_storage(event, css);

        /*
         * all events in a group must monitor
         * the same cgroup because a task belongs
         * to only one perf cgroup at a time
         */
        if (!ret && group_leader && group_leader->cgrp != cgrp)
                ret = -EINVAL;

        /* Puts the css reference and resets event->cgrp to NULL. */
        if (ret)
                perf_detach_cgroup(event);
out:
        fdput(f);
        return ret;
}

/*
 * Account a cgroup event being added to @ctx: bump the per-pmu-context and
 * per-context cgroup counters and, for the first cgroup event on the context,
 * record current's perf cgroup in the CPU context. No-op for events without
 * a cgroup.
 */
static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_cpu_context *cpuctx;

        if (!is_cgroup_event(event))
                return;

        event->pmu_ctx->nr_cgroups += 1;

        /*
         * Because cgroup events are always per-cpu events,
         * @ctx == &cpuctx->ctx.
         */
        cpuctx = container_of(ctx, struct perf_cpu_context, ctx);

        /* Only the transition from zero cgroup events installs the cgroup. */
        if (ctx->nr_cgroups++ == 0)
                cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
}

/*
 * Account a cgroup event being removed from @ctx: drop the per-pmu-context
 * and per-context cgroup counters and, once the last cgroup event is gone,
 * clear the cgroup recorded in the CPU context. No-op for events without a
 * cgroup.
 */
static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_cpu_context *cpuctx;

        if (!is_cgroup_event(event))
                return;

        event->pmu_ctx->nr_cgroups -= 1;

        /*
         * Because cgroup events are always per-cpu events,
         * @ctx == &cpuctx->ctx.
         */
        cpuctx = container_of(ctx, struct perf_cpu_context, ctx);

        /* Only the transition to zero cgroup events clears the cgroup. */
        if (--ctx->nr_cgroups == 0)
                cpuctx->cgrp = NULL;
}

#else /* !CONFIG_CGROUP_PERF */

static inline bool
perf_cgroup_match(struct perf_event *event)
{
        return true;
}

static inline void perf_detach_cgroup(struct perf_event *event)
{}

static inline int is_cgroup_event(struct perf_event *event)
{
        return 0;
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
                                                bool final)
{
}

static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
{
        return -EINVAL;
}

static inline void
perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
        return 0;
}

/* !CONFIG_CGROUP_PERF stub: @now is ignored; always returns 0. */
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
        return 0;
}

/* !CONFIG_CGROUP_PERF stub: no cgroup accounting to do on enable. */
static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
}

/* !CONFIG_CGROUP_PERF stub: no cgroup accounting to do on disable. */
static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
}

/* !CONFIG_CGROUP_PERF stub: nothing to switch; does nothing. */
static void perf_cgroup_switch(struct task_struct *task)
{
}
#endif

/*
 * Default multiplexing hrtimer interval in milliseconds. It is derived from
 * the timer tick (1000 / HZ), just like the original tick-driven code, and
 * is used whenever a PMU did not provide a sane interval of its own.
 */
#define PERF_CPU_HRTIMER (1000 / HZ)
/*
 * Multiplexing hrtimer callback; must be called with interrupts disabled.
 *
 * Rotates the events of the per-CPU PMU context that embeds @hr. If
 * perf_rotate_context() reports that rotation took place, the timer is
 * pushed forward by one interval and restarted; otherwise the timer is
 * marked inactive and left to expire.
 */
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
        struct perf_cpu_pmu_context *cpc;
        bool rotated;

        lockdep_assert_irqs_disabled();

        cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
        rotated = perf_rotate_context(cpc);

        /* hrtimer_lock serializes against perf_mux_hrtimer_restart(). */
        raw_spin_lock(&cpc->hrtimer_lock);
        if (!rotated)
                cpc->hrtimer_active = 0;
        else
                hrtimer_forward_now(hr, cpc->hrtimer_interval);
        raw_spin_unlock(&cpc->hrtimer_lock);

        if (rotated)
                return HRTIMER_RESTART;

        return HRTIMER_NORESTART;
}

/*
 * Set up the multiplexing hrtimer of the per-CPU PMU context @cpc.
 *
 * Picks the interval from the PMU (falling back to PERF_CPU_HRTIMER and
 * storing that back into the PMU when it is below 1ms), initializes the
 * lock protecting the timer state and installs the handler. The timer is
 * not started here. @cpu is not used by this function.
 */
static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
{
        struct pmu *pmu = cpc->epc.pmu;
        struct hrtimer *timer = &cpc->hrtimer;
        u64 interval = pmu->hrtimer_interval_ms;

        /*
         * Check that the configured value is sane; if it is not set, force
         * both the PMU and this context to the default interval (1/tick).
         */
        if (interval < 1) {
                pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
                interval = pmu->hrtimer_interval_ms;
        }

        cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);

        raw_spin_lock_init(&cpc->hrtimer_lock);
        hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
        timer->function = perf_mux_hrtimer_handler;
}

/*
 * Arm the multiplexing hrtimer of @cpc unless it is already active.
 *
 * The active flag is checked and set under hrtimer_lock with interrupts
 * disabled, so the timer is only started once. Always returns 0.
 */
static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
{
        struct hrtimer *timer = &cpc->hrtimer;
        unsigned long flags;

        raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
        if (cpc->hrtimer_active)
                goto unlock;

        cpc->hrtimer_active = 1;
        hrtimer_forward_now(timer, cpc->hrtimer_interval);
        hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
unlock:
        raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);

        return 0;
}

/*
 * void-pointer adapter around perf_mux_hrtimer_restart(); @arg is the
 * struct perf_cpu_pmu_context whose timer should be (re)armed.
 */
static int perf_mux_hrtimer_restart_ipi(void *arg)
{
        struct perf_cpu_pmu_context *cpc = arg;

        return perf_mux_hrtimer_restart(cpc);
}

/*
 * Disable @pmu on this CPU, with nesting.
 *
 * A per-CPU counter tracks the nesting depth; the PMU's ->pmu_disable()
 * callback is only invoked on the outermost call (depth going 0 -> 1).
 */
void perf_pmu_disable(struct pmu *pmu)
{
        int *nesting = this_cpu_ptr(pmu->pmu_disable_count);
        int depth_before = (*nesting)++;

        if (depth_before == 0)
                pmu->pmu_disable(pmu);
}

/*
 * Re-enable @pmu on this CPU, undoing one perf_pmu_disable().
 *
 * The per-CPU nesting counter is decremented and the PMU's ->pmu_enable()
 * callback is only invoked once it drops back to zero.
 */
void perf_pmu_enable(struct pmu *pmu)
{
        int *nesting = this_cpu_ptr(pmu->pmu_disable_count);
        int depth_after = --(*nesting);

        if (depth_after == 0)
                pmu->pmu_enable(pmu);
}

/*
 * Debug check: warn (once) if @pmu's per-CPU disable count on this CPU is
 * zero, i.e. the caller is not inside a perf_pmu_disable() section.
 */
static void perf_assert_pmu_disabled(struct pmu *pmu)
{
        WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
}

/* Take an additional reference on @ctx; paired with put_ctx(). */
static void get_ctx(struct perf_event_context *ctx)
{
        refcount_inc(&ctx->refcount);
}

/*
 * Allocate zeroed per-task context data for @pmu.
 *
 * Returns NULL when the PMU has no task_ctx_cache, or whatever
 * kmem_cache_zalloc() returns for a GFP_KERNEL allocation from that cache.
 */
static void *alloc_task_ctx_data(struct pmu *pmu)
{
        if (!pmu->task_ctx_cache)
                return NULL;

        return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
}

/*
 * Release per-task context data obtained from alloc_task_ctx_data().
 * Nothing is done if @pmu has no task_ctx_cache or @task_ctx_data is NULL.
 */
static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
{
        if (!pmu->task_ctx_cache || !task_ctx_data)
                return;

        kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
}

/*
 * RCU callback queued by put_ctx(): frees the perf_event_context that
 * embeds @head.
 */
static void free_ctx(struct rcu_head *head)
{
        kfree(container_of(head, struct perf_event_context, rcu_head));
}

/*
 * Drop a reference on @ctx.
 *
 * When the last reference goes away, the references this context holds on
 * its parent context and on its task (unless the task pointer is NULL or
 * TASK_TOMBSTONE) are dropped as well, and the context itself is freed
 * after an RCU grace period.
 */
static void put_ctx(struct perf_event_context *ctx)
{
        if (!refcount_dec_and_test(&ctx->refcount))
                return;

        if (ctx->parent_ctx)
                put_ctx(ctx->parent_ctx);

        if (ctx->task && ctx->task != TASK_TOMBSTONE)
                put_task_struct(ctx->task);

        call_rcu(&ctx->rcu_head, free_ctx);
}

/*
 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
 * perf_pmu_migrate_context() we need some magic.
 *
 * Those places that change perf_event::ctx will hold both
 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
 *
 * Lock ordering is by mutex address. There are two other sites where
 * perf_event_context::mutex nests and those are:
 *
 *  - perf_event_exit_task_context()        [ child , 0 ]
 *      perf_event_exit_event()
 *        put_event()                        [ parent, 1 ]
 *
 *  - perf_event_init_context()                [ parent, 0 ]
 *      inherit_task_group()
 *        inherit_group()
 *          inherit_event()
 *            perf_event_alloc()
 *              perf_init_event()
 *                perf_try_init_event()        [ child , 1 ]
 *
 * While it appears there is an obvious deadlock here -- the parent and child
 * nesting levels are inverted between the two. This is in fact safe because
 * life-time rules separate them. That is an exiting task cannot fork, and a
 * spawning task cannot (yet) exit.
 *
 * But remember that these are parent<->child context relations, and
 * migration does not affect children, therefore these two orderings should not
 * interact.
 *
 * The change in perf_event::ctx does not affect children (as claimed above)
 * because the sys_perf_event_open() case will install a new event and break
 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
 * concerned with cpuctx and that doesn't have children.
 *
 * The places that change perf_event::ctx will issue:
 *
 *   perf_remove_from_context();
 *   synchronize_rcu();
 *   perf_install_in_context();
 *
 * to effect the change. The remove_from_context() + synchronize_rcu() should
 * quiesce the event, after which we can install it in the new location. This
 * means that only external vectors (perf_fops, prctl) can perturb the event
 * while in transit. Therefore all such accessors should also acquire
 * perf_event_context::mutex to serialize against this.
 *
 * However; because event->ctx can change while we're waiting to acquire
 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
 * function.
 *
 * Lock order:
 *    exec_update_lock
 *        task_struct::perf_event_mutex
 *          perf_event_context::mutex
 *            perf_event::child_mutex;
 *              perf_event_context::lock
 *            perf_event::mmap_mutex
 *            mmap_lock
 *              perf_addr_filters_head::lock
 *
 *    cpu_hotplug_lock
 *      pmus_lock
 *          cpuctx->mutex / perf_event_context::mutex
 */
/*
 * Lock the context @event currently belongs to.
 *
 * Returns the context with ctx->mutex held (taken with lockdep subclass
 * @nesting) and with an extra reference on it; release both with
 * perf_event_ctx_unlock().
 */
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{
        struct perf_event_context *ctx;

again:
        /*
         * Grab a reference on whatever context the event points at right
         * now. If its refcount already hit zero, re-read event->ctx and
         * try again.
         */
        rcu_read_lock();
        ctx = READ_ONCE(event->ctx);
        if (!refcount_inc_not_zero(&ctx->refcount)) {
                rcu_read_unlock();
                goto again;
        }
        rcu_read_unlock();

        /*
         * event->ctx may have changed while we were waiting for the mutex;
         * if so, drop this context again and start over.
         */
        mutex_lock_nested(&ctx->mutex, nesting);
        if (event->ctx != ctx) {
                mutex_unlock(&ctx->mutex);
                put_ctx(ctx);
                goto again;
        }

        return ctx;
}

/* perf_event_ctx_lock_nested() with nesting level 0. */
static inline struct perf_event_context *
perf_event_ctx_lock(struct perf_event *event)
{
        return perf_event_ctx_lock_nested(event, 0);
}

/*
 * Counterpart of perf_event_ctx_lock(): release ctx->mutex and drop the
 * reference on @ctx. @event is not used here.
 */
static void perf_event_ctx_unlock(struct perf_event *event,
                                  struct perf_event_context *ctx)
{
        mutex_unlock(&ctx->mutex);
        put_ctx(ctx);
}

/*
 * Detach @ctx from the context it was cloned from and bump its generation.
 *
 * This must be done under the ctx->lock, such as to serialize against
 * context_equiv(), therefore we cannot call put_ctx() since that might end up
 * calling scheduler related locks and ctx->lock nests inside those.
 *
 * The previous parent context (or NULL) is returned instead, and the caller
 * must not ignore it (__must_check).
 */
static __must_check struct perf_event_context *
unclone_ctx(struct perf_event_context *ctx)
{
        struct perf_event_context *old_parent;

        lockdep_assert_held(&ctx->lock);

        old_parent = ctx->parent_ctx;
        if (old_parent)
                ctx->parent_ctx = NULL;

        /* The generation is bumped whether or not there was a parent. */
        ctx->generation++;

        return old_parent;
}

/*
 * Translate task @p into an id of kind @type as seen from the pid namespace
 * of @event.
 *
 * Returns the id from __task_pid_nr_ns(); if that is 0 and @p is no longer
 * alive, (u32)-1 is returned instead.
 */
static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
                                enum pid_type type)
{
        /*
         * Only top level events have the pid namespace they were created in,
         * so inherited events use their parent's.
         */
        struct perf_event *top = event->parent ? event->parent : event;
        u32 nr = __task_pid_nr_ns(p, type, top->ns);

        /*
         * A 0 result is kept as-is for live tasks (e.g. the idle thread or a
         * task running in another namespace); only dead tasks report -1.
         */
        if (nr == 0 && !pid_alive(p))
                nr = -1;

        return nr;
}

/* Thread-group id (PIDTYPE_TGID) of @p as seen from @event's namespace. */
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
        return perf_event_pid_type(event, p, PIDTYPE_TGID);
}

/* Thread id (PIDTYPE_PID) of @p as seen from @event's namespace. */
static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
        return perf_event_pid_type(event, p, PIDTYPE_PID);
}

/*
 * Id to report to userspace for @event: for inherited events this is the
 * id of the parent event, otherwise the event's own id.
 */
static u64 primary_event_id(struct perf_event *event)
{
        return event->parent ? event->parent->id : event->id;
}

/*
 * Get the perf_event_context for a task and lock it.
 *
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 *
 * On success the context is returned with ctx->lock held, interrupts
 * disabled (previous state saved in *@flags) and an extra reference taken.
 * NULL is returned, with interrupts restored, when the task has no context,
 * the context is marked TASK_TOMBSTONE, or its refcount already hit zero.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
        struct perf_event_context *ctx;

retry:
        /*
         * One of the few rules of preemptible RCU is that one cannot do
         * rcu_read_unlock() while holding a scheduler (or nested) lock when
         * part of the read side critical section was irqs-enabled -- see
         * rcu_read_unlock_special().
         *
         * Since ctx->lock nests under rq->lock we must ensure the entire read
         * side critical section has interrupts disabled.
         */
        local_irq_save(*flags);
        rcu_read_lock();
        ctx = rcu_dereference(task->perf_event_ctxp);
        if (ctx) {
                /*
                 * If this context is a clone of another, it might
                 * get swapped for another underneath us by
                 * perf_event_task_sched_out, though the
                 * rcu_read_lock() protects us from any context
                 * getting freed.  Lock the context and check if it
                 * got swapped before we could get the lock, and retry
                 * if so.  If we locked the right context, then it
                 * can't get swapped on us any more.
                 */
                raw_spin_lock(&ctx->lock);
                if (ctx != rcu_dereference(task->perf_event_ctxp)) {
                        raw_spin_unlock(&ctx->lock);
                        rcu_read_unlock();
                        local_irq_restore(*flags);
                        goto retry;
                }

                /*
                 * A dead context, or one whose last reference is already
                 * gone, is treated as if the task had no context at all.
                 */
                if (ctx->task == TASK_TOMBSTONE ||
                    !refcount_inc_not_zero(&ctx->refcount)) {
                        raw_spin_unlock(&ctx->lock);
                        ctx = NULL;
                } else {
                        WARN_ON_ONCE(ctx->task != task);
                }
        }
        rcu_read_unlock();
        /* On success interrupts stay disabled; the caller restores them. */
        if (!ctx)
                local_irq_restore(*flags);
        return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task)
{
        struct perf_event_context *ctx;
        unsigned long flags;

        ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
        return ctx;
}

/*
 * Drop one pin on @ctx; the pin_count is decremented under ctx->lock with
 * interrupts disabled. The context reference is not touched here.
 */
static void perf_unpin_context(struct perf_event_context *ctx)
{
        unsigned long irq_flags;

        raw_spin_lock_irqsave(&ctx->lock, irq_flags);
        ctx->pin_count--;
        raw_spin_unlock_irqrestore(&ctx->lock, irq_flags);
}

/*
 * Update the record of the current time in a context.
 *
 * Must be called with ctx->lock held. When @adv is true, ctx->time is
 * advanced by the time elapsed since the last timestamp; the timestamp and
 * the lockless time offset are refreshed in either case.
 */
static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
        u64 now = perf_clock();

        lockdep_assert_held(&ctx->lock);

        if (adv) {
                u64 elapsed = now - ctx->timestamp;

                ctx->time += elapsed;
        }
        ctx->timestamp = now;

        /*
         * The update above, time' = time + (now - timestamp), can be
         * re-arranged into time' = now + (time - timestamp), which yields a
         * single offset value from which future time can be computed
         * without taking any locks.
         *
         * See perf_event_time_now(), which can be used from NMI context where
         * it's (obviously) not possible to acquire ctx->lock in order to read
         * both the above values in a consistent manner.
         */
        WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
}

/* Advance @ctx's time to now; see __update_context_time() with adv == true. */
static void update_context_time(struct perf_event_context *ctx)
{
        __update_context_time(ctx, true);
}

/*
 * Time value to use for @event: 0 if the event has no context, the cgroup
 * time for cgroup events, and the context's recorded time otherwise.
 */
static u64 perf_event_time(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        if (unlikely(!ctx))
                return 0;

        return is_cgroup_event(event) ? perf_cgroup_event_time(event)
                                      : ctx->time;
}

/*
 * Lockless variant of perf_event_time() that derives the event time from
 * the clock reading @now.
 *
 * Returns 0 without a context and defers to perf_cgroup_event_time_now()
 * for cgroup events. Otherwise, when the context's is_active has
 * EVENT_TIME set, the result is @now plus the context's time offset; when
 * it is not set, the last recorded ctx->time is returned.
 */
static u64 perf_event_time_now(struct perf_event *event, u64 now)
{
        struct perf_event_context *ctx = event->ctx;

        if (unlikely(!ctx))
                return 0;

        if (is_cgroup_event(event))
                return perf_cgroup_event_time_now(event, now);

        if (__load_acquire(&ctx->is_active) & EVENT_TIME)
                return now + READ_ONCE(ctx->timeoffset);

        return ctx->time;
}

static enum event_type_t get_event_type(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        enum event_type_t event_type;

        lockdep_assert_held(&ctx->lock);

        /*
         * It's 'group type', really, because if our group leader is
         * pinned, so are we.
         */
        if (event->group_leader != event)
                event = event->group_leader;

        event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
        if (!ctx->task)
                event_type |= EVENT_CPU;

        return event_type;
}

/*
 * Helper function to initialize event group nodes: marks the event's
 * group_node as not being in any rb-tree and resets its group_index.
 */
static void init_event_group(struct perf_event *event)
{
        RB_CLEAR_NODE(&event->group_node);
        event->group_index = 0;
}

/*
 * Select the pinned or the flexible groups tree of @ctx, depending on the
 * event's attr.pinned bit.
 */
static struct perf_event_groups *
get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        return event->attr.pinned ? &ctx->pinned_groups : &ctx->flexible_groups;
}

/*
 * Helper function to initialize a perf_event_groups tree: an empty rb-tree
 * and a group index counter starting at 0.
 */
static void perf_event_groups_init(struct perf_event_groups *groups)
{
        groups->tree = RB_ROOT;
        groups->index = 0;
}

/*
 * Return the cgroup @event is attached to, or NULL if it has none (always
 * NULL without CONFIG_CGROUP_PERF).
 */
static inline struct cgroup *event_cgroup(const struct perf_event *event)
{
#ifdef CONFIG_CGROUP_PERF
        if (event->cgrp)
                return event->cgrp->css.cgroup;
#endif

        return NULL;
}

/*
 * Compare function for event groups;
 *
 * Implements a complex key that sorts by CPU first, then by PMU (skipped
 * when @left_pmu is NULL), then by cgroup (CONFIG_CGROUP_PERF only) and
 * finally by virtual index, which provides ordering when rotating groups
 * for the same CPU.
 *
 * Returns -1, 0 or 1 when the left key sorts before, equal to or after
 * @right.
 */
static __always_inline int
perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
                      const struct cgroup *left_cgroup, const u64 left_group_index,
                      const struct perf_event *right)
{
        if (left_cpu != right->cpu)
                return left_cpu < right->cpu ? -1 : 1;

        /* A NULL @left_pmu means the PMU is not part of the comparison. */
        if (left_pmu && left_pmu != right->pmu_ctx->pmu)
                return left_pmu < right->pmu_ctx->pmu ? -1 : 1;

#ifdef CONFIG_CGROUP_PERF
        {
                const struct cgroup *right_cgroup = event_cgroup(right);

                if (left_cgroup != right_cgroup) {
                        /* Events without a cgroup come first. */
                        if (!left_cgroup)
                                return -1;
                        if (!right_cgroup)
                                return 1;

                        /* Two dissimilar cgroups, order by id. */
                        return cgroup_id(left_cgroup) < cgroup_id(right_cgroup) ? -1 : 1;
                }
        }
#endif

        if (left_group_index != right->group_index)
                return left_group_index < right->group_index ? -1 : 1;

        return 0;
}

/* Map an rb_node embedded as perf_event::group_node back to its perf_event. */
#define __node_2_pe(node) \
        rb_entry((node), struct perf_event, group_node)

/*
 * rb_add() ordering callback: true when the event of node @a sorts strictly
 * before the event of node @b according to perf_event_groups_cmp().
 */
static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
{
        struct perf_event *left = __node_2_pe(a);
        int order;

        order = perf_event_groups_cmp(left->cpu, left->pmu_ctx->pmu,
                                      event_cgroup(left), left->group_index,
                                      __node_2_pe(b));

        return order < 0;
}

/*
 * Partial lookup key for the groups rb-tree: identifies a {cpu, pmu, cgroup}
 * subtree, leaving out the group index.
 */
struct __group_key {
        int cpu;
        struct pmu *pmu;
        struct cgroup *cgroup;
};

/*
 * rb-tree search callback comparing a struct __group_key (@key) against the
 * event of @node.
 *
 * Partial/subtree match on @cpu, @pmu and @cgroup; the node's own
 * group_index is passed as the left index so that @group_index never
 * influences the result.
 */
static inline int __group_cmp(const void *key, const struct rb_node *node)
{
        const struct __group_key *want = key;
        const struct perf_event *pe = __node_2_pe(node);

        return perf_event_groups_cmp(want->cpu, want->pmu, want->cgroup,
                                     pe->group_index, pe);
}

/*
 * Like __group_cmp(), but matching on @cpu and @pmu only.
 *
 * Both the cgroup and the group_index of the node's own event are passed as
 * the left side, so neither @cgroup nor @group_index can influence the
 * result.
 */
static inline int
__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
{
        const struct __group_key *want = key;
        const struct perf_event *pe = __node_2_pe(node);

        return perf_event_groups_cmp(want->cpu, want->pmu, event_cgroup(pe),
                                     pe->group_index, pe);
}

/*
 * Insert @event into @groups' tree; using
 *   {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}
 * as key. Because the index only ever grows, this places the event last
 * inside its {cpu,pmu,cgroup} subtree.
 */
static void
perf_event_groups_insert(struct perf_event_groups *groups,
                         struct perf_event *event)
{
        groups->index++;
        event->group_index = groups->index;

        rb_add(&event->group_node, &groups->tree, __group_less);
}

/*
 * Helper function to insert @event into the pinned or flexible groups tree
 * of @ctx, whichever get_event_groups() selects for it.
 */
static void
add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        perf_event_groups_insert(get_event_groups(event, ctx), event);
}

/*
 * Delete a group from a tree.
 *
 * Warns (once) if @event is not currently linked into a tree or if the tree
 * is empty; the node is erased and re-initialized regardless.
 */
static void
perf_event_groups_delete(struct perf_event_groups *groups,
                         struct perf_event *event)
{
        WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
                     RB_EMPTY_ROOT(&groups->tree));

        rb_erase(&event->group_node, &groups->tree);
        /* Mark the node as unlinked and reset group_index. */
        init_event_group(event);
}

/*
 * Helper function to delete @event from the pinned or flexible groups tree
 * of @ctx, whichever get_event_groups() selects for it.
 */
static void
del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        perf_event_groups_delete(get_event_groups(event, ctx), event);
}

/*
 * Get the leftmost event in the {cpu,pmu,cgroup} subtree of @groups, or
 * NULL if the subtree is empty.
 */
static struct perf_event *
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
                        struct pmu *pmu, struct cgroup *cgrp)
{
        struct __group_key key = {
                .cpu = cpu,
                .pmu = pmu,
                .cgroup = cgrp,
        };
        struct rb_node *first = rb_find_first(&key, &groups->tree, __group_cmp);

        return first ? __node_2_pe(first) : NULL;
}

/*
 * Get the event following @event within the subtree keyed by @event's cpu,
 * @pmu and @event's cgroup, or NULL when @event was the last one.
 */
static struct perf_event *
perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
{
        struct __group_key key = {
                .cpu = event->cpu,
                .pmu = pmu,
                .cgroup = event_cgroup(event),
        };
        struct rb_node *succ = rb_next_match(&key, &event->group_node, __group_cmp);

        return succ ? __node_2_pe(succ) : NULL;
}

/*
 * Iterate over the events in @groups keyed by @cpu and @pmu; the lookup of
 * the first event uses a NULL cgroup.
 */
#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu)                \
        for (event = perf_event_groups_first(groups, cpu, pmu, NULL);        \
             event; event = perf_event_groups_next(event, pmu))

/*
 * Iterate through the whole groups tree, in rb-tree order. The next node is
 * looked up from the current @event after each pass of the loop body.
 */
#define perf_event_groups_for_each(event, groups)                        \
        for (event = rb_entry_safe(rb_first(&((groups)->tree)),                \
                                typeof(*event), group_node); event;        \
                event = rb_entry_safe(rb_next(&event->group_node),        \
                                typeof(*event), group_node))

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
        lockdep_assert_held(&ctx->lock);

        /* An event must not be attached to a context twice. */
        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
        event->attach_state |= PERF_ATTACH_CONTEXT;

        event->tstamp = perf_event_time(event);

        /*
         * If we're a stand alone event or group leader, we go to the context
         * list, group events are kept attached to the group so that
         * perf_group_detach can, at all times, locate all siblings.
         */
        if (event->group_leader == event) {
                event->group_caps = event->event_caps;
                add_event_to_groups(event, ctx);
        }

        /* Update the per-context bookkeeping counters. */
        list_add_rcu(&event->event_entry, &ctx->event_list);
        ctx->nr_events++;
        if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
                ctx->nr_user++;
        if (event->attr.inherit_stat)
                ctx->nr_stat++;

        /* Only events that are not OFF (or in an error state) count as enabled. */
        if (event->state > PERF_EVENT_STATE_OFF)
                perf_cgroup_event_enable(event, ctx);

        ctx->generation++;
        event->pmu_ctx->nr_events++;
}

/*
 * Initialize event state based on perf_event_attr::disabled: events created
 * disabled start in the OFF state, all others start INACTIVE.
 */
static inline void perf_event__state_init(struct perf_event *event)
{
        if (event->attr.disabled)
                event->state = PERF_EVENT_STATE_OFF;
        else
                event->state = PERF_EVENT_STATE_INACTIVE;
}

/*
 * Compute the size in bytes of the data read for an event with the given
 * @read_format and @nr_siblings siblings.
 *
 * The result is a fixed part (one u64 for each of TOTAL_TIME_ENABLED,
 * TOTAL_TIME_RUNNING and GROUP that is set) plus a per-event part (the
 * value, plus one u64 for each of ID and LOST that is set) multiplied by
 * the number of events read: 1, or 1 + @nr_siblings with PERF_FORMAT_GROUP.
 */
static int __perf_event_read_size(u64 read_format, int nr_siblings)
{
        int per_event = sizeof(u64);    /* value */
        int fixed = 0;
        int nr_events = 1;

        /* Fields present once per read. */
        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                fixed += sizeof(u64);
        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                fixed += sizeof(u64);
        if (read_format & PERF_FORMAT_GROUP) {
                fixed += sizeof(u64);
                nr_events += nr_siblings;
        }

        /* Fields present once per event. */
        if (read_format & PERF_FORMAT_ID)
                per_event += sizeof(u64);
        if (read_format & PERF_FORMAT_LOST)
                per_event += sizeof(u64);

        /*
         * Since perf_event_validate_size() limits this to 16k and inhibits
         * adding more siblings, this will never overflow.
         */
        return fixed + nr_events * per_event;
}

/*
 * Recompute event->header_size for the given @sample_type: the sum of the
 * sizes of the fixed-size sample fields selected by @sample_type, plus
 * event->read_size when PERF_SAMPLE_READ is set.
 */
static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
{
        /* Never dereferenced; only used as an operand of sizeof(). */
        struct perf_sample_data *data;
        u16 bytes = 0;

        if (sample_type & PERF_SAMPLE_IP)
                bytes += sizeof(data->ip);

        if (sample_type & PERF_SAMPLE_ADDR)
                bytes += sizeof(data->addr);

        if (sample_type & PERF_SAMPLE_PERIOD)
                bytes += sizeof(data->period);

        if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
                bytes += sizeof(data->weight.full);

        /* The only term here that depends on the event's group size. */
        if (sample_type & PERF_SAMPLE_READ)
                bytes += event->read_size;

        if (sample_type & PERF_SAMPLE_DATA_SRC)
                bytes += sizeof(data->data_src.val);

        if (sample_type & PERF_SAMPLE_TRANSACTION)
                bytes += sizeof(data->txn);

        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
                bytes += sizeof(data->phys_addr);

        if (sample_type & PERF_SAMPLE_CGROUP)
                bytes += sizeof(data->cgroup);

        if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
                bytes += sizeof(data->data_page_size);

        if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
                bytes += sizeof(data->code_page_size);

        event->header_size = bytes;
}

/*
 * Called at perf_event creation and when events are attached/detached from a
 * group.
 *
 * Refreshes event->read_size from the event's read_format and its group
 * leader's current sibling count, then recomputes event->header_size.
 */
static void perf_event__header_size(struct perf_event *event)
{
        int nr_siblings = event->group_leader->nr_siblings;

        event->read_size = __perf_event_read_size(event->attr.read_format,
                                                  nr_siblings);

        /* header_size includes read_size, so this has to come second. */
        __perf_event_header_size(event, event->attr.sample_type);
}

/*
 * Compute event->id_header_size: the sum of the sizes of the identification
 * fields (TID, TIME, IDENTIFIER, ID, STREAM_ID, CPU) selected by the
 * event's attr.sample_type.
 */
static void perf_event__id_header_size(struct perf_event *event)
{
        /* Never dereferenced; only used as an operand of sizeof(). */
        struct perf_sample_data *data;
        u64 sample_type = event->attr.sample_type;
        u16 bytes = 0;

        if (sample_type & PERF_SAMPLE_TID)
                bytes += sizeof(data->tid_entry);

        if (sample_type & PERF_SAMPLE_TIME)
                bytes += sizeof(data->time);

        /* IDENTIFIER and ID each account for one data->id sized field. */
        if (sample_type & PERF_SAMPLE_IDENTIFIER)
                bytes += sizeof(data->id);

        if (sample_type & PERF_SAMPLE_ID)
                bytes += sizeof(data->id);

        if (sample_type & PERF_SAMPLE_STREAM_ID)
                bytes += sizeof(data->stream_id);

        if (sample_type & PERF_SAMPLE_CPU)
                bytes += sizeof(data->cpu_entry);

        event->id_header_size = bytes;
}

/*
 * Check that adding an event to the group does not result in anybody
 * overflowing the 64k event limit imposed by the output buffer.
 *
 * Specifically, check that the read_size for the event does not exceed 16k,
 * read_size being the one term that grows with groups size. Since read_size
 * depends on per-event read_format, also (re)check the existing events.
 *
 * This leaves 48k for the constant size fields and things like callchains,
 * branch stacks and register sets.
 *
 * Returns true if the group, with one more sibling, still fits.
 */
static bool perf_event_validate_size(struct perf_event *event)
{
        struct perf_event *leader = event->group_leader;
        struct perf_event *sibling;
        const int nr_siblings = leader->nr_siblings + 1;
        const int limit = 16*1024;

        if (__perf_event_read_size(event->attr.read_format, nr_siblings) > limit ||
            __perf_event_read_size(leader->attr.read_format, nr_siblings) > limit)
                return false;

        /*
         * When creating a new group leader, group_leader->ctx is initialized
         * after the size has been validated, but we cannot safely use
         * for_each_sibling_event() until group_leader->ctx is set. A new group
         * leader cannot have any siblings yet, so we can safely skip checking
         * the non-existent siblings.
         */
        if (event == leader)
                return true;

        for_each_sibling_event(sibling, leader) {
                if (__perf_event_read_size(sibling->attr.read_format,
                                           nr_siblings) > limit)
                        return false;
        }

        return true;
}

/*
 * Attach @event to its group; the event's ctx->lock must be held.
 *
 * Marks the event PERF_ATTACH_GROUP. For a non-leader the event is also
 * appended to the leader's sibling list, the group capabilities and
 * counters are updated, and the header sizes of the whole group are
 * recomputed since they depend on the number of siblings.
 */
static void perf_group_attach(struct perf_event *event)
{
        struct perf_event *group_leader = event->group_leader, *pos;

        lockdep_assert_held(&event->ctx->lock);

        /*
         * We can have double attach due to group movement (move_group) in
         * perf_event_open().
         */
        if (event->attach_state & PERF_ATTACH_GROUP)
                return;

        event->attach_state |= PERF_ATTACH_GROUP;

        /* A leader (or stand-alone event) has nothing to link itself to. */
        if (group_leader == event)
                return;

        WARN_ON_ONCE(group_leader->ctx != event->ctx);

        /* The group only keeps the capabilities common to all its members. */
        group_leader->group_caps &= event->event_caps;

        list_add_tail(&event->sibling_list, &group_leader->sibling_list);
        group_leader->nr_siblings++;
        group_leader->group_generation++;

        perf_event__header_size(group_leader);

        for_each_sibling_event(pos, group_leader)
                perf_event__header_size(pos);
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 *
 * Undoes the bookkeeping of list_add_event(); an event that is not attached
 * to a context is left untouched.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);

        /*
         * We can have double detach due to exit/hot-unplug + close.
         */
        if (!(event->attach_state & PERF_ATTACH_CONTEXT))
                return;

        event->attach_state &= ~PERF_ATTACH_CONTEXT;

        ctx->nr_events--;
        if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
                ctx->nr_user--;
        if (event->attr.inherit_stat)
                ctx->nr_stat--;

        list_del_rcu(&event->event_entry);

        /* Only group leaders (and stand-alone events) live in the groups tree. */
        if (event->group_leader == event)
                del_event_from_groups(event, ctx);

        /*
         * If event was in error state, then keep it
         * that way, otherwise bogus counts will be
         * returned on read(). The only way to get out
         * of error state is by explicit re-enabling
         * of the event
         */
        if (event->state > PERF_EVENT_STATE_OFF) {
                perf_cgroup_event_disable(event, ctx);
                perf_event_set_state(event, PERF_EVENT_STATE_OFF);
        }

        ctx->generation++;
        event->pmu_ctx->nr_events--;
}

/*
 * Ask @event's PMU whether @aux_event can serve as the AUX event for
 * @event's output.
 *
 * Returns 0 when @aux_event has no AUX capability or the PMU does not
 * implement ->aux_output_match(); otherwise the callback's result.
 */
static int
perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
{
        if (!has_aux(aux_event) || !event->pmu->aux_output_match)
                return 0;

        return event->pmu->aux_output_match(aux_event);
}

static void put_event(struct perf_event *event);
static void event_sched_out(struct perf_event *event,
                            struct perf_event_context *ctx);

/*
 * Tear down the AUX links @event takes part in.
 *
 * If @event itself uses an aux_event, that link is dropped together with
 * the reference it held. Otherwise @event may be the aux_event of some of
 * its group's siblings: each such sibling loses its link (and the reference
 * it held on @event), is scheduled out and put into the ERROR state.
 */
static void perf_put_aux_event(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *iter;

        /*
         * If event uses aux_event tear down the link
         */
        if (event->aux_event) {
                iter = event->aux_event;
                event->aux_event = NULL;
                put_event(iter);
                return;
        }

        /*
         * If the event is an aux_event, tear down all links to
         * it from other events.
         */
        for_each_sibling_event(iter, event->group_leader) {
                if (iter->aux_event != event)
                        continue;

                iter->aux_event = NULL;
                put_event(event);

                /*
                 * If it's ACTIVE, schedule it out and put it into ERROR
                 * state so that we don't try to schedule it again. Note
                 * that perf_event_enable() will clear the ERROR status.
                 *
                 * It is the sibling (@iter) that lost its AUX event and
                 * must not be scheduled again, so the ERROR state goes on
                 * @iter, not on @event.
                 */
                event_sched_out(iter, ctx);
                perf_event_set_state(iter, PERF_EVENT_STATE_ERROR);
        }
}

static bool perf_need_aux_event(struct perf_event *event)
{
        return !!event->attr.aux_output || !!event->attr.aux_sample_size;
}

/*
 * Try to link @event to @group_leader as its AUX event.
 *
 * Returns 1 with event->aux_event set and group_leader->refcount raised,
 * or 0 when no link was established.
 */
static int perf_get_aux_event(struct perf_event *event,
                              struct perf_event *group_leader)
{
        /*
         * Our group leader must be an aux event if we want to be
         * an aux_output. This way, the aux event will precede its
         * aux_output events in the group, and therefore will always
         * schedule first.
         */
        if (!group_leader)
                return 0;

        if (event->attr.aux_output) {
                /* aux_output and aux_sample_size are mutually exclusive. */
                if (event->attr.aux_sample_size)
                        return 0;

                if (!perf_aux_output_match(event, group_leader))
                        return 0;
        } else if (event->attr.aux_sample_size) {
                /* AUX sampling requires the leader's PMU to set ->snapshot_aux. */
                if (!group_leader->pmu->snapshot_aux)
                        return 0;
        }

        /* Only link if the leader's refcount has not already hit zero. */
        if (!atomic_long_inc_not_zero(&group_leader->refcount))
                return 0;

        /*
         * Link aux_outputs to their aux event; this is undone in
         * perf_group_detach() by perf_put_aux_event(). When the
         * group is torn down, the aux_output events lose their
         * link to the aux_event and can't schedule any more.
         */
        event->aux_event = group_leader;

        return 1;
}

static inline struct list_head *get_event_list(struct perf_event *event)
{
        return event->attr.pinned ? &event->pmu_ctx->pinned_active :
                                    &event->pmu_ctx->flexible_active;
}

/*
 * Events that have PERF_EV_CAP_SIBLING require being part of a group and
 * cannot exist on their own, schedule them out and move them into the ERROR
 * state. Also see _perf_event_enable(), it will not be able to recover
 * this ERROR state.
 */
static inline void perf_remove_sibling_event(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        /*
         * event_sched_out() returns early unless the event is ACTIVE;
         * the ERROR state is applied unconditionally afterwards.
         */
        event_sched_out(event, ctx);
        perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}

/*
 * Detach @event from its group. The caller must hold event->ctx->lock.
 *
 * A non-leader is unlinked from its leader's sibling list. When the leader
 * itself is detached, each of its siblings is turned into its own group
 * leader. Either way, the header sizes of the old leader and of whatever
 * siblings it still has are recomputed at the end.
 */
static void perf_group_detach(struct perf_event *event)
{
        struct perf_event *leader = event->group_leader;
        struct perf_event *sibling, *tmp;
        struct perf_event_context *ctx = event->ctx;

        lockdep_assert_held(&ctx->lock);

        /*
         * We can have double detach due to exit/hot-unplug + close.
         */
        if (!(event->attach_state & PERF_ATTACH_GROUP))
                return;

        event->attach_state &= ~PERF_ATTACH_GROUP;

        /* Drop any AUX links to or from this event before regrouping. */
        perf_put_aux_event(event);

        /*
         * If this is a sibling, remove it from its group.
         */
        if (leader != event) {
                list_del_init(&event->sibling_list);
                event->group_leader->nr_siblings--;
                event->group_leader->group_generation++;
                goto out;
        }

        /*
         * If this was a group event with sibling events then
         * upgrade the siblings to singleton events by adding them
         * to whatever list we are on.
         */
        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {

                /* Siblings that cannot exist outside a group go to ERROR. */
                if (sibling->event_caps & PERF_EV_CAP_SIBLING)
                        perf_remove_sibling_event(sibling);

                sibling->group_leader = sibling;
                list_del_init(&sibling->sibling_list);

                /* Inherit group flags from the previous leader */
                sibling->group_caps = event->group_caps;

                if (sibling->attach_state & PERF_ATTACH_CONTEXT) {
                        add_event_to_groups(sibling, event->ctx);

                        /* A still-running sibling now sits on an active list itself. */
                        if (sibling->state == PERF_EVENT_STATE_ACTIVE)
                                list_add_tail(&sibling->active_list, get_event_list(sibling));
                }

                WARN_ON_ONCE(sibling->ctx != event->ctx);
        }

out:
        /* Group membership changed, so recompute the header sizes. */
        for_each_sibling_event(tmp, leader)
                perf_event__header_size(tmp);

        perf_event__header_size(leader);
}

static void sync_child_event(struct perf_event *child_event);

/*
 * Unlink a child event from its parent's child list.
 *
 * Does nothing unless PERF_ATTACH_CHILD is set; the flag is cleared before
 * anything else happens. Expects the parent's child_mutex to be held.
 */
static void perf_child_detach(struct perf_event *event)
{
        struct perf_event *parent = event->parent;

        if (!(event->attach_state & PERF_ATTACH_CHILD))
                return;

        event->attach_state &= ~PERF_ATTACH_CHILD;

        /* A child without a parent is unexpected: warn once and bail out. */
        if (WARN_ON_ONCE(!parent))
                return;

        lockdep_assert_held(&parent->child_mutex);

        /* Sync the child first, then take it off the child list. */
        sync_child_event(event);
        list_del_init(&event->child_list);
}

static bool is_orphaned_event(struct perf_event *event)
{
        return event->state == PERF_EVENT_STATE_DEAD;
}

/*
 * Returns 1 when @event may run here: it is either not bound to a CPU
 * (cpu == -1) or bound to the current one, and perf_cgroup_match() accepts
 * it. Returns 0 otherwise.
 */
static inline int
event_filter_match(struct perf_event *event)
{
        /* CPU filter first; the cgroup check only runs if this passes. */
        if (event->cpu != -1 && event->cpu != smp_processor_id())
                return 0;

        return perf_cgroup_match(event) ? 1 : 0;
}

/*
 * Take an ACTIVE @event off its PMU and update the scheduling bookkeeping.
 *
 * Must be called with ctx->lock held and with @ctx == event->ctx. Returns
 * without doing anything if @event is not ACTIVE. Afterwards the event is
 * INACTIVE, or OFF when a pending disable was consumed here.
 */
static void
event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_event_pmu_context *epc = event->pmu_ctx;
        struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
        enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;

        // XXX cpc serialization, probably per-cpu IRQ disabled

        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);

        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return;

        /*
         * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
         * we can schedule events _OUT_ individually through things like
         * __perf_remove_from_context().
         */
        list_del_init(&event->active_list);

        perf_pmu_disable(event->pmu);

        event->pmu->del(event, 0);
        event->oncpu = -1;

        /* A disable requested earlier is honoured now: end up OFF instead. */
        if (event->pending_disable) {
                event->pending_disable = 0;
                perf_cgroup_event_disable(event, ctx);
                state = PERF_EVENT_STATE_OFF;
        }

        if (event->pending_sigtrap) {
                bool dec = true;

                event->pending_sigtrap = 0;
                /*
                 * Unless the event is going OFF or work is already pending,
                 * take a reference and queue event->pending_task on current;
                 * in that case nr_pending is not decremented here.
                 */
                if (state != PERF_EVENT_STATE_OFF &&
                    !event->pending_work) {
                        event->pending_work = 1;
                        dec = false;
                        WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
                        task_work_add(current, &event->pending_task, TWA_RESUME);
                }
                if (dec)
                        local_dec(&event->ctx->nr_pending);
        }

        perf_event_set_state(event, state);

        /* Undo the accounting done by event_sched_in(). */
        if (!is_software_event(event))
                cpc->active_oncpu--;
        if (event->attr.freq && event->attr.sample_freq) {
                ctx->nr_freq--;
                epc->nr_freq--;
        }
        /* Exclusivity ends with an exclusive event or with the last active one. */
        if (event->attr.exclusive || !cpc->active_oncpu)
                cpc->exclusive = 0;

        perf_pmu_enable(event->pmu);
}

/*
 * Schedule out a whole group: the leader followed by each of its siblings.
 * Does nothing unless the leader is ACTIVE; asserts that the group's PMU
 * is disabled.
 */
static void
group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
{
        struct perf_event *sibling;

        if (group_event->state != PERF_EVENT_STATE_ACTIVE)
                return;

        perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);

        /* Leader goes first ... */
        event_sched_out(group_event, ctx);

        /* ... then every sibling, if there are any. */
        for_each_sibling_event(sibling, group_event)
                event_sched_out(sibling, ctx);
}

/*
 * Flags for perf_remove_from_context(), passed to
 * __perf_remove_from_context() through its 'info' pointer:
 *  - DETACH_GROUP: also call perf_group_detach() on the event;
 *  - DETACH_CHILD: also call perf_child_detach() on the event;
 *  - DETACH_DEAD:  force the event OFF while scheduling it out and leave
 *                  it in PERF_EVENT_STATE_DEAD.
 */
#define DETACH_GROUP        0x01UL
#define DETACH_CHILD        0x02UL
#define DETACH_DEAD        0x04UL

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
static void
__perf_remove_from_context(struct perf_event *event,
                           struct perf_cpu_context *cpuctx,
                           struct perf_event_context *ctx,
                           void *info)
{
        struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
        /* 'info' carries the DETACH_* flags cast to a pointer. */
        unsigned long flags = (unsigned long)info;

        if (ctx->is_active & EVENT_TIME) {
                update_context_time(ctx);
                update_cgrp_time_from_cpuctx(cpuctx, false);
        }

        /*
         * Ensure event_sched_out() switches to OFF, at the very least
         * this avoids raising perf_pending_task() at this time.
         */
        if (flags & DETACH_DEAD)
                event->pending_disable = 1;
        event_sched_out(event, ctx);
        if (flags & DETACH_GROUP)
                perf_group_detach(event);
        if (flags & DETACH_CHILD)
                perf_child_detach(event);
        list_del_event(event, ctx);
        if (flags & DETACH_DEAD)
                event->state = PERF_EVENT_STATE_DEAD;

        /* Last event of this PMU context: reset its per-PMU state. */
        if (!pmu_ctx->nr_events) {
                pmu_ctx->rotate_necessary = 0;

                if (ctx->task && ctx->is_active) {
                        struct perf_cpu_pmu_context *cpc;

                        cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
                        WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
                        cpc->task_epc = NULL;
                }
        }

        /* Last event of the whole context: mark the context inactive. */
        if (!ctx->nr_events && ctx->is_active) {
                if (ctx == &cpuctx->ctx)
                        update_cgrp_time_from_cpuctx(cpuctx, true);

                ctx->is_active = 0;
                if (ctx->task) {
                        WARN_ON_ONCE(cpuctx->task_ctx != ctx);
                        cpuctx->task_ctx = NULL;
                }
        }
}

/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
        struct perf_event_context *ctx = event->ctx;
        void *info = (void *)flags;

        lockdep_assert_held(&ctx->mutex);

        raw_spin_lock_irq(&ctx->lock);
        if (ctx->is_active) {
                /* Active context: drop the lock and go through the cross-call. */
                raw_spin_unlock_irq(&ctx->lock);
                event_function_call(event, __perf_remove_from_context, info);
                return;
        }

        /*
         * Because of perf_event_exit_task(), perf_remove_from_context() ought
         * to work in the face of TASK_TOMBSTONE, unlike every other
         * event_function_call() user. An inactive context is therefore
         * handled right here, under ctx->lock.
         */
        __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
                                   ctx, info);
        raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to disable a performance event
 */
static void __perf_event_disable(struct perf_event *event,
                                 struct perf_cpu_context *cpuctx,
                                 struct perf_event_context *ctx,
                                 void *info)
{
        /* Nothing to do for events that are already below INACTIVE. */
        if (event->state < PERF_EVENT_STATE_INACTIVE)
                return;

        if (ctx->is_active & EVENT_TIME) {
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
        }

        perf_pmu_disable(event->pmu_ctx->pmu);

        /* Disabling a leader takes its whole group off the PMU. */
        if (event == event->group_leader)
                group_sched_out(event, ctx);
        else
                event_sched_out(event, ctx);

        perf_event_set_state(event, PERF_EVENT_STATE_OFF);
        perf_cgroup_event_disable(event, ctx);

        perf_pmu_enable(event->pmu_ctx->pmu);
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in perf_event_exit_event().
 *
 * When called from perf_pending_irq it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
static void _perf_event_disable(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        raw_spin_lock_irq(&ctx->lock);
        if (event->state <= PERF_EVENT_STATE_OFF) {
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
        raw_spin_unlock_irq(&ctx->lock);

        event_function_call(event, __perf_event_disable, NULL);
}

/*
 * Disable @event by running __perf_event_disable() through
 * event_function_local() rather than event_function_call().
 */
void perf_event_disable_local(struct perf_event *event)
{
        event_function_local(event, __perf_event_disable, NULL);
}

/*
 * Strictly speaking kernel users cannot create groups and therefore this
 * interface does not need the perf_event_ctx_lock() magic.
 */
void perf_event_disable(struct perf_event *event)
{
        /* Lock the event's context, disable, then release the same context. */
        struct perf_event_context *ctx = perf_event_ctx_lock(event);

        _perf_event_disable(event);
        perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_disable);

/*
 * Request a deferred disable: flag the event as pending_disable and queue
 * its pending_irq irq_work. event_sched_out() consumes pending_disable and
 * moves the event to OFF.
 */
void perf_event_disable_inatomic(struct perf_event *event)
{
        event->pending_disable = 1;
        irq_work_queue(&event->pending_irq);
}

/*
 * Sentinel value of event->hw.interrupts. When event_sched_in() finds it,
 * it logs an unthrottle via perf_log_throttle(event, 1) and resets the
 * count to 0.
 */
#define MAX_INTERRUPTS (~0ULL)

static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);

/*
 * Put @event on its PMU on the current CPU. Must be called with ctx->lock
 * held and with @ctx == event->ctx.
 *
 * Returns 0 on success, and also 0 (without doing anything) for events at
 * or below the OFF state. Returns -EAGAIN when pmu->add() fails, in which
 * case the event is put back to INACTIVE with oncpu == -1.
 */
static int
event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_event_pmu_context *epc = event->pmu_ctx;
        struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
        int ret = 0;

        WARN_ON_ONCE(event->ctx != ctx);

        lockdep_assert_held(&ctx->lock);

        if (event->state <= PERF_EVENT_STATE_OFF)
                return 0;

        WRITE_ONCE(event->oncpu, smp_processor_id());
        /*
         * Order event::oncpu write to happen before the ACTIVE state is
         * visible. This allows perf_event_{stop,read}() to observe the correct
         * ->oncpu if it sees ACTIVE.
         */
        smp_wmb();
        perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);

        /*
         * Unthrottle events, since we scheduled we might have missed several
         * ticks already, also for a heavily scheduling task there is little
         * guarantee it'll get a tick in a timely manner.
         */
        if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
                perf_log_throttle(event, 1);
                event->hw.interrupts = 0;
        }

        perf_pmu_disable(event->pmu);

        perf_log_itrace_start(event);

        /* The PMU refused the event: roll back the state set above. */
        if (event->pmu->add(event, PERF_EF_START)) {
                perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
                event->oncpu = -1;
                ret = -EAGAIN;
                goto out;
        }

        /* Accounting that event_sched_out() reverses. */
        if (!is_software_event(event))
                cpc->active_oncpu++;
        if (event->attr.freq && event->attr.sample_freq) {
                ctx->nr_freq++;
                epc->nr_freq++;
        }
        if (event->attr.exclusive)
                cpc->exclusive = 1;

out:
        perf_pmu_enable(event->pmu);

        return ret;
}

/*
 * Schedule in a whole group (leader plus siblings) inside one PMU
 * transaction.
 *
 * Returns 0 when the group was committed, or when the leader is OFF and
 * nothing was attempted. Returns -EAGAIN when any member failed to
 * schedule or the commit failed; everything scheduled so far is scheduled
 * out again and the transaction is cancelled.
 */
static int
group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
{
        struct perf_event *event, *partial_group = NULL;
        struct pmu *pmu = group_event->pmu_ctx->pmu;

        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;

        pmu->start_txn(pmu, PERF_PMU_TXN_ADD);

        if (event_sched_in(group_event, ctx))
                goto error;

        /*
         * Schedule in siblings as one group (if any):
         */
        for_each_sibling_event(event, group_event) {
                if (event_sched_in(event, ctx)) {
                        partial_group = event;
                        goto group_error;
                }
        }

        if (!pmu->commit_txn(pmu))
                return 0;

        /*
         * A failed commit falls through with partial_group == NULL, so
         * the loop below schedules out every sibling.
         */
group_error:
        /*
         * Groups can be scheduled in as one unit only, so undo any
         * partial group before returning:
         * The events up to the failed event are scheduled out normally.
         */
        for_each_sibling_event(event, group_event) {
                if (event == partial_group)
                        break;

                event_sched_out(event, ctx);
        }
        event_sched_out(group_event, ctx);

error:
        pmu->cancel_txn(pmu);
        return -EAGAIN;
}

/*
 * Work out whether we can put this event group on the CPU now.
 */
static int group_can_go_on(struct perf_event *event, int can_add_hw)
{
        struct perf_event_pmu_context *epc = event->pmu_ctx;
        struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);

        /* Groups made up purely of software events are always admitted. */
        if (event->group_caps & PERF_EV_CAP_SOFTWARE)
                return 1;

        /*
         * Exclusivity can veto the group in two ways: an exclusive group
         * is already on, so no other hardware events may join it; or this
         * group is exclusive itself while other events are already on
         * the CPU.
         */
        if (cpc->exclusive ||
            (event->attr.exclusive && !list_empty(get_event_list(event))))
                return 0;

        /*
         * Otherwise the answer is whether all previous groups were able
         * to go on, as tracked by the caller.
         */
        return can_add_hw;
}

/*
 * Add @event to @ctx: put it on the context's event list first, then
 * attach it to its group.
 */
static void add_event_to_ctx(struct perf_event *event,
                               struct perf_event_context *ctx)
{
        list_add_event(event, ctx);
        perf_group_attach(event);
}

/*
 * Schedule out the events of type @event_type from task context @ctx.
 *
 * Does nothing when this CPU has no task context installed; warns once and
 * does nothing when the installed task context is not @ctx.
 */
static void task_ctx_sched_out(struct perf_event_context *ctx,
                                enum event_type_t event_type)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *active_ctx = cpuctx->task_ctx;

        if (active_ctx && !WARN_ON_ONCE(ctx != active_ctx))
                ctx_sched_out(ctx, event_type);
}

/*
 * Schedule in the CPU context and, if @ctx is non-NULL, the task context,
 * in the order: CPU pinned, task pinned, CPU flexible, task flexible.
 */
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx)
{
        struct perf_event_context *cpu_ctx = &cpuctx->ctx;

        /* Pinned events first: CPU context, then the task context. */
        ctx_sched_in(cpu_ctx, EVENT_PINNED);
        if (ctx)
                ctx_sched_in(ctx, EVENT_PINNED);

        /* Flexible events follow in the same CPU-before-task order. */
        ctx_sched_in(cpu_ctx, EVENT_FLEXIBLE);
        if (ctx)
                ctx_sched_in(ctx, EVENT_FLEXIBLE);
}

/*
 * We want to maintain the following priority of scheduling:
 *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
 *  - task pinned (EVENT_PINNED)
 *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
 *  - task flexible (EVENT_FLEXIBLE).
 *
 * In order to avoid unscheduling and scheduling back in everything every
 * time an event is added, only do it for the groups of equal priority and
 * below.
 *
 * This can be called after a batch operation on task events, in which case
 * event_type is a bit mask of the types of events involved. For CPU events,
 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
 */
/*
 * XXX: ctx_resched() reschedule entire perf_event_context while adding new
 * event to the context or enabling existing event in the context. We can
 * probably optimize it by rescheduling only affected pmu_ctx.
 */
static void ctx_resched(struct perf_cpu_context *cpuctx,
                        struct perf_event_context *task_ctx,
                        enum event_type_t event_type)
{
        /* Remember EVENT_CPU now; it is masked out of event_type below. */
        bool cpu_event = !!(event_type & EVENT_CPU);

        /*
         * If pinned groups are involved, flexible groups also need to be
         * scheduled out.
         */
        if (event_type & EVENT_PINNED)
                event_type |= EVENT_FLEXIBLE;

        event_type &= EVENT_ALL;

        perf_ctx_disable(&cpuctx->ctx, false);
        if (task_ctx) {
                perf_ctx_disable(task_ctx, false);
                task_ctx_sched_out(task_ctx, event_type);
        }

        /*
         * Decide which cpu ctx groups to schedule out based on the types
         * of events that caused rescheduling:
         *  - EVENT_CPU: schedule out corresponding groups;
         *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
         *  - otherwise, do nothing more.
         */
        if (cpu_event)
                ctx_sched_out(&cpuctx->ctx, event_type);
        else if (event_type & EVENT_PINNED)
                ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);

        /* Schedule everything back in, in priority order. */
        perf_event_sched_in(cpuctx, task_ctx);

        perf_ctx_enable(&cpuctx->ctx, false);
        if (task_ctx)
                perf_ctx_enable(task_ctx, false);
}

/*
 * Reschedule all events (EVENT_ALL|EVENT_CPU) of this CPU's context and of
 * its current task context, under perf_ctx_lock().
 *
 * NOTE(review): @pmu is not referenced by the body, so the reschedule is
 * not restricted to that PMU.
 */
void perf_pmu_resched(struct pmu *pmu)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;

        perf_ctx_lock(cpuctx, task_ctx);
        ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
        perf_ctx_unlock(cpuctx, task_ctx);
}

/*
 * Cross CPU call to install and enable a performance event
 *
 * Very similar to remote_function() + event_function() but cannot assume that
 * things like ctx->is_active and cpuctx->task_ctx are set.
 */
static int  __perf_install_in_context(void *info)
{
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        bool reprogram = true;
        int ret = 0;

        /* Lock order: the CPU context first, then the task context (if any). */
        raw_spin_lock(&cpuctx->ctx.lock);
        if (ctx->task) {
                raw_spin_lock(&ctx->lock);
                task_ctx = ctx;

                reprogram = (ctx->task == current);

                /*
                 * If the task is running, it must be running on this CPU,
                 * otherwise we cannot reprogram things.
                 *
                 * If its not running, we don't care, ctx->lock will
                 * serialize against it becoming runnable.
                 */
                if (task_curr(ctx->task) && !reprogram) {
                        ret = -ESRCH;
                        goto unlock;
                }

                WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
        } else if (task_ctx) {
                raw_spin_lock(&task_ctx->lock);
        }

#ifdef CONFIG_CGROUP_PERF
        if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
                /*
                 * If the current cgroup doesn't match the event's
                 * cgroup, we should not try to schedule it.
                 */
                struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
                reprogram = cgroup_is_descendant(cgrp->css.cgroup,
                                        event->cgrp->css.cgroup);
        }
#endif

        /*
         * The event is added to the context either way; only the
         * reprogram case also reschedules.
         */
        if (reprogram) {
                ctx_sched_out(ctx, EVENT_TIME);
                add_event_to_ctx(event, ctx);
                ctx_resched(cpuctx, task_ctx, get_event_type(event));
        } else {
                add_event_to_ctx(event, ctx);
        }

unlock:
        perf_ctx_unlock(cpuctx, task_ctx);

        return ret;
}

static bool exclusive_event_installable(struct perf_event *event,
                                        struct perf_event_context *ctx);

/*
 * Attach a performance event to a context.
 *
 * Very similar to event_function_call, see comment there.
 */
static void
perf_install_in_context(struct perf_event_context *ctx,
                        struct perf_event *event,
                        int cpu)
{
        struct task_struct *task = READ_ONCE(ctx->task);

        lockdep_assert_held(&ctx->mutex);

        WARN_ON_ONCE(!exclusive_event_installable(event, ctx));

        if (event->cpu != -1)
                WARN_ON_ONCE(event->cpu != cpu);

        /*
         * Ensures that if we can observe event->ctx, both the event and ctx
         * will be 'complete'. See perf_iterate_sb_cpu().
         */
        smp_store_release(&event->ctx, ctx);

        /*
         * perf_event_attr::disabled events will not run and can be initialized
         * without IPI. Except when this is the first event for the context, in
         * that case we need the magic of the IPI to set ctx->is_active.
         *
         * The IOC_ENABLE that is sure to follow the creation of a disabled
         * event will issue the IPI and reprogram the hardware.
         */
        if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
            ctx->nr_events && !is_cgroup_event(event)) {
                raw_spin_lock_irq(&ctx->lock);
                if (ctx->task == TASK_TOMBSTONE) {
                        raw_spin_unlock_irq(&ctx->lock);
                        return;
                }
                add_event_to_ctx(event, ctx);
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }

        /* No task: this is a CPU context, install it on the target CPU. */
        if (!task) {
                cpu_function_call(cpu, __perf_install_in_context, event);
                return;
        }

        /*
         * Should not happen, we validate the ctx is still alive before calling.
         */
        if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
                return;

        /*
         * Installing events is tricky because we cannot rely on ctx->is_active
         * to be set in case this is the nr_events 0 -> 1 transition.
         *
         * Instead we use task_curr(), which tells us if the task is running.
         * However, since we use task_curr() outside of rq::lock, we can race
         * against the actual state. This means the result can be wrong.
         *
         * If we get a false positive, we retry, this is harmless.
         *
         * If we get a false negative, things are complicated. If we are after
         * perf_event_context_sched_in() ctx::lock will serialize us, and the
         * value must be correct. If we're before, it doesn't matter since
         * perf_event_context_sched_in() will program the counter.
         *
         * However, this hinges on the remote context switch having observed
         * our task->perf_event_ctxp[] store, such that it will in fact take
         * ctx::lock in perf_event_context_sched_in().
         *
         * We do this by task_function_call(), if the IPI fails to hit the task
         * we know any future context switch of task must see the
         * perf_event_ctxp[] store.
         */

        /*
         * This smp_mb() orders the task->perf_event_ctxp[] store with the
         * task_cpu() load, such that if the IPI then does not find the task
         * running, a future context switch of that task must observe the
         * store.
         */
        smp_mb();
again:
        /* A zero return means the cross-call did the install; we are done. */
        if (!task_function_call(task, __perf_install_in_context, event))
                return;

        raw_spin_lock_irq(&ctx->lock);
        task = ctx->task;
        if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
                /*
                 * Cannot happen because we already checked above (which also
                 * cannot happen), and we hold ctx->mutex, which serializes us
                 * against perf_event_exit_task_context().
                 */
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
        /*
         * If the task is not running, ctx->lock will avoid it becoming so,
         * thus we can safely install the event.
         */
        if (task_curr(task)) {
                raw_spin_unlock_irq(&ctx->lock);
                goto again;
        }
        add_event_to_ctx(event, ctx);
        raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to enable a performance event
 */
static void __perf_event_enable(struct perf_event *event,
                                struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx,
                                void *info)
{
        struct perf_event *leader = event->group_leader;
        struct perf_event_context *task_ctx;

        /* Proceed only for states strictly between ERROR and INACTIVE. */
        if (event->state >= PERF_EVENT_STATE_INACTIVE ||
            event->state <= PERF_EVENT_STATE_ERROR)
                return;

        if (ctx->is_active)
                ctx_sched_out(ctx, EVENT_TIME);

        perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
        perf_cgroup_event_enable(event, ctx);

        /* Inactive context: the state change above is all that is needed. */
        if (!ctx->is_active)
                return;

        /* Filtered out here: restart EVENT_TIME, but do not reschedule. */
        if (!event_filter_match(event)) {
                ctx_sched_in(ctx, EVENT_TIME);
                return;
        }

        /*
         * If the event is in a group and isn't the group leader,
         * then don't put it on unless the group is on.
         */
        if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
                ctx_sched_in(ctx, EVENT_TIME);
                return;
        }

        task_ctx = cpuctx->task_ctx;
        if (ctx->task)
                WARN_ON_ONCE(task_ctx != ctx);

        ctx_resched(cpuctx, task_ctx, get_event_type(event));
}

/*
 * Enable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
 */
static void _perf_event_enable(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        raw_spin_lock_irq(&ctx->lock);
        /*
         * Nothing to enable when the event is already INACTIVE or above,
         * or when it is in a state below ERROR. The 'out' label inside
         * this block is also the unlock-and-return path for the goto below.
         */
        if (event->state >= PERF_EVENT_STATE_INACTIVE ||
            event->state <  PERF_EVENT_STATE_ERROR) {
out:
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }

        /*
         * If the event is in error state, clear that first.
         *
         * That way, if we see the event in error state below, we know that it
         * has gone back into error state, as distinct from the task having
         * been scheduled away before the cross-call arrived.
         */
        if (event->state == PERF_EVENT_STATE_ERROR) {
                /*
                 * Detached SIBLING events cannot leave ERROR state.
                 */
                if (event->event_caps & PERF_EV_CAP_SIBLING &&
                    event->group_leader == event)
                        goto out;

                event->state = PERF_EVENT_STATE_OFF;
        }
        raw_spin_unlock_irq(&ctx->lock);

        event_function_call(event, __perf_event_enable, NULL);
}

/*
 * See perf_event_disable();
 */
void perf_event_enable(struct perf_event *event)
{
        /* Lock the event's context, enable, then release the same context. */
        struct perf_event_context *ctx = perf_event_ctx_lock(event);

        _perf_event_enable(event);
        perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_enable);

/* Argument block handed to __perf_event_stop() by perf_event_stop(). */
struct stop_event_data {
        /* event to stop */
        struct perf_event        *event;
        /* non-zero: call pmu->start() again right after the stop */
        unsigned int                restart;
};

/*
 * Cross-call target of perf_event_stop(); @info is a struct
 * stop_event_data.
 *
 * Stops the event via pmu->stop() and, if requested, restarts it.
 * Returns 0 when done or when the event is not ACTIVE, and -EAGAIN when
 * the event's oncpu is not the CPU this is running on.
 */
static int __perf_event_stop(void *info)
{
        struct stop_event_data *sd = info;
        struct perf_event *event = sd->event;

        /* if it's already INACTIVE, do nothing */
        if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
                return 0;

        /* matches smp_wmb() in event_sched_in() */
        smp_rmb();

        /*
         * There is a window with interrupts enabled before we get here,
         * so we need to check again lest we try to stop another CPU's event.
         */
        if (READ_ONCE(event->oncpu) != smp_processor_id())
                return -EAGAIN;

        event->pmu->stop(event, PERF_EF_UPDATE);

        /*
         * May race with the actual stop (through perf_pmu_output_stop()),
         * but it is only used for events with AUX ring buffer, and such
         * events will refuse to restart because of rb::aux_mmap_count==0,
         * see comments in perf_aux_output_begin().
         *
         * Since this is happening on an event-local CPU, no trace is lost
         * while restarting.
         */
        if (sd->restart)
                event->pmu->start(event, 0);

        return 0;
}

/*
 * Stop @event on the CPU recorded in event->oncpu and, when @restart is
 * non-zero, have it started again there.  The cross call is repeated for as
 * long as __perf_event_stop() answers -EAGAIN.
 *
 * Returns 0 if the event is found not to be ACTIVE, otherwise the result of
 * the last cpu_function_call().
 */
static int perf_event_stop(struct perf_event *event, int restart)
{
        struct stop_event_data sd = {
                .event                = event,
                .restart        = restart,
        };
        int ret;

        for (;;) {
                if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
                        return 0;

                /* matches smp_wmb() in event_sched_in() */
                smp_rmb();

                /*
                 * Only ACTIVE events are of interest.  Should the event go
                 * inactive at this point (event->oncpu==-1) nothing is left
                 * to do and the call below hands back -ENXIO, which is
                 * returned to the caller as is.
                 */
                ret = cpu_function_call(READ_ONCE(event->oncpu),
                                        __perf_event_stop, &sd);
                if (ret != -EAGAIN)
                        return ret;
        }
}

/*
 * In order to contain the amount of racy and tricky code in the address
 * filter configuration management, it is a two part process:
 *
 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
 *      we update the addresses of corresponding vmas in
 *      event::addr_filter_ranges array and bump the event::addr_filters_gen;
 * (p2) when an event is scheduled in (pmu::add), it calls
 *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
 *      if the generation has changed since the previous call.
 *
 * If (p1) happens while the event is active, we restart it to force (p2).
 *
 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
 *     pre-existing mappings, called once when new filters arrive via SET_FILTER
 *     ioctl;
 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
 *     registered mapping, called for every new mmap(), with mm::mmap_lock down
 *     for reading;
 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
 *     of exec.
 */
/*
 * Hand the current address filter configuration to the PMU if it changed
 * since the last sync.
 *
 * Events without address filters are left alone.  Otherwise, under the
 * filter head lock, pmu::addr_filters_sync() is invoked whenever
 * event->hw.addr_filters_gen lags behind event->addr_filters_gen, and the
 * hardware-side generation is then brought up to date.
 */
void perf_event_addr_filters_sync(struct perf_event *event)
{
        struct perf_addr_filters_head *head = perf_event_addr_filters(event);

        if (has_addr_filter(event)) {
                raw_spin_lock(&head->lock);
                if (event->hw.addr_filters_gen != event->addr_filters_gen) {
                        event->pmu->addr_filters_sync(event);
                        event->hw.addr_filters_gen = event->addr_filters_gen;
                }
                raw_spin_unlock(&head->lock);
        }
}
EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);

/*
 * Add @refresh to the event's event_limit and enable the event.
 *
 * Returns 0 on success, or -EINVAL for inherited events and for events that
 * are not sampling events; neither is supported here.
 */
static int _perf_event_refresh(struct perf_event *event, int refresh)
{
        if (!event->attr.inherit && is_sampling_event(event)) {
                atomic_add(refresh, &event->event_limit);
                _perf_event_enable(event);
                return 0;
        }

        return -EINVAL;
}

/*
 * Locked wrapper around _perf_event_refresh(): the event's context is locked
 * through perf_event_ctx_lock() for the duration of the call.  See
 * perf_event_disable() for the locking pattern.
 *
 * Returns whatever _perf_event_refresh() returns.
 */
int perf_event_refresh(struct perf_event *event, int refresh)
{
        struct perf_event_context *locked_ctx = perf_event_ctx_lock(event);
        int err = _perf_event_refresh(event, refresh);

        perf_event_ctx_unlock(event, locked_ctx);
        return err;
}
EXPORT_SYMBOL_GPL(perf_event_refresh);

/*
 * Apply @attr to the breakpoint event @bp.
 *
 * The event is disabled while modify_user_hw_breakpoint_check() runs and is
 * enabled again afterwards unless bp->attr.disabled is set; the re-enable
 * does not depend on whether the modification succeeded.
 *
 * Returns the result of modify_user_hw_breakpoint_check().
 */
static int perf_event_modify_breakpoint(struct perf_event *bp,
                                         struct perf_event_attr *attr)
{
        int ret;

        _perf_event_disable(bp);
        ret = modify_user_hw_breakpoint_check(bp, attr, true);

        if (bp->attr.disabled)
                return ret;

        _perf_event_enable(bp);
        return ret;
}

/*
 * Copy event-type-independent attributes that may be modified.
 *
 * Only sig_data is copied from @from to @to.
 */
static void perf_event_modify_copy_attr(struct perf_event_attr *to,
                                        const struct perf_event_attr *from)
{
        to->sig_data = from->sig_data;
}

/*
 * Modify the attributes of @event and of every event on its child_list.
 *
 * Only PERF_TYPE_BREAKPOINT events can be modified.  The walk over the
 * children stops at the first failure; events that were already updated are
 * not rolled back.
 *
 * Returns 0 on success, -EINVAL if @attr is for a different event type,
 * -EOPNOTSUPP for event types without a modify handler, or the error
 * reported by the type specific handler.
 */
static int perf_event_modify_attr(struct perf_event *event,
                                  struct perf_event_attr *attr)
{
        int (*modify)(struct perf_event *, struct perf_event_attr *);
        struct perf_event *child;
        int err;

        if (event->attr.type != attr->type)
                return -EINVAL;

        switch (event->attr.type) {
        case PERF_TYPE_BREAKPOINT:
                modify = perf_event_modify_breakpoint;
                break;
        default:
                /* Place holder for future additions. */
                return -EOPNOTSUPP;
        }

        WARN_ON_ONCE(event->ctx->parent_ctx);

        mutex_lock(&event->child_mutex);
        /*
         * The type-independent attributes go in first: the type specific
         * handler validates that the final attributes match the source
         * attributes once everything relevant has been copied.
         */
        perf_event_modify_copy_attr(&event->attr, attr);
        err = modify(event, attr);
        if (!err) {
                list_for_each_entry(child, &event->child_list, child_list) {
                        perf_event_modify_copy_attr(&child->attr, attr);
                        err = modify(child, attr);
                        if (err)
                                break;
                }
        }
        mutex_unlock(&event->child_mutex);

        return err;
}

/*
 * Schedule out the active groups of one PMU context.
 *
 * @event_type selects the active lists to drain: EVENT_PINNED for
 * pinned_active, EVENT_FLEXIBLE for flexible_active.  When no bit is set
 * only the task_epc bookkeeping at the top is done.
 */
static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
                                enum event_type_t event_type)
{
        struct perf_event_context *ctx = pmu_ctx->ctx;
        struct pmu *pmu = pmu_ctx->pmu;
        struct perf_event *group, *next;

        /* A task context that is no longer active gives up this CPU's task_epc. */
        if (ctx->task && !ctx->is_active) {
                struct perf_cpu_pmu_context *cpu_epc = this_cpu_ptr(pmu->cpu_pmu_context);

                WARN_ON_ONCE(cpu_epc->task_epc && cpu_epc->task_epc != pmu_ctx);
                cpu_epc->task_epc = NULL;
        }

        if (!event_type)
                return;

        perf_pmu_disable(pmu);

        if (event_type & EVENT_PINNED) {
                list_for_each_entry_safe(group, next,
                                         &pmu_ctx->pinned_active,
                                         active_list)
                        group_sched_out(group, ctx);
        }

        if (event_type & EVENT_FLEXIBLE) {
                list_for_each_entry_safe(group, next,
                                         &pmu_ctx->flexible_active,
                                         active_list)
                        group_sched_out(group, ctx);
                /*
                 * All flexible groups are off now, so rotation is no longer
                 * pending; rotate_necessary is set again by merge_sched_in()
                 * when needed.
                 */
                pmu_ctx->rotate_necessary = 0;
        }

        perf_pmu_enable(pmu);
}

/*
 * Clear the @event_type bits from ctx->is_active and schedule out the
 * matching events of every PMU context on ctx->pmu_ctx_list.
 *
 * EVENT_CGROUP in @event_type is not an activity bit: it is stripped and
 * restricts the walk to PMU contexts that have nr_cgroups set.
 *
 * The caller must hold ctx->lock (asserted through lockdep).
 */
static void
ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_pmu_context *pmu_ctx;
        int is_active = ctx->is_active;
        bool cgroup = event_type & EVENT_CGROUP;

        event_type &= ~EVENT_CGROUP;

        lockdep_assert_held(&ctx->lock);

        if (likely(!ctx->nr_events)) {
                /*
                 * See __perf_remove_from_context().
                 */
                WARN_ON_ONCE(ctx->is_active);
                if (ctx->task)
                        WARN_ON_ONCE(cpuctx->task_ctx);
                return;
        }

        /*
         * Always update time if it was set; not only when it changes.
         * Otherwise we can 'forget' to update time for any but the last
         * context we sched out. For example:
         *
         *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
         *   ctx_sched_out(.event_type = EVENT_PINNED)
         *
         * would only update time for the pinned events.
         */
        if (is_active & EVENT_TIME) {
                /* update (and stop) ctx time */
                update_context_time(ctx);
                update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
                /*
                 * CPU-release for the below ->is_active store,
                 * see __load_acquire() in perf_event_time_now()
                 */
                barrier();
        }

        /* Once no EVENT_ALL bit is left, drop the remaining bits as well. */
        ctx->is_active &= ~event_type;
        if (!(ctx->is_active & EVENT_ALL))
                ctx->is_active = 0;

        if (ctx->task) {
                WARN_ON_ONCE(cpuctx->task_ctx != ctx);
                if (!ctx->is_active)
                        cpuctx->task_ctx = NULL;
        }

        is_active ^= ctx->is_active; /* changed bits */

        /* Only the bits that were actually cleared are passed down. */
        list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                if (cgroup && !pmu_ctx->nr_cgroups)
                        continue;
                __pmu_ctx_sched_out(pmu_ctx, is_active);
        }
}

/*
 * Decide whether two contexts are equivalent, meaning both are clones of the
 * same version of the same context.
 *
 * The version is tracked by a generation number in the context which is
 * bumped on every modification; see unclone_ctx(), list_add_event() and
 * list_del_event().
 *
 * Both context locks must be held.  Returns 1 for equivalent contexts and 0
 * otherwise.
 */
static int context_equiv(struct perf_event_context *ctx1,
                         struct perf_event_context *ctx2)
{
        lockdep_assert_held(&ctx1->lock);
        lockdep_assert_held(&ctx2->lock);

        /* A pinned context never takes part in the swap optimization. */
        if (ctx1->pin_count || ctx2->pin_count)
                return 0;

        /* ctx2 is a clone of ctx1 at its current generation. */
        if (ctx2->parent_ctx == ctx1 && ctx2->parent_gen == ctx1->generation)
                return 1;

        /* ctx1 is a clone of ctx2 at its current generation. */
        if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
                return 1;

        /*
         * Both are clones of the same generation of a common parent; the
         * parent hierarchy is flattened, see perf_event_init_context().
         * Anything else is a mismatch.
         */
        return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
               ctx1->parent_gen == ctx2->parent_gen;
}

/*
 * Exchange the count and the enabled/running times of @event and
 * @next_event.  Nothing is done unless @event has attr.inherit_stat set.
 */
static void __perf_event_sync_stat(struct perf_event *event,
                                     struct perf_event *next_event)
{
        u64 incoming, outgoing;

        if (!event->attr.inherit_stat)
                return;

        /*
         * Bring the event value up to date.  perf_event_read() is off
         * limits: this runs in the middle of a context switch with IRQs
         * disabled, which upsets smp_call_function_single().  The event is
         * known to be on the current CPU though, so the PMU can be read
         * directly.
         */
        if (event->state == PERF_EVENT_STATE_ACTIVE)
                event->pmu->read(event);

        perf_event_update_time(event);

        /*
         * The contexts get flipped, so the event values have to be flipped
         * with them to keep per-task stats reliable.
         */
        incoming = local64_read(&next_event->count);
        outgoing = local64_xchg(&event->count, incoming);
        local64_set(&next_event->count, outgoing);

        swap(event->total_time_enabled, next_event->total_time_enabled);
        swap(event->total_time_running, next_event->total_time_running);

        /* The values moved; refresh the user visible data of both events. */
        perf_event_update_userpage(event);
        perf_event_update_userpage(next_event);
}

/*
 * Pair up the events on the event_list of @ctx and @next_ctx, position by
 * position, and sync the stats of each pair.  The walk ends when either list
 * runs out.  Nothing is done when @ctx has no stat events (nr_stat == 0).
 */
static void perf_event_sync_stat(struct perf_event_context *ctx,
                                   struct perf_event_context *next_ctx)
{
        struct perf_event *event, *next_event;

        if (!ctx->nr_stat)
                return;

        update_context_time(ctx);

        for (event = list_first_entry(&ctx->event_list,
                                      struct perf_event, event_entry),
             next_event = list_first_entry(&next_ctx->event_list,
                                           struct perf_event, event_entry);
             &event->event_entry != &ctx->event_list &&
             &next_event->event_entry != &next_ctx->event_list;
             event = list_next_entry(event, event_entry),
             next_event = list_next_entry(next_event, event_entry))
                __perf_event_sync_stat(event, next_event);
}

/*
 * Walk two lists in lock-step: @pos1 iterates over @head1 and @pos2 over
 * @head2, both linked through @member.  The loop ends as soon as either
 * cursor reaches its list head, so surplus entries of the longer list are
 * not visited.
 */
#define double_list_for_each_entry(pos1, pos2, head1, head2, member)        \
        for (pos1 = list_first_entry(head1, typeof(*pos1), member),        \
             pos2 = list_first_entry(head2, typeof(*pos2), member);        \
             !list_entry_is_head(pos1, head1, member) &&                \
             !list_entry_is_head(pos2, head2, member);                        \
             pos1 = list_next_entry(pos1, member),                        \
             pos2 = list_next_entry(pos2, member))

/*
 * Exchange the per-PMU task context data of @prev_ctx and @next_ctx, walking
 * both pmu_ctx_list lists in parallel.  Nothing is done when @prev_ctx has
 * no task data (nr_task_data == 0).
 */
static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
                                          struct perf_event_context *next_ctx)
{
        struct perf_event_pmu_context *prev_epc, *next_epc;

        if (!prev_ctx->nr_task_data)
                return;

        double_list_for_each_entry(prev_epc, next_epc,
                                   &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list,
                                   pmu_ctx_entry) {

                /* Entries at the same position are expected to share a PMU. */
                if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu))
                        continue;

                /* Without a PMU specific hook a plain pointer swap is enough. */
                if (!prev_epc->pmu->swap_task_ctx) {
                        swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
                        continue;
                }

                /*
                 * The PMU specific part of the task context may need extra
                 * synchronization; Intel LBR call stack profiling is an
                 * example of that.
                 */
                prev_epc->pmu->swap_task_ctx(prev_epc, next_epc);
        }
}

/*
 * Invoke pmu::sched_task() for every PMU context of @ctx whose PMU provides
 * the callback and has a non-zero sched_cb_usage on this CPU.
 */
static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in)
{
        struct perf_event_pmu_context *epc;

        list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                struct perf_cpu_pmu_context *cpu_epc;

                cpu_epc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
                if (!cpu_epc->sched_cb_usage || !epc->pmu->sched_task)
                        continue;

                epc->pmu->sched_task(epc, sched_in);
        }
}

/*
 * Schedule out the perf context of @task when switching to @next.
 *
 * If the contexts of @task and @next turn out to be clones of each other
 * (context_equiv()) and neither has pending work, the two contexts are
 * swapped between the tasks instead of being scheduled out and in again.
 * Otherwise the sched_task callbacks are run and all events of the context
 * are scheduled out.
 */
static void
perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
{
        struct perf_event_context *ctx = task->perf_event_ctxp;
        struct perf_event_context *next_ctx;
        struct perf_event_context *parent, *next_parent;
        int do_switch = 1;

        if (likely(!ctx))
                return;

        rcu_read_lock();
        next_ctx = rcu_dereference(next->perf_event_ctxp);
        if (!next_ctx)
                goto unlock;

        parent = rcu_dereference(ctx->parent_ctx);
        next_parent = rcu_dereference(next_ctx->parent_ctx);

        /* If neither context has a parent context, they cannot be clones. */
        if (!parent && !next_parent)
                goto unlock;

        if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
                /*
                 * Looks like the two contexts are clones, so we might be
                 * able to optimize the context switch.  We lock both
                 * contexts and check that they are clones under the
                 * lock (including re-checking that neither has been
                 * uncloned in the meantime).  It doesn't matter which
                 * order we take the locks because no other cpu could
                 * be trying to lock both of these tasks.
                 */
                raw_spin_lock(&ctx->lock);
                raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
                if (context_equiv(ctx, next_ctx)) {

                        perf_ctx_disable(ctx, false);

                        /* PMIs are disabled; ctx->nr_pending is stable. */
                        if (local_read(&ctx->nr_pending) ||
                            local_read(&next_ctx->nr_pending)) {
                                /*
                                 * Must not swap out ctx when there's pending
                                 * events that rely on the ctx->task relation.
                                 *
                                 * Jump into the regular switch path with
                                 * ctx->lock still held and the context still
                                 * disabled; both are undone there.
                                 */
                                raw_spin_unlock(&next_ctx->lock);
                                rcu_read_unlock();
                                goto inside_switch;
                        }

                        WRITE_ONCE(ctx->task, next);
                        WRITE_ONCE(next_ctx->task, task);

                        perf_ctx_sched_task_cb(ctx, false);
                        perf_event_swap_task_ctx_data(ctx, next_ctx);

                        perf_ctx_enable(ctx, false);

                        /*
                         * RCU_INIT_POINTER here is safe because we've not
                         * modified the ctx and the above modification of
                         * ctx->task and ctx->task_ctx_data are immaterial
                         * since those values are always verified under
                         * ctx->lock which we're now holding.
                         */
                        RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
                        RCU_INIT_POINTER(next->perf_event_ctxp, ctx);

                        do_switch = 0;

                        perf_event_sync_stat(ctx, next_ctx);
                }
                raw_spin_unlock(&next_ctx->lock);
                raw_spin_unlock(&ctx->lock);
        }
unlock:
        rcu_read_unlock();

        if (do_switch) {
                raw_spin_lock(&ctx->lock);
                perf_ctx_disable(ctx, false);

inside_switch:
                perf_ctx_sched_task_cb(ctx, false);
                task_ctx_sched_out(ctx, EVENT_ALL);

                perf_ctx_enable(ctx, false);
                raw_spin_unlock(&ctx->lock);
        }
}

/*
 * sched_cb_list: per-CPU list of the perf_cpu_pmu_context entries (linked
 * through sched_cb_entry) that currently have a non-zero sched_cb_usage.
 *
 * perf_sched_cb_usages: per-CPU count maintained by perf_sched_cb_inc() and
 * perf_sched_cb_dec(); the task sched in/out hooks only call
 * perf_pmu_sched_task() while it is non-zero.
 */
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);

/*
 * Drop one sched_task callback user of @pmu on this CPU.
 *
 * The per-CPU perf_sched_cb_usages count is decremented first; when the
 * last user of @pmu goes away its per-CPU context is also unlinked from
 * sched_cb_list.
 */
void perf_sched_cb_dec(struct pmu *pmu)
{
        struct perf_cpu_pmu_context *cpu_epc = this_cpu_ptr(pmu->cpu_pmu_context);

        this_cpu_dec(perf_sched_cb_usages);
        barrier();

        --cpu_epc->sched_cb_usage;
        if (cpu_epc->sched_cb_usage)
                return;

        list_del(&cpu_epc->sched_cb_entry);
}


/*
 * Register one more sched_task callback user of @pmu on this CPU.
 *
 * The first user of @pmu links its per-CPU context into sched_cb_list; the
 * per-CPU perf_sched_cb_usages count is incremented last.
 */
void perf_sched_cb_inc(struct pmu *pmu)
{
        struct perf_cpu_pmu_context *cpu_epc = this_cpu_ptr(pmu->cpu_pmu_context);

        ++cpu_epc->sched_cb_usage;
        if (cpu_epc->sched_cb_usage == 1)
                list_add(&cpu_epc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));

        barrier();
        this_cpu_inc(perf_sched_cb_usages);
}

/*
 * Deliver the context switch callback to the lower code layer for one PMU.
 * This is only reached while the context switch callback is in use.
 *
 * Per-cpu events need this callback too: multi event PEBS, for instance,
 * uses it to provide PID/TID information, which means all queued PEBS
 * records have to be flushed before switching to a new task.
 */
static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct pmu *pmu = cpc->epc.pmu;

        /* software PMUs will not have sched_task */
        if (WARN_ON_ONCE(!pmu->sched_task))
                return;

        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(pmu);

        pmu->sched_task(cpc->task_epc, sched_in);

        perf_pmu_enable(pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

/*
 * Run the sched_task callback of every PMU context queued on this CPU's
 * sched_cb_list.  Nothing is done when @prev and @next are the same task or
 * when a task context is installed on this CPU.
 */
static void perf_pmu_sched_task(struct task_struct *prev,
                                struct task_struct *next,
                                bool sched_in)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_cpu_pmu_context *cpu_epc;

        if (prev == next)
                return;

        /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
        if (cpuctx->task_ctx)
                return;

        list_for_each_entry(cpu_epc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
                __perf_pmu_sched_task(cpu_epc, sched_in);
}

/*
 * Forward declaration: perf_event_switch() is called by the task sched
 * in/out hooks below.
 */
static void perf_event_switch(struct task_struct *task,
                              struct task_struct *next_prev, bool sched_in);

/*
 * Called from the scheduler to remove the events of the current task,
 * with interrupts disabled.
 *
 * We stop each event and update the event value in event->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of the event _before_
 * accessing the event control register. If an NMI hits, then it will
 * not restart the event.
 *
 * @task is the task being switched out, @next the task being switched to.
 */
void __perf_event_task_sched_out(struct task_struct *task,
                                 struct task_struct *next)
{
        /* Only call into the PMUs when a sched_task user exists on this CPU. */
        if (__this_cpu_read(perf_sched_cb_usages))
                perf_pmu_sched_task(task, next, false);

        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, next, false);

        perf_event_context_sched_out(task, next);

        /*
         * If cgroup events exist on this CPU, then we need
         * to check if we have to switch out PMU state.
         * cgroup events are system-wide mode only.
         */
        perf_cgroup_switch(next);
}

/*
 * min_heap comparison callback: @l and @r each point at a struct perf_event
 * pointer; returns true when the left event has the smaller group_index.
 */
static bool perf_less_group_idx(const void *l, const void *r)
{
        const struct perf_event *const *lhs = l;
        const struct perf_event *const *rhs = r;

        return (*lhs)->group_index < (*rhs)->group_index;
}

/*
 * min_heap swap callback: @l and @r each point at a pointer-sized heap slot;
 * the two slot values are exchanged.
 */
static void swap_ptr(void *l, void *r)
{
        void **left = l;
        void **right = r;
        void *held = *left;

        *left = *right;
        *right = held;
}

/*
 * min_heap callbacks for a heap whose elements are struct perf_event
 * pointers, ordered by ascending group_index (perf_less_group_idx()) and
 * exchanged with swap_ptr().
 */
static const struct min_heap_callbacks perf_min_heap = {
        .elem_size = sizeof(struct perf_event *),
        .less = perf_less_group_idx,
        .swp = swap_ptr,
};

/*
 * Append @event to the backing array of @heap and bump heap->nr.  A NULL
 * @event is ignored.  No bounds check and no heapify is done here; see
 * visit_groups_merge() for the heapify step.
 */
static void __heap_add(struct min_heap *heap, struct perf_event *event)
{
        struct perf_event **slots = heap->data;

        if (!event)
                return;

        slots[heap->nr++] = event;
}

/*
 * Record @pmu_ctx as the task PMU context (task_epc) of its PMU on this CPU.
 * PMU contexts that do not belong to a task context are left alone.
 */
static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
{
        struct perf_cpu_pmu_context *cpu_epc;

        if (pmu_ctx->ctx->task) {
                cpu_epc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
                /* A different task_epc must not be installed already. */
                WARN_ON_ONCE(cpu_epc->task_epc && cpu_epc->task_epc != pmu_ctx);
                cpu_epc->task_epc = pmu_ctx;
        }
}

/*
 * Call @func(event, @data) on the group events of @pmu in @groups that are
 * relevant for @cpu, merging several per-key iterators through a min-heap
 * ordered on group_index.
 *
 * The iterators seeded are: the any-CPU (-1) events for task contexts, the
 * @cpu events, and - for CPU contexts with CONFIG_CGROUP_PERF - one per
 * cgroup on the way up from cpuctx->cgrp.
 *
 * Returns 0 when the walk completed or when pmu::filter() rejects @cpu;
 * otherwise the first non-zero value returned by @func, which ends the walk.
 */
static noinline int visit_groups_merge(struct perf_event_context *ctx,
                                struct perf_event_groups *groups, int cpu,
                                struct pmu *pmu,
                                int (*func)(struct perf_event *, void *),
                                void *data)
{
#ifdef CONFIG_CGROUP_PERF
        struct cgroup_subsys_state *css = NULL;
#endif
        struct perf_cpu_context *cpuctx = NULL;
        /* Space for per CPU and/or any CPU event iterators. */
        struct perf_event *itrs[2];
        struct min_heap event_heap;
        struct perf_event **evt;
        int ret;

        if (pmu->filter && pmu->filter(pmu, cpu))
                return 0;

        if (!ctx->task) {
                /* CPU context: use the heap storage kept in the cpuctx. */
                cpuctx = this_cpu_ptr(&perf_cpu_context);
                event_heap = (struct min_heap){
                        .data = cpuctx->heap,
                        .nr = 0,
                        .size = cpuctx->heap_size,
                };

                lockdep_assert_held(&cpuctx->ctx.lock);

#ifdef CONFIG_CGROUP_PERF
                if (cpuctx->cgrp)
                        css = &cpuctx->cgrp->css;
#endif
        } else {
                /* Task context: the two on-stack slots are sufficient. */
                event_heap = (struct min_heap){
                        .data = itrs,
                        .nr = 0,
                        .size = ARRAY_SIZE(itrs),
                };
                /* Events not within a CPU context may be on any CPU. */
                __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
        }
        /* evt points at slot 0 of the heap storage, i.e. the heap's root. */
        evt = event_heap.data;

        __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));

#ifdef CONFIG_CGROUP_PERF
        for (; css; css = css->parent)
                __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
#endif

        if (event_heap.nr) {
                __link_epc((*evt)->pmu_ctx);
                perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
        }

        min_heapify_all(&event_heap, &perf_min_heap);

        while (event_heap.nr) {
                ret = func(*evt, data);
                if (ret)
                        return ret;

                /*
                 * Advance the iterator at the root; re-sift it if it has a
                 * successor, otherwise remove it from the heap.
                 */
                *evt = perf_event_groups_next(*evt, pmu);
                if (*evt)
                        min_heapify(&event_heap, 0, &perf_min_heap);
                else
                        min_heap_pop(&event_heap, &perf_min_heap);
        }

        return 0;
}

/*
 * Because the userpage is strictly per-event (there is no concept of context,
 * so there cannot be a context indirection), every userpage must be updated
 * when context time starts :-(
 *
 * IOW, we must not miss EVENT_TIME edges.
 *
 * Returns false, without touching anything, when event->mmap_count is zero;
 * otherwise updates the event time and the userpage and returns true.
 */
static inline bool event_update_userpage(struct perf_event *event)
{
        if (likely(!atomic_read(&event->mmap_count)))
                return false;

        perf_event_update_time(event);
        perf_event_update_userpage(event);

        return true;
}

/*
 * Update the userpage of @group_event and, if that event has its page
 * mapped, of every sibling in its group as well.  The siblings are skipped
 * when the leader's update reports that nothing is mapped.
 */
static inline void group_update_userpage(struct perf_event *group_event)
{
        struct perf_event *sibling;

        if (event_update_userpage(group_event)) {
                for_each_sibling_event(sibling, group_event)
                        event_update_userpage(sibling);
        }
}

/*
 * visit_groups_merge() callback: try to schedule in the group led by @event.
 *
 * @data points at an int that says whether hardware events may still be
 * added; it is cleared once a group is found to be still INACTIVE after the
 * attempt.  A pinned group left INACTIVE is put into ERROR state; a
 * flexible one flags rotation as necessary and restarts the multiplexing
 * hrtimer.
 *
 * Always returns 0, so the walk over the groups is never cut short.
 */
static int merge_sched_in(struct perf_event *event, void *data)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_pmu_context *cpu_epc;
        int *hw_allowed = data;

        if (event->state <= PERF_EVENT_STATE_OFF || !event_filter_match(event))
                return 0;

        if (group_can_go_on(event, *hw_allowed) && !group_sched_in(event, ctx))
                list_add_tail(&event->active_list, get_event_list(event));

        /* Anything but INACTIVE needs no further handling. */
        if (event->state != PERF_EVENT_STATE_INACTIVE)
                return 0;

        *hw_allowed = 0;

        if (event->attr.pinned) {
                perf_cgroup_event_disable(event, ctx);
                perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
                return 0;
        }

        event->pmu_ctx->rotate_necessary = 1;
        cpu_epc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context);
        perf_mux_hrtimer_restart(cpu_epc);
        group_update_userpage(event);

        return 0;
}

/*
 * Schedule in the groups of @pmu found in @groups that are relevant for the
 * current CPU, by running merge_sched_in() over them.  The walk starts out
 * with hardware events allowed.
 */
static void pmu_groups_sched_in(struct perf_event_context *ctx,
                                struct perf_event_groups *groups,
                                struct pmu *pmu)
{
        int hw_allowed = 1;

        visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
                           merge_sched_in, &hw_allowed);
}

/*
 * Schedule in @groups for every PMU context of @ctx.  With @cgroup set, PMU
 * contexts without cgroup events (nr_cgroups == 0) are skipped.
 */
static void ctx_groups_sched_in(struct perf_event_context *ctx,
                                struct perf_event_groups *groups,
                                bool cgroup)
{
        struct perf_event_pmu_context *epc;

        list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                if (!cgroup || epc->nr_cgroups)
                        pmu_groups_sched_in(ctx, groups, epc->pmu);
        }
}

/*
 * Schedule in the flexible groups of @ctx for a single @pmu; pinned groups
 * are not touched here.
 */
static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
                               struct pmu *pmu)
{
        pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
}

/*
 * Mark @event_type (plus EVENT_TIME) active in @ctx and schedule in the
 * groups whose type bit was newly set: pinned groups first, then flexible
 * ones.  Context time is started if it was not running yet.
 *
 * EVENT_CGROUP in @event_type is not an activity bit: it is stripped and
 * restricts the walk to PMU contexts that have cgroup events.
 *
 * The caller must hold ctx->lock (asserted through lockdep).
 */
static void
ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        int is_active = ctx->is_active;
        bool cgroup = event_type & EVENT_CGROUP;

        event_type &= ~EVENT_CGROUP;

        lockdep_assert_held(&ctx->lock);

        if (likely(!ctx->nr_events))
                return;

        if (!(is_active & EVENT_TIME)) {
                /* start ctx time */
                __update_context_time(ctx, false);
                perf_cgroup_set_timestamp(cpuctx);
                /*
                 * CPU-release for the below ->is_active store,
                 * see __load_acquire() in perf_event_time_now()
                 */
                barrier();
        }

        ctx->is_active |= (event_type | EVENT_TIME);
        if (ctx->task) {
                /* A task context going active becomes this CPU's task_ctx. */
                if (!is_active)
                        cpuctx->task_ctx = ctx;
                else
                        WARN_ON_ONCE(cpuctx->task_ctx != ctx);
        }

        is_active ^= ctx->is_active; /* changed bits */

        /*
         * First go through the list and put on any pinned groups
         * in order to give them the best chance of going on.
         */
        if (is_active & EVENT_PINNED)
                ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);

        /* Then walk through the lower prio flexible groups */
        if (is_active & EVENT_FLEXIBLE)
                ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
}

/*
 * Schedule in the perf context of @task on this CPU.
 *
 * When that context is already installed as cpuctx->task_ctx only the
 * sched_task callbacks are run.  Otherwise, provided the context has events,
 * its events are scheduled in; if it carries pinned groups, the flexible
 * events of the CPU context are scheduled out first so that the priority
 * order described below is kept.
 */
static void perf_event_context_sched_in(struct task_struct *task)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *ctx;

        rcu_read_lock();
        ctx = rcu_dereference(task->perf_event_ctxp);
        if (!ctx)
                goto rcu_unlock;

        if (cpuctx->task_ctx == ctx) {
                perf_ctx_lock(cpuctx, ctx);
                perf_ctx_disable(ctx, false);

                perf_ctx_sched_task_cb(ctx, true);

                perf_ctx_enable(ctx, false);
                perf_ctx_unlock(cpuctx, ctx);
                goto rcu_unlock;
        }

        perf_ctx_lock(cpuctx, ctx);
        /*
         * We must check ctx->nr_events while holding ctx->lock, such
         * that we serialize against perf_install_in_context().
         */
        if (!ctx->nr_events)
                goto unlock;

        perf_ctx_disable(ctx, false);
        /*
         * We want to keep the following priority order:
         * cpu pinned (that don't need to move), task pinned,
         * cpu flexible, task flexible.
         *
         * However, if task's ctx is not carrying any pinned
         * events, no need to flip the cpuctx's events around.
         */
        if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
                perf_ctx_disable(&cpuctx->ctx, false);
                ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
        }

        perf_event_sched_in(cpuctx, ctx);

        perf_ctx_sched_task_cb(cpuctx->task_ctx, true);

        /* Pairs with the perf_ctx_disable(&cpuctx->ctx) above. */
        if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
                perf_ctx_enable(&cpuctx->ctx, false);

        perf_ctx_enable(ctx, false);

unlock:
        perf_ctx_unlock(cpuctx, ctx);
rcu_unlock:
        rcu_read_unlock();
}

/*
 * Called from the scheduler to add the events of the current task,
 * with interrupts disabled.
 *
 * We restore the event value and then enable it.
 *
 * This does not protect us against NMI, but enable()
 * sets the enabled bit in the control field of the event _before_
 * accessing the event control register. If an NMI hits, then it will
 * keep the event running.
 *
 * @prev is the task that was switched out, @task the task being switched in.
 * The steps run in the reverse order of __perf_event_task_sched_out().
 */
void __perf_event_task_sched_in(struct task_struct *prev,
                                struct task_struct *task)
{
        perf_event_context_sched_in(task);

        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, prev, true);

        /* Only call into the PMUs when a sched_task user exists on this CPU. */
        if (__this_cpu_read(perf_sched_cb_usages))
                perf_pmu_sched_task(prev, task, true);
}

/*
 * Compute the sample period needed to reach event->attr.sample_freq, given
 * that @count events were observed during @nsec nanoseconds.
 *
 * Neither product of the fraction below is guaranteed to fit in 64 bits, so
 * precision is given up one bit at a time (tracked through the *_fls bit
 * widths) until both the dividend and the divisor fit in a u64.
 *
 * Returns the quotient, or the dividend itself when the divisor is zero.
 */
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
{
        u64 frequency = event->attr.sample_freq;
        u64 sec = NSEC_PER_SEC;
        u64 divisor, dividend;

        int count_fls, nsec_fls, frequency_fls, sec_fls;

        count_fls = fls64(count);
        nsec_fls = fls64(nsec);
        frequency_fls = fls64(frequency);
        /* Hard-coded bit width of @sec; presumably fls64(NSEC_PER_SEC). */
        sec_fls = 30;

        /*
         * We got @count in @nsec, with a target of sample_freq HZ
         * the target period becomes:
         *
         *             @count * 10^9
         * period = -------------------
         *          @nsec * sample_freq
         *
         */

        /*
         * Reduce accuracy by one bit such that @a and @b converge
         * to a similar magnitude.
         *
         * Relies on the naming convention that every operand X has a
         * matching X_fls variable holding its current bit width.
         */
#define REDUCE_FLS(a, b)                \
do {                                        \
        if (a##_fls > b##_fls) {        \
                a >>= 1;                \
                a##_fls--;                \
        } else {                        \
                b >>= 1;                \
                b##_fls--;                \
        }                                \
} while (0)

        /*
         * Reduce accuracy until either term fits in a u64, then proceed with
         * the other, so that finally we can do a u64/u64 division.
         *
         * While both products overflow, drop one bit from the divisor terms
         * and one bit from the dividend terms per iteration, which keeps
         * the ratio roughly unchanged.
         */
        while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
                REDUCE_FLS(nsec, frequency);
                REDUCE_FLS(sec, count);
        }

        if (count_fls + sec_fls > 64) {
                /* The divisor fits now; only the dividend still overflows. */
                divisor = nsec * frequency;

                /*
                 * Every bit dropped from a dividend term is matched by
                 * halving the already-computed divisor.
                 */
                while (count_fls + sec_fls > 64) {
                        REDUCE_FLS(count, sec);
                        divisor >>= 1;
                }

                dividend = count * sec;
        } else {
                /* The dividend fits; the divisor may still overflow. */
                dividend = count * sec;

                /*
                 * Every bit dropped from a divisor term is matched by
                 * halving the already-computed dividend.
                 */
                while (nsec_fls + frequency_fls > 64) {
                        REDUCE_FLS(nsec, frequency);
                        dividend >>= 1;
                }

                divisor = nsec * frequency;
        }

        /* Avoid dividing by zero; hand back the dividend unchanged. */
        if (!divisor)
                return dividend;

        return div64_u64(dividend, divisor);
}

/*
 * Per-CPU throttling bookkeeping. On every call, perf_event_task_tick()
 * resets perf_throttled_count to zero (using the old value to decide whether
 * to unthrottle) and increments perf_throttled_seq.
 */
static DEFINE_PER_CPU(int, perf_throttled_count);
static DEFINE_PER_CPU(u64, perf_throttled_seq);

/*
 * Move @event's sample period towards the period that would produce the
 * requested sample frequency, given @count events seen in @nsec nanoseconds.
 *
 * @disable: when true, the PMU is stopped and restarted around the
 * period_left reset; callers that already stopped the event pass false.
 */
static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
{
        struct hw_perf_event *hwc = &event->hw;
        s64 target = perf_calculate_period(event, nsec, count);
        s64 step, next_period;

        /* low pass filter: only move 1/8th of the way (rounded) per call */
        step = ((s64)(target - hwc->sample_period) + 7) / 8;

        next_period = hwc->sample_period + step;
        if (!next_period)
                next_period = 1;        /* never program a zero period */

        hwc->sample_period = next_period;

        /* Nothing more to do unless the remaining period is far too long. */
        if (local64_read(&hwc->period_left) <= 8*next_period)
                return;

        if (disable)
                event->pmu->stop(event, PERF_EF_UPDATE);

        local64_set(&hwc->period_left, 0);

        if (disable)
                event->pmu->start(event, PERF_EF_RELOAD);
}

/*
 * Walk @event_list (linked through perf_event::active_list) and, for each
 * active and matching event, clear a pending MAX_INTERRUPTS mark and, for
 * events in frequency mode, re-tune the sample period from the count
 * accumulated since the previous pass.
 */
static void perf_adjust_freq_unthr_events(struct list_head *event_list)
{
        const u64 tick_period = TICK_NSEC;
        struct perf_event *event;
        struct hw_perf_event *hwc;
        u64 count;
        s64 delta;

        list_for_each_entry(event, event_list, active_list) {
                /* XXX use visit thingy to avoid the -1,cpu match */
                if (event->state != PERF_EVENT_STATE_ACTIVE ||
                    !event_filter_match(event))
                        continue;

                hwc = &event->hw;

                if (hwc->interrupts == MAX_INTERRUPTS) {
                        hwc->interrupts = 0;
                        perf_log_throttle(event, 1);
                        /* freq-mode events get restarted further down */
                        if (!(event->attr.freq && event->attr.sample_freq))
                                event->pmu->start(event, 0);
                }

                if (!(event->attr.freq && event->attr.sample_freq))
                        continue;

                /* stop the event so that event->count is up to date */
                event->pmu->stop(event, PERF_EF_UPDATE);

                count = local64_read(&event->count);
                delta = count - hwc->freq_count_stamp;
                hwc->freq_count_stamp = count;

                /*
                 * Restart the event, reloading only if the count moved.
                 * The event is already stopped here, so tell
                 * perf_adjust_period() not to stop it a second time.
                 */
                if (delta > 0) {
                        perf_adjust_period(event, tick_period, delta, false);
                        event->pmu->start(event, PERF_EF_RELOAD);
                } else {
                        event->pmu->start(event, 0);
                }
        }
}

/*
 * combine freq adjustment with unthrottling to avoid two passes over the
 * events. At the same time, make sure, having freq events does not change
 * the rate of unthrottling as that would introduce bias.
 */
static void
perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
{
        struct perf_event_pmu_context *epc;
        struct pmu *pmu;

        /*
         * Walking the events is only needed when:
         * - the context has events in frequency mode (needs freq adjust), or
         * - there are events to unthrottle on this cpu
         */
        if (!ctx->nr_freq && !unthrottle)
                return;

        raw_spin_lock(&ctx->lock);

        list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                /* same test as above, per PMU context */
                if (!epc->nr_freq && !unthrottle)
                        continue;
                if (!perf_pmu_ctx_is_active(epc))
                        continue;

                pmu = epc->pmu;
                if (pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
                        continue;

                /* adjust both active lists with the PMU disabled */
                perf_pmu_disable(pmu);
                perf_adjust_freq_unthr_events(&epc->pinned_active);
                perf_adjust_freq_unthr_events(&epc->flexible_active);
                perf_pmu_enable(pmu);
        }

        raw_spin_unlock(&ctx->lock);
}

/*
 * Move @event to the tail of the @ctx's eligible events.
 */
static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
{
        /*
         * Rotation might be disabled by the inheritance code; otherwise
         * re-insert @event into the non-pinned (flexible) groups by
         * deleting it and inserting it again.
         */
        if (!ctx->rotate_disable) {
                perf_event_groups_delete(&ctx->flexible_groups, event);
                perf_event_groups_insert(&ctx->flexible_groups, event);
        }
}

/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
{
        struct perf_event *event;
        struct rb_node *node;
        struct rb_root *tree;
        struct __group_key key = {
                .pmu = pmu_ctx->pmu,
        };

        /* pick the first active flexible event */
        event = list_first_entry_or_null(&pmu_ctx->flexible_active,
                                         struct perf_event, active_list);
        if (event)
                goto out;

        /* if no active flexible event, pick the first event */
        tree = &pmu_ctx->ctx->flexible_groups.tree;

        if (!pmu_ctx->ctx->task) {
                key.cpu = smp_processor_id();

                node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
                if (node)
                        event = __node_2_pe(node);
                goto out;
        }

        key.cpu = -1;
        node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
        if (node) {
                event = __node_2_pe(node);
                goto out;
        }

        key.cpu = smp_processor_id();
        node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
        if (node)
                event = __node_2_pe(node);

out:
        /*
         * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
         * finds there are unschedulable events, it will set it again.
         */
        pmu_ctx->rotate_necessary = 0;

        return event;
}

/*
 * Rotate the flexible events of the PMU described by @cpc, for both its CPU
 * PMU context (cpc->epc) and, if present, the task PMU context
 * (cpc->task_epc).
 *
 * Returns false without taking any lock when neither context has
 * rotate_necessary set, true otherwise.
 */
static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
        struct perf_event *cpu_event = NULL, *task_event = NULL;
        int cpu_rotate, task_rotate;
        struct pmu *pmu;

        /*
         * Since we run this from IRQ context, nobody can install new
         * events, thus the event count values are stable.
         */

        cpu_epc = &cpc->epc;
        pmu = cpu_epc->pmu;
        task_epc = cpc->task_epc;

        cpu_rotate = cpu_epc->rotate_necessary;
        task_rotate = task_epc ? task_epc->rotate_necessary : 0;

        if (!(cpu_rotate || task_rotate))
                return false;

        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(pmu);

        /*
         * Pick the events to rotate; ctx_event_to_rotate() also clears
         * rotate_necessary on the PMU context it is given.
         */
        if (task_rotate)
                task_event = ctx_event_to_rotate(task_epc);
        if (cpu_rotate)
                cpu_event = ctx_event_to_rotate(cpu_epc);

        /*
         * As per the order given at ctx_resched() first 'pop' task flexible
         * and then, if needed CPU flexible.
         *
         * The task side is also scheduled out when only a CPU event rotates
         * (task_epc && cpu_event), and scheduled back in at the end.
         */
        if (task_event || (task_epc && cpu_event)) {
                update_context_time(task_epc->ctx);
                __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
        }

        if (cpu_event) {
                update_context_time(&cpuctx->ctx);
                __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
                rotate_ctx(&cpuctx->ctx, cpu_event);
                __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
        }

        if (task_event)
                rotate_ctx(task_epc->ctx, task_event);

        /* Same condition as the task sched-out above. */
        if (task_event || (task_epc && cpu_event))
                __pmu_ctx_sched_in(task_epc->ctx, pmu);

        perf_pmu_enable(pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);

        return true;
}

void perf_event_task_tick(void)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *ctx;
        int throttled;

        lockdep_assert_irqs_disabled();

        __this_cpu_inc(perf_throttled_seq);
        throttled = __this_cpu_xchg(perf_throttled_count, 0);
        tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);

        perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);

        rcu_read_lock();
        ctx = rcu_dereference(current->perf_event_ctxp);
        if (ctx)
                perf_adjust_freq_unthr_context(ctx, !!throttled);
        rcu_read_unlock();
}

/*
 * Consume @event's enable_on_exec flag.
 *
 * Returns 1 if the event was moved to PERF_EVENT_STATE_INACTIVE, 0 if the
 * flag was not set or the event was already at or above that state.
 * @ctx is not used here.
 */
static int event_enable_on_exec(struct perf_event *event,
                                struct perf_event_context *ctx)
{
        int enabled = 0;

        if (event->attr.enable_on_exec) {
                /* one-shot: the flag is cleared whether or not we enable */
                event->attr.enable_on_exec = 0;

                if (event->state < PERF_EVENT_STATE_INACTIVE) {
                        perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
                        enabled = 1;
                }
        }

        return enabled;
}

/*
 * Enable all of a task's events that have been marked enable-on-exec.
 * This expects task == current.
 */
static void perf_event_enable_on_exec(struct perf_event_context *ctx)
{
        struct perf_event_context *clone_ctx = NULL;
        enum event_type_t event_type = 0;
        struct perf_cpu_context *cpuctx;
        struct perf_event *event;
        unsigned long flags;
        int enabled = 0;

        local_irq_save(flags);
        if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
                goto out;

        if (!ctx->nr_events)
                goto out;

        cpuctx = this_cpu_ptr(&perf_cpu_context);
        perf_ctx_lock(cpuctx, ctx);
        ctx_sched_out(ctx, EVENT_TIME);

        list_for_each_entry(event, &ctx->event_list, event_entry) {
                enabled |= event_enable_on_exec(event, ctx);
                event_type |= get_event_type(event);
        }

        /*
         * Unclone and reschedule this context if we enabled any event.
         */
        if (enabled) {
                clone_ctx = unclone_ctx(ctx);
                ctx_resched(cpuctx, ctx, event_type);
        } else {
                ctx_sched_in(ctx, EVENT_TIME);
        }
        perf_ctx_unlock(cpuctx, ctx);

out:
        local_irq_restore(flags);

        if (clone_ctx)
                put_ctx(clone_ctx);
}

static void perf_remove_from_owner(struct perf_event *event);
static void perf_event_exit_event(struct perf_event *event,
                                  struct perf_event_context *ctx);

/*
 * Removes all events from the current task that have been marked
 * remove-on-exec, and feeds their values back to parent events.
 */
static void perf_event_remove_on_exec(struct perf_event_context *ctx)
{
        struct perf_event_context *clone_ctx = NULL;
        struct perf_event *event, *tmp;
        unsigned long flags;
        bool removed_any = false;

        mutex_lock(&ctx->mutex);

        if (WARN_ON_ONCE(ctx->task != current))
                goto unlock;

        /* _safe iteration: the current event is handed to perf_event_exit_event(). */
        list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
                if (event->attr.remove_on_exec) {
                        if (!is_kernel_event(event))
                                perf_remove_from_owner(event);

                        removed_any = true;

                        perf_event_exit_event(event, ctx);
                }
        }

        /* Unclone under ctx->lock, but only if something was removed. */
        raw_spin_lock_irqsave(&ctx->lock, flags);
        if (removed_any)
                clone_ctx = unclone_ctx(ctx);
        raw_spin_unlock_irqrestore(&ctx->lock, flags);

unlock:
        mutex_unlock(&ctx->mutex);

        /* The reference is dropped only after every lock is released. */
        if (clone_ctx)
                put_ctx(clone_ctx);
}

/*
 * Argument block handed to __perf_event_read() through
 * smp_call_function_single().
 */
struct perf_read_data {
        /* event to read */
        struct perf_event *event;
        /* when true, the event's siblings are read as well */
        bool group;
        /* written by __perf_event_read(): 0 or the pmu->commit_txn() result */
        int ret;
};

/*
 * Choose the CPU on which @event should be read.
 *
 * Returns @event_cpu unchanged unless it is a valid CPU number, the group
 * has PERF_EV_CAP_READ_ACTIVE_PKG, and the local CPU is in the same
 * physical package as @event_cpu; in that case the local CPU is returned.
 */
static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
        u16 event_pkg, local_pkg;
        int local_cpu;

        /* Out-of-range values (such as -1) are passed straight through. */
        if ((unsigned)event_cpu >= nr_cpu_ids)
                return event_cpu;

        if (!(event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG))
                return event_cpu;

        local_cpu = smp_processor_id();
        event_pkg = topology_physical_package_id(event_cpu);
        local_pkg = topology_physical_package_id(local_cpu);

        return event_pkg == local_pkg ? local_cpu : event_cpu;
}

/*
 * Cross CPU call to read the hardware event
 */
static void __perf_event_read(void *info)
{
        /* @info is the struct perf_read_data passed by perf_event_read(). */
        struct perf_read_data *data = info;
        struct perf_event *sub, *event = data->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct pmu *pmu = event->pmu;

        /*
         * If this is a task context, we need to check whether it is
         * the current task context of this cpu.  If not it has been
         * scheduled out before the smp call arrived.  In that case
         * event->count would have been updated to a recent sample
         * when the event was scheduled out.
         */
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;

        raw_spin_lock(&ctx->lock);
        /* Context time is only refreshed while the context has EVENT_TIME active. */
        if (ctx->is_active & EVENT_TIME) {
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
        }

        perf_event_update_time(event);
        if (data->group)
                perf_event_update_sibling_time(event);

        /* Not active: leave data->ret as the caller initialised it. */
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                goto unlock;

        /* Single-event read: no PMU transaction needed. */
        if (!data->group) {
                pmu->read(event);
                data->ret = 0;
                goto unlock;
        }

        /* Group read: leader and active siblings inside one read transaction. */
        pmu->start_txn(pmu, PERF_PMU_TXN_READ);

        pmu->read(event);

        for_each_sibling_event(sub, event) {
                if (sub->state == PERF_EVENT_STATE_ACTIVE) {
                        /*
                         * Use sibling's PMU rather than @event's since
                         * sibling could be on different (eg: software) PMU.
                         */
                        sub->pmu->read(sub);
                }
        }

        /* The transaction result becomes the read's return value. */
        data->ret = pmu->commit_txn(pmu);

unlock:
        raw_spin_unlock(&ctx->lock);
}

/* Return @event's own count plus its accumulated child_count. */
static inline u64 perf_event_count(struct perf_event *event)
{
        u64 self = local64_read(&event->count);
        u64 children = atomic64_read(&event->child_count);

        return self + children;
}

/*
 * Sample perf_clock() into *@now and derive @event's enabled/running times
 * for that instant into *@enabled and *@running.
 */
static void calc_timer_values(struct perf_event *event,
                                u64 *now,
                                u64 *enabled,
                                u64 *running)
{
        u64 stamp = perf_clock();

        *now = stamp;
        __perf_update_times(event, perf_event_time_now(event, stamp),
                            enabled, running);
}

/*
 * NMI-safe method to read a local event, that is an event that
 * is:
 *   - either for the current task, or for this CPU
 *   - does not have inherit set, for inherited task events
 *     will not be local and we cannot read them atomically
 *   - must not have a pmu::count method
 */
int perf_event_read_local(struct perf_event *event, u64 *value,
                          u64 *enabled, u64 *running)
{
        unsigned long flags;
        int event_oncpu;
        int event_cpu;
        int ret = 0;

        /*
         * Returns 0 on success, or -EOPNOTSUPP, -EINVAL or -EBUSY as set
         * below. On every error path *value, *enabled and *running are
         * left untouched. @enabled and @running may each be NULL.
         */

        /*
         * Disabling interrupts avoids all counter scheduling (context
         * switches, timer based rotation and IPIs).
         */
        local_irq_save(flags);

        /*
         * It must not be an event with inherit set, we cannot read
         * all child counters from atomic context.
         */
        if (event->attr.inherit) {
                ret = -EOPNOTSUPP;
                goto out;
        }

        /* If this is a per-task event, it must be for current */
        if ((event->attach_state & PERF_ATTACH_TASK) &&
            event->hw.target != current) {
                ret = -EINVAL;
                goto out;
        }

        /*
         * Get the event CPU numbers, and adjust them to local if the event is
         * a per-package event that can be read locally
         */
        event_oncpu = __perf_event_read_cpu(event, event->oncpu);
        event_cpu = __perf_event_read_cpu(event, event->cpu);

        /* If this is a per-CPU event, it must be for this CPU */
        if (!(event->attach_state & PERF_ATTACH_TASK) &&
            event_cpu != smp_processor_id()) {
                ret = -EINVAL;
                goto out;
        }

        /* If this is a pinned event it must be running on this CPU */
        if (event->attr.pinned && event_oncpu != smp_processor_id()) {
                ret = -EBUSY;
                goto out;
        }

        /*
         * If the event is currently on this CPU, it's either a per-task event,
         * or local to this CPU. Furthermore it means it's ACTIVE (otherwise
         * oncpu == -1).
         */
        if (event_oncpu == smp_processor_id())
                event->pmu->read(event);

        /* Only the event's own count is reported; child_count is not added. */
        *value = local64_read(&event->count);
        if (enabled || running) {
                u64 __enabled, __running, __now;

                /* Compute both times once, then store only what was asked for. */
                calc_timer_values(event, &__now, &__enabled, &__running);
                if (enabled)
                        *enabled = __enabled;
                if (running)
                        *running = __running;
        }
out:
        local_irq_restore(flags);

        return ret;
}

/*
 * Bring @event's count and times up to date; with @group set, the sibling
 * times are updated too and siblings are read as part of the cross-CPU call.
 *
 * ACTIVE events are read through __perf_event_read() on the CPU they run
 * on; INACTIVE events only get their times updated under ctx->lock. Any
 * other state is left alone.
 *
 * Returns 0, or the value __perf_event_read() stored in data.ret.
 */
static int perf_event_read(struct perf_event *event, bool group)
{
        enum perf_event_state state = READ_ONCE(event->state);
        int event_cpu, ret = 0;

        /*
         * If event is enabled and currently active on a CPU, update the
         * value in the event structure:
         */
again:
        if (state == PERF_EVENT_STATE_ACTIVE) {
                struct perf_read_data data;

                /*
                 * Orders the ->state and ->oncpu loads such that if we see
                 * ACTIVE we must also see the right ->oncpu.
                 *
                 * Matches the smp_wmb() from event_sched_in().
                 */
                smp_rmb();

                /* An out-of-range oncpu (such as -1) means nothing to read. */
                event_cpu = READ_ONCE(event->oncpu);
                if ((unsigned)event_cpu >= nr_cpu_ids)
                        return 0;

                data = (struct perf_read_data){
                        .event = event,
                        .group = group,
                        .ret = 0,
                };

                /*
                 * __perf_event_read_cpu() uses smp_processor_id(), so stay
                 * on this CPU until the cross call has been issued.
                 */
                preempt_disable();
                event_cpu = __perf_event_read_cpu(event, event_cpu);

                /*
                 * Purposely ignore the smp_call_function_single() return
                 * value.
                 *
                 * If event_cpu isn't a valid CPU it means the event got
                 * scheduled out and that will have updated the event count.
                 *
                 * Therefore, either way, we'll have an up-to-date event count
                 * after this.
                 */
                (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
                preempt_enable();
                ret = data.ret;

        } else if (state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
                unsigned long flags;

                /*
                 * Re-check the state under ctx->lock; if it changed since
                 * the lockless read, start over with the fresh value.
                 */
                raw_spin_lock_irqsave(&ctx->lock, flags);
                state = event->state;
                if (state != PERF_EVENT_STATE_INACTIVE) {
                        raw_spin_unlock_irqrestore(&ctx->lock, flags);
                        goto again;
                }

                /*
                 * May read while context is not active (e.g., thread is
                 * blocked), in that case we cannot update context time
                 */
                if (ctx->is_active & EVENT_TIME) {
                        update_context_time(ctx);
                        update_cgrp_time_from_event(event);
                }

                perf_event_update_time(event);
                if (group)
                        perf_event_update_sibling_time(event);
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }

        return ret;
}

/*
 * Initialize the perf_event context in a task_struct:
 */
static void __perf_event_init_context(struct perf_event_context *ctx)
{
        /* locks */
        raw_spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);

        /* plain lists */
        INIT_LIST_HEAD(&ctx->event_list);
        INIT_LIST_HEAD(&ctx->pmu_ctx_list);

        /* event group trees */
        perf_event_groups_init(&ctx->flexible_groups);
        perf_event_groups_init(&ctx->pinned_groups);

        /* the new context starts with a single reference */
        refcount_set(&ctx->refcount, 1);
}

/*
 * Initialise @epc for @pmu: empty list heads, a refcount of one, and the
 * pmu back-pointer. No other field is touched.
 */
static void
__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
{
        INIT_LIST_HEAD(&epc->flexible_active);
        INIT_LIST_HEAD(&epc->pinned_active);
        INIT_LIST_HEAD(&epc->pmu_ctx_entry);

        atomic_set(&epc->refcount, 1);
        epc->pmu = pmu;
}

/*
 * Allocate and initialise a zeroed perf_event_context. When @task is
 * non-NULL a task reference is taken and stored in ctx->task.
 *
 * Returns the new context, or NULL if the allocation failed.
 */
static struct perf_event_context *
alloc_perf_context(struct task_struct *task)
{
        struct perf_event_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

        if (ctx) {
                __perf_event_init_context(ctx);
                if (task)
                        ctx->task = get_task_struct(task);
        }

        return ctx;
}

/*
 * Look up the task for @vpid (0 selects current) and take a reference on it.
 *
 * Returns the task, or ERR_PTR(-ESRCH) if no such task was found.
 */
static struct task_struct *
find_lively_task_by_vpid(pid_t vpid)
{
        struct task_struct *task;

        /* Lookup and reference grab both happen inside the RCU read section. */
        rcu_read_lock();
        task = vpid ? find_task_by_vpid(vpid) : current;
        if (task)
                get_task_struct(task);
        rcu_read_unlock();

        if (task)
                return task;

        return ERR_PTR(-ESRCH);
}

/*
 * Returns a matching context with refcount and pincount.
 */
static struct perf_event_context *
find_get_context(struct task_struct *task, struct perf_event *event)
{
        struct perf_event_context *ctx, *clone_ctx = NULL;
        struct perf_cpu_context *cpuctx;
        unsigned long flags;
        int err;

        /*
         * @task == NULL selects the per-CPU context of event->cpu; otherwise
         * the task's context is used, allocating and installing one if the
         * task has none yet. Returns an ERR_PTR() on failure.
         */
        if (!task) {
                /* Must be root to operate on a CPU event: */
                err = perf_allow_cpu(&event->attr);
                if (err)
                        return ERR_PTR(err);

                cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
                ctx = &cpuctx->ctx;
                get_ctx(ctx);
                /* pin_count is modified under ctx->lock */
                raw_spin_lock_irqsave(&ctx->lock, flags);
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);

                return ctx;
        }

        /* err is reassigned before every 'goto errout' below. */
        err = -EINVAL;
retry:
        /* On success this returns with ctx->lock held; it is dropped below. */
        ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
                clone_ctx = unclone_ctx(ctx);
                ++ctx->pin_count;

                raw_spin_unlock_irqrestore(&ctx->lock, flags);

                /* Drop the clone reference only after the lock is released. */
                if (clone_ctx)
                        put_ctx(clone_ctx);
        } else {
                ctx = alloc_perf_context(task);
                err = -ENOMEM;
                if (!ctx)
                        goto errout;

                err = 0;
                mutex_lock(&task->perf_event_mutex);
                /*
                 * If the task has already passed perf_event_exit_task(),
                 * we must see PF_EXITING, because that path takes this
                 * mutex too.
                 */
                if (task->flags & PF_EXITING)
                        err = -ESRCH;
                else if (task->perf_event_ctxp)
                        /* Someone else installed a context first; retry. */
                        err = -EAGAIN;
                else {
                        /* Extra reference for the task's pointer to the context. */
                        get_ctx(ctx);
                        ++ctx->pin_count;
                        rcu_assign_pointer(task->perf_event_ctxp, ctx);
                }
                mutex_unlock(&task->perf_event_mutex);

                if (unlikely(err)) {
                        /* Drop the freshly allocated, never-installed context. */
                        put_ctx(ctx);

                        if (err == -EAGAIN)
                                goto retry;
                        goto errout;
                }
        }

        return ctx;

errout:
        return ERR_PTR(err);
}

/*
 * Find or create the PMU context for @pmu within @ctx and take a reference
 * on it.
 *
 * For CPU contexts (ctx->task == NULL) the per-CPU context embedded in
 * pmu->cpu_pmu_context for event->cpu is used. For task contexts a
 * candidate is allocated up front and used only if @ctx has no PMU context
 * for @pmu yet.
 *
 * Returns the PMU context, or ERR_PTR(-ENOMEM) when an allocation on the
 * task path fails.
 */
static struct perf_event_pmu_context *
find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
                     struct perf_event *event)
{
        struct perf_event_pmu_context *new = NULL, *epc;
        void *task_ctx_data = NULL;

        if (!ctx->task) {
                /*
                 * perf_pmu_migrate_context() / __perf_pmu_install_event()
                 * relies on the fact that find_get_pmu_context() cannot fail
                 * for CPU contexts.
                 */
                struct perf_cpu_pmu_context *cpc;

                cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
                epc = &cpc->epc;
                raw_spin_lock_irq(&ctx->lock);
                if (!epc->ctx) {
                        /* First user: attach the embedded epc to @ctx. */
                        atomic_set(&epc->refcount, 1);
                        epc->embedded = 1;
                        list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
                        epc->ctx = ctx;
                } else {
                        WARN_ON_ONCE(epc->ctx != ctx);
                        atomic_inc(&epc->refcount);
                }
                raw_spin_unlock_irq(&ctx->lock);
                return epc;
        }

        /* Allocate everything that might be needed before taking ctx->lock. */
        new = kzalloc(sizeof(*epc), GFP_KERNEL);
        if (!new)
                return ERR_PTR(-ENOMEM);

        if (event->attach_state & PERF_ATTACH_TASK_DATA) {
                task_ctx_data = alloc_task_ctx_data(pmu);
                if (!task_ctx_data) {
                        kfree(new);
                        return ERR_PTR(-ENOMEM);
                }
        }

        __perf_init_event_pmu_context(new, pmu);

        /*
         * XXX
         *
         * lockdep_assert_held(&ctx->mutex);
         *
         * can't because perf_event_init_task() doesn't actually hold the
         * child_ctx->mutex.
         */

        raw_spin_lock_irq(&ctx->lock);
        /* Reuse an existing PMU context for @pmu if @ctx already has one. */
        list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                if (epc->pmu == pmu) {
                        WARN_ON_ONCE(epc->ctx != ctx);
                        atomic_inc(&epc->refcount);
                        goto found_epc;
                }
        }

        /* None found: install the preallocated one; ownership moves to @ctx. */
        epc = new;
        new = NULL;

        list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
        epc->ctx = ctx;

found_epc:
        /* Hand over the task data only if the epc does not have any yet. */
        if (task_ctx_data && !epc->task_ctx_data) {
                epc->task_ctx_data = task_ctx_data;
                task_ctx_data = NULL;
                ctx->nr_task_data++;
        }
        raw_spin_unlock_irq(&ctx->lock);

        /* Release whatever was preallocated and not used (NULL if consumed). */
        free_task_ctx_data(pmu, task_ctx_data);
        kfree(new);

        return epc;
}

static void get_pmu_ctx(struct perf_event_pmu_context *epc)
{
        WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
}

static void free_epc_rcu(struct rcu_head *head)
{
        struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);

        kfree(epc->task_ctx_data);
        kfree(epc);
}

/*
 * Drop a reference on @epc. When the last reference goes away the epc is
 * unlinked from its context and, unless it is an embedded (per-CPU) one,
 * freed after an RCU grace period.
 */
static void put_pmu_ctx(struct perf_event_pmu_context *epc)
{
        struct perf_event_context *ctx = epc->ctx;
        unsigned long flags;

        /*
         * XXX
         *
         * lockdep_assert_held(&ctx->mutex);
         *
         * can't because of the call-site in _free_event()/put_event()
         * which isn't always called under ctx->mutex.
         */
        /*
         * A false return means there is nothing to tear down and ctx->lock
         * is not held; otherwise the lock is held and is released below.
         */
        if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags))
                return;

        WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));

        list_del_init(&epc->pmu_ctx_entry);
        epc->ctx = NULL;

        /* No event may still be active on a PMU context being released. */
        WARN_ON_ONCE(!list_empty(&epc->pinned_active));
        WARN_ON_ONCE(!list_empty(&epc->flexible_active));

        raw_spin_unlock_irqrestore(&ctx->lock, flags);

        /* Embedded contexts are not freed here. */
        if (epc->embedded)
                return;

        call_rcu(&epc->rcu_head, free_epc_rcu);
}

static void perf_event_free_filter(struct perf_event *event);

static void free_event_rcu(struct rcu_head *head)
{
        struct perf_event *event = container_of(head, typeof(*event), rcu_head);

        if (event->ns)
                put_pid_ns(event->ns);
        perf_event_free_filter(event);
        kmem_cache_free(perf_event_cache, event);
}

static void ring_buffer_attach(struct perf_event *event,
                               struct perf_buffer *rb);

/*
 * Unlink @event from the pmu_sb_events list of event->cpu, under that
 * list's lock.
 */
static void detach_sb_event(struct perf_event *event)
{
        struct pmu_event_list *sb_list;

        sb_list = per_cpu_ptr(&pmu_sb_events, event->cpu);

        raw_spin_lock(&sb_list->lock);
        list_del_rcu(&event->sb_list);
        raw_spin_unlock(&sb_list->lock);
}

static bool is_sb_event(struct perf_event *event)
{
        struct perf_event_attr *attr = &event->attr;

        if (event->parent)
                return false;

        if (event->attach_state & PERF_ATTACH_TASK)
                return false;

        if (attr->mmap || attr->mmap_data || attr->mmap2 ||
            attr->comm || attr->comm_exec ||
            attr->task || attr->ksymbol ||
            attr->context_switch || attr->text_poke ||
            attr->bpf_event)
                return true;
        return false;
}

/* Detach @event from the pmu_sb_events list if is_sb_event() accepts it. */
static void unaccount_pmu_sb_event(struct perf_event *event)
{
        if (!is_sb_event(event))
                return;

        detach_sb_event(event);
}

#ifdef CONFIG_NO_HZ_FULL
/*
 * Held by unaccount_freq_event_nohz() around the nr_freq_events decrement
 * and the tick dependency clear that follows it.
 */
static DEFINE_SPINLOCK(nr_freq_lock);
#endif

/*
 * Drop one frequency event under nr_freq_lock and clear the perf tick
 * dependency when the count reaches zero. Compiles to nothing without
 * CONFIG_NO_HZ_FULL.
 */
static void unaccount_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
        bool last_freq_event;

        spin_lock(&nr_freq_lock);
        last_freq_event = atomic_dec_and_test(&nr_freq_events);
        if (last_freq_event)
                tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
        spin_unlock(&nr_freq_lock);
#endif
}

/*
 * Drop one frequency event from nr_freq_events, going through the nohz
 * variant when tick_nohz_full_enabled() says so.
 */
static void unaccount_freq_event(void)
{
        if (!tick_nohz_full_enabled()) {
                atomic_dec(&nr_freq_events);
                return;
        }

        unaccount_freq_event_nohz();
}

static void unaccount_event(struct perf_event *event)
{
        bool dec = false;

        if (event->parent)
                return;

        if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
                dec = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_dec(&nr_mmap_events);
        if (event->attr.build_id)
                atomic_dec(&nr_build_id_events);
        if (event->attr.comm)
                atomic_dec(&nr_comm_events);
        if (event->attr.namespaces)
                atomic_dec(&nr_namespaces_events);
        if (event->attr.cgroup)
                atomic_dec(&nr_cgroup_events);
        if (event->attr.task)
                atomic_dec(&nr_task_events);
        if (event->attr.freq)
                unaccount_freq_event();
        if (event->attr.context_switch) {
                dec = true;
                atomic_dec(&nr_switch_events);
        }
        if (is_cgroup_event(event))
                dec = true;
        if (has_branch_stack(event))
                dec = true;
        if (event->attr.ksymbol)
                atomic_dec(&nr_ksymbol_events);
        if (event->attr.bpf_event)
                atomic_dec(&nr_bpf_events);
        if (event->attr.text_poke)
                atomic_dec(&nr_text_poke_events);

        if (dec) {
                if (!atomic_add_unless(&perf_sched_count, -1, 1))
                        schedule_delayed_work(&perf_sched_work, HZ);
        }

        unaccount_pmu_sb_event(event);
}

static void perf_sched_delayed(struct work_struct *work)
{
        mutex_lock(&perf_sched_mutex);
        if (atomic_dec_and_test(&perf_sched_count))
                static_branch_disable(&perf_sched_events);
        mutex_unlock(&perf_sched_mutex);
}

/*
 * The following implement mutual exclusion of events on "exclusive" pmus
 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
 * at a time, so we disallow creating events that might conflict, namely:
 *
 *  1) cpu-wide events in the presence of per-task events,
 *  2) per-task events in the presence of cpu-wide events,
 *  3) two matching events on the same perf_event_context.
 *
 * The former two cases are handled in the allocation path (perf_event_alloc(),
 * _free_event()), the latter -- before the first perf_install_in_context().
 */
static int exclusive_event_init(struct perf_event *event)
{
        struct pmu *pmu = event->pmu;

        if (!is_exclusive_pmu(pmu))
                return 0;

        /*
         * Prevent co-existence of per-task and cpu-wide events on the
         * same exclusive pmu.
         *
         * Negative pmu::exclusive_cnt means there are cpu-wide
         * events on this "exclusive" pmu, positive means there are
         * per-task events.
         *
         * Since this is called in perf_event_alloc() path, event::ctx
         * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
         * to mean "per-task event", because unlike other attach states it
         * never gets cleared.
         */
        if (event->attach_state & PERF_ATTACH_TASK) {
                if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
                        return -EBUSY;
        } else {
                if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
                        return -EBUSY;
        }

        return 0;
}

/*
 * Give back the slot that exclusive_event_init() claimed for @event:
 * per-task events were counted upwards, cpu-wide events downwards, so the
 * opposite adjustment is applied here.  No-op for non-exclusive pmus.
 */
static void exclusive_event_destroy(struct perf_event *event)
{
        struct pmu *pmu = event->pmu;

        if (!is_exclusive_pmu(pmu))
                return;

        if (event->attach_state & PERF_ATTACH_TASK) {
                atomic_dec(&pmu->exclusive_cnt);
                return;
        }

        atomic_inc(&pmu->exclusive_cnt);
}

/*
 * Two events match when they use the same pmu and their CPUs overlap: the
 * CPUs are equal, or either event has cpu == -1.
 */
static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
{
        if (e1->pmu != e2->pmu)
                return false;

        return e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1;
}

/*
 * Check whether @event may be installed into @ctx.  For an exclusive pmu
 * this fails as soon as any event already on ctx->event_list matches
 * @event (see exclusive_event_match()).  The caller must hold ctx->mutex.
 */
static bool exclusive_event_installable(struct perf_event *event,
                                        struct perf_event_context *ctx)
{
        struct perf_event *other;

        lockdep_assert_held(&ctx->mutex);

        if (!is_exclusive_pmu(event->pmu))
                return true;

        list_for_each_entry(other, &ctx->event_list, event_entry) {
                if (exclusive_event_match(other, event))
                        return false;
        }

        return true;
}

static void perf_addr_filters_splice(struct perf_event *event,
                                       struct list_head *head);

/*
 * Tear down @event and queue its memory for release.
 *
 * Reached from put_event() once the reference count has dropped to zero
 * and from free_event() after it has moved the count from 1 to 0.  The
 * resources held by the event are released in a fixed order (see the
 * comments below); the structure itself is handed to free_event_rcu()
 * through call_rcu().
 */
static void _free_event(struct perf_event *event)
{
        /* Wait for irq_work queued on this event before tearing it down. */
        irq_work_sync(&event->pending_irq);

        unaccount_event(event);

        security_perf_event_free(event);

        if (event->rb) {
                /*
                 * Can happen when we close an event with re-directed output.
                 *
                 * Since we have a 0 refcount, perf_mmap_close() will skip
                 * over us; possibly making our ring_buffer_put() the last.
                 */
                mutex_lock(&event->mmap_mutex);
                ring_buffer_attach(event, NULL);
                mutex_unlock(&event->mmap_mutex);
        }

        if (is_cgroup_event(event))
                perf_detach_cgroup(event);

        /* Only non-inherited events drop a callchain buffer reference here. */
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
                        put_callchain_buffers();
        }

        perf_event_free_bpf_prog(event);
        perf_addr_filters_splice(event, NULL);
        kfree(event->addr_filter_ranges);

        /* Optional destructor callback stored on the event. */
        if (event->destroy)
                event->destroy(event);

        /*
         * Must be after ->destroy(), due to uprobe_perf_close() using
         * hw.target.
         */
        if (event->hw.target)
                put_task_struct(event->hw.target);

        if (event->pmu_ctx)
                put_pmu_ctx(event->pmu_ctx);

        /*
         * perf_event_free_task() relies on put_ctx() being 'last', in particular
         * all task references must be cleaned up.
         */
        if (event->ctx)
                put_ctx(event->ctx);

        exclusive_event_destroy(event);
        module_put(event->pmu->module);

        /* The memory itself is freed from the RCU callback. */
        call_rcu(&event->rcu_head, free_event_rcu);
}

/*
 * Free an event that is known to hold exactly one reference, e.g. in error
 * paths where the event has not been exposed yet, or for inherited events.
 *
 * The reference count is moved from 1 to 0 atomically.  If it was not 1, a
 * warning is emitted and the event is deliberately leaked.
 */
static void free_event(struct perf_event *event)
{
        long old = atomic_long_cmpxchg(&event->refcount, 1, 0);

        if (WARN(old != 1,
                 "unexpected event refcount: %ld; ptr=%p\n",
                 atomic_long_read(&event->refcount), event)) {
                /* Leaking is preferable to a use-after-free. */
                return;
        }

        _free_event(event);
}

/*
 * Remove user event from the owner task.
 *
 * If @event still has an owner, take a reference on that task, unlink the
 * event from the owner's list under owner->perf_event_mutex and clear
 * event->owner.  Does nothing when event->owner is already NULL.
 */
static void perf_remove_from_owner(struct perf_event *event)
{
        struct task_struct *owner;

        rcu_read_lock();
        /*
         * Matches the smp_store_release() in perf_event_exit_task(). If we
         * observe !owner it means the list deletion is complete and we can
         * indeed free this event, otherwise we need to serialize on
         * owner->perf_event_mutex.
         */
        owner = READ_ONCE(event->owner);
        if (owner) {
                /*
                 * Since delayed_put_task_struct() also drops the last
                 * task reference we can safely take a new reference
                 * while holding the rcu_read_lock().
                 */
                get_task_struct(owner);
        }
        rcu_read_unlock();

        if (owner) {
                /*
                 * If we're here through perf_event_exit_task() we're already
                 * holding ctx->mutex which would be an inversion wrt. the
                 * normal lock order.
                 *
                 * However we can safely take this lock because it's the child
                 * ctx->mutex.
                 */
                mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);

                /*
                 * We have to re-check the event->owner field, if it is cleared
                 * we raced with perf_event_exit_task(), acquiring the mutex
                 * ensured they're done, and we can proceed with freeing the
                 * event.
                 */
                if (event->owner) {
                        list_del_init(&event->owner_entry);
                        smp_store_release(&event->owner, NULL);
                }
                mutex_unlock(&owner->perf_event_mutex);
                /* Drop the reference taken under rcu_read_lock() above. */
                put_task_struct(owner);
        }
}

/*
 * Drop one reference on @event; the caller that drops the last one tears
 * the event down through _free_event().
 */
static void put_event(struct perf_event *event)
{
        if (atomic_long_dec_and_test(&event->refcount))
                _free_event(event);
}

/*
 * Kill an event dead; while event:refcount will preserve the event
 * object, it will not preserve its functionality. Once the last 'user'
 * gives up the object, we'll destroy the thing.
 *
 * The event is detached from its owner (unless it is a kernel event) and
 * from its context, every inherited child on event->child_list is removed
 * from its own context and freed, and finally the caller's reference is
 * dropped.  Always returns 0.
 */
int perf_event_release_kernel(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *child, *tmp;
        /* Children collected here are freed after child_mutex is released. */
        LIST_HEAD(free_list);

        /*
         * If we got here through err_alloc: free_event(event); we will not
         * have attached to a context yet.
         */
        if (!ctx) {
                WARN_ON_ONCE(event->attach_state &
                                (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
                goto no_ctx;
        }

        if (!is_kernel_event(event))
                perf_remove_from_owner(event);

        ctx = perf_event_ctx_lock(event);
        WARN_ON_ONCE(ctx->parent_ctx);

        /*
         * Mark this event as STATE_DEAD, there is no external reference to it
         * anymore.
         *
         * Anybody acquiring event->child_mutex after the below loop _must_
         * also see this, most importantly inherit_event() which will avoid
         * placing more children on the list.
         *
         * Thus this guarantees that we will in fact observe and kill _ALL_
         * child events.
         */
        perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);

        perf_event_ctx_unlock(event, ctx);

        /*
         * Each pass of the loop below looks at the first child only and then
         * restarts from 'again', because child_mutex is dropped and re-taken
         * inside the loop body.
         */
again:
        mutex_lock(&event->child_mutex);
        list_for_each_entry(child, &event->child_list, child_list) {
                void *var = NULL;

                /*
                 * Cannot change, child events are not migrated, see the
                 * comment with perf_event_ctx_lock_nested().
                 */
                ctx = READ_ONCE(child->ctx);
                /*
                 * Since child_mutex nests inside ctx::mutex, we must jump
                 * through hoops. We start by grabbing a reference on the ctx.
                 *
                 * Since the event cannot get freed while we hold the
                 * child_mutex, the context must also exist and have a !0
                 * reference count.
                 */
                get_ctx(ctx);

                /*
                 * Now that we have a ctx ref, we can drop child_mutex, and
                 * acquire ctx::mutex without fear of it going away. Then we
                 * can re-acquire child_mutex.
                 */
                mutex_unlock(&event->child_mutex);
                mutex_lock(&ctx->mutex);
                mutex_lock(&event->child_mutex);

                /*
                 * Now that we hold ctx::mutex and child_mutex, revalidate our
                 * state, if child is still the first entry, it didn't get freed
                 * and we can continue doing so.
                 */
                tmp = list_first_entry_or_null(&event->child_list,
                                               struct perf_event, child_list);
                if (tmp == child) {
                        perf_remove_from_context(child, DETACH_GROUP);
                        list_move(&child->child_list, &free_list);
                        /*
                         * This matches the refcount bump in inherit_event();
                         * this can't be the last reference.
                         */
                        put_event(event);
                } else {
                        /* The child went away meanwhile; wake waiters below. */
                        var = &ctx->refcount;
                }

                mutex_unlock(&event->child_mutex);
                mutex_unlock(&ctx->mutex);
                put_ctx(ctx);

                if (var) {
                        /*
                         * If perf_event_free_task() has deleted all events from the
                         * ctx while the child_mutex got released above, make sure to
                         * notify about the preceding put_ctx().
                         */
                        smp_mb(); /* pairs with wait_var_event() */
                        wake_up_var(var);
                }
                goto again;
        }
        mutex_unlock(&event->child_mutex);

        list_for_each_entry_safe(child, tmp, &free_list, child_list) {
                /* Taken before free_event(); only the address is used afterwards. */
                void *var = &child->ctx->refcount;

                list_del(&child->child_list);
                free_event(child);

                /*
                 * Wake any perf_event_free_task() waiting for this event to be
                 * freed.
                 */
                smp_mb(); /* pairs with wait_var_event() */
                wake_up_var(var);
        }

no_ctx:
        put_event(event); /* Must be the 'last' reference */
        return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);

/*
 * Called when the last reference to the file is gone: releases the
 * perf_event stored in file->private_data.  @inode is unused.  Always
 * returns 0.
 */
static int perf_release(struct inode *inode, struct file *file)
{
        struct perf_event *event = file->private_data;

        perf_event_release_kernel(event);

        return 0;
}

/*
 * Read @event together with all of its inherited children.
 *
 * Returns the sum of the counts.  *@enabled and *@running receive the
 * summed enabled/running times: the event's own times, the totals it has
 * stored in child_total_time_{enabled,running}, and the times of every
 * child currently on child_list.  child_mutex is held while reading.
 */
static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
        struct perf_event *iter;
        u64 sum;

        *enabled = 0;
        *running = 0;

        mutex_lock(&event->child_mutex);

        /* The result of the read is deliberately ignored. */
        (void)perf_event_read(event, false);
        sum = perf_event_count(event);

        *enabled += event->total_time_enabled;
        *enabled += atomic64_read(&event->child_total_time_enabled);
        *running += event->total_time_running;
        *running += atomic64_read(&event->child_total_time_running);

        list_for_each_entry(iter, &event->child_list, child_list) {
                (void)perf_event_read(iter, false);
                sum += perf_event_count(iter);
                *enabled += iter->total_time_enabled;
                *running += iter->total_time_running;
        }

        mutex_unlock(&event->child_mutex);

        return sum;
}

/*
 * Exported variant of __perf_event_read_value() that takes the event's
 * context lock around the read.  Returns the summed count and fills in
 * *@enabled and *@running.
 */
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);
        u64 total;

        total = __perf_event_read_value(event, enabled, running);
        perf_event_ctx_unlock(event, ctx);

        return total;
}
EXPORT_SYMBOL_GPL(perf_event_read_value);

/*
 * Fold one group (@leader and its siblings) into the @values array.
 *
 * values[0] is left untouched here (the index starts at 1).  Time and
 * count slots are accumulated with '+=' so that the caller can invoke this
 * once for the parent group and once per inherited child group; the id and
 * lost-samples slots are assigned, not accumulated.
 *
 * Returns 0 on success, the error from perf_event_read() if the read
 * fails, or -ECHILD when the child group no longer mirrors its parent.
 */
static int __perf_read_group_add(struct perf_event *leader,
                                        u64 read_format, u64 *values)
{
        struct perf_event_context *ctx = leader->ctx;
        struct perf_event *sub, *parent;
        unsigned long flags;
        int n = 1; /* skip @nr */
        int ret;

        ret = perf_event_read(leader, true);
        if (ret)
                return ret;

        raw_spin_lock_irqsave(&ctx->lock, flags);
        /*
         * Verify the grouping between the parent and child (inherited)
         * events is still intact.
         *
         * Specifically:
         *  - leader->ctx->lock pins leader->sibling_list
         *  - parent->child_mutex pins parent->child_list
         *  - parent->ctx->mutex pins parent->sibling_list
         *
         * Because parent->ctx != leader->ctx (and child_list nests inside
         * ctx->mutex), group destruction is not atomic between children, also
         * see perf_event_release_kernel(). Additionally, parent can grow the
         * group.
         *
         * Therefore it is possible to have parent and child groups in a
         * different configuration and summing over such a beast makes no sense
         * what so ever.
         *
         * Reject this.
         */
        parent = leader->parent;
        if (parent &&
            (parent->group_generation != leader->group_generation ||
             parent->nr_siblings != leader->nr_siblings)) {
                ret = -ECHILD;
                goto unlock;
        }

        /*
         * Since we co-schedule groups, {enabled,running} times of siblings
         * will be identical to those of the leader, so we only publish one
         * set.
         */
        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
                values[n++] += leader->total_time_enabled +
                        atomic64_read(&leader->child_total_time_enabled);
        }

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
                values[n++] += leader->total_time_running +
                        atomic64_read(&leader->child_total_time_running);
        }

        /*
         * Write {count,id} tuples for every sibling.
         */
        values[n++] += perf_event_count(leader);
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
        if (read_format & PERF_FORMAT_LOST)
                values[n++] = atomic64_read(&leader->lost_samples);

        for_each_sibling_event(sub, leader) {
                values[n++] += perf_event_count(sub);
                if (read_format & PERF_FORMAT_ID)
                        values[n++] = primary_event_id(sub);
                if (read_format & PERF_FORMAT_LOST)
                        values[n++] = atomic64_read(&sub->lost_samples);
        }

unlock:
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
        return ret;
}

/*
 * read() implementation for PERF_FORMAT_GROUP.
 *
 * Builds the group read buffer in a zeroed temporary array: values[0] is
 * the number of events in the group, and the rest is filled by
 * __perf_read_group_add() for the leader and every inherited child of the
 * leader.  Returns event->read_size on success, -ENOMEM if the temporary
 * buffer cannot be allocated, -EFAULT if the copy to @buf fails, or the
 * error reported by __perf_read_group_add().  The caller must hold the
 * leader's ctx->mutex.
 */
static int perf_read_group(struct perf_event *event,
                                   u64 read_format, char __user *buf)
{
        struct perf_event *leader = event->group_leader, *child;
        struct perf_event_context *ctx = leader->ctx;
        u64 *values;
        int ret;

        lockdep_assert_held(&ctx->mutex);

        values = kzalloc(event->read_size, GFP_KERNEL);
        if (!values)
                return -ENOMEM;

        values[0] = 1 + leader->nr_siblings;

        mutex_lock(&leader->child_mutex);

        ret = __perf_read_group_add(leader, read_format, values);
        if (!ret) {
                list_for_each_entry(child, &leader->child_list, child_list) {
                        ret = __perf_read_group_add(child, read_format, values);
                        if (ret)
                                break;
                }
        }

        mutex_unlock(&leader->child_mutex);

        /* Copy out only when every group was read successfully. */
        if (!ret) {
                ret = event->read_size;
                if (copy_to_user(buf, values, event->read_size))
                        ret = -EFAULT;
        }

        kfree(values);
        return ret;
}

/*
 * read() implementation for a single (non-group) event.
 *
 * Emits, in this order: the count, then, as selected by @read_format, the
 * enabled time, the running time, the event id and the lost-sample count
 * (at most five u64 values).  Returns the number of bytes copied to @buf or
 * -EFAULT.
 */
static int perf_read_one(struct perf_event *event,
                                 u64 read_format, char __user *buf)
{
        u64 enabled, running;
        u64 values[5];
        int nr = 0;
        int bytes;

        values[nr++] = __perf_event_read_value(event, &enabled, &running);

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                values[nr++] = enabled;
        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                values[nr++] = running;
        if (read_format & PERF_FORMAT_ID)
                values[nr++] = primary_event_id(event);
        if (read_format & PERF_FORMAT_LOST)
                values[nr++] = atomic64_read(&event->lost_samples);

        bytes = nr * sizeof(u64);
        if (copy_to_user(buf, values, bytes))
                return -EFAULT;

        return bytes;
}

/*
 * An event counts as "hung up" when its state is PERF_EVENT_STATE_EXIT or
 * lower and it has no inherited children left on child_list.
 */
static bool is_event_hup(struct perf_event *event)
{
        bool childless;

        if (event->state > PERF_EVENT_STATE_EXIT)
                return false;

        mutex_lock(&event->child_mutex);
        childless = list_empty(&event->child_list);
        mutex_unlock(&event->child_mutex);

        return childless;
}

/*
 * Read the performance event - simple non blocking version for now.
 *
 * Returns 0 (end-of-file) for an event in error state, -ENOSPC when @count
 * is smaller than event->read_size, and otherwise the result of the group
 * or single-event read, selected by PERF_FORMAT_GROUP in attr.read_format.
 */
static ssize_t
__perf_read(struct perf_event *event, char __user *buf, size_t count)
{
        u64 read_format = event->attr.read_format;

        /*
         * An event ends up in error state e.g. when it was pinned but could
         * not be scheduled on to the CPU at some point; report end-of-file
         * for it.
         */
        if (event->state == PERF_EVENT_STATE_ERROR)
                return 0;

        if (count < event->read_size)
                return -ENOSPC;

        WARN_ON_ONCE(event->ctx->parent_ctx);

        if (read_format & PERF_FORMAT_GROUP)
                return perf_read_group(event, read_format, buf);

        return perf_read_one(event, read_format, buf);
}

/*
 * read() entry point: asks the security hook for permission first, then
 * performs the read with the event's context locked.  @ppos is unused.
 */
static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        struct perf_event *event = file->private_data;
        struct perf_event_context *ctx;
        int ret = security_perf_event_read(event);

        if (ret)
                return ret;

        ctx = perf_event_ctx_lock(event);
        ret = __perf_read(event, buf, count);
        perf_event_ctx_unlock(event, ctx);

        return ret;
}

/*
 * poll() entry point.
 *
 * Reports EPOLLHUP for a hung-up event (see is_event_hup()) and for an
 * event without a ring buffer.  Otherwise returns the ring buffer's
 * pending poll bits and resets them to 0.
 */
static __poll_t perf_poll(struct file *file, poll_table *wait)
{
        struct perf_event *event = file->private_data;
        struct perf_buffer *rb;
        __poll_t mask = EPOLLHUP;

        poll_wait(file, &event->waitq, wait);

        if (is_event_hup(event))
                return mask;

        /*
         * event->mmap_mutex pins event->rb; without it
         * perf_event_set_output() could swizzle the rb under us and make us
         * miss wakeups.
         */
        mutex_lock(&event->mmap_mutex);
        rb = event->rb;
        if (rb)
                mask = atomic_xchg(&rb->poll, 0);
        mutex_unlock(&event->mmap_mutex);

        return mask;
}

/*
 * Reset @event's counter value to zero.
 *
 * The event is read first (the result of the read is ignored), then
 * event->count is cleared and the user page is updated.
 */
static void _perf_event_reset(struct perf_event *event)
{
        (void)perf_event_read(event, false);
        local64_set(&event->count, 0);
        perf_event_update_userpage(event);
}

/*
 * Disable @event and return the value of event->count at that point; when
 * @reset is true the count is cleared afterwards.  Runs with the event's
 * context locked.
 *
 * The event is assumed not to have inherit set; a one-time warning is
 * emitted otherwise.
 */
u64 perf_event_pause(struct perf_event *event, bool reset)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);
        u64 snapshot;

        WARN_ON_ONCE(event->attr.inherit);
        _perf_event_disable(event);

        snapshot = local64_read(&event->count);
        if (reset)
                local64_set(&event->count, 0);

        perf_event_ctx_unlock(event, ctx);

        return snapshot;
}
EXPORT_SYMBOL_GPL(perf_event_pause);

/*
 * Holding the top-level event's child_mutex means that any
 * descendant process that has inherited this event will block
 * in perf_event_exit_event() if it goes to exit, thus satisfying the
 * task existence requirements of perf_event_enable/disable.
 */
static void perf_event_for_each_child(struct perf_event *event,
                                        void (*func)(struct perf_event *))
{
        struct perf_event *child;

        WARN_ON_ONCE(event->ctx->parent_ctx);

        mutex_lock(&event->child_mutex);
        func(event);
        list_for_each_entry(child, &event->child_list, child_list)
                func(child);
        mutex_unlock(&event->child_mutex);
}

/*
 * Apply @func to the whole group @event belongs to: the group leader, each
 * of its siblings, and the inherited children of all of them.  The caller
 * must hold event->ctx->mutex.
 */
static void perf_event_for_each(struct perf_event *event,
                                  void (*func)(struct perf_event *))
{
        struct perf_event *leader = event->group_leader;
        struct perf_event *sibling;

        lockdep_assert_held(&event->ctx->mutex);

        perf_event_for_each_child(leader, func);

        for_each_sibling_event(sibling, leader)
                perf_event_for_each_child(sibling, func);
}

/*
 * Install a new sample period or frequency on @event.
 *
 * Passed to event_function_call() by _perf_event_period(); @info points to
 * the new u64 value.  @cpuctx and @ctx are not used here.  For a
 * frequency-based event the value becomes attr.sample_freq, otherwise it
 * becomes both attr.sample_period and hw.sample_period.  An active event
 * is stopped around the update and restarted with PERF_EF_RELOAD.
 */
static void __perf_event_period(struct perf_event *event,
                                struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx,
                                void *info)
{
        u64 value = *((u64 *)info);
        bool active;

        if (event->attr.freq) {
                event->attr.sample_freq = value;
        } else {
                event->attr.sample_period = value;
                event->hw.sample_period = value;
        }

        active = (event->state == PERF_EVENT_STATE_ACTIVE);
        if (active) {
                perf_pmu_disable(event->pmu);
                /*
                 * We could be throttled; unthrottle now to avoid the tick
                 * trying to unthrottle while we already re-started the event.
                 */
                if (event->hw.interrupts == MAX_INTERRUPTS) {
                        event->hw.interrupts = 0;
                        perf_log_throttle(event, 1);
                }
                event->pmu->stop(event, PERF_EF_UPDATE);
        }

        /* Cleared between stop and start so that none of the old period is left over. */
        local64_set(&event->hw.period_left, 0);

        if (active) {
                event->pmu->start(event, PERF_EF_RELOAD);
                perf_pmu_enable(event->pmu);
        }
}

/*
 * Ask the event's pmu whether @value is acceptable as a new period, via the
 * pmu's check_period callback.  _perf_event_period() treats any non-zero
 * result as a rejection.
 * NOTE(review): the callback is invoked unconditionally, so every pmu is
 * presumably given a default implementation at registration - confirm.
 */
static int perf_event_check_period(struct perf_event *event, u64 value)
{
        return event->pmu->check_period(event, value);
}

/*
 * Validate @value and install it as the new sample period (or frequency)
 * of @event.
 *
 * Returns -EINVAL when the event is not a sampling event, when @value is
 * zero, when a frequency exceeds sysctl_perf_event_sample_rate, when the
 * pmu rejects the value, or when a period has bit 63 set.  Returns 0 once
 * the update has been issued through event_function_call().
 */
static int _perf_event_period(struct perf_event *event, u64 value)
{
        if (!is_sampling_event(event) || !value)
                return -EINVAL;

        if (event->attr.freq && value > sysctl_perf_event_sample_rate)
                return -EINVAL;

        if (perf_event_check_period(event, value))
                return -EINVAL;

        /* A plain period must not have its top bit set. */
        if (!event->attr.freq && (value & (1ULL << 63)))
                return -EINVAL;

        event_function_call(event, __perf_event_period, &value);

        return 0;
}

/*
 * Exported wrapper around _perf_event_period() that runs it with the
 * event's context locked.  Returns 0 or -EINVAL.
 */
int perf_event_period(struct perf_event *event, u64 value)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);
        int err;

        err = _perf_event_period(event, value);
        perf_event_ctx_unlock(event, ctx);

        return err;
}
EXPORT_SYMBOL_GPL(perf_event_period);

static const struct file_operations perf_fops;

/*
 * Look up @fd and make sure it refers to a perf event file (f_op is
 * perf_fops).  On success the struct fd is stored in *@p and 0 is
 * returned; the caller is then responsible for the matching fdput().
 * Returns -EBADF for an invalid descriptor or a file of another kind.
 */
static inline int perf_fget_light(int fd, struct fd *p)
{
        struct fd f = fdget(fd);

        if (!f.file)
                return -EBADF;

        if (f.file->f_op == &perf_fops) {
                *p = f;
                return 0;
        }

        fdput(f);
        return -EBADF;
}

static int perf_event_set_output(struct perf_event *event,
                                 struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_copy_attr(struct perf_event_attr __user *uattr,
                          struct perf_event_attr *attr);

/*
 * ioctl dispatcher for perf event files; perf_ioctl() calls it with the
 * event's context locked.
 *
 * ENABLE, DISABLE and RESET only select a per-event function in the switch;
 * that function is applied after the switch, either to the whole group
 * (PERF_IOC_FLAG_GROUP set in @arg) or to @event and its inherited
 * children.  Every other command is handled completely inside its case and
 * returns from there.  Unknown commands yield -ENOTTY.
 */
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
        void (*func)(struct perf_event *);
        /* Only consulted for ENABLE/DISABLE/RESET; keeps the low 32 bits of @arg. */
        u32 flags = arg;

        switch (cmd) {
        case PERF_EVENT_IOC_ENABLE:
                func = _perf_event_enable;
                break;
        case PERF_EVENT_IOC_DISABLE:
                func = _perf_event_disable;
                break;
        case PERF_EVENT_IOC_RESET:
                func = _perf_event_reset;
                break;

        case PERF_EVENT_IOC_REFRESH:
                return _perf_event_refresh(event, arg);

        case PERF_EVENT_IOC_PERIOD:
        {
                /* @arg is a user pointer to the new u64 period/frequency. */
                u64 value;

                if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
                        return -EFAULT;

                return _perf_event_period(event, value);
        }
        case PERF_EVENT_IOC_ID:
        {
                /* @arg is a user pointer that receives the u64 event id. */
                u64 id = primary_event_id(event);

                if (copy_to_user((void __user *)arg, &id, sizeof(id)))
                        return -EFAULT;
                return 0;
        }

        case PERF_EVENT_IOC_SET_OUTPUT:
        {
                /*
                 * @arg is the fd of the perf event whose buffer should
                 * receive the output; -1 passes NULL to
                 * perf_event_set_output() instead.
                 */
                int ret;
                if (arg != -1) {
                        struct perf_event *output_event;
                        struct fd output;
                        ret = perf_fget_light(arg, &output);
                        if (ret)
                                return ret;
                        output_event = output.file->private_data;
                        ret = perf_event_set_output(event, output_event);
                        fdput(output);
                } else {
                        ret = perf_event_set_output(event, NULL);
                }
                return ret;
        }

        case PERF_EVENT_IOC_SET_FILTER:
                return perf_event_set_filter(event, (void __user *)arg);

        case PERF_EVENT_IOC_SET_BPF:
        {
                /* @arg is passed to bpf_prog_get() to look up the program. */
                struct bpf_prog *prog;
                int err;

                prog = bpf_prog_get(arg);
                if (IS_ERR(prog))
                        return PTR_ERR(prog);

                err = perf_event_set_bpf_prog(event, prog, 0);
                if (err) {
                        /* Attach failed: give back the reference taken above. */
                        bpf_prog_put(prog);
                        return err;
                }

                /*
                 * NOTE(review): on success the reference is not dropped
                 * here; presumably it now belongs to the event.
                 */
                return 0;
        }

        case PERF_EVENT_IOC_PAUSE_OUTPUT: {
                struct perf_buffer *rb;

                rcu_read_lock();
                rb = rcu_dereference(event->rb);
                /* Requires a ring buffer with at least one page. */
                if (!rb || !rb->nr_pages) {
                        rcu_read_unlock();
                        return -EINVAL;
                }
                rb_toggle_paused(rb, !!arg);
                rcu_read_unlock();
                return 0;
        }

        case PERF_EVENT_IOC_QUERY_BPF:
                return perf_event_query_prog_array(event, (void __user *)arg);

        case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
                /* @arg is a user pointer to the replacement perf_event_attr. */
                struct perf_event_attr new_attr;
                int err = perf_copy_attr((struct perf_event_attr __user *)arg,
                                         &new_attr);

                if (err)
                        return err;

                return perf_event_modify_attr(event,  &new_attr);
        }
        default:
                return -ENOTTY;
        }

        /* Only ENABLE, DISABLE and RESET reach this point. */
        if (flags & PERF_IOC_FLAG_GROUP)
                perf_event_for_each(event, func);
        else
                perf_event_for_each_child(event, func);

        return 0;
}

/*
 * ioctl() entry point: checks the security write hook, then dispatches to
 * _perf_ioctl() with the event's context locked.
 */
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct perf_event *event = file->private_data;
        struct perf_event_context *ctx;
        long err;

        /* An ioctl is likely a mutating operation, so it is checked like a write. */
        err = security_perf_event_write(event);
        if (err)
                return err;

        ctx = perf_event_ctx_lock(event);
        err = _perf_ioctl(event, cmd, arg);
        perf_event_ctx_unlock(event, ctx);

        return err;
}

#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point.
 *
 * For the commands listed below, a size field equal to
 * sizeof(compat_uptr_t) is rewritten to sizeof(void *) before the request
 * is forwarded to perf_ioctl().  All other commands are forwarded
 * unchanged.
 */
static long perf_compat_ioctl(struct file *file, unsigned int cmd,
                                unsigned long arg)
{
        switch (_IOC_NR(cmd)) {
        case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
        case _IOC_NR(PERF_EVENT_IOC_ID):
        case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
        case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
                /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
                if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
                        cmd &= ~IOCSIZE_MASK;
                        cmd |= sizeof(void *) << IOCSIZE_SHIFT;
                }
                break;
        }
        return perf_ioctl(file, cmd, arg);
}
#else
/* Without CONFIG_COMPAT there is no compat ioctl handler. */
# define perf_compat_ioctl NULL
#endif

/*
 * Walk current->perf_event_list under current->perf_event_mutex and apply
 * _perf_event_enable() to each entry via perf_event_for_each_child(), with
 * that event's context locked. Always returns 0.
 */
int perf_event_task_enable(void)
{
        struct perf_event *owned;

        mutex_lock(&current->perf_event_mutex);
        list_for_each_entry(owned, &current->perf_event_list, owner_entry) {
                struct perf_event_context *locked_ctx;

                locked_ctx = perf_event_ctx_lock(owned);
                perf_event_for_each_child(owned, _perf_event_enable);
                perf_event_ctx_unlock(owned, locked_ctx);
        }
        mutex_unlock(&current->perf_event_mutex);

        return 0;
}

/*
 * Walk current->perf_event_list under current->perf_event_mutex and apply
 * _perf_event_disable() to each entry via perf_event_for_each_child(), with
 * that event's context locked. Always returns 0.
 */
int perf_event_task_disable(void)
{
        struct perf_event *owned;

        mutex_lock(&current->perf_event_mutex);
        list_for_each_entry(owned, &current->perf_event_list, owner_entry) {
                struct perf_event_context *locked_ctx;

                locked_ctx = perf_event_ctx_lock(owned);
                perf_event_for_each_child(owned, _perf_event_disable);
                perf_event_ctx_unlock(owned, locked_ctx);
        }
        mutex_unlock(&current->perf_event_mutex);

        return 0;
}

/*
 * Return the PMU's index for @event as reported by pmu->event_idx(), or 0
 * when the hardware state has PERF_HES_STOPPED set or the event is not in
 * PERF_EVENT_STATE_ACTIVE.
 */
static int perf_event_index(struct perf_event *event)
{
        if ((event->hw.state & PERF_HES_STOPPED) ||
            event->state != PERF_EVENT_STATE_ACTIVE)
                return 0;

        return event->pmu->event_idx(event);
}

/*
 * Fill in the static fields of the user page belonging to @event's buffer.
 * Does nothing when the event has no buffer attached.
 */
static void perf_event_init_userpage(struct perf_event *event)
{
        struct perf_buffer *rb;

        rcu_read_lock();
        rb = rcu_dereference(event->rb);
        if (rb) {
                struct perf_event_mmap_page *page = rb->user_page;

                /* Allow new userspace to detect that bit 0 is deprecated */
                page->cap_bit0_is_deprecated = 1;
                page->size = offsetof(struct perf_event_mmap_page, __reserved);
                page->data_offset = PAGE_SIZE;
                page->data_size = perf_data_size(rb);
        }
        rcu_read_unlock();
}

/*
 * Default, empty hook called by perf_event_update_userpage() while it is
 * rewriting the user page. It is declared __weak, so a non-weak definition
 * elsewhere replaces it at link time (presumably architecture code that
 * wants to publish extra fields -- confirm against the arch trees).
 */
void __weak arch_perf_update_userpage(
        struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
{
}

/*
 * Refresh the dynamic fields (index, offset, enabled/running times) of the
 * user page of @event's buffer. Does nothing when no buffer is attached.
 *
 * The update is bracketed by two increments of userpg->lock, each separated
 * from the field stores by a compiler barrier.
 *
 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We cannot serialize this because the arch
 * code calls this from NMI context.
 */
void perf_event_update_userpage(struct perf_event *event)
{
        struct perf_event_mmap_page *userpg;
        struct perf_buffer *rb;
        u64 enabled, running, now;

        rcu_read_lock();
        rb = rcu_dereference(event->rb);
        if (!rb)
                goto unlock;

        /*
         * Compute total_time_enabled and total_time_running
         * based on snapshot values taken when the event
         * was last scheduled in.
         *
         * We cannot simply call update_context_time()
         * because of locking issues, as we can be called in
         * NMI context.
         */
        calc_timer_values(event, &now, &enabled, &running);

        userpg = rb->user_page;
        /*
         * Disable preemption to guarantee consistent time stamps are stored to
         * the user page.
         */
        preempt_disable();
        /* First bump of the sequence count: the update is now in progress. */
        ++userpg->lock;
        barrier();
        userpg->index = perf_event_index(event);
        userpg->offset = perf_event_count(event);
        /* With a non-zero index, publish the offset relative to hw.prev_count. */
        if (userpg->index)
                userpg->offset -= local64_read(&event->hw.prev_count);

        userpg->time_enabled = enabled +
                        atomic64_read(&event->child_total_time_enabled);

        userpg->time_running = running +
                        atomic64_read(&event->child_total_time_running);

        arch_perf_update_userpage(event, userpg, now);

        barrier();
        /* Second bump of the sequence count: the update is complete. */
        ++userpg->lock;
        preempt_enable();
unlock:
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(perf_event_update_userpage);

/*
 * Fault handler for perf buffer mappings; also installed as ->page_mkwrite.
 *
 * A MKWRITE fault succeeds only for page offset 0. For any other fault the
 * page is looked up in the event's buffer with perf_mmap_to_page(); write
 * faults on a non-zero page offset are refused. Returns 0 on success and
 * VM_FAULT_SIGBUS otherwise.
 */
static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
{
        struct perf_event *event = vmf->vma->vm_file->private_data;
        vm_fault_t status = VM_FAULT_SIGBUS;
        struct perf_buffer *rb;

        if (vmf->flags & FAULT_FLAG_MKWRITE)
                return vmf->pgoff == 0 ? 0 : status;

        rcu_read_lock();

        rb = rcu_dereference(event->rb);
        if (!rb)
                goto out;

        /* Only page offset 0 may take a write fault. */
        if ((vmf->flags & FAULT_FLAG_WRITE) && vmf->pgoff)
                goto out;

        vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
        if (vmf->page) {
                get_page(vmf->page);
                vmf->page->mapping = vmf->vma->vm_file->f_mapping;
                vmf->page->index   = vmf->pgoff;

                status = 0;
        }
out:
        rcu_read_unlock();

        return status;
}

/*
 * Switch @event from its current buffer (if any) to @rb (which may be NULL
 * to just detach).
 *
 * The event is unlinked from the old buffer's event_list, linked onto the
 * new one, and only then is event->rb updated. The reference on the old
 * buffer is dropped last, followed by a wakeup of the event's waiters.
 */
static void ring_buffer_attach(struct perf_event *event,
                               struct perf_buffer *rb)
{
        struct perf_buffer *old_rb = NULL;
        unsigned long flags;

        WARN_ON_ONCE(event->parent);

        if (event->rb) {
                /*
                 * Should be impossible, we set this when removing
                 * event->rb_entry and wait/clear when adding event->rb_entry.
                 */
                WARN_ON_ONCE(event->rcu_pending);

                old_rb = event->rb;
                spin_lock_irqsave(&old_rb->event_lock, flags);
                list_del_rcu(&event->rb_entry);
                spin_unlock_irqrestore(&old_rb->event_lock, flags);

                /* Record the RCU state so a later re-add can wait for it. */
                event->rcu_batches = get_state_synchronize_rcu();
                event->rcu_pending = 1;
        }

        if (rb) {
                /* Wait out the grace period recorded when rb_entry was removed. */
                if (event->rcu_pending) {
                        cond_synchronize_rcu(event->rcu_batches);
                        event->rcu_pending = 0;
                }

                spin_lock_irqsave(&rb->event_lock, flags);
                list_add_rcu(&event->rb_entry, &rb->event_list);
                spin_unlock_irqrestore(&rb->event_lock, flags);
        }

        /*
         * Avoid racing with perf_mmap_close(AUX): stop the event
         * before swizzling the event::rb pointer; if it's getting
         * unmapped, its aux_mmap_count will be 0 and it won't
         * restart. See the comment in __perf_pmu_output_stop().
         *
         * Data will inevitably be lost when set_output is done in
         * mid-air, but then again, whoever does it like this is
         * not in for the data anyway.
         */
        if (has_aux(event))
                perf_event_stop(event, 0);

        rcu_assign_pointer(event->rb, rb);

        if (old_rb) {
                ring_buffer_put(old_rb);
                /*
                 * Since we detached before setting the new rb, so that we
                 * could attach the new rb, we could have missed a wakeup.
                 * Provide it now.
                 */
                wake_up_all(&event->waitq);
        }
}

/*
 * Wake the waiters of every event on the event_list of the buffer used by
 * @event. The buffer is taken from the parent when @event has one.
 */
static void ring_buffer_wakeup(struct perf_event *event)
{
        struct perf_event *owner = event->parent ? event->parent : event;
        struct perf_event *waiter;
        struct perf_buffer *rb;

        rcu_read_lock();
        rb = rcu_dereference(owner->rb);
        if (rb) {
                list_for_each_entry_rcu(waiter, &rb->event_list, rb_entry)
                        wake_up_all(&waiter->waitq);
        }
        rcu_read_unlock();
}

/*
 * Take a reference on the buffer used by @event. The buffer is taken from
 * the parent when @event has one.
 *
 * Returns the buffer with its refcount raised, or NULL when there is no
 * buffer or its refcount had already reached zero.
 */
struct perf_buffer *ring_buffer_get(struct perf_event *event)
{
        struct perf_event *owner = event->parent ? event->parent : event;
        struct perf_buffer *rb;

        rcu_read_lock();
        rb = rcu_dereference(owner->rb);
        if (rb && !refcount_inc_not_zero(&rb->refcount))
                rb = NULL;
        rcu_read_unlock();

        return rb;
}

/*
 * Drop one reference on @rb. When it was the last one, warn if events are
 * still linked on rb->event_list and queue rb_free_rcu() through call_rcu().
 */
void ring_buffer_put(struct perf_buffer *rb)
{
        if (refcount_dec_and_test(&rb->refcount)) {
                WARN_ON_ONCE(!list_empty(&rb->event_list));

                call_rcu(&rb->rcu_head, rb_free_rcu);
        }
}

static void perf_mmap_open(struct vm_area_struct *vma)
{
        struct perf_event *event = vma->vm_file->private_data;

        atomic_inc(&event->mmap_count);
        atomic_inc(&event->rb->mmap_count);

        if (vma->vm_pgoff)
                atomic_inc(&event->rb->aux_mmap_count);

        if (event->pmu->event_mapped)
                event->pmu->event_mapped(event, vma->vm_mm);
}

/* Defined later in the file; needed by perf_mmap_close() below. */
static void perf_pmu_output_stop(struct perf_event *event);

/*
 * vm_ops->close handler for perf buffer mappings.
 *
 * A buffer can be mmap()ed multiple times; either directly through the same
 * event, or through other events by use of perf_event_set_output().
 *
 * In order to undo the VM accounting done by perf_mmap() we need to destroy
 * the buffer here, where we still have a VM context. This means we need
 * to detach all events redirecting to us.
 */
static void perf_mmap_close(struct vm_area_struct *vma)
{
        struct perf_event *event = vma->vm_file->private_data;
        /*
         * NOTE(review): the result is dereferenced without a NULL check;
         * this presumably relies on a live mapping always holding a buffer
         * reference -- confirm.
         */
        struct perf_buffer *rb = ring_buffer_get(event);
        struct user_struct *mmap_user = rb->mmap_user;
        int mmap_locked = rb->mmap_locked;
        unsigned long size = perf_data_size(rb);
        bool detach_rest = false;

        if (event->pmu->event_unmapped)
                event->pmu->event_unmapped(event, vma->vm_mm);

        /*
         * rb->aux_mmap_count will always drop before rb->mmap_count and
         * event->mmap_count, so it is ok to use event->mmap_mutex to
         * serialize with perf_mmap here.
         */
        if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
            atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
                /*
                 * Stop all AUX events that are writing to this buffer,
                 * so that we can free its AUX pages and corresponding PMU
                 * data. Note that after rb::aux_mmap_count dropped to zero,
                 * they won't start any more (see perf_aux_output_begin()).
                 */
                perf_pmu_output_stop(event);

                /* now it's safe to free the pages */
                atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
                atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);

                /* this has to be the last one */
                rb_free_aux(rb);
                WARN_ON_ONCE(refcount_read(&rb->aux_refcount));

                mutex_unlock(&event->mmap_mutex);
        }

        /* Last mapping of the buffer itself: every attached event must go. */
        if (atomic_dec_and_test(&rb->mmap_count))
                detach_rest = true;

        /* Not the last mapping made through this event: only drop our rb ref. */
        if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
                goto out_put;

        ring_buffer_attach(event, NULL);
        mutex_unlock(&event->mmap_mutex);

        /* If there's still other mmap()s of this buffer, we're done. */
        if (!detach_rest)
                goto out_put;

        /*
         * No other mmap()s, detach from all other events that might redirect
         * into the now unreachable buffer. Somewhat complicated by the
         * fact that rb::event_lock otherwise nests inside mmap_mutex.
         */
again:
        rcu_read_lock();
        list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
                if (!atomic_long_inc_not_zero(&event->refcount)) {
                        /*
                         * This event is en-route to free_event() which will
                         * detach it and remove it from the list.
                         */
                        continue;
                }
                rcu_read_unlock();

                mutex_lock(&event->mmap_mutex);
                /*
                 * Check we didn't race with perf_event_set_output() which can
                 * swizzle the rb from under us while we were waiting to
                 * acquire mmap_mutex.
                 *
                 * If we find a different rb; ignore this event, a next
                 * iteration will no longer find it on the list. We have to
                 * still restart the iteration to make sure we're not now
                 * iterating the wrong list.
                 */
                if (event->rb == rb)
                        ring_buffer_attach(event, NULL);

                mutex_unlock(&event->mmap_mutex);
                put_event(event);

                /*
                 * Restart the iteration; either we're on the wrong list or
                 * destroyed its integrity by doing a deletion.
                 */
                goto again;
        }
        rcu_read_unlock();

        /*
         * It could be there's still a few 0-ref events on the list; they'll
         * get cleaned up by free_event() -- they'll also still have their
         * ref on the rb and will free it whenever they are done with it.
         *
         * Aside from that, this buffer is 'fully' detached and unmapped,
         * undo the VM accounting.
         */

        /* The "+ 1" covers the user page in front of the data pages. */
        atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
                        &mmap_user->locked_vm);
        atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
        free_uid(mmap_user);

out_put:
        ring_buffer_put(rb); /* could be last */
}

/*
 * VM operations installed on perf buffer mappings by perf_mmap().
 * ->fault and ->page_mkwrite share perf_mmap_fault(), which tells the two
 * apart by testing FAULT_FLAG_MKWRITE.
 */
static const struct vm_operations_struct perf_mmap_vmops = {
        .open                = perf_mmap_open,
        .close                = perf_mmap_close, /* non mergeable */
        .fault                = perf_mmap_fault,
        .page_mkwrite        = perf_mmap_fault,
};

/*
 * mmap() handler for perf event file descriptors.
 *
 * A mapping at page offset 0 covers the user page followed by the data
 * pages; the number of data pages must be zero or a power of two. Any other
 * page offset is treated as an AUX area mapping and must agree with the
 * aux_offset/aux_size values found in the user page of an existing buffer.
 *
 * Pages are charged to the user's locked_vm up to a limit derived from
 * sysctl_perf_event_mlock, and the remainder to the mm's pinned_vm.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL  inherited per-task event, non-shared mapping, or a size/offset
 *            that does not match the requirements above
 *   -EPERM   the pinned_vm charge would exceed RLIMIT_MEMLOCK and the caller
 *            lacks CAP_IPC_LOCK while perf is paranoid
 *   -ENOMEM  rb_alloc() failed
 *   or the error reported by security_perf_event_read() / rb_alloc_aux().
 *
 * On failure the vma is left untouched: neither the vm_ops nor the PMU's
 * event_mapped() hook are applied, because perf_mmap_close() will not run
 * for a mapping that was never established and so could not undo them.
 */
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct perf_event *event = file->private_data;
        unsigned long user_locked, user_lock_limit;
        struct user_struct *user = current_user();
        struct perf_buffer *rb = NULL;
        unsigned long locked, lock_limit;
        unsigned long vma_size;
        unsigned long nr_pages;
        long user_extra = 0, extra = 0;
        int ret = 0, flags = 0;

        /*
         * Don't allow mmap() of inherited per-task counters. This would
         * create a performance issue due to all children writing to the
         * same rb.
         */
        if (event->cpu == -1 && event->attr.inherit)
                return -EINVAL;

        if (!(vma->vm_flags & VM_SHARED))
                return -EINVAL;

        ret = security_perf_event_read(event);
        if (ret)
                return ret;

        vma_size = vma->vm_end - vma->vm_start;

        if (vma->vm_pgoff == 0) {
                /* One page is the user page; the rest are data pages. */
                nr_pages = (vma_size / PAGE_SIZE) - 1;
        } else {
                /*
                 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
                 * mapped, all subsequent mappings should have the same size
                 * and offset. Must be above the normal perf buffer.
                 */
                u64 aux_offset, aux_size;

                if (!event->rb)
                        return -EINVAL;

                nr_pages = vma_size / PAGE_SIZE;

                mutex_lock(&event->mmap_mutex);
                ret = -EINVAL;

                /* Re-read under mmap_mutex; the unlocked check above was only a hint. */
                rb = event->rb;
                if (!rb)
                        goto aux_unlock;

                aux_offset = READ_ONCE(rb->user_page->aux_offset);
                aux_size = READ_ONCE(rb->user_page->aux_size);

                if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
                        goto aux_unlock;

                if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
                        goto aux_unlock;

                /* already mapped with a different offset */
                if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
                        goto aux_unlock;

                if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
                        goto aux_unlock;

                /* already mapped with a different size */
                if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
                        goto aux_unlock;

                if (!is_power_of_2(nr_pages))
                        goto aux_unlock;

                /*
                 * From here on a failure must drop this mmap_count again,
                 * which the "unlock" path does because rb is non-NULL.
                 */
                if (!atomic_inc_not_zero(&rb->mmap_count))
                        goto aux_unlock;

                if (rb_has_aux(rb)) {
                        atomic_inc(&rb->aux_mmap_count);
                        ret = 0;
                        goto unlock;
                }

                atomic_set(&rb->aux_mmap_count, 1);
                user_extra = nr_pages;

                goto accounting;
        }

        /*
         * If we have rb pages ensure they're a power-of-two number, so we
         * can do bitmasks instead of modulo.
         */
        if (nr_pages != 0 && !is_power_of_2(nr_pages))
                return -EINVAL;

        if (vma_size != PAGE_SIZE * (1 + nr_pages))
                return -EINVAL;

        WARN_ON_ONCE(event->ctx->parent_ctx);
again:
        mutex_lock(&event->mmap_mutex);
        if (event->rb) {
                if (data_page_nr(event->rb) != nr_pages) {
                        ret = -EINVAL;
                        goto unlock;
                }

                if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
                        /*
                         * Raced against perf_mmap_close(); remove the
                         * event and try again.
                         */
                        ring_buffer_attach(event, NULL);
                        mutex_unlock(&event->mmap_mutex);
                        goto again;
                }

                /* Reusing the existing buffer: nothing new to account. */
                goto unlock;
        }

        user_extra = nr_pages + 1;

accounting:
        user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);

        /*
         * Increase the limit linearly with more CPUs:
         */
        user_lock_limit *= num_online_cpus();

        user_locked = atomic_long_read(&user->locked_vm);

        /*
         * sysctl_perf_event_mlock may have changed, so that
         *     user->locked_vm > user_lock_limit
         */
        if (user_locked > user_lock_limit)
                user_locked = user_lock_limit;
        user_locked += user_extra;

        if (user_locked > user_lock_limit) {
                /*
                 * charge locked_vm until it hits user_lock_limit;
                 * charge the rest from pinned_vm
                 */
                extra = user_locked - user_lock_limit;
                user_extra -= extra;
        }

        lock_limit = rlimit(RLIMIT_MEMLOCK);
        lock_limit >>= PAGE_SHIFT;
        locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;

        if ((locked > lock_limit) && perf_is_paranoid() &&
                !capable(CAP_IPC_LOCK)) {
                ret = -EPERM;
                goto unlock;
        }

        WARN_ON(!rb && event->rb);

        if (vma->vm_flags & VM_WRITE)
                flags |= RING_BUFFER_WRITABLE;

        if (!rb) {
                /* Data buffer mapping: allocate and attach a fresh buffer. */
                rb = rb_alloc(nr_pages,
                              event->attr.watermark ? event->attr.wakeup_watermark : 0,
                              event->cpu, flags);

                if (!rb) {
                        ret = -ENOMEM;
                        goto unlock;
                }

                atomic_set(&rb->mmap_count, 1);
                rb->mmap_user = get_current_user();
                rb->mmap_locked = extra;

                ring_buffer_attach(event, rb);

                perf_event_update_time(event);
                perf_event_init_userpage(event);
                perf_event_update_userpage(event);
        } else {
                /* AUX mapping on an existing buffer. */
                ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
                                   event->attr.aux_watermark, flags);
                if (!ret)
                        rb->aux_mmap_locked = extra;
        }

unlock:
        if (!ret) {
                atomic_long_add(user_extra, &user->locked_vm);
                atomic64_add(extra, &vma->vm_mm->pinned_vm);

                atomic_inc(&event->mmap_count);
        } else if (rb) {
                atomic_dec(&rb->mmap_count);
        }
aux_unlock:
        mutex_unlock(&event->mmap_mutex);

        /*
         * Bail out before touching the vma on failure: perf_mmap_close()
         * is not invoked for a failed mmap(), so calling event_mapped()
         * here would leave the PMU with a "mapped" notification that is
         * never balanced by event_unmapped().
         */
        if (ret)
                return ret;

        /*
         * Since pinned accounting is per vm we cannot allow fork() to copy our
         * vma.
         */
        vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
        vma->vm_ops = &perf_mmap_vmops;

        if (event->pmu->event_mapped)
                event->pmu->event_mapped(event, vma->vm_mm);

        return ret;
}

/*
 * fasync() handler: update the event's fasync list through fasync_helper()
 * while holding the inode lock. Negative results are passed through; any
 * non-negative result is reported as 0.
 */
static int perf_fasync(int fd, struct file *filp, int on)
{
        struct perf_event *event = filp->private_data;
        struct inode *inode = file_inode(filp);
        int rc;

        inode_lock(inode);
        rc = fasync_helper(fd, filp, on, &event->fasync);
        inode_unlock(inode);

        return rc < 0 ? rc : 0;
}

/*
 * File operations for perf event file descriptors. Seeking is routed to
 * no_llseek, and the compat ioctl entry is perf_compat_ioctl, which is
 * defined as NULL when CONFIG_COMPAT is not set.
 */
static const struct file_operations perf_fops = {
        .llseek                        = no_llseek,
        .release                = perf_release,
        .read                        = perf_read,
        .poll                        = perf_poll,
        .unlocked_ioctl                = perf_ioctl,
        .compat_ioctl                = perf_compat_ioctl,
        .mmap                        = perf_mmap,
        .fasync                        = perf_fasync,
};

/*
 * Perf event wakeup
 *
 * If there's data, ensure we set the poll() state and publish everything
 * to user-space before waking everybody up.
 */

/*
 * Wake everybody waiting on the event's buffer, then deliver SIGIO with the
 * recorded pending_kill band (if any) and clear it.
 */
void perf_event_wakeup(struct perf_event *event)
{
        ring_buffer_wakeup(event);

        if (!event->pending_kill)
                return;

        kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
        event->pending_kill = 0;
}

/*
 * Send the perf signal for @event to the current task via send_sig_perf(),
 * unless the event's context belongs to another task (warned about once) or
 * current is exiting.
 */
static void perf_sigtrap(struct perf_event *event)
{
        /*
         * A mismatch is only expected when the irq_work was delayed and
         * either ctx->task or current changed in the meantime, which can
         * happen on architectures that do not implement
         * arch_irq_work_raise().
         */
        if (WARN_ON_ONCE(event->ctx->task != current))
                return;

        /*
         * Both perf_pending_task() and perf_pending_irq() can race with the
         * task exiting; skip the signal in that case.
         */
        if (!(current->flags & PF_EXITING))
                send_sig_perf((void __user *)event->pending_addr,
                              event->orig_type, event->attr.sig_data);
}

/*
 * Deliver the pending sigtrap/disable work on the CPU the event is running
 * on: handle it right here when that is this CPU, otherwise re-queue the
 * irq_work on the event's CPU.
 */
static void __perf_pending_irq(struct perf_event *event)
{
        int target_cpu = READ_ONCE(event->oncpu);

        /*
         * If the event isn't running, we are done; event_sched_out() will
         * have taken care of things.
         */
        if (target_cpu < 0)
                return;

        if (target_cpu != smp_processor_id()) {
                /*
                 *  CPU-A                        CPU-B
                 *
                 *  perf_event_disable_inatomic()
                 *    @pending_disable = CPU-A;
                 *    irq_work_queue();
                 *
                 *  sched-out
                 *    @pending_disable = -1;
                 *
                 *                                sched-in
                 *                                perf_event_disable_inatomic()
                 *                                  @pending_disable = CPU-B;
                 *                                  irq_work_queue(); // FAILS
                 *
                 *  irq_work_run()
                 *    perf_pending_irq()
                 *
                 * But the event runs on CPU-B and wants disabling there.
                 */
                irq_work_queue_on(&event->pending_irq, target_cpu);
                return;
        }

        /*
         * Yay, we hit home and are in the context of the event.
         */
        if (event->pending_sigtrap) {
                event->pending_sigtrap = 0;
                perf_sigtrap(event);
                local_dec(&event->ctx->nr_pending);
        }

        if (event->pending_disable) {
                event->pending_disable = 0;
                perf_event_disable_local(event);
        }
}

/*
 * irq_work callback for event->pending_irq: issue a pending wakeup, then
 * run the CPU-bound pending work through __perf_pending_irq(), all inside a
 * software-event recursion context when one could be obtained.
 */
static void perf_pending_irq(struct irq_work *entry)
{
        struct perf_event *event;
        int recursion_ctx;

        event = container_of(entry, struct perf_event, pending_irq);

        /*
         * Failing to get a recursion context is fine: it means recursion is
         * already disabled and we won't recurse any 'further'.
         */
        recursion_ctx = perf_swevent_get_recursion_context();

        /*
         * The wakeup isn't bound to the context of the event -- it can
         * happen irrespective of where the event is.
         */
        if (event->pending_wakeup) {
                event->pending_wakeup = 0;
                perf_event_wakeup(event);
        }

        __perf_pending_irq(event);

        if (recursion_ctx >= 0)
                perf_swevent_put_recursion_context(recursion_ctx);
}

/*
 * Callback for event->pending_task: deliver a pending sigtrap with
 * preemption disabled and inside a software-event recursion context when
 * one could be obtained, then drop an event reference with put_event().
 */
static void perf_pending_task(struct callback_head *head)
{
        struct perf_event *event;
        int recursion_ctx;

        event = container_of(head, struct perf_event, pending_task);

        preempt_disable_notrace();

        /*
         * Failing to get a recursion context is fine: it means recursion is
         * already disabled and we won't recurse any 'further'.
         */
        recursion_ctx = perf_swevent_get_recursion_context();

        if (event->pending_work) {
                event->pending_work = 0;
                perf_sigtrap(event);
                local_dec(&event->ctx->nr_pending);
        }

        if (recursion_ctx >= 0)
                perf_swevent_put_recursion_context(recursion_ctx);

        preempt_enable_notrace();

        put_event(event);
}

#ifdef CONFIG_GUEST_PERF_EVENTS
/*
 * Currently registered guest callbacks, or NULL when none are registered.
 * Written with rcu_assign_pointer() by the register/unregister functions
 * below.
 */
struct perf_guest_info_callbacks __rcu *perf_guest_cbs;

/*
 * Static calls mirroring the individual callbacks. The unregister path
 * points them at __static_call_return0; the register path points them at
 * the callbacks supplied by the caller.
 */
DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);

/*
 * Install @cbs as the guest callbacks and retarget the matching static
 * calls. Warns once and does nothing when callbacks are already registered.
 */
void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
        /* Only one set of callbacks may be registered at a time. */
        if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
                return;

        rcu_assign_pointer(perf_guest_cbs, cbs);
        static_call_update(__perf_guest_state, cbs->state);
        static_call_update(__perf_guest_get_ip, cbs->get_ip);

        /* Implementing ->handle_intel_pt_intr is optional. */
        if (cbs->handle_intel_pt_intr)
                static_call_update(__perf_guest_handle_intel_pt_intr,
                                   cbs->handle_intel_pt_intr);
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);

/*
 * Remove @cbs as the guest callbacks: clear the pointer, point every static
 * call back at __static_call_return0 and wait with synchronize_rcu() before
 * returning. Warns once and does nothing when @cbs is not the registered
 * set.
 */
void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
        /* Refuse to unregister callbacks other than the ones installed. */
        if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
                return;

        rcu_assign_pointer(perf_guest_cbs, NULL);
        static_call_update(__perf_guest_state, (void *)&__static_call_return0);
        static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
        static_call_update(__perf_guest_handle_intel_pt_intr,
                           (void *)&__static_call_return0);
        synchronize_rcu();
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
#endif

/*
 * Emit one u64 per bit set in @mask, in ascending bit order; each value is
 * obtained with perf_reg_value(@regs, bit) and written to @handle.
 */
static void
perf_output_sample_regs(struct perf_output_handle *handle,
                        struct pt_regs *regs, u64 mask)
{
        DECLARE_BITMAP(reg_bits, 64);
        int idx;

        bitmap_from_u64(reg_bits, mask);
        for_each_set_bit(idx, reg_bits, sizeof(mask) * BITS_PER_BYTE) {
                u64 reg_val = perf_reg_value(regs, idx);

                perf_output_put(handle, reg_val);
        }
}

/*
 * Fill @regs_user with the user-level register state for a sample:
 *  - @regs already describes user mode: use it as is;
 *  - kernel thread (PF_KTHREAD): no user registers at all;
 *  - otherwise: let perf_get_regs_user() fill it in.
 */
static void perf_sample_regs_user(struct perf_regs *regs_user,
                                  struct pt_regs *regs)
{
        if (user_mode(regs)) {
                regs_user->abi = perf_reg_abi(current);
                regs_user->regs = regs;
                return;
        }

        if (current->flags & PF_KTHREAD) {
                regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
                regs_user->regs = NULL;
                return;
        }

        perf_get_regs_user(regs_user, regs);
}

/*
 * Record @regs together with the register ABI of the current task in
 * @regs_intr.
 */
static void perf_sample_regs_intr(struct perf_regs *regs_intr,
                                  struct pt_regs *regs)
{
        regs_intr->abi  = perf_reg_abi(current);
        regs_intr->regs = regs;
}


/*
 * Return the number of bytes between the user stack pointer in @regs and
 * TASK_SIZE, or 0 when the stack pointer is NULL or not below TASK_SIZE.
 *
 * Bounding by the stack vma would be more precise, but there is no safe
 * way to look it up from interrupt context, so TASK_SIZE serves as the
 * limit.
 */
static u64 perf_ustack_task_size(struct pt_regs *regs)
{
        unsigned long sp = perf_user_stack_pointer(regs);

        return (sp && sp < TASK_SIZE) ? TASK_SIZE - sp : 0;
}

/*
 * Work out how many bytes of user stack can be dumped into a sample.
 *
 * @stack_size:  size requested for the dump
 * @header_size: sample size accumulated so far
 * @regs:        user registers, or NULL when there are none
 *
 * Returns 0 without @regs. Otherwise the request is first capped by the
 * space between the user stack pointer and TASK_SIZE (and by USHRT_MAX),
 * and then shrunk if, together with the header, it would overflow a u16.
 */
static u16
perf_sample_ustack_size(u16 stack_size, u16 header_size,
                        struct pt_regs *regs)
{
        u64 avail;

        /* No regs, no stack pointer, no dump. */
        if (!regs)
                return 0;

        /* Never ask for more than what lies below TASK_SIZE. */
        avail = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
        stack_size = min(stack_size, (u16) avail);

        /* Current header size plus static size and dynamic size. */
        header_size += 2 * sizeof(u64);

        /* No wrap-around in 16 bits: the dump fits as is. */
        if ((u16) (header_size + stack_size) >= header_size)
                return stack_size;

        /*
         * The sample would overflow its maximum size, so customize the
         * stack dump size to fit in.
         */
        stack_size = USHRT_MAX - header_size - sizeof(u64);

        return round_up(stack_size, sizeof(u64));
}

/*
 * Write the user stack portion of a sample to @handle.
 *
 * Without @regs a single zero u64 is emitted. Otherwise the layout is:
 *   static size  - @dump_size, the size requested by the user or the best
 *                  one that fits into the sample
 *   data         - @dump_size bytes reserved for the stack contents
 *   dynamic size - the number of bytes that were actually copied
 */
static void
perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
                          struct pt_regs *regs)
{
        unsigned long user_sp;
        unsigned int not_copied;
        u64 dyn_size;

        /* Case of a kernel thread, nothing to dump */
        if (!regs) {
                u64 zero = 0;

                perf_output_put(handle, zero);
                return;
        }

        /* Static size. */
        perf_output_put(handle, dump_size);

        /* Data: copy what we can, then skip over the part left uncopied. */
        user_sp = perf_user_stack_pointer(regs);
        not_copied = __output_copy_user(handle, (void *) user_sp, dump_size);
        perf_output_skip(handle, not_copied);

        /* Dynamic size. */
        dyn_size = dump_size - not_copied;
        perf_output_put(handle, dyn_size);
}

/*
 * Decide how many bytes of AUX data will accompany a sample of @event and
 * store that in data->aux_size.
 *
 * The size is 0 when the event has no aux_event, when that sampler is not
 * active on this CPU (warned about once), when it has no buffer, or when
 * the buffer is already being sampled. Otherwise it is @size capped by
 * perf_aux_size() and aligned up to a multiple of sizeof(u64).
 *
 * Returns the value stored in data->aux_size.
 */
static unsigned long perf_prepare_sample_aux(struct perf_event *event,
                                          struct perf_sample_data *data,
                                          size_t size)
{
        struct perf_event *sampler = event->aux_event;
        struct perf_buffer *rb;

        data->aux_size = 0;

        if (!sampler)
                return 0;

        if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
                return 0;

        if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
                return 0;

        rb = ring_buffer_get(sampler);
        if (!rb)
                return 0;

        /*
         * If this is an NMI hit inside sampling code, don't take the
         * sample (aux_size stays 0). See also perf_aux_sample_output().
         */
        if (!READ_ONCE(rb->aux_in_sampling)) {
                size = min_t(size_t, size, perf_aux_size(rb));
                data->aux_size = ALIGN(size, sizeof(u64));
        }
        ring_buffer_put(rb);

        return data->aux_size;
}

/*
 * Invoke the PMU's ->snapshot_aux() callback for @event with local
 * interrupts disabled and rb->aux_in_sampling set for the duration of
 * the call.
 *
 * @rb:     buffer whose ->aux_in_sampling flag brackets the callback
 * @event:  event whose ->pmu provides ->snapshot_aux()
 * @handle: output handle handed through to the callback
 * @size:   size handed through to the callback
 *
 * Returns whatever ->snapshot_aux() returned.
 */
static long perf_pmu_snapshot_aux(struct perf_buffer *rb,
                                 struct perf_event *event,
                                 struct perf_output_handle *handle,
                                 unsigned long size)
{
        unsigned long flags;
        long ret;

        /*
         * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
         * paths. If we start calling them in NMI context, they may race with
         * the IRQ ones, that is, for example, re-starting an event that's just
         * been stopped, which is why we're using a separate callback that
         * doesn't change the event state.
         *
         * IRQs need to be disabled to prevent IPIs from racing with us.
         */
        local_irq_save(flags);
        /*
         * Guard against NMI hits inside the critical section;
         * see also perf_prepare_sample_aux().
         */
        WRITE_ONCE(rb->aux_in_sampling, 1);
        /* Compiler barrier: keep the flag store ahead of the callback. */
        barrier();

        ret = event->pmu->snapshot_aux(event, handle, size);

        /* Compiler barrier: keep the callback ahead of clearing the flag. */
        barrier();
        WRITE_ONCE(rb->aux_in_sampling, 0);
        local_irq_restore(flags);

        return ret;
}

/*
 * Emit the AUX portion of a sample: snapshot data->aux_size bytes from the
 * AUX event's buffer into @handle, then zero-fill whatever the snapshot
 * left short of data->aux_size.
 *
 * @event:  sampled event; event->aux_event supplies the AUX data
 * @handle: output handle the sample is being written to
 * @data:   sample data; ->aux_size must be non-zero
 */
static void perf_aux_sample_output(struct perf_event *event,
                                   struct perf_output_handle *handle,
                                   struct perf_sample_data *data)
{
        struct perf_event *sampler = event->aux_event;
        struct perf_buffer *rb;
        unsigned long pad;
        long size;

        if (WARN_ON_ONCE(!sampler || !data->aux_size))
                return;

        rb = ring_buffer_get(sampler);
        if (!rb)
                return;

        size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);

        /*
         * A negative size means perf_output_copy() failed (returned a
         * non-zero surplus that it didn't copy). Its current implementation
         * cannot do that; warn if this ever changes and skip the padding.
         */
        if (!WARN_ON_ONCE(size < 0)) {
                /*
                 * The pad stems from perf_prepare_sample_aux() ALIGN()ing
                 * data->aux_size up to u64, so it should stay below that.
                 */
                pad = data->aux_size - size;
                if (WARN_ON_ONCE(pad >= sizeof(u64)))
                        pad = 8;

                if (pad) {
                        u64 zero = 0;

                        perf_output_copy(handle, &zero, pad);
                }
        }

        ring_buffer_put(rb);
}

/*
 * A set of common sample data types saved even for non-sample records
 * when event->attr.sample_id_all is set.
 *
 * __perf_event_header__init_id() marks whichever of these bits are present
 * in the event's sample_type as done in data->sample_flags.
 */
#define PERF_SAMPLE_ID_ALL  (PERF_SAMPLE_TID | PERF_SAMPLE_TIME |        \
                             PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID |        \
                             PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER)

/*
 * Fill in the identification fields of @data (pid/tid, time, id, stream id,
 * cpu) that @sample_type selects.
 *
 * Regardless of @sample_type, data->type is set to the event's full
 * attr.sample_type and the PERF_SAMPLE_ID_ALL bits found in it are OR-ed
 * into data->sample_flags.
 */
static void __perf_event_header__init_id(struct perf_sample_data *data,
                                         struct perf_event *event,
                                         u64 sample_type)
{
        const u64 attr_type = event->attr.sample_type;

        data->type = attr_type;
        data->sample_flags |= attr_type & PERF_SAMPLE_ID_ALL;

        if (sample_type & PERF_SAMPLE_TID) {
                /* namespace issues */
                data->tid_entry.pid = perf_event_pid(event, current);
                data->tid_entry.tid = perf_event_tid(event, current);
        }

        if (sample_type & PERF_SAMPLE_TIME) {
                data->time = perf_event_clock(event);
        }

        /* ID and IDENTIFIER both carry the same value. */
        if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) {
                data->id = primary_event_id(event);
        }

        if (sample_type & PERF_SAMPLE_STREAM_ID) {
                data->stream_id = event->id;
        }

        if (sample_type & PERF_SAMPLE_CPU) {
                data->cpu_entry.cpu = raw_smp_processor_id();
                data->cpu_entry.reserved = 0;
        }
}

/*
 * For events with attr.sample_id_all set, grow header->size by the event's
 * id_header_size and fill the ID fields of @data according to the event's
 * sample_type. Does nothing otherwise.
 */
void perf_event_header__init_id(struct perf_event_header *header,
                                struct perf_sample_data *data,
                                struct perf_event *event)
{
        if (!event->attr.sample_id_all)
                return;

        header->size += event->id_header_size;
        __perf_event_header__init_id(data, event, event->attr.sample_type);
}

/*
 * Write the ID fields selected by data->type to @handle, in this order:
 * tid entry, time, id, stream id, cpu entry, identifier.
 */
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
                                           struct perf_sample_data *data)
{
        const u64 type = data->type;

        if (type & PERF_SAMPLE_TID) {
                perf_output_put(handle, data->tid_entry);
        }

        if (type & PERF_SAMPLE_TIME) {
                perf_output_put(handle, data->time);
        }

        if (type & PERF_SAMPLE_ID) {
                perf_output_put(handle, data->id);
        }

        if (type & PERF_SAMPLE_STREAM_ID) {
                perf_output_put(handle, data->stream_id);
        }

        if (type & PERF_SAMPLE_CPU) {
                perf_output_put(handle, data->cpu_entry);
        }

        /* The identifier repeats data->id as the final field. */
        if (type & PERF_SAMPLE_IDENTIFIER) {
                perf_output_put(handle, data->id);
        }
}

/*
 * Write the ID fields of @sample to @handle, but only for events that have
 * attr.sample_id_all set.
 */
void perf_event__output_id_sample(struct perf_event *event,
                                  struct perf_output_handle *handle,
                                  struct perf_sample_data *sample)
{
        if (!event->attr.sample_id_all)
                return;

        __perf_event__output_id_sample(handle, sample);
}

/*
 * Write the read values of a single (non-group) event to @handle.
 *
 * The count always comes first; total time enabled, total time running,
 * the event ID and the lost-sample count follow, each only when the
 * matching bit is set in the event's read_format.
 *
 * @enabled/@running: time values to which the event's child totals are added
 */
static void perf_output_read_one(struct perf_output_handle *handle,
                                 struct perf_event *event,
                                 u64 enabled, u64 running)
{
        u64 read_format = event->attr.read_format;
        u64 values[5];
        u64 *cur = values;

        *cur++ = perf_event_count(event);

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                *cur++ = enabled +
                         atomic64_read(&event->child_total_time_enabled);

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                *cur++ = running +
                         atomic64_read(&event->child_total_time_running);

        if (read_format & PERF_FORMAT_ID)
                *cur++ = primary_event_id(event);

        if (read_format & PERF_FORMAT_LOST)
                *cur++ = atomic64_read(&event->lost_samples);

        /* Only the slots actually filled are copied out. */
        __output_copy(handle, values, (cur - values) * sizeof(u64));
}

/*
 * Store one group member's values into @values starting at index @n:
 * its count, then (per @read_format) its ID and its lost-sample count.
 * A member other than @event that is in the ACTIVE state is refreshed
 * through its PMU's ->read() first.
 *
 * Returns the index one past the last slot written.
 */
static int perf_output_read_group_member(struct perf_event *event,
                                         struct perf_event *member,
                                         u64 read_format, u64 *values, int n)
{
        if ((member != event) &&
            (member->state == PERF_EVENT_STATE_ACTIVE))
                member->pmu->read(member);

        values[n++] = perf_event_count(member);

        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(member);

        if (read_format & PERF_FORMAT_LOST)
                values[n++] = atomic64_read(&member->lost_samples);

        return n;
}

/*
 * Write the read values of @event's whole group to @handle.
 *
 * The first chunk holds the number of group members, the optional
 * @enabled and @running times, and the leader's values; one chunk per
 * sibling follows.
 */
static void perf_output_read_group(struct perf_output_handle *handle,
                            struct perf_event *event,
                            u64 enabled, u64 running)
{
        struct perf_event *leader = event->group_leader, *sub;
        u64 read_format = event->attr.read_format;
        unsigned long flags;
        u64 values[6];
        int n = 0;

        /*
         * With interrupts off no counter scheduling can happen
         * (context switches, timer based rotation and IPIs).
         */
        local_irq_save(flags);

        /* Group header: member count plus the optional times. */
        values[n++] = 1 + leader->nr_siblings;

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                values[n++] = enabled;

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                values[n++] = running;

        /* The leader's values share the first chunk with the header. */
        n = perf_output_read_group_member(event, leader, read_format,
                                          values, n);
        __output_copy(handle, values, n * sizeof(u64));

        for_each_sibling_event(sub, leader) {
                n = perf_output_read_group_member(event, sub, read_format,
                                                  values, 0);
                __output_copy(handle, values, n * sizeof(u64));
        }

        local_irq_restore(flags);
}

/*
 * Both total-time read_format bits; perf_output_read() computes the timer
 * values only when at least one of them is requested.
 */
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
                                 PERF_FORMAT_TOTAL_TIME_RUNNING)

/*
 * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
 *
 * The problem is that it is both hard and excessively expensive to iterate
 * the child list, not to mention that it is impossible to IPI the children
 * running on another CPU, from interrupt/NMI context.
 *
 * Writes the event's read values to @handle, in group or single-event
 * layout depending on PERF_FORMAT_GROUP in the event's read_format.
 */
static void perf_output_read(struct perf_output_handle *handle,
                             struct perf_event *event)
{
        u64 read_format = event->attr.read_format;
        u64 enabled = 0, running = 0, now;

        /*
         * Compute total_time_enabled and total_time_running from the
         * snapshot values taken when the event was last scheduled in.
         *
         * We cannot simply call update_context_time() because of locking
         * issues, as we are called in NMI context.
         */
        if (read_format & PERF_FORMAT_TOTAL_TIMES)
                calc_timer_values(event, &now, &enabled, &running);

        if (read_format & PERF_FORMAT_GROUP) {
                perf_output_read_group(handle, event, enabled, running);
                return;
        }

        perf_output_read_one(handle, event, enabled, running);
}

/*
 * Write a sample record to @handle: first *@header, then every field
 * selected by data->type, in the fixed order of the checks below.
 *
 * @handle: output handle to write into
 * @header: record header, copied out verbatim as the first item
 * @data:   prepared sample data providing the field values
 * @event:  event the sample belongs to (consulted for read values,
 *          register masks, branch-stack options and wakeup settings)
 *
 * After the fields are written, wakeup accounting is done for events that
 * are not in watermark mode.
 */
void perf_output_sample(struct perf_output_handle *handle,
                        struct perf_event_header *header,
                        struct perf_sample_data *data,
                        struct perf_event *event)
{
        u64 sample_type = data->type;

        perf_output_put(handle, *header);

        if (sample_type & PERF_SAMPLE_IDENTIFIER)
                perf_output_put(handle, data->id);

        if (sample_type & PERF_SAMPLE_IP)
                perf_output_put(handle, data->ip);

        if (sample_type & PERF_SAMPLE_TID)
                perf_output_put(handle, data->tid_entry);

        if (sample_type & PERF_SAMPLE_TIME)
                perf_output_put(handle, data->time);

        if (sample_type & PERF_SAMPLE_ADDR)
                perf_output_put(handle, data->addr);

        if (sample_type & PERF_SAMPLE_ID)
                perf_output_put(handle, data->id);

        if (sample_type & PERF_SAMPLE_STREAM_ID)
                perf_output_put(handle, data->stream_id);

        if (sample_type & PERF_SAMPLE_CPU)
                perf_output_put(handle, data->cpu_entry);

        if (sample_type & PERF_SAMPLE_PERIOD)
                perf_output_put(handle, data->period);

        if (sample_type & PERF_SAMPLE_READ)
                perf_output_read(handle, event);

        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
                /* One u64 slot besides the ->nr entries themselves. */
                int size = 1;

                size += data->callchain->nr;
                size *= sizeof(u64);
                __output_copy(handle, data->callchain, size);
        }

        if (sample_type & PERF_SAMPLE_RAW) {
                struct perf_raw_record *raw = data->raw;

                if (raw) {
                        struct perf_raw_frag *frag = &raw->frag;

                        perf_output_put(handle, raw->size);
                        /*
                         * Emit each fragment in turn, through its own
                         * ->copy callback when it has one.
                         */
                        do {
                                if (frag->copy) {
                                        __output_custom(handle, frag->copy,
                                                        frag->data, frag->size);
                                } else {
                                        __output_copy(handle, frag->data,
                                                      frag->size);
                                }
                                if (perf_raw_frag_last(frag))
                                        break;
                                frag = frag->next;
                        } while (1);
                        /* Skip over the padding of the last fragment. */
                        if (frag->pad)
                                __output_skip(handle, NULL, frag->pad);
                } else {
                        /* No raw data: emit a size word and one zero u32. */
                        struct {
                                u32        size;
                                u32        data;
                        } raw = {
                                .size = sizeof(u32),
                                .data = 0,
                        };
                        perf_output_put(handle, raw);
                }
        }

        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
                if (data->br_stack) {
                        size_t size;

                        size = data->br_stack->nr
                             * sizeof(struct perf_branch_entry);

                        perf_output_put(handle, data->br_stack->nr);
                        if (branch_sample_hw_index(event))
                                perf_output_put(handle, data->br_stack->hw_idx);
                        perf_output_copy(handle, data->br_stack->entries, size);
                        /*
                         * Add the extension space which is appended
                         * right after the struct perf_branch_stack.
                         */
                        if (data->br_stack_cntr) {
                                size = data->br_stack->nr * sizeof(u64);
                                perf_output_copy(handle, data->br_stack_cntr, size);
                        }
                } else {
                        /*
                         * we always store at least the value of nr
                         */
                        u64 nr = 0;
                        perf_output_put(handle, nr);
                }
        }

        if (sample_type & PERF_SAMPLE_REGS_USER) {
                u64 abi = data->regs_user.abi;

                /*
                 * If there are no regs to dump, notice it through
                 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
                 */
                perf_output_put(handle, abi);

                if (abi) {
                        u64 mask = event->attr.sample_regs_user;
                        perf_output_sample_regs(handle,
                                                data->regs_user.regs,
                                                mask);
                }
        }

        if (sample_type & PERF_SAMPLE_STACK_USER) {
                perf_output_sample_ustack(handle,
                                          data->stack_user_size,
                                          data->regs_user.regs);
        }

        if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
                perf_output_put(handle, data->weight.full);

        if (sample_type & PERF_SAMPLE_DATA_SRC)
                perf_output_put(handle, data->data_src.val);

        if (sample_type & PERF_SAMPLE_TRANSACTION)
                perf_output_put(handle, data->txn);

        if (sample_type & PERF_SAMPLE_REGS_INTR) {
                u64 abi = data->regs_intr.abi;
                /*
                 * If there are no regs to dump, notice it through
                 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
                 */
                perf_output_put(handle, abi);

                if (abi) {
                        u64 mask = event->attr.sample_regs_intr;

                        perf_output_sample_regs(handle,
                                                data->regs_intr.regs,
                                                mask);
                }
        }

        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
                perf_output_put(handle, data->phys_addr);

        if (sample_type & PERF_SAMPLE_CGROUP)
                perf_output_put(handle, data->cgroup);

        if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
                perf_output_put(handle, data->data_page_size);

        if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
                perf_output_put(handle, data->code_page_size);

        if (sample_type & PERF_SAMPLE_AUX) {
                /* The size word is written even when no AUX data follows. */
                perf_output_put(handle, data->aux_size);

                if (data->aux_size)
                        perf_aux_sample_output(event, handle, data);
        }

        /*
         * Without watermark mode, count this sample towards
         * attr.wakeup_events; each time the count reaches that threshold,
         * take the threshold back off the counter and bump rb->wakeup.
         */
        if (!event->attr.watermark) {
                int wakeup_events = event->attr.wakeup_events;

                if (wakeup_events) {
                        struct perf_buffer *rb = handle->rb;
                        int events = local_inc_return(&rb->events);

                        if (events >= wakeup_events) {
                                local_sub(wakeup_events, &rb->events);
                                local_inc(&rb->wakeup);
                        }
                }
        }
}

/*
 * Translate the virtual address @virt into a physical address.
 *
 * Returns 0 when @virt is 0 or when no translation is attempted or found:
 * an address at or above TASK_SIZE that is not virt_addr_valid() or lies
 * in the vmalloc range, or a lower address when current has no mm or the
 * fast page lookup fails.
 */
static u64 perf_virt_to_phys(u64 virt)
{
        struct page *p;
        u64 phys_addr = 0;

        if (!virt)
                return 0;

        if (virt >= TASK_SIZE) {
                if (!virt_addr_valid((void *)(uintptr_t)virt))
                        return 0;

                /* If it's vmalloc()d memory, report 0. */
                if (virt >= VMALLOC_START && virt < VMALLOC_END)
                        return 0;

                return (u64)virt_to_phys((void *)(uintptr_t)virt);
        }

        /*
         * User address: walk the page tables. Interrupts are disabled,
         * which prevents any tear down of the page tables.
         * Only the IRQ-safe get_user_page_fast_only() is tried; if it
         * fails, phys_addr stays 0.
         */
        if (current->mm == NULL)
                return 0;

        pagefault_disable();
        if (get_user_page_fast_only(virt, 0, &p)) {
                phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
                put_page(p);
        }
        pagefault_enable();

        return phys_addr;
}

/*
 * Return the pagetable size of a given virtual address.
 *
 * Walks @mm's page tables for @addr one level at a time (pgd, p4d, pud,
 * pmd, pte) and returns the leaf size reported at the first level that
 * maps @addr as a leaf. Returns 0 when an entry on the way is none or not
 * present, and always 0 when CONFIG_HAVE_GUP_FAST is not enabled.
 */
static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
{
        u64 size = 0;

#ifdef CONFIG_HAVE_GUP_FAST
        pgd_t *pgdp, pgd;
        p4d_t *p4dp, p4d;
        pud_t *pudp, pud;
        pmd_t *pmdp, pmd;
        pte_t *ptep, pte;

        /* Each level is read once into a local copy before being tested. */
        pgdp = pgd_offset(mm, addr);
        pgd = READ_ONCE(*pgdp);
        if (pgd_none(pgd))
                return 0;

        if (pgd_leaf(pgd))
                return pgd_leaf_size(pgd);

        p4dp = p4d_offset_lockless(pgdp, pgd, addr);
        p4d = READ_ONCE(*p4dp);
        if (!p4d_present(p4d))
                return 0;

        if (p4d_leaf(p4d))
                return p4d_leaf_size(p4d);

        pudp = pud_offset_lockless(p4dp, p4d, addr);
        pud = READ_ONCE(*pudp);
        if (!pud_present(pud))
                return 0;

        if (pud_leaf(pud))
                return pud_leaf_size(pud);

        pmdp = pmd_offset_lockless(pudp, pud, addr);
again:
        pmd = pmdp_get_lockless(pmdp);
        if (!pmd_present(pmd))
                return 0;

        if (pmd_leaf(pmd))
                return pmd_leaf_size(pmd);

        /* If the pte table cannot be mapped, re-read the pmd and retry. */
        ptep = pte_offset_map(&pmd, addr);
        if (!ptep)
                goto again;

        pte = ptep_get_lockless(ptep);
        if (pte_present(pte))
                size = pte_leaf_size(pte);
        pte_unmap(ptep);
#endif /* CONFIG_HAVE_GUP_FAST */

        return size;
}

/*
 * Return the page-table mapping size for @addr, looked up in current's mm,
 * or in init_mm when current has no mm. Returns 0 for a zero @addr.
 */
static u64 perf_get_page_size(unsigned long addr)
{
        unsigned long flags;
        u64 size;

        if (!addr)
                return 0;

        /*
         * Software page-table walkers must disable IRQs,
         * which prevents any tear down of the page tables.
         */
        local_irq_save(flags);

        /*
         * Kernel threads and the like have no mm; fall back to init_mm
         * so that kernel memory can still be found.
         */
        size = perf_get_pgtable_size(current->mm ?: &init_mm, addr);

        local_irq_restore(flags);

        return size;
}

/* Zero-entry callchain that perf_callchain() returns instead of NULL. */
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };

/*
 * Collect the callchain for a sample of @event taken at @regs.
 *
 * Kernel and user parts are included unless excluded by the event's
 * attributes. Never returns NULL: when both parts are excluded, or when
 * get_perf_callchain() yields nothing, the empty callchain is returned.
 */
struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
        bool kernel = !event->attr.exclude_callchain_kernel;
        bool user   = !event->attr.exclude_callchain_user;
        /* Disallow cross-task user callchains. */
        bool crosstask = event->ctx->task && event->ctx->task != current;
        const u32 max_stack = event->attr.sample_max_stack;
        struct perf_callchain_entry *callchain;

        if (!(kernel || user))
                return &__empty_callchain;

        callchain = get_perf_callchain(regs, 0, kernel, user,
                                       max_stack, crosstask, true);
        if (!callchain)
                return &__empty_callchain;

        return callchain;
}

/* Return @d when any bit of @s is set in @flags, and 0 otherwise. */
static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d)
{
        return (flags & s) ? d : 0;
}

/*
 * Fill in @data with the sample fields requested by the event's
 * attr.sample_type that are not already marked in data->sample_flags.
 *
 * @data:  sample data to complete; each field handled here is marked in
 *         ->sample_flags, and variable-sized fields are added to ->dyn_size
 * @event: event being sampled
 * @regs:  registers at the time of the sample
 */
void perf_prepare_sample(struct perf_sample_data *data,
                         struct perf_event *event,
                         struct pt_regs *regs)
{
        u64 sample_type = event->attr.sample_type;
        u64 filtered_sample_type;

        /*
         * Add the sample flags that are dependent to others.  And clear the
         * sample flags that have already been done by the PMU driver.
         */
        filtered_sample_type = sample_type;
        filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE,
                                           PERF_SAMPLE_IP);
        filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE |
                                           PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
        filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER,
                                           PERF_SAMPLE_REGS_USER);
        filtered_sample_type &= ~data->sample_flags;

        /* Nothing left to fill in. */
        if (filtered_sample_type == 0) {
                /* Make sure it has the correct data->type for output */
                data->type = event->attr.sample_type;
                return;
        }

        __perf_event_header__init_id(data, event, filtered_sample_type);

        if (filtered_sample_type & PERF_SAMPLE_IP) {
                data->ip = perf_instruction_pointer(regs);
                data->sample_flags |= PERF_SAMPLE_IP;
        }

        if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
                perf_sample_save_callchain(data, event, regs);

        /* No raw data was supplied: account one u64 for the raw field. */
        if (filtered_sample_type & PERF_SAMPLE_RAW) {
                data->raw = NULL;
                data->dyn_size += sizeof(u64);
                data->sample_flags |= PERF_SAMPLE_RAW;
        }

        /* No branch stack was supplied: account one u64 for the field. */
        if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) {
                data->br_stack = NULL;
                data->dyn_size += sizeof(u64);
                data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
        }

        if (filtered_sample_type & PERF_SAMPLE_REGS_USER)
                perf_sample_regs_user(&data->regs_user, regs);

        /*
         * It cannot use the filtered_sample_type here as REGS_USER can be set
         * by STACK_USER (using __cond_set() above) and we don't want to update
         * the dyn_size if it's not requested by users.
         */
        if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) {
                /* regs dump ABI info */
                int size = sizeof(u64);

                if (data->regs_user.regs) {
                        u64 mask = event->attr.sample_regs_user;
                        size += hweight64(mask) * sizeof(u64);
                }

                data->dyn_size += size;
                data->sample_flags |= PERF_SAMPLE_REGS_USER;
        }

        if (filtered_sample_type & PERF_SAMPLE_STACK_USER) {
                /*
                 * Either we need PERF_SAMPLE_STACK_USER bit to be always
                 * processed as the last one or have additional check added
                 * in case new sample type is added, because we could eat
                 * up the rest of the sample size.
                 */
                u16 stack_size = event->attr.sample_stack_user;
                u16 header_size = perf_sample_data_size(data, event);
                u16 size = sizeof(u64);

                stack_size = perf_sample_ustack_size(stack_size, header_size,
                                                     data->regs_user.regs);

                /*
                 * If there is something to dump, add space for the dump
                 * itself and for the field that tells the dynamic size,
                 * which is how many have been actually dumped.
                 */
                if (stack_size)
                        size += sizeof(u64) + stack_size;

                data->stack_user_size = stack_size;
                data->dyn_size += size;
                data->sample_flags |= PERF_SAMPLE_STACK_USER;
        }

        /* The next four fields get default values when not yet provided. */
        if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
                data->weight.full = 0;
                data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
        }

        if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) {
                data->data_src.val = PERF_MEM_NA;
                data->sample_flags |= PERF_SAMPLE_DATA_SRC;
        }

        if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) {
                data->txn = 0;
                data->sample_flags |= PERF_SAMPLE_TRANSACTION;
        }

        if (filtered_sample_type & PERF_SAMPLE_ADDR) {
                data->addr = 0;
                data->sample_flags |= PERF_SAMPLE_ADDR;
        }

        if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) {
                /* regs dump ABI info */
                int size = sizeof(u64);

                perf_sample_regs_intr(&data->regs_intr, regs);

                if (data->regs_intr.regs) {
                        u64 mask = event->attr.sample_regs_intr;

                        size += hweight64(mask) * sizeof(u64);
                }

                data->dyn_size += size;
                data->sample_flags |= PERF_SAMPLE_REGS_INTR;
        }

        if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) {
                data->phys_addr = perf_virt_to_phys(data->addr);
                data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
        }

#ifdef CONFIG_CGROUP_PERF
        if (filtered_sample_type & PERF_SAMPLE_CGROUP) {
                struct cgroup *cgrp;

                /* protected by RCU */
                cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
                data->cgroup = cgroup_id(cgrp);
                data->sample_flags |= PERF_SAMPLE_CGROUP;
        }
#endif

        /*
         * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't
         * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr,
         * but the value will not dump to the userspace.
         */
        if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {
                data->data_page_size = perf_get_page_size(data->addr);
                data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
        }

        if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
                data->code_page_size = perf_get_page_size(data->ip);
                data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
        }

        if (filtered_sample_type & PERF_SAMPLE_AUX) {
                u64 size;
                u16 header_size = perf_sample_data_size(data, event);

                header_size += sizeof(u64); /* size */

                /*
                 * Given the 16bit nature of header::size, an AUX sample can
                 * easily overflow it, what with all the preceding sample bits.
                 * Make sure this doesn't happen by using up to U16_MAX bytes
                 * per sample in total (rounded down to 8 byte boundary).
                 */
                size = min_t(size_t, U16_MAX - header_size,
                             event->attr.aux_sample_size);
                size = rounddown(size, 8);
                size = perf_prepare_sample_aux(event, data, size);

                WARN_ON_ONCE(size + header_size > U16_MAX);
                data->dyn_size += size + sizeof(u64); /* size above */
                data->sample_flags |= PERF_SAMPLE_AUX;
        }
}

/*
 * Initialise @header for a PERF_RECORD_SAMPLE record: the type, the misc
 * flags derived from @regs, and the size computed from @data and @event.
 */
void perf_prepare_header(struct perf_event_header *header,
                         struct perf_sample_data *data,
                         struct perf_event *event,
                         struct pt_regs *regs)
{
        header->type = PERF_RECORD_SAMPLE;
        header->misc = perf_misc_flags(regs);
        header->size = perf_sample_data_size(data, event);

        /*
         * The lowest 3 bits of the size should always be zero at the
         * moment. Anyone adding more sample types will likely have to deal
         * with header::size overflowing, for instance by repurposing those
         * bits. That raises a more important question: do we really need
         * 512k sized samples, and why? Whatever is done here next needs
         * good argumentation.
         */
        WARN_ON_ONCE(header->size & 7);
}

/*
 * Prepare a sample for @event and write it out, using @output_begin to
 * reserve the space for the record.
 *
 * Returns 0 on success, or the non-zero value returned by @output_begin,
 * in which case nothing is written.
 */
static __always_inline int
__perf_event_output(struct perf_event *event,
                    struct perf_sample_data *data,
                    struct pt_regs *regs,
                    int (*output_begin)(struct perf_output_handle *,
                                        struct perf_sample_data *,
                                        struct perf_event *,
                                        unsigned int))
{
        struct perf_event_header header;
        struct perf_output_handle handle;
        int err;

        /* The RCU read section protects the callchain buffers. */
        rcu_read_lock();

        perf_prepare_sample(data, event, regs);
        perf_prepare_header(&header, data, event, regs);

        err = output_begin(&handle, data, event, header.size);
        if (!err) {
                perf_output_sample(&handle, &header, data, event);
                perf_output_end(&handle);
        }

        rcu_read_unlock();

        return err;
}

/*
 * Output a sample for @event, reserving space with
 * perf_output_begin_forward(). The result of __perf_event_output() is
 * not reported to the caller.
 */
void
perf_event_output_forward(struct perf_event *event,
                         struct perf_sample_data *data,
                         struct pt_regs *regs)
{
        __perf_event_output(event, data, regs, perf_output_begin_forward);
}

/*
 * Output a sample for @event, reserving space with
 * perf_output_begin_backward(). The result of __perf_event_output() is
 * not reported to the caller.
 */
void
perf_event_output_backward(struct perf_event *event,
                           struct perf_sample_data *data,
                           struct pt_regs *regs)
{
        __perf_event_output(event, data, regs, perf_output_begin_backward);
}

/*
 * Output a sample for @event, reserving space with perf_output_begin().
 *
 * Returns the result of __perf_event_output(): 0 on success, otherwise the
 * non-zero value from perf_output_begin().
 */
int
perf_event_output(struct perf_event *event,
                  struct perf_sample_data *data,
                  struct pt_regs *regs)
{
        return __perf_event_output(event, data, regs, perf_output_begin);
}

/*
 * read event_id
 *
 * Fixed leading part of the record written by perf_event_read_event():
 * the record header followed by the pid and tid.
 */

struct perf_read_event {
        /* Record header; perf_event_read_event() sets type PERF_RECORD_READ. */
        struct perf_event_header        header;

        /* Filled from perf_event_pid() and perf_event_tid() respectively. */
        u32                                pid;
        u32                                tid;
};

static void
perf_event_read_event(struct perf_event *event,
                        struct task_struct *task)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct perf_read_event read_event = {
                .header = {
                        .type = PERF_RECORD_READ,
                        .misc = 0,
                        .size = sizeof(read_event) + event->read_size,
                },
                .pid = perf_event_pid(event, task),
                .tid = perf_event_tid(event, task),
        };
        int ret;

        perf_event_header__init_id(&read_event.header, &sample, event);
        ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
        if (ret)
                return;

        perf_output_put(&handle, read_event);
        perf_output_read(&handle, event);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/*
 * Callback type used by the perf_iterate_*() helpers: called once per
 * selected event, with the caller's opaque @data pointer.
 */
typedef void (perf_iterate_f)(struct perf_event *event, void *data);

/*
 * Call @output(event, @data) for the events on @ctx->event_list.
 *
 * With @all set every event on the list is visited. Otherwise events whose
 * state is below PERF_EVENT_STATE_INACTIVE, or that fail
 * event_filter_match(), are skipped.
 *
 * The list is walked with list_for_each_entry_rcu().
 */
static void perf_iterate_ctx(struct perf_event_context *ctx,
                             perf_iterate_f output,
                             void *data, bool all)
{
        struct perf_event *pos;

        list_for_each_entry_rcu(pos, &ctx->event_list, event_entry) {
                if (all ||
                    (pos->state >= PERF_EVENT_STATE_INACTIVE &&
                     event_filter_match(pos)))
                        output(pos, data);
        }
}

/*
 * Call @output(event, @data) for the events on this CPU's pmu_sb_events
 * list that have a context attached, are at least
 * PERF_EVENT_STATE_INACTIVE and pass event_filter_match().
 */
static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
{
        struct pmu_event_list *sb_events = this_cpu_ptr(&pmu_sb_events);
        struct perf_event *pos;

        list_for_each_entry_rcu(pos, &sb_events->list, sb_list) {
                /*
                 * An event without ->ctx is not fully formed yet. The
                 * acquire load guarantees that once ->ctx is observed, both
                 * the event and its ctx are complete enough to be used.
                 * See perf_install_in_context().
                 */
                if (!smp_load_acquire(&pos->ctx))
                        continue;

                if (pos->state >= PERF_EVENT_STATE_INACTIVE &&
                    event_filter_match(pos))
                        output(pos, data);
        }
}

/*
 * Run @output(event, @data) on all events that need to receive side-band
 * events.
 *
 * New callers must make sure that account_pmu_sb_event() includes their
 * event, otherwise it might not get delivered.
 *
 * The whole walk runs under rcu_read_lock() with preemption disabled.
 */
static void perf_iterate_sb(perf_iterate_f output, void *data,
                            struct perf_event_context *task_ctx)
{
        rcu_read_lock();
        preempt_disable();

        if (task_ctx) {
                /*
                 * A non-NULL task_ctx means only the task context itself is
                 * notified. It is set only for EXIT events, before the task
                 * context is released.
                 */
                perf_iterate_ctx(task_ctx, output, data, false);
        } else {
                struct perf_event_context *cur_ctx;

                perf_iterate_sb_cpu(output, data);

                cur_ctx = rcu_dereference(current->perf_event_ctxp);
                if (cur_ctx)
                        perf_iterate_ctx(cur_ctx, output, data, false);
        }

        preempt_enable();
        rcu_read_unlock();
}

/*
 * Clear all file-based filters at exec, they'll have to be
 * re-instated when/if these objects are mmapped again.
 */
static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
        struct perf_addr_filter *filter;
        unsigned int restart = 0, count = 0;
        unsigned long flags;

        if (!has_addr_filter(event))
                return;

        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
                if (filter->path.dentry) {
                        event->addr_filter_ranges[count].start = 0;
                        event->addr_filter_ranges[count].size = 0;
                        restart++;
                }

                count++;
        }

        if (restart)
                event->addr_filters_gen++;
        raw_spin_unlock_irqrestore(&ifh->lock, flags);

        if (restart)
                perf_event_stop(event, 1);
}

/*
 * Exec-time processing for current's perf event context.
 *
 * Pins current's task context (returning early if there is none), runs the
 * enable-on-exec and remove-on-exec passes, clears file-based address
 * filters on every event of the context, then unpins the context and drops
 * the reference.
 */
void perf_event_exec(void)
{
        struct perf_event_context *ctx;

        ctx = perf_pin_task_context(current);
        if (!ctx)
                return;

        perf_event_enable_on_exec(ctx);
        perf_event_remove_on_exec(ctx);

        /*
         * perf_iterate_ctx() walks ctx->event_list with
         * list_for_each_entry_rcu(), so the walk has to sit inside an RCU
         * read-side critical section, the same way the other callers of
         * perf_iterate_ctx() arrange it.
         */
        rcu_read_lock();
        perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
        rcu_read_unlock();

        perf_unpin_context(ctx);
        put_ctx(ctx);
}

/*
 * Argument block passed from __perf_pmu_output_stop() to
 * __perf_event_output_stop() through perf_iterate_ctx().
 */
struct remote_output {
        /* buffer whose users should be stopped */
        struct perf_buffer        *rb;
        /* result of the last __perf_event_stop() call made for a match */
        int                        err;
};

/*
 * perf_iterate_ctx() callback: stop @event if it is an AUX event whose
 * buffer (taken from its parent when it has one) is the buffer named in the
 * struct remote_output passed as @data. The result of __perf_event_stop()
 * is stored in ->err.
 */
static void __perf_event_output_stop(struct perf_event *event, void *data)
{
        struct remote_output *req = data;
        struct perf_event *rb_owner;
        struct stop_event_data sd = {
                .event        = event,
        };

        if (!has_aux(event))
                return;

        rb_owner = event->parent ? event->parent : event;

        /*
         * With inheritance it is the parent that links to the ring-buffer,
         * while the child is the one actually using it.
         *
         * event::rb decides whether the event gets stopped, but that can
         * race with ring_buffer_attach() (through set_output), which would
         * make us skip the very event that needs stopping. For that reason
         * ring_buffer_attach() has to stop an aux event before it
         * re-assigns the rb pointer.
         */
        if (rcu_dereference(rb_owner->rb) != req->rb)
                return;

        req->err = __perf_event_stop(&sd);
}

static int __perf_pmu_output_stop(void *info)
{
        struct perf_event *event = info;
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct remote_output ro = {
                .rb        = event->rb,
        };

        rcu_read_lock();
        perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
        if (cpuctx->task_ctx)
                perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
                                   &ro, false);
        rcu_read_unlock();

        return ro.err;
}

static void perf_pmu_output_stop(struct perf_event *event)
{
        struct perf_event *iter;
        int err, cpu;

restart:
        rcu_read_lock();
        list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
                /*
                 * For per-CPU events, we need to make sure that neither they
                 * nor their children are running; for cpu==-1 events it's
                 * sufficient to stop the event itself if it's active, since
                 * it can't have children.
                 */
                cpu = iter->cpu;
                if (cpu == -1)
                        cpu = READ_ONCE(iter->oncpu);

                if (cpu == -1)
                        continue;

                err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
                if (err == -EAGAIN) {
                        rcu_read_unlock();
                        goto restart;
                }
        }
        rcu_read_unlock();
}

/*
 * task tracking -- fork/exit
 *
 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
 */

/*
 * Working state for one fork/exit notification. It is built once by
 * perf_event_task() and shared by every perf_event_task_output() call made
 * during the iteration.
 */
struct perf_task_event {
        /* task the record is about */
        struct task_struct                *task;
        /* forwarded to perf_iterate_sb(); NULL in the fork path */
        struct perf_event_context        *task_ctx;

        /*
         * The record itself; copied to the output buffer as-is, so the
         * field order here is the record layout.
         */
        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                ppid;
                u32                                tid;
                u32                                ptid;
                u64                                time;
        } event_id;
};

/*
 * Return 1 if @event requested any of the attributes that enable task
 * (fork/exit) records: comm, mmap, mmap2, mmap_data or task; 0 otherwise.
 */
static int perf_event_task_match(struct perf_event *event)
{
        if (event->attr.comm || event->attr.mmap || event->attr.mmap2)
                return 1;

        return event->attr.mmap_data || event->attr.task;
}

/*
 * perf_iterate_sb() callback: write one PERF_RECORD_FORK or PERF_RECORD_EXIT
 * record, described by @data (a struct perf_task_event), into @event's
 * output buffer, provided @event passes perf_event_task_match().
 *
 * The perf_task_event is shared by all events visited during the iteration.
 * Its header is handed to perf_event_header__init_id() for each event, so
 * the header size seen on entry is saved and written back before returning.
 */
static void perf_event_task_output(struct perf_event *event,
                                   void *data)
{
        struct perf_task_event *task_event = data;
        struct perf_output_handle handle;
        struct perf_sample_data        sample;
        struct task_struct *task = task_event->task;
        int ret, size = task_event->event_id.header.size;

        if (!perf_event_task_match(event))
                return;

        perf_event_header__init_id(&task_event->event_id.header, &sample, event);

        ret = perf_output_begin(&handle, &sample, event,
                                task_event->event_id.header.size);
        if (ret)
                goto out;

        task_event->event_id.pid = perf_event_pid(event, task);
        task_event->event_id.tid = perf_event_tid(event, task);

        if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
                /*
                 * On exit the parent fields describe the task's real
                 * parent. ptid is a thread id, so it is obtained with
                 * perf_event_tid(), mirroring the fork branch below;
                 * filling it with perf_event_pid() would merely duplicate
                 * ppid.
                 */
                task_event->event_id.ppid = perf_event_pid(event,
                                                        task->real_parent);
                task_event->event_id.ptid = perf_event_tid(event,
                                                        task->real_parent);
        } else {  /* PERF_RECORD_FORK */
                task_event->event_id.ppid = perf_event_pid(event, current);
                task_event->event_id.ptid = perf_event_tid(event, current);
        }

        task_event->event_id.time = perf_event_clock(event);

        perf_output_put(&handle, task_event->event_id);

        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
out:
        /* undo any per-event change to the shared header size */
        task_event->event_id.header.size = size;
}

static void perf_event_task(struct task_struct *task,
                              struct perf_event_context *task_ctx,
                              int new)
{
        struct perf_task_event task_event;

        if (!atomic_read(&nr_comm_events) &&
            !atomic_read(&nr_mmap_events) &&
            !atomic_read(&nr_task_events))
                return;

        task_event = (struct perf_task_event){
                .task          = task,
                .task_ctx = task_ctx,
                .event_id    = {
                        .header = {
                                .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
                                .misc = 0,
                                .size = sizeof(task_event.event_id),
                        },
                        /* .pid  */
                        /* .ppid */
                        /* .tid  */
                        /* .ptid */
                        /* .time */
                },
        };

        perf_iterate_sb(perf_event_task_output,
                       &task_event,
                       task_ctx);
}

/*
 * Fork notification for @task: emit the task record as a fork (new = 1, no
 * task context restriction), followed by the namespaces record.
 */
void perf_event_fork(struct task_struct *task)
{
        perf_event_task(task, NULL, 1);

        perf_event_namespaces(task);
}

/*
 * comm tracking
 */

/*
 * Working state for one PERF_RECORD_COMM notification, shared by every
 * perf_event_comm_output() call made during the iteration.
 */
struct perf_comm_event {
        /* task whose comm is being reported */
        struct task_struct        *task;
        /* padded copy of the name and its length; set by perf_event_comm_event() */
        char                        *comm;
        int                        comm_size;

        /* fixed part of the record; the comm bytes are written right after it */
        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                tid;
        } event_id;
};

/* Non-zero when @event asked for comm records (attr.comm). */
static int perf_event_comm_match(struct perf_event *event)
{
        int wants_comm = event->attr.comm;

        return wants_comm;
}

static void perf_event_comm_output(struct perf_event *event,
                                   void *data)
{
        struct perf_comm_event *comm_event = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int size = comm_event->event_id.header.size;
        int ret;

        if (!perf_event_comm_match(event))
                return;

        perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
        ret = perf_output_begin(&handle, &sample, event,
                                comm_event->event_id.header.size);

        if (ret)
                goto out;

        comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
        comm_event->event_id.tid = perf_event_tid(event, comm_event->task);

        perf_output_put(&handle, comm_event->event_id);
        __output_copy(&handle, comm_event->comm,
                                   comm_event->comm_size);

        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
out:
        comm_event->event_id.header.size = size;
}

/*
 * Finish @comm_event and deliver it.
 *
 * The task's comm is copied into a zero-filled local buffer, its length
 * (including the terminator) is rounded up to a multiple of sizeof(u64), and
 * the header size is set to the fixed part plus that length before the
 * record is offered to all side-band events. The local buffer is only
 * referenced for the duration of this call.
 */
static void perf_event_comm_event(struct perf_comm_event *comm_event)
{
        char name[TASK_COMM_LEN] = { 0 };
        unsigned int padded_len;

        strscpy(name, comm_event->task->comm, sizeof(name));
        padded_len = ALIGN(strlen(name) + 1, sizeof(u64));

        comm_event->comm = name;
        comm_event->comm_size = padded_len;
        comm_event->event_id.header.size =
                sizeof(comm_event->event_id) + padded_len;

        perf_iterate_sb(perf_event_comm_output, comm_event, NULL);
}

/*
 * Report @task's comm to interested events. Does nothing while no comm
 * events are accounted in nr_comm_events.
 *
 * @exec selects PERF_RECORD_MISC_COMM_EXEC in the header's misc field.
 * comm, comm_size, header.size, pid and tid start out zeroed and are filled
 * in further down the call chain.
 */
void perf_event_comm(struct task_struct *task, bool exec)
{
        struct perf_comm_event rec;

        if (!atomic_read(&nr_comm_events))
                return;

        rec = (struct perf_comm_event){
                .task = task,
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_COMM,
                                .misc = 0,
                        },
                },
        };

        if (exec)
                rec.event_id.header.misc = PERF_RECORD_MISC_COMM_EXEC;

        perf_event_comm_event(&rec);
}

/*
 * namespaces tracking
 */

/*
 * Working state for one PERF_RECORD_NAMESPACES notification, shared by every
 * perf_event_namespaces_output() call made during the iteration.
 */
struct perf_namespaces_event {
        /* task whose namespaces are being reported */
        struct task_struct                *task;

        /*
         * The record itself; copied to the output buffer as-is, so the
         * field order here is the record layout.
         */
        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                tid;
                u64                                nr_namespaces;
                struct perf_ns_link_info        link_info[NR_NAMESPACES];
        } event_id;
};

/* Non-zero when @event asked for namespaces records (attr.namespaces). */
static int perf_event_namespaces_match(struct perf_event *event)
{
        int wants_namespaces = event->attr.namespaces;

        return wants_namespaces;
}

static void perf_event_namespaces_output(struct perf_event *event,
                                         void *data)
{
        struct perf_namespaces_event *namespaces_event = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        u16 header_size = namespaces_event->event_id.header.size;
        int ret;

        if (!perf_event_namespaces_match(event))
                return;

        perf_event_header__init_id(&namespaces_event->event_id.header,
                                   &sample, event);
        ret = perf_output_begin(&handle, &sample, event,
                                namespaces_event->event_id.header.size);
        if (ret)
                goto out;

        namespaces_event->event_id.pid = perf_event_pid(event,
                                                        namespaces_event->task);
        namespaces_event->event_id.tid = perf_event_tid(event,
                                                        namespaces_event->task);

        perf_output_put(&handle, namespaces_event->event_id);

        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
out:
        namespaces_event->event_id.header.size = header_size;
}

static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
                                   struct task_struct *task,
                                   const struct proc_ns_operations *ns_ops)
{
        struct path ns_path;
        struct inode *ns_inode;
        int error;

        error = ns_get_path(&ns_path, task, ns_ops);
        if (!error) {
                ns_inode = ns_path.dentry->d_inode;
                ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
                ns_link_info->ino = ns_inode->i_ino;
                path_put(&ns_path);
        }
}

void perf_event_namespaces(struct task_struct *task)
{
        struct perf_namespaces_event namespaces_event;
        struct perf_ns_link_info *ns_link_info;

        if (!atomic_read(&nr_namespaces_events))
                return;

        namespaces_event = (struct perf_namespaces_event){
                .task        = task,
                .event_id  = {
                        .header = {
                                .type = PERF_RECORD_NAMESPACES,
                                .misc = 0,
                                .size = sizeof(namespaces_event.event_id),
                        },
                        /* .pid */
                        /* .tid */
                        .nr_namespaces = NR_NAMESPACES,
                        /* .link_info[NR_NAMESPACES] */
                },
        };

        ns_link_info = namespaces_event.event_id.link_info;

        perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
                               task, &mntns_operations);

#ifdef CONFIG_USER_NS
        perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
                               task, &userns_operations);
#endif
#ifdef CONFIG_NET_NS
        perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
                               task, &netns_operations);
#endif
#ifdef CONFIG_UTS_NS
        perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
                               task, &utsns_operations);
#endif
#ifdef CONFIG_IPC_NS
        perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
                               task, &ipcns_operations);
#endif
#ifdef CONFIG_PID_NS
        perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
                               task, &pidns_operations);
#endif
#ifdef CONFIG_CGROUPS
        perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
                               task, &cgroupns_operations);
#endif

        perf_iterate_sb(perf_event_namespaces_output,
                        &namespaces_event,
                        NULL);
}

/*
 * cgroup tracking
 */
#ifdef CONFIG_CGROUP_PERF

/*
 * Working state for one PERF_RECORD_CGROUP notification, shared by every
 * perf_event_cgroup_output() call made during the iteration.
 */
struct perf_cgroup_event {
        /* padded path string and its length; set by perf_event_cgroup() */
        char                                *path;
        int                                path_size;
        /*
         * Fixed part of the record. The path bytes are written separately
         * from ->path above; the flexible member only marks where they go.
         */
        struct {
                struct perf_event_header        header;
                u64                                id;
                char                                path[];
        } event_id;
};

/* Non-zero when @event asked for cgroup records (attr.cgroup). */
static int perf_event_cgroup_match(struct perf_event *event)
{
        int wants_cgroup = event->attr.cgroup;

        return wants_cgroup;
}

static void perf_event_cgroup_output(struct perf_event *event, void *data)
{
        struct perf_cgroup_event *cgroup_event = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        u16 header_size = cgroup_event->event_id.header.size;
        int ret;

        if (!perf_event_cgroup_match(event))
                return;

        perf_event_header__init_id(&cgroup_event->event_id.header,
                                   &sample, event);
        ret = perf_output_begin(&handle, &sample, event,
                                cgroup_event->event_id.header.size);
        if (ret)
                goto out;

        perf_output_put(&handle, cgroup_event->event_id);
        __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);

        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
out:
        cgroup_event->event_id.header.size = header_size;
}

static void perf_event_cgroup(struct cgroup *cgrp)
{
        struct perf_cgroup_event cgroup_event;
        char path_enomem[16] = "//enomem";
        char *pathname;
        size_t size;

        if (!atomic_read(&nr_cgroup_events))
                return;

        cgroup_event = (struct perf_cgroup_event){
                .event_id  = {
                        .header = {
                                .type = PERF_RECORD_CGROUP,
                                .misc = 0,
                                .size = sizeof(cgroup_event.event_id),
                        },
                        .id = cgroup_id(cgrp),
                },
        };

        pathname = kmalloc(PATH_MAX, GFP_KERNEL);
        if (pathname == NULL) {
                cgroup_event.path = path_enomem;
        } else {
                /* just to be sure to have enough space for alignment */
                cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
                cgroup_event.path = pathname;
        }

        /*
         * Since our buffer works in 8 byte units we need to align our string
         * size to a multiple of 8. However, we must guarantee the tail end is
         * zero'd out to avoid leaking random bits to userspace.
         */
        size = strlen(cgroup_event.path) + 1;
        while (!IS_ALIGNED(size, sizeof(u64)))
                cgroup_event.path[size++] = '\0';

        cgroup_event.event_id.header.size += size;
        cgroup_event.path_size = size;

        perf_iterate_sb(perf_event_cgroup_output,
                        &cgroup_event,
                        NULL);

        kfree(pathname);
}

#endif

/*
 * mmap tracking
 */

/*
 * Working state for one mmap notification. perf_event_mmap() creates it,
 * perf_event_mmap_event() fills in the derived fields, and every
 * perf_event_mmap_output() call made during the iteration shares it.
 */
struct perf_mmap_event {
        /* mapping being reported */
        struct vm_area_struct        *vma;

        /* padded name string and its length, written after the fixed fields */
        const char                *file_name;
        int                        file_size;
        /* the following six are only written for events with attr.mmap2 */
        int                        maj, min;
        u64                        ino;
        u64                        ino_generation;
        u32                        prot, flags;
        /* filled by build_id_parse(); a zero size means no build id is reported */
        u8                        build_id[BUILD_ID_SIZE_MAX];
        u32                        build_id_size;

        /*
         * Fixed part of the record; copied to the output buffer as-is, so
         * the field order here is the record layout.
         */
        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                tid;
                u64                                start;
                u64                                len;
                u64                                pgoff;
        } event_id;
};

/*
 * Decide whether @event wants the mapping described by @data (a struct
 * perf_mmap_event): executable mappings (VM_EXEC) match events with
 * attr.mmap or attr.mmap2, all other mappings match events with
 * attr.mmap_data. Returns 1 on match, 0 otherwise.
 */
static int perf_event_mmap_match(struct perf_event *event,
                                 void *data)
{
        struct perf_mmap_event *mmap_event = data;

        if (mmap_event->vma->vm_flags & VM_EXEC)
                return event->attr.mmap || event->attr.mmap2;

        return !!event->attr.mmap_data;
}

static void perf_event_mmap_output(struct perf_event *event,
                                   void *data)
{
        struct perf_mmap_event *mmap_event = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int size = mmap_event->event_id.header.size;
        u32 type = mmap_event->event_id.header.type;
        bool use_build_id;
        int ret;

        if (!perf_event_mmap_match(event, data))
                return;

        if (event->attr.mmap2) {
                mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
                mmap_event->event_id.header.size += sizeof(mmap_event->maj);
                mmap_event->event_id.header.size += sizeof(mmap_event->min);
                mmap_event->event_id.header.size += sizeof(mmap_event->ino);
                mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
                mmap_event->event_id.header.size += sizeof(mmap_event->prot);
                mmap_event->event_id.header.size += sizeof(mmap_event->flags);
        }

        perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
        ret = perf_output_begin(&handle, &sample, event,
                                mmap_event->event_id.header.size);
        if (ret)
                goto out;

        mmap_event->event_id.pid = perf_event_pid(event, current);
        mmap_event->event_id.tid = perf_event_tid(event, current);

        use_build_id = event->attr.build_id && mmap_event->build_id_size;

        if (event->attr.mmap2 && use_build_id)
                mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;

        perf_output_put(&handle, mmap_event->event_id);

        if (event->attr.mmap2) {
                if (use_build_id) {
                        u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };

                        __output_copy(&handle, size, 4);
                        __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
                } else {
                        perf_output_put(&handle, mmap_event->maj);
                        perf_output_put(&handle, mmap_event->min);
                        perf_output_put(&handle, mmap_event->ino);
                        perf_output_put(&handle, mmap_event->ino_generation);
                }
                perf_output_put(&handle, mmap_event->prot);
                perf_output_put(&handle, mmap_event->flags);
        }

        __output_copy(&handle, mmap_event->file_name,
                                   mmap_event->file_size);

        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
out:
        mmap_event->event_id.header.size = size;
        mmap_event->event_id.header.type = type;
}

/*
 * Fill in the parts of @mmap_event that are derived from its vma: the
 * prot/flags values, a printable name padded to a multiple of sizeof(u64),
 * and, for file-backed mappings, the device numbers and inode identity.
 * The finished record is then offered to every interested event through
 * perf_iterate_sb().
 */
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
{
        struct vm_area_struct *vma = mmap_event->vma;
        struct file *file = vma->vm_file;
        int maj = 0, min = 0;
        u64 ino = 0, gen = 0;
        u32 prot = 0, flags = 0;
        unsigned int size;
        char tmp[16];
        char *buf = NULL;
        char *name = NULL;

        /* translate the vma's VM_* bits into PROT_* / MAP_* values */
        if (vma->vm_flags & VM_READ)
                prot |= PROT_READ;
        if (vma->vm_flags & VM_WRITE)
                prot |= PROT_WRITE;
        if (vma->vm_flags & VM_EXEC)
                prot |= PROT_EXEC;

        if (vma->vm_flags & VM_MAYSHARE)
                flags = MAP_SHARED;
        else
                flags = MAP_PRIVATE;

        if (vma->vm_flags & VM_LOCKED)
                flags |= MAP_LOCKED;
        if (is_vm_hugetlb_page(vma))
                flags |= MAP_HUGETLB;

        if (file) {
                struct inode *inode;
                dev_t dev;

                buf = kmalloc(PATH_MAX, GFP_KERNEL);
                if (!buf) {
                        name = "//enomem";
                        goto cpy_name;
                }
                /*
                 * d_path() works from the end of the rb backwards, so we
                 * need to add enough zero bytes after the string to handle
                 * the 64bit alignment we do later.
                 */
                name = file_path(file, buf, PATH_MAX - sizeof(u64));
                if (IS_ERR(name)) {
                        name = "//toolong";
                        goto cpy_name;
                }
                inode = file_inode(vma->vm_file);
                dev = inode->i_sb->s_dev;
                ino = inode->i_ino;
                gen = inode->i_generation;
                maj = MAJOR(dev);
                min = MINOR(dev);

                /* name already points into buf, which has room for padding */
                goto got_name;
        } else {
                /* no file: try vm_ops->name(), then arch_vma_name(), then fixed labels */
                if (vma->vm_ops && vma->vm_ops->name)
                        name = (char *) vma->vm_ops->name(vma);
                if (!name)
                        name = (char *)arch_vma_name(vma);
                if (!name) {
                        if (vma_is_initial_heap(vma))
                                name = "[heap]";
                        else if (vma_is_initial_stack(vma))
                                name = "[stack]";
                        else
                                name = "//anon";
                }
        }

cpy_name:
        /*
         * Names reaching this label are not stored in buf. Copy them (bounded
         * by sizeof(tmp)) into the local array so that the padding loop below
         * writes to memory owned by this function.
         */
        strscpy(tmp, name, sizeof(tmp));
        name = tmp;
got_name:
        /*
         * Since our buffer works in 8 byte units we need to align our string
         * size to a multiple of 8. However, we must guarantee the tail end is
         * zero'd out to avoid leaking random bits to userspace.
         */
        size = strlen(name)+1;
        while (!IS_ALIGNED(size, sizeof(u64)))
                name[size++] = '\0';

        mmap_event->file_name = name;
        mmap_event->file_size = size;
        mmap_event->maj = maj;
        mmap_event->min = min;
        mmap_event->ino = ino;
        mmap_event->ino_generation = gen;
        mmap_event->prot = prot;
        mmap_event->flags = flags;

        if (!(vma->vm_flags & VM_EXEC))
                mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;

        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;

        /* only parse the build id when build-id events are accounted */
        if (atomic_read(&nr_build_id_events))
                build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);

        perf_iterate_sb(perf_event_mmap_output,
                       mmap_event,
                       NULL);

        /* buf is still NULL if the mapping had no file */
        kfree(buf);
}

/*
 * Check whether @file and the range starting at @offset with length @size
 * match @filter: the filter must have a dentry whose inode is @file's inode,
 * and the filter range [offset, offset + size] must not lie entirely before
 * or entirely after the given range.
 */
static bool perf_addr_filter_match(struct perf_addr_filter *filter,
                                     struct file *file, unsigned long offset,
                                     unsigned long size)
{
        /* d_inode(NULL) won't be equal to any mapped user-space file */
        if (!filter->path.dentry)
                return false;

        if (d_inode(filter->path.dentry) != file_inode(file))
                return false;

        return filter->offset <= offset + size &&
               filter->offset + filter->size >= offset;
}

/*
 * If @filter matches @vma's file and file range, store in @fr the address
 * range of @vma that the filter covers and return true. Return false, with
 * @fr untouched, when the filter does not match.
 */
static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
                                        struct vm_area_struct *vma,
                                        struct perf_addr_filter_range *fr)
{
        unsigned long map_len = vma->vm_end - vma->vm_start;
        unsigned long map_off = vma->vm_pgoff << PAGE_SHIFT;

        if (!perf_addr_filter_match(filter, vma->vm_file, map_off, map_len))
                return false;

        if (filter->offset >= map_off) {
                /* filter begins inside the mapping */
                fr->start = vma->vm_start + filter->offset - map_off;
                fr->size = min(vma->vm_end - fr->start, filter->size);
        } else {
                /* filter begins before the mapping: clip its head */
                fr->start = vma->vm_start;
                fr->size = min(map_len,
                               filter->size - (map_off - filter->offset));
        }

        return true;
}

static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
        struct vm_area_struct *vma = data;
        struct perf_addr_filter *filter;
        unsigned int restart = 0, count = 0;
        unsigned long flags;

        if (!has_addr_filter(event))
                return;

        if (!vma->vm_file)
                return;

        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
                if (perf_addr_filter_vma_adjust(filter, vma,
                                                &event->addr_filter_ranges[count]))
                        restart++;

                count++;
        }

        if (restart)
                event->addr_filters_gen++;
        raw_spin_unlock_irqrestore(&ifh->lock, flags);

        if (restart)
                perf_event_stop(event, 1);
}

/*
 * Adjust all task's events' filters to the new vma
 */
static void perf_addr_filters_adjust(struct vm_area_struct *vma)
{
        struct perf_event_context *ctx;

        /*
         * Data tracing isn't supported yet and as such there is no need
         * to keep track of anything that isn't related to executable code:
         */
        if (!(vma->vm_flags & VM_EXEC))
                return;

        rcu_read_lock();
        ctx = rcu_dereference(current->perf_event_ctxp);
        if (ctx)
                perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
        rcu_read_unlock();
}

void perf_event_mmap(struct vm_area_struct *vma)
{
        struct perf_mmap_event mmap_event;

        if (!atomic_read(&nr_mmap_events))
                return;

        mmap_event = (struct perf_mmap_event){
                .vma        = vma,
                /* .file_name */
                /* .file_size */
                .event_id  = {
                        .header = {
                                .type = PERF_RECORD_MMAP,
                                .misc = PERF_RECORD_MISC_USER,
                                /* .size */
                        },
                        /* .pid */
                        /* .tid */
                        .start  = vma->vm_start,
                        .len    = vma->vm_end - vma->vm_start,
                        .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
                },
                /* .maj (attr_mmap2 only) */
                /* .min (attr_mmap2 only) */
                /* .ino (attr_mmap2 only) */
                /* .ino_generation (attr_mmap2 only) */
                /* .prot (attr_mmap2 only) */
                /* .flags (attr_mmap2 only) */
        };

        perf_addr_filters_adjust(vma);
        perf_event_mmap_event(&mmap_event);
}

/*
 * Write a PERF_RECORD_AUX record into @event's output buffer.
 *
 * @head is stored as the record's offset; @size and @flags are stored
 * as-is. The record is followed by the event's id sample.
 */
void perf_event_aux_event(struct perf_event *event, unsigned long head,
                          unsigned long size, u64 flags)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct perf_aux_event {
                struct perf_event_header        header;
                u64                                offset;
                u64                                size;
                u64                                flags;
        } aux_rec = {
                .header = {
                        .type = PERF_RECORD_AUX,
                        .misc = 0,
                        .size = sizeof(aux_rec),
                },
                .offset = head,
                .size   = size,
                .flags  = flags,
        };

        perf_event_header__init_id(&aux_rec.header, &sample, event);

        /* Bail out when perf_output_begin() reports failure (non-zero). */
        if (perf_output_begin(&handle, &sample, event, aux_rec.header.size))
                return;

        perf_output_put(&handle, aux_rec);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * Lost/dropped samples logging
 */
/*
 * Write a PERF_RECORD_LOST_SAMPLES record carrying the count @lost into
 * @event's output buffer, followed by the event's id sample.
 */
void perf_log_lost_samples(struct perf_event *event, u64 lost)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct {
                struct perf_event_header        header;
                u64                                lost;
        } rec = {
                .header = {
                        .type = PERF_RECORD_LOST_SAMPLES,
                        .misc = 0,
                        .size = sizeof(rec),
                },
                .lost = lost,
        };

        perf_event_header__init_id(&rec.header, &sample, event);

        /* Bail out when perf_output_begin() reports failure (non-zero). */
        if (perf_output_begin(&handle, &sample, event, rec.header.size))
                return;

        perf_output_put(&handle, rec);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * context_switch tracking
 */

/*
 * Description of one context switch, passed as the opaque data pointer to
 * perf_event_switch_output() for every event visited.
 */
struct perf_switch_event {
        /* Task the switch is reported for. */
        struct task_struct        *task;
        /* The other task involved; source of next_prev_pid/tid below. */
        struct task_struct        *next_prev;

        /*
         * The record as written to the ring buffer. Only the header is
         * emitted for PERF_RECORD_SWITCH; the pid/tid pair is filled in and
         * emitted for PERF_RECORD_SWITCH_CPU_WIDE.
         */
        struct {
                struct perf_event_header        header;
                u32                                next_prev_pid;
                u32                                next_prev_tid;
        } event_id;
};

/* Non-zero when @event has attr.context_switch set. */
static int perf_event_switch_match(struct perf_event *event)
{
        int wants_switch = event->attr.context_switch;

        return wants_switch;
}

/*
 * Iteration callback: write the context-switch record described by @data
 * (a struct perf_switch_event) into @event's buffer. Events without
 * attr.context_switch are skipped.
 */
static void perf_event_switch_output(struct perf_event *event, void *data)
{
        struct perf_switch_event *sw = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;

        if (!perf_event_switch_match(event))
                return;

        /*
         * Only CPU-wide events (no task attached to the context) get to see
         * the next/prev pid/tid; task-bound events receive the bare header.
         */
        if (!event->ctx->task) {
                sw->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
                sw->event_id.header.size = sizeof(sw->event_id);
                sw->event_id.next_prev_pid =
                                perf_event_pid(event, sw->next_prev);
                sw->event_id.next_prev_tid =
                                perf_event_tid(event, sw->next_prev);
        } else {
                sw->event_id.header.type = PERF_RECORD_SWITCH;
                sw->event_id.header.size = sizeof(sw->event_id.header);
        }

        perf_event_header__init_id(&sw->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, sw->event_id.header.size))
                return;

        /* Emit exactly the part of the record that header.size was set for. */
        if (!event->ctx->task)
                perf_output_put(&handle, sw->event_id);
        else
                perf_output_put(&handle, sw->event_id.header);

        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * Build a context-switch description for @task / @next_prev and offer it
 * to every event via perf_iterate_sb().
 *
 * @sched_in selects the direction: switch-out records carry
 * PERF_RECORD_MISC_SWITCH_OUT, plus PERF_RECORD_MISC_SWITCH_OUT_PREEMPT
 * when task->on_rq is still set.
 */
static void perf_event_switch(struct task_struct *task,
                              struct task_struct *next_prev, bool sched_in)
{
        struct perf_switch_event switch_event;
        int misc = 0;

        /* N.B. caller checks nr_switch_events != 0 */

        if (!sched_in) {
                misc = PERF_RECORD_MISC_SWITCH_OUT;
                if (task->on_rq)
                        misc |= PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
        }

        /*
         * header.type/size and next_prev_pid/tid are filled in per event by
         * perf_event_switch_output(); they start out zeroed here.
         */
        switch_event = (struct perf_switch_event){
                .task = task,
                .next_prev = next_prev,
                .event_id = {
                        .header = {
                                .misc = misc,
                        },
                },
        };

        perf_iterate_sb(perf_event_switch_output, &switch_event, NULL);
}

/*
 * IRQ throttle logging
 */

/*
 * Write a throttle state change for @event into its output buffer:
 * PERF_RECORD_UNTHROTTLE when @enable is non-zero, PERF_RECORD_THROTTLE
 * otherwise. The record carries the event clock, the primary event id and
 * the event's own id (as stream_id).
 */
static void perf_log_throttle(struct perf_event *event, int enable)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct {
                struct perf_event_header        header;
                u64                                time;
                u64                                id;
                u64                                stream_id;
        } rec = {
                .header = {
                        .type = enable ? PERF_RECORD_UNTHROTTLE
                                       : PERF_RECORD_THROTTLE,
                        .misc = 0,
                        .size = sizeof(rec),
                },
                .time      = perf_event_clock(event),
                .id        = primary_event_id(event),
                .stream_id = event->id,
        };

        perf_event_header__init_id(&rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, rec.header.size))
                return;

        perf_output_put(&handle, rec);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * ksymbol register/unregister tracking
 */

/*
 * Description of one kernel-symbol (un)registration, passed as the opaque
 * data pointer to perf_event_ksymbol_output().
 */
struct perf_ksymbol_event {
        /* Symbol name, NUL-padded by perf_event_ksymbol() to a u64 multiple. */
        const char        *name;
        /* Number of bytes of @name to emit, including the padding. */
        int                name_len;
        /* Fixed part of the record; the name bytes follow it in the buffer. */
        struct {
                struct perf_event_header        header;
                u64                                addr;
                u32                                len;
                u16                                ksym_type;
                u16                                flags;
        } event_id;
};

/* Non-zero when @event has attr.ksymbol set. */
static int perf_event_ksymbol_match(struct perf_event *event)
{
        int wants_ksymbol = event->attr.ksymbol;

        return wants_ksymbol;
}

/*
 * Iteration callback: write the ksymbol record described by @data (a
 * struct perf_ksymbol_event) into @event's buffer: fixed part, then the
 * padded name, then the id sample. Events without attr.ksymbol are skipped.
 */
static void perf_event_ksymbol_output(struct perf_event *event, void *data)
{
        struct perf_ksymbol_event *ks = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;

        if (!perf_event_ksymbol_match(event))
                return;

        perf_event_header__init_id(&ks->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              ks->event_id.header.size))
                return;

        perf_output_put(&handle, ks->event_id);
        __output_copy(&handle, ks->name, ks->name_len);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * Announce the registration (or, with @unregister, removal) of a kernel
 * symbol to all events that asked for ksymbol records.
 *
 * @ksym_type: PERF_RECORD_KSYMBOL_TYPE_* value; UNKNOWN and anything at or
 *             above _MAX triggers a one-time warning and nothing is emitted
 * @addr:      symbol address
 * @len:       symbol length
 * @sym:       symbol name; copied with strscpy() into a KSYM_NAME_LEN buffer
 *
 * Returns early when nr_ksymbol_events reads as zero.
 */
void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
                        const char *sym)
{
        struct perf_ksymbol_event ev;
        char name[KSYM_NAME_LEN];
        u16 flags = 0;
        int name_len;

        if (!atomic_read(&nr_ksymbol_events))
                return;

        if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
            ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) {
                WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
                return;
        }

        /*
         * The buffer size is a multiple of sizeof(u64), so NUL-padding the
         * name up to the next u64 boundary cannot run past its end.
         */
        BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));

        strscpy(name, sym, KSYM_NAME_LEN);
        for (name_len = strlen(name) + 1;
             !IS_ALIGNED(name_len, sizeof(u64));
             name_len++)
                name[name_len] = '\0';

        if (unregister)
                flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;

        ev = (struct perf_ksymbol_event){
                .name = name,
                .name_len = name_len,
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_KSYMBOL,
                                .size = sizeof(ev.event_id) + name_len,
                        },
                        .addr = addr,
                        .len = len,
                        .ksym_type = ksym_type,
                        .flags = flags,
                },
        };

        perf_iterate_sb(perf_event_ksymbol_output, &ev, NULL);
}

/*
 * bpf program load/unload tracking
 */

/*
 * Description of one BPF program load/unload, passed as the opaque data
 * pointer to perf_event_bpf_output().
 */
struct perf_bpf_event {
        /* Program the event refers to. */
        struct bpf_prog        *prog;
        /* The record as written to the ring buffer. */
        struct {
                struct perf_event_header        header;
                u16                                type;
                u16                                flags;
                u32                                id;
                /* Copied from prog->tag by perf_event_bpf_event(). */
                u8                                tag[BPF_TAG_SIZE];
        } event_id;
};

/* Non-zero when @event has attr.bpf_event set. */
static int perf_event_bpf_match(struct perf_event *event)
{
        int wants_bpf = event->attr.bpf_event;

        return wants_bpf;
}

/*
 * Iteration callback: write the BPF record described by @data (a struct
 * perf_bpf_event) into @event's buffer, followed by the id sample. Events
 * without attr.bpf_event are skipped.
 */
static void perf_event_bpf_output(struct perf_event *event, void *data)
{
        struct perf_bpf_event *bev = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;

        if (!perf_event_bpf_match(event))
                return;

        perf_event_header__init_id(&bev->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              bev->event_id.header.size))
                return;

        perf_output_put(&handle, bev->event_id);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
                                         enum perf_bpf_event_type type)
{
        bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
        int i;

        if (prog->aux->func_cnt == 0) {
                perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
                                   (u64)(unsigned long)prog->bpf_func,
                                   prog->jited_len, unregister,
                                   prog->aux->ksym.name);
        } else {
                for (i = 0; i < prog->aux->func_cnt; i++) {
                        struct bpf_prog *subprog = prog->aux->func[i];

                        perf_event_ksymbol(
                                PERF_RECORD_KSYMBOL_TYPE_BPF,
                                (u64)(unsigned long)subprog->bpf_func,
                                subprog->jited_len, unregister,
                                subprog->aux->ksym.name);
                }
        }
}

/*
 * Report a BPF program load or unload.
 *
 * Any other @type is ignored. For load/unload, ksymbol records are emitted
 * first if nr_ksymbol_events is non-zero; a PERF_RECORD_BPF_EVENT record
 * (type, @flags, program id and tag) is then offered to all events if
 * nr_bpf_events is non-zero.
 */
void perf_event_bpf_event(struct bpf_prog *prog,
                          enum perf_bpf_event_type type,
                          u16 flags)
{
        struct perf_bpf_event bpf_event;

        if (type != PERF_BPF_EVENT_PROG_LOAD &&
            type != PERF_BPF_EVENT_PROG_UNLOAD)
                return;

        if (atomic_read(&nr_ksymbol_events))
                perf_event_bpf_emit_ksymbols(prog, type);

        if (!atomic_read(&nr_bpf_events))
                return;

        /* The tag is emitted inline, so it must keep the record u64-sized. */
        BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));

        bpf_event = (struct perf_bpf_event){
                .prog = prog,
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_BPF_EVENT,
                                .size = sizeof(bpf_event.event_id),
                        },
                        .type = type,
                        .flags = flags,
                        .id = prog->aux->id,
                },
        };
        memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);

        perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
}

/*
 * Description of one kernel text modification, passed as the opaque data
 * pointer to perf_event_text_poke_output().
 */
struct perf_text_poke_event {
        /* Bytes before and after the modification; copied into the record. */
        const void                *old_bytes;
        const void                *new_bytes;
        /* Number of zero bytes appended after the payload (0 if none). */
        size_t                        pad;
        /* Lengths of old_bytes / new_bytes; emitted ahead of the bytes. */
        u16                        old_len;
        u16                        new_len;

        /* Fixed leading part of the record. */
        struct {
                struct perf_event_header        header;

                u64                                addr;
        } event_id;
};

/* Non-zero when @event has attr.text_poke set. */
static int perf_event_text_poke_match(struct perf_event *event)
{
        int wants_text_poke = event->attr.text_poke;

        return wants_text_poke;
}

/*
 * Iteration callback: write the text-poke record described by @data (a
 * struct perf_text_poke_event) into @event's buffer.
 *
 * Layout written: event_id, old_len, new_len, old bytes, new bytes, then
 * ->pad zero bytes, then the id sample. Events without attr.text_poke are
 * skipped.
 */
static void perf_event_text_poke_output(struct perf_event *event, void *data)
{
        struct perf_text_poke_event *poke = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        u64 zeroes = 0;

        if (!perf_event_text_poke_match(event))
                return;

        perf_event_header__init_id(&poke->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              poke->event_id.header.size))
                return;

        perf_output_put(&handle, poke->event_id);
        perf_output_put(&handle, poke->old_len);
        perf_output_put(&handle, poke->new_len);

        __output_copy(&handle, poke->old_bytes, poke->old_len);
        __output_copy(&handle, poke->new_bytes, poke->new_len);

        /* Padding is sourced from a zeroed u64. */
        if (poke->pad)
                __output_copy(&handle, &zeroes, poke->pad);

        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * Report a modification of kernel text at @addr, from @old_bytes/@old_len
 * to @new_bytes/@new_len, to all events that asked for text-poke records.
 *
 * Returns early when nr_text_poke_events reads as zero.
 *
 * NOTE(review): the lengths are stored in u16 fields while the size
 * computation uses the size_t arguments; callers presumably pass small
 * lengths -- confirm.
 */
void perf_event_text_poke(const void *addr, const void *old_bytes,
                          size_t old_len, const void *new_bytes, size_t new_len)
{
        struct perf_text_poke_event ev;
        size_t tot, pad;

        if (!atomic_read(&nr_text_poke_events))
                return;

        /*
         * Variable part of the record: both length fields plus both byte
         * ranges, rounded up to a multiple of sizeof(u64) by @pad.
         */
        tot = sizeof(ev.old_len) + old_len +
              sizeof(ev.new_len) + new_len;
        pad = ALIGN(tot, sizeof(u64)) - tot;

        ev = (struct perf_text_poke_event){
                .old_bytes = old_bytes,
                .new_bytes = new_bytes,
                .pad       = pad,
                .old_len   = old_len,
                .new_len   = new_len,
                .event_id  = {
                        .header = {
                                .type = PERF_RECORD_TEXT_POKE,
                                .misc = PERF_RECORD_MISC_KERNEL,
                                .size = sizeof(ev.event_id) + tot + pad,
                        },
                        .addr = (unsigned long)addr,
                },
        };

        perf_iterate_sb(perf_event_text_poke_output, &ev, NULL);
}

/*
 * Set PERF_ATTACH_ITRACE in @event's attach_state; perf_log_itrace_start()
 * emits nothing for an event that already has this bit.
 */
void perf_event_itrace_started(struct perf_event *event)
{
        event->attach_state = event->attach_state | PERF_ATTACH_ITRACE;
}

/*
 * Write a PERF_RECORD_ITRACE_START record (pid/tid of current) into the
 * output buffer of @event, or of its parent when it has one.
 *
 * Nothing is written unless the PMU advertises PERF_PMU_CAP_ITRACE, nor
 * when the event already carries PERF_ATTACH_ITRACE.
 */
static void perf_log_itrace_start(struct perf_event *event)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct perf_aux_event {
                struct perf_event_header        header;
                u32                                pid;
                u32                                tid;
        } rec;

        if (event->parent)
                event = event->parent;

        if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE))
                return;
        if (event->attach_state & PERF_ATTACH_ITRACE)
                return;

        rec = (struct perf_aux_event){
                .header = {
                        .type = PERF_RECORD_ITRACE_START,
                        .misc = 0,
                        .size = sizeof(rec),
                },
                .pid = perf_event_pid(event, current),
                .tid = perf_event_tid(event, current),
        };

        perf_event_header__init_id(&rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, rec.header.size))
                return;

        perf_output_put(&handle, rec);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * Write a PERF_RECORD_AUX_OUTPUT_HW_ID record carrying @hw_id into the
 * output buffer of @event, or of its parent when it has one.
 */
void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct perf_aux_event {
                struct perf_event_header        header;
                u64                                hw_id;
        } rec;

        if (event->parent)
                event = event->parent;

        rec = (struct perf_aux_event){
                .header = {
                        .type = PERF_RECORD_AUX_OUTPUT_HW_ID,
                        .misc = 0,
                        .size = sizeof(rec),
                },
                .hw_id = hw_id,
        };

        perf_event_header__init_id(&rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, rec.header.size))
                return;

        perf_output_put(&handle, rec);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}
EXPORT_SYMBOL_GPL(perf_report_aux_output_id);

/*
 * Account one interrupt against @event and, if requested, throttle it.
 *
 * @throttle: when non-zero, exceeding max_samples_per_tick interrupts
 *            within the current perf_throttled_seq value throttles the event.
 *
 * Returns 1 if this call throttled the event, 0 otherwise. For frequency
 * based events (attr.freq) the sampling period is also re-adjusted.
 */
static int
__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
        struct hw_perf_event *hwc = &event->hw;
        int ret = 0;
        u64 seq;

        /*
         * The per-CPU sequence number delimits accounting windows: a value
         * different from the one last seen restarts the count at 1.
         */
        seq = __this_cpu_read(perf_throttled_seq);
        if (seq != hwc->interrupts_seq) {
                hwc->interrupts_seq = seq;
                hwc->interrupts = 1;
        } else {
                hwc->interrupts++;
                if (unlikely(throttle &&
                             hwc->interrupts > max_samples_per_tick)) {
                        /*
                         * Over budget: mark the event throttled
                         * (MAX_INTERRUPTS), set the perf tick dependency on
                         * this CPU and log a THROTTLE record.
                         */
                        __this_cpu_inc(perf_throttled_count);
                        tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
                        hwc->interrupts = MAX_INTERRUPTS;
                        perf_log_throttle(event, 0);
                        ret = 1;
                }
        }

        if (event->attr.freq) {
                u64 now = perf_clock();
                s64 delta = now - hwc->freq_time_stamp;

                hwc->freq_time_stamp = now;

                /* Only adjust on a positive interval shorter than two ticks. */
                if (delta > 0 && delta < 2*TICK_NSEC)
                        perf_adjust_period(event, delta, hwc->last_period, true);
        }

        return ret;
}

/*
 * Account one interrupt against @event with throttling enabled.
 * Returns what __perf_event_account_interrupt() returns.
 */
int perf_event_account_interrupt(struct perf_event *event)
{
        const int throttle = 1;

        return __perf_event_account_interrupt(event, throttle);
}

/*
 * Tell whether a sample taken with @regs may be attributed to @event.
 *
 * Because of interrupt latency (a.k.a. "skid") the overflow may only be
 * taken after entering the kernel, even when the PMU counts user events
 * only. Such a sample is rejected for events with attr.exclude_kernel.
 */
static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
{
        return !event->attr.exclude_kernel || user_mode(regs);
}

#ifdef CONFIG_BPF_SYSCALL
/*
 * Run the BPF program attached to @event (if any) for this overflow.
 *
 * Returns the program's return value, or 0 when no program was run (none
 * attached, or bpf_prog_active was already raised on this CPU). The caller
 * in __perf_event_overflow() skips the rest of overflow handling on 0.
 */
static int bpf_overflow_handler(struct perf_event *event,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
{
        struct bpf_perf_event_data_kern ctx = {
                .data = data,
                .event = event,
        };
        struct bpf_prog *prog;
        int ret = 0;

        ctx.regs = perf_arch_bpf_user_pt_regs(regs);
        /*
         * Per-CPU guard: only proceed when this is the sole active user.
         * The counter is incremented unconditionally, hence the matching
         * decrement at "out" also on the bail-out path.
         */
        if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
                goto out;
        rcu_read_lock();
        /* event->prog is sampled once, under RCU. */
        prog = READ_ONCE(event->prog);
        if (prog) {
                perf_prepare_sample(data, event, regs);
                ret = bpf_prog_run(prog, &ctx);
        }
        rcu_read_unlock();
out:
        __this_cpu_dec(bpf_prog_active);

        return ret;
}

/*
 * Attach @prog (with @bpf_cookie) to @event as its overflow BPF program.
 *
 * Returns 0 on success, or:
 *   -EINVAL  the event has an overflow_handler_context, or @prog is not of
 *            type BPF_PROG_TYPE_PERF_EVENT
 *   -EEXIST  the event already has a program
 *   -EPROTO  precise_ip event without a full callchain, but @prog calls
 *            bpf_get_[stack|stackid]
 */
static inline int perf_event_set_bpf_handler(struct perf_event *event,
                                             struct bpf_prog *prog,
                                             u64 bpf_cookie)
{
        /* A handler context means hw breakpoint or kernel counter. */
        if (event->overflow_handler_context)
                return -EINVAL;

        if (event->prog)
                return -EEXIST;

        if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
                return -EINVAL;

        if (event->attr.precise_ip && prog->call_get_stack) {
                /*
                 * On a perf_event with precise_ip, calling bpf_get_stack()
                 * may trigger unwinder warnings and occasional crashes.
                 * bpf_get_[stack|stackid] work around this by using the
                 * callchain attached to perf_sample_data. If the event does
                 * not have the full (kernel and user) callchain attached,
                 * refuse programs that call bpf_get_[stack|stackid].
                 */
                int full_callchain =
                        (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
                        !event->attr.exclude_callchain_kernel &&
                        !event->attr.exclude_callchain_user;

                if (!full_callchain)
                        return -EPROTO;
        }

        event->prog = prog;
        event->bpf_cookie = bpf_cookie;
        return 0;
}

/*
 * Detach the BPF program from @event, if one is attached, and drop the
 * reference with bpf_prog_put(). event->prog is cleared before the put.
 */
static inline void perf_event_free_bpf_handler(struct perf_event *event)
{
        struct bpf_prog *old_prog = event->prog;

        if (old_prog) {
                event->prog = NULL;
                bpf_prog_put(old_prog);
        }
}
#else
/*
 * !CONFIG_BPF_SYSCALL stub: always returns 1, i.e. never asks
 * __perf_event_overflow() to skip the rest of overflow handling.
 */
static inline int bpf_overflow_handler(struct perf_event *event,
                                       struct perf_sample_data *data,
                                       struct pt_regs *regs)
{
        return 1;
}

/* !CONFIG_BPF_SYSCALL stub: attaching a program always fails with -EOPNOTSUPP. */
static inline int perf_event_set_bpf_handler(struct perf_event *event,
                                             struct bpf_prog *prog,
                                             u64 bpf_cookie)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_BPF_SYSCALL stub: intentionally does nothing. */
static inline void perf_event_free_bpf_handler(struct perf_event *event)
{
}
#endif

/*
 * Generic event overflow handling, sampling.
 */

/*
 * Handle one overflow of @event.
 *
 * @throttle: passed on to __perf_event_account_interrupt()
 * @data:     sample data for this overflow
 * @regs:     register state at the overflow
 *
 * Returns non-zero when the event was throttled by the interrupt accounting
 * or was disabled because its event_limit ran out; 0 otherwise, including
 * for non-sampling events, which are ignored.
 */
static int __perf_event_overflow(struct perf_event *event,
                                 int throttle, struct perf_sample_data *data,
                                 struct pt_regs *regs)
{
        int events = atomic_read(&event->event_limit);
        int ret = 0;

        /*
         * Non-sampling counters might still use the PMI to fold short
         * hardware counters, ignore those.
         */
        if (unlikely(!is_sampling_event(event)))
                return 0;

        ret = __perf_event_account_interrupt(event, throttle);

        /* An attached BPF program returning 0 ends handling right here. */
        if (event->prog && !bpf_overflow_handler(event, data, regs))
                return ret;

        /*
         * XXX event_limit might not quite work as expected on inherited
         * events
         */

        event->pending_kill = POLL_IN;
        /* Limit was armed (non-zero) and this overflow used up the last slot. */
        if (events && atomic_dec_and_test(&event->event_limit)) {
                ret = 1;
                event->pending_kill = POLL_HUP;
                perf_event_disable_inatomic(event);
        }

        if (event->attr.sigtrap) {
                /*
                 * The desired behaviour of sigtrap vs invalid samples is a bit
                 * tricky; on the one hand, one should not lose the SIGTRAP if
                 * it is the first event, on the other hand, we should also not
                 * trigger the WARN or override the data address.
                 */
                bool valid_sample = sample_is_allowed(event, regs);
                unsigned int pending_id = 1;

                /* "?: 1" keeps the id non-zero, as zero means nothing pending. */
                if (regs)
                        pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
                if (!event->pending_sigtrap) {
                        event->pending_sigtrap = pending_id;
                        local_inc(&event->ctx->nr_pending);
                } else if (event->attr.exclude_kernel && valid_sample) {
                        /*
                         * Should not be able to return to user space without
                         * consuming pending_sigtrap; with exceptions:
                         *
                         *  1. Where !exclude_kernel, events can overflow again
                         *     in the kernel without returning to user space.
                         *
                         *  2. Events that can overflow again before the IRQ-
                         *     work without user space progress (e.g. hrtimer).
                         *     To approximate progress (with false negatives),
                         *     check 32-bit hash of the current IP.
                         */
                        WARN_ON_ONCE(event->pending_sigtrap != pending_id);
                }

                /* The address is only recorded for samples that are valid. */
                event->pending_addr = 0;
                if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
                        event->pending_addr = data->addr;
                irq_work_queue(&event->pending_irq);
        }

        READ_ONCE(event->overflow_handler)(event, data, regs);

        /* Queue a wakeup when fasync is set up and a kill is pending. */
        if (*perf_event_fasync(event) && event->pending_kill) {
                event->pending_wakeup = 1;
                irq_work_queue(&event->pending_irq);
        }

        return ret;
}

/*
 * Handle an overflow of @event with throttling enabled.
 * Returns what __perf_event_overflow() returns.
 */
int perf_event_overflow(struct perf_event *event,
                        struct perf_sample_data *data,
                        struct pt_regs *regs)
{
        const int throttle = 1;

        return __perf_event_overflow(event, throttle, data, regs);
}

/*
 * Generic software event infrastructure
 */

/*
 * Per-CPU state for software events: the hash list used to look events up
 * by (type, event_id), plus the recursion counters.
 */
struct swevent_htable {
        /* RCU-protected: read with rcu_dereference() on the trigger path. */
        struct swevent_hlist                *swevent_hlist;
        /* NOTE(review): presumably serializes hlist allocation/release -- confirm. */
        struct mutex                        hlist_mutex;
        int                                hlist_refcount;

        /* Recursion avoidance, one counter per context */
        int                                recursion[PERF_NR_CONTEXTS];
};

/* One table per CPU. */
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);

/*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. This period_left value
 * is kept in the range [-sample_period, 0] so that we can use the
 * sign as trigger.
 *
 * Returns the number of whole periods that have elapsed, or 0 when
 * period_left is still negative (no overflow yet). As a side effect,
 * hw.last_period is refreshed from hw.sample_period and period_left is
 * pulled back by the elapsed periods.
 */

u64 perf_swevent_set_period(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        /* The arithmetic below uses the period in effect before the refresh. */
        u64 period = hwc->last_period;
        u64 nr, offset;
        s64 old, val;

        hwc->last_period = hwc->sample_period;

        old = local64_read(&hwc->period_left);
        do {
                val = old;
                if (val < 0)
                        return 0;

                /* Periods covered by val, counting the one just completed. */
                nr = div64_u64(period + val, period);
                offset = nr * period;
                val -= offset;
                /* Retry with the refreshed 'old' if period_left changed meanwhile. */
        } while (!local64_try_cmpxchg(&hwc->period_left, &old, val));

        return nr;
}

/*
 * Deliver @overflow overflows of the software event @event. When @overflow
 * is 0 the count is taken from perf_swevent_set_period().
 *
 * Nothing is delivered while the event is throttled
 * (hw.interrupts == MAX_INTERRUPTS). The first delivery runs with
 * throttling disabled, every further one with it enabled; delivery stops
 * as soon as __perf_event_overflow() returns non-zero.
 */
static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
                                    struct perf_sample_data *data,
                                    struct pt_regs *regs)
{
        struct hw_perf_event *hwc = &event->hw;
        int throttle = 0;

        if (overflow == 0)
                overflow = perf_swevent_set_period(event);

        if (hwc->interrupts == MAX_INTERRUPTS)
                return;

        while (overflow) {
                /*
                 * We inhibit the overflow from happening when
                 * hwc->interrupts == MAX_INTERRUPTS.
                 */
                if (__perf_event_overflow(event, throttle, data, regs))
                        break;

                throttle = 1;
                overflow--;
        }
}

/*
 * Account @nr occurrences of a software event to @event and, for sampling
 * events with @regs available, trigger overflow handling when due.
 *
 * event->count is always bumped; everything else is skipped when @regs is
 * NULL or the event is not a sampling event.
 */
static void perf_swevent_event(struct perf_event *event, u64 nr,
                               struct perf_sample_data *data,
                               struct pt_regs *regs)
{
        struct hw_perf_event *hwc = &event->hw;

        local64_add(nr, &event->count);

        if (!regs || !is_sampling_event(event))
                return;

        /* Fixed period with PERF_SAMPLE_PERIOD: report @nr as the period. */
        if (!event->attr.freq &&
            (event->attr.sample_type & PERF_SAMPLE_PERIOD)) {
                data->period = nr;
                perf_swevent_overflow(event, 1, data, regs);
                return;
        }

        data->period = event->hw.last_period;

        /* Period of one, single occurrence: every occurrence overflows. */
        if (!event->attr.freq && hwc->sample_period == 1 && nr == 1) {
                perf_swevent_overflow(event, 1, data, regs);
                return;
        }

        /* period_left still negative after adding @nr: not due yet. */
        if (local64_add_negative(nr, &hwc->period_left))
                return;

        perf_swevent_overflow(event, 0, data, regs);
}

/*
 * Decide whether an occurrence with register state @regs must be ignored
 * for @event.
 *
 * Returns 1 when the event is stopped (PERF_HES_STOPPED), or when @regs is
 * given and the mode it was captured in (user/kernel) is excluded by the
 * event's attributes; 0 otherwise.
 */
static int perf_exclude_event(struct perf_event *event,
                              struct pt_regs *regs)
{
        if (event->hw.state & PERF_HES_STOPPED)
                return 1;

        /* Without register state there is no mode to filter on. */
        if (!regs)
                return 0;

        if (event->attr.exclude_user && user_mode(regs))
                return 1;

        if (event->attr.exclude_kernel && !user_mode(regs))
                return 1;

        return 0;
}

/*
 * Returns 1 when @event is a software event of the given @type and
 * @event_id and is not excluded for @regs, 0 otherwise. @data is unused.
 */
static int perf_swevent_match(struct perf_event *event,
                                enum perf_type_id type,
                                u32 event_id,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
{
        if (event->attr.type != type || event->attr.config != event_id)
                return 0;

        return !perf_exclude_event(event, regs);
}

/*
 * Hash a (type, event_id) pair down to SWEVENT_HLIST_BITS bits. The key
 * places @type in the upper and @event_id in the lower 32 bits.
 */
static inline u64 swevent_hash(u64 type, u32 event_id)
{
        u64 key = (type << 32) | event_id;

        return hash_64(key, SWEVENT_HLIST_BITS);
}

/* Return the bucket of @hlist that (type, event_id) hashes to. */
static inline struct hlist_head *
__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
{
        return &hlist->heads[swevent_hash(type, event_id)];
}

/*
 * Read side, used when events trigger: look up the bucket for
 * (type, event_id) via rcu_dereference(). Returns NULL when this table
 * currently has no hlist.
 */
static inline struct hlist_head *
find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
{
        struct swevent_hlist *hlist = rcu_dereference(swhash->swevent_hlist);

        return hlist ? __find_swevent_head(hlist, type, event_id) : NULL;
}

/*
 * Update side, used when inserting an event into or removing it from the
 * hlist: look up the bucket matching @event's attr.type / attr.config.
 * Returns NULL when this table currently has no hlist.
 */
static inline struct hlist_head *
find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
{
        /* attr.config is narrowed to the 32-bit event id used for hashing. */
        u32 config_id = event->attr.config;
        u64 attr_type = event->attr.type;
        struct swevent_hlist *hlist;

        /*
         * Event scheduling is always serialized against hlist allocation
         * and release, which the context lock guarantees. That makes the
         * _protected dereference the right one here.
         */
        hlist = rcu_dereference_protected(swhash->swevent_hlist,
                                          lockdep_is_held(&event->ctx->lock));

        return hlist ? __find_swevent_head(hlist, attr_type, config_id) : NULL;
}

/*
 * Feed one software event occurrence to every matching perf event on this
 * CPU.
 *
 * Looks up the bucket for (@type, @event_id) in this CPU's hash table under
 * rcu_read_lock() and calls perf_swevent_event() with @nr, @data and @regs
 * for each entry accepted by perf_swevent_match(). Does nothing when no
 * hash list is installed.
 */
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
                                    u64 nr,
                                    struct perf_sample_data *data,
                                    struct pt_regs *regs)
{
        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
        struct hlist_head *bucket;
        struct perf_event *pos;

        rcu_read_lock();
        bucket = find_swevent_head_rcu(swhash, type, event_id);
        if (bucket) {
                hlist_for_each_entry_rcu(pos, bucket, hlist_entry) {
                        if (!perf_swevent_match(pos, type, event_id, data, regs))
                                continue;

                        perf_swevent_event(pos, nr, data, regs);
                }
        }
        rcu_read_unlock();
}

/*
 * Per-CPU array of four struct pt_regs.
 * NOTE(review): presumably one slot per recursion context level; confirm
 * against the users of __perf_regs.
 */
DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);

/*
 * Acquire a recursion context from this CPU's software event table.
 *
 * Returns whatever get_recursion_context() returns; callers in this file
 * treat a negative value as "could not acquire, skip the event".
 */
int perf_swevent_get_recursion_context(void)
{
        return get_recursion_context(this_cpu_ptr(&swevent_htable)->recursion);
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);

/*
 * Release recursion context @rctx on this CPU's software event table by
 * handing it to put_recursion_context().
 */
void perf_swevent_put_recursion_context(int rctx)
{
        put_recursion_context(this_cpu_ptr(&swevent_htable)->recursion, rctx);
}

/*
 * Report @nr occurrences of software event @event_id.
 *
 * Initializes an on-stack perf_sample_data with @addr and dispatches it as
 * a PERF_TYPE_SOFTWARE event. A NULL @regs triggers a one-time warning and
 * the event is dropped. This function does not take a recursion context
 * itself; __perf_sw_event() does that around the call.
 */
void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
        struct perf_sample_data data;

        if (!WARN_ON_ONCE(!regs)) {
                perf_sample_data_init(&data, addr, 0);
                do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
        }
}

/*
 * Report a software event with preemption disabled and a recursion context
 * held.
 *
 * If no recursion context can be acquired (negative return), the event is
 * silently dropped. Preemption is re-enabled on every path.
 */
void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
        int rctx;

        preempt_disable_notrace();

        rctx = perf_swevent_get_recursion_context();
        if (!unlikely(rctx < 0)) {
                ___perf_sw_event(event_id, nr, regs, addr);
                perf_swevent_put_recursion_context(rctx);
        }

        preempt_enable_notrace();
}

/*
 * ->read() callback used by the perf_swevent, perf_tracepoint, perf_kprobe
 * and perf_uprobe PMUs in this file. The body is empty: nothing is done
 * for @event here.
 */
static void perf_swevent_read(struct perf_event *event)
{
}

/*
 * ->add() callback: link @event into this CPU's software event hash table.
 *
 * For sampling events the period is (re)armed first. hw.state is set to 0
 * when PERF_EF_START is present in @flags and to 1 otherwise.
 *
 * Returns 0 on success, or -EINVAL (with a one-time warning) when no hash
 * list is installed on this CPU.
 */
static int perf_swevent_add(struct perf_event *event, int flags)
{
        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
        struct hw_perf_event *hwc = &event->hw;
        struct hlist_head *bucket;

        if (is_sampling_event(event)) {
                hwc->last_period = hwc->sample_period;
                perf_swevent_set_period(event);
        }

        hwc->state = (flags & PERF_EF_START) ? 0 : 1;

        bucket = find_swevent_head(swhash, event);
        if (WARN_ON_ONCE(!bucket))
                return -EINVAL;

        hlist_add_head_rcu(&event->hlist_entry, bucket);
        perf_event_update_userpage(event);

        return 0;
}

/*
 * ->del() callback: unlink @event from its hash bucket with
 * hlist_del_rcu(). @flags is not used.
 */
static void perf_swevent_del(struct perf_event *event, int flags)
{
        hlist_del_rcu(&event->hlist_entry);
}

/*
 * ->start() callback: clear every bit in the event's hw.state.
 * @flags is not used.
 */
static void perf_swevent_start(struct perf_event *event, int flags)
{
        event->hw.state = 0;
}

/*
 * ->stop() callback: set the event's hw.state to PERF_HES_STOPPED
 * (overwriting any other state bits). @flags is not used.
 */
static void perf_swevent_stop(struct perf_event *event, int flags)
{
        event->hw.state = PERF_HES_STOPPED;
}

/*
 * Update-side dereference of the hash list pointer.
 * The caller must hold swhash->hlist_mutex (checked via lockdep).
 */
static inline struct swevent_hlist *
swevent_hlist_deref(struct swevent_htable *swhash)
{
        struct swevent_hlist *hlist;

        hlist = rcu_dereference_protected(swhash->swevent_hlist,
                                          lockdep_is_held(&swhash->hlist_mutex));

        return hlist;
}

/*
 * Detach the hash list from @swhash and free it via kfree_rcu().
 * No-op when no hash list is installed. Uses swevent_hlist_deref(), so the
 * caller must hold swhash->hlist_mutex.
 */
static void swevent_hlist_release(struct swevent_htable *swhash)
{
        struct swevent_hlist *old = swevent_hlist_deref(swhash);

        if (old) {
                /* Unpublish first, then free the old table via kfree_rcu(). */
                RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
                kfree_rcu(old, rcu_head);
        }
}

/*
 * Drop one reference on @cpu's hash list; release the list when the
 * reference count reaches zero. Serialized by the per-CPU hlist_mutex.
 */
static void swevent_hlist_put_cpu(int cpu)
{
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

        mutex_lock(&swhash->hlist_mutex);

        swhash->hlist_refcount--;
        if (swhash->hlist_refcount == 0)
                swevent_hlist_release(swhash);

        mutex_unlock(&swhash->hlist_mutex);
}

static void swevent_hlist_put(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                swevent_hlist_put_cpu(cpu);
}

/*
 * Take one reference on @cpu's hash list, allocating the list first when
 * none is installed and @cpu is set in perf_online_mask.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails; on failure the
 * reference count is left untouched.
 */
static int swevent_hlist_get_cpu(int cpu)
{
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
        struct swevent_hlist *fresh;
        int err = 0;

        mutex_lock(&swhash->hlist_mutex);

        /* Nothing to allocate: a list already exists or the CPU is not online. */
        if (swevent_hlist_deref(swhash) ||
            !cpumask_test_cpu(cpu, perf_online_mask))
                goto take_ref;

        fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);
        if (!fresh) {
                err = -ENOMEM;
                goto unlock;
        }
        rcu_assign_pointer(swhash->swevent_hlist, fresh);

take_ref:
        swhash->hlist_refcount++;
unlock:
        mutex_unlock(&swhash->hlist_mutex);

        return err;
}

/*
 * Take one hash-list reference on every possible CPU, under pmus_lock.
 *
 * If any CPU fails, the references already taken on the CPUs visited before
 * it are dropped again and the error is returned. Returns 0 on success.
 */
static int swevent_hlist_get(void)
{
        int err = 0, cpu, failed_cpu = -1;

        mutex_lock(&pmus_lock);

        for_each_possible_cpu(cpu) {
                err = swevent_hlist_get_cpu(cpu);
                if (err) {
                        failed_cpu = cpu;
                        break;
                }
        }

        if (err) {
                /* Roll back every CPU that was handled before the failing one. */
                for_each_possible_cpu(cpu) {
                        if (cpu == failed_cpu)
                                break;
                        swevent_hlist_put_cpu(cpu);
                }
        }

        mutex_unlock(&pmus_lock);

        return err;
}

/*
 * One static key per software event id (PERF_COUNT_SW_MAX entries).
 * perf_swevent_init() increments the key for an event's id and
 * sw_perf_event_destroy() decrements it again.
 */
struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];

/*
 * ->destroy() hook installed by perf_swevent_init() for parent-less events:
 * decrements the per-id static key and drops the hash-list references.
 * Warns if called on an event that has a parent.
 */
static void sw_perf_event_destroy(struct perf_event *event)
{
        WARN_ON(event->parent);

        static_key_slow_dec(&perf_swevent_enabled[event->attr.config]);
        swevent_hlist_put();
}

/*
 * Forward declarations: perf_swevent_init() below reads the .type of both
 * PMUs before their definitions appear.
 */
static struct pmu perf_cpu_clock; /* fwd declaration */
static struct pmu perf_task_clock;

/*
 * ->event_init() for the generic software PMU.
 *
 * Returns:
 *   -ENOENT      the event is not PERF_TYPE_SOFTWARE, its id is out of
 *                range, or it is a CPU/task clock event (in that case
 *                attr.type is first rewritten to the matching clock PMU's
 *                type; presumably so the core retries with that PMU --
 *                confirm against the caller)
 *   -EOPNOTSUPP  branch stack sampling was requested
 *   other < 0    swevent_hlist_get() failed
 *   0            success
 *
 * Only events without a parent take hash-list references, bump the per-id
 * static key and get the sw_perf_event_destroy() hook.
 */
static int perf_swevent_init(struct perf_event *event)
{
        u64 event_id = event->attr.config;
        int err;

        if (event->attr.type != PERF_TYPE_SOFTWARE)
                return -ENOENT;

        /* Software events cannot do branch sampling. */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        if (event_id == PERF_COUNT_SW_CPU_CLOCK) {
                event->attr.type = perf_cpu_clock.type;
                return -ENOENT;
        }

        if (event_id == PERF_COUNT_SW_TASK_CLOCK) {
                event->attr.type = perf_task_clock.type;
                return -ENOENT;
        }

        if (event_id >= PERF_COUNT_SW_MAX)
                return -ENOENT;

        if (event->parent)
                return 0;

        err = swevent_hlist_get();
        if (err)
                return err;

        static_key_slow_inc(&perf_swevent_enabled[event_id]);
        event->destroy = sw_perf_event_destroy;

        return 0;
}

/*
 * The generic software event PMU: lives in the software task context,
 * advertises PERF_PMU_CAP_NO_NMI, and wires up the perf_swevent_*()
 * callbacks defined above.
 */
static struct pmu perf_swevent = {
        .task_ctx_nr        = perf_sw_context,

        .capabilities        = PERF_PMU_CAP_NO_NMI,

        .event_init        = perf_swevent_init,
        .add                = perf_swevent_add,
        .del                = perf_swevent_del,
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
};

#ifdef CONFIG_EVENT_TRACING

/*
 * ->destroy() hook for tracepoint events (installed by
 * perf_tp_event_init()); simply forwards to perf_trace_destroy().
 */
static void tp_perf_event_destroy(struct perf_event *event)
{
        perf_trace_destroy(event);
}

/*
 * ->event_init() for the tracepoint PMU.
 *
 * Returns -ENOENT for events that are not PERF_TYPE_TRACEPOINT,
 * -EOPNOTSUPP when branch stack sampling is requested, the error from
 * perf_trace_init() if that fails, and 0 on success (after installing the
 * destroy hook).
 */
static int perf_tp_event_init(struct perf_event *event)
{
        int err;

        if (event->attr.type != PERF_TYPE_TRACEPOINT)
                return -ENOENT;

        /* Tracepoint events cannot do branch sampling. */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        err = perf_trace_init(event);
        if (!err)
                event->destroy = tp_perf_event_destroy;

        return err;
}

/*
 * The tracepoint PMU: software task context, perf_trace_add()/_del() for
 * scheduling, and the software-event start/stop/read callbacks.
 */
static struct pmu perf_tracepoint = {
        .task_ctx_nr        = perf_sw_context,

        .event_init        = perf_tp_event_init,
        .add                = perf_trace_add,
        .del                = perf_trace_del,
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
};

/*
 * Apply the event's tracepoint filter to the raw record carried in @data.
 *
 * Filters are only set on top-level events, so a child event is checked
 * against its parent's filter. Returns 1 when there is no filter or the
 * record satisfies it, 0 otherwise. @data->raw must be valid.
 */
static int perf_tp_filter_match(struct perf_event *event,
                                struct perf_sample_data *data)
{
        void *record = data->raw->frag.data;
        struct perf_event *owner = event->parent ? event->parent : event;

        if (likely(!owner->filter))
                return 1;

        return filter_match_preds(owner->filter, record) ? 1 : 0;
}

/*
 * Decide whether a tracepoint hit should be delivered to @event.
 *
 * Returns 0 when the event is stopped, when it excludes the kernel and
 * @regs is not user mode, or when its filter rejects the record; returns 1
 * otherwise.
 */
static int perf_tp_event_match(struct perf_event *event,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
{
        if (event->hw.state & PERF_HES_STOPPED)
                return 0;

        /* With exclude_kernel, only user-space tracepoints (uprobes) count. */
        if (event->attr.exclude_kernel && !user_mode(regs))
                return 0;

        return perf_tp_filter_match(event, data) ? 1 : 0;
}

/*
 * Run any BPF programs attached to @call and then submit the record to
 * perf.
 *
 * When @call has a valid program array, @regs is stored at the start of
 * @raw_data and the programs are run. If they return 0, or if @head is
 * empty, the record is dropped and only the recursion context @rctx is
 * released. Otherwise the record goes to perf_tp_event(), which releases
 * @rctx itself.
 */
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
                               struct trace_event_call *call, u64 count,
                               struct pt_regs *regs, struct hlist_head *head,
                               struct task_struct *task)
{
        if (!bpf_prog_array_valid(call))
                goto submit;

        *(struct pt_regs **)raw_data = regs;
        if (trace_call_bpf(call, raw_data) && !hlist_empty(head))
                goto submit;

        perf_swevent_put_recursion_context(rctx);
        return;

submit:
        perf_tp_event(call->event.type, count, raw_data, size, regs, head,
                      rctx, task);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);

/*
 * Deliver a tracepoint record to one event of a target task's context.
 *
 * The event is skipped when its attr.config differs from the record's
 * trace entry type, when it uses sigtrap (a synchronous signal cannot be
 * delivered to another task), or when perf_tp_event_match() rejects it.
 */
static void __perf_tp_event_target_task(u64 count, void *record,
                                        struct pt_regs *regs,
                                        struct perf_sample_data *data,
                                        struct perf_event *event)
{
        struct trace_entry *entry = record;

        if (event->attr.config != entry->type || event->attr.sigtrap)
                return;

        if (!perf_tp_event_match(event, data, regs))
                return;

        perf_swevent_event(event, count, data, regs);
}

/*
 * Offer a tracepoint record to every tracepoint-PMU event of @ctx that is
 * grouped for the current CPU: first the pinned groups, then the flexible
 * ones, visiting each group leader followed by its siblings.
 */
static void perf_tp_event_target_task(u64 count, void *record,
                                      struct pt_regs *regs,
                                      struct perf_sample_data *data,
                                      struct perf_event_context *ctx)
{
        unsigned int cpu = smp_processor_id();
        struct pmu *pmu = &perf_tracepoint;
        struct perf_event *leader, *member;

        perf_event_groups_for_cpu_pmu(leader, &ctx->pinned_groups, cpu, pmu) {
                __perf_tp_event_target_task(count, record, regs, data, leader);
                for_each_sibling_event(member, leader) {
                        __perf_tp_event_target_task(count, record, regs, data, member);
                }
        }

        perf_event_groups_for_cpu_pmu(leader, &ctx->flexible_groups, cpu, pmu) {
                __perf_tp_event_target_task(count, record, regs, data, leader);
                for_each_sibling_event(member, leader) {
                        __perf_tp_event_target_task(count, record, regs, data, member);
                }
        }
}

/*
 * Deliver one tracepoint hit to perf.
 *
 * @event_type: passed to perf_trace_buf_update() together with @record
 * @count:      forwarded to perf_swevent_event() for every matching event
 * @record:     trace record; attached to the sample as raw data
 * @entry_size: size recorded for @record in the raw fragment
 * @regs:       register state used for matching and sampling
 * @head:       list of events to offer the record to
 * @rctx:       recursion context; released here before returning
 * @task:       optional target task; when set and not current, the events
 *              in that task's perf context are offered the record as well
 */
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
                   struct pt_regs *regs, struct hlist_head *head, int rctx,
                   struct task_struct *task)
{
        struct perf_sample_data data;
        struct perf_event *event;
        struct perf_raw_record raw = {
                .frag = {
                        .size = entry_size,
                        .data = record,
                },
        };

        perf_sample_data_init(&data, 0, 0);
        perf_sample_save_raw_data(&data, &raw);

        perf_trace_buf_update(record, event_type);

        hlist_for_each_entry_rcu(event, head, hlist_entry) {
                if (!perf_tp_event_match(event, &data, regs))
                        continue;

                perf_swevent_event(event, count, &data, regs);

                /*
                 * The same on-stack perf_sample_data is reused for every
                 * event, but some of its members are event-specific and
                 * must be recomputed for different swevents. Re-initialize
                 * it so that the next event does not skip preparing data
                 * just because data->sample_flags is already set.
                 */
                perf_sample_data_init(&data, 0, 0);
                perf_sample_save_raw_data(&data, &raw);
        }

        /*
         * If a target task was specified, also walk its context and
         * deliver this event there too.
         */
        if (task && task != current) {
                struct perf_event_context *ctx;

                rcu_read_lock();
                ctx = rcu_dereference(task->perf_event_ctxp);
                if (ctx) {
                        raw_spin_lock(&ctx->lock);
                        perf_tp_event_target_task(count, record, regs, &data, ctx);
                        raw_spin_unlock(&ctx->lock);
                }
                rcu_read_unlock();
        }

        perf_swevent_put_recursion_context(rctx);
}
EXPORT_SYMBOL_GPL(perf_tp_event);

#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
/*
 * Flags in config, used by the dynamic kprobe and uprobe PMUs.
 * The flags must match the PMU_FORMAT_ATTR() definitions that follow.
 *
 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
 *                               if not set, create kprobe/uprobe
 *
 * The following values describe the reference counter (or semaphore, in the
 * terminology of tools like dtrace, systemtap, etc.) of Userspace Statically
 * Defined Tracepoints (USDT). Currently, 32 bits are used for the offset
 * (the upper half of the 64-bit config).
 *
 * PERF_UPROBE_REF_CTR_OFFSET_BITS        # of bits in config used for the offset
 * PERF_UPROBE_REF_CTR_OFFSET_SHIFT        # of bits the offset is shifted left
 *                                        within config
 */
enum perf_probe_config {
        PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
        PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
        PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
};

/* Format attribute "retprobe": bit 0 of config. */
PMU_FORMAT_ATTR(retprobe, "config:0");
#endif

#ifdef CONFIG_KPROBE_EVENTS
/* Format attributes of the kprobe PMU: only "retprobe". */
static struct attribute *kprobe_attrs[] = {
        &format_attr_retprobe.attr,
        NULL,
};

/* The attributes above are grouped under the name "format". */
static struct attribute_group kprobe_format_group = {
        .name = "format",
        .attrs = kprobe_attrs,
};

/* NULL-terminated group list referenced by perf_kprobe.attr_groups. */
static const struct attribute_group *kprobe_attr_groups[] = {
        &kprobe_format_group,
        NULL,
};

/* Declared early because the PMU definition below refers to it. */
static int perf_kprobe_event_init(struct perf_event *event);

/*
 * The kprobe PMU: same scheduling and start/stop/read callbacks as the
 * tracepoint PMU, plus its own event_init and format attribute groups.
 */
static struct pmu perf_kprobe = {
        .task_ctx_nr        = perf_sw_context,
        .event_init        = perf_kprobe_event_init,
        .add                = perf_trace_add,
        .del                = perf_trace_del,
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
        .attr_groups        = kprobe_attr_groups,
};

/*
 * ->event_init() for the kprobe PMU.
 *
 * Returns -ENOENT when the event's type is not this PMU's, -EACCES when the
 * caller is not perfmon_capable(), -EOPNOTSUPP when branch stack sampling
 * is requested, the error from perf_kprobe_init() if that fails, and 0 on
 * success. Bit 0 of attr.config selects a return probe.
 */
static int perf_kprobe_event_init(struct perf_event *event)
{
        bool is_retprobe;
        int err;

        if (event->attr.type != perf_kprobe.type)
                return -ENOENT;

        if (!perfmon_capable())
                return -EACCES;

        /* Probe events cannot do branch sampling. */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;

        err = perf_kprobe_init(event, is_retprobe);
        if (!err)
                event->destroy = perf_kprobe_destroy;

        return err;
}
#endif /* CONFIG_KPROBE_EVENTS */

#ifdef CONFIG_UPROBE_EVENTS
/* Format attribute "ref_ctr_offset": bits 32-63 of config. */
PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");

/* Format attributes of the uprobe PMU: "retprobe" and "ref_ctr_offset". */
static struct attribute *uprobe_attrs[] = {
        &format_attr_retprobe.attr,
        &format_attr_ref_ctr_offset.attr,
        NULL,
};

/* The attributes above are grouped under the name "format". */
static struct attribute_group uprobe_format_group = {
        .name = "format",
        .attrs = uprobe_attrs,
};

/* NULL-terminated group list referenced by perf_uprobe.attr_groups. */
static const struct attribute_group *uprobe_attr_groups[] = {
        &uprobe_format_group,
        NULL,
};

/* Declared early because the PMU definition below refers to it. */
static int perf_uprobe_event_init(struct perf_event *event);

/*
 * The uprobe PMU: same scheduling and start/stop/read callbacks as the
 * tracepoint PMU, plus its own event_init and format attribute groups.
 */
static struct pmu perf_uprobe = {
        .task_ctx_nr        = perf_sw_context,
        .event_init        = perf_uprobe_event_init,
        .add                = perf_trace_add,
        .del                = perf_trace_del,
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
        .attr_groups        = uprobe_attr_groups,
};

/*
 * ->event_init() for the uprobe PMU.
 *
 * Returns -ENOENT when the event's type is not this PMU's, -EACCES when the
 * caller is not perfmon_capable(), -EOPNOTSUPP when branch stack sampling
 * is requested, the error from perf_uprobe_init() if that fails, and 0 on
 * success. Bit 0 of attr.config selects a return probe; the bits above
 * PERF_UPROBE_REF_CTR_OFFSET_SHIFT carry the reference counter offset.
 */
static int perf_uprobe_event_init(struct perf_event *event)
{
        unsigned long ref_ctr_offset;
        bool is_retprobe;
        int err;

        if (event->attr.type != perf_uprobe.type)
                return -ENOENT;

        if (!perfmon_capable())
                return -EACCES;

        /* Probe events cannot do branch sampling. */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
        ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;

        err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
        if (!err)
                event->destroy = perf_uprobe_destroy;

        return err;
}
#endif /* CONFIG_UPROBE_EVENTS */

/*
 * Register the tracing PMUs: "tracepoint" with the fixed type
 * PERF_TYPE_TRACEPOINT and, when configured, "kprobe" and "uprobe" with -1
 * as the type argument. The return values of perf_pmu_register() are not
 * checked here.
 */
static inline void perf_tp_register(void)
{
        perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
#ifdef CONFIG_KPROBE_EVENTS
        perf_pmu_register(&perf_kprobe, "kprobe", -1);
#endif
#ifdef CONFIG_UPROBE_EVENTS
        perf_pmu_register(&perf_uprobe, "uprobe", -1);
#endif
}

/*
 * Free the event's tracepoint filter by forwarding to
 * ftrace_profile_free_filter().
 */
static void perf_event_free_filter(struct perf_event *event)
{
        ftrace_profile_free_filter(event);
}

/*
 * Returns true if the event is a tracepoint, or a kprobe/uprobe created
 * with perf_event_open(), i.e. if its PMU is one of the tracing PMUs
 * defined in this file.
 */
static inline bool perf_event_is_tracing(struct perf_event *event)
{
        struct pmu *pmu = event->pmu;

        if (pmu == &perf_tracepoint)
                return true;
#ifdef CONFIG_KPROBE_EVENTS
        if (pmu == &perf_kprobe)
                return true;
#endif
#ifdef CONFIG_UPROBE_EVENTS
        if (pmu == &perf_uprobe)
                return true;
#endif
        return false;
}

/*
 * Attach BPF program @prog (with @bpf_cookie) to @event.
 *
 * Non-tracing events are handed to perf_event_set_bpf_handler(). For
 * tracing events the program type must fit the event kind:
 *   - kprobe/uprobe events take BPF_PROG_TYPE_KPROBE programs
 *   - tracepoint and syscall tracepoint events take
 *     BPF_PROG_TYPE_TRACEPOINT programs
 *
 * Returns -EINVAL on a kind/type mismatch, for a sleepable KPROBE program
 * on anything but a uprobe, or for a kprobe-override program on anything
 * but a kprobe; -EACCES when the program accesses context beyond the
 * tracepoint's record; otherwise the result of
 * perf_event_attach_bpf_prog().
 */
int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
                            u64 bpf_cookie)
{
        bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;

        if (!perf_event_is_tracing(event))
                return perf_event_set_bpf_handler(event, prog, bpf_cookie);

        is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
        is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
        is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
        is_syscall_tp = is_syscall_trace_event(event->tp_event);

        /* bpf programs can only be attached to u/kprobe or tracepoint */
        if (!(is_kprobe || is_uprobe || is_tracepoint || is_syscall_tp))
                return -EINVAL;

        /* Probes require a KPROBE program ... */
        if ((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE)
                return -EINVAL;

        /* ... and (syscall) tracepoints require a TRACEPOINT program. */
        if ((is_tracepoint || is_syscall_tp) &&
            prog->type != BPF_PROG_TYPE_TRACEPOINT)
                return -EINVAL;

        /* only uprobe programs are allowed to be sleepable */
        if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe)
                return -EINVAL;

        /* Kprobe override only works for kprobes, not uprobes. */
        if (prog->kprobe_override && !is_kprobe)
                return -EINVAL;

        if (is_tracepoint || is_syscall_tp) {
                int off = trace_event_get_offsets(event->tp_event);

                if (prog->aux->max_ctx_offset > off)
                        return -EACCES;
        }

        return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
}

/*
 * Undo perf_event_set_bpf_prog(): tracing events detach their program,
 * all other events free their BPF handler.
 */
void perf_event_free_bpf_prog(struct perf_event *event)
{
        if (perf_event_is_tracing(event))
                perf_event_detach_bpf_prog(event);
        else
                perf_event_free_bpf_handler(event);
}

#else

/* Stub for builds without CONFIG_EVENT_TRACING: no tracing PMUs to register. */
static inline void perf_tp_register(void)
{
}

/* Stub for builds without CONFIG_EVENT_TRACING: does nothing. */
static void perf_event_free_filter(struct perf_event *event)
{
}

/*
 * Stub for builds without CONFIG_EVENT_TRACING: attaching a BPF program
 * always fails with -ENOENT.
 */
int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
                            u64 bpf_cookie)
{
        return -ENOENT;
}

/* Stub for builds without CONFIG_EVENT_TRACING: does nothing. */
void perf_event_free_bpf_prog(struct perf_event *event)
{
}
#endif /* CONFIG_EVENT_TRACING */

#ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
 * Report a hit of hardware breakpoint event @bp.
 *
 * @data is interpreted as the struct pt_regs of the hit. The sample is
 * initialized with the breakpoint address, and the event is counted once
 * unless hw.state is non-zero or perf_exclude_event() rejects it.
 */
void perf_bp_event(struct perf_event *bp, void *data)
{
        struct perf_sample_data sample;
        struct pt_regs *regs = data;

        perf_sample_data_init(&sample, bp->attr.bp_addr, 0);

        if (bp->hw.state)
                return;

        if (perf_exclude_event(bp, regs))
                return;

        perf_swevent_event(bp, 1, &sample, regs);
}
#endif

/*
 * Allocate a new, zeroed address filter and append it to @filters.
 *
 * The allocation is made on the NUMA node of the event's CPU (CPU 0's node
 * when event->cpu is -1). Returns the new filter, or NULL when the
 * allocation fails, in which case @filters is left unchanged.
 */
static struct perf_addr_filter *
perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
{
        int cpu = event->cpu == -1 ? 0 : event->cpu;
        struct perf_addr_filter *filter;

        filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, cpu_to_node(cpu));
        if (filter) {
                INIT_LIST_HEAD(&filter->entry);
                list_add_tail(&filter->entry, filters);
        }

        return filter;
}

/*
 * Free every address filter on @filters: drop its path reference, unlink
 * it and free it. The list is empty afterwards.
 */
static void free_filters_list(struct list_head *filters)
{
        struct perf_addr_filter *pos, *next;

        list_for_each_entry_safe(pos, next, filters, entry) {
                path_put(&pos->path);
                list_del(&pos->entry);
                kfree(pos);
        }
}

/*
 * Free existing address filters and optionally install new ones
 */
static void perf_addr_filters_splice(struct perf_event *event,
                                     struct list_head *head)
{
        unsigned long flags;
        LIST_HEAD(list);

        if (!has_addr_filter(event))
                return;

        /* don't bother with children, they don't have their own filters */
        if (event->parent)
                return;

        raw_spin_lock_irqsave(&event->addr_filters.lock, flags);

        list_splice_init(&event->addr_filters.list, &list);
        if (head)
                list_splice(head, &event->addr_filters.list);

        raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);

        free_filters_list(&list);
}

/*
 * Walk the VMAs of @mm and stop at the first file-backed one for which
 * perf_addr_filter_vma_adjust() reports a match with @filter; that helper
 * is handed @fr, the filter's address range, to adjust.
 * Called with mm::mmap_lock held for reading.
 */
static void perf_addr_filter_apply(struct perf_addr_filter *filter,
                                   struct mm_struct *mm,
                                   struct perf_addr_filter_range *fr)
{
        struct vm_area_struct *vma;
        VMA_ITERATOR(vmi, mm, 0);

        for_each_vma(vmi, vma) {
                /* Only file-backed mappings can match. */
                if (vma->vm_file &&
                    perf_addr_filter_vma_adjust(filter, vma, fr))
                        return;
        }
}

/*
 * Recompute the event's address filter ranges from its filter list and the
 * task's current mappings (if any), then restart the event.
 *
 * File-based filters start out as an empty range and are resolved against
 * the task's VMAs; other filters copy their offset/size verbatim. The
 * filter generation counter is bumped under the filters lock.
 */
static void perf_event_addr_filters_apply(struct perf_event *event)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
        struct task_struct *task = READ_ONCE(event->ctx->task);
        struct perf_addr_filter *filter;
        struct mm_struct *mm = NULL;
        unsigned int idx = 0;
        unsigned long flags;

        /*
         * We may observe TASK_TOMBSTONE, which means that the event tear-down
         * will stop on the parent's child_mutex that our caller is also holding
         */
        if (task == TASK_TOMBSTONE)
                return;

        if (ifh->nr_file_filters) {
                mm = get_task_mm(task);
                if (!mm) {
                        /* No mm to scan: skip the ranges and only restart the event. */
                        perf_event_stop(event, 1);
                        return;
                }

                mmap_read_lock(mm);
        }

        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
                if (!filter->path.dentry) {
                        event->addr_filter_ranges[idx].start = filter->offset;
                        event->addr_filter_ranges[idx].size  = filter->size;
                } else {
                        /*
                         * The filter is tied to a binary that needs to be
                         * mapped: start empty and let the VMA scan adjust
                         * the base offset.
                         */
                        event->addr_filter_ranges[idx].start = 0;
                        event->addr_filter_ranges[idx].size = 0;

                        perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[idx]);
                }

                idx++;
        }

        event->addr_filters_gen++;
        raw_spin_unlock_irqrestore(&ifh->lock, flags);

        if (ifh->nr_file_filters) {
                mmap_read_unlock(mm);

                mmput(mm);
        }

        perf_event_stop(event, 1);
}

/*
 * Address range filtering: limiting the data to certain
 * instruction address ranges. Filters are ioctl()ed to us from
 * userspace as ASCII strings.
 *
 * Filter string format:
 *
 * ACTION RANGE_SPEC
 * where ACTION is one of the
 *  * "filter": limit the trace to this region
 *  * "start": start tracing from this address
 *  * "stop": stop tracing at this address/region;
 * RANGE_SPEC is
 *  * for kernel addresses: <start address>[/<size>]
 *  * for object files:     <start address>[/<size>]@</path/to/object/file>
 *
 * If <size> is not specified or is zero, the range is treated as a single
 * address; this is not valid for ACTION=="filter".
 *
 * The enum below is the token space used with if_tokens: IF_ACT_* are the
 * ACTION keywords, IF_SRC_* are the RANGE_SPEC shapes (with or without a
 * size, with or without a file), and IF_ACT_NONE terminates the table.
 */
enum {
        IF_ACT_NONE = -1,
        IF_ACT_FILTER,
        IF_ACT_START,
        IF_ACT_STOP,
        IF_SRC_FILE,
        IF_SRC_KERNEL,
        IF_SRC_FILEADDR,
        IF_SRC_KERNELADDR,
};

/*
 * Parser states of perf_event_parse_addr_filter(): expecting an ACTION
 * keyword, expecting a RANGE_SPEC, and "one filter definition complete".
 */
enum {
        IF_STATE_ACTION = 0,
        IF_STATE_SOURCE,
        IF_STATE_END,
};

/*
 * match_token() table for address filter strings: the three ACTION
 * keywords followed by the four RANGE_SPEC patterns. The entry with a NULL
 * pattern terminates the table.
 */
static const match_table_t if_tokens = {
        { IF_ACT_FILTER,        "filter" },
        { IF_ACT_START,                "start" },
        { IF_ACT_STOP,                "stop" },
        { IF_SRC_FILE,                "%u/%u@%s" },
        { IF_SRC_KERNEL,        "%u/%u" },
        { IF_SRC_FILEADDR,        "%u@%s" },
        { IF_SRC_KERNELADDR,        "%u" },
        { IF_ACT_NONE,                NULL },
};

/*
 * Address filter string parser
 *
 * @event:   event the filters are meant for
 * @fstr:    filter string; it is duplicated internally, the caller's copy
 *           is not modified
 * @filters: list that receives one perf_addr_filter per parsed definition
 *
 * The string is split on spaces, commas and newlines and consumed as a
 * sequence of "ACTION RANGE_SPEC" pairs.
 *
 * Returns 0 on success, or a negative errno:
 *   -ENOMEM      an allocation failed (string copy, filter, file name)
 *   -EINVAL      malformed string, "filter" action without a size, a file
 *                based filter without a file name, or a path that is not a
 *                regular file
 *   -EOPNOTSUPP  file-based filter on an event that has no task
 *   or the error returned by kstrtoul() / kern_path().
 *
 * On failure every filter on @filters is freed. The
 * event->addr_filters.nr_file_filters counter is incremented for each
 * file-based filter and is NOT rolled back here on failure.
 */
static int
perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                             struct list_head *filters)
{
        struct perf_addr_filter *filter = NULL;
        char *start, *orig, *filename = NULL;
        substring_t args[MAX_OPT_ARGS];
        int state = IF_STATE_ACTION, token;
        unsigned int kernel = 0;
        int ret = -EINVAL;

        /* strsep() modifies its input, so work on a private copy. */
        orig = fstr = kstrdup(fstr, GFP_KERNEL);
        if (!fstr)
                return -ENOMEM;

        while ((start = strsep(&fstr, " ,\n")) != NULL) {
                static const enum perf_addr_filter_action_t actions[] = {
                        [IF_ACT_FILTER]        = PERF_ADDR_FILTER_ACTION_FILTER,
                        [IF_ACT_START]        = PERF_ADDR_FILTER_ACTION_START,
                        [IF_ACT_STOP]        = PERF_ADDR_FILTER_ACTION_STOP,
                };
                ret = -EINVAL;

                /* Consecutive separators yield empty tokens; skip them. */
                if (!*start)
                        continue;

                /* filter definition begins */
                if (state == IF_STATE_ACTION) {
                        filter = perf_addr_filter_new(event, filters);
                        if (!filter) {
                                /*
                                 * Allocation failure, not a malformed
                                 * string: report it like the other
                                 * allocation failures in this function.
                                 */
                                ret = -ENOMEM;
                                goto fail;
                        }
                }

                token = match_token(start, if_tokens, args);
                switch (token) {
                case IF_ACT_FILTER:
                case IF_ACT_START:
                case IF_ACT_STOP:
                        if (state != IF_STATE_ACTION)
                                goto fail;

                        filter->action = actions[token];
                        state = IF_STATE_SOURCE;
                        break;

                case IF_SRC_KERNELADDR:
                case IF_SRC_KERNEL:
                        kernel = 1;
                        fallthrough;

                case IF_SRC_FILEADDR:
                case IF_SRC_FILE:
                        if (state != IF_STATE_SOURCE)
                                goto fail;

                        /* Terminate the first number in place for kstrtoul(). */
                        *args[0].to = 0;
                        ret = kstrtoul(args[0].from, 0, &filter->offset);
                        if (ret)
                                goto fail;

                        if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
                                *args[1].to = 0;
                                ret = kstrtoul(args[1].from, 0, &filter->size);
                                if (ret)
                                        goto fail;
                        }

                        if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
                                /* The path is the argument after the number(s). */
                                int fpos = token == IF_SRC_FILE ? 2 : 1;

                                kfree(filename);
                                filename = match_strdup(&args[fpos]);
                                if (!filename) {
                                        ret = -ENOMEM;
                                        goto fail;
                                }
                        }

                        state = IF_STATE_END;
                        break;

                default:
                        goto fail;
                }

                /*
                 * Filter definition is fully parsed, validate and install it.
                 * Make sure that it doesn't contradict itself or the event's
                 * attribute.
                 */
                if (state == IF_STATE_END) {
                        ret = -EINVAL;

                        /*
                         * ACTION "filter" must have a non-zero length region
                         * specified.
                         */
                        if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
                            !filter->size)
                                goto fail;

                        if (!kernel) {
                                if (!filename)
                                        goto fail;

                                /*
                                 * For now, we only support file-based filters
                                 * in per-task events; doing so for CPU-wide
                                 * events requires additional context switching
                                 * trickery, since same object code will be
                                 * mapped at different virtual addresses in
                                 * different processes.
                                 */
                                ret = -EOPNOTSUPP;
                                if (!event->ctx->task)
                                        goto fail;

                                /* look up the path and grab its inode */
                                ret = kern_path(filename, LOOKUP_FOLLOW,
                                                &filter->path);
                                if (ret)
                                        goto fail;

                                ret = -EINVAL;
                                if (!filter->path.dentry ||
                                    !S_ISREG(d_inode(filter->path.dentry)
                                             ->i_mode))
                                        goto fail;

                                event->addr_filters.nr_file_filters++;
                        }

                        /* ready to consume more filters */
                        kfree(filename);
                        filename = NULL;
                        state = IF_STATE_ACTION;
                        filter = NULL;
                        kernel = 0;
                }
        }

        /* The string ended in the middle of a filter definition. */
        if (state != IF_STATE_ACTION)
                goto fail;

        kfree(filename);
        kfree(orig);

        return 0;

fail:
        kfree(filename);
        free_filters_list(filters);
        kfree(orig);

        return ret;
}

/*
 * Parse @filter_str into a fresh filter list, let the PMU validate it and,
 * if both steps succeed, swap it in for the event's current filters and
 * apply it to the event and its children.
 *
 * Must be called with event->ctx->mutex held (asserted below); only valid
 * on a parent (non-inherited) event.
 *
 * Returns 0 on success or the error reported by parsing/validation. On any
 * failure the event's nr_file_filters counter is reset to zero.
 */
static int
perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
{
        LIST_HEAD(filters);
        int err;

        /* The perf_ioctl() path already took ctx::mutex for us. */
        lockdep_assert_held(&event->ctx->mutex);

        if (WARN_ON_ONCE(event->parent))
                return -EINVAL;

        err = perf_event_parse_addr_filter(event, filter_str, &filters);
        if (!err) {
                err = event->pmu->addr_filters_validate(&filters);
                if (!err) {
                        /* drop whatever was installed before ... */
                        perf_addr_filters_splice(event, &filters);

                        /* ... and push the new set to every child */
                        perf_event_for_each_child(event,
                                                  perf_event_addr_filters_apply);
                        return 0;
                }

                /* validation rejected the list: release what we parsed */
                free_filters_list(&filters);
        }

        event->addr_filters.nr_file_filters = 0;

        return err;
}

/*
 * Install a filter string supplied by userspace on @event.
 *
 * @arg is copied in (at most PAGE_SIZE bytes) and then routed either to the
 * tracing filter code (tracing events, CONFIG_EVENT_TRACING only) or to the
 * address-filter code (events for which has_addr_filter() is true).
 *
 * Returns the copy-in error, the result of the selected filter backend, or
 * -EINVAL when the event supports neither kind of filter.
 */
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
        int ret = -EINVAL;
        char *filter_str;

        filter_str = strndup_user(arg, PAGE_SIZE);
        if (IS_ERR(filter_str))
                return PTR_ERR(filter_str);

#ifdef CONFIG_EVENT_TRACING
        if (perf_event_is_tracing(event)) {
                struct perf_event_context *ctx = event->ctx;

                /*
                 * Beware, here be dragons!!
                 *
                 * the tracepoint muck will deadlock against ctx->mutex, but
                 * the tracepoint stuff does not actually need it. So
                 * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
                 * already have a reference on ctx.
                 *
                 * This can result in event getting moved to a different ctx,
                 * but that does not affect the tracepoint state.
                 */
                mutex_unlock(&ctx->mutex);
                ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
                mutex_lock(&ctx->mutex);
        } else
#endif
        /* NB: with CONFIG_EVENT_TRACING this 'if' is the body of the 'else' above. */
        if (has_addr_filter(event))
                ret = perf_event_set_addr_filter(event, filter_str);

        /* The copied string is ours; the backends do not keep it. */
        kfree(filter_str);
        return ret;
}

/*
 * hrtimer based swevent callback
 */

static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
{
        enum hrtimer_restart ret = HRTIMER_RESTART;
        struct perf_sample_data data;
        struct pt_regs *regs;
        struct perf_event *event;
        u64 period;

        event = container_of(hrtimer, struct perf_event, hw.hrtimer);

        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return HRTIMER_NORESTART;

        event->pmu->read(event);

        perf_sample_data_init(&data, 0, event->hw.last_period);
        regs = get_irq_regs();

        if (regs && !perf_exclude_event(event, regs)) {
                if (!(event->attr.exclude_idle && is_idle_task(current)))
                        if (__perf_event_overflow(event, 1, &data, regs))
                                ret = HRTIMER_NORESTART;
        }

        period = max_t(u64, 10000, event->hw.sample_period);
        hrtimer_forward_now(hrtimer, ns_to_ktime(period));

        return ret;
}

/*
 * Arm the per-event hrtimer of a sampling software event.
 *
 * If a remainder was saved in hw.period_left it is consumed (a negative
 * remainder is replaced by 10000ns); otherwise the full sample period is
 * used, clamped to at least 10000ns. Non-sampling events are left alone.
 */
static void perf_swevent_start_hrtimer(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        s64 delay;

        if (!is_sampling_event(event))
                return;

        delay = local64_read(&hwc->period_left);
        if (!delay) {
                delay = max_t(u64, 10000, hwc->sample_period);
        } else {
                if (delay < 0)
                        delay = 10000;

                /* the saved remainder is used up now */
                local64_set(&hwc->period_left, 0);
        }

        hrtimer_start(&hwc->hrtimer, ns_to_ktime(delay),
                      HRTIMER_MODE_REL_PINNED_HARD);
}

/*
 * Stop the hrtimer of a sampling software event, first stashing the time
 * that was left until expiry in hw.period_left so a later start can resume
 * from there. No-op for non-sampling events.
 */
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        ktime_t left;

        if (!is_sampling_event(event))
                return;

        left = hrtimer_get_remaining(&hwc->hrtimer);
        local64_set(&hwc->period_left, ktime_to_ns(left));

        hrtimer_cancel(&hwc->hrtimer);
}

/*
 * One-time hrtimer setup for a sampling software event.
 *
 * For frequency-based events the requested frequency is converted to a
 * fixed period here and attr.freq is cleared, so the event is treated as
 * period-based from then on.
 */
static void perf_swevent_init_hrtimer(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        long hz;

        if (!is_sampling_event(event))
                return;

        hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        hwc->hrtimer.function = perf_swevent_hrtimer;

        if (!event->attr.freq)
                return;

        /*
         * hrtimers tick at a fixed rate, so a static freq->period mapping
         * is enough; no period-adjust feedback loop is needed.
         */
        hz = event->attr.sample_freq;

        event->attr.sample_period = NSEC_PER_SEC / hz;
        hwc->sample_period = event->attr.sample_period;
        hwc->last_period = hwc->sample_period;
        local64_set(&hwc->period_left, hwc->sample_period);
        event->attr.freq = 0;
}

/*
 * Software event: cpu wall time clock
 */

/*
 * Fold the local_clock() time elapsed since the previous snapshot into
 * event->count and make "now" the new snapshot.
 */
static void cpu_clock_event_update(struct perf_event *event)
{
        u64 now = local_clock();
        s64 prev = local64_xchg(&event->hw.prev_count, now);

        local64_add(now - prev, &event->count);
}

static void cpu_clock_event_start(struct perf_event *event, int flags)
{
        local64_set(&event->hw.prev_count, local_clock());
        perf_swevent_start_hrtimer(event);
}

/* pmu::stop: cancel the sampling timer, then account the time up to now. */
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
        perf_swevent_cancel_hrtimer(event);

        cpu_clock_event_update(event);
}

/*
 * pmu::add: optionally start counting (PERF_EF_START) and refresh the
 * user-visible page. Always returns 0.
 */
static int cpu_clock_event_add(struct perf_event *event, int flags)
{
        const int start_now = flags & PERF_EF_START;

        if (start_now)
                cpu_clock_event_start(event, flags);

        perf_event_update_userpage(event);
        return 0;
}

/* pmu::del: removing the event is just stopping it, with the caller's flags. */
static void cpu_clock_event_del(struct perf_event *event, int flags)
{
        cpu_clock_event_stop(event, flags);
}

/* pmu::read: bring event->count up to date with the current clock. */
static void cpu_clock_event_read(struct perf_event *event)
{
        cpu_clock_event_update(event);
}

/*
 * pmu::event_init for the cpu-clock software PMU.
 *
 * Returns -ENOENT when the event is not addressed to this PMU (wrong type
 * or config), -EOPNOTSUPP when branch sampling was requested, 0 otherwise.
 */
static int cpu_clock_event_init(struct perf_event *event)
{
        if (event->attr.type != perf_cpu_clock.type ||
            event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
                return -ENOENT;

        /* software events cannot do branch sampling */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        perf_swevent_init_hrtimer(event);

        return 0;
}

/*
 * PMU descriptor for the cpu-clock software event. It lives in the software
 * context, advertises PERF_PMU_CAP_NO_NMI, uses PMU_NULL_DEV as its device
 * and wires up the cpu_clock_event_*() callbacks above.
 */
static struct pmu perf_cpu_clock = {
        .task_ctx_nr        = perf_sw_context,

        .capabilities        = PERF_PMU_CAP_NO_NMI,
        .dev                = PMU_NULL_DEV,

        .event_init        = cpu_clock_event_init,
        .add                = cpu_clock_event_add,
        .del                = cpu_clock_event_del,
        .start                = cpu_clock_event_start,
        .stop                = cpu_clock_event_stop,
        .read                = cpu_clock_event_read,
};

/*
 * Software event: task time clock
 */

/*
 * Account the time between the previous snapshot and @now to event->count,
 * and record @now as the new snapshot.
 */
static void task_clock_event_update(struct perf_event *event, u64 now)
{
        u64 prev = local64_xchg(&event->hw.prev_count, now);
        s64 delta = now - prev;

        local64_add(delta, &event->count);
}

static void task_clock_event_start(struct perf_event *event, int flags)
{
        local64_set(&event->hw.prev_count, event->ctx->time);
        perf_swevent_start_hrtimer(event);
}

/* pmu::stop: cancel the sampling timer, then account up to the context time. */
static void task_clock_event_stop(struct perf_event *event, int flags)
{
        perf_swevent_cancel_hrtimer(event);

        task_clock_event_update(event, event->ctx->time);
}

/*
 * pmu::add: optionally start counting (PERF_EF_START) and refresh the
 * user-visible page. Always returns 0.
 */
static int task_clock_event_add(struct perf_event *event, int flags)
{
        const int start_now = flags & PERF_EF_START;

        if (start_now)
                task_clock_event_start(event, flags);

        perf_event_update_userpage(event);
        return 0;
}

/*
 * pmu::del: stop the event. The caller's @flags are ignored; stop is always
 * invoked with PERF_EF_UPDATE.
 */
static void task_clock_event_del(struct perf_event *event, int flags)
{
        task_clock_event_stop(event, PERF_EF_UPDATE);
}

/*
 * pmu::read: extrapolate the context time to "now" (ctx->time plus the
 * perf_clock() time elapsed since ctx->timestamp) and account up to it.
 */
static void task_clock_event_read(struct perf_event *event)
{
        u64 elapsed = perf_clock();

        elapsed -= event->ctx->timestamp;

        task_clock_event_update(event, event->ctx->time + elapsed);
}

/*
 * pmu::event_init for the task-clock software PMU.
 *
 * Returns -ENOENT when the event is not addressed to this PMU (wrong type
 * or config), -EOPNOTSUPP when branch sampling was requested, 0 otherwise.
 */
static int task_clock_event_init(struct perf_event *event)
{
        if (event->attr.type != perf_task_clock.type ||
            event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
                return -ENOENT;

        /* software events cannot do branch sampling */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        perf_swevent_init_hrtimer(event);

        return 0;
}

/*
 * PMU descriptor for the task-clock software event. It lives in the software
 * context, advertises PERF_PMU_CAP_NO_NMI, uses PMU_NULL_DEV as its device
 * and wires up the task_clock_event_*() callbacks above.
 */
static struct pmu perf_task_clock = {
        .task_ctx_nr        = perf_sw_context,

        .capabilities        = PERF_PMU_CAP_NO_NMI,
        .dev                = PMU_NULL_DEV,

        .event_init        = task_clock_event_init,
        .add                = task_clock_event_add,
        .del                = task_clock_event_del,
        .start                = task_clock_event_start,
        .stop                = task_clock_event_stop,
        .read                = task_clock_event_read,
};

/* Do-nothing stub for optional void pmu callbacks taking only the pmu. */
static void perf_pmu_nop_void(struct pmu *pmu)
{
        /* intentionally empty */
}

/* Do-nothing stub used as start_txn for PMUs without pmu_enable. */
static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
{
        /* intentionally empty */
}

/* Stub for optional int-returning pmu callbacks: always reports success (0). */
static int perf_pmu_nop_int(struct pmu *pmu)
{
        const int ok = 0;

        return ok;
}

/* Default pmu::check_period: accepts any @value by returning 0. */
static int perf_event_nop_int(struct perf_event *event, u64 value)
{
        const int ok = 0;

        return ok;
}

/* Flags of the transaction currently open on this CPU (generic txn stubs). */
static DEFINE_PER_CPU(unsigned int, nop_txn_flags);

/*
 * Generic start_txn: remember @flags for commit/cancel and, for a plain
 * PERF_PMU_TXN_ADD transaction (no other flag bits), disable the PMU.
 */
static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
{
        __this_cpu_write(nop_txn_flags, flags);

        if (!(flags & ~PERF_PMU_TXN_ADD))
                perf_pmu_disable(pmu);
}

static int perf_pmu_commit_txn(struct pmu *pmu)
{
        unsigned int flags = __this_cpu_read(nop_txn_flags);

        __this_cpu_write(nop_txn_flags, 0);

        if (flags & ~PERF_PMU_TXN_ADD)
                return 0;

        perf_pmu_enable(pmu);
        return 0;
}

static void perf_pmu_cancel_txn(struct pmu *pmu)
{
        unsigned int flags =  __this_cpu_read(nop_txn_flags);

        __this_cpu_write(nop_txn_flags, 0);

        if (flags & ~PERF_PMU_TXN_ADD)
                return;

        perf_pmu_enable(pmu);
}

/* Default pmu::event_idx: every event reports index 0. */
static int perf_event_idx_default(struct perf_event *event)
{
        const int idx = 0;

        return idx;
}

static void free_pmu_context(struct pmu *pmu)
{
        free_percpu(pmu->cpu_pmu_context);
}

/*
 * Let userspace know that this PMU supports address range filtering:
 */
/* sysfs read handler: prints pmu->nr_addr_filters followed by a newline. */
static ssize_t nr_addr_filters_show(struct device *dev,
                                    struct device_attribute *attr,
                                    char *page)
{
        struct pmu *this_pmu = dev_get_drvdata(dev);
        int nr = this_pmu->nr_addr_filters;

        return scnprintf(page, PAGE_SIZE - 1, "%d\n", nr);
}
DEVICE_ATTR_RO(nr_addr_filters);

/* Maps PMU type ids to their struct pmu; see perf_pmu_register(). */
static struct idr pmu_idr;

/* sysfs read handler: prints the PMU's type id followed by a newline. */
static ssize_t
type_show(struct device *dev, struct device_attribute *attr, char *page)
{
        struct pmu *this_pmu = dev_get_drvdata(dev);
        int type_id = this_pmu->type;

        return scnprintf(page, PAGE_SIZE - 1, "%d\n", type_id);
}
static DEVICE_ATTR_RO(type);

/* sysfs read handler: prints pmu->hrtimer_interval_ms followed by a newline. */
static ssize_t
perf_event_mux_interval_ms_show(struct device *dev,
                                struct device_attribute *attr,
                                char *page)
{
        struct pmu *this_pmu = dev_get_drvdata(dev);
        int interval_ms = this_pmu->hrtimer_interval_ms;

        return scnprintf(page, PAGE_SIZE - 1, "%d\n", interval_ms);
}

/* Serializes updates of a PMU's multiplexing interval. */
static DEFINE_MUTEX(mux_interval_mutex);

/*
 * sysfs write handler for the multiplexing interval (milliseconds).
 *
 * Parses an integer from @buf, rejects values below 1 with -EINVAL, stores
 * the new interval in the pmu and pushes it to the per-CPU context of every
 * online CPU, restarting the mux hrtimer there via an IPI.
 *
 * Returns @count on success or a negative errno.
 */
static ssize_t
perf_event_mux_interval_ms_store(struct device *dev,
                                 struct device_attribute *attr,
                                 const char *buf, size_t count)
{
        struct pmu *pmu = dev_get_drvdata(dev);
        int timer, cpu, ret;

        ret = kstrtoint(buf, 0, &timer);
        if (ret)
                return ret;

        if (timer < 1)
                return -EINVAL;

        /* same value, nothing to do */
        if (timer == pmu->hrtimer_interval_ms)
                return count;

        mutex_lock(&mux_interval_mutex);
        pmu->hrtimer_interval_ms = timer;

        /* update all cpuctx for this PMU */
        cpus_read_lock();
        for_each_online_cpu(cpu) {
                struct perf_cpu_pmu_context *cpc;
                cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
                cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);

                /* run the restart on the target CPU itself */
                cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
        }
        cpus_read_unlock();
        mutex_unlock(&mux_interval_mutex);

        return count;
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);

/*
 * Default sysfs attributes of every PMU device. The nr_addr_filters entry
 * is conditionally hidden by pmu_dev_is_visible().
 */
static struct attribute *pmu_dev_attrs[] = {
        &dev_attr_type.attr,
        &dev_attr_perf_event_mux_interval_ms.attr,
        &dev_attr_nr_addr_filters.attr,
        NULL,
};

static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n)
{
        struct device *dev = kobj_to_dev(kobj);
        struct pmu *pmu = dev_get_drvdata(dev);

        if (n == 2 && !pmu->nr_addr_filters)
                return 0;

        return a->mode;
}

/* Default attribute group: pmu_dev_attrs filtered through pmu_dev_is_visible(). */
static struct attribute_group pmu_dev_attr_group = {
        .is_visible = pmu_dev_is_visible,
        .attrs = pmu_dev_attrs,
};

/* NULL-terminated group list installed as pmu_bus.dev_groups. */
static const struct attribute_group *pmu_dev_groups[] = {
        &pmu_dev_attr_group,
        NULL,
};

/* Non-zero once PMU devices may be created; checked before touching pmu->dev. */
static int pmu_bus_running;
/* The "event_source" bus that all PMU devices are attached to. */
static struct bus_type pmu_bus = {
        .name                = "event_source",
        .dev_groups        = pmu_dev_groups,
};

/*
 * device::release callback for PMU devices: the struct device was
 * kzalloc()ed in pmu_dev_alloc(), so releasing it is a plain kfree().
 */
static void pmu_dev_release(struct device *dev)
{
        void *mem = dev;

        kfree(mem);
}

static int pmu_dev_alloc(struct pmu *pmu)
{
        int ret = -ENOMEM;

        pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
        if (!pmu->dev)
                goto out;

        pmu->dev->groups = pmu->attr_groups;
        device_initialize(pmu->dev);

        dev_set_drvdata(pmu->dev, pmu);
        pmu->dev->bus = &pmu_bus;
        pmu->dev->parent = pmu->parent;
        pmu->dev->release = pmu_dev_release;

        ret = dev_set_name(pmu->dev, "%s", pmu->name);
        if (ret)
                goto free_dev;

        ret = device_add(pmu->dev);
        if (ret)
                goto free_dev;

        if (pmu->attr_update) {
                ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
                if (ret)
                        goto del_dev;
        }

out:
        return ret;

del_dev:
        device_del(pmu->dev);

free_dev:
        put_device(pmu->dev);
        goto out;
}

static struct lock_class_key cpuctx_mutex;
static struct lock_class_key cpuctx_lock;

int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
        int cpu, ret, max = PERF_TYPE_MAX;

        mutex_lock(&pmus_lock);
        ret = -ENOMEM;
        pmu->pmu_disable_count = alloc_percpu(int);
        if (!pmu->pmu_disable_count)
                goto unlock;

        pmu->type = -1;
        if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) {
                ret = -EINVAL;
                goto free_pdc;
        }

        pmu->name = name;

        if (type >= 0)
                max = type;

        ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
        if (ret < 0)
                goto free_pdc;

        WARN_ON(type >= 0 && ret != type);

        type = ret;
        pmu->type = type;

        if (pmu_bus_running && !pmu->dev) {
                ret = pmu_dev_alloc(pmu);
                if (ret)
                        goto free_idr;
        }

        ret = -ENOMEM;
        pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
        if (!pmu->cpu_pmu_context)
                goto free_dev;

        for_each_possible_cpu(cpu) {
                struct perf_cpu_pmu_context *cpc;

                cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
                __perf_init_event_pmu_context(&cpc->epc, pmu);
                __perf_mux_hrtimer_init(cpc, cpu);
        }

        if (!pmu->start_txn) {
                if (pmu->pmu_enable) {
                        /*
                         * If we have pmu_enable/pmu_disable calls, install
                         * transaction stubs that use that to try and batch
                         * hardware accesses.
                         */
                        pmu->start_txn  = perf_pmu_start_txn;
                        pmu->commit_txn = perf_pmu_commit_txn;
                        pmu->cancel_txn = perf_pmu_cancel_txn;
                } else {
                        pmu->start_txn  = perf_pmu_nop_txn;
                        pmu->commit_txn = perf_pmu_nop_int;
                        pmu->cancel_txn = perf_pmu_nop_void;
                }
        }

        if (!pmu->pmu_enable) {
                pmu->pmu_enable  = perf_pmu_nop_void;
                pmu->pmu_disable = perf_pmu_nop_void;
        }

        if (!pmu->check_period)
                pmu->check_period = perf_event_nop_int;

        if (!pmu->event_idx)
                pmu->event_idx = perf_event_idx_default;

        list_add_rcu(&pmu->entry, &pmus);
        atomic_set(&pmu->exclusive_cnt, 0);
        ret = 0;
unlock:
        mutex_unlock(&pmus_lock);

        return ret;

free_dev:
        if (pmu->dev && pmu->dev != PMU_NULL_DEV) {
                device_del(pmu->dev);
                put_device(pmu->dev);
        }

free_idr:
        idr_remove(&pmu_idr, pmu->type);

free_pdc:
        free_percpu(pmu->pmu_disable_count);
        goto unlock;
}
EXPORT_SYMBOL_GPL(perf_pmu_register);

/*
 * perf_pmu_unregister - withdraw a PMU registered with perf_pmu_register()
 * @pmu: the PMU to remove
 *
 * Unlinks the PMU from the pmus list, waits for SRCU and RCU readers, and
 * then releases the per-PMU allocations, the type id and (when the PMU bus
 * is running and the PMU has a real device) the sysfs device.
 */
void perf_pmu_unregister(struct pmu *pmu)
{
        mutex_lock(&pmus_lock);
        list_del_rcu(&pmu->entry);

        /*
         * We dereference the pmu list under both SRCU and regular RCU, so
         * synchronize against both of those.
         */
        synchronize_srcu(&pmus_srcu);
        synchronize_rcu();

        free_percpu(pmu->pmu_disable_count);
        idr_remove(&pmu_idr, pmu->type);
        /* PMU_NULL_DEV marks PMUs that never get a device of their own. */
        if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
                if (pmu->nr_addr_filters)
                        device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
                device_del(pmu->dev);
                put_device(pmu->dev);
        }
        free_pmu_context(pmu);
        mutex_unlock(&pmus_lock);
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);

static inline bool has_extended_regs(struct perf_event *event)
{
        return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
               (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
}

/*
 * Try to bind @event to @pmu by running the PMU's event_init() method.
 *
 * A module reference on the PMU is taken for the duration of a successful
 * binding and dropped again on failure. After a successful event_init()
 * the event is additionally checked against the PMU's capabilities
 * (extended registers, exclude flags); if that check fails the event's
 * destroy hook is run to undo event_init().
 *
 * Returns 0 on success, -ENODEV if the PMU's module is going away, or the
 * error from event_init() / the capability checks.
 */
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
        struct perf_event_context *ctx = NULL;
        int ret;

        if (!try_module_get(pmu->module))
                return -ENODEV;

        /*
         * A number of pmu->event_init() methods iterate the sibling_list to,
         * for example, validate if the group fits on the PMU. Therefore,
         * if this is a sibling event, acquire the ctx->mutex to protect
         * the sibling_list.
         */
        if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
                /*
                 * This ctx->mutex can nest when we're called through
                 * inheritance. See the perf_event_ctx_lock_nested() comment.
                 */
                ctx = perf_event_ctx_lock_nested(event->group_leader,
                                                 SINGLE_DEPTH_NESTING);
                BUG_ON(!ctx);
        }

        event->pmu = pmu;
        ret = pmu->event_init(event);

        if (ctx)
                perf_event_ctx_unlock(event->group_leader, ctx);

        if (!ret) {
                if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
                    has_extended_regs(event))
                        ret = -EOPNOTSUPP;

                if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
                    event_has_any_exclude_flag(event))
                        ret = -EINVAL;

                if (ret && event->destroy) {
                        event->destroy(event);
                        /*
                         * The hook has done its job. Clear it so that a
                         * later attempt with another PMU, or the final
                         * free of the event, cannot invoke it a second
                         * time.
                         */
                        event->destroy = NULL;
                }
        }

        if (ret)
                module_put(pmu->module);

        return ret;
}

/*
 * Find the PMU that accepts @event and initialise the event on it.
 *
 * Lookup order: the parent's PMU (for inherited events), then the PMU
 * registered in pmu_idr under the event's (possibly remapped) type, and
 * finally every PMU on the pmus list until one does not answer -ENOENT.
 *
 * Returns the PMU on success or an ERR_PTR(); ERR_PTR(-ENOENT) means no
 * PMU claimed the event. Runs under the pmus_srcu read lock.
 */
static struct pmu *perf_init_event(struct perf_event *event)
{
        bool extended_type = false;
        int idx, type, ret;
        struct pmu *pmu;

        idx = srcu_read_lock(&pmus_srcu);

        /*
         * Save original type before calling pmu->event_init() since certain
         * pmus overwrites event->attr.type to forward event to another pmu.
         */
        event->orig_type = event->attr.type;

        /* Try parent's PMU first: */
        if (event->parent && event->parent->pmu) {
                pmu = event->parent->pmu;
                ret = perf_try_init_event(pmu, event);
                if (!ret)
                        goto unlock;
        }

        /*
         * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
         * are often aliases for PERF_TYPE_RAW.
         */
        type = event->attr.type;
        if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
                /* the upper config bits may name a specific PMU type */
                type = event->attr.config >> PERF_PMU_TYPE_SHIFT;
                if (!type) {
                        type = PERF_TYPE_RAW;
                } else {
                        extended_type = true;
                        event->attr.config &= PERF_HW_EVENT_MASK;
                }
        }

again:
        rcu_read_lock();
        pmu = idr_find(&pmu_idr, type);
        rcu_read_unlock();
        if (pmu) {
                /*
                 * A remapped (non-RAW) type is only honoured by PMUs that
                 * advertise PERF_PMU_CAP_EXTENDED_HW_TYPE.
                 */
                if (event->attr.type != type && type != PERF_TYPE_RAW &&
                    !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
                        goto fail;

                ret = perf_try_init_event(pmu, event);
                /*
                 * The PMU declined and attr.type now differs from the type
                 * we looked up: retry the lookup with attr.type.
                 */
                if (ret == -ENOENT && event->attr.type != type && !extended_type) {
                        type = event->attr.type;
                        goto again;
                }

                if (ret)
                        pmu = ERR_PTR(ret);

                goto unlock;
        }

        /* No PMU under that id: offer the event to every registered PMU. */
        list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
                ret = perf_try_init_event(pmu, event);
                if (!ret)
                        goto unlock;

                /* -ENOENT means "not mine"; anything else is a hard error */
                if (ret != -ENOENT) {
                        pmu = ERR_PTR(ret);
                        goto unlock;
                }
        }
fail:
        pmu = ERR_PTR(-ENOENT);
unlock:
        srcu_read_unlock(&pmus_srcu, idx);

        return pmu;
}

/*
 * Link @event into the side-band event list of the CPU it is bound to,
 * under that list's lock.
 */
static void attach_sb_event(struct perf_event *event)
{
        struct pmu_event_list *sb = per_cpu_ptr(&pmu_sb_events, event->cpu);

        raw_spin_lock(&sb->lock);
        list_add_rcu(&event->sb_list, &sb->list);
        raw_spin_unlock(&sb->lock);
}

/*
 * We keep a list of all !task (and therefore per-cpu) events
 * that need to receive side-band records.
 *
 * This avoids having to scan all the various PMU per-cpu contexts
 * looking for them.
 */
/* Register @event on the per-CPU side-band list if it is a side-band event. */
static void account_pmu_sb_event(struct perf_event *event)
{
        if (!is_sb_event(event))
                return;

        attach_sb_event(event);
}

/* Freq events need the tick to stay alive (see perf_event_task_tick). */
/*
 * Count one more freq event on a NO_HZ_FULL kernel; the first such event
 * sets the perf tick dependency. Compiles to nothing without
 * CONFIG_NO_HZ_FULL.
 */
static void account_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
        /* Lock so we don't race with concurrent unaccount */
        spin_lock(&nr_freq_lock);
        /* 0 -> 1 transition: the tick must be kept running from now on */
        if (atomic_inc_return(&nr_freq_events) == 1)
                tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
        spin_unlock(&nr_freq_lock);
#endif
}

/*
 * Count one more freq event. With nohz-full enabled the accounting also
 * has to manage the tick dependency; otherwise a plain increment suffices.
 */
static void account_freq_event(void)
{
        if (!tick_nohz_full_enabled()) {
                atomic_inc(&nr_freq_events);
                return;
        }

        account_freq_event_nohz();
}


/*
 * Bump the global counters that track which kinds of events exist, based
 * on the attributes of @event, and enable the perf scheduling hooks when
 * the first event that needs them appears. Inherited events (those with a
 * parent) are not accounted.
 */
static void account_event(struct perf_event *event)
{
        /* set when this event needs the perf_sched_events static branch */
        bool inc = false;

        if (event->parent)
                return;

        if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
                inc = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_inc(&nr_mmap_events);
        if (event->attr.build_id)
                atomic_inc(&nr_build_id_events);
        if (event->attr.comm)
                atomic_inc(&nr_comm_events);
        if (event->attr.namespaces)
                atomic_inc(&nr_namespaces_events);
        if (event->attr.cgroup)
                atomic_inc(&nr_cgroup_events);
        if (event->attr.task)
                atomic_inc(&nr_task_events);
        if (event->attr.freq)
                account_freq_event();
        if (event->attr.context_switch) {
                atomic_inc(&nr_switch_events);
                inc = true;
        }
        if (has_branch_stack(event))
                inc = true;
        if (is_cgroup_event(event))
                inc = true;
        if (event->attr.ksymbol)
                atomic_inc(&nr_ksymbol_events);
        if (event->attr.bpf_event)
                atomic_inc(&nr_bpf_events);
        if (event->attr.text_poke)
                atomic_inc(&nr_text_poke_events);

        if (inc) {
                /*
                 * We need the mutex here because static_branch_enable()
                 * must complete *before* the perf_sched_count increment
                 * becomes visible.
                 */
                /* fast path: the hooks are already enabled, just take a count */
                if (atomic_inc_not_zero(&perf_sched_count))
                        goto enabled;

                mutex_lock(&perf_sched_mutex);
                if (!atomic_read(&perf_sched_count)) {
                        static_branch_enable(&perf_sched_events);
                        /*
                         * Guarantee that all CPUs observe the key change and
                         * call the perf scheduling hooks before proceeding to
                         * install events that need them.
                         */
                        synchronize_rcu();
                }
                /*
                 * Now that we have waited for the sync_sched(), allow further
                 * increments to by-pass the mutex.
                 */
                atomic_inc(&perf_sched_count);
                mutex_unlock(&perf_sched_mutex);
        }
enabled:

        account_pmu_sb_event(event);
}

/*
 * Allocate and initialize an event structure
 *
 * @attr:             attributes to copy into the new event
 * @cpu:              CPU to bind to; -1 is only accepted together with @task
 * @task:             target task, or NULL; required when attr->sigtrap is set
 * @group_leader:     leader of the group to join, or NULL for a new group
 * @parent_event:     parent when creating an inherited event, else NULL
 * @overflow_handler: overflow callback; when NULL the parent's handler or a
 *                    default ring-buffer output handler is used
 * @context:          opaque cookie stored alongside @overflow_handler
 * @cgroup_fd:        cgroup file descriptor, or -1 for none
 *
 * Returns the new event or an ERR_PTR(). The error labels at the bottom
 * undo the setup steps in reverse order; freeing the event itself is
 * deferred through call_rcu().
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr, int cpu,
                 struct task_struct *task,
                 struct perf_event *group_leader,
                 struct perf_event *parent_event,
                 perf_overflow_handler_t overflow_handler,
                 void *context, int cgroup_fd)
{
        struct pmu *pmu;
        struct perf_event *event;
        struct hw_perf_event *hwc;
        long err = -EINVAL;
        int node;

        /* An out-of-range cpu is only tolerated as "-1 with a task". */
        if ((unsigned)cpu >= nr_cpu_ids) {
                if (!task || cpu != -1)
                        return ERR_PTR(-EINVAL);
        }
        if (attr->sigtrap && !task) {
                /* Requires a task: avoid signalling random tasks. */
                return ERR_PTR(-EINVAL);
        }

        node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
        event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
                                      node);
        if (!event)
                return ERR_PTR(-ENOMEM);

        /*
         * Single events are their own group leaders, with an
         * empty sibling list:
         */
        if (!group_leader)
                group_leader = event;

        mutex_init(&event->child_mutex);
        INIT_LIST_HEAD(&event->child_list);

        INIT_LIST_HEAD(&event->event_entry);
        INIT_LIST_HEAD(&event->sibling_list);
        INIT_LIST_HEAD(&event->active_list);
        init_event_group(event);
        INIT_LIST_HEAD(&event->rb_entry);
        INIT_LIST_HEAD(&event->active_entry);
        INIT_LIST_HEAD(&event->addr_filters.list);
        INIT_HLIST_NODE(&event->hlist_entry);


        init_waitqueue_head(&event->waitq);
        init_irq_work(&event->pending_irq, perf_pending_irq);
        init_task_work(&event->pending_task, perf_pending_task);

        mutex_init(&event->mmap_mutex);
        raw_spin_lock_init(&event->addr_filters.lock);

        atomic_long_set(&event->refcount, 1);
        event->cpu                = cpu;
        event->attr                = *attr;
        event->group_leader        = group_leader;
        event->pmu                = NULL;
        event->oncpu                = -1;

        event->parent                = parent_event;

        event->ns                = get_pid_ns(task_active_pid_ns(current));
        event->id                = atomic64_inc_return(&perf_event_id);

        event->state                = PERF_EVENT_STATE_INACTIVE;

        if (parent_event)
                event->event_caps = parent_event->event_caps;

        if (task) {
                event->attach_state = PERF_ATTACH_TASK;
                /*
                 * XXX pmu::event_init needs to know what task to account to
                 * and we cannot use the ctx information because we need the
                 * pmu before we get a ctx.
                 */
                event->hw.target = get_task_struct(task);
        }

        /* Inherited events keep using their parent's clock. */
        event->clock = &local_clock;
        if (parent_event)
                event->clock = parent_event->clock;

        /* Without an explicit handler, an inherited event reuses its parent's. */
        if (!overflow_handler && parent_event) {
                overflow_handler = parent_event->overflow_handler;
                context = parent_event->overflow_handler_context;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
                if (parent_event->prog) {
                        struct bpf_prog *prog = parent_event->prog;

                        bpf_prog_inc(prog);
                        event->prog = prog;
                }
#endif
        }

        if (overflow_handler) {
                event->overflow_handler        = overflow_handler;
                event->overflow_handler_context = context;
        } else if (is_write_backward(event)){
                event->overflow_handler = perf_event_output_backward;
                event->overflow_handler_context = NULL;
        } else {
                event->overflow_handler = perf_event_output_forward;
                event->overflow_handler_context = NULL;
        }

        perf_event__state_init(event);

        pmu = NULL;

        hwc = &event->hw;
        hwc->sample_period = attr->sample_period;
        /* freq mode starts from a period of 1 */
        if (attr->freq && attr->sample_freq)
                hwc->sample_period = 1;
        hwc->last_period = hwc->sample_period;

        local64_set(&hwc->period_left, hwc->sample_period);

        /*
         * We currently do not support PERF_SAMPLE_READ on inherited events.
         * See perf_output_read().
         */
        if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
                goto err_ns;

        if (!has_branch_stack(event))
                event->attr.branch_sample_type = 0;

        pmu = perf_init_event(event);
        if (IS_ERR(pmu)) {
                err = PTR_ERR(pmu);
                goto err_ns;
        }

        /*
         * Disallow uncore-task events. Similarly, disallow uncore-cgroup
         * events (they don't make sense as the cgroup will be different
         * on other CPUs in the uncore mask).
         */
        if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
                err = -EINVAL;
                goto err_pmu;
        }

        if (event->attr.aux_output &&
            !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
                err = -EOPNOTSUPP;
                goto err_pmu;
        }

        if (cgroup_fd != -1) {
                err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
                if (err)
                        goto err_pmu;
        }

        err = exclusive_event_init(event);
        if (err)
                goto err_pmu;

        if (has_addr_filter(event)) {
                /* one range slot per address filter the PMU supports */
                event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
                                                    sizeof(struct perf_addr_filter_range),
                                                    GFP_KERNEL);
                if (!event->addr_filter_ranges) {
                        err = -ENOMEM;
                        goto err_per_task;
                }

                /*
                 * Clone the parent's vma offsets: they are valid until exec()
                 * even if the mm is not shared with the parent.
                 */
                if (event->parent) {
                        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);

                        raw_spin_lock_irq(&ifh->lock);
                        memcpy(event->addr_filter_ranges,
                               event->parent->addr_filter_ranges,
                               pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
                        raw_spin_unlock_irq(&ifh->lock);
                }

                /* force hw sync on the address filters */
                event->addr_filters_gen = 1;
        }

        /* Callchain buffers are only acquired for non-inherited events. */
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
                        err = get_callchain_buffers(attr->sample_max_stack);
                        if (err)
                                goto err_addr_filters;
                }
        }

        err = security_perf_event_alloc(event);
        if (err)
                goto err_callchain_buffer;

        /* symmetric to unaccount_event() in _free_event() */
        account_event(event);

        return event;

err_callchain_buffer:
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
                        put_callchain_buffers();
        }
err_addr_filters:
        kfree(event->addr_filter_ranges);

err_per_task:
        exclusive_event_destroy(event);

err_pmu:
        if (is_cgroup_event(event))
                perf_detach_cgroup(event);
        if (event->destroy)
                event->destroy(event);
        /* drop the module reference held since the PMU accepted the event */
        module_put(pmu->module);
err_ns:
        if (event->hw.target)
                put_task_struct(event->hw.target);
        call_rcu(&event->rcu_head, free_event_rcu);

        return ERR_PTR(err);
}

/*
 * Copy a perf_event_attr from user space into @attr and sanity-check it.
 *
 * @uattr: user-space pointer to the attribute structure
 * @attr:  kernel buffer receiving the copy; it is zeroed first
 *
 * Returns 0 on success or a negative errno.  When the user-supplied size is
 * rejected (below PERF_ATTR_SIZE_VER0, above PAGE_SIZE, or
 * copy_struct_from_user() reports -E2BIG), sizeof(*attr) is written back to
 * uattr->size and -E2BIG is returned.
 */
static int perf_copy_attr(struct perf_event_attr __user *uattr,
                          struct perf_event_attr *attr)
{
        u32 usize;
        int ret;

        /* Start from all-zeroes so that a short user structure is harmless. */
        memset(attr, 0, sizeof(*attr));

        ret = get_user(usize, &uattr->size);
        if (ret)
                return ret;

        /* ABI compatibility quirk: a zero size is treated as the VER0 size. */
        if (usize == 0)
                usize = PERF_ATTR_SIZE_VER0;
        if (usize < PERF_ATTR_SIZE_VER0 || usize > PAGE_SIZE)
                goto bad_size;

        ret = copy_struct_from_user(attr, sizeof(*attr), uattr, usize);
        if (ret == -E2BIG)
                goto bad_size;
        if (ret)
                return ret;

        attr->size = usize;

        /* Reserved fields must be clear and only known flag bits may be set. */
        if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3 ||
            (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) ||
            (attr->read_format & ~(PERF_FORMAT_MAX-1)))
                return -EINVAL;

        if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
                u64 br_mask = attr->branch_sample_type;

                /*
                 * Only defined bits may be used, and at least one bit
                 * outside the privilege-level set must be present.
                 */
                if ((br_mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) ||
                    !(br_mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
                        return -EINVAL;

                /*
                 * No privilege level given for branches: propagate it from
                 * the event's exclude_* bits and store the adjusted value
                 * back (for HW filter setup).
                 */
                if (!(br_mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
                        /* exclude_kernel checked on syscall entry */
                        if (!attr->exclude_kernel)
                                br_mask |= PERF_SAMPLE_BRANCH_KERNEL;
                        if (!attr->exclude_user)
                                br_mask |= PERF_SAMPLE_BRANCH_USER;
                        if (!attr->exclude_hv)
                                br_mask |= PERF_SAMPLE_BRANCH_HV;

                        attr->branch_sample_type = br_mask;
                }

                /* privileged levels capture (kernel, hv): check permissions */
                if (br_mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
                        ret = perf_allow_kernel(attr);
                        if (ret)
                                return ret;
                }
        }

        if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
                ret = perf_reg_validate(attr->sample_regs_user);
                if (ret)
                        return ret;
        }

        if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
                if (!arch_perf_have_user_stack_dump())
                        return -ENOSYS;

                /*
                 * The field is a __u32, but so far only a __u16 worth can
                 * be used because of the __u16 sample size limit.  The size
                 * must also be a multiple of sizeof(u64).
                 */
                if (attr->sample_stack_user >= USHRT_MAX ||
                    !IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
                        return -EINVAL;
        }

        /* An unset stack depth falls back to the sysctl value. */
        if (attr->sample_max_stack == 0)
                attr->sample_max_stack = sysctl_perf_event_max_stack;

        /*
         * The result is not returned right away; it is carried to the final
         * return, so any of the -EINVAL checks below takes precedence.
         */
        if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
                ret = perf_reg_validate(attr->sample_regs_intr);

#ifndef CONFIG_CGROUP_PERF
        if (attr->sample_type & PERF_SAMPLE_CGROUP)
                return -EINVAL;
#endif
        /* Reject mutually inconsistent flag combinations. */
        if (((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
             (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) ||
            (!attr->inherit && attr->inherit_thread) ||
            (attr->remove_on_exec && attr->enable_on_exec) ||
            (attr->sigtrap && !attr->remove_on_exec))
                return -EINVAL;

        return ret;

bad_size:
        /* Report the size this kernel understands; the result is ignored. */
        put_user(sizeof(*attr), &uattr->size);
        return -E2BIG;
}

/*
 * Acquire two mutexes, always taking the one at the lower address first.
 * The second one is taken with the SINGLE_DEPTH_NESTING subclass.
 */
static void mutex_lock_double(struct mutex *a, struct mutex *b)
{
        struct mutex *first = a;
        struct mutex *second = b;

        if (b < a) {
                first = b;
                second = a;
        }

        mutex_lock(first);
        mutex_lock_nested(second, SINGLE_DEPTH_NESTING);
}

/*
 * Point @event's output at the ring buffer of @output_event.  With a NULL
 * @output_event, ring_buffer_attach() is called with a NULL buffer.
 *
 * Returns 0 on success and -EINVAL when the two events are incompatible,
 * when @event has an active mmap(), or when no usable buffer was found.
 */
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
        struct perf_buffer *target_rb = NULL;
        int ret = -EINVAL;

        if (!output_event) {
                /* Nothing to redirect to: only our own mmap_mutex is needed. */
                mutex_lock(&event->mmap_mutex);
        } else {
                /* don't allow circular references */
                if (output_event == event)
                        return ret;

                /* Don't allow cross-cpu buffers */
                if (event->cpu != output_event->cpu)
                        return ret;

                /* If it's not a per-cpu rb, it must be the same task. */
                if (output_event->cpu == -1 &&
                    event->hw.target != output_event->hw.target)
                        return ret;

                /* Mixing clocks in the same buffer is trouble you don't need. */
                if (event->clock != output_event->clock)
                        return ret;

                /*
                 * A buffer is written either from the beginning or from the
                 * end; mixing both directions is not allowed.
                 */
                if (is_write_backward(output_event) != is_write_backward(event))
                        return ret;

                /* If both events generate aux data, they must share a PMU. */
                if (has_aux(event) && has_aux(output_event) &&
                    output_event->pmu != event->pmu)
                        return ret;

                /*
                 * Hold both mmap_mutex to serialize against
                 * perf_mmap_close().  Since output_event is already on
                 * rb->event_list, and the list iteration restarts after
                 * every removal, it is guaranteed this new event is observed
                 * *OR* if output_event is already removed, it's guaranteed
                 * we observe !rb->mmap_count.
                 */
                mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
        }

        /* Can't redirect output if we've got an active mmap() */
        if (atomic_read(&event->mmap_count))
                goto unlock;

        if (output_event) {
                /* get the rb we want to redirect to */
                target_rb = ring_buffer_get(output_event);
                if (!target_rb)
                        goto unlock;

                /* did we race against perf_mmap_close() */
                if (atomic_read(&target_rb->mmap_count) == 0) {
                        ring_buffer_put(target_rb);
                        goto unlock;
                }
        }

        ring_buffer_attach(event, target_rb);
        ret = 0;

unlock:
        mutex_unlock(&event->mmap_mutex);
        if (output_event)
                mutex_unlock(&output_event->mmap_mutex);

        return ret;
}

/*
 * Select the time source of @event from a clockid.
 *
 * CLOCK_MONOTONIC and CLOCK_MONOTONIC_RAW are accepted unconditionally.
 * The remaining supported clocks are accepted only when the event's PMU
 * advertises PERF_PMU_CAP_NO_NMI.  event->clock is assigned before that
 * capability is checked.
 *
 * Returns 0 on success, -EINVAL for an unsupported clockid or capability
 * mismatch.
 */
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
        switch (clk_id) {
        case CLOCK_MONOTONIC:
                event->clock = &ktime_get_mono_fast_ns;
                return 0;

        case CLOCK_MONOTONIC_RAW:
                event->clock = &ktime_get_raw_fast_ns;
                return 0;

        case CLOCK_REALTIME:
                event->clock = &ktime_get_real_ns;
                break;

        case CLOCK_BOOTTIME:
                event->clock = &ktime_get_boottime_ns;
                break;

        case CLOCK_TAI:
                event->clock = &ktime_get_clocktai_ns;
                break;

        default:
                return -EINVAL;
        }

        /* Only the clocks that fell out of the switch need the capability. */
        if (event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)
                return 0;

        return -EINVAL;
}

static bool
perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
{
        unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
        bool is_capable = perfmon_capable();

        if (attr->sigtrap) {
                /*
                 * perf_event_attr::sigtrap sends signals to the other task.
                 * Require the current task to also have CAP_KILL.
                 */
                rcu_read_lock();
                is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
                rcu_read_unlock();

                /*
                 * If the required capabilities aren't available, checks for
                 * ptrace permissions: upgrade to ATTACH, since sending signals
                 * can effectively change the target task.
                 */
                ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
        }

        /*
         * Preserve ptrace permission check for backwards compatibility. The
         * ptrace check also includes checks that the current task and other
         * task have matching uids, and is therefore not done here explicitly.
         */
        return is_capable || ptrace_may_access(task, ptrace_mode);
}

/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:  event_id type attributes for monitoring/sampling
 * @pid:        target pid; with PERF_FLAG_PID_CGROUP it is used as a cgroup fd
 * @cpu:        target cpu
 * @group_fd:   group leader event fd, or -1 for none
 * @flags:      perf event open flags (bits outside PERF_FLAG_ALL are rejected)
 *
 * Return: the new event file descriptor on success, a negative errno on
 * failure.
 */
SYSCALL_DEFINE5(perf_event_open,
                struct perf_event_attr __user *, attr_uptr,
                pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
        struct perf_event *group_leader = NULL, *output_event = NULL;
        struct perf_event_pmu_context *pmu_ctx;
        struct perf_event *event, *sibling;
        struct perf_event_attr attr;
        struct perf_event_context *ctx;
        struct file *event_file = NULL;
        struct fd group = {NULL, 0};
        struct task_struct *task = NULL;
        struct pmu *pmu;
        int event_fd;
        int move_group = 0;
        int err;
        int f_flags = O_RDWR;
        int cgroup_fd = -1;

        /* for future expandability... */
        if (flags & ~PERF_FLAG_ALL)
                return -EINVAL;

        err = perf_copy_attr(attr_uptr, &attr);
        if (err)
                return err;

        /* Do we allow access to perf_event_open(2) ? */
        err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
        if (err)
                return err;

        if (!attr.exclude_kernel) {
                err = perf_allow_kernel(&attr);
                if (err)
                        return err;
        }

        if (attr.namespaces) {
                if (!perfmon_capable())
                        return -EACCES;
        }

        /*
         * In frequency mode the requested rate is capped by the sysctl; in
         * period mode the top bit of the period must be clear.
         */
        if (attr.freq) {
                if (attr.sample_freq > sysctl_perf_event_sample_rate)
                        return -EINVAL;
        } else {
                if (attr.sample_period & (1ULL << 63))
                        return -EINVAL;
        }

        /* Only privileged users can get physical addresses */
        if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
                err = perf_allow_kernel(&attr);
                if (err)
                        return err;
        }

        /* REGS_INTR can leak data, lockdown must prevent this */
        if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
                err = security_locked_down(LOCKDOWN_PERF);
                if (err)
                        return err;
        }

        /*
         * In cgroup mode, the pid argument is used to pass the fd
         * opened to the cgroup directory in cgroupfs. The cpu argument
         * designates the cpu on which to monitor threads from that
         * cgroup.
         */
        if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
                return -EINVAL;

        if (flags & PERF_FLAG_FD_CLOEXEC)
                f_flags |= O_CLOEXEC;

        /*
         * Reserve the fd number up front.  It is only bound to the event
         * file by fd_install() at the very end; every error path below
         * releases it again through put_unused_fd().
         */
        event_fd = get_unused_fd_flags(f_flags);
        if (event_fd < 0)
                return event_fd;

        /*
         * The event behind group_fd can serve as group leader, as output
         * target (PERF_FLAG_FD_OUTPUT), or both; PERF_FLAG_FD_NO_GROUP
         * drops the group leader role.
         */
        if (group_fd != -1) {
                err = perf_fget_light(group_fd, &group);
                if (err)
                        goto err_fd;
                group_leader = group.file->private_data;
                if (flags & PERF_FLAG_FD_OUTPUT)
                        output_event = group_leader;
                if (flags & PERF_FLAG_FD_NO_GROUP)
                        group_leader = NULL;
        }

        if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
                task = find_lively_task_by_vpid(pid);
                if (IS_ERR(task)) {
                        err = PTR_ERR(task);
                        goto err_group_fd;
                }
        }

        /* A per-task event must agree with its group leader on 'inherit'. */
        if (task && group_leader &&
            group_leader->attr.inherit != attr.inherit) {
                err = -EINVAL;
                goto err_task;
        }

        if (flags & PERF_FLAG_PID_CGROUP)
                cgroup_fd = pid;

        event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
                                 NULL, NULL, cgroup_fd);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err_task;
        }

        /* Sampling is refused on PMUs flagged PERF_PMU_CAP_NO_INTERRUPT. */
        if (is_sampling_event(event)) {
                if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
                        err = -EOPNOTSUPP;
                        goto err_alloc;
                }
        }

        /*
         * Special case software events and allow them to be part of
         * any hardware group.
         */
        pmu = event->pmu;

        if (attr.use_clockid) {
                err = perf_event_set_clock(event, attr.clockid);
                if (err)
                        goto err_alloc;
        }

        if (pmu->task_ctx_nr == perf_sw_context)
                event->event_caps |= PERF_EV_CAP_SOFTWARE;

        if (task) {
                err = down_read_interruptible(&task->signal->exec_update_lock);
                if (err)
                        goto err_alloc;

                /*
                 * We must hold exec_update_lock across this and any potential
                 * perf_install_in_context() call for this new event to
                 * serialize against exec() altering our credentials (and the
                 * perf_event_exit_task() that could imply).
                 */
                err = -EACCES;
                if (!perf_check_permission(&attr, task))
                        goto err_cred;
        }

        /*
         * Get the target context (task or percpu):
         */
        ctx = find_get_context(task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto err_cred;
        }

        mutex_lock(&ctx->mutex);

        if (ctx->task == TASK_TOMBSTONE) {
                err = -ESRCH;
                goto err_locked;
        }

        if (!task) {
                /*
                 * Check if the @cpu we're creating an event for is online.
                 *
                 * We use the perf_cpu_context::ctx::mutex to serialize against
                 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
                 */
                struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);

                if (!cpuctx->online) {
                        err = -ENODEV;
                        goto err_locked;
                }
        }

        if (group_leader) {
                /* Every check in this block fails with -EINVAL. */
                err = -EINVAL;

                /*
                 * Do not allow a recursive hierarchy (this new sibling
                 * becoming part of another group-sibling):
                 */
                if (group_leader->group_leader != group_leader)
                        goto err_locked;

                /* All events in a group should have the same clock */
                if (group_leader->clock != event->clock)
                        goto err_locked;

                /*
                 * Make sure we're both events for the same CPU;
                 * grouping events for different CPUs is broken; since
                 * you can never concurrently schedule them anyhow.
                 */
                if (group_leader->cpu != event->cpu)
                        goto err_locked;

                /*
                 * Make sure we're both on the same context; either task or cpu.
                 */
                if (group_leader->ctx != ctx)
                        goto err_locked;

                /*
                 * Only a group leader can be exclusive or pinned
                 */
                if (attr.exclusive || attr.pinned)
                        goto err_locked;

                if (is_software_event(event) &&
                    !in_software_context(group_leader)) {
                        /*
                         * If the event is a sw event, but the group_leader
                         * is on hw context.
                         *
                         * Allow the addition of software events to hw
                         * groups, this is safe because software events
                         * never fail to schedule.
                         *
                         * Note the comment that goes with struct
                         * perf_event_pmu_context.
                         */
                        pmu = group_leader->pmu_ctx->pmu;
                } else if (!is_software_event(event)) {
                        if (is_software_event(group_leader) &&
                            (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
                                /*
                                 * In case the group is a pure software group, and we
                                 * try to add a hardware event, move the whole group to
                                 * the hardware context.
                                 */
                                move_group = 1;
                        }

                        /* Don't allow group of multiple hw events from different pmus */
                        if (!in_software_context(group_leader) &&
                            group_leader->pmu_ctx->pmu != pmu)
                                goto err_locked;
                }
        }

        /*
         * Now that we're certain of the pmu; find the pmu_ctx.
         */
        pmu_ctx = find_get_pmu_context(pmu, ctx, event);
        if (IS_ERR(pmu_ctx)) {
                err = PTR_ERR(pmu_ctx);
                goto err_locked;
        }
        event->pmu_ctx = pmu_ctx;

        if (output_event) {
                err = perf_event_set_output(event, output_event);
                if (err)
                        goto err_context;
        }

        if (!perf_event_validate_size(event)) {
                err = -E2BIG;
                goto err_context;
        }

        if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
                err = -EINVAL;
                goto err_context;
        }

        /*
         * Must be under the same ctx::mutex as perf_install_in_context(),
         * because we need to serialize with concurrent event creation.
         */
        if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
                goto err_context;
        }

        WARN_ON_ONCE(ctx->parent_ctx);

        event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
        if (IS_ERR(event_file)) {
                err = PTR_ERR(event_file);
                event_file = NULL;
                goto err_context;
        }

        /*
         * This is the point of no return; we cannot fail hereafter. This is
         * where we start modifying current state.
         */

        if (move_group) {
                /*
                 * Take the leader and all its siblings out of their current
                 * context, dropping the pmu_ctx reference each one held.
                 */
                perf_remove_from_context(group_leader, 0);
                put_pmu_ctx(group_leader->pmu_ctx);

                for_each_sibling_event(sibling, group_leader) {
                        perf_remove_from_context(sibling, 0);
                        put_pmu_ctx(sibling->pmu_ctx);
                }

                /*
                 * Install the group siblings before the group leader.
                 *
                 * Because a group leader will try and install the entire group
                 * (through the sibling list, which is still intact), we can
                 * end up with siblings installed in the wrong context.
                 *
                 * By installing siblings first we NO-OP because they're not
                 * reachable through the group lists.
                 */
                for_each_sibling_event(sibling, group_leader) {
                        sibling->pmu_ctx = pmu_ctx;
                        get_pmu_ctx(pmu_ctx);
                        perf_event__state_init(sibling);
                        perf_install_in_context(ctx, sibling, sibling->cpu);
                }

                /*
                 * Removing from the context ends up with disabled
                 * event. What we want here is event in the initial
                 * startup state, ready to be added into new context.
                 */
                group_leader->pmu_ctx = pmu_ctx;
                get_pmu_ctx(pmu_ctx);
                perf_event__state_init(group_leader);
                perf_install_in_context(ctx, group_leader, group_leader->cpu);
        }

        /*
         * Precalculate sample_data sizes; do while holding ctx::mutex such
         * that we're serialized against further additions and before
         * perf_install_in_context() which is the point the event is active and
         * can use these values.
         */
        perf_event__header_size(event);
        perf_event__id_header_size(event);

        event->owner = current;

        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);

        mutex_unlock(&ctx->mutex);

        if (task) {
                up_read(&task->signal->exec_update_lock);
                put_task_struct(task);
        }

        /* Track the event on the opening task's list of owned events. */
        mutex_lock(&current->perf_event_mutex);
        list_add_tail(&event->owner_entry, &current->perf_event_list);
        mutex_unlock(&current->perf_event_mutex);

        /*
         * Drop the reference on the group_event after placing the
         * new event on the sibling_list. This ensures destruction
         * of the group leader will find the pointer to itself in
         * perf_group_detach().
         */
        fdput(group);
        fd_install(event_fd, event_file);
        return event_fd;

        /*
         * Error unwinding: the labels release resources in the reverse
         * order of their acquisition above, so a failure jumps to the label
         * matching the last resource it successfully obtained.
         */
err_context:
        put_pmu_ctx(event->pmu_ctx);
        event->pmu_ctx = NULL; /* _free_event() */
err_locked:
        mutex_unlock(&ctx->mutex);
        perf_unpin_context(ctx);
        put_ctx(ctx);
err_cred:
        if (task)
                up_read(&task->signal->exec_update_lock);
err_alloc:
        free_event(event);
err_task:
        if (task)
                put_task_struct(task);
err_group_fd:
        fdput(group);
err_fd:
        put_unused_fd(event_fd);
        return err;
}

/**
 * perf_event_create_kernel_counter - create and install an in-kernel event
 *
 * @attr: attributes of the counter to create
 * @cpu: cpu in which the counter is bound
 * @task: task to profile (NULL for percpu)
 * @overflow_handler: callback to trigger when we hit the event
 * @context: context data could be used in overflow_handler callback
 *
 * The new event is marked with owner TASK_TOMBSTONE and installed into the
 * context returned by find_get_context().
 *
 * Return: the new event on success, an ERR_PTR()-encoded negative errno on
 * failure.
 */
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
                                 struct task_struct *task,
                                 perf_overflow_handler_t overflow_handler,
                                 void *context)
{
        struct perf_event_pmu_context *pmu_ctx;
        struct perf_event_context *ctx;
        struct perf_event *event;
        struct pmu *pmu;
        int err;

        /*
         * Grouping is not supported for kernel events, neither is 'AUX',
         * make sure the caller's intentions are adjusted.
         */
        if (attr->aux_output)
                return ERR_PTR(-EINVAL);

        /* No group leader, no parent, and no cgroup fd (-1). */
        event = perf_event_alloc(attr, cpu, task, NULL, NULL,
                                 overflow_handler, context, -1);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err;
        }

        /* Mark owner so we could distinguish it from user events. */
        event->owner = TASK_TOMBSTONE;
        pmu = event->pmu;

        if (pmu->task_ctx_nr == perf_sw_context)
                event->event_caps |= PERF_EV_CAP_SOFTWARE;

        /*
         * Get the target context (task or percpu):
         */
        ctx = find_get_context(task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto err_alloc;
        }

        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        if (ctx->task == TASK_TOMBSTONE) {
                err = -ESRCH;
                goto err_unlock;
        }

        pmu_ctx = find_get_pmu_context(pmu, ctx, event);
        if (IS_ERR(pmu_ctx)) {
                err = PTR_ERR(pmu_ctx);
                goto err_unlock;
        }
        event->pmu_ctx = pmu_ctx;

        if (!task) {
                /*
                 * Check if the @cpu we're creating an event for is online.
                 *
                 * We use the perf_cpu_context::ctx::mutex to serialize against
                 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
                 */
                struct perf_cpu_context *cpuctx =
                        container_of(ctx, struct perf_cpu_context, ctx);
                if (!cpuctx->online) {
                        err = -ENODEV;
                        goto err_pmu_ctx;
                }
        }

        if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
                goto err_pmu_ctx;
        }

        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);
        mutex_unlock(&ctx->mutex);

        return event;

        /*
         * Error unwinding: the labels release resources in the reverse
         * order of their acquisition above.
         */
err_pmu_ctx:
        put_pmu_ctx(pmu_ctx);
        event->pmu_ctx = NULL; /* _free_event() */
err_unlock:
        mutex_unlock(&ctx->mutex);
        perf_unpin_context(ctx);
        put_ctx(ctx);
err_alloc:
        free_event(event);
err:
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);

static void __perf_pmu_remove(struct perf_event_context *ctx,
                              int cpu, struct pmu *pmu,
                              struct perf_event_groups *groups,
                              struct list_head *events)
{
        struct perf_event *event, *sibling;

        perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
                perf_remove_from_context(event, 0);
                put_pmu_ctx(event->pmu_ctx);
                list_add(&event->migrate_entry, events);

                for_each_sibling_event(sibling, event) {
                        perf_remove_from_context(sibling, 0);
                        put_pmu_ctx(sibling->pmu_ctx);
                        list_add(&sibling->migrate_entry, events);
                }
        }
}

/*
 * Re-home a single migrated @event: bind it to @cpu, look up its pmu
 * context in @ctx, and install it there.  The reference on the context the
 * event came from is dropped last.
 */
static void __perf_pmu_install_event(struct pmu *pmu,
                                     struct perf_event_context *ctx,
                                     int cpu, struct perf_event *event)
{
        struct perf_event_context *prev_ctx = event->ctx;

        /* normally find_get_context() would take this reference */
        get_ctx(ctx);

        event->cpu = cpu;
        event->pmu_ctx = find_get_pmu_context(pmu, ctx, event);

        /* Any state at or above OFF is reset to INACTIVE before installing. */
        if (event->state >= PERF_EVENT_STATE_OFF)
                event->state = PERF_EVENT_STATE_INACTIVE;

        perf_install_in_context(ctx, event, cpu);

        /* Now that event->ctx is updated and visible, put the old ctx. */
        put_ctx(prev_ctx);
}

/*
 * Re-instate the events collected on @events into @ctx, in two passes:
 * siblings first, then whatever is left on the list (the group leaders).
 */
static void __perf_pmu_install(struct perf_event_context *ctx,
                               int cpu, struct pmu *pmu, struct list_head *events)
{
        struct perf_event *pos, *next;

        /*
         * First pass: siblings only.  Siblings will not get enabled without
         * a leader, however a leader will enable its siblings, even if those
         * are still on the old context.
         */
        list_for_each_entry_safe(pos, next, events, migrate_entry) {
                if (pos->group_leader != pos) {
                        list_del(&pos->migrate_entry);
                        __perf_pmu_install_event(pmu, ctx, cpu, pos);
                }
        }

        /*
         * Second pass: once all the siblings are set up properly, drain the
         * rest of the list and install the group leaders to make it go.
         */
        while (!list_empty(events)) {
                pos = list_first_entry(events, struct perf_event, migrate_entry);
                list_del(&pos->migrate_entry);
                __perf_pmu_install_event(pmu, ctx, cpu, pos);
        }
}

void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
{
        struct perf_event_context *src_ctx, *dst_ctx;
        LIST_HEAD(events);

        /*
         * Since per-cpu context is persistent, no need to grab an extra
         * reference.
         */
        src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
        dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;

        /*
         * See perf_event_ctx_lock() for comments on the details
         * of swizzling perf_event::ctx.
         */
        mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);

        __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
        __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);

        if (!list_empty(&events)) {
                /*
                 * Wait for the events to quiesce before re-instating them.
                 */
                synchronize_rcu();

                __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
        }

        mutex_unlock(&dst_ctx->mutex);
        mutex_unlock(&src_ctx->mutex);
}
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);

static void sync_child_event(struct perf_event *child_event)
{
        struct perf_event *parent_event = child_event->parent;
        u64 child_val;

        if (child_event->attr.inherit_stat) {
                struct task_struct *task = child_event->ctx->task;

                if (task && task != TASK_TOMBSTONE)
                        perf_event_read_event(child_event, task);
        }

        child_val = perf_event_count(child_event);

        /*
         * Add back the child's count to the parent's count:
         */
        atomic64_add(child_val, &parent_event->child_count);
        atomic64_add(child_event->total_time_enabled,
                     &parent_event->child_total_time_enabled);
        atomic64_add(child_event->total_time_running,
                     &parent_event->child_total_time_running);
}

/*
 * Remove @event from @ctx and move it to PERF_EVENT_STATE_EXIT.
 *
 * An inherited event (one with a parent) is additionally detached from its
 * group and parent, then freed, and the reference on the parent is dropped.
 * An event without a parent is only woken up; it is not freed here.
 */
static void
perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_event *parent = event->parent;
        /*
         * Do not destroy the 'original' grouping; because of the context
         * switch optimization the original events could've ended up in a
         * random child task.  If we were to destroy the original group, all
         * group related operations would cease to function properly after
         * this random child dies.
         *
         * Do destroy all inherited groups, we don't care about those and
         * being thorough is better.
         */
        unsigned long flags = parent ? (DETACH_GROUP | DETACH_CHILD) : 0;

        if (parent)
                mutex_lock(&parent->child_mutex);

        perf_remove_from_context(event, flags);

        raw_spin_lock_irq(&ctx->lock);
        if (event->state > PERF_EVENT_STATE_EXIT)
                perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
        raw_spin_unlock_irq(&ctx->lock);

        if (!parent) {
                /* Parent events are governed by their filedesc, retain them. */
                perf_event_wakeup(event);
                return;
        }

        /* Child events can be freed. */
        mutex_unlock(&parent->child_mutex);

        /* Kick perf_poll() for is_event_hup(); */
        perf_event_wakeup(parent);
        free_event(event);
        put_event(parent);
}

/*
 * Detach and tear down the perf event context of the exiting task @child.
 *
 * Expected to be called by @child itself (warns once if child != current).
 * Does nothing when perf_pin_task_context() returns no context.
 */
static void perf_event_exit_task_context(struct task_struct *child)
{
        struct perf_event_context *child_ctx, *clone_ctx = NULL;
        struct perf_event *child_event, *next;

        WARN_ON_ONCE(child != current);

        child_ctx = perf_pin_task_context(child);
        if (!child_ctx)
                return;

        /*
         * In order to reduce the amount of tricky in ctx tear-down, we hold
         * ctx::mutex over the entire thing. This serializes against almost
         * everything that wants to access the ctx.
         *
         * The exception is sys_perf_event_open() /
         * perf_event_create_kernel_count() which does find_get_context()
         * without ctx::mutex (it cannot because of the move_group double mutex
         * lock thing). See the comments in perf_install_in_context().
         */
        mutex_lock(&child_ctx->mutex);

        /*
         * In a single ctx::lock section, de-schedule the events and detach the
         * context from the task such that we cannot ever get it scheduled back
         * in.
         */
        raw_spin_lock_irq(&child_ctx->lock);
        task_ctx_sched_out(child_ctx, EVENT_ALL);

        /*
         * Now that the context is inactive, destroy the task <-> ctx relation
         * and mark the context dead.
         */
        RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
        put_ctx(child_ctx); /* cannot be last */
        WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
        put_task_struct(current); /* cannot be last */

        clone_ctx = unclone_ctx(child_ctx);
        raw_spin_unlock_irq(&child_ctx->lock);

        /* Drop the context returned by unclone_ctx(), if there was one. */
        if (clone_ctx)
                put_ctx(clone_ctx);

        /*
         * Report the task dead after unscheduling the events so that we
         * won't get any samples after PERF_RECORD_EXIT. We can however still
         * get a few PERF_RECORD_READ events.
         */
        perf_event_task(child, child_ctx, 0);

        /*
         * _safe iteration: perf_event_exit_event() removes the event from
         * the context and frees inherited events.
         */
        list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
                perf_event_exit_event(child_event, child_ctx);

        mutex_unlock(&child_ctx->mutex);

        /*
         * NOTE(review): presumably drops the reference obtained by
         * perf_pin_task_context() above -- confirm.
         */
        put_ctx(child_ctx);
}

/*
 * Task-exit hook: disown every event the exiting task created, then tear
 * down the task's own event context so that child event values are fed back
 * to the parent events.
 *
 * May run with exec_update_lock held when invoked from setup_new_exec().
 */
void perf_event_exit_task(struct task_struct *child)
{
        struct perf_event *owned, *n;

        mutex_lock(&child->perf_event_mutex);
        list_for_each_entry_safe(owned, n, &child->perf_event_list,
                                 owner_entry) {
                list_del_init(&owned->owner_entry);

                /*
                 * The list removal must be visible before ->owner is
                 * cleared; perf_release() relies on this ordering, as it
                 * serializes on owner->perf_event_mutex.
                 */
                smp_store_release(&owned->owner, NULL);
        }
        mutex_unlock(&child->perf_event_mutex);

        perf_event_exit_task_context(child);

        /*
         * perf_event_exit_task_context() already called perf_event_task()
         * with the child's task context, generating the EXIT events for
         * that context and clearing child->perf_event_ctxp.  What remains
         * is delivering EXIT events to the cpu contexts.
         */
        perf_event_task(child, NULL, 0);
}

/*
 * Release one inherited event during perf_event_free_task().
 *
 * @event: inherited (child) event; warns once and returns if it has no parent
 * @ctx:   context the event is attached to
 *
 * Unlinks the event from its parent's child_list, calls put_event() on the
 * parent, detaches the event from its group and from @ctx, and frees it.
 */
static void perf_free_event(struct perf_event *event,
                            struct perf_event_context *ctx)
{
        struct perf_event *parent = event->parent;

        if (WARN_ON_ONCE(!parent))
                return;

        /* child_list membership is protected by the parent's child_mutex. */
        mutex_lock(&parent->child_mutex);
        list_del_init(&event->child_list);
        mutex_unlock(&parent->child_mutex);

        put_event(parent);

        /* Group and context list manipulation happens under ctx->lock. */
        raw_spin_lock_irq(&ctx->lock);
        perf_group_detach(event);
        list_del_event(event, ctx);
        raw_spin_unlock_irq(&ctx->lock);
        free_event(event);
}

/*
 * Free a context as created by inheritance by perf_event_init_task() below,
 * used by fork() in case of fail.
 *
 * Even though the task has never lived, the context and events have been
 * exposed through the child_list, so we must take care tearing it all down.
 *
 * @task: the never-run task whose context is to be freed; nothing is done
 *        if it has no perf event context.
 */
void perf_event_free_task(struct task_struct *task)
{
        struct perf_event_context *ctx;
        struct perf_event *event, *tmp;

        ctx = rcu_access_pointer(task->perf_event_ctxp);
        if (!ctx)
                return;

        mutex_lock(&ctx->mutex);
        raw_spin_lock_irq(&ctx->lock);
        /*
         * Destroy the task <-> ctx relation and mark the context dead.
         *
         * This is important because even though the task hasn't been
         * exposed yet the context has been (through child_list).
         */
        RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
        WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
        put_task_struct(task); /* cannot be last */
        raw_spin_unlock_irq(&ctx->lock);


        /* _safe iteration: perf_free_event() unlinks and frees each event. */
        list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
                perf_free_event(event, ctx);

        mutex_unlock(&ctx->mutex);

        /*
         * perf_event_release_kernel() could've stolen some of our
         * child events and still have them on its free_list. In that
         * case we must wait for these events to have been freed (in
         * particular all their references to this task must've been
         * dropped).
         *
         * Without this copy_process() will unconditionally free this
         * task (irrespective of its reference count) and
         * _free_event()'s put_task_struct(event->hw.target) will be a
         * use-after-free.
         *
         * Wait for all events to drop their context reference.
         */
        wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
        put_ctx(ctx); /* must be last */
}

/*
 * Sanity check: warn (once) if @task still has a perf event context
 * pointer set.  Performs no cleanup itself.
 */
void perf_event_delayed_put(struct task_struct *task)
{
        WARN_ON_ONCE(task->perf_event_ctxp);
}

/*
 * Look up @fd and return its struct file if, and only if, it is a perf
 * event file (its f_op is perf_fops).
 *
 * On success the reference taken by fget() is handed to the caller.
 * Returns ERR_PTR(-EBADF) when @fd is not open or is not a perf event file;
 * in the latter case the fget() reference is dropped again.
 */
struct file *perf_event_get(unsigned int fd)
{
        struct file *filp = fget(fd);

        if (filp && filp->f_op == &perf_fops)
                return filp;

        if (filp)
                fput(filp);

        return ERR_PTR(-EBADF);
}

/*
 * Return the perf_event stored in @file's private_data, or
 * ERR_PTR(-EINVAL) if @file is not a perf event file.
 */
const struct perf_event *perf_get_event(struct file *file)
{
        if (file->f_op == &perf_fops)
                return file->private_data;

        return ERR_PTR(-EINVAL);
}

/*
 * Return a pointer to @event's attribute block, or ERR_PTR(-EINVAL) when
 * @event is NULL.
 */
const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
        if (event)
                return &event->attr;

        return ERR_PTR(-EINVAL);
}

/*
 * Inherit an event from parent task to child task.
 *
 * @parent_event: event to inherit; if it is itself an inherited event, the
 *                new child is linked to its original parent instead
 * @parent:       not referenced by this function body
 * @parent_ctx:   not referenced by this function body
 * @child:        task the new event is allocated for
 * @group_leader: group leader passed to perf_event_alloc() for the new event
 * @child_ctx:    context of @child that the new event is added to
 *
 * Returns:
 *  - valid pointer on success
 *  - NULL for orphaned events
 *  - IS_ERR() on error
 */
static struct perf_event *
inherit_event(struct perf_event *parent_event,
              struct task_struct *parent,
              struct perf_event_context *parent_ctx,
              struct task_struct *child,
              struct perf_event *group_leader,
              struct perf_event_context *child_ctx)
{
        /* Sampled before parent_event may be redirected to its own parent. */
        enum perf_event_state parent_state = parent_event->state;
        struct perf_event_pmu_context *pmu_ctx;
        struct perf_event *child_event;
        unsigned long flags;

        /*
         * Instead of creating recursive hierarchies of events,
         * we link inherited events back to the original parent,
         * which has a filp for sure, which we use as the reference
         * count:
         */
        if (parent_event->parent)
                parent_event = parent_event->parent;

        child_event = perf_event_alloc(&parent_event->attr,
                                           parent_event->cpu,
                                           child,
                                           group_leader, parent_event,
                                           NULL, NULL, -1);
        if (IS_ERR(child_event))
                return child_event;

        pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
        if (IS_ERR(pmu_ctx)) {
                free_event(child_event);
                return ERR_CAST(pmu_ctx);
        }
        child_event->pmu_ctx = pmu_ctx;

        /*
         * is_orphaned_event() and list_add_tail(&parent_event->child_list)
         * must be under the same lock in order to serialize against
         * perf_event_release_kernel(), such that either we must observe
         * is_orphaned_event() or they will observe us on the child_list.
         */
        mutex_lock(&parent_event->child_mutex);
        if (is_orphaned_event(parent_event) ||
            !atomic_long_inc_not_zero(&parent_event->refcount)) {
                mutex_unlock(&parent_event->child_mutex);
                /* task_ctx_data is freed with child_ctx */
                free_event(child_event);
                return NULL;
        }

        /* Taken on behalf of the child event, which points at child_ctx below. */
        get_ctx(child_ctx);

        /*
         * Make the child state follow the state of the parent event,
         * not its attr.disabled bit.  We hold the parent's mutex,
         * so we won't race with perf_event_{en, dis}able_family.
         */
        if (parent_state >= PERF_EVENT_STATE_INACTIVE)
                child_event->state = PERF_EVENT_STATE_INACTIVE;
        else
                child_event->state = PERF_EVENT_STATE_OFF;

        /* In frequency mode, start the child from the parent's current period. */
        if (parent_event->attr.freq) {
                u64 sample_period = parent_event->hw.sample_period;
                struct hw_perf_event *hwc = &child_event->hw;

                hwc->sample_period = sample_period;
                hwc->last_period   = sample_period;

                local64_set(&hwc->period_left, sample_period);
        }

        child_event->ctx = child_ctx;
        child_event->overflow_handler = parent_event->overflow_handler;
        child_event->overflow_handler_context
                = parent_event->overflow_handler_context;

        /*
         * Precalculate sample_data sizes
         */
        perf_event__header_size(child_event);
        perf_event__id_header_size(child_event);

        /*
         * Link it up in the child's context:
         */
        raw_spin_lock_irqsave(&child_ctx->lock, flags);
        add_event_to_ctx(child_event, child_ctx);
        child_event->attach_state |= PERF_ATTACH_CHILD;
        raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

        /*
         * Link this into the parent event's child list
         */
        list_add_tail(&child_event->child_list, &parent_event->child_list);
        mutex_unlock(&parent_event->child_mutex);

        return child_event;
}

/*
 * Inherit a whole event group: the leader first, then each of its siblings.
 *
 * Orphaned events are silently skipped -- inherit_event() returning NULL is
 * not treated as an error.  This mirrors perf_event_release_kernel(), which
 * removes all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int inherit_group(struct perf_event *parent_event,
              struct task_struct *parent,
              struct perf_event_context *parent_ctx,
              struct task_struct *child,
              struct perf_event_context *child_ctx)
{
        struct perf_event *child_leader, *parent_sub, *child_sub;

        child_leader = inherit_event(parent_event, parent, parent_ctx,
                                     child, NULL, child_ctx);
        if (IS_ERR(child_leader))
                return PTR_ERR(child_leader);

        /*
         * child_leader may legitimately be NULL at this point (orphaned
         * parent, see is_orphaned_event()).  The siblings are then inherited
         * as stand-alone events, much like perf_group_detach() would leave
         * them anyway.
         */
        for_each_sibling_event(parent_sub, parent_event) {
                child_sub = inherit_event(parent_sub, parent, parent_ctx,
                                          child, child_leader, child_ctx);
                if (IS_ERR(child_sub))
                        return PTR_ERR(child_sub);

                /*
                 * A sibling whose aux_event is the parent leader must get
                 * the same link set up on the child side.
                 */
                if (child_sub && parent_sub->aux_event == parent_event) {
                        if (!perf_get_aux_event(child_sub, child_leader))
                                return -EINVAL;
                }
        }

        if (child_leader)
                child_leader->group_generation = parent_event->group_generation;

        return 0;
}

/*
 * Inherit one event group into @child, allocating the child's task context
 * on first use.
 *
 * @inherited_all is cleared when the group is not inheritable (per its attr
 * and @clone_flags) or when inheriting fails.  It is left untouched when an
 * orphaned event is merely skipped; that is consistent with
 * perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
                   struct perf_event_context *parent_ctx,
                   struct task_struct *child,
                   u64 clone_flags, int *inherited_all)
{
        struct perf_event_context *ctx;
        int err;

        if (!event->attr.inherit)
                goto not_inherited;

        if (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD))
                goto not_inherited;

        /* Do not inherit if sigtrap and signal handlers were cleared. */
        if (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))
                goto not_inherited;

        ctx = child->perf_event_ctxp;
        if (!ctx) {
                /*
                 * We run in the parent task's context here.  The child has
                 * no perf context yet, so allocate and install one before
                 * inheriting the events marked for cloning.
                 */
                ctx = alloc_perf_context(child);
                if (!ctx)
                        return -ENOMEM;

                child->perf_event_ctxp = ctx;
        }

        err = inherit_group(event, parent, parent_ctx, child, ctx);
        if (err)
                *inherited_all = 0;

        return err;

not_inherited:
        *inherited_all = 0;
        return 0;
}

/*
 * Build the child's perf_event context at fork time by inheriting events
 * from the forking task (current).
 *
 * @child:       task being created
 * @clone_flags: clone flags of the fork, forwarded to inherit_task_group()
 *
 * Walks the parent's pinned and flexible groups and inherits each group via
 * inherit_task_group().  If every group was inherited, the child context is
 * additionally marked as a clone of the parent context (or of whatever the
 * parent is a clone of).
 *
 * Returns 0 on success (including when the parent has no context) or the
 * negative error reported by inherit_task_group().
 */
static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
{
        struct perf_event_context *child_ctx, *parent_ctx;
        struct perf_event_context *cloned_ctx;
        struct perf_event *event;
        struct task_struct *parent = current;
        int inherited_all = 1;
        unsigned long flags;
        int ret = 0;

        if (likely(!parent->perf_event_ctxp))
                return 0;

        /*
         * If the parent's context is a clone, pin it so it won't get
         * swapped under us.
         */
        parent_ctx = perf_pin_task_context(parent);
        if (!parent_ctx)
                return 0;

        /*
         * No need to check if parent_ctx != NULL here; since we saw
         * it non-NULL earlier, the only reason for it to become NULL
         * is if we exit, and since we're currently in the middle of
         * a fork we can't be exiting at the same time.
         */

        /*
         * Lock the parent list. No need to lock the child - not PID
         * hashed yet and not running, so nobody can access it.
         */
        mutex_lock(&parent_ctx->mutex);

        /*
         * We don't have to disable NMIs - we are only looking at
         * the list, not manipulating it:
         */
        perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, clone_flags, &inherited_all);
                if (ret)
                        goto out_unlock;
        }

        /*
         * We can't hold ctx->lock when iterating the ->flexible_group list due
         * to allocations, but we need to prevent rotation because
         * rotate_ctx() will change the list from interrupt context.
         */
        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
        parent_ctx->rotate_disable = 1;
        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

        perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, clone_flags, &inherited_all);
                if (ret) {
                        /*
                         * Re-enable rotation before bailing out; otherwise
                         * a failed fork would leave rotate_disable set on
                         * the parent's context for good.
                         */
                        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
                        parent_ctx->rotate_disable = 0;
                        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
                        goto out_unlock;
                }
        }

        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
        parent_ctx->rotate_disable = 0;

        child_ctx = child->perf_event_ctxp;

        if (child_ctx && inherited_all) {
                /*
                 * Mark the child context as a clone of the parent
                 * context, or of whatever the parent is a clone of.
                 *
                 * Note that if the parent is a clone, the holding of
                 * parent_ctx->lock avoids it from being uncloned.
                 */
                cloned_ctx = parent_ctx->parent_ctx;
                if (cloned_ctx) {
                        child_ctx->parent_ctx = cloned_ctx;
                        child_ctx->parent_gen = parent_ctx->parent_gen;
                } else {
                        child_ctx->parent_ctx = parent_ctx;
                        child_ctx->parent_gen = parent_ctx->generation;
                }
                get_ctx(child_ctx->parent_ctx);
        }

        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
out_unlock:
        mutex_unlock(&parent_ctx->mutex);

        /* Undo perf_pin_task_context(): unpin and drop the context. */
        perf_unpin_context(parent_ctx);
        put_ctx(parent_ctx);

        return ret;
}

/*
 * Set up the perf_event state of a freshly forked task: reset its context
 * pointer, initialise its owner mutex and list, then inherit events from
 * the parent.  If inheriting fails, whatever was built so far is torn down
 * via perf_event_free_task() and the error is returned.
 */
int perf_event_init_task(struct task_struct *child, u64 clone_flags)
{
        int err;

        child->perf_event_ctxp = NULL;
        mutex_init(&child->perf_event_mutex);
        INIT_LIST_HEAD(&child->perf_event_list);

        err = perf_event_init_context(child, clone_flags);
        if (err)
                perf_event_free_task(child);

        return err;
}

/*
 * Boot-time initialisation of the per-CPU perf state for every possible
 * CPU: the software-event hash table mutex, the pmu_sb_events list and
 * lock, the sched callback list and the CPU context itself.
 */
static void __init perf_event_init_all_cpus(void)
{
        int cpu;

        zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);

        for_each_possible_cpu(cpu) {
                struct swevent_htable *htable = &per_cpu(swevent_htable, cpu);
                struct perf_cpu_context *cpc;

                mutex_init(&htable->hlist_mutex);

                INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
                raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));

                INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));

                cpc = per_cpu_ptr(&perf_cpu_context, cpu);
                __perf_event_init_context(&cpc->ctx);

                /* CPU contexts get their own lockdep classes. */
                lockdep_set_class(&cpc->ctx.mutex, &cpuctx_mutex);
                lockdep_set_class(&cpc->ctx.lock, &cpuctx_lock);

                cpc->online = cpumask_test_cpu(cpu, perf_online_mask);

                /* Start out with the embedded default heap. */
                cpc->heap_size = ARRAY_SIZE(cpc->heap_default);
                cpc->heap = cpc->heap_default;
        }
}

/*
 * Make sure @cpu has a software-event hlist if one is in use
 * (hlist_refcount > 0) but not yet allocated for this CPU.  An allocation
 * failure triggers a WARN and leaves the pointer NULL.
 */
static void perf_swevent_init_cpu(unsigned int cpu)
{
        struct swevent_htable *htable = &per_cpu(swevent_htable, cpu);
        struct swevent_hlist *hlist;

        mutex_lock(&htable->hlist_mutex);

        /* Nothing to do: no users, or the hlist is already there. */
        if (htable->hlist_refcount <= 0 || swevent_hlist_deref(htable))
                goto unlock;

        hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
        WARN_ON(!hlist);
        rcu_assign_pointer(htable->swevent_hlist, hlist);

unlock:
        mutex_unlock(&htable->hlist_mutex);
}

#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
/*
 * Cross-call handler used by perf_event_exit_cpu_context() (through
 * smp_call_function_single()): under ctx->lock, calls
 * ctx_sched_out(ctx, EVENT_TIME) and removes every event on the context's
 * event_list with DETACH_GROUP.
 *
 * @__info: the struct perf_event_context to empty.
 *
 * NOTE(review): entries are removed while the list is walked with the
 * non-_safe iterator; this presumably relies on the removal leaving the
 * removed entry's next pointer intact -- confirm against list_del_event().
 */
static void __perf_event_exit_context(void *__info)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *ctx = __info;
        struct perf_event *event;

        raw_spin_lock(&ctx->lock);
        ctx_sched_out(ctx, EVENT_TIME);
        list_for_each_entry(event, &ctx->event_list, event_entry)
                __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
        raw_spin_unlock(&ctx->lock);
}

/*
 * Take @cpu's perf context offline: empty it on the target CPU, mark the
 * CPU context as not online and clear the CPU from perf_online_mask.  All
 * of it happens under pmus_lock.
 */
static void perf_event_exit_cpu_context(int cpu)
{
        struct perf_cpu_context *cpc;
        struct perf_event_context *cpu_ctx;

        /* XXX simplify cpuctx->online */
        mutex_lock(&pmus_lock);
        cpc = per_cpu_ptr(&perf_cpu_context, cpu);
        cpu_ctx = &cpc->ctx;

        mutex_lock(&cpu_ctx->mutex);
        /* Last argument 1: wait for the remote handler to finish. */
        smp_call_function_single(cpu, __perf_event_exit_context, cpu_ctx, 1);
        cpc->online = 0;
        mutex_unlock(&cpu_ctx->mutex);

        cpumask_clear_cpu(cpu, perf_online_mask);
        mutex_unlock(&pmus_lock);
}
#else

/* Neither CONFIG_HOTPLUG_CPU nor CONFIG_KEXEC_CORE: nothing to tear down. */
static void perf_event_exit_cpu_context(int cpu)
{
}

#endif

/*
 * Bring @cpu's perf state online: set up its software-event hlist if
 * needed, add the CPU to perf_online_mask and flag its CPU context as
 * online.  Always returns 0.
 */
int perf_event_init_cpu(unsigned int cpu)
{
        struct perf_cpu_context *cpc;

        perf_swevent_init_cpu(cpu);

        mutex_lock(&pmus_lock);
        cpumask_set_cpu(cpu, perf_online_mask);
        cpc = per_cpu_ptr(&perf_cpu_context, cpu);

        /* ->online is updated under the context mutex. */
        mutex_lock(&cpc->ctx.mutex);
        cpc->online = 1;
        mutex_unlock(&cpc->ctx.mutex);
        mutex_unlock(&pmus_lock);

        return 0;
}

/*
 * Counterpart of perf_event_init_cpu(): tears down @cpu's perf context via
 * perf_event_exit_cpu_context().  Always returns 0.
 */
int perf_event_exit_cpu(unsigned int cpu)
{
        perf_event_exit_cpu_context(cpu);
        return 0;
}

static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
        int cpu;

        for_each_online_cpu(cpu)
                perf_event_exit_cpu(cpu);

        return NOTIFY_OK;
}

/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 *
 * INT_MIN is the lowest possible notifier priority.
 */
static struct notifier_block perf_reboot_notifier = {
        .notifier_call = perf_reboot,
        .priority = INT_MIN,
};

/*
 * Boot-time initialisation of the perf events core: per-CPU state, the
 * built-in "software", "cpu_clock" and "task_clock" PMUs, tracepoint
 * support, the boot CPU, the reboot notifier, hardware breakpoints and the
 * perf_event slab cache.
 */
void __init perf_event_init(void)
{
        int ret;

        idr_init(&pmu_idr);

        perf_event_init_all_cpus();
        init_srcu_struct(&pmus_srcu);
        perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
        perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1);
        perf_pmu_register(&perf_task_clock, "task_clock", -1);
        perf_tp_register();
        /* Bring up the perf state of the CPU we are running on. */
        perf_event_init_cpu(smp_processor_id());
        register_reboot_notifier(&perf_reboot_notifier);

        /* A breakpoint init failure only warns; initialisation continues. */
        ret = init_hw_breakpoint();
        WARN(ret, "hw_breakpoint initialization failed with: %d", ret);

        perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC);

        /*
         * Build time assertion that we keep the data_head at the intended
         * location.  IOW, validation we got the __reserved[] size right.
         */
        BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
                     != 1024);
}

/*
 * sysfs show() helper for PMU event attributes: prints the attribute's
 * event string followed by a newline into @page.  Returns the number of
 * bytes written, or 0 when the attribute has no event string.
 */
ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
                              char *page)
{
        struct perf_pmu_events_attr *ev_attr;

        ev_attr = container_of(attr, struct perf_pmu_events_attr, attr);
        if (!ev_attr->event_str)
                return 0;

        return sprintf(page, "%s\n", ev_attr->event_str);
}
EXPORT_SYMBOL_GPL(perf_event_sysfs_show);

/*
 * Register the PMU bus and create a device for every PMU on the pmus list
 * that does not have one yet.  A failure to create an individual PMU device
 * only warns; just a bus_register() failure is reported to the caller.
 */
static int __init perf_event_sysfs_init(void)
{
        struct pmu *pmu;
        int ret;

        mutex_lock(&pmus_lock);

        ret = bus_register(&pmu_bus);
        if (!ret) {
                list_for_each_entry(pmu, &pmus, entry) {
                        if (!pmu->dev) {
                                ret = pmu_dev_alloc(pmu);
                                WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
                        }
                }

                pmu_bus_running = 1;
                /* Per-PMU failures are not propagated. */
                ret = 0;
        }

        mutex_unlock(&pmus_lock);

        return ret;
}
device_initcall(perf_event_sysfs_init);

#ifdef CONFIG_CGROUP_PERF
/*
 * css_alloc callback: allocate a zeroed perf_cgroup together with its
 * per-CPU perf_cgroup_info.  Returns the embedded css on success or
 * ERR_PTR(-ENOMEM) if either allocation fails.
 */
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct perf_cgroup *cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);

        if (!cgrp)
                return ERR_PTR(-ENOMEM);

        cgrp->info = alloc_percpu(struct perf_cgroup_info);
        if (cgrp->info)
                return &cgrp->css;

        /* Per-CPU allocation failed: undo the first allocation. */
        kfree(cgrp);
        return ERR_PTR(-ENOMEM);
}

/*
 * css_free callback: release the per-CPU info and then the perf_cgroup
 * that embeds @css.
 */
static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
        struct perf_cgroup *cgrp;

        cgrp = container_of(css, struct perf_cgroup, css);

        free_percpu(cgrp->info);
        kfree(cgrp);
}

/*
 * css_online callback: hands the css's cgroup to perf_event_cgroup().
 * Always returns 0.
 */
static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
{
        perf_event_cgroup(css->cgroup);
        return 0;
}

/*
 * Callback for task_function_call(): perform the perf cgroup switch for the
 * task passed in @info, with preemption disabled.  Always returns 0.
 */
static int __perf_cgroup_move(void *info)
{
        struct task_struct *tsk = info;

        preempt_disable();
        perf_cgroup_switch(tsk);
        preempt_enable();

        return 0;
}

static void perf_cgroup_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct cgroup_subsys_state *css;

        cgroup_taskset_for_each(task, css, tset)
                task_function_call(task, __perf_cgroup_move, task);
}

/*
 * cgroup controller descriptor for perf_event; wires up the css
 * alloc/free/online and attach callbacks defined above.
 */
struct cgroup_subsys perf_event_cgrp_subsys = {
        .css_alloc        = perf_cgroup_css_alloc,
        .css_free        = perf_cgroup_css_free,
        .css_online        = perf_cgroup_css_online,
        .attach                = perf_cgroup_attach,
        /*
         * Implicitly enable on dfl hierarchy so that perf events can
         * always be filtered by cgroup2 path as long as perf_event
         * controller is not mounted on a legacy hierarchy.
         */
        .implicit_on_dfl = true,
        .threaded        = true,
};
#endif /* CONFIG_CGROUP_PERF */

/*
 * Static call "perf_snapshot_branch_stack" of type
 * perf_snapshot_branch_stack_t.
 * NOTE(review): per the _RET0 variant the default target presumably just
 * returns 0 until the call is updated -- confirm against static_call.h.
 */
DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);




















































































































































































































































































































































































































































































































































































































































































































































    2 


    2 











































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
// SPDX-License-Identifier: GPL-2.0
/*
 * drivers/base/power/wakeup.c - System wakeup events framework
 *
 * Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
 */
#define pr_fmt(fmt) "PM: " fmt

#include <linux/device.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/capability.h>
#include <linux/export.h>
#include <linux/suspend.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pm_wakeirq.h>
#include <trace/events/power.h>

#include "power.h"

/*
 * Iterate over an RCU-protected list, supplying
 * srcu_read_lock_held(&wakeup_srcu) as the extra condition argument of
 * list_for_each_entry_rcu().
 */
#define list_for_each_entry_rcu_locked(pos, head, member) \
        list_for_each_entry_rcu(pos, head, member, \
                srcu_read_lock_held(&wakeup_srcu))
/*
 * If set, the suspend/hibernate code will abort transitions to a sleep state
 * if wakeup events are registered during or immediately before the transition.
 */
bool events_check_enabled __read_mostly;

/* First wakeup IRQ seen by the kernel in the last cycle. */
static unsigned int wakeup_irq[2] __read_mostly;
/* Taken by pm_wakeup_clear() and pm_system_irq_wakeup() around wakeup_irq[] updates. */
static DEFINE_RAW_SPINLOCK(wakeup_irq_lock);

/* If greater than 0 and the system is suspending, terminate the suspend. */
static atomic_t pm_abort_suspend __read_mostly;

/*
 * Combined counters of registered wakeup events and wakeup events in progress.
 * They need to be modified together atomically, so it's better to use one
 * atomic variable to hold them both.
 */
static atomic_t combined_event_count = ATOMIC_INIT(0);

/*
 * Layout of combined_event_count as decoded by split_counters(): the low
 * IN_PROGRESS_BITS bits count wakeup events in progress, the bits above them
 * count registered wakeup events.
 */
#define IN_PROGRESS_BITS        (sizeof(int) * 4)
/* Mask covering the low IN_PROGRESS_BITS bits of combined_event_count. */
#define MAX_IN_PROGRESS                ((1 << IN_PROGRESS_BITS) - 1)

/*
 * Take one snapshot of combined_event_count and decompose it.
 * @cnt:  receives the bits above IN_PROGRESS_BITS (registered wakeup events).
 * @inpr: receives the bits selected by MAX_IN_PROGRESS (events in progress).
 */
static void split_counters(unsigned int *cnt, unsigned int *inpr)
{
        const unsigned int snapshot = atomic_read(&combined_event_count);
        const unsigned int registered = snapshot >> IN_PROGRESS_BITS;
        const unsigned int in_progress = snapshot & MAX_IN_PROGRESS;

        *cnt = registered;
        *inpr = in_progress;
}

/* A preserved old value of the events counter. */
static unsigned int saved_count;

/*
 * Taken around additions to and removals from wakeup_sources, and around the
 * saved_count / events_check_enabled updates in pm_wakeup_pending() and
 * pm_save_wakeup_count().
 */
static DEFINE_RAW_SPINLOCK(events_lock);

/* Forward declaration: installed as the timer callback by wakeup_source_add(). */
static void pm_wakeup_timer_fn(struct timer_list *t);

/* List of wakeup sources added with wakeup_source_add(). */
static LIST_HEAD(wakeup_sources);

/*
 * pm_get_wakeup_count() sleeps here; wakeup_source_deactivate() wakes the
 * waiters once no wakeup events are in progress.
 */
static DECLARE_WAIT_QUEUE_HEAD(wakeup_count_wait_queue);

/* SRCU domain used by readers walking wakeup_sources. */
DEFINE_STATIC_SRCU(wakeup_srcu);

/* Accumulates the statistics of destroyed sources, see wakeup_source_record(). */
static struct wakeup_source deleted_ws = {
        .name = "deleted",
        .lock =  __SPIN_LOCK_UNLOCKED(deleted_ws.lock),
};

/* Allocator for the ->id of each wakeup source. */
static DEFINE_IDA(wakeup_ida);

/**
 * wakeup_source_create - Allocate and minimally initialize a wakeup source.
 * @name: Name to give the new wakeup source (duplicated internally).
 *
 * Allocates a zeroed object, a copy of @name and a unique ID.  The object is
 * not added to the list of wakeup sources.
 *
 * Return: the new object, or NULL if any of the allocations failed.
 */
struct wakeup_source *wakeup_source_create(const char *name)
{
        struct wakeup_source *new_ws = kzalloc(sizeof(*new_ws), GFP_KERNEL);
        int new_id;

        if (!new_ws)
                return NULL;

        new_ws->name = kstrdup_const(name, GFP_KERNEL);
        if (!new_ws->name) {
                kfree(new_ws);
                return NULL;
        }

        new_id = ida_alloc(&wakeup_ida, GFP_KERNEL);
        if (new_id < 0) {
                /* Unwind in reverse order of allocation. */
                kfree_const(new_ws->name);
                kfree(new_ws);
                return NULL;
        }
        new_ws->id = new_id;

        return new_ws;
}
EXPORT_SYMBOL_GPL(wakeup_source_create);

/*
 * Fold the statistics of a wakeup source that is about to go away into the
 * "deleted" dummy wakeup source.  Sources that never reported an event are
 * ignored.
 */
static void wakeup_source_record(struct wakeup_source *ws)
{
        struct wakeup_source *acc = &deleted_ws;
        unsigned long irqflags;

        spin_lock_irqsave(&acc->lock, irqflags);

        if (!ws->event_count)
                goto out;

        acc->total_time = ktime_add(acc->total_time, ws->total_time);
        acc->prevent_sleep_time = ktime_add(acc->prevent_sleep_time,
                                            ws->prevent_sleep_time);
        /* Keep the larger of the two maxima. */
        if (ktime_compare(acc->max_time, ws->max_time) <= 0)
                acc->max_time = ws->max_time;

        acc->event_count += ws->event_count;
        acc->active_count += ws->active_count;
        acc->relax_count += ws->relax_count;
        acc->expire_count += ws->expire_count;
        acc->wakeup_count += ws->wakeup_count;

out:
        spin_unlock_irqrestore(&acc->lock, irqflags);
}

/*
 * Release what wakeup_source_create() allocated for @ws: its ID in
 * wakeup_ida, its name string and the object itself.
 */
static void wakeup_source_free(struct wakeup_source *ws)
{
        ida_free(&wakeup_ida, ws->id);
        kfree_const(ws->name);
        kfree(ws);
}

/**
 * wakeup_source_destroy - Tear down and free a wakeup source object.
 * @ws: Wakeup source to destroy; NULL is accepted and ignored.
 *
 * Deactivates @ws if it is still active, saves its statistics in the "deleted"
 * dummy wakeup source and frees it.
 *
 * Use only for wakeup source objects created with wakeup_source_create().
 */
void wakeup_source_destroy(struct wakeup_source *ws)
{
        if (ws) {
                __pm_relax(ws);
                wakeup_source_record(ws);
                wakeup_source_free(ws);
        }
}
EXPORT_SYMBOL_GPL(wakeup_source_destroy);

/**
 * wakeup_source_add - Add given object to the list of wakeup sources.
 * @ws: Wakeup source object to add to the list.
 *
 * Initializes the lock and timer of @ws, marks it inactive and links it into
 * wakeup_sources.  Warns and returns if @ws is NULL.
 */
void wakeup_source_add(struct wakeup_source *ws)
{
        unsigned long flags;

        if (WARN_ON(!ws))
                return;

        spin_lock_init(&ws->lock);
        /*
         * Installing pm_wakeup_timer_fn is also what makes
         * wakeup_source_not_registered() regard @ws as registered.
         */
        timer_setup(&ws->timer, pm_wakeup_timer_fn, 0);
        ws->active = false;

        /* Publish @ws only after it has been fully initialized above. */
        raw_spin_lock_irqsave(&events_lock, flags);
        list_add_rcu(&ws->entry, &wakeup_sources);
        raw_spin_unlock_irqrestore(&events_lock, flags);
}
EXPORT_SYMBOL_GPL(wakeup_source_add);

/**
 * wakeup_source_remove - Remove given object from the wakeup sources list.
 * @ws: Wakeup source object to remove from the list.
 *
 * Unlinks @ws from wakeup_sources, waits for SRCU readers of the list, stops
 * the timer of @ws and marks it as not registered.  Warns and returns if @ws
 * is NULL.
 */
void wakeup_source_remove(struct wakeup_source *ws)
{
        unsigned long flags;

        if (WARN_ON(!ws))
                return;

        raw_spin_lock_irqsave(&events_lock, flags);
        list_del_rcu(&ws->entry);
        raw_spin_unlock_irqrestore(&events_lock, flags);
        /* Let readers that may still see @ws on the list finish first. */
        synchronize_srcu(&wakeup_srcu);

        del_timer_sync(&ws->timer);
        /*
         * Clear timer.function to make wakeup_source_not_registered() treat
         * this wakeup source as not registered.
         */
        ws->timer.function = NULL;
}
EXPORT_SYMBOL_GPL(wakeup_source_remove);

/**
 * wakeup_source_register - Create a wakeup source and add it to the list.
 * @dev: Device this wakeup source is associated with (or NULL if virtual).
 * @name: Name of the wakeup source to register.
 *
 * The sysfs representation is added only when @dev is NULL or already
 * registered.
 *
 * Return: the new wakeup source, or NULL if creating it or adding it to sysfs
 * failed.
 */
struct wakeup_source *wakeup_source_register(struct device *dev,
                                             const char *name)
{
        struct wakeup_source *ws = wakeup_source_create(name);

        if (!ws)
                return NULL;

        if ((!dev || device_is_registered(dev)) &&
            wakeup_source_sysfs_add(dev, ws)) {
                /* Not on the list yet, so a plain free is sufficient. */
                wakeup_source_free(ws);
                return NULL;
        }

        wakeup_source_add(ws);
        return ws;
}
EXPORT_SYMBOL_GPL(wakeup_source_register);

/**
 * wakeup_source_unregister - Take a wakeup source off the list and destroy it.
 * @ws: Wakeup source object to unregister; NULL is accepted and ignored.
 *
 * The sysfs representation is removed only if @ws has one (ws->dev is set).
 */
void wakeup_source_unregister(struct wakeup_source *ws)
{
        if (!ws)
                return;

        wakeup_source_remove(ws);
        if (ws->dev)
                wakeup_source_sysfs_remove(ws);

        wakeup_source_destroy(ws);
}
EXPORT_SYMBOL_GPL(wakeup_source_unregister);

/**
 * wakeup_sources_read_lock - Lock wakeup source list for read.
 *
 * Enters an SRCU read-side critical section on wakeup_srcu.
 *
 * Returns the index obtained from srcu_read_lock().
 * This index must be passed to the matching wakeup_sources_read_unlock().
 */
int wakeup_sources_read_lock(void)
{
        return srcu_read_lock(&wakeup_srcu);
}
EXPORT_SYMBOL_GPL(wakeup_sources_read_lock);

/**
 * wakeup_sources_read_unlock - Unlock wakeup source list.
 * @idx: return value from corresponding wakeup_sources_read_lock()
 *
 * Leaves the SRCU read-side critical section on wakeup_srcu.
 */
void wakeup_sources_read_unlock(int idx)
{
        srcu_read_unlock(&wakeup_srcu, idx);
}
EXPORT_SYMBOL_GPL(wakeup_sources_read_unlock);

/**
 * wakeup_sources_walk_start - Begin a walk on wakeup source list
 *
 * Returns the first object of the list of wakeup sources, or NULL if the list
 * is empty.
 *
 * Note that to be safe, wakeup sources list needs to be locked by calling
 * wakeup_sources_read_lock() for this.
 */
struct wakeup_source *wakeup_sources_walk_start(void)
{
        struct list_head *ws_head = &wakeup_sources;

        /*
         * Do not use list_entry_rcu(ws_head->next, ...) here: on an empty
         * list ->next points back at the list head, which would be converted
         * into a bogus non-NULL wakeup_source pointer.  Returning NULL
         * matches what wakeup_sources_walk_next() does at the end of the
         * list.
         */
        return list_first_or_null_rcu(ws_head, struct wakeup_source, entry);
}
EXPORT_SYMBOL_GPL(wakeup_sources_walk_start);

/**
 * wakeup_sources_walk_next - Get next wakeup source from the list
 * @ws: Previous wakeup source object
 *
 * Returns the wakeup source following @ws, or NULL if @ws is the last entry
 * of the list (see list_next_or_null_rcu()).
 *
 * Note that to be safe, wakeup sources list needs to be locked by calling
 * wakeup_sources_read_lock() for this.
 */
struct wakeup_source *wakeup_sources_walk_next(struct wakeup_source *ws)
{
        struct list_head *ws_head = &wakeup_sources;

        return list_next_or_null_rcu(ws_head, &ws->entry,
                                struct wakeup_source, entry);
}
EXPORT_SYMBOL_GPL(wakeup_sources_walk_next);

/**
 * device_wakeup_attach - Attach a wakeup source object to a device object.
 * @dev: Device to handle.
 * @ws: Wakeup source object to attach to @dev.
 *
 * This causes @dev to be treated as a wakeup device.  If @dev already has a
 * wake IRQ, it is attached to @ws as well.
 *
 * Return: 0 on success, -EEXIST if @dev already has a wakeup source.
 */
static int device_wakeup_attach(struct device *dev, struct wakeup_source *ws)
{
        int ret = -EEXIST;

        spin_lock_irq(&dev->power.lock);
        if (!dev->power.wakeup) {
                dev->power.wakeup = ws;
                if (dev->power.wakeirq)
                        device_wakeup_attach_irq(dev, dev->power.wakeirq);
                ret = 0;
        }
        spin_unlock_irq(&dev->power.lock);

        return ret;
}

/**
 * device_wakeup_enable - Enable given device to be a wakeup source.
 * @dev: Device to handle.
 *
 * Create a wakeup source object named after @dev, register it and attach it
 * to @dev.
 *
 * Return: 0 on success, -EINVAL if @dev is NULL or not wakeup-capable,
 * -ENOMEM if the wakeup source could not be registered, or the error returned
 * by device_wakeup_attach().
 */
int device_wakeup_enable(struct device *dev)
{
        struct wakeup_source *ws;
        int err;

        if (!dev || !dev->power.can_wakeup)
                return -EINVAL;

        if (pm_suspend_target_state != PM_SUSPEND_ON)
                dev_dbg(dev, "Suspicious %s() during system transition!\n", __func__);

        ws = wakeup_source_register(dev, dev_name(dev));
        if (!ws)
                return -ENOMEM;

        err = device_wakeup_attach(dev, ws);
        if (!err)
                return 0;

        /* Attaching failed, so drop the source that was just registered. */
        wakeup_source_unregister(ws);
        return err;
}
EXPORT_SYMBOL_GPL(device_wakeup_enable);

/**
 * device_wakeup_attach_irq - Attach a wakeirq to a wakeup source
 * @dev: Device to handle
 * @wakeirq: Device specific wakeirq entry
 *
 * Store @wakeirq in the wakeup source of @dev so the device wake IRQ can be
 * configured automatically for suspend and resume.  Does nothing if @dev has
 * no wakeup source; an already attached wake IRQ is reported and overridden.
 *
 * Call under the device's power.lock lock.
 */
void device_wakeup_attach_irq(struct device *dev,
                             struct wake_irq *wakeirq)
{
        struct wakeup_source *target = dev->power.wakeup;

        if (!target)
                return;

        if (target->wakeirq)
                dev_err(dev, "Leftover wakeup IRQ found, overriding\n");

        target->wakeirq = wakeirq;
}

/**
 * device_wakeup_detach_irq - Detach a wakeirq from a wakeup source
 * @dev: Device to handle
 *
 * Clears the wakeirq pointer of the @dev's wakeup source.  Does nothing if
 * @dev has no wakeup source.
 *
 * Call under the device's power.lock lock.
 */
void device_wakeup_detach_irq(struct device *dev)
{
        struct wakeup_source *ws;

        ws = dev->power.wakeup;
        if (ws)
                ws->wakeirq = NULL;
}

/**
 * device_wakeup_arm_wake_irqs -
 *
 * Iterates over the list of device wakeirqs to arm them.
 */
void device_wakeup_arm_wake_irqs(void)
{
        struct wakeup_source *ws;
        int srcuidx;

        srcuidx = srcu_read_lock(&wakeup_srcu);
        list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry)
                dev_pm_arm_wake_irq(ws->wakeirq);
        srcu_read_unlock(&wakeup_srcu, srcuidx);
}

/**
 * device_wakeup_disarm_wake_irqs -
 *
 * Iterates over the list of device wakeirqs to disarm them.
 */
void device_wakeup_disarm_wake_irqs(void)
{
        struct wakeup_source *ws;
        int srcuidx;

        srcuidx = srcu_read_lock(&wakeup_srcu);
        list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry)
                dev_pm_disarm_wake_irq(ws->wakeirq);
        srcu_read_unlock(&wakeup_srcu, srcuidx);
}

/**
 * device_wakeup_detach - Detach a device's wakeup source object from it.
 * @dev: Device to detach the wakeup source object from.
 *
 * After it returns, @dev will not be treated as a wakeup device any more.
 *
 * Returns the wakeup source that was attached to @dev, which is NULL if there
 * was none.  The object is only detached here, not unregistered or freed.
 */
static struct wakeup_source *device_wakeup_detach(struct device *dev)
{
        struct wakeup_source *ws;

        spin_lock_irq(&dev->power.lock);
        ws = dev->power.wakeup;
        dev->power.wakeup = NULL;
        spin_unlock_irq(&dev->power.lock);
        return ws;
}

/**
 * device_wakeup_disable - Do not regard a device as a wakeup source any more.
 * @dev: Device to handle.
 *
 * Detach the @dev's wakeup source object from it, unregister this wakeup source
 * object and destroy it.  Nothing is done if @dev is NULL or not
 * wakeup-capable.
 */
void device_wakeup_disable(struct device *dev)
{
        if (dev && dev->power.can_wakeup)
                wakeup_source_unregister(device_wakeup_detach(dev));
}
EXPORT_SYMBOL_GPL(device_wakeup_disable);

/**
 * device_set_wakeup_capable - Set/reset device wakeup capability flag.
 * @dev: Device to handle.
 * @capable: Whether or not @dev is capable of waking up the system from sleep.
 *
 * Update the @dev's power.can_wakeup flag to match @capable.  If the flag
 * actually changes and @dev is registered and on the power list, add (when
 * @capable is set) or remove (otherwise) its wakeup-related sysfs attributes.
 *
 * This function may sleep and it can't be called from any context where
 * sleeping is not allowed.
 */
void device_set_wakeup_capable(struct device *dev, bool capable)
{
        /* Nothing to do when the flag already has the requested value. */
        if (!!dev->power.can_wakeup == !!capable)
                return;

        dev->power.can_wakeup = capable;
        if (!device_is_registered(dev) || list_empty(&dev->power.entry))
                return;

        if (!capable) {
                wakeup_sysfs_remove(dev);
                return;
        }

        if (wakeup_sysfs_add(dev))
                dev_info(dev, "Wakeup sysfs attributes not added\n");
}
EXPORT_SYMBOL_GPL(device_set_wakeup_capable);

/**
 * device_set_wakeup_enable - Enable or disable a device to wake up the system.
 * @dev: Device to handle.
 * @enable: enable/disable flag
 *
 * Return: the result of device_wakeup_enable() when enabling, 0 when
 * disabling.
 */
int device_set_wakeup_enable(struct device *dev, bool enable)
{
        if (!enable) {
                device_wakeup_disable(dev);
                return 0;
        }

        return device_wakeup_enable(dev);
}
EXPORT_SYMBOL_GPL(device_set_wakeup_enable);

/**
 * wakeup_source_not_registered - validate the given wakeup source.
 * @ws: Wakeup source to be validated.
 *
 * Returns true if the timer callback of @ws is not pm_wakeup_timer_fn, i.e.
 * wakeup_source_add() has not set it up or wakeup_source_remove() has
 * cleared it.
 */
static bool wakeup_source_not_registered(struct wakeup_source *ws)
{
        /*
         * Use timer struct to check if the given source is initialized
         * by wakeup_source_add.
         */
        return ws->timer.function != pm_wakeup_timer_fn;
}

/*
 * The functions below use the observation that each wakeup event starts a
 * period in which the system should not be suspended.  The moment this period
 * will end depends on how the wakeup event is going to be processed after being
 * detected and all of the possible cases can be divided into two distinct
 * groups.
 *
 * First, a wakeup event may be detected by the same functional unit that will
 * carry out the entire processing of it and possibly will pass it to user space
 * for further processing.  In that case the functional unit that has detected
 * the event may later "close" the "no suspend" period associated with it
 * directly as soon as it has been dealt with.  The pair of pm_stay_awake() and
 * pm_relax(), balanced with each other, is supposed to be used in such
 * situations.
 *
 * Second, a wakeup event may be detected by one functional unit and processed
 * by another one.  In that case the unit that has detected it cannot really
 * "close" the "no suspend" period associated with it, unless it knows in
 * advance what's going to happen to the event during processing.  This
 * knowledge, however, may not be available to it, so it can simply specify time
 * to wait before the system can be suspended and pass it as the second
 * argument of pm_wakeup_event().
 *
 * It is valid to call pm_relax() after pm_wakeup_event(), in which case the
 * "no suspend" period will be ended either by the pm_relax(), or by the timer
 * function executed when the timer expires, whichever comes first.
 */

/**
 * wakeup_source_activate - Mark given wakeup source as active.
 * @ws: Wakeup source to handle.
 *
 * Update the @ws' statistics and, if @ws has just been activated, notify the PM
 * core of the event by incrementing the counter of the wakeup events being
 * processed.
 *
 * Warns (once) and leaves @ws untouched if it is not registered.
 */
static void wakeup_source_activate(struct wakeup_source *ws)
{
        unsigned int cec;

        if (WARN_ONCE(wakeup_source_not_registered(ws),
                        "unregistered wakeup source\n"))
                return;

        ws->active = true;
        ws->active_count++;
        ws->last_time = ktime_get();
        /* With autosleep enabled, also start measuring prevent-sleep time. */
        if (ws->autosleep_enabled)
                ws->start_prevent_time = ws->last_time;

        /* Increment the counter of events in progress. */
        cec = atomic_inc_return(&combined_event_count);

        trace_wakeup_source_activate(ws->name, cec);
}

/**
 * wakeup_source_report_event - Report wakeup event using the given source.
 * @ws: Wakeup source to report the event for.
 * @hard: If set, abort suspends in progress and wake up from suspend-to-idle.
 *
 * Counts the event in @ws, activates @ws if it is not active yet and, for
 * @hard events, calls pm_system_wakeup().
 */
static void wakeup_source_report_event(struct wakeup_source *ws, bool hard)
{
        ws->event_count++;
        /* This is racy, but the counter is approximate anyway. */
        if (events_check_enabled)
                ws->wakeup_count++;

        if (!ws->active)
                wakeup_source_activate(ws);

        if (hard)
                pm_system_wakeup();
}

/**
 * __pm_stay_awake - Notify the PM core of a wakeup event.
 * @ws: Wakeup source object associated with the source of the event; NULL is
 *      accepted and ignored.
 *
 * Reports a (non-hard) event for @ws and cancels its timer, so that the
 * source is not deactivated by a previously armed timeout.
 *
 * It is safe to call this function from interrupt context.
 */
void __pm_stay_awake(struct wakeup_source *ws)
{
        unsigned long irqflags;

        if (ws) {
                spin_lock_irqsave(&ws->lock, irqflags);

                wakeup_source_report_event(ws, false);
                del_timer(&ws->timer);
                ws->timer_expires = 0;

                spin_unlock_irqrestore(&ws->lock, irqflags);
        }
}
EXPORT_SYMBOL_GPL(__pm_stay_awake);

/**
 * pm_stay_awake - Notify the PM core that a wakeup event is being processed.
 * @dev: Device the wakeup event is related to; NULL is accepted and ignored.
 *
 * Notify the PM core of a wakeup event (signaled by @dev) by calling
 * __pm_stay_awake() for the @dev's wakeup source object, under the @dev's
 * power.lock.
 *
 * Call this function after detecting a wakeup event if pm_relax() is going
 * to be called directly after processing the event (and possibly passing it to
 * user space for further processing).
 */
void pm_stay_awake(struct device *dev)
{
        unsigned long irqflags;

        if (dev) {
                spin_lock_irqsave(&dev->power.lock, irqflags);
                __pm_stay_awake(dev->power.wakeup);
                spin_unlock_irqrestore(&dev->power.lock, irqflags);
        }
}
EXPORT_SYMBOL_GPL(pm_stay_awake);

#ifdef CONFIG_PM_AUTOSLEEP
/*
 * Add the time elapsed between ws->start_prevent_time and @now to
 * ws->prevent_sleep_time.
 */
static void update_prevent_sleep_time(struct wakeup_source *ws, ktime_t now)
{
        ktime_t delta = ktime_sub(now, ws->start_prevent_time);
        ws->prevent_sleep_time = ktime_add(ws->prevent_sleep_time, delta);
}
#else
/* Without CONFIG_PM_AUTOSLEEP this is a no-op. */
static inline void update_prevent_sleep_time(struct wakeup_source *ws,
                                             ktime_t now) {}
#endif

/**
 * wakeup_source_deactivate - Mark given wakeup source as inactive.
 * @ws: Wakeup source to handle.
 *
 * Update the @ws' statistics and notify the PM core that the wakeup source has
 * become inactive by decrementing the counter of wakeup events being processed
 * and incrementing the counter of registered wakeup events.
 *
 * Every caller in this file invokes it with ws->lock held.
 */
static void wakeup_source_deactivate(struct wakeup_source *ws)
{
        unsigned int cnt, inpr, cec;
        ktime_t duration;
        ktime_t now;

        ws->relax_count++;
        /*
         * __pm_relax() may be called directly or from a timer function.
         * If it is called directly right after the timer function has been
         * started, but before the timer function calls __pm_relax(), it is
         * possible that __pm_stay_awake() will be called in the meantime and
         * will set ws->active.  Then, ws->active may be cleared immediately
         * by the __pm_relax() called from the timer function, but in such a
         * case ws->relax_count will be different from ws->active_count.
         */
        if (ws->relax_count != ws->active_count) {
                ws->relax_count--;
                return;
        }

        ws->active = false;

        /* Account the period that has just ended in the time statistics. */
        now = ktime_get();
        duration = ktime_sub(now, ws->last_time);
        ws->total_time = ktime_add(ws->total_time, duration);
        if (ktime_to_ns(duration) > ktime_to_ns(ws->max_time))
                ws->max_time = duration;

        ws->last_time = now;
        del_timer(&ws->timer);
        ws->timer_expires = 0;

        if (ws->autosleep_enabled)
                update_prevent_sleep_time(ws, now);

        /*
         * Increment the counter of registered wakeup events and decrement the
         * counter of wakeup events in progress simultaneously.
         *
         * MAX_IN_PROGRESS equals (1 << IN_PROGRESS_BITS) - 1, so adding it
         * is +1 on the upper (registered) part and -1 on the lower
         * (in-progress) part of the combined counter.
         */
        cec = atomic_add_return(MAX_IN_PROGRESS, &combined_event_count);
        trace_wakeup_source_deactivate(ws->name, cec);

        /* Wake pm_get_wakeup_count() waiters once nothing is in progress. */
        split_counters(&cnt, &inpr);
        if (!inpr && waitqueue_active(&wakeup_count_wait_queue))
                wake_up(&wakeup_count_wait_queue);
}

/**
 * __pm_relax - Notify the PM core that processing of a wakeup event has ended.
 * @ws: Wakeup source object associated with the source of the event; NULL is
 *      accepted and ignored.
 *
 * Call this function for wakeup events whose processing started with calling
 * __pm_stay_awake().
 *
 * Deactivates @ws under ws->lock if it is currently active; otherwise does
 * nothing.
 *
 * It is safe to call it from interrupt context.
 */
void __pm_relax(struct wakeup_source *ws)
{
        unsigned long flags;

        if (!ws)
                return;

        spin_lock_irqsave(&ws->lock, flags);
        if (ws->active)
                wakeup_source_deactivate(ws);
        spin_unlock_irqrestore(&ws->lock, flags);
}
EXPORT_SYMBOL_GPL(__pm_relax);

/**
 * pm_relax - Notify the PM core that processing of a wakeup event has ended.
 * @dev: Device that signaled the event; NULL is accepted and ignored.
 *
 * Execute __pm_relax() for the @dev's wakeup source object, under the @dev's
 * power.lock.
 */
void pm_relax(struct device *dev)
{
        unsigned long irqflags;

        if (dev) {
                spin_lock_irqsave(&dev->power.lock, irqflags);
                __pm_relax(dev->power.wakeup);
                spin_unlock_irqrestore(&dev->power.lock, irqflags);
        }
}
EXPORT_SYMBOL_GPL(pm_relax);

/**
 * pm_wakeup_timer_fn - Delayed finalization of a wakeup event.
 * @t: timer list
 *
 * Call wakeup_source_deactivate() for the wakeup source whose address is stored
 * in @data if it is currently active and its timer has not been canceled and
 * the expiration time of the timer is not in future.
 */
static void pm_wakeup_timer_fn(struct timer_list *t)
{
        struct wakeup_source *ws = from_timer(ws, t, timer);
        unsigned long flags;

        spin_lock_irqsave(&ws->lock, flags);

        if (ws->active && ws->timer_expires
            && time_after_eq(jiffies, ws->timer_expires)) {
                wakeup_source_deactivate(ws);
                ws->expire_count++;
        }

        spin_unlock_irqrestore(&ws->lock, flags);
}

/**
 * pm_wakeup_ws_event - Notify the PM core of a wakeup event.
 * @ws: Wakeup source object associated with the event source; NULL is accepted
 *      and ignored.
 * @msec: Anticipated event processing time (in milliseconds).
 * @hard: If set, abort suspends in progress and wake up from suspend-to-idle.
 *
 * Notify the PM core of a wakeup event whose source is @ws that will take
 * approximately @msec milliseconds to be processed by the kernel.  If @ws is
 * not active, activate it.  If @msec is zero, deactivate @ws right away;
 * otherwise set up the @ws' timer to execute pm_wakeup_timer_fn() in future,
 * unless a later expiration time is already pending.
 *
 * It is safe to call this function from interrupt context.
 */
void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard)
{
        unsigned long irqflags;

        if (!ws)
                return;

        spin_lock_irqsave(&ws->lock, irqflags);

        wakeup_source_report_event(ws, hard);

        if (msec) {
                unsigned long deadline = jiffies + msecs_to_jiffies(msec);

                /* Zero in timer_expires means "no timer", so avoid it. */
                if (!deadline)
                        deadline = 1;

                /* Only ever push the expiration time further out. */
                if (!ws->timer_expires ||
                    time_after(deadline, ws->timer_expires)) {
                        mod_timer(&ws->timer, deadline);
                        ws->timer_expires = deadline;
                }
        } else {
                wakeup_source_deactivate(ws);
        }

        spin_unlock_irqrestore(&ws->lock, irqflags);
}
EXPORT_SYMBOL_GPL(pm_wakeup_ws_event);

/**
 * pm_wakeup_dev_event - Notify the PM core of a wakeup event.
 * @dev: Device the wakeup event is related to; NULL is accepted and ignored.
 * @msec: Anticipated event processing time (in milliseconds).
 * @hard: If set, abort suspends in progress and wake up from suspend-to-idle.
 *
 * Call pm_wakeup_ws_event() for the @dev's wakeup source object, under the
 * @dev's power.lock.
 */
void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard)
{
        unsigned long flags;

        if (!dev)
                return;

        spin_lock_irqsave(&dev->power.lock, flags);
        pm_wakeup_ws_event(dev->power.wakeup, msec, hard);
        spin_unlock_irqrestore(&dev->power.lock, flags);
}
EXPORT_SYMBOL_GPL(pm_wakeup_dev_event);

/**
 * pm_print_active_wakeup_sources - Log the wakeup sources that are active.
 *
 * Print a debug message for every active wakeup source.  If none is active,
 * print the name of the inactive source with the most recent last_time
 * instead (if there is any source at all).
 */
void pm_print_active_wakeup_sources(void)
{
        struct wakeup_source *last_activity_ws = NULL;
        struct wakeup_source *ws;
        int active = 0;
        int idx;

        idx = srcu_read_lock(&wakeup_srcu);
        list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) {
                if (ws->active) {
                        pm_pr_dbg("active wakeup source: %s\n", ws->name);
                        active = 1;
                        continue;
                }

                /* The fallback candidate is irrelevant once one was active. */
                if (active)
                        continue;

                if (!last_activity_ws ||
                    ktime_to_ns(ws->last_time) >
                    ktime_to_ns(last_activity_ws->last_time))
                        last_activity_ws = ws;
        }

        if (!active && last_activity_ws)
                pm_pr_dbg("last active wakeup source: %s\n",
                        last_activity_ws->name);
        srcu_read_unlock(&wakeup_srcu, idx);
}
EXPORT_SYMBOL_GPL(pm_print_active_wakeup_sources);

/**
 * pm_wakeup_pending - Check if power transition in progress should be aborted.
 *
 * Compare the current number of registered wakeup events with its preserved
 * value from the past and return true if new wakeup events have been registered
 * since the old value was stored.  Also return true if the current number of
 * wakeup events being processed is different from zero.
 */
bool pm_wakeup_pending(void)
{
        unsigned long flags;
        bool ret = false;

        raw_spin_lock_irqsave(&events_lock, flags);
        if (events_check_enabled) {
                unsigned int cnt, inpr;

                split_counters(&cnt, &inpr);
                ret = (cnt != saved_count || inpr > 0);
                events_check_enabled = !ret;
        }
        raw_spin_unlock_irqrestore(&events_lock, flags);

        if (ret) {
                pm_pr_dbg("Wakeup pending, aborting suspend\n");
                pm_print_active_wakeup_sources();
        }

        return ret || atomic_read(&pm_abort_suspend) > 0;
}
EXPORT_SYMBOL_GPL(pm_wakeup_pending);

/*
 * Request that a suspend in progress be aborted: bump pm_abort_suspend, which
 * makes pm_wakeup_pending() return true, and call s2idle_wake().
 */
void pm_system_wakeup(void)
{
        atomic_inc(&pm_abort_suspend);
        s2idle_wake();
}
EXPORT_SYMBOL_GPL(pm_system_wakeup);

/*
 * Undo one pm_system_wakeup() request: decrement pm_abort_suspend unless that
 * would make it negative.
 */
void pm_system_cancel_wakeup(void)
{
        atomic_dec_if_positive(&pm_abort_suspend);
}

/*
 * Clear recorded wakeup IRQs.
 * @irq_number: IRQ to drop from the record, or 0 to reset everything.
 *
 * If @irq_number is nonzero and is the first recorded wakeup IRQ, the second
 * recorded IRQ takes its place; in every other case the first slot is zeroed.
 * The second slot is always zeroed.  With @irq_number == 0 the
 * pm_abort_suspend counter is reset as well.
 */
void pm_wakeup_clear(unsigned int irq_number)
{
        unsigned int first = 0;

        raw_spin_lock_irq(&wakeup_irq_lock);

        if (irq_number && wakeup_irq[0] == irq_number)
                first = wakeup_irq[1];

        wakeup_irq[0] = first;
        wakeup_irq[1] = 0;

        raw_spin_unlock_irq(&wakeup_irq_lock);

        if (!irq_number)
                atomic_set(&pm_abort_suspend, 0);
}

/*
 * Record @irq_number as a wakeup IRQ and trigger a system wakeup.
 *
 * The IRQ is stored in the first free slot of wakeup_irq[].  If both slots are
 * already taken, nothing is recorded and pm_system_wakeup() is not called.
 */
void pm_system_irq_wakeup(unsigned int irq_number)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&wakeup_irq_lock, flags);

        if (wakeup_irq[0] == 0)
                wakeup_irq[0] = irq_number;
        else if (wakeup_irq[1] == 0)
                wakeup_irq[1] = irq_number;
        else
                irq_number = 0; /* both slots in use: drop this one */

        /* NOTE(review): irq_number is unsigned int but is printed with %d. */
        pm_pr_dbg("Triggering wakeup from IRQ %d\n", irq_number);

        raw_spin_unlock_irqrestore(&wakeup_irq_lock, flags);

        if (irq_number)
                pm_system_wakeup();
}

/*
 * Return the first recorded wakeup IRQ (wakeup_irq[0]), read without taking
 * wakeup_irq_lock.
 */
unsigned int pm_wakeup_irq(void)
{
        return wakeup_irq[0];
}

/**
 * pm_get_wakeup_count - Read the number of registered wakeup events.
 * @count: Address to store the value at.
 * @block: Whether or not to block.
 *
 * Store the number of registered wakeup events at the address in @count.  If
 * @block is set, block until the current number of wakeup events being
 * processed is zero or a signal is pending for the calling task.
 *
 * Return 'false' if the current number of wakeup events being processed is
 * nonzero.  Otherwise return 'true'.
 */
bool pm_get_wakeup_count(unsigned int *count, bool block)
{
        unsigned int cnt, inpr;

        if (block) {
                DEFINE_WAIT(wait);

                for (;;) {
                        /*
                         * Queue on the wait queue before sampling the
                         * counters, then re-check them on every iteration.
                         */
                        prepare_to_wait(&wakeup_count_wait_queue, &wait,
                                        TASK_INTERRUPTIBLE);
                        split_counters(&cnt, &inpr);
                        if (inpr == 0 || signal_pending(current))
                                break;
                        pm_print_active_wakeup_sources();
                        schedule();
                }
                finish_wait(&wakeup_count_wait_queue, &wait);
        }

        /* Take a fresh snapshot for the value reported to the caller. */
        split_counters(&cnt, &inpr);
        *count = cnt;
        return !inpr;
}

/**
 * pm_save_wakeup_count - Save the current number of registered wakeup events.
 * @count: Value to compare with the current number of registered wakeup events.
 *
 * If @count is equal to the current number of registered wakeup events and the
 * current number of wakeup events being processed is zero, store @count as the
 * old number of registered wakeup events for pm_wakeup_pending(), enable
 * wakeup events detection and return 'true'.  Otherwise disable wakeup events
 * detection and return 'false'.
 */
bool pm_save_wakeup_count(unsigned int count)
{
        unsigned int cnt, inpr;
        unsigned long flags;

        /* Detection stays disabled unless the check below succeeds. */
        events_check_enabled = false;
        raw_spin_lock_irqsave(&events_lock, flags);
        split_counters(&cnt, &inpr);
        if (cnt == count && inpr == 0) {
                saved_count = count;
                events_check_enabled = true;
        }
        raw_spin_unlock_irqrestore(&events_lock, flags);
        return events_check_enabled;
}

#ifdef CONFIG_PM_AUTOSLEEP
/**
 * pm_wakep_autosleep_enabled - Modify autosleep_enabled for all wakeup sources.
 * @set: Whether to set or to clear the autosleep_enabled flags.
 *
 * For every wakeup source whose flag actually changes while the source is
 * active, either start measuring its prevent-sleep time (when @set) or
 * account the time measured so far (when clearing).
 */
void pm_wakep_autosleep_enabled(bool set)
{
        const ktime_t timestamp = ktime_get();
        struct wakeup_source *ws;
        int idx;

        idx = srcu_read_lock(&wakeup_srcu);
        list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) {
                spin_lock_irq(&ws->lock);
                if (ws->autosleep_enabled != set) {
                        ws->autosleep_enabled = set;
                        if (ws->active && set)
                                ws->start_prevent_time = timestamp;
                        else if (ws->active)
                                update_prevent_sleep_time(ws, timestamp);
                }
                spin_unlock_irq(&ws->lock);
        }
        srcu_read_unlock(&wakeup_srcu, idx);
}
#endif /* CONFIG_PM_AUTOSLEEP */

/**
 * print_wakeup_source_stats - Print wakeup source statistics information.
 * @m: seq_file to print the statistics into.
 * @ws: Wakeup source object to print the statistics for.
 *
 * Emits one line with the name, the active/event/wakeup/expire counters and,
 * in milliseconds, the current active time, total time, maximum time, last
 * change time and prevent-sleep time of @ws.  For an active source the time
 * values include the period that is still in progress.
 *
 * Return: always 0.
 */
static int print_wakeup_source_stats(struct seq_file *m,
                                     struct wakeup_source *ws)
{
        ktime_t active_time = 0;
        ktime_t total_time, max_time, prevent_sleep_time;
        unsigned long active_count;
        unsigned long irqflags;

        spin_lock_irqsave(&ws->lock, irqflags);

        total_time = ws->total_time;
        max_time = ws->max_time;
        prevent_sleep_time = ws->prevent_sleep_time;
        active_count = ws->active_count;

        if (ws->active) {
                const ktime_t now = ktime_get();

                /* Fold the still-running period into the snapshot. */
                active_time = ktime_sub(now, ws->last_time);
                total_time = ktime_add(total_time, active_time);
                if (active_time > max_time)
                        max_time = active_time;

                if (ws->autosleep_enabled) {
                        const ktime_t blocked =
                                ktime_sub(now, ws->start_prevent_time);

                        prevent_sleep_time = ktime_add(prevent_sleep_time,
                                                       blocked);
                }
        }

        seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n",
                   ws->name, active_count, ws->event_count,
                   ws->wakeup_count, ws->expire_count,
                   ktime_to_ms(active_time), ktime_to_ms(total_time),
                   ktime_to_ms(max_time), ktime_to_ms(ws->last_time),
                   ktime_to_ms(prevent_sleep_time));

        spin_unlock_irqrestore(&ws->lock, irqflags);

        return 0;
}

/*
 * seq_file ->start(): print the column header when iteration begins at
 * position 0, take the wakeup_srcu read lock (its index is stored in the
 * seq_file private area and released by the ->stop() callback) and return the
 * wakeup source at position *pos, or NULL if the list is shorter than that.
 */
static void *wakeup_sources_stats_seq_start(struct seq_file *m,
                                        loff_t *pos)
{
        int *srcuidx = m->private;
        loff_t remaining = *pos;
        struct wakeup_source *ws;

        if (remaining == 0)
                seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
                        "expire_count\tactive_since\ttotal_time\tmax_time\t"
                        "last_change\tprevent_suspend_time\n");

        *srcuidx = srcu_read_lock(&wakeup_srcu);
        list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) {
                /* Skip entries until the requested position is reached. */
                if (remaining <= 0)
                        return ws;
                remaining--;
        }

        return NULL;
}

/*
 * seq_file ->next(): advance *pos and return the wakeup source following @v.
 * When @v was the last list entry, print the statistics of deleted_ws
 * (defined elsewhere in this file) as the final line and return NULL to end
 * the iteration.
 */
static void *wakeup_sources_stats_seq_next(struct seq_file *m,
                                        void *v, loff_t *pos)
{
        struct wakeup_source *cursor = v;

        ++(*pos);

        /* The loop body runs at most once: it returns the very next entry. */
        list_for_each_entry_continue_rcu(cursor, &wakeup_sources, entry) {
                return cursor;
        }

        print_wakeup_source_stats(m, &deleted_ws);

        return NULL;
}

/*
 * seq_file ->stop(): drop the wakeup_srcu read lock using the index kept in
 * the seq_file private area.
 */
static void wakeup_sources_stats_seq_stop(struct seq_file *m, void *v)
{
        int *saved_idx = m->private;
        int idx = *saved_idx;

        srcu_read_unlock(&wakeup_srcu, idx);
}

/**
 * wakeup_sources_stats_seq_show - seq_file ->show() for wakeup source stats.
 * @m: seq_file to print the statistics into.
 * @v: Current iteration element, a struct wakeup_source pointer.
 *
 * Return: always 0.
 */
static int wakeup_sources_stats_seq_show(struct seq_file *m, void *v)
{
        /* The result of the print helper is deliberately not propagated. */
        print_wakeup_source_stats(m, (struct wakeup_source *)v);

        return 0;
}

/* seq_file iterator callbacks used to list the statistics of wakeup sources. */
static const struct seq_operations wakeup_sources_stats_seq_ops = {
        .start = wakeup_sources_stats_seq_start,
        .next  = wakeup_sources_stats_seq_next,
        .stop  = wakeup_sources_stats_seq_stop,
        .show  = wakeup_sources_stats_seq_show,
};

/*
 * open() handler: set up the seq_file with a private area large enough for
 * one int, which the iterator callbacks use to keep the SRCU read-lock index.
 */
static int wakeup_sources_stats_open(struct inode *inode, struct file *file)
{
        int ret;

        ret = seq_open_private(file, &wakeup_sources_stats_seq_ops,
                               sizeof(int));
        return ret;
}

/*
 * File operations of the "wakeup_sources" debugfs file.  ->release uses
 * seq_release_private to match the seq_open_private() call made in ->open.
 */
static const struct file_operations wakeup_sources_stats_fops = {
        .owner = THIS_MODULE,
        .open = wakeup_sources_stats_open,
        .read = seq_read,
        .llseek = seq_lseek,
        .release = seq_release_private,
};

/*
 * Create the read-only (0444) "wakeup_sources" file at the debugfs root
 * (parent is NULL).  The return value of debugfs_create_file() is not checked
 * and the initcall always reports success.
 */
static int __init wakeup_sources_debugfs_init(void)
{
        debugfs_create_file("wakeup_sources", 0444, NULL, NULL,
                            &wakeup_sources_stats_fops);
        return 0;
}

/* Registered as a postcore-level initcall. */
postcore_initcall(wakeup_sources_debugfs_init);




























































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2005,2006,2007,2008 IBM Corporation
 *
 * Authors:
 * Reiner Sailer <sailer@watson.ibm.com>
 * Mimi Zohar <zohar@us.ibm.com>
 *
 * File: ima.h
 *        internal Integrity Measurement Architecture (IMA) definitions
 */

#ifndef __LINUX_IMA_H
#define __LINUX_IMA_H

#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/hash.h>
#include <linux/tpm.h>
#include <linux/audit.h>
#include <crypto/hash_info.h>

#include "../integrity.h"

/* Output format selector passed to a template field's field_show() callback. */
enum ima_show_type { IMA_SHOW_BINARY, IMA_SHOW_BINARY_NO_FIELD_LEN,
                     IMA_SHOW_BINARY_OLD_STRING_FMT, IMA_SHOW_ASCII };
/* Named TPM PCR indices; each enumerator's value is the PCR number itself. */
enum tpm_pcrs { TPM_PCR0 = 0, TPM_PCR8 = 8, TPM_PCR10 = 10 };

/* digest size for IMA, fits SHA1 or MD5 */
#define IMA_DIGEST_SIZE                SHA1_DIGEST_SIZE
#define IMA_EVENT_NAME_LEN_MAX        255

#define IMA_HASH_BITS 10
#define IMA_MEASURE_HTABLE_SIZE (1 << IMA_HASH_BITS)

#define IMA_TEMPLATE_FIELD_ID_MAX_LEN        16
#define IMA_TEMPLATE_NUM_FIELDS_MAX        15

#define IMA_TEMPLATE_IMA_NAME "ima"
#define IMA_TEMPLATE_IMA_FMT "d|n"

/*
 * NR_BANKS - number of allocated banks of @chip, or 0 when @chip is NULL.
 *
 * Every use of the argument is parenthesized so that expressions such as
 * "base + 1" or "cond ? a : b" expand correctly.  @chip is evaluated twice,
 * so it must not have side effects.
 */
#define NR_BANKS(chip) (((chip) != NULL) ? (chip)->nr_allocated_banks : 0)

/* current content of the policy */
extern int ima_policy_flag;

/* bitset of digests algorithms allowed in the setxattr hook */
extern atomic_t ima_setxattr_allowed_hash_algorithms;

/* IMA hash algorithm description */
struct ima_algo_desc {
        struct crypto_shash *tfm;        /* crypto shash transform handle */
        enum hash_algo algo;                /* hash algorithm identifier */
};

/* set during initialization */
extern int ima_hash_algo __ro_after_init;
extern int ima_sha1_idx __ro_after_init;
extern int ima_hash_algo_idx __ro_after_init;
extern int ima_extra_slots __ro_after_init;
extern struct ima_algo_desc *ima_algo_array __ro_after_init;

/* NOTE(review): presumably a mask of the IMA_APPRAISE_* bits below - confirm */
extern int ima_appraise;
/* TPM chip used by IMA; NR_BANKS() treats a NULL chip as having 0 banks */
extern struct tpm_chip *ima_tpm_chip;
extern const char boot_aggregate_name[];

/*
 * IMA event related data.
 *
 * Input handed to a template field's field_init() callback (see
 * struct ima_template_field).
 */
struct ima_event_data {
        struct ima_iint_cache *iint;        /* integrity metadata of the inode */
        struct file *file;
        const unsigned char *filename;
        struct evm_ima_xattr_data *xattr_value;
        int xattr_len;                        /* length paired with @xattr_value */
        const struct modsig *modsig;
        const char *violation;
        const void *buf;
        int buf_len;                        /* length paired with @buf */
};

/* IMA template field data definition: a length-prefixed byte buffer */
struct ima_field_data {
        u8 *data;        /* field payload */
        u32 len;        /* length paired with @data */
};

/*
 * IMA template field definition.
 *
 * @field_id:   identifier of the field, at most
 *              IMA_TEMPLATE_FIELD_ID_MAX_LEN bytes of storage.
 * @field_init: fills @field_data from @event_data; returns an int status.
 * @field_show: writes @field_data to the seq_file in the requested
 *              ima_show_type format.
 */
struct ima_template_field {
        const char field_id[IMA_TEMPLATE_FIELD_ID_MAX_LEN];
        int (*field_init)(struct ima_event_data *event_data,
                          struct ima_field_data *field_data);
        void (*field_show)(struct seq_file *m, enum ima_show_type show,
                           struct ima_field_data *field_data);
};

/* IMA template descriptor definition */
struct ima_template_desc {
        struct list_head list;                /* list linkage for descriptors */
        char *name;                        /* template name */
        char *fmt;                        /* template format string */
        int num_fields;                        /* number of entries in @fields */
        const struct ima_template_field **fields;
};

/*
 * One measurement entry.  @template_data is a flexible array member, so the
 * per-field data is allocated together with the entry itself.
 */
struct ima_template_entry {
        int pcr;
        struct tpm_digest *digests;
        struct ima_template_desc *template_desc; /* template descriptor */
        u32 template_data_len;
        struct ima_field_data template_data[];        /* template related data */
};

/* Links one template entry into both the hash table and the ordered list. */
struct ima_queue_entry {
        struct hlist_node hnext;        /* place in hash collision list */
        struct list_head later;                /* place in ima_measurements list */
        struct ima_template_entry *entry;
};
extern struct list_head ima_measurements;        /* list of all measurements */

/*
 * Some details preceding the binary serialized measurement list.
 * Only fixed-width integer members are used.
 */
struct ima_kexec_hdr {
        u16 version;
        u16 _reserved0;
        u32 _reserved1;
        u64 buffer_size;
        u64 count;
};

/*
 * IMA iint action cache flags.
 * Each action bit X is followed by an "Xed"/"XD" bit with the next higher
 * value (e.g. IMA_MEASURE / IMA_MEASURED).
 */
#define IMA_MEASURE                0x00000001
#define IMA_MEASURED                0x00000002
#define IMA_APPRAISE                0x00000004
#define IMA_APPRAISED                0x00000008
/*#define IMA_COLLECT                0x00000010  do not use this flag */
#define IMA_COLLECTED                0x00000020
#define IMA_AUDIT                0x00000040
#define IMA_AUDITED                0x00000080
#define IMA_HASH                0x00000100
#define IMA_HASHED                0x00000200

/* IMA iint policy rule cache flags (all within IMA_NONACTION_FLAGS) */
#define IMA_NONACTION_FLAGS        0xff000000
#define IMA_DIGSIG_REQUIRED        0x01000000
#define IMA_PERMIT_DIRECTIO        0x02000000
#define IMA_NEW_FILE                0x04000000
#define IMA_FAIL_UNVERIFIABLE_SIGS        0x10000000
#define IMA_MODSIG_ALLOWED        0x20000000
#define IMA_CHECK_BLACKLIST        0x40000000
#define IMA_VERITY_REQUIRED        0x80000000

/*
 * Union of all "to do" bits and of all "done" bits respectively.  Both
 * reference the *_SUBMASK macros defined further down; that is fine because
 * macros are only expanded at the point of use.
 */
#define IMA_DO_MASK                (IMA_MEASURE | IMA_APPRAISE | IMA_AUDIT | \
                                 IMA_HASH | IMA_APPRAISE_SUBMASK)
#define IMA_DONE_MASK                (IMA_MEASURED | IMA_APPRAISED | IMA_AUDITED | \
                                 IMA_HASHED | IMA_COLLECTED | \
                                 IMA_APPRAISED_SUBMASK)

/* IMA iint subaction appraise cache flags */
#define IMA_FILE_APPRAISE        0x00001000
#define IMA_FILE_APPRAISED        0x00002000
#define IMA_MMAP_APPRAISE        0x00004000
#define IMA_MMAP_APPRAISED        0x00008000
#define IMA_BPRM_APPRAISE        0x00010000
#define IMA_BPRM_APPRAISED        0x00020000
#define IMA_READ_APPRAISE        0x00040000
#define IMA_READ_APPRAISED        0x00080000
#define IMA_CREDS_APPRAISE        0x00100000
#define IMA_CREDS_APPRAISED        0x00200000
#define IMA_APPRAISE_SUBMASK        (IMA_FILE_APPRAISE | IMA_MMAP_APPRAISE | \
                                 IMA_BPRM_APPRAISE | IMA_READ_APPRAISE | \
                                 IMA_CREDS_APPRAISE)
#define IMA_APPRAISED_SUBMASK        (IMA_FILE_APPRAISED | IMA_MMAP_APPRAISED | \
                                 IMA_BPRM_APPRAISED | IMA_READ_APPRAISED | \
                                 IMA_CREDS_APPRAISED)

/*
 * IMA iint cache atomic_flags.
 * NOTE(review): these are small consecutive integers rather than masks, so
 * they are presumably bit numbers for the bitops helpers - confirm at users.
 */
#define IMA_CHANGE_XATTR        0
#define IMA_UPDATE_XATTR        1
#define IMA_CHANGE_ATTR                2
#define IMA_DIGSIG                3
#define IMA_MUST_MEASURE        4

/* IMA integrity metadata associated with an inode */
struct ima_iint_cache {
        struct mutex mutex;        /* protects: version, flags, digest */
        struct integrity_inode_attributes real_inode;
        unsigned long flags;
        unsigned long measured_pcrs;
        unsigned long atomic_flags;
        /* one cached 4-bit integrity status per appraise sub-action */
        enum integrity_status ima_file_status:4;
        enum integrity_status ima_mmap_status:4;
        enum integrity_status ima_bprm_status:4;
        enum integrity_status ima_read_status:4;
        enum integrity_status ima_creds_status:4;
        struct ima_digest_data *ima_hash;
};

/* lbs_inode is used below as IMA's byte offset into inode->i_security */
extern struct lsm_blob_sizes ima_blob_sizes;

/*
 * Return the iint pointer stored in IMA's slot of the inode security blob,
 * or NULL when the inode has no security blob.  The slot lives at offset
 * ima_blob_sizes.lbs_inode inside inode->i_security.
 */
static inline struct ima_iint_cache *
ima_inode_get_iint(const struct inode *inode)
{
        void *blob = inode->i_security;
        struct ima_iint_cache **slot;

        if (unlikely(blob == NULL))
                return NULL;

        slot = blob + ima_blob_sizes.lbs_inode;
        return *slot;
}

static inline void ima_inode_set_iint(const struct inode *inode,
                                      struct ima_iint_cache *iint)
{
        struct ima_iint_cache **iint_sec;

        if (unlikely(!inode->i_security))
                return;

        iint_sec = inode->i_security + ima_blob_sizes.lbs_inode;
        *iint_sec = iint;
}

/* iint cache helpers; the implementations live outside this header */
struct ima_iint_cache *ima_iint_find(struct inode *inode);
struct ima_iint_cache *ima_inode_get(struct inode *inode);
void ima_inode_free(struct inode *inode);
void __init ima_iintcache_init(void);

extern const int read_idmap[];

/* becomes an empty inline stub when CONFIG_HAVE_IMA_KEXEC is not set */
#ifdef CONFIG_HAVE_IMA_KEXEC
void ima_load_kexec_buffer(void);
#else
static inline void ima_load_kexec_buffer(void) {}
#endif /* CONFIG_HAVE_IMA_KEXEC */

/* only declared when CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS is set; no stub */
#ifdef CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS
void ima_post_key_create_or_update(struct key *keyring, struct key *key,
                                   const void *payload, size_t plen,
                                   unsigned long flags, bool create);
#endif

/*
 * The default binary_runtime_measurements list format is defined as the
 * platform native format.  The canonical format is defined as little-endian.
 */
extern bool ima_canonical_fmt;

/* Internal IMA function definitions */
/* initialization entry points */
int ima_init(void);
int ima_fs_init(void);
int ima_add_template_entry(struct ima_template_entry *entry, int violation,
                           const char *op, struct inode *inode,
                           const unsigned char *filename);
/* hash calculation helpers; results are written through @hash / @entry */
int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash);
int ima_calc_buffer_hash(const void *buf, loff_t len,
                         struct ima_digest_data *hash);
int ima_calc_field_array_hash(struct ima_field_data *field_data,
                              struct ima_template_entry *entry);
int ima_calc_boot_aggregate(struct ima_digest_data *hash);
void ima_add_violation(struct file *file, const unsigned char *filename,
                       struct ima_iint_cache *iint, const char *op,
                       const char *cause);
int ima_init_crypto(void);
/* seq_file output helpers */
void ima_putc(struct seq_file *m, void *data, int datalen);
void ima_print_digest(struct seq_file *m, u8 *digest, u32 size);
/* template descriptor handling */
int template_desc_init_fields(const char *template_fmt,
                              const struct ima_template_field ***fields,
                              int *num_fields);
struct ima_template_desc *ima_template_desc_current(void);
struct ima_template_desc *ima_template_desc_buf(void);
struct ima_template_desc *lookup_template_desc(const char *name);
bool ima_template_has_modsig(const struct ima_template_desc *ima_template);
/* measurement list restore and display */
int ima_restore_measurement_entry(struct ima_template_entry *entry);
int ima_restore_measurement_list(loff_t bufsize, void *buf);
int ima_measurements_show(struct seq_file *m, void *v);
unsigned long ima_get_binary_runtime_size(void);
int ima_init_template(void);
void ima_init_template_list(void);
int __init ima_init_digests(void);
/* notifier callback signature (struct notifier_block, event, data) */
int ima_lsm_policy_change(struct notifier_block *nb, unsigned long event,
                          void *lsm_data);

/*
 * used to protect h_table and sha_table
 */
extern spinlock_t ima_queue_lock;

/* Measurement hash table: counters plus IMA_MEASURE_HTABLE_SIZE buckets. */
struct ima_h_table {
        atomic_long_t len;        /* number of stored measurements in the list */
        atomic_long_t violations;
        struct hlist_head queue[IMA_MEASURE_HTABLE_SIZE];
};
extern struct ima_h_table ima_htable;

/*
 * Map a digest to a bucket index of the measurement hash table.  The first
 * two digest bytes are combined (byte 0 low, byte 1 high) and reduced modulo
 * IMA_MEASURE_HTABLE_SIZE; @digest must therefore be at least two bytes long.
 */
static inline unsigned int ima_hash_key(u8 *digest)
{
        unsigned int key;

        /* there is no point in taking a hash of part of a digest */
        key = digest[0];
        key |= digest[1] << 8;

        return key % IMA_MEASURE_HTABLE_SIZE;
}

#define __ima_hooks(hook)                                \
        hook(NONE, none)                                \
        hook(FILE_CHECK, file)                                \
        hook(MMAP_CHECK, mmap)                                \
        hook(MMAP_CHECK_REQPROT, mmap_reqprot)                \
        hook(BPRM_CHECK, bprm)                                \
        hook(CREDS_CHECK, creds)                        \
        hook(POST_SETATTR, post_setattr)                \
        hook(MODULE_CHECK, module)                        \
        hook(FIRMWARE_CHECK, firmware)                        \
        hook(KEXEC_KERNEL_CHECK, kexec_kernel)                \
        hook(KEXEC_INITRAMFS_CHECK, kexec_initramfs)        \
        hook(POLICY_CHECK, policy)                        \
        hook(KEXEC_CMDLINE, kexec_cmdline)                \
        hook(KEY_CHECK, key)                                \
        hook(CRITICAL_DATA, critical_data)                \
        hook(SETXATTR_CHECK, setxattr_check)                \
        hook(MAX_CHECK, none)

#define __ima_hook_enumify(ENUM, str)        ENUM,
#define __ima_stringify(arg) (#arg)
#define __ima_hook_measuring_stringify(ENUM, str) \
                (__ima_stringify(measuring_ ##str)),

enum ima_hooks {
        __ima_hooks(__ima_hook_enumify)
};

static const char * const ima_hooks_measure_str[] = {
        __ima_hooks(__ima_hook_measuring_stringify)
};

static inline const char *func_measure_str(enum ima_hooks func)
{
        if (func >= MAX_CHECK)
                return ima_hooks_measure_str[NONE];

        return ima_hooks_measure_str[func];
}

extern const char *const func_tokens[];

/* opaque here: only pointers to struct modsig are used in this header */
struct modsig;

#ifdef CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS
/*
 * To track keys that need to be measured.
 */
struct ima_key_entry {
        struct list_head list;
        void *payload;
        size_t payload_len;        /* length paired with @payload */
        char *keyring_name;
};
void ima_init_key_queue(void);
bool ima_should_queue_key(void);
bool ima_queue_key(struct key *keyring, const void *payload,
                   size_t payload_len);
void ima_process_queued_keys(void);
#else
/* stubs without CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS: nothing is ever queued */
static inline void ima_init_key_queue(void) {}
static inline bool ima_should_queue_key(void) { return false; }
static inline bool ima_queue_key(struct key *keyring,
                                 const void *payload,
                                 size_t payload_len) { return false; }
static inline void ima_process_queued_keys(void) {}
#endif /* CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS */

/* LIM API function definitions */
/* @pcr, @template_desc and @allowed_algos are pointer (output) parameters */
int ima_get_action(struct mnt_idmap *idmap, struct inode *inode,
                   const struct cred *cred, u32 secid, int mask,
                   enum ima_hooks func, int *pcr,
                   struct ima_template_desc **template_desc,
                   const char *func_data, unsigned int *allowed_algos);
int ima_must_measure(struct inode *inode, int mask, enum ima_hooks func);
int ima_collect_measurement(struct ima_iint_cache *iint, struct file *file,
                            void *buf, loff_t size, enum hash_algo algo,
                            struct modsig *modsig);
void ima_store_measurement(struct ima_iint_cache *iint, struct file *file,
                           const unsigned char *filename,
                           struct evm_ima_xattr_data *xattr_value,
                           int xattr_len, const struct modsig *modsig, int pcr,
                           struct ima_template_desc *template_desc);
int process_buffer_measurement(struct mnt_idmap *idmap,
                               struct inode *inode, const void *buf, int size,
                               const char *eventname, enum ima_hooks func,
                               int pcr, const char *func_data,
                               bool buf_hash, u8 *digest, size_t digest_len);
void ima_audit_measurement(struct ima_iint_cache *iint,
                           const unsigned char *filename);
/* template entry lifecycle: allocate+init, store, free */
int ima_alloc_init_template(struct ima_event_data *event_data,
                            struct ima_template_entry **entry,
                            struct ima_template_desc *template_desc);
int ima_store_template(struct ima_template_entry *entry, int violation,
                       struct inode *inode,
                       const unsigned char *filename, int pcr);
void ima_free_template_entry(struct ima_template_entry *entry);
const char *ima_d_path(const struct path *path, char **pathbuf, char *filename);

/* IMA policy related functions */
int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode,
                     const struct cred *cred, u32 secid, enum ima_hooks func,
                     int mask, int flags, int *pcr,
                     struct ima_template_desc **template_desc,
                     const char *func_data, unsigned int *allowed_algos);
void ima_init_policy(void);
void ima_update_policy(void);
void ima_update_policy_flags(void);
ssize_t ima_parse_add_rule(char *);
void ima_delete_rules(void);
int ima_check_policy(void);
/* seq_file iterator callbacks (start/next/stop/show signatures) */
void *ima_policy_start(struct seq_file *m, loff_t *pos);
void *ima_policy_next(struct seq_file *m, void *v, loff_t *pos);
void ima_policy_stop(struct seq_file *m, void *v);
int ima_policy_show(struct seq_file *m, void *v);

/* Appraise integrity measurements */
/* single-bit values, so they can be OR-ed together */
#define IMA_APPRAISE_ENFORCE        0x01
#define IMA_APPRAISE_FIX        0x02
#define IMA_APPRAISE_LOG        0x04
#define IMA_APPRAISE_MODULES        0x08
#define IMA_APPRAISE_FIRMWARE        0x10
#define IMA_APPRAISE_POLICY        0x20
#define IMA_APPRAISE_KEXEC        0x40

#ifdef CONFIG_IMA_APPRAISE
/* Appraisal API; inline stubs replace these without CONFIG_IMA_APPRAISE. */
int ima_check_blacklist(struct ima_iint_cache *iint,
                        const struct modsig *modsig, int pcr);
int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint,
                             struct file *file, const unsigned char *filename,
                             struct evm_ima_xattr_data *xattr_value,
                             int xattr_len, const struct modsig *modsig);
int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode,
                      int mask, enum ima_hooks func);
void ima_update_xattr(struct ima_iint_cache *iint, struct file *file);
enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint,
                                           enum ima_hooks func);
enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value,
                                 int xattr_len);
int ima_read_xattr(struct dentry *dentry,
                   struct evm_ima_xattr_data **xattr_value, int xattr_len);
void __init init_ima_appraise_lsm(const struct lsm_id *lsmid);

#else
/* Stub (CONFIG_IMA_APPRAISE not set): always returns 0. */
static inline int ima_check_blacklist(struct ima_iint_cache *iint,
                                      const struct modsig *modsig, int pcr)
{
        return 0;
}

/* Stub (CONFIG_IMA_APPRAISE not set): always reports INTEGRITY_UNKNOWN. */
static inline int ima_appraise_measurement(enum ima_hooks func,
                                           struct ima_iint_cache *iint,
                                           struct file *file,
                                           const unsigned char *filename,
                                           struct evm_ima_xattr_data *xattr_value,
                                           int xattr_len,
                                           const struct modsig *modsig)
{
        return INTEGRITY_UNKNOWN;
}

/* Stub (CONFIG_IMA_APPRAISE not set): always returns 0. */
static inline int ima_must_appraise(struct mnt_idmap *idmap,
                                    struct inode *inode, int mask,
                                    enum ima_hooks func)
{
        return 0;
}

/* Stub (CONFIG_IMA_APPRAISE not set): does nothing. */
static inline void ima_update_xattr(struct ima_iint_cache *iint,
                                    struct file *file)
{
}

/* Stub (CONFIG_IMA_APPRAISE not set): always reports INTEGRITY_UNKNOWN. */
static inline enum integrity_status
ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func)
{
        return INTEGRITY_UNKNOWN;
}

/*
 * Stub (CONFIG_IMA_APPRAISE not set): the xattr is ignored and the configured
 * ima_hash_algo is returned.  @xattr_value is const-qualified so that the
 * signature matches the real prototype and callers holding a const pointer
 * build without discarded-qualifier warnings in both configurations.
 */
static inline enum hash_algo
ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value, int xattr_len)
{
        return ima_hash_algo;
}

/* Stub (CONFIG_IMA_APPRAISE not set): returns 0, *xattr_value is untouched. */
static inline int ima_read_xattr(struct dentry *dentry,
                                 struct evm_ima_xattr_data **xattr_value,
                                 int xattr_len)
{
        return 0;
}

/* Stub (CONFIG_IMA_APPRAISE not set): does nothing. */
static inline void __init init_ima_appraise_lsm(const struct lsm_id *lsmid)
{
}

#endif /* CONFIG_IMA_APPRAISE */

#ifdef CONFIG_IMA_APPRAISE_MODSIG
/* modsig API; inline stubs replace these without CONFIG_IMA_APPRAISE_MODSIG. */
int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len,
                    struct modsig **modsig);
void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size);
/* results are returned through the pointer parameters */
int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo,
                          const u8 **digest, u32 *digest_size);
int ima_get_raw_modsig(const struct modsig *modsig, const void **data,
                       u32 *data_len);
void ima_free_modsig(struct modsig *modsig);
#else
/* Stub (CONFIG_IMA_APPRAISE_MODSIG not set): always fails with -EOPNOTSUPP. */
static inline int ima_read_modsig(enum ima_hooks func, const void *buf,
                                  loff_t buf_len, struct modsig **modsig)
{
        return -EOPNOTSUPP;
}

/* Stub (CONFIG_IMA_APPRAISE_MODSIG not set): does nothing. */
static inline void ima_collect_modsig(struct modsig *modsig, const void *buf,
                                      loff_t size)
{
}

/* Stub (CONFIG_IMA_APPRAISE_MODSIG not set): always fails with -EOPNOTSUPP. */
static inline int ima_get_modsig_digest(const struct modsig *modsig,
                                        enum hash_algo *algo, const u8 **digest,
                                        u32 *digest_size)
{
        return -EOPNOTSUPP;
}

/* Stub (CONFIG_IMA_APPRAISE_MODSIG not set): always fails with -EOPNOTSUPP. */
static inline int ima_get_raw_modsig(const struct modsig *modsig,
                                     const void **data, u32 *data_len)
{
        return -EOPNOTSUPP;
}

/* Stub (CONFIG_IMA_APPRAISE_MODSIG not set): does nothing. */
static inline void ima_free_modsig(struct modsig *modsig)
{
}
#endif /* CONFIG_IMA_APPRAISE_MODSIG */

/* LSM based policy rules require audit */
#ifdef CONFIG_IMA_LSM_RULES

/* With CONFIG_IMA_LSM_RULES the filter helpers alias the audit rule API. */
#define ima_filter_rule_init security_audit_rule_init
#define ima_filter_rule_free security_audit_rule_free
#define ima_filter_rule_match security_audit_rule_match

#else

/* Stub (CONFIG_IMA_LSM_RULES not set): always fails with -EINVAL. */
static inline int ima_filter_rule_init(u32 field, u32 op, char *rulestr,
                                       void **lsmrule, gfp_t gfp)
{
        return -EINVAL;
}

/* Stub (CONFIG_IMA_LSM_RULES not set): does nothing. */
static inline void ima_filter_rule_free(void *lsmrule)
{
}

/* Stub (CONFIG_IMA_LSM_RULES not set): always fails with -EINVAL. */
static inline int ima_filter_rule_match(u32 secid, u32 field, u32 op,
                                        void *lsmrule)
{
        return -EINVAL;
}
#endif /* CONFIG_IMA_LSM_RULES */

/*
 * Mode bits for the policy file: owner-writable always, and additionally
 * owner-readable when CONFIG_IMA_READ_POLICY is set.
 */
#ifdef        CONFIG_IMA_READ_POLICY
#define        POLICY_FILE_FLAGS        (S_IWUSR | S_IRUSR)
#else
#define        POLICY_FILE_FLAGS        S_IWUSR
#endif /* CONFIG_IMA_READ_POLICY */

#endif /* __LINUX_IMA_H */


































    1 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * RCU-based infrastructure for lightweight reader-writer locking
 *
 * Copyright (c) 2015, Red Hat, Inc.
 *
 * Author: Oleg Nesterov <oleg@redhat.com>
 */

#ifndef _LINUX_RCU_SYNC_H_
#define _LINUX_RCU_SYNC_H_

#include <linux/wait.h>
#include <linux/rcupdate.h>

/* Structure to mediate between updaters and fastpath-using readers.  */
struct rcu_sync {
        int                        gp_state;        /* 0 means idle, see rcu_sync_is_idle() */
        int                        gp_count;
        wait_queue_head_t        gp_wait;

        struct rcu_head                cb_head;
};

/**
 * rcu_sync_is_idle() - May readers take their fastpath right now?
 * @rsp: Pointer to the rcu_sync structure used for synchronization
 *
 * Must be called from within some flavor of RCU read-side critical section;
 * a lockdep warning is issued otherwise.
 *
 * Return: true if @rsp->gp_state is zero (GP_IDLE), i.e. readers are
 * permitted to use their fastpaths; false otherwise.
 */
static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
{
        int state;

        RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(),
                         "suspicious rcu_sync_is_idle() usage");

        state = READ_ONCE(rsp->gp_state);
        return state == 0; /* GP_IDLE */
}

/* rcu_sync operations; implemented outside this header */
extern void rcu_sync_init(struct rcu_sync *);
extern void rcu_sync_enter(struct rcu_sync *);
extern void rcu_sync_exit(struct rcu_sync *);
extern void rcu_sync_dtor(struct rcu_sync *);

/*
 * Static initializer for a struct rcu_sync variable called @name: state and
 * count start at 0 and the wait queue head is initialized in place.  @name
 * must be the identifier of the variable being defined, because it is used
 * to form name.gp_wait.
 */
#define __RCU_SYNC_INITIALIZER(name) {                                        \
                .gp_state = 0,                                                \
                .gp_count = 0,                                                \
                .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),        \
        }

/* Define and statically initialize a struct rcu_sync named @name. */
#define        DEFINE_RCU_SYNC(name)        \
        struct rcu_sync name = __RCU_SYNC_INITIALIZER(name)

#endif /* _LINUX_RCU_SYNC_H_ */
























































    1 







    1 







    1 































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _XFRM_HASH_H
#define _XFRM_HASH_H

#include <linux/xfrm.h>
#include <linux/socket.h>
#include <linux/jhash.h>

/* Hash of an IPv4 address: the address itself converted to host byte order. */
static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr)
{
        unsigned int host_order = ntohl(addr->a4);

        return host_order;
}

static inline unsigned int __xfrm6_addr_hash(const xfrm_address_t *addr)
{
        return jhash2((__force u32 *)addr->a6, 4, 0);
}

static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr,
                                                    const xfrm_address_t *saddr)
{
        u32 sum = (__force u32)daddr->a4 + (__force u32)saddr->a4;
        return ntohl((__force __be32)sum);
}

/* Hash of an IPv6 address pair: XOR of the two per-address hashes. */
static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr,
                                                    const xfrm_address_t *saddr)
{
        unsigned int dhash = __xfrm6_addr_hash(daddr);
        unsigned int shash = __xfrm6_addr_hash(saddr);

        return dhash ^ shash;
}

/*
 * Build a 32-bit mask with the top @bits bits set: 0 for @bits == 0, all
 * ones for @bits >= 32.  The two ends are handled explicitly so that the
 * shift count always stays in the range 1..31.
 */
static inline u32 __bits2mask32(__u8 bits)
{
        if (bits == 0)
                return 0;
        if (bits >= 32)
                return 0xffffffff;

        return (u32)0xffffffff << (32 - bits);
}

/*
 * Hash an IPv4 destination/source prefix pair: each host-order address is
 * masked down to its leading @dbits / @sbits bits, and the two results are
 * fed to jhash_2words() with seed 0.
 */
static inline unsigned int __xfrm4_dpref_spref_hash(const xfrm_address_t *daddr,
                                                    const xfrm_address_t *saddr,
                                                    __u8 dbits,
                                                    __u8 sbits)
{
        u32 dst_prefix = ntohl(daddr->a4) & __bits2mask32(dbits);
        u32 src_prefix = ntohl(saddr->a4) & __bits2mask32(sbits);

        return jhash_2words(dst_prefix, src_prefix, 0);
}

/*
 * Hash the leading @prefixlen bits of an IPv6 address.
 *
 * The whole 32-bit words covered by the prefix are hashed with jhash2().
 * If the prefix ends inside a word, the masked part of that word is used
 * as the jhash2() seed; otherwise the seed is 0.
 */
static inline unsigned int __xfrm6_pref_hash(const xfrm_address_t *addr,
                                             __u8 prefixlen)
{
        unsigned int full_words = prefixlen / 32;  /* whole u32s in prefix */
        unsigned int tail_bits = prefixlen % 32;   /* bits in the partial u32 */
        u32 seed = 0;

        if (tail_bits != 0) {
                __be32 tail_mask = htonl(0xffffffff << (32 - tail_bits));

                seed = (__force u32)(addr->a6[full_words] & tail_mask);
        }

        return jhash2((__force u32 *)addr->a6, full_words, seed);
}

/* Hash an IPv6 destination/source prefix pair: XOR of the two prefix hashes. */
static inline unsigned int __xfrm6_dpref_spref_hash(const xfrm_address_t *daddr,
                                                    const xfrm_address_t *saddr,
                                                    __u8 dbits,
                                                    __u8 sbits)
{
        unsigned int dst_hash = __xfrm6_pref_hash(daddr, dbits);
        unsigned int src_hash = __xfrm6_pref_hash(saddr, sbits);

        return dst_hash ^ src_hash;
}

/*
 * Bucket index from (daddr, saddr, reqid, family).
 *
 * Starts from family ^ reqid, mixes in the address-pair hash for AF_INET or
 * AF_INET6 (other families contribute no address bits), folds the upper
 * 16 bits into the lower ones and masks with @hmask.
 */
static inline unsigned int __xfrm_dst_hash(const xfrm_address_t *daddr,
                                           const xfrm_address_t *saddr,
                                           u32 reqid, unsigned short family,
                                           unsigned int hmask)
{
        unsigned int hash = family ^ reqid;

        if (family == AF_INET)
                hash ^= __xfrm4_daddr_saddr_hash(daddr, saddr);
        else if (family == AF_INET6)
                hash ^= __xfrm6_daddr_saddr_hash(daddr, saddr);

        hash ^= hash >> 16;
        return hash & hmask;
}

/*
 * Bucket index from (daddr, saddr, family).
 *
 * Starts from @family, mixes in the address-pair hash for AF_INET or
 * AF_INET6 (other families contribute no address bits), folds the upper
 * 16 bits into the lower ones and masks with @hmask.
 */
static inline unsigned int __xfrm_src_hash(const xfrm_address_t *daddr,
                                           const xfrm_address_t *saddr,
                                           unsigned short family,
                                           unsigned int hmask)
{
        unsigned int hash = family;

        if (family == AF_INET)
                hash ^= __xfrm4_daddr_saddr_hash(daddr, saddr);
        else if (family == AF_INET6)
                hash ^= __xfrm6_daddr_saddr_hash(daddr, saddr);

        hash ^= hash >> 16;
        return hash & hmask;
}

/*
 * Bucket index from (daddr, spi, proto, family).
 *
 * Starts from the raw SPI XOR @proto, mixes in the destination address hash
 * for AF_INET or AF_INET6, then XORs in the value shifted right by 10 and
 * by 20 bits before masking with @hmask.
 */
static inline unsigned int
__xfrm_spi_hash(const xfrm_address_t *daddr, __be32 spi, u8 proto,
                unsigned short family, unsigned int hmask)
{
        unsigned int hash = (__force u32)spi ^ proto;

        if (family == AF_INET)
                hash ^= __xfrm4_addr_hash(daddr);
        else if (family == AF_INET6)
                hash ^= __xfrm6_addr_hash(daddr);

        hash ^= (hash >> 10) ^ (hash >> 20);
        return hash & hmask;
}

/*
 * Bucket index from a sequence number: @seq XORed with itself shifted right
 * by 10 and by 20 bits, masked with @hmask.
 */
static inline unsigned int
__xfrm_seq_hash(u32 seq, unsigned int hmask)
{
        unsigned int folded = seq;

        folded ^= (seq >> 10) ^ (seq >> 20);
        return folded & hmask;
}

/* Bucket index from an index value: @index XOR (@index >> 8), masked with @hmask. */
static inline unsigned int __idx_hash(u32 index, unsigned int hmask)
{
        u32 folded = index ^ (index >> 8);

        return folded & hmask;
}

/*
 * Bucket index for a selector, hashing only the leading @dbits / @sbits bits
 * of its destination / source addresses.
 *
 * For AF_INET and AF_INET6, a selector whose own prefix length is shorter
 * than the requested number of bits is not hashed: hmask + 1 is returned
 * instead of a masked hash.  Any other family hashes to 0.
 */
static inline unsigned int __sel_hash(const struct xfrm_selector *sel,
                                      unsigned short family, unsigned int hmask,
                                      u8 dbits, u8 sbits)
{
        const xfrm_address_t *dst = &sel->daddr;
        const xfrm_address_t *src = &sel->saddr;
        unsigned int hash = 0;

        if (family == AF_INET || family == AF_INET6) {
                if (sel->prefixlen_d < dbits || sel->prefixlen_s < sbits)
                        return hmask + 1;

                if (family == AF_INET)
                        hash = __xfrm4_dpref_spref_hash(dst, src, dbits, sbits);
                else
                        hash = __xfrm6_dpref_spref_hash(dst, src, dbits, sbits);
        }

        /* Fold the upper half into the lower half before masking. */
        return (hash ^ (hash >> 16)) & hmask;
}

/*
 * Bucket index for a destination/source address pair, hashing only the
 * leading @dbits / @sbits bits.  Families other than AF_INET and AF_INET6
 * hash to 0.
 */
static inline unsigned int __addr_hash(const xfrm_address_t *daddr,
                                       const xfrm_address_t *saddr,
                                       unsigned short family,
                                       unsigned int hmask,
                                       u8 dbits, u8 sbits)
{
        unsigned int hash = 0;

        if (family == AF_INET)
                hash = __xfrm4_dpref_spref_hash(daddr, saddr, dbits, sbits);
        else if (family == AF_INET6)
                hash = __xfrm6_dpref_spref_hash(daddr, saddr, dbits, sbits);

        /* Fold the upper half into the lower half before masking. */
        return (hash ^ (hash >> 16)) & hmask;
}

/*
 * Allocation and release of a table of hlist heads; both are defined
 * outside this header.
 * NOTE(review): @sz is presumably the table size in bytes and must match
 * between alloc and free -- confirm against the definitions.
 */
struct hlist_head *xfrm_hash_alloc(unsigned int sz);
void xfrm_hash_free(struct hlist_head *n, unsigned int sz);

#endif /* _XFRM_HASH_H */






















































    1 
    1 


    1 









    2 
    1 









    2 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _PROTO_MEMORY_H
#define _PROTO_MEMORY_H

#include <net/sock.h>
#include <net/hotdata.h>

/*
 * 1 MB per cpu, in page units: 2^20 bytes divided by the page size
 * (2^PAGE_SHIFT bytes).
 */
#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT))

static inline bool sk_has_memory_pressure(const struct sock *sk)
{
        return sk->sk_prot->memory_pressure != NULL;
}

/*
 * True when @prot has a memory_pressure indicator and it currently reads
 * non-zero.  Protocols without an indicator are never under pressure.
 */
static inline bool
proto_memory_pressure(const struct proto *prot)
{
        return prot->memory_pressure &&
               READ_ONCE(*prot->memory_pressure);
}

/* Protocol-wide memory pressure state for the protocol @sk belongs to. */
static inline bool sk_under_global_memory_pressure(const struct sock *sk)
{
        const struct proto *prot = sk->sk_prot;

        return proto_memory_pressure(prot);
}

/*
 * True when @sk is under memory pressure, either through its memory cgroup
 * or through its protocol's memory_pressure indicator.
 *
 * A protocol without an indicator is never reported as under pressure; in
 * that case the memory cgroup is not consulted.
 */
static inline bool sk_under_memory_pressure(const struct sock *sk)
{
        bool memcg_pressure;

        if (!sk->sk_prot->memory_pressure)
                return false;

        memcg_pressure = mem_cgroup_sockets_enabled && sk->sk_memcg &&
                         mem_cgroup_under_socket_pressure(sk->sk_memcg);

        return memcg_pressure || READ_ONCE(*sk->sk_prot->memory_pressure);
}

/*
 * Current value of the protocol's shared memory_allocated counter, clamped
 * so that a negative reading is reported as 0.
 */
static inline long
proto_memory_allocated(const struct proto *prot)
{
        long allocated = atomic_long_read(prot->memory_allocated);

        if (allocated < 0)
                return 0;
        return allocated;
}

/* Clamped memory_allocated counter of the protocol @sk belongs to. */
static inline long
sk_memory_allocated(const struct sock *sk)
{
        const struct proto *prot = sk->sk_prot;

        return proto_memory_allocated(prot);
}

/*
 * Move this CPU's pending per_cpu_fw_alloc amount into the protocol's shared
 * memory_allocated counter.  The per-cpu value is swapped with 0 first; the
 * atomic add is skipped when nothing was pending.
 */
static inline void proto_memory_pcpu_drain(struct proto *proto)
{
        int pending = this_cpu_xchg(*proto->per_cpu_fw_alloc, 0);

        if (!pending)
                return;

        atomic_long_add(pending, proto->memory_allocated);
}

/*
 * Account @val more units to the socket's protocol.
 *
 * The amount is added to this CPU's per_cpu_fw_alloc counter; once that
 * counter reaches net_hotdata.sysctl_mem_pcpu_rsv it is drained into the
 * shared counter.
 */
static inline void
sk_memory_allocated_add(const struct sock *sk, int val)
{
        struct proto *prot = sk->sk_prot;
        int local_total = this_cpu_add_return(*prot->per_cpu_fw_alloc, val);

        if (unlikely(local_total >= READ_ONCE(net_hotdata.sysctl_mem_pcpu_rsv)))
                proto_memory_pcpu_drain(prot);
}

/*
 * Account @val fewer units to the socket's protocol.
 *
 * The amount is subtracted from this CPU's per_cpu_fw_alloc counter; once
 * that counter drops to -net_hotdata.sysctl_mem_pcpu_rsv or below it is
 * drained into the shared counter.
 */
static inline void
sk_memory_allocated_sub(const struct sock *sk, int val)
{
        struct proto *prot = sk->sk_prot;
        int local_total = this_cpu_sub_return(*prot->per_cpu_fw_alloc, val);

        if (unlikely(local_total <= -READ_ONCE(net_hotdata.sysctl_mem_pcpu_rsv)))
                proto_memory_pcpu_drain(prot);
}

#endif /* _PROTO_MEMORY_H */
































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BLK_CGROUP_PRIVATE_H
#define _BLK_CGROUP_PRIVATE_H
/*
 * block cgroup private header
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                       Nauman Rafique <nauman@google.com>
 */

#include <linux/blk-cgroup.h>
#include <linux/cgroup.h>
#include <linux/kthread.h>
#include <linux/blk-mq.h>
#include <linux/llist.h>
#include "blk.h"

/*
 * Forward declarations, so these types can be referenced by pointer
 * regardless of whether their full definitions are compiled in.
 */
struct blkcg_gq;
struct blkg_policy_data;


/*
 * percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter.
 * The batch is set to half of INT_MAX.
 */
#define BLKG_STAT_CPU_BATCH        (INT_MAX / 2)

#ifdef CONFIG_BLK_CGROUP

/*
 * Index into the per-direction arrays of struct blkg_iostat.
 * BLKG_IOSTAT_NR is not a direction; it is the number of entries and is
 * used as the array size.
 */
enum blkg_iostat_type {
        BLKG_IOSTAT_READ,
        BLKG_IOSTAT_WRITE,
        BLKG_IOSTAT_DISCARD,

        BLKG_IOSTAT_NR,
};

/* Byte and I/O counters, one slot per enum blkg_iostat_type value. */
struct blkg_iostat {
        u64                                bytes[BLKG_IOSTAT_NR];
        u64                                ios[BLKG_IOSTAT_NR];
};

/*
 * A pair of blkg_iostat snapshots (@cur and @last) together with the
 * u64_stats_sync used with them and the llist linkage used for queueing.
 * NOTE(review): @last presumably holds the values as of the previous flush
 * so deltas can be computed -- confirm against the flush code.
 */
struct blkg_iostat_set {
        struct u64_stats_sync                sync;
        struct blkcg_gq                       *blkg;
        struct llist_node                lnode;
        int                                lqueued;        /* queued in llist */
        struct blkg_iostat                cur;
        struct blkg_iostat                last;
};

/* association between a blk cgroup and a request queue */
struct blkcg_gq {
        /* Pointer to the associated request_queue */
        struct request_queue                *q;
        struct list_head                q_node;
        struct hlist_node                blkcg_node;
        struct blkcg                        *blkcg;

        /* all non-root blkcg_gq's are guaranteed to have access to parent */
        struct blkcg_gq                        *parent;

        /* reference count */
        struct percpu_ref                refcnt;

        /* is this blkg online? protected by both blkcg and q locks */
        bool                                online;

        /* per-cpu and aggregate I/O statistics sets */
        struct blkg_iostat_set __percpu        *iostat_cpu;
        struct blkg_iostat_set                iostat;

        /* per-policy data, indexed by policy id (see blkg_to_pd()) */
        struct blkg_policy_data                *pd[BLKCG_MAX_POLS];
#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
        spinlock_t                        async_bio_lock;
        struct bio_list                        async_bios;
#endif
        /* the two work items share storage, so only one is usable at a time */
        union {
                struct work_struct        async_bio_work;
                struct work_struct        free_work;
        };

        /*
         * Delay state.  use_delay is incremented/decremented by
         * blkcg_use_delay()/blkcg_unuse_delay() and set to -1 by
         * blkcg_set_delay(); delay_nsec is written by blkcg_set_delay().
         */
        atomic_t                        use_delay;
        atomic64_t                        delay_nsec;
        atomic64_t                        delay_start;
        u64                                last_delay;
        int                                last_use;

        struct rcu_head                        rcu_head;
};

/*
 * Per-cgroup block controller state.  The embedded @css is what
 * css_to_blkcg() uses to get back to the containing blkcg.
 */
struct blkcg {
        struct cgroup_subsys_state        css;
        spinlock_t                        lock;
        refcount_t                        online_pin;

        /*
         * blkgs of this blkcg: looked up by request_queue id in blkg_tree,
         * with blkg_hint checked first (see blkg_lookup()).
         */
        struct radix_tree_root                blkg_tree;
        struct blkcg_gq        __rcu                *blkg_hint;
        struct hlist_head                blkg_list;

        /* per-policy data, indexed by policy id (see blkcg_to_cpd()) */
        struct blkcg_policy_data        *cpd[BLKCG_MAX_POLS];

        struct list_head                all_blkcgs_node;

        /*
         * List of updated percpu blkg_iostat_set's since the last flush.
         */
        struct llist_head __percpu        *lhead;

#ifdef CONFIG_BLK_CGROUP_FC_APPID
        char                            fc_app_id[FC_APPID_LEN];
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
        struct list_head                cgwb_list;
#endif
};

/* Map a css to the blkcg that embeds it; a NULL @css maps to NULL. */
static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
{
        if (!css)
                return NULL;

        return container_of(css, struct blkcg, css);
}

/*
 * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
 * request_queue (q).  This is used by blkcg policies which need to track
 * information per blkcg - q pair.
 *
 * There can be multiple active blkcg policies and each blkg:policy pair is
 * represented by a blkg_policy_data which is allocated and freed by each
 * policy's pd_alloc/free_fn() methods.  A policy can allocate private data
 * area by allocating larger data structure which embeds blkg_policy_data
 * at the beginning.
 */
struct blkg_policy_data {
        /* the blkg and policy id this per-policy data belongs to */
        struct blkcg_gq                        *blkg;
        int                                plid;
        /* NOTE(review): presumably tracks pd_online_fn/pd_offline_fn state -- confirm */
        bool                                online;
};

/*
 * Policies that need to keep per-blkcg data which is independent from any
 * request_queue associated to it should implement cpd_alloc/free_fn()
 * methods.  A policy can allocate private data area by allocating larger
 * data structure which embeds blkcg_policy_data at the beginning.
 * cpd_init() is invoked to let each policy handle per-blkcg data.
 *
 * NOTE(review): struct blkcg_policy in this header has cpd_alloc_fn and
 * cpd_free_fn members but no cpd_init member, so the cpd_init() sentence
 * above may be stale -- confirm.
 */
struct blkcg_policy_data {
        /* the blkcg and policy id this per-policy data belongs to */
        struct blkcg                        *blkcg;
        int                                plid;
};

/*
 * Function types for policy callbacks.  The *_cpd_* types operate on
 * per-blkcg policy data (struct blkcg_policy_data); the *_pd_* types operate
 * on per-blkg policy data (struct blkg_policy_data).  These are function
 * types, not pointer types: struct blkcg_policy declares its members as
 * pointers to them.
 */
typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk,
                struct blkcg *blkcg, gfp_t gfp);
typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd,
                                struct seq_file *s);

/*
 * Description of one blkcg policy: its id, its cgroup files and its
 * callbacks.  @plid is the index used into blkcg_gq::pd[] and blkcg::cpd[]
 * (see blkg_to_pd() and blkcg_to_cpd()).
 */
struct blkcg_policy {
        int                                plid;
        /* cgroup files for the policy */
        struct cftype                        *dfl_cftypes;
        struct cftype                        *legacy_cftypes;

        /* operations */
        /* per-blkcg policy data */
        blkcg_pol_alloc_cpd_fn                *cpd_alloc_fn;
        blkcg_pol_free_cpd_fn                *cpd_free_fn;

        /* per-blkg policy data */
        blkcg_pol_alloc_pd_fn                *pd_alloc_fn;
        blkcg_pol_init_pd_fn                *pd_init_fn;
        blkcg_pol_online_pd_fn                *pd_online_fn;
        blkcg_pol_offline_pd_fn                *pd_offline_fn;
        blkcg_pol_free_pd_fn                *pd_free_fn;
        blkcg_pol_reset_pd_stats_fn        *pd_reset_stats_fn;
        blkcg_pol_stat_pd_fn                *pd_stat_fn;
};

/* Globals defined outside this header; blkcg_root is the root blkcg. */
extern struct blkcg blkcg_root;
extern bool blkcg_debug_stats;

/* Per-queue and per-disk setup/teardown; defined outside this header. */
void blkg_init_queue(struct request_queue *q);
int blkcg_init_disk(struct gendisk *disk);
void blkcg_exit_disk(struct gendisk *disk);

/* Blkio controller policy registration */
int blkcg_policy_register(struct blkcg_policy *pol);
void blkcg_policy_unregister(struct blkcg_policy *pol);
int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol);
void blkcg_deactivate_policy(struct gendisk *disk,
                             const struct blkcg_policy *pol);

/* Printing helpers that write to a seq_file; defined outside this header. */
const char *blkg_dev_name(struct blkcg_gq *blkg);
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                       u64 (*prfill)(struct seq_file *,
                                     struct blkg_policy_data *, int),
                       const struct blkcg_policy *pol, int data,
                       bool show_total);
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);

/*
 * Context shared by the blkg_conf_*() helpers declared below.
 * NOTE(review): @input is presumably the raw configuration string and @body
 * the part left after the device has been parsed -- confirm against the
 * blkg_conf_*() definitions.
 */
struct blkg_conf_ctx {
        char                                *input;
        char                                *body;
        struct block_device                *bdev;
        struct blkcg_gq                        *blkg;
};

/* Helpers operating on a blkg_conf_ctx; defined outside this header. */
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input);
int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx);
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                   struct blkg_conf_ctx *ctx);
void blkg_conf_exit(struct blkg_conf_ctx *ctx);

/**
 * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
 * @bio: bio to check
 *
 * In order to avoid priority inversions we sometimes need to issue a bio as if
 * it were attached to the root blkg, and then backcharge to the actual owning
 * blkg.  The idea is we do bio_blkcg_css() to look up the actual context for
 * the bio and attach the appropriate blkg to the bio.  Then we call this helper
 * and if it is true run with the root blkg for that queue and then do any
 * backcharging to the originating cgroup once the io is complete.
 *
 * Return: true if @bio has REQ_META or REQ_SWAP set and therefore needs to be
 * submitted with the root blkg context.
 */
static inline bool bio_issue_as_root_blkg(struct bio *bio)
{
        if ((bio->bi_opf & (REQ_META | REQ_SWAP)) != 0)
                return true;

        return false;
}

/**
 * blkg_lookup - lookup blkg for the specified blkcg - q pair
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  The root blkcg maps directly to
 * @q's root blkg.  Otherwise the cached @blkcg->blkg_hint is tried first,
 * falling back to a radix tree lookup keyed by @q->id; a tree entry that
 * belongs to a different queue is ignored.
 *
 * Must be called in a RCU critical section.
 *
 * Return: the matching blkg, or %NULL if there is none.
 */
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
                                           struct request_queue *q)
{
        struct blkcg_gq *hint, *found;

        if (blkcg == &blkcg_root)
                return q->root_blkg;

        hint = rcu_dereference_check(blkcg->blkg_hint,
                        lockdep_is_held(&q->queue_lock));
        if (hint && hint->q == q)
                return hint;

        found = radix_tree_lookup(&blkcg->blkg_tree, q->id);
        if (found && found->q == q)
                return found;

        return NULL;
}

/**
 * blkg_to_pd - get policy private data
 * @blkg: blkg of interest
 * @pol: policy of interest
 *
 * Return: pointer to private data associated with the @blkg-@pol pair, or
 * %NULL if @blkg is %NULL.
 */
static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
                                                  struct blkcg_policy *pol)
{
        if (!blkg)
                return NULL;

        return blkg->pd[pol->plid];
}

/*
 * Per-blkcg private data for the @blkcg-@pol pair, or NULL if @blkcg is
 * NULL.
 */
static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
                                                     struct blkcg_policy *pol)
{
        if (!blkcg)
                return NULL;

        return blkcg->cpd[pol->plid];
}

/**
 * pd_to_blkg - get blkg associated with policy private data
 * @pd: policy private data of interest
 *
 * @pd is policy private data.  Determine the blkg it's associated with.
 *
 * Return: the owning blkg, or %NULL if @pd is %NULL.
 */
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
{
        if (!pd)
                return NULL;

        return pd->blkg;
}

/* The blkcg that per-blkcg policy data @cpd belongs to, or NULL if @cpd is NULL. */
static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
{
        if (!cpd)
                return NULL;

        return cpd->blkcg;
}

/**
 * blkg_path - format cgroup path of blkg
 * @blkg: blkg of interest
 * @buf: target buffer
 * @buflen: target buffer length
 *
 * Format the path of the cgroup of @blkg into @buf.
 */
static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
{
        return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
}

/**
 * blkg_get - get a blkg reference
 * @blkg: blkg to get
 *
 * Takes one more reference on @blkg's percpu refcount.  The caller should
 * be holding an existing reference.
 */
static inline void blkg_get(struct blkcg_gq *blkg)
{
        struct percpu_ref *ref = &blkg->refcnt;

        percpu_ref_get(ref);
}

/**
 * blkg_tryget - try and get a blkg reference
 * @blkg: blkg to get
 *
 * This is for use when doing an RCU lookup of the blkg.  We may be in the midst
 * of freeing this blkg, so we can only use it if the refcnt is not zero.
 *
 * Return: true if a reference was taken; false if @blkg is %NULL or
 * percpu_ref_tryget() failed.
 */
static inline bool blkg_tryget(struct blkcg_gq *blkg)
{
        if (!blkg)
                return false;

        return percpu_ref_tryget(&blkg->refcnt);
}

/**
 * blkg_put - put a blkg reference
 * @blkg: blkg to put
 *
 * Drops one reference on @blkg's percpu refcount.
 */
static inline void blkg_put(struct blkcg_gq *blkg)
{
        struct percpu_ref *ref = &blkg->refcnt;

        percpu_ref_put(ref);
}

/**
 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
 * @d_blkg: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @p_blkg: target blkg to walk descendants of
 *
 * Walk @d_blkg through the descendants of @p_blkg.  Must be used with RCU
 * read locked.  If called under either blkcg or queue lock, the iteration
 * is guaranteed to include all and only online blkgs.  The caller may
 * update @pos_css by calling css_rightmost_descendant() to skip subtree.
 * @p_blkg is included in the iteration and the first node to be visited.
 *
 * The expansion ends in an "if", so the statement following the macro is
 * the loop body and is skipped for any css that has no blkg on @p_blkg's
 * queue.
 */
#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg)                \
        css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css)        \
                if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css),        \
                                            (p_blkg)->q)))

/**
 * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
 * @d_blkg: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @p_blkg: target blkg to walk descendants of
 *
 * Similar to blkg_for_each_descendant_pre() but performs post-order
 * traversal instead.  Synchronization rules are the same.  @p_blkg is
 * included in the iteration and the last node to be visited.
 *
 * The expansion ends in an "if", so the statement following the macro is
 * the loop body and is skipped for any css that has no blkg on @p_blkg's
 * queue.
 */
#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg)                \
        css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css)        \
                if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css),        \
                                            (p_blkg)->q)))

/*
 * Initialise @bio's bi_issue field via bio_issue_init(), passing the bio's
 * size in sectors.
 */
static inline void blkcg_bio_issue_init(struct bio *bio)
{
        bio_issue_init(&bio->bi_issue, bio_sectors(bio));
}

/*
 * Take one use_delay count on @blkg.
 *
 * A negative use_delay triggers a one-time warning and nothing is changed.
 * When the count goes from 0 to 1, the owning cgroup's congestion_count is
 * incremented as well.
 */
static inline void blkcg_use_delay(struct blkcg_gq *blkg)
{
        if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
                return;

        if (atomic_add_return(1, &blkg->use_delay) != 1)
                return;

        /* First user: mark the cgroup as congested. */
        atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
}

/*
 * Drop one use_delay count on @blkg.
 *
 * Returns 0 without changing anything if use_delay is negative (after a
 * one-time warning) or already zero.  Otherwise the count is decremented by
 * one and 1 is returned; if the value before the decrement was 1, the owning
 * cgroup's congestion_count is decremented as well.
 */
static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
{
        int old = atomic_read(&blkg->use_delay);

        if (WARN_ON_ONCE(old < 0))
                return 0;
        if (old == 0)
                return 0;

        /*
         * We do this song and dance because we can race with somebody else
         * adding or removing delay.  If we just did an atomic_dec we'd end up
         * negative and we'd already be in trouble.  We need to subtract 1 and
         * then check to see if we were the last delay so we can drop the
         * congestion count on the cgroup.
         */
        /* NOTE(review): relies on atomic_try_cmpxchg() refreshing @old on failure -- confirm */
        while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1))
                ;

        /* Someone else dropped the last count while we were retrying. */
        if (old == 0)
                return 0;
        if (old == 1)
                atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
        return 1;
}

/**
 * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount
 * @blkg: target blkg
 * @delay: delay duration in nsecs
 *
 * When enabled with this function, the delay is not decayed and must be
 * explicitly cleared with blkcg_clear_delay(). Must not be mixed with
 * blkcg_[un]use_delay() and blkcg_add_delay() usages.
 *
 * use_delay is switched from 0 to -1 with a single compare-and-exchange
 * attempt; only the caller that wins it bumps the cgroup's congestion
 * count.  @delay is stored in delay_nsec regardless of the outcome.
 */
static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay)
{
        int expected = atomic_read(&blkg->use_delay);

        /* We only want 1 person setting the congestion count for this blkg. */
        if (expected == 0) {
                if (atomic_try_cmpxchg(&blkg->use_delay, &expected, -1))
                        atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
        }

        atomic64_set(&blkg->delay_nsec, delay);
}

/**
 * blkcg_clear_delay - Disable allocator delay mechanism
 * @blkg: target blkg
 *
 * Disable use_delay mechanism. See blkcg_set_delay().
 *
 * If use_delay is non-zero, a single compare-and-exchange attempt resets it
 * to 0; only the caller that wins it drops the cgroup's congestion count.
 */
static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
{
        int expected = atomic_read(&blkg->use_delay);

        if (expected == 0)
                return;

        /* We only want 1 person clearing the congestion count for this blkg. */
        if (atomic_try_cmpxchg(&blkg->use_delay, &expected, 0))
                atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
}

/**
 * blk_cgroup_mergeable - Determine whether to allow or disallow merges
 * @rq: request to merge into
 * @bio: bio to merge
 *
 * @bio and @rq should belong to the same cgroup and their issue_as_root should
 * match. The latter is necessary as we don't want to throttle e.g. a metadata
 * update because it happens to be next to a regular IO.
 *
 * Return: true if @bio may be merged into @rq.
 */
static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio)
{
        if (rq->bio->bi_blkg != bio->bi_blkg)
                return false;

        return bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio);
}

/*
 * Defined outside this header.  blk_cgroup_bio_start() has an empty stub
 * in the !CONFIG_BLK_CGROUP branch of this header; blkcg_add_delay() does
 * not.
 */
void blk_cgroup_bio_start(struct bio *bio);
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
#else        /* CONFIG_BLK_CGROUP */

/*
 * Empty placeholder types for builds without CONFIG_BLK_CGROUP, so code that
 * only names these structs still compiles.
 */
struct blkg_policy_data {
};

struct blkcg_policy_data {
};

struct blkcg_policy {
};

struct blkcg {
};

/*
 * No-op stand-ins used when CONFIG_BLK_CGROUP is disabled: lookups return
 * NULL, setup/registration helpers report success (0), and merging is always
 * allowed.
 *
 * NOTE(review): two stubs do not match their CONFIG_BLK_CGROUP counterparts
 * in this header -- blkg_lookup() takes a "void *key" instead of a
 * "struct request_queue *q", and blkg_path() takes only @blkg and returns
 * "char *" instead of taking (@blkg, @buf, @buflen) and returning int.
 * Confirm whether any !CONFIG_BLK_CGROUP caller depends on these before
 * aligning them.
 */
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
static inline void blkg_init_queue(struct request_queue *q) { }
static inline int blkcg_init_disk(struct gendisk *disk) { return 0; }
static inline void blkcg_exit_disk(struct gendisk *disk) { }
static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
static inline int blkcg_activate_policy(struct gendisk *disk,
                                        const struct blkcg_policy *pol) { return 0; }
static inline void blkcg_deactivate_policy(struct gendisk *disk,
                                           const struct blkcg_policy *pol) { }

static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
                                                  struct blkcg_policy *pol) { return NULL; }
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
static inline void blkg_get(struct blkcg_gq *blkg) { }
static inline void blkg_put(struct blkcg_gq *blkg) { }
static inline void blkcg_bio_issue_init(struct bio *bio) { }
static inline void blk_cgroup_bio_start(struct bio *bio) { }
static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }

/*
 * Single-iteration loop: @rl is set to &(q)->root_rl for one pass and then
 * to NULL, which ends the loop.
 * NOTE(review): nothing in this header declares a root_rl member or uses
 * this macro, and there is no CONFIG_BLK_CGROUP counterpart here; it looks
 * like a leftover -- confirm whether it still has users.
 */
#define blk_queue_for_each_rl(rl, q)        \
        for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)

#endif        /* CONFIG_BLK_CGROUP */

#endif /* _BLK_CGROUP_PRIVATE_H */


























































































































































































































































































































































































































































































































































































































































































































    6 























































































































































































































































































































































































































































































































































































    1 






































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Linux Socket Filter Data Structures
 */
#ifndef __LINUX_FILTER_H__
#define __LINUX_FILTER_H__

#include <linux/atomic.h>
#include <linux/bpf.h>
#include <linux/refcount.h>
#include <linux/compat.h>
#include <linux/skbuff.h>
#include <linux/linkage.h>
#include <linux/printk.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/capability.h>
#include <linux/set_memory.h>
#include <linux/kallsyms.h>
#include <linux/if_vlan.h>
#include <linux/vmalloc.h>
#include <linux/sockptr.h>
#include <crypto/sha1.h>
#include <linux/u64_stats_sync.h>

#include <net/sch_generic.h>

#include <asm/byteorder.h>
#include <uapi/linux/filter.h>

struct sk_buff;
struct sock;
struct seccomp_data;
struct bpf_prog_aux;
struct xdp_rxq_info;
struct xdp_buff;
struct sock_reuseport;
struct ctl_table;
struct ctl_table_header;

/* ArgX, context and stack frame pointer register positions. Note,
 * Arg1, Arg2, Arg3, etc are used as argument mappings of function
 * calls in BPF_CALL instruction.
 *
 * Arg1..Arg5 alias BPF_REG_1..BPF_REG_5, the context register aliases
 * BPF_REG_6 and the frame pointer aliases BPF_REG_10.
 */
#define BPF_REG_ARG1        BPF_REG_1
#define BPF_REG_ARG2        BPF_REG_2
#define BPF_REG_ARG3        BPF_REG_3
#define BPF_REG_ARG4        BPF_REG_4
#define BPF_REG_ARG5        BPF_REG_5
#define BPF_REG_CTX        BPF_REG_6
#define BPF_REG_FP        BPF_REG_10

/* Additional register mappings for converted user programs.
 * Note that BPF_REG_TMP aliases BPF_REG_2, the same register as
 * BPF_REG_ARG2.
 */
#define BPF_REG_A        BPF_REG_0
#define BPF_REG_X        BPF_REG_7
#define BPF_REG_TMP        BPF_REG_2        /* scratch reg */
#define BPF_REG_D        BPF_REG_8        /* data, callee-saved */
#define BPF_REG_H        BPF_REG_9        /* hlen, callee-saved */

/* Kernel hidden auxiliary/helper register.
 * BPF_REG_AX takes the first index past the regular registers
 * (MAX_BPF_REG), so the extended register count is MAX_BPF_REG + 1;
 * MAX_BPF_JIT_REG is defined as the same value.
 */
#define BPF_REG_AX                MAX_BPF_REG
#define MAX_BPF_EXT_REG                (MAX_BPF_REG + 1)
#define MAX_BPF_JIT_REG                MAX_BPF_EXT_REG

/* unused opcode to mark special call to bpf_tail_call() helper */
#define BPF_TAIL_CALL        0xf0

/* unused opcode to mark special load instruction. Same as BPF_ABS */
#define BPF_PROBE_MEM        0x20

/* unused opcode to mark special ldsx instruction. Same as BPF_IND */
#define BPF_PROBE_MEMSX        0x40

/* unused opcode to mark special load instruction. Same as BPF_MSH */
#define BPF_PROBE_MEM32        0xa0

/* unused opcode to mark special atomic instruction */
#define BPF_PROBE_ATOMIC 0xe0

/* unused opcode to mark call to interpreter with arguments.
 * NOTE(review): this is the same numeric value (0xe0) as BPF_PROBE_ATOMIC;
 * presumably the two are told apart by instruction class - confirm.
 */
#define BPF_CALL_ARGS        0xe0

/* unused opcode to mark speculation barrier for mitigating
 * Speculative Store Bypass
 */
#define BPF_NOSPEC        0xc0

/* As per nm, we expose JITed images as text (code) section for
 * kallsyms. That way, tools like perf can find it to match
 * addresses. The symbol type character used for this is 't'.
 */
#define BPF_SYM_ELF_TYPE        't'

/* BPF program can access up to 512 bytes of stack space. */
#define MAX_BPF_STACK        512

/* Helper macros for filter block array initializers. */

/* Register-source ALU instructions, bpf_add|sub|...: dst_reg += src_reg.
 * The 64-bit forms use class BPF_ALU64, the 32-bit forms class BPF_ALU.
 * The *_OFF variants let the caller fill the insn 'off' field; the short
 * forms pass 0 for it. imm is always 0.
 */

#define BPF_ALU64_REG_OFF(OP, DST, SRC, OFF)                        \
        BPF_RAW_INSN(BPF_ALU64 | BPF_OP(OP) | BPF_X, DST, SRC, OFF, 0)

#define BPF_ALU64_REG(OP, DST, SRC)                                \
        BPF_ALU64_REG_OFF(OP, DST, SRC, 0)

#define BPF_ALU32_REG_OFF(OP, DST, SRC, OFF)                        \
        BPF_RAW_INSN(BPF_ALU | BPF_OP(OP) | BPF_X, DST, SRC, OFF, 0)

#define BPF_ALU32_REG(OP, DST, SRC)                                \
        BPF_ALU32_REG_OFF(OP, DST, SRC, 0)

/* Immediate-source ALU instructions, bpf_add|sub|...: dst_reg += imm32.
 * src_reg is always encoded as 0. The *_OFF variants let the caller fill
 * the insn 'off' field; the short forms pass 0 for it.
 */

#define BPF_ALU64_IMM_OFF(OP, DST, IMM, OFF)                        \
        BPF_RAW_INSN(BPF_ALU64 | BPF_OP(OP) | BPF_K, DST, 0, OFF, IMM)
#define BPF_ALU64_IMM(OP, DST, IMM)                                \
        BPF_ALU64_IMM_OFF(OP, DST, IMM, 0)

#define BPF_ALU32_IMM_OFF(OP, DST, IMM, OFF)                        \
        BPF_RAW_INSN(BPF_ALU | BPF_OP(OP) | BPF_K, DST, 0, OFF, IMM)
#define BPF_ALU32_IMM(OP, DST, IMM)                                \
        BPF_ALU32_IMM_OFF(OP, DST, IMM, 0)

/* Endianness conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu().
 * TYPE is folded into the opcode through BPF_SRC(TYPE); LEN is carried
 * in the imm field. src_reg and off are 0.
 */

#define BPF_ENDIAN(TYPE, DST, LEN)                                \
        BPF_RAW_INSN(BPF_ALU | BPF_END | BPF_SRC(TYPE), DST, 0, 0, LEN)

/* Byte Swap, bswap16/32/64.
 * Encoded as a BPF_END instruction in the BPF_ALU64 class with
 * BPF_SRC(BPF_TO_LE) as source bits; LEN is carried in the imm field.
 */

#define BPF_BSWAP(DST, LEN)                                        \
        BPF_RAW_INSN(BPF_ALU64 | BPF_END | BPF_SRC(BPF_TO_LE), DST, 0, 0, LEN)

/* Short form of mov, dst_reg = src_reg.
 * BPF_MOV64_REG uses class BPF_ALU64, BPF_MOV32_REG class BPF_ALU;
 * off and imm are both 0.
 */

#define BPF_MOV64_REG(DST, SRC)                                        \
        BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X, DST, SRC, 0, 0)

#define BPF_MOV32_REG(DST, SRC)                                        \
        BPF_RAW_INSN(BPF_ALU | BPF_MOV | BPF_X, DST, SRC, 0, 0)

/* Special (internal-only) form of mov, used to resolve per-CPU addrs:
 * dst_reg = src_reg + <percpu_base_off>
 * The instruction is an ordinary 64-bit register mov whose 'off' field
 * holds the special marker value BPF_ADDR_PERCPU.
 */
#define BPF_ADDR_PERCPU        (-1)

#define BPF_MOV64_PERCPU_REG(DST, SRC)                                \
        BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X, DST, SRC,        \
                     BPF_ADDR_PERCPU, 0)

/* Return true when @insn is a 64-bit register mov (BPF_ALU64 | BPF_MOV |
 * BPF_X) whose 'off' field carries the BPF_ADDR_PERCPU marker.
 */
static inline bool insn_is_mov_percpu_addr(const struct bpf_insn *insn)
{
        if (insn->code != (BPF_ALU64 | BPF_MOV | BPF_X))
                return false;

        return insn->off == BPF_ADDR_PERCPU;
}

/* Short form of mov, dst_reg = imm32.
 * BPF_MOV64_IMM uses class BPF_ALU64, BPF_MOV32_IMM class BPF_ALU;
 * src_reg and off are both 0.
 */

#define BPF_MOV64_IMM(DST, IMM)                                        \
        BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_K, DST, 0, 0, IMM)

#define BPF_MOV32_IMM(DST, IMM)                                        \
        BPF_RAW_INSN(BPF_ALU | BPF_MOV | BPF_K, DST, 0, 0, IMM)

/* Short form of movsx, dst_reg = (s8,s16,s32)src_reg.
 * Same opcode as the plain register mov; OFF is stored in the insn 'off'
 * field (presumably the source width in bits - confirm against callers).
 */

#define BPF_MOVSX64_REG(DST, SRC, OFF)                                \
        BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X, DST, SRC, OFF, 0)

#define BPF_MOVSX32_REG(DST, SRC, OFF)                                \
        BPF_RAW_INSN(BPF_ALU | BPF_MOV | BPF_X, DST, SRC, OFF, 0)

/* Special form of mov32, used for doing explicit zero extension on dst.
 * It is a 32-bit register mov of DST onto itself, distinguished from an
 * ordinary mov by imm being set to 1.
 */
#define BPF_ZEXT_REG(DST)                                        \
        BPF_RAW_INSN(BPF_ALU | BPF_MOV | BPF_X, DST, DST, 0, 1)

/* Return true when @insn is a 32-bit register mov (BPF_ALU | BPF_MOV |
 * BPF_X) with imm == 1, i.e. the encoding produced by BPF_ZEXT_REG().
 * The register operands are not inspected.
 */
static inline bool insn_is_zext(const struct bpf_insn *insn)
{
        if (insn->code != (BPF_ALU | BPF_MOV | BPF_X))
                return false;

        return insn->imm == 1;
}

/* addr_space_cast from as(0) to as(1) is for converting bpf arena pointers
 * to pointers in user vma.
 *
 * Return true when @insn is a 64-bit register mov whose 'off' field is
 * BPF_ADDR_SPACE_CAST and whose imm equals 1U << 16.
 */
static inline bool insn_is_cast_user(const struct bpf_insn *insn)
{
        if (insn->code != (BPF_ALU64 | BPF_MOV | BPF_X))
                return false;
        if (insn->off != BPF_ADDR_SPACE_CAST)
                return false;

        return insn->imm == 1U << 16;
}

/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn.
 * The expansion is two comma-separated struct bpf_insn initializers, so it
 * occupies two consecutive slots in an instruction array.
 */
#define BPF_LD_IMM64(DST, IMM)                                        \
        BPF_LD_IMM64_RAW(DST, 0, IMM)

/* First slot: the BPF_LD | BPF_DW | BPF_IMM opcode with the low 32 bits of
 * IMM. Second slot: code 0 (zero is reserved opcode) with the high 32 bits.
 */
#define BPF_LD_IMM64_RAW(DST, SRC, IMM)                                \
        BPF_RAW_INSN(BPF_LD | BPF_DW | BPF_IMM, DST, SRC, 0,        \
                     (__u32) (IMM)),                                \
        BPF_RAW_INSN(0, 0, 0, 0, ((__u64) (IMM)) >> 32)

/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
#define BPF_LD_MAP_FD(DST, MAP_FD)                                \
        BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)

/* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32.
 * TYPE is folded into the opcode through BPF_SRC(TYPE); both SRC and IMM
 * are always stored, whichever source kind is selected.
 */

#define BPF_MOV64_RAW(TYPE, DST, SRC, IMM)                        \
        BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_SRC(TYPE), DST, SRC, 0, IMM)

#define BPF_MOV32_RAW(TYPE, DST, SRC, IMM)                        \
        BPF_RAW_INSN(BPF_ALU | BPF_MOV | BPF_SRC(TYPE), DST, SRC, 0, IMM)

/* Direct packet access, R0 = *(uint *) (skb->data + imm32).
 * SIZE selects the access width through BPF_SIZE(SIZE).
 */

#define BPF_LD_ABS(SIZE, IMM)                                        \
        BPF_RAW_INSN(BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, 0, 0, 0, IMM)

/* Indirect packet access, R0 = *(uint *) (skb->data + src_reg + imm32) */

#define BPF_LD_IND(SIZE, SRC, IMM)                                \
        BPF_RAW_INSN(BPF_LD | BPF_SIZE(SIZE) | BPF_IND, 0, SRC, 0, IMM)

/* Memory load, dst_reg = *(uint *) (src_reg + off16).
 * SIZE selects the access width through BPF_SIZE(SIZE); imm is 0.
 */

#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)                        \
        BPF_RAW_INSN(BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, DST, SRC, OFF, 0)

/* Memory load, dst_reg = *(signed size *) (src_reg + off16) */

#define BPF_LDX_MEMSX(SIZE, DST, SRC, OFF)                        \
        BPF_RAW_INSN(BPF_LDX | BPF_SIZE(SIZE) | BPF_MEMSX, DST, SRC, OFF, 0)

/* Memory store, *(uint *) (dst_reg + off16) = src_reg.
 * SIZE selects the access width through BPF_SIZE(SIZE); imm is 0.
 */

#define BPF_STX_MEM(SIZE, DST, SRC, OFF)                        \
        BPF_RAW_INSN(BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, DST, SRC, OFF, 0)


/*
 * Atomic operations:
 *
 *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
 *   BPF_AND                  *(uint *) (dst_reg + off16) &= src_reg
 *   BPF_OR                   *(uint *) (dst_reg + off16) |= src_reg
 *   BPF_XOR                  *(uint *) (dst_reg + off16) ^= src_reg
 *   BPF_ADD | BPF_FETCH      src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
 *   BPF_AND | BPF_FETCH      src_reg = atomic_fetch_and(dst_reg + off16, src_reg);
 *   BPF_OR | BPF_FETCH       src_reg = atomic_fetch_or(dst_reg + off16, src_reg);
 *   BPF_XOR | BPF_FETCH      src_reg = atomic_fetch_xor(dst_reg + off16, src_reg);
 *   BPF_XCHG                 src_reg = atomic_xchg(dst_reg + off16, src_reg)
 *   BPF_CMPXCHG              r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg)
 *
 * The operation selector OP from the table above is carried in the imm
 * field of a BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC instruction.
 */

#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF)                        \
        BPF_RAW_INSN(BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, DST, SRC, OFF, OP)

/* Legacy alias: atomic add without fetch */
#define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF)

/* Memory store, *(uint *) (dst_reg + off16) = imm32.
 * SIZE selects the access width through BPF_SIZE(SIZE); src_reg is 0.
 */

#define BPF_ST_MEM(SIZE, DST, OFF, IMM)                                \
        BPF_RAW_INSN(BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, DST, 0, OFF, IMM)

/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */

#define BPF_JMP_REG(OP, DST, SRC, OFF)                                \
        BPF_RAW_INSN(BPF_JMP | BPF_OP(OP) | BPF_X, DST, SRC, OFF, 0)

/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */

#define BPF_JMP_IMM(OP, DST, IMM, OFF)                                \
        BPF_RAW_INSN(BPF_JMP | BPF_OP(OP) | BPF_K, DST, 0, OFF, IMM)

/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */

#define BPF_JMP32_REG(OP, DST, SRC, OFF)                        \
        BPF_RAW_INSN(BPF_JMP32 | BPF_OP(OP) | BPF_X, DST, SRC, OFF, 0)

/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */

#define BPF_JMP32_IMM(OP, DST, IMM, OFF)                        \
        BPF_RAW_INSN(BPF_JMP32 | BPF_OP(OP) | BPF_K, DST, 0, OFF, IMM)

/* Unconditional jumps, goto pc + off16.
 * Only the 'off' field is caller-supplied; registers and imm are 0.
 */

#define BPF_JMP_A(OFF)                                                \
        BPF_RAW_INSN(BPF_JMP | BPF_JA, 0, 0, OFF, 0)

/* Relative call.
 * A BPF_JMP | BPF_CALL instruction tagged with src_reg = BPF_PSEUDO_CALL;
 * the target TGT is carried in the imm field.
 */

#define BPF_CALL_REL(TGT)                                        \
        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_CALL, 0, TGT)

/* Convert function address to BPF immediate: the byte distance of x from
 * __bpf_call_base.
 */

#define BPF_CALL_IMM(x)        ((void *)(x) - (void *)__bpf_call_base)

/* Call to FUNC: a BPF_JMP | BPF_CALL instruction whose imm field holds
 * BPF_CALL_IMM(FUNC); registers and off are 0.
 */
#define BPF_EMIT_CALL(FUNC)                                        \
        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_CALL_IMM(FUNC))

/* Raw code statement block.
 * Builds a struct bpf_insn compound literal with every field supplied
 * verbatim by the caller: CODE -> .code, DST -> .dst_reg, SRC -> .src_reg,
 * OFF -> .off, IMM -> .imm. No opcode bits are added or checked here.
 */

#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM)                        \
        ((struct bpf_insn) {                                        \
                .code  = CODE,                                        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = IMM })

/* Program exit: a BPF_JMP | BPF_EXIT instruction with all operands 0. */

#define BPF_EXIT_INSN()                                                \
        BPF_RAW_INSN(BPF_JMP | BPF_EXIT, 0, 0, 0, 0)

/* Speculation barrier: a BPF_ST-class instruction carrying the internal
 * BPF_NOSPEC opcode marker, with all operands 0.
 */

#define BPF_ST_NOSPEC()                                                \
        BPF_RAW_INSN(BPF_ST | BPF_NOSPEC, 0, 0, 0, 0)

/* Internal classic blocks for direct assignment.
 * These prefix the BPF_STMT()/BPF_JUMP() expansions with a
 * (struct sock_filter) cast, turning them into compound literals that can
 * be assigned, rather than only used as initializers.
 */

#define __BPF_STMT(CODE, K)                                        \
        ((struct sock_filter) BPF_STMT(CODE, K))

#define __BPF_JUMP(CODE, K, JT, JF)                                \
        ((struct sock_filter) BPF_JUMP(CODE, K, JT, JF))

/* Translate an access width in bytes into the matching BPF size modifier
 * (BPF_B, BPF_H, BPF_W or BPF_DW). Evaluates to -EINVAL when @bytes is
 * not the size of u8, u16, u32 or u64.
 *
 * @bytes is parenthesized at every use so that a compound argument (for
 * example a conditional expression) is compared as a whole, and the result
 * lives in a double-underscore local so that a caller variable named
 * 'bpf_size' is not shadowed inside the macro. @bytes may be evaluated
 * more than once, so it must not have side effects.
 */
#define bytes_to_bpf_size(bytes)                                \
({                                                                \
        int __bpf_size = -EINVAL;                                \
                                                                \
        if ((bytes) == sizeof(u8))                                \
                __bpf_size = BPF_B;                                \
        else if ((bytes) == sizeof(u16))                        \
                __bpf_size = BPF_H;                                \
        else if ((bytes) == sizeof(u32))                        \
                __bpf_size = BPF_W;                                \
        else if ((bytes) == sizeof(u64))                        \
                __bpf_size = BPF_DW;                                \
                                                                \
        __bpf_size;                                                \
})

/* Translate a BPF size modifier (BPF_B, BPF_H, BPF_W or BPF_DW) into the
 * access width in bytes. Evaluates to -EINVAL for any other value.
 *
 * @bpf_size is parenthesized at every use so that a compound argument is
 * compared as a whole, and the result lives in a double-underscore local
 * so that a caller variable named 'bytes' is not shadowed inside the
 * macro. @bpf_size may be evaluated more than once, so it must not have
 * side effects.
 */
#define bpf_size_to_bytes(bpf_size)                                \
({                                                                \
        int __bpf_bytes = -EINVAL;                                \
                                                                \
        if ((bpf_size) == BPF_B)                                \
                __bpf_bytes = sizeof(u8);                        \
        else if ((bpf_size) == BPF_H)                                \
                __bpf_bytes = sizeof(u16);                        \
        else if ((bpf_size) == BPF_W)                                \
                __bpf_bytes = sizeof(u32);                        \
        else if ((bpf_size) == BPF_DW)                                \
                __bpf_bytes = sizeof(u64);                        \
                                                                \
        __bpf_bytes;                                                \
})

/* BPF size modifier for an access as wide as @type.
 * The width is mapped through bytes_to_bpf_size(sizeof(type)); a negative
 * result (a width that is not 1, 2, 4 or 8 bytes) trips BUILD_BUG_ON.
 */
#define BPF_SIZEOF(type)                                        \
        ({                                                        \
                const int __size = bytes_to_bpf_size(sizeof(type)); \
                BUILD_BUG_ON(__size < 0);                        \
                __size;                                                \
        })

/* BPF size modifier for an access as wide as member @field of @type.
 * The width is mapped through bytes_to_bpf_size(sizeof_field(type, field));
 * a negative result (a width that is not 1, 2, 4 or 8 bytes) trips
 * BUILD_BUG_ON.
 */
#define BPF_FIELD_SIZEOF(type, field)                                \
        ({                                                        \
                const int __size = bytes_to_bpf_size(sizeof_field(type, field)); \
                BUILD_BUG_ON(__size < 0);                        \
                __size;                                                \
        })

/*
 * Access width in bytes of the load/store instruction @insn, derived
 * from the size bits of insn->code via bpf_size_to_bytes(). Triggers
 * WARN_ON() when the result is negative and still evaluates to that
 * negative value.
 */
#define BPF_LDST_BYTES(insn)                                        \
        ({                                                        \
                const int __ldst_bytes =                        \
                        bpf_size_to_bytes(BPF_SIZE((insn)->code)); \
                WARN_ON(__ldst_bytes < 0);                        \
                __ldst_bytes;                                        \
        })

#define __BPF_MAP_0(m, v, ...) v
#define __BPF_MAP_1(m, v, t, a, ...) m(t, a)
#define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__)
#define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__)
#define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__)
#define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__)

#define __BPF_REG_0(...) __BPF_PAD(5)
#define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4)
#define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3)
#define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2)
#define __BPF_REG_4(...) __BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1)
#define __BPF_REG_5(...) __BPF_MAP(5, __VA_ARGS__)

#define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__)
#define __BPF_REG(n, ...) __BPF_REG_##n(__VA_ARGS__)

#define __BPF_CAST(t, a)                                                       \
        (__force t)                                                               \
        (__force                                                               \
         typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long),      \
                                      (unsigned long)0, (t)0))) a
#define __BPF_V void
#define __BPF_N

#define __BPF_DECL_ARGS(t, a) t   a
#define __BPF_DECL_REGS(t, a) u64 a

#define __BPF_PAD(n)                                                               \
        __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2,       \
                  u64, __ur_3, u64, __ur_4, u64, __ur_5)

/*
 * BPF_CALL_x(x, attr, name, t1, a1, ...): define a helper called "name"
 * taking x typed arguments.
 *
 * The expansion, in order:
 *  - forward-declares the always-inline worker ____name() with the typed
 *    argument list (or void when x == 0);
 *  - typedefs btf_name as a pointer to a function with that signature;
 *  - declares and defines "attr u64 name(...)" taking five u64
 *    parameters, which calls ____name() through a btf_name cast with
 *    each used parameter converted by __BPF_CAST;
 *  - ends with the header of ____name(), so the brace-enclosed body
 *    written after the macro invocation becomes the worker's body.
 */
#define BPF_CALL_x(x, attr, name, ...)                                               \
        static __always_inline                                                       \
        u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__));   \
        typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \
        attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__));    \
        attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__))     \
        {                                                                       \
                return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\
        }                                                                       \
        static __always_inline                                                       \
        u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__))

/* Empty attribute slot for BPF_CALL_x(). */
#define __NOATTR
/* Fixed-arity front ends to BPF_CALL_x() for 0 to 5 arguments, no attribute. */
#define BPF_CALL_0(name, ...)        BPF_CALL_x(0, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_1(name, ...)        BPF_CALL_x(1, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_2(name, ...)        BPF_CALL_x(2, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_3(name, ...)        BPF_CALL_x(3, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_4(name, ...)        BPF_CALL_x(4, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_5(name, ...)        BPF_CALL_x(5, __NOATTR, name, __VA_ARGS__)

/* One-argument variant whose outer function carries the notrace attribute. */
#define NOTRACE_BPF_CALL_1(name, ...)        BPF_CALL_x(1, notrace, name, __VA_ARGS__)

/*
 * Byte-offset ranges inside a context structure, written as
 * "first ... last" (inclusive on both ends). Presumably meant for GNU
 * case-range labels; confirm against the callers.
 *
 * bpf_ctx_range():      from the first to the last byte of MEMBER.
 * bpf_ctx_range_till(): from the first byte of MEMBER1 to the last byte
 *                       of MEMBER2.
 */
#define bpf_ctx_range(TYPE, MEMBER)                                                \
        offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1
#define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2)                                \
        offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1
/*
 * bpf_ctx_range_ptr(): same as bpf_ctx_range() when BITS_PER_LONG is 64;
 * otherwise the range is forced to span 8 bytes starting at MEMBER,
 * regardless of MEMBER's own size.
 */
#if BITS_PER_LONG == 64
# define bpf_ctx_range_ptr(TYPE, MEMBER)                                        \
        offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1
#else
# define bpf_ctx_range_ptr(TYPE, MEMBER)                                        \
        offsetof(TYPE, MEMBER) ... offsetof(TYPE, MEMBER) + 8 - 1
#endif /* BITS_PER_LONG == 64 */

/*
 * Evaluate to offsetof(TYPE, MEMBER) and store SIZE through PTR_SIZE.
 * The build breaks unless SIZE equals the actual size of MEMBER.
 */
#define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE)                                \
        ({                                                                        \
                BUILD_BUG_ON(sizeof_field(TYPE, MEMBER) != (SIZE));                \
                *(PTR_SIZE) = (SIZE);                                                \
                offsetof(TYPE, MEMBER);                                                \
        })

/*
 * Compat layout of a classic BPF program descriptor: the filter pointer
 * is carried as a compat_uptr_t. A struct sock_filter is architecture
 * independent, so only the pointer representation differs.
 */
struct compat_sock_fprog {
        u16                len;        /* presumably the number of filter entries; confirm */
        compat_uptr_t        filter;        /* struct sock_filter * */
};

/*
 * Classic BPF program descriptor holding a plain struct sock_filter
 * pointer; bpf_classic_proglen() multiplies len by the entry size.
 */
struct sock_fprog_kern {
        u16                        len;
        struct sock_filter        *filter;
};

/* Some arches need doubleword alignment for their instructions and/or data */
#define BPF_IMAGE_ALIGNMENT 8

/*
 * Header placed in front of a JIT image. bpf_jit_binary_lock_ro() derives
 * a page count from size (size >> PAGE_SHIFT), so size is a byte count.
 */
struct bpf_binary_header {
        u32 size;
        /* Flexible array holding the image, aligned to BPF_IMAGE_ALIGNMENT. */
        u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
};

/*
 * Per-program run statistics. __bpf_prog_run() updates cnt and nsecs on
 * the current CPU's copy inside a u64_stats update section guarded by
 * syncp; misses is not touched in the code visible here.
 */
struct bpf_prog_stats {
        u64_stats_t cnt;        /* incremented once per run */
        u64_stats_t nsecs;        /* accumulated sched_clock() delta of runs */
        u64_stats_t misses;
        struct u64_stats_sync syncp;
} __aligned(2 * sizeof(u64));

/*
 * Reference-counted wrapper around a BPF program; the rcu head suggests
 * RCU-deferred release (NOTE(review): release path is not visible here).
 */
struct sk_filter {
        refcount_t        refcnt;
        struct rcu_head        rcu;
        struct bpf_prog        *prog;
};

/* Static key tested by __bpf_prog_run() to decide whether to collect stats. */
DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);

/* Defined elsewhere; NOTE(review): locking rules for the hook are not visible here. */
extern struct mutex nf_conn_btf_access_lock;
extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
                                     const struct bpf_reg_state *reg,
                                     int off, int size);

/*
 * Dispatcher signature used by __bpf_prog_run(): receives the context,
 * the program's instruction array and the function to run.
 */
typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
                                          const struct bpf_insn *insnsi,
                                          unsigned int (*bpf_func)(const void *,
                                                                   const struct bpf_insn *));

/*
 * Run @prog on @ctx through the dispatcher @dfunc and return its result.
 *
 * cant_migrate() is invoked first, so callers are expected to have
 * migration disabled. When bpf_stats_enabled_key is on, the run is timed
 * with sched_clock() and the current CPU's bpf_prog_stats is updated.
 */
static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
                                          const void *ctx,
                                          bpf_dispatcher_fn dfunc)
{
        u32 ret;

        cant_migrate();
        if (static_branch_unlikely(&bpf_stats_enabled_key)) {
                struct bpf_prog_stats *stats;
                u64 duration, start = sched_clock();
                unsigned long flags;

                ret = dfunc(ctx, prog->insnsi, prog->bpf_func);

                /* Only the dispatcher call itself is inside the timed window. */
                duration = sched_clock() - start;
                stats = this_cpu_ptr(prog->stats);
                /* Counter updates happen inside an irq-saving u64_stats section. */
                flags = u64_stats_update_begin_irqsave(&stats->syncp);
                u64_stats_inc(&stats->cnt);
                u64_stats_add(&stats->nsecs, duration);
                u64_stats_update_end_irqrestore(&stats->syncp, flags);
        } else {
                ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
        }
        return ret;
}

/*
 * Run @prog on @ctx using bpf_dispatcher_nop_func as the dispatcher.
 * Goes through __bpf_prog_run(), which calls cant_migrate().
 */
static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx)
{
        return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func);
}

/*
 * Run a BPF program from preemptible (and therefore migratable) context
 * while guaranteeing that the whole execution stays on one CPU.
 *
 * migrate_disable()/migrate_enable() are used explicitly to document that
 * running a BPF program needs no reentrancy protection against a BPF
 * program invoked from a preempting task.
 */
static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
                                          const void *ctx)
{
        u32 verdict;

        migrate_disable();
        verdict = bpf_prog_run(prog, ctx);
        migrate_enable();

        return verdict;
}

/* Size of the cb[] scratch area; bpf_skb_cb() checks it against struct __sk_buff. */
#define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN

/*
 * Overlay for skb->cb: the qdisc control block followed by the metadata
 * and data-end pointers filled in by bpf_compute_data_pointers().
 */
struct bpf_skb_data_end {
        struct qdisc_skb_cb qdisc_cb;
        void *data_meta;        /* skb->data - skb_metadata_len(skb) */
        void *data_end;                /* skb->data + skb_headlen(skb) */
};

/*
 * Next-hop parameters: an address family plus an IPv4 or IPv6 address
 * sharing storage. Presumably nh_family selects the valid union member;
 * confirm against users.
 */
struct bpf_nh_params {
        u32 nh_family;
        union {
                u32 ipv4_nh;
                struct in6_addr ipv6_nh;
        };
};

/*
 * Flags for bpf_redirect_info kern_flags. The *_INIT bits record that
 * the matching part of struct bpf_net_context has been lazily set up by
 * its bpf_net_ctx_get_*() accessor.
 */
#define BPF_RI_F_RF_NO_DIRECT        BIT(0)        /* no napi_direct on return_frame */
#define BPF_RI_F_RI_INIT        BIT(1)
#define BPF_RI_F_CPU_MAP_INIT        BIT(2)
#define BPF_RI_F_DEV_MAP_INIT        BIT(3)
#define BPF_RI_F_XSK_MAP_INIT        BIT(4)

/*
 * Redirect state kept in struct bpf_net_context. Field order matters:
 * bpf_net_ctx_get_ri() zeroes everything located before nh, so nh and
 * kern_flags survive that reset.
 */
struct bpf_redirect_info {
        u64 tgt_index;
        void *tgt_value;
        struct bpf_map *map;
        u32 flags;
        u32 map_id;
        enum bpf_map_type map_type;
        struct bpf_nh_params nh;
        u32 kern_flags;                /* BPF_RI_F_* bits */
};

/*
 * Per-task BPF networking context installed by bpf_net_ctx_set(). The
 * list heads are initialised on first use by their accessor functions.
 */
struct bpf_net_context {
        struct bpf_redirect_info ri;
        struct list_head cpu_map_flush_list;
        struct list_head dev_map_flush_list;
        struct list_head xskmap_map_flush_list;
};

/*
 * Install @bpf_net_ctx as the current task's BPF net context.
 *
 * Returns NULL and changes nothing when the task already has a context.
 * Otherwise clears ri.kern_flags (dropping all lazy-init markers),
 * installs the context and returns it.
 */
static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bpf_net_ctx)
{
        struct task_struct *task = current;

        if (task->bpf_net_context)
                return NULL;

        bpf_net_ctx->ri.kern_flags = 0;
        task->bpf_net_context = bpf_net_ctx;

        return bpf_net_ctx;
}

/*
 * Drop the current task's BPF net context. A NULL @bpf_net_ctx (what
 * bpf_net_ctx_set() returns when a context was already installed) is a
 * no-op, so the existing context stays in place.
 */
static inline void bpf_net_ctx_clear(struct bpf_net_context *bpf_net_ctx)
{
        if (!bpf_net_ctx)
                return;

        current->bpf_net_context = NULL;
}

/*
 * Return the current task's BPF net context pointer as stored; no NULL
 * check is made here.
 */
static inline struct bpf_net_context *bpf_net_ctx_get(void)
{
        return current->bpf_net_context;
}

static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void)
{
        struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();

        if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_RI_INIT)) {
                memset(&bpf_net_ctx->ri, 0, offsetof(struct bpf_net_context, ri.nh));
                bpf_net_ctx->ri.kern_flags |= BPF_RI_F_RI_INIT;
        }

        return &bpf_net_ctx->ri;
}

/*
 * Return the cpu_map flush list of the current task's BPF net context,
 * initialising the list head the first time it is requested (tracked by
 * BPF_RI_F_CPU_MAP_INIT in ri.kern_flags).
 */
static inline struct list_head *bpf_net_ctx_get_cpu_map_flush_list(void)
{
        struct bpf_net_context *ctx = bpf_net_ctx_get();
        struct list_head *flush_list = &ctx->cpu_map_flush_list;

        if (ctx->ri.kern_flags & BPF_RI_F_CPU_MAP_INIT)
                return flush_list;

        INIT_LIST_HEAD(flush_list);
        ctx->ri.kern_flags |= BPF_RI_F_CPU_MAP_INIT;

        return flush_list;
}

/*
 * Return the dev_map flush list of the current task's BPF net context,
 * initialising the list head the first time it is requested (tracked by
 * BPF_RI_F_DEV_MAP_INIT in ri.kern_flags).
 */
static inline struct list_head *bpf_net_ctx_get_dev_flush_list(void)
{
        struct bpf_net_context *ctx = bpf_net_ctx_get();
        struct list_head *flush_list = &ctx->dev_map_flush_list;

        if (ctx->ri.kern_flags & BPF_RI_F_DEV_MAP_INIT)
                return flush_list;

        INIT_LIST_HEAD(flush_list);
        ctx->ri.kern_flags |= BPF_RI_F_DEV_MAP_INIT;

        return flush_list;
}

/*
 * Return the xskmap flush list of the current task's BPF net context,
 * initialising the list head the first time it is requested (tracked by
 * BPF_RI_F_XSK_MAP_INIT in ri.kern_flags).
 */
static inline struct list_head *bpf_net_ctx_get_xskmap_flush_list(void)
{
        struct bpf_net_context *ctx = bpf_net_ctx_get();
        struct list_head *flush_list = &ctx->xskmap_map_flush_list;

        if (ctx->ri.kern_flags & BPF_RI_F_XSK_MAP_INIT)
                return flush_list;

        INIT_LIST_HEAD(flush_list);
        ctx->ri.kern_flags |= BPF_RI_F_XSK_MAP_INIT;

        return flush_list;
}

/* Record in skb->cb the bounds of the linear packet data range
 * [data, data_end) that various program types (cls_bpf, act_bpf,
 * lwt, ...) will access, together with the start of the metadata
 * area. Subsystems allowing direct data access must (!) ensure that
 * the cb[] area can be written to when the BPF program is invoked
 * (otherwise a cb[] save/restore is necessary).
 */
static inline void bpf_compute_data_pointers(struct sk_buff *skb)
{
        struct bpf_skb_data_end *ends = (struct bpf_skb_data_end *)skb->cb;

        /* The overlay must fit inside skb->cb. */
        BUILD_BUG_ON(sizeof(*ends) > sizeof_field(struct sk_buff, cb));

        ends->data_end  = skb->data + skb_headlen(skb);
        ends->data_meta = skb->data - skb_metadata_len(skb);
}

/* Similar to bpf_compute_data_pointers(), but only data_end is
 * recomputed, and the previous cb->data_end value is first stored in
 * *saved_data_end so that bpf_restore_data_end() can put it back.
 */
static inline void bpf_compute_and_save_data_end(
        struct sk_buff *skb, void **saved_data_end)
{
        struct bpf_skb_data_end *ends = (struct bpf_skb_data_end *)skb->cb;
        void *previous_end = ends->data_end;

        *saved_data_end = previous_end;
        ends->data_end = skb->data + skb_headlen(skb);
}

/* Write back into skb->cb the data_end value previously handed out by
 * bpf_compute_and_save_data_end().
 */
static inline void bpf_restore_data_end(
        struct sk_buff *skb, void *saved_data_end)
{
        struct bpf_skb_data_end *ends = (struct bpf_skb_data_end *)skb->cb;

        ends->data_end = saved_data_end;
}

static inline u8 *bpf_skb_cb(const struct sk_buff *skb)
{
        /* eBPF programs may read/write skb->cb[] area to transfer meta
         * data between tail calls. Since this also needs to work with
         * tc, that scratch memory is mapped to qdisc_skb_cb's data area.
         *
         * In some socket filter cases, the cb unfortunately needs to be
         * saved/restored so that protocol specific skb->cb[] data won't
         * be lost. In any case, due to unpriviledged eBPF programs
         * attached to sockets, we need to clear the bpf_skb_cb() area
         * to not leak previous contents to user space.
         */
        BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != BPF_SKB_CB_LEN);
        BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) !=
                     sizeof_field(struct qdisc_skb_cb, data));

        return qdisc_skb_cb(skb)->data;
}

/* Run @prog on the skb passed as @ctx while protecting the skb's cb
 * scratch area: when the program is flagged as accessing cb, the area
 * is copied aside and zeroed before the run and copied back afterwards.
 *
 * Must be invoked with migration disabled.
 */
static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
                                         const void *ctx)
{
        const struct sk_buff *skb = ctx;
        u8 backup[BPF_SKB_CB_LEN];
        u8 *scratch = bpf_skb_cb(skb);
        u32 verdict;

        if (unlikely(prog->cb_access)) {
                memcpy(backup, scratch, BPF_SKB_CB_LEN);
                memset(scratch, 0, BPF_SKB_CB_LEN);
        }

        verdict = bpf_prog_run(prog, skb);

        if (unlikely(prog->cb_access))
                memcpy(scratch, backup, BPF_SKB_CB_LEN);

        return verdict;
}

/* Migration-safe wrapper around __bpf_prog_run_save_cb(): migration is
 * disabled for the duration of the run and re-enabled before returning
 * the program's result.
 */
static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
                                       struct sk_buff *skb)
{
        u32 verdict;

        migrate_disable();
        verdict = __bpf_prog_run_save_cb(prog, skb);
        migrate_enable();

        return verdict;
}

/* Run @prog on @skb pinned to one CPU. When the program is flagged as
 * accessing cb, the cb scratch area is zeroed first; unlike
 * bpf_prog_run_save_cb() the previous contents are not restored.
 */
static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
                                        struct sk_buff *skb)
{
        if (unlikely(prog->cb_access))
                memset(bpf_skb_cb(skb), 0, BPF_SKB_CB_LEN);

        return bpf_prog_run_pin_on_cpu(prog, skb);
}

/* Declarations for the XDP dispatcher and related hooks; definitions live elsewhere. */
DECLARE_BPF_DISPATCHER(xdp)

DECLARE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);

u32 xdp_master_redirect(struct xdp_buff *xdp);

/* NOTE(review): presumably swaps the program used by the xdp dispatcher; confirm. */
void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog);

/* Size in bytes of @prog's instructions: prog->len entries of struct bpf_insn. */
static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog)
{
        return prog->len * sizeof(struct bpf_insn);
}

static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog)
{
        return round_up(bpf_prog_insn_size(prog) +
                        sizeof(__be64) + 1, SHA1_BLOCK_SIZE);
}

static inline unsigned int bpf_prog_size(unsigned int proglen)
{
        return max(sizeof(struct bpf_prog),
                   offsetof(struct bpf_prog, insns[proglen]));
}

/* True when @prog's type is BPF_PROG_TYPE_UNSPEC, the marker for programs
 * that started out as classic BPF (see the comment in the body).
 */
static inline bool bpf_prog_was_classic(const struct bpf_prog *prog)
{
        /* When classic BPF programs have been loaded and the arch
         * does not have a classic BPF JIT (anymore), they have been
         * converted via bpf_migrate_filter() to eBPF and thus always
         * have an unspec program type.
         */
        return prog->type == BPF_PROG_TYPE_UNSPEC;
}

/* Clamp @size to the machine word size (sizeof(unsigned long)) when it is
 * a larger exact multiple of the word size; any other @size is returned
 * unchanged.
 */
static inline u32 bpf_ctx_off_adjust_machine(u32 size)
{
        const u32 word = sizeof(unsigned long);

        if (size <= word)
                return size;
        if (size % word != 0)
                return size;

        return word;
}

/* A narrow context access is acceptable when @size does not exceed
 * @size_default and has at most one bit set (so a size of 0 passes the
 * bit test too). @off is not examined.
 */
static inline bool
bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
{
        if (size > size_default)
                return false;

        return !(size & (size - 1));
}

/* Byte offset of a narrow access inside a field of @size_default bytes.
 *
 * The position within the field is @off masked with size_default - 1.
 * With __LITTLE_ENDIAN defined that value is returned as is; otherwise
 * it is mirrored so that it is measured from the other end of the field.
 */
static inline u8
bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default)
{
        const u8 low_bits = off & (size_default - 1);

#ifndef __LITTLE_ENDIAN
        return size_default - (low_bits + size);
#else
        return low_bits;
#endif
}

/*
 * bpf_ctx_wide_access_ok - check an 8-byte access into a context field.
 * @off:   byte offset of the access within @type
 * @size:  access size in bytes
 * @type:  context structure type
 * @field: member of @type being accessed
 *
 * True when @size is sizeof(__u64), the access [off, off + 8) lies
 * entirely inside @field, and @off is a multiple of sizeof(__u64).
 *
 * @off and @size are parenthesized so that expression arguments such as
 * "base + 4" are not regrouped by ==, + or %. @off is evaluated more
 * than once, so it must not have side effects.
 */
#define bpf_ctx_wide_access_ok(off, size, type, field)                        \
        ((size) == sizeof(__u64) &&                                        \
        (off) >= offsetof(type, field) &&                                \
        (off) + sizeof(__u64) <= offsetofend(type, field) &&                \
        (off) % sizeof(__u64) == 0)

/*
 * bpf_classic_proglen - size in bytes of a classic BPF program.
 * @fprog: pointer to a descriptor with ->len and ->filter members.
 *
 * Multiplies fprog->len by the size of one filter entry. @fprog is
 * parenthesized so that pointer expressions such as "&prog" work; it is
 * evaluated once, because the sizeof operand is not evaluated.
 */
#define bpf_classic_proglen(fprog) ((fprog)->len * sizeof((fprog)->filter[0]))

/* Make the memory of a non-JITed program read-only.
 *
 * With CONFIG_BPF_JIT_ALWAYS_ON, or when @fp is JITed, nothing is done
 * and 0 is returned. Otherwise the result of set_memory_ro() over the
 * program's fp->pages pages is returned; callers must check it.
 */
static inline int __must_check bpf_prog_lock_ro(struct bpf_prog *fp)
{
#ifdef CONFIG_BPF_JIT_ALWAYS_ON
        return 0;
#else
        if (fp->jited)
                return 0;

        set_vm_flush_reset_perms(fp);
        return set_memory_ro((unsigned long)fp, fp->pages);
#endif
}

/* Switch a JIT binary to read-only + executable via set_memory_rox().
 * The page count is hdr->size >> PAGE_SHIFT. Returns set_memory_rox()'s
 * result, which callers must check.
 */
static inline int __must_check
bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
{
        const unsigned long start = (unsigned long)hdr;

        set_vm_flush_reset_perms(hdr);

        return set_memory_rox(start, hdr->size >> PAGE_SHIFT);
}

int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
/* Convenience wrapper: sk_filter_trim_cap() with a cap of 1. */
static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
{
        return sk_filter_trim_cap(sk, skb, 1);
}

/* Program lifecycle entry points; implementations are not visible here. */
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err);
void bpf_prog_free(struct bpf_prog *fp);

bool bpf_opcode_in_insntable(u8 code);

/* JITed line-info bookkeeping. */
void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
                               const u32 *insn_to_jit_off);
int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog);
void bpf_prog_jit_attempt_done(struct bpf_prog *prog);

/* Allocation, reallocation and low-level release of struct bpf_prog. */
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags);
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
                                  gfp_t gfp_extra_flags);
void __bpf_prog_free(struct bpf_prog *fp);

/* Release @fp via __bpf_prog_free(); despite the name, no separate unlock
 * step is performed in this function.
 */
static inline void bpf_prog_unlock_free(struct bpf_prog *fp)
{
        __bpf_prog_free(fp);
}

/* Callback type taking a classic filter array and its length; passed as
 * the "trans" argument of bpf_prog_create_from_user().
 */
typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter,
                                       unsigned int flen);

/* Creation and destruction of programs built from classic filters. */
int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog);
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
                              bpf_aux_classic_check_t trans, bool save_orig);
void bpf_prog_destroy(struct bpf_prog *fp);

/* Socket filter attach/detach/query interface. */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_attach_bpf(u32 ufd, struct sock *sk);
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk);
void sk_reuseport_prog_free(struct bpf_prog *prog);
int sk_detach_filter(struct sock *sk);
int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);

/* NOTE(review): presumably socket memory accounting for filters; confirm. */
bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);

u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
/* __bpf_call_base reinterpreted as a function taking a sixth argument,
 * a const struct bpf_insn pointer.
 */
#define __bpf_call_base_args \
        ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \
         (void *)__bpf_call_base)

/* JIT entry points and capability queries; implementations live elsewhere. */
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
void bpf_jit_compile(struct bpf_prog *prog);
bool bpf_jit_needs_zext(void);
bool bpf_jit_inlines_helper_call(s32 imm);
bool bpf_jit_supports_subprog_tailcalls(void);
bool bpf_jit_supports_percpu_insn(void);
bool bpf_jit_supports_kfunc_call(void);
bool bpf_jit_supports_far_kfunc_call(void);
bool bpf_jit_supports_exceptions(void);
bool bpf_jit_supports_ptr_xchg(void);
bool bpf_jit_supports_arena(void);
bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
u64 bpf_arch_uaddress_limit(void);
/* Calls consume_fn with (cookie, ip, sp, bp); NOTE(review): per-frame semantics not visible here. */
void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
bool bpf_helper_changes_pkt_data(void *func);

/* Whether raw dumps are allowed for @cred: simply the answer of
 * kallsyms_show_value() for the same credentials.
 */
static inline bool bpf_dump_raw_ok(const struct cred *cred)
{
        /* Reconstruction of call-sites is dependent on kallsyms,
         * thus make dump the same restriction.
         */
        return kallsyms_show_value(cred);
}

/* Instruction patching helpers; NOTE(review): off/len/cnt are presumably
 * in units of instructions; confirm against the implementations.
 */
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
                                       const struct bpf_insn *patch, u32 len);
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt);

static inline bool xdp_return_frame_no_direct(void)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT;
}

static inline void xdp_set_return_frame_no_direct(void)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT;
}

static inline void xdp_clear_return_frame_no_direct(void)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT;
}

/* Check whether a packet of @pktlen bytes may be forwarded to @fwd.
 *
 * Returns -ENETDOWN when the device is not IFF_UP, -EMSGSIZE when
 * @pktlen exceeds mtu + hard_header_len + VLAN_HLEN, and 0 otherwise.
 */
static inline int xdp_ok_fwd_dev(const struct net_device *fwd,
                                 unsigned int pktlen)
{
        unsigned int max_len;

        if (unlikely(!(fwd->flags & IFF_UP)))
                return -ENETDOWN;

        max_len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;

        return pktlen > max_len ? -EMSGSIZE : 0;
}

/* The pair of xdp_do_redirect and xdp_do_flush MUST be called in the
 * same cpu context. Further for best results no more than a single map
 * for the do_redirect/do_flush pair should be used. This limitation is
 * because we only track one map and force a flush when the map changes.
 * This does not appear to be a real limitation for existing software.
 */
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
                            struct xdp_buff *xdp, struct bpf_prog *prog);
int xdp_do_redirect(struct net_device *dev,
                    struct xdp_buff *xdp,
                    struct bpf_prog *prog);
int xdp_do_redirect_frame(struct net_device *dev,
                          struct xdp_buff *xdp,
                          struct xdp_frame *xdpf,
                          struct bpf_prog *prog);
void xdp_do_flush(void);

/* NOTE(review): by its name, reports an unexpected XDP action value @act; confirm. */
void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act);

/* bpf_run_sk_reuseport(): real implementation only with CONFIG_INET;
 * without it the stub below always returns NULL.
 */
#ifdef CONFIG_INET
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                                  struct bpf_prog *prog, struct sk_buff *skb,
                                  struct sock *migrating_sk,
                                  u32 hash);
#else
static inline struct sock *
bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                     struct bpf_prog *prog, struct sk_buff *skb,
                     struct sock *migrating_sk,
                     u32 hash)
{
        return NULL;
}
#endif

#ifdef CONFIG_BPF_JIT
/* JIT tunables, defined elsewhere. The inline helpers in this section
 * read bpf_jit_enable, bpf_jit_harden and bpf_jit_kallsyms.
 */
extern int bpf_jit_enable;
extern int bpf_jit_harden;
extern int bpf_jit_kallsyms;
extern long bpf_jit_limit;
extern long bpf_jit_limit_max;

/* Callback given a memory area and its size; by its name it fills unused
 * space of a JIT image (NOTE(review): exact contract not visible here).
 */
typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);

void bpf_jit_fill_hole_with_zero(void *area, unsigned int size);

/* JIT binary allocation and release. */
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
                     unsigned int alignment,
                     bpf_jit_fill_hole_t bpf_fill_ill_insns);
void bpf_jit_binary_free(struct bpf_binary_header *hdr);
u64 bpf_jit_alloc_exec_limit(void);
void *bpf_jit_alloc_exec(unsigned long size);
void bpf_jit_free_exec(void *addr);
void bpf_jit_free(struct bpf_prog *fp);
struct bpf_binary_header *
bpf_jit_binary_pack_hdr(const struct bpf_prog *fp);

/* Program pack allocator. */
void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns);
void bpf_prog_pack_free(void *ptr, u32 size);

static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
{
        return list_empty(&fp->aux->ksym.lnode) ||
               fp->aux->ksym.lnode.prev == LIST_POISON2;
}

/* Pack-based JIT binary management working with a read-only header/image
 * and a separate writable header/image (see the ro_ and rw_ parameters).
 */
struct bpf_binary_header *
bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image,
                          unsigned int alignment,
                          struct bpf_binary_header **rw_hdr,
                          u8 **rw_image,
                          bpf_jit_fill_hole_t bpf_fill_ill_insns);
int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
                                 struct bpf_binary_header *ro_header,
                                 struct bpf_binary_header *rw_header);
void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
                              struct bpf_binary_header *rw_header);

/* Has a stub returning -ENOTSUPP when CONFIG_BPF_JIT is disabled. */
int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
                                struct bpf_jit_poke_descriptor *poke);

/* Outputs are written through func_addr and func_addr_fixed. */
int bpf_jit_get_func_addr(const struct bpf_prog *prog,
                          const struct bpf_insn *insn, bool extra_pass,
                          u64 *func_addr, bool *func_addr_fixed);

/* Constant blinding; see bpf_jit_blinding_enabled() for the gating policy. */
struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp);
void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other);

/* Log a summary line for a JIT pass at error level and, when @image is
 * not NULL, hex-dump @proglen bytes of it (16 bytes per line, offset
 * prefix, no ASCII column).
 */
static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
                                u32 pass, void *image)
{
        pr_err("flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n", flen,
               proglen, pass, image, current->comm, task_pid_nr(current));

        if (!image)
                return;

        print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET,
                       16, 1, image, proglen, false);
}

/* Compile-time answer: true when CONFIG_HAVE_EBPF_JIT is defined. */
static inline bool bpf_jit_is_ebpf(void)
{
# ifdef CONFIG_HAVE_EBPF_JIT
        return true;
# else
        return false;
# endif
}

static inline bool ebpf_jit_enabled(void)
{
        return bpf_jit_enable && bpf_jit_is_ebpf();
}

static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
{
        return fp->jited && bpf_jit_is_ebpf();
}

/* Decide whether constant blinding applies to @prog.
 *
 * Blinding needs an eBPF JIT, a program that requested JITing and a
 * non-zero bpf_jit_harden. With bpf_jit_harden == 1 it is skipped for
 * programs whose token is CAP_BPF capable; any other non-zero setting
 * blinds unconditionally.
 */
static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
{
        /* These are the prerequisites; should someone ever have the
         * idea to call blinding outside of them, we make sure to
         * bail out.
         */
        if (!bpf_jit_is_ebpf() || !prog->jit_requested || !bpf_jit_harden)
                return false;

        return !(bpf_jit_harden == 1 &&
                 bpf_token_capable(prog->aux->token, CAP_BPF));
}

/* kallsyms exposure of JITed programs is enabled only when hardening is
 * off (bpf_jit_harden == 0) and bpf_jit_kallsyms is exactly 1. There are
 * a couple of corner cases where kallsyms should not be enabled, f.e.
 * on hardening.
 */
static inline bool bpf_jit_kallsyms_enabled(void)
{
        return !bpf_jit_harden && bpf_jit_kallsyms == 1;
}

/* Symbol lookup for BPF program text; implementations are not visible
 * here. Stubs exist for builds without CONFIG_BPF_JIT.
 */
const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
                                 unsigned long *off, char *sym);
bool is_bpf_text_address(unsigned long addr);
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                    char *sym);
struct bpf_prog *bpf_prog_ksym_find(unsigned long addr);

/* Look up @addr via __bpf_address_lookup() and return its result.
 * When the lookup succeeds and @modname is not NULL, *modname is set to
 * NULL; otherwise *modname is left untouched.
 */
static inline const char *
bpf_address_lookup(unsigned long addr, unsigned long *size,
                   unsigned long *off, char **modname, char *sym)
{
        const char *name = __bpf_address_lookup(addr, size, off, sym);

        if (!name || !modname)
                return name;

        *modname = NULL;
        return name;
}

/* kallsyms registration for a program; empty stubs are used without CONFIG_BPF_JIT. */
void bpf_prog_kallsyms_add(struct bpf_prog *fp);
void bpf_prog_kallsyms_del(struct bpf_prog *fp);

#else /* CONFIG_BPF_JIT */

/*
 * Stubs used when CONFIG_BPF_JIT is disabled. Each mirrors the signature
 * of its JIT-enabled counterpart and reports "no JIT".
 */

/* No JIT built in: never enabled. */
static inline bool ebpf_jit_enabled(void)
{
        return false;
}

/* No JIT built in: blinding never applies. */
static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
{
        return false;
}

/* No JIT built in: no program is JITed. */
static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
{
        return false;
}

/* Poke descriptors are unsupported without a JIT. */
static inline int
bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
                            struct bpf_jit_poke_descriptor *poke)
{
        return -ENOTSUPP;
}

/* Without a JIT, freeing just releases the program itself. */
static inline void bpf_jit_free(struct bpf_prog *fp)
{
        bpf_prog_unlock_free(fp);
}

static inline bool bpf_jit_kallsyms_enabled(void)
{
        return false;
}

/* No JITed text exists, so no address resolves to a BPF symbol. */
static inline const char *
__bpf_address_lookup(unsigned long addr, unsigned long *size,
                     unsigned long *off, char *sym)
{
        return NULL;
}

static inline bool is_bpf_text_address(unsigned long addr)
{
        return false;
}

/* Always -ERANGE: every symbol index is reported as out of range. */
static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value,
                                  char *type, char *sym)
{
        return -ERANGE;
}

static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
{
        return NULL;
}

/* Unlike the JIT-enabled version, *modname is never written here. */
static inline const char *
bpf_address_lookup(unsigned long addr, unsigned long *size,
                   unsigned long *off, char **modname, char *sym)
{
        return NULL;
}

/* Nothing to register or unregister without a JIT. */
static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
}

static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp)
{
}

#endif /* CONFIG_BPF_JIT */

/* Declared regardless of CONFIG_BPF_JIT. */
void bpf_prog_kallsyms_del_all(struct bpf_prog *fp);

/* Marker bit OR-ed into the value returned by bpf_anc_helper() for ancillary loads. */
#define BPF_ANC                BIT(15)

/* Decide from the first instruction of a classic filter whether the
 * A register needs clearing.
 *
 * Not needed for "ret K" and "ld len". For absolute loads (word, half
 * or byte) it is needed only when the load is the ALU_XOR_X ancillary
 * operation. Every other first instruction needs it.
 */
static inline bool bpf_needs_clear_a(const struct sock_filter *first)
{
        if (first->code == (BPF_RET | BPF_K) ||
            first->code == (BPF_LD | BPF_W | BPF_LEN))
                return false;

        if (first->code == (BPF_LD | BPF_W | BPF_ABS) ||
            first->code == (BPF_LD | BPF_H | BPF_ABS) ||
            first->code == (BPF_LD | BPF_B | BPF_ABS))
                return first->k == SKF_AD_OFF + SKF_AD_ALU_XOR_X;

        return true;
}

/*
 * Map a classic BPF instruction to its ancillary-load code.
 *
 * For an absolute load (word, half or byte) whose k equals SKF_AD_OFF
 * plus one of the SKF_AD_* values listed below, returns
 * BPF_ANC | SKF_AD_<that value>. For every other instruction, including
 * absolute loads with an unlisted k, returns ftest->code unchanged.
 *
 * BUG()s if ftest->code already has the BPF_ANC bit set.
 */
static inline u16 bpf_anc_helper(const struct sock_filter *ftest)
{
        BUG_ON(ftest->code & BPF_ANC);

        switch (ftest->code) {
        case BPF_LD | BPF_W | BPF_ABS:
        case BPF_LD | BPF_H | BPF_ABS:
        case BPF_LD | BPF_B | BPF_ABS:
/* Emits one "case <offset>: return <anc code>" pair; not #undef'ed in the code visible here. */
#define BPF_ANCILLARY(CODE)        case SKF_AD_OFF + SKF_AD_##CODE:        \
                                return BPF_ANC | SKF_AD_##CODE
                switch (ftest->k) {
                BPF_ANCILLARY(PROTOCOL);
                BPF_ANCILLARY(PKTTYPE);
                BPF_ANCILLARY(IFINDEX);
                BPF_ANCILLARY(NLATTR);
                BPF_ANCILLARY(NLATTR_NEST);
                BPF_ANCILLARY(MARK);
                BPF_ANCILLARY(QUEUE);
                BPF_ANCILLARY(HATYPE);
                BPF_ANCILLARY(RXHASH);
                BPF_ANCILLARY(CPU);
                BPF_ANCILLARY(ALU_XOR_X);
                BPF_ANCILLARY(VLAN_TAG);
                BPF_ANCILLARY(VLAN_TAG_PRESENT);
                BPF_ANCILLARY(PAY_OFFSET);
                BPF_ANCILLARY(RANDOM);
                BPF_ANCILLARY(VLAN_TPID);
                }
                /* k matched no ancillary offset: treat as an ordinary load. */
                fallthrough;
        default:
                return ftest->code;
        }
}

/* NOTE(review): by its name, resolves negative load offsets k into a pointer; confirm. */
void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb,
                                           int k, unsigned int size);

/* Report the supported ancillary extensions as the constant SKF_AD_MAX. */
static inline int bpf_tell_extensions(void)
{
        return SKF_AD_MAX;
}

/* Kernel-side context for sock_addr programs: the socket, the address
 * being operated on and its length, plus scratch state.
 */
struct bpf_sock_addr_kern {
        struct sock *sk;
        struct sockaddr *uaddr;
        /* Temporary "register" to make indirect stores to nested structures
         * defined above. We need three registers to make such a store, but
         * only two (src and dst) are available at convert_ctx_access time
         */
        u64 tmp_reg;
        void *t_ctx;        /* Attach type specific context. */
        u32 uaddrlen;
};

/* Kernel-side context for sock_ops programs. args, reply and replylong
 * share the same storage. Per the note on temp, fields before temp are
 * zero-initialised before the program runs; temp and later ones are not.
 */
struct bpf_sock_ops_kern {
        struct        sock *sk;
        union {
                u32 args[4];
                u32 reply;
                u32 replylong[4];
        };
        struct sk_buff        *syn_skb;
        struct sk_buff        *skb;
        void        *skb_data_end;
        u8        op;
        u8        is_fullsock;
        u8        remaining_opt_len;
        u64        temp;                        /* temp and everything after is not
                                         * initialized to 0 before calling
                                         * the BPF program. New fields that
                                         * should be initialized to 0 should
                                         * be inserted before temp.
                                         * temp is scratch storage used by
                                         * sock_ops_convert_ctx_access
                                         * as temporary storage of a register.
                                         */
};

/* Kernel-side context for sysctl BPF programs: the ctl_table being accessed,
 * a current-value buffer, a new-value buffer, the access direction and the
 * file position pointer.
 * NOTE(review): exact producer/consumer of each field is not visible here.
 */
struct bpf_sysctl_kern {
        struct ctl_table_header *head;
        const struct ctl_table *table;
        /* Current value buffer and its length. */
        void *cur_val;
        size_t cur_len;
        /* New value buffer and its length. */
        void *new_val;
        size_t new_len;
        int new_updated;        /* presumably set once new_val was written -- confirm */
        int write;                /* presumably non-zero for a write access -- confirm */
        loff_t *ppos;
        /* Temporary "register" for indirect stores to ppos. */
        u64 tmp_reg;
};

/* Fixed-size byte buffer of BPF_SOCKOPT_KERN_BUF_SIZE (32) bytes.
 * NOTE(review): presumably a small on-stack buffer for sockopt values so
 * short options need no heap allocation -- confirm against the users.
 */
#define BPF_SOCKOPT_KERN_BUF_SIZE        32
struct bpf_sockopt_buf {
        u8                data[BPF_SOCKOPT_KERN_BUF_SIZE];
};

/* Kernel-side context for sockopt BPF programs: the socket, the option value
 * window [optval, optval_end), and the level/optname/optlen triple.
 */
struct bpf_sockopt_kern {
        struct sock        *sk;
        u8                *optval;
        u8                *optval_end;
        s32                level;
        s32                optname;
        s32                optlen;
        /* for retval in struct bpf_cg_run_ctx */
        struct task_struct *current_task;
        /* Temporary "register" scratch slot.
         * NOTE(review): the previous wording mentioned "ppos", which does not
         * exist in this struct (it looks copied from bpf_sysctl_kern);
         * presumably used for indirect stores during ctx access conversion --
         * confirm.
         */
        u64                tmp_reg;
};

int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);

/* Kernel-side context for SK_LOOKUP BPF programs.
 *
 * Inputs are filled by bpf_sk_lookup_run_v4()/bpf_sk_lookup_run_v6();
 * @selected_sk and @no_reuseport carry the result back and are managed by
 * BPF_PROG_SK_LOOKUP_RUN_ARRAY() between program runs.
 */
struct bpf_sk_lookup_kern {
        u16                family;                /* AF_INET or AF_INET6 */
        u16                protocol;
        __be16                sport;
        u16                dport;                /* note: plain u16, unlike sport */
        /* Addresses for family == AF_INET. */
        struct {
                __be32 saddr;
                __be32 daddr;
        } v4;
        /* Addresses for family == AF_INET6 (pointers, not copies). */
        struct {
                const struct in6_addr *saddr;
                const struct in6_addr *daddr;
        } v6;
        struct sock        *selected_sk;        /* socket picked by a program, or NULL */
        u32                ingress_ifindex;
        bool                no_reuseport;
};

extern struct static_key_false bpf_sk_lookup_enabled;

/* Runners for BPF_SK_LOOKUP programs to invoke on socket lookup.
 *
 * Allowed return values for a BPF SK_LOOKUP program are SK_PASS and
 * SK_DROP. Their meaning is as follows:
 *
 *  SK_PASS && ctx.selected_sk != NULL: use selected_sk as lookup result
 *  SK_PASS && ctx.selected_sk == NULL: continue to htable-based socket lookup
 *  SK_DROP                           : terminate lookup with -ECONNREFUSED
 *
 * This macro aggregates return values and selected sockets from
 * multiple BPF programs according to following rules in order:
 *
 *  1. If any program returned SK_PASS and a non-NULL ctx.selected_sk,
 *     macro result is SK_PASS and last ctx.selected_sk is used.
 *  2. If any program returned SK_DROP,
 *     macro result is SK_DROP.
 *  3. Otherwise result is SK_PASS and ctx.selected_sk is NULL.
 *
 * Arguments:
 *  array - prog array; items[] is walked from index 0 until an item whose
 *          prog pointer reads as NULL
 *  ctx   - struct bpf_sk_lookup_kern lvalue (its address is taken); before
 *          each program runs and again after the walk, selected_sk and
 *          no_reuseport are reset to the most recent accepted selection
 *  func  - runner called as func(prog, &ctx) for each program
 *
 * The walk is bracketed by migrate_disable()/migrate_enable().
 *
 * Caller must ensure that the prog array is non-NULL, and that the
 * array as well as the programs it contains remain valid.
 */
#define BPF_PROG_SK_LOOKUP_RUN_ARRAY(array, ctx, func)                        \
        ({                                                                \
                struct bpf_sk_lookup_kern *_ctx = &(ctx);                \
                struct bpf_prog_array_item *_item;                        \
                struct sock *_selected_sk = NULL;                        \
                bool _no_reuseport = false;                                \
                struct bpf_prog *_prog;                                        \
                bool _all_pass = true;                                        \
                u32 _ret;                                                \
                                                                        \
                migrate_disable();                                        \
                _item = &(array)->items[0];                                \
                while ((_prog = READ_ONCE(_item->prog))) {                \
                        /* restore most recent selection */                \
                        _ctx->selected_sk = _selected_sk;                \
                        _ctx->no_reuseport = _no_reuseport;                \
                                                                        \
                        _ret = func(_prog, _ctx);                        \
                        if (_ret == SK_PASS && _ctx->selected_sk) {        \
                                /* remember last non-NULL socket */        \
                                _selected_sk = _ctx->selected_sk;        \
                                _no_reuseport = _ctx->no_reuseport;        \
                        } else if (_ret == SK_DROP && _all_pass) {        \
                                _all_pass = false;                        \
                        }                                                \
                        _item++;                                        \
                }                                                        \
                _ctx->selected_sk = _selected_sk;                        \
                _ctx->no_reuseport = _no_reuseport;                        \
                migrate_enable();                                        \
                _all_pass || _selected_sk ? SK_PASS : SK_DROP;                \
         })

static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol,
                                        const __be32 saddr, const __be16 sport,
                                        const __be32 daddr, const u16 dport,
                                        const int ifindex, struct sock **psk)
{
        struct bpf_prog_array *run_array;
        struct sock *selected_sk = NULL;
        bool no_reuseport = false;

        rcu_read_lock();
        run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]);
        if (run_array) {
                struct bpf_sk_lookup_kern ctx = {
                        .family                = AF_INET,
                        .protocol        = protocol,
                        .v4.saddr        = saddr,
                        .v4.daddr        = daddr,
                        .sport                = sport,
                        .dport                = dport,
                        .ingress_ifindex        = ifindex,
                };
                u32 act;

                act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run);
                if (act == SK_PASS) {
                        selected_sk = ctx.selected_sk;
                        no_reuseport = ctx.no_reuseport;
                } else {
                        selected_sk = ERR_PTR(-ECONNREFUSED);
                }
        }
        rcu_read_unlock();
        *psk = selected_sk;
        return no_reuseport;
}

#if IS_ENABLED(CONFIG_IPV6)
static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
                                        const struct in6_addr *saddr,
                                        const __be16 sport,
                                        const struct in6_addr *daddr,
                                        const u16 dport,
                                        const int ifindex, struct sock **psk)
{
        struct bpf_prog_array *run_array;
        struct sock *selected_sk = NULL;
        bool no_reuseport = false;

        rcu_read_lock();
        run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]);
        if (run_array) {
                struct bpf_sk_lookup_kern ctx = {
                        .family                = AF_INET6,
                        .protocol        = protocol,
                        .v6.saddr        = saddr,
                        .v6.daddr        = daddr,
                        .sport                = sport,
                        .dport                = dport,
                        .ingress_ifindex        = ifindex,
                };
                u32 act;

                act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run);
                if (act == SK_PASS) {
                        selected_sk = ctx.selected_sk;
                        no_reuseport = ctx.no_reuseport;
                } else {
                        selected_sk = ERR_PTR(-ECONNREFUSED);
                }
        }
        rcu_read_unlock();
        *psk = selected_sk;
        return no_reuseport;
}
#endif /* IS_ENABLED(CONFIG_IPV6) */

/* Common body of the XDP redirect-map helpers.
 *
 * @map:         map to look the target up in
 * @index:       key passed to @lookup_elem and recorded as the target index
 * @flags:       caller flags; the low bits covered by the XDP action mask
 *               double as the return code when the lookup fails
 * @flag_mask:   additional flag bits that are acceptable in @flags
 * @lookup_elem: map-type specific lookup function
 *
 * Returns XDP_ABORTED when @flags has bits outside the action mask and
 * @flag_mask, (@flags & action mask) when the lookup fails and
 * BPF_F_BROADCAST is not set, and XDP_REDIRECT otherwise, after recording the
 * target in the per-context bpf_redirect_info.
 */
static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 index,
                                                   u64 flags, const u64 flag_mask,
                                                   void *lookup_elem(struct bpf_map *map, u32 key))
{
        const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        const bool broadcast = flags & BPF_F_BROADCAST;

        /* Lower bits of the flags are used as return code on lookup failure */
        if (unlikely(flags & ~(action_mask | flag_mask)))
                return XDP_ABORTED;

        ri->tgt_value = lookup_elem(map, index);
        if (unlikely(!ri->tgt_value) && !broadcast) {
                /* A failed lookup wipes the redirect state, so that when an
                 * eBPF program does several lookups the last one always
                 * wins.
                 */
                ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */
                ri->map_type = BPF_MAP_TYPE_UNSPEC;
                return flags & action_mask;
        }

        ri->tgt_index = index;
        ri->map_id = map->id;
        ri->map_type = map->map_type;

        /* Only a broadcast redirect keeps the map pointer and the flags. */
        WRITE_ONCE(ri->map, broadcast ? map : NULL);
        ri->flags = broadcast ? flags : 0;

        return XDP_REDIRECT;
}

/* skb/xdp byte-access helpers.
 *
 * With CONFIG_NET they are only declared here (the definitions live
 * elsewhere). Without CONFIG_NET, the inline stubs below are used instead so
 * that callers still compile.
 */
#ifdef CONFIG_NET
int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len);
int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
                          u32 len, u64 flags);
int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len);
void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
                      void *buf, unsigned long len, bool flush);
#else /* CONFIG_NET */
/* !CONFIG_NET stub: always fails with -EOPNOTSUPP. */
static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset,
                                       void *to, u32 len)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_NET stub: always fails with -EOPNOTSUPP. */
static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset,
                                        const void *from, u32 len, u64 flags)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_NET stub: always fails with -EOPNOTSUPP. */
static inline int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset,
                                       void *buf, u32 len)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_NET stub: always fails with -EOPNOTSUPP. */
static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset,
                                        void *buf, u32 len)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_NET stub: never yields a pointer, always returns NULL. */
static inline void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
{
        return NULL;
}

/* !CONFIG_NET stub: does nothing. */
static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf,
                                    unsigned long len, bool flush)
{
}
#endif /* CONFIG_NET */

#endif /* __LINUX_FILTER_H__ */






























































































    1 
    1 


















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



    1 










































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
// SPDX-License-Identifier: GPL-2.0-only
/*
 * This is a module which is used for queueing packets and communicating with
 * userspace via nfnetlink.
 *
 * (C) 2005 by Harald Welte <laforge@netfilter.org>
 * (C) 2007 by Patrick McHardy <kaber@trash.net>
 *
 * Based on the old ipv4-only ip_queue.c:
 * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
 * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/proc_fs.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_queue.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/list.h>
#include <linux/cgroup-defs.h>
#include <net/gso.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/netfilter/nf_queue.h>
#include <net/netns/generic.h>

#include <linux/atomic.h>

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
#include "../bridge/br_private.h"
#endif

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
#endif

#define NFQNL_QMAX_DEFAULT 1024

/* We're using struct nlattr which has 16bit nla_len. Note that nla_len
 * includes the header length. Thus, the maximum packet length that we
 * support is 65531 bytes. We send truncated packets if the specified length
 * is larger than that.  Userspace can check for presence of NFQA_CAP_LEN
 * attribute to detect truncation.
 */
#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN)

/* One nfnetlink queue instance (one queue number within a netns).
 *
 * Instances live in nfnl_queue_net::instance_table and are freed through
 * RCU (see instance_destroy_rcu()). instance_create() sets the defaults:
 * queue_maxlen = NFQNL_QMAX_DEFAULT, copy_range = NFQNL_MAX_COPY_RANGE,
 * copy_mode = NFQNL_COPY_NONE.
 */
struct nfqnl_instance {
        struct hlist_node hlist;                /* global list of queues */
        struct rcu_head rcu;                        /* deferred free via call_rcu() */

        u32 peer_portid;                        /* portid given to instance_create() */
        unsigned int queue_maxlen;
        unsigned int copy_range;
        unsigned int queue_dropped;                /* presumably a drop counter -- confirm */
        unsigned int queue_user_dropped;        /* presumably a drop counter -- confirm */


        u_int16_t queue_num;                        /* number of this queue */
        u_int8_t copy_mode;
        u_int32_t flags;                        /* Set using NFQA_CFG_FLAGS */
/*
 * Following fields are dirtied for each queued packet,
 * keep them in same cache line if possible.
 */
        spinlock_t        lock        ____cacheline_aligned_in_smp;        /* taken around queue_list/queue_total updates */
        unsigned int        queue_total;                /* entries currently on queue_list */
        unsigned int        id_sequence;                /* 'sequence' of pkt ids */
        struct list_head queue_list;                /* packets in queue */
};

typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);

static unsigned int nfnl_queue_net_id __read_mostly;

/* Per-netns state: a hash table of queue instances with INSTANCE_BUCKETS
 * buckets, indexed by instance_hashfn(queue_num). instances_lock is held
 * while instances are added to or removed from the table.
 */
#define INSTANCE_BUCKETS        16
struct nfnl_queue_net {
        spinlock_t instances_lock;
        struct hlist_head instance_table[INSTANCE_BUCKETS];
};

/* Fetch this module's per-netns state for @net via net_generic(). */
static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
{
        struct nfnl_queue_net *pernet = net_generic(net, nfnl_queue_net_id);

        return pernet;
}

/* Map a queue number to a bucket index in [0, INSTANCE_BUCKETS): the high
 * byte is XOR-folded into the value before reducing modulo the bucket count.
 */
static inline u_int8_t instance_hashfn(u_int16_t queue_num)
{
        const unsigned int folded = (queue_num >> 8) ^ queue_num;

        return folded % INSTANCE_BUCKETS;
}

/* Find the instance with number @queue_num in @q's hash table.
 *
 * Walks the matching bucket with the RCU list iterator and returns the first
 * instance whose queue_num matches, or NULL if there is none.
 * NOTE(review): callers presumably hold rcu_read_lock() or instances_lock
 * (instance_create() holds the latter) -- confirm for the other callers.
 */
static struct nfqnl_instance *
instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
{
        struct hlist_head *bucket = &q->instance_table[instance_hashfn(queue_num)];
        struct nfqnl_instance *cur;

        hlist_for_each_entry_rcu(cur, bucket, hlist) {
                if (cur->queue_num != queue_num)
                        continue;
                return cur;
        }

        return NULL;
}

/* Create queue instance @queue_num owned by netlink peer @portid and publish
 * it in @q's hash table.
 *
 * Everything happens under q->instances_lock, hence the GFP_ATOMIC
 * allocation. A module reference is taken for each instance and dropped
 * again in instance_destroy_rcu().
 *
 * Returns the new instance, or ERR_PTR():
 *  -EEXIST  an instance with this number already exists
 *  -ENOMEM  allocation failed
 *  -EAGAIN  the module reference could not be taken
 */
static struct nfqnl_instance *
instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
{
        struct nfqnl_instance *new_inst;
        int err;

        spin_lock(&q->instances_lock);

        err = -EEXIST;
        if (instance_lookup(q, queue_num))
                goto fail_unlock;

        err = -ENOMEM;
        new_inst = kzalloc(sizeof(*new_inst), GFP_ATOMIC);
        if (!new_inst)
                goto fail_unlock;

        /* Fully initialise the instance before it becomes visible. */
        spin_lock_init(&new_inst->lock);
        INIT_LIST_HEAD(&new_inst->queue_list);
        new_inst->queue_num = queue_num;
        new_inst->peer_portid = portid;
        new_inst->copy_mode = NFQNL_COPY_NONE;
        new_inst->copy_range = NFQNL_MAX_COPY_RANGE;
        new_inst->queue_maxlen = NFQNL_QMAX_DEFAULT;

        err = -EAGAIN;
        if (!try_module_get(THIS_MODULE))
                goto fail_free;

        hlist_add_head_rcu(&new_inst->hlist,
                           &q->instance_table[instance_hashfn(queue_num)]);

        spin_unlock(&q->instances_lock);

        return new_inst;

fail_free:
        kfree(new_inst);
fail_unlock:
        spin_unlock(&q->instances_lock);
        return ERR_PTR(err);
}

static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn,
                        unsigned long data);

/* RCU callback scheduled by __instance_destroy(): flush every packet still
 * queued on the instance (nfqnl_flush() with no filter), free the instance
 * and drop the module reference taken in instance_create().
 */
static void
instance_destroy_rcu(struct rcu_head *head)
{
        struct nfqnl_instance *dying;

        dying = container_of(head, struct nfqnl_instance, rcu);

        rcu_read_lock();
        nfqnl_flush(dying, NULL, 0);
        rcu_read_unlock();

        kfree(dying);
        module_put(THIS_MODULE);
}

/* Unlink @inst from the instance hash table and schedule
 * instance_destroy_rcu() to flush and free it via call_rcu().
 * instance_destroy() calls this with instances_lock held.
 */
static void
__instance_destroy(struct nfqnl_instance *inst)
{
        /* Unlink first, so the instance is off the table before it is freed. */
        hlist_del_rcu(&inst->hlist);
        call_rcu(&inst->rcu, instance_destroy_rcu);
}

/* Locked wrapper: take q->instances_lock around __instance_destroy(). */
static void
instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
{
        spin_lock(&q->instances_lock);
        __instance_destroy(inst);
        spin_unlock(&q->instances_lock);
}

/* Append @entry to the tail of @queue's packet list and account for it in
 * queue_total. No locking is done here.
 * NOTE(review): the caller presumably holds queue->lock -- confirm.
 */
static inline void
__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
{
        queue->queue_total += 1;
        list_add_tail(&entry->list, &queue->queue_list);
}

/* Remove @entry from @queue's packet list and drop it from queue_total.
 * No locking is done here; find_dequeue_entry() calls this with queue->lock
 * held.
 */
static void
__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
{
        queue->queue_total -= 1;
        list_del(&entry->list);
}

/* Look up the queued packet with packet id @id and take it off @queue.
 *
 * Search and removal both happen under queue->lock (BH disabled). Returns
 * the dequeued entry, or NULL if no entry with that id is queued.
 */
static struct nf_queue_entry *
find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
{
        struct nf_queue_entry *found = NULL;
        struct nf_queue_entry *cur;

        spin_lock_bh(&queue->lock);

        list_for_each_entry(cur, &queue->queue_list, list) {
                if (cur->id != id)
                        continue;
                found = cur;
                break;
        }

        if (found)
                __dequeue_entry(queue, found);

        spin_unlock_bh(&queue->lock);

        return found;
}

/* Run the hooks in @hooks on @skb, starting at position *@index.
 *
 * A hook answering NF_REPEAT is called again; any other verdict than
 * NF_ACCEPT stops the walk and is returned, with *@index left at the hook
 * that produced it. If every remaining hook accepts, *@index ends up at the
 * first position past the last hook run and NF_ACCEPT is returned.
 */
static unsigned int nf_iterate(struct sk_buff *skb,
                               struct nf_hook_state *state,
                               const struct nf_hook_entries *hooks,
                               unsigned int *index)
{
        unsigned int pos;

        for (pos = *index; pos < hooks->num_hook_entries; pos++) {
                const struct nf_hook_entry *hook = &hooks->hooks[pos];
                unsigned int verdict;

                /* Keep calling the same hook while it asks for a repeat. */
                for (;;) {
                        verdict = nf_hook_entry_hookfn(hook, skb, state);
                        if (verdict == NF_ACCEPT)
                                break;

                        *index = pos;
                        if (verdict != NF_REPEAT)
                                return verdict;
                }
        }

        *index = pos;
        return NF_ACCEPT;
}

/* Return the RCU-protected hook array of @net for protocol family @pf and
 * hook number @hooknum.
 *
 * Only IPv4, IPv6 and (with CONFIG_NETFILTER_FAMILY_BRIDGE) bridge are
 * handled; any other family triggers a one-time warning and yields NULL.
 */
static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum)
{
        if (pf == NFPROTO_IPV4)
                return rcu_dereference(net->nf.hooks_ipv4[hooknum]);

        if (pf == NFPROTO_IPV6)
                return rcu_dereference(net->nf.hooks_ipv6[hooknum]);

#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
        if (pf == NFPROTO_BRIDGE)
                return rcu_dereference(net->nf.hooks_bridge[hooknum]);
#endif

        WARN_ON_ONCE(1);
        return NULL;
}

/* Re-route an IPv4 packet that was queued at LOCAL_OUT if its routing key
 * changed while it was queued.
 *
 * The tos, mark, daddr and saddr saved in the queue entry's reroute info are
 * compared with the packet's current values; on any difference the result of
 * ip_route_me_harder() is returned. In every other case (different hook,
 * nothing changed, or !CONFIG_INET) the function returns 0.
 */
static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
{
#ifdef CONFIG_INET
        const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
        const struct iphdr *iph;

        if (entry->state.hook != NF_INET_LOCAL_OUT)
                return 0;

        iph = ip_hdr(skb);
        if (iph->tos != rt_info->tos ||
            skb->mark != rt_info->mark ||
            iph->daddr != rt_info->daddr ||
            iph->saddr != rt_info->saddr)
                return ip_route_me_harder(entry->state.net, entry->state.sk,
                                          skb, RTN_UNSPEC);
#endif
        return 0;
}

/* Dispatch re-routing of a reinjected packet by protocol family.
 *
 * AF_INET goes through nf_ip_reroute(); AF_INET6 goes through the reroute
 * callback of nf_ipv6_ops when those ops are registered. Returns the
 * callee's result, or 0 for other families or missing IPv6 ops.
 */
static int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry)
{
        const struct nf_ipv6_ops *v6ops;

        if (entry->state.pf == AF_INET)
                return nf_ip_reroute(skb, entry);

        if (entry->state.pf != AF_INET6)
                return 0;

        v6ops = rcu_dereference(nf_ipv6_ops);
        return v6ops ? v6ops->reroute(skb, entry) : 0;
}

/* Resume hook traversal for a previously queued packet.
 *
 * @entry:   queue entry holding the skb and the saved hook state; it is
 *           released with nf_queue_entry_free() on every path
 * @verdict: verdict to apply to the packet
 *
 * caller must hold rcu read-side lock
 */
static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
        const struct nf_hook_entry *hook_entry;
        const struct nf_hook_entries *hooks;
        struct sk_buff *skb = entry->skb;
        const struct net *net;
        unsigned int i;
        int err;
        u8 pf;

        net = entry->state.net;
        pf = entry->state.pf;

        hooks = nf_hook_entries_head(net, pf, entry->state.hook);

        i = entry->hook_index;
        /* No hook array, or the saved index is out of range: warn once and
         * drop the packet.
         */
        if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) {
                kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP);
                nf_queue_entry_free(entry);
                return;
        }

        hook_entry = &hooks->hooks[i];

        /* Continue traversal iff userspace said ok... */
        /* NF_REPEAT: run the hook at the saved index again and use its verdict. */
        if (verdict == NF_REPEAT)
                verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);

        /* An accepted packet may need re-routing; failure turns into a drop. */
        if (verdict == NF_ACCEPT) {
                if (nf_reroute(skb, entry) < 0)
                        verdict = NF_DROP;
        }

        if (verdict == NF_ACCEPT) {
                /* Also entered via goto from the NF_QUEUE case below: step
                 * past the current hook and run the remaining ones.
                 */
next_hook:
                ++i;
                verdict = nf_iterate(skb, &entry->state, hooks, &i);
        }

        switch (verdict & NF_VERDICT_MASK) {
        case NF_ACCEPT:
        case NF_STOP:
                /* Hand the packet to its continuation with BHs disabled. */
                local_bh_disable();
                entry->state.okfn(entry->state.net, entry->state.sk, skb);
                local_bh_enable();
                break;
        case NF_QUEUE:
                /* Queue again at hook i; a return of 1 makes traversal
                 * continue with the next hook (NOTE(review): nf_queue()'s
                 * return contract is not visible here -- confirm).
                 */
                err = nf_queue(skb, &entry->state, i, verdict);
                if (err == 1)
                        goto next_hook;
                break;
        case NF_STOLEN:
                /* Nothing is done with the skb on this path. */
                break;
        default:
                kfree_skb(skb);
        }

        nf_queue_entry_free(entry);
}

/*
 * Reinject @entry with @verdict, first giving the conntrack hook (if one is
 * registered) a chance to override accepting verdicts.
 *
 * For NF_ACCEPT, NF_REPEAT and NF_STOP the ct hook's update() callback is
 * consulted under rcu_read_lock():
 *   - NF_ACCEPT from the hook keeps the caller's verdict (may be REPEAT),
 *   - NF_STOLEN only releases the queue entry,
 *   - anything else replaces the verdict with the hook's masked verdict.
 * All other verdicts are passed to nf_reinject() unchanged.
 */
static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
        const struct nf_ct_hook *ct_hook;
        unsigned int ct_verdict;
        unsigned int masked;

        switch (verdict) {
        case NF_ACCEPT:
        case NF_REPEAT:
        case NF_STOP:
                break;
        default:
                /* not an accepting verdict: conntrack is not consulted */
                nf_reinject(entry, verdict);
                return;
        }

        ct_verdict = verdict;

        rcu_read_lock();
        ct_hook = rcu_dereference(nf_ct_hook);
        if (ct_hook)
                ct_verdict = ct_hook->update(entry->state.net, entry->skb);
        rcu_read_unlock();

        masked = ct_verdict & NF_VERDICT_MASK;

        if (masked == NF_STOLEN) {
                nf_queue_entry_free(entry);
                return;
        }

        /* NF_ACCEPT: follow userspace verdict, could be REPEAT */
        if (masked != NF_ACCEPT)
                verdict = masked;

        nf_reinject(entry, verdict);
}

/*
 * Drop queued entries of @queue.
 *
 * Every entry for which @cmpfn(entry, @data) is non-zero is unlinked and
 * reinjected with NF_DROP; a NULL @cmpfn selects all entries.  The walk is
 * done with queue->lock held (BH disabled).
 */
static void
nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
{
        struct nf_queue_entry *cur, *tmp;

        spin_lock_bh(&queue->lock);
        list_for_each_entry_safe(cur, tmp, &queue->queue_list, list) {
                if (cmpfn && !cmpfn(cur, data))
                        continue;

                list_del(&cur->list);
                queue->queue_total--;
                nfqnl_reinject(cur, NF_DROP);
        }
        spin_unlock_bh(&queue->lock);
}

/*
 * Emit the NFQA_SKB_INFO attribute describing checksum/GSO state of @packet.
 *
 * CHECKSUM_PARTIAL takes precedence over @csum_verify; the GSO flag is added
 * independently.  Nothing is emitted (and 0 is returned) when no flag is set,
 * otherwise the result of nla_put_be32() is returned.
 */
static int
nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet,
                      bool csum_verify)
{
        __u32 info;

        if (packet->ip_summed == CHECKSUM_PARTIAL)
                info = NFQA_SKB_CSUMNOTREADY;
        else
                info = csum_verify ? NFQA_SKB_CSUM_NOTVERIFIED : 0;

        if (skb_is_gso(packet))
                info |= NFQA_SKB_GSO;

        if (!info)
                return 0;

        return nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(info));
}

/*
 * Add NFQA_UID and NFQA_GID for the file owning @sk's socket, if any.
 *
 * Non-full sockets and sockets without an attached file emit nothing.
 * Returns 0 on success (including "nothing emitted") and -1 if an attribute
 * could not be added.  sk_callback_lock is read-held while the credentials
 * are accessed.
 */
static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk)
{
        const struct cred *cred;
        int ret = 0;

        if (!sk_fullsock(sk))
                return 0;

        read_lock_bh(&sk->sk_callback_lock);
        if (sk->sk_socket && sk->sk_socket->file) {
                cred = sk->sk_socket->file->f_cred;

                /* gid is only attempted when the uid went in */
                if (nla_put_be32(skb, NFQA_UID,
                                 htonl(from_kuid_munged(&init_user_ns,
                                                        cred->fsuid))) ||
                    nla_put_be32(skb, NFQA_GID,
                                 htonl(from_kgid_munged(&init_user_ns,
                                                        cred->fsgid))))
                        ret = -1;
        }
        read_unlock_bh(&sk->sk_callback_lock);

        return ret;
}

/*
 * Add NFQA_CGROUP_CLASSID for @sk when CONFIG_CGROUP_NET_CLASSID is enabled
 * and the full socket carries a non-zero classid.
 *
 * Returns -1 if the attribute could not be added, 0 otherwise.
 */
static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk)
{
#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
        u32 classid;

        if (!sk || !sk_fullsock(sk))
                return 0;

        classid = sock_cgroup_classid(&sk->sk_cgrp_data);
        if (!classid)
                return 0;

        if (nla_put_be32(skb, NFQA_CGROUP_CLASSID, htonl(classid)))
                return -1;
#endif
        return 0;
}

/*
 * Resolve the security context for @skb's secmark.
 *
 * With CONFIG_NETWORK_SECMARK enabled and a full socket attached, a non-zero
 * skb->secmark is converted via security_secid_to_secctx(), which stores the
 * context through @secdata.  Returns the context length, or 0 when no
 * context was obtained.
 */
static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata)
{
        u32 ctx_len = 0;
#if IS_ENABLED(CONFIG_NETWORK_SECMARK)
        if (skb == NULL)
                return 0;
        if (!sk_fullsock(skb->sk))
                return 0;

        read_lock_bh(&skb->sk->sk_callback_lock);
        if (skb->secmark != 0)
                security_secid_to_secctx(skb->secmark, secdata, &ctx_len);
        read_unlock_bh(&skb->sk->sk_callback_lock);
#endif
        return ctx_len;
}

/*
 * Netlink space needed for the bridge-specific attributes of @entry.
 *
 * Only PF_BRIDGE entries whose skb has a MAC header set contribute: room for
 * the nested VLAN attribute (two __be16 members) when a VLAN tag is present,
 * plus room for the L2 header when the network header lies beyond the MAC
 * header.  Returns 0 otherwise.
 */
static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry)
{
        struct sk_buff *pkt = entry->skb;
        u32 total = 0;

        if (entry->state.pf != PF_BRIDGE)
                return 0;
        if (!skb_mac_header_was_set(pkt))
                return 0;

        if (skb_vlan_tag_present(pkt)) {
                /* nest header wrapping the TCI and PROTO attributes */
                total += nla_total_size(nla_total_size(sizeof(__be16)) +
                                        nla_total_size(sizeof(__be16)));
        }

        if (pkt->network_header > pkt->mac_header)
                total += nla_total_size((pkt->network_header -
                                         pkt->mac_header));

        return total;
}

/*
 * Emit the bridge-specific attributes of @entry into @skb.
 *
 * For PF_BRIDGE entries with a MAC header set this adds a nested NFQA_VLAN
 * (TCI + PROTO) when a VLAN tag is present and NFQA_L2HDR holding the bytes
 * between the MAC header and the network header.  Returns 0 on success or
 * when nothing applies, -1 if an attribute could not be added.
 */
static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb)
{
        struct sk_buff *pkt = entry->skb;
        struct nlattr *vlan_nest;
        int l2_len;

        if (entry->state.pf != PF_BRIDGE)
                return 0;
        if (!skb_mac_header_was_set(pkt))
                return 0;

        if (skb_vlan_tag_present(pkt)) {
                vlan_nest = nla_nest_start(skb, NFQA_VLAN);
                if (!vlan_nest)
                        return -1;

                if (nla_put_be16(skb, NFQA_VLAN_TCI, htons(pkt->vlan_tci)))
                        return -1;
                if (nla_put_be16(skb, NFQA_VLAN_PROTO, pkt->vlan_proto))
                        return -1;

                nla_nest_end(skb, vlan_nest);
        }

        if (pkt->mac_header < pkt->network_header) {
                l2_len = (int)(pkt->network_header - pkt->mac_header);

                if (nla_put(skb, NFQA_L2HDR, l2_len, skb_mac_header(pkt)))
                        return -1;
        }

        return 0;
}

/*
 * nfqnl_build_packet_message - build the NFQNL_MSG_PACKET netlink message
 * describing a queued packet.
 *
 * @net:           not referenced in this function body
 * @queue:         queue instance; its copy_mode, copy_range, flags and
 *                 queue_num are read
 * @entry:         queue entry whose skb and hook state are described
 * @packet_id_ptr: out parameter; set to the location of the packet_id field
 *                 inside the message so the caller can fill it in later
 *
 * The function first computes an upper bound for the message size, then
 * allocates the netlink skb with GFP_ATOMIC and appends the attributes.
 *
 * Returns the new skb, or NULL on failure.  A security context obtained via
 * nfqnl_get_sk_secctx() is released on both the success and failure paths.
 */
static struct sk_buff *
nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
                           struct nf_queue_entry *entry,
                           __be32 **packet_id_ptr)
{
        size_t size;
        size_t data_len = 0, cap_len = 0;
        unsigned int hlen = 0;
        struct sk_buff *skb;
        struct nlattr *nla;
        struct nfqnl_msg_packet_hdr *pmsg;
        struct nlmsghdr *nlh;
        struct sk_buff *entskb = entry->skb;
        struct net_device *indev;
        struct net_device *outdev;
        struct nf_conn *ct = NULL;
        enum ip_conntrack_info ctinfo = 0;
        const struct nfnl_ct_hook *nfnl_ct;
        bool csum_verify;
        char *secdata = NULL;
        u32 seclen = 0;
        ktime_t tstamp;

        /* Space for the fixed-size attributes that may be emitted below. */
        size = nlmsg_total_size(sizeof(struct nfgenmsg))
                + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
                + nla_total_size(sizeof(u_int32_t))        /* ifindex */
                + nla_total_size(sizeof(u_int32_t))        /* ifindex */
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
                + nla_total_size(sizeof(u_int32_t))        /* ifindex */
                + nla_total_size(sizeof(u_int32_t))        /* ifindex */
#endif
                + nla_total_size(sizeof(u_int32_t))        /* mark */
                + nla_total_size(sizeof(u_int32_t))        /* priority */
                + nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
                + nla_total_size(sizeof(u_int32_t))        /* skbinfo */
#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
                + nla_total_size(sizeof(u_int32_t))        /* classid */
#endif
                + nla_total_size(sizeof(u_int32_t));        /* cap_len */

        /* Room is reserved whenever a timestamp exists; the attribute itself
         * is only emitted for hooks up to NF_INET_FORWARD (see below).
         */
        tstamp = skb_tstamp_cond(entskb, false);
        if (tstamp)
                size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));

        size += nfqnl_get_bridge_size(entry);

        /* Checksum state is only reported as "not verified" for hooks up to
         * NF_INET_FORWARD, or on POST_ROUTING when no socket is attached.
         */
        if (entry->state.hook <= NF_INET_FORWARD ||
           (entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL))
                csum_verify = !skb_csum_unnecessary(entskb);
        else
                csum_verify = false;

        outdev = entry->state.out;

        switch ((enum nfqnl_config_mode)READ_ONCE(queue->copy_mode)) {
        case NFQNL_COPY_META:
        case NFQNL_COPY_NONE:
                break;

        case NFQNL_COPY_PACKET:
                /* Without NFQA_CFG_F_GSO on the queue, a CHECKSUM_PARTIAL
                 * packet goes through skb_checksum_help() before it is
                 * copied; a failure there aborts the message.
                 */
                if (!(queue->flags & NFQA_CFG_F_GSO) &&
                    entskb->ip_summed == CHECKSUM_PARTIAL &&
                    skb_checksum_help(entskb))
                        return NULL;

                /* Copy at most copy_range bytes, never more than the packet. */
                data_len = READ_ONCE(queue->copy_range);
                if (data_len > entskb->len)
                        data_len = entskb->len;

                /* Only hlen bytes are accounted for in the linear area. */
                hlen = skb_zerocopy_headlen(entskb);
                hlen = min_t(unsigned int, hlen, data_len);
                size += sizeof(struct nlattr) + hlen;
                cap_len = entskb->len;
                break;
        }

        nfnl_ct = rcu_dereference(nfnl_ct_hook);

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        if (queue->flags & NFQA_CFG_F_CONNTRACK) {
                if (nfnl_ct != NULL) {
                        ct = nf_ct_get(entskb, &ctinfo);
                        if (ct != NULL)
                                size += nfnl_ct->build_size(ct);
                }
        }
#endif

        if (queue->flags & NFQA_CFG_F_UID_GID) {
                size += (nla_total_size(sizeof(u_int32_t))        /* uid */
                        + nla_total_size(sizeof(u_int32_t)));        /* gid */
        }

        if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) {
                seclen = nfqnl_get_sk_secctx(entskb, &secdata);
                if (seclen)
                        size += nla_total_size(seclen);
        }

        skb = alloc_skb(size, GFP_ATOMIC);
        if (!skb) {
                skb_tx_error(entskb);
                goto nlmsg_failure;
        }

        nlh = nfnl_msg_put(skb, 0, 0,
                           nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET),
                           0, entry->state.pf, NFNETLINK_V0,
                           htons(queue->queue_num));
        if (!nlh) {
                skb_tx_error(entskb);
                kfree_skb(skb);
                goto nlmsg_failure;
        }

        /* The packet header is reserved without a tailroom check; its size
         * was included in the allocation above.  packet_id is left for the
         * caller to fill in through *packet_id_ptr.
         */
        nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg));
        pmsg = nla_data(nla);
        pmsg->hw_protocol        = entskb->protocol;
        pmsg->hook                = entry->state.hook;
        *packet_id_ptr                = &pmsg->packet_id;

        indev = entry->state.in;
        if (indev) {
#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
                if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex)))
                        goto nla_put_failure;
#else
                if (entry->state.pf == PF_BRIDGE) {
                        /* Case 1: indev is physical input device, we need to
                         * look for bridge group (when called from
                         * netfilter_bridge) */
                        if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV,
                                         htonl(indev->ifindex)) ||
                        /* this is the bridge group "brX" */
                        /* rcu_read_lock()ed by __nf_queue */
                            nla_put_be32(skb, NFQA_IFINDEX_INDEV,
                                         htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
                                goto nla_put_failure;
                } else {
                        int physinif;

                        /* Case 2: indev is bridge group, we need to look for
                         * physical device (when called from ipv4) */
                        if (nla_put_be32(skb, NFQA_IFINDEX_INDEV,
                                         htonl(indev->ifindex)))
                                goto nla_put_failure;

                        physinif = nf_bridge_get_physinif(entskb);
                        if (physinif &&
                            nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV,
                                         htonl(physinif)))
                                goto nla_put_failure;
                }
#endif
        }

        if (outdev) {
#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
                if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex)))
                        goto nla_put_failure;
#else
                if (entry->state.pf == PF_BRIDGE) {
                        /* Case 1: outdev is physical output device, we need to
                         * look for bridge group (when called from
                         * netfilter_bridge) */
                        if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV,
                                         htonl(outdev->ifindex)) ||
                        /* this is the bridge group "brX" */
                        /* rcu_read_lock()ed by __nf_queue */
                            nla_put_be32(skb, NFQA_IFINDEX_OUTDEV,
                                         htonl(br_port_get_rcu(outdev)->br->dev->ifindex)))
                                goto nla_put_failure;
                } else {
                        int physoutif;

                        /* Case 2: outdev is bridge group, we need to look for
                         * physical output device (when called from ipv4) */
                        if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV,
                                         htonl(outdev->ifindex)))
                                goto nla_put_failure;

                        physoutif = nf_bridge_get_physoutif(entskb);
                        if (physoutif &&
                            nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV,
                                         htonl(physoutif)))
                                goto nla_put_failure;
                }
#endif
        }

        /* mark and priority are only emitted when non-zero. */
        if (entskb->mark &&
            nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark)))
                goto nla_put_failure;

        if (entskb->priority &&
            nla_put_be32(skb, NFQA_PRIORITY, htonl(entskb->priority)))
                goto nla_put_failure;

        /* Hardware address: requires an input device, skb->dev and a
         * non-empty MAC header.
         */
        if (indev && entskb->dev &&
            skb_mac_header_was_set(entskb) &&
            skb_mac_header_len(entskb) != 0) {
                struct nfqnl_msg_packet_hw phw;
                int len;

                /* zeroed so that unused bytes of the struct are defined */
                memset(&phw, 0, sizeof(phw));
                len = dev_parse_header(entskb, phw.hw_addr);
                if (len) {
                        phw.hw_addrlen = htons(len);
                        if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw))
                                goto nla_put_failure;
                }
        }

        if (nfqnl_put_bridge(entry, skb) < 0)
                goto nla_put_failure;

        if (entry->state.hook <= NF_INET_FORWARD && tstamp) {
                struct nfqnl_msg_packet_timestamp ts;
                struct timespec64 kts = ktime_to_timespec64(tstamp);

                /* seconds and microseconds, both big-endian 64-bit */
                ts.sec = cpu_to_be64(kts.tv_sec);
                ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);

                if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts))
                        goto nla_put_failure;
        }

        if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk &&
            nfqnl_put_sk_uidgid(skb, entskb->sk) < 0)
                goto nla_put_failure;

        if (nfqnl_put_sk_classid(skb, entskb->sk) < 0)
                goto nla_put_failure;

        if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata))
                goto nla_put_failure;

        if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0)
                goto nla_put_failure;

        /* NFQA_CAP_LEN is only sent when the copied payload is shorter than
         * the packet.
         */
        if (cap_len > data_len &&
            nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len)))
                goto nla_put_failure;

        if (nfqnl_put_packet_info(skb, entskb, csum_verify))
                goto nla_put_failure;

        if (data_len) {
                struct nlattr *nla;

                if (skb_tailroom(skb) < sizeof(*nla) + hlen)
                        goto nla_put_failure;

                /* The payload attribute header is written by hand; the
                 * payload itself is attached by skb_zerocopy().
                 */
                nla = skb_put(skb, sizeof(*nla));
                nla->nla_type = NFQA_PAYLOAD;
                nla->nla_len = nla_attr_size(data_len);

                if (skb_zerocopy(skb, entskb, data_len, hlen))
                        goto nla_put_failure;
        }

        nlh->nlmsg_len = skb->len;
        if (seclen)
                security_release_secctx(secdata, seclen);
        return skb;

nla_put_failure:
        skb_tx_error(entskb);
        kfree_skb(skb);
        net_err_ratelimited("nf_queue: error creating packet message\n");
nlmsg_failure:
        if (seclen)
                security_release_secctx(secdata, seclen);
        return NULL;
}

/*
 * Report whether @entry's skb is attached to a conntrack entry that has
 * IPS_DYING set while IPS_CONFIRMED is clear.
 *
 * Always false when conntrack support is not built in or the skb carries no
 * conntrack.
 */
static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        const unsigned long mask = IPS_CONFIRMED | IPS_DYING;
        const struct nf_conn *ct = (void *)skb_nfct(entry->skb);

        if (!ct)
                return false;

        return (ct->status & mask) == IPS_DYING;
#else
        return false;
#endif
}

/*
 * __nfqnl_enqueue_packet - send one packet to userspace and remember it in
 * the queue.
 *
 * @net:   network namespace passed to the message builder and to
 *         nfnetlink_unicast()
 * @queue: target queue instance
 * @entry: queue entry describing the packet
 *
 * Returns 0 when the packet was queued, or when it could not be queued but
 * NFQA_CFG_F_FAIL_OPEN is set (the entry is then reinjected with NF_ACCEPT).
 * Returns -ENOMEM if the netlink message could not be built, -ENOBUFS if the
 * packet is refused (dying unconfirmed conntrack, or queue full without
 * fail-open), or the negative error from nfnetlink_unicast().
 */
static int
__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
                        struct nf_queue_entry *entry)
{
        struct sk_buff *nskb;
        int err = -ENOBUFS;
        __be32 *packet_id_ptr;
        int failopen = 0;

        /* Built before taking queue->lock. */
        nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr);
        if (nskb == NULL) {
                err = -ENOMEM;
                goto err_out;
        }
        spin_lock_bh(&queue->lock);

        if (nf_ct_drop_unconfirmed(entry))
                goto err_out_free_nskb;

        if (queue->queue_total >= queue->queue_maxlen) {
                if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
                        failopen = 1;
                        err = 0;
                } else {
                        queue->queue_dropped++;
                        net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n",
                                             queue->queue_total);
                }
                goto err_out_free_nskb;
        }
        /* Assign the id under the lock and patch it into the message. */
        entry->id = ++queue->id_sequence;
        *packet_id_ptr = htonl(entry->id);

        /* nfnetlink_unicast will either free the nskb or add it to a socket */
        err = nfnetlink_unicast(nskb, net, queue->peer_portid);
        if (err < 0) {
                if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
                        failopen = 1;
                        err = 0;
                } else {
                        queue->queue_user_dropped++;
                }
                /* nskb is no longer ours here, so skip the kfree_skb() */
                goto err_out_unlock;
        }

        __enqueue_entry(queue, entry);

        spin_unlock_bh(&queue->lock);
        return 0;

err_out_free_nskb:
        kfree_skb(nskb);
err_out_unlock:
        spin_unlock_bh(&queue->lock);
        /* Reinjection happens only after queue->lock has been dropped. */
        if (failopen)
                nfqnl_reinject(entry, NF_ACCEPT);
err_out:
        return err;
}

/*
 * Duplicate queue entry @e (e->size bytes, GFP_ATOMIC) and take references
 * for the copy via nf_queue_entry_get_refs().
 *
 * Returns the copy, or NULL if the allocation or the reference grab failed.
 */
static struct nf_queue_entry *
nf_queue_entry_dup(struct nf_queue_entry *e)
{
        struct nf_queue_entry *copy;

        copy = kmemdup(e, e->size, GFP_ATOMIC);
        if (copy && !nf_queue_entry_get_refs(copy)) {
                kfree(copy);
                copy = NULL;
        }

        return copy;
}

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/* For packets coming from bridge netfilter, skb->data has to point at the
 * MAC header before skb_gso_segment() runs.  Otherwise the original MAC
 * header is lost and the segmented skbs are sent to the wrong destination.
 */
static void nf_bridge_adjust_skb_data(struct sk_buff *skb)
{
        if (!nf_bridge_info_get(skb))
                return;

        __skb_push(skb, skb->network_header - skb->mac_header);
}

/* Counterpart of nf_bridge_adjust_skb_data(): pull skb->data forward again
 * by the distance between MAC header and network header.
 */
static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
{
        if (!nf_bridge_info_get(skb))
                return;

        __skb_pull(skb, skb->network_header - skb->mac_header);
}
#else
#define nf_bridge_adjust_skb_data(s) do {} while (0)
#define nf_bridge_adjust_segmented_data(s) do {} while (0)
#endif

/*
 * Enqueue one segment @skb produced by GSO segmentation of @entry's packet.
 *
 * The last segment (skb->next == NULL) reuses @entry itself; on failure the
 * entry's original skb pointer is restored.  Every other segment is unlinked
 * from the list and queued through a duplicate of @entry, which is freed
 * again if queueing fails.
 *
 * Returns 0 on success, -ENOMEM if the entry could not be duplicated, or the
 * error from __nfqnl_enqueue_packet().
 */
static int
__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
                           struct sk_buff *skb, struct nf_queue_entry *entry)
{
        struct nf_queue_entry *clone;
        struct sk_buff *orig_skb;
        int err;

        nf_bridge_adjust_segmented_data(skb);

        if (!skb->next) {
                /* last packet, no need to copy entry */
                orig_skb = entry->skb;
                entry->skb = skb;
                err = __nfqnl_enqueue_packet(net, queue, entry);
                if (err != 0)
                        entry->skb = orig_skb;
                return err;
        }

        skb_mark_not_on_list(skb);

        clone = nf_queue_entry_dup(entry);
        if (!clone)
                return -ENOMEM;

        clone->skb = skb;
        err = __nfqnl_enqueue_packet(net, queue, clone);
        if (err != 0)
                nf_queue_entry_free(clone);

        return err;
}

/*
 * nfqnl_enqueue_packet - queue @entry's packet on queue number @queuenum.
 *
 * Returns -ESRCH if no such queue instance exists and -EINVAL if the queue
 * is in NFQNL_COPY_NONE mode.  Packets that are not GSO, or queues with
 * NFQA_CFG_F_GSO set, are queued as a single message.  Otherwise the packet
 * is segmented and every segment is queued on its own; in that case 0 is
 * returned as soon as at least one segment was queued, else the error
 * (-ENOBUFS if segmentation itself failed).
 */
static int
nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
{
        unsigned int queued;
        struct nfqnl_instance *queue;
        struct sk_buff *skb, *segs, *nskb;
        int err = -ENOBUFS;
        struct net *net = entry->state.net;
        struct nfnl_queue_net *q = nfnl_queue_pernet(net);

        /* rcu_read_lock()ed by nf_hook_thresh */
        queue = instance_lookup(q, queuenum);
        if (!queue)
                return -ESRCH;

        if (queue->copy_mode == NFQNL_COPY_NONE)
                return -EINVAL;

        skb = entry->skb;

        /* skb->protocol is set from the hook family for IPv4/IPv6; other
         * families leave it untouched.
         */
        switch (entry->state.pf) {
        case NFPROTO_IPV4:
                skb->protocol = htons(ETH_P_IP);
                break;
        case NFPROTO_IPV6:
                skb->protocol = htons(ETH_P_IPV6);
                break;
        }

        if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb))
                return __nfqnl_enqueue_packet(net, queue, entry);

        nf_bridge_adjust_skb_data(skb);
        segs = skb_gso_segment(skb, 0);
        /* Does not use PTR_ERR to limit the number of error codes that can be
         * returned by nf_queue.  For instance, callers rely on -ESRCH to
         * mean 'ignore this hook'.
         */
        if (IS_ERR_OR_NULL(segs))
                goto out_err;
        queued = 0;
        err = 0;
        /* Queue segments until the first failure; from then on the
         * remaining segments are only freed.
         */
        skb_list_walk_safe(segs, segs, nskb) {
                if (err == 0)
                        err = __nfqnl_enqueue_packet_gso(net, queue,
                                                        segs, entry);
                if (err == 0)
                        queued++;
                else
                        kfree_skb(segs);
        }

        if (queued) {
                if (err) /* some segments are already queued */
                        nf_queue_entry_free(entry);
                /* the unsegmented original is no longer needed */
                kfree_skb(skb);
                return 0;
        }
 out_err:
        /* Nothing was queued: undo nf_bridge_adjust_skb_data() on the
         * original skb before reporting the error.
         */
        nf_bridge_adjust_segmented_data(skb);
        return err;
}

static int
nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff)
{
        struct sk_buff *nskb;

        if (diff < 0) {
                unsigned int min_len = skb_transport_offset(e->skb);

                if (data_len < min_len)
                        return -EINVAL;

                if (pskb_trim(e->skb, data_len))
                        return -ENOMEM;
        } else if (diff > 0) {
                if (data_len > 0xFFFF)
                        return -EINVAL;
                if (diff > skb_tailroom(e->skb)) {
                        nskb = skb_copy_expand(e->skb, skb_headroom(e->skb),
                                               diff, GFP_ATOMIC);
                        if (!nskb)
                                return -ENOMEM;
                        kfree_skb(e->skb);
                        e->skb = nskb;
                }
                skb_put(e->skb, diff);
        }
        if (skb_ensure_writable(e->skb, data_len))
                return -ENOMEM;
        skb_copy_to_linear_data(e->skb, data, data_len);
        e->skb->ip_summed = CHECKSUM_NONE;
        return 0;
}

/*
 * Set the copy mode and copy range of @queue under queue->lock.
 *
 * NFQNL_COPY_NONE and NFQNL_COPY_META force the range to 0.  For
 * NFQNL_COPY_PACKET a @range of 0 or above NFQNL_MAX_COPY_RANGE selects
 * NFQNL_MAX_COPY_RANGE.  Any other @mode leaves the queue untouched and
 * yields -EINVAL; otherwise 0 is returned.
 */
static int
nfqnl_set_mode(struct nfqnl_instance *queue,
               unsigned char mode, unsigned int range)
{
        int status = 0;

        spin_lock_bh(&queue->lock);

        if (mode == NFQNL_COPY_NONE || mode == NFQNL_COPY_META) {
                queue->copy_mode = mode;
                queue->copy_range = 0;
        } else if (mode == NFQNL_COPY_PACKET) {
                queue->copy_mode = mode;
                if (range != 0 && range <= NFQNL_MAX_COPY_RANGE)
                        queue->copy_range = range;
                else
                        queue->copy_range = NFQNL_MAX_COPY_RANGE;
        } else {
                status = -EINVAL;
        }

        spin_unlock_bh(&queue->lock);

        return status;
}

static int
dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
{
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
        int physinif, physoutif;

        physinif = nf_bridge_get_physinif(entry->skb);
        physoutif = nf_bridge_get_physoutif(entry->skb);

        if (physinif == ifindex || physoutif == ifindex)
                return 1;
#endif
        if (entry->state.in)
                if (entry->state.in->ifindex == ifindex)
                        return 1;
        if (entry->state.out)
                if (entry->state.out->ifindex == ifindex)
                        return 1;

        return 0;
}

/* Walk every queue instance of @net and drop all queued packets whose
 * input or output device index equals @ifindex.
 */
static void
nfqnl_dev_drop(struct net *net, int ifindex)
{
        struct nfnl_queue_net *q = nfnl_queue_pernet(net);
        struct nfqnl_instance *inst;
        struct hlist_head *head;
        int bucket;

        rcu_read_lock();

        for (bucket = 0; bucket < INSTANCE_BUCKETS; bucket++) {
                head = &q->instance_table[bucket];

                hlist_for_each_entry_rcu(inst, head, hlist)
                        nfqnl_flush(inst, dev_cmp, ifindex);
        }

        rcu_read_unlock();
}

/*
 * Netdevice notifier callback.  On NETDEV_DOWN all queued packets that
 * reference the device are dropped; other events are ignored.  Always
 * returns NOTIFY_DONE.
 */
static int
nfqnl_rcv_dev_event(struct notifier_block *this,
                    unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        if (event != NETDEV_DOWN)
                return NOTIFY_DONE;

        /* Drop any packets associated with the downed device */
        nfqnl_dev_drop(dev_net(dev), dev->ifindex);

        return NOTIFY_DONE;
}

/* Notifier block whose callback is nfqnl_rcv_dev_event(), which drops queued
 * packets for a device on NETDEV_DOWN.  NOTE(review): the registration call
 * is not part of this section — confirm where it is registered.
 */
static struct notifier_block nfqnl_dev_notifier = {
        .notifier_call        = nfqnl_rcv_dev_event,
};

static void nfqnl_nf_hook_drop(struct net *net)
{
        struct nfnl_queue_net *q = nfnl_queue_pernet(net);
        int i;

        /* This function is also called on net namespace error unwind,
         * when pernet_ops->init() failed and ->exit() functions of the
         * previous pernet_ops gets called.
         *
         * This may result in a call to nfqnl_nf_hook_drop() before
         * struct nfnl_queue_net was allocated.
         */
        if (!q)
                return;

        for (i = 0; i < INSTANCE_BUCKETS; i++) {
                struct nfqnl_instance *inst;
                struct hlist_head *head = &q->instance_table[i];

                hlist_for_each_entry_rcu(inst, head, hlist)
                        nfqnl_flush(inst, NULL, 0);
        }
}

/*
 * Netlink notifier callback.  When a NETLINK_NETFILTER socket is released
 * (NETLINK_URELEASE), every queue instance bound to that socket's portid is
 * destroyed under instances_lock.  Always returns NOTIFY_DONE.
 */
static int
nfqnl_rcv_nl_event(struct notifier_block *this,
                   unsigned long event, void *ptr)
{
        struct netlink_notify *n = ptr;
        struct nfnl_queue_net *q = nfnl_queue_pernet(n->net);
        struct nfqnl_instance *inst;
        struct hlist_head *head;
        struct hlist_node *tmp;
        int bucket;

        if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER)
                return NOTIFY_DONE;

        /* destroy all instances for this portid */
        spin_lock(&q->instances_lock);
        for (bucket = 0; bucket < INSTANCE_BUCKETS; bucket++) {
                head = &q->instance_table[bucket];

                hlist_for_each_entry_safe(inst, tmp, head, hlist) {
                        if (inst->peer_portid == n->portid)
                                __instance_destroy(inst);
                }
        }
        spin_unlock(&q->instances_lock);

        return NOTIFY_DONE;
}

/* Notifier block whose callback is nfqnl_rcv_nl_event(), which destroys the
 * queue instances of a released NETLINK_NETFILTER socket.  NOTE(review): the
 * registration call is not part of this section — confirm where it is
 * registered.
 */
static struct notifier_block nfqnl_rtnl_notifier = {
        .notifier_call        = nfqnl_rcv_nl_event,
};

/* Attribute policy for the members nested inside NFQA_VLAN; both the TCI and
 * the protocol are declared as 16-bit values.  Used by nfqa_parse_bridge().
 */
static const struct nla_policy nfqa_vlan_policy[NFQA_VLAN_MAX + 1] = {
        [NFQA_VLAN_TCI]                = { .type = NLA_U16},
        [NFQA_VLAN_PROTO]        = { .type = NLA_U16},
};

/* Attribute policy listing the attributes described for a verdict message:
 * the verdict header (length of struct nfqnl_msg_verdict_hdr), 32-bit mark
 * and priority, unspecified-type payload/CT/EXP blobs and a nested VLAN
 * attribute.  NOTE(review): presumably attached to the single-verdict
 * callback; the user of this table is outside this section — confirm.
 */
static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
        [NFQA_VERDICT_HDR]        = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
        [NFQA_MARK]                = { .type = NLA_U32 },
        [NFQA_PAYLOAD]                = { .type = NLA_UNSPEC },
        [NFQA_CT]                = { .type = NLA_UNSPEC },
        [NFQA_EXP]                = { .type = NLA_UNSPEC },
        [NFQA_VLAN]                = { .type = NLA_NESTED },
        [NFQA_PRIORITY]                = { .type = NLA_U32 },
};

/* Attribute policy restricted to the verdict header plus 32-bit mark and
 * priority.  NOTE(review): presumably attached to the batch-verdict
 * callback; the user of this table is outside this section — confirm.
 */
static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
        [NFQA_VERDICT_HDR]        = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
        [NFQA_MARK]                = { .type = NLA_U32 },
        [NFQA_PRIORITY]                = { .type = NLA_U32 },
};

/*
 * Look up queue @queue_num and check that it belongs to netlink port
 * @nlportid.
 *
 * Returns the instance, ERR_PTR(-ENODEV) if the queue does not exist, or
 * ERR_PTR(-EPERM) if it is bound to a different portid.
 */
static struct nfqnl_instance *
verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, u32 nlportid)
{
        struct nfqnl_instance *inst = instance_lookup(q, queue_num);

        if (inst == NULL)
                return ERR_PTR(-ENODEV);

        if (inst->peer_portid == nlportid)
                return inst;

        return ERR_PTR(-EPERM);
}

/*
 * Fetch the verdict header from the parsed attributes.
 *
 * Returns NULL if NFQA_VERDICT_HDR is absent, or if the masked verdict is
 * greater than NF_MAX_VERDICT or equal to NF_STOLEN; otherwise a pointer to
 * the header inside the attribute.
 */
static struct nfqnl_msg_verdict_hdr*
verdicthdr_get(const struct nlattr * const nfqa[])
{
        const struct nlattr *attr = nfqa[NFQA_VERDICT_HDR];
        struct nfqnl_msg_verdict_hdr *vhdr;
        unsigned int code;

        if (attr == NULL)
                return NULL;

        vhdr = nla_data(attr);
        code = ntohl(vhdr->verdict) & NF_VERDICT_MASK;

        if (code == NF_STOLEN || code > NF_MAX_VERDICT)
                return NULL;

        return vhdr;
}

/*
 * Wrap-around aware comparison of packet ids: returns 1 when @id lies after
 * @max, judged by the sign of their 32-bit difference, and 0 otherwise
 * (including id == max).
 */
static int nfq_id_after(unsigned int id, unsigned int max)
{
        int delta = (int)(id - max);

        return delta > 0;
}

static int nfqnl_recv_verdict_batch(struct sk_buff *skb,
                                    const struct nfnl_info *info,
                                    const struct nlattr * const nfqa[])
{
        struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
        u16 queue_num = ntohs(info->nfmsg->res_id);
        struct nf_queue_entry *entry, *tmp;
        struct nfqnl_msg_verdict_hdr *vhdr;
        struct nfqnl_instance *queue;
        unsigned int verdict, maxid;
        LIST_HEAD(batch_list);

        queue = verdict_instance_lookup(q, queue_num,
                                        NETLINK_CB(skb).portid);
        if (IS_ERR(queue))
                return PTR_ERR(queue);

        vhdr = verdicthdr_get(nfqa);
        if (!vhdr)
                return -EINVAL;

        verdict = ntohl(vhdr->verdict);
        maxid = ntohl(vhdr->id);

        spin_lock_bh(&queue->lock);

        list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) {
                if (nfq_id_after(entry->id, maxid))
                        break;
                __dequeue_entry(queue, entry);
                list_add_tail(&entry->list, &batch_list);
        }

        spin_unlock_bh(&queue->lock);

        if (list_empty(&batch_list))
                return -ENOENT;

        list_for_each_entry_safe(entry, tmp, &batch_list, list) {
                if (nfqa[NFQA_MARK])
                        entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));

                if (nfqa[NFQA_PRIORITY])
                        entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));

                nfqnl_reinject(entry, verdict);
        }
        return 0;
}

/*
 * Apply the conntrack attributes of a verdict message to the conntrack entry
 * attached to @entry's skb.
 *
 * NFQA_CT is handed to @nfnl_ct->parse(); if NFQA_EXP is present an
 * expectation is attached as well.  *@ctinfo receives the conntrack info of
 * the skb.  Returns the conntrack entry, or NULL when the skb has none, when
 * parsing failed, or when conntrack support is not built in.
 */
static struct nf_conn *nfqnl_ct_parse(const struct nfnl_ct_hook *nfnl_ct,
                                      const struct nlmsghdr *nlh,
                                      const struct nlattr * const nfqa[],
                                      struct nf_queue_entry *entry,
                                      enum ip_conntrack_info *ctinfo)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        struct nf_conn *ct = nf_ct_get(entry->skb, ctinfo);

        if (!ct || nfnl_ct->parse(nfqa[NFQA_CT], ct) < 0)
                return NULL;

        if (nfqa[NFQA_EXP] != NULL) {
                nfnl_ct->attach_expect(nfqa[NFQA_EXP], ct,
                                       NETLINK_CB(entry->skb).portid,
                                       nlmsg_report(nlh));
        }

        return ct;
#else
        return NULL;
#endif
}

static int nfqa_parse_bridge(struct nf_queue_entry *entry,
                             const struct nlattr * const nfqa[])
{
        if (nfqa[NFQA_VLAN]) {
                struct nlattr *tb[NFQA_VLAN_MAX + 1];
                int err;

                err = nla_parse_nested_deprecated(tb, NFQA_VLAN_MAX,
                                                  nfqa[NFQA_VLAN],
                                                  nfqa_vlan_policy, NULL);
                if (err < 0)
                        return err;

                if (!tb[NFQA_VLAN_TCI] || !tb[NFQA_VLAN_PROTO])
                        return -EINVAL;

                __vlan_hwaccel_put_tag(entry->skb,
                        nla_get_be16(tb[NFQA_VLAN_PROTO]),
                        ntohs(nla_get_be16(tb[NFQA_VLAN_TCI])));
        }

        if (nfqa[NFQA_L2HDR]) {
                int mac_header_len = entry->skb->network_header -
                        entry->skb->mac_header;

                if (mac_header_len != nla_len(nfqa[NFQA_L2HDR]))
                        return -EINVAL;
                else if (mac_header_len > 0)
                        memcpy(skb_mac_header(entry->skb),
                               nla_data(nfqa[NFQA_L2HDR]),
                               mac_header_len);
        }

        return 0;
}

/*
 * Handle an NFQNL_MSG_VERDICT message.
 *
 * Looks up the queue named by the message's res_id, takes the queued entry
 * whose id is in the verdict header, applies the optional conntrack, bridge,
 * payload, mark and priority attributes, and reinjects the packet with the
 * requested verdict.
 *
 * Returns 0 once the packet was reinjected, the error encoded by
 * verdict_instance_lookup(), -EINVAL when the verdict header is missing,
 * -ENOENT when no queued entry has the given id, or the error reported by
 * nfqa_parse_bridge() (in which case the packet is dropped).
 */
static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
                              const struct nlattr * const nfqa[])
{
        struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
        u_int16_t queue_num = ntohs(info->nfmsg->res_id);
        const struct nfnl_ct_hook *nfnl_ct;
        struct nfqnl_msg_verdict_hdr *vhdr;
        enum ip_conntrack_info ctinfo;
        struct nfqnl_instance *queue;
        struct nf_queue_entry *entry;
        struct nf_conn *ct = NULL;
        unsigned int verdict;
        int err;

        queue = verdict_instance_lookup(q, queue_num,
                                        NETLINK_CB(skb).portid);
        if (IS_ERR(queue))
                return PTR_ERR(queue);

        vhdr = verdicthdr_get(nfqa);
        if (!vhdr)
                return -EINVAL;

        verdict = ntohl(vhdr->verdict);

        entry = find_dequeue_entry(queue, ntohl(vhdr->id));
        if (entry == NULL)
                return -ENOENT;

        /* rcu lock already held from nfnl->call_rcu. */
        nfnl_ct = rcu_dereference(nfnl_ct_hook);

        if (nfqa[NFQA_CT]) {
                if (nfnl_ct != NULL)
                        ct = nfqnl_ct_parse(nfnl_ct, info->nlh, nfqa, entry,
                                            &ctinfo);
        }

        if (entry->state.pf == PF_BRIDGE) {
                err = nfqa_parse_bridge(entry, nfqa);
                if (err < 0) {
                        /* find_dequeue_entry() handed the entry to us
                         * (presumably unlinked from the queue, as the name
                         * says), so nothing else will release it.  Drop the
                         * packet rather than leak the entry and its skb.
                         */
                        nfqnl_reinject(entry, NF_DROP);
                        return err;
                }
        }

        if (nfqa[NFQA_PAYLOAD]) {
                u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]);
                int diff = payload_len - entry->skb->len;

                /* A payload that cannot be applied turns the verdict into a drop. */
                if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]),
                                 payload_len, entry, diff) < 0)
                        verdict = NF_DROP;

                /* ct is only non-NULL when nfnl_ct was non-NULL above. */
                if (ct && diff)
                        nfnl_ct->seq_adjust(entry->skb, ct, ctinfo, diff);
        }

        if (nfqa[NFQA_MARK])
                entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));

        if (nfqa[NFQA_PRIORITY])
                entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));

        nfqnl_reinject(entry, verdict);
        return 0;
}

/*
 * Callback installed for NFQNL_MSG_PACKET in nfqnl_cb: ignores its arguments
 * and unconditionally fails with -ENOTSUPP.
 */
static int nfqnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info,
                             const struct nlattr * const cda[])
{
        return -ENOTSUPP;
}

/*
 * Attribute policy for NFQNL_MSG_CONFIG messages (referenced from nfqnl_cb).
 * CMD and PARAMS are sized after their message structs; the remaining
 * attributes are 32-bit values.
 */
static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
        [NFQA_CFG_CMD]                = { .len = sizeof(struct nfqnl_msg_config_cmd) },
        [NFQA_CFG_PARAMS]        = { .len = sizeof(struct nfqnl_msg_config_params) },
        [NFQA_CFG_QUEUE_MAXLEN]        = { .type = NLA_U32 },
        [NFQA_CFG_MASK]                = { .type = NLA_U32 },
        [NFQA_CFG_FLAGS]        = { .type = NLA_U32 },
};

/* Queue handler passed to nf_register_queue_handler() at module init. */
static const struct nf_queue_handler nfqh = {
        .outfn                = nfqnl_enqueue_packet,
        .nf_hook_drop        = nfqnl_nf_hook_drop,
};

/*
 * Handle an NFQNL_MSG_CONFIG message: bind/unbind a queue instance and/or
 * update its copy mode, maximum length and flags.
 *
 * Return values visible in this function:
 *   0            success (also for the obsolete PF_BIND/PF_UNBIND commands)
 *   -EINVAL      NFQA_CFG_FLAGS given without NFQA_CFG_MASK
 *   -EOPNOTSUPP  unsupported flag value, or a requested feature is unavailable
 *   -EAGAIN      the conntrack hook appeared after request_module()
 *   -EPERM       the queue exists but belongs to a different netlink portid
 *   -EBUSY       BIND on a queue number that already has an instance
 *   -ENODEV      UNBIND or parameter update without an existing instance
 *   -ENOTSUPP    unknown command
 *   or the error encoded by instance_create().
 */
static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
                             const struct nlattr * const nfqa[])
{
        struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
        u_int16_t queue_num = ntohs(info->nfmsg->res_id);
        struct nfqnl_msg_config_cmd *cmd = NULL;
        struct nfqnl_instance *queue;
        __u32 flags = 0, mask = 0;
        int ret = 0;

        if (nfqa[NFQA_CFG_CMD]) {
                cmd = nla_data(nfqa[NFQA_CFG_CMD]);

                /* Obsolete commands without queue context */
                switch (cmd->command) {
                case NFQNL_CFG_CMD_PF_BIND: return 0;
                case NFQNL_CFG_CMD_PF_UNBIND: return 0;
                }
        }

        /* Check if we support these flags in first place, dependencies should
         * be there too not to break atomicity.
         */
        if (nfqa[NFQA_CFG_FLAGS]) {
                if (!nfqa[NFQA_CFG_MASK]) {
                        /* A mask is needed to specify which flags are being
                         * changed.
                         */
                        return -EINVAL;
                }

                flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS]));
                mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK]));

                if (flags >= NFQA_CFG_F_MAX)
                        return -EOPNOTSUPP;

#if !IS_ENABLED(CONFIG_NETWORK_SECMARK)
                if (flags & mask & NFQA_CFG_F_SECCTX)
                        return -EOPNOTSUPP;
#endif
                /* Conntrack flag requested but no conntrack netlink hook yet. */
                if ((flags & mask & NFQA_CFG_F_CONNTRACK) &&
                    !rcu_access_pointer(nfnl_ct_hook)) {
#ifdef CONFIG_MODULES
                        /* The subsystem lock is released around the module
                         * load and retaken afterwards; if the hook is now
                         * present the caller is told to retry.
                         */
                        nfnl_unlock(NFNL_SUBSYS_QUEUE);
                        request_module("ip_conntrack_netlink");
                        nfnl_lock(NFNL_SUBSYS_QUEUE);
                        if (rcu_access_pointer(nfnl_ct_hook))
                                return -EAGAIN;
#endif
                        return -EOPNOTSUPP;
                }
        }

        /* Everything below runs inside one RCU read-side section. */
        rcu_read_lock();
        queue = instance_lookup(q, queue_num);
        /* Only the portid recorded as the queue's peer may reconfigure it. */
        if (queue && queue->peer_portid != NETLINK_CB(skb).portid) {
                ret = -EPERM;
                goto err_out_unlock;
        }

        if (cmd != NULL) {
                switch (cmd->command) {
                case NFQNL_CFG_CMD_BIND:
                        if (queue) {
                                ret = -EBUSY;
                                goto err_out_unlock;
                        }
                        queue = instance_create(q, queue_num,
                                                NETLINK_CB(skb).portid);
                        if (IS_ERR(queue)) {
                                ret = PTR_ERR(queue);
                                goto err_out_unlock;
                        }
                        break;
                case NFQNL_CFG_CMD_UNBIND:
                        if (!queue) {
                                ret = -ENODEV;
                                goto err_out_unlock;
                        }
                        instance_destroy(q, queue);
                        /* Successful unbind: leave with ret still 0 and skip
                         * the parameter updates below.
                         */
                        goto err_out_unlock;
                case NFQNL_CFG_CMD_PF_BIND:
                case NFQNL_CFG_CMD_PF_UNBIND:
                        break;
                default:
                        ret = -ENOTSUPP;
                        goto err_out_unlock;
                }
        }

        /* The remaining attributes all modify an existing instance. */
        if (!queue) {
                ret = -ENODEV;
                goto err_out_unlock;
        }

        if (nfqa[NFQA_CFG_PARAMS]) {
                struct nfqnl_msg_config_params *params =
                        nla_data(nfqa[NFQA_CFG_PARAMS]);

                nfqnl_set_mode(queue, params->copy_mode,
                                ntohl(params->copy_range));
        }

        if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) {
                __be32 *queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]);

                spin_lock_bh(&queue->lock);
                queue->queue_maxlen = ntohl(*queue_maxlen);
                spin_unlock_bh(&queue->lock);
        }

        if (nfqa[NFQA_CFG_FLAGS]) {
                /* Only the bits selected by mask are replaced. */
                spin_lock_bh(&queue->lock);
                queue->flags &= ~mask;
                queue->flags |= flags & mask;
                spin_unlock_bh(&queue->lock);
        }

err_out_unlock:
        /* Shared exit for both the success and the error paths. */
        rcu_read_unlock();
        return ret;
}

/*
 * Per-message-type callbacks of the queue subsystem, indexed by NFQNL_MSG_*.
 * Only NFQNL_MSG_CONFIG is of type NFNL_CB_MUTEX; the others are NFNL_CB_RCU.
 */
static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
        /* No policy is set for this type; the handler always fails. */
        [NFQNL_MSG_PACKET]        = {
                .call                = nfqnl_recv_unsupp,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFQA_MAX,
        },
        [NFQNL_MSG_VERDICT]        = {
                .call                = nfqnl_recv_verdict,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFQA_MAX,
                .policy                = nfqa_verdict_policy
        },
        [NFQNL_MSG_CONFIG]        = {
                .call                = nfqnl_recv_config,
                .type                = NFNL_CB_MUTEX,
                .attr_count        = NFQA_CFG_MAX,
                .policy                = nfqa_cfg_policy
        },
        [NFQNL_MSG_VERDICT_BATCH] = {
                .call                = nfqnl_recv_verdict_batch,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFQA_MAX,
                .policy                = nfqa_verdict_batch_policy
        },
};

/*
 * nfnetlink subsystem descriptor; registered with nfnetlink_subsys_register()
 * at module init and unregistered at module exit.
 */
static const struct nfnetlink_subsystem nfqnl_subsys = {
        .name                = "nf_queue",
        .subsys_id        = NFNL_SUBSYS_QUEUE,
        .cb_count        = NFQNL_MSG_MAX,
        .cb                = nfqnl_cb,
};

#ifdef CONFIG_PROC_FS
/* Private seq_file state for the nfnetlink_queue proc file iterator. */
struct iter_state {
        struct seq_net_private p;
        /* Index into instance_table of the bucket currently being walked. */
        unsigned int bucket;
};

/*
 * Position the iterator on the first non-empty bucket of the instance table
 * and return that bucket's first node.
 *
 * Returns NULL when there is no iterator state or every bucket is empty; in
 * the latter case st->bucket is left at INSTANCE_BUCKETS.
 */
static struct hlist_node *get_first(struct seq_file *seq)
{
        struct iter_state *st = seq->private;
        struct nfnl_queue_net *q;

        if (st == NULL)
                return NULL;

        q = nfnl_queue_pernet(seq_file_net(seq));

        st->bucket = 0;
        while (st->bucket < INSTANCE_BUCKETS) {
                if (!hlist_empty(&q->instance_table[st->bucket]))
                        return q->instance_table[st->bucket].first;
                st->bucket++;
        }

        return NULL;
}

static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
{
        struct iter_state *st = seq->private;
        struct net *net = seq_file_net(seq);

        h = h->next;
        while (!h) {
                struct nfnl_queue_net *q;

                if (++st->bucket >= INSTANCE_BUCKETS)
                        return NULL;

                q = nfnl_queue_pernet(net);
                h = q->instance_table[st->bucket].first;
        }
        return h;
}

/*
 * Return the node that is @pos steps after the first one, or NULL when the
 * table holds fewer nodes than that.
 */
static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
{
        struct hlist_node *node = get_first(seq);

        for (; node && pos; pos--) {
                node = get_next(seq, node);
                /* Ran off the end before consuming all of pos. */
                if (!node)
                        return NULL;
        }

        /* Either the table was empty (node is NULL) or pos reached zero. */
        return node;
}

/*
 * seq_file ->start: take the per-net instances_lock (released in seq_stop())
 * and return the node at position *pos.
 */
static void *seq_start(struct seq_file *s, loff_t *pos)
        __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock)
{
        struct nfnl_queue_net *q = nfnl_queue_pernet(seq_file_net(s));

        spin_lock(&q->instances_lock);

        return get_idx(s, *pos);
}

/* seq_file ->next: bump the position and return the node following @v. */
static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
{
        struct hlist_node *cur = v;

        ++*pos;

        return get_next(s, cur);
}

/* seq_file ->stop: release the instances_lock taken in seq_start(). */
static void seq_stop(struct seq_file *s, void *v)
        __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock)
{
        struct nfnl_queue_net *q = nfnl_queue_pernet(seq_file_net(s));

        spin_unlock(&q->instances_lock);
}

/*
 * seq_file ->show: print one line for the queue instance @v.
 *
 * Columns, in order: queue number, peer portid, queue total, copy mode,
 * copy range, queue dropped, user dropped, id sequence, and a constant 1.
 */
static int seq_show(struct seq_file *s, void *v)
{
        const struct nfqnl_instance *queue = v;

        seq_printf(s, "%5u %6u %5u %1u %5u %5u %5u %8u %2d\n",
                   queue->queue_num, queue->peer_portid,
                   queue->queue_total,
                   queue->copy_mode, queue->copy_range,
                   queue->queue_dropped,
                   queue->queue_user_dropped,
                   queue->id_sequence,
                   1);

        return 0;
}

/* seq_file operations handed to proc_create_net() for "nfnetlink_queue". */
static const struct seq_operations nfqnl_seq_ops = {
        .start        = seq_start,
        .next        = seq_next,
        .stop        = seq_stop,
        .show        = seq_show,
};
#endif /* PROC_FS */

/*
 * Per-netns init: reset every instance hash bucket, initialise the
 * instances lock and, with CONFIG_PROC_FS, create the "nfnetlink_queue"
 * proc entry.
 *
 * Returns 0 on success or -ENOMEM if the proc entry cannot be created.
 */
static int __net_init nfnl_queue_net_init(struct net *net)
{
        struct nfnl_queue_net *q = nfnl_queue_pernet(net);
        unsigned int bucket;

        spin_lock_init(&q->instances_lock);

        for (bucket = 0; bucket != INSTANCE_BUCKETS; ++bucket)
                INIT_HLIST_HEAD(&q->instance_table[bucket]);

#ifdef CONFIG_PROC_FS
        if (proc_create_net("nfnetlink_queue", 0440, net->nf.proc_netfilter,
                            &nfqnl_seq_ops, sizeof(struct iter_state)) == NULL)
                return -ENOMEM;
#endif

        return 0;
}

/*
 * Per-netns exit: remove the proc entry (with CONFIG_PROC_FS) and warn once
 * if any instance bucket is still populated.
 */
static void __net_exit nfnl_queue_net_exit(struct net *net)
{
        struct nfnl_queue_net *q = nfnl_queue_pernet(net);
        unsigned int bucket = 0;

#ifdef CONFIG_PROC_FS
        remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
#endif

        while (bucket < INSTANCE_BUCKETS) {
                WARN_ON_ONCE(!hlist_empty(&q->instance_table[bucket]));
                bucket++;
        }
}

/*
 * Per-network-namespace operations; registered with register_pernet_subsys()
 * at module init.  .size is the size of the per-net struct nfnl_queue_net.
 */
static struct pernet_operations nfnl_queue_net_ops = {
        .init                = nfnl_queue_net_init,
        .exit                = nfnl_queue_net_exit,
        .id                = &nfnl_queue_net_id,
        .size                = sizeof(struct nfnl_queue_net),
};

/*
 * Module init.  Registers, in order: the pernet ops, the netlink notifier,
 * the nfnetlink subsystem, the netdevice notifier and finally the queue
 * handler.  On failure, the steps that already succeeded are undone in
 * reverse order through the cleanup labels.
 *
 * Returns 0 on success or the negative status of the step that failed.
 */
static int __init nfnetlink_queue_init(void)
{
        int status;

        status = register_pernet_subsys(&nfnl_queue_net_ops);
        if (status < 0) {
                pr_err("failed to register pernet ops\n");
                goto out;
        }

        /* The return value of this registration is not checked. */
        netlink_register_notifier(&nfqnl_rtnl_notifier);
        status = nfnetlink_subsys_register(&nfqnl_subsys);
        if (status < 0) {
                pr_err("failed to create netlink socket\n");
                goto cleanup_netlink_notifier;
        }

        status = register_netdevice_notifier(&nfqnl_dev_notifier);
        if (status < 0) {
                pr_err("failed to register netdevice notifier\n");
                goto cleanup_netlink_subsys;
        }

        nf_register_queue_handler(&nfqh);

        return status;

        /* Error unwinding: each label undoes one more earlier step. */
cleanup_netlink_subsys:
        nfnetlink_subsys_unregister(&nfqnl_subsys);
cleanup_netlink_notifier:
        netlink_unregister_notifier(&nfqnl_rtnl_notifier);
        unregister_pernet_subsys(&nfnl_queue_net_ops);
out:
        return status;
}

/*
 * Module exit.  Unregisters everything nfnetlink_queue_init() registered, in
 * the reverse order, then waits for outstanding RCU callbacks.
 */
static void __exit nfnetlink_queue_fini(void)
{
        nf_unregister_queue_handler();
        unregister_netdevice_notifier(&nfqnl_dev_notifier);
        nfnetlink_subsys_unregister(&nfqnl_subsys);
        netlink_unregister_notifier(&nfqnl_rtnl_notifier);
        unregister_pernet_subsys(&nfnl_queue_net_ops);

        rcu_barrier(); /* Wait for completion of call_rcu()'s */
}

/* Module metadata, nfnetlink subsystem alias and init/exit entry points. */
MODULE_DESCRIPTION("netfilter packet queue handler");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE);

module_init(nfnetlink_queue_init);
module_exit(nfnetlink_queue_fini);











    1 

































    7 

































    1 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
/**
 * css_get - take one more reference on a css
 * @css: target css
 *
 * Does nothing for a css flagged CSS_NO_REF.  The caller must already hold
 * a reference.
 */
CGROUP_REF_FN_ATTRS
void css_get(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_get(&css->refcnt);
}
CGROUP_REF_EXPORT(css_get)

/**
 * css_get_many - take several references on a css at once
 * @css: target css
 * @n: number of references to get
 *
 * Does nothing for a css flagged CSS_NO_REF.  The caller must already hold
 * a reference.
 */
CGROUP_REF_FN_ATTRS
void css_get_many(struct cgroup_subsys_state *css, unsigned int n)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_get_many(&css->refcnt, n);
}
CGROUP_REF_EXPORT(css_get_many)

/**
 * css_tryget - try to take a reference on a css
 * @css: target css
 *
 * Takes a reference on @css unless its count has already dropped to zero
 * and it is being released; whether @css is online or offline is not
 * considered.  The caller has to guarantee that @css is accessible but need
 * not hold a reference itself - RCU protected access is sufficient.
 *
 * Returns %true if a reference was obtained (always the case for a css
 * flagged CSS_NO_REF); %false otherwise.
 */
CGROUP_REF_FN_ATTRS
bool css_tryget(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return true;

        return percpu_ref_tryget(&css->refcnt);
}
CGROUP_REF_EXPORT(css_tryget)

/**
 * css_tryget_online - try to take a reference on a css that is still online
 * @css: target css
 *
 * Takes a reference on @css if it is online.  The caller has to guarantee
 * that @css is accessible but need not hold a reference itself - RCU
 * protected access is sufficient.
 *
 * Returns %true if a reference was obtained (always the case for a css
 * flagged CSS_NO_REF); %false otherwise.
 */
CGROUP_REF_FN_ATTRS
bool css_tryget_online(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return true;

        return percpu_ref_tryget_live(&css->refcnt);
}
CGROUP_REF_EXPORT(css_tryget_online)

/**
 * css_put - drop one css reference
 * @css: target css
 *
 * Releases a reference obtained via css_get() and css_tryget_online().
 * Does nothing for a css flagged CSS_NO_REF.
 */
CGROUP_REF_FN_ATTRS
void css_put(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_put(&css->refcnt);
}
CGROUP_REF_EXPORT(css_put)

/**
 * css_put_many - drop several css references at once
 * @css: target css
 * @n: number of references to put
 *
 * Releases references obtained via css_get() and css_tryget_online().
 * Does nothing for a css flagged CSS_NO_REF.
 */
CGROUP_REF_FN_ATTRS
void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_put_many(&css->refcnt, n);
}
CGROUP_REF_EXPORT(css_put_many)






























































    5 



    7 












































































































































    5 










































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_FIND_H_
#define __LINUX_FIND_H_

#ifndef __LINUX_BITMAP_H
#error only <linux/bitmap.h> can be included directly
#endif

#include <linux/bitops.h>

/*
 * Out-of-line search routines.  The inline find_*() wrappers in this header
 * call the matching routine here whenever small_const_nbits(size) is false.
 */
unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits,
                                unsigned long start);
unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start);
unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start);
unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start);
unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
                                         unsigned long start);
extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n);
unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                unsigned long size, unsigned long n);
unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long size, unsigned long n);
unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        const unsigned long *addr3, unsigned long size,
                                        unsigned long n);
extern unsigned long _find_first_and_bit(const unsigned long *addr1,
                                         const unsigned long *addr2, unsigned long size);
unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                      const unsigned long *addr3, unsigned long size);
extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);

/* The _le variants are only declared for big-endian builds. */
#ifdef __BIG_ENDIAN
unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size);
unsigned long _find_next_zero_bit_le(const  unsigned long *addr, unsigned
                                        long size, unsigned long offset);
unsigned long _find_next_bit_le(const unsigned long *addr, unsigned
                                long size, unsigned long offset);
#endif

#ifndef find_next_bit
/**
 * find_next_bit - find the next set bit in a memory region
 * @addr: The address to base the search on
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Bitmaps for which small_const_nbits(@size) holds are handled inline on
 * the first word; everything else is delegated to _find_next_bit().
 *
 * Returns the bit number for the next set bit
 * If no bits are set, returns @size.
 */
static inline
unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
                            unsigned long offset)
{
        unsigned long word;

        if (!small_const_nbits(size))
                return _find_next_bit(addr, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* Keep only bits offset..size-1 of the single word. */
        word = *addr & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#ifndef find_next_and_bit
/**
 * find_next_and_bit - find the next set bit in both memory regions
 * @addr1: The first address to base the search on
 * @addr2: The second address to base the search on
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Bitmaps for which small_const_nbits(@size) holds are handled inline on
 * the first word; everything else is delegated to _find_next_and_bit().
 *
 * Returns the bit number for the next set bit
 * If no bits are set, returns @size.
 */
static inline
unsigned long find_next_and_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
                unsigned long offset)
{
        unsigned long word;

        if (!small_const_nbits(size))
                return _find_next_and_bit(addr1, addr2, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* Bits set in both words, limited to offset..size-1. */
        word = *addr1 & *addr2 & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#ifndef find_next_andnot_bit
/**
 * find_next_andnot_bit - find the next bit set in *addr1 and clear in *addr2
 * @addr1: The first address to base the search on
 * @addr2: The second address to base the search on
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Bitmaps for which small_const_nbits(@size) holds are handled inline on
 * the first word; everything else is delegated to _find_next_andnot_bit().
 *
 * Returns the bit number for the next set bit
 * If no bits are set, returns @size.
 */
static inline
unsigned long find_next_andnot_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
                unsigned long offset)
{
        unsigned long word;

        if (!small_const_nbits(size))
                return _find_next_andnot_bit(addr1, addr2, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* Bits of *addr1 not masked out by *addr2, limited to offset..size-1. */
        word = *addr1 & ~*addr2 & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#ifndef find_next_or_bit
/**
 * find_next_or_bit - find the next set bit in either memory region
 * @addr1: The first address to base the search on
 * @addr2: The second address to base the search on
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Bitmaps for which small_const_nbits(@size) holds are handled inline on
 * the first word; everything else is delegated to _find_next_or_bit().
 *
 * Returns the bit number for the next set bit
 * If no bits are set, returns @size.
 */
static inline
unsigned long find_next_or_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
                unsigned long offset)
{
        unsigned long word;

        if (!small_const_nbits(size))
                return _find_next_or_bit(addr1, addr2, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* Union of both words, limited to offset..size-1. */
        word = (*addr1 | *addr2) & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#ifndef find_next_zero_bit
/**
 * find_next_zero_bit - find the next cleared bit in a memory region
 * @addr: The address to base the search on
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Bitmaps for which small_const_nbits(@size) holds are handled inline on
 * the first word; everything else is delegated to _find_next_zero_bit().
 *
 * Returns the bit number of the next zero bit
 * If no bits are zero, returns @size.
 */
static inline
unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
                                 unsigned long offset)
{
        unsigned long word;

        if (!small_const_nbits(size))
                return _find_next_zero_bit(addr, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* Force every bit outside offset..size-1 to one so it is never picked. */
        word = *addr | ~GENMASK(size - 1, offset);
        if (word == ~0UL)
                return size;

        return ffz(word);
}
#endif

#ifndef find_first_bit
/**
 * find_first_bit - find the first set bit in a memory region
 * @addr: The address to start the search at
 * @size: The maximum number of bits to search
 *
 * Bitmaps for which small_const_nbits(@size) holds are handled inline on
 * the first word; everything else is delegated to _find_first_bit().
 *
 * Returns the bit number of the first set bit.
 * If no bits are set, returns @size.
 */
static inline
unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long word;

        if (!small_const_nbits(size))
                return _find_first_bit(addr, size);

        /* Ignore bits at or above size. */
        word = *addr & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

/**
 * find_nth_bit - find N'th set bit in a memory region
 * @addr: The address to start the search at
 * @size: The maximum number of bits to search
 * @n: The number of set bit, which position is needed, counting from 0
 *
 * The following is semantically equivalent:
 *         idx = find_nth_bit(addr, size, 0);
 *         idx = find_first_bit(addr, size);
 *
 * Returns @size right away when @n >= @size or, on the inline single-word
 * path, when no bit below @size is set.  Otherwise the result comes from
 * fns() or __find_nth_bit().
 *
 * Returns the bit number of the N'th set bit.
 * If no such, returns >= @size.
 */
static inline
unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n)
{
        unsigned long word;

        if (n >= size)
                return size;

        if (!small_const_nbits(size))
                return __find_nth_bit(addr, size, n);

        /* Ignore bits at or above size. */
        word = *addr & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return fns(word, n);
}

/**
 * find_nth_and_bit - find N'th set bit in 2 memory regions
 * @addr1: The 1st address to start the search at
 * @addr2: The 2nd address to start the search at
 * @size: The maximum number of bits to search
 * @n: The number of set bit, which position is needed, counting from 0
 *
 * Only bits set in both regions are counted.  Returns @size right away when
 * @n >= @size or, on the inline single-word path, when no common bit below
 * @size is set.  Otherwise the result comes from fns() or
 * __find_nth_and_bit().
 *
 * Returns the bit number of the N'th set bit.
 */
static inline
unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                unsigned long size, unsigned long n)
{
        unsigned long word;

        if (n >= size)
                return size;

        if (!small_const_nbits(size))
                return __find_nth_and_bit(addr1, addr2, size, n);

        /* Bits set in both words, below size. */
        word = *addr1 & *addr2 & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return fns(word, n);
}

/**
 * find_nth_andnot_bit - find N'th set bit in 2 memory regions,
 *                         flipping bits in 2nd region
 * @addr1: The 1st address to start the search at
 * @addr2: The 2nd address to start the search at
 * @size: The maximum number of bits to search
 * @n: The number of set bit, which position is needed, counting from 0
 *
 * Only bits set in *@addr1 and clear in *@addr2 are counted.  Returns @size
 * right away when @n >= @size or, on the inline single-word path, when no
 * such bit below @size exists.  Otherwise the result comes from fns() or
 * __find_nth_andnot_bit().
 *
 * Returns the bit number of the N'th set bit.
 */
static inline
unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                unsigned long size, unsigned long n)
{
        unsigned long word;

        if (n >= size)
                return size;

        if (!small_const_nbits(size))
                return __find_nth_andnot_bit(addr1, addr2, size, n);

        /* Bits of *addr1 not masked out by *addr2, below size. */
        word = *addr1 & (~*addr2) & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return fns(word, n);
}

/**
 * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions,
 *                             excluding those set in 3rd region
 * @addr1: The 1st address to start the search at
 * @addr2: The 2nd address to start the search at
 * @addr3: The 3rd address to start the search at
 * @size: The maximum number of bits to search
 * @n: Zero-based ordinal of the set bit whose position is wanted
 *
 * Only bits set in @addr1 and @addr2 but clear in @addr3 are counted.
 *
 * Returns the bit number of the N'th set bit.
 * If no such, returns @size.
 */
static __always_inline
unsigned long find_nth_and_andnot_bit(const unsigned long *addr1,
                                        const unsigned long *addr2,
                                        const unsigned long *addr3,
                                        unsigned long size, unsigned long n)
{
        unsigned long word;

        /* @n counts from 0, so @size bits can never hold an n'th set bit. */
        if (n >= size)
                return size;

        /* Anything outside the small_const_nbits() case is delegated. */
        if (!small_const_nbits(size))
                return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n);

        /* Single word: (@addr1 & @addr2) minus @addr3, low @size bits only. */
        word = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return fns(word, n);
}

#ifndef find_first_and_bit
/**
 * find_first_and_bit - find the first bit set in both memory regions
 * @addr1: The first address to base the search on
 * @addr2: The second address to base the search on
 * @size: The bitmap size in bits
 *
 * Returns the bit number of the first bit set in both regions.
 * If no bits are set, returns @size.
 */
static inline
unsigned long find_first_and_bit(const unsigned long *addr1,
                                 const unsigned long *addr2,
                                 unsigned long size)
{
        unsigned long word;

        /* Anything outside the small_const_nbits() case is delegated. */
        if (!small_const_nbits(size))
                return _find_first_and_bit(addr1, addr2, size);

        /* Single word: intersect both regions, limited to the low @size bits. */
        word = *addr1 & *addr2 & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

/**
 * find_first_and_and_bit - find the first bit set in all of 3 memory regions
 * @addr1: The first address to base the search on
 * @addr2: The second address to base the search on
 * @addr3: The third address to base the search on
 * @size: The bitmap size in bits
 *
 * Returns the bit number of the first bit set in all three regions.
 * If no bits are set, returns @size.
 */
static inline
unsigned long find_first_and_and_bit(const unsigned long *addr1,
                                     const unsigned long *addr2,
                                     const unsigned long *addr3,
                                     unsigned long size)
{
        unsigned long word;

        /* Anything outside the small_const_nbits() case is delegated. */
        if (!small_const_nbits(size))
                return _find_first_and_and_bit(addr1, addr2, addr3, size);

        /* Single word: intersect all regions, limited to the low @size bits. */
        word = *addr1 & *addr2 & *addr3 & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __ffs(word);
}

#ifndef find_first_zero_bit
/**
 * find_first_zero_bit - find the first cleared bit in a memory region
 * @addr: The address to start the search at
 * @size: The maximum number of bits to search
 *
 * Returns the bit number of the first cleared bit.
 * If no bits are zero, returns @size.
 */
static inline
unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long word;

        /* Anything outside the small_const_nbits() case is delegated. */
        if (!small_const_nbits(size))
                return _find_first_zero_bit(addr, size);

        /* Force bits at @size and above to 1 so they never look "clear". */
        word = *addr | ~GENMASK(size - 1, 0);
        if (word == ~0UL)
                return size;

        return ffz(word);
}
#endif

#ifndef find_last_bit
/**
 * find_last_bit - find the last set bit in a memory region
 * @addr: The address to start the search at
 * @size: The number of bits to search
 *
 * Returns the bit number of the last set bit, or @size if none is set.
 */
static inline
unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long word;

        /* Anything outside the small_const_nbits() case is delegated. */
        if (!small_const_nbits(size))
                return _find_last_bit(addr, size);

        /* Single word: drop every bit at position @size and above. */
        word = *addr & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __fls(word);
}
#endif

/**
 * find_next_and_bit_wrap - find the next set bit in both memory regions,
 *                          wrapping around the end of the bitmap
 * @addr1: The first address to base the search on
 * @addr2: The second address to base the search on
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Searches [@offset, @size) first and, if nothing is found there,
 * [0, @offset).
 *
 * Returns the bit number for the next set bit, or first set bit up to @offset
 * If no bits are set, returns @size.
 */
static inline
unsigned long find_next_and_bit_wrap(const unsigned long *addr1,
                                        const unsigned long *addr2,
                                        unsigned long size, unsigned long offset)
{
        unsigned long hit = find_next_and_bit(addr1, addr2, size, offset);

        /* Only wrap when the tail was empty and there is a head to search. */
        if (hit >= size && offset != 0) {
                hit = find_first_and_bit(addr1, addr2, offset);
                if (hit >= offset)
                        hit = size;
        }

        return hit;
}

/**
 * find_next_bit_wrap - find the next set bit in a memory region,
 *                      wrapping around the end of the bitmap
 * @addr: The address to base the search on
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Searches [@offset, @size) first and, if nothing is found there,
 * [0, @offset).
 *
 * Returns the bit number for the next set bit, or first set bit up to @offset
 * If no bits are set, returns @size.
 */
static inline
unsigned long find_next_bit_wrap(const unsigned long *addr,
                                        unsigned long size, unsigned long offset)
{
        unsigned long hit = find_next_bit(addr, size, offset);

        /* Only wrap when the tail was empty and there is a head to search. */
        if (hit >= size && offset != 0) {
                hit = find_first_bit(addr, offset);
                if (hit >= offset)
                        hit = size;
        }

        return hit;
}

/*
 * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing
 * before using it alone.
 *
 * @start is where the traversal began and @n is the position to resume
 * from. Returns the next set bit, or @size once the walk is finished.
 */
static inline
unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size,
                                 unsigned long start, unsigned long n)
{
        unsigned long pos;

        if (n > start) {
                /* Still in the [start, size) part: take a hit if there is one. */
                pos = find_next_bit(bitmap, size, n);
                if (pos < size)
                        return pos;

                /* Tail exhausted, restart from the beginning of the bitmap. */
                n = 0;
        }

        /* Remaining part is [n, start); reaching @start ends the walk. */
        pos = find_next_bit(bitmap, start, n);
        if (pos >= start)
                return size;

        return pos;
}

/**
 * find_next_clump8 - find next 8-bit clump with set bits in a memory region
 * @clump: location to store copy of found clump
 * @addr: address to base the search on
 * @size: bitmap size in number of bits
 * @offset: bit offset at which to start searching
 *
 * Returns the bit offset for the next set clump; the found clump value is
 * copied to the location pointed by @clump. If no bits are set, returns @size.
 */
extern unsigned long find_next_clump8(unsigned long *clump,
                                      const unsigned long *addr,
                                      unsigned long size, unsigned long offset);

/* Same as find_next_clump8(), with the search starting at bit offset 0. */
#define find_first_clump8(clump, bits, size) \
        find_next_clump8((clump), (bits), (size), 0)

#if defined(__LITTLE_ENDIAN)

/*
 * Little-endian build: the _le variant forwards its arguments unchanged to
 * find_next_zero_bit() and returns that result.
 */
static inline unsigned long find_next_zero_bit_le(const void *addr,
                unsigned long size, unsigned long offset)
{
        return find_next_zero_bit(addr, size, offset);
}

/*
 * Little-endian build: the _le variant forwards its arguments unchanged to
 * find_next_bit() and returns that result.
 */
static inline unsigned long find_next_bit_le(const void *addr,
                unsigned long size, unsigned long offset)
{
        return find_next_bit(addr, size, offset);
}

/*
 * Little-endian build: the _le variant forwards its arguments unchanged to
 * find_first_zero_bit() and returns that result.
 */
static inline unsigned long find_first_zero_bit_le(const void *addr,
                unsigned long size)
{
        return find_first_zero_bit(addr, size);
}

#elif defined(__BIG_ENDIAN)

#ifndef find_next_zero_bit_le
/*
 * Big-endian build: find the next cleared bit at or after @offset in a
 * bitmap of @size bits, byte-swapping the word with swab() before it is
 * examined. Returns @size when no cleared bit is found.
 */
static inline
unsigned long find_next_zero_bit_le(const void *addr, unsigned
                long size, unsigned long offset)
{
        unsigned long word;

        /* Anything outside the small_const_nbits() case is delegated. */
        if (!small_const_nbits(size))
                return _find_next_zero_bit_le(addr, size, offset);

        word = *(const unsigned long *)addr;

        if (unlikely(offset >= size))
                return size;

        /* Force every bit outside [offset, size) to 1 so it is ignored. */
        word = swab(word) | ~GENMASK(size - 1, offset);
        if (word == ~0UL)
                return size;

        return ffz(word);
}
#endif

#ifndef find_first_zero_bit_le
/*
 * Big-endian build: find the first cleared bit in a bitmap of @size bits,
 * byte-swapping the word with swab() before it is examined. Returns @size
 * when no cleared bit is found.
 */
static inline
unsigned long find_first_zero_bit_le(const void *addr, unsigned long size)
{
        unsigned long word;

        /* Anything outside the small_const_nbits() case is delegated. */
        if (!small_const_nbits(size))
                return _find_first_zero_bit_le(addr, size);

        /* Force bits at @size and above to 1 so they never look "clear". */
        word = swab(*(const unsigned long *)addr) | ~GENMASK(size - 1, 0);
        if (word == ~0UL)
                return size;

        return ffz(word);
}
#endif

#ifndef find_next_bit_le
/*
 * Big-endian build: find the next set bit at or after @offset in a bitmap
 * of @size bits, byte-swapping the word with swab() before it is examined.
 * Returns @size when no set bit is found.
 */
static inline
unsigned long find_next_bit_le(const void *addr, unsigned
                long size, unsigned long offset)
{
        unsigned long word;

        /* Anything outside the small_const_nbits() case is delegated. */
        if (!small_const_nbits(size))
                return _find_next_bit_le(addr, size, offset);

        word = *(const unsigned long *)addr;

        if (unlikely(offset >= size))
                return size;

        /* Keep only the bits inside [offset, size). */
        word = swab(word) & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#else
#error "Please fix <asm/byteorder.h>"
#endif

/*
 * Iterator macros. Each one re-runs the matching find_next_*() search in the
 * loop condition, stores the result in @bit, and stops as soon as that
 * result is >= @size. @bit must be an lvalue; every argument is evaluated
 * more than once.
 */

/* Visit every position in [0, @size) reported by find_next_bit(). */
#define for_each_set_bit(bit, addr, size) \
        for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)

/* Visit every position in [0, @size) reported by find_next_and_bit(). */
#define for_each_and_bit(bit, addr1, addr2, size) \
        for ((bit) = 0;                                                                        \
             (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
             (bit)++)

/* Visit every position in [0, @size) reported by find_next_andnot_bit(). */
#define for_each_andnot_bit(bit, addr1, addr2, size) \
        for ((bit) = 0;                                                                        \
             (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
             (bit)++)

/* Visit every position in [0, @size) reported by find_next_or_bit(). */
#define for_each_or_bit(bit, addr1, addr2, size) \
        for ((bit) = 0;                                                                        \
             (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
             (bit)++)

/* same as for_each_set_bit() but use bit as value to start with */
#define for_each_set_bit_from(bit, addr, size) \
        for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)

/* Visit every position in [0, @size) reported by find_next_zero_bit(). */
#define for_each_clear_bit(bit, addr, size) \
        for ((bit) = 0;                                                                        \
             (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size);                \
             (bit)++)

/* same as for_each_clear_bit() but use bit as value to start with */
#define for_each_clear_bit_from(bit, addr, size) \
        for (; (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); (bit)++)

/**
 * for_each_set_bitrange - iterate over all set bit ranges [b; e)
 * @b: bit offset of start of current bitrange (first set bit)
 * @e: bit offset of end of current bitrange (first unset bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * @b and @e must be lvalues; every argument is evaluated more than once.
 * The walk ends when the search for the next set bit yields >= @size.
 * Since bit @e was just reported by find_next_zero_bit(), the next search
 * resumes at @e + 1.
 */
#define for_each_set_bitrange(b, e, addr, size)                        \
        for ((b) = 0;                                                \
             (b) = find_next_bit((addr), (size), (b)),                \
             (e) = find_next_zero_bit((addr), (size), (b) + 1),        \
             (b) < (size);                                        \
             (b) = (e) + 1)

/**
 * for_each_set_bitrange_from - iterate over all set bit ranges [b; e)
 * @b: bit offset of start of current bitrange (first set bit); must be initialized
 * @e: bit offset of end of current bitrange (first unset bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * Same loop as for_each_set_bitrange(), except that the search starts at the
 * caller-supplied value of @b instead of 0. @b and @e must be lvalues; every
 * argument is evaluated more than once.
 */
#define for_each_set_bitrange_from(b, e, addr, size)                \
        for (;                                                        \
             (b) = find_next_bit((addr), (size), (b)),                \
             (e) = find_next_zero_bit((addr), (size), (b) + 1),        \
             (b) < (size);                                        \
             (b) = (e) + 1)

/**
 * for_each_clear_bitrange - iterate over all unset bit ranges [b; e)
 * @b: bit offset of start of current bitrange (first unset bit)
 * @e: bit offset of end of current bitrange (first set bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * @b and @e must be lvalues; every argument is evaluated more than once.
 * The walk ends when the search for the next unset bit yields >= @size.
 */
#define for_each_clear_bitrange(b, e, addr, size)                \
        for ((b) = 0;                                                \
             (b) = find_next_zero_bit((addr), (size), (b)),        \
             (e) = find_next_bit((addr), (size), (b) + 1),        \
             (b) < (size);                                        \
             (b) = (e) + 1)

/**
 * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e)
 * @b: bit offset of start of current bitrange (first unset bit); must be initialized
 * @e: bit offset of end of current bitrange (first set bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * Same loop as for_each_clear_bitrange(), except that the search starts at
 * the caller-supplied value of @b instead of 0. @b and @e must be lvalues;
 * every argument is evaluated more than once.
 */
#define for_each_clear_bitrange_from(b, e, addr, size)                \
        for (;                                                        \
             (b) = find_next_zero_bit((addr), (size), (b)),        \
             (e) = find_next_bit((addr), (size), (b) + 1),        \
             (b) < (size);                                        \
             (b) = (e) + 1)

/**
 * for_each_set_bit_wrap - iterate over all set bits starting from @start, and
 * wrapping around the end of bitmap.
 * @bit: offset for current iteration
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 * @start: Starting bit for bitmap traversing, wrapping around the bitmap end
 *
 * The first position comes from find_next_bit_wrap(); each following one
 * from __for_each_wrap(), which returns @size to end the loop. @bit must be
 * an lvalue; every argument is evaluated more than once.
 */
#define for_each_set_bit_wrap(bit, addr, size, start) \
        for ((bit) = find_next_bit_wrap((addr), (size), (start));                \
             (bit) < (size);                                                        \
             (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1))

/**
 * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
 * @start: bit offset to start search and to store the current iteration offset
 * @clump: location to store copy of current 8-bit clump
 * @bits: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * @clump is passed by name and its address is taken, so it must be an
 * lvalue. Each step resumes the search 8 bits after the previous offset.
 */
#define for_each_set_clump8(start, clump, bits, size) \
        for ((start) = find_first_clump8(&(clump), (bits), (size)); \
             (start) < (size); \
             (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8))

#endif /*__LINUX_FIND_H_ */





























































    2 


































































































    1 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIME64_H
#define _LINUX_TIME64_H

#include <linux/math64.h>
#include <vdso/time64.h>

/* 64-bit second counters: time64_t is signed, timeu64_t is unsigned. */
typedef __s64 time64_t;
typedef __u64 timeu64_t;

#include <uapi/linux/time.h>

/*
 * A point in time or a duration, split into whole seconds and a nanosecond
 * remainder. timespec64_valid() treats tv_nsec values outside
 * [0, NSEC_PER_SEC) as denormalized.
 */
struct timespec64 {
        time64_t        tv_sec;                        /* seconds */
        long                tv_nsec;                /* nanoseconds */
};

/*
 * Pair of timespec64 values describing a timer.
 * NOTE(review): presumably it_interval is the reload period and it_value the
 * time until next expiry, mirroring POSIX struct itimerspec -- confirm.
 */
struct itimerspec64 {
        struct timespec64 it_interval;
        struct timespec64 it_value;
};

/* Parameters used to convert the timespec values: */
#define PSEC_PER_NSEC                        1000L

/* Located here for timespec[64]_valid_strict */
/* All bits set except bit 63, i.e. the largest positive s64 value. */
#define TIME64_MAX                        ((s64)~((u64)1 << 63))
#define TIME64_MIN                        (-TIME64_MAX - 1)

/* Same s64 range as TIME64_MAX/TIME64_MIN, named for ktime_t limits. */
#define KTIME_MAX                        ((s64)~((u64)1 << 63))
#define KTIME_MIN                        (-KTIME_MAX - 1)
/* The ktime limits divided by NSEC_PER_SEC, i.e. expressed in seconds. */
#define KTIME_SEC_MAX                        (KTIME_MAX / NSEC_PER_SEC)
#define KTIME_SEC_MIN                        (KTIME_MIN / NSEC_PER_SEC)

/*
 * Limits for settimeofday():
 *
 * To prevent setting the time close to the wraparound point, time setting
 * is limited so a reasonable uptime can be accommodated. Uptime of 30 years
 * should be really sufficient, which means the cutoff is 2232. At that
 * point the cutoff is just a small part of the larger problem.
 */
/* 30 years of 365 days, in seconds. */
#define TIME_UPTIME_SEC_MAX                (30LL * 365 * 24 *3600)
/* Upper bound on tv_sec enforced by timespec64_valid_settod(). */
#define TIME_SETTOD_SEC_MAX                (KTIME_SEC_MAX - TIME_UPTIME_SEC_MAX)

/*
 * Returns non-zero when @a and @b hold the same tv_sec and the same tv_nsec,
 * zero otherwise. The fields are compared as stored; nothing is normalized.
 */
static inline int timespec64_equal(const struct timespec64 *a,
                                   const struct timespec64 *b)
{
        if (a->tv_sec != b->tv_sec)
                return 0;

        return a->tv_nsec == b->tv_nsec;
}

/*
 * Three-way comparison of two timespec64 values, seconds first and
 * nanoseconds as the tie-breaker:
 *
 * lhs < rhs:  return <0
 * lhs == rhs: return 0
 * lhs > rhs:  return >0
 */
static inline int timespec64_compare(const struct timespec64 *lhs, const struct timespec64 *rhs)
{
        if (lhs->tv_sec < rhs->tv_sec)
                return -1;
        if (lhs->tv_sec > rhs->tv_sec)
                return 1;

        /*
         * Compare instead of returning the difference: tv_nsec is a long, so
         * narrowing (lhs - rhs) to int could flip the sign or yield 0 for
         * unequal, denormalized values, and the subtraction itself could
         * overflow.
         */
        if (lhs->tv_nsec < rhs->tv_nsec)
                return -1;
        if (lhs->tv_nsec > rhs->tv_nsec)
                return 1;
        return 0;
}

/*
 * Stores a timespec64 built from @sec and @nsec into @ts.
 * NOTE(review): defined elsewhere; presumably it carries out-of-range @nsec
 * into the seconds field so the result is normalized -- confirm.
 */
extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec);

static inline struct timespec64 timespec64_add(struct timespec64 lhs,
                                                struct timespec64 rhs)
{
        struct timespec64 ts_delta;
        set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec,
                                lhs.tv_nsec + rhs.tv_nsec);
        return ts_delta;
}

/*
 * sub = lhs - rhs, in normalized form
 */
static inline struct timespec64 timespec64_sub(struct timespec64 lhs,
                                                struct timespec64 rhs)
{
        struct timespec64 ts_delta;
        set_normalized_timespec64(&ts_delta, lhs.tv_sec - rhs.tv_sec,
                                lhs.tv_nsec - rhs.tv_nsec);
        return ts_delta;
}

/*
 * Returns true if the timespec64 is norm, false if denorm:
 *
 *  - dates before 1970 (negative tv_sec) are rejected;
 *  - tv_nsec must be less than a full second. The unsigned cast makes a
 *    negative tv_nsec fail the same comparison.
 */
static inline bool timespec64_valid(const struct timespec64 *ts)
{
        return ts->tv_sec >= 0 &&
               (unsigned long)ts->tv_nsec < NSEC_PER_SEC;
}

/*
 * Like timespec64_valid(), and additionally rejects tv_sec values at or
 * above KTIME_SEC_MAX, which could overflow ktime_t.
 */
static inline bool timespec64_valid_strict(const struct timespec64 *ts)
{
        return timespec64_valid(ts) &&
               (unsigned long long)ts->tv_sec < KTIME_SEC_MAX;
}

/*
 * Like timespec64_valid(), and additionally rejects tv_sec values at or
 * above TIME_SETTOD_SEC_MAX, which cause overflow issues vs. CLOCK_REALTIME.
 */
static inline bool timespec64_valid_settod(const struct timespec64 *ts)
{
        return timespec64_valid(ts) &&
               (unsigned long long)ts->tv_sec < TIME_SETTOD_SEC_MAX;
}

/**
 * timespec64_to_ns - Convert timespec64 to nanoseconds
 * @ts:                pointer to the timespec64 variable to be converted
 *
 * Returns the scalar nanosecond representation of the timespec64
 * parameter.
 */
static inline s64 timespec64_to_ns(const struct timespec64 *ts)
{
        /* Prevent multiplication overflow / underflow */
        if (ts->tv_sec >= KTIME_SEC_MAX)
                return KTIME_MAX;

        if (ts->tv_sec <= KTIME_SEC_MIN)
                return KTIME_MIN;

        return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
}

/**
 * ns_to_timespec64 - Convert nanoseconds to timespec64
 * @nsec:        the nanoseconds value to be converted (signed)
 *
 * Returns the timespec64 representation of the nsec parameter.
 * The implementation lives outside this header.
 */
extern struct timespec64 ns_to_timespec64(s64 nsec);

/**
 * timespec64_add_ns - Adds nanoseconds to a timespec64
 * @a:                pointer to timespec64 to be incremented
 * @ns:                unsigned nanoseconds value to be added
 *
 * This must always be inlined because it's used from the x86-64 vdso,
 * which cannot call other kernel functions.
 */
static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
{
        /*
         * @ns doubles as the out-parameter of the division helper: its
         * return value is added to tv_sec, and whatever it stores through
         * &ns becomes the new tv_nsec on the next line.
         * NOTE(review): presumably the helper returns the quotient of
         * (tv_nsec + ns) / NSEC_PER_SEC and stores the remainder -- confirm
         * against its definition in math64.h.
         */
        a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns);
        a->tv_nsec = ns;
}

/*
 * timespec64_add_safe assumes both values are positive and checks for
 * overflow. It will return TIME64_MAX in case of overflow.
 * The implementation lives outside this header.
 */
extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
                                         const struct timespec64 rhs);

#endif /* _LINUX_TIME64_H */





















































    1 




    1 
    1 























    1 


    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
/* SPDX-License-Identifier: GPL-2.0-only */
/* A pointer that can point to either kernel or userspace memory. */
#ifndef _LINUX_BPFPTR_H
#define _LINUX_BPFPTR_H

#include <linux/mm.h>
#include <linux/sockptr.h>

/* bpfptr_t is sockptr_t under another name; the helpers below wrap it. */
typedef sockptr_t bpfptr_t;

/* Returns the is_kernel flag stored in @bpfptr. */
static inline bool bpfptr_is_kernel(bpfptr_t bpfptr)
{
        return bpfptr.is_kernel;
}

static inline bpfptr_t KERNEL_BPFPTR(void *p)
{
        return (bpfptr_t) { .kernel = p, .is_kernel = true };
}

/*
 * Wrap userspace pointer @p in a bpfptr_t. Only .user is initialized
 * explicitly, so is_kernel is left zero-initialized (false).
 */
static inline bpfptr_t USER_BPFPTR(void __user *p)
{
        bpfptr_t ptr = { .user = p };

        return ptr;
}

/*
 * Build a bpfptr_t from the raw 64-bit address @addr, treating it as a
 * kernel pointer when @is_kernel is true and as a user pointer otherwise.
 */
static inline bpfptr_t make_bpfptr(u64 addr, bool is_kernel)
{
        if (!is_kernel)
                return USER_BPFPTR(u64_to_user_ptr(addr));

        return KERNEL_BPFPTR((void *)(uintptr_t)addr);
}

/* True when the pointer member selected by the is_kernel flag is NULL. */
static inline bool bpfptr_is_null(bpfptr_t bpfptr)
{
        return bpfptr_is_kernel(bpfptr) ? !bpfptr.kernel : !bpfptr.user;
}

/*
 * Advance *@bpfptr in place by @val, updating the pointer member selected
 * by its is_kernel flag.
 */
static inline void bpfptr_add(bpfptr_t *bpfptr, size_t val)
{
        if (!bpfptr_is_kernel(*bpfptr)) {
                bpfptr->user += val;
                return;
        }

        bpfptr->kernel += val;
}

/*
 * Copy @size bytes from @src + @offset into @dst. Kernel pointers go
 * through copy_from_kernel_nofault(), user pointers through
 * copy_from_user(); the helper's return value is passed back unchanged.
 */
static inline int copy_from_bpfptr_offset(void *dst, bpfptr_t src,
                                          size_t offset, size_t size)
{
        if (bpfptr_is_kernel(src))
                return copy_from_kernel_nofault(dst, src.kernel + offset, size);

        return copy_from_user(dst, src.user + offset, size);
}

/* copy_from_bpfptr_offset() with an offset of 0. */
static inline int copy_from_bpfptr(void *dst, bpfptr_t src, size_t size)
{
        return copy_from_bpfptr_offset(dst, src, 0, size);
}

/*
 * Copy @size bytes from @src to @dst + @offset by forwarding to
 * copy_to_sockptr_offset(); its return value is passed back unchanged.
 */
static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset,
                                        const void *src, size_t size)
{
        return copy_to_sockptr_offset((sockptr_t) dst, offset, src, size);
}

/*
 * Allocate @len bytes with kvmalloc_noprof() and fill them from @src.
 *
 * Returns the new buffer, ERR_PTR(-ENOMEM) if the allocation failed, or
 * ERR_PTR(-EFAULT) if the copy failed (the buffer is freed in that case).
 */
static inline void *kvmemdup_bpfptr_noprof(bpfptr_t src, size_t len)
{
        void *buf = kvmalloc_noprof(len, GFP_USER | __GFP_NOWARN);

        if (!buf)
                return ERR_PTR(-ENOMEM);

        if (!copy_from_bpfptr(buf, src, len))
                return buf;

        /* Copy failed: do not hand out a partially filled buffer. */
        kvfree(buf);
        return ERR_PTR(-EFAULT);
}
#define kvmemdup_bpfptr(...)        alloc_hooks(kvmemdup_bpfptr_noprof(__VA_ARGS__))

/*
 * Copy a string of at most @count bytes from @src into @dst, using
 * strncpy_from_user() for user pointers and strncpy_from_kernel_nofault()
 * for kernel pointers; the helper's return value is passed back unchanged.
 */
static inline long strncpy_from_bpfptr(char *dst, bpfptr_t src, size_t count)
{
        if (!bpfptr_is_kernel(src))
                return strncpy_from_user(dst, src.user, count);

        return strncpy_from_kernel_nofault(dst, src.kernel, count);
}

#endif /* _LINUX_BPFPTR_H */




























































































    1 
    1 











































































    1 







































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
/* Netfilter messages via netlink socket. Allows for user space
 * protocol helpers and general trouble making from userspace.
 *
 * (C) 2001 by Jay Schulist <jschlst@samba.org>,
 * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org>
 * (C) 2005-2017 by Pablo Neira Ayuso <pablo@netfilter.org>
 *
 * Initial netfilter messages via netlink development funded and
 * generally made possible by Network Robots, Inc. (www.networkrobots.com)
 *
 * Further development of this code funded by Astaro AG (http://www.astaro.com)
 *
 * This software may be used and distributed according to the terms
 * of the GNU General Public License, incorporated herein by reference.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/uaccess.h>
#include <net/sock.h>
#include <linux/init.h>
#include <linux/sched/signal.h>

#include <net/netlink.h>
#include <net/netns/generic.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>

/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
MODULE_DESCRIPTION("Netfilter messages via netlink socket");

/*
 * Fetch table[id].subsys on the update side, telling lockdep that the
 * subsystem mutex for @id is expected to be held.
 */
#define nfnl_dereference_protected(id) \
        rcu_dereference_protected(table[(id)].subsys, \
                                  lockdep_nfnl_is_held((id)))

/*
 * Largest attr_count a callback may declare; enforced by
 * nfnetlink_subsys_register() to avoid a stack buffer overflow.
 */
#define NFNL_MAX_ATTR_COUNT        32

/* Id passed to net_generic() by nfnl_pernet() to find struct nfnl_net. */
static unsigned int nfnetlink_pernet_id __read_mostly;

#ifdef CONFIG_NF_CONNTRACK_EVENTS
/* NOTE(review): users are outside this excerpt; only built with conntrack events. */
static DEFINE_SPINLOCK(nfnl_grp_active_lock);
#endif

/* Per-network-namespace state, looked up through nfnl_pernet(). */
struct nfnl_net {
        /* Socket handed to netlink_has_listeners() and nlmsg_notify(). */
        struct sock *nfnl;
};

/*
 * One slot per subsystem id. @mutex is taken by nfnl_lock()/nfnl_unlock();
 * @subsys is published with rcu_assign_pointer() on registration and read
 * with rcu_dereference() in nfnetlink_get_subsys().
 */
static struct {
        struct mutex                                mutex;
        const struct nfnetlink_subsystem __rcu        *subsys;
} table[NFNL_SUBSYS_COUNT];

/*
 * Lockdep key and name arrays, both indexed by subsystem id.
 * NOTE(review): presumably attached to table[].mutex during init, which is
 * outside this excerpt -- confirm.
 */
static struct lock_class_key nfnl_lockdep_keys[NFNL_SUBSYS_COUNT];

static const char *const nfnl_lockdep_names[NFNL_SUBSYS_COUNT] = {
        [NFNL_SUBSYS_NONE] = "nfnl_subsys_none",
        [NFNL_SUBSYS_CTNETLINK] = "nfnl_subsys_ctnetlink",
        [NFNL_SUBSYS_CTNETLINK_EXP] = "nfnl_subsys_ctnetlink_exp",
        [NFNL_SUBSYS_QUEUE] = "nfnl_subsys_queue",
        [NFNL_SUBSYS_ULOG] = "nfnl_subsys_ulog",
        [NFNL_SUBSYS_OSF] = "nfnl_subsys_osf",
        [NFNL_SUBSYS_IPSET] = "nfnl_subsys_ipset",
        [NFNL_SUBSYS_ACCT] = "nfnl_subsys_acct",
        [NFNL_SUBSYS_CTNETLINK_TIMEOUT] = "nfnl_subsys_cttimeout",
        [NFNL_SUBSYS_CTHELPER] = "nfnl_subsys_cthelper",
        [NFNL_SUBSYS_NFTABLES] = "nfnl_subsys_nftables",
        [NFNL_SUBSYS_NFT_COMPAT] = "nfnl_subsys_nftcompat",
        [NFNL_SUBSYS_HOOK] = "nfnl_subsys_hook",
};

/*
 * Maps a multicast group number (NFNLGRP_*) to a subsystem id
 * (NFNL_SUBSYS_*). Groups not listed here are zero-initialized.
 */
static const int nfnl_group2type[NFNLGRP_MAX+1] = {
        [NFNLGRP_CONNTRACK_NEW]                = NFNL_SUBSYS_CTNETLINK,
        [NFNLGRP_CONNTRACK_UPDATE]        = NFNL_SUBSYS_CTNETLINK,
        [NFNLGRP_CONNTRACK_DESTROY]        = NFNL_SUBSYS_CTNETLINK,
        [NFNLGRP_CONNTRACK_EXP_NEW]        = NFNL_SUBSYS_CTNETLINK_EXP,
        [NFNLGRP_CONNTRACK_EXP_UPDATE]        = NFNL_SUBSYS_CTNETLINK_EXP,
        [NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP,
        [NFNLGRP_NFTABLES]                = NFNL_SUBSYS_NFTABLES,
        [NFNLGRP_ACCT_QUOTA]                = NFNL_SUBSYS_ACCT,
        [NFNLGRP_NFTRACE]                = NFNL_SUBSYS_NFTABLES,
};

/* Returns what net_generic() yields for @net and nfnetlink_pernet_id. */
static struct nfnl_net *nfnl_pernet(struct net *net)
{
        return net_generic(net, nfnetlink_pernet_id);
}

/*
 * Take the mutex of subsystem @subsys_id. @subsys_id indexes table[]
 * directly; no range check is done here.
 */
void nfnl_lock(__u8 subsys_id)
{
        mutex_lock(&table[subsys_id].mutex);
}
EXPORT_SYMBOL_GPL(nfnl_lock);

/*
 * Release the mutex of subsystem @subsys_id. @subsys_id indexes table[]
 * directly; no range check is done here.
 */
void nfnl_unlock(__u8 subsys_id)
{
        mutex_unlock(&table[subsys_id].mutex);
}
EXPORT_SYMBOL_GPL(nfnl_unlock);

#ifdef CONFIG_PROVE_LOCKING
/*
 * Lockdep query on the mutex of subsystem @subsys_id; only built when
 * CONFIG_PROVE_LOCKING is enabled.
 */
bool lockdep_nfnl_is_held(u8 subsys_id)
{
        return lockdep_is_held(&table[subsys_id].mutex);
}
EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held);
#endif

int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
{
        u8 cb_id;

        /* Sanity-check attr_count size to avoid stack buffer overflow. */
        for (cb_id = 0; cb_id < n->cb_count; cb_id++)
                if (WARN_ON(n->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT))
                        return -EINVAL;

        nfnl_lock(n->subsys_id);
        if (table[n->subsys_id].subsys) {
                nfnl_unlock(n->subsys_id);
                return -EBUSY;
        }
        rcu_assign_pointer(table[n->subsys_id].subsys, n);
        nfnl_unlock(n->subsys_id);

        return 0;
}
EXPORT_SYMBOL_GPL(nfnetlink_subsys_register);

int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n)
{
        nfnl_lock(n->subsys_id);
        table[n->subsys_id].subsys = NULL;
        nfnl_unlock(n->subsys_id);
        synchronize_rcu();
        return 0;
}
EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister);

/* Look up the subsystem addressed by netlink message type @type.
 * Uses rcu_dereference(), so the caller must be in an RCU read-side section.
 * Returns NULL when the id is out of range or nothing is registered.
 */
static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u16 type)
{
        const u8 id = NFNL_SUBSYS_ID(type);

        return id < NFNL_SUBSYS_COUNT ? rcu_dereference(table[id].subsys)
                                      : NULL;
}

/* Return the callback slot of @ss selected by the message-type part of @type,
 * or NULL when that index is not below ss->cb_count.
 */
static inline const struct nfnl_callback *
nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss)
{
        const u8 idx = NFNL_MSG_TYPE(type);

        return idx < ss->cb_count ? &ss->cb[idx] : NULL;
}

int nfnetlink_has_listeners(struct net *net, unsigned int group)
{
        struct nfnl_net *nfnlnet = nfnl_pernet(net);

        return netlink_has_listeners(nfnlnet->nfnl, group);
}
EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);

/* Forward to nlmsg_notify() on @net's nfnetlink kernel socket and return
 * its result.
 */
int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
                   unsigned int group, int echo, gfp_t flags)
{
        return nlmsg_notify(nfnl_pernet(net)->nfnl, skb, portid, group, echo,
                            flags);
}
EXPORT_SYMBOL_GPL(nfnetlink_send);

/* Forward to netlink_set_err() on @net's nfnetlink kernel socket and return
 * its result.
 */
int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error)
{
        return netlink_set_err(nfnl_pernet(net)->nfnl, portid, group, error);
}
EXPORT_SYMBOL_GPL(nfnetlink_set_err);

/* Unicast @skb to @portid over @net's nfnetlink kernel socket.
 * Returns what nlmsg_unicast() returns, except that -EAGAIN is reported to
 * the caller as -ENOBUFS.
 */
int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid)
{
        int err = nlmsg_unicast(nfnl_pernet(net)->nfnl, skb, portid);

        return err == -EAGAIN ? -ENOBUFS : err;
}
EXPORT_SYMBOL_GPL(nfnetlink_unicast);

/* Forward to netlink_broadcast() on @net's nfnetlink kernel socket; the
 * result of the broadcast is discarded.
 */
void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid,
                         __u32 group, gfp_t allocation)
{
        netlink_broadcast(nfnl_pernet(net)->nfnl, skb, portid, group,
                          allocation);
}
EXPORT_SYMBOL_GPL(nfnetlink_broadcast);

/* Process one complete nfnetlink message.
 *
 * Resolves the subsystem and callback addressed by nlh->nlmsg_type, parses
 * the attributes that follow struct nfgenmsg and invokes the callback,
 * either inside the RCU read-side section (NFNL_CB_RCU) or under the
 * subsystem mutex (NFNL_CB_MUTEX).
 *
 * Returns 0 for messages too short to hold struct nfgenmsg, -EINVAL for an
 * unknown subsystem/callback, a missing ->call or an unsupported callback
 * type, -ENOMEM if attr_count exceeds NFNL_MAX_ATTR_COUNT, a negative parse
 * error, or otherwise whatever the callback returns. A result of -EAGAIN
 * restarts the whole lookup.
 */
static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        const struct nfnl_callback *nc;
        const struct nfnetlink_subsystem *ss;
        int type, err;

        /* All the messages must at least contain nfgenmsg */
        if (nlmsg_len(nlh) < sizeof(struct nfgenmsg))
                return 0;

        type = nlh->nlmsg_type;
replay:
        /* Re-entered from the bottom whenever err ends up as -EAGAIN. */
        rcu_read_lock();

        ss = nfnetlink_get_subsys(type);
        if (!ss) {
#ifdef CONFIG_MODULES
                /* Not registered: drop the RCU read lock around
                 * request_module(), then look the subsystem up again.
                 * Without CONFIG_MODULES the braces below run
                 * unconditionally.
                 */
                rcu_read_unlock();
                request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type));
                rcu_read_lock();
                ss = nfnetlink_get_subsys(type);
                if (!ss)
#endif
                {
                        rcu_read_unlock();
                        return -EINVAL;
                }
        }

        nc = nfnetlink_find_client(type, ss);
        if (!nc) {
                rcu_read_unlock();
                return -EINVAL;
        }

        {
                /* Attributes start right after the aligned
                 * nlmsghdr + nfgenmsg prefix.
                 */
                int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
                struct nfnl_net *nfnlnet = nfnl_pernet(net);
                u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
                struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
                struct nlattr *attr = (void *)nlh + min_len;
                int attrlen = nlh->nlmsg_len - min_len;
                __u8 subsys_id = NFNL_SUBSYS_ID(type);
                struct nfnl_info info = {
                        .net        = net,
                        .sk        = nfnlnet->nfnl,
                        .nlh        = nlh,
                        .nfmsg        = nlmsg_data(nlh),
                        .extack        = extack,
                };

                /* Sanity-check NFNL_MAX_ATTR_COUNT */
                if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
                        rcu_read_unlock();
                        return -ENOMEM;
                }

                err = nla_parse_deprecated(cda, ss->cb[cb_id].attr_count,
                                           attr, attrlen,
                                           ss->cb[cb_id].policy, extack);
                if (err < 0) {
                        rcu_read_unlock();
                        return err;
                }

                if (!nc->call) {
                        rcu_read_unlock();
                        return -EINVAL;
                }

                switch (nc->type) {
                case NFNL_CB_RCU:
                        /* Callback runs inside the RCU read-side section. */
                        err = nc->call(skb, &info, (const struct nlattr **)cda);
                        rcu_read_unlock();
                        break;
                case NFNL_CB_MUTEX:
                        /* Swap RCU protection for the subsystem mutex, then
                         * re-check that neither the subsystem nor the
                         * callback changed in the unprotected window; if
                         * they did, replay from the top.
                         */
                        rcu_read_unlock();
                        nfnl_lock(subsys_id);
                        if (nfnl_dereference_protected(subsys_id) != ss ||
                            nfnetlink_find_client(type, ss) != nc) {
                                nfnl_unlock(subsys_id);
                                err = -EAGAIN;
                                break;
                        }
                        err = nc->call(skb, &info, (const struct nlattr **)cda);
                        nfnl_unlock(subsys_id);
                        break;
                default:
                        /* Any other callback type is rejected on this
                         * non-batch path.
                         */
                        rcu_read_unlock();
                        err = -EINVAL;
                        break;
                }
                if (err == -EAGAIN)
                        goto replay;
                return err;
        }
}

/* One deferred acknowledgement queued while a batch is processed.
 * @head links the entry into the batch's err_list, @nlh is the message being
 * acknowledged, @err the status passed to netlink_ack() and @extack a copy
 * (by value) of the extended-ack state taken when the entry was queued.
 */
struct nfnl_err {
        struct list_head        head;
        struct nlmsghdr                *nlh;
        int                        err;
        struct netlink_ext_ack        extack;
};

/* Queue a deferred ack for @nlh with status @err at the tail of @list.
 * *@extack is copied by value into the new entry.
 * Returns 0 on success or -ENOMEM if the entry cannot be allocated.
 */
static int nfnl_err_add(struct list_head *list, struct nlmsghdr *nlh, int err,
                        const struct netlink_ext_ack *extack)
{
        struct nfnl_err *entry = kmalloc(sizeof(*entry), GFP_KERNEL);

        if (!entry)
                return -ENOMEM;

        entry->extack = *extack;
        entry->err = err;
        entry->nlh = nlh;
        list_add_tail(&entry->head, list);

        return 0;
}

/* Unlink one queued ack entry from its list and free it. */
static void nfnl_err_del(struct nfnl_err *nfnl_err)
{
        struct list_head *node = &nfnl_err->head;

        list_del(node);
        kfree(nfnl_err);
}

static void nfnl_err_reset(struct list_head *err_list)
{
        struct nfnl_err *nfnl_err, *next;

        list_for_each_entry_safe(nfnl_err, next, err_list, head)
                nfnl_err_del(nfnl_err);
}

/* Send every queued ack on @err_list through netlink_ack() using @skb,
 * in queue order, freeing each entry once it has been sent.
 */
static void nfnl_err_deliver(struct list_head *err_list, struct sk_buff *skb)
{
        struct nfnl_err *cur, *tmp;

        list_for_each_entry_safe(cur, tmp, err_list, head) {
                netlink_ack(skb, cur->nlh, cur->err, &cur->extack);
                nfnl_err_del(cur);
        }
}

/* Status bits accumulated by nfnetlink_rcv_batch() while walking a batch:
 * FAILURE - a message was malformed or failed, so the batch is aborted;
 * DONE    - NFNL_MSG_BATCH_END was reached;
 * REPLAY  - a callback or ->commit() returned -EAGAIN, restart from scratch.
 */
enum {
        NFNL_BATCH_FAILURE        = (1 << 0),
        NFNL_BATCH_DONE                = (1 << 1),
        NFNL_BATCH_REPLAY        = (1 << 2),
};

/* Process the body of an NFNL_MSG_BATCH_BEGIN ... NFNL_MSG_BATCH_END batch.
 *
 * @skb:       buffer holding the batch; nfnetlink_rcv_skb_batch() has already
 *             pulled the batch-begin message off the front
 * @nlh:       header of the batch-begin message, used for early errors and
 *             for its own ACK
 * @subsys_id: subsystem every message in the batch must address
 * @genid:     generation id from userspace, validated by ->valid_genid()
 *
 * Works on a clone of @skb so that the original can be replayed. The
 * subsystem must provide ->valid_genid, ->commit and ->abort, and its module
 * is pinned for the duration. Per-message results are queued on err_list and
 * only delivered once the batch has been committed or aborted, so a replay
 * does not report the same error twice. A callback or ->commit() returning
 * -EAGAIN restarts the batch; ->abort() returning -EAGAIN re-runs it with
 * the failure flag set.
 */
static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
                                u16 subsys_id, u32 genid)
{
        struct sk_buff *oskb = skb;
        struct net *net = sock_net(skb->sk);
        const struct nfnetlink_subsystem *ss;
        const struct nfnl_callback *nc;
        struct netlink_ext_ack extack;
        LIST_HEAD(err_list);
        u32 status;
        int err;

        if (subsys_id >= NFNL_SUBSYS_COUNT)
                return netlink_ack(skb, nlh, -EINVAL, NULL);
replay:
        status = 0;
replay_abort:
        /* Consume a clone; oskb stays intact for replay and for acks. */
        skb = netlink_skb_clone(oskb, GFP_KERNEL);
        if (!skb)
                return netlink_ack(oskb, nlh, -ENOMEM, NULL);

        nfnl_lock(subsys_id);
        ss = nfnl_dereference_protected(subsys_id);
        if (!ss) {
#ifdef CONFIG_MODULES
                /* Try to autoload the subsystem with the mutex dropped.
                 * Without CONFIG_MODULES the braces below run
                 * unconditionally.
                 */
                nfnl_unlock(subsys_id);
                request_module("nfnetlink-subsys-%d", subsys_id);
                nfnl_lock(subsys_id);
                ss = nfnl_dereference_protected(subsys_id);
                if (!ss)
#endif
                {
                        nfnl_unlock(subsys_id);
                        netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
                        return kfree_skb(skb);
                }
        }

        /* Batching needs all three transaction hooks. */
        if (!ss->valid_genid || !ss->commit || !ss->abort) {
                nfnl_unlock(subsys_id);
                netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
                return kfree_skb(skb);
        }

        /* Pin the owning module; ss is used after the mutex is released. */
        if (!try_module_get(ss->owner)) {
                nfnl_unlock(subsys_id);
                netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
                return kfree_skb(skb);
        }

        if (!ss->valid_genid(net, genid)) {
                module_put(ss->owner);
                nfnl_unlock(subsys_id);
                netlink_ack(oskb, nlh, -ERESTART, NULL);
                return kfree_skb(skb);
        }

        nfnl_unlock(subsys_id);

        if (nlh->nlmsg_flags & NLM_F_ACK) {
                /* extack is copied by value into the queued entry and has
                 * not been written yet on this path: zero it first so the
                 * ack for the batch-begin message carries no stack garbage.
                 */
                memset(&extack, 0, sizeof(extack));
                nfnl_err_add(&err_list, nlh, 0, &extack);
        }

        while (skb->len >= nlmsg_total_size(0)) {
                int msglen, type;

                if (fatal_signal_pending(current)) {
                        nfnl_err_reset(&err_list);
                        err = -EINTR;
                        status = NFNL_BATCH_FAILURE;
                        goto done;
                }

                memset(&extack, 0, sizeof(extack));
                nlh = nlmsg_hdr(skb);
                err = 0;

                if (nlh->nlmsg_len < NLMSG_HDRLEN ||
                    skb->len < nlh->nlmsg_len ||
                    nlmsg_len(nlh) < sizeof(struct nfgenmsg)) {
                        nfnl_err_reset(&err_list);
                        status |= NFNL_BATCH_FAILURE;
                        goto done;
                }

                /* Only requests are handled by the kernel */
                if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) {
                        err = -EINVAL;
                        goto ack;
                }

                type = nlh->nlmsg_type;
                if (type == NFNL_MSG_BATCH_BEGIN) {
                        /* Malformed: Batch begin twice */
                        nfnl_err_reset(&err_list);
                        status |= NFNL_BATCH_FAILURE;
                        goto done;
                } else if (type == NFNL_MSG_BATCH_END) {
                        status |= NFNL_BATCH_DONE;
                        goto done;
                } else if (type < NLMSG_MIN_TYPE) {
                        err = -EINVAL;
                        goto ack;
                }

                /* We only accept a batch with messages for the same
                 * subsystem.
                 */
                if (NFNL_SUBSYS_ID(type) != subsys_id) {
                        err = -EINVAL;
                        goto ack;
                }

                nc = nfnetlink_find_client(type, ss);
                if (!nc) {
                        err = -EINVAL;
                        goto ack;
                }

                /* Only batch-type callbacks may run inside a batch. */
                if (nc->type != NFNL_CB_BATCH) {
                        err = -EINVAL;
                        goto ack;
                }

                {
                        int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
                        struct nfnl_net *nfnlnet = nfnl_pernet(net);
                        struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
                        struct nlattr *attr = (void *)nlh + min_len;
                        u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
                        int attrlen = nlh->nlmsg_len - min_len;
                        struct nfnl_info info = {
                                .net        = net,
                                .sk        = nfnlnet->nfnl,
                                .nlh        = nlh,
                                .nfmsg        = nlmsg_data(nlh),
                                .extack        = &extack,
                        };

                        /* Sanity-check NFNL_MAX_ATTR_COUNT */
                        if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
                                err = -ENOMEM;
                                goto ack;
                        }

                        err = nla_parse_deprecated(cda,
                                                   ss->cb[cb_id].attr_count,
                                                   attr, attrlen,
                                                   ss->cb[cb_id].policy, NULL);
                        if (err < 0)
                                goto ack;

                        err = nc->call(skb, &info, (const struct nlattr **)cda);

                        /* The lock was released to autoload some module, we
                         * have to abort and start from scratch using the
                         * original skb.
                         */
                        if (err == -EAGAIN) {
                                status |= NFNL_BATCH_REPLAY;
                                goto done;
                        }
                }
ack:
                if (nlh->nlmsg_flags & NLM_F_ACK || err) {
                        /* Errors are delivered once the full batch has been
                         * processed, this avoids that the same error is
                         * reported several times when replaying the batch.
                         */
                        if (err == -ENOMEM ||
                            nfnl_err_add(&err_list, nlh, err, &extack) < 0) {
                                /* We failed to enqueue an error, reset the
                                 * list of errors and send OOM to userspace
                                 * pointing to the batch header.
                                 */
                                nfnl_err_reset(&err_list);
                                netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM,
                                            NULL);
                                status |= NFNL_BATCH_FAILURE;
                                goto done;
                        }
                        /* We don't stop processing the batch on errors, thus,
                         * userspace gets all the errors that the batch
                         * triggers.
                         */
                        if (err)
                                status |= NFNL_BATCH_FAILURE;
                }

                /* Advance to the next message, clamped to what is left. */
                msglen = NLMSG_ALIGN(nlh->nlmsg_len);
                if (msglen > skb->len)
                        msglen = skb->len;
                skb_pull(skb, msglen);
        }
done:
        if (status & NFNL_BATCH_REPLAY) {
                ss->abort(net, oskb, NFNL_ABORT_AUTOLOAD);
                nfnl_err_reset(&err_list);
                kfree_skb(skb);
                module_put(ss->owner);
                goto replay;
        } else if (status == NFNL_BATCH_DONE) {
                /* Clean run that ended in BATCH_END: commit. */
                err = ss->commit(net, oskb);
                if (err == -EAGAIN) {
                        status |= NFNL_BATCH_REPLAY;
                        goto done;
                } else if (err) {
                        ss->abort(net, oskb, NFNL_ABORT_NONE);
                        netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL);
                } else if (nlh->nlmsg_flags & NLM_F_ACK) {
                        /* Ack for the batch-end message: start from a
                         * clean extack, as for the batch-begin ack.
                         */
                        memset(&extack, 0, sizeof(extack));
                        nfnl_err_add(&err_list, nlh, 0, &extack);
                }
        } else {
                enum nfnl_abort_action abort_action;

                if (status & NFNL_BATCH_FAILURE)
                        abort_action = NFNL_ABORT_NONE;
                else
                        abort_action = NFNL_ABORT_VALIDATE;

                err = ss->abort(net, oskb, abort_action);
                if (err == -EAGAIN) {
                        nfnl_err_reset(&err_list);
                        kfree_skb(skb);
                        module_put(ss->owner);
                        status |= NFNL_BATCH_FAILURE;
                        goto replay_abort;
                }
        }

        nfnl_err_deliver(&err_list, oskb);
        kfree_skb(skb);
        module_put(ss->owner);
}

/* Attribute policy for the batch-begin message: the only attribute described
 * is NFNL_BATCH_GENID, typed NLA_U32 (read with nla_get_be32() by
 * nfnetlink_rcv_skb_batch()).
 */
static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = {
        [NFNL_BATCH_GENID]        = { .type = NLA_U32 },
};

/* Handle an NFNL_MSG_BATCH_BEGIN message.
 *
 * Parses the optional NFNL_BATCH_GENID attribute, reads the target subsystem
 * from nfgenmsg->res_id, pulls the batch-begin message off @skb and hands the
 * remainder to nfnetlink_rcv_batch(). A buffer too short to contain
 * nlmsghdr + nfgenmsg is dropped silently; an attribute parse error is
 * acked back to the sender.
 */
static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        struct nlattr *tb[NFNL_BATCH_MAX + 1];
        struct nfgenmsg *nfmsg;
        struct nlattr *attr;
        int hdr_len, attrlen;
        int pull_len, err;
        u32 gen_id = 0;
        u16 res_id;

        /* Length of the batch-begin message, clamped to the buffer. */
        pull_len = NLMSG_ALIGN(nlh->nlmsg_len);
        if (pull_len > skb->len)
                pull_len = skb->len;

        if (skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
                return;

        /* Attributes follow the aligned nlmsghdr + nfgenmsg prefix. */
        hdr_len = nlmsg_total_size(sizeof(struct nfgenmsg));
        attr = (void *)nlh + hdr_len;
        attrlen = nlh->nlmsg_len - hdr_len;

        err = nla_parse_deprecated(tb, NFNL_BATCH_MAX, attr, attrlen,
                                   nfnl_batch_policy, NULL);
        if (err < 0)
                return netlink_ack(skb, nlh, err, NULL);

        if (tb[NFNL_BATCH_GENID])
                gen_id = ntohl(nla_get_be32(tb[NFNL_BATCH_GENID]));

        nfmsg = nlmsg_data(nlh);
        skb_pull(skb, pull_len);

        /* Work around old nft using host byte order */
        res_id = ntohs(nfmsg->res_id);
        if (nfmsg->res_id == (__force __be16)NFNL_SUBSYS_NFTABLES)
                res_id = NFNL_SUBSYS_NFTABLES;

        nfnetlink_rcv_batch(skb, nlh, res_id, gen_id);
}

/* Input handler of the nfnetlink kernel socket (installed as cfg.input).
 *
 * Silently drops buffers whose first header is truncated or inconsistent,
 * rejects senders lacking CAP_NET_ADMIN with -EPERM, then dispatches either
 * to the batch path or to per-message processing via netlink_rcv_skb().
 */
static void nfnetlink_rcv(struct sk_buff *skb)
{
        struct nlmsghdr *nlh;

        /* The header may only be inspected once it is known to fit. */
        if (skb->len < NLMSG_HDRLEN)
                return;

        nlh = nlmsg_hdr(skb);
        if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
                return;

        if (!netlink_net_capable(skb, CAP_NET_ADMIN)) {
                netlink_ack(skb, nlh, -EPERM, NULL);
                return;
        }

        if (nlh->nlmsg_type != NFNL_MSG_BATCH_BEGIN) {
                netlink_rcv_skb(skb, nfnetlink_rcv_msg);
                return;
        }

        nfnetlink_rcv_skb_batch(skb, nlh);
}

/* Record that conntrack event group @group gained a listener by setting its
 * bit in nf_ctnetlink_has_listener. Compiles to nothing without
 * CONFIG_NF_CONNTRACK_EVENTS. @net is not used.
 */
static void nfnetlink_bind_event(struct net *net, unsigned int group)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
        int subsys, mask;
        u8 cur;

        /* All NFNLGRP_CONNTRACK_* group bits fit into u8.
         * The other groups are not relevant and can be ignored.
         */
        if (group >= 8)
                return;

        subsys = nfnl_group2type[group];
        if (subsys != NFNL_SUBSYS_CTNETLINK &&
            subsys != NFNL_SUBSYS_CTNETLINK_EXP)
                return;

        mask = 1 << group;

        spin_lock(&nfnl_grp_active_lock);
        cur = READ_ONCE(nf_ctnetlink_has_listener);
        if (!(cur & mask)) {
                cur |= mask;

                /* read concurrently without nfnl_grp_active_lock held. */
                WRITE_ONCE(nf_ctnetlink_has_listener, cur);
        }
        spin_unlock(&nfnl_grp_active_lock);
#endif
}

/* Bind hook of the nfnetlink socket (installed as cfg.bind).
 *
 * For a valid group, kicks off a non-blocking module load if the owning
 * subsystem is not registered yet and updates the conntrack listener mask.
 * Always returns 0, including for out-of-range groups.
 */
static int nfnetlink_bind(struct net *net, int group)
{
        const struct nfnetlink_subsystem *ss;
        int subsys;

        if (group > NFNLGRP_NONE && group <= NFNLGRP_MAX) {
                subsys = nfnl_group2type[group];

                /* Only the NULL-ness of the result is used, so the pointer
                 * may be tested after leaving the read-side section.
                 */
                rcu_read_lock();
                ss = nfnetlink_get_subsys(subsys << 8);
                rcu_read_unlock();
                if (!ss)
                        request_module_nowait("nfnetlink-subsys-%d", subsys);

                nfnetlink_bind_event(net, group);
        }

        return 0;
}

/* Unbind hook of the nfnetlink socket (installed as cfg.unbind).
 *
 * For conntrack event groups, clears the group's bit in
 * nf_ctnetlink_has_listener once nfnetlink_has_listeners() reports no
 * listener left in @net. Compiles to nothing without
 * CONFIG_NF_CONNTRACK_EVENTS.
 */
static void nfnetlink_unbind(struct net *net, int group)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
        int subsys, mask;

        if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX)
                return;

        subsys = nfnl_group2type[group];
        if (subsys != NFNL_SUBSYS_CTNETLINK &&
            subsys != NFNL_SUBSYS_CTNETLINK_EXP)
                return;

        /* ctnetlink_has_listener is u8 */
        if (group >= 8)
                return;

        mask = 1 << group;

        spin_lock(&nfnl_grp_active_lock);
        if (!nfnetlink_has_listeners(net, group)) {
                u8 cur = READ_ONCE(nf_ctnetlink_has_listener);

                cur &= ~mask;

                /* read concurrently without nfnl_grp_active_lock held. */
                WRITE_ONCE(nf_ctnetlink_has_listener, cur);
        }
        spin_unlock(&nfnl_grp_active_lock);
#endif
}

/* Per-netns init: create the NETLINK_NETFILTER kernel socket for @net and
 * store it in the per-netns area. Returns 0, or -ENOMEM if
 * netlink_kernel_create() returns NULL.
 */
static int __net_init nfnetlink_net_init(struct net *net)
{
        struct netlink_kernel_cfg cfg = {
                .groups = NFNLGRP_MAX,
                .input  = nfnetlink_rcv,
                .bind   = nfnetlink_bind,
                .unbind = nfnetlink_unbind,
        };
        struct nfnl_net *nfnlnet = nfnl_pernet(net);

        nfnlnet->nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);

        return nfnlnet->nfnl ? 0 : -ENOMEM;
}

/* Per-netns teardown: release the nfnetlink kernel socket of every
 * namespace on @net_exit_list.
 */
static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list)
{
        struct net *net;

        list_for_each_entry(net, net_exit_list, exit_list)
                netlink_kernel_release(nfnl_pernet(net)->nfnl);
}

/* Per-netns registration: .id/.size describe the struct nfnl_net area that
 * nfnl_pernet() retrieves through nfnetlink_pernet_id; .init creates the
 * kernel socket and .exit_batch releases it.
 */
static struct pernet_operations nfnetlink_net_ops = {
        .init                = nfnetlink_net_init,
        .exit_batch        = nfnetlink_net_exit_batch,
        .id                = &nfnetlink_pernet_id,
        .size                = sizeof(struct nfnl_net),
};

/* Module init: check that every multicast group maps to a real subsystem,
 * initialise the per-subsystem mutexes with their lockdep names/keys and
 * register the per-netns operations. Returns the result of
 * register_pernet_subsys().
 */
static int __init nfnetlink_init(void)
{
        int grp, id;

        /* A group left at NFNL_SUBSYS_NONE means nfnl_group2type[] was not
         * updated for it.
         */
        for (grp = NFNLGRP_NONE + 1; grp <= NFNLGRP_MAX; grp++)
                BUG_ON(nfnl_group2type[grp] == NFNL_SUBSYS_NONE);

        for (id = 0; id < NFNL_SUBSYS_COUNT; id++) {
                __mutex_init(&table[id].mutex, nfnl_lockdep_names[id],
                             &nfnl_lockdep_keys[id]);
        }

        return register_pernet_subsys(&nfnetlink_net_ops);
}

/* Module exit: undo the register_pernet_subsys() done in nfnetlink_init(). */
static void __exit nfnetlink_exit(void)
{
        unregister_pernet_subsys(&nfnetlink_net_ops);
}
module_init(nfnetlink_init);
module_exit(nfnetlink_exit);

















































































    1 

























































































    1 


















    1 





    4 

    4 














    2 





    2 



















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * include/net/l3mdev.h - L3 master device API
 * Copyright (c) 2015 Cumulus Networks
 * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com>
 */
#ifndef _NET_L3MDEV_H_
#define _NET_L3MDEV_H_

#include <net/dst.h>
#include <net/fib_rules.h>

/* Kinds of L3 master device known to the table-lookup registry.
 * __L3MDEV_TYPE_MAX is a terminator, not a real type; L3MDEV_TYPE_MAX below
 * is the highest valid value.
 */
enum l3mdev_type {
        L3MDEV_TYPE_UNSPEC,
        L3MDEV_TYPE_VRF,
        __L3MDEV_TYPE_MAX
};

/* Highest valid enum l3mdev_type value. */
#define L3MDEV_TYPE_MAX (__L3MDEV_TYPE_MAX - 1)

/* Lookup callback taking a network namespace and a table id; this is the
 * type handed to l3mdev_table_lookup_register()/_unregister().
 * NOTE(review): the int result is presumably an ifindex or a negative errno,
 * mirroring l3mdev_ifindex_lookup_by_table_id() - confirm against the
 * implementation.
 */
typedef int (*lookup_by_table_id_t)(struct net *net, u32 table_id);

/**
 * struct l3mdev_ops - l3mdev operations
 *
 * @l3mdev_fib_table: Get FIB table id to use for lookups
 *
 * @l3mdev_l3_rcv:    Hook in L3 receive path. Optional: l3mdev_l3_rcv()
 *                    checks it for NULL, calls it with the master device
 *                    and continues with the skb it returns.
 *
 * @l3mdev_l3_out:    Hook in L3 output path. Optional: l3mdev_l3_out()
 *                    checks it for NULL, calls it with the master device
 *                    and continues with the skb it returns.
 *
 * @l3mdev_link_scope_lookup: IPv6 lookup for linklocal and mcast destinations
 */

struct l3mdev_ops {
        u32                (*l3mdev_fib_table)(const struct net_device *dev);
        struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev,
                                          struct sk_buff *skb, u16 proto);
        struct sk_buff * (*l3mdev_l3_out)(struct net_device *dev,
                                          struct sock *sk, struct sk_buff *skb,
                                          u16 proto);

        /* IPv6 ops */
        struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev,
                                                 struct flowi6 *fl6);
};

#ifdef CONFIG_NET_L3_MASTER_DEV

/* Out-of-line l3mdev API, declared only with CONFIG_NET_L3_MASTER_DEV.
 * The #else branch of this header supplies inline stubs with the same
 * signatures for kernels built without it.
 */
int l3mdev_table_lookup_register(enum l3mdev_type l3type,
                                 lookup_by_table_id_t fn);

void l3mdev_table_lookup_unregister(enum l3mdev_type l3type,
                                    lookup_by_table_id_t fn);

int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net,
                                      u32 table_id);

int l3mdev_fib_rule_match(struct net *net, struct flowi *fl,
                          struct fib_lookup_arg *arg);

void l3mdev_update_flow(struct net *net, struct flowi *fl);

int l3mdev_master_ifindex_rcu(const struct net_device *dev);

/* Convenience wrapper: runs l3mdev_master_ifindex_rcu() inside its own
 * rcu_read_lock()/rcu_read_unlock() pair and returns its result.
 */
static inline int l3mdev_master_ifindex(struct net_device *dev)
{
        int master_idx;

        rcu_read_lock();
        master_idx = l3mdev_master_ifindex_rcu(dev);
        rcu_read_unlock();

        return master_idx;
}

/* Resolve @ifindex to a device in @net and return what
 * l3mdev_master_ifindex_rcu() reports for it. Returns 0 when @ifindex is 0
 * or no such device exists.
 */
static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex)
{
        int master_idx = 0;

        if (likely(ifindex)) {
                struct net_device *dev;

                rcu_read_lock();
                dev = dev_get_by_index_rcu(net, ifindex);
                master_idx = dev ? l3mdev_master_ifindex_rcu(dev) : 0;
                rcu_read_unlock();
        }

        return master_idx;
}

/* Return the L3 master device for @_dev: the device itself if it is an L3
 * master, its master upper device if it is an L3 slave, NULL otherwise
 * (including when @_dev is NULL).
 */
static inline
struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev)
{
        /* netdev_master_upper_dev_get_rcu() walks the upper dev list with
         * list_first_or_null_rcu(), which does not accept a const argument.
         * Nothing is modified here; the cast only drops the qualifier so
         * the master can be fetched from that list.
         */
        struct net_device *dev = (struct net_device *)_dev;

        if (!dev)
                return NULL;

        if (netif_is_l3_master(dev))
                return dev;

        if (netif_is_l3_slave(dev))
                return netdev_master_upper_dev_get_rcu(dev);

        return NULL;
}

int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex);

/* Convenience wrapper: runs l3mdev_master_upper_ifindex_by_index_rcu()
 * inside its own rcu_read_lock()/rcu_read_unlock() pair and returns its
 * result.
 */
static inline
int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex)
{
        int upper_idx;

        rcu_read_lock();
        upper_idx = l3mdev_master_upper_ifindex_by_index_rcu(net, ifindex);
        rcu_read_unlock();

        return upper_idx;
}

u32 l3mdev_fib_table_rcu(const struct net_device *dev);
u32 l3mdev_fib_table_by_index(struct net *net, int ifindex);

/* Convenience wrapper: runs l3mdev_fib_table_rcu() inside its own
 * rcu_read_lock()/rcu_read_unlock() pair and returns its result.
 */
static inline u32 l3mdev_fib_table(const struct net_device *dev)
{
        u32 table;

        rcu_read_lock();
        table = l3mdev_fib_table_rcu(dev);
        rcu_read_unlock();

        return table;
}

/* Report whether the device with index @ifindex in @net is an L3 master.
 * Returns false for ifindex 0 or when no such device exists.
 */
static inline bool netif_index_is_l3_master(struct net *net, int ifindex)
{
        struct net_device *dev;
        bool is_master;

        if (!ifindex)
                return false;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        is_master = dev && netif_is_l3_master(dev);
        rcu_read_unlock();

        return is_master;
}

/* Defined out of line when CONFIG_NET_L3_MASTER_DEV is set; the stub in the
 * #else branch of this header returns NULL instead.
 */
struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6);

/* L3 receive hook dispatcher.
 *
 * Picks the master for skb->dev - its master upper device if it is an L3
 * slave, the device itself if it is an L3 master or has an L3 rx handler -
 * and, when that master provides ->l3mdev_l3_rcv, returns whatever the hook
 * returns. In every other case @skb is returned unchanged.
 */
static inline
struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto)
{
        struct net_device *dev = skb->dev;
        struct net_device *master;

        if (netif_is_l3_slave(dev))
                master = netdev_master_upper_dev_get_rcu(dev);
        else if (netif_is_l3_master(dev) || netif_has_l3_rx_handler(dev))
                master = dev;
        else
                return skb;

        if (!master || !master->l3mdev_ops->l3mdev_l3_rcv)
                return skb;

        return master->l3mdev_ops->l3mdev_l3_rcv(master, skb, proto);
}

/* IPv4 entry point: l3mdev_l3_rcv() with proto AF_INET. */
static inline
struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
{
        return l3mdev_l3_rcv(skb, AF_INET);
}

/* IPv6 entry point: l3mdev_l3_rcv() with proto AF_INET6. */
static inline
struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
{
        return l3mdev_l3_rcv(skb, AF_INET6);
}

/* L3 output hook dispatcher.
 *
 * If the skb's dst device is an L3 slave whose master provides
 * ->l3mdev_l3_out, returns whatever the hook returns; otherwise @skb is
 * returned unchanged.
 */
static inline
struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct net_device *master;

        if (!netif_is_l3_slave(dev))
                return skb;

        master = netdev_master_upper_dev_get_rcu(dev);
        if (!master || !master->l3mdev_ops->l3mdev_l3_out)
                return skb;

        return master->l3mdev_ops->l3mdev_l3_out(master, sk, skb, proto);
}

/* IPv4 entry point: l3mdev_l3_out() with proto AF_INET. */
static inline
struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb)
{
        return l3mdev_l3_out(sk, skb, AF_INET);
}

/* IPv6 entry point: l3mdev_l3_out() with proto AF_INET6. */
static inline
struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb)
{
        return l3mdev_l3_out(sk, skb, AF_INET6);
}
#else

/* Stub for builds without CONFIG_NET_L3_MASTER_DEV: always returns 0. */
static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev)
{
        return 0;
}
/* Stub for builds without CONFIG_NET_L3_MASTER_DEV: always returns 0. */
static inline int l3mdev_master_ifindex(struct net_device *dev)
{
        return 0;
}

/* Stub for builds without CONFIG_NET_L3_MASTER_DEV: always returns 0. */
static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex)
{
        return 0;
}

/* Stub for builds without CONFIG_NET_L3_MASTER_DEV: always returns 0. */
static inline
int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
{
        return 0;
}
/* Stub for builds without CONFIG_NET_L3_MASTER_DEV: always returns 0. */
static inline
int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex)
{
        return 0;
}

/* Stub for builds without CONFIG_NET_L3_MASTER_DEV: always returns NULL. */
static inline
struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev)
{
        return NULL;
}

/* Stub for builds without CONFIG_NET_L3_MASTER_DEV: always returns 0. */
static inline u32 l3mdev_fib_table_rcu(const struct net_device *dev)
{
        return 0;
}
/* Stub for builds without CONFIG_NET_L3_MASTER_DEV: always returns 0. */
static inline u32 l3mdev_fib_table(const struct net_device *dev)
{
        return 0;
}
/* Stub for builds without CONFIG_NET_L3_MASTER_DEV: always returns 0. */
static inline u32 l3mdev_fib_table_by_index(struct net *net, int ifindex)
{
        return 0;
}

/* Stub for builds without CONFIG_NET_L3_MASTER_DEV: always returns false. */
static inline bool netif_index_is_l3_master(struct net *net, int ifindex)
{
        return false;
}

/* Stub for builds without CONFIG_NET_L3_MASTER_DEV: always returns NULL. */
static inline
struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6)
{
        return NULL;
}

/* Stub (#else branch of this header): hands @skb back unchanged. */
static inline
struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb)
{
        return skb;
}

/* Stub (#else branch of this header): hands @skb back unchanged. */
static inline
struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
{
        return skb;
}

/* Stub (#else branch of this header): @sk is ignored, @skb is returned unchanged. */
static inline
struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb)
{
        return skb;
}

/* Stub (#else branch of this header): @sk is ignored, @skb is returned unchanged. */
static inline
struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb)
{
        return skb;
}

/* Stub (#else branch of this header): nothing is registered, always -EOPNOTSUPP. */
static inline
int l3mdev_table_lookup_register(enum l3mdev_type l3type,
                                 lookup_by_table_id_t fn)
{
        return -EOPNOTSUPP;
}

/* Stub (#else branch of this header): intentionally empty, does nothing. */
static inline
void l3mdev_table_lookup_unregister(enum l3mdev_type l3type,
                                    lookup_by_table_id_t fn)
{
}

/* Stub (#else branch of this header): no lookup is done, always -ENODEV. */
static inline
int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net,
                                      u32 table_id)
{
        return -ENODEV;
}

/* Stub (#else branch of this header): arguments are ignored, always returns 1. */
static inline
int l3mdev_fib_rule_match(struct net *net, struct flowi *fl,
                          struct fib_lookup_arg *arg)
{
        return 1;
}
/* Stub (#else branch of this header): intentionally empty, @fl is left untouched. */
static inline
void l3mdev_update_flow(struct net *net, struct flowi *fl)
{
}
#endif

#endif /* _NET_L3MDEV_H_ */




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   11 






    4 








   12 








    4 














































































































































































































































































































































































    2 







    1 
    2 
    2 




























































































    1 



    2 




    1 



    2 




    2 








    2 








    2 

































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
/* SPDX-License-Identifier: GPL-2.0+ */
#ifndef _LINUX_XARRAY_H
#define _LINUX_XARRAY_H
/*
 * eXtensible Arrays
 * Copyright (c) 2017 Microsoft Corporation
 * Author: Matthew Wilcox <willy@infradead.org>
 *
 * See Documentation/core-api/xarray.rst for how to use the XArray.
 */

#include <linux/bitmap.h>
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/kconfig.h>
#include <linux/limits.h>
#include <linux/lockdep.h>
#include <linux/rcupdate.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct list_lru;

/*
 * The bottom two bits of the entry determine how the XArray interprets
 * the contents:
 *
 * 00: Pointer entry
 * 10: Internal entry
 * x1: Value entry or tagged pointer
 *
 * Attempting to store internal entries in the XArray is a bug.
 *
 * Most internal entries are pointers to the next node in the tree.
 * The following internal entries have a special meaning:
 *
 * 0-62: Sibling entries
 * 256: Retry entry
 * 257: Zero entry
 *
 * Errors are also represented as internal entries, but use the negative
 * space (-4094 to -2).  They're never stored in the slots array; only
 * returned by the normal API.
 */

/*
 * Bits available for the integer in a value entry: xa_mk_value() shifts the
 * integer left by one and uses the freed low bit as the value tag.
 */
#define BITS_PER_XA_VALUE        (BITS_PER_LONG - 1)

/**
 * xa_mk_value() - Create an XArray entry from an integer.
 * @v: Value to store in XArray.
 *
 * Context: Any context.
 * Return: An entry suitable for storing in the XArray.
 */
static inline void *xa_mk_value(unsigned long v)
{
        /* Shift the integer up by one and set bit 0 as the value tag. */
        unsigned long encoded = (v << 1) | 1;

        /* The top bit is lost by the shift, so warn if it was set. */
        WARN_ON((long)v < 0);

        return (void *)encoded;
}

/**
 * xa_to_value() - Get value stored in an XArray entry.
 * @entry: XArray entry.
 *
 * Context: Any context.
 * Return: The value stored in the XArray entry.
 */
static inline unsigned long xa_to_value(const void *entry)
{
        unsigned long raw = (unsigned long)entry;

        /* Discard the low tag bit to recover the stored integer. */
        return raw >> 1;
}

/**
 * xa_is_value() - Determine if an entry is a value.
 * @entry: XArray entry.
 *
 * Context: Any context.
 * Return: True if the entry is a value, false if it is a pointer.
 */
static inline bool xa_is_value(const void *entry)
{
        unsigned long raw = (unsigned long)entry;

        /* Bit 0 set means the entry is not a plain pointer. */
        return (raw & 1) != 0;
}

/**
 * xa_tag_pointer() - Create an XArray entry for a tagged pointer.
 * @p: Plain pointer.
 * @tag: Tag value (0, 1 or 3).
 *
 * If the user of the XArray prefers, they can tag their pointers instead
 * of storing value entries.  Three tags are available (0, 1 and 3).
 * These are distinct from the xa_mark_t as they are not replicated up
 * through the array and cannot be searched for.
 *
 * Context: Any context.
 * Return: An XArray entry.
 */
static inline void *xa_tag_pointer(void *p, unsigned long tag)
{
        unsigned long bits = (unsigned long)p;

        /* OR the tag into the pointer's bit pattern. */
        bits |= tag;
        return (void *)bits;
}

/**
 * xa_untag_pointer() - Turn an XArray entry into a plain pointer.
 * @entry: XArray entry.
 *
 * If you have stored a tagged pointer in the XArray, call this function
 * to get the untagged version of the pointer.
 *
 * Context: Any context.
 * Return: A pointer.
 */
static inline void *xa_untag_pointer(void *entry)
{
        const unsigned long tag_mask = 3UL;
        unsigned long bits = (unsigned long)entry;

        /* Clear the two low bits to recover the plain pointer. */
        return (void *)(bits & ~tag_mask);
}

/**
 * xa_pointer_tag() - Get the tag stored in an XArray entry.
 * @entry: XArray entry.
 *
 * If you have stored a tagged pointer in the XArray, call this function
 * to get the tag of that pointer.
 *
 * Context: Any context.
 * Return: A tag.
 */
static inline unsigned int xa_pointer_tag(void *entry)
{
        unsigned long bits = (unsigned long)entry;

        /* The tag is whatever sits in the two least significant bits. */
        return bits & 3UL;
}

/*
 * xa_mk_internal() - Create an internal entry.
 * @v: Value to turn into an internal entry.
 *
 * Internal entries are used for a number of purposes.  Entries 0-255 are
 * used for sibling entries (only 0-62 are used by the current code).  256
 * is used for the retry entry.  257 is used for the reserved / zero entry.
 * Negative internal entries are used to represent errnos.  Node pointers
 * are also tagged as internal entries in some situations.
 *
 * Context: Any context.
 * Return: An XArray internal entry corresponding to this value.
 */
static inline void *xa_mk_internal(unsigned long v)
{
        unsigned long encoded = v << 2;

        /* Low bits '10' identify the entry as internal. */
        encoded |= 2;
        return (void *)encoded;
}

/*
 * xa_to_internal() - Extract the value from an internal entry.
 * @entry: XArray entry.
 *
 * Context: Any context.
 * Return: The value which was stored in the internal entry.
 */
static inline unsigned long xa_to_internal(const void *entry)
{
        unsigned long raw = (unsigned long)entry;

        /* Unsigned shift: drops the two tag bits, no sign extension. */
        return raw >> 2;
}

/*
 * xa_is_internal() - Is the entry an internal entry?
 * @entry: XArray entry.
 *
 * Context: Any context.
 * Return: %true if the entry is an internal entry.
 */
static inline bool xa_is_internal(const void *entry)
{
        unsigned long low_bits = (unsigned long)entry & 3;

        /* Internal entries carry '10' in their two low bits. */
        return low_bits == 2;
}

#define XA_ZERO_ENTRY                xa_mk_internal(257)

/**
 * xa_is_zero() - Is the entry a zero entry?
 * @entry: Entry retrieved from the XArray
 *
 * The normal API will return NULL as the contents of a slot containing
 * a zero entry.  You can only see zero entries by using the advanced API.
 *
 * Return: %true if the entry is a zero entry.
 */
static inline bool xa_is_zero(const void *entry)
{
        const void *zero_entry = XA_ZERO_ENTRY;

        /* Annotated as the unlikely outcome. */
        return unlikely(entry == zero_entry);
}

/**
 * xa_is_err() - Report whether an XArray operation returned an error
 * @entry: Result from calling an XArray function
 *
 * If an XArray operation cannot complete an operation, it will return
 * a special value indicating an error.  This function tells you
 * whether an error occurred; xa_err() tells you which error occurred.
 *
 * Context: Any context.
 * Return: %true if the entry indicates an error.
 */
static inline bool xa_is_err(const void *entry)
{
        /* Lowest internal entry that still encodes an errno. */
        const void *first_err = xa_mk_internal(-MAX_ERRNO);
        bool encodes_errno = xa_is_internal(entry) && entry >= first_err;

        return unlikely(encodes_errno);
}

/**
 * xa_err() - Turn an XArray result into an errno.
 * @entry: Result from calling an XArray function.
 *
 * If an XArray operation cannot complete an operation, it will return
 * a special pointer value which encodes an errno.  This function extracts
 * the errno from the pointer value, or returns 0 if the pointer does not
 * represent an errno.
 *
 * Context: Any context.
 * Return: A negative errno or 0.
 */
static inline int xa_err(void *entry)
{
        if (!xa_is_err(entry))
                return 0;

        /*
         * Shift as a signed long: xa_to_internal() shifts an unsigned
         * value and so would not sign-extend the negative errno.
         */
        return (long)entry >> 2;
}

/**
 * struct xa_limit - Represents a range of IDs.
 * @min: The lowest ID to allocate (inclusive).
 * @max: The maximum ID to allocate (inclusive).
 *
 * This structure is used either directly or via the XA_LIMIT() macro
 * to communicate the range of IDs that are valid for allocation.
 * Three common ranges are predefined for you:
 * * xa_limit_32b        - [0 - UINT_MAX]
 * * xa_limit_31b        - [0 - INT_MAX]
 * * xa_limit_16b        - [0 - USHRT_MAX]
 */
struct xa_limit {
        /* Highest ID that may be allocated (inclusive). */
        u32 max;
        /* Lowest ID that may be allocated (inclusive). */
        u32 min;
};

/* Build a struct xa_limit compound literal with the given bounds. */
#define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max }

/* Predefined ranges, each starting at 0 and ending at the named type's maximum. */
#define xa_limit_32b        XA_LIMIT(0, UINT_MAX)
#define xa_limit_31b        XA_LIMIT(0, INT_MAX)
#define xa_limit_16b        XA_LIMIT(0, USHRT_MAX)

/*
 * Mark type and its values.  XA_PRESENT is the filter the xa_for_each_range()
 * iterator passes to xa_find() / xa_find_after().  XA_FREE_MARK aliases
 * XA_MARK_0 and is the mark folded into XA_FLAGS_ALLOC.
 */
typedef unsigned __bitwise xa_mark_t;
#define XA_MARK_0                ((__force xa_mark_t)0U)
#define XA_MARK_1                ((__force xa_mark_t)1U)
#define XA_MARK_2                ((__force xa_mark_t)2U)
#define XA_PRESENT                ((__force xa_mark_t)8U)
#define XA_MARK_MAX                XA_MARK_2
#define XA_FREE_MARK                XA_MARK_0

/*
 * Lock flavours.  The numeric values are reused as the XA_FLAGS_LOCK_IRQ and
 * XA_FLAGS_LOCK_BH flag bits.
 */
enum xa_lock_type {
        XA_LOCK_IRQ = 1,
        XA_LOCK_BH = 2,
};

/*
 * Values for xa_flags.  The radix tree stores its GFP flags in the xa_flags,
 * and we remain compatible with that.
 */
#define XA_FLAGS_LOCK_IRQ        ((__force gfp_t)XA_LOCK_IRQ)
#define XA_FLAGS_LOCK_BH        ((__force gfp_t)XA_LOCK_BH)
#define XA_FLAGS_TRACK_FREE        ((__force gfp_t)4U)
#define XA_FLAGS_ZERO_BUSY        ((__force gfp_t)8U)
#define XA_FLAGS_ALLOC_WRAPPED        ((__force gfp_t)16U)
#define XA_FLAGS_ACCOUNT        ((__force gfp_t)32U)
/* Flag bit for @mark: bit number (__GFP_BITS_SHIFT + mark) of xa_flags. */
#define XA_FLAGS_MARK(mark)        ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \
                                                (__force unsigned)(mark)))

/* ALLOC is for a normal 0-based alloc.  ALLOC1 is for a 1-based alloc. */
#define XA_FLAGS_ALLOC        (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK))
#define XA_FLAGS_ALLOC1        (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY)

/**
 * struct xarray - The anchor of the XArray.
 * @xa_lock: Lock that protects the contents of the XArray.
 *
 * To use the xarray, define it statically or embed it in your data structure.
 * It is a very small data structure, so it does not usually make sense to
 * allocate it separately and keep a pointer to it in your data structure.
 *
 * You may use the xa_lock to protect your own data structures as well.
 */
/*
 * If all of the entries in the array are NULL, @xa_head is a NULL pointer.
 * If the only non-NULL entry in the array is at index 0, @xa_head is that
 * entry.  If any other entry in the array is non-NULL, @xa_head points
 * to an @xa_node.
 */
struct xarray {
        spinlock_t        xa_lock;
/* private: The rest of the data structure is not to be used directly. */
        /* XA_FLAGS_* bits, including the per-mark bits tested by xa_marked(). */
        gfp_t                xa_flags;
        /* NULL while the array is empty (this is what xa_empty() checks). */
        void __rcu *        xa_head;
};

/*
 * Static initialiser for a struct xarray called @name: unlocked xa_lock,
 * xa_flags set to @flags, and a NULL xa_head (no entries).
 */
#define XARRAY_INIT(name, flags) {                                \
        .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock),                \
        .xa_flags = flags,                                        \
        .xa_head = NULL,                                        \
}

/**
 * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags.
 * @name: Identifier to use as the name of your XArray variable.
 * @flags: XA_FLAG values.
 *
 * This is intended for file scope definitions of XArrays.  It declares
 * and initialises an empty XArray with the chosen name and flags.  It is
 * equivalent to calling xa_init_flags() on the array, but it does the
 * initialisation at compile time instead of runtime.
 */
#define DEFINE_XARRAY_FLAGS(name, flags)                                \
        struct xarray name = XARRAY_INIT(name, flags)

/**
 * DEFINE_XARRAY() - Define an XArray.
 * @name: Identifier to use as the name of your XArray variable.
 *
 * This is intended for file scope definitions of XArrays.  It declares
 * and initialises an empty XArray with the chosen name.  It is equivalent
 * to calling xa_init() on the array, but it does the initialisation at
 * compile time instead of runtime.
 */
#define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0)

/**
 * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0.
 * @name: Identifier to use as the name of your XArray variable.
 *
 * This is intended for file scope definitions of allocating XArrays.
 * It expands to DEFINE_XARRAY_FLAGS() with XA_FLAGS_ALLOC.
 * See also DEFINE_XARRAY().
 */
#define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC)

/**
 * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1.
 * @name: Identifier to use as the name of your XArray variable.
 *
 * This is intended for file scope definitions of allocating XArrays.
 * It expands to DEFINE_XARRAY_FLAGS() with XA_FLAGS_ALLOC1.
 * See also DEFINE_XARRAY().
 */
#define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1)

void *xa_load(struct xarray *, unsigned long index);
void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
void *xa_erase(struct xarray *, unsigned long index);
void *xa_store_range(struct xarray *, unsigned long first, unsigned long last,
                        void *entry, gfp_t);
bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t);
void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);
void *xa_find(struct xarray *xa, unsigned long *index,
                unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
void *xa_find_after(struct xarray *xa, unsigned long *index,
                unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
unsigned int xa_extract(struct xarray *, void **dst, unsigned long start,
                unsigned long max, unsigned int n, xa_mark_t);
void xa_destroy(struct xarray *);

/**
 * xa_init_flags() - Initialise an empty XArray with flags.
 * @xa: XArray.
 * @flags: XA_FLAG values.
 *
 * If you need to initialise an XArray with special flags (eg you need
 * to take the lock from interrupt context), use this function instead
 * of xa_init().
 *
 * Context: Any context.
 */
static inline void xa_init_flags(struct xarray *xa, gfp_t flags)
{
        /* An empty array has no head; record the caller's flags as given. */
        xa->xa_head = NULL;
        xa->xa_flags = flags;

        spin_lock_init(&xa->xa_lock);
}

/**
 * xa_init() - Initialise an empty XArray.
 * @xa: XArray.
 *
 * An empty XArray is full of NULL entries.
 *
 * Context: Any context.
 */
static inline void xa_init(struct xarray *xa)
{
        /* Same as xa_init_flags() with no flag bits set. */
        xa_init_flags(xa, 0);
}

/**
 * xa_empty() - Determine if an array has any present entries.
 * @xa: XArray.
 *
 * Context: Any context.
 * Return: %true if the array contains only NULL pointers.
 */
static inline bool xa_empty(const struct xarray *xa)
{
        /* A NULL head means nothing is stored in the array. */
        return !xa->xa_head;
}

/**
 * xa_marked() - Inquire whether any entry in this array has a mark set
 * @xa: Array
 * @mark: Mark value
 *
 * Context: Any context.
 * Return: %true if any entry has this mark set.
 */
static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark)
{
        gfp_t mark_bit = XA_FLAGS_MARK(mark);

        /* The array-wide summary for each mark is kept in xa_flags. */
        return !!(xa->xa_flags & mark_bit);
}

/**
 * xa_for_each_range() - Iterate over a portion of an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @start: First index to retrieve from array.
 * @last: Last index to retrieve from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you
 * want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set
 * to NULL and @index will have a value less than or equal to @last.
 *
 * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_range() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each() iterator instead.
 * The xas_for_each() iterator will expand into more inline code than
 * xa_for_each_range().
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_range(xa, index, entry, start, last)                \
        for (index = start,                                                \
             entry = xa_find(xa, &index, last, XA_PRESENT);                \
             entry;                                                        \
             entry = xa_find_after(xa, &index, last, XA_PRESENT))

/**
 * xa_for_each_start() - Iterate over a portion of an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @start: First index to retrieve from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you
 * want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set
 * to NULL.  There is no upper bound argument: this macro expands to
 * xa_for_each_range() with ULONG_MAX as the last index.
 *
 * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_start() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each() iterator instead.
 * The xas_for_each() iterator will expand into more inline code than
 * xa_for_each_start().
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_start(xa, index, entry, start) \
        xa_for_each_range(xa, index, entry, start, ULONG_MAX)

/**
 * xa_for_each() - Iterate over present entries in an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you want
 * to skip or reprocess indices.  It is safe to modify the array during the
 * iteration.  At the end of the iteration, @entry will be set to NULL.
 * This macro expands to xa_for_each_start() with a start index of 0, so
 * the whole index range is covered.
 *
 * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).  xa_for_each()
 * will spin if it hits a retry entry; if you intend to see retry entries,
 * you should use the xas_for_each() iterator instead.  The xas_for_each()
 * iterator will expand into more inline code than xa_for_each().
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each(xa, index, entry) \
        xa_for_each_start(xa, index, entry, 0)

/**
 * xa_for_each_marked() - Iterate over marked entries in an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @filter: Selection criterion.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  The iteration will skip all entries in the array
 * which do not match @filter.  You may modify @index during the iteration
 * if you want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set to
 * NULL.  The search starts at index 0 and uses ULONG_MAX as its upper bound.
 *
 * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n).
 * You have to handle your own locking with xas_for_each(), and if you have
 * to unlock after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_marked() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each_marked() iterator
 * instead.  The xas_for_each_marked() iterator will expand into more inline
 * code than xa_for_each_marked().
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_marked(xa, index, entry, filter) \
        for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \
             entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter))

/*
 * Locking helpers: each macro applies the spin_lock*() / spin_unlock*()
 * operation of the matching name to the xa_lock embedded in @xa.
 */
#define xa_trylock(xa)                spin_trylock(&(xa)->xa_lock)
#define xa_lock(xa)                spin_lock(&(xa)->xa_lock)
#define xa_unlock(xa)                spin_unlock(&(xa)->xa_lock)
#define xa_lock_bh(xa)                spin_lock_bh(&(xa)->xa_lock)
#define xa_unlock_bh(xa)        spin_unlock_bh(&(xa)->xa_lock)
#define xa_lock_irq(xa)                spin_lock_irq(&(xa)->xa_lock)
#define xa_unlock_irq(xa)        spin_unlock_irq(&(xa)->xa_lock)
#define xa_lock_irqsave(xa, flags) \
                                spin_lock_irqsave(&(xa)->xa_lock, flags)
#define xa_unlock_irqrestore(xa, flags) \
                                spin_unlock_irqrestore(&(xa)->xa_lock, flags)
#define xa_lock_nested(xa, subclass) \
                                spin_lock_nested(&(xa)->xa_lock, subclass)
#define xa_lock_bh_nested(xa, subclass) \
                                spin_lock_bh_nested(&(xa)->xa_lock, subclass)
#define xa_lock_irq_nested(xa, subclass) \
                                spin_lock_irq_nested(&(xa)->xa_lock, subclass)
#define xa_lock_irqsave_nested(xa, flags, subclass) \
                spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass)

/*
 * Versions of the normal API which require the caller to hold the
 * xa_lock.  If the GFP flags allow it, they will drop the lock to
 * allocate memory, then reacquire it afterwards.  These functions
 * may also re-enable interrupts if the XArray flags indicate the
 * locking should be interrupt safe.
 *
 * The inline xa_*(), xa_*_bh() and xa_*_irq() wrappers in this header
 * take the lock in the corresponding flavour and then call these.
 */
void *__xa_erase(struct xarray *, unsigned long index);
void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old,
                void *entry, gfp_t);
int __must_check __xa_insert(struct xarray *, unsigned long index,
                void *entry, gfp_t);
int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry,
                struct xa_limit, gfp_t);
int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry,
                struct xa_limit, u32 *next, gfp_t);
void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);

/**
 * xa_store_bh() - Store an entry in the XArray with softirqs disabled.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Equivalent to xa_store(), except that the array lock is taken with
 * xa_lock_bh() so softirqs are disabled for as long as it is held.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
static inline void *xa_store_bh(struct xarray *xa, unsigned long index,
                void *entry, gfp_t gfp)
{
        void *old;

        might_alloc(gfp);

        xa_lock_bh(xa);
        old = __xa_store(xa, index, entry, gfp);
        xa_unlock_bh(xa);
        return old;
}

/**
 * xa_store_irq() - Store an entry in the XArray with interrupts disabled.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Equivalent to xa_store(), except that the array lock is taken with
 * xa_lock_irq() so interrupts are disabled for as long as it is held.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
static inline void *xa_store_irq(struct xarray *xa, unsigned long index,
                void *entry, gfp_t gfp)
{
        void *old;

        might_alloc(gfp);

        xa_lock_irq(xa);
        old = __xa_store(xa, index, entry, gfp);
        xa_unlock_irq(xa);
        return old;
}

/**
 * xa_erase_bh() - Remove the entry at an index, with softirqs disabled.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * Once this function has returned, a load from @index yields %NULL.
 * When @index belongs to a multi-index entry, every index it covers is
 * erased and none of them remain part of a multi-index entry.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: The entry which used to be at this index.
 */
static inline void *xa_erase_bh(struct xarray *xa, unsigned long index)
{
        void *removed;

        xa_lock_bh(xa);
        removed = __xa_erase(xa, index);
        xa_unlock_bh(xa);
        return removed;
}

/**
 * xa_erase_irq() - Remove the entry at an index, with interrupts disabled.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * Once this function has returned, a load from @index yields %NULL.
 * When @index belongs to a multi-index entry, every index it covers is
 * erased and none of them remain part of a multi-index entry.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: The entry which used to be at this index.
 */
static inline void *xa_erase_irq(struct xarray *xa, unsigned long index)
{
        void *removed;

        xa_lock_irq(xa);
        removed = __xa_erase(xa, index);
        xa_unlock_irq(xa);
        return removed;
}

/**
 * xa_cmpxchg() - Replace an entry in the XArray only if it matches.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * @entry is stored at @index only when the value currently there equals
 * @old.  The exchange succeeded exactly when the returned value equals
 * @old.
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep
 * if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        void *found;

        might_alloc(gfp);

        xa_lock(xa);
        found = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock(xa);
        return found;
}

/**
 * xa_cmpxchg_bh() - Replace an entry only if it matches, softirqs disabled.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * Same as xa_cmpxchg(), but the array lock is taken with xa_lock_bh()
 * so softirqs are disabled while it is held.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        void *found;

        might_alloc(gfp);

        xa_lock_bh(xa);
        found = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock_bh(xa);
        return found;
}

/**
 * xa_cmpxchg_irq() - Replace an entry only if it matches, interrupts disabled.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * Same as xa_cmpxchg(), but the array lock is taken with xa_lock_irq()
 * so interrupts are disabled while it is held.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        void *found;

        might_alloc(gfp);

        xa_lock_irq(xa);
        found = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock_irq(xa);
        return found;
}

/**
 * xa_insert() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * If @entry is NULL and the index is empty, a reserved entry is stored
 * (as xa_reserve() would).  An index that already holds a reserved entry
 * counts as occupied, so the insert fails there even though a load from
 * that index returns NULL.
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock(xa);
        return ret;
}

/**
 * xa_insert_bh() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * If @entry is NULL and the index is empty, a reserved entry is stored
 * (as xa_reserve() would).  An index that already holds a reserved entry
 * counts as occupied, so the insert fails there even though a load from
 * that index returns NULL.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert_bh(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock_bh(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock_bh(xa);
        return ret;
}

/**
 * xa_insert_irq() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * If @entry is NULL and the index is empty, a reserved entry is stored
 * (as xa_reserve() would).  An index that already holds a reserved entry
 * counts as occupied, so the insert fails there even though a load from
 * that index returns NULL.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert_irq(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock_irq(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock_irq(xa);
        return ret;
}

/**
 * xa_alloc() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for a free index in @xa within [@limit.min, @limit.max], writes
 * that index through @id and then stores @entry there.  A concurrent
 * lookup will not see an uninitialised @id.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline __must_check int xa_alloc(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock(xa);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock(xa);
        return ret;
}

/**
 * xa_alloc_bh() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for a free index in @xa within [@limit.min, @limit.max], writes
 * that index through @id and then stores @entry there.  A concurrent
 * lookup will not see an uninitialised @id.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock_bh(xa);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock_bh(xa);
        return ret;
}

/**
 * xa_alloc_irq() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for a free index in @xa within [@limit.min, @limit.max], writes
 * that index through @id and then stores @entry there.  A concurrent
 * lookup will not see an uninitialised @id.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock_irq(xa);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock_irq(xa);
        return ret;
}

/**
 * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for a free index in @xa within [@limit.min, @limit.max], writes
 * that index through @id and then stores @entry there.  A concurrent
 * lookup will not see an uninitialised @id.  The search begins at @next
 * and wraps around if necessary.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock(xa);
        ret = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock(xa);
        return ret;
}

/**
 * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for a free index in @xa within [@limit.min, @limit.max], writes
 * that index through @id and then stores @entry there.  A concurrent
 * lookup will not see an uninitialised @id.  The search begins at @next
 * and wraps around if necessary.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock_bh(xa);
        ret = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock_bh(xa);
        return ret;
}

/**
 * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for a free index in @xa within [@limit.min, @limit.max], writes
 * that index through @id and then stores @entry there.  A concurrent
 * lookup will not see an uninitialised @id.  The search begins at @next
 * and wraps around if necessary.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock_irq(xa);
        ret = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock_irq(xa);
        return ret;
}

/**
 * xa_reserve() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * Makes sure there is room to store an entry at @index.  An index that
 * already holds something is left untouched; an empty index is marked
 * as reserved.  Loading from a reserved entry returns a %NULL pointer.
 *
 * If you do not use the entry that you have reserved, call xa_release()
 * or xa_erase() to free any unnecessary memory.
 *
 * Context: Any context.  Takes and releases the xa_lock.
 * May sleep if the @gfp flags permit.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        /* Only an empty (NULL) slot is turned into a reserved one. */
        void *curr = xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        return xa_err(curr);
}

/**
 * xa_reserve_bh() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * The softirq-disabling counterpart of xa_reserve().
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        /* Only an empty (NULL) slot is turned into a reserved one. */
        void *curr = xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        return xa_err(curr);
}

/**
 * xa_reserve_irq() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * The interrupt-disabling counterpart of xa_reserve().
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        /* Only an empty (NULL) slot is turned into a reserved one. */
        void *curr = xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        return xa_err(curr);
}

/**
 * xa_release() - Release a reserved entry.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * Undoes a reservation made with xa_reserve().  Nothing happens if the
 * entry at @index has been stored to in the meantime.
 */
static inline void xa_release(struct xarray *xa, unsigned long index)
{
        /*
         * Swap the reserved marker back to NULL; the previous value is
         * deliberately ignored.
         */
        (void)xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0);
}

/* Everything below here is the Advanced API.  Proceed with caution. */

/*
 * The xarray is constructed out of a set of 'chunks' of pointers.  Choosing
 * the best chunk size requires some tradeoffs.  A power of two recommends
 * itself so that we can walk the tree based purely on shifts and masks.
 * Generally, the larger the better; as the number of slots per level of the
 * tree increases, the less tall the tree needs to be.  But that needs to be
 * balanced against the memory consumption of each node.  On a 64-bit system,
 * xa_node is currently 576 bytes, and we get 7 of them per 4kB page.  If we
 * doubled the number of slots per node, we'd get only 3 nodes per 4kB page.
 */
/* XA_CHUNK_SHIFT may be predefined; otherwise 4 with CONFIG_BASE_SMALL, else 6. */
#ifndef XA_CHUNK_SHIFT
#define XA_CHUNK_SHIFT                (IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6)
#endif
/* Number of slots in one xa_node (size of its ->slots array). */
#define XA_CHUNK_SIZE                (1UL << XA_CHUNK_SHIFT)
/* Mask selecting the low XA_CHUNK_SHIFT bits. */
#define XA_CHUNK_MASK                (XA_CHUNK_SIZE - 1)
/* First dimension of the per-node marks array. */
#define XA_MAX_MARKS                3
/* Longs needed to hold XA_CHUNK_SIZE bits. */
#define XA_MARK_LONGS                BITS_TO_LONGS(XA_CHUNK_SIZE)

/*
 * One node of the XArray tree, holding XA_CHUNK_SIZE slots.
 *
 * @count is the count of every non-NULL element in the ->slots array
 * whether that is a value entry, a retry entry, a user pointer,
 * a sibling entry or a pointer to the next level of the tree.
 * @nr_values is the count of every element in ->slots which is
 * either a value entry or a sibling of a value entry.
 *
 * @private_list and @rcu_head share storage, as do @tags and @marks
 * (two names for the same XA_MAX_MARKS x XA_MARK_LONGS array;
 * NOTE(review): presumably one bit per slot per mark -- confirm).
 */
struct xa_node {
        unsigned char        shift;                /* Bits remaining in each slot */
        unsigned char        offset;                /* Slot offset in parent */
        unsigned char        count;                /* Total entry count */
        unsigned char        nr_values;        /* Value entry count */
        struct xa_node __rcu *parent;        /* NULL at top of tree */
        struct xarray        *array;                /* The array we belong to */
        union {
                struct list_head private_list;        /* For tree user */
                struct rcu_head        rcu_head;        /* Used when freeing node */
        };
        void __rcu        *slots[XA_CHUNK_SIZE];
        union {
                unsigned long        tags[XA_MAX_MARKS][XA_MARK_LONGS];
                unsigned long        marks[XA_MAX_MARKS][XA_MARK_LONGS];
        };
};

/* Dump helpers, implemented out of line; used by the debug macros below. */
void xa_dump(const struct xarray *);
void xa_dump_node(const struct xa_node *);

/*
 * Debug assertions.  With XA_DEBUG defined, a true condition @x dumps the
 * array (XA_BUG_ON) or the node, if non-NULL (XA_NODE_BUG_ON), and then
 * calls BUG().  Without XA_DEBUG both macros expand to an empty statement
 * and @x is not evaluated.
 */
#ifdef XA_DEBUG
#define XA_BUG_ON(xa, x) do {                                        \
                if (x) {                                        \
                        xa_dump(xa);                                \
                        BUG();                                        \
                }                                                \
        } while (0)
#define XA_NODE_BUG_ON(node, x) do {                                \
                if (x) {                                        \
                        if (node) xa_dump_node(node);                \
                        BUG();                                        \
                }                                                \
        } while (0)
#else
#define XA_BUG_ON(xa, x)        do { } while (0)
#define XA_NODE_BUG_ON(node, x)        do { } while (0)
#endif

/* Private: read the head pointer of @xa; legal with the xa_lock held. */
static inline void *xa_head(const struct xarray *xa)
{
        void *head;

        head = rcu_dereference_check(xa->xa_head,
                                     lockdep_is_held(&xa->xa_lock));
        return head;
}

/* Private: read the head pointer of @xa; the xa_lock is expected to be held. */
static inline void *xa_head_locked(const struct xarray *xa)
{
        void *head;

        head = rcu_dereference_protected(xa->xa_head,
                                         lockdep_is_held(&xa->xa_lock));
        return head;
}

/* Private: read slot @offset of @node; legal with the xa_lock held. */
static inline void *xa_entry(const struct xarray *xa,
                                const struct xa_node *node, unsigned int offset)
{
        void *slot;

        XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);

        slot = rcu_dereference_check(node->slots[offset],
                                     lockdep_is_held(&xa->xa_lock));
        return slot;
}

/* Private: read slot @offset of @node; the xa_lock is expected to be held. */
static inline void *xa_entry_locked(const struct xarray *xa,
                                const struct xa_node *node, unsigned int offset)
{
        void *slot;

        XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);

        slot = rcu_dereference_protected(node->slots[offset],
                                         lockdep_is_held(&xa->xa_lock));
        return slot;
}

/* Private: read the parent pointer of @node; legal with the xa_lock held. */
static inline struct xa_node *xa_parent(const struct xarray *xa,
                                        const struct xa_node *node)
{
        struct xa_node *up;

        up = rcu_dereference_check(node->parent,
                                   lockdep_is_held(&xa->xa_lock));
        return up;
}

/* Private: read the parent pointer of @node; the xa_lock is expected to be held. */
static inline struct xa_node *xa_parent_locked(const struct xarray *xa,
                                        const struct xa_node *node)
{
        struct xa_node *up;

        up = rcu_dereference_protected(node->parent,
                                       lockdep_is_held(&xa->xa_lock));
        return up;
}

/* Private: turn a node pointer into a slot entry by setting bit 1. */
static inline void *xa_mk_node(const struct xa_node *node)
{
        unsigned long bits = (unsigned long)node;

        bits |= 2;
        return (void *)bits;
}

/* Private: recover the node pointer from a node entry by subtracting 2. */
static inline struct xa_node *xa_to_node(const void *entry)
{
        unsigned long bits = (unsigned long)entry;

        return (struct xa_node *)(bits - 2);
}

/* Private: an internal entry whose value is above 4096 is a node entry. */
static inline bool xa_is_node(const void *entry)
{
        if (!xa_is_internal(entry))
                return false;

        return (unsigned long)entry > 4096;
}

/* Private: a sibling entry is the internal entry encoding @offset. */
static inline void *xa_mk_sibling(unsigned int offset)
{
        void *sibling = xa_mk_internal(offset);

        return sibling;
}

/* Private: decode the offset held in a sibling entry. */
static inline unsigned long xa_to_sibling(const void *entry)
{
        unsigned long offset = xa_to_internal(entry);

        return offset;
}

/**
 * xa_is_sibling() - Is the entry a sibling entry?
 * @entry: Entry retrieved from the XArray
 *
 * Always false when CONFIG_XARRAY_MULTI is not enabled.
 *
 * Return: %true if the entry is a sibling entry.
 */
static inline bool xa_is_sibling(const void *entry)
{
        if (!IS_ENABLED(CONFIG_XARRAY_MULTI))
                return false;
        if (!xa_is_internal(entry))
                return false;

        /* Sibling entries sit below the encoding of the last slot offset. */
        return entry < xa_mk_sibling(XA_CHUNK_SIZE - 1);
}

/* The retry entry is the internal entry with value 256. */
#define XA_RETRY_ENTRY                xa_mk_internal(256)

/**
 * xa_is_retry() - Is the entry a retry entry?
 * @entry: Entry retrieved from the XArray
 *
 * The comparison is hinted as unlikely to succeed.
 *
 * Return: %true if the entry is a retry entry.
 */
static inline bool xa_is_retry(const void *entry)
{
        bool is_retry = (entry == XA_RETRY_ENTRY);

        return unlikely(is_retry);
}

/**
 * xa_is_advanced() - Is the entry only permitted for the advanced API?
 * @entry: Entry to be stored in the XArray.
 *
 * True for internal entries that compare no greater than XA_RETRY_ENTRY.
 *
 * Return: %true if the entry cannot be stored by the normal API.
 */
static inline bool xa_is_advanced(const void *entry)
{
        if (!xa_is_internal(entry))
                return false;

        return entry <= XA_RETRY_ENTRY;
}

/**
 * typedef xa_update_node_t - A callback function from the XArray.
 * @node: The node which is being processed
 *
 * This function is called every time the XArray updates the count of
 * present and value entries in a node.  It allows advanced users to
 * maintain the private_list in the node.
 *
 * Context: The xa_lock is held and interrupts may be disabled.
 *            Implementations should not drop the xa_lock, nor re-enable
 *            interrupts.
 */
typedef void (*xa_update_node_t)(struct xa_node *node);

/*
 * Implemented out of line.  NOTE(review): presumably removes the given
 * node from its array, invoking the callback for affected nodes -- confirm
 * against the definition.
 */
void xa_delete_node(struct xa_node *, xa_update_node_t);

/*
 * The xa_state is opaque to its users.  It contains various different pieces
 * of state involved in the current operation on the XArray.  It should be
 * declared on the stack and passed between the various internal routines.
 * The various elements in it should not be accessed directly, but only
 * through the provided accessor functions.  The below documentation is for
 * the benefit of those working on the code, not for users of the XArray.
 *
 * @xa_node usually points to the xa_node containing the slot we're operating
 * on (and @xa_offset is the offset in the slots array).  If there is a
 * single entry in the array at index 0, there are no allocated xa_nodes to
 * point to, and so we store %NULL in @xa_node.  @xa_node is set to
 * the value %XAS_RESTART if the xa_state is not walked to the correct
 * position in the tree of nodes for this operation.  If an error occurs
 * during an operation, it is set to an %XAS_ERROR value.  If we run off the
 * end of the allocated nodes, it is set to %XAS_BOUNDS.
 *
 * @xa is the array being operated on and @xa_index the index of interest.
 * @xa_shift and @xa_sibs are both zero for a state declared with XA_STATE();
 * XA_STATE_ORDER() derives them from the requested order.
 * @xa_alloc, @xa_update and @xa_lru start out as %NULL.
 */
struct xa_state {
        struct xarray *xa;
        unsigned long xa_index;
        unsigned char xa_shift;
        unsigned char xa_sibs;
        unsigned char xa_offset;
        unsigned char xa_pad;                /* Helps gcc generate better code */
        struct xa_node *xa_node;
        struct xa_node *xa_alloc;
        xa_update_node_t xa_update;
        struct list_lru *xa_lru;
};

/*
 * We encode errnos in the xas->xa_node.  If an error has happened, we need to
 * drop the lock to fix it, and once we've done so the xa_state is invalid.
 *
 * XA_ERROR() shifts the errno left by two bits and sets bit 1, producing a
 * value that is never a valid node pointer.  The argument is parenthesised
 * so that a compound expression (for example a conditional) is encoded as a
 * whole rather than having the cast bind to its first operand only.
 *
 * XAS_BOUNDS (1) and XAS_RESTART (3) are the other non-pointer sentinels
 * stored in xas->xa_node.
 */
#define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)(errno) << 2) | 2UL))
#define XAS_BOUNDS        ((struct xa_node *)1UL)
#define XAS_RESTART        ((struct xa_node *)3UL)

/*
 * Initialiser for a struct xa_state.  The caller supplies the array, index,
 * shift and sibling count; the walk position starts as XAS_RESTART and all
 * remaining fields start as zero/NULL.
 */
#define __XA_STATE(array, index, shift, sibs)  {        \
        .xa = array,                                        \
        .xa_index = index,                                \
        .xa_shift = shift,                                \
        .xa_sibs = sibs,                                \
        .xa_offset = 0,                                        \
        .xa_pad = 0,                                        \
        .xa_node = XAS_RESTART,                                \
        .xa_alloc = NULL,                                \
        .xa_update = NULL,                                \
        .xa_lru = NULL,                                        \
}

/**
 * XA_STATE() - Declare an XArray operation state.
 * @name: Name of this operation state (usually xas).
 * @array: Array to operate on.
 * @index: Initial index of interest.
 *
 * Declare and initialise an xa_state on the stack.  The state is set up
 * with a shift of 0 and no siblings, i.e. for a single-index entry.
 */
#define XA_STATE(name, array, index)                                \
        struct xa_state name = __XA_STATE(array, index, 0, 0)

/**
 * XA_STATE_ORDER() - Declare an XArray operation state.
 * @name: Name of this operation state (usually xas).
 * @array: Array to operate on.
 * @index: Initial index of interest.
 * @order: Order of entry.
 *
 * Declare and initialise an xa_state on the stack.  This variant of
 * XA_STATE() allows you to specify the 'order' of the element you
 * want to operate on.
 *
 * The index is rounded down to a multiple of 2^@order.  The shift is @order
 * rounded down to a multiple of XA_CHUNK_SHIFT, and the sibling count covers
 * the remainder.  @index and @order are each expanded more than once, so
 * they must not have side effects.
 */
#define XA_STATE_ORDER(name, array, index, order)                \
        struct xa_state name = __XA_STATE(array,                \
                        ((index) >> (order)) << (order),        \
                        (order) - ((order) % XA_CHUNK_SHIFT),        \
                        (1U << ((order) % XA_CHUNK_SHIFT)) - 1)

/*
 * Convenience wrappers that apply the corresponding xa_*() helper to the
 * array referenced by an xa_state (@xas->xa).
 */
#define xas_marked(xas, mark)        xa_marked((xas)->xa, (mark))
#define xas_trylock(xas)        xa_trylock((xas)->xa)
#define xas_lock(xas)                xa_lock((xas)->xa)
#define xas_unlock(xas)                xa_unlock((xas)->xa)
#define xas_lock_bh(xas)        xa_lock_bh((xas)->xa)
#define xas_unlock_bh(xas)        xa_unlock_bh((xas)->xa)
#define xas_lock_irq(xas)        xa_lock_irq((xas)->xa)
#define xas_unlock_irq(xas)        xa_unlock_irq((xas)->xa)
#define xas_lock_irqsave(xas, flags) \
                                xa_lock_irqsave((xas)->xa, flags)
#define xas_unlock_irqrestore(xas, flags) \
                                xa_unlock_irqrestore((xas)->xa, flags)

/**
 * xas_error() - Return an errno stored in the xa_state.
 * @xas: XArray operation state.
 *
 * Decodes the node field of @xas with xa_err().
 *
 * Return: 0 if no error has been noted.  A negative errno if one has.
 */
static inline int xas_error(const struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        return xa_err(node);
}

/**
 * xas_set_err() - Note an error in the xa_state.
 * @xas: XArray operation state.
 * @err: Negative error number.
 *
 * @err must be negative; zero or positive values will probably not behave
 * the way you think they should.  To clear an error from an xa_state,
 * use xas_reset() instead.
 */
static inline void xas_set_err(struct xa_state *xas, long err)
{
        struct xa_node *encoded = XA_ERROR(err);

        xas->xa_node = encoded;
}

/**
 * xas_invalid() - Is the xas in a retry or error state?
 * @xas: XArray operation state.
 *
 * Tests the two low bits of the node field of @xas.
 *
 * Return: %true if the xas cannot be used for operations.
 */
static inline bool xas_invalid(const struct xa_state *xas)
{
        unsigned long bits = (unsigned long)xas->xa_node;

        return (bits & 3) != 0;
}

/**
 * xas_valid() - Is the xas a valid cursor into the array?
 * @xas: XArray operation state.
 *
 * The exact opposite of xas_invalid().
 *
 * Return: %true if the xas can be used for operations.
 */
static inline bool xas_valid(const struct xa_state *xas)
{
        if (xas_invalid(xas))
                return false;

        return true;
}

/**
 * xas_is_node() - Does the xas point to a node?
 * @xas: XArray operation state.
 *
 * True when @xas is valid and its node field is not %NULL.
 *
 * Return: %true if the xas currently references a node.
 */
static inline bool xas_is_node(const struct xa_state *xas)
{
        if (!xas_valid(xas))
                return false;

        return xas->xa_node != NULL;
}

/* True if the pointer is NULL or has either of its two low bits set */
static inline bool xas_not_node(struct xa_node *node)
{
        if (!node)
                return true;

        return ((unsigned long)node & 3) != 0;
}

/* True if bit 1 of the pointer is set, i.e. RESTART or an error */
static inline bool xas_frozen(struct xa_node *node)
{
        unsigned long bits = (unsigned long)node;

        return (bits & 2) != 0;
}

/* True for head-of-tree, RESTART or BOUNDS: anything not above XAS_RESTART */
static inline bool xas_top(struct xa_node *node)
{
        if (node > XAS_RESTART)
                return false;

        return true;
}

/**
 * xas_reset() - Reset an XArray operation state.
 * @xas: XArray operation state.
 *
 * Discards any recorded error or walk position in @xas, so that the next
 * walk of the array begins at the root.  Call this before reusing an
 * xa_state after the xarray lock has been dropped.
 *
 * Context: Any context.
 */
static inline void xas_reset(struct xa_state *xas)
{
        struct xa_node *restart = XAS_RESTART;

        xas->xa_node = restart;
}

/**
 * xas_retry() - Retry the operation if appropriate.
 * @xas: XArray operation state.
 * @entry: Entry from xarray.
 *
 * The advanced functions may hand back an internal entry such as a retry
 * entry or a zero entry.  A zero entry is reported as needing a retry
 * without touching @xas; a retry entry additionally resets @xas so the
 * walk restarts from the head of the array.
 *
 * Context: Any context.
 * Return: true if the operation needs to be retried.
 */
static inline bool xas_retry(struct xa_state *xas, const void *entry)
{
        if (xa_is_zero(entry))
                return true;

        if (xa_is_retry(entry)) {
                xas_reset(xas);
                return true;
        }

        return false;
}

void *xas_load(struct xa_state *);
void *xas_store(struct xa_state *, void *entry);
void *xas_find(struct xa_state *, unsigned long max);
void *xas_find_conflict(struct xa_state *);

bool xas_get_mark(const struct xa_state *, xa_mark_t);
void xas_set_mark(const struct xa_state *, xa_mark_t);
void xas_clear_mark(const struct xa_state *, xa_mark_t);
void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t);
void xas_init_marks(const struct xa_state *);

bool xas_nomem(struct xa_state *, gfp_t);
void xas_destroy(struct xa_state *);
void xas_pause(struct xa_state *);

void xas_create_range(struct xa_state *);

/*
 * Multi-index (order > 0) helpers.  With CONFIG_XARRAY_MULTI they are
 * out-of-line functions declared here; without it they collapse to the
 * inline stubs in the #else branch.
 */
#ifdef CONFIG_XARRAY_MULTI
int xa_get_order(struct xarray *, unsigned long index);
int xas_get_order(struct xa_state *xas);
void xas_split(struct xa_state *, void *entry, unsigned int order);
void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t);
#else
/* Stub: always reports order 0; neither argument is examined. */
static inline int xa_get_order(struct xarray *xa, unsigned long index)
{
        return 0;
}

/* Stub: always reports order 0; @xas is not examined. */
static inline int xas_get_order(struct xa_state *xas)
{
        return 0;
}

/* Stub: stores @entry through xas_store(); @order is ignored. */
static inline void xas_split(struct xa_state *xas, void *entry,
                unsigned int order)
{
        xas_store(xas, entry);
}

/* Stub: does nothing; all arguments are ignored. */
static inline void xas_split_alloc(struct xa_state *xas, void *entry,
                unsigned int order, gfp_t gfp)
{
}
#endif

/**
 * xas_reload() - Refetch an entry from the xarray.
 * @xas: XArray operation state.
 *
 * Use this function to check that a previously loaded entry still has
 * the same value.  This is useful for the lockless pagecache lookup where
 * we walk the array with only the RCU lock to protect us, lock the page,
 * then check that the page hasn't moved since we looked it up.
 *
 * If @xas has no node, the head of the array is returned.  With
 * CONFIG_XARRAY_MULTI the slot is recomputed from the index and the node's
 * shift, and a sibling entry is followed to the slot it names; otherwise
 * the offset cached in @xas is used directly.
 *
 * The caller guarantees that @xas is still valid.  If it may be in an
 * error or restart state, call xas_load() instead.
 *
 * Return: The entry at this location in the xarray.
 */
static inline void *xas_reload(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;
        void *entry;
        char slot;

        if (!node)
                return xa_head(xas->xa);

        if (!IS_ENABLED(CONFIG_XARRAY_MULTI)) {
                slot = xas->xa_offset;
                return xa_entry(xas->xa, node, slot);
        }

        slot = (xas->xa_index >> node->shift) & XA_CHUNK_MASK;
        entry = xa_entry(xas->xa, node, slot);
        if (xa_is_sibling(entry)) {
                /* A sibling entry names the slot that holds the real entry. */
                slot = xa_to_sibling(entry);
                entry = xa_entry(xas->xa, node, slot);
        }
        return entry;
}

/**
 * xas_set() - Set up XArray operation state for a different index.
 * @xas: XArray operation state.
 * @index: New index into the XArray.
 *
 * Point the operation state at @index and mark it for restart, so the
 * next operation walks from the top of the array.  See xas_next() to
 * move to an adjacent index instead.
 */
static inline void xas_set(struct xa_state *xas, unsigned long index)
{
        xas->xa_index = index;
        xas_reset(xas);
}

/**
 * xas_advance() - Skip over sibling entries.
 * @xas: XArray operation state.
 * @index: Index of last sibling entry.
 *
 * Move the operation state to refer to the last sibling entry.
 * This is useful for loops that normally want to see sibling
 * entries but sometimes want to skip them.  Use xas_set() if you
 * want to move to an index which is not part of this entry.
 *
 * The offset is recomputed from @index using the current node's shift,
 * or a shift of 0 when @xas does not reference a node.
 */
static inline void xas_advance(struct xa_state *xas, unsigned long index)
{
        unsigned char shift = 0;

        if (xas_is_node(xas))
                shift = xas->xa_node->shift;

        xas->xa_index = index;
        xas->xa_offset = (index >> shift) & XA_CHUNK_MASK;
}

/**
 * xas_set_order() - Set up XArray operation state for a multislot entry.
 * @xas: XArray operation state.
 * @index: Target of the operation.
 * @order: Entry occupies 2^@order indices.
 *
 * With CONFIG_XARRAY_MULTI, @index is rounded down to a multiple of
 * 2^@order (or forced to 0 when @order is at least BITS_PER_LONG), the
 * shift and sibling count are derived from @order, and the state is
 * marked for restart.  Without it, only @order 0 is accepted and this
 * behaves like xas_set().
 */
static inline void xas_set_order(struct xa_state *xas, unsigned long index,
                                        unsigned int order)
{
#ifdef CONFIG_XARRAY_MULTI
        /* Part of @order that does not fill a whole XA_CHUNK_SHIFT step. */
        unsigned int partial = order % XA_CHUNK_SHIFT;

        if (order < BITS_PER_LONG)
                xas->xa_index = (index >> order) << order;
        else
                xas->xa_index = 0;
        xas->xa_shift = order - partial;
        xas->xa_sibs = (1 << partial) - 1;
        xas_reset(xas);
#else
        BUG_ON(order > 0);
        xas_set(xas, index);
#endif
}

/**
 * xas_set_update() - Set up XArray operation state for a callback.
 * @xas: XArray operation state.
 * @update: Function to call when updating a node.
 *
 * The XArray can notify a caller after it has updated an xa_node.
 * This is advanced functionality and is only needed by the page
 * cache and swap cache.
 *
 * This helper only records @update in @xas; it does not invoke it.
 */
static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update)
{
        xas->xa_update = update;
}

/**
 * xas_set_lru() - Record a list_lru in the XArray operation state.
 * @xas: XArray operation state.
 * @lru: Pointer to store in the xa_lru field of @xas.
 *
 * This helper only stores the pointer; it does not touch the list itself.
 */
static inline void xas_set_lru(struct xa_state *xas, struct list_lru *lru)
{
        xas->xa_lru = lru;
}

/**
 * xas_next_entry() - Advance iterator to next present entry.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 *
 * xas_next_entry() is an inline function to optimise xarray traversal for
 * speed.  It is equivalent to calling xas_find(), and will call xas_find()
 * for all the hard cases: when @xas is not positioned on a node with shift
 * 0, when the cached offset disagrees with the index, when @max or the end
 * of the node is reached, or when an internal entry is encountered.
 *
 * Return: The next present entry after the one currently referred to by @xas.
 */
static inline void *xas_next_entry(struct xa_state *xas, unsigned long max)
{
        struct xa_node *node = xas->xa_node;
        void *entry;

        if (unlikely(xas_not_node(node) || node->shift ||
                        xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)))
                return xas_find(xas, max);

        for (;;) {
                if (unlikely(xas->xa_index >= max))
                        break;
                if (unlikely(xas->xa_offset == XA_CHUNK_MASK))
                        break;

                /* Peek at the next slot before committing to the move. */
                entry = xa_entry(xas->xa, node, xas->xa_offset + 1);
                if (unlikely(xa_is_internal(entry)))
                        break;

                xas->xa_offset++;
                xas->xa_index++;
                if (entry)
                        return entry;
        }

        /* Anything the fast path cannot handle goes to the full search. */
        return xas_find(xas, max);
}

/*
 * Private.
 *
 * Find the first slot in the current node whose @mark bit is set, starting
 * at the xas offset (or one past it when @advance is true).  Returns that
 * slot's offset, or XA_CHUNK_SIZE if no such bit is set.
 */
static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance,
                xa_mark_t mark)
{
        unsigned long *marks = xas->xa_node->marks[(__force unsigned)mark];
        unsigned int start = xas->xa_offset + (advance ? 1 : 0);
        unsigned long pending;

        if (XA_CHUNK_SIZE != BITS_PER_LONG)
                return find_next_bit(marks, XA_CHUNK_SIZE, start);

        /* The whole bitmap is one word: mask off bits below @start. */
        if (start >= XA_CHUNK_SIZE)
                return XA_CHUNK_SIZE;

        pending = *marks & (~0UL << start);
        return pending ? __ffs(pending) : XA_CHUNK_SIZE;
}

/**
 * xas_next_marked() - Advance iterator to next marked entry.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 * @mark: Mark to search for.
 *
 * xas_next_marked() is an inline function to optimise xarray traversal for
 * speed.  It is equivalent to calling xas_find_marked(), and will call
 * xas_find_marked() for all the hard cases: when @xas is not positioned on
 * a node with shift 0, when no further marked slot exists in the current
 * node, or when the marked slot holds no entry.
 *
 * Return: The next marked entry after the one currently referred to by @xas.
 */
static inline void *xas_next_marked(struct xa_state *xas, unsigned long max,
                                                                xa_mark_t mark)
{
        struct xa_node *node = xas->xa_node;
        unsigned int offset;
        void *entry;

        if (unlikely(xas_not_node(node) || node->shift))
                return xas_find_marked(xas, max, mark);

        /* Move the cursor to the next marked slot (or one past the node). */
        offset = xas_find_chunk(xas, true, mark);
        xas->xa_offset = offset;
        xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset;

        if (xas->xa_index > max)
                return NULL;
        if (offset == XA_CHUNK_SIZE)
                return xas_find_marked(xas, max, mark);

        entry = xa_entry(xas->xa, node, offset);
        return entry ? entry : xas_find_marked(xas, max, mark);
}

/*
 * If iterating while holding a lock, drop the lock and reschedule
 * every %XA_CHECK_SCHED loops.
 *
 * Declared as an anonymous enum constant rather than a macro.
 */
enum {
        XA_CHECK_SCHED = 4096,
};

/**
 * xas_for_each() - Iterate over a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 * @max: Maximum index to retrieve from array.
 *
 * The loop body will be executed for each entry present in the xarray
 * between the current xas position and @max.  @entry will be set to
 * the entry retrieved from the xarray.  It is safe to delete entries
 * from the array in the loop body.  You should hold either the RCU lock
 * or the xa_lock while iterating.  If you need to drop the lock, call
 * xas_pause() first.
 *
 * The first entry is looked up with xas_find() and each later one with
 * xas_next_entry(); the loop ends when either returns %NULL.  The macro
 * arguments are expanded more than once, so they should be free of side
 * effects.
 */
#define xas_for_each(xas, entry, max) \
        for (entry = xas_find(xas, max); entry; \
             entry = xas_next_entry(xas, max))

/**
 * xas_for_each_marked() - Iterate over marked entries in a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 * @max: Maximum index to retrieve from array.
 * @mark: Mark to search for.
 *
 * The loop body will be executed for each marked entry in the xarray
 * between the current xas position and @max.  @entry will be set to
 * the entry retrieved from the xarray.  It is safe to delete entries
 * from the array in the loop body.  You should hold either the RCU lock
 * or the xa_lock while iterating.  If you need to drop the lock, call
 * xas_pause() first.
 *
 * The first entry is looked up with xas_find_marked() and each later one
 * with xas_next_marked(); the loop ends when either returns %NULL.  The
 * macro arguments are expanded more than once, so they should be free of
 * side effects.
 */
#define xas_for_each_marked(xas, entry, max, mark) \
        for (entry = xas_find_marked(xas, max, mark); entry; \
             entry = xas_next_marked(xas, max, mark))

/**
 * xas_for_each_conflict() - Iterate over a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 *
 * The loop body will be executed for each entry in the XArray that
 * lies within the range specified by @xas.  If the loop terminates
 * normally, @entry will be %NULL.  The user may break out of the loop,
 * which will leave @entry set to the conflicting entry.  The caller
 * may also call xas_set_err() to exit the loop while setting an error
 * to record the reason.
 *
 * Each iteration assigns the result of xas_find_conflict() to @entry and
 * continues while that result is non-NULL.
 */
#define xas_for_each_conflict(xas, entry) \
        while ((entry = xas_find_conflict(xas)))

void *__xas_next(struct xa_state *);
void *__xas_prev(struct xa_state *);

/**
 * xas_prev() - Move iterator to previous index.
 * @xas: XArray operation state.
 *
 * If the @xas was in an error state, it will remain in an error state
 * and this function will return %NULL.  If the @xas has never been walked,
 * it will have the effect of calling xas_load().  Otherwise one will be
 * subtracted from the index and the state will be walked to the correct
 * location in the array for the next operation.
 *
 * The inline fast path only handles stepping back within a node whose
 * shift is 0; every other case is delegated to __xas_prev().
 *
 * If the iterator was referencing index 0, this function wraps
 * around to %ULONG_MAX.
 *
 * Return: The entry at the new index.  This may be %NULL or an internal
 * entry.
 */
static inline void *xas_prev(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        if (unlikely(xas_not_node(node)))
                return __xas_prev(xas);
        /* Not at the bottom level, or already at the node's first slot. */
        if (unlikely(node->shift || xas->xa_offset == 0))
                return __xas_prev(xas);

        xas->xa_index--;
        return xa_entry(xas->xa, node, --xas->xa_offset);
}

/**
 * xas_next() - Move state to next index.
 * @xas: XArray operation state.
 *
 * If the @xas was in an error state, it will remain in an error state
 * and this function will return %NULL.  If the @xas has never been walked,
 * it will have the effect of calling xas_load().  Otherwise one will be
 * added to the index and the state will be walked to the correct
 * location in the array for the next operation.
 *
 * The inline fast path only handles stepping forward within a node whose
 * shift is 0; every other case is delegated to __xas_next().
 *
 * If the iterator was referencing index %ULONG_MAX, this function wraps
 * around to 0.
 *
 * Return: The entry at the new index.  This may be %NULL or an internal
 * entry.
 */
static inline void *xas_next(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        if (unlikely(xas_not_node(node)))
                return __xas_next(xas);
        /* Not at the bottom level, or already at the node's last slot. */
        if (unlikely(node->shift || xas->xa_offset == XA_CHUNK_MASK))
                return __xas_next(xas);

        xas->xa_index++;
        return xa_entry(xas->xa, node, ++xas->xa_offset);
}

#endif /* _LINUX_XARRAY_H */





































































    2 












    3 





    4 




















    2 





    2 



















    1 





















    2 

































    1 





















    4 




    3 






    1 











    2 







    1 









    2 
























    3 










































    3 










    3 
    3 
    2 


























































    2 

    3 












    2 






    1 



    1 






























































    1 


    1 

























































    3 





    3 
    2 

    2 



































































































































































































    1 







    2 






    1 
    2 








    1 

    1 








































    1 
    1 






    1 
    2 



























    3 






















    2 


    3 






    2 

    2 




    3 






































    2 































































    2 
    2 
    2 

    2 
    3 


























































    2 




















    1 



    1 


    1 







    1 

    1 






















    3 
































    1 




    1 












    1 



    1 
    1 














    1 
    1 

    1 









    1 





    1 




    1 

    1 


    1 



    1 



    1 


    1 


    1 








































































































































































































































    4 
    3 







    4 






    3 





    2 









    2 









    1 
    2 
    2 



    3 
    2 
    1 

    1 


    1 






    1 






    2 



    3 

































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Copyright (C) 2005 SGI, Christoph Lameter
 * Copyright (C) 2006 Nick Piggin
 * Copyright (C) 2012 Konstantin Khlebnikov
 * Copyright (C) 2016 Intel, Matthew Wilcox
 * Copyright (C) 2016 Intel, Ross Zwisler
 */

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kmemleak.h>
#include <linux/percpu.h>
#include <linux/preempt.h>                /* in_interrupt() */
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/xarray.h>

#include "radix-tree.h"

/*
 * Radix tree node cache.
 *
 * The slab cache that the allocation and free paths in this file pass to
 * kmem_cache_alloc() and kmem_cache_free() for radix_tree_node objects.
 */
struct kmem_cache *radix_tree_node_cachep;

/*
 * The radix tree is variable-height, so an insert operation not only has
 * to build the branch to its corresponding item, it also has to build the
 * branch to existing items if the size has to be increased (by
 * radix_tree_extend).
 *
 * The worst case is a zero height tree with just a single item at index 0,
 * and then inserting an item at index ULONG_MAX. This requires 2 new branches
 * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared.
 * Hence two full paths minus the one shared node:
 */
#define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1)

/*
 * The IDR does not have to be as high as the radix tree since it uses
 * signed integers, not unsigned longs.
 *
 * IDR_INDEX_BITS   - bits in an int, minus one.
 * IDR_MAX_PATH     - IDR_INDEX_BITS divided by RADIX_TREE_MAP_SHIFT,
 *                    rounded up.
 * IDR_PRELOAD_SIZE - same "two paths minus one" formula as
 *                    RADIX_TREE_PRELOAD_SIZE, using IDR_MAX_PATH.
 */
#define IDR_INDEX_BITS                (8 /* CHAR_BIT */ * sizeof(int) - 1)
#define IDR_MAX_PATH                (DIV_ROUND_UP(IDR_INDEX_BITS, \
                                                RADIX_TREE_MAP_SHIFT))
#define IDR_PRELOAD_SIZE        (IDR_MAX_PATH * 2 - 1)

/*
 * Per-cpu pool of preloaded nodes
 *
 * __radix_tree_preload() pushes nodes onto this pool while holding the
 * embedded local lock, and radix_tree_node_alloc() pops from it when a
 * non-blocking allocation outside interrupt context fails.
 */
DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = {
        .lock = INIT_LOCAL_LOCK(lock),
};
EXPORT_PER_CPU_SYMBOL_GPL(radix_tree_preloads);

/* Strip the RADIX_TREE_INTERNAL_NODE bit from @ptr to recover the node. */
static inline struct radix_tree_node *entry_to_node(void *ptr)
{
        unsigned long addr = (unsigned long)ptr;

        addr &= ~RADIX_TREE_INTERNAL_NODE;
        return (void *)addr;
}

/* Set the RADIX_TREE_INTERNAL_NODE bit on @ptr to form a slot entry. */
static inline void *node_to_entry(void *ptr)
{
        unsigned long addr = (unsigned long)ptr;

        addr |= RADIX_TREE_INTERNAL_NODE;
        return (void *)addr;
}

/* Radix-tree name for the XArray retry entry. */
#define RADIX_TREE_RETRY        XA_RETRY_ENTRY

/*
 * Position of @slot within @parent's slot array, or 0 when there is no
 * parent.
 */
static inline unsigned long
get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot)
{
        if (!parent)
                return 0;

        return slot - parent->slots;
}

static unsigned int radix_tree_descend(const struct radix_tree_node *parent,
                        struct radix_tree_node **nodep, unsigned long index)
{
        unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
        void __rcu **entry = rcu_dereference_raw(parent->slots[offset]);

        *nodep = (void *)entry;
        return offset;
}

/*
 * Allocation flags stored in the root: xa_flags restricted to
 * __GFP_BITS_MASK with the GFP_ZONEMASK bits removed.
 */
static inline gfp_t root_gfp_mask(const struct radix_tree_root *root)
{
        gfp_t allowed = __GFP_BITS_MASK & ~GFP_ZONEMASK;

        return root->xa_flags & allowed;
}

/* Set bit @offset in @node's bitmap for @tag (non-atomic). */
static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        unsigned long *bitmap = node->tags[tag];

        __set_bit(offset, bitmap);
}

/* Clear bit @offset in @node's bitmap for @tag (non-atomic). */
static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        unsigned long *bitmap = node->tags[tag];

        __clear_bit(offset, bitmap);
}

/* Test bit @offset in @node's bitmap for @tag. */
static inline int tag_get(const struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        const unsigned long *bitmap = node->tags[tag];

        return test_bit(offset, bitmap);
}

static inline void root_tag_set(struct radix_tree_root *root, unsigned tag)
{
        root->xa_flags |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT));
}

static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag)
{
        root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT));
}

static inline void root_tag_clear_all(struct radix_tree_root *root)
{
        root->xa_flags &= (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1);
}

static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag)
{
        return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT));
}

static inline unsigned root_tags_get(const struct radix_tree_root *root)
{
        return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT;
}

/* True if the root has ROOT_IS_IDR set in its xa_flags. */
static inline bool is_idr(const struct radix_tree_root *root)
{
        if (root->xa_flags & ROOT_IS_IDR)
                return 1;

        return 0;
}

/*
 * Scan the RADIX_TREE_TAG_LONGS words of @node's bitmap for @tag.
 * Returns 1 as soon as a non-zero word is found, i.e. if any slot in the
 * node has this tag set.  Otherwise returns 0.
 */
static inline int any_tag_set(const struct radix_tree_node *node,
                                                        unsigned int tag)
{
        const unsigned long *words = node->tags[tag];
        unsigned i = 0;

        while (i < RADIX_TREE_TAG_LONGS) {
                if (words[i])
                        return 1;
                i++;
        }
        return 0;
}

/* Set @tag on all RADIX_TREE_MAP_SIZE slots of @node. */
static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag)
{
        unsigned long *bitmap = node->tags[tag];

        bitmap_fill(bitmap, RADIX_TREE_MAP_SIZE);
}

/**
 * radix_tree_find_next_bit - find the next set bit in a node's tag bitmap
 *
 * @node: node whose tag bitmap is searched
 * @tag: the tag index
 * @offset: the bitnumber to start searching at
 *
 * Unrollable variant of find_next_bit() for constant size arrays.
 * Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero.
 * Returns next bit offset, or size (RADIX_TREE_MAP_SIZE) if nothing found.
 */
static __always_inline unsigned long
radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag,
                         unsigned long offset)
{
        const unsigned long *word;
        unsigned long bits;

        if (offset >= RADIX_TREE_MAP_SIZE)
                return RADIX_TREE_MAP_SIZE;

        /* First word: discard the bits below the starting offset. */
        word = &node->tags[tag][offset / BITS_PER_LONG];
        bits = *word >> (offset % BITS_PER_LONG);
        if (bits)
                return offset + __ffs(bits);

        /* Remaining words: continue from the next word boundary. */
        offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
        for (; offset < RADIX_TREE_MAP_SIZE; offset += BITS_PER_LONG) {
                bits = *++word;
                if (bits)
                        return offset + __ffs(bits);
        }

        return RADIX_TREE_MAP_SIZE;
}

/* The iterator's index reduced to an offset within a single node. */
static unsigned int iter_offset(const struct radix_tree_iter *iter)
{
        return RADIX_TREE_MAP_MASK & iter->index;
}

/*
 * The maximum index which can be stored in a radix tree whose top node
 * has the given @shift: RADIX_TREE_MAP_SIZE slots, each spanning
 * 2^@shift indices, minus one.
 */
static inline unsigned long shift_maxindex(unsigned int shift)
{
        unsigned long span = RADIX_TREE_MAP_SIZE << shift;

        return span - 1;
}

/* shift_maxindex() applied to @node's own shift. */
static inline unsigned long node_maxindex(const struct radix_tree_node *node)
{
        unsigned int shift = node->shift;

        return shift_maxindex(shift);
}

/*
 * Index of slot @offset in @node: @index with the bits covered by @node
 * cleared, plus @offset scaled by the node's shift.
 */
static unsigned long next_index(unsigned long index,
                                const struct radix_tree_node *node,
                                unsigned long offset)
{
        unsigned long base = index & ~node_maxindex(node);
        unsigned long step = offset << node->shift;

        return base + step;
}

/*
 * Allocate a radix_tree_node and fill in its bookkeeping fields.
 *
 * This assumes that the caller has performed appropriate preallocation, and
 * that the caller has pinned this thread of control to the current CPU.
 *
 * @gfp_mask selects the allocation strategy (see below).  @parent and @root
 * are stored in the node's ->parent and ->array; @shift, @offset, @count and
 * @nr_values are stored in the fields of the same name.
 *
 * Returns the initialised node, or NULL if none could be obtained.
 */
static struct radix_tree_node *
radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
                        struct radix_tree_root *root,
                        unsigned int shift, unsigned int offset,
                        unsigned int count, unsigned int nr_values)
{
        struct radix_tree_preload *rtp;
        struct radix_tree_node *ret;

        if (gfpflags_allow_blocking(gfp_mask) || in_interrupt()) {
                /*
                 * Preload code isn't irq safe and it doesn't make sense to
                 * use preloading during an interrupt anyway as all the
                 * allocations have to be atomic. So just do normal
                 * allocation when in interrupt.  Callers that may block
                 * take the same direct path.
                 */
                ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
        } else {
                /*
                 * Even if the caller has preloaded, try to allocate from the
                 * cache first for the new node to get accounted to the memory
                 * cgroup.
                 */
                ret = kmem_cache_alloc(radix_tree_node_cachep,
                                       gfp_mask | __GFP_NOWARN);
                if (!ret) {
                        /*
                         * Fall back to the per-cpu pool.  Provided the
                         * caller has preloaded, a node is always available
                         * here.
                         */
                        rtp = this_cpu_ptr(&radix_tree_preloads);
                        if (rtp->nr) {
                                ret = rtp->nodes;
                                rtp->nodes = ret->parent;
                                rtp->nr--;
                        }
                        /*
                         * Update the allocation stack trace as this is more
                         * useful for debugging.
                         */
                        kmemleak_update_trace(ret);
                }
        }

        BUG_ON(radix_tree_is_internal_node(ret));
        if (!ret)
                return NULL;

        ret->shift = shift;
        ret->offset = offset;
        ret->count = count;
        ret->nr_values = nr_values;
        ret->parent = parent;
        ret->array = root;
        return ret;
}

/*
 * RCU callback: scrub the node that embeds @head and hand it back to the
 * radix tree node slab cache.
 */
void radix_tree_node_rcu_free(struct rcu_head *head)
{
        struct radix_tree_node *node;

        node = container_of(head, struct radix_tree_node, rcu_head);

        /*
         * Must only free zeroed nodes into the slab.  We can be left with
         * non-NULL entries by radix_tree_free_nodes, so clear the entries
         * and tags here.
         */
        INIT_LIST_HEAD(&node->private_list);
        memset(node->tags, 0, sizeof(node->tags));
        memset(node->slots, 0, sizeof(node->slots));

        kmem_cache_free(radix_tree_node_cachep, node);
}

/*
 * Queue @node for freeing via call_rcu(); the actual release is done by
 * radix_tree_node_rcu_free().
 */
static inline void
radix_tree_node_free(struct radix_tree_node *node)
{
        struct rcu_head *head = &node->rcu_head;

        call_rcu(head, radix_tree_node_rcu_free);
}

/*
 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 * ensure that the addition of a single element in the tree cannot fail.
 *
 * On success, return zero with radix_tree_preloads.lock still held (taken
 * with local_lock()).  On error, return -ENOMEM with that lock released.
 *
 * The lock is dropped around every allocation, so the per-cpu pool is
 * looked up again and its fill level re-checked after the lock has been
 * retaken; a node that turns out to be surplus is freed again.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 */
static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
{
        struct radix_tree_preload *rtp;
        struct radix_tree_node *node;

        /*
         * Nodes preloaded by one cgroup can be used by another cgroup, so
         * they should never be accounted to any particular memory cgroup.
         */
        gfp_mask &= ~__GFP_ACCOUNT;

        local_lock(&radix_tree_preloads.lock);
        rtp = this_cpu_ptr(&radix_tree_preloads);
        while (rtp->nr < nr) {
                local_unlock(&radix_tree_preloads.lock);
                node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
                if (!node)
                        return -ENOMEM;

                local_lock(&radix_tree_preloads.lock);
                rtp = this_cpu_ptr(&radix_tree_preloads);
                if (rtp->nr >= nr) {
                        /* The pool filled up while the lock was dropped. */
                        kmem_cache_free(radix_tree_node_cachep, node);
                        continue;
                }

                /* Push onto the pool, chaining nodes through ->parent. */
                node->parent = rtp->nodes;
                rtp->nodes = node;
                rtp->nr++;
        }

        return 0;
}

/*
 * Load up this CPU's radix_tree_node buffer with RADIX_TREE_PRELOAD_SIZE
 * objects, to ensure that the addition of a single element in the tree
 * cannot fail.  On success, return zero with radix_tree_preloads.lock held.
 * On error, return -ENOMEM with the lock not held.
 *
 * A one-time warning is issued if @gfp_mask does not allow blocking.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 */
int radix_tree_preload(gfp_t gfp_mask)
{
        const bool may_block = gfpflags_allow_blocking(gfp_mask);

        /* Warn on non-sensical use... */
        WARN_ON_ONCE(!may_block);

        return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
}
EXPORT_SYMBOL(radix_tree_preload);

/*
 * Like radix_tree_preload(), except that preloading is only attempted when
 * @gfp_mask allows blocking.  Otherwise the preload is skipped and only
 * radix_tree_preloads.lock is taken.
 *
 * On success, return zero with radix_tree_preloads.lock held.  On error,
 * return -ENOMEM with the lock not held.
 */
int radix_tree_maybe_preload(gfp_t gfp_mask)
{
        if (!gfpflags_allow_blocking(gfp_mask)) {
                /* Preloading doesn't help anything with this gfp mask, skip it */
                local_lock(&radix_tree_preloads.lock);
                return 0;
        }

        return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
}
EXPORT_SYMBOL(radix_tree_maybe_preload);

/*
 * Read the head of @root and describe the tree it leads to.
 *
 * *@nodep receives the head entry exactly as stored in root->xa_head.
 * If that entry is an internal node, *@maxindex is set to the node's
 * node_maxindex() and the node's shift plus RADIX_TREE_MAP_SHIFT is
 * returned.  Otherwise *@maxindex is set to 0 and 0 is returned.
 */
static unsigned radix_tree_load_root(const struct radix_tree_root *root,
                struct radix_tree_node **nodep, unsigned long *maxindex)
{
        struct radix_tree_node *head = rcu_dereference_raw(root->xa_head);

        *nodep = head;

        if (unlikely(!radix_tree_is_internal_node(head))) {
                *maxindex = 0;
                return 0;
        }

        head = entry_to_node(head);
        *maxindex = node_maxindex(head);
        return head->shift + RADIX_TREE_MAP_SHIFT;
}

/*
 * Extend a radix tree so it can store key @index.
 *
 * @root:  radix tree root
 * @gfp:   allocation flags handed to radix_tree_node_alloc()
 * @index: key the tree must be able to hold
 * @shift: starting shift; the first new node is allocated with this shift
 *
 * New nodes are stacked on top of the current head one level at a time,
 * each one taking the previous head as its slot 0.
 *
 * Returns the shift for the extended tree (maxshift + RADIX_TREE_MAP_SHIFT),
 * or -ENOMEM if a node allocation fails.
 */
static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
                                unsigned long index, unsigned int shift)
{
        void *entry;
        unsigned int maxshift;
        int tag;

        /* Figure out what the shift should be.  */
        maxshift = shift;
        while (index > shift_maxindex(maxshift))
                maxshift += RADIX_TREE_MAP_SHIFT;

        /*
         * Empty head and either not an IDR, or an IDR whose root is tagged
         * IDR_FREE: there is nothing to push down, so allocate nothing and
         * just report the required shift.
         */
        entry = rcu_dereference_raw(root->xa_head);
        if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
                goto out;

        do {
                struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL,
                                                        root, shift, 0, 1, 0);
                if (!node)
                        return -ENOMEM;

                if (is_idr(root)) {
                        /*
                         * Start with every slot tagged free.  If the root
                         * was not tagged free, untag slot 0 (where the old
                         * head goes) and tag the root instead.
                         */
                        all_tag_set(node, IDR_FREE);
                        if (!root_tag_get(root, IDR_FREE)) {
                                tag_clear(node, IDR_FREE, 0);
                                root_tag_set(root, IDR_FREE);
                        }
                } else {
                        /* Propagate the aggregated tag info to the new child */
                        for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
                                if (root_tag_get(root, tag))
                                        tag_set(node, tag, 0);
                        }
                }

                BUG_ON(shift > BITS_PER_LONG);
                if (radix_tree_is_internal_node(entry)) {
                        /* The old head node becomes a child of the new node */
                        entry_to_node(entry)->parent = node;
                } else if (xa_is_value(entry)) {
                        /* Moving a value entry root->xa_head to a node */
                        node->nr_values = 1;
                }
                /*
                 * entry was already in the radix tree, so we do not need
                 * rcu_assign_pointer here
                 */
                node->slots[0] = (void __rcu *)entry;
                /* Publish the new node as the head, then go up one level */
                entry = node_to_entry(node);
                rcu_assign_pointer(root->xa_head, entry);
                shift += RADIX_TREE_MAP_SHIFT;
        } while (shift <= maxshift);
out:
        return maxshift + RADIX_TREE_MAP_SHIFT;
}

/**
 * radix_tree_shrink - shrink radix tree to minimum height
 * @root: radix tree root
 *
 * Repeatedly replaces the head node with its slot 0 child for as long as
 * the head is an internal node whose only entry sits in slot 0.
 *
 * Returns true if at least one node was removed from the top of the tree.
 */
static inline bool radix_tree_shrink(struct radix_tree_root *root)
{
        bool shrunk = false;

        for (;;) {
                struct radix_tree_node *node = rcu_dereference_raw(root->xa_head);
                struct radix_tree_node *child;

                /* The head is not a node: nothing left to shrink */
                if (!radix_tree_is_internal_node(node))
                        break;
                node = entry_to_node(node);

                /*
                 * The candidate node has more than one child, or its child
                 * is not at the leftmost slot, we cannot shrink.
                 */
                if (node->count != 1)
                        break;
                child = rcu_dereference_raw(node->slots[0]);
                if (!child)
                        break;

                /*
                 * For an IDR, we must not shrink entry 0 into the root in
                 * case somebody calls idr_replace() with a pointer that
                 * appears to be an internal entry
                 */
                if (!node->shift && is_idr(root))
                        break;

                /* The child is about to become the head, so it has no parent */
                if (radix_tree_is_internal_node(child))
                        entry_to_node(child)->parent = NULL;

                /*
                 * We don't need rcu_assign_pointer(), since we are simply
                 * moving the node from one part of the tree to another: if it
                 * was safe to dereference the old pointer to it
                 * (node->slots[0]), it will be safe to dereference the new
                 * one (root->xa_head) as far as dependent read barriers go.
                 */
                root->xa_head = (void __rcu *)child;
                if (is_idr(root) && !tag_get(node, IDR_FREE, 0))
                        root_tag_clear(root, IDR_FREE);

                /*
                 * We have a dilemma here. The node's slot[0] must not be
                 * NULLed in case there are concurrent lookups expecting to
                 * find the item. However if this was a bottom-level node,
                 * then it may be subject to the slot pointer being visible
                 * to callers dereferencing it. If item corresponding to
                 * slot[0] is subsequently deleted, these callers would expect
                 * their slot to become empty sooner or later.
                 *
                 * For example, lockless pagecache will look up a slot, deref
                 * the page pointer, and if the page has 0 refcount it means it
                 * was concurrently deleted from pagecache so try the deref
                 * again. Fortunately there is already a requirement for logic
                 * to retry the entire slot lookup -- the indirect pointer
                 * problem (replacing direct root node with an indirect pointer
                 * also results in a stale slot). So tag the slot as indirect
                 * to force callers to retry.
                 */
                node->count = 0;
                if (!radix_tree_is_internal_node(child)) {
                        node->slots[0] = (void __rcu *)RADIX_TREE_RETRY;
                }

                /* The old head is now detached: free it and look again */
                WARN_ON_ONCE(!list_empty(&node->private_list));
                radix_tree_node_free(node);
                shrunk = true;
        }

        return shrunk;
}

/*
 * Free @node if it has no entries left, then climb towards the root freeing
 * every ancestor that becomes empty in turn.  When a node that still has
 * entries is reached and it is the head of the tree, try to shrink the tree.
 *
 * Returns true if any node was freed here or by radix_tree_shrink().
 */
static bool delete_node(struct radix_tree_root *root,
                        struct radix_tree_node *node)
{
        bool freed = false;

        while (!node->count) {
                struct radix_tree_node *up = node->parent;

                if (up) {
                        /* Unhook the empty node from its parent. */
                        up->slots[node->offset] = NULL;
                        up->count--;
                } else {
                        /*
                         * Shouldn't the tags already have all been cleared
                         * by the caller?
                         */
                        if (!is_idr(root))
                                root_tag_clear_all(root);
                        root->xa_head = NULL;
                }

                WARN_ON_ONCE(!list_empty(&node->private_list));
                radix_tree_node_free(node);
                freed = true;

                /* That was the topmost node: nothing further to inspect. */
                if (!up)
                        return freed;
                node = up;
        }

        /* This node keeps its entries; only the head can still be shrunk. */
        if (node_to_entry(node) == rcu_dereference_raw(root->xa_head))
                freed |= radix_tree_shrink(root);

        return freed;
}

/**
 * __radix_tree_create - create a slot in a radix tree
 * @root:  radix tree root
 * @index: index key
 * @nodep: if non-NULL, receives the node holding the slot
 * @slotp: if non-NULL, receives the slot
 *
 * Create, if necessary, and return the node and slot for an item
 * at position @index in the radix tree @root.
 *
 * Until there is more than one item in the tree, no nodes are
 * allocated and @root->xa_head is used as a direct slot instead of
 * pointing to a node, in which case *@nodep will be NULL.
 *
 * Returns -ENOMEM, or 0 for success.
 */
static int __radix_tree_create(struct radix_tree_root *root,
                unsigned long index, struct radix_tree_node **nodep,
                void __rcu ***slotp)
{
        struct radix_tree_node *parent = NULL;
        struct radix_tree_node *cur;
        void __rcu **slot = (void __rcu **)&root->xa_head;
        unsigned long maxindex;
        unsigned int offset = 0;
        unsigned int shift;
        gfp_t gfp = root_gfp_mask(root);

        shift = radix_tree_load_root(root, &cur, &maxindex);

        /* Grow the tree first if @index lies beyond its current range. */
        if (index > maxindex) {
                int grown = radix_tree_extend(root, gfp, index, shift);

                if (grown < 0)
                        return grown;
                shift = grown;
                cur = rcu_dereference_raw(root->xa_head);
        }

        while (shift > 0) {
                shift -= RADIX_TREE_MAP_SHIFT;

                /* An existing entry that is not a node ends the walk. */
                if (cur && !radix_tree_is_internal_node(cur))
                        break;

                if (!cur) {
                        /* Nothing here yet: allocate and link a child node. */
                        cur = radix_tree_node_alloc(gfp, parent, root, shift,
                                                        offset, 0, 0);
                        if (!cur)
                                return -ENOMEM;
                        rcu_assign_pointer(*slot, node_to_entry(cur));
                        if (parent)
                                parent->count++;
                }

                /* Step down one level towards @index. */
                parent = entry_to_node(cur);
                offset = radix_tree_descend(parent, &cur, index);
                slot = &parent->slots[offset];
        }

        if (nodep)
                *nodep = parent;
        if (slotp)
                *slotp = slot;
        return 0;
}

/*
 * Free any nodes below this node.  The tree is presumed to not need
 * shrinking, and any user data in the tree is presumed to not need a
 * destructor called on it.  If we need to add a destructor, we can
 * add that functionality later.  Note that we may not clear tags or
 * slots from the tree as an RCU walker may still have a pointer into
 * this subtree.  We could replace the entries with RADIX_TREE_RETRY,
 * but we'll still have to clear those in rcu_free.
 *
 * @node is a node entry (it is converted with entry_to_node()).  The node it
 * refers to is freed as well, and freeing it ends the walk.
 */
static void radix_tree_free_nodes(struct radix_tree_node *node)
{
        unsigned offset = 0;
        struct radix_tree_node *child = entry_to_node(node);

        for (;;) {
                void *entry = rcu_dereference_raw(child->slots[offset]);
                /* Descend into node entries, but not below a shift-0 node */
                if (xa_is_node(entry) && child->shift) {
                        child = entry_to_node(entry);
                        offset = 0;
                        continue;
                }
                offset++;
                /*
                 * All slots of this node have been visited: free it and
                 * resume in the parent at the slot after this node's own.
                 */
                while (offset == RADIX_TREE_MAP_SIZE) {
                        struct radix_tree_node *old = child;
                        offset = child->offset + 1;
                        child = child->parent;
                        WARN_ON_ONCE(!list_empty(&old->private_list));
                        radix_tree_node_free(old);
                        /* The starting node has just been freed: done */
                        if (old == entry_to_node(node))
                                return;
                }
        }
}

/*
 * Store @item in the empty slot @slot and, when @node is non-NULL, account
 * for it in @node (count, plus nr_values for value entries).
 *
 * Returns 1 on success, or -EEXIST if @slot is already occupied.
 */
static inline int insert_entries(struct radix_tree_node *node,
                void __rcu **slot, void *item)
{
        if (*slot != NULL)
                return -EEXIST;

        rcu_assign_pointer(*slot, item);
        if (!node)
                return 1;

        node->count++;
        if (xa_is_value(item))
                node->nr_values++;

        return 1;
}

/**
 *        radix_tree_insert    -    insert into a radix tree
 *        @root:                radix tree root
 *        @index:                index key
 *        @item:                item to insert
 *
 *        Insert an item into the radix tree at position @index.
 */
int radix_tree_insert(struct radix_tree_root *root, unsigned long index,
                        void *item)
{
        struct radix_tree_node *node;
        void __rcu **slot;
        int error;

        BUG_ON(radix_tree_is_internal_node(item));

        error = __radix_tree_create(root, index, &node, &slot);
        if (error)
                return error;

        error = insert_entries(node, slot, item);
        if (error < 0)
                return error;

        if (node) {
                unsigned offset = get_slot_offset(node, slot);
                BUG_ON(tag_get(node, 0, offset));
                BUG_ON(tag_get(node, 1, offset));
                BUG_ON(tag_get(node, 2, offset));
        } else {
                BUG_ON(root_tags_get(root));
        }

        return 0;
}
EXPORT_SYMBOL(radix_tree_insert);

/**
 * __radix_tree_lookup - lookup an item in a radix tree
 * @root:  radix tree root
 * @index: index key
 * @nodep: if non-NULL, receives the node holding the slot
 * @slotp: if non-NULL, receives the slot
 *
 * Lookup and return the item at position @index in the radix
 * tree @root.
 *
 * Until there is more than one item in the tree, no nodes are
 * allocated and @root->xa_head is used as a direct slot instead of
 * pointing to a node, in which case *@nodep will be NULL.
 *
 * If @index is beyond the tree's current maximum index, NULL is returned
 * and *@nodep and *@slotp are left unwritten.
 */
void *__radix_tree_lookup(const struct radix_tree_root *root,
                          unsigned long index, struct radix_tree_node **nodep,
                          void __rcu ***slotp)
{
        struct radix_tree_node *entry, *owner;
        unsigned long maxindex;
        void __rcu **slot;

 restart:
        /* Begin (or begin again) at the root slot. */
        owner = NULL;
        slot = (void __rcu **)&root->xa_head;
        radix_tree_load_root(root, &entry, &maxindex);
        if (index > maxindex)
                return NULL;

        while (radix_tree_is_internal_node(entry)) {
                unsigned offset;

                owner = entry_to_node(entry);
                offset = radix_tree_descend(owner, &entry, index);
                slot = &owner->slots[offset];

                /* A retry entry sends the whole walk back to the root. */
                if (entry == RADIX_TREE_RETRY)
                        goto restart;
                /* A shift of zero means there is no level below this one. */
                if (!owner->shift)
                        break;
        }

        if (nodep)
                *nodep = owner;
        if (slotp)
                *slotp = slot;
        return entry;
}

/**
 * radix_tree_lookup_slot - lookup a slot in a radix tree
 * @root:  radix tree root
 * @index: index key
 *
 * Returns: the slot corresponding to the position @index in the
 * radix tree @root, or NULL if no item is found there.  This is useful
 * for update-if-exists operations.
 *
 * This function can be called under rcu_read_lock iff the slot is not
 * modified by radix_tree_replace_slot, otherwise it must be called
 * exclusive from other writers. Any dereference of the slot must be done
 * using radix_tree_deref_slot.
 */
void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *root,
                                unsigned long index)
{
        void __rcu **slot;
        void *item = __radix_tree_lookup(root, index, NULL, &slot);

        /* Only hand the slot out when the lookup actually found an item. */
        return item ? slot : NULL;
}
EXPORT_SYMBOL(radix_tree_lookup_slot);

/**
 * radix_tree_lookup - perform lookup operation on a radix tree
 * @root:  radix tree root
 * @index: index key
 *
 * Lookup the item at the position @index in the radix tree @root.
 *
 * This function can be called under rcu_read_lock, however the caller
 * must manage lifetimes of leaf nodes (eg. RCU may also be used to free
 * them safely). No RCU barriers are required to access or modify the
 * returned item, however.
 */
void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index)
{
        /* Neither the containing node nor the slot is of interest here. */
        void *item = __radix_tree_lookup(root, index, NULL, NULL);

        return item;
}
EXPORT_SYMBOL(radix_tree_lookup);

/*
 * Store @item in @slot.  When @node is non-NULL and either delta is
 * non-zero, @count and @values are first added to the node's count and
 * nr_values.
 */
static void replace_slot(void __rcu **slot, void *item,
                struct radix_tree_node *node, int count, int values)
{
        const bool accounting_changes = count != 0 || values != 0;

        if (node != NULL && accounting_changes) {
                node->nr_values += values;
                node->count += count;
        }

        rcu_assign_pointer(*slot, item);
}

/*
 * Test @tag for the slot at @offset in @node, or on the root itself when
 * @node is NULL.
 */
static bool node_tag_get(const struct radix_tree_root *root,
                                const struct radix_tree_node *node,
                                unsigned int tag, unsigned int offset)
{
        if (!node)
                return root_tag_get(root, tag);

        return tag_get(node, tag, offset);
}

/*
 * Work out by how much the containing node's entry count changes when @old
 * is replaced by @item in @slot.
 *
 * IDR users want to be able to store NULL in the tree, so if the slot isn't
 * free, don't adjust the count, even if it's transitioning between NULL and
 * non-NULL.  For the IDA, we mark slots as being IDR_FREE while they still
 * have empty bits, but it only stores NULL in slots when they're being
 * deleted.
 */
static int calculate_count(struct radix_tree_root *root,
                                struct radix_tree_node *node, void __rcu **slot,
                                void *item, void *old)
{
        const int delta = !!item - !!old;
        unsigned offset;

        if (!is_idr(root))
                return delta;

        offset = get_slot_offset(node, slot);
        if (!node_tag_get(root, node, IDR_FREE, offset))
                return 0;

        /* A free IDR slot that held NULL always counts as one new entry. */
        return old ? delta : 1;
}

/**
 * __radix_tree_replace - replace item in a slot
 * @root: radix tree root
 * @node: pointer to tree node
 * @slot: pointer to slot in @node
 * @item: new item to store in the slot.
 *
 * For use with __radix_tree_lookup().  Caller must hold tree write locked
 * across slot lookup and replacement.
 *
 * When @node is non-NULL, delete_node() is run on it after the store.
 */
void __radix_tree_replace(struct radix_tree_root *root,
                          struct radix_tree_node *node,
                          void __rcu **slot, void *item)
{
        void *prev = rcu_dereference_raw(*slot);
        int count = calculate_count(root, node, slot, item, prev);
        int values = !!xa_is_value(item) - !!xa_is_value(prev);
        bool is_root_slot = slot == (void __rcu **)&root->xa_head;

        /*
         * This function supports replacing value entries and
         * deleting entries, but that needs accounting against the
         * node unless the slot is root->xa_head.
         */
        WARN_ON_ONCE(!node && !is_root_slot && (count || values));
        replace_slot(slot, item, node, count, values);

        if (node)
                delete_node(root, node);
}

/**
 * radix_tree_replace_slot - replace item in a slot
 * @root: radix tree root
 * @slot: pointer to slot
 * @item: new item to store in the slot.
 *
 * For use with radix_tree_lookup_slot() and
 * radix_tree_gang_lookup_tag_slot().  Caller must hold tree write locked
 * across slot lookup and replacement.
 *
 * NOTE: This cannot be used to switch between non-entries (empty slots),
 * regular entries, and value entries, as that requires accounting
 * inside the radix tree node. When switching from one type of entry or
 * deleting, use __radix_tree_lookup() and __radix_tree_replace() or
 * radix_tree_iter_replace().
 */
void radix_tree_replace_slot(struct radix_tree_root *root,
                             void __rcu **slot, void *item)
{
        /* No node is known here, so no per-node accounting is done. */
        struct radix_tree_node *no_node = NULL;

        __radix_tree_replace(root, no_node, slot, item);
}
EXPORT_SYMBOL(radix_tree_replace_slot);

/**
 * radix_tree_iter_replace - replace item in a slot
 * @root: radix tree root
 * @iter: iterator state
 * @slot: pointer to slot
 * @item: new item to store in the slot.
 *
 * For use with radix_tree_for_each_slot().
 * Caller must hold tree write locked.
 */
void radix_tree_iter_replace(struct radix_tree_root *root,
                                const struct radix_tree_iter *iter,
                                void __rcu **slot, void *item)
{
        /* The iterator records which node the slot belongs to. */
        struct radix_tree_node *owner = iter->node;

        __radix_tree_replace(root, owner, slot, item);
}

/*
 * Set @tag on the slot at @offset in @node and on the matching slot of each
 * ancestor, stopping early at the first level where the tag is already set.
 * If the walk runs past the topmost node, the root's tag is set as well.
 */
static void node_tag_set(struct radix_tree_root *root,
                                struct radix_tree_node *node,
                                unsigned int tag, unsigned int offset)
{
        for (; node; offset = node->offset, node = node->parent) {
                if (tag_get(node, tag, offset))
                        return;
                tag_set(node, tag, offset);
        }

        if (root_tag_get(root, tag))
                return;
        root_tag_set(root, tag);
}

/**
 * radix_tree_tag_set - set a tag on a radix tree node
 * @root:  radix tree root
 * @index: index key
 * @tag:   tag index
 *
 * Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
 * corresponding to @index in the radix tree.  From
 * the root all the way down to the leaf node.
 *
 * Returns the address of the tagged item.  Setting a tag on a not-present
 * item is a bug.
 */
void *radix_tree_tag_set(struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
{
        struct radix_tree_node *entry;
        unsigned long maxindex;

        radix_tree_load_root(root, &entry, &maxindex);
        BUG_ON(index > maxindex);

        /* Tag the slot leading towards @index at every level on the way. */
        while (radix_tree_is_internal_node(entry)) {
                struct radix_tree_node *parent = entry_to_node(entry);
                unsigned offset = radix_tree_descend(parent, &entry, index);

                BUG_ON(!entry);

                if (tag_get(parent, tag, offset))
                        continue;
                tag_set(parent, tag, offset);
        }

        /* set the root's tag bit */
        if (!root_tag_get(root, tag))
                root_tag_set(root, tag);

        return entry;
}
EXPORT_SYMBOL(radix_tree_tag_set);

/*
 * Clear @tag on the slot at @offset in @node, then on the matching slot of
 * each ancestor for as long as the node just updated has no slot left with
 * @tag set.  Stops early if the tag was not set at some level.  If the walk
 * runs past the topmost node, the root's tag is cleared as well.
 */
static void node_tag_clear(struct radix_tree_root *root,
                                struct radix_tree_node *node,
                                unsigned int tag, unsigned int offset)
{
        for (; node; offset = node->offset, node = node->parent) {
                if (!tag_get(node, tag, offset))
                        return;
                tag_clear(node, tag, offset);

                /* Another slot in this node still carries the tag. */
                if (any_tag_set(node, tag))
                        return;
        }

        /* clear the root's tag bit */
        if (!root_tag_get(root, tag))
                return;
        root_tag_clear(root, tag);
}

/**
 * radix_tree_tag_clear - clear a tag on a radix tree node
 * @root:  radix tree root
 * @index: index key
 * @tag:   tag index
 *
 * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
 * corresponding to @index in the radix tree.  If this causes
 * the leaf node to have no tags set then clear the tag in the
 * next-to-leaf node, etc.
 *
 * Returns the address of the tagged item on success, else NULL.  ie:
 * has the same return value and semantics as radix_tree_lookup().
 */
void *radix_tree_tag_clear(struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
{
        struct radix_tree_node *entry;
        struct radix_tree_node *leaf_parent = NULL;
        unsigned long maxindex;
        int offset = 0;

        radix_tree_load_root(root, &entry, &maxindex);
        if (index > maxindex)
                return NULL;

        /* Walk down to the item, remembering the node and slot holding it. */
        while (radix_tree_is_internal_node(entry)) {
                leaf_parent = entry_to_node(entry);
                offset = radix_tree_descend(leaf_parent, &entry, index);
        }

        /* No item at @index: leave the tags untouched. */
        if (!entry)
                return NULL;

        node_tag_clear(root, leaf_parent, tag, offset);
        return entry;
}
EXPORT_SYMBOL(radix_tree_tag_clear);

/**
 * radix_tree_iter_tag_clear - clear a tag on the current iterator entry
 * @root: radix tree root
 * @iter: iterator state
 * @tag: tag to clear
 */
void radix_tree_iter_tag_clear(struct radix_tree_root *root,
                        const struct radix_tree_iter *iter, unsigned int tag)
{
        struct radix_tree_node *owner = iter->node;

        node_tag_clear(root, owner, tag, iter_offset(iter));
}

/**
 * radix_tree_tag_get - get a tag on a radix tree node
 * @root:  radix tree root
 * @index: index key
 * @tag:   tag index (< RADIX_TREE_MAX_TAGS)
 *
 * Return values:
 *
 *  0: tag not present or not set
 *  1: tag set
 *
 * Note that the return value of this function may not be relied on, even if
 * the RCU lock is held, unless tag modification and node deletion are excluded
 * from concurrency.
 */
int radix_tree_tag_get(const struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
{
        struct radix_tree_node *entry;
        unsigned long maxindex;

        /* The root does not carry @tag: report it as not set. */
        if (!root_tag_get(root, tag))
                return 0;

        radix_tree_load_root(root, &entry, &maxindex);
        if (index > maxindex)
                return 0;

        /* The tag must be set at every level on the way down to @index. */
        while (radix_tree_is_internal_node(entry)) {
                struct radix_tree_node *parent = entry_to_node(entry);
                unsigned offset = radix_tree_descend(parent, &entry, index);

                if (!tag_get(parent, tag, offset))
                        return 0;
                if (entry == RADIX_TREE_RETRY)
                        break;
        }

        return 1;
}
EXPORT_SYMBOL(radix_tree_tag_get);

/*
 * Build the iter->tags bit-mask from the node->tags[tag] array, starting at
 * the bit for @offset.  With no node (@node == NULL) the mask is simply 1.
 */
static void set_iter_tags(struct radix_tree_iter *iter,
                                struct radix_tree_node *node, unsigned offset,
                                unsigned tag)
{
        unsigned word = offset / BITS_PER_LONG;
        unsigned bit = offset % BITS_PER_LONG;

        if (!node) {
                iter->tags = 1;
                return;
        }

        iter->tags = node->tags[tag][word] >> bit;

        /* This is always the case if RADIX_TREE_TAG_LONGS == 1 */
        if (word >= RADIX_TREE_TAG_LONGS - 1)
                return;

        /* Pick tags from next element */
        if (bit)
                iter->tags |= node->tags[tag][word + 1] <<
                                        (BITS_PER_LONG - bit);

        /* Clip chunk size, here only BITS_PER_LONG tags */
        iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG);
}

/*
 * Advance @iter by one index and reset its chunk state (next_index equal to
 * index, no tags).  @slot is not used.  Always returns NULL.
 */
void __rcu **radix_tree_iter_resume(void __rcu **slot,
                                        struct radix_tree_iter *iter)
{
        unsigned long resume_at = __radix_tree_iter_add(iter, 1);

        iter->index = resume_at;
        iter->next_index = iter->index;
        iter->tags = 0;

        return NULL;
}
EXPORT_SYMBOL(radix_tree_iter_resume);

/**
 * radix_tree_next_chunk - find next chunk of slots for iteration
 *
 * @root:        radix tree root
 * @iter:        iterator state
 * @flags:        RADIX_TREE_ITER_* flags and tag index
 * Returns:        pointer to chunk first slot, or NULL if iteration is over
 *
 * The search starts at @iter->next_index.  On success @iter->index,
 * @iter->next_index and @iter->node are updated, and @iter->tags as well for
 * a single-slot tree or when RADIX_TREE_ITER_TAGGED is set in @flags.
 */
void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,
                             struct radix_tree_iter *iter, unsigned flags)
{
        unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
        struct radix_tree_node *node, *child;
        unsigned long index, offset, maxindex;

        /* Tagged iteration, but the root does not carry the tag at all */
        if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
                return NULL;

        /*
         * Catch next_index overflow after ~0UL. iter->index never overflows
         * during iterating; it can be zero only at the beginning.
         * And we cannot overflow iter->next_index in a single step,
         * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG.
         *
         * This condition also used by radix_tree_next_slot() to stop
         * contiguous iterating, and forbid switching to the next chunk.
         */
        index = iter->next_index;
        if (!index && iter->index)
                return NULL;

        /* Each (re)start reloads the root and walks down again from @index */
 restart:
        radix_tree_load_root(root, &child, &maxindex);
        if (index > maxindex)
                return NULL;
        if (!child)
                return NULL;

        if (!radix_tree_is_internal_node(child)) {
                /* Single-slot tree */
                iter->index = index;
                iter->next_index = maxindex + 1;
                iter->tags = 1;
                iter->node = NULL;
                return (void __rcu **)&root->xa_head;
        }

        do {
                node = entry_to_node(child);
                offset = radix_tree_descend(node, &child, index);

                /*
                 * A "hole" is an untagged slot for tagged iteration, or an
                 * empty slot otherwise.
                 */
                if ((flags & RADIX_TREE_ITER_TAGGED) ?
                                !tag_get(node, tag, offset) : !child) {
                        /* Hole detected */
                        if (flags & RADIX_TREE_ITER_CONTIG)
                                return NULL;

                        /* Scan forward in this node for the next candidate */
                        if (flags & RADIX_TREE_ITER_TAGGED)
                                offset = radix_tree_find_next_bit(node, tag,
                                                offset + 1);
                        else
                                while (++offset        < RADIX_TREE_MAP_SIZE) {
                                        void *slot = rcu_dereference_raw(
                                                        node->slots[offset]);
                                        if (slot)
                                                break;
                                }
                        /* Move index to the first key under the slot found */
                        index &= ~node_maxindex(node);
                        index += offset << node->shift;
                        /* Overflow after ~0UL */
                        if (!index)
                                return NULL;
                        /* Nothing left in this node: retry from the root */
                        if (offset == RADIX_TREE_MAP_SIZE)
                                goto restart;
                        child = rcu_dereference_raw(node->slots[offset]);
                }

                if (!child)
                        goto restart;
                if (child == RADIX_TREE_RETRY)
                        break;
        } while (node->shift && radix_tree_is_internal_node(child));

        /* Update the iterator state */
        iter->index = (index &~ node_maxindex(node)) | offset;
        iter->next_index = (index | node_maxindex(node)) + 1;
        iter->node = node;

        if (flags & RADIX_TREE_ITER_TAGGED)
                set_iter_tags(iter, node, offset, tag);

        return node->slots + offset;
}
EXPORT_SYMBOL(radix_tree_next_chunk);

/**
 * radix_tree_gang_lookup - perform multiple lookup on a radix tree
 * @root:        radix tree root
 * @results:     where the results of the lookup are placed
 * @first_index: start the lookup from this key
 * @max_items:   place up to this many items at *results
 *
 * Performs an index-ascending scan of the tree for present items.  Places
 * them at *@results and returns the number of items which were placed at
 * *@results.
 *
 * The implementation is naive.
 *
 * Like radix_tree_lookup, radix_tree_gang_lookup may be called under
 * rcu_read_lock. In this case, rather than the returned results being
 * an atomic snapshot of the tree at a single point in time, the
 * semantics of an RCU protected gang lookup are as though multiple
 * radix_tree_lookups have been issued in individual locks, and results
 * stored in 'results'.
 */
unsigned int
radix_tree_gang_lookup(const struct radix_tree_root *root, void **results,
                        unsigned long first_index, unsigned int max_items)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        unsigned int found = 0;

        if (unlikely(max_items == 0))
                return 0;

        radix_tree_for_each_slot(slot, root, &iter, first_index) {
                void *item = rcu_dereference_raw(*slot);

                /* Stored first; a rejected entry is overwritten or not counted. */
                results[found] = item;
                if (item == NULL)
                        continue;
                if (radix_tree_is_internal_node(item)) {
                        slot = radix_tree_iter_retry(&iter);
                        continue;
                }

                found++;
                if (found == max_items)
                        break;
        }

        return found;
}
EXPORT_SYMBOL(radix_tree_gang_lookup);

/**
 * radix_tree_gang_lookup_tag - look up several tagged items in one scan
 * @root:        radix tree root
 * @results:     array that receives the items which were found
 * @first_index: key at which the scan starts
 * @max_items:   maximum number of items to store in @results
 * @tag:         the tag index (< RADIX_TREE_MAX_TAGS)
 *
 * Scans the tree in ascending index order, beginning at @first_index, for
 * present items that have the tag indexed by @tag set, and stores them in
 * @results until either @max_items items have been stored or the
 * iteration ends.
 *
 * Return: the number of items placed in @results.
 */
unsigned int
radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results,
                unsigned long first_index, unsigned int max_items,
                unsigned int tag)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        unsigned int nr_found = 0;

        if (unlikely(max_items == 0))
                return 0;

        radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
                void *entry = rcu_dereference_raw(*slot);

                /*
                 * Stored before validation; a rejected candidate is simply
                 * overwritten by the next one.
                 */
                results[nr_found] = entry;
                if (entry == NULL)
                        continue;
                if (radix_tree_is_internal_node(entry)) {
                        /* Not a user item: have the iterator retry. */
                        slot = radix_tree_iter_retry(&iter);
                        continue;
                }

                nr_found++;
                if (nr_found == max_items)
                        break;
        }

        return nr_found;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag);

/**
 * radix_tree_gang_lookup_tag_slot - look up several tagged slots in one scan
 * @root:        radix tree root
 * @results:     array that receives the slot pointers which were found
 * @first_index: key at which the scan starts
 * @max_items:   maximum number of slots to store in @results
 * @tag:         the tag index (< RADIX_TREE_MAX_TAGS)
 *
 * Scans the tree in ascending index order, beginning at @first_index, for
 * present items that have the tag indexed by @tag set.  Unlike
 * radix_tree_gang_lookup_tag(), the slot pointers themselves are stored in
 * @results; the slots are not dereferenced here.
 *
 * Return: the number of slots placed in @results.
 */
unsigned int
radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root,
                void __rcu ***results, unsigned long first_index,
                unsigned int max_items, unsigned int tag)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        unsigned int nr_found = 0;

        if (unlikely(max_items == 0))
                return 0;

        radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
                results[nr_found++] = slot;
                if (nr_found == max_items)
                        break;
        }

        return nr_found;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);

/*
 * Clear @slot in @node and fix up the tag bits for that position.
 *
 * For an IDR root the IDR_FREE tag is set on the slot; for any other root
 * every tag (0 .. RADIX_TREE_MAX_TAGS - 1) is cleared.  The slot is then
 * replaced with NULL through replace_slot().
 *
 * Returns false when @node is NULL, otherwise whatever delete_node()
 * reports for @node.
 */
static bool __radix_tree_delete(struct radix_tree_root *root,
                                struct radix_tree_node *node, void __rcu **slot)
{
        void *prev = rcu_dereference_raw(*slot);
        /* Passed to replace_slot(): -1 if the old entry was a value entry. */
        int value_delta = xa_is_value(prev) ? -1 : 0;
        unsigned int pos = get_slot_offset(node, slot);
        int t;

        if (is_idr(root)) {
                node_tag_set(root, node, IDR_FREE, pos);
        } else {
                for (t = 0; t < RADIX_TREE_MAX_TAGS; t++)
                        node_tag_clear(root, node, t, pos);
        }

        replace_slot(slot, NULL, node, -1, value_delta);

        if (!node)
                return false;
        return delete_node(root, node);
}

/**
 * radix_tree_iter_delete - delete the entry at this iterator position
 * @root: radix tree root
 * @iter: iterator state
 * @slot: pointer to slot
 *
 * Removes the entry the iterator currently points at.  Doing so may free
 * the current node; when __radix_tree_delete() reports that, @iter->index
 * is moved forward to @iter->next_index so the iterator will not reference
 * the freed memory.  No locking is required if no other thread can access
 * this tree.
 */
void radix_tree_iter_delete(struct radix_tree_root *root,
                                struct radix_tree_iter *iter, void __rcu **slot)
{
        bool advance = __radix_tree_delete(root, iter->node, slot);

        if (advance)
                iter->index = iter->next_index;
}
EXPORT_SYMBOL(radix_tree_iter_delete);

/**
 * radix_tree_delete_item - delete an item from a radix tree
 * @root: radix tree root
 * @index: index key
 * @item: expected item, or %NULL to delete whatever is stored at @index
 *
 * Removes the entry at @index from the radix tree rooted at @root,
 * provided it matches @item (when @item is non-NULL).
 *
 * Return: the deleted entry, or %NULL if it was not present
 * or the entry at the given @index was not @item.
 */
void *radix_tree_delete_item(struct radix_tree_root *root,
                             unsigned long index, void *item)
{
        struct radix_tree_node *node = NULL;
        void __rcu **slot = NULL;
        void *entry = __radix_tree_lookup(root, index, &node, &slot);

        /* The lookup did not produce a slot: nothing to delete. */
        if (slot == NULL)
                return NULL;

        /*
         * A NULL entry is only deleted from an IDR root, and only when the
         * slot is not tagged IDR_FREE.
         */
        if (entry == NULL) {
                if (!is_idr(root))
                        return NULL;
                if (node_tag_get(root, node, IDR_FREE,
                                 get_slot_offset(node, slot)))
                        return NULL;
        }

        /* The caller named a specific item and something else is stored. */
        if (item != NULL && entry != item)
                return NULL;

        __radix_tree_delete(root, node, slot);

        return entry;
}
EXPORT_SYMBOL(radix_tree_delete_item);

/**
 * radix_tree_delete - delete an entry from a radix tree
 * @root: radix tree root
 * @index: index key
 *
 * Removes the entry at @index from the radix tree rooted at @root,
 * whatever that entry is.  This is radix_tree_delete_item() called with
 * no expected item.
 *
 * Return: The deleted entry, or %NULL if it was not present.
 */
void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
{
        void *removed = radix_tree_delete_item(root, index, NULL);

        return removed;
}
EXPORT_SYMBOL(radix_tree_delete);

/**
 * radix_tree_tagged - test whether any items in the tree are tagged
 * @root: radix tree root
 * @tag:  tag to test
 *
 * Return: the value of root_tag_get() for @tag on @root.
 */
int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag)
{
        int tagged = root_tag_get(root, tag);

        return tagged;
}
EXPORT_SYMBOL(radix_tree_tagged);

/**
 * idr_preload - preload for idr_alloc()
 * @gfp_mask: allocation mask to use for preloading
 *
 * Preallocates memory for the next call to idr_alloc().  On return
 * preemption is disabled; idr_preload_end() enables it again.
 */
void idr_preload(gfp_t gfp_mask)
{
        /*
         * The lock is taken here only when the preload reports failure.
         * NOTE(review): presumably a successful __radix_tree_preload()
         * returns with radix_tree_preloads.lock already held, so this keeps
         * the lock state the same on both paths - confirm against
         * __radix_tree_preload().
         */
        if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE) != 0) {
                local_lock(&radix_tree_preloads.lock);
        }
}
EXPORT_SYMBOL(idr_preload);

/*
 * idr_get_free - find (allocating nodes as needed) a slot tagged IDR_FREE
 * @root: radix tree root to search
 * @iter: iterator; the search starts at @iter->next_index and the
 *        iterator is updated on success
 * @gfp:  allocation flags passed to radix_tree_extend() and
 *        radix_tree_node_alloc()
 * @max:  largest index that may be returned
 *
 * On success returns a pointer to the chosen slot and sets @iter->index,
 * @iter->next_index, @iter->node and the iterator tags.
 *
 * On failure returns an ERR_PTR():
 *   -ENOSPC  no candidate index <= @max exists (or the index wrapped to 0),
 *   -ENOMEM  radix_tree_node_alloc() failed,
 *   or the negative value returned by radix_tree_extend().
 */
void __rcu **idr_get_free(struct radix_tree_root *root,
                              struct radix_tree_iter *iter, gfp_t gfp,
                              unsigned long max)
{
        struct radix_tree_node *node = NULL, *child;
        void __rcu **slot = (void __rcu **)&root->xa_head;
        unsigned long maxindex, start = iter->next_index;
        unsigned int shift, offset = 0;

        /* (Re)start from the root; also reached when the walk runs off the top. */
 grow:
        shift = radix_tree_load_root(root, &child, &maxindex);
        /* Root not tagged IDR_FREE: only indices beyond maxindex are candidates. */
        if (!radix_tree_tagged(root, IDR_FREE))
                start = max(start, maxindex + 1);
        if (start > max)
                return ERR_PTR(-ENOSPC);

        if (start > maxindex) {
                /* A non-negative return from radix_tree_extend() is the new shift. */
                int error = radix_tree_extend(root, gfp, start, shift);
                if (error < 0)
                        return ERR_PTR(error);
                shift = error;
                child = rcu_dereference_raw(root->xa_head);
        }
        /* Forces one pass through the loop below when start == 0 and shift == 0. */
        if (start == 0 && shift == 0)
                shift = RADIX_TREE_MAP_SHIFT;

        /* Descend one level per iteration until shift reaches 0 or a leaf entry is hit. */
        while (shift) {
                shift -= RADIX_TREE_MAP_SHIFT;
                if (child == NULL) {
                        /* Have to add a child node.  */
                        child = radix_tree_node_alloc(gfp, node, root, shift,
                                                        offset, 0, 0);
                        if (!child)
                                return ERR_PTR(-ENOMEM);
                        /* Mark every slot of the new node as IDR_FREE. */
                        all_tag_set(child, IDR_FREE);
                        rcu_assign_pointer(*slot, node_to_entry(child));
                        if (node)
                                node->count++;
                } else if (!radix_tree_is_internal_node(child))
                        break;

                node = entry_to_node(child);
                offset = radix_tree_descend(node, &child, start);
                if (!tag_get(node, IDR_FREE, offset)) {
                        /* This slot is not free: look for the next IDR_FREE bit in the node. */
                        offset = radix_tree_find_next_bit(node, IDR_FREE,
                                                        offset + 1);
                        start = next_index(start, node, offset);
                        /* start == 0 means the index wrapped around. */
                        if (start > max || start == 0)
                                return ERR_PTR(-ENOSPC);
                        /* No free bit left in this node: climb to the parent's next slot. */
                        while (offset == RADIX_TREE_MAP_SIZE) {
                                offset = node->offset + 1;
                                node = node->parent;
                                if (!node)
                                        goto grow;
                                shift = node->shift;
                        }
                        child = rcu_dereference_raw(node->slots[offset]);
                }
                slot = &node->slots[offset];
        }

        /* Publish the result through the iterator. */
        iter->index = start;
        if (node)
                iter->next_index = 1 + min(max, (start | node_maxindex(node)));
        else
                iter->next_index = 1;
        iter->node = node;
        set_iter_tags(iter, node, offset, IDR_FREE);

        return slot;
}

/**
 * idr_destroy - release all internal memory from an IDR
 * @idr: idr handle
 *
 * After this function is called, the IDR is empty, and may be reused or
 * the data structure containing it may be freed.
 *
 * A typical clean-up sequence for objects stored in an idr tree will use
 * idr_for_each() to free all objects, if necessary, then idr_destroy() to
 * free the memory used to keep track of those objects.
 */
void idr_destroy(struct idr *idr)
{
        struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.xa_head);
        if (radix_tree_is_internal_node(node))
                radix_tree_free_nodes(node);
        idr->idr_rt.xa_head = NULL;
        root_tag_set(&idr->idr_rt, IDR_FREE);
}
EXPORT_SYMBOL(idr_destroy);

/*
 * Slab constructor for radix tree nodes (registered in radix_tree_init()):
 * zero the whole object, then initialise its private_list head.
 */
static void
radix_tree_node_ctor(void *arg)
{
        struct radix_tree_node *fresh = arg;

        memset(fresh, 0, sizeof(struct radix_tree_node));
        INIT_LIST_HEAD(&fresh->private_list);
}

/*
 * CPU hotplug callback (registered in radix_tree_init()): free the per-cpu
 * pool of preloaded nodes that belongs to @cpu.  The pool is a singly
 * linked list chained through ->parent.  Always returns 0.
 */
static int radix_tree_cpu_dead(unsigned int cpu)
{
        struct radix_tree_preload *pool = &per_cpu(radix_tree_preloads, cpu);
        struct radix_tree_node *victim;

        for (; pool->nr; pool->nr--) {
                victim = pool->nodes;
                pool->nodes = victim->parent;
                kmem_cache_free(radix_tree_node_cachep, victim);
        }

        return 0;
}

/*
 * radix_tree_init - boot-time setup for the radix tree code
 *
 * Checks a few layout assumptions at compile time, creates the
 * "radix_tree_node" slab cache (with radix_tree_node_ctor as constructor)
 * and registers radix_tree_cpu_dead() for the CPUHP_RADIX_DEAD hotplug
 * state.  A failure to register the hotplug callback only triggers a
 * warning.
 */
void __init radix_tree_init(void)
{
        int err;

        /* Compile-time sanity checks. */
        BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32);
        BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK);
        BUILD_BUG_ON(XA_CHUNK_SIZE > 255);

        radix_tree_node_cachep =
                kmem_cache_create("radix_tree_node",
                                  sizeof(struct radix_tree_node), 0,
                                  SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
                                  radix_tree_node_ctor);

        err = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead",
                                        NULL, radix_tree_cpu_dead);
        WARN_ON(err < 0);
}



















































































































    5 























































































































































    5 




    4 










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 












































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PGTABLE_H
#define _LINUX_PGTABLE_H

#include <linux/pfn.h>
#include <asm/pgtable.h>

#define PMD_ORDER        (PMD_SHIFT - PAGE_SHIFT)
#define PUD_ORDER        (PUD_SHIFT - PAGE_SHIFT)

#ifndef __ASSEMBLY__
#ifdef CONFIG_MMU

#include <linux/mm_types.h>
#include <linux/bug.h>
#include <linux/errno.h>
#include <asm-generic/pgtable_uffd.h>
#include <linux/page_table_check.h>

#if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
        defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED
#endif

/*
 * On almost all architectures and configurations, 0 can be used as the
 * upper ceiling to free_pgtables(): on many architectures it has the same
 * effect as using TASK_SIZE.  However, there is one configuration which
 * must impose a more careful limit, to avoid freeing kernel pgtables.
 */
#ifndef USER_PGTABLES_CEILING
#define USER_PGTABLES_CEILING        0UL
#endif

/*
 * This defines the first usable user address. Platforms
 * can override its value with custom FIRST_USER_ADDRESS
 * defined in their respective <asm/pgtable.h>.
 */
#ifndef FIRST_USER_ADDRESS
#define FIRST_USER_ADDRESS        0UL
#endif

/*
 * This defines the generic helper for accessing PMD page
 * table page. Although platforms can still override this
 * via their respective <asm/pgtable.h>.
 */
#ifndef pmd_pgtable
#define pmd_pgtable(pmd) pmd_page(pmd)
#endif

#define pmd_folio(pmd) page_folio(pmd_page(pmd))

/*
 * A page table page can be thought of as an array like this: pXd_t[PTRS_PER_PxD]
 *
 * The pXx_index() functions return the index of the entry in the page
 * table page which would control the given virtual address
 *
 * As these functions may be used by the same code for different levels of
 * the page table folding, they are always available, regardless of
 * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0
 * because in such cases PTRS_PER_PxD equals 1.
 */

/* Index, within its PTE table page, of the entry that controls @address. */
static inline unsigned long pte_index(unsigned long address)
{
        unsigned long page_nr = address >> PAGE_SHIFT;

        return page_nr & (PTRS_PER_PTE - 1);
}

#ifndef pmd_index
/* Index, within its PMD table page, of the entry that controls @address. */
static inline unsigned long pmd_index(unsigned long address)
{
        unsigned long entry_nr = address >> PMD_SHIFT;

        return entry_nr & (PTRS_PER_PMD - 1);
}
#define pmd_index pmd_index
#endif

#ifndef pud_index
/* Index, within its PUD table page, of the entry that controls @address. */
static inline unsigned long pud_index(unsigned long address)
{
        unsigned long entry_nr = address >> PUD_SHIFT;

        return entry_nr & (PTRS_PER_PUD - 1);
}
#define pud_index pud_index
#endif

#ifndef pgd_index
/* Must be a compile-time constant, so implement it as a macro */
#define pgd_index(a)  (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#endif

#ifndef pte_offset_kernel
/*
 * Return a pointer to the PTE for @address inside the PTE table that
 * *@pmd refers to (its address is obtained with pmd_page_vaddr()).
 */
static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
{
        pte_t *table = (pte_t *)pmd_page_vaddr(*pmd);

        return table + pte_index(address);
}
#define pte_offset_kernel pte_offset_kernel
#endif

/*
 * __pte_map() yields a pointer to the PTE for @address under @pmd;
 * pte_unmap() undoes it.
 *
 * With CONFIG_HIGHPTE the page behind *@pmd is mapped with
 * kmap_local_page() and unmapped again with kunmap_local(); otherwise the
 * PTE is reached directly through pte_offset_kernel() and no unmapping is
 * needed.
 *
 * Both pte_unmap() variants call rcu_read_unlock().
 * NOTE(review): the matching rcu_read_lock() is not taken by __pte_map()
 * itself; presumably the caller takes it before mapping - confirm.
 */
#ifdef CONFIG_HIGHPTE
#define __pte_map(pmd, address) \
        ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address)))
#define pte_unmap(pte)        do {        \
        kunmap_local((pte));        \
        rcu_read_unlock();        \
} while (0)
#else
static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address)
{
        return pte_offset_kernel(pmd, address);
}
static inline void pte_unmap(pte_t *pte)
{
        /* Nothing was mapped; only the RCU read-side section is ended. */
        rcu_read_unlock();
}
#endif

void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);

/*
 * Find an entry in the second-level page table: return a pointer to the
 * PMD entry for @address inside the PMD table that *@pud refers to.
 */
#ifndef pmd_offset
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
{
        unsigned long idx = pmd_index(address);

        return pud_pgtable(*pud) + idx;
}
#define pmd_offset pmd_offset
#endif

#ifndef pud_offset
/*
 * Return a pointer to the PUD entry for @address inside the PUD table
 * that *@p4d refers to.
 */
static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
{
        unsigned long idx = pud_index(address);

        return p4d_pgtable(*p4d) + idx;
}
#define pud_offset pud_offset
#endif

/*
 * pgd_offset_pgd - locate the PGD entry for an address in a given PGD table
 * @pgd:     first entry of the PGD table
 * @address: virtual address to look up
 *
 * Returns @pgd advanced by pgd_index(@address) entries.
 */
static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address)
{
        return pgd + pgd_index(address);
}

/*
 * a shortcut to get a pgd_t in a given mm
 */
#ifndef pgd_offset
#define pgd_offset(mm, address)                pgd_offset_pgd((mm)->pgd, (address))
#endif

/*
 * a shortcut which implies the use of the kernel's pgd, instead
 * of a process's
 */
#define pgd_offset_k(address)                pgd_offset(&init_mm, (address))

/*
 * In many cases it is known that a virtual address is mapped at PMD or PTE
 * level, so instead of traversing all the page table levels, we can get a
 * pointer to the PMD entry in user or kernel page table or translate a virtual
 * address to the pointer in the PTE in the kernel page tables with simple
 * helpers.
 */
static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va)
{
        return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va);
}

static inline pmd_t *pmd_off_k(unsigned long va)
{
        return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
}

/*
 * Translate a kernel virtual address to a pointer to its PTE in the kernel
 * page tables.  Returns NULL when the PMD entry for @vaddr is pmd_none().
 */
static inline pte_t *virt_to_kpte(unsigned long vaddr)
{
        pmd_t *pmd = pmd_off_k(vaddr);

        if (pmd_none(*pmd))
                return NULL;

        return pte_offset_kernel(pmd, vaddr);
}

#ifndef pmd_young
/*
 * Fallback used when pmd_young is not already defined: ignores @pmd and
 * always returns 0.
 */
static inline int pmd_young(pmd_t pmd)
{
        return 0;
}
#endif

#ifndef pmd_dirty
/*
 * Fallback used when pmd_dirty is not already defined: ignores @pmd and
 * always returns 0.
 */
static inline int pmd_dirty(pmd_t pmd)
{
        return 0;
}
#endif

/*
 * A facility to provide lazy MMU batching.  This allows PTE updates and
 * page invalidations to be delayed until a call to leave lazy MMU mode
 * is issued.  Some architectures may benefit from doing this, and it is
 * beneficial for both shadow and direct mode hypervisors, which may batch
 * the PTE updates which happen during this window.  Note that using this
 * interface requires that read hazards be removed from the code.  A read
 * hazard could result in the direct mode hypervisor case, since the actual
 * write to the page tables may not yet have taken place, so reads through
 * a raw PTE pointer after it has been modified are not guaranteed to be
 * up to date.  This mode can only be entered and left under the protection of
 * the page table locks for all page tables which may be modified.  In the UP
 * case, this is required so that preemption is disabled, and in the SMP case,
 * it must synchronize the delayed page table writes properly on other CPUs.
 */
#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
#define arch_enter_lazy_mmu_mode()        do {} while (0)
#define arch_leave_lazy_mmu_mode()        do {} while (0)
#define arch_flush_lazy_mmu_mode()        do {} while (0)
#endif

#ifndef pte_batch_hint
/**
 * pte_batch_hint - Number of pages that can be added to batch without scanning.
 * @ptep: Page table pointer for the entry.
 * @pte: Page table entry.
 *
 * Some architectures know that a set of contiguous ptes all map the same
 * contiguous memory with the same permissions. In this case, it can provide a
 * hint to aid pte batching without the core code needing to scan every pte.
 *
 * An architecture implementation may ignore the PTE accessed state. Further,
 * the dirty state must apply atomically to all the PTEs described by the hint.
 *
 * May be overridden by the architecture, else pte_batch_hint is always 1.
 *
 * Return: 1 in this generic version; both arguments are ignored.
 */
static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
{
        return 1;
}
#endif

#ifndef pte_advance_pfn
/*
 * Return a copy of @pte whose raw value has been increased by
 * @nr << PFN_PTE_SHIFT.
 */
static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
{
        unsigned long delta = nr << PFN_PTE_SHIFT;

        return __pte(pte_val(pte) + delta);
}
#endif

#define pte_next_pfn(pte) pte_advance_pfn(pte, 1)

#ifndef set_ptes
/**
 * set_ptes - Map consecutive pages to a contiguous range of addresses.
 * @mm: Address space to map the pages into.
 * @addr: Address to map the first page at.
 * @ptep: Page table pointer for the first entry.
 * @pte: Page table entry for the first page.
 * @nr: Number of pages to map.
 *
 * With nr==1 the pte may be present or not present both before and after
 * the call.  With nr>1 every pte must be not present beforehand and the
 * new state must be present.
 *
 * The architecture may override this function, or it can define set_pte()
 * and PFN_PTE_SHIFT and use this generic version.
 *
 * Context: The caller holds the page table lock.  The pages all belong
 * to the same folio.  The PTEs are all in the same PMD.
 */
static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
                pte_t *ptep, pte_t pte, unsigned int nr)
{
        page_table_check_ptes_set(mm, ptep, pte, nr);

        arch_enter_lazy_mmu_mode();

        /* The first entry is always written; each further one advances the pfn. */
        set_pte(ptep, pte);
        while (--nr != 0) {
                ptep++;
                pte = pte_next_pfn(pte);
                set_pte(ptep, pte);
        }

        arch_leave_lazy_mmu_mode();
}
#endif
#define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1)

#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pte_t *ptep,
                                 pte_t entry, int dirty);
#endif

#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * CONFIG_TRANSPARENT_HUGEPAGE builds: PMD- and PUD-level counterparts of
 * ptep_set_access_flags().  Declarations only; definitions live elsewhere.
 */
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
                                 pmd_t entry, int dirty);
extern int pudp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pud_t *pudp,
                                 pud_t entry, int dirty);
#else
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE placeholder.  BUILD_BUG() is meant to break
 * the build if a call to this stub is not compiled out.
 */
static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
                                        unsigned long address, pmd_t *pmdp,
                                        pmd_t entry, int dirty)
{
        BUILD_BUG();
        return 0;
}
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE placeholder.  BUILD_BUG() is meant to break
 * the build if a call to this stub is not compiled out.
 */
static inline int pudp_set_access_flags(struct vm_area_struct *vma,
                                        unsigned long address, pud_t *pudp,
                                        pud_t entry, int dirty)
{
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef ptep_get
/* Generic ptep_get(): fetch the PTE at @ptep with a single READ_ONCE(). */
static inline pte_t ptep_get(pte_t *ptep)
{
        pte_t pte = READ_ONCE(*ptep);

        return pte;
}
#endif

#ifndef pmdp_get
/* Generic pmdp_get(): fetch the PMD at @pmdp with a single READ_ONCE(). */
static inline pmd_t pmdp_get(pmd_t *pmdp)
{
        pmd_t pmd = READ_ONCE(*pmdp);

        return pmd;
}
#endif

#ifndef pudp_get
/* Generic pudp_get(): fetch the PUD at @pudp with a single READ_ONCE(). */
static inline pud_t pudp_get(pud_t *pudp)
{
        pud_t pud = READ_ONCE(*pudp);

        return pud;
}
#endif

#ifndef p4dp_get
/* Generic p4dp_get(): fetch the P4D at @p4dp with a single READ_ONCE(). */
static inline p4d_t p4dp_get(p4d_t *p4dp)
{
        p4d_t p4d = READ_ONCE(*p4dp);

        return p4d;
}
#endif

#ifndef pgdp_get
/* Generic pgdp_get(): fetch the PGD at @pgdp with a single READ_ONCE(). */
static inline pgd_t pgdp_get(pgd_t *pgdp)
{
        pgd_t pgd = READ_ONCE(*pgdp);

        return pgd;
}
#endif

#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
/*
 * Generic ptep_test_and_clear_young(): returns 1 and rewrites the entry as
 * old if the PTE at @ptep was young, otherwise returns 0 and leaves the
 * entry untouched.  The read and the write-back are separate steps.
 */
static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pte_t *ptep)
{
        pte_t old = ptep_get(ptep);

        if (!pte_young(old))
                return 0;

        set_pte_at(vma->vm_mm, address, ptep, pte_mkold(old));
        return 1;
}
#endif

#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
/*
 * Generic pmdp_test_and_clear_young(): returns 1 and rewrites the entry as
 * old if the PMD at @pmdp was young, otherwise returns 0 and leaves the
 * entry untouched.  The read and the write-back are separate steps.
 */
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        pmd_t old = *pmdp;

        if (!pmd_young(old))
                return 0;

        set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(old));
        return 1;
}
#else
/*
 * Placeholder used when neither CONFIG_TRANSPARENT_HUGEPAGE nor
 * CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG is set.  BUILD_BUG() is meant to break
 * the build if a call to this stub is not compiled out.
 */
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
/*
 * Declaration only; the definition is not part of this header section.
 * Not declared when the architecture defines
 * __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH.
 */
int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep);
#endif

#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* CONFIG_TRANSPARENT_HUGEPAGE builds: declaration only, defined elsewhere. */
extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pmd_t *pmdp);
#else
/*
 * Although relevant to THP only, this API is called from generic rmap code
 * under PageTransHuge(), hence it needs a dummy implementation for !THP.
 * BUILD_BUG() is meant to break the build if a call to this stub is not
 * compiled out.
 */
static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                         unsigned long address, pmd_t *pmdp)
{
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef arch_has_hw_nonleaf_pmd_young
/*
 * Report whether the accessed bit in non-leaf PMD entries is supported on
 * the local CPU.  The generic answer is simply whether
 * CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG is enabled.
 */
static inline bool arch_has_hw_nonleaf_pmd_young(void)
{
        if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
                return true;

        return false;
}
#endif

#ifndef arch_has_hw_pte_young
/*
 * Report whether the accessed bit is supported on the local CPU.
 *
 * This stub assumes that an access through an old PTE triggers a page fault.
 * Architectures whose hardware sets the access bit automatically should
 * override it.  The generic answer is whether CONFIG_ARCH_HAS_HW_PTE_YOUNG
 * is enabled.
 */
static inline bool arch_has_hw_pte_young(void)
{
        if (IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG))
                return true;

        return false;
}
#endif

#ifndef arch_check_zapped_pte
/* Generic no-op hook; an architecture may define its own version. */
static inline void arch_check_zapped_pte(struct vm_area_struct *vma,
                                         pte_t pte)
{
}
#endif

#ifndef arch_check_zapped_pmd
/* Generic no-op hook; an architecture may define its own version. */
static inline void arch_check_zapped_pmd(struct vm_area_struct *vma,
                                         pmd_t pmd)
{
}
#endif

#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
/*
 * Generic ptep_get_and_clear(): read the PTE at @ptep, clear the slot,
 * report the removed entry to page_table_check and return it.  The read
 * and the clear are two separate steps.
 */
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
                                       unsigned long address,
                                       pte_t *ptep)
{
        pte_t old = ptep_get(ptep);

        pte_clear(mm, address, ptep);
        page_table_check_pte_clear(mm, old);

        return old;
}
#endif

#ifndef clear_young_dirty_ptes
/**
 * clear_young_dirty_ptes - Mark PTEs that map consecutive pages of the
 *                same folio as old/clean.
 * @vma: VMA the pages are mapped into; its vm_mm is the address space used.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to mark old/clean.
 * @flags: Flags to modify the PTE batch semantics.
 *
 * May be overridden by the architecture; otherwise, implemented by
 * get_and_clear/modify/set for each pte in the range.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void clear_young_dirty_ptes(struct vm_area_struct *vma,
                                          unsigned long addr, pte_t *ptep,
                                          unsigned int nr, cydp_t flags)
{
        for (;;) {
                if (flags == CYDP_CLEAR_YOUNG) {
                        /* Only the young bit is requested: use the helper. */
                        ptep_test_and_clear_young(vma, addr, ptep);
                } else {
                        /* Take the entry out, edit the bits, put it back. */
                        pte_t pte = ptep_get_and_clear(vma->vm_mm, addr, ptep);

                        if (flags & CYDP_CLEAR_YOUNG)
                                pte = pte_mkold(pte);
                        if (flags & CYDP_CLEAR_DIRTY)
                                pte = pte_mkclean(pte);
                        set_pte_at(vma->vm_mm, addr, ptep, pte);
                }
                if (--nr == 0)
                        break;
                ptep++;
                addr += PAGE_SIZE;
        }
}
#endif

/* Clear the PTE at @ptep via ptep_get_and_clear(), discarding the old value. */
static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep)
{
        (void)ptep_get_and_clear(mm, addr, ptep);
}

#ifdef CONFIG_GUP_GET_PXX_LOW_HIGH
/*
 * For walking the pagetables without holding any locks.  Some architectures
 * (eg x86-32 PAE) cannot load the entries atomically without using expensive
 * instructions.  We are guaranteed that a PTE will only either go from not
 * present to present, or present to not present -- it will not switch to a
 * completely different present page without a TLB flush in between, which we
 * are blocking by holding interrupts off.
 *
 * Setting ptes from not present to present goes:
 *
 *   ptep->pte_high = h;
 *   smp_wmb();
 *   ptep->pte_low = l;
 *
 * And present to not present goes:
 *
 *   ptep->pte_low = 0;
 *   smp_wmb();
 *   ptep->pte_high = 0;
 *
 * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
 * We load pte_high *after* loading pte_low, which ensures we don't see an older
 * value of pte_high.  *Then* we recheck pte_low, which ensures that we haven't
 * picked up a changed pte high. We might have gotten rubbish values from
 * pte_low and pte_high, but we are guaranteed that pte_low will not have the
 * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
 * operates on present ptes we're safe.
 */
/*
 * Lockless read of a PTE that is stored as separate low and high halves.
 * Implements the load order described in the comment above: low half, high
 * half, then re-check the low half and retry if it changed meanwhile.
 */
static inline pte_t ptep_get_lockless(pte_t *ptep)
{
        pte_t pte;

        do {
                pte.pte_low = ptep->pte_low;
                /* Order the low-half load before the high-half load. */
                smp_rmb();
                pte.pte_high = ptep->pte_high;
                /* Order the high-half load before the low-half re-check. */
                smp_rmb();
        } while (unlikely(pte.pte_low != ptep->pte_low));

        return pte;
}
/* Marks this version as present so the generic fallback below is skipped. */
#define ptep_get_lockless ptep_get_lockless

#if CONFIG_PGTABLE_LEVELS > 2
/*
 * PMD counterpart of the split low/high lockless read: load the low half,
 * then the high half, then re-check the low half and retry if it changed.
 */
static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
{
        pmd_t pmd;

        do {
                pmd.pmd_low = pmdp->pmd_low;
                /* Order the low-half load before the high-half load. */
                smp_rmb();
                pmd.pmd_high = pmdp->pmd_high;
                /* Order the high-half load before the low-half re-check. */
                smp_rmb();
        } while (unlikely(pmd.pmd_low != pmdp->pmd_low));

        return pmd;
}
/* Marks this version as present so the generic fallback below is skipped. */
#define pmdp_get_lockless pmdp_get_lockless
/* With the split read in use, the sync hook maps to tlb_remove_table_sync_one(). */
#define pmdp_get_lockless_sync() tlb_remove_table_sync_one()
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */

/*
 * We require that the PTE can be read atomically.
 */
#ifndef ptep_get_lockless
/* Generic lockless PTE read: nothing beyond a plain ptep_get() is needed. */
static inline pte_t ptep_get_lockless(pte_t *ptep)
{
        pte_t pte = ptep_get(ptep);

        return pte;
}
#endif

#ifndef pmdp_get_lockless
/* Generic lockless PMD read: nothing beyond a plain pmdp_get() is needed. */
static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
{
        pmd_t pmd = pmdp_get(pmdp);

        return pmd;
}
/* Companion of the generic pmdp_get_lockless(): intentionally does nothing. */
static inline void pmdp_get_lockless_sync(void)
{
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
/*
 * Generic pmdp_huge_get_and_clear(): read the PMD at @pmdp, clear the slot,
 * report the removed entry to page_table_check and return it.  @address is
 * not used here.
 */
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        pmd_t old = *pmdp;

        pmd_clear(pmdp);
        page_table_check_pmd_clear(mm, old);
        return old;
}
#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
/*
 * Generic pudp_huge_get_and_clear(): read the PUD at @pudp, clear the slot,
 * report the removed entry to page_table_check and return it.  @address is
 * not used here.
 */
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
                                            unsigned long address,
                                            pud_t *pudp)
{
        pud_t old = *pudp;

        pud_clear(pudp);
        page_table_check_pud_clear(mm, old);
        return old;
}
#endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
/*
 * Generic version: @full is ignored and the work is handed to
 * pmdp_huge_get_and_clear() on the VMA's mm.
 */
static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
                                            unsigned long address, pmd_t *pmdp,
                                            int full)
{
        struct mm_struct *mm = vma->vm_mm;

        return pmdp_huge_get_and_clear(mm, address, pmdp);
}
#endif

#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
/*
 * Generic version: @full is ignored and the work is handed to
 * pudp_huge_get_and_clear() on the VMA's mm.
 */
static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
                                            unsigned long address, pud_t *pudp,
                                            int full)
{
        struct mm_struct *mm = vma->vm_mm;

        return pudp_huge_get_and_clear(mm, address, pudp);
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
/* Generic version: @full is ignored; identical to ptep_get_and_clear(). */
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
                                            unsigned long address, pte_t *ptep,
                                            int full)
{
        pte_t old = ptep_get_and_clear(mm, address, ptep);

        return old;
}
#endif

#ifndef get_and_clear_full_ptes
/**
 * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of
 *                             the same folio, collecting dirty/accessed bits.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to clear.
 * @full: Whether we are clearing a full mm.
 *
 * Unless the architecture overrides it, this walks the range with
 * ptep_get_and_clear_full().  The value returned is the first entry, made
 * dirty and/or young if any later entry in the range was dirty and/or young.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
                unsigned long addr, pte_t *ptep, unsigned int nr, int full)
{
        pte_t first = ptep_get_and_clear_full(mm, addr, ptep, full);

        for (;;) {
                pte_t cur;

                if (--nr == 0)
                        break;

                ptep++;
                addr += PAGE_SIZE;
                cur = ptep_get_and_clear_full(mm, addr, ptep, full);

                /* Fold this entry's dirty/accessed state into the result. */
                if (pte_dirty(cur))
                        first = pte_mkdirty(first);
                if (pte_young(cur))
                        first = pte_mkyoung(first);
        }

        return first;
}
#endif

#ifndef clear_full_ptes
/**
 * clear_full_ptes - Clear present PTEs that map consecutive pages of the same
 *                     folio.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to clear.
 * @full: Whether we are clearing a full mm.
 *
 * Unless the architecture overrides it, this calls ptep_get_and_clear_full()
 * on each entry in turn and discards the returned values.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
                pte_t *ptep, unsigned int nr, int full)
{
        /* The step expression only runs when another entry remains. */
        for (;; ptep++, addr += PAGE_SIZE) {
                ptep_get_and_clear_full(mm, addr, ptep, full);
                if (--nr == 0)
                        break;
        }
}
#endif

/*
 * If two threads concurrently fault at the same page, the thread that
 * won the race updates the PTE and its local TLB/Cache. The other thread
 * gives up, simply does nothing, and continues; on architectures where
 * software can update the TLB, the local TLB can be updated here to avoid the
 * next page fault. This function updates the TLB only; it does nothing with
 * the cache or anything else. That is how it differs from update_mmu_cache().
 */
#ifndef __HAVE_ARCH_UPDATE_MMU_TLB
/* Generic version: does nothing. */
static inline void update_mmu_tlb(struct vm_area_struct *vma,
                                unsigned long address, pte_t *ptep)
{
}
/* Defined so that later code sees update_mmu_tlb() as provided. */
#define __HAVE_ARCH_UPDATE_MMU_TLB
#endif

/*
 * Some architectures may be able to avoid expensive synchronization
 * primitives when modifications are made to PTE's which are already
 * not present, or in the process of an address space destruction.
 */
#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
/* Generic version: @full is ignored and the entry is cleared with pte_clear(). */
static inline void pte_clear_not_present_full(struct mm_struct *mm,
                                              unsigned long address,
                                              pte_t *ptep,
                                              int full)
{
        pte_clear(mm, address, ptep);
}
#endif

#ifndef clear_not_present_full_ptes
/**
 * clear_not_present_full_ptes - Clear multiple not present PTEs which are
 *                                 consecutive in the pgtable.
 * @mm: Address space the ptes represent.
 * @addr: Address of the first pte.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to clear.
 * @full: Whether we are clearing a full mm.
 *
 * Unless the architecture overrides it, this calls
 * pte_clear_not_present_full() on each entry in turn.
 *
 * Context: The caller holds the page table lock.  The PTEs are all not present.
 * The PTEs are all in the same PMD.
 */
static inline void clear_not_present_full_ptes(struct mm_struct *mm,
                unsigned long addr, pte_t *ptep, unsigned int nr, int full)
{
        pte_clear_not_present_full(mm, addr, ptep, full);
        while (--nr) {
                ptep++;
                addr += PAGE_SIZE;
                pte_clear_not_present_full(mm, addr, ptep, full);
        }
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
/*
 * Declaration only; the definition is not part of this header section.
 * Not declared when the architecture defines __HAVE_ARCH_PTEP_CLEAR_FLUSH.
 */
extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pte_t *ptep);
#endif

#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
/*
 * PMD- and PUD-level declarations only; definitions live elsewhere.  Both
 * are skipped when the architecture defines
 * __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH.
 */
extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pmd_t *pmdp);
extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pud_t *pudp);
#endif

#ifndef pte_mkwrite
/* Generic pte_mkwrite(): @vma is ignored; defers to pte_mkwrite_novma(). */
static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
        pte = pte_mkwrite_novma(pte);

        return pte;
}
#endif

#if defined(CONFIG_ARCH_WANT_PMD_MKWRITE) && !defined(pmd_mkwrite)
/* Generic pmd_mkwrite(): @vma is ignored; defers to pmd_mkwrite_novma(). */
static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
        pmd = pmd_mkwrite_novma(pmd);

        return pmd;
}
#endif

#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
struct mm_struct;
/*
 * Generic ptep_set_wrprotect(): read the current PTE, write-protect the
 * value and store it back with set_pte_at().  The read and the store are
 * separate steps.
 */
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
{
        pte_t wrprotected = pte_wrprotect(ptep_get(ptep));

        set_pte_at(mm, address, ptep, wrprotected);
}
#endif

#ifndef wrprotect_ptes
/**
 * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same
 *                    folio.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to write-protect.
 *
 * Unless the architecture overrides it, this calls ptep_set_wrprotect() on
 * each entry in turn.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
                pte_t *ptep, unsigned int nr)
{
        /* The step expression only runs when another entry remains. */
        for (;; ptep++, addr += PAGE_SIZE) {
                ptep_set_wrprotect(mm, addr, ptep);
                if (--nr == 0)
                        break;
        }
}
#endif

/*
 * On some architectures the hardware does not set the page access bit when a
 * memory page is accessed; setting this bit is the responsibility of software.
 * Tracking the access bit this way costs an extra page fault. As an
 * optimization, the access bit can be set during every page fault flow on
 * these architectures. To differentiate it from pte_mkyoung, this helper is
 * used on platforms where software maintains the page access bit.
 */
#ifndef pte_sw_mkyoung
/* Generic version: returns @pte unchanged. */
static inline pte_t pte_sw_mkyoung(pte_t pte)
{
        return pte;
}
/* Self-referential define so later "#ifndef pte_sw_mkyoung" checks see it. */
#define pte_sw_mkyoung        pte_sw_mkyoung
#endif

#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Generic pmdp_set_wrprotect(): read the PMD, write-protect the value and
 * store it back with set_pmd_at().  The read and the store are separate
 * steps.
 */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pmd_t *pmdp)
{
        pmd_t wrprotected = pmd_wrprotect(*pmdp);

        set_pmd_at(mm, address, pmdp, wrprotected);
}
#else
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE placeholder.  BUILD_BUG() is meant to break
 * the build if a call to this stub is not compiled out.
 */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pmd_t *pmdp)
{
        BUILD_BUG();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Generic pudp_set_wrprotect(): read the PUD, write-protect the value and
 * store it back with set_pud_at().  The read and the store are separate
 * steps.
 */
static inline void pudp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pud_t *pudp)
{
        pud_t wrprotected = pud_wrprotect(*pudp);

        set_pud_at(mm, address, pudp, wrprotected);
}
#else
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE placeholder.  BUILD_BUG() is meant to break
 * the build if a call to this stub is not compiled out.
 */
static inline void pudp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pud_t *pudp)
{
        BUILD_BUG();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#endif

#ifndef pmdp_collapse_flush
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* CONFIG_TRANSPARENT_HUGEPAGE builds: declaration only, defined elsewhere. */
extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp);
#else
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE placeholder.  BUILD_BUG() is meant to break
 * the build if a call to this stub is not compiled out; the return statement
 * only satisfies the function's return type.
 */
static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                        unsigned long address,
                                        pmd_t *pmdp)
{
        BUILD_BUG();
        return *pmdp;
}
/* Self-referential define so later "#ifndef pmdp_collapse_flush" checks see it. */
#define pmdp_collapse_flush pmdp_collapse_flush
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
/*
 * Declaration only; the definition is not part of this header section.
 * Not declared when the architecture defines __HAVE_ARCH_PGTABLE_DEPOSIT.
 */
extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                       pgtable_t pgtable);
#endif

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
/*
 * Declaration only; the definition is not part of this header section.
 * Not declared when the architecture defines __HAVE_ARCH_PGTABLE_WITHDRAW.
 */
extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
#endif

#ifndef arch_needs_pgtable_deposit
/* Generic default: evaluates to false unless the architecture overrides it. */
#define arch_needs_pgtable_deposit() (false)
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * A pmdp_establish() implementation that is only suitable for architectures
 * without hardware dirty/accessed bits.  With no CPU setting those bits
 * behind our back there is nothing to race with, so a plain read followed
 * by a plain set is sufficient.  Returns the PMD value that was replaced.
 */
static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
        struct mm_struct *mm = vma->vm_mm;
        pmd_t prev = *pmdp;

        set_pmd_at(mm, address, pmdp, pmd);

        return prev;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE
/*
 * Declaration only; the definition is not part of this header section.
 * Not declared when the architecture defines __HAVE_ARCH_PMDP_INVALIDATE.
 */
extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                            pmd_t *pmdp);
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD

/*
 * pmdp_invalidate_ad() invalidates the PMD while changing a transparent
 * hugepage mapping in the page tables. It is similar to pmdp_invalidate(),
 * but should only be used if software will not clear the access and dirty
 * bits in the new PMD value. The function ensures that hardware updates of
 * the access and dirty bits are not lost.
 *
 * On certain architectures this allows a TLB flush to be avoided in most
 * cases. Another TLB flush might still be necessary later if the PMD update
 * itself requires one (e.g., if protection was set to be stricter). Even
 * when a TLB flush is needed because of the update, the caller may be able
 * to batch these TLB flushing operations, so fewer TLB flush operations are
 * needed.
 */
extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
                                unsigned long address, pmd_t *pmdp);
#endif

#ifndef __HAVE_ARCH_PTE_SAME
/* Returns 1 if the raw values of both PTEs are equal, 0 otherwise. */
static inline int pte_same(pte_t pte_a, pte_t pte_b)
{
        if (pte_val(pte_a) != pte_val(pte_b))
                return 0;

        return 1;
}
#endif

#ifndef __HAVE_ARCH_PTE_UNUSED
/*
 * Some architectures provide facilities to virtualization guests
 * so that they can flag allocated pages as unused. This allows the
 * host to transparently reclaim unused pages. This function returns
 * whether the pte's page is unused.
 */
static inline int pte_unused(pte_t pte)
{
        /* Generic version: never reports a page as unused. */
        return 0;
}
#endif

#ifndef pte_access_permitted
/* True if @pte is present and, for a write access, also writable. */
#define pte_access_permitted(pte, write) \
        (pte_present(pte) && (!(write) || pte_write(pte)))
#endif

#ifndef pmd_access_permitted
/* True if @pmd is present and, for a write access, also writable. */
#define pmd_access_permitted(pmd, write) \
        (pmd_present(pmd) && (!(write) || pmd_write(pmd)))
#endif

#ifndef pud_access_permitted
/* True if @pud is present and, for a write access, also writable. */
#define pud_access_permitted(pud, write) \
        (pud_present(pud) && (!(write) || pud_write(pud)))
#endif

#ifndef p4d_access_permitted
/* True if @p4d is present and, for a write access, also writable. */
#define p4d_access_permitted(p4d, write) \
        (p4d_present(p4d) && (!(write) || p4d_write(p4d)))
#endif

#ifndef pgd_access_permitted
/* True if @pgd is present and, for a write access, also writable. */
#define pgd_access_permitted(pgd, write) \
        (pgd_present(pgd) && (!(write) || pgd_write(pgd)))
#endif

#ifndef __HAVE_ARCH_PMD_SAME
/* Returns 1 if the raw values of both PMDs are equal, 0 otherwise. */
static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
        if (pmd_val(pmd_a) != pmd_val(pmd_b))
                return 0;

        return 1;
}
#endif

#ifndef pud_same
/* Returns 1 if the raw values of both PUDs are equal, 0 otherwise. */
static inline int pud_same(pud_t pud_a, pud_t pud_b)
{
        if (pud_val(pud_a) != pud_val(pud_b))
                return 0;

        return 1;
}
#define pud_same pud_same
#endif

#ifndef __HAVE_ARCH_P4D_SAME
/* Returns 1 if the raw values of both P4Ds are equal, 0 otherwise. */
static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b)
{
        if (p4d_val(p4d_a) != p4d_val(p4d_b))
                return 0;

        return 1;
}
#endif

#ifndef __HAVE_ARCH_PGD_SAME
/* Returns 1 if the raw values of both PGDs are equal, 0 otherwise. */
static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b)
{
        if (pgd_val(pgd_a) != pgd_val(pgd_b))
                return 0;

        return 1;
}
#endif

/*
 * Use set_p*_safe(), and elide TLB flushing, when confident that *no*
 * TLB flush will be required as a result of the "set". For example, use
 * in scenarios where it is known ahead of time that the routine is
 * setting non-present entries, or re-setting an existing entry to the
 * same value. Otherwise, use the typical "set" helpers and flush the
 * TLB.
 */
/*
 * set_pte_safe - set_pte() preceded by a sanity check.
 *
 * WARN_ON_ONCE() triggers if the slot currently holds a present entry that
 * differs from @pte; the entry is then written with set_pte() either way.
 * @ptep is dereferenced as *(ptep) so that pointer expressions such as
 * "base + i" are handled correctly.  Both arguments are evaluated more than
 * once, so they must be free of side effects.
 */
#define set_pte_safe(ptep, pte) \
({ \
        WARN_ON_ONCE(pte_present(*(ptep)) && !pte_same(*(ptep), pte)); \
        set_pte(ptep, pte); \
})

/*
 * set_pmd_safe - set_pmd() preceded by a sanity check.
 *
 * WARN_ON_ONCE() triggers if the slot currently holds a present entry that
 * differs from @pmd; the entry is then written with set_pmd() either way.
 * @pmdp is dereferenced as *(pmdp) so that pointer expressions such as
 * "base + i" are handled correctly.  Both arguments are evaluated more than
 * once, so they must be free of side effects.
 */
#define set_pmd_safe(pmdp, pmd) \
({ \
        WARN_ON_ONCE(pmd_present(*(pmdp)) && !pmd_same(*(pmdp), pmd)); \
        set_pmd(pmdp, pmd); \
})

/*
 * set_pud_safe - set_pud() preceded by a sanity check.
 *
 * WARN_ON_ONCE() triggers if the slot currently holds a present entry that
 * differs from @pud; the entry is then written with set_pud() either way.
 * @pudp is dereferenced as *(pudp) so that pointer expressions such as
 * "base + i" are handled correctly.  Both arguments are evaluated more than
 * once, so they must be free of side effects.
 */
#define set_pud_safe(pudp, pud) \
({ \
        WARN_ON_ONCE(pud_present(*(pudp)) && !pud_same(*(pudp), pud)); \
        set_pud(pudp, pud); \
})

/*
 * set_p4d_safe - set_p4d() preceded by a sanity check.
 *
 * WARN_ON_ONCE() triggers if the slot currently holds a present entry that
 * differs from @p4d; the entry is then written with set_p4d() either way.
 * @p4dp is dereferenced as *(p4dp) so that pointer expressions such as
 * "base + i" are handled correctly.  Both arguments are evaluated more than
 * once, so they must be free of side effects.
 */
#define set_p4d_safe(p4dp, p4d) \
({ \
        WARN_ON_ONCE(p4d_present(*(p4dp)) && !p4d_same(*(p4dp), p4d)); \
        set_p4d(p4dp, p4d); \
})

/*
 * set_pgd_safe - set_pgd() preceded by a sanity check.
 *
 * WARN_ON_ONCE() triggers if the slot currently holds a present entry that
 * differs from @pgd; the entry is then written with set_pgd() either way.
 * @pgdp is dereferenced as *(pgdp) so that pointer expressions such as
 * "base + i" are handled correctly.  Both arguments are evaluated more than
 * once, so they must be free of side effects.
 */
#define set_pgd_safe(pgdp, pgd) \
({ \
        WARN_ON_ONCE(pgd_present(*(pgdp)) && !pgd_same(*(pgdp), pgd)); \
        set_pgd(pgdp, pgd); \
})

#ifndef __HAVE_ARCH_DO_SWAP_PAGE
/*
 * Some architectures support metadata associated with a page. When a
 * page is being swapped out, this metadata must be saved so it can be
 * restored when the page is swapped back in. SPARC M7 and newer
 * processors support an ADI (Application Data Integrity) tag for the
 * page as metadata for the page. arch_do_swap_page() can restore this
 * metadata when a page is swapped back in.
 */
static inline void arch_do_swap_page(struct mm_struct *mm,
                                     struct vm_area_struct *vma,
                                     unsigned long addr,
                                     pte_t pte, pte_t oldpte)
{
        /* Generic version: no metadata to restore, so nothing to do. */

}
#endif

#ifndef __HAVE_ARCH_UNMAP_ONE
/*
 * Some architectures support metadata associated with a page. When a
 * page is being swapped out, this metadata must be saved so it can be
 * restored when the page is swapped back in. SPARC M7 and newer
 * processors support an ADI (Application Data Integrity) tag for the
 * page as metadata for the page. arch_unmap_one() can save this
 * metadata on a swap-out of a page.
 */
static inline int arch_unmap_one(struct mm_struct *mm,
                                  struct vm_area_struct *vma,
                                  unsigned long addr,
                                  pte_t orig_pte)
{
        /* Generic version: no metadata to save; always returns 0. */
        return 0;
}
#endif

/*
 * Allow architectures to preserve additional metadata associated with
 * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function
 * prototypes must be defined in the arch-specific asm/pgtable.h file.
 */
#ifndef __HAVE_ARCH_PREPARE_TO_SWAP
/* Generic version: nothing to prepare; always returns 0. */
static inline int arch_prepare_to_swap(struct folio *folio)
{
        return 0;
}
#endif

#ifndef __HAVE_ARCH_SWAP_INVALIDATE
/* Generic version: no-op. */
static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
{
}

/* Generic version: no-op. */
static inline void arch_swap_invalidate_area(int type)
{
}
#endif

#ifndef __HAVE_ARCH_SWAP_RESTORE
/* Generic version: no-op. */
static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
{
}
#endif

#ifndef __HAVE_ARCH_PGD_OFFSET_GATE
/* Generic default: identical to pgd_offset(). */
#define pgd_offset_gate(mm, addr)        pgd_offset(mm, addr)
#endif

#ifndef __HAVE_ARCH_MOVE_PTE
/* Generic default: yields @pte unchanged; both addresses are ignored. */
#define move_pte(pte, old_addr, new_addr)        (pte)
#endif

#ifndef pte_accessible
/* Generic default: evaluates @pte, discards it, and yields 1; @mm is unused. */
# define pte_accessible(mm, pte)        ((void)(pte), 1)
#endif

#ifndef flush_tlb_fix_spurious_fault
/* Generic default: flush the page with flush_tlb_page(); @ptep is unused. */
#define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address)
#endif

/*
 * When walking page tables, get the address of the next boundary,
 * or the end address of the range if that comes earlier.  Although no
 * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
 */

/*
 * pgd_addr_end - next PGDIR_SIZE-aligned boundary above @addr, or @end if
 * that comes first.  Both sides of the comparison are decremented so that a
 * boundary (or @end) that wrapped to 0 compares as the largest value.
 * @end is evaluated twice.
 */
#define pgd_addr_end(addr, end)                                         \
({                                                                      \
        unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;  \
                                                                        \
        (__boundary - 1 < (end) - 1) ? __boundary : (end);              \
})

#ifndef p4d_addr_end
/*
 * p4d_addr_end - next P4D_SIZE-aligned boundary above @addr, or @end if
 * that comes first.  Both sides of the comparison are decremented so that a
 * boundary (or @end) that wrapped to 0 compares as the largest value.
 * @end is evaluated twice.
 */
#define p4d_addr_end(addr, end)                                         \
({                                                                      \
        unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK;      \
                                                                        \
        (__boundary - 1 < (end) - 1) ? __boundary : (end);              \
})
#endif

#ifndef pud_addr_end
/*
 * pud_addr_end - next PUD_SIZE-aligned boundary above @addr, or @end if
 * that comes first.  Both sides of the comparison are decremented so that a
 * boundary (or @end) that wrapped to 0 compares as the largest value.
 * @end is evaluated twice.
 */
#define pud_addr_end(addr, end)                                         \
({                                                                      \
        unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK;      \
                                                                        \
        (__boundary - 1 < (end) - 1) ? __boundary : (end);              \
})
#endif

#ifndef pmd_addr_end
/*
 * pmd_addr_end - next PMD_SIZE-aligned boundary above @addr, or @end if
 * that comes first.  Both sides of the comparison are decremented so that a
 * boundary (or @end) that wrapped to 0 compares as the largest value.
 * @end is evaluated twice.
 */
#define pmd_addr_end(addr, end)                                         \
({                                                                      \
        unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK;      \
                                                                        \
        (__boundary - 1 < (end) - 1) ? __boundary : (end);              \
})
#endif

/*
 * When walking page tables, we usually want to skip any p?d_none entries;
 * and any p?d_bad entries - reporting the error before resetting to none.
 * Do the tests inline, but report and clear the bad entry in mm/memory.c.
 */
/* Out-of-line handler for a bad PGD entry (declaration only). */
void pgd_clear_bad(pgd_t *);

#ifndef __PAGETABLE_P4D_FOLDED
/* Out-of-line handler for a bad P4D entry (declaration only). */
void p4d_clear_bad(p4d_t *);
#else
/* P4D level is folded: expands to an empty statement. */
#define p4d_clear_bad(p4d)        do { } while (0)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
/* Out-of-line handler for a bad PUD entry (declaration only). */
void pud_clear_bad(pud_t *);
#else
/* PUD level is folded: expands to an empty statement; @pud is not evaluated. */
#define pud_clear_bad(pud)        do { } while (0)
#endif

/* Out-of-line handler for a bad PMD entry (declaration only). */
void pmd_clear_bad(pmd_t *);

/*
 * Returns 1 if the PGD entry should be skipped: either it is none, or it is
 * bad, in which case pgd_clear_bad() is called on it first.  Returns 0 for
 * an entry that is neither.
 */
static inline int pgd_none_or_clear_bad(pgd_t *pgd)
{
        if (pgd_none(*pgd))
                return 1;
        if (likely(!pgd_bad(*pgd)))
                return 0;

        pgd_clear_bad(pgd);
        return 1;
}

/*
 * Returns 1 if the P4D entry should be skipped: either it is none, or it is
 * bad, in which case p4d_clear_bad() is invoked on it first.  Returns 0 for
 * an entry that is neither.
 */
static inline int p4d_none_or_clear_bad(p4d_t *p4d)
{
        if (p4d_none(*p4d))
                return 1;
        if (likely(!p4d_bad(*p4d)))
                return 0;

        p4d_clear_bad(p4d);
        return 1;
}

/*
 * Returns 1 if the PUD entry should be skipped: either it is none, or it is
 * bad, in which case pud_clear_bad() is invoked on it first.  Returns 0 for
 * an entry that is neither.
 */
static inline int pud_none_or_clear_bad(pud_t *pud)
{
        if (pud_none(*pud))
                return 1;
        if (likely(!pud_bad(*pud)))
                return 0;

        pud_clear_bad(pud);
        return 1;
}

/*
 * Returns 1 if the PMD entry should be skipped: either it is none, or it is
 * bad, in which case pmd_clear_bad() is called on it first.  Returns 0 for
 * an entry that is neither.
 */
static inline int pmd_none_or_clear_bad(pmd_t *pmd)
{
        if (pmd_none(*pmd))
                return 1;
        if (likely(!pmd_bad(*pmd)))
                return 0;

        pmd_clear_bad(pmd);
        return 1;
}

/*
 * First half of the generic protection-change transaction: hand back the
 * current PTE while leaving the slot cleared.
 */
static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma,
                                             unsigned long addr,
                                             pte_t *ptep)
{
        struct mm_struct *mm = vma->vm_mm;

        /*
         * Fetch the current pte and zero the slot so that it is
         * non-present, which keeps the hardware from updating it
         * asynchronously.
         */
        return ptep_get_and_clear(mm, addr, ptep);
}

/*
 * Second half of the generic protection-change transaction: install @pte
 * into the slot.
 */
static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
                                             unsigned long addr,
                                             pte_t *ptep, pte_t pte)
{
        struct mm_struct *mm = vma->vm_mm;

        /*
         * The slot is non-present at this point, so there is no
         * hardware state that needs preserving.
         */
        set_pte_at(mm, addr, ptep, pte);
}

#ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
/*
 * Start a pte protection read-modify-write transaction, which
 * protects against asynchronous hardware modifications to the pte.
 * The intention is not to prevent the hardware from making pte
 * updates, but to prevent any updates it may make from being lost.
 *
 * This does not protect against other software modifications of the
 * pte; the appropriate pte lock must be held over the transaction.
 *
 * Note that this interface is intended to be batchable, meaning that
 * ptep_modify_prot_commit may not actually update the pte, but merely
 * queue the update to be done at some later time.  The update must be
 * actually committed before the pte lock is released, however.
 */
static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           pte_t *ptep)
{
        /* Generic version: forward unchanged to __ptep_modify_prot_start(). */
        return __ptep_modify_prot_start(vma, addr, ptep);
}

/*
 * Commit an update to a pte, leaving any hardware-controlled bits in
 * the PTE unmodified.
 */
static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           pte_t *ptep, pte_t old_pte, pte_t pte)
{
        /*
         * Generic version: @old_pte is accepted for interface
         * compatibility but is not used here; only @pte is written.
         */
        __ptep_modify_prot_commit(vma, addr, ptep, pte);
}
#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
#endif /* CONFIG_MMU */

/*
 * No-op macros that just return the current protection value. Defined here
 * because these macros can be used even if CONFIG_MMU is not defined.
 */

/* Fallback: identity, returns @prot unchanged. */
#ifndef pgprot_nx
#define pgprot_nx(prot)        (prot)
#endif

/* Fallback: identity, returns @prot unchanged. */
#ifndef pgprot_noncached
#define pgprot_noncached(prot)        (prot)
#endif

/*
 * The next three fall back to whatever pgprot_noncached is (the
 * architecture's definition if it has one, otherwise the identity above).
 */
#ifndef pgprot_writecombine
#define pgprot_writecombine pgprot_noncached
#endif

#ifndef pgprot_writethrough
#define pgprot_writethrough pgprot_noncached
#endif

#ifndef pgprot_device
#define pgprot_device pgprot_noncached
#endif

/* Fallback: identity, returns @prot unchanged. */
#ifndef pgprot_mhp
#define pgprot_mhp(prot)        (prot)
#endif

#ifdef CONFIG_MMU
#ifndef pgprot_modify
#define pgprot_modify pgprot_modify
/*
 * Generic pgprot_modify(): return @newprot, after carrying over cache
 * attributes that @oldprot already has.
 *
 * For each of noncached / writecombine / device: if applying the
 * attribute to @oldprot leaves its value unchanged, the same attribute is
 * applied to @newprot.  The checks run in that order and each operates on
 * the result of the previous one.
 */
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
        if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot)))
                newprot = pgprot_noncached(newprot);
        if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot)))
                newprot = pgprot_writecombine(newprot);
        if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot)))
                newprot = pgprot_device(newprot);
        return newprot;
}
#endif
#endif /* CONFIG_MMU */

/* Fallbacks: both are the identity when not already defined. */
#ifndef pgprot_encrypted
#define pgprot_encrypted(prot)        (prot)
#endif

#ifndef pgprot_decrypted
#define pgprot_decrypted(prot)        (prot)
#endif

/*
 * A facility to provide batching of the reload of page tables and
 * other process state with the actual context switch code for
 * paravirtualized guests.  By convention, only one of the batched
 * update (lazy) modes (CPU, MMU) should be active at any given time,
 * entry should never be nested, and entry and exits should always be
 * paired.  This is for sanity of maintaining and reasoning about the
 * kernel code.  In this case, the exit (end of the context switch) is
 * in architecture-specific code, and so doesn't need a generic
 * definition.
 */
#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
/* Generic fallback: expands to an empty statement; @prev is not evaluated. */
#define arch_start_context_switch(prev)        do {} while (0)
#endif

/*
 * Soft-dirty fallbacks.
 *
 * All helpers below are no-ops: the query helpers return 0 and the
 * mk/clear helpers return their argument unchanged.
 *
 * - With CONFIG_HAVE_ARCH_SOFT_DIRTY but without
 *   CONFIG_ARCH_ENABLE_THP_MIGRATION, only the pmd swap-entry helpers are
 *   stubbed here.
 * - Without CONFIG_HAVE_ARCH_SOFT_DIRTY, the full pte/pmd set is stubbed.
 */
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        return 0;
}

static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        return pmd;
}
#endif
#else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */
/* Present-entry helpers. */
static inline int pte_soft_dirty(pte_t pte)
{
        return 0;
}

static inline int pmd_soft_dirty(pmd_t pmd)
{
        return 0;
}

static inline pte_t pte_mksoft_dirty(pte_t pte)
{
        return pte;
}

static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
        return pte;
}

static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
        return pmd;
}

/* Swap-entry helpers. */
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
        return pte;
}

static inline int pte_swp_soft_dirty(pte_t pte)
{
        return 0;
}

static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
        return pte;
}

static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        return 0;
}

static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        return pmd;
}
#endif

#ifndef __HAVE_PFNMAP_TRACKING
/*
 * Interfaces that can be used by architecture code to keep track of
 * memory type of pfn mappings specified by the remap_pfn_range,
 * vmf_insert_pfn.
 *
 * Without __HAVE_PFNMAP_TRACKING these are inline stubs: the int-returning
 * ones always return 0 and the void ones do nothing.
 */

/*
 * track_pfn_remap is called when a _new_ pfn mapping is being established
 * by remap_pfn_range() for physical range indicated by pfn and size.
 */
static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
                                  unsigned long pfn, unsigned long addr,
                                  unsigned long size)
{
        return 0;
}

/*
 * track_pfn_insert is called when a _new_ single pfn is established
 * by vmf_insert_pfn().
 */
static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
                                    pfn_t pfn)
{
}

/*
 * track_pfn_copy is called when vma that is covering the pfnmap gets
 * copied through copy_page_range().
 */
static inline int track_pfn_copy(struct vm_area_struct *vma)
{
        return 0;
}

/*
 * untrack_pfn is called while unmapping a pfnmap for a region.
 * untrack can be called for a specific region indicated by pfn and size or
 * can be for the entire vma (in which case pfn, size are zero).
 */
static inline void untrack_pfn(struct vm_area_struct *vma,
                               unsigned long pfn, unsigned long size,
                               bool mm_wr_locked)
{
}

/*
 * untrack_pfn_clear is called while mremapping a pfnmap for a new region
 * or fails to copy pgtable during duplicate vm area.
 */
static inline void untrack_pfn_clear(struct vm_area_struct *vma)
{
}
#else
/* __HAVE_PFNMAP_TRACKING: same interfaces, declared here and defined elsewhere. */
extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
                           unsigned long pfn, unsigned long addr,
                           unsigned long size);
extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
                             pfn_t pfn);
extern int track_pfn_copy(struct vm_area_struct *vma);
extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
                        unsigned long size, bool mm_wr_locked);
extern void untrack_pfn_clear(struct vm_area_struct *vma);
#endif

#ifdef CONFIG_MMU
#ifdef __HAVE_COLOR_ZERO_PAGE
/*
 * Coloured zero pages: @pfn is a zero pfn when its distance above zero_pfn
 * is at most (zero_page_mask >> PAGE_SHIFT).  The subtraction is unsigned,
 * so a pfn below zero_pfn wraps to a large value and fails the test.
 */
static inline int is_zero_pfn(unsigned long pfn)
{
        extern unsigned long zero_pfn;
        unsigned long delta;

        delta = pfn - zero_pfn;
        return delta <= (zero_page_mask >> PAGE_SHIFT);
}

/* Coloured zero pages: the zero pfn is the pfn of ZERO_PAGE(addr). */
#define my_zero_pfn(addr)        page_to_pfn(ZERO_PAGE(addr))

#else
/* Without __HAVE_COLOR_ZERO_PAGE: true only when @pfn equals zero_pfn. */
static inline int is_zero_pfn(unsigned long pfn)
{
        extern unsigned long zero_pfn;
        return pfn == zero_pfn;
}

/* Without __HAVE_COLOR_ZERO_PAGE: returns zero_pfn; @addr is ignored. */
static inline unsigned long my_zero_pfn(unsigned long addr)
{
        extern unsigned long zero_pfn;
        return zero_pfn;
}
#endif
#else
/* !CONFIG_MMU stub: always returns 0. */
static inline int is_zero_pfn(unsigned long pfn)
{
        return 0;
}

/* !CONFIG_MMU stub: always returns 0; @addr is ignored. */
static inline unsigned long my_zero_pfn(unsigned long addr)
{
        return 0;
}
#endif /* CONFIG_MMU */

#ifdef CONFIG_MMU

#ifndef CONFIG_TRANSPARENT_HUGEPAGE
/* Without CONFIG_TRANSPARENT_HUGEPAGE: always returns 0. */
static inline int pmd_trans_huge(pmd_t pmd)
{
        return 0;
}
#ifndef pmd_write
/*
 * Fallback when pmd_write is not otherwise defined: calls BUG(); the
 * return statement only satisfies the function's return type.
 */
static inline int pmd_write(pmd_t pmd)
{
        BUG();
        return 0;
}
#endif /* pmd_write */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifndef pud_write
/*
 * Fallback when pud_write is not otherwise defined: calls BUG(); the
 * return statement only satisfies the function's return type.
 */
static inline int pud_write(pud_t pud)
{
        BUG();
        return 0;
}
#endif /* pud_write */

/*
 * Devmap stubs, used when either CONFIG_ARCH_HAS_PTE_DEVMAP or
 * CONFIG_TRANSPARENT_HUGEPAGE is not set: each always returns 0.
 */
#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline int pmd_devmap(pmd_t pmd)
{
        return 0;
}
static inline int pud_devmap(pud_t pud)
{
        return 0;
}
static inline int pgd_devmap(pgd_t pgd)
{
        return 0;
}
#endif

/*
 * Stub used unless both CONFIG_TRANSPARENT_HUGEPAGE and
 * CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD are set: always returns 0.
 */
#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
        !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline int pud_trans_huge(pud_t pud)
{
        return 0;
}
#endif

/*
 * With transparent PUD hugepages configured, take one READ_ONCE() snapshot
 * of *pud and return 1 if it is none, trans-huge, devmap, or bad (a bad
 * entry is passed to pud_clear_bad() first).  In every other case, and
 * always when PUD hugepages are not configured, return 0.
 */
static inline int pud_trans_unstable(pud_t *pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        pud_t snapshot = READ_ONCE(*pud);

        if (pud_none(snapshot))
                return 1;
        if (pud_trans_huge(snapshot))
                return 1;
        if (pud_devmap(snapshot))
                return 1;

        /* All checks use the snapshot; only the clear touches *pud. */
        if (unlikely(pud_bad(snapshot))) {
                pud_clear_bad(pud);
                return 1;
        }
#endif
        return 0;
}

#ifndef CONFIG_NUMA_BALANCING
/*
 * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is
 * perfectly valid to indicate "no" in that case, which is why our default
 * implementation defaults to "always no".
 *
 * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE
 * page protection due to NUMA hinting. NUMA hinting faults only apply in
 * accessible VMAs.
 *
 * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault,
 * looking at the VMA accessibility is sufficient.
 */
/* !CONFIG_NUMA_BALANCING default: always "no" (0). */
static inline int pte_protnone(pte_t pte)
{
        return 0;
}

/* !CONFIG_NUMA_BALANCING default: always "no" (0). */
static inline int pmd_protnone(pmd_t pmd)
{
        return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

#endif /* CONFIG_MMU */

/*
 * Huge vmap helpers.
 *
 * With CONFIG_HAVE_ARCH_HUGE_VMAP the helpers are declared here and
 * defined elsewhere, except that the p4d set/clear pair becomes an inline
 * stub when the P4D level is folded.  Without it, every helper is an
 * inline stub that returns 0 (or does nothing, for p4d_clear_huge()).
 */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP

#ifndef __PAGETABLE_P4D_FOLDED
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot);
void p4d_clear_huge(p4d_t *p4d);
#else
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline void p4d_clear_huge(p4d_t *p4d) { }
#endif /* !__PAGETABLE_P4D_FOLDED */

int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
int pud_clear_huge(pud_t *pud);
int pmd_clear_huge(pmd_t *pmd);
int p4d_free_pud_page(p4d_t *p4d, unsigned long addr);
int pud_free_pmd_page(pud_t *pud, unsigned long addr);
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
#else        /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline void p4d_clear_huge(p4d_t *p4d) { }
static inline int pud_clear_huge(pud_t *pud)
{
        return 0;
}
static inline int pmd_clear_huge(pmd_t *pmd)
{
        return 0;
}
static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
{
        return 0;
}
static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        return 0;
}
static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        return 0;
}
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMAP */

#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * ARCHes with special requirements for evicting THP backing TLB entries can
 * implement this. Otherwise also, it can help optimize normal TLB flush in
 * THP regime. Stock flush_tlb_range() typically has optimization to nuke the
 * entire TLB if flush span is greater than a threshold, which will
 * likely be true for a single huge page. Thus a single THP flush will
 * invalidate the entire TLB which is not desirable.
 * e.g. see arch/arc: flush_pmd_tlb_range
 *
 * The generic versions below simply forward to flush_tlb_range().
 */
#define flush_pmd_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
#define flush_pud_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
#else
/* Without CONFIG_TRANSPARENT_HUGEPAGE both expand to BUILD_BUG(). */
#define flush_pmd_tlb_range(vma, addr, end)        BUILD_BUG()
#define flush_pud_tlb_range(vma, addr, end)        BUILD_BUG()
#endif
#endif

/* Forward declaration: only a pointer to struct file is needed below. */
struct file;
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
                        unsigned long size, pgprot_t *vma_prot);

/* Empty stub when CONFIG_X86_ESPFIX64 is not set. */
#ifndef CONFIG_X86_ESPFIX64
static inline void init_espfix_bsp(void) { }
#endif

/* Declared here, defined elsewhere; marked __init. */
extern void __init pgtable_cache_init(void);

#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
/* Generic fallback: every (pfn, prot) combination is allowed. */
static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
{
        return true;
}

/* Generic fallback: reports that there is no pfn-modify check. */
static inline bool arch_has_pfn_modify_check(void)
{
        return false;
}
#endif /* !__HAVE_ARCH_PFN_MODIFY_ALLOWED */

/*
 * Architecture PAGE_KERNEL_* fallbacks
 *
 * Some architectures don't define certain PAGE_KERNEL_* flags. This is either
 * because they really don't support them, or the port needs to be updated to
 * reflect the required functionality. Below are a set of relatively safe
 * fallbacks, as best effort, which we can count on in lieu of the architectures
 * not defining them on their own yet.
 */

/* Fallback: read-only kernel mappings use plain PAGE_KERNEL. */
#ifndef PAGE_KERNEL_RO
# define PAGE_KERNEL_RO PAGE_KERNEL
#endif

/* Fallback: executable kernel mappings use plain PAGE_KERNEL. */
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif

/*
 * Page Table Modification bits for pgtbl_mod_mask.
 *
 * These are used by the p?d_alloc_track*() set of functions and in the generic
 * vmalloc/ioremap code to track at which page-table levels entries have been
 * modified. Based on that the code can better decide when vmalloc and ioremap
 * mapping changes need to be synchronized to other page-tables in the system.
 */
/* Bit positions, one per page-table level. */
#define                __PGTBL_PGD_MODIFIED        0
#define                __PGTBL_P4D_MODIFIED        1
#define                __PGTBL_PUD_MODIFIED        2
#define                __PGTBL_PMD_MODIFIED        3
#define                __PGTBL_PTE_MODIFIED        4

/* Corresponding single-bit masks. */
#define                PGTBL_PGD_MODIFIED        BIT(__PGTBL_PGD_MODIFIED)
#define                PGTBL_P4D_MODIFIED        BIT(__PGTBL_P4D_MODIFIED)
#define                PGTBL_PUD_MODIFIED        BIT(__PGTBL_PUD_MODIFIED)
#define                PGTBL_PMD_MODIFIED        BIT(__PGTBL_PMD_MODIFIED)
#define                PGTBL_PTE_MODIFIED        BIT(__PGTBL_PTE_MODIFIED)

/* Page-Table Modification Mask */
typedef unsigned int pgtbl_mod_mask;

#endif /* !__ASSEMBLY__ */

/*
 * Default for MAX_POSSIBLE_PHYSMEM_BITS on non-64-bit builds that have not
 * defined it: 32, unless phys_addr_t is 64-bit, in which case the build
 * fails and the architecture has to provide the value.
 */
#if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
#ifdef CONFIG_PHYS_ADDR_T_64BIT
/*
 * ZSMALLOC needs to know the highest PFN on 32-bit architectures
 * with physical address space extension, but falls back to
 * BITS_PER_LONG otherwise.
 */
#error Missing MAX_POSSIBLE_PHYSMEM_BITS definition
#else
#define MAX_POSSIBLE_PHYSMEM_BITS 32
#endif
#endif

/* Fallback: evaluates IS_BUILTIN() of the THP config option. */
#ifndef has_transparent_hugepage
#define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE)
#endif

/* Fallback: evaluates IS_BUILTIN() of the PUD-level THP config option. */
#ifndef has_transparent_pud_hugepage
#define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
#endif
/*
 * On some architectures it depends on the mm if the p4d/pud or pmd
 * layer of the page table hierarchy is folded or not.
 */
/*
 * Generic versions ignore @mm and test, via __is_defined(), whether the
 * corresponding __PAGETABLE_*_FOLDED macro is defined.
 */
#ifndef mm_p4d_folded
#define mm_p4d_folded(mm)        __is_defined(__PAGETABLE_P4D_FOLDED)
#endif

#ifndef mm_pud_folded
#define mm_pud_folded(mm)        __is_defined(__PAGETABLE_PUD_FOLDED)
#endif

#ifndef mm_pmd_folded
#define mm_pmd_folded(mm)        __is_defined(__PAGETABLE_PMD_FOLDED)
#endif

/*
 * Generic *_offset_lockless(): the first argument (the pointer to the
 * upper-level entry) is unused; the offset is taken from the address of
 * the entry value passed as the second argument.
 */
#ifndef p4d_offset_lockless
#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address)
#endif
#ifndef pud_offset_lockless
#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address)
#endif
#ifndef pmd_offset_lockless
#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address)
#endif

/*
 * pXd_leaf() is the API to check whether a pgtable entry is a huge page
 * mapping.  It should work globally across all archs, without any
 * dependency on CONFIG_* options.  For architectures that do not support
 * huge mappings on specific levels, below fallbacks will be used.
 *
 * A leaf pgtable entry should always imply the following:
 *
 * - It is a "present" entry.  IOW, before using this API, please check it
 *   with pXd_present() first. NOTE: it may not always mean the "present
 *   bit" is set.  For example, PROT_NONE entries are always "present".
 *
 * - It should _never_ be a swap entry of any type.  Above "present" check
 *   should have guarded this, but let's be crystal clear on this.
 *
 * - It should contain a huge PFN, which points to a huge page larger than
 *   PAGE_SIZE of the platform.  The PFN format isn't important here.
 *
 * - It should cover all kinds of huge mappings (e.g., pXd_trans_huge(),
 *   pXd_devmap(), or hugetlb mappings).
 */
/* Fallbacks: a level with no pXd_leaf() definition never reports a leaf. */
#ifndef pgd_leaf
#define pgd_leaf(x)        false
#endif
#ifndef p4d_leaf
#define p4d_leaf(x)        false
#endif
#ifndef pud_leaf
#define pud_leaf(x)        false
#endif
#ifndef pmd_leaf
#define pmd_leaf(x)        false
#endif

/*
 * Fallback leaf sizes: the argument is ignored and the size of one entry
 * at that level is returned (PAGE_SIZE for a pte).
 */
#ifndef pgd_leaf_size
#define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT)
#endif
#ifndef p4d_leaf_size
#define p4d_leaf_size(x) P4D_SIZE
#endif
#ifndef pud_leaf_size
#define pud_leaf_size(x) PUD_SIZE
#endif
#ifndef pmd_leaf_size
#define pmd_leaf_size(x) PMD_SIZE
#endif
#ifndef pte_leaf_size
#define pte_leaf_size(x) PAGE_SIZE
#endif

/*
 * We always define pmd_pfn for all archs as it's used in lots of generic
 * code.  Now it happens too for pud_pfn (and can happen for larger
 * mappings too in the future; we're not there yet).  Instead of defining
 * it for all archs (like pmd_pfn), provide a fallback.
 *
 * Note that returning 0 here means any arch that didn't define this can
 * get severely wrong when it hits a real pud leaf.  It's arch's
 * responsibility to properly define it when a huge pud is possible.
 */
#ifndef pud_pfn
/* Fallback: ignores its argument and evaluates to 0. */
#define pud_pfn(x) 0
#endif

/*
 * Some architectures have MMUs that are configurable or selectable at boot
 * time. These lead to variable PTRS_PER_x. For statically allocated arrays it
 * helps to have a static maximum value.
 */

/* Each maximum defaults to the corresponding PTRS_PER_* value. */
#ifndef MAX_PTRS_PER_PTE
#define MAX_PTRS_PER_PTE PTRS_PER_PTE
#endif

#ifndef MAX_PTRS_PER_PMD
#define MAX_PTRS_PER_PMD PTRS_PER_PMD
#endif

#ifndef MAX_PTRS_PER_PUD
#define MAX_PTRS_PER_PUD PTRS_PER_PUD
#endif

#ifndef MAX_PTRS_PER_P4D
#define MAX_PTRS_PER_P4D PTRS_PER_P4D
#endif

/* description of effects of mapping type and prot in current implementation.
 * this is due to the limited x86 page protection hardware.  The expected
 * behavior is in parens:
 *
 * map_type        prot
 *                PROT_NONE        PROT_READ        PROT_WRITE        PROT_EXEC
 * MAP_SHARED        r: (no) no        r: (yes) yes        r: (no) yes        r: (no) yes
 *                w: (no) no        w: (no) no        w: (yes) yes        w: (no) no
 *                x: (no) no        x: (no) yes        x: (no) yes        x: (yes) yes
 *
 * MAP_PRIVATE        r: (no) no        r: (yes) yes        r: (no) yes        r: (no) yes
 *                w: (no) no        w: (no) no        w: (copy) copy        w: (no) no
 *                x: (no) no        x: (no) yes        x: (no) yes        x: (yes) yes
 *
 * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
 * MAP_PRIVATE (with Enhanced PAN supported):
 *                                                                r: (no) no
 *                                                                w: (no) no
 *                                                                x: (yes) yes
 */
/*
 * Expands to the definition (and EXPORT_SYMBOL) of vm_get_page_prot(),
 * which indexes protection_map[] with the VM_READ, VM_WRITE, VM_EXEC and
 * VM_SHARED bits of vm_flags.  protection_map[] must be visible at the
 * point of expansion.
 */
#define DECLARE_VM_GET_PAGE_PROT                                        \
pgprot_t vm_get_page_prot(unsigned long vm_flags)                        \
{                                                                        \
                return protection_map[vm_flags &                        \
                        (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];        \
}                                                                        \
EXPORT_SYMBOL(vm_get_page_prot);

#endif /* _LINUX_PGTABLE_H */









































    2 
















    2 























    3 




    1 



    2 
    1 



















    2 






    3 




    2 
    2 





    2 
    3 









    3 









































    2 










    1 











    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
// SPDX-License-Identifier: GPL-2.0-only
#include <net/tcp.h>

/* The bandwidth estimator estimates the rate at which the network
 * can currently deliver outbound data packets for this flow. At a high
 * level, it operates by taking a delivery rate sample for each ACK.
 *
 * A rate sample records the rate at which the network delivered packets
 * for this flow, calculated over the time interval between the transmission
 * of a data packet and the acknowledgment of that packet.
 *
 * Specifically, over the interval between each transmit and corresponding ACK,
 * the estimator generates a delivery rate sample. Typically it uses the rate
 * at which packets were acknowledged. However, the approach of using only the
 * acknowledgment rate faces a challenge under the prevalent ACK decimation or
 * compression: packets can temporarily appear to be delivered much quicker
 * than the bottleneck rate. Since it is physically impossible to do that in a
 * sustained fashion, when the estimator notices that the ACK rate is faster
 * than the transmit rate, it uses the latter:
 *
 *    send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
 *    ack_rate  = #pkts_delivered/(last_ack_time - first_ack_time)
 *    bw = min(send_rate, ack_rate)
 *
 * Notice the estimator essentially estimates the goodput, not always the
 * network bottleneck link rate when the sending or receiving is limited by
 * other factors like applications or receiver window limits.  The estimator
 * deliberately avoids using the inter-packet spacing approach because that
 * approach requires a large number of samples and sophisticated filtering.
 *
 * TCP flows can often be application-limited in request/response workloads.
 * The estimator marks a bandwidth sample as application-limited if there
 * was some moment during the sampled window of packets when there was no data
 * ready to send in the write queue.
 */

/* Snapshot the current delivery information in the skb, to generate
 * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
 */
void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);

         /* In general we need to start delivery rate samples from the
          * time we received the most recent ACK, to ensure we include
          * the full time the network needs to deliver all in-flight
          * packets. If there are no packets in flight yet, then we
          * know that any ACKs after now indicate that the network was
          * able to deliver those packets completely in the sampling
          * interval between now and the next ACK.
          *
          * Note that we use packets_out instead of tcp_packets_in_flight(tp)
          * because the latter is a guess based on RTO and loss-marking
          * heuristics. We don't want spurious RTOs or loss markings to cause
          * a spuriously small time interval, causing a spuriously high
          * bandwidth estimate.
          */
        if (!tp->packets_out) {
                u64 tstamp_us = tcp_skb_timestamp_us(skb);

                tp->first_tx_mstamp  = tstamp_us;
                tp->delivered_mstamp = tstamp_us;
        }

        TCP_SKB_CB(skb)->tx.first_tx_mstamp        = tp->first_tx_mstamp;
        TCP_SKB_CB(skb)->tx.delivered_mstamp        = tp->delivered_mstamp;
        TCP_SKB_CB(skb)->tx.delivered                = tp->delivered;
        TCP_SKB_CB(skb)->tx.delivered_ce        = tp->delivered_ce;
        TCP_SKB_CB(skb)->tx.is_app_limited        = tp->app_limited ? 1 : 0;
}

/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
 * delivery information when the skb was last transmitted.
 *
 * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
 * called multiple times. We favor the information from the most recently
 * sent skb, i.e., the skb with the most recently sent time and the highest
 * sequence.
 */
void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
                            struct rate_sample *rs)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
        u64 tx_tstamp;

        if (!scb->tx.delivered_mstamp)
                return;

        tx_tstamp = tcp_skb_timestamp_us(skb);
        if (!rs->prior_delivered ||
            tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
                               scb->end_seq, rs->last_end_seq)) {
                rs->prior_delivered_ce  = scb->tx.delivered_ce;
                rs->prior_delivered  = scb->tx.delivered;
                rs->prior_mstamp     = scb->tx.delivered_mstamp;
                rs->is_app_limited   = scb->tx.is_app_limited;
                rs->is_retrans             = scb->sacked & TCPCB_RETRANS;
                rs->last_end_seq     = scb->end_seq;

                /* Record send time of most recently ACKed packet: */
                tp->first_tx_mstamp  = tx_tstamp;
                /* Find the duration of the "send phase" of this window: */
                rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
                                                     scb->tx.first_tx_mstamp);

        }
        /* Mark off the skb delivered once it's sacked to avoid being
         * used again when it's cumulatively acked. For acked packets
         * we don't need to reset since it'll be freed soon.
         */
        if (scb->sacked & TCPCB_SACKED_ACKED)
                scb->tx.delivered_mstamp = 0;
}

/* Update the connection delivery information and generate a rate sample.
 *
 * @delivered:     packets newly (s)acked; stored in rs->acked_sacked and,
 *                 when non-zero, refreshes tp->delivered_mstamp
 * @lost:          packets newly marked lost; stored in rs->losses
 * @is_sack_reneg: when true the sample is marked invalid
 * @rs:            sample to complete; on entry rs->interval_us is read as
 *                 the send-phase duration
 *
 * An invalid sample is reported as rs->delivered == -1 and
 * rs->interval_us == -1. A sample whose interval is shorter than the
 * connection's min RTT only has rs->interval_us set to -1.
 */
void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
                  bool is_sack_reneg, struct rate_sample *rs)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 snd_us, ack_us;

        /* Clear app limited if bubble is acked and gone. */
        if (tp->app_limited && after(tp->delivered, tp->app_limited))
                tp->app_limited = 0;

        /* TODO: there are multiple places throughout tcp_ack() to get
         * current time. Refactor the code using a new "tcp_acktag_state"
         * to carry current time, flags, stats like "tcp_sacktag_state".
         */
        if (delivered)
                tp->delivered_mstamp = tp->tcp_mstamp;

        rs->acked_sacked = delivered;        /* freshly ACKed or SACKed */
        rs->losses = lost;                /* freshly marked lost */
        /* Return an invalid sample if no timing information is available or
         * in recovery from loss with SACK reneging. Rate samples taken during
         * a SACK reneging event may overestimate bw by including packets that
         * were SACKed before the reneg.
         */
        if (!rs->prior_mstamp || is_sack_reneg) {
                rs->delivered = -1;
                rs->interval_us = -1;
                return;
        }
        rs->delivered   = tp->delivered - rs->prior_delivered;

        rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
        /* delivered_ce occupies less than 32 bits in the skb control block */
        rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;

        /* Model sending data and receiving ACKs as separate pipeline phases
         * for a window. Usually the ACK phase is longer, but with ACK
         * compression the send phase can be longer. To be safe we use the
         * longer phase.
         */
        snd_us = rs->interval_us;                                /* send phase */
        ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
                                    rs->prior_mstamp); /* ack phase */
        rs->interval_us = max(snd_us, ack_us);

        /* Record both segment send and ack receive intervals */
        rs->snd_interval_us = snd_us;
        rs->rcv_interval_us = ack_us;

        /* Normally we expect interval_us >= min-rtt.
         * Note that rate may still be over-estimated when a spuriously
         * retransmitted skb was first (s)acked because "interval_us"
         * is under-estimated (up to an RTT). However continuously
         * measuring the delivery rate during loss recovery is crucial
         * for connections that suffer heavy or prolonged losses.
         */
        if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
                if (!rs->is_retrans)
                        pr_debug("tcp rate: %ld %d %u %u %u\n",
                                 rs->interval_us, rs->delivered,
                                 inet_csk(sk)->icsk_ca_state,
                                 tp->rx_opt.sack_ok, tcp_min_rtt(tp));
                rs->interval_us = -1;
                return;
        }

        /* Record the last non-app-limited or the highest app-limited bw.
         * The two rates are compared by cross-multiplying in 64 bits, so
         * no division is needed.
         */
        if (!rs->is_app_limited ||
            ((u64)rs->delivered * tp->rate_interval_us >=
             (u64)tp->rate_delivered * rs->interval_us)) {
                tp->rate_delivered = rs->delivered;
                tp->rate_interval_us = rs->interval_us;
                tp->rate_app_limited = rs->is_app_limited;
        }
}

/* If a gap is detected between sends, mark the socket application-limited. */
void tcp_rate_check_app_limited(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (/* We have less than one packet to send. */
            tp->write_seq - tp->snd_nxt < tp->mss_cache &&
            /* Nothing in sending host's qdisc queues or NIC tx queue. */
            sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
            /* We are not limited by CWND. */
            tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) &&
            /* All lost packets have been retransmitted. */
            tp->lost_out <= tp->retrans_out)
                tp->app_limited =
                        (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
}
EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 



    3 





































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
// SPDX-License-Identifier: GPL-2.0+
/*
 * User-space Probes (UProbes)
 *
 * Copyright (C) IBM Corporation, 2008-2012
 * Authors:
 *        Srikar Dronamraju
 *        Jim Keniston
 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
 */

#include <linux/kernel.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>        /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/export.h>
#include <linux/rmap.h>                /* anon_vma_prepare */
#include <linux/mmu_notifier.h>
#include <linux/swap.h>                /* folio_free_swap */
#include <linux/ptrace.h>        /* user_enable_single_step */
#include <linux/kdebug.h>        /* notifier mechanism */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>

#include <linux/uprobes.h>

/* Number of UPROBE_XOL_SLOT_BYTES-sized instruction slots that fit in one page. */
#define UINSNS_PER_PAGE                        (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
/* Upper bound on execute-out-of-line slots: exactly one page worth. */
#define MAX_UPROBE_XOL_SLOTS                UINSNS_PER_PAGE

/* Red-black tree of uprobes, ordered by (inode, offset); see uprobe_cmp(). */
static struct rb_root uprobes_tree = RB_ROOT;
/*
 * True while uprobes_tree is empty.
 * allows us to skip the uprobe_mmap if there are no uprobe events active
 * at this time.  Probably a fine grained per inode count is better?
 */
#define no_uprobe_events()        RB_EMPTY_ROOT(&uprobes_tree)

static DEFINE_RWLOCK(uprobes_treelock);        /* serialize rbtree access */

/* Number of mutexes in the uprobes_mmap_mutex pool. */
#define UPROBES_HASH_SZ        13
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
/*
 * Pick the pool mutex for @v: @v is cast to unsigned long and reduced
 * modulo UPROBES_HASH_SZ, so distinct values may map to the same mutex.
 */
#define uprobes_mmap_hash(v)        (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])

/*
 * NOTE(review): not referenced in this part of the file; going by the name
 * it presumably synchronizes against dup_mmap() during fork -- confirm at
 * the users.
 */
DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);

/*
 * Have a copy of original instruction
 * NOTE(review): presumably a bit number within uprobe->flags -- confirm at
 * the users.
 */
#define UPROBE_COPY_INSN        0

/*
 * One probe point, identified by (inode, offset) and linked into
 * uprobes_tree under that key.  Lifetime is reference counted through
 * get_uprobe()/put_uprobe(); the final put frees the structure.
 */
struct uprobe {
        struct rb_node                rb_node;        /* node in the rb tree */
        refcount_t                ref;                /* see get_uprobe()/put_uprobe() */
        struct rw_semaphore        register_rwsem;
        struct rw_semaphore        consumer_rwsem;
        struct list_head        pending_list;        /* serialized by uprobes_mmap_mutex[] */
        struct uprobe_consumer        *consumers;
        struct inode                *inode;                /* Also hold a ref to inode */
        loff_t                        offset;                /* file offset of the probed instruction */
        /*
         * File offset of the reference counter adjusted by update_ref_ctr();
         * 0 means this uprobe has no reference counter.
         */
        loff_t                        ref_ctr_offset;
        unsigned long                flags;

        /*
         * The generic code assumes that it has two members of unknown type
         * owned by the arch-specific code:
         *
         *         insn -        copy_insn() saves the original instruction here for
         *                arch_uprobe_analyze_insn().
         *
         *        ixol -        potentially modified instruction to execute out of
         *                line, copied to xol_area by xol_get_insn_slot().
         */
        struct arch_uprobe        arch;
};

/*
 * A (uprobe, mm) pair recorded by update_ref_ctr(): an entry is queued when
 * an increment is requested but no vma in @mm currently maps the reference
 * counter, and matching entries are dropped again on decrement (or from
 * put_uprobe() when the uprobe itself goes away).
 */
struct delayed_uprobe {
        struct list_head list;                /* link in delayed_uprobe_list */
        struct uprobe *uprobe;
        struct mm_struct *mm;
};

/* delayed_uprobe_lock is held around every access to delayed_uprobe_list. */
static DEFINE_MUTEX(delayed_uprobe_lock);
static LIST_HEAD(delayed_uprobe_list);

/*
 * Execute out of line area: anonymous executable mapping installed
 * by the probed task to execute the copy of the original instruction
 * mangled by set_swbp().
 *
 * On a breakpoint hit, thread contests for a slot.  It frees the
 * slot after singlestep. Currently a fixed number of slots are
 * allocated.
 */
struct xol_area {
        wait_queue_head_t                 wq;                /* if all slots are busy */
        atomic_t                         slot_count;        /* number of in-use slots */
        unsigned long                         *bitmap;        /* 0 = free slot */

        /*
         * NOTE(review): xol_mapping and pages[] are not used in this part of
         * the file; presumably they describe the special mapping backing the
         * slot page -- confirm against the code that creates the area.
         */
        struct vm_special_mapping        xol_mapping;
        struct page                         *pages[2];
        /*
         * We keep the vma's vm_start rather than a pointer to the vma
         * itself.  The probed process or a naughty kernel module could make
         * the vma go away, and we must handle that reasonably gracefully.
         */
        unsigned long                         vaddr;                /* Page(s) of instruction slots */
};

/*
 * valid_vma: check whether @vma is a mapping uprobes may operate on.
 *
 * The vma must be file-backed and have VM_MAYEXEC set, with neither
 * VM_HUGETLB nor VM_MAYSHARE set.
 *        - is_register: true in register context, where VM_WRITE must
 *          be clear as well.  The check is relaxed while unregistering
 *          because vm_flags might have changed after the breakpoint was
 *          inserted.
 *        - Returns true if @vma passes these checks, false otherwise.
 */
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
        vm_flags_t mask = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;

        if (!vma->vm_file)
                return false;

        if (is_register)
                mask |= VM_WRITE;

        /* Of all the bits in the mask, only VM_MAYEXEC may be set. */
        return (vma->vm_flags & mask) == VM_MAYEXEC;
}

/*
 * Translate a file @offset into the user virtual address at which @vma
 * maps it.  No range check is done; the caller decides whether the result
 * lies inside [vm_start, vm_end).
 */
static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
{
        /* File offset that corresponds to vma->vm_start. */
        loff_t vma_file_start = (loff_t)vma->vm_pgoff << PAGE_SHIFT;

        return vma->vm_start + offset - vma_file_start;
}

static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
{
        return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
}

/**
 * __replace_page - replace page in vma by new page.
 * based on replace_page in mm/ksm.c
 *
 * @vma:      vma that holds the pte pointing to page
 * @addr:     address the old @page is mapped at
 * @old_page: the page we are replacing by new_page
 * @new_page: the modified page we replace page by
 *
 * If @new_page is NULL, only unmap @old_page.
 *
 * Returns 0 on success, negative error code otherwise.
 */
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
                                struct page *old_page, struct page *new_page)
{
        struct folio *old_folio = page_folio(old_page);
        struct folio *new_folio;
        struct mm_struct *mm = vma->vm_mm;
        DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
        int err;
        struct mmu_notifier_range range;

        /* The range that changes is the single page [addr, addr + PAGE_SIZE). */
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
                                addr + PAGE_SIZE);

        if (new_page) {
                new_folio = page_folio(new_page);
                /*
                 * Charge the replacement to mm's memory cgroup first; on
                 * failure we return before anything has been modified.
                 */
                err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL);
                if (err)
                        return err;
        }

        /* For folio_free_swap() below */
        folio_lock(old_folio);

        mmu_notifier_invalidate_range_start(&range);
        /*
         * If old_folio is not (or no longer) mapped at addr in this vma,
         * report -EAGAIN; uprobe_write_opcode() retries on that value.
         */
        err = -EAGAIN;
        if (!page_vma_mapped_walk(&pvmw))
                goto unlock;
        VM_BUG_ON_PAGE(addr != pvmw.address, old_page);

        if (new_page) {
                /* Take a reference on the new folio, then add its rmap and put it on the LRU. */
                folio_get(new_folio);
                folio_add_new_anon_rmap(new_folio, vma, addr);
                folio_add_lru_vma(new_folio, vma);
        } else
                /* no new page, just dec_mm_counter for old_page */
                dec_mm_counter(mm, MM_ANONPAGES);

        /*
         * old_folio is not anonymous: move one page of accounting from its
         * file counter to MM_ANONPAGES.
         */
        if (!folio_test_anon(old_folio)) {
                dec_mm_counter(mm, mm_counter_file(old_folio));
                inc_mm_counter(mm, MM_ANONPAGES);
        }

        /* Clear and flush the old PTE, then install one for new_page if there is one. */
        flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte)));
        ptep_clear_flush(vma, addr, pvmw.pte);
        if (new_page)
                set_pte_at(mm, addr, pvmw.pte,
                           mk_pte(new_page, vma->vm_page_prot));

        folio_remove_rmap_pte(old_folio, old_page, vma);
        /* Nothing maps the old folio any more: release its swap space too. */
        if (!folio_mapped(old_folio))
                folio_free_swap(old_folio);
        page_vma_mapped_walk_done(&pvmw);
        /*
         * NOTE(review): presumably drops the reference that belonged to the
         * mapping just removed; the caller separately puts its own reference
         * on old_page -- confirm.
         */
        folio_put(old_folio);

        err = 0;
 unlock:
        mmu_notifier_invalidate_range_end(&range);
        folio_unlock(old_folio);
        return err;
}

/**
 * is_swbp_insn - test whether an opcode is the uprobe breakpoint.
 * @insn: pointer to the opcode to test.
 *
 * Default (weak) implementation: compares the opcode against
 * UPROBE_SWBP_INSN.
 * Returns true if @insn is a breakpoint instruction.
 */
bool __weak is_swbp_insn(uprobe_opcode_t *insn)
{
        const uprobe_opcode_t opcode = *insn;

        return opcode == UPROBE_SWBP_INSN;
}

/**
 * is_trap_insn - check if instruction is a trap instruction.
 * @insn: instruction to be checked.
 *
 * Default (weak) implementation of is_trap_insn: it simply defers to
 * is_swbp_insn(), i.e. the only trap recognized is the uprobe breakpoint.
 * Returns true if @insn is a breakpoint instruction.
 *
 * This function is needed for the case where an architecture has multiple
 * trap instructions (like powerpc); such an architecture can provide its
 * own definition in place of this weak one.
 */
bool __weak is_trap_insn(uprobe_opcode_t *insn)
{
        return is_swbp_insn(insn);
}

/*
 * Copy @len bytes out of @page into @dst, starting at the in-page offset
 * of @vaddr.  The page is temporarily mapped for the duration of the copy.
 */
static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
        unsigned long page_off = vaddr & ~PAGE_MASK;
        void *mapped = kmap_atomic(page);

        memcpy(dst, mapped + page_off, len);
        kunmap_atomic(mapped);
}

/*
 * Copy @len bytes from @src into @page, starting at the in-page offset of
 * @vaddr.  The page is temporarily mapped for the duration of the copy.
 */
static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
{
        unsigned long page_off = vaddr & ~PAGE_MASK;
        void *mapped = kmap_atomic(page);

        memcpy(mapped + page_off, src, len);
        kunmap_atomic(mapped);
}

/*
 * Decide whether writing @new_opcode at @vaddr in @page would change
 * anything.
 *
 * Returns 0 when the page is already in the requested state (breakpoint
 * wanted and present, or original wanted and no breakpoint present), and
 * 1 when the opcode actually has to be written.
 */
static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
{
        uprobe_opcode_t cur_opcode;
        bool cur_is_swbp, want_swbp;

        /*
         * Note: only UPROBE_SWBP_INSN is looked for in the current opcode.
         * Other 'trap variants' (e.g. the conditional traps powerpc
         * supports) are deliberately not considered: we do not care if
         * the underlying instruction is one of them, uprobes always wins
         * over any other (gdb) breakpoint.
         */
        copy_from_page(page, vaddr, &cur_opcode, UPROBE_SWBP_INSN_SIZE);
        cur_is_swbp = is_swbp_insn(&cur_opcode);
        want_swbp = is_swbp_insn(new_opcode);

        /*
         * register:   breakpoint already installed?
         * unregister: breakpoint already gone (not changed by us)?
         */
        if (cur_is_swbp == want_swbp)
                return 0;

        return 1;
}

static struct delayed_uprobe *
delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
{
        struct delayed_uprobe *du;

        list_for_each_entry(du, &delayed_uprobe_list, list)
                if (du->uprobe == uprobe && du->mm == mm)
                        return du;
        return NULL;
}

static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
{
        struct delayed_uprobe *du;

        if (delayed_uprobe_check(uprobe, mm))
                return 0;

        du  = kzalloc(sizeof(*du), GFP_KERNEL);
        if (!du)
                return -ENOMEM;

        du->uprobe = uprobe;
        du->mm = mm;
        list_add(&du->list, &delayed_uprobe_list);
        return 0;
}

/*
 * Unlink @du from the delayed list and free it.  A NULL @du triggers a
 * warning and is otherwise ignored.
 */
static void delayed_uprobe_delete(struct delayed_uprobe *du)
{
        if (!WARN_ON(!du)) {
                list_del(&du->list);
                kfree(du);
        }
}

static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
{
        struct list_head *pos, *q;
        struct delayed_uprobe *du;

        if (!uprobe && !mm)
                return;

        list_for_each_safe(pos, q, &delayed_uprobe_list) {
                du = list_entry(pos, struct delayed_uprobe, list);

                if (uprobe && du->uprobe != uprobe)
                        continue;
                if (mm && du->mm != mm)
                        continue;

                delayed_uprobe_delete(du);
        }
}

static bool valid_ref_ctr_vma(struct uprobe *uprobe,
                              struct vm_area_struct *vma)
{
        unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset);

        return uprobe->ref_ctr_offset &&
                vma->vm_file &&
                file_inode(vma->vm_file) == uprobe->inode &&
                (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
                vma->vm_start <= vaddr &&
                vma->vm_end > vaddr;
}

static struct vm_area_struct *
find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
{
        VMA_ITERATOR(vmi, mm, 0);
        struct vm_area_struct *tmp;

        for_each_vma(vmi, tmp)
                if (valid_ref_ctr_vma(uprobe, tmp))
                        return tmp;

        return NULL;
}

/*
 * Add @d to the short-sized reference counter located at user address
 * @vaddr in @mm.
 *
 * Returns 0 on success, -EINVAL if @vaddr or @d is zero or if the update
 * would make the counter negative, -EBUSY if the page could not be
 * obtained, or the error reported by get_user_pages_remote().
 */
static int
__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
{
        struct page *page;
        void *kaddr;
        short *counter;
        int ret;

        if (!vaddr || !d)
                return -EINVAL;

        ret = get_user_pages_remote(mm, vaddr, 1, FOLL_WRITE, &page, NULL);
        if (unlikely(ret <= 0)) {
                /*
                 * Exactly one page was requested.  A return value of 0
                 * means we did not get it, which has to be reported as an
                 * error as well.
                 */
                return ret == 0 ? -EBUSY : ret;
        }

        kaddr = kmap_atomic(page);
        counter = kaddr + (vaddr & ~PAGE_MASK);

        if (likely(*counter + d >= 0)) {
                *counter += d;
                ret = 0;
        } else {
                pr_warn("ref_ctr going negative. vaddr: 0x%lx, curr val: %d, delta: %d\n",
                        vaddr, *counter, d);
                ret = -EINVAL;
        }

        kunmap_atomic(kaddr);
        put_page(page);
        return ret;
}

/*
 * Log a warning that adjusting @uprobe's reference counter in @mm failed;
 * the sign of @d selects "increment" or "decrement" in the message.
 */
static void update_ref_ctr_warn(struct uprobe *uprobe,
                                struct mm_struct *mm, short d)
{
        const char *direction = d > 0 ? "increment" : "decrement";
        unsigned long long probe_off = uprobe->offset;
        unsigned long long ctr_off = uprobe->ref_ctr_offset;

        pr_warn("ref_ctr %s failed for inode: 0x%lx offset: 0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n",
                direction, uprobe->inode->i_ino, probe_off, ctr_off, mm);
}

/*
 * Apply delta @d to @uprobe's reference counter in @mm.
 *
 * If a vma mapping the counter exists, the counter is updated right away
 * (a failure is logged).  For an increment that is the end of it.
 * Otherwise the delayed list is adjusted under delayed_uprobe_lock: an
 * increment without a suitable vma is queued, and a decrement drops any
 * queued entry for this (uprobe, mm) pair.
 *
 * Returns 0 on success or a negative errno.
 */
static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
                          short d)
{
        struct vm_area_struct *rc_vma = find_ref_ctr_vma(uprobe, mm);
        int ret = 0;

        if (rc_vma) {
                unsigned long rc_vaddr;

                rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
                ret = __update_ref_ctr(mm, rc_vaddr, d);
                if (ret)
                        update_ref_ctr_warn(uprobe, mm, d);

                /* An increment that found its vma has nothing to queue. */
                if (d > 0)
                        return ret;
        }

        mutex_lock(&delayed_uprobe_lock);
        if (d <= 0)
                delayed_uprobe_remove(uprobe, mm);
        else
                ret = delayed_uprobe_add(uprobe, mm);
        mutex_unlock(&delayed_uprobe_lock);

        return ret;
}

/*
 * NOTE:
 * Expect the breakpoint instruction to be the smallest size instruction for
 * the architecture. If an arch has variable length instruction and the
 * breakpoint instruction is not of the smallest length instruction
 * supported by that architecture then we need to modify is_trap_at_addr and
 * uprobe_write_opcode accordingly. This would never be a problem for archs
 * that have fixed length instructions.
 *
 * uprobe_write_opcode - write the opcode at a given virtual address.
 * @auprobe: arch specific probepoint information.
 * @mm: the probed process address space.
 * @vaddr: the virtual address to store the opcode.
 * @opcode: opcode to be written at @vaddr.
 *
 * Called with mm->mmap_lock held for write.
 * Return 0 (success) or a negative errno.
 */
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
                        unsigned long vaddr, uprobe_opcode_t opcode)
{
        struct uprobe *uprobe;
        struct page *old_page, *new_page;
        struct vm_area_struct *vma;
        int ret, is_register, ref_ctr_updated = 0;
        bool orig_page_huge = false;
        unsigned int gup_flags = FOLL_FORCE;

        /*
         * Writing the breakpoint opcode is treated as "register"; writing
         * any other opcode is treated as "unregister" (restore).
         */
        is_register = is_swbp_insn(&opcode);
        uprobe = container_of(auprobe, struct uprobe, arch);

retry:
        if (is_register)
                gup_flags |= FOLL_SPLIT_PMD;
        /* Read the page with vaddr into memory */
        old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
        if (IS_ERR(old_page))
                return PTR_ERR(old_page);

        /*
         * 0 means the page already holds the requested kind of instruction:
         * nothing to write, and 0 is returned as success.
         */
        ret = verify_opcode(old_page, vaddr, &opcode);
        if (ret <= 0)
                goto put_old;

        if (WARN(!is_register && PageCompound(old_page),
                 "uprobe unregister should never work on compound page\n")) {
                ret = -EINVAL;
                goto put_old;
        }

        /* We are going to replace instruction, update ref_ctr. */
        /* ref_ctr_updated keeps a retry from adjusting the counter twice. */
        if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
                ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
                if (ret)
                        goto put_old;

                ref_ctr_updated = 1;
        }

        /* Unregister on a page that is not anonymous: succeed without replacing it. */
        ret = 0;
        if (!is_register && !PageAnon(old_page))
                goto put_old;

        ret = anon_vma_prepare(vma);
        if (ret)
                goto put_old;

        ret = -ENOMEM;
        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
        if (!new_page)
                goto put_old;

        /* Build the replacement: a copy of the old page with @opcode patched in. */
        __SetPageUptodate(new_page);
        copy_highpage(new_page, old_page);
        copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);

        if (!is_register) {
                struct page *orig_page;
                pgoff_t index;

                VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);

                /* Look up the page-cache page of the backing file at this offset. */
                index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
                orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
                                          index);

                if (orig_page) {
                        /*
                         * The restored contents equal the up-to-date file
                         * page: no private copy is needed, so drop new_page
                         * and let __replace_page() only unmap old_page.
                         */
                        if (PageUptodate(orig_page) &&
                            pages_identical(new_page, orig_page)) {
                                /* let go new_page */
                                put_page(new_page);
                                new_page = NULL;

                                if (PageCompound(orig_page))
                                        orig_page_huge = true;
                        }
                        put_page(orig_page);
                }
        }

        ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
        /* Drop our allocation reference on new_page (if it was not let go above). */
        if (new_page)
                put_page(new_page);
put_old:
        put_page(old_page);

        /* -EAGAIN from __replace_page(): old_page was not mapped at vaddr, start over. */
        if (unlikely(ret == -EAGAIN))
                goto retry;

        /* Revert back reference counter if instruction update failed. */
        if (ret && is_register && ref_ctr_updated)
                update_ref_ctr(uprobe, mm, -1);

        /* try collapse pmd for compound page */
        if (!ret && orig_page_huge)
                collapse_pte_mapped_thp(mm, vaddr, false);

        return ret;
}

/**
 * set_swbp - store breakpoint at a given address.
 * @auprobe: arch specific probepoint information.
 * @mm: the probed process address space.
 * @vaddr: the virtual address to insert the opcode.
 *
 * For mm @mm, store the breakpoint instruction at @vaddr.
 * This weak default writes UPROBE_SWBP_INSN through uprobe_write_opcode();
 * an architecture can provide its own definition in its place.
 * Return 0 (success) or a negative errno.
 */
int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
        return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
}

/**
 * set_orig_insn - Restore the original instruction.
 * @auprobe: arch specific probepoint information.
 * @mm: the probed process address space.
 * @vaddr: the virtual address to insert the opcode.
 *
 * For mm @mm, restore the original opcode at @vaddr.  The opcode written
 * is the first uprobe_opcode_t stored at @auprobe->insn.
 * This weak default goes through uprobe_write_opcode(); an architecture
 * can provide its own definition in its place.
 * Return 0 (success) or a negative errno.
 */
int __weak
set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
        return uprobe_write_opcode(auprobe, mm, vaddr,
                        *(uprobe_opcode_t *)&auprobe->insn);
}

/*
 * Take an additional reference on @uprobe and return it, so the call can
 * be used directly in a return expression.  Paired with put_uprobe().
 */
static struct uprobe *get_uprobe(struct uprobe *uprobe)
{
        refcount_inc(&uprobe->ref);
        return uprobe;
}

/*
 * Drop one reference on @uprobe.  When the last reference goes away, any
 * delayed entries still naming the uprobe are removed and the uprobe is
 * freed.
 */
static void put_uprobe(struct uprobe *uprobe)
{
        if (!refcount_dec_and_test(&uprobe->ref))
                return;

        /*
         * The application may munmap(exec_vma) before uprobe_unregister()
         * is called; remove_breakpoint() then never gets the chance to
         * take the uprobe off delayed_uprobe_list, so do it here.
         */
        mutex_lock(&delayed_uprobe_lock);
        delayed_uprobe_remove(uprobe, NULL);
        mutex_unlock(&delayed_uprobe_lock);

        kfree(uprobe);
}

/*
 * Three-way comparison of the key (@l_inode, @l_offset) against uprobe @r.
 * The inode pointer is the major key and the offset the minor key.
 * Returns -1, 0 or 1 when the key sorts before, equal to or after @r.
 */
static __always_inline
int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset,
               const struct uprobe *r)
{
        if (l_inode != r->inode)
                return l_inode < r->inode ? -1 : 1;

        if (l_offset != r->offset)
                return l_offset < r->offset ? -1 : 1;

        return 0;
}

/* Convert a pointer to an embedded rb_node back to its containing uprobe. */
#define __node_2_uprobe(node) \
        rb_entry((node), struct uprobe, rb_node)

/* Lookup key handed to rb_find() by __find_uprobe(): the (inode, offset) pair. */
struct __uprobe_key {
        struct inode *inode;
        loff_t offset;
};

/*
 * Comparison callback for key lookups: @key points to a struct
 * __uprobe_key, @b is the tree node it is compared against.
 */
static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b)
{
        const struct __uprobe_key *lookup = key;
        const struct uprobe *candidate = __node_2_uprobe(b);

        return uprobe_cmp(lookup->inode, lookup->offset, candidate);
}

/* rb_find_add() comparator: orders node @a against tree node @b. */
static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
{
        struct uprobe *lhs = __node_2_uprobe(a);

        return uprobe_cmp(lhs->inode, lhs->offset, __node_2_uprobe(b));
}

/*
 * Look up the uprobe for @inode:@offset in uprobes_tree.
 * Returns the uprobe with an extra reference taken, or NULL if there is
 * no match. Does no locking itself; find_uprobe() wraps this call in
 * uprobes_treelock.
 */
static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
{
        struct __uprobe_key key = {
                .inode = inode,
                .offset = offset,
        };
        struct rb_node *match;

        match = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
        if (!match)
                return NULL;

        return get_uprobe(__node_2_uprobe(match));
}

/*
 * Find a uprobe corresponding to a given inode:offset
 * Acquires uprobes_treelock (for reading) around the lookup.
 *
 * Returns the uprobe with a reference taken by __find_uprobe(), or NULL
 * when no uprobe is registered for @inode:@offset.
 */
static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
{
        struct uprobe *uprobe;

        read_lock(&uprobes_treelock);
        uprobe = __find_uprobe(inode, offset);
        read_unlock(&uprobes_treelock);

        return uprobe;
}

/*
 * Try to add @uprobe to uprobes_tree.
 * If a node with the same inode:offset is already present, return that
 * uprobe with an extra reference and leave @uprobe untouched. Otherwise
 * @uprobe is linked in, its refcount is set to 2 and NULL is returned.
 */
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
        struct rb_node *clash;

        clash = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
        if (!clash) {
                /* get access + creation ref */
                refcount_set(&uprobe->ref, 2);
                return NULL;
        }

        return get_uprobe(__node_2_uprobe(clash));
}

/*
 * Acquire uprobes_treelock (for writing) and call __insert_uprobe().
 *
 * Matching uprobe already exists in rbtree;
 *        increment (access refcount) and return the matching uprobe.
 *
 * No matching uprobe; insert the uprobe in rb_tree;
 *        get a double refcount (access + creation) and return NULL.
 */
static struct uprobe *insert_uprobe(struct uprobe *uprobe)
{
        struct uprobe *u;

        write_lock(&uprobes_treelock);
        u = __insert_uprobe(uprobe);
        write_unlock(&uprobes_treelock);

        return u;
}

/*
 * Log a warning that @uprobe asks for a different ref_ctr_offset than the
 * already registered @cur_uprobe at the same inode:offset.
 */
static void
ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe)
{
        unsigned long long probe_off = (unsigned long long) uprobe->offset;
        unsigned long long old_ctr = (unsigned long long) cur_uprobe->ref_ctr_offset;
        unsigned long long new_ctr = (unsigned long long) uprobe->ref_ctr_offset;

        pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx "
                "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n",
                uprobe->inode->i_ino, probe_off, old_ctr, new_ctr);
}

/*
 * Allocate a uprobe for @inode:@offset and insert it into uprobes_tree.
 *
 * Returns:
 * - the new uprobe when it was inserted,
 * - the already registered uprobe (with an extra reference) when one exists
 *   for the same inode:offset and its ref_ctr_offset matches,
 * - ERR_PTR(-EINVAL) when one exists but its ref_ctr_offset differs,
 * - NULL when the allocation failed.
 */
static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
                                   loff_t ref_ctr_offset)
{
        struct uprobe *uprobe, *cur_uprobe;

        uprobe = kzalloc(sizeof(*uprobe), GFP_KERNEL);
        if (!uprobe)
                return NULL;

        uprobe->inode = inode;
        uprobe->offset = offset;
        uprobe->ref_ctr_offset = ref_ctr_offset;
        init_rwsem(&uprobe->register_rwsem);
        init_rwsem(&uprobe->consumer_rwsem);

        /* add to uprobes_tree, sorted on inode:offset */
        cur_uprobe = insert_uprobe(uprobe);
        if (!cur_uprobe)
                return uprobe;

        /* a uprobe exists for this inode:offset combination */
        if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
                /* warn while @uprobe is still valid, then drop our access ref */
                ref_ctr_mismatch_warn(cur_uprobe, uprobe);
                put_uprobe(cur_uprobe);
                cur_uprobe = ERR_PTR(-EINVAL);
        }

        /* our own allocation was never inserted, discard it */
        kfree(uprobe);
        return cur_uprobe;
}

/*
 * Push consumer @uc onto the head of @uprobe's singly linked consumer list,
 * holding consumer_rwsem for writing while the list is modified.
 */
static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
        down_write(&uprobe->consumer_rwsem);
        uc->next = uprobe->consumers;
        uprobe->consumers = uc;
        up_write(&uprobe->consumer_rwsem);
}

/*
 * For uprobe @uprobe, unlink the consumer @uc from the consumer list.
 * The list is walked and modified with consumer_rwsem held for writing.
 * Return true if @uc was found and removed, false otherwise.
 */
static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
        struct uprobe_consumer **link;
        bool found = false;

        down_write(&uprobe->consumer_rwsem);
        /* @link always points at the pointer that references the current entry */
        link = &uprobe->consumers;
        while (*link) {
                if (*link == uc) {
                        *link = uc->next;
                        found = true;
                        break;
                }
                link = &(*link)->next;
        }
        up_write(&uprobe->consumer_rwsem);

        return found;
}

/*
 * Copy @nbytes bytes located at file offset @offset of @mapping into @insn.
 * The page holding @offset is read in first; the caller keeps the copy
 * within that single page. Returns 0 or the error from reading the page.
 */
static int __copy_insn(struct address_space *mapping, struct file *filp,
                        void *insn, int nbytes, loff_t offset)
{
        unsigned long index = offset >> PAGE_SHIFT;
        struct page *page;

        /*
         * Ensure that the page that has the original instruction is populated
         * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
         * see uprobe_register().
         */
        if (mapping->a_ops->read_folio)
                page = read_mapping_page(mapping, index, filp);
        else
                page = shmem_read_mapping_page(mapping, index);

        if (IS_ERR(page))
                return PTR_ERR(page);

        copy_from_page(page, offset, insn, nbytes);
        put_page(page);

        return 0;
}

/*
 * Fill uprobe->arch.insn with the bytes found at uprobe->offset in the
 * probed file, one page-bounded chunk at a time. Stops early at end of
 * file. Returns 0 if at least the first chunk was copied and no later
 * read failed, -EIO if the offset is at or beyond the file size, or the
 * error from __copy_insn().
 */
static int copy_insn(struct uprobe *uprobe, struct file *filp)
{
        struct address_space *mapping = uprobe->inode->i_mapping;
        void *dst = &uprobe->arch.insn;
        int remaining = sizeof(uprobe->arch.insn);
        loff_t pos = uprobe->offset;
        int err = -EIO;

        /* Copy only available bytes, -EIO if nothing was read */
        for (;;) {
                int chunk;

                if (pos >= i_size_read(uprobe->inode))
                        break;

                /* never cross a page boundary within one __copy_insn() call */
                chunk = min_t(int, remaining, PAGE_SIZE - (pos & ~PAGE_MASK));
                err = __copy_insn(mapping, filp, dst, chunk, pos);
                if (err)
                        break;

                dst += chunk;
                pos += chunk;
                remaining -= chunk;
                if (!remaining)
                        break;
        }

        return err;
}

/*
 * Make sure the original instruction of @uprobe has been copied from @file
 * and analyzed by the architecture code. The work is done once; completion
 * is recorded in the UPROBE_COPY_INSN flag.
 *
 * Returns 0 on success (including when the work was already done),
 * -ENOTSUPP if the probed instruction is itself a trap instruction, or the
 * error returned by copy_insn() / arch_uprobe_analyze_insn().
 */
static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
                                struct mm_struct *mm, unsigned long vaddr)
{
        int ret = 0;

        /* Fast path without the semaphore: already prepared. */
        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
                return ret;

        /* TODO: move this into _register, until then we abuse this sem. */
        down_write(&uprobe->consumer_rwsem);
        /* Recheck under the semaphore, another caller may have finished first. */
        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
                goto out;

        ret = copy_insn(uprobe, file);
        if (ret)
                goto out;

        /* Refuse to probe an instruction that is already a trap. */
        ret = -ENOTSUPP;
        if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
                goto out;

        ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
        if (ret)
                goto out;

        /* Publish the flag only after uprobe->arch has been fully written. */
        smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
        set_bit(UPROBE_COPY_INSN, &uprobe->flags);

 out:
        up_write(&uprobe->consumer_rwsem);

        return ret;
}

static inline bool consumer_filter(struct uprobe_consumer *uc,
                                   enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
        return !uc->filter || uc->filter(uc, ctx, mm);
}

static bool filter_chain(struct uprobe *uprobe,
                         enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
        struct uprobe_consumer *uc;
        bool ret = false;

        down_read(&uprobe->consumer_rwsem);
        for (uc = uprobe->consumers; uc; uc = uc->next) {
                ret = consumer_filter(uc, ctx, mm);
                if (ret)
                        break;
        }
        up_read(&uprobe->consumer_rwsem);

        return ret;
}

/*
 * Install the breakpoint for @uprobe at @vaddr in @mm.
 * Prepares the uprobe (instruction copy/analysis) if needed, marks the mm
 * as having uprobes and writes the breakpoint via set_swbp().
 * Returns 0 on success or the error from prepare_uprobe() / set_swbp().
 */
static int
install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
                        struct vm_area_struct *vma, unsigned long vaddr)
{
        bool first_uprobe;
        int ret;

        ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
        if (ret)
                return ret;

        /*
         * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
         * the task can hit this breakpoint right after __replace_page().
         */
        first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
        if (first_uprobe)
                set_bit(MMF_HAS_UPROBES, &mm->flags);

        ret = set_swbp(&uprobe->arch, mm, vaddr);
        if (ret) {
                /* undo the early flag only if this call was the one to set it */
                if (first_uprobe)
                        clear_bit(MMF_HAS_UPROBES, &mm->flags);
                return ret;
        }

        clear_bit(MMF_RECALC_UPROBES, &mm->flags);
        return 0;
}

/*
 * Restore the original instruction of @uprobe at @vaddr in @mm.
 * MMF_RECALC_UPROBES is set on the mm before the write is attempted.
 * Returns the result of set_orig_insn().
 */
static int
remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
{
        set_bit(MMF_RECALC_UPROBES, &mm->flags);
        return set_orig_insn(&uprobe->arch, mm, vaddr);
}

/*
 * A uprobe counts as active while its rb_node is not marked empty;
 * delete_uprobe() clears the node after erasing it from the tree.
 */
static inline bool uprobe_is_active(struct uprobe *uprobe)
{
        return !RB_EMPTY_NODE(&uprobe->rb_node);
}
/*
 * Remove @uprobe from uprobes_tree and drop one reference on it.
 *
 * There could be threads that have already hit the breakpoint. They
 * will recheck the current insn and restart if find_uprobe() fails.
 * See find_active_uprobe().
 */
static void delete_uprobe(struct uprobe *uprobe)
{
        /* Deleting a uprobe that is not in the tree indicates a bug. */
        if (WARN_ON(!uprobe_is_active(uprobe)))
                return;

        write_lock(&uprobes_treelock);
        rb_erase(&uprobe->rb_node, &uprobes_tree);
        write_unlock(&uprobes_treelock);
        RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
        put_uprobe(uprobe);
}

/* One mapping of the probed file offset, as collected by build_map_info(). */
struct map_info {
        struct map_info *next;  /* singly linked list */
        struct mm_struct *mm;   /* address space containing the mapping */
        unsigned long vaddr;    /* virtual address of the offset in @mm */
};

/* Free @info and return the element that followed it in the list. */
static inline struct map_info *free_map_info(struct map_info *info)
{
        struct map_info *successor = info->next;

        kfree(info);
        return successor;
}

/*
 * Build a list of map_info entries, one per valid vma that maps file
 * offset @offset of @mapping.
 *
 * Entries cannot be allocated with GFP_KERNEL while i_mmap_rwsem is held,
 * so the walk is repeated: a pass that runs out of preallocated entries
 * counts the shortfall in @more, the lock is dropped, the missing entries
 * are allocated, and the walk restarts from scratch.
 *
 * Returns the list (each entry holds a reference on its mm taken with
 * mmget_not_zero()), NULL if nothing was found, or ERR_PTR(-ENOMEM).
 */
static struct map_info *
build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
{
        unsigned long pgoff = offset >> PAGE_SHIFT;
        struct vm_area_struct *vma;
        struct map_info *curr = NULL;   /* result list built during this pass */
        struct map_info *prev = NULL;   /* spare, not yet used entries */
        struct map_info *info;
        int more = 0;                   /* entries missing in this pass */

 again:
        i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                if (!valid_vma(vma, is_register))
                        continue;

                if (!prev && !more) {
                        /*
                         * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
                         * reclaim. This is optimistic, no harm done if it fails.
                         */
                        prev = kmalloc(sizeof(struct map_info),
                                        GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
                        if (prev)
                                prev->next = NULL;
                }
                /* No spare entry: just count how many are still needed. */
                if (!prev) {
                        more++;
                        continue;
                }

                /* Skip address spaces whose user count already dropped to zero. */
                if (!mmget_not_zero(vma->vm_mm))
                        continue;

                /* Move one entry from the spare list to the head of the result. */
                info = prev;
                prev = prev->next;
                info->next = curr;
                curr = info;

                info->mm = vma->vm_mm;
                info->vaddr = offset_to_vaddr(vma, offset);
        }
        i_mmap_unlock_read(mapping);

        if (!more)
                goto out;

        /*
         * The pass was incomplete: drop the mm references taken so far and
         * turn the collected entries back into spares (curr ends up NULL).
         */
        prev = curr;
        while (curr) {
                mmput(curr->mm);
                curr = curr->next;
        }

        /* Allocate the missing entries now that the lock is not held. */
        do {
                info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
                if (!info) {
                        curr = ERR_PTR(-ENOMEM);
                        goto out;
                }
                info->next = prev;
                prev = info;
        } while (--more);

        goto again;
 out:
        /* Free whatever spare entries were not consumed. */
        while (prev)
                prev = free_map_info(prev);
        return curr;
}

/*
 * Apply @uprobe to every address space that currently maps its
 * inode:offset. With a non-NULL @new this installs breakpoints on behalf
 * of that consumer; with @new == NULL it removes breakpoints that no
 * remaining consumer wants.
 *
 * Runs with dup_mmap_sem held for writing. Returns 0 or an error; on the
 * removal path errors from several mms are OR-ed together, so only
 * zero/non-zero is meaningful there.
 */
static int
register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
{
        bool is_register = !!new;
        struct map_info *info;
        int err = 0;

        percpu_down_write(&dup_mmap_sem);
        info = build_map_info(uprobe->inode->i_mapping,
                                        uprobe->offset, is_register);
        if (IS_ERR(info)) {
                err = PTR_ERR(info);
                goto out;
        }

        while (info) {
                struct mm_struct *mm = info->mm;
                struct vm_area_struct *vma;

                /*
                 * After a failed install, stop installing but keep walking
                 * the list to drop the mm references and free the entries.
                 */
                if (err && is_register)
                        goto free;

                mmap_write_lock(mm);
                /*
                 * The mapping may have changed since build_map_info();
                 * re-validate the vma under mmap_lock.
                 */
                vma = find_vma(mm, info->vaddr);
                if (!vma || !valid_vma(vma, is_register) ||
                    file_inode(vma->vm_file) != uprobe->inode)
                        goto unlock;

                if (vma->vm_start > info->vaddr ||
                    vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
                        goto unlock;

                if (is_register) {
                        /* consult only the "caller", new consumer. */
                        if (consumer_filter(new,
                                        UPROBE_FILTER_REGISTER, mm))
                                err = install_breakpoint(uprobe, mm, vma, info->vaddr);
                } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
                        /* remove only if none of the remaining consumers wants this mm */
                        if (!filter_chain(uprobe,
                                        UPROBE_FILTER_UNREGISTER, mm))
                                err |= remove_breakpoint(uprobe, mm, info->vaddr);
                }

 unlock:
                mmap_write_unlock(mm);
 free:
                mmput(mm);
                info = free_map_info(info);
        }
 out:
        percpu_up_write(&dup_mmap_sem);
        return err;
}

/*
 * Detach consumer @uc from @uprobe and remove breakpoints that are no
 * longer wanted. The uprobe itself is deleted only when no consumers are
 * left and the breakpoint removal reported no error.
 */
static void
__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
        if (WARN_ON(!consumer_del(uprobe, uc)))
                return;

        /* TODO : cant unregister? schedule a worker thread */
        if (register_for_each_vma(uprobe, NULL))
                return;

        if (uprobe->consumers)
                return;

        delete_uprobe(uprobe);
}

/*
 * uprobe_unregister - unregister an already registered probe.
 * @inode: the file in which the probe has to be removed.
 * @offset: offset from the start of the file.
 * @uc: identify which probe if multiple probes are colocated.
 *
 * Warns and returns without doing anything if no uprobe exists for
 * @inode:@offset.
 */
void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
{
        struct uprobe *uprobe;

        /* find_uprobe() returns the uprobe with a reference held */
        uprobe = find_uprobe(inode, offset);
        if (WARN_ON(!uprobe))
                return;

        down_write(&uprobe->register_rwsem);
        __uprobe_unregister(uprobe, uc);
        up_write(&uprobe->register_rwsem);
        /* drop the reference taken by find_uprobe() */
        put_uprobe(uprobe);
}
EXPORT_SYMBOL_GPL(uprobe_unregister);

/*
 * __uprobe_register - register a probe
 * @inode: the file in which the probe has to be placed.
 * @offset: offset from the start of the file.
 * @ref_ctr_offset: offset of the reference counter in the file; must be
 *	aligned to sizeof(short).
 * @uc: information on how to handle the probe.
 *
 * Apart from the access refcount, __uprobe_register() takes a creation
 * refcount (through alloc_uprobe()) if and only if this @uprobe is getting
 * inserted into the rbtree (i.e. first consumer for a @inode:@offset
 * tuple).  Creation refcount stops uprobe_unregister from freeing the
 * @uprobe even before the register operation is complete. Creation
 * refcount is released when the last @uc for the @uprobe
 * unregisters. Caller of __uprobe_register() is required to keep @inode
 * (and the containing mount) referenced.
 *
 * Return an errno if it cannot successfully install probes,
 * else return 0 (success).
 */
static int __uprobe_register(struct inode *inode, loff_t offset,
                             loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{
        struct uprobe *uprobe;
        int ret;

        /* Uprobe must have at least one set consumer */
        if (!uc->handler && !uc->ret_handler)
                return -EINVAL;

        /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
        if (!inode->i_mapping->a_ops->read_folio &&
            !shmem_mapping(inode->i_mapping))
                return -EIO;
        /* Racy, just to catch the obvious mistakes */
        if (offset > i_size_read(inode))
                return -EINVAL;

        /*
         * This ensures that copy_from_page(), copy_to_page() and
         * __update_ref_ctr() can't cross page boundary.
         */
        if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
                return -EINVAL;
        if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
                return -EINVAL;

 retry:
        /* NULL means allocation failure, ERR_PTR a ref_ctr_offset mismatch. */
        uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
        if (!uprobe)
                return -ENOMEM;
        if (IS_ERR(uprobe))
                return PTR_ERR(uprobe);

        /*
         * We can race with uprobe_unregister()->delete_uprobe().
         * Check uprobe_is_active() and retry if it is false.
         */
        down_write(&uprobe->register_rwsem);
        ret = -EAGAIN;
        if (likely(uprobe_is_active(uprobe))) {
                consumer_add(uprobe, uc);
                ret = register_for_each_vma(uprobe, uc);
                /* roll back the consumer and any installed breakpoints on failure */
                if (ret)
                        __uprobe_unregister(uprobe, uc);
        }
        up_write(&uprobe->register_rwsem);
        /* drop the access reference obtained through alloc_uprobe() */
        put_uprobe(uprobe);

        if (unlikely(ret == -EAGAIN))
                goto retry;
        return ret;
}

/*
 * uprobe_register - register a probe without a reference counter.
 * Calls __uprobe_register() with a ref_ctr_offset of 0; see there for the
 * meaning of the arguments and the return value.
 */
int uprobe_register(struct inode *inode, loff_t offset,
                    struct uprobe_consumer *uc)
{
        return __uprobe_register(inode, offset, 0, uc);
}
EXPORT_SYMBOL_GPL(uprobe_register);

/*
 * uprobe_register_refctr - register a probe together with a reference
 * counter located at @ref_ctr_offset in the file.
 * Passes all arguments straight to __uprobe_register(); see there for the
 * meaning of the arguments and the return value.
 */
int uprobe_register_refctr(struct inode *inode, loff_t offset,
                           loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{
        return __uprobe_register(inode, offset, ref_ctr_offset, uc);
}
EXPORT_SYMBOL_GPL(uprobe_register_refctr);

/*
 * uprobe_apply - add or remove breakpoints for an already registered consumer.
 * @inode: the file in which the probe is placed.
 * @offset: offset from the start of the file.
 * @uc: consumer which wants to add more or remove some breakpoints
 * @add: add or remove the breakpoints
 *
 * Return -ENOENT if no uprobe exists for @inode:@offset or @uc is not one
 * of its consumers, otherwise the result of register_for_each_vma().
 */
int uprobe_apply(struct inode *inode, loff_t offset,
                        struct uprobe_consumer *uc, bool add)
{
        struct uprobe *uprobe;
        struct uprobe_consumer *con;
        int ret = -ENOENT;

        uprobe = find_uprobe(inode, offset);
        if (WARN_ON(!uprobe))
                return ret;

        down_write(&uprobe->register_rwsem);
        /* only act if @uc really is attached to this uprobe */
        con = uprobe->consumers;
        while (con && con != uc)
                con = con->next;
        if (con)
                ret = register_for_each_vma(uprobe, add ? uc : NULL);
        up_write(&uprobe->register_rwsem);
        put_uprobe(uprobe);

        return ret;
}

/*
 * Remove the breakpoint of @uprobe from every vma of @mm that maps the
 * probed inode and covers the probed offset. Walks the vmas with
 * mmap_lock held for reading. Errors from remove_breakpoint() are OR-ed
 * together, so only zero/non-zero of the result is meaningful.
 */
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
        VMA_ITERATOR(vmi, mm, 0);
        struct vm_area_struct *vma;
        int err = 0;

        mmap_read_lock(mm);
        for_each_vma(vmi, vma) {
                loff_t offset;

                if (!valid_vma(vma, false))
                        continue;
                if (file_inode(vma->vm_file) != uprobe->inode)
                        continue;

                /* file offset at which this vma starts */
                offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
                if (uprobe->offset <  offset)
                        continue;
                if (uprobe->offset >= offset + vma->vm_end - vma->vm_start)
                        continue;

                err |= remove_breakpoint(uprobe, mm,
                                         offset_to_vaddr(vma, uprobe->offset));
        }
        mmap_read_unlock(mm);

        return err;
}

/*
 * Descend uprobes_tree looking for any node that belongs to @inode and
 * whose offset lies in [@min, @max]. Returns that node or NULL.
 * Does no locking itself; the callers in this file hold uprobes_treelock.
 */
static struct rb_node *
find_node_in_range(struct inode *inode, loff_t min, loff_t max)
{
        struct rb_node *n = uprobes_tree.rb_node;

        while (n) {
                struct uprobe *u = rb_entry(n, struct uprobe, rb_node);

                if (inode != u->inode) {
                        n = inode < u->inode ? n->rb_left : n->rb_right;
                        continue;
                }

                /* same inode: steer by offset until it falls inside the range */
                if (max < u->offset) {
                        n = n->rb_left;
                        continue;
                }
                if (min > u->offset) {
                        n = n->rb_right;
                        continue;
                }
                break;
        }

        return n;
}

/*
 * For a given range in vma, build a list of probes that need to be inserted.
 *
 * @head is initialised here and receives, linked through ->pending_list,
 * every uprobe of @inode whose offset falls into the file range backing
 * [@start, @end). A reference is taken on each uprobe put on the list.
 */
static void build_probe_list(struct inode *inode,
                                struct vm_area_struct *vma,
                                unsigned long start, unsigned long end,
                                struct list_head *head)
{
        struct rb_node *hit, *t;
        struct uprobe *u;
        loff_t min, max;

        INIT_LIST_HEAD(head);
        min = vaddr_to_offset(vma, start);
        max = min + (end - start) - 1;

        read_lock(&uprobes_treelock);
        hit = find_node_in_range(inode, min, max);
        if (!hit)
                goto unlock;

        /* @hit itself and everything before it that is still in range */
        for (t = hit; t; t = rb_prev(t)) {
                u = rb_entry(t, struct uprobe, rb_node);
                if (u->inode != inode || u->offset < min)
                        break;
                list_add(&u->pending_list, head);
                get_uprobe(u);
        }

        /* everything after @hit that is still in range */
        for (t = rb_next(hit); t; t = rb_next(t)) {
                u = rb_entry(t, struct uprobe, rb_node);
                if (u->inode != inode || u->offset > max)
                        break;
                list_add(&u->pending_list, head);
                get_uprobe(u);
        }
 unlock:
        read_unlock(&uprobes_treelock);
}

/* @vma contains reference counter, not the probed instruction. */
static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
{
        struct list_head *pos, *q;
        struct delayed_uprobe *du;
        unsigned long vaddr;
        int ret = 0, err = 0;

        mutex_lock(&delayed_uprobe_lock);
        list_for_each_safe(pos, q, &delayed_uprobe_list) {
                du = list_entry(pos, struct delayed_uprobe, list);

                if (du->mm != vma->vm_mm ||
                    !valid_ref_ctr_vma(du->uprobe, vma))
                        continue;

                vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
                ret = __update_ref_ctr(vma->vm_mm, vaddr, 1);
                if (ret) {
                        update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1);
                        if (!err)
                                err = ret;
                }
                delayed_uprobe_delete(du);
        }
        mutex_unlock(&delayed_uprobe_lock);
        return err;
}

/*
 * Called from mmap_region/vma_merge with mm->mmap_lock acquired.
 *
 * Installs breakpoints for all uprobes that fall into the newly mapped
 * @vma and that at least one consumer wants for this mm.
 *
 * Currently we ignore all errors and always return 0, the callers
 * can't handle the failure anyway.
 */
int uprobe_mmap(struct vm_area_struct *vma)
{
        struct list_head tmp_list;
        struct uprobe *uprobe, *u;
        struct mutex *hash_lock;
        struct inode *inode;

        if (no_uprobe_events())
                return 0;

        /* a private writable file mapping may hold a delayed reference counter */
        if (vma->vm_file &&
            (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
            test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
                delayed_ref_ctr_inc(vma);

        if (!valid_vma(vma, true))
                return 0;

        inode = file_inode(vma->vm_file);
        if (!inode)
                return 0;

        hash_lock = uprobes_mmap_hash(inode);
        mutex_lock(hash_lock);
        build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
        /*
         * We can race with uprobe_unregister(), this uprobe can be already
         * removed. But in this case filter_chain() must return false, all
         * consumers have gone away.
         */
        list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
                bool install = !fatal_signal_pending(current) &&
                               filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm);

                if (install) {
                        unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);

                        install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
                }
                /* drop the reference taken by build_probe_list() */
                put_uprobe(uprobe);
        }
        mutex_unlock(hash_lock);

        return 0;
}

static bool
vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
        loff_t min, max;
        struct inode *inode;
        struct rb_node *n;

        inode = file_inode(vma->vm_file);

        min = vaddr_to_offset(vma, start);
        max = min + (end - start) - 1;

        read_lock(&uprobes_treelock);
        n = find_node_in_range(inode, min, max);
        read_unlock(&uprobes_treelock);

        return !!n;
}

/*
 * Called in context of a munmap of a vma.
 */
void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
        if (no_uprobe_events() || !valid_vma(vma, false))
                return;

        if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
                return;

        if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
             test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
                return;

        if (vma_has_uprobes(vma, start, end))
                set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
}

/* Slot allocation for XOL */
/*
 * Map @area into @mm as a special mapping and publish it in
 * mm->uprobes_state.xol_area.
 *
 * Returns 0 on success, -EINTR if taking mmap_lock was interrupted,
 * -EALREADY if the mm already has an xol_area, or the error from
 * get_unmapped_area() / _install_special_mapping().
 */
static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
        struct vm_area_struct *vma;
        int ret = 0;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        if (mm->uprobes_state.xol_area) {
                ret = -EALREADY;
                goto unlock;
        }

        if (!area->vaddr) {
                /* Try to map as high as possible, this is only a hint. */
                area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
                                                PAGE_SIZE, 0, 0);
                if (IS_ERR_VALUE(area->vaddr)) {
                        ret = area->vaddr;
                        goto unlock;
                }
        }

        vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
                                VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
                                &area->xol_mapping);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto unlock;
        }

        /* pairs with get_xol_area() */
        smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
 unlock:
        mmap_write_unlock(mm);

        return ret;
}

static struct xol_area *__create_xol_area(unsigned long vaddr)
{
        struct mm_struct *mm = current->mm;
        uprobe_opcode_t insn = UPROBE_SWBP_INSN;
        struct xol_area *area;

        area = kmalloc(sizeof(*area), GFP_KERNEL);
        if (unlikely(!area))
                goto out;

        area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long),
                               GFP_KERNEL);
        if (!area->bitmap)
                goto free_area;

        area->xol_mapping.name = "[uprobes]";
        area->xol_mapping.fault = NULL;
        area->xol_mapping.pages = area->pages;
        area->pages[0] = alloc_page(GFP_HIGHUSER);
        if (!area->pages[0])
                goto free_bitmap;
        area->pages[1] = NULL;

        area->vaddr = vaddr;
        init_waitqueue_head(&area->wq);
        /* Reserve the 1st slot for get_trampoline_vaddr() */
        set_bit(0, area->bitmap);
        atomic_set(&area->slot_count, 1);
        arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);

        if (!xol_add_vma(mm, area))
                return area;

        __free_page(area->pages[0]);
 free_bitmap:
        kfree(area->bitmap);
 free_area:
        kfree(area);
 out:
        return NULL;
}

/*
 * get_xol_area - Allocate process's xol_area if necessary.
 * This area will be used for storing instructions for execution out of line.
 *
 * The result of __create_xol_area() is deliberately ignored; whatever is
 * published in mm->uprobes_state.xol_area afterwards is what gets returned.
 *
 * Returns the allocated area or NULL.
 */
static struct xol_area *get_xol_area(void)
{
        struct mm_struct *mm = current->mm;

        if (!mm->uprobes_state.xol_area)
                __create_xol_area(0);

        /* Pairs with xol_add_vma() smp_store_release() */
        return READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */
}

/*
 * uprobe_clear_state - Free the area allocated for slots.
 *
 * Also removes every delayed_uprobe entry that refers to @mm.
 */
void uprobe_clear_state(struct mm_struct *mm)
{
        struct xol_area *area = mm->uprobes_state.xol_area;

        mutex_lock(&delayed_uprobe_lock);
        delayed_uprobe_remove(NULL, mm);
        mutex_unlock(&delayed_uprobe_lock);

        if (area) {
                put_page(area->pages[0]);
                kfree(area->bitmap);
                kfree(area);
        }
}

/*
 * Take dup_mmap_sem for reading; register_for_each_vma() takes the same
 * semaphore for writing. Released again by uprobe_end_dup_mmap().
 */
void uprobe_start_dup_mmap(void)
{
        percpu_down_read(&dup_mmap_sem);
}

/* Release the read side of dup_mmap_sem taken by uprobe_start_dup_mmap(). */
void uprobe_end_dup_mmap(void)
{
        percpu_up_read(&dup_mmap_sem);
}

/*
 * Carry the uprobe state flags over from @oldmm to @newmm: if the old mm
 * has uprobes, mark the new one as having them and request a recalc.
 */
void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
        if (!test_bit(MMF_HAS_UPROBES, &oldmm->flags))
                return;

        set_bit(MMF_HAS_UPROBES, &newmm->flags);
        /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
        set_bit(MMF_RECALC_UPROBES, &newmm->flags);
}

/*
 * xol_take_insn_slot - search for a free slot in @area and claim it.
 *
 * Sleeps on area->wq while slot_count says that all UINSNS_PER_PAGE slots
 * are taken. Returns the user virtual address of the claimed slot.
 */
static unsigned long xol_take_insn_slot(struct xol_area *area)
{
        unsigned long slot_addr;
        int slot_nr;

        do {
                slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
                if (slot_nr < UINSNS_PER_PAGE) {
                        /* Claim the bit atomically; success ends the search. */
                        if (!test_and_set_bit(slot_nr, area->bitmap))
                                break;

                        /*
                         * The bit was set by someone else in the meantime.
                         * Force the loop condition to be true and search again.
                         */
                        slot_nr = UINSNS_PER_PAGE;
                        continue;
                }
                /* No free bit was found: wait until a slot has been released. */
                wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
        } while (slot_nr >= UINSNS_PER_PAGE);

        slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
        atomic_inc(&area->slot_count);

        return slot_addr;
}

/*
 * xol_get_insn_slot - allocate a slot for xol.
 * On success the out-of-line copy of the probed instruction
 * (uprobe->arch.ixol) is written into the slot.
 * Returns the allocated slot address or 0.
 */
static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
{
        struct xol_area *area = get_xol_area();
        unsigned long xol_vaddr;

        if (!area)
                return 0;

        xol_vaddr = xol_take_insn_slot(area);
        if (likely(xol_vaddr))
                arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
                                      &uprobe->arch.ixol,
                                      sizeof(uprobe->arch.ixol));

        return xol_vaddr;
}

/*
 * xol_free_insn_slot - If slot was earlier allocated by
 * @xol_get_insn_slot(), make the slot available for
 * subsequent requests.
 *
 * Does nothing if @tsk has no mm, no xol_area, no utask or no slot
 * recorded, or if the recorded address lies outside the XOL page.
 */
static void xol_free_insn_slot(struct task_struct *tsk)
{
        struct xol_area *area;
        unsigned long slot_addr;
        unsigned long offset;
        int slot_nr;

        if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
                return;

        slot_addr = tsk->utask->xol_vaddr;
        if (unlikely(!slot_addr))
                return;

        area = tsk->mm->uprobes_state.xol_area;
        /* the slot must lie inside the one page the area covers */
        if (slot_addr < area->vaddr || slot_addr >= area->vaddr + PAGE_SIZE)
                return;

        offset = slot_addr - area->vaddr;
        slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
        if (slot_nr >= UINSNS_PER_PAGE)
                return;

        clear_bit(slot_nr, area->bitmap);
        atomic_dec(&area->slot_count);
        smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
        if (waitqueue_active(&area->wq))
                wake_up(&area->wq);

        tsk->utask->xol_vaddr = 0;
}

/*
 * Default (weak) implementation: copy @len bytes from @src into @page at
 * the position given by @vaddr, then flush the data cache for the page.
 * Architectures can provide their own version.
 */
void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
                                  void *src, unsigned long len)
{
        /* Initialize the slot */
        copy_to_page(page, vaddr, src, len);

        /*
         * We probably need flush_icache_user_page() but it needs vma.
         * This should work on most of architectures by default. If
         * architecture needs to do something different it can define
         * its own version of the function.
         */
        flush_dcache_page(page);
}

/**
 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
 * @regs: Reflects the saved state of the task after it has hit a breakpoint
 * instruction.
 *
 * This weak default subtracts UPROBE_SWBP_INSN_SIZE from the instruction
 * pointer saved in @regs.
 *
 * Return the address of the breakpoint instruction.
 */
unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
{
        return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
}

unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
{
        struct uprobe_task *utask = current->utask;

        if (unlikely(utask && utask->active_uprobe))
                return utask->vaddr;

        return instruction_pointer(regs);
}

/*
 * Drop the uprobe reference held by @ri, free @ri and hand back the
 * element that followed it in the list.
 */
static struct return_instance *free_ret_instance(struct return_instance *ri)
{
        struct return_instance *successor;

        /* Fetch the link first: @ri must not be touched after kfree(). */
        successor = ri->next;

        put_uprobe(ri->uprobe);
        kfree(ri);

        return successor;
}

/*
 * Called with no locks held.
 * Called in context of an exiting or an exec-ing thread.
 *
 * Releases everything hanging off t->utask: the reference on an active
 * uprobe, every queued return instance, the XOL slot, and finally the
 * uprobe_task itself. A task without a utask is left untouched.
 */
void uprobe_free_utask(struct task_struct *t)
{
        struct uprobe_task *ut = t->utask;
        struct return_instance *pending;

        if (!ut)
                return;

        if (ut->active_uprobe)
                put_uprobe(ut->active_uprobe);

        /* free_ret_instance() returns the next element, or NULL at the end */
        for (pending = ut->return_instances; pending; )
                pending = free_ret_instance(pending);

        /* Must run before kfree(): it reads and clears t->utask->xol_vaddr */
        xol_free_insn_slot(t);

        kfree(ut);
        t->utask = NULL;
}

/*
 * Allocate a uprobe_task object for the task if necessary.
 * Called when the thread hits a breakpoint.
 *
 * Returns:
 * - pointer to the task's (possibly new) uprobe_task on success
 * - NULL if the allocation failed
 */
static struct uprobe_task *get_utask(void)
{
        struct uprobe_task *ut = current->utask;

        if (ut)
                return ut;

        /* First use by this task: allocate a zeroed uprobe_task. */
        ut = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
        current->utask = ut;

        return ut;
}

/*
 * dup_utask - give @t a fresh uprobe_task that mirrors @o_utask's
 * return-instance list.
 *
 * Each source instance is copied, a reference is taken on its uprobe,
 * and the copy is appended so that list order is preserved;
 * n_utask->depth counts the copies.
 *
 * Returns 0 on success or -ENOMEM. If a copy cannot be allocated, the
 * new utask and the copies made so far stay attached to @t->utask.
 */
static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
{
        struct uprobe_task *n_utask;
        struct return_instance **tail;
        struct return_instance *src, *copy;

        n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
        if (!n_utask)
                return -ENOMEM;
        t->utask = n_utask;

        /* tail always points at the link the next copy is stored into */
        tail = &n_utask->return_instances;
        src = o_utask->return_instances;
        while (src) {
                copy = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
                if (!copy)
                        return -ENOMEM;

                *copy = *src;
                get_uprobe(copy->uprobe);
                copy->next = NULL;

                *tail = copy;
                tail = &copy->next;
                n_utask->depth++;

                src = src->next;
        }

        return 0;
}

/*
 * uprobe_warn - log "uprobe: <comm>:<pid> failed to <msg>" via pr_warn()
 * @t:   task the caller associates with the failure
 * @msg: text describing the operation that failed
 *
 * NOTE(review): @t is not used; the name and pid printed are always those
 * of current, even when a caller passes a different task.
 */
static void uprobe_warn(struct task_struct *t, const char *msg)
{
        pr_warn("uprobe: %s:%d failed to %s\n",
                        current->comm, current->pid, msg);
}

/*
 * Task-work callback: create the XOL area for current at the address
 * saved in utask->dup_xol_addr. Nothing is done for an exiting task, and
 * a failure is only reported when no fatal signal is pending.
 */
static void dup_xol_work(struct callback_head *work)
{
        if (current->flags & PF_EXITING)
                return;

        if (__create_xol_area(current->utask->dup_xol_addr))
                return;

        /* Creation failed; stay quiet if the task is being killed anyway. */
        if (fatal_signal_pending(current))
                return;

        uprobe_warn(current, "dup xol area");
}

/*
 * Called in context of a new clone/fork from copy_process.
 *
 * @t:     the task being created
 * @flags: clone flags; only CLONE_VFORK is examined here
 *
 * The new task always starts with t->utask == NULL. When current has
 * pending return instances, and @t either has a different mm or is a
 * CLONE_VFORK child, the instances are duplicated into a new utask for
 * @t. If @t also has its own mm, task work is queued on @t to create an
 * XOL area at the same address as the parent's.
 */
void uprobe_copy_process(struct task_struct *t, unsigned long flags)
{
        struct uprobe_task *utask = current->utask;
        struct mm_struct *mm = current->mm;
        struct xol_area *area;

        t->utask = NULL;

        /* Nothing to duplicate unless the parent has return instances. */
        if (!utask || !utask->return_instances)
                return;

        /* Same mm and not a vfork: no duplication is done. */
        if (mm == t->mm && !(flags & CLONE_VFORK))
                return;

        if (dup_utask(t, utask))
                return uprobe_warn(t, "dup ret instances");

        /* The task can fork() after dup_xol_work() fails */
        area = mm->uprobes_state.xol_area;
        if (!area)
                return uprobe_warn(t, "dup xol area");

        /* Shared mm (vfork case): the existing xol_area is already there. */
        if (mm == t->mm)
                return;

        /* Record the parent's area address, then queue the deferred setup. */
        t->utask->dup_xol_addr = area->vaddr;
        init_task_work(&t->utask->dup_xol_work, dup_xol_work);
        task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME);
}

/*
 * Current area->vaddr notion assume the trampoline address is always
 * equal area->vaddr.
 *
 * Reads current->mm->uprobes_state.xol_area once and, if it is set,
 * returns its vaddr.
 *
 * Returns -1 (i.e. all bits set, as unsigned long) in case the xol_area
 * is not allocated.
 */
static unsigned long get_trampoline_vaddr(void)
{
        struct xol_area *area;
        unsigned long trampoline_vaddr = -1;

        /* Pairs with xol_add_vma() smp_store_release() */
        area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */
        if (area)
                trampoline_vaddr = area->vaddr;

        return trampoline_vaddr;
}

/*
 * Pop and free return instances from the head of utask's list for as
 * long as arch_uretprobe_is_alive() reports them dead, decrementing
 * utask->depth for each one. The first live instance (or NULL) becomes
 * the new list head.
 *
 * @chained selects the check context: RP_CHECK_CHAIN_CALL when true,
 * RP_CHECK_CALL otherwise.
 */
static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
                                        struct pt_regs *regs)
{
        struct return_instance *head;
        enum rp_check ctx;

        if (chained)
                ctx = RP_CHECK_CHAIN_CALL;
        else
                ctx = RP_CHECK_CALL;

        for (head = utask->return_instances; head; ) {
                if (arch_uretprobe_is_alive(head, ctx, regs))
                        break;

                head = free_ret_instance(head);
                utask->depth--;
        }

        utask->return_instances = head;
}

/*
 * prepare_uretprobe - arrange for a return probe on the current call
 * @uprobe: uprobe that was hit at function entry
 * @regs:   register state at the probe
 *
 * Replaces the return address with the trampoline address through
 * arch_uretprobe_hijack_return_addr() and pushes a return_instance that
 * records the original return address on utask->return_instances.
 * Every failure path returns silently without installing anything, apart
 * from the rate-limited message at the depth limit and the tail-call
 * warning.
 */
static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
{
        struct return_instance *ri;
        struct uprobe_task *utask;
        unsigned long orig_ret_vaddr, trampoline_vaddr;
        bool chained;

        if (!get_xol_area())
                return;

        utask = get_utask();
        if (!utask)
                return;

        /* Bound the number of pending return instances per task. */
        if (utask->depth >= MAX_URETPROBE_DEPTH) {
                printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
                                " nestedness limit pid/tgid=%d/%d\n",
                                current->pid, current->tgid);
                return;
        }

        ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
        if (!ri)
                return;

        trampoline_vaddr = get_trampoline_vaddr();
        orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
        if (orig_ret_vaddr == -1)
                goto fail;

        /* drop the entries invalidated by longjmp() */
        /* The old return address already was the trampoline: chained call. */
        chained = (orig_ret_vaddr == trampoline_vaddr);
        cleanup_return_instances(utask, chained, regs);

        /*
         * We don't want to keep trampoline address in stack, rather keep the
         * original return address of first caller thru all the consequent
         * instances. This also makes breakpoint unwrapping easier.
         */
        if (chained) {
                if (!utask->return_instances) {
                        /*
                         * This situation is not possible. Likely we have an
                         * attack from user-space.
                         */
                        uprobe_warn(current, "handle tail call");
                        goto fail;
                }
                orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
        }

        /* The instance holds its own reference on the uprobe. */
        ri->uprobe = get_uprobe(uprobe);
        ri->func = instruction_pointer(regs);
        ri->stack = user_stack_pointer(regs);
        ri->orig_ret_vaddr = orig_ret_vaddr;
        ri->chained = chained;

        /* Push onto the head of the per-task list. */
        utask->depth++;
        ri->next = utask->return_instances;
        utask->return_instances = ri;

        return;
 fail:
        kfree(ri);
}

/*
 * Prepare to single-step probed instruction out of line.
 *
 * @uprobe:   the uprobe that was hit
 * @regs:     register state at the probe
 * @bp_vaddr: address of the breakpoint
 *
 * Returns 0 on success, -ENOMEM when no utask or XOL slot is available,
 * or the error from arch_uprobe_pre_xol() (the slot is released again in
 * that case). On success utask->active_uprobe is set to @uprobe and
 * utask->state to UTASK_SSTEP.
 */
static int
pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
{
        struct uprobe_task *utask;
        unsigned long xol_vaddr;
        int err;

        utask = get_utask();
        if (!utask)
                return -ENOMEM;

        xol_vaddr = xol_get_insn_slot(uprobe);
        if (!xol_vaddr)
                return -ENOMEM;

        /* Remember the slot and the probed address. */
        utask->xol_vaddr = xol_vaddr;
        utask->vaddr = bp_vaddr;

        err = arch_uprobe_pre_xol(&uprobe->arch, regs);
        if (unlikely(err)) {
                xol_free_insn_slot(current);
                return err;
        }

        utask->active_uprobe = uprobe;
        utask->state = UTASK_SSTEP;
        return 0;
}

/*
 * If we are singlestepping, then ensure this thread does not act on
 * non-fatal signals until completion of singlestep.  When xol insn itself
 * triggers the signal,  restart the original insn even if the task is
 * already SIGKILL'ed (since coredump should report the correct ip).  This
 * is even more important if the task has a handler for SIGSEGV/etc, The
 * _same_ instruction should be repeated again after return from the signal
 * handler, and SSTEP can never finish in this case.
 *
 * Returns false when current has no active uprobe, true otherwise.
 */
bool uprobe_deny_signal(void)
{
        struct task_struct *t = current;
        struct uprobe_task *utask = t->utask;

        if (likely(!utask || !utask->active_uprobe))
                return false;

        WARN_ON_ONCE(utask->state != UTASK_SSTEP);

        if (task_sigpending(t)) {
                /* Hide the pending signal; handle_singlestep() recalculates it. */
                spin_lock_irq(&t->sighand->siglock);
                clear_tsk_thread_flag(t, TIF_SIGPENDING);
                spin_unlock_irq(&t->sighand->siglock);

                /*
                 * A fatal signal, or a trap raised by the xol insn itself,
                 * marks the single-step as trapped and requests another
                 * pass through the TIF_UPROBE handling.
                 */
                if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
                        utask->state = UTASK_SSTEP_TRAPPED;
                        set_tsk_thread_flag(t, TIF_UPROBE);
                }
        }

        return true;
}

/*
 * mmf_recalc_uprobes - clear MMF_HAS_UPROBES if no vma of @mm has uprobes
 *
 * Walks every vma of @mm. As soon as one valid vma reports uprobes in its
 * range, the flag is left alone; otherwise the flag is cleared.
 */
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
        VMA_ITERATOR(vmi, mm, 0);
        struct vm_area_struct *vma;

        for_each_vma(vmi, vma) {
                if (!valid_vma(vma, false))
                        continue;
                /*
                 * This is not strictly accurate, we can race with
                 * uprobe_unregister() and see the already removed
                 * uprobe if delete_uprobe() was not yet called.
                 * Or this uprobe can be filtered out.
                 */
                if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
                        return;
        }

        clear_bit(MMF_HAS_UPROBES, &mm->flags);
}

/*
 * is_trap_at_addr - check whether the instruction at @vaddr is a trap insn
 * @mm:    address space to read from (the fallback comment below notes it
 *         is current->mm)
 * @vaddr: user address; must be aligned to UPROBE_SWBP_INSN_SIZE
 *
 * Tries a non-faulting __get_user() first and falls back to
 * get_user_pages_remote() when that fails.
 *
 * Returns -EINVAL for a misaligned @vaddr, a negative error from
 * get_user_pages_remote(), or otherwise the result of is_trap_insn() on
 * the opcode that was read.
 */
static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
{
        struct page *page;
        uprobe_opcode_t opcode;
        int result;

        if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
                return -EINVAL;

        /* Fast path: read the opcode without taking a page fault. */
        pagefault_disable();
        result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
        pagefault_enable();

        if (likely(result == 0))
                goto out;

        /*
         * The NULL 'tsk' here ensures that any faults that occur here
         * will not be accounted to the task.  'mm' *is* current->mm,
         * but we treat this as a 'remote' access since it is
         * essentially a kernel access to the memory.
         */
        result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL);
        if (result < 0)
                return result;

        copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
        put_page(page);
 out:
        /* This needs to return true for any variant of the trap insn */
        return is_trap_insn(&opcode);
}

/*
 * find_active_uprobe - look up the uprobe registered at @bp_vaddr
 * @bp_vaddr: breakpoint address in current->mm
 * @is_swbp:  out parameter, written only when no uprobe is found:
 *            the result of is_trap_at_addr() if a vma covers @bp_vaddr,
 *            or -EFAULT if none does
 *
 * Runs under mmap_read_lock(). When no uprobe is found and
 * MMF_RECALC_UPROBES was set, the flag is cleared and
 * mmf_recalc_uprobes() is run.
 *
 * Returns the result of find_uprobe(), or NULL when the vma is missing
 * or not valid for uprobes.
 */
static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
{
        struct mm_struct *mm = current->mm;
        struct uprobe *uprobe = NULL;
        struct vm_area_struct *vma;

        mmap_read_lock(mm);
        vma = vma_lookup(mm, bp_vaddr);
        if (vma) {
                if (valid_vma(vma, false)) {
                        /* Translate the address into (inode, file offset). */
                        struct inode *inode = file_inode(vma->vm_file);
                        loff_t offset = vaddr_to_offset(vma, bp_vaddr);

                        uprobe = find_uprobe(inode, offset);
                }

                if (!uprobe)
                        *is_swbp = is_trap_at_addr(mm, bp_vaddr);
        } else {
                *is_swbp = -EFAULT;
        }

        if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
                mmf_recalc_uprobes(mm);
        mmap_read_unlock(mm);

        return uprobe;
}

/*
 * handler_chain - run every consumer's entry handler for @uprobe
 * @uprobe: the uprobe that was hit
 * @regs:   register state at the probe
 *
 * Holds uprobe->register_rwsem for reading while walking the consumers.
 * 'remove' starts as UPROBE_HANDLER_REMOVE and is ANDed with each
 * consumer's return code (0 for a consumer without ->handler), so it
 * stays set only if every consumer asked for removal. A return probe is
 * prepared when some consumer has a ->ret_handler and removal was not
 * requested.
 */
static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
        struct uprobe_consumer *uc;
        int remove = UPROBE_HANDLER_REMOVE;
        bool need_prep = false; /* prepare return uprobe, when needed */

        down_read(&uprobe->register_rwsem);
        for (uc = uprobe->consumers; uc; uc = uc->next) {
                int rc = 0;

                if (uc->handler) {
                        rc = uc->handler(uc, regs);
                        /* Complain about bits outside UPROBE_HANDLER_MASK. */
                        WARN(rc & ~UPROBE_HANDLER_MASK,
                                "bad rc=0x%x from %ps()\n", rc, uc->handler);
                }

                if (uc->ret_handler)
                        need_prep = true;

                remove &= rc;
        }

        if (need_prep && !remove)
                prepare_uretprobe(uprobe, regs); /* put bp at return */

        /* Every consumer asked for removal: unapply from this mm. */
        if (remove && uprobe->consumers) {
                WARN_ON(!uprobe_is_active(uprobe));
                unapply_uprobe(uprobe, current->mm);
        }
        up_read(&uprobe->register_rwsem);
}

/*
 * Invoke the ->ret_handler of every consumer of ri->uprobe that has one,
 * passing ri->func and @regs. The consumer list is walked with
 * uprobe->register_rwsem held for reading.
 */
static void
handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
{
        struct uprobe *uprobe = ri->uprobe;
        struct uprobe_consumer *consumer;

        down_read(&uprobe->register_rwsem);

        consumer = uprobe->consumers;
        while (consumer) {
                if (consumer->ret_handler)
                        consumer->ret_handler(consumer, ri->func, regs);
                consumer = consumer->next;
        }

        up_read(&uprobe->register_rwsem);
}

/*
 * Skip past the chain that starts at @ri: advance through the list until
 * an instance whose ->chained flag is clear has been passed, and return
 * the element after it (which may be NULL).
 */
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
{
        for (;;) {
                bool was_chained = ri->chained;

                ri = ri->next;        /* can't be NULL if chained */
                if (!was_chained)
                        return ri;
        }
}

/*
 * handle_trampoline - process a hit on the uretprobe trampoline
 * @regs: register state of the current task
 *
 * Consumes return instances chain by chain. For each chain the
 * instruction pointer is set to the recorded original return address,
 * the return handlers run only if the chain is considered valid, and
 * every instance of the chain is freed. Processing stops after the first
 * valid chain. With no utask or no pending return instance, a warning is
 * logged and SIGILL is sent to the task.
 */
static void handle_trampoline(struct pt_regs *regs)
{
        struct uprobe_task *utask;
        struct return_instance *ri, *next;
        bool valid;

        utask = current->utask;
        if (!utask)
                goto sigill;

        ri = utask->return_instances;
        if (!ri)
                goto sigill;

        do {
                /*
                 * We should throw out the frames invalidated by longjmp().
                 * If this chain is valid, then the next one should be alive
                 * or NULL; the latter case means that nobody but ri->func
                 * could hit this trampoline on return. TODO: sigaltstack().
                 */
                next = find_next_ret_chain(ri);
                valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);

                instruction_pointer_set(regs, ri->orig_ret_vaddr);
                /* Free the whole chain; handlers run only for a valid one. */
                do {
                        if (valid)
                                handle_uretprobe_chain(ri, regs);
                        ri = free_ret_instance(ri);
                        utask->depth--;
                } while (ri != next);
        } while (!valid);

        utask->return_instances = ri;
        return;

 sigill:
        uprobe_warn(current, "handle uretprobe, sending SIGILL.");
        force_sig(SIGILL);

}

/*
 * Default (weak) implementation: never ignore a probe hit. handle_swbp()
 * skips the handlers and single-stepping when this returns true.
 */
bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
{
        return false;
}

/*
 * Default (weak) implementation: every return instance is reported as
 * alive, regardless of @ctx and @regs.
 */
bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
                                        struct pt_regs *regs)
{
        return true;
}

/*
 * Run handler and ask thread to singlestep.
 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
 *
 * A hit on the trampoline address is handed to handle_trampoline().
 * When no uprobe matches, SIGTRAP is sent if a trap instruction really
 * is at the address; otherwise the instruction is restarted. The uprobe
 * returned by find_active_uprobe() is released with put_uprobe() on
 * every path except a successful pre_ssout().
 */
static void handle_swbp(struct pt_regs *regs)
{
        struct uprobe *uprobe;
        unsigned long bp_vaddr;
        int is_swbp;

        bp_vaddr = uprobe_get_swbp_addr(regs);
        if (bp_vaddr == get_trampoline_vaddr())
                return handle_trampoline(regs);

        uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
        if (!uprobe) {
                if (is_swbp > 0) {
                        /* No matching uprobe; signal SIGTRAP. */
                        force_sig(SIGTRAP);
                } else {
                        /*
                         * Either we raced with uprobe_unregister() or we can't
                         * access this memory. The latter is only possible if
                         * another thread plays with our ->mm. In both cases
                         * we can simply restart. If this vma was unmapped we
                         * can pretend this insn was not executed yet and get
                         * the (correct) SIGSEGV after restart.
                         */
                        instruction_pointer_set(regs, bp_vaddr);
                }
                return;
        }

        /* change it in advance for ->handler() and restart */
        instruction_pointer_set(regs, bp_vaddr);

        /*
         * TODO: move copy_insn/etc into _register and remove this hack.
         * After we hit the bp, _unregister + _register can install the
         * new and not-yet-analyzed uprobe at the same address, restart.
         */
        if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
                goto out;

        /*
         * Pairs with the smp_wmb() in prepare_uprobe().
         *
         * Guarantees that if we see the UPROBE_COPY_INSN bit set, then
         * we must also see the stores to &uprobe->arch performed by the
         * prepare_uprobe() call.
         */
        smp_rmb();

        /* Tracing handlers use ->utask to communicate with fetch methods */
        if (!get_utask())
                goto out;

        if (arch_uprobe_ignore(&uprobe->arch, regs))
                goto out;

        handler_chain(uprobe, regs);

        if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
                goto out;

        /* On success the reference is kept in utask->active_uprobe. */
        if (!pre_ssout(uprobe, regs, bp_vaddr))
                return;

        /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
out:
        put_uprobe(uprobe);
}

/*
 * Perform required fix-ups and disable singlestep.
 * Allow pending signals to take effect.
 *
 * @utask: current's uprobe_task, with ->active_uprobe set
 * @regs:  register state after the single-step
 *
 * UTASK_SSTEP_ACK runs the arch post-xol fix-up, UTASK_SSTEP_TRAPPED
 * aborts the xol; any other state triggers a one-time warning. The
 * active uprobe reference and the XOL slot are then released. If the
 * post-xol fix-up failed, SIGILL is sent to the task.
 */
static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
{
        struct uprobe *uprobe;
        int err = 0;

        uprobe = utask->active_uprobe;
        if (utask->state == UTASK_SSTEP_ACK)
                err = arch_uprobe_post_xol(&uprobe->arch, regs);
        else if (utask->state == UTASK_SSTEP_TRAPPED)
                arch_uprobe_abort_xol(&uprobe->arch, regs);
        else
                WARN_ON_ONCE(1);

        /* Single-step is over: drop the reference and reset the utask. */
        put_uprobe(uprobe);
        utask->active_uprobe = NULL;
        utask->state = UTASK_RUNNING;
        xol_free_insn_slot(current);

        spin_lock_irq(&current->sighand->siglock);
        recalc_sigpending(); /* see uprobe_deny_signal() */
        spin_unlock_irq(&current->sighand->siglock);

        if (unlikely(err)) {
                uprobe_warn(current, "execute the probed insn, sending SIGILL.");
                force_sig(SIGILL);
        }
}

/*
 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
 * allows the thread to return from interrupt. After that handle_swbp()
 * sets utask->active_uprobe.
 *
 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
 * and allows the thread to return from interrupt.
 *
 * While returning to userspace, thread notices the TIF_UPROBE flag and calls
 * uprobe_notify_resume(), which clears the flag and dispatches on whether
 * a uprobe is currently active for the task.
 */
void uprobe_notify_resume(struct pt_regs *regs)
{
        struct uprobe_task *ut;

        clear_thread_flag(TIF_UPROBE);

        ut = current->utask;
        if (!ut || !ut->active_uprobe) {
                /* No single-step in flight: this is a fresh breakpoint hit. */
                handle_swbp(regs);
                return;
        }

        handle_singlestep(ut, regs);
}

/*
 * uprobe_pre_sstep_notifier gets called from interrupt context as part of
 * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
 */
int uprobe_pre_sstep_notifier(struct pt_regs *regs)
{
        if (!current->mm)
                return 0;

        if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
            (!current->utask || !current->utask->return_instances))
                return 0;

        set_thread_flag(TIF_UPROBE);
        return 1;
}

/*
 * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
 * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
 *
 * Returns 0 if the task is not currently uprobed (no mm, no utask, or no
 * active uprobe). Otherwise marks the state UTASK_SSTEP_ACK and returns 1.
 */
int uprobe_post_sstep_notifier(struct pt_regs *regs)
{
        struct uprobe_task *ut = current->utask;

        /* task is currently not uprobed */
        if (!current->mm)
                return 0;
        if (!ut || !ut->active_uprobe)
                return 0;

        ut->state = UTASK_SSTEP_ACK;
        set_thread_flag(TIF_UPROBE);

        return 1;
}

/*
 * Notifier block registered with register_die_notifier() in uprobes_init();
 * its callback is arch_uprobe_exception_notify().
 */
static struct notifier_block uprobe_exception_nb = {
        .notifier_call                = arch_uprobe_exception_notify,
        .priority                = INT_MAX-1,        /* notified after kprobes, kgdb */
};

/*
 * Boot-time setup: initialise each of the UPROBES_HASH_SZ mutexes in
 * uprobes_mmap_mutex[] and register the die notifier. A failure to
 * register the notifier is fatal (BUG_ON).
 */
void __init uprobes_init(void)
{
        int idx = 0;

        while (idx < UPROBES_HASH_SZ) {
                mutex_init(&uprobes_mmap_mutex[idx]);
                idx++;
        }

        BUG_ON(register_die_notifier(&uprobe_exception_nb));
}


























    2 









    2 






































    2 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * (C) 2008 Krzysztof Piotr Oledzki <ole@ans.pl>
 */

#ifndef _NF_CONNTRACK_ACCT_H
#define _NF_CONNTRACK_ACCT_H
#include <net/net_namespace.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_conntrack_tuple_common.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_extend.h>

/* A pair of 64-bit atomic counters: one for packets, one for bytes. */
struct nf_conn_counter {
        atomic64_t packets;
        atomic64_t bytes;
};

/*
 * Accounting data stored as the NF_CT_EXT_ACCT conntrack extension:
 * IP_CT_DIR_MAX counters (presumably one per connection direction).
 */
struct nf_conn_acct {
        struct nf_conn_counter counter[IP_CT_DIR_MAX];
};

/*
 * Look up the NF_CT_EXT_ACCT extension of @ct through nf_ct_ext_find()
 * and return its result unchanged.
 * NOTE(review): presumably NULL when the extension is absent; confirm
 * against nf_ct_ext_find().
 */
static inline
struct nf_conn_acct *nf_conn_acct_find(const struct nf_conn *ct)
{
        return nf_ct_ext_find(ct, NF_CT_EXT_ACCT);
}

/*
 * nf_ct_acct_ext_add - attach the accounting extension to a conntrack
 * @ct:  conntrack entry to extend
 * @gfp: allocation flags passed to nf_ct_ext_add()
 *
 * Returns the new extension, or NULL when accounting is disabled for the
 * entry's network namespace (net->ct.sysctl_acct == 0), when
 * nf_ct_ext_add() fails, or when CONFIG_NF_CONNTRACK is not enabled.
 */
static inline
struct nf_conn_acct *nf_ct_acct_ext_add(struct nf_conn *ct, gfp_t gfp)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        struct net *net = nf_ct_net(ct);
        struct nf_conn_acct *acct;

        if (!net->ct.sysctl_acct)
                return NULL;

        acct = nf_ct_ext_add(ct, NF_CT_EXT_ACCT, gfp);
        if (!acct)
                /* Terminate the message so it is emitted as a complete line. */
                pr_debug("failed to add accounting extension area\n");

        return acct;
#else
        return NULL;
#endif
}

/*
 * Report whether connection tracking accounting is switched on for @net.
 * Always false when CONFIG_NF_CONNTRACK is not enabled.
 */
static inline bool nf_ct_acct_enabled(struct net *net)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        if (net->ct.sysctl_acct)
                return true;
#endif
        return false;
}

/*
 * Enable/disable connection tracking accounting for @net by storing
 * @enable in net->ct.sysctl_acct. Does nothing when CONFIG_NF_CONNTRACK
 * is not enabled.
 */
static inline void nf_ct_set_acct(struct net *net, bool enable)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        net->ct.sysctl_acct = enable;
#endif
}

/*
 * Implemented outside this header.
 * NOTE(review): presumably adds @packets and @bytes to the counters of
 * @ct for direction @dir; confirm against the definition.
 */
void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
                    unsigned int bytes);

/*
 * Account a single packet of @bytes bytes: forwards to nf_ct_acct_add()
 * with a packet count of 1. Does nothing when CONFIG_NF_CONNTRACK is not
 * enabled.
 */
static inline void nf_ct_acct_update(struct nf_conn *ct, u32 dir,
                                     unsigned int bytes)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        nf_ct_acct_add(ct, dir, 1, bytes);
#endif
}

/*
 * Implemented outside this header.
 * NOTE(review): presumably sets up accounting state for the network
 * namespace @net; confirm against the definition.
 */
void nf_conntrack_acct_pernet_init(struct net *net);

#endif /* _NF_CONNTRACK_ACCT_H */





























































































































































































































































































































    1 





    1 





















    1 



    1 
    1 




































































































































































































































































































































































































































































































































































































































































































































































    1 










































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  net/dccp/proto.c
 *
 *  An implementation of the DCCP protocol
 *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 */

#include <linux/dccp.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/slab.h>
#include <net/checksum.h>

#include <net/inet_sock.h>
#include <net/inet_common.h>
#include <net/sock.h>
#include <net/xfrm.h>

#include <asm/ioctls.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/delay.h>
#include <linux/poll.h>

#include "ccid.h"
#include "dccp.h"
#include "feat.h"

#define CREATE_TRACE_POINTS
#include "trace.h"

/* DCCP MIB statistics, updated via DCCP_INC_STATS()/DCCP_DEC_STATS(). */
DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;

EXPORT_SYMBOL_GPL(dccp_statistics);

/* Per-CPU counter; NOTE(review): presumably counts orphaned DCCP sockets. */
DEFINE_PER_CPU(unsigned int, dccp_orphan_count);
EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count);

/* Hash tables for DCCP sockets, exported to other GPL modules. */
struct inet_hashinfo dccp_hashinfo;
EXPORT_SYMBOL_GPL(dccp_hashinfo);

/* the maximum queue length for tx in packets. 0 is no limit */
int sysctl_dccp_tx_qlen __read_mostly = 5;

#ifdef CONFIG_IP_DCCP_DEBUG
/*
 * dccp_state_name - printable name of a DCCP socket state (debug builds)
 * @state: numeric socket state
 *
 * Returns "INVALID STATE!" for any value outside [0, DCCP_MAX_STATES).
 * @state is a signed int, so negative values are rejected explicitly;
 * otherwise they would index in front of the table. In-range states that
 * have no entry in the table yield NULL.
 */
static const char *dccp_state_name(const int state)
{
        static const char *const dccp_state_names[] = {
        [DCCP_OPEN]                = "OPEN",
        [DCCP_REQUESTING]        = "REQUESTING",
        [DCCP_PARTOPEN]                = "PARTOPEN",
        [DCCP_LISTEN]                = "LISTEN",
        [DCCP_RESPOND]                = "RESPOND",
        [DCCP_CLOSING]                = "CLOSING",
        [DCCP_ACTIVE_CLOSEREQ]        = "CLOSEREQ",
        [DCCP_PASSIVE_CLOSE]        = "PASSIVE_CLOSE",
        [DCCP_PASSIVE_CLOSEREQ]        = "PASSIVE_CLOSEREQ",
        [DCCP_TIME_WAIT]        = "TIME_WAIT",
        [DCCP_CLOSED]                = "CLOSED",
        };

        if (state < 0 || state >= DCCP_MAX_STATES)
                return "INVALID STATE!";

        return dccp_state_names[state];
}
#endif

/*
 * dccp_set_state  -  move a socket into a new DCCP state
 * @sk:    socket whose state changes
 * @state: state to enter
 *
 * Keeps the CURRESTAB / ESTABRESETS MIB counters in step with the
 * transition and, when entering DCCP_CLOSED, unhashes the socket and
 * releases its port (unless the port is locked by SOCK_BINDPORT_LOCK)
 * before the new state is stored.
 */
void dccp_set_state(struct sock *sk, const int state)
{
        const int prev = sk->sk_state;

        dccp_pr_debug("%s(%p)  %s  -->  %s\n", dccp_role(sk), sk,
                      dccp_state_name(prev), dccp_state_name(state));
        WARN_ON(state == prev);

        if (state == DCCP_OPEN) {
                if (prev != DCCP_OPEN)
                        DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
                /* Client retransmits all Confirm options until entering OPEN */
                if (prev == DCCP_PARTOPEN)
                        dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg);
        } else {
                if (state == DCCP_CLOSED) {
                        if (prev == DCCP_OPEN ||
                            prev == DCCP_ACTIVE_CLOSEREQ ||
                            prev == DCCP_CLOSING)
                                DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);

                        sk->sk_prot->unhash(sk);
                        if (inet_csk(sk)->icsk_bind_hash &&
                            !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
                                inet_put_port(sk);
                }

                /* leaving OPEN for any other state ends an established connection */
                if (prev == DCCP_OPEN)
                        DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
        }

        /* Change state AFTER socket is unhashed to avoid closed
         * socket sitting in hash tables.
         */
        inet_sk_set_state(sk, state);
}

EXPORT_SYMBOL_GPL(dccp_set_state);

/*
 * Complete a close that was started by the peer. Does nothing unless the
 * socket is in DCCP_PASSIVE_CLOSE or DCCP_PASSIVE_CLOSEREQ.
 */
static void dccp_finish_passive_close(struct sock *sk)
{
        const int cur = sk->sk_state;

        if (cur == DCCP_PASSIVE_CLOSE) {
                /* Node (client or server) has received Close packet. */
                dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
                dccp_set_state(sk, DCCP_CLOSED);
        } else if (cur == DCCP_PASSIVE_CLOSEREQ) {
                /*
                 * Client received CloseReq. The `active' flag is set so that
                 * dccp_send_close() retransmits the Close as per RFC 4340, 8.3.
                 */
                dccp_send_close(sk, 1);
                dccp_set_state(sk, DCCP_CLOSING);
        }
}

/*
 * dccp_done  -  final step of connection teardown
 * @sk: socket to finish
 *
 * Enters DCCP_CLOSED, stops the transmit timers and marks both directions
 * as shut down. A socket already flagged SOCK_DEAD is destroyed here;
 * otherwise its sk_state_change callback is invoked.
 */
void dccp_done(struct sock *sk)
{
        dccp_set_state(sk, DCCP_CLOSED);
        dccp_clear_xmit_timers(sk);

        sk->sk_shutdown = SHUTDOWN_MASK;

        if (sock_flag(sk, SOCK_DEAD)) {
                inet_csk_destroy_sock(sk);
                return;
        }

        sk->sk_state_change(sk);
}

EXPORT_SYMBOL_GPL(dccp_done);

/*
 * dccp_packet_name  -  printable name of a DCCP packet type
 * @type: packet type value
 *
 * Returns a pointer to a static string; any @type outside the range
 * [0, DCCP_NR_PKT_TYPES) is reported as "INVALID".
 */
const char *dccp_packet_name(const int type)
{
        static const char *const dccp_packet_names[] = {
                [DCCP_PKT_REQUEST]  = "REQUEST",
                [DCCP_PKT_RESPONSE] = "RESPONSE",
                [DCCP_PKT_DATA]            = "DATA",
                [DCCP_PKT_ACK]            = "ACK",
                [DCCP_PKT_DATAACK]  = "DATAACK",
                [DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
                [DCCP_PKT_CLOSE]    = "CLOSE",
                [DCCP_PKT_RESET]    = "RESET",
                [DCCP_PKT_SYNC]            = "SYNC",
                [DCCP_PKT_SYNCACK]  = "SYNCACK",
        };

        /* @type is signed: a negative value must not be used as an index */
        if (type < 0 || type >= DCCP_NR_PKT_TYPES)
                return "INVALID";

        return dccp_packet_names[type];
}

EXPORT_SYMBOL_GPL(dccp_packet_name);

/*
 * dccp_destruct_common  -  release the sender-side CCID of a socket
 * @sk: socket being destructed
 *
 * The pointer is cleared afterwards so that it is not left dangling.
 */
void dccp_destruct_common(struct sock *sk)
{
        struct dccp_sock *dccp = dccp_sk(sk);

        ccid_hc_tx_delete(dccp->dccps_hc_tx_ccid, sk);
        dccp->dccps_hc_tx_ccid = NULL;
}
EXPORT_SYMBOL_GPL(dccp_destruct_common);

/*
 * Socket destructor installed as sk->sk_destruct by dccp_init_sock():
 * runs the DCCP-specific cleanup in dccp_destruct_common() and then
 * hands over to inet_sock_destruct().
 */
static void dccp_sk_destruct(struct sock *sk)
{
        dccp_destruct_common(sk);
        inet_sock_destruct(sk);
}

/*
 * dccp_init_sock  -  set up the DCCP-specific defaults of a new socket
 * @sk:                   socket to initialise
 * @ctl_sock_initialized: zero while the control socket itself is being set
 *                        up, in which case feature negotiation is skipped
 *
 * Returns 0, or the result of dccp_feat_init() when feature negotiation
 * state is initialised.
 */
int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct dccp_sock *dp = dccp_sk(sk);

        pr_warn_once("DCCP is deprecated and scheduled to be removed in 2025, "
                     "please contact the netdev mailing list\n");

        /* generic socket fields and callbacks */
        sk->sk_state = DCCP_CLOSED;
        sk->sk_write_space = dccp_write_space;
        sk->sk_destruct = dccp_sk_destruct;

        /* connection-socket defaults */
        icsk->icsk_rto = DCCP_TIMEOUT_INIT;
        icsk->icsk_syn_retries = sysctl_dccp_request_retries;
        icsk->icsk_sync_mss = dccp_sync_mss;

        /* DCCP-private defaults */
        dp->dccps_mss_cache = 536;
        dp->dccps_rate_last = jiffies;
        dp->dccps_role = DCCP_ROLE_UNDEFINED;
        dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT;
        dp->dccps_tx_qlen = sysctl_dccp_tx_qlen;

        dccp_init_xmit_timers(sk);

        INIT_LIST_HEAD(&dp->dccps_featneg);

        /* control socket doesn't need feat nego */
        if (unlikely(!ctl_sock_initialized))
                return 0;

        return dccp_feat_init(sk);
}

EXPORT_SYMBOL_GPL(dccp_init_sock);

/*
 * dccp_destroy_sock  -  free the DCCP-private resources of a socket
 * @sk: socket being destroyed
 *
 * Drops pending transmit data, the bind bucket reference, the service
 * list, the receiver Ack Vector and CCID, and any feature-negotiation
 * entries. Each freed pointer is reset to NULL.
 */
void dccp_destroy_sock(struct sock *sk)
{
        struct dccp_sock *dp = dccp_sk(sk);

        /* nothing queued for transmission can be sent any more */
        __skb_queue_purge(&sk->sk_write_queue);
        if (sk->sk_send_head) {
                kfree_skb(sk->sk_send_head);
                sk->sk_send_head = NULL;
        }

        /* Clean up a referenced DCCP bind bucket. */
        if (inet_csk(sk)->icsk_bind_hash)
                inet_put_port(sk);

        kfree(dp->dccps_service_list);
        dp->dccps_service_list = NULL;

        /* receiver side: Ack Vector first, then the RX CCID */
        if (dp->dccps_hc_rx_ackvec) {
                dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
                dp->dccps_hc_rx_ackvec = NULL;
        }
        ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
        dp->dccps_hc_rx_ccid = NULL;

        /* clean up feature negotiation state */
        dccp_feat_list_purge(&dp->dccps_featneg);
}

EXPORT_SYMBOL_GPL(dccp_destroy_sock);

/*
 * Returns 0 for the DCCP_CLOSED, DCCP_LISTEN and DCCP_REQUESTING states
 * and 1 for every other state.
 */
static inline int dccp_need_reset(int state)
{
        switch (state) {
        case DCCP_CLOSED:
        case DCCP_LISTEN:
        case DCCP_REQUESTING:
                return 0;
        default:
                return 1;
        }
}

/*
 * dccp_disconnect  -  abort the connection and reset the socket for reuse
 * @sk:    socket to disconnect
 * @flags: not used by this function
 *
 * Moves the socket to DCCP_CLOSED, stops listening or sends an "Aborted"
 * Reset depending on the previous state, and then clears timers, the RX
 * CCID, queued skbs, the destination port, shutdown state and the cached
 * route. Always returns 0; the error (if any) is left in sk->sk_err and
 * reported through sk_error_report().
 *
 * NOTE(review): the unlocked __skb_queue_purge() calls suggest the caller
 * holds the socket lock - confirm against callers.
 */
int dccp_disconnect(struct sock *sk, int flags)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct inet_sock *inet = inet_sk(sk);
        struct dccp_sock *dp = dccp_sk(sk);
        /* sampled before the state change below so the old state drives the cleanup */
        const int old_state = sk->sk_state;

        if (old_state != DCCP_CLOSED)
                dccp_set_state(sk, DCCP_CLOSED);

        /*
         * This corresponds to the ABORT function of RFC793, sec. 3.8
         * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted".
         */
        if (old_state == DCCP_LISTEN) {
                inet_csk_listen_stop(sk);
        } else if (dccp_need_reset(old_state)) {
                dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
                sk->sk_err = ECONNRESET;
        } else if (old_state == DCCP_REQUESTING)
                /* no Reset is sent in this state, but the error is still recorded */
                sk->sk_err = ECONNRESET;

        dccp_clear_xmit_timers(sk);
        ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
        dp->dccps_hc_rx_ccid = NULL;

        /* discard everything still queued in either direction */
        __skb_queue_purge(&sk->sk_receive_queue);
        __skb_queue_purge(&sk->sk_write_queue);
        if (sk->sk_send_head != NULL) {
                __kfree_skb(sk->sk_send_head);
                sk->sk_send_head = NULL;
        }

        inet->inet_dport = 0;

        inet_bhash2_reset_saddr(sk);

        sk->sk_shutdown = 0;
        sock_reset_flag(sk, SOCK_DONE);

        icsk->icsk_backoff = 0;
        inet_csk_delack_init(sk);
        __sk_dst_reset(sk);

        /* a socket that still has a local port number must still own a bind bucket */
        WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);

        sk_error_report(sk);
        return 0;
}

EXPORT_SYMBOL_GPL(dccp_disconnect);

/*
 *        Wait for a DCCP event.
 *
 *        Note that we don't need to lock the socket, as the upper poll layers
 *        take care of normal races (between the test and the event) and we don't
 *        go look at any of the socket buffers directly.
 *
 *        Returns the mask of currently pending poll events. A listening
 *        socket is handled entirely by inet_csk_listen_poll().
 */
__poll_t dccp_poll(struct file *file, struct socket *sock,
                       poll_table *wait)
{
        struct sock *sk = sock->sk;
        __poll_t mask;
        u8 shutdown;
        int state;

        sock_poll_wait(file, sock, wait);

        state = inet_sk_state_load(sk);
        if (state == DCCP_LISTEN)
                return inet_csk_listen_poll(sk);

        /* Socket is not locked. We are protected from async events
           by poll logic, and correct handling of state changes
           made by other threads is impossible in any case.
         */

        mask = 0;
        if (READ_ONCE(sk->sk_err))
                mask = EPOLLERR;
        /* one snapshot of sk_shutdown is used for all tests below */
        shutdown = READ_ONCE(sk->sk_shutdown);

        if (shutdown == SHUTDOWN_MASK || state == DCCP_CLOSED)
                mask |= EPOLLHUP;
        if (shutdown & RCV_SHUTDOWN)
                mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

        /* Connected? */
        if ((1 << state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
                /* readable as soon as any receive memory is charged to the socket */
                if (atomic_read(&sk->sk_rmem_alloc) > 0)
                        mask |= EPOLLIN | EPOLLRDNORM;

                if (!(shutdown & SEND_SHUTDOWN)) {
                        if (sk_stream_is_writeable(sk)) {
                                mask |= EPOLLOUT | EPOLLWRNORM;
                        } else {  /* send SIGIO later */
                                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

                                /* Race breaker. If space is freed after
                                 * wspace test but before the flags are set,
                                 * IO signal will be lost.
                                 */
                                if (sk_stream_is_writeable(sk))
                                        mask |= EPOLLOUT | EPOLLWRNORM;
                        }
                }
        }
        return mask;
}
EXPORT_SYMBOL_GPL(dccp_poll);

/*
 * dccp_ioctl  -  DCCP handler for the SIOCOUTQ and SIOCINQ ioctls
 * @sk:   socket the ioctl is issued on
 * @cmd:  ioctl command
 * @karg: kernel-space location that receives the result
 *
 * Returns 0 on success, -ENOTCONN on a listening socket and -ENOIOCTLCMD
 * for any other command. The socket lock is held for the duration.
 */
int dccp_ioctl(struct sock *sk, int cmd, int *karg)
{
        struct sk_buff *head;
        int rc;

        lock_sock(sk);

        if (sk->sk_state == DCCP_LISTEN) {
                rc = -ENOTCONN;
                goto unlock;
        }

        if (cmd == SIOCOUTQ) {
                /*
                 * sk_wmem_alloc is reported because sk_wmem_queued is not
                 * used by DCCP and always 0, comparably to UDP.
                 */
                *karg = sk_wmem_alloc_get(sk);
                rc = 0;
        } else if (cmd == SIOCINQ) {
                /*
                 * Only the length of the packet at the head of the receive
                 * queue is returned, since that is all that will be read.
                 */
                head = skb_peek(&sk->sk_receive_queue);
                *karg = head ? head->len : 0;
                rc = 0;
        } else {
                rc = -ENOIOCTLCMD;
        }

unlock:
        release_sock(sk);
        return rc;
}

EXPORT_SYMBOL_GPL(dccp_ioctl);

/*
 * dccp_setsockopt_service  -  set the service code and optional service list
 * @sk:      socket to configure
 * @service: primary service code (first 32-bit word of the option value,
 *           already fetched by the caller)
 * @optval:  complete user-supplied option value
 * @optlen:  length of @optval in bytes
 *
 * Everything in @optval beyond the first word is copied into a newly
 * allocated service list that replaces the previous one.
 *
 * Returns 0 on success, -EINVAL for an invalid @service or an over-long
 * option, -ENOMEM if the list cannot be allocated, and -EFAULT if the copy
 * fails or the list contains DCCP_SERVICE_INVALID_VALUE.
 */
static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
                                   sockptr_t optval, unsigned int optlen)
{
        struct dccp_service_list *new_list = NULL;
        struct dccp_sock *dp = dccp_sk(sk);

        if (service == DCCP_SERVICE_INVALID_VALUE)
                return -EINVAL;
        if (optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32))
                return -EINVAL;

        if (optlen > sizeof(service)) {
                new_list = kmalloc(optlen, GFP_KERNEL);
                if (!new_list)
                        return -ENOMEM;

                /* the first word is the primary code, the rest are list entries */
                new_list->dccpsl_nr = optlen / sizeof(u32) - 1;

                if (copy_from_sockptr_offset(new_list->dccpsl_list, optval,
                                             sizeof(service),
                                             optlen - sizeof(service)))
                        goto bad_list;
                if (dccp_list_has_service(new_list, DCCP_SERVICE_INVALID_VALUE))
                        goto bad_list;
        }

        /* swap in the new settings under the socket lock */
        lock_sock(sk);
        dp->dccps_service = service;
        kfree(dp->dccps_service_list);
        dp->dccps_service_list = new_list;
        release_sock(sk);
        return 0;

bad_list:
        kfree(new_list);
        return -EFAULT;
}

/*
 * dccp_setsockopt_cscov  -  request a minimum checksum coverage value
 * @sk:    socket to configure
 * @cscov: requested coverage value, 0..15
 * @rx:    true for the receive direction, false for the send direction
 *
 * Registers the values @cscov..15 for negotiation of the
 * DCCPF_MIN_CSUM_COVER feature and, on success, records @cscov in
 * dccps_pcrlen (@rx) or dccps_pcslen (!@rx).
 *
 * Returns 0 on success (a @cscov of 0 is accepted without any further
 * action), -EINVAL if @cscov is out of range, -ENOBUFS if the temporary
 * list cannot be allocated, or the error from dccp_feat_register_sp().
 */
static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
{
        u8 *list, len;
        int i, rc;

        if (cscov < 0 || cscov > 15)
                return -EINVAL;
        /*
         * Populate a list of permissible values, in the range cscov...15. This
         * is necessary since feature negotiation of single values only works if
         * both sides incidentally choose the same value. Since the list starts
         * lowest-value first, negotiation will pick the smallest shared value.
         */
        if (cscov == 0)
                return 0;
        len = 16 - cscov;

        list = kmalloc(len, GFP_KERNEL);
        if (list == NULL)
                return -ENOBUFS;

        /*
         * Fill the list without modifying cscov: the requested value is
         * still needed below. (Incrementing cscov here would leave it at
         * 16 after the loop, so the wrong value would be recorded.)
         */
        for (i = 0; i < len; i++)
                list[i] = cscov + i;

        rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);

        if (rc == 0) {
                if (rx)
                        dccp_sk(sk)->dccps_pcrlen = cscov;
                else
                        dccp_sk(sk)->dccps_pcslen = cscov;
        }
        kfree(list);
        return rc;
}

/*
 * dccp_setsockopt_ccid  -  register a preference list of CCIDs
 * @sk:     socket to configure
 * @type:   DCCP_SOCKOPT_TX_CCID, DCCP_SOCKOPT_RX_CCID or DCCP_SOCKOPT_CCID
 *          (the latter applies the list to both directions)
 * @optval: user-supplied array of one-byte CCID values
 * @optlen: number of bytes in @optval, 1..DCCP_FEAT_MAX_SP_VALS
 *
 * Returns 0 on success, -EINVAL for a bad length, the error from
 * memdup_sockptr(), or the error from dccp_feat_register_sp().
 */
static int dccp_setsockopt_ccid(struct sock *sk, int type,
                                sockptr_t optval, unsigned int optlen)
{
        const bool set_tx = type == DCCP_SOCKOPT_TX_CCID ||
                            type == DCCP_SOCKOPT_CCID;
        const bool set_rx = type == DCCP_SOCKOPT_RX_CCID ||
                            type == DCCP_SOCKOPT_CCID;
        u8 *ccids;
        int rc = 0;

        if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
                return -EINVAL;

        ccids = memdup_sockptr(optval, optlen);
        if (IS_ERR(ccids))
                return PTR_ERR(ccids);

        lock_sock(sk);
        if (set_tx)
                rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, ccids, optlen);

        /* the RX registration is skipped once the TX one has failed */
        if (rc == 0 && set_rx)
                rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, ccids, optlen);
        release_sock(sk);

        kfree(ccids);
        return rc;
}

/*
 * do_dccp_setsockopt  -  handle setsockopt() at level SOL_DCCP
 * @sk:      socket to configure
 * @level:   option level (not examined here)
 * @optname: DCCP_SOCKOPT_* option
 * @optval:  user-supplied option value
 * @optlen:  length of @optval in bytes
 *
 * Deprecated options are accepted with a warning. The CCID options take a
 * byte array; every other option needs at least an int-sized value.
 *
 * Returns 0 on success or a negative errno (-ENOPROTOOPT for an unknown
 * option).
 */
static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
                sockptr_t optval, unsigned int optlen)
{
        struct dccp_sock *dp = dccp_sk(sk);
        int err = 0;
        int val;

        /* options that do not carry an integer argument */
        switch (optname) {
        case DCCP_SOCKOPT_CHANGE_L:
        case DCCP_SOCKOPT_CHANGE_R:
                DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
                return 0;
        case DCCP_SOCKOPT_PACKET_SIZE:
                DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
                return 0;
        case DCCP_SOCKOPT_TX_CCID:
        case DCCP_SOCKOPT_RX_CCID:
        case DCCP_SOCKOPT_CCID:
                return dccp_setsockopt_ccid(sk, optname, optval, optlen);
        }

        if (optlen < (int)sizeof(int))
                return -EINVAL;

        if (copy_from_sockptr(&val, optval, sizeof(int)))
                return -EFAULT;

        /* the service option does its own locking */
        if (optname == DCCP_SOCKOPT_SERVICE)
                return dccp_setsockopt_service(sk, val, optval, optlen);

        lock_sock(sk);
        switch (optname) {
        case DCCP_SOCKOPT_RECV_CSCOV:
                err = dccp_setsockopt_cscov(sk, val, true);
                break;
        case DCCP_SOCKOPT_SEND_CSCOV:
                err = dccp_setsockopt_cscov(sk, val, false);
                break;
        case DCCP_SOCKOPT_SERVER_TIMEWAIT:
                if (dp->dccps_role == DCCP_ROLE_SERVER)
                        dp->dccps_server_timewait = !!val;
                else
                        err = -EOPNOTSUPP;
                break;
        case DCCP_SOCKOPT_QPOLICY_TXQLEN:
                if (val >= 0)
                        dp->dccps_tx_qlen = val;
                else
                        err = -EINVAL;
                break;
        case DCCP_SOCKOPT_QPOLICY_ID:
                /* the queueing policy can only be chosen on a closed socket */
                if (sk->sk_state != DCCP_CLOSED)
                        err = -EISCONN;
                else if (val < 0 || val >= DCCPQ_POLICY_MAX)
                        err = -EINVAL;
                else
                        dp->dccps_qpolicy = val;
                break;
        default:
                err = -ENOPROTOOPT;
                break;
        }
        release_sock(sk);

        return err;
}

/*
 * dccp_setsockopt  -  setsockopt() entry point for DCCP sockets
 *
 * SOL_DCCP options are handled by do_dccp_setsockopt(); every other level
 * is passed to the address-family specific setsockopt operation.
 */
int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                    unsigned int optlen)
{
        if (level == SOL_DCCP)
                return do_dccp_setsockopt(sk, level, optname, optval, optlen);

        return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, optname,
                                                     optval, optlen);
}

EXPORT_SYMBOL_GPL(dccp_setsockopt);

/*
 * dccp_getsockopt_service  -  report the service code and service list
 * @sk:     socket to query
 * @len:    size of the user buffer in bytes
 * @optval: user buffer; receives the service code followed by the list
 * @optlen: user location that receives the number of bytes written
 *
 * Returns 0 on success, -EINVAL if the buffer is too small for the service
 * code plus list, or -EFAULT if writing to user space fails.
 */
static int dccp_getsockopt_service(struct sock *sk, int len,
                                   __be32 __user *optval,
                                   int __user *optlen)
{
        const struct dccp_sock *dp = dccp_sk(sk);
        const struct dccp_service_list *list;
        int list_bytes = 0, needed = sizeof(u32);
        int err;

        lock_sock(sk);

        list = dp->dccps_service_list;
        if (list) {
                list_bytes = list->dccpsl_nr * sizeof(u32);
                needed += list_bytes;
        }

        if (needed > len) {
                err = -EINVAL;
                goto unlock;
        }

        /* any failing user-space access below ends with -EFAULT */
        err = -EFAULT;
        if (put_user(needed, optlen))
                goto unlock;
        if (put_user(dp->dccps_service, optval))
                goto unlock;
        if (list && copy_to_user(optval + 1, list->dccpsl_list, list_bytes))
                goto unlock;
        err = 0;

unlock:
        release_sock(sk);
        return err;
}

/*
 * do_dccp_getsockopt  -  handle getsockopt() at level SOL_DCCP
 * @sk:      socket to query
 * @level:   option level (not examined here)
 * @optname: DCCP_SOCKOPT_* option, or a CCID-specific option in 128..255
 * @optval:  user buffer for the result
 * @optlen:  user location holding the buffer size on entry; for the
 *           int-valued options handled here it is set to sizeof(int)
 *
 * Returns 0 on success, -EFAULT if user memory cannot be accessed, -EINVAL
 * if the buffer is smaller than an int, -ENOPROTOOPT for an unknown option
 * or when no current CCID can be reported, or whatever the delegated
 * handler returns.
 */
static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        struct dccp_sock *dp;
        int val, len;

        if (get_user(len, optlen))
                return -EFAULT;

        /* every option needs room for at least one int */
        if (len < (int)sizeof(int))
                return -EINVAL;

        dp = dccp_sk(sk);

        /*
         * Cases that "break" leave an int in val, which is copied out below;
         * the remaining cases return directly.
         */
        switch (optname) {
        case DCCP_SOCKOPT_PACKET_SIZE:
                DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
                return 0;
        case DCCP_SOCKOPT_SERVICE:
                return dccp_getsockopt_service(sk, len,
                                               (__be32 __user *)optval, optlen);
        case DCCP_SOCKOPT_GET_CUR_MPS:
                val = READ_ONCE(dp->dccps_mss_cache);
                break;
        case DCCP_SOCKOPT_AVAILABLE_CCIDS:
                return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
        case DCCP_SOCKOPT_TX_CCID:
                val = ccid_get_current_tx_ccid(dp);
                if (val < 0)
                        return -ENOPROTOOPT;
                break;
        case DCCP_SOCKOPT_RX_CCID:
                val = ccid_get_current_rx_ccid(dp);
                if (val < 0)
                        return -ENOPROTOOPT;
                break;
        case DCCP_SOCKOPT_SERVER_TIMEWAIT:
                val = dp->dccps_server_timewait;
                break;
        case DCCP_SOCKOPT_SEND_CSCOV:
                val = dp->dccps_pcslen;
                break;
        case DCCP_SOCKOPT_RECV_CSCOV:
                val = dp->dccps_pcrlen;
                break;
        case DCCP_SOCKOPT_QPOLICY_ID:
                val = dp->dccps_qpolicy;
                break;
        case DCCP_SOCKOPT_QPOLICY_TXQLEN:
                val = dp->dccps_tx_qlen;
                break;
        /* option numbers 128..191 are forwarded to the RX CCID */
        case 128 ... 191:
                return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
                                             len, (u32 __user *)optval, optlen);
        /* option numbers 192..255 are forwarded to the TX CCID */
        case 192 ... 255:
                return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname,
                                             len, (u32 __user *)optval, optlen);
        default:
                return -ENOPROTOOPT;
        }

        /* common tail for the int-valued options */
        len = sizeof(val);
        if (put_user(len, optlen) || copy_to_user(optval, &val, len))
                return -EFAULT;

        return 0;
}

/*
 * dccp_getsockopt  -  getsockopt() entry point for DCCP sockets
 *
 * SOL_DCCP options are handled by do_dccp_getsockopt(); every other level
 * is passed to the address-family specific getsockopt operation.
 */
int dccp_getsockopt(struct sock *sk, int level, int optname,
                    char __user *optval, int __user *optlen)
{
        if (level == SOL_DCCP)
                return do_dccp_getsockopt(sk, level, optname, optval, optlen);

        return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, optname,
                                                     optval, optlen);
}

EXPORT_SYMBOL_GPL(dccp_getsockopt);

/*
 * dccp_msghdr_parse  -  apply the SOL_DCCP control messages of a sendmsg()
 * @msg: message header whose ancillary data is scanned
 * @skb: packet being built; its priority field receives the result
 *
 * Control messages of other levels are skipped. Returns 0 on success and
 * -EINVAL for a malformed header, a parameter rejected by the queueing
 * policy, an unknown SOL_DCCP type or a wrongly sized priority value.
 */
static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
{
        struct cmsghdr *hdr;

        /*
         * skb->priority carries an (opaque) qpolicy priority value.
         *
         * The field is normally used for the SO_PRIORITY option and is
         * initialised from sk_priority. Since that assignment happens
         * later (on layer 3), the field is overloaded for queueing
         * priorities as long as the skb is on layer 4. If no priority is
         * supplied the value stays at its default of 0.
         */
        skb->priority = 0;

        for_each_cmsghdr(hdr, msg) {
                if (!CMSG_OK(msg, hdr))
                        return -EINVAL;

                if (hdr->cmsg_level != SOL_DCCP)
                        continue;

                if (hdr->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
                    !dccp_qpolicy_param_ok(skb->sk, hdr->cmsg_type))
                        return -EINVAL;

                /* DCCP_SCM_PRIORITY is the only type understood here */
                if (hdr->cmsg_type != DCCP_SCM_PRIORITY)
                        return -EINVAL;

                if (hdr->cmsg_len != CMSG_LEN(sizeof(__u32)))
                        return -EINVAL;

                skb->priority = *(__u32 *)CMSG_DATA(hdr);
        }

        return 0;
}

/*
 * dccp_sendmsg  -  queue one datagram for transmission
 * @sk:  socket to send on
 * @msg: message header carrying the payload and optional control messages
 * @len: payload length in bytes
 *
 * The payload must fit into a single packet (at most dccps_mss_cache
 * bytes). Waits for the connection to be established if necessary,
 * builds one skb, hands it to the queueing policy and starts transmission
 * unless the xmit timer is pending.
 *
 * Returns @len on success or a negative errno, e.g. -EMSGSIZE (payload too
 * large), -EAGAIN (queueing policy reports the queue as full), -ENOTCONN
 * (socket is closed), or the error from waiting, allocation, copying or
 * control-message parsing.
 */
int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
        const struct dccp_sock *dp = dccp_sk(sk);
        const int flags = msg->msg_flags;
        const int noblock = flags & MSG_DONTWAIT;
        struct sk_buff *skb;
        int rc, size;
        long timeo;

        trace_dccp_probe(sk, len);

        /* early, lockless size check; repeated below once the lock is held */
        if (len > READ_ONCE(dp->dccps_mss_cache))
                return -EMSGSIZE;

        lock_sock(sk);

        timeo = sock_sndtimeo(sk, noblock);

        /*
         * We have to use sk_stream_wait_connect here to set sk_write_pending,
         * which the trick in dccp_rcv_request_sent_state_process relies on
         * (NOTE(review): inferred from the original, truncated comment -
         * confirm against that function).
         */
        /* Wait for a connection to finish. */
        if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
                if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
                        goto out_release;

        size = sk->sk_prot->max_header + len;
        /* the socket lock is dropped around the skb allocation */
        release_sock(sk);
        skb = sock_alloc_send_skb(sk, size, noblock, &rc);
        lock_sock(sk);
        if (skb == NULL)
                goto out_release;

        if (dccp_qpolicy_full(sk)) {
                rc = -EAGAIN;
                goto out_discard;
        }

        /* the state is checked again after the lock has been re-taken */
        if (sk->sk_state == DCCP_CLOSED) {
                rc = -ENOTCONN;
                goto out_discard;
        }

        /* We need to check dccps_mss_cache after socket is locked. */
        if (len > dp->dccps_mss_cache) {
                rc = -EMSGSIZE;
                goto out_discard;
        }

        skb_reserve(skb, sk->sk_prot->max_header);
        rc = memcpy_from_msg(skb_put(skb, len), msg, len);
        if (rc != 0)
                goto out_discard;

        rc = dccp_msghdr_parse(msg, skb);
        if (rc != 0)
                goto out_discard;

        dccp_qpolicy_push(sk, skb);
        /*
         * The xmit_timer is set if the TX CCID is rate-based and will expire
         * when congestion control permits to release further packets into the
         * network. Window-based CCIDs do not use this timer.
         */
        if (!timer_pending(&dp->dccps_xmit_timer))
                dccp_write_xmit(sk);
out_release:
        release_sock(sk);
        /* rc is 0 on the success path, in which case the payload length is returned */
        return rc ? : len;
out_discard:
        kfree_skb(skb);
        goto out_release;
}

EXPORT_SYMBOL_GPL(dccp_sendmsg);

/*
 * dccp_recvmsg  -  receive one datagram
 * @sk:       socket to read from
 * @msg:      message header that receives the payload
 * @len:      size of the caller's buffer in bytes
 * @flags:    MSG_* flags; MSG_DONTWAIT, MSG_PEEK and MSG_TRUNC are examined
 * @addr_len: not used by this function
 *
 * Looks at the packet at the head of the receive queue. Data and DataAck
 * packets are copied to @msg (at most @len bytes; MSG_TRUNC is set in
 * msg->msg_flags if the packet is longer). Close, CloseReq and Reset
 * packets end the read with a return value of 0. Any other packet type is
 * discarded and the socket status is checked before waiting for more data.
 *
 * Returns the number of bytes copied (or the full packet length if
 * MSG_TRUNC was passed in @flags), 0 at end of connection, or a negative
 * errno such as -ENOTCONN, -EAGAIN or -EFAULT.
 */
int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
                 int *addr_len)
{
        const struct dccp_hdr *dh;
        long timeo;

        lock_sock(sk);

        if (sk->sk_state == DCCP_LISTEN) {
                len = -ENOTCONN;
                goto out;
        }

        timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

        /* each iteration inspects the packet currently at the head of the queue */
        do {
                struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

                if (skb == NULL)
                        goto verify_sock_status;

                dh = dccp_hdr(skb);

                switch (dh->dccph_type) {
                case DCCP_PKT_DATA:
                case DCCP_PKT_DATAACK:
                        goto found_ok_skb;

                case DCCP_PKT_CLOSE:
                case DCCP_PKT_CLOSEREQ:
                        /* a peek must not advance the closing handshake */
                        if (!(flags & MSG_PEEK))
                                dccp_finish_passive_close(sk);
                        fallthrough;
                case DCCP_PKT_RESET:
                        dccp_pr_debug("found fin (%s) ok!\n",
                                      dccp_packet_name(dh->dccph_type));
                        len = 0;
                        goto found_fin_ok;
                default:
                        /* no payload for the user: drop it and re-check the socket */
                        dccp_pr_debug("packet_type=%s\n",
                                      dccp_packet_name(dh->dccph_type));
                        sk_eat_skb(sk, skb);
                }
verify_sock_status:
                if (sock_flag(sk, SOCK_DONE)) {
                        len = 0;
                        break;
                }

                if (sk->sk_err) {
                        len = sock_error(sk);
                        break;
                }

                if (sk->sk_shutdown & RCV_SHUTDOWN) {
                        len = 0;
                        break;
                }

                if (sk->sk_state == DCCP_CLOSED) {
                        if (!sock_flag(sk, SOCK_DONE)) {
                                /* This occurs when user tries to read
                                 * from never connected socket.
                                 */
                                len = -ENOTCONN;
                                break;
                        }
                        len = 0;
                        break;
                }

                /* non-blocking read, or the receive timeout has been used up */
                if (!timeo) {
                        len = -EAGAIN;
                        break;
                }

                if (signal_pending(current)) {
                        len = sock_intr_errno(timeo);
                        break;
                }

                sk_wait_data(sk, &timeo, NULL);
                continue;
        found_ok_skb:
                /* clamp to the packet size, or flag truncation if the buffer is smaller */
                if (len > skb->len)
                        len = skb->len;
                else if (len < skb->len)
                        msg->msg_flags |= MSG_TRUNC;

                if (skb_copy_datagram_msg(skb, 0, msg, len)) {
                        /* Exception. Bailout! */
                        len = -EFAULT;
                        break;
                }
                /* with MSG_TRUNC the caller is told the full packet length */
                if (flags & MSG_TRUNC)
                        len = skb->len;
        found_fin_ok:
                /* the packet is consumed unless the caller only peeked */
                if (!(flags & MSG_PEEK))
                        sk_eat_skb(sk, skb);
                break;
        } while (1);
out:
        release_sock(sk);
        return len;
}

EXPORT_SYMBOL_GPL(dccp_recvmsg);

/*
 * inet_dccp_listen  -  listen() implementation for DCCP sockets
 * @sock:    socket to put into the listening state
 * @backlog: new maximum accept backlog
 *
 * Returns 0 on success, -EINVAL if the socket is not an unconnected
 * SOCK_DCCP socket in the CLOSED or LISTEN state, -EPROTO if the feature
 * negotiation settings cannot be finalised, or the error from
 * inet_csk_listen_start().
 */
int inet_dccp_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;
        unsigned char cur_state;
        struct dccp_sock *dp;
        int err = -EINVAL;

        lock_sock(sk);

        if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
                goto unlock;

        cur_state = sk->sk_state;
        if (!((1 << cur_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
                goto unlock;

        WRITE_ONCE(sk->sk_max_ack_backlog, backlog);

        /* Really, if the socket is already in listen state
         * we can only allow the backlog to be adjusted.
         */
        if (cur_state == DCCP_LISTEN) {
                err = 0;
                goto unlock;
        }

        dp = dccp_sk(sk);
        dp->dccps_role = DCCP_ROLE_LISTEN;

        /* do not start to listen if feature negotiation setup fails */
        if (dccp_feat_finalise_settings(dp)) {
                err = -EPROTO;
                goto unlock;
        }

        err = inet_csk_listen_start(sk);

unlock:
        release_sock(sk);
        return err;
}

EXPORT_SYMBOL_GPL(inet_dccp_listen);

static void dccp_terminate_connection(struct sock *sk)
{
        u8 next_state = DCCP_CLOSED;

        switch (sk->sk_state) {
        case DCCP_PASSIVE_CLOSE:
        case DCCP_PASSIVE_CLOSEREQ:
                dccp_finish_passive_close(sk);
                break;
        case DCCP_PARTOPEN:
                dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk);
                inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
                fallthrough;
        case DCCP_OPEN:
                dccp_send_close(sk, 1);

                if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER &&
                    !dccp_sk(sk)->dccps_server_timewait)
                        next_state = DCCP_ACTIVE_CLOSEREQ;
                else
                        next_state = DCCP_CLOSING;
                fallthrough;
        default:
                dccp_set_state(sk, next_state);
        }
}

/*
 * dccp_close  -  close() implementation for DCCP sockets
 * @sk:      socket being closed
 * @timeout: time budget passed to dccp_flush_write_queue() and
 *           sk_stream_wait_close()
 *
 * Marks the socket as fully shut down, discards unread receive data and
 * then either aborts the connection (unread data or zero linger time) or
 * starts a normal termination. Finally the socket is orphaned and, if it
 * has reached DCCP_CLOSED, destroyed; otherwise it stays around until the
 * protocol finishes closing it.
 */
void dccp_close(struct sock *sk, long timeout)
{
        struct dccp_sock *dp = dccp_sk(sk);
        struct sk_buff *skb;
        u32 data_was_unread = 0;
        int state;

        lock_sock(sk);

        sk->sk_shutdown = SHUTDOWN_MASK;

        if (sk->sk_state == DCCP_LISTEN) {
                dccp_set_state(sk, DCCP_CLOSED);

                /* Special case. */
                inet_csk_listen_stop(sk);

                goto adjudge_to_death;
        }

        sk_stop_timer(sk, &dp->dccps_xmit_timer);

        /*
         * We need to flush the recv. buffs.  We do this only on the
         * descriptor close, not protocol-sourced closes, because the
         * reader process may not have drained the data yet!
         */
        while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                data_was_unread += skb->len;
                __kfree_skb(skb);
        }

        /* If socket has been already reset kill it. */
        if (sk->sk_state == DCCP_CLOSED)
                goto adjudge_to_death;

        if (data_was_unread) {
                /* Unread data was tossed, send an appropriate Reset Code */
                DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
                dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
                dccp_set_state(sk, DCCP_CLOSED);
        } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
                /* Check zero linger _after_ checking for unread data. */
                sk->sk_prot->disconnect(sk, 0);
        } else if (sk->sk_state != DCCP_CLOSED) {
                /*
                 * Normal connection termination. May need to wait if there are
                 * still packets in the TX queue that are delayed by the CCID.
                 */
                dccp_flush_write_queue(sk, &timeout);
                dccp_terminate_connection(sk);
        }

        /*
         * Flush write queue. This may be necessary in several cases:
         * - we have been closed by the peer but still have application data;
         * - abortive termination (unread data or zero linger time),
         * - normal termination but queue could not be flushed within time limit
         */
        __skb_queue_purge(&sk->sk_write_queue);

        sk_stream_wait_close(sk, timeout);

adjudge_to_death:
        /* remember the state seen under the socket lock for the check below */
        state = sk->sk_state;
        /* extra reference, dropped by the sock_put() at the end of this function */
        sock_hold(sk);
        sock_orphan(sk);

        /*
         * It is the last release_sock in its life. It will remove backlog.
         */
        release_sock(sk);
        /*
         * Now socket is owned by kernel and we acquire BH lock
         * to finish close. No need to check for user refs.
         */
        local_bh_disable();
        bh_lock_sock(sk);
        WARN_ON(sock_owned_by_user(sk));

        this_cpu_inc(dccp_orphan_count);

        /* Have we already been destroyed by a softirq or backlog? */
        if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
                goto out;

        if (sk->sk_state == DCCP_CLOSED)
                inet_csk_destroy_sock(sk);

        /* Otherwise, socket is reprieved until protocol close. */

out:
        bh_unlock_sock(sk);
        local_bh_enable();
        sock_put(sk);
}

EXPORT_SYMBOL_GPL(dccp_close);

/*
 * dccp_shutdown - DCCP shutdown entry point.
 * @sk:  socket the request applies to (not touched by this function)
 * @how: shutdown mode requested by the caller, printed in hex
 *
 * Only emits a debug message recording @how; no socket state is changed here.
 */
void dccp_shutdown(struct sock *sk, int how)
{
        dccp_pr_debug("called shutdown(%x)\n", how);
}

EXPORT_SYMBOL_GPL(dccp_shutdown);

/*
 * Allocate the per-CPU DCCP MIB counters and publish them through the
 * dccp_statistics pointer.
 *
 * Returns 0 on success or -ENOMEM if the per-CPU allocation failed.
 */
static inline int __init dccp_mib_init(void)
{
        dccp_statistics = alloc_percpu(struct dccp_mib);

        return dccp_statistics ? 0 : -ENOMEM;
}

/* Release the per-CPU MIB counters allocated by dccp_mib_init(). */
static inline void dccp_mib_exit(void)
{
        free_percpu(dccp_statistics);
}

/*
 * Module parameter (mode 0444).  When non-zero, dccp_init() derives the
 * sizing goal for the established hash table from this value instead of
 * from the amount of RAM.
 */
static int thash_entries;
module_param(thash_entries, int, 0444);
MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");

#ifdef CONFIG_IP_DCCP_DEBUG
/*
 * Debug-message switch, only built with CONFIG_IP_DCCP_DEBUG.  Exposed as a
 * module parameter with mode 0644 and exported for other GPL modules.
 */
bool dccp_debug;
module_param(dccp_debug, bool, 0644);
MODULE_PARM_DESC(dccp_debug, "Enable debug messages");

EXPORT_SYMBOL_GPL(dccp_debug);
#endif

/*
 * dccp_init - module initialisation.
 *
 * Sets up, in order: the hashinfo2 state, the two bind-bucket slab caches,
 * the established hash table (ehash) with its locks, the two bind hash
 * tables (bhash, bhash2), the MIB counters, the ack-vector code, the
 * sysctl interface and the built-in CCIDs.
 *
 * Returns 0 on success.  On failure everything set up so far is undone via
 * the label ladder at the end of the function and a negative errno is
 * returned: the error from the failing helper, or -ENOBUFS for the slab,
 * hash-table and ehash-lock allocations.
 */
static int __init dccp_init(void)
{
        unsigned long goal;
        unsigned long nr_pages = totalram_pages();
        int ehash_order, bhash_order, i;
        int rc;

        /* Compile-time check: struct dccp_skb_cb must fit into sk_buff's cb. */
        BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
                     sizeof_field(struct sk_buff, cb));
        rc = inet_hashinfo2_init_mod(&dccp_hashinfo);
        if (rc)
                goto out_fail;
        /*
         * Error code for all allocation failures below, until rc is
         * reassigned by dccp_mib_init().
         */
        rc = -ENOBUFS;
        dccp_hashinfo.bind_bucket_cachep =
                kmem_cache_create("dccp_bind_bucket",
                                  sizeof(struct inet_bind_bucket), 0,
                                  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
        if (!dccp_hashinfo.bind_bucket_cachep)
                goto out_free_hashinfo2;
        dccp_hashinfo.bind2_bucket_cachep =
                kmem_cache_create("dccp_bind2_bucket",
                                  sizeof(struct inet_bind2_bucket), 0,
                                  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
        if (!dccp_hashinfo.bind2_bucket_cachep)
                goto out_free_bind_bucket_cachep;

        /*
         * Size and allocate the main established and bind bucket
         * hash tables.
         *
         * The methodology is similar to that of the buffer cache.
         */
        if (nr_pages >= (128 * 1024))
                goal = nr_pages >> (21 - PAGE_SHIFT);
        else
                goal = nr_pages >> (23 - PAGE_SHIFT);

        /* An explicit thash_entries module parameter overrides the RAM-based goal. */
        if (thash_entries)
                goal = (thash_entries *
                        sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
        /* Smallest page order whose page count (1 << order) reaches the goal. */
        for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
                ;
        /*
         * Try to allocate the ehash; on failure retry with the next smaller
         * order.  The loop gives up once the decremented order is no longer
         * greater than zero.
         */
        do {
                unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE /
                                        sizeof(struct inet_ehash_bucket);

                /*
                 * Round the bucket count down to a power of two so that
                 * hash_size - 1 can serve as the hash mask.
                 */
                while (hash_size & (hash_size - 1))
                        hash_size--;
                dccp_hashinfo.ehash_mask = hash_size - 1;
                dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
                        __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order);
        } while (!dccp_hashinfo.ehash && --ehash_order > 0);

        if (!dccp_hashinfo.ehash) {
                DCCP_CRIT("Failed to allocate DCCP established hash table");
                goto out_free_bind2_bucket_cachep;
        }

        for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
                INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);

        if (inet_ehash_locks_alloc(&dccp_hashinfo))
                        goto out_free_dccp_ehash;

        /* Start sizing the bind hash at the order the ehash ended up with. */
        bhash_order = ehash_order;

        do {
                dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
                                        sizeof(struct inet_bind_hashbucket);
                /*
                 * Skip orders that would give more than 64K bind buckets
                 * while the order can still be reduced; "continue" jumps to
                 * the loop condition, which decrements bhash_order.
                 */
                if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
                    bhash_order > 0)
                        continue;
                dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
                        __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order);
        } while (!dccp_hashinfo.bhash && --bhash_order >= 0);

        if (!dccp_hashinfo.bhash) {
                DCCP_CRIT("Failed to allocate DCCP bind hash table");
                goto out_free_dccp_locks;
        }

        /* bhash2 uses the same page order, and so the same bucket count, as bhash. */
        dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *)
                __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order);

        if (!dccp_hashinfo.bhash2) {
                DCCP_CRIT("Failed to allocate DCCP bind2 hash table");
                goto out_free_dccp_bhash;
        }

        for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
                spin_lock_init(&dccp_hashinfo.bhash[i].lock);
                INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
                spin_lock_init(&dccp_hashinfo.bhash2[i].lock);
                INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain);
        }

        dccp_hashinfo.pernet = false;

        rc = dccp_mib_init();
        if (rc)
                goto out_free_dccp_bhash2;

        rc = dccp_ackvec_init();
        if (rc)
                goto out_free_dccp_mib;

        rc = dccp_sysctl_init();
        if (rc)
                goto out_ackvec_exit;

        rc = ccid_initialize_builtins();
        if (rc)
                goto out_sysctl_exit;

        dccp_timestamping_init();

        return 0;

        /*
         * Error unwinding: each label undoes one setup step and falls
         * through to the labels for the steps that were performed earlier.
         */
out_sysctl_exit:
        dccp_sysctl_exit();
out_ackvec_exit:
        dccp_ackvec_exit();
out_free_dccp_mib:
        dccp_mib_exit();
out_free_dccp_bhash2:
        free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
out_free_dccp_bhash:
        free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
out_free_dccp_locks:
        inet_ehash_locks_free(&dccp_hashinfo);
out_free_dccp_ehash:
        free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
out_free_bind2_bucket_cachep:
        kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep);
out_free_bind_bucket_cachep:
        kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
out_free_hashinfo2:
        inet_hashinfo2_free_mod(&dccp_hashinfo);
out_fail:
        /* Clear the table and cache pointers so no stale values remain. */
        dccp_hashinfo.bhash = NULL;
        dccp_hashinfo.bhash2 = NULL;
        dccp_hashinfo.ehash = NULL;
        dccp_hashinfo.bind_bucket_cachep = NULL;
        dccp_hashinfo.bind2_bucket_cachep = NULL;
        return rc;
}

/*
 * dccp_fini - module exit; releases everything dccp_init() set up.
 *
 * dccp_init() keeps the bind-hash page order only in a local variable, so
 * it is recomputed here from the stored bucket count.  bhash and bhash2
 * were allocated with the same order and are freed with the same order.
 *
 * Both bind-bucket slab caches created by dccp_init() are destroyed;
 * leaving out "dccp_bind2_bucket" would leak that cache on module unload.
 */
static void __exit dccp_fini(void)
{
        int bhash_order = get_order(dccp_hashinfo.bhash_size *
                                    sizeof(struct inet_bind_hashbucket));

        ccid_cleanup_builtins();
        dccp_mib_exit();
        free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
        free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
        free_pages((unsigned long)dccp_hashinfo.ehash,
                   get_order((dccp_hashinfo.ehash_mask + 1) *
                             sizeof(struct inet_ehash_bucket)));
        inet_ehash_locks_free(&dccp_hashinfo);
        kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep);
        kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
        dccp_ackvec_exit();
        dccp_sysctl_exit();
        inet_hashinfo2_free_mod(&dccp_hashinfo);
}

module_init(dccp_init);
module_exit(dccp_fini);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");












































































    1 





    1 









    1 


    1 


























































































































































































































































































    1 








    1 































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>

#define CREATE_TRACE_POINTS
#include <trace/events/notifier.h>

/*
 *        Notifier list for kernel code which wants to be called
 *        at shutdown. This is used to stop any idling DMA operations
 *        and the like.
 */
BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);

/*
 *        Notifier chain core routines.  The exported routines below
 *        are layered on top of these, with appropriate locking added.
 */

/*
 * Insert @n into the priority-sorted chain whose head pointer is @nl.
 *
 * The new block is linked in front of the first entry with a strictly
 * lower priority, i.e. behind any existing entries of equal priority.
 * With @unique_priority set, an existing entry of equal priority makes the
 * registration fail instead.
 *
 * Returns 0 on success, -EEXIST (with a WARN) if @n is already on the
 * chain, or -EBUSY if @unique_priority is set and the priority is taken.
 */
static int notifier_chain_register(struct notifier_block **nl,
                                   struct notifier_block *n,
                                   bool unique_priority)
{
        struct notifier_block *cur;

        /* On exit from the loop, nl addresses the link that must point to n. */
        for (cur = *nl; cur != NULL; nl = &cur->next, cur = *nl) {
                if (unlikely(cur == n)) {
                        WARN(1, "notifier callback %ps already registered",
                             n->notifier_call);
                        return -EEXIST;
                }
                if (n->priority > cur->priority)
                        break;
                if (unique_priority && n->priority == cur->priority)
                        return -EBUSY;
        }

        n->next = *nl;
        rcu_assign_pointer(*nl, n);
        trace_notifier_register((void *)n->notifier_call);
        return 0;
}

/*
 * Unlink @n from the chain whose head pointer is @nl.
 *
 * Returns 0 if @n was found and removed, -ENOENT if it is not on the chain.
 */
static int notifier_chain_unregister(struct notifier_block **nl,
                struct notifier_block *n)
{
        for (; *nl != NULL; nl = &(*nl)->next) {
                if (*nl != n)
                        continue;

                /* Make the link that pointed at n skip over it. */
                rcu_assign_pointer(*nl, n->next);
                trace_notifier_unregister((void *)n->notifier_call);
                return 0;
        }
        return -ENOENT;
}

/**
 * notifier_call_chain - Informs the registered notifiers about an event.
 *        @nl:                Pointer to head of the notifier chain
 *        @val:                Value passed unmodified to notifier function
 *        @v:                Pointer passed unmodified to notifier function
 *        @nr_to_call:        Maximum number of notifier functions to call; the
 *                        walk stops when this count reaches zero.  Callers
 *                        that do not want a limit pass -1.
 *        @nr_calls:        If not NULL, incremented once per notifier function
 *                        that was called.
 *        Return:                the value returned by the last notifier function
 *                        called, or NOTIFY_DONE if none was called.  The walk
 *                        ends early when a return value has a bit of
 *                        NOTIFY_STOP_MASK set.
 */
static int notifier_call_chain(struct notifier_block **nl,
                               unsigned long val, void *v,
                               int nr_to_call, int *nr_calls)
{
        struct notifier_block *cur, *following;
        int rc = NOTIFY_DONE;

        for (cur = rcu_dereference_raw(*nl); cur && nr_to_call; cur = following) {
                /*
                 * The successor is read before the callback is invoked --
                 * presumably so the walk does not depend on cur->next after
                 * the callback has run.
                 */
                following = rcu_dereference_raw(cur->next);

#ifdef CONFIG_DEBUG_NOTIFIERS
                /* Skipped entries do not count against nr_to_call. */
                if (unlikely(!func_ptr_is_kernel_text(cur->notifier_call))) {
                        WARN(1, "Invalid notifier called!");
                        continue;
                }
#endif
                trace_notifier_run((void *)cur->notifier_call);
                rc = cur->notifier_call(cur, val, v);

                if (nr_calls)
                        (*nr_calls)++;

                if (rc & NOTIFY_STOP_MASK)
                        break;
                nr_to_call--;
        }
        return rc;
}
NOKPROBE_SYMBOL(notifier_call_chain);

/**
 * notifier_call_chain_robust - Inform the registered notifiers about an event
 *                              and rollback on error.
 * @nl:                Pointer to head of the notifier chain
 * @val_up:        Value passed unmodified to the notifier function
 * @val_down:        Value passed unmodified to the notifier function when
 *              recovering from an error on @val_up
 * @v:                Pointer passed unmodified to the notifier function
 *
 * If the @val_up pass is stopped by a notifier, the notifiers that ran
 * before the stopping one are called again with @val_down.
 *
 * NOTE:        It is important the @nl chain doesn't change between the two
 *                invocations of notifier_call_chain() such that we visit the
 *                exact same notifier callbacks; this rules out any RCU usage.
 *
 * Return:        the return value of the @val_up call.
 */
static int notifier_call_chain_robust(struct notifier_block **nl,
                                     unsigned long val_up, unsigned long val_down,
                                     void *v)
{
        int called = 0;
        int rc = notifier_call_chain(nl, val_up, v, -1, &called);

        if (!(rc & NOTIFY_STOP_MASK))
                return rc;

        /* "called" includes the notifier that stopped the walk; skip that one. */
        notifier_call_chain(nl, val_down, v, called - 1, NULL);
        return rc;
}

/*
 *        Atomic notifier chain routines.  Registration and unregistration
 *        use a spinlock, and call_chain is synchronized by RCU (no locks).
 */

/**
 *        atomic_notifier_chain_register - Add notifier to an atomic notifier chain
 *        @nh: Pointer to head of the atomic notifier chain
 *        @n: New entry in notifier chain
 *
 *        Links @n into the chain while holding the head's spinlock with
 *        interrupts saved and disabled.  Entries sharing a priority are
 *        allowed.
 *
 *        Returns 0 on success, %-EEXIST on error.
 */
int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
                struct notifier_block *n)
{
        unsigned long irqflags;
        int rc;

        spin_lock_irqsave(&nh->lock, irqflags);
        rc = notifier_chain_register(&nh->head, n, false);
        spin_unlock_irqrestore(&nh->lock, irqflags);

        return rc;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);

/**
 *        atomic_notifier_chain_register_unique_prio - Add notifier to an atomic notifier chain
 *        @nh: Pointer to head of the atomic notifier chain
 *        @n: New entry in notifier chain
 *
 *        Same as atomic_notifier_chain_register(), except that the
 *        registration is refused when another notifier with the same
 *        priority is already on the chain.
 *
 *        Returns 0 on success, %-EEXIST or %-EBUSY on error.
 */
int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh,
                                               struct notifier_block *n)
{
        unsigned long irqflags;
        int rc;

        spin_lock_irqsave(&nh->lock, irqflags);
        rc = notifier_chain_register(&nh->head, n, true);
        spin_unlock_irqrestore(&nh->lock, irqflags);

        return rc;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio);

/**
 *        atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
 *        @nh: Pointer to head of the atomic notifier chain
 *        @n: Entry to remove from notifier chain
 *
 *        Unlinks @n under the head's spinlock, then calls synchronize_rcu()
 *        before returning.  The synchronize_rcu() call is made whether or
 *        not @n was found.
 *
 *        Returns zero on success or %-ENOENT on failure.
 */
int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
                struct notifier_block *n)
{
        unsigned long irqflags;
        int rc;

        spin_lock_irqsave(&nh->lock, irqflags);
        rc = notifier_chain_unregister(&nh->head, n);
        spin_unlock_irqrestore(&nh->lock, irqflags);

        synchronize_rcu();

        return rc;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);

/**
 *        atomic_notifier_call_chain - Call functions in an atomic notifier chain
 *        @nh: Pointer to head of the atomic notifier chain
 *        @val: Value passed unmodified to notifier function
 *        @v: Pointer passed unmodified to notifier function
 *
 *        Walks the chain inside an RCU read-side critical section and calls
 *        each notifier function in turn.  The functions run in an atomic
 *        context, so they must not block.
 *
 *        As soon as a notifier returns a value that has a bit of
 *        %NOTIFY_STOP_MASK set, the walk ends and that value is returned.
 *        Otherwise the return value is that of the last notifier function
 *        called.
 */
int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
                               unsigned long val, void *v)
{
        int rc;

        rcu_read_lock();
        rc = notifier_call_chain(&nh->head, val, v, -1, NULL);
        rcu_read_unlock();
        return rc;
}
EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
NOKPROBE_SYMBOL(atomic_notifier_call_chain);

/**
 *        atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty
 *        @nh: Pointer to head of the atomic notifier chain
 *
 *        Checks whether notifier chain is empty.
 *
 *        Returns true is notifier chain is empty, false otherwise.
 */
bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh)
{
        return !rcu_access_pointer(nh->head);
}

/*
 *        Blocking notifier chain routines.  All access to the chain is
 *        synchronized by an rwsem.
 */

/*
 * Common helper for the blocking register variants: add @n to @nh,
 * optionally insisting on a unique priority, with the head's rwsem held
 * for writing whenever it may be taken.
 */
static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                                              struct notifier_block *n,
                                              bool unique_priority)
{
        int rc;

        if (unlikely(system_state == SYSTEM_BOOTING)) {
                /*
                 * Boot-up: task switching is not yet working and
                 * interrupts must remain disabled, so down_write() must
                 * not be called; modify the chain without the rwsem.
                 */
                rc = notifier_chain_register(&nh->head, n, unique_priority);
        } else {
                down_write(&nh->rwsem);
                rc = notifier_chain_register(&nh->head, n, unique_priority);
                up_write(&nh->rwsem);
        }
        return rc;
}

/**
 *        blocking_notifier_chain_register - Add notifier to a blocking notifier chain
 *        @nh: Pointer to head of the blocking notifier chain
 *        @n: New entry in notifier chain
 *
 *        Adds a notifier to a blocking notifier chain.  Notifiers sharing
 *        a priority with an existing entry are accepted.
 *        Must be called in process context.
 *
 *        Returns 0 on success, %-EEXIST on error.
 */
int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                struct notifier_block *n)
{
        return __blocking_notifier_chain_register(nh, n, false);
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);

/**
 *        blocking_notifier_chain_register_unique_prio - Add notifier to a blocking notifier chain
 *        @nh: Pointer to head of the blocking notifier chain
 *        @n: New entry in notifier chain
 *
 *        Adds a notifier to a blocking notifier chain if there is no other
 *        notifier registered using the same priority.
 *
 *        Returns 0 on success, %-EEXIST or %-EBUSY on error.
 */
int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh,
                                                 struct notifier_block *n)
{
        return __blocking_notifier_chain_register(nh, n, true);
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio);

/**
 *        blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
 *        @nh: Pointer to head of the blocking notifier chain
 *        @n: Entry to remove from notifier chain
 *
 *        Unlinks @n from the chain with the head's rwsem held for writing,
 *        except during boot-up when the rwsem is not taken.
 *        Must be called from process context.
 *
 *        Returns zero on success or %-ENOENT on failure.
 */
int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
                struct notifier_block *n)
{
        int rc;

        if (unlikely(system_state == SYSTEM_BOOTING)) {
                /*
                 * Boot-up: task switching is not yet working and
                 * interrupts must remain disabled, so down_write() must
                 * not be called; modify the chain without the rwsem.
                 */
                rc = notifier_chain_unregister(&nh->head, n);
        } else {
                down_write(&nh->rwsem);
                rc = notifier_chain_unregister(&nh->head, n);
                up_write(&nh->rwsem);
        }
        return rc;
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);

/*
 * Call the notifiers on a blocking chain with @val_up and, if one of them
 * stops the walk, call the ones that already ran again with @val_down.
 * Both passes happen under one read-hold of the head's rwsem.
 *
 * Returns NOTIFY_DONE if the chain looked empty, otherwise the result of
 * the @val_up pass.
 */
int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
                unsigned long val_up, unsigned long val_down, void *v)
{
        int rc;

        /*
         * Unlocked peek at the head.  A racy result is harmless: the chain
         * is examined again once the rwsem is held.
         */
        if (!rcu_access_pointer(nh->head))
                return NOTIFY_DONE;

        down_read(&nh->rwsem);
        rc = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
        up_read(&nh->rwsem);
        return rc;
}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust);

/**
 *        blocking_notifier_call_chain - Call functions in a blocking notifier chain
 *        @nh: Pointer to head of the blocking notifier chain
 *        @val: Value passed unmodified to notifier function
 *        @v: Pointer passed unmodified to notifier function
 *
 *        Calls each function in a notifier chain in turn, with the head's
 *        rwsem held for reading.  The functions run in a process context,
 *        so they are allowed to block.
 *
 *        As soon as a notifier returns a value that has a bit of
 *        %NOTIFY_STOP_MASK set, the walk ends and that value is returned.
 *        Otherwise the return value is that of the last notifier function
 *        called, or %NOTIFY_DONE if the chain looked empty.
 */
int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
                unsigned long val, void *v)
{
        int rc;

        /*
         * Unlocked peek at the head.  A racy result is harmless: the chain
         * is examined again once the rwsem is held.
         */
        if (!rcu_access_pointer(nh->head))
                return NOTIFY_DONE;

        down_read(&nh->rwsem);
        rc = notifier_call_chain(&nh->head, val, v, -1, NULL);
        up_read(&nh->rwsem);
        return rc;
}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);

/*
 *        Raw notifier chain routines.  There is no protection;
 *        the caller must provide it.  Use at your own risk!
 */

/**
 *        raw_notifier_chain_register - Add notifier to a raw notifier chain
 *        @nh: Pointer to head of the raw notifier chain
 *        @n: New entry in notifier chain
 *
 *        Adds a notifier to a raw notifier chain.  Notifiers sharing a
 *        priority with an existing entry are accepted.
 *        No lock is taken here; all locking must be provided by the caller.
 *
 *        Returns 0 on success, %-EEXIST on error.
 */
int raw_notifier_chain_register(struct raw_notifier_head *nh,
                struct notifier_block *n)
{
        return notifier_chain_register(&nh->head, n, false);
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_register);

/**
 *        raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
 *        @nh: Pointer to head of the raw notifier chain
 *        @n: Entry to remove from notifier chain
 *
 *        Removes a notifier from a raw notifier chain.
 *        No lock is taken here; all locking must be provided by the caller.
 *
 *        Returns zero on success or %-ENOENT on failure.
 */
int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
                struct notifier_block *n)
{
        return notifier_chain_unregister(&nh->head, n);
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);

/*
 * Call the notifiers on a raw chain with @val_up and, if one of them stops
 * the walk, call the ones that already ran again with @val_down.
 * No lock is taken here; all locking must be provided by the caller.
 *
 * Returns the result of the @val_up pass.
 */
int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
                unsigned long val_up, unsigned long val_down, void *v)
{
        return notifier_call_chain_robust(&nh->head, val_up, val_down, v);
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust);

/**
 *        raw_notifier_call_chain - Call functions in a raw notifier chain
 *        @nh: Pointer to head of the raw notifier chain
 *        @val: Value passed unmodified to notifier function
 *        @v: Pointer passed unmodified to notifier function
 *
 *        Calls each function in a notifier chain in turn.  The functions
 *        run in an undefined context.
 *        All locking must be provided by the caller.
 *
 *        If the return value of the notifier can be and'ed
 *        with %NOTIFY_STOP_MASK then raw_notifier_call_chain()
 *        will return immediately, with the return value of
 *        the notifier function which halted execution.
 *        Otherwise the return value is the return value
 *        of the last notifier function called, or %NOTIFY_DONE
 *        if no notifier function was called.
 */
int raw_notifier_call_chain(struct raw_notifier_head *nh,
                unsigned long val, void *v)
{
        return notifier_call_chain(&nh->head, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);

/*
 *        SRCU notifier chain routines.    Registration and unregistration
 *        use a mutex, and call_chain is synchronized by SRCU (no locks).
 */

/**
 *        srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
 *        @nh: Pointer to head of the SRCU notifier chain
 *        @n: New entry in notifier chain
 *
 *        Links @n into the chain with the head's mutex held, except during
 *        boot-up when the mutex is not taken.  Entries sharing a priority
 *        are allowed.
 *        Must be called in process context.
 *
 *        Returns 0 on success, %-EEXIST on error.
 */
int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
                struct notifier_block *n)
{
        int rc;

        if (unlikely(system_state == SYSTEM_BOOTING)) {
                /*
                 * Boot-up: task switching is not yet working and
                 * interrupts must remain disabled, so mutex_lock() must
                 * not be called; modify the chain without the mutex.
                 */
                rc = notifier_chain_register(&nh->head, n, false);
        } else {
                mutex_lock(&nh->mutex);
                rc = notifier_chain_register(&nh->head, n, false);
                mutex_unlock(&nh->mutex);
        }
        return rc;
}
EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);

/**
 *        srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
 *        @nh: Pointer to head of the SRCU notifier chain
 *        @n: Entry to remove from notifier chain
 *
 *        Unlinks @n with the head's mutex held and then calls
 *        synchronize_srcu() on the head's SRCU domain.  During boot-up
 *        neither the mutex nor synchronize_srcu() is used.
 *        Must be called from process context.
 *
 *        Returns zero on success or %-ENOENT on failure.
 */
int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
                struct notifier_block *n)
{
        int rc;

        if (unlikely(system_state == SYSTEM_BOOTING)) {
                /*
                 * Boot-up: task switching is not yet working and
                 * interrupts must remain disabled, so mutex_lock() must
                 * not be called; modify the chain without the mutex.
                 */
                rc = notifier_chain_unregister(&nh->head, n);
        } else {
                mutex_lock(&nh->mutex);
                rc = notifier_chain_unregister(&nh->head, n);
                mutex_unlock(&nh->mutex);
                synchronize_srcu(&nh->srcu);
        }
        return rc;
}
EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);

/**
 *        srcu_notifier_call_chain - Call functions in an SRCU notifier chain
 *        @nh: Pointer to head of the SRCU notifier chain
 *        @val: Value passed unmodified to notifier function
 *        @v: Pointer passed unmodified to notifier function
 *
 *        Walks the chain inside an SRCU read-side critical section and
 *        calls each notifier function in turn.  The functions run in a
 *        process context, so they are allowed to block.
 *
 *        As soon as a notifier returns a value that has a bit of
 *        %NOTIFY_STOP_MASK set, the walk ends and that value is returned.
 *        Otherwise the return value is that of the last notifier function
 *        called.
 */
int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
                unsigned long val, void *v)
{
        int srcu_idx;
        int rc;

        srcu_idx = srcu_read_lock(&nh->srcu);
        rc = notifier_call_chain(&nh->head, val, v, -1, NULL);
        srcu_read_unlock(&nh->srcu, srcu_idx);

        return rc;
}
EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);

/**
 *        srcu_init_notifier_head - Initialize an SRCU notifier head
 *        @nh: Pointer to head of the srcu notifier chain
 *
 *        Unlike other sorts of notifier heads, SRCU notifier heads require
 *        dynamic initialization.  Be sure to call this routine before
 *        calling any of the other SRCU notifier routines for this head.
 *
 *        Initializes the head's mutex and SRCU structure and sets the
 *        chain to empty.  A failure to initialize the SRCU structure is
 *        treated as fatal (BUG()).
 *
 *        If an SRCU notifier head is deallocated, it must first be cleaned
 *        up by calling srcu_cleanup_notifier_head().  Otherwise the head's
 *        per-cpu data (used by the SRCU mechanism) will leak.
 */
void srcu_init_notifier_head(struct srcu_notifier_head *nh)
{
        mutex_init(&nh->mutex);
        /* There is no error return path, so an init failure is fatal. */
        if (init_srcu_struct(&nh->srcu) < 0)
                BUG();
        nh->head = NULL;
}
EXPORT_SYMBOL_GPL(srcu_init_notifier_head);

/* Atomic notifier chain walked by notify_die(). */
static ATOMIC_NOTIFIER_HEAD(die_chain);

/*
 * notify_die - run the die notifier chain.
 * @val:  event code handed to each notifier as its "val" argument
 * @str:  stored in die_args.str
 * @regs: stored in die_args.regs
 * @err:  stored in die_args.err
 * @trap: stored in die_args.trapnr
 * @sig:  stored in die_args.signr
 *
 * Packs the arguments into a struct die_args on the stack and passes its
 * address to every notifier on die_chain.
 *
 * Returns the result of atomic_notifier_call_chain().
 */
int notrace notify_die(enum die_val val, const char *str,
               struct pt_regs *regs, long err, int trap, int sig)
{
        struct die_args args = {
                .regs        = regs,
                .str        = str,
                .err        = err,
                .trapnr        = trap,
                .signr        = sig,

        };
        /* The chain walk below relies on RCU, so RCU must be watching here. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                           "notify_die called but RCU thinks we're quiescent");
        return atomic_notifier_call_chain(&die_chain, val, &args);
}
NOKPROBE_SYMBOL(notify_die);

/*
 * Add @nb to the die notifier chain.
 * Returns the result of atomic_notifier_chain_register().
 */
int register_die_notifier(struct notifier_block *nb)
{
        return atomic_notifier_chain_register(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(register_die_notifier);

/*
 * Remove @nb from die_chain.  Returns whatever
 * atomic_notifier_chain_unregister() returns.
 */
int unregister_die_notifier(struct notifier_block *nb)
{
        return atomic_notifier_chain_unregister(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_die_notifier);


























































































































































    1 










    1 




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Cryptographic API.
 *
 * Null algorithms, aka Much Ado About Nothing.
 *
 * These are needed for IPsec, and may be useful in general for
 * testing & debugging.
 *
 * The null cipher is compliant with RFC2410.
 *
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 */

#include <crypto/null.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/skcipher.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/string.h>

/*
 * State behind crypto_get_default_null_skcipher() and
 * crypto_put_default_null_skcipher(): the cached "ecb(cipher_null)"
 * transform, the number of outstanding users, and the mutex both
 * functions hold while touching them.
 */
static DEFINE_MUTEX(crypto_default_null_skcipher_lock);
static struct crypto_sync_skcipher *crypto_default_null_skcipher;
static int crypto_default_null_skcipher_refcnt;

/*
 * "Compress" by copying @slen bytes from @src to @dst unchanged.
 * On entry *@dlen is the capacity of @dst; on success it is set to @slen.
 * Returns 0, or -EINVAL when the input does not fit in the output buffer.
 */
static int null_compress(struct crypto_tfm *tfm, const u8 *src,
                         unsigned int slen, u8 *dst, unsigned int *dlen)
{
        unsigned int capacity = *dlen;

        if (capacity < slen)
                return -EINVAL;

        memcpy(dst, src, slen);
        *dlen = slen;

        return 0;
}

/* shash .init callback for digest_null: nothing to set up, always returns 0. */
static int null_init(struct shash_desc *desc)
{
        return 0;
}

/* shash .update callback for digest_null: ignores @data and returns 0. */
static int null_update(struct shash_desc *desc, const u8 *data,
                       unsigned int len)
{
        return 0;
}

/* shash .final callback for digest_null: leaves @out untouched and returns 0. */
static int null_final(struct shash_desc *desc, u8 *out)
{
        return 0;
}

/*
 * One-shot callback used for both .digest and .finup of digest_null:
 * ignores @data, leaves @out untouched and returns 0.
 */
static int null_digest(struct shash_desc *desc, const u8 *data,
                       unsigned int len, u8 *out)
{
        return 0;
}

/* shash .setkey callback for digest_null: the key is ignored, always returns 0. */
static int null_hash_setkey(struct crypto_shash *tfm, const u8 *key,
                            unsigned int keylen)
{
        return 0;
}

/* skcipher .setkey callback for skcipher_null: the key is ignored, always returns 0. */
static int null_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
                                unsigned int keylen)
{
        return 0;
}

/* .cia_setkey callback for "cipher_null": the key is ignored, always returns 0. */
static int null_setkey(struct crypto_tfm *tfm, const u8 *key,
                       unsigned int keylen)
{
        return 0;
}

/*
 * Single-block encrypt/decrypt callback for "cipher_null": copies
 * NULL_BLOCK_SIZE bytes from @src to @dst unchanged.
 */
static void null_crypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
        memcpy(dst, src, NULL_BLOCK_SIZE);
}

/*
 * Encrypt/decrypt callback for "ecb(cipher_null)": walk the request and
 * copy each chunk from source to destination, skipping the copy when both
 * map to the same address.  Returns the status of the last walk call.
 */
static int null_skcipher_crypt(struct skcipher_request *req)
{
        struct skcipher_walk walk;
        int err = skcipher_walk_virt(&walk, req, false);

        /* Each step reports the whole chunk as consumed (0 bytes left over). */
        for (; walk.nbytes; err = skcipher_walk_done(&walk, 0)) {
                if (walk.src.virt.addr == walk.dst.virt.addr)
                        continue;

                memcpy(walk.dst.virt.addr, walk.src.virt.addr,
                       walk.nbytes);
        }

        return err;
}

/*
 * "digest_null" shash algorithm.  Every callback is one of the null_*
 * stubs in this file that simply return 0; .finup and .digest share
 * null_digest().
 */
static struct shash_alg digest_null = {
        .digestsize                =        NULL_DIGEST_SIZE,
        .setkey                   =        null_hash_setkey,
        .init                   =        null_init,
        .update                 =        null_update,
        .finup                         =        null_digest,
        .digest                 =        null_digest,
        .final                  =        null_final,
        .base                        =        {
                .cra_name                =        "digest_null",
                .cra_driver_name        =        "digest_null-generic",
                .cra_blocksize                =        NULL_BLOCK_SIZE,
                .cra_module                =        THIS_MODULE,
        }
};

/*
 * "ecb(cipher_null)" skcipher algorithm.  Encryption and decryption use
 * the same copy routine, null_skcipher_crypt(), and setkey is a no-op.
 */
static struct skcipher_alg skcipher_null = {
        .base.cra_name                =        "ecb(cipher_null)",
        .base.cra_driver_name        =        "ecb-cipher_null",
        .base.cra_priority        =        100,
        .base.cra_blocksize        =        NULL_BLOCK_SIZE,
        .base.cra_ctxsize        =        0,
        .base.cra_module        =        THIS_MODULE,
        .min_keysize                =        NULL_KEY_SIZE,
        .max_keysize                =        NULL_KEY_SIZE,
        .ivsize                        =        NULL_IV_SIZE,
        .setkey                        =        null_skcipher_setkey,
        .encrypt                =        null_skcipher_crypt,
        .decrypt                =        null_skcipher_crypt,
};

/*
 * Algorithms registered together via crypto_register_algs():
 *   [0] "cipher_null"   - single-block cipher; encrypt and decrypt both
 *                         use null_crypt().
 *   [1] "compress_null" - compress and decompress both use null_compress().
 */
static struct crypto_alg null_algs[] = { {
        .cra_name                =        "cipher_null",
        .cra_driver_name        =        "cipher_null-generic",
        .cra_flags                =        CRYPTO_ALG_TYPE_CIPHER,
        .cra_blocksize                =        NULL_BLOCK_SIZE,
        .cra_ctxsize                =        0,
        .cra_module                =        THIS_MODULE,
        .cra_u                        =        { .cipher = {
        .cia_min_keysize        =        NULL_KEY_SIZE,
        .cia_max_keysize        =        NULL_KEY_SIZE,
        .cia_setkey                =         null_setkey,
        .cia_encrypt                =        null_crypt,
        .cia_decrypt                =        null_crypt } }
}, {
        .cra_name                =        "compress_null",
        .cra_driver_name        =        "compress_null-generic",
        .cra_flags                =        CRYPTO_ALG_TYPE_COMPRESS,
        .cra_blocksize                =        NULL_BLOCK_SIZE,
        .cra_ctxsize                =        0,
        .cra_module                =        THIS_MODULE,
        .cra_u                        =        { .compress = {
        .coa_compress                =        null_compress,
        .coa_decompress                =        null_compress } }
} };

/* Module aliases for three of the algorithm names registered by this file. */
MODULE_ALIAS_CRYPTO("compress_null");
MODULE_ALIAS_CRYPTO("digest_null");
MODULE_ALIAS_CRYPTO("cipher_null");

struct crypto_sync_skcipher *crypto_get_default_null_skcipher(void)
{
        struct crypto_sync_skcipher *tfm;

        mutex_lock(&crypto_default_null_skcipher_lock);
        tfm = crypto_default_null_skcipher;

        if (!tfm) {
                tfm = crypto_alloc_sync_skcipher("ecb(cipher_null)", 0, 0);
                if (IS_ERR(tfm))
                        goto unlock;

                crypto_default_null_skcipher = tfm;
        }

        crypto_default_null_skcipher_refcnt++;

unlock:
        mutex_unlock(&crypto_default_null_skcipher_lock);

        return tfm;
}
EXPORT_SYMBOL_GPL(crypto_get_default_null_skcipher);

/*
 * Drop one reference on the shared null transform; when the count reaches
 * zero, free the transform and clear the cached pointer.
 */
void crypto_put_default_null_skcipher(void)
{
        mutex_lock(&crypto_default_null_skcipher_lock);

        crypto_default_null_skcipher_refcnt -= 1;
        if (crypto_default_null_skcipher_refcnt == 0) {
                crypto_free_sync_skcipher(crypto_default_null_skcipher);
                crypto_default_null_skcipher = NULL;
        }

        mutex_unlock(&crypto_default_null_skcipher_lock);
}
EXPORT_SYMBOL_GPL(crypto_put_default_null_skcipher);

/*
 * Register null_algs, then digest_null, then skcipher_null.  If a later
 * step fails, the earlier registrations are undone in reverse order and
 * the failing step's error code is returned; returns 0 on success.
 */
static int __init crypto_null_mod_init(void)
{
        int err;

        err = crypto_register_algs(null_algs, ARRAY_SIZE(null_algs));
        if (err < 0)
                return err;

        err = crypto_register_shash(&digest_null);
        if (err < 0) {
                crypto_unregister_algs(null_algs, ARRAY_SIZE(null_algs));
                return err;
        }

        err = crypto_register_skcipher(&skcipher_null);
        if (err < 0) {
                crypto_unregister_shash(&digest_null);
                crypto_unregister_algs(null_algs, ARRAY_SIZE(null_algs));
                return err;
        }

        return 0;
}

/*
 * Module exit: unregister everything crypto_null_mod_init() registered
 * (null_algs, digest_null, skcipher_null).
 */
static void __exit crypto_null_mod_fini(void)
{
        crypto_unregister_algs(null_algs, ARRAY_SIZE(null_algs));
        crypto_unregister_shash(&digest_null);
        crypto_unregister_skcipher(&skcipher_null);
}

/* Hook the init routine in as a subsys initcall and the fini routine as module exit. */
subsys_initcall(crypto_null_mod_init);
module_exit(crypto_null_mod_fini);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Null Cryptographic Algorithms");










































   19 










    1 























   19 

















    1 






































    3 
   19 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _X86_IRQFLAGS_H_
#define _X86_IRQFLAGS_H_

#include <asm/processor-flags.h>

#ifndef __ASSEMBLY__

#include <asm/nospec-branch.h>

/*
 * Interrupt control:
 */

/* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */
extern inline unsigned long native_save_fl(void);
/*
 * Return the current contents of the CPU flags register: "pushf" pushes
 * the flags onto the stack and "pop" stores them into @flags.
 */
extern __always_inline unsigned long native_save_fl(void)
{
        unsigned long flags;

        /*
         * "=rm" is safe here, because "pop" adjusts the stack before
         * it evaluates its effective address -- this is part of the
         * documented behavior of the "pop" instruction.
         */
        asm volatile("# __raw_save_flags\n\t"
                     "pushf ; pop %0"
                     : "=rm" (flags)
                     : /* no input */
                     : "memory");

        return flags;
}

/* Execute "cli"; the asm carries a "memory" clobber. */
static __always_inline void native_irq_disable(void)
{
        asm volatile("cli": : :"memory");
}

/* Execute "sti"; the asm carries a "memory" clobber. */
static __always_inline void native_irq_enable(void)
{
        asm volatile("sti": : :"memory");
}

/*
 * Call mds_idle_clear_cpu_buffers(), then execute "sti; hlt" as a single
 * asm statement so the enable and the halt are emitted back to back.
 */
static __always_inline void native_safe_halt(void)
{
        mds_idle_clear_cpu_buffers();
        asm volatile("sti; hlt": : :"memory");
}

/*
 * Call mds_idle_clear_cpu_buffers(), then execute "hlt" without changing
 * the interrupt flag.
 */
static __always_inline void native_halt(void)
{
        mds_idle_clear_cpu_buffers();
        asm volatile("hlt": : :"memory");
}

#endif

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#ifndef __ASSEMBLY__
#include <linux/types.h>

/* Non-paravirt (no CONFIG_PARAVIRT_XXL) variant: return native_save_fl(). */
static __always_inline unsigned long arch_local_save_flags(void)
{
        return native_save_fl();
}

/* Non-paravirt variant: forwards to native_irq_disable(). */
static __always_inline void arch_local_irq_disable(void)
{
        native_irq_disable();
}

/* Non-paravirt variant: forwards to native_irq_enable(). */
static __always_inline void arch_local_irq_enable(void)
{
        native_irq_enable();
}

/*
 * Used in the idle loop; sti takes one instruction cycle
 * to complete:
 *
 * Non-paravirt variant: forwards to native_safe_halt().
 */
static __always_inline void arch_safe_halt(void)
{
        native_safe_halt();
}

/*
 * Used when interrupts are already enabled or to
 * shutdown the processor:
 *
 * Non-paravirt variant: forwards to native_halt().
 */
static __always_inline void halt(void)
{
        native_halt();
}

/*
 * For spinlocks, etc:
 */
static __always_inline unsigned long arch_local_irq_save(void)
{
        /* Snapshot the flags first, then disable interrupts. */
        unsigned long saved = arch_local_save_flags();

        arch_local_irq_disable();

        return saved;
}
#else

#ifdef CONFIG_X86_64
#ifdef CONFIG_DEBUG_ENTRY
/* Assembly-side helper: push the flags and pop them into %rax. */
#define SAVE_FLAGS                pushfq; popq %rax
#endif

#endif

#endif /* __ASSEMBLY__ */
#endif /* CONFIG_PARAVIRT_XXL */

#ifndef __ASSEMBLY__
/* Return 1 when X86_EFLAGS_IF is clear in @flags, 0 when it is set. */
static __always_inline int arch_irqs_disabled_flags(unsigned long flags)
{
        return (flags & X86_EFLAGS_IF) == 0;
}

/* Read the current flags and report whether they have interrupts disabled. */
static __always_inline int arch_irqs_disabled(void)
{
        return arch_irqs_disabled_flags(arch_local_save_flags());
}

/*
 * Restore the interrupt state saved in @flags.  Only re-enables
 * interrupts: if @flags had them disabled, nothing is done.
 */
static __always_inline void arch_local_irq_restore(unsigned long flags)
{
        if (arch_irqs_disabled_flags(flags))
                return;

        arch_local_irq_enable();
}
#endif /* !__ASSEMBLY__ */

#endif










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
























































    1 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright Red Hat Inc. 2017
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions implement sctp stream message interleaving, mostly
 * including I-DATA and I-FORWARD-TSN chunks process.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    Xin Long <lucien.xin@gmail.com>
 */

#include <net/busy_poll.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <net/sctp/ulpevent.h>
#include <linux/sctp.h>

/*
 * Build an I-DATA chunk with room for @len payload bytes.  The I-DATA
 * header added here carries only the stream id taken from @sinfo; every
 * other header field is zero.  A copy of @sinfo is stored in the chunk.
 * Returns NULL when sctp_make_idata() fails.
 */
static struct sctp_chunk *sctp_make_idatafrag_empty(
                                        const struct sctp_association *asoc,
                                        const struct sctp_sndrcvinfo *sinfo,
                                        int len, __u8 flags, gfp_t gfp)
{
        struct sctp_idatahdr hdr;
        struct sctp_chunk *chunk;

        memset(&hdr, 0, sizeof(hdr));
        hdr.stream = htons(sinfo->sinfo_stream);

        /* Carry the sender's unordered request over into the chunk flags. */
        if (sinfo->sinfo_flags & SCTP_UNORDERED)
                flags |= SCTP_DATA_UNORDERED;

        chunk = sctp_make_idata(asoc, flags, sizeof(hdr) + len, gfp);
        if (chunk) {
                chunk->subh.idata_hdr = sctp_addto_chunk(chunk, sizeof(hdr),
                                                         &hdr);
                memcpy(&chunk->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo));
        }

        return chunk;
}

/*
 * Fill in the MID and the PPID/FSN of every fragment of the message that
 * @chunk belongs to.  Does nothing if @chunk already has its MID.
 *
 * The first fragment gets the PPID; every other fragment gets the next
 * FSN, counted from 0.  All fragments get the stream's current outgoing
 * MID (ordered or unordered counter, per the fragment's flags); the
 * counter is advanced only on the fragment flagged as last.
 */
static void sctp_chunk_assign_mid(struct sctp_chunk *chunk)
{
        struct sctp_stream *stream;
        struct sctp_chunk *frag;
        __u32 next_fsn = 0;
        __u16 sid;

        if (chunk->has_mid)
                return;

        stream = &chunk->asoc->stream;
        sid = sctp_chunk_stream_no(chunk);

        list_for_each_entry(frag, &chunk->msg->chunks, frag_list) {
                struct sctp_idatahdr *hdr = frag->subh.idata_hdr;
                __u32 mid;

                frag->has_mid = 1;

                if (frag->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) {
                        hdr->ppid = frag->sinfo.sinfo_ppid;
                } else {
                        hdr->fsn = htonl(next_fsn);
                        next_fsn++;
                }

                if (frag->chunk_hdr->flags & SCTP_DATA_UNORDERED) {
                        if (frag->chunk_hdr->flags & SCTP_DATA_LAST_FRAG)
                                mid = sctp_mid_uo_next(stream, out, sid);
                        else
                                mid = sctp_mid_uo_peek(stream, out, sid);
                } else {
                        if (frag->chunk_hdr->flags & SCTP_DATA_LAST_FRAG)
                                mid = sctp_mid_next(stream, out, sid);
                        else
                                mid = sctp_mid_peek(stream, out, sid);
                }

                hdr->mid = htonl(mid);
        }
}

/*
 * Accept @chunk only if it is a DATA chunk.  Unordered DATA is always
 * accepted; ordered DATA is rejected when its SSN is behind the stream's
 * current inbound SSN.
 */
static bool sctp_validate_data(struct sctp_chunk *chunk)
{
        struct sctp_stream *stream;
        __u16 sid, ssn;

        if (chunk->chunk_hdr->type != SCTP_CID_DATA)
                return false;
        if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
                return true;

        stream = &chunk->asoc->stream;
        sid = sctp_chunk_stream_no(chunk);
        ssn = ntohs(chunk->subh.data_hdr->ssn);

        if (SSN_lt(ssn, sctp_ssn_peek(stream, in, sid)))
                return false;

        return true;
}

/*
 * Accept @chunk only if it is an I-DATA chunk.  Unordered I-DATA is always
 * accepted; ordered I-DATA is rejected when its MID is behind the stream's
 * current inbound MID.
 */
static bool sctp_validate_idata(struct sctp_chunk *chunk)
{
        struct sctp_stream *stream;
        __u32 mid;
        __u16 sid;

        if (chunk->chunk_hdr->type != SCTP_CID_I_DATA)
                return false;
        if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
                return true;

        stream = &chunk->asoc->stream;
        sid = sctp_chunk_stream_no(chunk);
        mid = ntohl(chunk->subh.idata_hdr->mid);

        if (MID_lt(mid, sctp_mid_peek(stream, in, sid)))
                return false;

        return true;
}

static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq,
                                  struct sctp_ulpevent *event)
{
        struct sctp_ulpevent *cevent;
        struct sk_buff *pos, *loc;

        pos = skb_peek_tail(&ulpq->reasm);
        if (!pos) {
                __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
                return;
        }

        cevent = sctp_skb2event(pos);

        if (event->stream == cevent->stream &&
            event->mid == cevent->mid &&
            (cevent->msg_flags & SCTP_DATA_FIRST_FRAG ||
             (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) &&
              event->fsn > cevent->fsn))) {
                __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
                return;
        }

        if ((event->stream == cevent->stream &&
             MID_lt(cevent->mid, event->mid)) ||
            event->stream > cevent->stream) {
                __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
                return;
        }

        loc = NULL;
        skb_queue_walk(&ulpq->reasm, pos) {
                cevent = sctp_skb2event(pos);

                if (event->stream < cevent->stream ||
                    (event->stream == cevent->stream &&
                     MID_lt(event->mid, cevent->mid))) {
                        loc = pos;
                        break;
                }
                if (event->stream == cevent->stream &&
                    event->mid == cevent->mid &&
                    !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) &&
                    (event->msg_flags & SCTP_DATA_FIRST_FRAG ||
                     event->fsn < cevent->fsn)) {
                        loc = pos;
                        break;
                }
        }

        if (!loc)
                __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event));
        else
                __skb_queue_before(&ulpq->reasm, loc, sctp_event2skb(event));
}

/*
 * Continue a partial delivery on event->stream.
 *
 * Scans ulpq->reasm for the message whose MID equals the stream's current
 * inbound MID (sin->mid) and collects the run of consecutive fragments
 * that starts at the expected FSN (sin->fsn).  The run is handed to
 * sctp_make_reassembled_event().
 *
 * Returns NULL when no fragment with the expected FSN is queued, or when
 * sctp_make_reassembled_event() returns NULL.  On success sin->fsn becomes
 * the next expected FSN (0 once the last fragment is included); if the
 * last fragment is included, MSG_EOR is set on the result and
 * sin->pd_mode is cleared.
 */
static struct sctp_ulpevent *sctp_intl_retrieve_partial(
                                                struct sctp_ulpq *ulpq,
                                                struct sctp_ulpevent *event)
{
        struct sk_buff *first_frag = NULL;
        struct sk_buff *last_frag = NULL;
        struct sctp_ulpevent *retval;
        struct sctp_stream_in *sin;
        struct sk_buff *pos;
        __u32 next_fsn = 0;
        int is_last = 0;

        sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);

        skb_queue_walk(&ulpq->reasm, pos) {
                struct sctp_ulpevent *cevent = sctp_skb2event(pos);

                /* Skip entries belonging to lower-numbered streams. */
                if (cevent->stream < event->stream)
                        continue;

                /* Stop at a higher stream or at any other message. */
                if (cevent->stream > event->stream ||
                    cevent->mid != sin->mid)
                        break;

                switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
                case SCTP_DATA_FIRST_FRAG:
                        /* A first fragment ends the scan immediately. */
                        goto out;
                case SCTP_DATA_MIDDLE_FRAG:
                        if (!first_frag) {
                                /* The run starts only at the expected FSN. */
                                if (cevent->fsn == sin->fsn) {
                                        first_frag = pos;
                                        last_frag = pos;
                                        next_fsn = cevent->fsn + 1;
                                }
                        } else if (cevent->fsn == next_fsn) {
                                last_frag = pos;
                                next_fsn++;
                        } else {
                                /* Gap in the FSN sequence: deliver what we have. */
                                goto out;
                        }
                        break;
                case SCTP_DATA_LAST_FRAG:
                        if (!first_frag) {
                                if (cevent->fsn == sin->fsn) {
                                        first_frag = pos;
                                        last_frag = pos;
                                        next_fsn = 0;
                                        is_last = 1;
                                }
                        } else if (cevent->fsn == next_fsn) {
                                last_frag = pos;
                                next_fsn = 0;
                                is_last = 1;
                        }
                        /* Nothing of this message can follow its last fragment. */
                        goto out;
                default:
                        goto out;
                }
        }

out:
        if (!first_frag)
                return NULL;

        retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm,
                                             first_frag, last_frag);
        if (retval) {
                sin->fsn = next_fsn;
                if (is_last) {
                        retval->msg_flags |= MSG_EOR;
                        sin->pd_mode = 0;
                }
        }

        return retval;
}

/*
 * Try to deliver the message identified by event->stream / event->mid.
 *
 * Scans ulpq->reasm for that message's fragments.  If a complete run
 * (first fragment, consecutive FSNs, last fragment) is queued, it is
 * reassembled and returned with MSG_EOR set.
 *
 * Otherwise, if the message's MID equals the stream's current inbound MID
 * (sin->mid) and the contiguous fragments starting at its first fragment
 * total at least the socket's pd_point bytes (pd_point being non-zero),
 * that prefix is reassembled and returned; sin->fsn is set to the next
 * expected FSN and sin->pd_mode to 1.
 *
 * Returns NULL when neither case applies or reassembly fails.
 */
static struct sctp_ulpevent *sctp_intl_retrieve_reassembled(
                                                struct sctp_ulpq *ulpq,
                                                struct sctp_ulpevent *event)
{
        struct sctp_association *asoc = ulpq->asoc;
        struct sk_buff *pos, *first_frag = NULL;
        struct sctp_ulpevent *retval = NULL;
        struct sk_buff *pd_first = NULL;
        struct sk_buff *pd_last = NULL;
        struct sctp_stream_in *sin;
        __u32 next_fsn = 0;
        __u32 pd_point = 0;
        __u32 pd_len = 0;
        __u32 mid = 0;

        sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);

        skb_queue_walk(&ulpq->reasm, pos) {
                struct sctp_ulpevent *cevent = sctp_skb2event(pos);

                /* Restrict the scan to entries on event->stream ... */
                if (cevent->stream < event->stream)
                        continue;
                if (cevent->stream > event->stream)
                        break;

                /* ... and, within that stream, to entries with event->mid. */
                if (MID_lt(cevent->mid, event->mid))
                        continue;
                if (MID_lt(event->mid, cevent->mid))
                        break;

                switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
                case SCTP_DATA_FIRST_FRAG:
                        /* Track a partial-delivery candidate only for sin->mid. */
                        if (cevent->mid == sin->mid) {
                                pd_first = pos;
                                pd_last = pos;
                                pd_len = pos->len;
                        }

                        first_frag = pos;
                        next_fsn = 0;
                        mid = cevent->mid;
                        break;

                case SCTP_DATA_MIDDLE_FRAG:
                        if (first_frag && cevent->mid == mid &&
                            cevent->fsn == next_fsn) {
                                next_fsn++;
                                if (pd_first) {
                                        pd_last = pos;
                                        pd_len += pos->len;
                                }
                        } else {
                                /* Out of sequence: the complete-message run is broken. */
                                first_frag = NULL;
                        }
                        break;

                case SCTP_DATA_LAST_FRAG:
                        if (first_frag && cevent->mid == mid &&
                            cevent->fsn == next_fsn)
                                goto found;
                        else
                                first_frag = NULL;
                        break;
                }
        }

        /* No complete message; fall back to partial delivery if possible. */
        if (!pd_first)
                goto out;

        pd_point = sctp_sk(asoc->base.sk)->pd_point;
        if (pd_point && pd_point <= pd_len) {
                retval = sctp_make_reassembled_event(asoc->base.net,
                                                     &ulpq->reasm,
                                                     pd_first, pd_last);
                if (retval) {
                        sin->fsn = next_fsn;
                        sin->pd_mode = 1;
                }
        }
        goto out;

found:
        /* pos is the last fragment of a complete first..last run. */
        retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm,
                                             first_frag, pos);
        if (retval)
                retval->msg_flags |= MSG_EOR;

out:
        return retval;
}

/*
 * Reassembly entry point for an incoming I-DATA event.
 *
 * An unfragmented event is returned as is with MSG_EOR set.  A fragment
 * is queued on ulpq->reasm; if the stream is in partial-delivery mode and
 * the fragment is the one expected next, partial retrieval is tried
 * first.  If that yields nothing, retrieval of a reassembled message is
 * attempted.  Returns the event to deliver, or NULL.
 */
static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq,
                                             struct sctp_ulpevent *event)
{
        struct sctp_ulpevent *ready = NULL;
        struct sctp_stream_in *sin;

        if ((event->msg_flags & SCTP_DATA_FRAG_MASK) == SCTP_DATA_NOT_FRAG) {
                event->msg_flags |= MSG_EOR;
                return event;
        }

        sctp_intl_store_reasm(ulpq, event);

        sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);
        if (sin->pd_mode && sin->mid == event->mid &&
            sin->fsn == event->fsn)
                ready = sctp_intl_retrieve_partial(ulpq, event);

        if (ready)
                return ready;

        return sctp_intl_retrieve_reassembled(ulpq, event);
}

static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq,
                                    struct sctp_ulpevent *event)
{
        struct sctp_ulpevent *cevent;
        struct sk_buff *pos, *loc;

        pos = skb_peek_tail(&ulpq->lobby);
        if (!pos) {
                __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
                return;
        }

        cevent = (struct sctp_ulpevent *)pos->cb;
        if (event->stream == cevent->stream &&
            MID_lt(cevent->mid, event->mid)) {
                __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
                return;
        }

        if (event->stream > cevent->stream) {
                __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
                return;
        }

        loc = NULL;
        skb_queue_walk(&ulpq->lobby, pos) {
                cevent = (struct sctp_ulpevent *)pos->cb;

                if (cevent->stream > event->stream) {
                        loc = pos;
                        break;
                }
                if (cevent->stream == event->stream &&
                    MID_lt(event->mid, cevent->mid)) {
                        loc = pos;
                        break;
                }
        }

        if (!loc)
                __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event));
        else
                __skb_queue_before(&ulpq->lobby, loc, sctp_event2skb(event));
}

/* Move every lobby entry of @event's stream that is next in MID sequence
 * onto the list that currently holds @event.
 *
 * The destination list is obtained by casting the ->prev link of @event's
 * skb to a queue head; presumably callers guarantee @event is the first
 * skb on its list so that ->prev really is the head (verify at call
 * sites).  For each moved entry the stream's expected inbound MID is
 * advanced with sctp_mid_next().
 */
static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq,
                                       struct sctp_ulpevent *event)
{
        struct sctp_stream *stream = &ulpq->asoc->stream;
        struct sk_buff_head *lobby = &ulpq->lobby;
        struct sk_buff_head *out_list;
        struct sctp_ulpevent *queued;
        struct sk_buff *skb, *next;
        __u16 sid = event->stream;

        out_list = (struct sk_buff_head *)sctp_event2skb(event)->prev;

        sctp_skb_for_each(skb, lobby, next) {
                queued = (struct sctp_ulpevent *)skb->cb;

                /* Entries of lower-numbered streams come first; skip them. */
                if (queued->stream < sid)
                        continue;

                /* Past our stream, or a gap in the MID sequence: done. */
                if (queued->stream > sid ||
                    queued->mid != sctp_mid_peek(stream, in, sid))
                        break;

                sctp_mid_next(stream, in, sid);
                __skb_unlink(skb, lobby);
                __skb_queue_tail(out_list, skb);
        }
}

/* Apply per-stream MID ordering to a complete message.
 *
 * If @event carries the MID the stream expects next, the expected MID is
 * advanced, any directly following messages are pulled out of the lobby
 * behind it, and @event is returned.  Otherwise @event is parked in the
 * lobby and NULL is returned.
 */
static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq,
                                             struct sctp_ulpevent *event)
{
        struct sctp_stream *stream = &ulpq->asoc->stream;
        __u16 sid = event->stream;

        if (event->mid == sctp_mid_peek(stream, in, sid)) {
                sctp_mid_next(stream, in, sid);
                sctp_intl_retrieve_ordered(ulpq, event);
                return event;
        }

        sctp_intl_store_ordered(ulpq, event);
        return NULL;
}

/* Hand a list of events to the socket's receive queue.
 *
 * The decision is taken from the first event on @skb_list.  The whole
 * list is purged (return 0) when the socket has RCV_SHUTDOWN set and
 * either SEND_SHUTDOWN is also set or the event is not a notification,
 * or when the event is not enabled by the association's subscription.
 * Otherwise the list is spliced onto sk_receive_queue, the socket is
 * signalled once via sk_data_ready(), and 1 is returned.
 */
static int sctp_enqueue_event(struct sctp_ulpq *ulpq,
                              struct sk_buff_head *skb_list)
{
        struct sock *sk = ulpq->asoc->base.sk;
        struct sctp_sock *sp = sctp_sk(sk);
        struct sk_buff *head_skb = __skb_peek(skb_list);
        struct sctp_ulpevent *event = sctp_skb2event(head_skb);

        if ((sk->sk_shutdown & RCV_SHUTDOWN) &&
            ((sk->sk_shutdown & SEND_SHUTDOWN) ||
             !sctp_ulpevent_is_notification(event))) {
                sctp_queue_purge_ulpevents(skb_list);
                return 0;
        }

        /* Only real data updates the NAPI id / incoming CPU hints. */
        if (!sctp_ulpevent_is_notification(event)) {
                sk_mark_napi_id(sk, head_skb);
                sk_incoming_cpu_update(sk);
        }

        if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) {
                sctp_queue_purge_ulpevents(skb_list);
                return 0;
        }

        skb_queue_splice_tail_init(skb_list, &sk->sk_receive_queue);

        if (!sp->data_ready_signalled) {
                sp->data_ready_signalled = 1;
                sk->sk_data_ready(sk);
        }

        return 1;
}

/* Insert an unordered fragment into ulpq->reasm_uo.
 *
 * The insertion keeps the queue sorted by stream id, then by MID (using
 * MID_lt()), and within one message with the FIRST fragment ahead of the
 * others, which are ordered by FSN.
 */
static void sctp_intl_store_reasm_uo(struct sctp_ulpq *ulpq,
                                     struct sctp_ulpevent *event)
{
        struct sctp_ulpevent *cevent;
        struct sk_buff *pos;

        /* Empty queue: nothing to compare against. */
        pos = skb_peek_tail(&ulpq->reasm_uo);
        if (!pos) {
                __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event));
                return;
        }

        cevent = sctp_skb2event(pos);

        /* Same message as the tail: append when the tail is that message's
         * FIRST fragment, or when this is a non-FIRST fragment with a
         * larger FSN than the tail.
         */
        if (event->stream == cevent->stream &&
            event->mid == cevent->mid &&
            (cevent->msg_flags & SCTP_DATA_FIRST_FRAG ||
             (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) &&
              event->fsn > cevent->fsn))) {
                __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event));
                return;
        }

        /* Later MID on the tail's stream, or a higher stream: append. */
        if ((event->stream == cevent->stream &&
             MID_lt(cevent->mid, event->mid)) ||
            event->stream > cevent->stream) {
                __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event));
                return;
        }

        /* Find the first queued entry that must come after this fragment. */
        skb_queue_walk(&ulpq->reasm_uo, pos) {
                cevent = sctp_skb2event(pos);

                if (event->stream < cevent->stream ||
                    (event->stream == cevent->stream &&
                     MID_lt(event->mid, cevent->mid)))
                        break;

                if (event->stream == cevent->stream &&
                    event->mid == cevent->mid &&
                    !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) &&
                    (event->msg_flags & SCTP_DATA_FIRST_FRAG ||
                     event->fsn < cevent->fsn))
                        break;
        }

        /* pos is the entry found above, or whatever the walk left it at
         * when no entry matched.
         */
        __skb_queue_before(&ulpq->reasm_uo, pos, sctp_event2skb(event));
}

/* Continue an unordered partial delivery on @event's stream.
 *
 * Scans ulpq->reasm_uo for fragments of the message sin->mid_uo and
 * collects the run of consecutive fragments that starts at FSN
 * sin->fsn_uo.  Returns a reassembled event covering that run, or NULL
 * if no fragment with the awaited FSN was found (or the reassembly
 * helper returned NULL).  On success sin->fsn_uo is updated; when the
 * run ends with the LAST fragment the event gets MSG_EOR and
 * partial-delivery mode is cleared.
 */
static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo(
                                                struct sctp_ulpq *ulpq,
                                                struct sctp_ulpevent *event)
{
        struct sk_buff *first_frag = NULL;
        struct sk_buff *last_frag = NULL;
        struct sctp_ulpevent *retval;
        struct sctp_stream_in *sin;
        struct sk_buff *pos;
        __u32 next_fsn = 0;
        int is_last = 0;

        sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);

        skb_queue_walk(&ulpq->reasm_uo, pos) {
                struct sctp_ulpevent *cevent = sctp_skb2event(pos);

                /* Narrow the walk to this stream ... */
                if (cevent->stream < event->stream)
                        continue;
                if (cevent->stream > event->stream)
                        break;

                /* ... and to the message currently being delivered. */
                if (MID_lt(cevent->mid, sin->mid_uo))
                        continue;
                if (MID_lt(sin->mid_uo, cevent->mid))
                        break;

                switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
                case SCTP_DATA_FIRST_FRAG:
                        goto out;
                case SCTP_DATA_MIDDLE_FRAG:
                        if (!first_frag) {
                                /* The run starts at the awaited FSN. */
                                if (cevent->fsn == sin->fsn_uo) {
                                        first_frag = pos;
                                        last_frag = pos;
                                        next_fsn = cevent->fsn + 1;
                                }
                        } else if (cevent->fsn == next_fsn) {
                                last_frag = pos;
                                next_fsn++;
                        } else {
                                /* Gap in the FSN sequence ends the run. */
                                goto out;
                        }
                        break;
                case SCTP_DATA_LAST_FRAG:
                        if (!first_frag) {
                                if (cevent->fsn == sin->fsn_uo) {
                                        first_frag = pos;
                                        last_frag = pos;
                                        next_fsn = 0;
                                        is_last = 1;
                                }
                        } else if (cevent->fsn == next_fsn) {
                                last_frag = pos;
                                next_fsn = 0;
                                is_last = 1;
                        }
                        /* Nothing can follow the LAST fragment. */
                        goto out;
                default:
                        goto out;
                }
        }

out:
        if (!first_frag)
                return NULL;

        retval = sctp_make_reassembled_event(ulpq->asoc->base.net,
                                             &ulpq->reasm_uo, first_frag,
                                             last_frag);
        if (retval) {
                /* Remember where the next partial delivery must resume. */
                sin->fsn_uo = next_fsn;
                if (is_last) {
                        retval->msg_flags |= MSG_EOR;
                        sin->pd_mode_uo = 0;
                }
        }

        return retval;
}

/* Try to assemble the unordered message that @event belongs to.
 *
 * Scans ulpq->reasm_uo for fragments with @event's stream and MID.  If a
 * gap-free FIRST..LAST run is present, the whole message is returned
 * with MSG_EOR set.  Otherwise, if the stream is not already in
 * unordered partial-delivery mode and the contiguous prefix starting at
 * the FIRST fragment is at least the socket's pd_point bytes long
 * (pd_point non-zero), that prefix is returned and the stream enters
 * partial-delivery mode.  Returns NULL when neither applies.
 */
static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo(
                                                struct sctp_ulpq *ulpq,
                                                struct sctp_ulpevent *event)
{
        struct sctp_association *asoc = ulpq->asoc;
        struct sk_buff *pos, *first_frag = NULL;
        struct sctp_ulpevent *retval = NULL;
        struct sk_buff *pd_first = NULL;
        struct sk_buff *pd_last = NULL;
        struct sctp_stream_in *sin;
        __u32 next_fsn = 0;
        __u32 pd_point = 0;
        __u32 pd_len = 0;
        __u32 mid = 0;

        sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);

        skb_queue_walk(&ulpq->reasm_uo, pos) {
                struct sctp_ulpevent *cevent = sctp_skb2event(pos);

                /* Only fragments of @event's stream and MID are relevant. */
                if (cevent->stream < event->stream)
                        continue;
                if (cevent->stream > event->stream)
                        break;

                if (MID_lt(cevent->mid, event->mid))
                        continue;
                if (MID_lt(event->mid, cevent->mid))
                        break;

                switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
                case SCTP_DATA_FIRST_FRAG:
                        /* Track a partial-delivery candidate only while the
                         * stream is not already delivering partially; note
                         * that mid_uo is recorded here, before it is known
                         * whether partial delivery will actually start.
                         */
                        if (!sin->pd_mode_uo) {
                                sin->mid_uo = cevent->mid;
                                pd_first = pos;
                                pd_last = pos;
                                pd_len = pos->len;
                        }

                        first_frag = pos;
                        next_fsn = 0;
                        mid = cevent->mid;
                        break;

                case SCTP_DATA_MIDDLE_FRAG:
                        if (first_frag && cevent->mid == mid &&
                            cevent->fsn == next_fsn) {
                                next_fsn++;
                                if (pd_first) {
                                        pd_last = pos;
                                        pd_len += pos->len;
                                }
                        } else {
                                /* Gap: a complete message is not possible. */
                                first_frag = NULL;
                        }
                        break;

                case SCTP_DATA_LAST_FRAG:
                        if (first_frag && cevent->mid == mid &&
                            cevent->fsn == next_fsn)
                                goto found;
                        else
                                first_frag = NULL;
                        break;
                }
        }

        if (!pd_first)
                goto out;

        /* No complete message: fall back to partial delivery if the
         * contiguous prefix has reached the socket's pd_point.
         */
        pd_point = sctp_sk(asoc->base.sk)->pd_point;
        if (pd_point && pd_point <= pd_len) {
                retval = sctp_make_reassembled_event(asoc->base.net,
                                                     &ulpq->reasm_uo,
                                                     pd_first, pd_last);
                if (retval) {
                        sin->fsn_uo = next_fsn;
                        sin->pd_mode_uo = 1;
                }
        }
        goto out;

found:
        /* pos is the LAST fragment that completed the run. */
        retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo,
                                             first_frag, pos);
        if (retval)
                retval->msg_flags |= MSG_EOR;

out:
        return retval;
}

/* Run one unordered I-DATA event through reassembly.
 *
 * An unfragmented event is flagged MSG_EOR and returned as is.  Any
 * other event is stored in the unordered reassembly queue; if the stream
 * is in unordered partial-delivery mode and the event carries exactly
 * the recorded MID/FSN, the partial retriever is tried first, and the
 * full-message retriever is used whenever that yields nothing.
 */
static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq,
                                                struct sctp_ulpevent *event)
{
        struct sctp_ulpevent *partial;
        struct sctp_stream_in *sin;

        if ((event->msg_flags & SCTP_DATA_FRAG_MASK) == SCTP_DATA_NOT_FRAG) {
                event->msg_flags |= MSG_EOR;
                return event;
        }

        sctp_intl_store_reasm_uo(ulpq, event);

        sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);
        if (!sin->pd_mode_uo || event->mid != sin->mid_uo ||
            event->fsn != sin->fsn_uo)
                return sctp_intl_retrieve_reassembled_uo(ulpq, event);

        partial = sctp_intl_retrieve_partial_uo(ulpq, event);
        if (partial)
                return partial;

        return sctp_intl_retrieve_reassembled_uo(ulpq, event);
}

/* Start an unordered partial delivery from the head of ulpq->reasm_uo.
 *
 * Ignoring streams that are already in unordered partial-delivery mode,
 * picks the first FIRST fragment in the queue and extends it with the
 * consecutive MIDDLE fragments of the same stream and MID.  Returns the
 * reassembled prefix and puts the stream into partial-delivery mode, or
 * returns NULL when no FIRST fragment was found.
 */
static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq)
{
        struct sctp_stream_in *csin, *sin = NULL;
        struct sk_buff *first_frag = NULL;
        struct sk_buff *last_frag = NULL;
        struct sctp_ulpevent *retval;
        struct sk_buff *pos;
        __u32 next_fsn = 0;
        __u16 sid = 0;

        skb_queue_walk(&ulpq->reasm_uo, pos) {
                struct sctp_ulpevent *cevent = sctp_skb2event(pos);

                csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream);
                if (csin->pd_mode_uo)
                        continue;

                switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
                case SCTP_DATA_FIRST_FRAG:
                        /* A second FIRST fragment ends the current run. */
                        if (first_frag)
                                goto out;
                        first_frag = pos;
                        last_frag = pos;
                        next_fsn = 0;
                        sin = csin;
                        sid = cevent->stream;
                        sin->mid_uo = cevent->mid;
                        break;
                case SCTP_DATA_MIDDLE_FRAG:
                        if (!first_frag)
                                break;
                        if (cevent->stream == sid &&
                            cevent->mid == sin->mid_uo &&
                            cevent->fsn == next_fsn) {
                                next_fsn++;
                                last_frag = pos;
                        } else {
                                goto out;
                        }
                        break;
                case SCTP_DATA_LAST_FRAG:
                        /* The LAST fragment is never part of the prefix. */
                        if (first_frag)
                                goto out;
                        break;
                default:
                        break;
                }
        }

        if (!first_frag)
                return NULL;

        /* Every "goto out" above is taken with first_frag (and sin) set. */
out:
        retval = sctp_make_reassembled_event(ulpq->asoc->base.net,
                                             &ulpq->reasm_uo, first_frag,
                                             last_frag);
        if (retval) {
                sin->fsn_uo = next_fsn;
                sin->pd_mode_uo = 1;
        }

        return retval;
}

/* Turn an I-DATA chunk into a receive event and push it through
 * reassembly (and, for ordered data, MID ordering) towards the socket.
 *
 * Returns -ENOMEM if the event cannot be allocated, 1 if an event with
 * MSG_EOR set was handed to sctp_enqueue_event(), and 0 otherwise
 * (nothing deliverable yet, or only a partial message was handed over).
 */
static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq,
                               struct sctp_chunk *chunk, gfp_t gfp)
{
        struct sctp_ulpevent *event;
        struct sk_buff_head temp;
        int unordered;
        int eor;

        event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp);
        if (!event)
                return -ENOMEM;

        /* The FIRST fragment carries the PPID where later ones carry FSN. */
        event->mid = ntohl(chunk->subh.idata_hdr->mid);
        if (event->msg_flags & SCTP_DATA_FIRST_FRAG)
                event->ppid = chunk->subh.idata_hdr->ppid;
        else
                event->fsn = ntohl(chunk->subh.idata_hdr->fsn);

        /* Sample the U flag now: reassembly may hand back another event. */
        unordered = !!(event->msg_flags & SCTP_DATA_UNORDERED);
        if (unordered)
                event = sctp_intl_reasm_uo(ulpq, event);
        else
                event = sctp_intl_reasm(ulpq, event);
        if (!event)
                return 0;

        skb_queue_head_init(&temp);
        __skb_queue_tail(&temp, sctp_event2skb(event));

        /* Only complete ordered messages are subject to MID ordering. */
        if (!unordered && (event->msg_flags & MSG_EOR)) {
                event = sctp_intl_order(ulpq, event);
                if (!event)
                        return 0;
        }

        /* Read the flag before the event is handed off. */
        eor = (event->msg_flags & MSG_EOR) ? 1 : 0;
        sctp_enqueue_event(ulpq, &temp);

        return eor;
}

/* Start an ordered partial delivery from the head of ulpq->reasm.
 *
 * Ignoring streams that are already in partial-delivery mode, picks the
 * first FIRST fragment whose MID equals its stream's current ->mid and
 * extends it with the consecutive MIDDLE fragments of the same stream
 * and MID.  Returns the reassembled prefix and puts the stream into
 * partial-delivery mode, or returns NULL when no such FIRST fragment
 * was found.
 */
static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq)
{
        struct sctp_stream_in *csin, *sin = NULL;
        struct sk_buff *first_frag = NULL;
        struct sk_buff *last_frag = NULL;
        struct sctp_ulpevent *retval;
        struct sk_buff *pos;
        __u32 next_fsn = 0;
        __u16 sid = 0;

        skb_queue_walk(&ulpq->reasm, pos) {
                struct sctp_ulpevent *cevent = sctp_skb2event(pos);

                csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream);
                if (csin->pd_mode)
                        continue;

                switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) {
                case SCTP_DATA_FIRST_FRAG:
                        /* A second FIRST fragment ends the current run. */
                        if (first_frag)
                                goto out;
                        /* Only the message matching the stream's ->mid
                         * may start a partial delivery.
                         */
                        if (cevent->mid == csin->mid) {
                                first_frag = pos;
                                last_frag = pos;
                                next_fsn = 0;
                                sin = csin;
                                sid = cevent->stream;
                        }
                        break;
                case SCTP_DATA_MIDDLE_FRAG:
                        if (!first_frag)
                                break;
                        if (cevent->stream == sid &&
                            cevent->mid == sin->mid &&
                            cevent->fsn == next_fsn) {
                                next_fsn++;
                                last_frag = pos;
                        } else {
                                goto out;
                        }
                        break;
                case SCTP_DATA_LAST_FRAG:
                        /* The LAST fragment is never part of the prefix. */
                        if (first_frag)
                                goto out;
                        break;
                default:
                        break;
                }
        }

        if (!first_frag)
                return NULL;

        /* Every "goto out" above is taken with first_frag (and sin) set. */
out:
        retval = sctp_make_reassembled_event(ulpq->asoc->base.net,
                                             &ulpq->reasm, first_frag,
                                             last_frag);
        if (retval) {
                sin->fsn = next_fsn;
                sin->pd_mode = 1;
        }

        return retval;
}

/* Kick off partial delivery on every stream that allows it.
 *
 * For the ordered and then the unordered reassembly queue (each only if
 * non-empty on entry), repeatedly retrieve a deliverable message prefix
 * and enqueue it until the retriever returns NULL.  @gfp is not used.
 */
static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
{
        struct sctp_ulpevent *event;
        struct sk_buff_head temp;

        if (!skb_queue_empty(&ulpq->reasm)) {
                while ((event = sctp_intl_retrieve_first(ulpq)) != NULL) {
                        skb_queue_head_init(&temp);
                        __skb_queue_tail(&temp, sctp_event2skb(event));
                        sctp_enqueue_event(ulpq, &temp);
                }
        }

        if (!skb_queue_empty(&ulpq->reasm_uo)) {
                while ((event = sctp_intl_retrieve_first_uo(ulpq)) != NULL) {
                        skb_queue_head_init(&temp);
                        __skb_queue_tail(&temp, sctp_event2skb(event));
                        sctp_enqueue_event(ulpq, &temp);
                }
        }
}

/* Try to make room for @chunk by reneging on queued events.
 *
 * The space wanted is the chunk length minus the I-DATA chunk header.
 * Reneging is attempted only while the socket's receive queue is empty,
 * taking from the lobby first, then the ordered and the unordered
 * reassembly queues.  If enough was freed the chunk is processed, and
 * partial delivery is started when that did not return a positive value.
 */
static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
                               gfp_t gfp)
{
        struct sctp_association *asoc = ulpq->asoc;
        __u16 needed;
        __u32 freed = 0;

        needed = ntohs(chunk->chunk_hdr->length) -
                 sizeof(struct sctp_idata_chunk);

        if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) {
                freed = sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed);

                if (freed < needed)
                        freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm,
                                                       needed);

                if (freed < needed)
                        freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm_uo,
                                                       needed);
        }

        if (freed < needed)
                return;

        if (sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0)
                sctp_intl_start_pd(ulpq, gfp);
}

/* Queue an SCTP_PARTIAL_DELIVERY_ABORTED notification for one stream.
 *
 * Does nothing unless SCTP_PARTIAL_DELIVERY_EVENT is enabled in the
 * association's subscription, or if the notification cannot be
 * allocated.  The notification goes to the tail of the socket's receive
 * queue and the socket is signalled if it has not been already.
 */
static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid,
                                      __u32 mid, __u16 flags, gfp_t gfp)
{
        struct sock *sk = ulpq->asoc->base.sk;
        struct sctp_ulpevent *ev;
        struct sctp_sock *sp;

        if (!sctp_ulpevent_type_enabled(ulpq->asoc->subscribe,
                                        SCTP_PARTIAL_DELIVERY_EVENT))
                return;

        ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED,
                                      sid, mid, flags, gfp);
        if (!ev)
                return;

        __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev));

        sp = sctp_sk(sk);
        if (sp->data_ready_signalled)
                return;

        sp->data_ready_signalled = 1;
        sk->sk_data_ready(sk);
}

/* Deliver lobby entries of stream @sid that have become deliverable.
 *
 * First moves every lobby entry of @sid whose MID is below the stream's
 * expected inbound MID onto a temporary list.  If none was moved and the
 * walk stopped on a real entry, that entry is taken when it belongs to
 * @sid and carries exactly the expected MID.  Any in-sequence followers
 * are then pulled in and the temporary list is enqueued to the socket.
 */
static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
{
        struct sctp_stream *stream = &ulpq->asoc->stream;
        struct sctp_ulpevent *cevent, *event = NULL;
        struct sk_buff_head *lobby = &ulpq->lobby;
        struct sk_buff *pos, *tmp;
        struct sk_buff_head temp;
        __u16 csid;
        __u32 cmid;

        skb_queue_head_init(&temp);
        sctp_skb_for_each(pos, lobby, tmp) {
                cevent = (struct sctp_ulpevent *)pos->cb;
                csid = cevent->stream;
                cmid = cevent->mid;

                if (csid > sid)
                        break;

                if (csid < sid)
                        continue;

                /* Stop at the first entry that is not behind the
                 * expected MID.
                 */
                if (!MID_lt(cmid, sctp_mid_peek(stream, in, csid)))
                        break;

                __skb_unlink(pos, lobby);
                /* The first moved event later anchors the temp list. */
                if (!event)
                        event = sctp_skb2event(pos);

                __skb_queue_tail(&temp, pos);
        }

        /* pos equals the list head when the walk ran to completion. */
        if (!event && pos != (struct sk_buff *)lobby) {
                cevent = (struct sctp_ulpevent *)pos->cb;
                csid = cevent->stream;
                cmid = cevent->mid;

                if (csid == sid && cmid == sctp_mid_peek(stream, in, csid)) {
                        sctp_mid_next(stream, in, csid);
                        __skb_unlink(pos, lobby);
                        __skb_queue_tail(&temp, pos);
                        event = sctp_skb2event(pos);
                }
        }

        if (event) {
                sctp_intl_retrieve_ordered(ulpq, event);
                sctp_enqueue_event(ulpq, &temp);
        }
}

/* Abort partial delivery on every inbound stream and flush the queues.
 *
 * For each stream, an active unordered partial delivery is cleared and
 * reported with flag 0x1; an active ordered one is cleared, reported
 * with flag 0, its MID is skipped and whatever became deliverable in
 * the lobby is reaped.
 */
static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
{
        struct sctp_stream *stream = &ulpq->asoc->stream;
        struct sctp_stream_in *sin;
        __u16 sid = 0;

        while (sid < stream->incnt) {
                sin = SCTP_SI(stream, sid);

                if (sin->pd_mode_uo) {
                        sin->pd_mode_uo = 0;
                        sctp_intl_stream_abort_pd(ulpq, sid, sin->mid_uo,
                                                  0x1, gfp);
                }

                if (sin->pd_mode) {
                        /* Snapshot the MID before it is skipped below. */
                        __u32 mid = sin->mid;

                        sin->pd_mode = 0;
                        sctp_intl_stream_abort_pd(ulpq, sid, mid, 0, gfp);
                        sctp_mid_skip(stream, in, sid, mid);

                        sctp_intl_reap_ordered(ulpq, sid);
                }

                sid++;
        }

        /* intl abort pd happens only when all data needs to be cleaned */
        sctp_ulpq_flush(ulpq);
}

/* Find the entry of @skiplist that matches both @stream and @flags.
 *
 * Returns its index among the first @nskips entries, or the index at
 * which the scan ended when there is no match (@nskips for a positive
 * count, 0 otherwise).
 */
static inline int sctp_get_skip_pos(struct sctp_ifwdtsn_skip *skiplist,
                                    int nskips, __be16 stream, __u8 flags)
{
        int idx = 0;

        while (idx < nskips) {
                if (skiplist[idx].stream == stream &&
                    skiplist[idx].flags == flags)
                        break;
                idx++;
        }

        return idx;
}

#define SCTP_FTSN_U_BIT        0x1
/* Build an I-FORWARD-TSN chunk for abandoned chunks and queue it.
 *
 * @q:    outqueue whose ->abandoned list is examined
 * @ctsn: cumulative TSN acknowledged by the peer
 *
 * Does nothing unless the peer is PR-SCTP capable.  The association's
 * adv_peer_ack_point is first raised to @ctsn if it lags behind.  The
 * abandoned list is then walked: chunks at or below @ctsn are freed;
 * each chunk whose TSN is at most adv_peer_ack_point + 1 advances the
 * ack point and records a (stream, U-bit) -> MID skip entry, a later
 * chunk for the same pair overwriting the MID.  The walk stops at the
 * first chunk beyond that or once the skip array is full.
 *
 * If the ack point ended up ahead of @ctsn, an I-FORWARD-TSN chunk is
 * created and appended to the control chunk list.
 */
static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn)
{
        struct sctp_ifwdtsn_skip ftsn_skip_arr[10];
        /* Capacity is derived from the array so the two cannot diverge. */
        const int max_skips = sizeof(ftsn_skip_arr) / sizeof(ftsn_skip_arr[0]);
        struct sctp_association *asoc = q->asoc;
        struct sctp_chunk *ftsn_chunk = NULL;
        struct list_head *lchunk, *temp;
        int nskips = 0, skip_pos;
        struct sctp_chunk *chunk;
        __u32 tsn;

        if (!asoc->peer.prsctp_capable)
                return;

        if (TSN_lt(asoc->adv_peer_ack_point, ctsn))
                asoc->adv_peer_ack_point = ctsn;

        list_for_each_safe(lchunk, temp, &q->abandoned) {
                chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list);
                tsn = ntohl(chunk->subh.data_hdr->tsn);

                if (TSN_lte(tsn, ctsn)) {
                        /* Already covered by the cumulative ack. */
                        list_del_init(lchunk);
                        sctp_chunk_free(chunk);
                } else if (TSN_lte(tsn, asoc->adv_peer_ack_point + 1)) {
                        __be16 sid = chunk->subh.idata_hdr->stream;
                        __be32 mid = chunk->subh.idata_hdr->mid;
                        __u8 flags = 0;

                        if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
                                flags |= SCTP_FTSN_U_BIT;

                        asoc->adv_peer_ack_point = tsn;
                        /* Reuse the slot of this (stream, U-bit) pair if
                         * one exists; otherwise skip_pos == nskips and a
                         * new slot is taken.
                         */
                        skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips,
                                                     sid, flags);
                        ftsn_skip_arr[skip_pos].stream = sid;
                        ftsn_skip_arr[skip_pos].reserved = 0;
                        ftsn_skip_arr[skip_pos].flags = flags;
                        ftsn_skip_arr[skip_pos].mid = mid;
                        if (skip_pos == nskips)
                                nskips++;
                        if (nskips == max_skips)
                                break;
                } else {
                        break;
                }
        }

        /* TSNs wrap around, so compare with serial-number arithmetic like
         * the rest of this function rather than with a plain '>'.
         */
        if (TSN_lt(ctsn, asoc->adv_peer_ack_point))
                ftsn_chunk = sctp_make_ifwdtsn(asoc, asoc->adv_peer_ack_point,
                                               nskips, &ftsn_skip_arr[0]);

        if (ftsn_chunk) {
                list_add_tail(&ftsn_chunk->list, &q->control_chunk_list);
                SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS);
        }
}

/* Iterate @pos over the struct sctp_ifwdtsn_skip entries that follow the
 * I-FORWARD-TSN header of @chunk.  @end is the number of bytes available
 * after the header; the loop continues only while a whole entry still
 * fits before that limit.  The bound is computed with arithmetic on
 * void pointers.
 */
#define _sctp_walk_ifwdtsn(pos, chunk, end) \
        for (pos = (void *)(chunk->subh.ifwdtsn_hdr + 1); \
             (void *)pos <= (void *)(chunk->subh.ifwdtsn_hdr + 1) + (end) - \
                            sizeof(struct sctp_ifwdtsn_skip); pos++)

/* Walk every skip entry of @ch; the payload size is the chunk header's
 * length field minus sizeof(struct sctp_ifwdtsn_chunk).
 */
#define sctp_walk_ifwdtsn(pos, ch) \
        _sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \
                                        sizeof(struct sctp_ifwdtsn_chunk))

static bool sctp_validate_fwdtsn(struct sctp_chunk *chunk)
{
        struct sctp_fwdtsn_skip *skip;
        __u16 incnt;

        if (chunk->chunk_hdr->type != SCTP_CID_FWD_TSN)
                return false;

        incnt = chunk->asoc->stream.incnt;
        sctp_walk_fwdtsn(skip, chunk)
                if (ntohs(skip->stream) >= incnt)
                        return false;

        return true;
}

static bool sctp_validate_iftsn(struct sctp_chunk *chunk)
{
        struct sctp_ifwdtsn_skip *skip;
        __u16 incnt;

        if (chunk->chunk_hdr->type != SCTP_CID_I_FWD_TSN)
                return false;

        incnt = chunk->asoc->stream.incnt;
        sctp_walk_ifwdtsn(skip, chunk)
                if (ntohs(skip->stream) >= incnt)
                        return false;

        return true;
}

/* React to a FORWARD-TSN carrying new cumulative TSN @ftsn. */
static void sctp_report_fwdtsn(struct sctp_ulpq *ulpq, __u32 ftsn)
{
        struct sctp_association *asoc = ulpq->asoc;

        /* Move the Cumulative TSN Ack point ahead. */
        sctp_tsnmap_skip(&asoc->peer.tsn_map, ftsn);

        /* Purge the fragmentation queue. */
        sctp_ulpq_reasm_flushtsn(ulpq, ftsn);

        /* Abort any partial delivery that is in progress. */
        sctp_ulpq_abort_pd(ulpq, GFP_ATOMIC);
}

/* Drop every fragment whose TSN is at or below @ftsn from the ordered
 * and then the unordered reassembly queue.
 */
static void sctp_intl_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 ftsn)
{
        struct sk_buff_head *queues[2];
        struct sctp_ulpevent *ev;
        struct sk_buff *skb, *next;
        int i;

        queues[0] = &ulpq->reasm;
        queues[1] = &ulpq->reasm_uo;

        for (i = 0; i < 2; i++) {
                skb_queue_walk_safe(queues[i], skb, next) {
                        ev = sctp_skb2event(skb);

                        if (!TSN_lte(ev->tsn, ftsn))
                                continue;

                        __skb_unlink(skb, queues[i]);
                        sctp_ulpevent_free(ev);
                }
        }
}

/* React to an I-FORWARD-TSN carrying new cumulative TSN @ftsn. */
static void sctp_report_iftsn(struct sctp_ulpq *ulpq, __u32 ftsn)
{
        struct sctp_tsnmap *map = &ulpq->asoc->peer.tsn_map;

        /* Move the Cumulative TSN Ack point ahead. */
        sctp_tsnmap_skip(map, ftsn);

        /* Purge the fragmentation queues. */
        sctp_intl_reasm_flushtsn(ulpq, ftsn);

        /* Abort only when the skip covers all data seen so far. */
        if (ftsn != sctp_tsnmap_get_max_tsn_seen(map))
                return;

        sctp_intl_abort_pd(ulpq, GFP_ATOMIC);
}

static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk)
{
        struct sctp_fwdtsn_skip *skip;

        /* Walk through all the skipped SSNs */
        sctp_walk_fwdtsn(skip, chunk)
                sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn));
}

/* Apply one I-FORWARD-TSN skip entry to stream @sid.
 *
 * With the U bit set in @flags, only an unordered partial delivery whose
 * MID lies before @mid is aborted.  Otherwise, unless @mid is already
 * behind the stream's expected inbound MID, any ordered partial delivery
 * is aborted, the expected MID is moved past @mid and newly deliverable
 * lobby entries are reaped.
 */
static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid,
                           __u8 flags)
{
        struct sctp_stream *stream = &ulpq->asoc->stream;
        struct sctp_stream_in *sin = sctp_stream_in(stream, sid);

        if (flags & SCTP_FTSN_U_BIT) {
                if (!sin->pd_mode_uo || !MID_lt(sin->mid_uo, mid))
                        return;

                sin->pd_mode_uo = 0;
                sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, GFP_ATOMIC);
                return;
        }

        /* Stale entry: the stream has already moved beyond this MID. */
        if (MID_lt(mid, sctp_mid_peek(stream, in, sid)))
                return;

        if (sin->pd_mode) {
                sin->pd_mode = 0;
                sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x0, GFP_ATOMIC);
        }

        sctp_mid_skip(stream, in, sid, mid);
        sctp_intl_reap_ordered(ulpq, sid);
}

static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk)
{
        struct sctp_ifwdtsn_skip *skip;

        /* Walk through all the skipped MIDs and abort stream pd if possible */
        sctp_walk_ifwdtsn(skip, chunk)
                sctp_intl_skip(ulpq, ntohs(skip->stream),
                               ntohl(skip->mid), skip->flags);
}

static int do_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
{
        struct sk_buff_head temp;

        skb_queue_head_init(&temp);
        __skb_queue_tail(&temp, sctp_event2skb(event));
        return sctp_ulpq_tail_event(ulpq, &temp);
}

static struct sctp_stream_interleave sctp_stream_interleave_0 = {
        .data_chunk_len                = sizeof(struct sctp_data_chunk),
        .ftsn_chunk_len                = sizeof(struct sctp_fwdtsn_chunk),
        /* DATA process functions */
        .make_datafrag                = sctp_make_datafrag_empty,
        .assign_number                = sctp_chunk_assign_ssn,
        .validate_data                = sctp_validate_data,
        .ulpevent_data                = sctp_ulpq_tail_data,
        .enqueue_event                = do_ulpq_tail_event,
        .renege_events                = sctp_ulpq_renege,
        .start_pd                = sctp_ulpq_partial_delivery,
        .abort_pd                = sctp_ulpq_abort_pd,
        /* FORWARD-TSN process functions */
        .generate_ftsn                = sctp_generate_fwdtsn,
        .validate_ftsn                = sctp_validate_fwdtsn,
        .report_ftsn                = sctp_report_fwdtsn,
        .handle_ftsn                = sctp_handle_fwdtsn,
};

static int do_sctp_enqueue_event(struct sctp_ulpq *ulpq,
                                 struct sctp_ulpevent *event)
{
        struct sk_buff_head temp;

        skb_queue_head_init(&temp);
        __skb_queue_tail(&temp, sctp_event2skb(event));
        return sctp_enqueue_event(ulpq, &temp);
}

static struct sctp_stream_interleave sctp_stream_interleave_1 = {
        .data_chunk_len                = sizeof(struct sctp_idata_chunk),
        .ftsn_chunk_len                = sizeof(struct sctp_ifwdtsn_chunk),
        /* I-DATA process functions */
        .make_datafrag                = sctp_make_idatafrag_empty,
        .assign_number                = sctp_chunk_assign_mid,
        .validate_data                = sctp_validate_idata,
        .ulpevent_data                = sctp_ulpevent_idata,
        .enqueue_event                = do_sctp_enqueue_event,
        .renege_events                = sctp_renege_events,
        .start_pd                = sctp_intl_start_pd,
        .abort_pd                = sctp_intl_abort_pd,
        /* I-FORWARD-TSN process functions */
        .generate_ftsn                = sctp_generate_iftsn,
        .validate_ftsn                = sctp_validate_iftsn,
        .report_ftsn                = sctp_report_iftsn,
        .handle_ftsn                = sctp_handle_iftsn,
};

/*
 * Select the operations table for @stream.  The owning association is found
 * with container_of(); its peer.intl_capable flag picks
 * sctp_stream_interleave_1 when set and sctp_stream_interleave_0 otherwise.
 */
void sctp_stream_interleave_init(struct sctp_stream *stream)
{
        struct sctp_association *asoc =
                container_of(stream, struct sctp_association, stream);

        if (asoc->peer.intl_capable)
                stream->si = &sctp_stream_interleave_1;
        else
                stream->si = &sctp_stream_interleave_0;
}
























    2 























































    2 





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_BL_H
#define _LINUX_RCULIST_BL_H

/*
 * RCU-protected bl list version. See include/linux/list_bl.h.
 */
#include <linux/list_bl.h>
#include <linux/rcupdate.h>

/*
 * Store @n as the first node of @h via rcu_assign_pointer(), with the
 * LIST_BL_LOCKMASK bits set in the stored value.
 *
 * Two conditions are checked with LIST_BL_BUG_ON(): @n must have no
 * LIST_BL_LOCKMASK bits set, and the current h->first must have all of them
 * set.
 */
static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
                                        struct hlist_bl_node *n)
{
        const unsigned long addr = (unsigned long)n;

        LIST_BL_BUG_ON(addr & LIST_BL_LOCKMASK);
        LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) !=
                       LIST_BL_LOCKMASK);

        rcu_assign_pointer(h->first,
                           (struct hlist_bl_node *)(addr | LIST_BL_LOCKMASK));
}

/*
 * Read the first node of @h with rcu_dereference_check() (the check
 * expression is hlist_bl_is_locked(h)) and return it with the
 * LIST_BL_LOCKMASK bits cleared.
 */
static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
{
        unsigned long word;

        word = (unsigned long)rcu_dereference_check(h->first,
                                                    hlist_bl_is_locked(h));

        return (struct hlist_bl_node *)(word & ~LIST_BL_LOCKMASK);
}

/**
 * hlist_bl_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_bl_unhashed() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_bl_add_head_rcu()
 * or hlist_bl_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_bl_for_each_entry_rcu().
 */
static inline void hlist_bl_del_rcu(struct hlist_bl_node *n)
{
        __hlist_bl_del(n);
        /* Only the back pointer is poisoned; n->next is not written here. */
        n->pprev = LIST_POISON2;
}

/**
 * hlist_bl_add_head_rcu - add an element at the head of an hlist_bl
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist_bl,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_bl_add_head_rcu()
 * or hlist_bl_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_bl_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n,
                                        struct hlist_bl_head *h)
{
        struct hlist_bl_node *first;

        /* don't need hlist_bl_first_rcu because we're under lock */
        first = hlist_bl_first(h);

        /* Link @n completely before it is made visible through h->first. */
        n->next = first;
        if (first)
                first->pprev = &n->next;
        n->pprev = &h->first;

        /* need _rcu because we can have concurrent lock free readers */
        hlist_bl_set_first_rcu(h, n);
}
/**
 * hlist_bl_for_each_entry_rcu - iterate over rcu list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_bl_node to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_bl_node within the struct.
 *
 * Starts at hlist_bl_first_rcu(@head) and advances with
 * rcu_dereference_raw(@pos->next) until @pos is NULL.  @tpos is assigned
 * from @pos via hlist_bl_entry() before each iteration of the body.
 */
#define hlist_bl_for_each_entry_rcu(tpos, pos, head, member)                \
        for (pos = hlist_bl_first_rcu(head);                                \
                pos &&                                                        \
                ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \
                pos = rcu_dereference_raw(pos->next))

#endif




































   11 




    4 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NF_CONNTRACK_COMMON_H
#define _NF_CONNTRACK_COMMON_H

#include <linux/refcount.h>
#include <uapi/linux/netfilter/nf_conntrack_common.h>

/*
 * Conntrack statistics: a flat set of unsigned int counters.
 * NOTE(review): what each counter measures is inferred from its name only;
 * confirm against the code that increments it.
 */
struct ip_conntrack_stat {
        unsigned int found;
        unsigned int invalid;
        unsigned int insert;
        unsigned int insert_failed;
        unsigned int clash_resolve;
        unsigned int drop;
        unsigned int early_drop;
        unsigned int error;
        unsigned int expect_new;
        unsigned int expect_create;
        unsigned int expect_delete;
        unsigned int search_restart;
        unsigned int chaintoolong;
};

/* NFCT_INFOMASK selects the low three bits; NFCT_PTRMASK is its complement. */
#define NFCT_INFOMASK        (7UL)
#define NFCT_PTRMASK        (~NFCT_INFOMASK)

/*
 * Reference-counted base object: ->use is incremented by nf_conntrack_get()
 * and decremented by nf_conntrack_put().
 */
struct nf_conntrack {
        refcount_t use;
};

void nf_conntrack_destroy(struct nf_conntrack *nfct);

/*
 * Drop one reference on @nfct (a NULL @nfct is ignored).  When the count
 * reaches zero the object is handed to nf_conntrack_destroy().
 *
 * Like nf_ct_put, but without module dependency on nf_conntrack.
 */
static inline void nf_conntrack_put(struct nf_conntrack *nfct)
{
        if (!nfct)
                return;

        if (refcount_dec_and_test(&nfct->use))
                nf_conntrack_destroy(nfct);
}
/* Take one reference on @nfct; a NULL @nfct is ignored. */
static inline void nf_conntrack_get(struct nf_conntrack *nfct)
{
        if (!nfct)
                return;

        refcount_inc(&nfct->use);
}

#endif /* _NF_CONNTRACK_COMMON_H */

























































    2 









































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Integer base 2 logarithm calculation
 *
 * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_LOG2_H
#define _LINUX_LOG2_H

#include <linux/types.h>
#include <linux/bitops.h>

/*
 * non-constant log of base 2 calculators
 * - the arch may override these in asm/bitops.h if they can be implemented
 *   more efficiently than using fls() and fls64()
 * - the arch is not required to handle n==0 if implementing the fallback
 */
#ifndef CONFIG_ARCH_HAS_ILOG2_U32
/* Generic 32-bit fallback: one less than fls(n). */
static __always_inline __attribute__((const))
int __ilog2_u32(u32 n)
{
        const int top = fls(n);

        return top - 1;
}
#endif

#ifndef CONFIG_ARCH_HAS_ILOG2_U64
/* Generic 64-bit fallback: one less than fls64(n). */
static __always_inline __attribute__((const))
int __ilog2_u64(u64 n)
{
        const int top = fls64(n);

        return top - 1;
}
#endif

/**
 * is_power_of_2() - check if a value is a power of two
 * @n: the value to check
 *
 * Zero is *not* considered a power of two.  For any other value the test is
 * whether clearing the lowest set bit (n & (n - 1)) leaves nothing behind.
 * Return: true if @n is a power of 2, otherwise false.
 */
static inline __attribute__((const))
bool is_power_of_2(unsigned long n)
{
        if (n == 0)
                return false;

        return (n & (n - 1)) == 0;
}

/**
 * __roundup_pow_of_two() - round up to nearest power of two
 * @n: value to round up
 *
 * Computed as 1UL shifted left by fls_long(@n - 1).
 */
static inline __attribute__((const))
unsigned long __roundup_pow_of_two(unsigned long n)
{
        const unsigned long below = n - 1;

        return 1UL << fls_long(below);
}

/**
 * __rounddown_pow_of_two() - round down to nearest power of two
 * @n: value to round down
 *
 * Computed as 1UL shifted left by (fls_long(@n) - 1).
 */
static inline __attribute__((const))
unsigned long __rounddown_pow_of_two(unsigned long n)
{
        return 1UL << (fls_long(n) - 1);
}

/**
 * const_ilog2 - log base 2 of 32-bit or a 64-bit constant unsigned value
 * @n: parameter
 *
 * Use this where sparse expects a true constant expression, e.g. for array
 * indices.
 *
 * When @n is a compile-time constant the result is 0 for @n < 2, otherwise
 * the index of the highest set bit, found by testing bits 63 down to 2 in
 * turn and falling back to 1.  When @n is not a compile-time constant the
 * macro evaluates to -1.
 */
#define const_ilog2(n)                                \
(                                                \
        __builtin_constant_p(n) ? (                \
                (n) < 2 ? 0 :                        \
                (n) & (1ULL << 63) ? 63 :        \
                (n) & (1ULL << 62) ? 62 :        \
                (n) & (1ULL << 61) ? 61 :        \
                (n) & (1ULL << 60) ? 60 :        \
                (n) & (1ULL << 59) ? 59 :        \
                (n) & (1ULL << 58) ? 58 :        \
                (n) & (1ULL << 57) ? 57 :        \
                (n) & (1ULL << 56) ? 56 :        \
                (n) & (1ULL << 55) ? 55 :        \
                (n) & (1ULL << 54) ? 54 :        \
                (n) & (1ULL << 53) ? 53 :        \
                (n) & (1ULL << 52) ? 52 :        \
                (n) & (1ULL << 51) ? 51 :        \
                (n) & (1ULL << 50) ? 50 :        \
                (n) & (1ULL << 49) ? 49 :        \
                (n) & (1ULL << 48) ? 48 :        \
                (n) & (1ULL << 47) ? 47 :        \
                (n) & (1ULL << 46) ? 46 :        \
                (n) & (1ULL << 45) ? 45 :        \
                (n) & (1ULL << 44) ? 44 :        \
                (n) & (1ULL << 43) ? 43 :        \
                (n) & (1ULL << 42) ? 42 :        \
                (n) & (1ULL << 41) ? 41 :        \
                (n) & (1ULL << 40) ? 40 :        \
                (n) & (1ULL << 39) ? 39 :        \
                (n) & (1ULL << 38) ? 38 :        \
                (n) & (1ULL << 37) ? 37 :        \
                (n) & (1ULL << 36) ? 36 :        \
                (n) & (1ULL << 35) ? 35 :        \
                (n) & (1ULL << 34) ? 34 :        \
                (n) & (1ULL << 33) ? 33 :        \
                (n) & (1ULL << 32) ? 32 :        \
                (n) & (1ULL << 31) ? 31 :        \
                (n) & (1ULL << 30) ? 30 :        \
                (n) & (1ULL << 29) ? 29 :        \
                (n) & (1ULL << 28) ? 28 :        \
                (n) & (1ULL << 27) ? 27 :        \
                (n) & (1ULL << 26) ? 26 :        \
                (n) & (1ULL << 25) ? 25 :        \
                (n) & (1ULL << 24) ? 24 :        \
                (n) & (1ULL << 23) ? 23 :        \
                (n) & (1ULL << 22) ? 22 :        \
                (n) & (1ULL << 21) ? 21 :        \
                (n) & (1ULL << 20) ? 20 :        \
                (n) & (1ULL << 19) ? 19 :        \
                (n) & (1ULL << 18) ? 18 :        \
                (n) & (1ULL << 17) ? 17 :        \
                (n) & (1ULL << 16) ? 16 :        \
                (n) & (1ULL << 15) ? 15 :        \
                (n) & (1ULL << 14) ? 14 :        \
                (n) & (1ULL << 13) ? 13 :        \
                (n) & (1ULL << 12) ? 12 :        \
                (n) & (1ULL << 11) ? 11 :        \
                (n) & (1ULL << 10) ? 10 :        \
                (n) & (1ULL <<  9) ?  9 :        \
                (n) & (1ULL <<  8) ?  8 :        \
                (n) & (1ULL <<  7) ?  7 :        \
                (n) & (1ULL <<  6) ?  6 :        \
                (n) & (1ULL <<  5) ?  5 :        \
                (n) & (1ULL <<  4) ?  4 :        \
                (n) & (1ULL <<  3) ?  3 :        \
                (n) & (1ULL <<  2) ?  2 :        \
                1) :                                \
        -1)

/**
 * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value
 * @n: parameter
 *
 * constant-capable log of base 2 calculation
 * - this can be used to initialise global variables from constant data, hence
 * the massive ternary operator construction
 *
 * selects the appropriately-sized optimised version depending on sizeof(n)
 *
 * For a compile-time constant @n the result is 0 when @n < 2 and
 * 63 - __builtin_clzll(@n) otherwise.  For a non-constant @n the call goes
 * to __ilog2_u32() when sizeof(@n) <= 4 and to __ilog2_u64() otherwise.
 */
#define ilog2(n) \
( \
        __builtin_constant_p(n) ?        \
        ((n) < 2 ? 0 :                        \
         63 - __builtin_clzll(n)) :        \
        (sizeof(n) <= 4) ?                \
        __ilog2_u32(n) :                \
        __ilog2_u64(n)                        \
 )

/**
 * roundup_pow_of_two - round the given value up to nearest power of two
 * @n: parameter
 *
 * round the given value up to the nearest power of two
 * - the result is undefined when n == 0
 * - this can be used to initialise global variables from constant data
 *
 * For a compile-time constant @n the result is 1 when @n == 1 and
 * 1UL << (ilog2(@n - 1) + 1) otherwise; a non-constant @n is passed to
 * __roundup_pow_of_two().
 */
#define roundup_pow_of_two(n)                        \
(                                                \
        __builtin_constant_p(n) ? (                \
                ((n) == 1) ? 1 :                \
                (1UL << (ilog2((n) - 1) + 1))        \
                                   ) :                \
        __roundup_pow_of_two(n)                        \
 )

/**
 * rounddown_pow_of_two - round the given value down to nearest power of two
 * @n: parameter
 *
 * round the given value down to the nearest power of two
 * - the result is undefined when n == 0
 * - this can be used to initialise global variables from constant data
 *
 * For a compile-time constant @n the result is 1UL << ilog2(@n); a
 * non-constant @n is passed to __rounddown_pow_of_two().
 */
#define rounddown_pow_of_two(n)                        \
(                                                \
        __builtin_constant_p(n) ? (                \
                (1UL << ilog2(n))) :                \
        __rounddown_pow_of_two(n)                \
 )

/*
 * Run-time helper behind order_base_2(): 0 for @n of 0 or 1, otherwise
 * ilog2(@n - 1) + 1.
 */
static inline __attribute_const__
int __order_base_2(unsigned long n)
{
        if (n <= 1)
                return 0;

        return ilog2(n - 1) + 1;
}

/**
 * order_base_2 - calculate the (rounded up) base 2 order of the argument
 * @n: parameter
 *
 * The first few values calculated by this routine:
 *  ob2(0) = 0
 *  ob2(1) = 0
 *  ob2(2) = 1
 *  ob2(3) = 2
 *  ob2(4) = 2
 *  ob2(5) = 3
 *  ... and so on.
 *
 * A compile-time constant @n is evaluated inline (0 for 0 and 1, otherwise
 * ilog2(@n - 1) + 1); a non-constant @n is passed to __order_base_2().
 */
#define order_base_2(n)                                \
(                                                \
        __builtin_constant_p(n) ? (                \
                ((n) == 0 || (n) == 1) ? 0 :        \
                ilog2((n) - 1) + 1) :                \
        __order_base_2(n)                        \
)

/*
 * Run-time helper behind bits_per(): 1 for @n below 2, otherwise
 * order_base_2(@n), plus one more when @n is an exact power of two.
 */
static inline __attribute__((const))
int __bits_per(unsigned long n)
{
        int order;

        if (n < 2)
                return 1;

        order = order_base_2(n);
        if (is_power_of_2(n))
                order++;

        return order;
}

/**
 * bits_per - calculate the number of bits required for the argument
 * @n: parameter
 *
 * This is constant-capable and can be used for compile time
 * initializations, e.g bitfields.
 *
 * The first few values calculated by this routine:
 * bits_per(0) = 1
 * bits_per(1) = 1
 * bits_per(2) = 2
 * bits_per(3) = 2
 * bits_per(4) = 3
 * ... and so on.
 *
 * A compile-time constant @n is evaluated inline (1 for 0 and 1, otherwise
 * ilog2(@n) + 1); a non-constant @n is passed to __bits_per().
 */
#define bits_per(n)                                \
(                                                \
        __builtin_constant_p(n) ? (                \
                ((n) == 0 || (n) == 1)                \
                        ? 1 : ilog2(n) + 1        \
        ) :                                        \
        __bits_per(n)                                \
)
#endif /* _LINUX_LOG2_H */



















































































































    1 











































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Internal procfs definitions
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/binfmts.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>

struct ctl_table_header;
struct mempolicy;

/*
 * This is not completely implemented yet. The idea is to
 * create an in-memory tree (like the actual /proc filesystem
 * tree) of these proc_dir_entries, so that we can dynamically
 * add new files to /proc.
 *
 * parent/subdir are used for the directory structure (every /proc file has a
 * parent, but "subdir" is empty for all non-directory entries).
 * subdir_node is used to build the rb tree "subdir" of the parent.
 */
struct proc_dir_entry {
        /*
         * number of callers into module in progress;
         * negative -> it's going away RSN
         */
        atomic_t in_use;
        /* incremented by pde_get() */
        refcount_t refcnt;
        struct list_head pde_openers;        /* who did ->open, but not ->release */
        /* protects ->pde_openers and all struct pde_opener instances */
        spinlock_t pde_unload_lock;
        struct completion *pde_unload_completion;
        /* is_empty_pde() treats a directory with this pointer NULL as empty */
        const struct inode_operations *proc_iops;
        /* anonymous union: the two operation-table pointers share one slot */
        union {
                const struct proc_ops *proc_ops;
                const struct file_operations *proc_dir_ops;
        };
        /* pde_force_lookup() points this at proc_net_dentry_ops */
        const struct dentry_operations *proc_dops;
        /* anonymous union: seq_ops and single_show share one slot */
        union {
                const struct seq_operations *seq_ops;
                int (*single_show)(struct seq_file *, void *);
        };
        proc_write_t write;
        void *data;
        unsigned int state_size;
        unsigned int low_ino;
        nlink_t nlink;
        kuid_t uid;
        kgid_t gid;
        loff_t size;
        struct proc_dir_entry *parent;
        struct rb_root subdir;
        struct rb_node subdir_node;
        char *name;
        umode_t mode;
        /* PROC_ENTRY_PERMANENT is tested/set here by the pde_*_permanent() helpers */
        u8 flags;
        u8 namelen;
        /* flexible array member; see SIZEOF_PDE_INLINE_NAME */
        char inline_name[];
} __randomize_layout;

/*
 * SIZEOF_PDE: the first of 128, 192, 256 or 512 that is strictly greater
 * than sizeof(struct proc_dir_entry); evaluates to 0 when the structure is
 * 512 bytes or larger.
 *
 * SIZEOF_PDE_INLINE_NAME: the difference between SIZEOF_PDE and the structure
 * size.  NOTE(review): presumably the capacity of ->inline_name when entries
 * are allocated at SIZEOF_PDE bytes; confirm at the allocation site.
 */
#define SIZEOF_PDE        (                                \
        sizeof(struct proc_dir_entry) < 128 ? 128 :        \
        sizeof(struct proc_dir_entry) < 192 ? 192 :        \
        sizeof(struct proc_dir_entry) < 256 ? 256 :        \
        sizeof(struct proc_dir_entry) < 512 ? 512 :        \
        0)
#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry))

/* True when PROC_ENTRY_PERMANENT is set in @pde->flags. */
static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
{
        return (pde->flags & PROC_ENTRY_PERMANENT) != 0;
}

/* Set PROC_ENTRY_PERMANENT in @pde->flags; other flag bits are preserved. */
static inline void pde_make_permanent(struct proc_dir_entry *pde)
{
        pde->flags |= PROC_ENTRY_PERMANENT;
}

extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);

/*
 * Per-inode operation slot (embedded as proc_inode.op).  Exactly one member
 * is meaningful at a time; which one is decided by the code that fills in
 * the inode (not visible here).
 */
union proc_op {
        int (*proc_get_link)(struct dentry *, struct path *);
        int (*proc_show)(struct seq_file *m,
                struct pid_namespace *ns, struct pid *pid,
                struct task_struct *task);
        int lsmid;
};

/*
 * procfs-private inode: wraps the generic struct inode (vfs_inode) together
 * with procfs state.  PROC_I() maps a struct inode back to this container.
 */
struct proc_inode {
        struct pid *pid;                /* returned by proc_pid() */
        unsigned int fd;
        union proc_op op;
        struct proc_dir_entry *pde;        /* returned by PDE() */
        struct ctl_table_header *sysctl;
        struct ctl_table *sysctl_entry;
        struct hlist_node sibling_inodes;
        const struct proc_ns_operations *ns_ops;
        struct inode vfs_inode;                /* embedded VFS inode */
} __randomize_layout;

/*
 * General functions
 */
/* Map a VFS inode to the struct proc_inode that embeds it as vfs_inode. */
static inline struct proc_inode *PROC_I(const struct inode *inode)
{
        struct proc_inode *pi;

        pi = container_of(inode, struct proc_inode, vfs_inode);

        return pi;
}

static inline struct proc_dir_entry *PDE(const struct inode *inode)
{
        return PROC_I(inode)->pde;
}

static inline struct pid *proc_pid(const struct inode *inode)
{
        return PROC_I(inode)->pid;
}

/*
 * Look up the task for @inode: the result of
 * get_pid_task(proc_pid(inode), PIDTYPE_PID).
 */
static inline struct task_struct *get_proc_task(const struct inode *inode)
{
        struct pid *pid = proc_pid(inode);

        return get_pid_task(pid, PIDTYPE_PID);
}

void task_dump_owner(struct task_struct *task, umode_t mode,
                     kuid_t *ruid, kgid_t *rgid);

unsigned name_to_int(const struct qstr *qstr);
/*
 * Offset of the first process in the /proc root directory.
 */
#define FIRST_PROCESS_ENTRY 256

/* Worst case buffer size needed for holding an integer. */
#define PROC_NUMBUF 13

/*
 * array.c
 */
extern const struct file_operations proc_tid_children_operations;

extern void proc_task_name(struct seq_file *m, struct task_struct *p,
                           bool escape);
extern int proc_tid_stat(struct seq_file *, struct pid_namespace *,
                         struct pid *, struct task_struct *);
extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *,
                          struct pid *, struct task_struct *);
extern int proc_pid_status(struct seq_file *, struct pid_namespace *,
                           struct pid *, struct task_struct *);
extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
                          struct pid *, struct task_struct *);

/*
 * base.c
 */
extern const struct dentry_operations pid_dentry_operations;
extern int pid_getattr(struct mnt_idmap *, const struct path *,
                       struct kstat *, u32, unsigned int);
extern int proc_setattr(struct mnt_idmap *, struct dentry *,
                        struct iattr *);
extern void proc_pid_evict_inode(struct proc_inode *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
extern int proc_pid_readdir(struct file *, struct dir_context *);
struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
extern loff_t mem_lseek(struct file *, loff_t, int);

/* Lookups */
typedef struct dentry *instantiate_t(struct dentry *,
                                     struct task_struct *, const void *);
bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int,
                           instantiate_t, struct task_struct *, const void *);

/*
 * generic.c
 */
struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
                struct proc_dir_entry **parent, void *data);
struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
                struct proc_dir_entry *dp);
extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *);
extern int proc_readdir(struct file *, struct dir_context *);
int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *);

/* Take a reference on @pde by incrementing its refcnt. */
static inline void pde_get(struct proc_dir_entry *pde)
{
        refcount_inc(&pde->refcnt);
}
extern void pde_put(struct proc_dir_entry *);

/* True when @pde is a directory (S_ISDIR on ->mode) whose ->proc_iops is NULL. */
static inline bool is_empty_pde(const struct proc_dir_entry *pde)
{
        if (!S_ISDIR(pde->mode))
                return false;

        return !pde->proc_iops;
}
extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *);

/*
 * inode.c
 */
/*
 * One record per open file, linked through ->lh.
 * NOTE(review): presumably these sit on proc_dir_entry.pde_openers, whose
 * comment says instances are protected by pde_unload_lock; confirm in inode.c.
 */
struct pde_opener {
        struct list_head lh;
        struct file *file;
        bool closing;
        struct completion *c;
} __randomize_layout;
extern const struct inode_operations proc_link_inode_operations;
extern const struct inode_operations proc_pid_link_inode_operations;
extern const struct super_operations proc_sops;

void proc_init_kmemcache(void);
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
extern void proc_entry_rundown(struct proc_dir_entry *);

/*
 * proc_namespaces.c
 */
extern const struct inode_operations proc_ns_dir_inode_operations;
extern const struct file_operations proc_ns_dir_operations;

/*
 * proc_net.c
 */
extern const struct file_operations proc_net_operations;
extern const struct inode_operations proc_net_inode_operations;

/* Without CONFIG_NET, proc_net_init() is an inline stub that returns 0. */
#ifdef CONFIG_NET
extern int proc_net_init(void);
#else
static inline int proc_net_init(void) { return 0; }
#endif

/*
 * proc_self.c
 */
extern int proc_setup_self(struct super_block *);

/*
 * proc_thread_self.c
 */
extern int proc_setup_thread_self(struct super_block *);
extern void proc_thread_self_init(void);

/*
 * proc_sysctl.c
 *
 * Without CONFIG_PROC_SYSCTL both entry points become inline no-ops.  The
 * proc_sys_init() stub returns int 0 so that its signature matches the real
 * declaration (and the proc_net_init() stub); a caller that checks the
 * result then builds in both configurations.
 */
#ifdef CONFIG_PROC_SYSCTL
extern int proc_sys_init(void);
extern void proc_sys_evict_inode(struct inode *inode,
                                 struct ctl_table_header *head);
#else
static inline int proc_sys_init(void) { return 0; }
static inline void proc_sys_evict_inode(struct  inode *inode,
                                        struct ctl_table_header *head) { }
#endif

/*
 * proc_tty.c
 */
/* Without CONFIG_TTY, proc_tty_init() is an empty inline stub. */
#ifdef CONFIG_TTY
extern void proc_tty_init(void);
#else
static inline void proc_tty_init(void) {}
#endif

/*
 * root.c
 */
extern struct proc_dir_entry proc_root;

extern void proc_self_init(void);

/*
 * task_[no]mmu.c
 */
struct mem_size_stats;
/*
 * Private state bundling an inode, task, mm and VMA iterator.
 * NOTE(review): presumably the per-open state of the /proc/<pid>/maps family
 * of files (see the task_[no]mmu.c heading above); confirm in those files.
 */
struct proc_maps_private {
        struct inode *inode;
        struct task_struct *task;
        struct mm_struct *mm;
        struct vma_iterator iter;
#ifdef CONFIG_NUMA
        /* present only when CONFIG_NUMA is enabled */
        struct mempolicy *task_mempolicy;
#endif
} __randomize_layout;

struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);

extern const struct file_operations proc_pid_maps_operations;
extern const struct file_operations proc_pid_numa_maps_operations;
extern const struct file_operations proc_pid_smaps_operations;
extern const struct file_operations proc_pid_smaps_rollup_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;

extern unsigned long task_vsize(struct mm_struct *);
extern unsigned long task_statm(struct mm_struct *,
                                unsigned long *, unsigned long *,
                                unsigned long *, unsigned long *);
extern void task_mem(struct seq_file *, struct mm_struct *);

extern const struct dentry_operations proc_net_dentry_ops;
/* Point @pde's dentry operations at proc_net_dentry_ops. */
static inline void pde_force_lookup(struct proc_dir_entry *pde)
{
        /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
        pde->proc_dops = &proc_net_dentry_ops;
}




























































    1 












    1 








    1 





































































    1 

    1 










    1 






    1 



    1 


    1 



















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP token management
 * Copyright (c) 2017 - 2019, Intel Corporation.
 *
 * Note: This code is based on mptcp_ctrl.c from multipath-tcp.org,
 *       authored by:
 *
 *       Sébastien Barré <sebastien.barre@uclouvain.be>
 *       Christoph Paasch <christoph.paasch@uclouvain.be>
 *       Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
 *       Gregory Detal <gregory.detal@uclouvain.be>
 *       Fabien Duchêne <fabien.duchene@uclouvain.be>
 *       Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
 *       Lavkesh Lahngir <lavkesh51@gmail.com>
 *       Andreas Ripke <ripke@neclab.eu>
 *       Vlad Dogaru <vlad.dogaru@intel.com>
 *       Octavian Purdila <octavian.purdila@intel.com>
 *       John Ronan <jronan@tssg.org>
 *       Catalin Nicutar <catalin.nicutar@gmail.com>
 *       Brandon Heller <brandonh@stanford.edu>
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/memblock.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/protocol.h>
#include <net/mptcp.h>
#include "protocol.h"

/* Per-bucket occupancy limit: __token_bucket_busy() reports a bucket as busy
 * once its chain_len has reached this value.
 */
#define TOKEN_MAX_CHAIN_LEN        4

/* One slot of the token hash table.
 *
 * @lock is taken with spin_lock_bh() around every insertion into, or removal
 * from, the two chains below. @chain_len is incremented when a new token is
 * hashed and decremented when one is destroyed; __token_bucket_busy() compares
 * it against TOKEN_MAX_CHAIN_LEN. @req_chain links request sockets through
 * their token_node member, @msk_chain links MPTCP sockets.
 */
struct token_bucket {
        spinlock_t                lock;
        int                        chain_len;
        struct hlist_nulls_head        req_chain;
        struct hlist_nulls_head        msk_chain;
};

/* The bucket array and the mask that maps a token to its slot; both are
 * filled in by mptcp_token_init().
 */
static struct token_bucket *token_hash __read_mostly;
static unsigned int token_mask __read_mostly;

/* Map @token to the hash slot that may hold it. */
static struct token_bucket *token_bucket(u32 token)
{
        unsigned int slot = token & token_mask;

        return token_hash + slot;
}

/* Walk the request chain of @t and return the request socket carrying
 * @token, or NULL when the chain holds none.
 *
 * Called with bucket lock held.
 */
static struct mptcp_subflow_request_sock *
__token_lookup_req(struct token_bucket *t, u32 token)
{
        struct mptcp_subflow_request_sock *match = NULL;
        struct mptcp_subflow_request_sock *cur;
        struct hlist_nulls_node *node;

        hlist_nulls_for_each_entry_rcu(cur, node, &t->req_chain, token_node) {
                if (cur->token != token)
                        continue;
                match = cur;
                break;
        }
        return match;
}

/* Walk the msk chain of @t and return the MPTCP socket whose token equals
 * @token, or NULL when the chain holds none.
 *
 * Called with bucket lock held.
 */
static struct mptcp_sock *
__token_lookup_msk(struct token_bucket *t, u32 token)
{
        struct hlist_nulls_node *node;
        struct sock *cur;

        sk_nulls_for_each_rcu(cur, node, &t->msk_chain) {
                struct mptcp_sock *msk = mptcp_sk(cur);

                if (msk->token == token)
                        return msk;
        }
        return NULL;
}

/* Tell whether @token must not be inserted into @t: the token is zero, the
 * bucket already holds TOKEN_MAX_CHAIN_LEN entries, or the token is already
 * present on the request chain or on the msk chain.
 *
 * The callers in this file invoke it with the bucket lock held.
 */
static bool __token_bucket_busy(struct token_bucket *t, u32 token)
{
        if (token == 0)
                return true;
        if (t->chain_len >= TOKEN_MAX_CHAIN_LEN)
                return true;
        if (__token_lookup_req(t, token))
                return true;
        return __token_lookup_msk(t, token) != NULL;
}

/* Fill @key with fresh random bytes, then pass it to mptcp_crypto_key_sha()
 * together with the @token and @idsn output pointers.
 */
static void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn)
{
        /* A faster scheme could hash some information already available in
         * the MPTCP socket to obtain the key. For now plain random data is
         * used, as it is probably the safest choice when several sockets are
         * opened in different namespaces at the same time.
         */
        get_random_bytes(key, sizeof(*key));
        mptcp_crypto_key_sha(*key, token, idsn);
}

/**
 * mptcp_token_new_request - compute and hash the token of a subflow request
 * @req: the request socket
 *
 * Called when a new mptcp connection is coming in.
 *
 * The token and the initial data sequence number (idsn) are computed from the
 * local key already stored in the request socket, then the request is linked
 * into the request chain of the matching bucket so that the token is reserved
 * for this connection.
 *
 * Returns 0 on success, -EBUSY when the bucket refuses the token (zero token,
 * chain already full, or token already in use).
 */
int mptcp_token_new_request(struct request_sock *req)
{
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
        struct token_bucket *bucket;
        int ret = -EBUSY;
        u32 token;

        mptcp_crypto_key_sha(subflow_req->local_key,
                             &subflow_req->token,
                             &subflow_req->idsn);
        pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n",
                 req, subflow_req->local_key, subflow_req->token,
                 subflow_req->idsn);

        token = subflow_req->token;
        bucket = token_bucket(token);

        spin_lock_bh(&bucket->lock);
        if (!__token_bucket_busy(bucket, token)) {
                hlist_nulls_add_head_rcu(&subflow_req->token_node,
                                         &bucket->req_chain);
                bucket->chain_len++;
                ret = 0;
        }
        spin_unlock_bh(&bucket->lock);

        return ret;
}

/**
 * mptcp_token_new_connect - create new key/idsn/token for subflow
 * @ssk: the socket that will initiate a connection
 *
 * Called when a new outgoing mptcp connection is initiated.
 *
 * A random local key is generated and the token and initial data sequence
 * number (idsn) are derived from it. When the target bucket refuses the
 * token, a new key is drawn; at most MPTCP_TOKEN_MAX_RETRIES attempts are
 * made.
 *
 * On success the mptcp socket owning @ssk is hashed on the msk chain, so the
 * connection can be found again through the token at a later time; this is
 * needed to process join requests.
 *
 * Returns 0 on success, -EBUSY when every attempt hit a busy bucket.
 */
int mptcp_token_new_connect(struct sock *ssk)
{
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);
        int retries = MPTCP_TOKEN_MAX_RETRIES;
        struct sock *sk = subflow->conn;
        struct token_bucket *bucket;

        for (;;) {
                mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token,
                                         &subflow->idsn);

                bucket = token_bucket(subflow->token);
                spin_lock_bh(&bucket->lock);
                if (!__token_bucket_busy(bucket, subflow->token))
                        break;

                /* unusable token: release the bucket and draw a new key */
                spin_unlock_bh(&bucket->lock);
                if (!--retries)
                        return -EBUSY;
        }

        /* the loop is left with bucket->lock still held */
        pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n",
                 ssk, subflow->local_key, subflow->token, subflow->idsn);

        WRITE_ONCE(msk->token, subflow->token);
        __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain);
        bucket->chain_len++;
        spin_unlock_bh(&bucket->lock);

        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
        return 0;
}

/**
 * mptcp_token_accept - replace a req sk with full sock in token hash
 * @req: the request socket to be removed
 * @msk: the just cloned socket linked to the new connection
 *
 * Called when a SYN packet creates a new logical connection, i.e.
 * is not a join request.
 *
 * Under the bucket lock, @req is unlinked from the request chain and @msk is
 * linked on the msk chain of the same bucket. The bucket's chain_len is not
 * modified here.
 */
void mptcp_token_accept(struct mptcp_subflow_request_sock *req,
                        struct mptcp_sock *msk)
{
        struct token_bucket *bucket = token_bucket(req->token);
        struct mptcp_subflow_request_sock *found;
        struct sock *sk = (struct sock *)msk;

        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

        spin_lock_bh(&bucket->lock);

        /* pedantic check: the token being moved must still map to @req */
        found = __token_lookup_req(bucket, req->token);
        if (!WARN_ON_ONCE(found != req))
                hlist_nulls_del_init_rcu(&req->token_node);

        __sk_nulls_add_node_rcu(sk, &bucket->msk_chain);

        spin_unlock_bh(&bucket->lock);
}

/**
 * mptcp_token_exists - check whether a token is in use by an mptcp socket
 * @token: token to look for
 *
 * Walks the msk chain of the bucket selected by @token inside an RCU
 * read-side section, without taking the bucket lock. Only the msk chain is
 * inspected; the request chain is not, and no network namespace filtering is
 * applied.
 *
 * Returns true if an mptcp socket with the given token is found.
 */
bool mptcp_token_exists(u32 token)
{
        struct hlist_nulls_node *pos;
        struct token_bucket *bucket;
        struct mptcp_sock *msk;
        struct sock *sk;

        rcu_read_lock();
        bucket = token_bucket(token);

again:
        sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) {
                msk = mptcp_sk(sk);
                if (READ_ONCE(msk->token) == token)
                        goto found;
        }
        /* Each chain ends with a nulls marker holding its slot index (see
         * mptcp_token_init()). A different value means the walk did not end
         * on this bucket's chain, so the result is unreliable: start over.
         */
        if (get_nulls_value(pos) != (token & token_mask))
                goto again;

        rcu_read_unlock();
        return false;
found:
        rcu_read_unlock();
        return true;
}

/**
 * mptcp_token_get_sock - retrieve mptcp connection sock using its token
 * @net: restrict to this namespace
 * @token: token of the mptcp connection to retrieve
 *
 * This function returns the mptcp connection structure with the given token.
 * A reference count on the mptcp socket returned is taken.
 *
 * The lookup runs inside an RCU read-side section and does not take the
 * bucket lock.
 *
 * returns NULL if no connection with the given token value exists.
 */
struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token)
{
        struct hlist_nulls_node *pos;
        struct token_bucket *bucket;
        struct mptcp_sock *msk;
        struct sock *sk;

        rcu_read_lock();
        bucket = token_bucket(token);

again:
        sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) {
                msk = mptcp_sk(sk);
                if (READ_ONCE(msk->token) != token ||
                    !net_eq(sock_net(sk), net))
                        continue;

                /* refcount already dropped to zero: give up on this lookup */
                if (!refcount_inc_not_zero(&sk->sk_refcnt))
                        goto not_found;

                /* Token and namespace are checked again now that a reference
                 * is held; on mismatch drop the reference and restart the
                 * walk from the head of the chain.
                 */
                if (READ_ONCE(msk->token) != token ||
                    !net_eq(sock_net(sk), net)) {
                        sock_put(sk);
                        goto again;
                }
                goto found;
        }
        /* Each chain ends with a nulls marker holding its slot index (see
         * mptcp_token_init()). A different value means the walk did not end
         * on this bucket's chain: start over.
         */
        if (get_nulls_value(pos) != (token & token_mask))
                goto again;

not_found:
        msk = NULL;

found:
        rcu_read_unlock();
        return msk;
}
EXPORT_SYMBOL_GPL(mptcp_token_get_sock);

/**
 * mptcp_token_iter_next - iterate over the token container from given pos
 * @net: namespace to be iterated
 * @s_slot: start slot number; updated on return
 * @s_num: number of entries to skip inside the start slot; updated on return
 *
 * This function returns the first mptcp connection structure found inside the
 * token container starting from the specified position, or NULL.
 *
 * Only sockets belonging to @net are returned, but the position inside a slot
 * counts every entry of the chain, whatever its namespace.
 *
 * On successful iteration, the iterator is moved to the next position and
 * a reference to the returned socket is acquired.
 */
struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot,
                                         long *s_num)
{
        struct mptcp_sock *ret = NULL;
        struct hlist_nulls_node *pos;
        int slot, num = 0;

        /* *s_num only applies to the start slot: it is reset to 0 each time
         * the loop advances to the next slot.
         */
        for (slot = *s_slot; slot <= token_mask; *s_num = 0, slot++) {
                struct token_bucket *bucket = &token_hash[slot];
                struct sock *sk;

                num = 0;

                /* skip empty slots without entering an RCU read section */
                if (hlist_nulls_empty(&bucket->msk_chain))
                        continue;

                rcu_read_lock();
                sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) {
                        /* 1-based position of this entry inside the chain */
                        ++num;
                        if (!net_eq(sock_net(sk), net))
                                continue;

                        /* already covered by a previous call */
                        if (num <= *s_num)
                                continue;

                        /* refcount already at zero: skip this entry */
                        if (!refcount_inc_not_zero(&sk->sk_refcnt))
                                continue;

                        /* namespace is checked again with the reference held */
                        if (!net_eq(sock_net(sk), net)) {
                                sock_put(sk);
                                continue;
                        }

                        ret = mptcp_sk(sk);
                        rcu_read_unlock();
                        goto out;
                }
                rcu_read_unlock();
        }

out:
        /* hand the final slot and in-slot counter back to the caller */
        *s_slot = slot;
        *s_num = num;
        return ret;
}
EXPORT_SYMBOL_GPL(mptcp_token_iter_next);

/**
 * mptcp_token_destroy_request - remove mptcp connection/token
 * @req: mptcp request socket dropping the token
 *
 * Unlink @req from the request chain of its token bucket and release the
 * slot it occupied there. Nothing is done when @req is not hashed.
 */
void mptcp_token_destroy_request(struct request_sock *req)
{
        struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
        struct mptcp_subflow_request_sock *found;
        struct token_bucket *bucket;

        if (hlist_nulls_unhashed(&subflow_req->token_node))
                return;

        bucket = token_bucket(subflow_req->token);
        spin_lock_bh(&bucket->lock);

        /* the token must still resolve to this very request */
        found = __token_lookup_req(bucket, subflow_req->token);
        if (WARN_ON_ONCE(found != subflow_req))
                goto unlock;

        hlist_nulls_del_init_rcu(&found->token_node);
        bucket->chain_len--;

unlock:
        spin_unlock_bh(&bucket->lock);
}

/**
 * mptcp_token_destroy - remove mptcp connection/token
 * @msk: mptcp connection dropping the token
 *
 * Unlink @msk from the msk chain of its token bucket, release the slot it
 * occupied there and reset the socket's token to 0. Nothing is done when
 * @msk is not hashed.
 */
void mptcp_token_destroy(struct mptcp_sock *msk)
{
        struct sock *sk = (struct sock *)msk;
        struct token_bucket *bucket;
        struct mptcp_sock *found;

        if (sk_unhashed(sk))
                return;

        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);

        bucket = token_bucket(msk->token);
        spin_lock_bh(&bucket->lock);

        /* the token must still resolve to this very socket */
        found = __token_lookup_msk(bucket, msk->token);
        if (WARN_ON_ONCE(found != msk))
                goto unlock;

        __sk_nulls_del_node_init_rcu((struct sock *)found);
        bucket->chain_len--;

unlock:
        spin_unlock_bh(&bucket->lock);
        WRITE_ONCE(msk->token, 0);
}

/* Allocate the token hash table and initialise every bucket.
 *
 * Both chains of slot i use i as their nulls marker; the lockless lookups
 * compare that marker against (token & token_mask).
 */
void __init mptcp_token_init(void)
{
        int i;

        token_hash = alloc_large_system_hash("MPTCP token",
                                             sizeof(struct token_bucket),
                                             0,        /* numentries */
                                             20,        /* scale: one slot per 1MB of memory */
                                             HASH_ZERO,
                                             NULL,        /* hash shift not needed */
                                             &token_mask,
                                             0,        /* low limit */
                                             64 * 1024);        /* high limit */

        for (i = 0; i < token_mask + 1; ++i) {
                struct token_bucket *bucket = &token_hash[i];

                INIT_HLIST_NULLS_HEAD(&bucket->req_chain, i);
                INIT_HLIST_NULLS_HEAD(&bucket->msk_chain, i);
                spin_lock_init(&bucket->lock);
        }
}

/* These symbols are exported only when the MPTCP KUnit test is built as a
 * module.
 */
#if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST)
EXPORT_SYMBOL_GPL(mptcp_token_new_request);
EXPORT_SYMBOL_GPL(mptcp_token_new_connect);
EXPORT_SYMBOL_GPL(mptcp_token_accept);
EXPORT_SYMBOL_GPL(mptcp_token_destroy_request);
EXPORT_SYMBOL_GPL(mptcp_token_destroy);
#endif






























































































































































































































































    3 


    2 














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 











    2 




































































    2 



    3 





















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/locks.c
 *
 * We implement four types of file locks: BSD locks, posix locks, open
 * file description locks, and leases.  For details about BSD locks,
 * see the flock(2) man page; for details about the other three, see
 * fcntl(2).
 *
 *
 * Locking conflicts and dependencies:
 * If multiple threads attempt to lock the same byte (or flock the same file)
 * only one can be granted the lock, and the others must wait their turn.
 * The first lock has been "applied" or "granted", the others are "waiting"
 * and are "blocked" by the "applied" lock.
 *
 * Waiting and applied locks are all kept in trees whose properties are:
 *
 *        - the root of a tree may be an applied or waiting lock.
 *        - every other node in the tree is a waiting lock that
 *          conflicts with every ancestor of that node.
 *
 * Every such tree begins life as a waiting singleton which obviously
 * satisfies the above properties.
 *
 * The only ways we modify trees preserve these properties:
 *
 *        1. We may add a new leaf node, but only after first verifying that it
 *           conflicts with all of its ancestors.
 *        2. We may remove the root of a tree, creating a new singleton
 *           tree from the root and N new trees rooted in the immediate
 *           children.
 *        3. If the root of a tree is not currently an applied lock, we may
 *           apply it (if possible).
 *        4. We may upgrade the root of the tree (either extend its range,
 *           or upgrade its entire range from read to write).
 *
 * When an applied lock is modified in a way that reduces or downgrades any
 * part of its range, we remove all its children (2 above).  This particularly
 * happens when a lock is unlocked.
 *
 * For each of those child trees we "wake up" the thread which is
 * waiting for the lock so it can continue handling as follows: if the
 * root of the tree applies, we do so (3).  If it doesn't, it must
 * conflict with some applied lock.  We remove (wake up) all of its children
 * (2), and add it as a new leaf to the tree rooted in the applied
 * lock (1).  We then repeat the process recursively with those
 * children.
 *
 */
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/filelock.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/time.h>
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/hashtable.h>
#include <linux/percpu.h>
#include <linux/sysctl.h>

#define CREATE_TRACE_POINTS
#include <trace/events/filelock.h>

#include <linux/uaccess.h>

/* Map an embedded file_lock_core (member "c") back to its enclosing file_lock. */
static struct file_lock *file_lock(struct file_lock_core *flc)
{
        struct file_lock *fl = container_of(flc, struct file_lock, c);

        return fl;
}

/* Map an embedded file_lock_core (member "c") back to its enclosing file_lease. */
static struct file_lease *file_lease(struct file_lock_core *flc)
{
        struct file_lease *lease = container_of(flc, struct file_lease, c);

        return lease;
}

static bool lease_breaking(struct file_lease *fl)
{
        return fl->c.flc_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
}

static int target_leasetype(struct file_lease *fl)
{
        if (fl->c.flc_flags & FL_UNLOCK_PENDING)
                return F_UNLCK;
        if (fl->c.flc_flags & FL_DOWNGRADE_PENDING)
                return F_RDLCK;
        return fl->c.flc_type;
}

/*
 * Backing storage for the "leases-enable" and "lease-break-time" entries of
 * the locks_sysctls table (when that table is built).
 */
static int leases_enable = 1;
static int lease_break_time = 45;

#ifdef CONFIG_SYSCTL
/*
 * sysctl entries registered under "fs" by init_fs_locks_sysctls(). Both are
 * int-sized, mode 0644, and handled by proc_dointvec.
 */
static struct ctl_table locks_sysctls[] = {
        {
                /* reads/writes the leases_enable variable */
                .procname        = "leases-enable",
                .data                = &leases_enable,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#ifdef CONFIG_MMU
        {
                /* reads/writes lease_break_time; only present with CONFIG_MMU */
                .procname        = "lease-break-time",
                .data                = &lease_break_time,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#endif /* CONFIG_MMU */
};

/*
 * Register the locks_sysctls table under the "fs" sysctl directory.
 * Run as an early initcall; always returns 0.
 */
static int __init init_fs_locks_sysctls(void)
{
        register_sysctl_init("fs", locks_sysctls);
        return 0;
}
early_initcall(init_fs_locks_sysctls);
#endif /* CONFIG_SYSCTL */

/*
 * The global file_lock_list is only used for displaying /proc/locks, so we
 * keep a list on each CPU, with each list protected by its own spinlock.
 * Global serialization is done using file_rwsem.
 *
 * Note that alterations to the list also require that the relevant flc_lock is
 * held.
 */
/* One instance per CPU: a list of locks plus the spinlock guarding it. */
struct file_lock_list_struct {
        spinlock_t                lock;        /* held while adding to / deleting from hlist */
        struct hlist_head        hlist;        /* locks chained through their flc_link member */
};
static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list);
/* asserted held by the helpers that insert into / delete from file_lock_list */
DEFINE_STATIC_PERCPU_RWSEM(file_rwsem);


/*
 * The blocked_hash is used to find POSIX lock loops for deadlock detection.
 * It is protected by blocked_lock_lock.
 *
 * We hash locks by lockowner in order to optimize searching for the lock a
 * particular lockowner is waiting on.
 *
 * FIXME: make this value scale via some heuristic? We generally will want more
 * buckets when we have more lockowners holding locks, but that's a little
 * difficult to determine without knowing what the workload will look like.
 */
/* Size parameter (in bits) handed to DEFINE_HASHTABLE() for blocked_hash. */
#define BLOCKED_HASH_BITS        7
/* Blocked requests, keyed by posix_owner_key() of the waiter. */
static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);

/*
 * This lock protects the blocked_hash. Generally, if you're accessing it, you
 * want to be holding this lock.
 *
 * In addition, it also protects the flc_blocked_requests list, and the
 * flc_blocker pointer of the file_lock_core for structures that are acting
 * as lock requests (in contrast to those that are acting as records of
 * acquired locks).
 *
 * Note that when we acquire this lock in order to change the above fields,
 * we often hold the flc_lock as well. In certain cases, when reading the fields
 * protected by this lock, we can skip acquiring it iff we already hold the
 * flc_lock.
 */
static DEFINE_SPINLOCK(blocked_lock_lock);

/*
 * Slab caches used in this file: flctx_cache backs struct file_lock_context,
 * filelock_cache backs struct file_lock, filelease_cache backs
 * struct file_lease.
 */
static struct kmem_cache *flctx_cache __ro_after_init;
static struct kmem_cache *filelock_cache __ro_after_init;
static struct kmem_cache *filelease_cache __ro_after_init;

/*
 * Return the lock context of @inode, allocating and installing one if the
 * inode has none yet.
 *
 * @inode: inode whose context is wanted
 * @type:  lock type of the request; for F_UNLCK no context is allocated
 *
 * Returns the context, or NULL when none exists and either @type is F_UNLCK
 * or the allocation failed. The result is traced on every exit path.
 */
static struct file_lock_context *
locks_get_lock_context(struct inode *inode, int type)
{
        struct file_lock_context *ctx;

        /* paired with cmpxchg() below */
        ctx = locks_inode_context(inode);
        /* an unlock request never needs a fresh (empty) context */
        if (likely(ctx) || type == F_UNLCK)
                goto out;

        ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
        if (!ctx)
                goto out;

        /* fully initialise the context before it can become visible */
        spin_lock_init(&ctx->flc_lock);
        INIT_LIST_HEAD(&ctx->flc_flock);
        INIT_LIST_HEAD(&ctx->flc_posix);
        INIT_LIST_HEAD(&ctx->flc_lease);

        /*
         * Assign the pointer if it's not already assigned. If it is, then
         * free the context we just allocated.
         */
        if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
                kmem_cache_free(flctx_cache, ctx);
                /* someone else installed a context first; use theirs */
                ctx = locks_inode_context(inode);
        }
out:
        trace_locks_get_lock_context(inode, type, ctx);
        return ctx;
}

static void
locks_dump_ctx_list(struct list_head *list, char *list_type)
{
        struct file_lock_core *flc;

        list_for_each_entry(flc, list, flc_list)
                pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
                        list_type, flc->flc_owner, flc->flc_flags,
                        flc->flc_type, flc->flc_pid);
}

static void
locks_check_ctx_lists(struct inode *inode)
{
        struct file_lock_context *ctx = inode->i_flctx;

        if (unlikely(!list_empty(&ctx->flc_flock) ||
                     !list_empty(&ctx->flc_posix) ||
                     !list_empty(&ctx->flc_lease))) {
                pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%lx:\n",
                        MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
                        inode->i_ino);
                locks_dump_ctx_list(&ctx->flc_flock, "FLOCK");
                locks_dump_ctx_list(&ctx->flc_posix, "POSIX");
                locks_dump_ctx_list(&ctx->flc_lease, "LEASE");
        }
}

static void
locks_check_ctx_file_list(struct file *filp, struct list_head *list, char *list_type)
{
        struct file_lock_core *flc;
        struct inode *inode = file_inode(filp);

        list_for_each_entry(flc, list, flc_list)
                if (flc->flc_file == filp)
                        pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx "
                                " fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
                                list_type, MAJOR(inode->i_sb->s_dev),
                                MINOR(inode->i_sb->s_dev), inode->i_ino,
                                flc->flc_owner, flc->flc_flags,
                                flc->flc_type, flc->flc_pid);
}

/*
 * Release the lock context attached to @inode, if there is one. Any locks
 * still linked on it are reported before the context is freed.
 */
void
locks_free_lock_context(struct inode *inode)
{
        struct file_lock_context *ctx = locks_inode_context(inode);

        if (likely(!ctx))
                return;

        locks_check_ctx_lists(inode);
        kmem_cache_free(flctx_cache, ctx);
}

/* Put the wait queue and every list linkage of @flc into its empty state. */
static void locks_init_lock_heads(struct file_lock_core *flc)
{
        init_waitqueue_head(&flc->flc_wait);
        INIT_LIST_HEAD(&flc->flc_blocked_member);
        INIT_LIST_HEAD(&flc->flc_blocked_requests);
        INIT_LIST_HEAD(&flc->flc_list);
        INIT_HLIST_NODE(&flc->flc_link);
}

/* Allocate an empty lock structure. */
struct file_lock *locks_alloc_lock(void)
{
        struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL);

        if (fl)
                locks_init_lock_heads(&fl->c);

        return fl;
}
EXPORT_SYMBOL_GPL(locks_alloc_lock);

/* Allocate an empty lock structure. */
struct file_lease *locks_alloc_lease(void)
{
        struct file_lease *fl = kmem_cache_zalloc(filelease_cache, GFP_KERNEL);

        if (fl)
                locks_init_lock_heads(&fl->c);

        return fl;
}
EXPORT_SYMBOL_GPL(locks_alloc_lease);

/*
 * Drop the private state attached to @fl through its fl_ops and fl_lmops
 * callbacks, and clear both operation pointers.
 *
 * The lock must be fully detached: it is a BUG if anyone is waiting on it or
 * if it is still linked on any list or hash chain.
 */
void locks_release_private(struct file_lock *fl)
{
        struct file_lock_core *flc = &fl->c;

        /* the lock must be idle and unlinked everywhere */
        BUG_ON(waitqueue_active(&flc->flc_wait));
        BUG_ON(!list_empty(&flc->flc_list));
        BUG_ON(!list_empty(&flc->flc_blocked_requests));
        BUG_ON(!list_empty(&flc->flc_blocked_member));
        BUG_ON(!hlist_unhashed(&flc->flc_link));

        if (fl->fl_ops) {
                /* the release hook is optional */
                if (fl->fl_ops->fl_release_private)
                        fl->fl_ops->fl_release_private(fl);
                fl->fl_ops = NULL;
        }

        if (fl->fl_lmops) {
                /* the owner is only cleared when a put hook exists */
                if (fl->fl_lmops->lm_put_owner) {
                        fl->fl_lmops->lm_put_owner(flc->flc_owner);
                        flc->flc_owner = NULL;
                }
                fl->fl_lmops = NULL;
        }
}
EXPORT_SYMBOL_GPL(locks_release_private);

/**
 * locks_owner_has_blockers - Check for blocking lock requests
 * @flctx: file lock context
 * @owner: lock owner
 *
 * Walks the POSIX lock list of @flctx under flc_lock and looks for a lock
 * held by @owner that has at least one request queued behind it.
 *
 * Return values:
 *   %true: a lock of @owner has at least one blocked request
 *   %false: no lock of @owner has blocked requests
 */
bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner)
{
        struct file_lock_core *flc;
        bool found = false;

        spin_lock(&flctx->flc_lock);
        list_for_each_entry(flc, &flctx->flc_posix, flc_list) {
                if (flc->flc_owner == owner &&
                    !list_empty(&flc->flc_blocked_requests)) {
                        found = true;
                        break;
                }
        }
        spin_unlock(&flctx->flc_lock);

        return found;
}
EXPORT_SYMBOL_GPL(locks_owner_has_blockers);

/*
 * Free a lock which is not in use: release its private state through
 * locks_release_private(), then return it to filelock_cache.
 */
void locks_free_lock(struct file_lock *fl)
{
        locks_release_private(fl);
        kmem_cache_free(filelock_cache, fl);
}
EXPORT_SYMBOL(locks_free_lock);

/*
 * Free a lease which is not in use by returning it to filelease_cache.
 * No release callbacks are invoked here.
 */
void locks_free_lease(struct file_lease *fl)
{
        kmem_cache_free(filelease_cache, fl);
}
EXPORT_SYMBOL(locks_free_lease);

static void
locks_dispose_list(struct list_head *dispose)
{
        struct file_lock_core *flc;

        while (!list_empty(dispose)) {
                flc = list_first_entry(dispose, struct file_lock_core, flc_list);
                list_del_init(&flc->flc_list);
                if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
                        locks_free_lease(file_lease(flc));
                else
                        locks_free_lock(file_lock(flc));
        }
}

/* Zero a caller-provided file_lock and initialise its list heads. */
void locks_init_lock(struct file_lock *fl)
{
        memset(fl, 0, sizeof(*fl));
        locks_init_lock_heads(&fl->c);
}
EXPORT_SYMBOL(locks_init_lock);

/* Zero a caller-provided file_lease and initialise its list heads. */
void locks_init_lease(struct file_lease *fl)
{
        memset(fl, 0, sizeof(*fl));
        locks_init_lock_heads(&fl->c);
}
EXPORT_SYMBOL(locks_init_lease);

/*
 * Initialize a new lock from an existing file_lock structure.
 */
void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
{
        new->c.flc_owner = fl->c.flc_owner;
        new->c.flc_pid = fl->c.flc_pid;
        new->c.flc_file = NULL;
        new->c.flc_flags = fl->c.flc_flags;
        new->c.flc_type = fl->c.flc_type;
        new->fl_start = fl->fl_start;
        new->fl_end = fl->fl_end;
        new->fl_lmops = fl->fl_lmops;
        new->fl_ops = NULL;

        if (fl->fl_lmops) {
                if (fl->fl_lmops->lm_get_owner)
                        fl->fl_lmops->lm_get_owner(fl->c.flc_owner);
        }
}
EXPORT_SYMBOL(locks_copy_conflock);

/*
 * Make @new a full copy of @fl: everything locks_copy_conflock() copies,
 * plus the file pointer and fl_ops. The optional fl_copy_lock hook of @fl
 * is invoked last.
 */
void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
        /* "new" must be a freshly-initialized lock */
        WARN_ON_ONCE(new->fl_ops);

        locks_copy_conflock(new, fl);

        new->fl_ops = fl->fl_ops;
        new->c.flc_file = fl->c.flc_file;

        if (fl->fl_ops && fl->fl_ops->fl_copy_lock)
                fl->fl_ops->fl_copy_lock(new, fl);
}
EXPORT_SYMBOL(locks_copy_lock);

static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
{
        struct file_lock *f;

        /*
         * As ctx->flc_lock is held, new requests cannot be added to
         * ->flc_blocked_requests, so we don't need a lock to check if it
         * is empty.
         */
        if (list_empty(&fl->c.flc_blocked_requests))
                return;
        spin_lock(&blocked_lock_lock);
        list_splice_init(&fl->c.flc_blocked_requests,
                         &new->c.flc_blocked_requests);
        list_for_each_entry(f, &new->c.flc_blocked_requests,
                            c.flc_blocked_member)
                f->c.flc_blocker = &new->c;
        spin_unlock(&blocked_lock_lock);
}

/*
 * Translate a flock() operation into the matching lock type:
 * LOCK_SH -> F_RDLCK, LOCK_EX -> F_WRLCK, LOCK_UN -> F_UNLCK.
 * Any other value yields -EINVAL.
 */
static inline int flock_translate_cmd(int cmd)
{
        if (cmd == LOCK_SH)
                return F_RDLCK;
        if (cmd == LOCK_EX)
                return F_WRLCK;
        if (cmd == LOCK_UN)
                return F_UNLCK;

        return -EINVAL;
}

/* Fill in a file_lock structure with an appropriate FLOCK lock. */
static void flock_make_lock(struct file *filp, struct file_lock *fl, int type)
{
        locks_init_lock(fl);

        fl->c.flc_file = filp;
        fl->c.flc_owner = filp;
        fl->c.flc_pid = current->tgid;
        fl->c.flc_flags = FL_FLOCK;
        fl->c.flc_type = type;
        fl->fl_end = OFFSET_MAX;
}

/*
 * Store @type in @flc if it is one of F_RDLCK, F_WRLCK or F_UNLCK.
 * Returns 0 on success; otherwise -EINVAL and @flc is left untouched.
 */
static int assign_type(struct file_lock_core *flc, int type)
{
        if (type != F_RDLCK && type != F_WRLCK && type != F_UNLCK)
                return -EINVAL;

        flc->flc_type = type;
        return 0;
}

/*
 * Verify a "struct flock64" and turn it into a POSIX-style file_lock.
 *
 * @filp: file the lock applies to (also the source of f_pos / i_size for
 *        SEEK_CUR / SEEK_END)
 * @fl:   lock to fill in
 * @l:    request to convert
 *
 * Resulting byte range, with "base" being the position selected by l_whence
 * plus l_start:
 *   l_len > 0:  [base, base + l_len - 1]
 *   l_len < 0:  [base + l_len, base - 1]
 *   l_len == 0: [base, OFFSET_MAX]
 *
 * Returns 0 on success, -EINVAL for an unknown l_whence, a negative start or
 * an invalid l_type, and -EOVERFLOW when the range does not fit below
 * OFFSET_MAX. On failure @fl may already have been partially modified.
 */
static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
                                 struct flock64 *l)
{
        /* pick the base offset that l_start is relative to */
        switch (l->l_whence) {
        case SEEK_SET:
                fl->fl_start = 0;
                break;
        case SEEK_CUR:
                fl->fl_start = filp->f_pos;
                break;
        case SEEK_END:
                fl->fl_start = i_size_read(file_inode(filp));
                break;
        default:
                return -EINVAL;
        }
        /* check before adding so the sum cannot exceed OFFSET_MAX */
        if (l->l_start > OFFSET_MAX - fl->fl_start)
                return -EOVERFLOW;
        fl->fl_start += l->l_start;
        if (fl->fl_start < 0)
                return -EINVAL;

        /* POSIX-1996 leaves the case l->l_len < 0 undefined;
           POSIX-2001 defines it. */
        if (l->l_len > 0) {
                /* the last byte of the range must not exceed OFFSET_MAX */
                if (l->l_len - 1 > OFFSET_MAX - fl->fl_start)
                        return -EOVERFLOW;
                fl->fl_end = fl->fl_start + (l->l_len - 1);

        } else if (l->l_len < 0) {
                /* the range ends just before "base" and extends backwards */
                if (fl->fl_start + l->l_len < 0)
                        return -EINVAL;
                fl->fl_end = fl->fl_start - 1;
                fl->fl_start += l->l_len;
        } else
                fl->fl_end = OFFSET_MAX;

        /* the lock is owned by the caller's files table */
        fl->c.flc_owner = current->files;
        fl->c.flc_pid = current->tgid;
        fl->c.flc_file = filp;
        fl->c.flc_flags = FL_POSIX;
        fl->fl_ops = NULL;
        fl->fl_lmops = NULL;

        /* validates l_type and stores it as the lock type */
        return assign_type(&fl->c, l->l_type);
}

/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
 * style lock.
 */
static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
                               struct flock *l)
{
        struct flock64 ll = {
                .l_type = l->l_type,
                .l_whence = l->l_whence,
                .l_start = l->l_start,
                .l_len = l->l_len,
        };

        return flock64_to_posix_lock(filp, fl, &ll);
}

/* default lease lock manager operations */
/*
 * Default lm_break handler: signal the lease's fasync list with
 * SIGIO/POLL_MSG. Always returns false.
 */
static bool
lease_break_callback(struct file_lease *fl)
{
        kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG);
        return false;
}

/*
 * Default lm_setup handler: hook the caller-supplied fasync entry (*@priv)
 * into the lease's fasync list and make the current task's thread group the
 * owner of the file.
 */
static void
lease_setup(struct file_lease *fl, void **priv)
{
        struct fasync_struct *fa = *priv;
        struct file *filp = fl->c.flc_file;
        struct fasync_struct *old;

        /*
         * fasync_insert_entry() returns the old entry if any. If there was no
         * old entry, then it used "priv" and inserted it into the fasync list.
         * Clear the pointer to indicate that it shouldn't be freed.
         */
        old = fasync_insert_entry(fa->fa_fd, filp, &fl->fl_fasync, fa);
        if (old == NULL)
                *priv = NULL;

        __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0);
}

/* Default lease manager callbacks; lease_init() installs this table as fl_lmops. */
static const struct lease_manager_operations lease_manager_ops = {
        .lm_break = lease_break_callback,
        .lm_change = lease_modify,
        .lm_setup = lease_setup,
};

/*
 * Initialize a lease, use the default lock manager operations
 */
static int lease_init(struct file *filp, int type, struct file_lease *fl)
{
        if (assign_type(&fl->c, type) != 0)
                return -EINVAL;

        fl->c.flc_owner = filp;
        fl->c.flc_pid = current->tgid;

        fl->c.flc_file = filp;
        fl->c.flc_flags = FL_LEASE;
        fl->fl_lmops = &lease_manager_ops;
        return 0;
}

/*
 * Allocate a file_lease and initialise it as a lease of @type on @filp.
 * Returns the lease, ERR_PTR(-ENOMEM) if allocation fails, or the error from
 * lease_init() wrapped in ERR_PTR() (the lease is freed in that case).
 */
static struct file_lease *lease_alloc(struct file *filp, int type)
{
        struct file_lease *fl;
        int error;

        fl = locks_alloc_lease();
        if (!fl)
                return ERR_PTR(-ENOMEM);

        error = lease_init(filp, type, fl);
        if (!error)
                return fl;

        locks_free_lease(fl);
        return ERR_PTR(error);
}

/*
 * Check if the byte ranges [fl_start, fl_end] of two locks overlap.
 * Returns 1 if they share at least one byte, 0 otherwise.
 */
static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2)
{
        /* fl1 ends before fl2 begins */
        if (fl1->fl_end < fl2->fl_start)
                return 0;
        /* fl2 ends before fl1 begins */
        if (fl2->fl_end < fl1->fl_start)
                return 0;
        return 1;
}

/*
 * Check whether two locks have the same owner.
 * Owners are compared by flc_owner pointer identity; returns non-zero when
 * they match.
 */
static int posix_same_owner(struct file_lock_core *fl1, struct file_lock_core *fl2)
{
        return fl1->flc_owner == fl2->flc_owner;
}

/*
 * Add @flc to the file_lock_list of the current CPU and record that CPU in
 * flc_link_cpu so the entry can later be removed from the right list.
 *
 * Must be called with the flc_lock held! file_rwsem must be held as well
 * (asserted below).
 */
static void locks_insert_global_locks(struct file_lock_core *flc)
{
        struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list);

        percpu_rwsem_assert_held(&file_rwsem);

        spin_lock(&fll->lock);
        /* remember which per-CPU list holds this entry */
        flc->flc_link_cpu = smp_processor_id();
        hlist_add_head(&flc->flc_link, &fll->hlist);
        spin_unlock(&fll->lock);
}

/*
 * Remove @flc from the per-CPU file_lock_list it was inserted on (the CPU
 * recorded in flc_link_cpu). A no-op if @flc is not on any list.
 *
 * Must be called with the flc_lock held! file_rwsem must be held as well
 * (asserted below).
 */
static void locks_delete_global_locks(struct file_lock_core *flc)
{
        struct file_lock_list_struct *fll;

        percpu_rwsem_assert_held(&file_rwsem);

        /*
         * Avoid taking lock if already unhashed. This is safe since this check
         * is done while holding the flc_lock, and new insertions into the list
         * also require that it be held.
         */
        if (hlist_unhashed(&flc->flc_link))
                return;

        /* use the list of the CPU that did the insertion, not the current one */
        fll = per_cpu_ptr(&file_lock_list, flc->flc_link_cpu);
        spin_lock(&fll->lock);
        hlist_del_init(&flc->flc_link);
        spin_unlock(&fll->lock);
}

/* Hash key for blocked_hash: the lock owner pointer as an unsigned long. */
static unsigned long
posix_owner_key(struct file_lock_core *flc)
{
        return (unsigned long) flc->flc_owner;
}

/*
 * Add @waiter to blocked_hash, keyed by its owner.
 * Caller must hold blocked_lock_lock (asserted).
 */
static void locks_insert_global_blocked(struct file_lock_core *waiter)
{
        lockdep_assert_held(&blocked_lock_lock);

        hash_add(blocked_hash, &waiter->flc_link, posix_owner_key(waiter));
}

/*
 * Remove @waiter from its hash chain via flc_link.
 * Caller must hold blocked_lock_lock (asserted).
 */
static void locks_delete_global_blocked(struct file_lock_core *waiter)
{
        lockdep_assert_held(&blocked_lock_lock);

        hash_del(&waiter->flc_link);
}

/* Remove waiter from blocker's block list.
 * When blocker ends up pointing to itself then the list is empty.
 *
 * The waiter is also removed from the global blocked hash. Its flc_blocker
 * pointer is not touched here; callers clear it themselves.
 *
 * Must be called with blocked_lock_lock held.
 */
static void __locks_unlink_block(struct file_lock_core *waiter)
{
        locks_delete_global_blocked(waiter);
        list_del_init(&waiter->flc_blocked_member);
}

/*
 * Detach and notify every request queued on @blocker's flc_blocked_requests
 * list, in list order.
 *
 * Each waiter is unlinked first, then notified: through the lock manager's
 * lm_notify callback when the waiter is a POSIX or FLOCK request that has
 * one, otherwise through locks_wake_up(). Finally its flc_blocker is cleared.
 *
 * Must be called with blocked_lock_lock held (the unlink helper asserts it).
 */
static void __locks_wake_up_blocks(struct file_lock_core *blocker)
{
        while (!list_empty(&blocker->flc_blocked_requests)) {
                struct file_lock_core *waiter;
                struct file_lock *fl;

                waiter = list_first_entry(&blocker->flc_blocked_requests,
                                          struct file_lock_core, flc_blocked_member);

                fl = file_lock(waiter);
                __locks_unlink_block(waiter);
                if ((waiter->flc_flags & (FL_POSIX | FL_FLOCK)) &&
                    fl->fl_lmops && fl->fl_lmops->lm_notify)
                        fl->fl_lmops->lm_notify(fl);
                else
                        locks_wake_up(fl);

                /*
                 * The setting of flc_blocker to NULL marks the "done"
                 * point in deleting a block. Paired with acquire at the top
                 * of locks_delete_block().
                 */
                smp_store_release(&waiter->flc_blocker, NULL);
        }
}

/*
 * Stop @waiter from waiting: wake everything queued behind it, unlink it
 * from its blocker and clear its flc_blocker.
 *
 * Returns 0 if @waiter was still blocked when the lock was taken, -ENOENT
 * otherwise (including the lockless fast path where nothing had to be done).
 */
static int __locks_delete_block(struct file_lock_core *waiter)
{
        int status = -ENOENT;

        /*
         * If flc_blocker is NULL, it won't be set again as this thread "owns"
         * the lock and is the only one that might try to claim the lock.
         *
         * We use acquire/release to manage flc_blocker so that we can
         * optimize away taking the blocked_lock_lock in many cases.
         *
         * The smp_load_acquire guarantees two things:
         *
         * 1/ that flc_blocked_requests can be tested locklessly. If something
         * was recently added to that list it must have been in a locked region
         * *before* the locked region when flc_blocker was set to NULL.
         *
         * 2/ that no other thread is accessing 'waiter', so it is safe to free
         * it.  __locks_wake_up_blocks is careful not to touch waiter after
         * flc_blocker is released.
         *
         * If a lockless check of flc_blocker shows it to be NULL, we know that
         * no new locks can be inserted into its flc_blocked_requests list, and
         * can avoid doing anything further if the list is empty.
         */
        if (!smp_load_acquire(&waiter->flc_blocker) &&
            list_empty(&waiter->flc_blocked_requests))
                return status;

        spin_lock(&blocked_lock_lock);
        /* only report success if we really were still waiting */
        if (waiter->flc_blocker)
                status = 0;
        __locks_wake_up_blocks(waiter);
        __locks_unlink_block(waiter);

        /*
         * The setting of flc_blocker to NULL marks the "done" point in deleting
         * a block. Paired with acquire at the top of this function.
         */
        smp_store_release(&waiter->flc_blocker, NULL);
        spin_unlock(&blocked_lock_lock);
        return status;
}

/**
 *        locks_delete_block - stop waiting for a file lock
 *        @waiter: the lock which was waiting
 *
 *        lockd/nfsd need to disconnect the lock while working on it.
 *
 *        Thin wrapper that passes the embedded file_lock_core to
 *        __locks_delete_block() and returns its result: 0 if @waiter was
 *        still blocked, -ENOENT otherwise.
 */
int locks_delete_block(struct file_lock *waiter)
{
        return __locks_delete_block(&waiter->c);
}
EXPORT_SYMBOL(locks_delete_block);

/* Insert waiter into blocker's block list.
 * We use a circular list so that processes can be easily woken up in
 * the order they blocked. The documentation doesn't require this but
 * it seems like the reasonable thing to do.
 *
 * Must be called with both the flc_lock and blocked_lock_lock held. The
 * flc_blocked_requests list itself is protected by the blocked_lock_lock,
 * but by ensuring that the flc_lock is also held on insertions we can avoid
 * taking the blocked_lock_lock in some cases when we see that the
 * flc_blocked_requests list is empty.
 *
 * Rather than just adding to the list, we check for conflicts with any existing
 * waiters, and add beneath any waiter that blocks the new waiter.
 * Thus wakeups don't happen until needed.
 *
 * @conflict is the predicate used to decide whether an existing waiter
 * blocks the new one; it is called as conflict(existing, waiter).
 */
static void __locks_insert_block(struct file_lock_core *blocker,
                                 struct file_lock_core *waiter,
                                 bool conflict(struct file_lock_core *,
                                               struct file_lock_core *))
{
        struct file_lock_core *flc;

        BUG_ON(!list_empty(&waiter->flc_blocked_member));
new_blocker:
        /* descend: restart the scan beneath the first waiter that conflicts */
        list_for_each_entry(flc, &blocker->flc_blocked_requests, flc_blocked_member)
                if (conflict(flc, waiter)) {
                        blocker =  flc;
                        goto new_blocker;
                }
        waiter->flc_blocker = blocker;
        list_add_tail(&waiter->flc_blocked_member,
                      &blocker->flc_blocked_requests);

        /* only blockers that are FL_POSIX without FL_OFDLCK put the waiter in the blocked hash */
        if ((blocker->flc_flags & (FL_POSIX|FL_OFDLCK)) == FL_POSIX)
                locks_insert_global_blocked(waiter);

        /* The requests in waiter->flc_blocked_requests are known to conflict
         * with waiter, but might not conflict with blocker, or the requests
         * and lock which block it.  So they all need to be woken.
         */
        __locks_wake_up_blocks(waiter);
}

/*
 * Locked wrapper around __locks_insert_block(): takes blocked_lock_lock for
 * the duration of the insertion.
 *
 * Must be called with flc_lock held.
 */
static void locks_insert_block(struct file_lock_core *blocker,
                               struct file_lock_core *waiter,
                               bool conflict(struct file_lock_core *,
                                             struct file_lock_core *))
{
        spin_lock(&blocked_lock_lock);
        __locks_insert_block(blocker, waiter, conflict);
        spin_unlock(&blocked_lock_lock);
}

/*
 * Wake up every request that is blocked waiting for @blocker.
 *
 * Must be called with the inode->flc_lock held!
 */
static void locks_wake_up_blocks(struct file_lock_core *blocker)
{
        /*
         * Only take the global lock when there is something queued. This
         * lockless check is safe since new blocked requests are only added
         * to the list under the flc_lock, and the flc_lock is always held
         * here. Removal from the flc_blocked_requests list does not require
         * the flc_lock, so list_empty() is rechecked once the
         * blocked_lock_lock is held.
         */
        if (!list_empty(&blocker->flc_blocked_requests)) {
                spin_lock(&blocked_lock_lock);
                __locks_wake_up_blocks(blocker);
                spin_unlock(&blocked_lock_lock);
        }
}

/*
 * Link @fl into a context list immediately in front of @before, and add it
 * to the global per-CPU lock list.
 */
static void
locks_insert_lock_ctx(struct file_lock_core *fl, struct list_head *before)
{
        list_add_tail(&fl->flc_list, before);
        locks_insert_global_locks(fl);
}

/*
 * Take @fl off the global per-CPU lock list and off its context list, then
 * wake any requests that were blocked on it. The lock itself is not freed.
 */
static void
locks_unlink_lock_ctx(struct file_lock_core *fl)
{
        locks_delete_global_locks(fl);
        list_del_init(&fl->flc_list);
        locks_wake_up_blocks(fl);
}

/*
 * Unlink @fl and get rid of it. With a @dispose list the lock is queued
 * there for the caller to free later; without one it is freed right away
 * as a file_lock.
 */
static void
locks_delete_lock_ctx(struct file_lock_core *fl, struct list_head *dispose)
{
        locks_unlink_lock_ctx(fl);

        if (!dispose) {
                locks_free_lock(file_lock(fl));
                return;
        }
        list_add(&fl->flc_list, dispose);
}

/* Determine if lock sys_flc blocks lock caller_flc, looking only at the
 * shared/exclusive status: two locks conflict as soon as either one of them
 * is a write lock.
 */
static bool locks_conflict(struct file_lock_core *caller_flc,
                           struct file_lock_core *sys_flc)
{
        return sys_flc->flc_type == F_WRLCK ||
               caller_flc->flc_type == F_WRLCK;
}

/* Determine if lock sys_flc blocks lock caller_flc under POSIX rules:
 * locks of the same owner never conflict, and neither do locks whose byte
 * ranges do not overlap. Otherwise the shared/exclusive check of
 * locks_conflict() decides.
 */
static bool posix_locks_conflict(struct file_lock_core *caller_flc,
                                 struct file_lock_core *sys_flc)
{
        if (posix_same_owner(caller_flc, sys_flc))
                return false;

        if (!locks_overlap(file_lock(caller_flc), file_lock(sys_flc)))
                return false;

        return locks_conflict(caller_flc, sys_flc);
}

/* Determine if lock sys_fl blocks lock caller_fl on the xx_GETLK path.
 * An F_UNLCK request matches any overlapping lock of the same owner; every
 * other request type goes through the regular POSIX conflict check.
 */
static bool posix_test_locks_conflict(struct file_lock *caller_fl,
                                      struct file_lock *sys_fl)
{
        struct file_lock_core *caller = &caller_fl->c;
        struct file_lock_core *sys = &sys_fl->c;

        if (!lock_is_unlock(caller_fl))
                return posix_locks_conflict(caller, sys);

        /* F_UNLCK checks any locks on the same fd. */
        return posix_same_owner(caller, sys) &&
               locks_overlap(caller_fl, sys_fl);
}

/* Determine if lock sys_flc blocks lock caller_flc under FLOCK rules:
 * locks referring to the same filp never conflict; for different filps the
 * shared/exclusive check of locks_conflict() decides.
 */
static bool flock_locks_conflict(struct file_lock_core *caller_flc,
                                 struct file_lock_core *sys_flc)
{
        if (caller_flc->flc_file != sys_flc->flc_file)
                return locks_conflict(caller_flc, sys_flc);

        return false;
}

/*
 * Test whether the POSIX lock described by @fl would conflict with a lock
 * already present on the inode of @filp.
 *
 * On return, @fl either describes the first conflicting lock found (filled
 * in by locks_copy_conflock()) or has its type set to F_UNLCK when nothing
 * conflicts.
 */
void
posix_test_lock(struct file *filp, struct file_lock *fl)
{
        struct file_lock *cfl;
        struct file_lock_context *ctx;
        struct inode *inode = file_inode(filp);
        void *owner;
        void (*func)(void);

        ctx = locks_inode_context(inode);
        /* lockless fast path: no context or no POSIX locks means no conflict */
        if (!ctx || list_empty_careful(&ctx->flc_posix)) {
                fl->c.flc_type = F_UNLCK;
                return;
        }

retry:
        spin_lock(&ctx->flc_lock);
        list_for_each_entry(cfl, &ctx->flc_posix, c.flc_list) {
                if (!posix_test_locks_conflict(fl, cfl))
                        continue;
                /*
                 * If the lock manager says the conflicting lock is
                 * expirable, drop the spinlock, run lm_expire_lock and
                 * rescan from the start. A reference on the manager's
                 * module is held across the callback.
                 */
                if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable
                        && (*cfl->fl_lmops->lm_lock_expirable)(cfl)) {
                        owner = cfl->fl_lmops->lm_mod_owner;
                        func = cfl->fl_lmops->lm_expire_lock;
                        __module_get(owner);
                        spin_unlock(&ctx->flc_lock);
                        (*func)();
                        module_put(owner);
                        goto retry;
                }
                /* report the first conflicting lock to the caller */
                locks_copy_conflock(fl, cfl);
                goto out;
        }
        fl->c.flc_type = F_UNLCK;
out:
        spin_unlock(&ctx->flc_lock);
        return;
}
EXPORT_SYMBOL(posix_test_lock);

/*
 * Deadlock detection:
 *
 * We attempt to detect deadlocks that are due purely to posix file
 * locks.
 *
 * We assume that a task can be waiting for at most one lock at a time.
 * So for any acquired lock, the process holding that lock may be
 * waiting on at most one other lock.  That lock in turn may be held by
 * someone waiting for at most one other lock.  Given a requested lock
 * caller_fl which is about to wait for a conflicting lock block_fl, we
 * follow this chain of waiters to ensure we are not about to create a
 * cycle.
 *
 * Since we do this before we ever put a process to sleep on a lock, we
 * are ensured that there is never a cycle; that is what guarantees that
 * the while() loop in posix_locks_deadlock() eventually completes.
 *
 * Note: the above assumption may not be true when handling lock
 * requests from a broken NFS client. It may also fail in the presence
 * of tasks (such as posix threads) sharing the same open file table.
 * To handle those cases, we just bail out after a few iterations.
 *
 * For FL_OFDLCK locks, the owner is the filp, not the files_struct.
 * Because the owner is not even nominally tied to a thread of
 * execution, the deadlock detection below can't reasonably work well. Just
 * skip it for those.
 *
 * In principle, we could do a more limited deadlock detection on FL_OFDLCK
 * locks that just checks for the case where two tasks are attempting to
 * upgrade from read to write locks on the same inode.
 */

/* Bound on how far posix_locks_deadlock() follows a chain of waiters before giving up. */
#define MAX_DEADLK_ITERATIONS 10

/*
 * Find a lock that the owner of the given @blocker is blocking on.
 *
 * Looks up the first blocked request in blocked_hash that has the same
 * owner as @blocker, then follows its flc_blocker chain to the end and
 * returns that final lock. Returns NULL if the owner has no blocked request.
 */
static struct file_lock_core *what_owner_is_waiting_for(struct file_lock_core *blocker)
{
        struct file_lock_core *flc;

        hash_for_each_possible(blocked_hash, flc, flc_link, posix_owner_key(blocker)) {
                /* the bucket may hold requests of other owners as well */
                if (!posix_same_owner(flc, blocker))
                        continue;

                while (flc->flc_blocker)
                        flc = flc->flc_blocker;
                return flc;
        }

        return NULL;
}

/*
 * Decide whether letting @caller_fl wait on @block_fl would close a cycle
 * of waiters.
 *
 * Must be called with the blocked_lock_lock held!
 *
 * Returns true when the chain of waiters starting at @block_fl leads back
 * to the owner of @caller_fl; false otherwise, including when the walk is
 * abandoned after more than MAX_DEADLK_ITERATIONS steps.
 */
static bool posix_locks_deadlock(struct file_lock *caller_fl,
                                 struct file_lock *block_fl)
{
        struct file_lock_core *requester = &caller_fl->c;
        struct file_lock_core *cur = &block_fl->c;
        int hops;

        lockdep_assert_held(&blocked_lock_lock);

        /*
         * FL_OFDLCK locks aren't owned by a process, per-se, so this
         * detector can't reasonably detect deadlocks involving them.
         */
        if (requester->flc_flags & FL_OFDLCK)
                return false;

        for (hops = 0; ; hops++) {
                cur = what_owner_is_waiting_for(cur);
                if (!cur)
                        return false;
                /* Give up rather than loop forever on a broken chain. */
                if (hops > MAX_DEADLK_ITERATIONS)
                        return false;
                if (posix_same_owner(requester, cur))
                        return true;
        }
}

/*
 * Try to create a FLOCK lock on filp. We always insert new FLOCK locks
 * after any leases, but before any posix locks.
 *
 * An FL_ACCESS request only looks for a conflicting lock; nothing is
 * removed or inserted for it.
 *
 * Note that if called with an FL_EXISTS argument, the caller may determine
 * whether or not a lock was successfully freed by testing the return
 * value for -ENOENT.
 *
 * Returns 0 on success, -ENOMEM when the lock context or the new lock
 * cannot be allocated, -EAGAIN on a conflict without FL_SLEEP,
 * FILE_LOCK_DEFERRED when the request was queued behind a conflicting
 * lock, or -ENOENT as described above.
 */
static int flock_lock_inode(struct inode *inode, struct file_lock *request)
{
        struct file_lock_context *ctx;
        struct file_lock *spare = NULL;
        struct file_lock *cur;
        bool replaced = false;
        int error = 0;
        LIST_HEAD(dispose);

        ctx = locks_get_lock_context(inode, request->c.flc_type);
        if (!ctx) {
                if (request->c.flc_type != F_UNLCK)
                        return -ENOMEM;
                if (request->c.flc_flags & FL_EXISTS)
                        return -ENOENT;
                return 0;
        }

        /* Allocate up front: we cannot allocate under the spinlock. */
        if ((request->c.flc_type != F_UNLCK) && !(request->c.flc_flags & FL_ACCESS)) {
                spare = locks_alloc_lock();
                if (!spare)
                        return -ENOMEM;
        }

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);

        if (!(request->c.flc_flags & FL_ACCESS)) {
                /* Drop this file's existing lock unless it already matches. */
                list_for_each_entry(cur, &ctx->flc_flock, c.flc_list) {
                        if (cur->c.flc_file != request->c.flc_file)
                                continue;
                        if (cur->c.flc_type == request->c.flc_type)
                                goto out;
                        replaced = true;
                        locks_delete_lock_ctx(&cur->c, &dispose);
                        break;
                }

                if (lock_is_unlock(request)) {
                        if (!replaced && (request->c.flc_flags & FL_EXISTS))
                                error = -ENOENT;
                        goto out;
                }
        }

        /* Look for a lock that conflicts with the request. */
        list_for_each_entry(cur, &ctx->flc_flock, c.flc_list) {
                if (!flock_locks_conflict(&request->c, &cur->c))
                        continue;
                if (request->c.flc_flags & FL_SLEEP) {
                        error = FILE_LOCK_DEFERRED;
                        locks_insert_block(&cur->c, &request->c, flock_locks_conflict);
                } else {
                        error = -EAGAIN;
                }
                goto out;
        }

        if (request->c.flc_flags & FL_ACCESS)
                goto out;

        /* No conflict: install the preallocated copy of the request. */
        locks_copy_lock(spare, request);
        locks_move_blocks(spare, request);
        locks_insert_lock_ctx(&spare->c, &ctx->flc_flock);
        spare = NULL;
        error = 0;

out:
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        if (spare)
                locks_free_lock(spare);
        locks_dispose_list(&dispose);
        trace_flock_lock_inode(inode, request, error);
        return error;
}

static int posix_lock_inode(struct inode *inode, struct file_lock *request,
                            struct file_lock *conflock)
{
        struct file_lock *fl, *tmp;
        struct file_lock *new_fl = NULL;
        struct file_lock *new_fl2 = NULL;
        struct file_lock *left = NULL;
        struct file_lock *right = NULL;
        struct file_lock_context *ctx;
        int error;
        bool added = false;
        LIST_HEAD(dispose);
        void *owner;
        void (*func)(void);

        ctx = locks_get_lock_context(inode, request->c.flc_type);
        if (!ctx)
                return lock_is_unlock(request) ? 0 : -ENOMEM;

        /*
         * We may need two file_lock structures for this operation,
         * so we get them in advance to avoid races.
         *
         * In some cases we can be sure, that no new locks will be needed
         */
        if (!(request->c.flc_flags & FL_ACCESS) &&
            (request->c.flc_type != F_UNLCK ||
             request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
                new_fl = locks_alloc_lock();
                new_fl2 = locks_alloc_lock();
        }

retry:
        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        /*
         * New lock request. Walk all POSIX locks and look for conflicts. If
         * there are any, either return error or put the request on the
         * blocker's list of waiters and the global blocked_hash.
         */
        if (request->c.flc_type != F_UNLCK) {
                list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) {
                        if (!posix_locks_conflict(&request->c, &fl->c))
                                continue;
                        if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable
                                && (*fl->fl_lmops->lm_lock_expirable)(fl)) {
                                owner = fl->fl_lmops->lm_mod_owner;
                                func = fl->fl_lmops->lm_expire_lock;
                                __module_get(owner);
                                spin_unlock(&ctx->flc_lock);
                                percpu_up_read(&file_rwsem);
                                (*func)();
                                module_put(owner);
                                goto retry;
                        }
                        if (conflock)
                                locks_copy_conflock(conflock, fl);
                        error = -EAGAIN;
                        if (!(request->c.flc_flags & FL_SLEEP))
                                goto out;
                        /*
                         * Deadlock detection and insertion into the blocked
                         * locks list must be done while holding the same lock!
                         */
                        error = -EDEADLK;
                        spin_lock(&blocked_lock_lock);
                        /*
                         * Ensure that we don't find any locks blocked on this
                         * request during deadlock detection.
                         */
                        __locks_wake_up_blocks(&request->c);
                        if (likely(!posix_locks_deadlock(request, fl))) {
                                error = FILE_LOCK_DEFERRED;
                                __locks_insert_block(&fl->c, &request->c,
                                                     posix_locks_conflict);
                        }
                        spin_unlock(&blocked_lock_lock);
                        goto out;
                }
        }

        /* If we're just looking for a conflict, we're done. */
        error = 0;
        if (request->c.flc_flags & FL_ACCESS)
                goto out;

        /* Find the first old lock with the same owner as the new lock */
        list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) {
                if (posix_same_owner(&request->c, &fl->c))
                        break;
        }

        /* Process locks with this owner. */
        list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, c.flc_list) {
                if (!posix_same_owner(&request->c, &fl->c))
                        break;

                /* Detect adjacent or overlapping regions (if same lock type) */
                if (request->c.flc_type == fl->c.flc_type) {
                        /* In all comparisons of start vs end, use
                         * "start - 1" rather than "end + 1". If end
                         * is OFFSET_MAX, end + 1 will become negative.
                         */
                        if (fl->fl_end < request->fl_start - 1)
                                continue;
                        /* If the next lock in the list has entirely bigger
                         * addresses than the new one, insert the lock here.
                         */
                        if (fl->fl_start - 1 > request->fl_end)
                                break;

                        /* If we come here, the new and old lock are of the
                         * same type and adjacent or overlapping. Make one
                         * lock yielding from the lower start address of both
                         * locks to the higher end address.
                         */
                        if (fl->fl_start > request->fl_start)
                                fl->fl_start = request->fl_start;
                        else
                                request->fl_start = fl->fl_start;
                        if (fl->fl_end < request->fl_end)
                                fl->fl_end = request->fl_end;
                        else
                                request->fl_end = fl->fl_end;
                        if (added) {
                                locks_delete_lock_ctx(&fl->c, &dispose);
                                continue;
                        }
                        request = fl;
                        added = true;
                } else {
                        /* Processing for different lock types is a bit
                         * more complex.
                         */
                        if (fl->fl_end < request->fl_start)
                                continue;
                        if (fl->fl_start > request->fl_end)
                                break;
                        if (lock_is_unlock(request))
                                added = true;
                        if (fl->fl_start < request->fl_start)
                                left = fl;
                        /* If the next lock in the list has a higher end
                         * address than the new one, insert the new one here.
                         */
                        if (fl->fl_end > request->fl_end) {
                                right = fl;
                                break;
                        }
                        if (fl->fl_start >= request->fl_start) {
                                /* The new lock completely replaces an old
                                 * one (This may happen several times).
                                 */
                                if (added) {
                                        locks_delete_lock_ctx(&fl->c, &dispose);
                                        continue;
                                }
                                /*
                                 * Replace the old lock with new_fl, and
                                 * remove the old one. It's safe to do the
                                 * insert here since we know that we won't be
                                 * using new_fl later, and that the lock is
                                 * just replacing an existing lock.
                                 */
                                error = -ENOLCK;
                                if (!new_fl)
                                        goto out;
                                locks_copy_lock(new_fl, request);
                                locks_move_blocks(new_fl, request);
                                request = new_fl;
                                new_fl = NULL;
                                locks_insert_lock_ctx(&request->c,
                                                      &fl->c.flc_list);
                                locks_delete_lock_ctx(&fl->c, &dispose);
                                added = true;
                        }
                }
        }

        /*
         * The above code only modifies existing locks in case of merging or
         * replacing. If new lock(s) need to be inserted all modifications are
         * done below this, so it's safe yet to bail out.
         */
        error = -ENOLCK; /* "no luck" */
        if (right && left == right && !new_fl2)
                goto out;

        error = 0;
        if (!added) {
                if (lock_is_unlock(request)) {
                        if (request->c.flc_flags & FL_EXISTS)
                                error = -ENOENT;
                        goto out;
                }

                if (!new_fl) {
                        error = -ENOLCK;
                        goto out;
                }
                locks_copy_lock(new_fl, request);
                locks_move_blocks(new_fl, request);
                locks_insert_lock_ctx(&new_fl->c, &fl->c.flc_list);
                fl = new_fl;
                new_fl = NULL;
        }
        if (right) {
                if (left == right) {
                        /* The new lock breaks the old one in two pieces,
                         * so we have to use the second new lock.
                         */
                        left = new_fl2;
                        new_fl2 = NULL;
                        locks_copy_lock(left, right);
                        locks_insert_lock_ctx(&left->c, &fl->c.flc_list);
                }
                right->fl_start = request->fl_end + 1;
                locks_wake_up_blocks(&right->c);
        }
        if (left) {
                left->fl_end = request->fl_start - 1;
                locks_wake_up_blocks(&left->c);
        }
 out:
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        trace_posix_lock_inode(inode, request, error);
        /*
         * Free any unused locks.
         */
        if (new_fl)
                locks_free_lock(new_fl);
        if (new_fl2)
                locks_free_lock(new_fl2);
        locks_dispose_list(&dispose);

        return error;
}

/**
 * posix_lock_file - Apply a POSIX-style lock to a file
 * @filp: The file to apply the lock to
 * @fl: The lock to be applied
 * @conflock: Place to return a copy of the conflicting lock, if found.
 *
 * Add a POSIX style lock to a file.
 * We merge adjacent & overlapping locks whenever possible.
 * POSIX locks are sorted by owner task, then by starting address
 *
 * This is a thin wrapper that resolves @filp to its inode and hands the
 * request to posix_lock_inode().
 *
 * Note that if called with an FL_EXISTS argument, the caller may determine
 * whether or not a lock was successfully freed by testing the return
 * value for -ENOENT.
 *
 * Return: whatever posix_lock_inode() returns for the request.
 */
int posix_lock_file(struct file *filp, struct file_lock *fl,
                        struct file_lock *conflock)
{
        return posix_lock_inode(file_inode(filp), fl, conflock);
}
EXPORT_SYMBOL(posix_lock_file);

/**
 * posix_lock_inode_wait - Apply a POSIX-style lock to a file
 * @inode: inode of file to which lock request should be applied
 * @fl: The lock to be applied
 *
 * Apply a POSIX style lock request to an inode, sleeping and retrying for
 * as long as the request is deferred behind a conflicting lock.
 *
 * Return: the final result of posix_lock_inode(), or the non-zero value
 * returned by the interruptible wait if it ended the loop.
 */
static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        int ret;

        might_sleep();
        do {
                ret = posix_lock_inode(inode, fl, NULL);
                if (ret != FILE_LOCK_DEFERRED)
                        break;
                /* Sleep until we are no longer on a blocker's list. */
                ret = wait_event_interruptible(fl->c.flc_wait,
                                               list_empty(&fl->c.flc_blocked_member));
        } while (!ret);

        locks_delete_block(fl);
        return ret;
}

/*
 * Clear the "break pending" flags that a transition of @fl to type @arg
 * satisfies: F_UNLCK clears both the unlock and the downgrade flag,
 * F_RDLCK clears only the downgrade flag, anything else clears nothing.
 */
static void lease_clear_pending(struct file_lease *fl, int arg)
{
        if (arg == F_UNLCK)
                fl->c.flc_flags &= ~FL_UNLOCK_PENDING;
        if (arg == F_UNLCK || arg == F_RDLCK)
                fl->c.flc_flags &= ~FL_DOWNGRADE_PENDING;
}

/* We already had a lease on this file; just change its type */
int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose)
{
        int error = assign_type(&fl->c, arg);

        if (error)
                return error;
        lease_clear_pending(fl, arg);
        locks_wake_up_blocks(&fl->c);
        if (arg == F_UNLCK) {
                struct file *filp = fl->c.flc_file;

                f_delown(filp);
                filp->f_owner.signum = 0;
                fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync);
                if (fl->fl_fasync != NULL) {
                        printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
                        fl->fl_fasync = NULL;
                }
                locks_delete_lock_ctx(&fl->c, dispose);
        }
        return 0;
}
EXPORT_SYMBOL(lease_modify);

/*
 * Has the jiffies deadline @then already passed?
 * 0 is a special value meaning "this never expires".
 */
static bool past_time(unsigned long then)
{
        return then != 0 && time_after(jiffies, then);
}

/*
 * Walk every lease on @inode and apply the transitions whose deadlines
 * have passed: downgrade to F_RDLCK once fl_downgrade_time is over, and
 * unlock once fl_break_time is over. Leases that get removed are collected
 * on @dispose.
 *
 * The caller must hold the lock context's flc_lock.
 */
static void time_out_leases(struct inode *inode, struct list_head *dispose)
{
        struct file_lock_context *ctx = inode->i_flctx;
        struct file_lease *lease, *next;

        lockdep_assert_held(&ctx->flc_lock);

        /* _safe variant: lease_modify() may unlink the current entry. */
        list_for_each_entry_safe(lease, next, &ctx->flc_lease, c.flc_list) {
                trace_time_out_leases(inode, lease);

                if (past_time(lease->fl_downgrade_time))
                        lease_modify(lease, F_RDLCK, dispose);

                if (past_time(lease->fl_break_time))
                        lease_modify(lease, F_UNLCK, dispose);
        }
}

/*
 * Does the existing lease @lc conflict with the lease-break request @bc?
 *
 * There is no conflict when the lease manager reports that the breaker
 * owns the lease (this case returns without hitting the tracepoint), when
 * exactly one of the two is a layout, or when a delegation-only break
 * meets a plain lease. Otherwise locks_conflict() decides.
 */
static bool leases_conflict(struct file_lock_core *lc, struct file_lock_core *bc)
{
        struct file_lease *lease = file_lease(lc);
        struct file_lease *breaker = file_lease(bc);
        bool rc;

        if (lease->fl_lmops->lm_breaker_owns_lease
                        && lease->fl_lmops->lm_breaker_owns_lease(lease))
                return false;

        if ((bc->flc_flags & FL_LAYOUT) != (lc->flc_flags & FL_LAYOUT))
                rc = false;
        else if ((bc->flc_flags & FL_DELEG) && (lc->flc_flags & FL_LEASE))
                rc = false;
        else
                rc = locks_conflict(bc, lc);

        trace_leases_conflict(rc, lease, breaker);
        return rc;
}

static bool
any_leases_conflict(struct inode *inode, struct file_lease *breaker)
{
        struct file_lock_context *ctx = inode->i_flctx;
        struct file_lock_core *flc;

        lockdep_assert_held(&ctx->flc_lock);

        list_for_each_entry(flc, &ctx->flc_lease, flc_list) {
                if (leases_conflict(flc, &breaker->c))
                        return true;
        }
        return false;
}

/**
 *        __break_lease        -        revoke all outstanding leases on file
 *        @inode: the inode of the file to return
 *        @mode: O_RDONLY: break only write leases; O_WRONLY or O_RDWR:
 *            break all leases
 *        @type: FL_LEASE: break leases and delegations; FL_DELEG: break
 *            only delegations
 *
 *        break_lease (inlined for speed) has checked there already is at least
 *        some kind of lock (maybe a lease) on this file.  Leases are broken on
 *        a call to open() or truncate().  This function can sleep unless you
 *        specified %O_NONBLOCK to your open().
 *
 *        Return: 0 once no conflicting lease remains (also when the inode has
 *        no lock context, after a one-time warning); the error from
 *        lease_alloc() if the temporary lease cannot be allocated;
 *        -EWOULDBLOCK if leases remain and %O_NONBLOCK was given; or the
 *        negative value returned by the interruptible wait.
 */
int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
        int error = 0;
        struct file_lock_context *ctx;
        struct file_lease *new_fl, *fl, *tmp;
        unsigned long break_time;
        int want_write = (mode & O_ACCMODE) != O_RDONLY;
        LIST_HEAD(dispose);

        /*
         * new_fl is a temporary lease describing the breaker; it is used for
         * conflict checks and as the object we sleep on, and is always freed
         * before returning.
         */
        new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
        if (IS_ERR(new_fl))
                return PTR_ERR(new_fl);
        new_fl->c.flc_flags = type;

        /* typically we will check that ctx is non-NULL before calling */
        ctx = locks_inode_context(inode);
        if (!ctx) {
                WARN_ON_ONCE(1);
                goto free_lock;
        }

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);

        /* Apply any deadlines that already expired before checking. */
        time_out_leases(inode, &dispose);

        if (!any_leases_conflict(inode, new_fl))
                goto out;

        /* Absolute deadline (in jiffies) handed to the leases we break. */
        break_time = 0;
        if (lease_break_time > 0) {
                break_time = jiffies + lease_break_time * HZ;
                if (break_time == 0)
                        break_time++;        /* so that 0 means no break time */
        }

        /*
         * Mark every conflicting lease as pending unlock (write access) or
         * pending downgrade (read access) and notify its holder. Leases that
         * are already in the required pending state are skipped.
         */
        list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) {
                if (!leases_conflict(&fl->c, &new_fl->c))
                        continue;
                if (want_write) {
                        if (fl->c.flc_flags & FL_UNLOCK_PENDING)
                                continue;
                        fl->c.flc_flags |= FL_UNLOCK_PENDING;
                        fl->fl_break_time = break_time;
                } else {
                        if (lease_breaking(fl))
                                continue;
                        fl->c.flc_flags |= FL_DOWNGRADE_PENDING;
                        fl->fl_downgrade_time = break_time;
                }
                /* A non-zero return from lm_break removes the lease now. */
                if (fl->fl_lmops->lm_break(fl))
                        locks_delete_lock_ctx(&fl->c, &dispose);
        }

        if (list_empty(&ctx->flc_lease))
                goto out;

        if (mode & O_NONBLOCK) {
                trace_break_lease_noblock(inode, new_fl);
                error = -EWOULDBLOCK;
                goto out;
        }

restart:
        /* Block on the first lease in the list. */
        fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list);
        /*
         * Turn the lease's absolute break time into the timeout passed to
         * the wait below; the result is bumped to 1 if it would be 0.
         */
        break_time = fl->fl_break_time;
        if (break_time != 0)
                break_time -= jiffies;
        if (break_time == 0)
                break_time++;
        locks_insert_block(&fl->c, &new_fl->c, leases_conflict);
        trace_break_lease_block(inode, new_fl);
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);

        /* Dispose of removed leases only after the locks are dropped. */
        locks_dispose_list(&dispose);
        error = wait_event_interruptible_timeout(new_fl->c.flc_wait,
                                                 list_empty(&new_fl->c.flc_blocked_member),
                                                 break_time);

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        trace_break_lease_unblock(inode, new_fl);
        __locks_delete_block(&new_fl->c);
        if (error >= 0) {
                /*
                 * Wait for the next conflicting lease that has not been
                 * broken yet
                 */
                if (error == 0)
                        time_out_leases(inode, &dispose);
                if (any_leases_conflict(inode, new_fl))
                        goto restart;
                error = 0;
        }
out:
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        locks_dispose_list(&dispose);
free_lock:
        locks_free_lease(new_fl);
        return error;
}
EXPORT_SYMBOL(__break_lease);

/**
 * lease_get_mtime - update modified time of an inode with exclusive lease
 * @inode: the inode
 * @time: pointer to a timespec which contains the last modified time
 *
 * This is to force NFS clients to flush their caches for files with
 * exclusive leases.  The justification is that if someone has an
 * exclusive lease, then they could be modifying it.
 *
 * @time is overwritten with the current time only when the first lease on
 * the inode is a write lease; otherwise it is left untouched.
 */
void lease_get_mtime(struct inode *inode, struct timespec64 *time)
{
        struct file_lock_context *ctx = locks_inode_context(inode);
        struct file_lock_core *first;
        bool write_leased;

        /* Lockless fast path: no context or no leases at all. */
        if (!ctx || list_empty_careful(&ctx->flc_lease))
                return;

        spin_lock(&ctx->flc_lock);
        first = list_first_entry_or_null(&ctx->flc_lease,
                                         struct file_lock_core, flc_list);
        write_leased = first && first->flc_type == F_WRLCK;
        spin_unlock(&ctx->flc_lock);

        if (write_leased)
                *time = current_time(inode);
}
EXPORT_SYMBOL(lease_get_mtime);

/**
 * fcntl_getlease - Enquire what lease is currently active
 * @filp: the file
 *
 * Expired lease deadlines are applied first; the answer is then taken from
 * the first lease on the inode that belongs to @filp.
 *
 * The value returned by this function will be one of
 * (if no lease break is pending):
 *
 * %F_RDLCK to indicate a shared lease is held.
 *
 * %F_WRLCK to indicate an exclusive lease is held.
 *
 * %F_UNLCK to indicate no lease is held.
 *
 * (if a lease break is pending):
 *
 * %F_RDLCK to indicate an exclusive lease needs to be
 *         changed to a shared lease (or removed).
 *
 * %F_UNLCK to indicate the lease needs to be removed.
 *
 * XXX: sfr & willy disagree over whether F_INPROGRESS
 * should be returned to userspace.
 */
int fcntl_getlease(struct file *filp)
{
        struct inode *inode = file_inode(filp);
        struct file_lock_context *ctx = locks_inode_context(inode);
        struct file_lease *cur;
        int type = F_UNLCK;
        LIST_HEAD(dispose);

        /* Lockless fast path: nothing to report without any lease. */
        if (!ctx || list_empty_careful(&ctx->flc_lease))
                return type;

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        time_out_leases(inode, &dispose);
        list_for_each_entry(cur, &ctx->flc_lease, c.flc_list) {
                if (cur->c.flc_file == filp) {
                        type = target_leasetype(cur);
                        break;
                }
        }
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);

        locks_dispose_list(&dispose);
        return type;
}

/**
 * check_conflicting_open - see if the given file points to an inode that has
 *                            an existing open that would conflict with the
 *                            desired lease.
 * @filp:        file to check
 * @arg:        type of lease that we're trying to acquire
 * @flags:        current lock flags
 *
 * Check to see if there's an existing open fd on this file that would
 * conflict with the lease we're trying to set.
 *
 * Return: 0 when there is no conflict (always the case for layouts and
 * delegations, and for lease types other than F_RDLCK/F_WRLCK), -EAGAIN
 * otherwise.
 */
static int
check_conflicting_open(struct file *filp, const int arg, int flags)
{
        struct inode *inode = file_inode(filp);
        int self_wcount = 0, self_rcount = 0;

        /* Layouts are not checked; delegation checks are left to the caller. */
        if (flags & (FL_LAYOUT | FL_DELEG))
                return 0;

        switch (arg) {
        case F_RDLCK:
                /* A read lease only conflicts with writers. */
                return inode_is_open_for_write(inode) ? -EAGAIN : 0;
        case F_WRLCK:
                break;
        default:
                return 0;
        }

        /*
         * Make sure that only read/write count is from lease requestor.
         * Note that this will result in denying write leases when i_writecount
         * is negative, which is what we want.  (We shouldn't grant write leases
         * on files open for execution.)
         */
        if (filp->f_mode & FMODE_WRITE)
                self_wcount = 1;
        else if (filp->f_mode & FMODE_READ)
                self_rcount = 1;

        if (atomic_read(&inode->i_writecount) == self_wcount &&
            atomic_read(&inode->i_readcount) == self_rcount)
                return 0;

        return -EAGAIN;
}

/*
 * Add a lease of type @arg on @filp, or change the type of the lease that
 * @filp already holds for the same owner.
 *
 * @filp: file the lease is requested on
 * @arg:  requested lease type (never F_UNLCK here)
 * @flp:  in: the lease to insert; set to NULL on success when that lease
 *        was inserted (as opposed to an existing lease being modified)
 * @priv: passed through to the lease manager's lm_setup, if it has one
 *
 * Returns 0 on success; -ENOMEM if no lock context can be obtained;
 * -EAGAIN if the inode lock cannot be taken for a delegation, if there is
 * a conflicting open, or if another lease stands in the way; -EINVAL if
 * leases are disabled; or the error returned by lm_change.
 */
static int
generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **priv)
{
        struct file_lease *fl, *my_fl = NULL, *lease;
        struct inode *inode = file_inode(filp);
        struct file_lock_context *ctx;
        bool is_deleg = (*flp)->c.flc_flags & FL_DELEG;
        int error;
        LIST_HEAD(dispose);

        lease = *flp;
        trace_generic_add_lease(inode, lease);

        /* Note that arg is never F_UNLCK here */
        ctx = locks_get_lock_context(inode, arg);
        if (!ctx)
                return -ENOMEM;

        /*
         * In the delegation case we need mutual exclusion with
         * a number of operations that take the i_mutex.  We trylock
         * because delegations are an optional optimization, and if
         * there's some chance of a conflict--we'd rather not
         * bother, maybe that's a sign this just isn't a good file to
         * hand out a delegation on.
         */
        if (is_deleg && !inode_trylock(inode))
                return -EAGAIN;

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        /* Apply expired lease deadlines before looking at the list. */
        time_out_leases(inode, &dispose);
        error = check_conflicting_open(filp, arg, lease->c.flc_flags);
        if (error)
                goto out;

        /*
         * At this point, we know that if there is an exclusive
         * lease on this file, then we hold it on this filp
         * (otherwise our open of this file would have blocked).
         * And if we are trying to acquire an exclusive lease,
         * then the file is not open by anyone (including us)
         * except for this filp.
         */
        error = -EAGAIN;
        list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
                /* Remember our own lease (same file and same owner). */
                if (fl->c.flc_file == filp &&
                    fl->c.flc_owner == lease->c.flc_owner) {
                        my_fl = fl;
                        continue;
                }

                /*
                 * No exclusive leases if someone else has a lease on
                 * this file:
                 */
                if (arg == F_WRLCK)
                        goto out;
                /*
                 * Modifying our existing lease is OK, but no getting a
                 * new lease if someone else is opening for write:
                 */
                if (fl->c.flc_flags & FL_UNLOCK_PENDING)
                        goto out;
        }

        /* We already hold a lease: change its type instead of inserting. */
        if (my_fl != NULL) {
                lease = my_fl;
                error = lease->fl_lmops->lm_change(lease, arg, &dispose);
                if (error)
                        goto out;
                goto out_setup;
        }

        error = -EINVAL;
        if (!leases_enable)
                goto out;

        locks_insert_lock_ctx(&lease->c, &ctx->flc_lease);
        /*
         * The check in break_lease() is lockless. It's possible for another
         * open to race in after we did the earlier check for a conflicting
         * open but before the lease was inserted. Check again for a
         * conflicting open and cancel the lease if there is one.
         *
         * We also add a barrier here to ensure that the insertion of the lock
         * precedes these checks.
         */
        smp_mb();
        error = check_conflicting_open(filp, arg, lease->c.flc_flags);
        if (error) {
                locks_unlink_lock_ctx(&lease->c);
                goto out;
        }

out_setup:
        if (lease->fl_lmops->lm_setup)
                lease->fl_lmops->lm_setup(lease, priv);
out:
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        locks_dispose_list(&dispose);
        if (is_deleg)
                inode_unlock(inode);
        /* The caller's lease was inserted: tell the caller by clearing *flp. */
        if (!error && !my_fl)
                *flp = NULL;
        return error;
}

static int generic_delete_lease(struct file *filp, void *owner)
{
        int error = -EAGAIN;
        struct file_lease *fl, *victim = NULL;
        struct inode *inode = file_inode(filp);
        struct file_lock_context *ctx;
        LIST_HEAD(dispose);

        ctx = locks_inode_context(inode);
        if (!ctx) {
                trace_generic_delete_lease(inode, NULL);
                return error;
        }

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
                if (fl->c.flc_file == filp &&
                    fl->c.flc_owner == owner) {
                        victim = fl;
                        break;
                }
        }
        trace_generic_delete_lease(inode, victim);
        if (victim)
                error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        locks_dispose_list(&dispose);
        return error;
}

/**
 *        generic_setlease        -        sets a lease on an open file
 *        @filp:        file pointer
 *        @arg:        type of lease to obtain
 *        @flp:        input - file_lock to use, output - file_lock inserted
 *        @priv:        private data for lm_setup (may be NULL if lm_setup
 *                doesn't require it)
 *
 *        The (input) flp->fl_lmops->lm_break function is required
 *        by break_lease().
 *
 *        For %F_UNLCK, *@priv is passed on as the owner whose lease is to
 *        be deleted. For %F_RDLCK and %F_WRLCK a missing lm_break triggers
 *        a one-time warning and -ENOLCK. Any other @arg yields -EINVAL.
 */
int generic_setlease(struct file *filp, int arg, struct file_lease **flp,
                        void **priv)
{
        if (arg == F_UNLCK)
                return generic_delete_lease(filp, *priv);

        if (arg != F_RDLCK && arg != F_WRLCK)
                return -EINVAL;

        if (!(*flp)->fl_lmops->lm_break) {
                WARN_ON_ONCE(1);
                return -ENOLCK;
        }

        return generic_add_lease(filp, arg, flp, priv);
}
EXPORT_SYMBOL(generic_setlease);

/*
 * Kernel subsystems can register to be notified on any attempt to set
 * a new lease with the lease_notifier_chain. This is used by (e.g.) nfsd
 * to close files that it may have cached when there is an attempt to set a
 * conflicting lease.
 *
 * The chain is an SRCU notifier head; it is invoked from setlease_notifier()
 * and populated through lease_register_notifier().
 */
static struct srcu_notifier_head lease_notifier_chain;

/*
 * Initialise the SRCU notifier head backing lease_notifier_chain.
 * NOTE(review): the caller is outside this section; presumably run once
 * during locking-subsystem init - confirm.
 */
static inline void
lease_notifier_chain_init(void)
{
        srcu_init_notifier_head(&lease_notifier_chain);
}

/*
 * Tell everyone on lease_notifier_chain that a lease of type @arg is about
 * to be set using @lease. Lease removal (F_UNLCK) is not announced.
 */
static inline void setlease_notifier(int arg, struct file_lease *lease)
{
        if (arg == F_UNLCK)
                return;

        srcu_notifier_call_chain(&lease_notifier_chain, arg, lease);
}

/**
 * lease_register_notifier - subscribe to lease-setting notifications
 * @nb: notifier block to add to lease_notifier_chain
 *
 * Returns whatever srcu_notifier_chain_register() returns.
 */
int lease_register_notifier(struct notifier_block *nb)
{
        return srcu_notifier_chain_register(&lease_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(lease_register_notifier);

/**
 * lease_unregister_notifier - stop receiving lease-setting notifications
 * @nb: notifier block to remove from lease_notifier_chain
 *
 * The result of srcu_notifier_chain_unregister() is not propagated.
 */
void lease_unregister_notifier(struct notifier_block *nb)
{
        srcu_notifier_chain_unregister(&lease_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(lease_unregister_notifier);


/**
 * kernel_setlease - set, change or remove a lease without permission checks
 * @filp:  file pointer
 * @arg:   type of lease to obtain
 * @lease: file_lease to use when adding a lease (may be NULL)
 * @priv:  private info handed through to the setlease implementation
 *
 * Runs the lease notifier chain when @lease is supplied, then dispatches to
 * the file's ->setlease operation if it has one and to generic_setlease()
 * otherwise. Returns the result of whichever implementation ran.
 */
int kernel_setlease(struct file *filp, int arg, struct file_lease **lease,
                    void **priv)
{
        if (lease)
                setlease_notifier(arg, *lease);

        if (!filp->f_op->setlease)
                return generic_setlease(filp, arg, lease, priv);

        return filp->f_op->setlease(filp, arg, lease, priv);
}
EXPORT_SYMBOL_GPL(kernel_setlease);

/**
 * vfs_setlease        -       sets a lease on an open file
 * @filp:        file pointer
 * @arg:        type of lease to obtain
 * @lease:        file_lock to use when adding a lease
 * @priv:        private info for lm_setup when adding a lease (may be
 *                NULL if lm_setup doesn't require it)
 *
 * Call this to establish a lease on the file. The "lease" argument is not
 * used for F_UNLCK requests and may be NULL. For commands that set or alter
 * an existing lease, the ``(*lease)->fl_lmops->lm_break`` operation must be
 * set; if not, this function will return -ENOLCK (and generate a scary-looking
 * stack trace).
 *
 * The "priv" pointer is passed directly to the lm_setup function as-is. It
 * may be NULL if the lm_setup operation doesn't require it.
 *
 * Before delegating to kernel_setlease() the request is vetted:
 * -EACCES is returned when the caller's fsuid is not the (idmapped) owner of
 * the inode and the caller lacks CAP_LEASE, -EINVAL when the inode is not a
 * regular file, and any error from security_file_lock() is passed back.
 */
int
vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv)
{
        struct inode *ino = file_inode(filp);
        vfsuid_t owner_uid = i_uid_into_vfsuid(file_mnt_idmap(filp), ino);
        int rc;

        if (!vfsuid_eq_kuid(owner_uid, current_fsuid()) && !capable(CAP_LEASE))
                return -EACCES;

        if (!S_ISREG(ino->i_mode))
                return -EINVAL;

        rc = security_file_lock(filp, arg);
        if (!rc)
                rc = kernel_setlease(filp, arg, lease, priv);

        return rc;
}
EXPORT_SYMBOL_GPL(vfs_setlease);

/*
 * Add a lease of type @arg on @filp on behalf of fcntl(fd, F_SETLEASE).
 *
 * A lease and a fasync entry are allocated up front and offered to
 * vfs_setlease(). Either pointer may come back NULL, which this function
 * treats as "consumed by the callee"; whatever is still non-NULL afterwards
 * is freed here.
 *
 * Returns the PTR_ERR of a failed lease allocation, -ENOMEM when the fasync
 * entry cannot be allocated, or the result of vfs_setlease().
 */
static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg)
{
        struct fasync_struct *fa;
        struct file_lease *lease;
        int rc;

        lease = lease_alloc(filp, arg);
        if (IS_ERR(lease))
                return PTR_ERR(lease);

        fa = fasync_alloc();
        if (!fa) {
                locks_free_lease(lease);
                return -ENOMEM;
        }
        fa->fa_fd = fd;

        rc = vfs_setlease(filp, arg, &lease, (void **)&fa);

        /* Release whatever the callee did not take over. */
        if (lease)
                locks_free_lease(lease);
        if (fa)
                fasync_free(fa);

        return rc;
}

/**
 *        fcntl_setlease        -        sets a lease on an open file
 *        @fd: open file descriptor
 *        @filp: file pointer
 *        @arg: type of lease to obtain
 *
 *        Call this fcntl to establish a lease on the file.
 *        Note that you also need to call %F_SETSIG to
 *        receive a signal when the lease is broken.
 *
 *        An %F_UNLCK request removes the lease instead; in that case the
 *        file pointer itself is passed as the private owner cookie.
 */
int fcntl_setlease(unsigned int fd, struct file *filp, int arg)
{
        if (arg != F_UNLCK)
                return do_fcntl_add_lease(fd, filp, arg);

        return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
}

/**
 * flock_lock_inode_wait - Apply a FLOCK-style lock to a file
 * @inode: inode of the file to apply to
 * @fl: The lock to be applied
 *
 * Apply a FLOCK style lock request to an inode, retrying for as long as
 * flock_lock_inode() reports FILE_LOCK_DEFERRED. Between attempts the caller
 * sleeps interruptibly until @fl is no longer on a blocked-member list.
 *
 * Returns the final result of flock_lock_inode(), or the non-zero value from
 * an interrupted wait.
 */
static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        int rc;

        might_sleep();

        do {
                rc = flock_lock_inode(inode, fl);
                if (rc != FILE_LOCK_DEFERRED)
                        break;

                rc = wait_event_interruptible(fl->c.flc_wait,
                                              list_empty(&fl->c.flc_blocked_member));
        } while (!rc);

        locks_delete_block(fl);
        return rc;
}

/**
 * locks_lock_inode_wait - Apply a lock to an inode
 * @inode: inode of the file to apply to
 * @fl: The lock to be applied
 *
 * Apply a POSIX or FLOCK style lock request to an inode. Exactly one of
 * FL_POSIX and FL_FLOCK must be set in @fl's flags; anything else is a BUG().
 */
int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        unsigned int style = fl->c.flc_flags & (FL_POSIX|FL_FLOCK);

        if (style == FL_POSIX)
                return posix_lock_inode_wait(inode, fl);

        if (style == FL_FLOCK)
                return flock_lock_inode_wait(inode, fl);

        BUG();
        return 0;
}
EXPORT_SYMBOL(locks_lock_inode_wait);

/**
 *        sys_flock: - flock() system call.
 *        @fd: the file descriptor to lock.
 *        @cmd: the type of lock to apply.
 *
 *        Apply a %FL_FLOCK style lock to an open file descriptor.
 *        The @cmd can be one of:
 *
 *        - %LOCK_SH -- a shared lock.
 *        - %LOCK_EX -- an exclusive lock.
 *        - %LOCK_UN -- remove an existing lock.
 *        - %LOCK_MAND -- a 'mandatory' flock. (DEPRECATED)
 *
 *        %LOCK_MAND support has been removed from the kernel.
 *
 *        %LOCK_NB may be OR-ed into @cmd to make the request non-blocking.
 *
 *        Returns 0 for an ignored %LOCK_MAND request, a negative value from
 *        flock_translate_cmd() for an unrecognised @cmd, -EBADF when @fd is
 *        not open or (for anything but an unlock) the file was opened
 *        neither for reading nor for writing, or the result of the security
 *        hook / lock operation otherwise.
 */
SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
{
        int can_sleep, error, type;
        struct file_lock fl;
        struct fd f;

        /*
         * LOCK_MAND locks were broken for a long time in that they never
         * conflicted with one another and didn't prevent any sort of open,
         * read or write activity.
         *
         * Just ignore these requests now, to preserve legacy behavior, but
         * throw a warning to let people know that they don't actually work.
         */
        if (cmd & LOCK_MAND) {
                pr_warn_once("%s(%d): Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n", current->comm, current->pid);
                return 0;
        }

        /* LOCK_NB only selects blocking behaviour; strip it before translating. */
        type = flock_translate_cmd(cmd & ~LOCK_NB);
        if (type < 0)
                return type;

        error = -EBADF;
        f = fdget(fd);
        if (!f.file)
                return error;

        /* Taking (as opposed to releasing) a lock needs a readable or writable file. */
        if (type != F_UNLCK && !(f.file->f_mode & (FMODE_READ | FMODE_WRITE)))
                goto out_putf;

        flock_make_lock(f.file, &fl, type);

        error = security_file_lock(f.file, fl.c.flc_type);
        if (error)
                goto out_putf;

        can_sleep = !(cmd & LOCK_NB);
        if (can_sleep)
                fl.c.flc_flags |= FL_SLEEP;

        /* Prefer the filesystem's own ->flock implementation when it has one. */
        if (f.file->f_op->flock)
                error = f.file->f_op->flock(f.file,
                                            (can_sleep) ? F_SETLKW : F_SETLK,
                                            &fl);
        else
                error = locks_lock_file_wait(f.file, &fl);

        locks_release_private(&fl);
 out_putf:
        fdput(f);

        return error;
}

/**
 * vfs_test_lock - test file byte range lock
 * @filp: The file to test lock for
 * @fl: The lock to test; also used to hold result
 *
 * Uses the filesystem's ->lock operation with F_GETLK when one is defined,
 * and posix_test_lock() otherwise.
 *
 * Returns -ERRNO on failure.  Indicates presence of conflicting lock by
 * setting @fl->c.flc_type to something other than F_UNLCK.
 */
int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
        WARN_ON_ONCE(filp != fl->c.flc_file);

        if (!filp->f_op->lock) {
                posix_test_lock(filp, fl);
                return 0;
        }

        return filp->f_op->lock(filp, F_GETLK, fl);
}
EXPORT_SYMBOL_GPL(vfs_test_lock);

/**
 * locks_translate_pid - translate a file_lock's fl_pid number into a namespace
 * @fl: The file_lock who's fl_pid should be translated
 * @ns: The namespace into which the pid should be translated
 *
 * Used to translate a fl_pid into a namespace virtual pid number
 */
static pid_t locks_translate_pid(struct file_lock_core *fl, struct pid_namespace *ns)
{
        pid_t vnr;
        struct pid *pid;

        if (fl->flc_flags & FL_OFDLCK)
                return -1;

        /* Remote locks report a negative pid value */
        if (fl->flc_pid <= 0)
                return fl->flc_pid;

        /*
         * If the flock owner process is dead and its pid has been already
         * freed, the translation below won't work, but we still want to show
         * flock owner pid number in init pidns.
         */
        if (ns == &init_pid_ns)
                return (pid_t) fl->flc_pid;

        rcu_read_lock();
        pid = find_pid_ns(fl->flc_pid, &init_pid_ns);
        vnr = pid_nr_ns(pid, ns);
        rcu_read_unlock();
        return vnr;
}

/*
 * Fill the userspace-visible struct flock from an in-kernel POSIX lock.
 *
 * l_pid is translated into the caller's active pid namespace, l_whence is
 * always 0, and a lock that extends to OFFSET_MAX is reported with l_len 0.
 *
 * Returns 0 on success. On 32-bit builds, returns -EOVERFLOW when the range
 * does not fit a legacy 32-bit flock; l_pid has already been written by then.
 */
static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
{
        flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current));

#if BITS_PER_LONG == 32
        /*
         * Make sure we can represent the posix lock via
         * legacy 32bit flock.
         */
        if (fl->fl_start > OFFT_OFFSET_MAX ||
            (fl->fl_end != OFFSET_MAX && fl->fl_end > OFFT_OFFSET_MAX))
                return -EOVERFLOW;
#endif

        flock->l_start = fl->fl_start;
        if (fl->fl_end == OFFSET_MAX)
                flock->l_len = 0;
        else
                flock->l_len = fl->fl_end - fl->fl_start + 1;
        flock->l_whence = 0;
        flock->l_type = fl->c.flc_type;

        return 0;
}

#if BITS_PER_LONG == 32
/*
 * 64-bit-offset counterpart of posix_lock_to_flock() for 32-bit builds:
 * fills struct flock64 from an in-kernel POSIX lock. No overflow check is
 * performed here. A lock that extends to OFFSET_MAX is reported with l_len 0.
 */
static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
{
        flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current));
        flock->l_start = fl->fl_start;

        if (fl->fl_end == OFFSET_MAX)
                flock->l_len = 0;
        else
                flock->l_len = fl->fl_end - fl->fl_start + 1;

        flock->l_whence = 0;
        flock->l_type = fl->c.flc_type;
}
#endif

/* Report the first existing lock that would conflict with @flock.
 * This implements the F_GETLK command of fcntl() (and F_OFD_GETLK).
 *
 * On success @flock->l_type is overwritten with the type found by
 * vfs_test_lock(); when that type is not F_UNLCK the remaining fields are
 * filled in from the conflicting lock as well.
 *
 * Returns -ENOMEM if no lock can be allocated, -EINVAL for a plain F_GETLK
 * whose l_type is neither F_RDLCK nor F_WRLCK or for an F_OFD_GETLK with a
 * non-zero l_pid, or any error from the conversion and test helpers.
 */
int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock)
{
        struct file_lock *req = locks_alloc_lock();
        int rc;

        if (!req)
                return -ENOMEM;

        rc = -EINVAL;
        if (cmd != F_OFD_GETLK &&
            flock->l_type != F_RDLCK && flock->l_type != F_WRLCK)
                goto out;

        rc = flock_to_posix_lock(filp, req, flock);
        if (rc)
                goto out;

        if (cmd == F_OFD_GETLK) {
                if (flock->l_pid != 0) {
                        rc = -EINVAL;
                        goto out;
                }

                /* OFD locks are owned by the open file description. */
                req->c.flc_flags |= FL_OFDLCK;
                req->c.flc_owner = filp;
        }

        rc = vfs_test_lock(filp, req);
        if (rc)
                goto out;

        flock->l_type = req->c.flc_type;
        if (req->c.flc_type != F_UNLCK)
                rc = posix_lock_to_flock(flock, req);

out:
        locks_free_lock(req);
        return rc;
}

/**
 * vfs_lock_file - file byte range lock
 * @filp: The file to apply the lock to
 * @cmd: type of locking operation (F_SETLK, F_GETLK, etc.)
 * @fl: The lock to be applied
 * @conf: Place to return a copy of the conflicting lock, if found.
 *
 * A caller that doesn't care about the conflicting lock may pass NULL
 * as the final argument.
 *
 * If the filesystem defines a private ->lock() method, then @conf will
 * be left unchanged; so a caller that cares should initialize it to
 * some acceptable default.
 *
 * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
 * locks, the ->lock() interface may return asynchronously, before the lock has
 * been granted or denied by the underlying filesystem, if (and only if)
 * lm_grant is set. Additionally, the EXPORT_OP_ASYNC_LOCK flag in
 * export_operations needs to be set.
 *
 * Callers expecting ->lock() to return asynchronously will only use F_SETLK,
 * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a
 * blocking lock. When ->lock() does return asynchronously, it must return
 * FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock request completes.
 * If the request is for a non-blocking lock, the file system should return
 * FILE_LOCK_DEFERRED, then try to get the lock and call the callback routine
 * with the result. If the request timed out, the callback routine will return
 * a nonzero return code and the file system should release the lock. The file
 * system is also responsible for keeping a corresponding posix lock when it
 * grants a lock, so the VFS can find out which locks are locally held and do
 * the correct lock cleanup when required.
 * The underlying filesystem must not drop the kernel lock or call
 * ->lm_grant() before returning to the caller with a FILE_LOCK_DEFERRED
 * return code.
 */
int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl,
                  struct file_lock *conf)
{
        WARN_ON_ONCE(filp != fl->c.flc_file);

        if (!filp->f_op->lock)
                return posix_lock_file(filp, fl, conf);

        return filp->f_op->lock(filp, cmd, fl);
}
EXPORT_SYMBOL_GPL(vfs_lock_file);

/*
 * Apply @fl to @filp, sleeping and retrying while the request is deferred.
 *
 * The security hook is consulted once up front. vfs_lock_file() is then
 * called repeatedly for as long as it returns FILE_LOCK_DEFERRED; between
 * attempts the caller waits interruptibly until @fl has left its
 * blocked-member list. The request is removed from any block list before
 * returning.
 *
 * Returns the security hook's error, the final vfs_lock_file() result, or
 * the non-zero value from an interrupted wait.
 */
static int do_lock_file_wait(struct file *filp, unsigned int cmd,
                             struct file_lock *fl)
{
        int rc = security_file_lock(filp, fl->c.flc_type);

        if (rc)
                return rc;

        do {
                rc = vfs_lock_file(filp, cmd, fl, NULL);
                if (rc != FILE_LOCK_DEFERRED)
                        break;

                rc = wait_event_interruptible(fl->c.flc_wait,
                                              list_empty(&fl->c.flc_blocked_member));
        } while (!rc);

        locks_delete_block(fl);

        return rc;
}

/*
 * Ensure that the file behind @fl was opened with a mode compatible with the
 * requested lock type for F_SETLK calls: a read lock needs FMODE_READ and a
 * write lock needs FMODE_WRITE. Other lock types are not checked.
 *
 * Returns 0 when acceptable, -EBADF otherwise.
 */
static int
check_fmode_for_setlk(struct file_lock *fl)
{
        int type = fl->c.flc_type;

        if (type == F_RDLCK) {
                if (!(fl->c.flc_file->f_mode & FMODE_READ))
                        return -EBADF;
        } else if (type == F_WRLCK) {
                if (!(fl->c.flc_file->f_mode & FMODE_WRITE))
                        return -EBADF;
        }

        return 0;
}

/* Apply the lock described by @flock to an open file descriptor.
 * This implements both the F_SETLK and F_SETLKW commands of fcntl(),
 * as well as their F_OFD_* variants.
 *
 * Returns -ENOLCK if no lock structure can be allocated, -EINVAL for an OFD
 * request with a non-zero l_pid, -EBADF if the file mode does not permit the
 * lock type or if @fd no longer refers to @filp once the lock has been
 * taken, or the result of the conversion / locking helpers otherwise.
 */
int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
                struct flock *flock)
{
        struct file_lock *file_lock = locks_alloc_lock();
        struct inode *inode = file_inode(filp);
        struct file *f;
        int error;

        if (file_lock == NULL)
                return -ENOLCK;

        error = flock_to_posix_lock(filp, file_lock, flock);
        if (error)
                goto out;

        error = check_fmode_for_setlk(file_lock);
        if (error)
                goto out;

        /*
         * If the cmd is requesting file-private locks, then set the
         * FL_OFDLCK flag and override the owner.
         */
        switch (cmd) {
        case F_OFD_SETLK:
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                cmd = F_SETLK;
                file_lock->c.flc_flags |= FL_OFDLCK;
                file_lock->c.flc_owner = filp;
                break;
        case F_OFD_SETLKW:
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                cmd = F_SETLKW;
                file_lock->c.flc_flags |= FL_OFDLCK;
                file_lock->c.flc_owner = filp;
                /* Blocking OFD requests also need FL_SLEEP, set below. */
                fallthrough;
        case F_SETLKW:
                file_lock->c.flc_flags |= FL_SLEEP;
        }

        error = do_lock_file_wait(filp, cmd, file_lock);

        /*
         * Attempt to detect a close/fcntl race and recover by releasing the
         * lock that was just acquired. There is no need to do that when we're
         * unlocking though, or for OFD locks.
         */
        if (!error && file_lock->c.flc_type != F_UNLCK &&
            !(file_lock->c.flc_flags & FL_OFDLCK)) {
                struct files_struct *files = current->files;
                /*
                 * We need that spin_lock here - it prevents reordering between
                 * update of i_flctx->flc_posix and check for it done in
                 * close(). rcu_read_lock() wouldn't do.
                 */
                spin_lock(&files->file_lock);
                f = files_lookup_fd_locked(files, fd);
                spin_unlock(&files->file_lock);
                if (f != filp) {
                        /* fd was closed or replaced meanwhile: undo the lock. */
                        file_lock->c.flc_type = F_UNLCK;
                        error = do_lock_file_wait(filp, cmd, file_lock);
                        WARN_ON_ONCE(error);
                        error = -EBADF;
                }
        }
out:
        trace_fcntl_setlk(inode, file_lock, error);
        locks_free_lock(file_lock);
        return error;
}

#if BITS_PER_LONG == 32
/* Report the first existing lock that would conflict with @flock.
 * This implements the F_GETLK command of fcntl() (and F_OFD_GETLK) for the
 * struct flock64 interface on 32-bit builds.
 *
 * On success @flock->l_type is overwritten with the type found by
 * vfs_test_lock(); when that type is not F_UNLCK the remaining fields are
 * filled in from the conflicting lock as well.
 *
 * Returns -ENOMEM if no lock can be allocated, -EINVAL for a plain F_GETLK
 * whose l_type is neither F_RDLCK nor F_WRLCK or for an F_OFD_GETLK with a
 * non-zero l_pid, or any error from the conversion and test helpers.
 */
int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock)
{
        struct file_lock *req = locks_alloc_lock();
        int rc;

        if (!req)
                return -ENOMEM;

        rc = -EINVAL;
        if (cmd != F_OFD_GETLK &&
            flock->l_type != F_RDLCK && flock->l_type != F_WRLCK)
                goto out;

        rc = flock64_to_posix_lock(filp, req, flock);
        if (rc)
                goto out;

        if (cmd == F_OFD_GETLK) {
                if (flock->l_pid != 0) {
                        rc = -EINVAL;
                        goto out;
                }

                /* OFD locks are owned by the open file description. */
                req->c.flc_flags |= FL_OFDLCK;
                req->c.flc_owner = filp;
        }

        rc = vfs_test_lock(filp, req);
        if (rc)
                goto out;

        flock->l_type = req->c.flc_type;
        if (req->c.flc_type != F_UNLCK)
                posix_lock_to_flock64(flock, req);

out:
        locks_free_lock(req);
        return rc;
}

/* Apply the lock described by @flock to an open file descriptor.
 * This implements both the F_SETLK64 and F_SETLKW64 commands of fcntl(),
 * as well as the F_OFD_* variants, for the struct flock64 interface.
 *
 * Returns -ENOLCK if no lock structure can be allocated, -EINVAL for an OFD
 * request with a non-zero l_pid, -EBADF if the file mode does not permit the
 * lock type or if @fd no longer refers to @filp once the lock has been
 * taken, or the result of the conversion / locking helpers otherwise.
 */
int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
                struct flock64 *flock)
{
        struct file_lock *file_lock = locks_alloc_lock();
        struct file *f;
        int error;

        if (file_lock == NULL)
                return -ENOLCK;

        error = flock64_to_posix_lock(filp, file_lock, flock);
        if (error)
                goto out;

        error = check_fmode_for_setlk(file_lock);
        if (error)
                goto out;

        /*
         * If the cmd is requesting file-private locks, then set the
         * FL_OFDLCK flag and override the owner.
         */
        switch (cmd) {
        case F_OFD_SETLK:
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                cmd = F_SETLK64;
                file_lock->c.flc_flags |= FL_OFDLCK;
                file_lock->c.flc_owner = filp;
                break;
        case F_OFD_SETLKW:
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                cmd = F_SETLKW64;
                file_lock->c.flc_flags |= FL_OFDLCK;
                file_lock->c.flc_owner = filp;
                /* Blocking OFD requests also need FL_SLEEP, set below. */
                fallthrough;
        case F_SETLKW64:
                file_lock->c.flc_flags |= FL_SLEEP;
        }

        error = do_lock_file_wait(filp, cmd, file_lock);

        /*
         * Attempt to detect a close/fcntl race and recover by releasing the
         * lock that was just acquired. There is no need to do that when we're
         * unlocking though, or for OFD locks.
         */
        if (!error && file_lock->c.flc_type != F_UNLCK &&
            !(file_lock->c.flc_flags & FL_OFDLCK)) {
                struct files_struct *files = current->files;
                /*
                 * We need that spin_lock here - it prevents reordering between
                 * update of i_flctx->flc_posix and check for it done in
                 * close(). rcu_read_lock() wouldn't do.
                 */
                spin_lock(&files->file_lock);
                f = files_lookup_fd_locked(files, fd);
                spin_unlock(&files->file_lock);
                if (f != filp) {
                        /* fd was closed or replaced meanwhile: undo the lock. */
                        file_lock->c.flc_type = F_UNLCK;
                        error = do_lock_file_wait(filp, cmd, file_lock);
                        WARN_ON_ONCE(error);
                        error = -EBADF;
                }
        }
out:
        locks_free_lock(file_lock);
        return error;
}
#endif /* BITS_PER_LONG == 32 */

/*
 * This function is called when the file is being removed
 * from the task's fd array.  POSIX locks belonging to this task
 * are deleted at this time.
 *
 * An on-stack F_UNLCK request covering the whole file (0..OFFSET_MAX) is
 * built for @owner and submitted through vfs_lock_file(); its result is only
 * reported to the tracepoint.
 */
void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
        struct inode *inode = file_inode(filp);
        struct file_lock_context *flctx = locks_inode_context(inode);
        struct file_lock unlock;
        int rc;

        /*
         * If there are no locks held on this file, we don't need to call
         * posix_lock_file().  Another process could be setting a lock on this
         * file at the same time, but we wouldn't remove that lock anyway.
         */
        if (!flctx)
                return;
        if (list_empty(&flctx->flc_posix))
                return;

        locks_init_lock(&unlock);
        unlock.c.flc_type = F_UNLCK;
        unlock.c.flc_flags = FL_POSIX | FL_CLOSE;
        unlock.c.flc_owner = owner;
        unlock.c.flc_pid = current->tgid;
        unlock.c.flc_file = filp;
        unlock.fl_start = 0;
        unlock.fl_end = OFFSET_MAX;
        unlock.fl_ops = NULL;
        unlock.fl_lmops = NULL;

        rc = vfs_lock_file(filp, F_SETLK, &unlock, NULL);

        /* The filesystem may have attached private state to the request. */
        if (unlock.fl_ops && unlock.fl_ops->fl_release_private)
                unlock.fl_ops->fl_release_private(&unlock);

        trace_locks_remove_posix(inode, &unlock, rc);
}
EXPORT_SYMBOL(locks_remove_posix);

/* The i_flctx must be valid when calling into here */
static void
locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
{
        struct file_lock fl;
        struct inode *inode = file_inode(filp);

        if (list_empty(&flctx->flc_flock))
                return;

        flock_make_lock(filp, &fl, F_UNLCK);
        fl.c.flc_flags |= FL_CLOSE;

        if (filp->f_op->flock)
                filp->f_op->flock(filp, F_SETLKW, &fl);
        else
                flock_lock_inode(inode, &fl);

        if (fl.fl_ops && fl.fl_ops->fl_release_private)
                fl.fl_ops->fl_release_private(&fl);
}

/* The i_flctx must be valid when calling into here */
static void
locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
{
        struct file_lease *fl, *tmp;
        LIST_HEAD(dispose);

        if (list_empty(&ctx->flc_lease))
                return;

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list)
                if (filp == fl->c.flc_file)
                        lease_modify(fl, F_UNLCK, &dispose);
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);

        locks_dispose_list(&dispose);
}

/*
 * This function is called on the last close of an open file.
 *
 * Removes, in order, the OFD locks, flock locks and leases associated with
 * @filp, then runs locks_check_ctx_file_list() over each of the three lists
 * under flc_lock. Does nothing when the inode has no lock context.
 */
void locks_remove_file(struct file *filp)
{
        struct file_lock_context *flctx = locks_inode_context(file_inode(filp));

        if (!flctx)
                return;

        /* remove any OFD locks */
        locks_remove_posix(filp, filp);

        /* remove flock locks */
        locks_remove_flock(filp, flctx);

        /* remove any leases */
        locks_remove_lease(filp, flctx);

        spin_lock(&flctx->flc_lock);
        locks_check_ctx_file_list(filp, &flctx->flc_posix, "POSIX");
        locks_check_ctx_file_list(filp, &flctx->flc_flock, "FLOCK");
        locks_check_ctx_file_list(filp, &flctx->flc_lease, "LEASE");
        spin_unlock(&flctx->flc_lock);
}

/**
 * vfs_cancel_lock - file byte range unblock lock
 * @filp: The file to apply the unblock to
 * @fl: The lock to be unblocked
 *
 * Used by lock managers to cancel blocked requests.
 *
 * Forwards an F_CANCELLK request to the filesystem's ->lock operation and
 * returns its result; returns 0 when the filesystem has no ->lock operation.
 */
int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
        WARN_ON_ONCE(filp != fl->c.flc_file);

        if (!filp->f_op->lock)
                return 0;

        return filp->f_op->lock(filp, F_CANCELLK, fl);
}
EXPORT_SYMBOL_GPL(vfs_cancel_lock);

/**
 * vfs_inode_has_locks - are any file locks held on @inode?
 * @inode: inode to check for locks
 *
 * Return true if there are any FL_POSIX or FL_FLOCK locks currently
 * set on @inode. An inode without a lock context has none. Leases are
 * not considered.
 */
bool vfs_inode_has_locks(struct inode *inode)
{
        struct file_lock_context *flctx = locks_inode_context(inode);
        bool has_locks;

        if (!flctx)
                return false;

        spin_lock(&flctx->flc_lock);
        has_locks = !(list_empty(&flctx->flc_posix) &&
                      list_empty(&flctx->flc_flock));
        spin_unlock(&flctx->flc_lock);

        return has_locks;
}
EXPORT_SYMBOL_GPL(vfs_inode_has_locks);

#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

/* Per-open iteration state for the seq_file lock listing (f->private). */
struct locks_iterator {
        /* CPU cursor handed to seq_hlist_start_percpu()/seq_hlist_next_percpu() */
        int        li_cpu;
        /* 1-based index of the current entry; printed as the lock's id */
        loff_t        li_pos;
};

/*
 * Emit one line describing @flc to the seq_file @f.
 *
 * @f:      seq_file being written
 * @flc:    core of the lock or lease to describe
 * @id:     number printed at the start of the line
 * @pfx:    prefix string printed before the lock class when @repeat != 0
 * @repeat: nesting level; widens the field in which @pfx is right-aligned
 *
 * The line consists of the id, the optional prefix, the lock class
 * (POSIX/ACCESS/OFDLCK, FLOCK, LEASE/DELEG or UNKNOWN), the access type,
 * the owner pid translated into the procfs mount's pid namespace, the
 * device and inode numbers (or "<none>") and the byte range. Only POSIX
 * locks report a real range; everything else is shown as "0 EOF".
 */
static void lock_get_status(struct seq_file *f, struct file_lock_core *flc,
                            loff_t id, char *pfx, int repeat)
{
        struct inode *inode = NULL;
        pid_t pid;
        struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
        int type = flc->flc_type;
        struct file_lock *fl = file_lock(flc);

        /*
         * If lock owner is dead (and pid is freed) or not visible in current
         * pidns, zero is shown as a pid value. Check lock info from
         * init_pid_ns to get saved lock pid value.
         *
         * The translated value may be negative (-1 for OFD locks, negative
         * for remote locks), so it is kept in a signed pid_t to match the
         * "%d" conversions below.
         */
        pid = locks_translate_pid(flc, proc_pidns);

        if (flc->flc_file != NULL)
                inode = file_inode(flc->flc_file);

        seq_printf(f, "%lld: ", id);

        if (repeat)
                seq_printf(f, "%*s", repeat - 1 + (int)strlen(pfx), pfx);

        if (flc->flc_flags & FL_POSIX) {
                if (flc->flc_flags & FL_ACCESS)
                        seq_puts(f, "ACCESS");
                else if (flc->flc_flags & FL_OFDLCK)
                        seq_puts(f, "OFDLCK");
                else
                        seq_puts(f, "POSIX ");

                seq_printf(f, " %s ",
                             (inode == NULL) ? "*NOINODE*" : "ADVISORY ");
        } else if (flc->flc_flags & FL_FLOCK) {
                seq_puts(f, "FLOCK  ADVISORY  ");
        } else if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) {
                struct file_lease *lease = file_lease(flc);

                /* For leases, show the type reported by target_leasetype(). */
                type = target_leasetype(lease);

                if (flc->flc_flags & FL_DELEG)
                        seq_puts(f, "DELEG  ");
                else
                        seq_puts(f, "LEASE  ");

                if (lease_breaking(lease))
                        seq_puts(f, "BREAKING  ");
                else if (flc->flc_file)
                        seq_puts(f, "ACTIVE    ");
                else
                        seq_puts(f, "BREAKER   ");
        } else {
                seq_puts(f, "UNKNOWN UNKNOWN  ");
        }

        seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
                             (type == F_RDLCK) ? "READ" : "UNLCK");
        if (inode) {
                /* userspace relies on this representation of dev_t */
                seq_printf(f, "%d %02x:%02x:%lu ", pid,
                                MAJOR(inode->i_sb->s_dev),
                                MINOR(inode->i_sb->s_dev), inode->i_ino);
        } else {
                seq_printf(f, "%d <none>:0 ", pid);
        }
        if (flc->flc_flags & FL_POSIX) {
                /* @fl is only meaningful (and only dereferenced) for POSIX locks. */
                if (fl->fl_end == OFFSET_MAX)
                        seq_printf(f, "%lld EOF\n", fl->fl_start);
                else
                        seq_printf(f, "%lld %lld\n", fl->fl_start, fl->fl_end);
        } else {
                seq_puts(f, "0 EOF\n");
        }
}

/*
 * Return the request that follows @node on its blocker's list of blocked
 * requests, or NULL when @node is NULL, has no blocker (root node), or is the
 * last request on that list.
 */
static struct file_lock_core *get_next_blocked_member(struct file_lock_core *node)
{
        struct file_lock_core *next;

        /* NULL node or root node */
        if (!node || !node->flc_blocker)
                return NULL;

        next = list_next_entry(node, flc_blocked_member);

        /* Next member in the linked list could be itself */
        if (next == node)
                return NULL;

        /* Wrapped around to the blocker's list head: no more siblings. */
        if (list_entry_is_head(next, &node->flc_blocker->flc_blocked_requests,
                               flc_blocked_member))
                return NULL;

        return next;
}

/*
 * Print the lock that the iterator element @v refers to, followed by every
 * request blocked on it (directly or transitively), each via
 * lock_get_status(). Blocked requests are prefixed with "-> " and their
 * nesting depth is passed as the repeat argument.
 *
 * Nothing is printed for a lock whose owner pid translates to 0 in the pid
 * namespace of the procfs mount being read. Always returns 0.
 */
static int locks_show(struct seq_file *f, void *v)
{
        struct locks_iterator *iter = f->private;
        struct file_lock_core *cur, *tmp;
        struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
        int level = 0;

        cur = hlist_entry(v, struct file_lock_core, flc_link);

        if (locks_translate_pid(cur, proc_pidns) == 0)
                return 0;

        /* View this cross-linked list as a binary tree: the first member of
         * flc_blocked_requests is the left child of the current node, the next
         * sibling in flc_blocked_member is the right child, and the parent of
         * the current node is reachable through flc_blocker. Printing the
         * whole structure thus becomes a traversal of a binary tree.
         */
        while (cur != NULL) {
                if (level)
                        lock_get_status(f, cur, iter->li_pos, "-> ", level);
                else
                        lock_get_status(f, cur, iter->li_pos, "", level);

                if (!list_empty(&cur->flc_blocked_requests)) {
                        /* Turn left */
                        cur = list_first_entry_or_null(&cur->flc_blocked_requests,
                                                       struct file_lock_core,
                                                       flc_blocked_member);
                        level++;
                } else {
                        /* Turn right */
                        tmp = get_next_blocked_member(cur);
                        /* Fall back to parent node */
                        while (tmp == NULL && cur->flc_blocker != NULL) {
                                cur = cur->flc_blocker;
                                level--;
                                tmp = get_next_blocked_member(cur);
                        }
                        /* tmp is NULL once the walk has climbed back to the root. */
                        cur = tmp;
                }
        }

        return 0;
}

/*
 * Print a "lock:\t" line for every entry on @head that belongs to @filp and
 * whose owner is either @files or @filp itself.  *@id is incremented before
 * each printed entry and passed to lock_get_status() as its number.
 */
static void __show_fd_locks(struct seq_file *f,
                        struct list_head *head, int *id,
                        struct file *filp, struct files_struct *files)
{
        struct file_lock_core *flc;

        list_for_each_entry(flc, head, flc_list) {
                /* Only locks taken on this file by one of the two owners. */
                if (flc->flc_file == filp &&
                    (flc->flc_owner == files || flc->flc_owner == filp)) {
                        ++*id;
                        seq_puts(f, "lock:\t");
                        lock_get_status(f, flc, *id, "", 0);
                }
        }
}

/*
 * Print the flock, posix and lease entries (in that order) that @filp holds
 * on its inode, restricted to the owners @files and @filp.  Does nothing when
 * the inode has no lock context.  The lists are walked under ctx->flc_lock.
 */
void show_fd_locks(struct seq_file *f,
                  struct file *filp, struct files_struct *files)
{
        struct file_lock_context *ctx = locks_inode_context(file_inode(filp));
        int id = 0;        /* running entry number shared by all three lists */

        if (!ctx)
                return;

        spin_lock(&ctx->flc_lock);
        __show_fd_locks(f, &ctx->flc_flock, &id, filp, files);
        __show_fd_locks(f, &ctx->flc_posix, &id, filp, files);
        __show_fd_locks(f, &ctx->flc_lease, &id, filp, files);
        spin_unlock(&ctx->flc_lock);
}

/*
 * seq_file ->start callback: record the 1-based position of the entry about
 * to be shown, take file_rwsem for writing and then blocked_lock_lock, and
 * return the element at offset *@pos of the per-cpu file_lock_list.  Both
 * locks are dropped again in locks_stop().
 */
static void *locks_start(struct seq_file *f, loff_t *pos)
        __acquires(&blocked_lock_lock)
{
        struct locks_iterator *iter = f->private;
        loff_t index = *pos;

        iter->li_pos = index + 1;

        percpu_down_write(&file_rwsem);
        spin_lock(&blocked_lock_lock);

        return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, index);
}

/*
 * seq_file ->next callback: bump the displayed position counter and step to
 * the element after @v in the per-cpu file_lock_list.
 */
static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
{
        struct locks_iterator *iter = f->private;

        iter->li_pos++;

        return seq_hlist_next_percpu(v, &file_lock_list.hlist, &iter->li_cpu, pos);
}

/*
 * seq_file ->stop callback: release the locks taken in locks_start(), in the
 * reverse order of acquisition.  @f and @v are not used.
 */
static void locks_stop(struct seq_file *f, void *v)
        __releases(&blocked_lock_lock)
{
        spin_unlock(&blocked_lock_lock);
        percpu_up_write(&file_rwsem);
}

/* seq_file iterator callbacks registered for the "locks" proc entry. */
static const struct seq_operations locks_seq_operations = {
        .start = locks_start,   /* takes file_rwsem and blocked_lock_lock */
        .stop  = locks_stop,    /* releases them again */
        .next  = locks_next,
        .show  = locks_show,
};

/*
 * Register the "locks" proc entry (NULL parent, mode 0) backed by
 * locks_seq_operations, with a struct locks_iterator as per-open private
 * data.  The return value of proc_create_seq_private() is not checked;
 * this initcall always returns 0.
 */
static int __init proc_locks_init(void)
{
        proc_create_seq_private("locks", 0, NULL, &locks_seq_operations,
                        sizeof(struct locks_iterator), NULL);
        return 0;
}
fs_initcall(proc_locks_init);
#endif

/*
 * Boot-time setup for the file locking code: create the slab caches for
 * lock contexts, file locks and file leases, initialise the lock and hlist
 * head of every possible CPU's file_lock_list entry, and call
 * lease_notifier_chain_init().
 *
 * All caches are created with SLAB_PANIC, so creation failure is not handled
 * here.  Always returns 0.
 */
static int __init filelock_init(void)
{
        int i;

        flctx_cache = kmem_cache_create("file_lock_ctx",
                        sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);

        filelock_cache = kmem_cache_create("file_lock_cache",
                        sizeof(struct file_lock), 0, SLAB_PANIC, NULL);

        /*
         * The lease cache needs a name of its own.  Reusing
         * "file_lock_cache" makes the two caches indistinguishable in slab
         * statistics and can clash in interfaces that key caches by name.
         */
        filelease_cache = kmem_cache_create("file_lease_cache",
                        sizeof(struct file_lease), 0, SLAB_PANIC, NULL);

        for_each_possible_cpu(i) {
                struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);

                spin_lock_init(&fll->lock);
                INIT_HLIST_HEAD(&fll->hlist);
        }

        lease_notifier_chain_init();
        return 0;
}
core_initcall(filelock_init);






















































    1 






































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* SCTP kernel reference Implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 Nokia, Inc.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 *
 * This file is part of the SCTP kernel reference Implementation
 *
 * Various protocol defined structures.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Or submit a bug report through the following website:
 *    http://www.sf.net/projects/lksctp
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson <karl@athena.chicago.il.us>
 *    Jon Grimm <jgrimm@us.ibm.com>
 *    Xingang Guo <xingang.guo@intel.com>
 *    randall@sctp.chicago.il.us
 *    kmorneau@cisco.com
 *    qxie1@email.mot.com
 *    Sridhar Samudrala <sri@us.ibm.com>
 *    Kevin Gao <kevin.gao@intel.com>
 *
 * Any bugs reported given to us we will try to fix... any fixes shared will
 * be incorporated into the next SCTP release.
 */
#ifndef __LINUX_SCTP_H__
#define __LINUX_SCTP_H__

#include <linux/in.h>                /* We need in_addr.  */
#include <linux/in6.h>                /* We need in6_addr.  */
#include <linux/skbuff.h>

#include <uapi/linux/sctp.h>

/* Section 3.1.  SCTP Common Header Format */
/* Fixed 12-byte header that starts every SCTP packet.  Note that checksum
 * is typed __le32 while the remaining fields are big-endian.
 */
struct sctphdr {
        __be16 source;
        __be16 dest;
        __be32 vtag;
        __le32 checksum;
};

/* Return the transport header of @skb viewed as an SCTP common header.
 * No check is made here that the buffer actually carries SCTP.
 */
static inline struct sctphdr *sctp_hdr(const struct sk_buff *skb)
{
        void *transport = skb_transport_header(skb);

        return (struct sctphdr *)transport;
}

/* Section 3.2.  Chunk Field Descriptions. */
/* Four-byte header that starts every chunk: an 8-bit type (see enum
 * sctp_cid), 8 bits of chunk-specific flags and a big-endian 16-bit length.
 */
struct sctp_chunkhdr {
        __u8 type;
        __u8 flags;
        __be16 length;
};


/* Section 3.2.  Chunk Type Values.
 * [Chunk Type] identifies the type of information contained in the Chunk
 * Value field. It takes a value from 0 to 254. The value of 255 is
 * reserved for future use as an extension field.
 */
/* Chunk type identifiers, i.e. the values carried in sctp_chunkhdr.type.
 * The two highest bits of each value select one of the SCTP_CID_ACTION_*
 * classes (mask 0xc0) defined further down in this header.
 */
enum sctp_cid {
        SCTP_CID_DATA                        = 0,
        SCTP_CID_INIT                        = 1,
        SCTP_CID_INIT_ACK                = 2,
        SCTP_CID_SACK                        = 3,
        SCTP_CID_HEARTBEAT                = 4,
        SCTP_CID_HEARTBEAT_ACK                = 5,
        SCTP_CID_ABORT                        = 6,
        SCTP_CID_SHUTDOWN                = 7,
        SCTP_CID_SHUTDOWN_ACK                = 8,
        SCTP_CID_ERROR                        = 9,
        SCTP_CID_COOKIE_ECHO                = 10,
        SCTP_CID_COOKIE_ACK                = 11,
        SCTP_CID_ECN_ECNE                = 12,
        SCTP_CID_ECN_CWR                = 13,
        SCTP_CID_SHUTDOWN_COMPLETE        = 14,

        /* AUTH Extension Section 4.1 */
        SCTP_CID_AUTH                        = 0x0F,

        /* sctp ndata 5.1. I-DATA */
        SCTP_CID_I_DATA                        = 0x40,

        /* PR-SCTP Sec 3.2 */
        SCTP_CID_FWD_TSN                = 0xC0,

        /* Use hex, as defined in ADDIP sec. 3.1 */
        SCTP_CID_ASCONF                        = 0xC1,
        SCTP_CID_I_FWD_TSN                = 0xC2,
        SCTP_CID_ASCONF_ACK                = 0x80,
        SCTP_CID_RECONF                        = 0x82,
        SCTP_CID_PAD                        = 0x84,
}; /* enum */


/* Section 3.2
 *  Chunk Types are encoded such that the highest-order two bits specify
 *  the action that must be taken if the processing endpoint does not
 *  recognize the Chunk Type.
 */
/* The four possible values of the top two bits of a chunk type
 * (type & SCTP_CID_ACTION_MASK).
 */
enum {
        SCTP_CID_ACTION_DISCARD     = 0x00,
        SCTP_CID_ACTION_DISCARD_ERR = 0x40,
        SCTP_CID_ACTION_SKIP        = 0x80,
        SCTP_CID_ACTION_SKIP_ERR    = 0xc0,
};

/* Mask selecting the two highest bits of an 8-bit chunk type. */
enum { SCTP_CID_ACTION_MASK = 0xc0, };

/* This flag is used in Chunk Flags for ABORT and SHUTDOWN COMPLETE.
 *
 * 3.3.7 Abort Association (ABORT) (6):
 *    The T bit is set to 0 if the sender had a TCB that it destroyed.
 *    If the sender did not have a TCB it should set this bit to 1.
 */
/* Bit 0 of the chunk flags byte; tested by sctp_test_T_bit(). */
enum { SCTP_CHUNK_FLAG_T = 0x01 };

/*
 *  Set the T bit
 *
 *      0                   1                   2                   3
 *      0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |   Type = 14   |Reserved     |T|      Length = 4               |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * Chunk Flags: 8 bits
 *
 *   Reserved:  7 bits
 *     Set to 0 on transmit and ignored on receipt.
 *
 *   T bit:  1 bit
 *     The T bit is set to 0 if the sender had a TCB that it destroyed. If
 *     the sender did NOT have a TCB it should set this bit to 1.
 *
 * Note: Special rules apply to this chunk for verification, please
 * see Section 8.5.1 for details.
 */

/* Non-zero when the T flag is set in the chunk header that c->chunk_hdr
 * points to.  c must be a pointer to an object with a chunk_hdr pointer member.
 */
#define sctp_test_T_bit(c)    ((c)->chunk_hdr->flags & SCTP_CHUNK_FLAG_T)

/* RFC 2960
 * Section 3.2.1 Optional/Variable-length Parameter Format.
 */

/* Type/length header that starts every parameter; both fields are
 * big-endian 16-bit values.  Type values are listed in enum sctp_param.
 */
struct sctp_paramhdr {
        __be16 type;
        __be16 length;
};

/* Parameter type codes.  Every constant is wrapped in cpu_to_be16(), i.e. it
 * is stored already converted to big-endian -- presumably so it can be
 * compared directly with the __be16 type field of struct sctp_paramhdr.
 */
enum sctp_param {

        /* RFC 2960 Section 3.3.5 */
        SCTP_PARAM_HEARTBEAT_INFO                = cpu_to_be16(1),
        /* RFC 2960 Section 3.3.2.1 */
        SCTP_PARAM_IPV4_ADDRESS                        = cpu_to_be16(5),
        SCTP_PARAM_IPV6_ADDRESS                        = cpu_to_be16(6),
        SCTP_PARAM_STATE_COOKIE                        = cpu_to_be16(7),
        SCTP_PARAM_UNRECOGNIZED_PARAMETERS        = cpu_to_be16(8),
        SCTP_PARAM_COOKIE_PRESERVATIVE                = cpu_to_be16(9),
        SCTP_PARAM_HOST_NAME_ADDRESS                = cpu_to_be16(11),
        SCTP_PARAM_SUPPORTED_ADDRESS_TYPES        = cpu_to_be16(12),
        SCTP_PARAM_ECN_CAPABLE                        = cpu_to_be16(0x8000),

        /* AUTH Extension Section 3 */
        SCTP_PARAM_RANDOM                        = cpu_to_be16(0x8002),
        SCTP_PARAM_CHUNKS                        = cpu_to_be16(0x8003),
        SCTP_PARAM_HMAC_ALGO                        = cpu_to_be16(0x8004),

        /* Add-IP: Supported Extensions, Section 4.2 */
        SCTP_PARAM_SUPPORTED_EXT        = cpu_to_be16(0x8008),

        /* PR-SCTP Sec 3.1 */
        SCTP_PARAM_FWD_TSN_SUPPORT        = cpu_to_be16(0xc000),

        /* Add-IP Extension. Section 3.2 */
        SCTP_PARAM_ADD_IP                = cpu_to_be16(0xc001),
        SCTP_PARAM_DEL_IP                = cpu_to_be16(0xc002),
        SCTP_PARAM_ERR_CAUSE                = cpu_to_be16(0xc003),
        SCTP_PARAM_SET_PRIMARY                = cpu_to_be16(0xc004),
        SCTP_PARAM_SUCCESS_REPORT        = cpu_to_be16(0xc005),
        SCTP_PARAM_ADAPTATION_LAYER_IND = cpu_to_be16(0xc006),

        /* RE-CONFIG. Section 4 */
        SCTP_PARAM_RESET_OUT_REQUEST                = cpu_to_be16(0x000d),
        SCTP_PARAM_RESET_IN_REQUEST                = cpu_to_be16(0x000e),
        SCTP_PARAM_RESET_TSN_REQUEST                = cpu_to_be16(0x000f),
        SCTP_PARAM_RESET_RESPONSE                = cpu_to_be16(0x0010),
        SCTP_PARAM_RESET_ADD_OUT_STREAMS        = cpu_to_be16(0x0011),
        SCTP_PARAM_RESET_ADD_IN_STREAMS                = cpu_to_be16(0x0012),
}; /* enum */


/* RFC 2960 Section 3.2.1
 *  The Parameter Types are encoded such that the highest-order two bits
 *  specify the action that must be taken if the processing endpoint does
 *  not recognize the Parameter Type.
 *
 */
/* The four possible values of the top two bits of a parameter type, stored
 * big-endian like the SCTP_PARAM_* codes themselves.
 */
enum {
        SCTP_PARAM_ACTION_DISCARD     = cpu_to_be16(0x0000),
        SCTP_PARAM_ACTION_DISCARD_ERR = cpu_to_be16(0x4000),
        SCTP_PARAM_ACTION_SKIP        = cpu_to_be16(0x8000),
        SCTP_PARAM_ACTION_SKIP_ERR    = cpu_to_be16(0xc000),
};

/* Big-endian mask selecting the two highest bits of a 16-bit parameter type. */
enum { SCTP_PARAM_ACTION_MASK = cpu_to_be16(0xc000), };

/* RFC 2960 Section 3.3.1 Payload Data (DATA) (0) */

/* Fixed part of a DATA chunk that follows the chunk header.  ppid is the
 * only field not typed as big-endian (plain __u32).  The payload follows the
 * struct and is not declared as a member.
 */
struct sctp_datahdr {
        __be32 tsn;
        __be16 stream;
        __be16 ssn;
        __u32 ppid;
        /* __u8  payload[]; */
};

/* A DATA chunk up to the payload: chunk header followed by the DATA header. */
struct sctp_data_chunk {
        struct sctp_chunkhdr chunk_hdr;
        struct sctp_datahdr data_hdr;
};

/* Fixed part of an I-DATA chunk (SCTP_CID_I_DATA) that follows the chunk
 * header.
 *
 * ppid and fsn overlay the same 32 bits; which one is meaningful depends on
 * the fragment -- presumably ppid on the first fragment and fsn on the
 * others, TODO confirm against the I-DATA specification.
 */
struct sctp_idatahdr {
        __be32 tsn;
        __be16 stream;
        __be16 reserved;
        __be32 mid;
        union {
                __u32 ppid;
                __be32 fsn;
        };
        /* Start of the payload.  Declared as a C99 flexible array member
         * (not the deprecated zero-length form), as the other variable-length
         * structs in this header do; sizeof() of the struct is unchanged.
         */
        __u8 payload[];
};

/* An I-DATA chunk: chunk header followed by the I-DATA header. */
struct sctp_idata_chunk {
        struct sctp_chunkhdr chunk_hdr;
        struct sctp_idatahdr data_hdr;
};

/* DATA Chunk Specific Flags */
/* The low two bits (SCTP_DATA_FRAG_MASK) encode the fragment position:
 * NOT_FRAG (0x03) is FIRST_FRAG | LAST_FRAG and MIDDLE_FRAG is neither.
 * UNORDERED and SACK_IMM are independent single-bit flags.
 */
enum {
        SCTP_DATA_MIDDLE_FRAG        = 0x00,
        SCTP_DATA_LAST_FRAG        = 0x01,
        SCTP_DATA_FIRST_FRAG        = 0x02,
        SCTP_DATA_NOT_FRAG        = 0x03,
        SCTP_DATA_UNORDERED        = 0x04,
        SCTP_DATA_SACK_IMM        = 0x08,
};
/* Mask covering the two fragment-position bits of the DATA chunk flags. */
enum { SCTP_DATA_FRAG_MASK = 0x03, };


/* RFC 2960 Section 3.3.2 Initiation (INIT) (1)
 *
 *  This chunk is used to initiate a SCTP association between two
 *  endpoints.
 */
/* Fixed part of an INIT chunk after the chunk header; the same layout is
 * reused by struct sctp_initack_chunk.  All fields are big-endian; the
 * parameters follow the struct and are not declared as a member.
 */
struct sctp_inithdr {
        __be32 init_tag;
        __be32 a_rwnd;
        __be16 num_outbound_streams;
        __be16 num_inbound_streams;
        __be32 initial_tsn;
        /* __u8  params[]; */
};

/* An INIT chunk up to its parameters: chunk header plus INIT header. */
struct sctp_init_chunk {
        struct sctp_chunkhdr chunk_hdr;
        struct sctp_inithdr init_hdr;
};


/* Section 3.3.2.1. IPv4 Address Parameter (5) */
/* Parameter header followed by one IPv4 address (struct in_addr). */
struct sctp_ipv4addr_param {
        struct sctp_paramhdr param_hdr;
        struct in_addr addr;
};

/* Section 3.3.2.1. IPv6 Address Parameter (6) */
/* Parameter header followed by one IPv6 address (struct in6_addr). */
struct sctp_ipv6addr_param {
        struct sctp_paramhdr param_hdr;
        struct in6_addr addr;
};

/* Section 3.3.2.1 Cookie Preservative (9) */
/* Parameter header followed by a big-endian 32-bit lifespan increment. */
struct sctp_cookie_preserve_param {
        struct sctp_paramhdr param_hdr;
        __be32 lifespan_increment;
};

/* Section 3.3.2.1 Host Name Address (11) */
/* Parameter header followed by a variable number of host name bytes. */
struct sctp_hostname_param {
        struct sctp_paramhdr param_hdr;
        uint8_t hostname[];
};

/* Section 3.3.2.1 Supported Address Types (12) */
/* Parameter header followed by a variable-length list of big-endian
 * 16-bit address type codes.
 */
struct sctp_supported_addrs_param {
        struct sctp_paramhdr param_hdr;
        __be16 types[];
};

/* ADDIP Section 3.2.6 Adaptation Layer Indication */
/* Parameter header followed by a big-endian 32-bit adaptation indication. */
struct sctp_adaptation_ind_param {
        struct sctp_paramhdr param_hdr;
        __be32 adaptation_ind;
};

/* ADDIP Section 4.2.7 Supported Extensions Parameter */
/* Parameter header followed by a variable-length list of one-byte entries
 * (presumably chunk type values from enum sctp_cid -- confirm with users).
 */
struct sctp_supported_ext_param {
        struct sctp_paramhdr param_hdr;
        __u8 chunks[];
};

/* AUTH Section 3.1 Random */
/* Parameter header followed by a variable number of random bytes. */
struct sctp_random_param {
        struct sctp_paramhdr param_hdr;
        __u8 random_val[];
};

/* AUTH Section 3.2 Chunk List */
/* Parameter header followed by a variable-length list of one-byte entries
 * (presumably chunk type values -- confirm with users).
 */
struct sctp_chunks_param {
        struct sctp_paramhdr param_hdr;
        __u8 chunks[];
};

/* AUTH Section 3.3 HMAC Algorithm */
/* Parameter header followed by a variable-length list of big-endian
 * 16-bit HMAC identifiers.
 */
struct sctp_hmac_algo_param {
        struct sctp_paramhdr param_hdr;
        __be16 hmac_ids[];
};

/* RFC 2960.  Section 3.3.3 Initiation Acknowledgement (INIT ACK) (2):
 *   The INIT ACK chunk is used to acknowledge the initiation of an SCTP
 *   association.
 */
/* An INIT ACK chunk up to its parameters.  It has the same layout as
 * struct sctp_init_chunk: chunk header plus struct sctp_inithdr.
 */
struct sctp_initack_chunk {
        struct sctp_chunkhdr chunk_hdr;
        struct sctp_inithdr init_hdr;
};

/* Section 3.3.3.1 State Cookie (7) */
/* Parameter header (named p here, not param_hdr as in the other parameter
 * structs) followed by the variable-length cookie bytes.
 */
struct sctp_cookie_param {
        struct sctp_paramhdr p;
        __u8 body[];
};

/* Section 3.3.3.1 Unrecognized Parameters (8) */
/* Parameter header followed by the header of the parameter that was not
 * recognized.
 */
struct sctp_unrecognized_param {
        struct sctp_paramhdr param_hdr;
        struct sctp_paramhdr unrecognized;
};



/*
 * 3.3.4 Selective Acknowledgement (SACK) (3):
 *
 *  This chunk is sent to the peer endpoint to acknowledge received DATA
 *  chunks and to inform the peer endpoint of gaps in the received
 *  subsequences of DATA chunks as represented by their TSNs.
 */

/* One gap ack block: a pair of big-endian 16-bit start/end values. */
struct sctp_gap_ack_block {
        __be16 start;
        __be16 end;
};

/* One 32-bit entry of the variable part of a SACK: either a gap ack block
 * or a duplicate TSN.
 */
union sctp_sack_variable {
        struct sctp_gap_ack_block gab;
        __be32 dup;
};

/* Fixed part of a SACK chunk after the chunk header.  The two counters give
 * the number of gap ack blocks and duplicate TSNs; those entries follow the
 * struct and are not declared as a member.
 */
struct sctp_sackhdr {
        __be32 cum_tsn_ack;
        __be32 a_rwnd;
        __be16 num_gap_ack_blocks;
        __be16 num_dup_tsns;
        /* union sctp_sack_variable variable[]; */
};

/* A SACK chunk up to its variable part: chunk header plus SACK header. */
struct sctp_sack_chunk {
        struct sctp_chunkhdr chunk_hdr;
        struct sctp_sackhdr sack_hdr;
};


/* RFC 2960.  Section 3.3.5 Heartbeat Request (HEARTBEAT) (4):
 *
 *  An endpoint should send this chunk to its peer endpoint to probe the
 *  reachability of a particular destination transport address defined in
 *  the present association.
 */

/* Body of a HEARTBEAT chunk: just the header of its info parameter. */
struct sctp_heartbeathdr {
        struct sctp_paramhdr info;
};

/* A HEARTBEAT chunk: chunk header plus heartbeat header. */
struct sctp_heartbeat_chunk {
        struct sctp_chunkhdr chunk_hdr;
        struct sctp_heartbeathdr hb_hdr;
};


/* PAD chunk could be bundled with heartbeat chunk to probe pmtu */
/* A PAD chunk consists of the chunk header only (member named uh). */
struct sctp_pad_chunk {
        struct sctp_chunkhdr uh;
};


/* For the abort and shutdown ACK we must carry the init tag in the
 * common header. Just the common header is all that is needed with a
 * chunk descriptor.
 */
/* An ABORT chunk as declared here is the chunk header only (member uh). */
struct sctp_abort_chunk {
        struct sctp_chunkhdr uh;
};


/* For the graceful shutdown we must carry the tag (in common header)
 * and the highest consecutive acking value.
 */
/* Body of a SHUTDOWN chunk: one big-endian 32-bit cumulative TSN ack. */
struct sctp_shutdownhdr {
        __be32 cum_tsn_ack;
};

/* A SHUTDOWN chunk: chunk header plus shutdown header. */
struct sctp_shutdown_chunk {
        struct sctp_chunkhdr chunk_hdr;
        struct sctp_shutdownhdr shutdown_hdr;
};

/* RFC 2960.  Section 3.3.10 Operation Error (ERROR) (9) */

/* Header of one error cause: big-endian cause code (see enum sctp_error)
 * and length.  Cause-specific data follows and is not declared as a member.
 */
struct sctp_errhdr {
        __be16 cause;
        __be16 length;
        /* __u8  variable[]; */
};

/* An ERROR chunk up to the first cause's data: chunk header plus one
 * error cause header.
 */
struct sctp_operr_chunk {
        struct sctp_chunkhdr chunk_hdr;
        struct sctp_errhdr err_hdr;
};

/* RFC 2960 3.3.10 - Operation Error
 *
 * Cause Code: 16 bits (unsigned integer)
 *
 *     Defines the type of error conditions being reported.
 *    Cause Code
 *     Value           Cause Code
 *     ---------      ----------------
 *      1              Invalid Stream Identifier
 *      2              Missing Mandatory Parameter
 *      3              Stale Cookie Error
 *      4              Out of Resource
 *      5              Unresolvable Address
 *      6              Unrecognized Chunk Type
 *      7              Invalid Mandatory Parameter
 *      8              Unrecognized Parameters
 *      9              No User Data
 *     10              Cookie Received While Shutting Down
 */
/* Error cause codes.  Like the SCTP_PARAM_* values they are stored already
 * converted with cpu_to_be16() -- presumably so they can be compared directly
 * with the __be16 cause field of struct sctp_errhdr.
 */
enum sctp_error {

        SCTP_ERROR_NO_ERROR           = cpu_to_be16(0x00),
        SCTP_ERROR_INV_STRM           = cpu_to_be16(0x01),
        SCTP_ERROR_MISS_PARAM            = cpu_to_be16(0x02),
        SCTP_ERROR_STALE_COOKIE           = cpu_to_be16(0x03),
        SCTP_ERROR_NO_RESOURCE            = cpu_to_be16(0x04),
        SCTP_ERROR_DNS_FAILED      = cpu_to_be16(0x05),
        SCTP_ERROR_UNKNOWN_CHUNK   = cpu_to_be16(0x06),
        SCTP_ERROR_INV_PARAM       = cpu_to_be16(0x07),
        SCTP_ERROR_UNKNOWN_PARAM   = cpu_to_be16(0x08),
        SCTP_ERROR_NO_DATA         = cpu_to_be16(0x09),
        SCTP_ERROR_COOKIE_IN_SHUTDOWN = cpu_to_be16(0x0a),


        /* SCTP Implementation Guide:
         *  11  Restart of an association with new addresses
         *  12  User Initiated Abort
         *  13  Protocol Violation
         *  14  Restart of an Association with New Encapsulation Port
         */

        SCTP_ERROR_RESTART         = cpu_to_be16(0x0b),
        SCTP_ERROR_USER_ABORT      = cpu_to_be16(0x0c),
        SCTP_ERROR_PROTO_VIOLATION = cpu_to_be16(0x0d),
        SCTP_ERROR_NEW_ENCAP_PORT  = cpu_to_be16(0x0e),

        /* ADDIP Section 3.3  New Error Causes
         *
         * New Error Causes are added to the SCTP Operational Errors,
         * primarily for use in the ASCONF-ACK chunk.
         *
         * Value          Cause Code
         * ---------      ----------------
         * 0x00A0          Request to Delete Last Remaining IP Address.
         * 0x00A1          Operation Refused Due to Resource Shortage.
         * 0x00A2          Request to Delete Source IP Address.
         * 0x00A3          Association Aborted due to illegal ASCONF-ACK
         * 0x00A4          Request refused - no authorization.
         */
        SCTP_ERROR_DEL_LAST_IP        = cpu_to_be16(0x00A0),
        SCTP_ERROR_RSRC_LOW        = cpu_to_be16(0x00A1),
        SCTP_ERROR_DEL_SRC_IP        = cpu_to_be16(0x00A2),
        SCTP_ERROR_ASCONF_ACK   = cpu_to_be16(0x00A3),
        SCTP_ERROR_REQ_REFUSED        = cpu_to_be16(0x00A4),

        /* AUTH Section 4.  New Error Cause
         *
         * This section defines a new error cause that will be sent if an AUTH
         * chunk is received with an unsupported HMAC identifier.  The table
         * below shows the new error cause.
         *
         * Cause Code      Error Cause Name
         * --------------------------------------------------------------
         * 0x0105          Unsupported HMAC Identifier
         */
         SCTP_ERROR_UNSUP_HMAC        = cpu_to_be16(0x0105)
};



/* RFC 2960.  Appendix A.  Explicit Congestion Notification.
 *   Explicit Congestion Notification Echo (ECNE) (12)
 */
/* Body of an ECNE chunk: one big-endian 32-bit TSN. */
struct sctp_ecnehdr {
        __be32 lowest_tsn;
};

/* An ECNE chunk: chunk header plus ECNE header.  The member really is
 * spelled ence_hdr (not ecne_hdr); the spelling is kept as-is here.
 */
struct sctp_ecne_chunk {
        struct sctp_chunkhdr chunk_hdr;
        struct sctp_ecnehdr ence_hdr;
};

/* RFC 2960.  Appendix A.  Explicit Congestion Notification.
 *   Congestion Window Reduced (CWR) (13)
 */
/* Body of a CWR chunk: one big-endian 32-bit TSN (same layout as
 * struct sctp_ecnehdr).
 */
struct sctp_cwrhdr {
        __be32 lowest_tsn;
};

/* PR-SCTP
 * 3.2 Forward Cumulative TSN Chunk Definition (FORWARD TSN)
 *
 * Forward Cumulative TSN chunk has the following format:
 *
 *        0                   1                   2                   3
 *        0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *      |   Type = 192  |  Flags = 0x00 |        Length = Variable      |
 *      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *      |                      New Cumulative TSN                       |
 *      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *      |         Stream-1              |       Stream Sequence-1       |
 *      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *      \                                                               /
 *      /                                                               \
 *      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *      |         Stream-N              |       Stream Sequence-N       |
 *      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *      Chunk Flags:
 *
 *        Set to all zeros on transmit and ignored on receipt.
 *
 *      New Cumulative TSN: 32 bit u_int
 *
 *       This indicates the new cumulative TSN to the data receiver. Upon
 *       the reception of this value, the data receiver MUST consider
 *       any missing TSNs earlier than or equal to this value as received
 *       and stop reporting them as gaps in any subsequent SACKs.
 *
 *      Stream-N: 16 bit u_int
 *
 *       This field holds a stream number that was skipped by this
 *       FWD-TSN.
 *
 *      Stream Sequence-N: 16 bit u_int
 *       This field holds the sequence number associated with the stream
 *       that was skipped. The stream sequence field holds the largest stream
 *       sequence number in this stream being skipped.  The receiver of
 *       the FWD-TSN's can use the Stream-N and Stream Sequence-N fields
 *       to enable delivery of any stranded TSN's that remain on the stream
 *       re-ordering queues. This field MUST NOT report TSN's corresponding
 *       to DATA chunk that are marked as unordered. For ordered DATA
 *       chunks this field MUST be filled in.
 */
/* One Stream-N / Stream Sequence-N pair of a FORWARD TSN chunk. */
struct sctp_fwdtsn_skip {
        __be16 stream;
        __be16 ssn;
};

/* Fixed part of a FORWARD TSN chunk: the new cumulative TSN.  The skip
 * entries follow the struct and are not declared as a member.
 */
struct sctp_fwdtsn_hdr {
        __be32 new_cum_tsn;
        /* struct sctp_fwdtsn_skip skip[]; */
};

/* A FORWARD TSN chunk up to its skip list: chunk header plus FWD-TSN header. */
struct sctp_fwdtsn_chunk {
        struct sctp_chunkhdr chunk_hdr;
        struct sctp_fwdtsn_hdr fwdtsn_hdr;
};

/* One 8-byte skip entry of an I-FORWARD-TSN chunk: stream, a reserved byte,
 * a flags byte and a 32-bit mid (where sctp_fwdtsn_skip has a 16-bit ssn).
 */
struct sctp_ifwdtsn_skip {
        __be16 stream;
        __u8 reserved;
        __u8 flags;
        __be32 mid;
};

/* Fixed part of an I-FORWARD-TSN chunk: the new cumulative TSN.  The skip
 * entries follow the struct and are not declared as a member.
 */
struct sctp_ifwdtsn_hdr {
        __be32 new_cum_tsn;
        /* struct sctp_ifwdtsn_skip skip[]; */
};

/* An I-FORWARD-TSN chunk up to its skip list: chunk header plus header. */
struct sctp_ifwdtsn_chunk {
        struct sctp_chunkhdr chunk_hdr;
        struct sctp_ifwdtsn_hdr fwdtsn_hdr;
};

/* ADDIP
 * Section 3.1.1 Address Configuration Change Chunk (ASCONF)
 *
 *         Serial Number: 32 bits (unsigned integer)
 *        This value represents a Serial Number for the ASCONF Chunk. The
 *        valid range of Serial Number is from 0 to 2^32-1.
 *        Serial Numbers wrap back to 0 after reaching 2^32 -1.
 *
 *        Address Parameter: 8 or 20 bytes (depending on type)
 *        The address is an address of the sender of the ASCONF chunk,
 *        the address MUST be considered part of the association by the
 *        peer endpoint. This field may be used by the receiver of the 
 *        ASCONF to help in finding the association. This parameter MUST
 *        be present in every ASCONF message i.e. it is a mandatory TLV
 *        parameter.
 *
 *        ASCONF Parameter: TLV format
 *        Each Address configuration change is represented by a TLV
 *        parameter as defined in Section 3.2. One or more requests may
 *        be present in an ASCONF Chunk.
 *
 * Section 3.1.2 Address Configuration Acknowledgement Chunk (ASCONF-ACK)
 * 
 *        Serial Number: 32 bits (unsigned integer)
 *        This value represents the Serial Number for the received ASCONF
 *        Chunk that is acknowledged by this chunk. This value is copied
 *        from the received ASCONF Chunk. 
 *
 *        ASCONF Parameter Response: TLV format
 *        The ASCONF Parameter Response is used in the ASCONF-ACK to
 *        report status of ASCONF processing.
 */
/* Start of an ASCONF parameter: parameter header followed by a big-endian
 * 32-bit crr_id (presumably the request correlation id -- TODO confirm).
 */
struct sctp_addip_param {
        struct sctp_paramhdr param_hdr;
        __be32 crr_id;
};

/* Fixed part of an ASCONF / ASCONF-ACK chunk: the serial number.  The
 * parameters follow the struct and are not declared as a member.
 */
struct sctp_addiphdr {
        __be32        serial;
        /* __u8        params[]; */
};

/* An ASCONF-style chunk up to its parameters: chunk header plus serial. */
struct sctp_addip_chunk {
        struct sctp_chunkhdr chunk_hdr;
        struct sctp_addiphdr addip_hdr;
};

/* AUTH
 * Section 4.1  Authentication Chunk (AUTH)
 *
 *   This chunk is used to hold the result of the HMAC calculation.
 *
 *    0                   1                   2                   3
 *    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *   | Type = 0x0F   |   Flags=0     |             Length            |
 *   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *   |     Shared Key Identifier     |   HMAC Identifier             |
 *   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *   |                                                               |
 *   \                             HMAC                              /
 *   /                                                               \
 *   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *   Type: 1 byte (unsigned integer)
 *           This value MUST be set to 0x0F for  all AUTH-chunks.
 *
 *   Flags: 1 byte (unsigned integer)
 *        Set to zero on transmit and ignored on receipt.
 *
 *   Length: 2 bytes (unsigned integer)
 *           This value holds the length of the HMAC in bytes plus 8.
 *
 *  Shared Key Identifier: 2 bytes (unsigned integer)
 *        This value describes which endpoint pair shared key is used.
 *
 *   HMAC Identifier: 2 bytes (unsigned integer)
 *           This value describes which message digest is being used.  Table 2
 *        shows the currently defined values.
 *
 *    The following Table 2 shows the currently defined values for HMAC
 *       identifiers.
 *
 *         +-----------------+--------------------------+
 *         | HMAC Identifier | Message Digest Algorithm |
 *         +-----------------+--------------------------+
 *         | 0               | Reserved                 |
 *         | 1               | SHA-1 defined in [8]     |
 *         | 2               | Reserved                 |
 *         | 3               | SHA-256 defined in [8]   |
 *         +-----------------+--------------------------+
 *
 *
 *   HMAC: n bytes (unsigned integer)
 *        This holds the result of the HMAC calculation.
 */
/* Fixed part of an AUTH chunk: shared key identifier and HMAC identifier,
 * both big-endian 16-bit.  The HMAC bytes follow the struct and are not
 * declared as a member.
 */
struct sctp_authhdr {
        __be16 shkey_id;
        __be16 hmac_id;
        /* __u8   hmac[]; */
};

/* An AUTH chunk up to the HMAC: chunk header plus AUTH header. */
struct sctp_auth_chunk {
        struct sctp_chunkhdr chunk_hdr;
        struct sctp_authhdr auth_hdr;
};

/* Pairs a struct sctp_info pointer with a struct sctp_association pointer.
 * Not a wire format; both types are declared elsewhere.
 * NOTE(review): presumably used to hand both to a single callback -- confirm.
 */
struct sctp_infox {
        struct sctp_info *sctpinfo;
        struct sctp_association *asoc;
};

/* A RE-CONFIG chunk as declared here is the chunk header only; the
 * parameters follow the struct and are not declared as a member.
 */
struct sctp_reconf_chunk {
        struct sctp_chunkhdr chunk_hdr;
        /* __u8 params[]; */
};

/* Stream reset "out" request: parameter header, three big-endian 32-bit
 * fields, then a variable-length list of big-endian 16-bit stream numbers.
 */
struct sctp_strreset_outreq {
        struct sctp_paramhdr param_hdr;
        __be32 request_seq;
        __be32 response_seq;
        __be32 send_reset_at_tsn;
        __be16 list_of_streams[];
};

/* Stream reset "in" request: parameter header, request sequence number,
 * then a variable-length list of big-endian 16-bit stream numbers.
 */
struct sctp_strreset_inreq {
        struct sctp_paramhdr param_hdr;
        __be32 request_seq;
        __be16 list_of_streams[];
};

/* TSN reset request: parameter header plus request sequence number. */
struct sctp_strreset_tsnreq {
        struct sctp_paramhdr param_hdr;
        __be32 request_seq;
};

/* Stream reset "add streams" parameter.
 * NOTE(review): presumably shared by the Add Outgoing/Incoming Streams
 * Request Parameters of RFC 6525 - confirm.
 * All fields after the header are big-endian; @reserved pads the struct to
 * a 32-bit boundary.
 */
struct sctp_strreset_addstrm {
        struct sctp_paramhdr param_hdr;
        __be32 request_seq;
        __be16 number_of_streams;
        __be16 reserved;
};

/* Stream reset result codes, numbered 0x00 through 0x06.
 * NOTE(review): presumably the values carried in the `result` field of the
 * stream reset response parameters - confirm.
 */
enum {
        SCTP_STRRESET_NOTHING_TO_DO        = 0x00,
        SCTP_STRRESET_PERFORMED                = 0x01,
        SCTP_STRRESET_DENIED                = 0x02,
        SCTP_STRRESET_ERR_WRONG_SSN        = 0x03,
        SCTP_STRRESET_ERR_IN_PROGRESS        = 0x04,
        SCTP_STRRESET_ERR_BAD_SEQNO        = 0x05,
        SCTP_STRRESET_IN_PROGRESS        = 0x06,
};

/* Stream reset response parameter: parameter header, the sequence number
 * being answered and a result word, both big-endian.
 */
struct sctp_strreset_resp {
        struct sctp_paramhdr param_hdr;
        __be32 response_seq;
        __be32 result;
};

/* Stream reset response parameter that additionally carries two TSN values.
 * The first three members have the same types and order as
 * struct sctp_strreset_resp; all fields are big-endian.
 */
struct sctp_strreset_resptsn {
        struct sctp_paramhdr param_hdr;
        __be32 response_seq;
        __be32 result;
        __be32 senders_next_tsn;
        __be32 receivers_next_tsn;
};

/* Bit masks for DSCP and IPv6 flow label settings.
 * The *_SET_MASK values are single flag bits (bit 0 and bit 20); the
 * *_VAL_MASK values select the upper six bits of a byte (0xfc) and the low
 * 20 bits of a word (0xfffff) respectively.
 * NOTE(review): which variables these masks are applied to is not visible
 * here - confirm against the users.
 */
enum {
        SCTP_DSCP_SET_MASK = 0x1,
        SCTP_DSCP_VAL_MASK = 0xfc,
        SCTP_FLOWLABEL_SET_MASK = 0x100000,
        SCTP_FLOWLABEL_VAL_MASK = 0xfffff
};

/* UDP Encapsulation
 * draft-tuexen-tsvwg-sctp-udp-encaps-cons-03.html#section-4-4
 *
 *   The error cause indicating a "Restart of an Association with
 *   New Encapsulation Port"
 *
 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |        Cause Code = 14        |       Cause Length = 8        |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |   Current Encapsulation Port  |     New Encapsulation Port    |
 * +-------------------------------+-------------------------------+
 *
 * The struct below covers only the second 32-bit word of the diagram (the
 * two ports, big-endian); the cause code/length word is not part of it.
 */
struct sctp_new_encap_port_hdr {
        __be16 cur_port;        /* Current Encapsulation Port */
        __be16 new_port;        /* New Encapsulation Port */
};

/* Clear the two low bits: the largest multiple of 4 that is <= s.  */
#define SCTP_TRUNC4(s) ((s)&~3)
/* Smallest multiple of 4 that is >= s: add 3, then clear the two low bits.  */
#define SCTP_PAD4(s) (((s)+3)&~3)

#endif /* __LINUX_SCTP_H__ */
































































































































































































































































































































































































    2 
   10 


































    2 


















    2 




    3 
    6 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * AppArmor security module
 *
 * This file contains AppArmor label definitions
 *
 * Copyright 2017 Canonical Ltd.
 */

#ifndef __AA_LABEL_H
#define __AA_LABEL_H

#include <linux/atomic.h>
#include <linux/audit.h>
#include <linux/rbtree.h>
#include <linux/rcupdate.h>

#include "apparmor.h"
#include "lib.h"

struct aa_ns;

/* Number of pointer slots in the on-stack array declared by DEFINE_VEC(). */
#define LOCAL_VEC_ENTRIES 8

/*
 * DEFINE_VEC - declare a vector of "struct aa_<T> *" named V
 *
 * Expands to two declarations: a LOCAL_VEC_ENTRIES-sized array called
 * _<V>_localtmp and a pointer-to-pointer called V.  The caller supplies the
 * terminating semicolon.  V is left uninitialised; vec_setup() assigns it.
 */
#define DEFINE_VEC(T, V) \
        struct aa_ ## T *(_ ## V ## _localtmp)[LOCAL_VEC_ENTRIES]; \
        struct aa_ ## T **(V)

/*
 * vec_setup - point V at zeroed storage for N "struct aa_<T> *" entries
 * @T: type suffix (the element type is struct aa_<T> *)
 * @V: vector variable previously declared with DEFINE_VEC(T, V)
 * @N: number of entries required
 * @GFP: allocation flags, used only when the heap path is taken
 *
 * When N fits in the _<V>_localtmp array, V is pointed at it and the first
 * N slots are set to NULL.  Otherwise a zeroed array is allocated; kcalloc()
 * is used instead of an open-coded "sizeof * N" so that the size
 * multiplication is overflow-checked by the allocator.
 *
 * Evaluates to 0 on success or -ENOMEM when V ends up NULL.
 */
#define vec_setup(T, V, N, GFP)                                                \
({                                                                        \
        if ((N) <= LOCAL_VEC_ENTRIES) {                                        \
                typeof(N) i;                                                \
                (V) = (_ ## V ## _localtmp);                                \
                for (i = 0; i < (N); i++)                                \
                        (V)[i] = NULL;                                        \
        } else                                                                \
                (V) = kcalloc((N), sizeof(struct aa_ ## T *), (GFP));        \
        (V) ? 0 : -ENOMEM;                                                \
})

/*
 * vec_cleanup - release the entries of V and, if needed, V itself
 * @T: type suffix; aa_put_<T>() is called on each live entry
 * @V: vector set up with vec_setup()
 * @N: number of entries to walk
 *
 * Entries that are NULL or error pointers are skipped.  V is handed to
 * kfree() only when it is not the _<V>_localtmp array.
 */
#define vec_cleanup(T, V, N) \
do { \
        int i = 0; \
        while (i < (N)) { \
                if (!IS_ERR_OR_NULL((V)[i])) \
                        aa_put_ ## T((V)[i]); \
                i++; \
        } \
        if ((V) != _ ## V ## _localtmp) \
                kfree(V); \
} while (0)

/* Last entry of a SIZE-element vector; SIZE must be at least 1. */
#define vec_last(VEC, SIZE) ((VEC)[(SIZE) - 1])
/* ->ns of the last entry of the vector. */
#define vec_ns(VEC, SIZE) (vec_last((VEC), (SIZE))->ns)
/* Address of the ->labels member of that namespace. */
#define vec_labelset(VEC, SIZE) (&vec_ns((VEC), (SIZE))->labels)
/* NOTE(review): cleanup_label_vec is not defined in the visible part of this
 * header - confirm it exists before using this macro.
 */
#define cleanup_domain_vec(V, L) cleanup_label_vec((V), (L)->size)

struct aa_profile;
/* Flag value for the @flags argument of aa_vec_unique(); its effect is
 * defined by the implementation, which is outside this header.
 */
#define VEC_FLAG_TERMINATE 1
int aa_vec_unique(struct aa_profile **vec, int n, int flags);
struct aa_label *aa_vec_find_or_create_label(struct aa_profile **vec, int len,
                                             gfp_t gfp);
/* Casts V to struct aa_profile ** and forwards to
 * aa_sort_and_merge_profiles(), which is declared elsewhere.
 */
#define aa_sort_and_merge_vec(N, V) \
        aa_sort_and_merge_profiles((N), (struct aa_profile **)(V))


/* struct aa_labelset - set of labels for a namespace
 * @lock: rwlock for the set
 * @root: root of the rbtree holding the labels (linked through
 *        struct aa_label's @node member)
 *
 * Labels are reference counted; aa_labelset does not contribute to label
 * reference counts. Once a label's last refcount is put it is removed from
 * the set.
 */
struct aa_labelset {
        rwlock_t lock;

        struct rb_root root;
};

/* Visit every rb_node N of labelset LS in rb_first()/rb_next() order.
 * No locking is done by the macro itself.
 */
#define __labelset_for_each(LS, N) \
        for ((N) = rb_first(&(LS)->root); (N); (N) = rb_next(N))

/* Bit flags stored in struct aa_label's @flags member; each value is a
 * distinct single bit so flags can be OR-ed together.
 * FLAG_IMMUTIBLE keeps its historical spelling because it is an identifier.
 */
enum label_flags {
        FLAG_HAT = 1,                        /* profile is a hat */
        FLAG_UNCONFINED = 2,                /* label unconfined only if all */
        FLAG_NULL = 4,                        /* profile is null learning profile */
        FLAG_IX_ON_NAME_ERROR = 8,        /* fallback to ix on name lookup fail */
        FLAG_IMMUTIBLE = 0x10,                /* don't allow changes/replacement */
        FLAG_USER_DEFINED = 0x20,        /* user based profile - lower privs */
        FLAG_NO_LIST_REF = 0x40,        /* list doesn't keep profile ref */
        FLAG_NS_COUNT = 0x80,                /* carries NS ref count */
        FLAG_IN_TREE = 0x100,                /* label is in tree */
        FLAG_PROFILE = 0x200,                /* label is a profile */
        FLAG_EXPLICIT = 0x400,                /* explicit static label */
        FLAG_STALE = 0x800,                /* replaced/removed */
        FLAG_RENAMED = 0x1000,                /* label has renaming in it */
        FLAG_REVOKED = 0x2000,                /* label has revocation in it */
        FLAG_DEBUG1 = 0x4000,
        FLAG_DEBUG2 = 0x8000,

        /* These flags must correspond with PATH_flags */
        /* TODO: add new path flags */
};

struct aa_label;
/* struct aa_proxy - reference-counted indirection to a label
 * @count: kref taken by aa_get_proxy() and dropped by aa_put_proxy()
 * @label: RCU-protected label pointer; aa_get_newest_label() reads it to
 *         find the replacement of a stale label
 */
struct aa_proxy {
        struct kref count;
        struct aa_label __rcu *label;
};

/* Cursor for the label_for_each*() iteration macros.  Single-label walks
 * use only @i; the two-label macros also use @j (label_for_each_comb()
 * indexes the first label's vec with @i and the second label's with @j).
 */
struct label_it {
        int i, j;
};

/* struct aa_label - lazy labeling struct
 * @count: ref count of active users
 * @node: rbtree position
 * @rcu: rcu callback struct
 * @proxy: proxy whose ->label refers to the label that replaced this label
 * @hname: text representation of the label (MAYBE_NULL)
 * @flags: stale and other flags - values may change under label set lock
 * @secid: secid that references this label
 * @size: number of entries in @vec[]
 * @vec: set of profiles for label, actual size determined by @size
 *
 * NOTE(review): the label_for_each*() macros stop at the first NULL entry of
 * @vec, so the array presumably carries a NULL terminator after the @size
 * profiles - confirm against the allocator.
 */
struct aa_label {
        struct kref count;
        struct rb_node node;
        struct rcu_head rcu;
        struct aa_proxy *proxy;
        __counted char *hname;
        long flags;
        u32 secid;
        int size;
        struct aa_profile *vec[];
};

/*
 * last_error - remember the most recent non-zero result
 *
 * Evaluates FN exactly once; when the result is non-zero it is stored in E,
 * otherwise E is left untouched.
 */
#define last_error(E, FN) \
do { \
        int __subE = (FN); \
        if (__subE != 0) \
                (E) = __subE; \
} while (0)

/* Flag tests on a label: each evaluates to the masked flag bit (non-zero
 * when set), not to a normalised 0/1.
 */
#define label_isprofile(X) ((X)->flags & FLAG_PROFILE)
#define label_unconfined(X) ((X)->flags & FLAG_UNCONFINED)
#define unconfined(X) label_unconfined(X)
#define label_is_stale(X) ((X)->flags & FLAG_STALE)
/* Sets FLAG_STALE with a plain, non-atomic |=. */
#define __label_make_stale(X) ((X)->flags |= FLAG_STALE)
/* Namespace of the last profile in the label's vec. */
#define labels_ns(X) (vec_ns(&((X)->vec[0]), (X)->size))
/* Address of that namespace's ->labels member. */
#define labels_set(X) (&labels_ns(X)->labels)
#define labels_view(X) labels_ns(X)
/* Last profile in the label's vec; (X)->size must be at least 1. */
#define labels_profile(X) ((X)->vec[(X)->size - 1])


/* Implemented outside this header.  label_for_each_confined() calls it with
 * a starting index and uses the return value as an index into @l->vec.
 */
int aa_label_next_confined(struct aa_label *l, int i);

/*
 * Walk the profiles of label L from index 0: P receives each entry in turn
 * and the loop ends at the first NULL entry of L->vec.
 */
#define label_for_each(I, L, P) \
        for ((I).i = 0; ((P) = (L)->vec[(I).i]); (I).i++)

/*
 * Resume a label_for_each() walk that was left early via break/goto:
 * steps past the entry I currently refers to, then continues as above.
 */
#define label_for_each_cont(I, L, P) \
        for ((I).i++; ((P) = (L)->vec[(I).i]); (I).i++)

/*
 * next_comb - advance a two-label cursor to the next (i, j) combination
 *
 * Steps I.j; once it reaches L2->size it wraps to 0 and I.i is stepped
 * instead.  L1 is accepted for symmetry but not referenced.
 */
#define next_comb(I, L1, L2) \
do { \
        if (++(I).j >= (L2)->size) { \
                (I).j = 0; \
                (I).i++; \
        } \
} while (0)


/*
 * label_for_each_comb - for each combination of P1 in L1, and P2 in L2
 * @I: struct label_it cursor; .i indexes L1->vec and .j indexes L2->vec
 * @L1: first label
 * @L2: second label
 * @P1: receives the current profile of L1
 * @P2: receives the current profile of L2
 *
 * The walk is row-major: .j runs over L2 for each .i of L1 and the loop
 * ends when L1->vec[.i] is NULL (or immediately if L2->vec[0] is NULL).
 *
 * The step is written as an expression here.  next_comb() expands to a
 * do { } while (0) statement, which cannot appear in (or be assigned from
 * in) the third clause of a for loop, so it cannot be used for this.
 */
#define label_for_each_comb(I, L1, L2, P1, P2)                                \
for ((I).i = (I).j = 0;                                                        \
        ((P1) = (L1)->vec[(I).i]) && ((P2) = (L2)->vec[(I).j]);                \
        (void)(++(I).j >= (L2)->size ? ((I).i++, (I).j = 0) : 0))

/*
 * fn_for_each_comb - evaluate FN for every (P1, P2) combination of L1 x L2
 *
 * Evaluates to the last non-zero value produced by FN, or 0 when every
 * evaluation returned 0 (or the walk was empty).
 */
#define fn_for_each_comb(L1, L2, P1, P2, FN) \
({ \
        int __E = 0; \
        struct label_it i; \
        label_for_each_comb(i, (L1), (L2), (P1), (P2)) \
                last_error(__E, (FN)); \
        __E; \
})

/*
 * Walk only the entries of L selected by aa_label_next_confined(): the
 * first index comes from a search starting at 0, each following one from a
 * search starting just past the current index.  Ends at a NULL vec entry.
 */
#define label_for_each_confined(I, L, P) \
        for ((I).i = aa_label_next_confined((L), 0); \
             ((P) = (L)->vec[(I).i]); \
             (I).i = aa_label_next_confined((L), (I).i + 1))

/*
 * Walk the profiles yielded by aa_label_next_in_merge() for labels A and B.
 * The cursor is reset to (0, 0) and then advanced solely by that helper;
 * the loop ends when the helper returns NULL.
 */
#define label_for_each_in_merge(I, A, B, P) \
        for ((I).i = (I).j = 0; \
             ((P) = aa_label_next_in_merge(&(I), (A), (B))); \
             )

/*
 * Walk the profiles yielded by __aa_label_next_not_in_set() for SET and SUB.
 * The cursor is reset to (0, 0) and then advanced solely by that helper;
 * the loop ends when the helper returns NULL.
 */
#define label_for_each_not_in_set(I, SET, SUB, P) \
        for ((I).i = (I).j = 0; \
             ((P) = __aa_label_next_not_in_set(&(I), (SET), (SUB))); \
             )

/*
 * next_in_ns - index of the next entry of L->vec, at or after i, whose ->ns
 * equals NS.  Evaluates to the index of the first NULL entry when no such
 * profile exists.
 */
#define next_in_ns(i, NS, L) \
({ \
        typeof(i) ___i = (i); \
        for (; (L)->vec[___i] && (L)->vec[___i]->ns != (NS); (___i)++) \
                ; \
        (___i); \
})

/*
 * Walk the profiles of L whose ->ns equals NS, using next_in_ns() both for
 * the first index and for each step.  Ends at a NULL vec entry.
 */
#define label_for_each_in_ns(I, NS, L, P) \
        for ((I).i = next_in_ns(0, (NS), (L)); \
             ((P) = (L)->vec[(I).i]); \
             (I).i = next_in_ns((I).i + 1, (NS), (L)))

/*
 * fn_for_each_in_ns - evaluate FN for each profile of L that lives in
 * labels_ns(L)
 *
 * Evaluates to the last non-zero value produced by FN, or 0 when every
 * evaluation returned 0.
 */
#define fn_for_each_in_ns(L, P, FN) \
({ \
        int __E = 0; \
        struct aa_ns *__ns = labels_ns(L); \
        struct label_it __i; \
        label_for_each_in_ns(__i, __ns, (L), (P)) \
                last_error(__E, (FN)); \
        __E; \
})


/*
 * fn_for_each_XXX - evaluate FN for each profile P visited by a
 * label_for_each<suffix>() walk of L
 *
 * The optional trailing argument is pasted onto "label_for_each" to choose
 * the walker.  Evaluates to the last non-zero value produced by FN, or 0.
 */
#define fn_for_each_XXX(L, P, FN, ...) \
({ \
        int __E = 0; \
        struct label_it i; \
        label_for_each ## __VA_ARGS__(i, (L), (P)) \
                last_error(__E, (FN)); \
        __E; \
})

/* Plain walk over every profile of L. */
#define fn_for_each(L, P, FN) fn_for_each_XXX(L, P, FN)
/* Walk via label_for_each_confined(). */
#define fn_for_each_confined(L, P, FN) fn_for_each_XXX(L, P, FN, _confined)

/*
 * fn_for_each2_XXX - two-label counterpart of fn_for_each_XXX
 *
 * The trailing argument is pasted onto "label_for_each" to choose a walker
 * that takes (cursor, L1, L2, P).  Evaluates to the last non-zero value
 * produced by FN, or 0.
 */
#define fn_for_each2_XXX(L1, L2, P, FN, ...) \
({ \
        int __E = 0; \
        struct label_it i; \
        label_for_each ## __VA_ARGS__(i, (L1), (L2), (P)) \
                last_error(__E, (FN)); \
        __E; \
})

/* Walk via label_for_each_in_merge(). */
#define fn_for_each_in_merge(L1, L2, P, FN) \
        fn_for_each2_XXX((L1), (L2), P, FN, _in_merge)
/* Walk via label_for_each_not_in_set(). */
#define fn_for_each_not_in_set(L1, L2, P, FN) \
        fn_for_each2_XXX((L1), (L2), P, FN, _not_in_set)

/*
 * LABEL_MEDIATES - does any profile of label L mediate class C?
 *
 * Evaluates to 1 as soon as RULE_MEDIATES() is true for the ->rules of one
 * profile of L, otherwise to 0.
 */
#define LABEL_MEDIATES(L, C) \
({ \
        struct aa_profile *profile; \
        struct label_it i; \
        int ret = 0; \
        label_for_each(i, (L), profile) { \
                if (!RULE_MEDIATES(&profile->rules, (C))) \
                        continue; \
                ret = 1; \
                break; \
        } \
        ret; \
})


/* Label set lifetime; implementations live outside this header. */
void aa_labelset_destroy(struct aa_labelset *ls);
void aa_labelset_init(struct aa_labelset *ls);
void __aa_labelset_update_subtree(struct aa_ns *ns);

/* Label allocation and teardown.  aa_label_kref() is the release callback
 * that aa_put_label() hands to kref_put().
 */
void aa_label_destroy(struct aa_label *label);
void aa_label_free(struct aa_label *label);
void aa_label_kref(struct kref *kref);
bool aa_label_init(struct aa_label *label, int size, gfp_t gfp);
struct aa_label *aa_label_alloc(int size, struct aa_proxy *proxy, gfp_t gfp);

/* Set queries and label set membership.  __aa_label_next_not_in_set() is
 * the step function of label_for_each_not_in_set(); returning NULL ends
 * that loop.
 */
bool aa_label_is_subset(struct aa_label *set, struct aa_label *sub);
bool aa_label_is_unconfined_subset(struct aa_label *set, struct aa_label *sub);
struct aa_profile *__aa_label_next_not_in_set(struct label_it *I,
                                             struct aa_label *set,
                                             struct aa_label *sub);
bool aa_label_remove(struct aa_label *label);
struct aa_label *aa_label_insert(struct aa_labelset *ls, struct aa_label *l);
bool aa_label_replace(struct aa_label *old, struct aa_label *new);
bool aa_label_make_newest(struct aa_labelset *ls, struct aa_label *old,
                          struct aa_label *new);

struct aa_label *aa_label_find(struct aa_label *l);

/* Merging two labels.  aa_label_next_in_merge() is the step function of
 * label_for_each_in_merge(); returning NULL ends that loop.
 */
struct aa_profile *aa_label_next_in_merge(struct label_it *I,
                                          struct aa_label *a,
                                          struct aa_label *b);
struct aa_label *aa_label_find_merge(struct aa_label *a, struct aa_label *b);
struct aa_label *aa_label_merge(struct aa_label *a, struct aa_label *b,
                                gfp_t gfp);


bool aa_update_label_name(struct aa_ns *ns, struct aa_label *label, gfp_t gfp);

/* Bit values for the @flags argument of the aa_label_*print*() family
 * below; their exact effect is defined by the implementations, which are
 * outside this header.
 */
#define FLAGS_NONE 0
#define FLAG_SHOW_MODE 1
#define FLAG_VIEW_SUBNS 2
#define FLAG_HIDDEN_UNCONFINED 4
#define FLAG_ABS_ROOT 8
/* Format a label into a caller-supplied buffer (snxprint), a newly
 * allocated string returned through @strp (asxprint / acntsxprint), an
 * audit buffer, a seq_file or the kernel log.
 * NOTE(review): destinations are inferred from the parameter types -
 * confirm against the implementations.
 */
int aa_label_snxprint(char *str, size_t size, struct aa_ns *view,
                      struct aa_label *label, int flags);
int aa_label_asxprint(char **strp, struct aa_ns *ns, struct aa_label *label,
                      int flags, gfp_t gfp);
int aa_label_acntsxprint(char __counted **strp, struct aa_ns *ns,
                         struct aa_label *label, int flags, gfp_t gfp);
void aa_label_xaudit(struct audit_buffer *ab, struct aa_ns *ns,
                     struct aa_label *label, int flags, gfp_t gfp);
void aa_label_seq_xprint(struct seq_file *f, struct aa_ns *ns,
                         struct aa_label *label, int flags, gfp_t gfp);
void aa_label_xprintk(struct aa_ns *ns, struct aa_label *label, int flags,
                      gfp_t gfp);
/* Variants without explicit namespace and flags arguments. */
void aa_label_audit(struct audit_buffer *ab, struct aa_label *label, gfp_t gfp);
void aa_label_seq_print(struct seq_file *f, struct aa_label *label, gfp_t gfp);
void aa_label_printk(struct aa_label *label, gfp_t gfp);

/* Build a label from its text form, relative to @base.  The _strn_ variant
 * takes an explicit length @n; aa_label_parse() takes no length argument.
 * NOTE(review): the meaning of @create and @force_stack is defined by the
 * implementation - confirm there.
 */
struct aa_label *aa_label_strn_parse(struct aa_label *base, const char *str,
                                     size_t n, gfp_t gfp, bool create,
                                     bool force_stack);
struct aa_label *aa_label_parse(struct aa_label *base, const char *str,
                                gfp_t gfp, bool create, bool force_stack);

/**
 * aa_label_strn_split - locate the stack split point in a bounded string
 * @str: string to scan
 * @n: number of bytes of @str to scan
 *
 * Runs stacksplitdfa over at most @n bytes of @str.
 *
 * Returns: NULL when the DFA does not stop in an accepting state, otherwise
 *          the position 3 bytes before where matching stopped.
 *          NOTE(review): 3 is presumably the separator length - confirm.
 */
static inline const char *aa_label_strn_split(const char *str, int n)
{
        const char *end;
        aa_state_t final;

        final = aa_dfa_matchn_until(stacksplitdfa, DFA_START, str, n, &end);

        return ACCEPT_TABLE(stacksplitdfa)[final] ? end - 3 : NULL;
}

/**
 * aa_label_str_split - locate the stack split point in a string
 * @str: string to scan; no length is passed to the matcher
 *
 * Runs stacksplitdfa over @str.
 *
 * Returns: NULL when the DFA does not stop in an accepting state, otherwise
 *          the position 3 bytes before where matching stopped.
 *          NOTE(review): 3 is presumably the separator length - confirm.
 */
static inline const char *aa_label_str_split(const char *str)
{
        const char *end;
        aa_state_t final;

        final = aa_dfa_match_until(stacksplitdfa, DFA_START, str, &end);

        return ACCEPT_TABLE(stacksplitdfa)[final] ? end - 3 : NULL;
}



struct aa_perms;
struct aa_ruleset;
/* Implemented outside this header.
 * NOTE(review): presumably matches @label against @rules starting from DFA
 * state @state and fills in @perms for @request - confirm against the
 * implementation.
 */
int aa_label_match(struct aa_profile *profile, struct aa_ruleset *rules,
                   struct aa_label *label, aa_state_t state, bool subns,
                   u32 request, struct aa_perms *perms);


/**
 * __aa_get_label - try to take a counted reference on an uncounted label
 * @l: label to take a reference on (may be NULL)
 *
 * Returns: @l with its refcount raised, or NULL when @l is NULL or its
 *          refcount has already dropped to zero.
 * Requires: lock held, and the return code MUST be checked
 */
static inline struct aa_label *__aa_get_label(struct aa_label *l)
{
        if (!l || !kref_get_unless_zero(&l->count))
                return NULL;

        return l;
}

static inline struct aa_label *aa_get_label(struct aa_label *l)
{
        if (l)
                kref_get(&(l->count));

        return l;
}


/**
 * aa_get_label_rcu - take a reference on a label that can be replaced
 * @l: address of the RCU-protected label pointer (NOT NULL)
 *
 * Re-reads *@l under rcu_read_lock() until it is either NULL or a label
 * whose refcount could be raised from a non-zero value.
 *
 * Returns: pointer to a refcounted label.
 *     else NULL if no label
 */
static inline struct aa_label *aa_get_label_rcu(struct aa_label __rcu **l)
{
        struct aa_label *cur;

        rcu_read_lock();
        for (;;) {
                cur = rcu_dereference(*l);
                if (!cur || kref_get_unless_zero(&cur->count))
                        break;
        }
        rcu_read_unlock();

        return cur;
}

/**
 * aa_get_newest_label - find the newest version of @l
 * @l: the label to check for newer versions of (may be NULL)
 *
 * Returns: NULL when @l is NULL.  When @l is not flagged stale, @l itself
 *          with its refcount raised.  When @l is stale, a refcounted
 *          reference to the label published through @l->proxy.
 */
static inline struct aa_label *aa_get_newest_label(struct aa_label *l)
{
        struct aa_label *newest;

        if (!l)
                return NULL;

        if (!label_is_stale(l))
                return aa_get_label(l);

        AA_BUG(!l->proxy);
        AA_BUG(!l->proxy->label);
        /* BUG: the lookup below can only come back empty if the ref count
         * of @l and of its replacement have both gone to 0 and they are on
         * their way to destruction, i.e. there is a refcounting error.
         */
        newest = aa_get_label_rcu(&l->proxy->label);
        AA_BUG(!newest);

        return newest;
}

/**
 * aa_put_label - drop a reference on a label
 * @l: label to release (may be NULL, in which case nothing happens)
 *
 * aa_label_kref() is passed to kref_put() as the release callback.
 */
static inline void aa_put_label(struct aa_label *l)
{
        if (!l)
                return;

        kref_put(&l->count, aa_label_kref);
}


/* Implemented outside this header.  aa_proxy_kref() is the release callback
 * that aa_put_proxy() hands to kref_put().
 */
struct aa_proxy *aa_alloc_proxy(struct aa_label *l, gfp_t gfp);
void aa_proxy_kref(struct kref *kref);

static inline struct aa_proxy *aa_get_proxy(struct aa_proxy *proxy)
{
        if (proxy)
                kref_get(&(proxy->count));

        return proxy;
}

/**
 * aa_put_proxy - drop a reference on a proxy
 * @proxy: proxy to release (may be NULL, in which case nothing happens)
 *
 * aa_proxy_kref() is passed to kref_put() as the release callback.
 */
static inline void aa_put_proxy(struct aa_proxy *proxy)
{
        if (!proxy)
                return;

        kref_put(&proxy->count, aa_proxy_kref);
}

/* Implemented outside this header.
 * NOTE(review): presumably repoints @orig's proxy at @new - confirm.
 */
void __aa_proxy_redirect(struct aa_label *orig, struct aa_label *new);

#endif /* __AA_LABEL_H */











































































































































































    1 














    4 












    1 











    4 














































































    5 






    2 






    4 























    4 


    4 
    3 











































    5 







    4 

























    4 

    4 





























































































































































































    5 


    5 

    5 
    2 
    2 












    4 
    3 



    1 

















    1 
    4 

    3 






















    2 
    2 
    2 





















    2 
    2 

























    4 

    4 
    1 

    4 













    4 






    2 


























    2 






    4 



    4 










    4 

    4 




    4 
























































    1 




















































    1 




















































    2 








































    2 





    2 




    2 






















    2 















    2 








    2 
















    2 

    2 






    2 




    2 























    2 




































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Resizable, Scalable, Concurrent Hash Table
 *
 * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au>
 * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
 *
 * Code partially derived from nft_hash
 * Rewritten with rehash code from br_multicast plus single list
 * pointer as suggested by Josh Triplett
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#ifndef _LINUX_RHASHTABLE_H
#define _LINUX_RHASHTABLE_H

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/jhash.h>
#include <linux/list_nulls.h>
#include <linux/workqueue.h>
#include <linux/rculist.h>
#include <linux/bit_spinlock.h>

#include <linux/rhashtable-types.h>
/*
 * Objects in an rhashtable have an embedded struct rhash_head
 * which is linked into a hash chain from the hash table - or one
 * of two or more hash tables when the rhashtable is being resized.
 * The end of the chain is marked with a special nulls marker which has
 * the least significant bit set but otherwise stores the address of
 * the hash bucket.  This allows us to be sure we've found the end
 * of the right list.
 * The value stored in the hash bucket has BIT(0) used as a lock bit.
 * This bit must be atomically set before any changes are made to
 * the chain.  To avoid dereferencing this pointer without clearing
 * the bit first, we use an opaque 'struct rhash_lock_head *' for the
 * pointer stored in the bucket.  This struct needs to be defined so
 * that rcu_dereference() works on it, but it has no content so a
 * cast is needed for it to be useful.  This ensures it isn't
 * used by mistake without clearing the lock bit first.
 */
struct rhash_lock_head {};

/* Maximum chain length before rehash
 *
 * The maximum (not average) chain length grows with the size of the hash
 * table, at a rate of (log N)/(log log N).
 *
 * The value of 16 is selected so that even if the hash table grew to
 * 2^32 you would not expect the maximum chain length to exceed it
 * unless we are under attack (or extremely unlucky).
 *
 * As this limit is only to detect attacks, we don't need to set it to a
 * lower value as you'd need the chain length to vastly exceed 16 to have
 * any real effect on the system.
 *
 * __rhashtable_insert_fast() counts down from this value while walking
 * a chain and falls back to rhashtable_insert_slow() once it reaches zero.
 */
#define RHT_ELASTICITY        16u

/**
 * struct bucket_table - Table of hash buckets
 * @size: Number of hash buckets
 * @nest: Number of bits of first-level nested table.
 * @hash_rnd: Random seed to fold into hash
 * @walkers: List of active walkers
 * @rcu: RCU structure for freeing the table
 * @future_tbl: Table under construction during rehashing
 * @dep_map: Lockdep map acquired and released around the per-bucket bit lock
 * @buckets: size * hash buckets
 */
struct bucket_table {
        unsigned int                size;
        unsigned int                nest;
        u32                        hash_rnd;
        struct list_head        walkers;
        struct rcu_head                rcu;

        struct bucket_table __rcu *future_tbl;

        struct lockdep_map        dep_map;

        struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp;
};

/*
 * NULLS_MARKER() expects a hash value with the low
 * bits most likely to be significant, and it discards
 * the msb.
 * We give it an address, in which the bottom bit is
 * always 0, and the msb might be significant.
 * So we shift the address down one bit to align with
 * expectations and avoid losing a significant bit.
 *
 * We never store the NULLS_MARKER in the hash table
 * itself as we need the lsb for locking.
 * Instead we store a NULL, which is what INIT_RHT_NULLS_HEAD() assigns.
 */
#define        RHT_NULLS_MARKER(ptr)        \
        ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1))
#define INIT_RHT_NULLS_HEAD(ptr)        \
        ((ptr) = NULL)

/* Report whether @ptr is a nulls marker, i.e. has its lowest bit set. */
static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
{
        unsigned long addr = (unsigned long)ptr;

        return (addr & 1UL) != 0;
}

/*
 * Translate an embedded hash head @he back to the start of its object by
 * stepping back ht->p.head_offset bytes.
 */
static inline void *rht_obj(const struct rhashtable *ht,
                            const struct rhash_head *he)
{
        char *head_addr = (char *)he;

        return head_addr - ht->p.head_offset;
}

/* Reduce @hash to a bucket index by masking it with (tbl->size - 1). */
static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
                                            unsigned int hash)
{
        const unsigned int mask = tbl->size - 1;

        return hash & mask;
}

/*
 * Hash @key with seed @hash_rnd.
 *
 * Three cases are distinguished:
 *  - params.key_len is not a compile-time constant: use the hash function
 *    and key length stored in the table itself;
 *  - params.key_len is a non-zero constant: use params.hashfn if set,
 *    otherwise jhash2() for lengths that are a multiple of sizeof(u32)
 *    and jhash() for all other lengths;
 *  - params.key_len is constant zero: take the length from ht->p.key_len
 *    and use params.hashfn if set, otherwise jhash().
 */
static inline unsigned int rht_key_get_hash(struct rhashtable *ht,
        const void *key, const struct rhashtable_params params,
        unsigned int hash_rnd)
{
        unsigned int len;

        /* params must be equal to ht->p if it isn't constant. */
        if (!__builtin_constant_p(params.key_len))
                return ht->p.hashfn(key, ht->key_len, hash_rnd);

        if (!params.key_len) {
                len = ht->p.key_len;

                if (params.hashfn)
                        return params.hashfn(key, len, hash_rnd);

                return jhash(key, len, hash_rnd);
        }

        len = params.key_len;

        if (params.hashfn)
                return params.hashfn(key, len, hash_rnd);

        if (len & (sizeof(u32) - 1))
                return jhash(key, len, hash_rnd);

        return jhash2(key, len / sizeof(u32), hash_rnd);
}

/* Hash @key with the seed of @tbl and map the result to a bucket index. */
static inline unsigned int rht_key_hashfn(
        struct rhashtable *ht, const struct bucket_table *tbl,
        const void *key, const struct rhashtable_params params)
{
        return rht_bucket_index(tbl,
                                rht_key_get_hash(ht, key, params,
                                                 tbl->hash_rnd));
}

/*
 * Compute the bucket index in @tbl for the object that embeds @he.
 *
 * When params.obj_hashfn is set it hashes the whole object, using
 * params.key_len or, if that is zero, ht->p.key_len as the length.
 * Otherwise the key found at params.key_offset inside the object is
 * hashed through rht_key_hashfn().
 */
static inline unsigned int rht_head_hashfn(
        struct rhashtable *ht, const struct bucket_table *tbl,
        const struct rhash_head *he, const struct rhashtable_params params)
{
        const char *obj = rht_obj(ht, he);

        if (likely(params.obj_hashfn)) {
                unsigned int len = params.key_len ? params.key_len :
                                                    ht->p.key_len;

                return rht_bucket_index(tbl,
                                        params.obj_hashfn(obj, len,
                                                          tbl->hash_rnd));
        }

        return rht_key_hashfn(ht, tbl, obj + params.key_offset, params);
}

/**
 * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
 * @ht:                hash table
 * @tbl:        current table
 *
 * Also requires that the table may still grow: either no max_size is
 * configured or the table is smaller than it.
 */
static inline bool rht_grow_above_75(const struct rhashtable *ht,
                                     const struct bucket_table *tbl)
{
        const unsigned int threshold = tbl->size / 4 * 3;

        /* Nothing to do until the load exceeds 75% */
        if (!(atomic_read(&ht->nelems) > threshold))
                return false;

        return !ht->p.max_size || tbl->size < ht->p.max_size;
}

/**
 * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
 * @ht:                hash table
 * @tbl:        current table
 *
 * Also requires the table to be larger than the configured min_size.
 */
static inline bool rht_shrink_below_30(const struct rhashtable *ht,
                                       const struct bucket_table *tbl)
{
        const unsigned int low_mark = tbl->size * 3 / 10;

        /* Only shrink once the load has dropped beneath 30% */
        if (!(atomic_read(&ht->nelems) < low_mark))
                return false;

        return tbl->size > ht->p.min_size;
}

/**
 * rht_grow_above_100 - returns true if nelems > table-size
 * @ht:                hash table
 * @tbl:        current table
 *
 * Also requires that the table may still grow: either no max_size is
 * configured or the table is smaller than it.
 */
static inline bool rht_grow_above_100(const struct rhashtable *ht,
                                      const struct bucket_table *tbl)
{
        if (!(atomic_read(&ht->nelems) > tbl->size))
                return false;

        return !ht->p.max_size || tbl->size < ht->p.max_size;
}

/**
 * rht_grow_above_max - returns true if table is at or above maximum
 * @ht:                hash table
 * @tbl:        current table (not used by the check)
 *
 * Compares the element count against ht->max_elems.
 */
static inline bool rht_grow_above_max(const struct rhashtable *ht,
                                      const struct bucket_table *tbl)
{
        int count = atomic_read(&ht->nelems);

        return count >= ht->max_elems;
}

#ifdef CONFIG_PROVE_LOCKING
/*
 * Lockdep predicates used as the condition of the rht_dereference*()
 * macros.  With CONFIG_PROVE_LOCKING they are defined out of line.
 */
int lockdep_rht_mutex_is_held(struct rhashtable *ht);
int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
#else
/* Without CONFIG_PROVE_LOCKING both predicates unconditionally return 1. */
static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
        return 1;
}

static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl,
                                             u32 hash)
{
        return 1;
}
#endif /* CONFIG_PROVE_LOCKING */

/*
 * Out-of-line insertion path.  __rhashtable_insert_fast() hands over to it
 * and returns its result whenever the fast path cannot complete the insert.
 */
void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
                             struct rhash_head *obj);

/* Table walker API; the start/stop pair is annotated as taking/dropping RCU. */
void rhashtable_walk_enter(struct rhashtable *ht,
                           struct rhashtable_iter *iter);
void rhashtable_walk_exit(struct rhashtable_iter *iter);
int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU);

/* Same as rhashtable_walk_start_check() but discards its return value. */
static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
{
        (void)rhashtable_walk_start_check(iter);
}

void *rhashtable_walk_next(struct rhashtable_iter *iter);
void *rhashtable_walk_peek(struct rhashtable_iter *iter);
void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);

void rhashtable_free_and_destroy(struct rhashtable *ht,
                                 void (*free_fn)(void *ptr, void *arg),
                                 void *arg);
void rhashtable_destroy(struct rhashtable *ht);

/*
 * Bucket lookup for nested tables; rht_bucket(), rht_bucket_var() and
 * rht_bucket_insert() call these when tbl->nest is non-zero.
 */
struct rhash_lock_head __rcu **rht_bucket_nested(
        const struct bucket_table *tbl, unsigned int hash);
struct rhash_lock_head __rcu **__rht_bucket_nested(
        const struct bucket_table *tbl, unsigned int hash);
struct rhash_lock_head __rcu **rht_bucket_nested_insert(
        struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash);

/*
 * RCU dereference helpers.  The plain variants expand to
 * rcu_dereference_protected() and the _rcu variants to
 * rcu_dereference_check().  The lockdep condition is the table mutex
 * (lockdep_rht_mutex_is_held()) for the first pair and the bucket lock
 * (lockdep_rht_bucket_is_held()) for the _bucket pair.
 */
#define rht_dereference(p, ht) \
        rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht))

#define rht_dereference_rcu(p, ht) \
        rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht))

#define rht_dereference_bucket(p, tbl, hash) \
        rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash))

#define rht_dereference_bucket_rcu(p, tbl, hash) \
        rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash))

/*
 * Point @tpos at the object that embeds @pos as @member.  The expression
 * always evaluates to 1 so it can be chained with && in a loop condition.
 */
#define rht_entry(tpos, pos, member) \
        ({ tpos = container_of(pos, typeof(*tpos), member); 1; })

/*
 * Return a read-only pointer to the bucket slot for @hash in @tbl.
 * Nested tables are resolved through rht_bucket_nested().
 */
static inline struct rhash_lock_head __rcu *const *rht_bucket(
        const struct bucket_table *tbl, unsigned int hash)
{
        if (unlikely(tbl->nest))
                return rht_bucket_nested(tbl, hash);

        return &tbl->buckets[hash];
}

/*
 * Return a writable pointer to the bucket slot for @hash in @tbl.
 * Nested tables are resolved through __rht_bucket_nested().
 */
static inline struct rhash_lock_head __rcu **rht_bucket_var(
        struct bucket_table *tbl, unsigned int hash)
{
        if (unlikely(tbl->nest))
                return __rht_bucket_nested(tbl, hash);

        return &tbl->buckets[hash];
}

/*
 * Return a writable pointer to the bucket slot for @hash in @tbl, for use
 * by insertion.  Nested tables are resolved through
 * rht_bucket_nested_insert(); callers check the result for NULL.
 */
static inline struct rhash_lock_head __rcu **rht_bucket_insert(
        struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
{
        if (unlikely(tbl->nest))
                return rht_bucket_nested_insert(ht, tbl, hash);

        return &tbl->buckets[hash];
}

/*
 * We lock a bucket by setting BIT(0) in the pointer - this is always
 * zero in real pointers.  The NULLS mark is never stored in the bucket,
 * rather we store NULL if the bucket is empty.
 * bit_spin_locks do not handle contention well, but the whole point
 * of the hashtable design is to achieve minimum per-bucket contention.
 * A nested hash table might not have a bucket pointer.  In that case
 * we cannot get a lock.  For remove and replace the bucket cannot be
 * interesting and doesn't need locking.
 * For insert we allocate the bucket if this is the last bucket_table,
 * and then take the lock.
 * Sometimes we unlock a bucket by writing a new pointer there.  In that
 * case we don't need to unlock, but we do need to reset state such as
 * preemption and the saved interrupt flags.  For that we have
 * rht_assign_unlock().  As rcu_assign_pointer() provides the same
 * release semantics that bit_spin_unlock() provides, this is safe.
 * When we write to a bucket without unlocking, we use rht_assign_locked().
 */

/*
 * Lock bucket @bkt of @tbl.
 *
 * Saves and disables local interrupts, takes the bit spinlock on bit 0 of
 * the bucket word and records the acquisition in the table's lockdep map.
 * Returns the saved interrupt flags, which the caller passes back to
 * rht_unlock() or rht_assign_unlock().
 */
static inline unsigned long rht_lock(struct bucket_table *tbl,
                                     struct rhash_lock_head __rcu **bkt)
{
        unsigned long flags;

        local_irq_save(flags);
        bit_spin_lock(0, (unsigned long *)bkt);
        lock_map_acquire(&tbl->dep_map);
        return flags;
}

/*
 * Same as rht_lock(), but the lockdep acquisition is recorded with the
 * given @subclass via lock_acquire_exclusive().
 * Returns the saved interrupt flags.
 */
static inline unsigned long rht_lock_nested(struct bucket_table *tbl,
                                        struct rhash_lock_head __rcu **bucket,
                                        unsigned int subclass)
{
        unsigned long flags;

        local_irq_save(flags);
        bit_spin_lock(0, (unsigned long *)bucket);
        lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_);
        return flags;
}

/*
 * Unlock bucket @bkt of @tbl, undoing rht_lock() in reverse order:
 * release the lockdep map, drop the bit spinlock on bit 0, then restore
 * the interrupt state from @flags (the value rht_lock() returned).
 */
static inline void rht_unlock(struct bucket_table *tbl,
                              struct rhash_lock_head __rcu **bkt,
                              unsigned long flags)
{
        lock_map_release(&tbl->dep_map);
        bit_spin_unlock(0, (unsigned long *)bkt);
        local_irq_restore(flags);
}

/*
 * Convert the raw bucket value @p into a chain head: strip the lock bit
 * and, if nothing remains (empty bucket), substitute the nulls marker
 * derived from the bucket address @bkt.
 */
static inline struct rhash_head *__rht_ptr(
        struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt)
{
        unsigned long addr = (unsigned long)p & ~BIT(0);

        if (!addr)
                addr = (unsigned long)RHT_NULLS_MARKER(bkt);

        return (struct rhash_head *)addr;
}

/*
 * Where 'bkt' is a bucket and might be locked:
 *   rht_ptr_rcu() dereferences that pointer and clears the lock bit.
 *   rht_ptr() dereferences in a context where the bucket is locked.
 *   rht_ptr_exclusive() dereferences in a context where exclusive
 *            access is guaranteed, such as when destroying the table.
 */
/* Read the head of bucket @bkt with rcu_dereference(), lock bit cleared. */
static inline struct rhash_head *rht_ptr_rcu(
        struct rhash_lock_head __rcu *const *bkt)
{
        struct rhash_lock_head *raw = rcu_dereference(*bkt);

        return __rht_ptr(raw, bkt);
}

/*
 * Read the head of bucket @bkt with rht_dereference_bucket() for
 * (@tbl, @hash), lock bit cleared.
 */
static inline struct rhash_head *rht_ptr(
        struct rhash_lock_head __rcu *const *bkt,
        struct bucket_table *tbl,
        unsigned int hash)
{
        struct rhash_lock_head *raw = rht_dereference_bucket(*bkt, tbl, hash);

        return __rht_ptr(raw, bkt);
}

/*
 * Read the head of bucket @bkt with an unconditional
 * rcu_dereference_protected(), lock bit cleared.
 */
static inline struct rhash_head *rht_ptr_exclusive(
        struct rhash_lock_head __rcu *const *bkt)
{
        struct rhash_lock_head *raw = rcu_dereference_protected(*bkt, 1);

        return __rht_ptr(raw, bkt);
}

/*
 * Publish @obj as the new head of bucket @bkt while keeping the lock bit
 * set.  A nulls marker is stored as NULL, so an empty locked bucket holds
 * just BIT(0).
 */
static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt,
                                     struct rhash_head *obj)
{
        unsigned long head = rht_is_a_nulls(obj) ? 0UL : (unsigned long)obj;

        rcu_assign_pointer(*bkt, (void *)(head | BIT(0)));
}

/*
 * Publish @obj as the new head of bucket @bkt and release the bucket lock
 * in one step.
 *
 * The stored value has bit 0 clear, so the store itself drops the lock;
 * a nulls marker is stored as NULL.  @flags is the value returned by
 * rht_lock() and is used to restore the interrupt state.
 */
static inline void rht_assign_unlock(struct bucket_table *tbl,
                                     struct rhash_lock_head __rcu **bkt,
                                     struct rhash_head *obj,
                                     unsigned long flags)
{
        if (rht_is_a_nulls(obj))
                obj = NULL;
        lock_map_release(&tbl->dep_map);
        rcu_assign_pointer(*bkt, (void *)obj);
        /*
         * NOTE(review): presumably these two undo the preempt_disable() and
         * __acquire(bitlock) done inside bit_spin_lock() - confirm against
         * <linux/bit_spinlock.h>.
         */
        preempt_enable();
        __release(bitlock);
        local_irq_restore(flags);
}

/**
 * rht_for_each_from - iterate over hash chain from given head
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @head:        the &struct rhash_head to start from
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 *
 * The loop ends when @pos is a nulls marker.  Each step follows ->next
 * through rht_dereference_bucket(), so @tbl and @hash are evaluated on
 * every iteration.
 */
#define rht_for_each_from(pos, head, tbl, hash) \
        for (pos = head;                        \
             !rht_is_a_nulls(pos);                \
             pos = rht_dereference_bucket((pos)->next, tbl, hash))

/**
 * rht_for_each - iterate over hash chain
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 *
 * Starts from the head of the bucket selected by @hash, read via rht_ptr().
 */
#define rht_for_each(pos, tbl, hash) \
        rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash),  \
                          tbl, hash)

/**
 * rht_for_each_entry_from - iterate over hash chain from given head
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @head:        the &struct rhash_head to start from
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * @tpos is updated through rht_entry() before each pass of the loop body.
 */
#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member)        \
        for (pos = head;                                                \
             (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);        \
             pos = rht_dereference_bucket((pos)->next, tbl, hash))

/**
 * rht_for_each_entry - iterate over hash chain of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * Starts from the head of the bucket selected by @hash, read via rht_ptr().
 */
#define rht_for_each_entry(tpos, pos, tbl, hash, member)                \
        rht_for_each_entry_from(tpos, pos,                                \
                                rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
                                tbl, hash, member)

/**
 * rht_for_each_entry_safe - safely iterate over hash chain of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @next:        the &struct rhash_head to use as next in loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * This hash chain list-traversal primitive allows for the looped code to
 * remove the loop cursor from the list.
 *
 * @next is loaded before the loop body runs; it is set to NULL once @pos
 * is a nulls marker.
 */
#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member)              \
        for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash),                      \
             next = !rht_is_a_nulls(pos) ?                                      \
                       rht_dereference_bucket(pos->next, tbl, hash) : NULL;   \
             (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);              \
             pos = next,                                                      \
             next = !rht_is_a_nulls(pos) ?                                      \
                       rht_dereference_bucket(pos->next, tbl, hash) : NULL)

/**
 * rht_for_each_rcu_from - iterate over rcu hash chain from given head
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @head:        the &struct rhash_head to start from
 * @tbl:        the &struct bucket_table (not used by the expansion)
 * @hash:        the hash value / bucket index (not used by the expansion)
 *
 * This hash chain list-traversal primitive may safely run concurrently with
 * the _rcu mutation primitives such as rhashtable_insert() as long as the
 * traversal is guarded by rcu_read_lock().
 *
 * A compiler barrier() is issued before @head is evaluated, and ->next is
 * followed with rcu_dereference_raw().
 */
#define rht_for_each_rcu_from(pos, head, tbl, hash)                        \
        for (({barrier(); }),                                                \
             pos = head;                                                \
             !rht_is_a_nulls(pos);                                        \
             pos = rcu_dereference_raw(pos->next))

/**
 * rht_for_each_rcu - iterate over rcu hash chain
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 *
 * This hash chain list-traversal primitive may safely run concurrently with
 * the _rcu mutation primitives such as rhashtable_insert() as long as the
 * traversal is guarded by rcu_read_lock().
 *
 * Starts from the head of the bucket selected by @hash, read via
 * rht_ptr_rcu().
 */
#define rht_for_each_rcu(pos, tbl, hash)                        \
        for (({barrier(); }),                                        \
             pos = rht_ptr_rcu(rht_bucket(tbl, hash));                \
             !rht_is_a_nulls(pos);                                \
             pos = rcu_dereference_raw(pos->next))

/**
 * rht_for_each_entry_rcu_from - iterate over rcu hash chain from given head
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @head:        the &struct rhash_head to start from
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * This hash chain list-traversal primitive may safely run concurrently with
 * the _rcu mutation primitives such as rhashtable_insert() as long as the
 * traversal is guarded by rcu_read_lock().
 */
#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \
        for (({barrier(); }),                                                    \
             pos = head;                                                    \
             (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);            \
             pos = rht_dereference_bucket_rcu(pos->next, tbl, hash))

/**
 * rht_for_each_entry_rcu - iterate over rcu hash chain of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhash_head to use as a loop cursor.
 * @tbl:        the &struct bucket_table
 * @hash:        the hash value / bucket index
 * @member:        name of the &struct rhash_head within the hashable struct.
 *
 * This hash chain list-traversal primitive may safely run concurrently with
 * the _rcu mutation primitives such as rhashtable_insert() as long as the
 * traversal is guarded by rcu_read_lock().
 *
 * Starts from the head of the bucket selected by @hash, read via
 * rht_ptr_rcu().
 */
#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)                   \
        rht_for_each_entry_rcu_from(tpos, pos,                                   \
                                    rht_ptr_rcu(rht_bucket(tbl, hash)),           \
                                    tbl, hash, member)

/**
 * rhl_for_each_rcu - iterate over rcu hash table list
 * @pos:        the &struct rhlist_head to use as a loop cursor.
 * @list:        the head of the list
 *
 * This hash chain list-traversal primitive should be used on the
 * list returned by rhltable_lookup.  The list ends at a NULL ->next.
 */
#define rhl_for_each_rcu(pos, list)                                        \
        for (pos = list; pos; pos = rcu_dereference_raw(pos->next))

/**
 * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct rhlist_head to use as a loop cursor.
 * @list:        the head of the list
 * @member:        name of the &struct rhlist_head within the hashable struct.
 *
 * This hash chain list-traversal primitive should be used on the
 * list returned by rhltable_lookup.  The list ends at a NULL ->next.
 */
#define rhl_for_each_entry_rcu(tpos, pos, list, member)                        \
        for (pos = list; pos && rht_entry(tpos, pos, member);                \
             pos = rcu_dereference_raw(pos->next))

/*
 * Default key comparison: memcmp() the key stored in @obj (at
 * ht->p.key_offset, ht->p.key_len bytes) against arg->key.
 * Returns 0 when the keys are equal.
 */
static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
                                     const void *obj)
{
        struct rhashtable *table = arg->ht;
        const char *stored_key = (const char *)obj + table->p.key_offset;

        return memcmp(stored_key, arg->key, table->p.key_len);
}

/* Internal function, do not use.
 *
 * Look up @key and return the matching &struct rhash_head, or NULL if no
 * entry matches in the current table or in any table reachable through
 * ->future_tbl.  An entry matches when params.obj_cmpfn (or, if that is
 * unset, rhashtable_compare()) returns 0 for it.
 */
static inline struct rhash_head *__rhashtable_lookup(
        struct rhashtable *ht, const void *key,
        const struct rhashtable_params params)
{
        struct rhashtable_compare_arg arg = {
                .ht = ht,
                .key = key,
        };
        struct rhash_lock_head __rcu *const *bkt;
        struct bucket_table *tbl;
        struct rhash_head *he;
        unsigned int hash;

        tbl = rht_dereference_rcu(ht->tbl, ht);
restart:
        /* The bucket index depends on the table, so recompute it per table. */
        hash = rht_key_hashfn(ht, tbl, key, params);
        bkt = rht_bucket(tbl, hash);
        do {
                rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) {
                        /* A non-zero compare result means "no match". */
                        if (params.obj_cmpfn ?
                            params.obj_cmpfn(&arg, rht_obj(ht, he)) :
                            rhashtable_compare(&arg, rht_obj(ht, he)))
                                continue;
                        return he;
                }
                /* An object might have been moved to a different hash chain,
                 * while we walk along it - better check and retry.
                 * The walk ended on this bucket's chain only if the final
                 * nulls marker is the one derived from this bucket.
                 */
        } while (he != RHT_NULLS_MARKER(bkt));

        /* Ensure we see any new tables. */
        smp_rmb();

        /* Repeat the search in the table under construction, if any. */
        tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        if (unlikely(tbl))
                goto restart;

        return NULL;
}

/**
 * rhashtable_lookup - search hash table
 * @ht:                hash table
 * @key:        the pointer to the key
 * @params:        hash table parameters
 *
 * Computes the hash value for the key and traverses the bucket chain looking
 * for an entry with an identical key. The first matching entry is returned.
 *
 * This must only be called under the RCU read lock.
 *
 * Returns the first object for which the compare function reported a match
 * (returned 0), or NULL if there is none.
 */
static inline void *rhashtable_lookup(
        struct rhashtable *ht, const void *key,
        const struct rhashtable_params params)
{
        struct rhash_head *match = __rhashtable_lookup(ht, key, params);

        if (!match)
                return NULL;

        return rht_obj(ht, match);
}

/**
 * rhashtable_lookup_fast - search hash table, without RCU read lock
 * @ht:                hash table
 * @key:        the pointer to the key
 * @params:        hash table parameters
 *
 * Wraps rhashtable_lookup() in rcu_read_lock()/rcu_read_unlock(), so the
 * caller does not need to hold the RCU read lock itself.
 *
 * Only use this function when you have other mechanisms guaranteeing
 * that the object won't go away after the RCU read lock is released.
 *
 * Returns the first object for which the compare function reported a match
 * (returned 0), or NULL if there is none.
 */
static inline void *rhashtable_lookup_fast(
        struct rhashtable *ht, const void *key,
        const struct rhashtable_params params)
{
        void *found;

        rcu_read_lock();
        found = rhashtable_lookup(ht, key, params);
        rcu_read_unlock();

        return found;
}

/**
 * rhltable_lookup - search hash list table
 * @hlt:        hash table
 * @key:        the pointer to the key
 * @params:        hash table parameters
 *
 * Computes the hash value for the key and traverses the bucket chain looking
 * for an entry with an identical key.  All matching entries are returned
 * in a list.
 *
 * This must only be called under the RCU read lock.
 *
 * Returns the list of entries that match the given key, or NULL if no
 * entry matches.
 */
static inline struct rhlist_head *rhltable_lookup(
        struct rhltable *hlt, const void *key,
        const struct rhashtable_params params)
{
        struct rhash_head *match = __rhashtable_lookup(&hlt->ht, key, params);

        if (!match)
                return NULL;

        return container_of(match, struct rhlist_head, rhead);
}

/* Internal function, please use rhashtable_insert_fast() instead.
 *
 * Return value:
 *  - NULL when @obj was inserted (for @rhlist, also when it was linked in
 *    front of an existing entry with the same key);
 *  - the existing object when @key matches an entry and @rhlist is false;
 *  - ERR_PTR(-ENOMEM) when no bucket could be obtained;
 *  - ERR_PTR(-E2BIG) when rht_grow_above_max() reports the table is full;
 *  - whatever rhashtable_insert_slow() returns when the fast path gives up
 *    (a future table exists, the chain walk used up RHT_ELASTICITY, or the
 *    table is above 100% load).
 *
 * With a NULL @key no comparison is done and every chain entry is skipped.
 */
static inline void *__rhashtable_insert_fast(
        struct rhashtable *ht, const void *key, struct rhash_head *obj,
        const struct rhashtable_params params, bool rhlist)
{
        struct rhashtable_compare_arg arg = {
                .ht = ht,
                .key = key,
        };
        struct rhash_lock_head __rcu **bkt;
        struct rhash_head __rcu **pprev;
        struct bucket_table *tbl;
        struct rhash_head *head;
        unsigned long flags;
        unsigned int hash;
        int elasticity;
        void *data;

        rcu_read_lock();

        tbl = rht_dereference_rcu(ht->tbl, ht);
        hash = rht_head_hashfn(ht, tbl, obj, params);
        elasticity = RHT_ELASTICITY;
        bkt = rht_bucket_insert(ht, tbl, hash);
        data = ERR_PTR(-ENOMEM);
        if (!bkt)
                goto out;
        /* pprev stays NULL while the walk is still at the bucket head. */
        pprev = NULL;
        flags = rht_lock(tbl, bkt);

        if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
                /* Also the target of the "goto slow_path" jumps below. */
slow_path:
                rht_unlock(tbl, bkt, flags);
                rcu_read_unlock();
                return rhashtable_insert_slow(ht, key, obj);
        }

        rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
                struct rhlist_head *plist;
                struct rhlist_head *list;

                elasticity--;
                /* No key given, or compare reported a mismatch: keep going. */
                if (!key ||
                    (params.obj_cmpfn ?
                     params.obj_cmpfn(&arg, rht_obj(ht, head)) :
                     rhashtable_compare(&arg, rht_obj(ht, head)))) {
                        pprev = &head->next;
                        continue;
                }

                /* Key already present: report the existing object ... */
                data = rht_obj(ht, head);

                if (!rhlist)
                        goto out_unlock;


                /*
                 * ... unless this is an rhltable: put @obj in the chain in
                 * place of the matching entry and hang that entry off
                 * @obj's list.
                 */
                list = container_of(obj, struct rhlist_head, rhead);
                plist = container_of(head, struct rhlist_head, rhead);

                RCU_INIT_POINTER(list->next, plist);
                head = rht_dereference_bucket(head->next, tbl, hash);
                RCU_INIT_POINTER(list->rhead.next, head);
                if (pprev) {
                        rcu_assign_pointer(*pprev, obj);
                        rht_unlock(tbl, bkt, flags);
                } else
                        rht_assign_unlock(tbl, bkt, obj, flags);
                data = NULL;
                goto out;
        }

        /* The chain held at least RHT_ELASTICITY entries. */
        if (elasticity <= 0)
                goto slow_path;

        data = ERR_PTR(-E2BIG);
        if (unlikely(rht_grow_above_max(ht, tbl)))
                goto out_unlock;

        if (unlikely(rht_grow_above_100(ht, tbl)))
                goto slow_path;

        /* Inserting at head of list makes unlocking free. */
        head = rht_ptr(bkt, tbl, hash);

        RCU_INIT_POINTER(obj->next, head);
        if (rhlist) {
                struct rhlist_head *list;

                list = container_of(obj, struct rhlist_head, rhead);
                RCU_INIT_POINTER(list->next, NULL);
        }

        atomic_inc(&ht->nelems);
        rht_assign_unlock(tbl, bkt, obj, flags);

        /* Above 75% load: schedule the deferred worker (ht->run_work). */
        if (rht_grow_above_75(ht, tbl))
                schedule_work(&ht->run_work);

        data = NULL;
out:
        rcu_read_unlock();

        return data;

out_unlock:
        rht_unlock(tbl, bkt, flags);
        goto out;
}

/**
 * rhashtable_insert_fast - insert object into hash table
 * @ht:                hash table
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Will take the per bucket bitlock to protect against mutual mutations
 * on the same bucket. Multiple insertions may occur in parallel unless
 * they map to the same bucket.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 75%.
 *
 * Returns zero on success, -EEXIST if the insert helper returned an
 * existing object, or the negative error it encoded.
 */
static inline int rhashtable_insert_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        void *res = __rhashtable_insert_fast(ht, NULL, obj, params, false);

        if (IS_ERR(res))
                return PTR_ERR(res);

        if (res)
                return -EEXIST;

        return 0;
}

/**
 * rhltable_insert_key - insert object into hash list table
 * @hlt:        hash list table
 * @key:        the pointer to the key
 * @list:        pointer to hash list head inside object
 * @params:        hash table parameters
 *
 * Will take the per bucket bitlock to protect against mutual mutations
 * on the same bucket. Multiple insertions may occur in parallel unless
 * they map to the same bucket.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 75%.
 *
 * Returns PTR_ERR() of the insert helper's result.
 */
static inline int rhltable_insert_key(
        struct rhltable *hlt, const void *key, struct rhlist_head *list,
        const struct rhashtable_params params)
{
        void *res = __rhashtable_insert_fast(&hlt->ht, key, &list->rhead,
                                             params, true);

        return PTR_ERR(res);
}

/**
 * rhltable_insert - insert object into hash list table
 * @hlt:        hash list table
 * @list:        pointer to hash list head inside object
 * @params:        hash table parameters
 *
 * Derives the key from the object itself (at params.key_offset) and
 * forwards to rhltable_insert_key().
 *
 * Will take the per bucket bitlock to protect against mutual mutations
 * on the same bucket. Multiple insertions may occur in parallel unless
 * they map to the same bucket.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 75%.
 */
static inline int rhltable_insert(
        struct rhltable *hlt, struct rhlist_head *list,
        const struct rhashtable_params params)
{
        const char *object = rht_obj(&hlt->ht, &list->rhead);

        return rhltable_insert_key(hlt, object + params.key_offset, list,
                                   params);
}

/**
 * rhashtable_lookup_insert_fast - lookup and insert object into hash table
 * @ht:                hash table
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * This lookup function may only be used for fixed key hash table (key_len
 * parameter set). It will BUG() if used inappropriately.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 70%.
 */
static inline int rhashtable_lookup_insert_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        const char *obj_base = rht_obj(ht, obj);
        void *found;

        /* Only valid for tables without a custom object hash function. */
        BUG_ON(ht->p.obj_hashfn);

        found = __rhashtable_insert_fast(ht, obj_base + ht->p.key_offset, obj,
                                         params, false);

        /* Error pointer: hand the errno back to the caller. */
        if (IS_ERR(found))
                return PTR_ERR(found);

        /* Any other non-NULL result is reported as -EEXIST. */
        if (found)
                return -EEXIST;

        return 0;
}

/**
 * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table
 * @ht:                hash table
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Just like rhashtable_lookup_insert_fast(), but this function returns the
 * object if it exists, NULL if it did not and the insertion was successful,
 * and an ERR_PTR otherwise.
 */
static inline void *rhashtable_lookup_get_insert_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        const char *obj_base = rht_obj(ht, obj);
        const void *key;

        /* Only valid for tables without a custom object hash function. */
        BUG_ON(ht->p.obj_hashfn);

        /* The key sits key_offset bytes into the containing object. */
        key = obj_base + ht->p.key_offset;

        return __rhashtable_insert_fast(ht, key, obj, params, false);
}

/**
 * rhashtable_lookup_insert_key - search and insert object to hash table
 *                                  with explicit key
 * @ht:                hash table
 * @key:        key
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Lookups may occur in parallel with hashtable mutations and resizing.
 *
 * Will trigger an automatic deferred table resizing if residency in the
 * table grows beyond 70%.
 *
 * Returns zero on success.
 */
static inline int rhashtable_lookup_insert_key(
        struct rhashtable *ht, const void *key, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        void *existing;

        /* Requires a custom object hash function and a non-NULL key. */
        BUG_ON(!ht->p.obj_hashfn || !key);

        existing = __rhashtable_insert_fast(ht, key, obj, params, false);

        /* Error pointer: hand the errno back to the caller. */
        if (IS_ERR(existing))
                return PTR_ERR(existing);

        /* Any other non-NULL result is reported as -EEXIST. */
        if (existing)
                return -EEXIST;

        return 0;
}

/**
 * rhashtable_lookup_get_insert_key - lookup and insert object into hash table
 * @ht:                hash table
 * @key:        key
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Just like rhashtable_lookup_insert_key(), but this function returns the
 * object if it exists, NULL if it does not and the insertion was successful,
 * and an ERR_PTR otherwise.
 */
static inline void *rhashtable_lookup_get_insert_key(
        struct rhashtable *ht, const void *key, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        void *res;

        /* Requires a custom object hash function and a non-NULL key. */
        BUG_ON(!ht->p.obj_hashfn || !key);

        /* The raw result (object, NULL or ERR_PTR) goes back unchanged. */
        res = __rhashtable_insert_fast(ht, key, obj, params, false);

        return res;
}

/*
 * Internal function, please use rhashtable_remove_fast() instead.
 *
 * Try to unlink @obj from the single bucket of @tbl that @obj hashes to.
 * When @rhlist is true, each entry on the bucket chain is additionally
 * treated as the head of a list of struct rhlist_head linked via ->next,
 * and @obj may be either such a head or a later member of one of those
 * lists.
 *
 * Returns 0 if @obj was unlinked, -ENOENT if the bucket does not exist or
 * @obj was not found in it.
 */
static inline int __rhashtable_remove_fast_one(
        struct rhashtable *ht, struct bucket_table *tbl,
        struct rhash_head *obj, const struct rhashtable_params params,
        bool rhlist)
{
        struct rhash_lock_head __rcu **bkt;
        struct rhash_head __rcu **pprev;
        struct rhash_head *he;
        unsigned long flags;
        unsigned int hash;
        int err = -ENOENT;

        hash = rht_head_hashfn(ht, tbl, obj, params);
        bkt = rht_bucket_var(tbl, hash);
        if (!bkt)
                return -ENOENT;
        /* NULL pprev means "the predecessor is the bucket head itself". */
        pprev = NULL;
        flags = rht_lock(tbl, bkt);

        rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
                struct rhlist_head *list;

                list = container_of(he, struct rhlist_head, rhead);

                if (he != obj) {
                        struct rhlist_head __rcu **lpprev;

                        /* Remember the link that points past this entry. */
                        pprev = &he->next;

                        if (!rhlist)
                                continue;

                        /* Search the list hanging off this entry for @obj. */
                        do {
                                lpprev = &list->next;
                                list = rht_dereference_bucket(list->next,
                                                              tbl, hash);
                        } while (list && obj != &list->rhead);

                        if (!list)
                                continue;

                        /*
                         * @obj is a non-head list member: bypass it in the
                         * list.  The bucket chain itself is unchanged, so
                         * err stays at 0 and nelems is not decremented below.
                         */
                        list = rht_dereference_bucket(list->next, tbl, hash);
                        RCU_INIT_POINTER(*lpprev, list);
                        err = 0;
                        break;
                }

                /*
                 * @obj is on the bucket chain.  From here on 'obj' holds the
                 * entry that will take its place in the chain; err = 1
                 * requests the nelems decrement after unlocking.
                 */
                obj = rht_dereference_bucket(obj->next, tbl, hash);
                err = 1;

                if (rhlist) {
                        /*
                         * If the removed head has a list successor, promote
                         * that successor into the bucket chain instead; no
                         * nelems decrement in that case (err = 0).
                         */
                        list = rht_dereference_bucket(list->next, tbl, hash);
                        if (list) {
                                RCU_INIT_POINTER(list->rhead.next, obj);
                                obj = &list->rhead;
                                err = 0;
                        }
                }

                /* Splice the replacement in and drop the bucket lock. */
                if (pprev) {
                        rcu_assign_pointer(*pprev, obj);
                        rht_unlock(tbl, bkt, flags);
                } else {
                        rht_assign_unlock(tbl, bkt, obj, flags);
                }
                goto unlocked;
        }

        /* Reached when the walk ended or the list-member case did 'break'. */
        rht_unlock(tbl, bkt, flags);
unlocked:
        if (err > 0) {
                /* A bucket-chain entry went away: account for it. */
                atomic_dec(&ht->nelems);
                if (unlikely(ht->p.automatic_shrinking &&
                             rht_shrink_below_30(ht, tbl)))
                        schedule_work(&ht->run_work);
                err = 0;
        }

        return err;
}

/*
 * Internal function, please use rhashtable_remove_fast() instead.
 *
 * Runs __rhashtable_remove_fast_one() on the current table and then on each
 * table reachable through ->future_tbl, stopping at the first table that
 * reports success (0) or when there is no further table.  Returns the result
 * of the last attempt.
 */
static inline int __rhashtable_remove_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params, bool rhlist)
{
        struct bucket_table *tbl;
        int err;

        rcu_read_lock();

        tbl = rht_dereference_rcu(ht->tbl, ht);

        /* Because we have already taken (and released) the bucket
         * lock in old_tbl, if we find that future_tbl is not yet
         * visible then that guarantees the entry to still be in
         * the old tbl if it exists.
         */
        do {
                err = __rhashtable_remove_fast_one(ht, tbl, obj, params,
                                                   rhlist);
                if (!err)
                        break;

                tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        } while (tbl);

        rcu_read_unlock();

        return err;
}

/**
 * rhashtable_remove_fast - remove object from hash table
 * @ht:                hash table
 * @obj:        pointer to hash head inside object
 * @params:        hash table parameters
 *
 * Since the hash chain is single linked, the removal operation needs to
 * walk the bucket chain upon removal. The removal operation is thus
 * considerably slower if the hash table is not correctly sized.
 *
 * Will automatically shrink the table if permitted when residency drops
 * below 30%.
 *
 * Returns zero on success, -ENOENT if the entry could not be found.
 */
static inline int rhashtable_remove_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
{
        return __rhashtable_remove_fast(ht, obj, params, false);
}

/**
 * rhltable_remove - remove object from hash list table
 * @hlt:        hash list table
 * @list:        pointer to hash list head inside object
 * @params:        hash table parameters
 *
 * Since the hash chain is single linked, the removal operation needs to
 * walk the bucket chain upon removal. The removal operation is thus
 * considerably slower if the hash table is not correctly sized.
 *
 * Will automatically shrink the table if permitted when residency drops
 * below 30%
 *
 * Returns zero on success, -ENOENT if the entry could not be found.
 */
static inline int rhltable_remove(
        struct rhltable *hlt, struct rhlist_head *list,
        const struct rhashtable_params params)
{
        return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true);
}

/*
 * Internal function, please use rhashtable_replace_fast() instead.
 *
 * Swap @obj_new in for @obj_old within the bucket of @tbl that both hash to.
 * Returns 0 on success, -EINVAL if the two objects hash differently in @tbl,
 * and -ENOENT if the bucket is missing or @obj_old is not on its chain.
 */
static inline int __rhashtable_replace_fast(
        struct rhashtable *ht, struct bucket_table *tbl,
        struct rhash_head *obj_old, struct rhash_head *obj_new,
        const struct rhashtable_params params)
{
        struct rhash_lock_head __rcu **bkt;
        struct rhash_head __rcu **pprev = NULL;
        struct rhash_head *he;
        unsigned long flags;
        unsigned int hash;

        /* Minimally, the old and new objects must have same hash
         * (which should mean identifiers are the same).
         */
        hash = rht_head_hashfn(ht, tbl, obj_old, params);
        if (rht_head_hashfn(ht, tbl, obj_new, params) != hash)
                return -EINVAL;

        bkt = rht_bucket_var(tbl, hash);
        if (!bkt)
                return -ENOENT;

        flags = rht_lock(tbl, bkt);

        rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
                if (he == obj_old) {
                        /* The new object inherits the old one's successor. */
                        rcu_assign_pointer(obj_new->next, obj_old->next);

                        /*
                         * A NULL pprev means obj_old was first on the chain,
                         * so the bucket head itself is rewritten on unlock.
                         */
                        if (pprev) {
                                rcu_assign_pointer(*pprev, obj_new);
                                rht_unlock(tbl, bkt, flags);
                        } else {
                                rht_assign_unlock(tbl, bkt, obj_new, flags);
                        }
                        return 0;
                }

                pprev = &he->next;
        }

        /* Walked the whole chain without meeting obj_old. */
        rht_unlock(tbl, bkt, flags);

        return -ENOENT;
}

/**
 * rhashtable_replace_fast - replace an object in hash table
 * @ht:                hash table
 * @obj_old:        pointer to hash head inside object being replaced
 * @obj_new:        pointer to hash head inside object which is new
 * @params:        hash table parameters
 *
 * Replacing an object doesn't affect the number of elements in the hash table
 * or bucket, so we don't need to worry about shrinking or expanding the
 * table here.
 *
 * Returns zero on success, -ENOENT if the entry could not be found,
 * -EINVAL if hash is not the same for the old and new objects.
 */
static inline int rhashtable_replace_fast(
        struct rhashtable *ht, struct rhash_head *obj_old,
        struct rhash_head *obj_new,
        const struct rhashtable_params params)
{
        struct bucket_table *tbl;
        int err;

        rcu_read_lock();

        tbl = rht_dereference_rcu(ht->tbl, ht);

        /* Because we have already taken (and released) the bucket
         * lock in old_tbl, if we find that future_tbl is not yet
         * visible then that guarantees the entry to still be in
         * the old tbl if it exists.
         */
        do {
                err = __rhashtable_replace_fast(ht, tbl, obj_old, obj_new,
                                                params);
                if (!err)
                        break;

                /* Any non-zero result moves on to the next table, if any. */
                tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        } while (tbl);

        rcu_read_unlock();

        return err;
}

/**
 * rhltable_walk_enter - Initialise an iterator
 * @hlt:        Table to walk over
 * @iter:        Hash table Iterator
 *
 * This function prepares a hash table walk.
 *
 * Note that if you restart a walk after rhashtable_walk_stop you
 * may see the same object twice.  Also, you may miss objects if
 * there are removals in between rhashtable_walk_stop and the next
 * call to rhashtable_walk_start.
 *
 * For a completely stable walk you should construct your own data
 * structure outside the hash table.
 *
 * This function may be called from any process context, including
 * non-preemptable context, but cannot be called from softirq or
 * hardirq context.
 *
 * You must call rhashtable_walk_exit after this function returns.
 */
static inline void rhltable_walk_enter(struct rhltable *hlt,
                                       struct rhashtable_iter *iter)
{
        /*
         * Forward to the walker of the embedded rhashtable.  Written as a
         * plain call: ISO C does not allow "return <expr>;" in a function
         * returning void.
         */
        rhashtable_walk_enter(&hlt->ht, iter);
}

/**
 * rhltable_free_and_destroy - free elements and destroy hash list table
 * @hlt:        the hash list table to destroy
 * @free_fn:        callback to release resources of element
 * @arg:        pointer passed to free_fn
 *
 * See documentation for rhashtable_free_and_destroy.
 */
static inline void rhltable_free_and_destroy(struct rhltable *hlt,
                                             void (*free_fn)(void *ptr,
                                                             void *arg),
                                             void *arg)
{
        /*
         * Forward to the embedded rhashtable.  Written as a plain call:
         * ISO C does not allow "return <expr>;" in a function returning void.
         */
        rhashtable_free_and_destroy(&hlt->ht, free_fn, arg);
}

/**
 * rhltable_destroy - destroy hash list table
 * @hlt:        the hash list table to destroy
 *
 * Calls rhltable_free_and_destroy() with a NULL @free_fn and a NULL @arg.
 */
static inline void rhltable_destroy(struct rhltable *hlt)
{
        /* Plain call: "return <expr>;" is not valid ISO C in a void function. */
        rhltable_free_and_destroy(hlt, NULL, NULL);
}

#endif /* _LINUX_RHASHTABLE_H */













































































































































































    1 
    4 





























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * include/linux/idr.h
 * 
 * 2002-10-18  written by Jim Houston jim.houston@ccur.com
 *        Copyright (C) 2002 by Concurrent Computer Corporation
 *
 * Small id to pointer translation service avoiding fixed sized
 * tables.
 */

#ifndef __IDR_H__
#define __IDR_H__

#include <linux/radix-tree.h>
#include <linux/gfp.h>
#include <linux/percpu.h>

/*
 * IDR handle.  Initialise with IDR_INIT()/DEFINE_IDR() or
 * idr_init()/idr_init_base().
 */
struct idr {
        /* Underlying radix tree root; the idr_lock*() helpers lock this. */
        struct radix_tree_root        idr_rt;
        /* Base value for IDs, as set by idr_init_base()/IDR_INIT_BASE(). */
        unsigned int                idr_base;
        /* Cyclic allocator cursor; see idr_get_cursor()/idr_set_cursor(). */
        unsigned int                idr_next;
};

/*
 * The IDR API does not expose the tagging functionality of the radix tree
 * to users.  Use tag 0 to track whether a node has free space below it.
 */
#define IDR_FREE        0

/* Set the IDR flag and the IDR_FREE tag */
#define IDR_RT_MARKER        (ROOT_IS_IDR | (__force gfp_t)                        \
                                        (1 << (ROOT_TAG_SHIFT + IDR_FREE)))

/*
 * Static initialiser for a struct idr called @name whose base value is @base.
 * The radix tree root is initialised with IDR_RT_MARKER and the cyclic
 * cursor (idr_next) starts at 0.
 */
#define IDR_INIT_BASE(name, base) {                                        \
        .idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER),                        \
        .idr_base = (base),                                                \
        .idr_next = 0,                                                        \
}

/**
 * IDR_INIT() - Initialise an IDR.
 * @name: Name of IDR.
 *
 * A freshly-initialised IDR contains no IDs.
 */
#define IDR_INIT(name)        IDR_INIT_BASE(name, 0)

/**
 * DEFINE_IDR() - Define a statically-allocated IDR.
 * @name: Name of IDR.
 *
 * An IDR defined using this macro is ready for use with no additional
 * initialisation required.  It contains no IDs.
 */
#define DEFINE_IDR(name)        struct idr name = IDR_INIT(name)

/**
 * idr_get_cursor - Return the current position of the cyclic allocator
 * @idr: idr handle
 *
 * The value returned is the value that will be next returned from
 * idr_alloc_cyclic() if it is free (otherwise the search will start from
 * this position).
 */
static inline unsigned int idr_get_cursor(const struct idr *idr)
{
        /* Single READ_ONCE() load of the cursor field. */
        unsigned int cursor = READ_ONCE(idr->idr_next);

        return cursor;
}

/**
 * idr_set_cursor - Set the current position of the cyclic allocator
 * @idr: idr handle
 * @val: new position
 *
 * The next call to idr_alloc_cyclic() will return @val if it is free
 * (otherwise the search will start from this position).
 */
static inline void idr_set_cursor(struct idr *idr, unsigned int val)
{
        unsigned int *cursor = &idr->idr_next;

        /* Single WRITE_ONCE() store to the cursor field. */
        WRITE_ONCE(*cursor, val);
}

/**
 * DOC: idr sync
 * idr synchronization (stolen from radix-tree.h)
 *
 * idr_find() is able to be called locklessly, using RCU. The caller must
 * ensure calls to this function are made within rcu_read_lock() regions.
 * Other readers (lock-free or otherwise) and modifications may be running
 * concurrently.
 *
 * It is still required that the caller manage the synchronization and
 * lifetimes of the items. So if RCU lock-free lookups are used, typically
 * this would mean that the items have their own locks, or are amenable to
 * lock-free access; and that the items are freed by RCU (or only freed after
 * having been deleted from the idr tree *and* a synchronize_rcu() grace
 * period).
 */

/*
 * Locking helpers.  Each one forwards to the xa_lock*()/xa_unlock*() macro of
 * the same flavour (plain, _bh, _irq, _irqsave/_irqrestore), applied to the
 * IDR's embedded idr_rt.
 */
#define idr_lock(idr)                xa_lock(&(idr)->idr_rt)
#define idr_unlock(idr)                xa_unlock(&(idr)->idr_rt)
#define idr_lock_bh(idr)        xa_lock_bh(&(idr)->idr_rt)
#define idr_unlock_bh(idr)        xa_unlock_bh(&(idr)->idr_rt)
#define idr_lock_irq(idr)        xa_lock_irq(&(idr)->idr_rt)
#define idr_unlock_irq(idr)        xa_unlock_irq(&(idr)->idr_rt)
#define idr_lock_irqsave(idr, flags) \
                                xa_lock_irqsave(&(idr)->idr_rt, flags)
#define idr_unlock_irqrestore(idr, flags) \
                                xa_unlock_irqrestore(&(idr)->idr_rt, flags)

void idr_preload(gfp_t gfp_mask);

int idr_alloc(struct idr *, void *ptr, int start, int end, gfp_t);
int __must_check idr_alloc_u32(struct idr *, void *ptr, u32 *id,
                                unsigned long max, gfp_t);
int idr_alloc_cyclic(struct idr *, void *ptr, int start, int end, gfp_t);
void *idr_remove(struct idr *, unsigned long id);
void *idr_find(const struct idr *, unsigned long id);
int idr_for_each(const struct idr *,
                 int (*fn)(int id, void *p, void *data), void *data);
void *idr_get_next(struct idr *, int *nextid);
void *idr_get_next_ul(struct idr *, unsigned long *nextid);
void *idr_replace(struct idr *, void *, unsigned long id);
void idr_destroy(struct idr *);

/**
 * idr_init_base() - Initialise an IDR.
 * @idr: IDR handle.
 * @base: The base value for the IDR.
 *
 * This variation of idr_init() creates an IDR which will allocate IDs
 * starting at @base.
 */
static inline void idr_init_base(struct idr *idr, int base)
{
        /* Set up the radix tree root with the IDR marker flags. */
        INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER);

        /* Cyclic cursor starts at 0; the base value is recorded as given. */
        idr->idr_next = 0;
        idr->idr_base = base;
}

/**
 * idr_init() - Initialise an IDR.
 * @idr: IDR handle.
 *
 * Initialise a dynamically allocated IDR.  To initialise a
 * statically allocated IDR, use DEFINE_IDR().
 */
static inline void idr_init(struct idr *idr)
{
        /* Same as idr_init_base() with a base value of 0. */
        idr_init_base(idr, 0);
}

/**
 * idr_is_empty() - Are there any IDs allocated?
 * @idr: IDR handle.
 *
 * Return: %true if no IDs are currently allocated from this IDR.
 */
static inline bool idr_is_empty(const struct idr *idr)
{
        return radix_tree_empty(&idr->idr_rt) &&
                radix_tree_tagged(&idr->idr_rt, IDR_FREE);
}

/**
 * idr_preload_end - end preload section started with idr_preload()
 *
 * Each idr_preload() should be matched with an invocation of this
 * function.  See idr_preload() for details.
 */
static inline void idr_preload_end(void)
{
        /* Drop the local lock embedded in radix_tree_preloads. */
        local_unlock(&radix_tree_preloads.lock);
}

/**
 * idr_for_each_entry() - Iterate over an IDR's elements of a given type.
 * @idr: IDR handle.
 * @entry: The type * to use as cursor.
 * @id: Entry ID.  Must be an int lvalue: its address is passed to
 *      idr_get_next() on every iteration.
 *
 * @entry and @id do not need to be initialized before the loop, and
 * after normal termination @entry is left with the value NULL.  This
 * is convenient for a "not found" value.
 */
#define idr_for_each_entry(idr, entry, id)                        \
        for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; id += 1U)

/**
 * idr_for_each_entry_ul() - Iterate over an IDR's elements of a given type.
 * @idr: IDR handle.
 * @entry: The type * to use as cursor.
 * @tmp: A temporary placeholder for ID.
 * @id: Entry ID.
 *
 * @entry and @id do not need to be initialized before the loop, and
 * after normal termination @entry is left with the value NULL.  This
 * is convenient for a "not found" value.
 *
 * @tmp holds the ID of the previous iteration.  If incrementing @id wraps
 * around, @tmp ends up greater than @id and the loop terminates with
 * @entry set to NULL instead of restarting from the beginning.
 */
#define idr_for_each_entry_ul(idr, entry, tmp, id)                        \
        for (tmp = 0, id = 0;                                                \
             ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \
             tmp = id, ++id)

/**
 * idr_for_each_entry_continue() - Continue iteration over an IDR's elements of a given type
 * @idr: IDR handle.
 * @entry: The type * to use as a cursor.
 * @id: Entry ID.  Must already hold the position to continue from; unlike
 *      idr_for_each_entry(), this macro does not reset it.
 *
 * Continue to iterate over entries, continuing after the current position.
 * The loop runs while @entry is non-NULL, so after normal termination
 * @entry is left with the value NULL.
 */
#define idr_for_each_entry_continue(idr, entry, id)                        \
        for ((entry) = idr_get_next((idr), &(id));                        \
             entry;                                                        \
             ++id, (entry) = idr_get_next((idr), &(id)))

/**
 * idr_for_each_entry_continue_ul() - Continue iteration over an IDR's elements of a given type
 * @idr: IDR handle.
 * @entry: The type * to use as a cursor.
 * @tmp: A temporary placeholder for ID.
 * @id: Entry ID.  Must already hold the position to continue from.
 *
 * Continue to iterate over entries, continuing after the current position.
 * After normal termination @entry is left with the value NULL.  This
 * is convenient for a "not found" value.
 *
 * @tmp holds the ID of the previous iteration.  If incrementing @id wraps
 * around, @tmp ends up greater than @id and the loop terminates with
 * @entry set to NULL.
 */
#define idr_for_each_entry_continue_ul(idr, entry, tmp, id)                \
        for (tmp = id;                                                        \
             ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \
             tmp = id, ++id)

/*
 * IDA - ID Allocator, use when translation from id to pointer isn't necessary.
 */
#define IDA_CHUNK_SIZE                128        /* 128 bytes per chunk */
#define IDA_BITMAP_LONGS        (IDA_CHUNK_SIZE / sizeof(long))
#define IDA_BITMAP_BITS         (IDA_BITMAP_LONGS * sizeof(long) * 8)

/* One IDA chunk: IDA_BITMAP_LONGS longs, i.e. IDA_CHUNK_SIZE bytes of bitmap. */
struct ida_bitmap {
        unsigned long                bitmap[IDA_BITMAP_LONGS];
};

/* IDA handle: a wrapper around a single struct xarray. */
struct ida {
        struct xarray xa;
};

/* XArray flags used by both static (IDA_INIT) and dynamic (ida_init) setup. */
#define IDA_INIT_FLAGS        (XA_FLAGS_LOCK_IRQ | XA_FLAGS_ALLOC)

/* Static initialiser for a struct ida called @name. */
#define IDA_INIT(name)        {                                                \
        .xa = XARRAY_INIT(name, IDA_INIT_FLAGS)                                \
}
/* Define a struct ida called @name, initialised with IDA_INIT(). */
#define DEFINE_IDA(name)        struct ida name = IDA_INIT(name)

int ida_alloc_range(struct ida *, unsigned int min, unsigned int max, gfp_t);
void ida_free(struct ida *, unsigned int id);
void ida_destroy(struct ida *ida);

/**
 * ida_alloc() - Allocate an unused ID.
 * @ida: IDA handle.
 * @gfp: Memory allocation flags.
 *
 * Allocate an ID between 0 and %INT_MAX, inclusive.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
 * or %-ENOSPC if there are no free IDs.
 */
static inline int ida_alloc(struct ida *ida, gfp_t gfp)
{
        /* Request the widest range: min 0, max with all bits set. */
        const unsigned int lowest = 0;
        const unsigned int highest = ~0;

        return ida_alloc_range(ida, lowest, highest, gfp);
}

/**
 * ida_alloc_min() - Allocate an unused ID.
 * @ida: IDA handle.
 * @min: Lowest ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Allocate an ID between @min and %INT_MAX, inclusive.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
 * or %-ENOSPC if there are no free IDs.
 */
static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp)
{
        /* Caller picks the lower bound; the max has all bits set. */
        const unsigned int highest = ~0;

        return ida_alloc_range(ida, min, highest, gfp);
}

/**
 * ida_alloc_max() - Allocate an unused ID.
 * @ida: IDA handle.
 * @max: Highest ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Allocate an ID between 0 and @max, inclusive.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
 * or %-ENOSPC if there are no free IDs.
 */
static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp)
{
        /* Caller picks the upper bound; the range starts at 0. */
        const unsigned int lowest = 0;

        return ida_alloc_range(ida, lowest, max, gfp);
}

/* Initialise a dynamically allocated IDA with the flags IDA_INIT() uses. */
static inline void ida_init(struct ida *ida)
{
        xa_init_flags(&ida->xa, IDA_INIT_FLAGS);
}

/*
 * ida_simple_get() and ida_simple_remove() are deprecated. Use
 * ida_alloc() and ida_free() instead respectively.
 *
 * ida_simple_get() passes (end) - 1 as the inclusive maximum of
 * ida_alloc_range(), i.e. its @end argument is exclusive.
 */
#define ida_simple_get(ida, start, end, gfp)        \
                        ida_alloc_range(ida, start, (end) - 1, gfp)
#define ida_simple_remove(ida, id)        ida_free(ida, id)

/* Report whether the IDA's backing xarray is empty. */
static inline bool ida_is_empty(const struct ida *ida)
{
        bool empty = xa_empty(&ida->xa);

        return empty;
}
#endif /* __IDR_H__ */
















































































































































































    1 










































































    1 



























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/backing-dev.h
 *
 * low-level device information and state which is propagated up through
 * to high-level code.
 */

#ifndef _LINUX_BACKING_DEV_H
#define _LINUX_BACKING_DEV_H

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/device.h>
#include <linux/writeback.h>
#include <linux/backing-dev-defs.h>
#include <linux/slab.h>

/* Take a reference on @bdi and return the same pointer. */
static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
{
        struct kref *ref = &bdi->refcnt;

        kref_get(ref);

        return bdi;
}

struct backing_dev_info *bdi_get_by_id(u64 id);
void bdi_put(struct backing_dev_info *bdi);

__printf(2, 3)
int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...);
__printf(2, 0)
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt,
                    va_list args);
void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner);
void bdi_unregister(struct backing_dev_info *bdi);

struct backing_dev_info *bdi_alloc(int node_id);

void wb_start_background_writeback(struct bdi_writeback *wb);
void wb_workfn(struct work_struct *work);

void wb_wait_for_completion(struct wb_completion *done);

extern spinlock_t bdi_lock;
extern struct list_head bdi_list;

extern struct workqueue_struct *bdi_wq;

/* Test the WB_has_dirty_io bit in @wb's state word. */
static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
{
        bool dirty = test_bit(WB_has_dirty_io, &wb->state);

        return dirty;
}

/* Report whether @bdi has any dirty wbs, via its total write bandwidth. */
static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
{
        /*
         * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are
         * any dirty wbs.  See wb_update_write_bandwidth().
         */
        return atomic_long_read(&bdi->tot_write_bandwidth) != 0;
}

/*
 * Add @amount (which may be negative) to @wb's percpu_counter for @item,
 * passing WB_STAT_BATCH as the batch argument.
 */
static inline void wb_stat_mod(struct bdi_writeback *wb,
                                 enum wb_stat_item item, s64 amount)
{
        percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
}

static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
        wb_stat_mod(wb, item, 1);
}

static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
        wb_stat_mod(wb, item, -1);
}

/* Read @wb's counter for @item via percpu_counter_read_positive(). */
static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
        s64 value = percpu_counter_read_positive(&wb->stat[item]);

        return value;
}

/* Read @wb's counter for @item via percpu_counter_sum_positive(). */
static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item)
{
        s64 total = percpu_counter_sum_positive(&wb->stat[item]);

        return total;
}

extern void wb_writeout_inc(struct bdi_writeback *wb);

/*
 * Maximal error of a stat counter: nr_cpu_ids * WB_STAT_BATCH on SMP
 * builds, 1 otherwise.
 */
static inline unsigned long wb_stat_error(void)
{
        unsigned long max_err = 1;

#ifdef CONFIG_SMP
        max_err = nr_cpu_ids * WB_STAT_BATCH;
#endif

        return max_err;
}

/* BDI ratio is expressed as part per 1000000 for finer granularity. */
#define BDI_RATIO_SCALE 10000

u64 bdi_get_min_bytes(struct backing_dev_info *bdi);
u64 bdi_get_max_bytes(struct backing_dev_info *bdi);
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio);
int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio);
int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes);
int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes);
int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit);

/*
 * Flags in backing_dev_info::capability
 *
 * BDI_CAP_WRITEBACK:                Supports dirty page writeback, and dirty pages
 *                                should contribute to accounting
 * BDI_CAP_WRITEBACK_ACCT:        Automatically account writeback pages
 * BDI_CAP_STRICTLIMIT:                Keep number of dirty pages below bdi threshold
 */
#define BDI_CAP_WRITEBACK                (1 << 0)
#define BDI_CAP_WRITEBACK_ACCT                (1 << 1)
#define BDI_CAP_STRICTLIMIT                (1 << 2)

extern struct backing_dev_info noop_backing_dev_info;

int bdi_init(struct backing_dev_info *bdi);

/**
 * writeback_in_progress - determine whether there is writeback in progress
 * @wb: bdi_writeback of interest
 *
 * Determine whether there is writeback waiting to be handled against a
 * bdi_writeback.
 */
static inline bool writeback_in_progress(struct bdi_writeback *wb)
{
        /* The answer is the WB_writeback_running bit of the wb's state. */
        bool running = test_bit(WB_writeback_running, &wb->state);

        return running;
}

struct backing_dev_info *inode_to_bdi(struct inode *inode);

static inline bool mapping_can_writeback(struct address_space *mapping)
{
        return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK;
}

#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Cgroup-writeback entry points implemented outside this header.
 *
 * wb_get_create() is what wb_get_create_current() falls back to when its
 * lockless lookup does not yield a usable wb.
 * NOTE(review): the exact lookup/create/offline semantics and the reference
 * ownership of the returned wb are not visible here — confirm against the
 * definitions.
 */
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
                                    struct cgroup_subsys_state *memcg_css);
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
                                    struct cgroup_subsys_state *memcg_css,
                                    gfp_t gfp);
void wb_memcg_offline(struct mem_cgroup *memcg);
void wb_blkcg_offline(struct cgroup_subsys_state *css);

/**
 * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
 * @inode: inode of interest
 *
 * Cgroup writeback requires support from the filesystem.  Also, both memcg and
 * iocg have to be on the default hierarchy.  Test whether all conditions are
 * met.
 *
 * Note that the test result may change dynamically on the same inode
 * depending on how memcg and iocg are configured.
 */
static inline bool inode_cgwb_enabled(struct inode *inode)
{
        struct backing_dev_info *bdi = inode_to_bdi(inode);

        return cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
                cgroup_subsys_on_dfl(io_cgrp_subsys) &&
                (bdi->capabilities & BDI_CAP_WRITEBACK) &&
                (inode->i_sb->s_iflags & SB_I_CGROUPWB);
}

/**
 * wb_find_current - find wb for %current on a bdi
 * @bdi: bdi of interest
 *
 * Find the wb of @bdi which matches both the memcg and blkcg of %current.
 * Must be called under rcu_read_lock() which protects the returned wb.
 * NULL if not found.
 */
static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
        struct cgroup_subsys_state *css = task_css(current, memory_cgrp_id);
        struct bdi_writeback *found;

        /* A memcg css without a parent maps to the wb embedded in the bdi. */
        if (!css->parent)
                return &bdi->wb;

        found = radix_tree_lookup(&bdi->cgwb_tree, css->id);

        /*
         * The blkcg of %current is the effective blkcg of its memcg, so a
         * plain task_css() comparison is enough and the relatively
         * expensive cgroup_get_e_css() can be avoided.
         */
        if (unlikely(!found || found->blkcg_css != task_css(current, io_cgrp_id)))
                return NULL;

        return found;
}

/**
 * wb_get_create_current - get or create wb for %current on a bdi
 * @bdi: bdi of interest
 * @gfp: allocation mask
 *
 * Equivalent to wb_get_create() on %current's memcg.  This function is
 * called from a relatively hot path and optimizes the common cases using
 * wb_find_current().
 */
static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
        struct cgroup_subsys_state *css;
        struct bdi_writeback *wb;

        /*
         * Common case: look the wb up under RCU.  A wb_tryget() failure is
         * handled exactly like a lookup miss.
         */
        rcu_read_lock();
        wb = wb_find_current(bdi);
        if (wb && unlikely(!wb_tryget(wb)))
                wb = NULL;
        rcu_read_unlock();

        if (likely(wb))
                return wb;

        /* Uncommon case: hold %current's memcg css across wb_get_create(). */
        css = task_get_css(current, memory_cgrp_id);
        wb = wb_get_create(bdi, css, gfp);
        css_put(css);

        return wb;
}

/**
 * inode_to_wb - determine the wb of an inode
 * @inode: inode of interest
 *
 * Returns the wb @inode is currently associated with.  The caller must be
 * holding either @inode->i_lock, the i_pages lock, or the
 * associated wb's list_lock.
 */
static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
{
#ifdef CONFIG_LOCKDEP
        /* Warn once when none of the three accepted locks is held. */
        WARN_ON_ONCE(debug_locks &&
                     !(lockdep_is_held(&inode->i_lock) ||
                       lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) ||
                       lockdep_is_held(&inode->i_wb->list_lock)));
#endif
        return inode->i_wb;
}

static inline struct bdi_writeback *inode_to_wb_wbc(
                                struct inode *inode,
                                struct writeback_control *wbc)
{
        /* Prefer the wb that is attached to the writeback control. */
        if (wbc->wb)
                return wbc->wb;

        /*
         * No wb attached: cgroup writeback was disabled when the wbc
         * started, so fall back to the default wb of the inode's bdi.
         */
        return &inode_to_bdi(inode)->wb;
}

/**
 * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
 * @inode: target inode
 * @cookie: output param, to be passed to the end function
 *
 * The caller wants to access the wb associated with @inode but isn't
 * holding inode->i_lock, the i_pages lock or wb->list_lock.  This
 * function determines the wb associated with @inode and ensures that the
 * association doesn't change until the transaction is finished with
 * unlocked_inode_to_wb_end().
 *
 * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and
 * can't sleep during the transaction.  IRQs may or may not be disabled on
 * return.
 */
static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
        /* Dropped again by unlocked_inode_to_wb_end(). */
        rcu_read_lock();

        /*
         * Paired with store_release in inode_switch_wbs_work_fn() and
         * ensures that we see the new wb if we see cleared I_WB_SWITCH.
         */
        cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;

        /*
         * I_WB_SWITCH was set: take the i_pages lock with IRQs saved in the
         * cookie, so the end function knows it has to undo this.
         */
        if (unlikely(cookie->locked))
                xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags);

        /*
         * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages
         * lock.  inode_to_wb() will bark.  Deref directly.
         */
        return inode->i_wb;
}

/**
 * unlocked_inode_to_wb_end - end inode wb access transaction
 * @inode: target inode
 * @cookie: @cookie from unlocked_inode_to_wb_begin()
 */
static inline void unlocked_inode_to_wb_end(struct inode *inode,
                                            struct wb_lock_cookie *cookie)
{
        /* Release the i_pages lock only if the begin side had to take it. */
        if (unlikely(cookie->locked)) {
                xa_unlock_irqrestore(&inode->i_mapping->i_pages,
                                     cookie->flags);
        }

        /* Leave the RCU read-side section entered by the begin side. */
        rcu_read_unlock();
}

#else        /* CONFIG_CGROUP_WRITEBACK */

/* !CONFIG_CGROUP_WRITEBACK variant: cgroup writeback is never enabled. */
static inline bool inode_cgwb_enabled(struct inode *inode)
{
        return false;
}

/* !CONFIG_CGROUP_WRITEBACK variant: always the wb embedded in @bdi. */
static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
        struct bdi_writeback *embedded = &bdi->wb;

        return embedded;
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: nothing is created and @gfp is unused;
 * the wb embedded in @bdi is returned.
 */
static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
        struct bdi_writeback *embedded = &bdi->wb;

        return embedded;
}

static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
{
        return &inode_to_bdi(inode)->wb;
}

/* !CONFIG_CGROUP_WRITEBACK variant: @wbc is not consulted. */
static inline struct bdi_writeback *inode_to_wb_wbc(
                                struct inode *inode,
                                struct writeback_control *wbc)
{
        struct bdi_writeback *wb = inode_to_wb(inode);

        return wb;
}


/*
 * !CONFIG_CGROUP_WRITEBACK variant: no locking is done and @cookie is left
 * untouched; the result is whatever inode_to_wb() returns.
 */
static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
        struct bdi_writeback *wb = inode_to_wb(inode);

        return wb;
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: the matching begin function takes no
 * lock, so there is nothing to release here.
 */
static inline void unlocked_inode_to_wb_end(struct inode *inode,
                                            struct wb_lock_cookie *cookie)
{
}

/* !CONFIG_CGROUP_WRITEBACK variant: intentionally a no-op. */
static inline void wb_memcg_offline(struct mem_cgroup *memcg)
{
}

/* !CONFIG_CGROUP_WRITEBACK variant: intentionally a no-op. */
static inline void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

const char *bdi_dev_name(struct backing_dev_info *bdi);

#endif        /* _LINUX_BACKING_DEV_H */






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    5 






























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * include/net/dsa.h - Driver for Distributed Switch Architecture switch chips
 * Copyright (c) 2008-2009 Marvell Semiconductor
 */

#ifndef __LINUX_NET_DSA_H
#define __LINUX_NET_DSA_H

#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/of.h>
#include <linux/ethtool.h>
#include <linux/net_tstamp.h>
#include <linux/phy.h>
#include <linux/platform_data/dsa.h>
#include <linux/phylink.h>
#include <net/devlink.h>
#include <net/switchdev.h>

struct dsa_8021q_context;
struct tc_action;

/*
 * Numeric identifiers of the tagging protocols.  Every value exists both as
 * a plain preprocessor constant (the *_VALUE macros) and as an enumerator of
 * enum dsa_tag_protocol below; the enumerators take their values from the
 * macros, so the two can never drift apart.
 */
#define DSA_TAG_PROTO_NONE_VALUE		0
#define DSA_TAG_PROTO_BRCM_VALUE		1
#define DSA_TAG_PROTO_BRCM_PREPEND_VALUE	2
#define DSA_TAG_PROTO_DSA_VALUE			3
#define DSA_TAG_PROTO_EDSA_VALUE		4
#define DSA_TAG_PROTO_GSWIP_VALUE		5
#define DSA_TAG_PROTO_KSZ9477_VALUE		6
#define DSA_TAG_PROTO_KSZ9893_VALUE		7
#define DSA_TAG_PROTO_LAN9303_VALUE		8
#define DSA_TAG_PROTO_MTK_VALUE			9
#define DSA_TAG_PROTO_QCA_VALUE			10
#define DSA_TAG_PROTO_TRAILER_VALUE		11
#define DSA_TAG_PROTO_8021Q_VALUE		12
#define DSA_TAG_PROTO_SJA1105_VALUE		13
#define DSA_TAG_PROTO_KSZ8795_VALUE		14
#define DSA_TAG_PROTO_OCELOT_VALUE		15
#define DSA_TAG_PROTO_AR9331_VALUE		16
#define DSA_TAG_PROTO_RTL4_A_VALUE		17
#define DSA_TAG_PROTO_HELLCREEK_VALUE		18
#define DSA_TAG_PROTO_XRS700X_VALUE		19
#define DSA_TAG_PROTO_OCELOT_8021Q_VALUE	20
#define DSA_TAG_PROTO_SEVILLE_VALUE		21
#define DSA_TAG_PROTO_BRCM_LEGACY_VALUE		22
#define DSA_TAG_PROTO_SJA1110_VALUE		23
#define DSA_TAG_PROTO_RTL8_4_VALUE		24
#define DSA_TAG_PROTO_RTL8_4T_VALUE		25
#define DSA_TAG_PROTO_RZN1_A5PSW_VALUE		26
#define DSA_TAG_PROTO_LAN937X_VALUE		27

/* Enumerators are listed in ascending numeric order. */
enum dsa_tag_protocol {
        DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE,
        DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE,
        DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE,
        DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE,
        DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE,
        DSA_TAG_PROTO_GSWIP = DSA_TAG_PROTO_GSWIP_VALUE,
        DSA_TAG_PROTO_KSZ9477 = DSA_TAG_PROTO_KSZ9477_VALUE,
        DSA_TAG_PROTO_KSZ9893 = DSA_TAG_PROTO_KSZ9893_VALUE,
        DSA_TAG_PROTO_LAN9303 = DSA_TAG_PROTO_LAN9303_VALUE,
        DSA_TAG_PROTO_MTK = DSA_TAG_PROTO_MTK_VALUE,
        DSA_TAG_PROTO_QCA = DSA_TAG_PROTO_QCA_VALUE,
        DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE,
        DSA_TAG_PROTO_8021Q = DSA_TAG_PROTO_8021Q_VALUE,
        DSA_TAG_PROTO_SJA1105 = DSA_TAG_PROTO_SJA1105_VALUE,
        DSA_TAG_PROTO_KSZ8795 = DSA_TAG_PROTO_KSZ8795_VALUE,
        DSA_TAG_PROTO_OCELOT = DSA_TAG_PROTO_OCELOT_VALUE,
        DSA_TAG_PROTO_AR9331 = DSA_TAG_PROTO_AR9331_VALUE,
        DSA_TAG_PROTO_RTL4_A = DSA_TAG_PROTO_RTL4_A_VALUE,
        DSA_TAG_PROTO_HELLCREEK = DSA_TAG_PROTO_HELLCREEK_VALUE,
        DSA_TAG_PROTO_XRS700X = DSA_TAG_PROTO_XRS700X_VALUE,
        DSA_TAG_PROTO_OCELOT_8021Q = DSA_TAG_PROTO_OCELOT_8021Q_VALUE,
        DSA_TAG_PROTO_SEVILLE = DSA_TAG_PROTO_SEVILLE_VALUE,
        DSA_TAG_PROTO_BRCM_LEGACY = DSA_TAG_PROTO_BRCM_LEGACY_VALUE,
        DSA_TAG_PROTO_SJA1110 = DSA_TAG_PROTO_SJA1110_VALUE,
        DSA_TAG_PROTO_RTL8_4 = DSA_TAG_PROTO_RTL8_4_VALUE,
        DSA_TAG_PROTO_RTL8_4T = DSA_TAG_PROTO_RTL8_4T_VALUE,
        DSA_TAG_PROTO_RZN1_A5PSW = DSA_TAG_PROTO_RZN1_A5PSW_VALUE,
        DSA_TAG_PROTO_LAN937X = DSA_TAG_PROTO_LAN937X_VALUE,
};

struct dsa_switch;

/*
 * Operations and properties of one tagging protocol.
 * NOTE(review): the per-member notes below are derived from the member names
 * and signatures only — confirm details against the tagger implementations.
 */
struct dsa_device_ops {
        /* Transmit hook: takes an skb and a net_device, returns an skb. */
        struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
        /* Receive hook with the same shape as @xmit. */
        struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev);
        /* Flow-dissector hook; @proto and @offset are writable out-pointers. */
        void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto,
                             int *offset);
        /* Per-switch connect/disconnect hooks; @connect can report an error. */
        int (*connect)(struct dsa_switch *ds);
        void (*disconnect)(struct dsa_switch *ds);
        /* Head/tail room needed by the tag (presumably in bytes — confirm). */
        unsigned int needed_headroom;
        unsigned int needed_tailroom;
        const char *name;
        /* Which enum dsa_tag_protocol value these operations implement. */
        enum dsa_tag_protocol proto;
        /* Some tagging protocols either mangle or shift the destination MAC
         * address, in which case the DSA conduit would drop packets on ingress
         * if what it understands out of the destination MAC address is not in
         * its RX filter.
         */
        bool promisc_on_conduit;
};

/*
 * One offloaded LAG.  dsa_lag_id() matches @dev against a LAG net_device and
 * reports @id.
 */
struct dsa_lag {
        struct net_device *dev;
        /* LAG IDs are one-based; the dst->lags array holding them is zero-based. */
        unsigned int id;
        /* NOTE(review): presumably @fdb_lock serializes access to @fdbs — confirm. */
        struct mutex fdb_lock;
        struct list_head fdbs;
        refcount_t refcount;
};

/*
 * State shared by all switches of one DSA tree.  Switches point back here
 * through dsa_switch::dst and ports through dsa_port::dst.
 */
struct dsa_switch_tree {
        /* NOTE(review): presumably links this tree into a list of all trees — confirm. */
        struct list_head        list;

        /* List of switch ports (entries are struct dsa_port, linked via ->list) */
        struct list_head ports;

        /* Notifier chain for switch-wide events */
        struct raw_notifier_head        nh;

        /* Tree identifier */
        unsigned int index;

        /* Reference count of this tree.
         * NOTE(review): previously described as "Number of switches attached
         * to this tree"; the field is a kref — confirm what takes references.
         */
        struct kref refcount;

        /* Maps offloaded LAG netdevs to a zero-based linear ID for
         * drivers that need it.
         */
        struct dsa_lag **lags;

        /* Tagging protocol operations */
        const struct dsa_device_ops *tag_ops;

        /* Default tagging protocol preferred by the switches in this
         * tree.
         */
        enum dsa_tag_protocol default_proto;

        /* Has this tree been applied to the hardware? */
        bool setup;

        /*
         * Configuration data for the platform device that owns
         * this dsa switch tree instance.
         */
        struct dsa_platform_data        *pd;

        /* List of DSA links composing the routing table */
        struct list_head rtable;

        /* Length of "lags" array */
        unsigned int lags_len;

        /* Track the largest switch index within a tree */
        unsigned int last_switch;
};

/* LAG IDs are one-based, the dst->lags array is zero-based.
 *
 * Iterates @_id over 1..lags_len and runs the loop body only for IDs whose
 * dst->lags slot is non-NULL.  The macro expands to an unbraced for + if, so
 * the statement that follows becomes the body of the if; do not attach an
 * else to it.
 */
#define dsa_lags_foreach_id(_id, _dst)                                \
        for ((_id) = 1; (_id) <= (_dst)->lags_len; (_id)++)        \
                if ((_dst)->lags[(_id) - 1])

/* Walk every port of tree @_dst and run the body only for ports for which
 * dsa_port_offloads_lag(@_dp, @_lag) is true.  Expands to an unbraced
 * loop + if, so the following statement is the body of the if.
 */
#define dsa_lag_foreach_port(_dp, _dst, _lag)                        \
        list_for_each_entry((_dp), &(_dst)->ports, list)        \
                if (dsa_port_offloads_lag((_dp), (_lag)))

/* Walk the port list of @_ds's tree and run the body only for ports that
 * belong to @_ds and whose hsr_dev equals @_hsr.  Expands to an unbraced
 * loop + if, so the following statement is the body of the if.
 */
#define dsa_hsr_foreach_port(_dp, _ds, _hsr)                        \
        list_for_each_entry((_dp), &(_ds)->dst->ports, list)        \
                if ((_dp)->ds == (_ds) && (_dp)->hsr_dev == (_hsr))

/*
 * Return the dst->lags entry for the one-based LAG @id.  No range check is
 * performed on @id.
 */
static inline struct dsa_lag *dsa_lag_by_id(struct dsa_switch_tree *dst,
                                            unsigned int id)
{
        /* Convert the one-based ID into a zero-based array slot. */
        unsigned int slot = id - 1;

        return dst->lags[slot];
}

static inline int dsa_lag_id(struct dsa_switch_tree *dst,
                             struct net_device *lag_dev)
{
        unsigned int id;

        dsa_lags_foreach_id(id, dst) {
                struct dsa_lag *lag = dsa_lag_by_id(dst, id);

                if (lag->dev == lag_dev)
                        return lag->id;
        }

        return -ENODEV;
}

/* Kinds of TC matchall action; used as the tag of struct dsa_mall_tc_entry. */
enum dsa_port_mall_action_type {
        DSA_PORT_MALL_MIRROR = 0,
        DSA_PORT_MALL_POLICER = 1,
};

/* TC mirroring entry */
struct dsa_mall_mirror_tc_entry {
        /* NOTE(review): presumably the index of the local port receiving the mirrored traffic — confirm. */
        u8 to_local_port;
        /* NOTE(review): presumably true for ingress mirroring, false for egress — confirm. */
        bool ingress;
};

/* TC port policer entry */
struct dsa_mall_policer_tc_entry {
        /* NOTE(review): unit of @burst is not visible here (bytes?) — confirm. */
        u32 burst;
        /* Rate limit; the unit is bytes per second as the name states. */
        u64 rate_bytes_per_sec;
};

/* TC matchall entry */
struct dsa_mall_tc_entry {
        struct list_head list;
        /* NOTE(review): presumably the identifier used to look the entry up again — confirm. */
        unsigned long cookie;
        /* NOTE(review): presumably selects which member of the union below is valid — confirm. */
        enum dsa_port_mall_action_type type;
        union {
                struct dsa_mall_mirror_tc_entry mirror;
                struct dsa_mall_policer_tc_entry policer;
        };
};

/*
 * Per-bridge state, referenced from dsa_port::bridge and embedded by value
 * in struct dsa_db.
 */
struct dsa_bridge {
        struct net_device *dev;
        /* Bridge number; see the fdb_isolation note in struct dsa_switch. */
        unsigned int num;
        /* NOTE(review): presumably set when TX forwarding offload is active — confirm. */
        bool tx_fwd_offload;
        refcount_t refcount;
};

/*
 * One port of a DSA switch.  All ports of all switches in a tree are linked
 * on dsa_switch_tree::ports through @list; @ds and @index identify the
 * owning switch and the port number used by dsa_to_port().
 */
struct dsa_port {
        /* A CPU port is physically connected to a conduit device. A user port
         * exposes a network device to user-space, called 'user' here.
         */
        union {
                struct net_device *conduit;
                struct net_device *user;
        };

        /* Copy of the tagging protocol operations, for quicker access
         * in the data path. Valid only for the CPU ports.
         */
        const struct dsa_device_ops *tag_ops;

        /* Copies for faster access in conduit receive hot path */
        struct dsa_switch_tree *dst;
        struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev);

        /* Owning switch */
        struct dsa_switch        *ds;

        /* Port number within @ds */
        unsigned int                index;

        /* Role of the port; tested by the dsa_port_is_*() helpers */
        enum {
                DSA_PORT_TYPE_UNUSED = 0,
                DSA_PORT_TYPE_CPU,
                DSA_PORT_TYPE_DSA,
                DSA_PORT_TYPE_USER,
        } type;

        const char                *name;
        struct dsa_port                *cpu_dp;
        u8                        mac[ETH_ALEN];

        u8                        stp_state;

        /* Warning: the following bit fields are not atomic, and updating them
         * can only be done from code paths where concurrency is not possible
         * (probe time or under rtnl_lock).
         */
        u8                        vlan_filtering:1;

        /* Managed by DSA on user ports and by drivers on CPU and DSA ports */
        u8                        learning:1;

        u8                        lag_tx_enabled:1;

        /* conduit state bits, valid only on CPU ports */
        u8                        conduit_admin_up:1;
        u8                        conduit_oper_up:1;

        /* Valid only on user ports */
        u8                        cpu_port_in_lag:1;

        u8                        setup:1;

        struct device_node        *dn;
        unsigned int                ageing_time;

        struct dsa_bridge        *bridge;
        struct devlink_port        devlink_port;
        struct phylink                *pl;
        /* Embedded by value; dsa_phylink_to_port() maps it back to the port */
        struct phylink_config        pl_config;
        struct dsa_lag                *lag;
        struct net_device        *hsr_dev;

        /* Entry in dsa_switch_tree::ports */
        struct list_head list;

        /*
         * Original copy of the conduit netdev ethtool_ops
         */
        const struct ethtool_ops *orig_ethtool_ops;

        /* List of MAC addresses that must be forwarded on this port.
         * These are only valid on CPU ports and DSA links.
         */
        struct mutex                addr_lists_lock;
        struct list_head        fdbs;
        struct list_head        mdbs;

        struct mutex                vlans_lock;
        union {
                /* List of VLANs that CPU and DSA ports are members of.
                 * Access to this is serialized by the sleepable @vlans_lock.
                 */
                struct list_head        vlans;
                /* List of VLANs that user ports are members of.
                 * Access to this is serialized by netif_addr_lock_bh().
                 */
                struct list_head        user_vlans;
        };
};

/*
 * Map a phylink_config back to the struct dsa_port that embeds it as its
 * pl_config member.  @config must really be that embedded member.
 */
static inline struct dsa_port *
dsa_phylink_to_port(struct phylink_config *config)
{
        struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);

        return dp;
}

/* TODO: ideally DSA ports would have a single dp->link_dp member,
 * and no dst->rtable nor this struct dsa_link would be needed,
 * but this would require some more complex tree walking,
 * so keep it stupid at the moment and list them all.
 *
 * One routing-table entry pairing port @dp with port @link_dp.
 */
struct dsa_link {
        struct dsa_port *dp;
        struct dsa_port *link_dp;
        /* NOTE(review): presumably the entry in dsa_switch_tree::rtable — confirm. */
        struct list_head list;
};

/* Kind of database a struct dsa_db refers to: a port, a LAG or a bridge. */
enum dsa_db_type {
        DSA_DB_PORT = 0,
        DSA_DB_LAG = 1,
        DSA_DB_BRIDGE = 2,
};

/*
 * Tagged reference to a database: a port, a LAG or a bridge.
 * NOTE(review): presumably @type selects the valid union member — confirm.
 */
struct dsa_db {
        enum dsa_db_type type;

        union {
                /* The port is referenced by pointer ... */
                const struct dsa_port *dp;
                /* ... whereas LAG and bridge are stored by value. */
                struct dsa_lag lag;
                struct dsa_bridge bridge;
        };
};

/*
 * Reference-counted MAC address entry: address, VLAN ID and the database it
 * belongs to.
 * NOTE(review): presumably linked on the fdbs/mdbs lists of struct dsa_port
 * or struct dsa_lag via @list — confirm.
 */
struct dsa_mac_addr {
        unsigned char addr[ETH_ALEN];
        u16 vid;
        refcount_t refcount;
        struct list_head list;
        struct dsa_db db;
};

/*
 * Reference-counted VLAN entry.
 * NOTE(review): presumably linked on dsa_port::vlans or ::user_vlans via
 * @list — confirm.
 */
struct dsa_vlan {
        u16 vid;
        refcount_t refcount;
        struct list_head list;
};

/*
 * One switch chip of a DSA tree.  Its ports are not stored here: they live
 * on dst->ports and are found by matching dsa_port::ds against this switch
 * (see dsa_to_port()).
 */
struct dsa_switch {
        struct device *dev;

        /*
         * Parent switch tree, and switch index.
         */
        struct dsa_switch_tree        *dst;
        unsigned int                index;

        /* Warning: the following bit fields are not atomic, and updating them
         * can only be done from code paths where concurrency is not possible
         * (probe time or under rtnl_lock).
         */
        u32                        setup:1;

        /* Disallow bridge core from requesting different VLAN awareness
         * settings on ports if not hardware-supported
         */
        u32                        vlan_filtering_is_global:1;

        /* Keep VLAN filtering enabled on ports not offloading any upper */
        u32                        needs_standalone_vlan_filtering:1;

        /* Pass .port_vlan_add and .port_vlan_del to drivers even for bridges
         * that have vlan_filtering=0. All drivers should ideally set this (and
         * then the option would get removed), but it is unknown whether this
         * would break things or not.
         */
        u32                        configure_vlan_while_not_filtering:1;

        /* If the switch driver always programs the CPU port as egress tagged
         * despite the VLAN configuration indicating otherwise, then setting
         * @untag_bridge_pvid will force the DSA receive path to pop the
         * bridge's default_pvid VLAN tagged frames to offer a consistent
         * behavior between a vlan_filtering=0 and vlan_filtering=1 bridge
         * device.
         */
        u32                        untag_bridge_pvid:1;

        /* Let DSA manage the FDB entries towards the
         * CPU, based on the software bridge database.
         */
        u32                        assisted_learning_on_cpu_port:1;

        /* In case vlan_filtering_is_global is set, the VLAN awareness state
         * should be retrieved from here and not from the per-port settings.
         */
        u32                        vlan_filtering:1;

        /* For switches that only have the MRU configurable. To ensure the
         * configured MTU is not exceeded, normalization of MRU on all bridged
         * interfaces is needed.
         */
        u32                        mtu_enforcement_ingress:1;

        /* Drivers that isolate the FDBs of multiple bridges must set this
         * to true to receive the bridge as an argument in .port_fdb_{add,del}
         * and .port_mdb_{add,del}. Otherwise, the bridge.num will always be
         * passed as zero.
         */
        u32                        fdb_isolation:1;

        /* Drivers that have global DSCP mapping settings must set this to
         * true to automatically apply the settings to all ports.
         */
        u32                        dscp_prio_mapping_is_global:1;

        /* Listener for switch fabric events */
        struct notifier_block        nb;

        /*
         * Give the switch driver somewhere to hang its private data
         * structure.
         */
        void *priv;

        /* NOTE(review): presumably private data of the tagging protocol — confirm. */
        void *tagger_data;

        /*
         * Configuration data for this switch.
         */
        struct dsa_chip_data        *cd;

        /*
         * The switch operations.
         */
        const struct dsa_switch_ops        *ops;

        /*
         * Allow a DSA switch driver to override the phylink MAC ops
         */
        const struct phylink_mac_ops        *phylink_mac_ops;

        /*
         * User mii_bus and devices for the individual ports.
         */
        u32                        phys_mii_mask;
        struct mii_bus                *user_mii_bus;

        /* Ageing Time limits in msecs */
        unsigned int ageing_time_min;
        unsigned int ageing_time_max;

        /* Storage for drivers using tag_8021q */
        struct dsa_8021q_context *tag_8021q_ctx;

        /* devlink used to represent this switch device */
        struct devlink                *devlink;

        /* Number of switch port queues */
        unsigned int                num_tx_queues;

        /* Drivers that benefit from having an ID associated with each
         * offloaded LAG should set this to the maximum number of
         * supported IDs. DSA will then maintain a mapping of _at
         * least_ these many IDs, accessible to drivers via
         * dsa_lag_id().
         */
        unsigned int                num_lag_ids;

        /* Drivers that support bridge forwarding offload or FDB isolation
         * should set this to the maximum number of bridges spanning the same
         * switch tree (or all trees, in the case of cross-tree bridging
         * support) that can be offloaded.
         */
        unsigned int                max_num_bridges;

        /* NOTE(review): presumably the number of ports of this switch — confirm. */
        unsigned int                num_ports;
};

static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p)
{
        struct dsa_switch_tree *dst = ds->dst;
        struct dsa_port *dp;

        list_for_each_entry(dp, &dst->ports, list)
                if (dp->ds == ds && dp->index == p)
                        return dp;

        return NULL;
}

static inline bool dsa_port_is_dsa(struct dsa_port *port)
{
        return port->type == DSA_PORT_TYPE_DSA;
}

static inline bool dsa_port_is_cpu(struct dsa_port *port)
{
        return port->type == DSA_PORT_TYPE_CPU;
}

/* Return true if @dp has type DSA_PORT_TYPE_USER */
static inline bool dsa_port_is_user(struct dsa_port *dp)
{
        const bool is_user = dp->type == DSA_PORT_TYPE_USER;

        return is_user;
}

/* Return true if @dp has type DSA_PORT_TYPE_UNUSED */
static inline bool dsa_port_is_unused(struct dsa_port *dp)
{
        const bool is_unused = dp->type == DSA_PORT_TYPE_UNUSED;

        return is_unused;
}

static inline bool dsa_port_conduit_is_operational(struct dsa_port *dp)
{
        return dsa_port_is_cpu(dp) && dp->conduit_admin_up &&
               dp->conduit_oper_up;
}

static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p)
{
        return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED;
}

static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p)
{
        return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_CPU;
}

static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p)
{
        return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_DSA;
}

static inline bool dsa_is_user_port(struct dsa_switch *ds, int p)
{
        return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_USER;
}

/*
 * Port iterators. Each macro expands to a walk over a tree's port list
 * followed by one or more filters, and must be followed by a single statement
 * or a braced block, exactly like a for loop.
 *
 * The filters are spelled "if (!(cond)) {} else" rather than "if (cond)" so
 * that the hidden if already owns an else branch. Otherwise an "else" written
 * by the caller after the loop body, e.g.
 *
 *         if (x)
 *                 dsa_switch_for_each_port(dp, ds)
 *                         foo(dp);
 *         else
 *                 bar();
 *
 * would silently attach to the filter inside the macro instead of "if (x)".
 */

/* Visit every user port in the port list of tree @_dst */
#define dsa_tree_for_each_user_port(_dp, _dst) \
        list_for_each_entry((_dp), &(_dst)->ports, list) \
                if (!dsa_port_is_user((_dp))) {} else

/* Like dsa_tree_for_each_user_port(), but walking backwards from the entry
 * preceding the current value of @_dp.
 */
#define dsa_tree_for_each_user_port_continue_reverse(_dp, _dst) \
        list_for_each_entry_continue_reverse((_dp), &(_dst)->ports, list) \
                if (!dsa_port_is_user((_dp))) {} else

/* Visit every CPU port in the port list of tree @_dst */
#define dsa_tree_for_each_cpu_port(_dp, _dst) \
        list_for_each_entry((_dp), &(_dst)->ports, list) \
                if (!dsa_port_is_cpu((_dp))) {} else

/* Visit every port of the tree's list that belongs to switch @_ds */
#define dsa_switch_for_each_port(_dp, _ds) \
        list_for_each_entry((_dp), &(_ds)->dst->ports, list) \
                if ((_dp)->ds != (_ds)) {} else

/* Like dsa_switch_for_each_port(), using @_next as lookahead storage so the
 * current entry may be removed from the list by the loop body.
 */
#define dsa_switch_for_each_port_safe(_dp, _next, _ds) \
        list_for_each_entry_safe((_dp), (_next), &(_ds)->dst->ports, list) \
                if ((_dp)->ds != (_ds)) {} else

/* Like dsa_switch_for_each_port(), but walking backwards from the entry
 * preceding the current value of @_dp.
 */
#define dsa_switch_for_each_port_continue_reverse(_dp, _ds) \
        list_for_each_entry_continue_reverse((_dp), &(_ds)->dst->ports, list) \
                if ((_dp)->ds != (_ds)) {} else

/* Visit every port of @_ds that is not of the unused type */
#define dsa_switch_for_each_available_port(_dp, _ds) \
        dsa_switch_for_each_port((_dp), (_ds)) \
                if (dsa_port_is_unused((_dp))) {} else

/* Visit every user port of @_ds */
#define dsa_switch_for_each_user_port(_dp, _ds) \
        dsa_switch_for_each_port((_dp), (_ds)) \
                if (!dsa_port_is_user((_dp))) {} else

/* Reverse-continue variant of dsa_switch_for_each_user_port() */
#define dsa_switch_for_each_user_port_continue_reverse(_dp, _ds) \
        dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \
                if (!dsa_port_is_user((_dp))) {} else

/* Visit every CPU port of @_ds */
#define dsa_switch_for_each_cpu_port(_dp, _ds) \
        dsa_switch_for_each_port((_dp), (_ds)) \
                if (!dsa_port_is_cpu((_dp))) {} else

/* Reverse-continue variant of dsa_switch_for_each_cpu_port() */
#define dsa_switch_for_each_cpu_port_continue_reverse(_dp, _ds) \
        dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \
                if (!dsa_port_is_cpu((_dp))) {} else

static inline u32 dsa_user_ports(struct dsa_switch *ds)
{
        struct dsa_port *dp;
        u32 mask = 0;

        dsa_switch_for_each_user_port(dp, ds)
                mask |= BIT(dp->index);

        return mask;
}

static inline u32 dsa_cpu_ports(struct dsa_switch *ds)
{
        struct dsa_port *cpu_dp;
        u32 mask = 0;

        dsa_switch_for_each_cpu_port(cpu_dp, ds)
                mask |= BIT(cpu_dp->index);

        return mask;
}

/* Return the local port used to reach an arbitrary switch device */
static inline unsigned int dsa_routing_port(struct dsa_switch *ds, int device)
{
        struct dsa_switch_tree *dst = ds->dst;
        struct dsa_link *dl;

        list_for_each_entry(dl, &dst->rtable, list)
                if (dl->dp->ds == ds && dl->link_dp->ds->index == device)
                        return dl->dp->index;

        return ds->num_ports;
}

/* Return the port of @ds to use in order to reach port @port of the switch
 * with index @device: @port itself when @device is @ds, the routing port
 * towards @device otherwise.
 */
static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device,
                                            int port)
{
        if (device != ds->index)
                return dsa_routing_port(ds, device);

        return port;
}

/* Return the local port used to reach the dedicated CPU port */
static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port)
{
        const struct dsa_port *dp = dsa_to_port(ds, port);
        const struct dsa_port *cpu_dp = dp->cpu_dp;

        if (!cpu_dp)
                return port;

        return dsa_towards_port(ds, cpu_dp->ds->index, cpu_dp->index);
}

/* Return true if @port is not unused and is its own upstream port, i.e. the
 * local port through which the CPU port is reached.
 */
static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port)
{
        return !dsa_is_unused_port(ds, port) &&
               port == dsa_upstream_port(ds, port);
}

/* Return true if this is a DSA port leading away from the CPU */
static inline bool dsa_is_downstream_port(struct dsa_switch *ds, int port)
{
        return dsa_is_dsa_port(ds, port) && !dsa_is_upstream_port(ds, port);
}

/* Return the local port used to reach the CPU port */
static inline unsigned int dsa_switch_upstream_port(struct dsa_switch *ds)
{
        struct dsa_port *dp;

        dsa_switch_for_each_available_port(dp, ds) {
                return dsa_upstream_port(ds, dp->index);
        }

        return ds->num_ports;
}

/* Return true if @upstream_ds is an upstream switch of @downstream_ds, meaning
 * that the routing port from @downstream_ds to @upstream_ds is also the port
 * which @downstream_ds uses to reach its dedicated CPU.
 */
static inline bool dsa_switch_is_upstream_of(struct dsa_switch *upstream_ds,
                                             struct dsa_switch *downstream_ds)
{
        int routing_port;

        if (upstream_ds == downstream_ds)
                return true;

        routing_port = dsa_routing_port(downstream_ds, upstream_ds->index);

        return dsa_is_upstream_port(downstream_ds, routing_port);
}

/* Return the VLAN filtering state that applies to @dp: the switch-wide
 * setting when the switch flags VLAN filtering as global, the per-port
 * setting otherwise.
 */
static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp)
{
        const struct dsa_switch *sw = dp->ds;

        if (!sw->vlan_filtering_is_global)
                return dp->vlan_filtering;

        return sw->vlan_filtering;
}

/* Return the id of the LAG attached to @dp, or 0 when @dp has no LAG */
static inline unsigned int dsa_port_lag_id_get(struct dsa_port *dp)
{
        if (!dp->lag)
                return 0;

        return dp->lag->id;
}

/* Return the net_device of the LAG attached to @dp, or NULL when @dp has no
 * LAG.
 */
static inline struct net_device *dsa_port_lag_dev_get(struct dsa_port *dp)
{
        if (!dp->lag)
                return NULL;

        return dp->lag->dev;
}

/* Return true if the LAG device attached to @dp is the device of @lag */
static inline bool dsa_port_offloads_lag(struct dsa_port *dp,
                                         const struct dsa_lag *lag)
{
        const struct net_device *lag_dev = dsa_port_lag_dev_get(dp);

        return lag_dev == lag->dev;
}

static inline struct net_device *dsa_port_to_conduit(const struct dsa_port *dp)
{
        if (dp->cpu_port_in_lag)
                return dsa_port_lag_dev_get(dp->cpu_dp);

        return dp->cpu_dp->conduit;
}

/* Return the net_device that represents @dp as a bridge port, or NULL when
 * @dp is not under a bridge. The LAG device is preferred, then the HSR
 * device, and the user net_device is the fallback.
 */
static inline
struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp)
{
        struct net_device *brport = dp->user;

        if (!dp->bridge)
                return NULL;

        if (dp->lag)
                brport = dp->lag->dev;
        else if (dp->hsr_dev)
                brport = dp->hsr_dev;

        return brport;
}

/* Return the bridge net_device @dp is attached to, or NULL when @dp has no
 * bridge.
 */
static inline struct net_device *
dsa_port_bridge_dev_get(const struct dsa_port *dp)
{
        if (!dp->bridge)
                return NULL;

        return dp->bridge->dev;
}

/* Return the number of the bridge @dp is attached to, or 0 when @dp has no
 * bridge.
 */
static inline unsigned int dsa_port_bridge_num_get(struct dsa_port *dp)
{
        if (!dp->bridge)
                return 0;

        return dp->bridge->num;
}

/* Return true if @a and @b are both under a bridge and it is the same one */
static inline bool dsa_port_bridge_same(const struct dsa_port *a,
                                        const struct dsa_port *b)
{
        struct net_device *bridge_a = dsa_port_bridge_dev_get(a);
        struct net_device *bridge_b = dsa_port_bridge_dev_get(b);

        /* Two standalone ports never count as sharing a bridge */
        if (!bridge_a || !bridge_b)
                return false;

        return bridge_a == bridge_b;
}

/* Return true if @dev is the net_device that represents @dp as a bridge
 * port, as computed by dsa_port_to_bridge_port().
 */
static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp,
                                                 const struct net_device *dev)
{
        const struct net_device *brport = dsa_port_to_bridge_port(dp);

        return brport == dev;
}

/* Return true if @bridge_dev is the bridge net_device @dp is attached to.
 * Meant for events emitted for the bridge device itself rather than for one
 * of its ports.
 */
static inline bool
dsa_port_offloads_bridge_dev(struct dsa_port *dp,
                             const struct net_device *bridge_dev)
{
        const struct net_device *br = dsa_port_bridge_dev_get(dp);

        return br == bridge_dev;
}

/* Return true if the bridge net_device @dp is attached to is the device of
 * @bridge.
 */
static inline bool dsa_port_offloads_bridge(struct dsa_port *dp,
                                            const struct dsa_bridge *bridge)
{
        const struct net_device *br = dsa_port_bridge_dev_get(dp);

        return br == bridge->dev;
}

/* Returns true if any port of this tree offloads the given net_device */
static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst,
                                                 const struct net_device *dev)
{
        struct dsa_port *dp;

        list_for_each_entry(dp, &dst->ports, list)
                if (dsa_port_offloads_bridge_port(dp, dev))
                        return true;

        return false;
}

/* Returns true if any port of this tree offloads the given bridge */
static inline bool
dsa_tree_offloads_bridge_dev(struct dsa_switch_tree *dst,
                             const struct net_device *bridge_dev)
{
        struct dsa_port *dp;

        list_for_each_entry(dp, &dst->ports, list)
                if (dsa_port_offloads_bridge_dev(dp, bridge_dev))
                        return true;

        return false;
}

/* Return true if the switches of @a and @b belong to the same switch tree */
static inline bool dsa_port_tree_same(const struct dsa_port *a,
                                      const struct dsa_port *b)
{
        const struct dsa_switch_tree *tree_a = a->ds->dst;
        const struct dsa_switch_tree *tree_b = b->ds->dst;

        return tree_a == tree_b;
}

/* Callback type taken by the port_fdb_dump() switch operation together with
 * an opaque @data pointer. It receives a MAC address, a VLAN id and a flag
 * telling whether the entry is static.
 * NOTE(review): presumably invoked once per FDB entry, with a non-zero return
 * aborting the dump - confirm against the callers.
 */
typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid,
                              bool is_static, void *data);
/*
 * Table of operations of a DSA switch, referenced from struct dsa_switch
 * through its ops pointer. Every method receives the switch (@ds) as its first
 * argument; port-scoped methods additionally take the port number.
 * Individual comments below state which methods are mandatory; the others are
 * presumably optional and may be left NULL (NOTE(review): confirm per method
 * against the DSA core before relying on that).
 */
struct dsa_switch_ops {
        /*
         * Tagging protocol helpers called for the CPU ports and DSA links.
         * @get_tag_protocol retrieves the initial tagging protocol and is
         * mandatory. Switches which can operate using multiple tagging
         * protocols should implement @change_tag_protocol and report in
         * @get_tag_protocol the tagger in current use.
         */
        enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds,
                                                  int port,
                                                  enum dsa_tag_protocol mprot);
        int        (*change_tag_protocol)(struct dsa_switch *ds,
                                       enum dsa_tag_protocol proto);
        /*
         * Method for switch drivers to connect to the tagging protocol driver
         * in current use. The switch driver can provide handlers for certain
         * types of packets for switch management.
         */
        int        (*connect_tag_protocol)(struct dsa_switch *ds,
                                        enum dsa_tag_protocol proto);

        /* NOTE(review): presumably requests that @port be served by a
         * different @conduit; errors can be reported through @extack.
         */
        int        (*port_change_conduit)(struct dsa_switch *ds, int port,
                                       struct net_device *conduit,
                                       struct netlink_ext_ack *extack);

        /* Optional switch-wide initialization and destruction methods */
        int        (*setup)(struct dsa_switch *ds);
        void        (*teardown)(struct dsa_switch *ds);

        /* Per-port initialization and destruction methods. Mandatory if the
         * driver registers devlink port regions, optional otherwise.
         */
        int        (*port_setup)(struct dsa_switch *ds, int port);
        void        (*port_teardown)(struct dsa_switch *ds, int port);

        /* NOTE(review): presumably returns driver-specific flags for the PHY
         * attached to @port - confirm against the caller.
         */
        u32        (*get_phy_flags)(struct dsa_switch *ds, int port);

        /*
         * Access to the switch's PHY registers.
         */
        int        (*phy_read)(struct dsa_switch *ds, int port, int regnum);
        int        (*phy_write)(struct dsa_switch *ds, int port,
                             int regnum, u16 val);

        /*
         * PHYLINK integration
         */
        void        (*phylink_get_caps)(struct dsa_switch *ds, int port,
                                    struct phylink_config *config);
        struct phylink_pcs *(*phylink_mac_select_pcs)(struct dsa_switch *ds,
                                                      int port,
                                                      phy_interface_t iface);
        void        (*phylink_mac_config)(struct dsa_switch *ds, int port,
                                      unsigned int mode,
                                      const struct phylink_link_state *state);
        void        (*phylink_mac_link_down)(struct dsa_switch *ds, int port,
                                         unsigned int mode,
                                         phy_interface_t interface);
        void        (*phylink_mac_link_up)(struct dsa_switch *ds, int port,
                                       unsigned int mode,
                                       phy_interface_t interface,
                                       struct phy_device *phydev,
                                       int speed, int duplex,
                                       bool tx_pause, bool rx_pause);
        void        (*phylink_fixed_state)(struct dsa_switch *ds, int port,
                                       struct phylink_link_state *state);
        /*
         * Port statistics counters.
         */
        void        (*get_strings)(struct dsa_switch *ds, int port,
                               u32 stringset, uint8_t *data);
        void        (*get_ethtool_stats)(struct dsa_switch *ds,
                                     int port, uint64_t *data);
        int        (*get_sset_count)(struct dsa_switch *ds, int port, int sset);
        void        (*get_ethtool_phy_stats)(struct dsa_switch *ds,
                                         int port, uint64_t *data);
        void        (*get_eth_phy_stats)(struct dsa_switch *ds, int port,
                                     struct ethtool_eth_phy_stats *phy_stats);
        void        (*get_eth_mac_stats)(struct dsa_switch *ds, int port,
                                     struct ethtool_eth_mac_stats *mac_stats);
        void        (*get_eth_ctrl_stats)(struct dsa_switch *ds, int port,
                                      struct ethtool_eth_ctrl_stats *ctrl_stats);
        void        (*get_rmon_stats)(struct dsa_switch *ds, int port,
                                  struct ethtool_rmon_stats *rmon_stats,
                                  const struct ethtool_rmon_hist_range **ranges);
        void        (*get_stats64)(struct dsa_switch *ds, int port,
                                   struct rtnl_link_stats64 *s);
        void        (*get_pause_stats)(struct dsa_switch *ds, int port,
                                   struct ethtool_pause_stats *pause_stats);
        void        (*self_test)(struct dsa_switch *ds, int port,
                             struct ethtool_test *etest, u64 *data);

        /*
         * ethtool Wake-on-LAN
         */
        void        (*get_wol)(struct dsa_switch *ds, int port,
                           struct ethtool_wolinfo *w);
        int        (*set_wol)(struct dsa_switch *ds, int port,
                           struct ethtool_wolinfo *w);

        /*
         * ethtool timestamp info
         */
        int        (*get_ts_info)(struct dsa_switch *ds, int port,
                               struct ethtool_ts_info *ts);

        /*
         * ethtool MAC merge layer
         */
        int        (*get_mm)(struct dsa_switch *ds, int port,
                          struct ethtool_mm_state *state);
        int        (*set_mm)(struct dsa_switch *ds, int port,
                          struct ethtool_mm_cfg *cfg,
                          struct netlink_ext_ack *extack);
        void        (*get_mm_stats)(struct dsa_switch *ds, int port,
                                struct ethtool_mm_stats *stats);

        /*
         * DCB ops
         */
        int        (*port_get_default_prio)(struct dsa_switch *ds, int port);
        int        (*port_set_default_prio)(struct dsa_switch *ds, int port,
                                         u8 prio);
        int        (*port_get_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp);
        int        (*port_add_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp,
                                      u8 prio);
        int        (*port_del_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp,
                                      u8 prio);
        int        (*port_set_apptrust)(struct dsa_switch *ds, int port,
                                     const u8 *sel, int nsel);
        int        (*port_get_apptrust)(struct dsa_switch *ds, int port, u8 *sel,
                                     int *nsel);

        /*
         * Suspend and resume
         */
        int        (*suspend)(struct dsa_switch *ds);
        int        (*resume)(struct dsa_switch *ds);

        /*
         * Port enable/disable
         */
        int        (*port_enable)(struct dsa_switch *ds, int port,
                               struct phy_device *phy);
        void        (*port_disable)(struct dsa_switch *ds, int port);


        /*
         * Notification for MAC address changes on user ports. Drivers can
         * currently only veto operations. They should not use the method to
         * program the hardware, since the operation is not rolled back in case
         * of other errors.
         */
        int        (*port_set_mac_address)(struct dsa_switch *ds, int port,
                                        const unsigned char *addr);

        /*
         * Compatibility between device trees defining multiple CPU ports and
         * drivers which are not OK to use by default the numerically smallest
         * CPU port of a switch for its local ports. This can return NULL,
         * meaning "don't know/don't care".
         */
        struct dsa_port *(*preferred_default_local_cpu_port)(struct dsa_switch *ds);

        /*
         * Port's MAC EEE settings
         */
        int        (*set_mac_eee)(struct dsa_switch *ds, int port,
                               struct ethtool_keee *e);
        int        (*get_mac_eee)(struct dsa_switch *ds, int port,
                               struct ethtool_keee *e);

        /* EEPROM access */
        int        (*get_eeprom_len)(struct dsa_switch *ds);
        int        (*get_eeprom)(struct dsa_switch *ds,
                              struct ethtool_eeprom *eeprom, u8 *data);
        int        (*set_eeprom)(struct dsa_switch *ds,
                              struct ethtool_eeprom *eeprom, u8 *data);

        /*
         * Register access.
         */
        int        (*get_regs_len)(struct dsa_switch *ds, int port);
        void        (*get_regs)(struct dsa_switch *ds, int port,
                            struct ethtool_regs *regs, void *p);

        /*
         * Upper device tracking.
         */
        int        (*port_prechangeupper)(struct dsa_switch *ds, int port,
                                       struct netdev_notifier_changeupper_info *info);

        /*
         * Bridge integration
         */
        int        (*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs);
        int        (*port_bridge_join)(struct dsa_switch *ds, int port,
                                    struct dsa_bridge bridge,
                                    bool *tx_fwd_offload,
                                    struct netlink_ext_ack *extack);
        void        (*port_bridge_leave)(struct dsa_switch *ds, int port,
                                     struct dsa_bridge bridge);
        void        (*port_stp_state_set)(struct dsa_switch *ds, int port,
                                      u8 state);
        int        (*port_mst_state_set)(struct dsa_switch *ds, int port,
                                      const struct switchdev_mst_state *state);
        void        (*port_fast_age)(struct dsa_switch *ds, int port);
        int        (*port_vlan_fast_age)(struct dsa_switch *ds, int port, u16 vid);
        int        (*port_pre_bridge_flags)(struct dsa_switch *ds, int port,
                                         struct switchdev_brport_flags flags,
                                         struct netlink_ext_ack *extack);
        int        (*port_bridge_flags)(struct dsa_switch *ds, int port,
                                     struct switchdev_brport_flags flags,
                                     struct netlink_ext_ack *extack);
        void        (*port_set_host_flood)(struct dsa_switch *ds, int port,
                                       bool uc, bool mc);

        /*
         * VLAN support
         */
        int        (*port_vlan_filtering)(struct dsa_switch *ds, int port,
                                       bool vlan_filtering,
                                       struct netlink_ext_ack *extack);
        int        (*port_vlan_add)(struct dsa_switch *ds, int port,
                                 const struct switchdev_obj_port_vlan *vlan,
                                 struct netlink_ext_ack *extack);
        int        (*port_vlan_del)(struct dsa_switch *ds, int port,
                                 const struct switchdev_obj_port_vlan *vlan);
        int        (*vlan_msti_set)(struct dsa_switch *ds, struct dsa_bridge bridge,
                                 const struct switchdev_vlan_msti *msti);

        /*
         * Forwarding database
         */
        int        (*port_fdb_add)(struct dsa_switch *ds, int port,
                                const unsigned char *addr, u16 vid,
                                struct dsa_db db);
        int        (*port_fdb_del)(struct dsa_switch *ds, int port,
                                const unsigned char *addr, u16 vid,
                                struct dsa_db db);
        int        (*port_fdb_dump)(struct dsa_switch *ds, int port,
                                 dsa_fdb_dump_cb_t *cb, void *data);
        int        (*lag_fdb_add)(struct dsa_switch *ds, struct dsa_lag lag,
                               const unsigned char *addr, u16 vid,
                               struct dsa_db db);
        int        (*lag_fdb_del)(struct dsa_switch *ds, struct dsa_lag lag,
                               const unsigned char *addr, u16 vid,
                               struct dsa_db db);

        /*
         * Multicast database
         */
        int        (*port_mdb_add)(struct dsa_switch *ds, int port,
                                const struct switchdev_obj_port_mdb *mdb,
                                struct dsa_db db);
        int        (*port_mdb_del)(struct dsa_switch *ds, int port,
                                const struct switchdev_obj_port_mdb *mdb,
                                struct dsa_db db);
        /*
         * RXNFC
         */
        int        (*get_rxnfc)(struct dsa_switch *ds, int port,
                             struct ethtool_rxnfc *nfc, u32 *rule_locs);
        int        (*set_rxnfc)(struct dsa_switch *ds, int port,
                             struct ethtool_rxnfc *nfc);

        /*
         * TC integration
         */
        int        (*cls_flower_add)(struct dsa_switch *ds, int port,
                                  struct flow_cls_offload *cls, bool ingress);
        int        (*cls_flower_del)(struct dsa_switch *ds, int port,
                                  struct flow_cls_offload *cls, bool ingress);
        int        (*cls_flower_stats)(struct dsa_switch *ds, int port,
                                    struct flow_cls_offload *cls, bool ingress);
        int        (*port_mirror_add)(struct dsa_switch *ds, int port,
                                   struct dsa_mall_mirror_tc_entry *mirror,
                                   bool ingress, struct netlink_ext_ack *extack);
        void        (*port_mirror_del)(struct dsa_switch *ds, int port,
                                   struct dsa_mall_mirror_tc_entry *mirror);
        int        (*port_policer_add)(struct dsa_switch *ds, int port,
                                    struct dsa_mall_policer_tc_entry *policer);
        void        (*port_policer_del)(struct dsa_switch *ds, int port);
        int        (*port_setup_tc)(struct dsa_switch *ds, int port,
                                 enum tc_setup_type type, void *type_data);

        /*
         * Cross-chip operations
         */
        int        (*crosschip_bridge_join)(struct dsa_switch *ds, int tree_index,
                                         int sw_index, int port,
                                         struct dsa_bridge bridge,
                                         struct netlink_ext_ack *extack);
        void        (*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index,
                                          int sw_index, int port,
                                          struct dsa_bridge bridge);
        int        (*crosschip_lag_change)(struct dsa_switch *ds, int sw_index,
                                        int port);
        int        (*crosschip_lag_join)(struct dsa_switch *ds, int sw_index,
                                      int port, struct dsa_lag lag,
                                      struct netdev_lag_upper_info *info,
                                      struct netlink_ext_ack *extack);
        int        (*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index,
                                       int port, struct dsa_lag lag);

        /*
         * PTP functionality
         */
        int        (*port_hwtstamp_get)(struct dsa_switch *ds, int port,
                                     struct ifreq *ifr);
        int        (*port_hwtstamp_set)(struct dsa_switch *ds, int port,
                                     struct ifreq *ifr);
        void        (*port_txtstamp)(struct dsa_switch *ds, int port,
                                 struct sk_buff *skb);
        bool        (*port_rxtstamp)(struct dsa_switch *ds, int port,
                                 struct sk_buff *skb, unsigned int type);

        /* Devlink parameters, etc */
        int        (*devlink_param_get)(struct dsa_switch *ds, u32 id,
                                     struct devlink_param_gset_ctx *ctx);
        int        (*devlink_param_set)(struct dsa_switch *ds, u32 id,
                                     struct devlink_param_gset_ctx *ctx);
        int        (*devlink_info_get)(struct dsa_switch *ds,
                                    struct devlink_info_req *req,
                                    struct netlink_ext_ack *extack);
        int        (*devlink_sb_pool_get)(struct dsa_switch *ds,
                                       unsigned int sb_index, u16 pool_index,
                                       struct devlink_sb_pool_info *pool_info);
        int        (*devlink_sb_pool_set)(struct dsa_switch *ds, unsigned int sb_index,
                                       u16 pool_index, u32 size,
                                       enum devlink_sb_threshold_type threshold_type,
                                       struct netlink_ext_ack *extack);
        int        (*devlink_sb_port_pool_get)(struct dsa_switch *ds, int port,
                                            unsigned int sb_index, u16 pool_index,
                                            u32 *p_threshold);
        int        (*devlink_sb_port_pool_set)(struct dsa_switch *ds, int port,
                                            unsigned int sb_index, u16 pool_index,
                                            u32 threshold,
                                            struct netlink_ext_ack *extack);
        int        (*devlink_sb_tc_pool_bind_get)(struct dsa_switch *ds, int port,
                                               unsigned int sb_index, u16 tc_index,
                                               enum devlink_sb_pool_type pool_type,
                                               u16 *p_pool_index, u32 *p_threshold);
        int        (*devlink_sb_tc_pool_bind_set)(struct dsa_switch *ds, int port,
                                               unsigned int sb_index, u16 tc_index,
                                               enum devlink_sb_pool_type pool_type,
                                               u16 pool_index, u32 threshold,
                                               struct netlink_ext_ack *extack);
        int        (*devlink_sb_occ_snapshot)(struct dsa_switch *ds,
                                           unsigned int sb_index);
        int        (*devlink_sb_occ_max_clear)(struct dsa_switch *ds,
                                            unsigned int sb_index);
        int        (*devlink_sb_occ_port_pool_get)(struct dsa_switch *ds, int port,
                                                unsigned int sb_index, u16 pool_index,
                                                u32 *p_cur, u32 *p_max);
        int        (*devlink_sb_occ_tc_port_bind_get)(struct dsa_switch *ds, int port,
                                                   unsigned int sb_index, u16 tc_index,
                                                   enum devlink_sb_pool_type pool_type,
                                                   u32 *p_cur, u32 *p_max);

        /*
         * MTU change functionality. Switches can also adjust their MRU through
         * this method. By MTU, one understands the SDU (L2 payload) length.
         * If the switch needs to account for the DSA tag on the CPU port, this
         * method needs to do so privately.
         */
        int        (*port_change_mtu)(struct dsa_switch *ds, int port,
                                   int new_mtu);
        int        (*port_max_mtu)(struct dsa_switch *ds, int port);

        /*
         * LAG integration
         */
        int        (*port_lag_change)(struct dsa_switch *ds, int port);
        int        (*port_lag_join)(struct dsa_switch *ds, int port,
                                 struct dsa_lag lag,
                                 struct netdev_lag_upper_info *info,
                                 struct netlink_ext_ack *extack);
        int        (*port_lag_leave)(struct dsa_switch *ds, int port,
                                  struct dsa_lag lag);

        /*
         * HSR integration
         */
        int        (*port_hsr_join)(struct dsa_switch *ds, int port,
                                 struct net_device *hsr,
                                 struct netlink_ext_ack *extack);
        int        (*port_hsr_leave)(struct dsa_switch *ds, int port,
                                  struct net_device *hsr);

        /*
         * MRP integration
         */
        int        (*port_mrp_add)(struct dsa_switch *ds, int port,
                                const struct switchdev_obj_mrp *mrp);
        int        (*port_mrp_del)(struct dsa_switch *ds, int port,
                                const struct switchdev_obj_mrp *mrp);
        int        (*port_mrp_add_ring_role)(struct dsa_switch *ds, int port,
                                          const struct switchdev_obj_ring_role_mrp *mrp);
        int        (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port,
                                          const struct switchdev_obj_ring_role_mrp *mrp);

        /*
         * tag_8021q operations
         */
        int        (*tag_8021q_vlan_add)(struct dsa_switch *ds, int port, u16 vid,
                                      u16 flags);
        int        (*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid);

        /*
         * DSA conduit tracking operations
         */
        void        (*conduit_state_change)(struct dsa_switch *ds,
                                        const struct net_device *conduit,
                                        bool operational);
};

/* Wrapper around DEVLINK_PARAM_DRIVER() that forwards @_id, @_name, @_type
 * and @_cmodes unchanged, supplies dsa_devlink_param_get() and
 * dsa_devlink_param_set() as the get/set handlers and passes NULL as the
 * final argument.
 */
#define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes)                \
        DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes,                \
                             dsa_devlink_param_get, dsa_devlink_param_set, NULL)

/*
 * devlink parameter support (declarations only). dsa_devlink_param_get() and
 * dsa_devlink_param_set() are the accessors that DSA_DEVLINK_PARAM_DRIVER()
 * plugs into every parameter it declares. The register/unregister pair takes
 * an array of @params_count parameters for switch @ds.
 */
int dsa_devlink_param_get(struct devlink *dl, u32 id,
                          struct devlink_param_gset_ctx *ctx);
int dsa_devlink_param_set(struct devlink *dl, u32 id,
                          struct devlink_param_gset_ctx *ctx,
                          struct netlink_ext_ack *extack);
int dsa_devlink_params_register(struct dsa_switch *ds,
                                const struct devlink_param *params,
                                size_t params_count);
void dsa_devlink_params_unregister(struct dsa_switch *ds,
                                   const struct devlink_param *params,
                                   size_t params_count);
/*
 * devlink resource support (declarations only), keyed by @resource_id.
 * NOTE(review): presumably thin wrappers around the devlink resource API
 * acting on the devlink instance of @ds - confirm in the implementation.
 */
int dsa_devlink_resource_register(struct dsa_switch *ds,
                                  const char *resource_name,
                                  u64 resource_size,
                                  u64 resource_id,
                                  u64 parent_resource_id,
                                  const struct devlink_resource_size_params *size_params);

void dsa_devlink_resources_unregister(struct dsa_switch *ds);

void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds,
                                           u64 resource_id,
                                           devlink_resource_occ_get_t *occ_get,
                                           void *occ_get_priv);
void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds,
                                             u64 resource_id);
/*
 * devlink region support (declarations only). Both create helpers return a
 * struct devlink_region pointer, which dsa_devlink_region_destroy() accepts.
 * NOTE(review): the failure convention of the create helpers (NULL vs.
 * ERR_PTR) is not visible here - check the definitions before testing the
 * result.
 */
struct devlink_region *
dsa_devlink_region_create(struct dsa_switch *ds,
                          const struct devlink_region_ops *ops,
                          u32 region_max_snapshots, u64 region_size);
struct devlink_region *
dsa_devlink_port_region_create(struct dsa_switch *ds,
                               int port,
                               const struct devlink_port_region_ops *ops,
                               u32 region_max_snapshots, u64 region_size);
void dsa_devlink_region_destroy(struct devlink_region *region);

/*
 * Declaration only. Presumably maps @netdev to its DSA port; the result for a
 * net_device that is not a DSA port is not visible here - check the
 * definition before dereferencing the return value.
 */
struct dsa_port *dsa_port_from_netdev(struct net_device *netdev);

/*
 * Layout of the devlink private area as used by DSA: the helpers below cast
 * the devlink_priv() pointer to this structure and read ->ds from it.
 */
struct dsa_devlink_priv {
        struct dsa_switch *ds; /* switch returned by dsa_devlink_to_ds() */
};

/*
 * Return the DSA switch recorded in the private area of devlink instance @dl.
 * The private area is interpreted as a struct dsa_devlink_priv.
 */
static inline struct dsa_switch *dsa_devlink_to_ds(struct devlink *dl)
{
        return ((struct dsa_devlink_priv *)devlink_priv(dl))->ds;
}

/*
 * Same lookup as dsa_devlink_to_ds(), but starting from a devlink port:
 * follow port->devlink and return the switch kept in that instance's private
 * data.
 */
static inline
struct dsa_switch *dsa_devlink_port_to_ds(struct devlink_port *port)
{
        return dsa_devlink_to_ds(port->devlink);
}

/*
 * Return the index field of devlink port @port as an int, the type the DSA
 * callbacks in this header use for their port argument.
 */
static inline int dsa_devlink_port_to_port(struct devlink_port *port)
{
        int index = port->index;

        return index;
}

/*
 * Pairs a dsa_switch_ops table with a list_head so that entries can be
 * chained on a list.
 * NOTE(review): which list these entries live on, and who walks it, is not
 * visible in this part of the header - confirm against the users.
 */
struct dsa_switch_driver {
        struct list_head        list;   /* linkage for chaining entries */
        const struct dsa_switch_ops *ops; /* operations table for this entry */
};

/*
 * Declarations only. Judging by the names, these presumably report whether
 * the given FDB address/VID (or MDB object) on @port is also present in a
 * database other than @db - confirm against the definitions, which are not
 * part of this header section.
 */
bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port,
                                 const unsigned char *addr, u16 vid,
                                 struct dsa_db db);
bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port,
                                 const struct switchdev_obj_port_mdb *mdb,
                                 struct dsa_db db);

/* Keep inline for faster access in hot path.
 *
 * Returns true only when CONFIG_NET_DSA is enabled and @dev has a dsa_ptr
 * whose rcv hook is set; in every other case the answer is false.
 */
static inline bool netdev_uses_dsa(const struct net_device *dev)
{
#if IS_ENABLED(CONFIG_NET_DSA)
        if (dev->dsa_ptr && dev->dsa_ptr->rcv)
                return true;
#endif
        return false;
}

/* Flow dissector helper shared by every DSA tagger that pushes the EtherType
 * to the right. Tail taggers do not break dissection and so do not need it.
 *
 * When this runs, skb->data is ETH_HLEN bytes into the frame, i.e. 2 bytes
 * past the place where the EtherType would normally have been. The helper
 * reports:
 *  - offset: the tagger overhead in bytes, taken from the needed_headroom of
 *    the tagger ops attached to skb->dev->dsa_ptr. This is the distance
 *    between skb->data and the real EtherType described above.
 *  - proto: the real EtherType, read as the last __be16 of the tag_len bytes
 *    that start at skb->data (index tag_len / 2 - 1 in 16-bit units).
 *
 * With CONFIG_NET_DSA disabled the function body is empty and neither output
 * is written.
 */
static inline void dsa_tag_generic_flow_dissect(const struct sk_buff *skb,
                                                __be16 *proto, int *offset)
{
#if IS_ENABLED(CONFIG_NET_DSA)
        const struct dsa_device_ops *tagger = skb->dev->dsa_ptr->tag_ops;
        int overhead = tagger->needed_headroom;
        const __be16 *shorts = (const __be16 *)skb->data;

        *offset = overhead;
        /* The real EtherType occupies the final two bytes of the tag area. */
        *proto = shorts[overhead / 2 - 1];
#endif
}

/*
 * Switch registration and lifetime entry points (declarations only; the
 * definitions are not part of this header section).
 */
void dsa_unregister_switch(struct dsa_switch *ds);
int dsa_register_switch(struct dsa_switch *ds);
void dsa_switch_shutdown(struct dsa_switch *ds);
struct dsa_switch *dsa_switch_find(int tree_index, int sw_index);
void dsa_flush_workqueue(void);
/*
 * Suspend/resume hooks: declared as out-of-line functions when
 * CONFIG_PM_SLEEP is defined, otherwise provided as inline stubs that do
 * nothing and return 0.
 */
#ifdef CONFIG_PM_SLEEP
int dsa_switch_suspend(struct dsa_switch *ds);
int dsa_switch_resume(struct dsa_switch *ds);
#else
static inline int dsa_switch_suspend(struct dsa_switch *ds)
{
        return 0;
}
static inline int dsa_switch_resume(struct dsa_switch *ds)
{
        return 0;
}
#endif /* CONFIG_PM_SLEEP */

/*
 * dsa_user_dev_check() is only declared as a real function when
 * CONFIG_NET_DSA is enabled. Otherwise an inline stub that always returns
 * false stands in for it, so callers need no #ifdef of their own.
 */
#if IS_ENABLED(CONFIG_NET_DSA)
bool dsa_user_dev_check(const struct net_device *dev);
#else
static inline bool dsa_user_dev_check(const struct net_device *dev)
{
        return false;
}
#endif

/*
 * Declarations only; neither function is defined in the visible part of this
 * header. dsa_enqueue_skb() returns a netdev_tx_t status for @skb on @dev.
 * NOTE(review): dsa_port_phylink_mac_change() presumably reports a link
 * up/down change for @port of @ds - confirm against the definition.
 */
netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev);
void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up);

#endif
























































































































































































































































    2 






    3 





















    2 











    3 















    5 





    5 

















































    1 





































































































    6 


    2 
    4 









    5 


















    3 

































































































































































    1 








    1 






    2 























































    8 




    7 





    7 



    8 











































































































































































































































































































    1 
    5 



    6 














    5 


    6 































    1 





    1 















    3 






    3 





    1 


    1 





    2 









    2 


    1 
    1 























































    1 














    1 




















    1 


























































































    1 


















































    5 


    5 




    4 



















    1 
    2 
















































































    4 

















    3 






























    4 



    3 
    4 

































    3 








































































    3 































    3 






    3 



    3 








    3 


    3 









    1 














































































    1 











    1 














    1 












    1 



    1 




    1 











    1 












    1 

    1 




    1 







    2 





































    2 









































    2 

















    2 





    2 

    2 





    1 





    1 


































    1 






    1 









    1 
    1 





    1 












    1 
















    1 


    1 




    1 












    1 

















    1 

    1 




    1 












    2 






















    1 









    2 




    2 























    2 




















    1 






    1 




    2 























    2 












    2 












    1 
    1 



    1 












    2 

    2 









    2 



    2 










    1 









    1 











    1 




    1 













    1 








    1 



    1 



    1 















    2 











    2 
    1 




    2 

    2 

































    5 
    1 

    4 
    2 

    5 
    1 











    1 


    5 








    6 






    6 












    4 





















    4 
















    4 







    4 











    3 

    1 





    4 

















    3 


















    3 



    4 
















    3 











    3 

    3 




    3 










    1 


























    1 











    1 




    1 











    1 








    1 


















    1 











    2 



















    1 

    2 





    2 



    1 



























    1 
    1 


    1 

















    1 











    1 

    1 




    1 












    1 








    1 











    1 









    1 












    1 

    1 

    1 

    1 











    1 





































    1 


    1 



















    1 















    1 






    1 























































































































































































































































































































































































































































































































































































































































    1 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET                An implementation of the SOCKET network access protocol.
 *
 * Version:        @(#)socket.c        1.1.93        18/02/95
 *
 * Authors:        Orest Zborowski, <obz@Kodak.COM>
 *                Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 * Fixes:
 *                Anonymous        :        NOTSOCK/BADF cleanup. Error fix in
 *                                        shutdown()
 *                Alan Cox        :        verify_area() fixes
 *                Alan Cox        :        Removed DDI
 *                Jonathan Kamens        :        SOCK_DGRAM reconnect bug
 *                Alan Cox        :        Moved a load of checks to the very
 *                                        top level.
 *                Alan Cox        :        Move address structures to/from user
 *                                        mode above the protocol layers.
 *                Rob Janssen        :        Allow 0 length sends.
 *                Alan Cox        :        Asynchronous I/O support (cribbed from the
 *                                        tty drivers).
 *                Niibe Yutaka        :        Asynchronous I/O for writes (4.4BSD style)
 *                Jeff Uphoff        :        Made max number of sockets command-line
 *                                        configurable.
 *                Matti Aarnio        :        Made the number of sockets dynamic,
 *                                        to be allocated when needed, and mr.
 *                                        Uphoff's max is used as max to be
 *                                        allowed to allocate.
 *                Linus                :        Argh. removed all the socket allocation
 *                                        altogether: it's in the inode now.
 *                Alan Cox        :        Made sock_alloc()/sock_release() public
 *                                        for NetROM and future kernel nfsd type
 *                                        stuff.
 *                Alan Cox        :        sendmsg/recvmsg basics.
 *                Tom Dyas        :        Export net symbols.
 *                Marcin Dalecki        :        Fixed problems with CONFIG_NET="n".
 *                Alan Cox        :        Added thread locking to sys_* calls
 *                                        for sockets. May have errors at the
 *                                        moment.
 *                Kevin Buhr        :        Fixed the dumb errors in the above.
 *                Andi Kleen        :        Some small cleanups, optimizations,
 *                                        and fixed a copy_from_user() bug.
 *                Tigran Aivazian        :        sys_send(args) calls sys_sendto(args, NULL, 0)
 *                Tigran Aivazian        :        Made listen(2) backlog sanity checks
 *                                        protocol-independent
 *
 *        This module is effectively the top level interface to the BSD socket
 *        paradigm.
 *
 *        Based upon Swansea University Computer Society NET3.039
 */

#include <linux/bpf-cgroup.h>
#include <linux/ethtool.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/interrupt.h>
#include <linux/thread_info.h>
#include <linux/rcupdate.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mutex.h>
#include <linux/if_bridge.h>
#include <linux/if_vlan.h>
#include <linux/ptp_classify.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/cache.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/kmod.h>
#include <linux/audit.h>
#include <linux/wireless.h>
#include <linux/nsproxy.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include <linux/nospec.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/io_uring/net.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>

#include <net/compat.h>
#include <net/wext.h>
#include <net/cls_cgroup.h>

#include <net/sock.h>
#include <linux/netfilter.h>

#include <linux/if_tun.h>
#include <linux/ipv6_route.h>
#include <linux/route.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <net/busy_poll.h>
#include <linux/errqueue.h>
#include <linux/ptp_clock_kernel.h>
#include <trace/events/sock.h>

#ifdef CONFIG_NET_RX_BUSY_POLL
/*
 * Busy-poll tunables; only built when CONFIG_NET_RX_BUSY_POLL is set.
 * NOTE(review): presumably exported as sysctls and consumed through
 * <net/busy_poll.h> -- confirm against the users outside this chunk.
 */
unsigned int sysctl_net_busy_read __read_mostly;
unsigned int sysctl_net_busy_poll __read_mostly;
#endif

/* Forward declarations of the handlers wired into socket_file_ops below. */
static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to);
static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from);
static int sock_mmap(struct file *file, struct vm_area_struct *vma);

static int sock_close(struct inode *inode, struct file *file);
static __poll_t sock_poll(struct file *file,
                              struct poll_table_struct *wait);
static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
#ifdef CONFIG_COMPAT
/* Only declared (and only used in socket_file_ops) with CONFIG_COMPAT. */
static long compat_sock_ioctl(struct file *file,
                              unsigned int cmd, unsigned long arg);
#endif
static int sock_fasync(int fd, struct file *filp, int on);
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
                                struct pipe_inode_info *pipe, size_t len,
                                unsigned int flags);
static void sock_splice_eof(struct file *file);

#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo handler for socket files: forwards to the protocol's own
 * show_fdinfo() hook when the socket's proto_ops provides one.
 */
static void sock_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct socket *so = f->private_data;
        const struct proto_ops *proto = READ_ONCE(so->ops);

        /* Nothing to print for protocols without the hook. */
        if (!proto->show_fdinfo)
                return;

        proto->show_fdinfo(m, so);
}
#else
#define sock_show_fdinfo NULL
#endif

/*
 *        Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 *        in the operation structures but are done directly via the socketcall() multiplexor.
 */

/*
 * File operations attached to every socket file: sock_alloc_file() passes
 * this table to alloc_file_pseudo(), and sock_from_file() recognises a
 * socket by comparing file->f_op against it.
 */
static const struct file_operations socket_file_ops = {
        .owner =        THIS_MODULE,
        .llseek =        no_llseek,
        .read_iter =        sock_read_iter,
        .write_iter =        sock_write_iter,
        .poll =                sock_poll,
        .unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = compat_sock_ioctl,
#endif
        .uring_cmd =    io_uring_cmd_sock,
        .mmap =                sock_mmap,
        .release =        sock_close,
        .fasync =        sock_fasync,
        .splice_write = splice_to_socket,
        .splice_read =        sock_splice_read,
        .splice_eof =        sock_splice_eof,
        /* sock_show_fdinfo is #defined to NULL without CONFIG_PROC_FS. */
        .show_fdinfo =        sock_show_fdinfo,
};

/*
 * Printable names of the protocol families, indexed by PF_* value.
 * Any index that has no designated initializer below is left NULL.
 */
static const char * const pf_family_names[] = {
        [PF_UNSPEC]        = "PF_UNSPEC",
        [PF_UNIX]        = "PF_UNIX/PF_LOCAL",
        [PF_INET]        = "PF_INET",
        [PF_AX25]        = "PF_AX25",
        [PF_IPX]        = "PF_IPX",
        [PF_APPLETALK]        = "PF_APPLETALK",
        [PF_NETROM]        = "PF_NETROM",
        [PF_BRIDGE]        = "PF_BRIDGE",
        [PF_ATMPVC]        = "PF_ATMPVC",
        [PF_X25]        = "PF_X25",
        [PF_INET6]        = "PF_INET6",
        [PF_ROSE]        = "PF_ROSE",
        [PF_DECnet]        = "PF_DECnet",
        [PF_NETBEUI]        = "PF_NETBEUI",
        [PF_SECURITY]        = "PF_SECURITY",
        [PF_KEY]        = "PF_KEY",
        [PF_NETLINK]        = "PF_NETLINK/PF_ROUTE",
        [PF_PACKET]        = "PF_PACKET",
        [PF_ASH]        = "PF_ASH",
        [PF_ECONET]        = "PF_ECONET",
        [PF_ATMSVC]        = "PF_ATMSVC",
        [PF_RDS]        = "PF_RDS",
        [PF_SNA]        = "PF_SNA",
        [PF_IRDA]        = "PF_IRDA",
        [PF_PPPOX]        = "PF_PPPOX",
        [PF_WANPIPE]        = "PF_WANPIPE",
        [PF_LLC]        = "PF_LLC",
        [PF_IB]                = "PF_IB",
        [PF_MPLS]        = "PF_MPLS",
        [PF_CAN]        = "PF_CAN",
        [PF_TIPC]        = "PF_TIPC",
        [PF_BLUETOOTH]        = "PF_BLUETOOTH",
        [PF_IUCV]        = "PF_IUCV",
        [PF_RXRPC]        = "PF_RXRPC",
        [PF_ISDN]        = "PF_ISDN",
        [PF_PHONET]        = "PF_PHONET",
        [PF_IEEE802154]        = "PF_IEEE802154",
        [PF_CAIF]        = "PF_CAIF",
        [PF_ALG]        = "PF_ALG",
        [PF_NFC]        = "PF_NFC",
        [PF_VSOCK]        = "PF_VSOCK",
        [PF_KCM]        = "PF_KCM",
        [PF_QIPCRTR]        = "PF_QIPCRTR",
        [PF_SMC]        = "PF_SMC",
        [PF_XDP]        = "PF_XDP",
        [PF_MCTP]        = "PF_MCTP",
};

/*
 *        The protocol list. Each protocol is registered in here.
 */

/*
 * net_families[] holds one __rcu pointer per protocol family (NPROTO slots).
 * NOTE(review): net_family_lock presumably serialises writers of the table
 * while readers rely on RCU -- confirm at the registration sites.
 */
static DEFINE_SPINLOCK(net_family_lock);
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;

/*
 * Support routines.
 * Move socket addresses back and forth across the kernel/user
 * divide and look after the messy bits.
 */

/**
 *        move_addr_to_kernel        -        copy a socket address into kernel space
 *        @uaddr: Address in user space
 *        @ulen: Length in user space
 *        @kaddr: Address in kernel space
 *
 *        Copies @ulen bytes of address from @uaddr into @kaddr. A negative
 *        length, or one larger than struct sockaddr_storage, yields -EINVAL.
 *        A zero length copies nothing and returns 0. If the user copy
 *        faults, -EFAULT is returned. Otherwise the result of
 *        audit_sockaddr() is returned.
 */

int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr)
{
        /* Negative lengths are rejected before the unsigned size comparison. */
        if (ulen < 0)
                return -EINVAL;
        if (ulen > sizeof(*kaddr))
                return -EINVAL;

        if (!ulen)
                return 0;

        if (copy_from_user(kaddr, uaddr, ulen) != 0)
                return -EFAULT;

        return audit_sockaddr(ulen, kaddr);
}

/**
 *        move_addr_to_user        -        copy an address to user space
 *        @kaddr: kernel space address
 *        @klen: length of address in kernel
 *        @uaddr: user space address
 *        @ulen: pointer to user length field
 *
 *        On entry *@ulen is the size of the user buffer. At most that many
 *        bytes (and never more than @klen) are copied to @uaddr, after which
 *        *@ulen is overwritten with @klen, i.e. the untruncated length.
 *
 *        Returns -EINVAL for a negative buffer size, -EFAULT if the buffer
 *        or the length field cannot be accessed, -ENOMEM if
 *        audit_sockaddr() fails, and zero on success.
 */

static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
                             void __user *uaddr, int __user *ulen)
{
        int avail;
        int ret;

        BUG_ON(klen > sizeof(struct sockaddr_storage));

        ret = get_user(avail, ulen);
        if (ret)
                return ret;

        /* Never copy more than the kernel actually has. */
        if (klen < avail)
                avail = klen;
        if (avail < 0)
                return -EINVAL;

        if (avail != 0) {
                if (audit_sockaddr(klen, kaddr))
                        return -ENOMEM;
                if (copy_to_user(uaddr, kaddr, avail))
                        return -EFAULT;
        }

        /*
         *      "fromlen shall refer to the value before truncation.."
         *                      1003.1g
         */
        return __put_user(klen, ulen);
}

/* Slab cache backing struct socket_alloc; created by init_inodecache(). */
static struct kmem_cache *sock_inode_cachep __ro_after_init;

static struct inode *sock_alloc_inode(struct super_block *sb)
{
        struct socket_alloc *ei;

        ei = alloc_inode_sb(sb, sock_inode_cachep, GFP_KERNEL);
        if (!ei)
                return NULL;
        init_waitqueue_head(&ei->socket.wq.wait);
        ei->socket.wq.fasync_list = NULL;
        ei->socket.wq.flags = 0;

        ei->socket.state = SS_UNCONNECTED;
        ei->socket.flags = 0;
        ei->socket.ops = NULL;
        ei->socket.sk = NULL;
        ei->socket.file = NULL;

        return &ei->vfs_inode;
}

/*
 * ->free_inode for sockfs: return the socket_alloc that embeds @inode to
 * the socket inode cache.
 */
static void sock_free_inode(struct inode *inode)
{
        kmem_cache_free(sock_inode_cachep,
                        container_of(inode, struct socket_alloc, vfs_inode));
}

/*
 * Slab constructor for sock_inode_cachep: @foo is a struct socket_alloc
 * whose embedded VFS inode gets its one-time initialisation.
 */
static void init_once(void *foo)
{
        struct socket_alloc *sa = foo;

        inode_init_once(&sa->vfs_inode);
}

/*
 * Create the slab cache used for socket inodes. Failure to create it is
 * treated as fatal (BUG).
 */
static void init_inodecache(void)
{
        sock_inode_cachep =
                kmem_cache_create("sock_inode_cache",
                                  sizeof(struct socket_alloc), 0,
                                  SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
                                  SLAB_ACCOUNT,
                                  init_once);
        BUG_ON(!sock_inode_cachep);
}

/* Superblock operations for sockfs; installed by sockfs_init_fs_context(). */
static const struct super_operations sockfs_ops = {
        .alloc_inode        = sock_alloc_inode,
        .free_inode        = sock_free_inode,
        .statfs                = simple_statfs,
};

/*
 * sockfs_dname() is called from d_path(). It formats the name of a socket
 * dentry as "socket:[<inode number>]" into @buffer.
 */
static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
        unsigned long ino = d_inode(dentry)->i_ino;

        return dynamic_dname(buffer, buflen, "socket:[%lu]", ino);
}

/* Dentry operations for sockfs: only d_dname is provided. */
static const struct dentry_operations sockfs_dentry_operations = {
        .d_dname  = sockfs_dname,
};

/*
 * ->get for the sockprotoname xattr: the value is the dentry name including
 * its terminating NUL.
 *
 * With a NULL @value only the required size is reported. Otherwise the
 * name is copied into @value, or -ERANGE is returned if @size is too small.
 * Returns the number of bytes the value occupies.
 */
static int sockfs_xattr_get(const struct xattr_handler *handler,
                            struct dentry *dentry, struct inode *inode,
                            const char *suffix, void *value, size_t size)
{
        unsigned int needed = dentry->d_name.len + 1;

        /* Size query only. */
        if (!value)
                return needed;

        if (needed > size)
                return -ERANGE;

        memcpy(value, dentry->d_name.name, needed);
        return needed;
}

/*
 * Name of the xattr served by sockfs_xattr_get(): XATTR_SYSTEM_PREFIX
 * followed by "sockprotoname". The _LEN variant excludes the trailing NUL.
 */
#define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname"
#define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX)
#define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1)

/* Handler for that one exact name; only .get is provided. */
static const struct xattr_handler sockfs_xattr_handler = {
        .name = XATTR_NAME_SOCKPROTONAME,
        .get = sockfs_xattr_get,
};

/*
 * ->set handler for the security xattr prefix on sockfs. It performs no
 * work of its own and unconditionally returns -EAGAIN.
 */
static int sockfs_security_xattr_set(const struct xattr_handler *handler,
                                     struct mnt_idmap *idmap,
                                     struct dentry *dentry, struct inode *inode,
                                     const char *suffix, const void *value,
                                     size_t size, int flags)
{
        /* Handled by LSM. */
        return -EAGAIN;
}

/* Handler for the XATTR_SECURITY_PREFIX namespace; only .set is provided. */
static const struct xattr_handler sockfs_security_xattr_handler = {
        .prefix = XATTR_SECURITY_PREFIX,
        .set = sockfs_security_xattr_set,
};

/*
 * NULL-terminated list of sockfs xattr handlers; installed as ctx->xattr
 * by sockfs_init_fs_context().
 */
static const struct xattr_handler * const sockfs_xattr_handlers[] = {
        &sockfs_xattr_handler,
        &sockfs_security_xattr_handler,
        NULL
};

/*
 * ->init_fs_context for sockfs: set up a pseudo filesystem context with
 * SOCKFS_MAGIC and plug in the sockfs super, dentry and xattr tables.
 * Returns 0, or -ENOMEM if the pseudo context cannot be created.
 */
static int sockfs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *pfc;

        pfc = init_pseudo(fc, SOCKFS_MAGIC);
        if (!pfc)
                return -ENOMEM;

        pfc->ops = &sockfs_ops;
        pfc->dops = &sockfs_dentry_operations;
        pfc->xattr = sockfs_xattr_handlers;

        return 0;
}

/*
 * Mount used for all socket inodes and files: sock_alloc() takes its
 * superblock and sock_alloc_file() passes it to alloc_file_pseudo().
 * NOTE(review): presumably assigned once during init, outside this chunk.
 */
static struct vfsmount *sock_mnt __read_mostly;

/* Filesystem type descriptor for "sockfs". */
static struct file_system_type sock_fs_type = {
        .name =                "sockfs",
        .init_fs_context = sockfs_init_fs_context,
        .kill_sb =        kill_anon_super,
};

/*
 *        Obtains the first available file descriptor and sets it up for use.
 *
 *        These functions create file structures and map them into the fd space
 *        of the current process. On success they return a file descriptor,
 *        with the file struct implicitly stored in sock->file.
 *        Note that another thread may close file descriptor before we return
 *        from this function. We use the fact that now we do not refer
 *        to socket after mapping. If one day we will need it, this
 *        function will increment ref. count on file by 1.
 *
 *        In any case returned fd MAY BE not valid!
 *        This race condition is unavoidable
 *        with shared fd spaces, we cannot solve it inside kernel,
 *        but we take care of internal coherence yet.
 */

/**
 *        sock_alloc_file - Bind a &socket to a &file
 *        @sock: socket
 *        @flags: file status flags
 *        @dname: protocol name
 *
 *        Allocates a pseudo file on the sockfs mount for @sock, opened
 *        O_RDWR plus O_NONBLOCK if requested in @flags, and links the two
 *        through sock->file and file->private_data. If @dname is %NULL the
 *        name of the protocol that created sock->sk is used, or "" when the
 *        socket has no sk.
 *
 *        On failure @sock is released, and an ERR pointer is returned.
 *
 *        This function uses GFP_KERNEL internally.
 */

struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
        struct file *filp;

        if (!dname) {
                if (sock->sk)
                        dname = sock->sk->sk_prot_creator->name;
                else
                        dname = "";
        }

        filp = alloc_file_pseudo(SOCK_INODE(sock), sock_mnt, dname,
                                 O_RDWR | (flags & O_NONBLOCK),
                                 &socket_file_ops);
        if (IS_ERR(filp)) {
                /* The caller loses the socket on failure. */
                sock_release(sock);
                return filp;
        }

        filp->f_mode |= FMODE_NOWAIT;
        sock->file = filp;
        filp->private_data = sock;
        stream_open(SOCK_INODE(sock), filp);

        return filp;
}
EXPORT_SYMBOL(sock_alloc_file);

/*
 * Reserve a file descriptor, create the socket file for @sock and install
 * it. Returns the new fd, or a negative errno; in either failure case the
 * socket has been released (directly here, or by sock_alloc_file()).
 */
static int sock_map_fd(struct socket *sock, int flags)
{
        struct file *filp;
        int fd;

        fd = get_unused_fd_flags(flags);
        if (unlikely(fd < 0)) {
                sock_release(sock);
                return fd;
        }

        filp = sock_alloc_file(sock, flags, NULL);
        if (IS_ERR(filp)) {
                /* Give the reserved slot back; @sock is already released. */
                put_unused_fd(fd);
                return PTR_ERR(filp);
        }

        fd_install(fd, filp);
        return fd;
}

/**
 *        sock_from_file - Return the &socket bound to @file.
 *        @file: file
 *
 *        A file is a socket file exactly when its f_op is socket_file_ops.
 *        Returns %NULL for any other file.
 */

struct socket *sock_from_file(struct file *file)
{
        if (file->f_op != &socket_file_ops)
                return NULL;

        return file->private_data;        /* set in sock_alloc_file */
}
EXPORT_SYMBOL(sock_from_file);

/**
 *        sockfd_lookup - Go from a file number to its socket slot
 *        @fd: file handle
 *        @err: pointer to an error code return
 *
 *        Takes a reference on the file behind @fd and returns the socket
 *        bound to it. On failure %NULL is returned and *@err is set to
 *        -EBADF (no such file) or -ENOTSOCK (the file is not a socket, in
 *        which case the file reference is dropped again). *@err is left
 *        untouched on success.
 *
 *        On a success the socket object pointer is returned.
 */

struct socket *sockfd_lookup(int fd, int *err)
{
        struct file *filp = fget(fd);
        struct socket *sock;

        if (!filp) {
                *err = -EBADF;
                return NULL;
        }

        sock = sock_from_file(filp);
        if (sock)
                return sock;

        *err = -ENOTSOCK;
        fput(filp);
        return NULL;
}
EXPORT_SYMBOL(sockfd_lookup);

/*
 * fdget()-based variant of sockfd_lookup(). On success the socket is
 * returned and *@fput_needed records whether the caller has to drop a file
 * reference afterwards. On failure NULL is returned with *@err set to
 * -EBADF or -ENOTSOCK.
 *
 * Note that *@err is written with -EBADF before the lookup outcome is
 * known, so it is clobbered on the success path as well.
 */
static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
{
        struct fd f = fdget(fd);
        struct socket *sock;

        *err = -EBADF;
        if (!f.file)
                return NULL;

        sock = sock_from_file(f.file);
        if (unlikely(!sock)) {
                *err = -ENOTSOCK;
                fdput(f);
                return NULL;
        }

        *fput_needed = f.flags & FDPUT_FPUT;
        return sock;
}

/*
 * ->listxattr for socket inodes: the names reported by the security layer,
 * followed by the NUL-terminated sockprotoname attribute name.
 *
 * Returns the total number of bytes needed/used, the security layer's error
 * if it fails, or -ERANGE when a non-NULL @buffer of @size bytes is too
 * small. A NULL @buffer only computes the size.
 */
static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
                                size_t size)
{
        const ssize_t namelen = XATTR_NAME_SOCKPROTONAME_LEN + 1;
        ssize_t seclen;
        ssize_t total;

        seclen = security_inode_listsecurity(d_inode(dentry), buffer, size);
        if (seclen < 0)
                return seclen;

        total = seclen;
        if (buffer) {
                if (size < total)
                        return -ERANGE;
                /* Continue writing right after the security names. */
                buffer += seclen;
        }

        total += namelen;
        if (buffer) {
                if (size < total)
                        return -ERANGE;
                memcpy(buffer, XATTR_NAME_SOCKPROTONAME, namelen);
        }

        return total;
}

/*
 * ->setattr for socket inodes: apply the change through simple_setattr()
 * and, when the owner was changed successfully, mirror the new uid into
 * sk->sk_uid. Returns -ENOENT if the uid was changed but the socket no
 * longer has a sock attached.
 */
static int sockfs_setattr(struct mnt_idmap *idmap,
                          struct dentry *dentry, struct iattr *iattr)
{
        int err = simple_setattr(&nop_mnt_idmap, dentry, iattr);
        struct socket *sock;

        /* Nothing more to do on failure or when the uid is untouched. */
        if (err || !(iattr->ia_valid & ATTR_UID))
                return err;

        sock = SOCKET_I(d_inode(dentry));
        if (!sock->sk)
                return -ENOENT;

        sock->sk->sk_uid = iattr->ia_uid;
        return 0;
}

/* Inode operations assigned to every socket inode by sock_alloc(). */
static const struct inode_operations sockfs_inode_ops = {
        .listxattr = sockfs_listxattr,
        .setattr = sockfs_setattr,
};

/**
 *        sock_alloc - allocate a socket
 *
 *        Allocates a new inode on the sockfs superblock and returns the
 *        socket embedded alongside it. The inode gets a fresh inode number,
 *        mode S_IFSOCK | S_IRWXUGO, the caller's fsuid/fsgid and the sockfs
 *        inode operations. Returns NULL if no inode could be allocated.
 *        This function uses GFP_KERNEL internally.
 */

struct socket *sock_alloc(void)
{
        struct inode *inode = new_inode_pseudo(sock_mnt->mnt_sb);

        if (!inode)
                return NULL;

        inode->i_ino = get_next_ino();
        inode->i_mode = S_IFSOCK | S_IRWXUGO;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
        inode->i_op = &sockfs_inode_ops;

        return SOCKET_I(inode);
}
EXPORT_SYMBOL(sock_alloc);

/*
 * Tear down @sock: run the protocol's release hook if proto_ops are still
 * attached, then either drop the inode reference (no file attached) or
 * just clear sock->file.
 *
 * @inode, when non-NULL, is locked around the protocol release call.
 */
static void __sock_release(struct socket *sock, struct inode *inode)
{
        const struct proto_ops *ops = READ_ONCE(sock->ops);

        if (ops) {
                /* Read the owner now; sock->ops is cleared before module_put(). */
                struct module *owner = ops->owner;

                if (inode)
                        inode_lock(inode);
                ops->release(sock);
                sock->sk = NULL;
                if (inode)
                        inode_unlock(inode);
                sock->ops = NULL;
                module_put(owner);
        }

        /* Only reported, not cleaned up here. */
        if (sock->wq.fasync_list)
                pr_err("%s: fasync list not empty!\n", __func__);

        /* Without a file the inode reference is dropped here. */
        if (!sock->file) {
                iput(SOCK_INODE(sock));
                return;
        }
        sock->file = NULL;
}

/**
 *        sock_release - close a socket
 *        @sock: socket to close
 *
 *        The socket is released from the protocol stack if it has a release
 *        callback, and the inode is then released if the socket is bound to
 *        an inode not a file.
 *
 *        This is __sock_release() called without an inode, so no inode lock
 *        is taken around the protocol release.
 */
void sock_release(struct socket *sock)
{
        __sock_release(sock, NULL);
}
EXPORT_SYMBOL(sock_release);

/*
 * Translate the SOF_TIMESTAMPING_TX_* request bits in @tsflags into the
 * matching SKBTX_* bits and OR them into *@tx_flags. Bits already set in
 * *@tx_flags are preserved.
 */
void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags)
{
        const u8 old = *tx_flags;
        u8 extra = 0;

        if (tsflags & SOF_TIMESTAMPING_TX_SCHED)
                extra |= SKBTX_SCHED_TSTAMP;

        if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE)
                extra |= SKBTX_SW_TSTAMP;

        if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) {
                extra |= SKBTX_HW_TSTAMP;

                /* PTP hardware clocks can provide a free running cycle counter
                 * as a time base for virtual clocks. Tell driver to use the
                 * free running cycle counter for timestamp if socket is bound
                 * to virtual clock.
                 */
                if (tsflags & SOF_TIMESTAMPING_BIND_PHC)
                        extra |= SKBTX_HW_TSTAMP_USE_CYCLES;
        }

        *tx_flags = old | extra;
}
EXPORT_SYMBOL(__sock_tx_timestamp);

/*
 * Declarations of the inet/inet6 sendmsg targets named in the
 * INDIRECT_CALL_INET() invocation in sock_sendmsg_nosec().
 */
INDIRECT_CALLABLE_DECLARE(int inet_sendmsg(struct socket *, struct msghdr *,
                                           size_t));
INDIRECT_CALLABLE_DECLARE(int inet6_sendmsg(struct socket *, struct msghdr *,
                                            size_t));

/*
 * Out-of-line (noinline) wrapper around the sock_send_length tracepoint.
 * @flags is forwarded to the tracepoint instead of being dropped in favour
 * of a hard-coded 0; the caller in sock_sendmsg_nosec() passes 0, so the
 * traced value is unchanged for it.
 */
static noinline void call_trace_sock_send_length(struct sock *sk, int ret,
                                                 int flags)
{
        trace_sock_send_length(sk, ret, flags);
}

static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
{
        int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->sendmsg, inet6_sendmsg,
                                     inet_sendmsg, sock, msg,
                                     msg_data_left(msg));
        BUG_ON(ret == -EIOCBQUEUED);

        if (trace_sock_send_length_enabled())
                call_trace_sock_send_length(sock->sk, ret, 0);
        return ret;
}

/*
 * Run the security_socket_sendmsg() hook and, only if it allows the send,
 * pass @msg on to sock_sendmsg_nosec(). Returns the hook's error or the
 * result of the send.
 */
static int __sock_sendmsg(struct socket *sock, struct msghdr *msg)
{
        int err;

        err = security_socket_sendmsg(sock, msg, msg_data_left(msg));
        if (err)
                return err;

        return sock_sendmsg_nosec(sock, msg);
}

/**
 *        sock_sendmsg - send a message through @sock
 *        @sock: socket
 *        @msg: message to send
 *
 *        Sends @msg through @sock, passing through LSM.
 *        Returns the number of bytes sent, or an error code.
 */
int sock_sendmsg(struct socket *sock, struct msghdr *msg)
{
        struct sockaddr_storage *save_addr = (struct sockaddr_storage *)msg->msg_name;
        struct sockaddr_storage address;
        int save_len = msg->msg_namelen;
        int ret;

        if (msg->msg_name) {
                memcpy(&address, msg->msg_name, msg->msg_namelen);
                msg->msg_name = &address;
        }

        ret = __sock_sendmsg(sock, msg);
        msg->msg_name = save_addr;
        msg->msg_namelen = save_len;

        return ret;
}
EXPORT_SYMBOL(sock_sendmsg);

/**
 *        kernel_sendmsg - send a message through @sock (kernel-space)
 *        @sock: socket
 *        @msg: message header
 *        @vec: kernel vec
 *        @num: vec array length
 *        @size: total message data size
 *
 *        Builds the message data with @vec and sends it through @sock.
 *        msg->msg_iter is (re)initialised as an ITER_SOURCE kvec iterator
 *        over @vec before the message is handed to sock_sendmsg().
 *        Returns the number of bytes sent, or an error code.
 */

int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
                   struct kvec *vec, size_t num, size_t size)
{
        iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size);
        return sock_sendmsg(sock, msg);
}
EXPORT_SYMBOL(kernel_sendmsg);

/**
 *        kernel_sendmsg_locked - send a message through @sk (kernel-space)
 *        @sk: sock
 *        @msg: message header
 *        @vec: output s/g array
 *        @num: output s/g array length
 *        @size: total message data size
 *
 *        Builds the message data with @vec and sends it through the
 *        sendmsg_locked() operation of the socket attached to @sk. When the
 *        protocol has no such operation, sock_no_sendmsg_locked() is used
 *        instead and @msg's iterator is left untouched.
 *        Returns the number of bytes sent, or an error code.
 *        Caller must hold @sk.
 */

int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg,
                          struct kvec *vec, size_t num, size_t size)
{
        const struct proto_ops *ops = READ_ONCE(sk->sk_socket->ops);

        if (ops->sendmsg_locked) {
                iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size);
                return ops->sendmsg_locked(sk, msg, msg_data_left(msg));
        }

        return sock_no_sendmsg_locked(sk, msg, size);
}
EXPORT_SYMBOL(kernel_sendmsg_locked);

/* Return true if @skb is marked as an error-queue skb (PACKET_OUTGOING). */
static bool skb_is_err_queue(const struct sk_buff *skb)
{
        /* pkt_type of skbs enqueued on the error queue are set to
         * PACKET_OUTGOING in skb_set_err_queue(). This is only safe to do
         * in recvmsg, since skbs received on a local socket will never
         * have a pkt_type of PACKET_OUTGOING.
         */
        return skb->pkt_type == PACKET_OUTGOING;
}

/* On transmit, software and hardware timestamps are returned independently.
 * As the two skb clones share the hardware timestamp, which may be updated
 * before the software timestamp is received, a hardware TX timestamp may be
 * returned only if there is no software TX timestamp. Ignore false software
 * timestamps, which may be made in the __sock_recv_timestamp() call when the
 * option SO_TIMESTAMP_OLD(NS) is enabled on the socket, even when the skb has a
 * hardware timestamp.
 */
static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp)
{
        return skb->tstamp && !false_tstamp && skb_is_err_queue(skb);
}

/*
 * Fetch the hardware timestamp of @skb through the device identified by the
 * skb's NAPI id. When that device is found, *@if_index is set to its
 * ifindex and the timestamp comes from netdev_get_tstamp(); otherwise the
 * raw hwtstamp stored in the skb is returned and *@if_index is untouched.
 */
static ktime_t get_timestamp(struct sock *sk, struct sk_buff *skb, int *if_index)
{
        bool cycles = READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC;
        struct skb_shared_hwtstamps *hw = skb_hwtstamps(skb);
        struct net_device *dev;
        ktime_t ts;

        /* The device lookup and its use stay inside one RCU read section. */
        rcu_read_lock();
        dev = dev_get_by_napi_id(skb_napi_id(skb));
        if (!dev) {
                ts = hw->hwtstamp;
        } else {
                *if_index = dev->ifindex;
                ts = netdev_get_tstamp(dev, hw, cycles);
        }
        rcu_read_unlock();

        return ts;
}

static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb,
                           int if_index)
{
        struct scm_ts_pktinfo ts_pktinfo;
        struct net_device *orig_dev;

        if (!skb_mac_header_was_set(skb))
                return;

        memset(&ts_pktinfo, 0, sizeof(ts_pktinfo));

        if (!if_index) {
                rcu_read_lock();
                orig_dev = dev_get_by_napi_id(skb_napi_id(skb));
                if (orig_dev)
                        if_index = orig_dev->ifindex;
                rcu_read_unlock();
        }
        ts_pktinfo.if_index = if_index;

        ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb);
        put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO,
                 sizeof(ts_pktinfo), &ts_pktinfo);
}

/*
 * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
 *
 * Attaches receive-timestamp control messages for @skb to @msg:
 *  - one SO_TIMESTAMP{,NS}_{OLD,NEW} cmsg when SOCK_RCVTSTAMP is set, and
 *  - a timestamping cmsg via put_cmsg_scm_timestamping{,64}() (optionally
 *    accompanied by pktinfo and opt-stats cmsgs) when sk->sk_tsflags asks
 *    for software and/or raw hardware timestamps and at least one of them
 *    could be converted.
 */
void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
        struct sk_buff *skb)
{
        int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP);
        int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
        struct scm_timestamping_internal tss;
        int empty = 1, false_tstamp = 0;
        struct skb_shared_hwtstamps *shhwtstamps =
                skb_hwtstamps(skb);
        int if_index;
        ktime_t hwtstamp;
        u32 tsflags;

        /* Race occurred between timestamp enabling and packet
           receiving.  Fill in the current time for now. */
        if (need_software_tstamp && skb->tstamp == 0) {
                __net_timestamp(skb);
                false_tstamp = 1;
        }

        /*
         * Pick the cmsg flavour: timeval vs timespec (SOCK_RCVTSTAMPNS),
         * old vs new layout (SOCK_TSTAMP_NEW).
         */
        if (need_software_tstamp) {
                if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) {
                        if (new_tstamp) {
                                struct __kernel_sock_timeval tv;

                                skb_get_new_timestamp(skb, &tv);
                                put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
                                         sizeof(tv), &tv);
                        } else {
                                struct __kernel_old_timeval tv;

                                skb_get_timestamp(skb, &tv);
                                put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
                                         sizeof(tv), &tv);
                        }
                } else {
                        if (new_tstamp) {
                                struct __kernel_timespec ts;

                                skb_get_new_timestampns(skb, &ts);
                                put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
                                         sizeof(ts), &ts);
                        } else {
                                struct __kernel_old_timespec ts;

                                skb_get_timestampns(skb, &ts);
                                put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
                                         sizeof(ts), &ts);
                        }
                }
        }

        /*
         * Collect the values for the timestamping cmsg: slot 0 is filled
         * from skb->tstamp, slot 2 from the hardware timestamp. "empty"
         * stays set unless at least one conversion succeeds.
         */
        memset(&tss, 0, sizeof(tss));
        tsflags = READ_ONCE(sk->sk_tsflags);
        if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
            ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0))
                empty = 0;
        if (shhwtstamps &&
            (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
            !skb_is_swtx_tstamp(skb, false_tstamp)) {
                /* 0 makes put_ts_pktinfo() look the interface up itself. */
                if_index = 0;
                if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV)
                        hwtstamp = get_timestamp(sk, skb, &if_index);
                else
                        hwtstamp = shhwtstamps->hwtstamp;

                if (tsflags & SOF_TIMESTAMPING_BIND_PHC)
                        hwtstamp = ptp_convert_timestamp(&hwtstamp,
                                                         READ_ONCE(sk->sk_bind_phc));

                if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) {
                        empty = 0;

                        /* pktinfo is only attached for non-error-queue skbs. */
                        if ((tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) &&
                            !skb_is_err_queue(skb))
                                put_ts_pktinfo(msg, skb, if_index);
                }
        }
        if (!empty) {
                if (sock_flag(sk, SOCK_TSTAMP_NEW))
                        put_cmsg_scm_timestamping64(msg, &tss);
                else
                        put_cmsg_scm_timestamping(msg, &tss);

                /*
                 * For error-queue skbs flagged opt_stats, pass the skb
                 * payload through as SCM_TIMESTAMPING_OPT_STATS.
                 */
                if (skb_is_err_queue(skb) && skb->len &&
                    SKB_EXT_ERR(skb)->opt_stats)
                        put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS,
                                 skb->len, skb->data);
        }
}
EXPORT_SYMBOL_GPL(__sock_recv_timestamp);

#ifdef CONFIG_WIRELESS
/*
 * Report the wifi ack status of @skb as an SCM_WIFI_STATUS cmsg. Nothing
 * is emitted unless the socket has SOCK_WIFI_STATUS set and the skb
 * carries a valid ack indication.
 */
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
        struct sk_buff *skb)
{
        int acked;

        if (!sock_flag(sk, SOCK_WIFI_STATUS) || !skb->wifi_acked_valid)
                return;

        /* The cmsg payload is a plain int. */
        acked = skb->wifi_acked;
        put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(acked), &acked);
}
EXPORT_SYMBOL_GPL(__sock_recv_wifi_status);
#endif

/*
 * Emit an SO_RXQ_OVFL cmsg with the drop count recorded in @skb's control
 * block. Skipped unless the socket has SOCK_RXQ_OVFL set, an skb is given
 * and its drop count is non-zero.
 */
static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
                                   struct sk_buff *skb)
{
        if (!sock_flag(sk, SOCK_RXQ_OVFL) || !skb)
                return;
        if (!SOCK_SKB_CB(skb)->dropcount)
                return;

        put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL,
                 sizeof(__u32), &SOCK_SKB_CB(skb)->dropcount);
}

/*
 * Emit an SO_MARK cmsg carrying skb->mark. Skipped unless the socket has
 * SOCK_RCVMARK set and an skb is given.
 */
static void sock_recv_mark(struct msghdr *msg, struct sock *sk,
                           struct sk_buff *skb)
{
        __u32 mark;

        if (!sock_flag(sk, SOCK_RCVMARK) || !skb)
                return;

        /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */
        mark = skb->mark;
        put_cmsg(msg, SOL_SOCKET, SO_MARK, sizeof(mark), &mark);
}

/*
 * Attach the generic socket-level receive cmsgs for @skb to @msg, in this
 * order: timestamps, receive-queue drop count, and skb mark. Each helper
 * checks its own socket flag and emits nothing if it is not enabled.
 */
void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
                       struct sk_buff *skb)
{
        sock_recv_timestamp(msg, sk, skb);
        sock_recv_drops(msg, sk, skb);
        sock_recv_mark(msg, sk, skb);
}
EXPORT_SYMBOL_GPL(__sock_recv_cmsgs);

/*
 * Declarations of the inet/inet6 recvmsg targets named in the
 * INDIRECT_CALL_INET() invocation in sock_recvmsg_nosec().
 */
INDIRECT_CALLABLE_DECLARE(int inet_recvmsg(struct socket *, struct msghdr *,
                                           size_t, int));
INDIRECT_CALLABLE_DECLARE(int inet6_recvmsg(struct socket *, struct msghdr *,
                                            size_t, int));

/*
 * Out-of-line (noinline) wrapper around the sock_recv_length tracepoint;
 * sock_recvmsg_nosec() only calls it when the tracepoint is enabled.
 */
static noinline void call_trace_sock_recv_length(struct sock *sk, int ret, int flags)
{
        trace_sock_recv_length(sk, ret, flags);
}

static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
                                     int flags)
{
        int ret = INDIRECT_CALL_INET(READ_ONCE(sock->ops)->recvmsg,
                                     inet6_recvmsg,
                                     inet_recvmsg, sock, msg,
                                     msg_data_left(msg), flags);
        if (trace_sock_recv_length_enabled())
                call_trace_sock_recv_length(sock->sk, ret, flags);
        return ret;
}

/**
 *        sock_recvmsg - receive a message from @sock
 *        @sock: socket
 *        @msg: message to receive
 *        @flags: message flags
 *
 *        Receives @msg from @sock, passing through LSM: if the
 *        security_socket_recvmsg() hook returns an error, that error is
 *        returned and the protocol is not called. Otherwise returns the
 *        total number of bytes received, or an error.
 */
int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
{
        int err;

        err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);
        if (err)
                return err;

        return sock_recvmsg_nosec(sock, msg, flags);
}
EXPORT_SYMBOL(sock_recvmsg);

/**
 *        kernel_recvmsg - Receive a message from a socket (kernel space)
 *        @sock: The socket to receive the message from
 *        @msg: Received message
 *        @vec: Input s/g array for message data
 *        @num: Size of input s/g array
 *        @size: Number of bytes to read
 *        @flags: Message flags (MSG_DONTWAIT, etc...)
 *
 *        Marks the control buffer of @msg as a kernel buffer, points the
 *        message iterator at @vec and then receives through sock_recvmsg().
 *
 *        On return the msg structure contains the scatter/gather array passed in the
 *        vec argument. The array is modified so that it consists of the unfilled
 *        portion of the original array.
 *
 *        The returned value is the total number of bytes received, or an error.
 */

int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
                   struct kvec *vec, size_t num, size_t size, int flags)
{
        int ret;

        msg->msg_control_is_user = false;
        iov_iter_kvec(&msg->msg_iter, ITER_DEST, vec, num, size);

        ret = sock_recvmsg(sock, msg, flags);
        return ret;
}
EXPORT_SYMBOL(kernel_recvmsg);

/*
 * splice_read handler for socket files: use the protocol's ->splice_read()
 * when it provides one, otherwise fall back to copy_splice_read().
 */
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
                                struct pipe_inode_info *pipe, size_t len,
                                unsigned int flags)
{
        struct socket *sock = file->private_data;
        const struct proto_ops *ops = READ_ONCE(sock->ops);

        if (likely(ops->splice_read))
                return ops->splice_read(sock, ppos, pipe, len, flags);

        return copy_splice_read(file, ppos, pipe, len, flags);
}

/*
 * splice_eof handler for socket files: forward the notification to the
 * protocol's ->splice_eof() if it implements one, otherwise do nothing.
 */
static void sock_splice_eof(struct file *file)
{
        struct socket *sock = file->private_data;
        const struct proto_ops *ops = READ_ONCE(sock->ops);

        if (!ops->splice_eof)
                return;

        ops->splice_eof(sock);
}

/*
 * read_iter handler for socket files. A non-zero file position is rejected
 * with -ESPIPE and an empty destination returns 0 without touching the
 * socket. Otherwise the data is received through sock_recvmsg(), with
 * MSG_DONTWAIT when either the file or the iocb asks for non-blocking
 * behaviour, and the advanced iterator is copied back to @to.
 */
static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct socket *sock = file->private_data;
        struct msghdr msg = {
                .msg_iter = *to,
                .msg_iocb = iocb,
        };
        ssize_t nread;

        if (iocb->ki_pos != 0)
                return -ESPIPE;

        /* Match SYS5 behaviour */
        if (!iov_iter_count(to))
                return 0;

        if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
                msg.msg_flags = MSG_DONTWAIT;

        nread = sock_recvmsg(sock, &msg, msg.msg_flags);
        *to = msg.msg_iter;

        return nread;
}

/*
 * write_iter handler for socket files. A non-zero file position is rejected
 * with -ESPIPE. The message flags get MSG_DONTWAIT when either the file or
 * the iocb asks for non-blocking behaviour and MSG_EOR for SOCK_SEQPACKET
 * sockets; the data is then sent through __sock_sendmsg() and the advanced
 * iterator is copied back to @from.
 */
static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct socket *sock = file->private_data;
        struct msghdr msg = {
                .msg_iter = *from,
                .msg_iocb = iocb,
        };
        ssize_t written;

        if (iocb->ki_pos != 0)
                return -ESPIPE;

        if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
                msg.msg_flags = MSG_DONTWAIT;
        if (sock->type == SOCK_SEQPACKET)
                msg.msg_flags |= MSG_EOR;

        written = __sock_sendmsg(sock, &msg);
        *from = msg.msg_iter;

        return written;
}

/*
 * Atomic setting of ioctl hooks to avoid race
 * with module unload.
 */

static DEFINE_MUTEX(br_ioctl_mutex);
static int (*br_ioctl_hook)(struct net *net, struct net_bridge *br,
                            unsigned int cmd, struct ifreq *ifr,
                            void __user *uarg);

/*
 * Install @hook as the bridge ioctl handler (or clear it by passing NULL).
 * The pointer is replaced while holding br_ioctl_mutex, the same mutex
 * br_ioctl_call() holds while invoking the hook.
 */
void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br,
                             unsigned int cmd, struct ifreq *ifr,
                             void __user *uarg))
{
        mutex_lock(&br_ioctl_mutex);
        br_ioctl_hook = hook;
        mutex_unlock(&br_ioctl_mutex);
}
EXPORT_SYMBOL(brioctl_set);

/*
 * Dispatch a bridge ioctl to the registered hook. If no hook is registered
 * yet, try to load the "bridge" module first. The hook is re-checked and
 * invoked under br_ioctl_mutex; -ENOPKG is returned when there still is
 * none.
 */
int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd,
                  struct ifreq *ifr, void __user *uarg)
{
        int ret;

        if (!br_ioctl_hook)
                request_module("bridge");

        mutex_lock(&br_ioctl_mutex);
        if (!br_ioctl_hook)
                ret = -ENOPKG;
        else
                ret = br_ioctl_hook(net, br, cmd, ifr, uarg);
        mutex_unlock(&br_ioctl_mutex);

        return ret;
}

static DEFINE_MUTEX(vlan_ioctl_mutex);
static int (*vlan_ioctl_hook) (struct net *, void __user *arg);

/*
 * Install @hook as the VLAN ioctl handler (or clear it by passing NULL).
 * The pointer is replaced while holding vlan_ioctl_mutex, the same mutex
 * sock_ioctl() holds while invoking the hook.
 */
void vlan_ioctl_set(int (*hook) (struct net *, void __user *))
{
        mutex_lock(&vlan_ioctl_mutex);
        vlan_ioctl_hook = hook;
        mutex_unlock(&vlan_ioctl_mutex);
}
EXPORT_SYMBOL(vlan_ioctl_set);

/*
 * Offer an ioctl to the protocol's ->ioctl() handler first. Only when the
 * protocol answers -ENOIOCTLCMD is the command handed on to dev_ioctl(),
 * and then only if it is a socket ioctl command; other commands get
 * -ENOTTY. The ifreq is copied in from user space before dev_ioctl() and
 * copied back afterwards when dev_ioctl() succeeded and requested it.
 */
static long sock_do_ioctl(struct net *net, struct socket *sock,
                          unsigned int cmd, unsigned long arg)
{
        const struct proto_ops *ops = READ_ONCE(sock->ops);
        void __user *argp = (void __user *)arg;
        void __user *data;
        bool need_copyout;
        struct ifreq ifr;
        int err;

        err = ops->ioctl(sock, cmd, arg);
        if (err != -ENOIOCTLCMD)
                return err;

        /*
         * The protocol does not know this ioctl: try to hand it down
         * to the NIC driver.
         */
        if (!is_socket_ioctl_cmd(cmd))
                return -ENOTTY;

        if (get_user_ifreq(&ifr, &data, argp))
                return -EFAULT;

        err = dev_ioctl(net, cmd, &ifr, data, &need_copyout);
        if (err || !need_copyout)
                return err;

        if (put_user_ifreq(&ifr, argp))
                return -EFAULT;

        return 0;
}

/*
 *        With an ioctl, arg may well be a user mode pointer, but we don't know
 *        what to do with it - that's up to the protocol still.
 */

/*
 * ioctl handler for socket files. Dispatch order:
 *   1. commands in [SIOCDEVPRIVATE, SIOCDEVPRIVATE + 15] go to dev_ioctl();
 *   2. with CONFIG_WEXT_CORE, commands in [SIOCIWFIRST, SIOCIWLAST] go to
 *      wext_handle_ioctl();
 *   3. the commands listed in the switch are handled here;
 *   4. everything else is passed to sock_do_ioctl().
 * Note that the "else" branches chain across the #ifdef, so the switch is
 * the final else-arm of the if/else ladder.
 */
static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
        const struct proto_ops  *ops;
        struct socket *sock;
        struct sock *sk;
        void __user *argp = (void __user *)arg;
        int pid, err;
        struct net *net;

        sock = file->private_data;
        ops = READ_ONCE(sock->ops);
        sk = sock->sk;
        net = sock_net(sk);
        if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) {
                struct ifreq ifr;
                void __user *data;
                bool need_copyout;
                if (get_user_ifreq(&ifr, &data, argp))
                        return -EFAULT;
                err = dev_ioctl(net, cmd, &ifr, data, &need_copyout);
                /* Copy the ifreq back only on success and when asked to. */
                if (!err && need_copyout)
                        if (put_user_ifreq(&ifr, argp))
                                return -EFAULT;
        } else
#ifdef CONFIG_WEXT_CORE
        if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
                err = wext_handle_ioctl(net, cmd, argp);
        } else
#endif
                switch (cmd) {
                case FIOSETOWN:
                case SIOCSPGRP:
                        /* Read the owner id from user space, then set it. */
                        err = -EFAULT;
                        if (get_user(pid, (int __user *)argp))
                                break;
                        err = f_setown(sock->file, pid, 1);
                        break;
                case FIOGETOWN:
                case SIOCGPGRP:
                        err = put_user(f_getown(sock->file),
                                       (int __user *)argp);
                        break;
                case SIOCGIFBR:
                case SIOCSIFBR:
                case SIOCBRADDBR:
                case SIOCBRDELBR:
                        /* Bridge ioctls: no bridge device and no ifreq here. */
                        err = br_ioctl_call(net, NULL, cmd, NULL, argp);
                        break;
                case SIOCGIFVLAN:
                case SIOCSIFVLAN:
                        /*
                         * Try to load the 8021q module if no hook is
                         * registered; -ENOPKG is returned if there is still
                         * no hook once the mutex is held.
                         */
                        err = -ENOPKG;
                        if (!vlan_ioctl_hook)
                                request_module("8021q");

                        mutex_lock(&vlan_ioctl_mutex);
                        if (vlan_ioctl_hook)
                                err = vlan_ioctl_hook(net, argp);
                        mutex_unlock(&vlan_ioctl_mutex);
                        break;
                case SIOCGSKNS:
                        /* Requires CAP_NET_ADMIN in the netns' user namespace. */
                        err = -EPERM;
                        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                                break;

                        err = open_related_ns(&net->ns, get_net_ns);
                        break;
                case SIOCGSTAMP_OLD:
                case SIOCGSTAMPNS_OLD:
                        if (!ops->gettstamp) {
                                err = -ENOIOCTLCMD;
                                break;
                        }
                        /*
                         * Third argument selects the SIOCGSTAMP (vs. NS)
                         * variant; the last one is true only on non-64-bit
                         * builds.
                         */
                        err = ops->gettstamp(sock, argp,
                                             cmd == SIOCGSTAMP_OLD,
                                             !IS_ENABLED(CONFIG_64BIT));
                        break;
                case SIOCGSTAMP_NEW:
                case SIOCGSTAMPNS_NEW:
                        if (!ops->gettstamp) {
                                err = -ENOIOCTLCMD;
                                break;
                        }
                        err = ops->gettstamp(sock, argp,
                                             cmd == SIOCGSTAMP_NEW,
                                             false);
                        break;

                case SIOCGIFCONF:
                        err = dev_ifconf(net, argp);
                        break;

                default:
                        /* Let the protocol, then the device layer, try it. */
                        err = sock_do_ioctl(net, sock, cmd, arg);
                        break;
                }
        return err;
}

/**
 *        sock_create_lite - creates a socket
 *        @family: protocol family (AF_INET, ...)
 *        @type: communication type (SOCK_STREAM, ...)
 *        @protocol: protocol (0, ...)
 *        @res: new socket
 *
 *        Allocates a socket, records @type in it and runs the LSM create and
 *        post-create hooks; no protocol family code is called, so the socket
 *        initialization is not complete, see kernel_accept().
 *        Returns 0 or an error. @res is always written: it receives the new
 *        socket on success and %NULL on failure.
 *        This function internally uses GFP_KERNEL.
 */

int sock_create_lite(int family, int type, int protocol, struct socket **res)
{
        struct socket *sock = NULL;
        int err;

        err = security_socket_create(family, type, protocol, 1);
        if (!err) {
                sock = sock_alloc();
                if (!sock) {
                        err = -ENOMEM;
                } else {
                        sock->type = type;
                        err = security_socket_post_create(sock, family, type,
                                                          protocol, 1);
                        if (err) {
                                /* Post-create hook refused: undo the allocation. */
                                sock_release(sock);
                                sock = NULL;
                        }
                }
        }

        *res = sock;
        return err;
}
EXPORT_SYMBOL(sock_create_lite);

/* No kernel lock held - perfect */
/*
 * poll handler for socket files. Returns 0 when the protocol has no ->poll().
 * If the socket can busy-loop, one busy-loop pass is run when the caller
 * requested POLL_BUSY_LOOP, and POLL_BUSY_LOOP is OR-ed into the mask
 * returned by the protocol's ->poll().
 */
static __poll_t sock_poll(struct file *file, poll_table *wait)
{
        struct socket *sock = file->private_data;
        const struct proto_ops *ops = READ_ONCE(sock->ops);
        __poll_t requested = poll_requested_events(wait);
        __poll_t busy_flag = 0;

        if (!ops->poll)
                return 0;

        if (sk_can_busy_loop(sock->sk)) {
                /* poll once if requested by the syscall */
                if (requested & POLL_BUSY_LOOP)
                        sk_busy_loop(sock->sk, 1);

                /* if this socket can poll_ll, tell the system call */
                busy_flag = POLL_BUSY_LOOP;
        }

        return ops->poll(file, sock, wait) | busy_flag;
}

static int sock_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct socket *sock = file->private_data;

        return READ_ONCE(sock->ops)->mmap(file, sock, vma);
}

/*
 * release handler for socket files: release the socket embedded in @inode.
 * Always returns 0.
 */
static int sock_close(struct inode *inode, struct file *filp)
{
        struct socket *sock = SOCKET_I(inode);

        __sock_release(sock, inode);
        return 0;
}

/*
 *        Update the socket async list
 *
 *        Fasync_list locking strategy.
 *
 *        1. fasync_list is modified only under process context socket lock
 *           i.e. under semaphore.
 *        2. fasync_list is used under read_lock(&sk->sk_callback_lock)
 *           or under socket lock
 */

/*
 * fasync handler for socket files. Under the socket lock, update the
 * socket's fasync list and make SOCK_FASYNC reflect whether the list is
 * non-empty afterwards. Returns -EINVAL if the socket has no sock attached,
 * otherwise 0 (the result of fasync_helper() is not propagated).
 */
static int sock_fasync(int fd, struct file *filp, int on)
{
        struct socket *sock = filp->private_data;
        struct socket_wq *wq = &sock->wq;
        struct sock *sk = sock->sk;

        if (!sk)
                return -EINVAL;

        lock_sock(sk);

        fasync_helper(fd, filp, on, &wq->fasync_list);
        if (wq->fasync_list)
                sock_set_flag(sk, SOCK_FASYNC);
        else
                sock_reset_flag(sk, SOCK_FASYNC);

        release_sock(sk);

        return 0;
}

/* This function may be called only under rcu_lock */

/*
 * Deliver an async notification to the fasync list of @wq.
 *
 * Returns -1 when there is no wait queue or nobody is on its fasync list,
 * otherwise 0. Depending on @how:
 *   SOCK_WAKE_WAITD - send SIGIO unless SOCKWQ_ASYNC_WAITDATA is set;
 *   SOCK_WAKE_SPACE - send SIGIO only if SOCKWQ_ASYNC_NOSPACE was set
 *                     (the bit is cleared by the test);
 *   SOCK_WAKE_IO    - always send SIGIO;
 *   SOCK_WAKE_URG   - always send SIGURG.
 * Any other value of @how sends nothing.
 */
int sock_wake_async(struct socket_wq *wq, int how, int band)
{
        if (!wq || !wq->fasync_list)
                return -1;

        switch (how) {
        case SOCK_WAKE_WAITD:
                if (!test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags))
                        kill_fasync(&wq->fasync_list, SIGIO, band);
                break;
        case SOCK_WAKE_SPACE:
                if (test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags))
                        kill_fasync(&wq->fasync_list, SIGIO, band);
                break;
        case SOCK_WAKE_IO:
                kill_fasync(&wq->fasync_list, SIGIO, band);
                break;
        case SOCK_WAKE_URG:
                kill_fasync(&wq->fasync_list, SIGURG, band);
                break;
        }

        return 0;
}
EXPORT_SYMBOL(sock_wake_async);

/**
 *        __sock_create - creates a socket
 *        @net: net namespace
 *        @family: protocol family (AF_INET, ...)
 *        @type: communication type (SOCK_STREAM, ...)
 *        @protocol: protocol (0, ...)
 *        @res: new socket
 *        @kern: boolean for kernel space sockets
 *
 *        Creates a new socket and assigns it to @res, passing through LSM.
 *        Returns 0 or an error. @res is written only on success; on failure
 *        it is left untouched, so callers must test the return value rather
 *        than @res. @kern must be set to true if the socket resides in kernel
 *        space.
 *        This function internally uses GFP_KERNEL.
 */

int __sock_create(struct net *net, int family, int type, int protocol,
                         struct socket **res, int kern)
{
        int err;
        struct socket *sock;
        const struct net_proto_family *pf;

        /*
         *      Check protocol is in range
         */
        if (family < 0 || family >= NPROTO)
                return -EAFNOSUPPORT;
        if (type < 0 || type >= SOCK_MAX)
                return -EINVAL;

        /* Compatibility.

           This uglymoron is moved from INET layer to here to avoid
           deadlock in module load.
         */
        if (family == PF_INET && type == SOCK_PACKET) {
                pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
                             current->comm);
                family = PF_PACKET;
        }

        err = security_socket_create(family, type, protocol, kern);
        if (err)
                return err;

        /*
         *        Allocate the socket and allow the family to set things up. if
         *        the protocol is 0, the family is instructed to select an appropriate
         *        default.
         */
        sock = sock_alloc();
        if (!sock) {
                net_warn_ratelimited("socket: no more sockets\n");
                return -ENFILE;        /* Not exactly a match, but its the
                                   closest posix thing */
        }

        sock->type = type;

#ifdef CONFIG_MODULES
        /* Attempt to load a protocol module if the find failed.
         *
         * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
         * requested real, full-featured networking support upon configuration.
         * Otherwise module support will break!
         */
        if (rcu_access_pointer(net_families[family]) == NULL)
                request_module("net-pf-%d", family);
#endif

        rcu_read_lock();
        pf = rcu_dereference(net_families[family]);
        err = -EAFNOSUPPORT;
        if (!pf)
                goto out_release;

        /*
         * We will call the ->create function, that possibly is in a loadable
         * module, so we have to bump that loadable module refcnt first.
         */
        if (!try_module_get(pf->owner))
                goto out_release;

        /* Now protected by module ref count */
        rcu_read_unlock();

        err = pf->create(net, sock, protocol, kern);
        if (err < 0) {
                /*
                 * ->create() is expected to dispose of any sock->sk it set up
                 * before failing, but it may leave the pointer behind. Clear
                 * it so nothing on the release path can follow a dangling
                 * pointer.
                 */
                sock->sk = NULL;
                goto out_module_put;
        }

        /*
         * Now to bump the refcnt of the [loadable] module that owns this
         * socket at sock_release time we decrement its refcnt.
         */
        if (!try_module_get(sock->ops->owner))
                goto out_module_busy;

        /*
         * Now that we're done with the ->create function, the [loadable]
         * module can have its refcnt decremented
         */
        module_put(pf->owner);
        err = security_socket_post_create(sock, family, type, protocol, kern);
        if (err)
                goto out_sock_release;
        *res = sock;

        return 0;

out_module_busy:
        err = -EAFNOSUPPORT;
out_module_put:
        /* Clear ->ops before sock_release() is called on this socket. */
        sock->ops = NULL;
        module_put(pf->owner);
out_sock_release:
        sock_release(sock);
        return err;

out_release:
        rcu_read_unlock();
        goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);

/**
 *        sock_create - creates a socket
 *        @family: protocol family (AF_INET, ...)
 *        @type: communication type (SOCK_STREAM, ...)
 *        @protocol: protocol (0, ...)
 *        @res: new socket
 *
 *        A wrapper around __sock_create() that uses the network namespace of
 *        the current task and passes 0 for the kern argument.
 *        Returns 0 or an error. This function internally uses GFP_KERNEL.
 */

int sock_create(int family, int type, int protocol, struct socket **res)
{
        struct net *net = current->nsproxy->net_ns;

        return __sock_create(net, family, type, protocol, res, 0);
}
EXPORT_SYMBOL(sock_create);

/**
 *        sock_create_kern - creates a socket (kernel space)
 *        @net: net namespace
 *        @family: protocol family (AF_INET, ...)
 *        @type: communication type (SOCK_STREAM, ...)
 *        @protocol: protocol (0, ...)
 *        @res: new socket
 *
 *        A wrapper around __sock_create() that passes 1 for the kern argument.
 *        Returns 0 or an error. This function internally uses GFP_KERNEL.
 */

int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res)
{
        const int kern = 1;

        return __sock_create(net, family, type, protocol, res, kern);
}
EXPORT_SYMBOL(sock_create_kern);

/*
 * Validate the SOCK_* flag bits carried in @type, strip them and create the
 * socket with sock_create(). Returns the new socket or an ERR_PTR():
 * -EINVAL when @type carries flag bits other than SOCK_CLOEXEC and
 * SOCK_NONBLOCK, otherwise whatever negative value sock_create() returned.
 */
static struct socket *__sys_socket_create(int family, int type, int protocol)
{
        int extra_flags = type & ~SOCK_TYPE_MASK;
        struct socket *sock;
        int retval;

        /* Check the SOCK_* constants for consistency.  */
        BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

        if (extra_flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
                return ERR_PTR(-EINVAL);

        retval = sock_create(family, type & SOCK_TYPE_MASK, protocol, &sock);
        if (retval < 0)
                return ERR_PTR(retval);

        return sock;
}

/*
 * Create a socket and wrap it in a struct file without installing a file
 * descriptor. The flag bits of @type become the file flags, with
 * SOCK_NONBLOCK translated to O_NONBLOCK where the two constants differ.
 * Returns the file or an ERR_PTR().
 */
struct file *__sys_socket_file(int family, int type, int protocol)
{
        int flags = type & ~SOCK_TYPE_MASK;
        struct socket *sock;

        sock = __sys_socket_create(family, type, protocol);
        if (IS_ERR(sock))
                return ERR_CAST(sock);

        if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) {
                flags &= ~SOCK_NONBLOCK;
                flags |= O_NONBLOCK;
        }

        return sock_alloc_file(sock, flags, NULL);
}

/*        A hook for bpf progs to attach to and update socket protocol.
 *
 *        A static noinline declaration here could cause the compiler to
 *        optimize away the function. A global noinline declaration will
 *        keep the definition, but may optimize away the callsite.
 *        Therefore, __weak is needed to ensure that the call is still
 *        emitted, by telling the compiler that we don't know what the
 *        function might eventually be.
 */

__bpf_hook_start();

__weak noinline int update_socket_protocol(int family, int type, int protocol)
{
        /* Default implementation: hand back @protocol unchanged. */
        return protocol;
}

__bpf_hook_end();

/*
 * Create a socket and return a new file descriptor for it.
 * @protocol is first passed through update_socket_protocol(). The flag bits
 * of @type become the descriptor flags, with SOCK_NONBLOCK translated to
 * O_NONBLOCK where the two constants differ. Returns the descriptor or a
 * negative error.
 */
int __sys_socket(int family, int type, int protocol)
{
        int flags = type & ~SOCK_TYPE_MASK;
        struct socket *sock;

        protocol = update_socket_protocol(family, type, protocol);

        sock = __sys_socket_create(family, type, protocol);
        if (IS_ERR(sock))
                return PTR_ERR(sock);

        if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) {
                flags &= ~SOCK_NONBLOCK;
                flags |= O_NONBLOCK;
        }

        return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
}

/* socket system call: forwards its arguments unchanged to __sys_socket(). */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
        return __sys_socket(family, type, protocol);
}

/*
 *        Create a pair of connected sockets.
 */

/*
 * Create two connected sockets of the given family/type/protocol and store
 * their new file descriptors in usockvec[0] and usockvec[1].
 *
 * Returns 0 on success or a negative error. Both descriptors are reserved
 * and written to user space before any socket is created; on every failure
 * after that point the reserved descriptors are returned via the "out"
 * label.
 */
int __sys_socketpair(int family, int type, int protocol, int __user *usockvec)
{
        struct socket *sock1, *sock2;
        int fd1, fd2, err;
        struct file *newfile1, *newfile2;
        int flags;

        /* Split @type into flag bits and the bare socket type. */
        flags = type & ~SOCK_TYPE_MASK;
        if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
                return -EINVAL;
        type &= SOCK_TYPE_MASK;

        /* Translate SOCK_NONBLOCK to O_NONBLOCK where the constants differ. */
        if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
                flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

        /*
         * reserve descriptors and make sure we won't fail
         * to return them to userland.
         */
        fd1 = get_unused_fd_flags(flags);
        if (unlikely(fd1 < 0))
                return fd1;

        fd2 = get_unused_fd_flags(flags);
        if (unlikely(fd2 < 0)) {
                put_unused_fd(fd1);
                return fd2;
        }

        err = put_user(fd1, &usockvec[0]);
        if (err)
                goto out;

        err = put_user(fd2, &usockvec[1]);
        if (err)
                goto out;

        /*
         * Obtain the first socket and check if the underlying protocol
         * supports the socketpair call.
         */

        err = sock_create(family, type, protocol, &sock1);
        if (unlikely(err < 0))
                goto out;

        err = sock_create(family, type, protocol, &sock2);
        if (unlikely(err < 0)) {
                sock_release(sock1);
                goto out;
        }

        err = security_socket_socketpair(sock1, sock2);
        if (unlikely(err)) {
                sock_release(sock2);
                sock_release(sock1);
                goto out;
        }

        err = READ_ONCE(sock1->ops)->socketpair(sock1, sock2);
        if (unlikely(err < 0)) {
                sock_release(sock2);
                sock_release(sock1);
                goto out;
        }

        /*
         * NOTE(review): on failure only sock2 is released here; presumably
         * sock_alloc_file() disposes of the socket it was given when it
         * fails - confirm against sock_alloc_file().
         */
        newfile1 = sock_alloc_file(sock1, flags, NULL);
        if (IS_ERR(newfile1)) {
                err = PTR_ERR(newfile1);
                sock_release(sock2);
                goto out;
        }

        /*
         * NOTE(review): same assumption for sock2; sock1 is dropped through
         * its file with fput().
         */
        newfile2 = sock_alloc_file(sock2, flags, NULL);
        if (IS_ERR(newfile2)) {
                err = PTR_ERR(newfile2);
                fput(newfile1);
                goto out;
        }

        audit_fd_pair(fd1, fd2);

        /* Nothing can fail any more: publish both descriptors. */
        fd_install(fd1, newfile1);
        fd_install(fd2, newfile2);
        return 0;

out:
        put_unused_fd(fd2);
        put_unused_fd(fd1);
        return err;
}

/* socketpair system call: forwards its arguments to __sys_socketpair(). */
SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,
                int __user *, usockvec)
{
        return __sys_socketpair(family, type, protocol, usockvec);
}

/*
 *        Bind a name to a socket. Nothing much to do here since it's
 *        the protocol's responsibility to handle the local address.
 *
 *        We move the socket address to kernel space before we call
 *        the protocol layer (having also checked the address is ok).
 */

/*
 * Look up the socket behind @fd, copy the user-supplied address into kernel
 * space, ask the LSM and finally call the protocol's ->bind(). Returns 0 or
 * a negative error; the first failing step determines the error.
 */
int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
{
        struct sockaddr_storage address;
        int err, fput_needed;
        struct socket *sock;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        err = move_addr_to_kernel(umyaddr, addrlen, &address);
        if (err)
                goto out_put;

        err = security_socket_bind(sock, (struct sockaddr *)&address, addrlen);
        if (err)
                goto out_put;

        err = READ_ONCE(sock->ops)->bind(sock, (struct sockaddr *)&address,
                                         addrlen);
out_put:
        fput_light(sock->file, fput_needed);
        return err;
}

/* bind system call: forwards its arguments unchanged to __sys_bind(). */
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
        return __sys_bind(fd, umyaddr, addrlen);
}

/*
 *        Perform a listen. Basically, we allow the protocol to do anything
 *        necessary for a listen, and if that works, we mark the socket as
 *        ready for listening.
 */

/*
 * Look up the socket behind @fd, clamp @backlog to the namespace's
 * sysctl_somaxconn, ask the LSM and call the protocol's ->listen().
 * Returns 0 or a negative error.
 */
int __sys_listen(int fd, int backlog)
{
        int err, fput_needed, somaxconn;
        struct socket *sock;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        /*
         * The comparison is done on the unsigned value of @backlog, so a
         * negative backlog is treated as a very large one.
         */
        somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn);
        if ((unsigned int)backlog > somaxconn)
                backlog = somaxconn;

        err = security_socket_listen(sock, backlog);
        if (!err)
                err = READ_ONCE(sock->ops)->listen(sock, backlog);

        fput_light(sock->file, fput_needed);
        return err;
}

/* listen system call: forwards its arguments unchanged to __sys_listen(). */
SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
        return __sys_listen(fd, backlog);
}

/*
 * Accept a connection on the socket behind @file.
 *
 * Allocates a new socket that shares the listener's type and ops, wraps it
 * in a struct file created with @flags, runs the LSM accept hook and the
 * protocol's ->accept(), and, when @upeer_sockaddr is non-NULL, copies the
 * new socket's address to user space.
 *
 * Returns the new file, or an ERR_PTR(): -ENOTSOCK if @file is not a socket,
 * -ENFILE if no socket could be allocated, -ECONNABORTED if ->getname()
 * fails, or the error of whichever other step failed.
 */
struct file *do_accept(struct file *file, struct proto_accept_arg *arg,
                       struct sockaddr __user *upeer_sockaddr,
                       int __user *upeer_addrlen, int flags)
{
        struct socket *sock, *newsock;
        struct file *newfile;
        int err, len;
        struct sockaddr_storage address;
        const struct proto_ops *ops;

        sock = sock_from_file(file);
        if (!sock)
                return ERR_PTR(-ENOTSOCK);

        newsock = sock_alloc();
        if (!newsock)
                return ERR_PTR(-ENFILE);
        ops = READ_ONCE(sock->ops);

        newsock->type = sock->type;
        newsock->ops = ops;

        /*
         * We don't need try_module_get here, as the listening socket (sock)
         * has the protocol module (sock->ops->owner) held.
         */
        __module_get(ops->owner);

        /*
         * NOTE(review): newsock is not released on this failure path;
         * presumably sock_alloc_file() disposes of it on error - confirm.
         */
        newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
        if (IS_ERR(newfile))
                return newfile;

        err = security_socket_accept(sock, newsock);
        if (err)
                goto out_fd;

        /* The listener's file flags are added to the accept flags. */
        arg->flags |= sock->file->f_flags;
        err = ops->accept(sock, newsock, arg);
        if (err < 0)
                goto out_fd;

        if (upeer_sockaddr) {
                /* On success ->getname() returns the address length. */
                len = ops->getname(newsock, (struct sockaddr *)&address, 2);
                if (len < 0) {
                        err = -ECONNABORTED;
                        goto out_fd;
                }
                err = move_addr_to_user(&address,
                                        len, upeer_sockaddr, upeer_addrlen);
                if (err < 0)
                        goto out_fd;
        }

        /* Unlike on some other OSes, file flags are not inherited via accept(). */
        return newfile;
out_fd:
        /* From here on the new socket is dropped through its file. */
        fput(newfile);
        return ERR_PTR(err);
}

/*
 * accept4() on an already looked-up @file. Rejects flags other than
 * SOCK_CLOEXEC and SOCK_NONBLOCK with -EINVAL, reserves a descriptor,
 * performs the accept through do_accept() and installs the resulting file.
 * Returns the new descriptor or a negative error; the reserved descriptor
 * is given back when do_accept() fails.
 */
static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_sockaddr,
                              int __user *upeer_addrlen, int flags)
{
        struct proto_accept_arg arg = { };
        struct file *newfile;
        int newfd;

        if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
                return -EINVAL;

        if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) {
                flags &= ~SOCK_NONBLOCK;
                flags |= O_NONBLOCK;
        }

        newfd = get_unused_fd_flags(flags);
        if (unlikely(newfd < 0))
                return newfd;

        newfile = do_accept(file, &arg, upeer_sockaddr, upeer_addrlen, flags);
        if (!IS_ERR(newfile)) {
                fd_install(newfd, newfile);
                return newfd;
        }

        put_unused_fd(newfd);
        return PTR_ERR(newfile);
}

/*
 *        For accept, we attempt to create a new socket, set up the link
 *        with the client, wake up the client, then return the new
 *        connected fd. We collect the address of the connector in kernel
 *        space and move it to user at the very end. This is unclean because
 *        we open the socket then return an error.
 *
 *        1003.1g adds the ability for recvmsg() to query the connection-pending
 *        status. We need to add that support in a way that's clean when we
 *        restructure accept as well.
 */

/*
 * Resolve @fd to a file and run __sys_accept4_file() on it. Returns -EBADF
 * when @fd does not refer to an open file, otherwise the result of
 * __sys_accept4_file().
 */
int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
                  int __user *upeer_addrlen, int flags)
{
        struct fd f = fdget(fd);
        int ret;

        if (!f.file)
                return -EBADF;

        ret = __sys_accept4_file(f.file, upeer_sockaddr, upeer_addrlen, flags);
        fdput(f);

        return ret;
}

/* accept4 system call: forwards its arguments unchanged to __sys_accept4(). */
SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
                int __user *, upeer_addrlen, int, flags)
{
        return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, flags);
}

/* accept system call: same as accept4 with flags fixed to 0. */
SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
                int __user *, upeer_addrlen)
{
        return __sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0);
}

/*
 *        Attempt to connect to a socket with the server address.  The address
 *        is in user space so we verify it is OK and move it to kernel space.
 *
 *        For 1003.1g we need to add clean support for a bind to AF_UNSPEC to
 *        break bindings
 *
 *        NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and
 *        other SEQPACKET protocols that take time to connect() as it doesn't
 *        include the -EINPROGRESS status for such sockets.
 */

/*
 * Connect the socket behind @file to @address, which is already in kernel
 * space. Returns -ENOTSOCK if @file is not a socket, the LSM error if the
 * connect hook refuses, otherwise the result of the protocol's ->connect().
 * @file_flags are OR-ed with the file's own flags for the ->connect() call.
 */
int __sys_connect_file(struct file *file, struct sockaddr_storage *address,
                       int addrlen, int file_flags)
{
        struct socket *sock = sock_from_file(file);
        int err;

        if (!sock)
                return -ENOTSOCK;

        err = security_socket_connect(sock, (struct sockaddr *)address,
                                      addrlen);
        if (err)
                return err;

        return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)address,
                                             addrlen,
                                             sock->file->f_flags | file_flags);
}

/*
 * Resolve @fd to a file, copy the user-supplied address into kernel space
 * and connect through __sys_connect_file() with no extra file flags.
 * Returns -EBADF when @fd does not refer to an open file, otherwise the
 * error of the address copy or the result of the connect.
 */
int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
{
        struct sockaddr_storage address;
        struct fd f = fdget(fd);
        int ret;

        if (!f.file)
                return -EBADF;

        ret = move_addr_to_kernel(uservaddr, addrlen, &address);
        if (!ret)
                ret = __sys_connect_file(f.file, &address, addrlen, 0);

        fdput(f);
        return ret;
}

/* connect system call: forwards its arguments unchanged to __sys_connect(). */
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
                int, addrlen)
{
        return __sys_connect(fd, uservaddr, addrlen);
}

/*
 *        Fetch the local address ('name') of a socket and copy it out to
 *        user space.
 *
 *        Returns the lookup error when @fd is not a usable socket, the
 *        security hook's error if it refuses, a negative value from the
 *        protocol's getname operation, or otherwise the result of
 *        move_addr_to_user().
 */

int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
                      int __user *usockaddr_len)
{
        struct sockaddr_storage address;
        struct socket *sock;
        int err, fput_needed;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        err = security_socket_getsockname(sock);
        if (!err) {
                /* getname() with peer == 0 yields the local address length */
                int len = READ_ONCE(sock->ops)->getname(sock,
                                        (struct sockaddr *)&address, 0);

                if (len < 0)
                        err = len;
                else
                        err = move_addr_to_user(&address, len, usockaddr,
                                                usockaddr_len);
        }

        fput_light(sock->file, fput_needed);
        return err;
}

/* getsockname(2) entry point: thin wrapper around __sys_getsockname(). */
SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
                int __user *, usockaddr_len)
{
        return __sys_getsockname(fd, usockaddr, usockaddr_len);
}

/*
 *        Fetch the remote address ('name') of a socket and copy it out to
 *        user space.
 *
 *        Returns the lookup error when @fd is not a usable socket, the
 *        security hook's error if it refuses, a negative value from the
 *        protocol's getname operation, or otherwise the result of
 *        move_addr_to_user().
 */

int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,
                      int __user *usockaddr_len)
{
        const struct proto_ops *ops;
        struct sockaddr_storage address;
        struct socket *sock;
        int err, fput_needed;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        /* Snapshot the ops pointer once, before consulting the LSM. */
        ops = READ_ONCE(sock->ops);

        err = security_socket_getpeername(sock);
        if (err)
                goto out_put;

        /* getname() with peer == 1 yields the remote address length */
        err = ops->getname(sock, (struct sockaddr *)&address, 1);
        if (err < 0)
                goto out_put;

        err = move_addr_to_user(&address, err, usockaddr, usockaddr_len);

out_put:
        fput_light(sock->file, fput_needed);
        return err;
}

/* getpeername(2) entry point: thin wrapper around __sys_getpeername(). */
SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
                int __user *, usockaddr_len)
{
        return __sys_getpeername(fd, usockaddr, usockaddr_len);
}

/*
 *        Send a datagram to a given address.  The user buffer is wrapped in
 *        an iterator first, then the optional destination address is copied
 *        into kernel space before the protocol is invoked.
 *
 *        Returns the import_ubuf() error, the socket lookup error, the
 *        address copy error, or the result of __sock_sendmsg().
 */
int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
                 struct sockaddr __user *addr,  int addr_len)
{
        struct sockaddr_storage address;
        struct msghdr msg;
        struct socket *sock;
        int fput_needed;
        int err;

        err = import_ubuf(ITER_SOURCE, buff, len, &msg.msg_iter);
        if (unlikely(err))
                return err;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        /* Start from "no address, no control data, no ubuf". */
        msg.msg_name = NULL;
        msg.msg_namelen = 0;
        msg.msg_control = NULL;
        msg.msg_controllen = 0;
        msg.msg_ubuf = NULL;

        if (addr) {
                err = move_addr_to_kernel(addr, addr_len, &address);
                if (err < 0)
                        goto out_put;
                msg.msg_name = (struct sockaddr *)&address;
                msg.msg_namelen = addr_len;
        }

        /* User space may not pass kernel-internal sendmsg flags. */
        flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
        if (sock->file->f_flags & O_NONBLOCK)
                flags |= MSG_DONTWAIT;
        msg.msg_flags = flags;

        err = __sock_sendmsg(sock, &msg);

out_put:
        fput_light(sock->file, fput_needed);
        return err;
}

/* sendto(2) entry point: thin wrapper around __sys_sendto(). */
SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
                unsigned int, flags, struct sockaddr __user *, addr,
                int, addr_len)
{
        return __sys_sendto(fd, buff, len, flags, addr, addr_len);
}

/*
 *        Send a datagram down a socket.  Implemented as __sys_sendto() with
 *        no destination address (NULL address, length 0).
 */

SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len,
                unsigned int, flags)
{
        return __sys_sendto(fd, buff, len, flags, NULL, 0);
}

/*
 *        Receive a frame from the socket and optionally report the sender's
 *        address.  The user buffer is wrapped in an iterator, and when the
 *        caller asked for it the sender address is copied from kernel to
 *        user space afterwards.
 *
 *        Returns the import_ubuf() or socket lookup error, the result of
 *        sock_recvmsg(), or the move_addr_to_user() error if the receive
 *        succeeded but the address copy-out failed.
 */
int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
                   struct sockaddr __user *addr, int __user *addr_len)
{
        struct sockaddr_storage address;
        struct msghdr msg = {
                /* Only have the protocol fill in an address if one is wanted */
                .msg_name = addr ? (struct sockaddr *)&address : NULL,
        };
        struct socket *sock;
        int fput_needed;
        int err;

        err = import_ubuf(ITER_DEST, ubuf, size, &msg.msg_iter);
        if (unlikely(err))
                return err;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        if (sock->file->f_flags & O_NONBLOCK)
                flags |= MSG_DONTWAIT;

        err = sock_recvmsg(sock, &msg, flags);
        if (addr != NULL && err >= 0) {
                int copy_err = move_addr_to_user(&address, msg.msg_namelen,
                                                 addr, addr_len);

                /* A failed address copy-out overrides the byte count. */
                if (copy_err < 0)
                        err = copy_err;
        }

        fput_light(sock->file, fput_needed);
        return err;
}

/* recvfrom(2) entry point: thin wrapper around __sys_recvfrom(). */
SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
                unsigned int, flags, struct sockaddr __user *, addr,
                int __user *, addr_len)
{
        return __sys_recvfrom(fd, ubuf, size, flags, addr, addr_len);
}

/*
 *        Receive a datagram from a socket.  Implemented as __sys_recvfrom()
 *        with no sender-address output (both address pointers NULL).
 */

SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size,
                unsigned int, flags)
{
        return __sys_recvfrom(fd, ubuf, size, flags, NULL, NULL);
}

/*
 * Report whether the SOCK_CUSTOM_SOCKOPT bit is set in @sock->flags.
 * do_sock_setsockopt() uses this to decide whether a SOL_SOCKET option goes
 * to sock_setsockopt() or to the protocol's own setsockopt operation.
 */
static bool sock_use_custom_sol_socket(const struct socket *sock)
{
        return test_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags);
}

/*
 * do_sock_setsockopt - set an option on an already looked-up socket
 * @sock:    target socket
 * @compat:  true when called on behalf of a compat syscall; the cgroup BPF
 *           setsockopt hook is skipped in that case
 * @level:   option level
 * @optname: option name
 * @optval:  option value (user or kernel pointer)
 * @optlen:  length of @optval; negative values are rejected with -EINVAL
 *
 * Returns 0 or a negative errno.  A positive result from the BPF hook ends
 * processing early and is reported as success.
 */
int do_sock_setsockopt(struct socket *sock, bool compat, int level,
                       int optname, sockptr_t optval, int optlen)
{
        const struct proto_ops *ops;
        char *kernel_optval = NULL;
        int err;

        if (optlen < 0)
                return -EINVAL;

        err = security_socket_setsockopt(sock, level, optname);
        if (err)
                return err;

        if (!compat) {
                err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname,
                                                     optval, &optlen,
                                                     &kernel_optval);
                if (err < 0)
                        return err;
                if (err > 0)
                        return 0;
        }

        /* The BPF hook may have substituted a kernel copy of the value. */
        if (kernel_optval)
                optval = KERNEL_SOCKPTR(kernel_optval);

        ops = READ_ONCE(sock->ops);
        if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock))
                err = sock_setsockopt(sock, level, optname, optval, optlen);
        else if (likely(ops->setsockopt))
                err = ops->setsockopt(sock, level, optname, optval, optlen);
        else
                err = -EOPNOTSUPP;

        kfree(kernel_optval);
        return err;
}
EXPORT_SYMBOL(do_sock_setsockopt);

/* Set a socket option.  The option length is not known at this level, so
 * the user-mode pointer is passed down for the protocol to interpret.
 *
 * Returns the socket lookup error when @fd is not a usable socket,
 * otherwise the result of do_sock_setsockopt().
 */
int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval,
                     int optlen)
{
        struct socket *sock;
        int err, fput_needed;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (sock) {
                err = do_sock_setsockopt(sock, in_compat_syscall(), level,
                                         optname, USER_SOCKPTR(user_optval),
                                         optlen);
                fput_light(sock->file, fput_needed);
        }

        return err;
}

/* setsockopt(2) entry point: thin wrapper around __sys_setsockopt(). */
SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
                char __user *, optval, int, optlen)
{
        return __sys_setsockopt(fd, level, optname, optval, optlen);
}

/*
 * Declaration of tcp_bpf_bypass_getsockopt(), wrapped in
 * INDIRECT_CALLABLE_DECLARE.  NOTE(review): no use of it is visible in the
 * neighbouring functions - confirm it is still needed here.
 */
INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level,
                                                         int optname));

/*
 * do_sock_getsockopt - read an option from an already looked-up socket
 * @sock:    target socket
 * @compat:  true when called on behalf of a compat syscall; the cgroup BPF
 *           getsockopt hook is skipped in that case
 * @level:   option level
 * @optname: option name
 * @optval:  destination for the option value
 * @optlen:  in/out option length
 *
 * SOL_SOCKET options are served by sk_getsockopt(); everything else goes to
 * the protocol's getsockopt operation, which only accepts user pointers.
 * Returns 0 or a negative errno.
 */
int do_sock_getsockopt(struct socket *sock, bool compat, int level,
                       int optname, sockptr_t optval, sockptr_t optlen)
{
        int max_optlen __maybe_unused;
        const struct proto_ops *ops;
        int err;

        err = security_socket_getsockopt(sock, level, optname);
        if (err)
                return err;

        if (!compat)
                max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);

        ops = READ_ONCE(sock->ops);
        if (level == SOL_SOCKET) {
                err = sk_getsockopt(sock->sk, level, optname, optval, optlen);
        } else if (likely(ops->getsockopt)) {
                /* Protocol handlers take __user pointers only. */
                if (WARN_ONCE(optval.is_kernel || optlen.is_kernel,
                              "Invalid argument type"))
                        return -EOPNOTSUPP;

                err = ops->getsockopt(sock, level, optname, optval.user,
                                      optlen.user);
        } else {
                err = -EOPNOTSUPP;
        }

        if (compat)
                return err;

        /* Let the cgroup BPF hook inspect or override the result. */
        return BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname,
                                              optval, optlen, max_optlen,
                                              err);
}
EXPORT_SYMBOL(do_sock_getsockopt);

/*
 *        Get a socket option.  The option length is not known at this
 *        level, so user-mode pointers are passed down for the protocol to
 *        interpret.
 *
 *        Returns the socket lookup error when @fd is not a usable socket,
 *        otherwise the result of do_sock_getsockopt().
 */
int __sys_getsockopt(int fd, int level, int optname, char __user *optval,
                int __user *optlen)
{
        struct socket *sock;
        int err, fput_needed;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (sock) {
                bool compat = in_compat_syscall();

                err = do_sock_getsockopt(sock, compat, level, optname,
                                         USER_SOCKPTR(optval),
                                         USER_SOCKPTR(optlen));
                fput_light(sock->file, fput_needed);
        }

        return err;
}

/* getsockopt(2) entry point: thin wrapper around __sys_getsockopt(). */
SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
                char __user *, optval, int __user *, optlen)
{
        return __sys_getsockopt(fd, level, optname, optval, optlen);
}

/*
 *        Shut down a socket.
 *
 *        The security hook is consulted first; if it allows the request,
 *        the protocol's shutdown operation is invoked.  Returns the first
 *        non-zero result, or the protocol's return value.
 */

int __sys_shutdown_sock(struct socket *sock, int how)
{
        int err = security_socket_shutdown(sock, how);

        if (err)
                return err;

        return READ_ONCE(sock->ops)->shutdown(sock, how);
}

/*
 * Look up the socket behind @fd and shut it down according to @how.
 * Returns the lookup error when @fd is not a usable socket, otherwise the
 * result of __sys_shutdown_sock().
 */
int __sys_shutdown(int fd, int how)
{
        struct socket *sock;
        int err, fput_needed;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        err = __sys_shutdown_sock(sock, how);
        fput_light(sock->file, fput_needed);

        return err;
}

/* shutdown(2) entry point: thin wrapper around __sys_shutdown(). */
SYSCALL_DEFINE2(shutdown, int, fd, int, how)
{
        return __sys_shutdown(fd, how);
}

/* A couple of helpful macros for getting the address of the 32/64 bit
 * fields which are the same type (int / unsigned) on our platforms.
 *
 * These macros are not hygienic: the expansion site must have a variable
 * named "flags" in scope, plus both "<msg>" and "<msg>_compat" pointers.
 * When MSG_CMSG_COMPAT is set in flags the member of the compat structure is
 * addressed, otherwise the member of the native one.
 */
#define COMPAT_MSG(msg, member)        ((MSG_CMSG_COMPAT & flags) ? &msg##_compat->member : &msg->member)
#define COMPAT_NAMELEN(msg)        COMPAT_MSG(msg, msg_namelen)
#define COMPAT_FLAGS(msg)        COMPAT_MSG(msg, msg_flags)

/*
 * Destination address of the most recent successful send in a sendmmsg()
 * batch.  ____sys_sendmsg() compares the next destination against it and,
 * on a match, sends via sock_sendmsg_nosec() instead of __sock_sendmsg().
 */
struct used_address {
        /* Copy of the last destination address. */
        struct sockaddr_storage name;
        /* Valid bytes in @name; __sys_sendmmsg() starts it at UINT_MAX. */
        unsigned int name_len;
};

/*
 * __copy_msghdr - populate a kernel msghdr from a user_msghdr already in
 *                 kernel memory
 * @kmsg:      kernel message header to fill in; on entry kmsg->msg_name must
 *             point at storage for the address if it is to be copied in
 * @msg:       kernel copy of the user's message header
 * @save_addr: if non-NULL, receives the user-space address pointer and the
 *             address is NOT copied in; if NULL, the address is copied into
 *             kmsg->msg_name
 *
 * Returns 0 on success, -EINVAL for a negative name length, -EMSGSIZE when
 * msg_iovlen exceeds UIO_MAXIOV, or the error from move_addr_to_kernel().
 * Note that @kmsg is partially written even on the error paths.
 */
int __copy_msghdr(struct msghdr *kmsg,
                  struct user_msghdr *msg,
                  struct sockaddr __user **save_addr)
{
        ssize_t err;

        /* Control data still lives in user space at this point. */
        kmsg->msg_control_is_user = true;
        kmsg->msg_get_inq = 0;
        kmsg->msg_control_user = msg->msg_control;
        kmsg->msg_controllen = msg->msg_controllen;
        kmsg->msg_flags = msg->msg_flags;

        /* A NULL name pointer means "no address", whatever the length says. */
        kmsg->msg_namelen = msg->msg_namelen;
        if (!msg->msg_name)
                kmsg->msg_namelen = 0;

        if (kmsg->msg_namelen < 0)
                return -EINVAL;

        /* Oversized lengths are clamped rather than rejected. */
        if (kmsg->msg_namelen > sizeof(struct sockaddr_storage))
                kmsg->msg_namelen = sizeof(struct sockaddr_storage);

        if (save_addr)
                *save_addr = msg->msg_name;

        if (msg->msg_name && kmsg->msg_namelen) {
                /* Only copy the address in when the caller did not ask for
                 * the user pointer to be saved instead.
                 */
                if (!save_addr) {
                        err = move_addr_to_kernel(msg->msg_name,
                                                  kmsg->msg_namelen,
                                                  kmsg->msg_name);
                        if (err < 0)
                                return err;
                }
        } else {
                kmsg->msg_name = NULL;
                kmsg->msg_namelen = 0;
        }

        if (msg->msg_iovlen > UIO_MAXIOV)
                return -EMSGSIZE;

        kmsg->msg_iocb = NULL;
        kmsg->msg_ubuf = NULL;
        return 0;
}

/*
 * Copy a native user_msghdr from user space, translate it into @kmsg via
 * __copy_msghdr() and import its iovec array into kmsg->msg_iter.
 *
 * The iterator direction is ITER_DEST when @save_addr is non-NULL and
 * ITER_SOURCE otherwise.  Returns 0 on success, -EFAULT if the header cannot
 * be read, or the error from __copy_msghdr() / import_iovec().
 */
static int copy_msghdr_from_user(struct msghdr *kmsg,
                                 struct user_msghdr __user *umsg,
                                 struct sockaddr __user **save_addr,
                                 struct iovec **iov)
{
        struct user_msghdr msg;
        ssize_t err;

        if (copy_from_user(&msg, umsg, sizeof(*umsg)))
                return -EFAULT;

        err = __copy_msghdr(kmsg, &msg, save_addr);
        if (err)
                return err;

        err = import_iovec(save_addr ? ITER_DEST : ITER_SOURCE,
                           msg.msg_iov, msg.msg_iovlen,
                           UIO_FASTIOV, iov, &kmsg->msg_iter);
        if (err < 0)
                return err;

        /* A non-negative import result is collapsed to plain success. */
        return 0;
}

/*
 * ____sys_sendmsg - send one message whose header is already in kernel form
 * @sock:                 socket to send on
 * @msg_sys:              kernel msghdr; its control-data and flag fields are
 *                        rewritten here
 * @flags:                send flags supplied by the caller
 * @used_address:         optional record of the last successful destination
 *                        (sendmmsg batching); may be NULL
 * @allowed_msghdr_flags: mask of bits that may be taken over from
 *                        msg_sys->msg_flags into @flags
 *
 * Brings any control data into kernel memory, finalises the flags and sends
 * the message.  Returns the send helper's result, -ENOBUFS if the control
 * length exceeds INT_MAX or the control buffer cannot be allocated, -EFAULT
 * if the control data cannot be read, or the compat conversion error.
 */
static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys,
                           unsigned int flags, struct used_address *used_address,
                           unsigned int allowed_msghdr_flags)
{
        /* Small on-stack control buffer; larger requests are allocated. */
        unsigned char ctl[sizeof(struct cmsghdr) + 20]
                                __aligned(sizeof(__kernel_size_t));
        /* 20 is size of ipv6_pktinfo */
        unsigned char *ctl_buf = ctl;
        int ctl_len;
        ssize_t err;

        /* Default error for the length check and allocation failure below. */
        err = -ENOBUFS;

        if (msg_sys->msg_controllen > INT_MAX)
                goto out;
        flags |= (msg_sys->msg_flags & allowed_msghdr_flags);
        ctl_len = msg_sys->msg_controllen;
        if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
                err =
                    cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl,
                                                     sizeof(ctl));
                if (err)
                        goto out;
                /* Pick up the buffer and length the compat helper left in
                 * msg_sys, so the cleanup below sees them.
                 */
                ctl_buf = msg_sys->msg_control;
                ctl_len = msg_sys->msg_controllen;
        } else if (ctl_len) {
                BUILD_BUG_ON(sizeof(struct cmsghdr) !=
                             CMSG_ALIGN(sizeof(struct cmsghdr)));
                if (ctl_len > sizeof(ctl)) {
                        ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
                        if (ctl_buf == NULL)
                                goto out;
                }
                err = -EFAULT;
                if (copy_from_user(ctl_buf, msg_sys->msg_control_user, ctl_len))
                        goto out_freectl;
                /* From here on the control data is a kernel pointer. */
                msg_sys->msg_control = ctl_buf;
                msg_sys->msg_control_is_user = false;
        }
        /* User space may not pass kernel-internal sendmsg flags. */
        flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
        msg_sys->msg_flags = flags;

        if (sock->file->f_flags & O_NONBLOCK)
                msg_sys->msg_flags |= MSG_DONTWAIT;
        /*
         * If this is sendmmsg() and current destination address is same as
         * previously succeeded address, omit asking LSM's decision.
         * used_address->name_len is initialized to UINT_MAX so that the first
         * destination address never matches.
         */
        if (used_address && msg_sys->msg_name &&
            used_address->name_len == msg_sys->msg_namelen &&
            !memcmp(&used_address->name, msg_sys->msg_name,
                    used_address->name_len)) {
                err = sock_sendmsg_nosec(sock, msg_sys);
                goto out_freectl;
        }
        err = __sock_sendmsg(sock, msg_sys);
        /*
         * If this is sendmmsg() and sending to current destination address was
         * successful, remember it.
         */
        if (used_address && err >= 0) {
                used_address->name_len = msg_sys->msg_namelen;
                if (msg_sys->msg_name)
                        memcpy(&used_address->name, msg_sys->msg_name,
                               used_address->name_len);
        }

out_freectl:
        /* Only release the buffer when it is not the on-stack array. */
        if (ctl_buf != ctl)
                sock_kfree_s(sock->sk, ctl_buf, ctl_len);
out:
        return err;
}

/*
 * Import the user's message header for a send, choosing the compat or the
 * native layout based on MSG_CMSG_COMPAT in @flags.  No user address pointer
 * is saved (NULL is passed), so the helpers copy the address into
 * msg->msg_name.  Returns 0 on success or a negative errno.
 */
static int sendmsg_copy_msghdr(struct msghdr *msg,
                               struct user_msghdr __user *umsg, unsigned flags,
                               struct iovec **iov)
{
        int err;

        if (flags & MSG_CMSG_COMPAT)
                err = get_compat_msghdr(msg,
                                        (struct compat_msghdr __user *)umsg,
                                        NULL, iov);
        else
                err = copy_msghdr_from_user(msg, umsg, NULL, iov);

        return err < 0 ? err : 0;
}

/*
 * Copy one user message header into @msg_sys and send it on @sock.
 *
 * The address storage and the fast-path iovec array live on this function's
 * stack, so @msg_sys must not be used for another send after it returns.
 * Returns the header import error, or the result of ____sys_sendmsg().
 */
static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
                         struct msghdr *msg_sys, unsigned int flags,
                         struct used_address *used_address,
                         unsigned int allowed_msghdr_flags)
{
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct sockaddr_storage address;
        ssize_t err;

        /* Destination buffer for the address copied in by the import step. */
        msg_sys->msg_name = &address;

        err = sendmsg_copy_msghdr(msg_sys, msg, flags, &iov);
        if (err >= 0) {
                err = ____sys_sendmsg(sock, msg_sys, flags, used_address,
                                      allowed_msghdr_flags);
                kfree(iov);
        }

        return err;
}

/*
 *        BSD sendmsg interface
 *
 *        Send an already prepared kernel msghdr on @sock.  No sendmmsg
 *        address cache is used (NULL) and no flag bits are taken over from
 *        msg->msg_flags (allowed mask 0).
 */
long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg,
                        unsigned int flags)
{
        return ____sys_sendmsg(sock, msg, flags, NULL, 0);
}

/*
 * __sys_sendmsg - sendmsg() on a file descriptor
 * @fd:                 socket file descriptor
 * @msg:                user-space message header
 * @flags:              send flags
 * @forbid_cmsg_compat: when true, MSG_CMSG_COMPAT in @flags yields -EINVAL
 *
 * Returns the socket lookup error when @fd is not a usable socket, otherwise
 * the result of ___sys_sendmsg().
 */
long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
                   bool forbid_cmsg_compat)
{
        struct msghdr msg_sys;
        struct socket *sock;
        int fput_needed, err;

        if ((flags & MSG_CMSG_COMPAT) && forbid_cmsg_compat)
                return -EINVAL;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0);
        fput_light(sock->file, fput_needed);

        return err;
}

/* sendmsg(2) entry point; MSG_CMSG_COMPAT is forbidden on this path. */
SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags)
{
        return __sys_sendmsg(fd, msg, flags, true);
}

/*
 *        Linux sendmmsg interface
 *
 *        Send up to @vlen messages from the @mmsg array on socket @fd.
 *        @vlen is silently capped at UIO_MAXIOV.  When @forbid_cmsg_compat
 *        is true, MSG_CMSG_COMPAT in @flags yields -EINVAL.
 *
 *        Returns the number of messages sent if at least one was sent,
 *        otherwise the error (or 0) from the first attempt.
 */

int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
                   unsigned int flags, bool forbid_cmsg_compat)
{
        int fput_needed, err, datagrams;
        struct socket *sock;
        struct mmsghdr __user *entry;
        struct compat_mmsghdr __user *compat_entry;
        struct msghdr msg_sys;
        struct used_address used_address;
        /* Caller's flags, without the MSG_BATCH bit added below. */
        unsigned int oflags = flags;

        if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT))
                return -EINVAL;

        if (vlen > UIO_MAXIOV)
                vlen = UIO_MAXIOV;

        datagrams = 0;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        /* UINT_MAX never equals a real address length: no match at first. */
        used_address.name_len = UINT_MAX;
        /* The same array is walked through either the native or the compat
         * pointer, depending on MSG_CMSG_COMPAT.
         */
        entry = mmsg;
        compat_entry = (struct compat_mmsghdr __user *)mmsg;
        err = 0;
        flags |= MSG_BATCH;

        while (datagrams < vlen) {
                /* The last message of the batch goes out without MSG_BATCH. */
                if (datagrams == vlen - 1)
                        flags = oflags;

                if (MSG_CMSG_COMPAT & flags) {
                        err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry,
                                             &msg_sys, flags, &used_address, MSG_EOR);
                        if (err < 0)
                                break;
                        /* Report the send result in this entry's msg_len. */
                        err = __put_user(err, &compat_entry->msg_len);
                        ++compat_entry;
                } else {
                        err = ___sys_sendmsg(sock,
                                             (struct user_msghdr __user *)entry,
                                             &msg_sys, flags, &used_address, MSG_EOR);
                        if (err < 0)
                                break;
                        /* Report the send result in this entry's msg_len. */
                        err = put_user(err, &entry->msg_len);
                        ++entry;
                }

                if (err)
                        break;
                ++datagrams;
                /* Stop once a message was only partially consumed. */
                if (msg_data_left(&msg_sys))
                        break;
                cond_resched();
        }

        fput_light(sock->file, fput_needed);

        /* We only return an error if no datagrams were able to be sent */
        if (datagrams != 0)
                return datagrams;

        return err;
}

/* sendmmsg(2) entry point; MSG_CMSG_COMPAT is forbidden on this path. */
SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg,
                unsigned int, vlen, unsigned int, flags)
{
        return __sys_sendmmsg(fd, mmsg, vlen, flags, true);
}

/*
 * Import the user's message header for a receive, choosing the compat or
 * the native layout based on MSG_CMSG_COMPAT in @flags.  The user-space
 * address pointer is handed back through @uaddr.  Returns 0 on success or a
 * negative errno.
 */
static int recvmsg_copy_msghdr(struct msghdr *msg,
                               struct user_msghdr __user *umsg, unsigned flags,
                               struct sockaddr __user **uaddr,
                               struct iovec **iov)
{
        ssize_t err;

        if (flags & MSG_CMSG_COMPAT)
                err = get_compat_msghdr(msg,
                                        (struct compat_msghdr __user *)umsg,
                                        uaddr, iov);
        else
                err = copy_msghdr_from_user(msg, umsg, uaddr, iov);

        return err < 0 ? err : 0;
}

/*
 * ____sys_recvmsg - receive one message into an already prepared msghdr
 * @sock:    socket to receive from
 * @msg_sys: kernel msghdr (iterator and control fields already set up)
 * @msg:     user-space header; name length, flags and control length are
 *           written back into it
 * @uaddr:   user buffer for the sender address, or NULL if not wanted
 * @flags:   receive flags
 * @nosec:   non-zero to use sock_recvmsg_nosec() instead of sock_recvmsg()
 *
 * Returns the value from the receive helper on success, or a negative errno
 * from the receive or from any of the copy-outs.
 */
static int ____sys_recvmsg(struct socket *sock, struct msghdr *msg_sys,
                           struct user_msghdr __user *msg,
                           struct sockaddr __user *uaddr,
                           unsigned int flags, int nosec)
{
        /* Name required by the COMPAT_* macros ("msg" ## "_compat"). */
        struct compat_msghdr __user *msg_compat =
                                        (struct compat_msghdr __user *) msg;
        int __user *uaddr_len = COMPAT_NAMELEN(msg);
        struct sockaddr_storage addr;
        unsigned long cmsg_ptr;
        int len;
        ssize_t err;

        msg_sys->msg_name = &addr;
        /* Remember where control data starts; the distance msg_control has
         * moved by after the receive is reported as msg_controllen.
         */
        cmsg_ptr = (unsigned long)msg_sys->msg_control;
        msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);

        /* We assume all kernel code knows the size of sockaddr_storage */
        msg_sys->msg_namelen = 0;

        if (sock->file->f_flags & O_NONBLOCK)
                flags |= MSG_DONTWAIT;

        if (unlikely(nosec))
                err = sock_recvmsg_nosec(sock, msg_sys, flags);
        else
                err = sock_recvmsg(sock, msg_sys, flags);

        if (err < 0)
                goto out;
        /* Keep the receive result; err is reused for the copy-outs below. */
        len = err;

        if (uaddr != NULL) {
                err = move_addr_to_user(&addr,
                                        msg_sys->msg_namelen, uaddr,
                                        uaddr_len);
                if (err < 0)
                        goto out;
        }
        /* MSG_CMSG_COMPAT is masked out of the flags reported to user space. */
        err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
                         COMPAT_FLAGS(msg));
        if (err)
                goto out;
        if (MSG_CMSG_COMPAT & flags)
                err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
                                 &msg_compat->msg_controllen);
        else
                err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
                                 &msg->msg_controllen);
        if (err)
                goto out;
        err = len;
out:
        return err;
}

/*
 * Copy one user message header into @msg_sys and receive into it.
 *
 * The fast-path iovec array lives on this function's stack.  Returns the
 * header import error, or the result of ____sys_recvmsg().
 */
static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
                         struct msghdr *msg_sys, unsigned int flags, int nosec)
{
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        /* user-space pointer where the sender address should be stored */
        struct sockaddr __user *uaddr;
        ssize_t err;

        err = recvmsg_copy_msghdr(msg_sys, msg, flags, &uaddr, &iov);
        if (err >= 0) {
                err = ____sys_recvmsg(sock, msg_sys, msg, uaddr, flags, nosec);
                kfree(iov);
        }

        return err;
}

/*
 *        BSD recvmsg interface
 *
 *        Receive into an already prepared kernel msghdr on @sock, writing
 *        results back to @umsg and @uaddr.  Always uses the path that goes
 *        through sock_recvmsg() (nosec == 0).
 */

long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
                        struct user_msghdr __user *umsg,
                        struct sockaddr __user *uaddr, unsigned int flags)
{
        return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0);
}

/*
 * __sys_recvmsg - recvmsg() on a file descriptor
 * @fd:                 socket file descriptor
 * @msg:                user-space message header
 * @flags:              receive flags
 * @forbid_cmsg_compat: when true, MSG_CMSG_COMPAT in @flags yields -EINVAL
 *
 * Returns the socket lookup error when @fd is not a usable socket, otherwise
 * the result of ___sys_recvmsg().
 */
long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
                   bool forbid_cmsg_compat)
{
        struct msghdr msg_sys;
        struct socket *sock;
        int fput_needed, err;

        if ((flags & MSG_CMSG_COMPAT) && forbid_cmsg_compat)
                return -EINVAL;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        err = ___sys_recvmsg(sock, msg, &msg_sys, flags, 0);
        fput_light(sock->file, fput_needed);

        return err;
}

/* recvmsg(2) entry point; MSG_CMSG_COMPAT is forbidden on this path. */
SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg,
                unsigned int, flags)
{
        return __sys_recvmsg(fd, msg, flags, true);
}

/*
 *     Linux recvmmsg interface
 *
 *     Receive up to @vlen messages into the @mmsg array from socket @fd.
 *     @timeout, when non-NULL, is a kernel copy of the relative timeout; it
 *     is overwritten with the remaining time after each received datagram.
 *
 *     Returns the number of datagrams received, or a negative errno when
 *     nothing was received.  An error other than -EAGAIN that occurs after
 *     at least one datagram was received is stored in sk->sk_err.
 */

static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg,
                          unsigned int vlen, unsigned int flags,
                          struct timespec64 *timeout)
{
        int fput_needed, err, datagrams;
        struct socket *sock;
        struct mmsghdr __user *entry;
        struct compat_mmsghdr __user *compat_entry;
        struct msghdr msg_sys;
        struct timespec64 end_time;
        struct timespec64 timeout64;

        /* Derive end_time from the timeout; a non-zero result from
         * poll_select_set_timeout() is reported as -EINVAL.
         */
        if (timeout &&
            poll_select_set_timeout(&end_time, timeout->tv_sec,
                                    timeout->tv_nsec))
                return -EINVAL;

        datagrams = 0;

        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        /* Unless the error queue is being read, report a pending socket
         * error before receiving anything.
         */
        if (likely(!(flags & MSG_ERRQUEUE))) {
                err = sock_error(sock->sk);
                if (err) {
                        datagrams = err;
                        goto out_put;
                }
        }

        /* The same array is walked through either the native or the compat
         * pointer, depending on MSG_CMSG_COMPAT.
         */
        entry = mmsg;
        compat_entry = (struct compat_mmsghdr __user *)mmsg;

        while (datagrams < vlen) {
                /*
                 * No need to ask LSM for more than the first datagram.
                 * (datagrams is passed as the nosec argument, so it is zero
                 * only for the first message.)
                 */
                if (MSG_CMSG_COMPAT & flags) {
                        err = ___sys_recvmsg(sock, (struct user_msghdr __user *)compat_entry,
                                             &msg_sys, flags & ~MSG_WAITFORONE,
                                             datagrams);
                        if (err < 0)
                                break;
                        /* Report the receive result in this entry's msg_len. */
                        err = __put_user(err, &compat_entry->msg_len);
                        ++compat_entry;
                } else {
                        err = ___sys_recvmsg(sock,
                                             (struct user_msghdr __user *)entry,
                                             &msg_sys, flags & ~MSG_WAITFORONE,
                                             datagrams);
                        if (err < 0)
                                break;
                        /* Report the receive result in this entry's msg_len. */
                        err = put_user(err, &entry->msg_len);
                        ++entry;
                }

                if (err)
                        break;
                ++datagrams;

                /* MSG_WAITFORONE turns on MSG_DONTWAIT after one packet */
                if (flags & MSG_WAITFORONE)
                        flags |= MSG_DONTWAIT;

                if (timeout) {
                        /* Recompute the remaining time against end_time. */
                        ktime_get_ts64(&timeout64);
                        *timeout = timespec64_sub(end_time, timeout64);
                        if (timeout->tv_sec < 0) {
                                timeout->tv_sec = timeout->tv_nsec = 0;
                                break;
                        }

                        /* Timeout, return less than vlen datagrams */
                        if (timeout->tv_nsec == 0 && timeout->tv_sec == 0)
                                break;
                }

                /* Out of band data, return right away */
                if (msg_sys.msg_flags & MSG_OOB)
                        break;
                cond_resched();
        }

        /* Loop ended without an error: just report the count. */
        if (err == 0)
                goto out_put;

        /* Nothing received at all: the error is the result. */
        if (datagrams == 0) {
                datagrams = err;
                goto out_put;
        }

        /*
         * We may return less entries than requested (vlen) if the
         * sock is non block and there aren't enough datagrams...
         */
        if (err != -EAGAIN) {
                /*
                 * ... or  if recvmsg returns an error after we
                 * received some datagrams, where we record the
                 * error to return on the next call or if the
                 * app asks about it using getsockopt(SO_ERROR).
                 */
                WRITE_ONCE(sock->sk->sk_err, -err);
        }
out_put:
        fput_light(sock->file, fput_needed);

        return datagrams;
}

/*
 * __sys_recvmmsg - recvmmsg() with an optional user-space timeout
 * @fd:        socket file descriptor
 * @mmsg:      user array of message headers
 * @vlen:      number of entries in @mmsg
 * @flags:     receive flags
 * @timeout:   64-bit timespec in user space, or NULL
 * @timeout32: 32-bit timespec in user space, or NULL
 *
 * The timeout is read from whichever pointer is non-NULL, passed to
 * do_recvmmsg(), and the remaining time is written back when at least one
 * datagram was received.  Returns the do_recvmmsg() result, or -EFAULT if
 * the timeout cannot be read or written back.
 */
int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
                   unsigned int vlen, unsigned int flags,
                   struct __kernel_timespec __user *timeout,
                   struct old_timespec32 __user *timeout32)
{
        struct timespec64 timeout_sys;
        int datagrams;

        if (timeout && get_timespec64(&timeout_sys, timeout))
                return -EFAULT;

        if (timeout32 && get_old_timespec32(&timeout_sys, timeout32))
                return -EFAULT;

        /* Neither flavour of timeout supplied: no timeout at all. */
        if (!(timeout || timeout32))
                return do_recvmmsg(fd, mmsg, vlen, flags, NULL);

        datagrams = do_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys);
        if (datagrams > 0) {
                /* Hand the remaining time back to user space. */
                if (timeout && put_timespec64(&timeout_sys, timeout))
                        datagrams = -EFAULT;

                if (timeout32 && put_old_timespec32(&timeout_sys, timeout32))
                        datagrams = -EFAULT;
        }

        return datagrams;
}

/*
 * recvmmsg(2) entry point taking a struct __kernel_timespec timeout.
 * MSG_CMSG_COMPAT is rejected with -EINVAL; otherwise the call is forwarded
 * to __sys_recvmmsg() with no 32-bit timeout pointer.
 */
SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
                unsigned int, vlen, unsigned int, flags,
                struct __kernel_timespec __user *, timeout)
{
        if (flags & MSG_CMSG_COMPAT)
                return -EINVAL;

        return __sys_recvmmsg(fd, mmsg, vlen, flags, timeout, NULL);
}

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * recvmmsg variant taking a struct old_timespec32 timeout.
 * MSG_CMSG_COMPAT is rejected with -EINVAL; otherwise the call is forwarded
 * to __sys_recvmmsg() with no 64-bit timeout pointer.
 */
SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct mmsghdr __user *, mmsg,
                unsigned int, vlen, unsigned int, flags,
                struct old_timespec32 __user *, timeout)
{
        if (flags & MSG_CMSG_COMPAT)
                return -EINVAL;

        return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL, timeout);
}
#endif

#ifdef __ARCH_WANT_SYS_SOCKETCALL
/* Argument list sizes for sys_socketcall
 *
 * nargs[] is indexed by the socketcall "call" number; each entry is the size
 * in bytes of that call's argument block, i.e. the argument count times
 * sizeof(unsigned long).  socketcall rejects call numbers below 1, so entry
 * 0 is not used.
 * NOTE(review): the table has 21 entries - presumably SYS_SENDMMSG is 20;
 * confirm against the SYS_* definitions.
 */
#define AL(x) ((x) * sizeof(unsigned long))
static const unsigned char nargs[21] = {
        AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
        AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
        AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
        AL(4), AL(5), AL(4)
};

#undef AL

/*
 *        System call vectors.
 *
 *        Argument checking cleaned up. Saved 20% in size.
 *  This function doesn't need to set the kernel lock because
 *  it is set by the callees.
 *
 *  socketcall() multiplexes the individual socket system calls: @call
 *  selects the operation and @args points to a user-space array of
 *  unsigned long arguments whose length is given by nargs[call].  The
 *  arguments are copied in, reported to audit_socketcall(), and then
 *  forwarded to the matching __sys_*() helper.
 *
 *  Returns the helper's result, -EINVAL for an unknown @call or an
 *  oversized argument list, -EFAULT if the arguments cannot be copied,
 *  or the error returned by audit_socketcall().
 */

SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
        unsigned long a[AUDITSC_ARGS];
        unsigned long a0, a1;
        int err;
        unsigned int len;

        /* Valid call numbers run from 1 to SYS_SENDMMSG inclusive. */
        if (call < 1 || call > SYS_SENDMMSG)
                return -EINVAL;
        /* Clamp the index under speculation before it is used on nargs[]. */
        call = array_index_nospec(call, SYS_SENDMMSG + 1);

        /* Never copy more than the local argument buffer can hold. */
        len = nargs[call];
        if (len > sizeof(a))
                return -EINVAL;

        /* copy_from_user should be SMP safe. */
        if (copy_from_user(a, args, len))
                return -EFAULT;

        /* Hand the argument count (not the byte size) and values to audit. */
        err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
        if (err)
                return err;

        /* Every call uses at least the first two slots; cache them. */
        a0 = a[0];
        a1 = a[1];

        switch (call) {
        case SYS_SOCKET:
                err = __sys_socket(a0, a1, a[2]);
                break;
        case SYS_BIND:
                err = __sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
                break;
        case SYS_CONNECT:
                err = __sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
                break;
        case SYS_LISTEN:
                err = __sys_listen(a0, a1);
                break;
        case SYS_ACCEPT:
                /* accept() is accept4() with flags == 0. */
                err = __sys_accept4(a0, (struct sockaddr __user *)a1,
                                    (int __user *)a[2], 0);
                break;
        case SYS_GETSOCKNAME:
                err =
                    __sys_getsockname(a0, (struct sockaddr __user *)a1,
                                      (int __user *)a[2]);
                break;
        case SYS_GETPEERNAME:
                err =
                    __sys_getpeername(a0, (struct sockaddr __user *)a1,
                                      (int __user *)a[2]);
                break;
        case SYS_SOCKETPAIR:
                err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
                break;
        case SYS_SEND:
                /* send() is sendto() without a destination address. */
                err = __sys_sendto(a0, (void __user *)a1, a[2], a[3],
                                   NULL, 0);
                break;
        case SYS_SENDTO:
                err = __sys_sendto(a0, (void __user *)a1, a[2], a[3],
                                   (struct sockaddr __user *)a[4], a[5]);
                break;
        case SYS_RECV:
                /* recv() is recvfrom() without a source address buffer. */
                err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                                     NULL, NULL);
                break;
        case SYS_RECVFROM:
                err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                                     (struct sockaddr __user *)a[4],
                                     (int __user *)a[5]);
                break;
        case SYS_SHUTDOWN:
                err = __sys_shutdown(a0, a1);
                break;
        case SYS_SETSOCKOPT:
                err = __sys_setsockopt(a0, a1, a[2], (char __user *)a[3],
                                       a[4]);
                break;
        case SYS_GETSOCKOPT:
                err =
                    __sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
                                     (int __user *)a[4]);
                break;
        case SYS_SENDMSG:
                err = __sys_sendmsg(a0, (struct user_msghdr __user *)a1,
                                    a[2], true);
                break;
        case SYS_SENDMMSG:
                err = __sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2],
                                     a[3], true);
                break;
        case SYS_RECVMSG:
                err = __sys_recvmsg(a0, (struct user_msghdr __user *)a1,
                                    a[2], true);
                break;
        case SYS_RECVMMSG:
                /*
                 * The timeout in a[4] is interpreted as a
                 * struct __kernel_timespec on 64-bit kernels and as a
                 * struct old_timespec32 otherwise.
                 */
                if (IS_ENABLED(CONFIG_64BIT))
                        err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
                                             a[2], a[3],
                                             (struct __kernel_timespec __user *)a[4],
                                             NULL);
                else
                        err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
                                             a[2], a[3], NULL,
                                             (struct old_timespec32 __user *)a[4]);
                break;
        case SYS_ACCEPT4:
                err = __sys_accept4(a0, (struct sockaddr __user *)a1,
                                    (int __user *)a[2], a[3]);
                break;
        default:
                err = -EINVAL;
                break;
        }
        return err;
}

#endif                                /* __ARCH_WANT_SYS_SOCKETCALL */

/**
 *        sock_register - add a socket protocol handler
 *        @ops: description of protocol
 *
 *        This function is called by a protocol handler that wants to
 *        advertise its address family, and have it linked into the
 *        socket interface. The value ops->family corresponds to the
 *        socket system call protocol family.
 *
 *        Returns 0 on success, -ENOBUFS if ops->family is not below
 *        NPROTO, or -EEXIST if a handler is already registered for that
 *        family.
 */
int sock_register(const struct net_proto_family *ops)
{
        int err;

        if (ops->family >= NPROTO) {
                pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
                return -ENOBUFS;
        }

        /* The slot is claimed under net_family_lock so registrations cannot race. */
        spin_lock(&net_family_lock);
        if (rcu_dereference_protected(net_families[ops->family],
                                      lockdep_is_held(&net_family_lock)))
                err = -EEXIST;
        else {
                rcu_assign_pointer(net_families[ops->family], ops);
                err = 0;
        }
        spin_unlock(&net_family_lock);

        /* Only announce the family when the slot was actually claimed. */
        if (!err)
                pr_info("NET: Registered %s protocol family\n", pf_family_names[ops->family]);
        return err;
}
EXPORT_SYMBOL(sock_register);

/**
 *        sock_unregister - remove a protocol handler
 *        @family: protocol family to remove
 *
 *        Called by a protocol handler that wants its address family
 *        unlinked from new socket creation.  The family's slot is
 *        cleared under net_family_lock, after which the function waits
 *        for an RCU grace period before returning.
 *
 *        If protocol handler is a module, then it can use module reference
 *        counts to protect against new references. If protocol handler is not
 *        a module then it needs to provide its own protection in
 *        the ops->create routine.
 *
 *        An out-of-range @family triggers BUG_ON().
 */
void sock_unregister(int family)
{
        BUG_ON(!(family >= 0 && family < NPROTO));

        spin_lock(&net_family_lock);
        RCU_INIT_POINTER(net_families[family], NULL);
        spin_unlock(&net_family_lock);

        /* Wait for a grace period so existing RCU readers have finished. */
        synchronize_rcu();

        pr_info("NET: Unregistered %s protocol family\n", pf_family_names[family]);
}
EXPORT_SYMBOL(sock_unregister);

bool sock_is_registered(int family)
{
        return family < NPROTO && rcu_access_pointer(net_families[family]);
}

/*
 * sock_init - boot-time initialisation of the socket layer.
 *
 * Runs as a core_initcall.  Sets up, in order: the network sysctl
 * infrastructure, the skbuff caches, the socket inode cache, the socket
 * pseudo filesystem (registered and then mounted into sock_mnt), netfilter
 * when CONFIG_NETFILTER is enabled, and the PTP classifier.
 *
 * Returns 0 on success or the error from the first failing step.
 */
static int __init sock_init(void)
{
        int err;
        /*
         *      Initialize the network sysctl infrastructure.
         */
        err = net_sysctl_init();
        if (err)
                goto out;

        /*
         *      Initialize skbuff SLAB cache
         */
        skb_init();

        /*
         *      Initialize the protocols module.
         */

        init_inodecache();

        err = register_filesystem(&sock_fs_type);
        if (err)
                goto out;
        sock_mnt = kern_mount(&sock_fs_type);
        if (IS_ERR(sock_mnt)) {
                err = PTR_ERR(sock_mnt);
                /* Mount failed: undo the filesystem registration. */
                goto out_mount;
        }

        /* The real protocol initialization is performed in later initcalls.
         */

#ifdef CONFIG_NETFILTER
        /*
         * NOTE(review): a failure here returns without undoing the mount
         * or the filesystem registration above -- confirm this is intended.
         */
        err = netfilter_init();
        if (err)
                goto out;
#endif

        ptp_classifier_init();

out:
        return err;

out_mount:
        unregister_filesystem(&sock_fs_type);
        goto out;
}

core_initcall(sock_init);        /* early initcall */

#ifdef CONFIG_PROC_FS
/*
 * Emit the "sockets: used N" line into @seq.  The count comes from
 * sock_inuse_get() called on the seq_file's private pointer.
 */
void socket_seq_show(struct seq_file *seq)
{
        struct net *net = seq->private;

        seq_printf(seq, "sockets: used %d\n", sock_inuse_get(net));
}
#endif                                /* CONFIG_PROC_FS */

/*
 * get_user_ifreq - copy a struct ifreq in from user space.
 *
 * struct ifreq has the same *layout* on 32 and 64 bit for everything
 * except ifreq::ifru_ifmap and ifreq::ifru_data (handled elsewhere), but
 * its *size* differs because of ifreq::ifru_ifmap (16 bytes on 32 bit,
 * 24 bytes on 64 bit, making struct ifreq 32 and 40 bytes respectively).
 * Copying the native size for a compat caller could therefore fault when
 * the structure sits at the end of a page and the next page is not
 * accessible.  To avoid that, a compat caller gets @ifr zeroed first and
 * only sizeof(struct compat_ifreq) bytes copied in.
 *
 * When @ifrdata is non-NULL it receives the ifr_data user pointer, run
 * through compat_ptr() for compat callers.
 *
 * Returns 0 on success or -EFAULT if the copy from user space fails.
 */
int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg)
{
        struct compat_ifreq *compat;

        if (!in_compat_syscall()) {
                /* Native caller: the user structure has the kernel's size. */
                if (copy_from_user(ifr, arg, sizeof(*ifr)))
                        return -EFAULT;

                if (ifrdata)
                        *ifrdata = ifr->ifr_data;

                return 0;
        }

        /* Compat caller: view the buffer through the smaller layout. */
        compat = (struct compat_ifreq *)ifr;

        memset(ifr, 0, sizeof(*ifr));
        if (copy_from_user(compat, arg, sizeof(*compat)))
                return -EFAULT;

        if (ifrdata)
                *ifrdata = compat_ptr(compat->ifr_data);

        return 0;
}
EXPORT_SYMBOL(get_user_ifreq);

/*
 * put_user_ifreq - copy a struct ifreq back out to user space.
 *
 * Writes sizeof(struct compat_ifreq) bytes for a compat caller and the
 * full sizeof(struct ifreq) otherwise.
 *
 * Returns 0 on success or -EFAULT if the copy to user space fails.
 */
int put_user_ifreq(struct ifreq *ifr, void __user *arg)
{
        size_t size;

        if (in_compat_syscall())
                size = sizeof(struct compat_ifreq);
        else
                size = sizeof(*ifr);

        return copy_to_user(arg, ifr, size) ? -EFAULT : 0;
}
EXPORT_SYMBOL(put_user_ifreq);

#ifdef CONFIG_COMPAT
/*
 * Compat handler for SIOCWANDEV.
 *
 * The ifs_ifsu pointer inside ifr_settings is a 32-bit user pointer in the
 * compat layout.  It is fetched separately, widened with compat_ptr() and
 * substituted into the request before dev_ioctl() runs.  On success the
 * previously copied-in pointer value is put back and the request is
 * written to user space.
 *
 * Returns dev_ioctl()'s result, or -EFAULT if user memory cannot be
 * accessed.
 */
static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32)
{
        struct ifreq ifr;
        compat_uptr_t user_ptr32;
        void __user *orig_ptr;
        int err;

        if (get_user_ifreq(&ifr, NULL, uifr32))
                return -EFAULT;

        if (get_user(user_ptr32, &uifr32->ifr_settings.ifs_ifsu))
                return -EFAULT;

        /* Swap in the widened pointer for the duration of the ioctl. */
        orig_ptr = ifr.ifr_settings.ifs_ifsu.raw_hdlc;
        ifr.ifr_settings.ifs_ifsu.raw_hdlc = compat_ptr(user_ptr32);

        err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL, NULL);
        if (err)
                return err;

        ifr.ifr_settings.ifs_ifsu.raw_hdlc = orig_ptr;
        if (put_user_ifreq(&ifr, uifr32))
                return -EFAULT;

        return 0;
}

/*
 * Compat path for ioctls that use ifreq::ifr_data and only need the
 * struct ifreq itself converted.
 *
 * Returns -ENOTTY when @cmd is not a socket ioctl command, -EFAULT when
 * the request cannot be copied in, and dev_ioctl()'s result otherwise.
 */
static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd,
                                 struct compat_ifreq __user *u_ifreq32)
{
        void __user *payload;
        struct ifreq req;

        if (!is_socket_ioctl_cmd(cmd))
                return -ENOTTY;

        if (get_user_ifreq(&req, &payload, u_ifreq32))
                return -EFAULT;

        /* Store the pointer obtained by get_user_ifreq() back into the request. */
        req.ifr_data = payload;

        return dev_ioctl(net, cmd, &req, payload, NULL);
}

/*
 * Route a compat socket ioctl to the handler that understands its
 * argument layout.
 *
 * @arg is the raw value from the compat caller; argp is the same value
 * converted with compat_ptr().  Commands not listed here return
 * -ENOIOCTLCMD.
 */
static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
                         unsigned int cmd, unsigned long arg)
{
        void __user *argp = compat_ptr(arg);
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        const struct proto_ops *ops;

        /* The 16 device-private commands go to sock_ioctl() with the converted pointer. */
        if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))
                return sock_ioctl(file, cmd, (unsigned long)argp);

        switch (cmd) {
        case SIOCWANDEV:
                return compat_siocwandev(net, argp);
        case SIOCGSTAMP_OLD:
        case SIOCGSTAMPNS_OLD:
                /* Old-style timestamp queries go to the protocol's gettstamp hook, if any. */
                ops = READ_ONCE(sock->ops);
                if (!ops->gettstamp)
                        return -ENOIOCTLCMD;
                return ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD,
                                      !COMPAT_USE_64BIT_TIME);

        /* Commands whose struct ifreq carries a data pointer needing conversion. */
        case SIOCETHTOOL:
        case SIOCBONDSLAVEINFOQUERY:
        case SIOCBONDINFOQUERY:
        case SIOCSHWTSTAMP:
        case SIOCGHWTSTAMP:
                return compat_ifr_data_ioctl(net, cmd, argp);

        /* Commands forwarded to sock_ioctl() with the argument unchanged. */
        case FIOSETOWN:
        case SIOCSPGRP:
        case FIOGETOWN:
        case SIOCGPGRP:
        case SIOCBRADDBR:
        case SIOCBRDELBR:
        case SIOCGIFVLAN:
        case SIOCSIFVLAN:
        case SIOCGSKNS:
        case SIOCGSTAMP_NEW:
        case SIOCGSTAMPNS_NEW:
        case SIOCGIFCONF:
        case SIOCSIFBR:
        case SIOCGIFBR:
                return sock_ioctl(file, cmd, arg);

        /* Commands forwarded to sock_do_ioctl() with the argument unchanged. */
        case SIOCGIFFLAGS:
        case SIOCSIFFLAGS:
        case SIOCGIFMAP:
        case SIOCSIFMAP:
        case SIOCGIFMETRIC:
        case SIOCSIFMETRIC:
        case SIOCGIFMTU:
        case SIOCSIFMTU:
        case SIOCGIFMEM:
        case SIOCSIFMEM:
        case SIOCGIFHWADDR:
        case SIOCSIFHWADDR:
        case SIOCADDMULTI:
        case SIOCDELMULTI:
        case SIOCGIFINDEX:
        case SIOCGIFADDR:
        case SIOCSIFADDR:
        case SIOCSIFHWBROADCAST:
        case SIOCDIFADDR:
        case SIOCGIFBRDADDR:
        case SIOCSIFBRDADDR:
        case SIOCGIFDSTADDR:
        case SIOCSIFDSTADDR:
        case SIOCGIFNETMASK:
        case SIOCSIFNETMASK:
        case SIOCSIFPFLAGS:
        case SIOCGIFPFLAGS:
        case SIOCGIFTXQLEN:
        case SIOCSIFTXQLEN:
        case SIOCBRADDIF:
        case SIOCBRDELIF:
        case SIOCGIFNAME:
        case SIOCSIFNAME:
        case SIOCGMIIPHY:
        case SIOCGMIIREG:
        case SIOCSMIIREG:
        case SIOCBONDENSLAVE:
        case SIOCBONDRELEASE:
        case SIOCBONDSETHWADDR:
        case SIOCBONDCHANGEACTIVE:
        case SIOCSARP:
        case SIOCGARP:
        case SIOCDARP:
        case SIOCOUTQ:
        case SIOCOUTQNSD:
        case SIOCATMARK:
                return sock_do_ioctl(net, sock, cmd, arg);
        }

        return -ENOIOCTLCMD;
}

/*
 * Compat ioctl entry point for socket files.
 *
 * Handlers are tried in order, each only when the previous one returned
 * -ENOIOCTLCMD: the protocol's own compat_ioctl hook (if present), the
 * wireless-extensions compat handler for commands in the
 * SIOCIWFIRST..SIOCIWLAST range, and finally compat_sock_ioctl_trans().
 */
static long compat_sock_ioctl(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        struct socket *sock = file->private_data;
        const struct proto_ops *ops = READ_ONCE(sock->ops);
        struct net *net = sock_net(sock->sk);
        int ret;

        if (ops->compat_ioctl)
                ret = ops->compat_ioctl(sock, cmd, arg);
        else
                ret = -ENOIOCTLCMD;
        if (ret != -ENOIOCTLCMD)
                return ret;

        if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
                ret = compat_wext_handle_ioctl(net, cmd, arg);
                if (ret != -ENOIOCTLCMD)
                        return ret;
        }

        ret = compat_sock_ioctl_trans(file, sock, cmd, arg);

        return ret;
}
#endif

/**
 *        kernel_bind - bind an address to a socket (kernel space)
 *        @sock: socket
 *        @addr: address
 *        @addrlen: length of address
 *
 *        Returns 0 or an error.
 */

int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
{
        struct sockaddr_storage address;

        memcpy(&address, addr, addrlen);

        return READ_ONCE(sock->ops)->bind(sock, (struct sockaddr *)&address,
                                          addrlen);
}
EXPORT_SYMBOL(kernel_bind);

/**
 *        kernel_listen - move socket to listening state (kernel space)
 *        @sock: socket
 *        @backlog: pending connections queue size
 *
 *        Returns 0 or an error.
 */

int kernel_listen(struct socket *sock, int backlog)
{
        return READ_ONCE(sock->ops)->listen(sock, backlog);
}
EXPORT_SYMBOL(kernel_listen);

/**
 *        kernel_accept - accept a connection (kernel space)
 *        @sock: listening socket
 *        @newsock: new connected socket
 *        @flags: flags
 *
 *        @flags must be SOCK_CLOEXEC, SOCK_NONBLOCK or 0.
 *        A new socket of the listener's family, type and protocol is
 *        created, then the listener's accept operation is run on it.  On
 *        success the new socket takes over the listener's proto_ops and a
 *        reference on the module owning them is taken.
 *        If it fails, @newsock is guaranteed to be %NULL.
 *        Returns 0 or an error.
 */

int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
{
        struct sock *sk = sock->sk;
        const struct proto_ops *ops = READ_ONCE(sock->ops);
        struct proto_accept_arg arg = {
                .flags = flags,
                .kern = true,
        };
        int err;

        err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
                               newsock);
        if (err < 0)
                return err;

        err = ops->accept(sock, *newsock, &arg);
        if (err < 0) {
                /* Drop the half-built socket and clear the caller's pointer. */
                sock_release(*newsock);
                *newsock = NULL;
                return err;
        }

        (*newsock)->ops = ops;
        __module_get(ops->owner);

        return err;
}
EXPORT_SYMBOL(kernel_accept);

/**
 *        kernel_connect - connect a socket (kernel space)
 *        @sock: socket
 *        @addr: address
 *        @addrlen: address length
 *        @flags: flags (O_NONBLOCK, ...)
 *
 *        For datagram sockets, @addr is the address to which datagrams are sent
 *        by default, and the only address from which datagrams are received.
 *        For stream sockets, attempts to connect to @addr.
 *        Returns 0 or an error code.
 */

int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
                   int flags)
{
        struct sockaddr_storage address;

        memcpy(&address, addr, addrlen);

        return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)&address,
                                             addrlen, flags);
}
EXPORT_SYMBOL(kernel_connect);

/**
 *        kernel_getsockname - get the address which the socket is bound (kernel space)
 *        @sock: socket
 *        @addr: address holder
 *
 *         Fills the @addr pointer with the address which the socket is bound.
 *        Returns the length of the address in bytes or an error code.
 */

int kernel_getsockname(struct socket *sock, struct sockaddr *addr)
{
        return READ_ONCE(sock->ops)->getname(sock, addr, 0);
}
EXPORT_SYMBOL(kernel_getsockname);

/**
 *        kernel_getpeername - get the address which the socket is connected (kernel space)
 *        @sock: socket
 *        @addr: address holder
 *
 *         Fills the @addr pointer with the address which the socket is connected.
 *        Returns the length of the address in bytes or an error code.
 */

int kernel_getpeername(struct socket *sock, struct sockaddr *addr)
{
        return READ_ONCE(sock->ops)->getname(sock, addr, 1);
}
EXPORT_SYMBOL(kernel_getpeername);

/**
 *        kernel_sock_shutdown - shut down part of a full-duplex connection (kernel space)
 *        @sock: socket
 *        @how: connection part
 *
 *        Returns 0 or an error.
 */

int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how)
{
        return READ_ONCE(sock->ops)->shutdown(sock, how);
}
EXPORT_SYMBOL(kernel_sock_shutdown);

/**
 *        kernel_sock_ip_overhead - returns the IP overhead imposed by a socket
 *        @sk: socket
 *
 *        Returns the length of the underlying IP header for an IPv4 or
 *        IPv6 socket, plus the length of any IP options configured on the
 *        socket.  Returns 0 for a %NULL @sk or any other address family.
 *        Assumes that the caller has a lock on the socket.
 */

u32 kernel_sock_ip_overhead(struct sock *sk)
{
        u32 hdr_len = 0;
        int family;

        if (!sk)
                return hdr_len;

        family = sk->sk_family;

        if (family == AF_INET) {
                struct inet_sock *inet = inet_sk(sk);
                struct ip_options_rcu *opt;

                hdr_len += sizeof(struct iphdr);
                opt = rcu_dereference_protected(inet->inet_opt,
                                                sock_owned_by_user(sk));
                if (opt)
                        hdr_len += opt->opt.optlen;
                return hdr_len;
        }

#if IS_ENABLED(CONFIG_IPV6)
        if (family == AF_INET6) {
                struct ipv6_pinfo *np = inet6_sk(sk);
                struct ipv6_txoptions *optv6 = NULL;

                hdr_len += sizeof(struct ipv6hdr);
                if (np)
                        optv6 = rcu_dereference_protected(np->opt,
                                                          sock_owned_by_user(sk));
                if (optv6)
                        hdr_len += (optv6->opt_flen + optv6->opt_nflen);
                return hdr_len;
        }
#endif /* IS_ENABLED(CONFIG_IPV6) */

        /* Neither IPv4 nor IPv6: no IP overhead to report. */
        return hdr_len;
}
EXPORT_SYMBOL(kernel_sock_ip_overhead);








































    2 
    2 












    1 




















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIMENS_H
#define _LINUX_TIMENS_H


#include <linux/sched.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/err.h>
#include <linux/time64.h>

struct user_namespace;
extern struct user_namespace init_user_ns;

struct vm_area_struct;

/*
 * Clock offsets of a time namespace.  The timens_add_*() and
 * timens_sub_boottime() helpers in this header apply them to timestamps
 * for the current task.
 */
struct timens_offsets {
        struct timespec64 monotonic;        /* applied by timens_add_monotonic() */
        struct timespec64 boottime;        /* applied by the *_boottime*() helpers */
};

/*
 * A time namespace.  ns.count is the reference count manipulated by
 * get_time_ns() and put_time_ns(); offsets holds the clock offsets that
 * the timens_*() helpers apply.
 */
struct time_namespace {
        struct user_namespace        *user_ns;
        struct ucounts                *ucounts;
        struct ns_common        ns;
        struct timens_offsets        offsets;
        struct page                *vvar_page;
        /* If set prevents changing offsets after any task joined namespace. */
        bool                        frozen_offsets;
} __randomize_layout;

extern struct time_namespace init_time_ns;

#ifdef CONFIG_TIME_NS
extern int vdso_join_timens(struct task_struct *task,
                            struct time_namespace *ns);
extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns);

static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
{
        refcount_inc(&ns->ns.count);
        return ns;
}

struct time_namespace *copy_time_ns(unsigned long flags,
                                    struct user_namespace *user_ns,
                                    struct time_namespace *old_ns);
void free_time_ns(struct time_namespace *ns);
void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk);
struct page *find_timens_vvar_page(struct vm_area_struct *vma);

/* Drop a reference on @ns; free the namespace when the count reaches zero. */
static inline void put_time_ns(struct time_namespace *ns)
{
        if (!refcount_dec_and_test(&ns->ns.count))
                return;

        free_time_ns(ns);
}

void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m);

/* One (clock, offset) pair as passed in arrays to proc_timens_set_offset(). */
struct proc_timens_offset {
        int                        clockid;
        struct timespec64        val;
};

int proc_timens_set_offset(struct file *file, struct task_struct *p,
                           struct proc_timens_offset *offsets, int n);

static inline void timens_add_monotonic(struct timespec64 *ts)
{
        struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets;

        *ts = timespec64_add(*ts, ns_offsets->monotonic);
}

static inline void timens_add_boottime(struct timespec64 *ts)
{
        struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets;

        *ts = timespec64_add(*ts, ns_offsets->boottime);
}

/*
 * Return @nsec plus the current task's time-namespace boottime offset,
 * with the offset converted to nanoseconds.
 */
static inline u64 timens_add_boottime_ns(u64 nsec)
{
        struct time_namespace *ns = current->nsproxy->time_ns;

        return nsec + timespec64_to_ns(&ns->offsets.boottime);
}

static inline void timens_sub_boottime(struct timespec64 *ts)
{
        struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets;

        *ts = timespec64_sub(*ts, ns_offsets->boottime);
}

ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
                                struct timens_offsets *offsets);

/*
 * Convert @tim for clock @clockid using the current task's time namespace.
 * Tasks in the initial time namespace get @tim back unchanged; otherwise
 * the conversion is done by do_timens_ktime_to_host() with the
 * namespace's offsets.
 */
static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim)
{
        struct time_namespace *ns = current->nsproxy->time_ns;

        if (unlikely(ns != &init_time_ns))
                return do_timens_ktime_to_host(clockid, tim, &ns->offsets);

        return tim;
}

#else
/* !CONFIG_TIME_NS stub: does nothing and reports success. */
static inline int vdso_join_timens(struct task_struct *task,
                                   struct time_namespace *ns)
{
        return 0;
}

/* !CONFIG_TIME_NS stub: no-op. */
static inline void timens_commit(struct task_struct *tsk,
                                 struct time_namespace *ns)
{
}

/*
 * !CONFIG_TIME_NS stub: takes no reference and always returns NULL,
 * unlike the CONFIG_TIME_NS version, which returns @ns.
 */
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
{
        return NULL;
}

/* !CONFIG_TIME_NS stub: no-op. */
static inline void put_time_ns(struct time_namespace *ns)
{
}

static inline
struct time_namespace *copy_time_ns(unsigned long flags,
                                    struct user_namespace *user_ns,
                                    struct time_namespace *old_ns)
{
        if (flags & CLONE_NEWTIME)
                return ERR_PTR(-EINVAL);

        return old_ns;
}

/* !CONFIG_TIME_NS stub: no-op. */
static inline void timens_on_fork(struct nsproxy *nsproxy,
                                 struct task_struct *tsk)
{
}

/* !CONFIG_TIME_NS stub: always returns NULL. */
static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma)
{
        return NULL;
}

/* !CONFIG_TIME_NS stubs: no offsets exist, so @ts is left untouched. */
static inline void timens_add_monotonic(struct timespec64 *ts) { }
static inline void timens_add_boottime(struct timespec64 *ts) { }

/* !CONFIG_TIME_NS stub: returns @nsec unchanged. */
static inline u64 timens_add_boottime_ns(u64 nsec)
{
        return nsec;
}

/* !CONFIG_TIME_NS stub: @ts is left untouched. */
static inline void timens_sub_boottime(struct timespec64 *ts) { }

/* !CONFIG_TIME_NS stub: returns @tim unchanged. */
static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim)
{
        return tim;
}
#endif

struct vdso_data *arch_get_vdso_data(void *vvar_page);

#endif /* _LINUX_TIMENS_H */





























































































































































   11 





















































































































    3 
























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_KERNEL_FPU_XSTATE_H
#define __X86_KERNEL_FPU_XSTATE_H

#include <asm/cpufeature.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/xcr.h>

#ifdef CONFIG_X86_64
DECLARE_PER_CPU(u64, xfd_state);
#endif

/*
 * Initialise the xcomp_bv field of an XSAVE header.
 *
 * XRSTORS requires these bits set in xcomp_bv, or it will trigger #GP.
 * The field is written only when X86_FEATURE_XCOMPACTED is enabled;
 * otherwise @xsave is left untouched.
 */
static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask)
{
        if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED))
                return;

        xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT;
}

/*
 * Read the xstate permission mask stored in the FPU state of the current
 * task's group leader.  @guest selects the guest permissions instead of
 * the host ones.
 */
static inline u64 xstate_get_group_perm(bool guest)
{
        struct fpu *leader_fpu = &current->group_leader->thread.fpu;
        struct fpu_state_perm *perm;

        if (guest)
                perm = &leader_fpu->guest_perm;
        else
                perm = &leader_fpu->perm;

        /* Pairs with WRITE_ONCE() in xstate_request_perm() */
        return READ_ONCE(perm->__state_perm);
}

static inline u64 xstate_get_host_group_perm(void)
{
        return xstate_get_group_perm(false);
}

/*
 * Copy mode argument taken by __copy_xstate_to_uabi_buf() and
 * copy_xstate_to_uabi_buf().
 * NOTE(review): presumably FP/FX/XSAVE select how much of the state is
 * copied (legacy FP, FXSAVE area, full XSAVE) -- confirm against the
 * implementation.
 */
enum xstate_copy_mode {
        XSTATE_COPY_FP,
        XSTATE_COPY_FX,
        XSTATE_COPY_XSAVE,
};

struct membuf;
extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
                                      u64 xfeatures, u32 pkru_val,
                                      enum xstate_copy_mode copy_mode);
extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
                                    enum xstate_copy_mode mode);
extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru);
extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf);


extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system_xstate(unsigned int legacy_size);

/*
 * Supervisor xfeatures available to the kernel: the supported supervisor
 * features restricted to fpu_kernel_cfg.max_features.
 */
static inline u64 xfeatures_mask_supervisor(void)
{
        u64 kernel_features = fpu_kernel_cfg.max_features;

        return kernel_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
}

/*
 * The independent xfeatures mask, with the LBR bit removed when
 * X86_FEATURE_ARCH_LBR is not enabled.
 */
static inline u64 xfeatures_mask_independent(void)
{
        u64 mask = XFEATURE_MASK_INDEPENDENT;

        if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR))
                mask &= ~XFEATURE_MASK_LBR;

        return mask;
}

/* XSAVE/XRSTOR wrapper functions */

/*
 * REX_PREFIX expands to the extra leading byte 0x48 (the x86 REX.W
 * prefix) on 64-bit builds and to nothing on 32-bit builds; it is pasted
 * in front of every opcode sequence below.
 */
#ifdef CONFIG_X86_64
#define REX_PREFIX        "0x48, "
#else
#define REX_PREFIX
#endif

/*
 * These macros all use (%edi)/(%rdi) as the single memory argument.
 * Each one is a ".byte" directive string spelling out the instruction
 * encoding, for use inside the asm statements below.
 */
#define XSAVE                ".byte " REX_PREFIX "0x0f,0xae,0x27"
#define XSAVEOPT        ".byte " REX_PREFIX "0x0f,0xae,0x37"
#define XSAVEC                ".byte " REX_PREFIX "0x0f,0xc7,0x27"
#define XSAVES                ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
#define XRSTOR                ".byte " REX_PREFIX "0x0f,0xae,0x2f"
#define XRSTORS                ".byte " REX_PREFIX "0x0f,0xc7,0x1f"

/*
 * Execute one of the byte-encoded XSAVE/XRSTOR instructions above.
 *
 * @op:    instruction string (one of the opcode macros above)
 * @st:    pointer to the state buffer; passed in the "D" register to
 *         match the (%edi)/(%rdi) operand encoded in @op
 * @lmask: value passed in the "a" register
 * @hmask: value passed in the "d" register
 * @err:   output, also bound to the "a" register
 *
 * After this @err contains 0 on success or the trap number when the
 * operation raises an exception.  The exception table entry resumes at
 * label 2, skipping the xor that clears @err on the success path.
 */
#define XSTATE_OP(op, st, lmask, hmask, err)                                \
        asm volatile("1:" op "\n\t"                                        \
                     "xor %[err], %[err]\n"                                \
                     "2:\n\t"                                                \
                     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE)        \
                     : [err] "=a" (err)                                        \
                     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * Save extended state to @st using the best save instruction available.
 *
 * If XSAVES is enabled, it replaces XSAVEC because it supports supervisor
 * states in addition to XSAVEC.
 *
 * Otherwise if XSAVEC is enabled, it replaces XSAVEOPT because it supports
 * compacted storage format in addition to XSAVEOPT.
 *
 * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
 * supports modified optimization which is not supported by XSAVE.
 *
 * We use XSAVE as a fallback.
 *
 * The 661 label is defined in the ALTERNATIVE* macros as the address of the
 * original instruction which gets replaced. We need to use it here as the
 * address of the instruction where we might get an exception at.
 *
 * @st is passed in the "D" register, @lmask in "a" and @hmask in "d".
 * @err is zeroed by the xor on the success path; on an exception the
 * EX_TYPE_EFAULT_REG fixup resumes at label 3 with @err as its register
 * operand.
 */
#define XSTATE_XSAVE(st, lmask, hmask, err)                                \
        asm volatile(ALTERNATIVE_3(XSAVE,                                \
                                   XSAVEOPT, X86_FEATURE_XSAVEOPT,        \
                                   XSAVEC,   X86_FEATURE_XSAVEC,        \
                                   XSAVES,   X86_FEATURE_XSAVES)        \
                     "\n"                                                \
                     "xor %[err], %[err]\n"                                \
                     "3:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(661b, 3b, EX_TYPE_EFAULT_REG, %[err]) \
                     : [err] "=r" (err)                                        \
                     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * Restore xstate from @st.
 *
 * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
 * XSAVE area format. Otherwise plain XRSTOR is used.
 *
 * @st is passed in the "D" register, @lmask/@hmask (low/high 32 bits of the
 * feature mask) in "a"/"d". There is no error output: an exception at the
 * instruction (label 661 from ALTERNATIVE) is routed through the
 * EX_TYPE_FPU_RESTORE exception table entry and execution resumes at label 3.
 */
#define XSTATE_XRESTORE(st, lmask, hmask)                                \
        asm volatile(ALTERNATIVE(XRSTOR,                                \
                                 XRSTORS, X86_FEATURE_XSAVES)                \
                     "\n"                                                \
                     "3:\n"                                                \
                     _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE)        \
                     :                                                        \
                     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * Debug check run by the save/restore helpers before they touch the
 * hardware. Only built as a real function when both CONFIG_X86_64 and
 * CONFIG_X86_DEBUG_FPU are set; otherwise it is an empty inline stub.
 * @rstor is true on the restore paths in this file and false on the save
 * paths.
 */
#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU)
extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor);
#else
static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { }
#endif

#ifdef CONFIG_X86_64
/*
 * Write @xfd to the MSR_IA32_XFD MSR and record the same value in the
 * per-CPU variable xfd_state, which xfd_update_state() compares against to
 * avoid redundant MSR writes.
 */
static inline void xfd_set_state(u64 xfd)
{
        wrmsrl(MSR_IA32_XFD, xfd);
        __this_cpu_write(xfd_state, xfd);
}

/*
 * Make this CPU's XFD value match @fpstate->xfd.
 *
 * Does nothing unless fpu_state_size_dynamic() is true, and skips the MSR
 * write when the cached per-CPU value already equals the wanted one.
 */
static inline void xfd_update_state(struct fpstate *fpstate)
{
        u64 wanted;

        if (!fpu_state_size_dynamic())
                return;

        wanted = fpstate->xfd;
        if (wanted != __this_cpu_read(xfd_state))
                xfd_set_state(wanted);
}

/* Defined outside this header; only declared for CONFIG_X86_64 builds. */
extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu);
#else
/* Without CONFIG_X86_64 the XFD state helpers are empty stubs. */
static inline void xfd_set_state(u64 xfd)
{
}

static inline void xfd_update_state(struct fpstate *fpstate)
{
}

/* Without CONFIG_X86_64 the request is always refused with -EPERM. */
static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu)
{
        return -EPERM;
}
#endif

/*
 * Save processor xstate into the kernel xsave area of @fpstate.
 *
 * The features written are those in @fpstate->xfeatures. The instruction is
 * XSAVE, XSAVEOPT, XSAVEC or XSAVES, picked by the alternatives inside
 * XSTATE_XSAVE according to the CPU feature bits; hence the warning when
 * alternatives have not been patched yet.
 */
static inline void os_xsave(struct fpstate *fpstate)
{
        const u64 features = fpstate->xfeatures;
        u32 lo = features;
        u32 hi = features >> 32;
        int fault;

        WARN_ON_FPU(!alternatives_patched);
        xfd_validate_state(fpstate, features, false);

        XSTATE_XSAVE(&fpstate->regs.xsave, lo, hi, fault);

        /* Saving into a kernel buffer is not expected to fault: */
        WARN_ON_FPU(fault);
}

/*
 * Load the features selected by @mask from the kernel xsave area of
 * @fpstate into the processor.
 *
 * XSTATE_XRESTORE uses XRSTORS when X86_FEATURE_XSAVES is set and XRSTOR
 * otherwise.
 */
static inline void os_xrstor(struct fpstate *fpstate, u64 mask)
{
        const u32 lo = mask;
        const u32 hi = mask >> 32;

        xfd_validate_state(fpstate, mask, true);
        XSTATE_XRESTORE(&fpstate->regs.xsave, lo, hi);
}

/*
 * Restore only the supervisor features, i.e. those reported by
 * xfeatures_mask_supervisor(), from @fpstate. Does not require XFD, so no
 * XFD validation is done here.
 */
static inline void os_xrstor_supervisor(struct fpstate *fpstate)
{
        const u64 supervisor = xfeatures_mask_supervisor();
        const u32 lo = supervisor;
        const u32 hi = supervisor >> 32;

        XSTATE_XRESTORE(&fpstate->regs.xsave, lo, hi);
}

/*
 * Compute which xfeatures have to be written to a signal frame.
 *
 * XSAVE itself always writes every feature it is asked for; the only way to
 * write less is to ask for less. The mask returned here contains
 *
 *   - every feature that is currently in use, and
 *   - every user-supported feature that is not eligible for the sigframe
 *     init optimization (XFEATURE_MASK_SIGFRAME_INITOPT).
 *
 * Features missing from the mask can be left unwritten. This optimization
 * is user-visible, so it is only suitable for states where uninitialized
 * sigframe contents are tolerable, like dynamic features. Consumers of such
 * a buffer must look at XSTATE_BV to learn which features were left out.
 */
static inline u64 xfeatures_need_sigframe_write(void)
{
        /* Features that are written regardless of whether they are in use: */
        const u64 always_written = XFEATURE_MASK_USER_SUPPORTED &
                                   ~XFEATURE_MASK_SIGFRAME_INITOPT;

        /* In-use features must be written as well: */
        return xfeatures_in_use() | always_written;
}

/*
 * Save xstate to the user space xsave area @buf.
 *
 * Plain XSAVE is used on purpose:
 *  - no modified optimization, because xrstor/xrstors might track a
 *    different application;
 *  - no compacted format, for backward compatibility with old applications
 *    which don't understand the compacted layout of the xsave area.
 *
 * The caller has to zero buf::header before calling this because XSAVE*
 * does not touch the reserved fields in the header.
 *
 * Returns 0 on success, otherwise the error value produced by XSTATE_OP.
 */
static inline int xsave_to_user_sigframe(struct xregs_state __user *buf)
{
        struct fpstate *fps = current->thread.fpu.fpstate;
        /*
         * user_xfeatures includes the features which are not
         * xsaved/rstored by the kernel internally, e.g. PKRU. That's user
         * space ABI and also required to allow the signal handler to
         * modify PKRU.
         */
        u64 features = fps->user_xfeatures;
        u32 lo, hi;
        int ret;

        /* With dynamic state sizes, drop features that need no write: */
        if (fpu_state_size_dynamic())
                features &= xfeatures_need_sigframe_write();

        xfd_validate_state(fps, features, false);
        lo = features;
        hi = features >> 32;

        /* User access is opened only around the instruction itself. */
        stac();
        XSTATE_OP(XSAVE, buf, lo, hi, ret);
        clac();

        return ret;
}

/*
 * Restore the features selected by @mask from the user space xsave area
 * @buf, using plain XRSTOR.
 *
 * Returns 0 on success, otherwise the error value produced by XSTATE_OP.
 */
static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask)
{
        /* Strip the __user annotation so the pointer can be an asm operand. */
        struct xregs_state *src = (__force struct xregs_state *)buf;
        const u32 lo = mask;
        const u32 hi = mask >> 32;
        int ret;

        xfd_validate_state(current->thread.fpu.fpstate, mask, true);

        /* User access is opened only around the instruction itself. */
        stac();
        XSTATE_OP(XRSTOR, src, lo, hi, ret);
        clac();

        return ret;
}

/*
 * Restore the features selected by @mask from the kernel xsave area of
 * @fpstate and report a failure through the return value instead of
 * through an exception.
 *
 * Returns 0 on success, otherwise the error value produced by XSTATE_OP.
 */
static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask)
{
        struct xregs_state *area = &fpstate->regs.xsave;
        const u32 lo = mask;
        const u32 hi = mask >> 32;
        int ret;

        /* Bring XFD up to date for @fpstate before restoring. */
        xfd_update_state(fpstate);

        /* XRSTORS is used only when XSAVES is enabled. */
        if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
                XSTATE_OP(XRSTOR, area, lo, hi, ret);
        else
                XSTATE_OP(XRSTORS, area, lo, hi, ret);

        return ret;
}


#endif
















































































































































































































































































































































































































































































































































    3 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 


















    3 









    3 

























































































































































































































































    3 
    3 




























































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/common.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/string_helpers.h>
#include "common.h"

/*
 * String table for operation mode.
 *
 * Indexed by the TOMOYO_CONFIG_* mode value; the array has
 * TOMOYO_CONFIG_MAX_MODE slots.
 */
const char * const tomoyo_mode[TOMOYO_CONFIG_MAX_MODE] = {
        [TOMOYO_CONFIG_DISABLED]   = "disabled",
        [TOMOYO_CONFIG_LEARNING]   = "learning",
        [TOMOYO_CONFIG_PERMISSIVE] = "permissive",
        [TOMOYO_CONFIG_ENFORCING]  = "enforcing"
};

/*
 * String table for /sys/kernel/security/tomoyo/profile
 *
 * The first TOMOYO_MAX_MAC_INDEX slots are indexed by the TOMOYO_MAC_*
 * value of an individual operation. The following
 * TOMOYO_MAX_MAC_CATEGORY_INDEX slots hold the category names and are
 * indexed by TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_*.
 *
 * Note that TOMOYO_MAC_FILE_UMOUNT is spelled "unmount" here.
 */
const char * const tomoyo_mac_keywords[TOMOYO_MAX_MAC_INDEX
                                       + TOMOYO_MAX_MAC_CATEGORY_INDEX] = {
        /* CONFIG::file group */
        [TOMOYO_MAC_FILE_EXECUTE]    = "execute",
        [TOMOYO_MAC_FILE_OPEN]       = "open",
        [TOMOYO_MAC_FILE_CREATE]     = "create",
        [TOMOYO_MAC_FILE_UNLINK]     = "unlink",
        [TOMOYO_MAC_FILE_GETATTR]    = "getattr",
        [TOMOYO_MAC_FILE_MKDIR]      = "mkdir",
        [TOMOYO_MAC_FILE_RMDIR]      = "rmdir",
        [TOMOYO_MAC_FILE_MKFIFO]     = "mkfifo",
        [TOMOYO_MAC_FILE_MKSOCK]     = "mksock",
        [TOMOYO_MAC_FILE_TRUNCATE]   = "truncate",
        [TOMOYO_MAC_FILE_SYMLINK]    = "symlink",
        [TOMOYO_MAC_FILE_MKBLOCK]    = "mkblock",
        [TOMOYO_MAC_FILE_MKCHAR]     = "mkchar",
        [TOMOYO_MAC_FILE_LINK]       = "link",
        [TOMOYO_MAC_FILE_RENAME]     = "rename",
        [TOMOYO_MAC_FILE_CHMOD]      = "chmod",
        [TOMOYO_MAC_FILE_CHOWN]      = "chown",
        [TOMOYO_MAC_FILE_CHGRP]      = "chgrp",
        [TOMOYO_MAC_FILE_IOCTL]      = "ioctl",
        [TOMOYO_MAC_FILE_CHROOT]     = "chroot",
        [TOMOYO_MAC_FILE_MOUNT]      = "mount",
        [TOMOYO_MAC_FILE_UMOUNT]     = "unmount",
        [TOMOYO_MAC_FILE_PIVOT_ROOT] = "pivot_root",
        /* CONFIG::network group */
        [TOMOYO_MAC_NETWORK_INET_STREAM_BIND]       = "inet_stream_bind",
        [TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN]     = "inet_stream_listen",
        [TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT]    = "inet_stream_connect",
        [TOMOYO_MAC_NETWORK_INET_DGRAM_BIND]        = "inet_dgram_bind",
        [TOMOYO_MAC_NETWORK_INET_DGRAM_SEND]        = "inet_dgram_send",
        [TOMOYO_MAC_NETWORK_INET_RAW_BIND]          = "inet_raw_bind",
        [TOMOYO_MAC_NETWORK_INET_RAW_SEND]          = "inet_raw_send",
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND]       = "unix_stream_bind",
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN]     = "unix_stream_listen",
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT]    = "unix_stream_connect",
        [TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND]        = "unix_dgram_bind",
        [TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND]        = "unix_dgram_send",
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND]    = "unix_seqpacket_bind",
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN]  = "unix_seqpacket_listen",
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT] = "unix_seqpacket_connect",
        /* CONFIG::misc group */
        [TOMOYO_MAC_ENVIRON] = "env",
        /* CONFIG group */
        [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_FILE] = "file",
        [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_NETWORK] = "network",
        [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_MISC] = "misc",
};

/*
 * String table for conditions.
 *
 * Indexed by the condition keyword enum value; the array has
 * TOMOYO_MAX_CONDITION_KEYWORD slots. The entries cover task credentials
 * and ids ("task.*"), exec argument counts and path ("exec.*"), file type
 * and mode bit names, the symlink target, and attributes of the first and
 * second path and of their parents ("path1.*", "path2.*").
 */
const char * const tomoyo_condition_keyword[TOMOYO_MAX_CONDITION_KEYWORD] = {
        [TOMOYO_TASK_UID]             = "task.uid",
        [TOMOYO_TASK_EUID]            = "task.euid",
        [TOMOYO_TASK_SUID]            = "task.suid",
        [TOMOYO_TASK_FSUID]           = "task.fsuid",
        [TOMOYO_TASK_GID]             = "task.gid",
        [TOMOYO_TASK_EGID]            = "task.egid",
        [TOMOYO_TASK_SGID]            = "task.sgid",
        [TOMOYO_TASK_FSGID]           = "task.fsgid",
        [TOMOYO_TASK_PID]             = "task.pid",
        [TOMOYO_TASK_PPID]            = "task.ppid",
        [TOMOYO_EXEC_ARGC]            = "exec.argc",
        [TOMOYO_EXEC_ENVC]            = "exec.envc",
        [TOMOYO_TYPE_IS_SOCKET]       = "socket",
        [TOMOYO_TYPE_IS_SYMLINK]      = "symlink",
        [TOMOYO_TYPE_IS_FILE]         = "file",
        [TOMOYO_TYPE_IS_BLOCK_DEV]    = "block",
        [TOMOYO_TYPE_IS_DIRECTORY]    = "directory",
        [TOMOYO_TYPE_IS_CHAR_DEV]     = "char",
        [TOMOYO_TYPE_IS_FIFO]         = "fifo",
        [TOMOYO_MODE_SETUID]          = "setuid",
        [TOMOYO_MODE_SETGID]          = "setgid",
        [TOMOYO_MODE_STICKY]          = "sticky",
        [TOMOYO_MODE_OWNER_READ]      = "owner_read",
        [TOMOYO_MODE_OWNER_WRITE]     = "owner_write",
        [TOMOYO_MODE_OWNER_EXECUTE]   = "owner_execute",
        [TOMOYO_MODE_GROUP_READ]      = "group_read",
        [TOMOYO_MODE_GROUP_WRITE]     = "group_write",
        [TOMOYO_MODE_GROUP_EXECUTE]   = "group_execute",
        [TOMOYO_MODE_OTHERS_READ]     = "others_read",
        [TOMOYO_MODE_OTHERS_WRITE]    = "others_write",
        [TOMOYO_MODE_OTHERS_EXECUTE]  = "others_execute",
        [TOMOYO_EXEC_REALPATH]        = "exec.realpath",
        [TOMOYO_SYMLINK_TARGET]       = "symlink.target",
        [TOMOYO_PATH1_UID]            = "path1.uid",
        [TOMOYO_PATH1_GID]            = "path1.gid",
        [TOMOYO_PATH1_INO]            = "path1.ino",
        [TOMOYO_PATH1_MAJOR]          = "path1.major",
        [TOMOYO_PATH1_MINOR]          = "path1.minor",
        [TOMOYO_PATH1_PERM]           = "path1.perm",
        [TOMOYO_PATH1_TYPE]           = "path1.type",
        [TOMOYO_PATH1_DEV_MAJOR]      = "path1.dev_major",
        [TOMOYO_PATH1_DEV_MINOR]      = "path1.dev_minor",
        [TOMOYO_PATH2_UID]            = "path2.uid",
        [TOMOYO_PATH2_GID]            = "path2.gid",
        [TOMOYO_PATH2_INO]            = "path2.ino",
        [TOMOYO_PATH2_MAJOR]          = "path2.major",
        [TOMOYO_PATH2_MINOR]          = "path2.minor",
        [TOMOYO_PATH2_PERM]           = "path2.perm",
        [TOMOYO_PATH2_TYPE]           = "path2.type",
        [TOMOYO_PATH2_DEV_MAJOR]      = "path2.dev_major",
        [TOMOYO_PATH2_DEV_MINOR]      = "path2.dev_minor",
        [TOMOYO_PATH1_PARENT_UID]     = "path1.parent.uid",
        [TOMOYO_PATH1_PARENT_GID]     = "path1.parent.gid",
        [TOMOYO_PATH1_PARENT_INO]     = "path1.parent.ino",
        [TOMOYO_PATH1_PARENT_PERM]    = "path1.parent.perm",
        [TOMOYO_PATH2_PARENT_UID]     = "path2.parent.uid",
        [TOMOYO_PATH2_PARENT_GID]     = "path2.parent.gid",
        [TOMOYO_PATH2_PARENT_INO]     = "path2.parent.ino",
        [TOMOYO_PATH2_PARENT_PERM]    = "path2.parent.perm",
};

/*
 * String table for PREFERENCE keyword.
 *
 * Indexed by TOMOYO_PREF_*; private to this file.
 */
static const char * const tomoyo_pref_keywords[TOMOYO_MAX_PREF] = {
        [TOMOYO_PREF_MAX_AUDIT_LOG]      = "max_audit_log",
        [TOMOYO_PREF_MAX_LEARNING_ENTRY] = "max_learning_entry",
};

/*
 * String table for path operation.
 *
 * Indexed by the TOMOYO_TYPE_* value of a single-path operation; the array
 * has TOMOYO_MAX_PATH_OPERATION slots. TOMOYO_TYPE_UMOUNT is spelled
 * "unmount".
 */
const char * const tomoyo_path_keyword[TOMOYO_MAX_PATH_OPERATION] = {
        [TOMOYO_TYPE_EXECUTE]    = "execute",
        [TOMOYO_TYPE_READ]       = "read",
        [TOMOYO_TYPE_WRITE]      = "write",
        [TOMOYO_TYPE_APPEND]     = "append",
        [TOMOYO_TYPE_UNLINK]     = "unlink",
        [TOMOYO_TYPE_GETATTR]    = "getattr",
        [TOMOYO_TYPE_RMDIR]      = "rmdir",
        [TOMOYO_TYPE_TRUNCATE]   = "truncate",
        [TOMOYO_TYPE_SYMLINK]    = "symlink",
        [TOMOYO_TYPE_CHROOT]     = "chroot",
        [TOMOYO_TYPE_UMOUNT]     = "unmount",
};

/*
 * String table for socket's operation.
 *
 * Indexed by TOMOYO_NETWORK_*; the array has TOMOYO_MAX_NETWORK_OPERATION
 * slots.
 */
const char * const tomoyo_socket_keyword[TOMOYO_MAX_NETWORK_OPERATION] = {
        [TOMOYO_NETWORK_BIND]    = "bind",
        [TOMOYO_NETWORK_LISTEN]  = "listen",
        [TOMOYO_NETWORK_CONNECT] = "connect",
        [TOMOYO_NETWORK_SEND]    = "send",
};

/*
 * String table for categories.
 *
 * Indexed by TOMOYO_MAC_CATEGORY_*; private to this file. The same three
 * names also appear in the category slots at the end of
 * tomoyo_mac_keywords[].
 */
static const char * const tomoyo_category_keywords
[TOMOYO_MAX_MAC_CATEGORY_INDEX] = {
        [TOMOYO_MAC_CATEGORY_FILE]    = "file",
        [TOMOYO_MAC_CATEGORY_NETWORK] = "network",
        [TOMOYO_MAC_CATEGORY_MISC]    = "misc",
};

/*
 * Permit policy management by non-root user?
 *
 * File-local flag with static storage, so it starts out false.
 */
static bool tomoyo_manage_by_non_root;

/* Utility functions. */

/**
 * tomoyo_addprintf - strncat()-like-snprintf().
 *
 * @buffer: Buffer to write to. Must be '\0'-terminated.
 * @len:    Size of @buffer.
 * @fmt:    The printf()'s format string, followed by parameters.
 *
 * Appends the formatted text after the current contents of @buffer. The
 * output is truncated so that at most @len - 2 characters plus the
 * terminating '\0' end up in @buffer. If @buffer already holds @len - 1 or
 * more characters, nothing is written.
 *
 * Returns nothing.
 */
__printf(3, 4)
static void tomoyo_addprintf(char *buffer, int len, const char *fmt, ...)
{
        va_list args;
        const int pos = strlen(buffer);
        const int room = len - pos - 1;

        /*
         * A non-positive remainder must not reach vsnprintf(): its size
         * parameter is a size_t, so a negative value would turn into a
         * huge bound.
         */
        if (room <= 0)
                return;

        va_start(args, fmt);
        vsnprintf(buffer + pos, room, fmt, args);
        va_end(args);
}

/**
 * tomoyo_flush - Flush queued string to userspace's buffer.
 *
 * @head:   Pointer to "struct tomoyo_io_buffer".
 *
 * Copies the strings queued in @head->r.w[] (oldest first) into
 * @head->read_user_buf, consuming @head->read_user_buf_avail. Progress
 * inside a partially copied string is stored back into @head->r.w[0], so a
 * later call resumes where this one stopped.
 *
 * Returns true if all data was flushed, false otherwise.
 */
static bool tomoyo_flush(struct tomoyo_io_buffer *head)
{
        /* head->r.w_pos is the number of strings still queued. */
        while (head->r.w_pos) {
                const char *w = head->r.w[0];
                size_t len = strlen(w);

                if (len) {
                        /* Copy no more than the user buffer can still take. */
                        if (len > head->read_user_buf_avail)
                                len = head->read_user_buf_avail;
                        /* User buffer is full; data is still pending. */
                        if (!len)
                                return false;
                        if (copy_to_user(head->read_user_buf, w, len))
                                return false;
                        head->read_user_buf_avail -= len;
                        head->read_user_buf += len;
                        w += len;
                }
                /* Remember how far into the front string we have got. */
                head->r.w[0] = w;
                /* Front string was only partly copied. */
                if (*w)
                        return false;
                /* Add '\0' for audit logs and query. */
                if (head->poll) {
                        if (!head->read_user_buf_avail ||
                            copy_to_user(head->read_user_buf, "", 1))
                                return false;
                        head->read_user_buf_avail--;
                        head->read_user_buf++;
                }
                /* Front string is done: drop it and shift the rest down. */
                head->r.w_pos--;
                for (len = 0; len < head->r.w_pos; len++)
                        head->r.w[len] = head->r.w[len + 1];
        }
        /*
         * Nothing is queued any more, so reset the offset at which
         * tomoyo_io_printf() writes into head->read_buf.
         */
        head->r.avail = 0;
        return true;
}

/**
 * tomoyo_set_string - Queue string to "struct tomoyo_io_buffer" structure.
 *
 * @head:   Pointer to "struct tomoyo_io_buffer".
 * @string: String to print.
 *
 * The pointer itself is queued, not a copy, so @string has to be kept valid
 * until @head is kfree()d. This means that char[] allocated on stack memory
 * cannot be passed to this function. Use tomoyo_io_printf() for char[]
 * allocated on stack memory.
 *
 * If the queue is already full, a warning is raised and @string is dropped.
 * Otherwise the queue is flushed immediately after @string is added.
 */
static void tomoyo_set_string(struct tomoyo_io_buffer *head, const char *string)
{
        if (head->r.w_pos >= TOMOYO_MAX_IO_READ_QUEUE) {
                WARN_ON(1);
                return;
        }
        head->r.w[head->r.w_pos++] = string;
        tomoyo_flush(head);
}

static void tomoyo_io_printf(struct tomoyo_io_buffer *head, const char *fmt,
                             ...) __printf(2, 3);

/**
 * tomoyo_io_printf - printf() to "struct tomoyo_io_buffer" structure.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @fmt:  The printf()'s format string, followed by parameters.
 *
 * The text is formatted into @head->read_buf at offset @head->r.avail and
 * then queued with tomoyo_set_string(). If no room is left nothing is
 * done; if the formatted text (including its terminator) does not fit, a
 * warning is raised and the text is not queued.
 */
static void tomoyo_io_printf(struct tomoyo_io_buffer *head, const char *fmt,
                             ...)
{
        size_t start = head->r.avail;
        int room = head->readbuf_size - start;
        size_t written;
        va_list ap;

        if (room <= 0)
                return;
        va_start(ap, fmt);
        /* "+ 1" accounts for the terminating '\0'. */
        written = vsnprintf(head->read_buf + start, room, fmt, ap) + 1;
        va_end(ap);
        if (start + written >= head->readbuf_size) {
                WARN_ON(1);
                return;
        }
        head->r.avail += written;
        tomoyo_set_string(head, head->read_buf + start);
}

/**
 * tomoyo_set_space - Put a space to "struct tomoyo_io_buffer" structure.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Queues the string literal " " via tomoyo_set_string().
 *
 * Returns nothing.
 */
static void tomoyo_set_space(struct tomoyo_io_buffer *head)
{
        tomoyo_set_string(head, " ");
}

/**
 * tomoyo_set_lf - Put a line feed to "struct tomoyo_io_buffer" structure.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Queues the string literal "\n" via tomoyo_set_string().
 *
 * Returns true if no string is left in the queue afterwards
 * (@head->r.w_pos is 0), false if some data is still pending.
 */
static bool tomoyo_set_lf(struct tomoyo_io_buffer *head)
{
        tomoyo_set_string(head, "\n");
        return !head->r.w_pos;
}

/**
 * tomoyo_set_slash - Put a slash to "struct tomoyo_io_buffer" structure.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Queues the string literal "/" via tomoyo_set_string().
 *
 * Returns nothing.
 */
static void tomoyo_set_slash(struct tomoyo_io_buffer *head)
{
        tomoyo_set_string(head, "/");
}

/* List of namespaces. Entries are added by tomoyo_init_policy_namespace(). */
LIST_HEAD(tomoyo_namespace_list);
/*
 * True if namespace other than tomoyo_kernel_namespace is defined.
 *
 * Recomputed by tomoyo_init_policy_namespace() as "the list was already
 * non-empty before this namespace was added". While false,
 * tomoyo_print_namespace() prints nothing.
 */
static bool tomoyo_namespace_enabled;

/**
 * tomoyo_init_policy_namespace - Initialize namespace.
 *
 * @ns: Pointer to "struct tomoyo_policy_namespace".
 *
 * Sets the profile version, empties every list held by @ns, and links @ns
 * at the tail of tomoyo_namespace_list.
 *
 * Returns nothing.
 */
void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns)
{
        unsigned int i;

        ns->profile_version = 20150505;
        for (i = 0; i < TOMOYO_MAX_POLICY; i++)
                INIT_LIST_HEAD(&ns->policy_list[i]);
        for (i = 0; i < TOMOYO_MAX_GROUP; i++)
                INIT_LIST_HEAD(&ns->group_list[i]);
        for (i = 0; i < TOMOYO_MAX_ACL_GROUPS; i++)
                INIT_LIST_HEAD(&ns->acl_group[i]);
        /*
         * Checked before @ns is linked, so the first namespace to be
         * initialized leaves the flag false.
         */
        tomoyo_namespace_enabled = !list_empty(&tomoyo_namespace_list);
        list_add_tail_rcu(&ns->namespace_list, &tomoyo_namespace_list);
}

/**
 * tomoyo_print_namespace - Print namespace header.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns nothing.
 */
static void tomoyo_print_namespace(struct tomoyo_io_buffer *head)
{
        if (!tomoyo_namespace_enabled)
                return;
        tomoyo_set_string(head,
                          container_of(head->r.ns,
                                       struct tomoyo_policy_namespace,
                                       namespace_list)->name);
        tomoyo_set_space(head);
}

/**
 * tomoyo_print_name_union - Print a tomoyo_name_union.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @ptr:  Pointer to "struct tomoyo_name_union".
 *
 * Prints a leading space, then either "@" plus the group name (when @ptr
 * refers to a group) or the file name.
 */
static void tomoyo_print_name_union(struct tomoyo_io_buffer *head,
                                    const struct tomoyo_name_union *ptr)
{
        tomoyo_set_space(head);
        if (!ptr->group) {
                tomoyo_set_string(head, ptr->filename->name);
                return;
        }
        tomoyo_set_string(head, "@");
        tomoyo_set_string(head, ptr->group->group_name->name);
}

/**
 * tomoyo_print_name_union_quoted - Print a tomoyo_name_union with a quote.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @ptr:  Pointer to "struct tomoyo_name_union".
 *
 * A group is printed as "@" plus the group name, without quotes. A file
 * name is printed enclosed in double quotes. No leading space is printed.
 *
 * Returns nothing.
 */
static void tomoyo_print_name_union_quoted(struct tomoyo_io_buffer *head,
                                           const struct tomoyo_name_union *ptr)
{
        if (ptr->group) {
                tomoyo_set_string(head, "@");
                tomoyo_set_string(head, ptr->group->group_name->name);
                return;
        }
        tomoyo_set_string(head, "\"");
        tomoyo_set_string(head, ptr->filename->name);
        tomoyo_set_string(head, "\"");
}

/**
 * tomoyo_print_number_union_nospace - Print a tomoyo_number_union without a space.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @ptr:  Pointer to "struct tomoyo_number_union".
 *
 * A group is printed as "@" plus the group name. Otherwise the lower bound
 * is printed in its recorded radix (hexadecimal, octal or decimal); when the
 * upper bound differs in value or radix, "-" and the upper bound follow.
 *
 * Returns nothing.
 */
static void tomoyo_print_number_union_nospace
(struct tomoyo_io_buffer *head, const struct tomoyo_number_union *ptr)
{
        char text[128];
        unsigned long value;
        unsigned long upper;
        u8 type;
        u8 upper_type;
        int pass;

        if (ptr->group) {
                tomoyo_set_string(head, "@");
                tomoyo_set_string(head, ptr->group->group_name->name);
                return;
        }
        value = ptr->values[0];
        upper = ptr->values[1];
        type = ptr->value_type[0];
        upper_type = ptr->value_type[1];
        text[0] = '\0';
        /* First pass prints the lower bound, second pass the upper one. */
        for (pass = 0; pass < 2; pass++) {
                if (type == TOMOYO_VALUE_TYPE_HEXADECIMAL)
                        tomoyo_addprintf(text, sizeof(text), "0x%lX", value);
                else if (type == TOMOYO_VALUE_TYPE_OCTAL)
                        tomoyo_addprintf(text, sizeof(text), "0%lo", value);
                else
                        tomoyo_addprintf(text, sizeof(text), "%lu", value);
                /* A single value (or the upper bound) ends the output. */
                if (value == upper && type == upper_type)
                        break;
                tomoyo_addprintf(text, sizeof(text), "-");
                type = upper_type;
                value = upper;
        }
        /* Copy out of the stack buffer; it cannot be queued directly. */
        tomoyo_io_printf(head, "%s", text);
}

/**
 * tomoyo_print_number_union - Print a tomoyo_number_union.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @ptr:  Pointer to "struct tomoyo_number_union".
 *
 * Prints a leading space and then the same text as
 * tomoyo_print_number_union_nospace().
 *
 * Returns nothing.
 */
static void tomoyo_print_number_union(struct tomoyo_io_buffer *head,
                                      const struct tomoyo_number_union *ptr)
{
        tomoyo_set_space(head);
        tomoyo_print_number_union_nospace(head, ptr);
}

/**
 * tomoyo_assign_profile - Create a new profile.
 *
 * @ns:      Pointer to "struct tomoyo_policy_namespace".
 * @profile: Profile number to create.
 *
 * If the profile already exists it is returned as is. Otherwise a new one
 * is allocated, filled with default settings and installed into
 * @ns->profile_ptr[] while holding tomoyo_policy_lock.
 *
 * Returns pointer to "struct tomoyo_profile" on success, NULL otherwise
 * (@profile out of range, lock acquisition interrupted, or the new entry was
 * rejected by tomoyo_memory_ok() and nobody else installed one).
 */
static struct tomoyo_profile *tomoyo_assign_profile
(struct tomoyo_policy_namespace *ns, const unsigned int profile)
{
        struct tomoyo_profile *ptr;
        struct tomoyo_profile *entry;

        if (profile >= TOMOYO_MAX_PROFILES)
                return NULL;
        /* Fast path: the profile already exists, no lock needed. */
        ptr = ns->profile_ptr[profile];
        if (ptr)
                return ptr;
        /*
         * Allocate before taking the lock. The result is not checked here;
         * it is handed to tomoyo_memory_ok() below.
         * NOTE(review): presumably tomoyo_memory_ok() rejects NULL - confirm.
         */
        entry = kzalloc(sizeof(*entry), GFP_NOFS | __GFP_NOWARN);
        /* Interrupted: ptr is still NULL here, so NULL is returned. */
        if (mutex_lock_interruptible(&tomoyo_policy_lock))
                goto out;
        /* Re-check under the lock; someone may have installed it meanwhile. */
        ptr = ns->profile_ptr[profile];
        if (!ptr && tomoyo_memory_ok(entry)) {
                ptr = entry;
                ptr->default_config = TOMOYO_CONFIG_DISABLED |
                        TOMOYO_CONFIG_WANT_GRANT_LOG |
                        TOMOYO_CONFIG_WANT_REJECT_LOG;
                /* Every individual entry starts as "use the default". */
                memset(ptr->config, TOMOYO_CONFIG_USE_DEFAULT,
                       sizeof(ptr->config));
                ptr->pref[TOMOYO_PREF_MAX_AUDIT_LOG] =
                        CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG;
                ptr->pref[TOMOYO_PREF_MAX_LEARNING_ENTRY] =
                        CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY;
                mb(); /* Avoid out-of-order execution. */
                ns->profile_ptr[profile] = ptr;
                /* Ownership moved to @ns; keep kfree() below from freeing it. */
                entry = NULL;
        }
        mutex_unlock(&tomoyo_policy_lock);
 out:
        /* Frees the unused allocation, if any. */
        kfree(entry);
        return ptr;
}

/**
 * tomoyo_profile - Find a profile.
 *
 * @ns:      Pointer to "struct tomoyo_policy_namespace".
 * @profile: Profile number to find.
 *
 * Never returns NULL: when @ns has no profile with that number, a pointer
 * to a function-local, zero-initialized placeholder profile is returned.
 *
 * Returns pointer to "struct tomoyo_profile".
 */
struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns,
                                      const u8 profile)
{
        static struct tomoyo_profile tomoyo_null_profile;
        struct tomoyo_profile *found = ns->profile_ptr[profile];

        return found ? found : &tomoyo_null_profile;
}

/**
 * tomoyo_find_yesno - Find values for specified keyword.
 *
 * @string: String to check.
 * @find:   Name of keyword.
 *
 * Only the first occurrence of @find in @string is examined.
 *
 * Returns 1 if "@find=yes" was found, 0 if "@find=no" was found, -1 otherwise.
 */
static s8 tomoyo_find_yesno(const char *string, const char *find)
{
        const char *value = strstr(string, find);

        if (!value)
                return -1;
        /* Step over the keyword; what follows should be "=yes" or "=no". */
        value += strlen(find);
        if (strncmp(value, "=yes", 4) == 0)
                return 1;
        if (strncmp(value, "=no", 3) == 0)
                return 0;
        return -1;
}

/**
 * tomoyo_set_uint - Set value for specified preference.
 *
 * @i:      Pointer to "unsigned int".
 * @string: String to check.
 * @find:   Name of keyword.
 *
 * Looks for the first occurrence of @find in @string and, if it is followed
 * by "=<number>", stores that number into @i. @i is left untouched when
 * @find is absent or is not followed by "=<number>".
 *
 * Returns nothing.
 */
static void tomoyo_set_uint(unsigned int *i, const char *string,
                            const char *find)
{
        const char *hit = strstr(string, find);

        if (!hit)
                return;
        hit += strlen(find);
        sscanf(hit, "=%u", i);
}

/**
 * tomoyo_set_mode - Set mode for specified profile.
 *
 * @name:    Name of functionality.
 * @value:   Mode for @name.
 * @profile: Pointer to "struct tomoyo_profile".
 *
 * @name is either exactly "CONFIG" (the profile's default setting) or
 * starts with "CONFIG::" and names one entry of @profile->config[].
 * @value is searched for "use_default", for a mode name from tomoyo_mode[]
 * and for "grant_log=yes|no" / "reject_log=yes|no".
 *
 * Returns 0 on success, negative value otherwise (-EINVAL when @name is not
 * recognized).
 */
static int tomoyo_set_mode(char *name, const char *value,
                           struct tomoyo_profile *profile)
{
        u8 i;
        u8 config;

        if (!strcmp(name, "CONFIG")) {
                /*
                 * Use the out-of-range index as a marker meaning "the
                 * default config" for the store at the end.
                 */
                i = TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX;
                config = profile->default_config;
        } else if (tomoyo_str_starts(&name, "CONFIG::")) {
                /*
                 * NOTE(review): tomoyo_str_starts() presumably advances
                 * @name past the matched prefix - confirm.
                 */
                config = 0;
                for (i = 0; i < TOMOYO_MAX_MAC_INDEX
                             + TOMOYO_MAX_MAC_CATEGORY_INDEX; i++) {
                        int len = 0;

                        /*
                         * Indexes below TOMOYO_MAX_MAC_INDEX must be written
                         * as "<category>::<keyword>"; the remaining ones are
                         * matched by keyword alone (len stays 0).
                         */
                        if (i < TOMOYO_MAX_MAC_INDEX) {
                                const u8 c = tomoyo_index2category[i];
                                const char *category =
                                        tomoyo_category_keywords[c];

                                len = strlen(category);
                                if (strncmp(name, category, len) ||
                                    name[len++] != ':' || name[len++] != ':')
                                        continue;
                        }
                        if (strcmp(name + len, tomoyo_mac_keywords[i]))
                                continue;
                        config = profile->config[i];
                        break;
                }
                /* Loop ran to the end: no entry matched @name. */
                if (i == TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX)
                        return -EINVAL;
        } else {
                return -EINVAL;
        }
        if (strstr(value, "use_default")) {
                config = TOMOYO_CONFIG_USE_DEFAULT;
        } else {
                u8 mode;

                /* No break: if several mode names occur, the last one wins. */
                for (mode = 0; mode < 4; mode++)
                        if (strstr(value, tomoyo_mode[mode]))
                                /*
                                 * Update lower 3 bits in order to distinguish
                                 * 'config' from 'TOMOYO_CONFIG_USE_DEFAULT'.
                                 */
                                config = (config & ~7) | mode;
                /* The log flags are only touched once a mode is in effect. */
                if (config != TOMOYO_CONFIG_USE_DEFAULT) {
                        switch (tomoyo_find_yesno(value, "grant_log")) {
                        case 1:
                                config |= TOMOYO_CONFIG_WANT_GRANT_LOG;
                                break;
                        case 0:
                                config &= ~TOMOYO_CONFIG_WANT_GRANT_LOG;
                                break;
                        }
                        switch (tomoyo_find_yesno(value, "reject_log")) {
                        case 1:
                                config |= TOMOYO_CONFIG_WANT_REJECT_LOG;
                                break;
                        case 0:
                                config &= ~TOMOYO_CONFIG_WANT_REJECT_LOG;
                                break;
                        }
                }
        }
        /*
         * Store the result. The default config is never set to
         * TOMOYO_CONFIG_USE_DEFAULT; such a request is silently ignored.
         */
        if (i < TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX)
                profile->config[i] = config;
        else if (config != TOMOYO_CONFIG_USE_DEFAULT)
                profile->default_config = config;
        return 0;
}

/**
 * tomoyo_write_profile - Write profile table.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Parses one line from @head->write_buf. Accepted forms are
 * "PROFILE_VERSION=<number>" and "<profile>-<name>=<value>", where <name> is
 * "COMMENT", "PREFERENCE" or a name understood by tomoyo_set_mode().
 * The buffer is modified in place (the first '=' after the profile number is
 * replaced by '\0').
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_write_profile(struct tomoyo_io_buffer *head)
{
        char *data = head->write_buf;
        unsigned int i;
        char *cp;
        struct tomoyo_profile *profile;

        if (sscanf(data, "PROFILE_VERSION=%u", &head->w.ns->profile_version)
            == 1)
                return 0;
        /* Leading decimal profile number, which must be followed by '-'. */
        i = simple_strtoul(data, &cp, 10);
        if (*cp != '-')
                return -EINVAL;
        data = cp + 1;
        /* Finds the profile, creating it if it does not exist yet. */
        profile = tomoyo_assign_profile(head->w.ns, i);
        if (!profile)
                return -EINVAL;
        /* Split "<name>=<value>": data becomes the name, cp the value. */
        cp = strchr(data, '=');
        if (!cp)
                return -EINVAL;
        *cp++ = '\0';
        if (!strcmp(data, "COMMENT")) {
                static DEFINE_SPINLOCK(lock);
                const struct tomoyo_path_info *new_comment
                        = tomoyo_get_name(cp);
                const struct tomoyo_path_info *old_comment;

                if (!new_comment)
                        return -ENOMEM;
                /* Swap the pointers under the lock, release the old one after. */
                spin_lock(&lock);
                old_comment = profile->comment;
                profile->comment = new_comment;
                spin_unlock(&lock);
                tomoyo_put_name(old_comment);
                return 0;
        }
        if (!strcmp(data, "PREFERENCE")) {
                /* Update every preference whose keyword occurs in the value. */
                for (i = 0; i < TOMOYO_MAX_PREF; i++)
                        tomoyo_set_uint(&profile->pref[i], cp,
                                        tomoyo_pref_keywords[i]);
                return 0;
        }
        return tomoyo_set_mode(data, cp, profile);
}

/**
 * tomoyo_print_config - Print mode for specified functionality.
 *
 * @head:   Pointer to "struct tomoyo_io_buffer".
 * @config: Mode for that functionality.
 *
 * Prints the "={ mode=... grant_log=... reject_log=... }" part of a
 * configuration line, including the trailing line feed. The mode name is
 * selected by the lower two bits of @config.
 *
 * Returns nothing.
 *
 * Caller prints functionality's name.
 */
static void tomoyo_print_config(struct tomoyo_io_buffer *head, const u8 config)
{
        const char *mode = tomoyo_mode[config & 3];
        const char *grant = str_yes_no(config & TOMOYO_CONFIG_WANT_GRANT_LOG);
        const char *reject = str_yes_no(config & TOMOYO_CONFIG_WANT_REJECT_LOG);

        tomoyo_io_printf(head, "={ mode=%s grant_log=%s reject_log=%s }\n",
                         mode, grant, reject);
}

/**
 * tomoyo_read_profile - Read profile table.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Resumable printer driven by @head->r.step:
 *   0 - print the "PROFILE_VERSION=" line;
 *   1 - advance @head->r.index to the next existing profile, or finish;
 *   2 - print the COMMENT and PREFERENCE lines of that profile;
 *   3 - print the default "CONFIG" line and reset @head->r.bit;
 *   4 - print one non-default "CONFIG::" line per pass, then go back to
 *       step 1 for the next profile.
 * After each step the output is flushed; the function returns as soon as the
 * flush cannot complete and continues from the saved state on the next call.
 *
 * Returns nothing.
 */
static void tomoyo_read_profile(struct tomoyo_io_buffer *head)
{
        u8 index;
        struct tomoyo_policy_namespace *ns =
                container_of(head->r.ns, typeof(*ns), namespace_list);
        const struct tomoyo_profile *profile;

        if (head->r.eof)
                return;
 next:
        /* Reload the cursor on every pass; steps 1 and 4 change it. */
        index = head->r.index;
        profile = ns->profile_ptr[index];
        switch (head->r.step) {
        case 0:
                tomoyo_print_namespace(head);
                tomoyo_io_printf(head, "PROFILE_VERSION=%u\n",
                                 ns->profile_version);
                head->r.step++;
                break;
        case 1:
                /* Skip profile numbers that have no profile assigned. */
                for ( ; head->r.index < TOMOYO_MAX_PROFILES;
                      head->r.index++)
                        if (ns->profile_ptr[head->r.index])
                                break;
                if (head->r.index == TOMOYO_MAX_PROFILES) {
                        head->r.eof = true;
                        return;
                }
                head->r.step++;
                break;
        case 2:
                {
                        u8 i;
                        const struct tomoyo_path_info *comment =
                                profile->comment;

                        tomoyo_print_namespace(head);
                        tomoyo_io_printf(head, "%u-COMMENT=", index);
                        /* A profile without a comment prints an empty value. */
                        tomoyo_set_string(head, comment ? comment->name : "");
                        tomoyo_set_lf(head);
                        tomoyo_print_namespace(head);
                        tomoyo_io_printf(head, "%u-PREFERENCE={ ", index);
                        for (i = 0; i < TOMOYO_MAX_PREF; i++)
                                tomoyo_io_printf(head, "%s=%u ",
                                                 tomoyo_pref_keywords[i],
                                                 profile->pref[i]);
                        tomoyo_set_string(head, "}\n");
                        head->r.step++;
                }
                break;
        case 3:
                {
                        tomoyo_print_namespace(head);
                        tomoyo_io_printf(head, "%u-%s", index, "CONFIG");
                        tomoyo_print_config(head, profile->default_config);
                        head->r.bit = 0;
                        head->r.step++;
                }
                break;
        case 4:
                /* Print at most one entry per pass, then flush below. */
                for ( ; head->r.bit < TOMOYO_MAX_MAC_INDEX
                              + TOMOYO_MAX_MAC_CATEGORY_INDEX; head->r.bit++) {
                        const u8 i = head->r.bit;
                        const u8 config = profile->config[i];

                        /* Entries that follow the default are not listed. */
                        if (config == TOMOYO_CONFIG_USE_DEFAULT)
                                continue;
                        tomoyo_print_namespace(head);
                        if (i < TOMOYO_MAX_MAC_INDEX)
                                tomoyo_io_printf(head, "%u-CONFIG::%s::%s",
                                                 index,
                                                 tomoyo_category_keywords
                                                 [tomoyo_index2category[i]],
                                                 tomoyo_mac_keywords[i]);
                        else
                                tomoyo_io_printf(head, "%u-CONFIG::%s", index,
                                                 tomoyo_mac_keywords[i]);
                        tomoyo_print_config(head, config);
                        /* "break" skips the loop increment, so step past i here. */
                        head->r.bit++;
                        break;
                }
                /* All entries of this profile done: move to the next profile. */
                if (head->r.bit == TOMOYO_MAX_MAC_INDEX
                    + TOMOYO_MAX_MAC_CATEGORY_INDEX) {
                        head->r.index++;
                        head->r.step = 1;
                }
                break;
        }
        /* Continue only while everything queued so far could be delivered. */
        if (tomoyo_flush(head))
                goto next;
}

/**
 * tomoyo_same_manager - Check for duplicated "struct tomoyo_manager" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_head".
 * @b: Pointer to "struct tomoyo_acl_head".
 *
 * Returns true if @a == @b, false otherwise.
 */
static bool tomoyo_same_manager(const struct tomoyo_acl_head *a,
                                const struct tomoyo_acl_head *b)
{
        return container_of(a, struct tomoyo_manager, head)->manager ==
                container_of(b, struct tomoyo_manager, head)->manager;
}

/**
 * tomoyo_update_manager_entry - Add a manager entry.
 *
 * @manager:   The path to manager or the domainname.
 * @is_delete: True if it is a delete request.
 *
 * The entry is added to (or deleted from) the manager list of
 * tomoyo_kernel_namespace.
 *
 * Returns 0 on success, negative value otherwise: -EINVAL when @manager is
 * neither a correct domainname nor a correct word; when the name cannot be
 * obtained, -ENOENT for a delete request and -ENOMEM otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_update_manager_entry(const char *manager,
                                       const bool is_delete)
{
        struct tomoyo_manager e = { };
        struct tomoyo_acl_param param = {
                /* .ns = &tomoyo_kernel_namespace, */
                .is_delete = is_delete,
                .list = &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER],
        };
        int error;

        if (!(tomoyo_correct_domain(manager) || tomoyo_correct_word(manager)))
                return -EINVAL;
        e.manager = tomoyo_get_name(manager);
        if (!e.manager)
                return is_delete ? -ENOENT : -ENOMEM;
        error = tomoyo_update_policy(&e.head, sizeof(e), &param,
                                     tomoyo_same_manager);
        /* Drop the reference taken by tomoyo_get_name() above. */
        tomoyo_put_name(e.manager);
        return error;
}

/**
 * tomoyo_write_manager - Write manager policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * The line "manage_by_non_root" switches tomoyo_manage_by_non_root on (or
 * off for a delete request). Any other line is treated as a manager entry.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_write_manager(struct tomoyo_io_buffer *head)
{
        char *line = head->write_buf;

        if (strcmp(line, "manage_by_non_root") != 0)
                return tomoyo_update_manager_entry(line, head->w.is_delete);
        tomoyo_manage_by_non_root = !head->w.is_delete;
        return 0;
}

/**
 * tomoyo_read_manager - Read manager policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Prints one line per non-deleted entry of the manager list of
 * tomoyo_kernel_namespace, and sets @head->r.eof once the whole list has
 * been walked.
 *
 * Caller holds tomoyo_read_lock().
 */
static void tomoyo_read_manager(struct tomoyo_io_buffer *head)
{
        if (head->r.eof)
                return;
        /*
         * NOTE(review): list_for_each_cookie() presumably resumes from the
         * position saved in head->r.acl - confirm against its definition.
         */
        list_for_each_cookie(head->r.acl, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER]) {
                struct tomoyo_manager *ptr =
                        list_entry(head->r.acl, typeof(*ptr), head.list);

                if (ptr->head.is_deleted)
                        continue;
                /* Stop here if earlier output is still pending. */
                if (!tomoyo_flush(head))
                        return;
                tomoyo_set_string(head, ptr->manager->name);
                tomoyo_set_lf(head);
        }
        head->r.eof = true;
}

/**
 * tomoyo_manager - Check whether the current process is a policy manager.
 *
 * A process qualifies when its domainname or its executable path matches a
 * non-deleted entry of the manager list. Before the policy is loaded every
 * process qualifies. Unless tomoyo_manage_by_non_root is set, both uid and
 * euid must be GLOBAL_ROOT_UID.
 *
 * Returns true if the current process is permitted to modify policy
 * via /sys/kernel/security/tomoyo/ interface.
 *
 * Caller holds tomoyo_read_lock().
 */
static bool tomoyo_manager(void)
{
        struct tomoyo_manager *ptr;
        const char *exe;
        const struct task_struct *task = current;
        const struct tomoyo_path_info *domainname = tomoyo_domain()->domainname;
        /*
         * With CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING enabled the
         * list lookup below cannot fail, because "found" starts as true.
         */
        bool found = IS_ENABLED(CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING);

        if (!tomoyo_policy_loaded)
                return true;
        if (!tomoyo_manage_by_non_root &&
            (!uid_eq(task->cred->uid,  GLOBAL_ROOT_UID) ||
             !uid_eq(task->cred->euid, GLOBAL_ROOT_UID)))
                return false;
        /* The returned string is released with kfree() at the end. */
        exe = tomoyo_get_exe();
        if (!exe)
                return false;
        list_for_each_entry_rcu(ptr, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER], head.list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                /* An entry may name either a domain or an executable path. */
                if (!ptr->head.is_deleted &&
                    (!tomoyo_pathcmp(domainname, ptr->manager) ||
                     !strcmp(exe, ptr->manager->name))) {
                        found = true;
                        break;
                }
        }
        if (!found) { /* Reduce error messages. */
                /* Only warn when the pid differs from the last one warned about. */
                static pid_t last_pid;
                const pid_t pid = current->pid;

                if (last_pid != pid) {
                        pr_warn("%s ( %s ) is not permitted to update policies.\n",
                                domainname->name, exe);
                        last_pid = pid;
                }
        }
        kfree(exe);
        return found;
}

/*
 * Forward declaration: tomoyo_select_domain() below calls this to resolve
 * a "select Q=<number>" request; the definition appears later in the file.
 */
static struct tomoyo_domain_info *tomoyo_find_domain_by_qid
(unsigned int serial);

/**
 * tomoyo_select_domain - Parse select command.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @data: String to parse.
 *
 * Recognized forms are "select pid=<number>", "select global-pid=<number>",
 * "select domain=<domainname>" and "select Q=<number>". The domain found
 * (possibly NULL) becomes @head->w.domain, and, if @head has a read buffer,
 * the read state is reset so that only this domain is printed.
 *
 * Returns true on success, false otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static bool tomoyo_select_domain(struct tomoyo_io_buffer *head,
                                 const char *data)
{
        unsigned int pid;
        struct tomoyo_domain_info *domain = NULL;
        bool global_pid = false;

        if (strncmp(data, "select ", 7))
                return false;
        data += 7;
        /*
         * The comma expression sets global_pid only after "pid=" failed to
         * match, i.e. right before "global-pid=" is tried.
         */
        if (sscanf(data, "pid=%u", &pid) == 1 ||
            (global_pid = true, sscanf(data, "global-pid=%u", &pid) == 1)) {
                struct task_struct *p;

                /* Task lookup and the domain read happen under RCU. */
                rcu_read_lock();
                if (global_pid)
                        p = find_task_by_pid_ns(pid, &init_pid_ns);
                else
                        p = find_task_by_vpid(pid);
                if (p)
                        domain = tomoyo_task(p)->domain_info;
                rcu_read_unlock();
        } else if (!strncmp(data, "domain=", 7)) {
                if (tomoyo_domain_def(data + 7))
                        domain = tomoyo_find_domain(data + 7);
        } else if (sscanf(data, "Q=%u", &pid) == 1) {
                /* Here "pid" holds the number given after "Q=". */
                domain = tomoyo_find_domain_by_qid(pid);
        } else
                return false;
        head->w.domain = domain;
        /* Accessing read_buf is safe because head->io_sem is held. */
        if (!head->read_buf)
                return true; /* Do nothing if open(O_WRONLY). */
        /* Restart reading from scratch, limited to the selected domain. */
        memset(&head->r, 0, sizeof(head->r));
        head->r.print_this_domain_only = true;
        if (domain)
                head->r.domain = &domain->list;
        else
                head->r.eof = true;
        tomoyo_io_printf(head, "# select %s\n", data);
        if (domain && domain->is_deleted)
                tomoyo_io_printf(head, "# This is a deleted domain.\n");
        return true;
}

/**
 * tomoyo_same_task_acl - Check for duplicated "struct tomoyo_task_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b, false otherwise.
 */
static bool tomoyo_same_task_acl(const struct tomoyo_acl_info *a,
                                 const struct tomoyo_acl_info *b)
{
        const struct tomoyo_task_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_task_acl *p2 = container_of(b, typeof(*p2), head);

        return p1->domainname == p2->domainname;
}

/**
 * tomoyo_write_task - Update task related list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Only the "manual_domain_transition <domainname>" directive is handled.
 *
 * Returns 0 on success, negative value otherwise (-EINVAL when the directive
 * is not recognized or no domainname could be obtained).
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_write_task(struct tomoyo_acl_param *param)
{
        struct tomoyo_task_acl e = {
                .head.type = TOMOYO_TYPE_MANUAL_TASK_ACL,
        };
        int error = -EINVAL;

        if (!tomoyo_str_starts(&param->data, "manual_domain_transition "))
                return error;
        e.domainname = tomoyo_get_domainname(param);
        if (e.domainname)
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_task_acl,
                                             NULL);
        tomoyo_put_name(e.domainname);
        return error;
}

/**
 * tomoyo_delete_domain - Delete a domain.
 *
 * @domainname: The name of domain.
 *
 * Marks the first not-yet-deleted domain named @domainname as deleted.
 * tomoyo_kernel_domain is never marked. Finding no matching domain is not
 * reported as an error.
 *
 * Returns 0 on success, negative value otherwise (-EINTR when waiting for
 * tomoyo_policy_lock was interrupted).
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_delete_domain(char *domainname)
{
        struct tomoyo_path_info name;
        struct tomoyo_domain_info *pos;

        name.name = domainname;
        tomoyo_fill_path_info(&name);
        if (mutex_lock_interruptible(&tomoyo_policy_lock))
                return -EINTR;
        /* Is there an active domain? */
        list_for_each_entry_rcu(pos, &tomoyo_domain_list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                /* Never delete tomoyo_kernel_domain */
                if (pos != &tomoyo_kernel_domain && !pos->is_deleted &&
                    !tomoyo_pathcmp(pos->domainname, &name)) {
                        pos->is_deleted = true;
                        break;
                }
        }
        mutex_unlock(&tomoyo_policy_lock);
        return 0;
}

/**
 * tomoyo_write_domain2 - Write domain policy.
 *
 * @ns:        Pointer to "struct tomoyo_policy_namespace".
 * @list:      Pointer to "struct list_head".
 * @data:      Policy to be interpreted.
 * @is_delete: True if it is a delete request.
 *
 * Dispatches @data to the handler whose keyword ("file ", "network inet ",
 * "network unix ", "misc " or "task ") it starts with.
 *
 * Returns 0 on success, negative value otherwise (-EINVAL when no keyword
 * matches).
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_write_domain2(struct tomoyo_policy_namespace *ns,
                                struct list_head *list, char *data,
                                const bool is_delete)
{
        static const struct {
                const char *keyword;
                int (*write)(struct tomoyo_acl_param *param);
        } handlers[5] = {
                { "file ", tomoyo_write_file },
                { "network inet ", tomoyo_write_inet_network },
                { "network unix ", tomoyo_write_unix_network },
                { "misc ", tomoyo_write_misc },
                { "task ", tomoyo_write_task },
        };
        struct tomoyo_acl_param param = {
                .ns = ns,
                .list = list,
                .data = data,
                .is_delete = is_delete,
        };
        unsigned int n;

        for (n = 0; n < ARRAY_SIZE(handlers); n++) {
                if (tomoyo_str_starts(&param.data, handlers[n].keyword))
                        return handlers[n].write(&param);
        }
        return -EINVAL;
}

/*
 * String table for domain flags, indexed by TOMOYO_DIF_* value.
 *
 * Each keyword includes its trailing "\n": tomoyo_read_domain() emits the
 * entry as is, while tomoyo_write_domain() compares only the first
 * strlen() - 1 bytes so that the newline does not take part in the match.
 */
const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS] = {
        [TOMOYO_DIF_QUOTA_WARNED]      = "quota_exceeded\n",
        [TOMOYO_DIF_TRANSITION_FAILED] = "transition_failed\n",
};

/**
 * tomoyo_write_domain - Handle one line written to the domain policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * A line whose (optionally "select "-prefixed) text starts with '<' names a
 * domain: that domain is deleted, looked up or assigned, and the outcome is
 * stored in head->w.domain. Every other line is applied to the domain
 * currently stored in head->w.domain: "use_profile", "use_group", a domain
 * flag keyword from tomoyo_dif[], or otherwise an ACL entry handed to
 * tomoyo_write_domain2().
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_write_domain(struct tomoyo_io_buffer *head)
{
        const bool is_delete = head->w.is_delete;
        struct tomoyo_domain_info *domain = head->w.domain;
        struct tomoyo_policy_namespace *ns;
        char *data = head->write_buf;
        bool is_select = false;
        unsigned int idx;

        /* The "select " prefix is only looked for on non-delete requests. */
        if (!is_delete)
                is_select = tomoyo_str_starts(&data, "select ");

        if (*data == '<') {
                int ret = 0;

                if (is_delete) {
                        domain = NULL;
                        ret = tomoyo_delete_domain(data);
                } else if (is_select) {
                        domain = tomoyo_find_domain(data);
                } else {
                        domain = tomoyo_assign_domain(data, false);
                }
                head->w.domain = domain;
                return ret;
        }

        /* Everything below needs a previously chosen domain. */
        if (!domain)
                return -EINVAL;
        ns = domain->ns;

        if (sscanf(data, "use_profile %u", &idx) == 1 &&
            idx < TOMOYO_MAX_PROFILES) {
                const bool usable = !tomoyo_policy_loaded ||
                        ns->profile_ptr[idx];

                if (usable && !is_delete)
                        domain->profile = (u8) idx;
                return 0;
        }
        if (sscanf(data, "use_group %u\n", &idx) == 1 &&
            idx < TOMOYO_MAX_ACL_GROUPS) {
                if (is_delete)
                        clear_bit(idx, domain->group);
                else
                        set_bit(idx, domain->group);
                return 0;
        }
        for (idx = 0; idx < TOMOYO_MAX_DOMAIN_INFO_FLAGS; idx++) {
                const char *keyword = tomoyo_dif[idx];

                /* Table entries end with '\n'; leave it out of the match. */
                if (strncmp(data, keyword, strlen(keyword) - 1) == 0) {
                        domain->flags[idx] = !is_delete;
                        return 0;
                }
        }
        return tomoyo_write_domain2(ns, &domain->acl_info_list, data,
                                    is_delete);
}

/**
 * tomoyo_print_condition - Print condition part.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @cond: Pointer to "struct tomoyo_condition".
 *
 * Printing is resumable. head->r.cond_step records which stage to continue
 * from and head->r.cond_index how many condition elements have already been
 * printed, so a call that stopped because tomoyo_flush() failed can be
 * repeated with the same arguments.
 *
 * Returns true on success, false otherwise (tomoyo_flush() failed, or
 * head->r.cond_step holds a value this function does not handle).
 */
static bool tomoyo_print_condition(struct tomoyo_io_buffer *head,
                                   const struct tomoyo_condition *cond)
{
        switch (head->r.cond_step) {
        case 0:
                /* Stage 0: reset the element counter, print the transit name. */
                head->r.cond_index = 0;
                head->r.cond_step++;
                if (cond->transit) {
                        tomoyo_set_space(head);
                        tomoyo_set_string(head, cond->transit->name);
                }
                fallthrough;
        case 1:
                /* Stage 1: print the condition elements one by one. */
                {
                        /*
                         * The element array and its operand arrays are
                         * addressed back to back, starting right behind
                         * *cond: elements, number unions, name unions,
                         * argv entries, envp entries.
                         */
                        const u16 condc = cond->condc;
                        const struct tomoyo_condition_element *condp =
                                (typeof(condp)) (cond + 1);
                        const struct tomoyo_number_union *numbers_p =
                                (typeof(numbers_p)) (condp + condc);
                        const struct tomoyo_name_union *names_p =
                                (typeof(names_p))
                                (numbers_p + cond->numbers_count);
                        const struct tomoyo_argv *argv =
                                (typeof(argv)) (names_p + cond->names_count);
                        const struct tomoyo_envp *envp =
                                (typeof(envp)) (argv + cond->argc);
                        u16 skip;

                        /*
                         * Replay the elements printed by earlier calls
                         * without emitting anything, only to advance the
                         * operand pointers to where printing stopped.
                         */
                        for (skip = 0; skip < head->r.cond_index; skip++) {
                                const u8 left = condp->left;
                                const u8 right = condp->right;

                                condp++;
                                switch (left) {
                                case TOMOYO_ARGV_ENTRY:
                                        argv++;
                                        continue;
                                case TOMOYO_ENVP_ENTRY:
                                        envp++;
                                        continue;
                                case TOMOYO_NUMBER_UNION:
                                        numbers_p++;
                                        break;
                                }
                                switch (right) {
                                case TOMOYO_NAME_UNION:
                                        names_p++;
                                        break;
                                case TOMOYO_NUMBER_UNION:
                                        numbers_p++;
                                        break;
                                }
                        }
                        while (head->r.cond_index < condc) {
                                const u8 match = condp->equals;
                                const u8 left = condp->left;
                                const u8 right = condp->right;

                                /*
                                 * Stop before touching any state if the
                                 * flush fails; cond_index still names this
                                 * element for the next call.
                                 */
                                if (!tomoyo_flush(head))
                                        return false;
                                condp++;
                                head->r.cond_index++;
                                tomoyo_set_space(head);
                                switch (left) {
                                case TOMOYO_ARGV_ENTRY:
                                        /* argv/envp print both sides here. */
                                        tomoyo_io_printf(head,
                                                         "exec.argv[%lu]%s=\"",
                                                         argv->index, argv->is_not ? "!" : "");
                                        tomoyo_set_string(head,
                                                          argv->value->name);
                                        tomoyo_set_string(head, "\"");
                                        argv++;
                                        continue;
                                case TOMOYO_ENVP_ENTRY:
                                        tomoyo_set_string(head,
                                                          "exec.envp[\"");
                                        tomoyo_set_string(head,
                                                          envp->name->name);
                                        tomoyo_io_printf(head, "\"]%s=", envp->is_not ? "!" : "");
                                        if (envp->value) {
                                                tomoyo_set_string(head, "\"");
                                                tomoyo_set_string(head, envp->value->name);
                                                tomoyo_set_string(head, "\"");
                                        } else {
                                                tomoyo_set_string(head,
                                                                  "NULL");
                                        }
                                        envp++;
                                        continue;
                                case TOMOYO_NUMBER_UNION:
                                        tomoyo_print_number_union_nospace
                                                (head, numbers_p++);
                                        break;
                                default:
                                        /* Any other value indexes the keyword table. */
                                        tomoyo_set_string(head,
                                               tomoyo_condition_keyword[left]);
                                        break;
                                }
                                tomoyo_set_string(head, match ? "=" : "!=");
                                switch (right) {
                                case TOMOYO_NAME_UNION:
                                        tomoyo_print_name_union_quoted
                                                (head, names_p++);
                                        break;
                                case TOMOYO_NUMBER_UNION:
                                        tomoyo_print_number_union_nospace
                                                (head, numbers_p++);
                                        break;
                                default:
                                        tomoyo_set_string(head,
                                          tomoyo_condition_keyword[right]);
                                        break;
                                }
                        }
                }
                head->r.cond_step++;
                fallthrough;
        case 2:
                /* Stage 2: flush; on failure leave the switch and return false. */
                if (!tomoyo_flush(head))
                        break;
                head->r.cond_step++;
                fallthrough;
        case 3:
                /* Stage 3: optional grant_log setting, then end the line. */
                if (cond->grant_log != TOMOYO_GRANTLOG_AUTO)
                        tomoyo_io_printf(head, " grant_log=%s",
                                         str_yes_no(cond->grant_log ==
                                                    TOMOYO_GRANTLOG_YES));
                tomoyo_set_lf(head);
                return true;
        }
        return false;
}

/**
 * tomoyo_set_group - Print "acl_group " header keyword and category name.
 *
 * @head:     Pointer to "struct tomoyo_io_buffer".
 * @category: Category name.
 *
 * Returns nothing.
 */
static void tomoyo_set_group(struct tomoyo_io_buffer *head,
                             const char *category)
{
        if (head->type == TOMOYO_EXCEPTIONPOLICY) {
                tomoyo_print_namespace(head);
                tomoyo_io_printf(head, "acl_group %u ",
                                 head->r.acl_group_index);
        }
        tomoyo_set_string(head, category);
}

/**
 * tomoyo_print_entry - Print an ACL entry.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @acl:  Pointer to an ACL entry.
 *
 * Prints the entry as one policy line: the category keyword (via
 * tomoyo_set_group()), the operation keywords selected in the entry's
 * permission bits joined by tomoyo_set_slash(), the operands, and finally
 * either the condition part or a line feed.
 *
 * Deleted entries, entries with no selected permission bit, and entries
 * filtered out by head->r.print_transition_related_only print nothing and
 * count as success.
 *
 * head->r.print_cond_part stays set while the condition part is pending, so
 * a repeated call for the same entry resumes there instead of printing the
 * entry again.
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
                               struct tomoyo_acl_info *acl)
{
        const u8 acl_type = acl->type;
        bool first = true;
        u8 bit;

        /* Resume: the entry itself was already printed by an earlier call. */
        if (head->r.print_cond_part)
                goto print_cond_part;
        if (acl->is_deleted)
                return true;
        if (!tomoyo_flush(head))
                return false;
        else if (acl_type == TOMOYO_TYPE_PATH_ACL) {
                struct tomoyo_path_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u16 perm = ptr->perm;

                for (bit = 0; bit < TOMOYO_MAX_PATH_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        /* In transition-only mode just "execute" is shown. */
                        if (head->r.print_transition_related_only &&
                            bit != TOMOYO_TYPE_EXECUTE)
                                continue;
                        if (first) {
                                tomoyo_set_group(head, "file ");
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_path_keyword[bit]);
                }
                /* No keyword was printed, so there is no line to finish. */
                if (first)
                        return true;
                tomoyo_print_name_union(head, &ptr->name);
        } else if (acl_type == TOMOYO_TYPE_MANUAL_TASK_ACL) {
                struct tomoyo_task_acl *ptr =
                        container_of(acl, typeof(*ptr), head);

                tomoyo_set_group(head, "task ");
                tomoyo_set_string(head, "manual_domain_transition ");
                tomoyo_set_string(head, ptr->domainname->name);
        } else if (head->r.print_transition_related_only) {
                /* All remaining entry types are skipped in this mode. */
                return true;
        } else if (acl_type == TOMOYO_TYPE_PATH2_ACL) {
                struct tomoyo_path2_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;

                for (bit = 0; bit < TOMOYO_MAX_PATH2_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        if (first) {
                                tomoyo_set_group(head, "file ");
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_mac_keywords
                                          [tomoyo_pp2mac[bit]]);
                }
                if (first)
                        return true;
                tomoyo_print_name_union(head, &ptr->name1);
                tomoyo_print_name_union(head, &ptr->name2);
        } else if (acl_type == TOMOYO_TYPE_PATH_NUMBER_ACL) {
                struct tomoyo_path_number_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;

                for (bit = 0; bit < TOMOYO_MAX_PATH_NUMBER_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        if (first) {
                                tomoyo_set_group(head, "file ");
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_mac_keywords
                                          [tomoyo_pn2mac[bit]]);
                }
                if (first)
                        return true;
                tomoyo_print_name_union(head, &ptr->name);
                tomoyo_print_number_union(head, &ptr->number);
        } else if (acl_type == TOMOYO_TYPE_MKDEV_ACL) {
                struct tomoyo_mkdev_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;

                for (bit = 0; bit < TOMOYO_MAX_MKDEV_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        if (first) {
                                tomoyo_set_group(head, "file ");
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_mac_keywords
                                          [tomoyo_pnnn2mac[bit]]);
                }
                if (first)
                        return true;
                tomoyo_print_name_union(head, &ptr->name);
                tomoyo_print_number_union(head, &ptr->mode);
                tomoyo_print_number_union(head, &ptr->major);
                tomoyo_print_number_union(head, &ptr->minor);
        } else if (acl_type == TOMOYO_TYPE_INET_ACL) {
                struct tomoyo_inet_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;

                for (bit = 0; bit < TOMOYO_MAX_NETWORK_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        if (first) {
                                /* The protocol keyword is printed once per line. */
                                tomoyo_set_group(head, "network inet ");
                                tomoyo_set_string(head, tomoyo_proto_keyword
                                                  [ptr->protocol]);
                                tomoyo_set_space(head);
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_socket_keyword[bit]);
                }
                if (first)
                        return true;
                tomoyo_set_space(head);
                /* The address is either "@<group name>" or a literal IP. */
                if (ptr->address.group) {
                        tomoyo_set_string(head, "@");
                        tomoyo_set_string(head, ptr->address.group->group_name
                                          ->name);
                } else {
                        char buf[128];

                        tomoyo_print_ip(buf, sizeof(buf), &ptr->address);
                        tomoyo_io_printf(head, "%s", buf);
                }
                tomoyo_print_number_union(head, &ptr->port);
        } else if (acl_type == TOMOYO_TYPE_UNIX_ACL) {
                struct tomoyo_unix_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;

                for (bit = 0; bit < TOMOYO_MAX_NETWORK_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        if (first) {
                                tomoyo_set_group(head, "network unix ");
                                tomoyo_set_string(head, tomoyo_proto_keyword
                                                  [ptr->protocol]);
                                tomoyo_set_space(head);
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_socket_keyword[bit]);
                }
                if (first)
                        return true;
                tomoyo_print_name_union(head, &ptr->name);
        } else if (acl_type == TOMOYO_TYPE_MOUNT_ACL) {
                struct tomoyo_mount_acl *ptr =
                        container_of(acl, typeof(*ptr), head);

                tomoyo_set_group(head, "file mount");
                tomoyo_print_name_union(head, &ptr->dev_name);
                tomoyo_print_name_union(head, &ptr->dir_name);
                tomoyo_print_name_union(head, &ptr->fs_type);
                tomoyo_print_number_union(head, &ptr->flags);
        } else if (acl_type == TOMOYO_TYPE_ENV_ACL) {
                struct tomoyo_env_acl *ptr =
                        container_of(acl, typeof(*ptr), head);

                tomoyo_set_group(head, "misc env ");
                tomoyo_set_string(head, ptr->env->name);
        }
        if (acl->cond) {
                /*
                 * Record that the condition part is pending before the
                 * first point of failure, so a retry enters at the label.
                 */
                head->r.print_cond_part = true;
                head->r.cond_step = 0;
                if (!tomoyo_flush(head))
                        return false;
print_cond_part:
                if (!tomoyo_print_condition(head, acl->cond))
                        return false;
                head->r.print_cond_part = false;
        } else {
                tomoyo_set_lf(head);
        }
        return true;
}

/**
 * tomoyo_read_domain2 - Print every ACL entry on a list.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @list: Pointer to "struct list_head".
 *
 * head->r.acl is the iteration cookie: it keeps pointing at the entry that
 * could not be printed when false is returned, and is reset to NULL once
 * the whole list has been printed.
 *
 * Caller holds tomoyo_read_lock().
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_read_domain2(struct tomoyo_io_buffer *head,
                                struct list_head *list)
{
        list_for_each_cookie(head->r.acl, list) {
                struct tomoyo_acl_info *entry =
                        list_entry(head->r.acl, struct tomoyo_acl_info, list);
                const bool printed = tomoyo_print_entry(head, entry);

                if (!printed)
                        return false;
        }
        head->r.acl = NULL;
        return true;
}

/**
 * tomoyo_read_domain - Read domain policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Prints every domain on tomoyo_domain_list: its name, "use_profile", the
 * set domain flags, one "use_group" line per set group bit, and its ACL
 * entries. Output is resumable: head->r.domain remembers the current
 * domain, head->r.step the stage within it and head->r.index the next group
 * bit to test. head->r.eof is set once everything has been printed.
 *
 * Caller holds tomoyo_read_lock().
 */
static void tomoyo_read_domain(struct tomoyo_io_buffer *head)
{
        if (head->r.eof)
                return;
        list_for_each_cookie(head->r.domain, &tomoyo_domain_list) {
                struct tomoyo_domain_info *domain =
                        list_entry(head->r.domain, typeof(*domain), list);
                u8 i;

                switch (head->r.step) {
                case 0:
                        /*
                         * Deleted domains are skipped, except when only
                         * this one domain was asked for.
                         */
                        if (domain->is_deleted &&
                            !head->r.print_this_domain_only)
                                continue;
                        /* Print domainname and flags. */
                        tomoyo_set_string(head, domain->domainname->name);
                        tomoyo_set_lf(head);
                        tomoyo_io_printf(head, "use_profile %u\n",
                                         domain->profile);
                        for (i = 0; i < TOMOYO_MAX_DOMAIN_INFO_FLAGS; i++)
                                if (domain->flags[i])
                                        tomoyo_set_string(head, tomoyo_dif[i]);
                        head->r.index = 0;
                        head->r.step++;
                        fallthrough;
                case 1:
                        /*
                         * Print the group memberships. The index is
                         * advanced before printing, so a failed flush
                         * resumes with the following group.
                         */
                        while (head->r.index < TOMOYO_MAX_ACL_GROUPS) {
                                i = head->r.index++;
                                if (!test_bit(i, domain->group))
                                        continue;
                                tomoyo_io_printf(head, "use_group %u\n", i);
                                if (!tomoyo_flush(head))
                                        return;
                        }
                        head->r.index = 0;
                        head->r.step++;
                        tomoyo_set_lf(head);
                        fallthrough;
                case 2:
                        /* Print the ACL entries of this domain. */
                        if (!tomoyo_read_domain2(head, &domain->acl_info_list))
                                return;
                        /*
                         * The step is advanced before the separator is
                         * attempted, so a failure here resumes at case 3.
                         */
                        head->r.step++;
                        if (!tomoyo_set_lf(head))
                                return;
                        fallthrough;
                case 3:
                        /* Domain finished; start over for the next one. */
                        head->r.step = 0;
                        if (head->r.print_this_domain_only)
                                goto done;
                }
        }
 done:
        head->r.eof = true;
}

/**
 * tomoyo_write_pid - Specify PID to obtain domainname.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Nothing is parsed here; tomoyo_read_pid() interprets head->write_buf.
 *
 * Returns 0.
 */
static int tomoyo_write_pid(struct tomoyo_io_buffer *head)
{
        /* Re-arm reading: tomoyo_read_pid() does nothing while eof is set. */
        head->r.eof = false;
        return 0;
}

/**
 * tomoyo_read_pid - Get domainname of the specified PID.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Parses head->write_buf as "[global-pid ]<decimal pid>" and, if a task
 * with that PID is found and has a domain, prints
 * "<pid> <profile> <domainname>". With the "global-pid " prefix the PID is
 * looked up in init_pid_ns, otherwise via find_task_by_vpid(). Nothing is
 * printed when the text cannot be parsed or no such task/domain exists.
 *
 * The PID is specified by tomoyo_write_pid() so that the user can obtain
 * using read()/write() interface rather than sysctl() interface.
 */
static void tomoyo_read_pid(struct tomoyo_io_buffer *head)
{
        struct tomoyo_domain_info *domain = NULL;
        struct task_struct *task;
        char *cursor = head->write_buf;
        unsigned int pid;
        bool use_init_ns;

        /* Accessing write_buf is safe because head->io_sem is held. */
        if (!cursor) {
                /* Do nothing if open(O_RDONLY). */
                head->r.eof = true;
                return;
        }
        if (head->r.w_pos || head->r.eof)
                return;
        /* One answer per written request, whatever the outcome. */
        head->r.eof = true;
        use_init_ns = tomoyo_str_starts(&cursor, "global-pid ");
        if (kstrtouint(cursor, 10, &pid) != 0)
                return;
        rcu_read_lock();
        task = use_init_ns ? find_task_by_pid_ns(pid, &init_pid_ns) :
                             find_task_by_vpid(pid);
        if (task)
                domain = tomoyo_task(task)->domain_info;
        rcu_read_unlock();
        if (domain) {
                tomoyo_io_printf(head, "%u %u ", pid, domain->profile);
                tomoyo_set_string(head, domain->domainname->name);
        }
}

/*
 * String table for domain transition control keywords, indexed by
 * TOMOYO_TRANSITION_CONTROL_* value.
 *
 * The table is only ever read, so the pointer array itself is const as
 * well as the strings it points to (same as tomoyo_dif[]).
 */
static const char * const tomoyo_transition_type[TOMOYO_MAX_TRANSITION_TYPE] = {
        [TOMOYO_TRANSITION_CONTROL_NO_RESET]      = "no_reset_domain ",
        [TOMOYO_TRANSITION_CONTROL_RESET]         = "reset_domain ",
        [TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE] = "no_initialize_domain ",
        [TOMOYO_TRANSITION_CONTROL_INITIALIZE]    = "initialize_domain ",
        [TOMOYO_TRANSITION_CONTROL_NO_KEEP]       = "no_keep_domain ",
        [TOMOYO_TRANSITION_CONTROL_KEEP]          = "keep_domain ",
};

/*
 * String table for grouping keywords, indexed by TOMOYO_*_GROUP value.
 *
 * The table is only ever read, so the pointer array itself is const as
 * well as the strings it points to (same as tomoyo_dif[]).
 */
static const char * const tomoyo_group_name[TOMOYO_MAX_GROUP] = {
        [TOMOYO_PATH_GROUP]    = "path_group ",
        [TOMOYO_NUMBER_GROUP]  = "number_group ",
        [TOMOYO_ADDRESS_GROUP] = "address_group ",
};

/**
 * tomoyo_write_exception - Handle one line written to the exception policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * The leading keyword selects the handler: "aggregator ", one of the
 * transition control keywords, one of the group keywords, or
 * "acl_group <index> " followed by an ACL entry for that group.
 *
 * Returns 0 on success, negative value otherwise (-EINVAL for an unknown
 * keyword or a malformed or out-of-range "acl_group" index).
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_write_exception(struct tomoyo_io_buffer *head)
{
        const bool is_delete = head->w.is_delete;
        struct tomoyo_acl_param param = {
                .ns = head->w.ns,
                .is_delete = is_delete,
                .data = head->write_buf,
        };
        unsigned int group;
        char *rest;
        u8 type;

        if (tomoyo_str_starts(&param.data, "aggregator "))
                return tomoyo_write_aggregator(&param);

        for (type = 0; type < TOMOYO_MAX_TRANSITION_TYPE; type++) {
                if (tomoyo_str_starts(&param.data,
                                      tomoyo_transition_type[type]))
                        return tomoyo_write_transition_control(&param, type);
        }
        for (type = 0; type < TOMOYO_MAX_GROUP; type++) {
                if (tomoyo_str_starts(&param.data, tomoyo_group_name[type]))
                        return tomoyo_write_group(&param, type);
        }

        if (!tomoyo_str_starts(&param.data, "acl_group "))
                return -EINVAL;

        /* The index must be in range and followed by exactly one space. */
        group = simple_strtoul(param.data, &rest, 10);
        if (group >= TOMOYO_MAX_ACL_GROUPS || *rest++ != ' ')
                return -EINVAL;
        return tomoyo_write_domain2(head->w.ns,
                                    &head->w.ns->acl_group[group],
                                    rest, is_delete);
}

/**
 * tomoyo_read_group - Print the members of all groups of one kind.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @idx:  Index number (TOMOYO_PATH_GROUP, TOMOYO_NUMBER_GROUP or
 *        TOMOYO_ADDRESS_GROUP).
 *
 * One line is printed per non-deleted member: namespace, group keyword,
 * group name and the member itself. head->r.group and head->r.acl are the
 * iteration cookies for the group list and the member list; they keep
 * their position when false is returned.
 *
 * Returns true on success, false otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static bool tomoyo_read_group(struct tomoyo_io_buffer *head, const int idx)
{
        struct tomoyo_policy_namespace *ns =
                container_of(head->r.ns, typeof(*ns), namespace_list);
        struct list_head *groups = &ns->group_list[idx];

        list_for_each_cookie(head->r.group, groups) {
                struct tomoyo_group *group =
                        list_entry(head->r.group, typeof(*group), head.list);

                list_for_each_cookie(head->r.acl, &group->member_list) {
                        struct tomoyo_acl_head *entry =
                                list_entry(head->r.acl, typeof(*entry), list);

                        if (entry->is_deleted)
                                continue;
                        if (!tomoyo_flush(head))
                                return false;
                        tomoyo_print_namespace(head);
                        tomoyo_set_string(head, tomoyo_group_name[idx]);
                        tomoyo_set_string(head, group->group_name->name);
                        switch (idx) {
                        case TOMOYO_PATH_GROUP: {
                                struct tomoyo_path_group *path =
                                        container_of(entry, typeof(*path),
                                                     head);

                                tomoyo_set_space(head);
                                tomoyo_set_string(head,
                                                  path->member_name->name);
                                break;
                        }
                        case TOMOYO_NUMBER_GROUP: {
                                struct tomoyo_number_group *num =
                                        container_of(entry, typeof(*num),
                                                     head);

                                tomoyo_print_number_union(head, &num->number);
                                break;
                        }
                        case TOMOYO_ADDRESS_GROUP: {
                                struct tomoyo_address_group *addr =
                                        container_of(entry, typeof(*addr),
                                                     head);
                                char text[128];

                                tomoyo_print_ip(text, sizeof(text),
                                                &addr->address);
                                tomoyo_io_printf(head, " %s", text);
                                break;
                        }
                        default:
                                /* Other kinds print no member text. */
                                break;
                        }
                        tomoyo_set_lf(head);
                }
                head->r.acl = NULL;
        }
        head->r.group = NULL;
        return true;
}

/**
 * tomoyo_read_policy - Print one of the namespace's policy lists.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @idx:  Index number.
 *
 * Lines are produced for TOMOYO_ID_TRANSITION_CONTROL and
 * TOMOYO_ID_AGGREGATOR lists; entries of any other list are walked without
 * printing. Deleted entries are skipped. head->r.acl is the iteration
 * cookie and keeps its position when false is returned.
 *
 * Returns true on success, false otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static bool tomoyo_read_policy(struct tomoyo_io_buffer *head, const int idx)
{
        struct tomoyo_policy_namespace *ns =
                container_of(head->r.ns, typeof(*ns), namespace_list);
        struct list_head *entries = &ns->policy_list[idx];

        list_for_each_cookie(head->r.acl, entries) {
                struct tomoyo_acl_head *acl =
                        container_of(head->r.acl, typeof(*acl), list);

                if (acl->is_deleted)
                        continue;
                if (!tomoyo_flush(head))
                        return false;
                if (idx == TOMOYO_ID_TRANSITION_CONTROL) {
                        struct tomoyo_transition_control *ctl =
                                container_of(acl, typeof(*ctl), head);
                        /* A missing program or domainname prints as "any". */
                        const char *program = "any";
                        const char *domainname = "any";

                        if (ctl->program)
                                program = ctl->program->name;
                        if (ctl->domainname)
                                domainname = ctl->domainname->name;
                        tomoyo_print_namespace(head);
                        tomoyo_set_string(head,
                                          tomoyo_transition_type[ctl->type]);
                        tomoyo_set_string(head, program);
                        tomoyo_set_string(head, " from ");
                        tomoyo_set_string(head, domainname);
                } else if (idx == TOMOYO_ID_AGGREGATOR) {
                        struct tomoyo_aggregator *agg =
                                container_of(acl, typeof(*agg), head);

                        tomoyo_print_namespace(head);
                        tomoyo_set_string(head, "aggregator ");
                        tomoyo_set_string(head, agg->original_name->name);
                        tomoyo_set_space(head);
                        tomoyo_set_string(head, agg->aggregated_name->name);
                } else {
                        continue;
                }
                tomoyo_set_lf(head);
        }
        head->r.acl = NULL;
        return true;
}

/**
 * tomoyo_read_exception - Read exception policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * head->r.step counts through three consecutive ranges: the
 * TOMOYO_MAX_POLICY policy lists, then the TOMOYO_MAX_GROUP group lists,
 * then the TOMOYO_MAX_ACL_GROUPS ACL groups. The function returns as soon
 * as one list cannot be printed completely and continues from the same
 * step on the next call; head->r.eof is set after the last ACL group.
 *
 * Caller holds tomoyo_read_lock().
 */
static void tomoyo_read_exception(struct tomoyo_io_buffer *head)
{
        struct tomoyo_policy_namespace *ns =
                container_of(head->r.ns, typeof(*ns), namespace_list);

        if (head->r.eof)
                return;

        /* Range 1: policy lists. */
        while (head->r.step < TOMOYO_MAX_POLICY) {
                if (!tomoyo_read_policy(head, head->r.step))
                        return;
                head->r.step++;
        }

        /* Range 2: group lists. */
        while (head->r.step < TOMOYO_MAX_POLICY + TOMOYO_MAX_GROUP) {
                if (!tomoyo_read_group(head,
                                       head->r.step - TOMOYO_MAX_POLICY))
                        return;
                head->r.step++;
        }

        /* Range 3: ACL groups; the index is also used by tomoyo_set_group(). */
        for (; head->r.step < TOMOYO_MAX_POLICY + TOMOYO_MAX_GROUP
                     + TOMOYO_MAX_ACL_GROUPS; head->r.step++) {
                head->r.acl_group_index = head->r.step - TOMOYO_MAX_POLICY
                        - TOMOYO_MAX_GROUP;
                if (!tomoyo_read_domain2(head, &ns->acl_group
                                         [head->r.acl_group_index]))
                        return;
        }
        head->r.eof = true;
}

/* Wait queue for kernel -> userspace notification. */
static DECLARE_WAIT_QUEUE_HEAD(tomoyo_query_wait);
/* Wait queue for userspace -> kernel notification. */
static DECLARE_WAIT_QUEUE_HEAD(tomoyo_answer_wait);

/*
 * Structure for query.
 *
 * tomoyo_supervisor() keeps one instance on its stack for each pending
 * request and links it into tomoyo_query_list while it waits for an answer.
 */
struct tomoyo_query {
        /* Link in tomoyo_query_list. */
        struct list_head list;
        /* Domain of the request (copied from the request's ->domain). */
        struct tomoyo_domain_info *domain;
        /* Message text built by tomoyo_init_log(). */
        char *query;
        /* strlen(query) + 1. */
        size_t query_len;
        /* Query ID; reported to userspace as "Q<serial>-<retry>". */
        unsigned int serial;
        /*
         * Number of one-second waits that have expired so far.
         * tomoyo_write_answer() resets this to 0 on every write.
         */
        u8 timer;
        /*
         * Value written via "A<serial>=<answer>"; 0 means not answered yet.
         * tomoyo_supervisor() treats 1 as "grant" and 3 as "retry".
         */
        u8 answer;
        /* Retry count of the request at the time it was queued. */
        u8 retry;
};

/* The list for "struct tomoyo_query". */
static LIST_HEAD(tomoyo_query_list);

/* Lock for manipulating tomoyo_query_list. */
static DEFINE_SPINLOCK(tomoyo_query_list_lock);

/*
 * Number of "struct file" referring /sys/kernel/security/tomoyo/query
 * interface.
 *
 * Incremented by tomoyo_open_control() and decremented by
 * tomoyo_close_control().
 */
static atomic_t tomoyo_query_observers = ATOMIC_INIT(0);

/**
 * tomoyo_truncate - Truncate a line.
 *
 * @str: String to truncate. Modified in place.
 *
 * Cuts @str at the first byte whose unsigned value is not greater than ' '
 * (a space, a control character, or the existing terminator) by storing
 * '\0' there.
 *
 * Returns length of truncated @str, including the terminating '\0'.
 */
static int tomoyo_truncate(char *str)
{
        const char *const begin = str;
        char *end;

        for (end = str; (unsigned char) *end > (unsigned char) ' '; end++)
                ;
        *end = '\0';
        /* No '\0' lies between begin and end, so this equals strlen() + 1. */
        return (int) (end - begin) + 1;
}

/**
 * tomoyo_add_entry - Add an ACL to current thread's domain. Used by learning mode.
 *
 * @domain: Pointer to "struct tomoyo_domain_info".
 * @header: Lines containing ACL. Modified in place.
 *
 * The text after the second '\n' in @header is taken as the ACL line. If that
 * line starts with 'f', selected fields found earlier in @header (argv[0],
 * exec realpath, symlink target) are appended to it as conditions. The
 * resulting line is passed to tomoyo_write_domain2() for @domain.
 *
 * Does nothing if @header contains fewer than two '\n' or if memory
 * allocation fails.
 *
 * Returns nothing.
 */
static void tomoyo_add_entry(struct tomoyo_domain_info *domain, char *header)
{
        char *buffer;
        char *realpath = NULL;
        char *argv0 = NULL;
        char *symlink = NULL;
        char *cp = strchr(header, '\n');
        int len;

        if (!cp)
                return;
        cp = strchr(cp + 1, '\n');
        if (!cp)
                return;
        /* Terminate the header part; cp now points at the ACL line. */
        *cp++ = '\0';
        len = strlen(cp) + 1;
        /* strstr() will return NULL if ordering is wrong. */
        if (*cp == 'f') {
                argv0 = strstr(header, " argv[]={ \"");
                if (argv0) {
                        /* Skip the 10 bytes of " argv[]={ "; keep the quote. */
                        argv0 += 10;
                        /* 14 == strlen(" exec.argv[0]="), appended below. */
                        len += tomoyo_truncate(argv0) + 14;
                }
                realpath = strstr(header, " exec={ realpath=\"");
                if (realpath) {
                        /* Skip the 8 bytes of " exec={ "; keep "realpath=". */
                        realpath += 8;
                        /* 6 == strlen(" exec."), appended below. */
                        len += tomoyo_truncate(realpath) + 6;
                }
                symlink = strstr(header, " symlink.target=\"");
                if (symlink)
                        /*
                         * Truncate after the leading space; the extra 1
                         * accounts for that space, which is copied too.
                         */
                        len += tomoyo_truncate(symlink + 1) + 1;
        }
        buffer = kmalloc(len, GFP_NOFS);
        if (!buffer)
                return;
        /*
         * A size of len - 1 can drop the final byte of cp.
         * NOTE(review): presumably that byte is the trailing '\n' of the
         * log entry - confirm against tomoyo_init_log().
         */
        snprintf(buffer, len - 1, "%s", cp);
        if (realpath)
                tomoyo_addprintf(buffer, len, " exec.%s", realpath);
        if (argv0)
                tomoyo_addprintf(buffer, len, " exec.argv[0]=%s", argv0);
        if (symlink)
                tomoyo_addprintf(buffer, len, "%s", symlink);
        tomoyo_normalize_line(buffer);
        /* Count a policy update only when the write reports 0. */
        if (!tomoyo_write_domain2(domain->ns, &domain->acl_info_list, buffer,
                                  false))
                tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES);
        kfree(buffer);
}

/**
 * tomoyo_supervisor - Ask for the supervisor's decision.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @fmt: The printf()'s format string, followed by parameters.
 *
 * Always hands the formatted message to tomoyo_write_log2() first. If the
 * request was not granted, then:
 *  - in learning mode, and while tomoyo_domain_quota_is_ok() allows it, the
 *    request is added to the domain's policy via tomoyo_add_entry();
 *  - in enforcing mode, and only if some process has the query interface
 *    open, the request is queued on tomoyo_query_list and this function
 *    sleeps (interruptibly, in one-second steps) until it is answered, the
 *    last observer goes away, a signal arrives, or the timer reaches 10.
 *
 * Returns 0 if the supervisor decided to permit the access request which
 * violated the policy in enforcing mode, TOMOYO_RETRY_REQUEST if the
 * supervisor decided to retry the access request which violated the policy in
 * enforcing mode, 0 if it is not in enforcing mode, -EPERM otherwise.
 */
int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...)
{
        va_list args;
        int error;
        int len;
        static unsigned int tomoyo_serial;
        /* Lives on this stack frame; must be unlinked before returning. */
        struct tomoyo_query entry = { };
        bool quota_exceeded = false;

        /* First pass only measures the formatted length (plus '\0'). */
        va_start(args, fmt);
        len = vsnprintf(NULL, 0, fmt, args) + 1;
        va_end(args);
        /* Write /sys/kernel/security/tomoyo/audit. */
        va_start(args, fmt);
        tomoyo_write_log2(r, len, fmt, args);
        va_end(args);
        /* Nothing more to do if granted. */
        if (r->granted)
                return 0;
        /* r->mode is used directly as the statistics index here. */
        if (r->mode)
                tomoyo_update_stat(r->mode);
        switch (r->mode) {
        case TOMOYO_CONFIG_ENFORCING:
                error = -EPERM;
                /* Only ask when somebody is listening for queries. */
                if (atomic_read(&tomoyo_query_observers))
                        break;
                /* entry.query is still NULL here, so kfree() is a no-op. */
                goto out;
        case TOMOYO_CONFIG_LEARNING:
                error = 0;
                /* Check max_learning_entry parameter. */
                if (tomoyo_domain_quota_is_ok(r))
                        break;
                fallthrough;
        default:
                return 0;
        }
        /* Get message. */
        va_start(args, fmt);
        entry.query = tomoyo_init_log(r, len, fmt, args);
        va_end(args);
        if (!entry.query)
                goto out;
        entry.query_len = strlen(entry.query) + 1;
        /* error == 0 can only mean learning mode at this point. */
        if (!error) {
                tomoyo_add_entry(r->domain, entry.query);
                goto out;
        }
        /* From here on, len is the amount charged to the query quota. */
        len = kmalloc_size_roundup(entry.query_len);
        entry.domain = r->domain;
        spin_lock(&tomoyo_query_list_lock);
        /* A quota of 0 means "no limit". */
        if (tomoyo_memory_quota[TOMOYO_MEMORY_QUERY] &&
            tomoyo_memory_used[TOMOYO_MEMORY_QUERY] + len
            >= tomoyo_memory_quota[TOMOYO_MEMORY_QUERY]) {
                quota_exceeded = true;
        } else {
                entry.serial = tomoyo_serial++;
                entry.retry = r->retry;
                tomoyo_memory_used[TOMOYO_MEMORY_QUERY] += len;
                list_add_tail(&entry.list, &tomoyo_query_list);
        }
        spin_unlock(&tomoyo_query_list_lock);
        /* Not queued: fall out with error still -EPERM. */
        if (quota_exceeded)
                goto out;
        /* Give 10 seconds for supervisor's opinion. */
        while (entry.timer < 10) {
                wake_up_all(&tomoyo_query_wait);
                /*
                 * A non-zero result ends the wait: the condition became
                 * true or the sleep was interrupted. Zero means the
                 * one-second timeout expired with the condition still false.
                 */
                if (wait_event_interruptible_timeout
                    (tomoyo_answer_wait, entry.answer ||
                     !atomic_read(&tomoyo_query_observers), HZ))
                        break;
                entry.timer++;
        }
        /*
         * Unlink and uncharge. tomoyo_write_answer() may already have
         * unlinked this entry using list_del_init().
         */
        spin_lock(&tomoyo_query_list_lock);
        list_del(&entry.list);
        tomoyo_memory_used[TOMOYO_MEMORY_QUERY] -= len;
        spin_unlock(&tomoyo_query_list_lock);
        switch (entry.answer) {
        case 3: /* Asked to retry by administrator. */
                error = TOMOYO_RETRY_REQUEST;
                r->retry++;
                break;
        case 1:
                /* Granted by administrator. */
                error = 0;
                break;
        default:
                /* Timed out or rejected by administrator. */
                break;
        }
out:
        kfree(entry.query);
        return error;
}

/**
 * tomoyo_find_domain_by_qid - Get domain by query id.
 *
 * @serial: Query ID assigned by tomoyo_supervisor().
 *
 * Searches tomoyo_query_list under tomoyo_query_list_lock for the first
 * entry whose serial equals @serial.
 *
 * Returns pointer to "struct tomoyo_domain_info" if found, NULL otherwise.
 */
static struct tomoyo_domain_info *tomoyo_find_domain_by_qid
(unsigned int serial)
{
        struct tomoyo_domain_info *found = NULL;
        struct tomoyo_query *q;

        spin_lock(&tomoyo_query_list_lock);
        list_for_each_entry(q, &tomoyo_query_list, list) {
                if (q->serial == serial) {
                        found = q->domain;
                        break;
                }
        }
        spin_unlock(&tomoyo_query_list_lock);
        return found;
}

/**
 * tomoyo_poll_query - poll() for /sys/kernel/security/tomoyo/query.
 *
 * @file: Pointer to "struct file".
 * @wait: Pointer to "poll_table".
 *
 * Reports readiness whenever tomoyo_query_list is non-empty. If the list is
 * empty, registers on tomoyo_query_wait and checks the list once more.
 *
 * Returns EPOLLIN | EPOLLRDNORM when ready to read, 0 otherwise.
 *
 * Waits for access requests which violated policy in enforcing mode.
 */
static __poll_t tomoyo_poll_query(struct file *file, poll_table *wait)
{
        const __poll_t readable = EPOLLIN | EPOLLRDNORM;

        if (list_empty(&tomoyo_query_list)) {
                /* Nothing pending yet: register, then look again. */
                poll_wait(file, &tomoyo_query_wait, wait);
                if (list_empty(&tomoyo_query_list))
                        return 0;
        }
        return readable;
}

/**
 * tomoyo_read_query - Read access requests which violated policy in enforcing mode.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Produces at most one query per call, formatted as
 * "Q<serial>-<retry>\n<query text>", for the entry at position
 * @head->r.query_index in tomoyo_query_list. The previous read buffer is
 * freed and replaced by a freshly allocated one. When there is no entry at
 * that position, @head->r.query_index is reset to 0 and nothing is produced.
 *
 * Does nothing while earlier output is still pending (@head->r.w_pos != 0).
 */
static void tomoyo_read_query(struct tomoyo_io_buffer *head)
{
        struct list_head *tmp;
        unsigned int pos = 0;
        size_t len = 0;
        char *buf;

        if (head->r.w_pos)
                return;
        kfree(head->read_buf);
        head->read_buf = NULL;
        /* Pass 1: learn the length of the query at query_index. */
        spin_lock(&tomoyo_query_list_lock);
        list_for_each(tmp, &tomoyo_query_list) {
                struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);

                if (pos++ != head->r.query_index)
                        continue;
                len = ptr->query_len;
                break;
        }
        spin_unlock(&tomoyo_query_list_lock);
        if (!len) {
                /* Ran past the end of the list: start over next time. */
                head->r.query_index = 0;
                return;
        }
        /*
         * Allocate with the spinlock released. The extra 32 bytes leave
         * room for the "Q<serial>-<retry>\n" prefix.
         */
        buf = kzalloc(len + 32, GFP_NOFS);
        if (!buf)
                return;
        pos = 0;
        /* Pass 2: the list may have changed meanwhile, so look up again. */
        spin_lock(&tomoyo_query_list_lock);
        list_for_each(tmp, &tomoyo_query_list) {
                struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);

                if (pos++ != head->r.query_index)
                        continue;
                /*
                 * Some query can be skipped because tomoyo_query_list
                 * can change, but I don't care.
                 */
                if (len == ptr->query_len)
                        snprintf(buf, len + 31, "Q%u-%hu\n%s", ptr->serial,
                                 ptr->retry, ptr->query);
                break;
        }
        spin_unlock(&tomoyo_query_list_lock);
        /* buf was zero-filled, so buf[0] tells whether anything was written. */
        if (buf[0]) {
                head->read_buf = buf;
                head->r.w[head->r.w_pos++] = buf;
                head->r.query_index++;
        } else {
                kfree(buf);
        }
}

/**
 * tomoyo_write_answer - Write the supervisor's decision.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns 0 on success, -EINVAL otherwise.
 */
static int tomoyo_write_answer(struct tomoyo_io_buffer *head)
{
        char *data = head->write_buf;
        struct list_head *tmp;
        unsigned int serial;
        unsigned int answer;

        spin_lock(&tomoyo_query_list_lock);
        list_for_each(tmp, &tomoyo_query_list) {
                struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);

                ptr->timer = 0;
        }
        spin_unlock(&tomoyo_query_list_lock);
        if (sscanf(data, "A%u=%u", &serial, &answer) != 2)
                return -EINVAL;
        spin_lock(&tomoyo_query_list_lock);
        list_for_each(tmp, &tomoyo_query_list) {
                struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);

                if (ptr->serial != serial)
                        continue;
                ptr->answer = answer;
                /* Remove from tomoyo_query_list. */
                if (ptr->answer)
                        list_del_init(&ptr->list);
                break;
        }
        spin_unlock(&tomoyo_query_list_lock);
        return 0;
}

/**
 * tomoyo_read_version - Get version.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Emits the version string into @head once, then marks @head as EOF.
 *
 * Returns nothing.
 */
static void tomoyo_read_version(struct tomoyo_io_buffer *head)
{
        if (head->r.eof)
                return;
        tomoyo_io_printf(head, "2.6.0");
        head->r.eof = true;
}

/*
 * String table for /sys/kernel/security/tomoyo/stat interface.
 *
 * Labels printed after "Policy " by tomoyo_read_stat(), indexed by policy
 * statistic type.
 */
static const char * const tomoyo_policy_headers[TOMOYO_MAX_POLICY_STAT] = {
        [TOMOYO_STAT_POLICY_UPDATES]    = "update:",
        [TOMOYO_STAT_POLICY_LEARNING]   = "violation in learning mode:",
        [TOMOYO_STAT_POLICY_PERMISSIVE] = "violation in permissive mode:",
        [TOMOYO_STAT_POLICY_ENFORCING]  = "violation in enforcing mode:",
};

/*
 * String table for /sys/kernel/security/tomoyo/stat interface.
 *
 * Labels printed after "Memory used by " by tomoyo_read_stat(), indexed by
 * memory statistic type. tomoyo_write_stat() matches the same labels when
 * parsing a quota update.
 */
static const char * const tomoyo_memory_headers[TOMOYO_MAX_MEMORY_STAT] = {
        [TOMOYO_MEMORY_POLICY] = "policy:",
        [TOMOYO_MEMORY_AUDIT]  = "audit log:",
        [TOMOYO_MEMORY_QUERY]  = "query message:",
};

/* Counter for number of updates. */
static atomic_t tomoyo_stat_updated[TOMOYO_MAX_POLICY_STAT];
/* Timestamp counter for last updated. 0 means "never updated". */
static time64_t tomoyo_stat_modified[TOMOYO_MAX_POLICY_STAT];

/**
 * tomoyo_update_stat - Update statistic counters.
 *
 * @index: Index for policy type.
 *
 * Returns nothing.
 */
void tomoyo_update_stat(const u8 index)
{
        atomic_inc(&tomoyo_stat_updated[index]);
        tomoyo_stat_modified[index] = ktime_get_real_seconds();
}

/**
 * tomoyo_read_stat - Read statistic data.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns nothing.
 */
static void tomoyo_read_stat(struct tomoyo_io_buffer *head)
{
        u8 i;
        unsigned int total = 0;

        if (head->r.eof)
                return;
        for (i = 0; i < TOMOYO_MAX_POLICY_STAT; i++) {
                tomoyo_io_printf(head, "Policy %-30s %10u",
                                 tomoyo_policy_headers[i],
                                 atomic_read(&tomoyo_stat_updated[i]));
                if (tomoyo_stat_modified[i]) {
                        struct tomoyo_time stamp;

                        tomoyo_convert_time(tomoyo_stat_modified[i], &stamp);
                        tomoyo_io_printf(head, " (Last: %04u/%02u/%02u %02u:%02u:%02u)",
                                         stamp.year, stamp.month, stamp.day,
                                         stamp.hour, stamp.min, stamp.sec);
                }
                tomoyo_set_lf(head);
        }
        for (i = 0; i < TOMOYO_MAX_MEMORY_STAT; i++) {
                unsigned int used = tomoyo_memory_used[i];

                total += used;
                tomoyo_io_printf(head, "Memory used by %-22s %10u",
                                 tomoyo_memory_headers[i], used);
                used = tomoyo_memory_quota[i];
                if (used)
                        tomoyo_io_printf(head, " (Quota: %10u)", used);
                tomoyo_set_lf(head);
        }
        tomoyo_io_printf(head, "Total memory used:                    %10u\n",
                         total);
        head->r.eof = true;
}

/**
 * tomoyo_write_stat - Set memory quota.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Accepts a line of the form "Memory used by <label> <number>", where <label>
 * is one of tomoyo_memory_headers[], and stores <number> as the quota for
 * that memory type. Lines that do not match are silently ignored.
 *
 * Returns 0.
 */
static int tomoyo_write_stat(struct tomoyo_io_buffer *head)
{
        char *cursor = head->write_buf;
        u8 kind;

        if (!tomoyo_str_starts(&cursor, "Memory used by "))
                return 0;
        for (kind = 0; kind < TOMOYO_MAX_MEMORY_STAT; kind++) {
                if (!tomoyo_str_starts(&cursor, tomoyo_memory_headers[kind]))
                        continue;
                sscanf(cursor, "%u", &tomoyo_memory_quota[kind]);
        }
        return 0;
}

/**
 * tomoyo_open_control - open() for /sys/kernel/security/tomoyo/ interface.
 *
 * @type: Type of interface.
 * @file: Pointer to "struct file".
 *
 * Allocates a "struct tomoyo_io_buffer", selects the read/write/poll handlers
 * for @type, drops the handlers that the open mode of @file does not permit,
 * allocates the read and write buffers that remain necessary, and stores the
 * result in @file->private_data.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_open_control(const u8 type, struct file *file)
{
        struct tomoyo_io_buffer *head = kzalloc(sizeof(*head), GFP_NOFS);

        if (!head)
                return -ENOMEM;
        mutex_init(&head->io_sem);
        head->type = type;
        /* An unlisted type leaves every handler NULL. */
        switch (type) {
        case TOMOYO_DOMAINPOLICY:
                /* /sys/kernel/security/tomoyo/domain_policy */
                head->write = tomoyo_write_domain;
                head->read = tomoyo_read_domain;
                break;
        case TOMOYO_EXCEPTIONPOLICY:
                /* /sys/kernel/security/tomoyo/exception_policy */
                head->write = tomoyo_write_exception;
                head->read = tomoyo_read_exception;
                break;
        case TOMOYO_AUDIT:
                /* /sys/kernel/security/tomoyo/audit */
                head->poll = tomoyo_poll_log;
                head->read = tomoyo_read_log;
                break;
        case TOMOYO_PROCESS_STATUS:
                /* /sys/kernel/security/tomoyo/.process_status */
                head->write = tomoyo_write_pid;
                head->read = tomoyo_read_pid;
                break;
        case TOMOYO_VERSION:
                /* /sys/kernel/security/tomoyo/version */
                head->read = tomoyo_read_version;
                head->readbuf_size = 128;
                break;
        case TOMOYO_STAT:
                /* /sys/kernel/security/tomoyo/stat */
                head->write = tomoyo_write_stat;
                head->read = tomoyo_read_stat;
                head->readbuf_size = 1024;
                break;
        case TOMOYO_PROFILE:
                /* /sys/kernel/security/tomoyo/profile */
                head->write = tomoyo_write_profile;
                head->read = tomoyo_read_profile;
                break;
        case TOMOYO_QUERY: /* /sys/kernel/security/tomoyo/query */
                head->poll = tomoyo_poll_query;
                head->write = tomoyo_write_answer;
                head->read = tomoyo_read_query;
                break;
        case TOMOYO_MANAGER:
                /* /sys/kernel/security/tomoyo/manager */
                head->write = tomoyo_write_manager;
                head->read = tomoyo_read_manager;
                break;
        }
        if (!(file->f_mode & FMODE_READ)) {
                /*
                 * No need to allocate read_buf since it is not opened
                 * for reading.
                 */
                head->read = NULL;
                head->poll = NULL;
        } else if (!head->poll) {
                /* Don't allocate read_buf for poll() access. */
                /* Default size unless the case above chose a smaller one. */
                if (!head->readbuf_size)
                        head->readbuf_size = 4096 * 2;
                head->read_buf = kzalloc(head->readbuf_size, GFP_NOFS);
                if (!head->read_buf) {
                        kfree(head);
                        return -ENOMEM;
                }
        }
        if (!(file->f_mode & FMODE_WRITE)) {
                /*
                 * No need to allocate write_buf since it is not opened
                 * for writing.
                 */
                head->write = NULL;
        } else if (head->write) {
                head->writebuf_size = 4096 * 2;
                head->write_buf = kzalloc(head->writebuf_size, GFP_NOFS);
                if (!head->write_buf) {
                        /* read_buf may be NULL here; kfree(NULL) is fine. */
                        kfree(head->read_buf);
                        kfree(head);
                        return -ENOMEM;
                }
        }
        /*
         * If the file is /sys/kernel/security/tomoyo/query , increment the
         * observer counter.
         * The observer counter is used by tomoyo_supervisor() to see if
         * there is some process monitoring /sys/kernel/security/tomoyo/query.
         */
        if (type == TOMOYO_QUERY)
                atomic_inc(&tomoyo_query_observers);
        file->private_data = head;
        tomoyo_notify_gc(head, true);
        return 0;
}

/**
 * tomoyo_poll_control - poll() for /sys/kernel/security/tomoyo/ interface.
 *
 * @file: Pointer to "struct file".
 * @wait: Pointer to "poll_table". Maybe NULL.
 *
 * The interface is always reported as writable. Readability comes from the
 * interface's own poll handler when it has one; interfaces without a poll
 * handler are always reported as readable.
 *
 * Returns EPOLLIN | EPOLLRDNORM | EPOLLOUT | EPOLLWRNORM if ready to read/write,
 * EPOLLOUT | EPOLLWRNORM otherwise.
 */
__poll_t tomoyo_poll_control(struct file *file, poll_table *wait)
{
        struct tomoyo_io_buffer *head = file->private_data;
        __poll_t mask = EPOLLOUT | EPOLLWRNORM;

        if (head->poll)
                mask |= head->poll(file, wait);
        else
                mask |= EPOLLIN | EPOLLRDNORM;
        return mask;
}

/**
 * tomoyo_set_namespace_cursor - Set namespace to read.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Only the exception policy and profile interfaces are read per namespace;
 * for any other type this is a no-op. On the first read the cursor is set to
 * the first entry of tomoyo_namespace_list. When the current namespace has
 * been read to EOF and another one follows, the cursor moves to that one.
 * In both cases the whole read state in @head->r is cleared first.
 *
 * Returns nothing.
 */
static inline void tomoyo_set_namespace_cursor(struct tomoyo_io_buffer *head)
{
        struct list_head *cur;
        struct list_head *next;

        switch (head->type) {
        case TOMOYO_EXCEPTIONPOLICY:
        case TOMOYO_PROFILE:
                break;
        default:
                return;
        }
        cur = head->r.ns;
        if (!cur)
                /* First read: start from the head of the namespace list. */
                next = tomoyo_namespace_list.next;
        else if (head->r.eof && cur->next != &tomoyo_namespace_list)
                /* Current namespace finished and another one follows. */
                next = cur->next;
        else
                return;
        /* Clearing is OK because tomoyo_flush() returned true. */
        memset(&head->r, 0, sizeof(head->r));
        head->r.ns = next;
}

/**
 * tomoyo_has_more_namespace - Check for unread namespaces.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns true if we have more entries to print, false otherwise.
 */
static inline bool tomoyo_has_more_namespace(struct tomoyo_io_buffer *head)
{
        return (head->type == TOMOYO_EXCEPTIONPOLICY ||
                head->type == TOMOYO_PROFILE) && head->r.eof &&
                head->r.ns->next != &tomoyo_namespace_list;
}

/**
 * tomoyo_read_control - read() for /sys/kernel/security/tomoyo/ interface.
 *
 * @head:       Pointer to "struct tomoyo_io_buffer".
 * @buffer:     Pointer to buffer to write to.
 * @buffer_len: Size of @buffer.
 *
 * Serialized per @head by @head->io_sem and run under tomoyo_read_lock().
 * The read handler is called only if the initial tomoyo_flush() returns
 * true, and is called again for as long as tomoyo_flush() keeps returning
 * true and tomoyo_has_more_namespace() reports another namespace.
 * NOTE(review): tomoyo_flush() presumably returns true once all pending
 * output has been copied to @buffer - confirm at its definition.
 *
 * Returns bytes read on success, negative value otherwise (-EINVAL if the
 * interface has no read handler, -EINTR if interrupted while locking).
 */
ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer,
                            const int buffer_len)
{
        int copied;
        int lock_idx;

        if (!head->read)
                return -EINVAL;
        if (mutex_lock_interruptible(&head->io_sem))
                return -EINTR;
        head->read_user_buf = buffer;
        head->read_user_buf_avail = buffer_len;
        lock_idx = tomoyo_read_lock();
        if (tomoyo_flush(head)) {
                bool again;

                /* Call the policy handler. */
                do {
                        tomoyo_set_namespace_cursor(head);
                        head->read(head);
                        again = tomoyo_flush(head) &&
                                tomoyo_has_more_namespace(head);
                } while (again);
        }
        tomoyo_read_unlock(lock_idx);
        /* read_user_buf has advanced past everything handed to userspace. */
        copied = head->read_user_buf - buffer;
        mutex_unlock(&head->io_sem);
        return copied;
}

/**
 * tomoyo_parse_policy - Parse a policy line.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @line: Line to parse.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_parse_policy(struct tomoyo_io_buffer *head, char *line)
{
        /* Delete request? */
        head->w.is_delete = !strncmp(line, "delete ", 7);
        if (head->w.is_delete)
                memmove(line, line + 7, strlen(line + 7) + 1);
        /* Selecting namespace to update. */
        if (head->type == TOMOYO_EXCEPTIONPOLICY ||
            head->type == TOMOYO_PROFILE) {
                if (*line == '<') {
                        char *cp = strchr(line, ' ');

                        if (cp) {
                                *cp++ = '\0';
                                head->w.ns = tomoyo_assign_namespace(line);
                                memmove(line, cp, strlen(cp) + 1);
                        } else
                                head->w.ns = NULL;
                } else
                        head->w.ns = &tomoyo_kernel_namespace;
                /* Don't allow updating if namespace is invalid. */
                if (!head->w.ns)
                        return -ENOENT;
        }
        /* Do the update. */
        return head->write(head);
}

/**
 * tomoyo_write_control - write() for /sys/kernel/security/tomoyo/ interface.
 *
 * @head:       Pointer to "struct tomoyo_io_buffer".
 * @buffer:     Pointer to buffer to read from.
 * @buffer_len: Size of @buffer.
 *
 * Copies @buffer one byte at a time into @head->write_buf. Each time a '\n'
 * arrives, the accumulated line is normalized and dispatched. An incomplete
 * trailing line stays in @head->write_buf (its length in @head->w.avail) and
 * is continued by the next call.
 *
 * Returns @buffer_len on success, negative value otherwise (-EINVAL if the
 * interface has no write handler, -EINTR if interrupted while locking,
 * -ENOMEM if the line buffer cannot grow, -EFAULT if @buffer is unreadable,
 * -EPERM if the caller is not allowed to update policy).
 */
ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head,
                             const char __user *buffer, const int buffer_len)
{
        int error = buffer_len;
        size_t avail_len = buffer_len;
        char *cp0;
        int idx;

        if (!head->write)
                return -EINVAL;
        if (mutex_lock_interruptible(&head->io_sem))
                return -EINTR;
        cp0 = head->write_buf;
        head->read_user_buf_avail = 0;
        idx = tomoyo_read_lock();
        /* Read a line and dispatch it to the policy handler. */
        while (avail_len > 0) {
                char c;

                /* Line buffer full: move the partial line to one twice as big. */
                if (head->w.avail >= head->writebuf_size - 1) {
                        const int len = head->writebuf_size * 2;
                        char *cp = kzalloc(len, GFP_NOFS);

                        if (!cp) {
                                error = -ENOMEM;
                                break;
                        }
                        memmove(cp, cp0, head->w.avail);
                        kfree(cp0);
                        head->write_buf = cp;
                        cp0 = cp;
                        head->writebuf_size = len;
                }
                if (get_user(c, buffer)) {
                        error = -EFAULT;
                        break;
                }
                buffer++;
                avail_len--;
                cp0[head->w.avail++] = c;
                /* Keep accumulating until the line is complete. */
                if (c != '\n')
                        continue;
                /* Replace the '\n' with a terminator and start a new line. */
                cp0[head->w.avail - 1] = '\0';
                head->w.avail = 0;
                tomoyo_normalize_line(cp0);
                /* "reset" restores the default namespace and read state. */
                if (!strcmp(cp0, "reset")) {
                        head->w.ns = &tomoyo_kernel_namespace;
                        head->w.domain = NULL;
                        memset(&head->r, 0, sizeof(head->r));
                        continue;
                }
                /* Don't allow updating policies by non manager programs. */
                switch (head->type) {
                case TOMOYO_PROCESS_STATUS:
                        /* This does not write anything. */
                        break;
                case TOMOYO_DOMAINPOLICY:
                        /* Selecting a domain needs no manager check. */
                        if (tomoyo_select_domain(head, cp0))
                                continue;
                        fallthrough;
                case TOMOYO_EXCEPTIONPOLICY:
                        /* Neither does this read-side filter setting. */
                        if (!strcmp(cp0, "select transition_only")) {
                                head->r.print_transition_related_only = true;
                                continue;
                        }
                        fallthrough;
                default:
                        if (!tomoyo_manager()) {
                                error = -EPERM;
                                goto out;
                        }
                }
                /*
                 * Only -EPERM aborts the write. Any other non-zero result
                 * is dropped and processing continues with the next line.
                 */
                switch (tomoyo_parse_policy(head, cp0)) {
                case -EPERM:
                        error = -EPERM;
                        goto out;
                case 0:
                        /* Count successful updates on policy-changing interfaces. */
                        switch (head->type) {
                        case TOMOYO_DOMAINPOLICY:
                        case TOMOYO_EXCEPTIONPOLICY:
                        case TOMOYO_STAT:
                        case TOMOYO_PROFILE:
                        case TOMOYO_MANAGER:
                                tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES);
                                break;
                        default:
                                break;
                        }
                        break;
                }
        }
out:
        tomoyo_read_unlock(idx);
        mutex_unlock(&head->io_sem);
        return error;
}

/**
 * tomoyo_close_control - close() for /sys/kernel/security/tomoyo/ interface.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * For the query interface, drops one observer; when that was the last one,
 * every waiter on tomoyo_answer_wait is woken so that it can notice. @head is
 * then handed to tomoyo_notify_gc().
 * NOTE(review): presumably tomoyo_notify_gc() is what releases @head and its
 * buffers - confirm at its definition.
 */
void tomoyo_close_control(struct tomoyo_io_buffer *head)
{
        /*
         * If the file is /sys/kernel/security/tomoyo/query , decrement the
         * observer counter.
         */
        if (head->type == TOMOYO_QUERY) {
                if (atomic_dec_and_test(&tomoyo_query_observers))
                        wake_up_all(&tomoyo_answer_wait);
        }
        tomoyo_notify_gc(head, false);
}

/**
 * tomoyo_check_profile - Check all profiles currently assigned to domains are defined.
 *
 * Marks the policy as loaded, then walks every domain. A namespace still at
 * profile version 20110903 is upgraded to 20150505 in place. If a domain's
 * namespace has any other unsupported version, or the domain refers to a
 * profile that is not defined, the problem is logged and the kernel panics.
 */
void tomoyo_check_profile(void)
{
        const int idx = tomoyo_read_lock();
        struct tomoyo_domain_info *domain;

        tomoyo_policy_loaded = true;
        pr_info("TOMOYO: 2.6.0\n");
        list_for_each_entry_rcu(domain, &tomoyo_domain_list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                struct tomoyo_policy_namespace *ns = domain->ns;
                const u8 profile = domain->profile;

                if (ns->profile_version == 20110903) {
                        pr_info_once("Converting profile version from %u to %u.\n",
                                     20110903, 20150505);
                        ns->profile_version = 20150505;
                }
                /* Supported version and a defined profile: domain is fine. */
                if (ns->profile_version == 20150505 &&
                    ns->profile_ptr[profile])
                        continue;
                if (ns->profile_version != 20150505)
                        pr_err("Profile version %u is not supported.\n",
                               ns->profile_version);
                else
                        pr_err("Profile %u (used by '%s') is not defined.\n",
                               profile, domain->domainname->name);
                pr_err("Userland tools for TOMOYO 2.6 must be installed and policy must be initialized.\n");
                pr_err("Please see https://tomoyo.sourceforge.net/2.6/ for more information.\n");
                panic("STOP!");
        }
        tomoyo_read_unlock(idx);
        pr_info("Mandatory Access Control activated.\n");
}

/**
 * tomoyo_load_builtin_policy - Load built-in policy.
 *
 * Feeds five built-in policy texts (profile, exception policy, domain policy,
 * manager, stat), in that order, to tomoyo_parse_policy() one line at a time,
 * using the matching write handler for each. The texts are split in place at
 * each '\n'; anything after the last '\n' of a text is ignored. Results of
 * tomoyo_parse_policy() are not checked.
 *
 * Returns nothing.
 */
void __init tomoyo_load_builtin_policy(void)
{
#ifdef CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING
        /* Minimal defaults: profile 0 in learning mode, nothing else. */
        static char tomoyo_builtin_profile[] __initdata =
                "PROFILE_VERSION=20150505\n"
                "0-CONFIG={ mode=learning grant_log=no reject_log=yes }\n";
        static char tomoyo_builtin_exception_policy[] __initdata =
                "aggregator proc:/self/exe /proc/self/exe\n";
        static char tomoyo_builtin_domain_policy[] __initdata = "";
        static char tomoyo_builtin_manager[] __initdata = "";
        static char tomoyo_builtin_stat[] __initdata = "";
#else
        /*
         * This include file is manually created and contains built-in policy
         * named "tomoyo_builtin_profile", "tomoyo_builtin_exception_policy",
         * "tomoyo_builtin_domain_policy", "tomoyo_builtin_manager",
         * "tomoyo_builtin_stat" in the form of "static char [] __initdata".
         */
#include "builtin-policy.h"
#endif
        u8 i;
        const int idx = tomoyo_read_lock();

        for (i = 0; i < 5; i++) {
                /* A fresh, zeroed I/O buffer for each policy text. */
                struct tomoyo_io_buffer head = { };
                char *start = "";

                switch (i) {
                case 0:
                        start = tomoyo_builtin_profile;
                        head.type = TOMOYO_PROFILE;
                        head.write = tomoyo_write_profile;
                        break;
                case 1:
                        start = tomoyo_builtin_exception_policy;
                        head.type = TOMOYO_EXCEPTIONPOLICY;
                        head.write = tomoyo_write_exception;
                        break;
                case 2:
                        start = tomoyo_builtin_domain_policy;
                        head.type = TOMOYO_DOMAINPOLICY;
                        head.write = tomoyo_write_domain;
                        break;
                case 3:
                        start = tomoyo_builtin_manager;
                        head.type = TOMOYO_MANAGER;
                        head.write = tomoyo_write_manager;
                        break;
                case 4:
                        start = tomoyo_builtin_stat;
                        head.type = TOMOYO_STAT;
                        head.write = tomoyo_write_stat;
                        break;
                }
                /* Split the text at each '\n' and parse line by line. */
                while (1) {
                        char *end = strchr(start, '\n');

                        if (!end)
                                break;
                        *end = '\0';
                        tomoyo_normalize_line(start);
                        /* The line itself serves as the write buffer. */
                        head.write_buf = start;
                        tomoyo_parse_policy(&head, start);
                        start = end + 1;
                }
        }
        tomoyo_read_unlock(idx);
#ifdef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER
        /* No userspace loader will run, so validate profiles right away. */
        tomoyo_check_profile();
#endif
}









































































































    1 





    1 
    1 

    2 


    2 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
// SPDX-License-Identifier: GPL-2.0-only
/*
 * IPv6 library code, needed by static components when full IPv6 support is
 * not configured or static.  These functions are needed by GSO/GRO implementation.
 */
#include <linux/export.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/addrconf.h>
#include <net/secure_seq.h>
#include <linux/netfilter.h>

/* Produce a fragment identifier.  @net, @dst and @src are accepted but not
 * consulted: the value comes straight from get_random_u32_above(0).
 */
static u32 __ipv6_select_ident(struct net *net,
                               const struct in6_addr *dst,
                               const struct in6_addr *src)
{
        u32 ident = get_random_u32_above(0);

        return ident;
}

/* Pick a fragment ID for tap drivers that must support broken clients
 * requesting UFO without specifying an IPv6 fragment ID.
 *
 * The source and destination addresses are read from the packet's IPv6
 * header (so the network header must be set before calling this) and handed
 * to __ipv6_select_ident(), which as written ignores them and returns a
 * random identifier.
 *
 * Returns the identifier in network byte order, or 0 if the addresses
 * cannot be read from @skb.
 */
__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
{
        const int addr_off = skb_network_offset(skb) +
                             offsetof(struct ipv6hdr, saddr);
        struct in6_addr scratch[2];
        struct in6_addr *pair;

        /* saddr and daddr are adjacent in the header: fetch both at once. */
        pair = skb_header_pointer(skb, addr_off, sizeof(scratch), scratch);
        if (!pair)
                return 0;

        /* pair[0] is the source address, pair[1] the destination. */
        return htonl(__ipv6_select_ident(net, &pair[1], &pair[0]));
}
EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);

/* Return a fragment identifier, in network byte order, for traffic from
 * @saddr to @daddr in namespace @net.
 */
__be32 ipv6_select_ident(struct net *net,
                         const struct in6_addr *daddr,
                         const struct in6_addr *saddr)
{
        return htonl(__ipv6_select_ident(net, daddr, saddr));
}
EXPORT_SYMBOL(ipv6_select_ident);

/*
 * ip6_find_1stfragopt - find where the leading extension-header run ends.
 * @skb: packet; its network header must be set.
 * @nexthdr: on return, points at the "next header" byte of the last header
 *           that was stepped over (initially that of the fixed IPv6 header).
 *
 * Walks hop-by-hop, routing and destination-options headers starting right
 * after the fixed IPv6 header.  Returns the offset, relative to the network
 * header, of the first header that is none of those, or of a
 * destination-options header that comes after a routing header (unless,
 * with CONFIG_IPV6_MIP6, it carries an IPV6_TLV_HAO option).
 *
 * Returns -EINVAL if the chain does not fit in the data between the network
 * header and skb_tail_pointer(), or if the offset grows past IPV6_MAXPLEN.
 */
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
        unsigned int offset = sizeof(struct ipv6hdr);
        unsigned int packet_len = skb_tail_pointer(skb) -
                skb_network_header(skb);
        int found_rhdr = 0;
        *nexthdr = &ipv6_hdr(skb)->nexthdr;

        while (offset <= packet_len) {
                struct ipv6_opt_hdr *exthdr;

                /* Decide whether the header at @offset is to be skipped. */
                switch (**nexthdr) {

                case NEXTHDR_HOP:
                        break;
                case NEXTHDR_ROUTING:
                        found_rhdr = 1;
                        break;
                case NEXTHDR_DEST:
#if IS_ENABLED(CONFIG_IPV6_MIP6)
                        /* A dest-opts header with a HAO TLV is always skipped. */
                        if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
                                break;
#endif
                        /* Dest-opts after a routing header ends the walk. */
                        if (found_rhdr)
                                return offset;
                        break;
                default:
                        return offset;
                }

                /* The option header itself must fit before it is read. */
                if (offset + sizeof(struct ipv6_opt_hdr) > packet_len)
                        return -EINVAL;

                exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
                                                 offset);
                offset += ipv6_optlen(exthdr);
                if (offset > IPV6_MAXPLEN)
                        return -EINVAL;
                *nexthdr = &exthdr->nexthdr;
        }

        /* Ran off the end of the available data without finding a stop. */
        return -EINVAL;
}
EXPORT_SYMBOL(ip6_find_1stfragopt);

#if IS_ENABLED(CONFIG_IPV6)
/* Return the hop limit to use for @dst.
 *
 * The route's RTAX_HOPLIMIT metric wins when it is non-zero.  Otherwise the
 * value is taken from the output device's inet6_dev configuration, or from
 * the namespace's devconf_all when the device has no inet6_dev.
 */
int ip6_dst_hoplimit(struct dst_entry *dst)
{
        struct net_device *dev;
        struct inet6_dev *idev;
        int limit;

        limit = dst_metric_raw(dst, RTAX_HOPLIMIT);
        if (limit != 0)
                return limit;

        dev = dst->dev;

        /* The inet6_dev lookup is done under the RCU read lock. */
        rcu_read_lock();
        idev = __in6_dev_get(dev);
        limit = idev ? READ_ONCE(idev->cnf.hop_limit)
                     : READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit);
        rcu_read_unlock();

        return limit;
}
EXPORT_SYMBOL(ip6_dst_hoplimit);
#endif

/* Finalise a locally generated IPv6 packet and run the LOCAL_OUT hook.
 *
 * Fills in payload_len (0 when the payload exceeds IPV6_MAXPLEN), records
 * the nexthdr offset in the control block, gives an L3 master device the
 * chance to take the skb, then invokes the NF_INET_LOCAL_OUT netfilter hook.
 *
 * Returns 0 if l3mdev_ip6_out() returned no skb, otherwise whatever
 * nf_hook() returns.
 */
int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int plen = skb->len - sizeof(struct ipv6hdr);
        struct net_device *out_dev;

        if (plen > IPV6_MAXPLEN)
                plen = 0;
        ipv6_hdr(skb)->payload_len = htons(plen);
        IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);

        /* if egress device is enslaved to an L3 master device pass the
         * skb to its handler for processing
         */
        skb = l3mdev_ip6_out(sk, skb);
        if (unlikely(!skb))
                return 0;

        skb->protocol = htons(ETH_P_IPV6);
        out_dev = skb_dst(skb)->dev;

        return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                       net, sk, skb, NULL, out_dev,
                       dst_output);
}
EXPORT_SYMBOL_GPL(__ip6_local_out);

/* Send a locally generated IPv6 packet.
 *
 * Runs __ip6_local_out(); when that returns 1 the packet is passed on to
 * dst_output() and its result is returned.  Any other value is returned
 * unchanged.
 */
int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int rc = __ip6_local_out(net, sk, skb);

        if (unlikely(rc != 1))
                return rc;

        return dst_output(net, sk, skb);
}
EXPORT_SYMBOL_GPL(ip6_local_out);













































    2 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _NF_CONNTRACK_LABELS_H
#define _NF_CONNTRACK_LABELS_H

#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_conntrack_tuple_common.h>
#include <linux/types.h>
#include <net/net_namespace.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <uapi/linux/netfilter/xt_connlabel.h>

#define NF_CT_LABELS_MAX_SIZE ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE)

/* Per-connection label storage: NF_CT_LABELS_MAX_SIZE bytes, kept as an
 * array of unsigned long.
 */
struct nf_conn_labels {
        unsigned long bits[NF_CT_LABELS_MAX_SIZE / sizeof(long)];
};

/* Look up the labels extension of @ct.
 *
 * Open-coded instead of calling nf_ct_ext_find() because the flow dissector
 * cannot use symbols exported by the nf_conntrack module.
 *
 * Returns NULL when @ct has no extension area, when the labels extension is
 * absent, or when CONFIG_NF_CONNTRACK_LABELS is not enabled.
 */
static inline struct nf_conn_labels *nf_ct_labels_find(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_LABELS
        struct nf_ct_ext *ext = ct->ext;

        /* The extension lives at a per-type offset inside the ext area. */
        if (ext && __nf_ct_ext_exist(ext, NF_CT_EXT_LABELS))
                return (void *)ext + ext->offset[NF_CT_EXT_LABELS];
#endif
        return NULL;
}

/* Attach a labels extension to @ct.
 *
 * Nothing is added (NULL is returned) while the namespace's labels_used
 * counter is zero, or when CONFIG_NF_CONNTRACK_LABELS is not enabled.
 * Otherwise returns the result of nf_ct_ext_add() with GFP_ATOMIC.
 */
static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_LABELS
        if (atomic_read(&nf_ct_net(ct)->ct.labels_used) != 0)
                return nf_ct_ext_add(ct, NF_CT_EXT_LABELS, GFP_ATOMIC);
#endif
        return NULL;
}

/* Defined outside this header. */
int nf_connlabels_replace(struct nf_conn *ct,
                          const u32 *data, const u32 *mask, unsigned int words);

#ifdef CONFIG_NF_CONNTRACK_LABELS
int nf_connlabels_get(struct net *net, unsigned int bit);
void nf_connlabels_put(struct net *net);
#else
/* Stubs used without CONFIG_NF_CONNTRACK_LABELS: get returns 0, put is a no-op. */
static inline int nf_connlabels_get(struct net *net, unsigned int bit) { return 0; }
static inline void nf_connlabels_put(struct net *net) {}
#endif

#endif /* _NF_CONNTRACK_LABELS_H */


















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _IPV6_FRAG_H
#define _IPV6_FRAG_H
#include <linux/icmpv6.h>
#include <linux/kernel.h>
#include <net/addrconf.h>
#include <net/ipv6.h>
#include <net/inet_frag.h>

/*
 * Identifies who is requesting IPv6 reassembly.
 *
 * Each CONNTRACK user is followed by a double-underscore sentinel placed
 * USHRT_MAX above it, so the next user starts past a range of
 * USHRT_MAX + 1 values.
 * NOTE(review): presumably the range lets a 16-bit id be added to the base
 * value - confirm against the callers.
 */
enum ip6_defrag_users {
        IP6_DEFRAG_LOCAL_DELIVER,
        IP6_DEFRAG_CONNTRACK_IN,
        __IP6_DEFRAG_CONNTRACK_IN        = IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX,
        IP6_DEFRAG_CONNTRACK_OUT,
        __IP6_DEFRAG_CONNTRACK_OUT        = IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX,
        IP6_DEFRAG_CONNTRACK_BRIDGE_IN,
        __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
};

/*
 *        Equivalent of ipv4 struct ip
 *
 *        IPv6 reassembly queue: the generic inet_frag_queue plus a few
 *        IPv6-specific fields.
 */
struct frag_queue {
        /* Generic queue state; container_of() maps it back to this struct. */
        struct inet_frag_queue        q;

        /* Interface index, looked up with dev_get_by_index_rcu() on expiry. */
        int                        iif;
        /* NOTE(review): presumably the offset of the nexthdr byte - confirm. */
        __u16                        nhoffset;
        /* Cleared to 0 by ip6frag_init(). */
        u8                        ecn;
};

#if IS_ENABLED(CONFIG_IPV6)
/* Initialise a freshly created IPv6 fragment queue: copy the lookup key @a
 * (a struct frag_v6_compare_key) into @q and clear the ecn field of the
 * enclosing frag_queue.
 */
static inline void ip6frag_init(struct inet_frag_queue *q, const void *a)
{
        const struct frag_v6_compare_key *lookup_key = a;
        struct frag_queue *fq;

        q->key.v6 = *lookup_key;

        fq = container_of(q, struct frag_queue, q);
        fq->ecn = 0;
}

/* Hash a struct frag_v6_compare_key at @data with jhash2().  @len is
 * ignored; the word count is derived from the key type.
 */
static inline u32 ip6frag_key_hashfn(const void *data, u32 len, u32 seed)
{
        const u32 nwords = sizeof(struct frag_v6_compare_key) / sizeof(u32);

        return jhash2(data, nwords, seed);
}

/* Hash the IPv6 key stored in the inet_frag_queue at @data with jhash2(),
 * using the same word count as ip6frag_key_hashfn().  @len is ignored.
 */
static inline u32 ip6frag_obj_hashfn(const void *data, u32 len, u32 seed)
{
        const u32 nwords = sizeof(struct frag_v6_compare_key) / sizeof(u32);
        const struct inet_frag_queue *fq = data;
        const u32 *words = (const u32 *)&fq->key.v6;

        return jhash2(words, nwords, seed);
}

static inline int
ip6frag_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
{
        const struct frag_v6_compare_key *key = arg->key;
        const struct inet_frag_queue *fq = ptr;

        return !!memcmp(&fq->key, key, sizeof(*key));
}

/*
 * Handle expiry of the reassembly queue @fq in namespace @net.
 *
 * Unless the fqdir is dead or the queue is already complete, the queue is
 * flagged INET_FRAG_DROP and killed, and the REASMFAILS and REASMTIMEOUT
 * counters are bumped for the receiving device.  If the first fragment had
 * arrived, the head skb is pulled out, used to send an ICMPv6 "fragment
 * reassembly time exceeded" message, and then freed.
 *
 * Every path ends by dropping one reference on the queue with
 * inet_frag_put().
 */
static inline void
ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
{
        struct net_device *dev = NULL;
        struct sk_buff *head;

        rcu_read_lock();
        /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */
        if (READ_ONCE(fq->q.fqdir->dead))
                goto out_rcu_unlock;
        spin_lock(&fq->q.lock);

        /* Nothing to expire if the queue is already complete. */
        if (fq->q.flags & INET_FRAG_COMPLETE)
                goto out;

        fq->q.flags |= INET_FRAG_DROP;
        inet_frag_kill(&fq->q);

        /* Without the receiving device there are no stats or ICMP error. */
        dev = dev_get_by_index_rcu(net, fq->iif);
        if (!dev)
                goto out;

        __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
        __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);

        /* Don't send error if the first segment did not arrive. */
        if (!(fq->q.flags & INET_FRAG_FIRST_IN))
                goto out;

        /* sk_buff::dev and sk_buff::rbnode are unionized. So we
         * pull the head out of the tree in order to be able to
         * deal with head->dev.
         */
        head = inet_frag_pull_head(&fq->q);
        if (!head)
                goto out;

        head->dev = dev;
        /* The queue lock is released before the ICMPv6 error is sent. */
        spin_unlock(&fq->q.lock);

        icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
        kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT);
        /* Lock already dropped above: skip the unlock at "out". */
        goto out_rcu_unlock;

out:
        spin_unlock(&fq->q.lock);
out_rcu_unlock:
        rcu_read_unlock();
        inet_frag_put(&fq->q);
}

/* Report whether the upper-layer header of a first fragment is cut short.
 *
 * Extension headers are skipped starting at @start with *@nexthdrp as the
 * initial protocol.  Returns true when @skb is too short to hold a full
 * TCP, UDP or ICMPv6 header (or at least one byte for any other protocol)
 * at the resulting offset.  Returns false when the headers cannot be
 * skipped or when the fragment offset is non-zero.
 */
static inline bool
ipv6frag_thdr_truncated(struct sk_buff *skb, int start, u8 *nexthdrp)
{
        u8 proto = *nexthdrp;
        __be16 frag_off;
        int end;

        end = ipv6_skip_exthdr(skb, start, &proto, &frag_off);
        if (end < 0)
                return false;
        if (frag_off & htons(IP6_OFFSET))
                return false;

        /* Advance past the minimum size of the upper-layer header. */
        if (proto == NEXTHDR_TCP)
                end += sizeof(struct tcphdr);
        else if (proto == NEXTHDR_UDP)
                end += sizeof(struct udphdr);
        else if (proto == NEXTHDR_ICMP)
                end += sizeof(struct icmp6hdr);
        else
                end += 1;

        return end > skb->len;
}

#endif
#endif


















































































    2 



















    2 




    2 






































    9 











    3 






    1 
















   10 







   10 


    5 

   10 










    9 
   10 


   10 


























    9 


    5 




   10 



























    2 


























    2 

    2 



















































    5 






















    3 





    4 



































    4 















    2 

































































































































    4 




    4 






































































    5 








    3 

    4 

    4 


    4 

    4 







    4 







    5 


    4 


    4 







    4 
    4 



































































    5 
    5 

















    6 












    1 


    1 


    5 

    5 






    5 
    5 


    5 











    5 

    5 




    4 
















    5 





























































    2 







    2 

    2 
















































































































































































































































































































































    1 












    2 

    1 




    1 

    2 














































    2 














    1 

    2 








    2 













    2 
    2 





    2 


    2 




    2 



    2 


    2 





















































































    8 



    9 





    9 









    1 



















    1 
















    1 



    1 






































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
// SPDX-License-Identifier: GPL-2.0+
/*
 * XArray implementation
 * Copyright (c) 2017-2018 Microsoft Corporation
 * Copyright (c) 2018-2020 Oracle
 * Author: Matthew Wilcox <willy@infradead.org>
 */

#include <linux/bitmap.h>
#include <linux/export.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include "radix-tree.h"

/*
 * Coding conventions in this file:
 *
 * @xa is used to refer to the entire xarray.
 * @xas is the 'xarray operation state'.  It may be either a pointer to
 * an xa_state, or an xa_state stored on the stack.  This is an unfortunate
 * ambiguity.
 * @index is the index of the entry being operated on
 * @mark is an xa_mark_t; a small number indicating one of the mark bits.
 * @node refers to an xa_node; usually the primary one being operated on by
 * this function.
 * @offset is the index into the slots array inside an xa_node.
 * @parent refers to the @xa_node closer to the head than @node.
 * @entry refers to something stored in a slot in the xarray
 */

static inline unsigned int xa_lock_type(const struct xarray *xa)
{
        return (__force unsigned int)xa->xa_flags & 3;
}

/* Take the xarray lock using the variant selected by @lock_type:
 * IRQ-disabling, BH-disabling, or plain for any other value.
 */
static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type)
{
        switch (lock_type) {
        case XA_LOCK_IRQ:
                xas_lock_irq(xas);
                break;
        case XA_LOCK_BH:
                xas_lock_bh(xas);
                break;
        default:
                xas_lock(xas);
                break;
        }
}

/* Release the xarray lock using the variant selected by @lock_type; the
 * counterpart of xas_lock_type().
 */
static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type)
{
        switch (lock_type) {
        case XA_LOCK_IRQ:
                xas_unlock_irq(xas);
                break;
        case XA_LOCK_BH:
                xas_unlock_bh(xas);
                break;
        default:
                xas_unlock(xas);
                break;
        }
}

/* True if @xa has XA_FLAGS_TRACK_FREE set. */
static inline bool xa_track_free(const struct xarray *xa)
{
        return !!(xa->xa_flags & XA_FLAGS_TRACK_FREE);
}

/* True if @xa has XA_FLAGS_ZERO_BUSY set. */
static inline bool xa_zero_busy(const struct xarray *xa)
{
        return !!(xa->xa_flags & XA_FLAGS_ZERO_BUSY);
}

/* Set the array-level flag for @mark; the flags word is only written when
 * the flag is not already set.
 */
static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark)
{
        if (xa->xa_flags & XA_FLAGS_MARK(mark))
                return;

        xa->xa_flags |= XA_FLAGS_MARK(mark);
}

/* Clear the array-level flag for @mark; the flags word is only written
 * when the flag is currently set.
 */
static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark)
{
        if (!(xa->xa_flags & XA_FLAGS_MARK(mark)))
                return;

        xa->xa_flags &= ~(XA_FLAGS_MARK(mark));
}

/* Return @node's bitmap for @mark. */
static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark)
{
        const unsigned int idx = (__force unsigned)mark;

        return node->marks[idx];
}

/* True if slot @offset of @node has @mark set. */
static inline bool node_get_mark(struct xa_node *node,
                unsigned int offset, xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        return test_bit(offset, bits);
}

/* Set @mark on slot @offset of @node with the non-atomic
 * __test_and_set_bit(); returns true if the bit was already set.
 */
static inline bool node_set_mark(struct xa_node *node, unsigned int offset,
                                xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        return __test_and_set_bit(offset, bits);
}

/* Clear @mark on slot @offset of @node with the non-atomic
 * __test_and_clear_bit(); returns true if the bit had been set.
 */
static inline bool node_clear_mark(struct xa_node *node, unsigned int offset,
                                xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        return __test_and_clear_bit(offset, bits);
}

/* True if at least one of @node's XA_CHUNK_SIZE slots has @mark set. */
static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark)
{
        if (bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE))
                return false;

        return true;
}

/* Set @mark on all XA_CHUNK_SIZE slots of @node. */
static inline void node_mark_all(struct xa_node *node, xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        bitmap_fill(bits, XA_CHUNK_SIZE);
}

/*
 * Advance @mark to the next mark value by casting through unsigned and
 * back to xa_mark_t.  @mark is evaluated twice and must be an lvalue.
 */
#define mark_inc(mark) do { \
        mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \
} while (0)

/*
 * xas_squash_marks() - Merge all marks to the first entry
 * @xas: Array operation state.
 *
 * Set a mark on the first entry if any entry has it set.  Clear marks on
 * all sibling entries.
 */
static void xas_squash_marks(const struct xa_state *xas)
{
        unsigned int mark = 0;
        unsigned int limit = xas->xa_offset + xas->xa_sibs + 1;

        if (!xas->xa_sibs)
                return;

        do {
                unsigned long *marks = xas->xa_node->marks[mark];
                if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit)
                        continue;
                __set_bit(xas->xa_offset, marks);
                bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs);
        } while (mark++ != (__force unsigned)XA_MARK_MAX);
}

/* Compute which slot of @node the given @index falls into: shift the index
 * down by the node's shift and keep the low XA_CHUNK_MASK bits.
 */
static unsigned int get_offset(unsigned long index, struct xa_node *node)
{
        unsigned long slot = index >> node->shift;

        return slot & XA_CHUNK_MASK;
}

/* Recompute @xas->xa_offset from @xas->xa_index for the current node. */
static void xas_set_offset(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        xas->xa_offset = get_offset(xas->xa_index, node);
}

/*
 * Move the index either forwards (find) or backwards (sibling slot):
 * clear the bits of @xas->xa_index at and below the current node's level,
 * then add @offset scaled by the node's shift.
 */
static void xas_move_index(struct xa_state *xas, unsigned long offset)
{
        unsigned int shift = xas->xa_node->shift;
        unsigned long keep = ~XA_CHUNK_MASK << shift;
        unsigned long base = xas->xa_index & keep;

        xas->xa_index = base + (offset << shift);
}

/* Step to the next slot in the current node and update the index to match. */
static void xas_next_offset(struct xa_state *xas)
{
        xas_move_index(xas, ++xas->xa_offset);
}

/* Put @xas into the XAS_BOUNDS state; always returns NULL for the caller. */
static void *set_bounds(struct xa_state *xas)
{
        void *nothing = NULL;

        xas->xa_node = XAS_BOUNDS;
        return nothing;
}

/*
 * Starts a walk.  If the @xas is already valid, we assume that it's on
 * the right path and just reload the entry it points at.  If we're in an
 * error state, return NULL.  If the index is outside the current scope
 * of the xarray, put @xas into the bounds state and return NULL.
 * Otherwise set @xas->xa_node to NULL and return the current head of
 * the array.
 */
static void *xas_start(struct xa_state *xas)
{
        void *head;
        bool out_of_range;

        if (xas_valid(xas))
                return xas_reload(xas);
        if (xas_error(xas))
                return NULL;

        head = xa_head(xas->xa);
        if (xa_is_node(head))
                out_of_range = (xas->xa_index >> xa_to_node(head)->shift) >
                                XA_CHUNK_MASK;
        else
                /* Without a top node, only index 0 is in range */
                out_of_range = xas->xa_index != 0;

        if (out_of_range)
                return set_bounds(xas);

        xas->xa_node = NULL;
        return head;
}

/*
 * Step @xas down into @node: select the slot for @xas->xa_index, follow
 * sibling entries back to the slot they refer to, and record the node and
 * final slot in @xas.  Returns the entry found, or a retry entry if a
 * sibling in an interior node resolves to a child node.
 */
static __always_inline void *xas_descend(struct xa_state *xas,
                                        struct xa_node *node)
{
        unsigned int slot = get_offset(xas->xa_index, node);
        void *found;

        xas->xa_node = node;
        for (found = xa_entry(xas->xa, node, slot); xa_is_sibling(found); ) {
                slot = xa_to_sibling(found);
                found = xa_entry(xas->xa, node, slot);
                if (node->shift && xa_is_node(found))
                        found = XA_RETRY_ENTRY;
        }

        xas->xa_offset = slot;
        return found;
}

/**
 * xas_load() - Load an entry from the XArray (advanced).
 * @xas: XArray operation state.
 *
 * Usually walks the @xas to the appropriate state to load the entry
 * stored at xa_index.  However, it will do nothing and return %NULL if
 * @xas is in an error state.  xas_load() will never expand the tree.
 *
 * If the xa_state is set up to operate on a multi-index entry, xas_load()
 * may return %NULL or an internal entry, even if there are entries
 * present within the range specified by @xas.
 *
 * Context: Any context.  The caller should hold the xa_lock or the RCU lock.
 * Return: Usually an entry in the XArray, but see description for exceptions.
 */
void *xas_load(struct xa_state *xas)
{
        struct xa_node *node;
        void *entry;

        for (entry = xas_start(xas); xa_is_node(entry); ) {
                node = xa_to_node(entry);

                /* Do not go below the level @xas is set up to operate on */
                if (node->shift < xas->xa_shift)
                        return entry;
                entry = xas_descend(xas, node);
                /* That was the bottom level; nothing further to walk */
                if (!node->shift)
                        return entry;
        }
        return entry;
}
EXPORT_SYMBOL_GPL(xas_load);

/* Sentinel stored in node->array by xa_node_free() before the RCU callback */
#define XA_RCU_FREE        ((struct xarray *)1)

/*
 * Queue @node to be freed via call_rcu().  The node is tagged by pointing
 * node->array at XA_RCU_FREE first.
 */
static void xa_node_free(struct xa_node *node)
{
        struct rcu_head *head = &node->rcu_head;

        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
        node->array = XA_RCU_FREE;
        call_rcu(head, radix_tree_node_rcu_free);
}

/*
 * xas_destroy() - Free any resources allocated during the XArray operation.
 * @xas: XArray operation state.
 *
 * Frees every node on the @xas->xa_alloc list (the nodes are chained
 * through their parent pointers) and leaves the list empty.
 *
 * Most users will not need to call this function; it is called for you
 * by xas_nomem().
 */
void xas_destroy(struct xa_state *xas)
{
        struct xa_node *node, *next;

        for (node = xas->xa_alloc; node; node = next) {
                XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
                /* Fetch the link before the node is released */
                next = rcu_dereference_raw(node->parent);
                radix_tree_node_rcu_free(&node->rcu_head);
                xas->xa_alloc = next;
        }
}

/**
 * xas_nomem() - Allocate memory if needed.
 * @xas: XArray operation state.
 * @gfp: Memory allocation flags.
 *
 * If we need to add new nodes to the XArray, we try to allocate memory
 * with GFP_NOWAIT while holding the lock, which will usually succeed.
 * If it fails, @xas is flagged as needing memory to continue.  The caller
 * should drop the lock and call xas_nomem().  If xas_nomem() succeeds,
 * the caller should retry the operation.
 *
 * Forward progress is guaranteed as one node is allocated here and
 * stored in the xa_state where it will be found by xas_alloc().  More
 * nodes will likely be found in the slab allocator, but we do not tie
 * them up here.
 *
 * If @xas is not in the -ENOMEM state, any nodes still held by @xas are
 * released via xas_destroy() and false is returned.
 *
 * Return: true if memory was needed, and was successfully allocated.
 */
bool xas_nomem(struct xa_state *xas, gfp_t gfp)
{
        struct xa_node *node;

        if (xas->xa_node != XA_ERROR(-ENOMEM)) {
                xas_destroy(xas);
                return false;
        }

        if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
                gfp |= __GFP_ACCOUNT;

        node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp);
        xas->xa_alloc = node;
        if (!node)
                return false;

        node->parent = NULL;
        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
        /* The operation can now be retried from the top */
        xas->xa_node = XAS_RESTART;
        return true;
}
EXPORT_SYMBOL_GPL(xas_nomem);

/*
 * __xas_nomem() - Drop locks and allocate memory if needed.
 * @xas: XArray operation state.
 * @gfp: Memory allocation flags.
 *
 * Internal variant of xas_nomem().  Called with the array lock held; the
 * lock is released around the allocation only when @gfp allows blocking,
 * and is held again on return.
 *
 * Return: true if memory was needed, and was successfully allocated.
 */
static bool __xas_nomem(struct xa_state *xas, gfp_t gfp)
        __must_hold(xas->xa->xa_lock)
{
        unsigned int lock_type = xa_lock_type(xas->xa);
        struct xa_node *node;
        bool may_block;

        if (xas->xa_node != XA_ERROR(-ENOMEM)) {
                xas_destroy(xas);
                return false;
        }

        if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
                gfp |= __GFP_ACCOUNT;

        may_block = gfpflags_allow_blocking(gfp);
        if (may_block)
                xas_unlock_type(xas, lock_type);
        node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp);
        xas->xa_alloc = node;
        if (may_block)
                xas_lock_type(xas, lock_type);

        if (!node)
                return false;

        node->parent = NULL;
        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
        xas->xa_node = XAS_RESTART;
        return true;
}

/*
 * Notify the user of @xas that @node has changed, via the xa_update
 * callback if one is set.  Without a callback, the node is expected not
 * to be on any private list.
 */
static void xas_update(struct xa_state *xas, struct xa_node *node)
{
        if (xas->xa_update) {
                xas->xa_update(node);
                return;
        }

        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
}

/*
 * xas_alloc() - Obtain a node and initialise it for use at @shift.
 * @xas: XArray operation state.
 * @shift: Shift value to give the new node.
 *
 * Uses the node preallocated in @xas->xa_alloc if there is one, otherwise
 * allocates with GFP_NOWAIT.  If @xas currently points at a node, that node
 * becomes the parent and has its count incremented.  The new node is not
 * stored into any slot here; that is left to the caller.
 *
 * Return: The new node, or NULL if @xas is invalid or allocation failed
 * (in which case @xas is set to -ENOMEM).
 */
static void *xas_alloc(struct xa_state *xas, unsigned int shift)
{
        struct xa_node *parent, *node;

        if (xas_invalid(xas))
                return NULL;

        parent = xas->xa_node;
        node = xas->xa_alloc;
        if (!node) {
                gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;

                if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
                        gfp |= __GFP_ACCOUNT;

                node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp);
                if (!node) {
                        xas_set_err(xas, -ENOMEM);
                        return NULL;
                }
        } else {
                /* Consume the preallocated node */
                xas->xa_alloc = NULL;
        }

        if (parent) {
                node->offset = xas->xa_offset;
                parent->count++;
                XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE);
                xas_update(xas, parent);
        }
        XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
        node->shift = shift;
        node->count = 0;
        node->nr_values = 0;
        RCU_INIT_POINTER(node->parent, xas->xa_node);
        node->array = xas->xa;

        return node;
}

#ifdef CONFIG_XARRAY_MULTI
/*
 * Returns the number of indices covered by a given xa_state:
 * (siblings + 1) slots, each spanning 1 << xa_shift indices.
 */
static unsigned long xas_size(const struct xa_state *xas)
{
        unsigned long slots = xas->xa_sibs + 1UL;

        return slots << xas->xa_shift;
}
#endif

/*
 * Use this to calculate the maximum index that will need to be created
 * in order to add the entry described by @xas.  Because we cannot store a
 * multi-index entry at index 0, the calculation is a little more complex
 * than you might expect: a range that starts at index 0 is reported as
 * reaching one index further than its last index.
 */
static unsigned long xas_max(struct xa_state *xas)
{
#ifdef CONFIG_XARRAY_MULTI
        if (xas->xa_shift || xas->xa_sibs) {
                unsigned long mask = xas_size(xas) - 1;
                unsigned long last = xas->xa_index | mask;

                return (last == mask) ? last + 1 : last;
        }
#endif

        return xas->xa_index;
}

/* The maximum index that can be contained in the array without expanding it */
static unsigned long max_index(void *entry)
{
        if (!xa_is_node(entry))
                return 0;
        return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1;
}

/*
 * xas_shrink() - Reduce the height of the tree where possible.
 * @xas: Array operation state; the walk starts at @xas->xa_node.
 *
 * While the node being examined holds exactly one entry and that entry is
 * in slot 0, make that entry the new head of the array and free the node.
 * A node with a non-zero shift is only removed when slot 0 holds a child
 * node.  Once a node has been removed, @xas is left in the bounds state.
 */
static void xas_shrink(struct xa_state *xas)
{
        struct xarray *xa = xas->xa;
        struct xa_node *node = xas->xa_node;

        for (;;) {
                void *entry;

                XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
                /* Only a node with a single occupied slot can be removed */
                if (node->count != 1)
                        break;
                entry = xa_entry_locked(xa, node, 0);
                /* The single entry is not in slot 0 */
                if (!entry)
                        break;
                if (!xa_is_node(entry) && node->shift)
                        break;
                if (xa_is_zero(entry) && xa_zero_busy(xa))
                        entry = NULL;
                xas->xa_node = XAS_BOUNDS;

                /* Slot 0's entry becomes the head of the array */
                RCU_INIT_POINTER(xa->xa_head, entry);
                if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK))
                        xa_mark_clear(xa, XA_FREE_MARK);

                node->count = 0;
                node->nr_values = 0;
                /* Leave a retry marker where the moved entry used to be */
                if (!xa_is_node(entry))
                        RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY);
                xas_update(xas, node);
                xa_node_free(node);
                if (!xa_is_node(entry))
                        break;
                /* The child is the new top node; see if it can go too */
                node = xa_to_node(entry);
                node->parent = NULL;
        }
}

/*
 * xas_delete_node() - Attempt to delete an xa_node
 * @xas: Array operation state.
 *
 * Attempts to delete @xas->xa_node.  Nothing is deleted if the node still
 * has a non-zero entry count.  After an empty node is freed, its slot in
 * the parent is cleared and the parent is examined in turn, so a chain of
 * empty ancestors is removed.  If the topmost node is removed, the array
 * head is set to NULL and @xas is left in the bounds state.  Otherwise, if
 * the walk stops at a node without a parent, xas_shrink() is called.
 */
static void xas_delete_node(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        for (;;) {
                struct xa_node *parent;

                XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
                /* Still in use: stop here */
                if (node->count)
                        break;

                /* Move @xas up to the parent's slot before freeing the node */
                parent = xa_parent_locked(xas->xa, node);
                xas->xa_node = parent;
                xas->xa_offset = node->offset;
                xa_node_free(node);

                if (!parent) {
                        /* That was the top node; the array is now empty */
                        xas->xa->xa_head = NULL;
                        xas->xa_node = XAS_BOUNDS;
                        return;
                }

                parent->slots[xas->xa_offset] = NULL;
                parent->count--;
                XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE);
                node = parent;
                xas_update(xas, node);
        }

        if (!node->parent)
                xas_shrink(xas);
}

/**
 * xas_free_nodes() - Free this node and all nodes that it references
 * @xas: Array operation state.
 * @top: Node to free
 *
 * This node has been removed from the tree.  We must now free it and all
 * of its subnodes.  There may be RCU walkers with references into the tree,
 * so we must replace all entries with retry markers.
 *
 * The subtree is walked iteratively: descend into child nodes first, and
 * free each node once all of its slots have been visited.
 */
static void xas_free_nodes(struct xa_state *xas, struct xa_node *top)
{
        unsigned int offset = 0;
        struct xa_node *node = top;

        for (;;) {
                void *entry = xa_entry_locked(xas->xa, node, offset);

                /* Descend into a child node and start at its first slot */
                if (node->shift && xa_is_node(entry)) {
                        node = xa_to_node(entry);
                        offset = 0;
                        continue;
                }
                if (entry)
                        RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY);
                offset++;
                /* All slots visited: free this node and resume in the parent */
                while (offset == XA_CHUNK_SIZE) {
                        struct xa_node *parent;

                        parent = xa_parent_locked(xas->xa, node);
                        /* Continue after this node's slot in the parent */
                        offset = node->offset + 1;
                        node->count = 0;
                        node->nr_values = 0;
                        xas_update(xas, node);
                        xa_node_free(node);
                        if (node == top)
                                return;
                        node = parent;
                }
        }
}

/*
 * xas_expand adds nodes to the head of the tree until it has reached
 * sufficient height to be able to contain @xas->xa_index
 *
 * @xas: Array operation state.
 * @head: Current head entry of the array.
 *
 * The required maximum index is taken from xas_max().  If @head is NULL,
 * no nodes are allocated; only the shift needed for that index is
 * computed and returned (0 if the index is 0).  Otherwise new top nodes
 * are allocated and stacked above @head until the index fits, and
 * @xas->xa_node is left pointing at the last node added (or at the
 * existing top node, or NULL, if nothing needed adding).
 *
 * Return: The shift of the level above the top of the tree, or -ENOMEM
 * if a node could not be allocated.
 */
static int xas_expand(struct xa_state *xas, void *head)
{
        struct xarray *xa = xas->xa;
        struct xa_node *node = NULL;
        unsigned int shift = 0;
        unsigned long max = xas_max(xas);

        if (!head) {
                if (max == 0)
                        return 0;
                while ((max >> shift) >= XA_CHUNK_SIZE)
                        shift += XA_CHUNK_SHIFT;
                return shift + XA_CHUNK_SHIFT;
        } else if (xa_is_node(head)) {
                node = xa_to_node(head);
                shift = node->shift + XA_CHUNK_SHIFT;
        }
        /* New top nodes have no parent, so xas_alloc() must see NULL here */
        xas->xa_node = NULL;

        while (max > max_index(head)) {
                xa_mark_t mark = 0;

                XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
                node = xas_alloc(xas, shift);
                if (!node)
                        return -ENOMEM;

                /* The old head becomes the only entry, in slot 0 */
                node->count = 1;
                if (xa_is_value(head))
                        node->nr_values = 1;
                RCU_INIT_POINTER(node->slots[0], head);

                /* Propagate the aggregated mark info to the new child */
                for (;;) {
                        if (xa_track_free(xa) && mark == XA_FREE_MARK) {
                                /*
                                 * All slots of the new node start out marked
                                 * free; slot 0 only keeps the mark if the
                                 * array itself already had it set.
                                 */
                                node_mark_all(node, XA_FREE_MARK);
                                if (!xa_marked(xa, XA_FREE_MARK)) {
                                        node_clear_mark(node, 0, XA_FREE_MARK);
                                        xa_mark_set(xa, XA_FREE_MARK);
                                }
                        } else if (xa_marked(xa, mark)) {
                                node_set_mark(node, 0, mark);
                        }
                        if (mark == XA_MARK_MAX)
                                break;
                        mark_inc(mark);
                }

                /*
                 * Now that the new node is fully initialised, we can add
                 * it to the tree
                 */
                if (xa_is_node(head)) {
                        xa_to_node(head)->offset = 0;
                        rcu_assign_pointer(xa_to_node(head)->parent, node);
                }
                head = xa_mk_node(node);
                rcu_assign_pointer(xa->xa_head, head);
                xas_update(xas, node);

                shift += XA_CHUNK_SHIFT;
        }

        xas->xa_node = node;
        return shift;
}

/*
 * xas_create() - Create a slot to store an entry in.
 * @xas: XArray operation state.
 * @allow_root: %true if we can store the entry in the root directly
 *
 * Most users will not need to call this function directly, as it is called
 * by xas_store().  It is useful for doing conditional store operations
 * (see the xa_cmpxchg() implementation for an example).
 *
 * If @xas has not been walked yet, the tree is first expanded so that it
 * can hold the index.  The function then descends from the current
 * position, allocating missing nodes on the way, until it reaches the
 * level given by @xas->xa_shift or meets an entry that is not a node.
 *
 * Return: If the slot already existed, returns the contents of this slot.
 * If the slot was newly created, returns %NULL.  If it failed to create the
 * slot, returns %NULL and indicates the error in @xas.
 */
static void *xas_create(struct xa_state *xas, bool allow_root)
{
        struct xarray *xa = xas->xa;
        void *entry;
        void __rcu **slot;
        struct xa_node *node = xas->xa_node;
        int shift;
        unsigned int order = xas->xa_shift;

        if (xas_top(node)) {
                /* Not walked yet: start from the head, growing the tree first */
                entry = xa_head_locked(xa);
                xas->xa_node = NULL;
                if (!entry && xa_zero_busy(xa))
                        entry = XA_ZERO_ENTRY;
                shift = xas_expand(xas, entry);
                if (shift < 0)
                        return NULL;
                /* Force a node to be created when the root may not be used */
                if (!shift && !allow_root)
                        shift = XA_CHUNK_SHIFT;
                entry = xa_head_locked(xa);
                slot = &xa->xa_head;
        } else if (xas_error(xas)) {
                return NULL;
        } else if (node) {
                /* Already positioned on a slot within a node */
                unsigned int offset = xas->xa_offset;

                shift = node->shift;
                entry = xa_entry_locked(xa, node, offset);
                slot = &node->slots[offset];
        } else {
                /* Positioned on the head of the array */
                shift = 0;
                entry = xa_head_locked(xa);
                slot = &xa->xa_head;
        }

        while (shift > order) {
                shift -= XA_CHUNK_SHIFT;
                if (!entry) {
                        /* Empty slot: fill it with a new node and descend */
                        node = xas_alloc(xas, shift);
                        if (!node)
                                break;
                        if (xa_track_free(xa))
                                node_mark_all(node, XA_FREE_MARK);
                        rcu_assign_pointer(*slot, xa_mk_node(node));
                } else if (xa_is_node(entry)) {
                        node = xa_to_node(entry);
                } else {
                        /* A non-node entry occupies this slot; stop here */
                        break;
                }
                entry = xas_descend(xas, node);
                slot = &node->slots[xas->xa_offset];
        }

        return entry;
}

/**
 * xas_create_range() - Ensure that stores to this range will succeed
 * @xas: XArray operation state.
 *
 * Creates all of the slots in the range covered by @xas.  Sets @xas to
 * create single-index entries and positions it at the beginning of the
 * range.  This is for the benefit of users which have not yet been
 * converted to use multi-index entries.
 *
 * On failure the error is left in @xas and its original index, shift and
 * sibling count are restored.
 */
void xas_create_range(struct xa_state *xas)
{
        unsigned long index = xas->xa_index;
        unsigned char shift = xas->xa_shift;
        unsigned char sibs = xas->xa_sibs;

        /* Start at the last index of the range and work backwards */
        xas->xa_index |= ((sibs + 1UL) << shift) - 1;
        if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift)
                xas->xa_offset |= sibs;
        xas->xa_shift = 0;
        xas->xa_sibs = 0;

        for (;;) {
                xas_create(xas, true);
                if (xas_error(xas))
                        goto restore;
                /* Done once the chunk containing the first index exists */
                if (xas->xa_index <= (index | XA_CHUNK_MASK))
                        goto success;
                xas->xa_index -= XA_CHUNK_SIZE;

                /*
                 * Step @xas to the preceding slot, moving up a level each
                 * time the node just left was in slot 0 of its parent.
                 */
                for (;;) {
                        struct xa_node *node = xas->xa_node;
                        if (node->shift >= shift)
                                break;
                        xas->xa_node = xa_parent_locked(xas->xa, node);
                        xas->xa_offset = node->offset - 1;
                        if (node->offset != 0)
                                break;
                }
        }

restore:
        xas->xa_shift = shift;
        xas->xa_sibs = sibs;
        xas->xa_index = index;
        return;
success:
        xas->xa_index = index;
        if (xas->xa_node)
                xas_set_offset(xas);
}
EXPORT_SYMBOL_GPL(xas_create_range);

/*
 * Apply the @count and @values deltas to @node's counters and notify via
 * xas_update().  If the entry count went down, try to delete the node.
 * Does nothing for a NULL @node or when both deltas are zero.
 */
static void update_node(struct xa_state *xas, struct xa_node *node,
                int count, int values)
{
        if (!node)
                return;
        if (count == 0 && values == 0)
                return;

        node->count += count;
        node->nr_values += values;
        XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
        XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE);
        xas_update(xas, node);

        if (count < 0)
                xas_delete_node(xas);
}

/**
 * xas_store() - Store this entry in the XArray.
 * @xas: XArray operation state.
 * @entry: New entry.
 *
 * If @xas is operating on a multi-index entry, the entry returned by this
 * function is essentially meaningless (it may be an internal entry or it
 * may be %NULL, even if there are non-NULL entries at some of the indices
 * covered by the range).  This is not a problem for any current users,
 * and can be changed if needed.
 *
 * Storing a non-NULL @entry creates the slot if necessary; storing %NULL
 * only walks to the existing slot and never allocates.  If @xas is invalid
 * after that walk, nothing is stored.
 *
 * Return: The old entry at this index.
 */
void *xas_store(struct xa_state *xas, void *entry)
{
        struct xa_node *node;
        void __rcu **slot = &xas->xa->xa_head;
        unsigned int offset, max;
        int count = 0;
        int values = 0;
        void *first, *next;
        bool value = xa_is_value(entry);

        if (entry) {
                bool allow_root = !xa_is_node(entry) && !xa_is_zero(entry);
                first = xas_create(xas, allow_root);
        } else {
                first = xas_load(xas);
        }

        if (xas_invalid(xas))
                return first;
        node = xas->xa_node;
        /* Siblings only apply when the walk reached the requested level */
        if (node && (xas->xa_shift < node->shift))
                xas->xa_sibs = 0;
        /* Storing the same single entry again is a no-op */
        if ((first == entry) && !xas->xa_sibs)
                return first;

        next = first;
        offset = xas->xa_offset;
        max = xas->xa_offset + xas->xa_sibs;
        if (node) {
                slot = &node->slots[offset];
                if (xas->xa_sibs)
                        xas_squash_marks(xas);
        }
        if (!entry)
                xas_init_marks(xas);

        for (;;) {
                /*
                 * Must clear the marks before setting the entry to NULL,
                 * otherwise xas_for_each_marked may find a NULL entry and
                 * stop early.  rcu_assign_pointer contains a release barrier
                 * so the mark clearing will appear to happen before the
                 * entry is set to NULL.
                 */
                rcu_assign_pointer(*slot, entry);
                /* The old contents were a subtree; free all of it */
                if (xa_is_node(next) && (!node || node->shift))
                        xas_free_nodes(xas, xa_to_node(next));
                /* Stored directly in the head: no node counters to adjust */
                if (!node)
                        break;
                /* Track how the slot changed between empty/used and value */
                count += !next - !entry;
                values += !xa_is_value(first) - !value;
                if (entry) {
                        if (offset == max)
                                break;
                        /* Remaining slots of the range refer to the first */
                        if (!xa_is_sibling(entry))
                                entry = xa_mk_sibling(xas->xa_offset);
                } else {
                        if (offset == XA_CHUNK_MASK)
                                break;
                }
                next = xa_entry_locked(xas->xa, node, ++offset);
                if (!xa_is_sibling(next)) {
                        /* Erasing: stop at the first non-sibling past the range */
                        if (!entry && (offset > max))
                                break;
                        first = next;
                }
                slot++;
        }

        update_node(xas, node, count, values);
        return first;
}
EXPORT_SYMBOL_GPL(xas_store);

/**
 * xas_get_mark() - Returns the state of this mark.
 * @xas: XArray operation state.
 * @mark: Mark number.
 *
 * When @xas is not positioned inside a node, the array-wide mark state
 * is reported instead.
 *
 * Return: true if the mark is set, false if the mark is clear or @xas
 * is in an error state.
 */
bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark)
{
        struct xa_node *node;

        if (xas_invalid(xas))
                return false;

        node = xas->xa_node;
        if (node)
                return node_get_mark(node, xas->xa_offset, mark);

        return xa_marked(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_get_mark);

/**
 * xas_set_mark() - Sets the mark on this entry and its parents.
 * @xas: XArray operation state.
 * @mark: Mark number.
 *
 * Sets the specified mark on this entry, and walks up the tree setting it
 * on all the ancestor entries.  The walk stops early at the first level
 * where the mark was already set.  Does nothing if @xas has not been
 * walked to an entry, or is in an error state.
 */
void xas_set_mark(const struct xa_state *xas, xa_mark_t mark)
{
        struct xa_node *node;
        unsigned int offset;

        if (xas_invalid(xas))
                return;

        offset = xas->xa_offset;
        for (node = xas->xa_node; node; node = xa_parent_locked(xas->xa, node)) {
                /* Already set here, so the ancestors carry it too */
                if (node_set_mark(node, offset, mark))
                        return;
                offset = node->offset;
        }

        if (xa_marked(xas->xa, mark))
                return;
        xa_mark_set(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_set_mark);

/**
 * xas_clear_mark() - Clears the mark on this entry and its parents.
 * @xas: XArray operation state.
 * @mark: Mark number.
 *
 * Clears the specified mark on this entry, and walks back to the head
 * attempting to clear it on all the ancestor entries.  The walk stops at
 * the first level where the mark was already clear, or where another slot
 * of the same node still has it set.  Does nothing if @xas has not been
 * walked to an entry, or is in an error state.
 */
void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark)
{
        struct xa_node *node;
        unsigned int offset;

        if (xas_invalid(xas))
                return;

        offset = xas->xa_offset;
        for (node = xas->xa_node; node; node = xa_parent_locked(xas->xa, node)) {
                if (!node_clear_mark(node, offset, mark) ||
                    node_any_mark(node, mark))
                        return;
                offset = node->offset;
        }

        if (!xa_marked(xas->xa, mark))
                return;
        xa_mark_clear(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_clear_mark);

/**
 * xas_init_marks() - Initialise all marks for the entry
 * @xas: Array operations state.
 *
 * Initialise all marks for the entry specified by @xas.  If we're tracking
 * free entries with a mark, we need to set it on all entries.  All other
 * marks are cleared.
 *
 * This implementation is not as efficient as it could be; we may walk
 * up the tree multiple times.
 */
void xas_init_marks(const struct xa_state *xas)
{
        xa_mark_t mark = 0;
        bool done;

        do {
                bool free_mark = xa_track_free(xas->xa) &&
                                 mark == XA_FREE_MARK;

                if (free_mark)
                        xas_set_mark(xas, mark);
                else
                        xas_clear_mark(xas, mark);

                done = (mark == XA_MARK_MAX);
                if (!done)
                        mark_inc(mark);
        } while (!done);
}
EXPORT_SYMBOL_GPL(xas_init_marks);

#ifdef CONFIG_XARRAY_MULTI
static unsigned int node_get_marks(struct xa_node *node, unsigned int offset)
{
        unsigned int marks = 0;
        xa_mark_t mark = XA_MARK_0;

        for (;;) {
                if (node_get_mark(node, offset, mark))
                        marks |= 1 << (__force unsigned int)mark;
                if (mark == XA_MARK_MAX)
                        break;
                mark_inc(mark);
        }

        return marks;
}

/*
 * Set @mark on the first slot of every group of (@sibs + 1) slots in
 * @node.  With no siblings, that is every slot.
 */
static inline void node_mark_slots(struct xa_node *node, unsigned int sibs,
                xa_mark_t mark)
{
        int slot;

        if (!sibs) {
                node_mark_all(node, mark);
                return;
        }

        for (slot = 0; slot < XA_CHUNK_SIZE; slot += sibs + 1)
                node_set_mark(node, slot, mark);
}

static void node_set_marks(struct xa_node *node, unsigned int offset,
                        struct xa_node *child, unsigned int sibs,
                        unsigned int marks)
{
        xa_mark_t mark = XA_MARK_0;

        for (;;) {
                if (marks & (1 << (__force unsigned int)mark)) {
                        node_set_mark(node, offset, mark);
                        if (child)
                                node_mark_slots(child, sibs, mark);
                }
                if (mark == XA_MARK_MAX)
                        break;
                mark_inc(mark);
        }
}

/**
 * xas_split_alloc() - Allocate memory for splitting an entry.
 * @xas: XArray operation state.
 * @entry: New entry which will be stored in the array.
 * @order: Current entry order.
 * @gfp: Memory allocation flags.
 *
 * This function should be called before calling xas_split().
 * If necessary, it will allocate new nodes (and fill them with @entry)
 * to prepare for the upcoming split of an entry of @order size into
 * entries of the order stored in the @xas.  The nodes are chained on
 * @xas->xa_alloc through their parent pointers.
 *
 * On failure, any nodes held by @xas are freed and @xas is set to -ENOMEM.
 *
 * Context: May sleep if @gfp flags permit.
 */
void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order,
                gfp_t gfp)
{
        unsigned int nr_nodes = 1 << (order % XA_CHUNK_SHIFT);
        unsigned int mask = xas->xa_sibs;

        /* XXX: no support for splitting really large entries yet */
        if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT < order))
                goto nomem;
        /* The split stays within one node, so no new nodes are needed */
        if (xas->xa_shift + XA_CHUNK_SHIFT > order)
                return;

        for (; nr_nodes; nr_nodes--) {
                struct xa_node *node;
                void *sibling = NULL;
                unsigned int i;

                node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp);
                if (!node)
                        goto nomem;
                node->array = xas->xa;

                /*
                 * Slots whose offset has no bits of @mask set hold @entry;
                 * the slots in between refer back to the preceding one.
                 */
                for (i = 0; i < XA_CHUNK_SIZE; i++) {
                        void *fill;

                        if (i & mask) {
                                fill = sibling;
                        } else {
                                fill = entry;
                                sibling = xa_mk_sibling(i);
                        }
                        RCU_INIT_POINTER(node->slots[i], fill);
                }

                /* Push the node onto the list of preallocated nodes */
                RCU_INIT_POINTER(node->parent, xas->xa_alloc);
                xas->xa_alloc = node;
        }

        return;
nomem:
        xas_destroy(xas);
        xas_set_err(xas, -ENOMEM);
}
EXPORT_SYMBOL_GPL(xas_split_alloc);

/**
 * xas_split() - Split a multi-index entry into smaller entries.
 * @xas: XArray operation state.
 * @entry: New entry to store in the array.
 * @order: Current entry order.
 *
 * The size of the new entries is set in @xas.  The value in @entry is
 * copied to all the replacement entries.
 *
 * The slots covered by the old entry are processed from the last one back
 * to the first.  When the new entries are smaller than one slot of the
 * node, each slot is replaced by a child node taken from @xas->xa_alloc;
 * otherwise the slots are rewritten in place as @entry followed by its
 * siblings.  The marks of the old entry are copied to the new entries.
 * Does nothing if the walk did not end inside a node.
 *
 * Context: Any context.  The caller should hold the xa_lock.
 */
void xas_split(struct xa_state *xas, void *entry, unsigned int order)
{
        unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
        unsigned int offset, marks;
        struct xa_node *node;
        void *curr = xas_load(xas);
        int values = 0;

        node = xas->xa_node;
        if (xas_top(node))
                return;

        /* Remember the old entry's marks so they can be reapplied */
        marks = node_get_marks(node, xas->xa_offset);

        /* Start at the last slot of the old entry and work backwards */
        offset = xas->xa_offset + sibs;
        do {
                if (xas->xa_shift < node->shift) {
                        /* Take the next preallocated node off the list */
                        struct xa_node *child = xas->xa_alloc;

                        xas->xa_alloc = rcu_dereference_raw(child->parent);
                        child->shift = node->shift - XA_CHUNK_SHIFT;
                        child->offset = offset;
                        child->count = XA_CHUNK_SIZE;
                        child->nr_values = xa_is_value(entry) ?
                                        XA_CHUNK_SIZE : 0;
                        RCU_INIT_POINTER(child->parent, node);
                        node_set_marks(node, offset, child, xas->xa_sibs,
                                        marks);
                        /* The child is set up; now make it visible */
                        rcu_assign_pointer(node->slots[offset],
                                        xa_mk_node(child));
                        /* This slot now holds a node instead of a value */
                        if (xa_is_value(curr))
                                values--;
                        xas_update(xas, child);
                } else {
                        /* First slot of the new entry ending at @offset */
                        unsigned int canon = offset - xas->xa_sibs;

                        node_set_marks(node, canon, NULL, 0, marks);
                        rcu_assign_pointer(node->slots[canon], entry);
                        while (offset > canon)
                                rcu_assign_pointer(node->slots[offset--],
                                                xa_mk_sibling(canon));
                        values += (xa_is_value(entry) - xa_is_value(curr)) *
                                        (xas->xa_sibs + 1);
                }
        } while (offset-- > xas->xa_offset);

        node->nr_values += values;
        xas_update(xas, node);
}
EXPORT_SYMBOL_GPL(xas_split);
#endif

/**
 * xas_pause() - Pause a walk to drop a lock.
 * @xas: XArray operation state.
 *
 * Some users need to pause a walk and drop the lock they're holding in
 * order to yield to a higher priority thread or carry out an operation
 * on an entry.  Those users should call this function before they drop
 * the lock.  It resets the @xas to be suitable for the next iteration
 * of the loop after the user has reacquired the lock.  If most entries
 * found during a walk require you to call xas_pause(), the xa_for_each()
 * iterator may be more appropriate.
 *
 * The index is moved to the first index of the slot following the current
 * entry (and any sibling slots that belong to it), even if @xas currently
 * points into the middle of a multi-index entry.
 *
 * Note that xas_pause() only works for forward iteration.  If a user needs
 * to pause a reverse iteration, we will need a xas_pause_rev().
 */
void xas_pause(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        if (xas_invalid(xas))
                return;

        xas->xa_node = XAS_RESTART;
        if (node) {
                unsigned long offset = xas->xa_offset;

                /* Step past the current slot and any sibling slots after it. */
                while (++offset < XA_CHUNK_SIZE) {
                        if (!xa_is_sibling(xa_entry(xas->xa, node, offset)))
                                break;
                }
                /*
                 * xa_index may have bits set below node->shift when it
                 * points into the middle of a multi-index entry.  Clear
                 * them first so that the addition below lands on the first
                 * index of the next slot instead of somewhere inside it,
                 * which would skip indices on the resumed walk.
                 */
                xas->xa_index &= ~0UL << node->shift;
                xas->xa_index += (offset - xas->xa_offset) << node->shift;
                /* The index wrapped around to 0: the walk is finished. */
                if (xas->xa_index == 0)
                        xas->xa_node = XAS_BOUNDS;
        } else {
                xas->xa_index++;
        }
}
EXPORT_SYMBOL_GPL(xas_pause);

/*
 * __xas_prev() - Find the previous entry in the XArray.
 * @xas: XArray operation state.
 *
 * Helper function for xas_prev() which handles all the complex cases
 * out of line.
 */
void *__xas_prev(struct xa_state *xas)
{
        void *slot;

        /* Step the index back unless the node pointer is in a frozen state. */
        if (!xas_frozen(xas->xa_node))
                xas->xa_index--;
        if (!xas->xa_node)
                return set_bounds(xas);
        if (xas_not_node(xas->xa_node))
                return xas_load(xas);

        if (get_offset(xas->xa_index, xas->xa_node) != xas->xa_offset)
                xas->xa_offset--;

        /*
         * 255 is presumably the value an 8-bit offset takes after being
         * decremented below 0 (TODO confirm against struct xa_state); keep
         * climbing to the parent until the offset is a real slot again.
         */
        while (xas->xa_offset == 255) {
                xas->xa_offset = xas->xa_node->offset - 1;
                xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                if (!xas->xa_node)
                        return set_bounds(xas);
        }

        /* Walk down through node entries until a non-node entry is found. */
        slot = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
        while (xa_is_node(slot)) {
                xas->xa_node = xa_to_node(slot);
                xas_set_offset(xas);
                slot = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
        }
        return slot;
}
EXPORT_SYMBOL_GPL(__xas_prev);

/*
 * __xas_next() - Find the next entry in the XArray.
 * @xas: XArray operation state.
 *
 * Helper function for xas_next() which handles all the complex cases
 * out of line.
 */
void *__xas_next(struct xa_state *xas)
{
        void *slot;

        /* Step the index forward unless the node pointer is in a frozen state. */
        if (!xas_frozen(xas->xa_node))
                xas->xa_index++;
        if (!xas->xa_node)
                return set_bounds(xas);
        if (xas_not_node(xas->xa_node))
                return xas_load(xas);

        if (get_offset(xas->xa_index, xas->xa_node) != xas->xa_offset)
                xas->xa_offset++;

        /* Ran off the end of the node: continue after it in the parent. */
        while (xas->xa_offset == XA_CHUNK_SIZE) {
                xas->xa_offset = xas->xa_node->offset + 1;
                xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                if (!xas->xa_node)
                        return set_bounds(xas);
        }

        /* Walk down through node entries until a non-node entry is found. */
        slot = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
        while (xa_is_node(slot)) {
                xas->xa_node = xa_to_node(slot);
                xas_set_offset(xas);
                slot = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
        }
        return slot;
}
EXPORT_SYMBOL_GPL(__xas_next);

/**
 * xas_find() - Find the next present entry in the XArray.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 *
 * If the @xas has not yet been walked to an entry, return the entry
 * which has an index >= xas.xa_index.  If it has been walked, the entry
 * currently being pointed at has been processed, and so we move to the
 * next entry.
 *
 * If no entry is found and the array is smaller than @max, the iterator
 * is set to the smallest index not yet in the array.  This allows @xas
 * to be immediately passed to xas_store().
 *
 * Return: The entry, if found, otherwise %NULL.
 */
void *xas_find(struct xa_state *xas, unsigned long max)
{
        void *entry;

        /* An errored state, or one already at bounds, finds nothing. */
        if (xas_error(xas) || xas->xa_node == XAS_BOUNDS)
                return NULL;
        /* Starting index already beyond @max: hand over to set_bounds(). */
        if (xas->xa_index > max)
                return set_bounds(xas);

        if (!xas->xa_node) {
                /* NULL node pointer: index becomes 1, then set_bounds(). */
                xas->xa_index = 1;
                return set_bounds(xas);
        } else if (xas->xa_node == XAS_RESTART) {
                /*
                 * Walk has not started yet: load the entry at xa_index and
                 * return it if it is non-NULL or the load did not leave
                 * the state pointing at a node.
                 */
                entry = xas_load(xas);
                if (entry || xas_not_node(xas->xa_node))
                        return entry;
        } else if (!xas->xa_node->shift &&
                    xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) {
                /*
                 * In a shift-0 node the offset disagrees with the low bits
                 * of xa_index: rebuild the offset from the index.
                 */
                xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1;
        }

        /* The current slot has been handled; move on from it. */
        xas_next_offset(xas);

        while (xas->xa_node && (xas->xa_index <= max)) {
                if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
                        /* Past the last slot: resume after this node in its parent. */
                        xas->xa_offset = xas->xa_node->offset + 1;
                        xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                        continue;
                }

                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                if (xa_is_node(entry)) {
                        /* Descend into the child, starting at its first slot. */
                        xas->xa_node = xa_to_node(entry);
                        xas->xa_offset = 0;
                        continue;
                }
                /* NULL slots and sibling slots are skipped, never returned. */
                if (entry && !xa_is_sibling(entry))
                        return entry;

                xas_next_offset(xas);
        }

        /* Walked off the top of the tree: record that in the state. */
        if (!xas->xa_node)
                xas->xa_node = XAS_BOUNDS;
        return NULL;
}
EXPORT_SYMBOL_GPL(xas_find);

/**
 * xas_find_marked() - Find the next marked entry in the XArray.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 * @mark: Mark number to search for.
 *
 * If the @xas has not yet been walked to an entry, return the marked entry
 * which has an index >= xas.xa_index.  If it has been walked, the entry
 * currently being pointed at has been processed, and so we return the
 * first marked entry with an index > xas.xa_index.
 *
 * If no marked entry is found and the array is smaller than @max, @xas is
 * set to the bounds state and xas->xa_index is set to the smallest index
 * not yet in the array.  This allows @xas to be immediately passed to
 * xas_store().
 *
 * If no entry is found before @max is reached, @xas is set to the restart
 * state.
 *
 * Sibling entries are internal to the XArray and are never returned.
 *
 * Return: The entry, if found, otherwise %NULL.
 */
void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark)
{
        bool advance = true;
        unsigned int offset;
        void *entry;

        if (xas_error(xas))
                return NULL;
        if (xas->xa_index > max)
                goto max;

        if (!xas->xa_node) {
                xas->xa_index = 1;
                goto out;
        } else if (xas_top(xas->xa_node)) {
                /* Walk not started yet: begin from the head of the array. */
                advance = false;
                entry = xa_head(xas->xa);
                xas->xa_node = NULL;
                if (xas->xa_index > max_index(entry))
                        goto out;
                if (!xa_is_node(entry)) {
                        /* The head is a single entry rather than a node. */
                        if (xa_marked(xas->xa, mark))
                                return entry;
                        xas->xa_index = 1;
                        goto out;
                }
                xas->xa_node = xa_to_node(entry);
                xas->xa_offset = xas->xa_index >> xas->xa_node->shift;
        }

        while (xas->xa_index <= max) {
                if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
                        /* Past the last slot: resume after this node in its parent. */
                        xas->xa_offset = xas->xa_node->offset + 1;
                        xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                        if (!xas->xa_node)
                                break;
                        advance = false;
                        continue;
                }

                if (!advance) {
                        /* Landed on a sibling slot: rewind to the slot it refers to. */
                        entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                        if (xa_is_sibling(entry)) {
                                xas->xa_offset = xa_to_sibling(entry);
                                xas_move_index(xas, xas->xa_offset);
                        }
                }

                offset = xas_find_chunk(xas, advance, mark);
                if (offset > xas->xa_offset) {
                        advance = false;
                        xas_move_index(xas, offset);
                        /* Mind the wrap */
                        if ((xas->xa_index - 1) >= max)
                                goto max;
                        xas->xa_offset = offset;
                        if (offset == XA_CHUNK_SIZE)
                                continue;
                }

                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK))
                        continue;
                /*
                 * The slot is read again after the mark search, so a
                 * concurrent store may have turned it into a sibling of a
                 * larger entry in the meantime.  Never hand a sibling entry
                 * to the caller; go round again so the !advance path above
                 * rewinds to the slot the sibling refers to.
                 */
                if (xa_is_sibling(entry))
                        continue;
                if (!xa_is_node(entry))
                        return entry;
                xas->xa_node = xa_to_node(entry);
                xas_set_offset(xas);
        }

out:
        if (xas->xa_index > max)
                goto max;
        return set_bounds(xas);
max:
        xas->xa_node = XAS_RESTART;
        return NULL;
}
EXPORT_SYMBOL_GPL(xas_find_marked);

/**
 * xas_find_conflict() - Find the next present entry in a range.
 * @xas: XArray operation state.
 *
 * The @xas describes both a range and a position within that range.
 *
 * Context: Any context.  Expects xa_lock to be held.
 * Return: The next entry in the range covered by @xas or %NULL.
 */
void *xas_find_conflict(struct xa_state *xas)
{
        void *curr;

        if (xas_error(xas))
                return NULL;

        if (!xas->xa_node)
                return NULL;

        if (xas_top(xas->xa_node)) {
                /*
                 * Walk not started yet: start it and descend through node
                 * entries until a non-node entry is reached.
                 */
                curr = xas_start(xas);
                if (!curr)
                        return NULL;
                while (xa_is_node(curr)) {
                        struct xa_node *node = xa_to_node(curr);
                        curr = xas_descend(xas, node);
                }
                if (curr)
                        return curr;
        }

        /* The node's shift is above the shift of the range: stop here. */
        if (xas->xa_node->shift > xas->xa_shift)
                return NULL;

        for (;;) {
                if (xas->xa_node->shift == xas->xa_shift) {
                        /* At the range's own level: done once every xa_sibs bit of the offset is set. */
                        if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs)
                                break;
                } else if (xas->xa_offset == XA_CHUNK_MASK) {
                        /* Last slot of a lower node: go back up to the parent. */
                        xas->xa_offset = xas->xa_node->offset;
                        xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node);
                        if (!xas->xa_node)
                                break;
                        continue;
                }
                /* Advance to the next slot and look at what it holds. */
                curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset);
                if (xa_is_sibling(curr))
                        continue;
                /* Follow node entries down, always into slot 0 of the child. */
                while (xa_is_node(curr)) {
                        xas->xa_node = xa_to_node(curr);
                        xas->xa_offset = 0;
                        curr = xa_entry_locked(xas->xa, xas->xa_node, 0);
                }
                if (curr)
                        return curr;
        }
        /* Nothing found: move the offset back by xa_sibs before returning. */
        xas->xa_offset -= xas->xa_sibs;
        return NULL;
}
EXPORT_SYMBOL_GPL(xas_find_conflict);

/**
 * xa_load() - Load an entry from an XArray.
 * @xa: XArray.
 * @index: index into array.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The entry at @index in @xa.
 */
void *xa_load(struct xarray *xa, unsigned long index)
{
        XA_STATE(xas, xa, index);
        void *found;

        rcu_read_lock();
        do {
                found = xas_load(&xas);
                /* Zero entries are reported to the caller as NULL. */
                found = xa_is_zero(found) ? NULL : found;
        } while (xas_retry(&xas, found));
        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL(xa_load);

/*
 * Turn the outcome of an operation into the value handed back to the
 * caller: NULL for a zero entry, otherwise the xa_node field of @xas
 * if the state holds an error, otherwise @curr itself.
 */
static void *xas_result(struct xa_state *xas, void *curr)
{
        if (xa_is_zero(curr))
                return NULL;
        return xas_error(xas) ? (void *)xas->xa_node : curr;
}

/**
 * __xa_erase() - Erase this entry from the XArray while locked.
 * @xa: XArray.
 * @index: Index into array.
 *
 * After this function returns, loading from @index will return %NULL.
 * If the index is part of a multi-index entry, all indices will be erased
 * and none of the entries will be part of a multi-index entry.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.
 * Return: The entry which used to be at this index.
 */
void *__xa_erase(struct xarray *xa, unsigned long index)
{
        XA_STATE(xas, xa, index);
        /* Erasing is storing NULL; keep what was there before. */
        void *old = xas_store(&xas, NULL);

        return xas_result(&xas, old);
}
EXPORT_SYMBOL(__xa_erase);

/**
 * xa_erase() - Erase this entry from the XArray.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * After this function returns, loading from @index will return %NULL.
 * If the index is part of a multi-index entry, all indices will be erased
 * and none of the entries will be part of a multi-index entry.
 *
 * Context: Any context.  Takes and releases the xa_lock.
 * Return: The entry which used to be at this index.
 */
void *xa_erase(struct xarray *xa, unsigned long index)
{
        void *entry;

        /* Locked wrapper: all of the work is done by __xa_erase(). */
        xa_lock(xa);
        entry = __xa_erase(xa, index);
        xa_unlock(xa);

        return entry;
}
EXPORT_SYMBOL(xa_erase);

/**
 * __xa_store() - Store this entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * You must already be holding the xa_lock when calling this function.
 * It will drop the lock if needed to allocate memory, and then reacquire
 * it afterwards.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, index);
        void *prev;

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return XA_ERROR(-EINVAL);
        /* A NULL store into a free-tracking array is kept as a zero entry. */
        if (!entry && xa_track_free(xa))
                entry = XA_ZERO_ENTRY;

        /* Repeat the store for as long as __xas_nomem() asks for a retry. */
        do {
                prev = xas_store(&xas, entry);
                if (xa_track_free(xa))
                        xas_clear_mark(&xas, XA_FREE_MARK);
        } while (__xas_nomem(&xas, gfp));

        return xas_result(&xas, prev);
}
EXPORT_SYMBOL(__xa_store);

/**
 * xa_store() - Store this entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * After this function returns, loads from this index will return @entry.
 * Storing into an existing multi-index entry updates the entry of every index.
 * The marks associated with @index are unaffected unless @entry is %NULL.
 *
 * Context: Any context.  Takes and releases the xa_lock.
 * May sleep if the @gfp flags permit.
 * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry
 * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation
 * failed.
 */
void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
        void *curr;

        /* Locked wrapper: all of the work is done by __xa_store(). */
        xa_lock(xa);
        curr = __xa_store(xa, index, entry, gfp);
        xa_unlock(xa);

        return curr;
}
EXPORT_SYMBOL(xa_store);

/**
 * __xa_cmpxchg() - Conditionally replace an entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * You must already be holding the xa_lock when calling this function.
 * It will drop the lock if needed to allocate memory, and then reacquire
 * it afterwards.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
void *__xa_cmpxchg(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, index);
        void *seen;

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return XA_ERROR(-EINVAL);

        do {
                seen = xas_load(&xas);
                /* Only replace the entry when it is the one the caller expects. */
                if (seen != old)
                        continue;
                xas_store(&xas, entry);
                /* A non-NULL entry now occupies a previously empty slot. */
                if (xa_track_free(xa) && entry && !seen)
                        xas_clear_mark(&xas, XA_FREE_MARK);
        } while (__xas_nomem(&xas, gfp));

        return xas_result(&xas, seen);
}
EXPORT_SYMBOL(__xa_cmpxchg);

/**
 * __xa_insert() - Store this entry in the XArray if no entry is present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Inserting a NULL entry will store a reserved entry (like xa_reserve())
 * if no entry is present.  Inserting will fail if a reserved entry is
 * present, even though loading from this index will return NULL.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, index);

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return -EINVAL;
        /* Inserting NULL stores a zero entry instead. */
        if (!entry)
                entry = XA_ZERO_ENTRY;

        do {
                /* Anything already loaded at this index blocks the insert. */
                if (xas_load(&xas)) {
                        xas_set_err(&xas, -EBUSY);
                        continue;
                }
                xas_store(&xas, entry);
                if (xa_track_free(xa))
                        xas_clear_mark(&xas, XA_FREE_MARK);
        } while (__xas_nomem(&xas, gfp));

        return xas_error(&xas);
}
EXPORT_SYMBOL(__xa_insert);

#ifdef CONFIG_XARRAY_MULTI
/*
 * Point @xas at @first and choose the xa_shift / xa_sibs pair for the next
 * store so that it starts at @first and does not extend beyond @last.
 */
static void xas_set_range(struct xa_state *xas, unsigned long first,
                unsigned long last)
{
        unsigned int shift = 0;
        /* Number of indices in the range beyond @first. */
        unsigned long sibs = last - first;
        unsigned int offset = XA_CHUNK_MASK;

        xas_set(xas, first);

        /*
         * While @first stays chunk-aligned, move up one level at a time.
         * Stop when the remaining span is smaller than a full chunk, or is
         * exactly one chunk but an earlier level recorded a partial chunk.
         */
        while ((first & XA_CHUNK_MASK) == 0) {
                if (sibs < XA_CHUNK_MASK)
                        break;
                if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK))
                        break;
                shift += XA_CHUNK_SHIFT;
                /* Remember the low bits of the span the first time they are dropped. */
                if (offset == XA_CHUNK_MASK)
                        offset = sibs & XA_CHUNK_MASK;
                sibs >>= XA_CHUNK_SHIFT;
                first >>= XA_CHUNK_SHIFT;
        }

        /* Clamp the sibling count so the slots stay within one node. */
        offset = first & XA_CHUNK_MASK;
        if (offset + sibs > XA_CHUNK_MASK)
                sibs = XA_CHUNK_MASK - offset;
        /* Drop one slot if the last covered index would lie beyond @last. */
        if ((((first + sibs + 1) << shift) - 1) > last)
                sibs -= 1;

        xas->xa_shift = shift;
        xas->xa_sibs = sibs;
}

/**
 * xa_store_range() - Store this entry at a range of indices in the XArray.
 * @xa: XArray.
 * @first: First index to affect.
 * @last: Last index to affect.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * After this function returns, loads from any index between @first and @last,
 * inclusive will return @entry.
 * Storing into an existing multi-index entry updates the entry of every index.
 * The marks associated with the indices from @first to @last are unaffected
 * unless @entry is %NULL.
 *
 * Context: Process context.  Takes and releases the xa_lock.  May sleep
 * if the @gfp flags permit.
 * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in
 * an XArray, or xa_err(-ENOMEM) if memory allocation failed.
 */
void *xa_store_range(struct xarray *xa, unsigned long first,
                unsigned long last, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, 0);

        if (WARN_ON_ONCE(xa_is_internal(entry)))
                return XA_ERROR(-EINVAL);
        /* An inverted range is rejected without warning. */
        if (last < first)
                return XA_ERROR(-EINVAL);

        /* The whole locked section is retried while xas_nomem() asks for it. */
        do {
                xas_lock(&xas);
                if (entry) {
                        /*
                         * Create the slot covering @last up front.  The order
                         * is the lowest set bit of last + 1, or BITS_PER_LONG
                         * when last + 1 wraps to 0.
                         */
                        unsigned int order = BITS_PER_LONG;
                        if (last + 1)
                                order = __ffs(last + 1);
                        xas_set_order(&xas, last, order);
                        xas_create(&xas, true);
                        if (xas_error(&xas))
                                goto unlock;
                }
                /* Store piece by piece; each pass advances @first by xas_size(). */
                do {
                        xas_set_range(&xas, first, last);
                        xas_store(&xas, entry);
                        if (xas_error(&xas))
                                goto unlock;
                        first += xas_size(&xas);
                } while (first <= last);
unlock:
                xas_unlock(&xas);
        } while (xas_nomem(&xas, gfp));

        return xas_result(&xas, NULL);
}
EXPORT_SYMBOL(xa_store_range);

/**
 * xas_get_order() - Get the order of an entry.
 * @xas: XArray operation state.
 *
 * Called after xas_load, the xas should not be in an error state.
 *
 * Return: A number between 0 and 63 indicating the order of the entry.
 */
int xas_get_order(struct xa_state *xas)
{
        int order;

        if (!xas->xa_node)
                return 0;

        /*
         * Grow the order for as long as the slot 2^order places after the
         * current one is still inside the node and holds a sibling entry.
         */
        for (order = 0; ; order++) {
                unsigned int slot = xas->xa_offset + (1 << order);

                if (slot >= XA_CHUNK_SIZE ||
                    !xa_is_sibling(xa_entry(xas->xa, xas->xa_node, slot)))
                        break;
        }

        /* Add the node's shift to the order found within the node. */
        return order + xas->xa_node->shift;
}
EXPORT_SYMBOL_GPL(xas_get_order);

/**
 * xa_get_order() - Get the order of an entry.
 * @xa: XArray.
 * @index: Index of the entry.
 *
 * Return: A number between 0 and 63 indicating the order of the entry.
 */
int xa_get_order(struct xarray *xa, unsigned long index)
{
        XA_STATE(xas, xa, index);
        int order;

        rcu_read_lock();
        /* An index with nothing loaded at it reports order 0. */
        order = xas_load(&xas) ? xas_get_order(&xas) : 0;
        rcu_read_unlock();

        return order;
}
EXPORT_SYMBOL(xa_get_order);
#endif /* CONFIG_XARRAY_MULTI */

/**
 * __xa_alloc() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @limit: Range for allocated ID.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
int __xa_alloc(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, gfp_t gfp)
{
        XA_STATE(xas, xa, 0);

        /* Reject advanced entries and arrays that do not track free slots. */
        if (WARN_ON_ONCE(xa_is_advanced(entry)) ||
            WARN_ON_ONCE(!xa_track_free(xa)))
                return -EINVAL;

        /* Allocating NULL stores a zero entry instead. */
        if (!entry)
                entry = XA_ZERO_ENTRY;

        do {
                /* Search [limit.min, limit.max] for a slot carrying the free mark. */
                xas.xa_index = limit.min;
                xas_find_marked(&xas, limit.max, XA_FREE_MARK);
                if (xas.xa_node != XAS_RESTART)
                        *id = xas.xa_index;
                else
                        xas_set_err(&xas, -EBUSY);
                xas_store(&xas, entry);
                xas_clear_mark(&xas, XA_FREE_MARK);
        } while (__xas_nomem(&xas, gfp));

        return xas_error(&xas);
}
EXPORT_SYMBOL(__xa_alloc);

/**
 * __xa_alloc_cyclic() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 * The search for an empty entry will start at @next and will wrap
 * around if necessary.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
int __xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        u32 lowest = limit.min;
        int rc;

        /* First attempt: search from *next when it lies above the minimum. */
        limit.min = max(lowest, *next);
        rc = __xa_alloc(xa, id, entry, limit, gfp);
        if (rc == 0 && (xa->xa_flags & XA_FLAGS_ALLOC_WRAPPED)) {
                /* Consume the wrapped flag and report the wrap with 1. */
                xa->xa_flags &= ~XA_FLAGS_ALLOC_WRAPPED;
                rc = 1;
        }

        /* Failed above *next: try again from the caller's original minimum. */
        if (rc < 0 && limit.min > lowest) {
                limit.min = lowest;
                rc = __xa_alloc(xa, id, entry, limit, gfp);
                if (rc == 0)
                        rc = 1;
        }

        if (rc < 0)
                return rc;

        /* Remember where to start next time and note when *next wraps to 0. */
        *next = *id + 1;
        if (*next == 0)
                xa->xa_flags |= XA_FLAGS_ALLOC_WRAPPED;
        return rc;
}
EXPORT_SYMBOL(__xa_alloc_cyclic);

/**
 * __xa_set_mark() - Set this mark on this entry while locked.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * Attempting to set a mark on a %NULL entry does not succeed.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.
 */
void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        XA_STATE(xas, xa, index);

        /* The mark is only set when something is loaded at @index. */
        if (xas_load(&xas))
                xas_set_mark(&xas, mark);
}
EXPORT_SYMBOL(__xa_set_mark);

/**
 * __xa_clear_mark() - Clear this mark on this entry while locked.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.
 */
void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        XA_STATE(xas, xa, index);

        /* The mark is only cleared when something is loaded at @index. */
        if (xas_load(&xas))
                xas_clear_mark(&xas, mark);
}
EXPORT_SYMBOL(__xa_clear_mark);

/**
 * xa_get_mark() - Inquire whether this mark is set on this entry.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * This function uses the RCU read lock, so the result may be out of date
 * by the time it returns.  If you need the result to be stable, use a lock.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: True if the entry at @index has this mark set, false if it doesn't.
 */
bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        XA_STATE(xas, xa, index);
        bool marked = false;
        void *entry;

        rcu_read_lock();
        entry = xas_start(&xas);
        /* Descend only while the mark is set at the current position. */
        while (xas_get_mark(&xas, mark)) {
                if (!xa_is_node(entry)) {
                        /* Reached a non-node entry with the mark still set. */
                        marked = true;
                        break;
                }
                entry = xas_descend(&xas, xa_to_node(entry));
        }
        rcu_read_unlock();
        return marked;
}
EXPORT_SYMBOL(xa_get_mark);

/**
 * xa_set_mark() - Set this mark on this entry.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * Attempting to set a mark on a %NULL entry does not succeed.
 *
 * Context: Process context.  Takes and releases the xa_lock.
 */
void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        /* Locked wrapper: all of the work is done by __xa_set_mark(). */
        xa_lock(xa);
        __xa_set_mark(xa, index, mark);
        xa_unlock(xa);
}
EXPORT_SYMBOL(xa_set_mark);

/**
 * xa_clear_mark() - Clear this mark on this entry.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * Clearing a mark always succeeds.
 *
 * Context: Process context.  Takes and releases the xa_lock.
 */
void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        /* Locked wrapper: all of the work is done by __xa_clear_mark(). */
        xa_lock(xa);
        __xa_clear_mark(xa, index, mark);
        xa_unlock(xa);
}
EXPORT_SYMBOL(xa_clear_mark);

/**
 * xa_find() - Search the XArray for an entry.
 * @xa: XArray.
 * @indexp: Pointer to an index.
 * @max: Maximum index to search to.
 * @filter: Selection criterion.
 *
 * Finds the entry in @xa which matches the @filter, and has the lowest
 * index that is at least @indexp and no more than @max.
 * If an entry is found, @indexp is updated to be the index of the entry.
 * This function is protected by the RCU read lock, so it may not find
 * entries which are being simultaneously added.  It will not return an
 * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find().
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The entry, if found, otherwise %NULL.
 */
void *xa_find(struct xarray *xa, unsigned long *indexp,
                        unsigned long max, xa_mark_t filter)
{
        XA_STATE(xas, xa, *indexp);
        /* Filter values below XA_MAX_MARKS select a mark search. */
        bool by_mark = (__force unsigned int)filter < XA_MAX_MARKS;
        void *entry;

        rcu_read_lock();
        do {
                entry = by_mark ? xas_find_marked(&xas, max, filter) :
                                  xas_find(&xas, max);
        } while (xas_retry(&xas, entry));
        rcu_read_unlock();

        /* *indexp is only updated when something was found. */
        if (!entry)
                return NULL;
        *indexp = xas.xa_index;
        return entry;
}
EXPORT_SYMBOL(xa_find);

/*
 * Report whether xa_index, taken relative to the span of the current node,
 * lies beyond the first index of the slot at xa_offset.  Always false
 * without CONFIG_XARRAY_MULTI or when @xas has no node.
 */
static bool xas_sibling(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;
        unsigned long span_mask, slot_start;

        if (!IS_ENABLED(CONFIG_XARRAY_MULTI))
                return false;
        if (!node)
                return false;

        span_mask = (XA_CHUNK_SIZE << node->shift) - 1;
        slot_start = (unsigned long)xas->xa_offset << node->shift;
        return (xas->xa_index & span_mask) > slot_start;
}

/**
 * xa_find_after() - Search the XArray for a present entry.
 * @xa: XArray.
 * @indexp: Pointer to an index.
 * @max: Maximum index to search to.
 * @filter: Selection criterion.
 *
 * Finds the entry in @xa which matches the @filter and has the lowest
 * index that is above @indexp and no more than @max.
 * If an entry is found, @indexp is updated to be the index of the entry.
 * This function is protected by the RCU read lock, so it may miss entries
 * which are being simultaneously added.  It will not return an
 * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find().
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The pointer, if found, otherwise %NULL.
 */
void *xa_find_after(struct xarray *xa, unsigned long *indexp,
                        unsigned long max, xa_mark_t filter)
{
        XA_STATE(xas, xa, *indexp + 1);
        void *entry;

        /* *indexp + 1 wrapped to 0: there is no index after *indexp. */
        if (!xas.xa_index)
                return NULL;

        rcu_read_lock();
        do {
                if ((__force unsigned int)filter >= XA_MAX_MARKS)
                        entry = xas_find(&xas, max);
                else
                        entry = xas_find_marked(&xas, max, filter);

                if (xas_invalid(&xas))
                        break;
                /* Search again after a sibling hit or a retry entry. */
        } while (xas_sibling(&xas) || xas_retry(&xas, entry));
        rcu_read_unlock();

        if (entry)
                *indexp = xas.xa_index;
        return entry;
}
EXPORT_SYMBOL(xa_find_after);

/*
 * Walk @xas up to index @max under the RCU read lock and store each entry
 * that xas_retry() does not reject into @dst, stopping once @n entries have
 * been stored.  Returns the number of entries written to @dst.
 */
static unsigned int xas_extract_present(struct xa_state *xas, void **dst,
                        unsigned long max, unsigned int n)
{
        unsigned int copied = 0;
        void *item;

        rcu_read_lock();
        xas_for_each(xas, item, max) {
                if (!xas_retry(xas, item)) {
                        dst[copied] = item;
                        if (++copied == n)
                                break;
                }
        }
        rcu_read_unlock();

        return copied;
}

/*
 * Walk the entries of @xas tagged with @mark, up to index @max, under the
 * RCU read lock.  Each entry that xas_retry() does not reject is stored into
 * @dst, stopping once @n entries have been stored.  Returns the number of
 * entries written to @dst.
 */
static unsigned int xas_extract_marked(struct xa_state *xas, void **dst,
                        unsigned long max, unsigned int n, xa_mark_t mark)
{
        unsigned int copied = 0;
        void *item;

        rcu_read_lock();
        xas_for_each_marked(xas, item, max, mark) {
                if (!xas_retry(xas, item)) {
                        dst[copied] = item;
                        if (++copied == n)
                                break;
                }
        }
        rcu_read_unlock();

        return copied;
}

/**
 * xa_extract() - Copy selected entries from the XArray into a normal array.
 * @xa: The source XArray to copy from.
 * @dst: The buffer to copy entries into.
 * @start: The first index in the XArray eligible to be selected.
 * @max: The last index in the XArray eligible to be selected.
 * @n: The maximum number of entries to copy.
 * @filter: Selection criterion.
 *
 * Copies at most @n entries matching @filter into @dst.  Only entries whose
 * index lies between @start and @max, inclusive, are considered.  Nothing is
 * copied, and 0 is returned, when @n is zero.
 *
 * @filter is either an XArray mark value, which selects the entries carrying
 * that mark, or %XA_PRESENT, which selects every entry that is not %NULL.
 *
 * The result is not necessarily a snapshot of the XArray at one moment in
 * time.  For example, if another thread stores to index 5, then index 10,
 * xa_extract() may return the old contents of index 5 together with the new
 * contents of index 10.  Indices that are not modified while this function
 * runs will not be skipped.
 *
 * If you need stronger guarantees, holding the xa_lock across calls to this
 * function will prevent concurrent modification.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The number of entries copied.
 */
unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start,
                        unsigned long max, unsigned int n, xa_mark_t filter)
{
        XA_STATE(xas, xa, start);

        if (n == 0)
                return 0;

        /* Filter values at or above XA_MAX_MARKS select by presence. */
        if ((__force unsigned int)filter >= XA_MAX_MARKS)
                return xas_extract_present(&xas, dst, max, n);

        return xas_extract_marked(&xas, dst, max, n, filter);
}
EXPORT_SYMBOL(xa_extract);

/**
 * xa_delete_node() - Private interface for workingset code.
 * @node: Node to be removed from the tree.
 * @update: Function to call to update ancestor nodes.
 *
 * Builds an operation state that addresses the slot holding @node in its
 * parent (same array, parent node, @node's offset, one chunk level above
 * @node's shift) and stores %NULL into that slot.
 *
 * Context: xa_lock must be held on entry and will not be released.
 */
void xa_delete_node(struct xa_node *node, xa_update_node_t update)
{
        struct xarray *array = node->array;
        /* Shift of the level above @node, i.e. of the slot that holds it. */
        int parent_shift = node->shift + XA_CHUNK_SHIFT;
        struct xa_state xas = {
                .xa = array,
                .xa_index = (unsigned long)node->offset << parent_shift,
                .xa_shift = parent_shift,
                .xa_offset = node->offset,
                .xa_node = xa_parent_locked(array, node),
                .xa_update = update,
        };

        xas_store(&xas, NULL);
}
EXPORT_SYMBOL_GPL(xa_delete_node);        /* For the benefit of the test suite */

/**
 * xa_destroy() - Free all internal data structures.
 * @xa: XArray.
 *
 * After calling this function, the XArray is empty and has freed all memory
 * allocated for its internal data structures.  You are responsible for
 * freeing the objects referenced by the XArray.
 *
 * The head pointer is detached and the marks are reset while the lock is
 * held; if the old head was a node, the node tree hanging off it is freed
 * before the lock is dropped.
 *
 * Context: Any context.  Takes and releases the xa_lock, interrupt-safe.
 */
void xa_destroy(struct xarray *xa)
{
        XA_STATE(xas, xa, 0);
        unsigned long flags;
        void *entry;

        /* Operate on the array head rather than on any node. */
        xas.xa_node = NULL;
        xas_lock_irqsave(&xas, flags);
        /* Remember the old head, then make the array appear empty. */
        entry = xa_head_locked(xa);
        RCU_INIT_POINTER(xa->xa_head, NULL);
        xas_init_marks(&xas);
        if (xa_zero_busy(xa))
                xa_mark_clear(xa, XA_FREE_MARK);
        /* lockdep checks we're still holding the lock in xas_free_nodes() */
        if (xa_is_node(entry))
                xas_free_nodes(&xas, xa_to_node(entry));
        xas_unlock_irqrestore(&xas, flags);
}
EXPORT_SYMBOL(xa_destroy);

#ifdef XA_DEBUG
/*
 * Debug helper: continue the current log line with the bookkeeping fields of
 * @node followed by every word of every mark bitmap.  A NULL @node prints
 * nothing; a pointer with either of its two low bits set is printed by value
 * only and never dereferenced.
 */
void xa_dump_node(const struct xa_node *node)
{
        unsigned int mark, word;

        if (!node)
                return;

        if (((unsigned long)node & 3) != 0) {
                pr_cont("node %px\n", node);
                return;
        }

        pr_cont("node %px %s %d parent %px shift %d count %d values %d "
                "array %px list %px %px marks",
                node, node->parent ? "offset" : "max", node->offset,
                node->parent, node->shift, node->count, node->nr_values,
                node->array, node->private_list.prev, node->private_list.next);

        for (mark = 0; mark < XA_MAX_MARKS; mark++) {
                for (word = 0; word < XA_MARK_LONGS; word++)
                        pr_cont(" %lx", node->marks[mark][word]);
        }
        pr_cont("\n");
}

/*
 * Debug helper: print the index, or index range, that an entry at @index with
 * the given @shift covers.  A zero @shift prints the single index; a @shift
 * of BITS_PER_LONG or more prints the whole range 0-~0UL; anything else
 * prints @index through @index with its low @shift bits set.
 */
void xa_dump_index(unsigned long index, unsigned int shift)
{
        if (shift == 0) {
                pr_info("%lu: ", index);
                return;
        }

        if (shift >= BITS_PER_LONG) {
                pr_info("0-%lu: ", ~0UL);
                return;
        }

        pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1));
}

/*
 * Debug helper: print @entry, prefixed by the index range from
 * xa_dump_index().  A node entry with a non-zero @shift is expanded
 * recursively, one call per slot; every other kind of entry is printed on a
 * single line according to its type.  A NULL @entry prints nothing.
 */
void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift)
{
        if (!entry)
                return;

        xa_dump_index(index, shift);

        if (xa_is_node(entry)) {
                struct xa_node *child;
                unsigned long slot;

                if (shift == 0) {
                        pr_cont("%px\n", entry);
                        return;
                }

                child = xa_to_node(entry);
                xa_dump_node(child);
                for (slot = 0; slot < XA_CHUNK_SIZE; slot++)
                        xa_dump_entry(child->slots[slot],
                                      index + (slot << child->shift),
                                      child->shift);
                return;
        }

        if (xa_is_value(entry)) {
                pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry),
                                                xa_to_value(entry), entry);
                return;
        }

        if (!xa_is_internal(entry)) {
                pr_cont("%px\n", entry);
                return;
        }

        if (xa_is_retry(entry)) {
                pr_cont("retry (%ld)\n", xa_to_internal(entry));
                return;
        }

        if (xa_is_sibling(entry)) {
                pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry));
                return;
        }

        if (xa_is_zero(entry)) {
                pr_cont("zero (%ld)\n", xa_to_internal(entry));
                return;
        }

        pr_cont("UNKNOWN ENTRY (%px)\n", entry);
}

/*
 * Debug helper: print a summary line for @xa (head pointer, flags and the
 * state of marks 0-2), then dump the head entry.  When the head is a node,
 * the dump starts at that node's shift plus one chunk level; otherwise the
 * starting shift is 0.
 */
void xa_dump(const struct xarray *xa)
{
        void *head = xa->xa_head;
        unsigned int shift;

        pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, head,
                        xa->xa_flags, xa_marked(xa, XA_MARK_0),
                        xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2));

        shift = xa_is_node(head) ?
                xa_to_node(head)->shift + XA_CHUNK_SHIFT : 0;
        xa_dump_entry(head, 0, shift);
}
#endif

























































































































































   19 















   11 













   12 


































   12 










    4 

























































    7 










    2 
    2 










    3 
    3 


































































































    2 








































    1 





























































































    2 


























    2 
































































































































































































































































































































































































































    5 
    5 

























    2 
















   10 
    7 



































































    3 















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H

#include <linux/container_of.h>
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/poison.h>
#include <linux/const.h>

#include <asm/barrier.h>

/*
 * Circular doubly linked list implementation.
 *
 * Some of the internal functions ("__xxx") are useful when
 * manipulating whole lists rather than single entries, as
 * sometimes we already know the next/prev entries and we can
 * generate better code by using them directly rather than
 * using the generic single-entry routines.
 */

/* Static initializer for a list_head: both members point at @name itself. */
#define LIST_HEAD_INIT(name) { &(name), &(name) }

/* Define a struct list_head variable @name, initialized via LIST_HEAD_INIT(). */
#define LIST_HEAD(name) \
        struct list_head name = LIST_HEAD_INIT(name)

/**
 * INIT_LIST_HEAD - Initialize a list_head structure
 * @list: list_head structure to be initialized.
 *
 * Initializes the list_head to point to itself.  If it is a list header,
 * the result is an empty list.
 */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
        /*
         * Both stores go through WRITE_ONCE(): list_empty() reads ->next
         * with READ_ONCE(), and list_empty_careful() reads ->prev with
         * READ_ONCE().
         */
        WRITE_ONCE(list->next, list);
        WRITE_ONCE(list->prev, list);
}

#ifdef CONFIG_LIST_HARDENED

/*
 * Attributes applied to the out-of-line *_valid_or_report() functions
 * declared below: none when CONFIG_DEBUG_LIST is set, otherwise
 * __cold __preserve_most.
 */
#ifdef CONFIG_DEBUG_LIST
# define __list_valid_slowpath
#else
# define __list_valid_slowpath __cold __preserve_most
#endif

/*
 * Performs the full set of list corruption checks before __list_add().
 * On list corruption reports a warning, and returns false.
 */
extern bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new,
                                                             struct list_head *prev,
                                                             struct list_head *next);

/*
 * Check for list corruption before __list_add() links @new between @prev
 * and @next.  Returns true when the insertion may proceed and false when a
 * corruption was detected.
 *
 * When only CONFIG_LIST_HARDENED is enabled (no CONFIG_DEBUG_LIST), a
 * minimal set of integrity checks is done inline so that non-faulting
 * corruptions are caught, and __list_add_valid_or_report() is called only
 * when one of them fails.  With CONFIG_DEBUG_LIST the reporting function
 * performs all checks and is always called.
 */
static __always_inline bool __list_add_valid(struct list_head *new,
                                             struct list_head *prev,
                                             struct list_head *next)
{
        bool inline_ok = true;

        if (!IS_ENABLED(CONFIG_DEBUG_LIST)) {
                /*
                 * NULL checks on next and prev are left out in the hardening
                 * version: the dereferences right below would fault anyway.
                 *
                 * The reduced checks are cheap enough to inline, which also
                 * lets the compiler drop any of them it can prove at compile
                 * time.  If a pre-condition fails, the slow path reports
                 * which one it was.
                 */
                if (likely(next->prev == prev && prev->next == next &&
                           new != prev && new != next))
                        return true;
                inline_ok = false;
        }

        /*
         * Always call the reporting function here; the result can only be
         * true when the inline checks did not already fail.
         */
        return __list_add_valid_or_report(new, prev, next) & inline_ok;
}

/*
 * Performs the full set of list corruption checks before __list_del_entry().
 * On list corruption reports a warning, and returns false.
 */
extern bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry);

/*
 * Check for list corruption before __list_del_entry() unlinks @entry.
 * Returns true when the removal may proceed and false when a corruption was
 * detected.
 *
 * When only CONFIG_LIST_HARDENED is enabled (no CONFIG_DEBUG_LIST), a
 * minimal set of integrity checks is done inline so that non-faulting
 * corruptions are caught, and __list_del_entry_valid_or_report() is called
 * only when one of them fails.  With CONFIG_DEBUG_LIST the reporting
 * function performs all checks and is always called.
 */
static __always_inline bool __list_del_entry_valid(struct list_head *entry)
{
        bool inline_ok = true;

        if (!IS_ENABLED(CONFIG_DEBUG_LIST)) {
                struct list_head *before = entry->prev;
                struct list_head *after = entry->next;

                /*
                 * Checks for NULL, LIST_POISON1 and LIST_POISON2 are left
                 * out in the hardening version: dereferencing such a
                 * neighbour right below would fault anyway.
                 */
                if (likely(before->next == entry && after->prev == entry))
                        return true;
                inline_ok = false;
        }

        /*
         * Always call the reporting function here; the result can only be
         * true when the inline checks did not already fail.
         */
        return __list_del_entry_valid_or_report(entry) & inline_ok;
}
#else
/*
 * Variant used when CONFIG_LIST_HARDENED is not set: no checking is done and
 * every insertion is reported as valid.
 */
static inline bool __list_add_valid(struct list_head *new,
                                struct list_head *prev,
                                struct list_head *next)
{
        return true;
}
/*
 * Variant used when CONFIG_LIST_HARDENED is not set: no checking is done and
 * every removal is reported as valid.
 */
static inline bool __list_del_entry_valid(struct list_head *entry)
{
        return true;
}
#endif

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 *
 * If __list_add_valid() rejects the insertion, the function returns without
 * modifying anything.
 */
static inline void __list_add(struct list_head *new,
                              struct list_head *prev,
                              struct list_head *next)
{
        if (!__list_add_valid(new, prev, next))
                return;

        /*
         * @new is fully linked (both of its pointers set, next->prev updated)
         * before it is stored into prev->next, and that last store is the
         * only one done with WRITE_ONCE().
         */
        next->prev = new;
        new->next = next;
        new->prev = prev;
        WRITE_ONCE(prev->next, new);
}

/**
 * list_add - add a new entry
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Links @new directly after @head, i.e. between @head and the entry that
 * used to follow it.  This is good for implementing stacks.
 */
static inline void list_add(struct list_head *new, struct list_head *head)
{
        struct list_head *old_first = head->next;

        __list_add(new, head, old_first);
}


/**
 * list_add_tail - add a new entry
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Links @new directly before @head, i.e. between the entry that used to
 * precede @head and @head itself.  This is useful for implementing queues.
 */
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
        struct list_head *old_last = head->prev;

        __list_add(new, old_last, head);
}

/*
 * Delete a list entry by making the prev/next entries
 * point to each other.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 *
 * The entry being removed is not touched here; only its two neighbours are
 * rewritten.
 */
static inline void __list_del(struct list_head * prev, struct list_head * next)
{
        next->prev = prev;
        /* The forward link is the store done with WRITE_ONCE(). */
        WRITE_ONCE(prev->next, next);
}

/*
 * Unlink @entry from its list and set its 'prev' pointer to NULL.
 *
 * This is a special-purpose list clearing method used in the networking code
 * for lists allocated as per-cpu, where we don't want to incur the extra
 * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this
 * needs to check the node 'prev' pointer instead of calling list_empty().
 * The entry's 'next' pointer is left as it was.
 */
static inline void __list_del_clearprev(struct list_head *entry)
{
        struct list_head *before = entry->prev;
        struct list_head *after = entry->next;

        __list_del(before, after);
        entry->prev = NULL;
}

/*
 * Unlink @entry from its list, provided __list_del_entry_valid() accepts it.
 * The entry's own next/prev pointers are not modified; when validation
 * fails, nothing is modified at all.
 */
static inline void __list_del_entry(struct list_head *entry)
{
        if (__list_del_entry_valid(entry))
                __list_del(entry->prev, entry->next);
}

/**
 * list_del - deletes entry from list.
 * @entry: the element to delete from the list.
 * Note: list_empty() on entry does not return true after this, the entry is
 * in an undefined state.
 */
static inline void list_del(struct list_head *entry)
{
        __list_del_entry(entry);
        /*
         * Overwrite the stale links with the poison values.  This happens
         * unconditionally, i.e. also when __list_del_entry() declined to
         * unlink the entry because validation failed.
         */
        entry->next = LIST_POISON1;
        entry->prev = LIST_POISON2;
}

/**
 * list_replace - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * If @old was empty, it will be overwritten.
 *
 * @old's own next/prev pointers are not reset by this function.
 */
static inline void list_replace(struct list_head *old,
                                struct list_head *new)
{
        /*
         * The statement order matters: old->prev is read only after
         * new->next->prev has been written.  When @old is an empty
         * (self-linked) head, that earlier store hits old->prev, so @new
         * ends up linked to itself, i.e. as an empty list.
         */
        new->next = old->next;
        new->next->prev = new;
        new->prev = old->prev;
        new->prev->next = new;
}

/**
 * list_replace_init - replace old entry by new one and initialize the old one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * If @old was empty, it will be overwritten.
 *
 * Afterwards @old is reinitialized to point to itself.
 */
static inline void list_replace_init(struct list_head *old,
                                     struct list_head *new)
{
        list_replace(old, new);
        /* list_replace() leaves @old's pointers stale; reset them. */
        INIT_LIST_HEAD(old);
}

/**
 * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position
 * @entry1: the location to place entry2
 * @entry2: the location to place entry1
 */
static inline void list_swap(struct list_head *entry1,
                             struct list_head *entry2)
{
        /* Remember where entry2 was: entry1 will be re-inserted after this. */
        struct list_head *pos = entry2->prev;

        list_del(entry2);
        /* entry2 now occupies the position entry1 had. */
        list_replace(entry1, entry2);
        /*
         * If entry1 was entry2's predecessor, the remembered position is
         * entry1 itself, which is no longer on the list; entry2 has taken
         * its place, so insert after entry2 instead.
         */
        if (pos == entry1)
                pos = entry2;
        list_add(entry1, pos);
}

/**
 * list_del_init - deletes entry from list and reinitialize it.
 * @entry: the element to delete from the list.
 *
 * Unlike list_del(), the entry is left pointing to itself rather than
 * poisoned.
 */
static inline void list_del_init(struct list_head *entry)
{
        __list_del_entry(entry);
        INIT_LIST_HEAD(entry);
}

/**
 * list_move - delete from one list and add as another's head
 * @list: the entry to move
 * @head: the head that will precede our entry
 *
 * Unlinks @list from wherever it currently is and inserts it directly
 * after @head.
 */
static inline void list_move(struct list_head *list, struct list_head *head)
{
        __list_del_entry(list);
        list_add(list, head);
}

/**
 * list_move_tail - delete from one list and add as another's tail
 * @list: the entry to move
 * @head: the head that will follow our entry
 *
 * Unlinks @list from wherever it currently is and inserts it directly
 * before @head.
 */
static inline void list_move_tail(struct list_head *list,
                                  struct list_head *head)
{
        __list_del_entry(list);
        list_add_tail(list, head);
}

/**
 * list_bulk_move_tail - move a subsection of a list to its tail
 * @head: the head that will follow our entry
 * @first: first entry to move
 * @last: last entry to move, can be the same as first
 *
 * Move all entries between @first and including @last before @head.
 * All three entries must belong to the same linked list.
 */
static inline void list_bulk_move_tail(struct list_head *head,
                                       struct list_head *first,
                                       struct list_head *last)
{
        /* Close the gap: link the neighbours of [first, last] together. */
        first->prev->next = last->next;
        last->next->prev = first->prev;

        /* Attach @first behind the entry that currently precedes @head. */
        head->prev->next = first;
        first->prev = head->prev;

        /* Attach @last directly in front of @head. */
        last->next = head;
        head->prev = last;
}

/**
 * list_is_first - tests whether @list is the first entry in list @head
 * @list: the entry to test
 * @head: the head of the list
 *
 * Return: non-zero if the entry preceding @list is @head, zero otherwise.
 */
static inline int list_is_first(const struct list_head *list, const struct list_head *head)
{
        const struct list_head *before = list->prev;

        return before == head;
}

/**
 * list_is_last - tests whether @list is the last entry in list @head
 * @list: the entry to test
 * @head: the head of the list
 *
 * Return: non-zero if the entry following @list is @head, zero otherwise.
 */
static inline int list_is_last(const struct list_head *list, const struct list_head *head)
{
        const struct list_head *after = list->next;

        return after == head;
}

/**
 * list_is_head - tests whether @list is the list @head
 * @list: the entry to test
 * @head: the head of the list
 *
 * This is a plain pointer comparison; neither argument is dereferenced.
 *
 * Return: non-zero if @list and @head are the same pointer, zero otherwise.
 */
static inline int list_is_head(const struct list_head *list, const struct list_head *head)
{
        return list == head;
}

/**
 * list_empty - tests whether a list is empty
 * @head: the list to test.
 *
 * Return: non-zero if @head's next pointer refers back to @head itself,
 * zero otherwise.  The pointer is read once with READ_ONCE().
 */
static inline int list_empty(const struct list_head *head)
{
        const struct list_head *first = READ_ONCE(head->next);

        return first == head;
}

/**
 * list_del_init_careful - deletes entry from list and reinitialize it.
 * @entry: the element to delete from the list.
 *
 * This is the same as list_del_init(), except designed to be used
 * together with list_empty_careful() in a way to guarantee ordering
 * of other memory operations.
 *
 * Any memory operations done before a list_del_init_careful() are
 * guaranteed to be visible after a list_empty_careful() test.
 */
static inline void list_del_init_careful(struct list_head *entry)
{
        __list_del_entry(entry);
        WRITE_ONCE(entry->prev, entry);
        /*
         * ->next is written last, with release semantics; it is the field
         * list_empty_careful() loads with smp_load_acquire().
         */
        smp_store_release(&entry->next, entry);
}

/**
 * list_empty_careful - tests whether a list is empty and not being modified
 * @head: the list to test
 *
 * Description:
 * tests whether a list is empty _and_ checks that no other CPU might be
 * in the process of modifying either member (next or prev)
 *
 * NOTE: using list_empty_careful() without synchronization
 * can only be safe if the only activity that can happen
 * to the list entry is list_del_init(). Eg. it cannot be used
 * if another CPU could re-list_add() it.
 */
static inline int list_empty_careful(const struct list_head *head)
{
        /*
         * Acquire load of ->next, the field list_del_init_careful() writes
         * with smp_store_release().
         */
        struct list_head *next = smp_load_acquire(&head->next);
        /* Empty only if both ->next and ->prev point back at @head. */
        return list_is_head(next, head) && (next == READ_ONCE(head->prev));
}

/**
 * list_rotate_left - rotate the list to the left
 * @head: the head of the list
 *
 * Moves the first entry of the list to its tail.  An empty list is left
 * untouched.
 */
static inline void list_rotate_left(struct list_head *head)
{
        if (list_empty(head))
                return;

        list_move_tail(head->next, head);
}

/**
 * list_rotate_to_front() - Rotate list to specific item.
 * @list: The desired new front of the list.
 * @head: The head of the list.
 *
 * Rotates list so that @list becomes the new front of the list.
 */
static inline void list_rotate_to_front(struct list_head *list,
                                        struct list_head *head)
{
        /*
         * Deletes the list head from the list denoted by @head and
         * places it as the tail of @list, this effectively rotates the
         * list so that @list is at the front.
         *
         * Note the argument order: it is @head that gets moved, and it is
         * inserted directly before @list.
         */
        list_move_tail(head, list);
}

/**
 * list_is_singular - tests whether a list has just one entry.
 * @head: the list to test.
 *
 * Return: non-zero if the list is not empty and @head's next and prev
 * pointers refer to the same entry, zero otherwise.
 */
static inline int list_is_singular(const struct list_head *head)
{
        if (list_empty(head))
                return 0;

        return head->next == head->prev;
}

/*
 * Move the entries from the first entry of @head up to and including @entry
 * onto @list, leaving the remainder on @head.  No sanity checks are made
 * here; list_cut_position() performs them before calling this helper.
 * Whatever @list pointed to before is overwritten.
 */
static inline void __list_cut_position(struct list_head *list,
                struct list_head *head, struct list_head *entry)
{
        /* First entry that stays on @head. */
        struct list_head *new_first = entry->next;
        /* @list takes over @head's first entry ... */
        list->next = head->next;
        list->next->prev = list;
        /* ... and ends at @entry. */
        list->prev = entry;
        entry->next = list;
        /* @head now starts at what followed @entry. */
        head->next = new_first;
        new_first->prev = head;
}

/**
 * list_cut_position - cut a list into two
 * @list: a new list to add all removed entries
 * @head: a list with entries
 * @entry: an entry within head, could be the head itself
 *        and if so we won't cut the list
 *
 * Moves the initial part of @head, up to and including @entry, from @head
 * to @list.  @entry should be an element known to be on @head.  @list
 * should be an empty list or a list whose contents you do not mind losing.
 *
 * Nothing is done when @head is empty, or when @head holds a single entry
 * and @entry is neither that entry nor @head.  When @entry is @head itself,
 * @list is merely initialized to an empty list.
 */
static inline void list_cut_position(struct list_head *list,
                struct list_head *head, struct list_head *entry)
{
        if (list_empty(head))
                return;

        /* Cutting at the head itself moves nothing. */
        if (list_is_head(entry, head)) {
                INIT_LIST_HEAD(list);
                return;
        }

        /* A one-entry list can only be cut at that one entry. */
        if (list_is_singular(head) && entry != head->next)
                return;

        __list_cut_position(list, head, entry);
}

/**
 * list_cut_before - cut a list into two, before given entry
 * @list: a new list to add all removed entries
 * @head: a list with entries
 * @entry: an entry within head, could be the head itself
 *
 * This helper moves the initial part of @head, up to but
 * excluding @entry, from @head to @list.  You should pass
 * in @entry an element you know is on @head.  @list should
 * be an empty list or a list you do not care about losing
 * its data.
 * If @entry == @head, all entries on @head are moved to
 * @list.
 */
static inline void list_cut_before(struct list_head *list,
                                   struct list_head *head,
                                   struct list_head *entry)
{
        /* Nothing precedes @entry (this also covers an empty @head). */
        if (head->next == entry) {
                INIT_LIST_HEAD(list);
                return;
        }
        /* @list takes over @head's first entry ... */
        list->next = head->next;
        list->next->prev = list;
        /* ... and ends at the entry just before @entry. */
        list->prev = entry->prev;
        list->prev->next = list;
        /* @head now starts at @entry. */
        head->next = entry;
        entry->prev = head;
}

/*
 * Link the chain of entries held by @list (from its first to its last
 * entry) between @prev and @next.  @list itself is not written to, so its
 * pointers are stale afterwards.  The function does not check whether @list
 * is empty; the list_splice*() callers do so before calling it.
 */
static inline void __list_splice(const struct list_head *list,
                                 struct list_head *prev,
                                 struct list_head *next)
{
        struct list_head *first = list->next;
        struct list_head *last = list->prev;

        /* Hook the front of the chain behind @prev. */
        first->prev = prev;
        prev->next = first;

        /* Hook the back of the chain in front of @next. */
        last->next = next;
        next->prev = last;
}

/**
 * list_splice - join two lists, this is designed for stacks
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * Inserts the entries of @list directly after @head.  Nothing happens when
 * @list is empty.  @list itself is not reinitialized.
 */
static inline void list_splice(const struct list_head *list,
                                struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head, head->next);
}

/**
 * list_splice_tail - join two lists, each list being a queue
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * Inserts the entries of @list directly before @head.  Nothing happens when
 * @list is empty.  @list itself is not reinitialized.
 */
static inline void list_splice_tail(struct list_head *list,
                                struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head->prev, head);
}

/**
 * list_splice_init - join two lists and reinitialise the emptied list.
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * Inserts the entries of @list directly after @head and then reinitialises
 * @list as an empty list.  Nothing happens when @list is already empty.
 */
static inline void list_splice_init(struct list_head *list,
                                    struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head, head->next);
        INIT_LIST_HEAD(list);
}

/**
 * list_splice_tail_init - join two lists and reinitialise the emptied list
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * Each of the lists is a queue.  Inserts the entries of @list directly
 * before @head and then reinitialises @list as an empty list.  Nothing
 * happens when @list is already empty.
 */
static inline void list_splice_tail_init(struct list_head *list,
                                         struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head->prev, head);
        INIT_LIST_HEAD(list);
}

/**
 * list_entry - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Expands to container_of(@ptr, @type, @member).
 */
#define list_entry(ptr, type, member) \
        container_of(ptr, type, member)

/**
 * list_first_entry - structure holding the first element of a list
 * @ptr:        the list head to take the element from.
 * @type:        type of the structure the list_head is embedded in.
 * @member:        name of the list_head field within that structure.
 *
 * Resolves (@ptr)->next with list_entry(). No emptiness check is made, so
 * the list is expected to be non-empty.
 */
#define list_first_entry(ptr, type, member) list_entry((ptr)->next, type, member)

/**
 * list_last_entry - structure holding the last element of a list
 * @ptr:        the list head to take the element from.
 * @type:        type of the structure the list_head is embedded in.
 * @member:        name of the list_head field within that structure.
 *
 * Resolves (@ptr)->prev with list_entry(). No emptiness check is made, so
 * the list is expected to be non-empty.
 */
#define list_last_entry(ptr, type, member) list_entry((ptr)->prev, type, member)

/**
 * list_first_entry_or_null - first element of a list, or NULL when empty
 * @ptr:        the list head to take the element from.
 * @type:        type of the structure the list_head is embedded in.
 * @member:        name of the list_head field within that structure.
 *
 * @ptr is evaluated once. The head's ->next pointer is sampled once with
 * READ_ONCE(); if it points back at the head the list is empty and NULL is
 * returned, otherwise the containing structure of that node is returned.
 */
#define list_first_entry_or_null(ptr, type, member) ({                        \
        struct list_head *head__ = (ptr);                                \
        struct list_head *pos__ = READ_ONCE(head__->next);                \
                                                                        \
        pos__ == head__ ? NULL : list_entry(pos__, type, member);        \
})

/**
 * list_next_entry - structure that follows @pos in its list
 * @pos:        pointer to the current structure (the cursor).
 * @member:        name of the list_head field within the structure.
 *
 * Follows (@pos)->@member.next and converts it with list_entry(), using the
 * pointed-to type of @pos as the structure type.
 */
#define list_next_entry(pos, member)                                        \
        list_entry((pos)->member.next, typeof(*(pos)), member)

/**
 * list_next_entry_circular - next element, wrapping past the list head
 * @pos:        pointer to the current structure (the cursor).
 * @head:        the list head the elements hang off.
 * @member:        name of the list_head field within the structure.
 *
 * When @pos is the last element (per list_is_last()) the first element is
 * returned, otherwise the element after @pos. The list is expected to be
 * non-empty.
 */
#define list_next_entry_circular(pos, head, member)                        \
        (list_is_last(&(pos)->member, head)                                \
                ? list_first_entry(head, typeof(*(pos)), member)        \
                : list_next_entry(pos, member))

/**
 * list_prev_entry - structure that precedes @pos in its list
 * @pos:        pointer to the current structure (the cursor).
 * @member:        name of the list_head field within the structure.
 *
 * Follows (@pos)->@member.prev and converts it with list_entry(), using the
 * pointed-to type of @pos as the structure type.
 */
#define list_prev_entry(pos, member)                                        \
        list_entry((pos)->member.prev, typeof(*(pos)), member)

/**
 * list_prev_entry_circular - previous element, wrapping past the list head
 * @pos:        pointer to the current structure (the cursor).
 * @head:        the list head the elements hang off.
 * @member:        name of the list_head field within the structure.
 *
 * When @pos is the first element (per list_is_first()) the last element is
 * returned, otherwise the element before @pos. The list is expected to be
 * non-empty.
 */
#define list_prev_entry_circular(pos, head, member)                        \
        (list_is_first(&(pos)->member, head)                                \
                ? list_last_entry(head, typeof(*(pos)), member)                \
                : list_prev_entry(pos, member))

/**
 * list_for_each        -        iterate over a list
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Starts at (@head)->next and follows ->next until list_is_head() reports
 * that the cursor is back at @head. @pos is parenthesized at every use so
 * that any lvalue expression (for example ``*slot``) works as the cursor.
 */
#define list_for_each(pos, head) \
        for ((pos) = (head)->next; !list_is_head(pos, (head)); (pos) = (pos)->next)

/**
 * list_for_each_reverse - iterate backwards over a list
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Starts at (@head)->prev and follows ->prev until the cursor equals @head.
 * @pos is parenthesized at every use so that any lvalue expression (for
 * example ``*slot``) works as the cursor.
 */
#define list_for_each_reverse(pos, head) \
        for ((pos) = (head)->prev; (pos) != (head); (pos) = (pos)->prev)

/**
 * list_for_each_rcu - Iterate over a list in an RCU-safe fashion
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Same walk as list_for_each(), except that every ->next pointer is loaded
 * through rcu_dereference(). @pos is parenthesized at every use so that any
 * lvalue expression (for example ``*slot``) works as the cursor.
 */
#define list_for_each_rcu(pos, head)                  \
        for ((pos) = rcu_dereference((head)->next); \
             !list_is_head(pos, (head)); \
             (pos) = rcu_dereference((pos)->next))

/**
 * list_for_each_continue - continue iteration over a list
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Continue to iterate over a list, continuing after the current position:
 * the first element visited is (@pos)->next. @pos is parenthesized at every
 * use so that any lvalue expression (for example ``*slot``) works as the
 * cursor.
 */
#define list_for_each_continue(pos, head) \
        for ((pos) = (pos)->next; !list_is_head(pos, (head)); (pos) = (pos)->next)

/**
 * list_for_each_prev        -        iterate over a list backwards
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Starts at (@head)->prev and follows ->prev until list_is_head() reports
 * that the cursor is back at @head. @pos is parenthesized at every use so
 * that any lvalue expression (for example ``*slot``) works as the cursor.
 */
#define list_for_each_prev(pos, head) \
        for ((pos) = (head)->prev; !list_is_head(pos, (head)); (pos) = (pos)->prev)

/**
 * list_for_each_safe - iterate over a list safe against removal of list entry
 * @pos:        the &struct list_head to use as a loop cursor.
 * @n:                another &struct list_head to use as temporary storage
 * @head:        the head for your list.
 *
 * The successor of @pos is saved in @n before the loop body runs, so the
 * body may unlink or overwrite the links of @pos. @pos and @n are
 * parenthesized at every use so that any lvalue expressions work.
 */
#define list_for_each_safe(pos, n, head) \
        for ((pos) = (head)->next, (n) = (pos)->next; \
             !list_is_head(pos, (head)); \
             (pos) = (n), (n) = (pos)->next)

/**
 * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
 * @pos:        the &struct list_head to use as a loop cursor.
 * @n:                another &struct list_head to use as temporary storage
 * @head:        the head for your list.
 *
 * The predecessor of @pos is saved in @n before the loop body runs, so the
 * body may unlink or overwrite the links of @pos. @pos and @n are
 * parenthesized at every use so that any lvalue expressions work.
 */
#define list_for_each_prev_safe(pos, n, head) \
        for ((pos) = (head)->prev, (n) = (pos)->prev; \
             !list_is_head(pos, (head)); \
             (pos) = (n), (n) = (pos)->prev)

/**
 * list_count_nodes - count nodes in the list
 * @head:        the head for your list.
 */
static inline size_t list_count_nodes(struct list_head *head)
{
        struct list_head *pos;
        size_t count = 0;

        list_for_each(pos, head)
                count++;

        return count;
}

/**
 * list_entry_is_head - test if the entry points to the head of the list
 * @pos:        the type * to cursor
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Compares the address of @pos's embedded list_head with @head through
 * list_is_head(). @pos is parenthesized so that any pointer expression
 * (for example ``*slot``) can be passed.
 */
#define list_entry_is_head(pos, head, member)                                \
        list_is_head(&(pos)->member, (head))

/**
 * list_for_each_entry        -        iterate over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Visits every containing structure from the first entry onwards and stops
 * once list_entry_is_head() reports that the cursor wrapped around to @head.
 */
#define list_for_each_entry(pos, head, member)                                \
        for ((pos) = list_first_entry(head, typeof(*(pos)), member);        \
             !list_entry_is_head(pos, head, member);                        \
             (pos) = list_next_entry(pos, member))

/**
 * list_for_each_entry_reverse - iterate backwards over list of given type.
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Visits every containing structure from the last entry backwards and stops
 * once list_entry_is_head() reports that the cursor wrapped around to @head.
 */
#define list_for_each_entry_reverse(pos, head, member)                        \
        for ((pos) = list_last_entry(head, typeof(*(pos)), member);        \
             !list_entry_is_head(pos, head, member);                        \
             (pos) = list_prev_entry(pos, member))

/**
 * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue()
 * @pos:        the type * to use as a start point
 * @head:        the head of the list
 * @member:        the name of the list_head within the struct.
 *
 * Prepares a pos entry for use as a start point in list_for_each_entry_continue().
 * A non-NULL @pos is returned as is; a NULL @pos is replaced by the entry
 * that list_entry() computes for @head itself, so that continuing from it
 * starts at the first element.
 *
 * The "?:" form with the middle operand omitted is a GNU extension that
 * evaluates @pos only once. @pos is parenthesized inside typeof() too, so a
 * compound expression such as ``cond ? a : b`` yields the right type.
 */
#define list_prepare_entry(pos, head, member) \
        ((pos) ? : list_entry(head, typeof(*(pos)), member))

/**
 * list_for_each_entry_continue - continue iteration over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Resumes a walk after the current position: the first entry visited is the
 * one following @pos, and the walk stops when the cursor reaches @head.
 */
#define list_for_each_entry_continue(pos, head, member)                 \
        for ((pos) = list_next_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                        \
             (pos) = list_next_entry(pos, member))

/**
 * list_for_each_entry_continue_reverse - iterate backwards from the given point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Resumes a backwards walk after the current position: the first entry
 * visited is the one preceding @pos, and the walk stops when the cursor
 * reaches @head.
 */
#define list_for_each_entry_continue_reverse(pos, head, member)                \
        for ((pos) = list_prev_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                        \
             (pos) = list_prev_entry(pos, member))

/**
 * list_for_each_entry_from - iterate over list of given type from the current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Walks forward starting with @pos itself (there is no init clause), and
 * stops when the cursor reaches @head.
 */
#define list_for_each_entry_from(pos, head, member)                         \
        for (;                                                                \
             !list_entry_is_head(pos, head, member);                        \
             (pos) = list_next_entry(pos, member))

/**
 * list_for_each_entry_from_reverse - iterate backwards over list of given type
 *                                    from the current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Walks backwards starting with @pos itself (there is no init clause), and
 * stops when the cursor reaches @head.
 */
#define list_for_each_entry_from_reverse(pos, head, member)                \
        for (;                                                                \
             !list_entry_is_head(pos, head, member);                        \
             (pos) = list_prev_entry(pos, member))

/**
 * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * The entry following @pos is stored in @n before the body runs, so the
 * body may unlink @pos; the next iteration continues from @n.
 */
#define list_for_each_entry_safe(pos, n, head, member)                        \
        for ((pos) = list_first_entry(head, typeof(*(pos)), member),        \
             (n) = list_next_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                        \
             (pos) = (n), (n) = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_continue - continue list iteration safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Resumes after the current position (the first entry visited is the one
 * following @pos). The successor is kept in @n, so the body may unlink the
 * entry it is handed.
 */
#define list_for_each_entry_safe_continue(pos, n, head, member)                 \
        for ((pos) = list_next_entry(pos, member),                         \
             (n) = list_next_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                        \
             (pos) = (n), (n) = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_from - iterate over list from current point safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Starts with @pos itself. The successor is kept in @n, so the body may
 * unlink the entry it is handed.
 */
#define list_for_each_entry_safe_from(pos, n, head, member)                 \
        for ((n) = list_next_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                        \
             (pos) = (n), (n) = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Walks from the last entry backwards. The predecessor is kept in @n, so
 * the body may unlink the entry it is handed.
 */
#define list_for_each_entry_safe_reverse(pos, n, head, member)                \
        for ((pos) = list_last_entry(head, typeof(*(pos)), member),        \
             (n) = list_prev_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                        \
             (pos) = (n), (n) = list_prev_entry(n, member))

/**
 * list_safe_reset_next - reset a stale list_for_each_entry_safe loop
 * @pos:        the loop cursor used in the list_for_each_entry_safe loop
 * @n:                temporary storage used in list_for_each_entry_safe
 * @member:        the name of the list_head within the struct.
 *
 * Re-reads the entry following @pos into @n.
 *
 * list_safe_reset_next is not safe to use in general if the list may be
 * modified concurrently (eg. the lock is dropped in the loop body). An
 * exception to this is if the cursor element (pos) is pinned in the list,
 * and list_safe_reset_next is called after re-taking the lock and before
 * completing the current iteration of the loop body.
 *
 * The expansion is a single parenthesized assignment expression, so it
 * cannot regroup with operators that surround the macro call.
 */
#define list_safe_reset_next(pos, n, member)                                \
        ((n) = list_next_entry(pos, member))

/*
 * Double linked lists with a single pointer list head.
 * Mostly useful for hash tables where the two pointer list head is
 * too wasteful.
 * You lose the ability to access the tail in O(1).
 */

/* Static initializer for an empty hlist head. */
#define HLIST_HEAD_INIT                { .first = NULL }
/* Define a struct hlist_head variable called @name, initialised empty. */
#define HLIST_HEAD(name)        struct hlist_head name = HLIST_HEAD_INIT
/* Run-time counterpart: make the head that @ptr points to empty. */
#define INIT_HLIST_HEAD(ptr)        ((ptr)->first = NULL)
/*
 * INIT_HLIST_NODE - put an hlist node into the unlinked state.
 * @h: node to initialise.
 *
 * Clears both link fields. A NULL ->pprev is the condition that
 * hlist_unhashed() tests for.
 */
static inline void INIT_HLIST_NODE(struct hlist_node *h)
{
        h->next = NULL;
        h->pprev = NULL;
}

/**
 * hlist_unhashed - Has node been removed from list and reinitialized?
 * @h: Node to be checked
 *
 * A node counts as unhashed when its ->pprev pointer is NULL.
 *
 * Note that not all removal functions will leave a node in unhashed
 * state.  For example, hlist_nulls_del_init_rcu() does leave the
 * node in unhashed state, but hlist_nulls_del() does not.
 *
 * Return: 1 if @h is unhashed, 0 otherwise.
 */
static inline int hlist_unhashed(const struct hlist_node *h)
{
        return h->pprev == NULL;
}

/**
 * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use
 * @h: Node to be checked
 *
 * Same test as hlist_unhashed(), but ->pprev is loaded with READ_ONCE().
 * This variant must be used in lockless contexts to avoid potential
 * load-tearing; the READ_ONCE() pairs with the WRITE_ONCE() stores done by
 * the hlist helpers defined below.
 *
 * Return: 1 if @h is unhashed, 0 otherwise.
 */
static inline int hlist_unhashed_lockless(const struct hlist_node *h)
{
        return READ_ONCE(h->pprev) == NULL;
}

/**
 * hlist_empty - Is the specified hlist_head structure an empty hlist?
 * @h: Structure to check.
 *
 * The head's ->first pointer is loaded once with READ_ONCE().
 *
 * Return: 1 if ->first is NULL, 0 otherwise.
 */
static inline int hlist_empty(const struct hlist_head *h)
{
        return READ_ONCE(h->first) == NULL;
}

/*
 * __hlist_del - unlink @n from its hlist, leaving @n's own fields untouched.
 * @n: node to unlink. n->pprev is dereferenced unconditionally, so it must
 *     point at a valid link slot.
 *
 * Callers in this file follow up by poisoning (hlist_del()) or
 * reinitialising (hlist_del_init()) the node.
 */
static inline void __hlist_del(struct hlist_node *n)
{
        struct hlist_node *next = n->next;
        struct hlist_node **pprev = n->pprev;

        /* Whatever pointed at @n now points at @n's successor. */
        WRITE_ONCE(*pprev, next);
        /* @n may have been the last node, in which case there is no successor to fix up. */
        if (next)
                WRITE_ONCE(next->pprev, pprev);
}

/**
 * hlist_del - Delete the specified hlist_node from its list
 * @n: Node to delete.
 *
 * Unlinks @n with __hlist_del() and then overwrites both of its link
 * fields with the LIST_POISON1 / LIST_POISON2 values.
 *
 * Note that this function leaves the node in hashed state.  Use
 * hlist_del_init() or similar instead to unhash @n.
 */
static inline void hlist_del(struct hlist_node *n)
{
        __hlist_del(n);
        /* NOTE(review): the poison constants are defined elsewhere; presumably they make stale use of @n fault. */
        n->next = LIST_POISON1;
        n->pprev = LIST_POISON2;
}

/**
 * hlist_del_init - Delete the specified hlist_node from its list and initialize
 * @n: Node to delete.
 *
 * A node that is already unhashed is left alone. Otherwise it is unlinked
 * and reset with INIT_HLIST_NODE(), so this function leaves the node in
 * unhashed state.
 */
static inline void hlist_del_init(struct hlist_node *n)
{
        if (hlist_unhashed(n))
                return;

        __hlist_del(n);
        INIT_HLIST_NODE(n);
}

/**
 * hlist_add_head - add a new entry at the beginning of the hlist
 * @n: new entry to be added
 * @h: hlist head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 */
static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
{
        struct hlist_node *first = h->first;
        /* @n's forward link is set up before @n is stored in the head. */
        WRITE_ONCE(n->next, first);
        /* The old first node (if any) is now reached through n->next. */
        if (first)
                WRITE_ONCE(first->pprev, &n->next);
        WRITE_ONCE(h->first, n);
        /* @n itself is reached through the head's ->first slot. */
        WRITE_ONCE(n->pprev, &h->first);
}

/**
 * hlist_add_before - add a new entry before the one specified
 * @n: new entry to be added
 * @next: hlist node to add it before, which must be non-NULL
 */
static inline void hlist_add_before(struct hlist_node *n,
                                    struct hlist_node *next)
{
        /* @n takes over the slot that used to point at @next ... */
        WRITE_ONCE(n->pprev, next->pprev);
        WRITE_ONCE(n->next, next);
        /* ... @next is now reached through n->next ... */
        WRITE_ONCE(next->pprev, &n->next);
        /* ... and finally that slot is redirected to @n. */
        WRITE_ONCE(*(n->pprev), n);
}

/**
 * hlist_add_behind - add a new entry after the one specified
 * @n: new entry to be added
 * @prev: hlist node to add it after, which must be non-NULL
 */
static inline void hlist_add_behind(struct hlist_node *n,
                                    struct hlist_node *prev)
{
        /* @n inherits @prev's successor, then @prev is pointed at @n. */
        WRITE_ONCE(n->next, prev->next);
        WRITE_ONCE(prev->next, n);
        WRITE_ONCE(n->pprev, &prev->next);

        /* If there is a successor, it is now reached through n->next. */
        if (n->next)
                WRITE_ONCE(n->next->pprev, &n->next);
}

/**
 * hlist_add_fake - create a fake hlist consisting of a single headless node
 * @n: Node to make a fake list out of
 *
 * This makes @n appear to be its own predecessor on a headless hlist.
 * The point of this is to allow things like hlist_del() to work correctly
 * in cases where there is no list.
 */
static inline void hlist_add_fake(struct hlist_node *n)
{
        /* Self-reference: this is the exact condition hlist_fake() checks. */
        n->pprev = &n->next;
}

/**
 * hlist_fake - Is this node a fake hlist?
 * @h: Node to check for being a self-referential fake hlist.
 *
 * Return: true when @h's ->pprev points at its own ->next field, which is
 * the state hlist_add_fake() sets up; false otherwise.
 */
static inline bool hlist_fake(struct hlist_node *h)
{
        return &h->next == h->pprev;
}

/**
 * hlist_is_singular_node - is node the only element of the specified hlist?
 * @n: Node to check for singularity.
 * @h: Header for potentially singular list.
 *
 * Only fields of @n are read; @h is used for its address alone, so the
 * head is never loaded, thus avoiding unnecessary cache misses.
 *
 * Return: true when @n has no successor and is linked directly from
 * @h->first.
 */
static inline bool
hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h)
{
        return n->next == NULL && n->pprev == &h->first;
}

/**
 * hlist_move_list - Move an hlist
 * @old: hlist_head for old list.
 * @new: hlist_head for new list.
 *
 * Move a list from one list head to another. Fixup the pprev
 * reference of the first entry if it exists.
 */
static inline void hlist_move_list(struct hlist_head *old,
                                   struct hlist_head *new)
{
        new->first = old->first;
        if (new->first)
                new->first->pprev = &new->first;
        old->first = NULL;
}

/**
 * hlist_splice_init() - move all entries from one list to another
 * @from: hlist_head from which entries will be moved
 * @last: last entry on the @from list
 * @to:   hlist_head to which entries will be moved
 *
 * @to can be empty, @from must contain at least @last.
 *
 * The entries of @from are placed in front of the entries already on @to,
 * and @from is left empty. @from->first is dereferenced without a NULL
 * check, which is why @from must not be empty.
 */
static inline void hlist_splice_init(struct hlist_head *from,
                                     struct hlist_node *last,
                                     struct hlist_head *to)
{
        /* Chain the existing @to entries (if any) behind @last. */
        if (to->first)
                to->first->pprev = &last->next;
        last->next = to->first;
        /* The first entry of @from becomes the first entry of @to. */
        to->first = from->first;
        from->first->pprev = &to->first;
        from->first = NULL;
}

/* Map an embedded hlist_node pointer back to its containing structure. */
#define hlist_entry(ptr, type, member)        container_of(ptr, type, member)

/*
 * hlist_for_each - iterate over the nodes of an hlist
 * @pos:        the &struct hlist_node to use as a loop cursor.
 * @head:        the &struct hlist_head to walk.
 *
 * Follows ->next from (@head)->first until it is NULL. @pos is
 * parenthesized at every use so that any lvalue expression works.
 */
#define hlist_for_each(pos, head) \
        for ((pos) = (head)->first; (pos); (pos) = (pos)->next)

/*
 * hlist_for_each_safe - iterate over an hlist safe against removal of nodes
 * @pos:        the &struct hlist_node to use as a loop cursor.
 * @n:                another &struct hlist_node to use as temporary storage.
 * @head:        the &struct hlist_head to walk.
 *
 * The successor of @pos is saved in @n as part of the loop condition, i.e.
 * before the body runs, so the body may unlink @pos. @pos and @n are
 * parenthesized at every use so that any lvalue expressions work.
 */
#define hlist_for_each_safe(pos, n, head) \
        for ((pos) = (head)->first; (pos) && ({ (n) = (pos)->next; 1; }); \
             (pos) = (n))

/*
 * hlist_entry_safe - NULL-tolerant variant of hlist_entry()
 *
 * @ptr is evaluated exactly once. A NULL @ptr yields NULL, anything else is
 * converted to its containing structure.
 */
#define hlist_entry_safe(ptr, type, member)                                \
        ({                                                                \
                typeof(ptr) ____ptr = (ptr);                                \
                                                                        \
                ____ptr ? hlist_entry(____ptr, type, member) : NULL;        \
        })

/**
 * hlist_for_each_entry        - iterate over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * Visits the containing structure of every node on @head. The cursor is
 * NULL once the loop has run to completion.
 */
#define hlist_for_each_entry(pos, head, member)                                \
        for ((pos) = hlist_entry_safe((head)->first, typeof(*(pos)), member); \
             (pos);                                                        \
             (pos) = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * The first entry visited is the one following @pos; @pos itself is
 * dereferenced to find it, so it must not be NULL on entry.
 */
#define hlist_for_each_entry_continue(pos, member)                        \
        for ((pos) = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member); \
             (pos);                                                        \
             (pos) = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_from - iterate over a hlist continuing from current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * Starts with @pos itself (there is no init clause); a NULL @pos makes the
 * loop body run zero times.
 */
#define hlist_for_each_entry_from(pos, member)                                \
        for (;                                                                \
             (pos);                                                        \
             (pos) = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:        the type * to use as a loop cursor.
 * @n:                a &struct hlist_node to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * The node following @pos is saved in @n as part of the loop condition,
 * i.e. before the body runs, so the body may unlink @pos. @pos and @n are
 * parenthesized at every use so that any lvalue expressions (for example
 * ``*slot``) work.
 */
#define hlist_for_each_entry_safe(pos, n, head, member)                 \
        for ((pos) = hlist_entry_safe((head)->first, typeof(*(pos)), member); \
             (pos) && ({ (n) = (pos)->member.next; 1; });                \
             (pos) = hlist_entry_safe(n, typeof(*(pos)), member))

/**
 * hlist_count_nodes - count nodes in the hlist
 * @head:        the head for your hlist.
 */
static inline size_t hlist_count_nodes(struct hlist_head *head)
{
        struct hlist_node *pos;
        size_t count = 0;

        hlist_for_each(pos, head)
                count++;

        return count;
}

#endif














    4 

























































    4 







































































    5 






    1 






    4 




















































































    1 













    4 





































    1 






    4 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM net

#if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NET_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/tracepoint.h>

/*
 * net_dev_start_xmit - trace event taking an skb and the net_device.
 *
 * Records dev->name plus a snapshot of skb metadata: queue mapping, the skb
 * address, VLAN tag state, protocol, ip_summed, len/data_len, network and
 * transport header offsets, and the shared-info tx_flags and GSO fields.
 *
 * - vlan_proto and protocol are converted with ntohs() before being stored.
 * - transport_offset is stored as 0 when skb_transport_header_was_set()
 *   is false; transport_offset_valid records which case applied.
 */
TRACE_EVENT(net_dev_start_xmit,

        TP_PROTO(const struct sk_buff *skb, const struct net_device *dev),

        TP_ARGS(skb, dev),

        TP_STRUCT__entry(
                __string(        name,                        dev->name        )
                __field(        u16,                        queue_mapping        )
                __field(        const void *,                skbaddr                )
                __field(        bool,                        vlan_tagged        )
                __field(        u16,                        vlan_proto        )
                __field(        u16,                        vlan_tci        )
                __field(        u16,                        protocol        )
                __field(        u8,                        ip_summed        )
                __field(        unsigned int,                len                )
                __field(        unsigned int,                data_len        )
                __field(        int,                        network_offset        )
                __field(        bool,                        transport_offset_valid)
                __field(        int,                        transport_offset)
                __field(        u8,                        tx_flags        )
                __field(        u16,                        gso_size        )
                __field(        u16,                        gso_segs        )
                __field(        u16,                        gso_type        )
        ),

        TP_fast_assign(
                __assign_str(name);
                __entry->queue_mapping = skb->queue_mapping;
                __entry->skbaddr = skb;
                __entry->vlan_tagged = skb_vlan_tag_present(skb);
                __entry->vlan_proto = ntohs(skb->vlan_proto);
                __entry->vlan_tci = skb_vlan_tag_get(skb);
                __entry->protocol = ntohs(skb->protocol);
                __entry->ip_summed = skb->ip_summed;
                __entry->len = skb->len;
                __entry->data_len = skb->data_len;
                __entry->network_offset = skb_network_offset(skb);
                __entry->transport_offset_valid =
                        skb_transport_header_was_set(skb);
                __entry->transport_offset = skb_transport_header_was_set(skb) ?
                        skb_transport_offset(skb) : 0;
                __entry->tx_flags = skb_shinfo(skb)->tx_flags;
                __entry->gso_size = skb_shinfo(skb)->gso_size;
                __entry->gso_segs = skb_shinfo(skb)->gso_segs;
                __entry->gso_type = skb_shinfo(skb)->gso_type;
        ),

        TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
                  __get_str(name), __entry->queue_mapping, __entry->skbaddr,
                  __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci,
                  __entry->protocol, __entry->ip_summed, __entry->len,
                  __entry->data_len,
                  __entry->network_offset, __entry->transport_offset_valid,
                  __entry->transport_offset, __entry->tx_flags,
                  __entry->gso_size, __entry->gso_segs, __entry->gso_type)
);

/*
 * net_dev_xmit - trace event recording a transmit result.
 *
 * Stores the skb address, the caller-supplied @skb_len and @rc, and
 * dev->name. The length is taken from the @skb_len argument rather than
 * read from @skb; @skb is only recorded as an address and never
 * dereferenced here.
 */
TRACE_EVENT(net_dev_xmit,

        TP_PROTO(struct sk_buff *skb,
                 int rc,
                 struct net_device *dev,
                 unsigned int skb_len),

        TP_ARGS(skb, rc, dev, skb_len),

        TP_STRUCT__entry(
                __field(        void *,                skbaddr                )
                __field(        unsigned int,        len                )
                __field(        int,                rc                )
                __string(        name,                dev->name        )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->len = skb_len;
                __entry->rc = rc;
                __assign_str(name);
        ),

        TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
                __get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
);

/*
 * net_dev_xmit_timeout - trace event taking a net_device and a queue index.
 *
 * Records dev->name, the string returned by netdev_drivername(dev) and
 * @queue_index (printed as "queue=").
 */
TRACE_EVENT(net_dev_xmit_timeout,

        TP_PROTO(struct net_device *dev,
                 int queue_index),

        TP_ARGS(dev, queue_index),

        TP_STRUCT__entry(
                __string(        name,                dev->name        )
                __string(        driver,                netdev_drivername(dev))
                __field(        int,                queue_index        )
        ),

        TP_fast_assign(
                __assign_str(name);
                __assign_str(driver);
                __entry->queue_index = queue_index;
        ),

        TP_printk("dev=%s driver=%s queue=%d",
                __get_str(name), __get_str(driver), __entry->queue_index)
);

/*
 * net_dev_template - event class shared by the skb-only events defined
 * below with DEFINE_EVENT().
 *
 * Records the skb address, skb->len and skb->dev->name. skb->dev is
 * dereferenced without a check, so it must be set when an event of this
 * class fires.
 */
DECLARE_EVENT_CLASS(net_dev_template,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb),

        TP_STRUCT__entry(
                __field(        void *,                skbaddr                )
                __field(        unsigned int,        len                )
                __string(        name,                skb->dev->name        )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->len = skb->len;
                __assign_str(name);
        ),

        TP_printk("dev=%s skbaddr=%p len=%u",
                __get_str(name), __entry->skbaddr, __entry->len)
)

/* net_dev_queue - instance of the net_dev_template class (dev, skbaddr, len). */
DEFINE_EVENT(net_dev_template, net_dev_queue,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb)
);

/* netif_receive_skb - instance of the net_dev_template class (dev, skbaddr, len). */
DEFINE_EVENT(net_dev_template, netif_receive_skb,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb)
);

/* netif_rx - instance of the net_dev_template class (dev, skbaddr, len). */
DEFINE_EVENT(net_dev_template, netif_rx,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * net_dev_rx_verbose_template - event class recording a detailed snapshot
 * of an skb.
 *
 * Stores skb->dev->name, napi_id, queue mapping, the skb address, VLAN tag
 * state, protocol, ip_summed, hash/l4_hash, len/data_len/truesize, MAC
 * header information, and the shared-info nr_frags and GSO fields.
 *
 * - napi_id is stored as 0 unless CONFIG_NET_RX_BUSY_POLL is defined.
 * - vlan_proto and protocol are converted with ntohs() before being stored.
 * - mac_header is computed as skb_mac_header(skb) - skb->data whether or
 *   not mac_header_valid is true.
 *   NOTE(review): confirm the value is meaningful when the MAC header was
 *   not set; consumers should presumably check mac_header_valid first.
 * - skb->dev is dereferenced without a check.
 */
DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb),

        TP_STRUCT__entry(
                __string(        name,                        skb->dev->name        )
                __field(        unsigned int,                napi_id                )
                __field(        u16,                        queue_mapping        )
                __field(        const void *,                skbaddr                )
                __field(        bool,                        vlan_tagged        )
                __field(        u16,                        vlan_proto        )
                __field(        u16,                        vlan_tci        )
                __field(        u16,                        protocol        )
                __field(        u8,                        ip_summed        )
                __field(        u32,                        hash                )
                __field(        bool,                        l4_hash                )
                __field(        unsigned int,                len                )
                __field(        unsigned int,                data_len        )
                __field(        unsigned int,                truesize        )
                __field(        bool,                        mac_header_valid)
                __field(        int,                        mac_header        )
                __field(        unsigned char,                nr_frags        )
                __field(        u16,                        gso_size        )
                __field(        u16,                        gso_type        )
        ),

        TP_fast_assign(
                __assign_str(name);
#ifdef CONFIG_NET_RX_BUSY_POLL
                __entry->napi_id = skb->napi_id;
#else
                __entry->napi_id = 0;
#endif
                __entry->queue_mapping = skb->queue_mapping;
                __entry->skbaddr = skb;
                __entry->vlan_tagged = skb_vlan_tag_present(skb);
                __entry->vlan_proto = ntohs(skb->vlan_proto);
                __entry->vlan_tci = skb_vlan_tag_get(skb);
                __entry->protocol = ntohs(skb->protocol);
                __entry->ip_summed = skb->ip_summed;
                __entry->hash = skb->hash;
                __entry->l4_hash = skb->l4_hash;
                __entry->len = skb->len;
                __entry->data_len = skb->data_len;
                __entry->truesize = skb->truesize;
                __entry->mac_header_valid = skb_mac_header_was_set(skb);
                __entry->mac_header = skb_mac_header(skb) - skb->data;
                __entry->nr_frags = skb_shinfo(skb)->nr_frags;
                __entry->gso_size = skb_shinfo(skb)->gso_size;
                __entry->gso_type = skb_shinfo(skb)->gso_type;
        ),

        TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x",
                  __get_str(name), __entry->napi_id, __entry->queue_mapping,
                  __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto,
                  __entry->vlan_tci, __entry->protocol, __entry->ip_summed,
                  __entry->hash, __entry->l4_hash, __entry->len,
                  __entry->data_len, __entry->truesize,
                  __entry->mac_header_valid, __entry->mac_header,
                  __entry->nr_frags, __entry->gso_size, __entry->gso_type)
);

/*
 * Tracepoint napi_gro_frags_entry: verbose skb snapshot using
 * net_dev_rx_verbose_template.
 */
DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_frags_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * Tracepoint napi_gro_receive_entry: verbose skb snapshot using
 * net_dev_rx_verbose_template.
 */
DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_receive_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * Tracepoint netif_receive_skb_entry: verbose skb snapshot using
 * net_dev_rx_verbose_template.
 */
DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * Tracepoint netif_receive_skb_list_entry: verbose skb snapshot using
 * net_dev_rx_verbose_template.
 */
DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * Tracepoint netif_rx_entry: verbose skb snapshot using
 * net_dev_rx_verbose_template.
 */
DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * Event class net_dev_rx_exit_template: records a single integer @ret and
 * prints it as "ret=%d".
 */
DECLARE_EVENT_CLASS(net_dev_rx_exit_template,

        TP_PROTO(int ret),

        TP_ARGS(ret),

        TP_STRUCT__entry(
                __field(int,        ret)
        ),

        TP_fast_assign(
                __entry->ret = ret;
        ),

        TP_printk("ret=%d", __entry->ret)
);

/* Tracepoint napi_gro_frags_exit: records @ret via net_dev_rx_exit_template. */
DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_frags_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

/* Tracepoint napi_gro_receive_exit: records @ret via net_dev_rx_exit_template. */
DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_receive_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

/* Tracepoint netif_receive_skb_exit: records @ret via net_dev_rx_exit_template. */
DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

/* Tracepoint netif_rx_exit: records @ret via net_dev_rx_exit_template. */
DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

/* Tracepoint netif_receive_skb_list_exit: records @ret via net_dev_rx_exit_template. */
DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_list_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

#endif /* _TRACE_NET_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



































































































































































































































    4 










    4 


    2 


    3 



















    5 






    5 





























    3 









    1 
    1 

    3 
















































































    1 



    1 
    1 

    1 
    1 





















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NETFILTER_H
#define __LINUX_NETFILTER_H

#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/net.h>
#include <linux/if.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/wait.h>
#include <linux/list.h>
#include <linux/static_key.h>
#include <linux/module.h>
#include <linux/netfilter_defs.h>
#include <linux/netdevice.h>
#include <linux/sockptr.h>
#include <net/net_namespace.h>

/*
 * Return the negated value held in the bits of @verdict above
 * NF_VERDICT_QBITS.
 */
static inline int NF_DROP_GETERR(int verdict)
{
        int upper = verdict >> NF_VERDICT_QBITS;

        return -upper;
}

/*
 * Free @skb with drop reason @reason and build the verdict to hand back:
 * NF_STOLEN in the low bits with @err placed in the 16 bits above it.
 * @err must be a compile-time value that fits in 16 bits.
 */
static __always_inline int
NF_DROP_REASON(struct sk_buff *skb, enum skb_drop_reason reason, u32 err)
{
        u32 verdict;

        /* reject, at build time, any error code wider than 16 bits */
        BUILD_BUG_ON(err > 0xffff);

        verdict = (err << 16) | NF_STOLEN;
        kfree_skb_reason(skb, reason);

        return verdict;
}

/*
 * Compare two nf_inet_addr unions for equality.
 *
 * Returns 1 when all four 32-bit words of @a1 and @a2 match, 0 otherwise.
 * On 64-bit builds with efficient unaligned access the comparison is done
 * on two unsigned long words instead.
 */
static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1,
                                   const union nf_inet_addr *a2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *lhs = (const unsigned long *)a1;
        const unsigned long *rhs = (const unsigned long *)a2;
        unsigned long diff = lhs[0] ^ rhs[0];

        /* any differing bit in either half leaves diff non-zero */
        diff |= lhs[1] ^ rhs[1];

        return diff == 0UL;
#else
        unsigned int i;

        for (i = 0; i < 4; i++) {
                if (a1->all[i] != a2->all[i])
                        return 0;
        }

        return 1;
#endif
}

/*
 * Store the bitwise AND of address @a1 and @mask into @result, word by
 * word.  On 64-bit builds with efficient unaligned access two unsigned
 * long words are processed; otherwise the four 32-bit words are.
 */
static inline void nf_inet_addr_mask(const union nf_inet_addr *a1,
                                     union nf_inet_addr *result,
                                     const union nf_inet_addr *mask)
{
        unsigned int i;

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *src = (const unsigned long *)a1;
        const unsigned long *bits = (const unsigned long *)mask;
        unsigned long *dst = (unsigned long *)result;

        for (i = 0; i < 2; i++)
                dst[i] = src[i] & bits[i];
#else
        for (i = 0; i < 4; i++)
                result->all[i] = a1->all[i] & mask->all[i];
#endif
}

/*
 * Defined outside this header.
 * NOTE(review): presumably the netfilter core initialisation routine
 * returning 0 or a negative errno - confirm against the definition.
 */
int netfilter_init(void);

struct sk_buff;

struct nf_hook_ops;

struct sock;

/*
 * Per-invocation context handed to hook functions; every field is filled in
 * by nf_hook_state_init().
 */
struct nf_hook_state {
        /* hook number the packet is traversing */
        u8 hook;
        /* protocol family (NFPROTO_*) */
        u8 pf;
        /* input and output devices supplied by the caller */
        struct net_device *in;
        struct net_device *out;
        struct sock *sk;
        struct net *net;
        /* continuation supplied by the caller of the hook */
        int (*okfn)(struct net *, struct sock *, struct sk_buff *);
};

/*
 * Signature of a netfilter hook function: receives the private pointer
 * registered with the hook, the packet, and the hook state; returns an
 * unsigned verdict.
 */
typedef unsigned int nf_hookfn(void *priv,
                               struct sk_buff *skb,
                               const struct nf_hook_state *state);
/*
 * Tag stored in nf_hook_ops::hook_ops_type.
 * NOTE(review): presumably identifies which subsystem owns the hook
 * (nf_tables, BPF, or unspecified) - confirm against the users.
 */
enum nf_hook_ops_type {
        NF_HOOK_OP_UNDEFINED,
        NF_HOOK_OP_NF_TABLES,
        NF_HOOK_OP_BPF,
};

/*
 * Description of one hook to register: the function to call, its private
 * data, and where (family, hook number, priority) it is attached.
 */
struct nf_hook_ops {
        /* User fills in from here down. */
        nf_hookfn                *hook;
        struct net_device        *dev;
        void                        *priv;
        u8                        pf;
        /* enum value stored in an 8-bit bitfield */
        enum nf_hook_ops_type        hook_ops_type:8;
        unsigned int                hooknum;
        /* Hooks are ordered in ascending priority. */
        int                        priority;
};

/* One hook function together with the private pointer passed to it. */
struct nf_hook_entry {
        nf_hookfn                        *hook;
        void                                *priv;
};

/*
 * RCU head plus an allocation pointer; per the trailer description in
 * struct nf_hook_entries this sits at the end of a hook-entries blob and is
 * used when freeing it via call_rcu.
 */
struct nf_hook_entries_rcu_head {
        struct rcu_head head;
        void        *allocation;
};

/*
 * Variable-length array of hook entries: a count followed by that many
 * nf_hook_entry elements, followed by the trailer described below.
 */
struct nf_hook_entries {
        u16                                num_hook_entries;
        /* padding */
        struct nf_hook_entry                hooks[];

        /* trailer: pointers to original orig_ops of each hook,
         * followed by rcu_head and scratch space used for freeing
         * the structure via call_rcu.
         *
         *   This is not part of struct nf_hook_entry since it's only
         *   needed in slow path (hook register/unregister):
         * const struct nf_hook_ops     *orig_ops[]
         *
         *   For the same reason, we store this at end -- it's
         *   only needed when a hook is deleted, not during
         *   packet path processing:
         * struct nf_hook_entries_rcu_head     head
         */
};

#ifdef CONFIG_NETFILTER
/*
 * Return the address immediately after the last element of @e->hooks[],
 * interpreted as an array of nf_hook_ops pointers (the orig_ops trailer
 * described in struct nf_hook_entries).
 */
static inline struct nf_hook_ops **nf_hook_entries_get_hook_ops(const struct nf_hook_entries *e)
{
        /* deliberately one element *past* the end of ->hooks[] */
        const void *trailer = &e->hooks[e->num_hook_entries];

        return (struct nf_hook_ops **)trailer;
}

static inline int
nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb,
                     struct nf_hook_state *state)
{
        return entry->hook(entry->priv, skb, state);
}

/*
 * Fill every field of the hook state @p from the given arguments.
 * @hook and @pf are stored into u8 fields.
 */
static inline void nf_hook_state_init(struct nf_hook_state *p,
                                      unsigned int hook,
                                      u_int8_t pf,
                                      struct net_device *indev,
                                      struct net_device *outdev,
                                      struct sock *sk,
                                      struct net *net,
                                      int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        /* where the packet is: family and hook number */
        p->pf = pf;
        p->hook = hook;

        /* namespace, socket and devices involved */
        p->net = net;
        p->sk = sk;
        p->in = indev;
        p->out = outdev;

        /* continuation supplied by the caller */
        p->okfn = okfn;
}



/*
 * Registration record for a range of get/setsockopt option numbers handled
 * for protocol family @pf; registered with nf_register_sockopt().
 */
struct nf_sockopt_ops {
        struct list_head list;

        u_int8_t pf;

        /* Non-inclusive ranges: use 0/0/NULL to never get called. */
        int set_optmin;
        int set_optmax;
        /* handler for the set range; the option value arrives as a sockptr_t */
        int (*set)(struct sock *sk, int optval, sockptr_t arg,
                   unsigned int len);
        int get_optmin;
        int get_optmax;
        /* handler for the get range; writes through a user pointer */
        int (*get)(struct sock *sk, int optval, void __user *user, int *len);
        /* Use the module struct to lock set/get code in place */
        struct module *owner;
};

/*
 * Functions to register/unregister hook points in a network namespace.
 * The plural variants take an array @reg of @n hook descriptions.
 */
int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops);
void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops);
int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
                          unsigned int n);
void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
                             unsigned int n);

/* Functions to register get/setsockopt ranges (non-inclusive).  You
   need to check permissions yourself! */
int nf_register_sockopt(struct nf_sockopt_ops *reg);
void nf_unregister_sockopt(struct nf_sockopt_ops *reg);

/*
 * Per-family, per-hook static keys; nf_hook() and NF_HOOK_LIST() test them
 * to return early when the family and hook are compile-time constants.
 */
#ifdef CONFIG_JUMP_LABEL
extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
#endif

/*
 * Out-of-line hook traversal; nf_hook() calls it with @i == 0 and returns
 * its result.
 * NOTE(review): @i is presumably the index of the first entry to run -
 * confirm against the definition.
 */
int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
                 const struct nf_hook_entries *e, unsigned int i);

/* List counterpart used by NF_HOOK_LIST(); operates on the skbs in @head. */
void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state,
                       const struct nf_hook_entries *e);
/**
 * nf_hook - call a netfilter hook
 * @pf: protocol family (NFPROTO_*) selecting the per-net hook table
 * @hook: hook number within that family
 * @net: network namespace whose hook tables are consulted
 * @sk: socket stored in the hook state
 * @skb: packet handed to the hooks
 * @indev: input device stored in the hook state
 * @outdev: output device stored in the hook state
 * @okfn: continuation stored in the hook state
 *
 * Looks up the hook entries for @pf/@hook under rcu_read_lock() and, when
 * any are present, runs them through nf_hook_slow().
 *
 * Returns 1 if the hook has allowed the packet to pass.  The function
 * okfn must be invoked by the caller in this case.  Any other return
 * value indicates the packet has been consumed by the hook.
 */
static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
                          struct sock *sk, struct sk_buff *skb,
                          struct net_device *indev, struct net_device *outdev,
                          int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        struct nf_hook_entries *entries = NULL;
        struct nf_hook_state state;
        int verdict = 1;

#ifdef CONFIG_JUMP_LABEL
        /* constant family/hook: bail out early while the static key is off */
        if (__builtin_constant_p(pf) && __builtin_constant_p(hook)) {
                if (!static_key_false(&nf_hooks_needed[pf][hook]))
                        return 1;
        }
#endif

        rcu_read_lock();

        switch (pf) {
        case NFPROTO_IPV4:
                entries = rcu_dereference(net->nf.hooks_ipv4[hook]);
                break;
        case NFPROTO_IPV6:
                entries = rcu_dereference(net->nf.hooks_ipv6[hook]);
                break;
        case NFPROTO_ARP:
#ifdef CONFIG_NETFILTER_FAMILY_ARP
                /* warn once and leave entries NULL on an out-of-range hook */
                if (!WARN_ON_ONCE(hook >= ARRAY_SIZE(net->nf.hooks_arp)))
                        entries = rcu_dereference(net->nf.hooks_arp[hook]);
#endif
                break;
        case NFPROTO_BRIDGE:
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
                entries = rcu_dereference(net->nf.hooks_bridge[hook]);
#endif
                break;
        default:
                WARN_ON_ONCE(1);
                break;
        }

        /* no entries registered: keep the default "pass" verdict */
        if (entries) {
                nf_hook_state_init(&state, hook, pf, indev, outdev,
                                   sk, net, okfn);
                verdict = nf_hook_slow(skb, &state, entries, 0);
        }

        rcu_read_unlock();

        return verdict;
}

/* Activate hook; either okfn or kfree_skb called, unless a hook
   returns NF_STOLEN (in which case, it's up to the hook to deal with
   the consequences).

   Returns -ERRNO if packet dropped.  Zero means queued, stolen or
   accepted.
*/

/* RR:
   > I don't want nf_hook to return anything because people might forget
   > about async and trust the return value to mean "packet was ok".

   AK:
   Just document it clearly, then you can expect some sense from kernel
   coders :)
*/

/*
 * Conditionally run the netfilter hook, then @okfn.
 *
 * When @cond is false the hook is skipped and @okfn is called directly.
 * When @cond is true, @okfn is called only if nf_hook() returned 1;
 * otherwise nf_hook()'s return value is passed back unchanged.
 */
static inline int
NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
             struct sk_buff *skb, struct net_device *in, struct net_device *out,
             int (*okfn)(struct net *, struct sock *, struct sk_buff *),
             bool cond)
{
        if (cond) {
                int verdict = nf_hook(pf, hook, net, sk, skb, in, out, okfn);

                if (verdict != 1)
                        return verdict;
        }

        return okfn(net, sk, skb);
}

/*
 * Run the netfilter hook and, if it returned 1, continue with @okfn and
 * return its result.  Any other nf_hook() return value is passed back
 * unchanged.
 */
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb,
        struct net_device *in, struct net_device *out,
        int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        int verdict = nf_hook(pf, hook, net, sk, skb, in, out, okfn);

        if (verdict != 1)
                return verdict;

        return okfn(net, sk, skb);
}

/*
 * Run the netfilter hooks for @pf/@hook over the list @head.
 *
 * Only NFPROTO_IPV4 and NFPROTO_IPV6 are handled; any other family
 * triggers a one-time warning and nothing is run.  The lookup and the call
 * to nf_hook_slow_list() happen under rcu_read_lock().
 */
static inline void
NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
             struct list_head *head, struct net_device *in, struct net_device *out,
             int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        struct nf_hook_entries *entries = NULL;
        struct nf_hook_state state;

#ifdef CONFIG_JUMP_LABEL
        /* constant family/hook: bail out early while the static key is off */
        if (__builtin_constant_p(pf) && __builtin_constant_p(hook)) {
                if (!static_key_false(&nf_hooks_needed[pf][hook]))
                        return;
        }
#endif

        rcu_read_lock();

        if (pf == NFPROTO_IPV4)
                entries = rcu_dereference(net->nf.hooks_ipv4[hook]);
        else if (pf == NFPROTO_IPV6)
                entries = rcu_dereference(net->nf.hooks_ipv6[hook]);
        else
                WARN_ON_ONCE(1);

        if (entries) {
                nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn);
                nf_hook_slow_list(head, &state, entries);
        }

        rcu_read_unlock();
}

/*
 * Entry points for netfilter set/getsockopt handling for family @pf.
 * NOTE(review): presumably these dispatch to the handlers registered via
 * nf_register_sockopt() - confirm against the definitions.
 */
int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, sockptr_t opt,
                  unsigned int len);
int nf_getsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt,
                  int *len);

struct flowi;
struct nf_queue_entry;

/* Checksum helpers parameterised by hook, data offset, protocol and family. */
__sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
                    unsigned int dataoff, u_int8_t protocol,
                    unsigned short family);

/* As nf_checksum(), with an additional explicit @len argument. */
__sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
                            unsigned int dataoff, unsigned int len,
                            u_int8_t protocol, unsigned short family);
/* Route lookup for @fl in @net; the resulting dst is returned through @dst. */
int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
             bool strict, unsigned short family);

#include <net/flow.h>

struct nf_conn;
enum nf_nat_manip_type;
struct nlattr;
enum ip_conntrack_dir;

/*
 * Table of NAT callbacks published through the RCU-protected nf_nat_hook
 * pointer below.  Readers in this header (nf_nat_decode_session()) check
 * both the table pointer and the individual callback for NULL before use.
 */
struct nf_nat_hook {
        int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip,
                               const struct nlattr *attr);
        void (*decode_session)(struct sk_buff *skb, struct flowi *fl);
        unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct,
                                  enum nf_nat_manip_type mtype,
                                  enum ip_conntrack_dir dir);
        void (*remove_nat_bysrc)(struct nf_conn *ct);
};

/* Read with rcu_dereference() under rcu_read_lock(). */
extern const struct nf_nat_hook __rcu *nf_nat_hook;

/*
 * Let the NAT hook, if one is installed and provides decode_session(),
 * process @skb/@fl.  Compiles to an empty function unless CONFIG_NF_NAT is
 * enabled.  @family is not used in this implementation.
 */
static inline void
nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
{
#if IS_ENABLED(CONFIG_NF_NAT)
        const struct nf_nat_hook *ops;

        rcu_read_lock();
        ops = rcu_dereference(nf_nat_hook);
        if (ops) {
                /* the callback itself is optional */
                if (ops->decode_session)
                        ops->decode_session(skb, fl);
        }
        rcu_read_unlock();
#endif
}

#else /* !CONFIG_NETFILTER */
/*
 * !CONFIG_NETFILTER variant: there are no hooks, so @cond and the other
 * hook arguments are ignored and @okfn is called directly.
 */
static inline int
NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
             struct sk_buff *skb, struct net_device *in, struct net_device *out,
             int (*okfn)(struct net *, struct sock *, struct sk_buff *),
             bool cond)
{
        int ret = okfn(net, sk, skb);

        return ret;
}

/*
 * !CONFIG_NETFILTER variant: there are no hooks, so @okfn is called
 * directly and its result returned.
 */
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
        struct sk_buff *skb, struct net_device *in, struct net_device *out,
        int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        int ret = okfn(net, sk, skb);

        return ret;
}

/*
 * !CONFIG_NETFILTER variant: no hooks exist, so the list in @head is left
 * untouched and @okfn is not called here.
 */
static inline void
NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
             struct list_head *head, struct net_device *in, struct net_device *out,
             int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        /* nothing to do */
}

/*
 * !CONFIG_NETFILTER variant: always returns 1, the same value the
 * CONFIG_NETFILTER implementation returns when the packet may pass.
 */
static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
                          struct sock *sk, struct sk_buff *skb,
                          struct net_device *indev, struct net_device *outdev,
                          int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
        return 1;
}
struct flowi;
/* !CONFIG_NETFILTER variant: intentionally a no-op. */
static inline void
nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
{
}
#endif /*CONFIG_NETFILTER*/

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <linux/netfilter/nf_conntrack_zones_common.h>

/*
 * Conntrack helpers available when CONFIG_NF_CONNTRACK is enabled; the
 * #else branch below provides no-op stand-ins with the same names.
 */
void nf_ct_attach(struct sk_buff *, const struct sk_buff *);
void nf_ct_set_closing(struct nf_conntrack *nfct);
struct nf_conntrack_tuple;
/* Returns bool; the tuple is written through @dst_tuple. */
bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
                         const struct sk_buff *skb);
#else
/*
 * No-op stand-in used when conntrack is not enabled.  @skb is
 * const-qualified to match the CONFIG_NF_CONNTRACK prototype of
 * nf_ct_attach(), so callers holding a const skb build cleanly in both
 * configurations.
 */
static inline void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb) {}
/* No-op stand-in used when conntrack is not enabled. */
static inline void nf_ct_set_closing(struct nf_conntrack *nfct) {}
struct nf_conntrack_tuple;
/*
 * Stand-in used when conntrack is not enabled: always returns false and
 * leaves @dst_tuple untouched.
 */
static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
                                       const struct sk_buff *skb)
{
        return false;
}
#endif

struct nf_conn;
enum ip_conntrack_info;

/*
 * Table of conntrack callbacks published through the __rcu pointer
 * nf_ct_hook declared below.
 * NOTE(review): several members mirror the nf_ct_* helpers declared above
 * (attach, set_closing, get_tuple_skb); presumably those helpers forward
 * to this table - confirm against the definitions.
 */
struct nf_ct_hook {
        int (*update)(struct net *net, struct sk_buff *skb);
        void (*destroy)(struct nf_conntrack *);
        bool (*get_tuple_skb)(struct nf_conntrack_tuple *,
                              const struct sk_buff *);
        void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb);
        void (*set_closing)(struct nf_conntrack *nfct);
        int (*confirm)(struct sk_buff *skb);
};
extern const struct nf_ct_hook __rcu *nf_ct_hook;

struct nlattr;

/*
 * Table of callbacks operating on a struct nf_conn with netlink attributes
 * and skbs, published through the __rcu pointer nfnl_ct_hook declared
 * below.
 */
struct nfnl_ct_hook {
        size_t (*build_size)(const struct nf_conn *ct);
        int (*build)(struct sk_buff *skb, struct nf_conn *ct,
                     enum ip_conntrack_info ctinfo,
                     u_int16_t ct_attr, u_int16_t ct_info_attr);
        int (*parse)(const struct nlattr *attr, struct nf_conn *ct);
        int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct,
                             u32 portid, u32 report);
        void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct,
                           enum ip_conntrack_info ctinfo, s32 off);
};
extern const struct nfnl_ct_hook __rcu *nfnl_ct_hook;

/*
 * Per-namespace enable/disable callbacks plus the owning module; separate
 * __rcu pointers are declared below for the v4 and v6 instances.
 */
struct nf_defrag_hook {
        struct module *owner;
        int (*enable)(struct net *net);
        void (*disable)(struct net *net);
};

extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook;
extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook;

/*
 * nf_skb_duplicated - TEE target has sent a packet
 *
 * When a xtables target sends a packet, the OUTPUT and POSTROUTING
 * hooks are traversed again, i.e. nft and xtables are invoked recursively.
 *
 * This is used by xtables TEE target to prevent the duplicated skb from
 * being duplicated again.
 */
DECLARE_PER_CPU(bool, nf_skb_duplicated);

/*
 * Contains bitmask of ctnetlink event subscribers, if any.
 * Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag.
 */
extern u8 nf_ctnetlink_has_listener;
#endif /*__LINUX_NETFILTER_H*/









































    2 


















































































































































































































































































    2 














    2 

    2 











    2 
















    2 



    2 





    2 














    2 
    2 





















































    1 


















    1 


    1 



























































    2 

    2 


























    1 



    1 

























































    2 















    2 




    2 













    2 













    2 




















    2 


    2 














    2 



































































    2 





















    2 









    2 














    1 
    1 




















    2 




    2 







    2 
    2 



    1 
    2 




































































































































































































    2 















    2 








































    1 






























    1 

















    1 



    1 







    1 

















    1 








    1 




    1 



    1 













    1 










    1 







    1 




    1 









































































    2 



    2 





















    1 


    2 



    2 


    2 





    1 


    1 



    2 
    2 



    2 


    2 
    2 


















    1 






    2 












































































































































    1 




    1 


















    1 










    1 































































































    1 













    1 



































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/kernfs/dir.c - kernfs directory implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 */

#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/hash.h>

#include "kernfs-internal.h"

static DEFINE_RWLOCK(kernfs_rename_lock);        /* protects kn->parent and kn->name */
/*
 * kernfs_pr_cont_buf gets its own lock instead of piggybacking on
 * kernfs_rename_lock: pr_cont() must not be called with rename_lock held.
 * pr_cont() can perform wakeups when it releases console_sem, and if the
 * scheduler reads a kernfs name in that wakeup path while rename_lock is
 * held, the result is a deadlock.
 */
static DEFINE_SPINLOCK(kernfs_pr_cont_lock);
static char kernfs_pr_cont_buf[PATH_MAX];        /* protected by kernfs_pr_cont_lock */
static DEFINE_SPINLOCK(kernfs_idr_lock);        /* protects root->ino_idr */

/* Map an rb_node embedded in a kernfs_node (member ->rb) back to that node. */
#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)

/* Lockless test: @kn counts as active while kn->active is not negative. */
static bool __kernfs_active(struct kernfs_node *kn)
{
        int active = atomic_read(&kn->active);

        return active >= 0;
}

static bool kernfs_active(struct kernfs_node *kn)
{
        lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem);
        return __kernfs_active(kn);
}

/*
 * Report whether @kn carries the KERNFS_LOCKDEP flag.  Always false when
 * CONFIG_DEBUG_LOCK_ALLOC is not set.
 */
static bool kernfs_lockdep(struct kernfs_node *kn)
{
        bool tracked = false;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        tracked = kn->flags & KERNFS_LOCKDEP;
#endif
        return tracked;
}

/*
 * Copy the name of @kn into @buf with strscpy() and return its result.
 * A %NULL @kn yields "(null)"; a node without a parent yields "/".
 */
static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
{
        const char *src;

        if (!kn)
                src = "(null)";
        else if (kn->parent)
                src = kn->name;
        else
                src = "/";

        return strscpy(buf, src, buflen);
}

/*
 * kernfs_depth - count the parent links walked from @to until reaching
 * @from or a node that has no parent
 */
static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to)
{
        struct kernfs_node *pos;
        size_t hops = 0;

        for (pos = to; pos->parent && pos != from; pos = pos->parent)
                hops++;

        return hops;
}

/*
 * Find the deepest node that is an ancestor of (or equal to) both @a and @b.
 * Returns %NULL when the two nodes belong to different roots.
 */
static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
                                                  struct kernfs_node *b)
{
        struct kernfs_root *root_a = kernfs_root(a);
        struct kernfs_root *root_b = kernfs_root(b);
        size_t depth_a, depth_b;

        if (root_a != root_b)
                return NULL;

        depth_a = kernfs_depth(root_a->kn, a);
        depth_b = kernfs_depth(root_b->kn, b);

        /* lift the deeper node until both sit at the same depth */
        for (; depth_a > depth_b; depth_a--)
                a = a->parent;
        for (; depth_b > depth_a; depth_b--)
                b = b->parent;

        /* climb in lockstep; at worst the two meet at the root */
        while (a != b) {
                a = a->parent;
                b = b->parent;
        }

        return a;
}

/**
 * kernfs_path_from_node_locked - build the path of @kn_to as seen from @kn_from
 * @kn_to: kernfs node whose path is wanted
 * @kn_from: kernfs node treated as the root of the path; %NULL selects the
 *           root node of @kn_to's hierarchy
 * @buf: buffer to copy the path into
 * @buflen: size of @buf
 *
 * The result is a pseudo-absolute path.  Cases handled:
 *
 * [1] @kn_from is an ancestor of @kn_to
 * kn_from: /n1/n2/n3
 * kn_to:   /n1/n2/n3/n4/n5
 * result:  /n4/n5
 *
 * [2] @kn_from is not an ancestor of @kn_to; one "/.." is emitted for each
 * level between @kn_from and the common ancestor of the two nodes
 * kn_from: /n1/n2/n3/n4
 * kn_to:   /n1/n2/n5
 * result:  /../../n5
 * OR
 * kn_from: /n1/n2/n3/n4/n5   [depth=5]
 * kn_to:   /n1/n2/n3         [depth=3]
 * result:  /../..
 *
 * [3] @kn_to is %NULL: the result is "(null)"
 *
 * [4] @kn_from and @kn_to are the same node: the result is "/"
 *
 * Return: the length of the constructed path.  Node names are appended with
 * scnprintf(), so names that do not fit are truncated and @buf stays
 * nul-terminated.  A negative value is returned when strscpy() fails while
 * copying "(null)", "/" or a "/.." component, and -EINVAL when the two nodes
 * have no common ancestor.
 */
static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
                                        struct kernfs_node *kn_from,
                                        char *buf, size_t buflen)
{
        const char parent_str[] = "/..";
        struct kernfs_node *common, *kn;
        size_t depth_from, depth_to, len = 0;
        ssize_t copied;
        int i, j;

        if (!kn_to)
                return strscpy(buf, "(null)", buflen);

        if (!kn_from)
                kn_from = kernfs_root(kn_to)->kn;

        if (kn_from == kn_to)
                return strscpy(buf, "/", buflen);

        common = kernfs_common_ancestor(kn_from, kn_to);
        if (WARN_ON(!common))
                return -EINVAL;

        depth_to = kernfs_depth(common, kn_to);
        depth_from = kernfs_depth(common, kn_from);

        buf[0] = '\0';

        /* one "/.." for every level from @kn_from up to the common ancestor */
        i = 0;
        while (i < depth_from) {
                copied = strscpy(buf + len, parent_str, buflen - len);
                if (copied < 0)
                        return copied;
                len += copied;
                i++;
        }

        /*
         * Then the names from just below the common ancestor down to
         * @kn_to: for each level, walk up @i parents from @kn_to to find
         * the node to print.
         */
        i = depth_to - 1;
        while (i >= 0) {
                kn = kn_to;
                for (j = i; j > 0; j--)
                        kn = kn->parent;

                len += scnprintf(buf + len, buflen - len, "/%s", kn->name);
                i--;
        }

        return len;
}

/**
 * kernfs_name - obtain the name of a given node
 * @kn: kernfs_node of interest
 * @buf: buffer to copy @kn's name into
 * @buflen: size of @buf
 *
 * Copies the name of @kn into @buf while holding kernfs_rename_lock for
 * reading with interrupts disabled.  The copy behaves like strscpy().
 *
 * @buf receives "(null)" if @kn is %NULL.
 *
 * Return: the resulting length of @buf. If @buf isn't long enough,
 * it's filled up to @buflen-1 and nul terminated, and returns -E2BIG.
 *
 * This function can be called from any context.
 */
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
{
        unsigned long irqflags;
        int len;

        read_lock_irqsave(&kernfs_rename_lock, irqflags);
        len = kernfs_name_locked(kn, buf, buflen);
        read_unlock_irqrestore(&kernfs_rename_lock, irqflags);

        return len;
}

/**
 * kernfs_path_from_node - build path of node @to relative to @from.
 * @to: kernfs_node of interest
 * @from: kernfs_node relative to which the path is built
 * @buf: buffer to copy @to's path into
 * @buflen: size of @buf
 *
 * Locked wrapper around kernfs_path_from_node_locked(): the path is built
 * with kernfs_rename_lock held for reading and interrupts disabled.  @from
 * and @to must be on the same kernfs-root.  If @from is not an ancestor of
 * @to, the path contains as many '..' components as are needed to get from
 * @from to @to.
 *
 * Return: whatever kernfs_path_from_node_locked() returns - the length of
 * the constructed path, or -errno on error.
 */
int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
                          char *buf, size_t buflen)
{
        unsigned long irqflags;
        int len;

        read_lock_irqsave(&kernfs_rename_lock, irqflags);
        len = kernfs_path_from_node_locked(to, from, buf, buflen);
        read_unlock_irqrestore(&kernfs_rename_lock, irqflags);

        return len;
}
EXPORT_SYMBOL_GPL(kernfs_path_from_node);

/**
 * pr_cont_kernfs_name - pr_cont name of a kernfs_node
 * @kn: kernfs_node of interest
 *
 * Formats the name into the shared kernfs_pr_cont_buf under
 * kernfs_pr_cont_lock and prints it with pr_cont().
 *
 * This function can be called from any context.
 */
void pr_cont_kernfs_name(struct kernfs_node *kn)
{
        unsigned long irqflags;

        spin_lock_irqsave(&kernfs_pr_cont_lock, irqflags);

        /* the return value is ignored; the buffer is printed as filled */
        kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
        pr_cont("%s", kernfs_pr_cont_buf);

        spin_unlock_irqrestore(&kernfs_pr_cont_lock, irqflags);
}

/**
 * pr_cont_kernfs_path - pr_cont path of a kernfs_node
 * @kn: kernfs_node of interest
 *
 * Builds the path of @kn from its hierarchy root in the shared
 * kernfs_pr_cont_buf under kernfs_pr_cont_lock and prints it with
 * pr_cont().  If building the path fails, "(name too long)" is printed for
 * -E2BIG and "(error)" for any other error.
 *
 * This function can be called from any context.
 */
void pr_cont_kernfs_path(struct kernfs_node *kn)
{
        unsigned long irqflags;
        int sz;

        spin_lock_irqsave(&kernfs_pr_cont_lock, irqflags);

        sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf,
                                   sizeof(kernfs_pr_cont_buf));
        if (sz >= 0)
                pr_cont("%s", kernfs_pr_cont_buf);
        else if (sz == -E2BIG)
                pr_cont("(name too long)");
        else
                pr_cont("(error)");

        spin_unlock_irqrestore(&kernfs_pr_cont_lock, irqflags);
}

/**
 * kernfs_get_parent - determine the parent node and pin it
 * @kn: kernfs_node of interest
 *
 * Reads @kn->parent under kernfs_rename_lock and takes a reference on it
 * with kernfs_get() before the lock is dropped.  This function can be
 * called from any context.
 *
 * Return: parent node of @kn
 */
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
{
        struct kernfs_node *pinned;
        unsigned long irqflags;

        read_lock_irqsave(&kernfs_rename_lock, irqflags);
        pinned = kn->parent;
        kernfs_get(pinned);
        read_unlock_irqrestore(&kernfs_rename_lock, irqflags);

        return pinned;
}

/**
 *        kernfs_name_hash - calculate hash of @ns + @name
 *        @name: Null terminated string to hash
 *        @ns:   Namespace tag to hash
 *
 *        The hash is seeded with @ns, fed every byte of @name and masked
 *        to 31 bits.  The values 0, 1 and INT_MAX are never returned.
 *
 *        Return: 31-bit hash of ns + name (so it fits in an off_t)
 */
static unsigned int kernfs_name_hash(const char *name, const void *ns)
{
        unsigned long hash = init_name_hash(ns);
        const char *p;

        for (p = name; *p; p++)
                hash = partial_name_hash(*p, hash);

        hash = end_name_hash(hash) & 0x7fffffffU;

        /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
        if (hash < 2)
                hash += 2;
        else if (hash >= INT_MAX)
                hash = INT_MAX - 1;

        return hash;
}

/*
 * Three-way comparison of the key (@hash, @ns, @name) against @kn.
 * Keys are ordered by hash first, then by namespace pointer value, then by
 * strcmp() of the names.  Returns <0, 0 or >0.
 */
static int kernfs_name_compare(unsigned int hash, const char *name,
                               const void *ns, const struct kernfs_node *kn)
{
        if (hash != kn->hash)
                return hash < kn->hash ? -1 : 1;

        if (ns != kn->ns)
                return ns < kn->ns ? -1 : 1;

        return strcmp(name, kn->name);
}

/* Compare two nodes by using @left's hash, name and ns as the key. */
static int kernfs_sd_compare(const struct kernfs_node *left,
                             const struct kernfs_node *right)
{
        unsigned int hash = left->hash;
        const char *name = left->name;
        const void *ns = left->ns;

        return kernfs_name_compare(hash, name, ns, right);
}

/**
 *        kernfs_link_sibling - link kernfs_node into sibling rbtree
 *        @kn: kernfs_node of interest
 *
 *        Link @kn into its sibling rbtree which starts from
 *        @kn->parent->dir.children.
 *
 *        Locking:
 *        kernfs_rwsem held exclusive
 *
 *        Return:
 *        %0 on success, -EEXIST on failure.
 */
static int kernfs_link_sibling(struct kernfs_node *kn)
{
        struct rb_node **node = &kn->parent->dir.children.rb_node;
        struct rb_node *parent = NULL;

        while (*node) {
                struct kernfs_node *pos;
                int result;

                pos = rb_to_kn(*node);
                parent = *node;
                result = kernfs_sd_compare(kn, pos);
                if (result < 0)
                        node = &pos->rb.rb_left;
                else if (result > 0)
                        node = &pos->rb.rb_right;
                else
                        return -EEXIST;
        }

        /* add new node and rebalance the tree */
        rb_link_node(&kn->rb, parent, node);
        rb_insert_color(&kn->rb, &kn->parent->dir.children);

        /* successfully added, account subdir number */
        down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
        if (kernfs_type(kn) == KERNFS_DIR)
                kn->parent->dir.subdirs++;
        kernfs_inc_rev(kn->parent);
        up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);

        return 0;
}

/**
 *        kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
 *        @kn: kernfs_node of interest
 *
 *        Try to unlink @kn from its sibling rbtree which starts from
 *        kn->parent->dir.children.
 *
 *        Return: %true if @kn was actually removed,
 *        %false if @kn wasn't on the rbtree.
 *
 *        Locking:
 *        kernfs_rwsem held exclusive
 */
static bool kernfs_unlink_sibling(struct kernfs_node *kn)
{
        if (RB_EMPTY_NODE(&kn->rb))
                return false;

        down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
        if (kernfs_type(kn) == KERNFS_DIR)
                kn->parent->dir.subdirs--;
        kernfs_inc_rev(kn->parent);
        up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);

        rb_erase(&kn->rb, &kn->parent->dir.children);
        RB_CLEAR_NODE(&kn->rb);
        return true;
}

/**
 *        kernfs_get_active - get an active reference to kernfs_node
 *        @kn: kernfs_node to get an active reference to
 *
 *        Increment @kn->active unless it is negative.  This function is
 *        noop if @kn is %NULL.
 *
 *        Return:
 *        Pointer to @kn on success, %NULL on failure.
 */
struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
{
        if (unlikely(!kn) || !atomic_inc_unless_negative(&kn->active))
                return NULL;

        /* lockdep only: record the active reference as a read acquisition */
        if (kernfs_lockdep(kn))
                rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);

        return kn;
}

/**
 *        kernfs_put_active - put an active reference to kernfs_node
 *        @kn: kernfs_node to put an active reference to
 *
 *        Decrement @kn->active.  If the count drops to
 *        KN_DEACTIVATED_BIAS, everyone sleeping on the root's
 *        deactivate_waitq is woken.  This function is noop if @kn
 *        is %NULL.
 */
void kernfs_put_active(struct kernfs_node *kn)
{
        if (unlikely(!kn))
                return;

        if (kernfs_lockdep(kn))
                rwsem_release(&kn->dep_map, _RET_IP_);

        if (unlikely(atomic_dec_return(&kn->active) == KN_DEACTIVATED_BIAS))
                wake_up_all(&kernfs_root(kn)->deactivate_waitq);
}

/**
 * kernfs_drain - drain kernfs_node
 * @kn: kernfs_node to drain
 *
 * Drain existing usages and nuke all existing mmaps of @kn.  Multiple
 * removers may invoke this function concurrently on @kn and all will
 * return after draining is complete.
 *
 * Called with the root's kernfs_rwsem held for write and with @kn already
 * deactivated.  Unless the early-exit below is taken, the rwsem is dropped
 * for the duration of the wait and re-acquired before returning.
 */
static void kernfs_drain(struct kernfs_node *kn)
        __releases(&kernfs_root(kn)->kernfs_rwsem)
        __acquires(&kernfs_root(kn)->kernfs_rwsem)
{
        struct kernfs_root *root = kernfs_root(kn);

        lockdep_assert_held_write(&root->kernfs_rwsem);
        /* draining a node that is still active is a caller bug */
        WARN_ON_ONCE(kernfs_active(kn));

        /*
         * Skip draining if already fully drained. This avoids draining and its
         * lockdep annotations for nodes which have never been activated
         * allowing embedding kernfs_remove() in create error paths without
         * worrying about draining.
         */
        if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS &&
            !kernfs_should_drain_open_files(kn))
                return;

        /* do not hold kernfs_rwsem across the wait below */
        up_write(&root->kernfs_rwsem);

        /*
         * lockdep only: model the wait as acquiring @kn's dep_map, and
         * report contention if active references are still outstanding.
         */
        if (kernfs_lockdep(kn)) {
                rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
                if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS)
                        lock_contended(&kn->dep_map, _RET_IP_);
        }

        /*
         * Sleep until the active count is back at KN_DEACTIVATED_BIAS;
         * kernfs_put_active() wakes this queue when that happens.
         */
        wait_event(root->deactivate_waitq,
                   atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);

        if (kernfs_lockdep(kn)) {
                lock_acquired(&kn->dep_map, _RET_IP_);
                rwsem_release(&kn->dep_map, _RET_IP_);
        }

        /* open files are drained after the wait, still without kernfs_rwsem */
        if (kernfs_should_drain_open_files(kn))
                kernfs_drain_open_files(kn);

        /* restore the locking state the caller expects */
        down_write(&root->kernfs_rwsem);
}

/**
 * kernfs_get - get a reference count on a kernfs_node
 * @kn: the target kernfs_node
 *
 * Does nothing when @kn is %NULL.  Warns if the count is already zero
 * before incrementing it.
 */
void kernfs_get(struct kernfs_node *kn)
{
        if (!kn)
                return;

        WARN_ON(!atomic_read(&kn->count));
        atomic_inc(&kn->count);
}
EXPORT_SYMBOL_GPL(kernfs_get);

static void kernfs_free_rcu(struct rcu_head *rcu)
{
        struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu);

        kfree_const(kn->name);

        if (kn->iattr) {
                simple_xattrs_free(&kn->iattr->xattrs, NULL);
                kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
        }

        kmem_cache_free(kernfs_node_cache, kn);
}

/**
 * kernfs_put - put a reference count on a kernfs_node
 * @kn: the target kernfs_node
 *
 * Put a reference count of @kn and destroy it if it reached zero.
 * Destroying a node also drops the reference it held on its parent, so the
 * release may cascade up the hierarchy.  When the root node itself is
 * released, the kernfs_root is freed as well.
 */
void kernfs_put(struct kernfs_node *kn)
{
        struct kernfs_node *parent;
        struct kernfs_root *root;

        if (!kn || !atomic_dec_and_test(&kn->count))
                return;

        root = kernfs_root(kn);

        for (;;) {
                /*
                 * Moving/renaming is always done while holding reference.
                 * kn->parent won't change beneath us.
                 */
                parent = kn->parent;

                WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS,
                          "kernfs_put: %s/%s: released with incorrect active_ref %d\n",
                          parent ? parent->name : "", kn->name,
                          atomic_read(&kn->active));

                /* a symlink pins its target; let go of it */
                if (kernfs_type(kn) == KERNFS_LINK)
                        kernfs_put(kn->symlink.target_kn);

                spin_lock(&kernfs_idr_lock);
                idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
                spin_unlock(&kernfs_idr_lock);

                call_rcu(&kn->rcu, kernfs_free_rcu);

                if (!parent) {
                        /* just released the root kn, free @root too */
                        idr_destroy(&root->ino_idr);
                        kfree_rcu(root, rcu);
                        return;
                }

                /* drop the child's reference on the parent; stop if it survives */
                kn = parent;
                if (!atomic_dec_and_test(&kn->count))
                        return;
        }
}
EXPORT_SYMBOL_GPL(kernfs_put);

/**
 * kernfs_node_from_dentry - determine kernfs_node associated with a dentry
 * @dentry: the dentry in question
 *
 * A dentry is treated as a kernfs one when its superblock's operations are
 * kernfs_sops.
 *
 * Return: the kernfs_node associated with @dentry.  If @dentry is not a
 * kernfs one, %NULL is returned.
 *
 * While the returned kernfs_node will stay accessible as long as @dentry
 * is accessible, the returned node can be in any state and the caller is
 * fully responsible for determining what's accessible.
 */
struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
{
        if (dentry->d_sb->s_op != &kernfs_sops)
                return NULL;

        return kernfs_dentry_node(dentry);
}

/*
 * __kernfs_new_node - allocate and initialize a kernfs_node
 * @root: hierarchy whose ino_idr supplies the node id
 * @parent: prospective parent, only passed to the security hook; may be %NULL
 * @name: node name, duplicated with kstrdup_const()
 * @mode: mode bits stored in the node
 * @uid: owner uid; applied through __kernfs_setattr() unless both @uid and
 *       @gid are the global root ids
 * @gid: owner gid, see @uid
 * @flags: initial kn->flags
 *
 * The new node starts with a reference count of 1 and an active count of
 * KN_DEACTIVATED_BIAS, and is not linked anywhere: kn->parent is left for
 * the caller to set.
 *
 * Return: the new node, or %NULL if any allocation or setup step failed.
 */
static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
                                             struct kernfs_node *parent,
                                             const char *name, umode_t mode,
                                             kuid_t uid, kgid_t gid,
                                             unsigned flags)
{
        struct kernfs_node *kn;
        u32 id_highbits;
        int ret;

        name = kstrdup_const(name, GFP_KERNEL);
        if (!name)
                return NULL;

        kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
        if (!kn)
                goto err_out1;

        idr_preload(GFP_KERNEL);
        spin_lock(&kernfs_idr_lock);
        ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC);
        if (ret >= 0) {
                /* a smaller id than last time means the cyclic allocator wrapped */
                if (ret < root->last_id_lowbits)
                        root->id_highbits++;
                /*
                 * Only remember ids that were actually handed out.  Storing
                 * a negative errno here would make the next successful
                 * allocation look like a wrap and bump id_highbits for no
                 * reason.
                 */
                root->last_id_lowbits = ret;
        }
        id_highbits = root->id_highbits;
        spin_unlock(&kernfs_idr_lock);
        idr_preload_end();
        if (ret < 0)
                goto err_out2;

        /* high 32 bits come from the root's counter, low 32 bits from the idr */
        kn->id = (u64)id_highbits << 32 | ret;

        atomic_set(&kn->count, 1);
        atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
        RB_CLEAR_NODE(&kn->rb);

        kn->name = name;
        kn->mode = mode;
        kn->flags = flags;

        if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) {
                struct iattr iattr = {
                        .ia_valid = ATTR_UID | ATTR_GID,
                        .ia_uid = uid,
                        .ia_gid = gid,
                };

                ret = __kernfs_setattr(kn, &iattr);
                if (ret < 0)
                        goto err_out3;
        }

        if (parent) {
                ret = security_kernfs_init_security(parent, kn);
                if (ret)
                        goto err_out3;
        }

        return kn;

 err_out3:
        spin_lock(&kernfs_idr_lock);
        idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
        spin_unlock(&kernfs_idr_lock);
 err_out2:
        kmem_cache_free(kernfs_node_cache, kn);
 err_out1:
        kfree_const(name);
        return NULL;
}

/*
 * kernfs_new_node - allocate a node that will live under @parent
 *
 * Wraps __kernfs_new_node().  If @parent has S_ISGID set, the new node
 * takes its gid from @parent's iattrs (when present) and a new directory
 * gets S_ISGID as well.  On success a reference on @parent is taken and
 * kn->parent is set; the node is not yet linked into @parent's children.
 *
 * Return: the new node, or %NULL on failure.
 */
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
                                    const char *name, umode_t mode,
                                    kuid_t uid, kgid_t gid,
                                    unsigned flags)
{
        struct kernfs_node *kn;

        /* this code block imitates inode_init_owner() for kernfs */
        if (parent->mode & S_ISGID) {
                if (parent->iattr)
                        gid = parent->iattr->ia_gid;

                if (flags & KERNFS_DIR)
                        mode |= S_ISGID;
        }

        kn = __kernfs_new_node(kernfs_root(parent), parent,
                               name, mode, uid, gid, flags);
        if (!kn)
                return NULL;

        kernfs_get(parent);
        kn->parent = parent;

        return kn;
}

/*
 * kernfs_find_and_get_node_by_id - get kernfs_node from node id
 * @root: the kernfs root
 * @id: the target node id
 *
 * @id's lower 32bits encode ino and upper gen.  If the gen portion is
 * zero, all generations are matched.  The lookup runs under
 * rcu_read_lock().
 *
 * Return: %NULL on failure,
 * otherwise a kernfs node with reference counter incremented.
 */
struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
                                                   u64 id)
{
        struct kernfs_node *kn, *found = NULL;
        ino_t ino = kernfs_id_ino(id);
        u32 gen = kernfs_id_gen(id);

        rcu_read_lock();

        kn = idr_find(&root->ino_idr, (u32)ino);
        if (!kn)
                goto out;

        if (sizeof(ino_t) >= sizeof(u64)) {
                /* we looked up with the low 32bits, compare the whole */
                if (kernfs_ino(kn) != ino)
                        goto out;
        } else {
                /* 0 matches all generations */
                if (unlikely(gen && kernfs_gen(kn) != gen))
                        goto out;
        }

        /*
         * We should fail if @kn has never been activated and guarantee success
         * if the caller knows that @kn is active. Both can be achieved by
         * __kernfs_active() which tests @kn->active without kernfs_rwsem.
         */
        if (likely(__kernfs_active(kn) && atomic_inc_not_zero(&kn->count)))
                found = kn;

out:
        rcu_read_unlock();
        return found;
}

/**
 *        kernfs_add_one - add kernfs_node to parent without warning
 *        @kn: kernfs_node to be added
 *
 *        The caller must already have initialized @kn->parent.  Under the
 *        root's kernfs_rwsem this function hashes @kn's name, links @kn
 *        into the parent's children rbtree and, if the parent has
 *        iattrs, refreshes the parent's ctime and mtime.  The node is
 *        then activated unless the root was created with
 *        KERNFS_ROOT_CREATE_DEACTIVATED.
 *
 *        Return:
 *        %0 on success, -EEXIST if entry with the given name already
 *        exists, -EINVAL if the namespace tag does not match what the
 *        parent requires or the parent is not a directory, -ENOENT if
 *        the parent is being removed or is an always-empty directory.
 */
int kernfs_add_one(struct kernfs_node *kn)
{
        struct kernfs_node *parent = kn->parent;
        struct kernfs_root *root = kernfs_root(parent);
        struct kernfs_iattrs *ps_iattr;
        bool has_ns;
        int ret;

        down_write(&root->kernfs_rwsem);

        has_ns = kernfs_ns_enabled(parent);
        if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
                 has_ns ? "required" : "invalid", parent->name, kn->name)) {
                ret = -EINVAL;
                goto out_unlock;
        }

        if (kernfs_type(parent) != KERNFS_DIR) {
                ret = -EINVAL;
                goto out_unlock;
        }

        if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR)) {
                ret = -ENOENT;
                goto out_unlock;
        }

        kn->hash = kernfs_name_hash(kn->name, kn->ns);

        ret = kernfs_link_sibling(kn);
        if (ret)
                goto out_unlock;

        /* Update timestamps on the parent */
        down_write(&root->kernfs_iattr_rwsem);

        ps_iattr = parent->iattr;
        if (ps_iattr) {
                ktime_get_real_ts64(&ps_iattr->ia_ctime);
                ps_iattr->ia_mtime = ps_iattr->ia_ctime;
        }

        up_write(&root->kernfs_iattr_rwsem);
        up_write(&root->kernfs_rwsem);

        /*
         * Activate the new node unless CREATE_DEACTIVATED is requested.
         * If not activated here, the kernfs user is responsible for
         * activating the node with kernfs_activate().  A node which hasn't
         * been activated is not visible to userland and its removal won't
         * trigger deactivation.
         */
        if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
                kernfs_activate(kn);
        return 0;

out_unlock:
        up_write(&root->kernfs_rwsem);
        return ret;
}

/**
 * kernfs_find_ns - find kernfs_node with the given name
 * @parent: kernfs_node to search under
 * @name: name to look for
 * @ns: the namespace tag to use
 *
 * Search @parent's children rbtree for the entry matching the hash of
 * (@name, @ns), @ns and @name.  The caller must hold kernfs_rwsem.  A
 * warning is emitted and the lookup fails when @ns is given for a parent
 * without namespaces enabled, or missing for one that has them.
 *
 * Return: pointer to the found kernfs_node on success, %NULL on failure.
 */
static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
                                          const unsigned char *name,
                                          const void *ns)
{
        struct rb_node *node = parent->dir.children.rb_node;
        bool has_ns = kernfs_ns_enabled(parent);
        unsigned int hash;

        lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem);

        if (has_ns != (bool)ns) {
                WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
                     has_ns ? "required" : "invalid", parent->name, name);
                return NULL;
        }

        hash = kernfs_name_hash(name, ns);

        while (node) {
                struct kernfs_node *pos = rb_to_kn(node);
                int cmp = kernfs_name_compare(hash, name, ns, pos);

                if (!cmp)
                        return pos;

                node = cmp < 0 ? node->rb_left : node->rb_right;
        }

        return NULL;
}

/*
 * Resolve the '/'-separated @path below @parent, one component at a time,
 * with kernfs_find_ns().  Empty components are skipped.  @path is first
 * copied into the shared kernfs_pr_cont_buf (under kernfs_pr_cont_lock)
 * because strsep() modifies the string it splits.  The caller must hold
 * kernfs_rwsem for reading.
 *
 * Returns the node reached, or NULL if the copy of @path fails or a
 * component is not found.
 */
static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
                                          const unsigned char *path,
                                          const void *ns)
{
        ssize_t len;
        char *p, *name;

        lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem);

        spin_lock_irq(&kernfs_pr_cont_lock);

        len = strscpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
        if (len < 0) {
                parent = NULL;
                goto out_unlock;
        }

        p = kernfs_pr_cont_buf;
        while ((name = strsep(&p, "/")) && parent) {
                if (*name != '\0')
                        parent = kernfs_find_ns(parent, name, ns);
        }

out_unlock:
        spin_unlock_irq(&kernfs_pr_cont_lock);

        return parent;
}

/**
 * kernfs_find_and_get_ns - find and get kernfs_node with the given name
 * @parent: kernfs_node to search under
 * @name: name to look for
 * @ns: the namespace tag to use
 *
 * Look for kernfs_node with name @name under @parent and get a reference
 * if found.  The lookup takes the root's kernfs_rwsem for reading, so this
 * function may sleep.
 *
 * Return: pointer to the found kernfs_node on success, %NULL on failure.
 */
struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
                                           const char *name, const void *ns)
{
        struct kernfs_root *root = kernfs_root(parent);
        struct kernfs_node *found;

        down_read(&root->kernfs_rwsem);
        found = kernfs_find_ns(parent, name, ns);
        kernfs_get(found);
        up_read(&root->kernfs_rwsem);

        return found;
}
EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);

/**
 * kernfs_walk_and_get_ns - find and get kernfs_node with the given path
 * @parent: kernfs_node to search under
 * @path: path to look for
 * @ns: the namespace tag to use
 *
 * Look for kernfs_node with path @path under @parent and get a reference
 * if found.  The walk takes the root's kernfs_rwsem for reading, so this
 * function may sleep.
 *
 * Return: pointer to the found kernfs_node on success, %NULL on failure.
 */
struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
                                           const char *path, const void *ns)
{
        struct kernfs_root *root = kernfs_root(parent);
        struct kernfs_node *found;

        down_read(&root->kernfs_rwsem);
        found = kernfs_walk_ns(parent, path, ns);
        kernfs_get(found);
        up_read(&root->kernfs_rwsem);

        return found;
}

/**
 * kernfs_create_root - create a new kernfs hierarchy
 * @scops: optional syscall operations for the hierarchy
 * @flags: KERNFS_ROOT_* flags
 * @priv: opaque data associated with the new directory
 *
 * Allocates the kernfs_root together with its root directory node, which
 * has an empty name.  The root node is activated right away unless @flags
 * contains KERNFS_ROOT_CREATE_DEACTIVATED.
 *
 * Return: the root of the new hierarchy on success, ERR_PTR() value on
 * failure.
 */
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
                                       unsigned int flags, void *priv)
{
        struct kernfs_node *root_kn;
        struct kernfs_root *root;

        root = kzalloc(sizeof(*root), GFP_KERNEL);
        if (!root)
                return ERR_PTR(-ENOMEM);

        idr_init(&root->ino_idr);
        init_rwsem(&root->kernfs_rwsem);
        init_rwsem(&root->kernfs_iattr_rwsem);
        init_rwsem(&root->kernfs_supers_rwsem);
        INIT_LIST_HEAD(&root->supers);

        /*
         * On 64bit ino setups, id is ino.  On 32bit, low 32bits are ino.
         * High bits generation.  The starting value for both ino and
         * generation is 1.  Initialize upper 32bit allocation
         * accordingly.
         */
        root->id_highbits = sizeof(ino_t) >= sizeof(u64) ? 0 : 1;

        root_kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO,
                                    GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
                                    KERNFS_DIR);
        if (!root_kn) {
                idr_destroy(&root->ino_idr);
                kfree(root);
                return ERR_PTR(-ENOMEM);
        }

        root_kn->priv = priv;
        root_kn->dir.root = root;

        root->syscall_ops = scops;
        root->flags = flags;
        root->kn = root_kn;
        init_waitqueue_head(&root->deactivate_waitq);

        if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
                kernfs_activate(root_kn);

        return root;
}

/**
 * kernfs_destroy_root - destroy a kernfs hierarchy
 * @root: root of the hierarchy to destroy
 *
 * Destroy the hierarchy anchored at @root by removing all existing
 * directories and destroying @root.
 */
void kernfs_destroy_root(struct kernfs_root *root)
{
        struct kernfs_node *root_kn = root->kn;

        /*
         *  kernfs_remove holds kernfs_rwsem from the root so the root
         *  shouldn't be freed during the operation.  Pin the root node
         *  across the removal.
         */
        kernfs_get(root_kn);
        kernfs_remove(root_kn);
        kernfs_put(root_kn); /* will also free @root */
}

/**
 * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root
 * @root: root to use to lookup
 *
 * No reference is taken on the returned node.
 *
 * Return: @root's kernfs_node
 */
struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root)
{
        struct kernfs_node *root_kn = root->kn;

        return root_kn;
}

/**
 * kernfs_create_dir_ns - create a directory
 * @parent: parent in which to create a new directory
 * @name: name of the new directory
 * @mode: mode of the new directory
 * @uid: uid of the new directory
 * @gid: gid of the new directory
 * @priv: opaque data associated with the new directory
 * @ns: optional namespace tag of the directory
 *
 * Return: the created node on success, ERR_PTR() value on failure.
 */
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
                                         const char *name, umode_t mode,
                                         kuid_t uid, kgid_t gid,
                                         void *priv, const void *ns)
{
        struct kernfs_node *dir;
        int err;

        /* Allocate the node; S_IFDIR is always OR-ed into the mode. */
        dir = kernfs_new_node(parent, name, mode | S_IFDIR, uid, gid,
                              KERNFS_DIR);
        if (!dir)
                return ERR_PTR(-ENOMEM);

        /* Fill in the directory-specific fields before publishing it. */
        dir->priv = priv;
        dir->ns = ns;
        dir->dir.root = parent->dir.root;

        /* Link the node in; on failure drop the reference we hold. */
        err = kernfs_add_one(dir);
        if (err) {
                kernfs_put(dir);
                return ERR_PTR(err);
        }

        return dir;
}

/**
 * kernfs_create_empty_dir - create an always empty directory
 * @parent: parent in which to create a new directory
 * @name: name of the new directory
 *
 * Return: the created node on success, ERR_PTR() value on failure.
 */
struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
                                            const char *name)
{
        struct kernfs_node *dir;
        int err;

        /* Allocate a root-owned, read/search-only directory node. */
        dir = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR,
                              GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR);
        if (!dir)
                return ERR_PTR(-ENOMEM);

        /* Flag it as an empty directory; it has no namespace or priv. */
        dir->priv = NULL;
        dir->ns = NULL;
        dir->dir.root = parent->dir.root;
        dir->flags |= KERNFS_EMPTY_DIR;

        /* Link the node in; on failure drop the reference we hold. */
        err = kernfs_add_one(dir);
        if (err) {
                kernfs_put(dir);
                return ERR_PTR(err);
        }

        return dir;
}

/*
 * Revalidate a kernfs dentry.  Returns -ECHILD in RCU-walk mode, 1 when
 * the dentry is still valid and 0 when it must be discarded.
 */
static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
{
        struct kernfs_root *root;
        struct kernfs_node *kn;
        int valid;

        if (flags & LOOKUP_RCU)
                return -ECHILD;

        /* Negative hashed dentry? */
        if (d_really_is_negative(dentry)) {
                struct kernfs_node *parent;

                /*
                 * A negative dentry stays valid only while the kernfs
                 * parent directory is unchanged; otherwise discard it and
                 * proceed to ->lookup.
                 *
                 * Fetching the dentry parent needs no special care even
                 * with a concurrent rename in progress: a negative dentry
                 * can only be the target of the rename, which does a
                 * d_move() rather than a replace, so d_parent does not
                 * change over the d_move().
                 *
                 * A negative kernfs dentry also never turns positive
                 * during revalidate, because it is invalidated on changes
                 * to the containing directory and the lookup is redone so
                 * that a new positive dentry can be properly created.
                 */
                root = kernfs_root_from_sb(dentry->d_sb);
                down_read(&root->kernfs_rwsem);
                parent = kernfs_dentry_node(dentry->d_parent);
                valid = !(parent && kernfs_dir_changed(parent, dentry));
                up_read(&root->kernfs_rwsem);

                return valid;
        }

        kn = kernfs_dentry_node(dentry);
        root = kernfs_root(kn);
        down_read(&root->kernfs_rwsem);

        /*
         * A positive dentry is valid only if, in this order, the node is
         * still active, has not been moved to another parent, has not
         * been renamed, and has not been moved to a different namespace.
         */
        valid = kernfs_active(kn) &&
                kernfs_dentry_node(dentry->d_parent) == kn->parent &&
                strcmp(dentry->d_name.name, kn->name) == 0 &&
                !(kn->parent && kernfs_ns_enabled(kn->parent) &&
                  kernfs_info(dentry->d_sb)->ns != kn->ns);

        up_read(&root->kernfs_rwsem);
        return valid;
}

/* Dentry operations for kernfs; only ->d_revalidate is provided here. */
const struct dentry_operations kernfs_dops = {
        .d_revalidate        = kernfs_dop_revalidate,
};

/*
 * Look up @dentry's name under the kernfs directory behind @dir and
 * splice the resulting (possibly negative) dentry into the dcache.
 */
static struct dentry *kernfs_iop_lookup(struct inode *dir,
                                        struct dentry *dentry,
                                        unsigned int flags)
{
        struct kernfs_node *parent = dir->i_private;
        struct kernfs_root *root = kernfs_root(parent);
        struct inode *inode = NULL;
        struct kernfs_node *kn;
        const void *ns;

        down_read(&root->kernfs_rwsem);

        /* Only namespace-enabled parents filter by the superblock's tag. */
        ns = kernfs_ns_enabled(parent) ? kernfs_info(dir->i_sb)->ns : NULL;

        kn = kernfs_find_ns(parent, dentry->d_name.name, ns);

        /*
         * Inactive nodes are invisible to the VFS, so don't create a
         * negative dentry for them.
         */
        if (kn && !kernfs_active(kn)) {
                up_read(&root->kernfs_rwsem);
                return NULL;
        }

        /* Found an active node: attach an inode to it. */
        if (kn) {
                inode = kernfs_get_inode(dir->i_sb, kn);
                if (!inode)
                        inode = ERR_PTR(-ENOMEM);
        }

        /*
         * Record the parent's revision; it is needed for negative dentry
         * validation.  The negative dentry can be created here or be
         * transformed from a positive dentry in dentry_unlink_inode()
         * called from vfs_rmdir().
         */
        if (!IS_ERR(inode))
                kernfs_set_rev(parent, dentry);

        up_read(&root->kernfs_rwsem);

        /* instantiate and hash (possibly negative) dentry */
        return d_splice_alias(inode, dentry);
}

/*
 * Forward mkdir to the root's syscall_ops while holding an active
 * reference on the parent directory node.
 */
static int kernfs_iop_mkdir(struct mnt_idmap *idmap,
                            struct inode *dir, struct dentry *dentry,
                            umode_t mode)
{
        struct kernfs_node *parent_kn = dir->i_private;
        struct kernfs_syscall_ops *ops;
        int err;

        ops = kernfs_root(parent_kn)->syscall_ops;
        if (!(ops && ops->mkdir))
                return -EPERM;

        /* The parent must still be active for the callback to run. */
        if (!kernfs_get_active(parent_kn))
                return -ENODEV;

        err = ops->mkdir(parent_kn, dentry->d_name.name, mode);
        kernfs_put_active(parent_kn);

        return err;
}

/*
 * Forward rmdir to the root's syscall_ops while holding an active
 * reference on the node being removed.
 */
static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
{
        struct kernfs_node *victim = kernfs_dentry_node(dentry);
        struct kernfs_syscall_ops *ops;
        int err;

        ops = kernfs_root(victim)->syscall_ops;
        if (!(ops && ops->rmdir))
                return -EPERM;

        /* The node must still be active for the callback to run. */
        if (!kernfs_get_active(victim))
                return -ENODEV;

        err = ops->rmdir(victim);
        kernfs_put_active(victim);

        return err;
}

/*
 * Forward rename to the root's syscall_ops.  Rename flags are not
 * supported.  Active references are held on both the node being renamed
 * and the new parent across the callback.
 */
static int kernfs_iop_rename(struct mnt_idmap *idmap,
                             struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry,
                             unsigned int flags)
{
        struct kernfs_node *kn = kernfs_dentry_node(old_dentry);
        struct kernfs_node *new_parent = new_dir->i_private;
        struct kernfs_syscall_ops *ops = kernfs_root(kn)->syscall_ops;
        int err;

        if (flags)
                return -EINVAL;

        if (!(ops && ops->rename))
                return -EPERM;

        if (!kernfs_get_active(kn))
                return -ENODEV;

        err = -ENODEV;
        if (!kernfs_get_active(new_parent))
                goto out_put_kn;

        err = ops->rename(kn, new_parent, new_dentry->d_name.name);

        kernfs_put_active(new_parent);
out_put_kn:
        kernfs_put_active(kn);
        return err;
}

/*
 * Inode operations for kernfs directories: lookup and the attribute
 * handlers, plus the mkdir/rmdir/rename entry points defined above.
 */
const struct inode_operations kernfs_dir_iops = {
        .lookup                = kernfs_iop_lookup,
        .permission        = kernfs_iop_permission,
        .setattr        = kernfs_iop_setattr,
        .getattr        = kernfs_iop_getattr,
        .listxattr        = kernfs_iop_listxattr,

        .mkdir                = kernfs_iop_mkdir,
        .rmdir                = kernfs_iop_rmdir,
        .rename                = kernfs_iop_rename,
};

/*
 * Descend from @pos through the first child of each directory until a
 * non-directory or a directory without children is reached, and return
 * that node (which may be @pos itself).
 */
static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
{
        while (kernfs_type(pos) == KERNFS_DIR) {
                struct rb_node *first = rb_first(&pos->dir.children);

                if (!first)
                        break;

                pos = rb_to_kn(first);
        }

        return pos;
}

/**
 * kernfs_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: kernfs_node whose descendants to walk
 *
 * Find the next descendant to visit for post-order traversal of @root's
 * descendants.  @root is included in the iteration and the last node to be
 * visited.
 *
 * Return: the next descendant to visit or %NULL when done.
 */
static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
                                                       struct kernfs_node *root)
{
        struct rb_node *sibling;

        lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem);

        /* First iteration: start at the leftmost descendant (maybe @root). */
        if (!pos)
                return kernfs_leftmost_descendant(root);

        /* @root is visited last, so there is nothing after it. */
        if (pos == root)
                return NULL;

        /*
         * Continue with the leftmost descendant of the next sibling if
         * there is one; otherwise all siblings are done and the parent
         * is next.
         */
        sibling = rb_next(&pos->rb);

        return sibling ? kernfs_leftmost_descendant(rb_to_kn(sibling))
                       : pos->parent;
}

/*
 * Mark @kn as activated and, unless it is already active, hidden or
 * being removed, remove the deactivation bias from its active count.
 * The caller must hold kernfs_rwsem for writing.
 */
static void kernfs_activate_one(struct kernfs_node *kn)
{
        lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);

        kn->flags |= KERNFS_ACTIVATED;

        /* Already active: nothing more to do. */
        if (kernfs_active(kn))
                return;

        /* Hidden or dying nodes stay deactivated. */
        if (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING))
                return;

        WARN_ON_ONCE(kn->parent && RB_EMPTY_NODE(&kn->rb));
        WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);

        atomic_sub(KN_DEACTIVATED_BIAS, &kn->active);
}

/**
 * kernfs_activate - activate a node which started deactivated
 * @kn: kernfs_node whose subtree is to be activated
 *
 * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node
 * needs to be explicitly activated.  A node which hasn't been activated
 * isn't visible to userland and deactivation is skipped during its
 * removal.  This is useful to construct atomic init sequences where
 * creation of multiple nodes should either succeed or fail atomically.
 *
 * The caller is responsible for ensuring that this function is not called
 * after kernfs_remove*() is invoked on @kn.
 */
void kernfs_activate(struct kernfs_node *kn)
{
        struct kernfs_root *root = kernfs_root(kn);
        struct kernfs_node *pos;

        down_write(&root->kernfs_rwsem);

        /* Post-order walk over @kn's subtree, @kn itself included. */
        for (pos = kernfs_next_descendant_post(NULL, kn);
             pos;
             pos = kernfs_next_descendant_post(pos, kn))
                kernfs_activate_one(pos);

        up_write(&root->kernfs_rwsem);
}

/**
 * kernfs_show - show or hide a node
 * @kn: kernfs_node to show or hide
 * @show: whether to show or hide
 *
 * If @show is %false, @kn is marked hidden and deactivated. A hidden node is
 * ignored in future activations. If %true, the mark is removed and activation
 * state is restored. This function won't implicitly activate a new node in a
 * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet.
 *
 * To avoid recursion complexities, directories aren't supported for now.
 */
void kernfs_show(struct kernfs_node *kn, bool show)
{
        struct kernfs_root *root = kernfs_root(kn);

        /* Directories are not supported. */
        if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR))
                return;

        down_write(&root->kernfs_rwsem);

        if (!show) {
                /* Hide: mark hidden, deactivate if active, then drain. */
                kn->flags |= KERNFS_HIDDEN;
                if (kernfs_active(kn))
                        atomic_add(KN_DEACTIVATED_BIAS, &kn->active);
                kernfs_drain(kn);
        } else {
                /*
                 * Show: clear the mark and re-activate only a node that
                 * had been activated before.
                 */
                kn->flags &= ~KERNFS_HIDDEN;
                if (kn->flags & KERNFS_ACTIVATED)
                        kernfs_activate_one(kn);
        }

        up_write(&root->kernfs_rwsem);
}

/*
 * Remove @kn and everything below it.  The caller must hold kernfs_rwsem
 * for writing.  A NULL @kn is accepted and ignored.
 */
static void __kernfs_remove(struct kernfs_node *kn)
{
        struct kernfs_node *pos;

        /* Nothing to do for a NULL node. */
        if (!kn)
                return;

        lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);

        /*
         * Short-circuit if non-root @kn is no longer linked into its
         * parent's rbtree, i.e. it has already finished removal.  This is
         * for kernfs_remove_self() which plays with active ref after
         * removal.
         */
        if (kn->parent && RB_EMPTY_NODE(&kn->rb))
                return;

        pr_debug("kernfs %s: removing\n", kn->name);

        /* prevent new usage by marking all nodes removing and deactivating */
        pos = NULL;
        while ((pos = kernfs_next_descendant_post(pos, kn))) {
                pos->flags |= KERNFS_REMOVING;
                if (kernfs_active(pos))
                        atomic_add(KN_DEACTIVATED_BIAS, &pos->active);
        }

        /*
         * Drain and unlink the subtree node-by-node, always taking the
         * current leftmost descendant, until @kn itself has been handled.
         */
        do {
                pos = kernfs_leftmost_descendant(kn);

                /*
                 * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's
                 * base ref could have been put by someone else by the time
                 * the function returns.  Make sure it doesn't go away
                 * underneath us.
                 */
                kernfs_get(pos);

                kernfs_drain(pos);

                /*
                 * kernfs_unlink_sibling() succeeds once per node.  Use it
                 * to decide who's responsible for cleanups.  The root has
                 * no parent and is never unlinked, so it always takes this
                 * branch.
                 */
                if (!pos->parent || kernfs_unlink_sibling(pos)) {
                        struct kernfs_iattrs *ps_iattr =
                                pos->parent ? pos->parent->iattr : NULL;

                        /* update timestamps on the parent */
                        down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);

                        if (ps_iattr) {
                                ktime_get_real_ts64(&ps_iattr->ia_ctime);
                                ps_iattr->ia_mtime = ps_iattr->ia_ctime;
                        }

                        up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
                        /* presumably drops the node's base reference */
                        kernfs_put(pos);
                }

                /* pairs with the kernfs_get() above */
                kernfs_put(pos);
        } while (pos != kn);
}

/**
 * kernfs_remove - remove a kernfs_node recursively
 * @kn: the kernfs_node to remove
 *
 * Remove @kn along with all its subdirectories and files.
 */
void kernfs_remove(struct kernfs_node *kn)
{
        struct kernfs_root *root;

        /* A NULL node is silently ignored. */
        if (kn) {
                root = kernfs_root(kn);

                /* The actual removal runs under the write-held rwsem. */
                down_write(&root->kernfs_rwsem);
                __kernfs_remove(kn);
                up_write(&root->kernfs_rwsem);
        }
}

/**
 * kernfs_break_active_protection - break out of active protection
 * @kn: the self kernfs_node
 *
 * The caller must be running off of a kernfs operation which is invoked
 * with an active reference - e.g. one of kernfs_ops.  Each invocation of
 * this function must also be matched with an invocation of
 * kernfs_unbreak_active_protection().
 *
 * This function releases the active reference of @kn the caller is
 * holding.  Once this function is called, @kn may be removed at any point
 * and the caller is solely responsible for ensuring that the objects it
 * dereferences are accessible.
 */
void kernfs_break_active_protection(struct kernfs_node *kn)
{
        /*
         * Take ourselves out of the active ref dependency chain by
         * dropping the active reference.  If we're called without an
         * active ref, lockdep will complain.
         */
        kernfs_put_active(kn);
}

/**
 * kernfs_unbreak_active_protection - undo kernfs_break_active_protection()
 * @kn: the self kernfs_node
 *
 * If kernfs_break_active_protection() was called, this function must be
 * invoked before finishing the kernfs operation.  Note that while this
 * function restores the active reference, it doesn't and can't actually
 * restore the active protection - @kn may already have been removed or be
 * in the process of being removed.  Once kernfs_break_active_protection()
 * is invoked, that protection is irreversibly gone for the kernfs operation
 * instance.
 *
 * While this function may be called at any point after
 * kernfs_break_active_protection() is invoked, its most useful location
 * would be right before the enclosing kernfs operation returns.
 */
void kernfs_unbreak_active_protection(struct kernfs_node *kn)
{
        /*
         * @kn->active could be in any state; however, the increment we do
         * here will be undone as soon as the enclosing kernfs operation
         * finishes and this temporary bump can't break anything.  If @kn
         * is alive, nothing changes.  If @kn is being deactivated, the
         * soon-to-follow put will either finish deactivation or restore
         * deactivated state.  If @kn is already removed, the temporary
         * bump is guaranteed to be gone before @kn is released.
         */
        atomic_inc(&kn->active);
        /* Tell lockdep about the re-acquisition when kernfs_lockdep() is true. */
        if (kernfs_lockdep(kn))
                rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_);
}

/**
 * kernfs_remove_self - remove a kernfs_node from its own method
 * @kn: the self kernfs_node to remove
 *
 * The caller must be running off of a kernfs operation which is invoked
 * with an active reference - e.g. one of kernfs_ops.  This can be used to
 * implement a file operation which deletes itself.
 *
 * For example, the "delete" file for a sysfs device directory can be
 * implemented by invoking kernfs_remove_self() on the "delete" file
 * itself.  This function breaks the circular dependency of trying to
 * deactivate self while holding an active ref itself.  It isn't necessary
 * to modify the usual removal path to use kernfs_remove_self().  The
 * "delete" implementation can simply invoke kernfs_remove_self() on self
 * before proceeding with the usual removal path.  kernfs will ignore later
 * kernfs_remove() on self.
 *
 * kernfs_remove_self() can be called multiple times concurrently on the
 * same kernfs_node.  Only the first one actually performs removal and
 * returns %true.  All others will wait until the kernfs operation which
 * won self-removal finishes and return %false.  Note that the losers wait
 * for the completion of not only the winning kernfs_remove_self() but also
 * the whole kernfs_ops which won the arbitration.  This can be used to
 * guarantee, for example, all concurrent writes to a "delete" file to
 * finish only after the whole operation is complete.
 *
 * Return: %true if @kn is removed by this call, otherwise %false.
 */
bool kernfs_remove_self(struct kernfs_node *kn)
{
        bool ret;
        struct kernfs_root *root = kernfs_root(kn);

        down_write(&root->kernfs_rwsem);
        /* Drop our own active ref so that it doesn't block the removal. */
        kernfs_break_active_protection(kn);

        /*
         * SUICIDAL is used to arbitrate among competing invocations.  Only
         * the first one will actually perform removal.  When the removal
         * is complete, SUICIDED is set and the active ref is restored
         * while kernfs_rwsem is held exclusive.  The ones which lost
         * arbitration wait for SUICIDED && drained which can happen only
         * after the enclosing kernfs operation which executed the winning
         * instance of kernfs_remove_self() finished.
         */
        if (!(kn->flags & KERNFS_SUICIDAL)) {
                /* Winner: perform the removal under the held rwsem. */
                kn->flags |= KERNFS_SUICIDAL;
                __kernfs_remove(kn);
                kn->flags |= KERNFS_SUICIDED;
                ret = true;
        } else {
                /* Loser: wait on the root's deactivate_waitq. */
                wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq;
                DEFINE_WAIT(wait);

                while (true) {
                        prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE);

                        /* Done once SUICIDED is set and only the bias remains. */
                        if ((kn->flags & KERNFS_SUICIDED) &&
                            atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
                                break;

                        /* Release the rwsem while sleeping, retake it after. */
                        up_write(&root->kernfs_rwsem);
                        schedule();
                        down_write(&root->kernfs_rwsem);
                }
                finish_wait(waitq, &wait);
                WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
                ret = false;
        }

        /*
         * This must be done while kernfs_rwsem is held exclusive;
         * otherwise, waiting for SUICIDED && deactivated could finish
         * prematurely.
         */
        kernfs_unbreak_active_protection(kn);

        up_write(&root->kernfs_rwsem);
        return ret;
}

/**
 * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it
 * @parent: parent of the target
 * @name: name of the kernfs_node to remove
 * @ns: namespace tag of the kernfs_node to remove
 *
 * Look for the kernfs_node with @name and @ns under @parent and remove it.
 *
 * Return: %0 on success, -ENOENT if such entry doesn't exist.
 */
int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
                             const void *ns)
{
        struct kernfs_root *root;
        struct kernfs_node *victim;
        int ret = -ENOENT;

        /* Without a parent directory there is nothing to search. */
        if (!parent) {
                WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
                        name);
                return -ENOENT;
        }

        root = kernfs_root(parent);
        down_write(&root->kernfs_rwsem);

        victim = kernfs_find_ns(parent, name, ns);
        if (victim) {
                /* Hold a reference across the removal. */
                kernfs_get(victim);
                __kernfs_remove(victim);
                kernfs_put(victim);
                ret = 0;
        }

        up_write(&root->kernfs_rwsem);

        return ret;
}

/**
 * kernfs_rename_ns - move and rename a kernfs_node
 * @kn: target node
 * @new_parent: new parent to put @kn under
 * @new_name: new name
 * @new_ns: new namespace tag
 *
 * Return: %0 on success, -errno on failure.
 */
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
                     const char *new_name, const void *new_ns)
{
        struct kernfs_node *old_parent;
        struct kernfs_root *root;
        const char *old_name = NULL;
        int error;

        /* can't move or rename root */
        if (!kn->parent)
                return -EINVAL;

        root = kernfs_root(kn);
        down_write(&root->kernfs_rwsem);

        /*
         * Both @kn and the new parent must be active, and the new parent
         * must not be flagged as an empty directory.
         */
        error = -ENOENT;
        if (!kernfs_active(kn) || !kernfs_active(new_parent) ||
            (new_parent->flags & KERNFS_EMPTY_DIR))
                goto out;

        error = 0;
        if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
            (strcmp(kn->name, new_name) == 0))
                goto out;        /* nothing to rename */

        /* The destination name must not already exist under the new parent. */
        error = -EEXIST;
        if (kernfs_find_ns(new_parent, new_name, new_ns))
                goto out;

        /*
         * Duplicate the new name only if it actually differs; a NULL
         * new_name below means "keep the current name".
         */
        if (strcmp(kn->name, new_name) != 0) {
                error = -ENOMEM;
                new_name = kstrdup_const(new_name, GFP_KERNEL);
                if (!new_name)
                        goto out;
        } else {
                new_name = NULL;
        }

        /*
         * Move to the appropriate place in the appropriate directories rbtree.
         */
        kernfs_unlink_sibling(kn);
        /* Take a reference on the new parent for kn->parent. */
        kernfs_get(new_parent);

        /* rename_lock protects ->parent and ->name accessors */
        write_lock_irq(&kernfs_rename_lock);

        old_parent = kn->parent;
        kn->parent = new_parent;

        kn->ns = new_ns;
        if (new_name) {
                old_name = kn->name;
                kn->name = new_name;
        }

        write_unlock_irq(&kernfs_rename_lock);

        /* Recompute the hash from the new name/ns before relinking. */
        kn->hash = kernfs_name_hash(kn->name, kn->ns);
        kernfs_link_sibling(kn);

        /* Release the old parent and the replaced name (NULL if unchanged). */
        kernfs_put(old_parent);
        kfree_const(old_name);

        error = 0;
 out:
        up_write(&root->kernfs_rwsem);
        return error;
}

/* Drop the node reference stashed in the file's private_data, if any. */
static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
{
        struct kernfs_node *cursor = filp->private_data;

        kernfs_put(cursor);

        return 0;
}

/*
 * Resolve the readdir position @hash under @parent to a node.  @pos is
 * the node cached from the previous call (may be NULL); its reference is
 * dropped here.  Returns the first active node in namespace @ns at or
 * after the resolved position, or NULL if there is none.
 */
static struct kernfs_node *kernfs_dir_pos(const void *ns,
        struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
{
        struct rb_node *node;

        if (pos) {
                struct kernfs_node *cached = pos;

                /* The cached node is reused only if it still matches. */
                if (!kernfs_active(cached) || cached->parent != parent ||
                    hash != cached->hash)
                        pos = NULL;

                kernfs_put(cached);
        }

        if (!pos && (hash > 1) && (hash < INT_MAX)) {
                /*
                 * Descend the children rbtree by hash.  If no exact match
                 * exists, @pos is left at the last node visited.
                 */
                for (node = parent->dir.children.rb_node; node; ) {
                        pos = rb_to_kn(node);

                        if (hash < pos->hash)
                                node = node->rb_left;
                        else if (hash > pos->hash)
                                node = node->rb_right;
                        else
                                break;
                }
        }

        /* Skip over entries which are dying/dead or in the wrong namespace */
        while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
                node = rb_next(&pos->rb);
                pos = node ? rb_to_kn(node) : NULL;
        }

        return pos;
}

/*
 * Re-resolve the current position via kernfs_dir_pos() and then step to
 * the next active node in namespace @ns, or NULL if there is none.
 */
static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
        struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
{
        struct rb_node *node;

        pos = kernfs_dir_pos(ns, parent, ino, pos);
        if (!pos)
                return NULL;

        /* Always advance at least once, then skip unsuitable entries. */
        do {
                node = rb_next(&pos->rb);
                pos = node ? rb_to_kn(node) : NULL;
        } while (pos && (!kernfs_active(pos) || pos->ns != ns));

        return pos;
}

/*
 * Emit the entries of a kernfs directory.  ctx->pos carries the hash of
 * the last emitted node and file->private_data caches that node with a
 * reference held.  Always returns 0.
 */
static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
{
        struct dentry *dentry = file->f_path.dentry;
        struct kernfs_node *parent = kernfs_dentry_node(dentry);
        struct kernfs_node *pos = file->private_data;
        struct kernfs_root *root;
        const void *ns = NULL;

        if (!dir_emit_dots(file, ctx))
                return 0;

        root = kernfs_root(parent);
        down_read(&root->kernfs_rwsem);

        /* Only namespace-enabled directories filter by the superblock's tag. */
        if (kernfs_ns_enabled(parent))
                ns = kernfs_info(dentry->d_sb)->ns;

        for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
             pos;
             pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
                const char *name = pos->name;
                unsigned int type = fs_umode_to_dtype(pos->mode);
                int len = strlen(name);
                ino_t ino = kernfs_ino(pos);

                /*
                 * Record the position and pin the node before dropping the
                 * lock; kernfs_dir_pos() puts this reference on the next
                 * step.
                 */
                ctx->pos = pos->hash;
                file->private_data = pos;
                kernfs_get(pos);

                /* dir_emit() runs without kernfs_rwsem held. */
                up_read(&root->kernfs_rwsem);
                if (!dir_emit(ctx, name, len, ino, type))
                        return 0;
                down_read(&root->kernfs_rwsem);
        }
        up_read(&root->kernfs_rwsem);
        /* End of directory: clear the cached node and park the position. */
        file->private_data = NULL;
        ctx->pos = INT_MAX;
        return 0;
}

/*
 * File operations for kernfs directories: iteration goes through
 * kernfs_fop_readdir() and release drops the cached node reference.
 */
const struct file_operations kernfs_dir_fops = {
        .read                = generic_read_dir,
        .iterate_shared        = kernfs_fop_readdir,
        .release        = kernfs_dir_fop_release,
        .llseek                = generic_file_llseek,
};


































    1 







































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions of the Internet Protocol.
 *
 * Version:        @(#)in.h        1.0.1        04/21/93
 *
 * Authors:        Original taken from the GNU Project <netinet/in.h> file.
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _LINUX_IN_H
#define _LINUX_IN_H


#include <linux/errno.h>
#include <uapi/linux/in.h>

/*
 * Return the offset of the ports (or SPI) field for @proto: 4 for AH,
 * 0 for TCP, UDP, DCCP, ESP, SCTP and UDP-Lite, and -EINVAL for any
 * other protocol.
 */
static inline int proto_ports_offset(int proto)
{
        /* AH: SPI */
        if (proto == IPPROTO_AH)
                return 4;

        /* For ESP the field at offset 0 is the SPI. */
        if (proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
            proto == IPPROTO_DCCP || proto == IPPROTO_ESP ||
            proto == IPPROTO_SCTP || proto == IPPROTO_UDPLITE)
                return 0;

        return -EINVAL;
}

/* True if @addr (network byte order) is in 127.0.0.0/8. */
static inline bool ipv4_is_loopback(__be32 addr)
{
        const __be32 net = addr & htonl(0xff000000);

        return net == htonl(0x7f000000);
}

/* True if @addr (network byte order) is in 224.0.0.0/4. */
static inline bool ipv4_is_multicast(__be32 addr)
{
        const __be32 net = addr & htonl(0xf0000000);

        return net == htonl(0xe0000000);
}

/* True if @addr (network byte order) is in 224.0.0.0/24. */
static inline bool ipv4_is_local_multicast(__be32 addr)
{
        const __be32 net = addr & htonl(0xffffff00);

        return net == htonl(0xe0000000);
}

/* True if @addr (network byte order) is the limited broadcast address. */
static inline bool ipv4_is_lbcast(__be32 addr)
{
        const __be32 lbcast = htonl(INADDR_BROADCAST);

        return addr == lbcast;
}

/* True if @addr (network byte order) equals INADDR_ALLSNOOPERS_GROUP. */
static inline bool ipv4_is_all_snoopers(__be32 addr)
{
        const __be32 snoopers = htonl(INADDR_ALLSNOOPERS_GROUP);

        return addr == snoopers;
}

/* True if @addr is the all-zero address. */
static inline bool ipv4_is_zeronet(__be32 addr)
{
        return !addr;
}

/* Special-Use IPv4 Addresses (RFC3330) */

/* True if @addr (network byte order) is in 10.0.0.0/8. */
static inline bool ipv4_is_private_10(__be32 addr)
{
        const __be32 net = addr & htonl(0xff000000);

        return net == htonl(0x0a000000);
}

/* True if @addr (network byte order) is in 172.16.0.0/12. */
static inline bool ipv4_is_private_172(__be32 addr)
{
        const __be32 net = addr & htonl(0xfff00000);

        return net == htonl(0xac100000);
}

/* True if @addr (network byte order) is in 192.168.0.0/16. */
static inline bool ipv4_is_private_192(__be32 addr)
{
        const __be32 net = addr & htonl(0xffff0000);

        return net == htonl(0xc0a80000);
}

/* True if @addr (network byte order) is in 169.254.0.0/16. */
static inline bool ipv4_is_linklocal_169(__be32 addr)
{
        const __be32 net = addr & htonl(0xffff0000);

        return net == htonl(0xa9fe0000);
}

/* True if @addr (network byte order) is in 192.88.99.0/24. */
static inline bool ipv4_is_anycast_6to4(__be32 addr)
{
        const __be32 net = addr & htonl(0xffffff00);

        return net == htonl(0xc0586300);
}

/* True if @addr (network byte order) is in 192.0.2.0/24. */
static inline bool ipv4_is_test_192(__be32 addr)
{
        const __be32 net = addr & htonl(0xffffff00);

        return net == htonl(0xc0000200);
}

/* True if @addr (network byte order) is in 198.18.0.0/15. */
static inline bool ipv4_is_test_198(__be32 addr)
{
        const __be32 net = addr & htonl(0xfffe0000);

        return net == htonl(0xc6120000);
}
#endif        /* _LINUX_IN_H */



























































    1 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM notifier

#if !defined(_TRACE_NOTIFIERS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NOTIFIERS_H

#include <linux/tracepoint.h>

/*
 * notifier_info - event class shared by the notifier events in this header
 *
 * @cb:                callback pointer
 *
 * The class stores the single @cb pointer it is given and prints it with
 * the "%ps" format.
 */
DECLARE_EVENT_CLASS(notifier_info,

        TP_PROTO(void *cb),

        TP_ARGS(cb),

        TP_STRUCT__entry(
                __field(void *, cb)
        ),

        TP_fast_assign(
                __entry->cb = cb;
        ),

        TP_printk("%ps", __entry->cb)
);

/*
 * notifier_register - called upon notifier callback registration
 *
 * @cb:                callback pointer
 *
 * Instance of the notifier_info event class; it takes the same single
 * @cb argument as the class.
 */
DEFINE_EVENT(notifier_info, notifier_register,

        TP_PROTO(void *cb),

        TP_ARGS(cb)
);

/*
 * notifier_unregister - called upon notifier callback unregistration
 *
 * @cb:                callback pointer
 *
 * Instance of the notifier_info event class; it takes the same single
 * @cb argument as the class.
 */
DEFINE_EVENT(notifier_info, notifier_unregister,

        TP_PROTO(void *cb),

        TP_ARGS(cb)
);

/*
 * notifier_run - called upon notifier callback execution
 *
 * @cb:                callback pointer
 *
 * Instance of the notifier_info event class; it takes the same single
 * @cb argument as the class.
 */
DEFINE_EVENT(notifier_info, notifier_run,

        TP_PROTO(void *cb),

        TP_ARGS(cb)
);

#endif /* _TRACE_NOTIFIERS_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
































    1 
    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
// SPDX-License-Identifier: GPL-2.0-only
/*
 * A generic implementation of binary search for the Linux kernel
 *
 * Copyright (C) 2008-2009 Ksplice, Inc.
 * Author: Tim Abbott <tabbott@ksplice.com>
 */

#include <linux/export.h>
#include <linux/bsearch.h>
#include <linux/kprobes.h>

/*
 * bsearch - look up a key in a sorted array by binary search
 * @key: pointer to the item being searched for
 * @base: pointer to the first element of the array
 * @num: number of elements in the array
 * @size: size of each element
 * @cmp: pointer to the comparison function
 *
 * The array must already be sorted in ascending order with respect to
 * @cmp.
 *
 * @key does not have to be of the element type: it may, for instance, be
 * a string that @cmp matches against the name field of each struct.  When
 * key and elements do share a type, one comparison function can serve both
 * sort() and bsearch().
 *
 * The search itself is delegated to __inline_bsearch(); this function is
 * the out-of-line wrapper that gets exported.
 */
void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
        void *match = __inline_bsearch(key, base, num, size, cmp);

        return match;
}
EXPORT_SYMBOL(bsearch);
NOKPROBE_SYMBOL(bsearch);
































































    1 
    1 


































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _INET_ECN_H_
#define _INET_ECN_H_

#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>

#include <net/inet_sock.h>
#include <net/dsfield.h>
#include <net/checksum.h>

/*
 * Values of the ECN field, which occupies the two low bits selected by
 * INET_ECN_MASK.  Note that INET_ECN_CE and INET_ECN_MASK share the
 * value 3, and that ECT(1) is numerically smaller than ECT(0).
 */
enum {
        INET_ECN_NOT_ECT = 0,
        INET_ECN_ECT_1 = 1,
        INET_ECN_ECT_0 = 2,
        INET_ECN_CE = 3,
        INET_ECN_MASK = 3,
};

/*
 * Defined outside this header.
 * NOTE(review): presumably a sysctl controlling tunnel ECN logging -
 * confirm at the definition.
 */
extern int sysctl_tunnel_ecn_log;

/* Returns 1 when the ECN bits of @dsfield hold the CE codepoint, else 0. */
static inline int INET_ECN_is_ce(__u8 dsfield)
{
        __u8 ecn_bits = dsfield & INET_ECN_MASK;

        return ecn_bits == INET_ECN_CE;
}

/* Returns 1 when the ECN bits of @dsfield are Not-ECT (both clear), else 0. */
static inline int INET_ECN_is_not_ect(__u8 dsfield)
{
        __u8 ecn_bits = dsfield & INET_ECN_MASK;

        return ecn_bits == INET_ECN_NOT_ECT;
}

/*
 * Returns the INET_ECN_ECT_0 bit of @dsfield: INET_ECN_ECT_0 (2) when it
 * is set, 0 otherwise.  The result is therefore non-zero for the ECT(0)
 * and CE codepoints and zero for Not-ECT and ECT(1).
 */
static inline int INET_ECN_is_capable(__u8 dsfield)
{
        int ect0_bit = dsfield & INET_ECN_ECT_0;

        return ect0_bit;
}

/*
 * RFC 3168 9.1.1
 *  Full-functionality ECN encapsulation: the outer header receives a copy
 *  of the inner ECN codepoint when the inner header is not-ECT or ECT, and
 *  receives ECT(0) when the inner header carries CE.
 *
 * @outer: dsfield of the outer header; its non-ECN bits are preserved
 * @inner: dsfield of the inner header; only its ECN bits are consulted
 *
 * Returns the new outer dsfield.
 */
static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner)
{
        __u8 ecn;

        if (INET_ECN_is_ce(inner))
                ecn = INET_ECN_ECT_0;
        else
                ecn = inner & INET_ECN_MASK;

        return (outer & ~INET_ECN_MASK) | ecn;
}

/*
 * Set the INET_ECN_ECT_0 bit in the socket's tos and, when inet6_sk()
 * yields a non-NULL pointer for @sk, in its tclass as well.
 */
static inline void INET_ECN_xmit(struct sock *sk)
{
        inet_sk(sk)->tos |= INET_ECN_ECT_0;

        if (!inet6_sk(sk))
                return;

        inet6_sk(sk)->tclass |= INET_ECN_ECT_0;
}

/*
 * Clear both ECN bits in the socket's tos and, when inet6_sk() yields a
 * non-NULL pointer for @sk, in its tclass as well.
 */
static inline void INET_ECN_dontxmit(struct sock *sk)
{
        inet_sk(sk)->tos &= ~INET_ECN_MASK;

        if (!inet6_sk(sk))
                return;

        inet6_sk(sk)->tclass &= ~INET_ECN_MASK;
}

/*
 * Clear the two ECN bits in @label.  The mask is INET_ECN_MASK shifted
 * left by 20 and passed through htonl(), so @label is treated as a
 * big-endian 32-bit word.  @label is modified in place.
 */
#define IP6_ECN_flow_init(label) do {                \
      (label) &= ~htonl(INET_ECN_MASK << 20);        \
    } while (0)

/*
 * If INET_ECN_is_capable() accepts the tclass of inet6_sk(@sk), set the
 * ECT(0) bit in @label (INET_ECN_ECT_0 shifted left by 20, in big-endian
 * form).  inet6_sk(@sk) is dereferenced without a NULL check here.
 */
#define        IP6_ECN_flow_xmit(sk, label) do {                                \
        if (INET_ECN_is_capable(inet6_sk(sk)->tclass))                        \
                (label) |= htonl(INET_ECN_ECT_0 << 20);                        \
    } while (0)

/*
 * Set the CE codepoint in an IPv4 header, patching iph->check to match.
 *
 * Returns 0 when the header is Not-ECT (nothing is modified), and 1 when
 * CE was already present (nothing is modified) or has just been set.
 */
static inline int IP_ECN_set_ce(struct iphdr *iph)
{
        /* Adding 1 rotates the codepoints so that one bit test sorts them. */
        u32 ecn = (iph->tos + 1) & INET_ECN_MASK;
        __be16 check_add;

        /*
         * After the last operation we have (in binary):
         * INET_ECN_NOT_ECT => 01
         * INET_ECN_ECT_1   => 10
         * INET_ECN_ECT_0   => 11
         * INET_ECN_CE      => 00
         */
        /* Bit 1 clear: Not-ECT (ecn == 1, return 0) or CE (ecn == 0, return 1). */
        if (!(ecn & 2))
                return !ecn;

        /*
         * The following gives us:
         * INET_ECN_ECT_1 => check += htons(0xFFFD)
         * INET_ECN_ECT_0 => check += htons(0xFFFE)
         */
        check_add = (__force __be16)((__force u16)htons(0xFFFB) +
                                     (__force u16)htons(ecn));

        iph->check = csum16_add(iph->check, check_add);
        iph->tos |= INET_ECN_CE;
        return 1;
}

/*
 * Turn an ECT(0) IPv4 header into ECT(1).
 *
 * Only acts when the current codepoint is ECT(0): both ECN bits are then
 * flipped and htons(0x1) is added to iph->check to compensate.
 *
 * Returns 1 if the header was changed, 0 if it was left alone.
 */
static inline int IP_ECN_set_ect1(struct iphdr *iph)
{
        int was_ect0 = (iph->tos & INET_ECN_MASK) == INET_ECN_ECT_0;

        if (was_ect0) {
                iph->check = csum16_add(iph->check, htons(0x1));
                iph->tos ^= INET_ECN_MASK;
        }

        return was_ect0;
}

/*
 * Reset both ECN bits of iph->tos to zero (Not-ECT).  iph->check is not
 * touched by this helper.
 */
static inline void IP_ECN_clear(struct iphdr *iph)
{
        iph->tos = iph->tos & ~INET_ECN_MASK;
}

/*
 * Write @dscp into the dsfield of @inner via ipv4_change_dsfield().
 * The ECN bits are stripped from @dscp first, and INET_ECN_MASK is passed
 * as the mask argument of ipv4_change_dsfield().
 */
static inline void ipv4_copy_dscp(unsigned int dscp, struct iphdr *inner)
{
        ipv4_change_dsfield(inner, INET_ECN_MASK, dscp & ~INET_ECN_MASK);
}

/* Forward declaration; the IPv6 helpers below take struct ipv6hdr pointers. */
struct ipv6hdr;

/* Note:
 * IP_ECN_set_ce() has to tweak IPV4 checksum when setting CE,
 * meaning both changes have no effect on skb->csum if/when CHECKSUM_COMPLETE
 * In IPv6 case, no checksum compensates the change in IPv6 header,
 * so we have to update skb->csum.
 */
static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph)
{
        __be32 from, to;

        if (INET_ECN_is_not_ect(ipv6_get_dsfield(iph)))
                return 0;

        from = *(__be32 *)iph;
        to = from | htonl(INET_ECN_CE << 20);
        *(__be32 *)iph = to;
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from),
                                     (__force __wsum)to);
        return 1;
}

static inline int IP6_ECN_set_ect1(struct sk_buff *skb, struct ipv6hdr *iph)
{
        __be32 from, to;

        if ((ipv6_get_dsfield(iph) & INET_ECN_MASK) != INET_ECN_ECT_0)
                return 0;

        from = *(__be32 *)iph;
        to = from ^ htonl(INET_ECN_MASK << 20);
        *(__be32 *)iph = to;
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from),
                                     (__force __wsum)to);
        return 1;
}

/*
 * Write @dscp into the dsfield of @inner via ipv6_change_dsfield().
 * The ECN bits are stripped from @dscp first, and INET_ECN_MASK is passed
 * as the mask argument of ipv6_change_dsfield().
 */
static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner)
{
        ipv6_change_dsfield(inner, INET_ECN_MASK, dscp & ~INET_ECN_MASK);
}

/*
 * Set CE on the network header of @skb, dispatching on skb_protocol().
 *
 * For IPv4 and IPv6 the header is only touched when a full struct iphdr /
 * struct ipv6hdr fits between skb_network_header() and
 * skb_tail_pointer().  Returns the result of IP_ECN_set_ce() or
 * IP6_ECN_set_ce(), or 0 when the header does not fit or the protocol is
 * neither of the two.
 */
static inline int INET_ECN_set_ce(struct sk_buff *skb)
{
        switch (skb_protocol(skb, true)) {
        case cpu_to_be16(ETH_P_IP):
                if (skb_network_header(skb) + sizeof(struct iphdr) >
                    skb_tail_pointer(skb))
                        break;
                return IP_ECN_set_ce(ip_hdr(skb));

        case cpu_to_be16(ETH_P_IPV6):
                if (skb_network_header(skb) + sizeof(struct ipv6hdr) >
                    skb_tail_pointer(skb))
                        break;
                return IP6_ECN_set_ce(skb, ipv6_hdr(skb));

        default:
                break;
        }

        return 0;
}

/*
 * Fetch the dsfield of the network header of @skb.
 *
 * Returns ipv4_get_dsfield() or ipv6_get_dsfield() of the header when the
 * protocol is IPv4 / IPv6 and pskb_network_may_pull() succeeds for the
 * corresponding header size; returns -1 in every other case.
 */
static inline int skb_get_dsfield(struct sk_buff *skb)
{
        switch (skb_protocol(skb, true)) {
        case cpu_to_be16(ETH_P_IP):
                if (pskb_network_may_pull(skb, sizeof(struct iphdr)))
                        return ipv4_get_dsfield(ip_hdr(skb));
                break;

        case cpu_to_be16(ETH_P_IPV6):
                if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
                        return ipv6_get_dsfield(ipv6_hdr(skb));
                break;

        default:
                break;
        }

        return -1;
}

/*
 * Convert ECT(0) to ECT(1) on the network header of @skb, dispatching on
 * skb_protocol().
 *
 * For IPv4 and IPv6 the header is only touched when a full struct iphdr /
 * struct ipv6hdr fits between skb_network_header() and
 * skb_tail_pointer().  Returns the result of IP_ECN_set_ect1() or
 * IP6_ECN_set_ect1(), or 0 when the header does not fit or the protocol
 * is neither of the two.
 */
static inline int INET_ECN_set_ect1(struct sk_buff *skb)
{
        switch (skb_protocol(skb, true)) {
        case cpu_to_be16(ETH_P_IP):
                if (skb_network_header(skb) + sizeof(struct iphdr) >
                    skb_tail_pointer(skb))
                        break;
                return IP_ECN_set_ect1(ip_hdr(skb));

        case cpu_to_be16(ETH_P_IPV6):
                if (skb_network_header(skb) + sizeof(struct ipv6hdr) >
                    skb_tail_pointer(skb))
                        break;
                return IP6_ECN_set_ect1(skb, ipv6_hdr(skb));

        default:
                break;
        }

        return 0;
}

/*
 * RFC 6040 4.2
 *  To decapsulate the inner header at the tunnel egress, a compliant
 *  tunnel egress MUST set the outgoing ECN field to the codepoint at the
 *  intersection of the appropriate arriving inner header (row) and outer
 *  header (column) in Figure 4
 *
 *      +---------+------------------------------------------------+
 *      |Arriving |            Arriving Outer Header               |
 *      |   Inner +---------+------------+------------+------------+
 *      |  Header | Not-ECT | ECT(0)     | ECT(1)     |     CE     |
 *      +---------+---------+------------+------------+------------+
 *      | Not-ECT | Not-ECT |Not-ECT(!!!)|Not-ECT(!!!)| <drop>(!!!)|
 *      |  ECT(0) |  ECT(0) | ECT(0)     | ECT(1)     |     CE     |
 *      |  ECT(1) |  ECT(1) | ECT(1) (!) | ECT(1)     |     CE     |
 *      |    CE   |      CE |     CE     |     CE(!!!)|     CE     |
 *      +---------+---------+------------+------------+------------+
 *
 *             Figure 4: New IP in IP Decapsulation Behaviour
 *
 *  returns 0 on success
 *          1 if something is broken and should be logged (!!! above)
 *          2 if packet should be dropped
 */
static inline int __INET_ECN_decapsulate(__u8 outer, __u8 inner, bool *set_ce)
{
        /*
         * Not-ECT inner header: the outcome depends only on the outer
         * codepoint, and *set_ce is left exactly as the caller passed it.
         */
        if (INET_ECN_is_not_ect(inner)) {
                __u8 outer_ecn = outer & INET_ECN_MASK;

                if (outer_ecn == INET_ECN_NOT_ECT)
                        return 0;
                if (outer_ecn == INET_ECN_CE)
                        return 2;
                /* Only ECT(0) and ECT(1) are left at this point. */
                return 1;
        }

        /* ECN-capable inner header: ask for CE iff the outer header has it. */
        *set_ce = INET_ECN_is_ce(outer);
        return 0;
}

/*
 * Apply the decapsulation decision of __INET_ECN_decapsulate() to @skb.
 *
 * A non-zero verdict is returned to the caller without touching @skb.
 * On a zero verdict the inner header is marked CE when that was requested,
 * or else moved to ECT(1) when the outer codepoint is ECT(1); the return
 * values of INET_ECN_set_ce() / INET_ECN_set_ect1() are ignored and 0 is
 * returned.
 */
static inline int INET_ECN_decapsulate(struct sk_buff *skb,
                                       __u8 outer, __u8 inner)
{
        bool set_ce = false;
        int rc = __INET_ECN_decapsulate(outer, inner, &set_ce);

        if (rc)
                return rc;

        if (set_ce)
                INET_ECN_set_ce(skb);
        else if ((outer & INET_ECN_MASK) == INET_ECN_ECT_1)
                INET_ECN_set_ect1(skb);

        return 0;
}

/*
 * ECN decapsulation for a packet whose outer header is IPv4.
 *
 * The outer dsfield is taken from @oiph->tos; the inner one is read from
 * the IPv4 or IPv6 header of @skb according to skb_protocol().  Any other
 * inner protocol yields 0 without calling INET_ECN_decapsulate().
 */
static inline int IP_ECN_decapsulate(const struct iphdr *oiph,
                                     struct sk_buff *skb)
{
        switch (skb_protocol(skb, true)) {
        case htons(ETH_P_IP):
                return INET_ECN_decapsulate(skb, oiph->tos,
                                            ip_hdr(skb)->tos);
        case htons(ETH_P_IPV6):
                return INET_ECN_decapsulate(skb, oiph->tos,
                                            ipv6_get_dsfield(ipv6_hdr(skb)));
        default:
                return 0;
        }
}

/*
 * ECN decapsulation for a packet whose outer header is IPv6.
 *
 * The outer dsfield is ipv6_get_dsfield(@oipv6h); the inner one is read
 * from the IPv4 or IPv6 header of @skb according to skb_protocol().  Any
 * other inner protocol yields 0 without calling INET_ECN_decapsulate().
 */
static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h,
                                      struct sk_buff *skb)
{
        switch (skb_protocol(skb, true)) {
        case htons(ETH_P_IP):
                return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h),
                                            ip_hdr(skb)->tos);
        case htons(ETH_P_IPV6):
                return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h),
                                            ipv6_get_dsfield(ipv6_hdr(skb)));
        default:
                return 0;
        }
}
#endif





























































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2018 Facebook */

#ifndef _LINUX_BTF_H
#define _LINUX_BTF_H 1

#include <linux/types.h>
#include <linux/bpfptr.h>
#include <linux/bsearch.h>
#include <linux/btf_ids.h>
#include <uapi/linux/btf.h>
#include <uapi/linux/bpf.h>

/*
 * Both macros expand to an expression cast to void: BTF_TYPE_EMIT()
 * mentions @type through a cast of 0 to a pointer to it, and
 * BTF_TYPE_EMIT_ENUM() mentions @enum_val itself.
 * NOTE(review): presumably used so the compiler emits type information
 * for otherwise unreferenced types/enumerators - confirm.
 */
#define BTF_TYPE_EMIT(type) ((void)(type *)0)
#define BTF_TYPE_EMIT_ENUM(enum_val) ((void)enum_val)

/*
 * kfunc flags.  Each flag is a distinct single bit, so flags can be OR-ed
 * together.
 *
 * These need to be macros, as the expressions are used in assembler input.
 */
#define KF_ACQUIRE        (1 << 0) /* kfunc is an acquire function */
#define KF_RELEASE        (1 << 1) /* kfunc is a release function */
#define KF_RET_NULL        (1 << 2) /* kfunc returns a pointer that may be NULL */
/* Trusted arguments are those which are guaranteed to be valid when passed to
 * the kfunc. It is used to enforce that pointers obtained from either acquire
 * kfuncs, or from the main kernel on a tracepoint or struct_ops callback
 * invocation, remain unmodified when being passed to helpers taking trusted
 * args.
 *
 * Consider, for example, the following new task tracepoint:
 *
 *        SEC("tp_btf/task_newtask")
 *        int BPF_PROG(new_task_tp, struct task_struct *task, u64 clone_flags)
 *        {
 *                ...
 *        }
 *
 * And the following kfunc:
 *
 *        BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
 *
 * All invocations to the kfunc must pass the unmodified, unwalked task:
 *
 *        bpf_task_acquire(task);                    // Allowed
 *        bpf_task_acquire(task->last_wakee); // Rejected, walked task
 *
 * Programs may also pass referenced tasks directly to the kfunc:
 *
 *        struct task_struct *acquired;
 *
 *        acquired = bpf_task_acquire(task);        // Allowed, same as above
 *        bpf_task_acquire(acquired);                // Allowed
 *        bpf_task_acquire(task);                        // Allowed
 *        bpf_task_acquire(acquired->last_wakee); // Rejected, walked task
 *
 * Programs may _not_, however, pass a task from an arbitrary fentry/fexit, or
 * kprobe/kretprobe to the kfunc, as BPF cannot guarantee that all of these
 * pointers are safe. For example, the following BPF program would be
 * rejected:
 *
 * SEC("kretprobe/free_task")
 * int BPF_PROG(free_task_probe, struct task_struct *tsk)
 * {
 *        struct task_struct *acquired;
 *
 *        acquired = bpf_task_acquire(tsk); // Rejected, not a trusted pointer
 *        bpf_task_release(acquired);
 *
 *        return 0;
 * }
 */
/* Note: bit 3 is not assigned to any flag in this header. */
#define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */
#define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
#define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
#define KF_RCU          (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
/* only one of KF_ITER_{NEW,NEXT,DESTROY} could be specified per kfunc */
#define KF_ITER_NEW     (1 << 8) /* kfunc implements BPF iter constructor */
#define KF_ITER_NEXT    (1 << 9) /* kfunc implements BPF iter next method */
#define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */
#define KF_RCU_PROTECTED (1 << 11) /* kfunc should be protected by rcu cs when they are invoked */

/*
 * Tag marking a kernel function as a kfunc. It bundles the attributes
 * (__used, __retain, noinline) that kfunc authors would otherwise have to
 * copy-paste for correctness, so as to avoid issues such as the compiler
 * inlining or eliding either a static kfunc, or a global kfunc in an LTO
 * build.
 */
#define __bpf_kfunc __used __retain noinline

/*
 * Bracket a group of kfunc definitions.  __bpf_kfunc_start_defs() pushes
 * the diagnostic state and ignores -Wmissing-declarations and
 * -Wmissing-prototypes; __bpf_kfunc_end_defs() pops that state again.
 * The start macro's expansion ends without a semicolon, so the invocation
 * supplies it.
 */
#define __bpf_kfunc_start_defs()                                               \
        __diag_push();                                                               \
        __diag_ignore_all("-Wmissing-declarations",                               \
                          "Global kfuncs as their definitions will be in BTF");\
        __diag_ignore_all("-Wmissing-prototypes",                               \
                          "Global kfuncs as their definitions will be in BTF")

#define __bpf_kfunc_end_defs() __diag_pop()
/* The hook variants are plain aliases of the kfunc start/end macros. */
#define __bpf_hook_start() __bpf_kfunc_start_defs()
#define __bpf_hook_end() __bpf_kfunc_end_defs()

/*
 * Return the name of the passed struct if it exists, or halt the build
 * if it does not (for example because the structure got renamed). This
 * way, developers have to revisit the code using that structure name and
 * update it accordingly.
 *
 * The sizeof(struct x) inside BUILD_BUG_ON() is what requires struct x to
 * be a complete type; the "< 0" comparison itself can never be true.
 */
#define stringify_struct(x)                        \
        ({ BUILD_BUG_ON(sizeof(struct x) < 0);        \
           __stringify(x); })

/* Forward declarations for types that this header only uses via pointers. */
struct btf;
struct btf_member;
struct btf_type;
union bpf_attr;
struct btf_show;
struct btf_id_set;
struct bpf_prog;

/*
 * Callback type stored in struct btf_kfunc_id_set::filter; it receives a
 * program and a kfunc id and returns an int.
 * NOTE(review): the meaning of the return value is not visible here -
 * confirm against the callers.
 */
typedef int (*btf_kfunc_filter_t)(const struct bpf_prog *prog, u32 kfunc_id);

/* Describes one set of kfunc ids together with an optional-looking filter. */
struct btf_kfunc_id_set {
        struct module *owner;                /* presumably the module providing the kfuncs - confirm */
        struct btf_id_set8 *set;        /* the ids, stored in a btf_id_set8 */
        btf_kfunc_filter_t filter;        /* per-program filter callback; NOTE(review): may be NULL? confirm */
};

/*
 * Pairs two BTF ids.
 * NOTE(review): by the names, btf_id is a type and kfunc_btf_id is the
 * destructor kfunc for that type - confirm against the users.
 */
struct btf_id_dtor_kfunc {
        u32 btf_id;
        u32 kfunc_btf_id;
};

/* Associates a BTF id with a struct btf_record pointer. */
struct btf_struct_meta {
        u32 btf_id;
        struct btf_record *record;
};

/* A counted table of btf_struct_meta entries. */
struct btf_struct_metas {
        u32 cnt;                        /* presumably the number of entries in types[] - confirm */
        struct btf_struct_meta types[];        /* flexible array member */
};

/* Defined outside this header. */
extern const struct file_operations btf_fops;

/*
 * Declarations only; the definitions are outside this header.
 * NOTE(review): btf_get()/btf_put() look like a reference-count pair and
 * btf_new_fd()/btf_get_by_fd() like the fd-based entry points - confirm
 * against the implementation.
 */
const char *btf_get_name(const struct btf *btf);
void btf_get(struct btf *btf);
void btf_put(struct btf *btf);
int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz);
struct btf *btf_get_by_fd(int fd);
int btf_get_info_by_fd(const struct btf *btf,
                       const union bpf_attr *attr,
                       union bpf_attr __user *uattr);
/* Figure out the size of the type named by *type_id.  If that type is a
 * modifier (e.g. const), it is resolved to find the type that carries
 * the size.
 *
 * For example:
 * In describing "const void *", type_id is "const" and "const"
 * refers to "void *".  The return type will be "void *".
 *
 * If type_id is a simple "int", then the return type will be "int".
 *
 * @btf: struct btf object
 * @type_id: In/out.  On entry, the type whose size is wanted.  On
 *           success, *type_id is set to the type_id of the return type.
 * @ret_size: It can be NULL.  If not NULL, the size of the return
 *            type is stored in *ret_size.
 * Return: The btf_type (resolved to another type with size info if needed).
 *         NULL is returned if type_id itself does not have size info
 *         (e.g. void) or it cannot be resolved to another type that
 *         has size info.
 *         *type_id and *ret_size will not be changed in the
 *         NULL return case.
 */
const struct btf_type *btf_type_id_size(const struct btf *btf,
                                        u32 *type_id,
                                        u32 *ret_size);

/*
 * Options to control show behaviour.
 *        - BTF_SHOW_COMPACT: no formatting around type information
 *        - BTF_SHOW_NONAME: no struct/union member names/types
 *        - BTF_SHOW_PTR_RAW: show raw (unobfuscated) pointer values;
 *          equivalent to %px.
 *        - BTF_SHOW_ZERO: show zero-valued struct/union members; they
 *          are not displayed by default
 *        - BTF_SHOW_UNSAFE: skip use of bpf_probe_read() to safely read
 *          data before displaying it.
 *
 * The first four are aliases of the corresponding BTF_F_* flags;
 * BTF_SHOW_UNSAFE is defined here directly as bit 4.
 */
#define BTF_SHOW_COMPACT        BTF_F_COMPACT
#define BTF_SHOW_NONAME                BTF_F_NONAME
#define BTF_SHOW_PTR_RAW        BTF_F_PTR_RAW
#define BTF_SHOW_ZERO                BTF_F_ZERO
#define BTF_SHOW_UNSAFE                (1ULL << 4)

/*
 * Show helpers writing to a seq_file.  The _flags variant additionally
 * takes a u64 @flags argument (presumably the BTF_SHOW_* options above -
 * confirm) and returns an int.
 */
void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
                       struct seq_file *m);
int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, void *obj,
                            struct seq_file *m, u64 flags);

/*
 * Copy up to len bytes of the string representation of obj, interpreted
 * as BTF type type_id, into buf.
 *
 * @btf: struct btf object
 * @type_id: type id of the type obj points to
 * @obj: pointer to typed data
 * @buf: buffer to write to
 * @len: maximum length to write to buf
 * @flags: show options (the BTF_SHOW_* flags)
 *
 * Return: the length that was, or would have been, copied (as per
 *           snprintf), or a negative error.
 */
int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj,
                           char *buf, int len, u64 flags);

/*
 * Declarations only; the definitions are outside this header, so the
 * groupings below are by signature and name, not by verified behaviour.
 */

/* Queries about a struct btf object as a whole. */
int btf_get_fd_by_id(u32 id);
u32 btf_obj_id(const struct btf *btf);
bool btf_is_kernel(const struct btf *btf);
bool btf_is_module(const struct btf *btf);
struct module *btf_try_get_module(const struct btf *btf);
u32 btf_nr_types(const struct btf *btf);
/* Member and field inspection. */
bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
                           const struct btf_member *m,
                           u32 expected_offset, u32 expected_size);
struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
                                    u32 field_mask, u32 value_size);
int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec);
bool btf_type_is_void(const struct btf_type *t);
/* Lookup by name; both return an s32. */
s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind);
s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p);
/* Type resolution helpers; each takes an id and an output res_id pointer. */
const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
                                               u32 id, u32 *res_id);
const struct btf_type *btf_type_resolve_ptr(const struct btf *btf,
                                            u32 id, u32 *res_id);
const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
                                                 u32 id, u32 *res_id);
const struct btf_type *
btf_resolve_size(const struct btf *btf, const struct btf_type *type,
                 u32 *type_size);
const char *btf_type_str(const struct btf_type *t);

/*
 * Iterate over the members of @struct_type: @i counts from 0 up to
 * btf_type_vlen(@struct_type) - 1 while @member walks the array returned
 * by btf_type_member().  Both @i and @member are assigned by the macro,
 * and @struct_type is evaluated on every iteration.
 */
#define for_each_member(i, struct_type, member)                        \
        for (i = 0, member = btf_type_member(struct_type);        \
             i < btf_type_vlen(struct_type);                        \
             i++, member++)

/*
 * Iterate over the btf_var_secinfo entries of @datasec_type: @i counts
 * from 0 up to btf_type_vlen(@datasec_type) - 1 while @member walks the
 * array returned by btf_type_var_secinfo().  Both @i and @member are
 * assigned by the macro, and @datasec_type is evaluated on every iteration.
 */
#define for_each_vsi(i, datasec_type, member)                        \
        for (i = 0, member = btf_type_var_secinfo(datasec_type);        \
             i < btf_type_vlen(datasec_type);                        \
             i++, member++)

static inline bool btf_type_is_ptr(const struct btf_type *t)
{
        return BTF_INFO_KIND(t->info) == BTF_KIND_PTR;
}

static inline bool btf_type_is_int(const struct btf_type *t)
{
        return BTF_INFO_KIND(t->info) == BTF_KIND_INT;
}

/* True when @t is an int kind whose size does not exceed sizeof(u64). */
static inline bool btf_type_is_small_int(const struct btf_type *t)
{
        if (!btf_type_is_int(t))
                return false;

        return t->size <= sizeof(u64);
}

static inline u8 btf_int_encoding(const struct btf_type *t)
{
        return BTF_INT_ENCODING(*(u32 *)(t + 1));
}

/* True when @t is an int kind whose encoding has BTF_INT_SIGNED set. */
static inline bool btf_type_is_signed_int(const struct btf_type *t)
{
        if (!btf_type_is_int(t))
                return false;

        return btf_int_encoding(t) & BTF_INT_SIGNED;
}

static inline bool btf_type_is_enum(const struct btf_type *t)
{
        return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM;
}

static inline bool btf_is_any_enum(const struct btf_type *t)
{
        return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM ||
               BTF_INFO_KIND(t->info) == BTF_KIND_ENUM64;
}

/*
 * True when @t1 and @t2 have the same kind, or when both are enums of
 * either flavour (ENUM / ENUM64).
 */
static inline bool btf_kind_core_compat(const struct btf_type *t1,
                                        const struct btf_type *t2)
{
        if (BTF_INFO_KIND(t1->info) == BTF_INFO_KIND(t2->info))
                return true;

        return btf_is_any_enum(t1) && btf_is_any_enum(t2);
}

/* True when @s is a NULL pointer or points at an empty string. */
static inline bool str_is_empty(const char *s)
{
        if (!s)
                return true;

        return *s == '\0';
}

/* Return the kind encoded in @t->info, as extracted by BTF_INFO_KIND(). */
static inline u16 btf_kind(const struct btf_type *t)
{
        u16 kind = BTF_INFO_KIND(t->info);

        return kind;
}

/* True when @t is BTF_KIND_ENUM; same test as btf_type_is_enum(). */
static inline bool btf_is_enum(const struct btf_type *t)
{
        return btf_type_is_enum(t);
}

static inline bool btf_is_enum64(const struct btf_type *t)
{
        return btf_kind(t) == BTF_KIND_ENUM64;
}

/* Combine @e->val_hi32 (upper 32 bits) and @e->val_lo32 (lower) into a u64. */
static inline u64 btf_enum64_value(const struct btf_enum64 *e)
{
        u64 upper = e->val_hi32;
        u64 lower = e->val_lo32;

        return (upper << 32) | lower;
}

/* True when @t is a struct or a union. */
static inline bool btf_is_composite(const struct btf_type *t)
{
        switch (btf_kind(t)) {
        case BTF_KIND_STRUCT:
        case BTF_KIND_UNION:
                return true;
        default:
                return false;
        }
}

static inline bool btf_is_array(const struct btf_type *t)
{
        return btf_kind(t) == BTF_KIND_ARRAY;
}

/* True when @t is BTF_KIND_INT; same test as btf_type_is_int(). */
static inline bool btf_is_int(const struct btf_type *t)
{
        return btf_type_is_int(t);
}

/* True when @t is BTF_KIND_PTR; same test as btf_type_is_ptr(). */
static inline bool btf_is_ptr(const struct btf_type *t)
{
        return btf_type_is_ptr(t);
}

static inline u8 btf_int_offset(const struct btf_type *t)
{
        return BTF_INT_OFFSET(*(u32 *)(t + 1));
}

/* True when @t is an int or a (32-bit kind) enum. */
static inline bool btf_type_is_scalar(const struct btf_type *t)
{
        if (btf_type_is_int(t))
                return true;

        return btf_type_is_enum(t);
}

/* True when the kind of @t is BTF_KIND_TYPEDEF. */
static inline bool btf_type_is_typedef(const struct btf_type *t)
{
        return btf_kind(t) == BTF_KIND_TYPEDEF;
}

/* True when the kind of @t is BTF_KIND_VOLATILE. */
static inline bool btf_type_is_volatile(const struct btf_type *t)
{
        return btf_kind(t) == BTF_KIND_VOLATILE;
}

/* True when the kind of @t is BTF_KIND_FUNC. */
static inline bool btf_type_is_func(const struct btf_type *t)
{
        return btf_kind(t) == BTF_KIND_FUNC;
}

/* True when the kind of @t is BTF_KIND_FUNC_PROTO. */
static inline bool btf_type_is_func_proto(const struct btf_type *t)
{
        return btf_kind(t) == BTF_KIND_FUNC_PROTO;
}

/* True when the kind of @t is BTF_KIND_VAR. */
static inline bool btf_type_is_var(const struct btf_type *t)
{
        return btf_kind(t) == BTF_KIND_VAR;
}

/* True when the kind of @t is BTF_KIND_TYPE_TAG. */
static inline bool btf_type_is_type_tag(const struct btf_type *t)
{
        return btf_kind(t) == BTF_KIND_TYPE_TAG;
}

/* True for both structs and unions: a union is treated here as the
 * special case of a struct in which every offsetof(member) == 0.
 */
static inline bool btf_type_is_struct(const struct btf_type *t)
{
        u8 kind = BTF_INFO_KIND(t->info);

        switch (kind) {
        case BTF_KIND_STRUCT:
        case BTF_KIND_UNION:
                return true;
        default:
                return false;
        }
}

/* Strict variant: true only for BTF_KIND_STRUCT, not for unions. */
static inline bool __btf_type_is_struct(const struct btf_type *t)
{
        return btf_kind(t) == BTF_KIND_STRUCT;
}

/* True when @t is BTF_KIND_ARRAY; same test as btf_is_array(). */
static inline bool btf_type_is_array(const struct btf_type *t)
{
        return btf_is_array(t);
}

/* Return the vlen bits of @t->info, as extracted by BTF_INFO_VLEN(). */
static inline u16 btf_type_vlen(const struct btf_type *t)
{
        u16 vlen = BTF_INFO_VLEN(t->info);

        return vlen;
}

static inline u16 btf_vlen(const struct btf_type *t)
{
        return btf_type_vlen(t);
}

/*
 * Return the linkage of a function type.  The value is read from the same
 * vlen bits of @t->info that btf_type_vlen() returns.
 */
static inline u16 btf_func_linkage(const struct btf_type *t)
{
        return btf_type_vlen(t);
}

/* True when BTF_INFO_KFLAG() of @t->info is non-zero. */
static inline bool btf_type_kflag(const struct btf_type *t)
{
        return BTF_INFO_KFLAG(t->info) != 0;
}

/*
 * Return the bit offset of @member within @struct_type.  When the kflag
 * of @struct_type is set, member->offset is decoded with
 * BTF_MEMBER_BIT_OFFSET(); otherwise member->offset is returned as is.
 */
static inline u32 __btf_member_bit_offset(const struct btf_type *struct_type,
                                          const struct btf_member *member)
{
        if (btf_type_kflag(struct_type))
                return BTF_MEMBER_BIT_OFFSET(member->offset);

        return member->offset;
}

/*
 * Return the bitfield size of @member within @struct_type.  When the
 * kflag of @struct_type is set, it is decoded from member->offset with
 * BTF_MEMBER_BITFIELD_SIZE(); otherwise 0 is returned.
 */
static inline u32 __btf_member_bitfield_size(const struct btf_type *struct_type,
                                             const struct btf_member *member)
{
        if (btf_type_kflag(struct_type))
                return BTF_MEMBER_BITFIELD_SIZE(member->offset);

        return 0;
}

/*
 * Return a (non-const) btf_member pointer to the memory that directly
 * follows the struct btf_type header of @t.  No kind check is done.
 */
static inline struct btf_member *btf_members(const struct btf_type *t)
{
        const struct btf_type *past_header = t + 1;

        return (struct btf_member *)past_header;
}

/*
 * Bit offset of the member at index @member_idx of @t.  @member_idx is
 * not range-checked here.
 */
static inline u32 btf_member_bit_offset(const struct btf_type *t, u32 member_idx)
{
        return __btf_member_bit_offset(t, &btf_members(t)[member_idx]);
}

/* Bitfield size of the member at index @member_idx in the member array of @t. */
static inline u32 btf_member_bitfield_size(const struct btf_type *t, u32 member_idx)
{
        return __btf_member_bitfield_size(t, &btf_members(t)[member_idx]);
}

/* Const view of the data that follows @t, typed as struct btf_member. */
static inline const struct btf_member *btf_type_member(const struct btf_type *t)
{
        const struct btf_type *trailer = t + 1;

        return (const struct btf_member *)trailer;
}

/* Return the address right after @t, viewed as a struct btf_array. */
static inline struct btf_array *btf_array(const struct btf_type *t)
{
        return (struct btf_array *)&t[1];
}

/* Return the address right after @t, viewed as an array of struct btf_enum. */
static inline struct btf_enum *btf_enum(const struct btf_type *t)
{
        return (struct btf_enum *)&t[1];
}

/* Return the address right after @t, viewed as an array of struct btf_enum64. */
static inline struct btf_enum64 *btf_enum64(const struct btf_type *t)
{
        return (struct btf_enum64 *)&t[1];
}

/* Const view of the data that follows @t, typed as struct btf_var_secinfo. */
static inline const struct btf_var_secinfo *
btf_type_var_secinfo(const struct btf_type *t)
{
        const struct btf_type *trailer = t + 1;

        return (const struct btf_var_secinfo *)trailer;
}

/* Return the address right after @t, viewed as an array of struct btf_param. */
static inline struct btf_param *btf_params(const struct btf_type *t)
{
        return (struct btf_param *)&t[1];
}

/*
 * Comparator for bsearch() over BTF id tables.
 *
 * @a and @b each point at an int-sized key. The keys are compared as
 * signed ints and the result is negative, zero or positive when *a is
 * less than, equal to or greater than *b.
 *
 * The result is built from two relational tests instead of "*a - *b":
 * the subtraction overflows when the operands are far apart (for
 * example INT_MIN vs INT_MAX) and then reports the wrong sign.
 */
static inline int btf_id_cmp_func(const void *a, const void *b)
{
        const int *pa = a, *pb = b;

        return (*pa > *pb) - (*pa < *pb);
}

/*
 * Binary-search set->ids (set->cnt entries of sizeof(u32) each) for @id
 * using btf_id_cmp_func(); true when bsearch() finds an entry.
 */
static inline bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
{
        const void *hit;

        hit = bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func);
        if (hit == NULL)
                return false;

        return true;
}

/*
 * Binary-search set->pairs (set->cnt entries) for @id using
 * btf_id_cmp_func() and hand back whatever bsearch() returns: the
 * matching entry, or NULL when there is none.
 */
static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id)
{
        void *entry;

        entry = bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]),
                        btf_id_cmp_func);
        return entry;
}

/*
 * Helpers implemented outside this header.
 * NOTE(review): going by the names, btf_param_match_suffix() presumably
 * tests whether the name of @arg ends in @suffix, and btf_ctx_arg_offset()
 * presumably yields the context offset of argument @arg_no of @func_proto;
 * confirm against the definitions.
 */
bool btf_param_match_suffix(const struct btf *btf,
                            const struct btf_param *arg,
                            const char *suffix);
int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto,
                       u32 arg_no);

/* Forward declaration; only pointers to it are used in the prototypes below. */
struct bpf_verifier_log;

/*
 * struct_ops lookup/registration entry points. They are only declared
 * when both CONFIG_BPF_JIT and CONFIG_BPF_SYSCALL are defined; otherwise
 * only bpf_struct_ops_find() exists, as a stub that always returns NULL.
 */
#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
struct bpf_struct_ops;
int __register_bpf_struct_ops(struct bpf_struct_ops *st_ops);
const struct bpf_struct_ops_desc *bpf_struct_ops_find_value(struct btf *btf, u32 value_id);
const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id);
#else
/* Stub: nothing can be found without struct_ops support. */
static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id)
{
        return NULL;
}
#endif

#ifdef CONFIG_BPF_SYSCALL
/*
 * BTF services that are implemented elsewhere and only available with
 * CONFIG_BPF_SYSCALL. Several of them have inline fallbacks in the
 * #else branch below.
 */
const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
const char *btf_name_by_offset(const struct btf *btf, u32 offset);
struct btf *btf_parse_vmlinux(void);
struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog);
/* kfunc id-set queries and registration */
u32 *btf_kfunc_id_set_contains(const struct btf *btf, u32 kfunc_btf_id,
                               const struct bpf_prog *prog);
u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id,
                                const struct bpf_prog *prog);
int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
                              const struct btf_kfunc_id_set *s);
int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset);
/* destructor kfunc lookup and registration */
s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id);
int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt,
                                struct module *owner);
struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id);
/* program context type helpers */
bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
                           const struct btf_type *t, enum bpf_prog_type prog_type,
                           int arg);
int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type);
bool btf_types_are_same(const struct btf *btf1, u32 id1,
                        const struct btf *btf2, u32 id2);
#else
/* !CONFIG_BPF_SYSCALL fallback: there are no types to look up, so NULL. */
static inline const struct btf_type *
btf_type_by_id(const struct btf *btf, u32 type_id)
{
        return NULL;
}
/* !CONFIG_BPF_SYSCALL fallback: no string section is available, so NULL. */
static inline const char *
btf_name_by_offset(const struct btf *btf, u32 offset)
{
        return NULL;
}
/*
 * !CONFIG_BPF_SYSCALL fallback: no kfunc id set exists, so the lookup
 * always yields NULL.
 *
 * @prog is const-qualified so that the stub accepts exactly what the
 * real prototype in the CONFIG_BPF_SYSCALL branch accepts; callers that
 * hold a "const struct bpf_prog *" build cleanly in both configurations.
 */
static inline u32 *btf_kfunc_id_set_contains(const struct btf *btf,
                                             u32 kfunc_btf_id,
                                             const struct bpf_prog *prog)
{
        return NULL;
}
/* !CONFIG_BPF_SYSCALL fallback: registration is a no-op that returns 0. */
static inline int
register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
                          const struct btf_kfunc_id_set *s)
{
        return 0;
}
/* !CONFIG_BPF_SYSCALL fallback: no destructor kfunc exists, so -ENOENT. */
static inline s32
btf_find_dtor_kfunc(struct btf *btf, u32 btf_id)
{
        return -ENOENT;
}
/* !CONFIG_BPF_SYSCALL fallback: registration is a no-op that returns 0. */
static inline int
register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors,
                            u32 add_cnt, struct module *owner)
{
        return 0;
}
/* !CONFIG_BPF_SYSCALL fallback: no struct metadata is tracked, so NULL. */
static inline struct btf_struct_meta *
btf_find_struct_meta(const struct btf *btf, u32 btf_id)
{
        return NULL;
}
/* !CONFIG_BPF_SYSCALL fallback: no type is ever a program context type. */
static inline bool btf_is_prog_ctx_type(struct bpf_verifier_log *log,
                                        const struct btf *btf,
                                        const struct btf_type *t,
                                        enum bpf_prog_type prog_type,
                                        int arg)
{
        return false;
}
/* !CONFIG_BPF_SYSCALL fallback: no context BTF id can be resolved, so -EINVAL. */
static inline int
get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type)
{
        return -EINVAL;
}
/* !CONFIG_BPF_SYSCALL fallback: types are never reported as identical. */
static inline bool
btf_types_are_same(const struct btf *btf1, u32 id1,
                   const struct btf *btf2, u32 id2)
{
        return false;
}
#endif

/*
 * True when @t is a pointer and the type it refers to, once
 * btf_type_skip_modifiers() has been applied to t->type, satisfies
 * btf_type_is_struct().
 */
static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t)
{
        const struct btf_type *pointee;

        if (btf_type_is_ptr(t)) {
                pointee = btf_type_skip_modifiers(btf, t->type, NULL);
                return btf_type_is_struct(pointee);
        }

        return false;
}

#endif






























































































































































































































































































































































































    2 
    2 
















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NDISC_H
#define _NDISC_H

#include <net/ipv6_stubs.h>

/*
 *        ICMPv6 message types used by neighbour discovery. The helpers
 *        further down compare these against an icmp6_type value.
 */

#define NDISC_ROUTER_SOLICITATION        133
#define NDISC_ROUTER_ADVERTISEMENT        134
#define NDISC_NEIGHBOUR_SOLICITATION        135
#define NDISC_NEIGHBOUR_ADVERTISEMENT        136
#define NDISC_REDIRECT                        137

/*
 * Router type: cross-layer information passed from the link layer to
 * the IPv6 layer, reported by certain link types (e.g., RFC4214).
 */
#define NDISC_NODETYPE_UNSPEC                0        /* unspecified (default) */
#define NDISC_NODETYPE_HOST                1        /* host or unauthorized router */
#define NDISC_NODETYPE_NODEFAULT        2        /* non-default router */
#define NDISC_NODETYPE_DEFAULT                3        /* default router */

/*
 *        ndisc options
 *
 *        Option type numbers. Values below __ND_OPT_ARRAY_MAX can be used
 *        as an index into ndisc_options.nd_opt_array[], which is sized by
 *        that enumerator. Slot 0 (__ND_OPT_PREFIX_INFO_END) is not an
 *        option type of its own; it backs the nd_opts_pi_end accessor.
 */

enum {
        __ND_OPT_PREFIX_INFO_END = 0,
        ND_OPT_SOURCE_LL_ADDR = 1,        /* RFC2461 */
        ND_OPT_TARGET_LL_ADDR = 2,        /* RFC2461 */
        ND_OPT_PREFIX_INFO = 3,                /* RFC2461 */
        ND_OPT_REDIRECT_HDR = 4,        /* RFC2461 */
        ND_OPT_MTU = 5,                        /* RFC2461 */
        ND_OPT_NONCE = 14,              /* RFC7527 */
        /* implicitly ND_OPT_NONCE + 1; must stay directly after the last array-indexed option */
        __ND_OPT_ARRAY_MAX,
        ND_OPT_ROUTE_INFO = 24,                /* RFC4191 */
        ND_OPT_RDNSS = 25,                /* RFC5006 */
        ND_OPT_DNSSL = 31,                /* RFC6106 */
        ND_OPT_6CO = 34,                /* RFC6775 */
        ND_OPT_CAPTIVE_PORTAL = 37,        /* RFC7710 */
        ND_OPT_PREF64 = 38,                /* RFC8781 */
        __ND_OPT_MAX
};

/*
 * Neighbour discovery timing constants, all expressed as multiples of HZ.
 * NOTE(review): presumably jiffies, i.e. 1 s, 30 s and 1 s - confirm.
 */
#define MAX_RTR_SOLICITATION_DELAY        HZ

#define ND_REACHABLE_TIME                (30*HZ)
#define ND_RETRANS_TIMER                HZ

#include <linux/compiler.h>
#include <linux/icmpv6.h>
#include <linux/in6.h>
#include <linux/types.h>
#include <linux/if_arp.h>
#include <linux/netdevice.h>
#include <linux/hash.h>

#include <net/neighbour.h>

/* Set to 3 to get tracing... */
#define ND_DEBUG 1

/*
 * ND_PRINTK - emit a rate-limited message if its verbosity is enabled.
 * @val:   verbosity of this message; it is printed when @val <= ND_DEBUG
 * @level: log level suffix, pasted into net_<level>_ratelimited()
 * @fmt:   format string, followed by its optional arguments
 *
 * @val is parenthesised so that an expression argument (for example
 * "a & b" or "c ? x : y") is compared as a whole against ND_DEBUG
 * instead of being split up by operator precedence.
 */
#define ND_PRINTK(val, level, fmt, ...)                                \
do {                                                                \
        if ((val) <= ND_DEBUG)                                        \
                net_##level##_ratelimited(fmt, ##__VA_ARGS__);        \
} while (0)

/* Forward declarations for types that this header only uses by pointer. */
struct ctl_table;
struct inet6_dev;
struct net_device;
struct net_proto_family;
struct sk_buff;
struct prefix_info;

/* Neighbour table used by the IPv6 lookup helpers below; defined elsewhere. */
extern struct neigh_table nd_tbl;

/*
 * Message layout: ICMPv6 header, a target address, then trailing options.
 * NOTE(review): presumably shared by neighbour solicitation and
 * advertisement messages - confirm against the users.
 */
struct nd_msg {
        struct icmp6hdr        icmph;
        struct in6_addr        target;
        __u8                opt[];        /* options, variable length */
};

/*
 * Message layout: ICMPv6 header directly followed by trailing options.
 * NOTE(review): the name suggests router solicitation - confirm.
 */
struct rs_msg {
        struct icmp6hdr        icmph;
        __u8                opt[];        /* options, variable length */
};

/*
 * Message layout: ICMPv6 header followed by two big-endian 32-bit timer
 * fields. No options member is declared here.
 * NOTE(review): the name suggests router advertisement - confirm.
 */
struct ra_msg {
        struct icmp6hdr                icmph;
        __be32                        reachable_time;
        __be32                        retrans_timer;
};

/*
 * Message layout: ICMPv6 header, target address, destination address,
 * then trailing options.
 * NOTE(review): the name suggests the redirect message - confirm.
 */
struct rd_msg {
        struct icmp6hdr icmph;
        struct in6_addr        target;
        struct in6_addr        dest;
        __u8                opt[];        /* options, variable length */
};

/*
 * Two-byte header that starts every option. __ndisc_opt_addr_data()
 * treats nd_opt_len as a count of 8-byte units (it shifts it left by 3
 * to obtain a byte length).
 */
struct nd_opt_hdr {
        __u8                nd_opt_type;
        __u8                nd_opt_len;
} __packed;

/*
 * ND options: positions of the options found in a message, as filled in
 * by ndisc_parse_options().
 *
 * nd_opt_array[] is indexed by option type (see the nd_opts_* accessor
 * macros). The *_ri/*_ri_end and nd_useropts/nd_useropts_end members are
 * first/last style pointer pairs; the route-info pair exists only with
 * CONFIG_IPV6_ROUTE_INFO and the 802.15.4 array only when
 * CONFIG_IEEE802154_6LOWPAN is enabled.
 */
struct ndisc_options {
        struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX];
#ifdef CONFIG_IPV6_ROUTE_INFO
        struct nd_opt_hdr *nd_opts_ri;
        struct nd_opt_hdr *nd_opts_ri_end;
#endif
        struct nd_opt_hdr *nd_useropts;
        struct nd_opt_hdr *nd_useropts_end;
#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
        struct nd_opt_hdr *nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR + 1];
#endif
};

/*
 * Member-name shorthands for struct ndisc_options: each expands to one
 * slot of nd_opt_array[] (or nd_802154_opt_array[]) so it can be written
 * as "ndopts->nd_opts_mtu" and the like.
 */
#define nd_opts_src_lladdr                nd_opt_array[ND_OPT_SOURCE_LL_ADDR]
#define nd_opts_tgt_lladdr                nd_opt_array[ND_OPT_TARGET_LL_ADDR]
#define nd_opts_pi                        nd_opt_array[ND_OPT_PREFIX_INFO]
#define nd_opts_pi_end                        nd_opt_array[__ND_OPT_PREFIX_INFO_END]
#define nd_opts_rh                        nd_opt_array[ND_OPT_REDIRECT_HDR]
#define nd_opts_mtu                        nd_opt_array[ND_OPT_MTU]
#define nd_opts_nonce                        nd_opt_array[ND_OPT_NONCE]
#define nd_802154_opts_src_lladdr        nd_802154_opt_array[ND_OPT_SOURCE_LL_ADDR]
#define nd_802154_opts_tgt_lladdr        nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR]

/* Space for an option carrying @len data bytes: @len + 2, rounded up to a multiple of 8. */
#define NDISC_OPT_SPACE(len) (((len)+2+7)&~7)

/*
 * Option parsing/building entry points implemented outside this header.
 * NOTE(review): presumably ndisc_parse_options() walks @opt_len bytes at
 * @opt and records what it finds in @ndopts - confirm at the definition.
 */
struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
                                          u8 *opt, int opt_len,
                                          struct ndisc_options *ndopts);

void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data,
                              int data_len, int pad);

/* NOTE(review): meaning of the value 2 is not visible here - confirm. */
#define NDISC_OPS_REDIRECT_DATA_SPACE        2

/*
 * This structure defines the hooks for IPv6 neighbour discovery.
 * The following hooks can be defined; unless noted otherwise, they are
 * optional and can be filled with a null pointer.
 *
 * int (*is_useropt)(u8 nd_opt_type):
 *     This function is called when IPv6 decides which RA options are
 *     userspace options. If this function returns 1, the option given by
 *     nd_opt_type is handled as a userspace option in addition to the
 *     IPv6 options.
 *
 * int (*parse_options)(const struct net_device *dev,
 *                        struct nd_opt_hdr *nd_opt,
 *                        struct ndisc_options *ndopts):
 *     This function is called while parsing ndisc options and stores the
 *     position of each option as a pointer in ndopts. If this function
 *     returns a non-zero value, it has taken care of the ndisc option;
 *     if it returns 0, the IPv6 ndisc option parser takes care of that
 *     option.
 *
 * void (*update)(const struct net_device *dev, struct neighbour *n,
 *                  u32 flags, u8 icmp6_type,
 *                  const struct ndisc_options *ndopts):
 *     This function is called when IPv6 ndisc updates the neighbour cache
 *     entry. Additional options which can be updated may have been parsed
 *     previously by the parse_options callback and are accessible through
 *     the ndopts parameter.
 *
 * int (*opt_addr_space)(const struct net_device *dev, u8 icmp6_type,
 *                         struct neighbour *neigh, u8 *ha_buf,
 *                         u8 **ha):
 *     This function is called when the necessary option space is
 *     calculated before allocating an skb. The parameters neigh, ha_buf
 *     and ha are available on NDISC_REDIRECT messages only.
 *
 * void (*fill_addr_option)(const struct net_device *dev,
 *                            struct sk_buff *skb, u8 icmp6_type,
 *                            const u8 *ha):
 *     This function is called when the option fields are finally filled
 *     into the skb. NOTE: this callback should fill in the option fields
 *     whose space was previously indicated by the opt_addr_space
 *     callback. That means the decision to add such an option must not
 *     get lost between these two callbacks, e.g. it should be protected
 *     by the interface up state.
 *
 * void (*prefix_rcv_add_addr)(struct net *net, struct net_device *dev,
 *                               const struct prefix_info *pinfo,
 *                               struct inet6_dev *in6_dev,
 *                               struct in6_addr *addr,
 *                               int addr_type, u32 addr_flags,
 *                               bool sllao, bool tokenized,
 *                               __u32 valid_lft, u32 prefered_lft,
 *                               bool dev_addr_generated):
 *     This function is called when an RA message is received with valid
 *     PIO option fields and an IPv6 address will be added to the interface
 *     for autoconfiguration. The parameter dev_addr_generated reports
 *     whether the address was based on dev->dev_addr or not. This can be
 *     used to add a second address if the link layer operates with two
 *     link-layer addresses, e.g. 802.15.4 6LoWPAN.
 */
struct ndisc_ops {
        int        (*is_useropt)(u8 nd_opt_type);
        int        (*parse_options)(const struct net_device *dev,
                                 struct nd_opt_hdr *nd_opt,
                                 struct ndisc_options *ndopts);
        void        (*update)(const struct net_device *dev, struct neighbour *n,
                          u32 flags, u8 icmp6_type,
                          const struct ndisc_options *ndopts);
        int        (*opt_addr_space)(const struct net_device *dev, u8 icmp6_type,
                                  struct neighbour *neigh, u8 *ha_buf,
                                  u8 **ha);
        void        (*fill_addr_option)(const struct net_device *dev,
                                    struct sk_buff *skb, u8 icmp6_type,
                                    const u8 *ha);
        void        (*prefix_rcv_add_addr)(struct net *net, struct net_device *dev,
                                       const struct prefix_info *pinfo,
                                       struct inet6_dev *in6_dev,
                                       struct in6_addr *addr,
                                       int addr_type, u32 addr_flags,
                                       bool sllao, bool tokenized,
                                       __u32 valid_lft, u32 prefered_lft,
                                       bool dev_addr_generated);
};

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Forward to dev->ndisc_ops->is_useropt() when the device provides that
 * hook; without ops or without the hook the answer is 0.
 */
static inline int ndisc_ops_is_useropt(const struct net_device *dev,
                                       u8 nd_opt_type)
{
        if (!dev->ndisc_ops || !dev->ndisc_ops->is_useropt)
                return 0;

        return dev->ndisc_ops->is_useropt(nd_opt_type);
}

/*
 * Forward to dev->ndisc_ops->parse_options() when the device provides
 * that hook; without ops or without the hook the result is 0.
 */
static inline int ndisc_ops_parse_options(const struct net_device *dev,
                                          struct nd_opt_hdr *nd_opt,
                                          struct ndisc_options *ndopts)
{
        if (!dev->ndisc_ops || !dev->ndisc_ops->parse_options)
                return 0;

        return dev->ndisc_ops->parse_options(dev, nd_opt, ndopts);
}

/* Invoke dev->ndisc_ops->update() if the device provides that hook. */
static inline void ndisc_ops_update(const struct net_device *dev,
                                    struct neighbour *n, u32 flags,
                                    u8 icmp6_type,
                                    const struct ndisc_options *ndopts)
{
        if (!dev->ndisc_ops || !dev->ndisc_ops->update)
                return;

        dev->ndisc_ops->update(dev, n, flags, icmp6_type, ndopts);
}

/*
 * Extra option space requested by the device's opt_addr_space() hook for
 * a non-redirect message. The hook is called with NULL for its neigh,
 * ha_buf and ha arguments. Returns 0 when there is no hook or when
 * @icmp6_type is NDISC_REDIRECT.
 */
static inline int ndisc_ops_opt_addr_space(const struct net_device *dev,
                                           u8 icmp6_type)
{
        if (!dev->ndisc_ops || !dev->ndisc_ops->opt_addr_space ||
            icmp6_type == NDISC_REDIRECT)
                return 0;

        return dev->ndisc_ops->opt_addr_space(dev, icmp6_type, NULL,
                                              NULL, NULL);
}

/*
 * Redirect flavour of ndisc_ops_opt_addr_space(): calls the device's
 * opt_addr_space() hook with NDISC_REDIRECT and the caller's @neigh,
 * @ha_buf and @ha. Returns 0 when the hook is absent.
 */
static inline int ndisc_ops_redirect_opt_addr_space(const struct net_device *dev,
                                                    struct neighbour *neigh,
                                                    u8 *ha_buf, u8 **ha)
{
        if (!dev->ndisc_ops || !dev->ndisc_ops->opt_addr_space)
                return 0;

        return dev->ndisc_ops->opt_addr_space(dev, NDISC_REDIRECT,
                                              neigh, ha_buf, ha);
}

/*
 * Let the device's fill_addr_option() hook add its option to @skb for a
 * non-redirect message; the hook receives NULL as its ha argument.
 * Nothing happens without the hook or for NDISC_REDIRECT.
 */
static inline void ndisc_ops_fill_addr_option(const struct net_device *dev,
                                              struct sk_buff *skb,
                                              u8 icmp6_type)
{
        if (!dev->ndisc_ops || !dev->ndisc_ops->fill_addr_option ||
            icmp6_type == NDISC_REDIRECT)
                return;

        dev->ndisc_ops->fill_addr_option(dev, skb, icmp6_type, NULL);
}

/*
 * Redirect flavour of ndisc_ops_fill_addr_option(): calls the device's
 * fill_addr_option() hook with NDISC_REDIRECT and @ha, if the hook exists.
 */
static inline void ndisc_ops_fill_redirect_addr_option(const struct net_device *dev,
                                                       struct sk_buff *skb,
                                                       const u8 *ha)
{
        if (!dev->ndisc_ops || !dev->ndisc_ops->fill_addr_option)
                return;

        dev->ndisc_ops->fill_addr_option(dev, skb, NDISC_REDIRECT, ha);
}

/*
 * Pass all arguments through, unchanged and in order, to the device's
 * prefix_rcv_add_addr() hook if the device provides one.
 */
static inline void ndisc_ops_prefix_rcv_add_addr(struct net *net,
                                                 struct net_device *dev,
                                                 const struct prefix_info *pinfo,
                                                 struct inet6_dev *in6_dev,
                                                 struct in6_addr *addr,
                                                 int addr_type, u32 addr_flags,
                                                 bool sllao, bool tokenized,
                                                 __u32 valid_lft,
                                                 u32 prefered_lft,
                                                 bool dev_addr_generated)
{
        if (!dev->ndisc_ops || !dev->ndisc_ops->prefix_rcv_add_addr)
                return;

        dev->ndisc_ops->prefix_rcv_add_addr(net, dev, pinfo, in6_dev, addr,
                                            addr_type, addr_flags, sllao,
                                            tokenized, valid_lft, prefered_lft,
                                            dev_addr_generated);
}
#endif

/*
 * Number of padding bytes between the option length field and the start
 * of the link-layer address. Only IP-over-InfiniBand needs padding (2)
 * at the moment; should RFC 3831 IPv6-over-Fibre Channel ever be
 * implemented, it may need a pad of 2 as well. All other types get 0.
 */
static inline int ndisc_addr_option_pad(unsigned short type)
{
        return type == ARPHRD_INFINIBAND ? 2 : 0;
}

/* NDISC_OPT_SPACE() of an address of @addr_len bytes preceded by @pad bytes. */
static inline int __ndisc_opt_addr_space(unsigned char addr_len, int pad)
{
        int payload = addr_len + pad;

        return NDISC_OPT_SPACE(payload);
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Total address-option space for @dev and message type @icmp6_type: the
 * space for the device address (dev->addr_len plus the pad for
 * dev->type) plus whatever ndisc_ops_opt_addr_space() adds.
 */
static inline int ndisc_opt_addr_space(struct net_device *dev, u8 icmp6_type)
{
        int space;

        space = __ndisc_opt_addr_space(dev->addr_len,
                                       ndisc_addr_option_pad(dev->type));
        space += ndisc_ops_opt_addr_space(dev, icmp6_type);

        return space;
}

/*
 * Redirect flavour of ndisc_opt_addr_space(): the space for the device
 * address plus whatever ndisc_ops_redirect_opt_addr_space() adds for
 * @neigh, @ops_data_buf and @ops_data.
 */
static inline int ndisc_redirect_opt_addr_space(struct net_device *dev,
                                                struct neighbour *neigh,
                                                u8 *ops_data_buf,
                                                u8 **ops_data)
{
        int space;

        space = __ndisc_opt_addr_space(dev->addr_len,
                                       ndisc_addr_option_pad(dev->type));
        space += ndisc_ops_redirect_opt_addr_space(dev, neigh, ops_data_buf,
                                                   ops_data);

        return space;
}
#endif

/*
 * Locate the link-layer address inside option @p.
 *
 * The option's byte length (nd_opt_len << 3) must equal exactly the
 * space an address of @addr_len bytes with @prepad bytes of padding
 * occupies; otherwise NULL is returned. On success the result points
 * @prepad bytes past the option header.
 */
static inline u8 *__ndisc_opt_addr_data(struct nd_opt_hdr *p,
                                        unsigned char addr_len, int prepad)
{
        int opt_bytes = p->nd_opt_len << 3;

        if (opt_bytes == __ndisc_opt_addr_space(addr_len, prepad))
                return (u8 *)(p + 1) + prepad;

        return NULL;
}

/*
 * __ndisc_opt_addr_data() with the address length and padding taken
 * from @dev (dev->addr_len and the pad for dev->type).
 */
static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p,
                                      struct net_device *dev)
{
        int prepad = ndisc_addr_option_pad(dev->type);

        return __ndisc_opt_addr_data(p, dev->addr_len, prepad);
}

/*
 * Hash a key made of four 32-bit words at @pkey: each word is multiplied
 * by the matching entry of @hash_rnd and the products are summed. The
 * first word is XORed with hash32_ptr(dev) before its multiplication.
 */
static inline u32 ndisc_hashfn(const void *pkey, const struct net_device *dev, __u32 *hash_rnd)
{
        const u32 *word = pkey;
        u32 hash;

        hash = (word[0] ^ hash32_ptr(dev)) * hash_rnd[0];
        hash += word[1] * hash_rnd[1];
        hash += word[2] * hash_rnd[2];
        hash += word[3] * hash_rnd[3];

        return hash;
}

/*
 * Look up @pkey on @dev in nd_tbl through ___neigh_lookup_noref(), using
 * neigh_key_eq128 and ndisc_hashfn. This helper itself does not touch
 * the entry's reference count.
 */
static inline struct neighbour *__ipv6_neigh_lookup_noref(struct net_device *dev, const void *pkey)
{
        struct neighbour *entry;

        entry = ___neigh_lookup_noref(&nd_tbl, neigh_key_eq128, ndisc_hashfn,
                                      pkey, dev);
        return entry;
}

/*
 * Same lookup as __ipv6_neigh_lookup_noref(), but the table is reached
 * through ipv6_stub->nd_tbl instead of referencing nd_tbl directly.
 */
static inline struct neighbour *
__ipv6_neigh_lookup_noref_stub(struct net_device *dev, const void *pkey)
{
        struct neighbour *entry;

        entry = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
                                      ndisc_hashfn, pkey, dev);
        return entry;
}

/*
 * Look up @pkey on @dev under rcu_read_lock() and take a reference on the
 * entry with refcount_inc_not_zero(). Returns the entry, or NULL when no
 * entry exists or the reference could not be taken.
 */
static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, const void *pkey)
{
        struct neighbour *found;

        rcu_read_lock();
        found = __ipv6_neigh_lookup_noref(dev, pkey);
        if (found) {
                /* the refcount already dropped to zero: treat as not found */
                if (!refcount_inc_not_zero(&found->refcnt))
                        found = NULL;
        }
        rcu_read_unlock();

        return found;
}

/*
 * Under rcu_read_lock(), look up @pkey on @dev and pass the result
 * (possibly NULL) to neigh_confirm().
 */
static inline void __ipv6_confirm_neigh(struct net_device *dev,
                                        const void *pkey)
{
        rcu_read_lock();
        neigh_confirm(__ipv6_neigh_lookup_noref(dev, pkey));
        rcu_read_unlock();
}

/*
 * Like __ipv6_confirm_neigh(), but the lookup goes through
 * __ipv6_neigh_lookup_noref_stub() and therefore ipv6_stub->nd_tbl.
 */
static inline void __ipv6_confirm_neigh_stub(struct net_device *dev,
                                             const void *pkey)
{
        rcu_read_lock();
        neigh_confirm(__ipv6_neigh_lookup_noref_stub(dev, pkey));
        rcu_read_unlock();
}

/*
 * Uses ipv6_stub and is meant for use outside of the IPv6 core.
 *
 * Return the neighbour entry for @addr on @dev; if the lookup finds
 * nothing, the result of __neigh_create() on ipv6_stub->nd_tbl is
 * returned instead. No RCU lock is taken in here.
 */
static inline struct neighbour *ip_neigh_gw6(struct net_device *dev,
                                             const void *addr)
{
        struct neighbour *cached = __ipv6_neigh_lookup_noref_stub(dev, addr);

        if (unlikely(!cached))
                return __neigh_create(ipv6_stub->nd_tbl, addr, dev, false);

        return cached;
}

/*
 * Neighbour discovery entry points implemented outside this header.
 * The grouping comments below go by the function names only.
 */

/* subsystem setup and teardown */
int ndisc_init(void);
int ndisc_late_init(void);

void ndisc_late_cleanup(void);
void ndisc_cleanup(void);

/* receive path; reports a drop reason for @skb */
enum skb_drop_reason ndisc_rcv(struct sk_buff *skb);

/* building and sending of ND messages */
struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit,
                                const struct in6_addr *saddr, u64 nonce);
void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
                   const struct in6_addr *daddr, const struct in6_addr *saddr,
                   u64 nonce);

void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr,
                    const struct in6_addr *saddr);

void ndisc_send_rs(struct net_device *dev,
                   const struct in6_addr *saddr, const struct in6_addr *daddr);
void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
                   const struct in6_addr *solicited_addr,
                   bool router, bool solicited, bool override, bool inc_opt);

void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target);

/* NOTE(review): presumably maps a multicast address to a link-layer one in @buf - confirm */
int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev,
                 int dir);

/* neighbour cache update; see also the ndisc_ops update hook */
void ndisc_update(const struct net_device *dev, struct neighbour *neigh,
                  const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type,
                  struct ndisc_options *ndopts);

/*
 *        IGMP
 *
 *        igmp6_* entry points implemented outside this header: setup and
 *        teardown, plus handlers that are given a received skb.
 *        NOTE(review): the names suggest query and report messages - confirm.
 */
int igmp6_init(void);
int igmp6_late_init(void);

void igmp6_cleanup(void);
void igmp6_late_cleanup(void);

void igmp6_event_query(struct sk_buff *skb);

void igmp6_event_report(struct sk_buff *skb);


/*
 * sysctl handler, declared only with CONFIG_SYSCTL.
 * NOTE(review): the signature looks like a ctl_table proc handler - confirm.
 */
#ifdef CONFIG_SYSCTL
int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write,
                               void *buffer, size_t *lenp, loff_t *ppos);
#endif

/* Notification about @event concerning @idev; implemented elsewhere. */
void inet6_ifinfo_notify(int event, struct inet6_dev *idev);

#endif













































































































































































































































































































































































































































































































































































    1 



    1 

    1 




    1 















    1 


    1 

    1 
















































































































































































































































    1 





    1 





    1 
































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2011  Intel Corporation. All rights reserved.
 */

#define pr_fmt(fmt) "llcp: %s: " fmt, __func__

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/nfc.h>
#include <linux/sched/signal.h>

#include "nfc.h"
#include "llcp.h"

/*
 * sock_wait_state - sleep until @sk reaches @state, or give up.
 * @sk:    socket whose sk_state is polled; the loop releases and re-takes the
 *         socket lock around each sleep, so the caller presumably holds it
 * @state: sk_state value to wait for
 * @timeo: remaining timeout as passed to schedule_timeout(); 0 means "do not
 *         sleep"
 *
 * Return: 0 once sk->sk_state == @state, -EINPROGRESS when the timeout is (or
 * has become) zero first, sock_intr_errno(@timeo) when a signal is pending, or
 * the non-zero value of sock_error(@sk) seen after a wake-up.
 */
static int sock_wait_state(struct sock *sk, int state, unsigned long timeo)
{
        DECLARE_WAITQUEUE(wait, current);
        int err = 0;

        /* Newline-terminated like every other message in this file */
        pr_debug("sk %p\n", sk);

        add_wait_queue(sk_sleep(sk), &wait);
        set_current_state(TASK_INTERRUPTIBLE);

        while (sk->sk_state != state) {
                if (!timeo) {
                        err = -EINPROGRESS;
                        break;
                }

                if (signal_pending(current)) {
                        err = sock_intr_errno(timeo);
                        break;
                }

                /* Sleep without the socket lock, then take it back */
                release_sock(sk);
                timeo = schedule_timeout(timeo);
                lock_sock(sk);
                set_current_state(TASK_INTERRUPTIBLE);

                err = sock_error(sk);
                if (err)
                        break;
        }

        __set_current_state(TASK_RUNNING);
        remove_wait_queue(sk_sleep(sk), &wait);
        return err;
}

/*
 * Protocol descriptor passed to sk_alloc() by nfc_llcp_sock_alloc() and
 * referenced from llcp_nfc_proto. obj_size is sizeof(struct nfc_llcp_sock).
 */
static struct proto llcp_sock_proto = {
        .name     = "NFC_LLCP",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct nfc_llcp_sock),
};

static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
{
        struct sock *sk = sock->sk;
        struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
        struct nfc_llcp_local *local;
        struct nfc_dev *dev;
        struct sockaddr_nfc_llcp llcp_addr;
        int len, ret = 0;

        if (!addr || alen < offsetofend(struct sockaddr, sa_family) ||
            addr->sa_family != AF_NFC)
                return -EINVAL;

        pr_debug("sk %p addr %p family %d\n", sk, addr, addr->sa_family);

        memset(&llcp_addr, 0, sizeof(llcp_addr));
        len = min_t(unsigned int, sizeof(llcp_addr), alen);
        memcpy(&llcp_addr, addr, len);

        /* This is going to be a listening socket, dsap must be 0 */
        if (llcp_addr.dsap != 0)
                return -EINVAL;

        lock_sock(sk);

        if (sk->sk_state != LLCP_CLOSED) {
                ret = -EBADFD;
                goto error;
        }

        dev = nfc_get_device(llcp_addr.dev_idx);
        if (dev == NULL) {
                ret = -ENODEV;
                goto error;
        }

        local = nfc_llcp_find_local(dev);
        if (local == NULL) {
                ret = -ENODEV;
                goto put_dev;
        }

        llcp_sock->dev = dev;
        llcp_sock->local = local;
        llcp_sock->nfc_protocol = llcp_addr.nfc_protocol;
        llcp_sock->service_name_len = min_t(unsigned int,
                                            llcp_addr.service_name_len,
                                            NFC_LLCP_MAX_SERVICE_NAME);
        llcp_sock->service_name = kmemdup(llcp_addr.service_name,
                                          llcp_sock->service_name_len,
                                          GFP_KERNEL);
        if (!llcp_sock->service_name) {
                ret = -ENOMEM;
                goto sock_llcp_put_local;
        }
        llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock);
        if (llcp_sock->ssap == LLCP_SAP_MAX) {
                ret = -EADDRINUSE;
                goto free_service_name;
        }

        llcp_sock->reserved_ssap = llcp_sock->ssap;

        nfc_llcp_sock_link(&local->sockets, sk);

        pr_debug("Socket bound to SAP %d\n", llcp_sock->ssap);

        sk->sk_state = LLCP_BOUND;
        nfc_put_device(dev);
        release_sock(sk);

        return 0;

free_service_name:
        kfree(llcp_sock->service_name);
        llcp_sock->service_name = NULL;

sock_llcp_put_local:
        nfc_llcp_local_put(llcp_sock->local);
        llcp_sock->local = NULL;
        llcp_sock->dev = NULL;

put_dev:
        nfc_put_device(dev);

error:
        release_sock(sk);
        return ret;
}

static int llcp_raw_sock_bind(struct socket *sock, struct sockaddr *addr,
                              int alen)
{
        struct sock *sk = sock->sk;
        struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
        struct nfc_llcp_local *local;
        struct nfc_dev *dev;
        struct sockaddr_nfc_llcp llcp_addr;
        int len, ret = 0;

        if (!addr || alen < offsetofend(struct sockaddr, sa_family) ||
            addr->sa_family != AF_NFC)
                return -EINVAL;

        pr_debug("sk %p addr %p family %d\n", sk, addr, addr->sa_family);

        memset(&llcp_addr, 0, sizeof(llcp_addr));
        len = min_t(unsigned int, sizeof(llcp_addr), alen);
        memcpy(&llcp_addr, addr, len);

        lock_sock(sk);

        if (sk->sk_state != LLCP_CLOSED) {
                ret = -EBADFD;
                goto error;
        }

        dev = nfc_get_device(llcp_addr.dev_idx);
        if (dev == NULL) {
                ret = -ENODEV;
                goto error;
        }

        local = nfc_llcp_find_local(dev);
        if (local == NULL) {
                ret = -ENODEV;
                goto put_dev;
        }

        llcp_sock->dev = dev;
        llcp_sock->local = local;
        llcp_sock->nfc_protocol = llcp_addr.nfc_protocol;

        nfc_llcp_sock_link(&local->raw_sockets, sk);

        sk->sk_state = LLCP_BOUND;

put_dev:
        nfc_put_device(dev);

error:
        release_sock(sk);
        return ret;
}

static int llcp_sock_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;
        int ret = 0;

        pr_debug("sk %p backlog %d\n", sk, backlog);

        lock_sock(sk);

        if ((sock->type != SOCK_SEQPACKET && sock->type != SOCK_STREAM) ||
            sk->sk_state != LLCP_BOUND) {
                ret = -EBADFD;
                goto error;
        }

        sk->sk_max_ack_backlog = backlog;
        sk->sk_ack_backlog = 0;

        pr_debug("Socket listening\n");
        sk->sk_state = LLCP_LISTEN;

error:
        release_sock(sk);

        return ret;
}

/*
 * nfc_llcp_setsockopt - set the NFC_LLCP_RW or NFC_LLCP_MIUX socket option.
 * @sock:    socket to configure
 * @level:   must be SOL_NFC
 * @optname: NFC_LLCP_RW or NFC_LLCP_MIUX
 * @optval:  pointer to a u32 holding the new value
 * @optlen:  size of the buffer behind @optval
 *
 * Both options are refused once the socket is LLCP_CONNECTED, LLCP_BOUND or
 * LLCP_LISTEN. The MIUX value is stored in big-endian form.
 *
 * Return: 0 on success; -ENOPROTOOPT for another level or option; -EINVAL
 * for a disallowed state or a value above LLCP_MAX_RW / LLCP_MAX_MIUX; or the
 * error returned by copy_safe_from_sockptr().
 */
static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname,
                               sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
        u32 value;
        int err = 0;

        pr_debug("%p optname %d\n", sk, optname);

        if (level != SOL_NFC)
                return -ENOPROTOOPT;

        lock_sock(sk);

        /* Only two options exist; reject anything else before other checks */
        switch (optname) {
        case NFC_LLCP_RW:
        case NFC_LLCP_MIUX:
                break;
        default:
                err = -ENOPROTOOPT;
                goto unlock;
        }

        if (sk->sk_state == LLCP_CONNECTED ||
            sk->sk_state == LLCP_BOUND ||
            sk->sk_state == LLCP_LISTEN) {
                err = -EINVAL;
                goto unlock;
        }

        err = copy_safe_from_sockptr(&value, sizeof(value), optval, optlen);
        if (err)
                goto unlock;

        if (optname == NFC_LLCP_RW) {
                if (value > LLCP_MAX_RW)
                        err = -EINVAL;
                else
                        llcp_sock->rw = (u8) value;
        } else {
                if (value > LLCP_MAX_MIUX)
                        err = -EINVAL;
                else
                        llcp_sock->miux = cpu_to_be16((u16) value);
        }

unlock:
        release_sock(sk);

        pr_debug("%p rw %d miux %d\n", llcp_sock,
                 llcp_sock->rw, llcp_sock->miux);

        return err;
}

/*
 * nfc_llcp_getsockopt - read an SOL_NFC option of an LLCP socket.
 * @sock:    socket to query
 * @level:   must be SOL_NFC
 * @optname: NFC_LLCP_RW, NFC_LLCP_MIUX, NFC_LLCP_REMOTE_MIU,
 *           NFC_LLCP_REMOTE_LTO or NFC_LLCP_REMOTE_RW
 * @optval:  user buffer receiving the value, written as a u32
 * @optlen:  user in/out length; written back as min(*optlen, sizeof(u32))
 *
 * For RW, MIUX and REMOTE_MIU, a per-socket value above its maximum makes the
 * function report the corresponding value of the LLCP local instead.
 *
 * Return: 0 on success; -ENOPROTOOPT for another level or option; -EFAULT if
 * user memory cannot be accessed; -ENODEV if the socket has no LLCP local.
 */
static int nfc_llcp_getsockopt(struct socket *sock, int level, int optname,
                               char __user *optval, int __user *optlen)
{
        struct nfc_llcp_local *local;
        struct sock *sk = sock->sk;
        struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
        int len, err = 0;
        u16 miux, remote_miu;
        u8 rw;

        pr_debug("%p optname %d\n", sk, optname);

        if (level != SOL_NFC)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;

        len = min_t(u32, len, sizeof(u32));

        lock_sock(sk);

        /*
         * bind() and connect() set and clear llcp_sock->local while holding
         * the socket lock, and their error paths drop the reference. Sample
         * the pointer under the same lock so it cannot go stale while it is
         * dereferenced below.
         */
        local = llcp_sock->local;
        if (!local) {
                release_sock(sk);
                return -ENODEV;
        }

        switch (optname) {
        case NFC_LLCP_RW:
                rw = llcp_sock->rw > LLCP_MAX_RW ? local->rw : llcp_sock->rw;
                if (put_user(rw, (u32 __user *) optval))
                        err = -EFAULT;

                break;

        case NFC_LLCP_MIUX:
                miux = be16_to_cpu(llcp_sock->miux) > LLCP_MAX_MIUX ?
                        be16_to_cpu(local->miux) : be16_to_cpu(llcp_sock->miux);

                if (put_user(miux, (u32 __user *) optval))
                        err = -EFAULT;

                break;

        case NFC_LLCP_REMOTE_MIU:
                remote_miu = llcp_sock->remote_miu > LLCP_MAX_MIU ?
                                local->remote_miu : llcp_sock->remote_miu;

                if (put_user(remote_miu, (u32 __user *) optval))
                        err = -EFAULT;

                break;

        case NFC_LLCP_REMOTE_LTO:
                /* Reported as the stored remote_lto divided by 10 */
                if (put_user(local->remote_lto / 10, (u32 __user *) optval))
                        err = -EFAULT;

                break;

        case NFC_LLCP_REMOTE_RW:
                if (put_user(llcp_sock->remote_rw, (u32 __user *) optval))
                        err = -EFAULT;

                break;

        default:
                err = -ENOPROTOOPT;
                break;
        }

        release_sock(sk);

        /* The clamped length is written back even when err is set */
        if (put_user(len, optlen))
                return -EFAULT;

        return err;
}

/*
 * nfc_llcp_accept_unlink - take a child socket off its parent's accept queue.
 * @sk: queued child socket
 *
 * Removes @sk from the queue, decrements the parent's accept backlog, clears
 * the parent link and drops the reference taken by
 * nfc_llcp_accept_enqueue().
 */
void nfc_llcp_accept_unlink(struct sock *sk)
{
        struct nfc_llcp_sock *child = nfc_llcp_sock(sk);
        struct sock *listener;

        pr_debug("state %d\n", sk->sk_state);

        listener = child->parent;

        list_del_init(&child->accept_queue);
        child->parent = NULL;
        sk_acceptq_removed(listener);

        sock_put(sk);
}

/*
 * nfc_llcp_accept_enqueue - append a child socket to a parent's accept queue.
 * @parent: listening socket owning the queue
 * @sk:     child socket to queue
 *
 * Takes a reference on @sk, records @parent in it and increments the
 * parent's accept backlog.
 */
void nfc_llcp_accept_enqueue(struct sock *parent, struct sock *sk)
{
        struct nfc_llcp_sock *listener = nfc_llcp_sock(parent);
        struct nfc_llcp_sock *child = nfc_llcp_sock(sk);

        /* Held while queued; dropped again by the unlink/dequeue paths */
        sock_hold(sk);

        child->parent = parent;
        list_add_tail(&child->accept_queue, &listener->accept_queue);
        sk_acceptq_added(parent);
}

/*
 * nfc_llcp_accept_dequeue - pull the first usable child off an accept queue.
 * @parent:  listening socket whose accept queue is walked
 * @newsock: socket to graft the child onto, or NULL
 *
 * Children found in LLCP_CLOSED state are unlinked from the queue and
 * skipped. A child is returned when it is LLCP_CONNECTED, or, when @newsock
 * is NULL, as soon as it is not closed. With a non-NULL @newsock the child is
 * grafted onto it before being returned.
 *
 * Return: the dequeued child socket, or NULL if no queued child qualifies.
 */
struct sock *nfc_llcp_accept_dequeue(struct sock *parent,
                                     struct socket *newsock)
{
        struct nfc_llcp_sock *lsk, *n, *llcp_parent;
        struct sock *sk;

        llcp_parent = nfc_llcp_sock(parent);

        /* _safe iteration: entries are removed from the list while walking */
        list_for_each_entry_safe(lsk, n, &llcp_parent->accept_queue,
                                 accept_queue) {
                sk = &lsk->sk;
                lock_sock(sk);

                if (sk->sk_state == LLCP_CLOSED) {
                        release_sock(sk);
                        nfc_llcp_accept_unlink(sk);
                        continue;
                }

                if (sk->sk_state == LLCP_CONNECTED || !newsock) {
                        /*
                         * Take the child off the queue and drop the
                         * reference nfc_llcp_accept_enqueue() took for it.
                         */
                        list_del_init(&lsk->accept_queue);
                        sock_put(sk);

                        if (newsock)
                                sock_graft(sk, newsock);

                        release_sock(sk);

                        pr_debug("Returning sk state %d\n", sk->sk_state);

                        sk_acceptq_removed(parent);

                        return sk;
                }

                release_sock(sk);
        }

        return NULL;
}

/*
 * llcp_sock_accept - accept a pending connection on a listening LLCP socket.
 * @sock:    listening socket
 * @newsock: socket that receives the accepted connection
 * @arg:     accept arguments; only arg->flags & O_NONBLOCK is consulted here
 *
 * Sleeps interruptibly, for at most the socket's receive timeout, until
 * nfc_llcp_accept_dequeue() yields a child socket.
 *
 * Return: 0 on success with @newsock marked SS_CONNECTED; -EBADFD if @sock is
 * not in LLCP_LISTEN state; -EAGAIN when the timeout is (or has become) zero;
 * or sock_intr_errno() when a signal is pending.
 */
static int llcp_sock_accept(struct socket *sock, struct socket *newsock,
                            struct proto_accept_arg *arg)
{
        DECLARE_WAITQUEUE(wait, current);
        struct sock *sk = sock->sk, *new_sk;
        long timeo;
        int ret = 0;

        pr_debug("parent %p\n", sk);

        lock_sock_nested(sk, SINGLE_DEPTH_NESTING);

        if (sk->sk_state != LLCP_LISTEN) {
                ret = -EBADFD;
                goto error;
        }

        timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);

        /* Wait for an incoming connection. */
        add_wait_queue_exclusive(sk_sleep(sk), &wait);
        while (!(new_sk = nfc_llcp_accept_dequeue(sk, newsock))) {
                set_current_state(TASK_INTERRUPTIBLE);

                if (!timeo) {
                        ret = -EAGAIN;
                        break;
                }

                if (signal_pending(current)) {
                        ret = sock_intr_errno(timeo);
                        break;
                }

                /* Sleep without the listener's lock, then take it back */
                release_sock(sk);
                timeo = schedule_timeout(timeo);
                lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
        }
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(sk_sleep(sk), &wait);

        if (ret)
                goto error;

        newsock->state = SS_CONNECTED;

        pr_debug("new socket %p\n", new_sk);

error:
        release_sock(sk);

        return ret;
}

/*
 * llcp_sock_getname - report the address associated with an LLCP socket.
 * @sock:  socket to query
 * @uaddr: buffer filled in as a sockaddr_nfc_llcp
 * @peer:  not used by this implementation
 *
 * Return: sizeof(struct sockaddr_nfc_llcp) on success, -EBADFD if the socket
 * has no NFC device attached.
 */
static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr,
                             int peer)
{
        DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, llcp_addr, uaddr);
        struct sock *sk = sock->sk;
        struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
        int ret = -EBADFD;

        if (!llcp_sock || !llcp_sock->dev)
                return ret;

        pr_debug("%p %d %d %d\n", sk, llcp_sock->target_idx,
                 llcp_sock->dsap, llcp_sock->ssap);

        memset(llcp_addr, 0, sizeof(*llcp_addr));

        lock_sock(sk);

        /* Checked again now that the socket lock is held */
        if (llcp_sock->dev) {
                llcp_addr->sa_family = AF_NFC;
                llcp_addr->dev_idx = llcp_sock->dev->idx;
                llcp_addr->target_idx = llcp_sock->target_idx;
                llcp_addr->nfc_protocol = llcp_sock->nfc_protocol;
                llcp_addr->dsap = llcp_sock->dsap;
                llcp_addr->ssap = llcp_sock->ssap;
                llcp_addr->service_name_len = llcp_sock->service_name_len;
                memcpy(llcp_addr->service_name, llcp_sock->service_name,
                       llcp_addr->service_name_len);

                ret = sizeof(struct sockaddr_nfc_llcp);
        }

        release_sock(sk);

        return ret;
}

/*
 * llcp_accept_poll - poll mask for a listening socket.
 * @parent: listening socket
 *
 * Return: EPOLLIN | EPOLLRDNORM if any socket on the accept queue is in
 * LLCP_CONNECTED state, 0 otherwise.
 */
static inline __poll_t llcp_accept_poll(struct sock *parent)
{
        struct nfc_llcp_sock *listener = nfc_llcp_sock(parent);
        struct nfc_llcp_sock *child;
        __poll_t events = 0;

        list_for_each_entry(child, &listener->accept_queue, accept_queue) {
                if (child->sk.sk_state != LLCP_CONNECTED)
                        continue;

                events = EPOLLIN | EPOLLRDNORM;
                break;
        }

        return events;
}

/*
 * llcp_sock_poll - compute the poll mask of an LLCP socket.
 * @file: file backing the socket
 * @sock: socket being polled
 * @wait: poll table passed on to sock_poll_wait()
 *
 * Listening sockets are delegated to llcp_accept_poll(). For the others the
 * mask is built from the error state, the receive queue, the closed and
 * shutdown state, and writability while LLCP_CONNECTED.
 */
static __poll_t llcp_sock_poll(struct file *file, struct socket *sock,
                                   poll_table *wait)
{
        struct sock *sk = sock->sk;
        __poll_t events = 0;

        pr_debug("%p\n", sk);

        sock_poll_wait(file, sock, wait);

        if (sk->sk_state == LLCP_LISTEN)
                return llcp_accept_poll(sk);

        if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) {
                events |= EPOLLERR;
                if (sock_flag(sk, SOCK_SELECT_ERR_QUEUE))
                        events |= EPOLLPRI;
        }

        if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
                events |= EPOLLIN | EPOLLRDNORM;

        if (sk->sk_state == LLCP_CLOSED)
                events |= EPOLLHUP;

        if (sk->sk_shutdown & RCV_SHUTDOWN)
                events |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

        if (sk->sk_shutdown == SHUTDOWN_MASK)
                events |= EPOLLHUP;

        /* Not writable right now: set the async no-space bit instead */
        if (!sock_writeable(sk) || sk->sk_state != LLCP_CONNECTED)
                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
        else
                events |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

        pr_debug("mask 0x%x\n", events);

        return events;
}

/*
 * llcp_sock_release - tear down an LLCP socket when it is released.
 * @sock: socket being released
 *
 * A connected socket sends a disconnect. A listening socket sends a
 * disconnect for, and unlinks, every socket still on its accept queue. The
 * socket is then removed from the local's socket list (raw or regular,
 * depending on the socket type), its SAP is released if one was reserved, and
 * the sock is orphaned and its reference dropped.
 *
 * Return: 0, or -ENODEV when the socket has no LLCP local (the sock is still
 * orphaned and put in that case).
 */
static int llcp_sock_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct nfc_llcp_local *local;
        struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
        int err = 0;

        if (!sk)
                return 0;

        pr_debug("%p\n", sk);

        local = llcp_sock->local;
        if (local == NULL) {
                err = -ENODEV;
                goto out;
        }

        lock_sock(sk);

        /* Send a DISC */
        if (sk->sk_state == LLCP_CONNECTED)
                nfc_llcp_send_disconnect(llcp_sock);

        if (sk->sk_state == LLCP_LISTEN) {
                struct nfc_llcp_sock *lsk, *n;
                struct sock *accept_sk;

                /* Disconnect and drop every not-yet-accepted child */
                list_for_each_entry_safe(lsk, n, &llcp_sock->accept_queue,
                                         accept_queue) {
                        accept_sk = &lsk->sk;
                        lock_sock(accept_sk);

                        nfc_llcp_send_disconnect(lsk);
                        nfc_llcp_accept_unlink(accept_sk);

                        release_sock(accept_sk);
                }
        }

        if (sock->type == SOCK_RAW)
                nfc_llcp_sock_unlink(&local->raw_sockets, sk);
        else
                nfc_llcp_sock_unlink(&local->sockets, sk);

        /* reserved_ssap stays at LLCP_SAP_MAX until a SAP has been reserved */
        if (llcp_sock->reserved_ssap < LLCP_SAP_MAX)
                nfc_llcp_put_ssap(llcp_sock->local, llcp_sock->ssap);

        release_sock(sk);

out:
        sock_orphan(sk);
        sock_put(sk);

        return err;
}

/*
 * llcp_sock_connect - start (and optionally wait for) an LLCP connection.
 * @sock:  socket to connect
 * @_addr: destination, interpreted as a sockaddr_nfc_llcp
 * @len:   size of @_addr; must cover a full sockaddr_nfc_llcp
 * @flags: O_NONBLOCK selects a zero wait timeout
 *
 * Reserves a local SAP, records the destination SAP or service name, links
 * the socket into the local's connecting list, sends the connect request and
 * waits for the socket to reach LLCP_CONNECTED.
 *
 * Return: 0 once connected; -EINPROGRESS when the wait timeout is zero or
 * runs out while the attempt stays pending (the socket is left linked in that
 * case), or when the socket is already LLCP_CONNECTING; -EINVAL for a bad
 * address or when neither a service name nor a dsap is given; -EISCONN if
 * already connected; -ENODEV if the device or its LLCP local cannot be found;
 * -ENOLINK if the DEP link is down or the target index does not match in
 * initiator mode; -ENOMEM if no SAP or memory is available; otherwise the
 * error from nfc_llcp_send_connect() or sock_wait_state().
 */
static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr,
                             int len, int flags)
{
        struct sock *sk = sock->sk;
        struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
        struct sockaddr_nfc_llcp *addr = (struct sockaddr_nfc_llcp *)_addr;
        struct nfc_dev *dev;
        struct nfc_llcp_local *local;
        int ret = 0;

        pr_debug("sock %p sk %p flags 0x%x\n", sock, sk, flags);

        if (!addr || len < sizeof(*addr) || addr->sa_family != AF_NFC)
                return -EINVAL;

        /* The peer must be named either by service name or by SAP */
        if (addr->service_name_len == 0 && addr->dsap == 0)
                return -EINVAL;

        pr_debug("addr dev_idx=%u target_idx=%u protocol=%u\n", addr->dev_idx,
                 addr->target_idx, addr->nfc_protocol);

        lock_sock(sk);

        if (sk->sk_state == LLCP_CONNECTED) {
                ret = -EISCONN;
                goto error;
        }
        if (sk->sk_state == LLCP_CONNECTING) {
                ret = -EINPROGRESS;
                goto error;
        }

        dev = nfc_get_device(addr->dev_idx);
        if (dev == NULL) {
                ret = -ENODEV;
                goto error;
        }

        local = nfc_llcp_find_local(dev);
        if (local == NULL) {
                ret = -ENODEV;
                goto put_dev;
        }

        /* dep_link_up is read under the device lock */
        device_lock(&dev->dev);
        if (dev->dep_link_up == false) {
                ret = -ENOLINK;
                device_unlock(&dev->dev);
                goto sock_llcp_put_local;
        }
        device_unlock(&dev->dev);

        if (local->rf_mode == NFC_RF_INITIATOR &&
            addr->target_idx != local->target_idx) {
                ret = -ENOLINK;
                goto sock_llcp_put_local;
        }

        llcp_sock->dev = dev;
        llcp_sock->local = local;
        llcp_sock->ssap = nfc_llcp_get_local_ssap(local);
        if (llcp_sock->ssap == LLCP_SAP_MAX) {
                ret = -ENOMEM;
                goto sock_llcp_nullify;
        }

        llcp_sock->reserved_ssap = llcp_sock->ssap;

        /* With a service name the request is addressed to LLCP_SAP_SDP */
        if (addr->service_name_len == 0)
                llcp_sock->dsap = addr->dsap;
        else
                llcp_sock->dsap = LLCP_SAP_SDP;
        llcp_sock->nfc_protocol = addr->nfc_protocol;
        llcp_sock->service_name_len = min_t(unsigned int,
                                            addr->service_name_len,
                                            NFC_LLCP_MAX_SERVICE_NAME);
        llcp_sock->service_name = kmemdup(addr->service_name,
                                          llcp_sock->service_name_len,
                                          GFP_KERNEL);
        if (!llcp_sock->service_name) {
                ret = -ENOMEM;
                goto sock_llcp_release;
        }

        nfc_llcp_sock_link(&local->connecting_sockets, sk);

        ret = nfc_llcp_send_connect(llcp_sock);
        if (ret)
                goto sock_unlink;

        sk->sk_state = LLCP_CONNECTING;

        ret = sock_wait_state(sk, LLCP_CONNECTED,
                              sock_sndtimeo(sk, flags & O_NONBLOCK));
        /* -EINPROGRESS is not a failure: the socket stays linked */
        if (ret && ret != -EINPROGRESS)
                goto sock_unlink;

        release_sock(sk);

        return ret;

        /* Error unwinding: each label undoes one step and falls through */
sock_unlink:
        nfc_llcp_sock_unlink(&local->connecting_sockets, sk);
        kfree(llcp_sock->service_name);
        llcp_sock->service_name = NULL;

sock_llcp_release:
        nfc_llcp_put_ssap(local, llcp_sock->ssap);

sock_llcp_nullify:
        llcp_sock->local = NULL;
        llcp_sock->dev = NULL;

sock_llcp_put_local:
        nfc_llcp_local_put(local);

put_dev:
        nfc_put_device(dev);

error:
        release_sock(sk);
        return ret;
}

/*
 * llcp_sock_sendmsg - send data on an LLCP socket.
 * @sock: socket to send on
 * @msg:  message; for SOCK_DGRAM, msg_name must hold a sockaddr_nfc_llcp
 *        naming the destination and source SAPs
 * @len:  number of bytes to send
 *
 * SOCK_DGRAM sockets must be LLCP_BOUND and send a UI frame; all other socket
 * types must be LLCP_CONNECTED and send an I frame. The socket lock is held
 * only for the state checks, not while the frame is sent.
 *
 * Return: the result of nfc_llcp_send_ui_frame() or nfc_llcp_send_i_frame();
 * a pending socket error; -EOPNOTSUPP for MSG_OOB; -ENODEV if the socket has
 * no LLCP local; -ENOTCONN for a wrong socket state; -EINVAL if msg_namelen
 * is too short for a datagram destination.
 */
static int llcp_sock_sendmsg(struct socket *sock, struct msghdr *msg,
                             size_t len)
{
        struct sock *sk = sock->sk;
        struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
        int ret;

        /* Newline-terminated like every other message in this file */
        pr_debug("sock %p sk %p\n", sock, sk);

        ret = sock_error(sk);
        if (ret)
                return ret;

        if (msg->msg_flags & MSG_OOB)
                return -EOPNOTSUPP;

        lock_sock(sk);

        if (!llcp_sock->local) {
                release_sock(sk);
                return -ENODEV;
        }

        if (sk->sk_type == SOCK_DGRAM) {
                if (sk->sk_state != LLCP_BOUND) {
                        release_sock(sk);
                        return -ENOTCONN;
                }

                DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, addr,
                                 msg->msg_name);

                if (msg->msg_namelen < sizeof(*addr)) {
                        release_sock(sk);
                        return -EINVAL;
                }

                release_sock(sk);

                return nfc_llcp_send_ui_frame(llcp_sock, addr->dsap, addr->ssap,
                                              msg, len);
        }

        if (sk->sk_state != LLCP_CONNECTED) {
                release_sock(sk);
                return -ENOTCONN;
        }

        release_sock(sk);

        return nfc_llcp_send_i_frame(llcp_sock, msg, len);
}

/*
 * llcp_sock_recvmsg - receive one queued skb (or part of it) from an LLCP
 * socket.
 * @sock:  socket to read from
 * @msg:   destination message; for SOCK_DGRAM with a msg_name buffer, the
 *         source address is filled in as a sockaddr_nfc_llcp
 * @len:   size of the caller's buffer
 * @flags: MSG_PEEK and MSG_TRUNC are honoured, MSG_OOB is rejected
 *
 * For SOCK_STREAM, SOCK_DGRAM and SOCK_RAW, data that did not fit in the
 * caller's buffer is left at the head of the receive queue.
 *
 * Return: the number of bytes copied (for SOCK_SEQPACKET with MSG_TRUNC, the
 * full skb length); 0 if the socket is closed with an empty queue or has
 * RCV_SHUTDOWN set and nothing to read; -EOPNOTSUPP for MSG_OOB; -EFAULT if
 * copying to @msg fails; or the error from skb_recv_datagram().
 */
static int llcp_sock_recvmsg(struct socket *sock, struct msghdr *msg,
                             size_t len, int flags)
{
        struct sock *sk = sock->sk;
        unsigned int copied, rlen;
        struct sk_buff *skb;
        int err = 0;

        pr_debug("%p %zu\n", sk, len);

        lock_sock(sk);

        if (sk->sk_state == LLCP_CLOSED &&
            skb_queue_empty(&sk->sk_receive_queue)) {
                release_sock(sk);
                return 0;
        }

        release_sock(sk);

        if (flags & (MSG_OOB))
                return -EOPNOTSUPP;

        skb = skb_recv_datagram(sk, flags, &err);
        if (!skb) {
                pr_err("Recv datagram failed state %d %d %d\n",
                       sk->sk_state, err, sock_error(sk));

                if (sk->sk_shutdown & RCV_SHUTDOWN)
                        return 0;

                return err;
        }

        rlen = skb->len;                /* real length of skb */
        copied = min_t(unsigned int, rlen, len);

        if (skb_copy_datagram_msg(skb, 0, msg, copied)) {
                /*
                 * A peeked skb is still on the queue and we only hold an
                 * extra reference to it; a dequeued one must be put back.
                 */
                if (flags & MSG_PEEK)
                        skb_free_datagram(sk, skb);
                else
                        skb_queue_head(&sk->sk_receive_queue, skb);
                return -EFAULT;
        }

        sock_recv_timestamp(msg, sk, skb);

        if (sk->sk_type == SOCK_DGRAM && msg->msg_name) {
                struct nfc_llcp_ui_cb *ui_cb = nfc_llcp_ui_skb_cb(skb);
                DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, sockaddr,
                                 msg->msg_name);

                msg->msg_namelen = sizeof(struct sockaddr_nfc_llcp);

                pr_debug("Datagram socket %d %d\n", ui_cb->dsap, ui_cb->ssap);

                memset(sockaddr, 0, sizeof(*sockaddr));
                sockaddr->sa_family = AF_NFC;
                sockaddr->nfc_protocol = NFC_PROTO_NFC_DEP;
                sockaddr->dsap = ui_cb->dsap;
                sockaddr->ssap = ui_cb->ssap;
        }

        if (flags & MSG_PEEK) {
                /*
                 * skb_recv_datagram() took an extra reference for the peek
                 * and left the skb queued; drop that reference so the skb is
                 * not leaked. The skb is not touched again below.
                 */
                skb_free_datagram(sk, skb);
                goto done;
        }

        /* Mark read part of skb as used */

        /* Re-queue the skb if it still contains unreceived data */
        if (sk->sk_type == SOCK_STREAM ||
            sk->sk_type == SOCK_DGRAM ||
            sk->sk_type == SOCK_RAW) {
                skb_pull(skb, copied);
                if (skb->len) {
                        skb_queue_head(&sk->sk_receive_queue, skb);
                        goto done;
                }
        }

        kfree_skb(skb);

        /* XXX Queue backlogged skbs */

done:
        /* SOCK_SEQPACKET: return real length if MSG_TRUNC is set */
        if (sk->sk_type == SOCK_SEQPACKET && (flags & MSG_TRUNC))
                copied = rlen;

        return copied;
}

/*
 * Socket operations installed by llcp_sock_create() for every socket type
 * other than SOCK_RAW.
 */
static const struct proto_ops llcp_sock_ops = {
        .family         = PF_NFC,
        .owner          = THIS_MODULE,
        .bind           = llcp_sock_bind,
        .connect        = llcp_sock_connect,
        .release        = llcp_sock_release,
        .socketpair     = sock_no_socketpair,
        .accept         = llcp_sock_accept,
        .getname        = llcp_sock_getname,
        .poll           = llcp_sock_poll,
        .ioctl          = sock_no_ioctl,
        .listen         = llcp_sock_listen,
        .shutdown       = sock_no_shutdown,
        .setsockopt     = nfc_llcp_setsockopt,
        .getsockopt     = nfc_llcp_getsockopt,
        .sendmsg        = llcp_sock_sendmsg,
        .recvmsg        = llcp_sock_recvmsg,
        .mmap           = sock_no_mmap,
};

/*
 * Socket operations installed by llcp_sock_create() for SOCK_RAW sockets.
 * Raw sockets can bind, poll and receive; connect, accept, listen and sendmsg
 * are wired to the sock_no_*() stubs, and no sockopt handlers are set.
 */
static const struct proto_ops llcp_rawsock_ops = {
        .family         = PF_NFC,
        .owner          = THIS_MODULE,
        .bind           = llcp_raw_sock_bind,
        .connect        = sock_no_connect,
        .release        = llcp_sock_release,
        .socketpair     = sock_no_socketpair,
        .accept         = sock_no_accept,
        .getname        = llcp_sock_getname,
        .poll           = llcp_sock_poll,
        .ioctl          = sock_no_ioctl,
        .listen         = sock_no_listen,
        .shutdown       = sock_no_shutdown,
        .sendmsg        = sock_no_sendmsg,
        .recvmsg        = llcp_sock_recvmsg,
        .mmap           = sock_no_mmap,
};

/*
 * llcp_sock_destruct - sk_destruct callback for LLCP sockets.
 * @sk: socket being destroyed
 *
 * Drops the device reference if the socket is still LLCP_CONNECTED, purges
 * the receive queue and releases the LLCP-specific resources. Logs an error
 * if the socket is not flagged SOCK_DEAD.
 */
static void llcp_sock_destruct(struct sock *sk)
{
        struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);

        pr_debug("%p\n", sk);

        if (sk->sk_state == LLCP_CONNECTED)
                nfc_put_device(llcp_sock->dev);

        skb_queue_purge(&sk->sk_receive_queue);

        nfc_llcp_sock_free(llcp_sock);

        if (sock_flag(sk, SOCK_DEAD))
                return;

        pr_err("Freeing alive NFC LLCP socket %p\n", sk);
}

/*
 * nfc_llcp_sock_alloc - allocate and initialise an LLCP sock.
 * @sock: owning socket, or NULL; when given, its state is set to
 *        SS_UNCONNECTED
 * @type: socket type stored in sk_type
 * @gfp:  allocation flags for sk_alloc()
 * @kern: passed through to sk_alloc()
 *
 * The new sock starts in LLCP_CLOSED state with no reserved SAP. rw and miux
 * are initialised one above their maximum, which nfc_llcp_getsockopt() treats
 * as "use the local's value".
 *
 * Return: the new sock, or NULL if sk_alloc() fails.
 */
struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern)
{
        struct nfc_llcp_sock *llcp;
        struct sock *sk;

        sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto, kern);
        if (!sk)
                return NULL;

        llcp = nfc_llcp_sock(sk);

        /* Generic sock setup */
        sock_init_data(sock, sk);
        sk->sk_state = LLCP_CLOSED;
        sk->sk_protocol = NFC_SOCKPROTO_LLCP;
        sk->sk_type = type;
        sk->sk_destruct = llcp_sock_destruct;

        /* LLCP link parameters */
        llcp->ssap = 0;
        llcp->dsap = LLCP_SAP_SDP;
        llcp->rw = LLCP_MAX_RW + 1;
        llcp->miux = cpu_to_be16(LLCP_MAX_MIUX + 1);

        /* Sequence counters start from zero */
        llcp->send_ack_n = 0;
        llcp->send_n = 0;
        llcp->recv_ack_n = 0;
        llcp->recv_n = 0;

        llcp->remote_ready = 1;
        llcp->reserved_ssap = LLCP_SAP_MAX;
        nfc_llcp_socket_remote_param_init(llcp);

        skb_queue_head_init(&llcp->tx_queue);
        skb_queue_head_init(&llcp->tx_pending_queue);
        INIT_LIST_HEAD(&llcp->accept_queue);

        if (sock)
                sock->state = SS_UNCONNECTED;

        return sk;
}

/*
 * nfc_llcp_sock_free - release the LLCP-specific resources of a socket.
 * @sock: LLCP socket being torn down
 *
 * Frees the service name, purges both transmit queues, detaches the socket
 * from any accept queue and drops its reference on the LLCP local.
 */
void nfc_llcp_sock_free(struct nfc_llcp_sock *sock)
{
        skb_queue_purge(&sock->tx_queue);
        skb_queue_purge(&sock->tx_pending_queue);

        kfree(sock->service_name);

        sock->parent = NULL;
        list_del_init(&sock->accept_queue);

        nfc_llcp_local_put(sock->local);
}

/*
 * llcp_sock_create - create callback for NFC_SOCKPROTO_LLCP sockets.
 * @net:       network namespace (not used here)
 * @sock:      socket being created
 * @nfc_proto: NFC protocol descriptor (not used here)
 * @kern:      passed through to nfc_llcp_sock_alloc()
 *
 * Selects the operations table from the socket type and allocates the sock.
 *
 * Return: 0 on success; -ESOCKTNOSUPPORT unless the type is SOCK_STREAM,
 * SOCK_DGRAM or SOCK_RAW; -EPERM for SOCK_RAW without CAP_NET_RAW; -ENOMEM
 * if the sock cannot be allocated.
 */
static int llcp_sock_create(struct net *net, struct socket *sock,
                            const struct nfc_protocol *nfc_proto, int kern)
{
        pr_debug("%p\n", sock);

        switch (sock->type) {
        case SOCK_RAW:
                if (!capable(CAP_NET_RAW))
                        return -EPERM;
                sock->ops = &llcp_rawsock_ops;
                break;
        case SOCK_STREAM:
        case SOCK_DGRAM:
                sock->ops = &llcp_sock_ops;
                break;
        default:
                return -ESOCKTNOSUPPORT;
        }

        if (!nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern))
                return -ENOMEM;

        return 0;
}

/*
 * NFC protocol descriptor for LLCP, registered by nfc_llcp_sock_init() and
 * unregistered by nfc_llcp_sock_exit(); llcp_sock_create() is its create
 * callback.
 */
static const struct nfc_protocol llcp_nfc_proto = {
        .id          = NFC_SOCKPROTO_LLCP,
        .proto    = &llcp_sock_proto,
        .owner    = THIS_MODULE,
        .create   = llcp_sock_create
};

/* Register the LLCP socket protocol; returns nfc_proto_register()'s result. */
int __init nfc_llcp_sock_init(void)
{
        return nfc_proto_register(&llcp_nfc_proto);
}

/* Unregister the LLCP socket protocol registered by nfc_llcp_sock_init(). */
void nfc_llcp_sock_exit(void)
{
        nfc_proto_unregister(&llcp_nfc_proto);
}











    2 






























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM percpu

#if !defined(_TRACE_PERCPU_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PERCPU_H

#include <linux/tracepoint.h>
#include <trace/events/mmflags.h>

/*
 * percpu_alloc_percpu - tracepoint describing one percpu allocation.
 *
 * Records the caller address, the reserved/atomic flags, the requested
 * size and alignment, the base address and offset, the resulting percpu
 * pointer, the number of bytes allocated and the GFP flags.  The GFP
 * flags are stored as an unsigned long and decoded with show_gfp_flags()
 * when the event is printed.
 */
TRACE_EVENT(percpu_alloc_percpu,

        TP_PROTO(unsigned long call_site,
                 bool reserved, bool is_atomic, size_t size,
                 size_t align, void *base_addr, int off,
                 void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),

        TP_ARGS(call_site, reserved, is_atomic, size, align, base_addr, off,
                ptr, bytes_alloc, gfp_flags),

        TP_STRUCT__entry(
                __field(        unsigned long,                call_site        )
                __field(        bool,                        reserved        )
                __field(        bool,                        is_atomic        )
                __field(        size_t,                        size                )
                __field(        size_t,                        align                )
                __field(        void *,                        base_addr        )
                __field(        int,                        off                )
                __field(        void __percpu *,        ptr                )
                __field(        size_t,                        bytes_alloc        )
                __field(        unsigned long,                gfp_flags        )
        ),
        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->reserved        = reserved;
                __entry->is_atomic        = is_atomic;
                __entry->size                = size;
                __entry->align                = align;
                __entry->base_addr        = base_addr;
                __entry->off                = off;
                __entry->ptr                = ptr;
                __entry->bytes_alloc        = bytes_alloc;
                __entry->gfp_flags        = (__force unsigned long)gfp_flags;
        ),

        TP_printk("call_site=%pS reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p bytes_alloc=%zu gfp_flags=%s",
                  (void *)__entry->call_site,
                  __entry->reserved, __entry->is_atomic,
                  __entry->size, __entry->align,
                  __entry->base_addr, __entry->off, __entry->ptr,
                  __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags))
);

/*
 * percpu_free_percpu - tracepoint describing the release of a percpu area.
 *
 * Records the base address, the offset and the percpu pointer passed in.
 */
TRACE_EVENT(percpu_free_percpu,

        TP_PROTO(void *base_addr, int off, void __percpu *ptr),

        TP_ARGS(base_addr, off, ptr),

        TP_STRUCT__entry(
                __field(        void *,                        base_addr        )
                __field(        int,                        off                )
                __field(        void __percpu *,        ptr                )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
                __entry->off                = off;
                __entry->ptr                = ptr;
        ),

        TP_printk("base_addr=%p off=%d ptr=%p",
                __entry->base_addr, __entry->off, __entry->ptr)
);

/*
 * percpu_alloc_percpu_fail - tracepoint for a failed percpu allocation.
 *
 * Records only the request parameters: the reserved/atomic flags and the
 * requested size and alignment.
 */
TRACE_EVENT(percpu_alloc_percpu_fail,

        TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align),

        TP_ARGS(reserved, is_atomic, size, align),

        TP_STRUCT__entry(
                __field(        bool,        reserved        )
                __field(        bool,        is_atomic        )
                __field(        size_t,        size                )
                __field(        size_t, align                )
        ),

        TP_fast_assign(
                __entry->reserved        = reserved;
                __entry->is_atomic        = is_atomic;
                __entry->size                = size;
                __entry->align                = align;
        ),

        TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu",
                  __entry->reserved, __entry->is_atomic,
                  __entry->size, __entry->align)
);

/*
 * percpu_create_chunk - tracepoint for percpu chunk creation.
 *
 * Records the base address supplied by the caller.
 */
TRACE_EVENT(percpu_create_chunk,

        TP_PROTO(void *base_addr),

        TP_ARGS(base_addr),

        TP_STRUCT__entry(
                __field(        void *, base_addr        )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
        ),

        TP_printk("base_addr=%p", __entry->base_addr)
);

/*
 * percpu_destroy_chunk - tracepoint for percpu chunk destruction.
 *
 * Records the base address supplied by the caller.
 */
TRACE_EVENT(percpu_destroy_chunk,

        TP_PROTO(void *base_addr),

        TP_ARGS(base_addr),

        TP_STRUCT__entry(
                __field(        void *,        base_addr        )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
        ),

        TP_printk("base_addr=%p", __entry->base_addr)
);

#endif /* _TRACE_PERCPU_H */

#include <trace/define_trace.h>
































    2 
    2 











































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 













    1 
    1 





























































































    4 



    2 

































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
/*
 * Mask with the low __PHYSICAL_MASK_SHIFT bits set.  Only built when
 * CONFIG_DYNAMIC_PHYSICAL_MASK is enabled; marked __ro_after_init and
 * exported.
 */
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
#endif

/*
 * Extra GFP bit OR-ed into __userpte_alloc_gfp: __GFP_HIGHMEM when
 * CONFIG_HIGHPTE is enabled, nothing otherwise.
 */
#ifdef CONFIG_HIGHPTE
#define PGTABLE_HIGHMEM __GFP_HIGHMEM
#else
#define PGTABLE_HIGHMEM 0
#endif

#ifndef CONFIG_PARAVIRT
/*
 * Fallback used when CONFIG_PARAVIRT is not set: simply forwards the
 * table to tlb_remove_page().
 */
static inline
void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        tlb_remove_page(tlb, table);
}
#endif

/*
 * GFP flags used by pte_alloc_one().  The __GFP_HIGHMEM bit can be cleared
 * at boot with "userpte=nohigh" (see setup_userpte()).
 */
gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;

/*
 * Allocate one user PTE page for @mm using the __userpte_alloc_gfp flags.
 * Returns whatever __pte_alloc_one() returns.
 */
pgtable_t pte_alloc_one(struct mm_struct *mm)
{
        return __pte_alloc_one(mm, __userpte_alloc_gfp);
}

/*
 * Parse the "userpte=" early parameter.
 *
 * The only accepted value is "nohigh", which stops user page tables from
 * being allocated in high memory.  A missing or unrecognised value is
 * rejected with -EINVAL; success returns 0.
 */
static int __init setup_userpte(char *arg)
{
        if (!arg || strcmp(arg, "nohigh") != 0)
                return -EINVAL;

        __userpte_alloc_gfp &= ~__GFP_HIGHMEM;

        return 0;
}
early_param("userpte", setup_userpte);

/*
 * Release a PTE page through the mmu_gather @tlb: run the page-table
 * destructor, tell paravirt the PTE page frame is being released, then
 * pass the page to paravirt_tlb_remove_table().
 */
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
        pagetable_pte_dtor(page_ptdesc(pte));
        paravirt_release_pte(page_to_pfn(pte));
        paravirt_tlb_remove_table(tlb, pte);
}

#if CONFIG_PGTABLE_LEVELS > 2
/*
 * Release a PMD page through the mmu_gather @tlb: notify paravirt, request
 * a full flush on PAE, run the PMD destructor and pass the backing page to
 * paravirt_tlb_remove_table().
 */
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
        paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
        /*
         * NOTE! For PAE, any changes to the top page-directory-pointer-table
         * entries need a full cr3 reload to flush.
         */
#ifdef CONFIG_X86_PAE
        tlb->need_flush_all = 1;
#endif
        pagetable_pmd_dtor(ptdesc);
        paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc));
}

#if CONFIG_PGTABLE_LEVELS > 3
/*
 * Release a PUD page through the mmu_gather @tlb: run the PUD destructor,
 * notify paravirt, then pass the backing page to
 * paravirt_tlb_remove_table().
 */
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(pud);

        pagetable_pud_dtor(ptdesc);
        paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
        paravirt_tlb_remove_table(tlb, virt_to_page(pud));
}

#if CONFIG_PGTABLE_LEVELS > 4
/*
 * Release a P4D page through the mmu_gather @tlb: notify paravirt, then
 * pass the backing page to paravirt_tlb_remove_table().  No destructor is
 * called at this level.
 */
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
        paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
        paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
}
#endif        /* CONFIG_PGTABLE_LEVELS > 4 */
#endif        /* CONFIG_PGTABLE_LEVELS > 3 */
#endif        /* CONFIG_PGTABLE_LEVELS > 2 */

/* Link the page-table descriptor backing @pgd onto the global pgd_list. */
static inline void pgd_list_add(pgd_t *pgd)
{
        list_add(&virt_to_ptdesc(pgd)->pt_list, &pgd_list);
}

/* Unlink the page-table descriptor backing @pgd from the list it is on. */
static inline void pgd_list_del(pgd_t *pgd)
{
        list_del(&virt_to_ptdesc(pgd)->pt_list);
}

/*
 * Number of pgd entries that are not shared with other page tables:
 * only the entries below KERNEL_PGD_BOUNDARY when the kernel PMD is
 * shared, otherwise all PTRS_PER_PGD entries.
 */
#define UNSHARED_PTRS_PER_PGD                                \
        (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
/* Upper bound of the above, whichever way SHARED_KERNEL_PMD evaluates. */
#define MAX_UNSHARED_PTRS_PER_PGD                        \
        max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)


/* Record @mm as the owner in the page-table descriptor backing @pgd. */
static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
        struct ptdesc *pt = virt_to_ptdesc(pgd);

        pt->pt_mm = mm;
}

struct mm_struct *pgd_page_get_mm(struct page *page)
{
        return page_ptdesc(page)->pt_mm;
}

/*
 * Initialise a freshly allocated @pgd for @mm.
 *
 * When the pgd entries point at a shared page-table level (the ptes in
 * non-PAE, a shared PMD in PAE, or any 4+ level configuration), the kernel
 * part is copied from swapper_pg_dir.  When the kernel PMD is not shared,
 * the pgd is tagged with its mm and put on pgd_list.
 */
static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
        bool clone_kernel_part = CONFIG_PGTABLE_LEVELS == 2 ||
                (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
                CONFIG_PGTABLE_LEVELS >= 4;

        if (clone_kernel_part)
                clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
                                swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                                KERNEL_PGD_PTRS);

        if (SHARED_KERNEL_PMD)
                return;

        /* The list is needed to sync kernel mapping updates. */
        pgd_set_mm(pgd, mm);
        pgd_list_add(pgd);
}

/*
 * Undo pgd_ctor(): when the kernel PMD is not shared, take @pgd off the
 * pgd list while holding pgd_lock.  Nothing to do otherwise.
 */
static void pgd_dtor(pgd_t *pgd)
{
        if (!SHARED_KERNEL_PMD) {
                spin_lock(&pgd_lock);
                pgd_list_del(pgd);
                spin_unlock(&pgd_lock);
        }
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- nyc
 */

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update.  Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 *
 * Also, if we're in a paravirt environment where the kernel pmd is
 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
 * and initialize the kernel pmds here.
 *
 * MAX_PREALLOCATED_PMDS is the bound used to size the on-stack pmds[]
 * array in pgd_alloc(); PREALLOCATED_PMDS is the count actually used.
 */
#define PREALLOCATED_PMDS        UNSHARED_PTRS_PER_PGD
#define MAX_PREALLOCATED_PMDS        MAX_UNSHARED_PTRS_PER_PGD

/*
 * We allocate separate PMDs for the kernel part of the user page-table
 * when PTI is enabled. We need them to map the per-process LDT into the
 * user-space page-table.
 *
 * PREALLOCATED_USER_PMDS is evaluated at run time (it tests the PTI CPU
 * feature); MAX_PREALLOCATED_USER_PMDS is the bound used to size the
 * on-stack u_pmds[] array in pgd_alloc().
 */
#define PREALLOCATED_USER_PMDS         (boot_cpu_has(X86_FEATURE_PTI) ? \
                                        KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS

/*
 * PAE-only: install @pmd into the top-level entry @pudp of @mm.
 *
 * Notifies paravirt of the new PMD page, writes the entry with only
 * _PAGE_PRESENT set, and then flushes the TLB for the whole mm.
 */
void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
        paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);

        /* Note: almost everything apart from _PAGE_PRESENT is
           reserved at the pmd (PDPT) level. */
        set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

        /*
         * According to Intel App note "TLBs, Paging-Structure Caches,
         * and Their Invalidation", April 2007, document 317080-001,
         * section 8.1: in PAE mode we explicitly have to flush the
         * TLB via cr3 if the top-level pgd is changed...
         */
        flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/*
 * No need to prepopulate any pagetable entries in non-PAE modes.
 * With all four values at zero, the pmds[]/u_pmds[] arrays in pgd_alloc()
 * have zero size and its "sizeof(...) != 0" checks skip preallocation.
 */
#define PREALLOCATED_PMDS        0
#define MAX_PREALLOCATED_PMDS        0
#define PREALLOCATED_USER_PMDS         0
#define MAX_PREALLOCATED_USER_PMDS 0
#endif        /* CONFIG_X86_PAE */

/*
 * Free every non-NULL PMD page in @pmds[0..@count): run the PMD
 * destructor, free the page table and drop @mm's PMD count.
 */
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
        int idx;

        for (idx = 0; idx < count; idx++) {
                struct ptdesc *pt;

                if (!pmds[idx])
                        continue;

                pt = virt_to_ptdesc(pmds[idx]);
                pagetable_pmd_dtor(pt);
                pagetable_free(pt);
                mm_dec_nr_pmds(mm);
        }
}

/*
 * Allocate @count PMD pages for @mm and store them in @pmds.
 *
 * Allocation uses GFP_PGTABLE_USER without __GFP_HIGHMEM, and without
 * __GFP_ACCOUNT when @mm is init_mm.  Every slot is attempted even after
 * a failure; a slot whose allocation or constructor failed is left NULL.
 *
 * Returns 0 when all slots were filled.  Otherwise everything that was
 * allocated is released again via free_pmds() and -ENOMEM is returned.
 */
static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
        gfp_t gfp = GFP_PGTABLE_USER;
        bool failed = false;
        int idx;

        if (mm == &init_mm)
                gfp &= ~__GFP_ACCOUNT;
        gfp &= ~__GFP_HIGHMEM;

        for (idx = 0; idx < count; idx++) {
                struct ptdesc *pt = pagetable_alloc(gfp, 0);

                pmds[idx] = NULL;

                if (!pt) {
                        failed = true;
                        continue;
                }

                if (!pagetable_pmd_ctor(pt)) {
                        pagetable_free(pt);
                        failed = true;
                        continue;
                }

                mm_inc_nr_pmds(mm);
                pmds[idx] = ptdesc_address(pt);
        }

        if (!failed)
                return 0;

        free_pmds(mm, pmds, count);
        return -ENOMEM;
}

/*
 * Release a pmd page that is still attached to the pgd entry @pgdp.
 * Normally munmap/exit_mmap frees these, but a preallocated pmd that
 * never got a corresponding vma has to be freed by hand.
 *
 * An empty entry is left alone.  Otherwise the entry is cleared, paravirt
 * is told the PMD frame is released, the page is freed and @mm's PMD
 * count is decremented.
 */
static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
{
        pgd_t entry = *pgdp;
        pmd_t *pmd;

        if (pgd_val(entry) == 0)
                return;

        pmd = (pmd_t *)pgd_page_vaddr(entry);

        pgd_clear(pgdp);

        paravirt_release_pmd(pgd_val(entry) >> PAGE_SHIFT);
        pmd_free(mm, pmd);
        mm_dec_nr_pmds(mm);
}

/*
 * Mop up all preallocated PMDs of the pgd at @pgdp: first the
 * PREALLOCATED_PMDS entries of the kernel page table and then, when PTI
 * is built in and active, the PREALLOCATED_USER_PMDS entries starting at
 * KERNEL_PGD_BOUNDARY in the matching user page table.
 */
static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
        int idx;

        for (idx = 0; idx < PREALLOCATED_PMDS; idx++)
                mop_up_one_pmd(mm, pgdp + idx);

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
        if (boot_cpu_has(X86_FEATURE_PTI)) {
                pgd_t *user_pgdp = kernel_to_user_pgdp(pgdp);

                for (idx = 0; idx < PREALLOCATED_USER_PMDS; idx++)
                        mop_up_one_pmd(mm,
                                       user_pgdp + KERNEL_PGD_BOUNDARY + idx);
        }
#endif
}

/*
 * Install the preallocated PMD pages @pmds into the first
 * PREALLOCATED_PMDS top-level entries of @pgd.  For entries at or above
 * KERNEL_PGD_BOUNDARY the PMD contents are first copied from the
 * corresponding swapper_pg_dir entry.
 */
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
        p4d_t *p4d = p4d_offset(pgd, 0);
        pud_t *pud_base = pud_offset(p4d, 0);
        int idx;

        for (idx = 0; idx < PREALLOCATED_PMDS; idx++) {
                pmd_t *pmd = pmds[idx];

                if (idx >= KERNEL_PGD_BOUNDARY)
                        memcpy(pmd,
                               (pmd_t *)pgd_page_vaddr(swapper_pg_dir[idx]),
                               sizeof(pmd_t) * PTRS_PER_PMD);

                pud_populate(mm, pud_base + idx, pmd);
        }
}

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
 * Install the preallocated PMD pages @pmds into the kernel part (entries
 * from KERNEL_PGD_BOUNDARY upwards) of the user page table that belongs
 * to @k_pgd.  Each PMD page is first filled with a copy of the matching
 * entry's PMD from the user copy of swapper_pg_dir.
 */
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
                                     pgd_t *k_pgd, pmd_t *pmds[])
{
        pgd_t *src_pgd = kernel_to_user_pgdp(swapper_pg_dir);
        pgd_t *user_pgd = kernel_to_user_pgdp(k_pgd);
        p4d_t *user_p4d = p4d_offset(user_pgd, 0);
        pud_t *dst_pud = pud_offset(user_p4d, 0);
        int idx;

        src_pgd += KERNEL_PGD_BOUNDARY;
        dst_pud += KERNEL_PGD_BOUNDARY;

        for (idx = 0; idx < PREALLOCATED_USER_PMDS; idx++) {
                pmd_t *pmd = pmds[idx];

                memcpy(pmd, (pmd_t *)pgd_page_vaddr(src_pgd[idx]),
                       sizeof(pmd_t) * PTRS_PER_PMD);

                pud_populate(mm, dst_pud + idx, pmd);
        }
}
#else
/* Without page-table isolation there is no user page table to populate. */
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
                                     pgd_t *k_pgd, pmd_t *pmds[])
{
}
#endif
/*
 * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
 * assumes that pgd should be in one page.
 *
 * But kernel with PAE paging that is not running as a Xen domain
 * only needs to allocate 32 bytes for pgd instead of one page.
 */
#ifdef CONFIG_X86_PAE

#include <linux/slab.h>

#define PGD_SIZE        (PTRS_PER_PGD * sizeof(pgd_t))
#define PGD_ALIGN        32

static struct kmem_cache *pgd_cache;

/*
 * Create the slab cache used for PAE pgds.
 *
 * With a shared kernel pmd (PAE kernel not running as a Xen domain) the
 * pgd only needs PGD_SIZE bytes, so a dedicated cache with PGD_ALIGN
 * alignment is created at boot.  Without a shared kernel pmd (Xen) a
 * whole page is required for the pgd and no cache is set up.
 */
void __init pgtable_cache_init(void)
{
        if (SHARED_KERNEL_PMD)
                pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE,
                                              PGD_ALIGN, SLAB_PANIC, NULL);
}

/*
 * Allocate backing memory for a PAE pgd.
 *
 * With a shared kernel pmd (not a Xen domain) the pgd comes from the
 * small pgd_cache slab to save memory; otherwise (Xen domain) full
 * page(s) of order PGD_ALLOCATION_ORDER are allocated.
 */
static inline pgd_t *_pgd_alloc(void)
{
        if (SHARED_KERNEL_PMD)
                return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER);

        return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
                                         PGD_ALLOCATION_ORDER);
}

/*
 * Free a PAE pgd with the allocator that _pgd_alloc() used for it:
 * the pgd_cache slab when the kernel pmd is shared, the page allocator
 * otherwise.
 */
static inline void _pgd_free(pgd_t *pgd)
{
        if (SHARED_KERNEL_PMD) {
                kmem_cache_free(pgd_cache, pgd);
                return;
        }

        free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#else

/*
 * Non-PAE variant: the pgd always comes from the page allocator, using
 * GFP_PGTABLE_USER and order PGD_ALLOCATION_ORDER.
 */
static inline pgd_t *_pgd_alloc(void)
{
        return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
                                         PGD_ALLOCATION_ORDER);
}

/* Non-PAE variant: return the pgd page(s) to the page allocator. */
static inline void _pgd_free(pgd_t *pgd)
{
        free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */

/*
 * Allocate and initialise a page global directory for @mm.
 *
 * The pgd itself comes from _pgd_alloc().  The kernel-side and user-side
 * PMD pages are then preallocated (each step is skipped when the
 * corresponding array has zero size), paravirt_pgd_alloc() is called, and
 * finally the pgd is constructed and pre-populated under pgd_lock.
 *
 * Returns the new pgd, or NULL if any step fails; on failure everything
 * allocated so far is released in reverse order.  Note that mm->pgd is
 * assigned before the later steps and is not reset on the failure paths.
 */
pgd_t *pgd_alloc(struct mm_struct *mm)
{
        pgd_t *pgd;
        pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
        pmd_t *pmds[MAX_PREALLOCATED_PMDS];

        pgd = _pgd_alloc();

        if (pgd == NULL)
                goto out;

        mm->pgd = pgd;

        /* sizeof() is 0 when the MAX_* bound is 0, i.e. nothing to preallocate. */
        if (sizeof(pmds) != 0 &&
                        preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
                goto out_free_pgd;

        if (sizeof(u_pmds) != 0 &&
                        preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
                goto out_free_pmds;

        if (paravirt_pgd_alloc(mm) != 0)
                goto out_free_user_pmds;

        /*
         * Make sure that pre-populating the pmds is atomic with
         * respect to anything walking the pgd_list, so that they
         * never see a partially populated pgd.
         */
        spin_lock(&pgd_lock);

        pgd_ctor(mm, pgd);
        if (sizeof(pmds) != 0)
                pgd_prepopulate_pmd(mm, pgd, pmds);

        if (sizeof(u_pmds) != 0)
                pgd_prepopulate_user_pmd(mm, pgd, u_pmds);

        spin_unlock(&pgd_lock);

        return pgd;

        /* Error unwinding: each label falls through to the ones below it. */
out_free_user_pmds:
        if (sizeof(u_pmds) != 0)
                free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
        if (sizeof(pmds) != 0)
                free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
        _pgd_free(pgd);
out:
        return NULL;
}

/*
 * Tear down the pgd of @mm: mop up any PMDs still attached, run
 * pgd_dtor(), notify paravirt, and finally free the pgd memory.
 */
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        pgd_mop_up_pmds(mm, pgd);
        pgd_dtor(pgd);
        paravirt_pgd_free(mm, pgd);
        _pgd_free(pgd);
}

/*
 * On other architectures this is used to set the accessed or dirty bits
 * in a page table entry.  x86 hardware tracks those bits itself, but
 * do_wp_page also calls this to make the pte writeable at the same time
 * the dirty bit is set, and in that case the PTE really must be written.
 *
 * Returns 1 if *@ptep differed from @entry, 0 otherwise.  The entry is
 * only written when it differed and @dirty is non-zero.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pte_t *ptep,
                          pte_t entry, int dirty)
{
        if (pte_same(*ptep, entry))
                return 0;

        if (dirty)
                set_pte(ptep, entry);

        return 1;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * PMD counterpart of ptep_set_access_flags() for huge pages.
 *
 * @address must be aligned to HPAGE_PMD_SIZE.  Returns 1 if *@pmdp
 * differed from @entry, 0 otherwise; the entry is only written when it
 * differed and @dirty is non-zero.
 */
int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        int changed = !pmd_same(*pmdp, entry);

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        if (!changed)
                return changed;

        /*
         * We had a write-protection fault here and changed the pmd
         * to be more permissive. No need to flush the TLB for that,
         * #PF is architecturally guaranteed to do that and in the
         * worst-case we'll generate a spurious fault.
         */
        if (dirty)
                set_pmd(pmdp, entry);

        return changed;
}

/*
 * PUD counterpart of ptep_set_access_flags() for huge pages.
 *
 * @address must be aligned to HPAGE_PUD_SIZE.  Returns 1 if *@pudp
 * differed from @entry, 0 otherwise; the entry is only written when it
 * differed and @dirty is non-zero.
 */
int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pud_t *pudp, pud_t entry, int dirty)
{
        int changed = !pud_same(*pudp, entry);

        VM_BUG_ON(address & ~HPAGE_PUD_MASK);

        if (!changed)
                return changed;

        /*
         * We had a write-protection fault here and changed the pud
         * to be more permissive. No need to flush the TLB for that,
         * #PF is architecturally guaranteed to do that and in the
         * worst-case we'll generate a spurious fault.
         */
        if (dirty)
                set_pud(pudp, entry);

        return changed;
}
#endif

/*
 * Clear the accessed bit of *@ptep if it is set.
 *
 * Returns 0 without touching the entry when the pte is not young;
 * otherwise returns the result of the atomic test-and-clear of
 * _PAGE_BIT_ACCESSED.
 */
int ptep_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *ptep)
{
        if (!pte_young(*ptep))
                return 0;

        return test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                  (unsigned long *) &ptep->pte);
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
/*
 * Clear the accessed bit of *@pmdp if it is set.
 *
 * Returns 0 without touching the entry when the pmd is not young;
 * otherwise returns the result of the atomic test-and-clear of
 * _PAGE_BIT_ACCESSED.
 */
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pmd_t *pmdp)
{
        if (!pmd_young(*pmdp))
                return 0;

        return test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                  (unsigned long *)pmdp);
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Clear the accessed bit of *@pudp if it is set.
 *
 * Returns 0 without touching the entry when the pud is not young;
 * otherwise returns the result of the atomic test-and-clear of
 * _PAGE_BIT_ACCESSED.
 */
int pudp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pud_t *pudp)
{
        if (!pud_young(*pudp))
                return 0;

        return test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                  (unsigned long *)pudp);
}
#endif

/*
 * Clear the accessed bit of *@ptep and report whether it was set.
 * Despite the name, no TLB flush is performed here; see below.
 */
int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep)
{
        /*
         * On x86 CPUs, clearing the accessed bit without a TLB flush
         * doesn't cause data corruption. [ It could cause incorrect
         * page aging and the (mistaken) reclaim of hot pages, but the
         * chance of that should be relatively low. ]
         *
         * So as a performance optimization don't flush the TLB when
         * clearing the accessed bit, it will eventually be flushed by
         * a context switch or a VM operation anyway. [ In the rare
         * event of it not getting flushed for a long time the delay
         * shouldn't really matter because there's no real memory
         * pressure for swapout to react to. ]
         */
        return ptep_test_and_clear_young(vma, address, ptep);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Clear the accessed bit of the huge pmd at @pmdp and, if it was set,
 * flush the TLB for the HPAGE_PMD_SIZE range starting at @address.
 *
 * @address must be aligned to HPAGE_PMD_SIZE.  Returns the value from
 * pmdp_test_and_clear_young().
 */
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        int was_young;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        was_young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (!was_young)
                return was_young;

        flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return was_young;
}

/*
 * Replace *@pmdp with its invalidated form and return the value from
 * pmdp_establish().  Warns once if the pmd was not present.
 */
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
                         pmd_t *pmdp)
{
        pmd_t invalid;

        VM_WARN_ON_ONCE(!pmd_present(*pmdp));

        invalid = pmd_mkinvalid(*pmdp);

        /*
         * No flush is necessary. Once an invalid PTE is established, the PTE's
         * access and dirty bits cannot be updated.
         */
        return pmdp_establish(vma, address, pmdp, invalid);
}
#endif

/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve: size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 *
 * Only has an effect on CONFIG_X86_32, and must run before any fixmap
 * has been set (it BUGs if fixmaps_set is already non-zero).
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
        BUG_ON(fixmaps_set > 0);
        __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
        printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
               -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}

/* Incremented by __native_set_fixmap() each time a fixmap entry is set. */
int fixmaps_set;

/*
 * Install @pte at the virtual address of fixmap slot @idx and bump
 * fixmaps_set.  An out-of-range @idx triggers BUG() and nothing is
 * installed.
 */
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
        unsigned long vaddr = __fix_to_virt(idx);

#ifdef CONFIG_X86_64
       /*
        * The static initial page tables must cover the whole fixmap;
        * enforce that at build time.
        */
        BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
                     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
#endif

        if (idx < __end_of_fixed_addresses) {
                set_pte_vaddr(vaddr, pte);
                fixmaps_set++;
                return;
        }

        BUG();
}

/*
 * Map physical address @phys at fixmap slot @idx with protection @flags,
 * after masking @flags with __default_kernel_pte_mask.
 */
void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
                       phys_addr_t phys, pgprot_t flags)
{
        /* Sanitize 'flags' against any unsupported bits: */
        pgprot_val(flags) &= __default_kernel_pte_mask;

        __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#ifdef CONFIG_X86_5LEVEL
/**
 * p4d_set_huge - setup kernel P4D mapping
 * @p4d: P4D entry (unused)
 * @addr: physical address (unused)
 * @prot: protection bits (unused)
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}

/**
 * p4d_clear_huge - clear kernel P4D mapping when it is set
 * @p4d: P4D entry (unused)
 *
 * No 512GB pages yet -- nothing to clear, so this is a no-op.
 */
void p4d_clear_huge(p4d_t *p4d)
{
}
#endif

/**
 * pud_set_huge - setup kernel PUD mapping
 * @pud: PUD entry to turn into a huge mapping
 * @addr: physical address to map
 * @prot: 4k-style protection bits for the mapping
 *
 * MTRRs can override PAT memory types with 4KiB granularity, so a huge
 * page is only installed when the whole PUD_SIZE range has a uniform
 * MTRR caching mode.  An entry that is present but not a leaf (i.e. it
 * points to a lower-level table) is left untouched.
 *
 * When this fails, callers should retry with a smaller page size
 * (1GB -> 2MB -> 4K).
 *
 * Returns 1 on success and 0 on failure.
 */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
        u8 uniform;
        pte_t huge_pte;

        mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);

        /* Refuse non-uniform MTRR ranges and populated non-leaf entries. */
        if (!uniform || (pud_present(*pud) && !pud_leaf(*pud)))
                return 0;

        huge_pte = pfn_pte((u64)addr >> PAGE_SHIFT,
                           __pgprot(protval_4k_2_large(pgprot_val(prot)) |
                                    _PAGE_PSE));
        set_pte((pte_t *)pud, huge_pte);

        return 1;
}

/**
 * pmd_set_huge - setup kernel PMD mapping
 * @pmd: PMD entry to turn into a huge mapping
 * @addr: physical address to map
 * @prot: 4k-style protection bits for the mapping
 *
 * Same rules as pud_set_huge(): the PMD_SIZE range must have a uniform
 * MTRR caching mode (a one-time warning is printed otherwise), and an
 * entry that is present but not a leaf is left untouched.
 *
 * Returns 1 on success and 0 on failure.
 */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
        u8 uniform;
        pte_t huge_pte;

        mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
        if (!uniform) {
                pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
                             __func__, addr, addr + PMD_SIZE);
                return 0;
        }

        /* Bail out if we are on a populated non-leaf entry: */
        if (pmd_present(*pmd) && !pmd_leaf(*pmd))
                return 0;

        huge_pte = pfn_pte((u64)addr >> PAGE_SHIFT,
                           __pgprot(protval_4k_2_large(pgprot_val(prot)) |
                                    _PAGE_PSE));
        set_pte((pte_t *)pmd, huge_pte);

        return 1;
}

/**
 * pud_clear_huge - clear kernel PUD mapping when it is set
 * @pud: PUD entry to inspect
 *
 * Clears the entry only if it is a leaf (huge) mapping.
 *
 * Returns 1 on success and 0 on failure (no PUD map is found).
 */
int pud_clear_huge(pud_t *pud)
{
        if (!pud_leaf(*pud))
                return 0;

        pud_clear(pud);
        return 1;
}

/**
 * pmd_clear_huge - clear a kernel PMD mapping if it is a leaf
 * @pmd: PMD entry to inspect and, when it is a leaf, clear
 *
 * Returns 1 when a leaf entry was cleared and 0 when @pmd is not a leaf
 * (no PMD map is found).
 */
int pmd_clear_huge(pmd_t *pmd)
{
        if (!pmd_leaf(*pmd))
                return 0;

        pmd_clear(pmd);
        return 1;
}

#ifdef CONFIG_X86_64
/**
 * pud_free_pmd_page - Clear pud entry and free pmd page.
 * @pud: Pointer to a PUD.
 * @addr: Virtual address associated with pud.
 *
 * Tears down the PMD table referenced by @pud. Every PMD entry is copied to
 * a temporary page and cleared, @pud itself is cleared and the TLB is
 * flushed; only after that are the PTE pages named by the saved entries, the
 * temporary page and the PMD page itself released.
 *
 * Context: The pud range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise, which happens when
 * the temporary page cannot be allocated; nothing has been modified then.
 *
 * NOTE: Callers must allow a single page allocation.
 */
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        pmd_t *pmd, *pmd_sv;
        pte_t *pte;
        int i;

        /* PMD table that *pud currently refers to. */
        pmd = pud_pgtable(*pud);
        /* Scratch page that keeps a copy of the PMD entries past the flush. */
        pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
        if (!pmd_sv)
                return 0;

        /* Save each entry, then clear the ones that are populated. */
        for (i = 0; i < PTRS_PER_PMD; i++) {
                pmd_sv[i] = pmd[i];
                if (!pmd_none(pmd[i]))
                        pmd_clear(&pmd[i]);
        }

        pud_clear(pud);

        /* INVLPG to clear all paging-structure caches */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

        /* Pages are released only now, after the flush above. */
        for (i = 0; i < PTRS_PER_PMD; i++) {
                if (!pmd_none(pmd_sv[i])) {
                        pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
                        free_page((unsigned long)pte);
                }
        }

        free_page((unsigned long)pmd_sv);

        /* Run the PMD page-table destructor before returning the page. */
        pagetable_pmd_dtor(virt_to_ptdesc(pmd));
        free_page((unsigned long)pmd);

        return 1;
}

/**
 * pmd_free_pte_page - Clear pmd entry and free pte page.
 * @pmd: Pointer to a PMD.
 * @addr: Virtual address associated with pmd.
 *
 * The address of the PTE page is read from @pmd first, then @pmd is cleared
 * and the TLB flushed, and only afterwards is the PTE page freed.
 *
 * Context: The pmd range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise. This
 * implementation always returns 1.
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        pte_t *pte;

        /* Remember the PTE page before the entry that names it is cleared. */
        pte = (pte_t *)pmd_page_vaddr(*pmd);
        pmd_clear(pmd);

        /* INVLPG to clear all paging-structure caches */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

        /* Freed only after the flush above. */
        free_page((unsigned long)pte);

        return 1;
}

#else /* !CONFIG_X86_64 */

/*
 * Disable free page handling on x86-PAE. This assures that ioremap()
 * does not update sync'd pmd entries. See vmalloc_sync_one().
 *
 * Nothing is cleared or freed here: the result is non-zero only when *pmd
 * is already empty (pmd_none()), and 0 for any populated entry.
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        return pmd_none(*pmd);
}

#endif /* CONFIG_X86_64 */
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMAP */

/*
 * Make @pte writable in the way that suits @vma.
 *
 * A VM_SHADOW_STACK mapping gets pte_mkwrite_shstk(). Every other mapping
 * gets pte_mkwrite_novma() followed by pte_clear_saveddirty().
 */
pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
        if (!(vma->vm_flags & VM_SHADOW_STACK))
                return pte_clear_saveddirty(pte_mkwrite_novma(pte));

        return pte_mkwrite_shstk(pte);
}

/*
 * Make @pmd writable in the way that suits @vma.
 *
 * A VM_SHADOW_STACK mapping gets pmd_mkwrite_shstk(). Every other mapping
 * gets pmd_mkwrite_novma() followed by pmd_clear_saveddirty().
 */
pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
        if (!(vma->vm_flags & VM_SHADOW_STACK))
                return pmd_clear_saveddirty(pmd_mkwrite_novma(pmd));

        return pmd_mkwrite_shstk(pmd);
}

/*
 * Debug check on a PTE: warn once when @pte tests true for pte_shstk()
 * although @vma does not have VM_SHADOW_STACK set.
 */
void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
{
        /*
         * Hardware before shadow stack can (rarely) set Dirty=1
         * on a Write=0 PTE. So the below condition
         * only indicates a software bug when shadow stack is
         * supported by the HW. This checking is covered in
         * pte_shstk().
         */
        VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
                        pte_shstk(pte));
}

/*
 * Debug check on a PMD: warn once when @pmd tests true for pmd_shstk()
 * although @vma does not have VM_SHADOW_STACK set.
 */
void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
{
        /* See note in arch_check_zapped_pte() */
        VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
                        pmd_shstk(pmd));
}







































































































































































































































    1 




    1 

    1 

    1 



















    1 




    1 
    1 











    1 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2002-2005, Instant802 Networks, Inc.
 * Copyright 2006-2007        Jiri Benc <jbenc@suse.cz>
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright (C) 2015 - 2017 Intel Deutschland GmbH
 * Copyright (C) 2018-2023 Intel Corporation
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/skbuff.h>
#include <linux/if_arp.h>
#include <linux/timer.h>
#include <linux/rtnetlink.h>

#include <net/codel.h>
#include <net/mac80211.h>
#include "ieee80211_i.h"
#include "driver-ops.h"
#include "rate.h"
#include "sta_info.h"
#include "debugfs_sta.h"
#include "mesh.h"
#include "wme.h"

/**
 * DOC: STA information lifetime rules
 *
 * STA info structures (&struct sta_info) are managed in a hash table
 * for faster lookup and a list for iteration. They are managed using
 * RCU, i.e. access to the list and hash table is protected by RCU.
 *
 * Upon allocating a STA info structure with sta_info_alloc(), the caller
 * owns that structure. It must then insert it into the hash table using
 * either sta_info_insert() or sta_info_insert_rcu(); only in the latter
 * case (which acquires an rcu read section but must not be called from
 * within one) will the pointer still be valid after the call. Note that
 * the caller may not do much with the STA info before inserting it; in
 * particular, it may not start any mesh peer link management or add
 * encryption keys.
 *
 * When the insertion fails (sta_info_insert() returns non-zero), the
 * structure will have been freed by sta_info_insert()!
 *
 * Station entries are added by mac80211 when you establish a link with a
 * peer. This means different things for the different types of interfaces
 * we support. For a regular station this means we add the AP sta when we
 * receive an association response from the AP. For IBSS this occurs when
 * we get to know about a peer on the same IBSS. For WDS we add the sta for
 * the peer immediately upon device open. When using AP mode we add stations
 * for each respective station upon request from userspace through nl80211.
 *
 * In order to remove a STA info structure, various sta_info_destroy_*()
 * calls are available.
 *
 * There is no concept of ownership on a STA entry; each structure is
 * owned by the global hash table/list until it is removed. All users of
 * the structure need to be RCU protected so that the structure won't be
 * freed before they are done using it.
 */

/*
 * Container for a station link other than the embedded default link:
 * sta_remove_link() recovers it from a link_sta_info pointer with
 * container_of(..., info) whenever that pointer is not &sta->deflink.
 */
struct sta_link_alloc {
        /* Per-link station info; the member container_of() is applied to. */
        struct link_sta_info info;
        /* NOTE(review): presumably the driver-visible per-link data -- confirm. */
        struct ieee80211_link_sta sta;
        /* NOTE(review): presumably used for RCU-deferred freeing -- confirm. */
        struct rcu_head rcu_head;
};

/*
 * Hash table parameters for local->sta_hash: entries are struct sta_info,
 * linked through ->hash_node and keyed by the ETH_ALEN bytes of ->addr.
 */
static const struct rhashtable_params sta_rht_params = {
        .nelem_hint = 3, /* start small */
        .automatic_shrinking = true,
        .head_offset = offsetof(struct sta_info, hash_node),
        .key_offset = offsetof(struct sta_info, addr),
        .key_len = ETH_ALEN,
        /* upper bound taken from the kernel configuration */
        .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE,
};

/*
 * Hash table parameters for local->link_sta_hash: entries are
 * struct link_sta_info, linked through ->link_hash_node and keyed by the
 * ETH_ALEN bytes of ->addr.
 */
static const struct rhashtable_params link_sta_rht_params = {
        .nelem_hint = 3, /* start small */
        .automatic_shrinking = true,
        .head_offset = offsetof(struct link_sta_info, link_hash_node),
        .key_offset = offsetof(struct link_sta_info, addr),
        .key_len = ETH_ALEN,
        /* upper bound taken from the kernel configuration */
        .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE,
};

/*
 * Unlink @sta from the station hash table of @local.
 * Returns whatever rhltable_remove() returns.
 */
static int sta_info_hash_del(struct ieee80211_local *local,
                             struct sta_info *sta)
{
        struct rhlist_head *node = &sta->hash_node;

        return rhltable_remove(&local->sta_hash, node, sta_rht_params);
}

/*
 * Insert @link_sta into the link-station hash table of @local.
 * The wiphy lock must be held (asserted through lockdep).
 * Returns whatever rhltable_insert() returns.
 */
static int link_sta_info_hash_add(struct ieee80211_local *local,
                                  struct link_sta_info *link_sta)
{
        struct rhlist_head *node = &link_sta->link_hash_node;

        lockdep_assert_wiphy(local->hw.wiphy);

        return rhltable_insert(&local->link_sta_hash, node,
                               link_sta_rht_params);
}

/*
 * Remove @link_sta from the link-station hash table of @local.
 * The wiphy lock must be held (asserted through lockdep).
 * Returns whatever rhltable_remove() returns.
 */
static int link_sta_info_hash_del(struct ieee80211_local *local,
                                  struct link_sta_info *link_sta)
{
        struct rhlist_head *node = &link_sta->link_hash_node;

        lockdep_assert_wiphy(local->hw.wiphy);

        return rhltable_remove(&local->link_sta_hash, node,
                               link_sta_rht_params);
}

/*
 * Purge every TX queue that @sta owns. Slots of sta->sta.txq[] that hold no
 * queue are skipped.
 */
void ieee80211_purge_sta_txqs(struct sta_info *sta)
{
        struct ieee80211_local *local = sta->sdata->local;
        int idx;

        for (idx = 0; idx < ARRAY_SIZE(sta->sta.txq); idx++) {
                struct ieee80211_txq *txq = sta->sta.txq[idx];

                if (txq)
                        ieee80211_txq_purge(local, to_txq_info(txq));
        }
}

/*
 * Release what is attached to @sta without freeing @sta itself
 * (cleanup_single_sta() does that afterwards through sta_info_free()).
 *
 * In order: undo powersave accounting, purge the TXQs and the per-AC
 * powersave/filtered queues, run mesh cleanup on mesh interfaces, cancel
 * the driver-deliver work and free the aggregation state of every TID.
 */
static void __cleanup_single_sta(struct sta_info *sta)
{
        int ac, i;
        struct tid_ampdu_tx *tid_tx;
        struct ieee80211_sub_if_data *sdata = sta->sdata;
        struct ieee80211_local *local = sdata->local;
        struct ps_data *ps;

        /* The station is counted in num_sta_ps if any powersave flag is set. */
        if (test_sta_flag(sta, WLAN_STA_PS_STA) ||
            test_sta_flag(sta, WLAN_STA_PS_DRIVER) ||
            test_sta_flag(sta, WLAN_STA_PS_DELIVER)) {
                /* Pick the powersave bookkeeping that matches the interface. */
                if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
                    sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
                        ps = &sdata->bss->ps;
                else if (ieee80211_vif_is_mesh(&sdata->vif))
                        ps = &sdata->u.mesh.ps;
                else
                        /*
                         * NOTE(review): this return also skips all of the
                         * cleanup below for other interface types -- confirm
                         * that this is intended.
                         */
                        return;

                clear_sta_flag(sta, WLAN_STA_PS_STA);
                clear_sta_flag(sta, WLAN_STA_PS_DRIVER);
                clear_sta_flag(sta, WLAN_STA_PS_DELIVER);

                atomic_dec(&ps->num_sta_ps);
        }

        ieee80211_purge_sta_txqs(sta);

        for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
                /* Adjust the global count before the queue is emptied. */
                local->total_ps_buffered -= skb_queue_len(&sta->ps_tx_buf[ac]);
                ieee80211_purge_tx_queue(&local->hw, &sta->ps_tx_buf[ac]);
                ieee80211_purge_tx_queue(&local->hw, &sta->tx_filtered[ac]);
        }

        if (ieee80211_vif_is_mesh(&sdata->vif))
                mesh_sta_cleanup(sta);

        cancel_work_sync(&sta->drv_deliver_wk);

        /*
         * Destroy aggregation state here. It would be nice to wait for the
         * driver to finish aggregation stop and then clean up, but for now
         * drivers have to handle aggregation stop being requested, followed
         * directly by station destruction.
         */
        for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
                kfree(sta->ampdu_mlme.tid_start_tx[i]);
                tid_tx = rcu_dereference_raw(sta->ampdu_mlme.tid_tx[i]);
                if (!tid_tx)
                        continue;
                /* Drop frames still pending on this TID, then the state. */
                ieee80211_purge_tx_queue(&local->hw, &tid_tx->pending);
                kfree(tid_tx);
        }
}

/*
 * Tear down and free @sta: release its attached resources with
 * __cleanup_single_sta(), then hand the structure to sta_info_free().
 */
static void cleanup_single_sta(struct sta_info *sta)
{
        /* Looked up before the cleanup runs, as the original code did. */
        struct ieee80211_local *local = sta->sdata->local;

        __cleanup_single_sta(sta);
        sta_info_free(local, sta);
}

/*
 * Look up @addr in the station hash table of @local and return the result
 * of rhltable_lookup() unchanged.
 */
struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local,
                                         const u8 *addr)
{
        struct rhltable *table = &local->sta_hash;

        return rhltable_lookup(table, addr, sta_rht_params);
}

/*
 * Find the station with address @addr that belongs to @sdata.
 *
 * Protected by RCU. The lookup runs in its own RCU read section; returning
 * the pointer after leaving it is safe only because the caller must already
 * hold another RCU read section or the mutex.
 *
 * Returns the matching station or NULL.
 */
struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
                              const u8 *addr)
{
        struct ieee80211_local *local = sdata->local;
        struct sta_info *match = NULL;
        struct rhlist_head *tmp;
        struct sta_info *sta;

        rcu_read_lock();
        for_each_sta_info(local, addr, sta, tmp) {
                if (sta->sdata != sdata)
                        continue;

                match = sta;
                break;
        }
        rcu_read_unlock();

        return match;
}

/*
 * Find the station with address @addr either on @sdata itself or on any
 * interface that shares a non-NULL bss with it (i.e. one of its vlans).
 *
 * The lookup runs in its own RCU read section; returning the pointer after
 * leaving it is safe only because the caller must already hold another RCU
 * read section or the mutex.
 *
 * Returns the matching station or NULL.
 */
struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
                                  const u8 *addr)
{
        struct ieee80211_local *local = sdata->local;
        struct sta_info *match = NULL;
        struct rhlist_head *tmp;
        struct sta_info *sta;

        rcu_read_lock();
        for_each_sta_info(local, addr, sta, tmp) {
                struct ieee80211_sub_if_data *owner = sta->sdata;

                if (owner == sdata ||
                    (owner->bss && owner->bss == sdata->bss)) {
                        match = sta;
                        break;
                }
        }
        rcu_read_unlock();

        return match;
}

/*
 * Look up @addr in the link-station hash table of @local and return the
 * result of rhltable_lookup() unchanged.
 */
struct rhlist_head *link_sta_info_hash_lookup(struct ieee80211_local *local,
                                              const u8 *addr)
{
        struct rhltable *table = &local->link_sta_hash;

        return rhltable_lookup(table, addr, link_sta_rht_params);
}

/*
 * Find the link station with address @addr whose owning station sits on
 * @sdata or on an interface that shares a non-NULL bss with it.
 *
 * The lookup runs in its own RCU read section; returning the pointer after
 * leaving it is safe only because the caller must already hold another RCU
 * read section or the mutex.
 *
 * Returns the matching link station or NULL.
 */
struct link_sta_info *
link_sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr)
{
        struct ieee80211_local *local = sdata->local;
        struct link_sta_info *match = NULL;
        struct link_sta_info *link_sta;
        struct rhlist_head *tmp;

        rcu_read_lock();
        for_each_link_sta_info(local, addr, link_sta, tmp) {
                struct ieee80211_sub_if_data *owner = link_sta->sta->sdata;

                if (owner == sdata ||
                    (owner->bss && owner->bss == sdata->bss)) {
                        match = link_sta;
                        break;
                }
        }
        rcu_read_unlock();

        return match;
}

/*
 * Find a station by the address of one of its links.
 *
 * @hw: hardware the lookup is done for
 * @addr: link address of the station
 * @localaddr: local link address to match, or NULL to accept the first
 *      link station found for @addr
 * @link_id: if non-NULL, receives the link ID of the match
 *
 * No RCU read section is taken here although rcu_dereference() is used, so
 * callers are expected to provide that protection themselves.
 *
 * Returns the matching station or NULL.
 */
struct ieee80211_sta *
ieee80211_find_sta_by_link_addrs(struct ieee80211_hw *hw,
                                 const u8 *addr,
                                 const u8 *localaddr,
                                 unsigned int *link_id)
{
        struct ieee80211_local *local = hw_to_local(hw);
        struct link_sta_info *link_sta;
        struct rhlist_head *tmp;

        for_each_link_sta_info(local, addr, link_sta, tmp) {
                struct sta_info *sta = link_sta->sta;
                u8 id = link_sta->link_id;

                /* The local address is only checked when one was given. */
                if (localaddr) {
                        struct ieee80211_link_data *link;

                        link = rcu_dereference(sta->sdata->link[id]);
                        if (!link ||
                            memcmp(link->conf->addr, localaddr, ETH_ALEN))
                                continue;
                }

                if (link_id)
                        *link_id = id;
                return &sta->sta;
        }

        return NULL;
}
EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_link_addrs);

/*
 * Find the station with address @sta_addr whose interface address is
 * @vif_addr.
 *
 * Returns the first candidate from for_each_sta_info() for @sta_addr
 * whose sdata->vif.addr equals @vif_addr, or NULL if there is none.
 */
struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local,
                                       const u8 *sta_addr, const u8 *vif_addr)
{
        struct sta_info *cand;
        struct rhlist_head *pos;

        for_each_sta_info(local, sta_addr, cand, pos) {
                if (!ether_addr_equal(vif_addr, cand->sdata->vif.addr))
                        continue;

                return cand;
        }

        return NULL;
}

/*
 * Return the @idx-th station (counting from 0) on @sdata, in the order
 * of local->sta_list, or NULL if @sdata has fewer stations than that.
 * An @idx of zero or less yields the first station of @sdata.
 *
 * The list walk is annotated as legal under the wiphy mutex (or RCU).
 */
struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata,
                                     int idx)
{
        struct ieee80211_local *local = sdata->local;
        struct sta_info *sta;
        int to_skip = idx;

        list_for_each_entry_rcu(sta, &local->sta_list, list,
                                lockdep_is_held(&local->hw.wiphy->mtx)) {
                /* stations of other interfaces do not count */
                if (sta->sdata != sdata)
                        continue;

                if (to_skip <= 0)
                        return sta;

                to_skip--;
        }

        return NULL;
}

/*
 * Release the per-CPU RX statistics of @link_sta with free_percpu().
 *
 * Only ->pcpu_rx_stats is freed; the link_sta_info structure itself is
 * left to the caller. sta_info_alloc_link() allocates these statistics
 * only when the hardware sets USES_RSS.
 */
static void sta_info_free_link(struct link_sta_info *link_sta)
{
        free_percpu(link_sta->pcpu_rx_stats);
}

/*
 * Detach the link STA stored under @link_id from @sta.
 *
 * The wiphy lock must be held (asserted below). Warns and returns if no
 * link STA is set up for @link_id. With @unhash set, the entry is also
 * taken out of the link STA hash. A link STA other than the embedded
 * deflink lives inside a struct sta_link_alloc, which is released here;
 * the deflink is not freed by this function.
 */
static void sta_remove_link(struct sta_info *sta, unsigned int link_id,
                            bool unhash)
{
        struct sta_link_alloc *alloc = NULL;
        struct link_sta_info *link_sta;

        lockdep_assert_wiphy(sta->local->hw.wiphy);

        link_sta = rcu_access_pointer(sta->link[link_id]);
        if (WARN_ON(!link_sta))
                return;

        if (unhash)
                link_sta_info_hash_del(sta->local, link_sta);

        /* link debugfs entries are only removed for an inserted station */
        if (test_sta_flag(sta, WLAN_STA_INSERTED))
                ieee80211_link_sta_debugfs_remove(link_sta);

        /* anything but the embedded deflink is part of a sta_link_alloc */
        if (link_sta != &sta->deflink)
                alloc = container_of(link_sta, typeof(*alloc), info);

        /* clear the valid bit and both link pointers before freeing */
        sta->sta.valid_links &= ~BIT(link_id);
        RCU_INIT_POINTER(sta->link[link_id], NULL);
        RCU_INIT_POINTER(sta->sta.link[link_id], NULL);
        if (alloc) {
                /* per-CPU stats are freed now, the container via kfree_rcu() */
                sta_info_free_link(&alloc->info);
                kfree_rcu(alloc, rcu_head);
        }

        ieee80211_sta_recalc_aggregates(&sta->sta);
}

/**
 * sta_info_free - free STA
 *
 * @local: pointer to the global information
 * @sta: STA info to free
 *
 * This function must undo everything done by sta_info_alloc()
 * that may happen before sta_info_insert(). It may only be
 * called when sta_info_insert() has not been attempted (and
 * if that fails, the station is freed anyway.)
 *
 * In order, it removes every link that is still set up, walks the
 * station state back down to IEEE80211_STA_NONE, releases the rate
 * control data if any, and frees the TXQ, rates and (with
 * CONFIG_MAC80211_MESH) mesh allocations, the deflink per-CPU
 * statistics and finally @sta itself.
 */
void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
{
        int i;

        /* drop all remaining links; 'false' skips the link hash removal */
        for (i = 0; i < ARRAY_SIZE(sta->link); i++) {
                struct link_sta_info *link_sta;

                link_sta = rcu_access_pointer(sta->link[i]);
                if (!link_sta)
                        continue;

                sta_remove_link(sta, i, false);
        }

        /*
         * If we had used sta_info_pre_move_state() then we might not
         * have gone through the state transitions down again, so do
         * it here now (and warn if it's inserted).
         *
         * This will clear state such as fast TX/RX that may have been
         * allocated during state transitions.
         */
        while (sta->sta_state > IEEE80211_STA_NONE) {
                int ret;

                WARN_ON_ONCE(test_sta_flag(sta, WLAN_STA_INSERTED));

                /* one step down at a time; give up if a step is refused */
                ret = sta_info_move_state(sta, sta->sta_state - 1);
                if (WARN_ONCE(ret, "sta_info_move_state() returned %d\n", ret))
                        break;
        }

        /* ->rate_ctrl is only set when mac80211 does rate control itself */
        if (sta->rate_ctrl)
                rate_control_free_sta(sta);

        sta_dbg(sta->sdata, "Destroyed STA %pM\n", sta->sta.addr);

        /*
         * __sta_info_alloc() obtains all TXQs from one kcalloc(); that
         * allocation is presumably reachable through txq[0] (set up by
         * ieee80211_txq_init()).
         */
        kfree(to_txq_info(sta->sta.txq[0]));
        kfree(rcu_dereference_raw(sta->sta.rates));
#ifdef CONFIG_MAC80211_MESH
        kfree(sta->mesh);
#endif

        sta_info_free_link(&sta->deflink);
        kfree(sta);
}

/*
 * Insert @sta into local->sta_hash through its ->hash_node, using
 * sta_rht_params. Returns the result of rhltable_insert().
 */
static int sta_info_hash_add(struct ieee80211_local *local,
                             struct sta_info *sta)
{
        return rhltable_insert(&local->sta_hash, &sta->hash_node,
                               sta_rht_params);
}

/*
 * Work handler for sta->drv_deliver_wk.
 *
 * Does nothing for a dead station. Otherwise, with bottom halves
 * disabled, picks exactly one delivery path from the station flags:
 * wake-up delivery when WLAN_STA_PS_STA is clear, else a PS-Poll
 * response if WLAN_STA_PSPOLL was set, else U-APSD delivery if
 * WLAN_STA_UAPSD was set. The latter two flags are cleared as they are
 * tested.
 */
static void sta_deliver_ps_frames(struct work_struct *wk)
{
        struct sta_info *sta = container_of(wk, struct sta_info,
                                            drv_deliver_wk);

        if (sta->dead)
                return;

        local_bh_disable();
        if (!test_sta_flag(sta, WLAN_STA_PS_STA)) {
                ieee80211_sta_ps_deliver_wakeup(sta);
        } else if (test_and_clear_sta_flag(sta, WLAN_STA_PSPOLL)) {
                ieee80211_sta_ps_deliver_poll_response(sta);
        } else if (test_and_clear_sta_flag(sta, WLAN_STA_UAPSD)) {
                ieee80211_sta_ps_deliver_uapsd(sta);
        }
        local_bh_enable();
}

/*
 * Attach rate control state to a new station.
 *
 * Nothing is done when the hardware advertises HAS_RATE_CONTROL.
 * Otherwise the station takes local->rate_ctrl and gets its private
 * rate control data allocated with @gfp.
 *
 * Returns 0 on success, -ENOMEM if the private data cannot be allocated.
 */
static int sta_prepare_rate_control(struct ieee80211_local *local,
                                    struct sta_info *sta, gfp_t gfp)
{
        if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) {
                sta->rate_ctrl = local->rate_ctrl;
                sta->rate_ctrl_priv =
                        rate_control_alloc_sta(sta->rate_ctrl, sta, gfp);
                if (!sta->rate_ctrl_priv)
                        return -ENOMEM;
        }

        return 0;
}

/*
 * Set up the statistics state of a link STA.
 *
 * Per-CPU RX statistics are allocated (with @gfp) only when the hardware
 * sets USES_RSS. The RX timestamp, the stats sync point and all signal
 * averages are initialised unconditionally.
 *
 * Returns 0 on success or -ENOMEM if the per-CPU allocation fails.
 */
static int sta_info_alloc_link(struct ieee80211_local *local,
                               struct link_sta_info *link_info,
                               gfp_t gfp)
{
        int chain;

        if (ieee80211_hw_check(&local->hw, USES_RSS)) {
                link_info->pcpu_rx_stats =
                        alloc_percpu_gfp(struct ieee80211_sta_rx_stats, gfp);
                if (!link_info->pcpu_rx_stats)
                        return -ENOMEM;
        }

        link_info->rx_stats.last_rx = jiffies;
        u64_stats_init(&link_info->rx_stats.syncp);

        ewma_signal_init(&link_info->rx_stats_avg.signal);
        ewma_avg_signal_init(&link_info->status_stats.avg_ack_signal);

        /* one moving average per entry of chain_signal[] */
        for (chain = 0;
             chain < ARRAY_SIZE(link_info->rx_stats_avg.chain_signal);
             chain++)
                ewma_signal_init(&link_info->rx_stats_avg.chain_signal[chain]);

        return 0;
}

/*
 * Register a link on @sta under @link_id.
 *
 * @link_info is mac80211's internal link STA and @link_sta its public
 * counterpart. Both are tied to @sta and to each other, then published
 * in sta->link[] and sta->sta.link[]. Afterwards the public part gets
 * its SMPS mode and maximum A-MSDU length defaults.
 */
static void sta_info_add_link(struct sta_info *sta,
                              unsigned int link_id,
                              struct link_sta_info *link_info,
                              struct ieee80211_link_sta *link_sta)
{
        /* internal half */
        link_info->sta = sta;
        link_info->link_id = link_id;
        link_info->pub = link_sta;

        /* public half */
        link_sta->sta = &sta->sta;
        link_sta->link_id = link_id;

        rcu_assign_pointer(sta->link[link_id], link_info);
        rcu_assign_pointer(sta->sta.link[link_id], link_sta);

        link_sta->smps_mode = IEEE80211_SMPS_OFF;
        link_sta->agg.max_rc_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_BA;
}

/*
 * Allocate and initialise a station entry for @sdata.
 *
 * @addr is copied into sta->addr and sta->sta.addr, @link_addr into the
 * two deflink address fields. A non-negative @link_id registers the
 * embedded deflink under that link ID and sets the matching bit in
 * valid_links; a negative @link_id registers it as link 0 and leaves
 * valid_links at its zero-initialised value. @gfp is used for every
 * allocation made here.
 *
 * Returns the new station, or NULL if an allocation fails, in which case
 * everything allocated up to that point is released again.
 */
static struct sta_info *
__sta_info_alloc(struct ieee80211_sub_if_data *sdata,
                 const u8 *addr, int link_id, const u8 *link_addr,
                 gfp_t gfp)
{
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_hw *hw = &local->hw;
        struct sta_info *sta;
        void *txq_data;
        int size;
        int i;

        /* the driver's private per-station area is allocated along with sta */
        sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp);
        if (!sta)
                return NULL;

        sta->local = local;
        sta->sdata = sdata;

        if (sta_info_alloc_link(local, &sta->deflink, gfp))
                goto free;

        if (link_id >= 0) {
                sta_info_add_link(sta, link_id, &sta->deflink,
                                  &sta->sta.deflink);
                sta->sta.valid_links = BIT(link_id);
        } else {
                sta_info_add_link(sta, 0, &sta->deflink, &sta->sta.deflink);
        }

        sta->sta.cur = &sta->sta.deflink.agg;

        spin_lock_init(&sta->lock);
        spin_lock_init(&sta->ps_lock);
        INIT_WORK(&sta->drv_deliver_wk, sta_deliver_ps_frames);
        wiphy_work_init(&sta->ampdu_mlme.work, ieee80211_ba_session_work);
#ifdef CONFIG_MAC80211_MESH
        if (ieee80211_vif_is_mesh(&sdata->vif)) {
                sta->mesh = kzalloc(sizeof(*sta->mesh), gfp);
                if (!sta->mesh)
                        goto free;
                sta->mesh->plink_sta = sta;
                spin_lock_init(&sta->mesh->plink_lock);
                /* the peer link timer is only set up without a user space MPM */
                if (!sdata->u.mesh.user_mpm)
                        timer_setup(&sta->mesh->plink_timer, mesh_plink_timer,
                                    0);
                sta->mesh->nonpeer_pm = NL80211_MESH_POWER_ACTIVE;
        }
#endif

        memcpy(sta->addr, addr, ETH_ALEN);
        memcpy(sta->sta.addr, addr, ETH_ALEN);
        memcpy(sta->deflink.addr, link_addr, ETH_ALEN);
        memcpy(sta->sta.deflink.addr, link_addr, ETH_ALEN);
        sta->sta.max_rx_aggregation_subframes =
                local->hw.max_rx_aggregation_subframes;

        /* TODO link specific alloc and assignments for MLO Link STA */

        /* Extended Key ID needs to install keys for keyid 0 and 1 Rx-only.
         * The Tx path starts to use a key as soon as the key slot ptk_idx
         * references to is not NULL. To not use the initial Rx-only key
         * prematurely for Tx initialize ptk_idx to an impossible PTK keyid
         * which always will refer to a NULL key.
         */
        BUILD_BUG_ON(ARRAY_SIZE(sta->ptk) <= INVALID_PTK_KEYIDX);
        sta->ptk_idx = INVALID_PTK_KEYIDX;


        ieee80211_init_frag_cache(&sta->frags);

        sta->sta_state = IEEE80211_STA_NONE;

        if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
                sta->amsdu_mesh_control = -1;

        /* Mark TID as unreserved */
        sta->reserved_tid = IEEE80211_TID_UNRESERVED;

        sta->last_connected = ktime_get_seconds();

        /*
         * All TXQs of the station come from one allocation: each slot is
         * a struct txq_info followed by the driver's per-TXQ data, padded
         * to pointer size.
         */
        size = sizeof(struct txq_info) +
               ALIGN(hw->txq_data_size, sizeof(void *));

        txq_data = kcalloc(ARRAY_SIZE(sta->sta.txq), size, gfp);
        if (!txq_data)
                goto free;

        for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
                struct txq_info *txq = txq_data + i * size;

                /* might not do anything for the (bufferable) MMPDU TXQ */
                ieee80211_txq_init(sdata, sta, txq, i);
        }

        if (sta_prepare_rate_control(local, sta, gfp))
                goto free_txq;

        sta->airtime_weight = IEEE80211_DEFAULT_AIRTIME_WEIGHT;

        /* per-AC state: PS/filtered queues and airtime accounting */
        for (i = 0; i < IEEE80211_NUM_ACS; i++) {
                skb_queue_head_init(&sta->ps_tx_buf[i]);
                skb_queue_head_init(&sta->tx_filtered[i]);
                sta->airtime[i].deficit = sta->airtime_weight;
                atomic_set(&sta->airtime[i].aql_tx_pending, 0);
                sta->airtime[i].aql_limit_low = local->aql_txq_limit_low[i];
                sta->airtime[i].aql_limit_high = local->aql_txq_limit_high[i];
        }

        for (i = 0; i < IEEE80211_NUM_TIDS; i++)
                sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX);

        /*
         * Seed the deflink's supported rates, per band the wiphy has,
         * with the rates flagged as mandatory for that band.
         */
        for (i = 0; i < NUM_NL80211_BANDS; i++) {
                u32 mandatory = 0;
                int r;

                if (!hw->wiphy->bands[i])
                        continue;

                /* bands without a case below keep mandatory == 0 */
                switch (i) {
                case NL80211_BAND_2GHZ:
                case NL80211_BAND_LC:
                        /*
                         * We use both here, even if we cannot really know for
                         * sure the station will support both, but the only use
                         * for this is when we don't know anything yet and send
                         * management frames, and then we'll pick the lowest
                         * possible rate anyway.
                         * If we don't include _G here, we cannot find a rate
                         * in P2P, and thus trigger the WARN_ONCE() in rate.c
                         */
                        mandatory = IEEE80211_RATE_MANDATORY_B |
                                    IEEE80211_RATE_MANDATORY_G;
                        break;
                case NL80211_BAND_5GHZ:
                        mandatory = IEEE80211_RATE_MANDATORY_A;
                        break;
                case NL80211_BAND_60GHZ:
                        WARN_ON(1);
                        mandatory = 0;
                        break;
                }

                for (r = 0; r < hw->wiphy->bands[i]->n_bitrates; r++) {
                        struct ieee80211_rate *rate;

                        rate = &hw->wiphy->bands[i]->bitrates[r];

                        if (!(rate->flags & mandatory))
                                continue;
                        sta->sta.deflink.supp_rates[i] |= BIT(r);
                }
        }

        /* CoDel parameters for this station */
        sta->cparams.ce_threshold = CODEL_DISABLED_THRESHOLD;
        sta->cparams.target = MS2TIME(20);
        sta->cparams.interval = MS2TIME(100);
        sta->cparams.ecn = true;
        sta->cparams.ce_threshold_selector = 0;
        sta->cparams.ce_threshold_mask = 0;

        sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr);

        return sta;

        /* error unwind; sta->mesh is NULL here if it was never allocated */
free_txq:
        kfree(to_txq_info(sta->sta.txq[0]));
free:
        sta_info_free_link(&sta->deflink);
#ifdef CONFIG_MAC80211_MESH
        kfree(sta->mesh);
#endif
        kfree(sta);
        return NULL;
}

/*
 * Allocate a station for @sdata without an explicit link ID.
 *
 * Calls __sta_info_alloc() with a link ID of -1 and with @addr serving
 * as both the station address and the deflink address. Returns the new
 * station or NULL on allocation failure.
 */
struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
                                const u8 *addr, gfp_t gfp)
{
        return __sta_info_alloc(sdata, addr, -1, addr, gfp);
}

/*
 * Allocate a station for @sdata whose first link is @link_id.
 *
 * @mld_addr becomes the station address and @link_addr the address of
 * the link registered under @link_id; see __sta_info_alloc(). Returns
 * the new station or NULL on allocation failure.
 */
struct sta_info *sta_info_alloc_with_link(struct ieee80211_sub_if_data *sdata,
                                          const u8 *mld_addr,
                                          unsigned int link_id,
                                          const u8 *link_addr,
                                          gfp_t gfp)
{
        return __sta_info_alloc(sdata, mld_addr, link_id, link_addr, gfp);
}

static int sta_info_insert_check(struct sta_info *sta)
{
        struct ieee80211_sub_if_data *sdata = sta->sdata;

        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        /*
         * Can't be a WARN_ON because it can be triggered through a race:
         * something inserts a STA (on one CPU) without holding the RTNL
         * and another CPU turns off the net device.
         */
        if (unlikely(!ieee80211_sdata_running(sdata)))
                return -ENETDOWN;

        if (WARN_ON(ether_addr_equal(sta->sta.addr, sdata->vif.addr) ||
                    !is_valid_ether_addr(sta->sta.addr)))
                return -EINVAL;

        /* The RCU read lock is required by rhashtable due to
         * asynchronous resize/rehash.  We also require the mutex
         * for correctness.
         */
        rcu_read_lock();
        if (ieee80211_hw_check(&sdata->local->hw, NEEDS_UNIQUE_STA_ADDR) &&
            ieee80211_find_sta_by_ifaddr(&sdata->local->hw, sta->addr, NULL)) {
                rcu_read_unlock();
                return -ENOTUNIQ;
        }
        rcu_read_unlock();

        return 0;
}

static int sta_info_insert_drv_state(struct ieee80211_local *local,
                                     struct ieee80211_sub_if_data *sdata,
                                     struct sta_info *sta)
{
        enum ieee80211_sta_state state;
        int err = 0;

        for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) {
                err = drv_sta_state(local, sdata, sta, state, state + 1);
                if (err)
                        break;
        }

        if (!err) {
                /*
                 * Drivers using legacy sta_add/sta_remove callbacks only
                 * get uploaded set to true after sta_add is called.
                 */
                if (!local->ops->sta_add)
                        sta->uploaded = true;
                return 0;
        }

        if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
                sdata_info(sdata,
                           "failed to move IBSS STA %pM to state %d (%d) - keeping it anyway\n",
                           sta->sta.addr, state + 1, err);
                err = 0;
        }

        /* unwind on error */
        for (; state > IEEE80211_STA_NOTEXIST; state--)
                WARN_ON(drv_sta_state(local, sdata, sta, state, state - 1));

        return err;
}

/*
 * Re-evaluate bss_conf.allow_p2p_go_ps for @sdata.
 *
 * The value starts out as vif.p2p and is forced to false as soon as one
 * associated station of @sdata lacks support_p2p_ps. A change of the
 * stored value is reported with BSS_CHANGED_P2P_PS on the deflink.
 */
static void
ieee80211_recalc_p2p_go_ps_allowed(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_local *local = sdata->local;
        struct sta_info *sta;
        bool allowed = sdata->vif.p2p;

        rcu_read_lock();
        list_for_each_entry_rcu(sta, &local->sta_list, list) {
                if (sta->sdata == sdata &&
                    test_sta_flag(sta, WLAN_STA_ASSOC) &&
                    !sta->sta.support_p2p_ps) {
                        /* one such station is enough to disallow it */
                        allowed = false;
                        break;
                }
        }
        rcu_read_unlock();

        if (allowed == sdata->vif.bss_conf.allow_p2p_go_ps)
                return;

        sdata->vif.bss_conf.allow_p2p_go_ps = allowed;
        ieee80211_link_info_change_notify(sdata, &sdata->deflink,
                                          BSS_CHANGED_P2P_PS);
}

/*
 * Second half of station insertion: make @sta visible and hand it to the
 * driver.
 *
 * Requires the wiphy lock (asserted). Returns 0 on success. On failure a
 * negative errno is returned after the steps taken so far have been
 * undone and @sta has been passed to cleanup_single_sta().
 *
 * On every path rcu_read_lock() is taken before returning, matching the
 * __acquires(RCU) annotation.
 */
static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
{
        struct ieee80211_local *local = sta->local;
        struct ieee80211_sub_if_data *sdata = sta->sdata;
        struct station_info *sinfo = NULL;
        int err = 0;

        lockdep_assert_wiphy(local->hw.wiphy);

        /* check if STA exists already */
        if (sta_info_get_bss(sdata, sta->sta.addr)) {
                err = -EEXIST;
                goto out_cleanup;
        }

        /* allocated up front so that reporting the station cannot fail later */
        sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL);
        if (!sinfo) {
                err = -ENOMEM;
                goto out_cleanup;
        }

        local->num_sta++;
        local->sta_generation++;
        smp_mb();

        /* simplify things and don't accept BA sessions yet */
        set_sta_flag(sta, WLAN_STA_BLOCK_BA);

        /* make the station visible */
        err = sta_info_hash_add(local, sta);
        if (err)
                goto out_drop_sta;

        /* only stations with valid_links set get a link hash entry */
        if (sta->sta.valid_links) {
                err = link_sta_info_hash_add(local, &sta->deflink);
                if (err) {
                        sta_info_hash_del(local, sta);
                        goto out_drop_sta;
                }
        }

        list_add_tail_rcu(&sta->list, &local->sta_list);

        /* update channel context before notifying the driver about state
         * change, this enables driver using the updated channel context right away.
         */
        if (sta->sta_state >= IEEE80211_STA_ASSOC) {
                ieee80211_recalc_min_chandef(sta->sdata, -1);
                if (!sta->sta.support_p2p_ps)
                        ieee80211_recalc_p2p_go_ps_allowed(sta->sdata);
        }

        /* notify driver */
        err = sta_info_insert_drv_state(local, sdata, sta);
        if (err)
                goto out_remove;

        set_sta_flag(sta, WLAN_STA_INSERTED);

        /* accept BA sessions now */
        clear_sta_flag(sta, WLAN_STA_BLOCK_BA);

        ieee80211_sta_debugfs_add(sta);
        rate_control_add_sta_debugfs(sta);
        if (sta->sta.valid_links) {
                int i;

                /* per-link debugfs; driver entries only for active links */
                for (i = 0; i < ARRAY_SIZE(sta->link); i++) {
                        struct link_sta_info *link_sta;

                        link_sta = rcu_dereference_protected(sta->link[i],
                                                             lockdep_is_held(&local->hw.wiphy->mtx));

                        if (!link_sta)
                                continue;

                        ieee80211_link_sta_debugfs_add(link_sta);
                        if (sdata->vif.active_links & BIT(i))
                                ieee80211_link_sta_debugfs_drv_add(link_sta);
                }
        } else {
                ieee80211_link_sta_debugfs_add(&sta->deflink);
                ieee80211_link_sta_debugfs_drv_add(&sta->deflink);
        }

        /* report the new station to cfg80211; sinfo is only needed for that */
        sinfo->generation = local->sta_generation;
        cfg80211_new_sta(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL);
        kfree(sinfo);

        sta_dbg(sdata, "Inserted STA %pM\n", sta->sta.addr);

        /* move reference to rcu-protected */
        rcu_read_lock();

        if (ieee80211_vif_is_mesh(&sdata->vif))
                mesh_accept_plinks_update(sdata);

        ieee80211_check_fast_xmit(sta);

        return 0;
        /* error unwind: each label undoes what was done before its goto */
 out_remove:
        if (sta->sta.valid_links)
                link_sta_info_hash_del(local, &sta->deflink);
        sta_info_hash_del(local, sta);
        list_del_rcu(&sta->list);
 out_drop_sta:
        local->num_sta--;
        synchronize_net();
 out_cleanup:
        cleanup_single_sta(sta);
        kfree(sinfo);
        rcu_read_lock();
        return err;
}

/*
 * Insert @sta and return with the RCU read lock held.
 *
 * May sleep and requires the wiphy lock (asserted). If the pre-insert
 * checks fail, @sta is freed and the error is returned; otherwise the
 * result of sta_info_insert_finish() is returned. On every path the RCU
 * read lock is held on return, as the __acquires(RCU) annotation says.
 */
int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU)
{
        struct ieee80211_local *local = sta->local;
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        err = sta_info_insert_check(sta);
        if (!err)
                return sta_info_insert_finish(sta);

        /* rejected before insertion was attempted: free it here */
        sta_info_free(local, sta);
        rcu_read_lock();

        return err;
}

/*
 * Insert @sta without leaving the caller inside an RCU read section.
 *
 * Calls sta_info_insert_rcu(), which returns with the RCU read lock
 * held, drops that lock again and passes the result on.
 */
int sta_info_insert(struct sta_info *sta)
{
        int err = sta_info_insert_rcu(sta);

        rcu_read_unlock();

        return err;
}

/* Set the bit for @id in the TIM bitmap @tim. */
static inline void __bss_tim_set(u8 *tim, u16 id)
{
        /*
         * Byte id / 8, bit id % 8: this layout has been mandated by the
         * IEEE specifications, so it may not be changed to use the
         * __set_bit() format.
         */
        u8 mask = 1 << (id % 8);

        tim[id / 8] |= mask;
}

/* Clear the bit for @id in the TIM bitmap @tim. */
static inline void __bss_tim_clear(u8 *tim, u16 id)
{
        /*
         * Byte id / 8, bit id % 8: this layout has been mandated by the
         * IEEE specifications, so it may not be changed to use the
         * __clear_bit() format.
         */
        u8 mask = 1 << (id % 8);

        tim[id / 8] &= ~mask;
}

/* Return whether the bit for @id is set in the TIM bitmap @tim. */
static inline bool __bss_tim_get(u8 *tim, u16 id)
{
        /*
         * Byte id / 8, bit id % 8: this layout has been mandated by the
         * IEEE specifications, so it may not be changed to use the
         * test_bit() format.
         */
        u8 mask = 1 << (id % 8);

        return tim[id / 8] & mask;
}

/*
 * Return the bitmap of TIDs that belong to access category @ac.
 * Warns and returns 0 for an unknown @ac.
 */
static unsigned long ieee80211_tids_for_ac(int ac)
{
        unsigned long tids = 0;

        /* If we ever support TIDs > 7, this obviously needs to be adjusted */
        switch (ac) {
        case IEEE80211_AC_VO:
                tids = BIT(6) | BIT(7);
                break;
        case IEEE80211_AC_VI:
                tids = BIT(4) | BIT(5);
                break;
        case IEEE80211_AC_BE:
                tids = BIT(0) | BIT(3);
                break;
        case IEEE80211_AC_BK:
                tids = BIT(1) | BIT(2);
                break;
        default:
                WARN_ON(1);
                break;
        }

        return tids;
}

/*
 * Recompute the TIM bit of @sta and push a change to the driver.
 *
 * Only AP/AP_VLAN interfaces and (with CONFIG_MAC80211_MESH) mesh
 * interfaces are handled; for anything else the function returns
 * without doing anything. The bit is indicated when a considered AC has
 * frames on its filtered or PS queue, or has TIDs buffered in the driver
 * or in the TXQs. For a dead station the bit is always cleared.
 *
 * @ignore_pending: treat every AC as ignored, which presumably makes the
 *      loop below skip all ACs so that nothing is indicated
 *      (depends on the contents of ieee80211_ac_to_qos_mask[]).
 */
static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending)
{
        struct ieee80211_local *local = sta->local;
        struct ps_data *ps;
        bool indicate_tim = false;
        u8 ignore_for_tim = sta->sta.uapsd_queues;
        int ac;
        u16 id = sta->sta.aid;

        /* pick the PS data that holds the TIM bitmap for this interface */
        if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
            sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
                if (WARN_ON_ONCE(!sta->sdata->bss))
                        return;

                ps = &sta->sdata->bss->ps;
#ifdef CONFIG_MAC80211_MESH
        } else if (ieee80211_vif_is_mesh(&sta->sdata->vif)) {
                ps = &sta->sdata->u.mesh.ps;
#endif
        } else {
                return;
        }

        /* No need to do anything if the driver does all */
        if (ieee80211_hw_check(&local->hw, AP_LINK_PS) && !local->ops->set_tim)
                return;

        /* dead station: skip the queue checks, indicate_tim stays false */
        if (sta->dead)
                goto done;

        /*
         * If all ACs are delivery-enabled then we should build
         * the TIM bit for all ACs anyway; if only some are then
         * we ignore those and build the TIM bit using only the
         * non-enabled ones.
         */
        if (ignore_for_tim == BIT(IEEE80211_NUM_ACS) - 1)
                ignore_for_tim = 0;

        if (ignore_pending)
                ignore_for_tim = BIT(IEEE80211_NUM_ACS) - 1;

        for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
                unsigned long tids;

                if (ignore_for_tim & ieee80211_ac_to_qos_mask[ac])
                        continue;

                /* frames queued in mac80211 for this AC? */
                indicate_tim |= !skb_queue_empty(&sta->tx_filtered[ac]) ||
                                !skb_queue_empty(&sta->ps_tx_buf[ac]);
                if (indicate_tim)
                        break;

                tids = ieee80211_tids_for_ac(ac);

                /* frames buffered in the driver or in the TXQs for this AC? */
                indicate_tim |=
                        sta->driver_buffered_tids & tids;
                indicate_tim |=
                        sta->txq_buffered_tids & tids;
        }

 done:
        spin_lock_bh(&local->tim_lock);

        /* nothing to do if the bitmap already has the wanted value */
        if (indicate_tim == __bss_tim_get(ps->tim, id))
                goto out_unlock;

        if (indicate_tim)
                __bss_tim_set(ps->tim, id);
        else
                __bss_tim_clear(ps->tim, id);

        /* tim_in_locked_section marks that set_tim runs under tim_lock */
        if (local->ops->set_tim && !WARN_ON(sta->dead)) {
                local->tim_in_locked_section = true;
                drv_set_tim(local, &sta->sta, indicate_tim);
                local->tim_in_locked_section = false;
        }

out_unlock:
        spin_unlock_bh(&local->tim_lock);
}

/*
 * Recompute the TIM bit of @sta; equivalent to __sta_info_recalc_tim()
 * with ignore_pending set to false.
 */
void sta_info_recalc_tim(struct sta_info *sta)
{
        __sta_info_recalc_tim(sta, false);
}

/*
 * Tell whether a buffered frame has been waiting for too long.
 *
 * The allowed time is derived from the station's listen interval and the
 * interface's beacon interval, but is never shorter than
 * STA_TX_BUFFER_EXPIRE. It is measured from the jiffies value stored in
 * the frame's TX info. A NULL @skb is never expired.
 */
static bool sta_info_buffer_expired(struct sta_info *sta, struct sk_buff *skb)
{
        struct ieee80211_tx_info *info;
        int timeout;

        if (!skb)
                return false;

        /* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec */
        timeout = (sta->listen_interval *
                   sta->sdata->vif.bss_conf.beacon_int *
                   32 / 15625) * HZ;

        /* apply the lower bound */
        if (timeout < STA_TX_BUFFER_EXPIRE)
                timeout = STA_TX_BUFFER_EXPIRE;

        info = IEEE80211_SKB_CB(skb);

        return time_after(jiffies, info->control.jiffies + timeout);
}


static bool sta_info_cleanup_expire_buffered_ac(struct ieee80211_local *local,
                                                struct sta_info *sta, int ac)
{
        unsigned long flags;
        struct sk_buff *skb;

        /*
         * First check for frames that should expire on the filtered
         * queue. Frames here were rejected by the driver and are on
         * a separate queue to avoid reordering with normal PS-buffered
         * frames. They also aren't accounted for right now in the
         * total_ps_buffered counter.
         */
        for (;;) {
                spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags);
                skb = skb_peek(&sta->tx_filtered[ac]);
                if (sta_info_buffer_expired(sta, skb))
                        skb = __skb_dequeue(&sta->tx_filtered[ac]);
                else
                        skb = NULL;
                spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags);

                /*
                 * Frames are queued in order, so if this one
                 * hasn't expired yet we can stop testing. If
                 * we actually reached the end of the queue we
                 * also need to stop, of course.
                 */
                if (!skb)
                        break;
                ieee80211_free_txskb(&local->hw, skb);
        }

        /*
         * Now also check the normal PS-buffered queue, this will
         * only find something if the filtered queue was emptied
         * since the filtered frames are all before the normal PS
         * buffered frames.
         */
        for (;;) {
                spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags);
                skb = skb_peek(&sta->ps_tx_buf[ac]);
                if (sta_info_buffer_expired(sta, skb))
                        skb = __skb_dequeue(&sta->ps_tx_buf[ac]);
                else
                        skb = NULL;
                spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags);

                /*
                 * frames are queued in order, so if this one
                 * hasn't expired yet (or we reached the end of
                 * the queue) we can stop testing
                 */
                if (!skb)
                        break;

                local->total_ps_buffered--;
                ps_dbg(sta->sdata, "Buffered frame expired (STA %pM)\n",
                       sta->sta.addr);
                ieee80211_free_txskb(&local->hw, skb);
        }

        /*
         * Finally, recalculate the TIM bit for this station -- it might
         * now be clear because the station was too slow to retrieve its
         * frames.
         */
        sta_info_recalc_tim(sta);

        /*
         * Return whether there are any frames still buffered, this is
         * used to check whether the cleanup timer still needs to run,
         * if there are no frames we don't need to rearm the timer.
         */
        return !(skb_queue_empty(&sta->ps_tx_buf[ac]) &&
                 skb_queue_empty(&sta->tx_filtered[ac]));
}

static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local,
                                             struct sta_info *sta)
{
        bool have_buffered = false;
        int ac;

        /* This is only necessary for stations on BSS/MBSS interfaces */
        if (!sta->sdata->bss &&
            !ieee80211_vif_is_mesh(&sta->sdata->vif))
                return false;

        for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
                have_buffered |=
                        sta_info_cleanup_expire_buffered_ac(local, sta, ac);

        return have_buffered;
}

/*
 * First stage of destroying a station: stop aggregation and take @sta
 * out of the lookup structures.
 *
 * May sleep and requires the wiphy lock (asserted).
 *
 * Returns -ENOENT if @sta is NULL, the error from sta_info_hash_del() if
 * the station cannot be removed from the hash (with a warning), and 0
 * otherwise.
 */
static int __must_check __sta_info_destroy_part1(struct sta_info *sta)
{
        struct ieee80211_local *local;
        struct ieee80211_sub_if_data *sdata;
        int ret, i;

        might_sleep();

        if (!sta)
                return -ENOENT;

        local = sta->local;
        sdata = sta->sdata;

        lockdep_assert_wiphy(local->hw.wiphy);

        /*
         * Before removing the station from the driver and
         * rate control, it might still start new aggregation
         * sessions -- block that to make sure the tear-down
         * will be sufficient.
         */
        set_sta_flag(sta, WLAN_STA_BLOCK_BA);
        ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA);

        /*
         * Before removing the station from the driver there might be pending
         * rx frames on RSS queues sent prior to the disassociation - wait for
         * all such frames to be processed.
         */
        drv_sync_rx_queues(local, sta);

        /* unhash every link STA whose bit is set in valid_links */
        for (i = 0; i < ARRAY_SIZE(sta->link); i++) {
                struct link_sta_info *link_sta;

                if (!(sta->sta.valid_links & BIT(i)))
                        continue;

                link_sta = rcu_dereference_protected(sta->link[i],
                                                     lockdep_is_held(&local->hw.wiphy->mtx));

                link_sta_info_hash_del(local, link_sta);
        }

        /* then the station itself; bail out if it was not hashed */
        ret = sta_info_hash_del(local, sta);
        if (WARN_ON(ret))
                return ret;

        /*
         * for TDLS peers, make sure to return to the base channel before
         * removal.
         */
        if (test_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL)) {
                drv_tdls_cancel_channel_switch(local, sdata, &sta->sta);
                clear_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL);
        }

        list_del_rcu(&sta->list);
        sta->removed = true;

        /* only a station the driver knows about gets the pre-RCU callback */
        if (sta->uploaded)
                drv_sta_pre_rcu_remove(local, sta->sdata, sta);

        /* an AP_VLAN interface must not keep pointing at this station */
        if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
            rcu_access_pointer(sdata->u.vlan.sta) == sta)
                RCU_INIT_POINTER(sdata->u.vlan.sta, NULL);

        return 0;
}

/*
 * Move a station one step along the
 * NONE <-> AUTH <-> ASSOC <-> AUTHORIZED state ladder.
 *
 * @sta: station to transition
 * @new_state: requested state
 * @recalc: when true, the AUTH <-> ASSOC transitions also call
 *          ieee80211_recalc_min_chandef() and, for stations without
 *          support_p2p_ps, ieee80211_recalc_p2p_go_ps_allowed()
 *
 * Returns 0 on success or when the station already is in @new_state,
 * -EINVAL for a transition that is not a single step (or for an unknown
 * state), or the error returned by drv_sta_state().  Every error return
 * happens before sta->sta_state or the station flags are modified.
 */
static int _sta_info_move_state(struct sta_info *sta,
                                enum ieee80211_sta_state new_state,
                                bool recalc)
{
        struct ieee80211_local *local = sta->local;

        might_sleep();

        /* no-op: in this case the driver is not notified either */
        if (sta->sta_state == new_state)
                return 0;

        /* check allowed transitions first */

        switch (new_state) {
        case IEEE80211_STA_NONE:
                if (sta->sta_state != IEEE80211_STA_AUTH)
                        return -EINVAL;
                break;
        case IEEE80211_STA_AUTH:
                if (sta->sta_state != IEEE80211_STA_NONE &&
                    sta->sta_state != IEEE80211_STA_ASSOC)
                        return -EINVAL;
                break;
        case IEEE80211_STA_ASSOC:
                if (sta->sta_state != IEEE80211_STA_AUTH &&
                    sta->sta_state != IEEE80211_STA_AUTHORIZED)
                        return -EINVAL;
                break;
        case IEEE80211_STA_AUTHORIZED:
                if (sta->sta_state != IEEE80211_STA_ASSOC)
                        return -EINVAL;
                break;
        default:
                WARN(1, "invalid state %d", new_state);
                return -EINVAL;
        }

        sta_dbg(sta->sdata, "moving STA %pM to state %d\n",
                sta->sta.addr, new_state);

        /* notify the driver before the actual changes so it can
         * fail the transition; stations without WLAN_STA_INSERTED
         * skip the driver call entirely
         */
        if (test_sta_flag(sta, WLAN_STA_INSERTED)) {
                int err = drv_sta_state(sta->local, sta->sdata, sta,
                                        sta->sta_state, new_state);
                if (err)
                        return err;
        }

        /* reflect the change in all state variables */

        switch (new_state) {
        case IEEE80211_STA_NONE:
                /* AUTH -> NONE */
                if (sta->sta_state == IEEE80211_STA_AUTH)
                        clear_bit(WLAN_STA_AUTH, &sta->_flags);
                break;
        case IEEE80211_STA_AUTH:
                if (sta->sta_state == IEEE80211_STA_NONE) {
                        /* NONE -> AUTH */
                        set_bit(WLAN_STA_AUTH, &sta->_flags);
                } else if (sta->sta_state == IEEE80211_STA_ASSOC) {
                        /* ASSOC -> AUTH */
                        clear_bit(WLAN_STA_ASSOC, &sta->_flags);
                        if (recalc) {
                                ieee80211_recalc_min_chandef(sta->sdata, -1);
                                if (!sta->sta.support_p2p_ps)
                                        ieee80211_recalc_p2p_go_ps_allowed(sta->sdata);
                        }
                }
                break;
        case IEEE80211_STA_ASSOC:
                if (sta->sta_state == IEEE80211_STA_AUTH) {
                        /* AUTH -> ASSOC: record the association time */
                        set_bit(WLAN_STA_ASSOC, &sta->_flags);
                        sta->assoc_at = ktime_get_boottime_ns();
                        if (recalc) {
                                ieee80211_recalc_min_chandef(sta->sdata, -1);
                                if (!sta->sta.support_p2p_ps)
                                        ieee80211_recalc_p2p_go_ps_allowed(sta->sdata);
                        }
                } else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) {
                        /* AUTHORIZED -> ASSOC */
                        ieee80211_vif_dec_num_mcast(sta->sdata);
                        clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags);

                        /*
                         * If we have encryption offload, flush (station) queues
                         * (after ensuring concurrent TX completed) so we won't
                         * transmit anything later unencrypted if/when keys are
                         * also removed, which might otherwise happen depending
                         * on how the hardware offload works.
                         */
                        if (local->ops->set_key) {
                                synchronize_net();
                                /* prefer the per-station flush if the driver has one */
                                if (local->ops->flush_sta)
                                        drv_flush_sta(local, sta->sdata, sta);
                                else
                                        ieee80211_flush_queues(local,
                                                               sta->sdata,
                                                               false);
                        }

                        ieee80211_clear_fast_xmit(sta);
                        ieee80211_clear_fast_rx(sta);
                }
                break;
        case IEEE80211_STA_AUTHORIZED:
                /*
                 * ASSOC -> AUTHORIZED; the transition check above only
                 * lets this case through when the old state is ASSOC.
                 */
                if (sta->sta_state == IEEE80211_STA_ASSOC) {
                        ieee80211_vif_inc_num_mcast(sta->sdata);
                        set_bit(WLAN_STA_AUTHORIZED, &sta->_flags);
                        ieee80211_check_fast_xmit(sta);
                        ieee80211_check_fast_rx(sta);
                }
                /* AP and AP_VLAN interfaces additionally send a layer-2 update */
                if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
                    sta->sdata->vif.type == NL80211_IFTYPE_AP)
                        cfg80211_send_layer2_update(sta->sdata->dev,
                                                    sta->sta.addr);
                break;
        default:
                break;
        }

        /* commit the new state only after all side effects are done */
        sta->sta_state = new_state;

        return 0;
}

int sta_info_move_state(struct sta_info *sta,
                        enum ieee80211_sta_state new_state)
{
        return _sta_info_move_state(sta, new_state, true);
}

/*
 * Second half of station destruction.
 *
 * @sta: station being destroyed
 * @recalc: forwarded to _sta_info_move_state() for every state step
 *          performed here
 *
 * Walks the station down to IEEE80211_STA_NONE, frees its keys, tells
 * the driver the station no longer exists (if it was uploaded), reports
 * the removal through cfg80211_del_sta_sinfo() and finally hands the
 * station to cleanup_single_sta().  Failures along the way only
 * trigger warnings; the function itself cannot fail.
 */
static void __sta_info_destroy_part2(struct sta_info *sta, bool recalc)
{
        struct ieee80211_local *local = sta->local;
        struct ieee80211_sub_if_data *sdata = sta->sdata;
        struct station_info *sinfo;
        int ret;

        /*
         * NOTE: This assumes at least synchronize_net() was done
         *         after _part1 and before _part2!
         */

        /*
         * There's a potential race in _part1 where we set WLAN_STA_BLOCK_BA
         * but someone might have just gotten past a check, and not yet into
         * queuing the work/creating the data/etc.
         *
         * Do another round of destruction so that the worker is certainly
         * canceled before we later free the station.
         *
         * Since this is after synchronize_rcu()/synchronize_net() we're now
         * certain that nobody can actually hold a reference to the STA and
         * be calling e.g. ieee80211_start_tx_ba_session().
         */
        ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA);

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        /* leave AUTHORIZED first, before the keys are freed below */
        if (sta->sta_state == IEEE80211_STA_AUTHORIZED) {
                ret = _sta_info_move_state(sta, IEEE80211_STA_ASSOC, recalc);
                WARN_ON_ONCE(ret);
        }

        /* now keys can no longer be reached */
        ieee80211_free_sta_keys(local, sta);

        /* disable TIM bit - last chance to tell driver */
        __sta_info_recalc_tim(sta, true);

        sta->dead = true;

        local->num_sta--;
        local->sta_generation++;

        /* step down one state at a time; give up on the first failure */
        while (sta->sta_state > IEEE80211_STA_NONE) {
                ret = _sta_info_move_state(sta, sta->sta_state - 1, recalc);
                if (ret) {
                        WARN_ON_ONCE(1);
                        break;
                }
        }

        /* final NONE -> NOTEXIST step is only signalled for uploaded stations */
        if (sta->uploaded) {
                ret = drv_sta_state(local, sdata, sta, IEEE80211_STA_NONE,
                                    IEEE80211_STA_NOTEXIST);
                WARN_ON_ONCE(ret != 0);
        }

        sta_dbg(sdata, "Removed STA %pM\n", sta->sta.addr);

        /*
         * Best effort: if the allocation fails the removal is still
         * reported, just with a NULL sinfo.
         */
        sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
        if (sinfo)
                sta_set_sinfo(sta, sinfo, true);
        cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL);
        kfree(sinfo);

        ieee80211_sta_debugfs_remove(sta);

        ieee80211_destroy_frag_cache(&sta->frags);

        /* NOTE(review): presumably releases the station itself - confirm */
        cleanup_single_sta(sta);
}

/*
 * Destroy a single station: run part 1, call synchronize_net(), then
 * run part 2 with recalculation enabled.
 *
 * Returns 0 on success, or the error from __sta_info_destroy_part1(),
 * in which case neither synchronize_net() nor part 2 is executed.
 */
int __must_check __sta_info_destroy(struct sta_info *sta)
{
        int ret;

        ret = __sta_info_destroy_part1(sta);
        if (ret != 0)
                return ret;

        /* part 2 must not start before this has completed */
        synchronize_net();
        __sta_info_destroy_part2(sta, true);

        return 0;
}

/*
 * Look up the station with address @addr on @sdata using sta_info_get()
 * and destroy it.  The lookup result is passed to __sta_info_destroy()
 * without a check here, so the return value is whatever that function
 * reports.  The wiphy lock must be held.
 */
int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr)
{
        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        return __sta_info_destroy(sta_info_get(sdata, addr));
}

/*
 * Same as sta_info_destroy_addr(), except that the station is looked up
 * with sta_info_get_bss().  The lookup result is passed to
 * __sta_info_destroy() without a check here.  The wiphy lock must be
 * held.
 */
int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata,
                              const u8 *addr)
{
        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        return __sta_info_destroy(sta_info_get_bss(sdata, addr));
}

static void sta_info_cleanup(struct timer_list *t)
{
        struct ieee80211_local *local = from_timer(local, t, sta_cleanup);
        struct sta_info *sta;
        bool timer_needed = false;

        rcu_read_lock();
        list_for_each_entry_rcu(sta, &local->sta_list, list)
                if (sta_info_cleanup_expire_buffered(local, sta))
                        timer_needed = true;
        rcu_read_unlock();

        if (local->quiescing)
                return;

        if (!timer_needed)
                return;

        mod_timer(&local->sta_cleanup,
                  round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL));
}

/*
 * Set up the per-device station bookkeeping: the station and link
 * station hash tables, the TIM lock, the station list and the cleanup
 * timer (set up here, not started).
 *
 * Returns 0 on success or the rhltable_init() error; on failure no hash
 * table is left initialised.
 */
int sta_info_init(struct ieee80211_local *local)
{
        int ret;

        ret = rhltable_init(&local->sta_hash, &sta_rht_params);
        if (ret)
                return ret;

        ret = rhltable_init(&local->link_sta_hash, &link_sta_rht_params);
        if (ret)
                goto err_destroy_sta_hash;

        spin_lock_init(&local->tim_lock);
        INIT_LIST_HEAD(&local->sta_list);
        timer_setup(&local->sta_cleanup, sta_info_cleanup, 0);

        return 0;

err_destroy_sta_hash:
        rhltable_destroy(&local->sta_hash);
        return ret;
}

/*
 * Undo sta_info_init(): stop the cleanup timer (waiting for a running
 * callback to finish) and destroy both station hash tables.
 */
void sta_info_stop(struct ieee80211_local *local)
{
        /* the timer callback walks the station list, so stop it first */
        del_timer_sync(&local->sta_cleanup);
        rhltable_destroy(&local->sta_hash);
        rhltable_destroy(&local->link_sta_hash);
}


/*
 * Destroy all stations that match the given interface / link.
 *
 * @sdata: interface whose stations are flushed
 * @vlans: when true, stations on other interfaces sharing @sdata's bss
 *         are flushed as well (a warning is raised unless @sdata is an
 *         AP interface with a bss)
 * @link_id: when >= 0, stations that have valid_links set but do not
 *           include this link are skipped
 *
 * Part 1 of the destruction is run for every match, followed by a single
 * synchronize_net() and part 2 for all stations whose part 1 succeeded.
 * The channel definition / P2P GO PS recalculation is done once at the
 * end instead of per station.
 *
 * Returns the number of stations that matched, including any whose
 * part 1 failed (those only trigger a warning).
 */
int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans,
                     int link_id)
{
        struct ieee80211_local *local = sdata->local;
        struct sta_info *sta, *next;
        bool all_support_p2p_ps = true;
        LIST_HEAD(doomed);
        int n_matched = 0;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        WARN_ON(vlans && sdata->vif.type != NL80211_IFTYPE_AP);
        WARN_ON(vlans && !sdata->bss);

        list_for_each_entry_safe(sta, next, &local->sta_list, list) {
                /* foreign interface: only eligible via the shared bss */
                if (sta->sdata != sdata) {
                        if (!vlans)
                                continue;
                        if (sta->sdata->bss != sdata->bss)
                                continue;
                }

                if (link_id >= 0 && sta->sta.valid_links) {
                        if (!(sta->sta.valid_links & BIT(link_id)))
                                continue;
                }

                n_matched++;

                if (WARN_ON(__sta_info_destroy_part1(sta)))
                        continue;

                list_add(&sta->free_list, &doomed);
        }

        if (list_empty(&doomed))
                return n_matched;

        /* one grace period for the whole batch */
        synchronize_net();

        list_for_each_entry_safe(sta, next, &doomed, free_list) {
                if (!sta->sta.support_p2p_ps)
                        all_support_p2p_ps = false;
                __sta_info_destroy_part2(sta, false);
        }

        ieee80211_recalc_min_chandef(sdata, -1);
        if (!all_support_p2p_ps)
                ieee80211_recalc_p2p_go_ps_allowed(sdata);

        return n_matched;
}

/*
 * Destroy every station on @sdata whose last activity lies more than
 * @exp_time jiffies in the past.
 *
 * On mesh interfaces the powersave station counter is decremented for
 * an expiring station that has WLAN_STA_PS_STA set.  A failing
 * __sta_info_destroy() only triggers a warning.  The wiphy lock must be
 * held.
 */
void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
                          unsigned long exp_time)
{
        struct ieee80211_local *local = sdata->local;
        struct sta_info *sta, *next;

        lockdep_assert_wiphy(local->hw.wiphy);

        list_for_each_entry_safe(sta, next, &local->sta_list, list) {
                unsigned long deadline =
                        ieee80211_sta_last_active(sta) + exp_time;

                if (sta->sdata != sdata)
                        continue;

                /* still within its inactivity allowance */
                if (!time_is_before_jiffies(deadline))
                        continue;

                sta_dbg(sta->sdata, "expiring inactive STA %pM\n",
                        sta->sta.addr);

                if (ieee80211_vif_is_mesh(&sdata->vif) &&
                    test_sta_flag(sta, WLAN_STA_PS_STA))
                        atomic_dec(&sdata->u.mesh.ps.num_sta_ps);

                WARN_ON(__sta_info_destroy(sta));
        }
}

/*
 * Find a station by its address and, optionally, the address of the
 * interface it belongs to.
 *
 * With a NULL @localaddr the first station found for @addr is used.
 * Only the first matching station is considered: if it has not been
 * uploaded to the driver, NULL is returned without looking further.
 *
 * Returns the driver-visible station, or NULL if there is no usable
 * match.
 */
struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw,
                                                   const u8 *addr,
                                                   const u8 *localaddr)
{
        struct ieee80211_local *local = hw_to_local(hw);
        struct rhlist_head *pos;
        struct sta_info *sta;

        for_each_sta_info(local, addr, sta, pos) {
                if (!localaddr ||
                    ether_addr_equal(sta->sdata->vif.addr, localaddr))
                        return sta->uploaded ? &sta->sta : NULL;
        }

        return NULL;
}
EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr);

/*
 * Find the station with address @addr on @vif, looked up with
 * sta_info_get_bss().
 *
 * Returns the driver-visible station, or NULL if @vif is NULL, no such
 * station exists, or the station has not been uploaded to the driver.
 */
struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif,
                                         const u8 *addr)
{
        struct sta_info *found;

        if (!vif)
                return NULL;

        found = sta_info_get_bss(vif_to_sdata(vif), addr);
        if (!found || !found->uploaded)
                return NULL;

        return &found->sta;
}
EXPORT_SYMBOL(ieee80211_find_sta);

/* powersave support code */
/*
 * Release everything that was buffered for a station while it was in
 * powersave, now that it has woken up.
 *
 * Only acts for stations on AP (including AP_VLAN, which is mapped to
 * its AP interface) and mesh interfaces; returns immediately for any
 * other interface type.  All filtered and PS-buffered frames from every
 * AC are moved to the pending queue, the powersave related station
 * flags are cleared and the powersave accounting and TIM state are
 * updated.
 */
void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
{
        struct ieee80211_sub_if_data *sdata = sta->sdata;
        struct ieee80211_local *local = sdata->local;
        struct sk_buff_head pending;
        int filtered = 0, buffered = 0, ac, i;
        unsigned long flags;
        struct ps_data *ps;

        /* AP_VLAN stations are accounted on the AP interface owning the bss */
        if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
                sdata = container_of(sdata->bss, struct ieee80211_sub_if_data,
                                     u.ap);

        if (sdata->vif.type == NL80211_IFTYPE_AP)
                ps = &sdata->bss->ps;
        else if (ieee80211_vif_is_mesh(&sdata->vif))
                ps = &sdata->u.mesh.ps;
        else
                return;

        clear_sta_flag(sta, WLAN_STA_SP);

        /* the TID bitmaps below must fit into a single unsigned long */
        BUILD_BUG_ON(BITS_TO_LONGS(IEEE80211_NUM_TIDS) > 1);
        sta->driver_buffered_tids = 0;
        sta->txq_buffered_tids = 0;

        /* hardware with AP_LINK_PS is not sent the AWAKE notification */
        if (!ieee80211_hw_check(&local->hw, AP_LINK_PS))
                drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta);

        /* kick every txq of this station that still has frames queued */
        for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
                if (!sta->sta.txq[i] || !txq_has_queue(sta->sta.txq[i]))
                        continue;

                schedule_and_wake_txq(local, to_txq_info(sta->sta.txq[i]));
        }

        skb_queue_head_init(&pending);

        /* sync with ieee80211_tx_h_unicast_ps_buf */
        spin_lock_bh(&sta->ps_lock);
        /* Send all buffered frames to the station */
        for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
                /*
                 * The number of frames moved is derived from the growth
                 * of the pending queue after each splice.
                 */
                int count = skb_queue_len(&pending), tmp;

                spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags);
                skb_queue_splice_tail_init(&sta->tx_filtered[ac], &pending);
                spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags);
                tmp = skb_queue_len(&pending);
                filtered += tmp - count;
                count = tmp;

                spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags);
                skb_queue_splice_tail_init(&sta->ps_tx_buf[ac], &pending);
                spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags);
                tmp = skb_queue_len(&pending);
                buffered += tmp - count;
        }

        ieee80211_add_pending_skbs(local, &pending);

        /* now we're no longer in the deliver code */
        clear_sta_flag(sta, WLAN_STA_PS_DELIVER);

        /* The station might have polled and then woken up before we responded,
         * so clear these flags now to avoid them sticking around.
         */
        clear_sta_flag(sta, WLAN_STA_PSPOLL);
        clear_sta_flag(sta, WLAN_STA_UAPSD);
        spin_unlock_bh(&sta->ps_lock);

        atomic_dec(&ps->num_sta_ps);

        /* only frames taken from ps_tx_buf are part of total_ps_buffered */
        local->total_ps_buffered -= buffered;

        sta_info_recalc_tim(sta);

        ps_dbg(sdata,
               "STA %pM aid %d sending %d filtered/%d PS frames since STA woke up\n",
               sta->sta.addr, sta->sta.aid, filtered, buffered);

        ieee80211_check_fast_xmit(sta);
}

/*
 * Build and transmit a (QoS-)nullfunc frame to a powersaving station as
 * the response to a PS-Poll or U-APSD trigger.
 *
 * @sta: destination station
 * @tid: TID used for the skb priority, the queue mapping and, for QoS
 *       stations, the QoS control field
 * @reason: release type; the EOSP and more-data bits are only set in
 *          the frame for IEEE80211_FRAME_RELEASE_UAPSD on QoS stations
 * @call_driver: when true, drv_allow_buffered_frames() is called for
 *               this single frame before it is transmitted
 * @more_data: request the more-data bit (honoured for the QoS/U-APSD
 *             case only)
 *
 * The frame is silently not sent if the skb allocation fails; a missing
 * channel context triggers a warning and drops the frame.
 */
static void ieee80211_send_null_response(struct sta_info *sta, int tid,
                                         enum ieee80211_frame_release_type reason,
                                         bool call_driver, bool more_data)
{
        struct ieee80211_sub_if_data *sdata = sta->sdata;
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_qos_hdr *nullfunc;
        struct sk_buff *skb;
        int size = sizeof(*nullfunc);
        __le16 fc;
        bool qos = sta->sta.wme;
        struct ieee80211_tx_info *info;
        struct ieee80211_chanctx_conf *chanctx_conf;

        if (qos) {
                fc = cpu_to_le16(IEEE80211_FTYPE_DATA |
                                 IEEE80211_STYPE_QOS_NULLFUNC |
                                 IEEE80211_FCTL_FROMDS);
        } else {
                /*
                 * Non-QoS frame is 2 bytes shorter
                 * NOTE(review): presumably the qos_ctrl field - confirm
                 */
                size -= 2;
                fc = cpu_to_le16(IEEE80211_FTYPE_DATA |
                                 IEEE80211_STYPE_NULLFUNC |
                                 IEEE80211_FCTL_FROMDS);
        }

        skb = dev_alloc_skb(local->hw.extra_tx_headroom + size);
        if (!skb)
                return;

        skb_reserve(skb, local->hw.extra_tx_headroom);

        nullfunc = skb_put(skb, size);
        nullfunc->frame_control = fc;
        nullfunc->duration_id = 0;
        memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN);
        memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN);
        memcpy(nullfunc->addr3, sdata->vif.addr, ETH_ALEN);
        nullfunc->seq_ctrl = 0;

        skb->priority = tid;
        skb_set_queue_mapping(skb, ieee802_1d_to_ac[tid]);
        /* qos_ctrl is only touched when the frame was sized to contain it */
        if (qos) {
                nullfunc->qos_ctrl = cpu_to_le16(tid);

                if (reason == IEEE80211_FRAME_RELEASE_UAPSD) {
                        nullfunc->qos_ctrl |=
                                cpu_to_le16(IEEE80211_QOS_CTL_EOSP);
                        if (more_data)
                                nullfunc->frame_control |=
                                        cpu_to_le16(IEEE80211_FCTL_MOREDATA);
                }
        }

        info = IEEE80211_SKB_CB(skb);

        /*
         * Tell TX path to send this frame even though the
         * STA may still remain in PS mode after this frame
         * exchange. Also set EOSP to indicate this packet
         * ends the poll/service period.
         */
        info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER |
                       IEEE80211_TX_STATUS_EOSP |
                       IEEE80211_TX_CTL_REQ_TX_STATUS;

        info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE;

        if (call_driver)
                drv_allow_buffered_frames(local, sta, BIT(tid), 1,
                                          reason, false);

        skb->dev = sdata->dev;

        /* the channel context is RCU protected; hold the lock across xmit */
        rcu_read_lock();
        chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
        if (WARN_ON(!chanctx_conf)) {
                rcu_read_unlock();
                kfree_skb(skb);
                return;
        }

        info->band = chanctx_conf->def.chan->band;
        ieee80211_xmit(sdata, sta, skb);
        rcu_read_unlock();
}

/*
 * Pick the highest-priority TID out of the bitmap @tids.
 *
 * For TIDs 3-7 the numerically highest set bit wins.  Among the lower
 * three TIDs the numeric order does not match the priority order:
 * TID 0 is BE just like TID 3, so it is preferred over TIDs 1 and 2.
 *
 * Returns the TID number, i.e. fls(tids) - 1 in all cases except the
 * TID 0 special case.
 */
static int find_highest_prio_tid(unsigned long tids)
{
        /* TID 0 only wins when none of TIDs 3-7 is set */
        if (!(tids & 0xF8) && (tids & BIT(0)))
                return 0;

        return fls(tids) - 1;
}

/* Tells whether the MORE_DATA bit should be set in the last frame
 * obtained by ieee80211_sta_ps_get_frames().
 *
 * @ignored_acs: ACs (as a QoS info mask) that are not looked at
 * @reason: release type
 * @driver_release_tids: TIDs the driver has data on; relevant only if
 *                       reason = IEEE80211_FRAME_RELEASE_PSPOLL
 *
 * Returns true if more data remains for the station after this release.
 */
static bool
ieee80211_sta_ps_more_data(struct sta_info *sta, u8 ignored_acs,
                           enum ieee80211_frame_release_type reason,
                           unsigned long driver_release_tids)
{
        bool more = false;
        int ac;

        /* A PS-Poll releases a single frame from a single TID, so data
         * on more than one driver TID always leaves something behind.
         */
        if (reason == IEEE80211_FRAME_RELEASE_PSPOLL &&
            hweight16(driver_release_tids) > 1)
                return true;

        /* otherwise any frame left on a non-ignored AC counts */
        for (ac = 0; !more && ac < IEEE80211_NUM_ACS; ac++) {
                if (ignored_acs & ieee80211_ac_to_qos_mask[ac])
                        continue;

                more = !skb_queue_empty(&sta->tx_filtered[ac]) ||
                       !skb_queue_empty(&sta->ps_tx_buf[ac]);
        }

        return more;
}

/*
 * Collect the frames to release to a powersaving station.
 *
 * @sta: station to release frames for
 * @n_frames: maximum number of frames to dequeue
 * @ignored_acs: ACs (as a QoS info mask) to skip
 * @reason: release type (not used by this function's body)
 * @frames: queue the dequeued frames are appended to
 * @driver_release_tids: in/out bitmap of TIDs that have to be released
 *                       by the driver instead
 *
 * ACs are visited in index order.  As long as no software frame has
 * been collected, TIDs buffered in the driver or in txqs are added to
 * @driver_release_tids; once that bitmap is non-zero no software frames
 * are dequeued.  Filtered frames are taken before PS-buffered ones.
 */
static void
ieee80211_sta_ps_get_frames(struct sta_info *sta, int n_frames, u8 ignored_acs,
                            enum ieee80211_frame_release_type reason,
                            struct sk_buff_head *frames,
                            unsigned long *driver_release_tids)
{
        struct ieee80211_sub_if_data *sdata = sta->sdata;
        struct ieee80211_local *local = sdata->local;
        int ac;

        /* Get response frame(s) and more data bit for the last one. */
        for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
                unsigned long tids;

                if (ignored_acs & ieee80211_ac_to_qos_mask[ac])
                        continue;

                tids = ieee80211_tids_for_ac(ac);

                /* if we already have frames from software, then we can't also
                 * release from hardware queues
                 */
                if (skb_queue_empty(frames)) {
                        *driver_release_tids |=
                                sta->driver_buffered_tids & tids;
                        *driver_release_tids |= sta->txq_buffered_tids & tids;
                }

                /* and vice versa: driver TIDs pending means no software frames */
                if (!*driver_release_tids) {
                        struct sk_buff *skb;

                        while (n_frames > 0) {
                                /* filtered frames first, then PS-buffered ones */
                                skb = skb_dequeue(&sta->tx_filtered[ac]);
                                if (!skb) {
                                        skb = skb_dequeue(
                                                &sta->ps_tx_buf[ac]);
                                        /* only ps_tx_buf frames are counted globally */
                                        if (skb)
                                                local->total_ps_buffered--;
                                }
                                if (!skb)
                                        break;
                                n_frames--;
                                __skb_queue_tail(frames, skb);
                        }
                }

                /* If we have more frames buffered on this AC, then abort the
                 * loop since we can't send more data from other ACs before
                 * the buffered frames from this.
                 */
                if (!skb_queue_empty(&sta->tx_filtered[ac]) ||
                    !skb_queue_empty(&sta->ps_tx_buf[ac]))
                        break;
        }
}

/*
 * Respond to a PS-Poll or U-APSD trigger from a powersaving station.
 *
 * @sta: station that polled / triggered
 * @n_frames: maximum number of frames to release
 * @ignored_acs: ACs (as a QoS info mask) that must not be used
 * @reason: IEEE80211_FRAME_RELEASE_PSPOLL or IEEE80211_FRAME_RELEASE_UAPSD
 *
 * Depending on what ieee80211_sta_ps_get_frames() finds, one of three
 * things happens:
 *  - nothing is buffered anywhere: a (QoS-)nullfunc frame is sent;
 *  - frames were dequeued in software: they are marked up (more-data,
 *    EOSP, TX flags) and queued for transmission, followed by a
 *    QoS-nullfunc frame if the last one cannot carry EOSP;
 *  - frames are buffered in the driver / txqs: the driver is asked to
 *    release them.
 */
static void
ieee80211_sta_ps_deliver_response(struct sta_info *sta,
                                  int n_frames, u8 ignored_acs,
                                  enum ieee80211_frame_release_type reason)
{
        struct ieee80211_sub_if_data *sdata = sta->sdata;
        struct ieee80211_local *local = sdata->local;
        unsigned long driver_release_tids = 0;
        struct sk_buff_head frames;
        bool more_data;

        /* Service or PS-Poll period starts */
        set_sta_flag(sta, WLAN_STA_SP);

        __skb_queue_head_init(&frames);

        ieee80211_sta_ps_get_frames(sta, n_frames, ignored_acs, reason,
                                    &frames, &driver_release_tids);

        /* evaluated before a PS-Poll narrows driver_release_tids to one TID */
        more_data = ieee80211_sta_ps_more_data(sta, ignored_acs, reason, driver_release_tids);

        /* a PS-Poll releases from a single TID only: pick the best one */
        if (driver_release_tids && reason == IEEE80211_FRAME_RELEASE_PSPOLL)
                driver_release_tids =
                        BIT(find_highest_prio_tid(driver_release_tids));

        if (skb_queue_empty(&frames) && !driver_release_tids) {
                int tid, ac;

                /*
                 * For PS-Poll, this can only happen due to a race condition
                 * when we set the TIM bit and the station notices it, but
                 * before it can poll for the frame we expire it.
                 *
                 * For uAPSD, this is said in the standard (11.2.1.5 h):
                 *        At each unscheduled SP for a non-AP STA, the AP shall
                 *        attempt to transmit at least one MSDU or MMPDU, but no
                 *        more than the value specified in the Max SP Length field
                 *        in the QoS Capability element from delivery-enabled ACs,
                 *        that are destined for the non-AP STA.
                 *
                 * Since we have no other MSDU/MMPDU, transmit a QoS null frame.
                 */

                /*
                 * This will evaluate to 1, 3, 5 or 7.
                 * NOTE(review): that relies on at least one AC not being
                 * in ignored_acs; otherwise the loop ends with
                 * ac == IEEE80211_NUM_ACS - confirm callers guarantee it.
                 */
                for (ac = IEEE80211_AC_VO; ac < IEEE80211_NUM_ACS; ac++)
                        if (!(ignored_acs & ieee80211_ac_to_qos_mask[ac]))
                                break;
                tid = 7 - 2 * ac;

                ieee80211_send_null_response(sta, tid, reason, true, false);
        } else if (!driver_release_tids) {
                struct sk_buff_head pending;
                struct sk_buff *skb;
                int num = 0;
                u16 tids = 0;
                bool need_null = false;

                skb_queue_head_init(&pending);

                while ((skb = __skb_dequeue(&frames))) {
                        struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
                        struct ieee80211_hdr *hdr = (void *) skb->data;
                        u8 *qoshdr = NULL;

                        num++;

                        /*
                         * Tell TX path to send this frame even though the
                         * STA may still remain in PS mode after this frame
                         * exchange.
                         */
                        info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER;
                        info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE;

                        /*
                         * Use MoreData flag to indicate whether there are
                         * more buffered frames for this STA
                         */
                        if (more_data || !skb_queue_empty(&frames))
                                hdr->frame_control |=
                                        cpu_to_le16(IEEE80211_FCTL_MOREDATA);
                        else
                                hdr->frame_control &=
                                        cpu_to_le16(~IEEE80211_FCTL_MOREDATA);

                        /* only frames with a QoS header can carry EOSP */
                        if (ieee80211_is_data_qos(hdr->frame_control) ||
                            ieee80211_is_qos_nullfunc(hdr->frame_control))
                                qoshdr = ieee80211_get_qos_ctl(hdr);

                        /* remember every TID a released frame belongs to */
                        tids |= BIT(skb->priority);

                        __skb_queue_tail(&pending, skb);

                        /* end service period after last frame or add one */
                        if (!skb_queue_empty(&frames))
                                continue;

                        /* everything below only runs for the last frame */
                        if (reason != IEEE80211_FRAME_RELEASE_UAPSD) {
                                /* for PS-Poll, there's only one frame */
                                info->flags |= IEEE80211_TX_STATUS_EOSP |
                                               IEEE80211_TX_CTL_REQ_TX_STATUS;
                                break;
                        }

                        /* For uAPSD, things are a bit more complicated. If the
                         * last frame has a QoS header (i.e. is a QoS-data or
                         * QoS-nulldata frame) then just set the EOSP bit there
                         * and be done.
                         * If the frame doesn't have a QoS header (which means
                         * it should be a bufferable MMPDU) then we can't set
                         * the EOSP bit in the QoS header; add a QoS-nulldata
                         * frame to the list to send it after the MMPDU.
                         *
                         * Note that this code is only in the mac80211-release
                         * code path, we assume that the driver will not buffer
                         * anything but QoS-data frames, or if it does, will
                         * create the QoS-nulldata frame by itself if needed.
                         *
                         * Cf. 802.11-2012 10.2.1.10 (c).
                         */
                        if (qoshdr) {
                                *qoshdr |= IEEE80211_QOS_CTL_EOSP;

                                info->flags |= IEEE80211_TX_STATUS_EOSP |
                                               IEEE80211_TX_CTL_REQ_TX_STATUS;
                        } else {
                                /* The standard isn't completely clear on this
                                 * as it says the more-data bit should be set
                                 * if there are more BUs. The QoS-Null frame
                                 * we're about to send isn't buffered yet, we
                                 * only create it below, but let's pretend it
                                 * was buffered just in case some clients only
                                 * expect more-data=0 when eosp=1.
                                 */
                                hdr->frame_control |=
                                        cpu_to_le16(IEEE80211_FCTL_MOREDATA);
                                need_null = true;
                                /* the extra QoS-nulldata frame counts as well */
                                num++;
                        }
                        break;
                }

                drv_allow_buffered_frames(local, sta, tids, num,
                                          reason, more_data);

                ieee80211_add_pending_skbs(local, &pending);

                /* driver was already told above, hence call_driver = false */
                if (need_null)
                        ieee80211_send_null_response(
                                sta, find_highest_prio_tid(tids),
                                reason, false, false);

                sta_info_recalc_tim(sta);
        } else {
                int tid;

                /*
                 * We need to release a frame that is buffered somewhere in the
                 * driver ... it'll have to handle that.
                 * Note that the driver also has to check the number of frames
                 * on the TIDs we're releasing from - if there are more than
                 * n_frames it has to set the more-data bit (if we didn't ask
                 * it to set it anyway due to other buffered frames); if there
                 * are fewer than n_frames it has to make sure to adjust that
                 * to allow the service period to end properly.
                 */
                drv_release_buffered_frames(local, sta, driver_release_tids,
                                            n_frames, reason, more_data);

                /*
                 * Note that we don't recalculate the TIM bit here as it would
                 * most likely have no effect at all unless the driver told us
                 * that the TID(s) became empty before returning here from the
                 * release function.
                 * Either way, however, when the driver tells us that the TID(s)
                 * became empty or we find that a txq became empty, we'll do the
                 * TIM recalculation.
                 */

                /* one emptied txq among the released TIDs is enough to recalc */
                for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
                        if (!sta->sta.txq[tid] ||
                            !(driver_release_tids & BIT(tid)) ||
                            txq_has_queue(sta->sta.txq[tid]))
                                continue;

                        sta_info_recalc_tim(sta);
                        break;
                }
        }
}

/*
 * Hand a PS-Poll off to ieee80211_sta_ps_deliver_response(), asking for a
 * single frame with the IEEE80211_FRAME_RELEASE_PSPOLL reason.
 *
 * If all ACs are delivery-enabled then we should reply from any of them,
 * if only some are enabled we reply only from the non-enabled ones.
 */
void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta)
{
        u8 uapsd_acs = sta->sta.uapsd_queues;
        u8 ignore_for_response;

        if (uapsd_acs == BIT(IEEE80211_NUM_ACS) - 1)
                ignore_for_response = 0;
        else
                ignore_for_response = uapsd_acs;

        ieee80211_sta_ps_deliver_response(sta, 1, ignore_for_response,
                                          IEEE80211_FRAME_RELEASE_PSPOLL);
}

/*
 * Start a U-APSD delivery: translate the station's max_sp value into a
 * frame count and call ieee80211_sta_ps_deliver_response() with the
 * IEEE80211_FRAME_RELEASE_UAPSD reason, ignoring every queue that is not
 * delivery-enabled.
 */
void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta)
{
        int n_frames = sta->sta.max_sp;
        u8 delivery_enabled = sta->sta.uapsd_queues;

        /*
         * If we ever grow support for TSPEC this might happen if
         * the TSPEC update from hostapd comes in between a trigger
         * frame setting WLAN_STA_UAPSD in the RX path and this
         * actually getting called.
         */
        if (!delivery_enabled)
                return;

        /*
         * max_sp 1, 2, 3 map to 2, 4, 6 frames; 0 maps to 128. Any other
         * value is passed through unchanged.
         */
        if (n_frames == 0) {
                /* XXX: what is a good value? */
                n_frames = 128;
        } else if (n_frames >= 1 && n_frames <= 3) {
                n_frames *= 2;
        }

        ieee80211_sta_ps_deliver_response(sta, n_frames, ~delivery_enabled,
                                          IEEE80211_FRAME_RELEASE_UAPSD);
}

/*
 * Driver entry point to set or clear WLAN_STA_PS_DRIVER on a station.
 *
 * @hw: hardware whose workqueue runs the deferred delivery work
 * @pubsta: public part of the station entry
 * @block: true to set WLAN_STA_PS_DRIVER (and drop the fast-xmit state),
 *         false to clear it again
 *
 * When unblocking, the follow-up action depends on the station's other
 * power-save flags; see the comments in the body.
 */
void ieee80211_sta_block_awake(struct ieee80211_hw *hw,
                               struct ieee80211_sta *pubsta, bool block)
{
        struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
        bool deliver;

        trace_api_sta_block_awake(sta->local, pubsta, block);

        if (block) {
                set_sta_flag(sta, WLAN_STA_PS_DRIVER);
                ieee80211_clear_fast_xmit(sta);
                return;
        }

        /* nothing to undo if the driver flag is not set */
        if (!test_sta_flag(sta, WLAN_STA_PS_DRIVER))
                return;

        if (!test_sta_flag(sta, WLAN_STA_PS_STA)) {
                /* PS_DELIVER is raised before PS_DRIVER is dropped */
                set_sta_flag(sta, WLAN_STA_PS_DELIVER);
                clear_sta_flag(sta, WLAN_STA_PS_DRIVER);
                ieee80211_queue_work(hw, &sta->drv_deliver_wk);
                return;
        }

        /*
         * WLAN_STA_PS_STA is set from here on. With a PS-Poll or U-APSD
         * flag pending the delivery work is queued, otherwise only the
         * fast-xmit state is re-checked.
         */
        deliver = test_sta_flag(sta, WLAN_STA_PSPOLL) ||
                  test_sta_flag(sta, WLAN_STA_UAPSD);

        clear_sta_flag(sta, WLAN_STA_PS_DRIVER);

        if (deliver)
                ieee80211_queue_work(hw, &sta->drv_deliver_wk);
        else
                ieee80211_check_fast_xmit(sta);
}
EXPORT_SYMBOL(ieee80211_sta_block_awake);

/*
 * Driver notification that clears WLAN_STA_SP on the station, after
 * emitting the api_eosp tracepoint.
 */
void ieee80211_sta_eosp(struct ieee80211_sta *pubsta)
{
        struct sta_info *sta = container_of(pubsta, struct sta_info, sta);

        trace_api_eosp(sta->local, pubsta);
        clear_sta_flag(sta, WLAN_STA_SP);
}
EXPORT_SYMBOL(ieee80211_sta_eosp);

/*
 * Send a null response on @tid with the U-APSD release reason.
 *
 * The more-data argument is computed by ieee80211_sta_ps_more_data() over
 * the queues that are not U-APSD enabled.
 */
void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid)
{
        struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
        bool more_data;

        trace_api_send_eosp_nullfunc(sta->local, pubsta, tid);

        more_data = ieee80211_sta_ps_more_data(sta, ~sta->sta.uapsd_queues,
                                               IEEE80211_FRAME_RELEASE_UAPSD,
                                               0);

        ieee80211_send_null_response(sta, tid, IEEE80211_FRAME_RELEASE_UAPSD,
                                     false, more_data);
}
EXPORT_SYMBOL(ieee80211_send_eosp_nullfunc);

/*
 * Record whether the driver holds buffered frames for @tid and then
 * recalculate the station's TIM bit.
 *
 * A @tid of IEEE80211_NUM_TIDS or above triggers a WARN and is ignored.
 */
void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta,
                                u8 tid, bool buffered)
{
        struct sta_info *sta = container_of(pubsta, struct sta_info, sta);

        if (WARN_ON(tid >= IEEE80211_NUM_TIDS))
                return;

        trace_api_sta_set_buffered(sta->local, pubsta, tid, buffered);

        if (!buffered)
                clear_bit(tid, &sta->driver_buffered_tids);
        else
                set_bit(tid, &sta->driver_buffered_tids);

        sta_info_recalc_tim(sta);
}
EXPORT_SYMBOL(ieee80211_sta_set_buffered);

/*
 * Account TX/RX airtime reported for a station.
 *
 * The raw @tx_airtime and @rx_airtime are always added to the per-AC
 * counters of the AC that @tid maps to. The deficit is charged only with
 * the directions enabled in airtime_flags, and only while
 * ieee80211_sta_keep_active() returns true for that AC.
 */
void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid,
                                    u32 tx_airtime, u32 rx_airtime)
{
        struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
        struct ieee80211_local *local = sta->sdata->local;
        u8 ac = ieee80211_ac_from_tid(tid);
        u32 charged;

        charged = (sta->local->airtime_flags & AIRTIME_USE_TX) ?
                        tx_airtime : 0;
        charged += (sta->local->airtime_flags & AIRTIME_USE_RX) ?
                        rx_airtime : 0;

        spin_lock_bh(&local->active_txq_lock[ac]);

        sta->airtime[ac].tx_airtime += tx_airtime;
        sta->airtime[ac].rx_airtime += rx_airtime;
        if (ieee80211_sta_keep_active(sta, ac))
                sta->airtime[ac].deficit -= charged;

        spin_unlock_bh(&local->active_txq_lock[ac]);
}
EXPORT_SYMBOL(ieee80211_sta_register_airtime);

/*
 * Recompute the aggregate A-MSDU limits that sta->sta.cur points at.
 *
 * Without valid_links or without mlo, sta->sta.cur simply points at the
 * default link's values. Otherwise sta->cur is seeded from the default
 * link's values when the first link that is both set in @active_links and
 * present is found, every further such link lowers each limit to the
 * smaller value, and sta->sta.cur is pointed at sta->cur.
 */
void __ieee80211_sta_recalc_aggregates(struct sta_info *sta, u16 active_links)
{
        bool seeded = false;
        int link_id;

        if (!sta->sta.valid_links || !sta->sta.mlo) {
                sta->sta.cur = &sta->sta.deflink.agg;
                return;
        }

        rcu_read_lock();
        for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) {
                struct ieee80211_link_sta *link_sta;
                int tid;

                if (!(active_links & BIT(link_id)))
                        continue;

                link_sta = rcu_dereference(sta->sta.link[link_id]);
                if (!link_sta)
                        continue;

                /* the first usable link only seeds the running minimum */
                if (!seeded) {
                        sta->cur = sta->sta.deflink.agg;
                        seeded = true;
                        continue;
                }

                if (link_sta->agg.max_amsdu_len < sta->cur.max_amsdu_len)
                        sta->cur.max_amsdu_len =
                                link_sta->agg.max_amsdu_len;

                if (link_sta->agg.max_rc_amsdu_len <
                    sta->cur.max_rc_amsdu_len)
                        sta->cur.max_rc_amsdu_len =
                                link_sta->agg.max_rc_amsdu_len;

                for (tid = 0;
                     tid < ARRAY_SIZE(sta->cur.max_tid_amsdu_len);
                     tid++) {
                        if (link_sta->agg.max_tid_amsdu_len[tid] <
                            sta->cur.max_tid_amsdu_len[tid])
                                sta->cur.max_tid_amsdu_len[tid] =
                                        link_sta->agg.max_tid_amsdu_len[tid];
                }
        }
        rcu_read_unlock();

        sta->sta.cur = &sta->cur;
}

void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta)
{
        struct sta_info *sta = container_of(pubsta, struct sta_info, sta);

        __ieee80211_sta_recalc_aggregates(sta, sta->sdata->vif.active_links);
}
EXPORT_SYMBOL(ieee80211_sta_recalc_aggregates);

/*
 * Update the AQL pending-airtime counters.
 *
 * @local: device state holding the total and per-AC counters
 * @sta: station whose per-AC counter is updated as well; may be NULL
 * @ac: access category index
 * @tx_airtime: airtime to add (queued) or remove (completed)
 * @tx_completed: false when the airtime is being queued, true on completion
 *
 * Does nothing unless the wiphy advertises NL80211_EXT_FEATURE_AQL.
 * On completion, a counter that drops below zero is reset to 0 with
 * atomic_cmpxchg(), i.e. only if nobody changed it in the meantime. An
 * underflow of the per-AC device counter is additionally reported once via
 * WARN_ONCE() and the excess is given back to the device-wide total.
 */
void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local,
                                          struct sta_info *sta, u8 ac,
                                          u16 tx_airtime, bool tx_completed)
{
        int tx_pending;

        if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL))
                return;

        if (!tx_completed) {
                if (sta)
                        atomic_add(tx_airtime,
                                   &sta->airtime[ac].aql_tx_pending);

                atomic_add(tx_airtime, &local->aql_total_pending_airtime);
                atomic_add(tx_airtime, &local->aql_ac_pending_airtime[ac]);
                return;
        }

        if (sta) {
                tx_pending = atomic_sub_return(tx_airtime,
                                               &sta->airtime[ac].aql_tx_pending);
                if (tx_pending < 0)
                        atomic_cmpxchg(&sta->airtime[ac].aql_tx_pending,
                                       tx_pending, 0);
        }

        atomic_sub(tx_airtime, &local->aql_total_pending_airtime);
        tx_pending = atomic_sub_return(tx_airtime,
                                       &local->aql_ac_pending_airtime[ac]);
        /* tx_pending is a signed int and negative here, so print it as %d */
        if (WARN_ONCE(tx_pending < 0,
                      "Device %s AC %d pending airtime underflow: %d, %u",
                      wiphy_name(local->hw.wiphy), ac, tx_pending,
                      tx_airtime)) {
                atomic_cmpxchg(&local->aql_ac_pending_airtime[ac],
                               tx_pending, 0);
                /* subtracting the negative value adds the excess back */
                atomic_sub(tx_pending, &local->aql_total_pending_airtime);
        }
}

/*
 * Return the RX statistics instance with the most recent last_rx stamp.
 *
 * Starts from the default link's rx_stats and, when per-CPU statistics
 * are allocated, switches to any per-CPU instance whose last_rx is later.
 */
static struct ieee80211_sta_rx_stats *
sta_get_last_rx_stats(struct sta_info *sta)
{
        struct ieee80211_sta_rx_stats *newest = &sta->deflink.rx_stats;
        int cpu;

        if (sta->deflink.pcpu_rx_stats) {
                for_each_possible_cpu(cpu) {
                        struct ieee80211_sta_rx_stats *candidate =
                                per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu);

                        if (time_after(candidate->last_rx, newest->last_rx))
                                newest = candidate;
                }
        }

        return newest;
}

/*
 * Decode a packed STA_STATS rate word into @rinfo.
 *
 * @local: device state, used to look up the band's bitrate table for
 *         legacy rates
 * @rate: encoded rate; its fields are extracted with STA_STATS_GET()
 * @rinfo: output; bw is always written, the remaining fields depend on
 *         the encoded rate type
 *
 * A rate type that matches none of the cases leaves @rinfo untouched
 * apart from bw. For legacy rates the band entry and the rate index are
 * validated before the bitrate table is read; on failure a one-time
 * warning is emitted and rinfo->legacy is not written.
 */
static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate,
                                  struct rate_info *rinfo)
{
        rinfo->bw = STA_STATS_GET(BW, rate);

        switch (STA_STATS_GET(TYPE, rate)) {
        case STA_STATS_RATE_TYPE_VHT:
                rinfo->flags = RATE_INFO_FLAGS_VHT_MCS;
                rinfo->mcs = STA_STATS_GET(VHT_MCS, rate);
                rinfo->nss = STA_STATS_GET(VHT_NSS, rate);
                if (STA_STATS_GET(SGI, rate))
                        rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI;
                break;
        case STA_STATS_RATE_TYPE_HT:
                rinfo->flags = RATE_INFO_FLAGS_MCS;
                rinfo->mcs = STA_STATS_GET(HT_MCS, rate);
                if (STA_STATS_GET(SGI, rate))
                        rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI;
                break;
        case STA_STATS_RATE_TYPE_LEGACY: {
                struct ieee80211_supported_band *sband;
                u16 brate;
                unsigned int shift;
                int band = STA_STATS_GET(LEGACY_BAND, rate);
                int rate_idx = STA_STATS_GET(LEGACY_IDX, rate);

                sband = local->hw.wiphy->bands[band];

                /*
                 * Never dereference a missing band entry or read past
                 * the end of its bitrate table.
                 */
                if (WARN_ON_ONCE(!sband || !sband->bitrates ||
                                 rate_idx >= sband->n_bitrates))
                        break;

                brate = sband->bitrates[rate_idx].bitrate;
                /* narrower channels scale the bitrate down: /4 and /2 */
                if (rinfo->bw == RATE_INFO_BW_5)
                        shift = 2;
                else if (rinfo->bw == RATE_INFO_BW_10)
                        shift = 1;
                else
                        shift = 0;
                rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift);
                break;
                }
        case STA_STATS_RATE_TYPE_HE:
                rinfo->flags = RATE_INFO_FLAGS_HE_MCS;
                rinfo->mcs = STA_STATS_GET(HE_MCS, rate);
                rinfo->nss = STA_STATS_GET(HE_NSS, rate);
                rinfo->he_gi = STA_STATS_GET(HE_GI, rate);
                rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, rate);
                rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate);
                break;
        case STA_STATS_RATE_TYPE_EHT:
                rinfo->flags = RATE_INFO_FLAGS_EHT_MCS;
                rinfo->mcs = STA_STATS_GET(EHT_MCS, rate);
                rinfo->nss = STA_STATS_GET(EHT_NSS, rate);
                rinfo->eht_gi = STA_STATS_GET(EHT_GI, rate);
                rinfo->eht_ru_alloc = STA_STATS_GET(EHT_RU, rate);
                break;
        }
}

static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo)
{
        u32 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate);

        if (rate == STA_STATS_RATE_INVALID)
                return -EINVAL;

        sta_stats_decode_rate(sta->local, rate, rinfo);
        return 0;
}

/*
 * Read rxstats->msdu[tid] inside a u64_stats fetch_begin/fetch_retry
 * section, repeating the read until fetch_retry reports no retry needed.
 */
static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats,
                                        int tid)
{
        unsigned int seq;
        u64 msdu;

        for (;;) {
                seq = u64_stats_fetch_begin(&rxstats->syncp);
                msdu = rxstats->msdu[tid];
                if (!u64_stats_fetch_retry(&rxstats->syncp, seq))
                        break;
        }

        return msdu;
}

static void sta_set_tidstats(struct sta_info *sta,
                             struct cfg80211_tid_stats *tidstats,
                             int tid)
{
        struct ieee80211_local *local = sta->local;
        int cpu;

        if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) {
                tidstats->rx_msdu += sta_get_tidstats_msdu(&sta->deflink.rx_stats,
                                                           tid);

                if (sta->deflink.pcpu_rx_stats) {
                        for_each_possible_cpu(cpu) {
                                struct ieee80211_sta_rx_stats *cpurxs;

                                cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats,
                                                     cpu);
                                tidstats->rx_msdu +=
                                        sta_get_tidstats_msdu(cpurxs, tid);
                        }
                }

                tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU);
        }

        if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) {
                tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU);
                tidstats->tx_msdu = sta->deflink.tx_stats.msdu[tid];
        }

        if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) &&
            ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
                tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES);
                tidstats->tx_msdu_retries = sta->deflink.status_stats.msdu_retries[tid];
        }

        if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) &&
            ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
                tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED);
                tidstats->tx_msdu_failed = sta->deflink.status_stats.msdu_failed[tid];
        }

        if (tid < IEEE80211_NUM_TIDS) {
                spin_lock_bh(&local->fq.lock);
                rcu_read_lock();

                tidstats->filled |= BIT(NL80211_TID_STATS_TXQ_STATS);
                ieee80211_fill_txq_stats(&tidstats->txq_stats,
                                         to_txq_info(sta->sta.txq[tid]));

                rcu_read_unlock();
                spin_unlock_bh(&local->fq.lock);
        }
}

/*
 * Read rxstats->bytes inside a u64_stats fetch_begin/fetch_retry section,
 * repeating the read until fetch_retry reports no retry needed.
 */
static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats)
{
        unsigned int seq;
        u64 bytes;

        for (;;) {
                seq = u64_stats_fetch_begin(&rxstats->syncp);
                bytes = rxstats->bytes;
                if (!u64_stats_fetch_retry(&rxstats->syncp, seq))
                        break;
        }

        return bytes;
}

/*
 * Fill a station_info structure for @sta.
 *
 * @sta: station to report on
 * @sinfo: output; the driver gets to fill it first through
 *         drv_sta_statistics(), and most values below are only written
 *         when the corresponding bit in sinfo->filled is still clear
 * @tidstats: when true, per-TID statistics are allocated with
 *            cfg80211_sinfo_alloc_tid_stats(GFP_KERNEL) and filled in
 *
 * Inactive time, connected time, association time, RX dropped count,
 * BSS parameters and station flags are always written by this function.
 */
void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
                   bool tidstats)
{
        struct ieee80211_sub_if_data *sdata = sta->sdata;
        struct ieee80211_local *local = sdata->local;
        u32 thr = 0;
        int i, ac, cpu;
        struct ieee80211_sta_rx_stats *last_rxstats;

        /* RX stats instance with the latest last_rx, used for signal data */
        last_rxstats = sta_get_last_rx_stats(sta);

        sinfo->generation = sdata->local->sta_generation;

        /* do before driver, so beacon filtering drivers have a
         * chance to e.g. just add the number of filtered beacons
         * (or just modify the value entirely, of course)
         */
        if (sdata->vif.type == NL80211_IFTYPE_STATION)
                sinfo->rx_beacon = sdata->deflink.u.mgd.count_beacon_signal;

        drv_sta_statistics(local, sdata, &sta->sta, sinfo);
        /* these values are written unconditionally further down */
        sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) |
                         BIT_ULL(NL80211_STA_INFO_STA_FLAGS) |
                         BIT_ULL(NL80211_STA_INFO_BSS_PARAM) |
                         BIT_ULL(NL80211_STA_INFO_CONNECTED_TIME) |
                         BIT_ULL(NL80211_STA_INFO_ASSOC_AT_BOOTTIME) |
                         BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC);

        if (sdata->vif.type == NL80211_IFTYPE_STATION) {
                sinfo->beacon_loss_count =
                        sdata->deflink.u.mgd.beacon_loss_count;
                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS);
        }

        sinfo->connected_time = ktime_get_seconds() - sta->last_connected;
        sinfo->assoc_at = sta->assoc_at;
        sinfo->inactive_time =
                jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta));

        /* TX bytes/packets: sum of the per-AC counters of the default link */
        if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) |
                               BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) {
                sinfo->tx_bytes = 0;
                for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
                        sinfo->tx_bytes += sta->deflink.tx_stats.bytes[ac];
                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64);
        }

        if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) {
                sinfo->tx_packets = 0;
                for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
                        sinfo->tx_packets += sta->deflink.tx_stats.packets[ac];
                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS);
        }

        /*
         * RX bytes/packets: default-link counter plus, when allocated,
         * the counter of every possible CPU.
         */
        if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) |
                               BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) {
                sinfo->rx_bytes += sta_get_stats_bytes(&sta->deflink.rx_stats);

                if (sta->deflink.pcpu_rx_stats) {
                        for_each_possible_cpu(cpu) {
                                struct ieee80211_sta_rx_stats *cpurxs;

                                cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats,
                                                     cpu);
                                sinfo->rx_bytes += sta_get_stats_bytes(cpurxs);
                        }
                }

                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64);
        }

        if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) {
                sinfo->rx_packets = sta->deflink.rx_stats.packets;
                if (sta->deflink.pcpu_rx_stats) {
                        for_each_possible_cpu(cpu) {
                                struct ieee80211_sta_rx_stats *cpurxs;

                                cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats,
                                                     cpu);
                                sinfo->rx_packets += cpurxs->packets;
                        }
                }
                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS);
        }

        if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) {
                sinfo->tx_retries = sta->deflink.status_stats.retry_count;
                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES);
        }

        if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) {
                sinfo->tx_failed = sta->deflink.status_stats.retry_failed;
                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED);
        }

        /* RX/TX duration: accumulated per-AC airtime counters */
        if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_DURATION))) {
                for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
                        sinfo->rx_duration += sta->airtime[ac].rx_airtime;
                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_DURATION);
        }

        if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_DURATION))) {
                for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
                        sinfo->tx_duration += sta->airtime[ac].tx_airtime;
                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_DURATION);
        }

        if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT))) {
                sinfo->airtime_weight = sta->airtime_weight;
                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT);
        }

        /* always overwritten: default-link plus per-CPU dropped counters */
        sinfo->rx_dropped_misc = sta->deflink.rx_stats.dropped;
        if (sta->deflink.pcpu_rx_stats) {
                for_each_possible_cpu(cpu) {
                        struct ieee80211_sta_rx_stats *cpurxs;

                        cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu);
                        sinfo->rx_dropped_misc += cpurxs->dropped;
                }
        }

        if (sdata->vif.type == NL80211_IFTYPE_STATION &&
            !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) {
                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) |
                                 BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG);
                sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif);
        }

        if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) ||
            ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) {
                if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) {
                        sinfo->signal = (s8)last_rxstats->last_signal;
                        sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL);
                }

                /* the average is only reported without per-CPU RX stats */
                if (!sta->deflink.pcpu_rx_stats &&
                    !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) {
                        sinfo->signal_avg =
                                -ewma_signal_read(&sta->deflink.rx_stats_avg.signal);
                        sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG);
                }
        }

        /* for the average - if pcpu_rx_stats isn't set - rxstats must point to
         * the sta->rx_stats struct, so the check here is fine with and without
         * pcpu statistics
         */
        if (last_rxstats->chains &&
            !(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) |
                               BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) {
                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL);
                if (!sta->deflink.pcpu_rx_stats)
                        sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG);

                sinfo->chains = last_rxstats->chains;

                for (i = 0; i < ARRAY_SIZE(sinfo->chain_signal); i++) {
                        sinfo->chain_signal[i] =
                                last_rxstats->chain_signal_last[i];
                        sinfo->chain_signal_avg[i] =
                                -ewma_signal_read(&sta->deflink.rx_stats_avg.chain_signal[i]);
                }
        }

        /* TX/RX bitrate are only filled here when valid_links is not set */
        if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) &&
            !sta->sta.valid_links &&
            ieee80211_rate_valid(&sta->deflink.tx_stats.last_rate)) {
                sta_set_rate_info_tx(sta, &sta->deflink.tx_stats.last_rate,
                                     &sinfo->txrate);
                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE);
        }

        if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) &&
            !sta->sta.valid_links) {
                if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0)
                        sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE);
        }

        /*
         * Per-TID stats are skipped when the allocation returns non-zero;
         * IEEE80211_NUM_TIDS + 1 entries are filled otherwise.
         */
        if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) {
                for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++)
                        sta_set_tidstats(sta, &sinfo->pertid[i], i);
        }

        if (ieee80211_vif_is_mesh(&sdata->vif)) {
#ifdef CONFIG_MAC80211_MESH
                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_LLID) |
                                 BIT_ULL(NL80211_STA_INFO_PLID) |
                                 BIT_ULL(NL80211_STA_INFO_PLINK_STATE) |
                                 BIT_ULL(NL80211_STA_INFO_LOCAL_PM) |
                                 BIT_ULL(NL80211_STA_INFO_PEER_PM) |
                                 BIT_ULL(NL80211_STA_INFO_NONPEER_PM) |
                                 BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE) |
                                 BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_AS);

                sinfo->llid = sta->mesh->llid;
                sinfo->plid = sta->mesh->plid;
                sinfo->plink_state = sta->mesh->plink_state;
                if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) {
                        sinfo->filled |= BIT_ULL(NL80211_STA_INFO_T_OFFSET);
                        sinfo->t_offset = sta->mesh->t_offset;
                }
                sinfo->local_pm = sta->mesh->local_pm;
                sinfo->peer_pm = sta->mesh->peer_pm;
                sinfo->nonpeer_pm = sta->mesh->nonpeer_pm;
                sinfo->connected_to_gate = sta->mesh->connected_to_gate;
                sinfo->connected_to_as = sta->mesh->connected_to_as;
#endif
        }

        /* BSS parameters come from the interface's bss_conf */
        sinfo->bss_param.flags = 0;
        if (sdata->vif.bss_conf.use_cts_prot)
                sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT;
        if (sdata->vif.bss_conf.use_short_preamble)
                sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE;
        if (sdata->vif.bss_conf.use_short_slot)
                sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME;
        sinfo->bss_param.dtim_period = sdata->vif.bss_conf.dtim_period;
        sinfo->bss_param.beacon_interval = sdata->vif.bss_conf.beacon_int;

        /* mask lists every flag reported; set holds the ones currently on */
        sinfo->sta_flags.set = 0;
        sinfo->sta_flags.mask = BIT(NL80211_STA_FLAG_AUTHORIZED) |
                                BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) |
                                BIT(NL80211_STA_FLAG_WME) |
                                BIT(NL80211_STA_FLAG_MFP) |
                                BIT(NL80211_STA_FLAG_AUTHENTICATED) |
                                BIT(NL80211_STA_FLAG_ASSOCIATED) |
                                BIT(NL80211_STA_FLAG_TDLS_PEER);
        if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
                sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHORIZED);
        if (test_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE))
                sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_SHORT_PREAMBLE);
        if (sta->sta.wme)
                sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_WME);
        if (test_sta_flag(sta, WLAN_STA_MFP))
                sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_MFP);
        if (test_sta_flag(sta, WLAN_STA_AUTH))
                sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHENTICATED);
        if (test_sta_flag(sta, WLAN_STA_ASSOC))
                sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_ASSOCIATED);
        if (test_sta_flag(sta, WLAN_STA_TDLS_PEER))
                sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER);

        thr = sta_get_expected_throughput(sta);

        /* a throughput of 0 is not reported */
        if (thr != 0) {
                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT);
                sinfo->expected_throughput = thr;
        }

        if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL)) &&
            sta->deflink.status_stats.ack_signal_filled) {
                sinfo->ack_signal = sta->deflink.status_stats.last_ack_signal;
                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL);
        }

        if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) &&
            sta->deflink.status_stats.ack_signal_filled) {
                sinfo->avg_ack_signal =
                        -(s8)ewma_avg_signal_read(
                                &sta->deflink.status_stats.avg_ack_signal);
                sinfo->filled |=
                        BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG);
        }

        if (ieee80211_vif_is_mesh(&sdata->vif)) {
                sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_LINK_METRIC);
                sinfo->airtime_link_metric =
                        airtime_link_metric_get(local, sta);
        }
}

/*
 * Return the expected throughput for @sta.
 *
 * The rate control algorithm is asked when the station has
 * WLAN_STA_RATE_CONTROL set and the algorithm provides
 * get_expected_throughput; in every other case the driver is asked.
 */
u32 sta_get_expected_throughput(struct sta_info *sta)
{
        struct ieee80211_local *local = sta->sdata->local;
        struct rate_control_ref *ref;

        ref = test_sta_flag(sta, WLAN_STA_RATE_CONTROL) ?
                        local->rate_ctrl : NULL;

        /* check if the driver has a SW RC implementation */
        if (ref && ref->ops->get_expected_throughput)
                return ref->ops->get_expected_throughput(sta->rate_ctrl_priv);

        return drv_get_expected_throughput(local, sta);
}

/*
 * Return the later of the last RX time and the last ACK time.
 *
 * A last_ack of 0 is treated as "no ACK recorded" and the last RX time is
 * returned.
 */
unsigned long ieee80211_sta_last_active(struct sta_info *sta)
{
        struct ieee80211_sta_rx_stats *rx = sta_get_last_rx_stats(sta);
        unsigned long last_ack = sta->deflink.status_stats.last_ack;

        if (last_ack && !time_after(rx->last_rx, last_ack))
                return last_ack;

        return rx->last_rx;
}

/*
 * Select the station's CoDel parameters from its throughput @thr.
 *
 * A non-zero @thr below STA_SLOW_THRESHOLD times the number of stations
 * selects the 50 ms target / 300 ms interval set with ECN off; everything
 * else (including @thr == 0) selects 20 ms / 100 ms with ECN on.
 */
static void sta_update_codel_params(struct sta_info *sta, u32 thr)
{
        bool slow = thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta;

        sta->cparams.target = slow ? MS2TIME(50) : MS2TIME(20);
        sta->cparams.interval = slow ? MS2TIME(300) : MS2TIME(100);
        sta->cparams.ecn = !slow;
}

/*
 * Update the station's CoDel parameters from the throughput @thr by
 * passing it to sta_update_codel_params().
 */
void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta,
                                           u32 thr)
{
        sta_update_codel_params(container_of(pubsta, struct sta_info, sta),
                                thr);
}

int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id)
{
        struct ieee80211_sub_if_data *sdata = sta->sdata;
        struct sta_link_alloc *alloc;
        int ret;

        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED));

        /* must represent an MLD from the start */
        if (WARN_ON(!sta->sta.valid_links))
                return -EINVAL;

        if (WARN_ON(sta->sta.valid_links & BIT(link_id) ||
                    sta->link[link_id]))
                return -EBUSY;

        alloc = kzalloc(sizeof(*alloc), GFP_KERNEL);
        if (!alloc)
                return -ENOMEM;

        ret = sta_info_alloc_link(sdata->local, &alloc->info, GFP_KERNEL);
        if (ret) {
                kfree(alloc);
                return ret;
        }

        sta_info_add_link(sta, link_id, &alloc->info, &alloc->sta);

        ieee80211_link_sta_debugfs_add(&alloc->info);

        return 0;
}

void ieee80211_sta_free_link(struct sta_info *sta, unsigned int link_id)
{
        lockdep_assert_wiphy(sta->sdata->local->hw.wiphy);

        WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED));

        sta_remove_link(sta, link_id, false);
}

int ieee80211_sta_activate_link(struct sta_info *sta, unsigned int link_id)
{
        struct ieee80211_sub_if_data *sdata = sta->sdata;
        struct link_sta_info *link_sta;
        u16 old_links = sta->sta.valid_links;
        u16 new_links = old_links | BIT(link_id);
        int ret;

        link_sta = rcu_dereference_protected(sta->link[link_id],
                                             lockdep_is_held(&sdata->local->hw.wiphy->mtx));

        if (WARN_ON(old_links == new_links || !link_sta))
                return -EINVAL;

        rcu_read_lock();
        if (link_sta_info_hash_lookup(sdata->local, link_sta->addr)) {
                rcu_read_unlock();
                return -EALREADY;
        }
        /* we only modify under the mutex so this is fine */
        rcu_read_unlock();

        sta->sta.valid_links = new_links;

        if (WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED)))
                goto hash;

        ieee80211_recalc_min_chandef(sdata, link_id);

        /* Ensure the values are updated for the driver,
         * redone by sta_remove_link on failure.
         */
        ieee80211_sta_recalc_aggregates(&sta->sta);

        ret = drv_change_sta_links(sdata->local, sdata, &sta->sta,
                                   old_links, new_links);
        if (ret) {
                sta->sta.valid_links = old_links;
                sta_remove_link(sta, link_id, false);
                return ret;
        }

hash:
        ret = link_sta_info_hash_add(sdata->local, link_sta);
        WARN_ON(ret);
        return 0;
}

void ieee80211_sta_remove_link(struct sta_info *sta, unsigned int link_id)
{
        struct ieee80211_sub_if_data *sdata = sta->sdata;
        u16 old_links = sta->sta.valid_links;

        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        sta->sta.valid_links &= ~BIT(link_id);

        if (!WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED)))
                drv_change_sta_links(sdata->local, sdata, &sta->sta,
                                     old_links, sta->sta.valid_links);

        sta_remove_link(sta, link_id, true);
}

/*
 * Derive sta->sta.max_amsdu_subframes from the extended capabilities bytes.
 *
 * @sta:           station whose max_amsdu_subframes field is written
 * @ext_capab:     extended capabilities bytes; indices 7 and 8 are read
 * @ext_capab_len: number of valid bytes in @ext_capab
 *
 * The field is first reset to 0.  With fewer than 8 bytes nothing else
 * happens.  Otherwise a two-bit value is assembled from the LSB in byte 7
 * and, when at least 9 bytes are present, the MSB in byte 8; a non-zero
 * value v is stored as 4 << (4 - v), i.e. 1 -> 32, 2 -> 16, 3 -> 8.
 */
void ieee80211_sta_set_max_amsdu_subframes(struct sta_info *sta,
                                           const u8 *ext_capab,
                                           unsigned int ext_capab_len)
{
        u8 lsb;
        u8 msb = 0;
        u8 field;

        sta->sta.max_amsdu_subframes = 0;

        if (ext_capab_len < 8)
                return;

        lsb = u8_get_bits(ext_capab[7], WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB);

        /* The sender might not have sent the last bit, consider it to be 0;
         * only take the MSB when the byte carrying it was actually received.
         */
        if (ext_capab_len >= 9)
                msb = u8_get_bits(ext_capab[8],
                                  WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB);

        field = lsb | (msb << 1);
        if (!field)
                return;

        sta->sta.max_amsdu_subframes = 4 << (4 - field);
}

#ifdef CONFIG_LOCKDEP
bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta)
{
        struct sta_info *sta = container_of(pubsta, struct sta_info, sta);

        return lockdep_is_held(&sta->local->hw.wiphy->mtx);
}
EXPORT_SYMBOL(lockdep_sta_mutex_held);
#endif
























































    1 























    1 



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2002-2005, Instant802 Networks, Inc.
 * Copyright 2005-2006, Devicescape Software, Inc.
 * Copyright 2006-2007        Jiri Benc <jbenc@suse.cz>
 * Copyright 2007        Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright (C) 2015-2017        Intel Deutschland GmbH
 * Copyright (C) 2018-2024 Intel Corporation
 *
 * utilities for mac80211
 */

#include <net/mac80211.h>
#include <linux/netdevice.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/skbuff.h>
#include <linux/etherdevice.h>
#include <linux/if_arp.h>
#include <linux/bitmap.h>
#include <linux/crc32.h>
#include <net/net_namespace.h>
#include <net/cfg80211.h>
#include <net/rtnetlink.h>
#include <kunit/visibility.h>

#include "ieee80211_i.h"
#include "driver-ops.h"
#include "rate.h"
#include "mesh.h"
#include "wme.h"
#include "led.h"
#include "wep.h"

/*
 * privid for wiphys to determine whether they belong to us or not.
 * The pointer is initialised to its own address, so its value is distinct
 * from the address of any other object.
 */
const void *const mac80211_wiphy_privid = &mac80211_wiphy_privid;

/*
 * Return the ieee80211_hw embedded in the ieee80211_local that is obtained
 * from @wiphy via wiphy_priv().
 */
struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy)
{
        struct ieee80211_local *local = wiphy_priv(wiphy);

        return &local->hw;
}
EXPORT_SYMBOL(wiphy_to_ieee80211_hw);

/*
 * Constant connection settings with the mode set to EHT and the bandwidth
 * limit set to the 320 value.
 * NOTE(review): the name suggests these are the least restrictive settings
 * available - confirm against the enum definitions.
 */
const struct ieee80211_conn_settings ieee80211_conn_settings_unlimited = {
        .mode = IEEE80211_CONN_MODE_EHT,
        .bw_limit = IEEE80211_CONN_BW_LIMIT_320,
};

/*
 * Locate the BSSID address field inside an 802.11 header.
 *
 * @hdr:  frame header; frame_control is read unconditionally
 * @len:  frame length, checked against 24 for data and management frames
 * @type: interface type, consulted only for block-ack request frames
 *
 * Returns a pointer into @hdr, or NULL when no address is selected:
 *  - data frames: NULL if shorter than 24 bytes or if the A4 check is true,
 *    else addr1 (ToDS), addr2 (FromDS) or addr3 (neither);
 *  - S1G beacons: the sa field of the extension header;
 *  - management frames: addr3, or NULL if shorter than 24 bytes;
 *  - control frames: addr1 for PS-Poll; for block-ack requests addr2 on
 *    station interfaces and addr1 on AP / AP_VLAN interfaces;
 *  - everything else: NULL.
 */
u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
                        enum nl80211_iftype type)
{
        const __le16 fc = hdr->frame_control;
        const bool too_short = len < 24; /* incorrect hdr len (data/mgmt) */

        if (ieee80211_is_data(fc)) {
                if (too_short || ieee80211_has_a4(fc))
                        return NULL;
                if (ieee80211_has_tods(fc))
                        return hdr->addr1;

                return ieee80211_has_fromds(fc) ? hdr->addr2 : hdr->addr3;
        }

        if (ieee80211_is_s1g_beacon(fc)) {
                struct ieee80211_ext *s1g = (void *) hdr;

                return s1g->u.s1g_beacon.sa;
        }

        if (ieee80211_is_mgmt(fc))
                return too_short ? NULL : hdr->addr3;

        if (!ieee80211_is_ctl(fc))
                return NULL;

        if (ieee80211_is_pspoll(fc))
                return hdr->addr1;

        if (!ieee80211_is_back_req(fc))
                return NULL;

        /* block-ack request: which address is used depends on the interface type */
        switch (type) {
        case NL80211_IFTYPE_STATION:
                return hdr->addr2;
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_AP_VLAN:
                return hdr->addr1;
        default:
                return NULL;
        }
}
EXPORT_SYMBOL(ieee80211_get_bssid);

void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx)
{
        struct sk_buff *skb;
        struct ieee80211_hdr *hdr;

        skb_queue_walk(&tx->skbs, skb) {
                hdr = (struct ieee80211_hdr *) skb->data;
                hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
        }
}

/*
 * Compute the time, in microseconds (rounded up to the next integer), needed
 * to send a frame of @len bytes (FCS not included, it is added here) at
 * @rate, plus one SIFS.
 *
 * @rate is given in units of 100 kbps, which is why every dividend below is
 * scaled by 10 before DIV_ROUND_UP().
 *
 * OFDM timing is used when @band is 5 GHz or @erp is non-zero; otherwise the
 * 802.11b (DSSS/CCK) timing applies and @short_preamble selects the preamble.
 */
int ieee80211_frame_duration(enum nl80211_band band, size_t len,
                             int rate, int erp, int short_preamble)
{
        int dur;

        if (band != NL80211_BAND_5GHZ && !erp) {
                /*
                 * 802.11b or 802.11g with 802.11b compatibility:
                 * 18.3.4: TXTIME = PreambleLength + PLCPHeaderTime +
                 * Ceiling(((LENGTH+PBCC)x8)/DATARATE). PBCC=0.
                 *
                 * 802.11 (DS): 15.3.3, 802.11b: 18.3.4
                 * aSIFSTime = 10 usec
                 * aPreambleLength = 144 usec or 72 usec with short preamble
                 * aPLCPHeaderLength = 48 usec or 24 usec with short preamble
                 */
                dur = 10; /* aSIFSTime */
                if (short_preamble)
                        dur += 72 + 24;
                else
                        dur += 144 + 48;

                dur += DIV_ROUND_UP(8 * (len + 4) * 10, rate);

                return dur;
        }

        /*
         * OFDM:
         *
         * N_DBPS = DATARATE x 4
         * N_SYM = Ceiling((16+8xLENGTH+6) / N_DBPS)
         *        (16 = SIGNAL time, 6 = tail bits)
         * TXTIME = T_PREAMBLE + T_SIGNAL + T_SYM x N_SYM + Signal Ext
         *
         * T_SYM = 4 usec
         * 802.11a - 18.5.2: aSIFSTime = 16 usec
         * 802.11g - 19.8.4: aSIFSTime = 10 usec + signal ext = 6 usec
         */
        dur = 16;  /* SIFS + signal ext */
        dur += 16; /* IEEE 802.11-2012 18.3.2.4: T_PREAMBLE = 16 usec */
        dur += 4;  /* IEEE 802.11-2012 18.3.2.4: T_SIGNAL = 4 usec */

        /*
         * T_SYM x N_SYM. The rate is expected to account for the channel
         * bandwidth already, so no further divisor is applied.
         */
        dur += 4 * DIV_ROUND_UP((16 + 8 * (len + 4) + 6) * 10,
                                4 * rate);

        return dur;
}

/*
 * Exported duration helper for drivers.
 *
 * Returns, as little-endian 16 bit, the result of ieee80211_frame_duration()
 * for @frame_len bytes at @rate on @band. Short preamble and ERP are taken
 * from @vif when it is given; without a vif both are treated as off.
 */
__le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw,
                                        struct ieee80211_vif *vif,
                                        enum nl80211_band band,
                                        size_t frame_len,
                                        struct ieee80211_rate *rate)
{
        bool short_preamble = false;
        int erp = 0;
        u16 usecs;

        if (vif) {
                struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);

                short_preamble = sdata->vif.bss_conf.use_short_preamble;
                /* ERP timing only applies while operating in 11g mode */
                if (sdata->deflink.operating_11g_mode)
                        erp = rate->flags & IEEE80211_RATE_ERP_G;
        }

        usecs = ieee80211_frame_duration(band, frame_len, rate->bitrate, erp,
                                         short_preamble);

        return cpu_to_le16(usecs);
}
EXPORT_SYMBOL(ieee80211_generic_frame_duration);

/*
 * Duration value for an RTS frame: the sum of a CTS, the frame of
 * @frame_len bytes and an ACK, each computed with
 * ieee80211_frame_duration() at the rate selected by
 * @frame_txctl->control.rts_cts_rate_idx on band @frame_txctl->band.
 * Short preamble and ERP are only considered when @vif is given.
 */
__le16 ieee80211_rts_duration(struct ieee80211_hw *hw,
                              struct ieee80211_vif *vif, size_t frame_len,
                              const struct ieee80211_tx_info *frame_txctl)
{
        struct ieee80211_local *local = hw_to_local(hw);
        struct ieee80211_supported_band *sband =
                local->hw.wiphy->bands[frame_txctl->band];
        struct ieee80211_rate *rate =
                &sband->bitrates[frame_txctl->control.rts_cts_rate_idx];
        bool short_preamble = false;
        int erp = 0;
        int bitrate;
        u16 total;

        if (vif) {
                struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);

                short_preamble = sdata->vif.bss_conf.use_short_preamble;
                if (sdata->deflink.operating_11g_mode)
                        erp = rate->flags & IEEE80211_RATE_ERP_G;
        }

        bitrate = rate->bitrate;

        /* CTS */
        total = ieee80211_frame_duration(sband->band, 10, bitrate,
                                         erp, short_preamble);
        /* the frame being protected */
        total += ieee80211_frame_duration(sband->band, frame_len, bitrate,
                                          erp, short_preamble);
        /* ACK */
        total += ieee80211_frame_duration(sband->band, 10, bitrate,
                                          erp, short_preamble);

        return cpu_to_le16(total);
}
EXPORT_SYMBOL(ieee80211_rts_duration);

/*
 * Duration value for a CTS-to-self frame: the frame of @frame_len bytes
 * plus, unless IEEE80211_TX_CTL_NO_ACK is set in @frame_txctl->flags, an
 * ACK. Both are computed with ieee80211_frame_duration() at the rate
 * selected by @frame_txctl->control.rts_cts_rate_idx. Short preamble and
 * ERP are only considered when @vif is given.
 */
__le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw,
                                    struct ieee80211_vif *vif,
                                    size_t frame_len,
                                    const struct ieee80211_tx_info *frame_txctl)
{
        struct ieee80211_local *local = hw_to_local(hw);
        struct ieee80211_supported_band *sband =
                local->hw.wiphy->bands[frame_txctl->band];
        struct ieee80211_rate *rate =
                &sband->bitrates[frame_txctl->control.rts_cts_rate_idx];
        bool short_preamble = false;
        int erp = 0;
        int bitrate;
        u16 total;

        if (vif) {
                struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);

                short_preamble = sdata->vif.bss_conf.use_short_preamble;
                if (sdata->deflink.operating_11g_mode)
                        erp = rate->flags & IEEE80211_RATE_ERP_G;
        }

        bitrate = rate->bitrate;

        /* the frame being protected */
        total = ieee80211_frame_duration(sband->band, frame_len, bitrate,
                                         erp, short_preamble);

        /* ACK, only when one is expected */
        if (!(frame_txctl->flags & IEEE80211_TX_CTL_NO_ACK))
                total += ieee80211_frame_duration(sband->band, 10, bitrate,
                                                  erp, short_preamble);

        return cpu_to_le16(total);
}
EXPORT_SYMBOL(ieee80211_ctstoself_duration);

static void wake_tx_push_queue(struct ieee80211_local *local,
                               struct ieee80211_sub_if_data *sdata,
                               struct ieee80211_txq *queue)
{
        struct ieee80211_tx_control control = {
                .sta = queue->sta,
        };
        struct sk_buff *skb;

        while (1) {
                skb = ieee80211_tx_dequeue(&local->hw, queue);
                if (!skb)
                        break;

                drv_tx(local, &control, skb);
        }
}

/*
 * wake_tx_queue handler for drivers that do not implement their own.
 *
 * Under handle_wake_tx_queue_lock, walks every txq scheduled for the access
 * category of @txq and pushes its frames to the driver.
 */
void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw,
                                    struct ieee80211_txq *txq)
{
        struct ieee80211_local *local = hw_to_local(hw);
        struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif);
        struct ieee80211_txq *next;

        spin_lock(&local->handle_wake_tx_queue_lock);

        /* ieee80211_next_txq() is used for airtime fairness accounting */
        ieee80211_txq_schedule_start(hw, txq->ac);
        for (next = ieee80211_next_txq(hw, txq->ac);
             next;
             next = ieee80211_next_txq(hw, txq->ac)) {
                wake_tx_push_queue(local, sdata, next);
                ieee80211_return_txq(hw, next, false);
        }
        ieee80211_txq_schedule_end(hw, txq->ac);

        spin_unlock(&local->handle_wake_tx_queue_lock);
}
EXPORT_SYMBOL(ieee80211_handle_wake_tx_queue);

/*
 * Wake the driver for every dirty txq of @sdata that belongs to access
 * category @ac: first the per-station txqs, then the interface's own txq.
 *
 * Runs with bottom halves disabled and fq->lock held, except around each
 * drv_wake_tx_queue() call where fq->lock is dropped. The station list is
 * walked with the RCU list iterator; the caller visible in this file,
 * _ieee80211_wake_txqs(), holds rcu_read_lock() around the call.
 */
static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac)
{
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_vif *vif = &sdata->vif;
        struct fq *fq = &local->fq;
        struct ps_data *ps = NULL;
        struct txq_info *txqi;
        struct sta_info *sta;
        int i;

        local_bh_disable();
        spin_lock(&fq->lock);

        /* nothing to wake on an interface that is not running */
        if (!test_bit(SDATA_STATE_RUNNING, &sdata->state))
                goto out;

        /* only AP interfaces get a powersave state to consult below */
        if (sdata->vif.type == NL80211_IFTYPE_AP)
                ps = &sdata->bss->ps;

        list_for_each_entry_rcu(sta, &local->sta_list, list) {
                if (sdata != sta->sdata)
                        continue;

                for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
                        struct ieee80211_txq *txq = sta->sta.txq[i];

                        if (!txq)
                                continue;

                        txqi = to_txq_info(txq);

                        if (ac != txq->ac)
                                continue;

                        /* only queues marked dirty need a wake; clear the mark */
                        if (!test_and_clear_bit(IEEE80211_TXQ_DIRTY,
                                                &txqi->flags))
                                continue;

                        /* fq->lock is not held across the driver callback */
                        spin_unlock(&fq->lock);
                        drv_wake_tx_queue(local, txqi);
                        spin_lock(&fq->lock);
                }
        }

        if (!vif->txq)
                goto out;

        txqi = to_txq_info(vif->txq);

        /*
         * The dirty bit is tested and cleared first, so it is cleared even
         * when the wake is then skipped because stations are in powersave
         * (AP only) or the txq belongs to a different access category.
         */
        if (!test_and_clear_bit(IEEE80211_TXQ_DIRTY, &txqi->flags) ||
            (ps && atomic_read(&ps->num_sta_ps)) || ac != vif->txq->ac)
                goto out;

        spin_unlock(&fq->lock);

        drv_wake_tx_queue(local, txqi);
        local_bh_enable();
        return;
out:
        spin_unlock(&fq->lock);
        local_bh_enable();
}

/*
 * For every hardware queue that has no stop reason set, wake the txqs of all
 * interfaces whose per-AC queue or cab_queue maps to that hardware queue.
 *
 * Must be entered with local->queue_stop_reason_lock held via
 * spin_lock_irqsave(); @flags points at the caller's saved IRQ flags. The
 * lock is released while the interfaces are walked and taken again
 * afterwards, so it is held again on return.
 */
static void
__releases(&local->queue_stop_reason_lock)
__acquires(&local->queue_stop_reason_lock)
_ieee80211_wake_txqs(struct ieee80211_local *local, unsigned long *flags)
{
        struct ieee80211_sub_if_data *sdata;
        int n_acs = IEEE80211_NUM_ACS;
        int i;

        rcu_read_lock();

        /* with fewer hardware queues than ACs only AC index 0 is checked */
        if (local->hw.queues < IEEE80211_NUM_ACS)
                n_acs = 1;

        for (i = 0; i < local->hw.queues; i++) {
                /* still stopped for some reason, leave it alone */
                if (local->queue_stop_reasons[i])
                        continue;

                /* drop the caller's lock while calling into the txq wake path */
                spin_unlock_irqrestore(&local->queue_stop_reason_lock, *flags);
                list_for_each_entry_rcu(sdata, &local->interfaces, list) {
                        int ac;

                        for (ac = 0; ac < n_acs; ac++) {
                                int ac_queue = sdata->vif.hw_queue[ac];

                                if (ac_queue == i ||
                                    sdata->vif.cab_queue == i)
                                        __ieee80211_wake_txqs(sdata, ac);
                        }
                }
                spin_lock_irqsave(&local->queue_stop_reason_lock, *flags);
        }

        rcu_read_unlock();
}

/*
 * Tasklet body for wake_txqs_tasklet: runs _ieee80211_wake_txqs() with
 * queue_stop_reason_lock taken, as that helper requires.
 */
void ieee80211_wake_txqs(struct tasklet_struct *t)
{
        struct ieee80211_local *local;
        unsigned long irq_flags;

        local = from_tasklet(local, t, wake_txqs_tasklet);

        spin_lock_irqsave(&local->queue_stop_reason_lock, irq_flags);
        _ieee80211_wake_txqs(local, &irq_flags);
        spin_unlock_irqrestore(&local->queue_stop_reason_lock, irq_flags);
}

/*
 * Remove one stop @reason from hardware queue @queue and, if no reason is
 * left, restart transmission on it.
 *
 * @refcounted: when false the per-reason counter is reset to zero, when true
 *	it is decremented (and clamped at zero with a warning).
 * @flags: the caller's saved IRQ flags for queue_stop_reason_lock, passed on
 *	to _ieee80211_wake_txqs() which may drop and retake that lock.
 *
 * All callers in this file hold local->queue_stop_reason_lock.
 */
static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
                                   enum queue_stop_reason reason,
                                   bool refcounted,
                                   unsigned long *flags)
{
        struct ieee80211_local *local = hw_to_local(hw);

        trace_wake_queue(local, queue, reason);

        if (WARN_ON(queue >= hw->queues))
                return;

        /* not stopped for this reason, nothing to undo */
        if (!test_bit(reason, &local->queue_stop_reasons[queue]))
                return;

        if (!refcounted) {
                local->q_stop_reasons[queue][reason] = 0;
        } else {
                local->q_stop_reasons[queue][reason]--;
                if (WARN_ON(local->q_stop_reasons[queue][reason] < 0))
                        local->q_stop_reasons[queue][reason] = 0;
        }

        /* the reason bit is only cleared once its counter reaches zero */
        if (local->q_stop_reasons[queue][reason] == 0)
                __clear_bit(reason, &local->queue_stop_reasons[queue]);

        if (local->queue_stop_reasons[queue] != 0)
                /* someone still has this queue stopped */
                return;

        /* frames parked on the pending list are sent from the tasklet */
        if (!skb_queue_empty(&local->pending[queue]))
                tasklet_schedule(&local->tx_pending_tasklet);

        /*
         * Calling _ieee80211_wake_txqs here can be a problem because it may
         * release queue_stop_reason_lock which has been taken by
         * __ieee80211_wake_queue's caller. It is certainly not very nice to
         * release someone's lock, but it is fine because all the callers of
         * __ieee80211_wake_queue call it right before releasing the lock.
         */
        if (reason == IEEE80211_QUEUE_STOP_REASON_DRIVER)
                tasklet_schedule(&local->wake_txqs_tasklet);
        else
                _ieee80211_wake_txqs(local, flags);
}

/*
 * Locked wrapper around __ieee80211_wake_queue(): removes stop @reason from
 * hardware queue @queue while holding queue_stop_reason_lock.
 */
void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
                                    enum queue_stop_reason reason,
                                    bool refcounted)
{
        struct ieee80211_local *local = hw_to_local(hw);
        spinlock_t *lock = &local->queue_stop_reason_lock;
        unsigned long irq_flags;

        spin_lock_irqsave(lock, irq_flags);
        __ieee80211_wake_queue(hw, queue, reason, refcounted, &irq_flags);
        spin_unlock_irqrestore(lock, irq_flags);
}

void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue)
{
        ieee80211_wake_queue_by_reason(hw, queue,
                                       IEEE80211_QUEUE_STOP_REASON_DRIVER,
                                       false);
}
EXPORT_SYMBOL(ieee80211_wake_queue);

/*
 * Mark hardware queue @queue as stopped for @reason.
 *
 * With @refcounted the per-reason counter is incremented, otherwise it is
 * forced to 1. All callers in this file hold queue_stop_reason_lock.
 */
static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
                                   enum queue_stop_reason reason,
                                   bool refcounted)
{
        struct ieee80211_local *local = hw_to_local(hw);

        trace_stop_queue(local, queue, reason);

        if (WARN_ON(queue >= hw->queues))
                return;

        if (refcounted)
                local->q_stop_reasons[queue][reason]++;
        else
                local->q_stop_reasons[queue][reason] = 1;

        set_bit(reason, &local->queue_stop_reasons[queue]);
}

/*
 * Locked wrapper around __ieee80211_stop_queue(): stops hardware queue
 * @queue for @reason while holding queue_stop_reason_lock.
 */
void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue,
                                    enum queue_stop_reason reason,
                                    bool refcounted)
{
        struct ieee80211_local *local = hw_to_local(hw);
        spinlock_t *lock = &local->queue_stop_reason_lock;
        unsigned long irq_flags;

        spin_lock_irqsave(lock, irq_flags);
        __ieee80211_stop_queue(hw, queue, reason, refcounted);
        spin_unlock_irqrestore(lock, irq_flags);
}

void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue)
{
        ieee80211_stop_queue_by_reason(hw, queue,
                                       IEEE80211_QUEUE_STOP_REASON_DRIVER,
                                       false);
}
EXPORT_SYMBOL(ieee80211_stop_queue);

/*
 * Append @skb to the pending list of the hardware queue recorded in its tx
 * info. The queue is stopped for SKB_ADD around the insertion and woken
 * again right after. A frame without a vif in its tx info is freed with a
 * warning instead.
 */
void ieee80211_add_pending_skb(struct ieee80211_local *local,
                               struct sk_buff *skb)
{
        struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
        struct ieee80211_hw *hw = &local->hw;
        unsigned long irq_flags;
        int hwq;

        if (WARN_ON(!info->control.vif)) {
                ieee80211_free_txskb(hw, skb);
                return;
        }

        hwq = info->hw_queue;

        spin_lock_irqsave(&local->queue_stop_reason_lock, irq_flags);
        __ieee80211_stop_queue(hw, hwq, IEEE80211_QUEUE_STOP_REASON_SKB_ADD,
                               false);
        __skb_queue_tail(&local->pending[hwq], skb);
        __ieee80211_wake_queue(hw, hwq, IEEE80211_QUEUE_STOP_REASON_SKB_ADD,
                               false, &irq_flags);
        spin_unlock_irqrestore(&local->queue_stop_reason_lock, irq_flags);
}

/*
 * Move every frame from @skbs onto the pending list of its hardware queue.
 * Each target queue is stopped for SKB_ADD as frames are added; once the
 * list is drained, SKB_ADD is lifted from all hardware queues. Frames
 * without a vif in their tx info are freed with a warning.
 */
void ieee80211_add_pending_skbs(struct ieee80211_local *local,
                                struct sk_buff_head *skbs)
{
        struct ieee80211_hw *hw = &local->hw;
        struct sk_buff *skb;
        unsigned long irq_flags;
        int hwq, q;

        spin_lock_irqsave(&local->queue_stop_reason_lock, irq_flags);

        for (skb = skb_dequeue(skbs); skb; skb = skb_dequeue(skbs)) {
                struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);

                if (WARN_ON(!info->control.vif)) {
                        ieee80211_free_txskb(hw, skb);
                        continue;
                }

                hwq = info->hw_queue;

                __ieee80211_stop_queue(hw, hwq,
                                       IEEE80211_QUEUE_STOP_REASON_SKB_ADD,
                                       false);
                __skb_queue_tail(&local->pending[hwq], skb);
        }

        for (q = 0; q < hw->queues; q++)
                __ieee80211_wake_queue(hw, q,
                                       IEEE80211_QUEUE_STOP_REASON_SKB_ADD,
                                       false, &irq_flags);

        spin_unlock_irqrestore(&local->queue_stop_reason_lock, irq_flags);
}

/*
 * Stop, for @reason, every hardware queue whose bit is set in @queues.
 * Bits at or above hw->queues are ignored. The whole update runs under
 * queue_stop_reason_lock.
 */
void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw,
                                     unsigned long queues,
                                     enum queue_stop_reason reason,
                                     bool refcounted)
{
        struct ieee80211_local *local = hw_to_local(hw);
        unsigned long irq_flags;
        int q;

        spin_lock_irqsave(&local->queue_stop_reason_lock, irq_flags);

        for_each_set_bit(q, &queues, hw->queues) {
                __ieee80211_stop_queue(hw, q, reason, refcounted);
        }

        spin_unlock_irqrestore(&local->queue_stop_reason_lock, irq_flags);
}

void ieee80211_stop_queues(struct ieee80211_hw *hw)
{
        ieee80211_stop_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP,
                                        IEEE80211_QUEUE_STOP_REASON_DRIVER,
                                        false);
}
EXPORT_SYMBOL(ieee80211_stop_queues);

/*
 * Driver API: report whether hardware queue @queue is stopped for the
 * DRIVER reason. An out-of-range @queue warns and is reported as stopped.
 */
int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue)
{
        struct ieee80211_local *local = hw_to_local(hw);
        unsigned long irq_flags;
        int stopped;

        if (WARN_ON(queue >= hw->queues))
                return true;

        spin_lock_irqsave(&local->queue_stop_reason_lock, irq_flags);
        stopped = test_bit(IEEE80211_QUEUE_STOP_REASON_DRIVER,
                           &local->queue_stop_reasons[queue]);
        spin_unlock_irqrestore(&local->queue_stop_reason_lock, irq_flags);

        return stopped;
}
EXPORT_SYMBOL(ieee80211_queue_stopped);

/*
 * Remove stop @reason from every hardware queue whose bit is set in
 * @queues. Bits at or above hw->queues are ignored. Runs under
 * queue_stop_reason_lock, which __ieee80211_wake_queue() may temporarily
 * drop through the saved flags passed to it.
 */
void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw,
                                     unsigned long queues,
                                     enum queue_stop_reason reason,
                                     bool refcounted)
{
        struct ieee80211_local *local = hw_to_local(hw);
        unsigned long irq_flags;
        int q;

        spin_lock_irqsave(&local->queue_stop_reason_lock, irq_flags);

        for_each_set_bit(q, &queues, hw->queues) {
                __ieee80211_wake_queue(hw, q, reason, refcounted, &irq_flags);
        }

        spin_unlock_irqrestore(&local->queue_stop_reason_lock, irq_flags);
}

void ieee80211_wake_queues(struct ieee80211_hw *hw)
{
        ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP,
                                        IEEE80211_QUEUE_STOP_REASON_DRIVER,
                                        false);
}
EXPORT_SYMBOL(ieee80211_wake_queues);

/*
 * Build the bitmap of hardware queues that belong to @sdata.
 *
 * When @sdata is given and the hardware advertises QUEUE_CONTROL, the
 * result has one bit per valid per-AC hardware queue plus the cab_queue.
 * Entries equal to IEEE80211_INVAL_HW_QUEUE are skipped: they are not queue
 * numbers, and shifting by such a value would be out of range for BIT().
 *
 * In every other case the bitmap covers all local->hw.queues queues.
 */
static unsigned int
ieee80211_get_vif_queues(struct ieee80211_local *local,
                         struct ieee80211_sub_if_data *sdata)
{
        unsigned int queues;

        if (sdata && ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) {
                int ac;

                queues = 0;

                for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
                        /* same validity check as for cab_queue below */
                        if (sdata->vif.hw_queue[ac] !=
                            IEEE80211_INVAL_HW_QUEUE)
                                queues |= BIT(sdata->vif.hw_queue[ac]);
                }
                if (sdata->vif.cab_queue != IEEE80211_INVAL_HW_QUEUE)
                        queues |= BIT(sdata->vif.cab_queue);
        } else {
                /* all queues */
                queues = BIT(local->hw.queues) - 1;
        }

        return queues;
}

/*
 * Ask the driver to flush the hardware queues in @queues for @sdata.
 *
 * Does nothing when the driver has no flush op. The queues are stopped for
 * the FLUSH reason around the driver call and woken afterwards. With @drop
 * set, the txqs of all stations on @sdata are purged before the flush.
 */
void __ieee80211_flush_queues(struct ieee80211_local *local,
                              struct ieee80211_sub_if_data *sdata,
                              unsigned int queues, bool drop)
{
        struct sta_info *sta;

        if (!local->ops->flush)
                return;

        /*
         * Without an explicit queue map, or without
         * IEEE80211_HW_QUEUE_CONTROL support, fall back to the map computed
         * for the interface.
         */
        if (!ieee80211_hw_check(&local->hw, QUEUE_CONTROL) || !queues)
                queues = ieee80211_get_vif_queues(local, sdata);

        ieee80211_stop_queues_by_reason(&local->hw, queues,
                                        IEEE80211_QUEUE_STOP_REASON_FLUSH,
                                        false);

        if (drop) {
                /*
                 * Purge the queues, so the frames on them won't be
                 * sent during __ieee80211_wake_queue()
                 */
                list_for_each_entry(sta, &local->sta_list, list) {
                        if (sta->sdata == sdata)
                                ieee80211_purge_sta_txqs(sta);
                }
        }

        drv_flush(local, sdata, queues, drop);

        ieee80211_wake_queues_by_reason(&local->hw, queues,
                                        IEEE80211_QUEUE_STOP_REASON_FLUSH,
                                        false);
}

/*
 * Flush for @sdata without an explicit queue map: passing 0 makes
 * __ieee80211_flush_queues() work out the queues itself.
 */
void ieee80211_flush_queues(struct ieee80211_local *local,
                            struct ieee80211_sub_if_data *sdata, bool drop)
{
        const unsigned int no_queue_map = 0;

        __ieee80211_flush_queues(local, sdata, no_queue_map, drop);
}

/* Stop, reference counted, all hardware queues of @sdata for @reason. */
void ieee80211_stop_vif_queues(struct ieee80211_local *local,
                               struct ieee80211_sub_if_data *sdata,
                               enum queue_stop_reason reason)
{
        unsigned int vif_queues = ieee80211_get_vif_queues(local, sdata);

        ieee80211_stop_queues_by_reason(&local->hw, vif_queues, reason, true);
}

/* Wake, reference counted, all hardware queues of @sdata for @reason. */
void ieee80211_wake_vif_queues(struct ieee80211_local *local,
                               struct ieee80211_sub_if_data *sdata,
                               enum queue_stop_reason reason)
{
        unsigned int vif_queues = ieee80211_get_vif_queues(local, sdata);

        ieee80211_wake_queues_by_reason(&local->hw, vif_queues, reason, true);
}

/*
 * Call @iterator for each interface of @local selected by @iter_flags, then
 * for local->monitor_sdata if it qualifies.
 *
 * AP_VLAN interfaces and monitor interfaces without MONITOR_FLAG_ACTIVE are
 * never reported. The interface list is walked with the RCU list iterator;
 * the callers in this file hold either iflist_mtx, rcu_read_lock() or
 * assert the wiphy lock.
 */
static void __iterate_interfaces(struct ieee80211_local *local,
                                 u32 iter_flags,
                                 void (*iterator)(void *data, u8 *mac,
                                                  struct ieee80211_vif *vif),
                                 void *data)
{
        struct ieee80211_sub_if_data *sdata;
        bool active_only = iter_flags & IEEE80211_IFACE_ITER_ACTIVE;
        bool resume_all = iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL;
        bool need_in_driver =
                iter_flags & IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER;

        list_for_each_entry_rcu(sdata, &local->interfaces, list) {
                bool in_driver;

                if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
                        continue;

                if (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
                    !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE))
                        continue;

                in_driver = !!(sdata->flags & IEEE80211_SDATA_IN_DRIVER);

                /* active iteration wants in-driver vifs unless resuming */
                if (!resume_all && active_only && !in_driver)
                        continue;

                if (need_in_driver && !in_driver)
                        continue;

                if (!active_only || ieee80211_sdata_running(sdata))
                        iterator(data, sdata->vif.addr,
                                 &sdata->vif);
        }

        sdata = rcu_dereference_check(local->monitor_sdata,
                                      lockdep_is_held(&local->iflist_mtx) ||
                                      lockdep_is_held(&local->hw.wiphy->mtx));
        if (!sdata)
                return;

        if (resume_all || !active_only ||
            sdata->flags & IEEE80211_SDATA_IN_DRIVER)
                iterator(data, sdata->vif.addr, &sdata->vif);
}

/*
 * Run @iterator over the interfaces of @hw selected by @iter_flags while
 * holding iflist_mtx.
 */
void ieee80211_iterate_interfaces(
        struct ieee80211_hw *hw, u32 iter_flags,
        void (*iterator)(void *data, u8 *mac,
                         struct ieee80211_vif *vif),
        void *data)
{
        struct ieee80211_local *local = hw_to_local(hw);
        struct mutex *list_lock = &local->iflist_mtx;

        mutex_lock(list_lock);
        __iterate_interfaces(local, iter_flags, iterator, data);
        mutex_unlock(list_lock);
}
EXPORT_SYMBOL_GPL(ieee80211_iterate_interfaces);

/*
 * Run @iterator over the active interfaces of @hw inside an RCU read-side
 * critical section; IEEE80211_IFACE_ITER_ACTIVE is added to @iter_flags.
 */
void ieee80211_iterate_active_interfaces_atomic(
        struct ieee80211_hw *hw, u32 iter_flags,
        void (*iterator)(void *data, u8 *mac,
                         struct ieee80211_vif *vif),
        void *data)
{
        struct ieee80211_local *local = hw_to_local(hw);
        const u32 active_flags = iter_flags | IEEE80211_IFACE_ITER_ACTIVE;

        rcu_read_lock();
        __iterate_interfaces(local, active_flags, iterator, data);
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic);

/*
 * Run @iterator over the active interfaces of @hw; the wiphy lock is
 * asserted rather than taken. IEEE80211_IFACE_ITER_ACTIVE is added to
 * @iter_flags.
 */
void ieee80211_iterate_active_interfaces_mtx(
        struct ieee80211_hw *hw, u32 iter_flags,
        void (*iterator)(void *data, u8 *mac,
                         struct ieee80211_vif *vif),
        void *data)
{
        struct ieee80211_local *local = hw_to_local(hw);
        const u32 active_flags = iter_flags | IEEE80211_IFACE_ITER_ACTIVE;

        lockdep_assert_wiphy(hw->wiphy);

        __iterate_interfaces(local, active_flags, iterator, data);
}
EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_mtx);

/*
 * Call @iterator for every station on local->sta_list whose ->uploaded
 * flag is set. Uses the RCU list iterator; the caller in this file holds
 * rcu_read_lock().
 */
static void __iterate_stations(struct ieee80211_local *local,
                               void (*iterator)(void *data,
                                                struct ieee80211_sta *sta),
                               void *data)
{
        struct sta_info *entry;

        list_for_each_entry_rcu(entry, &local->sta_list, list) {
                if (entry->uploaded)
                        iterator(data, &entry->sta);
        }
}

/*
 * Run @iterator over the uploaded stations of @hw inside an RCU read-side
 * critical section.
 */
void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw,
                        void (*iterator)(void *data,
                                         struct ieee80211_sta *sta),
                        void *data)
{
        rcu_read_lock();
        __iterate_stations(hw_to_local(hw), iterator, data);
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_atomic);

/*
 * Map @wdev to its mac80211 vif. Returns NULL unless the interface is
 * running and flagged IEEE80211_SDATA_IN_DRIVER.
 */
struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev)
{
        struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);

        if (ieee80211_sdata_running(sdata) &&
            (sdata->flags & IEEE80211_SDATA_IN_DRIVER))
                return &sdata->vif;

        return NULL;
}
EXPORT_SYMBOL_GPL(wdev_to_ieee80211_vif);

/* Map @vif to its wireless_dev; a NULL @vif yields NULL. */
struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif)
{
        return vif ? &vif_to_sdata(vif)->wdev : NULL;
}
EXPORT_SYMBOL_GPL(ieee80211_vif_to_wdev);

/*
 * No work should be put on the workqueue during a suspend->resume cycle.
 * Individual callers cannot all be checked for quiescing / suspended state,
 * so the check lives here. It only prints a warning instead of WARNing
 * because this can legitimately happen: the rx path (for example) may race
 * against __ieee80211_suspend and have tested the state before quiescing /
 * suspended was set.
 *
 * Returns true when queueing is allowed.
 */
static bool ieee80211_can_queue_work(struct ieee80211_local *local)
{
        bool suspending = local->quiescing ||
                          (local->suspended && !local->resuming);

        if (!suspending)
                return true;

        pr_warn("queueing ieee80211 work while going to suspend\n");
        return false;
}

/*
 * Queue @work on the mac80211 workqueue of @hw, unless
 * ieee80211_can_queue_work() refuses, in which case @work is dropped.
 */
void ieee80211_queue_work(struct ieee80211_hw *hw, struct work_struct *work)
{
        struct ieee80211_local *local = hw_to_local(hw);

        if (ieee80211_can_queue_work(local))
                queue_work(local->workqueue, work);
}
EXPORT_SYMBOL(ieee80211_queue_work);

/*
 * Queue @dwork with @delay on the mac80211 workqueue of @hw, unless
 * ieee80211_can_queue_work() refuses, in which case @dwork is dropped.
 */
void ieee80211_queue_delayed_work(struct ieee80211_hw *hw,
                                  struct delayed_work *dwork,
                                  unsigned long delay)
{
        struct ieee80211_local *local = hw_to_local(hw);

        if (ieee80211_can_queue_work(local))
                queue_delayed_work(local->workqueue, dwork, delay);
}
EXPORT_SYMBOL(ieee80211_queue_delayed_work);

/*
 * Clamp the queue parameters in @qparam for access category @ac to the WMM
 * limits of the regulatory rule covering the interface's current channel.
 *
 * Only AP and station interfaces are handled. Nothing is changed when the
 * interface has no channel context, the centre frequency is zero, no rule
 * is found, or the rule carries no WMM data. cw_min, cw_max and aifs are
 * raised to at least the rule's values; txop is capped at cot / 32.
 */
void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata,
                                           struct ieee80211_tx_queue_params
                                           *qparam, int ac)
{
        struct ieee80211_chanctx_conf *chanctx_conf;
        const struct ieee80211_reg_rule *rrule;
        const struct ieee80211_wmm_ac *limits;
        u16 center_freq = 0;

        if (!(sdata->vif.type == NL80211_IFTYPE_AP ||
              sdata->vif.type == NL80211_IFTYPE_STATION))
                return;

        rcu_read_lock();

        chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
        if (chanctx_conf)
                center_freq = chanctx_conf->def.chan->center_freq;

        if (!center_freq)
                goto out_unlock;

        rrule = freq_reg_info(sdata->wdev.wiphy, MHZ_TO_KHZ(center_freq));
        if (IS_ERR_OR_NULL(rrule) || !rrule->has_wmm)
                goto out_unlock;

        /* the rule keeps separate limits for APs and for clients */
        limits = sdata->vif.type == NL80211_IFTYPE_AP ?
                        &rrule->wmm_rule.ap[ac] :
                        &rrule->wmm_rule.client[ac];

        qparam->cw_min = max_t(u16, qparam->cw_min, limits->cw_min);
        qparam->cw_max = max_t(u16, qparam->cw_max, limits->cw_max);
        qparam->aifs = max_t(u8, qparam->aifs, limits->aifsn);
        qparam->txop = min_t(u16, qparam->txop, limits->cot / 32);

out_unlock:
        rcu_read_unlock();
}

/*
 * Program default EDCA parameters for all access categories of @link.
 *
 * Does nothing when the driver has no conf_tx op or exposes fewer than
 * IEEE80211_NUM_ACS queues. With @enable_qos false every AC receives the
 * legacy 802.11b/g settings; otherwise per-AC defaults are chosen. Each
 * parameter set is passed through ieee80211_regulatory_limit_wmm_params(),
 * stored in link->tx_conf[] and handed to the driver.
 *
 * For interface types other than monitor, P2P device and NAN the link's qos
 * flag is updated, and BSS_CHANGED_QOS is signalled when @bss_notify is set.
 */
void ieee80211_set_wmm_default(struct ieee80211_link_data *link,
                               bool bss_notify, bool enable_qos)
{
        struct ieee80211_sub_if_data *sdata = link->sdata;
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_chanctx_conf *ctx_conf;
        struct ieee80211_tx_queue_params params;
        /* Use another EDCA parameters if dot11OCBActivated=true */
        const bool ocb = sdata->vif.type == NL80211_IFTYPE_OCB;
        bool legacy_11b;
        int cwmin, cwmax;
        int ac;

        if (!local->ops->conf_tx || local->hw.queues < IEEE80211_NUM_ACS)
                return;

        memset(&params, 0, sizeof(params));

        rcu_read_lock();
        ctx_conf = rcu_dereference(link->conf->chanctx_conf);
        legacy_11b = ctx_conf &&
                     ctx_conf->def.chan->band == NL80211_BAND_2GHZ &&
                     !link->operating_11g_mode;
        rcu_read_unlock();

        /* Set defaults according to 802.11-2007 Table 7-37 */
        cwmax = 1023;
        cwmin = legacy_11b ? 31 : 15;

        /* Configure old 802.11b/g medium access rules. */
        params.cw_max = cwmax;
        params.cw_min = cwmin;
        params.txop = 0;
        params.aifs = 2;

        /*
         * params is deliberately not reset per iteration: without QoS the
         * values (including any regulatory adjustment) carry over to the
         * next access category.
         */
        for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
                /* Update if QoS is enabled. */
                if (enable_qos) {
                        switch (ac) {
                        case IEEE80211_AC_VO:
                                params.cw_max = (cwmin + 1) / 2 - 1;
                                params.cw_min = (cwmin + 1) / 4 - 1;
                                if (ocb)
                                        params.txop = 0;
                                else
                                        params.txop = legacy_11b ? 3264/32 :
                                                                   1504/32;
                                params.aifs = 2;
                                break;
                        case IEEE80211_AC_VI:
                                params.cw_max = cwmin;
                                params.cw_min = (cwmin + 1) / 2 - 1;
                                if (ocb)
                                        params.txop = 0;
                                else
                                        params.txop = legacy_11b ? 6016/32 :
                                                                   3008/32;
                                params.aifs = ocb ? 3 : 2;
                                break;
                        case IEEE80211_AC_BK:
                                params.cw_max = cwmax;
                                params.cw_min = cwmin;
                                params.txop = 0;
                                params.aifs = ocb ? 9 : 7;
                                break;
                        case IEEE80211_AC_BE:
                        /* never happens but let's not leave undefined */
                        default:
                                params.cw_max = cwmax;
                                params.cw_min = cwmin;
                                params.txop = 0;
                                params.aifs = ocb ? 6 : 3;
                                break;
                        }
                }

                ieee80211_regulatory_limit_wmm_params(sdata, &params, ac);

                params.uapsd = false;

                link->tx_conf[ac] = params;
                drv_conf_tx(local, link, ac, &params);
        }

        switch (sdata->vif.type) {
        case NL80211_IFTYPE_MONITOR:
        case NL80211_IFTYPE_P2P_DEVICE:
        case NL80211_IFTYPE_NAN:
                /* no QoS state is recorded for these interface types */
                return;
        default:
                break;
        }

        link->conf->qos = enable_qos;
        if (bss_notify)
                ieee80211_link_info_change_notify(sdata, link,
                                                  BSS_CHANGED_QOS);
}

/*
 * Build and transmit an authentication frame on @sdata.
 *
 * The frame carries @auth_alg, @transaction and @status, followed by the
 * optional @extra bytes and, on an MLD interface, a basic multi-link element
 * containing the interface address. For shared-key authentication
 * transaction 3 the frame is WEP-encrypted with @key/@key_len/@key_idx.
 *
 * Nothing is sent if the skb cannot be allocated or WEP encryption fails.
 */
void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
                         u16 transaction, u16 auth_alg, u16 status,
                         const u8 *extra, size_t extra_len, const u8 *da,
                         const u8 *bssid, const u8 *key, u8 key_len, u8 key_idx,
                         u32 tx_flags)
{
        struct ieee80211_local *local = sdata->local;
        bool multi_link = ieee80211_vif_is_mld(&sdata->vif);
        struct {
                u8 id;
                u8 len;
                u8 ext_id;
                struct ieee80211_multi_link_elem ml;
                struct ieee80211_mle_basic_common_info basic;
        } __packed mle = {
                .id = WLAN_EID_EXTENSION,
                .len = sizeof(mle) - 2,
                .ext_id = WLAN_EID_EXT_EHT_MULTI_LINK,
                .ml.control = cpu_to_le16(IEEE80211_ML_CONTROL_TYPE_BASIC),
                .basic.len = sizeof(mle.basic),
        };
        /* room in front of the frame for the driver and the WEP IV */
        size_t headroom = local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN;
        /* 24 + 6 = header + auth_algo + auth_transaction + status_code */
        size_t fixed_len = 24 + 6;
        struct ieee80211_mgmt *mgmt;
        struct sk_buff *skb;
        int err;

        memcpy(mle.basic.mld_mac_addr, sdata->vif.addr, ETH_ALEN);

        skb = dev_alloc_skb(headroom + fixed_len + extra_len +
                            IEEE80211_WEP_ICV_LEN +
                            (multi_link ? sizeof(mle) : 0));
        if (!skb)
                return;

        skb_reserve(skb, headroom);

        mgmt = skb_put_zero(skb, fixed_len);
        mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                          IEEE80211_STYPE_AUTH);
        memcpy(mgmt->da, da, ETH_ALEN);
        memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
        memcpy(mgmt->bssid, bssid, ETH_ALEN);
        mgmt->u.auth.auth_alg = cpu_to_le16(auth_alg);
        mgmt->u.auth.auth_transaction = cpu_to_le16(transaction);
        mgmt->u.auth.status_code = cpu_to_le16(status);

        if (extra)
                skb_put_data(skb, extra, extra_len);
        if (multi_link)
                skb_put_data(skb, &mle, sizeof(mle));

        if (transaction == 3 && auth_alg == WLAN_AUTH_SHARED_KEY) {
                mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
                err = ieee80211_wep_encrypt(local, skb, key, key_len, key_idx);
                if (WARN_ON(err)) {
                        kfree_skb(skb);
                        return;
                }
        }

        /* any encryption needed was applied above */
        IEEE80211_SKB_CB(skb)->flags |= tx_flags |
                                        IEEE80211_TX_INTFL_DONT_ENCRYPT;
        ieee80211_tx_skb(sdata, skb);
}

/*
 * Build a deauthentication or disassociation frame (selected by @stype) in
 * @frame_buf and, if @send_frame is set, also transmit a copy of it.
 *
 * @frame_buf is always filled in, so the caller can use the frame even when
 * nothing is transmitted. If the skb for transmission cannot be allocated the
 * frame is simply not sent.
 */
void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
                                    const u8 *da, const u8 *bssid,
                                    u16 stype, u16 reason,
                                    bool send_frame, u8 *frame_buf)
{
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_mgmt *mgmt = (void *)frame_buf;
        struct sk_buff *skb;
        bool mfp_station;

        /* build frame */
        mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype);
        mgmt->duration = 0; /* initialize only */
        mgmt->seq_ctrl = 0; /* initialize only */
        memcpy(mgmt->da, da, ETH_ALEN);
        memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
        memcpy(mgmt->bssid, bssid, ETH_ALEN);
        /* u.deauth.reason_code == u.disassoc.reason_code */
        mgmt->u.deauth.reason_code = cpu_to_le16(reason);

        if (!send_frame)
                return;

        skb = dev_alloc_skb(local->hw.extra_tx_headroom +
                            IEEE80211_DEAUTH_FRAME_LEN);
        if (!skb)
                return;

        skb_reserve(skb, local->hw.extra_tx_headroom);

        /* copy in frame */
        skb_put_data(skb, mgmt, IEEE80211_DEAUTH_FRAME_LEN);

        /* only a station with MFP enabled leaves the frame encryptable */
        mfp_station = sdata->vif.type == NL80211_IFTYPE_STATION &&
                      (sdata->u.mgd.flags & IEEE80211_STA_MFP_ENABLED);
        if (!mfp_station)
                IEEE80211_SKB_CB(skb)->flags |=
                        IEEE80211_TX_INTFL_DONT_ENCRYPT;

        ieee80211_tx_skb(sdata, skb);
}

static int ieee80211_put_s1g_cap(struct sk_buff *skb,
                                 struct ieee80211_sta_s1g_cap *s1g_cap)
{
        if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_s1g_cap))
                return -ENOBUFS;

        skb_put_u8(skb, WLAN_EID_S1G_CAPABILITIES);
        skb_put_u8(skb, sizeof(struct ieee80211_s1g_cap));

        skb_put_data(skb, &s1g_cap->cap, sizeof(s1g_cap->cap));
        skb_put_data(skb, &s1g_cap->nss_mcs, sizeof(s1g_cap->nss_mcs));

        return 0;
}

/*
 * Append the leading part of the not yet consumed custom IEs to @skb.
 *
 * Starting at *@offset, ieee80211_ie_split() determines how far the custom
 * IE buffer may be copied given the element IDs listed in @ids; that span is
 * appended to @skb and *@offset is advanced past it. Nothing happens when no
 * custom IEs were supplied.
 *
 * Returns 0 on success or -ENOBUFS when @skb lacks the tailroom.
 */
static int ieee80211_put_preq_custom_ies(struct sk_buff *skb,
                                         const u8 *ie, size_t ie_len,
                                         size_t *offset,
                                         const u8 *ids, int n_ids)
{
        size_t noffset;

        if (!ie || !ie_len)
                return 0;

        noffset = ieee80211_ie_split(ie, ie_len, ids, n_ids, *offset);
        if (skb_tailroom(skb) < noffset - *offset)
                return -ENOBUFS;

        skb_put_data(skb, ie + *offset, noffset - *offset);
        *offset = noffset;

        return 0;
}

/*
 * Append the probe request elements for one @band to @skb.
 *
 * Generated elements (rates, DS params, HT/VHT/HE/EHT capabilities, ...) are
 * interleaved with the caller's custom IEs (@ie/@ie_len) so that the custom
 * ones are emitted at the position ieee80211_ie_split() assigns them.
 * *@offset is reset to 0 on entry and, on return, holds how many bytes of
 * the custom IE buffer have been consumed for this band.
 *
 * For the S1G band only the S1G capabilities element is written. With
 * IEEE80211_PROBE_FLAG_MIN_CONTENT in @flags, nothing beyond the DS
 * parameter set is added.
 *
 * Returns 0 on success (also when @band has no supported-band data, after a
 * one-time warning) or a negative error such as -ENOBUFS.
 */
static int ieee80211_put_preq_ies_band(struct sk_buff *skb,
                                       struct ieee80211_sub_if_data *sdata,
                                       const u8 *ie, size_t ie_len,
                                       size_t *offset,
                                       enum nl80211_band band,
                                       u32 rate_mask,
                                       struct cfg80211_chan_def *chandef,
                                       u32 flags)
{
        static const u8 before_extrates[] = {
                WLAN_EID_SSID,
                WLAN_EID_SUPP_RATES,
                WLAN_EID_REQUEST,
        };
        static const u8 before_ht[] = {
                /*
                 * no need to list the ones split off already
                 * (or generated here)
                 */
                WLAN_EID_DS_PARAMS,
                WLAN_EID_SUPPORTED_REGULATORY_CLASSES,
        };
        static const u8 before_vht[] = {
                /*
                 * no need to list the ones split off already
                 * (or generated here)
                 */
                WLAN_EID_BSS_COEX_2040,
                WLAN_EID_EXT_CAPABILITY,
                WLAN_EID_SSID_LIST,
                WLAN_EID_CHANNEL_USAGE,
                WLAN_EID_INTERWORKING,
                WLAN_EID_MESH_ID,
                /* 60 GHz (Multi-band, DMG, MMS) can't happen */
        };
        static const u8 before_he[] = {
                /*
                 * no need to list the ones split off before VHT
                 * or generated here
                 */
                WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_REQ_PARAMS,
                WLAN_EID_AP_CSN,
                /* TODO: add 11ah/11aj/11ak elements */
        };
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_supported_band *sband;
        int i, err;
        u32 rate_flags;
        bool have_80mhz = false;

        *offset = 0;

        sband = local->hw.wiphy->bands[band];
        if (WARN_ON_ONCE(!sband))
                return 0;

        rate_flags = ieee80211_chandef_rate_flags(chandef);

        /* For direct scan add S1G IE and consider its override bits */
        if (band == NL80211_BAND_S1GHZ)
                return ieee80211_put_s1g_cap(skb, &sband->s1g_cap);

        err = ieee80211_put_srates_elem(skb, sband, 0, rate_flags,
                                        ~rate_mask, WLAN_EID_SUPP_RATES);
        if (err)
                return err;

        /* insert "request information" if in custom IEs */
        err = ieee80211_put_preq_custom_ies(skb, ie, ie_len, offset,
                                            before_extrates,
                                            ARRAY_SIZE(before_extrates));
        if (err)
                return err;

        err = ieee80211_put_srates_elem(skb, sband, 0, rate_flags,
                                        ~rate_mask, WLAN_EID_EXT_SUPP_RATES);
        if (err)
                return err;

        if (chandef->chan && sband->band == NL80211_BAND_2GHZ) {
                if (skb_tailroom(skb) < 3)
                        return -ENOBUFS;
                skb_put_u8(skb, WLAN_EID_DS_PARAMS);
                skb_put_u8(skb, 1);
                skb_put_u8(skb,
                           ieee80211_frequency_to_channel(chandef->chan->center_freq));
        }

        if (flags & IEEE80211_PROBE_FLAG_MIN_CONTENT)
                return 0;

        /* insert custom IEs that go before HT */
        err = ieee80211_put_preq_custom_ies(skb, ie, ie_len, offset,
                                            before_ht, ARRAY_SIZE(before_ht));
        if (err)
                return err;

        if (sband->ht_cap.ht_supported) {
                u8 *pos;

                if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_cap))
                        return -ENOBUFS;

                pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_cap));
                ieee80211_ie_build_ht_cap(pos, &sband->ht_cap,
                                          sband->ht_cap.cap);
        }

        /* insert custom IEs that go before VHT */
        err = ieee80211_put_preq_custom_ies(skb, ie, ie_len, offset,
                                            before_vht,
                                            ARRAY_SIZE(before_vht));
        if (err)
                return err;

        /* Check if any channel in this sband supports at least 80 MHz */
        for (i = 0; i < sband->n_channels; i++) {
                if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED |
                                                IEEE80211_CHAN_NO_80MHZ))
                        continue;

                have_80mhz = true;
                break;
        }

        if (sband->vht_cap.vht_supported && have_80mhz) {
                u8 *pos;

                if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_vht_cap))
                        return -ENOBUFS;

                pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_cap));
                ieee80211_ie_build_vht_cap(pos, &sband->vht_cap,
                                           sband->vht_cap.cap);
        }

        /* insert custom IEs that go before HE */
        err = ieee80211_put_preq_custom_ies(skb, ie, ie_len, offset,
                                            before_he, ARRAY_SIZE(before_he));
        if (err)
                return err;

        if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band),
                                         IEEE80211_CHAN_NO_HE)) {
                err = ieee80211_put_he_cap(skb, sdata, sband, NULL);
                if (err)
                        return err;
        }

        if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band),
                                         IEEE80211_CHAN_NO_HE |
                                         IEEE80211_CHAN_NO_EHT)) {
                err = ieee80211_put_eht_cap(skb, sdata, sband, NULL);
                if (err)
                        return err;
        }

        err = ieee80211_put_he_6ghz_cap(skb, sdata, IEEE80211_SMPS_OFF);
        if (err)
                return err;

        /*
         * If adding more here, adjust code in main.c
         * that calculates local->scan_ies_len.
         */

        return 0;
}

/*
 * Append probe request elements for every band set in @bands_used to @skb
 * and describe the result in @ie_desc.
 *
 * @ie_desc is cleared first. For each used band, ie_desc->ies[band] and
 * ie_desc->len[band] record where that band's elements start inside @skb
 * and how long they are. Custom IEs (@ie/@ie_len) not consumed while
 * building the last used band are appended afterwards and recorded as
 * ie_desc->common_ies / ie_desc->common_ie_len.
 *
 * Returns 0 on success or a negative error (e.g. -ENOBUFS); on error
 * @ie_desc may be only partially filled in.
 */
static int ieee80211_put_preq_ies(struct sk_buff *skb,
                                  struct ieee80211_sub_if_data *sdata,
                                  struct ieee80211_scan_ies *ie_desc,
                                  const u8 *ie, size_t ie_len,
                                  u8 bands_used, u32 *rate_masks,
                                  struct cfg80211_chan_def *chandef,
                                  u32 flags)
{
        size_t custom_ie_offset = 0;
        int i, err;

        memset(ie_desc, 0, sizeof(*ie_desc));

        for (i = 0; i < NUM_NL80211_BANDS; i++) {
                if (!(bands_used & BIT(i)))
                        continue;

                ie_desc->ies[i] = skb_tail_pointer(skb);
                err = ieee80211_put_preq_ies_band(skb, sdata,
                                                  ie, ie_len,
                                                  &custom_ie_offset,
                                                  i, rate_masks[i],
                                                  chandef, flags);
                if (err)
                        return err;
                ie_desc->len[i] = skb_tail_pointer(skb) -
                                  ie_desc->ies[i];
        }

        /* add any remaining custom IEs */
        if (ie && ie_len) {
                if (WARN_ONCE(skb_tailroom(skb) < ie_len - custom_ie_offset,
                              "not enough space for preq custom IEs\n"))
                        return -ENOBUFS;
                ie_desc->common_ies = skb_tail_pointer(skb);
                skb_put_data(skb, ie + custom_ie_offset,
                             ie_len - custom_ie_offset);
                ie_desc->common_ie_len = skb_tail_pointer(skb) -
                                         ie_desc->common_ies;
        }

        return 0;
}

/*
 * Build the probe request elements for @bands_used into @buffer.
 *
 * The elements are first assembled in a temporary skb of @buffer_len bytes
 * and then copied to @buffer; the pointers in @ie_desc are rebased so that
 * they refer to @buffer rather than to the temporary skb.
 *
 * Entries of @ie_desc that were never set (bands not in @bands_used, or
 * common_ies when there are no custom IEs) stay NULL: rebasing them would
 * mean doing pointer arithmetic on NULL and would leave a meaningless
 * non-NULL pointer behind.
 *
 * Returns the number of bytes written to @buffer, -ENOMEM if the temporary
 * skb cannot be allocated, -ENOBUFS if the result does not fit, or the error
 * from ieee80211_put_preq_ies(). On error the contents of @ie_desc must not
 * be used.
 */
int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer,
                             size_t buffer_len,
                             struct ieee80211_scan_ies *ie_desc,
                             const u8 *ie, size_t ie_len,
                             u8 bands_used, u32 *rate_masks,
                             struct cfg80211_chan_def *chandef,
                             u32 flags)
{
        struct sk_buff *skb = alloc_skb(buffer_len, GFP_KERNEL);
        int ret, i;
        u8 *start;

        if (!skb)
                return -ENOMEM;

        start = skb_tail_pointer(skb);
        memset(start, 0, skb_tailroom(skb));
        ret = ieee80211_put_preq_ies(skb, sdata, ie_desc, ie, ie_len,
                                     bands_used, rate_masks, chandef,
                                     flags);
        if (ret < 0)
                goto out;

        /* the skb may offer more tailroom than was asked for */
        if (skb->len > buffer_len) {
                ret = -ENOBUFS;
                goto out;
        }

        memcpy(buffer, start, skb->len);

        /* adjust ie_desc for copy, leaving unset (NULL) entries alone */
        for (i = 0; i < NUM_NL80211_BANDS; i++) {
                if (!ie_desc->ies[i])
                        continue;
                ie_desc->ies[i] = buffer + (ie_desc->ies[i] - start);
        }
        if (ie_desc->common_ies)
                ie_desc->common_ies = buffer + (ie_desc->common_ies - start);

        ret = skb->len;
out:
        consume_skb(skb);
        return ret;
}

/*
 * Allocate and fill a probe request frame for channel @chan.
 *
 * The frame is obtained from ieee80211_probereq_get() with room for the
 * generated and custom (@ie/@ie_len) elements, which are then appended for
 * the band of @chan using @ratemask. If @dst is given it is written to both
 * the DA and BSSID fields. The skb is marked so that it is not encrypted.
 *
 * Returns the skb, or NULL if it could not be allocated.
 */
struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
                                          const u8 *src, const u8 *dst,
                                          u32 ratemask,
                                          struct ieee80211_channel *chan,
                                          const u8 *ssid, size_t ssid_len,
                                          const u8 *ie, size_t ie_len,
                                          u32 flags)
{
        struct ieee80211_local *local = sdata->local;
        u32 rate_masks[NUM_NL80211_BANDS] = {};
        struct ieee80211_scan_ies unused_desc;
        struct cfg80211_chan_def chandef;
        struct sk_buff *skb;

        /*
         * Do not send DS Channel parameter for directed probe requests
         * in order to maximize the chance that we get a response.  Some
         * badly-behaved APs don't respond when this parameter is included.
         */
        chandef.chan = (flags & IEEE80211_PROBE_FLAG_DIRECTED) ? NULL : chan;
        chandef.width = sdata->vif.bss_conf.chanreq.oper.width;

        skb = ieee80211_probereq_get(&local->hw, src, ssid, ssid_len,
                                     local->scan_ies_len + ie_len);
        if (!skb)
                return NULL;

        rate_masks[chan->band] = ratemask;
        /*
         * NOTE(review): the return value is not checked here; presumably
         * the skb was sized so this cannot fail - confirm.
         */
        ieee80211_put_preq_ies(skb, sdata, &unused_desc,
                               ie, ie_len, BIT(chan->band),
                               rate_masks, &chandef, flags);

        if (dst) {
                struct ieee80211_mgmt *hdr = (struct ieee80211_mgmt *) skb->data;

                memcpy(hdr->da, dst, ETH_ALEN);
                memcpy(hdr->bssid, dst, ETH_ALEN);
        }

        IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;

        return skb;
}

/*
 * Translate the (extended) supported rates elements in @elems into a bitmap
 * of indices into the bitrate table of @band.
 *
 * A bit is set for every local bitrate that carries all rate flags required
 * by the interface's operating channel definition and equals one of the
 * advertised rates. If @basic_rates is non-NULL, the bits of rates marked
 * basic (top bit set) are additionally OR-ed into it. The HT PHY membership
 * selector is skipped.
 *
 * Returns the bitmap, or 1 (after a warning) if @band has no supported-band
 * data.
 */
u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
                            struct ieee802_11_elems *elems,
                            enum nl80211_band band, u32 *basic_rates)
{
        struct ieee80211_supported_band *sband;
        u32 matched = 0;
        u32 rate_flags;
        int n_bitrates;
        int n_adv;
        int idx, j;

        sband = sdata->local->hw.wiphy->bands[band];
        if (WARN_ON(!sband))
                return 1;

        rate_flags =
                ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper);

        n_bitrates = sband->n_bitrates;
        n_adv = elems->supp_rates_len + elems->ext_supp_rates_len;

        for (idx = 0; idx < n_adv; idx++) {
                bool basic;
                int wanted;
                u8 raw = 0;

                /* the extended rates logically follow the plain ones */
                if (idx < elems->supp_rates_len)
                        raw = elems->supp_rates[idx];
                else if (elems->ext_supp_rates)
                        raw = elems->ext_supp_rates
                                [idx - elems->supp_rates_len];

                basic = !!(raw & 0x80);
                if (basic && (raw & 0x7f) == BSS_MEMBERSHIP_SELECTOR_HT_PHY)
                        continue;

                /* scaled by 5 for comparison with the bitrate table */
                wanted = 5 * (raw & 0x7f);

                for (j = 0; j < n_bitrates; j++) {
                        const struct ieee80211_rate *br = &sband->bitrates[j];

                        if ((rate_flags & br->flags) != rate_flags)
                                continue;
                        if (br->bitrate != wanted)
                                continue;

                        matched |= BIT(j);
                        if (basic_rates && basic)
                                *basic_rates |= BIT(j);
                }
        }

        return matched;
}

/*
 * Quiesce mac80211 state for @local and then stop the driver.
 *
 * The order matters: pending work is cancelled or flushed before drv_stop()
 * is called.
 */
void ieee80211_stop_device(struct ieee80211_local *local)
{
        ieee80211_handle_queued_frames(local);

        /* radio is going down: update the LED state accordingly */
        ieee80211_led_radio(local, false);
        ieee80211_mod_tpt_led_trig(local, 0, IEEE80211_TPT_LEDTRIG_FL_RADIO);

        wiphy_work_cancel(local->hw.wiphy, &local->reconfig_filter);

        /* flush both the mac80211 workqueue and the wiphy work */
        flush_workqueue(local->workqueue);
        wiphy_work_flush(local->hw.wiphy, NULL);
        drv_stop(local);
}

/*
 * Finish off a scan that is still flagged SCAN_COMPLETED.
 *
 * With @aborted set, the scan is additionally flagged SCAN_ABORTED before
 * the scan work is queued and flushed.
 */
static void ieee80211_flush_completed_scan(struct ieee80211_local *local,
                                           bool aborted)
{
        struct wiphy *wiphy = local->hw.wiphy;

        /* It's possible that we don't handle the scan completion in
         * time during suspend, so if it's still marked as completed
         * here, queue the work and flush it to clean things up.
         * Instead of calling the worker function directly here, we
         * really queue it to avoid potential races with other flows
         * scheduling the same work.
         */
        if (!test_bit(SCAN_COMPLETED, &local->scanning))
                return;

        /* If coming from reconfiguration failure, abort the scan so
         * we don't attempt to continue a partial HW scan - which is
         * possible otherwise if (e.g.) the 2.4 GHz portion was the
         * completed scan, and a 5 GHz portion is still pending.
         */
        if (aborted)
                set_bit(SCAN_ABORTED, &local->scanning);

        wiphy_delayed_work_queue(wiphy, &local->scan_work, 0);
        wiphy_delayed_work_flush(wiphy, &local->scan_work);
}

/*
 * Bring mac80211's bookkeeping into a consistent state after the device
 * could not be restarted.
 *
 * Clears the resume/suspend/reconfig flags, records the failure, flushes a
 * completed scan as aborted, ends any scheduled scan, and marks all
 * interfaces and channel contexts as no longer present in the driver.
 *
 * Must be called with the wiphy mutex held.
 */
static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local)
{
        struct ieee80211_sub_if_data *sdata;
        struct ieee80211_chanctx *ctx;

        lockdep_assert_wiphy(local->hw.wiphy);

        /*
         * We get here if during resume the device can't be restarted properly.
         * We might also get here if this happens during HW reset, which is a
         * slightly different situation and we need to drop all connections in
         * the latter case.
         *
         * Ask cfg80211 to turn off all interfaces, this will result in more
         * warnings but at least we'll then get into a clean stopped state.
         */

        local->resuming = false;
        local->suspended = false;
        local->in_reconfig = false;
        local->reconfig_failure = true;

        ieee80211_flush_completed_scan(local, true);

        /* scheduled scan clearly can't be running any more, but tell
         * cfg80211 and clear local state
         */
        ieee80211_sched_scan_end(local);

        /* the driver no longer knows about any of the interfaces */
        list_for_each_entry(sdata, &local->interfaces, list)
                sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER;

        /* Mark channel contexts as not being in the driver any more to avoid
         * removing them from the driver during the shutdown process...
         */
        list_for_each_entry(ctx, &local->chanctx_list, list)
                ctx->driver_present = false;
}

/*
 * Tell the driver which channel context @link of @sdata is using.
 *
 * Does nothing if the link has no channel context. Must be called with the
 * wiphy mutex held.
 */
static void ieee80211_assign_chanctx(struct ieee80211_local *local,
                                     struct ieee80211_sub_if_data *sdata,
                                     struct ieee80211_link_data *link)
{
        struct ieee80211_chanctx_conf *ctx_conf;

        lockdep_assert_wiphy(local->hw.wiphy);

        ctx_conf = rcu_dereference_protected(link->conf->chanctx_conf,
                                             lockdep_is_held(&local->hw.wiphy->mtx));
        if (!ctx_conf)
                return;

        /* the conf is embedded in the mac80211-private context */
        drv_assign_vif_chanctx(local, sdata, link->conf,
                               container_of(ctx_conf, struct ieee80211_chanctx,
                                            conf));
}

/*
 * Re-add to the driver all stations of @sdata that had been uploaded.
 *
 * Each station is walked one state at a time from IEEE80211_STA_NOTEXIST up
 * to its current state; a failing transition triggers a warning but does not
 * stop the walk. Must be called with the wiphy mutex held.
 */
static void ieee80211_reconfig_stations(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_local *local = sdata->local;
        struct sta_info *sta;

        lockdep_assert_wiphy(local->hw.wiphy);

        /* add STAs back */
        list_for_each_entry(sta, &local->sta_list, list) {
                enum ieee80211_sta_state cur = IEEE80211_STA_NOTEXIST;

                if (sta->sdata != sdata || !sta->uploaded)
                        continue;

                while (cur < sta->sta_state) {
                        WARN_ON(drv_sta_state(local, sta->sdata, sta, cur,
                                              cur + 1));
                        cur++;
                }
        }
}

static int ieee80211_reconfig_nan(struct ieee80211_sub_if_data *sdata)
{
        struct cfg80211_nan_func *func, **funcs;
        int res, id, i = 0;

        res = drv_start_nan(sdata->local, sdata,
                            &sdata->u.nan.conf);
        if (WARN_ON(res))
                return res;

        funcs = kcalloc(sdata->local->hw.max_nan_de_entries + 1,
                        sizeof(*funcs),
                        GFP_KERNEL);
        if (!funcs)
                return -ENOMEM;

        /* Add all the functions:
         * This is a little bit ugly. We need to call a potentially sleeping
         * callback for each NAN function, so we can't hold the spinlock.
         */
        spin_lock_bh(&sdata->u.nan.func_lock);

        idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id)
                funcs[i++] = func;

        spin_unlock_bh(&sdata->u.nan.func_lock);

        for (i = 0; funcs[i]; i++) {
                res = drv_add_nan_func(sdata->local, sdata, funcs[i]);
                if (WARN_ON(res))
                        ieee80211_nan_func_terminated(&sdata->vif,
                                                      funcs[i]->instance_id,
                                                      NL80211_NAN_FUNC_TERM_REASON_ERROR,
                                                      GFP_KERNEL);
        }

        kfree(funcs);

        return 0;
}

/*
 * Restart AP operation on every active link of @sdata.
 *
 * A link that has a beacon set is started in the driver. If beaconing is
 * enabled on the link, the beacon change flags are added to @changed and the
 * driver is notified. Note that @changed accumulates across links.
 */
static void ieee80211_reconfig_ap_links(struct ieee80211_local *local,
                                        struct ieee80211_sub_if_data *sdata,
                                        u64 changed)
{
        int id;

        for (id = 0; id < ARRAY_SIZE(sdata->link); id++) {
                struct ieee80211_link_data *link;

                if (!(sdata->vif.active_links & BIT(id)))
                        continue;

                link = sdata_dereference(sdata->link[id], sdata);
                if (!link)
                        continue;

                if (rcu_access_pointer(link->u.ap.beacon))
                        drv_start_ap(local, sdata, link->conf);

                if (link->conf->enable_beacon) {
                        changed |= BSS_CHANGED_BEACON |
                                   BSS_CHANGED_BEACON_ENABLED;

                        ieee80211_link_info_change_notify(sdata, link,
                                                          changed);
                }
        }
}

/*
 * ieee80211_reconfig - re-program the driver from mac80211's current state
 *
 * Runs both when resuming (local->suspended is set on entry) and when
 * reconfiguring after a hardware restart. The wiphy lock must be held.
 *
 * If no interface is open, or if a WoWLAN drv_resume() returns 0, the
 * function skips straight to the common "wake_up" tail. Otherwise it
 * starts the driver and replays, in this order: thresholds and coverage
 * class, interfaces, channel contexts, hardware config and filters,
 * per-interface BSS information, stations, keys, active links and a
 * previously running scheduled scan.
 *
 * Returns 0 on success. A failure of drv_resume(), drv_start(),
 * drv_add_interface() or ieee80211_reconfig_nan() is returned to the
 * caller; all but the drv_resume() case go through
 * ieee80211_handle_reconfig_failure() first.
 */
int ieee80211_reconfig(struct ieee80211_local *local)
{
        struct ieee80211_hw *hw = &local->hw;
        struct ieee80211_sub_if_data *sdata;
        struct ieee80211_chanctx *ctx;
        struct sta_info *sta;
        int res, i;
        bool reconfig_due_to_wowlan = false;
        struct ieee80211_sub_if_data *sched_scan_sdata;
        struct cfg80211_sched_scan_request *sched_scan_req;
        bool sched_scan_stopped = false;
        /* snapshot: local->suspended is modified further down */
        bool suspended = local->suspended;
        bool in_reconfig = false;

        lockdep_assert_wiphy(local->hw.wiphy);

        /* nothing to do if HW shouldn't run */
        if (!local->open_count)
                goto wake_up;

#ifdef CONFIG_PM
        if (suspended)
                local->resuming = true;

        if (local->wowlan) {
                /*
                 * In the wowlan case, both mac80211 and the device
                 * are functional when the resume op is called, so
                 * clear local->suspended so the device could operate
                 * normally (e.g. pass rx frames).
                 */
                local->suspended = false;
                res = drv_resume(local);
                local->wowlan = false;
                if (res < 0) {
                        local->resuming = false;
                        return res;
                }
                if (res == 0)
                        goto wake_up;
                WARN_ON(res > 1);
                /*
                 * res is 1, which means the driver requested
                 * to go through a regular reset on wakeup.
                 * restore local->suspended in this case.
                 */
                reconfig_due_to_wowlan = true;
                local->suspended = true;
        }
#endif

        /*
         * In case of hw_restart during suspend (without wowlan),
         * cancel restart work, as we are reconfiguring the device
         * anyway.
         * Note that restart_work is scheduled on a frozen workqueue,
         * so we can't deadlock in this case.
         */
        if (suspended && local->in_reconfig && !reconfig_due_to_wowlan)
                cancel_work_sync(&local->restart_work);

        local->started = false;

        /*
         * Upon resume hardware can sometimes be goofy due to
         * various platform / driver / bus issues, so restarting
         * the device may at times not work immediately. Propagate
         * the error.
         */
        res = drv_start(local);
        if (res) {
                if (suspended)
                        WARN(1, "Hardware became unavailable upon resume. This could be a software issue prior to suspend or a hardware issue.\n");
                else
                        WARN(1, "Hardware became unavailable during restart.\n");
                ieee80211_handle_reconfig_failure(local);
                return res;
        }

        /* setup fragmentation threshold */
        drv_set_frag_threshold(local, hw->wiphy->frag_threshold);

        /* setup RTS threshold */
        drv_set_rts_threshold(local, hw->wiphy->rts_threshold);

        /* reset coverage class */
        drv_set_coverage_class(local, hw->wiphy->coverage_class);

        ieee80211_led_radio(local, true);
        ieee80211_mod_tpt_led_trig(local,
                                   IEEE80211_TPT_LEDTRIG_FL_RADIO, 0);

        /* add interfaces */
        sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata);
        if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) {
                /* in HW restart it exists already */
                WARN_ON(local->resuming);
                res = drv_add_interface(local, sdata);
                if (WARN_ON(res)) {
                        /* drop the monitor sdata the driver refused */
                        RCU_INIT_POINTER(local->monitor_sdata, NULL);
                        synchronize_net();
                        kfree(sdata);
                }
        }

        list_for_each_entry(sdata, &local->interfaces, list) {
                if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
                    sdata->vif.type != NL80211_IFTYPE_MONITOR &&
                    ieee80211_sdata_running(sdata)) {
                        res = drv_add_interface(local, sdata);
                        if (WARN_ON(res))
                                break;
                }
        }

        /* If adding any of the interfaces failed above, roll back and
         * report failure.
         */
        if (res) {
                /* walks backwards, starting at the entry before sdata */
                list_for_each_entry_continue_reverse(sdata, &local->interfaces,
                                                     list)
                        if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
                            sdata->vif.type != NL80211_IFTYPE_MONITOR &&
                            ieee80211_sdata_running(sdata))
                                drv_remove_interface(local, sdata);
                ieee80211_handle_reconfig_failure(local);
                return res;
        }

        /* add channel contexts */
        list_for_each_entry(ctx, &local->chanctx_list, list)
                if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
                        WARN_ON(drv_add_chanctx(local, ctx));

        sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata);
        if (sdata && ieee80211_sdata_running(sdata))
                ieee80211_assign_chanctx(local, sdata, &sdata->deflink);

        /* reconfigure hardware */
        ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_LISTEN_INTERVAL |
                                   IEEE80211_CONF_CHANGE_MONITOR |
                                   IEEE80211_CONF_CHANGE_PS |
                                   IEEE80211_CONF_CHANGE_RETRY_LIMITS |
                                   IEEE80211_CONF_CHANGE_IDLE);

        ieee80211_configure_filter(local);

        /* Finally also reconfigure all the BSS information */
        list_for_each_entry(sdata, &local->interfaces, list) {
                /* common change flags for all interface types - link only */
                u64 changed = BSS_CHANGED_ERP_CTS_PROT |
                              BSS_CHANGED_ERP_PREAMBLE |
                              BSS_CHANGED_ERP_SLOT |
                              BSS_CHANGED_HT |
                              BSS_CHANGED_BASIC_RATES |
                              BSS_CHANGED_BEACON_INT |
                              BSS_CHANGED_BSSID |
                              BSS_CHANGED_CQM |
                              BSS_CHANGED_QOS |
                              BSS_CHANGED_TXPOWER |
                              BSS_CHANGED_MCAST_RATE;
                struct ieee80211_link_data *link = NULL;
                unsigned int link_id;
                u32 active_links = 0;

                if (!ieee80211_sdata_running(sdata))
                        continue;

                if (ieee80211_vif_is_mld(&sdata->vif)) {
                        struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS] = {
                                [0] = &sdata->vif.bss_conf,
                        };

                        if (sdata->vif.type == NL80211_IFTYPE_STATION) {
                                /* start with a single active link */
                                active_links = sdata->vif.active_links;
                                link_id = ffs(active_links) - 1;
                                sdata->vif.active_links = BIT(link_id);
                        }

                        drv_change_vif_links(local, sdata, 0,
                                             sdata->vif.active_links,
                                             old);
                }

                /* saved so the full link set can be re-enabled further down */
                sdata->restart_active_links = active_links;

                /* after this loop, link is the last active link found (if any) */
                for (link_id = 0;
                     link_id < ARRAY_SIZE(sdata->vif.link_conf);
                     link_id++) {
                        if (!ieee80211_vif_link_active(&sdata->vif, link_id))
                                continue;

                        link = sdata_dereference(sdata->link[link_id], sdata);
                        if (!link)
                                continue;

                        ieee80211_assign_chanctx(local, sdata, link);
                }

                switch (sdata->vif.type) {
                case NL80211_IFTYPE_AP_VLAN:
                case NL80211_IFTYPE_MONITOR:
                        break;
                case NL80211_IFTYPE_ADHOC:
                        if (sdata->vif.cfg.ibss_joined)
                                WARN_ON(drv_join_ibss(local, sdata));
                        fallthrough;
                default:
                        ieee80211_reconfig_stations(sdata);
                        fallthrough;
                case NL80211_IFTYPE_AP: /* AP stations are handled later */
                        for (i = 0; i < IEEE80211_NUM_ACS; i++)
                                drv_conf_tx(local, &sdata->deflink, i,
                                            &sdata->deflink.tx_conf[i]);
                        break;
                }

                if (sdata->vif.bss_conf.mu_mimo_owner)
                        changed |= BSS_CHANGED_MU_GROUPS;

                if (!ieee80211_vif_is_mld(&sdata->vif))
                        changed |= BSS_CHANGED_IDLE;

                switch (sdata->vif.type) {
                case NL80211_IFTYPE_STATION:
                        if (!ieee80211_vif_is_mld(&sdata->vif)) {
                                changed |= BSS_CHANGED_ASSOC |
                                           BSS_CHANGED_ARP_FILTER |
                                           BSS_CHANGED_PS;

                                /* Re-send beacon info report to the driver */
                                if (sdata->deflink.u.mgd.have_beacon)
                                        changed |= BSS_CHANGED_BEACON_INFO;

                                if (sdata->vif.bss_conf.max_idle_period ||
                                    sdata->vif.bss_conf.protected_keep_alive)
                                        changed |= BSS_CHANGED_KEEP_ALIVE;

                                ieee80211_bss_info_change_notify(sdata,
                                                                 changed);
                        } else if (!WARN_ON(!link)) {
                                /* MLD: link-level flags first, then vif-level ones */
                                ieee80211_link_info_change_notify(sdata, link,
                                                                  changed);
                                changed = BSS_CHANGED_ASSOC |
                                          BSS_CHANGED_IDLE |
                                          BSS_CHANGED_PS |
                                          BSS_CHANGED_ARP_FILTER;
                                ieee80211_vif_cfg_change_notify(sdata, changed);
                        }
                        break;
                case NL80211_IFTYPE_OCB:
                        changed |= BSS_CHANGED_OCB;
                        ieee80211_bss_info_change_notify(sdata, changed);
                        break;
                case NL80211_IFTYPE_ADHOC:
                        changed |= BSS_CHANGED_IBSS;
                        fallthrough;
                case NL80211_IFTYPE_AP:
                        changed |= BSS_CHANGED_P2P_PS;

                        if (ieee80211_vif_is_mld(&sdata->vif))
                                ieee80211_vif_cfg_change_notify(sdata,
                                                                BSS_CHANGED_SSID);
                        else
                                changed |= BSS_CHANGED_SSID;

                        if (sdata->vif.bss_conf.ftm_responder == 1 &&
                            wiphy_ext_feature_isset(sdata->local->hw.wiphy,
                                        NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER))
                                changed |= BSS_CHANGED_FTM_RESPONDER;

                        if (sdata->vif.type == NL80211_IFTYPE_AP) {
                                changed |= BSS_CHANGED_AP_PROBE_RESP;

                                if (ieee80211_vif_is_mld(&sdata->vif)) {
                                        ieee80211_reconfig_ap_links(local,
                                                                    sdata,
                                                                    changed);
                                        break;
                                }

                                if (rcu_access_pointer(sdata->deflink.u.ap.beacon))
                                        drv_start_ap(local, sdata,
                                                     sdata->deflink.conf);
                        }
                        fallthrough;
                case NL80211_IFTYPE_MESH_POINT:
                        if (sdata->vif.bss_conf.enable_beacon) {
                                changed |= BSS_CHANGED_BEACON |
                                           BSS_CHANGED_BEACON_ENABLED;
                                ieee80211_bss_info_change_notify(sdata, changed);
                        }
                        break;
                case NL80211_IFTYPE_NAN:
                        res = ieee80211_reconfig_nan(sdata);
                        if (res < 0) {
                                ieee80211_handle_reconfig_failure(local);
                                return res;
                        }
                        break;
                case NL80211_IFTYPE_AP_VLAN:
                case NL80211_IFTYPE_MONITOR:
                case NL80211_IFTYPE_P2P_DEVICE:
                        /* nothing to do */
                        break;
                case NL80211_IFTYPE_UNSPECIFIED:
                case NUM_NL80211_IFTYPES:
                case NL80211_IFTYPE_P2P_CLIENT:
                case NL80211_IFTYPE_P2P_GO:
                case NL80211_IFTYPE_WDS:
                        WARN_ON(1);
                        break;
                }
        }

        ieee80211_recalc_ps(local);

        /*
         * The sta might be in psm against the ap (e.g. because
         * this was the state before a hw restart), so we
         * explicitly send a null packet in order to make sure
         * it'll sync against the ap (and get out of psm).
         */
        if (!(local->hw.conf.flags & IEEE80211_CONF_PS)) {
                list_for_each_entry(sdata, &local->interfaces, list) {
                        if (sdata->vif.type != NL80211_IFTYPE_STATION)
                                continue;
                        if (!sdata->u.mgd.associated)
                                continue;

                        ieee80211_send_nullfunc(local, sdata, false);
                }
        }

        /* APs are now beaconing, add back stations */
        list_for_each_entry(sdata, &local->interfaces, list) {
                if (!ieee80211_sdata_running(sdata))
                        continue;

                switch (sdata->vif.type) {
                case NL80211_IFTYPE_AP_VLAN:
                case NL80211_IFTYPE_AP:
                        ieee80211_reconfig_stations(sdata);
                        break;
                default:
                        break;
                }
        }

        /* add back keys */
        list_for_each_entry(sdata, &local->interfaces, list)
                ieee80211_reenable_keys(sdata);

        /* re-enable multi-link for client interfaces */
        list_for_each_entry(sdata, &local->interfaces, list) {
                if (sdata->restart_active_links)
                        ieee80211_set_active_links(&sdata->vif,
                                                   sdata->restart_active_links);
                /*
                 * If a link switch was scheduled before the restart, and ran
                 * before reconfig, it will do nothing, so re-schedule.
                 */
                if (sdata->desired_active_links)
                        wiphy_work_queue(sdata->local->hw.wiphy,
                                         &sdata->activate_links_work);
        }

        /* Reconfigure sched scan if it was interrupted by FW restart */
        sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata,
                                                lockdep_is_held(&local->hw.wiphy->mtx));
        sched_scan_req = rcu_dereference_protected(local->sched_scan_req,
                                                lockdep_is_held(&local->hw.wiphy->mtx));
        if (sched_scan_sdata && sched_scan_req)
                /*
                 * Sched scan stopped, but we don't want to report it. Instead,
                 * we're trying to reschedule. However, if more than one scan
                 * plan was set, we cannot reschedule since we don't know which
                 * scan plan was currently running (and some scan plans may have
                 * already finished).
                 */
                if (sched_scan_req->n_scan_plans > 1 ||
                    __ieee80211_request_sched_scan_start(sched_scan_sdata,
                                                         sched_scan_req)) {
                        RCU_INIT_POINTER(local->sched_scan_sdata, NULL);
                        RCU_INIT_POINTER(local->sched_scan_req, NULL);
                        sched_scan_stopped = true;
                }

        if (sched_scan_stopped)
                cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0);

 wake_up:

        /* only monitor interfaces are open: add the virtual monitor */
        if (local->monitors == local->open_count && local->monitors > 0)
                ieee80211_add_virtual_monitor(local);

        /*
         * Clear the WLAN_STA_BLOCK_BA flag so new aggregation
         * sessions can be established after a resume.
         *
         * Also tear down aggregation sessions since reconfiguring
         * them in a hardware restart scenario is not easily done
         * right now, and the hardware will have lost information
         * about the sessions, but we and the AP still think they
         * are active. This is really a workaround though.
         */
        if (ieee80211_hw_check(hw, AMPDU_AGGREGATION)) {
                list_for_each_entry(sta, &local->sta_list, list) {
                        if (!local->resuming)
                                ieee80211_sta_tear_down_BA_sessions(
                                                sta, AGG_STOP_LOCAL_REQUEST);
                        clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
                }
        }

        /*
         * If this is for hw restart things are still running.
         * We may want to change that later, however.
         */
        if (local->open_count && (!suspended || reconfig_due_to_wowlan))
                drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_RESTART);

        if (local->in_reconfig) {
                /* remembered for the station restart after the queues are woken */
                in_reconfig = local->in_reconfig;
                local->in_reconfig = false;
                barrier();

                ieee80211_reconfig_roc(local);

                /* Requeue all works */
                list_for_each_entry(sdata, &local->interfaces, list)
                        wiphy_work_queue(local->hw.wiphy, &sdata->work);
        }

        ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP,
                                        IEEE80211_QUEUE_STOP_REASON_SUSPEND,
                                        false);

        if (in_reconfig) {
                list_for_each_entry(sdata, &local->interfaces, list) {
                        if (!ieee80211_sdata_running(sdata))
                                continue;
                        if (sdata->vif.type == NL80211_IFTYPE_STATION)
                                ieee80211_sta_restart(sdata);
                }
        }

        /* not a resume: everything below is suspend/resume bookkeeping */
        if (!suspended)
                return 0;

#ifdef CONFIG_PM
        /* first set suspended false, then resuming */
        local->suspended = false;
        mb();
        local->resuming = false;

        ieee80211_flush_completed_scan(local, false);

        if (local->open_count && !reconfig_due_to_wowlan)
                drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_SUSPEND);

        list_for_each_entry(sdata, &local->interfaces, list) {
                if (!ieee80211_sdata_running(sdata))
                        continue;
                if (sdata->vif.type == NL80211_IFTYPE_STATION)
                        ieee80211_sta_restart(sdata);
        }

        mod_timer(&local->sta_cleanup, jiffies + 1);
#else
        /* suspended was set although CONFIG_PM is not built in */
        WARN_ON(1);
#endif

        return 0;
}

/*
 * Flag a station interface for disconnection and taint all of its keys.
 *
 * @flag is OR-ed into sdata->flags and every key on the interface's key
 * list gets KEY_FLAG_TAINTED. The call only warns and returns when
 * @vif is NULL, when IEEE80211_SDATA_DISCONNECT_RESUME is requested while
 * not resuming, when IEEE80211_SDATA_DISCONNECT_HW_RESTART is requested
 * while not in reconfig, or when @vif is not a station interface.
 *
 * The wiphy lock must be held.
 */
static void ieee80211_reconfig_disconnect(struct ieee80211_vif *vif, u8 flag)
{
        struct ieee80211_sub_if_data *sdata;
        struct ieee80211_local *local;
        struct ieee80211_key *cur;
        bool mismatch;

        if (WARN_ON(!vif))
                return;

        sdata = vif_to_sdata(vif);
        local = sdata->local;

        lockdep_assert_wiphy(local->hw.wiphy);

        /* a resume disconnect is only meaningful while resuming */
        mismatch = (flag & IEEE80211_SDATA_DISCONNECT_RESUME) &&
                   !local->resuming;
        if (WARN_ON(mismatch))
                return;

        /* likewise a restart disconnect requires an ongoing reconfig */
        mismatch = (flag & IEEE80211_SDATA_DISCONNECT_HW_RESTART) &&
                   !local->in_reconfig;
        if (WARN_ON(mismatch))
                return;

        if (WARN_ON(vif->type != NL80211_IFTYPE_STATION))
                return;

        sdata->flags |= flag;

        list_for_each_entry(cur, &sdata->key_list, list) {
                cur->flags |= KEY_FLAG_TAINTED;
        }
}

void ieee80211_hw_restart_disconnect(struct ieee80211_vif *vif)
{
        ieee80211_reconfig_disconnect(vif, IEEE80211_SDATA_DISCONNECT_HW_RESTART);
}
EXPORT_SYMBOL_GPL(ieee80211_hw_restart_disconnect);

void ieee80211_resume_disconnect(struct ieee80211_vif *vif)
{
        ieee80211_reconfig_disconnect(vif, IEEE80211_SDATA_DISCONNECT_RESUME);
}
EXPORT_SYMBOL_GPL(ieee80211_resume_disconnect);

/*
 * Recalculate SMPS for the channel context that @link is currently using.
 *
 * Does nothing when the link has no channel context assigned. The wiphy
 * lock must be held.
 */
void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata,
                           struct ieee80211_link_data *link)
{
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_chanctx_conf *cur_conf;

        lockdep_assert_wiphy(local->hw.wiphy);

        cur_conf = rcu_dereference_protected(link->conf->chanctx_conf,
                                             lockdep_is_held(&local->hw.wiphy->mtx));

        /*
         * We may be running from a work item, by which time the channel
         * context can already have been removed (a disconnection, for
         * instance); there is nothing to recalculate then.
         */
        if (cur_conf) {
                struct ieee80211_chanctx *ctx;

                ctx = container_of(cur_conf, struct ieee80211_chanctx, conf);
                ieee80211_recalc_smps_chanctx(local, ctx);
        }
}

/*
 * Recalculate the minimum channel definition of the channel context(s)
 * used by @sdata.
 *
 * @sdata: interface whose links are inspected
 * @link_id: the link to process, or a negative value to process all links
 *
 * Links that have no link configuration or no channel context assigned
 * are skipped; they do not stop the remaining links from being processed.
 * The wiphy lock must be held.
 */
void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata,
                                  int link_id)
{
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_chanctx_conf *chanctx_conf;
        struct ieee80211_chanctx *chanctx;
        int i;

        lockdep_assert_wiphy(local->hw.wiphy);

        for (i = 0; i < ARRAY_SIZE(sdata->vif.link_conf); i++) {
                struct ieee80211_bss_conf *bss_conf;

                if (link_id >= 0 && link_id != i)
                        continue;

                rcu_read_lock();
                bss_conf = rcu_dereference(sdata->vif.link_conf[i]);
                if (!bss_conf) {
                        rcu_read_unlock();
                        continue;
                }

                chanctx_conf = rcu_dereference_protected(bss_conf->chanctx_conf,
                                                         lockdep_is_held(&local->hw.wiphy->mtx));
                /*
                 * Since we hold the wiphy mutex (checked above)
                 * we can take the chanctx_conf pointer out of the
                 * RCU critical section, it cannot go away without
                 * the mutex. Just the way we reached it could - in
                 * theory - go away, but we don't really care and
                 * it really shouldn't happen anyway.
                 */
                rcu_read_unlock();

                /*
                 * A link without a channel context must not abort the
                 * walk: when all links are requested, later links may
                 * still have a context that needs recalculating.
                 */
                if (!chanctx_conf)
                        continue;

                chanctx = container_of(chanctx_conf, struct ieee80211_chanctx,
                                       conf);
                ieee80211_recalc_chanctx_min_def(local, chanctx, NULL);
        }
}

/*
 * Find where the vendor-specific elements start in an element buffer.
 *
 * @ies: buffer holding a sequence of elements (ID byte, length byte, data)
 * @ielen: number of valid bytes in @ies
 * @offset: position in @ies at which to start scanning
 *
 * Returns the offset of the first element at or after @offset whose ID is
 * WLAN_EID_VENDOR_SPECIFIC. If there is none, the offset just past the
 * last complete element is returned, which is @ielen for a well-formed
 * buffer. If @offset is already at or beyond @ielen it is returned as is.
 *
 * A trailing element that is truncated (its header or its declared data
 * does not fit within @ielen) is never read past the buffer; @ielen is
 * returned in that case, so the result never exceeds the buffer because
 * of malformed data.
 */
size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset)
{
        size_t pos = offset;

        while (pos < ielen && ies[pos] != WLAN_EID_VENDOR_SPECIFIC) {
                size_t left = ielen - pos;

                /* need the 2-byte header and the full declared payload */
                if (left < 2 || left - 2 < ies[pos + 1])
                        return ielen;

                pos += 2 + ies[pos + 1];
        }

        return pos;
}

/*
 * Write an HT Capabilities element at @pos.
 *
 * @pos: output buffer; must have room for the 2-byte element header plus
 *       sizeof(struct ieee80211_ht_cap)
 * @ht_cap: source of the A-MPDU parameters and the MCS set
 * @cap: capability flags, stored little-endian
 *
 * The extended capabilities, beamforming capabilities and antenna
 * selection fields are left zeroed. Returns the position just past the
 * last field.
 */
u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
                              u16 cap)
{
        const __le16 cap_le = cpu_to_le16(cap);
        u8 *out = pos;

        /* element header */
        out[0] = WLAN_EID_HT_CAPABILITY;
        out[1] = sizeof(struct ieee80211_ht_cap);
        out += 2;

        /* zero the whole body so skipped fields read as 0 */
        memset(out, 0, sizeof(struct ieee80211_ht_cap));

        /* capability flags */
        memcpy(out, &cap_le, sizeof(u16));
        out += sizeof(u16);

        /* AMPDU parameters */
        *out = ht_cap->ampdu_factor |
               (ht_cap->ampdu_density <<
                        IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT);
        out++;

        /* MCS set */
        memcpy(out, &ht_cap->mcs, sizeof(ht_cap->mcs));
        out += sizeof(ht_cap->mcs);

        /*
         * Skip over extended capabilities, BF capabilities and antenna
         * selection; they stay at the zero written above.
         */
        out += sizeof(__le16);
        out += sizeof(__le32);
        out += sizeof(u8);

        return out;
}

/*
 * Write a VHT Capabilities element at @pos.
 *
 * @pos: output buffer; must have room for the 2-byte element header plus
 *       sizeof(struct ieee80211_vht_cap)
 * @vht_cap: source of the supported VHT MCS set
 * @cap: capability flags, stored little-endian
 *
 * Returns the position just past the VHT MCS set.
 */
u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
                               u32 cap)
{
        const __le32 cap_le = cpu_to_le32(cap);
        u8 *out = pos;

        /* element header */
        out[0] = WLAN_EID_VHT_CAPABILITY;
        out[1] = sizeof(struct ieee80211_vht_cap);
        out += 2;

        /* start from an all-zero body */
        memset(out, 0, sizeof(struct ieee80211_vht_cap));

        /* capability flags */
        memcpy(out, &cap_le, sizeof(u32));
        out += sizeof(u32);

        /* VHT MCS set */
        memcpy(out, &vht_cap->vht_mcs, sizeof(vht_cap->vht_mcs));
        out += sizeof(vht_cap->vht_mcs);

        return out;
}

/*
 * Length of the HE Capabilities element for @sdata, or 0 when the
 * interface has no band or no HE capability for its interface type.
 *
 * The MCS/NSS size is taken from the unadjusted capability element, so
 * this may return more than ieee80211_put_he_cap() will need, since that
 * function sizes the MCS/NSS part from a bandwidth-adjusted copy.
 */
u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_supported_band *sband = ieee80211_get_sband(sdata);
        const struct ieee80211_sta_he_cap *he_cap;
        size_t total;
        u8 mcs_nss_len;

        if (!sband)
                return 0;

        he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif);
        if (!he_cap)
                return 0;

        mcs_nss_len = ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem);

        /* element ID + length + extension ID, then the fixed part */
        total = 2 + 1 + sizeof(he_cap->he_cap_elem);
        total += mcs_nss_len;
        total += ieee80211_he_ppe_size(he_cap->ppe_thres[0],
                                       he_cap->he_cap_elem.phy_cap_info);

        return total;
}

/*
 * Copy the HE capability element of @he_cap into @elem and trim it to the
 * bandwidth limit in @conn:
 *  - the DCM max RU field (PHY cap byte 8) is capped to the RU size that
 *    corresponds to the limit,
 *  - below a 40 MHz limit the 40/80 MHz width bits and the "longer than
 *    16 SIGB OFDM symbols" bit are cleared,
 *  - below a 160 MHz limit the 160 and 80+80 MHz width bits and the
 *    "above 80 MHz" beamformee/STBC bits are cleared.
 */
static void
ieee80211_get_adjusted_he_cap(const struct ieee80211_conn_settings *conn,
                              const struct ieee80211_sta_he_cap *he_cap,
                              struct ieee80211_he_cap_elem *elem)
{
        u8 dcm_limit, dcm_ru;

        *elem = he_cap->he_cap_elem;

        /* largest DCM RU allowed under the bandwidth limit */
        switch (conn->bw_limit) {
        case IEEE80211_CONN_BW_LIMIT_20:
                dcm_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_242;
                break;
        case IEEE80211_CONN_BW_LIMIT_40:
                dcm_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_484;
                break;
        case IEEE80211_CONN_BW_LIMIT_80:
                dcm_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_996;
                break;
        default:
                dcm_limit = IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_2x996;
                break;
        }

        dcm_ru = elem->phy_cap_info[8] & IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK;
        if (dcm_ru > dcm_limit)
                dcm_ru = dcm_limit;

        elem->phy_cap_info[8] =
                (elem->phy_cap_info[8] &
                 ~IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK) | dcm_ru;

        if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_40) {
                elem->phy_cap_info[0] &=
                        ~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
                          IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G);
                elem->phy_cap_info[9] &=
                        ~IEEE80211_HE_PHY_CAP9_LONGER_THAN_16_SIGB_OFDM_SYM;
        }

        if (conn->bw_limit >= IEEE80211_CONN_BW_LIMIT_160)
                return;

        elem->phy_cap_info[0] &=
                ~(IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G |
                  IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G);
        elem->phy_cap_info[5] &=
                ~IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK;
        elem->phy_cap_info[7] &=
                ~(IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ |
                  IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ);
}

int ieee80211_put_he_cap(struct sk_buff *skb,
                         struct ieee80211_sub_if_data *sdata,
                         const struct ieee80211_supported_band *sband,
                         const struct ieee80211_conn_settings *conn)
{
        const struct ieee80211_sta_he_cap *he_cap;
        struct ieee80211_he_cap_elem elem;
        u8 *len;
        u8 n;
        u8 ie_len;

        if (!conn)
                conn = &ieee80211_conn_settings_unlimited;

        he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif);
        if (!he_cap)
                return 0;

        /* modify on stack first to calculate 'n' and 'ie_len' correctly */
        ieee80211_get_adjusted_he_cap(conn, he_cap, &elem);

        n = ieee80211_he_mcs_nss_size(&elem);
        ie_len = 2 + 1 +
                 sizeof(he_cap->he_cap_elem) + n +
                 ieee80211_he_ppe_size(he_cap->ppe_thres[0],
                                       he_cap->he_cap_elem.phy_cap_info);

        if (skb_tailroom(skb) < ie_len)
                return -ENOBUFS;

        skb_put_u8(skb, WLAN_EID_EXTENSION);
        len = skb_put(skb, 1); /* We'll set the size later below */
        skb_put_u8(skb, WLAN_EID_EXT_HE_CAPABILITY);

        /* Fixed data */
        skb_put_data(skb, &elem, sizeof(elem));

        skb_put_data(skb, &he_cap->he_mcs_nss_supp, n);

        /* Check if PPE Threshold should be present */
        if ((he_cap->he_cap_elem.phy_cap_info[6] &
             IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0)
                goto end;

        /*
         * Calculate how many PPET16/PPET8 pairs are to come. Algorithm:
         * (NSS_M1 + 1) x (num of 1 bits in RU_INDEX_BITMASK)
         */
        n = hweight8(he_cap->ppe_thres[0] &
                     IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK);
        n *= (1 + ((he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_NSS_MASK) >>
                   IEEE80211_PPE_THRES_NSS_POS));

        /*
         * Each pair is 6 bits, and we need to add the 7 "header" bits to the
         * total size.
         */
        n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7;
        n = DIV_ROUND_UP(n, 8);

        /* Copy PPE Thresholds */
        skb_put_data(skb, &he_cap->ppe_thres, n);

end:
        *len = skb_tail_pointer(skb) - len - 1;
        return 0;
}

/*
 * Append an HE 6 GHz Band Capabilities element to @skb.
 *
 * @skb: frame being built
 * @sdata: interface whose (P2P-aware) type selects the capabilities
 * @smps_mode: SMPS mode to encode in the SM power save field
 *
 * Nothing is added, and 0 is returned, when there is no usable 6 GHz
 * channel, no iftype data for the interface type, or no HE 6 GHz
 * capability. Returns -ENOBUFS if @skb lacks tailroom, 0 otherwise.
 */
int ieee80211_put_he_6ghz_cap(struct sk_buff *skb,
                              struct ieee80211_sub_if_data *sdata,
                              enum ieee80211_smps_mode smps_mode)
{
        struct ieee80211_supported_band *sband;
        const struct ieee80211_sband_iftype_data *iftd;
        enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif);
        __le16 sm_ps = 0;
        __le16 cap;

        if (!cfg80211_any_usable_channels(sdata->local->hw.wiphy,
                                          BIT(NL80211_BAND_6GHZ),
                                          IEEE80211_CHAN_NO_HE))
                return 0;

        sband = sdata->local->hw.wiphy->bands[NL80211_BAND_6GHZ];

        iftd = ieee80211_get_sband_iftype_data(sband, iftype);
        if (!iftd)
                return 0;

        /* Check for device HE 6 GHz capability before adding element */
        if (!iftd->he_6ghz_capa.capa)
                return 0;

        /* device capabilities with the SM power save field cleared */
        cap = iftd->he_6ghz_capa.capa;
        cap &= cpu_to_le16(~IEEE80211_HE_6GHZ_CAP_SM_PS);

        switch (smps_mode) {
        case IEEE80211_SMPS_AUTOMATIC:
        case IEEE80211_SMPS_NUM_MODES:
                /* not valid here; warn and treat as "off" */
                WARN_ON(1);
                fallthrough;
        case IEEE80211_SMPS_OFF:
                sm_ps = le16_encode_bits(WLAN_HT_CAP_SM_PS_DISABLED,
                                         IEEE80211_HE_6GHZ_CAP_SM_PS);
                break;
        case IEEE80211_SMPS_STATIC:
                sm_ps = le16_encode_bits(WLAN_HT_CAP_SM_PS_STATIC,
                                         IEEE80211_HE_6GHZ_CAP_SM_PS);
                break;
        case IEEE80211_SMPS_DYNAMIC:
                sm_ps = le16_encode_bits(WLAN_HT_CAP_SM_PS_DYNAMIC,
                                         IEEE80211_HE_6GHZ_CAP_SM_PS);
                break;
        }
        cap |= sm_ps;

        /* element ID + length + extension ID + capability field */
        if (skb_tailroom(skb) < 2 + 1 + sizeof(cap))
                return -ENOBUFS;

        skb_put_u8(skb, WLAN_EID_EXTENSION);
        skb_put_u8(skb, 1 + sizeof(cap));
        skb_put_u8(skb, WLAN_EID_EXT_HE_6GHZ_CAPA);
        skb_put_data(skb, &cap, sizeof(cap));

        return 0;
}

/*
 * Write an HT Operation element at @pos.
 *
 * @pos: output buffer; must have room for the 2-byte element header plus
 *       sizeof(struct ieee80211_ht_operation)
 * @ht_cap: HT capabilities; supplies the 20/40 width support bit and the
 *          MCS set copied into the basic set
 * @chandef: channel definition to describe
 * @prot_mode: value for the operation mode field
 * @rifs_mode: whether to set the RIFS mode bit
 *
 * Returns the position just past the element. For a 320 MHz @chandef the
 * function warns and returns right after the element header, with only
 * the primary channel written.
 */
u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
                               const struct cfg80211_chan_def *chandef,
                               u16 prot_mode, bool rifs_mode)
{
        struct ieee80211_ht_operation *oper;
        bool wide;

        /* Build HT Information */
        pos[0] = WLAN_EID_HT_OPERATION;
        pos[1] = sizeof(struct ieee80211_ht_operation);
        pos += 2;

        oper = (struct ieee80211_ht_operation *)pos;
        oper->primary_chan =
                ieee80211_frequency_to_channel(chandef->chan->center_freq);

        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_320:
                /* HT information element should not be included on 6GHz */
                WARN_ON(1);
                return pos;
        case NL80211_CHAN_WIDTH_160:
        case NL80211_CHAN_WIDTH_80P80:
        case NL80211_CHAN_WIDTH_80:
        case NL80211_CHAN_WIDTH_40:
                /* secondary channel sits on the side of the center */
                oper->ht_param =
                        chandef->center_freq1 > chandef->chan->center_freq ?
                                IEEE80211_HT_PARAM_CHA_SEC_ABOVE :
                                IEEE80211_HT_PARAM_CHA_SEC_BELOW;
                break;
        default:
                oper->ht_param = IEEE80211_HT_PARAM_CHA_SEC_NONE;
                break;
        }

        wide = (ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) &&
               chandef->width != NL80211_CHAN_WIDTH_20_NOHT &&
               chandef->width != NL80211_CHAN_WIDTH_20;
        if (wide)
                oper->ht_param |= IEEE80211_HT_PARAM_CHAN_WIDTH_ANY;

        if (rifs_mode)
                oper->ht_param |= IEEE80211_HT_PARAM_RIFS_MODE;

        oper->operation_mode = cpu_to_le16(prot_mode);
        oper->stbc_param = 0x0000;

        /*
         * The Basic MCS set and the Supported MCS set appear to be
         * identical for the first 10 bytes; the rest stays zero.
         */
        memset(&oper->basic_set, 0, 16);
        memcpy(&oper->basic_set, &ht_cap->mcs, 10);

        return pos + sizeof(struct ieee80211_ht_operation);
}

/*
 * Write a Wide Bandwidth Channel Switch element for @chandef at @pos.
 *
 * Exactly five bytes are written: element ID, length (3), new channel
 * width, and the two center frequency segment indices (segment 1 is 0
 * when @chandef has no center_freq2).
 */
void ieee80211_ie_build_wide_bw_cs(u8 *pos,
                                   const struct cfg80211_chan_def *chandef)
{
        u8 new_width;

        /* New channel width */
        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_80:
                new_width = IEEE80211_VHT_CHANWIDTH_80MHZ;
                break;
        case NL80211_CHAN_WIDTH_160:
                new_width = IEEE80211_VHT_CHANWIDTH_160MHZ;
                break;
        case NL80211_CHAN_WIDTH_80P80:
                new_width = IEEE80211_VHT_CHANWIDTH_80P80MHZ;
                break;
        case NL80211_CHAN_WIDTH_320:
                /* The behavior is not defined for 320 MHz channels */
                WARN_ON(1);
                fallthrough;
        default:
                new_width = IEEE80211_VHT_CHANWIDTH_USE_HT;
                break;
        }

        pos[0] = WLAN_EID_WIDE_BW_CHANNEL_SWITCH;       /* EID */
        pos[1] = 3;                                     /* IE length */
        pos[2] = new_width;
        /* new center frequency segment 0 */
        pos[3] = ieee80211_frequency_to_channel(chandef->center_freq1);
        /* new center frequency segment 1 */
        pos[4] = chandef->center_freq2 ?
                 ieee80211_frequency_to_channel(chandef->center_freq2) : 0;
}

/*
 * Write a VHT Operation element describing @chandef at @pos.
 *
 * 160 MHz and 80+80 MHz chandefs are encoded as channel width "80 MHz"
 * with two segment indices (interop workaround).  For a 320 MHz chandef
 * the function warns and returns the pointer just past the two header
 * bytes (the segment indices have already been written at that point);
 * otherwise it returns the pointer just past the whole element.
 * @vht_cap is not used by the body.
 */
u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
                                const struct cfg80211_chan_def *chandef)
{
        struct ieee80211_vht_operation *vht_oper;

        /* Element header: ID, then payload length */
        pos[0] = WLAN_EID_VHT_OPERATION;
        pos[1] = sizeof(*vht_oper);
        pos += 2;

        vht_oper = (struct ieee80211_vht_operation *)pos;
        vht_oper->center_freq_seg0_idx =
                ieee80211_frequency_to_channel(chandef->center_freq1);
        vht_oper->center_freq_seg1_idx = chandef->center_freq2 ?
                ieee80211_frequency_to_channel(chandef->center_freq2) : 0x00;

        if (chandef->width == NL80211_CHAN_WIDTH_320) {
                /* VHT information element should not be included on 6GHz */
                WARN_ON(1);
                return pos;
        }

        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_160:
                /*
                 * Convert 160 MHz channel width to new style as interop
                 * workaround: seg1 carries the 160 MHz center, seg0 the
                 * center of the 80 MHz half holding the primary channel.
                 */
                vht_oper->center_freq_seg1_idx = vht_oper->center_freq_seg0_idx;
                if (chandef->chan->center_freq < chandef->center_freq1)
                        vht_oper->center_freq_seg0_idx -= 8;
                else
                        vht_oper->center_freq_seg0_idx += 8;
                fallthrough;
        case NL80211_CHAN_WIDTH_80P80:
                /* 80+80 MHz is likewise signalled in the new style */
        case NL80211_CHAN_WIDTH_80:
                vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ;
                break;
        default:
                vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_USE_HT;
                break;
        }

        /* don't require special VHT peer rates */
        vht_oper->basic_mcs_set = cpu_to_le16(0xffff);

        return pos + sizeof(*vht_oper);
}

/*
 * Write an HE Operation element for @chandef at @pos and return the
 * pointer just past it.
 *
 * The fixed part is always emitted.  When the channel is in the 6 GHz
 * band the 6 GHz Operation Information field is appended and flagged in
 * the HE operation parameters.
 *
 * NOTE(review): for a 320 MHz chandef on 6 GHz the function warns and
 * leaves he_6ghz_op->control unwritten.
 */
u8 *ieee80211_ie_build_he_oper(u8 *pos, struct cfg80211_chan_def *chandef)
{
        const bool on_6ghz = chandef->chan->band == NL80211_BAND_6GHZ;
        struct ieee80211_he_operation *he_oper;
        struct ieee80211_he_6ghz_oper *he_6ghz_op;
        u32 he_oper_params;
        u8 ie_len = 1 + sizeof(*he_oper);

        if (on_6ghz)
                ie_len += sizeof(*he_6ghz_op);

        /* Extension element header */
        pos[0] = WLAN_EID_EXTENSION;
        pos[1] = ie_len;
        pos[2] = WLAN_EID_EXT_HE_OPERATION;
        pos += 3;

        he_oper_params =
                u32_encode_bits(1023, /* disabled */
                                IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK) |
                u32_encode_bits(1, IEEE80211_HE_OPERATION_ER_SU_DISABLE) |
                u32_encode_bits(1, IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED);
        if (on_6ghz)
                he_oper_params |=
                        u32_encode_bits(1, IEEE80211_HE_OPERATION_6GHZ_OP_INFO);

        he_oper = (struct ieee80211_he_operation *)pos;
        he_oper->he_oper_params = cpu_to_le32(he_oper_params);

        /* don't require special HE peer rates */
        he_oper->he_mcs_nss_set = cpu_to_le16(0xffff);
        pos += sizeof(*he_oper);

        /* Outside 6 GHz the element ends with the fixed part */
        if (!on_6ghz)
                return pos;

        /* TODO add VHT operational */
        he_6ghz_op = (struct ieee80211_he_6ghz_oper *)pos;
        he_6ghz_op->minrate = 6; /* 6 Mbps */
        he_6ghz_op->primary =
                ieee80211_frequency_to_channel(chandef->chan->center_freq);
        he_6ghz_op->ccfs0 =
                ieee80211_frequency_to_channel(chandef->center_freq1);
        he_6ghz_op->ccfs1 = chandef->center_freq2 ?
                ieee80211_frequency_to_channel(chandef->center_freq2) : 0;

        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_320:
                /*
                 * TODO: mesh operation is not defined over 6GHz 320 MHz
                 * channels.
                 */
                WARN_ON(1);
                break;
        case NL80211_CHAN_WIDTH_160:
                /* Convert 160 MHz channel width to new style as interop
                 * workaround; the control value is set by the shared
                 * 160/80+80 case below.
                 */
                he_6ghz_op->ccfs1 = he_6ghz_op->ccfs0;
                if (chandef->chan->center_freq < chandef->center_freq1)
                        he_6ghz_op->ccfs0 -= 8;
                else
                        he_6ghz_op->ccfs0 += 8;
                fallthrough;
        case NL80211_CHAN_WIDTH_80P80:
                he_6ghz_op->control =
                        IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ;
                break;
        case NL80211_CHAN_WIDTH_80:
                he_6ghz_op->control =
                        IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ;
                break;
        case NL80211_CHAN_WIDTH_40:
                he_6ghz_op->control =
                        IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ;
                break;
        default:
                he_6ghz_op->control =
                        IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ;
                break;
        }

        return pos + sizeof(*he_6ghz_op);
}

/*
 * Write an EHT Operation element for @chandef at @pos and return the
 * pointer just past it.
 *
 * The element consists of the extension header, the fixed part of
 * struct ieee80211_eht_operation (up to, not including, ->optional) and
 * the fixed part of struct ieee80211_eht_operation_info.  The basic
 * MCS/NSS set is copied from the 20 MHz-only set in @eht_cap.
 *
 * The params octet is assigned rather than OR-ed: @pos points at
 * caller-provided buffer memory that this function does not otherwise
 * initialise, so OR-ing would let stale buffer contents leak into the
 * transmitted parameters.
 */
u8 *ieee80211_ie_build_eht_oper(u8 *pos, struct cfg80211_chan_def *chandef,
                                const struct ieee80211_sta_eht_cap *eht_cap)
{
        const struct ieee80211_eht_mcs_nss_supp_20mhz_only *eht_mcs_nss =
                                        &eht_cap->eht_mcs_nss_supp.only_20mhz;
        struct ieee80211_eht_operation *eht_oper;
        struct ieee80211_eht_operation_info *eht_oper_info;
        u8 eht_oper_len = offsetof(struct ieee80211_eht_operation, optional);
        u8 eht_oper_info_len =
                offsetof(struct ieee80211_eht_operation_info, optional);
        u8 chan_width = 0;

        /* Extension element header; length counts the extension ID too */
        *pos++ = WLAN_EID_EXTENSION;
        *pos++ = 1 + eht_oper_len + eht_oper_info_len;
        *pos++ = WLAN_EID_EXT_EHT_OPERATION;

        eht_oper = (struct ieee80211_eht_operation *)pos;

        memcpy(&eht_oper->basic_mcs_nss, eht_mcs_nss, sizeof(*eht_mcs_nss));
        /* Only the "operation info present" bit is set; all others are 0 */
        eht_oper->params = IEEE80211_EHT_OPER_INFO_PRESENT;
        pos += eht_oper_len;

        /* The operation information starts right after the fixed part */
        eht_oper_info =
                (struct ieee80211_eht_operation_info *)eht_oper->optional;

        eht_oper_info->ccfs0 =
                ieee80211_frequency_to_channel(chandef->center_freq1);
        if (chandef->center_freq2)
                eht_oper_info->ccfs1 =
                        ieee80211_frequency_to_channel(chandef->center_freq2);
        else
                eht_oper_info->ccfs1 = 0;

        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_320:
                /*
                 * ccfs1 takes the center of the whole 320 MHz channel;
                 * ccfs0 is moved 16 channel numbers towards the primary
                 * channel's side.
                 */
                chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ;
                eht_oper_info->ccfs1 = eht_oper_info->ccfs0;
                if (chandef->chan->center_freq < chandef->center_freq1)
                        eht_oper_info->ccfs0 -= 16;
                else
                        eht_oper_info->ccfs0 += 16;
                break;
        case NL80211_CHAN_WIDTH_160:
                /* Same scheme as 320 MHz, with an offset of 8 */
                eht_oper_info->ccfs1 = eht_oper_info->ccfs0;
                if (chandef->chan->center_freq < chandef->center_freq1)
                        eht_oper_info->ccfs0 -= 8;
                else
                        eht_oper_info->ccfs0 += 8;
                fallthrough;
        case NL80211_CHAN_WIDTH_80P80:
                chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ;
                break;
        case NL80211_CHAN_WIDTH_80:
                chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ;
                break;
        case NL80211_CHAN_WIDTH_40:
                chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ;
                break;
        default:
                chan_width = IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ;
                break;
        }
        eht_oper_info->control = chan_width;
        pos += eht_oper_info_len;

        /* TODO: eht_oper_info->optional */

        return pos;
}

/*
 * Re-create @chandef on its current channel using the secondary channel
 * offset signalled in @ht_oper.
 *
 * Returns false (leaving @chandef untouched) when @ht_oper is NULL or
 * the offset field holds a value other than none/above/below; returns
 * true after @chandef has been rebuilt.
 */
bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper,
                               struct cfg80211_chan_def *chandef)
{
        enum nl80211_channel_type channel_type;
        int sec_offset;

        if (!ht_oper)
                return false;

        sec_offset = ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET;

        if (sec_offset == IEEE80211_HT_PARAM_CHA_SEC_NONE)
                channel_type = NL80211_CHAN_HT20;
        else if (sec_offset == IEEE80211_HT_PARAM_CHA_SEC_ABOVE)
                channel_type = NL80211_CHAN_HT40PLUS;
        else if (sec_offset == IEEE80211_HT_PARAM_CHA_SEC_BELOW)
                channel_type = NL80211_CHAN_HT40MINUS;
        else
                return false;

        cfg80211_chandef_create(chandef, chandef->chan, channel_type);
        return true;
}

/*
 * Derive a VHT channel definition from a peer's VHT/HT operation elements.
 *
 * @hw: local hardware; its VHT capabilities for the chandef's band and its
 *      SUPPORTS_VHT_EXT_NSS_BW flag limit which widths are accepted
 * @vht_cap_info: VHT capability info word, from which the supported channel
 *      width and extended NSS BW support fields are extracted
 * @oper: VHT operation element (may be NULL)
 * @htop: HT operation element, source of CCFS2 (may be NULL)
 * @chandef: in/out; supplies the channel/band and is overwritten only on
 *      success
 *
 * Returns false if @oper or @htop is NULL, if the signalled channel width is
 * unknown, or if the resulting chandef fails cfg80211_chandef_valid(); in
 * those cases @chandef is left unmodified.  Returns true otherwise.
 */
bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info,
                                const struct ieee80211_vht_operation *oper,
                                const struct ieee80211_ht_operation *htop,
                                struct cfg80211_chan_def *chandef)
{
        /* work on a copy so that failures leave *chandef untouched */
        struct cfg80211_chan_def new = *chandef;
        int cf0, cf1;
        int ccfs0, ccfs1, ccfs2;
        int ccf0, ccf1;
        u32 vht_cap;
        bool support_80_80 = false;
        bool support_160 = false;
        u8 ext_nss_bw_supp = u32_get_bits(vht_cap_info,
                                          IEEE80211_VHT_CAP_EXT_NSS_BW_MASK);
        u8 supp_chwidth = u32_get_bits(vht_cap_info,
                                       IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK);

        if (!oper || !htop)
                return false;

        /* local capabilities for the band the chandef's channel is in */
        vht_cap = hw->wiphy->bands[chandef->chan->band]->vht_cap.cap;
        support_160 = (vht_cap & (IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK |
                                  IEEE80211_VHT_CAP_EXT_NSS_BW_MASK));
        support_80_80 = ((vht_cap &
                         IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) ||
                        (vht_cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ &&
                         vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) ||
                        ((vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) >>
                                    IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT > 1));
        /* segment indices from the VHT element, CCFS2 from the HT element */
        ccfs0 = oper->center_freq_seg0_idx;
        ccfs1 = oper->center_freq_seg1_idx;
        ccfs2 = (le16_to_cpu(htop->operation_mode) &
                                IEEE80211_HT_OP_MODE_CCFS2_MASK)
                        >> IEEE80211_HT_OP_MODE_CCFS2_SHIFT;

        ccf0 = ccfs0;

        /* if not supported, parse as though we didn't understand it */
        if (!ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW))
                ext_nss_bw_supp = 0;

        /*
         * Cf. IEEE 802.11 Table 9-250
         *
         * We really just consider that because it's inefficient to connect
         * at a higher bandwidth than we'll actually be able to use.
         *
         * The switch key packs the supported channel width field into the
         * high nibble and the extended NSS BW support field into the low
         * nibble; each case selects which signalled index serves as ccf1.
         */
        switch ((supp_chwidth << 4) | ext_nss_bw_supp) {
        default:
        case 0x00:
                ccf1 = 0;
                support_160 = false;
                support_80_80 = false;
                break;
        case 0x01:
                support_80_80 = false;
                fallthrough;
        case 0x02:
        case 0x03:
                ccf1 = ccfs2;
                break;
        case 0x10:
                ccf1 = ccfs1;
                break;
        case 0x11:
        case 0x12:
                /* prefer CCFS1, fall back to CCFS2 when CCFS1 is zero */
                if (!ccfs1)
                        ccf1 = ccfs2;
                else
                        ccf1 = ccfs1;
                break;
        case 0x13:
        case 0x20:
        case 0x23:
                ccf1 = ccfs1;
                break;
        }

        /* translate the channel indices into center frequencies */
        cf0 = ieee80211_channel_to_frequency(ccf0, chandef->chan->band);
        cf1 = ieee80211_channel_to_frequency(ccf1, chandef->chan->band);

        switch (oper->chan_width) {
        case IEEE80211_VHT_CHANWIDTH_USE_HT:
                /* just use HT information directly */
                break;
        case IEEE80211_VHT_CHANWIDTH_80MHZ:
                new.width = NL80211_CHAN_WIDTH_80;
                new.center_freq1 = cf0;
                /* If needed, adjust based on the newer interop workaround. */
                if (ccf1) {
                        unsigned int diff;

                        /*
                         * Segments exactly 8 channel numbers apart are
                         * treated as 160 MHz, anything further apart as
                         * 80+80 MHz - each only if supported.
                         */
                        diff = abs(ccf1 - ccf0);
                        if ((diff == 8) && support_160) {
                                new.width = NL80211_CHAN_WIDTH_160;
                                new.center_freq1 = cf1;
                        } else if ((diff > 8) && support_80_80) {
                                new.width = NL80211_CHAN_WIDTH_80P80;
                                new.center_freq2 = cf1;
                        }
                }
                break;
        case IEEE80211_VHT_CHANWIDTH_160MHZ:
                /* deprecated encoding */
                new.width = NL80211_CHAN_WIDTH_160;
                new.center_freq1 = cf0;
                break;
        case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
                /* deprecated encoding */
                new.width = NL80211_CHAN_WIDTH_80P80;
                new.center_freq1 = cf0;
                new.center_freq2 = cf1;
                break;
        default:
                return false;
        }

        if (!cfg80211_chandef_valid(&new))
                return false;

        /* commit only once the result has been validated */
        *chandef = new;
        return true;
}

/*
 * Update @chandef's width and center_freq1 from EHT operation @info.
 *
 * center_freq1 is derived from CCFS0 for 20/40/80 MHz and from CCFS1 for
 * 160/320 MHz, using the band of @chandef's channel.  If the width field
 * holds a value not handled below, the width is left unchanged and
 * center_freq1 is derived from CCFS0.
 */
void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation_info *info,
                                struct cfg80211_chan_def *chandef)
{
        /* channel index that ends up describing center_freq1 */
        int center_idx = info->ccfs0;

        switch (u8_get_bits(info->control,
                            IEEE80211_EHT_OPER_CHAN_WIDTH)) {
        case IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ:
                chandef->width = NL80211_CHAN_WIDTH_20;
                break;
        case IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ:
                chandef->width = NL80211_CHAN_WIDTH_40;
                break;
        case IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ:
                chandef->width = NL80211_CHAN_WIDTH_80;
                break;
        case IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ:
                chandef->width = NL80211_CHAN_WIDTH_160;
                center_idx = info->ccfs1;
                break;
        case IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ:
                chandef->width = NL80211_CHAN_WIDTH_320;
                center_idx = info->ccfs1;
                break;
        }

        chandef->center_freq1 =
                ieee80211_channel_to_frequency(center_idx,
                                               chandef->chan->band);
}

bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local,
                                    const struct ieee80211_he_operation *he_oper,
                                    const struct ieee80211_eht_operation *eht_oper,
                                    struct cfg80211_chan_def *chandef)
{
        struct cfg80211_chan_def he_chandef = *chandef;
        const struct ieee80211_he_6ghz_oper *he_6ghz_oper;
        u32 freq;

        if (chandef->chan->band != NL80211_BAND_6GHZ)
                return true;

        if (!he_oper)
                return false;

        he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper);
        if (!he_6ghz_oper)
                return false;

        /*
         * The EHT operation IE does not contain the primary channel so the
         * primary channel frequency should be taken from the 6 GHz operation
         * information.
         */
        freq = ieee80211_channel_to_frequency(he_6ghz_oper->primary,
                                              NL80211_BAND_6GHZ);
        he_chandef.chan = ieee80211_get_channel(local->hw.wiphy, freq);

        if (!he_chandef.chan)
                return false;

        if (!eht_oper ||
            !(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT)) {
                switch (u8_get_bits(he_6ghz_oper->control,
                                    IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH)) {
                case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ:
                        he_chandef.width = NL80211_CHAN_WIDTH_20;
                        break;
                case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ:
                        he_chandef.width = NL80211_CHAN_WIDTH_40;
                        break;
                case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ:
                        he_chandef.width = NL80211_CHAN_WIDTH_80;
                        break;
                case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ:
                        he_chandef.width = NL80211_CHAN_WIDTH_80;
                        if (!he_6ghz_oper->ccfs1)
                                break;
                        if (abs(he_6ghz_oper->ccfs1 - he_6ghz_oper->ccfs0) == 8)
                                he_chandef.width = NL80211_CHAN_WIDTH_160;
                        else
                                he_chandef.width = NL80211_CHAN_WIDTH_80P80;
                        break;
                }

                if (he_chandef.width == NL80211_CHAN_WIDTH_160) {
                        he_chandef.center_freq1 =
                                ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1,
                                                               NL80211_BAND_6GHZ);
                } else {
                        he_chandef.center_freq1 =
                                ieee80211_channel_to_frequency(he_6ghz_oper->ccfs0,
                                                               NL80211_BAND_6GHZ);
                        he_chandef.center_freq2 =
                                ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1,
                                                               NL80211_BAND_6GHZ);
                }
        } else {
                ieee80211_chandef_eht_oper((const void *)eht_oper->optional,
                                           &he_chandef);
                he_chandef.punctured =
                        ieee80211_eht_oper_dis_subchan_bitmap(eht_oper);
        }

        if (!cfg80211_chandef_valid(&he_chandef))
                return false;

        *chandef = he_chandef;

        return true;
}

bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper,
                                struct cfg80211_chan_def *chandef)
{
        u32 oper_freq;

        if (!oper)
                return false;

        switch (FIELD_GET(S1G_OPER_CH_WIDTH_OPER, oper->ch_width)) {
        case IEEE80211_S1G_CHANWIDTH_1MHZ:
                chandef->width = NL80211_CHAN_WIDTH_1;
                break;
        case IEEE80211_S1G_CHANWIDTH_2MHZ:
                chandef->width = NL80211_CHAN_WIDTH_2;
                break;
        case IEEE80211_S1G_CHANWIDTH_4MHZ:
                chandef->width = NL80211_CHAN_WIDTH_4;
                break;
        case IEEE80211_S1G_CHANWIDTH_8MHZ:
                chandef->width = NL80211_CHAN_WIDTH_8;
                break;
        case IEEE80211_S1G_CHANWIDTH_16MHZ:
                chandef->width = NL80211_CHAN_WIDTH_16;
                break;
        default:
                return false;
        }

        oper_freq = ieee80211_channel_to_freq_khz(oper->oper_ch,
                                                  NL80211_BAND_S1GHZ);
        chandef->center_freq1 = KHZ_TO_MHZ(oper_freq);
        chandef->freq1_offset = oper_freq % 1000;

        return true;
}

/*
 * Append a rates element with ID @element_id to @skb.
 *
 * Only bitrates of @sband that carry all of @rate_flags and are not set in
 * @masked_rates are considered.  For WLAN_EID_SUPP_RATES the first (up to)
 * eight such rates are emitted; for any other @element_id the first eight
 * are skipped and the rest emitted, with nothing written at all if there
 * are eight or fewer.  Rates set in @basic_rates get bit 0x80.
 *
 * Returns 0 on success (including when nothing was written) and -ENOBUFS
 * when @skb lacks tailroom for the element.
 */
int ieee80211_put_srates_elem(struct sk_buff *skb,
                              const struct ieee80211_supported_band *sband,
                              u32 basic_rates, u32 rate_flags, u32 masked_rates,
                              u8 element_id)
{
        u8 i, rates = 0, skip = 0;

        /* First pass: count the eligible rates */
        for (i = 0; i < sband->n_bitrates; i++) {
                bool flags_ok =
                        (rate_flags & sband->bitrates[i].flags) == rate_flags;

                if (flags_ok && !(masked_rates & BIT(i)))
                        rates++;
        }

        if (element_id != WLAN_EID_SUPP_RATES) {
                /* the first eight rates belong to the other element */
                skip = 8;
                if (rates <= skip)
                        return 0;
                rates -= skip;
        } else if (rates > 8) {
                rates = 8;
        }

        if (skb_tailroom(skb) < rates + 2)
                return -ENOBUFS;

        skb_put_u8(skb, element_id);
        skb_put_u8(skb, rates);

        /* Second pass: emit rates until the announced count is reached */
        for (i = 0; rates && i < sband->n_bitrates; i++) {
                int rate;
                u8 basic;

                if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
                        continue;
                if (masked_rates & BIT(i))
                        continue;

                if (skip > 0) {
                        skip--;
                        continue;
                }

                basic = basic_rates & BIT(i) ? 0x80 : 0;
                rate = DIV_ROUND_UP(sband->bitrates[i].bitrate, 5);

                skb_put_u8(skb, basic | (u8)rate);
                rates--;
        }

        WARN(rates > 0, "rates confused: rates:%d, element:%d\n",
             rates, element_id);

        return 0;
}

/*
 * Return the negated averaged beacon signal value of @vif's default link.
 *
 * Only valid for station interfaces; for any other interface type a
 * one-time warning is emitted and 0 is returned.
 */
int ieee80211_ave_rssi(struct ieee80211_vif *vif)
{
        struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
        unsigned long avg;

        if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION))
                return 0;

        /* the average is kept as a positive magnitude; flip the sign */
        avg = ewma_beacon_signal_read(&sdata->deflink.u.mgd.ave_beacon_signal);

        return -avg;
}
EXPORT_SYMBOL_GPL(ieee80211_ave_rssi);

/*
 * Derive a chain count (1..4) from the RX MCS mask in @mcs: the highest
 * non-zero rx_mask byte among indices 3..1 decides, giving index + 1.
 * Returns 1 when @mcs is NULL or none of those bytes is set.
 */
u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs)
{
        int idx;

        if (!mcs)
                return 1;

        /* TODO: consider rx_highest */

        for (idx = 3; idx >= 1; idx--) {
                if (mcs->rx_mask[idx])
                        return idx + 1;
        }

        return 1;
}

/**
 * ieee80211_calculate_rx_timestamp - calculate timestamp in frame
 * @local: mac80211 hw info struct
 * @status: RX status
 * @mpdu_len: total MPDU length (including FCS)
 * @mpdu_offset: offset into MPDU to calculate timestamp at
 *
 * This function calculates the RX timestamp at the given MPDU offset, taking
 * into account what the RX timestamp was. An offset of 0 will just normalize
 * the timestamp to TSF at beginning of MPDU reception.
 *
 * Returns: the calculated timestamp, or 0 (after a warning) if @status
 * carries no usable RX timestamp or the bitrate cannot be determined
 */
u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
                                     struct ieee80211_rx_status *status,
                                     unsigned int mpdu_len,
                                     unsigned int mpdu_offset)
{
        u64 ts = status->mactime;
        bool mactime_plcp_start;
        struct rate_info ri;
        u16 rate;
        u8 n_ltf;

        if (WARN_ON(!ieee80211_have_rx_timestamp(status)))
                return 0;

        /*
         * When the reported time refers to the PLCP start, each encoding
         * below advances ts past the preamble/header and, for most of them,
         * bumps mpdu_offset by 2.
         */
        mactime_plcp_start = (status->flag & RX_FLAG_MACTIME) ==
                                RX_FLAG_MACTIME_PLCP_START;

        memset(&ri, 0, sizeof(ri));

        ri.bw = status->bw;

        /* Fill cfg80211 rate info */
        switch (status->encoding) {
        case RX_ENC_EHT:
                ri.flags |= RATE_INFO_FLAGS_EHT_MCS;
                ri.mcs = status->rate_idx;
                ri.nss = status->nss;
                ri.eht_ru_alloc = status->eht.ru;
                if (status->enc_flags & RX_ENC_FLAG_SHORT_GI)
                        ri.flags |= RATE_INFO_FLAGS_SHORT_GI;
                /* TODO/FIXME: is this right? handle other PPDUs */
                if (mactime_plcp_start) {
                        mpdu_offset += 2;
                        ts += 36;
                }
                break;
        case RX_ENC_HE:
                ri.flags |= RATE_INFO_FLAGS_HE_MCS;
                ri.mcs = status->rate_idx;
                ri.nss = status->nss;
                ri.he_ru_alloc = status->he_ru;
                if (status->enc_flags & RX_ENC_FLAG_SHORT_GI)
                        ri.flags |= RATE_INFO_FLAGS_SHORT_GI;

                /*
                 * See P802.11ax_D6.0, section 27.3.4 for
                 * HE PPDU format.
                 */
                if (mactime_plcp_start) {
                        mpdu_offset += 2;
                        ts += 36;

                        /*
                         * TODO:
                         * For HE MU PPDU, add the HE-SIG-B.
                         * For HE ER PPDU, add 8us for the HE-SIG-A.
                         * For HE TB PPDU, add 4us for the HE-STF.
                         * Add the HE-LTF durations - variable.
                         */
                }

                break;
        case RX_ENC_HT:
                ri.mcs = status->rate_idx;
                ri.flags |= RATE_INFO_FLAGS_MCS;
                if (status->enc_flags & RX_ENC_FLAG_SHORT_GI)
                        ri.flags |= RATE_INFO_FLAGS_SHORT_GI;

                /*
                 * See P802.11REVmd_D3.0, section 19.3.2 for
                 * HT PPDU format.
                 */
                if (mactime_plcp_start) {
                        mpdu_offset += 2;
                        /* greenfield frames get a shorter fixed part */
                        if (status->enc_flags & RX_ENC_FLAG_HT_GF)
                                ts += 24;
                        else
                                ts += 32;

                        /*
                         * Add Data HT-LTFs per streams
                         * TODO: add Extension HT-LTFs, 4us per LTF
                         *
                         * The count is taken from bits 3-4 of the MCS index
                         * plus one, with a count of 3 rounded up to 4.
                         */
                        n_ltf = ((ri.mcs >> 3) & 3) + 1;
                        n_ltf = n_ltf == 3 ? 4 : n_ltf;
                        ts += n_ltf * 4;
                }

                break;
        case RX_ENC_VHT:
                ri.flags |= RATE_INFO_FLAGS_VHT_MCS;
                ri.mcs = status->rate_idx;
                ri.nss = status->nss;
                if (status->enc_flags & RX_ENC_FLAG_SHORT_GI)
                        ri.flags |= RATE_INFO_FLAGS_SHORT_GI;

                /*
                 * See P802.11REVmd_D3.0, section 21.3.2 for
                 * VHT PPDU format.
                 */
                if (mactime_plcp_start) {
                        mpdu_offset += 2;
                        ts += 36;

                        /*
                         * Add VHT-LTFs per streams
                         * (odd stream counts other than 1 are rounded up
                         * to the next even number)
                         */
                        n_ltf = (ri.nss != 1) && (ri.nss % 2) ?
                                ri.nss + 1 : ri.nss;
                        ts += 4 * n_ltf;
                }

                break;
        default:
                /* unknown encoding: warn, then treat it as legacy */
                WARN_ON(1);
                fallthrough;
        case RX_ENC_LEGACY: {
                struct ieee80211_supported_band *sband;

                sband = local->hw.wiphy->bands[status->band];
                ri.legacy = sband->bitrates[status->rate_idx].bitrate;

                if (mactime_plcp_start) {
                        /*
                         * 5 GHz gets a fixed 20 plus the 2-byte offset;
                         * other bands get 96 or 192 depending on the
                         * short-preamble flag.
                         */
                        if (status->band == NL80211_BAND_5GHZ) {
                                ts += 20;
                                mpdu_offset += 2;
                        } else if (status->enc_flags & RX_ENC_FLAG_SHORTPRE) {
                                ts += 96;
                        } else {
                                ts += 192;
                        }
                }
                break;
                }
        }

        rate = cfg80211_calculate_bitrate(&ri);
        if (WARN_ONCE(!rate,
                      "Invalid bitrate: flags=0x%llx, idx=%d, vht_nss=%d\n",
                      (unsigned long long)status->flag, status->rate_idx,
                      status->nss))
                return 0;

        /*
         * NOTE(review): the "* 8 * 10 / rate" conversions below presumably
         * rely on rate being in units of 100 kbit/s, turning a byte count
         * into microseconds - confirm against cfg80211_calculate_bitrate().
         */

        /* rewind from end of MPDU */
        if ((status->flag & RX_FLAG_MACTIME) == RX_FLAG_MACTIME_END)
                ts -= mpdu_len * 8 * 10 / rate;

        /* advance to the requested offset within the MPDU */
        ts += mpdu_offset * 8 * 10 / rate;

        return ts;
}

/*
 * Abort any channel availability check on all interfaces of @local.
 *
 * For every interface the CAC timer work is cancelled; where a CAC had
 * actually been started, the default link's channel is released and
 * cfg80211 is told that the CAC was aborted.  Must be called with the
 * wiphy lock held (asserted via lockdep).
 */
void ieee80211_dfs_cac_cancel(struct ieee80211_local *local)
{
        struct wiphy *wiphy = local->hw.wiphy;
        struct ieee80211_sub_if_data *sdata;

        lockdep_assert_wiphy(wiphy);

        list_for_each_entry(sdata, &local->interfaces, list) {
                struct cfg80211_chan_def chandef;

                wiphy_delayed_work_cancel(wiphy, &sdata->dfs_cac_timer_work);

                if (!sdata->wdev.cac_started)
                        continue;

                /* snapshot the chandef before the channel is released */
                chandef = sdata->vif.bss_conf.chanreq.oper;
                ieee80211_link_release_channel(&sdata->deflink);
                cfg80211_cac_event(sdata->dev, &chandef,
                                   NL80211_RADAR_CAC_ABORTED, GFP_KERNEL);
        }
}

/*
 * Work handler run after radar has been detected.
 *
 * Cancels any ongoing CAC and reports the radar event to cfg80211 for the
 * chandef of the last channel context that is not replacing another one,
 * falling back to the hw configuration's chandef when no such context
 * exists.  With more than one such context it only warns, since
 * multi-channel operation is not supported here yet.
 */
void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy,
                                       struct wiphy_work *work)
{
        struct ieee80211_local *local =
                container_of(work, struct ieee80211_local, radar_detected_work);
        struct cfg80211_chan_def chandef = local->hw.conf.chandef;
        struct ieee80211_chanctx *ctx;
        int num_chanctx = 0;

        lockdep_assert_wiphy(local->hw.wiphy);

        list_for_each_entry(ctx, &local->chanctx_list, list) {
                if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) {
                        chandef = ctx->conf.def;
                        num_chanctx++;
                }
        }

        ieee80211_dfs_cac_cancel(local);

        if (num_chanctx > 1) {
                /* XXX: multi-channel is not supported yet */
                WARN_ON(1);
                return;
        }

        cfg80211_radar_event(local->hw.wiphy, &chandef, GFP_KERNEL);
}

/*
 * Exported entry point for reporting a detected radar on @hw: emits the
 * trace event and queues the radar-detected work on the wiphy so the
 * actual handling runs from the work item.
 */
void ieee80211_radar_detected(struct ieee80211_hw *hw)
{
        struct ieee80211_local *local = hw_to_local(hw);
        struct wiphy_work *radar_work = &local->radar_detected_work;

        trace_api_radar_detected(local);

        wiphy_work_queue(hw->wiphy, radar_work);
}
EXPORT_SYMBOL(ieee80211_radar_detected);

/*
 * Step a channel definition down to the next narrower configuration.
 *
 * @c: channel definition, modified in place
 * @conn: connection settings whose mode and bw_limit are lowered along with
 *        the width; may be NULL if the caller doesn't care
 *
 * One step is 320 -> 160 -> 80 -> 40 -> 20 -> 20 (no HT), and 80+80 -> 80.
 * If the result is not a valid chandef while puncturing is still set, the
 * step is repeated.  Widths that cannot be downgraded (20 no-HT, 5/10 MHz,
 * the 1/2/4/8/16 MHz widths, or anything unknown) trigger a one-time warning.
 */
void ieee80211_chandef_downgrade(struct cfg80211_chan_def *c,
                                 struct ieee80211_conn_settings *conn)
{
        enum nl80211_chan_width new_primary_width;
        struct ieee80211_conn_settings _ignored = {};

        /* allow passing NULL if caller doesn't care */
        if (!conn)
                conn = &_ignored;

again:
        /* no-HT indicates nothing to do */
        new_primary_width = NL80211_CHAN_WIDTH_20_NOHT;

        switch (c->width) {
        default:
        case NL80211_CHAN_WIDTH_20_NOHT:
                WARN_ON_ONCE(1);
                fallthrough;
        case NL80211_CHAN_WIDTH_20:
                c->width = NL80211_CHAN_WIDTH_20_NOHT;
                conn->mode = IEEE80211_CONN_MODE_LEGACY;
                conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20;
                c->punctured = 0;
                break;
        case NL80211_CHAN_WIDTH_40:
                /* the 20 MHz channel is centered on the control channel */
                c->width = NL80211_CHAN_WIDTH_20;
                c->center_freq1 = c->chan->center_freq;
                if (conn->mode == IEEE80211_CONN_MODE_VHT)
                        conn->mode = IEEE80211_CONN_MODE_HT;
                conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20;
                c->punctured = 0;
                break;
        case NL80211_CHAN_WIDTH_80:
                /* width and center_freq1 are updated after the switch */
                new_primary_width = NL80211_CHAN_WIDTH_40;
                if (conn->mode == IEEE80211_CONN_MODE_VHT)
                        conn->mode = IEEE80211_CONN_MODE_HT;
                conn->bw_limit = IEEE80211_CONN_BW_LIMIT_40;
                break;
        case NL80211_CHAN_WIDTH_80P80:
                /* drop the second segment, keep the first 80 MHz one */
                c->center_freq2 = 0;
                c->width = NL80211_CHAN_WIDTH_80;
                conn->bw_limit = IEEE80211_CONN_BW_LIMIT_80;
                break;
        case NL80211_CHAN_WIDTH_160:
                new_primary_width = NL80211_CHAN_WIDTH_80;
                conn->bw_limit = IEEE80211_CONN_BW_LIMIT_80;
                break;
        case NL80211_CHAN_WIDTH_320:
                new_primary_width = NL80211_CHAN_WIDTH_160;
                conn->bw_limit = IEEE80211_CONN_BW_LIMIT_160;
                break;
        case NL80211_CHAN_WIDTH_1:
        case NL80211_CHAN_WIDTH_2:
        case NL80211_CHAN_WIDTH_4:
        case NL80211_CHAN_WIDTH_8:
        case NL80211_CHAN_WIDTH_16:
                WARN_ON_ONCE(1);
                /* keep c->width */
                conn->mode = IEEE80211_CONN_MODE_S1G;
                conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20;
                break;
        case NL80211_CHAN_WIDTH_5:
        case NL80211_CHAN_WIDTH_10:
                WARN_ON_ONCE(1);
                /* keep c->width */
                conn->mode = IEEE80211_CONN_MODE_LEGACY;
                conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20;
                break;
        }

        /*
         * For the cases that only recorded a new primary width, derive the
         * new center frequency (and puncturing bitmap) from the old chandef
         * before overwriting the width.
         */
        if (new_primary_width != NL80211_CHAN_WIDTH_20_NOHT) {
                c->center_freq1 = cfg80211_chandef_primary(c, new_primary_width,
                                                           &c->punctured);
                c->width = new_primary_width;
        }

        /*
         * With an 80 MHz channel, we might have the puncturing in the primary
         * 40 MHz channel, but that's not valid when downgraded to 40 MHz width.
         * In that case, downgrade again.
         */
        if (!cfg80211_chandef_valid(c) && c->punctured)
                goto again;

        WARN_ON_ONCE(!cfg80211_chandef_valid(c));
}

/*
 * Tell whether going from smps_mode_old to smps_mode_new is a strictly
 * more restrictive SMPS setting (OFF -> anything else, DYNAMIC -> STATIC).
 *
 * IEEE80211_SMPS_AUTOMATIC is not accepted for either argument; it yields
 * false together with a one-time warning.
 */
bool ieee80211_smps_is_restrictive(enum ieee80211_smps_mode smps_mode_old,
                                   enum ieee80211_smps_mode smps_mode_new)
{
        bool has_automatic = smps_mode_old == IEEE80211_SMPS_AUTOMATIC ||
                             smps_mode_new == IEEE80211_SMPS_AUTOMATIC;

        if (WARN_ON_ONCE(has_automatic))
                return false;

        /* nothing is more restrictive than static */
        if (smps_mode_old == IEEE80211_SMPS_STATIC)
                return false;

        if (smps_mode_old == IEEE80211_SMPS_DYNAMIC)
                return smps_mode_new == IEEE80211_SMPS_STATIC;

        if (smps_mode_old == IEEE80211_SMPS_OFF)
                return smps_mode_new != IEEE80211_SMPS_OFF;

        /* unknown old mode */
        WARN_ON(1);
        return false;
}

int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata,
                              struct cfg80211_csa_settings *csa_settings)
{
        struct sk_buff *skb;
        struct ieee80211_mgmt *mgmt;
        struct ieee80211_local *local = sdata->local;
        int freq;
        int hdr_len = offsetofend(struct ieee80211_mgmt,
                                  u.action.u.chan_switch);
        u8 *pos;

        if (sdata->vif.type != NL80211_IFTYPE_ADHOC &&
            sdata->vif.type != NL80211_IFTYPE_MESH_POINT)
                return -EOPNOTSUPP;

        skb = dev_alloc_skb(local->tx_headroom + hdr_len +
                            5 + /* channel switch announcement element */
                            3 + /* secondary channel offset element */
                            5 + /* wide bandwidth channel switch announcement */
                            8); /* mesh channel switch parameters element */
        if (!skb)
                return -ENOMEM;

        skb_reserve(skb, local->tx_headroom);
        mgmt = skb_put_zero(skb, hdr_len);
        mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                          IEEE80211_STYPE_ACTION);

        eth_broadcast_addr(mgmt->da);
        memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
        if (ieee80211_vif_is_mesh(&sdata->vif)) {
                memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
        } else {
                struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
                memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN);
        }
        mgmt->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT;
        mgmt->u.action.u.chan_switch.action_code = WLAN_ACTION_SPCT_CHL_SWITCH;
        pos = skb_put(skb, 5);
        *pos++ = WLAN_EID_CHANNEL_SWITCH;                        /* EID */
        *pos++ = 3;                                                /* IE length */
        *pos++ = csa_settings->block_tx ? 1 : 0;                /* CSA mode */
        freq = csa_settings->chandef.chan->center_freq;
        *pos++ = ieee80211_frequency_to_channel(freq);                /* channel */
        *pos++ = csa_settings->count;                                /* count */

        if (csa_settings->chandef.width == NL80211_CHAN_WIDTH_40) {
                enum nl80211_channel_type ch_type;

                skb_put(skb, 3);
                *pos++ = WLAN_EID_SECONDARY_CHANNEL_OFFSET;        /* EID */
                *pos++ = 1;                                        /* IE length */
                ch_type = cfg80211_get_chandef_type(&csa_settings->chandef);
                if (ch_type == NL80211_CHAN_HT40PLUS)
                        *pos++ = IEEE80211_HT_PARAM_CHA_SEC_ABOVE;
                else
                        *pos++ = IEEE80211_HT_PARAM_CHA_SEC_BELOW;
        }

        if (ieee80211_vif_is_mesh(&sdata->vif)) {
                struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;

                skb_put(skb, 8);
                *pos++ = WLAN_EID_CHAN_SWITCH_PARAM;                /* EID */
                *pos++ = 6;                                        /* IE length */
                *pos++ = sdata->u.mesh.mshcfg.dot11MeshTTL;        /* Mesh TTL */
                *pos = 0x00;        /* Mesh Flag: Tx Restrict, Initiator, Reason */
                *pos |= WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR;
                *pos++ |= csa_settings->block_tx ?
                          WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT : 0x00;
                put_unaligned_le16(WLAN_REASON_MESH_CHAN, pos); /* Reason Cd */
                pos += 2;
                put_unaligned_le16(ifmsh->pre_value, pos);/* Precedence Value */
                pos += 2;
        }

        if (csa_settings->chandef.width == NL80211_CHAN_WIDTH_80 ||
            csa_settings->chandef.width == NL80211_CHAN_WIDTH_80P80 ||
            csa_settings->chandef.width == NL80211_CHAN_WIDTH_160) {
                skb_put(skb, 5);
                ieee80211_ie_build_wide_bw_cs(pos, &csa_settings->chandef);
        }

        ieee80211_tx_skb(sdata, skb);
        return 0;
}

/*
 * Move NoA descriptor @i of @data forward if its current absence period
 * already lies in the past relative to @tsf.
 *
 * Returns true if the descriptor's start time was advanced.  Returns false
 * if the period has not ended yet, if the descriptor is one-shot or has a
 * zero interval, or if the remaining repetitions are used up (in which case
 * count[i] is cleared).
 */
static bool
ieee80211_extend_noa_desc(struct ieee80211_noa_data *data, u32 tsf, int i)
{
        /* signed distance from (tsf + 1) to the end of the absence period */
        s32 end = data->desc[i].start + data->desc[i].duration - (tsf + 1);
        int skip;

        if (end > 0)
                return false;

        /* One shot NOA  */
        if (data->count[i] == 1)
                return false;

        /* also avoids dividing by zero below */
        if (data->desc[i].interval == 0)
                return false;

        /* End time is in the past, check for repetitions */
        skip = DIV_ROUND_UP(-end, data->desc[i].interval);
        /*
         * A count of 255 is never decremented.
         * NOTE(review): presumably 255 means "repeat indefinitely" -- confirm.
         */
        if (data->count[i] < 255) {
                if (data->count[i] <= skip) {
                        data->count[i] = 0;
                        return false;
                }

                data->count[i] -= skip;
        }

        data->desc[i].start += skip * data->desc[i].interval;

        return true;
}

/*
 * Run one pass over all active NoA descriptors, growing *offset (relative
 * to @tsf) to the end of every absence period that starts at or before
 * the current *offset.
 *
 * Returns true if any descriptor had to be advanced during the pass.
 */
static bool
ieee80211_extend_absent_time(struct ieee80211_noa_data *data, u32 tsf,
                             s32 *offset)
{
        bool advanced = false;
        int idx;

        for (idx = 0; idx < IEEE80211_P2P_NOA_DESC_MAX; idx++) {
                s32 begin, finish;

                if (!data->count[idx])
                        continue;

                if (ieee80211_extend_noa_desc(data, tsf + *offset, idx))
                        advanced = true;

                begin = data->desc[idx].start - tsf;
                finish = data->desc[idx].start + data->desc[idx].duration - tsf;

                /* only periods already begun at *offset can extend it */
                if (begin <= *offset && finish > *offset)
                        *offset = finish;
        }

        return advanced;
}

/*
 * Compute how long, counted from @tsf, the combined NoA descriptors keep
 * the peer absent.  Returns that duration as an offset from @tsf.
 */
static u32
ieee80211_get_noa_absent_time(struct ieee80211_noa_data *data, u32 tsf)
{
        /*
         * arbitrary limit, used to avoid infinite loops when combined NoA
         * descriptors cover the full time period.
         */
        const int max_tries = 5;
        s32 offset = 0;
        int tries;

        /* initial pass, then repeat while descriptors keep being advanced */
        ieee80211_extend_absent_time(data, tsf, &offset);

        for (tries = 0; tries < max_tries; tries++) {
                if (!ieee80211_extend_absent_time(data, tsf, &offset))
                        break;
        }

        return offset;
}

/*
 * Refresh the NoA state in @data for the time @tsf.
 *
 * Recomputes the bitmap of descriptors whose absence period has started
 * (data->absent), whether any descriptor is active (data->has_next_tsf)
 * and data->next_tsf.  next_tsf is the end of the combined absence when
 * currently absent, otherwise it is derived from the earliest upcoming
 * descriptor start.
 */
void ieee80211_update_p2p_noa(struct ieee80211_noa_data *data, u32 tsf)
{
        u32 next_offset = BIT(31) - 1;
        int idx;

        data->has_next_tsf = false;
        data->absent = 0;

        for (idx = 0; idx < IEEE80211_P2P_NOA_DESC_MAX; idx++) {
                s32 rel_start;

                if (!data->count[idx])
                        continue;

                data->has_next_tsf = true;

                ieee80211_extend_noa_desc(data, tsf, idx);

                /* start relative to tsf; <= 0 means already absent */
                rel_start = data->desc[idx].start - tsf;
                if (rel_start <= 0)
                        data->absent |= BIT(idx);

                if (next_offset > rel_start)
                        next_offset = rel_start;
        }

        if (data->absent)
                next_offset = ieee80211_get_noa_absent_time(data, tsf);

        data->next_tsf = tsf + next_offset;
}
EXPORT_SYMBOL(ieee80211_update_p2p_noa);

/*
 * Fill @data from the P2P NoA attribute @attr and bring it up to date for
 * the time @tsf.
 *
 * Descriptors with a zero count or zero duration are ignored.  Repeating
 * descriptors whose interval is shorter than their duration are copied
 * but not counted.
 *
 * Returns the number of descriptors accepted; 0 means no usable NoA.
 */
int ieee80211_parse_p2p_noa(const struct ieee80211_p2p_noa_attr *attr,
                            struct ieee80211_noa_data *data, u32 tsf)
{
        int accepted = 0;
        int idx;

        memset(data, 0, sizeof(*data));

        for (idx = 0; idx < IEEE80211_P2P_NOA_DESC_MAX; idx++) {
                const struct ieee80211_p2p_noa_desc *src = &attr->desc[idx];

                if (!src->count || !src->duration)
                        continue;

                data->count[idx] = src->count;
                data->desc[idx].start = le32_to_cpu(src->start_time);
                data->desc[idx].duration = le32_to_cpu(src->duration);
                data->desc[idx].interval = le32_to_cpu(src->interval);

                /* one-shot, or repetitions that do not overlap */
                if (data->count[idx] <= 1 ||
                    data->desc[idx].interval >= data->desc[idx].duration) {
                        ieee80211_extend_noa_desc(data, tsf, idx);
                        accepted++;
                }
        }

        if (accepted)
                ieee80211_update_p2p_noa(data, tsf);

        return accepted;
}
EXPORT_SYMBOL(ieee80211_parse_p2p_noa);

void ieee80211_recalc_dtim(struct ieee80211_local *local,
                           struct ieee80211_sub_if_data *sdata)
{
        u64 tsf = drv_get_tsf(local, sdata);
        u64 dtim_count = 0;
        u16 beacon_int = sdata->vif.bss_conf.beacon_int * 1024;
        u8 dtim_period = sdata->vif.bss_conf.dtim_period;
        struct ps_data *ps;
        u8 bcns_from_dtim;

        if (tsf == -1ULL || !beacon_int || !dtim_period)
                return;

        if (sdata->vif.type == NL80211_IFTYPE_AP ||
            sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
                if (!sdata->bss)
                        return;

                ps = &sdata->bss->ps;
        } else if (ieee80211_vif_is_mesh(&sdata->vif)) {
                ps = &sdata->u.mesh.ps;
        } else {
                return;
        }

        /*
         * actually finds last dtim_count, mac80211 will update in
         * __beacon_add_tim().
         * dtim_count = dtim_period - (tsf / bcn_int) % dtim_period
         */
        do_div(tsf, beacon_int);
        bcns_from_dtim = do_div(tsf, dtim_period);
        /* just had a DTIM */
        if (!bcns_from_dtim)
                dtim_count = 0;
        else
                dtim_count = dtim_period - bcns_from_dtim;

        ps->dtim_count = dtim_count;
}

/*
 * Collect the channel widths that need radar detection on @ctx.
 *
 * Returns a bitmap with BIT(width) set for every reserved or assigned
 * link of the context that requires radar detection.  Contexts that are
 * about to be replaced must not be passed in; they warn and yield 0.
 */
static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local,
                                         struct ieee80211_chanctx *ctx)
{
        struct ieee80211_link_data *link;
        u8 widths = 0;

        lockdep_assert_wiphy(local->hw.wiphy);

        if (WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED))
                return 0;

        /* links that merely reserved this context */
        list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) {
                if (!link->reserved_radar_required)
                        continue;

                widths |= BIT(link->reserved.oper.width);
        }

        /*
         * An in-place reservation context should not have any assigned vifs
         * until it replaces the other context.
         */
        WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER &&
                !list_empty(&ctx->assigned_links));

        /* links currently operating on this context */
        list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list) {
                if (link->radar_required)
                        widths |= BIT(link->conf->chanreq.oper.width);
        }

        return widths;
}

/*
 * Check whether @sdata may operate with the given channel usage next to
 * everything else that is active on the device.
 *
 * @sdata: interface being validated
 * @chandef: channel it wants to use, or NULL if it needs none
 * @chanmode: whether that channel may be shared with existing contexts
 * @radar_detect: bitmap holding at most one width needing radar detection
 *
 * Returns 0 if allowed, -EINVAL on invalid arguments or when a software
 * interface type asks for radar detection, otherwise the result of
 * cfg80211_check_combinations().
 */
int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
                                 const struct cfg80211_chan_def *chandef,
                                 enum ieee80211_chanctx_mode chanmode,
                                 u8 radar_detect)
{
        struct ieee80211_local *local = sdata->local;
        enum nl80211_iftype iftype = sdata->wdev.iftype;
        struct iface_combination_params params = {
                .radar_detect = radar_detect,
        };
        struct ieee80211_sub_if_data *other;
        struct ieee80211_chanctx *ctx;
        bool shared_chandef;
        int total = 1;

        lockdep_assert_wiphy(local->hw.wiphy);

        if (WARN_ON(hweight32(radar_detect) > 1))
                return -EINVAL;

        shared_chandef = chandef && chanmode == IEEE80211_CHANCTX_SHARED;
        if (WARN_ON(shared_chandef && !chandef->chan))
                return -EINVAL;

        if (WARN_ON(iftype >= NUM_NL80211_IFTYPES))
                return -EINVAL;

        switch (sdata->vif.type) {
        case NL80211_IFTYPE_AP:
        case NL80211_IFTYPE_MESH_POINT:
                /*
                 * always passing this is harmless, since it'll be the
                 * same value that cfg80211 finds if it finds the same
                 * interface ... and that's always allowed
                 */
                params.new_beacon_int = sdata->vif.bss_conf.beacon_int;
                break;
        default:
                break;
        }

        /* Always allow software iftypes */
        if (cfg80211_iftype_allowed(local->hw.wiphy, iftype, 0, 1))
                return radar_detect ? -EINVAL : 0;

        /* account for the requesting interface itself */
        if (chandef)
                params.num_different_channels = 1;

        if (iftype != NL80211_IFTYPE_UNSPECIFIED)
                params.iftype_num[iftype] = 1;

        list_for_each_entry(ctx, &local->chanctx_list, list) {
                if (ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)
                        continue;

                params.radar_detect |=
                        ieee80211_chanctx_radar_detect(local, ctx);

                /* a compatible shareable context needs no extra channel */
                if (ctx->mode != IEEE80211_CHANCTX_EXCLUSIVE &&
                    shared_chandef &&
                    cfg80211_chandef_compatible(chandef, &ctx->conf.def))
                        continue;

                params.num_different_channels++;
        }

        list_for_each_entry_rcu(other, &local->interfaces, list) {
                struct wireless_dev *other_wdev = &other->wdev;

                if (other == sdata ||
                    !ieee80211_sdata_running(other) ||
                    cfg80211_iftype_allowed(local->hw.wiphy,
                                            other_wdev->iftype, 0, 1))
                        continue;

                params.iftype_num[other_wdev->iftype]++;
                total++;
        }

        /* a lone interface without radar detection is always fine */
        if (total == 1 && !params.radar_detect)
                return 0;

        return cfg80211_check_combinations(local->hw.wiphy, &params);
}

/*
 * cfg80211_iter_combinations() callback: raise the u32 that @data points
 * to up to the channel count of combination @c if that is larger.
 */
static void
ieee80211_iter_max_chans(const struct ieee80211_iface_combination *c,
                         void *data)
{
        u32 *best = data;

        if (c->num_different_channels > *best)
                *best = c->num_different_channels;
}

/*
 * Determine the largest number of different channels offered by any
 * interface combination that matches the device's current usage
 * (channel contexts, radar detection and interface types).
 *
 * Returns that number (at least 1), or a negative error code from
 * cfg80211_iter_combinations().
 */
int ieee80211_max_num_channels(struct ieee80211_local *local)
{
        struct iface_combination_params params = {0};
        struct ieee80211_sub_if_data *sdata;
        struct ieee80211_chanctx *ctx;
        u32 max_chans = 1;
        int err;

        lockdep_assert_wiphy(local->hw.wiphy);

        list_for_each_entry(ctx, &local->chanctx_list, list) {
                /* contexts that are about to be replaced are not counted */
                if (ctx->replace_state != IEEE80211_CHANCTX_WILL_BE_REPLACED) {
                        params.num_different_channels++;
                        params.radar_detect |=
                                ieee80211_chanctx_radar_detect(local, ctx);
                }
        }

        list_for_each_entry_rcu(sdata, &local->interfaces, list)
                params.iftype_num[sdata->wdev.iftype]++;

        err = cfg80211_iter_combinations(local->hw.wiphy, &params,
                                         ieee80211_iter_max_chans,
                                         &max_chans);
        if (err < 0)
                return err;

        return max_chans;
}

void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata,
                                struct ieee80211_sta_s1g_cap *caps,
                                struct sk_buff *skb)
{
        struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
        struct ieee80211_s1g_cap s1g_capab;
        u8 *pos;
        int i;

        if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
                return;

        if (!caps->s1g)
                return;

        memcpy(s1g_capab.capab_info, caps->cap, sizeof(caps->cap));
        memcpy(s1g_capab.supp_mcs_nss, caps->nss_mcs, sizeof(caps->nss_mcs));

        /* override the capability info */
        for (i = 0; i < sizeof(ifmgd->s1g_capa.capab_info); i++) {
                u8 mask = ifmgd->s1g_capa_mask.capab_info[i];

                s1g_capab.capab_info[i] &= ~mask;
                s1g_capab.capab_info[i] |= ifmgd->s1g_capa.capab_info[i] & mask;
        }

        /* then MCS and NSS set */
        for (i = 0; i < sizeof(ifmgd->s1g_capa.supp_mcs_nss); i++) {
                u8 mask = ifmgd->s1g_capa_mask.supp_mcs_nss[i];

                s1g_capab.supp_mcs_nss[i] &= ~mask;
                s1g_capab.supp_mcs_nss[i] |=
                        ifmgd->s1g_capa.supp_mcs_nss[i] & mask;
        }

        pos = skb_put(skb, 2 + sizeof(s1g_capab));
        *pos++ = WLAN_EID_S1G_CAPABILITIES;
        *pos++ = sizeof(s1g_capab);

        memcpy(pos, &s1g_capab, sizeof(s1g_capab));
}

/*
 * Append a three byte AID Request element to @skb: element ID, a length
 * of 1 and a single zero payload byte.  @sdata is not used.
 */
void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata,
                                  struct sk_buff *skb)
{
        u8 *elem = skb_put(skb, 3);

        elem[0] = WLAN_EID_AID_REQUEST;
        elem[1] = 1;        /* payload length */
        elem[2] = 0;        /* payload */
}

/*
 * Write a WMM information element (9 bytes) carrying @qosinfo at @buf.
 *
 * The caller must provide room for the whole element.  Returns a pointer
 * just past the last byte written.
 */
u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo)
{
        buf[0] = WLAN_EID_VENDOR_SPECIFIC;
        buf[1] = 7;        /* len */
        buf[2] = 0x00;        /* Microsoft OUI 00:50:F2 */
        buf[3] = 0x50;
        buf[4] = 0xf2;
        buf[5] = 2;        /* WME */
        buf[6] = 0;        /* WME info */
        buf[7] = 1;        /* WME ver */
        buf[8] = qosinfo;        /* QoS info as passed in by the caller */

        return buf + 9;
}

/*
 * Report how much is queued on @txq.
 *
 * @frame_cnt: if non-NULL, receives the number of queued frames
 * @byte_cnt: if non-NULL, receives the number of queued bytes
 *
 * Both values are the tin backlog plus whatever sits on the txq's
 * fragment queue.
 */
void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
                             unsigned long *frame_cnt,
                             unsigned long *byte_cnt)
{
        struct txq_info *txqi = to_txq_info(txq);
        u32 nfrags = 0, nbytes = 0;
        struct sk_buff *frag;

        /* queued fragments are counted on top of the tin backlog */
        skb_queue_walk(&txqi->frags, frag) {
                nbytes += frag->len;
                nfrags++;
        }

        if (frame_cnt)
                *frame_cnt = txqi->tin.backlog_packets + nfrags;

        if (byte_cnt)
                *byte_cnt = txqi->tin.backlog_bytes + nbytes;
}
EXPORT_SYMBOL(ieee80211_txq_get_depth);

/*
 * WMM STA QoS-info bit for each access category, one entry per AC.
 * NOTE(review): the entry order (VO, VI, BE, BK) presumably follows the
 * numbering of the access category enum used to index this table --
 * confirm against its definition.
 */
const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS] = {
        IEEE80211_WMM_IE_STA_QOSINFO_AC_VO,
        IEEE80211_WMM_IE_STA_QOSINFO_AC_VI,
        IEEE80211_WMM_IE_STA_QOSINFO_AC_BE,
        IEEE80211_WMM_IE_STA_QOSINFO_AC_BK
};

/*
 * Encode @listen_interval as a scale factor index (USF) plus a scaled
 * interval (UI), packed into the LISTEN_INT_USF and LISTEN_INT_UI fields.
 *
 * The largest scale factor out of 1, 10, 1000 and 10000 that divides the
 * interval without remainder is used.  A one-time warning is emitted if
 * the scaled interval exceeds IEEE80211_MAX_UI.
 */
u16 ieee80211_encode_usf(int listen_interval)
{
        static const int listen_int_usf[] = { 1, 10, 1000, 10000 };
        u16 usf, ui;

        /* find greatest USF */
        for (usf = 0; usf < IEEE80211_MAX_USF; usf++) {
                if (listen_interval % listen_int_usf[usf + 1])
                        break;
        }

        ui = listen_interval / listen_int_usf[usf];

        /* the scaled value has to fit the UI field; callers should ensure it */
        WARN_ON_ONCE(ui > IEEE80211_MAX_UI);

        return (u16)(FIELD_PREP(LISTEN_INT_USF, usf) |
                     FIELD_PREP(LISTEN_INT_UI, ui));
}

/*
 * Length of the EHT Capabilities element for @sdata, based on the
 * unrestricted HE/EHT capabilities of its band.
 *
 * this may return more than ieee80211_put_eht_cap() will need
 *
 * Returns 0 if the interface has no band or lacks HE or EHT capabilities.
 */
u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata)
{
        const struct ieee80211_sta_he_cap *he_cap;
        const struct ieee80211_sta_eht_cap *eht_cap;
        struct ieee80211_supported_band *sband;
        bool is_ap;
        u8 n;

        sband = ieee80211_get_sband(sdata);
        if (!sband)
                return 0;

        he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif);
        eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif);
        if (!he_cap || !eht_cap)
                return 0;

        is_ap = sdata->vif.type == NL80211_IFTYPE_AP;

        n = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem,
                                       &eht_cap->eht_cap_elem,
                                       is_ap);

        /* element header (2) + extension ID (1) + fixed + MCS/NSS + PPE */
        return 2 + 1 +
               sizeof(eht_cap->eht_cap_elem) + n +
               ieee80211_eht_ppe_size(eht_cap->eht_ppe_thres[0],
                                      eht_cap->eht_cap_elem.phy_cap_info);
}

/*
 * Append an EHT Capabilities element to @skb.
 *
 * @skb: frame being built
 * @sdata: interface whose type selects the HE/EHT capabilities of @sband
 * @sband: band to take the capabilities from
 * @conn: connection limits applied to the advertised capabilities; NULL
 *        means unlimited
 *
 * Capability bits for bandwidths above conn->bw_limit are cleared before
 * the element is written.
 *
 * Returns 0 on success, also when nothing is added because HE or EHT
 * capabilities are missing, or -ENOBUFS if @skb lacks tailroom.
 */
int ieee80211_put_eht_cap(struct sk_buff *skb,
                          struct ieee80211_sub_if_data *sdata,
                          const struct ieee80211_supported_band *sband,
                          const struct ieee80211_conn_settings *conn)
{
        const struct ieee80211_sta_he_cap *he_cap =
                ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif);
        const struct ieee80211_sta_eht_cap *eht_cap =
                ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif);
        bool for_ap = sdata->vif.type == NL80211_IFTYPE_AP;
        struct ieee80211_eht_cap_elem_fixed fixed;
        struct ieee80211_he_cap_elem he;
        u8 mcs_nss_len, ppet_len;
        u8 orig_mcs_nss_len;
        u8 ie_len;

        if (!conn)
                conn = &ieee80211_conn_settings_unlimited;

        /* nothing to add without both HE and EHT capabilities */
        if (!he_cap || !eht_cap)
                return 0;

        /* MCS/NSS size for the unrestricted capabilities, compared below */
        orig_mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem,
                                                      &eht_cap->eht_cap_elem,
                                                      for_ap);

        ieee80211_get_adjusted_he_cap(conn, he_cap, &he);

        /* work on a copy so the band's capabilities stay untouched */
        fixed = eht_cap->eht_cap_elem;

        if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_80)
                fixed.phy_cap_info[6] &=
                        ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ;

        if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160) {
                fixed.phy_cap_info[1] &=
                        ~IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK;
                fixed.phy_cap_info[2] &=
                        ~IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK;
                fixed.phy_cap_info[6] &=
                        ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ;
        }

        if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_320) {
                fixed.phy_cap_info[0] &=
                        ~IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ;
                fixed.phy_cap_info[1] &=
                        ~IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK;
                fixed.phy_cap_info[2] &=
                        ~IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK;
                fixed.phy_cap_info[6] &=
                        ~IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ;
        }

        if (conn->bw_limit == IEEE80211_CONN_BW_LIMIT_20)
                fixed.phy_cap_info[0] &=
                        ~IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ;

        /* sizes of the variable parts, based on the adjusted capabilities */
        mcs_nss_len = ieee80211_eht_mcs_nss_size(&he, &fixed, for_ap);
        ppet_len = ieee80211_eht_ppe_size(eht_cap->eht_ppe_thres[0],
                                          fixed.phy_cap_info);

        /* Make sure we have place for the IE */
        ie_len = 2 + 1 + sizeof(eht_cap->eht_cap_elem) + mcs_nss_len + ppet_len;
        if (skb_tailroom(skb) < ie_len)
                return -ENOBUFS;

        /* the length byte excludes the element ID and the length itself */
        skb_put_u8(skb, WLAN_EID_EXTENSION);
        skb_put_u8(skb, ie_len - 2);
        skb_put_u8(skb, WLAN_EID_EXT_EHT_CAPABILITY);
        skb_put_data(skb, &fixed, sizeof(fixed));

        if (mcs_nss_len == 4 && orig_mcs_nss_len != 4) {
                /*
                 * If the (non-AP) STA became 20 MHz only, then convert from
                 * <=80 to 20-MHz-only format, where MCSes are indicated in
                 * the groups 0-7, 8-9, 10-11, 12-13 rather than just 0-9,
                 * 10-11, 12-13. Thus, use 0-9 for 0-7 and 8-9.
                 */
                skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs9_max_nss);
                skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs9_max_nss);
                skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs11_max_nss);
                skb_put_u8(skb, eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_mcs13_max_nss);
        } else {
                skb_put_data(skb, &eht_cap->eht_mcs_nss_supp, mcs_nss_len);
        }

        if (ppet_len)
                skb_put_data(skb, &eht_cap->eht_ppe_thres, ppet_len);

        return 0;
}

/*
 * Return a printable name for connection mode @mode.
 *
 * Values beyond the name table yield "<out of range>" (with a warning),
 * values without an entry yield "<missing string>".
 */
const char *ieee80211_conn_mode_str(enum ieee80211_conn_mode mode)
{
        static const char * const names[] = {
                [IEEE80211_CONN_MODE_S1G] = "S1G",
                [IEEE80211_CONN_MODE_LEGACY] = "legacy",
                [IEEE80211_CONN_MODE_HT] = "HT",
                [IEEE80211_CONN_MODE_VHT] = "VHT",
                [IEEE80211_CONN_MODE_HE] = "HE",
                [IEEE80211_CONN_MODE_EHT] = "EHT",
        };
        const char *name;

        if (WARN_ON(mode >= ARRAY_SIZE(names)))
                return "<out of range>";

        name = names[mode];
        if (!name)
                return "<missing string>";

        return name;
}

enum ieee80211_conn_bw_limit
ieee80211_min_bw_limit_from_chandef(struct cfg80211_chan_def *chandef)
{
        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_20_NOHT:
        case NL80211_CHAN_WIDTH_20:
                return IEEE80211_CONN_BW_LIMIT_20;
        case NL80211_CHAN_WIDTH_40:
                return IEEE80211_CONN_BW_LIMIT_40;
        case NL80211_CHAN_WIDTH_80:
                return IEEE80211_CONN_BW_LIMIT_80;
        case NL80211_CHAN_WIDTH_80P80:
        case NL80211_CHAN_WIDTH_160:
                return IEEE80211_CONN_BW_LIMIT_160;
        case NL80211_CHAN_WIDTH_320:
                return IEEE80211_CONN_BW_LIMIT_320;
        default:
                WARN(1, "unhandled chandef width %d\n", chandef->width);
                return IEEE80211_CONN_BW_LIMIT_20;
        }
}

/*
 * Reset parsed transmit power envelope data in @tpe: both entries of each
 * of the four limit arrays are marked invalid and their power values are
 * filled with the respective "no constraint" / "no limit" marker.
 */
void ieee80211_clear_tpe(struct ieee80211_parsed_tpe *tpe)
{
        int idx;

        for (idx = 0; idx < 2; idx++) {
                tpe->max_local[idx].valid = false;
                tpe->max_reg_client[idx].valid = false;
                tpe->psd_local[idx].valid = false;
                tpe->psd_reg_client[idx].valid = false;

                /* maximum transmit power limits */
                memset(tpe->max_local[idx].power,
                       IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT,
                       sizeof(tpe->max_local[idx].power));
                memset(tpe->max_reg_client[idx].power,
                       IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT,
                       sizeof(tpe->max_reg_client[idx].power));

                /* power spectral density limits */
                memset(tpe->psd_local[idx].power,
                       IEEE80211_TPE_PSD_NO_LIMIT,
                       sizeof(tpe->psd_local[idx].power));
                memset(tpe->psd_reg_client[idx].power,
                       IEEE80211_TPE_PSD_NO_LIMIT,
                       sizeof(tpe->psd_reg_client[idx].power));
        }
}
































































































































































    1 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM mptcp

#if !defined(_TRACE_MPTCP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MPTCP_H

#include <linux/tracepoint.h>

/*
 * Render a numeric mapping status as one of the symbolic names below when a
 * trace record is printed.
 *
 * NOTE(review): the numbers presumably mirror an enum in the MPTCP subflow
 * code; the two must be kept in sync - confirm against that definition.
 */
#define show_mapping_status(status)                                        \
        __print_symbolic(status,                                        \
                { 0, "MAPPING_OK" },                                        \
                { 1, "MAPPING_INVALID" },                                \
                { 2, "MAPPING_EMPTY" },                                        \
                { 3, "MAPPING_DATA_FIN" },                                \
                { 4, "MAPPING_DUMMY" })

/*
 * mptcp_subflow_get_send - snapshot of one subflow's send-side state.
 *
 * Recorded fields:
 *   active  - result of mptcp_subflow_active(subflow)
 *   backup  - subflow->backup
 *   free    - sk_stream_memory_free() on subflow->tcp_sock, or 0 when that
 *             socket is missing or is not a full socket
 *   snd_wnd - tcp_sk(ssk)->snd_wnd, or 0 without a full socket
 *   pace    - ssk->sk_pacing_rate, or 0 without a full socket
 *   ratio   - (sk_wmem_queued << 32) / pace, or 0 when pace is 0
 *
 * NOTE(review): 'pace' is stored as u32; if sk_pacing_rate is wider than
 * 32 bits on the target, the recorded value (and therefore the divisor used
 * for 'ratio') is truncated - confirm against struct sock.
 */
TRACE_EVENT(mptcp_subflow_get_send,

        TP_PROTO(struct mptcp_subflow_context *subflow),

        TP_ARGS(subflow),

        TP_STRUCT__entry(
                __field(bool, active)
                __field(bool, free)
                __field(u32, snd_wnd)
                __field(u32, pace)
                __field(u8, backup)
                __field(u64, ratio)
        ),

        TP_fast_assign(
                struct sock *ssk;

                __entry->active = mptcp_subflow_active(subflow);
                __entry->backup = subflow->backup;

                /* Only query a socket that exists and is a full socket. */
                if (subflow->tcp_sock && sk_fullsock(subflow->tcp_sock))
                        __entry->free = sk_stream_memory_free(subflow->tcp_sock);
                else
                        __entry->free = 0;

                ssk = mptcp_subflow_tcp_sock(subflow);
                if (ssk && sk_fullsock(ssk)) {
                        __entry->snd_wnd = tcp_sk(ssk)->snd_wnd;
                        __entry->pace = READ_ONCE(ssk->sk_pacing_rate);
                } else {
                        __entry->snd_wnd = 0;
                        __entry->pace = 0;
                }

                /*
                 * Queued bytes per unit of pacing rate, scaled by 2^32.
                 * The test on the stored pace also guards the division
                 * against a zero divisor.
                 */
                if (ssk && sk_fullsock(ssk) && __entry->pace)
                        __entry->ratio = div_u64((u64)ssk->sk_wmem_queued << 32, __entry->pace);
                else
                        __entry->ratio = 0;
        ),

        TP_printk("active=%d free=%d snd_wnd=%u pace=%u backup=%u ratio=%llu",
                  __entry->active, __entry->free,
                  __entry->snd_wnd, __entry->pace,
                  __entry->backup, __entry->ratio)
);

/*
 * mptcp_dump_mpext - event class that copies the fields of a struct
 * mptcp_ext into the trace record.
 *
 * Every field is copied one-to-one except:
 *   data_ack - taken from mpext->data_ack when mpext->ack64 is set,
 *              otherwise from mpext->data_ack32
 *   csum     - stored through a __force cast to u16 and printed in hex
 *
 * All other values are printed as unsigned decimals.
 */
DECLARE_EVENT_CLASS(mptcp_dump_mpext,

        TP_PROTO(struct mptcp_ext *mpext),

        TP_ARGS(mpext),

        TP_STRUCT__entry(
                __field(u64, data_ack)
                __field(u64, data_seq)
                __field(u32, subflow_seq)
                __field(u16, data_len)
                __field(u16, csum)
                __field(u8, use_map)
                __field(u8, dsn64)
                __field(u8, data_fin)
                __field(u8, use_ack)
                __field(u8, ack64)
                __field(u8, mpc_map)
                __field(u8, frozen)
                __field(u8, reset_transient)
                __field(u8, reset_reason)
                __field(u8, csum_reqd)
                __field(u8, infinite_map)
        ),

        TP_fast_assign(
                /* Pick the ack field that matches the advertised ack width. */
                __entry->data_ack = mpext->ack64 ? mpext->data_ack : mpext->data_ack32;
                __entry->data_seq = mpext->data_seq;
                __entry->subflow_seq = mpext->subflow_seq;
                __entry->data_len = mpext->data_len;
                __entry->csum = (__force u16)mpext->csum;
                __entry->use_map = mpext->use_map;
                __entry->dsn64 = mpext->dsn64;
                __entry->data_fin = mpext->data_fin;
                __entry->use_ack = mpext->use_ack;
                __entry->ack64 = mpext->ack64;
                __entry->mpc_map = mpext->mpc_map;
                __entry->frozen = mpext->frozen;
                __entry->reset_transient = mpext->reset_transient;
                __entry->reset_reason = mpext->reset_reason;
                __entry->csum_reqd = mpext->csum_reqd;
                __entry->infinite_map = mpext->infinite_map;
        ),

        TP_printk("data_ack=%llu data_seq=%llu subflow_seq=%u data_len=%u csum=%x use_map=%u dsn64=%u data_fin=%u use_ack=%u ack64=%u mpc_map=%u frozen=%u reset_transient=%u reset_reason=%u csum_reqd=%u infinite_map=%u",
                  __entry->data_ack, __entry->data_seq,
                  __entry->subflow_seq, __entry->data_len,
                  __entry->csum, __entry->use_map,
                  __entry->dsn64, __entry->data_fin,
                  __entry->use_ack, __entry->ack64,
                  __entry->mpc_map, __entry->frozen,
                  __entry->reset_transient, __entry->reset_reason,
                  __entry->csum_reqd, __entry->infinite_map)
);

/*
 * mptcp_sendmsg_frag - event of class mptcp_dump_mpext.
 * NOTE(review): presumably emitted while a sendmsg fragment is built -
 * confirm at the call site.
 */
DEFINE_EVENT(mptcp_dump_mpext, mptcp_sendmsg_frag,
        TP_PROTO(struct mptcp_ext *mpext),
        TP_ARGS(mpext));

/*
 * get_mapping_status - event of class mptcp_dump_mpext.
 * NOTE(review): presumably emitted while a subflow's mapping status is
 * evaluated - confirm at the call site.
 */
DEFINE_EVENT(mptcp_dump_mpext, get_mapping_status,
        TP_PROTO(struct mptcp_ext *mpext),
        TP_ARGS(mpext));

/*
 * ack_update_msk - records the five 64-bit values passed by the caller
 * (data_ack, old/new snd_una, new_wnd_end, msk_wnd_end) unchanged and
 * prints each of them as an unsigned decimal.
 */
TRACE_EVENT(ack_update_msk,

        TP_PROTO(u64 data_ack, u64 old_snd_una,
                 u64 new_snd_una, u64 new_wnd_end,
                 u64 msk_wnd_end),

        TP_ARGS(data_ack, old_snd_una,
                new_snd_una, new_wnd_end,
                msk_wnd_end),

        TP_STRUCT__entry(
                __field(u64, data_ack)
                __field(u64, old_snd_una)
                __field(u64, new_snd_una)
                __field(u64, new_wnd_end)
                __field(u64, msk_wnd_end)
        ),

        TP_fast_assign(
                __entry->data_ack = data_ack;
                __entry->old_snd_una = old_snd_una;
                __entry->new_snd_una = new_snd_una;
                __entry->new_wnd_end = new_wnd_end;
                __entry->msk_wnd_end = msk_wnd_end;
        ),

        TP_printk("data_ack=%llu old_snd_una=%llu new_snd_una=%llu new_wnd_end=%llu msk_wnd_end=%llu",
                  __entry->data_ack, __entry->old_snd_una,
                  __entry->new_snd_una, __entry->new_wnd_end,
                  __entry->msk_wnd_end)
);

/*
 * subflow_check_data_avail - records a mapping status byte together with
 * the address of the skb it refers to.
 *
 * The status is printed by name through show_mapping_status(). Only the
 * skb pointer value is stored; the skb itself is never dereferenced here.
 */
TRACE_EVENT(subflow_check_data_avail,

        TP_PROTO(__u8 status, struct sk_buff *skb),

        TP_ARGS(status, skb),

        TP_STRUCT__entry(
                __field(u8, status)
                __field(const void *, skb)
        ),

        TP_fast_assign(
                __entry->status = status;
                __entry->skb = skb;
        ),

        TP_printk("mapping_status=%s, skb=%p",
                  show_mapping_status(__entry->status),
                  __entry->skb)
);

#endif /* _TRACE_MPTCP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
























































































    1 










    1 







    1 



    1 



    1 


    1 












    1 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
// SPDX-License-Identifier: GPL-2.0
/*
 * SHA1 routine optimized to do word accesses rather than byte accesses,
 * and to avoid unnecessary copies into the context array.
 *
 * This was based on the git SHA1 implementation.
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/string.h>
#include <crypto/sha1.h>
#include <asm/unaligned.h>

/*
 * If you have 32 registers or more, the compiler can (and should)
 * try to change the array[] accesses into registers. However, on
 * machines with less than ~25 registers, that won't really work,
 * and at least gcc will make an unholy mess of it.
 *
 * So to avoid that mess which just slows things down, we force
 * the stores to memory to actually happen (we might be better off
 * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
 * suggested by Artur Skawina - that will also make gcc unable to
 * try to do the silly "optimize away loads" part because it won't
 * see what the value will be).
 *
 * Ben Herrenschmidt reports that on PPC, the C version comes close
 * to the optimized asm with this (ie on PPC you don't want that
 * 'volatile', since there are lots of registers).
 *
 * On ARM we get the best code generation by forcing a full memory barrier
 * between each SHA_ROUND, otherwise gcc happily goes wild with spilling,
 * the stack frame size simply explodes and performance goes down the drain.
 */

/*
 * setW(x, val) - store @val into workspace slot W(x).
 *
 *  - CONFIG_X86: the store goes through a volatile-qualified lvalue.
 *  - CONFIG_ARM: plain store followed by an empty asm statement with a
 *    "memory" clobber.  This variant is a do/while statement, not an
 *    expression, so setW() must only be used in statement position.
 *  - otherwise: plain assignment.
 *
 * W() is defined further down; that works because it is only expanded
 * where setW() is used.
 */
#ifdef CONFIG_X86
  #define setW(x, val) (*(volatile __u32 *)&W(x) = (val))
#elif defined(CONFIG_ARM)
  #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0)
#else
  #define setW(x, val) (W(x) = (val))
#endif

/* This "rolls" over the 512-bit array */
#define W(x) (array[(x)&15])

/*
 * Where do we get the source from? The first 16 iterations get it from
 * the input data, the next mix it from the 512-bit array.
 */
#define SHA_SRC(t) get_unaligned_be32((__u32 *)data + t)
#define SHA_MIX(t) rol32(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1)

/*
 * SHA_ROUND - one iteration of the compression loop.
 *
 * @t:        iteration number
 * @input:    SHA_SRC or SHA_MIX, selects where the iteration's word comes from
 * @fn:       expression combining B, C and D for this group of iterations
 * @constant: additive constant for this group of iterations
 * @A..@E:    the five working variables (modified in place)
 *
 * Steps, in order: fetch the word and store it in W(t); add the word,
 * rol32(A, 5), @fn and @constant to E; rotate B right by 2; then shift the
 * variables along by one place (A <- E, B <- A, C <- B, D <- C, E <- D).
 * @fn is evaluated before B is rotated.
 */
#define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \
        __u32 TEMP = input(t); setW(t, TEMP); \
        E += TEMP + rol32(A,5) + (fn) + (constant); \
        B = ror32(B, 2); \
        TEMP = E; E = D; D = C; C = B; B = A; A = TEMP; } while (0)

/*
 * Per-group wrappers around SHA_ROUND.  T_0_15 takes its word from the
 * input data, all others from the mixing array.  T_0_15/T_16_19 share one
 * function and constant, as do neither of the remaining groups' constants;
 * T_20_39 and T_60_79 share the B^C^D function.  In T_40_59 the two ANDed
 * terms can never have the same bit set, so '+' yields the same value as
 * '|' would.
 */
#define T_0_15(t, A, B, C, D, E)  SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
#define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
#define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E )
#define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E )
#define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) ,  0xca62c1d6, A, B, C, D, E )

/**
 * sha1_transform - single block SHA1 transform (deprecated)
 *
 * @digest: 160 bit digest to update
 * @data:   512 bits of data to hash
 * @array:  16 words of workspace (see note)
 *
 * Runs SHA-1's internal compression function once: the five words of
 * @digest are mixed with one 512-bit block taken from @data, and the result
 * is added back into @digest.
 *
 * Avoid this function.  SHA-1 is no longer considered secure, and even
 * where SHA-1 is unavoidable this is not a complete hash: padding and
 * finalization are not handled here.
 *
 * Note: @array holds message-derived words when this returns.  If the hash
 * is security sensitive the caller should clear it; that is left to the
 * caller so chained hashing operations do not pay for needless clears.
 */
void sha1_transform(__u32 *digest, const char *data, __u32 *array)
{
        unsigned int step;
        __u32 A = digest[0];
        __u32 B = digest[1];
        __u32 C = digest[2];
        __u32 D = digest[3];
        __u32 E = digest[4];

        /* Round 1 - iterations 0-15 take their input from 'data' */
        for (step = 0; step < 16; step++)
                T_0_15(step, A, B, C, D, E);

        /* Round 1 - tail, iterations 16-19: input from the mixing array */
        for (step = 16; step < 20; step++)
                T_16_19(step, A, B, C, D, E);

        /* Round 2 - iterations 20-39 */
        for (step = 20; step < 40; step++)
                T_20_39(step, A, B, C, D, E);

        /* Round 3 - iterations 40-59 */
        for (step = 40; step < 60; step++)
                T_40_59(step, A, B, C, D, E);

        /* Round 4 - iterations 60-79 */
        for (step = 60; step < 80; step++)
                T_60_79(step, A, B, C, D, E);

        /* Fold the working variables back into the caller's state. */
        digest[0] += A;
        digest[1] += B;
        digest[2] += C;
        digest[3] += D;
        digest[4] += E;
}
EXPORT_SYMBOL(sha1_transform);

/**
 * sha1_init - initialize the vectors for a SHA1 digest
 * @buf: vector to initialize; its first five 32-bit words are overwritten
 */
void sha1_init(__u32 *buf)
{
        /* Starting values of the five state words, in order. */
        static const __u32 initial_state[] = {
                0x67452301,
                0xefcdab89,
                0x98badcfe,
                0x10325476,
                0xc3d2e1f0,
        };
        const unsigned int nr_words =
                sizeof(initial_state) / sizeof(initial_state[0]);
        unsigned int word;

        for (word = 0; word < nr_words; word++)
                buf[word] = initial_state[word];
}
EXPORT_SYMBOL(sha1_init);

MODULE_LICENSE("GPL");








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   19 








































































































   19 




































    5 














   14 
   11 








   11 
   12 
















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMZONE_H
#define _LINUX_MMZONE_H

#ifndef __ASSEMBLY__
#ifndef __GENERATING_BOUNDS_H

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/list_nulls.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/numa.h>
#include <linux/init.h>
#include <linux/seqlock.h>
#include <linux/nodemask.h>
#include <linux/pageblock-flags.h>
#include <linux/page-flags-layout.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/local_lock.h>
#include <linux/zswap.h>
#include <asm/page.h>

/* Free memory management - zoned buddy allocator.  */
/*
 * MAX_PAGE_ORDER defaults to 10 unless the configuration supplies
 * CONFIG_ARCH_FORCE_MAX_ORDER, in which case that value is used.
 */
#ifndef CONFIG_ARCH_FORCE_MAX_ORDER
#define MAX_PAGE_ORDER 10
#else
#define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
#endif
/* 2^MAX_PAGE_ORDER, i.e. the page count of a block of the highest order. */
#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER)

/* IS_ALIGNED() test of @pfn against MAX_ORDER_NR_PAGES. */
#define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)

/* Number of distinct orders: 0 through MAX_PAGE_ORDER inclusive. */
#define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1)

/*
 * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
 * costly to service.  That is between allocation orders which should
 * coalesce naturally under reasonable reclaim pressure and those which
 * will not.
 */
#define PAGE_ALLOC_COSTLY_ORDER 3

/*
 * Migration types of page blocks.
 *
 * The first MIGRATE_PCPTYPES values are the types kept on the pcp lists.
 * MIGRATE_HIGHATOMIC deliberately shares its value with MIGRATE_PCPTYPES.
 * MIGRATE_CMA and MIGRATE_ISOLATE exist only when their config option is
 * enabled, so the numeric values after MIGRATE_HIGHATOMIC - and
 * MIGRATE_TYPES, the total count - depend on the configuration.
 */
enum migratetype {
        MIGRATE_UNMOVABLE,
        MIGRATE_MOVABLE,
        MIGRATE_RECLAIMABLE,
        MIGRATE_PCPTYPES,        /* the number of types on the pcp lists */
        MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
        /*
         * MIGRATE_CMA migration type is designed to mimic the way
         * ZONE_MOVABLE works.  Only movable pages can be allocated
         * from MIGRATE_CMA pageblocks and page allocator never
         * implicitly change migration type of MIGRATE_CMA pageblock.
         *
         * The way to use it is to change migratetype of a range of
         * pageblocks to MIGRATE_CMA which can be done by
         * __free_pageblock_cma() function.
         */
        MIGRATE_CMA,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
        MIGRATE_ISOLATE,        /* can't allocate from here */
#endif
        MIGRATE_TYPES           /* number of migration types; sizes per-type arrays */
};

/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
extern const char * const migratetype_names[MIGRATE_TYPES];

/*
 * CMA tests on a migratetype value, a page or a folio.
 *
 * With CONFIG_CMA each macro compares against MIGRATE_CMA; the page and
 * folio variants first look the type up with get_pageblock_migratetype()
 * or get_pfnblock_flags_mask().  Without CONFIG_CMA all three expand to
 * the constant false and their arguments are not evaluated.
 *
 * NOTE(review): is_migrate_cma_folio() uses its 'folio' argument
 * unparenthesized ("&folio->page"), so pass a plain identifier.
 */
#ifdef CONFIG_CMA
#  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
#  define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
#  define is_migrate_cma_folio(folio, pfn)        (MIGRATE_CMA ==                \
        get_pfnblock_flags_mask(&folio->page, pfn, MIGRATETYPE_MASK))
#else
#  define is_migrate_cma(migratetype) false
#  define is_migrate_cma_page(_page) false
#  define is_migrate_cma_folio(folio, pfn) false
#endif

/*
 * Return true when @mt is MIGRATE_MOVABLE, or MIGRATE_CMA on kernels built
 * with CONFIG_CMA.
 */
static inline bool is_migrate_movable(int mt)
{
        if (is_migrate_cma(mt))
                return true;

        return mt == MIGRATE_MOVABLE;
}

/*
 * Tell whether pages of migratetype @mt may be merged with pages of another
 * migratetype.
 *
 * That is the case only for types that can fall back to other migratetypes
 * for allocation, i.e. the types numbered below MIGRATE_PCPTYPES. See
 * fallbacks[MIGRATE_TYPES][3] in page_alloc.c.
 */
static inline bool migratetype_is_mergeable(int mt)
{
        if (mt >= MIGRATE_PCPTYPES)
                return false;

        return true;
}

/*
 * Visit every (order, type) pair: orders 0..NR_PAGE_ORDERS-1 in the outer
 * loop, migration types 0..MIGRATE_TYPES-1 in the inner loop.  Both
 * arguments must be assignable variables supplied by the caller.
 *
 * This expands to two nested 'for' statements, so a 'break' in the body
 * leaves only the inner (type) loop.
 */
#define for_each_migratetype_order(order, type) \
        for (order = 0; order < NR_PAGE_ORDERS; order++) \
                for (type = 0; type < MIGRATE_TYPES; type++)

/* NOTE(review): defined outside this header; semantics not visible here. */
extern int page_group_by_mobility_disabled;

/* Mask covering the PB_migratetype_bits low bits of the pageblock flags. */
#define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1)

/* Migratetype bits of the pageblock that contains @page. */
#define get_pageblock_migratetype(page)                                        \
        get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK)

/*
 * Migratetype bits of the pageblock that contains the first page of @folio.
 * 'folio' is used unparenthesized, so pass a plain identifier.
 */
#define folio_migratetype(folio)                                \
        get_pfnblock_flags_mask(&folio->page, folio_pfn(folio),                \
                        MIGRATETYPE_MASK)
/*
 * One list head per migration type plus a single counter.
 * NOTE(review): presumably there is one free_area per order and nr_free
 * counts the free blocks across all of its lists - confirm against the
 * zone definition and the allocator.
 */
struct free_area {
        struct list_head        free_list[MIGRATE_TYPES];
        unsigned long                nr_free;
};

struct pglist_data;

/*
 * NUMA allocation event counters.  NR_VM_NUMA_EVENT_ITEMS is the number of
 * items; without CONFIG_NUMA the enum does not exist and the count is 0.
 */
#ifdef CONFIG_NUMA
enum numa_stat_item {
        NUMA_HIT,                /* allocated in intended node */
        NUMA_MISS,                /* allocated in non intended node */
        NUMA_FOREIGN,                /* was intended here, hit elsewhere */
        NUMA_INTERLEAVE_HIT,        /* interleaver preferred this zone */
        NUMA_LOCAL,                /* allocation from local node */
        NUMA_OTHER,                /* allocation from other node */
        NR_VM_NUMA_EVENT_ITEMS
};
#else
#define NR_VM_NUMA_EVENT_ITEMS 0
#endif

/*
 * Indices of the zone-level statistics counters; NR_VM_ZONE_STAT_ITEMS is
 * their number.
 *
 * The five NR_ZONE_* LRU items start at NR_ZONE_LRU_BASE and follow the
 * order of enum lru_list.  NR_ZSPAGES and NR_UNACCEPTED exist only when
 * their config option is enabled, so the values of later items depend on
 * the configuration.
 */
enum zone_stat_item {
        /* First 128 byte cacheline (assuming 64 bit words) */
        NR_FREE_PAGES,
        NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
        NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
        NR_ZONE_ACTIVE_ANON,
        NR_ZONE_INACTIVE_FILE,
        NR_ZONE_ACTIVE_FILE,
        NR_ZONE_UNEVICTABLE,
        NR_ZONE_WRITE_PENDING,        /* Count of dirty, writeback and unstable pages */
        NR_MLOCK,                /* mlock()ed pages found and moved off LRU */
        /* Second 128 byte cacheline */
        NR_BOUNCE,
#if IS_ENABLED(CONFIG_ZSMALLOC)
        NR_ZSPAGES,                /* allocated in zsmalloc */
#endif
        NR_FREE_CMA_PAGES,
#ifdef CONFIG_UNACCEPTED_MEMORY
        NR_UNACCEPTED,
#endif
        NR_VM_ZONE_STAT_ITEMS };

/*
 * Indices of the node-level statistics counters; NR_VM_NODE_STAT_ITEMS is
 * their number.
 *
 * The first five items start at NR_LRU_BASE and must stay in the order of
 * enum lru_list.  The *_BASE names alias the first item of the anon/file
 * pair that follows them.  Several items exist only when their config
 * option is enabled, so the values of later items depend on the
 * configuration.
 */
enum node_stat_item {
        NR_LRU_BASE,
        NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
        NR_ACTIVE_ANON,                /*  "     "     "   "       "         */
        NR_INACTIVE_FILE,        /*  "     "     "   "       "         */
        NR_ACTIVE_FILE,                /*  "     "     "   "       "         */
        NR_UNEVICTABLE,                /*  "     "     "   "       "         */
        NR_SLAB_RECLAIMABLE_B,
        NR_SLAB_UNRECLAIMABLE_B,
        NR_ISOLATED_ANON,        /* Temporary isolated pages from anon lru */
        NR_ISOLATED_FILE,        /* Temporary isolated pages from file lru */
        WORKINGSET_NODES,
        WORKINGSET_REFAULT_BASE,
        WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE,
        WORKINGSET_REFAULT_FILE,
        WORKINGSET_ACTIVATE_BASE,
        WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE,
        WORKINGSET_ACTIVATE_FILE,
        WORKINGSET_RESTORE_BASE,
        WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE,
        WORKINGSET_RESTORE_FILE,
        WORKINGSET_NODERECLAIM,
        NR_ANON_MAPPED,        /* Mapped anonymous pages */
        NR_FILE_MAPPED,        /* pagecache pages mapped into pagetables.
                           only modified from process context */
        NR_FILE_PAGES,
        NR_FILE_DIRTY,
        NR_WRITEBACK,
        NR_WRITEBACK_TEMP,        /* Writeback using temporary buffers */
        NR_SHMEM,                /* shmem pages (included tmpfs/GEM pages) */
        NR_SHMEM_THPS,
        NR_SHMEM_PMDMAPPED,
        NR_FILE_THPS,
        NR_FILE_PMDMAPPED,
        NR_ANON_THPS,
        NR_VMSCAN_WRITE,
        NR_VMSCAN_IMMEDIATE,        /* Prioritise for reclaim when writeback ends */
        NR_DIRTIED,                /* page dirtyings since bootup */
        NR_WRITTEN,                /* page writings since bootup */
        NR_THROTTLED_WRITTEN,        /* NR_WRITTEN while reclaim throttled */
        NR_KERNEL_MISC_RECLAIMABLE,        /* reclaimable non-slab kernel pages */
        NR_FOLL_PIN_ACQUIRED,        /* via: pin_user_page(), gup flag: FOLL_PIN */
        NR_FOLL_PIN_RELEASED,        /* pages returned via unpin_user_page() */
        NR_KERNEL_STACK_KB,        /* measured in KiB */
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
        NR_KERNEL_SCS_KB,        /* measured in KiB */
#endif
        NR_PAGETABLE,                /* used for pagetables */
        NR_SECONDARY_PAGETABLE, /* secondary pagetables, KVM & IOMMU */
#ifdef CONFIG_IOMMU_SUPPORT
        NR_IOMMU_PAGES,                /* # of pages allocated by IOMMU */
#endif
#ifdef CONFIG_SWAP
        NR_SWAPCACHE,
#endif
#ifdef CONFIG_NUMA_BALANCING
        PGPROMOTE_SUCCESS,        /* promote successfully */
        PGPROMOTE_CANDIDATE,        /* candidate pages to promote */
#endif
        /* PGDEMOTE_*: pages demoted */
        PGDEMOTE_KSWAPD,
        PGDEMOTE_DIRECT,
        PGDEMOTE_KHUGEPAGED,
        NR_VM_NODE_STAT_ITEMS
};

/*
 * Tell whether a node counter is reported as a number of THPs.
 *
 * Always false when CONFIG_TRANSPARENT_HUGEPAGE is disabled.  Otherwise
 * true for the anon, file and shmem THP counters and for the file and
 * shmem PMD-mapped counters: these items are charged in pages, but
 * /proc/vmstat prints them in THPs.
 */
static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
{
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return false;

        switch (item) {
        case NR_ANON_THPS:
        case NR_FILE_THPS:
        case NR_SHMEM_THPS:
        case NR_SHMEM_PMDMAPPED:
        case NR_FILE_PMDMAPPED:
                return true;
        default:
                return false;
        }
}

/*
 * Report whether the counter @idx is exposed in bytes; most vmstat values
 * are exposed in pages. This only describes the API: the internal
 * representation might be different.
 *
 * Global and per-node slab counters track slab pages, changes are expected
 * to be multiples of PAGE_SIZE, and the values are stored in pages
 * internally. Per-memcg and per-lruvec counters track the memory consumed
 * by individual slab objects and are actually byte-precise.
 */
static __always_inline bool vmstat_item_in_bytes(int idx)
{
        if (idx == NR_SLAB_RECLAIMABLE_B)
                return true;

        return idx == NR_SLAB_UNRECLAIMABLE_B;
}

/*
 * We do arithmetic on the LRU lists in various places in the code,
 * so it is important to keep the active lists LRU_ACTIVE higher in
 * the array than the corresponding inactive lists, and to keep
 * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists.
 *
 * This has to be kept in sync with the statistics in zone_stat_item
 * above and the descriptions in vmstat_text in mm/vmstat.c
 */
/* Index of the first LRU list (the value of LRU_INACTIVE_ANON). */
#define LRU_BASE 0
/* Added to an inactive list's index to obtain the matching active list. */
#define LRU_ACTIVE 1
/* Added to an _ANON list's index to obtain the matching _FILE list. */
#define LRU_FILE 2

/*
 * Indices of the LRU lists kept in struct lruvec (lists[NR_LRU_LISTS]).
 * The four evictable values are composed from LRU_BASE, LRU_ACTIVE and
 * LRU_FILE so that active/file variants can be derived by addition.
 */
enum lru_list {
        LRU_INACTIVE_ANON = LRU_BASE,
        LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
        LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
        LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
        /* comes right after LRU_ACTIVE_FILE, so for_each_evictable_lru() stops before it */
        LRU_UNEVICTABLE,
        /* number of LRU lists; must remain the last enumerator */
        NR_LRU_LISTS
};

/*
 * Throttle states used by vmscan (page reclaim).
 * NOTE(review): the meaning of each state is only suggested by its name
 * here; confirm against the users in mm/vmscan.c.
 */
enum vmscan_throttle_state {
        VMSCAN_THROTTLE_WRITEBACK,
        VMSCAN_THROTTLE_ISOLATED,
        VMSCAN_THROTTLE_NOPROGRESS,
        VMSCAN_THROTTLE_CONGESTED,
        /* number of throttle states; must remain the last enumerator */
        NR_VMSCAN_THROTTLE,
};

/*
 * Iterate @lru over every LRU list index, 0 .. NR_LRU_LISTS - 1.
 * @lru must be a modifiable lvalue; it is parenthesized so that expressions
 * such as *ptr are assigned and incremented as a whole.
 */
#define for_each_lru(lru) for ((lru) = 0; (lru) < NR_LRU_LISTS; (lru)++)

/*
 * Iterate @lru over the evictable LRU lists only, 0 .. LRU_ACTIVE_FILE
 * inclusive (LRU_UNEVICTABLE is not visited). Same lvalue requirement as
 * for_each_lru().
 */
#define for_each_evictable_lru(lru) for ((lru) = 0; (lru) <= LRU_ACTIVE_FILE; (lru)++)

/* Return true if @lru is one of the two file LRU lists (inactive or active). */
static inline bool is_file_lru(enum lru_list lru)
{
        switch (lru) {
        case LRU_INACTIVE_FILE:
        case LRU_ACTIVE_FILE:
                return true;
        default:
                return false;
        }
}

/* Return true if @lru is one of the two active LRU lists (anon or file). */
static inline bool is_active_lru(enum lru_list lru)
{
        if (lru == LRU_ACTIVE_ANON)
                return true;

        return lru == LRU_ACTIVE_FILE;
}

/*
 * NOTE(review): WORKINGSET_ANON/WORKINGSET_FILE look like the anon and file
 * indices for arrays sized by ANON_AND_FILE - confirm against their users.
 */
#define WORKINGSET_ANON 0
#define WORKINGSET_FILE 1
/* Number of page types (anon + file); used as an array dimension below. */
#define ANON_AND_FILE 2

enum lruvec_flags {
        /*
         * An lruvec has many dirty pages backed by a congested BDI:
         * 1. LRUVEC_CGROUP_CONGESTED is set by cgroup-level reclaim.
         *    It can be cleared by cgroup reclaim or kswapd.
         * 2. LRUVEC_NODE_CONGESTED is set by kswapd node-level reclaim.
         *    It can only be cleared by kswapd.
         *
         * Essentially, kswapd can unthrottle an lruvec throttled by cgroup
         * reclaim, but not vice versa. This only applies to the root cgroup.
         * The goal is to prevent cgroup reclaim on the root cgroup (e.g.
         * memory.reclaim) to unthrottle an unbalanced node (that was throttled
         * by kswapd).
         */
        LRUVEC_CGROUP_CONGESTED,
        LRUVEC_NODE_CONGESTED,
};

#endif /* !__GENERATING_BOUNDS_H */

/*
 * Evictable pages are divided into multiple generations. The youngest and the
 * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
 * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
 * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
 * corresponding generation. The gen counter in folio->flags stores gen+1 while
 * a page is on one of lrugen->folios[]. Otherwise it stores 0.
 *
 * A page is added to the youngest generation on faulting. The aging needs to
 * check the accessed bit at least twice before handing this page over to the
 * eviction. The first check takes care of the accessed bit set on the initial
 * fault; the second check makes sure this page hasn't been used since then.
 * This process, AKA second chance, requires a minimum of two generations,
 * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
 * LRU, e.g., /proc/vmstat, these two generations are considered active; the
 * rest of generations, if they exist, are considered inactive. See
 * lru_gen_is_active().
 *
 * PG_active is always cleared while a page is on one of lrugen->folios[] so
 * that the aging needs not to worry about it. And it's set again when a page
 * considered active is isolated for non-reclaiming purposes, e.g., migration.
 * See lru_gen_add_folio() and lru_gen_del_folio().
 *
 * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
 * number of categories of the active/inactive LRU when keeping track of
 * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
 * in folio->flags.
 */
/* Smallest generation window: the second-chance scheme needs two generations. */
#define MIN_NR_GENS                2U
/* Largest generation window; also the number of per-generation list slots. */
#define MAX_NR_GENS                4U

/*
 * Each generation is divided into multiple tiers. A page accessed N times
 * through file descriptors is in tier order_base_2(N). A page in the first tier
 * (N=0,1) is marked by PG_referenced unless it was faulted in through page
 * tables or read ahead. A page in any other tier (N>1) is marked by
 * PG_referenced and PG_workingset. This implies a minimum of two tiers is
 * supported without using additional bits in folio->flags.
 *
 * In contrast to moving across generations which requires the LRU lock, moving
 * across tiers only involves atomic operations on folio->flags and therefore
 * has a negligible cost in the buffered access path. In the eviction path,
 * comparisons of refaulted/(evicted+protected) from the first tier and the
 * rest infer whether pages accessed multiple times through file descriptors
 * are statistically hot and thus worth protecting.
 *
 * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
 * number of categories of the active/inactive LRU when keeping track of
 * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
 * folio->flags.
 */
/* Number of tiers each generation is divided into. */
#define MAX_NR_TIERS                4U

#ifndef __GENERATING_BOUNDS_H

/* Forward declarations: only pointers to these types are needed before their definitions. */
struct lruvec;
struct page_vma_mapped_walk;

/* Mask of LRU_GEN_WIDTH bits starting at bit LRU_GEN_PGOFF (the gen counter field). */
#define LRU_GEN_MASK                ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
/* Mask of LRU_REFS_WIDTH bits starting at bit LRU_REFS_PGOFF. */
#define LRU_REFS_MASK                ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)

#ifdef CONFIG_LRU_GEN

/*
 * NOTE(review): presumably the anon/file type indices used by the
 * multi-gen LRU - confirm against the users.
 */
enum {
        LRU_GEN_ANON,
        LRU_GEN_FILE,
};

/*
 * NOTE(review): these look like multi-gen LRU capability identifiers;
 * their exact meaning is not visible here - confirm in the implementation.
 */
enum {
        LRU_GEN_CORE,
        LRU_GEN_MM_WALK,
        LRU_GEN_NONLEAF_YOUNG,
        /* number of entries above; must remain the last enumerator */
        NR_LRU_GEN_CAPS
};

/* Batch size bounds: the maximum is 64 times the minimum (BITS_PER_LONG). */
#define MIN_LRU_BATCH                BITS_PER_LONG
#define MAX_LRU_BATCH                (MIN_LRU_BATCH * 64)

/*
 * whether to keep historical stats from evicted generations:
 * one slot per generation with CONFIG_LRU_GEN_STATS, a single slot otherwise
 */
#ifdef CONFIG_LRU_GEN_STATS
#define NR_HIST_GENS                MAX_NR_GENS
#else
#define NR_HIST_GENS                1U
#endif

/*
 * The youngest generation number is stored in max_seq for both anon and file
 * types as they are aged on an equal footing. The oldest generation numbers are
 * stored in min_seq[] separately for anon and file types as clean file pages
 * can be evicted regardless of swap constraints.
 *
 * Normally anon and file min_seq are in sync. But if swapping is constrained,
 * e.g., out of swap space, file min_seq is allowed to advance and leave anon
 * min_seq behind.
 *
 * The number of pages in each generation is eventually consistent and therefore
 * can be transiently negative when reset_batch_size() is pending.
 */
/*
 * Multi-gen LRU state embedded in struct lruvec as "lrugen".
 * The three-dimensional arrays below are dimensioned
 * [MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES], i.e. generation slot, page
 * type (anon/file) and zone; the statistics arrays are dimensioned by
 * history slot (NR_HIST_GENS), page type and tier.
 */
struct lru_gen_folio {
        /* the aging increments the youngest generation number */
        unsigned long max_seq;
        /* the eviction increments the oldest generation numbers */
        unsigned long min_seq[ANON_AND_FILE];
        /* the birth time of each generation in jiffies */
        unsigned long timestamps[MAX_NR_GENS];
        /* the multi-gen LRU lists, lazily sorted on eviction */
        struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
        /* the multi-gen LRU sizes, eventually consistent (signed: may be transiently negative) */
        long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
        /* the exponential moving average of refaulted */
        unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
        /* the exponential moving average of evicted+protected */
        unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
        /* the first tier doesn't need protection, hence the minus one */
        unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
        /* can be modified without holding the LRU lock */
        atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
        atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
        /* whether the multi-gen LRU is enabled */
        bool enabled;
        /* the memcg generation this lru_gen_folio belongs to */
        u8 gen;
        /* the list segment this lru_gen_folio belongs to */
        u8 seg;
        /* per-node lru_gen_folio list for global reclaim */
        struct hlist_nulls_node list;
};

/*
 * Indices of the mm walk statistics; NR_MM_STATS sizes the stats[] array of
 * struct lru_gen_mm_state and the mm_stats[] array of struct lru_gen_mm_walk.
 */
enum {
        MM_LEAF_TOTAL,                /* total leaf entries */
        MM_LEAF_OLD,                /* old leaf entries */
        MM_LEAF_YOUNG,                /* young leaf entries */
        MM_NONLEAF_TOTAL,        /* total non-leaf entries */
        MM_NONLEAF_FOUND,        /* non-leaf entries found in Bloom filters */
        MM_NONLEAF_ADDED,        /* non-leaf entries added to Bloom filters */
        NR_MM_STATS                /* number of statistics; must remain last */
};

/* double-buffering Bloom filters: two filters that flip after each iteration */
#define NR_BLOOM_FILTERS        2

/*
 * Iteration state over the list of mm_structs for the multi-gen LRU;
 * embedded in struct lruvec as "mm_state" when CONFIG_LRU_GEN_WALKS_MMU is set.
 */
struct lru_gen_mm_state {
        /* synced with max_seq after each iteration */
        unsigned long seq;
        /* where the current iteration continues after */
        struct list_head *head;
        /* where the last iteration ended before */
        struct list_head *tail;
        /* Bloom filters flip after each iteration */
        unsigned long *filters[NR_BLOOM_FILTERS];
        /* the mm stats for debugging, one row per history slot */
        unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
};

/*
 * Working state of one mm walk: results are batched in nr_pages[] and
 * mm_stats[] rather than applied one by one.
 */
struct lru_gen_mm_walk {
        /* the lruvec under reclaim */
        struct lruvec *lruvec;
        /* max_seq from lru_gen_folio: can be out of date */
        unsigned long seq;
        /* the next address within an mm to scan */
        unsigned long next_addr;
        /* to batch promoted pages */
        int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
        /* to batch the mm stats */
        int mm_stats[NR_MM_STATS];
        /* total batched items */
        int batched;
        /* NOTE(review): presumably whether anon pages may be swapped out during this walk - confirm */
        bool can_swap;
        /* NOTE(review): presumably forces scanning regardless of heuristics - confirm */
        bool force_scan;
};

/*
 * For each node, memcgs are divided into two generations: the old and the
 * young. For each generation, memcgs are randomly sharded into multiple bins
 * to improve scalability. For each bin, the hlist_nulls is virtually divided
 * into three segments: the head, the tail and the default.
 *
 * An onlining memcg is added to the tail of a random bin in the old generation.
 * The eviction starts at the head of a random bin in the old generation. The
 * per-node memcg generation counter, whose remainder (mod MEMCG_NR_GENS) indexes
 * the old generation, is incremented when all its bins become empty.
 *
 * There are four operations:
 * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its
 *    current generation (old or young) and updates its "seg" to "head";
 * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its
 *    current generation (old or young) and updates its "seg" to "tail";
 * 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old
 *    generation, updates its "gen" to "old" and resets its "seg" to "default";
 * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the
 *    young generation, updates its "gen" to "young" and resets its "seg" to
 *    "default".
 *
 * The events that trigger the above operations are:
 * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
 * 2. The first attempt to reclaim a memcg below low, which triggers
 *    MEMCG_LRU_TAIL;
 * 3. The first attempt to reclaim a memcg offlined or below reclaimable size
 *    threshold, which triggers MEMCG_LRU_TAIL;
 * 4. The second attempt to reclaim a memcg offlined or below reclaimable size
 *    threshold, which triggers MEMCG_LRU_YOUNG;
 * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG;
 * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
 * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD.
 *
 * Notes:
 * 1. Memcg LRU only applies to global reclaim, and the round-robin incrementing
 *    of their max_seq counters ensures the eventual fairness to all eligible
 *    memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
 * 2. There are only two valid generations: old (seq) and young (seq+1).
 *    MEMCG_NR_GENS is set to three so that when reading the generation counter
 *    locklessly, a stale value (seq-1) does not wraparound to young.
 */
#define MEMCG_NR_GENS        3
#define MEMCG_NR_BINS        8

/*
 * Per-node memcg LRU: memcgs are kept in fifo[generation][bin] lists, with
 * one counter per generation in nr_memcgs[]. All fields are protected by
 * "lock".
 */
struct lru_gen_memcg {
        /* the per-node memcg generation counter */
        unsigned long seq;
        /* each memcg has one lru_gen_folio per node */
        unsigned long nr_memcgs[MEMCG_NR_GENS];
        /* per-node lru_gen_folio list for global reclaim */
        struct hlist_nulls_head        fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
        /* protects the above */
        spinlock_t lock;
};

/*
 * Multi-gen LRU entry points, declared when CONFIG_LRU_GEN is enabled; their
 * definitions are not part of this header. The !CONFIG_LRU_GEN branch
 * provides empty inline stubs with identical signatures.
 */
void lru_gen_init_pgdat(struct pglist_data *pgdat);
void lru_gen_init_lruvec(struct lruvec *lruvec);
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);

/*
 * Hooks taking a memcg.
 * NOTE(review): the names suggest memcg lifecycle events (init/exit,
 * online/offline, release) and soft-limit reclaim on node @nid - confirm
 * against the implementation.
 */
void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
void lru_gen_online_memcg(struct mem_cgroup *memcg);
void lru_gen_offline_memcg(struct mem_cgroup *memcg);
void lru_gen_release_memcg(struct mem_cgroup *memcg);
void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid);

#else /* !CONFIG_LRU_GEN */

/*
 * The multi-gen LRU is compiled out: each entry point is an empty inline
 * stub with the same signature as its CONFIG_LRU_GEN declaration, so code
 * that calls them still builds and the calls do nothing.
 */
static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) { }
static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { }
static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { }

static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { }
static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) { }
static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) { }
static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) { }
static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) { }
static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { }

#endif /* CONFIG_LRU_GEN */

/*
 * A set of LRU lists plus the reclaim bookkeeping that goes with them.
 * Optional members are compiled in by CONFIG_LRU_GEN,
 * CONFIG_LRU_GEN_WALKS_MMU and CONFIG_MEMCG.
 */
struct lruvec {
        /* one list head per enum lru_list value */
        struct list_head                lists[NR_LRU_LISTS];
        /* per lruvec lru_lock for memcg */
        spinlock_t                        lru_lock;
        /*
         * These track the cost of reclaiming one LRU - file or anon -
         * over the other. As the observed cost of reclaiming one LRU
         * increases, the reclaim scan balance tips toward the other.
         */
        unsigned long                        anon_cost;
        unsigned long                        file_cost;
        /* Non-resident age, driven by LRU movement */
        atomic_long_t                        nonresident_age;
        /* Refaults at the time of last reclaim cycle */
        unsigned long                        refaults[ANON_AND_FILE];
        /* Various lruvec state flags (enum lruvec_flags) */
        unsigned long                        flags;
#ifdef CONFIG_LRU_GEN
        /* evictable pages divided into generations */
        struct lru_gen_folio                lrugen;
#ifdef CONFIG_LRU_GEN_WALKS_MMU
        /* to concurrently iterate lru_gen_mm_list */
        struct lru_gen_mm_state                mm_state;
#endif
#endif /* CONFIG_LRU_GEN */
#ifdef CONFIG_MEMCG
        /* NOTE(review): presumably the node this lruvec belongs to - confirm */
        struct pglist_data *pgdat;
#endif
        /* NOTE(review): zswap per-lruvec state; type is defined outside this section */
        struct zswap_lruvec_state zswap_lruvec_state;
};

/*
 * The two macros below name isolate_mode_t ahead of its typedef; that is
 * fine because a macro body is only expanded where the macro is used.
 */
/* Isolate for asynchronous migration */
#define ISOLATE_ASYNC_MIGRATE        ((__force isolate_mode_t)0x4)
/* Isolate unevictable pages */
#define ISOLATE_UNEVICTABLE        ((__force isolate_mode_t)0x8)

/* LRU Isolation modes: a __bitwise type, so the flag values above need a __force cast. */
typedef unsigned __bitwise isolate_mode_t;

/*
 * Indices into zone->_watermark[]; use the *_wmark_pages() macros to read
 * the effective values, which also include zone->watermark_boost.
 */
enum zone_watermarks {
        WMARK_MIN,
        WMARK_LOW,
        WMARK_HIGH,
        WMARK_PROMO,
        NR_WMARK        /* number of watermarks; must remain last */
};

/*
 * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list
 * for THP which will usually be GFP_MOVABLE. Even if it is another type,
 * it should not contribute to serious fragmentation causing THP allocation
 * failures.
 */
/* One extra per-cpu list for THP, only when THP support is configured. */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define NR_PCP_THP 1
#else
#define NR_PCP_THP 0
#endif
/* One list per (PCP migratetype, order) pair for orders 0..PAGE_ALLOC_COSTLY_ORDER. */
#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
/* Total number of lists in struct per_cpu_pages. */
#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)

/*
 * Effective watermarks of zone @z: the stored _watermark[] entry plus the
 * current watermark_boost.
 *
 * @z is a pointer to struct zone and is parenthesized in the expansion so
 * that any pointer expression (e.g. &zones[i] or z + 1) can be passed.
 * Note that @z is evaluated twice, so it must not have side effects.
 * @i in wmark_pages() is an enum zone_watermarks index.
 */
#define min_wmark_pages(z) ((z)->_watermark[WMARK_MIN] + (z)->watermark_boost)
#define low_wmark_pages(z) ((z)->_watermark[WMARK_LOW] + (z)->watermark_boost)
#define high_wmark_pages(z) ((z)->_watermark[WMARK_HIGH] + (z)->watermark_boost)
#define wmark_pages(z, i) ((z)->_watermark[i] + (z)->watermark_boost)

/*
 * Flags used in pcp->flags field.
 *
 * PCPF_PREV_FREE_HIGH_ORDER: a high-order page was freed by the previous
 * page free.  Used to avoid draining the PCP because of a single,
 * accidental high-order page free.
 *
 * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in the PCP before
 * draining it on consecutive high-order page frees without intervening
 * allocation, if the data cache slice of the CPU is large enough.  This
 * reduces zone lock contention and keeps cache-hot pages in reuse.
 */
#define        PCPF_PREV_FREE_HIGH_ORDER        BIT(0)
#define        PCPF_FREE_HIGH_BATCH                BIT(1)

/*
 * Per-CPU page lists of a zone (zone->per_cpu_pageset is a __percpu pointer
 * to this type). The structure is cacheline aligned on SMP builds.
 */
struct per_cpu_pages {
        spinlock_t lock;        /* Protects lists field */
        int count;                /* number of pages in the list */
        int high;                /* high watermark, emptying needed */
        int high_min;                /* min high watermark */
        int high_max;                /* max high watermark */
        int batch;                /* chunk size for buddy add/remove */
        u8 flags;                /* PCPF_* bits; protected by pcp->lock */
        u8 alloc_factor;        /* batch scaling factor during allocate */
#ifdef CONFIG_NUMA
        u8 expire;                /* When 0, remote pagesets are drained */
#endif
        short free_count;        /* consecutive free count */

        /* Lists of pages, one per migrate type stored on the pcp-lists */
        struct list_head lists[NR_PCP_LISTS];
} ____cacheline_aligned_in_smp;

/*
 * Per-CPU zone statistics (zone->per_cpu_zonestats is a __percpu pointer to
 * this type). The structure has no members when neither CONFIG_SMP nor
 * CONFIG_NUMA is set.
 */
struct per_cpu_zonestat {
#ifdef CONFIG_SMP
        /* small signed per-CPU values, one per zone stat item */
        s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
        /* NOTE(review): presumably the bound on a diff before it is folded into the zone counter - confirm */
        s8 stat_threshold;
#endif
#ifdef CONFIG_NUMA
        /*
         * Low priority inaccurate counters that are only folded
         * on demand. Use a large type to avoid the overhead of
         * folding during refresh_cpu_vm_stats.
         */
        unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
#endif
};

/* Per-CPU node statistics: one small signed value per node stat item. */
struct per_cpu_nodestat {
        /* NOTE(review): presumably the bound on a diff before it is folded into the node counter - confirm */
        s8 stat_threshold;
        s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
};

#endif /* !__GENERATING_BOUNDS_H */

/*
 * Zone types. Which enumerators exist depends on the kernel configuration
 * (ZONE_DMA, ZONE_DMA32, ZONE_HIGHMEM and ZONE_DEVICE are optional), so the
 * numeric values are configuration dependent.
 */
enum zone_type {
        /*
         * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
         * to DMA to all of the addressable memory (ZONE_NORMAL).
         * On architectures where this area covers the whole 32 bit address
         * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
         * DMA addressing constraints. This distinction is important as a 32bit
         * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
         * platforms may need both zones as they support peripherals with
         * different DMA addressing limitations.
         */
#ifdef CONFIG_ZONE_DMA
        ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
        ZONE_DMA32,
#endif
        /*
         * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
         * performed on pages in ZONE_NORMAL if the DMA devices support
         * transfers to all addressable memory.
         */
        ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
        /*
         * A memory area that is only addressable by the kernel through
         * mapping portions into its own address space. This is for example
         * used by i386 to allow the kernel to address the memory beyond
         * 900MB. The kernel will set up special mappings (page
         * table entries on i386) for each page that the kernel needs to
         * access.
         */
        ZONE_HIGHMEM,
#endif
        /*
         * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains
         * movable pages with few exceptional cases described below. Main use
         * cases for ZONE_MOVABLE are to make memory offlining/unplug more
         * likely to succeed, and to locally limit unmovable allocations - e.g.,
         * to increase the number of THP/huge pages. Notable special cases are:
         *
         * 1. Pinned pages: (long-term) pinning of movable pages might
         *    essentially turn such pages unmovable. Therefore, we do not allow
         *    pinning long-term pages in ZONE_MOVABLE. When pages are pinned and
         *    faulted, they come from the right zone right away. However, it is
         *    still possible that address space already has pages in
         *    ZONE_MOVABLE at the time when pages are pinned (i.e. user has
         *    touched that memory before pinning). In such case we migrate them
         *    to a different zone. When migration fails - pinning fails.
         * 2. memblock allocations: kernelcore/movablecore setups might create
         *    situations where ZONE_MOVABLE contains unmovable allocations
         *    after boot. Memory offlining and allocations fail early.
         * 3. Memory holes: kernelcore/movablecore setups might create very rare
         *    situations where ZONE_MOVABLE contains memory holes after boot,
         *    for example, if we have sections that are only partially
         *    populated. Memory offlining and allocations fail early.
         * 4. PG_hwpoison pages: while poisoned pages can be skipped during
         *    memory offlining, such pages cannot be allocated.
         * 5. Unmovable PG_offline pages: in paravirtualized environments,
         *    hotplugged memory blocks might only partially be managed by the
         *    buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The
         *    parts not managed by the buddy are unmovable PG_offline pages. In
         *    some cases (virtio-mem), such pages can be skipped during
         *    memory offlining, however, cannot be moved/allocated. These
         *    techniques might use alloc_contig_range() to hide previously
         *    exposed pages from the buddy again (e.g., to implement some sort
         *    of memory unplug in virtio-mem).
         * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create
         *    situations where ZERO_PAGE(0) which is allocated differently
         *    on different platforms may end up in a movable zone. ZERO_PAGE(0)
         *    cannot be migrated.
         * 7. Memory-hotplug: when using memmap_on_memory and onlining the
         *    memory to the MOVABLE zone, the vmemmap pages are also placed in
         *    such zone. Such pages cannot be really moved around as they are
         *    self-stored in the range, but they are treated as movable when
         *    the range they describe is about to be offlined.
         *
         * In general, no unmovable allocations that degrade memory offlining
         * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
         * have to expect that migrating pages in ZONE_MOVABLE can fail (even
         * if has_unmovable_pages() states that there are no unmovable pages,
         * there can be false negatives).
         */
        ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
        ZONE_DEVICE,
#endif
        /* number of zone types present in this configuration; must remain last */
        __MAX_NR_ZONES

};

#ifndef __GENERATING_BOUNDS_H

/* Array dimension for per-mode state, e.g. zone->compact_cached_migrate_pfn[]. */
#define ASYNC_AND_SYNC 2

/*
 * Per-zone state. The members are grouped into read-mostly fields followed
 * by write-intensive groups, separated by CACHELINE_PADDING() markers.
 */
struct zone {
        /* Read-mostly fields */

        /* zone watermarks, access with *_wmark_pages(zone) macros */
        unsigned long _watermark[NR_WMARK];
        /* added to every watermark by the *_wmark_pages() macros */
        unsigned long watermark_boost;

        /* NOTE(review): presumably pages reserved for high-order atomic allocations - confirm */
        unsigned long nr_reserved_highatomic;

        /*
         * We don't know if the memory that we're going to allocate will be
         * freeable or/and it will be released eventually, so to avoid totally
         * wasting several GB of ram we must reserve some of the lower zone
         * memory (otherwise we risk to run OOM on the lower zones despite
         * there being tons of freeable ram on the higher zones).  This array is
         * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
         * changes.
         */
        long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
        /* NOTE(review): presumably the id of the node this zone belongs to - confirm */
        int node;
#endif
        /* NOTE(review): presumably the pglist_data (node) that contains this zone - confirm */
        struct pglist_data        *zone_pgdat;
        /* per-CPU page lists (struct per_cpu_pages) */
        struct per_cpu_pages        __percpu *per_cpu_pageset;
        /* per-CPU statistics (struct per_cpu_zonestat) */
        struct per_cpu_zonestat        __percpu *per_cpu_zonestats;
        /*
         * the high and batch values are copied to individual pagesets for
         * faster access
         */
        int pageset_high_min;
        int pageset_high_max;
        int pageset_batch;

#ifndef CONFIG_SPARSEMEM
        /*
         * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
         * In SPARSEMEM, this map is stored in struct mem_section
         */
        unsigned long                *pageblock_flags;
#endif /* !CONFIG_SPARSEMEM */

        /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
        unsigned long                zone_start_pfn;

        /*
         * spanned_pages is the total pages spanned by the zone, including
         * holes, which is calculated as:
         *         spanned_pages = zone_end_pfn - zone_start_pfn;
         *
         * present_pages is physical pages existing within the zone, which
         * is calculated as:
         *        present_pages = spanned_pages - absent_pages(pages in holes);
         *
         * present_early_pages is present pages existing within the zone
         * located on memory available since early boot, excluding hotplugged
         * memory.
         *
         * managed_pages is present pages managed by the buddy system, which
         * is calculated as (reserved_pages includes pages allocated by the
         * bootmem allocator):
         *        managed_pages = present_pages - reserved_pages;
         *
         * cma pages is present pages that are assigned for CMA use
         * (MIGRATE_CMA).
         *
         * So present_pages may be used by memory hotplug or memory power
         * management logic to figure out unmanaged pages by checking
         * (present_pages - managed_pages). And managed_pages should be used
         * by page allocator and vm scanner to calculate all kinds of watermarks
         * and thresholds.
         *
         * Locking rules:
         *
         * zone_start_pfn and spanned_pages are protected by span_seqlock.
         * It is a seqlock because it has to be read outside of zone->lock,
         * and it is done in the main allocator path.  But, it is written
         * quite infrequently.
         *
         * The span_seq lock is declared along with zone->lock because it is
         * frequently read in proximity to zone->lock.  It's good to
         * give them a chance of being in the same cacheline.
         *
         * Write access to present_pages at runtime should be protected by
         * mem_hotplug_begin/done(). Any reader who can't tolerate drift of
         * present_pages should use get_online_mems() to get a stable value.
         */
        atomic_long_t                managed_pages;
        unsigned long                spanned_pages;
        unsigned long                present_pages;
#if defined(CONFIG_MEMORY_HOTPLUG)
        unsigned long                present_early_pages;
#endif
#ifdef CONFIG_CMA
        unsigned long                cma_pages;
#endif

        /* NOTE(review): presumably a human-readable zone name - confirm */
        const char                *name;

#ifdef CONFIG_MEMORY_ISOLATION
        /*
         * Number of isolated pageblock. It is used to solve incorrect
         * freepage counting problem due to racy retrieving migratetype
         * of pageblock. Protected by zone->lock.
         */
        unsigned long                nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
        /* see spanned/present_pages for more description */
        seqlock_t                span_seqlock;
#endif

        /* read through zone_is_initialized() */
        int initialized;

        /* Write-intensive fields used from the page allocator */
        CACHELINE_PADDING(_pad1_);

        /* free areas of different sizes */
        struct free_area        free_area[NR_PAGE_ORDERS];

#ifdef CONFIG_UNACCEPTED_MEMORY
        /* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */
        struct list_head        unaccepted_pages;
#endif

        /* zone flags, see below */
        unsigned long                flags;

        /* Primarily protects free_area */
        spinlock_t                lock;

        /* Write-intensive fields used by compaction and vmstats. */
        CACHELINE_PADDING(_pad2_);

        /*
         * When free pages are below this point, additional steps are taken
         * when reading the number of free pages to avoid per-cpu counter
         * drift allowing watermarks to be breached
         */
        unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* pfn where compaction free scanner should start */
        unsigned long                compact_cached_free_pfn;
        /* pfn where compaction migration scanner should start */
        unsigned long                compact_cached_migrate_pfn[ASYNC_AND_SYNC];
        unsigned long                compact_init_migrate_pfn;
        unsigned long                compact_init_free_pfn;
#endif

#ifdef CONFIG_COMPACTION
        /*
         * On compaction failure, 1<<compact_defer_shift compactions
         * are skipped before trying again. The number attempted since
         * last failure is tracked with compact_considered.
         * compact_order_failed is the minimum compaction failed order.
         */
        unsigned int                compact_considered;
        unsigned int                compact_defer_shift;
        int                        compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* Set to true when the PG_migrate_skip bits should be cleared */
        bool                        compact_blockskip_flush;
#endif

        /* NOTE(review): presumably set when the zone's pfn range has no holes - confirm */
        bool                        contiguous;

        CACHELINE_PADDING(_pad3_);
        /* Zone statistics */
        atomic_long_t                vm_stat[NR_VM_ZONE_STAT_ITEMS];
        atomic_long_t                vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
} ____cacheline_internodealigned_in_smp;

/*
 * Per-node state flags.
 * NOTE(review): presumably bit numbers in a flags word of struct
 * pglist_data, which is not visible in this section - confirm.
 */
enum pgdat_flags {
        PGDAT_DIRTY,                        /* reclaim scanning has recently found
                                         * many dirty file pages at the tail
                                         * of the LRU.
                                         */
        PGDAT_WRITEBACK,                /* reclaim scanning has recently found
                                         * many pages under writeback
                                         */
        PGDAT_RECLAIM_LOCKED,                /* prevents concurrent reclaim */
};

enum zone_flags {
        ZONE_BOOSTED_WATERMARK,                /* zone recently boosted watermarks.
                                         * Cleared when kswapd is woken.
                                         */
        ZONE_RECLAIM_ACTIVE,                /* kswapd may be scanning the zone. */
        ZONE_BELOW_HIGH,                /* zone is below high watermark. */
};

/* Read the zone's atomic managed_pages counter as an unsigned long. */
static inline unsigned long zone_managed_pages(struct zone *zone)
{
        unsigned long managed;

        managed = (unsigned long)atomic_long_read(&zone->managed_pages);
        return managed;
}

/* Return zone->cma_pages, or 0 when the kernel is built without CONFIG_CMA. */
static inline unsigned long zone_cma_pages(struct zone *zone)
{
#ifndef CONFIG_CMA
        return 0;
#else
        return zone->cma_pages;
#endif
}

/* First PFN past the zone's span: zone_start_pfn + spanned_pages. */
static inline unsigned long zone_end_pfn(const struct zone *zone)
{
        const unsigned long first_pfn = zone->zone_start_pfn;

        return first_pfn + zone->spanned_pages;
}

/* True when @pfn lies in the half-open range [zone_start_pfn, zone_end_pfn). */
static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
{
        if (pfn < zone->zone_start_pfn)
                return false;

        return pfn < zone_end_pfn(zone);
}

/* Report the zone's 'initialized' field as a boolean. */
static inline bool zone_is_initialized(struct zone *zone)
{
        return !!zone->initialized;
}

/* A zone is empty when it spans no pages at all. */
static inline bool zone_is_empty(struct zone *zone)
{
        return !zone->spanned_pages;
}

#ifndef BUILD_VDSO32_64
/*
 * The zone field is never updated after free_area_init_core()
 * sets it, so none of the operations on it need to be atomic.
 */

/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
/*
 * Each *_PGOFF is the bit offset of one field. Fields are packed downwards
 * from the most significant bit of an unsigned long: every offset is the
 * previous field's offset minus this field's width.
 */
#define SECTIONS_PGOFF                ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
#define NODES_PGOFF                (SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF                (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF        (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF                (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
#define LRU_GEN_PGOFF                (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
#define LRU_REFS_PGOFF                (LRU_GEN_PGOFF - LRU_REFS_WIDTH)

/*
 * Define the bit shifts to access each section.  For non-existent
 * sections we define the shift as 0; that plus a 0 mask ensures
 * the compiler will optimise away reference to them.
 */
#define SECTIONS_PGSHIFT        (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
#define NODES_PGSHIFT                (NODES_PGOFF * (NODES_WIDTH != 0))
#define ZONES_PGSHIFT                (ZONES_PGOFF * (ZONES_WIDTH != 0))
#define LAST_CPUPID_PGSHIFT        (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
#define KASAN_TAG_PGSHIFT        (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0))

/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
/* ZONEID_PGOFF is the lower of the two offsets of the fields combined. */
#ifdef NODE_NOT_IN_PAGE_FLAGS
#define ZONEID_SHIFT                (SECTIONS_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF                ((SECTIONS_PGOFF < ZONES_PGOFF) ? \
                                                SECTIONS_PGOFF : ZONES_PGOFF)
#else
#define ZONEID_SHIFT                (NODES_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF                ((NODES_PGOFF < ZONES_PGOFF) ? \
                                                NODES_PGOFF : ZONES_PGOFF)
#endif

/* As with the other *_PGSHIFT values: 0 when the zone ID has no bits. */
#define ZONEID_PGSHIFT                (ZONEID_PGOFF * (ZONEID_SHIFT != 0))

/*
 * Right-aligned masks, to be applied after shifting a field down by its
 * *_PGSHIFT. Note that LAST_CPUPID_MASK and ZONEID_MASK are sized by the
 * corresponding *_SHIFT, while the others are sized by *_WIDTH.
 */
#define ZONES_MASK                ((1UL << ZONES_WIDTH) - 1)
#define NODES_MASK                ((1UL << NODES_WIDTH) - 1)
#define SECTIONS_MASK                ((1UL << SECTIONS_WIDTH) - 1)
#define LAST_CPUPID_MASK        ((1UL << LAST_CPUPID_SHIFT) - 1)
#define KASAN_TAG_MASK                ((1UL << KASAN_TAG_WIDTH) - 1)
#define ZONEID_MASK                ((1UL << ZONEID_SHIFT) - 1)

/*
 * Extract the zone number stored in page->flags: shift the zone field down
 * by ZONES_PGSHIFT and mask it with ZONES_MASK.
 */
static inline enum zone_type page_zonenum(const struct page *page)
{
        unsigned long zone_bits;

        ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);
        zone_bits = page->flags >> ZONES_PGSHIFT;

        return zone_bits & ZONES_MASK;
}

/* Zone number of a folio, taken from its embedded struct page. */
static inline enum zone_type folio_zonenum(const struct folio *folio)
{
        const struct page *page = &folio->page;

        return page_zonenum(page);
}

#ifdef CONFIG_ZONE_DEVICE
static inline bool is_zone_device_page(const struct page *page)
{
        return page_zonenum(page) == ZONE_DEVICE;
}

/*
 * Decide whether two pages may share one sgl or bvec segment as far as
 * ZONE_DEVICE is concerned. Mixing zone device pages with other pages, or
 * zone device pages of different pgmaps, in one segment would make it
 * impossible to find the segment's pgmap without scanning all of it.
 *
 * Returns true if neither page is a zone device page, or if both are and
 * they share the same pgmap; false otherwise.
 */
static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
                                                     const struct page *b)
{
        const bool a_is_device = is_zone_device_page(a);

        /* One device page and one ordinary page never match. */
        if (a_is_device != is_zone_device_page(b))
                return false;

        /* Two ordinary pages always match; device pages need equal pgmaps. */
        return !a_is_device || a->pgmap == b->pgmap;
}

extern void memmap_init_zone_device(struct zone *, unsigned long,
                                    unsigned long, struct dev_pagemap *);
#else
/* !CONFIG_ZONE_DEVICE stub: no page is ever reported as a zone device page. */
static inline bool is_zone_device_page(const struct page *page)
{
        return false;
}
/* !CONFIG_ZONE_DEVICE stub: always reports true for any pair of pages. */
static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
                                                     const struct page *b)
{
        return true;
}
#endif

/* Folio flavour of is_zone_device_page(), applied to the embedded page. */
static inline bool folio_is_zone_device(const struct folio *folio)
{
        const struct page *page = &folio->page;

        return is_zone_device_page(page);
}

static inline bool is_zone_movable_page(const struct page *page)
{
        return page_zonenum(page) == ZONE_MOVABLE;
}

static inline bool folio_is_zone_movable(const struct folio *folio)
{
        return folio_zonenum(folio) == ZONE_MOVABLE;
}
#endif

/*
 * Check whether the PFN range [start_pfn, start_pfn + nr_pages) overlaps
 * the span of @zone. An empty zone intersects nothing.
 */
static inline bool zone_intersects(struct zone *zone,
                unsigned long start_pfn, unsigned long nr_pages)
{
        if (zone_is_empty(zone))
                return false;

        /* Overlap: the range begins before the zone ends and ends after it begins. */
        return start_pfn < zone_end_pfn(zone) &&
               start_pfn + nr_pages > zone->zone_start_pfn;
}

/*
 * The "priority" of VM scanning is how much of the queues we will scan in one
 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
 * queues ("queue_length >> 12") during an aging round.
 */
#define DEF_PRIORITY 12

/* Maximum number of zones on a zonelist */
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)

/*
 * Indices into pglist_data->node_zonelists[]; MAX_ZONELISTS is the number
 * of zonelists each node carries (1 without CONFIG_NUMA, 2 with it).
 */
enum {
        ZONELIST_FALLBACK,        /* zonelist with fallback */
#ifdef CONFIG_NUMA
        /*
         * The NUMA zonelists are doubled because we need zonelists that
         * restrict the allocations to a single node for __GFP_THISNODE.
         */
        ZONELIST_NOFALLBACK,        /* zonelist without fallback (__GFP_THISNODE) */
#endif
        MAX_ZONELISTS
};

/*
 * This struct contains information about a zone in a zonelist. It is stored
 * here to avoid dereferences into large structures and lookups of tables.
 *
 * The zonelist iterators in this file stop at the first entry whose zone
 * pointer is NULL.
 */
struct zoneref {
        struct zone *zone;        /* Pointer to actual zone */
        int zone_idx;                /* zone_idx(zoneref->zone) */
};

/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone()        - Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx()        - Return the index of the zone for an entry
 * zonelist_node_idx()        - Return the index of the node for an entry
 */
struct zonelist {
        /*
         * One slot more than the maximum number of zones.
         * NOTE(review): presumably the extra slot holds the NULL-zone entry
         * that terminates iteration - confirm against the zonelist builder.
         */
        struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};

/*
 * The array of struct pages for flatmem.
 * It must be declared for SPARSEMEM as well because there are configurations
 * that rely on that.
 */
extern struct page *mem_map;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Deferred-split queue state; embedded in struct pglist_data as
 * deferred_split_queue when CONFIG_TRANSPARENT_HUGEPAGE is enabled.
 * NOTE(review): presumably split_queue_lock protects split_queue and
 * split_queue_len - confirm against the users of this struct.
 */
struct deferred_split {
        spinlock_t split_queue_lock;
        struct list_head split_queue;
        unsigned long split_queue_len;        /* NOTE(review): presumably the number of entries on split_queue */
};
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Per NUMA node memory failure handling statistics.
 */
struct memory_failure_stats {
        /*
         * Number of raw pages poisoned.
         * Cases not accounted: memory outside kernel control, offline page,
         * arch-specific memory_failure (SGX), hwpoison_filter() filtered
         * error events, and unpoison actions from hwpoison_unpoison.
         */
        unsigned long total;
        /*
         * Recovery results of poisoned raw pages handled by memory_failure,
         * in sync with mf_result.
         * total = ignored + failed + delayed + recovered.
         * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted.
         */
        unsigned long ignored;
        unsigned long failed;
        unsigned long delayed;
        unsigned long recovered;
};
#endif

/*
 * On NUMA machines, each NUMA node would have a pg_data_t to describe
 * its memory layout. On UMA machines there is a single pglist_data which
 * describes the whole memory.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 */
typedef struct pglist_data {
        /*
         * node_zones contains just the zones for THIS node. Not all of the
         * zones may be populated, but it is the full list. It is referenced by
         * this node's node_zonelists as well as other node's node_zonelists.
         */
        struct zone node_zones[MAX_NR_ZONES];

        /*
         * node_zonelists contains references to all zones in all nodes.
         * Generally the first zones will be references to this node's
         * node_zones.
         */
        struct zonelist node_zonelists[MAX_ZONELISTS];

        int nr_zones; /* number of populated zones in this node */
#ifdef CONFIG_FLATMEM        /* means !SPARSEMEM */
        struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
        struct page_ext *node_page_ext;
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
        /*
         * Must be held any time you expect node_start_pfn,
         * node_present_pages, node_spanned_pages or nr_zones to stay constant.
         * Also synchronizes pgdat->first_deferred_pfn during deferred page
         * init.
         *
         * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
         * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
         * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
         *
         * Nests above zone->lock and zone->span_seqlock
         */
        spinlock_t node_size_lock;
#endif
        /* Node span: pgdat_end_pfn() is node_start_pfn + node_spanned_pages. */
        unsigned long node_start_pfn;
        unsigned long node_present_pages; /* total number of physical pages */
        unsigned long node_spanned_pages; /* total size of physical page
                                             range, including holes */
        int node_id;
        wait_queue_head_t kswapd_wait;
        wait_queue_head_t pfmemalloc_wait;

        /* waitqueues for throttling reclaim for different reasons. */
        wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE];

        atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */
        unsigned long nr_reclaim_start;        /* nr pages written while throttled
                                         * when throttling started. */
#ifdef CONFIG_MEMORY_HOTPLUG
        struct mutex kswapd_lock;
#endif
        /* Note: kswapd_lock only exists when CONFIG_MEMORY_HOTPLUG is set. */
        struct task_struct *kswapd;        /* Protected by kswapd_lock */
        int kswapd_order;
        enum zone_type kswapd_highest_zoneidx;

        int kswapd_failures;                /* Number of 'reclaimed == 0' runs */

#ifdef CONFIG_COMPACTION
        int kcompactd_max_order;
        enum zone_type kcompactd_highest_zoneidx;
        wait_queue_head_t kcompactd_wait;
        struct task_struct *kcompactd;
        bool proactive_compact_trigger;
#endif
        /*
         * This is a per-node reserve of pages that are not available
         * to userspace allocations.
         */
        unsigned long                totalreserve_pages;

#ifdef CONFIG_NUMA
        /*
         * node reclaim becomes active if more unmapped pages exist.
         */
        unsigned long                min_unmapped_pages;
        unsigned long                min_slab_pages;
#endif /* CONFIG_NUMA */

        /* Write-intensive fields used by page reclaim */
        CACHELINE_PADDING(_pad1_);

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
        /*
         * If memory initialisation on large machines is deferred then this
         * is the first PFN that needs to be initialised.
         */
        unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        struct deferred_split deferred_split_queue;
#endif

#ifdef CONFIG_NUMA_BALANCING
        /* start time in ms of current promote rate limit period */
        unsigned int nbp_rl_start;
        /* number of promote candidate pages at start time of current rate limit period */
        unsigned long nbp_rl_nr_cand;
        /* promote threshold in ms */
        unsigned int nbp_threshold;
        /* start time in ms of current promote threshold adjustment period */
        unsigned int nbp_th_start;
        /*
         * number of promote candidate pages at start time of current promote
         * threshold adjustment period
         */
        unsigned long nbp_th_nr_cand;
#endif
        /* Fields commonly accessed by the page reclaim scanner */

        /*
         * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
         *
         * Use mem_cgroup_lruvec() to look up lruvecs.
         */
        struct lruvec                __lruvec;

        /* NOTE(review): presumably holds enum pgdat_flags bits - confirm. */
        unsigned long                flags;

#ifdef CONFIG_LRU_GEN
        /* kswap mm walk data */
        struct lru_gen_mm_walk mm_walk;
        /* lru_gen_folio list */
        struct lru_gen_memcg memcg_lru;
#endif

        CACHELINE_PADDING(_pad2_);

        /* Per-node vmstats */
        struct per_cpu_nodestat __percpu *per_cpu_nodestats;
        atomic_long_t                vm_stat[NR_VM_NODE_STAT_ITEMS];
#ifdef CONFIG_NUMA
        struct memory_tier __rcu *memtier;
#endif
#ifdef CONFIG_MEMORY_FAILURE
        struct memory_failure_stats mf_stats;
#endif
} pg_data_t;

#define node_present_pages(nid)        (NODE_DATA(nid)->node_present_pages)
#define node_spanned_pages(nid)        (NODE_DATA(nid)->node_spanned_pages)

#define node_start_pfn(nid)        (NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))

/* First PFN past the node's span: node_start_pfn + node_spanned_pages. */
static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
{
        const unsigned long first_pfn = pgdat->node_start_pfn;

        return first_pfn + pgdat->node_spanned_pages;
}

#include <linux/memory_hotplug.h>

void build_all_zonelists(pg_data_t *pgdat);
void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
                   enum zone_type highest_zoneidx);
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                         int highest_zoneidx, unsigned int alloc_flags,
                         long free_pages);
bool zone_watermark_ok(struct zone *z, unsigned int order,
                unsigned long mark, int highest_zoneidx,
                unsigned int alloc_flags);
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
                unsigned long mark, int highest_zoneidx);
/*
 * Memory initialization context, use to differentiate memory added by
 * the platform statically or via memory hotplug interface.
 */
enum meminit_context {
        MEMINIT_EARLY,
        MEMINIT_HOTPLUG,
};

extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
                                     unsigned long size);

extern void lruvec_init(struct lruvec *lruvec);

/*
 * Map an lruvec back to its node. Without CONFIG_MEMCG the lruvec is taken
 * to be the __lruvec member embedded in struct pglist_data; with
 * CONFIG_MEMCG the lruvec's own pgdat pointer is returned.
 */
static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
{
#ifndef CONFIG_MEMCG
        return container_of(lruvec, struct pglist_data, __lruvec);
#else
        return lruvec->pgdat;
#endif
}

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
int local_memory_node(int node_id);
#else
/*
 * Without CONFIG_HAVE_MEMORYLESS_NODES this is the identity mapping:
 * @node_id is returned unchanged.
 */
static inline int local_memory_node(int node_id)
{
        return node_id;
}
#endif

/*
 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
 */
#define zone_idx(zone)                ((zone) - (zone)->zone_pgdat->node_zones)

/*
 * True when @zone is its node's ZONE_DEVICE zone. Always false when the
 * kernel is built without CONFIG_ZONE_DEVICE.
 */
static inline bool zone_is_zone_device(struct zone *zone)
{
#ifdef CONFIG_ZONE_DEVICE
        return zone_idx(zone) == ZONE_DEVICE;
#else
        return false;
#endif
}

/*
 * Tell whether the buddy allocator manages any pages in this zone.
 * Reclaim decisions must be based on this helper and not on
 * populated_zone(): a zone that is entirely reserved is populated
 * but not managed.
 */
static inline bool managed_zone(struct zone *zone)
{
        return zone_managed_pages(zone) != 0;
}

/* True when the zone has memory, i.e. its present_pages count is non-zero. */
static inline bool populated_zone(struct zone *zone)
{
        return zone->present_pages != 0;
}

/* Node id recorded in the zone; always 0 without CONFIG_NUMA. */
static inline int zone_to_nid(struct zone *zone)
{
#ifdef CONFIG_NUMA
        return zone->node;
#else
        return 0;
#endif
}

/* Record @nid in the zone; a no-op without CONFIG_NUMA. */
static inline void zone_set_nid(struct zone *zone, int nid)
{
#ifdef CONFIG_NUMA
        zone->node = nid;
#endif
}

extern int movable_zone;

/*
 * Return 1 if zone index @idx denotes highmem, 0 otherwise. With
 * CONFIG_HIGHMEM that is ZONE_HIGHMEM itself, or ZONE_MOVABLE while
 * movable_zone equals ZONE_HIGHMEM. Without CONFIG_HIGHMEM it is never 1.
 */
static inline int is_highmem_idx(enum zone_type idx)
{
#ifdef CONFIG_HIGHMEM
        if (idx == ZONE_HIGHMEM)
                return 1;

        return idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM;
#else
        return 0;
#endif
}

/**
 * is_highmem - helper function to quickly check if a struct zone is a
 *              highmem zone or not.  This is an attempt to keep references
 *              to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
 * @zone: pointer to struct zone variable
 * Return: 1 for a highmem zone, 0 otherwise
 */
static inline int is_highmem(struct zone *zone)
{
        return is_highmem_idx(zone_idx(zone));
}

#ifdef CONFIG_ZONE_DMA
bool has_managed_dma(void);
#else
/* !CONFIG_ZONE_DMA stub: always reports false. */
static inline bool has_managed_dma(void)
{
        return false;
}
#endif


#ifndef CONFIG_NUMA

/* The single pglist_data used when the kernel is built without CONFIG_NUMA. */
extern struct pglist_data contig_page_data;
/* !CONFIG_NUMA: @nid is ignored; every node id maps to contig_page_data. */
static inline struct pglist_data *NODE_DATA(int nid)
{
        return &contig_page_data;
}

#else /* CONFIG_NUMA */

#include <asm/mmzone.h>

#endif /* !CONFIG_NUMA */

extern struct pglist_data *first_online_pgdat(void);
extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
extern struct zone *next_zone(struct zone *zone);

/**
 * for_each_online_pgdat - helper macro to iterate over all online nodes
 * @pgdat: pointer to a pg_data_t variable
 */
#define for_each_online_pgdat(pgdat)                        \
        for (pgdat = first_online_pgdat();                \
             pgdat;                                        \
             pgdat = next_online_pgdat(pgdat))
/**
 * for_each_zone - helper macro to iterate over all memory zones
 * @zone: pointer to struct zone variable
 *
 * The user only needs to declare the zone variable, for_each_zone
 * fills it in.
 */
#define for_each_zone(zone)                                \
        for (zone = (first_online_pgdat())->node_zones; \
             zone;                                        \
             zone = next_zone(zone))

/**
 * for_each_populated_zone - iterate over all zones for which populated_zone() is true
 * @zone: pointer to struct zone variable
 *
 * Walks the same zones as for_each_zone() but runs the loop body only for
 * zones that pass populated_zone(). The trailing if/else makes the
 * statement that follows the macro the body of the 'else' branch.
 */
#define for_each_populated_zone(zone)                        \
        for (zone = (first_online_pgdat())->node_zones; \
             zone;                                        \
             zone = next_zone(zone))                        \
                if (!populated_zone(zone))                \
                        ; /* do nothing */                \
                else

/* Return the struct zone pointer stored in a zonelist entry. */
static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
        struct zone *zone = zoneref->zone;

        return zone;
}

/* Return the zone index cached in a zonelist entry. */
static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
        int idx = zoneref->zone_idx;

        return idx;
}

/* Return the node id of the zone a zonelist entry points at. */
static inline int zonelist_node_idx(struct zoneref *zoneref)
{
        struct zone *zone = zoneref->zone;

        return zone_to_nid(zone);
}

struct zoneref *__next_zones_zonelist(struct zoneref *z,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes);

/**
 * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
 * @z: The cursor used as a starting point for the search
 * @highest_zoneidx: The zone index of the highest zone to return
 * @nodes: An optional nodemask to filter the zonelist with
 *
 * Starting at cursor @z, find the next zone whose index does not exceed
 * @highest_zoneidx and which lies within the allowed nodemask. The returned
 * zoneref is itself a cursor denoting the zone currently being examined;
 * advance it by one before calling next_zones_zonelist again.
 *
 * Return: the next zone at or below highest_zoneidx within the allowed
 * nodemask using a cursor within a zonelist as a starting point
 */
static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes)
{
        /* Slow path: a nodemask was given, or the cursor's zone is too high. */
        if (unlikely(nodes || zonelist_zone_idx(z) > highest_zoneidx))
                return __next_zones_zonelist(z, highest_zoneidx, nodes);

        return z;
}

/**
 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
 * @zonelist: The zonelist to search for a suitable zone
 * @highest_zoneidx: The zone index of the highest zone to return
 * @nodes: An optional nodemask to filter the zonelist with
 *
 * Find the first zone in @zonelist whose index does not exceed
 * @highest_zoneidx and which lies within the allowed nodemask. The returned
 * zoneref is a cursor; to keep iterating, advance it by one and pass it to
 * next_zones_zonelist.
 *
 * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is
 * never NULL). This may happen either genuinely, or due to concurrent nodemask
 * update due to cpuset modification.
 *
 * Return: Zoneref pointer for the first suitable zone found
 */
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes)
{
        struct zoneref *start = zonelist->_zonerefs;

        return next_zones_zonelist(start, highest_zoneidx, nodes);
}

/**
 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->_zonerefs being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 * @nodemask: Nodemask allowed by the allocator
 *
 * This iterator iterates through all zones at or below a given zone index and
 * within a given nodemask. The loop ends at the first entry whose zone
 * pointer is NULL.
 */
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
        for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z);        \
                zone;                                                        \
                z = next_zones_zonelist(++z, highidx, nodemask),        \
                        zone = zonelist_zone(z))

/**
 * for_next_zone_zonelist_nodemask - helper macro to continue iterating a zonelist from an existing cursor
 * @zone: The current zone in the iterator
 * @z: The cursor within the zonelist; iteration starts at the zone it already refers to
 * @highidx: The zone index of the highest zone to return
 * @nodemask: Nodemask allowed by the allocator
 *
 * Unlike for_each_zone_zonelist_nodemask, this does not look up the first
 * zone: the first iteration uses z->zone as-is, and only the following
 * steps go through next_zones_zonelist.
 */
#define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \
        for (zone = z->zone;        \
                zone;                                                        \
                z = next_zones_zonelist(++z, highidx, nodemask),        \
                        zone = zonelist_zone(z))


/**
 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->_zonerefs being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 *
 * This iterator iterates through all zones at or below a given zone index.
 * It is for_each_zone_zonelist_nodemask with a NULL nodemask.
 */
#define for_each_zone_zonelist(zone, z, zlist, highidx) \
        for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)

/*
 * Whether the 'nodes' are all movable nodes, i.e. none of them offers a
 * zone at or below ZONE_NORMAL. An empty nodemask yields false.
 */
static inline bool movable_only_nodes(nodemask_t *nodes)
{
        struct zonelist *fallback;
        struct zoneref *first;

        if (nodes_empty(*nodes))
                return false;

        /*
         * Any node of the mask will do for picking a zonelist, since the
         * zonelists are interlinked. All that matters is whether at least
         * one zone can satisfy kernel allocations.
         */
        fallback = &NODE_DATA(first_node(*nodes))->node_zonelists[ZONELIST_FALLBACK];
        first = first_zones_zonelist(fallback, ZONE_NORMAL, nodes);

        return !first->zone;
}


#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
#endif

#ifdef CONFIG_FLATMEM
#define pfn_to_nid(pfn)                (0)
#endif

#ifdef CONFIG_SPARSEMEM

/*
 * PA_SECTION_SHIFT                physical address to/from section number
 * PFN_SECTION_SHIFT                pfn to/from section number
 */
#define PA_SECTION_SHIFT        (SECTION_SIZE_BITS)
#define PFN_SECTION_SHIFT        (SECTION_SIZE_BITS - PAGE_SHIFT)

#define NR_MEM_SECTIONS                (1UL << SECTIONS_SHIFT)

#define PAGES_PER_SECTION       (1UL << PFN_SECTION_SHIFT)
#define PAGE_SECTION_MASK        (~(PAGES_PER_SECTION-1))

#define SECTION_BLOCKFLAGS_BITS \
        ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)

#if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
#error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE
#endif

/* Convert a PFN to the number of the memory section containing it. */
static inline unsigned long pfn_to_section_nr(unsigned long pfn)
{
        const unsigned long section_nr = pfn >> PFN_SECTION_SHIFT;

        return section_nr;
}
/* Convert a memory section number to the first PFN of that section. */
static inline unsigned long section_nr_to_pfn(unsigned long sec)
{
        const unsigned long first_pfn = sec << PFN_SECTION_SHIFT;

        return first_pfn;
}

#define SECTION_ALIGN_UP(pfn)        (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
#define SECTION_ALIGN_DOWN(pfn)        ((pfn) & PAGE_SECTION_MASK)

#define SUBSECTION_SHIFT 21
#define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT)

#define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
#define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
#define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1))

#if SUBSECTION_SHIFT > SECTION_SIZE_BITS
#error Subsection size exceeds section size
#else
#define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
#endif

#define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
#define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)

/*
 * Per-section usage data, reached through mem_section->usage.
 * pageblock_flags is a trailing variable-length array, so the real
 * allocation size is not sizeof(struct mem_section_usage).
 */
struct mem_section_usage {
        /* NOTE(review): presumably used to defer freeing past RCU readers - confirm. */
        struct rcu_head rcu;
#ifdef CONFIG_SPARSEMEM_VMEMMAP
        /* One bit per subsection; tested by pfn_section_valid(). */
        DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
#endif
        /* See declaration of similar field in struct zone */
        unsigned long pageblock_flags[0];
};

void subsection_map_init(unsigned long pfn, unsigned long nr_pages);

struct page;
struct page_ext;
/* Descriptor of one SPARSEMEM memory section. */
struct mem_section {
        /*
         * This is, logically, a pointer to an array of struct
         * pages.  However, it is stored with some other magic.
         * (see sparse.c::sparse_init_one_section())
         *
         * Additionally during early boot we encode node id of
         * the location of the section here to guide allocation.
         * (see sparse.c::memory_present())
         *
         * Making it a UL at least makes someone do a cast
         * before using it wrong.
         */
        unsigned long section_mem_map;

        /* Usage data for this section; supplies the pageblock flags (usemap). */
        struct mem_section_usage *usage;
#ifdef CONFIG_PAGE_EXTENSION
        /*
         * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
         * section. (see page_ext.h about this.)
         */
        struct page_ext *page_ext;
        /* NOTE(review): presumably keeps the struct size a power of 2 - confirm. */
        unsigned long pad;
#endif
        /*
         * WARNING: mem_section must be a power-of-2 in size for the
         * calculation and use of SECTION_ROOT_MASK to make sense.
         */
};

#ifdef CONFIG_SPARSEMEM_EXTREME
#define SECTIONS_PER_ROOT       (PAGE_SIZE / sizeof (struct mem_section))
#else
#define SECTIONS_PER_ROOT        1
#endif

#define SECTION_NR_TO_ROOT(sec)        ((sec) / SECTIONS_PER_ROOT)
#define NR_SECTION_ROOTS        DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
#define SECTION_ROOT_MASK        (SECTIONS_PER_ROOT - 1)

#ifdef CONFIG_SPARSEMEM_EXTREME
extern struct mem_section **mem_section;
#else
extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
#endif

static inline unsigned long *section_to_usemap(struct mem_section *ms)
{
        return ms->usage->pageblock_flags;
}

/*
 * Look up the mem_section for section number @nr.
 *
 * Returns NULL when @nr maps to a root beyond NR_SECTION_ROOTS or, with
 * CONFIG_SPARSEMEM_EXTREME, when the root table or the selected root is
 * not set.
 */
static inline struct mem_section *__nr_to_section(unsigned long nr)
{
        const unsigned long root = SECTION_NR_TO_ROOT(nr);

        if (unlikely(root >= NR_SECTION_ROOTS))
                return NULL;

#ifdef CONFIG_SPARSEMEM_EXTREME
        if (!mem_section)
                return NULL;
        if (!mem_section[root])
                return NULL;
#endif
        /* The low bits of @nr select the entry within the root. */
        return mem_section[root] + (nr & SECTION_ROOT_MASK);
}
extern size_t mem_section_usage_size(void);

/*
 * We use the lower bits of the mem_map pointer to store
 * a little bit of information.  The pointer is calculated
 * as mem_map - section_nr_to_pfn(pnum).  The result is
 * aligned to the minimum alignment of the two values:
 *   1. All mem_map arrays are page-aligned.
 *   2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
 *      lowest bits.  PFN_SECTION_SHIFT is arch-specific
 *      (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
 *      worst combination is powerpc with 256k pages,
 *      which results in PFN_SECTION_SHIFT equal 6.
 * To sum it up, at least 6 bits are available on all architectures.
 * However, we can exceed 6 bits on some other architectures except
 * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available
 * with the worst case of 64K pages on arm64) if we make sure the
 * exceeded bit is not applicable to powerpc.
 */
/* Bit numbers of the flags kept in the low bits of mem_section->section_mem_map. */
enum {
        SECTION_MARKED_PRESENT_BIT,        /* tested by present_section() */
        SECTION_HAS_MEM_MAP_BIT,        /* tested by valid_section() */
        SECTION_IS_ONLINE_BIT,                /* tested by online_section() */
        SECTION_IS_EARLY_BIT,                /* tested by early_section() */
#ifdef CONFIG_ZONE_DEVICE
        SECTION_TAINT_ZONE_DEVICE_BIT,        /* tested by online_device_section() */
#endif
        /* Number of flag bits; defines SECTION_MAP_MASK and SECTION_NID_SHIFT. */
        SECTION_MAP_LAST_BIT,
};

#define SECTION_MARKED_PRESENT                BIT(SECTION_MARKED_PRESENT_BIT)
#define SECTION_HAS_MEM_MAP                BIT(SECTION_HAS_MEM_MAP_BIT)
#define SECTION_IS_ONLINE                BIT(SECTION_IS_ONLINE_BIT)
#define SECTION_IS_EARLY                BIT(SECTION_IS_EARLY_BIT)
#ifdef CONFIG_ZONE_DEVICE
#define SECTION_TAINT_ZONE_DEVICE        BIT(SECTION_TAINT_ZONE_DEVICE_BIT)
#endif
#define SECTION_MAP_MASK                (~(BIT(SECTION_MAP_LAST_BIT) - 1))
#define SECTION_NID_SHIFT                SECTION_MAP_LAST_BIT

/*
 * Decode section_mem_map into a struct page pointer by masking off the
 * flag bits below SECTION_MAP_LAST_BIT.
 */
static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
        return (struct page *)(section->section_mem_map & SECTION_MAP_MASK);
}

/* 1 if @section is non-NULL and has SECTION_MARKED_PRESENT set, else 0. */
static inline int present_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return !!(section->section_mem_map & SECTION_MARKED_PRESENT);
}

/* present_section() for the section with number @nr. */
static inline int present_section_nr(unsigned long nr)
{
        struct mem_section *ms = __nr_to_section(nr);

        return present_section(ms);
}

/* 1 if @section is non-NULL and has SECTION_HAS_MEM_MAP set, else 0. */
static inline int valid_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return !!(section->section_mem_map & SECTION_HAS_MEM_MAP);
}

/* 1 if @section is non-NULL and has SECTION_IS_EARLY set, else 0. */
static inline int early_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return !!(section->section_mem_map & SECTION_IS_EARLY);
}

/* valid_section() for the section with number @nr. */
static inline int valid_section_nr(unsigned long nr)
{
        struct mem_section *ms = __nr_to_section(nr);

        return valid_section(ms);
}

/* 1 if @section is non-NULL and has SECTION_IS_ONLINE set, else 0. */
static inline int online_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return !!(section->section_mem_map & SECTION_IS_ONLINE);
}

/*
 * 1 if @section is non-NULL and has both SECTION_IS_ONLINE and
 * SECTION_TAINT_ZONE_DEVICE set, else 0. Always 0 when the kernel is built
 * without CONFIG_ZONE_DEVICE.
 */
static inline int online_device_section(struct mem_section *section)
{
#ifdef CONFIG_ZONE_DEVICE
        const unsigned long wanted = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;

        if (!section)
                return 0;

        return (section->section_mem_map & wanted) == wanted;
#else
        return 0;
#endif
}

/* online_section() for the section with number @nr. */
static inline int online_section_nr(unsigned long nr)
{
        struct mem_section *ms = __nr_to_section(nr);

        return online_section(ms);
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Prototypes only; the definitions live outside this header.
 * NOTE(review): presumably these set/clear SECTION_IS_ONLINE for the sections
 * covering the given PFN range - confirm against the implementation.
 */
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
#endif

/* Map @pfn to its section number, then return that section's descriptor. */
static inline struct mem_section *__pfn_to_section(unsigned long pfn)
{
        unsigned long nr = pfn_to_section_nr(pfn);

        return __nr_to_section(nr);
}

/* Upper bound (inclusive) used by next_present_section_nr() when scanning. */
extern unsigned long __highest_present_section_nr;

/*
 * Index of the subsection containing @pfn within its section: the PFN's
 * offset inside the section divided by PAGES_PER_SUBSECTION.
 */
static inline int subsection_map_index(unsigned long pfn)
{
        unsigned long offset_in_section = pfn & ~(PAGE_SECTION_MASK);

        return offset_in_section / PAGES_PER_SUBSECTION;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * pfn_section_valid - test the subsection bit covering @pfn
 * @ms:  section descriptor that contains @pfn
 * @pfn: page frame number to test
 *
 * ms->usage is sampled exactly once with READ_ONCE() because it may be
 * changed concurrently.  A NULL snapshot is reported as "not valid" rather
 * than dereferenced.
 * NOTE(review): NULL is expected while the section is being torn down by
 * memory hot-remove - confirm against the section deactivation path.
 *
 * Return: non-zero if the subsection bit is set, 0 if it is clear or the
 * usage pointer is NULL.
 */
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
        int idx = subsection_map_index(pfn);
        struct mem_section_usage *usage = READ_ONCE(ms->usage);

        return usage ? test_bit(idx, usage->subsection_map) : 0;
}
#else
/* Without SPARSEMEM_VMEMMAP there is no subsection map: always valid. */
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
        return 1;
}
#endif

#ifndef CONFIG_HAVE_ARCH_PFN_VALID
/**
 * pfn_valid - check if there is a valid memory map entry for a PFN
 * @pfn: the page frame number to check
 *
 * Reports whether a memory map entry (struct page) exists for @pfn.
 * Having a memory map entry does not mean usable memory sits behind
 * @pfn: the struct page may describe a hole or an unusable page frame.
 *
 * Return: 1 for PFNs that have memory map entries and 0 otherwise
 */
static inline int pfn_valid(unsigned long pfn)
{
        struct mem_section *ms;
        int valid;

        /*
         * A PFN must survive the round trip through a physical address.
         * If upper bits are set the round trip loses them, and without
         * this check the remaining low bits could alias a valid PFN.
         */
        if (PHYS_PFN(PFN_PHYS(pfn)) != pfn)
                return 0;

        if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
                return 0;

        ms = __pfn_to_section(pfn);

        rcu_read_lock_sched();
        /*
         * The section needs a mem_map at all; beyond that, early sections
         * have traditionally been valid across their whole section-sized
         * span, while other sections are checked per subsection.
         */
        valid = valid_section(ms) &&
                (early_section(ms) || pfn_section_valid(ms, pfn));
        rcu_read_unlock_sched();

        return valid;
}
#endif

/*
 * Return 1 if @pfn falls in a section marked present, 0 otherwise
 * (including when its section number is out of range).
 */
static inline int pfn_in_present_section(unsigned long pfn)
{
        int in_range = pfn_to_section_nr(pfn) < NR_MEM_SECTIONS;

        return in_range ? present_section(__pfn_to_section(pfn)) : 0;
}

/*
 * Find the first present section with a number greater than @section_nr
 * and not above __highest_present_section_nr.  Returns that section number,
 * or -1 (converted to unsigned long) when there is none.
 */
static inline unsigned long next_present_section_nr(unsigned long section_nr)
{
        unsigned long nr;

        for (nr = section_nr + 1; nr <= __highest_present_section_nr; nr++) {
                if (present_section_nr(nr))
                        return nr;
        }

        return -1;
}

/*
 * These are _only_ used during initialisation, therefore they
 * can use __initdata ...  They could have names to indicate
 * this restriction.
 */
/*
 * pfn_to_nid(pfn): with CONFIG_NUMA, evaluate @pfn once into a local and
 * return the node id of its struct page (page_to_nid(pfn_to_page())).
 * Without CONFIG_NUMA it is the constant 0 and @pfn is not evaluated.
 */
#ifdef CONFIG_NUMA
#define pfn_to_nid(pfn)                                                        \
({                                                                        \
        unsigned long __pfn_to_nid_pfn = (pfn);                                \
        page_to_nid(pfn_to_page(__pfn_to_nid_pfn));                        \
})
#else
#define pfn_to_nid(pfn)                (0)
#endif

/* Prototype only; defined outside this header. */
void sparse_init(void);
#else
/*
 * Non-SPARSEMEM stubs: the init hooks expand to empty statements and
 * pfn_in_present_section is an alias for pfn_valid.
 */
#define sparse_init()        do {} while (0)
#define sparse_index_init(_sec, _nid)  do {} while (0)
#define pfn_in_present_section pfn_valid
#define subsection_map_init(_pfn, _nr_pages) do {} while (0)
#endif /* CONFIG_SPARSEMEM */

#endif /* !__GENERATING_BOUNDS.H */
#endif /* !__ASSEMBLY__ */
#endif /* _LINUX_MMZONE_H */














    3 




















    3 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
// SPDX-License-Identifier: GPL-2.0
#include <linux/err.h>
#include <linux/mm.h>
#include <asm/current.h>
#include <asm/traps.h>
#include <asm/vdso.h>

/*
 * One vDSO exception table entry.  Both members are offsets that
 * fixup_vdso_exception() adds to the extable base address.
 */
struct vdso_exception_table_entry {
        int insn;       /* offset of the instruction that may fault */
        int fixup;      /* offset of the code to resume at instead */
};

/*
 * fixup_vdso_exception - redirect a fault on a vDSO instruction to its fixup
 * @regs:       register state at the time of the exception
 * @trapnr:     trap number of the exception
 * @error_code: error code of the exception
 * @fault_addr: faulting address to report
 *
 * Scans the current task's vDSO exception table for an entry whose
 * instruction address equals regs->ip.  On a match, regs->ip is pointed at
 * the entry's fixup address and @trapnr, @error_code and @fault_addr are
 * stored in regs->di, regs->si and regs->dx.
 *
 * Return: true if a fixup was applied, false otherwise.
 */
bool fixup_vdso_exception(struct pt_regs *regs, int trapnr,
                          unsigned long error_code, unsigned long fault_addr)
{
        const struct vdso_image *image = current->mm->context.vdso_image;
        const struct vdso_exception_table_entry *extable;
        unsigned int nr_entries, i;
        unsigned long base;

        /*
         * #DB and #BP are never fixed up: there is no way to tell whether
         * they originated inside an SGX enclave, and SGX enclaves are
         * currently the only user of vDSO fixup.
         */
        switch (trapnr) {
        case X86_TRAP_DB:
        case X86_TRAP_BP:
                return false;
        default:
                break;
        }

        /* No vDSO recorded for this mm: nothing to match against. */
        if (!current->mm->context.vdso)
                return false;

        base = (unsigned long)current->mm->context.vdso + image->extable_base;
        nr_entries = image->extable_len / (sizeof(*extable));
        extable = image->extable;

        for (i = 0; i < nr_entries; i++) {
                const struct vdso_exception_table_entry *entry = &extable[i];

                if (regs->ip != base + entry->insn)
                        continue;

                regs->ip = base + entry->fixup;
                regs->di = trapnr;
                regs->si = error_code;
                regs->dx = fault_addr;
                return true;
        }

        return false;
}














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Portions of this file
 * Copyright(c) 2016-2017 Intel Deutschland GmbH
 * Copyright (C) 2018, 2021-2024 Intel Corporation
 */
#ifndef __CFG80211_RDEV_OPS
#define __CFG80211_RDEV_OPS

#include <linux/rtnetlink.h>
#include <net/cfg80211.h>
#include "core.h"
#include "trace.h"

/* Trace and forward to the driver's ->suspend() op; returns its result. */
static inline int rdev_suspend(struct cfg80211_registered_device *rdev,
                               struct cfg80211_wowlan *wowlan)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_suspend(wiphy, wowlan);
        err = rdev->ops->suspend(wiphy, wowlan);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->resume() op; returns its result. */
static inline int rdev_resume(struct cfg80211_registered_device *rdev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_resume(wiphy);
        err = rdev->ops->resume(wiphy);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->set_wakeup() op. */
static inline void rdev_set_wakeup(struct cfg80211_registered_device *rdev,
                                   bool enabled)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_set_wakeup(wiphy, enabled);
        rdev->ops->set_wakeup(wiphy, enabled);
        trace_rdev_return_void(wiphy);
}

/*
 * Trace and forward to the driver's ->add_virtual_intf() op; returns the
 * wireless_dev pointer the driver hands back, unmodified.
 */
static inline struct wireless_dev
*rdev_add_virtual_intf(struct cfg80211_registered_device *rdev, char *name,
                       unsigned char name_assign_type,
                       enum nl80211_iftype type,
                       struct vif_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        struct wireless_dev *wdev;

        trace_rdev_add_virtual_intf(wiphy, name, type);
        wdev = rdev->ops->add_virtual_intf(wiphy, name, name_assign_type,
                                           type, params);
        trace_rdev_return_wdev(wiphy, wdev);

        return wdev;
}

/* Trace and forward to the driver's ->del_virtual_intf() op. */
static inline int
rdev_del_virtual_intf(struct cfg80211_registered_device *rdev,
                      struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_virtual_intf(wiphy, wdev);
        err = rdev->ops->del_virtual_intf(wiphy, wdev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->change_virtual_intf() op. */
static inline int
rdev_change_virtual_intf(struct cfg80211_registered_device *rdev,
                         struct net_device *dev, enum nl80211_iftype type,
                         struct vif_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_change_virtual_intf(wiphy, dev, type);
        err = rdev->ops->change_virtual_intf(wiphy, dev, type, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Trace and forward to the driver's ->add_key() op.  The tracepoint logs
 * params->mode, so @params is dereferenced before the driver is called.
 */
static inline int rdev_add_key(struct cfg80211_registered_device *rdev,
                               struct net_device *netdev, int link_id,
                               u8 key_index, bool pairwise, const u8 *mac_addr,
                               struct key_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_add_key(wiphy, netdev, link_id, key_index, pairwise,
                           mac_addr, params->mode);
        err = rdev->ops->add_key(wiphy, netdev, link_id, key_index,
                                 pairwise, mac_addr, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Trace and forward to the driver's ->get_key() op, handing it @cookie and
 * @callback unchanged.
 */
static inline int
rdev_get_key(struct cfg80211_registered_device *rdev, struct net_device *netdev,
             int link_id, u8 key_index, bool pairwise, const u8 *mac_addr,
             void *cookie,
             void (*callback)(void *cookie, struct key_params*))
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_key(wiphy, netdev, link_id, key_index, pairwise,
                           mac_addr);
        err = rdev->ops->get_key(wiphy, netdev, link_id, key_index,
                                 pairwise, mac_addr, cookie, callback);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->del_key() op. */
static inline int rdev_del_key(struct cfg80211_registered_device *rdev,
                               struct net_device *netdev, int link_id,
                               u8 key_index, bool pairwise, const u8 *mac_addr)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_key(wiphy, netdev, link_id, key_index, pairwise,
                           mac_addr);
        err = rdev->ops->del_key(wiphy, netdev, link_id, key_index,
                                 pairwise, mac_addr);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->set_default_key() op. */
static inline int
rdev_set_default_key(struct cfg80211_registered_device *rdev,
                     struct net_device *netdev, int link_id, u8 key_index,
                     bool unicast, bool multicast)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_default_key(wiphy, netdev, link_id, key_index,
                                   unicast, multicast);
        err = rdev->ops->set_default_key(wiphy, netdev, link_id,
                                         key_index, unicast, multicast);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->set_default_mgmt_key() op. */
static inline int
rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev,
                          struct net_device *netdev, int link_id, u8 key_index)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_default_mgmt_key(wiphy, netdev, link_id, key_index);
        err = rdev->ops->set_default_mgmt_key(wiphy, netdev, link_id,
                                              key_index);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->set_default_beacon_key() op. */
static inline int
rdev_set_default_beacon_key(struct cfg80211_registered_device *rdev,
                            struct net_device *netdev, int link_id,
                            u8 key_index)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_default_beacon_key(wiphy, netdev, link_id, key_index);
        err = rdev->ops->set_default_beacon_key(wiphy, netdev, link_id,
                                                key_index);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->start_ap() op. */
static inline int rdev_start_ap(struct cfg80211_registered_device *rdev,
                                struct net_device *dev,
                                struct cfg80211_ap_settings *settings)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_start_ap(wiphy, dev, settings);
        err = rdev->ops->start_ap(wiphy, dev, settings);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->change_beacon() op. */
static inline int rdev_change_beacon(struct cfg80211_registered_device *rdev,
                                     struct net_device *dev,
                                     struct cfg80211_ap_update *info)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_change_beacon(wiphy, dev, info);
        err = rdev->ops->change_beacon(wiphy, dev, info);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->stop_ap() op for link @link_id. */
static inline int rdev_stop_ap(struct cfg80211_registered_device *rdev,
                               struct net_device *dev, unsigned int link_id)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_stop_ap(wiphy, dev, link_id);
        err = rdev->ops->stop_ap(wiphy, dev, link_id);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->add_station() op. */
static inline int rdev_add_station(struct cfg80211_registered_device *rdev,
                                   struct net_device *dev, u8 *mac,
                                   struct station_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_add_station(wiphy, dev, mac, params);
        err = rdev->ops->add_station(wiphy, dev, mac, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->del_station() op. */
static inline int rdev_del_station(struct cfg80211_registered_device *rdev,
                                   struct net_device *dev,
                                   struct station_del_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_station(wiphy, dev, params);
        err = rdev->ops->del_station(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->change_station() op. */
static inline int rdev_change_station(struct cfg80211_registered_device *rdev,
                                      struct net_device *dev, u8 *mac,
                                      struct station_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_change_station(wiphy, dev, mac, params);
        err = rdev->ops->change_station(wiphy, dev, mac, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Trace and forward to the driver's ->get_station() op.  The return trace
 * records both the result code and @sinfo.
 */
static inline int rdev_get_station(struct cfg80211_registered_device *rdev,
                                   struct net_device *dev, const u8 *mac,
                                   struct station_info *sinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_station(wiphy, dev, mac);
        err = rdev->ops->get_station(wiphy, dev, mac, sinfo);
        trace_rdev_return_int_station_info(wiphy, err, sinfo);

        return err;
}

/*
 * Trace and forward to the driver's ->dump_station() op for index @idx.
 * The return trace records both the result code and @sinfo.
 */
static inline int rdev_dump_station(struct cfg80211_registered_device *rdev,
                                    struct net_device *dev, int idx, u8 *mac,
                                    struct station_info *sinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_dump_station(wiphy, dev, idx, mac);
        err = rdev->ops->dump_station(wiphy, dev, idx, mac, sinfo);
        trace_rdev_return_int_station_info(wiphy, err, sinfo);

        return err;
}

/* Trace and forward to the driver's ->add_mpath() op. */
static inline int rdev_add_mpath(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev, u8 *dst, u8 *next_hop)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_add_mpath(wiphy, dev, dst, next_hop);
        err = rdev->ops->add_mpath(wiphy, dev, dst, next_hop);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->del_mpath() op. */
static inline int rdev_del_mpath(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev, u8 *dst)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_mpath(wiphy, dev, dst);
        err = rdev->ops->del_mpath(wiphy, dev, dst);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->change_mpath() op. */
static inline int rdev_change_mpath(struct cfg80211_registered_device *rdev,
                                    struct net_device *dev, u8 *dst,
                                    u8 *next_hop)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_change_mpath(wiphy, dev, dst, next_hop);
        err = rdev->ops->change_mpath(wiphy, dev, dst, next_hop);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Trace and forward to the driver's ->get_mpath() op.  The return trace
 * records both the result code and @pinfo.
 */
static inline int rdev_get_mpath(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev, u8 *dst, u8 *next_hop,
                                 struct mpath_info *pinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_mpath(wiphy, dev, dst, next_hop);
        err = rdev->ops->get_mpath(wiphy, dev, dst, next_hop, pinfo);
        trace_rdev_return_int_mpath_info(wiphy, err, pinfo);

        return err;
}

/*
 * Trace and forward to the driver's ->get_mpp() op.  The return trace
 * records both the result code and @pinfo.
 */
static inline int rdev_get_mpp(struct cfg80211_registered_device *rdev,
                               struct net_device *dev, u8 *dst, u8 *mpp,
                               struct mpath_info *pinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_mpp(wiphy, dev, dst, mpp);
        err = rdev->ops->get_mpp(wiphy, dev, dst, mpp, pinfo);
        trace_rdev_return_int_mpath_info(wiphy, err, pinfo);

        return err;
}

/*
 * Trace and forward to the driver's ->dump_mpath() op for index @idx.
 * The return trace records both the result code and @pinfo.
 */
static inline int rdev_dump_mpath(struct cfg80211_registered_device *rdev,
                                  struct net_device *dev, int idx, u8 *dst,
                                  u8 *next_hop, struct mpath_info *pinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_dump_mpath(wiphy, dev, idx, dst, next_hop);
        err = rdev->ops->dump_mpath(wiphy, dev, idx, dst, next_hop, pinfo);
        trace_rdev_return_int_mpath_info(wiphy, err, pinfo);

        return err;
}

/*
 * Trace and forward to the driver's ->dump_mpp() op for index @idx.
 * The return trace records both the result code and @pinfo.
 */
static inline int rdev_dump_mpp(struct cfg80211_registered_device *rdev,
                                struct net_device *dev, int idx, u8 *dst,
                                u8 *mpp, struct mpath_info *pinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_dump_mpp(wiphy, dev, idx, dst, mpp);
        err = rdev->ops->dump_mpp(wiphy, dev, idx, dst, mpp, pinfo);
        trace_rdev_return_int_mpath_info(wiphy, err, pinfo);

        return err;
}

/*
 * Trace and forward to the driver's ->get_mesh_config() op.  The return
 * trace records both the result code and @conf.
 */
static inline int
rdev_get_mesh_config(struct cfg80211_registered_device *rdev,
                     struct net_device *dev, struct mesh_config *conf)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_mesh_config(wiphy, dev);
        err = rdev->ops->get_mesh_config(wiphy, dev, conf);
        trace_rdev_return_int_mesh_config(wiphy, err, conf);

        return err;
}

/* Trace and forward to the driver's ->update_mesh_config() op. */
static inline int
rdev_update_mesh_config(struct cfg80211_registered_device *rdev,
                        struct net_device *dev, u32 mask,
                        const struct mesh_config *nconf)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_update_mesh_config(wiphy, dev, mask, nconf);
        err = rdev->ops->update_mesh_config(wiphy, dev, mask, nconf);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->join_mesh() op. */
static inline int rdev_join_mesh(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev,
                                 const struct mesh_config *conf,
                                 const struct mesh_setup *setup)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_join_mesh(wiphy, dev, conf, setup);
        err = rdev->ops->join_mesh(wiphy, dev, conf, setup);
        trace_rdev_return_int(wiphy, err);

        return err;
}


/* Trace and forward to the driver's ->leave_mesh() op. */
static inline int rdev_leave_mesh(struct cfg80211_registered_device *rdev,
                                  struct net_device *dev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_leave_mesh(wiphy, dev);
        err = rdev->ops->leave_mesh(wiphy, dev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->join_ocb() op. */
static inline int rdev_join_ocb(struct cfg80211_registered_device *rdev,
                                struct net_device *dev,
                                struct ocb_setup *setup)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_join_ocb(wiphy, dev, setup);
        err = rdev->ops->join_ocb(wiphy, dev, setup);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->leave_ocb() op. */
static inline int rdev_leave_ocb(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_leave_ocb(wiphy, dev);
        err = rdev->ops->leave_ocb(wiphy, dev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->change_bss() op. */
static inline int rdev_change_bss(struct cfg80211_registered_device *rdev,
                                  struct net_device *dev,
                                  struct bss_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_change_bss(wiphy, dev, params);
        err = rdev->ops->change_bss(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Trace and, if the driver provides one, call its ->inform_bss() op.
 * Both trace events are emitted even when the op is absent.
 */
static inline void rdev_inform_bss(struct cfg80211_registered_device *rdev,
                                   struct cfg80211_bss *bss,
                                   const struct cfg80211_bss_ies *ies,
                                   void *drv_data)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_inform_bss(wiphy, bss);
        if (rdev->ops->inform_bss)
                rdev->ops->inform_bss(wiphy, bss, ies, drv_data);
        trace_rdev_return_void(wiphy);
}

/* Trace and forward to the driver's ->set_txq_params() op. */
static inline int rdev_set_txq_params(struct cfg80211_registered_device *rdev,
                                      struct net_device *dev,
                                      struct ieee80211_txq_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_txq_params(wiphy, dev, params);
        err = rdev->ops->set_txq_params(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->libertas_set_mesh_channel() op. */
static inline int
rdev_libertas_set_mesh_channel(struct cfg80211_registered_device *rdev,
                               struct net_device *dev,
                               struct ieee80211_channel *chan)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_libertas_set_mesh_channel(wiphy, dev, chan);
        err = rdev->ops->libertas_set_mesh_channel(wiphy, dev, chan);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->set_monitor_channel() op. */
static inline int
rdev_set_monitor_channel(struct cfg80211_registered_device *rdev,
                         struct cfg80211_chan_def *chandef)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_monitor_channel(wiphy, chandef);
        err = rdev->ops->set_monitor_channel(wiphy, chandef);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Trace and forward to the driver's ->scan() op.
 *
 * A request with an ssids pointer but n_ssids == 0 is rejected with
 * -EINVAL (and a one-time warning) before anything is traced or the driver
 * is called.
 */
static inline int rdev_scan(struct cfg80211_registered_device *rdev,
                            struct cfg80211_scan_request *request)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        if (WARN_ON_ONCE(!request->n_ssids && request->ssids))
                return -EINVAL;

        trace_rdev_scan(wiphy, request);
        err = rdev->ops->scan(wiphy, request);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->abort_scan() op. */
static inline void rdev_abort_scan(struct cfg80211_registered_device *rdev,
                                   struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_abort_scan(wiphy, wdev);
        rdev->ops->abort_scan(wiphy, wdev);
        trace_rdev_return_void(wiphy);
}

/* Trace and forward to the driver's ->auth() op. */
static inline int rdev_auth(struct cfg80211_registered_device *rdev,
                            struct net_device *dev,
                            struct cfg80211_auth_request *req)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_auth(wiphy, dev, req);
        err = rdev->ops->auth(wiphy, dev, req);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/* Trace and forward to the driver's ->assoc() op. */
static inline int rdev_assoc(struct cfg80211_registered_device *rdev,
                             struct net_device *dev,
                             struct cfg80211_assoc_request *req)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_assoc(wiphy, dev, req);
        err = rdev->ops->assoc(wiphy, dev, req);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->deauth() op, bracketed by the trace_rdev_deauth()
 * and trace_rdev_return_int() hooks. The op pointer is not checked
 * here. Returns the op's result.
 */
static inline int rdev_deauth(struct cfg80211_registered_device *rdev,
                              struct net_device *dev,
                              struct cfg80211_deauth_request *req)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_deauth(wiphy, dev, req);
        err = rdev->ops->deauth(wiphy, dev, req);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->disassoc() op, bracketed by the
 * trace_rdev_disassoc() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_disassoc(struct cfg80211_registered_device *rdev,
                                struct net_device *dev,
                                struct cfg80211_disassoc_request *req)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_disassoc(wiphy, dev, req);
        err = rdev->ops->disassoc(wiphy, dev, req);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->connect() op, bracketed by the
 * trace_rdev_connect() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_connect(struct cfg80211_registered_device *rdev,
                               struct net_device *dev,
                               struct cfg80211_connect_params *sme)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_connect(wiphy, dev, sme);
        err = rdev->ops->connect(wiphy, dev, sme);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->update_connect_params() op, bracketed by the
 * trace_rdev_update_connect_params() and trace_rdev_return_int() hooks.
 * @sme and @changed are forwarded unchanged. The op pointer is not
 * checked here. Returns the op's result.
 */
static inline int
rdev_update_connect_params(struct cfg80211_registered_device *rdev,
                           struct net_device *dev,
                           struct cfg80211_connect_params *sme, u32 changed)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_update_connect_params(wiphy, dev, sme, changed);
        err = rdev->ops->update_connect_params(wiphy, dev, sme, changed);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->disconnect() op with @reason_code, bracketed by
 * the trace_rdev_disconnect() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_disconnect(struct cfg80211_registered_device *rdev,
                                  struct net_device *dev, u16 reason_code)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_disconnect(wiphy, dev, reason_code);
        err = rdev->ops->disconnect(wiphy, dev, reason_code);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->join_ibss() op, bracketed by the
 * trace_rdev_join_ibss() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_join_ibss(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev,
                                 struct cfg80211_ibss_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_join_ibss(wiphy, dev, params);
        err = rdev->ops->join_ibss(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->leave_ibss() op, bracketed by the
 * trace_rdev_leave_ibss() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_leave_ibss(struct cfg80211_registered_device *rdev,
                                  struct net_device *dev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_leave_ibss(wiphy, dev);
        err = rdev->ops->leave_ibss(wiphy, dev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->set_wiphy_params() op with entry/return tracing.
 *
 * When the driver does not provide the op, -EOPNOTSUPP is returned and
 * nothing is traced. Otherwise returns the op's result.
 */
static inline int
rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, u32 changed)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err = -EOPNOTSUPP;

        if (rdev->ops->set_wiphy_params) {
                trace_rdev_set_wiphy_params(wiphy, changed);
                err = rdev->ops->set_wiphy_params(wiphy, changed);
                trace_rdev_return_int(wiphy, err);
        }

        return err;
}

/*
 * Call the driver's ->set_tx_power() op with @type and @mbm, bracketed
 * by the trace_rdev_set_tx_power() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_set_tx_power(struct cfg80211_registered_device *rdev,
                                    struct wireless_dev *wdev,
                                    enum nl80211_tx_power_setting type, int mbm)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_tx_power(wiphy, wdev, type, mbm);
        err = rdev->ops->set_tx_power(wiphy, wdev, type, mbm);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->get_tx_power() op, which writes through @dbm,
 * bracketed by the trace_rdev_get_tx_power() and
 * trace_rdev_return_int_int() hooks. The op pointer is not checked
 * here. Returns the op's result.
 *
 * NOTE(review): *dbm is read for the return trace whatever the op
 * returned, so it is traced even on failure - confirm callers are fine
 * with that.
 */
static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev,
                                    struct wireless_dev *wdev, int *dbm)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_tx_power(wiphy, wdev);
        err = rdev->ops->get_tx_power(wiphy, wdev, dbm);
        trace_rdev_return_int_int(wiphy, err, *dbm);

        return err;
}

/*
 * Call the driver's ->set_multicast_to_unicast() op with @enabled,
 * bracketed by the trace_rdev_set_multicast_to_unicast() and
 * trace_rdev_return_int() hooks. The op pointer is not checked here.
 * Returns the op's result.
 */
static inline int
rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev,
                              struct net_device *dev,
                              const bool enabled)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_multicast_to_unicast(wiphy, dev, enabled);
        err = rdev->ops->set_multicast_to_unicast(wiphy, dev, enabled);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->get_txq_stats() op, handing it @txqstats,
 * bracketed by the trace_rdev_get_txq_stats() and
 * trace_rdev_return_int() hooks. The op pointer is not checked here.
 * Returns the op's result.
 */
static inline int
rdev_get_txq_stats(struct cfg80211_registered_device *rdev,
                   struct wireless_dev *wdev,
                   struct cfg80211_txq_stats *txqstats)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_txq_stats(wiphy, wdev);
        err = rdev->ops->get_txq_stats(wiphy, wdev, txqstats);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->rfkill_poll() op, bracketed by the
 * trace_rdev_rfkill_poll() and trace_rdev_return_void() hooks.
 * The op pointer is not checked here.
 */
static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_rfkill_poll(wiphy);
        rdev->ops->rfkill_poll(wiphy);
        trace_rdev_return_void(wiphy);
}


#ifdef CONFIG_NL80211_TESTMODE
/*
 * Call the driver's ->testmode_cmd() op with the @data/@len payload,
 * bracketed by the trace_rdev_testmode_cmd() and
 * trace_rdev_return_int() hooks. The op pointer is not checked here.
 * Returns the op's result.
 */
static inline int rdev_testmode_cmd(struct cfg80211_registered_device *rdev,
                                    struct wireless_dev *wdev,
                                    void *data, int len)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_testmode_cmd(wiphy, wdev);
        err = rdev->ops->testmode_cmd(wiphy, wdev, data, len);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->testmode_dump() op, forwarding @skb, @cb, @data
 * and @len, bracketed by the trace_rdev_testmode_dump() and
 * trace_rdev_return_int() hooks. The op pointer is not checked here.
 * Returns the op's result.
 */
static inline int rdev_testmode_dump(struct cfg80211_registered_device *rdev,
                                     struct sk_buff *skb,
                                     struct netlink_callback *cb, void *data,
                                     int len)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_testmode_dump(wiphy);
        err = rdev->ops->testmode_dump(wiphy, skb, cb, data, len);
        trace_rdev_return_int(wiphy, err);

        return err;
}
#endif

/*
 * Call the driver's ->set_bitrate_mask() op for @link_id / @peer,
 * bracketed by the trace_rdev_set_bitrate_mask() and
 * trace_rdev_return_int() hooks. The op pointer is not checked here.
 * Returns the op's result.
 */
static inline int
rdev_set_bitrate_mask(struct cfg80211_registered_device *rdev,
                      struct net_device *dev, unsigned int link_id,
                      const u8 *peer,
                      const struct cfg80211_bitrate_mask *mask)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_bitrate_mask(wiphy, dev, link_id, peer, mask);
        err = rdev->ops->set_bitrate_mask(wiphy, dev, link_id, peer, mask);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->dump_survey() op for entry @idx, with tracing.
 *
 * A non-negative result is traced together with @info via
 * trace_rdev_return_int_survey_info(); a negative result is traced on
 * its own. The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_dump_survey(struct cfg80211_registered_device *rdev,
                                   struct net_device *netdev, int idx,
                                   struct survey_info *info)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int rc;

        trace_rdev_dump_survey(wiphy, netdev, idx);
        rc = rdev->ops->dump_survey(wiphy, netdev, idx, info);

        if (rc >= 0)
                trace_rdev_return_int_survey_info(wiphy, rc, info);
        else
                trace_rdev_return_int(wiphy, rc);

        return rc;
}

/*
 * Call the driver's ->set_pmksa() op, bracketed by the
 * trace_rdev_set_pmksa() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_set_pmksa(struct cfg80211_registered_device *rdev,
                                 struct net_device *netdev,
                                 struct cfg80211_pmksa *pmksa)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_pmksa(wiphy, netdev, pmksa);
        err = rdev->ops->set_pmksa(wiphy, netdev, pmksa);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->del_pmksa() op, bracketed by the
 * trace_rdev_del_pmksa() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_del_pmksa(struct cfg80211_registered_device *rdev,
                                 struct net_device *netdev,
                                 struct cfg80211_pmksa *pmksa)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_pmksa(wiphy, netdev, pmksa);
        err = rdev->ops->del_pmksa(wiphy, netdev, pmksa);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->flush_pmksa() op, bracketed by the
 * trace_rdev_flush_pmksa() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_flush_pmksa(struct cfg80211_registered_device *rdev,
                                   struct net_device *netdev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_flush_pmksa(wiphy, netdev);
        err = rdev->ops->flush_pmksa(wiphy, netdev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->remain_on_channel() op for @chan / @duration,
 * bracketed by the trace_rdev_remain_on_channel() and
 * trace_rdev_return_int_cookie() hooks. The op pointer is not checked
 * here. Returns the op's result.
 *
 * @cookie must be a valid pointer: it is handed to the op and then
 * dereferenced unconditionally for the return trace.
 */
static inline int
rdev_remain_on_channel(struct cfg80211_registered_device *rdev,
                       struct wireless_dev *wdev,
                       struct ieee80211_channel *chan,
                       unsigned int duration, u64 *cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_remain_on_channel(wiphy, wdev, chan, duration);
        err = rdev->ops->remain_on_channel(wiphy, wdev, chan, duration,
                                           cookie);
        trace_rdev_return_int_cookie(wiphy, err, *cookie);

        return err;
}

/*
 * Call the driver's ->cancel_remain_on_channel() op for @cookie,
 * bracketed by the trace_rdev_cancel_remain_on_channel() and
 * trace_rdev_return_int() hooks. The op pointer is not checked here.
 * Returns the op's result.
 */
static inline int
rdev_cancel_remain_on_channel(struct cfg80211_registered_device *rdev,
                              struct wireless_dev *wdev, u64 cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_cancel_remain_on_channel(wiphy, wdev, cookie);
        err = rdev->ops->cancel_remain_on_channel(wiphy, wdev, cookie);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->mgmt_tx() op, bracketed by the
 * trace_rdev_mgmt_tx() and trace_rdev_return_int_cookie() hooks.
 * The op pointer is not checked here. Returns the op's result.
 *
 * @cookie must be a valid pointer: it is handed to the op and then
 * dereferenced unconditionally for the return trace.
 */
static inline int rdev_mgmt_tx(struct cfg80211_registered_device *rdev,
                               struct wireless_dev *wdev,
                               struct cfg80211_mgmt_tx_params *params,
                               u64 *cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_mgmt_tx(wiphy, wdev, params);
        err = rdev->ops->mgmt_tx(wiphy, wdev, params, cookie);
        trace_rdev_return_int_cookie(wiphy, err, *cookie);

        return err;
}

/*
 * Call the driver's ->tx_control_port() op with tracing.
 *
 * @cookie may be NULL: in that case only the integer result is traced;
 * otherwise the result is traced together with *cookie. The op pointer
 * is not checked here. Returns the op's result.
 */
static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev,
                                       struct net_device *dev,
                                       const void *buf, size_t len,
                                       const u8 *dest, __be16 proto,
                                       const bool noencrypt, int link,
                                       u64 *cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_tx_control_port(wiphy, dev, buf, len, dest, proto,
                                   noencrypt, link);
        err = rdev->ops->tx_control_port(wiphy, dev, buf, len, dest, proto,
                                         noencrypt, link, cookie);

        if (!cookie)
                trace_rdev_return_int(wiphy, err);
        else
                trace_rdev_return_int_cookie(wiphy, err, *cookie);

        return err;
}

/*
 * Call the driver's ->mgmt_tx_cancel_wait() op for @cookie, bracketed
 * by the trace_rdev_mgmt_tx_cancel_wait() and trace_rdev_return_int()
 * hooks. The op pointer is not checked here. Returns the op's result.
 */
static inline int
rdev_mgmt_tx_cancel_wait(struct cfg80211_registered_device *rdev,
                         struct wireless_dev *wdev, u64 cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_mgmt_tx_cancel_wait(wiphy, wdev, cookie);
        err = rdev->ops->mgmt_tx_cancel_wait(wiphy, wdev, cookie);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->set_power_mgmt() op with @enabled and @timeout,
 * bracketed by the trace_rdev_set_power_mgmt() and
 * trace_rdev_return_int() hooks. The op pointer is not checked here.
 * Returns the op's result.
 */
static inline int rdev_set_power_mgmt(struct cfg80211_registered_device *rdev,
                                      struct net_device *dev, bool enabled,
                                      int timeout)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_power_mgmt(wiphy, dev, enabled, timeout);
        err = rdev->ops->set_power_mgmt(wiphy, dev, enabled, timeout);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->set_cqm_rssi_config() op with @rssi_thold and
 * @rssi_hyst, bracketed by the trace_rdev_set_cqm_rssi_config() and
 * trace_rdev_return_int() hooks. The op pointer is not checked here.
 * Returns the op's result.
 */
static inline int
rdev_set_cqm_rssi_config(struct cfg80211_registered_device *rdev,
                         struct net_device *dev, s32 rssi_thold, u32 rssi_hyst)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_cqm_rssi_config(wiphy, dev, rssi_thold, rssi_hyst);
        err = rdev->ops->set_cqm_rssi_config(wiphy, dev, rssi_thold,
                                             rssi_hyst);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->set_cqm_rssi_range_config() op with the
 * @low/@high pair, bracketed by the
 * trace_rdev_set_cqm_rssi_range_config() and trace_rdev_return_int()
 * hooks. The op pointer is not checked here. Returns the op's result.
 */
static inline int
rdev_set_cqm_rssi_range_config(struct cfg80211_registered_device *rdev,
                               struct net_device *dev, s32 low, s32 high)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_cqm_rssi_range_config(wiphy, dev, low, high);
        err = rdev->ops->set_cqm_rssi_range_config(wiphy, dev, low, high);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->set_cqm_txe_config() op with @rate, @pkts and
 * @intvl, bracketed by the trace_rdev_set_cqm_txe_config() and
 * trace_rdev_return_int() hooks. The op pointer is not checked here.
 * Returns the op's result.
 */
static inline int
rdev_set_cqm_txe_config(struct cfg80211_registered_device *rdev,
                        struct net_device *dev, u32 rate, u32 pkts, u32 intvl)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_cqm_txe_config(wiphy, dev, rate, pkts, intvl);
        err = rdev->ops->set_cqm_txe_config(wiphy, dev, rate, pkts, intvl);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's optional ->update_mgmt_frame_registrations() op.
 *
 * Annotated with might_sleep(). The entry and return trace hooks run
 * whether or not the driver provides the op; the op itself is called
 * only when it is set.
 */
static inline void
rdev_update_mgmt_frame_registrations(struct cfg80211_registered_device *rdev,
                                     struct wireless_dev *wdev,
                                     struct mgmt_frame_regs *upd)
{
        struct wiphy *wiphy = &rdev->wiphy;

        might_sleep();

        trace_rdev_update_mgmt_frame_registrations(wiphy, wdev, upd);
        if (rdev->ops->update_mgmt_frame_registrations)
                rdev->ops->update_mgmt_frame_registrations(wiphy, wdev, upd);
        trace_rdev_return_void(wiphy);
}

/*
 * Call the driver's ->set_antenna() op with @tx_ant and @rx_ant,
 * bracketed by the trace_rdev_set_antenna() and
 * trace_rdev_return_int() hooks. The op pointer is not checked here.
 * Returns the op's result.
 */
static inline int rdev_set_antenna(struct cfg80211_registered_device *rdev,
                                   u32 tx_ant, u32 rx_ant)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_antenna(wiphy, tx_ant, rx_ant);
        err = rdev->ops->set_antenna(wiphy, tx_ant, rx_ant);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->get_antenna() op, which writes through @tx_ant
 * and @rx_ant, with tracing.
 *
 * On a zero result the values behind both pointers are traced along
 * with it; on a nonzero result only the result is traced, so the
 * outputs are not read. The op pointer is not checked here. Returns
 * the op's result.
 */
static inline int rdev_get_antenna(struct cfg80211_registered_device *rdev,
                                   u32 *tx_ant, u32 *rx_ant)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_antenna(wiphy);
        err = rdev->ops->get_antenna(wiphy, tx_ant, rx_ant);

        if (!err)
                trace_rdev_return_int_tx_rx(wiphy, err, *tx_ant, *rx_ant);
        else
                trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->sched_scan_start() op, bracketed by the
 * trace_rdev_sched_scan_start() and trace_rdev_return_int() hooks.
 * The entry trace records request->reqid, so @request must be valid.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int
rdev_sched_scan_start(struct cfg80211_registered_device *rdev,
                      struct net_device *dev,
                      struct cfg80211_sched_scan_request *request)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_sched_scan_start(wiphy, dev, request->reqid);
        err = rdev->ops->sched_scan_start(wiphy, dev, request);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->sched_scan_stop() op for @reqid, bracketed by
 * the trace_rdev_sched_scan_stop() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_sched_scan_stop(struct cfg80211_registered_device *rdev,
                                       struct net_device *dev, u64 reqid)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_sched_scan_stop(wiphy, dev, reqid);
        err = rdev->ops->sched_scan_stop(wiphy, dev, reqid);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->set_rekey_data() op, bracketed by the
 * trace_rdev_set_rekey_data() and trace_rdev_return_int() hooks.
 * @data goes to the op only; it is not passed to the entry trace.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_set_rekey_data(struct cfg80211_registered_device *rdev,
                                      struct net_device *dev,
                                      struct cfg80211_gtk_rekey_data *data)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_rekey_data(wiphy, dev);
        err = rdev->ops->set_rekey_data(wiphy, dev, data);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->tdls_mgmt() op, bracketed by the
 * trace_rdev_tdls_mgmt() and trace_rdev_return_int() hooks. Every
 * argument after @rdev is forwarded unchanged to both the entry trace
 * and the op. The op pointer is not checked here. Returns the op's
 * result.
 */
static inline int rdev_tdls_mgmt(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev, u8 *peer,
                                 int link_id, u8 action_code,
                                 u8 dialog_token, u16 status_code,
                                 u32 peer_capability, bool initiator,
                                 const u8 *buf, size_t len)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_tdls_mgmt(wiphy, dev, peer, link_id, action_code,
                             dialog_token, status_code, peer_capability,
                             initiator, buf, len);
        err = rdev->ops->tdls_mgmt(wiphy, dev, peer, link_id, action_code,
                                   dialog_token, status_code,
                                   peer_capability, initiator, buf, len);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->tdls_oper() op for @peer with operation @oper,
 * bracketed by the trace_rdev_tdls_oper() and trace_rdev_return_int()
 * hooks. The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_tdls_oper(struct cfg80211_registered_device *rdev,
                                 struct net_device *dev, u8 *peer,
                                 enum nl80211_tdls_operation oper)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_tdls_oper(wiphy, dev, peer, oper);
        err = rdev->ops->tdls_oper(wiphy, dev, peer, oper);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->probe_client() op for @peer, bracketed by the
 * trace_rdev_probe_client() and trace_rdev_return_int_cookie() hooks.
 * The op pointer is not checked here. Returns the op's result.
 *
 * @cookie must be a valid pointer: it is handed to the op and then
 * dereferenced unconditionally for the return trace.
 */
static inline int rdev_probe_client(struct cfg80211_registered_device *rdev,
                                    struct net_device *dev, const u8 *peer,
                                    u64 *cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_probe_client(wiphy, dev, peer);
        err = rdev->ops->probe_client(wiphy, dev, peer, cookie);
        trace_rdev_return_int_cookie(wiphy, err, *cookie);

        return err;
}

/*
 * Call the driver's ->set_noack_map() op with @noack_map, bracketed by
 * the trace_rdev_set_noack_map() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_set_noack_map(struct cfg80211_registered_device *rdev,
                                     struct net_device *dev, u16 noack_map)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_noack_map(wiphy, dev, noack_map);
        err = rdev->ops->set_noack_map(wiphy, dev, noack_map);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->get_channel() op for @link_id, handing it
 * @chandef, with tracing. The result and @chandef are both passed to
 * trace_rdev_return_chandef() whatever the op returned. The op pointer
 * is not checked here. Returns the op's result.
 */
static inline int
rdev_get_channel(struct cfg80211_registered_device *rdev,
                 struct wireless_dev *wdev,
                 unsigned int link_id,
                 struct cfg80211_chan_def *chandef)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_channel(wiphy, wdev, link_id);
        err = rdev->ops->get_channel(wiphy, wdev, link_id, chandef);
        trace_rdev_return_chandef(wiphy, err, chandef);

        return err;
}

/*
 * Call the driver's ->start_p2p_device() op, bracketed by the
 * trace_rdev_start_p2p_device() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_start_p2p_device(struct cfg80211_registered_device *rdev,
                                        struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_start_p2p_device(wiphy, wdev);
        err = rdev->ops->start_p2p_device(wiphy, wdev);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->stop_p2p_device() op, bracketed by the
 * trace_rdev_stop_p2p_device() and trace_rdev_return_void() hooks.
 * The op pointer is not checked here.
 */
static inline void rdev_stop_p2p_device(struct cfg80211_registered_device *rdev,
                                        struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_stop_p2p_device(wiphy, wdev);
        rdev->ops->stop_p2p_device(wiphy, wdev);
        trace_rdev_return_void(wiphy);
}

/*
 * Call the driver's ->start_nan() op with @conf, bracketed by the
 * trace_rdev_start_nan() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_start_nan(struct cfg80211_registered_device *rdev,
                                 struct wireless_dev *wdev,
                                 struct cfg80211_nan_conf *conf)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_start_nan(wiphy, wdev, conf);
        err = rdev->ops->start_nan(wiphy, wdev, conf);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->stop_nan() op, bracketed by the
 * trace_rdev_stop_nan() and trace_rdev_return_void() hooks.
 * The op pointer is not checked here.
 */
static inline void rdev_stop_nan(struct cfg80211_registered_device *rdev,
                                 struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_stop_nan(wiphy, wdev);
        rdev->ops->stop_nan(wiphy, wdev);
        trace_rdev_return_void(wiphy);
}

/*
 * Call the driver's ->add_nan_func() op with @nan_func, bracketed by
 * the trace_rdev_add_nan_func() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int
rdev_add_nan_func(struct cfg80211_registered_device *rdev,
                  struct wireless_dev *wdev,
                  struct cfg80211_nan_func *nan_func)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_add_nan_func(wiphy, wdev, nan_func);
        err = rdev->ops->add_nan_func(wiphy, wdev, nan_func);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->del_nan_func() op for @cookie, bracketed by the
 * trace_rdev_del_nan_func() and trace_rdev_return_void() hooks.
 * The op pointer is not checked here.
 */
static inline void rdev_del_nan_func(struct cfg80211_registered_device *rdev,
                                    struct wireless_dev *wdev, u64 cookie)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_del_nan_func(wiphy, wdev, cookie);
        rdev->ops->del_nan_func(wiphy, wdev, cookie);
        trace_rdev_return_void(wiphy);
}

/*
 * Call the driver's optional ->nan_change_conf() op with tracing.
 *
 * The entry and return trace hooks always run. When the driver does
 * not provide the op the traced and returned result is -EOPNOTSUPP;
 * otherwise it is the op's result.
 */
static inline int
rdev_nan_change_conf(struct cfg80211_registered_device *rdev,
                     struct wireless_dev *wdev,
                     struct cfg80211_nan_conf *conf, u32 changes)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err = -EOPNOTSUPP;

        trace_rdev_nan_change_conf(wiphy, wdev, conf, changes);
        if (rdev->ops->nan_change_conf)
                err = rdev->ops->nan_change_conf(wiphy, wdev, conf, changes);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->set_mac_acl() op with @params, bracketed by the
 * trace_rdev_set_mac_acl() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev,
                                   struct net_device *dev,
                                   struct cfg80211_acl_data *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_mac_acl(wiphy, dev, params);
        err = rdev->ops->set_mac_acl(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->update_ft_ies() op with @ftie, bracketed by the
 * trace_rdev_update_ft_ies() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_update_ft_ies(struct cfg80211_registered_device *rdev,
                                     struct net_device *dev,
                                     struct cfg80211_update_ft_ies_params *ftie)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_update_ft_ies(wiphy, dev, ftie);
        err = rdev->ops->update_ft_ies(wiphy, dev, ftie);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->crit_proto_start() op with @protocol and
 * @duration, bracketed by the trace_rdev_crit_proto_start() and
 * trace_rdev_return_int() hooks. The op pointer is not checked here.
 * Returns the op's result.
 */
static inline int rdev_crit_proto_start(struct cfg80211_registered_device *rdev,
                                        struct wireless_dev *wdev,
                                        enum nl80211_crit_proto_id protocol,
                                        u16 duration)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_crit_proto_start(wiphy, wdev, protocol, duration);
        err = rdev->ops->crit_proto_start(wiphy, wdev, protocol, duration);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->crit_proto_stop() op, bracketed by the
 * trace_rdev_crit_proto_stop() and trace_rdev_return_void() hooks.
 * The op pointer is not checked here.
 */
static inline void rdev_crit_proto_stop(struct cfg80211_registered_device *rdev,
                                       struct wireless_dev *wdev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_crit_proto_stop(wiphy, wdev);
        rdev->ops->crit_proto_stop(wiphy, wdev);
        trace_rdev_return_void(wiphy);
}

/*
 * Call the driver's ->channel_switch() op with @params, bracketed by
 * the trace_rdev_channel_switch() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_channel_switch(struct cfg80211_registered_device *rdev,
                                      struct net_device *dev,
                                      struct cfg80211_csa_settings *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_channel_switch(wiphy, dev, params);
        err = rdev->ops->channel_switch(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's optional ->set_qos_map() op with tracing.
 *
 * When the driver does not provide the op, -EOPNOTSUPP is returned and
 * nothing is traced. Otherwise returns the op's result.
 */
static inline int rdev_set_qos_map(struct cfg80211_registered_device *rdev,
                                   struct net_device *dev,
                                   struct cfg80211_qos_map *qos_map)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        if (!rdev->ops->set_qos_map)
                return -EOPNOTSUPP;

        trace_rdev_set_qos_map(wiphy, dev, qos_map);
        err = rdev->ops->set_qos_map(wiphy, dev, qos_map);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->set_ap_chanwidth() op for @link_id with
 * @chandef, bracketed by the trace_rdev_set_ap_chanwidth() and
 * trace_rdev_return_int() hooks. The op pointer is not checked here.
 * Returns the op's result.
 */
static inline int
rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev,
                      struct net_device *dev,
                      unsigned int link_id,
                      struct cfg80211_chan_def *chandef)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_ap_chanwidth(wiphy, dev, link_id, chandef);
        err = rdev->ops->set_ap_chanwidth(wiphy, dev, link_id, chandef);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's optional ->add_tx_ts() op with tracing.
 *
 * The entry and return trace hooks always run. When the driver does
 * not provide the op the traced and returned result is -EOPNOTSUPP;
 * otherwise it is the op's result.
 */
static inline int
rdev_add_tx_ts(struct cfg80211_registered_device *rdev,
               struct net_device *dev, u8 tsid, const u8 *peer,
               u8 user_prio, u16 admitted_time)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_add_tx_ts(wiphy, dev, tsid, peer, user_prio,
                             admitted_time);
        if (!rdev->ops->add_tx_ts)
                err = -EOPNOTSUPP;
        else
                err = rdev->ops->add_tx_ts(wiphy, dev, tsid, peer,
                                           user_prio, admitted_time);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's optional ->del_tx_ts() op with tracing.
 *
 * The entry and return trace hooks always run. When the driver does
 * not provide the op the traced and returned result is -EOPNOTSUPP;
 * otherwise it is the op's result.
 */
static inline int
rdev_del_tx_ts(struct cfg80211_registered_device *rdev,
               struct net_device *dev, u8 tsid, const u8 *peer)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_tx_ts(wiphy, dev, tsid, peer);
        if (!rdev->ops->del_tx_ts)
                err = -EOPNOTSUPP;
        else
                err = rdev->ops->del_tx_ts(wiphy, dev, tsid, peer);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->tdls_channel_switch() op for @addr with
 * @oper_class and @chandef, bracketed by the
 * trace_rdev_tdls_channel_switch() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int
rdev_tdls_channel_switch(struct cfg80211_registered_device *rdev,
                         struct net_device *dev, const u8 *addr,
                         u8 oper_class, struct cfg80211_chan_def *chandef)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_tdls_channel_switch(wiphy, dev, addr, oper_class, chandef);
        err = rdev->ops->tdls_channel_switch(wiphy, dev, addr, oper_class,
                                             chandef);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->tdls_cancel_channel_switch() op for @addr,
 * bracketed by the trace_rdev_tdls_cancel_channel_switch() and
 * trace_rdev_return_void() hooks. The op pointer is not checked here.
 */
static inline void
rdev_tdls_cancel_channel_switch(struct cfg80211_registered_device *rdev,
                                struct net_device *dev, const u8 *addr)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_tdls_cancel_channel_switch(wiphy, dev, addr);
        rdev->ops->tdls_cancel_channel_switch(wiphy, dev, addr);
        trace_rdev_return_void(wiphy);
}

/*
 * Call the driver's optional ->start_radar_detection() op with tracing.
 *
 * The entry and return trace hooks always run. When the driver does
 * not provide the op the traced and returned result is -EOPNOTSUPP;
 * otherwise it is the op's result.
 */
static inline int
rdev_start_radar_detection(struct cfg80211_registered_device *rdev,
                           struct net_device *dev,
                           struct cfg80211_chan_def *chandef,
                           u32 cac_time_ms)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_start_radar_detection(wiphy, dev, chandef, cac_time_ms);
        if (!rdev->ops->start_radar_detection)
                err = -EOPNOTSUPP;
        else
                err = rdev->ops->start_radar_detection(wiphy, dev, chandef,
                                                       cac_time_ms);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's optional ->end_cac() op. The entry and return
 * trace hooks run whether or not the driver provides the op; the op
 * itself is called only when it is set.
 */
static inline void
rdev_end_cac(struct cfg80211_registered_device *rdev,
             struct net_device *dev)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_end_cac(wiphy, dev);
        if (rdev->ops->end_cac)
                rdev->ops->end_cac(wiphy, dev);
        trace_rdev_return_void(wiphy);
}

/*
 * Call the driver's optional ->set_mcast_rate() op with tracing.
 *
 * The entry and return trace hooks always run. When the driver does
 * not provide the op the traced and returned result is -EOPNOTSUPP;
 * otherwise it is the op's result.
 */
static inline int
rdev_set_mcast_rate(struct cfg80211_registered_device *rdev,
                    struct net_device *dev,
                    int mcast_rate[NUM_NL80211_BANDS])
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_mcast_rate(wiphy, dev, mcast_rate);
        if (!rdev->ops->set_mcast_rate)
                err = -EOPNOTSUPP;
        else
                err = rdev->ops->set_mcast_rate(wiphy, dev, mcast_rate);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's optional ->set_coalesce() op with tracing.
 *
 * The entry and return trace hooks always run. When the driver does
 * not provide the op the traced and returned result is -EOPNOTSUPP;
 * otherwise it is the op's result.
 */
static inline int
rdev_set_coalesce(struct cfg80211_registered_device *rdev,
                  struct cfg80211_coalesce *coalesce)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_coalesce(wiphy, coalesce);
        if (!rdev->ops->set_coalesce)
                err = -EOPNOTSUPP;
        else
                err = rdev->ops->set_coalesce(wiphy, coalesce);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's optional ->set_pmk() op with tracing.
 *
 * The entry and return trace hooks always run. When the driver does
 * not provide the op the traced and returned result is -EOPNOTSUPP;
 * otherwise it is the op's result.
 */
static inline int rdev_set_pmk(struct cfg80211_registered_device *rdev,
                               struct net_device *dev,
                               struct cfg80211_pmk_conf *pmk_conf)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_pmk(wiphy, dev, pmk_conf);
        if (!rdev->ops->set_pmk)
                err = -EOPNOTSUPP;
        else
                err = rdev->ops->set_pmk(wiphy, dev, pmk_conf);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's optional ->del_pmk() op with tracing.
 *
 * The entry and return trace hooks always run. When the driver does
 * not provide the op the traced and returned result is -EOPNOTSUPP;
 * otherwise it is the op's result.
 */
static inline int rdev_del_pmk(struct cfg80211_registered_device *rdev,
                               struct net_device *dev, const u8 *aa)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_del_pmk(wiphy, dev, aa);
        if (!rdev->ops->del_pmk)
                err = -EOPNOTSUPP;
        else
                err = rdev->ops->del_pmk(wiphy, dev, aa);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's optional ->external_auth() op with tracing.
 *
 * The entry and return trace hooks always run. When the driver does
 * not provide the op the traced and returned result is -EOPNOTSUPP;
 * otherwise it is the op's result.
 */
static inline int
rdev_external_auth(struct cfg80211_registered_device *rdev,
                   struct net_device *dev,
                   struct cfg80211_external_auth_params *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_external_auth(wiphy, dev, params);
        if (!rdev->ops->external_auth)
                err = -EOPNOTSUPP;
        else
                err = rdev->ops->external_auth(wiphy, dev, params);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's optional ->get_ftm_responder_stats() op with
 * tracing.
 *
 * The entry and return trace hooks always run. When the driver does
 * not provide the op the traced and returned result is -EOPNOTSUPP;
 * otherwise it is the op's result.
 */
static inline int
rdev_get_ftm_responder_stats(struct cfg80211_registered_device *rdev,
                             struct net_device *dev,
                             struct cfg80211_ftm_responder_stats *ftm_stats)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_get_ftm_responder_stats(wiphy, dev, ftm_stats);
        if (!rdev->ops->get_ftm_responder_stats)
                err = -EOPNOTSUPP;
        else
                err = rdev->ops->get_ftm_responder_stats(wiphy, dev,
                                                         ftm_stats);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's optional ->start_pmsr() op with tracing.
 *
 * The entry trace records request->cookie, so @request must be valid
 * even when the op is absent. The entry and return trace hooks always
 * run. When the driver does not provide the op the traced and returned
 * result is -EOPNOTSUPP; otherwise it is the op's result.
 */
static inline int
rdev_start_pmsr(struct cfg80211_registered_device *rdev,
                struct wireless_dev *wdev,
                struct cfg80211_pmsr_request *request)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_start_pmsr(wiphy, wdev, request->cookie);
        if (!rdev->ops->start_pmsr)
                err = -EOPNOTSUPP;
        else
                err = rdev->ops->start_pmsr(wiphy, wdev, request);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's optional ->abort_pmsr() op.
 *
 * The entry trace records request->cookie, so @request must be valid
 * even when the op is absent. The entry and return trace hooks always
 * run; the op itself is called only when it is set.
 */
static inline void
rdev_abort_pmsr(struct cfg80211_registered_device *rdev,
                struct wireless_dev *wdev,
                struct cfg80211_pmsr_request *request)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_abort_pmsr(wiphy, wdev, request->cookie);
        if (rdev->ops->abort_pmsr)
                rdev->ops->abort_pmsr(wiphy, wdev, request);
        trace_rdev_return_void(wiphy);
}

/*
 * Call the driver's optional ->update_owe_info() op with tracing.
 *
 * The entry and return trace hooks always run. When the driver does
 * not provide the op the traced and returned result is -EOPNOTSUPP;
 * otherwise it is the op's result.
 */
static inline int rdev_update_owe_info(struct cfg80211_registered_device *rdev,
                                       struct net_device *dev,
                                       struct cfg80211_update_owe_info *oweinfo)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_update_owe_info(wiphy, dev, oweinfo);
        if (!rdev->ops->update_owe_info)
                err = -EOPNOTSUPP;
        else
                err = rdev->ops->update_owe_info(wiphy, dev, oweinfo);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->probe_mesh_link() op with the @buf/@len frame,
 * bracketed by the trace_rdev_probe_mesh_link() and
 * trace_rdev_return_int() hooks. The op pointer is not checked here.
 * Returns the op's result.
 *
 * Note that @dest is only passed to the entry trace; the op itself
 * receives @dev, @buf and @len.
 */
static inline int
rdev_probe_mesh_link(struct cfg80211_registered_device *rdev,
                     struct net_device *dev, const u8 *dest,
                     const void *buf, size_t len)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_probe_mesh_link(wiphy, dev, dest, buf, len);
        err = rdev->ops->probe_mesh_link(wiphy, dev, buf, len);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's ->set_tid_config() op with @tid_conf, bracketed by
 * the trace_rdev_set_tid_config() and trace_rdev_return_int() hooks.
 * The op pointer is not checked here. Returns the op's result.
 */
static inline int rdev_set_tid_config(struct cfg80211_registered_device *rdev,
                                      struct net_device *dev,
                                      struct cfg80211_tid_config *tid_conf)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int err;

        trace_rdev_set_tid_config(wiphy, dev, tid_conf);
        err = rdev->ops->set_tid_config(wiphy, dev, tid_conf);
        trace_rdev_return_int(wiphy, err);

        return err;
}

/*
 * Call the driver's reset_tid_config op for @peer and the @tids value,
 * returning the op's result and tracing both entry and return.
 *
 * NOTE(review): the op pointer is used without a NULL check, so callers
 * presumably verify it exists beforehand - confirm.
 */
static inline int rdev_reset_tid_config(struct cfg80211_registered_device *rdev,
                                        struct net_device *dev, const u8 *peer,
                                        u8 tids)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int status;

        trace_rdev_reset_tid_config(wiphy, dev, peer, tids);
        status = rdev->ops->reset_tid_config(wiphy, dev, peer, tids);
        trace_rdev_return_int(wiphy, status);

        return status;
}

/*
 * Pass @sar to the driver's set_sar_specs op and return its result,
 * tracing the call and the returned value.
 *
 * NOTE(review): the op pointer is used without a NULL check, so callers
 * presumably verify it exists beforehand - confirm.
 */
static inline int rdev_set_sar_specs(struct cfg80211_registered_device *rdev,
                                     struct cfg80211_sar_specs *sar)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int status;

        trace_rdev_set_sar_specs(wiphy, sar);
        status = rdev->ops->set_sar_specs(wiphy, sar);
        trace_rdev_return_int(wiphy, status);

        return status;
}

/*
 * Forward @params to the driver's color_change op for @dev and return
 * the op's result, tracing the call and the returned value.
 *
 * NOTE(review): the op pointer is used without a NULL check, so callers
 * presumably verify it exists beforehand - confirm.
 */
static inline int rdev_color_change(struct cfg80211_registered_device *rdev,
                                    struct net_device *dev,
                                    struct cfg80211_color_change_settings *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int status;

        trace_rdev_color_change(wiphy, dev, params);
        status = rdev->ops->color_change(wiphy, dev, params);
        trace_rdev_return_int(wiphy, status);

        return status;
}

/*
 * Pass @fils_aad to the driver's set_fils_aad op for @dev.
 *
 * Returns the op's result, or -EOPNOTSUPP when the driver does not
 * provide the op. Both tracepoints fire in either case.
 */
static inline int
rdev_set_fils_aad(struct cfg80211_registered_device *rdev,
                  struct net_device *dev, struct cfg80211_fils_aad *fils_aad)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int status;

        trace_rdev_set_fils_aad(wiphy, dev, fils_aad);

        if (!rdev->ops->set_fils_aad)
                status = -EOPNOTSUPP;
        else
                status = rdev->ops->set_fils_aad(wiphy, dev, fils_aad);

        trace_rdev_return_int(wiphy, status);

        return status;
}

/*
 * Call the driver's set_radar_background op with @chandef.
 *
 * When the op is missing, -EOPNOTSUPP is returned and no tracepoint is
 * emitted; otherwise the call and its return value are traced and the
 * op's result is returned.
 */
static inline int
rdev_set_radar_background(struct cfg80211_registered_device *rdev,
                          struct cfg80211_chan_def *chandef)
{
        int err = -EOPNOTSUPP;

        if (rdev->ops->set_radar_background) {
                trace_rdev_set_radar_background(&rdev->wiphy, chandef);
                err = rdev->ops->set_radar_background(&rdev->wiphy, chandef);
                trace_rdev_return_int(&rdev->wiphy, err);
        }

        return err;
}

/*
 * Notify the driver, through its add_intf_link op, about @link_id on
 * @wdev.
 *
 * The op is optional: a driver without it is treated as success (0).
 * Otherwise the op's result is returned. Both tracepoints always fire.
 */
static inline int
rdev_add_intf_link(struct cfg80211_registered_device *rdev,
                   struct wireless_dev *wdev,
                   unsigned int link_id)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int status;

        trace_rdev_add_intf_link(wiphy, wdev, link_id);

        if (!rdev->ops->add_intf_link)
                status = 0;
        else
                status = rdev->ops->add_intf_link(wiphy, wdev, link_id);

        trace_rdev_return_int(wiphy, status);

        return status;
}

/*
 * Notify the driver, through its del_intf_link op, about @link_id on
 * @wdev.
 *
 * The op is optional: when it is NULL nothing is called, but the entry
 * and return tracepoints are still emitted.
 */
static inline void
rdev_del_intf_link(struct cfg80211_registered_device *rdev,
                   struct wireless_dev *wdev,
                   unsigned int link_id)
{
        struct wiphy *wiphy = &rdev->wiphy;

        trace_rdev_del_intf_link(wiphy, wdev, link_id);

        if (rdev->ops->del_intf_link) {
                rdev->ops->del_intf_link(wiphy, wdev, link_id);
        }

        trace_rdev_return_void(wiphy);
}

/*
 * Call the driver's add_link_station op with @params for @dev.
 *
 * Returns -EOPNOTSUPP, without tracing, when the driver lacks the op;
 * otherwise traces the call and returns the op's result.
 */
static inline int
rdev_add_link_station(struct cfg80211_registered_device *rdev,
                      struct net_device *dev,
                      struct link_station_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int status;

        if (!rdev->ops->add_link_station)
                return -EOPNOTSUPP;

        trace_rdev_add_link_station(wiphy, dev, params);
        status = rdev->ops->add_link_station(wiphy, dev, params);
        trace_rdev_return_int(wiphy, status);

        return status;
}

/*
 * Call the driver's mod_link_station op with @params for @dev.
 *
 * Returns -EOPNOTSUPP, without tracing, when the driver lacks the op;
 * otherwise traces the call and returns the op's result.
 */
static inline int
rdev_mod_link_station(struct cfg80211_registered_device *rdev,
                      struct net_device *dev,
                      struct link_station_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int status;

        if (!rdev->ops->mod_link_station)
                return -EOPNOTSUPP;

        trace_rdev_mod_link_station(wiphy, dev, params);
        status = rdev->ops->mod_link_station(wiphy, dev, params);
        trace_rdev_return_int(wiphy, status);

        return status;
}

/*
 * Call the driver's del_link_station op with @params for @dev.
 *
 * Returns -EOPNOTSUPP, without tracing, when the driver lacks the op;
 * otherwise traces the call and returns the op's result.
 */
static inline int
rdev_del_link_station(struct cfg80211_registered_device *rdev,
                      struct net_device *dev,
                      struct link_station_del_parameters *params)
{
        struct wiphy *wiphy = &rdev->wiphy;
        int status;

        if (!rdev->ops->del_link_station)
                return -EOPNOTSUPP;

        trace_rdev_del_link_station(wiphy, dev, params);
        status = rdev->ops->del_link_station(wiphy, dev, params);
        trace_rdev_return_int(wiphy, status);

        return status;
}

/*
 * Call the driver's set_hw_timestamp op with @hwts for @dev.
 *
 * When the op is missing, -EOPNOTSUPP is returned and no tracepoint is
 * emitted; otherwise the call and its return value are traced and the
 * op's result is returned.
 */
static inline int
rdev_set_hw_timestamp(struct cfg80211_registered_device *rdev,
                      struct net_device *dev,
                      struct cfg80211_set_hw_timestamp *hwts)
{
        int err = -EOPNOTSUPP;

        if (rdev->ops->set_hw_timestamp) {
                trace_rdev_set_hw_timestamp(&rdev->wiphy, dev, hwts);
                err = rdev->ops->set_hw_timestamp(&rdev->wiphy, dev, hwts);
                trace_rdev_return_int(&rdev->wiphy, err);
        }

        return err;
}

/*
 * Call the driver's set_ttlm op with @params for @dev.
 *
 * When the op is missing, -EOPNOTSUPP is returned and no tracepoint is
 * emitted; otherwise the call and its return value are traced and the
 * op's result is returned.
 */
static inline int
rdev_set_ttlm(struct cfg80211_registered_device *rdev,
              struct net_device *dev,
              struct cfg80211_ttlm_params *params)
{
        int err = -EOPNOTSUPP;

        if (rdev->ops->set_ttlm) {
                trace_rdev_set_ttlm(&rdev->wiphy, dev, params);
                err = rdev->ops->set_ttlm(&rdev->wiphy, dev, params);
                trace_rdev_return_int(&rdev->wiphy, err);
        }

        return err;
}
#endif /* __CFG80211_RDEV_OPS */






































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_ATOMIC64_64_H
#define _ASM_X86_ATOMIC64_64_H

#include <linux/types.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>

/* The 64-bit atomic type */

/* Static initializer: expands to a brace initializer whose only member is @i. */
#define ATOMIC64_INIT(i)        { (i) }

/*
 * Read the current value of @v.
 *
 * The counter is loaded through __READ_ONCE() and the loaded value is
 * returned unchanged.
 */
static __always_inline s64 arch_atomic64_read(const atomic64_t *v)
{
        const s64 snapshot = __READ_ONCE(v->counter);

        return snapshot;
}

/*
 * Set the value of @v to @i.
 *
 * The store to the counter goes through __WRITE_ONCE().
 */
static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
{
        __WRITE_ONCE(v->counter, i);
}

/*
 * Add @i to @v; no value is returned.
 *
 * Emits a single addq, preceded by LOCK_PREFIX, on the counter's memory
 * location. The counter appears both as an output ("=m") and as an input
 * ("m") operand, and the "memory" clobber is declared.
 */
static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "addq %1,%0"
                     : "=m" (v->counter)
                     : "er" (i), "m" (v->counter) : "memory");
}

/*
 * Subtract @i from @v; no value is returned.
 *
 * Emits a single subq, preceded by LOCK_PREFIX, on the counter's memory
 * location. The counter appears both as an output ("=m") and as an input
 * ("m") operand, and the "memory" clobber is declared.
 */
static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "subq %1,%0"
                     : "=m" (v->counter)
                     : "er" (i), "m" (v->counter) : "memory");
}

/*
 * Subtract @i from @v and report a condition on the result.
 *
 * Built from GEN_BINARY_RMWcc() around a LOCK_PREFIX subq, with condition
 * code "e".
 * NOTE(review): presumably returns true when the result is zero - confirm
 * against the GEN_BINARY_RMWcc definition.
 */
static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i);
}
/* Self-referential define: marks this operation as provided by this header. */
#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test

/*
 * Increment @v by one; no value is returned.
 *
 * Emits a single incq, preceded by LOCK_PREFIX, on the counter's memory
 * location, with the "memory" clobber declared.
 */
static __always_inline void arch_atomic64_inc(atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "incq %0"
                     : "=m" (v->counter)
                     : "m" (v->counter) : "memory");
}
/* Self-referential define: marks this operation as provided by this header. */
#define arch_atomic64_inc arch_atomic64_inc

/*
 * Decrement @v by one; no value is returned.
 *
 * Emits a single decq, preceded by LOCK_PREFIX, on the counter's memory
 * location, with the "memory" clobber declared.
 */
static __always_inline void arch_atomic64_dec(atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "decq %0"
                     : "=m" (v->counter)
                     : "m" (v->counter) : "memory");
}
/* Self-referential define: marks this operation as provided by this header. */
#define arch_atomic64_dec arch_atomic64_dec

/*
 * Decrement @v by one and report a condition on the result.
 *
 * Built from GEN_UNARY_RMWcc() around a LOCK_PREFIX decq, with condition
 * code "e".
 * NOTE(review): presumably returns true when the result is zero - confirm
 * against the GEN_UNARY_RMWcc definition.
 */
static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e);
}
/* Self-referential define: marks this operation as provided by this header. */
#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test

/*
 * Increment @v by one and report a condition on the result.
 *
 * Built from GEN_UNARY_RMWcc() around a LOCK_PREFIX incq, with condition
 * code "e".
 * NOTE(review): presumably returns true when the result is zero - confirm
 * against the GEN_UNARY_RMWcc definition.
 */
static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e);
}
/* Self-referential define: marks this operation as provided by this header. */
#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test

/*
 * Add @i to @v and report a condition on the result.
 *
 * Built from GEN_BINARY_RMWcc() around a LOCK_PREFIX addq, with condition
 * code "s".
 * NOTE(review): presumably returns true when the result is negative -
 * confirm against the GEN_BINARY_RMWcc definition.
 */
static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i);
}
/* Self-referential define: marks this operation as provided by this header. */
#define arch_atomic64_add_negative arch_atomic64_add_negative

/*
 * Add @i to @v and return the sum of @i and the value xadd() yields.
 *
 * NOTE(review): xadd() presumably yields the counter's value from before
 * the addition, which would make the result the new value - confirm.
 */
static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
{
        s64 fetched = xadd(&v->counter, i);

        return i + fetched;
}
/* Self-referential define: marks this operation as provided by this header. */
#define arch_atomic64_add_return arch_atomic64_add_return

/* Subtract-and-return, expressed as arch_atomic64_add_return() of the negated @i. */
#define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v)

/*
 * Add @i to @v and return whatever xadd() yields for the counter.
 *
 * NOTE(review): xadd() presumably yields the counter's value from before
 * the addition - confirm.
 */
static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
        s64 fetched = xadd(&v->counter, i);

        return fetched;
}
/* Self-referential define: marks this operation as provided by this header. */
#define arch_atomic64_fetch_add arch_atomic64_fetch_add

/* Fetch-and-subtract, expressed as arch_atomic64_fetch_add() of the negated @i. */
#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v)

/*
 * Compare-and-exchange on @v: hands the counter, @old and @new to
 * arch_cmpxchg() and returns its result.
 */
static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
        s64 observed = arch_cmpxchg(&v->counter, old, new);

        return observed;
}
/* Self-referential define: marks this operation as provided by this header. */
#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg

/*
 * Try-compare-and-exchange on @v: hands the counter, the @old pointer and
 * @new to arch_try_cmpxchg() and returns its boolean result.
 */
static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped = arch_try_cmpxchg(&v->counter, old, new);

        return swapped;
}
/* Self-referential define: marks this operation as provided by this header. */
#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg

/*
 * Exchange on @v: hands the counter and @new to arch_xchg() and returns
 * its result.
 */
static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
{
        s64 previous = arch_xchg(&v->counter, new);

        return previous;
}
/* Self-referential define: marks this operation as provided by this header. */
#define arch_atomic64_xchg arch_atomic64_xchg

/*
 * Bitwise-AND @i into @v; no value is returned.
 *
 * Emits a single andq, preceded by LOCK_PREFIX, on the counter, which is
 * a read-write ("+m") operand; the "memory" clobber is declared.
 */
static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "andq %1,%0"
                        : "+m" (v->counter)
                        : "er" (i)
                        : "memory");
}

/*
 * Bitwise-AND @i into @v and return the value the successful exchange
 * was based on.
 *
 * Starts from a plain read of the counter and retries
 * arch_atomic64_try_cmpxchg() with (value & @i) until it succeeds.
 * NOTE(review): the loop never re-reads @v itself, so try_cmpxchg
 * presumably refreshes the local value on failure - confirm.
 */
static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
{
        s64 seen = arch_atomic64_read(v);

        for (;;) {
                if (arch_atomic64_try_cmpxchg(v, &seen, seen & i))
                        break;
        }

        return seen;
}
/* Self-referential define: marks this operation as provided by this header. */
#define arch_atomic64_fetch_and arch_atomic64_fetch_and

/*
 * Bitwise-OR @i into @v; no value is returned.
 *
 * Emits a single orq, preceded by LOCK_PREFIX, on the counter, which is
 * a read-write ("+m") operand; the "memory" clobber is declared.
 */
static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "orq %1,%0"
                        : "+m" (v->counter)
                        : "er" (i)
                        : "memory");
}

/*
 * Bitwise-OR @i into @v and return the value the successful exchange
 * was based on.
 *
 * Starts from a plain read of the counter and retries
 * arch_atomic64_try_cmpxchg() with (value | @i) until it succeeds.
 * NOTE(review): the loop never re-reads @v itself, so try_cmpxchg
 * presumably refreshes the local value on failure - confirm.
 */
static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
{
        s64 seen = arch_atomic64_read(v);

        for (;;) {
                if (arch_atomic64_try_cmpxchg(v, &seen, seen | i))
                        break;
        }

        return seen;
}
/* Self-referential define: marks this operation as provided by this header. */
#define arch_atomic64_fetch_or arch_atomic64_fetch_or

/*
 * Bitwise-XOR @i into @v; no value is returned.
 *
 * Emits a single xorq, preceded by LOCK_PREFIX, on the counter, which is
 * a read-write ("+m") operand; the "memory" clobber is declared.
 */
static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "xorq %1,%0"
                        : "+m" (v->counter)
                        : "er" (i)
                        : "memory");
}

/*
 * Bitwise-XOR @i into @v and return the value the successful exchange
 * was based on.
 *
 * Starts from a plain read of the counter and retries
 * arch_atomic64_try_cmpxchg() with (value ^ @i) until it succeeds.
 * NOTE(review): the loop never re-reads @v itself, so try_cmpxchg
 * presumably refreshes the local value on failure - confirm.
 */
static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
        s64 seen = arch_atomic64_read(v);

        for (;;) {
                if (arch_atomic64_try_cmpxchg(v, &seen, seen ^ i))
                        break;
        }

        return seen;
}
/* Self-referential define: marks this operation as provided by this header. */
#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor

#endif /* _ASM_X86_ATOMIC64_64_H */





























































































    1 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





    1 










    1 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





    1 























    1 





    1 












    1 






    1 







    1 


















































































































































































    1 






    1 

















































































































    1 





















    1 
    1 




















    1 



    1 













































































    1 


















































    1 






















    1 

    1 











































    1 

    1 























    1 






































































    1 























    1 



















































    1 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



    1 






    1 


























    1 








































































































    1 






























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 */
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
#include <linux/bpf_verifier.h>
#include <linux/bsearch.h>
#include <linux/btf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>
#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/idr.h>
#include <linux/cred.h>
#include <linux/timekeeping.h>
#include <linux/ctype.h>
#include <linux/nospec.h>
#include <linux/audit.h>
#include <uapi/linux/btf.h>
#include <linux/pgtable.h>
#include <linux/bpf_lsm.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/bpf-netns.h>
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <linux/trace_events.h>

#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
#include <net/tcx.h>

/*
 * Predicates classifying a map by its map_type. Maps matching IS_FD_MAP()
 * are the ones for which bpf_map_value_size() reports a u32-sized value.
 */
/* True for perf-event arrays, cgroup arrays and array-of-maps. */
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
                          (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
                          (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
/* True for program arrays. */
#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
/* True for hash-of-maps. */
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
/* True when any of the three predicates above holds. */
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
                        IS_FD_HASH(map))

/* Both access-mode object flags (read-only and write-only) combined. */
#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)

/*
 * Per-CPU int.
 * NOTE(review): presumably a per-CPU "BPF program running" counter; its
 * users are not visible in this part of the file - confirm.
 */
DEFINE_PER_CPU(int, bpf_prog_active);
/*
 * One IDR plus one spinlock per object kind. map_idr is modified under
 * map_idr_lock (see bpf_map_alloc_id() and bpf_map_free_id()); the prog and
 * link pairs presumably follow the same scheme - confirm against their users.
 */
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
static DEFINE_SPINLOCK(map_idr_lock);
static DEFINE_IDR(link_idr);
static DEFINE_SPINLOCK(link_idr_lock);

/*
 * Starts out as 2 when CONFIG_BPF_UNPRIV_DEFAULT_OFF is built in, 0 otherwise.
 * NOTE(review): the meaning of the individual values is defined by the sysctl
 * handler, which is not visible here.
 */
int sysctl_unprivileged_bpf_disabled __read_mostly =
        IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;

/*
 * Table of map operations indexed by map type id.
 *
 * Built by including <linux/bpf_types.h> with BPF_MAP_TYPE() expanding to a
 * designated initializer ("[_id] = &_ops,") while BPF_PROG_TYPE() and
 * BPF_LINK_TYPE() expand to nothing, so only the map entries of that header
 * contribute. The helper macros are undefined again right after the include.
 */
static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops) \
        [_id] = &_ops,
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
};

/*
 * Forward-compatibility check for structs passed in by the caller.
 *
 * When the caller's struct (@actual_size bytes) is larger than the layout
 * this kernel knows (@expected_size bytes), every byte of the unknown tail
 * must be zero, i.e. newer user space must not depend on extensions we do
 * not implement.
 *
 * The tail is inspected before the later copy_from_user(), so there is a
 * ToCToU window between the two. That is acceptable: this is only meant as
 * future-proofing of bits.
 *
 * Return: 0 if there is no tail or the tail is all zeroes, -E2BIG if the
 * struct is larger than PAGE_SIZE or the tail contains a non-zero byte, or
 * the negative error reported by check_zeroed_user().
 */
int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
                             size_t expected_size,
                             size_t actual_size)
{
        size_t tail_len;
        int zeroed;

        /* silly large */
        if (unlikely(actual_size > PAGE_SIZE))
                return -E2BIG;

        /* Nothing beyond the part we understand. */
        if (actual_size <= expected_size)
                return 0;

        tail_len = actual_size - expected_size;

        if (!uaddr.is_kernel) {
                zeroed = check_zeroed_user(uaddr.user + expected_size,
                                           tail_len);
                /* Only the user-memory check can report an error. */
                if (zeroed < 0)
                        return zeroed;
        } else {
                /* memchr_inv() yields NULL when every byte equals 0. */
                zeroed = !memchr_inv(uaddr.kernel + expected_size, 0,
                                     tail_len);
        }

        return zeroed ? 0 : -E2BIG;
}

/*
 * Map operations table wired to the bpf_map_offload_* alloc, free and
 * memory-usage callbacks; BTF checking is delegated to map_check_no_btf.
 * NOTE(review): presumably the ops used for device-offloaded maps - the
 * place where it is installed is not visible here.
 */
const struct bpf_map_ops bpf_map_offload_ops = {
        .map_meta_equal = bpf_map_meta_equal,
        .map_alloc = bpf_map_offload_map_alloc,
        .map_free = bpf_map_offload_map_free,
        .map_check_btf = map_check_no_btf,
        .map_mem_usage = bpf_map_offload_map_mem_usage,
};

/*
 * Atomically increment map->writecnt, the counter that
 * bpf_map_write_active() tests against zero.
 */
static void bpf_map_write_active_inc(struct bpf_map *map)
{
        atomic64_inc(&map->writecnt);
}

/*
 * Atomically decrement map->writecnt; counterpart of
 * bpf_map_write_active_inc().
 */
static void bpf_map_write_active_dec(struct bpf_map *map)
{
        atomic64_dec(&map->writecnt);
}

/* Return true while map->writecnt is non-zero, false otherwise. */
bool bpf_map_write_active(const struct bpf_map *map)
{
        return atomic64_read(&map->writecnt) != 0;
}

/*
 * Size in bytes of one value as exchanged through the syscall interface.
 *
 * - per-CPU map types: value_size rounded up to a multiple of 8, times the
 *   number of possible CPUs;
 * - maps matching IS_FD_MAP(): sizeof(u32);
 * - everything else: the map's own value_size.
 */
static u32 bpf_map_value_size(const struct bpf_map *map)
{
        switch (map->map_type) {
        case BPF_MAP_TYPE_PERCPU_HASH:
        case BPF_MAP_TYPE_LRU_PERCPU_HASH:
        case BPF_MAP_TYPE_PERCPU_ARRAY:
        case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
                return round_up(map->value_size, 8) * num_possible_cpus();
        default:
                break;
        }

        if (IS_FD_MAP(map))
                return sizeof(u32);

        return map->value_size;
}

/*
 * For map-in-map types (hash-of-maps, array-of-maps), block in
 * synchronize_rcu() so that, once we return to userspace, every
 * non-sleepable BPF program that could still be running uses the new map
 * value. All other map types return immediately.
 *
 * Sleepable programs would need synchronize_rcu_tasks_trace() instead, but
 * that wait can be very long and userspace may think it hangs forever, so
 * sleepable programs are deliberately not handled for now.
 */
static void maybe_wait_bpf_programs(struct bpf_map *map)
{
        switch (map->map_type) {
        case BPF_MAP_TYPE_HASH_OF_MAPS:
        case BPF_MAP_TYPE_ARRAY_OF_MAPS:
                synchronize_rcu();
                break;
        default:
                break;
        }
}

/*
 * Syscall-side element update: pick the update routine that matches the
 * map and forward @key, @value and @flags to it.
 *
 * Offloaded maps, cpumap/arena/struct_ops maps, sockmap/sockhash maps and
 * program arrays are dispatched first and return directly. Every other map
 * type is updated between bpf_disable_instrumentation() and
 * bpf_enable_instrumentation().
 *
 * Return: the result of the selected update routine.
 */
static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
                                void *key, void *value, __u64 flags)
{
        int ret;

        if (bpf_map_is_offloaded(map))
                return bpf_map_offload_update_elem(map, key, value, flags);

        /* Need to create a kthread, thus must support schedule */
        switch (map->map_type) {
        case BPF_MAP_TYPE_CPUMAP:
        case BPF_MAP_TYPE_ARENA:
        case BPF_MAP_TYPE_STRUCT_OPS:
                return map->ops->map_update_elem(map, key, value, flags);
        case BPF_MAP_TYPE_SOCKHASH:
        case BPF_MAP_TYPE_SOCKMAP:
                return sock_map_update_elem_sys(map, key, value, flags);
        default:
                break;
        }

        if (IS_FD_PROG_ARRAY(map))
                return bpf_fd_array_map_update_elem(map, map_file, key, value,
                                                    flags);

        bpf_disable_instrumentation();

        switch (map->map_type) {
        case BPF_MAP_TYPE_PERCPU_HASH:
        case BPF_MAP_TYPE_LRU_PERCPU_HASH:
                ret = bpf_percpu_hash_update(map, key, value, flags);
                break;
        case BPF_MAP_TYPE_PERCPU_ARRAY:
                ret = bpf_percpu_array_update(map, key, value, flags);
                break;
        case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
                ret = bpf_percpu_cgroup_storage_update(map, key, value,
                                                       flags);
                break;
        case BPF_MAP_TYPE_HASH_OF_MAPS:
                ret = bpf_fd_htab_map_update_elem(map, map_file, key, value,
                                                  flags);
                break;
        case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
                /* rcu_read_lock() is not needed */
                ret = bpf_fd_reuseport_array_update_elem(map, key, value,
                                                         flags);
                break;
        case BPF_MAP_TYPE_QUEUE:
        case BPF_MAP_TYPE_STACK:
        case BPF_MAP_TYPE_BLOOM_FILTER:
                /* These types have no key: the value is pushed. */
                ret = map->ops->map_push_elem(map, value, flags);
                break;
        default:
                if (IS_FD_ARRAY(map)) {
                        ret = bpf_fd_array_map_update_elem(map, map_file, key,
                                                           value, flags);
                } else {
                        /* Generic path: call the map's op under RCU. */
                        rcu_read_lock();
                        ret = map->ops->map_update_elem(map, key, value,
                                                        flags);
                        rcu_read_unlock();
                }
                break;
        }

        bpf_enable_instrumentation();

        return ret;
}

/*
 * Generic lookup used by bpf_map_copy_value() for map types without a
 * dedicated copy routine. Runs entirely inside an RCU read-side section.
 *
 * Prefers the map's map_lookup_elem_sys_only op when it is provided and
 * falls back to map_lookup_elem otherwise.
 *
 * Return: 0 after copying the element into @value, -ENOENT when the lookup
 * returns NULL, or the error encoded in an ERR_PTR lookup result.
 */
static int bpf_map_copy_value_generic(struct bpf_map *map, void *key,
                                      void *value, __u64 flags)
{
        void *elem;
        int ret = 0;

        rcu_read_lock();

        if (map->ops->map_lookup_elem_sys_only)
                elem = map->ops->map_lookup_elem_sys_only(map, key);
        else
                elem = map->ops->map_lookup_elem(map, key);

        if (IS_ERR(elem)) {
                ret = PTR_ERR(elem);
                goto unlock;
        }
        if (!elem) {
                ret = -ENOENT;
                goto unlock;
        }

        if (flags & BPF_F_LOCK)
                /* lock 'elem' and copy everything but lock */
                copy_map_value_locked(map, value, elem, true);
        else
                copy_map_value(map, value, elem);
        /* mask lock and timer, since value wasn't zero inited */
        check_and_init_map_value(map, value);

unlock:
        rcu_read_unlock();
        return ret;
}

/*
 * Syscall-side element lookup: copy the value stored under @key into the
 * caller-provided buffer @value, using the copy routine that matches the
 * map's type.
 *
 * Offloaded maps are handed to bpf_map_offload_lookup_elem() right away.
 * All other lookups run between bpf_disable_instrumentation() and
 * bpf_enable_instrumentation().
 *
 * Return: the result of the selected lookup routine.
 */
static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
                              __u64 flags)
{
        int ret;

        if (bpf_map_is_offloaded(map))
                return bpf_map_offload_lookup_elem(map, key, value);

        bpf_disable_instrumentation();

        switch (map->map_type) {
        case BPF_MAP_TYPE_PERCPU_HASH:
        case BPF_MAP_TYPE_LRU_PERCPU_HASH:
                ret = bpf_percpu_hash_copy(map, key, value);
                break;
        case BPF_MAP_TYPE_PERCPU_ARRAY:
                ret = bpf_percpu_array_copy(map, key, value);
                break;
        case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
                ret = bpf_percpu_cgroup_storage_copy(map, key, value);
                break;
        case BPF_MAP_TYPE_STACK_TRACE:
                ret = bpf_stackmap_copy(map, key, value);
                break;
        case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
                ret = bpf_fd_reuseport_array_lookup_elem(map, key, value);
                break;
        case BPF_MAP_TYPE_QUEUE:
        case BPF_MAP_TYPE_STACK:
        case BPF_MAP_TYPE_BLOOM_FILTER:
                /* These types have no key: peek at the value. */
                ret = map->ops->map_peek_elem(map, value);
                break;
        case BPF_MAP_TYPE_STRUCT_OPS:
                /* struct_ops map requires directly updating "value" */
                ret = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
                break;
        default:
                if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map))
                        ret = bpf_fd_array_map_lookup_elem(map, key, value);
                else if (IS_FD_HASH(map))
                        ret = bpf_fd_htab_map_lookup_elem(map, key, value);
                else
                        ret = bpf_map_copy_value_generic(map, key, value,
                                                         flags);
                break;
        }

        bpf_enable_instrumentation();

        return ret;
}

/*
 * Allocate a zeroed area of @size bytes on @numa_node for map storage.
 *
 * Only meant for the map creation path. Using it elsewhere (e.g. in the
 * map update path) requires taking care of setting the active memory
 * cgroup first; bpf_map_kmalloc_node() shows how.
 *
 * Allocation strategy:
 *  - sizes >= SIZE_MAX are rejected;
 *  - mmapable areas must be page aligned (BUG otherwise) and always come
 *    from vmalloc with SHMLBA alignment and VM_USERMAP, because kmalloc()'ed
 *    memory can't be mmap()'ed;
 *  - other areas up to PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER are tried with
 *    kmalloc_node() first and fall back to vmalloc if that fails.
 *
 * We would rather fail than trigger the OOM killer under memory pressure,
 * hence __GFP_NORETRY for the (lower order) kmalloc attempt. Higher order
 * vmalloc requests with __GFP_NORETRY have been observed to fail because
 * page cache was not reclaimed, so vmalloc gets __GFP_RETRY_MAYFAIL.
 *
 * Return: the allocated area, or NULL on failure.
 */
static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
{
        gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO);
        unsigned long vm_align = 1;
        unsigned int vm_flags = 0;

        if (size >= SIZE_MAX)
                return NULL;

        if (!mmapable) {
                if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
                        void *small;

                        small = kmalloc_node(size,
                                             gfp | GFP_USER | __GFP_NORETRY,
                                             numa_node);
                        if (small)
                                return small;
                }
        } else {
                /* kmalloc()'ed memory can't be mmap()'ed */
                BUG_ON(!PAGE_ALIGNED(size));
                vm_align = SHMLBA;
                vm_flags = VM_USERMAP;
        }

        return __vmalloc_node_range(size, vm_align, VMALLOC_START, VMALLOC_END,
                                    gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL,
                                    PAGE_KERNEL, vm_flags, numa_node,
                                    __builtin_return_address(0));
}

/*
 * Allocate a non-mmapable map area of @size bytes on @numa_node.
 * Thin wrapper around __bpf_map_area_alloc() with mmapable == false;
 * returns that function's result.
 */
void *bpf_map_area_alloc(u64 size, int numa_node)
{
        return __bpf_map_area_alloc(size, numa_node, false);
}

/*
 * Allocate a map area of @size bytes on @numa_node that can be mmap()'ed.
 * Thin wrapper around __bpf_map_area_alloc() with mmapable == true, which
 * requires @size to be page aligned; returns that function's result.
 */
void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
{
        return __bpf_map_area_alloc(size, numa_node, true);
}

/*
 * Release a map area. kvfree() is used because __bpf_map_area_alloc() may
 * hand out memory from either kmalloc_node() or vmalloc.
 */
void bpf_map_area_free(void *area)
{
        kvfree(area);
}

/*
 * Strip the creation flags that describe the map fd rather than the map.
 *
 * BPF_F_RDONLY and BPF_F_WRONLY are access properties of one particular
 * file descriptor. Several fds with different access modes may refer to the
 * same map, so these bits mean nothing when inspecting the map object and
 * are cleared here.
 *
 * Return: @flags with BPF_F_RDONLY and BPF_F_WRONLY cleared.
 */
static u32 bpf_map_flags_retain_permanent(u32 flags)
{
        const u32 fd_only_flags = BPF_F_RDONLY | BPF_F_WRONLY;

        return flags & ~fd_only_flags;
}

/*
 * Fill in the generic fields of @map from the creation attributes @attr.
 *
 * Type, key/value sizes, max_entries and map_extra are copied verbatim.
 * map_flags goes through bpf_map_flags_retain_permanent(), which drops the
 * per-fd BPF_F_RDONLY/BPF_F_WRONLY bits, and the NUMA node is taken from
 * bpf_map_attr_numa_node().
 */
void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
{
        map->map_type = attr->map_type;
        map->key_size = attr->key_size;
        map->value_size = attr->value_size;
        map->max_entries = attr->max_entries;
        map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
        map->numa_node = bpf_map_attr_numa_node(attr);
        map->map_extra = attr->map_extra;
}

/*
 * Allocate an id for @map from map_idr and store it in map->id.
 *
 * Ids are handed out cyclically from the range [1, INT_MAX). The IDR is
 * preloaded with GFP_KERNEL so that the allocation under map_idr_lock can
 * use GFP_ATOMIC.
 *
 * Return: 0 on success, the negative error from idr_alloc_cyclic() on
 * failure, or -ENOSPC (with a one-time warning) should the IDR return 0.
 */
static int bpf_map_alloc_id(struct bpf_map *map)
{
        int new_id;

        idr_preload(GFP_KERNEL);
        spin_lock_bh(&map_idr_lock);
        new_id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
        if (new_id > 0)
                map->id = new_id;
        spin_unlock_bh(&map_idr_lock);
        idr_preload_end();

        if (new_id > 0)
                return 0;

        /* The requested range starts at 1, so 0 is not a valid outcome. */
        if (WARN_ON_ONCE(!new_id))
                return -ENOSPC;

        return new_id;
}

/*
 * Remove @map from map_idr and reset map->id to 0.
 *
 * A map whose id is already 0 is left alone. This happens for offloaded
 * maps: they are taken out of the IDR store when their device disappears.
 * Even if someone still holds an fd to such a map it is unusable - the
 * memory is gone and all ops will fail - and it merely waits for its refcnt
 * to drop before being freed.
 */
void bpf_map_free_id(struct bpf_map *map)
{
        unsigned long irq_flags;

        if (map->id) {
                spin_lock_irqsave(&map_idr_lock, irq_flags);
                idr_remove(&map_idr, map->id);
                map->id = 0;
                spin_unlock_irqrestore(&map_idr_lock, irq_flags);
        }
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * Remember the creating task's object cgroup in map->objcg, provided
 * memcg_bpf_enabled() says so.
 *
 * For a map created by a process in the root memory cgroup,
 * get_obj_cgroup_from_current() currently returns NULL, so every user of
 * map->objcg has to cope with it being NULL.
 */
static void bpf_map_save_memcg(struct bpf_map *map)
{
        if (!memcg_bpf_enabled())
                return;

        map->objcg = get_obj_cgroup_from_current();
}

/* Drop the reference on map->objcg, if one was saved. */
static void bpf_map_release_memcg(struct bpf_map *map)
{
        if (!map->objcg)
                return;

        obj_cgroup_put(map->objcg);
}

/*
 * Memory cgroup that allocations made on behalf of @map are charged to:
 * the one obtained from map->objcg when that is set, root_mem_cgroup
 * otherwise. Callers in this file pair the call with mem_cgroup_put().
 */
static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
{
        return map->objcg ? get_mem_cgroup_from_objcg(map->objcg)
                          : root_mem_cgroup;
}

/*
 * kmalloc_node() on behalf of @map.
 *
 * The map's memory cgroup is made the active memcg for the duration of the
 * allocation and __GFP_ACCOUNT is added to @flags. The previously active
 * memcg is restored and the map memcg reference dropped before returning.
 *
 * Return: the pointer returned by kmalloc_node().
 */
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
                           int node)
{
        struct mem_cgroup *map_memcg = bpf_map_get_memcg(map);
        struct mem_cgroup *prev_memcg = set_active_memcg(map_memcg);
        void *mem = kmalloc_node(size, flags | __GFP_ACCOUNT, node);

        set_active_memcg(prev_memcg);
        mem_cgroup_put(map_memcg);

        return mem;
}

/*
 * kzalloc() on behalf of @map.
 *
 * The map's memory cgroup is made the active memcg for the duration of the
 * allocation and __GFP_ACCOUNT is added to @flags. The previously active
 * memcg is restored and the map memcg reference dropped before returning.
 *
 * Return: the pointer returned by kzalloc().
 */
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
        struct mem_cgroup *map_memcg = bpf_map_get_memcg(map);
        struct mem_cgroup *prev_memcg = set_active_memcg(map_memcg);
        void *mem = kzalloc(size, flags | __GFP_ACCOUNT);

        set_active_memcg(prev_memcg);
        mem_cgroup_put(map_memcg);

        return mem;
}

/*
 * kvcalloc() of @n elements of @size bytes on behalf of @map.
 *
 * The map's memory cgroup is made the active memcg for the duration of the
 * allocation and __GFP_ACCOUNT is added to @flags. The previously active
 * memcg is restored and the map memcg reference dropped before returning.
 *
 * Return: the pointer returned by kvcalloc().
 */
void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
                       gfp_t flags)
{
        struct mem_cgroup *map_memcg = bpf_map_get_memcg(map);
        struct mem_cgroup *prev_memcg = set_active_memcg(map_memcg);
        void *mem = kvcalloc(n, size, flags | __GFP_ACCOUNT);

        set_active_memcg(prev_memcg);
        mem_cgroup_put(map_memcg);

        return mem;
}

/*
 * __alloc_percpu_gfp() with the given @size and @align on behalf of @map.
 *
 * The map's memory cgroup is made the active memcg for the duration of the
 * allocation and __GFP_ACCOUNT is added to @flags. The previously active
 * memcg is restored and the map memcg reference dropped before returning.
 *
 * Return: the per-CPU pointer returned by __alloc_percpu_gfp().
 */
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
                                    size_t align, gfp_t flags)
{
        struct mem_cgroup *map_memcg = bpf_map_get_memcg(map);
        struct mem_cgroup *prev_memcg = set_active_memcg(map_memcg);
        void __percpu *pcpu_mem;

        pcpu_mem = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);

        set_active_memcg(prev_memcg);
        mem_cgroup_put(map_memcg);

        return pcpu_mem;
}

#else
/* Without CONFIG_MEMCG_KMEM there is no cgroup to record: no-op. */
static void bpf_map_save_memcg(struct bpf_map *map)
{
}

/* Without CONFIG_MEMCG_KMEM nothing was saved, so nothing to release: no-op. */
static void bpf_map_release_memcg(struct bpf_map *map)
{
}
#endif

/*
 * Allocate @nr_pages single (order-0) pages on node @nid for @map and store
 * them in @pages[0..nr_pages-1].
 *
 * __GFP_ACCOUNT is added to @gfp. With CONFIG_MEMCG_KMEM the map's memory
 * cgroup is the active memcg while the pages are allocated.
 *
 * The operation is all-or-nothing: if any allocation fails, the pages
 * obtained so far are freed again. Their stale pointers remain in @pages,
 * so the array contents must not be used after a failure.
 *
 * Return: 0 on success, -ENOMEM if a page could not be allocated.
 */
int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
                        unsigned long nr_pages, struct page **pages)
{
        unsigned long done, k;
        int ret = 0;
#ifdef CONFIG_MEMCG_KMEM
        struct mem_cgroup *map_memcg, *prev_memcg;

        map_memcg = bpf_map_get_memcg(map);
        prev_memcg = set_active_memcg(map_memcg);
#endif
        for (done = 0; done < nr_pages; done++) {
                struct page *pg;

                pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
                if (!pg) {
                        /* Roll back everything allocated so far. */
                        for (k = 0; k < done; k++)
                                __free_page(pages[k]);
                        ret = -ENOMEM;
                        break;
                }
                pages[done] = pg;
        }

#ifdef CONFIG_MEMCG_KMEM
        set_active_memcg(prev_memcg);
        mem_cgroup_put(map_memcg);
#endif
        return ret;
}


/* Three-way comparison of two btf_field entries by their ->offset member.
 * Returns -1, 0 or 1; used as the bsearch() comparator in btf_record_find().
 */
static int btf_field_cmp(const void *a, const void *b)
{
        const struct btf_field *lhs = a;
        const struct btf_field *rhs = b;

        if (lhs->offset == rhs->offset)
                return 0;
        return lhs->offset < rhs->offset ? -1 : 1;
}

/* Look up the field located at @offset in @rec.
 *
 * Returns the matching field if @rec is a valid record, @rec->field_mask
 * intersects @field_mask, a field with that offset exists and its type is
 * one of those selected by @field_mask; NULL otherwise.
 *
 * NOTE(review): the bsearch() key is a pointer to the bare u32 @offset while
 * btf_field_cmp() reads it as a struct btf_field; this presumably relies on
 * ->offset being the first member of struct btf_field -- confirm.
 */
struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
                                  u32 field_mask)
{
        struct btf_field *match;

        if (IS_ERR_OR_NULL(rec))
                return NULL;
        if (!(rec->field_mask & field_mask))
                return NULL;

        match = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]),
                        btf_field_cmp);
        if (match && (match->type & field_mask))
                return match;
        return NULL;
}

void btf_record_free(struct btf_record *rec)
{
        int i;

        if (IS_ERR_OR_NULL(rec))
                return;
        for (i = 0; i < rec->cnt; i++) {
                switch (rec->fields[i].type) {
                case BPF_KPTR_UNREF:
                case BPF_KPTR_REF:
                case BPF_KPTR_PERCPU:
                        if (rec->fields[i].kptr.module)
                                module_put(rec->fields[i].kptr.module);
                        btf_put(rec->fields[i].kptr.btf);
                        break;
                case BPF_LIST_HEAD:
                case BPF_LIST_NODE:
                case BPF_RB_ROOT:
                case BPF_RB_NODE:
                case BPF_SPIN_LOCK:
                case BPF_TIMER:
                case BPF_REFCOUNT:
                case BPF_WORKQUEUE:
                        /* Nothing to release */
                        break;
                default:
                        WARN_ON_ONCE(1);
                        continue;
                }
        }
        kfree(rec);
}

/* Free the btf_record attached to @map and clear map->record. */
void bpf_map_free_record(struct bpf_map *map)
{
        struct btf_record *rec = map->record;

        btf_record_free(rec);
        map->record = NULL;
}

/* Create a deep copy of @rec.
 *
 * The record is first copied bytewise, then a new BTF reference (and, where
 * a module is set, a module reference) is taken for every kptr field so the
 * copy can be released independently with btf_record_free().
 *
 * Returns NULL if @rec is NULL or an ERR_PTR, the new record on success, or
 * an ERR_PTR: -ENOMEM if the copy cannot be allocated, -ENXIO if a module
 * reference cannot be taken, -EFAULT on an unknown field type.
 */
struct btf_record *btf_record_dup(const struct btf_record *rec)
{
        const struct btf_field *fields;
        struct btf_record *new_rec;
        int ret, size, i;

        if (IS_ERR_OR_NULL(rec))
                return NULL;
        size = offsetof(struct btf_record, fields[rec->cnt]);
        new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
        if (!new_rec)
                return ERR_PTR(-ENOMEM);
        /* Do a deep copy of the btf_record */
        fields = rec->fields;
        /* new_rec->cnt counts only the fields whose references were fully
         * acquired, so btf_record_free(new_rec) on the error path releases
         * exactly those.
         */
        new_rec->cnt = 0;
        for (i = 0; i < rec->cnt; i++) {
                switch (fields[i].type) {
                case BPF_KPTR_UNREF:
                case BPF_KPTR_REF:
                case BPF_KPTR_PERCPU:
                        btf_get(fields[i].kptr.btf);
                        if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
                                /* This field is not covered by new_rec->cnt
                                 * yet, so btf_record_free() will not drop the
                                 * BTF reference taken just above; drop it
                                 * here to avoid leaking it.
                                 */
                                btf_put(fields[i].kptr.btf);
                                ret = -ENXIO;
                                goto free;
                        }
                        break;
                case BPF_LIST_HEAD:
                case BPF_LIST_NODE:
                case BPF_RB_ROOT:
                case BPF_RB_NODE:
                case BPF_SPIN_LOCK:
                case BPF_TIMER:
                case BPF_REFCOUNT:
                case BPF_WORKQUEUE:
                        /* Nothing to acquire */
                        break;
                default:
                        ret = -EFAULT;
                        WARN_ON_ONCE(1);
                        goto free;
                }
                new_rec->cnt++;
        }
        return new_rec;
free:
        btf_record_free(new_rec);
        return ERR_PTR(ret);
}

/* Compare two btf_records.
 *
 * Two absent (NULL or ERR_PTR) records are equal; an absent and a present
 * record are not.  Two present records are equal when they have the same
 * field count and their headers plus field arrays are bytewise identical.
 */
bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b)
{
        bool a_present = !IS_ERR_OR_NULL(rec_a);
        bool b_present = !IS_ERR_OR_NULL(rec_b);
        int size;

        if (a_present != b_present)
                return false;
        if (!a_present)
                return true;
        if (rec_a->cnt != rec_b->cnt)
                return false;

        /* A plain memcmp is sufficient: btf_parse_fields allocates a
         * btf_record with kzalloc, so padding and unused members are zero
         * in both records.
         *
         * spin_lock, timer and kptr metadata is unrelated to the map BTF,
         * but list_head metadata is not: its btf member is the map BTF and
         * its value_rec member points to a btf_record inside that map BTF.
         *
         * So by default the map BTF the records were parsed from does not
         * have to match (requiring that would not be backwards compatible),
         * but once a list_head is part of the record the memcmp below
         * implicitly demands that it does.
         */
        size = offsetof(struct btf_record, fields[rec_a->cnt]);
        return memcmp(rec_a, rec_b, size) == 0;
}

void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
{
        if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
                return;
        bpf_timer_cancel_and_free(obj + rec->timer_off);
}

void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj)
{
        if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE)))
                return;
        bpf_wq_cancel_and_free(obj + rec->wq_off);
}

/* Release the contents of every special field described by @rec inside the
 * object at @obj.
 *
 * @rec: field layout of the object; NULL or ERR_PTR means nothing to do.
 * @obj: base address of the object; each field lives at @obj + field->offset.
 *
 * Per field type: timers and workqueues are cancelled and freed, unreferenced
 * kptrs are zeroed, referenced/percpu kptrs are exchanged out and released,
 * list heads and rb roots are freed under the object's spin lock offset, and
 * the remaining known types need no action.
 */
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
        const struct btf_field *fields;
        int i;

        if (IS_ERR_OR_NULL(rec))
                return;
        fields = rec->fields;
        for (i = 0; i < rec->cnt; i++) {
                struct btf_struct_meta *pointee_struct_meta;
                const struct btf_field *field = &fields[i];
                void *field_ptr = obj + field->offset;
                void *xchgd_field;

                switch (fields[i].type) {
                case BPF_SPIN_LOCK:
                        break;
                case BPF_TIMER:
                        bpf_timer_cancel_and_free(field_ptr);
                        break;
                case BPF_WORKQUEUE:
                        bpf_wq_cancel_and_free(field_ptr);
                        break;
                case BPF_KPTR_UNREF:
                        /* No reference is held: just clear the stored value. */
                        WRITE_ONCE(*(u64 *)field_ptr, 0);
                        break;
                case BPF_KPTR_REF:
                case BPF_KPTR_PERCPU:
                        /* Take the pointer out of the field with xchg() and
                         * leave 0 behind; nothing to release if it was 0.
                         */
                        xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
                        if (!xchgd_field)
                                break;

                        if (!btf_is_kernel(field->kptr.btf)) {
                                /* Pointee type comes from non-kernel BTF:
                                 * drop it via __bpf_obj_drop_impl(), passing
                                 * the pointee's own record when struct meta
                                 * is found, with migration disabled around
                                 * the call.
                                 */
                                pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
                                                                           field->kptr.btf_id);
                                migrate_disable();
                                __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
                                                                 pointee_struct_meta->record : NULL,
                                                                 fields[i].type == BPF_KPTR_PERCPU);
                                migrate_enable();
                        } else {
                                /* Kernel BTF type: use the field's destructor. */
                                field->kptr.dtor(xchgd_field);
                        }
                        break;
                case BPF_LIST_HEAD:
                        /* Needs the object's spin lock; skip (with a one-time
                         * warning) if the record has no spin lock offset.
                         */
                        if (WARN_ON_ONCE(rec->spin_lock_off < 0))
                                continue;
                        bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
                        break;
                case BPF_RB_ROOT:
                        /* Same spin lock requirement as BPF_LIST_HEAD. */
                        if (WARN_ON_ONCE(rec->spin_lock_off < 0))
                                continue;
                        bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off);
                        break;
                case BPF_LIST_NODE:
                case BPF_RB_NODE:
                case BPF_REFCOUNT:
                        break;
                default:
                        /* Unknown field type: warn once and skip it. */
                        WARN_ON_ONCE(1);
                        continue;
                }
        }
}

/* Workqueue callback (installed by bpf_map_free_in_work()) that performs the
 * final teardown of the map embedding @work.
 */
static void bpf_map_free_deferred(struct work_struct *work)
{
        struct bpf_map *map = container_of(work, struct bpf_map, work);
        /* Fetched up front because both are still used after ->map_free(). */
        struct btf_record *record = map->record;
        struct btf *map_btf = map->btf;

        security_bpf_map_free(map);
        bpf_map_release_memcg(map);

        /* Map-type specific freeing. */
        map->ops->map_free(map);

        /* The btf_record is freed only now because the map_free callback
         * usually needs access to it; doing it here is better than requiring
         * every callback to free it manually.
         *
         * Note that the btf_record stashed in map->inner_map_meta->record was
         * already freed by the map_free callback in the map-in-map case,
         * which eventually calls bpf_map_free_meta, since inner_map_meta is
         * only a template bpf_map struct used during verification.
         */
        btf_record_free(record);

        /* Likewise the btf is dropped last: the map_free callback may need
         * struct_meta info, which is freed together with btf_put().
         */
        btf_put(map_btf);
}

/* Drop one user reference on @map.  When map->usercnt reaches zero, invoke
 * the map type's optional ->map_release_uref() callback.
 */
static void bpf_map_put_uref(struct bpf_map *map)
{
        if (!atomic64_dec_and_test(&map->usercnt))
                return;

        if (map->ops->map_release_uref)
                map->ops->map_release_uref(map);
}

/* Schedule the final teardown of @map: bpf_map_free_deferred() is installed
 * as the handler of map->work and the work item is queued on
 * system_unbound_wq.
 */
static void bpf_map_free_in_work(struct bpf_map *map)
{
        INIT_WORK(&map->work, bpf_map_free_deferred);
        /* Avoid spawning kworkers, since they all might contend
         * for the same mutex like slab_mutex.
         */
        queue_work(system_unbound_wq, &map->work);
}

/* RCU callback: recover the map embedding @rcu and queue its teardown. */
static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
{
        struct bpf_map *map = container_of(rcu, struct bpf_map, rcu);

        bpf_map_free_in_work(map);
}

/* RCU-tasks-trace callback used by bpf_map_put().  If
 * rcu_trace_implies_rcu_gp() reports true, the map teardown is queued right
 * away; otherwise it is chained through call_rcu() first.
 */
static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
{
        if (!rcu_trace_implies_rcu_gp()) {
                call_rcu(rcu, bpf_map_free_rcu_gp);
                return;
        }

        bpf_map_free_rcu_gp(rcu);
}

/* Drop one reference on @map.  When the refcnt hits zero the map is handed
 * to a workqueue for freeing (the map implementation's ops->map_free() might
 * sleep), optionally after waiting for RCU grace period(s) as requested by
 * the map's free_after_mult_rcu_gp / free_after_rcu_gp flags.
 */
void bpf_map_put(struct bpf_map *map)
{
        if (!atomic64_dec_and_test(&map->refcnt))
                return;

        /* bpf_map_free_id() must be called first */
        bpf_map_free_id(map);

        /* No sleepable references are expected to remain at this point. */
        WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));

        if (READ_ONCE(map->free_after_mult_rcu_gp)) {
                call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
                return;
        }
        if (READ_ONCE(map->free_after_rcu_gp)) {
                call_rcu(&map->rcu, bpf_map_free_rcu_gp);
                return;
        }
        bpf_map_free_in_work(map);
}
EXPORT_SYMBOL_GPL(bpf_map_put);

/* Drop one user reference and one regular reference on @map, in that order:
 * the user reference goes first so that the ->map_release_uref() callback
 * (if triggered) runs before the regular reference drop, which may schedule
 * the map for freeing.
 */
void bpf_map_put_with_uref(struct bpf_map *map)
{
        bpf_map_put_uref(map);
        bpf_map_put(map);
}

static int bpf_map_release(struct inode *inode, struct file *filp)
{
        struct bpf_map *map = filp->private_data;

        if (map->ops->map_release)
                map->ops->map_release(map, filp);

        bpf_map_put_with_uref(map);
        return 0;
}

/* Return the effective syscall-side access mode for @map through @f: the
 * file's f_mode, with FMODE_CAN_WRITE removed if the map is frozen (the
 * global map permissions override the per-file ones).
 */
static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
{
        fmode_t file_mode = f.file->f_mode;

        return READ_ONCE(map->frozen) ? (file_mode & ~FMODE_CAN_WRITE)
                                      : file_mode;
}

#ifdef CONFIG_PROC_FS
/* Return the memory usage of a bpf map as reported by the map type's
 * ->map_mem_usage() callback.  The callback is called unconditionally;
 * map_create() rejects ops without it.
 */
static u64 bpf_map_memory_usage(const struct bpf_map *map)
{
        return map->ops->map_mem_usage(map);
}

/* ->show_fdinfo handler of bpf_map_fops: print the map's attributes to @m.
 * For maps that contain programs, the owner program type and jited flag are
 * appended when the owner type is non-zero.
 */
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
        struct bpf_map *map = filp->private_data;
        u32 owner_type = 0;
        u32 owner_jited = 0;

        if (map_type_contains_progs(map)) {
                /* Read both owner fields under the owner lock. */
                spin_lock(&map->owner.lock);
                owner_type = map->owner.type;
                owner_jited = map->owner.jited;
                spin_unlock(&map->owner.lock);
        }

        seq_printf(m,
                   "map_type:\t%u\n"
                   "key_size:\t%u\n"
                   "value_size:\t%u\n"
                   "max_entries:\t%u\n"
                   "map_flags:\t%#x\n"
                   "map_extra:\t%#llx\n"
                   "memlock:\t%llu\n"
                   "map_id:\t%u\n"
                   "frozen:\t%u\n",
                   map->map_type,
                   map->key_size,
                   map->value_size,
                   map->max_entries,
                   map->map_flags,
                   (unsigned long long)map->map_extra,
                   bpf_map_memory_usage(map),
                   map->id,
                   READ_ONCE(map->frozen));

        if (!owner_type)
                return;

        seq_printf(m, "owner_prog_type:\t%u\n", owner_type);
        seq_printf(m, "owner_jited:\t%u\n", owner_jited);
}
#endif

/* Placeholder ->read handler for bpf_map_fops; every call fails with
 * -EINVAL.
 */
static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
                              loff_t *ppos)
{
        /* We need this handler such that alloc_file() enables
         * f_mode with FMODE_CAN_READ.
         */
        return -EINVAL;
}

/* Placeholder ->write handler for bpf_map_fops; every call fails with
 * -EINVAL.
 */
static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
                               size_t siz, loff_t *ppos)
{
        /* We need this handler such that alloc_file() enables
         * f_mode with FMODE_CAN_WRITE.
         */
        return -EINVAL;
}

/* vm_ops->open: called for any extra memory-mapped region (not the initial
 * one).  A region that may be written (VM_MAYWRITE) counts as an active
 * writer of the map.
 */
static void bpf_map_mmap_open(struct vm_area_struct *vma)
{
        struct bpf_map *map;

        if (!(vma->vm_flags & VM_MAYWRITE))
                return;

        map = vma->vm_file->private_data;
        bpf_map_write_active_inc(map);
}

/* vm_ops->close: called for every unmapped memory region (including the
 * initial one).  Undoes the active-writer accounting for regions that may
 * be written (VM_MAYWRITE).
 */
static void bpf_map_mmap_close(struct vm_area_struct *vma)
{
        struct bpf_map *map;

        if (!(vma->vm_flags & VM_MAYWRITE))
                return;

        map = vma->vm_file->private_data;
        bpf_map_write_active_dec(map);
}

/* Default VMA callbacks installed by bpf_map_mmap(); they keep the map's
 * active-writer count in step with writable mappings.
 */
static const struct vm_operations_struct bpf_map_default_vmops = {
        .open = bpf_map_mmap_open,
        .close = bpf_map_mmap_close,
};

/* ->mmap handler of bpf_map_fops.
 *
 * Returns -ENOTSUPP if the map type has no ->map_mmap() or the map has a
 * btf_record, -EINVAL for non-shared mappings, -EPERM for a writable mapping
 * of a frozen map, -EACCES for a writable mapping of a BPF_F_RDONLY_PROG
 * map, otherwise the result of the map type's ->map_mmap().
 *
 * The frozen check, the call to ->map_mmap() and the active-writer
 * accounting all run with map->freeze_mutex held.
 */
static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
{
        struct bpf_map *map = filp->private_data;
        int err;

        if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
                return -ENOTSUPP;

        if (!(vma->vm_flags & VM_SHARED))
                return -EINVAL;

        mutex_lock(&map->freeze_mutex);

        if (vma->vm_flags & VM_WRITE) {
                if (map->frozen) {
                        err = -EPERM;
                        goto out;
                }
                /* map is meant to be read-only, so do not allow mapping as
                 * writable, because it's possible to leak a writable page
                 * reference and allows user-space to still modify it after
                 * freezing, while verifier will assume contents do not change
                 */
                if (map->map_flags & BPF_F_RDONLY_PROG) {
                        err = -EACCES;
                        goto out;
                }
        }

        /* set default open/close callbacks */
        vma->vm_ops = &bpf_map_default_vmops;
        vma->vm_private_data = map;
        vm_flags_clear(vma, VM_MAYEXEC);
        if (!(vma->vm_flags & VM_WRITE))
                /* disallow re-mapping with PROT_WRITE */
                vm_flags_clear(vma, VM_MAYWRITE);

        err = map->ops->map_mmap(map, vma);
        if (err)
                goto out;

        /* Account the initial mapping as an active writer if it may be
         * written; bpf_map_mmap_close() undoes this on unmap.
         */
        if (vma->vm_flags & VM_MAYWRITE)
                bpf_map_write_active_inc(map);
out:
        mutex_unlock(&map->freeze_mutex);
        return err;
}

/* ->poll handler of bpf_map_fops: delegate to the map type's ->map_poll()
 * when it exists, otherwise report EPOLLERR.
 */
static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
{
        struct bpf_map *map = filp->private_data;

        if (!map->ops->map_poll)
                return EPOLLERR;

        return map->ops->map_poll(map, filp, pts);
}

/* ->get_unmapped_area handler of bpf_map_fops: use the map type's
 * ->map_get_unmapped_area() when it exists; otherwise fall back to
 * mm_get_unmapped_area() for the current mm (CONFIG_MMU) or return @addr
 * unchanged (!CONFIG_MMU).
 */
static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr,
                                           unsigned long len, unsigned long pgoff,
                                           unsigned long flags)
{
        struct bpf_map *map = filp->private_data;

        if (!map->ops->map_get_unmapped_area) {
#ifdef CONFIG_MMU
                return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
#else
                return addr;
#endif
        }

        return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags);
}

/* File operations backing every bpf map fd.  __bpf_map_get() also uses the
 * address of this table to recognise map files.
 */
const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo = bpf_map_show_fdinfo,
#endif
        .release = bpf_map_release,
        .read = bpf_dummy_read,
        .write = bpf_dummy_write,
        .mmap = bpf_map_mmap,
        .poll = bpf_map_poll,
        .get_unmapped_area = bpf_get_unmapped_area,
};

/* Create a new "bpf-map" anonymous-inode fd for @map.
 *
 * @flags are the open flags for the fd; O_CLOEXEC is always added.  The
 * security_bpf_map() hook is consulted first and a negative result from it
 * is returned as-is.  Otherwise returns what anon_inode_getfd() returns.
 */
int bpf_map_new_fd(struct bpf_map *map, int flags)
{
        int err = security_bpf_map(map, OPEN_FMODE(flags));

        if (err < 0)
                return err;

        return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
                                flags | O_CLOEXEC);
}

int bpf_get_file_flag(int flags)
{
        if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
                return -EINVAL;
        if (flags & BPF_F_RDONLY)
                return O_RDONLY;
        if (flags & BPF_F_WRONLY)
                return O_WRONLY;
        return O_RDWR;
}

/* Helper macro to check that the unused fields of 'union bpf_attr' are zero.
 *
 * Expects a 'union bpf_attr *attr' in scope and a CMD##_LAST_FIELD macro
 * naming the last attr member used by command CMD.  Scans every byte of
 * *attr that follows that member and evaluates to true if any of them is
 * non-zero, i.e. a true result means the attr must be rejected.
 */
#define CHECK_ATTR(CMD) \
        memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
                   sizeof(attr->CMD##_LAST_FIELD), 0, \
                   sizeof(*attr) - \
                   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
                   sizeof(attr->CMD##_LAST_FIELD)) != NULL

/* Validate and copy an object name.
 *
 * Both @dst and @src must provide at least @size bytes.  @dst is zeroed in
 * full first, then characters are copied from @src up to its terminating
 * '\0'.  Only alphanumeric characters, '_' and '.' are accepted.
 *
 * Returns the length of the name on success, or -EINVAL if an unacceptable
 * character is met or no '\0' is found within @size bytes of @src.
 */
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
{
        unsigned int pos;

        memset(dst, 0, size);

        for (pos = 0; pos < size && src[pos]; pos++) {
                char c = src[pos];

                if (!isalnum(c) && c != '_' && c != '.')
                        return -EINVAL;
                dst[pos] = c;
        }

        /* Ran through all @size bytes without seeing a terminator. */
        if (pos == size)
                return -EINVAL;

        return pos;
}

/* BTF check that rejects any BTF key/value description: it ignores all of
 * its arguments and always returns -ENOTSUPP.  The signature matches the
 * ->map_check_btf() call made in map_check_btf().
 */
int map_check_no_btf(const struct bpf_map *map,
                     const struct btf *btf,
                     const struct btf_type *key_type,
                     const struct btf_type *value_type)
{
        return -ENOTSUPP;
}

/* Validate the BTF key/value types supplied for @map and set up map->record.
 *
 * @map:          map being created; key_size, value_size, map_type and
 *                map_flags are consulted, map->record is written.
 * @token:        BPF token (may be NULL) used for the CAP_BPF check.
 * @btf:          BTF object the type ids refer to.
 * @btf_key_id:   BTF type id of the key, 0 if unspecified.
 * @btf_value_id: BTF type id of the value.
 *
 * The key and value types must resolve and have exactly the map's key/value
 * sizes.  An unspecified key is accepted only for map types that provide
 * ->map_check_btf().  If the value type contains special fields, CAP_BPF is
 * required, the map must not be BPF_F_RDONLY_PROG/BPF_F_WRONLY_PROG, and
 * each field kind must be supported by the map type.
 *
 * Returns 0 (or the non-negative result of ->map_check_btf()) on success and
 * a negative errno on failure.  On every failure after map->record has been
 * assigned, the record is freed and map->record reset to NULL.
 */
static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
                         const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
{
        const struct btf_type *key_type, *value_type;
        u32 key_size, value_size;
        int ret = 0;

        /* Some maps allow key to be unspecified. */
        if (btf_key_id) {
                key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
                if (!key_type || key_size != map->key_size)
                        return -EINVAL;
        } else {
                key_type = btf_type_by_id(btf, 0);
                if (!map->ops->map_check_btf)
                        return -EINVAL;
        }

        value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
        if (!value_type || value_size != map->value_size)
                return -EINVAL;

        map->record = btf_parse_fields(btf, value_type,
                                       BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
                                       BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE,
                                       map->value_size);
        if (!IS_ERR_OR_NULL(map->record)) {
                int i;

                if (!bpf_token_capable(token, CAP_BPF)) {
                        ret = -EPERM;
                        goto free_map_tab;
                }
                if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
                        ret = -EACCES;
                        goto free_map_tab;
                }
                /* Walk field_mask one bit at a time.  The probe bit is built
                 * from an unsigned 1 so that testing the top bit does not
                 * shift into the sign bit of a signed int.
                 */
                for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) {
                        switch (map->record->field_mask & (1U << i)) {
                        case 0:
                                continue;
                        case BPF_SPIN_LOCK:
                                if (map->map_type != BPF_MAP_TYPE_HASH &&
                                    map->map_type != BPF_MAP_TYPE_ARRAY &&
                                    map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
                                    map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
                                    map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
                                    map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
                                    map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
                                        ret = -EOPNOTSUPP;
                                        goto free_map_tab;
                                }
                                break;
                        case BPF_TIMER:
                        case BPF_WORKQUEUE:
                                if (map->map_type != BPF_MAP_TYPE_HASH &&
                                    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
                                    map->map_type != BPF_MAP_TYPE_ARRAY) {
                                        ret = -EOPNOTSUPP;
                                        goto free_map_tab;
                                }
                                break;
                        case BPF_KPTR_UNREF:
                        case BPF_KPTR_REF:
                        case BPF_KPTR_PERCPU:
                        case BPF_REFCOUNT:
                                if (map->map_type != BPF_MAP_TYPE_HASH &&
                                    map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
                                    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
                                    map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
                                    map->map_type != BPF_MAP_TYPE_ARRAY &&
                                    map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
                                    map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
                                    map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
                                    map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
                                    map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
                                        ret = -EOPNOTSUPP;
                                        goto free_map_tab;
                                }
                                break;
                        case BPF_LIST_HEAD:
                        case BPF_RB_ROOT:
                                if (map->map_type != BPF_MAP_TYPE_HASH &&
                                    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
                                    map->map_type != BPF_MAP_TYPE_ARRAY) {
                                        ret = -EOPNOTSUPP;
                                        goto free_map_tab;
                                }
                                break;
                        default:
                                /* Fail if map_type checks are missing for a field type */
                                ret = -EOPNOTSUPP;
                                goto free_map_tab;
                        }
                }
        }

        ret = btf_check_and_fixup_fields(btf, map->record);
        if (ret < 0)
                goto free_map_tab;

        /* Finally let the map type apply its own BTF constraints. */
        if (map->ops->map_check_btf) {
                ret = map->ops->map_check_btf(map, btf, key_type, value_type);
                if (ret < 0)
                        goto free_map_tab;
        }

        return ret;
free_map_tab:
        bpf_map_free_record(map);
        return ret;
}

static bool bpf_net_capable(void)
{
        return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
}

#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
/* Handle map creation; called via syscall.
 *
 * Validates @attr, resolves the map type's ops, performs the token and
 * capability checks, allocates the map, attaches the optional BTF
 * description, publishes the map id and finally installs a new fd.
 *
 * Note that BPF_F_TOKEN_FD is cleared from attr->map_flags as a side effect.
 *
 * Returns the new map fd (>= 0) on success or a negative errno.  On failure
 * before the id is published, the map and whatever it holds (security blob,
 * btf_record, BTF reference) are released here; after that point the map is
 * released through bpf_map_put_with_uref().
 */
static int map_create(union bpf_attr *attr)
{
        const struct bpf_map_ops *ops;
        struct bpf_token *token = NULL;
        int numa_node = bpf_map_attr_numa_node(attr);
        u32 map_type = attr->map_type;
        struct btf_record *map_rec;
        struct btf *map_btf;
        struct bpf_map *map;
        bool token_flag;
        int f_flags;
        int err;

        err = CHECK_ATTR(BPF_MAP_CREATE);
        if (err)
                return -EINVAL;

        /* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it
         * to avoid per-map type checks tripping on unknown flag
         */
        token_flag = attr->map_flags & BPF_F_TOKEN_FD;
        attr->map_flags &= ~BPF_F_TOKEN_FD;

        if (attr->btf_vmlinux_value_type_id) {
                if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
                    attr->btf_key_type_id || attr->btf_value_type_id)
                        return -EINVAL;
        } else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
                return -EINVAL;
        }

        if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
            attr->map_type != BPF_MAP_TYPE_ARENA &&
            attr->map_extra != 0)
                return -EINVAL;

        f_flags = bpf_get_file_flag(attr->map_flags);
        if (f_flags < 0)
                return f_flags;

        if (numa_node != NUMA_NO_NODE &&
            ((unsigned int)numa_node >= nr_node_ids ||
             !node_online(numa_node)))
                return -EINVAL;

        /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
        map_type = attr->map_type;
        if (map_type >= ARRAY_SIZE(bpf_map_types))
                return -EINVAL;
        map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
        ops = bpf_map_types[map_type];
        if (!ops)
                return -EINVAL;

        if (ops->map_alloc_check) {
                err = ops->map_alloc_check(attr);
                if (err)
                        return err;
        }
        if (attr->map_ifindex)
                ops = &bpf_map_offload_ops;
        if (!ops->map_mem_usage)
                return -EINVAL;

        if (token_flag) {
                token = bpf_token_get_from_fd(attr->map_token_fd);
                if (IS_ERR(token))
                        return PTR_ERR(token);

                /* if current token doesn't grant map creation permissions,
                 * then we can't use this token, so ignore it and rely on
                 * system-wide capabilities checks
                 */
                if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) ||
                    !bpf_token_allow_map_type(token, attr->map_type)) {
                        bpf_token_put(token);
                        token = NULL;
                }
        }

        err = -EPERM;

        /* Intent here is for unprivileged_bpf_disabled to block BPF map
         * creation for unprivileged users; other actions depend
         * on fd availability and access to bpffs, so are dependent on
         * object creation success. Even with unprivileged BPF disabled,
         * capability checks are still carried out.
         */
        if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF))
                goto put_token;

        /* check privileged map type permissions */
        switch (map_type) {
        case BPF_MAP_TYPE_ARRAY:
        case BPF_MAP_TYPE_PERCPU_ARRAY:
        case BPF_MAP_TYPE_PROG_ARRAY:
        case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
        case BPF_MAP_TYPE_CGROUP_ARRAY:
        case BPF_MAP_TYPE_ARRAY_OF_MAPS:
        case BPF_MAP_TYPE_HASH:
        case BPF_MAP_TYPE_PERCPU_HASH:
        case BPF_MAP_TYPE_HASH_OF_MAPS:
        case BPF_MAP_TYPE_RINGBUF:
        case BPF_MAP_TYPE_USER_RINGBUF:
        case BPF_MAP_TYPE_CGROUP_STORAGE:
        case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
                /* unprivileged */
                break;
        case BPF_MAP_TYPE_SK_STORAGE:
        case BPF_MAP_TYPE_INODE_STORAGE:
        case BPF_MAP_TYPE_TASK_STORAGE:
        case BPF_MAP_TYPE_CGRP_STORAGE:
        case BPF_MAP_TYPE_BLOOM_FILTER:
        case BPF_MAP_TYPE_LPM_TRIE:
        case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
        case BPF_MAP_TYPE_STACK_TRACE:
        case BPF_MAP_TYPE_QUEUE:
        case BPF_MAP_TYPE_STACK:
        case BPF_MAP_TYPE_LRU_HASH:
        case BPF_MAP_TYPE_LRU_PERCPU_HASH:
        case BPF_MAP_TYPE_STRUCT_OPS:
        case BPF_MAP_TYPE_CPUMAP:
        case BPF_MAP_TYPE_ARENA:
                if (!bpf_token_capable(token, CAP_BPF))
                        goto put_token;
                break;
        case BPF_MAP_TYPE_SOCKMAP:
        case BPF_MAP_TYPE_SOCKHASH:
        case BPF_MAP_TYPE_DEVMAP:
        case BPF_MAP_TYPE_DEVMAP_HASH:
        case BPF_MAP_TYPE_XSKMAP:
                if (!bpf_token_capable(token, CAP_NET_ADMIN))
                        goto put_token;
                break;
        default:
                WARN(1, "unsupported map type %d", map_type);
                goto put_token;
        }

        map = ops->map_alloc(attr);
        if (IS_ERR(map)) {
                err = PTR_ERR(map);
                goto put_token;
        }
        map->ops = ops;
        map->map_type = map_type;

        err = bpf_obj_name_cpy(map->name, attr->map_name,
                               sizeof(attr->map_name));
        if (err < 0)
                goto free_map;

        atomic64_set(&map->refcnt, 1);
        atomic64_set(&map->usercnt, 1);
        mutex_init(&map->freeze_mutex);
        spin_lock_init(&map->owner.lock);

        if (attr->btf_key_type_id || attr->btf_value_type_id ||
            /* Even the map's value is a kernel's struct,
             * the bpf_prog.o must have BTF to begin with
             * to figure out the corresponding kernel's
             * counter part.  Thus, attr->btf_fd has
             * to be valid also.
             */
            attr->btf_vmlinux_value_type_id) {
                struct btf *btf;

                btf = btf_get_by_fd(attr->btf_fd);
                if (IS_ERR(btf)) {
                        err = PTR_ERR(btf);
                        goto free_map;
                }
                if (btf_is_kernel(btf)) {
                        btf_put(btf);
                        err = -EACCES;
                        goto free_map;
                }
                map->btf = btf;

                if (attr->btf_value_type_id) {
                        err = map_check_btf(map, token, btf, attr->btf_key_type_id,
                                            attr->btf_value_type_id);
                        if (err)
                                goto free_map;
                }

                map->btf_key_type_id = attr->btf_key_type_id;
                map->btf_value_type_id = attr->btf_value_type_id;
                map->btf_vmlinux_value_type_id =
                        attr->btf_vmlinux_value_type_id;
        }

        err = security_bpf_map_create(map, attr, token);
        if (err)
                goto free_map_sec;

        err = bpf_map_alloc_id(map);
        if (err)
                goto free_map_sec;

        bpf_map_save_memcg(map);
        bpf_token_put(token);

        err = bpf_map_new_fd(map, f_flags);
        if (err < 0) {
                /* failed to allocate fd.
                 * bpf_map_put_with_uref() is needed because the above
                 * bpf_map_alloc_id() has published the map
                 * to the userspace and the userspace may
                 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
                 */
                bpf_map_put_with_uref(map);
                return err;
        }

        return err;

free_map_sec:
        security_bpf_map_free(map);
free_map:
        /* Mirror the teardown order of bpf_map_free_deferred(): fetch the
         * record and the BTF first, run the map type's ->map_free(), and
         * only then release them.  The record was previously never freed on
         * this path (a leak whenever map_check_btf() had succeeded), and the
         * BTF was dropped before ->map_free() could use it.  Both
         * btf_record_free() and the original btf_put(map->btf) call here
         * already had to cope with unset (NULL) values.
         */
        map_rec = map->record;
        map_btf = map->btf;
        map->ops->map_free(map);
        btf_record_free(map_rec);
        btf_put(map_btf);
put_token:
        bpf_token_put(token);
        return err;
}

/* if error is returned, fd is released.
 * On success caller should complete fd access with matching fdput()
 */
struct bpf_map *__bpf_map_get(struct fd f)
{
        if (!f.file)
                return ERR_PTR(-EBADF);
        if (f.file->f_op != &bpf_map_fops) {
                fdput(f);
                return ERR_PTR(-EINVAL);
        }

        return f.file->private_data;
}

/* Take one additional reference on @map by incrementing map->refcnt. */
void bpf_map_inc(struct bpf_map *map)
{
        atomic64_inc(&map->refcnt);
}
EXPORT_SYMBOL_GPL(bpf_map_inc);

/* Take one additional reference on @map and also bump its user
 * reference count: both map->refcnt and map->usercnt are incremented.
 */
void bpf_map_inc_with_uref(struct bpf_map *map)
{
        atomic64_inc(&map->refcnt);
        atomic64_inc(&map->usercnt);
}
EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);

/* Look up the map behind file descriptor @ufd and take a reference on it.
 *
 * Returns the map with map->refcnt incremented, or the ERR_PTR produced
 * by __bpf_map_get() when @ufd does not refer to a bpf map.
 */
struct bpf_map *bpf_map_get(u32 ufd)
{
        struct fd f = fdget(ufd);
        struct bpf_map *map = __bpf_map_get(f);

        if (!IS_ERR(map)) {
                bpf_map_inc(map);
                fdput(f);
        }

        return map;
}
EXPORT_SYMBOL(bpf_map_get);

/* Same as bpf_map_get(), but the reference taken also counts as a user
 * reference (both map->refcnt and map->usercnt are incremented).
 *
 * Returns the map, or the ERR_PTR produced by __bpf_map_get() when @ufd
 * does not refer to a bpf map.
 */
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
        struct fd f = fdget(ufd);
        struct bpf_map *map = __bpf_map_get(f);

        if (!IS_ERR(map)) {
                bpf_map_inc_with_uref(map);
                fdput(f);
        }

        return map;
}

/* Take a reference on @map unless its refcount has already dropped to
 * zero.  When @uref is true the user reference count is bumped as well.
 *
 * map_idr_lock should have been held or the map should have been
 * protected by rcu read lock.
 *
 * Returns @map on success, ERR_PTR(-ENOENT) if the refcount was zero.
 */
struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
        /* atomic64_fetch_add_unless() returns the old 64-bit counter value.
         * Test it directly rather than through an 'int', which would
         * truncate it and mistake any count whose low 32 bits are zero for
         * a dead map (after the increment had already been applied).
         */
        if (!atomic64_fetch_add_unless(&map->refcnt, 1, 0))
                return ERR_PTR(-ENOENT);
        if (uref)
                atomic64_inc(&map->usercnt);

        return map;
}

/* Locked wrapper around __bpf_map_inc_not_zero(): takes a plain (non-user)
 * reference on @map under map_idr_lock.
 *
 * Returns @map, or ERR_PTR(-ENOENT) if its refcount was already zero.
 */
struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
{
        struct bpf_map *ret;

        spin_lock_bh(&map_idr_lock);
        ret = __bpf_map_inc_not_zero(map, false);
        spin_unlock_bh(&map_idr_lock);

        return ret;
}
EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);

/* Weak default for bpf_stackmap_copy(): always reports -ENOTSUPP.  Being
 * __weak, it is replaced by any non-weak definition linked into the kernel.
 */
int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
        return -ENOTSUPP;
}

/* Duplicate a map key from user space.
 *
 * With a non-zero @key_size the key is copied via vmemdup_user() and that
 * call's result (buffer or ERR_PTR) is returned.  For key-less maps
 * (@key_size == 0) the user pointer must be NULL: NULL is returned in
 * that case and ERR_PTR(-EINVAL) otherwise.
 */
static void *__bpf_copy_key(void __user *ukey, u64 key_size)
{
        if (!key_size)
                return ukey ? ERR_PTR(-EINVAL) : NULL;

        return vmemdup_user(ukey, key_size);
}

/* bpfptr_t flavour of __bpf_copy_key().
 *
 * With a non-zero @key_size the key is duplicated via kvmemdup_bpfptr()
 * and that call's result is returned.  For key-less maps the pointer
 * must be null: NULL is returned then, ERR_PTR(-EINVAL) otherwise.
 */
static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
{
        if (!key_size)
                return bpfptr_is_null(ukey) ? NULL : ERR_PTR(-EINVAL);

        return kvmemdup_bpfptr(ukey, key_size);
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags

/* Handler for the BPF_MAP_LOOKUP_ELEM command.
 *
 * Reads the key from attr->key, looks the element up in the map named by
 * attr->map_fd and copies the value out to attr->value.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL  attr fails CHECK_ATTR, has flags other than BPF_F_LOCK, or
 *            BPF_F_LOCK is set but the map record has no BPF_SPIN_LOCK
 *   -EPERM   the fd does not grant read permission on the map
 *   -ENOMEM  the value buffer could not be allocated
 *   -EFAULT  copying to/from user memory failed
 * plus whatever __bpf_map_get(), __bpf_copy_key() or bpf_map_copy_value()
 * report.
 */
static int map_lookup_elem(union bpf_attr *attr)
{
        void __user *ukey = u64_to_user_ptr(attr->key);
        void __user *uvalue = u64_to_user_ptr(attr->value);
        int ufd = attr->map_fd;
        struct bpf_map *map;
        void *key, *value;
        u32 value_size;
        struct fd f;
        int err;

        if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
                return -EINVAL;

        /* BPF_F_LOCK is the only flag this command accepts */
        if (attr->flags & ~BPF_F_LOCK)
                return -EINVAL;

        f = fdget(ufd);
        map = __bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);
        if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
                err = -EPERM;
                goto err_put;
        }

        if ((attr->flags & BPF_F_LOCK) &&
            !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
                err = -EINVAL;
                goto err_put;
        }

        key = __bpf_copy_key(ukey, map->key_size);
        if (IS_ERR(key)) {
                err = PTR_ERR(key);
                /* key holds an error pointer here, so skip free_key */
                goto err_put;
        }

        value_size = bpf_map_value_size(map);

        err = -ENOMEM;
        value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
        if (!value)
                goto free_key;

        /* For bloom filters the user-supplied value is read in first and
         * handed to bpf_map_copy_value(); nothing is copied back to user
         * space on this path.
         */
        if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
                if (copy_from_user(value, uvalue, value_size))
                        err = -EFAULT;
                else
                        err = bpf_map_copy_value(map, key, value, attr->flags);
                goto free_value;
        }

        err = bpf_map_copy_value(map, key, value, attr->flags);
        if (err)
                goto free_value;

        err = -EFAULT;
        if (copy_to_user(uvalue, value, value_size) != 0)
                goto free_value;

        err = 0;

free_value:
        kvfree(value);
free_key:
        kvfree(key);
err_put:
        fdput(f);
        return err;
}


/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags

/* Handler for the BPF_MAP_UPDATE_ELEM command.
 *
 * Copies the key and value referenced by @attr (interpreted as kernel or
 * user pointers according to uattr.is_kernel) and stores them in the map
 * named by attr->map_fd via bpf_map_update_value().
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL  attr fails CHECK_ATTR, or BPF_F_LOCK is set but the map
 *            record has no BPF_SPIN_LOCK
 *   -EPERM   the fd does not grant write permission on the map
 * plus whatever __bpf_map_get(), ___bpf_copy_key(), kvmemdup_bpfptr() or
 * bpf_map_update_value() report.
 */
static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
{
        bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
        bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
        int ufd = attr->map_fd;
        struct bpf_map *map;
        void *key, *value;
        u32 value_size;
        struct fd f;
        int err;

        if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
                return -EINVAL;

        f = fdget(ufd);
        map = __bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);
        /* Mark a write as in flight before the permission check; every
         * exit below goes through err_put, which drops the marker again.
         */
        bpf_map_write_active_inc(map);
        if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
                err = -EPERM;
                goto err_put;
        }

        if ((attr->flags & BPF_F_LOCK) &&
            !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
                err = -EINVAL;
                goto err_put;
        }

        key = ___bpf_copy_key(ukey, map->key_size);
        if (IS_ERR(key)) {
                err = PTR_ERR(key);
                /* key holds an error pointer here, so skip free_key */
                goto err_put;
        }

        value_size = bpf_map_value_size(map);
        value = kvmemdup_bpfptr(uvalue, value_size);
        if (IS_ERR(value)) {
                err = PTR_ERR(value);
                goto free_key;
        }

        err = bpf_map_update_value(map, f.file, key, value, attr->flags);
        if (!err)
                maybe_wait_bpf_programs(map);

        kvfree(value);
free_key:
        kvfree(key);
err_put:
        bpf_map_write_active_dec(map);
        fdput(f);
        return err;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key

/* Handler for the BPF_MAP_DELETE_ELEM command.
 *
 * Copies the key referenced by attr->key (kernel or user pointer according
 * to uattr.is_kernel) and deletes the matching element from the map named
 * by attr->map_fd.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL  attr fails CHECK_ATTR
 *   -EPERM   the fd does not grant write permission on the map
 * plus whatever __bpf_map_get(), ___bpf_copy_key() or the delete
 * operation itself report.
 */
static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
{
        bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
        int ufd = attr->map_fd;
        struct bpf_map *map;
        struct fd f;
        void *key;
        int err;

        if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
                return -EINVAL;

        f = fdget(ufd);
        map = __bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);
        /* Mark a write as in flight; err_put drops the marker again */
        bpf_map_write_active_inc(map);
        if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
                err = -EPERM;
                goto err_put;
        }

        key = ___bpf_copy_key(ukey, map->key_size);
        if (IS_ERR(key)) {
                err = PTR_ERR(key);
                /* key holds an error pointer here, so skip kvfree(key) */
                goto err_put;
        }

        if (bpf_map_is_offloaded(map)) {
                err = bpf_map_offload_delete_elem(map, key);
                goto out;
        } else if (IS_FD_PROG_ARRAY(map) ||
                   map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
                /* These maps require sleepable context */
                err = map->ops->map_delete_elem(map, key);
                goto out;
        }

        /* All other map types: delete with instrumentation disabled and
         * under the RCU read lock.
         */
        bpf_disable_instrumentation();
        rcu_read_lock();
        err = map->ops->map_delete_elem(map, key);
        rcu_read_unlock();
        bpf_enable_instrumentation();
        if (!err)
                maybe_wait_bpf_programs(map);
out:
        kvfree(key);
err_put:
        bpf_map_write_active_dec(map);
        fdput(f);
        return err;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key

/* Handler for the BPF_MAP_GET_NEXT_KEY command.
 *
 * Asks the map named by attr->map_fd for the key following attr->key (a
 * NULL attr->key is passed through to the map as a NULL key) and copies
 * the answer to attr->next_key.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL  attr fails CHECK_ATTR
 *   -EPERM   the fd does not grant read permission on the map
 *   -ENOMEM  the next-key buffer could not be allocated
 *   -EFAULT  the next key could not be copied to user space
 * plus whatever __bpf_map_get(), __bpf_copy_key() or the map's
 * get_next_key operation report.
 */
static int map_get_next_key(union bpf_attr *attr)
{
        void __user *unext_key = u64_to_user_ptr(attr->next_key);
        void __user *ukey = u64_to_user_ptr(attr->key);
        void *key = NULL, *next_key;
        int ufd = attr->map_fd;
        struct bpf_map *map;
        struct fd f;
        int err;

        if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
                return -EINVAL;

        f = fdget(ufd);
        map = __bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);
        if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
                err = -EPERM;
                goto err_put;
        }

        /* Without a user key, the map is queried with a NULL key */
        if (ukey) {
                key = __bpf_copy_key(ukey, map->key_size);
                if (IS_ERR(key)) {
                        err = PTR_ERR(key);
                        /* error pointer, nothing to free */
                        goto err_put;
                }
        }

        next_key = kvmalloc(map->key_size, GFP_USER);
        if (!next_key) {
                err = -ENOMEM;
                goto free_key;
        }

        if (bpf_map_is_offloaded(map)) {
                err = bpf_map_offload_get_next_key(map, key, next_key);
        } else {
                rcu_read_lock();
                err = map->ops->map_get_next_key(map, key, next_key);
                rcu_read_unlock();
        }

        if (!err && copy_to_user(unext_key, next_key, map->key_size) != 0)
                err = -EFAULT;

        kvfree(next_key);
free_key:
        kvfree(key);
err_put:
        fdput(f);
        return err;
}

/* Generic batched delete: removes up to attr->batch.count elements whose
 * keys are laid out back to back in the user array attr->batch.keys.
 *
 * The number of keys processed before the loop stopped is written to
 * uattr->batch.count (it is first reset to 0).
 *
 * Returns 0 on success (or when the requested count is 0) or a negative
 * errno:
 *   -EINVAL  elem_flags has bits other than BPF_F_LOCK, or BPF_F_LOCK is
 *            set but the map record has no BPF_SPIN_LOCK
 *   -EFAULT  a user-memory access failed
 *   -ENOMEM  the key buffer could not be allocated
 * plus whatever the delete operation reports.
 */
int generic_map_delete_batch(struct bpf_map *map,
                             const union bpf_attr *attr,
                             union bpf_attr __user *uattr)
{
        void __user *keys = u64_to_user_ptr(attr->batch.keys);
        u32 cp, max_count;
        int err = 0;
        void *key;

        if (attr->batch.elem_flags & ~BPF_F_LOCK)
                return -EINVAL;

        if ((attr->batch.elem_flags & BPF_F_LOCK) &&
            !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
                return -EINVAL;
        }

        max_count = attr->batch.count;
        if (!max_count)
                return 0;

        if (put_user(0, &uattr->batch.count))
                return -EFAULT;

        key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
        if (!key)
                return -ENOMEM;

        for (cp = 0; cp < max_count; cp++) {
                err = -EFAULT;
                /* Widen cp before scaling: a u32 * u32 product would wrap
                 * once cp * key_size exceeds 4GiB and make us read keys
                 * from the wrong place in the user array.
                 */
                if (copy_from_user(key,
                                   keys + (unsigned long)cp * map->key_size,
                                   map->key_size))
                        break;

                /* NOTE(review): the offloaded path stops after the first
                 * key regardless of the result and without counting it;
                 * behaviour kept as-is - confirm whether this is intended.
                 */
                if (bpf_map_is_offloaded(map)) {
                        err = bpf_map_offload_delete_elem(map, key);
                        break;
                }

                bpf_disable_instrumentation();
                rcu_read_lock();
                err = map->ops->map_delete_elem(map, key);
                rcu_read_unlock();
                bpf_enable_instrumentation();
                if (err)
                        break;
                cond_resched();
        }
        if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
                err = -EFAULT;

        kvfree(key);

        return err;
}

/* Generic batched update: stores up to attr->batch.count key/value pairs
 * taken from the parallel user arrays attr->batch.keys and
 * attr->batch.values, using bpf_map_update_value() for each pair.
 *
 * The number of pairs stored before the loop stopped is written to
 * uattr->batch.count (it is first reset to 0).
 *
 * Returns 0 on success (or when the requested count is 0) or a negative
 * errno:
 *   -EINVAL  elem_flags has bits other than BPF_F_LOCK, or BPF_F_LOCK is
 *            set but the map record has no BPF_SPIN_LOCK
 *   -EFAULT  a user-memory access failed
 *   -ENOMEM  a staging buffer could not be allocated
 * plus whatever bpf_map_update_value() reports.
 */
int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
                             const union bpf_attr *attr,
                             union bpf_attr __user *uattr)
{
        void __user *values = u64_to_user_ptr(attr->batch.values);
        void __user *keys = u64_to_user_ptr(attr->batch.keys);
        u32 value_size, cp, max_count;
        void *key, *value;
        int err = 0;

        if (attr->batch.elem_flags & ~BPF_F_LOCK)
                return -EINVAL;

        if ((attr->batch.elem_flags & BPF_F_LOCK) &&
            !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
                return -EINVAL;
        }

        value_size = bpf_map_value_size(map);

        max_count = attr->batch.count;
        if (!max_count)
                return 0;

        if (put_user(0, &uattr->batch.count))
                return -EFAULT;

        key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
        if (!key)
                return -ENOMEM;

        value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
        if (!value) {
                kvfree(key);
                return -ENOMEM;
        }

        for (cp = 0; cp < max_count; cp++) {
                /* Widened index: a u32 * u32 product would wrap once the
                 * byte offset exceeds 4GiB and make us read elements from
                 * the wrong place in the user arrays.
                 */
                unsigned long idx = cp;

                err = -EFAULT;
                if (copy_from_user(key, keys + idx * map->key_size,
                                   map->key_size) ||
                    copy_from_user(value, values + idx * value_size,
                                   value_size))
                        break;

                err = bpf_map_update_value(map, map_file, key, value,
                                           attr->batch.elem_flags);

                if (err)
                        break;
                cond_resched();
        }

        if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
                err = -EFAULT;

        kvfree(value);
        kvfree(key);

        return err;
}

#define MAP_LOOKUP_RETRIES 3

/* Generic batched lookup: walks the map with map_get_next_key(), starting
 * after the key in attr->batch.in_batch (or from the beginning when that
 * pointer is NULL), and copies up to attr->batch.count key/value pairs
 * into the user arrays attr->batch.keys and attr->batch.values.
 *
 * On exit the number of pairs copied is written to uattr->batch.count and,
 * if at least one pair was copied, the last visited key is written to
 * attr->batch.out_batch.  Neither is written when the walk
 * failed with -EFAULT or with a bpf_map_copy_value() error other than
 * -ENOENT.
 *
 * A key returned by map_get_next_key() whose value cannot be read
 * (-ENOENT from bpf_map_copy_value()) is skipped and the walk continues
 * with the following key.  Retrying the same position instead would never
 * make progress on maps that legitimately contain such holes, and every
 * batch lookup on them would end in -EINTR.
 *
 * Returns 0 when the requested count was filled (or was 0), the error of
 * map_get_next_key() when the walk ended early, or another negative
 * errno:
 *   -EINVAL  elem_flags has bits other than BPF_F_LOCK, or BPF_F_LOCK is
 *            set but the map record has no BPF_SPIN_LOCK
 *   -EFAULT  a user-memory access failed
 *   -ENOMEM  a staging buffer could not be allocated
 */
int generic_map_lookup_batch(struct bpf_map *map,
                                    const union bpf_attr *attr,
                                    union bpf_attr __user *uattr)
{
        void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
        void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
        void __user *values = u64_to_user_ptr(attr->batch.values);
        void __user *keys = u64_to_user_ptr(attr->batch.keys);
        void *buf, *buf_prevkey, *prev_key, *key, *value;
        u32 value_size, cp, max_count;
        int err;

        if (attr->batch.elem_flags & ~BPF_F_LOCK)
                return -EINVAL;

        if ((attr->batch.elem_flags & BPF_F_LOCK) &&
            !btf_record_has_field(map->record, BPF_SPIN_LOCK))
                return -EINVAL;

        value_size = bpf_map_value_size(map);

        max_count = attr->batch.count;
        if (!max_count)
                return 0;

        if (put_user(0, &uattr->batch.count))
                return -EFAULT;

        buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
        if (!buf_prevkey)
                return -ENOMEM;

        /* buf holds one key immediately followed by one value */
        buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
        if (!buf) {
                kvfree(buf_prevkey);
                return -ENOMEM;
        }

        err = -EFAULT;
        prev_key = NULL;
        if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
                goto free_buf;
        key = buf;
        value = key + map->key_size;
        if (ubatch)
                prev_key = buf_prevkey;

        for (cp = 0; cp < max_count;) {
                rcu_read_lock();
                err = map->ops->map_get_next_key(map, prev_key, key);
                rcu_read_unlock();
                if (err)
                        break;
                err = bpf_map_copy_value(map, key, value,
                                         attr->batch.elem_flags);

                /* The key has no readable value: skip it but still
                 * advance past it so the walk makes progress.
                 */
                if (err == -ENOENT)
                        goto next_key;

                if (err)
                        goto free_buf;

                /* Widen cp before scaling so the byte offset cannot wrap
                 * in 32 bits for large batches.
                 */
                if (copy_to_user(keys + (unsigned long)cp * map->key_size, key,
                                 map->key_size)) {
                        err = -EFAULT;
                        goto free_buf;
                }
                if (copy_to_user(values + (unsigned long)cp * value_size, value,
                                 value_size)) {
                        err = -EFAULT;
                        goto free_buf;
                }

                cp++;
next_key:
                if (!prev_key)
                        prev_key = buf_prevkey;

                /* The key just visited becomes the cursor for the next
                 * map_get_next_key() call; the other buffer is reused.
                 */
                swap(prev_key, key);
                cond_resched();
        }

        if (err == -EFAULT)
                goto free_buf;

        if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
                    (cp && copy_to_user(uobatch, prev_key, map->key_size))))
                err = -EFAULT;

free_buf:
        kvfree(buf_prevkey);
        kvfree(buf);
        return err;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags

/* Handler for the BPF_MAP_LOOKUP_AND_DELETE_ELEM command.
 *
 * Removes an element from the map named by attr->map_fd and copies its
 * value to attr->value.  Queue and stack maps are popped; hash, per-CPU
 * hash, LRU hash and LRU per-CPU hash maps use the map's
 * map_lookup_and_delete_elem operation with the key from attr->key.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL    attr fails CHECK_ATTR, has flags other than BPF_F_LOCK,
 *              has any flag set for a queue/stack map, or BPF_F_LOCK is
 *              set but the map record has no BPF_SPIN_LOCK
 *   -EPERM     the fd does not grant both read and write permission
 *   -ENOMEM    the value buffer could not be allocated
 *   -ENOTSUPP  the map type is not handled here, or it is an offloaded
 *              map of one of the hash types
 *   -EFAULT    the value could not be copied to user space
 * plus whatever __bpf_map_get(), __bpf_copy_key() or the map operation
 * report.
 */
static int map_lookup_and_delete_elem(union bpf_attr *attr)
{
        void __user *ukey = u64_to_user_ptr(attr->key);
        void __user *uvalue = u64_to_user_ptr(attr->value);
        int ufd = attr->map_fd;
        struct bpf_map *map;
        void *key, *value;
        u32 value_size;
        struct fd f;
        int err;

        if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
                return -EINVAL;

        if (attr->flags & ~BPF_F_LOCK)
                return -EINVAL;

        f = fdget(ufd);
        map = __bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);
        /* Mark a write as in flight; err_put drops the marker again */
        bpf_map_write_active_inc(map);
        if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
            !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
                err = -EPERM;
                goto err_put;
        }

        /* Queue and stack maps accept no flags at all */
        if (attr->flags &&
            (map->map_type == BPF_MAP_TYPE_QUEUE ||
             map->map_type == BPF_MAP_TYPE_STACK)) {
                err = -EINVAL;
                goto err_put;
        }

        if ((attr->flags & BPF_F_LOCK) &&
            !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
                err = -EINVAL;
                goto err_put;
        }

        key = __bpf_copy_key(ukey, map->key_size);
        if (IS_ERR(key)) {
                err = PTR_ERR(key);
                /* key holds an error pointer here, so skip free_key */
                goto err_put;
        }

        value_size = bpf_map_value_size(map);

        err = -ENOMEM;
        value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
        if (!value)
                goto free_key;

        /* Default result for map types that none of the branches handle */
        err = -ENOTSUPP;
        if (map->map_type == BPF_MAP_TYPE_QUEUE ||
            map->map_type == BPF_MAP_TYPE_STACK) {
                err = map->ops->map_pop_elem(map, value);
        } else if (map->map_type == BPF_MAP_TYPE_HASH ||
                   map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
                   map->map_type == BPF_MAP_TYPE_LRU_HASH ||
                   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
                /* Offloaded maps keep the -ENOTSUPP default */
                if (!bpf_map_is_offloaded(map)) {
                        bpf_disable_instrumentation();
                        rcu_read_lock();
                        err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
                        rcu_read_unlock();
                        bpf_enable_instrumentation();
                }
        }

        if (err)
                goto free_value;

        if (copy_to_user(uvalue, value, value_size) != 0) {
                err = -EFAULT;
                goto free_value;
        }

        err = 0;

free_value:
        kvfree(value);
free_key:
        kvfree(key);
err_put:
        bpf_map_write_active_dec(map);
        fdput(f);
        return err;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_FREEZE_LAST_FIELD map_fd

/* Handler for the BPF_MAP_FREEZE command: marks the map named by
 * attr->map_fd as frozen.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL    attr fails CHECK_ATTR
 *   -ENOTSUPP  the map is a struct_ops map or has a (valid) record
 *   -EPERM     the fd does not grant write permission on the map
 *   -EBUSY     a write is currently active on the map, or the map is
 *              already frozen
 * plus whatever __bpf_map_get() reports.
 */
static int map_freeze(const union bpf_attr *attr)
{
        int ufd = attr->map_fd;
        struct bpf_map *map;
        struct fd f;
        int err;

        if (CHECK_ATTR(BPF_MAP_FREEZE))
                return -EINVAL;

        f = fdget(ufd);
        map = __bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);

        if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) {
                err = -ENOTSUPP;
                goto out_put;
        }

        if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
                err = -EPERM;
                goto out_put;
        }

        /* The frozen flag is only flipped with freeze_mutex held */
        mutex_lock(&map->freeze_mutex);
        if (bpf_map_write_active(map) || READ_ONCE(map->frozen)) {
                err = -EBUSY;
        } else {
                WRITE_ONCE(map->frozen, true);
                err = 0;
        }
        mutex_unlock(&map->freeze_mutex);
out_put:
        fdput(f);
        return err;
}

/* Table of program ops indexed by program type id.
 *
 * It is filled by including <linux/bpf_types.h> with BPF_PROG_TYPE()
 * defined to emit "[_id] = &<_name>_prog_ops," while BPF_MAP_TYPE() and
 * BPF_LINK_TYPE() expand to nothing.  The helper macros are undefined
 * again right after the include.
 */
static const struct bpf_prog_ops * const bpf_prog_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
        [_id] = & _name ## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
};

/* Bind @prog to the ops registered for program type @type.
 *
 * Offloaded programs get bpf_offload_prog_ops instead of the per-type
 * ops.  On success prog->type is set to @type.
 *
 * Returns 0 on success, -EINVAL if @type is out of range or has no entry
 * in bpf_prog_types[].
 */
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
        const struct bpf_prog_ops *type_ops;

        if (type >= ARRAY_SIZE(bpf_prog_types))
                return -EINVAL;

        /* clamp the index under speculation before using it */
        type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
        type_ops = bpf_prog_types[type];
        if (!type_ops)
                return -EINVAL;

        prog->aux->ops = bpf_prog_is_offloaded(prog->aux) ?
                         &bpf_offload_prog_ops : type_ops;
        prog->type = type;
        return 0;
}

enum bpf_audit {
        BPF_AUDIT_LOAD,
        BPF_AUDIT_UNLOAD,
        BPF_AUDIT_MAX,
};

static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
        [BPF_AUDIT_LOAD]   = "LOAD",
        [BPF_AUDIT_UNLOAD] = "UNLOAD",
};

/* Emit an AUDIT_BPF record "prog-id=<id> op=<LOAD|UNLOAD>" for @prog.
 *
 * Nothing is logged when @op is out of range (a one-time warning is
 * raised), when auditing is off, or when no audit buffer could be
 * started.  The current audit context is only used outside of hard-irq
 * context and with interrupts enabled; otherwise a NULL context is passed.
 */
static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
{
        struct audit_context *ctx;
        struct audit_buffer *ab;

        if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
                return;
        if (audit_enabled == AUDIT_OFF)
                return;

        ctx = (in_irq() || irqs_disabled()) ? NULL : audit_context();

        ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
        if (unlikely(!ab))
                return;

        audit_log_format(ab, "prog-id=%u op=%s",
                         prog->aux->id, bpf_audit_str[op]);
        audit_log_end(ab);
}

/* Allocate an id for @prog in prog_idr and store it in prog->aux->id.
 *
 * Returns 0 on success, the negative error from idr_alloc_cyclic() on
 * failure, or -ENOSPC (with a one-time warning) should the allocator ever
 * hand back id 0.
 */
static int bpf_prog_alloc_id(struct bpf_prog *prog)
{
        int new_id;

        idr_preload(GFP_KERNEL);
        spin_lock_bh(&prog_idr_lock);
        new_id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
        if (new_id > 0)
                prog->aux->id = new_id;
        spin_unlock_bh(&prog_idr_lock);
        idr_preload_end();

        /* id is in [1, INT_MAX) */
        if (WARN_ON_ONCE(!new_id))
                return -ENOSPC;

        if (new_id < 0)
                return new_id;

        return 0;
}

void bpf_prog_free_id(struct bpf_prog *prog)
{
        unsigned long flags;

        /* cBPF to eBPF migrations are currently not in the idr store.
         * Offloaded programs are removed from the store when their device
         * disappears - even if someone grabs an fd to them they are unusable,
         * simply waiting for refcnt to drop to be freed.
         */
        if (!prog->aux->id)
                return;

        spin_lock_irqsave(&prog_idr_lock, flags);
        idr_remove(&prog_idr, prog->aux->id);
        prog->aux->id = 0;
        spin_unlock_irqrestore(&prog_idr_lock, flags);
}

/* Final teardown step for a program, written as an RCU callback: @rcu is
 * the rcu_head embedded in the program's bpf_prog_aux.  Frees the
 * func_info arrays, drops the user reference, releases the security
 * state and finally frees the program itself.
 */
static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
        struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
        struct bpf_prog *prog = aux->prog;

        kvfree(aux->func_info);
        kfree(aux->func_info_aux);
        free_uid(aux->user);
        security_bpf_prog_free(prog);
        bpf_prog_free(prog);
}

/* Release what @prog holds (kallsyms entries, BTF and module references,
 * line info, kfunc table, attach BTF) and then free the program.
 *
 * With @deferred set, the final free runs from an RCU callback
 * (call_rcu_tasks_trace() for sleepable programs, call_rcu() otherwise);
 * without it, __bpf_prog_put_rcu() is invoked directly.
 */
static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
{
        struct bpf_prog_aux *aux;

        bpf_prog_kallsyms_del_all(prog);

        aux = prog->aux;
        btf_put(aux->btf);
        module_put(aux->mod);
        kvfree(aux->jited_linfo);
        kvfree(aux->linfo);
        kfree(aux->kfunc_tab);
        if (aux->attach_btf)
                btf_put(aux->attach_btf);

        if (!deferred) {
                __bpf_prog_put_rcu(&aux->rcu);
                return;
        }

        if (prog->sleepable)
                call_rcu_tasks_trace(&aux->rcu, __bpf_prog_put_rcu);
        else
                call_rcu(&aux->rcu, __bpf_prog_put_rcu);
}

/* Tear down the program that owns @work (the work_struct embedded in its
 * bpf_prog_aux): emit the perf unload event and the audit record, drop
 * the program id and hand over to __bpf_prog_put_noref() with deferred
 * freeing.  Usable both as a work item and as a direct call.
 */
static void bpf_prog_put_deferred(struct work_struct *work)
{
        struct bpf_prog_aux *aux = container_of(work, struct bpf_prog_aux, work);
        struct bpf_prog *prog = aux->prog;

        perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
        bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
        bpf_prog_free_id(prog);
        __bpf_prog_put_noref(prog, true);
}

/* Drop one reference on @prog.  When it was the last one, the program is
 * torn down: via a scheduled work item if called in hard-irq context or
 * with interrupts disabled, directly otherwise.
 */
static void __bpf_prog_put(struct bpf_prog *prog)
{
        struct bpf_prog_aux *aux = prog->aux;

        if (!atomic64_dec_and_test(&aux->refcnt))
                return;

        if (in_irq() || irqs_disabled()) {
                INIT_WORK(&aux->work, bpf_prog_put_deferred);
                schedule_work(&aux->work);
                return;
        }

        bpf_prog_put_deferred(&aux->work);
}

/* Exported entry point for dropping a program reference; forwards to
 * __bpf_prog_put().
 */
void bpf_prog_put(struct bpf_prog *prog)
{
        __bpf_prog_put(prog);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);

/* ->release handler of bpf_prog_fops: drops the program reference stored
 * in filp->private_data.  Always returns 0.
 */
static int bpf_prog_release(struct inode *inode, struct file *filp)
{
        struct bpf_prog *prog = filp->private_data;

        bpf_prog_put(prog);
        return 0;
}

/* Program statistics summed over all possible CPUs, as filled in by
 * bpf_prog_get_stats().
 */
struct bpf_prog_kstats {
        u64 nsecs;      /* sum of the per-CPU 'nsecs' counters */
        u64 cnt;        /* sum of the per-CPU 'cnt' counters */
        u64 misses;     /* sum of the per-CPU 'misses' counters */
};

/* Increment the 'misses' counter of @prog in the current CPU's stats,
 * inside an irq-saving u64_stats update section.
 */
void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
{
        struct bpf_prog_stats *stats;
        /* The irqsave/irqrestore pair traffics in 'unsigned long' flags;
         * keep the saved state in that type instead of narrowing it.
         */
        unsigned long flags;

        stats = this_cpu_ptr(prog->stats);
        flags = u64_stats_update_begin_irqsave(&stats->syncp);
        u64_stats_inc(&stats->misses);
        u64_stats_update_end_irqrestore(&stats->syncp, flags);
}

static void bpf_prog_get_stats(const struct bpf_prog *prog,
                               struct bpf_prog_kstats *stats)
{
        u64 nsecs = 0, cnt = 0, misses = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                const struct bpf_prog_stats *st;
                unsigned int start;
                u64 tnsecs, tcnt, tmisses;

                st = per_cpu_ptr(prog->stats, cpu);
                do {
                        start = u64_stats_fetch_begin(&st->syncp);
                        tnsecs = u64_stats_read(&st->nsecs);
                        tcnt = u64_stats_read(&st->cnt);
                        tmisses = u64_stats_read(&st->misses);
                } while (u64_stats_fetch_retry(&st->syncp, start));
                nsecs += tnsecs;
                cnt += tcnt;
                misses += tmisses;
        }
        stats->nsecs = nsecs;
        stats->cnt = cnt;
        stats->misses = misses;
}

#ifdef CONFIG_PROC_FS
/* ->show_fdinfo handler of bpf_prog_fops: prints one "name:\tvalue" line
 * per program attribute to @m for the program stored in
 * filp->private_data.  The tag is printed as hex, "memlock" is the
 * program's page count converted to bytes, and the run-time counters come
 * from bpf_prog_get_stats().
 */
static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
        const struct bpf_prog *prog = filp->private_data;
        /* two hex digits per tag byte plus the terminating NUL */
        char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
        struct bpf_prog_kstats stats;

        bpf_prog_get_stats(prog, &stats);
        bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
        seq_printf(m,
                   "prog_type:\t%u\n"
                   "prog_jited:\t%u\n"
                   "prog_tag:\t%s\n"
                   "memlock:\t%llu\n"
                   "prog_id:\t%u\n"
                   "run_time_ns:\t%llu\n"
                   "run_cnt:\t%llu\n"
                   "recursion_misses:\t%llu\n"
                   "verified_insns:\t%u\n",
                   prog->type,
                   prog->jited,
                   prog_tag,
                   /* '*' binds tighter than '<<': (pages * 1ULL) << PAGE_SHIFT */
                   prog->pages * 1ULL << PAGE_SHIFT,
                   prog->aux->id,
                   stats.nsecs,
                   stats.cnt,
                   stats.misses,
                   prog->aux->verified_insns);
}
#endif

/* File operations of bpf program fds.  ____bpf_prog_get() recognises a
 * program fd by comparing the file's f_op against this table.
 * show_fdinfo is only wired up when CONFIG_PROC_FS is enabled.
 */
const struct file_operations bpf_prog_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo        = bpf_prog_show_fdinfo,
#endif
        .release        = bpf_prog_release,
        .read                = bpf_dummy_read,
        .write                = bpf_dummy_write,
};

/* Create a new "bpf-prog" anonymous-inode fd (O_RDWR | O_CLOEXEC) whose
 * file carries @prog as private data.
 *
 * Returns the value of anon_inode_getfd(), or the negative error from
 * security_bpf_prog() if that check fails first.
 */
int bpf_prog_new_fd(struct bpf_prog *prog)
{
        int err = security_bpf_prog(prog);

        if (err < 0)
                return err;

        return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
                                O_RDWR | O_CLOEXEC);
}

static struct bpf_prog *____bpf_prog_get(struct fd f)
{
        if (!f.file)
                return ERR_PTR(-EBADF);
        if (f.file->f_op != &bpf_prog_fops) {
                fdput(f);
                return ERR_PTR(-EINVAL);
        }

        return f.file->private_data;
}

/* Add @i references to @prog in one step (prog->aux->refcnt += @i). */
void bpf_prog_add(struct bpf_prog *prog, int i)
{
        atomic64_add(i, &prog->aux->refcnt);
}
EXPORT_SYMBOL_GPL(bpf_prog_add);

/* Remove @i references from @prog without triggering teardown. */
void bpf_prog_sub(struct bpf_prog *prog, int i)
{
        /* Only to be used for undoing previous bpf_prog_add() in some
         * error path. We still know that another entity in our call
         * path holds a reference to the program, thus a plain atomic
         * subtraction can be safely used in such cases!  Reaching zero
         * here would contradict that, hence the WARN_ON().
         */
        WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0);
}
EXPORT_SYMBOL_GPL(bpf_prog_sub);

/* Take one additional reference on @prog (increments prog->aux->refcnt). */
void bpf_prog_inc(struct bpf_prog *prog)
{
        atomic64_inc(&prog->aux->refcnt);
}
EXPORT_SYMBOL_GPL(bpf_prog_inc);

/* Take a reference on @prog unless its refcount has already dropped to
 * zero.
 *
 * prog_idr_lock should have been held.
 *
 * Returns @prog on success, ERR_PTR(-ENOENT) if the refcount was zero.
 */
struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
{
        /* atomic64_fetch_add_unless() returns the old 64-bit counter value.
         * Test it directly rather than through an 'int', which would
         * truncate it and mistake any count whose low 32 bits are zero for
         * a dead program (after the increment had already been applied).
         */
        if (!atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0))
                return ERR_PTR(-ENOENT);

        return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);

/* Decide whether @prog may be handed out to a caller.
 *
 * A NULL @attach_type means "plain reference, no attachment" and is
 * always allowed.  Otherwise the program type must equal *@attach_type,
 * and an offloaded program is only accepted when @attach_drv is true.
 */
bool bpf_prog_get_ok(struct bpf_prog *prog,
                            enum bpf_prog_type *attach_type, bool attach_drv)
{
        /* not an attachment, just a refcount inc, always allow */
        if (!attach_type)
                return true;

        return prog->type == *attach_type &&
               !(bpf_prog_is_offloaded(prog->aux) && !attach_drv);
}

/* Look up the program behind file descriptor @ufd, validate it with
 * bpf_prog_get_ok() and take a reference on it.
 *
 * Returns the program with its refcount incremented, ERR_PTR(-EINVAL) if
 * bpf_prog_get_ok() rejects it, or the ERR_PTR from ____bpf_prog_get()
 * when @ufd does not refer to a bpf program.
 */
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
                                       bool attach_drv)
{
        struct fd f = fdget(ufd);
        struct bpf_prog *prog = ____bpf_prog_get(f);

        if (IS_ERR(prog))
                return prog;

        if (bpf_prog_get_ok(prog, attach_type, attach_drv))
                bpf_prog_inc(prog);
        else
                prog = ERR_PTR(-EINVAL);

        fdput(f);
        return prog;
}

/* Get a referenced program from fd @ufd with no type or offload
 * restriction (NULL attach type).  Returns the program or an ERR_PTR.
 */
struct bpf_prog *bpf_prog_get(u32 ufd)
{
        return __bpf_prog_get(ufd, NULL, false);
}

/* Get a referenced program from fd @ufd, requiring it to be of program
 * type @type; offloaded programs are only accepted when @attach_drv is
 * true.  Returns the program or an ERR_PTR.
 */
struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
                                       bool attach_drv)
{
        return __bpf_prog_get(ufd, &type, attach_drv);
}
EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);

/* Initially all BPF programs could be loaded w/o specifying
 * expected_attach_type. Later for some of them specifying expected_attach_type
 * at load time became required so that program could be validated properly.
 * Programs of types that are allowed to be loaded both w/ and w/o (for
 * backward compatibility) expected_attach_type, should have the default attach
 * type assigned to expected_attach_type for the latter case, so that it can be
 * validated later at attach time.
 *
 * bpf_prog_load_fixup_attach_type() fills in that default in @attr: a
 * zero expected_attach_type becomes BPF_CGROUP_INET_SOCK_CREATE for
 * BPF_PROG_TYPE_CGROUP_SOCK and BPF_SK_REUSEPORT_SELECT for
 * BPF_PROG_TYPE_SK_REUSEPORT.  Every other combination is left untouched.
 */
static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
{
        /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't
         * exist so checking for non-zero is the way to go here.
         */
        if (attr->expected_attach_type)
                return;

        switch (attr->prog_type) {
        case BPF_PROG_TYPE_CGROUP_SOCK:
                attr->expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE;
                break;
        case BPF_PROG_TYPE_SK_REUSEPORT:
                attr->expected_attach_type = BPF_SK_REUSEPORT_SELECT;
                break;
        default:
                break;
        }
}

/* Validate the attach-related load parameters against the program type.
 *
 * @prog_type:            type of the program being loaded
 * @expected_attach_type: attach type requested at load time
 * @attach_btf:           BTF object the program attaches to, or NULL
 * @btf_id:               BTF type ID of the attach target, or 0
 * @dst_prog:             destination program to attach to, or NULL
 *
 * Returns 0 when the combination is acceptable, -EINVAL otherwise.
 */
static int
bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
                           enum bpf_attach_type expected_attach_type,
                           struct btf *attach_btf, u32 btf_id,
                           struct bpf_prog *dst_prog)
{
        if (btf_id) {
                /* a BTF ID must be in range and must have something to
                 * resolve against: either a BTF object or a target program
                 */
                if (btf_id > BTF_MAX_TYPE || (!attach_btf && !dst_prog))
                        return -EINVAL;

                /* only these program types accept a BTF ID at all */
                switch (prog_type) {
                case BPF_PROG_TYPE_TRACING:
                case BPF_PROG_TYPE_LSM:
                case BPF_PROG_TYPE_STRUCT_OPS:
                case BPF_PROG_TYPE_EXT:
                        break;
                default:
                        return -EINVAL;
                }
        }

        /* a BTF object needs a BTF ID and excludes a target program */
        if (attach_btf && (dst_prog || !btf_id))
                return -EINVAL;

        /* only TRACING and EXT programs may name a target program */
        if (dst_prog &&
            !(prog_type == BPF_PROG_TYPE_TRACING ||
              prog_type == BPF_PROG_TYPE_EXT))
                return -EINVAL;

        switch (prog_type) {
        case BPF_PROG_TYPE_CGROUP_SOCK:
                switch (expected_attach_type) {
                case BPF_CGROUP_INET_SOCK_CREATE:
                case BPF_CGROUP_INET_SOCK_RELEASE:
                case BPF_CGROUP_INET4_POST_BIND:
                case BPF_CGROUP_INET6_POST_BIND:
                        break;
                default:
                        return -EINVAL;
                }
                return 0;
        case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
                switch (expected_attach_type) {
                case BPF_CGROUP_INET4_BIND:
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
                case BPF_CGROUP_UNIX_CONNECT:
                case BPF_CGROUP_INET4_GETPEERNAME:
                case BPF_CGROUP_INET6_GETPEERNAME:
                case BPF_CGROUP_UNIX_GETPEERNAME:
                case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UNIX_GETSOCKNAME:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UNIX_SENDMSG:
                case BPF_CGROUP_UDP4_RECVMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                case BPF_CGROUP_UNIX_RECVMSG:
                        break;
                default:
                        return -EINVAL;
                }
                return 0;
        case BPF_PROG_TYPE_CGROUP_SKB:
                switch (expected_attach_type) {
                case BPF_CGROUP_INET_INGRESS:
                case BPF_CGROUP_INET_EGRESS:
                        break;
                default:
                        return -EINVAL;
                }
                return 0;
        case BPF_PROG_TYPE_CGROUP_SOCKOPT:
                switch (expected_attach_type) {
                case BPF_CGROUP_SETSOCKOPT:
                case BPF_CGROUP_GETSOCKOPT:
                        break;
                default:
                        return -EINVAL;
                }
                return 0;
        case BPF_PROG_TYPE_SK_LOOKUP:
                return expected_attach_type == BPF_SK_LOOKUP ? 0 : -EINVAL;
        case BPF_PROG_TYPE_SK_REUSEPORT:
                switch (expected_attach_type) {
                case BPF_SK_REUSEPORT_SELECT:
                case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
                        break;
                default:
                        return -EINVAL;
                }
                return 0;
        case BPF_PROG_TYPE_NETFILTER:
                return expected_attach_type == BPF_NETFILTER ? 0 : -EINVAL;
        case BPF_PROG_TYPE_SYSCALL:
        case BPF_PROG_TYPE_EXT:
                /* these types must not carry an expected attach type */
                return expected_attach_type ? -EINVAL : 0;
        default:
                return 0;
        }
}

/* Report whether loading a program of @prog_type is gated on CAP_NET_ADMIN
 * (bpf_prog_load() rejects the load when this returns true and the
 * capability check fails).
 */
static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
{
        bool needs_net_admin = false;

        switch (prog_type) {
        case BPF_PROG_TYPE_SCHED_CLS:
        case BPF_PROG_TYPE_SCHED_ACT:
        case BPF_PROG_TYPE_XDP:
        case BPF_PROG_TYPE_LWT_IN:
        case BPF_PROG_TYPE_LWT_OUT:
        case BPF_PROG_TYPE_LWT_XMIT:
        case BPF_PROG_TYPE_LWT_SEG6LOCAL:
        case BPF_PROG_TYPE_SK_SKB:
        case BPF_PROG_TYPE_SK_MSG:
        case BPF_PROG_TYPE_FLOW_DISSECTOR:
        case BPF_PROG_TYPE_CGROUP_DEVICE:
        case BPF_PROG_TYPE_CGROUP_SOCK:
        case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
        case BPF_PROG_TYPE_CGROUP_SOCKOPT:
        case BPF_PROG_TYPE_CGROUP_SYSCTL:
        case BPF_PROG_TYPE_SOCK_OPS:
        case BPF_PROG_TYPE_EXT: /* extends any prog */
        case BPF_PROG_TYPE_NETFILTER:
                needs_net_admin = true;
                break;
        case BPF_PROG_TYPE_CGROUP_SKB:
                /* always unpriv */
        case BPF_PROG_TYPE_SK_REUSEPORT:
                /* equivalent to SOCKET_FILTER. need CAP_BPF only */
        default:
                break;
        }

        return needs_net_admin;
}

/* Report whether loading a program of @prog_type is gated on CAP_PERFMON
 * (bpf_prog_load() rejects the load when this returns true and the
 * capability check fails).
 */
static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
{
        bool needs_perfmon = false;

        switch (prog_type) {
        case BPF_PROG_TYPE_KPROBE:
        case BPF_PROG_TYPE_TRACEPOINT:
        case BPF_PROG_TYPE_PERF_EVENT:
        case BPF_PROG_TYPE_RAW_TRACEPOINT:
        case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
        case BPF_PROG_TYPE_TRACING:
        case BPF_PROG_TYPE_LSM:
        case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
        case BPF_PROG_TYPE_EXT: /* extends any prog */
                needs_perfmon = true;
                break;
        default:
                break;
        }

        return needs_perfmon;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd

/* BPF_PROG_LOAD command handler.
 *
 * @attr:       kernel copy of the user supplied attributes
 * @uattr:      pointer to the original attributes, forwarded to the verifier
 * @uattr_size: size of the original attributes, forwarded to the verifier
 *
 * Steps, in order: validate @attr and the load flags, resolve the optional
 * BPF token, run the capability checks, resolve the attach target (a
 * destination program or a BTF object), allocate and populate the program,
 * call the LSM hook, run the verifier, select the runtime, allocate the
 * program ID and finally install a new FD.
 *
 * Returns the value of bpf_prog_new_fd() on success or a negative errno.
 */
static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
        enum bpf_prog_type type = attr->prog_type;
        struct bpf_prog *prog, *dst_prog = NULL;
        struct btf *attach_btf = NULL;
        struct bpf_token *token = NULL;
        bool bpf_cap;
        int err;
        char license[128];

        if (CHECK_ATTR(BPF_PROG_LOAD))
                return -EINVAL;

        if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
                                 BPF_F_ANY_ALIGNMENT |
                                 BPF_F_TEST_STATE_FREQ |
                                 BPF_F_SLEEPABLE |
                                 BPF_F_TEST_RND_HI32 |
                                 BPF_F_XDP_HAS_FRAGS |
                                 BPF_F_XDP_DEV_BOUND_ONLY |
                                 BPF_F_TEST_REG_INVARIANTS |
                                 BPF_F_TOKEN_FD))
                return -EINVAL;

        bpf_prog_load_fixup_attach_type(attr);

        if (attr->prog_flags & BPF_F_TOKEN_FD) {
                token = bpf_token_get_from_fd(attr->prog_token_fd);
                if (IS_ERR(token))
                        return PTR_ERR(token);
                /* if current token doesn't grant prog loading permissions,
                 * then we can't use this token, so ignore it and rely on
                 * system-wide capabilities checks
                 */
                if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) ||
                    !bpf_token_allow_prog_type(token, attr->prog_type,
                                               attr->expected_attach_type)) {
                        bpf_token_put(token);
                        token = NULL;
                }
        }

        bpf_cap = bpf_token_capable(token, CAP_BPF);
        /* default error for all permission checks below */
        err = -EPERM;

        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
            (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
            !bpf_cap)
                goto put_token;

        /* Intent here is for unprivileged_bpf_disabled to block BPF program
         * creation for unprivileged users; other actions depend
         * on fd availability and access to bpffs, so are dependent on
         * object creation success. Even with unprivileged BPF disabled,
         * capability checks are still carried out for these
         * and other operations.
         */
        if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
                goto put_token;

        if (attr->insn_cnt == 0 ||
            attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
                err = -E2BIG;
                goto put_token;
        }
        if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
            type != BPF_PROG_TYPE_CGROUP_SKB &&
            !bpf_cap)
                goto put_token;

        if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
                goto put_token;
        if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
                goto put_token;

        /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
         * or btf, we need to check which one it is
         */
        if (attr->attach_prog_fd) {
                dst_prog = bpf_prog_get(attr->attach_prog_fd);
                if (IS_ERR(dst_prog)) {
                        dst_prog = NULL;
                        attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
                        if (IS_ERR(attach_btf)) {
                                err = -EINVAL;
                                goto put_token;
                        }
                        if (!btf_is_kernel(attach_btf)) {
                                /* attaching through specifying bpf_prog's BTF
                                 * objects directly might be supported eventually
                                 */
                                btf_put(attach_btf);
                                err = -ENOTSUPP;
                                goto put_token;
                        }
                }
        } else if (attr->attach_btf_id) {
                /* fall back to vmlinux BTF, if BTF type ID is specified */
                attach_btf = bpf_get_btf_vmlinux();
                if (IS_ERR(attach_btf)) {
                        err = PTR_ERR(attach_btf);
                        goto put_token;
                }
                if (!attach_btf) {
                        err = -EINVAL;
                        goto put_token;
                }
                btf_get(attach_btf);
        }

        if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
                                       attach_btf, attr->attach_btf_id,
                                       dst_prog)) {
                if (dst_prog)
                        bpf_prog_put(dst_prog);
                if (attach_btf)
                        btf_put(attach_btf);
                err = -EINVAL;
                goto put_token;
        }

        /* plain bpf_prog allocation */
        prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
        if (!prog) {
                if (dst_prog)
                        bpf_prog_put(dst_prog);
                if (attach_btf)
                        btf_put(attach_btf);
                /* allocation failure, not a bad argument */
                err = -ENOMEM;
                goto put_token;
        }

        prog->expected_attach_type = attr->expected_attach_type;
        prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE);
        prog->aux->attach_btf = attach_btf;
        prog->aux->attach_btf_id = attr->attach_btf_id;
        prog->aux->dst_prog = dst_prog;
        prog->aux->dev_bound = !!attr->prog_ifindex;
        prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;

        /* move token into prog->aux, reuse taken refcnt; the local pointer
         * is cleared so that the put_token exit path does not drop the
         * reference that now belongs to the program
         */
        prog->aux->token = token;
        token = NULL;

        prog->aux->user = get_current_user();
        prog->len = attr->insn_cnt;

        err = -EFAULT;
        if (copy_from_bpfptr(prog->insns,
                             make_bpfptr(attr->insns, uattr.is_kernel),
                             bpf_prog_insn_size(prog)) != 0)
                goto free_prog;
        /* copy eBPF program license from user space */
        if (strncpy_from_bpfptr(license,
                                make_bpfptr(attr->license, uattr.is_kernel),
                                sizeof(license) - 1) < 0)
                goto free_prog;
        license[sizeof(license) - 1] = 0;

        /* eBPF programs must be GPL compatible to use GPL-ed functions */
        prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;

        prog->orig_prog = NULL;
        prog->jited = 0;

        atomic64_set(&prog->aux->refcnt, 1);

        if (bpf_prog_is_dev_bound(prog->aux)) {
                err = bpf_prog_dev_bound_init(prog, attr);
                if (err)
                        goto free_prog;
        }

        if (type == BPF_PROG_TYPE_EXT && dst_prog &&
            bpf_prog_is_dev_bound(dst_prog->aux)) {
                err = bpf_prog_dev_bound_inherit(prog, dst_prog);
                if (err)
                        goto free_prog;
        }

        /*
         * Bookkeeping for managing the program attachment chain.
         *
         * It might be tempting to set attach_tracing_prog flag at the attachment
         * time, but this will not prevent from loading bunch of tracing prog
         * first, then attach them one to another.
         *
         * The flag attach_tracing_prog is set for the whole program lifecycle, and
         * doesn't have to be cleared in bpf_tracing_link_release, since tracing
         * programs cannot change attachment target.
         */
        if (type == BPF_PROG_TYPE_TRACING && dst_prog &&
            dst_prog->type == BPF_PROG_TYPE_TRACING) {
                prog->aux->attach_tracing_prog = true;
        }

        /* find program type: socket_filter vs tracing_filter */
        err = find_prog_type(type, prog);
        if (err < 0)
                goto free_prog;

        prog->aux->load_time = ktime_get_boottime_ns();
        err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
                               sizeof(attr->prog_name));
        if (err < 0)
                goto free_prog;

        /* The local 'token' was cleared when ownership moved to prog->aux,
         * so hand the LSM hook the token stored in the program instead of
         * an always-NULL pointer.
         */
        err = security_bpf_prog_load(prog, attr, prog->aux->token);
        if (err)
                goto free_prog_sec;

        /* run eBPF verifier */
        err = bpf_check(&prog, attr, uattr, uattr_size);
        if (err < 0)
                goto free_used_maps;

        prog = bpf_prog_select_runtime(prog, &err);
        if (err < 0)
                goto free_used_maps;

        err = bpf_prog_alloc_id(prog);
        if (err)
                goto free_used_maps;

        /* Upon success of bpf_prog_alloc_id(), the BPF prog is
         * effectively publicly exposed. However, retrieving via
         * bpf_prog_get_fd_by_id() will take another reference,
         * therefore it cannot be gone underneath us.
         *
         * Only for the time /after/ successful bpf_prog_new_fd()
         * and before returning to userspace, we might just hold
         * one reference and any parallel close on that fd could
         * rip everything out. Hence, below notifications must
         * happen before bpf_prog_new_fd().
         *
         * Also, any failure handling from this point onwards must
         * be using bpf_prog_put() given the program is exposed.
         */
        bpf_prog_kallsyms_add(prog);
        perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
        bpf_audit_prog(prog, BPF_AUDIT_LOAD);

        err = bpf_prog_new_fd(prog);
        if (err < 0)
                bpf_prog_put(prog);
        return err;

free_used_maps:
        /* In case we have subprogs, we need to wait for a grace
         * period before we can tear down JIT memory since symbols
         * are already exposed under kallsyms.
         */
        __bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
        return err;

free_prog_sec:
        security_bpf_prog_free(prog);
free_prog:
        free_uid(prog->aux->user);
        if (prog->aux->attach_btf)
                btf_put(prog->aux->attach_btf);
        bpf_prog_free(prog);
put_token:
        /* 'token' is NULL here once it has been moved into prog->aux */
        bpf_token_put(token);
        return err;
}

#define BPF_OBJ_LAST_FIELD path_fd

static int bpf_obj_pin(const union bpf_attr *attr)
{
        int path_fd;

        if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD)
                return -EINVAL;

        /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
        if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
                return -EINVAL;

        path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
        return bpf_obj_pin_user(attr->bpf_fd, path_fd,
                                u64_to_user_ptr(attr->pathname));
}

static int bpf_obj_get(const union bpf_attr *attr)
{
        int path_fd;

        if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
            attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD))
                return -EINVAL;

        /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
        if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
                return -EINVAL;

        path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
        return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname),
                                attr->file_flags);
}

/* Initialize the generic part of a bpf_link.
 *
 * Records @type, @ops and @prog in @link, sets the reference count to one
 * and the ID to zero. Warns if @ops provides both ->dealloc and
 * ->dealloc_deferred.
 */
void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
                   const struct bpf_link_ops *ops, struct bpf_prog *prog)
{
        /* a link is expected to use only one of the two dealloc callbacks */
        WARN_ON(ops->dealloc && ops->dealloc_deferred);

        link->ops = ops;
        link->prog = prog;
        link->type = type;
        link->id = 0;
        atomic64_set(&link->refcnt, 1);
}

/* Remove @id from link_idr under link_idr_lock; an @id of zero is a no-op. */
static void bpf_link_free_id(int id)
{
        if (id) {
                spin_lock_bh(&link_idr_lock);
                idr_remove(&link_idr, id);
                spin_unlock_bh(&link_idr_lock);
        }
}

/* Clean up bpf_link and corresponding anon_inode file and FD. After
 * anon_inode is created, bpf_link can't be just kfree()'d due to deferred
 * anon_inode's release() call. This helper marks bpf_link as
 * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
 * is not decremented, it's the responsibility of a calling code that failed
 * to complete bpf_link initialization.
 * This helper eventually calls link's dealloc callback, but does not call
 * link's release callback.
 */
void bpf_link_cleanup(struct bpf_link_primer *primer)
{
        primer->link->prog = NULL;
        bpf_link_free_id(primer->id);
        fput(primer->file);
        put_unused_fd(primer->fd);
}

void bpf_link_inc(struct bpf_link *link)
{
        atomic64_inc(&link->refcnt);
}

/* RCU callback: recover the bpf_link embedding @rcu and free it, together
 * with its containing memory, through the link's ->dealloc_deferred().
 */
static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
{
        struct bpf_link *dead_link;

        dead_link = container_of(rcu, struct bpf_link, rcu);
        dead_link->ops->dealloc_deferred(dead_link);
}

/* Callback queued with call_rcu_tasks_trace(): free the link right away
 * when rcu_trace_implies_rcu_gp() reports true, otherwise queue the actual
 * free behind one more call_rcu() grace period.
 */
static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
{
        if (!rcu_trace_implies_rcu_gp()) {
                call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
                return;
        }

        bpf_link_defer_dealloc_rcu_gp(rcu);
}

/* Tear down @link: release its ID, detach and put the program (if still
 * set), then deallocate the link either directly or after RCU grace
 * period(s).
 *
 * bpf_link_free is guaranteed to be called from process context.
 */
static void bpf_link_free(struct bpf_link *link)
{
        const struct bpf_link_ops *ops = link->ops;
        bool prog_sleepable = false;

        bpf_link_free_id(link->id);

        if (link->prog) {
                /* sample this before the program reference is dropped */
                prog_sleepable = link->prog->sleepable;
                /* detach BPF program, clean up used resources */
                ops->release(link);
                bpf_prog_put(link->prog);
        }

        if (!ops->dealloc_deferred) {
                /* no deferred variant: free immediately, if possible */
                if (ops->dealloc)
                        ops->dealloc(link);
                return;
        }

        /* schedule BPF link deallocation; if underlying BPF program
         * is sleepable, we need to first wait for RCU tasks trace
         * sync, then go through "classic" RCU grace period
         */
        if (prog_sleepable)
                call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
        else
                call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
}

/* Work item handler: free the bpf_link that embeds @work. */
static void bpf_link_put_deferred(struct work_struct *work)
{
        bpf_link_free(container_of(work, struct bpf_link, work));
}

/* Drop one reference on @link.
 *
 * bpf_link_put might be called from atomic context, while freeing the link
 * needs a sleepable context in order to acquire sleeping locks. The final
 * put therefore hands the actual free to a work item instead of doing it
 * inline.
 */
void bpf_link_put(struct bpf_link *link)
{
        if (atomic64_dec_and_test(&link->refcnt)) {
                INIT_WORK(&link->work, bpf_link_put_deferred);
                schedule_work(&link->work);
        }
}
EXPORT_SYMBOL(bpf_link_put);

/* Drop one reference on @link and, if it was the last one, free the link
 * synchronously in the calling context (no work item involved).
 */
static void bpf_link_put_direct(struct bpf_link *link)
{
        if (atomic64_dec_and_test(&link->refcnt))
                bpf_link_free(link);
}

/* ->release() of the link file: drop the reference held through the file's
 * private_data. Always returns 0.
 */
static int bpf_link_release(struct inode *inode, struct file *filp)
{
        bpf_link_put_direct(filp->private_data);

        return 0;
}

#ifdef CONFIG_PROC_FS
/* Only the BPF_LINK_TYPE() entries of <linux/bpf_types.h> are wanted here,
 * so the program and map type macros expand to nothing.
 */
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops)
/* Each link type entry becomes a designated initializer: [id] = "name", */
#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
/* Link type names, indexed by link type; printed by bpf_link_show_fdinfo() */
static const char *bpf_link_type_strs[] = {
        [BPF_LINK_TYPE_UNSPEC] = "<invalid>",
#include <linux/bpf_types.h>
};
/* the helper macros are only meaningful for the table above */
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE

static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
{
        const struct bpf_link *link = filp->private_data;
        const struct bpf_prog *prog = link->prog;
        char prog_tag[sizeof(prog->tag) * 2 + 1] = { };

        seq_printf(m,
                   "link_type:\t%s\n"
                   "link_id:\t%u\n",
                   bpf_link_type_strs[link->type],
                   link->id);
        if (prog) {
                bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
                seq_printf(m,
                           "prog_tag:\t%s\n"
                           "prog_id:\t%u\n",
                           prog_tag,
                           prog->aux->id);
        }
        if (link->ops->show_fdinfo)
                link->ops->show_fdinfo(link, m);
}
#endif

static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts)
{
        struct bpf_link *link = file->private_data;

        return link->ops->poll(file, pts);
}

/* File operations for link files whose link type provides no ->poll. */
static const struct file_operations bpf_link_fops = {
        .read           = bpf_dummy_read,
        .write          = bpf_dummy_write,
        .release        = bpf_link_release,
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = bpf_link_show_fdinfo,
#endif
};

/* Same as bpf_link_fops plus ->poll, for link types that provide ->poll. */
static const struct file_operations bpf_link_fops_poll = {
        .read           = bpf_dummy_read,
        .write          = bpf_dummy_write,
        .poll           = bpf_link_poll,
        .release        = bpf_link_release,
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = bpf_link_show_fdinfo,
#endif
};

/* Allocate an ID for @link from link_idr (cyclic, range [1, INT_MAX)).
 *
 * Returns the value of idr_alloc_cyclic(): the new ID, or a negative errno.
 */
static int bpf_link_alloc_id(struct bpf_link *link)
{
        int new_id;

        /* preload with GFP_KERNEL here; the allocation under the spinlock
         * itself uses GFP_ATOMIC
         */
        idr_preload(GFP_KERNEL);
        spin_lock_bh(&link_idr_lock);
        new_id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
        spin_unlock_bh(&link_idr_lock);
        idr_preload_end();

        return new_id;
}

/* Prepare @link for exposure to user-space: reserve an unused FD, allocate
 * an ID from link_idr and create the anon_inode file. Nothing is visible to
 * user-space yet; all transient state is stored in @primer.
 *
 * Pair this with bpf_link_settle() to install the FD and ID once the link
 * has been attached successfully. If attaching fails, release the link and
 * the pre-allocated resources with bpf_link_cleanup().
 *
 * This is the preferred way to create and initialize a bpf_link, especially
 * when complicated and expensive operations sit between creating the link
 * and attaching it to a BPF hook: with bpf_link_prime()/bpf_link_settle()
 * the caller never has to perform an expensive (and potentially failing)
 * rollback just because a file, FD or ID could not be allocated.
 *
 * Returns 0 on success or a negative errno; on failure nothing stays
 * reserved.
 */
int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
{
        const struct file_operations *fops;
        struct file *file;
        int fd, id, err;

        fd = get_unused_fd_flags(O_CLOEXEC);
        if (fd < 0)
                return fd;

        id = bpf_link_alloc_id(link);
        if (id < 0) {
                err = id;
                goto err_put_fd;
        }

        /* pollable link types get the fops variant that implements ->poll */
        fops = link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops;
        file = anon_inode_getfile("bpf_link", fops, link, O_CLOEXEC);
        if (IS_ERR(file)) {
                err = PTR_ERR(file);
                goto err_free_id;
        }

        primer->link = link;
        primer->file = file;
        primer->fd = fd;
        primer->id = id;
        return 0;

err_free_id:
        bpf_link_free_id(id);
err_put_fd:
        put_unused_fd(fd);
        return err;
}

int bpf_link_settle(struct bpf_link_primer *primer)
{
        /* make bpf_link fetchable by ID */
        spin_lock_bh(&link_idr_lock);
        primer->link->id = primer->id;
        spin_unlock_bh(&link_idr_lock);
        /* make bpf_link fetchable by FD */
        fd_install(primer->fd, primer->file);
        /* pass through installed FD */
        return primer->fd;
}

/* Create an anon_inode FD (O_CLOEXEC) for @link in a single step.
 *
 * Returns the result of anon_inode_getfd().
 */
int bpf_link_new_fd(struct bpf_link *link)
{
        const struct file_operations *fops;

        /* pollable link types get the fops variant that implements ->poll */
        fops = link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops;

        return anon_inode_getfd("bpf-link", fops, link, O_CLOEXEC);
}

/* Look up the bpf_link behind @ufd and take a reference on it.
 *
 * Returns the link with its refcount incremented, ERR_PTR(-EBADF) when
 * @ufd does not resolve to a file, or ERR_PTR(-EINVAL) when the file is
 * not a bpf_link file.
 */
struct bpf_link *bpf_link_get_from_fd(u32 ufd)
{
        struct bpf_link *link;
        struct fd f = fdget(ufd);

        if (!f.file)
                return ERR_PTR(-EBADF);

        /* a link file is recognized by either of the two link fops */
        if (f.file->f_op == &bpf_link_fops ||
            f.file->f_op == &bpf_link_fops_poll) {
                link = f.file->private_data;
                bpf_link_inc(link);
        } else {
                link = ERR_PTR(-EINVAL);
        }

        fdput(f);
        return link;
}
EXPORT_SYMBOL(bpf_link_get_from_fd);

/* ->release() of a tracing link: unlink the program from its trampoline,
 * then drop the trampoline and (if any) the target program reference.
 */
static void bpf_tracing_link_release(struct bpf_link *link)
{
        struct bpf_tracing_link *tlink;
        int unlink_err;

        tlink = container_of(link, struct bpf_tracing_link, link.link);

        unlink_err = bpf_trampoline_unlink_prog(&tlink->link,
                                                tlink->trampoline);
        WARN_ON_ONCE(unlink_err);

        bpf_trampoline_put(tlink->trampoline);

        /* tgt_prog is NULL if target is a kernel function */
        if (tlink->tgt_prog)
                bpf_prog_put(tlink->tgt_prog);
}

/* ->dealloc() of a tracing link: free the bpf_tracing_link embedding @link. */
static void bpf_tracing_link_dealloc(struct bpf_link *link)
{
        kfree(container_of(link, struct bpf_tracing_link, link.link));
}

/* ->show_fdinfo() of a tracing link: print the attach type and the target
 * object/BTF IDs unpacked from the trampoline key.
 */
static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
                                         struct seq_file *seq)
{
        struct bpf_tracing_link *tlink;
        u32 obj_id, btf_id;

        tlink = container_of(link, struct bpf_tracing_link, link.link);
        bpf_trampoline_unpack_key(tlink->trampoline->key, &obj_id, &btf_id);

        seq_printf(seq,
                   "attach_type:\t%d\n"
                   "target_obj_id:\t%u\n"
                   "target_btf_id:\t%u\n",
                   tlink->attach_type, obj_id, btf_id);
}

/* ->fill_link_info() of a tracing link: report the attach type and the
 * target object/BTF IDs unpacked from the trampoline key in @info->tracing.
 * Always returns 0.
 */
static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
                                           struct bpf_link_info *info)
{
        struct bpf_tracing_link *tlink;

        tlink = container_of(link, struct bpf_tracing_link, link.link);

        info->tracing.attach_type = tlink->attach_type;
        bpf_trampoline_unpack_key(tlink->trampoline->key,
                                  &info->tracing.target_obj_id,
                                  &info->tracing.target_btf_id);
        return 0;
}

/* Link operations for tracing links (BPF_LINK_TYPE_TRACING). */
static const struct bpf_link_ops bpf_tracing_link_lops = {
        .fill_link_info = bpf_tracing_link_fill_link_info,
        .show_fdinfo = bpf_tracing_link_show_fdinfo,
        .dealloc = bpf_tracing_link_dealloc,
        .release = bpf_tracing_link_release,
};

/* Attach a TRACING, EXT or LSM program to its target through a BPF
 * trampoline and expose the attachment as a tracing link.
 *
 * @prog:        program to attach
 * @tgt_prog_fd: FD of the target program, or 0 if none was given
 * @btf_id:      BTF ID of the target; must be given iff @tgt_prog_fd is
 * @bpf_cookie:  value stored in the link's cookie field
 *
 * Returns the link FD from bpf_link_settle() on success or a negative
 * errno. No reference on @prog is dropped in this function; the
 * out_put_prog label only puts the target program obtained from
 * @tgt_prog_fd.
 */
static int bpf_tracing_prog_attach(struct bpf_prog *prog,
                                   int tgt_prog_fd,
                                   u32 btf_id,
                                   u64 bpf_cookie)
{
        struct bpf_link_primer link_primer;
        struct bpf_prog *tgt_prog = NULL;
        struct bpf_trampoline *tr = NULL;
        struct bpf_tracing_link *link;
        u64 key = 0;
        int err;

        /* the expected attach type must match what the prog type allows */
        switch (prog->type) {
        case BPF_PROG_TYPE_TRACING:
                if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
                    prog->expected_attach_type != BPF_TRACE_FEXIT &&
                    prog->expected_attach_type != BPF_MODIFY_RETURN) {
                        err = -EINVAL;
                        goto out_put_prog;
                }
                break;
        case BPF_PROG_TYPE_EXT:
                if (prog->expected_attach_type != 0) {
                        err = -EINVAL;
                        goto out_put_prog;
                }
                break;
        case BPF_PROG_TYPE_LSM:
                if (prog->expected_attach_type != BPF_LSM_MAC) {
                        err = -EINVAL;
                        goto out_put_prog;
                }
                break;
        default:
                err = -EINVAL;
                goto out_put_prog;
        }

        /* tgt_prog_fd and btf_id must be given together or not at all */
        if (!!tgt_prog_fd != !!btf_id) {
                err = -EINVAL;
                goto out_put_prog;
        }

        if (tgt_prog_fd) {
                /*
                 * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this
                 * part would be changed to implement the same for
                 * BPF_PROG_TYPE_TRACING, do not forget to update the way how
                 * attach_tracing_prog flag is set.
                 */
                if (prog->type != BPF_PROG_TYPE_EXT) {
                        err = -EINVAL;
                        goto out_put_prog;
                }

                tgt_prog = bpf_prog_get(tgt_prog_fd);
                if (IS_ERR(tgt_prog)) {
                        err = PTR_ERR(tgt_prog);
                        /* keep out_put_prog from putting an error pointer */
                        tgt_prog = NULL;
                        goto out_put_prog;
                }

                key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
        }

        link = kzalloc(sizeof(*link), GFP_USER);
        if (!link) {
                err = -ENOMEM;
                goto out_put_prog;
        }
        bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
                      &bpf_tracing_link_lops, prog);
        link->attach_type = prog->expected_attach_type;
        link->link.cookie = bpf_cookie;

        /* dst_mutex is held while prog->aux->dst_* is inspected and updated */
        mutex_lock(&prog->aux->dst_mutex);

        /* There are a few possible cases here:
         *
         * - if prog->aux->dst_trampoline is set, the program was just loaded
         *   and not yet attached to anything, so we can use the values stored
         *   in prog->aux
         *
         * - if prog->aux->dst_trampoline is NULL, the program has already been
         *   attached to a target and its initial target was cleared (below)
         *
         * - if tgt_prog != NULL, the caller specified tgt_prog_fd +
         *   target_btf_id using the link_create API.
         *
         * - if tgt_prog == NULL when this function was called using the old
         *   raw_tracepoint_open API, and we need a target from prog->aux
         *
         * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program
         *   was detached and is going for re-attachment.
         *
         * - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf
         *   are NULL, then program was already attached and user did not provide
         *   tgt_prog_fd so we have no way to find out or create trampoline
         */
        if (!prog->aux->dst_trampoline && !tgt_prog) {
                /*
                 * Allow re-attach for TRACING and LSM programs. If it's
                 * currently linked, bpf_trampoline_link_prog will fail.
                 * EXT programs need to specify tgt_prog_fd, so they
                 * re-attach in separate code path.
                 */
                if (prog->type != BPF_PROG_TYPE_TRACING &&
                    prog->type != BPF_PROG_TYPE_LSM) {
                        err = -EINVAL;
                        goto out_unlock;
                }
                /* We can allow re-attach only if we have valid attach_btf. */
                if (!prog->aux->attach_btf) {
                        err = -EINVAL;
                        goto out_unlock;
                }
                btf_id = prog->aux->attach_btf_id;
                key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id);
        }

        if (!prog->aux->dst_trampoline ||
            (key && key != prog->aux->dst_trampoline->key)) {
                /* If there is no saved target, or the specified target is
                 * different from the destination specified at load time, we
                 * need a new trampoline and a check for compatibility
                 */
                struct bpf_attach_target_info tgt_info = {};

                err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id,
                                              &tgt_info);
                if (err)
                        goto out_unlock;

                /* switch the module reference over to the new target's module */
                if (tgt_info.tgt_mod) {
                        module_put(prog->aux->mod);
                        prog->aux->mod = tgt_info.tgt_mod;
                }

                tr = bpf_trampoline_get(key, &tgt_info);
                if (!tr) {
                        err = -ENOMEM;
                        goto out_unlock;
                }
        } else {
                /* The caller didn't specify a target, or the target was the
                 * same as the destination supplied during program load. This
                 * means we can reuse the trampoline and reference from program
                 * load time, and there is no need to allocate a new one. This
                 * can only happen once for any program, as the saved values in
                 * prog->aux are cleared below.
                 */
                tr = prog->aux->dst_trampoline;
                tgt_prog = prog->aux->dst_prog;
        }

        err = bpf_link_prime(&link->link.link, &link_primer);
        if (err)
                goto out_unlock;

        err = bpf_trampoline_link_prog(&link->link, tr);
        if (err) {
                bpf_link_cleanup(&link_primer);
                /* after bpf_link_cleanup() the link must not be kfree()'d
                 * here; clearing the pointer turns the kfree() at
                 * out_unlock into a no-op
                 */
                link = NULL;
                goto out_unlock;
        }

        link->tgt_prog = tgt_prog;
        link->trampoline = tr;

        /* Always clear the trampoline and target prog from prog->aux to make
         * sure the original attach destination is not kept alive after a
         * program is (re-)attached to another target.
         */
        if (prog->aux->dst_prog &&
            (tgt_prog_fd || tr != prog->aux->dst_trampoline))
                /* got extra prog ref from syscall, or attaching to different prog */
                bpf_prog_put(prog->aux->dst_prog);
        if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline)
                /* we allocated a new trampoline, so free the old one */
                bpf_trampoline_put(prog->aux->dst_trampoline);

        prog->aux->dst_prog = NULL;
        prog->aux->dst_trampoline = NULL;
        mutex_unlock(&prog->aux->dst_mutex);

        return bpf_link_settle(&link_primer);
out_unlock:
        /* only put a trampoline obtained above, not the one saved in prog->aux */
        if (tr && tr != prog->aux->dst_trampoline)
                bpf_trampoline_put(tr);
        mutex_unlock(&prog->aux->dst_mutex);
        kfree(link);
out_put_prog:
        /* only a target program taken from tgt_prog_fd holds a reference
         * that was acquired in this function
         */
        if (tgt_prog_fd && tgt_prog)
                bpf_prog_put(tgt_prog);
        return err;
}

/* Link ->release callback for raw tracepoint links: unregister the probe
 * from the tracepoint stored in the link, then put that tracepoint.
 */
static void bpf_raw_tp_link_release(struct bpf_link *link)
{
        struct bpf_raw_tp_link *tp_link;
        struct bpf_raw_event_map *btp;

        tp_link = container_of(link, struct bpf_raw_tp_link, link);
        btp = tp_link->btp;

        bpf_probe_unregister(btp, tp_link);
        bpf_put_raw_tracepoint(btp);
}

/* Link deallocation callback: free the bpf_raw_tp_link that embeds @link. */
static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
{
        kfree(container_of(link, struct bpf_raw_tp_link, link));
}

static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
                                        struct seq_file *seq)
{
        struct bpf_raw_tp_link *raw_tp_link =
                container_of(link, struct bpf_raw_tp_link, link);

        seq_printf(seq,
                   "tp_name:\t%s\n",
                   raw_tp_link->btp->tp->name);
}

/* Copy the string @buf (@len characters, terminator not counted) into the
 * user buffer @ubuf of @ulen bytes.
 *
 * If the whole string plus its NUL fits, @len + 1 bytes are copied and 0 is
 * returned. Otherwise the first @ulen - 1 bytes are copied, a NUL is stored
 * in the last byte of the buffer and -ENOSPC is returned. -EFAULT is
 * returned when a user-space write fails.
 *
 * @ulen must be non-zero: the truncation path computes @ulen - 1 in u32.
 */
static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen,
                            u32 len)
{
        if (ulen < len + 1) {
                char nul = '\0';

                /* Truncate, but always leave the user with a terminated string. */
                if (copy_to_user(ubuf, buf, ulen - 1) ||
                    put_user(nul, ubuf + ulen - 1))
                        return -EFAULT;
                return -ENOSPC;
        }

        if (copy_to_user(ubuf, buf, len + 1))
                return -EFAULT;

        return 0;
}

static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
                                          struct bpf_link_info *info)
{
        struct bpf_raw_tp_link *raw_tp_link =
                container_of(link, struct bpf_raw_tp_link, link);
        char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
        const char *tp_name = raw_tp_link->btp->tp->name;
        u32 ulen = info->raw_tracepoint.tp_name_len;
        size_t tp_len = strlen(tp_name);

        if (!ulen ^ !ubuf)
                return -EINVAL;

        info->raw_tracepoint.tp_name_len = tp_len + 1;

        if (!ubuf)
                return 0;

        return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len);
}

/* Callback table for raw tracepoint links; passed to bpf_link_init() by
 * bpf_raw_tp_link_attach(). Note that the free callback is installed as
 * ->dealloc_deferred here, whereas bpf_perf_link_lops uses ->dealloc.
 */
static const struct bpf_link_ops bpf_raw_tp_link_lops = {
        .release = bpf_raw_tp_link_release,
        .dealloc_deferred = bpf_raw_tp_link_dealloc,
        .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
        .fill_link_info = bpf_raw_tp_link_fill_link_info,
};

#ifdef CONFIG_PERF_EVENTS
/* BPF link attaching a program to a perf event. */
struct bpf_perf_link {
        /* Embedded generic link; callbacks recover the container from it. */
        struct bpf_link link;
        /* File of the perf event; fput() in bpf_perf_link_release(). */
        struct file *perf_file;
};

static void bpf_perf_link_release(struct bpf_link *link)
{
        struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
        struct perf_event *event = perf_link->perf_file->private_data;

        perf_event_free_bpf_prog(event);
        fput(perf_link->perf_file);
}

/* Link ->dealloc callback: free the bpf_perf_link that embeds @link. */
static void bpf_perf_link_dealloc(struct bpf_link *link)
{
        kfree(container_of(link, struct bpf_perf_link, link));
}

/* Shared helper for the perf link info fillers.
 *
 * Queries bpf_get_perf_event_info() for @event, forwarding @fd_type,
 * @probe_offset, @probe_addr and @missed as given by the caller, and copies
 * the reported name string to the user buffer @uname of @ulen bytes.
 *
 * @uname and @ulen must be both set or both zero (-EINVAL otherwise). With
 * no user buffer nothing is copied. If no name was reported, a single NUL
 * is written to @uname instead.
 *
 * Returns 0 on success, or the error from the info query, the copy helper
 * (-EFAULT / -ENOSPC) or put_user() (-EFAULT).
 */
static int bpf_perf_link_fill_common(const struct perf_event *event,
                                     char __user *uname, u32 ulen,
                                     u64 *probe_offset, u64 *probe_addr,
                                     u32 *fd_type, unsigned long *missed)
{
        const char *name;
        u32 prog_id;
        int ret;

        if (!ulen != !uname)
                return -EINVAL;

        ret = bpf_get_perf_event_info(event, &prog_id, fd_type, &name,
                                      probe_offset, probe_addr, missed);
        /* Either the query failed, or there is no buffer to fill (ret == 0). */
        if (ret || !uname)
                return ret;

        if (!name) {
                char nul = '\0';

                return put_user(nul, uname) ? -EFAULT : 0;
        }

        return bpf_copy_to_user(uname, name, ulen, strlen(name));
}

#ifdef CONFIG_KPROBE_EVENTS
/* Fill the kprobe part of @info for a perf link on a kprobe/kretprobe event.
 *
 * The function name is copied to the user buffer described by
 * info->perf_event.kprobe.func_name / .name_len. On success the link type,
 * offset, missed count, address and cookie are stored in @info; the address
 * is reported as 0 when kallsyms_show_value() denies it to the caller.
 */
static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
                                     struct bpf_link_info *info)
{
        unsigned long missed;
        u64 addr, offset;
        u32 type;
        int err;

        err = bpf_perf_link_fill_common(event,
                                        u64_to_user_ptr(info->perf_event.kprobe.func_name),
                                        info->perf_event.kprobe.name_len,
                                        &offset, &addr, &type, &missed);
        if (err)
                return err;

        info->perf_event.type = type == BPF_FD_TYPE_KRETPROBE ?
                                BPF_PERF_EVENT_KRETPROBE : BPF_PERF_EVENT_KPROBE;
        info->perf_event.kprobe.offset = offset;
        info->perf_event.kprobe.missed = missed;
        info->perf_event.kprobe.addr =
                kallsyms_show_value(current_cred()) ? addr : 0;
        info->perf_event.kprobe.cookie = event->bpf_cookie;

        return 0;
}
#endif

#ifdef CONFIG_UPROBE_EVENTS
/* Fill the uprobe part of @info for a perf link on a uprobe/uretprobe event.
 *
 * The name string is copied to the user buffer described by
 * info->perf_event.uprobe.file_name / .name_len. On success the link type,
 * offset and cookie are stored in @info; the probe address returned by the
 * common helper is not reported.
 */
static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
                                     struct bpf_link_info *info)
{
        u64 addr, offset;
        u32 type;
        int err;

        err = bpf_perf_link_fill_common(event,
                                        u64_to_user_ptr(info->perf_event.uprobe.file_name),
                                        info->perf_event.uprobe.name_len,
                                        &offset, &addr, &type, NULL);
        if (err)
                return err;

        info->perf_event.type = type == BPF_FD_TYPE_URETPROBE ?
                                BPF_PERF_EVENT_URETPROBE : BPF_PERF_EVENT_UPROBE;
        info->perf_event.uprobe.offset = offset;
        info->perf_event.uprobe.cookie = event->bpf_cookie;

        return 0;
}
#endif

/* Fill @info for a perf link whose program type is BPF_PROG_TYPE_KPROBE.
 * The flags of the event's tp_event select the kprobe or the uprobe filler.
 * Returns -EOPNOTSUPP when neither flag is set, or when the matching
 * CONFIG_{K,U}PROBE_EVENTS support is not compiled in.
 */
static int bpf_perf_link_fill_probe(const struct perf_event *event,
                                    struct bpf_link_info *info)
{
#ifdef CONFIG_KPROBE_EVENTS
        if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE)
                return bpf_perf_link_fill_kprobe(event, info);
#endif
#ifdef CONFIG_UPROBE_EVENTS
        if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE)
                return bpf_perf_link_fill_uprobe(event, info);
#endif
        return -EOPNOTSUPP;
}

/* Fill @info for a perf link on a tracepoint event.
 *
 * The link type and cookie are stored unconditionally, before the name is
 * copied to the user buffer described by info->perf_event.tracepoint.tp_name
 * / .name_len, so they are set even when the copy fails.
 */
static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
                                         struct bpf_link_info *info)
{
        char __user *uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
        u32 ulen = info->perf_event.tracepoint.name_len;

        info->perf_event.tracepoint.cookie = event->bpf_cookie;
        info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;

        /* Only the name is wanted: no offset, address, fd type or miss count. */
        return bpf_perf_link_fill_common(event, uname, ulen,
                                         NULL, NULL, NULL, NULL);
}

/* Fill @info for a perf link on a plain perf event: report the link type
 * plus the event's attr.type, attr.config and BPF cookie. Always returns 0.
 */
static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
                                         struct bpf_link_info *info)
{
        info->perf_event.type = BPF_PERF_EVENT_EVENT;

        info->perf_event.event.type = event->attr.type;
        info->perf_event.event.config = event->attr.config;
        info->perf_event.event.cookie = event->bpf_cookie;

        return 0;
}

static int bpf_perf_link_fill_link_info(const struct bpf_link *link,
                                        struct bpf_link_info *info)
{
        struct bpf_perf_link *perf_link;
        const struct perf_event *event;

        perf_link = container_of(link, struct bpf_perf_link, link);
        event = perf_get_event(perf_link->perf_file);
        if (IS_ERR(event))
                return PTR_ERR(event);

        switch (event->prog->type) {
        case BPF_PROG_TYPE_PERF_EVENT:
                return bpf_perf_link_fill_perf_event(event, info);
        case BPF_PROG_TYPE_TRACEPOINT:
                return bpf_perf_link_fill_tracepoint(event, info);
        case BPF_PROG_TYPE_KPROBE:
                return bpf_perf_link_fill_probe(event, info);
        default:
                return -EOPNOTSUPP;
        }
}

/* Callback table for perf event links; passed to bpf_link_init() by
 * bpf_perf_link_attach().
 */
static const struct bpf_link_ops bpf_perf_link_lops = {
        .release = bpf_perf_link_release,
        .dealloc = bpf_perf_link_dealloc,
        .fill_link_info = bpf_perf_link_fill_link_info,
};

/* Create a perf event link attaching @prog to the perf event referenced by
 * attr->link_create.target_fd.
 *
 * No link_create flags are accepted (-EINVAL). Returns the value of
 * bpf_link_settle() on success, or a negative errno. On every failure path
 * the perf file reference taken here is dropped again.
 */
static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
        struct bpf_link_primer link_primer;
        struct bpf_perf_link *link;
        struct file *perf_file;
        int err;

        if (attr->link_create.flags)
                return -EINVAL;

        perf_file = perf_event_get(attr->link_create.target_fd);
        if (IS_ERR(perf_file))
                return PTR_ERR(perf_file);

        link = kzalloc(sizeof(*link), GFP_USER);
        if (!link) {
                err = -ENOMEM;
                goto out_put_file;
        }
        link->perf_file = perf_file;
        bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);

        err = bpf_link_prime(&link->link, &link_primer);
        if (err)
                goto out_free_link;

        err = perf_event_set_bpf_prog(perf_file->private_data, prog,
                                      attr->link_create.perf_event.bpf_cookie);
        if (err) {
                /* Once primed, the link is torn down through the primer
                 * rather than freed directly; only the file is left to put.
                 */
                bpf_link_cleanup(&link_primer);
                goto out_put_file;
        }
        /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
        bpf_prog_inc(prog);

        return bpf_link_settle(&link_primer);

out_free_link:
        kfree(link);
out_put_file:
        fput(perf_file);
        return err;
}
#else
/* Stub for kernels built without CONFIG_PERF_EVENTS: perf event links are
 * not available, so every attach request fails with -EOPNOTSUPP.
 */
static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
        return -EOPNOTSUPP;
}
#endif /* CONFIG_PERF_EVENTS */

/* Attach @prog to a raw tracepoint and create a link for it.
 *
 * For RAW_TRACEPOINT(_WRITABLE) programs the tracepoint name is read from
 * @user_tp_name (at most sizeof(buf) - 1 bytes). For TRACING/EXT/LSM
 * programs @user_tp_name must be NULL; a TRACING program expecting
 * BPF_TRACE_RAW_TP uses prog->aux->attach_func_name as the tracepoint name,
 * every other such program is handed to bpf_tracing_prog_attach() instead.
 *
 * @cookie is stored in the link. Returns the value of bpf_link_settle() on
 * success, or a negative errno; on failure the tracepoint obtained from
 * bpf_get_raw_tracepoint() is put again.
 */
static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
                                  const char __user *user_tp_name, u64 cookie)
{
        struct bpf_link_primer link_primer;
        struct bpf_raw_event_map *btp;
        struct bpf_raw_tp_link *link;
        const char *tp_name;
        char buf[128];
        int err;

        switch (prog->type) {
        case BPF_PROG_TYPE_RAW_TRACEPOINT:
        case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
                if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
                        return -EFAULT;
                buf[sizeof(buf) - 1] = 0;
                tp_name = buf;
                break;
        case BPF_PROG_TYPE_TRACING:
        case BPF_PROG_TYPE_EXT:
        case BPF_PROG_TYPE_LSM:
                /* The attach point for this category of programs should be
                 * specified via btf_id during program load.
                 */
                if (user_tp_name)
                        return -EINVAL;
                if (prog->type != BPF_PROG_TYPE_TRACING ||
                    prog->expected_attach_type != BPF_TRACE_RAW_TP)
                        return bpf_tracing_prog_attach(prog, 0, 0, 0);
                tp_name = prog->aux->attach_func_name;
                break;
        default:
                return -EINVAL;
        }

        btp = bpf_get_raw_tracepoint(tp_name);
        if (!btp)
                return -ENOENT;

        link = kzalloc(sizeof(*link), GFP_USER);
        if (!link) {
                err = -ENOMEM;
                goto out_put_btp;
        }
        bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
                      &bpf_raw_tp_link_lops, prog);
        link->cookie = cookie;
        link->btp = btp;

        err = bpf_link_prime(&link->link, &link_primer);
        if (err)
                goto out_free_link;

        err = bpf_probe_register(link->btp, link);
        if (err) {
                /* Once primed, the link is torn down through the primer
                 * rather than freed directly.
                 */
                bpf_link_cleanup(&link_primer);
                goto out_put_btp;
        }

        return bpf_link_settle(&link_primer);

out_free_link:
        kfree(link);
out_put_btp:
        bpf_put_raw_tracepoint(btp);
        return err;
}

#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie

/* BPF_RAW_TRACEPOINT_OPEN handler.
 *
 * Takes a reference on the program named by attr->raw_tracepoint.prog_fd and
 * attaches it via bpf_raw_tp_link_attach() using the user-supplied name
 * pointer and cookie. Returns the attach result (negative errno on failure);
 * the program reference is dropped here only when attaching failed.
 */
static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
{
        struct bpf_prog *prog;
        int link_fd;

        if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
                return -EINVAL;

        prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        link_fd = bpf_raw_tp_link_attach(prog,
                                         u64_to_user_ptr(attr->raw_tracepoint.name),
                                         attr->raw_tracepoint.cookie);
        if (link_fd < 0)
                bpf_prog_put(prog);

        return link_fd;
}

static enum bpf_prog_type
attach_type_to_prog_type(enum bpf_attach_type attach_type)
{
        switch (attach_type) {
        case BPF_CGROUP_INET_INGRESS:
        case BPF_CGROUP_INET_EGRESS:
                return BPF_PROG_TYPE_CGROUP_SKB;
        case BPF_CGROUP_INET_SOCK_CREATE:
        case BPF_CGROUP_INET_SOCK_RELEASE:
        case BPF_CGROUP_INET4_POST_BIND:
        case BPF_CGROUP_INET6_POST_BIND:
                return BPF_PROG_TYPE_CGROUP_SOCK;
        case BPF_CGROUP_INET4_BIND:
        case BPF_CGROUP_INET6_BIND:
        case BPF_CGROUP_INET4_CONNECT:
        case BPF_CGROUP_INET6_CONNECT:
        case BPF_CGROUP_UNIX_CONNECT:
        case BPF_CGROUP_INET4_GETPEERNAME:
        case BPF_CGROUP_INET6_GETPEERNAME:
        case BPF_CGROUP_UNIX_GETPEERNAME:
        case BPF_CGROUP_INET4_GETSOCKNAME:
        case BPF_CGROUP_INET6_GETSOCKNAME:
        case BPF_CGROUP_UNIX_GETSOCKNAME:
        case BPF_CGROUP_UDP4_SENDMSG:
        case BPF_CGROUP_UDP6_SENDMSG:
        case BPF_CGROUP_UNIX_SENDMSG:
        case BPF_CGROUP_UDP4_RECVMSG:
        case BPF_CGROUP_UDP6_RECVMSG:
        case BPF_CGROUP_UNIX_RECVMSG:
                return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
        case BPF_CGROUP_SOCK_OPS:
                return BPF_PROG_TYPE_SOCK_OPS;
        case BPF_CGROUP_DEVICE:
                return BPF_PROG_TYPE_CGROUP_DEVICE;
        case BPF_SK_MSG_VERDICT:
                return BPF_PROG_TYPE_SK_MSG;
        case BPF_SK_SKB_STREAM_PARSER:
        case BPF_SK_SKB_STREAM_VERDICT:
        case BPF_SK_SKB_VERDICT:
                return BPF_PROG_TYPE_SK_SKB;
        case BPF_LIRC_MODE2:
                return BPF_PROG_TYPE_LIRC_MODE2;
        case BPF_FLOW_DISSECTOR:
                return BPF_PROG_TYPE_FLOW_DISSECTOR;
        case BPF_CGROUP_SYSCTL:
                return BPF_PROG_TYPE_CGROUP_SYSCTL;
        case BPF_CGROUP_GETSOCKOPT:
        case BPF_CGROUP_SETSOCKOPT:
                return BPF_PROG_TYPE_CGROUP_SOCKOPT;
        case BPF_TRACE_ITER:
        case BPF_TRACE_RAW_TP:
        case BPF_TRACE_FENTRY:
        case BPF_TRACE_FEXIT:
        case BPF_MODIFY_RETURN:
                return BPF_PROG_TYPE_TRACING;
        case BPF_LSM_MAC:
                return BPF_PROG_TYPE_LSM;
        case BPF_SK_LOOKUP:
                return BPF_PROG_TYPE_SK_LOOKUP;
        case BPF_XDP:
                return BPF_PROG_TYPE_XDP;
        case BPF_LSM_CGROUP:
                return BPF_PROG_TYPE_LSM;
        case BPF_TCX_INGRESS:
        case BPF_TCX_EGRESS:
        case BPF_NETKIT_PRIMARY:
        case BPF_NETKIT_PEER:
                return BPF_PROG_TYPE_SCHED_CLS;
        default:
                return BPF_PROG_TYPE_UNSPEC;
        }
}

static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
                                             enum bpf_attach_type attach_type)
{
        enum bpf_prog_type ptype;

        switch (prog->type) {
        case BPF_PROG_TYPE_CGROUP_SOCK:
        case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
        case BPF_PROG_TYPE_CGROUP_SOCKOPT:
        case BPF_PROG_TYPE_SK_LOOKUP:
                return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
        case BPF_PROG_TYPE_CGROUP_SKB:
                if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN))
                        /* cg-skb progs can be loaded by unpriv user.
                         * check permissions at attach time.
                         */
                        return -EPERM;

                ptype = attach_type_to_prog_type(attach_type);
                if (prog->type != ptype)
                        return -EINVAL;

                return prog->enforce_expected_attach_type &&
                        prog->expected_attach_type != attach_type ?
                        -EINVAL : 0;
        case BPF_PROG_TYPE_EXT:
                return 0;
        case BPF_PROG_TYPE_NETFILTER:
                if (attach_type != BPF_NETFILTER)
                        return -EINVAL;
                return 0;
        case BPF_PROG_TYPE_PERF_EVENT:
        case BPF_PROG_TYPE_TRACEPOINT:
                if (attach_type != BPF_PERF_EVENT)
                        return -EINVAL;
                return 0;
        case BPF_PROG_TYPE_KPROBE:
                if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
                    attach_type != BPF_TRACE_KPROBE_MULTI)
                        return -EINVAL;
                if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION &&
                    attach_type != BPF_TRACE_KPROBE_SESSION)
                        return -EINVAL;
                if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
                    attach_type != BPF_TRACE_UPROBE_MULTI)
                        return -EINVAL;
                if (attach_type != BPF_PERF_EVENT &&
                    attach_type != BPF_TRACE_KPROBE_MULTI &&
                    attach_type != BPF_TRACE_KPROBE_SESSION &&
                    attach_type != BPF_TRACE_UPROBE_MULTI)
                        return -EINVAL;
                return 0;
        case BPF_PROG_TYPE_SCHED_CLS:
                if (attach_type != BPF_TCX_INGRESS &&
                    attach_type != BPF_TCX_EGRESS &&
                    attach_type != BPF_NETKIT_PRIMARY &&
                    attach_type != BPF_NETKIT_PEER)
                        return -EINVAL;
                return 0;
        default:
                ptype = attach_type_to_prog_type(attach_type);
                if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type)
                        return -EINVAL;
                return 0;
        }
}

/* Last bpf_attr field belonging to BPF_PROG_ATTACH (presumably what
 * CHECK_ATTR(BPF_PROG_ATTACH) keys on — confirm against CHECK_ATTR()).
 */
#define BPF_PROG_ATTACH_LAST_FIELD expected_revision

/* attach_flags accepted by bpf_prog_attach() for program types for which
 * bpf_mprog_supported() is false.
 */
#define BPF_F_ATTACH_MASK_BASE        \
        (BPF_F_ALLOW_OVERRIDE |        \
         BPF_F_ALLOW_MULTI |        \
         BPF_F_REPLACE)

/* attach_flags accepted by bpf_prog_attach() and bpf_prog_detach() for
 * program types for which bpf_mprog_supported() is true.
 */
#define BPF_F_ATTACH_MASK_MPROG        \
        (BPF_F_REPLACE |        \
         BPF_F_BEFORE |                \
         BPF_F_AFTER |                \
         BPF_F_ID |                \
         BPF_F_LINK)

static int bpf_prog_attach(const union bpf_attr *attr)
{
        enum bpf_prog_type ptype;
        struct bpf_prog *prog;
        int ret;

        if (CHECK_ATTR(BPF_PROG_ATTACH))
                return -EINVAL;

        ptype = attach_type_to_prog_type(attr->attach_type);
        if (ptype == BPF_PROG_TYPE_UNSPEC)
                return -EINVAL;
        if (bpf_mprog_supported(ptype)) {
                if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
                        return -EINVAL;
        } else {
                if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
                        return -EINVAL;
                if (attr->relative_fd ||
                    attr->expected_revision)
                        return -EINVAL;
        }

        prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) {
                bpf_prog_put(prog);
                return -EINVAL;
        }

        switch (ptype) {
        case BPF_PROG_TYPE_SK_SKB:
        case BPF_PROG_TYPE_SK_MSG:
                ret = sock_map_get_from_fd(attr, prog);
                break;
        case BPF_PROG_TYPE_LIRC_MODE2:
                ret = lirc_prog_attach(attr, prog);
                break;
        case BPF_PROG_TYPE_FLOW_DISSECTOR:
                ret = netns_bpf_prog_attach(attr, prog);
                break;
        case BPF_PROG_TYPE_CGROUP_DEVICE:
        case BPF_PROG_TYPE_CGROUP_SKB:
        case BPF_PROG_TYPE_CGROUP_SOCK:
        case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
        case BPF_PROG_TYPE_CGROUP_SOCKOPT:
        case BPF_PROG_TYPE_CGROUP_SYSCTL:
        case BPF_PROG_TYPE_SOCK_OPS:
        case BPF_PROG_TYPE_LSM:
                if (ptype == BPF_PROG_TYPE_LSM &&
                    prog->expected_attach_type != BPF_LSM_CGROUP)
                        ret = -EINVAL;
                else
                        ret = cgroup_bpf_prog_attach(attr, ptype, prog);
                break;
        case BPF_PROG_TYPE_SCHED_CLS:
                if (attr->attach_type == BPF_TCX_INGRESS ||
                    attr->attach_type == BPF_TCX_EGRESS)
                        ret = tcx_prog_attach(attr, prog);
                else
                        ret = netkit_prog_attach(attr, prog);
                break;
        default:
                ret = -EINVAL;
        }

        if (ret)
                bpf_prog_put(prog);
        return ret;
}

#define BPF_PROG_DETACH_LAST_FIELD expected_revision

static int bpf_prog_detach(const union bpf_attr *attr)
{
        struct bpf_prog *prog = NULL;
        enum bpf_prog_type ptype;
        int ret;

        if (CHECK_ATTR(BPF_PROG_DETACH))
                return -EINVAL;

        ptype = attach_type_to_prog_type(attr->attach_type);
        if (bpf_mprog_supported(ptype)) {
                if (ptype == BPF_PROG_TYPE_UNSPEC)
                        return -EINVAL;
                if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
                        return -EINVAL;
                if (attr->attach_bpf_fd) {
                        prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
                        if (IS_ERR(prog))
                                return PTR_ERR(prog);
                }
        } else if (attr->attach_flags ||
                   attr->relative_fd ||
                   attr->expected_revision) {
                return -EINVAL;
        }

        switch (ptype) {
        case BPF_PROG_TYPE_SK_MSG:
        case BPF_PROG_TYPE_SK_SKB:
                ret = sock_map_prog_detach(attr, ptype);
                break;
        case BPF_PROG_TYPE_LIRC_MODE2:
                ret = lirc_prog_detach(attr);
                break;
        case BPF_PROG_TYPE_FLOW_DISSECTOR:
                ret = netns_bpf_prog_detach(attr, ptype);
                break;
        case BPF_PROG_TYPE_CGROUP_DEVICE:
        case BPF_PROG_TYPE_CGROUP_SKB:
        case BPF_PROG_TYPE_CGROUP_SOCK:
        case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
        case BPF_PROG_TYPE_CGROUP_SOCKOPT:
        case BPF_PROG_TYPE_CGROUP_SYSCTL:
        case BPF_PROG_TYPE_SOCK_OPS:
        case BPF_PROG_TYPE_LSM:
                ret = cgroup_bpf_prog_detach(attr, ptype);
                break;
        case BPF_PROG_TYPE_SCHED_CLS:
                if (attr->attach_type == BPF_TCX_INGRESS ||
                    attr->attach_type == BPF_TCX_EGRESS)
                        ret = tcx_prog_detach(attr, prog);
                else
                        ret = netkit_prog_detach(attr, prog);
                break;
        default:
                ret = -EINVAL;
        }

        if (prog)
                bpf_prog_put(prog);
        return ret;
}

#define BPF_PROG_QUERY_LAST_FIELD query.revision

/* BPF_PROG_QUERY handler.
 *
 * Requires bpf_net_capable() (-EPERM otherwise), accepts only
 * BPF_F_QUERY_EFFECTIVE in query_flags, and forwards the request to the
 * query routine of the subsystem owning attr->query.attach_type. Unknown
 * attach types yield -EINVAL.
 */
static int bpf_prog_query(const union bpf_attr *attr,
                          union bpf_attr __user *uattr)
{
        if (!bpf_net_capable())
                return -EPERM;
        if (CHECK_ATTR(BPF_PROG_QUERY) ||
            (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE))
                return -EINVAL;

        switch (attr->query.attach_type) {
        case BPF_LIRC_MODE2:
                return lirc_prog_query(attr, uattr);
        case BPF_TCX_INGRESS:
        case BPF_TCX_EGRESS:
                return tcx_prog_query(attr, uattr);
        case BPF_NETKIT_PRIMARY:
        case BPF_NETKIT_PEER:
                return netkit_prog_query(attr, uattr);
        case BPF_FLOW_DISSECTOR:
        case BPF_SK_LOOKUP:
                return netns_bpf_prog_query(attr, uattr);
        case BPF_SK_SKB_STREAM_PARSER:
        case BPF_SK_SKB_STREAM_VERDICT:
        case BPF_SK_MSG_VERDICT:
        case BPF_SK_SKB_VERDICT:
                return sock_map_bpf_prog_query(attr, uattr);
        case BPF_CGROUP_INET_INGRESS:
        case BPF_CGROUP_INET_EGRESS:
        case BPF_CGROUP_INET_SOCK_CREATE:
        case BPF_CGROUP_INET_SOCK_RELEASE:
        case BPF_CGROUP_INET4_BIND:
        case BPF_CGROUP_INET6_BIND:
        case BPF_CGROUP_INET4_POST_BIND:
        case BPF_CGROUP_INET6_POST_BIND:
        case BPF_CGROUP_INET4_CONNECT:
        case BPF_CGROUP_INET6_CONNECT:
        case BPF_CGROUP_UNIX_CONNECT:
        case BPF_CGROUP_INET4_GETPEERNAME:
        case BPF_CGROUP_INET6_GETPEERNAME:
        case BPF_CGROUP_UNIX_GETPEERNAME:
        case BPF_CGROUP_INET4_GETSOCKNAME:
        case BPF_CGROUP_INET6_GETSOCKNAME:
        case BPF_CGROUP_UNIX_GETSOCKNAME:
        case BPF_CGROUP_UDP4_SENDMSG:
        case BPF_CGROUP_UDP6_SENDMSG:
        case BPF_CGROUP_UNIX_SENDMSG:
        case BPF_CGROUP_UDP4_RECVMSG:
        case BPF_CGROUP_UDP6_RECVMSG:
        case BPF_CGROUP_UNIX_RECVMSG:
        case BPF_CGROUP_SOCK_OPS:
        case BPF_CGROUP_DEVICE:
        case BPF_CGROUP_SYSCTL:
        case BPF_CGROUP_GETSOCKOPT:
        case BPF_CGROUP_SETSOCKOPT:
        case BPF_LSM_CGROUP:
                return cgroup_bpf_prog_query(attr, uattr);
        default:
                return -EINVAL;
        }
}

#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size

/* BPF_PROG_TEST_RUN handler.
 *
 * Each context buffer (ctx_in / ctx_out) must come with a non-zero size and
 * vice versa, otherwise -EINVAL. Runs the ->test_run callback of the
 * program named by attr->test.prog_fd and returns its result, or -ENOTSUPP
 * when the program's ops provide no such callback. The program reference
 * taken here is always dropped before returning.
 */
static int bpf_prog_test_run(const union bpf_attr *attr,
                             union bpf_attr __user *uattr)
{
        struct bpf_prog *prog;
        int ret;

        if (CHECK_ATTR(BPF_PROG_TEST_RUN))
                return -EINVAL;

        /* Size and pointer must be given together, or not at all. */
        if (!attr->test.ctx_size_in != !attr->test.ctx_in)
                return -EINVAL;
        if (!attr->test.ctx_size_out != !attr->test.ctx_out)
                return -EINVAL;

        prog = bpf_prog_get(attr->test.prog_fd);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        if (!prog->aux->ops->test_run)
                ret = -ENOTSUPP;
        else
                ret = prog->aux->ops->test_run(prog, attr, uattr);

        bpf_prog_put(prog);
        return ret;
}

#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id

/* Common implementation of the *_GET_NEXT_ID commands.
 *
 * Looks up, under @lock, the first entry in @idr found by idr_get_next()
 * starting from attr->start_id + 1 and writes its ID to uattr->next_id.
 *
 * Returns 0 on success, -EINVAL for malformed attributes or a start_id of
 * INT_MAX or above, -EPERM without CAP_SYS_ADMIN, -ENOENT when no further
 * entry exists, or the put_user() error.
 */
static int bpf_obj_get_next_id(const union bpf_attr *attr,
                               union bpf_attr __user *uattr,
                               struct idr *idr,
                               spinlock_t *lock)
{
        u32 next_id = attr->start_id;
        void *entry;

        if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
                return -EINVAL;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        next_id++;

        spin_lock_bh(lock);
        entry = idr_get_next(idr, &next_id);
        spin_unlock_bh(lock);

        if (!entry)
                return -ENOENT;

        return put_user(next_id, &uattr->next_id);
}

/* Return the first map found by idr_get_next() starting at *@id on which a
 * reference could be taken, or NULL when there is none. Entries whose
 * reference cannot be taken are skipped by bumping *@id and searching
 * again. The whole search runs under map_idr_lock.
 */
struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
{
        struct bpf_map *map;

        spin_lock_bh(&map_idr_lock);
        for (;;) {
                map = idr_get_next(&map_idr, id);
                if (!map)
                        break;

                map = __bpf_map_inc_not_zero(map, false);
                if (!IS_ERR(map))
                        break;

                /* Could not grab this one; continue with the next ID. */
                (*id)++;
        }
        spin_unlock_bh(&map_idr_lock);

        return map;
}

/* Return the first program found by idr_get_next() starting at *@id on
 * which a reference could be taken, or NULL when there is none. Entries
 * whose reference cannot be taken are skipped by bumping *@id and searching
 * again. The whole search runs under prog_idr_lock.
 */
struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id)
{
        struct bpf_prog *prog;

        spin_lock_bh(&prog_idr_lock);
        for (;;) {
                prog = idr_get_next(&prog_idr, id);
                if (!prog)
                        break;

                prog = bpf_prog_inc_not_zero(prog);
                if (!IS_ERR(prog))
                        break;

                /* Could not grab this one; continue with the next ID. */
                (*id)++;
        }
        spin_unlock_bh(&prog_idr_lock);

        return prog;
}

#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id

/* Look up the program with ID @id and take a reference on it.
 *
 * Returns the program, ERR_PTR(-ENOENT) when @id is 0 or not present in
 * prog_idr, or whatever bpf_prog_inc_not_zero() returns when the reference
 * cannot be taken.
 */
struct bpf_prog *bpf_prog_by_id(u32 id)
{
        struct bpf_prog *prog;

        if (id == 0)
                return ERR_PTR(-ENOENT);

        spin_lock_bh(&prog_idr_lock);
        prog = idr_find(&prog_idr, id);
        prog = prog ? bpf_prog_inc_not_zero(prog) : ERR_PTR(-ENOENT);
        spin_unlock_bh(&prog_idr_lock);

        return prog;
}

/* BPF_PROG_GET_FD_BY_ID handler: return a new fd for the program whose ID
 * is attr->prog_id.
 *
 * Requires CAP_SYS_ADMIN (-EPERM). Returns the new fd, or a negative errno;
 * the program reference is dropped again when no fd could be created.
 */
static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
{
        struct bpf_prog *prog;
        int fd;

        if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
                return -EINVAL;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        prog = bpf_prog_by_id(attr->prog_id);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        fd = bpf_prog_new_fd(prog);
        if (fd < 0)
                bpf_prog_put(prog);

        return fd;
}

#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags

/* BPF_MAP_GET_FD_BY_ID handler: return a new fd for the map whose ID is
 * attr->map_id, opened with the file flags derived from attr->open_flags.
 *
 * Returns the new fd, -EINVAL for malformed attributes or open_flags outside
 * BPF_OBJ_FLAG_MASK, -EPERM without CAP_SYS_ADMIN, -ENOENT when the ID is
 * not present in map_idr, or another negative errno from the helpers. The
 * map reference is dropped again when no fd could be created.
 */
static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
{
        struct bpf_map *map;
        int f_flags;
        int fd;

        if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
            (attr->open_flags & ~BPF_OBJ_FLAG_MASK))
                return -EINVAL;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        f_flags = bpf_get_file_flag(attr->open_flags);
        if (f_flags < 0)
                return f_flags;

        spin_lock_bh(&map_idr_lock);
        map = idr_find(&map_idr, attr->map_id);
        map = map ? __bpf_map_inc_not_zero(map, true) : ERR_PTR(-ENOENT);
        spin_unlock_bh(&map_idr_lock);

        if (IS_ERR(map))
                return PTR_ERR(map);

        fd = bpf_map_new_fd(map, f_flags);
        if (fd < 0)
                bpf_map_put_with_uref(map);

        return fd;
}

/* Find which of @prog's used maps the immediate @addr refers to.
 *
 * Walks prog->aux->used_maps under used_maps_mutex. If @addr is the address
 * of a map itself, *@type is set to BPF_PSEUDO_MAP_FD. Otherwise, if the
 * map's ->map_direct_value_meta() callback returns 0 for @addr, *@type is
 * set to BPF_PSEUDO_MAP_VALUE (the callback is handed @off to fill in).
 *
 * *@off is reset to 0 before the search. Returns the matching map, or NULL
 * when no used map matches (*@type is then left untouched).
 */
static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
                                              unsigned long addr, u32 *off,
                                              u32 *type)
{
        const struct bpf_map *found = NULL;
        const struct bpf_map *map;
        int i;

        mutex_lock(&prog->aux->used_maps_mutex);
        *off = 0;
        for (i = 0; i < prog->aux->used_map_cnt; i++) {
                map = prog->aux->used_maps[i];

                if (map == (void *)addr) {
                        *type = BPF_PSEUDO_MAP_FD;
                        found = map;
                        break;
                }

                if (map->ops->map_direct_value_meta &&
                    !map->ops->map_direct_value_meta(map, addr, off)) {
                        *type = BPF_PSEUDO_MAP_VALUE;
                        found = map;
                        break;
                }
        }
        mutex_unlock(&prog->aux->used_maps_mutex);

        return found;
}

static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
                                              const struct cred *f_cred)
{
        const struct bpf_map *map;
        struct bpf_insn *insns;
        u32 off, type;
        u64 imm;
        u8 code;
        int i;

        insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
                        GFP_USER);
        if (!insns)
                return insns;

        for (i = 0; i < prog->len; i++) {
                code = insns[i].code;

                if (code == (BPF_JMP | BPF_TAIL_CALL)) {
                        insns[i].code = BPF_JMP | BPF_CALL;
                        insns[i].imm = BPF_FUNC_tail_call;
                        /* fall-through */
                }
                if (code == (BPF_JMP | BPF_CALL) ||
                    code == (BPF_JMP | BPF_CALL_ARGS)) {
                        if (code == (BPF_JMP | BPF_CALL_ARGS))
                                insns[i].code = BPF_JMP | BPF_CALL;
                        if (!bpf_dump_raw_ok(f_cred))
                                insns[i].imm = 0;
                        continue;
                }
                if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) {
                        insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM;
                        continue;
                }

                if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX ||
                     BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) {
                        insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM;
                        continue;
                }

                if (code != (BPF_LD | BPF_IMM | BPF_DW))
                        continue;

                imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
                map = bpf_map_from_imm(prog, imm, &off, &type);
                if (map) {
                        insns[i].src_reg = type;
                        insns[i].imm = map->id;
                        insns[i + 1].imm = off;
                        continue;
                }
        }

        return insns;
}

static int set_info_rec_size(struct bpf_prog_info *info)
{
        /*
         * Ensure info.*_rec_size is the same as kernel expected size
         *
         * or
         *
         * Only allow zero *_rec_size if both _rec_size and _cnt are
         * zero.  In this case, the kernel will set the expected
         * _rec_size back to the info.
         */

        if ((info->nr_func_info || info->func_info_rec_size) &&
            info->func_info_rec_size != sizeof(struct bpf_func_info))
                return -EINVAL;

        if ((info->nr_line_info || info->line_info_rec_size) &&
            info->line_info_rec_size != sizeof(struct bpf_line_info))
                return -EINVAL;

        if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
            info->jited_line_info_rec_size != sizeof(__u64))
                return -EINVAL;

        info->func_info_rec_size = sizeof(struct bpf_func_info);
        info->line_info_rec_size = sizeof(struct bpf_line_info);
        info->jited_line_info_rec_size = sizeof(__u64);

        return 0;
}

/*
 * Fill a struct bpf_prog_info describing @prog and copy it to the user
 * buffer at attr->info.info (attr->info.info_len bytes).
 *
 * The user's struct is copied in first because many fields are in/out:
 * for each variable-length item the caller supplies a buffer pointer plus
 * a capacity (an element count or a byte length).  The kernel overwrites
 * the capacity field with the real size and copies at most
 * min(capacity, real size) into the buffer.
 *
 * @file:  the file the request came in on; its f_cred is what
 *         bpf_dump_raw_ok() is asked about
 * @prog:  program being described
 * @attr:  kernel copy of the syscall attribute
 * @uattr: user pointer to the attribute; info.info_len is written back
 *
 * Returns 0 on success, -EFAULT when any user copy fails, -ENOMEM when the
 * sanitized instruction copy cannot be allocated, or the error returned by
 * bpf_check_uarg_tail_zero(), set_info_rec_size() or
 * bpf_prog_offload_info_fill().  On an error return some user buffers may
 * already have been written.
 */
static int bpf_prog_get_info_by_fd(struct file *file,
                                   struct bpf_prog *prog,
                                   const union bpf_attr *attr,
                                   union bpf_attr __user *uattr)
{
        struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
        struct btf *attach_btf = bpf_prog_get_target_btf(prog);
        struct bpf_prog_info info;
        u32 info_len = attr->info.info_len;
        struct bpf_prog_kstats stats;
        char __user *uinsns;
        u32 ulen;
        int err;

        err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
        if (err)
                return err;
        /* Never read or write more than the kernel's struct holds. */
        info_len = min_t(u32, sizeof(info), info_len);

        /* Zero first so fields beyond info_len are well defined. */
        memset(&info, 0, sizeof(info));
        if (copy_from_user(&info, uinfo, info_len))
                return -EFAULT;

        /* Fixed-size fields. */
        info.type = prog->type;
        info.id = prog->aux->id;
        info.load_time = prog->aux->load_time;
        info.created_by_uid = from_kuid_munged(current_user_ns(),
                                               prog->aux->user->uid);
        info.gpl_compatible = prog->gpl_compatible;

        memcpy(info.tag, prog->tag, sizeof(prog->tag));
        memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));

        /* IDs of the maps the program uses; used_maps[] is read under its mutex. */
        mutex_lock(&prog->aux->used_maps_mutex);
        ulen = info.nr_map_ids;
        info.nr_map_ids = prog->aux->used_map_cnt;
        ulen = min_t(u32, info.nr_map_ids, ulen);
        if (ulen) {
                u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
                u32 i;

                for (i = 0; i < ulen; i++)
                        if (put_user(prog->aux->used_maps[i]->id,
                                     &user_map_ids[i])) {
                                mutex_unlock(&prog->aux->used_maps_mutex);
                                return -EFAULT;
                        }
        }
        mutex_unlock(&prog->aux->used_maps_mutex);

        /* Validate the caller's *_rec_size fields and report the kernel's. */
        err = set_info_rec_size(&info);
        if (err)
                return err;

        bpf_prog_get_stats(prog, &stats);
        info.run_time_ns = stats.nsecs;
        info.run_cnt = stats.cnt;
        info.recursion_misses = stats.misses;

        info.verified_insns = prog->aux->verified_insns;

        /*
         * Without bpf_capable() none of the instruction, symbol or BTF
         * detail is exposed: report every such length/count as zero and
         * skip straight to the copy-out.
         */
        if (!bpf_capable()) {
                info.jited_prog_len = 0;
                info.xlated_prog_len = 0;
                info.nr_jited_ksyms = 0;
                info.nr_jited_func_lens = 0;
                info.nr_func_info = 0;
                info.nr_line_info = 0;
                info.nr_jited_line_info = 0;
                goto done;
        }

        /* Translated (xlated) instructions; ulen is the caller's buffer size. */
        ulen = info.xlated_prog_len;
        info.xlated_prog_len = bpf_prog_insn_size(prog);
        if (info.xlated_prog_len && ulen) {
                struct bpf_insn *insns_sanitized;
                bool fault;

                /*
                 * A blinded program is not dumped unless raw dumps are
                 * allowed; note that this also skips every section below.
                 */
                if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) {
                        info.xlated_prog_insns = 0;
                        goto done;
                }
                insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
                if (!insns_sanitized)
                        return -ENOMEM;
                uinsns = u64_to_user_ptr(info.xlated_prog_insns);
                ulen = min_t(u32, info.xlated_prog_len, ulen);
                /* Remember the outcome so the copy can be freed before bailing out. */
                fault = copy_to_user(uinsns, insns_sanitized, ulen);
                kfree(insns_sanitized);
                if (fault)
                        return -EFAULT;
        }

        if (bpf_prog_is_offloaded(prog->aux)) {
                err = bpf_prog_offload_info_fill(&info, prog);
                if (err)
                        return err;
                goto done;
        }

        /* NOTE: the following code is supposed to be skipped for offload.
         * bpf_prog_offload_info_fill() is the place to fill similar fields
         * for offload.
         */
        /* JITed image length: the sum over all functions when there are several. */
        ulen = info.jited_prog_len;
        if (prog->aux->func_cnt) {
                u32 i;

                info.jited_prog_len = 0;
                for (i = 0; i < prog->aux->func_cnt; i++)
                        info.jited_prog_len += prog->aux->func[i]->jited_len;
        } else {
                info.jited_prog_len = prog->jited_len;
        }

        if (info.jited_prog_len && ulen) {
                if (bpf_dump_raw_ok(file->f_cred)) {
                        uinsns = u64_to_user_ptr(info.jited_prog_insns);
                        ulen = min_t(u32, info.jited_prog_len, ulen);

                        /* for multi-function programs, copy the JITed
                         * instructions for all the functions
                         */
                        if (prog->aux->func_cnt) {
                                u32 len, free, i;
                                u8 *img;

                                /* 'free' is the room left in the user buffer. */
                                free = ulen;
                                for (i = 0; i < prog->aux->func_cnt; i++) {
                                        len = prog->aux->func[i]->jited_len;
                                        len = min_t(u32, len, free);
                                        img = (u8 *) prog->aux->func[i]->bpf_func;
                                        if (copy_to_user(uinsns, img, len))
                                                return -EFAULT;
                                        uinsns += len;
                                        free -= len;
                                        if (!free)
                                                break;
                                }
                        } else {
                                if (copy_to_user(uinsns, prog->bpf_func, ulen))
                                        return -EFAULT;
                        }
                } else {
                        /* Raw dump not allowed: report the length but no buffer. */
                        info.jited_prog_insns = 0;
                }
        }

        /* One kernel symbol address per function (func_cnt, or 1 when it is 0). */
        ulen = info.nr_jited_ksyms;
        info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
        if (ulen) {
                if (bpf_dump_raw_ok(file->f_cred)) {
                        unsigned long ksym_addr;
                        u64 __user *user_ksyms;
                        u32 i;

                        /* copy the address of the kernel symbol
                         * corresponding to each function
                         */
                        ulen = min_t(u32, info.nr_jited_ksyms, ulen);
                        user_ksyms = u64_to_user_ptr(info.jited_ksyms);
                        if (prog->aux->func_cnt) {
                                for (i = 0; i < ulen; i++) {
                                        ksym_addr = (unsigned long)
                                                prog->aux->func[i]->bpf_func;
                                        if (put_user((u64) ksym_addr,
                                                     &user_ksyms[i]))
                                                return -EFAULT;
                                }
                        } else {
                                ksym_addr = (unsigned long) prog->bpf_func;
                                if (put_user((u64) ksym_addr, &user_ksyms[0]))
                                        return -EFAULT;
                        }
                } else {
                        info.jited_ksyms = 0;
                }
        }

        /* One JITed length per function, counted the same way as the ksyms. */
        ulen = info.nr_jited_func_lens;
        info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
        if (ulen) {
                if (bpf_dump_raw_ok(file->f_cred)) {
                        u32 __user *user_lens;
                        u32 func_len, i;

                        /* copy the JITed image lengths for each function */
                        ulen = min_t(u32, info.nr_jited_func_lens, ulen);
                        user_lens = u64_to_user_ptr(info.jited_func_lens);
                        if (prog->aux->func_cnt) {
                                for (i = 0; i < ulen; i++) {
                                        func_len =
                                                prog->aux->func[i]->jited_len;
                                        if (put_user(func_len, &user_lens[i]))
                                                return -EFAULT;
                                }
                        } else {
                                func_len = prog->jited_len;
                                if (put_user(func_len, &user_lens[0]))
                                        return -EFAULT;
                        }
                } else {
                        info.jited_func_lens = 0;
                }
        }

        /* BTF identity: the program's own BTF and its attach target, if any. */
        if (prog->aux->btf)
                info.btf_id = btf_obj_id(prog->aux->btf);
        info.attach_btf_id = prog->aux->attach_btf_id;
        if (attach_btf)
                info.attach_btf_obj_id = btf_obj_id(attach_btf);

        /* func_info records; rec_size was set by set_info_rec_size() above. */
        ulen = info.nr_func_info;
        info.nr_func_info = prog->aux->func_info_cnt;
        if (info.nr_func_info && ulen) {
                char __user *user_finfo;

                user_finfo = u64_to_user_ptr(info.func_info);
                ulen = min_t(u32, info.nr_func_info, ulen);
                if (copy_to_user(user_finfo, prog->aux->func_info,
                                 info.func_info_rec_size * ulen))
                        return -EFAULT;
        }

        /* line_info records. */
        ulen = info.nr_line_info;
        info.nr_line_info = prog->aux->nr_linfo;
        if (info.nr_line_info && ulen) {
                __u8 __user *user_linfo;

                user_linfo = u64_to_user_ptr(info.line_info);
                ulen = min_t(u32, info.nr_line_info, ulen);
                if (copy_to_user(user_linfo, prog->aux->linfo,
                                 info.line_info_rec_size * ulen))
                        return -EFAULT;
        }

        /* JITed line info: reported only when prog->aux->jited_linfo exists. */
        ulen = info.nr_jited_line_info;
        if (prog->aux->jited_linfo)
                info.nr_jited_line_info = prog->aux->nr_linfo;
        else
                info.nr_jited_line_info = 0;
        if (info.nr_jited_line_info && ulen) {
                if (bpf_dump_raw_ok(file->f_cred)) {
                        unsigned long line_addr;
                        __u64 __user *user_linfo;
                        u32 i;

                        user_linfo = u64_to_user_ptr(info.jited_line_info);
                        ulen = min_t(u32, info.nr_jited_line_info, ulen);
                        for (i = 0; i < ulen; i++) {
                                line_addr = (unsigned long)prog->aux->jited_linfo[i];
                                if (put_user((__u64)line_addr, &user_linfo[i]))
                                        return -EFAULT;
                        }
                } else {
                        info.jited_line_info = 0;
                }
        }

        /* One BPF_TAG_SIZE tag per function, or the program's tag alone. */
        ulen = info.nr_prog_tags;
        info.nr_prog_tags = prog->aux->func_cnt ? : 1;
        if (ulen) {
                __u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
                u32 i;

                user_prog_tags = u64_to_user_ptr(info.prog_tags);
                ulen = min_t(u32, info.nr_prog_tags, ulen);
                if (prog->aux->func_cnt) {
                        for (i = 0; i < ulen; i++) {
                                if (copy_to_user(user_prog_tags[i],
                                                 prog->aux->func[i]->tag,
                                                 BPF_TAG_SIZE))
                                        return -EFAULT;
                        }
                } else {
                        if (copy_to_user(user_prog_tags[0],
                                         prog->tag, BPF_TAG_SIZE))
                                return -EFAULT;
                }
        }

done:
        /* Hand back the struct and the number of bytes actually written. */
        if (copy_to_user(uinfo, &info, info_len) ||
            put_user(info_len, &uattr->info.info_len))
                return -EFAULT;

        return 0;
}

/*
 * Fill a struct bpf_map_info describing @map and copy it to the user buffer
 * at attr->info.info.
 *
 * The user's struct is never read: info is built from a zeroed struct, so
 * every byte copied out (up to min(sizeof(info), attr->info.info_len)) is
 * kernel-initialized.  The number of bytes written is stored back into
 * uattr->info.info_len.  @file is not used.
 *
 * Returns 0 on success, -EFAULT if a user copy fails, or the error from
 * bpf_check_uarg_tail_zero() / bpf_map_offload_info_fill().
 */
static int bpf_map_get_info_by_fd(struct file *file,
                                  struct bpf_map *map,
                                  const union bpf_attr *attr,
                                  union bpf_attr __user *uattr)
{
        struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
        u32 out_len = attr->info.info_len;
        struct bpf_map_info info;
        int ret;

        ret = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), out_len);
        if (ret)
                return ret;
        out_len = min_t(u32, sizeof(info), out_len);

        /* memset (not an initializer) so padding is zeroed as well. */
        memset(&info, 0, sizeof(info));

        info.id = map->id;
        info.type = map->map_type;
        memcpy(info.name, map->name, sizeof(map->name));
        info.key_size = map->key_size;
        info.value_size = map->value_size;
        info.max_entries = map->max_entries;
        info.map_flags = map->map_flags;
        info.map_extra = map->map_extra;

        /* BTF type IDs are only meaningful when the map has BTF attached. */
        if (map->btf) {
                info.btf_id = btf_obj_id(map->btf);
                info.btf_key_type_id = map->btf_key_type_id;
                info.btf_value_type_id = map->btf_value_type_id;
        }
        info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;

        if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS)
                bpf_map_struct_ops_info_fill(&info, map);

        if (bpf_map_is_offloaded(map)) {
                ret = bpf_map_offload_info_fill(&info, map);
                if (ret)
                        return ret;
        }

        if (copy_to_user(uinfo, &info, out_len))
                return -EFAULT;
        if (put_user(out_len, &uattr->info.info_len))
                return -EFAULT;

        return 0;
}

/*
 * BTF flavour of "get info by fd": run bpf_check_uarg_tail_zero() on the
 * user's struct bpf_btf_info against the kernel's struct size, then let
 * btf_get_info_by_fd() do the actual work.  @file is not used.
 *
 * Returns the error from the tail check if it fails, otherwise whatever
 * btf_get_info_by_fd() returns.
 */
static int bpf_btf_get_info_by_fd(struct file *file,
                                  struct btf *btf,
                                  const union bpf_attr *attr,
                                  union bpf_attr __user *uattr)
{
        u32 user_len = attr->info.info_len;
        struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
        int ret;

        ret = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), user_len);

        return ret ? ret : btf_get_info_by_fd(btf, attr, uattr);
}

/*
 * Fill a struct bpf_link_info describing @link and copy it to the user
 * buffer at attr->info.info.
 *
 * The user's struct is copied in first, so the link's ->fill_link_info()
 * callback (when it has one) sees the caller-supplied fields.  type and id
 * are always set; prog_id only when the link has a program.  The number of
 * bytes written is stored back into uattr->info.info_len.  @file is not
 * used.
 *
 * Returns 0 on success, -EFAULT if a user copy fails, or the error from
 * bpf_check_uarg_tail_zero() / ->fill_link_info().
 */
static int bpf_link_get_info_by_fd(struct file *file,
                                  struct bpf_link *link,
                                  const union bpf_attr *attr,
                                  union bpf_attr __user *uattr)
{
        struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
        u32 len = attr->info.info_len;
        struct bpf_link_info info;
        int ret;

        ret = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), len);
        if (ret)
                return ret;
        len = min_t(u32, sizeof(info), len);

        /* Zero first so anything past 'len' is well defined. */
        memset(&info, 0, sizeof(info));
        if (copy_from_user(&info, uinfo, len))
                return -EFAULT;

        info.id = link->id;
        info.type = link->type;
        if (link->prog)
                info.prog_id = link->prog->aux->id;

        /* Link-type specific details are optional. */
        if (link->ops->fill_link_info) {
                ret = link->ops->fill_link_info(link, &info);
                if (ret)
                        return ret;
        }

        if (copy_to_user(uinfo, &info, len))
                return -EFAULT;
        if (put_user(len, &uattr->info.info_len))
                return -EFAULT;

        return 0;
}


#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info

/*
 * BPF_OBJ_GET_INFO_BY_FD: look up attr->info.bpf_fd and dispatch to the
 * prog, map, BTF or link info helper according to the file's f_op.
 *
 * Returns the helper's result, or:
 *   -EINVAL  CHECK_ATTR() rejected the attribute, or the fd is none of the
 *            recognised BPF object types
 *   -EBADFD  the fd does not refer to an open file
 */
static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
                                  union bpf_attr __user *uattr)
{
        const struct file_operations *fops;
        struct file *file;
        struct fd f;
        int ret;

        if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
                return -EINVAL;

        f = fdget(attr->info.bpf_fd);
        file = f.file;
        if (!file)
                return -EBADFD;

        /* The file's operations table identifies the kind of BPF object. */
        fops = file->f_op;
        if (fops == &bpf_prog_fops)
                ret = bpf_prog_get_info_by_fd(file, file->private_data,
                                              attr, uattr);
        else if (fops == &bpf_map_fops)
                ret = bpf_map_get_info_by_fd(file, file->private_data,
                                             attr, uattr);
        else if (fops == &btf_fops)
                ret = bpf_btf_get_info_by_fd(file, file->private_data,
                                             attr, uattr);
        else if (fops == &bpf_link_fops || fops == &bpf_link_fops_poll)
                ret = bpf_link_get_info_by_fd(file, file->private_data,
                                              attr, uattr);
        else
                ret = -EINVAL;

        fdput(f);
        return ret;
}

#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd

/*
 * BPF_BTF_LOAD: permission-check the caller and hand off to btf_new_fd().
 *
 * When BPF_F_TOKEN_FD is set, the token behind attr->btf_token_fd is
 * fetched.  If that token does not allow BPF_BTF_LOAD it is dropped, and
 * the CAP_BPF check then runs without a token.  The token reference is
 * released before btf_new_fd() is called.
 *
 * Returns btf_new_fd()'s result, or:
 *   -EINVAL  CHECK_ATTR() rejected the attribute, or btf_flags has bits
 *            other than BPF_F_TOKEN_FD
 *   -EPERM   bpf_token_capable(token, CAP_BPF) refused
 * An error from bpf_token_get_from_fd() is passed through.
 */
static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
        struct bpf_token *token = NULL;
        bool permitted;

        if (CHECK_ATTR(BPF_BTF_LOAD) ||
            (attr->btf_flags & ~BPF_F_TOKEN_FD))
                return -EINVAL;

        if (attr->btf_flags & BPF_F_TOKEN_FD) {
                token = bpf_token_get_from_fd(attr->btf_token_fd);
                if (IS_ERR(token))
                        return PTR_ERR(token);

                /* A token that does not cover this command is ignored. */
                if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) {
                        bpf_token_put(token);
                        token = NULL;
                }
        }

        /* The token is only needed for the capability decision. */
        permitted = bpf_token_capable(token, CAP_BPF);
        bpf_token_put(token);
        if (!permitted)
                return -EPERM;

        return btf_new_fd(attr, uattr, uattr_size);
}

#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id

/*
 * BPF_BTF_GET_FD_BY_ID: return btf_get_fd_by_id(attr->btf_id) for a
 * CAP_SYS_ADMIN caller.
 *
 * Returns -EINVAL when CHECK_ATTR() rejects the attribute and -EPERM when
 * the caller lacks CAP_SYS_ADMIN; otherwise btf_get_fd_by_id()'s result.
 */
static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
{
        int ret;

        if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
                ret = -EINVAL;
        else if (!capable(CAP_SYS_ADMIN))
                ret = -EPERM;
        else
                ret = btf_get_fd_by_id(attr->btf_id);

        return ret;
}

/*
 * Write the results of a BPF_TASK_FD_QUERY back to user space.
 *
 * uattr->task_fd_query.buf_len is always set to strlen(@buf) (0 when @buf
 * is NULL), whatever the size of the user's buffer.  If the caller supplied
 * a non-NULL buffer with a non-zero length, the string is copied into it:
 *   - no string:      a single NUL byte is written;
 *   - string fits:    the string and its terminator are copied;
 *   - string too big: the first buf_len - 1 bytes are copied, a NUL is
 *                     appended, and -ENOSPC is returned (after the
 *                     remaining fields have still been written).
 * prog_id, fd_type, probe_offset and probe_addr are then stored.
 *
 * Returns 0, -ENOSPC on truncation, or -EFAULT if any user access fails.
 */
static int bpf_task_fd_query_copy(const union bpf_attr *attr,
                                    union bpf_attr __user *uattr,
                                    u32 prog_id, u32 fd_type,
                                    const char *buf, u64 probe_offset,
                                    u64 probe_addr)
{
        char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
        u32 len = buf ? strlen(buf) : 0;
        u32 input_len;
        char nul = '\0';
        int err = 0;

        /* Report the full string length first. */
        if (put_user(len, &uattr->task_fd_query.buf_len))
                return -EFAULT;

        input_len = attr->task_fd_query.buf_len;
        if (ubuf && input_len) {
                if (len == 0) {
                        /* nothing to copy, just make ubuf NUL terminated */
                        if (put_user(nul, ubuf))
                                return -EFAULT;
                } else if (len + 1 <= input_len) {
                        /* ubuf can hold the string with its terminator */
                        if (copy_to_user(ubuf, buf, len + 1))
                                return -EFAULT;
                } else {
                        /*
                         * ubuf is too small: copy what fits and terminate
                         * it, then report the truncation via -ENOSPC.
                         */
                        u32 room = input_len - 1;

                        err = -ENOSPC;
                        if (copy_to_user(ubuf, buf, room))
                                return -EFAULT;
                        if (put_user(nul, ubuf + room))
                                return -EFAULT;
                }
        }

        if (put_user(prog_id, &uattr->task_fd_query.prog_id))
                return -EFAULT;
        if (put_user(fd_type, &uattr->task_fd_query.fd_type))
                return -EFAULT;
        if (put_user(probe_offset, &uattr->task_fd_query.probe_offset))
                return -EFAULT;
        if (put_user(probe_addr, &uattr->task_fd_query.probe_addr))
                return -EFAULT;

        return err;
}

#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr

/*
 * BPF_TASK_FD_QUERY: describe the BPF attachment behind file descriptor
 * attr->task_fd_query.fd of the task with pid attr->task_fd_query.pid.
 *
 * Two kinds of file are understood:
 *   - a BPF link whose ops are bpf_raw_tp_link_lops: reported as
 *     BPF_FD_TYPE_RAW_TRACEPOINT together with the tracepoint name;
 *   - a perf event: reported via bpf_get_perf_event_info().
 * Results are written by bpf_task_fd_query_copy().
 *
 * Returns that helper's result, or:
 *   -EINVAL    CHECK_ATTR() rejected the attribute, or flags is non-zero
 *   -EPERM     the caller lacks CAP_SYS_ADMIN
 *   -ENOENT    no task with that pid
 *   -EBADF     the task has no such fd
 *   -ENOTSUPP  the file is neither of the supported kinds
 * An error from bpf_get_perf_event_info() is passed through.
 */
static int bpf_task_fd_query(const union bpf_attr *attr,
                             union bpf_attr __user *uattr)
{
        const struct perf_event *event;
        u64 probe_offset, probe_addr;
        struct task_struct *task;
        u32 prog_id, fd_type;
        struct file *file;
        const char *buf;
        int err = -ENOTSUPP;

        if (CHECK_ATTR(BPF_TASK_FD_QUERY))
                return -EINVAL;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (attr->task_fd_query.flags != 0)
                return -EINVAL;

        rcu_read_lock();
        task = get_pid_task(find_vpid(attr->task_fd_query.pid), PIDTYPE_PID);
        rcu_read_unlock();
        if (!task)
                return -ENOENT;

        /* The task is only needed long enough to grab the file. */
        file = fget_task(task, attr->task_fd_query.fd);
        put_task_struct(task);
        if (!file)
                return -EBADF;

        if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) {
                struct bpf_link *link = file->private_data;
                struct bpf_raw_tp_link *raw_tp;

                /* Only raw tracepoint links can be queried; err is -ENOTSUPP. */
                if (link->ops != &bpf_raw_tp_link_lops)
                        goto put_file;

                raw_tp = container_of(link, struct bpf_raw_tp_link, link);
                err = bpf_task_fd_query_copy(attr, uattr,
                                             raw_tp->link.prog->aux->id,
                                             BPF_FD_TYPE_RAW_TRACEPOINT,
                                             raw_tp->btp->tp->name, 0, 0);
                goto put_file;
        }

        event = perf_get_event(file);
        if (IS_ERR(event))
                goto put_file;        /* not a perf event either: -ENOTSUPP */

        err = bpf_get_perf_event_info(event, &prog_id, &fd_type, &buf,
                                      &probe_offset, &probe_addr, NULL);
        if (!err)
                err = bpf_task_fd_query_copy(attr, uattr, prog_id, fd_type,
                                             buf, probe_offset, probe_addr);

put_file:
        fput(file);
        return err;
}

#define BPF_MAP_BATCH_LAST_FIELD batch.flags

/*
 * Call the batch callback @fn with the given arguments and store its result
 * in the caller's 'err'.  A NULL callback yields -ENOTSUPP and jumps to the
 * caller's 'err_put' label.  Relies on both names existing in the caller.
 */
#define BPF_DO_BATCH(fn, ...)                        \
        do {                                        \
                if (!fn) {                        \
                        err = -ENOTSUPP;        \
                        goto err_put;                \
                }                                \
                err = fn(__VA_ARGS__);                \
        } while (0)

/*
 * Common entry point for the four map batch commands.  @cmd selects the
 * map_ops callback: lookup, lookup-and-delete, update, or (for any other
 * value) delete.
 *
 * Lookup and lookup-and-delete require FMODE_CAN_READ from
 * map_get_sys_perms(); everything except plain lookup requires
 * FMODE_CAN_WRITE.  For those writing commands the map's write-active
 * count is raised for the duration of the call, and
 * maybe_wait_bpf_programs() runs before it is lowered again, on the error
 * paths too.
 *
 * Returns the callback's result, or:
 *   -EINVAL    CHECK_ATTR() rejected the attribute
 *   -EPERM     the fd lacks the required access mode
 *   -ENOTSUPP  the map type has no callback for @cmd
 * An error from __bpf_map_get() is passed through.
 */
static int bpf_map_do_batch(const union bpf_attr *attr,
                            union bpf_attr __user *uattr,
                            int cmd)
{
        const bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
        const bool has_read = !has_write ||
                              cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
        struct bpf_map *map;
        struct fd f;
        int err;

        if (CHECK_ATTR(BPF_MAP_BATCH))
                return -EINVAL;

        f = fdget(attr->batch.map_fd);
        map = __bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);

        if (has_write)
                bpf_map_write_active_inc(map);

        /* Access-mode checks; both failures report -EPERM. */
        err = -EPERM;
        if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ))
                goto err_put;
        if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE))
                goto err_put;

        switch (cmd) {
        case BPF_MAP_LOOKUP_BATCH:
                BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr);
                break;
        case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
                BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
                break;
        case BPF_MAP_UPDATE_BATCH:
                BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
                break;
        default:
                BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
                break;
        }

err_put:
        if (has_write) {
                maybe_wait_bpf_programs(map);
                bpf_map_write_active_dec(map);
        }
        fdput(f);
        return err;
}

#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid
/*
 * BPF_LINK_CREATE: attach the program behind attr->link_create.prog_fd by
 * creating a link of the kind appropriate for its program type.
 *
 * BPF_STRUCT_OPS attach requests are handed to bpf_struct_ops_link_create()
 * before any program is looked up.  Otherwise the program is fetched, the
 * requested attach type is validated with
 * bpf_prog_attach_check_attach_type(), and the type-specific attach helper
 * is called.
 *
 * Returns the attach helper's result (presumably the new link fd on
 * success - confirm against the helpers), or a negative errno.  -EINVAL is
 * returned when CHECK_ATTR() rejects the attribute, for unsupported
 * program types, and when the program type / attach type pair is not
 * handled.  When the result is negative the program reference taken here
 * is dropped; otherwise it is left in place (presumably now owned by the
 * link - confirm against the helpers).
 */
static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
        struct bpf_prog *prog;
        int ret;

        if (CHECK_ATTR(BPF_LINK_CREATE))
                return -EINVAL;

        if (attr->link_create.attach_type == BPF_STRUCT_OPS)
                return bpf_struct_ops_link_create(attr);

        prog = bpf_prog_get(attr->link_create.prog_fd);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        ret = bpf_prog_attach_check_attach_type(prog,
                                                attr->link_create.attach_type);
        if (ret)
                goto out;

        switch (prog->type) {
        case BPF_PROG_TYPE_CGROUP_SKB:
        case BPF_PROG_TYPE_CGROUP_SOCK:
        case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
        case BPF_PROG_TYPE_SOCK_OPS:
        case BPF_PROG_TYPE_CGROUP_DEVICE:
        case BPF_PROG_TYPE_CGROUP_SYSCTL:
        case BPF_PROG_TYPE_CGROUP_SOCKOPT:
                ret = cgroup_bpf_link_attach(attr, prog);
                break;
        case BPF_PROG_TYPE_EXT:
                ret = bpf_tracing_prog_attach(prog,
                                              attr->link_create.target_fd,
                                              attr->link_create.target_btf_id,
                                              attr->link_create.tracing.cookie);
                break;
        case BPF_PROG_TYPE_LSM:
        case BPF_PROG_TYPE_TRACING:
                /* The attach type is fixed at load time for these programs. */
                if (attr->link_create.attach_type != prog->expected_attach_type) {
                        ret = -EINVAL;
                        goto out;
                }
                if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
                        ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie);
                else if (prog->expected_attach_type == BPF_TRACE_ITER)
                        ret = bpf_iter_link_attach(attr, uattr, prog);
                else if (prog->expected_attach_type == BPF_LSM_CGROUP)
                        ret = cgroup_bpf_link_attach(attr, prog);
                else
                        ret = bpf_tracing_prog_attach(prog,
                                                      attr->link_create.target_fd,
                                                      attr->link_create.target_btf_id,
                                                      attr->link_create.tracing.cookie);
                break;
        case BPF_PROG_TYPE_FLOW_DISSECTOR:
        case BPF_PROG_TYPE_SK_LOOKUP:
                ret = netns_bpf_link_create(attr, prog);
                break;
        case BPF_PROG_TYPE_SK_MSG:
        case BPF_PROG_TYPE_SK_SKB:
                ret = sock_map_link_create(attr, prog);
                break;
#ifdef CONFIG_NET
        case BPF_PROG_TYPE_XDP:
                ret = bpf_xdp_link_attach(attr, prog);
                break;
        case BPF_PROG_TYPE_SCHED_CLS:
                if (attr->link_create.attach_type == BPF_TCX_INGRESS ||
                    attr->link_create.attach_type == BPF_TCX_EGRESS)
                        ret = tcx_link_attach(attr, prog);
                else
                        ret = netkit_link_attach(attr, prog);
                break;
        case BPF_PROG_TYPE_NETFILTER:
                ret = bpf_nf_link_attach(attr, prog);
                break;
#endif
        case BPF_PROG_TYPE_PERF_EVENT:
        case BPF_PROG_TYPE_TRACEPOINT:
                ret = bpf_perf_link_attach(attr, prog);
                break;
        case BPF_PROG_TYPE_KPROBE:
                if (attr->link_create.attach_type == BPF_PERF_EVENT)
                        ret = bpf_perf_link_attach(attr, prog);
                else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI ||
                         attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION)
                        ret = bpf_kprobe_multi_link_attach(attr, prog);
                else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
                        ret = bpf_uprobe_multi_link_attach(attr, prog);
                else
                        /*
                         * Fail closed.  Without this, 'ret' would still hold
                         * the 0 from the attach-type check above and the
                         * caller would see success with no link created and
                         * the program reference never dropped.  The check
                         * above is expected to reject such attach types
                         * already; this keeps the switch safe on its own.
                         */
                        ret = -EINVAL;
                break;
        default:
                ret = -EINVAL;
        }

out:
        if (ret < 0)
                bpf_prog_put(prog);
        return ret;
}

/*
 * Map flavour of BPF_LINK_UPDATE, used for links whose ops provide
 * ->update_map.
 *
 * Takes a reference on the map named by link_update.new_map_fd and, when
 * BPF_F_REPLACE is set, on the map named by link_update.old_map_fd; the
 * latter is handed to ->update_map as the "old" map (NULL otherwise).
 * A non-zero old_map_fd without BPF_F_REPLACE is rejected with -EINVAL.
 *
 * Both map references taken here are dropped again before returning,
 * whatever ->update_map reports.
 *
 * Returns the result of ->update_map, or a negative errno from the
 * fd lookups / argument validation.
 */
static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
{
        struct bpf_map *expected = NULL;
        struct bpf_map *replacement;
        int err;

        replacement = bpf_map_get(attr->link_update.new_map_fd);
        if (IS_ERR(replacement))
                return PTR_ERR(replacement);

        if (!(attr->link_update.flags & BPF_F_REPLACE)) {
                /* old_map_fd is only accepted together with BPF_F_REPLACE */
                if (attr->link_update.old_map_fd) {
                        err = -EINVAL;
                        goto put_replacement;
                }
        } else {
                expected = bpf_map_get(attr->link_update.old_map_fd);
                if (IS_ERR(expected)) {
                        err = PTR_ERR(expected);
                        goto put_replacement;
                }
        }

        err = link->ops->update_map(link, replacement, expected);

        if (expected)
                bpf_map_put(expected);
put_replacement:
        bpf_map_put(replacement);
        return err;
}

/*
 * Consumed by CHECK_ATTR(BPF_LINK_UPDATE) below; presumably names the last
 * union bpf_attr member this command reads -- confirm against CHECK_ATTR().
 */
#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd

/*
 * BPF_LINK_UPDATE: replace the program (or map) behind an existing link.
 *
 * Only BPF_F_REPLACE is accepted in link_update.flags.  Links whose ops
 * provide ->update_map are delegated entirely to link_update_map().
 * Otherwise new_prog_fd is resolved, old_prog_fd is resolved only when
 * BPF_F_REPLACE is set (a non-zero old_prog_fd without the flag is
 * -EINVAL), and ->update_prog is invoked; links without ->update_prog
 * yield -EINVAL.
 *
 * Reference handling: the link and old-prog references are always dropped
 * here.  The new-prog reference is dropped only on failure; on success it
 * is left alone (presumably now owned by the link -- confirm in the
 * ->update_prog implementations).
 */
static int link_update(union bpf_attr *attr)
{
        struct bpf_prog *expected_prog = NULL;
        struct bpf_prog *replacement;
        struct bpf_link *link;
        u32 flags;
        int err;

        if (CHECK_ATTR(BPF_LINK_UPDATE))
                return -EINVAL;

        flags = attr->link_update.flags;
        if (flags & ~BPF_F_REPLACE)
                return -EINVAL;

        link = bpf_link_get_from_fd(attr->link_update.link_fd);
        if (IS_ERR(link))
                return PTR_ERR(link);

        if (link->ops->update_map) {
                err = link_update_map(link, attr);
                goto put_link;
        }

        replacement = bpf_prog_get(attr->link_update.new_prog_fd);
        if (IS_ERR(replacement)) {
                err = PTR_ERR(replacement);
                goto put_link;
        }

        if (!(flags & BPF_F_REPLACE)) {
                /* old_prog_fd is only accepted together with BPF_F_REPLACE */
                if (attr->link_update.old_prog_fd) {
                        err = -EINVAL;
                        goto put_progs;
                }
        } else {
                expected_prog = bpf_prog_get(attr->link_update.old_prog_fd);
                if (IS_ERR(expected_prog)) {
                        err = PTR_ERR(expected_prog);
                        /* keep the error pointer away from bpf_prog_put() */
                        expected_prog = NULL;
                        goto put_progs;
                }
        }

        err = link->ops->update_prog ?
              link->ops->update_prog(link, replacement, expected_prog) :
              -EINVAL;

put_progs:
        if (expected_prog)
                bpf_prog_put(expected_prog);
        if (err)
                bpf_prog_put(replacement);
put_link:
        bpf_link_put_direct(link);
        return err;
}

/* Consumed by CHECK_ATTR(BPF_LINK_DETACH) below. */
#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd

/*
 * BPF_LINK_DETACH: invoke the link's ->detach callback.
 *
 * Returns the callback's result, -EOPNOTSUPP when the link type has no
 * ->detach, or a negative errno if the attr is malformed or link_fd
 * cannot be resolved.  The link reference taken here is always dropped.
 */
static int link_detach(union bpf_attr *attr)
{
        struct bpf_link *link;
        int err = -EOPNOTSUPP;

        if (CHECK_ATTR(BPF_LINK_DETACH))
                return -EINVAL;

        link = bpf_link_get_from_fd(attr->link_detach.link_fd);
        if (IS_ERR(link))
                return PTR_ERR(link);

        if (link->ops->detach)
                err = link->ops->detach(link);

        bpf_link_put_direct(link);
        return err;
}

/*
 * Take an extra reference on @link unless its refcount is already zero.
 *
 * Returns @link when the count was bumped, ERR_PTR(-ENOENT) when the
 * count was zero and therefore left untouched.
 */
struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
{
        if (!atomic64_fetch_add_unless(&link->refcnt, 1, 0))
                return ERR_PTR(-ENOENT);

        return link;
}
EXPORT_SYMBOL(bpf_link_inc_not_zero);

/*
 * Look up a link by its ID and take a reference on it.
 *
 * The lookup and the refcount bump both happen under link_idr_lock.
 *
 * Returns the referenced link, or:
 *   ERR_PTR(-ENOENT)  @id is 0, no such entry, or the refcount was zero;
 *   ERR_PTR(-EAGAIN)  the entry exists but its ->id is still 0.
 */
struct bpf_link *bpf_link_by_id(u32 id)
{
        struct bpf_link *found;

        if (!id)
                return ERR_PTR(-ENOENT);

        spin_lock_bh(&link_idr_lock);
        found = idr_find(&link_idr, id);
        if (!found)
                found = ERR_PTR(-ENOENT);
        else if (!found->id)
                /* before link is "settled", ID is 0, pretend it doesn't exist yet */
                found = ERR_PTR(-EAGAIN);
        else
                found = bpf_link_inc_not_zero(found);
        spin_unlock_bh(&link_idr_lock);

        return found;
}

/*
 * Starting the search at *@id, return the first link in link_idr on which
 * a reference can be taken.
 *
 * idr_get_next() is handed @id directly on each pass; whenever the entry
 * it yields has a zero refcount, *@id is advanced by one and the search
 * resumes.  The whole walk runs under link_idr_lock.
 *
 * Returns the referenced link, or NULL when idr_get_next() finds nothing.
 */
struct bpf_link *bpf_link_get_curr_or_next(u32 *id)
{
        struct bpf_link *link;

        spin_lock_bh(&link_idr_lock);
        for (;;) {
                link = idr_get_next(&link_idr, id);
                if (!link)
                        break;

                link = bpf_link_inc_not_zero(link);
                if (!IS_ERR(link))
                        break;

                /* refcount already zero: skip this entry and keep looking */
                (*id)++;
        }
        spin_unlock_bh(&link_idr_lock);

        return link;
}

/* Consumed by CHECK_ATTR(BPF_LINK_GET_FD_BY_ID) below. */
#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id

/*
 * BPF_LINK_GET_FD_BY_ID: turn a link ID into a new file descriptor.
 *
 * Requires CAP_SYS_ADMIN (-EPERM otherwise).  The reference obtained from
 * bpf_link_by_id() is dropped again if no fd could be created.
 *
 * Returns the value of bpf_link_new_fd() (the new fd, or its negative
 * error), or a negative errno from validation / lookup.
 */
static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
{
        struct bpf_link *link;
        int fd;

        if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
                return -EINVAL;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        link = bpf_link_by_id(attr->link_id);
        if (IS_ERR(link))
                return PTR_ERR(link);

        fd = bpf_link_new_fd(link);
        if (fd >= 0)
                return fd;

        bpf_link_put_direct(link);
        return fd;
}

/*
 * Held around the static_key_slow_inc()/static_key_slow_dec() calls on
 * bpf_stats_enabled_key made by the fd and sysctl paths in this file.
 */
DEFINE_MUTEX(bpf_stats_enabled_mutex);

/*
 * ->release for the "bpf-stats" anonymous inode: give back the static-key
 * count that bpf_enable_runtime_stats() took when the fd was created.
 * Always returns 0.
 */
static int bpf_stats_release(struct inode *inode, struct file *file)
{
        struct static_key *stats_key = &bpf_stats_enabled_key.key;

        mutex_lock(&bpf_stats_enabled_mutex);
        static_key_slow_dec(stats_key);
        mutex_unlock(&bpf_stats_enabled_mutex);

        return 0;
}

/* File operations for the "bpf-stats" fd; only ->release is provided. */
static const struct file_operations bpf_stats_fops = {
        .release = bpf_stats_release,
};

/*
 * Create a "bpf-stats" anonymous-inode fd and, if that succeeded, bump
 * bpf_stats_enabled_key.  The matching decrement happens in
 * bpf_stats_release().
 *
 * Returns the new fd, -EBUSY when the key's count already exceeds
 * INT_MAX / 2, or the negative error from anon_inode_getfd().
 */
static int bpf_enable_runtime_stats(void)
{
        int fd = -EBUSY;

        mutex_lock(&bpf_stats_enabled_mutex);

        /* Set a very high limit to avoid overflow */
        if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2)
                goto unlock;

        fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
        if (fd >= 0)
                static_key_slow_inc(&bpf_stats_enabled_key.key);

unlock:
        mutex_unlock(&bpf_stats_enabled_mutex);
        return fd;
}

/* Consumed by CHECK_ATTR(BPF_ENABLE_STATS) below. */
#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type

/*
 * BPF_ENABLE_STATS: requires CAP_SYS_ADMIN.  BPF_STATS_RUN_TIME is the
 * only enable_stats.type handled; every other value yields -EINVAL.
 */
static int bpf_enable_stats(union bpf_attr *attr)
{
        if (CHECK_ATTR(BPF_ENABLE_STATS))
                return -EINVAL;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (attr->enable_stats.type == BPF_STATS_RUN_TIME)
                return bpf_enable_runtime_stats();

        return -EINVAL;
}

/* Consumed by CHECK_ATTR(BPF_ITER_CREATE) below. */
#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags

/*
 * BPF_ITER_CREATE: resolve iter_create.link_fd to a link and pass it to
 * bpf_iter_new_fd().  The link reference taken here is dropped before
 * returning regardless of the outcome.
 *
 * Returns whatever bpf_iter_new_fd() returns, or a negative errno when
 * the attr is malformed, flags is non-zero, or the fd lookup fails.
 */
static int bpf_iter_create(union bpf_attr *attr)
{
        struct bpf_link *iter_link;
        int ret;

        /* a malformed attr and any non-zero flags are rejected alike */
        if (CHECK_ATTR(BPF_ITER_CREATE) || attr->iter_create.flags)
                return -EINVAL;

        iter_link = bpf_link_get_from_fd(attr->iter_create.link_fd);
        if (IS_ERR(iter_link))
                return PTR_ERR(iter_link);

        ret = bpf_iter_new_fd(iter_link);
        bpf_link_put_direct(iter_link);

        return ret;
}

/* Consumed by CHECK_ATTR(BPF_PROG_BIND_MAP) below. */
#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags

/*
 * BPF_PROG_BIND_MAP: append a map to a program's used_maps array.
 *
 * Under prog->aux->used_maps_mutex the array is first searched; if the
 * map is already listed, the reference just taken on it is dropped and 0
 * is returned.  Otherwise a copy of the array one slot larger is
 * allocated, the map is stored in the new last slot, the copy is
 * installed and the previous array is freed.  On that success path the
 * map reference taken here is kept, being what the new slot holds.
 *
 * Returns 0 on success, -EINVAL for a malformed attr or non-zero flags,
 * -ENOMEM if the larger array cannot be allocated, or the error from
 * resolving prog_fd / map_fd.
 */
static int bpf_prog_bind_map(union bpf_attr *attr)
{
        struct bpf_map **old_maps, **grown_maps;
        struct bpf_prog *prog;
        struct bpf_map *map;
        int idx, err = 0;

        if (CHECK_ATTR(BPF_PROG_BIND_MAP) || attr->prog_bind_map.flags)
                return -EINVAL;

        prog = bpf_prog_get(attr->prog_bind_map.prog_fd);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        map = bpf_map_get(attr->prog_bind_map.map_fd);
        if (IS_ERR(map)) {
                err = PTR_ERR(map);
                goto put_prog;
        }

        mutex_lock(&prog->aux->used_maps_mutex);

        old_maps = prog->aux->used_maps;

        for (idx = 0; idx < prog->aux->used_map_cnt; idx++) {
                if (old_maps[idx] != map)
                        continue;

                /* already bound: only the extra reference has to go */
                bpf_map_put(map);
                goto unlock;
        }

        grown_maps = kmalloc_array(prog->aux->used_map_cnt + 1,
                                   sizeof(grown_maps[0]),
                                   GFP_KERNEL);
        if (!grown_maps) {
                err = -ENOMEM;
                goto unlock;
        }

        /* The bpf program will not access the bpf map, but for the sake of
         * simplicity, increase sleepable_refcnt for sleepable program as well.
         */
        if (prog->sleepable)
                atomic64_inc(&map->sleepable_refcnt);

        memcpy(grown_maps, old_maps,
               sizeof(old_maps[0]) * prog->aux->used_map_cnt);
        grown_maps[prog->aux->used_map_cnt++] = map;
        prog->aux->used_maps = grown_maps;

        kfree(old_maps);

unlock:
        mutex_unlock(&prog->aux->used_maps_mutex);

        if (err)
                bpf_map_put(map);
put_prog:
        bpf_prog_put(prog);
        return err;
}

/* Consumed by CHECK_ATTR(BPF_TOKEN_CREATE) below. */
#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd

/*
 * BPF_TOKEN_CREATE: validate the attr, then defer to bpf_token_create().
 * Returns -EINVAL for a malformed attr or any non-zero flags.
 */
static int token_create(union bpf_attr *attr)
{
        /* no flags are supported yet */
        if (CHECK_ATTR(BPF_TOKEN_CREATE) || attr->token_create.flags)
                return -EINVAL;

        return bpf_token_create(attr);
}

/*
 * Common implementation behind the bpf() syscall and the in-kernel callers
 * in this file (bpf_sys_bpf helper / kern_sys_bpf).
 *
 * @cmd:   BPF_* command selecting the handler in the switch below.
 * @uattr: attribute buffer; callers in this file wrap either a user
 *         pointer (USER_BPFPTR) or a kernel pointer (KERNEL_BPFPTR).
 * @size:  caller-declared size of the attribute buffer.
 *
 * Steps: vet @size with bpf_check_uarg_tail_zero(), clamp it to
 * sizeof(union bpf_attr), copy the attributes into a zeroed local copy,
 * let security_bpf() veto the call, then dispatch on @cmd.
 *
 * Returns the handler's result, -EFAULT if the copy fails, the error from
 * the size check or from security_bpf(), or -EINVAL for an unknown @cmd.
 */
static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
{
        union bpf_attr attr;
        int err;

        /* NOTE(review): presumably rejects non-zero bytes past sizeof(attr)
         * when the caller passes a larger struct -- confirm in
         * bpf_check_uarg_tail_zero().
         */
        err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
        if (err)
                return err;
        size = min_t(u32, size, sizeof(attr));

        /* copy attributes from user space, may be less than sizeof(bpf_attr) */
        /* zero first so fields beyond the copied 'size' bytes read as 0 */
        memset(&attr, 0, sizeof(attr));
        if (copy_from_bpfptr(&attr, uattr, size) != 0)
                return -EFAULT;

        /* security hook sees the copied attr; a negative result aborts */
        err = security_bpf(cmd, &attr, size);
        if (err < 0)
                return err;

        /* Handlers that also need the original buffer receive uattr (or its
         * .user member) in addition to the local copy.
         */
        switch (cmd) {
        case BPF_MAP_CREATE:
                err = map_create(&attr);
                break;
        case BPF_MAP_LOOKUP_ELEM:
                err = map_lookup_elem(&attr);
                break;
        case BPF_MAP_UPDATE_ELEM:
                err = map_update_elem(&attr, uattr);
                break;
        case BPF_MAP_DELETE_ELEM:
                err = map_delete_elem(&attr, uattr);
                break;
        case BPF_MAP_GET_NEXT_KEY:
                err = map_get_next_key(&attr);
                break;
        case BPF_MAP_FREEZE:
                err = map_freeze(&attr);
                break;
        case BPF_PROG_LOAD:
                err = bpf_prog_load(&attr, uattr, size);
                break;
        case BPF_OBJ_PIN:
                err = bpf_obj_pin(&attr);
                break;
        case BPF_OBJ_GET:
                err = bpf_obj_get(&attr);
                break;
        case BPF_PROG_ATTACH:
                err = bpf_prog_attach(&attr);
                break;
        case BPF_PROG_DETACH:
                err = bpf_prog_detach(&attr);
                break;
        case BPF_PROG_QUERY:
                err = bpf_prog_query(&attr, uattr.user);
                break;
        case BPF_PROG_TEST_RUN:
                err = bpf_prog_test_run(&attr, uattr.user);
                break;
        /* the *_GET_NEXT_ID commands share one helper, differing only in
         * which idr/lock pair they walk
         */
        case BPF_PROG_GET_NEXT_ID:
                err = bpf_obj_get_next_id(&attr, uattr.user,
                                          &prog_idr, &prog_idr_lock);
                break;
        case BPF_MAP_GET_NEXT_ID:
                err = bpf_obj_get_next_id(&attr, uattr.user,
                                          &map_idr, &map_idr_lock);
                break;
        case BPF_BTF_GET_NEXT_ID:
                err = bpf_obj_get_next_id(&attr, uattr.user,
                                          &btf_idr, &btf_idr_lock);
                break;
        case BPF_PROG_GET_FD_BY_ID:
                err = bpf_prog_get_fd_by_id(&attr);
                break;
        case BPF_MAP_GET_FD_BY_ID:
                err = bpf_map_get_fd_by_id(&attr);
                break;
        case BPF_OBJ_GET_INFO_BY_FD:
                err = bpf_obj_get_info_by_fd(&attr, uattr.user);
                break;
        case BPF_RAW_TRACEPOINT_OPEN:
                err = bpf_raw_tracepoint_open(&attr);
                break;
        case BPF_BTF_LOAD:
                err = bpf_btf_load(&attr, uattr, size);
                break;
        case BPF_BTF_GET_FD_BY_ID:
                err = bpf_btf_get_fd_by_id(&attr);
                break;
        case BPF_TASK_FD_QUERY:
                err = bpf_task_fd_query(&attr, uattr.user);
                break;
        case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
                err = map_lookup_and_delete_elem(&attr);
                break;
        /* the batch commands share one helper and pass their own cmd on */
        case BPF_MAP_LOOKUP_BATCH:
                err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH);
                break;
        case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
                err = bpf_map_do_batch(&attr, uattr.user,
                                       BPF_MAP_LOOKUP_AND_DELETE_BATCH);
                break;
        case BPF_MAP_UPDATE_BATCH:
                err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH);
                break;
        case BPF_MAP_DELETE_BATCH:
                err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH);
                break;
        case BPF_LINK_CREATE:
                err = link_create(&attr, uattr);
                break;
        case BPF_LINK_UPDATE:
                err = link_update(&attr);
                break;
        case BPF_LINK_GET_FD_BY_ID:
                err = bpf_link_get_fd_by_id(&attr);
                break;
        case BPF_LINK_GET_NEXT_ID:
                err = bpf_obj_get_next_id(&attr, uattr.user,
                                          &link_idr, &link_idr_lock);
                break;
        case BPF_ENABLE_STATS:
                err = bpf_enable_stats(&attr);
                break;
        case BPF_ITER_CREATE:
                err = bpf_iter_create(&attr);
                break;
        case BPF_LINK_DETACH:
                err = link_detach(&attr);
                break;
        case BPF_PROG_BIND_MAP:
                err = bpf_prog_bind_map(&attr);
                break;
        case BPF_TOKEN_CREATE:
                err = token_create(&attr);
                break;
        default:
                /* unknown command */
                err = -EINVAL;
                break;
        }

        return err;
}

/*
 * bpf() system call entry point: tags the user-supplied attribute pointer
 * with USER_BPFPTR() and hands everything to __sys_bpf().
 */
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
        return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
}

/*
 * ->is_valid_access callback of bpf_syscall_verifier_ops.
 *
 * An access is accepted when @off lies in [0, U16_MAX) and is a multiple
 * of @size.  @type, @prog and @info are not consulted.
 */
static bool syscall_prog_is_valid_access(int off, int size,
                                         enum bpf_access_type type,
                                         const struct bpf_prog *prog,
                                         struct bpf_insn_access_aux *info)
{
        /* range is tested first, so the modulo only runs for in-range offsets */
        return off >= 0 && off < U16_MAX && off % size == 0;
}

/*
 * bpf_sys_bpf helper: lets a BPF program issue a restricted set of bpf()
 * commands.  Commands on the list below are forwarded to __sys_bpf() with
 * @attr treated as a kernel pointer; anything else is -EINVAL.
 */
BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
{
        switch (cmd) {
        case BPF_MAP_CREATE:
        case BPF_MAP_DELETE_ELEM:
        case BPF_MAP_UPDATE_ELEM:
        case BPF_MAP_FREEZE:
        case BPF_MAP_GET_FD_BY_ID:
        case BPF_PROG_LOAD:
        case BPF_BTF_LOAD:
        case BPF_LINK_CREATE:
        case BPF_RAW_TRACEPOINT_OPEN:
                return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
        default:
                return -EINVAL;
        }
}


/* To shut up -Wmissing-prototypes.
 * This function is used by the kernel light skeleton
 * to load bpf programs when modules are loaded or during kernel boot.
 * See tools/lib/bpf/skel_internal.h
 */
int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);

/*
 * In-kernel bpf() entry point.
 *
 * BPF_PROG_TEST_RUN (only compiled in with CONFIG_BPF_JIT) is handled
 * here: the program named by attr->test.prog_fd must be of type
 * BPF_PROG_TYPE_SYSCALL, is run once with attr->test.ctx_in as its
 * context, and its return value is stored in attr->test.retval.
 * Returns 0 on that path, -EINVAL for unsupported test attributes or an
 * out-of-range ctx_size_in, -EBUSY when the enter hook reports recursion,
 * or the error from the prog lookup.
 *
 * Every other command goes to ____bpf_sys_bpf() (presumably the function
 * body generated by BPF_CALL_3(bpf_sys_bpf, ...) -- confirm against the
 * BPF_CALL_x macros), so it is subject to that helper's command list.
 */
int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
        struct bpf_prog * __maybe_unused prog;
        struct bpf_tramp_run_ctx __maybe_unused run_ctx;

        switch (cmd) {
#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
        case BPF_PROG_TEST_RUN:
                /* only prog_fd, ctx_in and ctx_size_in may be supplied */
                if (attr->test.data_in || attr->test.data_out ||
                    attr->test.ctx_out || attr->test.duration ||
                    attr->test.repeat || attr->test.flags)
                        return -EINVAL;

                prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL);
                if (IS_ERR(prog))
                        return PTR_ERR(prog);

                /* context must be at least max_ctx_offset bytes and no
                 * larger than U16_MAX
                 */
                if (attr->test.ctx_size_in < prog->aux->max_ctx_offset ||
                    attr->test.ctx_size_in > U16_MAX) {
                        bpf_prog_put(prog);
                        return -EINVAL;
                }

                run_ctx.bpf_cookie = 0;
                if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
                        /* recursion detected */
                        /* the exit hook is still called to pair with the enter above */
                        __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
                        bpf_prog_put(prog);
                        return -EBUSY;
                }
                attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
                __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
                                                &run_ctx);
                bpf_prog_put(prog);
                return 0;
#endif
        default:
                return ____bpf_sys_bpf(cmd, attr, size);
        }
}
EXPORT_SYMBOL(kern_sys_bpf);

/*
 * Helper prototype for bpf_sys_bpf(cmd, attr, attr_size): not GPL-only,
 * integer return.  Going by the ARG_* names, cmd is unconstrained, attr is
 * read-only memory and attr_size is its constant size.
 */
static const struct bpf_func_proto bpf_sys_bpf_proto = {
        .func                = bpf_sys_bpf,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_ANYTHING,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
};

/*
 * Weak default: resolves @func_id through bpf_base_func_proto() only.
 * Being __weak, a strong definition elsewhere replaces this one at link
 * time.
 */
const struct bpf_func_proto * __weak
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        return bpf_base_func_proto(func_id, prog);
}

/* bpf_sys_close helper: forwards @fd to close_fd() and returns its result. */
BPF_CALL_1(bpf_sys_close, u32, fd)
{
        /* When bpf program calls this helper there should not be
         * an fdget() without matching completed fdput().
         * This helper is allowed in the following callchain only:
         * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close
         */
        return close_fd(fd);
}

/*
 * Helper prototype for bpf_sys_close(fd): not GPL-only, integer return,
 * single argument typed ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_sys_close_proto = {
        .func                = bpf_sys_close,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_ANYTHING,
};

/*
 * bpf_kallsyms_lookup_name helper: resolve symbol @name to an address.
 *
 * @name must be a NUL-terminated string occupying exactly @name_sz bytes
 * (terminator included) with at least one character; @flags must be 0.
 * The caller's credentials have to pass bpf_dump_raw_ok().
 *
 * *@res always receives the value returned by kallsyms_lookup_name() once
 * the checks pass.  Returns 0 when that value is non-zero, -ENOENT when
 * it is zero, -EINVAL for bad arguments, -EPERM when the credential check
 * fails.
 */
BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
{
        u64 addr;

        /* flags is tested first so name[] is never read for a bad size */
        if (flags || name_sz <= 1 || name[name_sz - 1])
                return -EINVAL;

        if (!bpf_dump_raw_ok(current_cred()))
                return -EPERM;

        addr = kallsyms_lookup_name(name);
        *res = addr;

        return addr ? 0 : -ENOENT;
}

/*
 * Helper prototype for bpf_kallsyms_lookup_name(name, name_sz, flags, res):
 * not GPL-only, integer return.  Argument types are, in order: memory
 * pointer, its constant size (zero permitted by the type, though the
 * helper itself rejects name_sz <= 1), unconstrained value, pointer to a
 * long for the result.
 */
static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
        .func                = bpf_kallsyms_lookup_name,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_MEM,
        .arg2_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_LONG,
};

/*
 * ->get_func_proto callback of bpf_syscall_verifier_ops: map a helper ID
 * to its prototype.
 *
 * BPF_FUNC_sys_bpf is only offered when the program's token passes
 * bpf_token_capable(..., CAP_PERFMON); otherwise NULL is returned for it.
 * IDs not listed here fall through to tracing_prog_func_proto().
 */
static const struct bpf_func_proto *
syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_sys_bpf:
                if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
                        return NULL;
                return &bpf_sys_bpf_proto;
        case BPF_FUNC_btf_find_by_name_kind:
                return &bpf_btf_find_by_name_kind_proto;
        case BPF_FUNC_sys_close:
                return &bpf_sys_close_proto;
        case BPF_FUNC_kallsyms_lookup_name:
                return &bpf_kallsyms_lookup_name_proto;
        default:
                return tracing_prog_func_proto(func_id, prog);
        }
}

/*
 * Verifier callbacks wired to the two syscall_prog_* functions above:
 * helper-prototype lookup and context-access validation.
 */
const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
        .get_func_proto  = syscall_prog_func_proto,
        .is_valid_access = syscall_prog_is_valid_access,
};

/* Program ops paired with bpf_syscall_verifier_ops; only ->test_run is set. */
const struct bpf_prog_ops bpf_syscall_prog_ops = {
        .test_run = bpf_prog_test_run_syscall,
};

#ifdef CONFIG_SYSCTL
/*
 * proc handler for the "bpf_stats_enabled" sysctl.
 *
 * The value exposed to user space is the function-static saved_val, not
 * the static key's count.  I/O goes through a private ctl_table that
 * limits the integer to SYSCTL_ZERO..SYSCTL_ONE.  A successful write that
 * changes the value bumps (for non-zero) or drops (for zero) the static
 * key found in table->data and records the new value.
 *
 * Writes require CAP_SYS_ADMIN (-EPERM otherwise).  Returns the result of
 * proc_dointvec_minmax().
 */
static int bpf_stats_handler(struct ctl_table *table, int write,
                             void *buffer, size_t *lenp, loff_t *ppos)
{
        struct static_key *key = (struct static_key *)table->data;
        static int saved_val;
        int requested, err;
        struct ctl_table shadow = {
                .data   = &requested,
                .maxlen = sizeof(requested),
                .mode   = table->mode,
                .extra1 = SYSCTL_ZERO,
                .extra2 = SYSCTL_ONE,
        };

        if (write && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        mutex_lock(&bpf_stats_enabled_mutex);

        requested = saved_val;
        err = proc_dointvec_minmax(&shadow, write, buffer, lenp, ppos);

        /* nothing to apply on reads, parse failures, or no-op writes */
        if (!write || err || requested == saved_val)
                goto unlock;

        if (requested)
                static_key_slow_inc(key);
        else
                static_key_slow_dec(key);
        saved_val = requested;

unlock:
        mutex_unlock(&bpf_stats_enabled_mutex);
        return err;
}

/*
 * Weak no-op default.  bpf_unpriv_handler() calls this after writes to the
 * unprivileged_bpf_disabled sysctl; a strong definition elsewhere, if any,
 * takes precedence at link time.
 */
void __weak unpriv_ebpf_notify(int new_state)
{
}

/*
 * proc handler for the "unprivileged_bpf_disabled" sysctl.
 *
 * The integer behind table->data is read and written through a local
 * copy, so a rejected write never touches the live value.  Once the live
 * value is 1, a write of anything other than 1 fails with -EPERM (and
 * skips the notification below).
 *
 * After any other write attempt -- including one that
 * proc_dointvec_minmax() rejected -- unpriv_ebpf_notify() is called with
 * the local value.
 *
 * Writes require CAP_SYS_ADMIN (-EPERM otherwise).  Returns the result of
 * proc_dointvec_minmax() unless one of the -EPERM cases applies.
 */
static int bpf_unpriv_handler(struct ctl_table *table, int write,
                              void *buffer, size_t *lenp, loff_t *ppos)
{
        int *setting = (int *)table->data;
        int requested = *setting;
        bool was_locked = (requested == 1);
        struct ctl_table shadow = *table;
        int err;

        if (write && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        shadow.data = &requested;
        err = proc_dointvec_minmax(&shadow, write, buffer, lenp, ppos);
        if (!write)
                return err;

        if (!err) {
                /* value 1 is sticky: it cannot be changed once set */
                if (was_locked && requested != 1)
                        return -EPERM;
                *setting = requested;
        }

        unpriv_ebpf_notify(requested);

        return err;
}

/*
 * sysctl entries owned by this file (registered by
 * bpf_syscall_sysctl_init()):
 *
 *  - unprivileged_bpf_disabled: integer limited to SYSCTL_ZERO..SYSCTL_TWO,
 *    served by bpf_unpriv_handler().
 *  - bpf_stats_enabled: .data is the static key itself rather than an int;
 *    bpf_stats_handler() supplies its own value storage and bounds.
 */
static struct ctl_table bpf_syscall_table[] = {
        {
                .procname        = "unprivileged_bpf_disabled",
                .data                = &sysctl_unprivileged_bpf_disabled,
                .maxlen                = sizeof(sysctl_unprivileged_bpf_disabled),
                .mode                = 0644,
                .proc_handler        = bpf_unpriv_handler,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_TWO,
        },
        {
                .procname        = "bpf_stats_enabled",
                .data                = &bpf_stats_enabled_key.key,
                .mode                = 0644,
                .proc_handler        = bpf_stats_handler,
        },
};

/*
 * Late initcall: register bpf_syscall_table under the "kernel" sysctl
 * directory.  Always returns 0.
 */
static int __init bpf_syscall_sysctl_init(void)
{
        register_sysctl_init("kernel", bpf_syscall_table);
        return 0;
}
late_initcall(bpf_syscall_sysctl_init);
#endif /* CONFIG_SYSCTL */














































    1 





    1 

    1 



















    1 


























































































































































    1 





    1 








    1 


















































































































































































































































































































































































































































































































































    1 



    1 















    1 














    1 














    1 






    1 

























































































































































    1 




    1 












































































































































































































































































































































































    1 


























































































    1 











































    1 
















































    1 



























































































































































    1 



































































    1 

























    1 



    1 























    1 















    1 






    1 


















































































































    1 

    1 








































































    1 
















    1 





























































































































































































































    1 

































































































































    1 

    1 















































    1 



    1 




















































    1 
    1 


























































































































































































































































































































































































































































































































































































































    1 






















































































































































































































































































































    1 






















    1 


































































    1 
























































































































































































































    1 






































































































    1 


































































































































































































































































































    1 




    1 

























    1 


















    1 










    1 
    1 
    1 
    1 
    1 


    1 







    1 
    1 





























    1 
    1 

    1 









































    1 







    1 










































































































































    1 










    1 





















    1 

    1 


    1 









































































































































































































































































































































































































































































































































































    1 


















    1 
    1 



    1 


    1 





















    1 



















    1 

























































    1 























    1 







































    1 










    1 


    1 




    1 





















































































    1 













    1 





























































































































    1 

    1 






    1 


















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2002-2005, Instant802 Networks, Inc.
 * Copyright 2005-2006, Devicescape Software, Inc.
 * Copyright 2006-2007        Jiri Benc <jbenc@suse.cz>
 * Copyright 2007-2010        Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright(c) 2015 - 2017 Intel Deutschland GmbH
 * Copyright (C) 2018-2024 Intel Corporation
 */

#include <linux/jiffies.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/kcov.h>
#include <linux/bitops.h>
#include <kunit/visibility.h>
#include <net/mac80211.h>
#include <net/ieee80211_radiotap.h>
#include <asm/unaligned.h>

#include "ieee80211_i.h"
#include "driver-ops.h"
#include "led.h"
#include "mesh.h"
#include "wep.h"
#include "wpa.h"
#include "tkip.h"
#include "wme.h"
#include "rate.h"

/*
 * monitor mode reception
 *
 * ieee80211_clean_skb - strip everything that only monitoring needed
 * @skb: received frame, still carrying the radiotap space and possibly an FCS
 * @present_fcs_len: number of FCS bytes at the tail of @skb (0 if none)
 * @rtap_space: number of bytes to pull from the head of @skb
 *
 * Trims the FCS, pulls @rtap_space bytes off the front, clears the RX
 * status flags that described data living in that space, and, for
 * management frames with the Order bit set, cuts the HT-Control field
 * out of the header.
 *
 * Returns @skb on success.  Returns NULL, after freeing @skb, if the
 * header could not be made linear for the HT-Control removal.
 */
static struct sk_buff *ieee80211_clean_skb(struct sk_buff *skb,
                                           unsigned int present_fcs_len,
                                           unsigned int rtap_space)
{
        struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb);
        struct ieee80211_hdr *mac_hdr;
        unsigned int mac_hdrlen;
        __le16 fctl;

        /* Tail first (FCS), then head (radiotap space). */
        if (present_fcs_len != 0)
                __pskb_trim(skb, skb->len - present_fcs_len);
        pskb_pull(skb, rtap_space);

        /*
         * The data these flags referred to was located in the space that
         * has just been pulled, so none of them may stay set.
         */
        rx_status->flag &= ~(RX_FLAG_RADIOTAP_HE |
                             RX_FLAG_RADIOTAP_HE_MU |
                             RX_FLAG_RADIOTAP_LSIG |
                             RX_FLAG_RADIOTAP_TLV_AT_END);

        mac_hdr = (void *)skb->data;
        fctl = mac_hdr->frame_control;

        /*
         * Only management frames that have the Order bit set carry an
         * HT-Control field that needs removing.  It is not needed once
         * monitoring has seen the frame, and the rest of the code assumes
         * a fixed management header length, so everything else is done.
         */
        if (likely(!(ieee80211_is_mgmt(fctl) && ieee80211_has_order(fctl))))
                return skb;

        /* Header length is computed while the Order bit is still set. */
        mac_hdrlen = ieee80211_hdrlen(fctl);
        mac_hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_ORDER);

        if (!pskb_may_pull(skb, mac_hdrlen)) {
                dev_kfree_skb(skb);
                return NULL;
        }

        /*
         * Slide the leading part of the header forward over the trailing
         * IEEE80211_HT_CTL_LEN bytes, then drop the now-stale bytes at
         * the front.
         */
        memmove(skb->data + IEEE80211_HT_CTL_LEN, skb->data,
                mac_hdrlen - IEEE80211_HT_CTL_LEN);
        pskb_pull(skb, IEEE80211_HT_CTL_LEN);

        return skb;
}

/*
 * should_drop_frame - decide whether a received frame is to be dropped
 * @skb: received frame; the 802.11 header starts @rtap_space bytes in
 * @present_fcs_len: number of FCS bytes still at the tail of @skb
 * @rtap_space: number of bytes in front of the 802.11 header
 *
 * Returns true when the RX status carries one of the failed-FCS,
 * failed-PLCP, only-monitor or no-PSDU flags, when the frame is shorter
 * than 16 bytes plus FCS and radiotap space, or when it is a control
 * frame other than PS-Poll or BlockAck request.  Returns false otherwise.
 */
static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len,
                                     unsigned int rtap_space)
{
        struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb);
        struct ieee80211_hdr *mac_hdr = (void *)(skb->data + rtap_space);

        if (rx_status->flag & (RX_FLAG_NO_PSDU |
                               RX_FLAG_ONLY_MONITOR |
                               RX_FLAG_FAILED_PLCP_CRC |
                               RX_FLAG_FAILED_FCS_CRC))
                return true;

        /* The header below is only looked at once the length is verified. */
        if (unlikely(skb->len < 16 + present_fcs_len + rtap_space))
                return true;

        /* Of the control frames, only PS-Poll and BAR are let through. */
        return ieee80211_is_ctl(mac_hdr->frame_control) &&
               !ieee80211_is_pspoll(mac_hdr->frame_control) &&
               !ieee80211_is_back_req(mac_hdr->frame_control);
}

/*
 * ieee80211_rx_radiotap_hdrlen - compute the radiotap header length
 * @local: device state; consulted for the SIGNAL_DBM hardware flag and
 *         for hw.radiotap_timestamp.units_pos
 * @status: RX status of the frame; its chains, encoding and flag members
 *          select which parts are counted
 * @skb: the received frame; only inspected when RX_FLAG_RADIOTAP_TLV_AT_END
 *       is set, to measure the TLV data sitting in front of the MAC header
 *
 * Returns the accumulated length in bytes.
 *
 * The order of the steps matters: every ALIGN() pads the running total,
 * so moving a step changes the result.
 * NOTE(review): presumably this has to stay in sync with the layout
 * produced by ieee80211_add_rx_radiotap_header() - confirm there.
 */
static int
ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
                             struct ieee80211_rx_status *status,
                             struct sk_buff *skb)
{
        int len;

        /* always present fields */
        len = sizeof(struct ieee80211_radiotap_header) + 8;

        /* allocate extra bitmaps: 4 bytes for every bit set in chains */
        if (status->chains)
                len += 4 * hweight8(status->chains);

        /* 8 bytes on an 8-byte boundary when an RX timestamp is available */
        if (ieee80211_have_rx_timestamp(status)) {
                len = ALIGN(len, 8);
                len += 8;
        }
        /* one byte if the hardware reports the signal in dBm */
        if (ieee80211_hw_check(&local->hw, SIGNAL_DBM))
                len += 1;

        /* antenna field, if we don't have per-chain info */
        if (!status->chains)
                len += 1;

        /* padding for RX_FLAGS if necessary */
        len = ALIGN(len, 2);

        if (status->encoding == RX_ENC_HT) /* HT info */
                len += 3;

        /* A-MPDU details: 8 bytes on a 4-byte boundary */
        if (status->flag & RX_FLAG_AMPDU_DETAILS) {
                len = ALIGN(len, 4);
                len += 8;
        }

        /* VHT info: 12 bytes on a 2-byte boundary */
        if (status->encoding == RX_ENC_VHT) {
                len = ALIGN(len, 2);
                len += 12;
        }

        /* timestamp field, enabled by a non-negative units_pos */
        if (local->hw.radiotap_timestamp.units_pos >= 0) {
                len = ALIGN(len, 8);
                len += 12;
        }

        /*
         * HE and HE-MU are only counted for HE-encoded frames; the
         * BUILD_BUG_ON()s tie the literal 12 to the structure sizes.
         */
        if (status->encoding == RX_ENC_HE &&
            status->flag & RX_FLAG_RADIOTAP_HE) {
                len = ALIGN(len, 2);
                len += 12;
                BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he) != 12);
        }

        if (status->encoding == RX_ENC_HE &&
            status->flag & RX_FLAG_RADIOTAP_HE_MU) {
                len = ALIGN(len, 2);
                len += 12;
                BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he_mu) != 12);
        }

        /* one byte when the frame is flagged as having no PSDU */
        if (status->flag & RX_FLAG_NO_PSDU)
                len += 1;

        /* L-SIG: 4 bytes on a 2-byte boundary (size checked at build time) */
        if (status->flag & RX_FLAG_RADIOTAP_LSIG) {
                len = ALIGN(len, 2);
                len += 4;
                BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_lsig) != 4);
        }

        if (status->chains) {
                /* antenna and antenna signal fields */
                len += 2 * hweight8(status->chains);
        }

        if (status->flag & RX_FLAG_RADIOTAP_TLV_AT_END) {
                int tlv_offset = 0;

                /*
                 * The position to look at depends on the existence (or non-
                 * existence) of other elements, so take that into account...
                 *
                 * Unlike the length accounting above, these three checks
                 * look at the flags alone and not at status->encoding.
                 */
                if (status->flag & RX_FLAG_RADIOTAP_HE)
                        tlv_offset +=
                                sizeof(struct ieee80211_radiotap_he);
                if (status->flag & RX_FLAG_RADIOTAP_HE_MU)
                        tlv_offset +=
                                sizeof(struct ieee80211_radiotap_he_mu);
                if (status->flag & RX_FLAG_RADIOTAP_LSIG)
                        tlv_offset +=
                                sizeof(struct ieee80211_radiotap_lsig);

                /* ensure 4 byte alignment for TLV */
                len = ALIGN(len, 4);

                /*
                 * TLVs until the mac header: their size is the distance
                 * from skb->data + tlv_offset up to the MAC header.
                 */
                len += skb_mac_header(skb) - &skb->data[tlv_offset];
        }

        return len;
}

/*
 * __ieee80211_queue_skb_to_iface - hand a frame to an interface's work item
 * @sdata: interface whose skb_queue receives the frame
 * @link_id: link the frame belongs to, or a negative value for "no link"
 * @sta: station to account the frame to, may be NULL
 * @skb: frame to queue
 *
 * Records the link information in the frame's RX status, appends the
 * frame to @sdata's skb_queue and queues @sdata's work on the wiphy.  If
 * @sta is given, its default-link RX packet counter is incremented.
 */
static void __ieee80211_queue_skb_to_iface(struct ieee80211_sub_if_data *sdata,
                                           int link_id,
                                           struct sta_info *sta,
                                           struct sk_buff *skb)
{
        struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb);

        /* The RX status is filled in before the skb is put on the queue. */
        if (link_id < 0) {
                rx_status->link_valid = 0;
        } else {
                rx_status->link_id = link_id;
                rx_status->link_valid = 1;
        }

        skb_queue_tail(&sdata->skb_queue, skb);
        wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work);

        if (!sta)
                return;

        sta->deflink.rx_stats.packets++;
}

/*
 * ieee80211_queue_skb_to_iface - queue a frame with its protocol cleared
 * @sdata: interface to queue the frame to
 * @link_id: link the frame belongs to, or a negative value for "no link"
 * @sta: station to account the frame to, may be NULL
 * @skb: frame to queue
 *
 * Sets skb->protocol to 0 and then defers to
 * __ieee80211_queue_skb_to_iface().
 * NOTE(review): presumably a zero protocol lets the consumer of the
 * queue tell these frames apart from ones queued with a protocol set -
 * confirm against the code that drains sdata->skb_queue.
 */
static void ieee80211_queue_skb_to_iface(struct ieee80211_sub_if_data *sdata,
                                         int link_id,
                                         struct sta_info *sta,
                                         struct sk_buff *skb)
{
        skb->protocol = 0;
        __ieee80211_queue_skb_to_iface(sdata, link_id, sta, skb);
}

static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata,
                                         struct sk_buff *skb,
                                         int rtap_space)
{
        struct {
                struct ieee80211_hdr_3addr hdr;
                u8 category;
                u8 action_code;
        } __packed __aligned(2) action;

        if (!sdata)
                return;

        BUILD_BUG_ON(sizeof(action) != IEEE80211_MIN_ACTION_SIZE + 1);

        if (skb->len < rtap_space + sizeof(action) +
                       VHT_MUMIMO_GROUPS_DATA_LEN)
                return;

        if (!is_valid_ether_addr(sdata->u.mntr.mu_follow_addr))
                return;

        skb_copy_bits(skb, rtap_space, &action, sizeof(action));

        if (!ieee80211_is_action(action.hdr.frame_control))
                return;

        if (action.category != WLAN_CATEGORY_VHT)
                return;

        if (action.action_code != WLAN_VHT_ACTION_GROUPID_MGMT)
                return;

        if (!ether_addr_equal(action.hdr.addr1, sdata->u.mntr.mu_follow_addr))
                return;

        skb = skb_copy(skb, GFP_ATOMIC);
        if (!skb)
                return;

        ieee80211_queue_skb_to_iface(sdata, -1, NULL, skb);
}

/*
 * ieee80211_add_rx_radiotap_header - add radiotap header
 *
 * Add a radiotap header containing all the fields which the hardware provided.
 *
 * local:    hardware context; supplies hw flags (RX_INCLUDES_FCS, SIGNAL_DBM)
 *           and the radiotap MCS/VHT/timestamp details.
 * skb:      frame to prepend the header to. On entry skb->data starts with
 *           the HE, HE-MU and L-SIG structures (in that order) for whichever
 *           of RX_FLAG_RADIOTAP_HE / _HE_MU / _LSIG are set in the RX status;
 *           they are copied out and pulled. With RX_FLAG_RADIOTAP_TLV_AT_END
 *           the bytes between the resulting data pointer and the MAC header
 *           are counted as TLVs and left where they are.
 * rate:     legacy rate of the frame; may be NULL, in which case no RATE
 *           field is reported.
 * rtap_len: total radiotap header length, written to it_len. The function
 *           pushes and zeroes (rtap_len - TLV length) bytes.
 * has_fcs:  together with the hardware's RX_INCLUDES_FCS flag decides whether
 *           the FCS flag is reported and whether FCS_LEN is added to the
 *           MPDU length used for the timestamp calculation.
 *
 * NOTE(review): nothing here checks that the fields written fit in rtap_len
 * or in the skb headroom - presumably the caller sizes both using the
 * matching header-length helper; confirm.
 */
static void
ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
                                 struct sk_buff *skb,
                                 struct ieee80211_rate *rate,
                                 int rtap_len, bool has_fcs)
{
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        struct ieee80211_radiotap_header *rthdr;
        unsigned char *pos;
        __le32 *it_present;
        u32 it_present_val;
        u16 rx_flags = 0;
        u16 channel_flags = 0;
        u32 tlvs_len = 0;
        int mpdulen, chain;
        unsigned long chains = status->chains;
        struct ieee80211_radiotap_he he = {};
        struct ieee80211_radiotap_he_mu he_mu = {};
        struct ieee80211_radiotap_lsig lsig = {};

        /*
         * Save the driver-prepended HE / HE-MU / L-SIG data into locals and
         * strip it from the skb; it is written back into the radiotap header
         * further below.
         */
        if (status->flag & RX_FLAG_RADIOTAP_HE) {
                he = *(struct ieee80211_radiotap_he *)skb->data;
                skb_pull(skb, sizeof(he));
                WARN_ON_ONCE(status->encoding != RX_ENC_HE);
        }

        if (status->flag & RX_FLAG_RADIOTAP_HE_MU) {
                he_mu = *(struct ieee80211_radiotap_he_mu *)skb->data;
                skb_pull(skb, sizeof(he_mu));
        }

        if (status->flag & RX_FLAG_RADIOTAP_LSIG) {
                lsig = *(struct ieee80211_radiotap_lsig *)skb->data;
                skb_pull(skb, sizeof(lsig));
        }

        if (status->flag & RX_FLAG_RADIOTAP_TLV_AT_END) {
                /* data now points at the TLVs; all other info was pulled off */
                tlvs_len = skb_mac_header(skb) - skb->data;
        }

        /* MPDU length for the timestamp: add the FCS if it is not in the skb */
        mpdulen = skb->len;
        if (!(has_fcs && ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)))
                mpdulen += FCS_LEN;

        /*
         * Only the part of the header in front of the TLVs is pushed; the
         * TLVs themselves are already in place. The pushed area is zeroed,
         * which the "|=" writes and the bare "pos++" padding steps below
         * rely on.
         */
        rthdr = skb_push(skb, rtap_len - tlvs_len);
        memset(rthdr, 0, rtap_len - tlvs_len);
        it_present = &rthdr->it_present;

        /* radiotap header, set always present flags */
        rthdr->it_len = cpu_to_le16(rtap_len);
        it_present_val = BIT(IEEE80211_RADIOTAP_FLAGS) |
                         BIT(IEEE80211_RADIOTAP_CHANNEL) |
                         BIT(IEEE80211_RADIOTAP_RX_FLAGS);

        /* without per-chain data a single antenna field is reported */
        if (!status->chains)
                it_present_val |= BIT(IEEE80211_RADIOTAP_ANTENNA);

        /*
         * One extra it_present word per chain: the current word is marked as
         * extended and written out, then the next word starts with the
         * per-chain antenna + signal bits.
         */
        for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) {
                it_present_val |=
                        BIT(IEEE80211_RADIOTAP_EXT) |
                        BIT(IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE);
                put_unaligned_le32(it_present_val, it_present);
                it_present++;
                it_present_val = BIT(IEEE80211_RADIOTAP_ANTENNA) |
                                 BIT(IEEE80211_RADIOTAP_DBM_ANTSIGNAL);
        }

        /* the TLV bit goes into the last it_present word */
        if (status->flag & RX_FLAG_RADIOTAP_TLV_AT_END)
                it_present_val |= BIT(IEEE80211_RADIOTAP_TLV);

        put_unaligned_le32(it_present_val, it_present);

        /* This references through an offset into it_optional[] rather
         * than via it_present otherwise later uses of pos will cause
         * the compiler to think we have walked past the end of the
         * struct member.
         */
        pos = (void *)&rthdr->it_optional[it_present + 1 - rthdr->it_optional];

        /* the order of the following fields is important */

        /* IEEE80211_RADIOTAP_TSFT */
        if (ieee80211_have_rx_timestamp(status)) {
                /* padding */
                while ((pos - (u8 *)rthdr) & 7)
                        *pos++ = 0;
                put_unaligned_le64(
                        ieee80211_calculate_rx_timestamp(local, status,
                                                         mpdulen, 0),
                        pos);
                rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_TSFT));
                pos += 8;
        }

        /* IEEE80211_RADIOTAP_FLAGS */
        if (has_fcs && ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS))
                *pos |= IEEE80211_RADIOTAP_F_FCS;
        if (status->flag & (RX_FLAG_FAILED_FCS_CRC | RX_FLAG_FAILED_PLCP_CRC))
                *pos |= IEEE80211_RADIOTAP_F_BADFCS;
        if (status->enc_flags & RX_ENC_FLAG_SHORTPRE)
                *pos |= IEEE80211_RADIOTAP_F_SHORTPRE;
        pos++;

        /* IEEE80211_RADIOTAP_RATE */
        if (!rate || status->encoding != RX_ENC_LEGACY) {
                /*
                 * Without rate information don't add it. If we have,
                 * MCS information is a separate field in radiotap,
                 * added below. The byte here is needed as padding
                 * for the channel though, so initialise it to 0.
                 */
                *pos = 0;
        } else {
                int shift = 0;
                rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_RATE));
                /* 10 MHz and 5 MHz channels divide the rate by 2 and 4 */
                if (status->bw == RATE_INFO_BW_10)
                        shift = 1;
                else if (status->bw == RATE_INFO_BW_5)
                        shift = 2;
                *pos = DIV_ROUND_UP(rate->bitrate, 5 * (1 << shift));
        }
        pos++;

        /* IEEE80211_RADIOTAP_CHANNEL */
        /* TODO: frequency offset in KHz */
        put_unaligned_le16(status->freq, pos);
        pos += 2;
        if (status->bw == RATE_INFO_BW_10)
                channel_flags |= IEEE80211_CHAN_HALF;
        else if (status->bw == RATE_INFO_BW_5)
                channel_flags |= IEEE80211_CHAN_QUARTER;

        /* modulation/band flags: decided by band, then encoding, then rate */
        if (status->band == NL80211_BAND_5GHZ ||
            status->band == NL80211_BAND_6GHZ)
                channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_5GHZ;
        else if (status->encoding != RX_ENC_LEGACY)
                channel_flags |= IEEE80211_CHAN_DYN | IEEE80211_CHAN_2GHZ;
        else if (rate && rate->flags & IEEE80211_RATE_ERP_G)
                channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_2GHZ;
        else if (rate)
                channel_flags |= IEEE80211_CHAN_CCK | IEEE80211_CHAN_2GHZ;
        else
                channel_flags |= IEEE80211_CHAN_2GHZ;
        put_unaligned_le16(channel_flags, pos);
        pos += 2;

        /* IEEE80211_RADIOTAP_DBM_ANTSIGNAL */
        if (ieee80211_hw_check(&local->hw, SIGNAL_DBM) &&
            !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
                *pos = status->signal;
                rthdr->it_present |=
                        cpu_to_le32(BIT(IEEE80211_RADIOTAP_DBM_ANTSIGNAL));
                pos++;
        }

        /* IEEE80211_RADIOTAP_LOCK_QUALITY is missing */

        if (!status->chains) {
                /* IEEE80211_RADIOTAP_ANTENNA */
                *pos = status->antenna;
                pos++;
        }

        /* IEEE80211_RADIOTAP_DB_ANTNOISE is not used */

        /* IEEE80211_RADIOTAP_RX_FLAGS */
        /* ensure 2 byte alignment for the 2 byte field as required */
        if ((pos - (u8 *)rthdr) & 1)
                *pos++ = 0;
        if (status->flag & RX_FLAG_FAILED_PLCP_CRC)
                rx_flags |= IEEE80211_RADIOTAP_F_RX_BADPLCP;
        put_unaligned_le16(rx_flags, pos);
        pos += 2;

        /* IEEE80211_RADIOTAP_MCS: three bytes - known, flags, MCS index */
        if (status->encoding == RX_ENC_HT) {
                unsigned int stbc;

                rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS));
                *pos = local->hw.radiotap_mcs_details;
                if (status->enc_flags & RX_ENC_FLAG_HT_GF)
                        *pos |= IEEE80211_RADIOTAP_MCS_HAVE_FMT;
                if (status->enc_flags & RX_ENC_FLAG_LDPC)
                        *pos |= IEEE80211_RADIOTAP_MCS_HAVE_FEC;
                pos++;
                *pos = 0;
                if (status->enc_flags & RX_ENC_FLAG_SHORT_GI)
                        *pos |= IEEE80211_RADIOTAP_MCS_SGI;
                if (status->bw == RATE_INFO_BW_40)
                        *pos |= IEEE80211_RADIOTAP_MCS_BW_40;
                if (status->enc_flags & RX_ENC_FLAG_HT_GF)
                        *pos |= IEEE80211_RADIOTAP_MCS_FMT_GF;
                if (status->enc_flags & RX_ENC_FLAG_LDPC)
                        *pos |= IEEE80211_RADIOTAP_MCS_FEC_LDPC;
                stbc = (status->enc_flags & RX_ENC_FLAG_STBC_MASK) >> RX_ENC_FLAG_STBC_SHIFT;
                *pos |= stbc << IEEE80211_RADIOTAP_MCS_STBC_SHIFT;
                pos++;
                *pos++ = status->rate_idx;
        }

        /* IEEE80211_RADIOTAP_AMPDU_STATUS: reference, flags, CRC, reserved */
        if (status->flag & RX_FLAG_AMPDU_DETAILS) {
                u16 flags = 0;

                /* ensure 4 byte alignment */
                while ((pos - (u8 *)rthdr) & 3)
                        pos++;
                rthdr->it_present |=
                        cpu_to_le32(BIT(IEEE80211_RADIOTAP_AMPDU_STATUS));
                put_unaligned_le32(status->ampdu_reference, pos);
                pos += 4;
                if (status->flag & RX_FLAG_AMPDU_LAST_KNOWN)
                        flags |= IEEE80211_RADIOTAP_AMPDU_LAST_KNOWN;
                if (status->flag & RX_FLAG_AMPDU_IS_LAST)
                        flags |= IEEE80211_RADIOTAP_AMPDU_IS_LAST;
                if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_ERROR)
                        flags |= IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_ERR;
                if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_KNOWN)
                        flags |= IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_KNOWN;
                if (status->flag & RX_FLAG_AMPDU_EOF_BIT_KNOWN)
                        flags |= IEEE80211_RADIOTAP_AMPDU_EOF_KNOWN;
                if (status->flag & RX_FLAG_AMPDU_EOF_BIT)
                        flags |= IEEE80211_RADIOTAP_AMPDU_EOF;
                put_unaligned_le16(flags, pos);
                pos += 2;
                if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_KNOWN)
                        *pos++ = status->ampdu_delimiter_crc;
                else
                        *pos++ = 0;
                *pos++ = 0;
        }

        /* IEEE80211_RADIOTAP_VHT */
        if (status->encoding == RX_ENC_VHT) {
                u16 known = local->hw.radiotap_vht_details;

                rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT));
                put_unaligned_le16(known, pos);
                pos += 2;
                /* flags */
                if (status->enc_flags & RX_ENC_FLAG_SHORT_GI)
                        *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI;
                /* in VHT, STBC is binary */
                if (status->enc_flags & RX_ENC_FLAG_STBC_MASK)
                        *pos |= IEEE80211_RADIOTAP_VHT_FLAG_STBC;
                if (status->enc_flags & RX_ENC_FLAG_BF)
                        *pos |= IEEE80211_RADIOTAP_VHT_FLAG_BEAMFORMED;
                pos++;
                /* bandwidth */
                switch (status->bw) {
                case RATE_INFO_BW_80:
                        *pos++ = 4;
                        break;
                case RATE_INFO_BW_160:
                        *pos++ = 11;
                        break;
                case RATE_INFO_BW_40:
                        *pos++ = 1;
                        break;
                default:
                        *pos++ = 0;
                }
                /* MCS/NSS: only the first of the four bytes is filled in */
                *pos = (status->rate_idx << 4) | status->nss;
                pos += 4;
                /* coding field */
                if (status->enc_flags & RX_ENC_FLAG_LDPC)
                        *pos |= IEEE80211_RADIOTAP_CODING_LDPC_USER0;
                pos++;
                /* group ID */
                pos++;
                /* partial_aid */
                pos += 2;
        }

        /* IEEE80211_RADIOTAP_TIMESTAMP: only if the driver set units_pos */
        if (local->hw.radiotap_timestamp.units_pos >= 0) {
                u16 accuracy = 0;
                u8 flags;
                u64 ts;

                rthdr->it_present |=
                        cpu_to_le32(BIT(IEEE80211_RADIOTAP_TIMESTAMP));

                /* ensure 8 byte alignment */
                while ((pos - (u8 *)rthdr) & 7)
                        pos++;

                /* mactime is used only when flagged as a 64-bit radiotap ts */
                if (status->flag & RX_FLAG_MACTIME_IS_RTAP_TS64) {
                        flags = IEEE80211_RADIOTAP_TIMESTAMP_FLAG_64BIT;
                        ts = status->mactime;
                } else {
                        flags = IEEE80211_RADIOTAP_TIMESTAMP_FLAG_32BIT;
                        ts = status->device_timestamp;
                }

                put_unaligned_le64(ts, pos);
                pos += sizeof(u64);

                if (local->hw.radiotap_timestamp.accuracy >= 0) {
                        accuracy = local->hw.radiotap_timestamp.accuracy;
                        flags |= IEEE80211_RADIOTAP_TIMESTAMP_FLAG_ACCURACY;
                }
                put_unaligned_le16(accuracy, pos);
                pos += sizeof(u16);

                *pos++ = local->hw.radiotap_timestamp.units_pos;
                *pos++ = flags;
        }

        /*
         * IEEE80211_RADIOTAP_HE: start from the driver-provided structure
         * saved above and merge in what the RX status knows.
         */
        if (status->encoding == RX_ENC_HE &&
            status->flag & RX_FLAG_RADIOTAP_HE) {
#define HE_PREP(f, val)        le16_encode_bits(val, IEEE80211_RADIOTAP_HE_##f)

                if (status->enc_flags & RX_ENC_FLAG_STBC_MASK) {
                        he.data6 |= HE_PREP(DATA6_NSTS,
                                            FIELD_GET(RX_ENC_FLAG_STBC_MASK,
                                                      status->enc_flags));
                        he.data3 |= HE_PREP(DATA3_STBC, 1);
                } else {
                        he.data6 |= HE_PREP(DATA6_NSTS, status->nss);
                }

                /* he_gi is copied verbatim below, so the enums must match */
#define CHECK_GI(s) \
        BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_GI_##s != \
                     (int)NL80211_RATE_INFO_HE_GI_##s)

                CHECK_GI(0_8);
                CHECK_GI(1_6);
                CHECK_GI(3_2);

                he.data3 |= HE_PREP(DATA3_DATA_MCS, status->rate_idx);
                he.data3 |= HE_PREP(DATA3_DATA_DCM, status->he_dcm);
                he.data3 |= HE_PREP(DATA3_CODING,
                                    !!(status->enc_flags & RX_ENC_FLAG_LDPC));

                he.data5 |= HE_PREP(DATA5_GI, status->he_gi);

                switch (status->bw) {
                case RATE_INFO_BW_20:
                        he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
                                            IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ);
                        break;
                case RATE_INFO_BW_40:
                        he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
                                            IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ);
                        break;
                case RATE_INFO_BW_80:
                        he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
                                            IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ);
                        break;
                case RATE_INFO_BW_160:
                        he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
                                            IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ);
                        break;
                case RATE_INFO_BW_HE_RU:
                        /* the "+ 4" mapping used below is verified at build time */
#define CHECK_RU_ALLOC(s) \
        BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_##s##T != \
                     NL80211_RATE_INFO_HE_RU_ALLOC_##s + 4)

                        CHECK_RU_ALLOC(26);
                        CHECK_RU_ALLOC(52);
                        CHECK_RU_ALLOC(106);
                        CHECK_RU_ALLOC(242);
                        CHECK_RU_ALLOC(484);
                        CHECK_RU_ALLOC(996);
                        CHECK_RU_ALLOC(2x996);

                        he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
                                            status->he_ru + 4);
                        break;
                default:
                        WARN_ONCE(1, "Invalid SU BW %d\n", status->bw);
                }

                /* ensure 2 byte alignment */
                while ((pos - (u8 *)rthdr) & 1)
                        pos++;
                rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_HE));
                memcpy(pos, &he, sizeof(he));
                pos += sizeof(he);
        }

        /* IEEE80211_RADIOTAP_HE_MU: driver data is passed through unchanged */
        if (status->encoding == RX_ENC_HE &&
            status->flag & RX_FLAG_RADIOTAP_HE_MU) {
                /* ensure 2 byte alignment */
                while ((pos - (u8 *)rthdr) & 1)
                        pos++;
                rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_HE_MU));
                memcpy(pos, &he_mu, sizeof(he_mu));
                pos += sizeof(he_mu);
        }

        /* IEEE80211_RADIOTAP_ZERO_LEN_PSDU */
        if (status->flag & RX_FLAG_NO_PSDU) {
                rthdr->it_present |=
                        cpu_to_le32(BIT(IEEE80211_RADIOTAP_ZERO_LEN_PSDU));
                *pos++ = status->zero_length_psdu_type;
        }

        /* IEEE80211_RADIOTAP_LSIG: driver data is passed through unchanged */
        if (status->flag & RX_FLAG_RADIOTAP_LSIG) {
                /* ensure 2 byte alignment */
                while ((pos - (u8 *)rthdr) & 1)
                        pos++;
                rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_LSIG));
                memcpy(pos, &lsig, sizeof(lsig));
                pos += sizeof(lsig);
        }

        /*
         * Per-chain data for the extra it_present words set up above:
         * signal first, then the chain number as the antenna index.
         */
        for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) {
                *pos++ = status->chain_signal[chain];
                *pos++ = chain;
        }
}

/*
 * Build the skb that is delivered to monitor interfaces.
 *
 * @local: hardware context
 * @origskb: in/out pointer to the received skb. With @use_origskb set, the
 *        skb is taken over and *origskb is set to NULL (even on failure, in
 *        which case the skb has been freed); otherwise it is left untouched
 *        and a copy is made.
 * @rate: legacy rate passed on to the radiotap header writer, may be NULL
 * @rtap_space: bytes the driver already placed in front of the frame
 * @use_origskb: reuse the original skb rather than copying it
 *
 * Returns the monitor skb with the radiotap header prepended and the MAC
 * header, checksum state, packet type and protocol set up for delivery, or
 * NULL if no memory was available.
 */
static struct sk_buff *
ieee80211_make_monitor_skb(struct ieee80211_local *local,
                           struct sk_buff **origskb,
                           struct ieee80211_rate *rate,
                           int rtap_space, bool use_origskb)
{
        struct sk_buff *mon;
        int hdrlen, extra_head;

        /* radiotap header size depends on driver features and RX status */
        hdrlen = ieee80211_rx_radiotap_hdrlen(local,
                                              IEEE80211_SKB_RXCB(*origskb),
                                              *origskb);
        /* rtap_space bytes are already there and get replaced */
        extra_head = hdrlen - rtap_space;

        if (!use_origskb) {
                /*
                 * The caller keeps the original, so work on a private copy
                 * that has room for the radiotap header.
                 */
                mon = skb_copy_expand(*origskb, extra_head + NET_SKB_PAD,
                                      0, GFP_ATOMIC);
                if (!mon)
                        return NULL;
        } else {
                /* take ownership of the original */
                mon = *origskb;
                *origskb = NULL;

                /*
                 * Growing the head should be rare: most devices pull an RX
                 * header of their own before we get here, which tends to
                 * leave enough room. Exporting the needed length to drivers
                 * would let them allocate the headroom up front.
                 */
                if (skb_headroom(mon) < extra_head) {
                        if (pskb_expand_head(mon, extra_head, 0,
                                             GFP_ATOMIC)) {
                                dev_kfree_skb(mon);
                                return NULL;
                        }
                }
        }

        /* prepend radiotap information */
        ieee80211_add_rx_radiotap_header(local, mon, rate, hdrlen, true);

        skb_reset_mac_header(mon);
        mon->ip_summed = CHECKSUM_UNNECESSARY;
        mon->pkt_type = PACKET_OTHERHOST;
        mon->protocol = htons(ETH_P_802_2);

        return mon;
}

/*
 * This function copies a received frame to all monitor interfaces and
 * returns a cleaned-up SKB that no longer includes the FCS nor the
 * radiotap header the driver might have added.
 *
 * local:   hardware context (monitor list, monitor count, hw flags)
 * origskb: the received frame; ownership passes to this function
 * rate:    legacy rate forwarded to the radiotap header writer, may be NULL
 *
 * Returns the result of ieee80211_clean_skb() on the original skb, or NULL
 * when the skb was freed (malformed input, pull failure, frame not to be
 * processed further) or handed over to the last monitor interface.
 *
 * NOTE(review): the rcu_dereference() and list_for_each_entry_rcu() below
 * imply the caller is inside an RCU read-side section - confirm at the call
 * site.
 */
static struct sk_buff *
ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
                     struct ieee80211_rate *rate)
{
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(origskb);
        struct ieee80211_sub_if_data *sdata;
        struct sk_buff *monskb = NULL;
        int present_fcs_len = 0;
        unsigned int rtap_space = 0;
        struct ieee80211_sub_if_data *monitor_sdata =
                rcu_dereference(local->monitor_sdata);
        bool only_monitor = false;
        unsigned int min_head_len;

        if (WARN_ON_ONCE(status->flag & RX_FLAG_RADIOTAP_TLV_AT_END &&
                         !skb_mac_header_was_set(origskb))) {
                /* with this skb no way to know where frame payload starts */
                dev_kfree_skb(origskb);
                return NULL;
        }

        /*
         * Work out how many bytes the driver put in front of the 802.11
         * frame: the fixed-size HE / HE-MU / L-SIG structures first ...
         */
        if (status->flag & RX_FLAG_RADIOTAP_HE)
                rtap_space += sizeof(struct ieee80211_radiotap_he);

        if (status->flag & RX_FLAG_RADIOTAP_HE_MU)
                rtap_space += sizeof(struct ieee80211_radiotap_he_mu);

        if (status->flag & RX_FLAG_RADIOTAP_LSIG)
                rtap_space += sizeof(struct ieee80211_radiotap_lsig);

        /* ... then any TLVs, which run up to the MAC header */
        if (status->flag & RX_FLAG_RADIOTAP_TLV_AT_END)
                rtap_space += skb_mac_header(origskb) - &origskb->data[rtap_space];

        min_head_len = rtap_space;

        /*
         * First, we may need to make a copy of the skb because
         *  (1) we need to modify it for radiotap (if not present), and
         *  (2) the other RX handlers will modify the skb we got.
         *
         * We don't need to, of course, if we aren't going to return
         * the SKB because it has a bad FCS/PLCP checksum.
         */

        if (!(status->flag & RX_FLAG_NO_PSDU)) {
                if (ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) {
                        if (unlikely(origskb->len <= FCS_LEN + rtap_space)) {
                                /* driver bug */
                                WARN_ON(1);
                                dev_kfree_skb(origskb);
                                return NULL;
                        }
                        present_fcs_len = FCS_LEN;
                }

                /* also consider the hdr->frame_control */
                min_head_len += 2;
        }

        /* ensure that the expected data elements are in skb head */
        if (!pskb_may_pull(origskb, min_head_len)) {
                dev_kfree_skb(origskb);
                return NULL;
        }

        /*
         * only_monitor: the frame is not returned to the caller for regular
         * processing (see the comment above), so the original skb itself may
         * be given to a monitor interface instead of a copy.
         */
        only_monitor = should_drop_frame(origskb, present_fcs_len, rtap_space);

        /* no monitor delivery wanted: drop the frame or just clean it up */
        if (!local->monitors || (status->flag & RX_FLAG_SKIP_MONITOR)) {
                if (only_monitor) {
                        dev_kfree_skb(origskb);
                        return NULL;
                }

                return ieee80211_clean_skb(origskb, present_fcs_len,
                                           rtap_space);
        }

        /* monitor_sdata may be NULL; the helper checks for that itself */
        ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_space);

        list_for_each_entry_rcu(sdata, &local->mon_list, u.mntr.list) {
                bool last_monitor = list_is_last(&sdata->u.mntr.list,
                                                 &local->mon_list);

                /*
                 * The monitor skb is built lazily and retried on the next
                 * interface if allocation failed. The original skb is only
                 * consumed (origskb set to NULL) when this is the last
                 * monitor and the frame is not needed afterwards.
                 */
                if (!monskb)
                        monskb = ieee80211_make_monitor_skb(local, &origskb,
                                                            rate, rtap_space,
                                                            only_monitor &&
                                                            last_monitor);

                if (monskb) {
                        struct sk_buff *skb;

                        /* the last monitor gets monskb itself, others a clone */
                        if (last_monitor) {
                                skb = monskb;
                                monskb = NULL;
                        } else {
                                skb = skb_clone(monskb, GFP_ATOMIC);
                        }

                        if (skb) {
                                skb->dev = sdata->dev;
                                dev_sw_netstats_rx_add(skb->dev, skb->len);
                                netif_receive_skb(skb);
                        }
                }

                if (last_monitor)
                        break;
        }

        /* this happens if last_monitor was erroneously false */
        dev_kfree_skb(monskb);

        /* ditto */
        if (!origskb)
                return NULL;

        return ieee80211_clean_skb(origskb, present_fcs_len, rtap_space);
}

static void ieee80211_parse_qos(struct ieee80211_rx_data *rx)
{
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
        int tid, seqno_idx, security_idx;

        /* does the frame have a qos control field? */
        if (ieee80211_is_data_qos(hdr->frame_control)) {
                u8 *qc = ieee80211_get_qos_ctl(hdr);
                /* frame has qos control */
                tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
                if (*qc & IEEE80211_QOS_CTL_A_MSDU_PRESENT)
                        status->rx_flags |= IEEE80211_RX_AMSDU;

                seqno_idx = tid;
                security_idx = tid;
        } else {
                /*
                 * IEEE 802.11-2007, 7.1.3.4.1 ("Sequence Number field"):
                 *
                 *        Sequence numbers for management frames, QoS data
                 *        frames with a broadcast/multicast address in the
                 *        Address 1 field, and all non-QoS data frames sent
                 *        by QoS STAs are assigned using an additional single
                 *        modulo-4096 counter, [...]
                 *
                 * We also use that counter for non-QoS STAs.
                 */
                seqno_idx = IEEE80211_NUM_TIDS;
                security_idx = 0;
                if (ieee80211_is_mgmt(hdr->frame_control))
                        security_idx = IEEE80211_NUM_TIDS;
                tid = 0;
        }

        rx->seqno_idx = seqno_idx;
        rx->security_idx = security_idx;
        /* Set skb->priority to 1d tag if highest order bit of TID is not set.
         * For now, set skb->priority to 0 for other cases. */
        rx->skb->priority = (tid > 7) ? 0 : tid;
}

/**
 * DOC: Packet alignment
 *
 * Drivers always need to pass packets that are aligned to two-byte boundaries
 * to the stack.
 *
 * Additionally, they should, if possible, align the payload data in a way that
 * guarantees that the contained IP header is aligned to a four-byte
 * boundary. In the case of regular frames, this simply means aligning the
 * payload to a four-byte boundary (because either the IP header is directly
 * contained, or IV/RFC1042 headers that have a length divisible by four are
 * in front of it).  If the payload data is not properly aligned and the
 * architecture doesn't support efficient unaligned operations, mac80211
 * will align the data.
 *
 * With A-MSDU frames, however, the payload data address must yield two modulo
 * four because there are 14-byte 802.3 headers within the A-MSDU frames that
 * push the IP header further back to a multiple of four again. Thankfully, the
 * specs were sane enough this time around to require padding each A-MSDU
 * subframe to a length that is a multiple of four.
 *
 * Padding between the 802.11 header and the payload, such as Atheros hardware
 * adds, is not supported; the driver is required to move the 802.11 header so
 * that it sits directly in front of the payload in that case.
 */
/*
 * Debug aid: warn (once) when a frame's data pointer is not aligned to a
 * two-byte boundary. Compiles to nothing unless
 * CONFIG_MAC80211_VERBOSE_DEBUG is enabled.
 */
static void ieee80211_verify_alignment(struct ieee80211_rx_data *rx)
{
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
        unsigned long addr = (unsigned long)rx->skb->data;

        WARN_ON_ONCE(addr & 1);
#endif
}


/* rx handlers */

static int ieee80211_is_unicast_robust_mgmt_frame(struct sk_buff *skb)
{
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;

        if (is_multicast_ether_addr(hdr->addr1))
                return 0;

        return ieee80211_is_robust_mgmt_frame(skb);
}


static int ieee80211_is_multicast_robust_mgmt_frame(struct sk_buff *skb)
{
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;

        if (!is_multicast_ether_addr(hdr->addr1))
                return 0;

        return ieee80211_is_robust_mgmt_frame(skb);
}


/*
 * Get the BIP key index from the MMIE at the end of the frame.
 *
 * Only multicast frames that are robust management frames or beacons are
 * considered. The trailing bytes are first interpreted as a struct
 * ieee80211_mmie and, failing that, as the longer struct ieee80211_mmie_16;
 * in both cases the element ID and length fields must match.
 *
 * Returns the key ID from the MMIE, or -1 if this is not a BIP frame.
 */
static int ieee80211_get_mmie_keyidx(struct sk_buff *skb)
{
        struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) skb->data;
        unsigned char *end = skb->data + skb->len;
        struct ieee80211_mmie *short_mmie;
        struct ieee80211_mmie_16 *long_mmie;

        /* NOTE(review): 24 is presumably the management header length */
        if (skb->len < 24 + sizeof(*short_mmie) ||
            !is_multicast_ether_addr(mgmt->da))
                return -1;

        /* neither a robust management frame nor a beacon */
        if (!(ieee80211_is_robust_mgmt_frame(skb) ||
              ieee80211_is_beacon(mgmt->frame_control)))
                return -1;

        short_mmie = (struct ieee80211_mmie *)(end - sizeof(*short_mmie));
        if (short_mmie->element_id == WLAN_EID_MMIE &&
            short_mmie->length == sizeof(*short_mmie) - 2)
                return le16_to_cpu(short_mmie->key_id);

        /* the longer variant needs its own length check */
        if (skb->len < 24 + sizeof(*long_mmie))
                return -1;

        long_mmie = (struct ieee80211_mmie_16 *)(end - sizeof(*long_mmie));
        if (long_mmie->element_id != WLAN_EID_MMIE ||
            long_mmie->length != sizeof(*long_mmie) - 2)
                return -1;

        return le16_to_cpu(long_mmie->key_id);
}

static int ieee80211_get_keyid(struct sk_buff *skb)
{
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
        __le16 fc = hdr->frame_control;
        int hdrlen = ieee80211_hdrlen(fc);
        u8 keyid;

        /* WEP, TKIP, CCMP and GCMP */
        if (unlikely(skb->len < hdrlen + IEEE80211_WEP_IV_LEN))
                return -EINVAL;

        skb_copy_bits(skb, hdrlen + 3, &keyid, 1);

        keyid >>= 6;

        return keyid;
}

/*
 * Mesh-specific sanity checks on a received frame.
 *
 * Data frames must have an address layout the checks below accept and must
 * not carry our own address in addr3 (multicast) or addr4 (unicast).
 * Without an established peer link to the sender, only the management
 * frames listed at the end of the function are let through.
 *
 * Returns RX_CONTINUE if the frame may be processed further and
 * RX_DROP_MONITOR otherwise.
 */
static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx)
{
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
        char *own_addr = rx->sdata->vif.addr;

        if (ieee80211_is_data(hdr->frame_control)) {
                if (!is_multicast_ether_addr(hdr->addr1)) {
                        /* unicast: needs four addresses, addr4 not ours */
                        if (!ieee80211_has_a4(hdr->frame_control) ||
                            ether_addr_equal(hdr->addr4, own_addr))
                                return RX_DROP_MONITOR;
                } else if (ieee80211_has_tods(hdr->frame_control) ||
                           !ieee80211_has_fromds(hdr->frame_control) ||
                           ether_addr_equal(hdr->addr3, own_addr)) {
                        /* multicast: FromDS only, addr3 not ours */
                        return RX_DROP_MONITOR;
                }
        }

        /* an established peer link needs no further filtering */
        if (rx->sta && sta_plink_state(rx->sta) == NL80211_PLINK_ESTAB)
                return RX_CONTINUE;

        /*
         * There is no established peer link: drop the frame unless it is a
         * peer link establishment frame, a beacon or a probe.
         */
        if (!ieee80211_is_mgmt(hdr->frame_control))
                return RX_DROP_MONITOR;

        if (ieee80211_is_action(hdr->frame_control)) {
                struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)hdr;

                /* make sure category field is present */
                if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE)
                        return RX_DROP_MONITOR;

                switch (mgmt->u.action.category) {
                case WLAN_CATEGORY_MESH_ACTION:
                case WLAN_CATEGORY_SELF_PROTECTED:
                        return RX_CONTINUE;
                default:
                        return RX_DROP_MONITOR;
                }
        }

        if (ieee80211_is_probe_req(hdr->frame_control) ||
            ieee80211_is_probe_resp(hdr->frame_control) ||
            ieee80211_is_beacon(hdr->frame_control) ||
            ieee80211_is_auth(hdr->frame_control))
                return RX_CONTINUE;

        return RX_DROP_MONITOR;
}

static inline bool ieee80211_rx_reorder_ready(struct tid_ampdu_rx *tid_agg_rx,
                                              int index)
{
        struct sk_buff_head *frames = &tid_agg_rx->reorder_buf[index];
        struct sk_buff *tail = skb_peek_tail(frames);
        struct ieee80211_rx_status *status;

        if (tid_agg_rx->reorder_buf_filtered &&
            tid_agg_rx->reorder_buf_filtered & BIT_ULL(index))
                return true;

        if (!tail)
                return false;

        status = IEEE80211_SKB_RXCB(tail);
        if (status->flag & RX_FLAG_AMSDU_MORE)
                return false;

        return true;
}

static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata,
                                            struct tid_ampdu_rx *tid_agg_rx,
                                            int index,
                                            struct sk_buff_head *frames)
{
        struct sk_buff_head *skb_list = &tid_agg_rx->reorder_buf[index];
        struct sk_buff *skb;
        struct ieee80211_rx_status *status;

        lockdep_assert_held(&tid_agg_rx->reorder_lock);

        if (skb_queue_empty(skb_list))
                goto no_frame;

        if (!ieee80211_rx_reorder_ready(tid_agg_rx, index)) {
                __skb_queue_purge(skb_list);
                goto no_frame;
        }

        /* release frames from the reorder ring buffer */
        tid_agg_rx->stored_mpdu_num--;
        while ((skb = __skb_dequeue(skb_list))) {
                status = IEEE80211_SKB_RXCB(skb);
                status->rx_flags |= IEEE80211_RX_DEFERRED_RELEASE;
                __skb_queue_tail(frames, skb);
        }

no_frame:
        if (tid_agg_rx->reorder_buf_filtered)
                tid_agg_rx->reorder_buf_filtered &= ~BIT_ULL(index);
        tid_agg_rx->head_seq_num = ieee80211_sn_inc(tid_agg_rx->head_seq_num);
}

/*
 * Retire reorder slots one at a time until the window head of @tid_agg_rx
 * is no longer before @head_seq_num. Released frames end up on @frames.
 *
 * The caller must hold tid_agg_rx->reorder_lock.
 */
static void ieee80211_release_reorder_frames(struct ieee80211_sub_if_data *sdata,
                                             struct tid_ampdu_rx *tid_agg_rx,
                                             u16 head_seq_num,
                                             struct sk_buff_head *frames)
{
        lockdep_assert_held(&tid_agg_rx->reorder_lock);

        /* each call below advances tid_agg_rx->head_seq_num by one */
        while (ieee80211_sn_less(tid_agg_rx->head_seq_num, head_seq_num))
                ieee80211_release_reorder_frame(sdata, tid_agg_rx,
                                                tid_agg_rx->head_seq_num %
                                                tid_agg_rx->buf_size,
                                                frames);
}

/*
 * Timeout (in jiffies) for skb's that are waiting in the RX reorder buffer. If
 * the skb was added to the buffer longer than this time ago, the earlier
 * frames that have not yet been received are assumed to be lost and the skb
 * can be released for processing. This may also release other skb's from the
 * reorder buffer if there are no additional gaps between the frames.
 */
#define HT_RX_REORDER_BUF_TIMEOUT (HZ / 10)

/*
 * Release what can be released from the reorder buffer of @tid_agg_rx onto
 * @frames, then update the reorder timer.
 *
 * If the head slot is ready, consecutive ready slots starting at the head
 * are released. If the head slot is not ready but frames are stored, the
 * later slots are walked: a ready slot found after a gap is released only
 * once it has waited longer than HT_RX_REORDER_BUF_TIMEOUT, in which case
 * the slots skipped before it are purged and the head is moved past them.
 *
 * Afterwards the timer is re-armed from the reorder_time of the next ready
 * slot while frames remain stored (unless the session is marked removed),
 * and deleted when nothing is stored.
 *
 * Callers must hold tid_agg_rx->reorder_lock.
 */
static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata,
                                          struct tid_ampdu_rx *tid_agg_rx,
                                          struct sk_buff_head *frames)
{
        int index, i, j, last;

        lockdep_assert_held(&tid_agg_rx->reorder_lock);

        /* release the buffer until next missing frame */
        index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size;
        if (!ieee80211_rx_reorder_ready(tid_agg_rx, index) &&
            tid_agg_rx->stored_mpdu_num) {
                /*
                 * No buffers ready to be released, but check whether any
                 * frames in the reorder buffer have timed out.
                 */
                int skipped = 1;
                for (j = (index + 1) % tid_agg_rx->buf_size; j != index;
                     j = (j + 1) % tid_agg_rx->buf_size) {
                        if (!ieee80211_rx_reorder_ready(tid_agg_rx, j)) {
                                skipped++;
                                continue;
                        }
                        /*
                         * A ready slot behind a gap has to wait out the
                         * timeout; j then names the slot the timer is
                         * armed for.
                         */
                        if (skipped &&
                            !time_after(jiffies, tid_agg_rx->reorder_time[j] +
                                        HT_RX_REORDER_BUF_TIMEOUT))
                                goto set_release_timer;

                        /* don't leave incomplete A-MSDUs around */
                        for (i = (index + 1) % tid_agg_rx->buf_size; i != j;
                             i = (i + 1) % tid_agg_rx->buf_size)
                                __skb_queue_purge(&tid_agg_rx->reorder_buf[i]);

                        ht_dbg_ratelimited(sdata,
                                           "release an RX reorder frame due to timeout on earlier frames\n");
                        ieee80211_release_reorder_frame(sdata, tid_agg_rx, j,
                                                        frames);

                        /*
                         * Increment the head seq# also for the skipped slots.
                         */
                        tid_agg_rx->head_seq_num =
                                (tid_agg_rx->head_seq_num +
                                 skipped) & IEEE80211_SN_MASK;
                        skipped = 0;
                }
        } else while (ieee80211_rx_reorder_ready(tid_agg_rx, index)) {
                ieee80211_release_reorder_frame(sdata, tid_agg_rx, index,
                                                frames);
                index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size;
        }

        if (tid_agg_rx->stored_mpdu_num) {
                j = index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size;

                /*
                 * Look for the first ready slot starting at the head, giving
                 * up one slot short of a full lap. buf_size is added before
                 * the subtraction so the bound is a valid slot number for
                 * index == 0 as well; a plain (index - 1) % buf_size is
                 * negative there (assuming buf_size promotes to int, as the
                 * u16 copy taken in ieee80211_sta_manage_reorder_buf()
                 * suggests) and could never match j.
                 */
                last = (index + tid_agg_rx->buf_size - 1) %
                       tid_agg_rx->buf_size;
                for (; j != last; j = (j + 1) % tid_agg_rx->buf_size) {
                        if (ieee80211_rx_reorder_ready(tid_agg_rx, j))
                                break;
                }

 set_release_timer:

                if (!tid_agg_rx->removed)
                        mod_timer(&tid_agg_rx->reorder_timer,
                                  tid_agg_rx->reorder_time[j] + 1 +
                                  HT_RX_REORDER_BUF_TIMEOUT);
        } else {
                del_timer(&tid_agg_rx->reorder_timer);
        }
}

/*
 * Run one MPDU of an RX aggregation session through the reorder buffer.
 *
 * As this function belongs to the RX path it must be under
 * rcu_read_lock protection. tid_agg_rx->reorder_lock is taken here and held
 * until return.
 *
 * It returns false if the frame can be processed immediately (it was neither
 * stored nor freed here), true if it was consumed, i.e. queued in the
 * reorder buffer or freed as out of date or already stored. Frames released
 * from the buffer along the way are appended to @frames.
 */
static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata,
                                             struct tid_ampdu_rx *tid_agg_rx,
                                             struct sk_buff *skb,
                                             struct sk_buff_head *frames)
{
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        u16 mpdu_seq_num = ieee80211_get_sn(hdr);
        u16 head_seq_num, buf_size;
        int index;
        bool ret = true;

        spin_lock(&tid_agg_rx->reorder_lock);

        /*
         * Offloaded BA sessions have no known starting sequence number so pick
         * one from first Rxed frame for this tid after BA was started.
         */
        if (unlikely(tid_agg_rx->auto_seq)) {
                tid_agg_rx->auto_seq = false;
                tid_agg_rx->ssn = mpdu_seq_num;
                tid_agg_rx->head_seq_num = mpdu_seq_num;
        }

        /* snapshot of the window; the struct fields may move further below */
        buf_size = tid_agg_rx->buf_size;
        head_seq_num = tid_agg_rx->head_seq_num;

        /*
         * If the current MPDU's SN is smaller than the SSN, it shouldn't
         * be reordered. Until the session is marked started such a frame is
         * passed back to the caller rather than freed.
         */
        if (unlikely(!tid_agg_rx->started)) {
                if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) {
                        ret = false;
                        goto out;
                }
                tid_agg_rx->started = true;
        }

        /* frame with out of date sequence number */
        if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) {
                dev_kfree_skb(skb);
                goto out;
        }

        /*
         * If the frame's sequence number exceeds our buffering window
         * size release some previous frames to make room for this one.
         */
        if (!ieee80211_sn_less(mpdu_seq_num, head_seq_num + buf_size)) {
                /* new head = mpdu_seq_num - buf_size + 1 */
                head_seq_num = ieee80211_sn_inc(
                                ieee80211_sn_sub(mpdu_seq_num, buf_size));
                /* release stored frames up to new head to stack */
                ieee80211_release_reorder_frames(sdata, tid_agg_rx,
                                                 head_seq_num, frames);
        }

        /* Now the new frame is always in the range of the reordering buffer */

        index = mpdu_seq_num % tid_agg_rx->buf_size;

        /* check if we already stored this frame */
        if (ieee80211_rx_reorder_ready(tid_agg_rx, index)) {
                dev_kfree_skb(skb);
                goto out;
        }

        /*
         * If the current MPDU is in the right order and nothing else
         * is stored we can process it directly, no need to buffer it.
         * If it is first but there's something stored, we may be able
         * to release frames after this one.
         *
         * The head only advances when the frame does not carry
         * RX_FLAG_AMSDU_MORE.
         */
        if (mpdu_seq_num == tid_agg_rx->head_seq_num &&
            tid_agg_rx->stored_mpdu_num == 0) {
                if (!(status->flag & RX_FLAG_AMSDU_MORE))
                        tid_agg_rx->head_seq_num =
                                ieee80211_sn_inc(tid_agg_rx->head_seq_num);
                ret = false;
                goto out;
        }

        /*
         * put the frame in the reordering buffer; the slot is only
         * timestamped, counted and considered for release once a frame
         * without RX_FLAG_AMSDU_MORE has been queued in it
         */
        __skb_queue_tail(&tid_agg_rx->reorder_buf[index], skb);
        if (!(status->flag & RX_FLAG_AMSDU_MORE)) {
                tid_agg_rx->reorder_time[index] = jiffies;
                tid_agg_rx->stored_mpdu_num++;
                ieee80211_sta_reorder_release(sdata, tid_agg_rx, frames);
        }

 out:
        spin_unlock(&tid_agg_rx->reorder_lock);
        return ret;
}

/*
 * Reorder MPDUs from A-MPDUs, keeping them on a buffer. Returns
 * true if the MPDU was buffered, false if it should be processed.
 */
static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx,
                                       struct sk_buff_head *frames)
{
        struct sk_buff *skb = rx->skb;
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
        struct sta_info *sta = rx->sta;
        struct tid_ampdu_rx *tid_agg_rx;
        u16 sc;
        u8 tid, ack_policy;

        if (!ieee80211_is_data_qos(hdr->frame_control) ||
            is_multicast_ether_addr(hdr->addr1))
                goto dont_reorder;

        /*
         * filter the QoS data rx stream according to
         * STA/TID and check if this STA/TID is on aggregation
         */

        if (!sta)
                goto dont_reorder;

        ack_policy = *ieee80211_get_qos_ctl(hdr) &
                     IEEE80211_QOS_CTL_ACK_POLICY_MASK;
        tid = ieee80211_get_tid(hdr);

        tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
        if (!tid_agg_rx) {
                if (ack_policy == IEEE80211_QOS_CTL_ACK_POLICY_BLOCKACK &&
                    !test_bit(tid, rx->sta->ampdu_mlme.agg_session_valid) &&
                    !test_and_set_bit(tid, rx->sta->ampdu_mlme.unexpected_agg))
                        ieee80211_send_delba(rx->sdata, rx->sta->sta.addr, tid,
                                             WLAN_BACK_RECIPIENT,
                                             WLAN_REASON_QSTA_REQUIRE_SETUP);
                goto dont_reorder;
        }

        /* qos null data frames are excluded */
        if (unlikely(hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_NULLFUNC)))
                goto dont_reorder;

        /* not part of a BA session */
        if (ack_policy == IEEE80211_QOS_CTL_ACK_POLICY_NOACK)
                goto dont_reorder;

        /* new, potentially un-ordered, ampdu frame - process it */

        /* reset session timer */
        if (tid_agg_rx->timeout)
                tid_agg_rx->last_rx = jiffies;

        /* if this mpdu is fragmented - terminate rx aggregation session */
        sc = le16_to_cpu(hdr->seq_ctrl);
        if (sc & IEEE80211_SCTL_FRAG) {
                ieee80211_queue_skb_to_iface(rx->sdata, rx->link_id, NULL, skb);
                return;
        }

        /*
         * No locking needed -- we will only ever process one
         * RX packet at a time, and thus own tid_agg_rx. All
         * other code manipulating it needs to (and does) make
         * sure that we cannot get to it any more before doing
         * anything with it.
         */
        if (ieee80211_sta_manage_reorder_buf(rx->sdata, tid_agg_rx, skb,
                                             frames))
                return;

 dont_reorder:
        __skb_queue_tail(frames, skb);
}

/*
 * RX handler: drop duplicate 802.11 retransmissions
 * (IEEE 802.11-2012: 9.3.2.10 "Duplicate detection and recovery").
 *
 * Returns RX_DROP_U_DUP for a detected duplicate and RX_CONTINUE otherwise.
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx)
{
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);

        /*
         * Nothing to check for frames already validated, frames shorter
         * than 24 bytes, control and (QoS) nullfunc frames, or frames
         * without a station entry.
         */
        if ((status->flag & RX_FLAG_DUP_VALIDATED) ||
            rx->skb->len < 24 ||
            ieee80211_is_ctl(hdr->frame_control) ||
            ieee80211_is_any_nullfunc(hdr->frame_control) ||
            !rx->sta)
                return RX_CONTINUE;

        if (unlikely(is_multicast_ether_addr(hdr->addr1))) {
                struct ieee80211_sub_if_data *sdata = rx->sdata;
                u16 sn;

                /* group frames are only tracked on MLD station interfaces */
                if (!ieee80211_is_data_present(hdr->frame_control) ||
                    !ieee80211_vif_is_mld(&sdata->vif) ||
                    sdata->vif.type != NL80211_IFTYPE_STATION)
                        return RX_CONTINUE;

                sn = ieee80211_get_sn(hdr);

                /* IEEE80211_SN_MODULO means no sequence number recorded yet */
                if (sdata->u.mgd.mcast_seq_last != IEEE80211_SN_MODULO &&
                    ieee80211_sn_less_eq(sn, sdata->u.mgd.mcast_seq_last))
                        return RX_DROP_U_DUP;

                sdata->u.mgd.mcast_seq_last = sn;
                return RX_CONTINUE;
        }

        /* a retry repeating the last seen sequence control is a duplicate */
        if (unlikely(ieee80211_has_retry(hdr->frame_control) &&
                     rx->sta->last_seq_ctrl[rx->seqno_idx] == hdr->seq_ctrl)) {
                I802_DEBUG_INC(rx->local->dot11FrameDuplicateCount);
                rx->link_sta->rx_stats.num_duplicates++;
                return RX_DROP_U_DUP;
        }

        /* frames flagged RX_FLAG_AMSDU_MORE do not update the record */
        if (!(status->flag & RX_FLAG_AMSDU_MORE))
                rx->sta->last_seq_ctrl[rx->seqno_idx] = hdr->seq_ctrl;

        return RX_CONTINUE;
}

/*
 * RX handler: drop disallowed frame classes based on STA auth/assoc state;
 * IEEE 802.11, Chap 5.5.
 *
 * mac80211 filters only based on association state, i.e. it drops
 * Class 3 frames from not associated stations. hostapd sends
 * deauth/disassoc frames when needed. In addition, hostapd is
 * responsible for filtering on both auth and assoc states.
 *
 * Mesh interfaces are delegated to ieee80211_rx_mesh_check().
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
{
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
        struct ieee80211_sub_if_data *sdata = rx->sdata;

        if (ieee80211_vif_is_mesh(&sdata->vif))
                return ieee80211_rx_mesh_check(rx);

        /* only data and PS-Poll frames are filtered here */
        if (!ieee80211_is_data(hdr->frame_control) &&
            !ieee80211_is_pspoll(hdr->frame_control))
                return RX_CONTINUE;

        /* no association requirement on IBSS and OCB interfaces */
        if (sdata->vif.type == NL80211_IFTYPE_ADHOC ||
            sdata->vif.type == NL80211_IFTYPE_OCB)
                return RX_CONTINUE;

        if (likely(rx->sta && test_sta_flag(rx->sta, WLAN_STA_ASSOC)))
                return RX_CONTINUE;

        /*
         * accept port control frames from the AP even when it's not
         * yet marked ASSOC to prevent a race where we don't set the
         * assoc bit quickly enough before it sends the first frame
         */
        if (rx->sta && sdata->vif.type == NL80211_IFTYPE_STATION &&
            ieee80211_is_data_present(hdr->frame_control)) {
                unsigned int hdrlen = ieee80211_hdrlen(hdr->frame_control);
                __be16 ethertype;

                if (rx->skb->len < hdrlen + 8)
                        return RX_DROP_MONITOR;

                /* the two bytes at offset 6 behind the 802.11 header */
                skb_copy_bits(rx->skb, hdrlen + 6, &ethertype, 2);
                if (ethertype == sdata->control_port_protocol)
                        return RX_CONTINUE;
        }

        /* on an AP, report the spurious frame through cfg80211 */
        if (sdata->vif.type == NL80211_IFTYPE_AP &&
            cfg80211_rx_spurious_frame(sdata->dev, hdr->addr2, GFP_ATOMIC))
                return RX_DROP_U_SPURIOUS;

        return RX_DROP_MONITOR;
}


/*
 * RX handler: while local->pspolling is set, react to the More Data bit of
 * data frames that have FromDS set. If the bit is set another PS-Poll is
 * sent, otherwise pspolling is cleared. Always returns RX_CONTINUE.
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_check_more_data(struct ieee80211_rx_data *rx)
{
        struct ieee80211_local *local = rx->local;
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) rx->skb->data;

        if (!local->pspolling)
                return RX_CONTINUE;

        /* frames without FromDS are not from the AP; non-data is ignored */
        if (!ieee80211_has_fromds(hdr->frame_control) ||
            !ieee80211_is_data(hdr->frame_control))
                return RX_CONTINUE;

        if (ieee80211_has_moredata(hdr->frame_control)) {
                /* more data bit is set, let's request a new frame from the AP */
                ieee80211_send_pspoll(local, rx->sdata);
        } else {
                /* AP has no more frames buffered for us */
                local->pspolling = false;
        }

        return RX_CONTINUE;
}

static void sta_ps_start(struct sta_info *sta)
{
        struct ieee80211_sub_if_data *sdata = sta->sdata;
        struct ieee80211_local *local = sdata->local;
        struct ps_data *ps;
        int tid;

        if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
            sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
                ps = &sdata->bss->ps;
        else
                return;

        atomic_inc(&ps->num_sta_ps);
        set_sta_flag(sta, WLAN_STA_PS_STA);
        if (!ieee80211_hw_check(&local->hw, AP_LINK_PS))
                drv_sta_notify(local, sdata, STA_NOTIFY_SLEEP, &sta->sta);
        ps_dbg(sdata, "STA %pM aid %d enters power save mode\n",
               sta->sta.addr, sta->sta.aid);

        ieee80211_clear_fast_xmit(sta);

        for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) {
                struct ieee80211_txq *txq = sta->sta.txq[tid];
                struct txq_info *txqi = to_txq_info(txq);

                spin_lock(&local->active_txq_lock[txq->ac]);
                if (!list_empty(&txqi->schedule_order))
                        list_del_init(&txqi->schedule_order);
                spin_unlock(&local->active_txq_lock[txq->ac]);

                if (txq_has_queue(txq))
                        set_bit(tid, &sta->txq_buffered_tids);
                else
                        clear_bit(tid, &sta->txq_buffered_tids);
        }
}

/*
 * Take @sta out of power save.
 *
 * If WLAN_STA_PS_DRIVER is not set, WLAN_STA_PS_DELIVER is set,
 * WLAN_STA_PS_STA cleared and ieee80211_sta_ps_deliver_wakeup() called.
 * While the driver flag is set only WLAN_STA_PS_STA is cleared.
 */
static void sta_ps_end(struct sta_info *sta)
{
        ps_dbg(sta->sdata, "STA %pM aid %d exits power save mode\n",
               sta->sta.addr, sta->sta.aid);

        if (!test_sta_flag(sta, WLAN_STA_PS_DRIVER)) {
                set_sta_flag(sta, WLAN_STA_PS_DELIVER);
                clear_sta_flag(sta, WLAN_STA_PS_STA);
                ieee80211_sta_ps_deliver_wakeup(sta);
                return;
        }

        /*
         * Clear the flag only if the other one is still set
         * so that the TX path won't start TX'ing new frames
         * directly ... In the case that the driver flag isn't
         * set ieee80211_sta_ps_deliver_wakeup() will clear it.
         */
        clear_sta_flag(sta, WLAN_STA_PS_STA);
        ps_dbg(sta->sdata, "STA %pM aid %d driver-ps-blocked\n",
               sta->sta.addr, sta->sta.aid);
}

/*
 * Move @pubsta into (@start true) or out of (@start false) power save.
 *
 * Warns if the hardware does not have AP_LINK_PS. Returns -EINVAL when the
 * station is already in the requested state, 0 otherwise.
 */
int ieee80211_sta_ps_transition(struct ieee80211_sta *pubsta, bool start)
{
        struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
        bool in_ps;

        WARN_ON(!ieee80211_hw_check(&sta->local->hw, AP_LINK_PS));

        /* Don't let the same PS state be set twice */
        in_ps = test_sta_flag(sta, WLAN_STA_PS_STA);
        if (start == in_ps)
                return -EINVAL;

        if (!start)
                sta_ps_end(sta);
        else
                sta_ps_start(sta);

        return 0;
}
EXPORT_SYMBOL(ieee80211_sta_ps_transition);

/*
 * Handle a PS-Poll for @pubsta. Nothing is done while WLAN_STA_SP is set.
 * Otherwise the poll response is delivered right away, unless
 * WLAN_STA_PS_DRIVER is set, in which case the request is only recorded
 * by setting WLAN_STA_PSPOLL.
 */
void ieee80211_sta_pspoll(struct ieee80211_sta *pubsta)
{
        struct sta_info *sta = container_of(pubsta, struct sta_info, sta);

        if (test_sta_flag(sta, WLAN_STA_SP))
                return;

        if (test_sta_flag(sta, WLAN_STA_PS_DRIVER))
                set_sta_flag(sta, WLAN_STA_PSPOLL);
        else
                ieee80211_sta_ps_deliver_poll_response(sta);
}
EXPORT_SYMBOL(ieee80211_sta_pspoll);

/*
 * Handle a uAPSD trigger from @pubsta on @tid.
 *
 * Ignored when the AC of @tid is not in the station's uapsd_queues (unless
 * @tid is IEEE80211_NUM_TIDS) and while WLAN_STA_SP is set. Otherwise the
 * uAPSD delivery is started right away, or only recorded by setting
 * WLAN_STA_UAPSD while WLAN_STA_PS_DRIVER is set.
 */
void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *pubsta, u8 tid)
{
        struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
        int ac = ieee80211_ac_from_tid(tid);

        /*
         * If this AC is not trigger-enabled do nothing unless the
         * driver is calling us after it already checked.
         *
         * NB: This could/should check a separate bitmap of trigger-
         * enabled queues, but for now we only implement uAPSD w/o
         * TSPEC changes to the ACs, so they're always the same.
         */
        if (!(sta->sta.uapsd_queues & ieee80211_ac_to_qos_mask[ac]) &&
            tid != IEEE80211_NUM_TIDS)
                return;

        /* if we are in a service period, do nothing */
        if (test_sta_flag(sta, WLAN_STA_SP))
                return;

        if (test_sta_flag(sta, WLAN_STA_PS_DRIVER))
                set_sta_flag(sta, WLAN_STA_UAPSD);
        else
                ieee80211_sta_ps_deliver_uapsd(sta);
}
EXPORT_SYMBOL(ieee80211_sta_uapsd_trigger);

/*
 * RX handler: process PS-Poll frames and uAPSD triggers from a station
 * that is marked asleep on an AP or AP_VLAN interface.
 *
 * Returns RX_QUEUED after consuming a PS-Poll frame, RX_CONTINUE in all
 * other cases.
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx)
{
        struct ieee80211_sub_if_data *sdata = rx->sdata;
        struct ieee80211_hdr *hdr = (void *)rx->skb->data;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
        bool ap_iface;

        if (!rx->sta)
                return RX_CONTINUE;

        ap_iface = sdata->vif.type == NL80211_IFTYPE_AP ||
                   sdata->vif.type == NL80211_IFTYPE_AP_VLAN;
        if (!ap_iface)
                return RX_CONTINUE;

        /*
         * The device handles station powersave, so don't do anything about
         * uAPSD and PS-Poll frames (the latter shouldn't even come up from
         * it to mac80211 since they're handled.)
         *
         * Likewise don't do anything if the station isn't already asleep.
         * In the uAPSD case, the station will probably be marked asleep,
         * in the PS-Poll case the station must be confused ...
         */
        if (ieee80211_hw_check(&sdata->local->hw, AP_LINK_PS) ||
            !test_sta_flag(rx->sta, WLAN_STA_PS_STA))
                return RX_CONTINUE;

        if (unlikely(ieee80211_is_pspoll(hdr->frame_control))) {
                ieee80211_sta_pspoll(&rx->sta->sta);

                /*
                 * Free PS Poll skb here instead of returning RX_DROP that
                 * would count as a dropped frame.
                 */
                dev_kfree_skb(rx->skb);

                return RX_QUEUED;
        }

        /*
         * A QoS data or QoS nullfunc frame with the PM bit set, without
         * More Fragments and not released late from the reorder buffer,
         * acts as a trigger for its TID.
         */
        if (!ieee80211_has_morefrags(hdr->frame_control) &&
            !(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) &&
            ieee80211_has_pm(hdr->frame_control) &&
            (ieee80211_is_data_qos(hdr->frame_control) ||
             ieee80211_is_qos_nullfunc(hdr->frame_control)))
                ieee80211_sta_uapsd_trigger(&rx->sta->sta,
                                            ieee80211_get_tid(hdr));

        return RX_CONTINUE;
}

/*
 * RX handler: per-station bookkeeping for a received frame.
 *
 * Updates the link's RX statistics (last_rx, last_rate, fragment and byte
 * counters, signal values), drives the station's power save state from the
 * PM bit on AP/AP_VLAN interfaces when the hardware does not have
 * AP_LINK_PS, calls the mesh power save hook on mesh interfaces, and
 * consumes (QoS) nullfunc frames.
 *
 * Returns RX_CONTINUE when there is no station or link station, for S1G
 * beacons, and for every frame that is not a nullfunc; RX_QUEUED after
 * freeing a nullfunc frame; RX_DROP_M_UNEXPECTED_4ADDR_FRAME for a 4-address
 * nullfunc frame on an AP interface or on an AP_VLAN without a station.
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
{
        struct sta_info *sta = rx->sta;
        struct link_sta_info *link_sta = rx->link_sta;
        struct sk_buff *skb = rx->skb;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
        int i;

        if (!sta || !link_sta)
                return RX_CONTINUE;

        /*
         * Update last_rx only for IBSS packets which are for the current
         * BSSID and for station already AUTHORIZED to avoid keeping the
         * current IBSS network alive in cases where other STAs start
         * using different BSSID. This will also give the station another
         * chance to restart the authentication/authorization in case
         * something went wrong the first time.
         */
        if (rx->sdata->vif.type == NL80211_IFTYPE_ADHOC) {
                u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len,
                                                NL80211_IFTYPE_ADHOC);
                if (ether_addr_equal(bssid, rx->sdata->u.ibss.bssid) &&
                    test_sta_flag(sta, WLAN_STA_AUTHORIZED)) {
                        link_sta->rx_stats.last_rx = jiffies;
                        /* the rate is only recorded for unicast data */
                        if (ieee80211_is_data_present(hdr->frame_control) &&
                            !is_multicast_ether_addr(hdr->addr1))
                                link_sta->rx_stats.last_rate =
                                        sta_stats_encode_rate(status);
                }
        } else if (rx->sdata->vif.type == NL80211_IFTYPE_OCB) {
                link_sta->rx_stats.last_rx = jiffies;
        } else if (!ieee80211_is_s1g_beacon(hdr->frame_control) &&
                   !is_multicast_ether_addr(hdr->addr1)) {
                /*
                 * Mesh beacons will update last_rx if they are found to
                 * match the current local configuration when processed.
                 */
                link_sta->rx_stats.last_rx = jiffies;
                if (ieee80211_is_data_present(hdr->frame_control))
                        link_sta->rx_stats.last_rate = sta_stats_encode_rate(status);
        }

        link_sta->rx_stats.fragments++;

        /* the byte counter is updated inside the u64_stats section */
        u64_stats_update_begin(&link_sta->rx_stats.syncp);
        link_sta->rx_stats.bytes += rx->skb->len;
        u64_stats_update_end(&link_sta->rx_stats.syncp);

        if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
                link_sta->rx_stats.last_signal = status->signal;
                /* the average is fed the negated signal value */
                ewma_signal_add(&link_sta->rx_stats_avg.signal,
                                -status->signal);
        }

        if (status->chains) {
                link_sta->rx_stats.chains = status->chains;
                for (i = 0; i < ARRAY_SIZE(status->chain_signal); i++) {
                        int signal = status->chain_signal[i];

                        /* skip chains not flagged in the chains bitmap */
                        if (!(status->chains & BIT(i)))
                                continue;

                        link_sta->rx_stats.chain_signal_last[i] = signal;
                        ewma_signal_add(&link_sta->rx_stats_avg.chain_signal[i],
                                        -signal);
                }
        }

        /* S1G beacons take no part in the power save handling below */
        if (ieee80211_is_s1g_beacon(hdr->frame_control))
                return RX_CONTINUE;

        /*
         * Change STA power saving mode only at the end of a frame
         * exchange sequence, and only for a data or management
         * frame as specified in IEEE 802.11-2016 11.2.3.2
         */
        if (!ieee80211_hw_check(&sta->local->hw, AP_LINK_PS) &&
            !ieee80211_has_morefrags(hdr->frame_control) &&
            !is_multicast_ether_addr(hdr->addr1) &&
            (ieee80211_is_mgmt(hdr->frame_control) ||
             ieee80211_is_data(hdr->frame_control)) &&
            !(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) &&
            (rx->sdata->vif.type == NL80211_IFTYPE_AP ||
             rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) {
                /* act only when the PM bit disagrees with the recorded state */
                if (test_sta_flag(sta, WLAN_STA_PS_STA)) {
                        if (!ieee80211_has_pm(hdr->frame_control))
                                sta_ps_end(sta);
                } else {
                        if (ieee80211_has_pm(hdr->frame_control))
                                sta_ps_start(sta);
                }
        }

        /* mesh power save support */
        if (ieee80211_vif_is_mesh(&rx->sdata->vif))
                ieee80211_mps_rx_h_sta_process(sta, hdr);

        /*
         * Drop (qos-)data::nullfunc frames silently, since they
         * are used only to control station power saving mode.
         */
        if (ieee80211_is_any_nullfunc(hdr->frame_control)) {
                I802_DEBUG_INC(rx->local->rx_handlers_drop_nullfunc);

                /*
                 * If we receive a 4-addr nullfunc frame from a STA
                 * that was not moved to a 4-addr STA vlan yet send
                 * the event to userspace and for older hostapd drop
                 * the frame to the monitor interface.
                 */
                if (ieee80211_has_a4(hdr->frame_control) &&
                    (rx->sdata->vif.type == NL80211_IFTYPE_AP ||
                     (rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
                      !rx->sdata->u.vlan.sta))) {
                        /* the event is raised at most once per station */
                        if (!test_and_set_sta_flag(sta, WLAN_STA_4ADDR_EVENT))
                                cfg80211_rx_unexpected_4addr_frame(
                                        rx->sdata->dev, sta->sta.addr,
                                        GFP_ATOMIC);
                        return RX_DROP_M_UNEXPECTED_4ADDR_FRAME;
                }
                /*
                 * Update counter and free packet here to avoid
                 * counting this as a dropped packet.
                 */
                link_sta->rx_stats.packets++;
                dev_kfree_skb(rx->skb);
                return RX_QUEUED;
        }

        return RX_CONTINUE;
} /* ieee80211_rx_h_sta_process */

/*
 * Look up a BIGTK for the frame in @rx.
 *
 * @idx is the key index to try first; a negative value selects the first
 * BIGTK index (NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS). The other BIGTK
 * index is tried as a fallback. For each index the link station's key is
 * preferred over the link's key. Returns NULL if no key is found.
 */
static struct ieee80211_key *
ieee80211_rx_get_bigtk(struct ieee80211_rx_data *rx, int idx)
{
        const int first_bigtk = NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS;
        struct ieee80211_key *key;
        int other;

        /* Make sure key gets set if either BIGTK key index is set so that
         * ieee80211_drop_unencrypted_mgmt() can properly drop both unprotected
         * Beacon frames and Beacon frames that claim to use another BIGTK key
         * index (i.e., a key that we do not have).
         */

        if (idx < 0)
                idx = first_bigtk;

        /* the fallback is the neighbouring BIGTK index */
        other = (idx == first_bigtk) ? idx + 1 : idx - 1;

        if (rx->link_sta) {
                key = rcu_dereference(rx->link_sta->gtk[idx]);
                if (key)
                        return key;
        }

        key = rcu_dereference(rx->link->gtk[idx]);
        if (key)
                return key;

        if (rx->link_sta) {
                key = rcu_dereference(rx->link_sta->gtk[other]);
                if (key)
                        return key;
        }

        return rcu_dereference(rx->link->gtk[other]);
}

/*
 * RX handler: pick the key that applies to this frame and, where software
 * still has work to do, run the matching cipher's decrypt/verify routine.
 *
 * On return rx->key is either NULL or the key selected for the frame. It is
 * also set for unprotected frames when a key exists that could have been
 * used, so that such frames can be dropped if encryption was expected.
 *
 * Returns RX_CONTINUE when no software decryption is performed (extension
 * frames, unprotected frames that carry no MMIE, frames flagged as already
 * decrypted with the IV stripped, Beacon frames when no BIGTK is
 * configured), a drop reason when the key index is invalid or no usable key
 * is found, and otherwise the result of the cipher-specific handler.
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
{
        struct sk_buff *skb = rx->skb;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
        int keyidx;
        ieee80211_rx_result result = RX_DROP_U_DECRYPT_FAIL;
        struct ieee80211_key *sta_ptk = NULL;
        struct ieee80211_key *ptk_idx = NULL;
        int mmie_keyidx = -1;
        __le16 fc;

        if (ieee80211_is_ext(hdr->frame_control))
                return RX_CONTINUE;

        /*
         * Key selection 101
         *
         * There are five types of keys:
         *  - GTK (group keys)
         *  - IGTK (group keys for management frames)
         *  - BIGTK (group keys for Beacon frames)
         *  - PTK (pairwise keys)
         *  - STK (station-to-station pairwise keys)
         *
         * When selecting a key, we have to distinguish between multicast
         * (including broadcast) and unicast frames, the latter can only
         * use PTKs and STKs while the former always use GTKs, IGTKs, and
         * BIGTKs. Unless, of course, actual WEP keys ("pre-RSNA") are used,
         * then unicast frames can also use key indices like GTKs. Hence, if we
         * don't have a PTK/STK we check the key index for a WEP key.
         *
         * Note that in a regular BSS, multicast frames are sent by the
         * AP only, associated stations unicast the frame to the AP first
         * which then multicasts it on their behalf.
         *
         * There is also a slight problem in IBSS mode: GTKs are negotiated
         * with each station, that is something we don't currently handle.
         * The spec seems to expect that one negotiates the same key with
         * every station but there's no such requirement; VLANs could be
         * possible.
         */

        /* start without a key */
        rx->key = NULL;
        fc = hdr->frame_control;

        /*
         * For a known station, fetch its currently selected pairwise key
         * and, if the frame is protected and still carries its IV, the
         * pairwise key named by the key ID in the frame.
         */
        if (rx->sta) {
                int keyid = rx->sta->ptk_idx;
                sta_ptk = rcu_dereference(rx->sta->ptk[keyid]);

                if (ieee80211_has_protected(fc) &&
                    !(status->flag & RX_FLAG_IV_STRIPPED)) {
                        keyid = ieee80211_get_keyid(rx->skb);

                        if (unlikely(keyid < 0))
                                return RX_DROP_U_NO_KEY_ID;

                        ptk_idx = rcu_dereference(rx->sta->ptk[keyid]);
                }
        }

        /* An MMIE key index is only looked for in unprotected frames. */
        if (!ieee80211_has_protected(fc))
                mmie_keyidx = ieee80211_get_mmie_keyidx(rx->skb);

        if (!is_multicast_ether_addr(hdr->addr1) && sta_ptk) {
                /* Unicast frame and a pairwise key exists: use the PTK. */
                rx->key = ptk_idx ? ptk_idx : sta_ptk;
                if ((status->flag & RX_FLAG_DECRYPTED) &&
                    (status->flag & RX_FLAG_IV_STRIPPED))
                        return RX_CONTINUE;
                /* Skip decryption if the frame is not protected. */
                if (!ieee80211_has_protected(fc))
                        return RX_CONTINUE;
        } else if (mmie_keyidx >= 0 && ieee80211_is_beacon(fc)) {
                /* Beacon frame carrying an MMIE key index: use a BIGTK */
                if ((status->flag & RX_FLAG_DECRYPTED) &&
                    (status->flag & RX_FLAG_IV_STRIPPED))
                        return RX_CONTINUE;

                /* The index must fall inside the BIGTK index range. */
                if (mmie_keyidx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS ||
                    mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS +
                                   NUM_DEFAULT_BEACON_KEYS) {
                        if (rx->sdata->dev)
                                cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev,
                                                             skb->data,
                                                             skb->len);
                        return RX_DROP_M_BAD_BCN_KEYIDX;
                }

                rx->key = ieee80211_rx_get_bigtk(rx, mmie_keyidx);
                if (!rx->key)
                        return RX_CONTINUE; /* Beacon protection not in use */
        } else if (mmie_keyidx >= 0) {
                /* Broadcast/multicast robust management frame / BIP */
                if ((status->flag & RX_FLAG_DECRYPTED) &&
                    (status->flag & RX_FLAG_IV_STRIPPED))
                        return RX_CONTINUE;

                /* The index must fall inside the management key range. */
                if (mmie_keyidx < NUM_DEFAULT_KEYS ||
                    mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
                        return RX_DROP_M_BAD_MGMT_KEYIDX; /* unexpected BIP keyidx */
                if (rx->link_sta) {
                        if (ieee80211_is_group_privacy_action(skb) &&
                            test_sta_flag(rx->sta, WLAN_STA_MFP))
                                return RX_DROP_MONITOR;

                        rx->key = rcu_dereference(rx->link_sta->gtk[mmie_keyidx]);
                }
                /* fall back to the link's own key at that index */
                if (!rx->key)
                        rx->key = rcu_dereference(rx->link->gtk[mmie_keyidx]);
        } else if (!ieee80211_has_protected(fc)) {
                /*
                 * The frame was not protected, so skip decryption. However, we
                 * need to set rx->key if there is a key that could have been
                 * used so that the frame may be dropped if encryption would
                 * have been expected.
                 */
                struct ieee80211_key *key = NULL;
                int i;

                if (ieee80211_is_beacon(fc)) {
                        key = ieee80211_rx_get_bigtk(rx, -1);
                } else if (ieee80211_is_mgmt(fc) &&
                           is_multicast_ether_addr(hdr->addr1)) {
                        key = rcu_dereference(rx->link->default_mgmt_key);
                } else {
                        /*
                         * Any installed group key will do: per-link-station
                         * keys are searched first, then the link's keys.
                         */
                        if (rx->link_sta) {
                                for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
                                        key = rcu_dereference(rx->link_sta->gtk[i]);
                                        if (key)
                                                break;
                                }
                        }
                        if (!key) {
                                for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
                                        key = rcu_dereference(rx->link->gtk[i]);
                                        if (key)
                                                break;
                                }
                        }
                }
                if (key)
                        rx->key = key;
                return RX_CONTINUE;
        } else {
                /*
                 * The device doesn't give us the IV so we won't be
                 * able to look up the key. That's ok though, we
                 * don't need to decrypt the frame, we just won't
                 * be able to keep statistics accurate.
                 * Except for key threshold notifications, should
                 * we somehow allow the driver to tell us which key
                 * the hardware used if this flag is set?
                 */
                if ((status->flag & RX_FLAG_DECRYPTED) &&
                    (status->flag & RX_FLAG_IV_STRIPPED))
                        return RX_CONTINUE;

                keyidx = ieee80211_get_keyid(rx->skb);

                if (unlikely(keyidx < 0))
                        return RX_DROP_U_NO_KEY_ID;

                /* check per-station GTK first, if multicast packet */
                if (is_multicast_ether_addr(hdr->addr1) && rx->link_sta)
                        rx->key = rcu_dereference(rx->link_sta->gtk[keyidx]);

                /* if not found, try default key */
                if (!rx->key) {
                        if (is_multicast_ether_addr(hdr->addr1))
                                rx->key = rcu_dereference(rx->link->gtk[keyidx]);
                        if (!rx->key)
                                rx->key = rcu_dereference(rx->sdata->keys[keyidx]);

                        /*
                         * RSNA-protected unicast frames should always be
                         * sent with pairwise or station-to-station keys,
                         * but for WEP we allow using a key index as well.
                         */
                        if (rx->key &&
                            rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP40 &&
                            rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP104 &&
                            !is_multicast_ether_addr(hdr->addr1))
                                rx->key = NULL;
                }
        }

        /* From here on a usable (present, untainted) key is mandatory. */
        if (rx->key) {
                if (unlikely(rx->key->flags & KEY_FLAG_TAINTED))
                        return RX_DROP_MONITOR;

                /* TODO: add threshold stuff again */
        } else {
                return RX_DROP_MONITOR;
        }

        /* Hand the frame to the handler for the selected key's cipher. */
        switch (rx->key->conf.cipher) {
        case WLAN_CIPHER_SUITE_WEP40:
        case WLAN_CIPHER_SUITE_WEP104:
                result = ieee80211_crypto_wep_decrypt(rx);
                break;
        case WLAN_CIPHER_SUITE_TKIP:
                result = ieee80211_crypto_tkip_decrypt(rx);
                break;
        case WLAN_CIPHER_SUITE_CCMP:
                result = ieee80211_crypto_ccmp_decrypt(
                        rx, IEEE80211_CCMP_MIC_LEN);
                break;
        case WLAN_CIPHER_SUITE_CCMP_256:
                result = ieee80211_crypto_ccmp_decrypt(
                        rx, IEEE80211_CCMP_256_MIC_LEN);
                break;
        case WLAN_CIPHER_SUITE_AES_CMAC:
                result = ieee80211_crypto_aes_cmac_decrypt(rx);
                break;
        case WLAN_CIPHER_SUITE_BIP_CMAC_256:
                result = ieee80211_crypto_aes_cmac_256_decrypt(rx);
                break;
        case WLAN_CIPHER_SUITE_BIP_GMAC_128:
        case WLAN_CIPHER_SUITE_BIP_GMAC_256:
                result = ieee80211_crypto_aes_gmac_decrypt(rx);
                break;
        case WLAN_CIPHER_SUITE_GCMP:
        case WLAN_CIPHER_SUITE_GCMP_256:
                result = ieee80211_crypto_gcmp_decrypt(rx);
                break;
        default:
                result = RX_DROP_U_BAD_CIPHER;
        }

        /* the hdr variable is invalid after the decrypt handlers */

        /* either the frame has been decrypted or will be dropped */
        status->flag |= RX_FLAG_DECRYPTED;

        /*
         * Report Beacon frames whose protection check failed; fc was
         * saved before the decrypt handlers ran, so it is still usable.
         */
        if (unlikely(ieee80211_is_beacon(fc) && RX_RES_IS_UNUSABLE(result) &&
                     rx->sdata->dev))
                cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev,
                                             skb->data, skb->len);

        return result;
}

void ieee80211_init_frag_cache(struct ieee80211_fragment_cache *cache)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(cache->entries); i++)
                skb_queue_head_init(&cache->entries[i].skb_list);
}

void ieee80211_destroy_frag_cache(struct ieee80211_fragment_cache *cache)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(cache->entries); i++)
                __skb_queue_purge(&cache->entries[i].skb_list);
}

/*
 * Start reassembly of a new fragmented frame.
 *
 * Takes the next slot of @cache in round-robin order (wrapping at
 * IEEE80211_FRAGMENT_MAX), discards whatever partial frame was still queued
 * there, and queues *@skb as the first fragment. Ownership of the skb moves
 * to the cache entry and *@skb is set to NULL.
 *
 * @frag, @seq and @rx_queue are recorded so that later fragments can be
 * matched to this entry. All per-frame protection state is reset; the
 * caller enables it again if the first fragment was protected.
 *
 * Returns the (re)initialised entry.
 */
static inline struct ieee80211_fragment_entry *
ieee80211_reassemble_add(struct ieee80211_fragment_cache *cache,
                         unsigned int frag, unsigned int seq, int rx_queue,
                         struct sk_buff **skb)
{
        struct ieee80211_fragment_entry *entry;

        entry = &cache->entries[cache->next++];
        if (cache->next >= IEEE80211_FRAGMENT_MAX)
                cache->next = 0;

        /* drop the stale partial frame that occupied this slot, if any */
        __skb_queue_purge(&entry->skb_list);

        __skb_queue_tail(&entry->skb_list, *skb); /* no need for locking */
        *skb = NULL;
        entry->first_frag_time = jiffies;
        entry->seq = seq;
        entry->rx_queue = rx_queue;
        entry->last_frag = frag;
        entry->check_sequential_pn = false;
        /*
         * Slots are recycled, and ieee80211_rx_h_defragment() only ever
         * sets is_protected to true. Without clearing it here an
         * unprotected frame would inherit the protection state (and key
         * color) of the frame that previously used this slot, making the
         * handling of its later fragments depend on cache history.
         */
        entry->is_protected = false;
        entry->extra_len = 0;

        return entry;
}

/*
 * Locate the pending reassembly entry that fragment number @frag of the
 * frame with sequence number @seq (received on @rx_queue, header @hdr)
 * belongs to.
 *
 * Slots are visited from the most recently filled one backwards. An entry
 * matches when it is non-empty, has the same sequence number and RX queue,
 * expects exactly this fragment number next, and its first fragment has the
 * same frame type and the same addr1/addr2 as @hdr. A matching entry whose
 * first fragment is more than two seconds old is purged and skipped.
 *
 * Returns the matching entry, or NULL if there is none.
 */
static inline struct ieee80211_fragment_entry *
ieee80211_reassemble_find(struct ieee80211_fragment_cache *cache,
                          unsigned int frag, unsigned int seq,
                          int rx_queue, struct ieee80211_hdr *hdr)
{
        int slot = cache->next;
        int remaining;

        for (remaining = IEEE80211_FRAGMENT_MAX; remaining > 0; remaining--) {
                struct ieee80211_fragment_entry *cand;
                struct ieee80211_hdr *first_hdr;
                struct sk_buff *first_skb;

                /* step one slot back, wrapping around at the start */
                slot = (slot > 0 ? slot : IEEE80211_FRAGMENT_MAX) - 1;
                cand = &cache->entries[slot];

                if (skb_queue_empty(&cand->skb_list))
                        continue;
                if (cand->seq != seq)
                        continue;
                if (cand->rx_queue != rx_queue)
                        continue;
                if (cand->last_frag + 1 != frag)
                        continue;

                first_skb = __skb_peek(&cand->skb_list);
                first_hdr = (struct ieee80211_hdr *)first_skb->data;

                /* frame type must agree with the first fragment ... */
                if ((hdr->frame_control ^ first_hdr->frame_control) &
                    cpu_to_le16(IEEE80211_FCTL_FTYPE))
                        continue;
                /* ... and so must receiver and transmitter address */
                if (!ether_addr_equal(hdr->addr1, first_hdr->addr1))
                        continue;
                if (!ether_addr_equal(hdr->addr2, first_hdr->addr2))
                        continue;

                /* expired: throw the partial frame away, keep searching */
                if (time_after(jiffies, cand->first_frag_time + 2 * HZ)) {
                        __skb_queue_purge(&cand->skb_list);
                        continue;
                }

                return cand;
        }

        return NULL;
}

/*
 * Tell whether fragments of this frame must carry consecutive packet
 * numbers: true only when a key is selected, its cipher is one of the
 * CCMP/GCMP variants, and the Protected bit is set in @fc.
 */
static bool requires_sequential_pn(struct ieee80211_rx_data *rx, __le16 fc)
{
        if (!rx->key)
                return false;

        switch (rx->key->conf.cipher) {
        case WLAN_CIPHER_SUITE_CCMP:
        case WLAN_CIPHER_SUITE_CCMP_256:
        case WLAN_CIPHER_SUITE_GCMP:
        case WLAN_CIPHER_SUITE_GCMP_256:
                return ieee80211_has_protected(fc);
        default:
                return false;
        }
}

/*
 * RX handler: reassemble fragmented frames.
 *
 * Control and extension frames are passed on untouched. Frames that are not
 * fragmented (no More Fragments bit, fragment number 0) skip reassembly and
 * only update the LED and packet counter. Fragmented multicast frames are
 * dropped.
 *
 * A first fragment (fragment number 0) is stored in a fragment cache entry
 * together with the protection state of the frame, and RX_QUEUED is
 * returned. Later fragments are matched against the cache, checked against
 * the state recorded for the first fragment (sequential PN for CCMP/GCMP,
 * same key, still protected), and appended. Once the last fragment has
 * arrived, all pieces are copied into the first fragment's skb, which
 * becomes rx->skb, and RX_CONTINUE is returned.
 *
 * The station's fragment cache is used when rx->sta is set, otherwise the
 * interface's.
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
{
        struct ieee80211_fragment_cache *cache = &rx->sdata->frags;
        struct ieee80211_hdr *hdr;
        u16 sc;
        __le16 fc;
        unsigned int frag, seq;
        struct ieee80211_fragment_entry *entry;
        struct sk_buff *skb;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);

        hdr = (struct ieee80211_hdr *)rx->skb->data;
        fc = hdr->frame_control;

        if (ieee80211_is_ctl(fc) || ieee80211_is_ext(fc))
                return RX_CONTINUE;

        sc = le16_to_cpu(hdr->seq_ctrl);
        frag = sc & IEEE80211_SCTL_FRAG;

        if (rx->sta)
                cache = &rx->sta->frags;

        /* Common case: the frame is not fragmented at all. */
        if (likely(!ieee80211_has_morefrags(fc) && frag == 0))
                goto out;

        /* Fragmented group-addressed frames are not accepted. */
        if (is_multicast_ether_addr(hdr->addr1))
                return RX_DROP_MONITOR;

        I802_DEBUG_INC(rx->local->rx_handlers_fragments);

        if (skb_linearize(rx->skb))
                return RX_DROP_U_OOM;

        /*
         *  skb_linearize() might change the skb->data and
         *  previously cached variables (in this case, hdr) need to
         *  be refreshed with the new data.
         */
        hdr = (struct ieee80211_hdr *)rx->skb->data;
        seq = (sc & IEEE80211_SCTL_SEQ) >> 4;

        if (frag == 0) {
                /* This is the first fragment of a new frame. */
                entry = ieee80211_reassemble_add(cache, frag, seq,
                                                 rx->seqno_idx, &(rx->skb));
                if (requires_sequential_pn(rx, fc)) {
                        int queue = rx->security_idx;

                        /* Store CCMP/GCMP PN so that we can verify that the
                         * next fragment has a sequential PN value.
                         */
                        entry->check_sequential_pn = true;
                        entry->is_protected = true;
                        entry->key_color = rx->key->color;
                        memcpy(entry->last_pn,
                               rx->key->u.ccmp.rx_pn[queue],
                               IEEE80211_CCMP_PN_LEN);
                        /*
                         * The memcpy above reads through the ccmp member
                         * for GCMP keys too; these build-time checks make
                         * sure both layouts and PN lengths are identical.
                         */
                        BUILD_BUG_ON(offsetof(struct ieee80211_key,
                                              u.ccmp.rx_pn) !=
                                     offsetof(struct ieee80211_key,
                                              u.gcmp.rx_pn));
                        BUILD_BUG_ON(sizeof(rx->key->u.ccmp.rx_pn[queue]) !=
                                     sizeof(rx->key->u.gcmp.rx_pn[queue]));
                        BUILD_BUG_ON(IEEE80211_CCMP_PN_LEN !=
                                     IEEE80211_GCMP_PN_LEN);
                } else if (rx->key &&
                           (ieee80211_has_protected(fc) ||
                            (status->flag & RX_FLAG_DECRYPTED))) {
                        /*
                         * Protected with another cipher, or already
                         * decrypted: remember the key so later fragments
                         * can be required to use the same one.
                         */
                        entry->is_protected = true;
                        entry->key_color = rx->key->color;
                }
                /* the skb is now owned by the cache entry */
                return RX_QUEUED;
        }

        /* This is a fragment for a frame that should already be pending in
         * fragment cache. Add this fragment to the end of the pending entry.
         */
        entry = ieee80211_reassemble_find(cache, frag, seq,
                                          rx->seqno_idx, hdr);
        if (!entry) {
                I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag);
                return RX_DROP_MONITOR;
        }

        /* "The receiver shall discard MSDUs and MMPDUs whose constituent
         *  MPDU PN values are not incrementing in steps of 1."
         * see IEEE P802.11-REVmc/D5.0, 12.5.3.4.4, item d (for CCMP)
         * and IEEE P802.11-REVmc/D5.0, 12.5.5.4.4, item d (for GCMP)
         */
        if (entry->check_sequential_pn) {
                int i;
                u8 pn[IEEE80211_CCMP_PN_LEN], *rpn;

                if (!requires_sequential_pn(rx, fc))
                        return RX_DROP_U_NONSEQ_PN;

                /* Prevent mixed key and fragment cache attacks */
                if (entry->key_color != rx->key->color)
                        return RX_DROP_U_BAD_KEY_COLOR;

                /*
                 * Compute the expected PN: the previous fragment's PN plus
                 * one, incrementing from the last byte and carrying
                 * towards the first.
                 */
                memcpy(pn, entry->last_pn, IEEE80211_CCMP_PN_LEN);
                for (i = IEEE80211_CCMP_PN_LEN - 1; i >= 0; i--) {
                        pn[i]++;
                        if (pn[i])
                                break;
                }

                rpn = rx->ccm_gcm.pn;
                if (memcmp(pn, rpn, IEEE80211_CCMP_PN_LEN))
                        return RX_DROP_U_REPLAY;
                memcpy(entry->last_pn, pn, IEEE80211_CCMP_PN_LEN);
        } else if (entry->is_protected &&
                   (!rx->key ||
                    (!ieee80211_has_protected(fc) &&
                     !(status->flag & RX_FLAG_DECRYPTED)) ||
                    rx->key->color != entry->key_color)) {
                /* Drop this as a mixed key or fragment cache attack, even
                 * if for TKIP Michael MIC should protect us, and WEP is a
                 * lost cause anyway.
                 */
                return RX_DROP_U_EXPECT_DEFRAG_PROT;
        } else if (entry->is_protected && rx->key &&
                   entry->key_color != rx->key->color &&
                   (status->flag & RX_FLAG_DECRYPTED)) {
                /*
                 * NOTE(review): this branch looks unreachable - whenever
                 * is_protected is set, rx->key is non-NULL and the colors
                 * differ, the previous condition is already true.
                 */
                return RX_DROP_U_BAD_KEY_COLOR;
        }

        /* Strip the 802.11 header and append the payload to the entry. */
        skb_pull(rx->skb, ieee80211_hdrlen(fc));
        __skb_queue_tail(&entry->skb_list, rx->skb);
        entry->last_frag = frag;
        entry->extra_len += rx->skb->len;
        if (ieee80211_has_morefrags(fc)) {
                rx->skb = NULL;
                return RX_QUEUED;
        }

        /*
         * Last fragment: take the first fragment back out of the queue and
         * make room in it for the payload of all the others.
         */
        rx->skb = __skb_dequeue(&entry->skb_list);
        if (skb_tailroom(rx->skb) < entry->extra_len) {
                I802_DEBUG_INC(rx->local->rx_expand_skb_head_defrag);
                if (unlikely(pskb_expand_head(rx->skb, 0, entry->extra_len,
                                              GFP_ATOMIC))) {
                        I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag);
                        __skb_queue_purge(&entry->skb_list);
                        return RX_DROP_U_OOM;
                }
        }
        /* Copy the remaining fragments in order and free them. */
        while ((skb = __skb_dequeue(&entry->skb_list))) {
                skb_put_data(rx->skb, skb->data, skb->len);
                dev_kfree_skb(skb);
        }

 out:
        ieee80211_led_rx(rx->local);
        if (rx->sta)
                rx->link_sta->rx_stats.packets++;
        return RX_CONTINUE;
}

/*
 * Port access check: returns 0 when the frame comes from a known station
 * that has the WLAN_STA_AUTHORIZED flag set, -EACCES otherwise.
 */
static int ieee80211_802_1x_port_control(struct ieee80211_rx_data *rx)
{
        if (likely(rx->sta && test_sta_flag(rx->sta, WLAN_STA_AUTHORIZED)))
                return 0;

        return -EACCES;
}

/*
 * Decide whether a frame must be rejected for lacking encryption.
 *
 * Returns -EACCES for an unprotected data frame (other than a nullfunc
 * frame) when a key has been selected for it in rx->key, unless the frame
 * is flagged as already decrypted. Returns 0 in every other case.
 */
static int ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc)
{
        struct ieee80211_rx_status *rxs = IEEE80211_SKB_RXCB(rx->skb);

        /* already decrypted (e.g. by the hardware): nothing to enforce */
        if (rxs->flag & RX_FLAG_DECRYPTED)
                return 0;

        if (ieee80211_has_protected(fc))
                return 0;

        if (ieee80211_is_any_nullfunc(fc))
                return 0;

        if (!ieee80211_is_data(fc))
                return 0;

        /* plaintext data frame: only acceptable when no key is in use */
        if (unlikely(rx->key))
                return -EACCES;

        return 0;
}

/*
 * Decide whether a management frame that was not decrypted has to be
 * dropped because protection was expected.
 *
 * Frames flagged RX_FLAG_DECRYPTED always pass. Unprotected "protected dual
 * of public action" frames are always dropped. When the sending station has
 * the WLAN_STA_MFP flag set, the unicast/multicast robust management frame,
 * Beacon and Action frame checks below apply. Independently of MFP, robust
 * Action frames from a station that is unknown or not associated are
 * dropped.
 *
 * Returns RX_CONTINUE if the frame may be processed further, otherwise the
 * RX_DROP_U_* reason. Some dropped frames are additionally reported through
 * cfg80211_rx_unprot_mlme_mgmt().
 */
VISIBLE_IF_MAC80211_KUNIT ieee80211_rx_result
ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx)
{
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
        struct ieee80211_mgmt *mgmt = (void *)rx->skb->data;
        __le16 fc = mgmt->frame_control;

        /*
         * Pass through unencrypted frames if the hardware has
         * decrypted them already.
         */
        if (status->flag & RX_FLAG_DECRYPTED)
                return RX_CONTINUE;

        /* drop unicast protected dual (that wasn't protected) */
        if (ieee80211_is_action(fc) &&
            mgmt->u.action.category == WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION)
                return RX_DROP_U_UNPROT_DUAL;

        if (rx->sta && test_sta_flag(rx->sta, WLAN_STA_MFP)) {
                /* unicast robust management frame without protection */
                if (unlikely(!ieee80211_has_protected(fc) &&
                             ieee80211_is_unicast_robust_mgmt_frame(rx->skb))) {
                        if (ieee80211_is_deauth(fc) ||
                            ieee80211_is_disassoc(fc)) {
                                /*
                                 * Permit unprotected deauth/disassoc frames
                                 * during 4-way-HS (key is installed after HS).
                                 */
                                if (!rx->key)
                                        return RX_CONTINUE;

                                cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev,
                                                             rx->skb->data,
                                                             rx->skb->len);
                        }
                        return RX_DROP_U_UNPROT_UCAST_MGMT;
                }
                /* BIP does not use Protected field, so need to check MMIE */
                if (unlikely(ieee80211_is_multicast_robust_mgmt_frame(rx->skb) &&
                             ieee80211_get_mmie_keyidx(rx->skb) < 0)) {
                        if (ieee80211_is_deauth(fc) ||
                            ieee80211_is_disassoc(fc))
                                cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev,
                                                             rx->skb->data,
                                                             rx->skb->len);
                        return RX_DROP_U_UNPROT_MCAST_MGMT;
                }
                /* Beacon without an MMIE although a key was selected */
                if (unlikely(ieee80211_is_beacon(fc) && rx->key &&
                             ieee80211_get_mmie_keyidx(rx->skb) < 0)) {
                        cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev,
                                                     rx->skb->data,
                                                     rx->skb->len);
                        return RX_DROP_U_UNPROT_BEACON;
                }
                /*
                 * When using MFP, Action frames are not allowed prior to
                 * having configured keys.
                 */
                if (unlikely(ieee80211_is_action(fc) && !rx->key &&
                             ieee80211_is_robust_mgmt_frame(rx->skb)))
                        return RX_DROP_U_UNPROT_ACTION;

                /* drop unicast public action frames when using MFP */
                if (is_unicast_ether_addr(mgmt->da) &&
                    ieee80211_is_protected_dual_of_public_action(rx->skb))
                        return RX_DROP_U_UNPROT_UNICAST_PUB_ACTION;
        }

        /*
         * Drop robust action frames before assoc regardless of MFP state,
         * after assoc we also have decided on MFP or not.
         */
        if (ieee80211_is_action(fc) &&
            ieee80211_is_robust_mgmt_frame(rx->skb) &&
            (!rx->sta || !test_sta_flag(rx->sta, WLAN_STA_ASSOC)))
                return RX_DROP_U_UNPROT_ROBUST_ACTION;

        return RX_CONTINUE;
}
EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_drop_unencrypted_mgmt);

/*
 * Convert the 802.11 data frame in rx->skb into an Ethernet (802.3) frame,
 * after rejecting address formats that do not fit the interface type.
 *
 * *@port_control is set to true when the converted frame's protocol is the
 * interface's control port protocol, false otherwise.
 *
 * Returns RX_CONTINUE on success or one of the RX_DROP_U_* reasons:
 *  - AP_VLAN without a 4-addr station receiving a 4-address frame,
 *  - AP_VLAN with a 4-addr station receiving a multicast frame,
 *  - station interface not in 4-addr mode receiving a 4-address frame,
 *  - the conversion itself failing,
 *  - station interface in 4-addr mode receiving a 3-address frame that is
 *    not addressed to the interface and is not a control port frame.
 */
static ieee80211_rx_result
__ieee80211_data_to_8023(struct ieee80211_rx_data *rx, bool *port_control)
{
        struct ieee80211_sub_if_data *sdata = rx->sdata;
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
        bool has_a4 = ieee80211_has_a4(hdr->frame_control);
        bool only_port_control = false;
        struct ethhdr *eth;

        *port_control = false;

        switch (sdata->vif.type) {
        case NL80211_IFTYPE_AP_VLAN:
                if (has_a4 && !sdata->u.vlan.sta)
                        return RX_DROP_U_UNEXPECTED_VLAN_4ADDR;
                if (is_multicast_ether_addr(hdr->addr1) && sdata->u.vlan.sta)
                        return RX_DROP_U_UNEXPECTED_VLAN_MCAST;
                break;
        case NL80211_IFTYPE_STATION: {
                bool want_a4 = sdata->u.mgd.use_4addr;

                if (want_a4 == has_a4)
                        break;
                /* address format disagrees with the configured mode */
                if (!want_a4)
                        return RX_DROP_U_UNEXPECTED_STA_4ADDR;
                if (!ether_addr_equal(hdr->addr1, sdata->vif.addr))
                        only_port_control = true;
                break;
        }
        default:
                break;
        }

        if (ieee80211_data_to_8023(rx->skb, sdata->vif.addr,
                                   sdata->vif.type) < 0)
                return RX_DROP_U_INVALID_8023;

        /* rx->skb now starts with an Ethernet header */
        eth = (struct ethhdr *)rx->skb->data;
        if (eth->h_proto == rx->sdata->control_port_protocol) {
                *port_control = true;
                return RX_CONTINUE;
        }

        if (only_port_control)
                return RX_DROP_U_NOT_PORT_CONTROL;

        return RX_CONTINUE;
}

/*
 * Check whether @addr is one of the interface's own addresses.
 *
 * The interface address (sdata->vif.addr) is compared first. For an MLD
 * interface the address of every configured link is compared as well; on
 * such a match the link's index is stored in *@out_link_id if that pointer
 * is non-NULL. *@out_link_id is left untouched in all other cases.
 *
 * Returns true on a match, false otherwise.
 */
bool ieee80211_is_our_addr(struct ieee80211_sub_if_data *sdata,
                           const u8 *addr, int *out_link_id)
{
        unsigned int idx;

        /* non-MLO, or MLD address replaced by hardware */
        if (ether_addr_equal(sdata->vif.addr, addr))
                return true;

        /* only MLD interfaces have per-link addresses to look at */
        if (!ieee80211_vif_is_mld(&sdata->vif))
                return false;

        for (idx = 0; idx < ARRAY_SIZE(sdata->vif.link_conf); idx++) {
                struct ieee80211_bss_conf *link_conf =
                        rcu_dereference(sdata->vif.link_conf[idx]);

                if (!link_conf || !ether_addr_equal(link_conf->addr, addr))
                        continue;

                if (out_link_id)
                        *out_link_id = idx;
                return true;
        }

        return false;
}

/*
 * Decide whether the frame in rx->skb may be delivered.
 *
 * requires that rx->skb is a frame with ethernet header
 *
 * Control port (e.g. EAPOL) frames are judged by destination address only;
 * every other frame must pass both the 802.1X port check and the
 * unencrypted-frame check.
 */
static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx, __le16 fc)
{
        static const u8 pae_group_addr[ETH_ALEN] __aligned(2)
                = { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x03 };
        struct ethhdr *eth = (struct ethhdr *) rx->skb->data;

        if (likely(eth->h_proto != rx->sdata->control_port_protocol))
                return !ieee80211_802_1x_port_control(rx) &&
                       !ieee80211_drop_unencrypted(rx, fc);

        /*
         * Allow EAPOL frames to us/the PAE group address regardless of
         * whether the frame was encrypted or not, and always disallow
         * all other destination addresses for them.
         */
        if (ieee80211_is_our_addr(rx->sdata, eth->h_dest, NULL))
                return true;

        return ether_addr_equal(eth->h_dest, pae_group_addr);
}

/*
 * Hand a received Ethernet frame to its local consumer.
 *
 * Control port frames (and pre-authentication frames, unless
 * control_port_no_preauth is set) are passed to
 * cfg80211_rx_control_port() and freed when the interface is configured
 * for control port over nl80211. Every other frame goes to the network
 * stack: appended to rx->list if the caller supplied one, otherwise via
 * netif_receive_skb().
 */
static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb,
                                                 struct ieee80211_rx_data *rx)
{
        struct ieee80211_sub_if_data *sdata = rx->sdata;
        struct net_device *dev = sdata->dev;
        bool is_ctrl_port = skb->protocol == sdata->control_port_protocol;
        bool is_preauth = skb->protocol == cpu_to_be16(ETH_P_PREAUTH) &&
                          !sdata->control_port_no_preauth;
        struct ethhdr *eth;

        if (unlikely((is_ctrl_port || is_preauth) &&
                     sdata->control_port_over_nl80211)) {
                struct ieee80211_rx_status *rxs = IEEE80211_SKB_RXCB(skb);
                bool noencrypt = !(rxs->flag & RX_FLAG_DECRYPTED);

                cfg80211_rx_control_port(dev, skb, noencrypt, rx->link_id);
                dev_kfree_skb(skb);
                return;
        }

        eth = (void *)skb_mac_header(skb);

        /* the RX status in the control buffer is not needed past here */
        memset(skb->cb, 0, sizeof(skb->cb));

        /*
         * 802.1X over 802.11 wants EAPOL frames sent to the authenticator
         * address, but 802.1X also permits the PAE group address. If this
         * interface sits in a bridge, a frame to the PAE group address
         * would be forwarded to the network (even for a client that is not
         * associated yet), which must not happen.
         * So rewrite the destination to our own address: the authenticator
         * (e.g. hostapd) still sees the frame, but the bridge does not
         * forward it. Due to earlier filtering the only other address here
         * can be the PAE group address, unless the hardware let others
         * through in 802.3 offloaded mode.
         */
        if (unlikely(is_ctrl_port &&
                     !ether_addr_equal(eth->h_dest, sdata->vif.addr)))
                ether_addr_copy(eth->h_dest, sdata->vif.addr);

        /* deliver to local stack */
        if (rx->list) {
                list_add_tail(&skb->list, rx->list);
                return;
        }

        netif_receive_skb(skb);
}

/*
 * requires that rx->skb is a frame with ethernet header
 */
static void
ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
{
        struct ieee80211_sub_if_data *sdata = rx->sdata;
        struct net_device *dev = sdata->dev;
        struct sk_buff *skb, *xmit_skb;
        struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data;
        struct sta_info *dsta;

        skb = rx->skb;
        xmit_skb = NULL;

        dev_sw_netstats_rx_add(dev, skb->len);

        if (rx->sta) {
                /* The seqno index has the same property as needed
                 * for the rx_msdu field, i.e. it is IEEE80211_NUM_TIDS
                 * for non-QoS-data frames. Here we know it's a data
                 * frame, so count MSDUs.
                 */
                u64_stats_update_begin(&rx->link_sta->rx_stats.syncp);
                rx->link_sta->rx_stats.msdu[rx->seqno_idx]++;
                u64_stats_update_end(&rx->link_sta->rx_stats.syncp);
        }

        if ((sdata->vif.type == NL80211_IFTYPE_AP ||
             sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
            !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
            ehdr->h_proto != rx->sdata->control_port_protocol &&
            (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) {
                if (is_multicast_ether_addr(ehdr->h_dest) &&
                    ieee80211_vif_get_num_mcast_if(sdata) != 0) {
                        /*
                         * send multicast frames both to higher layers in
                         * local net stack and back to the wireless medium
                         */
                        xmit_skb = skb_copy(skb, GFP_ATOMIC);
                        if (!xmit_skb)
                                net_info_ratelimited("%s: failed to clone multicast frame\n",
                                                    dev->name);
                } else if (!is_multicast_ether_addr(ehdr->h_dest) &&
                           !ether_addr_equal(ehdr->h_dest, ehdr->h_source)) {
                        dsta = sta_info_get(sdata, ehdr->h_dest);
                        if (dsta) {
                                /*
                                 * The destination station is associated to
                                 * this AP (in this VLAN), so send the frame
                                 * directly to it and do not pass it to local
                                 * net stack.
                                 */
                                xmit_skb = skb;
                                skb = NULL;
                        }
                }
        }

#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
        if (skb) {
                /* 'align' will only take the values 0 or 2 here since all
                 * frames are required to be aligned to 2-byte boundaries
                 * when being passed to mac80211; the code here works just
                 * as well if that isn't true, but mac80211 assumes it can
                 * access fields as 2-byte aligned (e.g. for ether_addr_equal)
                 */
                int align;

                align = (unsigned long)(skb->data + sizeof(struct ethhdr)) & 3;
                if (align) {
                        if (WARN_ON(skb_headroom(skb) < 3)) {
                                dev_kfree_skb(skb);
                                skb = NULL;
                        } else {
                                u8 *data = skb->data;
                                size_t len = skb_headlen(skb);
                                skb->data -= align;
                                memmove(skb->data, data, len);
                                skb_set_tail_pointer(skb, len);
                        }
                }
        }
#endif

        if (skb) {
                skb->protocol = eth_type_trans(skb, dev);
                ieee80211_deliver_skb_to_local_stack(skb, rx);
        }

        if (xmit_skb) {
                /*
                 * Send to wireless media and increase priority by 256 to
                 * keep the received priority instead of reclassifying
                 * the frame (see cfg80211_classify8021d).
                 */
                xmit_skb->priority += 256;
                xmit_skb->protocol = htons(ETH_P_802_3);
                skb_reset_network_header(xmit_skb);
                skb_reset_mac_header(xmit_skb);
                dev_queue_xmit(xmit_skb);
        }
}

#ifdef CONFIG_MAC80211_MESH
/*
 * Try to forward a received mesh frame through the mesh fast-xmit cache.
 *
 * @skb starts with an ethernet header that is immediately followed by the
 * mesh control header. @hdrlen is supplied by the caller (the visible
 * caller passes the mesh header length).
 *
 * Returns true if the frame was handed to __ieee80211_xmit_fast() (the skb
 * is consumed), false if the caller has to forward it the regular way.
 * Note that skb_linearize() may already have run when false is returned.
 */
static bool
ieee80211_rx_mesh_fast_forward(struct ieee80211_sub_if_data *sdata,
                               struct sk_buff *skb, int hdrlen)
{
        struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
        struct ieee80211_mesh_fast_tx_key key = {
                .type = MESH_FAST_TX_TYPE_FORWARDED
        };
        struct ieee80211_mesh_fast_tx *entry;
        struct ieee80211s_hdr *mesh_hdr;
        struct tid_ampdu_tx *tid_tx;
        struct sta_info *sta;
        struct ethhdr eth;
        u8 tid;

        /*
         * Pick the cache lookup address: eaddr1 when the A5/A6 address
         * extension is present, the leading address of the ethernet header
         * when there is no address extension at all. Any other AE mode is
         * not handled by the fast path.
         */
        mesh_hdr = (struct ieee80211s_hdr *)(skb->data + sizeof(eth));
        if ((mesh_hdr->flags & MESH_FLAGS_AE) == MESH_FLAGS_AE_A5_A6)
                ether_addr_copy(key.addr, mesh_hdr->eaddr1);
        else if (!(mesh_hdr->flags & MESH_FLAGS_AE))
                ether_addr_copy(key.addr, skb->data);
        else
                return false;

        entry = mesh_fast_tx_get(sdata, &key);
        if (!entry)
                return false;

        sta = rcu_dereference(entry->mpath->next_hop);
        if (!sta)
                return false;

        /* mesh_hdr must not be used past this point: the head may move */
        if (skb_linearize(skb))
                return false;

        tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
        tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]);
        if (tid_tx) {
                /* a TX aggregation session that is not operational yet
                 * sends the frame down the regular path instead
                 */
                if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state))
                        return false;

                if (tid_tx->timeout)
                        tid_tx->last_tx = jiffies;
        }

        ieee80211_aggr_check(sdata, sta, skb);

        /*
         * NOTE(review): the offset is taken from skb->data, which still
         * points at the ethernet header here - confirm that hdrlen is meant
         * to be relative to that.
         */
        if (ieee80211_get_8023_tunnel_proto(skb->data + hdrlen,
                                            &skb->protocol))
                hdrlen += ETH_ALEN;
        else
                skb->protocol = htons(skb->len - hdrlen);
        skb_set_network_header(skb, hdrlen + 2);

        skb->dev = sdata->dev;
        /* save DA/SA (everything but the 2-byte type field) before pulling */
        memcpy(&eth, skb->data, ETH_HLEN - 2);
        skb_pull(skb, 2);
        __ieee80211_xmit_fast(sdata, sta, &entry->fast_tx, skb, tid_tx,
                              eth.h_dest, eth.h_source);
        IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_unicast);
        IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_frames);

        return true;
}
#endif

/*
 * Mesh handling for one received frame that carries an ethernet header
 * followed by the mesh control header.
 *
 * Updates proxy path information, and forwards the frame to other mesh
 * nodes where required.
 *
 * Returns:
 *  RX_CONTINUE - the caller should keep processing @skb (always the case
 *                for non-mesh interfaces and !CONFIG_MAC80211_MESH builds;
 *                for mesh interfaces ieee80211_strip_8023_mesh_hdr() has
 *                been applied to @skb);
 *  RX_QUEUED   - @skb was consumed here (forwarded, or freed after a failed
 *                next-hop lookup);
 *  RX_DROP_*   - the frame is to be dropped by the caller.
 */
static ieee80211_rx_result
ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta,
                       struct sk_buff *skb)
{
#ifdef CONFIG_MAC80211_MESH
        struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
        struct ieee80211_local *local = sdata->local;
        uint16_t fc = IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_DATA;
        struct ieee80211_hdr hdr = {
                .frame_control = cpu_to_le16(fc)
        };
        struct ieee80211_hdr *fwd_hdr;
        struct ieee80211s_hdr *mesh_hdr;
        struct ieee80211_tx_info *info;
        struct sk_buff *fwd_skb;
        struct ethhdr *eth;
        bool multicast;
        int tailroom = 0;
        int hdrlen, mesh_hdrlen;
        u8 *qos;

        if (!ieee80211_vif_is_mesh(&sdata->vif))
                return RX_CONTINUE;

        /* need the fixed part of the mesh header to learn its full length */
        if (!pskb_may_pull(skb, sizeof(*eth) + 6))
                return RX_DROP_MONITOR;

        mesh_hdr = (struct ieee80211s_hdr *)(skb->data + sizeof(*eth));
        mesh_hdrlen = ieee80211_get_mesh_hdrlen(mesh_hdr);

        if (!pskb_may_pull(skb, sizeof(*eth) + mesh_hdrlen))
                return RX_DROP_MONITOR;

        /* pskb_may_pull() may have moved the head: re-derive the pointers */
        eth = (struct ethhdr *)skb->data;
        multicast = is_multicast_ether_addr(eth->h_dest);

        mesh_hdr = (struct ieee80211s_hdr *)(eth + 1);
        if (!mesh_hdr->ttl)
                return RX_DROP_MONITOR;

        /* frame is in RMC, don't forward */
        if (is_multicast_ether_addr(eth->h_dest) &&
            mesh_rmc_check(sdata, eth->h_source, mesh_hdr))
                return RX_DROP_MONITOR;

        /* forward packet */
        if (sdata->crypto_tx_tailroom_needed_cnt)
                tailroom = IEEE80211_ENCRYPT_TAILROOM;

        if (mesh_hdr->flags & MESH_FLAGS_AE) {
                struct mesh_path *mppath;
                char *proxied_addr;
                bool update = false;

                if (multicast)
                        proxied_addr = mesh_hdr->eaddr1;
                else if ((mesh_hdr->flags & MESH_FLAGS_AE) == MESH_FLAGS_AE_A5_A6)
                        /* has_a4 already checked in ieee80211_rx_mesh_check */
                        proxied_addr = mesh_hdr->eaddr2;
                else
                        return RX_DROP_MONITOR;

                /* learn or refresh which mesh node proxies proxied_addr */
                rcu_read_lock();
                mppath = mpp_path_lookup(sdata, proxied_addr);
                if (!mppath) {
                        mpp_path_add(sdata, proxied_addr, eth->h_source);
                } else {
                        spin_lock_bh(&mppath->state_lock);
                        if (!ether_addr_equal(mppath->mpp, eth->h_source)) {
                                memcpy(mppath->mpp, eth->h_source, ETH_ALEN);
                                update = true;
                        }
                        mppath->exp_time = jiffies;
                        spin_unlock_bh(&mppath->state_lock);
                }

                /* flush fast xmit cache if the address path changed */
                if (update)
                        mesh_fast_tx_flush_addr(sdata, proxied_addr);

                rcu_read_unlock();
        }

        /* Frame has reached destination.  Don't forward */
        if (ether_addr_equal(sdata->vif.addr, eth->h_dest))
                goto rx_accept;

        /* TTL exhausted: multicast is still accepted locally, unicast dies */
        if (!--mesh_hdr->ttl) {
                if (multicast)
                        goto rx_accept;

                IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_ttl);
                return RX_DROP_MONITOR;
        }

        if (!ifmsh->mshcfg.dot11MeshForwarding) {
                if (is_multicast_ether_addr(eth->h_dest))
                        goto rx_accept;

                return RX_DROP_MONITOR;
        }

        skb_set_queue_mapping(skb, ieee802_1d_to_ac[skb->priority]);

        if (!multicast &&
            ieee80211_rx_mesh_fast_forward(sdata, skb, mesh_hdrlen))
                return RX_QUEUED;

        /*
         * The fast-forward attempt may have linearized the skb before
         * giving up, and linearizing can reallocate the head. Re-derive
         * the ethernet header pointer so the address reads below do not
         * go through a stale pointer. skb->data itself still refers to
         * the ethernet header whenever the fast path returns false.
         */
        eth = (struct ethhdr *)skb->data;

        ieee80211_fill_mesh_addresses(&hdr, &hdr.frame_control,
                                      eth->h_dest, eth->h_source);
        hdrlen = ieee80211_hdrlen(hdr.frame_control);
        if (multicast) {
                /* multicast: forward a copy, the original is delivered locally */
                int extra_head = sizeof(struct ieee80211_hdr) - sizeof(*eth);

                fwd_skb = skb_copy_expand(skb, local->tx_headroom + extra_head +
                                               IEEE80211_ENCRYPT_HEADROOM,
                                          tailroom, GFP_ATOMIC);
                if (!fwd_skb)
                        goto rx_accept;
        } else {
                /* unicast: the received skb itself is forwarded */
                fwd_skb = skb;
                skb = NULL;

                if (skb_cow_head(fwd_skb, hdrlen - sizeof(struct ethhdr)))
                        return RX_DROP_U_OOM;

                if (skb_linearize(fwd_skb))
                        return RX_DROP_U_OOM;
        }

        /*
         * Replace the ethernet header by the prepared 802.11 header. The
         * last two header bytes are the QoS control field, cleared below
         * instead of being copied from the template.
         */
        fwd_hdr = skb_push(fwd_skb, hdrlen - sizeof(struct ethhdr));
        memcpy(fwd_hdr, &hdr, hdrlen - 2);
        qos = ieee80211_get_qos_ctl(fwd_hdr);
        qos[0] = qos[1] = 0;

        skb_reset_mac_header(fwd_skb);
        hdrlen += mesh_hdrlen;
        if (ieee80211_get_8023_tunnel_proto(fwd_skb->data + hdrlen,
                                            &fwd_skb->protocol))
                hdrlen += ETH_ALEN;
        else
                fwd_skb->protocol = htons(fwd_skb->len - hdrlen);
        skb_set_network_header(fwd_skb, hdrlen + 2);

        info = IEEE80211_SKB_CB(fwd_skb);
        memset(info, 0, sizeof(*info));
        info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING;
        info->control.vif = &sdata->vif;
        info->control.jiffies = jiffies;
        fwd_skb->dev = sdata->dev;
        if (multicast) {
                IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_mcast);
                memcpy(fwd_hdr->addr2, sdata->vif.addr, ETH_ALEN);
                /* update power mode indication when forwarding */
                ieee80211_mps_set_frame_flags(sdata, NULL, fwd_hdr);
        } else if (!mesh_nexthop_lookup(sdata, fwd_skb)) {
                /* mesh power mode flags updated in mesh_nexthop_lookup */
                IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_unicast);
        } else {
                /* unable to resolve next hop */
                if (sta)
                        mesh_path_error_tx(sdata, ifmsh->mshcfg.element_ttl,
                                           hdr.addr3, 0,
                                           WLAN_REASON_MESH_PATH_NOFORWARD,
                                           sta->sta.addr);
                IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_no_route);
                kfree_skb(fwd_skb);
                goto rx_accept;
        }

        IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_frames);
        ieee80211_add_pending_skb(local, fwd_skb);

rx_accept:
        /* skb is NULL when the received buffer itself was forwarded/freed */
        if (!skb)
                return RX_QUEUED;

        ieee80211_strip_8023_mesh_hdr(skb);
#endif

        return RX_CONTINUE;
}

/*
 * Split the A-MSDU held in rx->skb into its subframes and run each of them
 * through mesh handling, the port-control filter and delivery.
 *
 * @data_offset is handed through to ieee80211_data_to_8023_exthdr().
 * rx->sta is dereferenced unconditionally, so callers must have checked it.
 *
 * Returns RX_DROP_U_BAD_AMSDU when the header conversion fails, RX_QUEUED
 * otherwise (subframes that are not delivered are freed here).
 */
static ieee80211_rx_result debug_noinline
__ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx, u8 data_offset)
{
        struct net_device *dev = rx->sdata->dev;
        struct sk_buff *skb = rx->skb;
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
        __le16 fc = hdr->frame_control;
        struct sk_buff_head subframes;
        ieee80211_rx_result res;
        struct ethhdr ethhdr;
        const u8 *check_da = ethhdr.h_dest;
        const u8 *check_sa = ethhdr.h_source;

        /*
         * Work out which of the outer DA/SA are passed on as check
         * addresses; a NULL pointer is passed for the ones that are not.
         */
        if (unlikely(ieee80211_has_a4(fc))) {
                check_da = NULL;
                check_sa = NULL;
        } else if (rx->sdata->vif.type == NL80211_IFTYPE_AP ||
                   rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
                check_da = NULL;
        } else if (rx->sdata->vif.type == NL80211_IFTYPE_STATION) {
                /* the SA is only checked for TDLS peers */
                if (!rx->sta || !test_sta_flag(rx->sta, WLAN_STA_TDLS_PEER))
                        check_sa = NULL;
        } else if (rx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT) {
                check_sa = NULL;
                check_da = NULL;
        }

        skb->dev = dev;
        __skb_queue_head_init(&subframes);

        if (ieee80211_data_to_8023_exthdr(skb, &ethhdr,
                                          rx->sdata->vif.addr,
                                          rx->sdata->vif.type,
                                          data_offset, true))
                return RX_DROP_U_BAD_AMSDU;

        /*
         * Mesh control format not determined yet for this station: probe
         * formats 0..2 and remember the result only if exactly one of them
         * validates; otherwise it stays undetermined (-1).
         */
        if (rx->sta->amsdu_mesh_control < 0) {
                s8 found = -1;
                int mode = 0;

                while (mode <= 2) {
                        if (ieee80211_is_valid_amsdu(skb, mode)) {
                                if (found >= 0) {
                                        /* ambiguous */
                                        found = -1;
                                        break;
                                }
                                found = mode;
                        }
                        mode++;
                }

                rx->sta->amsdu_mesh_control = found;
        }

        ieee80211_amsdu_to_8023s(skb, &subframes, dev->dev_addr,
                                 rx->sdata->vif.type,
                                 rx->local->hw.extra_tx_headroom,
                                 check_da, check_sa,
                                 rx->sta->amsdu_mesh_control);

        while (!skb_queue_empty(&subframes)) {
                rx->skb = __skb_dequeue(&subframes);

                res = ieee80211_rx_mesh_data(rx->sdata, rx->sta, rx->skb);
                if (res == RX_QUEUED)
                        continue;

                if (res == RX_CONTINUE && ieee80211_frame_allowed(rx, fc)) {
                        ieee80211_deliver_skb(rx);
                        continue;
                }

                /* dropped by mesh handling or by the port-control filter */
                dev_kfree_skb(rx->skb);
        }

        return RX_QUEUED;
}

/*
 * RX handler for A-MSDU frames: validate the frame and hand it to
 * __ieee80211_rx_h_amsdu() for splitting and delivery.
 *
 * Frames not flagged IEEE80211_RX_AMSDU and non-data frames are passed on
 * with RX_CONTINUE.
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
{
        struct sk_buff *skb = rx->skb;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
        __le16 fc = hdr->frame_control;

        if (!(status->rx_flags & IEEE80211_RX_AMSDU) ||
            unlikely(!ieee80211_is_data(fc)))
                return RX_CONTINUE;

        if (unlikely(!ieee80211_is_data_present(fc)))
                return RX_DROP_MONITOR;

        /* 4-address frames are only acceptable on some interface setups */
        if (unlikely(ieee80211_has_a4(fc))) {
                bool allowed;

                switch (rx->sdata->vif.type) {
                case NL80211_IFTYPE_AP_VLAN:
                        allowed = !!rx->sdata->u.vlan.sta;
                        break;
                case NL80211_IFTYPE_STATION:
                        allowed = !!rx->sdata->u.mgd.use_4addr;
                        break;
                case NL80211_IFTYPE_MESH_POINT:
                        allowed = true;
                        break;
                default:
                        allowed = false;
                        break;
                }

                if (!allowed)
                        return RX_DROP_U_BAD_4ADDR;
        }

        /* A-MSDUs must be unicast and come from a known station */
        if (is_multicast_ether_addr(hdr->addr1) || !rx->sta)
                return RX_DROP_U_BAD_AMSDU;

        if (rx->key) {
                u32 cipher = rx->key->conf.cipher;

                /*
                 * A-MSDUs are not expected on pre-HT connections, and HT
                 * connections cannot use the old ciphers, so in those cases
                 * there cannot be SPP A-MSDUs or the like either: drop.
                 */
                if (cipher == WLAN_CIPHER_SUITE_WEP40 ||
                    cipher == WLAN_CIPHER_SUITE_WEP104 ||
                    cipher == WLAN_CIPHER_SUITE_TKIP)
                        return RX_DROP_U_BAD_AMSDU_CIPHER;
        }

        return __ieee80211_rx_h_amsdu(rx, 0);
}

/*
 * RX handler for data frames.
 *
 * Converts the frame to 802.3 format, runs mesh handling and the
 * port-control filter, diverts TDLS channel switch requests/responses via
 * __ieee80211_queue_skb_to_iface() and delivers everything else through
 * ieee80211_deliver_skb().
 *
 * Returns RX_CONTINUE for non-data frames, RX_QUEUED once the frame has
 * been consumed, or a drop result.
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_data(struct ieee80211_rx_data *rx)
{
        struct ieee80211_sub_if_data *sdata = rx->sdata;
        struct ieee80211_local *local = rx->local;
        struct net_device *dev = sdata->dev;
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
        /* cached: hdr no longer describes the frame after 802.3 conversion */
        __le16 fc = hdr->frame_control;
        ieee80211_rx_result res;
        bool port_control;

        if (unlikely(!ieee80211_is_data(hdr->frame_control)))
                return RX_CONTINUE;

        if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
                return RX_DROP_MONITOR;

        /*
         * Send unexpected-4addr-frame event to hostapd. For older versions,
         * also drop the frame to cooked monitor interfaces.
         */
        if (ieee80211_has_a4(hdr->frame_control) &&
            sdata->vif.type == NL80211_IFTYPE_AP) {
                if (rx->sta &&
                    !test_and_set_sta_flag(rx->sta, WLAN_STA_4ADDR_EVENT))
                        cfg80211_rx_unexpected_4addr_frame(
                                rx->sdata->dev, rx->sta->sta.addr, GFP_ATOMIC);
                return RX_DROP_MONITOR;
        }

        res = __ieee80211_data_to_8023(rx, &port_control);
        if (unlikely(res != RX_CONTINUE))
                return res;

        res = ieee80211_rx_mesh_data(rx->sdata, rx->sta, rx->skb);
        if (res != RX_CONTINUE)
                return res;

        if (!ieee80211_frame_allowed(rx, fc))
                return RX_DROP_MONITOR;

        /* directly handle TDLS channel switch requests/responses */
        if (unlikely(((struct ethhdr *)rx->skb->data)->h_proto ==
                                                cpu_to_be16(ETH_P_TDLS)) &&
            pskb_may_pull(rx->skb,
                          offsetof(struct ieee80211_tdls_data, u))) {
                /*
                 * Take the payload pointer only after pskb_may_pull():
                 * pulling may reallocate the skb head, which would leave
                 * a pointer obtained earlier dangling.
                 */
                struct ieee80211_tdls_data *tf = (void *)rx->skb->data;

                if (tf->payload_type == WLAN_TDLS_SNAP_RFTYPE &&
                    tf->category == WLAN_CATEGORY_TDLS &&
                    (tf->action_code == WLAN_TDLS_CHANNEL_SWITCH_REQUEST ||
                     tf->action_code == WLAN_TDLS_CHANNEL_SWITCH_RESPONSE)) {
                        rx->skb->protocol = cpu_to_be16(ETH_P_TDLS);
                        __ieee80211_queue_skb_to_iface(sdata, rx->link_id,
                                                       rx->sta, rx->skb);
                        return RX_QUEUED;
                }
        }

        /*
         * Port control frames received on an AP_VLAN are delivered on the
         * AP interface the VLAN belongs to.
         */
        if (rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
            unlikely(port_control) && sdata->bss) {
                sdata = container_of(sdata->bss, struct ieee80211_sub_if_data,
                                     u.ap);
                dev = sdata->dev;
                rx->sdata = sdata;
        }

        rx->skb->dev = dev;

        /*
         * Unicast traffic re-arms the dynamic powersave timer when dynamic
         * PS is not handled by the hardware, unless we are scanning or
         * off-channel.
         */
        if (!ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS) &&
            local->ps_sdata && local->hw.conf.dynamic_ps_timeout > 0 &&
            !is_multicast_ether_addr(
                    ((struct ethhdr *)rx->skb->data)->h_dest) &&
            (!local->scanning &&
             !test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state)))
                mod_timer(&local->dynamic_ps_timer, jiffies +
                          msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout));

        ieee80211_deliver_skb(rx);

        return RX_QUEUED;
}

/*
 * RX handler for control frames.
 *
 * Non-control frames are passed on (RX_CONTINUE). Of the control frames
 * only BlockAckReq is processed: stored reorder frames up to the BAR's
 * starting sequence number are released into @frames, the driver is
 * notified and the BAR is consumed (RX_QUEUED). Everything else results
 * in RX_DROP_MONITOR.
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames)
{
        struct sk_buff *skb = rx->skb;
        struct ieee80211_bar *bar = (struct ieee80211_bar *)skb->data;
        struct {
                __le16 control, start_seq_num;
        } __packed bar_fields;
        struct ieee80211_event event;
        struct tid_ampdu_rx *agg;
        u16 ssn;
        u16 tid;

        if (likely(!ieee80211_is_ctl(bar->frame_control)))
                return RX_CONTINUE;

        /*
         * Only management frames are of interest to the handlers that
         * follow, so any control frame other than a BlockAckReq can be
         * dropped to cooked monitor interfaces right away.
         */
        if (!ieee80211_is_back_req(bar->frame_control))
                return RX_DROP_MONITOR;

        if (!rx->sta)
                return RX_DROP_MONITOR;

        /* fetch the BAR control and SSN fields without assuming linearity */
        if (skb_copy_bits(skb, offsetof(struct ieee80211_bar, control),
                          &bar_fields, sizeof(bar_fields)))
                return RX_DROP_MONITOR;

        tid = le16_to_cpu(bar_fields.control) >> 12;

        /* BAR for a TID without a valid session: send one DELBA per TID */
        if (!test_bit(tid, rx->sta->ampdu_mlme.agg_session_valid) &&
            !test_and_set_bit(tid, rx->sta->ampdu_mlme.unexpected_agg))
                ieee80211_send_delba(rx->sdata, rx->sta->sta.addr, tid,
                                     WLAN_BACK_RECIPIENT,
                                     WLAN_REASON_QSTA_REQUIRE_SETUP);

        agg = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]);
        if (!agg)
                return RX_DROP_MONITOR;

        ssn = le16_to_cpu(bar_fields.start_seq_num) >> 4;
        event = (struct ieee80211_event) {
                .type = BAR_RX_EVENT,
                .u.ba = {
                        .tid = tid,
                        .ssn = ssn,
                        .sta = &rx->sta->sta,
                },
        };

        /* reset session timer */
        if (agg->timeout)
                mod_timer(&agg->session_timer,
                          TU_TO_EXP_TIME(agg->timeout));

        /* release stored frames up to start of BAR */
        spin_lock(&agg->reorder_lock);
        ieee80211_release_reorder_frames(rx->sdata, agg, ssn, frames);
        spin_unlock(&agg->reorder_lock);

        drv_event_callback(rx->local, rx->sdata, &event);

        kfree_skb(skb);
        return RX_QUEUED;
}

/*
 * Answer an SA Query request with an SA Query response.
 *
 * The request is silently ignored unless it is addressed to our own
 * address, sent by the BSSID stored in sdata->deflink.u.mgd.bssid and at
 * least @len bytes long enough to hold the SA Query fields. The response
 * echoes the transaction id of the request.
 */
static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
                                           struct ieee80211_mgmt *mgmt,
                                           size_t len)
{
        struct ieee80211_local *local = sdata->local;
        const u8 *own_addr = sdata->vif.addr;
        const u8 *bssid = sdata->deflink.u.mgd.bssid;
        struct ieee80211_mgmt *resp;
        struct sk_buff *skb;

        /* Not to own unicast address */
        if (!ether_addr_equal(mgmt->da, own_addr))
                return;

        /* Not from the current AP or not associated yet. */
        if (!ether_addr_equal(mgmt->sa, bssid) ||
            !ether_addr_equal(mgmt->bssid, bssid))
                return;

        /* Too short SA Query request frame */
        if (len < 24 + 1 + sizeof(resp->u.action.u.sa_query))
                return;

        skb = dev_alloc_skb(sizeof(*resp) + local->hw.extra_tx_headroom);
        if (!skb)
                return;

        skb_reserve(skb, local->hw.extra_tx_headroom);

        /* 24-byte management header, zeroed before the fields are set */
        resp = skb_put_zero(skb, 24);
        memcpy(resp->da, mgmt->sa, ETH_ALEN);
        memcpy(resp->sa, own_addr, ETH_ALEN);
        memcpy(resp->bssid, bssid, ETH_ALEN);
        resp->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
                                          IEEE80211_STYPE_ACTION);

        /* category byte followed by the SA Query body */
        skb_put(skb, 1 + sizeof(resp->u.action.u.sa_query));
        resp->u.action.category = WLAN_CATEGORY_SA_QUERY;
        resp->u.action.u.sa_query.action = WLAN_ACTION_SA_QUERY_RESPONSE;
        memcpy(resp->u.action.u.sa_query.trans_id,
               mgmt->u.action.u.sa_query.trans_id,
               WLAN_SA_QUERY_TR_ID_LEN);

        ieee80211_tx_skb(sdata, skb);
}

static void
ieee80211_rx_check_bss_color_collision(struct ieee80211_rx_data *rx)
{
        struct ieee80211_mgmt *mgmt = (void *)rx->skb->data;
        const struct element *ie;
        size_t baselen;

        if (!wiphy_ext_feature_isset(rx->local->hw.wiphy,
                                     NL80211_EXT_FEATURE_BSS_COLOR))
                return;

        if (ieee80211_hw_check(&rx->local->hw, DETECTS_COLOR_COLLISION))
                return;

        if (rx->link->conf->csa_active)
                return;

        baselen = mgmt->u.beacon.variable - rx->skb->data;
        if (baselen > rx->skb->len)
                return;

        ie = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION,
                                    mgmt->u.beacon.variable,
                                    rx->skb->len - baselen);
        if (ie && ie->datalen >= sizeof(struct ieee80211_he_operation) &&
            ie->datalen >= ieee80211_he_oper_size(ie->data + 1)) {
                struct ieee80211_bss_conf *bss_conf = rx->link->conf;
                const struct ieee80211_he_operation *he_oper;
                u8 color;

                he_oper = (void *)(ie->data + 1);
                if (le32_get_bits(he_oper->he_oper_params,
                                  IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED))
                        return;

                color = le32_get_bits(he_oper->he_oper_params,
                                      IEEE80211_HE_OPERATION_BSS_COLOR_MASK);
                if (color == bss_conf->he_bss_color.color)
                        ieee80211_obss_color_collision_notify(&rx->sdata->vif,
                                                              BIT_ULL(color),
                                                              bss_conf->link_id);
        }
}

/*
 * RX handler doing the basic sanity checks for management frames.
 *
 * S1G beacons are passed on unchecked. Anything shorter than 24 bytes or
 * not a management frame is dropped, as are runt action frames. On AP
 * interfaces a beacon is reported to cfg80211 once per frame
 * (IEEE80211_RX_BEACON_REPORTED). The final verdict comes from
 * ieee80211_drop_unencrypted_mgmt().
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx)
{
        struct sk_buff *skb = rx->skb;
        struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        __le16 fc = mgmt->frame_control;

        if (ieee80211_is_s1g_beacon(fc))
                return RX_CONTINUE;

        /*
         * Only management frames matter from here on: data and control
         * frames have been handled already, and unknown (reserved) frame
         * types are of no use.
         */
        if (skb->len < 24 || !ieee80211_is_mgmt(fc))
                return RX_DROP_MONITOR;

        /* drop too small action frames */
        if (ieee80211_is_action(fc) && skb->len < IEEE80211_MIN_ACTION_SIZE)
                return RX_DROP_U_RUNT_ACTION;

        if (rx->sdata->vif.type == NL80211_IFTYPE_AP &&
            ieee80211_is_beacon(fc) &&
            !(rx->flags & IEEE80211_RX_BEACON_REPORTED)) {
                int sig;

                /* sw bss color collision detection */
                ieee80211_rx_check_bss_color_collision(rx);

                /* signal is reported as 0 unless a dBm value is available */
                if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM) &&
                    !(status->flag & RX_FLAG_NO_SIGNAL_VAL))
                        sig = status->signal;
                else
                        sig = 0;

                cfg80211_report_obss_beacon_khz(rx->local->hw.wiphy,
                                                skb->data, skb->len,
                                                ieee80211_rx_status_to_khz(status),
                                                sig);
                rx->flags |= IEEE80211_RX_BEACON_REPORTED;
        }

        return ieee80211_drop_unencrypted_mgmt(rx);
}

static bool
ieee80211_process_rx_twt_action(struct ieee80211_rx_data *rx)
{
        struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)rx->skb->data;
        struct ieee80211_sub_if_data *sdata = rx->sdata;

        /* TWT actions are only supported in AP for the moment */
        if (sdata->vif.type != NL80211_IFTYPE_AP)
                return false;

        if (!rx->local->ops->add_twt_setup)
                return false;

        if (!sdata->vif.bss_conf.twt_responder)
                return false;

        if (!rx->sta)
                return false;

        switch (mgmt->u.action.u.s1g.action_code) {
        case WLAN_S1G_TWT_SETUP: {
                struct ieee80211_twt_setup *twt;

                if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE +
                                   1 + /* action code */
                                   sizeof(struct ieee80211_twt_setup) +
                                   2 /* TWT req_type agrt */)
                        break;

                twt = (void *)mgmt->u.action.u.s1g.variable;
                if (twt->element_id != WLAN_EID_S1G_TWT)
                        break;

                if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE +
                                   4 + /* action code + token + tlv */
                                   twt->length)
                        break;

                return true; /* queue the frame */
        }
        case WLAN_S1G_TWT_TEARDOWN:
                if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE + 2)
                        break;

                return true; /* queue the frame */
        default:
                break;
        }

        return false;
}

/*
 * RX handler for the action frames the kernel processes itself.
 *
 * Non-action frames are passed on with RX_CONTINUE. Frames without a
 * station entry are dropped unless their category is public,
 * self-protected or spectrum management. Each supported category then
 * checks the frame length and the interface type and leaves through one
 * of four exits:
 *
 *  - handled: the frame was processed inline; the skb is freed and
 *    RX_QUEUED is returned
 *  - queue:   the skb is passed to ieee80211_queue_skb_to_iface() and
 *    RX_QUEUED is returned
 *  - invalid: IEEE80211_RX_MALFORMED_ACTION_FRM is set in the RX status
 *    and RX_CONTINUE is returned
 *  - break:   RX_CONTINUE is returned without touching the frame
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
{
        struct ieee80211_local *local = rx->local;
        struct ieee80211_sub_if_data *sdata = rx->sdata;
        struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
        int len = rx->skb->len;

        if (!ieee80211_is_action(mgmt->frame_control))
                return RX_CONTINUE;

        /* without a station entry only these three categories are accepted */
        if (!rx->sta && mgmt->u.action.category != WLAN_CATEGORY_PUBLIC &&
            mgmt->u.action.category != WLAN_CATEGORY_SELF_PROTECTED &&
            mgmt->u.action.category != WLAN_CATEGORY_SPECTRUM_MGMT)
                return RX_DROP_U_ACTION_UNKNOWN_SRC;

        switch (mgmt->u.action.category) {
        case WLAN_CATEGORY_HT:
                /* reject HT action frames from stations not supporting HT */
                if (!rx->link_sta->pub->ht_cap.ht_supported)
                        goto invalid;

                if (sdata->vif.type != NL80211_IFTYPE_STATION &&
                    sdata->vif.type != NL80211_IFTYPE_MESH_POINT &&
                    sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
                    sdata->vif.type != NL80211_IFTYPE_AP &&
                    sdata->vif.type != NL80211_IFTYPE_ADHOC)
                        break;

                /* verify action & smps_control/chanwidth are present */
                if (len < IEEE80211_MIN_ACTION_SIZE + 2)
                        goto invalid;

                switch (mgmt->u.action.u.ht_smps.action) {
                case WLAN_HT_ACTION_SMPS: {
                        struct ieee80211_supported_band *sband;
                        enum ieee80211_smps_mode smps_mode;
                        struct sta_opmode_info sta_opmode = {};

                        /* SMPS updates are only acted upon on AP interfaces */
                        if (sdata->vif.type != NL80211_IFTYPE_AP &&
                            sdata->vif.type != NL80211_IFTYPE_AP_VLAN)
                                goto handled;

                        /* convert to HT capability */
                        switch (mgmt->u.action.u.ht_smps.smps_control) {
                        case WLAN_HT_SMPS_CONTROL_DISABLED:
                                smps_mode = IEEE80211_SMPS_OFF;
                                break;
                        case WLAN_HT_SMPS_CONTROL_STATIC:
                                smps_mode = IEEE80211_SMPS_STATIC;
                                break;
                        case WLAN_HT_SMPS_CONTROL_DYNAMIC:
                                smps_mode = IEEE80211_SMPS_DYNAMIC;
                                break;
                        default:
                                goto invalid;
                        }

                        /* if no change do nothing */
                        if (rx->link_sta->pub->smps_mode == smps_mode)
                                goto handled;
                        rx->link_sta->pub->smps_mode = smps_mode;
                        sta_opmode.smps_mode =
                                ieee80211_smps_mode_to_smps_mode(smps_mode);
                        sta_opmode.changed = STA_OPMODE_SMPS_MODE_CHANGED;

                        sband = rx->local->hw.wiphy->bands[status->band];

                        /* tell rate control and cfg80211 about the new mode */
                        rate_control_rate_update(local, sband, rx->sta, 0,
                                                 IEEE80211_RC_SMPS_CHANGED);
                        cfg80211_sta_opmode_change_notify(sdata->dev,
                                                          rx->sta->addr,
                                                          &sta_opmode,
                                                          GFP_ATOMIC);
                        goto handled;
                }
                case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: {
                        struct ieee80211_supported_band *sband;
                        u8 chanwidth = mgmt->u.action.u.ht_notify_cw.chanwidth;
                        enum ieee80211_sta_rx_bandwidth max_bw, new_bw;
                        struct sta_opmode_info sta_opmode = {};

                        /* If it doesn't support 40 MHz it can't change ... */
                        if (!(rx->link_sta->pub->ht_cap.cap &
                                        IEEE80211_HT_CAP_SUP_WIDTH_20_40))
                                goto handled;

                        if (chanwidth == IEEE80211_HT_CHANWIDTH_20MHZ)
                                max_bw = IEEE80211_STA_RX_BW_20;
                        else
                                max_bw = ieee80211_sta_cap_rx_bw(rx->link_sta);

                        /* set cur_max_bandwidth and recalc sta bw */
                        rx->link_sta->cur_max_bandwidth = max_bw;
                        new_bw = ieee80211_sta_cur_vht_bw(rx->link_sta);

                        /* nothing to report if the resulting bandwidth is unchanged */
                        if (rx->link_sta->pub->bandwidth == new_bw)
                                goto handled;

                        rx->link_sta->pub->bandwidth = new_bw;
                        sband = rx->local->hw.wiphy->bands[status->band];
                        sta_opmode.bw =
                                ieee80211_sta_rx_bw_to_chan_width(rx->link_sta);
                        sta_opmode.changed = STA_OPMODE_MAX_BW_CHANGED;

                        rate_control_rate_update(local, sband, rx->sta, 0,
                                                 IEEE80211_RC_BW_CHANGED);
                        cfg80211_sta_opmode_change_notify(sdata->dev,
                                                          rx->sta->addr,
                                                          &sta_opmode,
                                                          GFP_ATOMIC);
                        goto handled;
                }
                default:
                        goto invalid;
                }

                break;
        case WLAN_CATEGORY_PUBLIC:
                if (len < IEEE80211_MIN_ACTION_SIZE + 1)
                        goto invalid;
                /*
                 * Only an extended channel switch announcement received on
                 * a station interface, from a known station and carrying
                 * our BSSID, is queued; everything else falls through to
                 * RX_CONTINUE.
                 */
                if (sdata->vif.type != NL80211_IFTYPE_STATION)
                        break;
                if (!rx->sta)
                        break;
                if (!ether_addr_equal(mgmt->bssid, sdata->deflink.u.mgd.bssid))
                        break;
                if (mgmt->u.action.u.ext_chan_switch.action_code !=
                                WLAN_PUB_ACTION_EXT_CHANSW_ANN)
                        break;
                if (len < offsetof(struct ieee80211_mgmt,
                                   u.action.u.ext_chan_switch.variable))
                        goto invalid;
                goto queue;
        case WLAN_CATEGORY_VHT:
                if (sdata->vif.type != NL80211_IFTYPE_STATION &&
                    sdata->vif.type != NL80211_IFTYPE_MESH_POINT &&
                    sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
                    sdata->vif.type != NL80211_IFTYPE_AP &&
                    sdata->vif.type != NL80211_IFTYPE_ADHOC)
                        break;

                /* verify action code is present */
                if (len < IEEE80211_MIN_ACTION_SIZE + 1)
                        goto invalid;

                switch (mgmt->u.action.u.vht_opmode_notif.action_code) {
                case WLAN_VHT_ACTION_OPMODE_NOTIF: {
                        /* verify opmode is present */
                        if (len < IEEE80211_MIN_ACTION_SIZE + 2)
                                goto invalid;
                        goto queue;
                }
                case WLAN_VHT_ACTION_GROUPID_MGMT: {
                        /* the frame must carry 25 bytes after the action header */
                        if (len < IEEE80211_MIN_ACTION_SIZE + 25)
                                goto invalid;
                        goto queue;
                }
                default:
                        break;
                }
                break;
        case WLAN_CATEGORY_BACK:
                if (sdata->vif.type != NL80211_IFTYPE_STATION &&
                    sdata->vif.type != NL80211_IFTYPE_MESH_POINT &&
                    sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
                    sdata->vif.type != NL80211_IFTYPE_AP &&
                    sdata->vif.type != NL80211_IFTYPE_ADHOC)
                        break;

                /* verify action_code is present */
                if (len < IEEE80211_MIN_ACTION_SIZE + 1)
                        break;

                /* each block-ack frame type must be complete before it is queued */
                switch (mgmt->u.action.u.addba_req.action_code) {
                case WLAN_ACTION_ADDBA_REQ:
                        if (len < (IEEE80211_MIN_ACTION_SIZE +
                                   sizeof(mgmt->u.action.u.addba_req)))
                                goto invalid;
                        break;
                case WLAN_ACTION_ADDBA_RESP:
                        if (len < (IEEE80211_MIN_ACTION_SIZE +
                                   sizeof(mgmt->u.action.u.addba_resp)))
                                goto invalid;
                        break;
                case WLAN_ACTION_DELBA:
                        if (len < (IEEE80211_MIN_ACTION_SIZE +
                                   sizeof(mgmt->u.action.u.delba)))
                                goto invalid;
                        break;
                default:
                        goto invalid;
                }

                goto queue;
        case WLAN_CATEGORY_SPECTRUM_MGMT:
                /* verify action_code is present */
                if (len < IEEE80211_MIN_ACTION_SIZE + 1)
                        break;

                switch (mgmt->u.action.u.measurement.action_code) {
                case WLAN_ACTION_SPCT_MSR_REQ:
                        /* measurement requests are only processed on 5 GHz */
                        if (status->band != NL80211_BAND_5GHZ)
                                break;

                        if (len < (IEEE80211_MIN_ACTION_SIZE +
                                   sizeof(mgmt->u.action.u.measurement)))
                                break;

                        if (sdata->vif.type != NL80211_IFTYPE_STATION)
                                break;

                        ieee80211_process_measurement_req(sdata, mgmt, len);
                        goto handled;
                case WLAN_ACTION_SPCT_CHL_SWITCH: {
                        u8 *bssid;
                        if (len < (IEEE80211_MIN_ACTION_SIZE +
                                   sizeof(mgmt->u.action.u.chan_switch)))
                                break;

                        if (sdata->vif.type != NL80211_IFTYPE_STATION &&
                            sdata->vif.type != NL80211_IFTYPE_ADHOC &&
                            sdata->vif.type != NL80211_IFTYPE_MESH_POINT)
                                break;

                        /* pick the BSSID the frame has to match per interface type */
                        if (sdata->vif.type == NL80211_IFTYPE_STATION)
                                bssid = sdata->deflink.u.mgd.bssid;
                        else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
                                bssid = sdata->u.ibss.bssid;
                        else if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
                                bssid = mgmt->sa;
                        else
                                break;

                        if (!ether_addr_equal(mgmt->bssid, bssid))
                                break;

                        goto queue;
                        }
                }
                break;
        case WLAN_CATEGORY_SELF_PROTECTED:
                if (len < (IEEE80211_MIN_ACTION_SIZE +
                           sizeof(mgmt->u.action.u.self_prot.action_code)))
                        break;

                switch (mgmt->u.action.u.self_prot.action_code) {
                case WLAN_SP_MESH_PEERING_OPEN:
                case WLAN_SP_MESH_PEERING_CLOSE:
                case WLAN_SP_MESH_PEERING_CONFIRM:
                        if (!ieee80211_vif_is_mesh(&sdata->vif))
                                goto invalid;
                        if (sdata->u.mesh.user_mpm)
                                /* userspace handles this frame */
                                break;
                        goto queue;
                case WLAN_SP_MGK_INFORM:
                case WLAN_SP_MGK_ACK:
                        if (!ieee80211_vif_is_mesh(&sdata->vif))
                                goto invalid;
                        break;
                }
                break;
        case WLAN_CATEGORY_MESH_ACTION:
                if (len < (IEEE80211_MIN_ACTION_SIZE +
                           sizeof(mgmt->u.action.u.mesh_action.action_code)))
                        break;

                if (!ieee80211_vif_is_mesh(&sdata->vif))
                        break;
                /* path selection frames are only queued when HWMP is in use */
                if (mesh_action_is_path_sel(mgmt) &&
                    !mesh_path_sel_is_hwmp(sdata))
                        break;
                goto queue;
        case WLAN_CATEGORY_S1G:
                if (len < offsetofend(typeof(*mgmt),
                                      u.action.u.s1g.action_code))
                        break;

                switch (mgmt->u.action.u.s1g.action_code) {
                case WLAN_S1G_TWT_SETUP:
                case WLAN_S1G_TWT_TEARDOWN:
                        /* queue only if the TWT frame passes validation */
                        if (ieee80211_process_rx_twt_action(rx))
                                goto queue;
                        break;
                default:
                        break;
                }
                break;
        case WLAN_CATEGORY_PROTECTED_EHT:
                if (len < offsetofend(typeof(*mgmt),
                                      u.action.u.ttlm_req.action_code))
                        break;

                switch (mgmt->u.action.u.ttlm_req.action_code) {
                case WLAN_PROTECTED_EHT_ACTION_TTLM_REQ:
                        if (sdata->vif.type != NL80211_IFTYPE_STATION)
                                break;

                        if (len < offsetofend(typeof(*mgmt),
                                              u.action.u.ttlm_req))
                                goto invalid;
                        goto queue;
                case WLAN_PROTECTED_EHT_ACTION_TTLM_RES:
                        if (sdata->vif.type != NL80211_IFTYPE_STATION)
                                break;

                        if (len < offsetofend(typeof(*mgmt),
                                              u.action.u.ttlm_res))
                                goto invalid;
                        goto queue;
                default:
                        break;
                }
                break;
        }

        /* not consumed here: let the following handlers look at the frame */
        return RX_CONTINUE;

 invalid:
        status->rx_flags |= IEEE80211_RX_MALFORMED_ACTION_FRM;
        /* will return in the next handlers */
        return RX_CONTINUE;

 handled:
        /* processed inline: account the frame to the station and free it */
        if (rx->sta)
                rx->link_sta->rx_stats.packets++;
        dev_kfree_skb(rx->skb);
        return RX_QUEUED;

 queue:
        /* the skb is handed over, so it must not be touched afterwards */
        ieee80211_queue_skb_to_iface(sdata, rx->link_id, rx->sta, rx->skb);
        return RX_QUEUED;
}

/*
 * Offer a management frame to userspace through cfg80211_rx_mgmt_ext().
 *
 * Returns RX_QUEUED (after freeing the skb) when cfg80211 reports that
 * the frame was taken, RX_CONTINUE otherwise. Frames already flagged as
 * malformed action frames are never offered.
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx)
{
        struct sk_buff *skb = rx->skb;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        struct cfg80211_rx_info info = {
                .freq = ieee80211_rx_status_to_khz(status),
                .buf = skb->data,
                .len = skb->len,
                .link_id = rx->link_id,
                .have_link_id = rx->link_id >= 0,
        };

        /* skip known-bad action frames and return them in the next handler */
        if (status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM)
                return RX_CONTINUE;

        /*
         * Getting here means the kernel doesn't know how to handle
         * it, but maybe userspace does ... include returned frames
         * so userspace can register for those to know whether ones
         * it transmitted were processed or returned.
         */

        /* only report a signal value when it is in dBm and actually valid */
        if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM) &&
            !(status->flag & RX_FLAG_NO_SIGNAL_VAL))
                info.sig_dbm = status->signal;

        /* timing measurement and FTM frames also carry hardware timestamps */
        if (ieee80211_is_timing_measurement(skb) ||
            ieee80211_is_ftm(skb)) {
                info.rx_tstamp = ktime_to_ns(skb_hwtstamps(skb)->hwtstamp);
                info.ack_tstamp = ktime_to_ns(status->ack_tx_hwtstamp);
        }

        if (!cfg80211_rx_mgmt_ext(&rx->sdata->wdev, &info))
                return RX_CONTINUE;

        if (rx->sta)
                rx->link_sta->rx_stats.packets++;
        dev_kfree_skb(skb);

        return RX_QUEUED;
}

/*
 * Handle the action frames that are processed only after userspace had
 * its chance: currently just SA Query requests on station interfaces.
 *
 * Returns RX_QUEUED (after freeing the skb) when the request was
 * processed, RX_CONTINUE for everything else.
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_action_post_userspace(struct ieee80211_rx_data *rx)
{
        struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
        struct ieee80211_sub_if_data *sdata = rx->sdata;
        int len = rx->skb->len;

        if (!ieee80211_is_action(mgmt->frame_control) ||
            mgmt->u.action.category != WLAN_CATEGORY_SA_QUERY)
                return RX_CONTINUE;

        /* the complete SA Query body has to be present */
        if (len < (IEEE80211_MIN_ACTION_SIZE +
                   sizeof(mgmt->u.action.u.sa_query)))
                return RX_CONTINUE;

        if (mgmt->u.action.u.sa_query.action != WLAN_ACTION_SA_QUERY_REQUEST ||
            sdata->vif.type != NL80211_IFTYPE_STATION)
                return RX_CONTINUE;

        ieee80211_process_sa_query_req(sdata, mgmt, len);

        /* consumed here: account the frame to the station and free it */
        if (rx->sta)
                rx->link_sta->rx_stats.packets++;
        dev_kfree_skb(rx->skb);

        return RX_QUEUED;
}

/*
 * Return an action frame nobody handled to its sender.
 *
 * A copy of the frame is sent back with the 0x80 bit set in the action
 * category and the addresses swapped; the received frame itself is
 * always dropped with one of the reasons returned below. Non-action
 * frames continue down the handler chain.
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx)
{
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
        struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
        struct ieee80211_sub_if_data *sdata = rx->sdata;
        struct ieee80211_local *local = rx->local;
        struct ieee80211_mgmt *reply;
        struct sk_buff *nskb;
        bool malformed, ap_iface;

        if (!ieee80211_is_action(mgmt->frame_control))
                return RX_CONTINUE;

        /*
         * For AP mode, hostapd is responsible for handling any action
         * frames that we didn't handle, including returning unknown
         * ones. For all other modes we will return them to the sender,
         * setting the 0x80 bit in the action category, as required by
         * 802.11-2012 9.24.4.
         * Newer versions of hostapd shall also use the management frame
         * registration mechanisms, but older ones still use cooked
         * monitor interfaces so push all frames there.
         */
        malformed = !!(status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM);
        ap_iface = sdata->vif.type == NL80211_IFTYPE_AP ||
                   sdata->vif.type == NL80211_IFTYPE_AP_VLAN;
        if (!malformed && ap_iface)
                return RX_DROP_MONITOR;

        /* group-addressed frames are never answered */
        if (is_multicast_ether_addr(mgmt->da))
                return RX_DROP_MONITOR;

        /* do not return rejected action frames */
        if (mgmt->u.action.category & 0x80)
                return RX_DROP_U_REJECTED_ACTION_RESPONSE;

        nskb = skb_copy_expand(rx->skb, local->hw.extra_tx_headroom, 0,
                               GFP_ATOMIC);
        if (!nskb)
                return RX_DROP_U_UNKNOWN_ACTION_REJECTED;

        /* turn the copy into the rejection: mark it and swap the addresses */
        reply = (void *)nskb->data;
        reply->u.action.category |= 0x80;
        memcpy(reply->da, reply->sa, ETH_ALEN);
        memcpy(reply->sa, sdata->vif.addr, ETH_ALEN);

        /* start from a clean control buffer for the TX path */
        memset(nskb->cb, 0, sizeof(nskb->cb));

        if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE) {
                struct ieee80211_tx_info *info = IEEE80211_SKB_CB(nskb);

                info->flags = IEEE80211_TX_CTL_TX_OFFCHAN |
                              IEEE80211_TX_INTFL_OFFCHAN_TX_OK |
                              IEEE80211_TX_CTL_NO_CCK_RATE;
                if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL))
                        info->hw_queue = local->hw.offchannel_tx_hw_queue;
        }

        __ieee80211_tx_skb_tid_band(sdata, nskb, 7, -1, status->band);

        return RX_DROP_U_UNKNOWN_ACTION_REJECTED;
}

/*
 * RX handler for extension frames: on station interfaces they are queued
 * to the interface, on all other interface types they are dropped.
 * Non-extension frames continue down the handler chain.
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_ext(struct ieee80211_rx_data *rx)
{
        struct ieee80211_hdr *hdr = (void *)rx->skb->data;
        struct ieee80211_sub_if_data *sdata = rx->sdata;

        if (!ieee80211_is_ext(hdr->frame_control))
                return RX_CONTINUE;

        if (sdata->vif.type == NL80211_IFTYPE_STATION) {
                /* for now only beacons are ext, so queue them */
                ieee80211_queue_skb_to_iface(sdata, rx->link_id, rx->sta,
                                             rx->skb);
                return RX_QUEUED;
        }

        return RX_DROP_MONITOR;
}

/*
 * Final management frame handler: decide per subtype and interface type
 * whether the frame is queued to the interface (RX_QUEUED) or dropped
 * (RX_DROP_MONITOR).
 */
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
{
        struct ieee80211_mgmt *mgmt = (void *)rx->skb->data;
        struct ieee80211_sub_if_data *sdata = rx->sdata;
        __le16 stype = mgmt->frame_control & cpu_to_le16(IEEE80211_FCTL_STYPE);

        /* only mesh, IBSS, OCB and station interfaces take frames here */
        if (!(ieee80211_vif_is_mesh(&sdata->vif) ||
              sdata->vif.type == NL80211_IFTYPE_ADHOC ||
              sdata->vif.type == NL80211_IFTYPE_OCB ||
              sdata->vif.type == NL80211_IFTYPE_STATION))
                return RX_DROP_MONITOR;

        switch (stype) {
        case cpu_to_le16(IEEE80211_STYPE_AUTH):
        case cpu_to_le16(IEEE80211_STYPE_BEACON):
        case cpu_to_le16(IEEE80211_STYPE_PROBE_RESP):
                /* process for all: mesh, mlme, ibss */
                break;
        case cpu_to_le16(IEEE80211_STYPE_DEAUTH):
        case cpu_to_le16(IEEE80211_STYPE_ASSOC_RESP):
        case cpu_to_le16(IEEE80211_STYPE_REASSOC_RESP):
        case cpu_to_le16(IEEE80211_STYPE_DISASSOC):
                /* group addressed is only acceptable as broadcast */
                if (is_multicast_ether_addr(mgmt->da) &&
                    !is_broadcast_ether_addr(mgmt->da))
                        return RX_DROP_MONITOR;

                /* all four are processed for station ... */
                if (sdata->vif.type == NL80211_IFTYPE_STATION)
                        break;

                /* ... and deauth additionally for IBSS */
                if (stype == cpu_to_le16(IEEE80211_STYPE_DEAUTH) &&
                    sdata->vif.type == NL80211_IFTYPE_ADHOC)
                        break;

                return RX_DROP_MONITOR;
        case cpu_to_le16(IEEE80211_STYPE_PROBE_REQ):
                /* process only for ibss and mesh */
                if (sdata->vif.type == NL80211_IFTYPE_ADHOC ||
                    sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
                        break;

                return RX_DROP_MONITOR;
        default:
                return RX_DROP_MONITOR;
        }

        ieee80211_queue_skb_to_iface(sdata, rx->link_id, rx->sta, rx->skb);

        return RX_QUEUED;
}

/*
 * Deliver a frame to every running cooked monitor interface, or free it.
 *
 * The skb is always consumed: either the original is passed to the last
 * matching interface (earlier ones receive clones), or it is freed with
 * @reason as the drop reason.
 */
static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
                                        struct ieee80211_rate *rate,
                                        ieee80211_rx_result reason)
{
        struct ieee80211_local *local = rx->local;
        struct sk_buff *skb = rx->skb;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        struct net_device *last_dev = NULL;
        struct ieee80211_sub_if_data *sdata;
        struct sk_buff *clone;
        int needed_headroom;

        /*
         * If cooked monitor has been processed already, then
         * don't do it again. If not, set the flag.
         */
        if (rx->flags & IEEE80211_RX_CMNTR)
                goto out_free_skb;
        rx->flags |= IEEE80211_RX_CMNTR;

        /* If there are no cooked monitor interfaces, just free the SKB */
        if (!local->cooked_mntrs)
                goto out_free_skb;

        /* room for the radiotap header based on driver features */
        needed_headroom = ieee80211_rx_radiotap_hdrlen(local, status, skb);

        /* grow the headroom if needed; give up if that fails */
        if (skb_headroom(skb) < needed_headroom &&
            pskb_expand_head(skb, needed_headroom, 0, GFP_ATOMIC))
                goto out_free_skb;

        /* prepend radiotap information */
        ieee80211_add_rx_radiotap_header(local, skb, rate, needed_headroom,
                                         false);

        skb_reset_mac_header(skb);
        skb->ip_summed = CHECKSUM_UNNECESSARY;
        skb->pkt_type = PACKET_OTHERHOST;
        skb->protocol = htons(ETH_P_802_2);

        list_for_each_entry_rcu(sdata, &local->interfaces, list) {
                /* only running monitor interfaces with the cook flag count */
                if (!ieee80211_sdata_running(sdata) ||
                    sdata->vif.type != NL80211_IFTYPE_MONITOR ||
                    !(sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES))
                        continue;

                /*
                 * Delivery runs one interface behind: the previous match
                 * gets a clone now, so the last match can take the
                 * original skb after the loop.
                 */
                if (last_dev) {
                        clone = skb_clone(skb, GFP_ATOMIC);
                        if (clone) {
                                clone->dev = last_dev;
                                netif_receive_skb(clone);
                        }
                }

                last_dev = sdata->dev;
                dev_sw_netstats_rx_add(sdata->dev, skb->len);
        }

        if (!last_dev)
                goto out_free_skb;

        skb->dev = last_dev;
        netif_receive_skb(skb);
        return;

 out_free_skb:
        kfree_skb_reason(skb, (__force u32)reason);
}

/*
 * Act on the final result of the RX handler chain for rx->skb.
 *
 * Queued frames only bump a debug counter. Anything else is either freed
 * right away (drop reasons of the "unusable" subsystem) or passed to the
 * cooked monitor path, which consumes the skb.
 */
static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx,
                                         ieee80211_rx_result res)
{
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
        struct ieee80211_rate *rate = NULL;
        u32 subsys;

        if (res == RX_QUEUED) {
                I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued);
                return;
        }

        /* every result other than "continue" counts as a drop */
        if (res != RX_CONTINUE) {
                I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
                if (rx->sta)
                        rx->link_sta->rx_stats.dropped++;
        }

        subsys = u32_get_bits((__force u32)res, SKB_DROP_REASON_SUBSYS_MASK);
        if (subsys == SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE) {
                kfree_skb_reason(rx->skb, (__force u32)res);
                return;
        }

        /* a rate entry is only looked up for legacy-encoded frames */
        if (status->encoding == RX_ENC_LEGACY) {
                struct ieee80211_supported_band *sband;

                sband = rx->local->hw.wiphy->bands[status->band];
                rate = &sband->bitrates[status->rate_idx];
        }

        ieee80211_rx_cooked_monitor(rx, rate, res);
}

/*
 * Run every frame on @frames through the RX handler chain.
 *
 * Handlers run in the fixed order below; the first one that returns
 * something other than RX_CONTINUE ends the chain for that frame and its
 * result is passed to ieee80211_rx_handlers_result().
 */
static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx,
                                  struct sk_buff_head *frames)
{
        ieee80211_rx_result res = RX_DROP_MONITOR;
        struct sk_buff *skb;

#define CALL_RXH(rxh)                           \
        do {                                    \
                res = rxh(rx);                  \
                if (res != RX_CONTINUE)         \
                        goto rxh_next;          \
        } while (0)

        /* Lock here to avoid hitting all of the data used in the RX
         * path (e.g. key data, station data, ...) concurrently when
         * a frame is released from the reorder buffer due to timeout
         * from the timer, potentially concurrently with RX from the
         * driver.
         */
        spin_lock_bh(&rx->local->rx_path_lock);

        for (skb = __skb_dequeue(frames); skb; skb = __skb_dequeue(frames)) {
                /*
                 * all the other fields are valid across frames
                 * that belong to an aMPDU since they are on the
                 * same TID from the same station
                 */
                rx->skb = skb;

                if (WARN_ON_ONCE(!rx->link))
                        goto rxh_next;

                CALL_RXH(ieee80211_rx_h_check_more_data);
                CALL_RXH(ieee80211_rx_h_uapsd_and_pspoll);
                CALL_RXH(ieee80211_rx_h_sta_process);
                CALL_RXH(ieee80211_rx_h_decrypt);
                CALL_RXH(ieee80211_rx_h_defragment);
                CALL_RXH(ieee80211_rx_h_michael_mic_verify);
                /* must be after MMIC verify so header is counted in MPDU mic */
                CALL_RXH(ieee80211_rx_h_amsdu);
                CALL_RXH(ieee80211_rx_h_data);

                /* special treatment -- needs the queue */
                res = ieee80211_rx_h_ctrl(rx, frames);
                if (res != RX_CONTINUE)
                        goto rxh_next;

                CALL_RXH(ieee80211_rx_h_mgmt_check);
                CALL_RXH(ieee80211_rx_h_action);
                CALL_RXH(ieee80211_rx_h_userspace_mgmt);
                CALL_RXH(ieee80211_rx_h_action_post_userspace);
                CALL_RXH(ieee80211_rx_h_action_return);
                CALL_RXH(ieee80211_rx_h_ext);
                CALL_RXH(ieee80211_rx_h_mgmt);

 rxh_next:
                ieee80211_rx_handlers_result(rx, res);
        }

#undef CALL_RXH

        spin_unlock_bh(&rx->local->rx_path_lock);
}

/*
 * Entry into the RX handler chain for a freshly received frame.
 *
 * The duplicate check and the generic check run first. If both let the
 * frame continue, it goes through A-MPDU reordering and whatever the
 * reorder step releases is run through the remaining handlers.
 * Otherwise the frame is finished with the result of the failing check.
 */
static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx)
{
        struct sk_buff_head reorder_release;
        ieee80211_rx_result res;

        __skb_queue_head_init(&reorder_release);

        res = ieee80211_rx_h_check_dup(rx);
        if (res == RX_CONTINUE)
                res = ieee80211_rx_h_check(rx);

        if (res != RX_CONTINUE) {
                ieee80211_rx_handlers_result(rx, res);
                return;
        }

        ieee80211_rx_reorder_ampdu(rx, &reorder_release);

        ieee80211_rx_handlers(rx, &reorder_release);
}

/* Tell whether @link_id is set in the station's valid_links bitmap. */
static bool
ieee80211_rx_is_valid_sta_link_id(struct ieee80211_sta *sta, u8 link_id)
{
        return (sta->valid_links & BIT(link_id)) != 0;
}

/*
 * Point @rx at the link (and, when a station is set, the station link)
 * identified by @link_id.
 *
 * Returns true when the interface link exists and, if rx->sta is set,
 * the link is valid for that station and its link station exists.
 */
static bool ieee80211_rx_data_set_link(struct ieee80211_rx_data *rx,
                                       u8 link_id)
{
        rx->link_id = link_id;
        rx->link = rcu_dereference(rx->sdata->link[link_id]);

        if (rx->sta) {
                if (!ieee80211_rx_is_valid_sta_link_id(&rx->sta->sta, link_id))
                        return false;

                rx->link_sta = rcu_dereference(rx->sta->link[link_id]);
                if (!rx->link_sta)
                        return false;
        }

        return rx->link != NULL;
}

/*
 * Set the station of @rx and select the matching link.
 *
 * With a station, rx->local is taken from it, rx->sdata is filled in if
 * it was still unset, and rx->link_sta starts out as the station's
 * default link. A negative @link_id selects the interface's default
 * link; otherwise the per-link lookup decides the return value.
 */
static bool ieee80211_rx_data_set_sta(struct ieee80211_rx_data *rx,
                                      struct sta_info *sta, int link_id)
{
        rx->sta = sta;
        rx->link_id = link_id;

        if (!sta) {
                rx->link_sta = NULL;
        } else {
                rx->local = sta->sdata->local;
                if (!rx->sdata)
                        rx->sdata = sta->sdata;
                rx->link_sta = &sta->deflink;
        }

        if (link_id >= 0)
                return ieee80211_rx_data_set_link(rx, link_id);

        rx->link = &rx->sdata->deflink;

        return true;
}

/*
 * Release frames from the reorder buffer of @sta / @tid and run them
 * through the RX handlers.
 *
 * This function makes calls into the RX path, therefore
 * it has to be invoked under RCU read lock.
 */
void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid)
{
        struct ieee80211_rx_data rx = {
                /* This is OK -- must be QoS data frame */
                .security_idx = tid,
                .seqno_idx = tid,
        };
        struct tid_ampdu_rx *tid_agg_rx;
        struct sk_buff_head frames;
        int link_id;

        /* FIXME: statistics won't be right with this */
        link_id = sta->sta.valid_links ? ffs(sta->sta.valid_links) - 1 : -1;

        if (!ieee80211_rx_data_set_sta(&rx, sta, link_id))
                return;

        tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
        if (!tid_agg_rx)
                return;

        __skb_queue_head_init(&frames);

        spin_lock(&tid_agg_rx->reorder_lock);
        ieee80211_sta_reorder_release(sta->sdata, tid_agg_rx, &frames);
        spin_unlock(&tid_agg_rx->reorder_lock);

        /* let the driver know that frames were released */
        if (!skb_queue_empty(&frames)) {
                struct ieee80211_event event = {
                        .type = BA_FRAME_TIMEOUT,
                        .u.ba.tid = tid,
                        .u.ba.sta = &sta->sta,
                };

                drv_event_callback(rx.local, rx.sdata, &event);
        }

        ieee80211_rx_handlers(&rx, &frames);
}

/*
 * Update the reorder buffer of the RX aggregation session @pubsta/@tid with
 * information about filtered frames, then release and process whatever
 * frames this unblocks.
 *
 * @pubsta: station the session belongs to (must not be NULL)
 * @tid: TID of the session (must be below IEEE80211_NUM_TIDS)
 * @ssn: starting sequence number the report refers to
 * @filtered: bitmap; bit i describes sequence number @ssn + i
 * @received_mpdus: when at least IEEE80211_SN_MODULO / 2, the whole reorder
 *	buffer is released and the head sequence number is reset to @ssn
 *
 * Takes the RCU read lock and the session's reorder lock itself. Released
 * frames are handed to ieee80211_rx_handlers() after the reorder lock has
 * been dropped.
 */
void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid,
                                          u16 ssn, u64 filtered,
                                          u16 received_mpdus)
{
        struct ieee80211_local *local;
        struct sta_info *sta;
        struct tid_ampdu_rx *tid_agg_rx;
        struct sk_buff_head frames;
        struct ieee80211_rx_data rx = {
                /* This is OK -- must be QoS data frame */
                .security_idx = tid,
                .seqno_idx = tid,
        };
        int i, diff;

        if (WARN_ON(!pubsta || tid >= IEEE80211_NUM_TIDS))
                return;

        __skb_queue_head_init(&frames);

        sta = container_of(pubsta, struct sta_info, sta);

        local = sta->sdata->local;
        /*
         * The filtered bitmap is 64 bits wide, so larger aggregation
         * windows cannot be represented. This only warns; processing
         * continues regardless.
         * NOTE(review): if buf_size can exceed 64 here, the BIT_ULL()
         * shifts in the bitmap loop below would be out of range - confirm.
         */
        WARN_ONCE(local->hw.max_rx_aggregation_subframes > 64,
                  "RX BA marker can't support max_rx_aggregation_subframes %u > 64\n",
                  local->hw.max_rx_aggregation_subframes);

        /* link_id -1: use the station's/interface's default link */
        if (!ieee80211_rx_data_set_sta(&rx, sta, -1))
                return;

        rcu_read_lock();
        tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
        if (!tid_agg_rx)
                goto out;

        spin_lock_bh(&tid_agg_rx->reorder_lock);

        if (received_mpdus >= IEEE80211_SN_MODULO >> 1) {
                int release;

                /* release all frames in the reorder buffer */
                release = (tid_agg_rx->head_seq_num + tid_agg_rx->buf_size) %
                           IEEE80211_SN_MODULO;
                ieee80211_release_reorder_frames(sta->sdata, tid_agg_rx,
                                                 release, &frames);
                /* update ssn to match received ssn */
                tid_agg_rx->head_seq_num = ssn;
        } else {
                ieee80211_release_reorder_frames(sta->sdata, tid_agg_rx, ssn,
                                                 &frames);
        }

        /* handle the case that received ssn is behind the mac ssn.
         * it can be tid_agg_rx->buf_size behind and still be valid */
        diff = (tid_agg_rx->head_seq_num - ssn) & IEEE80211_SN_MASK;
        if (diff >= tid_agg_rx->buf_size) {
                /* report does not overlap the window: forget all marks */
                tid_agg_rx->reorder_buf_filtered = 0;
                goto release;
        }
        /* drop the bits that lie before the current head of the window */
        filtered = filtered >> diff;
        ssn += diff;

        /* update bitmap */
        for (i = 0; i < tid_agg_rx->buf_size; i++) {
                /* slot of sequence number ssn + i inside the ring buffer */
                int index = (ssn + i) % tid_agg_rx->buf_size;

                tid_agg_rx->reorder_buf_filtered &= ~BIT_ULL(index);
                if (filtered & BIT_ULL(i))
                        tid_agg_rx->reorder_buf_filtered |= BIT_ULL(index);
        }

        /* now process also frames that the filter marking released */
        ieee80211_sta_reorder_release(sta->sdata, tid_agg_rx, &frames);

release:
        spin_unlock_bh(&tid_agg_rx->reorder_lock);

        /* run the handlers outside the reorder lock, still under RCU */
        ieee80211_rx_handlers(&rx, &frames);

 out:
        rcu_read_unlock();
}
EXPORT_SYMBOL(ieee80211_mark_rx_ba_filtered_frames);

/* main receive path */

/* Non-zero when @raddr is either identical to @addr or the broadcast address. */
static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr)
{
        if (ether_addr_equal(raddr, addr))
                return 1;

        return is_broadcast_ether_addr(raddr) ? 1 : 0;
}

/*
 * Decide whether the frame in rx->skb should be processed on the interface
 * rx->sdata, based on the interface type, the frame's BSSID (as extracted
 * by ieee80211_get_bssid()) and its address fields.
 *
 * Besides the yes/no answer this has side effects:
 *  - calls to ieee80211_is_our_addr() that are handed &rx->link_id may
 *    update rx->link_id;
 *  - on IBSS and OCB interfaces, an accepted frame without a known station
 *    is reported through ieee80211_ibss_rx_no_sta() /
 *    ieee80211_ocb_rx_no_sta().
 *
 * Returns true if the frame is accepted. Unknown interface types trigger a
 * one-time warning and are rejected.
 */
static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
{
        struct ieee80211_sub_if_data *sdata = rx->sdata;
        struct sk_buff *skb = rx->skb;
        struct ieee80211_hdr *hdr = (void *)skb->data;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        u8 *bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type);
        /* S1G beacons are treated like multicast frames here */
        bool multicast = is_multicast_ether_addr(hdr->addr1) ||
                         ieee80211_is_s1g_beacon(hdr->frame_control);

        switch (sdata->vif.type) {
        case NL80211_IFTYPE_STATION:
                /* a missing BSSID is only tolerated in 4-address mode */
                if (!bssid && !sdata->u.mgd.use_4addr)
                        return false;
                /* robust management frames require a known station */
                if (ieee80211_is_first_frag(hdr->seq_ctrl) &&
                    ieee80211_is_robust_mgmt_frame(skb) && !rx->sta)
                        return false;
                if (multicast)
                        return true;
                return ieee80211_is_our_addr(sdata, hdr->addr1, &rx->link_id);
        case NL80211_IFTYPE_ADHOC:
                if (!bssid)
                        return false;
                /* reject our own address, the BSSID or an invalid one as TA */
                if (ether_addr_equal(sdata->vif.addr, hdr->addr2) ||
                    ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2) ||
                    !is_valid_ether_addr(hdr->addr2))
                        return false;
                /* beacons are accepted regardless of the BSSID */
                if (ieee80211_is_beacon(hdr->frame_control))
                        return true;
                if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid))
                        return false;
                if (!multicast &&
                    !ether_addr_equal(sdata->vif.addr, hdr->addr1))
                        return false;
                if (!rx->sta) {
                        int rate_idx;
                        if (status->encoding != RX_ENC_LEGACY)
                                rate_idx = 0; /* TODO: HT/VHT rates */
                        else
                                rate_idx = status->rate_idx;
                        ieee80211_ibss_rx_no_sta(sdata, bssid, hdr->addr2,
                                                 BIT(rate_idx));
                }
                return true;
        case NL80211_IFTYPE_OCB:
                if (!bssid)
                        return false;
                /* only frames carrying data are accepted */
                if (!ieee80211_is_data_present(hdr->frame_control))
                        return false;
                /* the BSSID has to be the broadcast (wildcard) address */
                if (!is_broadcast_ether_addr(bssid))
                        return false;
                if (!multicast &&
                    !ether_addr_equal(sdata->dev->dev_addr, hdr->addr1))
                        return false;
                if (!rx->sta) {
                        int rate_idx;
                        if (status->encoding != RX_ENC_LEGACY)
                                rate_idx = 0; /* TODO: HT rates */
                        else
                                rate_idx = status->rate_idx;
                        ieee80211_ocb_rx_no_sta(sdata, bssid, hdr->addr2,
                                                BIT(rate_idx));
                }
                return true;
        case NL80211_IFTYPE_MESH_POINT:
                /* never accept frames transmitted with our own address */
                if (ether_addr_equal(sdata->vif.addr, hdr->addr2))
                        return false;
                if (multicast)
                        return true;
                return ether_addr_equal(sdata->vif.addr, hdr->addr1);
        case NL80211_IFTYPE_AP_VLAN:
        case NL80211_IFTYPE_AP:
                /* no BSSID available: decide on the receiver address alone */
                if (!bssid)
                        return ieee80211_is_our_addr(sdata, hdr->addr1,
                                                     &rx->link_id);

                if (!is_broadcast_ether_addr(bssid) &&
                    !ieee80211_is_our_addr(sdata, bssid, NULL)) {
                        /*
                         * Accept public action frames even when the
                         * BSSID doesn't match, this is used for P2P
                         * and location updates. Note that mac80211
                         * itself never looks at these frames.
                         */
                        if (!multicast &&
                            !ieee80211_is_our_addr(sdata, hdr->addr1,
                                                   &rx->link_id))
                                return false;
                        if (ieee80211_is_public_action(hdr, skb->len))
                                return true;
                        /* besides those, only beacons of other BSSes pass */
                        return ieee80211_is_beacon(hdr->frame_control);
                }

                if (!ieee80211_has_tods(hdr->frame_control)) {
                        /* ignore data frames to TDLS-peers */
                        if (ieee80211_is_data(hdr->frame_control))
                                return false;
                        /* ignore action frames to TDLS-peers */
                        if (ieee80211_is_action(hdr->frame_control) &&
                            !is_broadcast_ether_addr(bssid) &&
                            !ether_addr_equal(bssid, hdr->addr1))
                                return false;
                }

                /*
                 * 802.11-2016 Table 9-26 says that for data frames, A1 must be
                 * the BSSID - we've checked that already but may have accepted
                 * the wildcard (ff:ff:ff:ff:ff:ff).
                 *
                 * It also says:
                 *        The BSSID of the Data frame is determined as follows:
                 *        a) If the STA is contained within an AP or is associated
                 *           with an AP, the BSSID is the address currently in use
                 *           by the STA contained in the AP.
                 *
                 * So we should not accept data frames with an address that's
                 * multicast.
                 *
                 * Accepting it also opens a security problem because stations
                 * could encrypt it with the GTK and inject traffic that way.
                 */
                if (ieee80211_is_data(hdr->frame_control) && multicast)
                        return false;

                return true;
        case NL80211_IFTYPE_P2P_DEVICE:
                /* only discovery-type management frames are accepted */
                return ieee80211_is_public_action(hdr, skb->len) ||
                       ieee80211_is_probe_req(hdr->frame_control) ||
                       ieee80211_is_probe_resp(hdr->frame_control) ||
                       ieee80211_is_beacon(hdr->frame_control);
        case NL80211_IFTYPE_NAN:
                /* Currently no frames on NAN interface are allowed */
                return false;
        default:
                break;
        }

        /* interface type without RX rules: should not happen */
        WARN_ON_ONCE(1);
        return false;
}

/*
 * Re-evaluate whether @sta may use the fast-rx path and publish the result.
 *
 * A candidate struct ieee80211_fast_rx is built on the stack from the
 * interface type, hardware flags, authorization state and the station's
 * pairwise (or the interface's default unicast) key. If every condition is
 * met, a heap copy replaces sta->fast_rx; otherwise sta->fast_rx is set to
 * NULL. The previous object, if any, is freed after an RCU grace period.
 *
 * The function also keeps the WLAN_STA_DECAP_OFFLOAD station flag in sync
 * and notifies the driver through drv_sta_set_decap_offload() when that
 * flag changes.
 *
 * Allocates with GFP_KERNEL. If the allocation fails, sta->fast_rx ends up
 * NULL, exactly as if fast-rx were not possible.
 */
void ieee80211_check_fast_rx(struct sta_info *sta)
{
        struct ieee80211_sub_if_data *sdata = sta->sdata;
        struct ieee80211_local *local = sdata->local;
        struct ieee80211_key *key;
        struct ieee80211_fast_rx fastrx = {
                .dev = sdata->dev,
                .vif_type = sdata->vif.type,
                .control_port_protocol = sdata->control_port_protocol,
        }, *old, *new = NULL;
        u32 offload_flags;
        bool set_offload = false;
        bool assign = false;
        bool offload;

        /* use sparse to check that we don't return without updating */
        __acquire(check_fast_rx);

        BUILD_BUG_ON(sizeof(fastrx.rfc1042_hdr) != sizeof(rfc1042_header));
        BUILD_BUG_ON(sizeof(fastrx.rfc1042_hdr) != ETH_ALEN);
        ether_addr_copy(fastrx.rfc1042_hdr, rfc1042_header);
        ether_addr_copy(fastrx.vif_addr, sdata->vif.addr);

        fastrx.uses_rss = ieee80211_hw_check(&local->hw, USES_RSS);

        /* fast-rx doesn't do reordering */
        if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION) &&
            !ieee80211_hw_check(&local->hw, SUPPORTS_REORDERING_BUFFER))
                goto clear;

        /*
         * Per interface type: which header address fields hold DA and SA,
         * and which To-DS/From-DS bits a frame must carry to qualify.
         */
        switch (sdata->vif.type) {
        case NL80211_IFTYPE_STATION:
                if (sta->sta.tdls) {
                        fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1);
                        fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2);
                        fastrx.expected_ds_bits = 0;
                } else {
                        fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1);
                        fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr3);
                        fastrx.expected_ds_bits =
                                cpu_to_le16(IEEE80211_FCTL_FROMDS);
                }

                /* 4-address mode (non-TDLS): DA/SA move to addr3/addr4 */
                if (sdata->u.mgd.use_4addr && !sta->sta.tdls) {
                        fastrx.expected_ds_bits |=
                                cpu_to_le16(IEEE80211_FCTL_TODS);
                        fastrx.da_offs = offsetof(struct ieee80211_hdr, addr3);
                        fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr4);
                }

                if (!sdata->u.mgd.powersave)
                        break;

                /* software powersave is a huge mess, avoid all of it */
                if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK))
                        goto clear;
                if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) &&
                    !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS))
                        goto clear;
                break;
        case NL80211_IFTYPE_AP_VLAN:
        case NL80211_IFTYPE_AP:
                /* parallel-rx requires this, at least with calls to
                 * ieee80211_sta_ps_transition()
                 */
                if (!ieee80211_hw_check(&local->hw, AP_LINK_PS))
                        goto clear;
                fastrx.da_offs = offsetof(struct ieee80211_hdr, addr3);
                fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2);
                fastrx.expected_ds_bits = cpu_to_le16(IEEE80211_FCTL_TODS);

                /*
                 * Forward between stations unless bridging is disabled or
                 * this is an AP_VLAN with a station assigned (u.vlan.sta).
                 */
                fastrx.internal_forward =
                        !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
                        (sdata->vif.type != NL80211_IFTYPE_AP_VLAN ||
                         !sdata->u.vlan.sta);

                if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
                    sdata->u.vlan.sta) {
                        fastrx.expected_ds_bits |=
                                cpu_to_le16(IEEE80211_FCTL_FROMDS);
                        fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr4);
                        fastrx.internal_forward = 0;
                }

                break;
        case NL80211_IFTYPE_MESH_POINT:
                fastrx.expected_ds_bits = cpu_to_le16(IEEE80211_FCTL_FROMDS |
                                                      IEEE80211_FCTL_TODS);
                fastrx.da_offs = offsetof(struct ieee80211_hdr, addr3);
                fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr4);
                break;
        default:
                goto clear;
        }

        /* fast-rx is only enabled for authorized stations */
        if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED))
                goto clear;

        rcu_read_lock();
        /* prefer the station's pairwise key, else the default unicast key */
        key = rcu_dereference(sta->ptk[sta->ptk_idx]);
        if (!key)
                key = rcu_dereference(sdata->default_unicast_key);
        if (key) {
                switch (key->conf.cipher) {
                case WLAN_CIPHER_SUITE_TKIP:
                        /* we don't want to deal with MMIC in fast-rx */
                        goto clear_rcu;
                case WLAN_CIPHER_SUITE_CCMP:
                case WLAN_CIPHER_SUITE_CCMP_256:
                case WLAN_CIPHER_SUITE_GCMP:
                case WLAN_CIPHER_SUITE_GCMP_256:
                        break;
                default:
                        /* We also don't want to deal with
                         * WEP or cipher scheme.
                         */
                        goto clear_rcu;
                }

                fastrx.key = true;
                fastrx.icv_len = key->conf.icv_len;
        }

        /* all checks passed: publish a fast-rx object below */
        assign = true;
 clear_rcu:
        rcu_read_unlock();
 clear:
        __release(check_fast_rx);

        /* on allocation failure 'new' stays NULL, i.e. fast-rx is disabled */
        if (assign)
                new = kmemdup(&fastrx, sizeof(fastrx), GFP_KERNEL);

        offload_flags = get_bss_sdata(sdata)->vif.offload_flags;
        offload = offload_flags & IEEE80211_OFFLOAD_DECAP_ENABLED;

        /* set_offload becomes true only when the station flag changed */
        if (assign && offload)
                set_offload = !test_and_set_sta_flag(sta, WLAN_STA_DECAP_OFFLOAD);
        else
                set_offload = test_and_clear_sta_flag(sta, WLAN_STA_DECAP_OFFLOAD);

        if (set_offload)
                drv_sta_set_decap_offload(local, sdata, &sta->sta, assign);

        /* swap the pointer under sta->lock; readers access it under RCU */
        spin_lock_bh(&sta->lock);
        old = rcu_dereference_protected(sta->fast_rx, true);
        rcu_assign_pointer(sta->fast_rx, new);
        spin_unlock_bh(&sta->lock);

        if (old)
                kfree_rcu(old, rcu_head);
}

/*
 * Disable fast-rx for @sta: detach the current fast-rx object under
 * sta->lock and free it once an RCU grace period has elapsed.
 */
void ieee80211_clear_fast_rx(struct sta_info *sta)
{
        struct ieee80211_fast_rx *detached;

        spin_lock_bh(&sta->lock);
        detached = rcu_dereference_protected(sta->fast_rx, true);
        RCU_INIT_POINTER(sta->fast_rx, NULL);
        spin_unlock_bh(&sta->lock);

        if (!detached)
                return;

        kfree_rcu(detached, rcu_head);
}

/*
 * Re-run ieee80211_check_fast_rx() for every station that belongs to
 * @sdata itself or to an interface sharing the same (non-NULL) bss.
 * The lockdep assertion requires the wiphy lock to be held by the caller.
 */
void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_local *local = sdata->local;
        struct sta_info *entry;

        lockdep_assert_wiphy(local->hw.wiphy);

        list_for_each_entry(entry, &local->sta_list, list) {
                struct ieee80211_sub_if_data *owner = entry->sdata;

                if (owner == sdata ||
                    (owner->bss && owner->bss == sdata->bss))
                        ieee80211_check_fast_rx(entry);
        }
}

/*
 * Wrapper around __ieee80211_check_fast_rx_iface() that first asserts
 * (via lockdep) that the wiphy lock is held.
 */
void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata)
{
        lockdep_assert_wiphy(sdata->local->hw.wiphy);

        __ieee80211_check_fast_rx_iface(sdata);
}

/*
 * Final stage for a frame that is already in 802.3 (Ethernet) format:
 * update the RX statistics of the receiving link, optionally forward the
 * frame back to the wireless medium, and deliver it to the local stack.
 *
 * @rx: RX context; rx->skb starts with the Ethernet header (DA, then SA)
 * @fast_rx: fast-rx parameters of the transmitting station
 * @orig_len: length to account in the byte counter
 *
 * The skb is always consumed: freed if the link station lookup fails,
 * handed to dev_queue_xmit() when forwarded as unicast, or passed to
 * ieee80211_deliver_skb_to_local_stack() otherwise.
 */
static void ieee80211_rx_8023(struct ieee80211_rx_data *rx,
                              struct ieee80211_fast_rx *fast_rx,
                              int orig_len)
{
        struct ieee80211_sta_rx_stats *stats;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
        struct sta_info *sta = rx->sta;
        struct link_sta_info *link_sta;
        struct sk_buff *skb = rx->skb;
        void *sa = skb->data + ETH_ALEN;
        void *da = skb->data;

        /* pick the link station the statistics are accounted to */
        if (rx->link_id >= 0) {
                link_sta = rcu_dereference(sta->link[rx->link_id]);
                if (WARN_ON_ONCE(!link_sta)) {
                        dev_kfree_skb(rx->skb);
                        return;
                }
        } else {
                link_sta = &sta->deflink;
        }

        /* with RSS, statistics are kept per CPU */
        stats = &link_sta->rx_stats;
        if (fast_rx->uses_rss)
                stats = this_cpu_ptr(link_sta->pcpu_rx_stats);

        /* statistics part of ieee80211_rx_h_sta_process() */
        if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
                stats->last_signal = status->signal;
                /* the shared moving average is skipped in the RSS case */
                if (!fast_rx->uses_rss)
                        ewma_signal_add(&link_sta->rx_stats_avg.signal,
                                        -status->signal);
        }

        if (status->chains) {
                int i;

                stats->chains = status->chains;
                for (i = 0; i < ARRAY_SIZE(status->chain_signal); i++) {
                        int signal = status->chain_signal[i];

                        /* only chains flagged in the bitmap carry a value */
                        if (!(status->chains & BIT(i)))
                                continue;

                        stats->chain_signal_last[i] = signal;
                        if (!fast_rx->uses_rss)
                                ewma_signal_add(&link_sta->rx_stats_avg.chain_signal[i],
                                                -signal);
                }
        }
        /* end of statistics */

        stats->last_rx = jiffies;
        stats->last_rate = sta_stats_encode_rate(status);

        stats->fragments++;
        stats->packets++;

        skb->dev = fast_rx->dev;

        dev_sw_netstats_rx_add(fast_rx->dev, skb->len);

        /* The seqno index has the same property as needed
         * for the rx_msdu field, i.e. it is IEEE80211_NUM_TIDS
         * for non-QoS-data frames. Here we know it's a data
         * frame, so count MSDUs.
         */
        u64_stats_update_begin(&stats->syncp);
        stats->msdu[rx->seqno_idx]++;
        stats->bytes += orig_len;
        u64_stats_update_end(&stats->syncp);

        if (fast_rx->internal_forward) {
                struct sk_buff *xmit_skb = NULL;
                if (is_multicast_ether_addr(da)) {
                        /* multicast: transmit a copy, keep the original */
                        xmit_skb = skb_copy(skb, GFP_ATOMIC);
                } else if (!ether_addr_equal(da, sa) &&
                           sta_info_get(rx->sdata, da)) {
                        /* unicast to a known station: forward only */
                        xmit_skb = skb;
                        skb = NULL;
                }

                if (xmit_skb) {
                        /*
                         * Send to wireless media and increase priority by 256
                         * to keep the received priority instead of
                         * reclassifying the frame (see cfg80211_classify8021d).
                         */
                        xmit_skb->priority += 256;
                        xmit_skb->protocol = htons(ETH_P_802_3);
                        skb_reset_network_header(xmit_skb);
                        skb_reset_mac_header(xmit_skb);
                        dev_queue_xmit(xmit_skb);
                }

                /* ownership went to the transmit path, nothing to deliver */
                if (!skb)
                        return;
        }

        /* deliver to local stack */
        skb->protocol = eth_type_trans(skb, fast_rx->dev);
        ieee80211_deliver_skb_to_local_stack(skb, rx);
}

static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
                                     struct ieee80211_fast_rx *fast_rx)
{
        struct sk_buff *skb = rx->skb;
        struct ieee80211_hdr *hdr = (void *)skb->data;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        static ieee80211_rx_result res;
        int orig_len = skb->len;
        int hdrlen = ieee80211_hdrlen(hdr->frame_control);
        int snap_offs = hdrlen;
        struct {
                u8 snap[sizeof(rfc1042_header)];
                __be16 proto;
        } *payload __aligned(2);
        struct {
                u8 da[ETH_ALEN];
                u8 sa[ETH_ALEN];
        } addrs __aligned(2);
        struct ieee80211_sta_rx_stats *stats;

        /* for parallel-rx, we need to have DUP_VALIDATED, otherwise we write
         * to a common data structure; drivers can implement that per queue
         * but we don't have that information in mac80211
         */
        if (!(status->flag & RX_FLAG_DUP_VALIDATED))
                return false;

#define FAST_RX_CRYPT_FLAGS        (RX_FLAG_PN_VALIDATED | RX_FLAG_DECRYPTED)

        /* If using encryption, we also need to have:
         *  - PN_VALIDATED: similar, but the implementation is tricky
         *  - DECRYPTED: necessary for PN_VALIDATED
         */
        if (fast_rx->key &&
            (status->flag & FAST_RX_CRYPT_FLAGS) != FAST_RX_CRYPT_FLAGS)
                return false;

        if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
                return false;

        if (unlikely(ieee80211_is_frag(hdr)))
                return false;

        /* Since our interface address cannot be multicast, this
         * implicitly also rejects multicast frames without the
         * explicit check.
         *
         * We shouldn't get any *data* frames not addressed to us
         * (AP mode will accept multicast *management* frames), but
         * punting here will make it go through the full checks in
         * ieee80211_accept_frame().
         */
        if (!ether_addr_equal(fast_rx->vif_addr, hdr->addr1))
                return false;

        if ((hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_FROMDS |
                                              IEEE80211_FCTL_TODS)) !=
            fast_rx->expected_ds_bits)
                return false;

        /* assign the key to drop unencrypted frames (later)
         * and strip the IV/MIC if necessary
         */
        if (fast_rx->key && !(status->flag & RX_FLAG_IV_STRIPPED)) {
                /* GCMP header length is the same */
                snap_offs += IEEE80211_CCMP_HDR_LEN;
        }

        if (!ieee80211_vif_is_mesh(&rx->sdata->vif) &&
            !(status->rx_flags & IEEE80211_RX_AMSDU)) {
                if (!pskb_may_pull(skb, snap_offs + sizeof(*payload)))
                        return false;

                payload = (void *)(skb->data + snap_offs);

                if (!ether_addr_equal(payload->snap, fast_rx->rfc1042_hdr))
                        return false;

                /* Don't handle these here since they require special code.
                 * Accept AARP and IPX even though they should come with a
                 * bridge-tunnel header - but if we get them this way then
                 * there's little point in discarding them.
                 */
                if (unlikely(payload->proto == cpu_to_be16(ETH_P_TDLS) ||
                             payload->proto == fast_rx->control_port_protocol))
                        return false;
        }

        /* after this point, don't punt to the slowpath! */

        if (rx->key && !(status->flag & RX_FLAG_MIC_STRIPPED) &&
            pskb_trim(skb, skb->len - fast_rx->icv_len))
                goto drop;

        if (rx->key && !ieee80211_has_protected(hdr->frame_control))
                goto drop;

        if (status->rx_flags & IEEE80211_RX_AMSDU) {
                if (__ieee80211_rx_h_amsdu(rx, snap_offs - hdrlen) !=
                    RX_QUEUED)
                        goto drop;

                return true;
        }

        /* do the header conversion - first grab the addresses */
        ether_addr_copy(addrs.da, skb->data + fast_rx->da_offs);
        ether_addr_copy(addrs.sa, skb->data + fast_rx->sa_offs);
        if (ieee80211_vif_is_mesh(&rx->sdata->vif)) {
            skb_pull(skb, snap_offs - 2);
            put_unaligned_be16(skb->len - 2, skb->data);
        } else {
            skb_postpull_rcsum(skb, skb->data + snap_offs,
                               sizeof(rfc1042_header) + 2);

            /* remove the SNAP but leave the ethertype */
            skb_pull(skb, snap_offs + sizeof(rfc1042_header));
        }
        /* push the addresses in front */
        memcpy(skb_push(skb, sizeof(addrs)), &addrs, sizeof(addrs));

        res = ieee80211_rx_mesh_data(rx->sdata, rx->sta, rx->skb);
        switch (res) {
        case RX_QUEUED:
                return true;
        case RX_CONTINUE:
                break;
        default:
                goto drop;
        }

        ieee80211_rx_8023(rx, fast_rx, orig_len);

        return true;
 drop:
        dev_kfree_skb(skb);

        if (fast_rx->uses_rss)
                stats = this_cpu_ptr(rx->link_sta->pcpu_rx_stats);
        else
                stats = &rx->link_sta->rx_stats;

        stats->dropped++;
        return true;
}

/*
 * Run one received frame through the RX path for the interface/station
 * already set up in @rx.
 *
 * With @consume set, the fast-rx path is tried first and @skb itself is
 * processed; without it, the regular handlers work on a private copy so
 * that the caller can offer the same @skb to further interfaces.
 *
 * This function returns whether or not the SKB
 * was destined for RX processing or not, which,
 * if consume is true, is equivalent to whether
 * or not the skb was consumed.
 */
static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
                                            struct sk_buff *skb, bool consume)
{
        struct ieee80211_local *local = rx->local;
        struct ieee80211_sub_if_data *sdata = rx->sdata;
        struct ieee80211_hdr *hdr;
        struct link_sta_info *link_sta = rx->link_sta;
        struct ieee80211_link_data *link = rx->link;

        rx->skb = skb;

        /* See if we can do fast-rx; if we have to copy we already lost,
         * so punt in that case. We should never have to deliver a data
         * frame to multiple interfaces anyway.
         *
         * We skip the ieee80211_accept_frame() call and do the necessary
         * checking inside ieee80211_invoke_fast_rx().
         */
        if (consume && rx->sta) {
                struct ieee80211_fast_rx *fast_rx;

                fast_rx = rcu_dereference(rx->sta->fast_rx);
                if (fast_rx && ieee80211_invoke_fast_rx(rx, fast_rx))
                        return true;
        }

        if (!ieee80211_accept_frame(rx))
                return false;

        if (!consume) {
                struct skb_shared_hwtstamps *shwt;

                rx->skb = skb_copy(skb, GFP_ATOMIC);
                if (!rx->skb) {
                        if (net_ratelimit())
                                wiphy_debug(local->hw.wiphy,
                                        "failed to copy skb for %s\n",
                                        sdata->name);
                        /* the frame was meant for us even though we lost it */
                        return true;
                }

                /* skb_copy() does not copy the hw timestamps, so copy it
                 * explicitly
                 */
                shwt = skb_hwtstamps(rx->skb);
                shwt->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
        }

        /*
         * Take the header pointer only now, from the skb that is actually
         * processed: in the !consume case that is the copy made above, and
         * in the consume case the fast-rx attempt may have called
         * pskb_may_pull(), which can reallocate the skb head and would
         * leave a pointer taken at function entry dangling.
         */
        hdr = (struct ieee80211_hdr *)rx->skb->data;

        if (unlikely(rx->sta && rx->sta->sta.mlo) &&
            is_unicast_ether_addr(hdr->addr1) &&
            !ieee80211_is_probe_resp(hdr->frame_control) &&
            !ieee80211_is_beacon(hdr->frame_control)) {
                /* translate to MLD addresses */
                if (ether_addr_equal(link->conf->addr, hdr->addr1))
                        ether_addr_copy(hdr->addr1, rx->sdata->vif.addr);
                if (ether_addr_equal(link_sta->addr, hdr->addr2))
                        ether_addr_copy(hdr->addr2, rx->sta->addr);
                /* translate A3 only if it's the BSSID */
                if (!ieee80211_has_tods(hdr->frame_control) &&
                    !ieee80211_has_fromds(hdr->frame_control)) {
                        if (ether_addr_equal(link_sta->addr, hdr->addr3))
                                ether_addr_copy(hdr->addr3, rx->sta->addr);
                        else if (ether_addr_equal(link->conf->addr, hdr->addr3))
                                ether_addr_copy(hdr->addr3, rx->sdata->vif.addr);
                }
                /* not needed for A4 since it can only carry the SA */
        }

        ieee80211_invoke_rx_handlers(rx);
        return true;
}

/*
 * Entry point for frames the driver delivers already in 802.3 format.
 *
 * Such a frame can only be processed when the transmitting station is known
 * (@pubsta) and currently has a fast-rx object; in every other case - and
 * when the frame is shorter than an Ethernet header or the link cannot be
 * resolved - the skb is freed.
 */
static void __ieee80211_rx_handle_8023(struct ieee80211_hw *hw,
                                       struct ieee80211_sta *pubsta,
                                       struct sk_buff *skb,
                                       struct list_head *list)
{
        struct ieee80211_local *local = hw_to_local(hw);
        struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb);
        struct ieee80211_fast_rx *fast_rx;
        struct ieee80211_rx_data rx;
        struct sta_info *sta;
        int link_id;

        memset(&rx, 0, sizeof(rx));
        rx.skb = skb;
        rx.local = local;
        rx.list = list;
        rx.link_id = -1;

        I802_DEBUG_INC(local->dot11ReceivedFragmentCount);

        /* drop frame if too short for header, or if the sender is unknown */
        if (skb->len < sizeof(struct ethhdr) || !pubsta)
                goto drop;

        link_id = rx_status->link_valid ? rx_status->link_id : -1;

        /*
         * TODO: Should the frame be dropped if the right link_id is not
         * available? Or may be it is fine in the current form to proceed with
         * the frame processing because with frame being in 802.3 format,
         * link_id is used only for stats purpose and updating the stats on
         * the deflink is fine?
         */
        sta = container_of(pubsta, struct sta_info, sta);
        if (!ieee80211_rx_data_set_sta(&rx, sta, link_id))
                goto drop;

        fast_rx = rcu_dereference(rx.sta->fast_rx);
        if (fast_rx) {
                ieee80211_rx_8023(&rx, fast_rx, skb->len);
                return;
        }

drop:
        dev_kfree_skb(skb);
}

/*
 * Resolve the transmitter (addr2) of @skb to a station on rx->sdata, bind
 * it to @rx and hand the frame to ieee80211_prepare_and_rx_handle().
 *
 * Returns false if the link could not be set up, otherwise whatever
 * ieee80211_prepare_and_rx_handle() returns.
 */
static bool ieee80211_rx_for_interface(struct ieee80211_rx_data *rx,
                                       struct sk_buff *skb, bool consume)
{
        struct ieee80211_hdr *hdr = (void *)skb->data;
        struct link_sta_info *by_link;
        struct sta_info *sta;
        int link_id = -1;

        /*
         * Look up link station first, in case there's a
         * chance that they might have a link address that
         * is identical to the MLD address, that way we'll
         * have the link information if needed.
         */
        by_link = link_sta_info_get_bss(rx->sdata, hdr->addr2);
        if (!by_link) {
                struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb);

                /* no link match: look the address up as a station address */
                sta = sta_info_get_bss(rx->sdata, hdr->addr2);
                if (rx_status->link_valid)
                        link_id = rx_status->link_id;
        } else {
                sta = by_link->sta;
                link_id = by_link->link_id;
        }

        if (ieee80211_rx_data_set_sta(rx, sta, link_id))
                return ieee80211_prepare_and_rx_handle(rx, skb, consume);

        return false;
}

/*
 * This is the actual Rx frames handler. As it belongs to the Rx path it must
 * be called with rcu_read_lock protection.
 *
 * Outline:
 *  - validate/linearize the 802.11 header (the frame is freed on failure),
 *  - hand probe responses and beacons to the scan code,
 *  - for data frames, deliver to the station given by @pubsta or, if none
 *    was given, to every station whose address matches addr2,
 *  - otherwise offer the frame to every running interface that is neither
 *    a monitor nor an AP_VLAN interface.
 *
 * The skb is freed at "out" unless a call made with consume == true
 * reported success.
 */
static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
                                         struct ieee80211_sta *pubsta,
                                         struct sk_buff *skb,
                                         struct list_head *list)
{
        struct ieee80211_local *local = hw_to_local(hw);
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        struct ieee80211_sub_if_data *sdata;
        struct ieee80211_hdr *hdr;
        __le16 fc;
        struct ieee80211_rx_data rx;
        struct ieee80211_sub_if_data *prev;
        struct rhlist_head *tmp;
        int err = 0;

        /* frame_control is read before any pull/linearize below */
        fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
        memset(&rx, 0, sizeof(rx));
        rx.skb = skb;
        rx.local = local;
        rx.list = list;
        /* no link selected yet */
        rx.link_id = -1;

        if (ieee80211_is_data(fc) || ieee80211_is_mgmt(fc))
                I802_DEBUG_INC(local->dot11ReceivedFragmentCount);

        if (ieee80211_is_mgmt(fc)) {
                /* drop frame if too short for header */
                if (skb->len < ieee80211_hdrlen(fc))
                        err = -ENOBUFS;
                else
                        err = skb_linearize(skb);
        } else {
                /* other frames only need the header to be pullable */
                err = !pskb_may_pull(skb, ieee80211_hdrlen(fc));
        }

        if (err) {
                dev_kfree_skb(skb);
                return;
        }

        /* (re)load the header pointer after the pull/linearize above */
        hdr = (struct ieee80211_hdr *)skb->data;
        ieee80211_parse_qos(&rx);
        ieee80211_verify_alignment(&rx);

        /* probe responses and (S1G) beacons are also passed to the scan code */
        if (unlikely(ieee80211_is_probe_resp(hdr->frame_control) ||
                     ieee80211_is_beacon(hdr->frame_control) ||
                     ieee80211_is_s1g_beacon(hdr->frame_control)))
                ieee80211_scan_rx(local, skb);

        if (ieee80211_is_data(fc)) {
                struct sta_info *sta, *prev_sta;
                int link_id = -1;

                if (status->link_valid)
                        link_id = status->link_id;

                /* the driver already identified the station: use only that one */
                if (pubsta) {
                        sta = container_of(pubsta, struct sta_info, sta);
                        if (!ieee80211_rx_data_set_sta(&rx, sta, link_id))
                                goto out;

                        /*
                         * In MLO connection, fetch the link_id using addr2
                         * when the driver does not pass link_id in status.
                         * When the address translation is already performed by
                         * driver/hw, the valid link_id must be passed in
                         * status.
                         */

                        if (!status->link_valid && pubsta->mlo) {
                                struct link_sta_info *link_sta;

                                link_sta = link_sta_info_get_bss(rx.sdata,
                                                                 hdr->addr2);
                                if (!link_sta)
                                        goto out;

                                ieee80211_rx_data_set_link(&rx, link_sta->link_id);
                        }

                        if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
                                return;
                        goto out;
                }

                prev_sta = NULL;

                /*
                 * Walk all stations matching addr2, always handling the
                 * previous match with consume == false so that the last
                 * match (handled after the loop) can be given the skb
                 * with consume == true.
                 */
                for_each_sta_info(local, hdr->addr2, sta, tmp) {
                        if (!prev_sta) {
                                prev_sta = sta;
                                continue;
                        }

                        rx.sdata = prev_sta->sdata;
                        if (!ieee80211_rx_data_set_sta(&rx, prev_sta, link_id))
                                goto out;

                        /*
                         * MLO station without a valid link_id in status:
                         * not delivered.
                         * NOTE(review): this continue skips the
                         * "prev_sta = sta" update below, so the current
                         * sta is not remembered - confirm this is intended.
                         */
                        if (!status->link_valid && prev_sta->sta.mlo)
                                continue;

                        ieee80211_prepare_and_rx_handle(&rx, skb, false);

                        prev_sta = sta;
                }

                /* last (or only) matching station */
                if (prev_sta) {
                        rx.sdata = prev_sta->sdata;
                        if (!ieee80211_rx_data_set_sta(&rx, prev_sta, link_id))
                                goto out;

                        if (!status->link_valid && prev_sta->sta.mlo)
                                goto out;

                        if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
                                return;
                        goto out;
                }
        }

        /*
         * Reached for non-data frames and for data frames without any
         * matching station: offer the frame to the interfaces.
         */
        prev = NULL;

        list_for_each_entry_rcu(sdata, &local->interfaces, list) {
                if (!ieee80211_sdata_running(sdata))
                        continue;

                if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
                    sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
                        continue;

                /*
                 * frame is destined for this interface, but if it's
                 * not also for the previous one we handle that after
                 * the loop to avoid copying the SKB once too much
                 */

                if (!prev) {
                        prev = sdata;
                        continue;
                }

                rx.sdata = prev;
                ieee80211_rx_for_interface(&rx, skb, false);

                prev = sdata;
        }

        /* last candidate interface gets the skb with consume == true */
        if (prev) {
                rx.sdata = prev;

                if (ieee80211_rx_for_interface(&rx, skb, true))
                        return;
        }

 out:
        dev_kfree_skb(skb);
}

/*
 * This is the receive path handler. It is called by a low level driver when an
 * 802.11 MPDU is received from the hardware.
 */
void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
                       struct sk_buff *skb, struct list_head *list)
{
        struct ieee80211_local *local = hw_to_local(hw);
        struct ieee80211_rate *rate = NULL;
        struct ieee80211_supported_band *sband;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;

        WARN_ON_ONCE(softirq_count() == 0);

        if (WARN_ON(status->band >= NUM_NL80211_BANDS))
                goto drop;

        sband = local->hw.wiphy->bands[status->band];
        if (WARN_ON(!sband))
                goto drop;

        /*
         * If we're suspending, it is possible although not too likely
         * that we'd be receiving frames after having already partially
         * quiesced the stack. We can't process such frames then since
         * that might, for example, cause stations to be added or other
         * driver callbacks be invoked.
         */
        if (unlikely(local->quiescing || local->suspended))
                goto drop;

        /* We might be during a HW reconfig, prevent Rx for the same reason */
        if (unlikely(local->in_reconfig))
                goto drop;

        /*
         * The same happens when we're not even started,
         * but that's worth a warning.
         */
        if (WARN_ON(!local->started))
                goto drop;

        if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC))) {
                /*
                 * Validate the rate, unless a PLCP error means that
                 * we probably can't have a valid rate here anyway.
                 */

                switch (status->encoding) {
                case RX_ENC_HT:
                        /*
                         * rate_idx is MCS index, which can be [0-76]
                         * as documented on:
                         *
                         * https://wireless.wiki.kernel.org/en/developers/Documentation/ieee80211/802.11n
                         *
                         * Anything else would be some sort of driver or
                         * hardware error. The driver should catch hardware
                         * errors.
                         */
                        if (WARN(status->rate_idx > 76,
                                 "Rate marked as an HT rate but passed "
                                 "status->rate_idx is not "
                                 "an MCS index [0-76]: %d (0x%02x)\n",
                                 status->rate_idx,
                                 status->rate_idx))
                                goto drop;
                        break;
                case RX_ENC_VHT:
                        if (WARN_ONCE(status->rate_idx > 11 ||
                                      !status->nss ||
                                      status->nss > 8,
                                      "Rate marked as a VHT rate but data is invalid: MCS: %d, NSS: %d\n",
                                      status->rate_idx, status->nss))
                                goto drop;
                        break;
                case RX_ENC_HE:
                        if (WARN_ONCE(status->rate_idx > 11 ||
                                      !status->nss ||
                                      status->nss > 8,
                                      "Rate marked as an HE rate but data is invalid: MCS: %d, NSS: %d\n",
                                      status->rate_idx, status->nss))
                                goto drop;
                        break;
                case RX_ENC_EHT:
                        if (WARN_ONCE(status->rate_idx > 15 ||
                                      !status->nss ||
                                      status->nss > 8 ||
                                      status->eht.gi > NL80211_RATE_INFO_EHT_GI_3_2,
                                      "Rate marked as an EHT rate but data is invalid: MCS:%d, NSS:%d, GI:%d\n",
                                      status->rate_idx, status->nss, status->eht.gi))
                                goto drop;
                        break;
                default:
                        WARN_ON_ONCE(1);
                        fallthrough;
                case RX_ENC_LEGACY:
                        if (WARN_ON(status->rate_idx >= sband->n_bitrates))
                                goto drop;
                        rate = &sband->bitrates[status->rate_idx];
                }
        }

        if (WARN_ON_ONCE(status->link_id >= IEEE80211_LINK_UNSPECIFIED))
                goto drop;

        status->rx_flags = 0;

        kcov_remote_start_common(skb_get_kcov_handle(skb));

        /*
         * Frames with failed FCS/PLCP checksum are not returned,
         * all other frames are returned without radiotap header
         * if it was previously present.
         * Also, frames with less than 16 bytes are dropped.
         */
        if (!(status->flag & RX_FLAG_8023))
                skb = ieee80211_rx_monitor(local, skb, rate);
        if (skb) {
                if ((status->flag & RX_FLAG_8023) ||
                        ieee80211_is_data_present(hdr->frame_control))
                        ieee80211_tpt_led_trig_rx(local, skb->len);

                if (status->flag & RX_FLAG_8023)
                        __ieee80211_rx_handle_8023(hw, pubsta, skb, list);
                else
                        __ieee80211_rx_handle_packet(hw, pubsta, skb, list);
        }

        kcov_remote_stop();
        return;
 drop:
        kfree_skb(skb);
}
EXPORT_SYMBOL(ieee80211_rx_list);

void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
                       struct sk_buff *skb, struct napi_struct *napi)
{
        struct sk_buff *tmp;
        LIST_HEAD(list);


        /*
         * key references and virtual interfaces are protected using RCU
         * and this requires that we are in a read-side RCU section during
         * receive processing
         */
        rcu_read_lock();
        ieee80211_rx_list(hw, pubsta, skb, &list);
        rcu_read_unlock();

        if (!napi) {
                netif_receive_skb_list(&list);
                return;
        }

        list_for_each_entry_safe(skb, tmp, &list, list) {
                skb_list_del_init(skb);
                napi_gro_receive(napi, skb);
        }
}
EXPORT_SYMBOL(ieee80211_rx_napi);

/*
 * Variant of the rx handler that may be called from hard irq context:
 * the skb is only tagged and appended to the local skb queue here, and
 * the tasklet is scheduled.
 */
void ieee80211_rx_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb)
{
        struct ieee80211_local *mac_local;

        /* The RX status is stored in skb->cb, so it has to fit there. */
        BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ieee80211_rx_status));

        mac_local = hw_to_local(hw);

        /* Tag the skb as an RX message before queueing it. */
        skb->pkt_type = IEEE80211_RX_MSG;
        skb_queue_tail(&mac_local->skb_queue, skb);
        tasklet_schedule(&mac_local->tasklet);
}
EXPORT_SYMBOL(ieee80211_rx_irqsafe);



































    3 





    2 








    3 





























    1 










    1 


























    3 







    4 






    3 







































    1 






























































    1 



    1 
























    1 

    1 























    1 










    1 















































































    2 














































    2 







    2 
    2 





















    2 


    2 








































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/bitmap.h>
#include <linux/bug.h>
#include <linux/export.h>
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/xarray.h>

/**
 * idr_alloc_u32() - Allocate an ID.
 * @idr: IDR handle.
 * @ptr: Pointer to be associated with the new ID.
 * @nextid: Pointer to an ID.
 * @max: The maximum ID to allocate (inclusive).
 * @gfp: Memory allocation flags.
 *
 * Allocates an unused ID in the range specified by @nextid and @max.
 * Note that @max is inclusive whereas the @end parameter to idr_alloc()
 * is exclusive.  The new ID is assigned to @nextid before the pointer
 * is inserted into the IDR, so if @nextid points into the object pointed
 * to by @ptr, a concurrent lookup will not find an uninitialised ID.
 *
 * The caller should provide their own locking to ensure that two
 * concurrent modifications to the IDR are not possible.  Read-only
 * accesses to the IDR may be done under the RCU read lock or may
 * exclude simultaneous writers.
 *
 * Return: 0 if an ID was allocated, -ENOMEM if memory allocation failed,
 * or -ENOSPC if no free IDs could be found.  If an error occurred,
 * @nextid is unchanged.
 */
int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid,
                        unsigned long max, gfp_t gfp)
{
        const unsigned int base = idr->idr_base;
        const unsigned int requested = *nextid;
        unsigned int offset = 0;
        struct radix_tree_iter iter;
        void __rcu **slot;

        /* Warn once and repair the flags if the root is not set up as an IDR. */
        if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR)))
                idr->idr_rt.xa_flags |= IDR_RT_MARKER;

        /* The tree is indexed relative to idr_base; IDs below it start at 0. */
        if (requested >= base)
                offset = requested - base;

        radix_tree_iter_init(&iter, offset);
        slot = idr_get_free(&idr->idr_rt, &iter, gfp, max - base);
        if (IS_ERR(slot))
                return PTR_ERR(slot);

        /* Publish the ID to the caller before the pointer becomes visible. */
        *nextid = iter.index + base;
        /* there is a memory barrier inside radix_tree_iter_replace() */
        radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr);
        /* The slot is now in use: drop its "free" tag. */
        radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE);

        return 0;
}
EXPORT_SYMBOL_GPL(idr_alloc_u32);

/**
 * idr_alloc() - Allocate an ID.
 * @idr: IDR handle.
 * @ptr: Pointer to be associated with the new ID.
 * @start: The minimum ID (inclusive).
 * @end: The maximum ID (exclusive).
 * @gfp: Memory allocation flags.
 *
 * Allocates an unused ID in the range specified by @start and @end.  If
 * @end is <= 0, it is treated as one larger than %INT_MAX.  This allows
 * callers to use @start + N as @end as long as N is within integer range.
 *
 * The caller should provide their own locking to ensure that two
 * concurrent modifications to the IDR are not possible.  Read-only
 * accesses to the IDR may be done under the RCU read lock or may
 * exclude simultaneous writers.
 *
 * Return: The newly allocated ID, -ENOMEM if memory allocation failed,
 * or -ENOSPC if no free IDs could be found.
 */
int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
{
        unsigned long last = INT_MAX;
        u32 new_id;
        int err;

        if (WARN_ON_ONCE(start < 0))
                return -EINVAL;

        /* @end is exclusive; a non-positive @end means "up to INT_MAX". */
        if (end > 0)
                last = end - 1;
        new_id = start;

        err = idr_alloc_u32(idr, ptr, &new_id, last, gfp);

        return err ? err : (int)new_id;
}
EXPORT_SYMBOL_GPL(idr_alloc);

/**
 * idr_alloc_cyclic() - Allocate an ID cyclically.
 * @idr: IDR handle.
 * @ptr: Pointer to be associated with the new ID.
 * @start: The minimum ID (inclusive).
 * @end: The maximum ID (exclusive).
 * @gfp: Memory allocation flags.
 *
 * Allocates an unused ID in the range specified by @start and @end.  If
 * @end is <= 0, it is treated as one larger than %INT_MAX.  This allows
 * callers to use @start + N as @end as long as N is within integer range.
 * The search for an unused ID will start at the last ID allocated and will
 * wrap around to @start if no free IDs are found before reaching @end.
 *
 * The caller should provide their own locking to ensure that two
 * concurrent modifications to the IDR are not possible.  Read-only
 * accesses to the IDR may be done under the RCU read lock or may
 * exclude simultaneous writers.
 *
 * Return: The newly allocated ID, -ENOMEM if memory allocation failed,
 * or -ENOSPC if no free IDs could be found.
 */
int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
{
        const int last = (end > 0) ? end - 1 : INT_MAX;
        u32 cursor = idr->idr_next;
        int rc;

        /* Never begin the search below the caller's lower bound. */
        if ((int)cursor < start)
                cursor = start;

        rc = idr_alloc_u32(idr, ptr, &cursor, last, gfp);
        if (rc == -ENOSPC && cursor > start) {
                /* Nothing free from the cursor upwards: wrap to @start. */
                cursor = start;
                rc = idr_alloc_u32(idr, ptr, &cursor, last, gfp);
        }

        if (!rc) {
                /* Remember where the next cyclic search should begin. */
                idr->idr_next = cursor + 1;
                rc = cursor;
        }

        return rc;
}
EXPORT_SYMBOL(idr_alloc_cyclic);

/**
 * idr_remove() - Remove an ID from the IDR.
 * @idr: IDR handle.
 * @id: Pointer ID.
 *
 * Removes this ID from the IDR.  If the ID was not previously in the IDR,
 * this function returns %NULL.
 *
 * Since this function modifies the IDR, the caller should provide their
 * own locking to ensure that concurrent modification of the same IDR is
 * not possible.
 *
 * Return: The pointer formerly associated with this ID.
 */
void *idr_remove(struct idr *idr, unsigned long id)
{
        /* The tree is indexed relative to idr_base. */
        unsigned long index = id - idr->idr_base;

        return radix_tree_delete_item(&idr->idr_rt, index, NULL);
}
EXPORT_SYMBOL_GPL(idr_remove);

/**
 * idr_find() - Return pointer for given ID.
 * @idr: IDR handle.
 * @id: Pointer ID.
 *
 * Looks up the pointer associated with this ID.  A %NULL pointer may
 * indicate that @id is not allocated or that the %NULL pointer was
 * associated with this ID.
 *
 * This function can be called under rcu_read_lock(), given that the leaf
 * pointers lifetimes are correctly managed.
 *
 * Return: The pointer associated with this ID.
 */
void *idr_find(const struct idr *idr, unsigned long id)
{
        /* The tree is indexed relative to idr_base. */
        unsigned long index = id - idr->idr_base;

        return radix_tree_lookup(&idr->idr_rt, index);
}
EXPORT_SYMBOL_GPL(idr_find);

/**
 * idr_for_each() - Iterate through all stored pointers.
 * @idr: IDR handle.
 * @fn: Function to be called for each pointer.
 * @data: Data passed to callback function.
 *
 * The callback function will be called for each entry in @idr, passing
 * the ID, the entry and @data.
 *
 * If @fn returns anything other than %0, the iteration stops and that
 * value is returned from this function.
 *
 * idr_for_each() can be called concurrently with idr_alloc() and
 * idr_remove() if protected by RCU.  Newly added entries may not be
 * seen and deleted entries may be seen, but adding and removing entries
 * will not cause other entries to be skipped, nor spurious ones to be seen.
 */
int idr_for_each(const struct idr *idr,
                int (*fn)(int id, void *p, void *data), void *data)
{
        const int base = idr->idr_base;
        struct radix_tree_iter iter;
        void __rcu **slot;
        int rc = 0;

        radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) {
                unsigned long id = iter.index + base;

                /* The callback takes an int ID; stop at anything larger. */
                if (WARN_ON_ONCE(id > INT_MAX))
                        break;

                /* A non-zero callback result ends the walk and is returned. */
                rc = fn(id, rcu_dereference_raw(*slot), data);
                if (rc)
                        break;
        }

        return rc;
}
EXPORT_SYMBOL(idr_for_each);

/**
 * idr_get_next_ul() - Find next populated entry.
 * @idr: IDR handle.
 * @nextid: Pointer to an ID.
 *
 * Returns the next populated entry in the tree with an ID greater than
 * or equal to the value pointed to by @nextid.  On exit, @nextid is updated
 * to the ID of the found value.  To use in a loop, the value pointed to by
 * nextid must be incremented by the user.
 */
void *idr_get_next_ul(struct idr *idr, unsigned long *nextid)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        void *entry = NULL;
        unsigned long base = idr->idr_base;
        unsigned long id = *nextid;

        /* the tree is indexed relative to idr_base; IDs below it start at 0 */
        id = (id < base) ? 0 : id - base;
        radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, id) {
                entry = rcu_dereference_raw(*slot);
                /* empty slot: keep looking */
                if (!entry)
                        continue;
                /* ordinary (non-internal) entry: found */
                if (!xa_is_internal(entry))
                        break;
                /*
                 * Internal entry: accept it unless it sits in the head
                 * slot or is a retry entry; in those cases restart the
                 * iterator instead.
                 */
                if (slot != &idr->idr_rt.xa_head && !xa_is_retry(entry))
                        break;
                slot = radix_tree_iter_retry(&iter);
        }
        /* presumably slot is NULL only when the walk ran out of slots */
        if (!slot)
                return NULL;

        /* report the ID of the found entry in external (base-relative) form */
        *nextid = iter.index + base;
        return entry;
}
EXPORT_SYMBOL(idr_get_next_ul);

/**
 * idr_get_next() - Find next populated entry.
 * @idr: IDR handle.
 * @nextid: Pointer to an ID.
 *
 * Returns the next populated entry in the tree with an ID greater than
 * or equal to the value pointed to by @nextid.  On exit, @nextid is updated
 * to the ID of the found value.  To use in a loop, the value pointed to by
 * nextid must be incremented by the user.
 */
void *idr_get_next(struct idr *idr, int *nextid)
{
        unsigned long cursor = *nextid;
        void *found;

        found = idr_get_next_ul(idr, &cursor);

        /* Only IDs that fit in an int can be reported through @nextid. */
        if (!WARN_ON_ONCE(cursor > INT_MAX)) {
                *nextid = cursor;
                return found;
        }

        return NULL;
}
EXPORT_SYMBOL(idr_get_next);

/**
 * idr_replace() - replace pointer for given ID.
 * @idr: IDR handle.
 * @ptr: New pointer to associate with the ID.
 * @id: ID to change.
 *
 * Replace the pointer registered with an ID and return the old value.
 * This function can be called under the RCU read lock concurrently with
 * idr_alloc() and idr_remove() (as long as the ID being removed is not
 * the one being replaced!).
 *
 * Returns: the old value on success.  %-ENOENT indicates that @id was not
 * found.  This implementation does not validate @ptr, so %-EINVAL is never
 * returned.
 */
void *idr_replace(struct idr *idr, void *ptr, unsigned long id)
{
        struct radix_tree_node *node;
        void __rcu **slot = NULL;
        void *entry;

        id -= idr->idr_base;

        entry = __radix_tree_lookup(&idr->idr_rt, id, &node, &slot);
        if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE))
                return ERR_PTR(-ENOENT);

        __radix_tree_replace(&idr->idr_rt, node, slot, ptr);

        return entry;
}
EXPORT_SYMBOL(idr_replace);

/**
 * DOC: IDA description
 *
 * The IDA is an ID allocator which does not provide the ability to
 * associate an ID with a pointer.  As such, it only needs to store one
 * bit per ID, and so is more space efficient than an IDR.  To use an IDA,
 * define it using DEFINE_IDA() (or embed a &struct ida in a data structure,
 * then initialise it using ida_init()).  To allocate a new ID, call
 * ida_alloc(), ida_alloc_min(), ida_alloc_max() or ida_alloc_range().
 * To free an ID, call ida_free().
 *
 * ida_destroy() can be used to dispose of an IDA without needing to
 * free the individual IDs in it.  You can use ida_is_empty() to find
 * out whether the IDA has any IDs currently allocated.
 *
 * The IDA handles its own locking.  It is safe to call any of the IDA
 * functions without synchronisation in your code.
 *
 * IDs are currently limited to the range [0-INT_MAX].  If this is an awkward
 * limitation, it should be quite straightforward to raise the maximum.
 */

/*
 * Developer's notes:
 *
 * The IDA uses the functionality provided by the XArray to store bitmaps in
 * each entry.  The XA_FREE_MARK is only cleared when all bits in the bitmap
 * have been set.
 *
 * I considered telling the XArray that each slot is an order-10 node
 * and indexing by bit number, but the XArray can't allow a single multi-index
 * entry in the head, which would significantly increase memory consumption
 * for the IDA.  So instead we divide the index by the number of bits in the
 * leaf bitmap before doing a radix tree lookup.
 *
 * As an optimisation, if there are only a few low bits set in any given
 * leaf, instead of allocating a 128-byte bitmap, we store the bits
 * as a value entry.  Value entries never have the XA_FREE_MARK cleared
 * because we can always convert them into a bitmap entry.
 *
 * It would be possible to optimise further; once we've run out of a
 * single 128-byte bitmap, we currently switch to a 576-byte node, put
 * the 128-byte bitmap in the first entry and then start allocating extra
 * 128-byte entries.  We could instead use the 512 bytes of the node's
 * data as a bitmap before moving to that scheme.  I do not believe this
 * is a worthwhile optimisation; Rasmus Villemoes surveyed the current
 * users of the IDA and almost none of them use more than 1024 entries.
 * Those that do use more than the 8192 IDs that the 512 bytes would
 * provide.
 *
 * The IDA always uses a lock to alloc/free.  If we add a 'test_bit'
 * equivalent, it will still need locking.  Going to RCU lookup would require
 * using RCU to free bitmaps, and that's not trivial without embedding an
 * RCU head in the bitmap, which adds a 2-pointer overhead to each 128-byte
 * bitmap, which is excessive.
 */

/**
 * ida_alloc_range() - Allocate an unused ID.
 * @ida: IDA handle.
 * @min: Lowest ID to allocate.
 * @max: Highest ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Allocate an ID between @min and @max, inclusive.  The allocated ID will
 * not exceed %INT_MAX, even if @max is larger.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
 * or %-ENOSPC if there are no free IDs.
 */
int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max,
                        gfp_t gfp)
{
        /* each XArray entry covers IDA_BITMAP_BITS consecutive IDs */
        XA_STATE(xas, &ida->xa, min / IDA_BITMAP_BITS);
        unsigned bit = min % IDA_BITMAP_BITS;
        unsigned long flags;
        /* alloc: bitmap preallocated outside the lock by the "alloc" path */
        struct ida_bitmap *bitmap, *alloc = NULL;

        /* a minimum above INT_MAX can never be satisfied */
        if ((int)min < 0)
                return -ENOSPC;

        /* clamp the upper bound to INT_MAX */
        if ((int)max < 0)
                max = INT_MAX;

retry:
        xas_lock_irqsave(&xas, flags);
next:
        /* find the next entry, up to max's index, carrying the free mark */
        bitmap = xas_find_marked(&xas, max / IDA_BITMAP_BITS, XA_FREE_MARK);
        /* moved past the entry containing min: search it from its first bit */
        if (xas.xa_index > min / IDA_BITMAP_BITS)
                bit = 0;
        if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
                goto nospc;

        if (xa_is_value(bitmap)) {
                /* the bits of this entry are stored inline as a value entry */
                unsigned long tmp = xa_to_value(bitmap);

                if (bit < BITS_PER_XA_VALUE) {
                        bit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE, bit);
                        if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
                                goto nospc;
                        if (bit < BITS_PER_XA_VALUE) {
                                /* free bit fits in the value entry: done */
                                tmp |= 1UL << bit;
                                xas_store(&xas, xa_mk_value(tmp));
                                goto out;
                        }
                }
                /*
                 * No usable bit in the value entry: convert it into a
                 * real bitmap, using the preallocated one if there is
                 * one, else trying a GFP_NOWAIT allocation under the lock.
                 */
                bitmap = alloc;
                if (!bitmap)
                        bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT);
                if (!bitmap)
                        goto alloc;
                bitmap->bitmap[0] = tmp;
                xas_store(&xas, bitmap);
                if (xas_error(&xas)) {
                        /*
                         * Store failed: clear the copied bits again,
                         * presumably so the buffer is a zeroed bitmap
                         * if it is reused on retry.
                         */
                        bitmap->bitmap[0] = 0;
                        goto out;
                }
        }

        if (bitmap) {
                bit = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, bit);
                if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
                        goto nospc;
                /* no zero bit at or after 'bit' here: try the next entry */
                if (bit == IDA_BITMAP_BITS)
                        goto next;

                __set_bit(bit, bitmap->bitmap);
                /* a completely full bitmap loses its free mark */
                if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS))
                        xas_clear_mark(&xas, XA_FREE_MARK);
        } else {
                /* nothing stored at this index yet */
                if (bit < BITS_PER_XA_VALUE) {
                        /* low bit: a value entry is enough */
                        bitmap = xa_mk_value(1UL << bit);
                } else {
                        bitmap = alloc;
                        if (!bitmap)
                                bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT);
                        if (!bitmap)
                                goto alloc;
                        __set_bit(bit, bitmap->bitmap);
                }
                xas_store(&xas, bitmap);
        }
out:
        xas_unlock_irqrestore(&xas, flags);
        /* if xas_nomem() reports that a retry is needed, restart from min */
        if (xas_nomem(&xas, gfp)) {
                xas.xa_index = min / IDA_BITMAP_BITS;
                bit = min % IDA_BITMAP_BITS;
                goto retry;
        }
        /* free the preallocated bitmap unless 'bitmap' still refers to it */
        if (bitmap != alloc)
                kfree(alloc);
        if (xas_error(&xas))
                return xas_error(&xas);
        return xas.xa_index * IDA_BITMAP_BITS + bit;
alloc:
        /* GFP_NOWAIT failed: drop the lock, allocate with @gfp and start over */
        xas_unlock_irqrestore(&xas, flags);
        alloc = kzalloc(sizeof(*bitmap), gfp);
        if (!alloc)
                return -ENOMEM;
        xas_set(&xas, min / IDA_BITMAP_BITS);
        bit = min % IDA_BITMAP_BITS;
        goto retry;
nospc:
        /* no free ID in [min, max] */
        xas_unlock_irqrestore(&xas, flags);
        kfree(alloc);
        return -ENOSPC;
}
EXPORT_SYMBOL(ida_alloc_range);

/**
 * ida_free() - Release an allocated ID.
 * @ida: IDA handle.
 * @id: Previously allocated ID.
 *
 * An @id that is negative when viewed as an int is ignored silently.
 * Any other @id that is not currently marked as allocated leaves the
 * IDA unchanged and triggers a WARN().
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 */
void ida_free(struct ida *ida, unsigned int id)
{
        XA_STATE(xas, &ida->xa, id / IDA_BITMAP_BITS);
        unsigned bit = id % IDA_BITMAP_BITS;
        struct ida_bitmap *bitmap;
        unsigned long flags;

        if ((int)id < 0)
                return;

        xas_lock_irqsave(&xas, flags);
        bitmap = xas_load(&xas);

        if (!xa_is_value(bitmap)) {
                /* Either an empty slot or a separately allocated bitmap. */
                if (!bitmap || !test_bit(bit, bitmap->bitmap))
                        goto err;
                __clear_bit(bit, bitmap->bitmap);
                xas_set_mark(&xas, XA_FREE_MARK);
                if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) {
                        /* That was the last ID in this chunk: drop it. */
                        kfree(bitmap);
                        xas_store(&xas, NULL);
                }
        } else {
                /* The slot keeps its allocation bits inline as a value. */
                unsigned long v = xa_to_value(bitmap);
                unsigned long mask;

                if (bit >= BITS_PER_XA_VALUE)
                        goto err;
                mask = 1UL << bit;
                if (!(v & mask))
                        goto err;
                v &= ~mask;
                /* An emptied value entry is erased, not stored as zero. */
                xas_store(&xas, v ? xa_mk_value(v) : NULL);
        }
        xas_unlock_irqrestore(&xas, flags);
        return;

err:
        xas_unlock_irqrestore(&xas, flags);
        WARN(1, "ida_free called for id=%d which is not allocated.\n", id);
}
EXPORT_SYMBOL(ida_free);

/**
 * ida_destroy() - Free all IDs.
 * @ida: IDA handle.
 *
 * Walks every entry of the IDA, frees the bitmaps that were allocated
 * for it and erases each slot.  When this call returns, the IDA is empty
 * and can be reused or freed.  There is no need to call this function on
 * an IDA that is already empty.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 */
void ida_destroy(struct ida *ida)
{
        XA_STATE(xas, &ida->xa, 0);
        struct ida_bitmap *chunk;
        unsigned long flags;

        xas_lock_irqsave(&xas, flags);
        xas_for_each(&xas, chunk, ULONG_MAX) {
                /* Value entries own no memory; only real bitmaps are freed. */
                if (!xa_is_value(chunk))
                        kfree(chunk);
                xas_store(&xas, NULL);
        }
        xas_unlock_irqrestore(&xas, flags);
}
EXPORT_SYMBOL(ida_destroy);

#ifndef __KERNEL__
/* Defined outside this file; used below to print an index/shift prefix. */
extern void xa_dump_index(unsigned long index, unsigned int shift);
/* log2 of IDA_BITMAP_BITS, i.e. of the number of IDs held by one chunk. */
#define IDA_CHUNK_SHIFT                ilog2(IDA_BITMAP_BITS)

/*
 * Print one XArray slot of an IDA and, for interior nodes, recurse into
 * every child slot.  @index is the chunk index of @entry; it is scaled by
 * IDA_BITMAP_BITS to obtain the first ID the entry covers.  A NULL @entry
 * prints nothing.
 */
static void ida_dump_entry(void *entry, unsigned long index)
{
        struct ida_bitmap *bitmap;
        unsigned long slot;

        if (!entry)
                return;

        if (xa_is_node(entry)) {
                struct xa_node *parent = xa_to_node(entry);
                unsigned int span_shift = parent->shift + IDA_CHUNK_SHIFT +
                        XA_CHUNK_SHIFT;

                xa_dump_index(index * IDA_BITMAP_BITS, span_shift);
                xa_dump_node(parent);
                for (slot = 0; slot < XA_CHUNK_SIZE; slot++)
                        ida_dump_entry(parent->slots[slot],
                                        index | (slot << parent->shift));
                return;
        }

        if (xa_is_value(entry)) {
                /* Allocation bits are stored inline in the entry itself. */
                xa_dump_index(index * IDA_BITMAP_BITS, ilog2(BITS_PER_LONG));
                pr_cont("value: data %lx [%px]\n", xa_to_value(entry), entry);
                return;
        }

        /* Anything else is a pointer to a full bitmap: dump each word. */
        bitmap = entry;
        xa_dump_index(index * IDA_BITMAP_BITS, IDA_CHUNK_SHIFT);
        pr_cont("bitmap: %p data", bitmap);
        for (slot = 0; slot < IDA_BITMAP_LONGS; slot++)
                pr_cont(" %lx", bitmap->bitmap[slot]);
        pr_cont("\n");
}

/*
 * Debug helper: print a one-line summary of @ida (its address, the head
 * entry of its XArray and the flag bits above ROOT_TAG_SHIFT), then dump
 * the whole tree starting from the head entry.
 */
static void ida_dump(struct ida *ida)
{
        pr_debug("ida: %p node %p free %d\n", ida, ida->xa.xa_head,
                                ida->xa.xa_flags >> ROOT_TAG_SHIFT);
        ida_dump_entry(ida->xa.xa_head, 0);
}
#endif











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Dynamic loading of modules into the kernel.
 *
 * Rewritten by Richard Henderson <rth@tamu.edu> Dec 1996
 * Rewritten again by Rusty Russell, 2002
 */

#ifndef _LINUX_MODULE_H
#define _LINUX_MODULE_H

#include <linux/list.h>
#include <linux/stat.h>
#include <linux/buildid.h>
#include <linux/compiler.h>
#include <linux/cache.h>
#include <linux/kmod.h>
#include <linux/init.h>
#include <linux/elf.h>
#include <linux/stringify.h>
#include <linux/kobject.h>
#include <linux/moduleparam.h>
#include <linux/jump_label.h>
#include <linux/export.h>
#include <linux/rbtree_latch.h>
#include <linux/error-injection.h>
#include <linux/tracepoint-defs.h>
#include <linux/srcu.h>
#include <linux/static_call_types.h>
#include <linux/dynamic_debug.h>

#include <linux/percpu.h>
#include <asm/module.h>

#define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN

/*
 * A CRC paired with a fixed-size name buffer of MODULE_NAME_LEN bytes.
 * NOTE(review): presumably one record of a module's symbol-version table;
 * the code that produces and consumes it is not in this header.
 */
struct modversion_info {
        unsigned long crc;
        char name[MODULE_NAME_LEN];
};

struct module;
struct exception_table_entry;

/*
 * The kobject that represents a module, together with a back-pointer to
 * the module itself and the extra objects hanging off it.  This is the
 * object handed to the show/store callbacks of struct module_attribute.
 * Marked __randomize_layout, so do not rely on member order.
 */
struct module_kobject {
        struct kobject kobj;
        struct module *mod;
        struct kobject *drivers_dir;
        struct module_param_attrs *mp;
        struct completion *kobj_completion;
} __randomize_layout;

/*
 * An attribute plus the callbacks that implement it.
 *
 * @show and @store receive the attribute, the module's module_kobject and
 * a character buffer (@store additionally gets the byte count).  @setup,
 * @test and @free operate on the struct module directly.
 */
struct module_attribute {
        struct attribute attr;
        ssize_t (*show)(struct module_attribute *, struct module_kobject *,
                        char *);
        ssize_t (*store)(struct module_attribute *, struct module_kobject *,
                         const char *, size_t count);
        void (*setup)(struct module *, const char *);
        int (*test)(struct module *);
        void (*free)(struct module *);
};

/*
 * A module_attribute extended with a module name and a version string.
 * MODULE_VERSION() defines one of these (in the "__modver" section) when
 * the code is built in (MODULE undefined) and CONFIG_SYSFS is enabled.
 */
struct module_version_attribute {
        struct module_attribute mattr;
        const char *module_name;
        const char *version;
};

extern ssize_t __modver_version_show(struct module_attribute *,
                                     struct module_kobject *, char *);

extern struct module_attribute module_uevent;

/* These are either module local, or the kernel's dummy ones. */
extern int init_module(void);
extern void cleanup_module(void);

#ifndef MODULE
/**
 * module_init() - driver initialization entry point
 * @x: function to be run at kernel boot time or module insertion
 *
 * module_init() will either be called during do_initcalls() (if
 * builtin) or at module insertion time (if a module).  There can only
 * be one per module.
 */
#define module_init(x)        __initcall(x);

/**
 * module_exit() - driver exit entry point
 * @x: function to be run when driver is removed
 *
 * module_exit() will wrap the driver clean-up code
 * with cleanup_module() when used with rmmod when
 * the driver is a module.  If the driver is statically
 * compiled into the kernel, module_exit() has no effect.
 * There can only be one per module.
 */
#define module_exit(x)        __exitcall(x);

#else /* MODULE */

/*
 * In most cases loadable modules do not need custom
 * initcall levels. There are still some valid cases where
 * a driver may be needed early if built in, and does not
 * matter when built as a loadable module. Like bus
 * snooping debug drivers.
 */
#define early_initcall(fn)                module_init(fn)
#define core_initcall(fn)                module_init(fn)
#define core_initcall_sync(fn)                module_init(fn)
#define postcore_initcall(fn)                module_init(fn)
#define postcore_initcall_sync(fn)        module_init(fn)
#define arch_initcall(fn)                module_init(fn)
#define subsys_initcall(fn)                module_init(fn)
#define subsys_initcall_sync(fn)        module_init(fn)
#define fs_initcall(fn)                        module_init(fn)
#define fs_initcall_sync(fn)                module_init(fn)
#define rootfs_initcall(fn)                module_init(fn)
#define device_initcall(fn)                module_init(fn)
#define device_initcall_sync(fn)        module_init(fn)
#define late_initcall(fn)                module_init(fn)
#define late_initcall_sync(fn)                module_init(fn)

#define console_initcall(fn)                module_init(fn)

/*
 * Each module must use one module_init().
 *
 * The expansion does two things:
 *  - defines __inittest(), an otherwise unused helper that returns @initfn
 *    as an initcall_t, so the compiler checks that @initfn has a compatible
 *    type;
 *  - declares init_module() as an alias of @initfn, so both names refer to
 *    the same function.
 */
#define module_init(initfn)                                        \
        static inline initcall_t __maybe_unused __inittest(void)                \
        { return initfn; }                                        \
        int init_module(void) __copy(initfn)                        \
                __attribute__((alias(#initfn)));                \
        ___ADDRESSABLE(init_module, __initdata);

/*
 * This is only required if you want to be unloadable.
 *
 * Mirrors module_init(): __exittest() exists only so the compiler checks
 * that @exitfn is compatible with exitcall_t, and cleanup_module() is
 * declared as an alias of @exitfn.
 */
#define module_exit(exitfn)                                        \
        static inline exitcall_t __maybe_unused __exittest(void)                \
        { return exitfn; }                                        \
        void cleanup_module(void) __copy(exitfn)                \
                __attribute__((alias(#exitfn)));                \
        ___ADDRESSABLE(cleanup_module, __exitdata);

#endif

/* This means "can be init if no module support, otherwise module load
   may call it." */
#ifdef CONFIG_MODULES
#define __init_or_module
#define __initdata_or_module
#define __initconst_or_module
#define __INIT_OR_MODULE        .text
#define __INITDATA_OR_MODULE        .data
#define __INITRODATA_OR_MODULE        .section ".rodata","a",%progbits
#else
#define __init_or_module __init
#define __initdata_or_module __initdata
#define __initconst_or_module __initconst
#define __INIT_OR_MODULE __INIT
#define __INITDATA_OR_MODULE __INITDATA
#define __INITRODATA_OR_MODULE __INITRODATA
#endif /*CONFIG_MODULES*/

/* Generic info of form tag = "info" */
#define MODULE_INFO(tag, info) __MODULE_INFO(tag, tag, info)

/* For userspace: you can also call me... */
#define MODULE_ALIAS(_alias) MODULE_INFO(alias, _alias)

/* Soft module dependencies. See man modprobe.d for details.
 * Example: MODULE_SOFTDEP("pre: module-foo module-bar post: module-baz")
 */
#define MODULE_SOFTDEP(_softdep) MODULE_INFO(softdep, _softdep)

/*
 * MODULE_FILE is used for generating modules.builtin
 * So, make it no-op when this is being built as a module
 */
#ifdef MODULE
#define MODULE_FILE
#else
#define MODULE_FILE        MODULE_INFO(file, KBUILD_MODFILE);
#endif

/*
 * The following license idents are currently accepted as indicating free
 * software modules
 *
 *        "GPL"                                [GNU Public License v2]
 *        "GPL v2"                        [GNU Public License v2]
 *        "GPL and additional rights"        [GNU Public License v2 rights and more]
 *        "Dual BSD/GPL"                        [GNU Public License v2
 *                                         or BSD license choice]
 *        "Dual MIT/GPL"                        [GNU Public License v2
 *                                         or MIT license choice]
 *        "Dual MPL/GPL"                        [GNU Public License v2
 *                                         or Mozilla license choice]
 *
 * The following other idents are available
 *
 *        "Proprietary"                        [Non free products]
 *
 * Both "GPL v2" and "GPL" (the latter also in dual licensed strings) are
 * merely stating that the module is licensed under the GPL v2, but are not
 * telling whether "GPL v2 only" or "GPL v2 or later". The reason why there
 * are two variants is a historic and failed attempt to convey more
 * information in the MODULE_LICENSE string. For module loading the
 * "only/or later" distinction is completely irrelevant and does neither
 * replace the proper license identifiers in the corresponding source file
 * nor amends them in any way. The sole purpose is to make the
 * 'Proprietary' flagging work and to refuse to bind symbols which are
 * exported with EXPORT_SYMBOL_GPL when a non free module is loaded.
 *
 * In the same way "BSD" is not a clear license information. It merely
 * states, that the module is licensed under one of the compatible BSD
 * license variants. The detailed and correct license information is again
 * to be found in the corresponding source files.
 *
 * There are dual licensed components, but when running with Linux it is the
 * GPL that is relevant so this is a non issue. Similarly LGPL linked with GPL
 * is a GPL combined work.
 *
 * This exists for several reasons
 * 1.        So modinfo can show license info for users wanting to vet their setup
 *        is free
 * 2.        So the community can ignore bug reports including proprietary modules
 * 3.        So vendors can do likewise based on their own policies
 */
#define MODULE_LICENSE(_license) MODULE_FILE MODULE_INFO(license, _license)

/*
 * Author(s), use "Name <email>" or just "Name", for multiple
 * authors use multiple MODULE_AUTHOR() statements/lines.
 */
#define MODULE_AUTHOR(_author) MODULE_INFO(author, _author)

/* What your module does. */
#define MODULE_DESCRIPTION(_description) MODULE_INFO(description, _description)

#ifdef MODULE
/* Creates an alias so file2alias.c can find device table. */
#define MODULE_DEVICE_TABLE(type, name)                                        \
extern typeof(name) __mod_##type##__##name##_device_table                \
  __attribute__ ((unused, alias(__stringify(name))))
#else  /* !MODULE */
#define MODULE_DEVICE_TABLE(type, name)
#endif

/* Version of form [<epoch>:]<version>[-<extra-version>].
 * Or for CVS/RCS ID version, everything but the number is stripped.
 * <epoch>: A (small) unsigned integer which allows you to start versions
 * anew. If not mentioned, it's zero.  eg. "2:1.0" is after
 * "1:2.0".

 * <version>: The <version> may contain only alphanumerics and the
 * character `.'.  Ordered by numeric sort for numeric parts,
 * ascii sort for ascii parts (as per RPM or DEB algorithm).

 * <extraversion>: Like <version>, but inserted for local
 * customizations, eg "rh3" or "rusty1".

 * Using this automatically adds a checksum of the .c files and the
 * local headers in "srcversion".
 */

#if defined(MODULE) || !defined(CONFIG_SYSFS)
#define MODULE_VERSION(_version) MODULE_INFO(version, _version)
#else
#define MODULE_VERSION(_version)                                        \
        MODULE_INFO(version, _version);                                        \
        static struct module_version_attribute __modver_attr                \
                __used __section("__modver")                                \
                __aligned(__alignof__(struct module_version_attribute)) \
                = {                                                        \
                        .mattr        = {                                        \
                                .attr        = {                                \
                                        .name        = "version",                \
                                        .mode        = S_IRUGO,                \
                                },                                        \
                                .show        = __modver_version_show,        \
                        },                                                \
                        .module_name        = KBUILD_MODNAME,                \
                        .version        = _version,                        \
                }
#endif

/* Optional firmware file (or files) needed by the module
 * format is simply firmware file name.  Multiple firmware
 * files require multiple MODULE_FIRMWARE() specifiers */
#define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware)

#define MODULE_IMPORT_NS(ns)        MODULE_INFO(import_ns, __stringify(ns))

struct notifier_block;

#ifdef CONFIG_MODULES

extern int modules_disabled; /* for sysctl */
/* Get/put a kernel symbol (calls must be symmetric) */
void *__symbol_get(const char *symbol);
void *__symbol_get_gpl(const char *symbol);
#define symbol_get(x) ((typeof(&x))(__symbol_get(__stringify(x))))

/*
 * modules using other modules: kdb wants to see this.
 *
 * One record names a @source and a @target module and carries two list
 * nodes.  NOTE(review): presumably these link the record into the
 * source_list/target_list heads of struct module; the linking code is not
 * in this header.
 */
struct module_use {
        struct list_head source_list;
        struct list_head target_list;
        struct module *source, *target;
};

/* Lifecycle state of a module, stored in struct module's @state member. */
enum module_state {
        MODULE_STATE_LIVE,        /* Normal state. */
        MODULE_STATE_COMING,        /* Fully formed, running module_init. */
        MODULE_STATE_GOING,        /* Going away. */
        MODULE_STATE_UNFORMED,        /* Still setting it up. */
};

/*
 * A latch-tree node with a back-pointer to the module it belongs to.
 * Embedded in struct module_memory when CONFIG_MODULES_TREE_LOOKUP is set.
 */
struct mod_tree_node {
        struct module *mod;
        struct latch_tree_node node;
};

/*
 * Kinds of memory region a module owns.  The values index the mem[] array
 * of struct module, which has MOD_MEM_NUM_TYPES elements.  The three
 * MOD_INIT_* values are the ones mod_mem_type_is_init() recognises.
 */
enum mod_mem_type {
        MOD_TEXT = 0,
        MOD_DATA,
        MOD_RODATA,
        MOD_RO_AFTER_INIT,
        MOD_INIT_TEXT,
        MOD_INIT_DATA,
        MOD_INIT_RODATA,

        /* Number of real types above; not a valid type itself. */
        MOD_MEM_NUM_TYPES,
        /* Sentinel outside the valid index range. */
        MOD_INVALID = -1,
};

#define mod_mem_type_is_init(type)        \
        ((type) == MOD_INIT_TEXT ||        \
         (type) == MOD_INIT_DATA ||        \
         (type) == MOD_INIT_RODATA)

#define mod_mem_type_is_core(type) (!mod_mem_type_is_init(type))

#define mod_mem_type_is_text(type)        \
         ((type) == MOD_TEXT ||                \
          (type) == MOD_INIT_TEXT)

#define mod_mem_type_is_data(type) (!mod_mem_type_is_text(type))

#define mod_mem_type_is_core_data(type)        \
        (mod_mem_type_is_core(type) &&        \
         mod_mem_type_is_data(type))

/*
 * Iterate over every memory type from 0 up to, but not including,
 * MOD_MEM_NUM_TYPES.  The macro declares the loop variable itself, so
 * @type must be a fresh identifier that is only visible inside the loop.
 */
#define for_each_mod_mem_type(type)                        \
        for (enum mod_mem_type (type) = 0;                \
             (type) < MOD_MEM_NUM_TYPES; (type)++)

/*
 * Like for_each_mod_mem_type(), but the body only runs for types where
 * mod_mem_type_is_<class>() is true.  The expansion ends in an unbraced
 * "if", so an "else" placed right after the body would bind to it.
 */
#define for_class_mod_mem_type(type, class)                \
        for_each_mod_mem_type(type)                        \
                if (mod_mem_type_is_##class(type))

/*
 * One memory region of a module: its start address and its size.  struct
 * module holds one of these per enum mod_mem_type.
 */
struct module_memory {
        void *base;
        unsigned int size;

#ifdef CONFIG_MODULES_TREE_LOOKUP
        /* Tree node for this region; only present with tree lookup. */
        struct mod_tree_node mtn;
#endif
};

#ifdef CONFIG_MODULES_TREE_LOOKUP
/* Only touch one cacheline for common rbtree-for-core-layout case. */
#define __module_memory_align ____cacheline_aligned
#else
#define __module_memory_align
#endif

/*
 * Symbol information kept for a module: an array of ELF symbols with its
 * element count, plus two character tables.
 * NOTE(review): strtab/typetab are presumably the symbol-name string table
 * and a per-symbol type table; confirm against the kallsyms code.
 */
struct mod_kallsyms {
        Elf_Sym *symtab;
        unsigned int num_symtab;
        char *strtab;
        char *typetab;
};

#ifdef CONFIG_LIVEPATCH
/**
 * struct klp_modinfo - ELF information preserved from the livepatch module
 *
 * @hdr: ELF header
 * @sechdrs: Section header table
 * @secstrings: String table for the section headers
 * @symndx: The symbol table section index
 */
struct klp_modinfo {
        Elf_Ehdr hdr;
        Elf_Shdr *sechdrs;
        char *secstrings;
        unsigned int symndx;
};
#endif

/*
 * struct module - the kernel's descriptor for one module.
 *
 * Members are grouped by purpose, and many groups exist only when the
 * matching CONFIG_* option is enabled.  The structure is cacheline aligned
 * and marked __randomize_layout, so nothing may depend on member order or
 * offsets.
 */
struct module {
        enum module_state state;

        /* Member of list of modules */
        struct list_head list;

        /* Unique handle for this module */
        char name[MODULE_NAME_LEN];

#ifdef CONFIG_STACKTRACE_BUILD_ID
        /* Module build ID */
        unsigned char build_id[BUILD_ID_SIZE_MAX];
#endif

        /* Sysfs stuff. */
        struct module_kobject mkobj;
        struct module_attribute *modinfo_attrs;
        const char *version;
        const char *srcversion;
        struct kobject *holders_dir;

        /* Exported symbols */
        const struct kernel_symbol *syms;
        const s32 *crcs;
        unsigned int num_syms;

#ifdef CONFIG_ARCH_USES_CFI_TRAPS
        s32 *kcfi_traps;
        s32 *kcfi_traps_end;
#endif

        /* Kernel parameters. */
#ifdef CONFIG_SYSFS
        struct mutex param_lock;
#endif
        struct kernel_param *kp;
        unsigned int num_kp;

        /* GPL-only exported symbols. */
        unsigned int num_gpl_syms;
        const struct kernel_symbol *gpl_syms;
        const s32 *gpl_crcs;
        bool using_gplonly_symbols;

#ifdef CONFIG_MODULE_SIG
        /* Signature was verified. */
        bool sig_ok;
#endif

        /* Reported by module_requested_async_probing(). */
        bool async_probe_requested;

        /* Exception table */
        unsigned int num_exentries;
        struct exception_table_entry *extable;

        /* Startup function. */
        int (*init)(void);

        /* One region per enum mod_mem_type, indexed by the type value. */
        struct module_memory mem[MOD_MEM_NUM_TYPES] __module_memory_align;

        /* Arch-specific module values */
        struct mod_arch_specific arch;

        unsigned long taints;        /* same bits as kernel:taint_flags */

#ifdef CONFIG_GENERIC_BUG
        /* Support for BUG */
        unsigned num_bugs;
        struct list_head bug_list;
        struct bug_entry *bug_table;
#endif

#ifdef CONFIG_KALLSYMS
        /* Protected by RCU and/or module_mutex: use rcu_dereference() */
        struct mod_kallsyms __rcu *kallsyms;
        struct mod_kallsyms core_kallsyms;

        /* Section attributes */
        struct module_sect_attrs *sect_attrs;

        /* Notes attributes */
        struct module_notes_attrs *notes_attrs;
#endif

        /* The command line arguments (may be mangled).  People like
           keeping pointers to this stuff */
        char *args;

#ifdef CONFIG_SMP
        /* Per-cpu data. */
        void __percpu *percpu;
        unsigned int percpu_size;
#endif
        void *noinstr_text_start;
        unsigned int noinstr_text_size;

#ifdef CONFIG_TRACEPOINTS
        unsigned int num_tracepoints;
        tracepoint_ptr_t *tracepoints_ptrs;
#endif
#ifdef CONFIG_TREE_SRCU
        unsigned int num_srcu_structs;
        struct srcu_struct **srcu_struct_ptrs;
#endif
#ifdef CONFIG_BPF_EVENTS
        unsigned int num_bpf_raw_events;
        struct bpf_raw_event_map *bpf_raw_events;
#endif
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
        unsigned int btf_data_size;
        void *btf_data;
#endif
#ifdef CONFIG_JUMP_LABEL
        struct jump_entry *jump_entries;
        unsigned int num_jump_entries;
#endif
#ifdef CONFIG_TRACING
        unsigned int num_trace_bprintk_fmt;
        const char **trace_bprintk_fmt_start;
#endif
#ifdef CONFIG_EVENT_TRACING
        struct trace_event_call **trace_events;
        unsigned int num_trace_events;
        struct trace_eval_map **trace_evals;
        unsigned int num_trace_evals;
#endif
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
        unsigned int num_ftrace_callsites;
        unsigned long *ftrace_callsites;
#endif
#ifdef CONFIG_KPROBES
        void *kprobes_text_start;
        unsigned int kprobes_text_size;
        unsigned long *kprobe_blacklist;
        unsigned int num_kprobe_blacklist;
#endif
#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
        int num_static_call_sites;
        struct static_call_site *static_call_sites;
#endif
#if IS_ENABLED(CONFIG_KUNIT)
        int num_kunit_init_suites;
        struct kunit_suite **kunit_init_suites;
        int num_kunit_suites;
        struct kunit_suite **kunit_suites;
#endif


#ifdef CONFIG_LIVEPATCH
        bool klp; /* Is this a livepatch module? */
        bool klp_alive;

        /* ELF information */
        struct klp_modinfo *klp_info;
#endif

#ifdef CONFIG_PRINTK_INDEX
        unsigned int printk_index_size;
        struct pi_entry **printk_index_start;
#endif

#ifdef CONFIG_MODULE_UNLOAD
        /* What modules depend on me? */
        struct list_head source_list;
        /* What modules do I depend on? */
        struct list_head target_list;

        /* Destruction function. */
        void (*exit)(void);

        atomic_t refcnt;
#endif

#ifdef CONFIG_CONSTRUCTORS
        /* Constructor functions. */
        ctor_fn_t *ctors;
        unsigned int num_ctors;
#endif

#ifdef CONFIG_FUNCTION_ERROR_INJECTION
        struct error_injection_entry *ei_funcs;
        unsigned int num_ei_funcs;
#endif
#ifdef CONFIG_DYNAMIC_DEBUG_CORE
        struct _ddebug_info dyndbg_info;
#endif
} ____cacheline_aligned __randomize_layout;
#ifndef MODULE_ARCH_INIT
#define MODULE_ARCH_INIT {}
#endif

#ifndef HAVE_ARCH_KALLSYMS_SYMBOL_VALUE
/*
 * Default way to obtain the value of an ELF symbol: return its st_value
 * member unchanged.  Skipped when HAVE_ARCH_KALLSYMS_SYMBOL_VALUE is
 * defined.
 */
static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym)
{
        unsigned long value = sym->st_value;

        return value;
}
#endif

/*
 * A module counts as live in every state except MODULE_STATE_GOING, so
 * modules that are still coming up are reported as live too.
 *
 * FIXME: It'd be nice to isolate modules during init, too, so they
 * aren't used before they (may) fail.  But presently too much code
 * (IDE & SCSI) require entry into the module during init.
 */
static inline bool module_is_live(struct module *mod)
{
        return !(mod->state == MODULE_STATE_GOING);
}

static inline bool module_is_coming(struct module *mod)
{
        return mod->state == MODULE_STATE_COMING;
}

struct module *__module_text_address(unsigned long addr);
struct module *__module_address(unsigned long addr);
bool is_module_address(unsigned long addr);
bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
bool is_module_percpu_address(unsigned long addr);
bool is_module_text_address(unsigned long addr);

static inline bool within_module_mem_type(unsigned long addr,
                                          const struct module *mod,
                                          enum mod_mem_type type)
{
        unsigned long base, size;

        base = (unsigned long)mod->mem[type].base;
        size = mod->mem[type].size;
        return addr - base < size;
}

/* True when @addr falls inside any core (non-init) memory region of @mod. */
static inline bool within_module_core(unsigned long addr,
                                      const struct module *mod)
{
        for_each_mod_mem_type(type) {
                if (!mod_mem_type_is_core(type))
                        continue;
                if (within_module_mem_type(addr, mod, type))
                        return true;
        }
        return false;
}

/* True when @addr falls inside any init memory region of @mod. */
static inline bool within_module_init(unsigned long addr,
                                      const struct module *mod)
{
        for_each_mod_mem_type(type) {
                if (!mod_mem_type_is_init(type))
                        continue;
                if (within_module_mem_type(addr, mod, type))
                        return true;
        }
        return false;
}

static inline bool within_module(unsigned long addr, const struct module *mod)
{
        return within_module_init(addr, mod) || within_module_core(addr, mod);
}

/* Search for module by name: must be in a RCU-sched critical section. */
struct module *find_module(const char *name);

extern void __noreturn __module_put_and_kthread_exit(struct module *mod,
                        long code);
#define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code)

#ifdef CONFIG_MODULE_UNLOAD
int module_refcount(struct module *mod);
void __symbol_put(const char *symbol);
#define symbol_put(x) __symbol_put(__stringify(x))
void symbol_put_addr(void *addr);

/* Sometimes we know we already have a refcount, and it's easier not
   to handle the error case (which only happens with rmmod --wait). */
extern void __module_get(struct module *module);

/**
 * try_module_get() - take module refcount unless module is being removed
 * @module: the module we should check for
 *
 * Only try to get a module reference count if the module is not being removed.
 * This call will fail if the module is in the process of being removed.
 *
 * Care must also be taken to ensure the module exists and is alive prior to
 * usage of this call. This can be guaranteed through two means:
 *
 * 1) Direct protection: you know an earlier caller must have increased the
 *    module reference through __module_get(). This can typically be achieved
 *    by having another entity other than the module itself increment the
 *    module reference count.
 *
 * 2) Implied protection: there is an implied protection against module
 *    removal. An example of this is the implied protection used by kernfs /
 *    sysfs. The sysfs store / read file operations are guaranteed to exist
 *    through the use of kernfs's active reference (see kernfs_active()) and a
 *    sysfs / kernfs file removal cannot happen unless the same file is not
 *    active. Therefore, if a sysfs file is being read or written to the module
 *    which created it must still exist. It is therefore safe to use
 *    try_module_get() on module sysfs store / read ops.
 *
 * One of the real values to try_module_get() is the module_is_live() check
 * which ensures that the caller of try_module_get() can yield to userspace
 * module removal requests and gracefully fail if the module is on its way out.
 *
 * Returns true if the reference count was successfully incremented.
 */
extern bool try_module_get(struct module *module);

/**
 * module_put() - release a reference count to a module
 * @module: the module we should release a reference count for
 *
 * If you successfully bump a reference count to a module with try_module_get(),
 * when you are finished you must call module_put() to release that reference
 * count.
 */
extern void module_put(struct module *module);

#else /*!CONFIG_MODULE_UNLOAD*/
/*
 * !CONFIG_MODULE_UNLOAD variant: no reference is taken here. A NULL module
 * always succeeds; otherwise the result is whatever module_is_live() reports.
 */
static inline bool try_module_get(struct module *module)
{
        if (!module)
                return true;

        return module_is_live(module);
}
/* !CONFIG_MODULE_UNLOAD variant: nothing to release, so this is a no-op. */
static inline void module_put(struct module *module)
{
}
/* !CONFIG_MODULE_UNLOAD variant: no reference is taken, so this is a no-op. */
static inline void __module_get(struct module *module)
{
}
#define symbol_put(x) do { } while (0)
#define symbol_put_addr(p) do { } while (0)

#endif /* CONFIG_MODULE_UNLOAD */

/* This is a #define so the string doesn't get put in every .o file */
#define module_name(mod)                        \
({                                                \
        struct module *__mod = (mod);                \
        __mod ? __mod->name : "kernel";                \
})

/* Dereference module function descriptor */
void *dereference_module_function_descriptor(struct module *mod, void *ptr);

int register_module_notifier(struct notifier_block *nb);
int unregister_module_notifier(struct notifier_block *nb);

extern void print_modules(void);

/*
 * Report the module's async_probe_requested flag; a NULL @module yields
 * false.
 */
static inline bool module_requested_async_probing(struct module *module)
{
        if (!module)
                return false;

        return module->async_probe_requested;
}

/*
 * Report whether @mod is flagged as a livepatch module: the value of
 * mod->klp when CONFIG_LIVEPATCH is enabled, otherwise always false.
 * @mod is dereferenced without a NULL check in the CONFIG_LIVEPATCH case.
 */
static inline bool is_livepatch_module(struct module *mod)
{
        bool is_klp = false;

#ifdef CONFIG_LIVEPATCH
        is_klp = mod->klp;
#endif
        return is_klp;
}

void set_module_sig_enforced(void);

#else /* !CONFIG_MODULES... */

/* !CONFIG_MODULES stub: no address maps to a module, so always NULL. */
static inline struct module *__module_address(unsigned long addr)
{
        return NULL;
}

/* !CONFIG_MODULES stub: no address lies in module text, so always NULL. */
static inline struct module *__module_text_address(unsigned long addr)
{
        return NULL;
}

/* !CONFIG_MODULES stub: always false, whatever @addr is. */
static inline bool is_module_address(unsigned long addr)
{
        return false;
}

/* !CONFIG_MODULES stub: always false, whatever @addr is. */
static inline bool is_module_percpu_address(unsigned long addr)
{
        return false;
}

/* !CONFIG_MODULES stub: always false; @can_addr is never written. */
static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
{
        return false;
}

/* !CONFIG_MODULES stub: always false, whatever @addr is. */
static inline bool is_module_text_address(unsigned long addr)
{
        return false;
}

/* !CONFIG_MODULES stub: always false; neither argument is examined. */
static inline bool within_module_core(unsigned long addr,
                                      const struct module *mod)
{
        return false;
}

/* !CONFIG_MODULES stub: always false; neither argument is examined. */
static inline bool within_module_init(unsigned long addr,
                                      const struct module *mod)
{
        return false;
}

/* !CONFIG_MODULES stub: always false; neither argument is examined. */
static inline bool within_module(unsigned long addr, const struct module *mod)
{
        return false;
}

/* Get/put a kernel symbol (calls should be symmetric) */
#define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); })
#define symbol_put(x) do { } while (0)
#define symbol_put_addr(x) do { } while (0)

/* !CONFIG_MODULES stub: there is no reference count to bump; no-op. */
static inline void __module_get(struct module *module)
{
}

/* !CONFIG_MODULES stub: always reports success without touching @module. */
static inline bool try_module_get(struct module *module)
{
        return true;
}

/* !CONFIG_MODULES stub: there is no reference count to drop; no-op. */
static inline void module_put(struct module *module)
{
}

#define module_name(mod) "kernel"

/* !CONFIG_MODULES stub: @nb is not recorded anywhere; always returns 0. */
static inline int register_module_notifier(struct notifier_block *nb)
{
        /* no events will happen anyway, so this can always succeed */
        return 0;
}

/* !CONFIG_MODULES stub: nothing was registered, so always returns 0. */
static inline int unregister_module_notifier(struct notifier_block *nb)
{
        return 0;
}

#define module_put_and_kthread_exit(code) kthread_exit(code)

/* !CONFIG_MODULES stub: prints nothing. */
static inline void print_modules(void)
{
}

/* !CONFIG_MODULES stub: always false; @module is not examined. */
static inline bool module_requested_async_probing(struct module *module)
{
        return false;
}


/* !CONFIG_MODULES stub: no-op. */
static inline void set_module_sig_enforced(void)
{
}

/*
 * Dereference module function descriptor.
 * !CONFIG_MODULES stub: @ptr is handed back unchanged and @mod is ignored.
 */
static inline
void *dereference_module_function_descriptor(struct module *mod, void *ptr)
{
        return ptr;
}

/* !CONFIG_MODULES stub: always false; @mod is not examined. */
static inline bool module_is_coming(struct module *mod)
{
        return false;
}
#endif /* CONFIG_MODULES */

#ifdef CONFIG_SYSFS
extern struct kset *module_kset;
extern const struct kobj_type module_ktype;
#endif /* CONFIG_SYSFS */

#define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x)

/* BELOW HERE ALL THESE ARE OBSOLETE AND WILL VANISH */

#define __MODULE_STRING(x) __stringify(x)

#ifdef CONFIG_GENERIC_BUG
void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *,
                         struct module *);
void module_bug_cleanup(struct module *);

#else        /* !CONFIG_GENERIC_BUG */

/* !CONFIG_GENERIC_BUG stub: no-op; none of the arguments are used. */
static inline void module_bug_finalize(const Elf_Ehdr *hdr,
                                        const Elf_Shdr *sechdrs,
                                        struct module *mod)
{
}
/* !CONFIG_GENERIC_BUG stub: no-op. */
static inline void module_bug_cleanup(struct module *mod) {}
#endif        /* CONFIG_GENERIC_BUG */

#ifdef CONFIG_MITIGATION_RETPOLINE
extern bool retpoline_module_ok(bool has_retpoline);
#else
/*
 * !CONFIG_MITIGATION_RETPOLINE stub: every module is accepted, regardless
 * of @has_retpoline.
 */
static inline bool retpoline_module_ok(bool has_retpoline)
{
        return true;
}
#endif

#ifdef CONFIG_MODULE_SIG
bool is_module_sig_enforced(void);

/*
 * CONFIG_MODULE_SIG variant: report the module's sig_ok flag.
 * @module is dereferenced without a NULL check.
 */
static inline bool module_sig_ok(struct module *module)
{
        bool verified = module->sig_ok;

        return verified;
}
#else        /* !CONFIG_MODULE_SIG */
/* !CONFIG_MODULE_SIG stub: signature enforcement is never reported. */
static inline bool is_module_sig_enforced(void)
{
        return false;
}

/* !CONFIG_MODULE_SIG stub: always true; @module is not examined. */
static inline bool module_sig_ok(struct module *module)
{
        return true;
}
#endif        /* CONFIG_MODULE_SIG */

#if defined(CONFIG_MODULES) && defined(CONFIG_KALLSYMS)
int module_kallsyms_on_each_symbol(const char *modname,
                                   int (*fn)(void *, const char *, unsigned long),
                                   void *data);

/* For kallsyms to ask for address resolution.  namebuf should be at
 * least KSYM_NAME_LEN long: a pointer to namebuf is returned if
 * found, otherwise NULL.
 */
const char *module_address_lookup(unsigned long addr,
                                  unsigned long *symbolsize,
                                  unsigned long *offset,
                                  char **modname, const unsigned char **modbuildid,
                                  char *namebuf);
int lookup_module_symbol_name(unsigned long addr, char *symname);
int lookup_module_symbol_attrs(unsigned long addr,
                               unsigned long *size,
                               unsigned long *offset,
                               char *modname,
                               char *name);

/* Returns 0 and fills in value, defined and namebuf, or -ERANGE if
 * symnum out of range.
 */
int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                       char *name, char *module_name, int *exported);

/* Look for this name: can be of form module:name. */
unsigned long module_kallsyms_lookup_name(const char *name);

unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);

#else        /* CONFIG_MODULES && CONFIG_KALLSYMS */

/*
 * Stub used unless both CONFIG_MODULES and CONFIG_KALLSYMS are set:
 * @fn is never invoked and -EOPNOTSUPP is returned.
 */
static inline int module_kallsyms_on_each_symbol(const char *modname,
                                                 int (*fn)(void *, const char *, unsigned long),
                                                 void *data)
{
        return -EOPNOTSUPP;
}

/*
 * For kallsyms to ask for address resolution.  NULL means not found.
 * This stub always returns NULL and writes none of the output arguments.
 */
static inline const char *module_address_lookup(unsigned long addr,
                                                unsigned long *symbolsize,
                                                unsigned long *offset,
                                                char **modname,
                                                const unsigned char **modbuildid,
                                                char *namebuf)
{
        return NULL;
}

/* Stub: always fails with -ERANGE; @symname is left untouched. */
static inline int lookup_module_symbol_name(unsigned long addr, char *symname)
{
        return -ERANGE;
}

/* Stub: always fails with -ERANGE; no output argument is written. */
static inline int module_get_kallsym(unsigned int symnum, unsigned long *value,
                                     char *type, char *name,
                                     char *module_name, int *exported)
{
        return -ERANGE;
}

/* Stub: always returns 0; @name is not examined. */
static inline unsigned long module_kallsyms_lookup_name(const char *name)
{
        return 0;
}

/* Stub: always returns 0; neither argument is examined. */
static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
                                                       const char *name)
{
        return 0;
}

#endif  /* CONFIG_MODULES && CONFIG_KALLSYMS */

#endif /* _LINUX_MODULE_H */












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
/*
 * net/tipc/msg.h: Include file for TIPC message header routines
 *
 * Copyright (c) 2000-2007, 2014-2017 Ericsson AB
 * Copyright (c) 2005-2008, 2010-2011, Wind River Systems
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _TIPC_MSG_H
#define _TIPC_MSG_H

#include <linux/tipc.h>
#include "core.h"

/*
 * Constants and routines used to read and write TIPC payload message headers
 *
 * Note: Some items are also used with TIPC internal message headers
 */
#define TIPC_VERSION              2
struct plist;

/*
 * Payload message users are defined in TIPC's public API:
 * - TIPC_LOW_IMPORTANCE
 * - TIPC_MEDIUM_IMPORTANCE
 * - TIPC_HIGH_IMPORTANCE
 * - TIPC_CRITICAL_IMPORTANCE
 */
#define TIPC_SYSTEM_IMPORTANCE        4


/*
 * Payload message types
 */
#define TIPC_CONN_MSG           0
#define TIPC_MCAST_MSG          1
#define TIPC_NAMED_MSG          2
#define TIPC_DIRECT_MSG         3
#define TIPC_GRP_MEMBER_EVT     4
#define TIPC_GRP_BCAST_MSG      5
#define TIPC_GRP_MCAST_MSG      6
#define TIPC_GRP_UCAST_MSG      7

/*
 * Internal message users
 *
 * Note: MSG_CRYPTO and the SOCK_WAKEUP pseudo user are both defined as 14.
 */
#define  BCAST_PROTOCOL       5
#define  MSG_BUNDLER          6
#define  LINK_PROTOCOL        7
#define  CONN_MANAGER         8
#define  GROUP_PROTOCOL       9
#define  TUNNEL_PROTOCOL      10
#define  NAME_DISTRIBUTOR     11
#define  MSG_FRAGMENTER       12
#define  LINK_CONFIG          13
#define  MSG_CRYPTO           14
#define  SOCK_WAKEUP          14       /* pseudo user */
#define  TOP_SRV              15       /* pseudo user */

/*
 * Message header sizes
 */
#define SHORT_H_SIZE              24        /* In-cluster basic payload message */
#define BASIC_H_SIZE              32        /* Basic payload message */
#define NAMED_H_SIZE              40        /* Named payload message */
#define MCAST_H_SIZE              44        /* Multicast payload message */
#define GROUP_H_SIZE              44        /* Group payload message */
#define INT_H_SIZE                40        /* Internal messages */
#define MIN_H_SIZE                24        /* Smallest legal TIPC header size */
#define MAX_H_SIZE                60        /* Largest possible TIPC header size */

#define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE)
#define TIPC_MEDIA_INFO_OFFSET        5

extern const int one_page_mtu;

/*
 * struct tipc_skb_cb - TIPC view of the sk_buff control block
 *
 * Accessed through TIPC_SKB_CB(), which casts the address of skb->cb[0] to
 * this type. The structure and its first union are __packed.
 *
 * The first union overlays the anonymous bookkeeping struct with
 * tx_clone_ctx (only present with CONFIG_TIPC_CRYPTO). The second union
 * overlays the individual flag bitfields with the single byte 'flags', so
 * all flags can be read or written at once.
 */
struct tipc_skb_cb {
        union {
                struct {
                        struct sk_buff *tail;
                        unsigned long nxt_retr;
                        unsigned long retr_stamp;
                        u32 bytes_read;
                        u32 orig_member;
                        u16 chain_imp;
                        u16 ackers;
                        u16 retr_cnt;
                } __packed;
#ifdef CONFIG_TIPC_CRYPTO
                struct {
                        struct tipc_crypto *rx;
                        struct tipc_aead *last;
                        u8 recurs;
                } tx_clone_ctx __packed;
#endif
        } __packed;
        union {
                struct {
                        u8 validated:1;
#ifdef CONFIG_TIPC_CRYPTO
                        u8 encrypted:1;
                        u8 decrypted:1;
/* SKB_PROBING/SKB_GRACING are declared next to xmit_type; presumably its values - TODO confirm */
#define SKB_PROBING        1
#define SKB_GRACING        2
                        u8 xmit_type:2;
                        u8 tx_clone_deferred:1;
#endif
                };
                u8 flags;       /* byte-wide alias of the bitfields above */
        };
        u8 reserved;
#ifdef CONFIG_TIPC_CRYPTO
        void *crypto_ctx;
#endif
} __packed;

#define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0]))

/*
 * struct tipc_msg - TIPC message header as an array of big-endian words
 *
 * 15 words of 32 bits give 60 bytes, which equals MAX_H_SIZE. The accessor
 * helpers below index this array by word number and convert with
 * ntohl()/htonl().
 */
struct tipc_msg {
        __be32 hdr[15];
};

/* struct tipc_gap_ack - TIPC Gap ACK block
 * @ack: seqno of the last consecutive packet in link deferdq
 * @gap: number of gap packets since the last ack
 *
 * E.g:
 *       link deferdq: 1 2 3 4      10 11      13 14 15       20
 * --> Gap ACK blocks:      <4, 5>,   <11, 1>,      <15, 4>, <20, 0>
 */
struct tipc_gap_ack {
        __be16 ack;
        __be16 gap;
};

/* struct tipc_gap_ack_blks
 * @len: actual length of the record
 * @ugack_cnt: number of Gap ACK blocks for unicast (following the broadcast
 *             ones)
 * @start_index: starting index for "valid" broadcast Gap ACK blocks
 * @bgack_cnt: number of Gap ACK blocks for broadcast in the record
 * @gacks: array of Gap ACK blocks
 *
 *  31                       16 15                        0
 * +-------------+-------------+-------------+-------------+
 * |  bgack_cnt  |  ugack_cnt  |            len            |
 * +-------------+-------------+-------------+-------------+  -
 * |            gap            |            ack            |   |
 * +-------------+-------------+-------------+-------------+    > bc gacks
 * :                           :                           :   |
 * +-------------+-------------+-------------+-------------+  -
 * |            gap            |            ack            |   |
 * +-------------+-------------+-------------+-------------+    > uc gacks
 * :                           :                           :   |
 * +-------------+-------------+-------------+-------------+  -
 */
struct tipc_gap_ack_blks {
        __be16 len;
        union {
                u8 ugack_cnt;
                u8 start_index;
        };
        u8 bgack_cnt;
        struct tipc_gap_ack gacks[];
};

#define MAX_GAP_ACK_BLKS        128
#define MAX_GAP_ACK_BLKS_SZ        (sizeof(struct tipc_gap_ack_blks) + \
                                 sizeof(struct tipc_gap_ack) * MAX_GAP_ACK_BLKS)

/* View the start of the buffer's data area (skb->data) as a TIPC header. */
static inline struct tipc_msg *buf_msg(struct sk_buff *skb)
{
        void *hdr = skb->data;

        return hdr;
}

/* Fetch header word @pos and convert it from network to host byte order. */
static inline u32 msg_word(struct tipc_msg *m, u32 pos)
{
        __be32 raw = m->hdr[pos];

        return ntohl(raw);
}

/* Store host-order @val into header word @w in network byte order. */
static inline void msg_set_word(struct tipc_msg *m, u32 w, u32 val)
{
        __be32 raw = htonl(val);

        m->hdr[w] = raw;
}

/*
 * Extract a bit field from header word @w: the host-order word is shifted
 * right by @pos and then ANDed with @mask (which is given unshifted).
 */
static inline u32 msg_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask)
{
        u32 word = msg_word(m, w);

        word >>= pos;
        return word & mask;
}

/*
 * Overwrite a bit field in header word @w. @mask is given unshifted; @val is
 * truncated to @mask, both are shifted left by @pos, the old field bits are
 * cleared and the new ones ORed in. All other bits of the word are kept.
 */
static inline void msg_set_bits(struct tipc_msg *m, u32 w,
                                u32 pos, u32 mask, u32 val)
{
        u32 field_val = (val & mask) << pos;
        u32 field_mask = mask << pos;
        __be32 *slot = &m->hdr[w];

        *slot = (*slot & ~htonl(field_mask)) | htonl(field_val);
}

/*
 * Word 0
 */
/* Read the 3-bit version field held in bits 31..29 of word 0. */
static inline u32 msg_version(struct tipc_msg *m)
{
        return (msg_word(m, 0) >> 29) & 7;
}

static inline void msg_set_version(struct tipc_msg *m)
{
        msg_set_bits(m, 0, 29, 7, TIPC_VERSION);
}

/* Read the 4-bit message user field held in bits 28..25 of word 0. */
static inline u32 msg_user(struct tipc_msg *m)
{
        return (msg_word(m, 0) >> 25) & 0xf;
}

/* Non-zero when the message user is at most TIPC_CRITICAL_IMPORTANCE. */
static inline u32 msg_isdata(struct tipc_msg *m)
{
        u32 usr = msg_user(m);

        return !(usr > TIPC_CRITICAL_IMPORTANCE);
}

/* Store @n (truncated to 4 bits) in the user field, bits 28..25 of word 0. */
static inline void msg_set_user(struct tipc_msg *m, u32 n)
{
        const u32 shift = 25;
        const u32 mask = 0xf;

        msg_set_bits(m, 0, shift, mask, n);
}

/*
 * Header size in bytes: bits 24..21 of word 0 hold the size as a count of
 * 4-byte words, so the field value is multiplied by four.
 */
static inline u32 msg_hdr_sz(struct tipc_msg *m)
{
        u32 words = msg_bits(m, 0, 21, 0xf);

        return words * 4;
}

/*
 * Store header size @n (bytes) as a count of 4-byte words in bits 24..21 of
 * word 0; the two low bits of @n are discarded.
 */
static inline void msg_set_hdr_sz(struct tipc_msg *m, u32 n)
{
        u32 words = n / 4;

        msg_set_bits(m, 0, 21, 0xf, words);
}

/* Read the 17-bit message size field held in bits 16..0 of word 0. */
static inline u32 msg_size(struct tipc_msg *m)
{
        return msg_word(m, 0) & 0x1ffff;
}

/* Message size divided by 1024 (rounded down), plus one. */
static inline u32 msg_blocks(struct tipc_msg *m)
{
        u32 whole_kib = msg_size(m) >> 10;

        return 1 + whole_kib;
}

/*
 * Payload length: message size minus header size. The subtraction is done
 * in u32 with no check that the size field is at least the header size.
 */
static inline u32 msg_data_sz(struct tipc_msg *m)
{
        u32 total = msg_size(m);
        u32 hdr_len = msg_hdr_sz(m);

        return total - hdr_len;
}

/* Read the non-sequenced flag, bit 20 of word 0. */
static inline int msg_non_seq(struct tipc_msg *m)
{
        return (msg_word(m, 0) >> 20) & 1;
}

/* Set bit 20 of word 0 to the low bit of @n. */
static inline void msg_set_non_seq(struct tipc_msg *m, u32 n)
{
        const u32 bit = 20;

        msg_set_bits(m, 0, bit, 1, n);
}

/* Read the SYN flag, bit 17 of word 0. */
static inline int msg_is_syn(struct tipc_msg *m)
{
        return (msg_word(m, 0) >> 17) & 1;
}

/* Set bit 17 of word 0 to the low bit of @d. */
static inline void msg_set_syn(struct tipc_msg *m, u32 d)
{
        const u32 bit = 17;

        msg_set_bits(m, 0, bit, 1, d);
}

/* Read bit 19 of word 0 (the same bit msg_is_keepalive() reads). */
static inline int msg_dest_droppable(struct tipc_msg *m)
{
        return (msg_word(m, 0) >> 19) & 1;
}

/* Set bit 19 of word 0 to the low bit of @d. */
static inline void msg_set_dest_droppable(struct tipc_msg *m, u32 d)
{
        const u32 bit = 19;

        msg_set_bits(m, 0, bit, 1, d);
}

/* Read bit 19 of word 0 (the same bit msg_dest_droppable() reads). */
static inline int msg_is_keepalive(struct tipc_msg *m)
{
        return (msg_word(m, 0) >> 19) & 1;
}

/* Set bit 19 of word 0 to the low bit of @d. */
static inline void msg_set_is_keepalive(struct tipc_msg *m, u32 d)
{
        const u32 bit = 19;

        msg_set_bits(m, 0, bit, 1, d);
}

/* Read bit 18 of word 0, interpreted here as "source droppable". */
static inline int msg_src_droppable(struct tipc_msg *m)
{
        return (msg_word(m, 0) >> 18) & 1;
}

/* Set bit 18 of word 0 to the low bit of @d. */
static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d)
{
        const u32 bit = 18;

        msg_set_bits(m, 0, bit, 1, d);
}

/* Read bit 18 of word 0, interpreted here as "ack required". */
static inline int msg_ack_required(struct tipc_msg *m)
{
        return (msg_word(m, 0) >> 18) & 1;
}

static inline void msg_set_ack_required(struct tipc_msg *m)
{
        msg_set_bits(m, 0, 18, 1, 1);
}

/* Read bit 18 of word 0, interpreted here as "nagle ack". */
static inline int msg_nagle_ack(struct tipc_msg *m)
{
        return (msg_word(m, 0) >> 18) & 1;
}

static inline void msg_set_nagle_ack(struct tipc_msg *m)
{
        msg_set_bits(m, 0, 18, 1, 1);
}

/* True when bit 18 of word 0 is set, interpreted here as "replicast". */
static inline bool msg_is_rcast(struct tipc_msg *m)
{
        return ((msg_word(m, 0) >> 18) & 0x1) != 0;
}

/* Set bit 18 of word 0 to 1 when @d is true, 0 otherwise. */
static inline void msg_set_is_rcast(struct tipc_msg *m, bool d)
{
        u32 flag = d ? 1 : 0;

        msg_set_bits(m, 0, 18, 0x1, flag);
}

/*
 * Store the total message size in the 17-bit field at bits 16..0 of word 0.
 *
 * The value goes through msg_set_bits(), which truncates @sz to the field
 * mask, so an oversized @sz cannot spill into the flag bits that sit above
 * the size field (bit 17 upward). For any @sz that fits in 17 bits the
 * resulting header word is the same as writing the size directly.
 */
static inline void msg_set_size(struct tipc_msg *m, u32 sz)
{
        msg_set_bits(m, 0, 0, 0x1ffff, sz);
}

static inline unchar *msg_data(struct tipc_msg *m)
{
        return ((unchar *)m) + msg_hdr_sz(m);
}

/* Treat the data area of @m as another TIPC header and return it. */
static inline struct tipc_msg *msg_inner_hdr(struct tipc_msg *m)
{
        void *payload = msg_data(m);

        return payload;
}

/*
 * Word 1
 */
/* Read the 3-bit message type field held in bits 31..29 of word 1. */
static inline u32 msg_type(struct tipc_msg *m)
{
        return (msg_word(m, 1) >> 29) & 0x7;
}

/* Store @n (truncated to 3 bits) in the type field, bits 31..29 of word 1. */
static inline void msg_set_type(struct tipc_msg *m, u32 n)
{
        const u32 shift = 29;
        const u32 mask = 0x7;

        msg_set_bits(m, 1, shift, mask, n);
}

/*
 * Non-zero when the message type lies in the inclusive range
 * TIPC_GRP_MEMBER_EVT .. TIPC_GRP_UCAST_MSG.
 */
static inline int msg_in_group(struct tipc_msg *m)
{
        int mtyp = msg_type(m);

        if (mtyp < TIPC_GRP_MEMBER_EVT)
                return 0;

        return mtyp <= TIPC_GRP_UCAST_MSG;
}

static inline bool msg_is_grp_evt(struct tipc_msg *m)
{
        return msg_type(m) == TIPC_GRP_MEMBER_EVT;
}

/* 1 when the message type is TIPC_NAMED_MSG, else 0. */
static inline u32 msg_named(struct tipc_msg *m)
{
        u32 mtyp = msg_type(m);

        return mtyp == TIPC_NAMED_MSG;
}

/*
 * 1 when the message type is one of TIPC_MCAST_MSG, TIPC_GRP_BCAST_MSG or
 * TIPC_GRP_MCAST_MSG, else 0.
 */
static inline u32 msg_mcast(struct tipc_msg *m)
{
        switch (msg_type(m)) {
        case TIPC_MCAST_MSG:
        case TIPC_GRP_BCAST_MSG:
        case TIPC_GRP_MCAST_MSG:
                return 1;
        default:
                return 0;
        }
}

/* 1 when the message type is TIPC_CONN_MSG, else 0. */
static inline u32 msg_connected(struct tipc_msg *m)
{
        u32 mtyp = msg_type(m);

        return mtyp == TIPC_CONN_MSG;
}

/* 1 when the message type is TIPC_DIRECT_MSG, else 0. */
static inline u32 msg_direct(struct tipc_msg *m)
{
        u32 mtyp = msg_type(m);

        return mtyp == TIPC_DIRECT_MSG;
}

/* Read the 4-bit error code field held in bits 28..25 of word 1. */
static inline u32 msg_errcode(struct tipc_msg *m)
{
        return (msg_word(m, 1) >> 25) & 0xf;
}

/* Store @err (truncated to 4 bits) in bits 28..25 of word 1. */
static inline void msg_set_errcode(struct tipc_msg *m, u32 err)
{
        const u32 shift = 25;
        const u32 mask = 0xf;

        msg_set_bits(m, 1, shift, mask, err);
}

static inline void msg_set_bulk(struct tipc_msg *m)
{
        msg_set_bits(m, 1, 28, 0x1, 1);
}

/* Read the "bulk" flag, bit 28 of word 1. */
static inline u32 msg_is_bulk(struct tipc_msg *m)
{
        return (msg_word(m, 1) >> 28) & 0x1;
}

static inline void msg_set_last_bulk(struct tipc_msg *m)
{
        msg_set_bits(m, 1, 27, 0x1, 1);
}

/* Read the "last bulk" flag, bit 27 of word 1. */
static inline u32 msg_is_last_bulk(struct tipc_msg *m)
{
        return (msg_word(m, 1) >> 27) & 0x1;
}

static inline void msg_set_non_legacy(struct tipc_msg *m)
{
        msg_set_bits(m, 1, 26, 0x1, 1);
}

/* 1 when the "non-legacy" flag (bit 26 of word 1) is clear, else 0. */
static inline u32 msg_is_legacy(struct tipc_msg *m)
{
        u32 non_legacy = (msg_word(m, 1) >> 26) & 0x1;

        return non_legacy == 0;
}

/* Read the 4-bit reroute counter held in bits 24..21 of word 1. */
static inline u32 msg_reroute_cnt(struct tipc_msg *m)
{
        return (msg_word(m, 1) >> 21) & 0xf;
}

static inline void msg_incr_reroute_cnt(struct tipc_msg *m)
{
        msg_set_bits(m, 1, 21, 0xf, msg_reroute_cnt(m) + 1);
}

/* Read the 2-bit lookup scope field held in bits 20..19 of word 1. */
static inline u32 msg_lookup_scope(struct tipc_msg *m)
{
        return (msg_word(m, 1) >> 19) & 0x3;
}

/* Store @n (truncated to 2 bits) in bits 20..19 of word 1. */
static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n)
{
        const u32 shift = 19;
        const u32 mask = 0x3;

        msg_set_bits(m, 1, shift, mask, n);
}

/* Read the 16-bit broadcast ack number held in bits 15..0 of word 1. */
static inline u16 msg_bcast_ack(struct tipc_msg *m)
{
        return msg_word(m, 1) & 0xffff;
}

/* Store @n in the low 16 bits of word 1. */
static inline void msg_set_bcast_ack(struct tipc_msg *m, u16 n)
{
        const u32 mask = 0xffff;

        msg_set_bits(m, 1, 0, mask, n);
}

/* Note: reusing bits in word 1 for ACTIVATE_MSG only, to re-synch
 * link peer session number
 */
/* True when bit 16 of word 1 is set. */
static inline bool msg_dest_session_valid(struct tipc_msg *m)
{
        return ((msg_word(m, 1) >> 16) & 0x1) != 0;
}

/* Set bit 16 of word 1 to 1 when @valid is true, 0 otherwise. */
static inline void msg_set_dest_session_valid(struct tipc_msg *m, bool valid)
{
        u32 flag = valid ? 1 : 0;

        msg_set_bits(m, 1, 16, 0x1, flag);
}

/* Read the low 16 bits of word 1 as the destination session number. */
static inline u16 msg_dest_session(struct tipc_msg *m)
{
        return msg_word(m, 1) & 0xffff;
}

/* Store @n in the low 16 bits of word 1. */
static inline void msg_set_dest_session(struct tipc_msg *m, u16 n)
{
        const u32 mask = 0xffff;

        msg_set_bits(m, 1, 0, mask, n);
}

/*
 * Word 2
 */
/* Read the 16-bit ack number held in bits 31..16 of word 2. */
static inline u16 msg_ack(struct tipc_msg *m)
{
        return (msg_word(m, 2) >> 16) & 0xffff;
}

/* Store @n in the high 16 bits of word 2. */
static inline void msg_set_ack(struct tipc_msg *m, u16 n)
{
        const u32 shift = 16;
        const u32 mask = 0xffff;

        msg_set_bits(m, 2, shift, mask, n);
}

/* Read the 16-bit sequence number held in bits 15..0 of word 2. */
static inline u16 msg_seqno(struct tipc_msg *m)
{
        return msg_word(m, 2) & 0xffff;
}

/* Store @n in the low 16 bits of word 2. */
static inline void msg_set_seqno(struct tipc_msg *m, u16 n)
{
        const u32 mask = 0xffff;

        msg_set_bits(m, 2, 0, mask, n);
}

/*
 * Words 3-10
 */
/*
 * Importance of a message:
 *  - the user field itself, when it is at most TIPC_CRITICAL_IMPORTANCE and
 *    the error code is zero;
 *  - bits 2..0 of word 9 for MSG_FRAGMENTER and MSG_BUNDLER messages;
 *  - TIPC_SYSTEM_IMPORTANCE for everything else.
 */
static inline u32 msg_importance(struct tipc_msg *m)
{
        int usr = msg_user(m);

        if (likely(usr <= TIPC_CRITICAL_IMPORTANCE && !msg_errcode(m)))
                return usr;

        switch (usr) {
        case MSG_FRAGMENTER:
        case MSG_BUNDLER:
                return msg_bits(m, 9, 0, 0x7);
        default:
                return TIPC_SYSTEM_IMPORTANCE;
        }
}

/*
 * Record importance @i. For MSG_FRAGMENTER and MSG_BUNDLER messages it goes
 * into bits 2..0 of word 9. For any other message it is written to the user
 * field, but only when @i is below TIPC_SYSTEM_IMPORTANCE; otherwise a
 * warning is logged and the header is left unchanged.
 */
static inline void msg_set_importance(struct tipc_msg *m, u32 i)
{
        int usr = msg_user(m);

        if (likely(usr == MSG_FRAGMENTER || usr == MSG_BUNDLER)) {
                msg_set_bits(m, 9, 0, 0x7, i);
                return;
        }

        if (i >= TIPC_SYSTEM_IMPORTANCE) {
                pr_warn("Trying to set illegal importance in message\n");
                return;
        }

        msg_set_user(m, i);
}

static inline u32 msg_prevnode(struct tipc_msg *m)
{
        return msg_word(m, 3);
}

/* Store @a into word 3 (previous node) in network byte order. */
static inline void msg_set_prevnode(struct tipc_msg *m, u32 a)
{
        m->hdr[3] = htonl(a);
}

/*
 * Read word 4 (originating port). For MSG_FRAGMENTER messages the value is
 * taken from the inner header carried in the data area instead.
 */
static inline u32 msg_origport(struct tipc_msg *m)
{
        struct tipc_msg *hdr = m;

        if (msg_user(m) == MSG_FRAGMENTER)
                hdr = msg_inner_hdr(m);

        return msg_word(hdr, 4);
}

/* Store @p into word 4 (originating port) in network byte order. */
static inline void msg_set_origport(struct tipc_msg *m, u32 p)
{
        m->hdr[4] = htonl(p);
}

/* Read the low 16 bits of word 4 as a sequence number. */
static inline u16 msg_named_seqno(struct tipc_msg *m)
{
        return msg_word(m, 4) & 0xffff;
}

/* Store @n in the low 16 bits of word 4; the high 16 bits are preserved. */
static inline void msg_set_named_seqno(struct tipc_msg *m, u16 n)
{
        const u32 mask = 0xffff;

        msg_set_bits(m, 4, 0, mask, n);
}

static inline u32 msg_destport(struct tipc_msg *m)
{
        return msg_word(m, 5);
}

/* Store @p into word 5 (destination port) in network byte order. */
static inline void msg_set_destport(struct tipc_msg *m, u32 p)
{
        m->hdr[5] = htonl(p);
}

static inline u32 msg_mc_netid(struct tipc_msg *m)
{
        return msg_word(m, 5);
}

/* Store @p into word 5 (the same word msg_set_destport() writes). */
static inline void msg_set_mc_netid(struct tipc_msg *m, u32 p)
{
        m->hdr[5] = htonl(p);
}

static inline int msg_short(struct tipc_msg *m)
{
        return msg_hdr_sz(m) == SHORT_H_SIZE;
}

/*
 * Originating node: for short headers (the expected case) this is the
 * previous-node word; otherwise it is read from word 6.
 */
static inline u32 msg_orignode(struct tipc_msg *m)
{
        return likely(msg_short(m)) ? msg_prevnode(m) : msg_word(m, 6);
}

/* Store @a into word 6 (originating node) in network byte order. */
static inline void msg_set_orignode(struct tipc_msg *m, u32 a)
{
        m->hdr[6] = htonl(a);
}

static inline u32 msg_destnode(struct tipc_msg *m)
{
        return msg_word(m, 7);
}

/* Store @a into word 7 (destination node) in network byte order. */
static inline void msg_set_destnode(struct tipc_msg *m, u32 a)
{
        m->hdr[7] = htonl(a);
}

static inline u32 msg_nametype(struct tipc_msg *m)
{
        return msg_word(m, 8);
}

/* Store @n into word 8 (name type) in network byte order. */
static inline void msg_set_nametype(struct tipc_msg *m, u32 n)
{
        m->hdr[8] = htonl(n);
}

static inline u32 msg_nameinst(struct tipc_msg *m)
{
        return msg_word(m, 9);
}

/* Lower bound of a name range: word 9, the same word msg_nameinst() reads. */
static inline u32 msg_namelower(struct tipc_msg *m)
{
        return msg_word(m, 9);
}

/* Store @n into word 9 (lower bound of a name range). */
static inline void msg_set_namelower(struct tipc_msg *m, u32 n)
{
        m->hdr[9] = htonl(n);
}

/* Store @n into word 9, the same word msg_set_namelower() writes. */
static inline void msg_set_nameinst(struct tipc_msg *m, u32 n)
{
        msg_set_word(m, 9, n);
}

static inline u32 msg_nameupper(struct tipc_msg *m)
{
        return msg_word(m, 10);
}

/* Store @n into word 10 (upper bound of a name range). */
static inline void msg_set_nameupper(struct tipc_msg *m, u32 n)
{
        m->hdr[10] = htonl(n);
}

/*
 * Constants and routines used to read and write TIPC internal message headers
 *
 * The message type constants below are grouped per protocol. Numbering
 * restarts at 0 in every group, so a type value is only unambiguous within
 * its own group.
 */

/*
 *  Connection management protocol message types
 */
#define CONN_PROBE        0
#define CONN_PROBE_REPLY  1
#define CONN_ACK          2

/*
 * Name distributor message types
 */
#define PUBLICATION       0
#define WITHDRAWAL        1

/*
 * Segmentation message types
 */
#define FIRST_FRAGMENT                0
#define FRAGMENT                1
#define LAST_FRAGMENT                2

/*
 * Link management protocol message types
 */
#define STATE_MSG                0
#define RESET_MSG                1
#define ACTIVATE_MSG                2

/*
 * Changeover tunnel message types
 */
#define SYNCH_MSG                0
#define FAILOVER_MSG                1

/*
 * Config protocol message types
 */
#define DSC_REQ_MSG                0
#define DSC_RESP_MSG                1
#define DSC_TRIAL_MSG                2
#define DSC_TRIAL_FAIL_MSG        3

/*
 * Group protocol message types
 */
#define GRP_JOIN_MSG         0
#define GRP_LEAVE_MSG        1
#define GRP_ADV_MSG          2
#define GRP_ACK_MSG          3
#define GRP_RECLAIM_MSG      4
#define GRP_REMIT_MSG        5

/* Crypto message types (only one is defined here) */
#define KEY_DISTR_MSG                0

/*
 * Word 1
 */

/* Read the seq_gap field: header word 1, position 16, mask 0x1fff. */
static inline u32 msg_seq_gap(struct tipc_msg *m)
{
        u32 gap = msg_bits(m, 1, 16, 0x1fff);

        return gap;
}

/* Write @n to the seq_gap field: header word 1, position 16, mask 0x1fff. */
static inline void msg_set_seq_gap(struct tipc_msg *m, u32 n)
{
        const u32 word = 1, pos = 16, mask = 0x1fff;

        msg_set_bits(m, word, pos, mask, n);
}

/* Read the node signature: header word 1, position 0, mask 0xffff. */
static inline u32 msg_node_sig(struct tipc_msg *m)
{
        u32 sig = msg_bits(m, 1, 0, 0xffff);

        return sig;
}

/* Write node signature @n: header word 1, position 0, mask 0xffff. */
static inline void msg_set_node_sig(struct tipc_msg *m, u32 n)
{
        const u32 word = 1, pos = 0, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/*
 * Read the node capabilities: header word 1, position 15, mask 0x1fff.
 * NOTE(review): position 15 with this mask appears to overlap the ranges
 * used by msg_node_sig() and msg_seq_gap() in the same word; presumably
 * those fields never coexist in one message type - confirm.
 */
static inline u32 msg_node_capabilities(struct tipc_msg *m)
{
        u32 caps = msg_bits(m, 1, 15, 0x1fff);

        return caps;
}

/* Write node capabilities @n: header word 1, position 15, mask 0x1fff. */
static inline void msg_set_node_capabilities(struct tipc_msg *m, u32 n)
{
        const u32 word = 1, pos = 15, mask = 0x1fff;

        msg_set_bits(m, word, pos, mask, n);
}

/*
 * Word 2
 */

/* Read the destination domain, which occupies all of header word 2. */
static inline u32 msg_dest_domain(struct tipc_msg *m)
{
        u32 domain = msg_word(m, 2);

        return domain;
}

/* Write destination domain @n into header word 2. */
static inline void msg_set_dest_domain(struct tipc_msg *m, u32 n)
{
        const u32 word = 2;

        msg_set_word(m, word, n);
}

/* Write @n to the bcgap_after field: header word 2, position 16, mask 0xffff. */
static inline void msg_set_bcgap_after(struct tipc_msg *m, u32 n)
{
        const u32 word = 2, pos = 16, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/* Read the bcgap_to field: header word 2, position 0, mask 0xffff. */
static inline u32 msg_bcgap_to(struct tipc_msg *m)
{
        u32 gap_to = msg_bits(m, 2, 0, 0xffff);

        return gap_to;
}

/* Write @n to the bcgap_to field: header word 2, position 0, mask 0xffff. */
static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n)
{
        const u32 word = 2, pos = 0, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/*
 * Word 4
 */

/* Read the last_bcast field: header word 4, position 16, mask 0xffff. */
static inline u32 msg_last_bcast(struct tipc_msg *m)
{
        u32 last = msg_bits(m, 4, 16, 0xffff);

        return last;
}

/* Return msg_last_bcast() plus one; the u32 sum is not masked to 16 bits. */
static inline u32 msg_bc_snd_nxt(struct tipc_msg *m)
{
        u32 last = msg_last_bcast(m);

        return last + 1;
}

/* Write @n to the last_bcast field: header word 4, position 16, mask 0xffff. */
static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n)
{
        const u32 word = 4, pos = 16, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/* Read the fragment count: header word 4, position 0, mask 0xffff. */
static inline u32 msg_nof_fragms(struct tipc_msg *m)
{
        u32 count = msg_bits(m, 4, 0, 0xffff);

        return count;
}

/* Write fragment count @n: header word 4, position 0, mask 0xffff. */
static inline void msg_set_nof_fragms(struct tipc_msg *m, u32 n)
{
        const u32 word = 4, pos = 0, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/* Read the fragment number: header word 4, position 16, mask 0xffff. */
static inline u32 msg_fragm_no(struct tipc_msg *m)
{
        u32 fragno = msg_bits(m, 4, 16, 0xffff);

        return fragno;
}

/* Write fragment number @n: header word 4, position 16, mask 0xffff. */
static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n)
{
        const u32 word = 4, pos = 16, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/* Read the next_sent field: header word 4, position 0, mask 0xffff. */
static inline u16 msg_next_sent(struct tipc_msg *m)
{
        u16 next = msg_bits(m, 4, 0, 0xffff);

        return next;
}

/* Write @n to the next_sent field: header word 4, position 0, mask 0xffff. */
static inline void msg_set_next_sent(struct tipc_msg *m, u16 n)
{
        const u32 word = 4, pos = 0, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/* Read the bc_netid value, which occupies all of header word 4. */
static inline u32 msg_bc_netid(struct tipc_msg *m)
{
        u32 netid = msg_word(m, 4);

        return netid;
}

/* Write bc_netid value @id into header word 4. */
static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id)
{
        const u32 word = 4;

        msg_set_word(m, word, id);
}

/*
 * Return the link selector: header word 4, position 0, mask 1.
 * For a MSG_FRAGMENTER message the bit is read from the header found at
 * msg_data(m) rather than from @m itself.
 */
static inline u32 msg_link_selector(struct tipc_msg *m)
{
        struct tipc_msg *hdr = m;

        if (msg_user(m) == MSG_FRAGMENTER)
                hdr = (void *)msg_data(m);

        return msg_bits(hdr, 4, 0, 1);
}

/*
 * Word 5
 */

/* Read the session field: header word 5, position 16, mask 0xffff. */
static inline u16 msg_session(struct tipc_msg *m)
{
        u16 session = msg_bits(m, 5, 16, 0xffff);

        return session;
}

/* Write session @n: header word 5, position 16, mask 0xffff. */
static inline void msg_set_session(struct tipc_msg *m, u16 n)
{
        const u32 word = 5, pos = 16, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/* Read the probe flag: header word 5, position 0, mask 1. */
static inline u32 msg_probe(struct tipc_msg *m)
{
        u32 probe = msg_bits(m, 5, 0, 1);

        return probe;
}

/* Write probe flag @val: header word 5, position 0, mask 1. */
static inline void msg_set_probe(struct tipc_msg *m, u32 val)
{
        const u32 word = 5, pos = 0, mask = 1;

        msg_set_bits(m, word, pos, mask, val);
}

static inline char msg_net_plane(struct tipc_msg *m)
{
        return msg_bits(m, 5, 1, 7) + 'A';
}

/*
 * Store network plane letter @n as its offset from 'A' at header word 5,
 * position 1, mask 7.
 */
static inline void msg_set_net_plane(struct tipc_msg *m, char n)
{
        int idx = n - 'A';

        msg_set_bits(m, 5, 1, 7, idx);
}

/* Read the link priority: header word 5, position 4, mask 0x1f. */
static inline u32 msg_linkprio(struct tipc_msg *m)
{
        u32 prio = msg_bits(m, 5, 4, 0x1f);

        return prio;
}

/* Write link priority @n: header word 5, position 4, mask 0x1f. */
static inline void msg_set_linkprio(struct tipc_msg *m, u32 n)
{
        const u32 word = 5, pos = 4, mask = 0x1f;

        msg_set_bits(m, word, pos, mask, n);
}

/* Read the bearer id: header word 5, position 9, mask 0x7. */
static inline u32 msg_bearer_id(struct tipc_msg *m)
{
        u32 bearer = msg_bits(m, 5, 9, 0x7);

        return bearer;
}

/* Write bearer id @n: header word 5, position 9, mask 0x7. */
static inline void msg_set_bearer_id(struct tipc_msg *m, u32 n)
{
        const u32 word = 5, pos = 9, mask = 0x7;

        msg_set_bits(m, word, pos, mask, n);
}

/* Read the redundant-link flag: header word 5, position 12, mask 0x1. */
static inline u32 msg_redundant_link(struct tipc_msg *m)
{
        u32 redundant = msg_bits(m, 5, 12, 0x1);

        return redundant;
}

/* Write redundant-link flag @r: header word 5, position 12, mask 0x1. */
static inline void msg_set_redundant_link(struct tipc_msg *m, u32 r)
{
        const u32 word = 5, pos = 12, mask = 0x1;

        msg_set_bits(m, word, pos, mask, r);
}

/* Read the peer-stopping flag: header word 5, position 13, mask 0x1. */
static inline u32 msg_peer_stopping(struct tipc_msg *m)
{
        u32 stopping = msg_bits(m, 5, 13, 0x1);

        return stopping;
}

/* Write peer-stopping flag @s: header word 5, position 13, mask 0x1. */
static inline void msg_set_peer_stopping(struct tipc_msg *m, u32 s)
{
        const u32 word = 5, pos = 13, mask = 0x1;

        msg_set_bits(m, word, pos, mask, s);
}

static inline bool msg_bc_ack_invalid(struct tipc_msg *m)
{
        switch (msg_user(m)) {
        case BCAST_PROTOCOL:
        case NAME_DISTRIBUTOR:
        case LINK_PROTOCOL:
                return msg_bits(m, 5, 14, 0x1);
        default:
                return false;
        }
}

/*
 * Write the bc_ack_invalid flag: header word 5, position 14, mask 0x1.
 * Unlike the getter, this does not look at the message user.
 */
static inline void msg_set_bc_ack_invalid(struct tipc_msg *m, bool invalid)
{
        const u32 word = 5, pos = 14, mask = 0x1;

        msg_set_bits(m, word, pos, mask, invalid);
}

/*
 * Return a byte pointer to the header element at index
 * TIPC_MEDIA_INFO_OFFSET, where the media address is kept.
 */
static inline char *msg_media_addr(struct tipc_msg *m)
{
        char *addr = (char *)(m->hdr + TIPC_MEDIA_INFO_OFFSET);

        return addr;
}

/* Read the bc_gap field: header word 8, position 0, mask 0x3ff. */
static inline u32 msg_bc_gap(struct tipc_msg *m)
{
        u32 gap = msg_bits(m, 8, 0, 0x3ff);

        return gap;
}

/* Write @n to the bc_gap field: header word 8, position 0, mask 0x3ff. */
static inline void msg_set_bc_gap(struct tipc_msg *m, u32 n)
{
        const u32 word = 8, pos = 0, mask = 0x3ff;

        msg_set_bits(m, word, pos, mask, n);
}

/*
 * Word 9
 */

/* Read the message count: header word 9, position 16, mask 0xffff. */
static inline u16 msg_msgcnt(struct tipc_msg *m)
{
        u16 count = msg_bits(m, 9, 16, 0xffff);

        return count;
}

/* Write message count @n: header word 9, position 16, mask 0xffff. */
static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n)
{
        const u32 word = 9, pos = 16, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/* Read the sync point: header word 9, position 16, mask 0xffff. */
static inline u16 msg_syncpt(struct tipc_msg *m)
{
        u16 syncpt = msg_bits(m, 9, 16, 0xffff);

        return syncpt;
}

/* Write sync point @n: header word 9, position 16, mask 0xffff. */
static inline void msg_set_syncpt(struct tipc_msg *m, u16 n)
{
        const u32 word = 9, pos = 16, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/* Read the connection ack: header word 9, position 16, mask 0xffff. */
static inline u32 msg_conn_ack(struct tipc_msg *m)
{
        u32 ack = msg_bits(m, 9, 16, 0xffff);

        return ack;
}

/* Write connection ack @n: header word 9, position 16, mask 0xffff. */
static inline void msg_set_conn_ack(struct tipc_msg *m, u32 n)
{
        const u32 word = 9, pos = 16, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/* Read the advertised window: header word 9, position 0, mask 0xffff. */
static inline u16 msg_adv_win(struct tipc_msg *m)
{
        u16 win = msg_bits(m, 9, 0, 0xffff);

        return win;
}

/* Write advertised window @n: header word 9, position 0, mask 0xffff. */
static inline void msg_set_adv_win(struct tipc_msg *m, u16 n)
{
        const u32 word = 9, pos = 0, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/*
 * Return the max packet size. The header stores it divided by 4 (word 9,
 * position 16, mask 0xffff), so the stored value is multiplied back here.
 */
static inline u32 msg_max_pkt(struct tipc_msg *m)
{
        u32 units = msg_bits(m, 9, 16, 0xffff);

        return units * 4;
}

/*
 * Store max packet size @n as n / 4 at header word 9, position 16,
 * mask 0xffff. Any remainder of the division is dropped.
 */
static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n)
{
        u32 units = n / 4;

        msg_set_bits(m, 9, 16, 0xffff, units);
}

/* Read the link tolerance: header word 9, position 0, mask 0xffff. */
static inline u32 msg_link_tolerance(struct tipc_msg *m)
{
        u32 tol = msg_bits(m, 9, 0, 0xffff);

        return tol;
}

/* Write link tolerance @n: header word 9, position 0, mask 0xffff. */
static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n)
{
        const u32 word = 9, pos = 0, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/* Read the group bc sync point: header word 9, position 16, mask 0xffff. */
static inline u16 msg_grp_bc_syncpt(struct tipc_msg *m)
{
        u16 syncpt = msg_bits(m, 9, 16, 0xffff);

        return syncpt;
}

/* Write group bc sync point @n: header word 9, position 16, mask 0xffff. */
static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n)
{
        const u32 word = 9, pos = 16, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/* Read the group bc acked value: header word 9, position 16, mask 0xffff. */
static inline u16 msg_grp_bc_acked(struct tipc_msg *m)
{
        u16 acked = msg_bits(m, 9, 16, 0xffff);

        return acked;
}

/* Write group bc acked value @n: header word 9, position 16, mask 0xffff. */
static inline void msg_set_grp_bc_acked(struct tipc_msg *m, u16 n)
{
        const u32 word = 9, pos = 16, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/* Read the group remitted value: header word 9, position 16, mask 0xffff. */
static inline u16 msg_grp_remitted(struct tipc_msg *m)
{
        u16 remitted = msg_bits(m, 9, 16, 0xffff);

        return remitted;
}

/* Write group remitted value @n: header word 9, position 16, mask 0xffff. */
static inline void msg_set_grp_remitted(struct tipc_msg *m, u16 n)
{
        const u32 word = 9, pos = 16, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/* Word 10
 */

/* Read the group event: header word 10, position 0, mask 0x3. */
static inline u16 msg_grp_evt(struct tipc_msg *m)
{
        u16 evt = msg_bits(m, 10, 0, 0x3);

        return evt;
}

static inline void msg_set_grp_evt(struct tipc_msg *m, int n)
{
        msg_set_bits(m, 10, 0, 0x3, n);
}

/*
 * Read the group bc ack-request flag: header word 10, position 0, mask 0x1.
 * This is the lowest bit of the range msg_grp_evt() reads.
 */
static inline u16 msg_grp_bc_ack_req(struct tipc_msg *m)
{
        u16 req = msg_bits(m, 10, 0, 0x1);

        return req;
}

/* Write group bc ack-request flag @n: header word 10, position 0, mask 0x1. */
static inline void msg_set_grp_bc_ack_req(struct tipc_msg *m, bool n)
{
        const u32 word = 10, pos = 0, mask = 0x1;

        msg_set_bits(m, word, pos, mask, n);
}

/* Read the group bc seqno: header word 10, position 16, mask 0xffff. */
static inline u16 msg_grp_bc_seqno(struct tipc_msg *m)
{
        u16 seqno = msg_bits(m, 10, 16, 0xffff);

        return seqno;
}

/* Write group bc seqno @n: header word 10, position 16, mask 0xffff. */
static inline void msg_set_grp_bc_seqno(struct tipc_msg *m, u32 n)
{
        const u32 word = 10, pos = 16, mask = 0xffff;

        msg_set_bits(m, word, pos, mask, n);
}

/*
 * Return true unless @m is a LINK_PROTOCOL message whose type is something
 * other than STATE_MSG. Non-LINK_PROTOCOL messages are the expected case.
 */
static inline bool msg_peer_link_is_up(struct tipc_msg *m)
{
        return likely(msg_user(m) != LINK_PROTOCOL) ||
               msg_type(m) == STATE_MSG;
}

/*
 * Return true when msg_peer_link_is_up() holds for @m, or failing that,
 * when the message's redundant-link flag is set.
 */
static inline bool msg_peer_node_is_up(struct tipc_msg *m)
{
        return msg_peer_link_is_up(m) || msg_redundant_link(m);
}

static inline bool msg_is_reset(struct tipc_msg *hdr)
{
        return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG);
}

/* Word 13
 */

/* Write peer net hash @n into header word 13. */
static inline void msg_set_peer_net_hash(struct tipc_msg *m, u32 n)
{
        const u32 word = 13;

        msg_set_word(m, word, n);
}

/* Read the peer net hash from header word 13. */
static inline u32 msg_peer_net_hash(struct tipc_msg *m)
{
        u32 hash = msg_word(m, 13);

        return hash;
}

/* Word 14
 */

/* Read the suggested node address from header word 14. */
static inline u32 msg_sugg_node_addr(struct tipc_msg *m)
{
        u32 addr = msg_word(m, 14);

        return addr;
}

/* Write suggested node address @n into header word 14. */
static inline void msg_set_sugg_node_addr(struct tipc_msg *m, u32 n)
{
        const u32 word = 14;

        msg_set_word(m, word, n);
}

/*
 * Copy a node id into the message: 16 bytes from @id are written at the
 * location returned by msg_data(). @id must point to at least 16 bytes.
 */
static inline void msg_set_node_id(struct tipc_msg *hdr, u8 *id)
{
        void *dst = msg_data(hdr);

        memcpy(dst, id, 16);
}

/* Return the node id as a byte pointer to the location given by msg_data(). */
static inline u8 *msg_node_id(struct tipc_msg *hdr)
{
        u8 *id = (u8 *)msg_data(hdr);

        return id;
}

/*
 * Out-of-line message and buffer helpers (declarations only).
 * Several take a struct sk_buff ** so that the callee can replace or clear
 * the caller's buffer pointer - NOTE(review): presumed from the signatures,
 * confirm against the definitions.
 */
struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp);
bool tipc_msg_validate(struct sk_buff **_skb);
bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err);
void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb,
                     struct sk_buff_head *xmitq);
void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type,
                   u32 hsize, u32 destnode);
struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz,
                                uint data_sz, u32 dnode, u32 onode,
                                u32 dport, u32 oport, int errcode);
int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf);
bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss,
                         u32 dnode, bool *new_bundle);
bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos);
int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr,
                      int pktmax, struct sk_buff_head *frags);
int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
                   int offset, int dsz, int mtu, struct sk_buff_head *list);
int tipc_msg_append(struct tipc_msg *hdr, struct msghdr *m, int dlen,
                    int mss, struct sk_buff_head *txq);
bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err);
bool tipc_msg_assemble(struct sk_buff_head *list);
bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq);
bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
                        struct sk_buff_head *cpy);
bool __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno,
                             struct sk_buff *skb);
bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy);

/* Return the sequence number of the TIPC message carried by @skb. */
static inline u16 buf_seqno(struct sk_buff *skb)
{
        struct tipc_msg *hdr = buf_msg(skb);

        return msg_seqno(hdr);
}

/*
 * Return skb->len rounded up to the next multiple of 1024. A length that
 * is already a multiple of 1024 still advances by a full 1024.
 */
static inline int buf_roundup_len(struct sk_buff *skb)
{
        return (skb->len / 1024) * 1024 + 1024;
}

/* tipc_skb_peek(): peek at the first buffer in a list and reserve it
 * @list: list to be peeked in
 * @lock: spinlock held (bottom halves disabled) while peeking
 *
 * The buffer stays on the list; skb_get() is called on it before the lock
 * is dropped.
 * Returns pointer to first buffer in list, or NULL if the list is empty
 */
static inline struct sk_buff *tipc_skb_peek(struct sk_buff_head *list,
                                            spinlock_t *lock)
{
        struct sk_buff *head;

        spin_lock_bh(lock);
        head = skb_peek(list);
        if (head != NULL)
                skb_get(head);
        spin_unlock_bh(lock);

        return head;
}

/* tipc_skb_peek_port(): find a destination port, ignoring all destinations
 *                       up to and including 'filter'.
 * Note: ignoring previously tried destinations minimizes the risk of
 *       contention on the socket lock
 * @list: list to be peeked in
 * @filter: last destination to be ignored from search
 *
 * The walk runs under list->lock. It stops at the first buffer when
 * @filter is zero, at the last buffer of the list, or at the first buffer
 * whose port differs from @filter once @filter has been seen.
 * Returns a destination port number, if applicable (0 for an empty list).
 */
static inline u32 tipc_skb_peek_port(struct sk_buff_head *list, u32 filter)
{
        struct sk_buff *skb;
        bool filter_seen = false;
        u32 dport = 0;

        spin_lock_bh(&list->lock);
        skb_queue_walk(list, skb) {
                dport = msg_destport(buf_msg(skb));
                if (filter == 0)
                        break;
                if (skb_queue_is_last(list, skb))
                        break;
                if (dport == filter) {
                        filter_seen = true;
                        continue;
                }
                if (filter_seen)
                        break;
        }
        spin_unlock_bh(&list->lock);

        return dport;
}

/* tipc_skb_dequeue(): unlink first buffer with dest 'dport' from list
 * @list: list to be unlinked from
 * @dport: selection criteria for buffer to unlink
 *
 * The search and unlink run under list->lock.
 * Returns the unlinked buffer, or NULL when no buffer matches.
 */
static inline struct sk_buff *tipc_skb_dequeue(struct sk_buff_head *list,
                                               u32 dport)
{
        struct sk_buff *cur, *next, *found = NULL;

        spin_lock_bh(&list->lock);
        skb_queue_walk_safe(list, cur, next) {
                if (msg_destport(buf_msg(cur)) != dport)
                        continue;
                __skb_unlink(cur, list);
                found = cur;
                break;
        }
        spin_unlock_bh(&list->lock);

        return found;
}

/* tipc_skb_queue_splice_tail - append an skb list to lock protected list
 * @list: the new list to append. Not lock protected
 * @head: target list. Lock protected.
 */
static inline void tipc_skb_queue_splice_tail(struct sk_buff_head *list,
                                              struct sk_buff_head *head)
{
        spin_lock_bh(&head->lock);
        skb_queue_splice_tail(list, head);
        spin_unlock_bh(&head->lock);
}

/* tipc_skb_queue_splice_tail_init - merge two lock protected skb lists
 * @list: the new list to add. Lock protected. Will be reinitialized
 * @head: target list. Lock protected.
 */
static inline void tipc_skb_queue_splice_tail_init(struct sk_buff_head *list,
                                                   struct sk_buff_head *head)
{
        struct sk_buff_head tmp;

        __skb_queue_head_init(&tmp);

        spin_lock_bh(&list->lock);
        skb_queue_splice_tail_init(list, &tmp);
        spin_unlock_bh(&list->lock);
        tipc_skb_queue_splice_tail(&tmp, head);
}

/* __tipc_skb_dequeue() - dequeue the head skb according to expected seqno
 * @list: list to be dequeued from
 * @seqno: seqno of the expected msg
 *
 * Returns the head skb, unlinked from the list, if its seqno is less than
 * or equal to the expected one. Otherwise the skb is left on the list and
 * NULL is returned; NULL is also returned for an empty list.
 *
 * Note: must be used with appropriate locks held only
 */
static inline struct sk_buff *__tipc_skb_dequeue(struct sk_buff_head *list,
                                                 u16 seqno)
{
        struct sk_buff *head = skb_peek(list);

        if (!head)
                return NULL;
        if (!less_eq(buf_seqno(head), seqno))
                return NULL;

        __skb_unlink(head, list);
        return head;
}

#endif






















































































































































































































































































































































































































































































































































































































































    4 



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SWAP_H
#define _LINUX_SWAP_H

#include <linux/spinlock.h>
#include <linux/linkage.h>
#include <linux/mmzone.h>
#include <linux/list.h>
#include <linux/memcontrol.h>
#include <linux/sched.h>
#include <linux/node.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/page-flags.h>
#include <uapi/linux/mempolicy.h>
#include <asm/page.h>

struct notifier_block;

struct bio;

struct pagevec;

/*
 * Swap flag bits. The low 15 bits (SWAP_FLAG_PRIO_MASK, shift 0) hold the
 * priority value; SWAP_FLAG_PREFER and the discard flags are single bits
 * above that range.
 */
#define SWAP_FLAG_PREFER        0x8000        /* set if swap priority specified */
#define SWAP_FLAG_PRIO_MASK        0x7fff
#define SWAP_FLAG_PRIO_SHIFT        0
#define SWAP_FLAG_DISCARD        0x10000 /* enable discard for swap */
#define SWAP_FLAG_DISCARD_ONCE        0x20000 /* discard swap area at swapon-time */
#define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */

/* Union of every flag bit defined above */
#define SWAP_FLAGS_VALID        (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \
                                 SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \
                                 SWAP_FLAG_DISCARD_PAGES)
/* NOTE(review): batch size; its users are not visible in this part of the file */
#define SWAP_BATCH 64

/*
 * Test whether the current task has PF_KSWAPD set in its flags.
 * Returns the masked flag bit itself (non-zero when set), not a
 * normalised 0/1 value.
 */
static inline int current_is_kswapd(void)
{
        int kswapd_bit = current->flags & PF_KSWAPD;

        return kswapd_bit;
}

/*
 * MAX_SWAPFILES defines the maximum number of swaptypes: things which can
 * be swapped to.  The swap type and the offset into that swap type are
 * encoded into pte's and into pgoff_t's in the swapcache.  Using five bits
 * for the type means that the maximum number of swapcache pages is 27 bits
 * on 32-bit-pgoff_t architectures.  And that assumes that the architecture packs
 * the type/offset into the pte as 5/27 as well.
 *
 * Five bits give 1 << 5 = 32 type values in total; MAX_SWAPFILES is what
 * remains of those after the special-purpose types have been subtracted.
 */
#define MAX_SWAPFILES_SHIFT        5

/*
 * Use some of the swap files numbers for other purposes. This
 * is a convenient way to hook into the VM to trigger special
 * actions on faults.
 */

/*
 * PTE markers are used to persist information onto PTEs that otherwise
 * should be a none pte.  As its name "PTE" hints, it should only be
 * applied to the leaves of pgtables.
 *
 * One type value is reserved for the marker. It is numbered after the real
 * swap files and after the hwpoison, migration and device types.
 */
#define SWP_PTE_MARKER_NUM 1
#define SWP_PTE_MARKER     (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
                            SWP_MIGRATION_NUM + SWP_DEVICE_NUM)

/*
 * Unaddressable device memory support. See include/linux/hmm.h and
 * Documentation/mm/hmm.rst. Short description is we need struct pages for
 * device memory that is unaddressable (inaccessible) by CPU, so that we can
 * migrate part of a process memory to device memory.
 *
 * When a page is migrated from CPU to device, we set the CPU page table entry
 * to a special SWP_DEVICE_{READ|WRITE} entry.
 *
 * When a page is mapped by the device for exclusive access we set the CPU page
 * table entries to special SWP_DEVICE_EXCLUSIVE_* entries.
 */
#ifdef CONFIG_DEVICE_PRIVATE
/* Four consecutive type values, numbered after the hwpoison and migration types */
#define SWP_DEVICE_NUM 4
#define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
#define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2)
#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3)
#else
/* No type values reserved; the SWP_DEVICE_* names are not defined */
#define SWP_DEVICE_NUM 0
#endif

/*
 * Page migration support.
 *
 * SWP_MIGRATION_READ_EXCLUSIVE is only applicable to anonymous pages and
 * indicates that the referenced (part of) an anonymous page is exclusive to
 * a single process. For SWP_MIGRATION_WRITE, that information is implicit:
 * (part of) an anonymous page that are mapped writable are exclusive to a
 * single process.
 */
#ifdef CONFIG_MIGRATION
/* Three consecutive type values, numbered after the hwpoison type */
#define SWP_MIGRATION_NUM 3
#define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM)
#define SWP_MIGRATION_READ_EXCLUSIVE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 2)
#else
/* No type values reserved; the SWP_MIGRATION_* names are not defined */
#define SWP_MIGRATION_NUM 0
#endif

/*
 * Handling of hardware poisoned pages with memory corruption.
 */
#ifdef CONFIG_MEMORY_FAILURE
/* One type value: the first one after the real swap files */
#define SWP_HWPOISON_NUM 1
#define SWP_HWPOISON                MAX_SWAPFILES
#else
/* No type value reserved; SWP_HWPOISON is not defined */
#define SWP_HWPOISON_NUM 0
#endif

/*
 * Number of type values left for real swap files: all 1 << MAX_SWAPFILES_SHIFT
 * values minus those reserved for device, migration, hwpoison and PTE-marker
 * entries. The result depends on which of the CONFIG options above are set.
 */
#define MAX_SWAPFILES \
        ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
        SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \
        SWP_PTE_MARKER_NUM)

/*
 * Magic header for a swap area. The first part of the union is
 * what the swap magic looks like for the old (limited to 128MB)
 * swap area format, the second part of the union adds - in the
 * old reserved area - some extra information. Note that the first
 * kilobyte is reserved for boot loader or disk label stuff...
 *
 * Having the magic at the end of the PAGE_SIZE makes detecting swap
 * areas somewhat tricky on machines that support multiple page sizes.
 * For 2.5 we'll probably want to move the magic to just beyond the
 * bootbits...
 */
/* Two overlaid views of the first PAGE_SIZE bytes of a swap area. */
union swap_header {
        /* View 1: the 10-byte magic string occupies the end of the page */
        struct {
                char reserved[PAGE_SIZE - 10];
                char magic[10];                        /* SWAP-SPACE or SWAPSPACE2 */
        } magic;
        /* View 2: extra information placed after the first 1024 bytes */
        struct {
                char                bootbits[1024];        /* Space for disklabel etc. */
                __u32                version;
                __u32                last_page;
                __u32                nr_badpages;
                unsigned char        sws_uuid[16];
                unsigned char        sws_volume[16];
                __u32                padding[117];
                /*
                 * Declared with a single element. NOTE(review): presumably
                 * the start of a longer list bounded by MAX_SWAP_BADPAGES -
                 * confirm against the users of nr_badpages.
                 */
                __u32                badpages[1];
        } info;
};

/*
 * current->reclaim_state points to one of these when a task is running
 * memory reclaim
 */
struct reclaim_state {
        /*
         * pages reclaimed outside of LRU-based reclaim; incremented by
         * mm_account_reclaimed_pages()
         */
        unsigned long reclaimed;
#ifdef CONFIG_LRU_GEN
        /* per-thread mm walk data */
        struct lru_gen_mm_walk *mm_walk;
#endif
};

/*
 * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based
 * reclaim
 * @pages: number of pages reclaimed
 *
 * If the current process is undergoing a reclaim operation, increment the
 * number of reclaimed pages by @pages.
 */
static inline void mm_account_reclaimed_pages(unsigned long pages)
{
        if (current->reclaim_state)
                current->reclaim_state->reclaimed += pages;
}

#ifdef __KERNEL__

struct address_space;
struct sysinfo;
struct writeback_control;
struct zone;

/*
 * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of
 * disk blocks.  A rbtree of swap extents maps the entire swapfile (Where the
 * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart
 * from setup, they're handled identically.
 *
 * We always assume that blocks are of size PAGE_SIZE.
 */
struct swap_extent {
        struct rb_node rb_node;                /* node in the rbtree of swap extents */
        pgoff_t start_page;                /* first swapfile page of the range */
        pgoff_t nr_pages;                /* length of the range, in pages */
        sector_t start_block;                /* disk block that start_page maps to */
};

/*
 * Max bad pages in the new format: the number of int-sized entries that fit
 * between the start of info.badpages and the start of magic.magic in
 * union swap_header.
 */
#define MAX_SWAP_BADPAGES \
        ((offsetof(union swap_header, magic.magic) - \
          offsetof(union swap_header, info.badpages)) / sizeof(int))

/* Bit flags; the SWP_USED etc. values kept in swap_info_struct.flags */
enum {
        SWP_USED        = (1 << 0),        /* is slot in swap_info[] used? */
        SWP_WRITEOK        = (1 << 1),        /* ok to write to this swap?        */
        SWP_DISCARDABLE = (1 << 2),        /* blkdev support discard */
        SWP_DISCARDING        = (1 << 3),        /* now discarding a free cluster */
        SWP_SOLIDSTATE        = (1 << 4),        /* blkdev seeks are cheap */
        SWP_CONTINUED        = (1 << 5),        /* swap_map has count continuation */
        SWP_BLKDEV        = (1 << 6),        /* it's a block device */
        SWP_ACTIVATED        = (1 << 7),        /* set after swap_activate success */
        SWP_FS_OPS        = (1 << 8),        /* swapfile operations go through fs */
        SWP_AREA_DISCARD = (1 << 9),        /* single-time swap area discards */
        SWP_PAGE_DISCARD = (1 << 10),        /* freed swap page-cluster discards */
        SWP_STABLE_WRITES = (1 << 11),        /* no overwrite PG_writeback pages */
        SWP_SYNCHRONOUS_IO = (1 << 12),        /* synchronous IO is efficient */
                                        /* add others here before... (bit 13 is unassigned) */
        SWP_SCANNING        = (1 << 14),        /* refcount in scan_swap_map */
};

#define SWAP_CLUSTER_MAX 32UL
/* Same value as SWAP_CLUSTER_MAX */
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX

/* Bit flag in swap_map (bits 6 and 7 of a swap_map byte) */
#define SWAP_HAS_CACHE        0x40        /* Flag page is cached, in first swap_map */
#define COUNT_CONTINUED        0x80        /* Flag swap_map continuation for full count */

/* Special value in first swap_map (MAX and BAD fit in the low six bits) */
#define SWAP_MAP_MAX        0x3e        /* Max count */
#define SWAP_MAP_BAD        0x3f        /* Note page is bad */
#define SWAP_MAP_SHMEM        0xbf        /* Owned by shmem/tmpfs */

/* Special value in each swap_map continuation */
#define SWAP_CONT_MAX        0x7f        /* Max count */

/*
 * We use this to track usage of a cluster. A cluster is a block of swap disk
 * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All
 * free clusters are organized into a list. We fetch an entry from the list to
 * get a free cluster.
 *
 * The data field stores next cluster if the cluster is free or cluster usage
 * counter otherwise. The flags field determines if a cluster is free. This is
 * protected by swap_info_struct.lock.
 */
struct swap_cluster_info {
        spinlock_t lock;        /*
                                 * Protect swap_cluster_info fields
                                 * and swap_info_struct->swap_map
                                 * elements correspond to the swap
                                 * cluster
                                 */
        unsigned int data:24;        /* next cluster if free, else usage counter */
        unsigned int flags:8;        /* CLUSTER_FLAG_* values below */
};
/* Values for swap_cluster_info.flags */
#define CLUSTER_FLAG_FREE 1 /* This cluster is free */
#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */

/*
 * The first page in the swap file is the swap header, which is always marked
 * bad to prevent it from being allocated as an entry. This also prevents the
 * cluster to which it belongs being marked free. Therefore 0 is safe to use as
 * a sentinel to indicate next is not valid in percpu_cluster.
 */
#define SWAP_NEXT_INVALID        0

/*
 * Number of entries in percpu_cluster.next[]: PMD_ORDER + 1 with THP swap
 * enabled, a single entry otherwise.
 */
#ifdef CONFIG_THP_SWAP
#define SWAP_NR_ORDERS                (PMD_ORDER + 1)
#else
#define SWAP_NR_ORDERS                1
#endif

/*
 * We assign a cluster to each CPU, so each CPU can allocate swap entry from
 * its own cluster and swapout sequentially. The purpose is to optimize swapout
 * throughput.
 *
 * next[] has SWAP_NR_ORDERS entries; SWAP_NEXT_INVALID (0) marks an entry
 * that holds no valid offset.
 */
struct percpu_cluster {
        unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
};

/*
 * Head and tail of a list of clusters; swap_info_struct uses it for its
 * free_clusters and discard_clusters lists.
 * NOTE(review): head/tail are swap_cluster_info values rather than pointers,
 * presumably carrying a cluster index in ->data - confirm against the list
 * helpers.
 */
struct swap_cluster_list {
        struct swap_cluster_info head;
        struct swap_cluster_info tail;
};

/*
 * The in-memory structure used to track swap areas.
 *
 * The trailing avail_lists[] is a flexible array member, so instances must
 * be allocated with room for it (see the comment on that field).
 */
struct swap_info_struct {
        struct percpu_ref users;        /* indicate and keep swap device valid. */
        unsigned long        flags;                /* SWP_USED etc: see above */
        signed short        prio;                /* swap priority of this type */
        struct plist_node list;                /* entry in swap_active_head */
        signed char        type;                /* strange name for an index */
        unsigned int        max;                /* extent of the swap_map */
        unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
        struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
        struct swap_cluster_list free_clusters; /* free clusters list */
        unsigned int lowest_bit;        /* index of first free in swap_map */
        unsigned int highest_bit;        /* index of last free in swap_map */
        unsigned int pages;                /* total of usable pages of swap */
        unsigned int inuse_pages;        /* number of those currently in use */
        unsigned int cluster_next;        /* likely index for next allocation */
        unsigned int cluster_nr;        /* countdown to next cluster search */
        unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */
        struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
        struct rb_root swap_extent_root;/* root of the swap extent rbtree */
        struct block_device *bdev;        /* swap device or bdev of swap file */
        struct file *swap_file;                /* seldom referenced */
        struct completion comp;                /* seldom referenced */
        spinlock_t lock;                /*
                                         * protect map scan related fields like
                                         * swap_map, lowest_bit, highest_bit,
                                         * inuse_pages, cluster_next,
                                         * cluster_nr, lowest_alloc,
                                         * highest_alloc, free/discard cluster
                                         * list. other fields are only changed
                                         * at swapon/swapoff, so are protected
                                         * by swap_lock. changing flags need
                                         * hold this lock and swap_lock. If
                                         * both locks need hold, hold swap_lock
                                         * first.
                                         *
                                         * NOTE(review): lowest_alloc and
                                         * highest_alloc are not members of
                                         * this struct; the names look stale.
                                         */
        spinlock_t cont_lock;                /*
                                         * protect swap count continuation page
                                         * list.
                                         */
        struct work_struct discard_work; /* discard worker */
        struct swap_cluster_list discard_clusters; /* discard clusters list */
        struct plist_node avail_lists[]; /*
                                           * entries in swap_avail_heads, one
                                           * entry per node.
                                           * Must be last as the number of the
                                           * array is nr_node_ids, which is not
                                           * a fixed value so have to allocate
                                           * dynamically.
                                           * And it has to be an array so that
                                           * plist_for_each_* can work.
                                           */
};

/*
 * page_swap_entry - swap entry that corresponds to @page.
 *
 * Starts from the swap entry stored in the folio containing @page and
 * advances its value by the index of @page inside that folio.
 */
static inline swp_entry_t page_swap_entry(struct page *page)
{
        struct folio *owner = page_folio(page);
        swp_entry_t swp = owner->swap;

        swp.val = swp.val + folio_page_idx(owner, page);
        return swp;
}

/* linux/mm/workingset.c */
bool workingset_test_recent(void *shadow, bool file, bool *workingset);
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg);
void workingset_refault(struct folio *folio, void *shadow);
void workingset_activation(struct folio *folio);

/* linux/mm/page_alloc.c */
extern unsigned long totalreserve_pages;

/* Definition of global_zone_page_state not available yet */
#define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)


/* linux/mm/swap.c */
void lru_note_cost(struct lruvec *lruvec, bool file,
                   unsigned int nr_io, unsigned int nr_rotated);
void lru_note_cost_refault(struct folio *);
void folio_add_lru(struct folio *);
void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
void mark_page_accessed(struct page *);
void folio_mark_accessed(struct folio *);

extern atomic_t lru_disable_count;

/* Report whether lru_disable_count currently holds a non-zero value. */
static inline bool lru_cache_disabled(void)
{
        int disable_depth = atomic_read(&lru_disable_count);

        return disable_depth != 0;
}

/*
 * Decrement lru_disable_count by one.
 * NOTE(review): presumably balances an earlier lru_cache_disable() - confirm.
 */
static inline void lru_cache_enable(void)
{
        atomic_dec(&lru_disable_count);
}

extern void lru_cache_disable(void);
extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_cpu_zone(struct zone *zone);
extern void lru_add_drain_all(void);
void folio_deactivate(struct folio *folio);
void folio_mark_lazyfree(struct folio *folio);
extern void swap_setup(void);

/* linux/mm/vmscan.c */
extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                        gfp_t gfp_mask, nodemask_t *mask);

#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                                  unsigned long nr_pages,
                                                  gfp_t gfp_mask,
                                                  unsigned int reclaim_options);
extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
                                                gfp_t gfp_mask, bool noswap,
                                                pg_data_t *pgdat,
                                                unsigned long *nr_scanned);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
long remove_mapping(struct address_space *mapping, struct folio *folio);

#ifdef CONFIG_NUMA
extern int node_reclaim_mode;
extern int sysctl_min_unmapped_ratio;
extern int sysctl_min_slab_ratio;
#else
#define node_reclaim_mode 0
#endif

static inline bool node_reclaim_enabled(void)
{
        /* Is any node_reclaim_mode bit set? */
        return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
}

void check_move_unevictable_folios(struct folio_batch *fbatch);

extern void __meminit kswapd_run(int nid);
extern void __meminit kswapd_stop(int nid);

#ifdef CONFIG_SWAP

int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
                unsigned long nr_pages, sector_t start_block);
int generic_swapfile_activate(struct swap_info_struct *, struct file *,
                sector_t *);

/* Return the global NR_SWAPCACHE node page-state counter. */
static inline unsigned long total_swapcache_pages(void)
{
        return global_node_page_state(NR_SWAPCACHE);
}

void free_swap_cache(struct folio *folio);
void free_page_and_swap_cache(struct page *);
void free_pages_and_swap_cache(struct encoded_page **, int);
/* linux/mm/swapfile.c */
extern atomic_long_t nr_swap_pages;
extern long total_swap_pages;
extern atomic_t nr_rotate_swap;
extern bool has_usable_swap(void);

/*
 * Is swap at least half used, i.e. is twice the number of free swap pages
 * still below the total?  Used to release swapcache more aggressively.
 */
static inline bool vm_swap_full(void)
{
        long free_swap = atomic_long_read(&nr_swap_pages);

        return 2 * free_swap < total_swap_pages;
}

/* Return the current value of the nr_swap_pages atomic counter. */
static inline long get_nr_swap_pages(void)
{
        return atomic_long_read(&nr_swap_pages);
}

extern void si_swapinfo(struct sysinfo *);
swp_entry_t folio_alloc_swap(struct folio *folio);
bool folio_free_swap(struct folio *folio);
void put_swap_folio(struct folio *folio, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t);
extern int swap_duplicate(swp_entry_t);
extern int swapcache_prepare(swp_entry_t);
extern void swap_free(swp_entry_t);
extern void swapcache_free_entries(swp_entry_t *entries, int n);
extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
int swap_type_of(dev_t device, sector_t offset);
int find_first_swap(dev_t *device);
extern unsigned int count_swap_pages(int, int);
extern sector_t swapdev_block(int, pgoff_t);
extern int __swap_count(swp_entry_t entry);
extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
struct swap_info_struct *swp_swap_info(swp_entry_t entry);
struct backing_dev_info;
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
extern void exit_swap_address_space(unsigned int type);
extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
sector_t swap_folio_sector(struct folio *folio);

/*
 * Drop one reference on the per-cpu refcount si->users.
 * NOTE(review): presumably pairs with a successful get_swap_device() - confirm.
 */
static inline void put_swap_device(struct swap_info_struct *si)
{
        percpu_ref_put(&si->users);
}

#else /* CONFIG_SWAP */
/* !CONFIG_SWAP stub: there is no swap device to look up, always NULL. */
static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
        return NULL;
}

/* !CONFIG_SWAP stub: no swap device can be pinned, always NULL. */
static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
{
        return NULL;
}

/* !CONFIG_SWAP stub: nothing to release. */
static inline void put_swap_device(struct swap_info_struct *si)
{
}

/* !CONFIG_SWAP: the swap accounting queries collapse to constant zero. */
#define get_nr_swap_pages()                        0L
#define total_swap_pages                        0L
#define total_swapcache_pages()                        0UL
#define vm_swap_full()                                0

/* !CONFIG_SWAP: set both the freeswap and totalswap members of @val to 0. */
#define si_swapinfo(val) \
        do { (val)->freeswap = (val)->totalswap = 0; } while (0)
/* only sparc can not include linux/pagemap.h in this file
 * so leave put_page and release_pages undeclared... */
/*
 * !CONFIG_SWAP: with no swap cache, freeing reduces to dropping the page
 * reference(s).  Both macros expand to a single expression WITHOUT a
 * trailing semicolon so that they behave like ordinary function calls,
 * e.g. as the body of an unbraced if/else.
 */
#define free_page_and_swap_cache(page) \
        put_page(page)
#define free_pages_and_swap_cache(pages, nr) \
        release_pages((pages), (nr))

/*
 * !CONFIG_SWAP stubs: the functions below do nothing, and those with a
 * return value return 0.
 */

/* No swap entries exist, nothing to free. */
static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
{
}

/* No swap cache exists, nothing to free. */
static inline void free_swap_cache(struct folio *folio)
{
}

/* Always returns 0. */
static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
{
        return 0;
}

/* No-op. */
static inline void swap_shmem_alloc(swp_entry_t swp)
{
}

/* Always returns 0. */
static inline int swap_duplicate(swp_entry_t swp)
{
        return 0;
}

/* Always returns 0. */
static inline int swapcache_prepare(swp_entry_t swp)
{
        return 0;
}

/* No-op. */
static inline void swap_free(swp_entry_t swp)
{
}

/* No-op. */
static inline void put_swap_folio(struct folio *folio, swp_entry_t swp)
{
}

/* Always reports a count of 0. */
static inline int __swap_count(swp_entry_t entry)
{
        return 0;
}

/* Always reports a count of 0. */
static inline int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
{
        return 0;
}

/* Always reports a count of 0. */
static inline int swp_swapcount(swp_entry_t entry)
{
        return 0;
}

static inline swp_entry_t folio_alloc_swap(struct folio *folio)
{
        swp_entry_t entry;
        entry.val = 0;
        return entry;
}

/* !CONFIG_SWAP stub: always returns false. */
static inline bool folio_free_swap(struct folio *folio)
{
        return false;
}

/* !CONFIG_SWAP stub: always fails with -EINVAL. */
static inline int add_swap_extent(struct swap_info_struct *sis,
                                  unsigned long start_page,
                                  unsigned long nr_pages, sector_t start_block)
{
        return -EINVAL;
}
#endif /* CONFIG_SWAP */

/* Single-entry convenience wrapper: free_swap_and_cache_nr() with nr == 1. */
static inline void free_swap_and_cache(swp_entry_t entry)
{
        free_swap_and_cache_nr(entry, 1);
}

#ifdef CONFIG_MEMCG
/*
 * mem_cgroup_swappiness - swappiness value to apply for @memcg.
 *
 * Returns the global vm_swappiness when any of the following holds:
 *  - the memory controller is on the default hierarchy (cgroup2 has no
 *    per-cgroup swappiness),
 *  - the memory controller is disabled,
 *  - @memcg is the root cgroup.
 * Otherwise returns the cgroup's own swappiness setting.
 */
static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
            mem_cgroup_disabled() ||
            mem_cgroup_is_root(memcg))
                return READ_ONCE(vm_swappiness);

        return READ_ONCE(memcg->swappiness);
}
#else
/* !CONFIG_MEMCG: there is no per-cgroup value, always use the global vm_swappiness. */
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
{
        return READ_ONCE(vm_swappiness);
}
#endif

#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp);
/*
 * Forward to __folio_throttle_swaprate() unless the memory controller is
 * disabled, in which case nothing is done.
 */
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
        if (!mem_cgroup_disabled())
                __folio_throttle_swaprate(folio, gfp);
}
#else
/* Stub used unless CONFIG_SWAP, CONFIG_MEMCG and CONFIG_BLK_CGROUP are all set: no-op. */
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
}
#endif

#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry);
int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry);
/*
 * Forward to __mem_cgroup_try_charge_swap() and return its result, unless
 * the memory controller is disabled, in which case 0 is returned without
 * doing anything.
 */
static inline int mem_cgroup_try_charge_swap(struct folio *folio,
                swp_entry_t entry)
{
        if (!mem_cgroup_disabled())
                return __mem_cgroup_try_charge_swap(folio, entry);

        return 0;
}

extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
/*
 * Forward to __mem_cgroup_uncharge_swap() for @nr_pages starting at @entry;
 * does nothing when the memory controller is disabled.
 */
static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
        if (!mem_cgroup_disabled())
                __mem_cgroup_uncharge_swap(entry, nr_pages);
}

extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
extern bool mem_cgroup_swap_full(struct folio *folio);
#else
/*
 * Stubs used unless both CONFIG_MEMCG and CONFIG_SWAP are enabled: there is
 * no per-cgroup swap accounting, so charging is a no-op and the queries fall
 * back to the global helpers.
 */

/* No-op. */
static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
{
}

/* Nothing to charge; always returns 0. */
static inline int mem_cgroup_try_charge_swap(struct folio *folio,
                                             swp_entry_t entry)
{
        return 0;
}

/* No-op. */
static inline void mem_cgroup_uncharge_swap(swp_entry_t entry,
                                            unsigned int nr_pages)
{
}

/* Ignores @memcg and returns the global get_nr_swap_pages() value. */
static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
        return get_nr_swap_pages();
}

/* Ignores @folio and returns the global vm_swap_full() result. */
static inline bool mem_cgroup_swap_full(struct folio *folio)
{
        return vm_swap_full();
}
#endif

#endif /* __KERNEL__*/
#endif /* _LINUX_SWAP_H */


























































































































    1 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * File: af_phonet.h
 *
 * Phonet sockets kernel definitions
 *
 * Copyright (C) 2008 Nokia Corporation.
 */

#ifndef AF_PHONET_H
#define AF_PHONET_H

#include <linux/phonet.h>
#include <linux/skbuff.h>
#include <net/sock.h>

/*
 * The lower layers may not require more space, ever. Make sure it's
 * enough.
 */
#define MAX_PHONET_HEADER        (8 + MAX_HEADER)

/*
 * Every Phonet* socket has this structure first in its
 * protocol-specific structure under name c.
 */
/* Common Phonet socket state; pn_sk() casts a struct sock * directly to this type. */
struct pn_sock {
        struct sock        sk;        /* generic socket; must stay the first member for pn_sk() */
        u16                sobject;   /* NOTE(review): presumably the source (local) object - confirm */
        u16                dobject;   /* NOTE(review): presumably the destination object - confirm */
        u8                resource;   /* NOTE(review): presumably the Phonet resource ID - confirm */
};

/*
 * Reinterpret @sk as the struct pn_sock that contains it. This is a plain
 * cast, so it relies on struct sock being the first member of struct pn_sock.
 */
static inline struct pn_sock *pn_sk(struct sock *sk)
{
        return (struct pn_sock *)sk;
}

extern const struct proto_ops phonet_dgram_ops;

void pn_sock_init(void);
struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *sa);
void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb);
void phonet_get_local_port_range(int *min, int *max);
int pn_sock_hash(struct sock *sk);
void pn_sock_unhash(struct sock *sk);
int pn_sock_get_port(struct sock *sk, unsigned short sport);

struct sock *pn_find_sock_by_res(struct net *net, u8 res);
int pn_sock_bind_res(struct sock *sock, u8 res);
int pn_sock_unbind_res(struct sock *sk, u8 res);
void pn_sock_unbind_all_res(struct sock *sk);

int pn_skb_send(struct sock *sk, struct sk_buff *skb,
                const struct sockaddr_pn *target);

/* Return the network header of @skb viewed as a Phonet header. */
static inline struct phonethdr *pn_hdr(struct sk_buff *skb)
{
        unsigned char *nh = skb_network_header(skb);

        return (struct phonethdr *)nh;
}

/* Return the transport header of @skb viewed as a Phonet message. */
static inline struct phonetmsg *pn_msg(struct sk_buff *skb)
{
        unsigned char *th = skb_transport_header(skb);

        return (struct phonetmsg *)th;
}

/*
 * Fill @sa with the other party's (sender's) address taken from a received
 * skb. The skb must begin with a Phonet header: the object is built from
 * the header's source device/object pair, the resource is copied as-is and
 * the padding of @sa is zeroed.
 */
static inline
void pn_skb_get_src_sockaddr(struct sk_buff *skb, struct sockaddr_pn *sa)
{
        struct phonethdr *hdr = pn_hdr(skb);
        u16 src = pn_object(hdr->pn_sdev, hdr->pn_sobj);

        sa->spn_family = AF_PHONET;
        pn_sockaddr_set_object(sa, src);
        pn_sockaddr_set_resource(sa, hdr->pn_res);
        memset(sa->spn_zero, 0, sizeof(sa->spn_zero));
}

/*
 * Fill @sa with the destination (receiver) address taken from the Phonet
 * header of @skb: the object is built from the header's receiver
 * device/object pair, the resource is copied as-is and the padding of @sa
 * is zeroed.
 */
static inline
void pn_skb_get_dst_sockaddr(struct sk_buff *skb, struct sockaddr_pn *sa)
{
        struct phonethdr *hdr = pn_hdr(skb);
        u16 dst = pn_object(hdr->pn_rdev, hdr->pn_robj);

        sa->spn_family = AF_PHONET;
        pn_sockaddr_set_object(sa, dst);
        pn_sockaddr_set_resource(sa, hdr->pn_res);
        memset(sa->spn_zero, 0, sizeof(sa->spn_zero));
}

/*
 * Protocols in Phonet protocol family.
 * One descriptor is passed to phonet_proto_register()/_unregister() per
 * protocol number.
 */
struct phonet_protocol {
        const struct proto_ops        *ops;        /* socket-layer operations */
        struct proto                *prot;        /* protocol (struct proto) handlers */
        int                        sock_type;        /* NOTE(review): presumably a SOCK_* type - confirm */
};

int phonet_proto_register(unsigned int protocol,
                const struct phonet_protocol *pp);
void phonet_proto_unregister(unsigned int protocol,
                const struct phonet_protocol *pp);

int phonet_sysctl_init(void);
void phonet_sysctl_exit(void);
int isi_register(void);
void isi_unregister(void);

/* True when @sk belongs to the Phonet family (sk_family == PF_PHONET). */
static inline bool sk_is_phonet(struct sock *sk)
{
        return sk->sk_family == PF_PHONET;
}

/*
 * phonet_sk_ioctl - handle the Phonet resource ioctls for @sk.
 *
 * For SIOCPNADDRESOURCE and SIOCPNDELRESOURCE an int is fetched from the
 * user pointer @arg and the protocol's ioctl handler is invoked with a
 * kernel copy of it.
 *
 * Returns -EFAULT if the user value cannot be read, the protocol handler's
 * result for the two commands above, and a positive value (1) for any other
 * command, meaning the ioctl was not processed here.
 */
static inline int phonet_sk_ioctl(struct sock *sk, unsigned int cmd,
                                  void __user *arg)
{
        int karg;

        /* A positive return value means that the ioctl was not processed */
        if (cmd != SIOCPNADDRESOURCE && cmd != SIOCPNDELRESOURCE)
                return 1;

        if (get_user(karg, (int __user *)arg))
                return -EFAULT;

        return sk->sk_prot->ioctl(sk, cmd, &karg);
}
#endif













































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
/* SPDX-License-Identifier: GPL-2.0-or-later */

#include <net/inet_common.h>

/*
 * Indices of the MPTCP MIB counters. __MPTCP_MIB_MAX is the number of
 * entries and sizes the mibs[] array of struct mptcp_mib.
 */
enum linux_mptcp_mib_field {
        MPTCP_MIB_NUM = 0,
        MPTCP_MIB_MPCAPABLEPASSIVE,        /* Received SYN with MP_CAPABLE */
        MPTCP_MIB_MPCAPABLEACTIVE,        /* Sent SYN with MP_CAPABLE */
        MPTCP_MIB_MPCAPABLEACTIVEACK,        /* Received SYN/ACK with MP_CAPABLE */
        MPTCP_MIB_MPCAPABLEPASSIVEACK,        /* Received third ACK with MP_CAPABLE */
        MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */
        MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */
        MPTCP_MIB_TOKENFALLBACKINIT,        /* Could not init/allocate token */
        MPTCP_MIB_RETRANSSEGS,                /* Segments retransmitted at the MPTCP-level */
        MPTCP_MIB_JOINNOTOKEN,                /* Received MP_JOIN but the token was not found */
        MPTCP_MIB_JOINSYNRX,                /* Received a SYN + MP_JOIN */
        MPTCP_MIB_JOINSYNACKRX,                /* Received a SYN/ACK + MP_JOIN */
        MPTCP_MIB_JOINSYNACKMAC,        /* HMAC was wrong on SYN/ACK + MP_JOIN */
        MPTCP_MIB_JOINACKRX,                /* Received an ACK + MP_JOIN */
        MPTCP_MIB_JOINACKMAC,                /* HMAC was wrong on ACK + MP_JOIN */
        MPTCP_MIB_DSSNOMATCH,                /* Received a new mapping that did not match the previous one */
        MPTCP_MIB_INFINITEMAPTX,        /* Sent an infinite mapping */
        MPTCP_MIB_INFINITEMAPRX,        /* Received an infinite mapping */
        MPTCP_MIB_DSSTCPMISMATCH,        /* DSS-mapping did not map with TCP's sequence numbers */
        MPTCP_MIB_DATACSUMERR,                /* The data checksum failed */
        MPTCP_MIB_OFOQUEUETAIL,        /* Segments inserted into OoO queue tail */
        MPTCP_MIB_OFOQUEUE,                /* Segments inserted into OoO queue */
        MPTCP_MIB_OFOMERGE,                /* Segments merged in OoO queue */
        MPTCP_MIB_NODSSWINDOW,                /* Segments not in MPTCP windows */
        MPTCP_MIB_DUPDATA,                /* Segments discarded due to duplicate DSS */
        MPTCP_MIB_ADDADDR,                /* Received ADD_ADDR with echo-flag=0 */
        MPTCP_MIB_ADDADDRTX,                /* Sent ADD_ADDR with echo-flag=0 */
        MPTCP_MIB_ADDADDRTXDROP,        /* ADD_ADDR with echo-flag=0 not sent due to
                                         * resource exhaustion
                                         */
        MPTCP_MIB_ECHOADD,                /* Received ADD_ADDR with echo-flag=1 */
        MPTCP_MIB_ECHOADDTX,                /* Sent ADD_ADDR with echo-flag=1 */
        MPTCP_MIB_ECHOADDTXDROP,        /* ADD_ADDR with echo-flag=1 not sent due
                                         * to resource exhaustion
                                         */
        MPTCP_MIB_PORTADD,                /* Received ADD_ADDR with a port-number */
        MPTCP_MIB_ADDADDRDROP,                /* Dropped incoming ADD_ADDR */
        MPTCP_MIB_JOINPORTSYNRX,        /* Received a SYN MP_JOIN with a different port-number */
        MPTCP_MIB_JOINPORTSYNACKRX,        /* Received a SYNACK MP_JOIN with a different port-number */
        MPTCP_MIB_JOINPORTACKRX,        /* Received an ACK MP_JOIN with a different port-number */
        MPTCP_MIB_MISMATCHPORTSYNRX,        /* Received a SYN MP_JOIN with a mismatched port-number */
        MPTCP_MIB_MISMATCHPORTACKRX,        /* Received an ACK MP_JOIN with a mismatched port-number */
        MPTCP_MIB_RMADDR,                /* Received RM_ADDR */
        MPTCP_MIB_RMADDRDROP,                /* Dropped incoming RM_ADDR */
        MPTCP_MIB_RMADDRTX,                /* Sent RM_ADDR */
        MPTCP_MIB_RMADDRTXDROP,                /* RM_ADDR not sent due to resource exhaustion */
        MPTCP_MIB_RMSUBFLOW,                /* Remove a subflow */
        MPTCP_MIB_MPPRIOTX,                /* Transmit a MP_PRIO */
        MPTCP_MIB_MPPRIORX,                /* Received a MP_PRIO */
        MPTCP_MIB_MPFAILTX,                /* Transmit a MP_FAIL */
        MPTCP_MIB_MPFAILRX,                /* Received a MP_FAIL */
        MPTCP_MIB_MPFASTCLOSETX,        /* Transmit a MP_FASTCLOSE */
        MPTCP_MIB_MPFASTCLOSERX,        /* Received a MP_FASTCLOSE */
        MPTCP_MIB_MPRSTTX,                /* Transmit a MP_RST */
        MPTCP_MIB_MPRSTRX,                /* Received a MP_RST */
        MPTCP_MIB_RCVPRUNED,                /* Incoming packet dropped due to memory limit */
        MPTCP_MIB_SUBFLOWSTALE,                /* Subflows entered 'stale' status */
        MPTCP_MIB_SUBFLOWRECOVER,        /* Subflows returned to active status after being stale */
        MPTCP_MIB_SNDWNDSHARED,                /* Subflow snd wnd is overridden by msk's one */
        MPTCP_MIB_RCVWNDSHARED,                /* Subflow rcv wnd is overridden by msk's one */
        MPTCP_MIB_RCVWNDCONFLICTUPDATE,        /* Subflow rcv wnd is overridden by msk's one due to
                                         * conflict with another subflow while updating msk rcv wnd
                                         */
        MPTCP_MIB_RCVWNDCONFLICT,        /* Conflict with another subflow while updating msk rcv wnd */
        MPTCP_MIB_CURRESTAB,                /* Current established MPTCP connections */
        __MPTCP_MIB_MAX
};

/* Number of MPTCP MIB counters (one past the last enum linux_mptcp_mib_field value). */
#define LINUX_MIB_MPTCP_MAX        __MPTCP_MIB_MAX
/* MPTCP counter block: one unsigned long slot per enum linux_mptcp_mib_field index. */
struct mptcp_mib {
        unsigned long mibs[LINUX_MIB_MPTCP_MAX];
};

/*
 * Add @val to the MPTCP MIB counter @field of @net.
 * Does nothing when net->mib.mptcp_statistics is NULL.
 */
static inline void MPTCP_ADD_STATS(struct net *net,
                                   enum linux_mptcp_mib_field field,
                                   int val)
{
        if (unlikely(!net->mib.mptcp_statistics))
                return;

        SNMP_ADD_STATS(net->mib.mptcp_statistics, field, val);
}

/*
 * Increment the MPTCP MIB counter @field of @net via SNMP_INC_STATS().
 * Does nothing when net->mib.mptcp_statistics is NULL.
 */
static inline void MPTCP_INC_STATS(struct net *net,
                                   enum linux_mptcp_mib_field field)
{
        if (unlikely(!net->mib.mptcp_statistics))
                return;

        SNMP_INC_STATS(net->mib.mptcp_statistics, field);
}

/*
 * Increment the MPTCP MIB counter @field of @net via __SNMP_INC_STATS().
 * Does nothing when net->mib.mptcp_statistics is NULL.
 * NOTE(review): the double-underscore variant presumably has stricter
 * calling-context requirements than MPTCP_INC_STATS() - confirm.
 */
static inline void __MPTCP_INC_STATS(struct net *net,
                                     enum linux_mptcp_mib_field field)
{
        if (unlikely(!net->mib.mptcp_statistics))
                return;

        __SNMP_INC_STATS(net->mib.mptcp_statistics, field);
}

/*
 * Decrement the MPTCP MIB counter @field of @net.
 * Does nothing when net->mib.mptcp_statistics is NULL.
 */
static inline void MPTCP_DEC_STATS(struct net *net,
                                   enum linux_mptcp_mib_field field)
{
        if (unlikely(!net->mib.mptcp_statistics))
                return;

        SNMP_DEC_STATS(net->mib.mptcp_statistics, field);
}

bool mptcp_mib_alloc(struct net *net);















































































































































































    1 




    1 













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
// SPDX-License-Identifier: GPL-2.0
/*
 *        XFRM virtual interface
 *
 *        Copyright (C) 2018 secunet Security Networks AG
 *
 *        Author:
 *        Steffen Klassert <steffen.klassert@secunet.com>
 */

#include <linux/module.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/sockios.h>
#include <linux/icmp.h>
#include <linux/if.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/if_link.h>
#include <linux/if_arp.h>
#include <linux/icmpv6.h>
#include <linux/init.h>
#include <linux/route.h>
#include <linux/rtnetlink.h>
#include <linux/netfilter_ipv6.h>
#include <linux/slab.h>
#include <linux/hash.h>

#include <linux/uaccess.h>
#include <linux/atomic.h>

#include <net/gso.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/ip_tunnels.h>
#include <net/addrconf.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/dst_metadata.h>
#include <net/netns/generic.h>
#include <linux/etherdevice.h>

static int xfrmi_dev_init(struct net_device *dev);
static void xfrmi_dev_setup(struct net_device *dev);
static struct rtnl_link_ops xfrmi_link_ops __read_mostly;
static unsigned int xfrmi_net_id __read_mostly;
static const struct net_device_ops xfrmi_netdev_ops;

#define XFRMI_HASH_BITS        8
#define XFRMI_HASH_SIZE        BIT(XFRMI_HASH_BITS)

/* Per-network-namespace state, fetched with net_generic(net, xfrmi_net_id). */
struct xfrmi_net {
        /* lists for storing interfaces in use */
        /* hash buckets indexed by xfrmi_hash(if_id); entries are chained through ->next */
        struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE];
        /* single interface consulted by xfrmi_lookup() when no bucket entry matches */
        struct xfrm_if __rcu *collect_md_xfrmi;
};

/*
 * Netlink validation policy for the LWT_XFRM_* attributes: both if_id and
 * link are u32 values that must be at least 1.
 */
static const struct nla_policy xfrm_lwt_policy[LWT_XFRM_MAX + 1] = {
        [LWT_XFRM_IF_ID]        = NLA_POLICY_MIN(NLA_U32, 1),
        [LWT_XFRM_LINK]                = NLA_POLICY_MIN(NLA_U32, 1),
};

/* lwtunnel destroy_state callback: intentionally empty, this encap has nothing extra to release. */
static void xfrmi_destroy_state(struct lwtunnel_state *lwt)
{
}

/*
 * xfrmi_build_state - lwtunnel build_state callback for LWTUNNEL_ENCAP_XFRM.
 *
 * Parses the nested LWT_XFRM_* attributes in @nla against xfrm_lwt_policy,
 * allocates a lwtunnel state with room for a struct xfrm_md_info and fills
 * in if_id (mandatory) and link (optional). On success the new state is
 * stored in *@ts.
 *
 * Returns 0 on success, the parse error from nla_parse_nested(), -EINVAL
 * when LWT_XFRM_IF_ID is missing, or -ENOMEM when allocation fails.
 */
static int xfrmi_build_state(struct net *net, struct nlattr *nla,
                             unsigned int family, const void *cfg,
                             struct lwtunnel_state **ts,
                             struct netlink_ext_ack *extack)
{
        struct nlattr *attrs[LWT_XFRM_MAX + 1];
        struct lwtunnel_state *lwt;
        struct xfrm_md_info *md;
        int err;

        err = nla_parse_nested(attrs, LWT_XFRM_MAX, nla, xfrm_lwt_policy, extack);
        if (err < 0)
                return err;

        /* The interface ID is the only mandatory attribute. */
        if (!attrs[LWT_XFRM_IF_ID]) {
                NL_SET_ERR_MSG(extack, "if_id must be set");
                return -EINVAL;
        }

        lwt = lwtunnel_state_alloc(sizeof(struct xfrm_md_info));
        if (!lwt) {
                NL_SET_ERR_MSG(extack, "failed to create encap info");
                return -ENOMEM;
        }
        lwt->type = LWTUNNEL_ENCAP_XFRM;

        md = lwt_xfrm_info(lwt);
        md->if_id = nla_get_u32(attrs[LWT_XFRM_IF_ID]);
        if (attrs[LWT_XFRM_LINK])
                md->link = nla_get_u32(attrs[LWT_XFRM_LINK]);

        *ts = lwt;
        return 0;
}

/*
 * lwtunnel fill_encap callback: emit LWT_XFRM_IF_ID, and LWT_XFRM_LINK only
 * when link is non-zero, into @skb. Returns 0 or -EMSGSIZE if an attribute
 * could not be added.
 */
static int xfrmi_fill_encap_info(struct sk_buff *skb,
                                 struct lwtunnel_state *lwt)
{
        struct xfrm_md_info *md = lwt_xfrm_info(lwt);

        if (nla_put_u32(skb, LWT_XFRM_IF_ID, md->if_id))
                return -EMSGSIZE;

        if (md->link && nla_put_u32(skb, LWT_XFRM_LINK, md->link))
                return -EMSGSIZE;

        return 0;
}

/*
 * lwtunnel get_encap_size callback: space for two u32 attributes
 * (LWT_XFRM_IF_ID and LWT_XFRM_LINK), independent of @lwtstate.
 */
static int xfrmi_encap_nlsize(struct lwtunnel_state *lwtstate)
{
        int if_id_len = nla_total_size(sizeof(u32));        /* LWT_XFRM_IF_ID */
        int link_len = nla_total_size(sizeof(u32));        /* LWT_XFRM_LINK */

        return if_id_len + link_len;
}

/*
 * lwtunnel cmp_encap callback: byte-wise comparison of the two states'
 * struct xfrm_md_info. Returns the memcmp() result (0 when identical).
 */
static int xfrmi_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
        return memcmp(lwt_xfrm_info(a), lwt_xfrm_info(b),
                      sizeof(struct xfrm_md_info));
}

/* Lightweight-tunnel encap callbacks for XFRM metadata; xfrmi_init() installs
 * this table under LWTUNNEL_ENCAP_XFRM.
 */
static const struct lwtunnel_encap_ops xfrmi_encap_ops = {
        .build_state        = xfrmi_build_state,
        .destroy_state        = xfrmi_destroy_state,
        .fill_encap        = xfrmi_fill_encap_info,
        .get_encap_size = xfrmi_encap_nlsize,
        .cmp_encap        = xfrmi_encap_cmp,
        .owner                = THIS_MODULE,
};

/* Walk a singly linked chain of xfrm_if entries starting at @start, loading
 * the head and every ->next pointer with rcu_dereference(). @xi is the
 * iteration cursor and is NULL once the loop has run to completion.
 */
#define for_each_xfrmi_rcu(start, xi) \
        for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next))

/* Map an if_id to its bucket index in the per-netns xfrmi[] hash table. */
static u32 xfrmi_hash(u32 if_id)
{
        u32 bucket = hash_32(if_id, XFRMI_HASH_BITS);

        return bucket;
}

/*
 * Find the xfrm interface in @net that should handle state @x.
 *
 * The if_id hash bucket is searched first for an entry whose if_id equals
 * x->if_id and whose device has IFF_UP set. If none qualifies, the netns'
 * collect_md interface is returned, again only when its device is up.
 *
 * Uses rcu_dereference() throughout.
 *
 * Return: the matching interface, or NULL.
 */
static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x)
{
        struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
        struct xfrm_if *cand;

        for_each_xfrmi_rcu(xfrmn->xfrmi[xfrmi_hash(x->if_id)], cand) {
                if (cand->p.if_id != x->if_id)
                        continue;
                if (cand->dev->flags & IFF_UP)
                        return cand;
        }

        /* Fall back to the metadata-collecting interface, if any. */
        cand = rcu_dereference(xfrmn->collect_md_xfrmi);
        if (!cand || !(cand->dev->flags & IFF_UP))
                return NULL;

        return cand;
}

/*
 * xfrm_if_cb .decode_session callback: work out which netns and if_id apply
 * to @skb when it was received through an xfrm interface.
 *
 * The device considered is the one named by the family's sdif helper when
 * that returns non-zero, otherwise skb->dev. It must be up and must use
 * xfrmi_netdev_ops.
 *
 * Return: true with @res filled in, or false if the skb has no secpath, no
 * device, or the device is not a running xfrm interface.
 */
static bool xfrmi_decode_session(struct sk_buff *skb,
                                 unsigned short family,
                                 struct xfrm_if_decode_session_result *res)
{
        struct net_device *dev;
        struct xfrm_if *xi;
        int sdif;

        if (!secpath_exists(skb) || !skb->dev)
                return false;

        if (family == AF_INET6)
                sdif = inet6_sdif(skb);
        else if (family == AF_INET)
                sdif = inet_sdif(skb);
        else
                sdif = 0;

        if (sdif)
                dev = dev_get_by_index_rcu(xs_net(xfrm_input_state(skb)),
                                           sdif);
        else
                dev = skb->dev;

        if (!dev || !(dev->flags & IFF_UP))
                return false;
        if (dev->netdev_ops != &xfrmi_netdev_ops)
                return false;

        xi = netdev_priv(dev);
        res->net = xi->net;

        /* collect_md interfaces take the if_id from the input state. */
        res->if_id = xi->p.collect_md ? xfrm_input_state(skb)->if_id
                                      : xi->p.if_id;
        return true;
}

static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi)
{
        struct xfrm_if __rcu **xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)];

        rcu_assign_pointer(xi->next , rtnl_dereference(*xip));
        rcu_assign_pointer(*xip, xi);
}

static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi)
{
        struct xfrm_if __rcu **xip;
        struct xfrm_if *iter;

        for (xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)];
             (iter = rtnl_dereference(*xip)) != NULL;
             xip = &iter->next) {
                if (xi == iter) {
                        rcu_assign_pointer(*xip, xi->next);
                        break;
                }
        }
}

/* Device priv_destructor (set in xfrmi_dev_setup()): tear down the GRO cells
 * that xfrmi_dev_init() created.
 */
static void xfrmi_dev_free(struct net_device *dev)
{
        struct xfrm_if *priv = netdev_priv(dev);

        gro_cells_destroy(&priv->gro_cells);
}

/*
 * Register @dev and make it findable: a collect_md interface becomes the
 * netns' single collect_md_xfrmi, any other interface is linked into the
 * if_id hash.
 *
 * Return: 0 on success or the negative error from register_netdevice(), in
 * which case nothing has been published.
 */
static int xfrmi_create(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
        struct xfrm_if *xi = netdev_priv(dev);
        int rc;

        dev->rtnl_link_ops = &xfrmi_link_ops;

        rc = register_netdevice(dev);
        if (rc < 0)
                return rc;

        if (!xi->p.collect_md) {
                xfrmi_link(xfrmn, xi);
                return 0;
        }

        rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi);
        return 0;
}

/*
 * Control-path lookup of an interface by p->if_id in @net's hash table.
 * Chain pointers are read with rtnl_dereference(). Unlike xfrmi_lookup()
 * this neither checks IFF_UP nor considers the collect_md interface.
 *
 * Return: the interface with a matching if_id, or NULL.
 */
static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p)
{
        struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
        struct xfrm_if *cur;

        cur = rtnl_dereference(xfrmn->xfrmi[xfrmi_hash(p->if_id)]);
        while (cur) {
                if (cur->p.if_id == p->if_id)
                        return cur;
                cur = rtnl_dereference(cur->next);
        }

        return NULL;
}

/* .ndo_uninit: undo the publication done by xfrmi_create() - clear the
 * collect_md pointer or unlink the interface from the if_id hash.
 */
static void xfrmi_dev_uninit(struct net_device *dev)
{
        struct xfrm_if *xi = netdev_priv(dev);
        struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id);

        if (!xi->p.collect_md) {
                xfrmi_unlink(xfrmn, xi);
                return;
        }

        RCU_INIT_POINTER(xfrmn->collect_md_xfrmi, NULL);
}

/*
 * Reset per-packet state before @skb continues on the other side of the
 * xfrm interface.
 *
 * Always: clears the timestamp, sets pkt_type to PACKET_HOST, zeroes skb_iif
 * and ignore_df, drops the dst and resets conntrack/trace state.
 * Only when @xnet is true: additionally resets ipvs and secpath state,
 * orphans the skb and zeroes its mark.
 */
static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet)
{
        skb_clear_tstamp(skb);
        skb->pkt_type = PACKET_HOST;
        skb->skb_iif = 0;
        skb->ignore_df = 0;
        skb_dst_drop(skb);
        nf_reset_ct(skb);
        nf_reset_trace(skb);

        if (xnet) {
                ipvs_reset(skb);
                secpath_reset(skb);
                skb_orphan(skb);
                skb->mark = 0;
        }
}

/*
 * Common receive entry for both address families.
 *
 * If the skb already carries a non-empty secpath (len or olen set), it must
 * pass the inbound policy check first; otherwise it is freed and 0 is
 * returned. The SPI/tunnel control blocks are then primed for @family and
 * the packet is handed to xfrm_input().
 *
 * Return: 0 when the packet was discarded, else the xfrm_input() result.
 */
static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi,
                       int encap_type, unsigned short family)
{
        struct sec_path *sp = skb_sec_path(skb);

        if (sp && (sp->len || sp->olen)) {
                if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        XFRM_SPI_SKB_CB(skb)->family = family;
        switch (family) {
        case AF_INET:
                XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
                XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
                break;
        default:
                /* Every other family is treated as IPv6. */
                XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
                XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
                break;
        }

        return xfrm_input(skb, nexthdr, spi, encap_type);
}

/* IPv4 .handler: feed @skb to xfrmi_input() with the next header taken from
 * the IPv4 header's protocol field and with spi and encap_type both 0.
 */
static int xfrmi4_rcv(struct sk_buff *skb)
{
        int nexthdr = ip_hdr(skb)->protocol;

        return xfrmi_input(skb, nexthdr, 0, 0, AF_INET);
}

/* IPv6 .handler: feed @skb to xfrmi_input() with the next header read from
 * the network header at offset IP6CB(skb)->nhoff and with spi and encap_type
 * both 0.
 */
static int xfrmi6_rcv(struct sk_buff *skb)
{
        int nexthdr = skb_network_header(skb)[IP6CB(skb)->nhoff];

        return xfrmi_input(skb, nexthdr, 0, 0, AF_INET6);
}

/* IPv4 .input_handler: forwards all arguments to xfrmi_input() with the
 * family fixed to AF_INET.
 */
static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
{
        return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET);
}

/* IPv6 .input_handler: forwards all arguments to xfrmi_input() with the
 * family fixed to AF_INET6.
 */
static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
{
        return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6);
}

/*
 * Receive callback installed as .cb_handler on every protocol and tunnel
 * handler in this file.
 *
 * Looks up the xfrm interface matching the skb's input state, re-targets
 * skb->dev at that interface and, when @err is zero, scrubs the packet,
 * optionally attaches XFRM metadata (collect_md) and accounts the packet in
 * the interface's rx statistics.
 *
 * Return:
 *   1        no (running) xfrm interface matches the state;
 *   0        packet handled, or @err was set (only error counters bumped),
 *            or @err was set and the skb has no secpath;
 *   -EINVAL  inner mode could not be determined for an AF_UNSPEC selector;
 *   -EPERM   inbound policy check failed;
 *   -ENOMEM  metadata dst allocation failed.
 */
static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
{
        const struct xfrm_mode *inner_mode;
        struct net_device *dev;
        struct xfrm_state *x;
        struct xfrm_if *xi;
        bool xnet;
        int link;

        /* xfrm_input_state() below needs a secpath to read from. */
        if (err && !secpath_exists(skb))
                return 0;

        x = xfrm_input_state(skb);

        xi = xfrmi_lookup(xs_net(x), x);
        if (!xi)
                return 1;

        /* Remember the ingress ifindex before skb->dev is overwritten; it is
         * reported as the metadata "link" in collect_md mode below.
         */
        link = skb->dev->ifindex;
        dev = xi->dev;
        skb->dev = dev;

        if (err) {
                DEV_STATS_INC(dev, rx_errors);
                DEV_STATS_INC(dev, rx_dropped);

                return 0;
        }

        /* skb->dev now points at the xfrm device, so this compares xi->net
         * with the netns that device currently lives in.
         */
        xnet = !net_eq(xi->net, dev_net(skb->dev));

        if (xnet) {
                inner_mode = &x->inner_mode;

                /* With an AF_UNSPEC selector the inner mode is derived from
                 * the protocol recorded in the skb's mode control block.
                 */
                if (x->sel.family == AF_UNSPEC) {
                        inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
                        if (inner_mode == NULL) {
                                XFRM_INC_STATS(dev_net(skb->dev),
                                               LINUX_MIB_XFRMINSTATEMODEERROR);
                                return -EINVAL;
                        }
                }

                /* Policy is checked here, before xfrmi_scrub_packet() resets
                 * the secpath for the cross-netns case.
                 */
                if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb,
                                       inner_mode->family))
                        return -EPERM;
        }

        xfrmi_scrub_packet(skb, xnet);
        if (xi->p.collect_md) {
                struct metadata_dst *md_dst;

                md_dst = metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC);
                if (!md_dst)
                        return -ENOMEM;

                /* Expose the state's if_id and the original ingress device. */
                md_dst->u.xfrm_info.if_id = x->if_id;
                md_dst->u.xfrm_info.link = link;
                skb_dst_set(skb, (struct dst_entry *)md_dst);
        }
        dev_sw_netstats_rx_add(dev, skb->len);

        return 0;
}

/*
 * Second stage of transmit: resolve the xfrm bundle for flow @fl, enforce the
 * path MTU and hand the packet to dst_output().
 *
 * The if_id used for the lookup comes from the skb's XFRM metadata on a
 * collect_md interface (which may also override fl->flowi_oif and restore an
 * original dst), otherwise from the interface parameters.
 *
 * Return:
 *   0          the packet was passed to dst_output(); output errors are only
 *              reflected in the device counters;
 *   -EINVAL    collect_md interface but the skb carries no XFRM metadata;
 *   -EMSGSIZE  packet exceeds the MTU and an ICMP/ICMPv6 error was sent;
 *   other < 0  the lookup error, or -1 (initial @err) when the resulting dst
 *              has no xfrm state, the state's if_id differs, or the route
 *              loops back to @dev.
 * The skb is not freed here on the negative paths; the visible caller
 * xfrmi_xmit() frees it.
 */
static int
xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
{
        struct xfrm_if *xi = netdev_priv(dev);
        struct dst_entry *dst = skb_dst(skb);
        /* Captured up front: used for tx byte accounting after dst_output(). */
        unsigned int length = skb->len;
        struct net_device *tdev;
        struct xfrm_state *x;
        int err = -1;
        u32 if_id;
        int mtu;

        if (xi->p.collect_md) {
                struct xfrm_md_info *md_info = skb_xfrm_md_info(skb);

                if (unlikely(!md_info))
                        return -EINVAL;

                if_id = md_info->if_id;
                fl->flowi_oif = md_info->link;
                if (md_info->dst_orig) {
                        struct dst_entry *tmp_dst = dst;

                        /* Swap the skb's dst for the stashed original one and
                         * release the dst that was attached until now.
                         */
                        dst = md_info->dst_orig;
                        skb_dst_set(skb, dst);
                        md_info->dst_orig = NULL;
                        dst_release(tmp_dst);
                }
        } else {
                if_id = xi->p.if_id;
        }

        /* NOTE(review): the extra reference presumably balances one consumed
         * by xfrm_lookup_with_ifid() - confirm against its contract.
         */
        dst_hold(dst);
        dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id);
        if (IS_ERR(dst)) {
                err = PTR_ERR(dst);
                /* Nothing to release on the shared error path. */
                dst = NULL;
                goto tx_err_link_failure;
        }

        x = dst->xfrm;
        if (!x)
                goto tx_err_link_failure;

        if (x->if_id != if_id)
                goto tx_err_link_failure;

        tdev = dst->dev;

        if (tdev == dev) {
                DEV_STATS_INC(dev, collisions);
                net_warn_ratelimited("%s: Local routing loop detected!\n",
                                     dev->name);
                goto tx_err_dst_release;
        }

        mtu = dst_mtu(dst);
        if ((!skb_is_gso(skb) && skb->len > mtu) ||
            (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) {
                skb_dst_update_pmtu_no_confirm(skb, mtu);

                if (skb->protocol == htons(ETH_P_IPV6)) {
                        if (mtu < IPV6_MIN_MTU)
                                mtu = IPV6_MIN_MTU;

                        /* NOTE(review): 1280 looks like the IPv6 minimum MTU
                         * (IPV6_MIN_MTU) - confirm; packets at or below it
                         * are transmitted anyway.
                         */
                        if (skb->len > 1280)
                                icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        else
                                goto xmit;
                } else {
                        /* Without DF set the IPv4 packet is sent regardless. */
                        if (!(ip_hdr(skb)->frag_off & htons(IP_DF)))
                                goto xmit;
                        icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                                      htonl(mtu));
                }

                dst_release(dst);
                return -EMSGSIZE;
        }

xmit:
        xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev)));
        /* The looked-up dst reference is handed over to the skb here. */
        skb_dst_set(skb, dst);
        skb->dev = tdev;

        err = dst_output(xi->net, skb->sk, skb);
        if (net_xmit_eval(err) == 0) {
                dev_sw_netstats_tx_add(dev, 1, length);
        } else {
                DEV_STATS_INC(dev, tx_errors);
                DEV_STATS_INC(dev, tx_aborted_errors);
        }

        return 0;
tx_err_link_failure:
        DEV_STATS_INC(dev, tx_carrier_errors);
        dst_link_failure(skb);
tx_err_dst_release:
        dst_release(dst);
        return err;
}

/*
 * .ndo_start_xmit for xfrm interfaces.
 *
 * Builds the flow key from the packet, makes sure the skb has a dst (routing
 * it through this device with FLOWI_FLAG_ANYSRC if it arrived without one)
 * and passes it to xfrmi_xmit2(). Only ETH_P_IP and ETH_P_IPV6 are accepted.
 *
 * Return: always NETDEV_TX_OK. On any failure the skb is freed here and the
 * tx_errors/tx_dropped counters are incremented.
 */
static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct xfrm_if *xi = netdev_priv(dev);
        struct dst_entry *dst = skb_dst(skb);
        struct flowi fl;
        int ret;

        memset(&fl, 0, sizeof(fl));

        switch (skb->protocol) {
        case htons(ETH_P_IPV6):
                /* Clear the IPv6 control block before decoding the flow. */
                memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
                xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6);
                if (!dst) {
                        fl.u.ip6.flowi6_oif = dev->ifindex;
                        fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
                        dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6);
                        /* Failure is reported through dst->error here, so the
                         * returned dst still has to be released.
                         */
                        if (dst->error) {
                                dst_release(dst);
                                DEV_STATS_INC(dev, tx_carrier_errors);
                                goto tx_err;
                        }
                        skb_dst_set(skb, dst);
                }
                break;
        case htons(ETH_P_IP):
                /* Clear the IPv4 control block before decoding the flow. */
                memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET);
                if (!dst) {
                        struct rtable *rt;

                        fl.u.ip4.flowi4_oif = dev->ifindex;
                        fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
                        rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4);
                        if (IS_ERR(rt)) {
                                DEV_STATS_INC(dev, tx_carrier_errors);
                                goto tx_err;
                        }
                        skb_dst_set(skb, &rt->dst);
                }
                break;
        default:
                goto tx_err;
        }

        /* Overrides any oif set above with the configured underlying link;
         * xfrmi_xmit2() may replace it again for collect_md interfaces.
         */
        fl.flowi_oif = xi->p.link;

        ret = xfrmi_xmit2(skb, dev, &fl);
        if (ret < 0)
                goto tx_err;

        return NETDEV_TX_OK;

tx_err:
        DEV_STATS_INC(dev, tx_errors);
        DEV_STATS_INC(dev, tx_dropped);
        kfree_skb(skb);
        return NETDEV_TX_OK;
}

/*
 * IPv4 .err_handler: react to an ICMP error that refers to an ESP, AH or
 * IPComp packet belonging to an xfrm interface.
 *
 * Only ICMP_DEST_UNREACH/ICMP_FRAG_NEEDED and ICMP_REDIRECT are acted on,
 * by updating the path MTU or the route respectively.
 *
 * Return: 0 when the error was handled or is not of interest (other
 * protocol, other ICMP type/code, unknown state); -1 when a state exists but
 * no xfrm interface matches it.
 */
static int xfrmi4_err(struct sk_buff *skb, u32 info)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        /* The IPsec header follows the (variable length) IPv4 header. */
        const void *xhdr = skb->data + (iph->ihl << 2);
        struct net *net = dev_net(skb->dev);
        int protocol = iph->protocol;
        struct xfrm_state *x;
        __be32 spi;

        switch (protocol) {
        case IPPROTO_ESP:
                spi = ((const struct ip_esp_hdr *)xhdr)->spi;
                break;
        case IPPROTO_AH:
                spi = ((const struct ip_auth_hdr *)xhdr)->spi;
                break;
        case IPPROTO_COMP:
                /* The 16-bit CPI is widened into a network-order SPI. */
                spi = htonl(ntohs(((const struct ip_comp_hdr *)xhdr)->cpi));
                break;
        default:
                return 0;
        }

        switch (icmp_hdr(skb)->type) {
        case ICMP_REDIRECT:
                break;
        case ICMP_DEST_UNREACH:
                if (icmp_hdr(skb)->code == ICMP_FRAG_NEEDED)
                        break;
                return 0;
        default:
                return 0;
        }

        x = xfrm_state_lookup(net, skb->mark,
                              (const xfrm_address_t *)&iph->daddr,
                              spi, protocol, AF_INET);
        if (!x)
                return 0;

        if (!xfrmi_lookup(net, x)) {
                xfrm_state_put(x);
                return -1;
        }

        if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH)
                ipv4_redirect(skb, net, 0, protocol);
        else
                ipv4_update_pmtu(skb, net, info, 0, protocol);
        xfrm_state_put(x);

        return 0;
}

/*
 * IPv6 .err_handler: react to an ICMPv6 error that refers to an ESP, AH or
 * IPComp packet belonging to an xfrm interface. The IPsec header is read at
 * skb->data + @offset.
 *
 * Only ICMPV6_PKT_TOOBIG and NDISC_REDIRECT are acted on, by updating the
 * path MTU or the route respectively.
 *
 * Return: 0 when the error was handled or is not of interest (other
 * protocol, other ICMPv6 type, unknown state); -1 when a state exists but no
 * xfrm interface matches it.
 */
static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                    u8 type, u8 code, int offset, __be32 info)
{
        const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data;
        const void *xhdr = skb->data + offset;
        struct net *net = dev_net(skb->dev);
        int protocol = ip6h->nexthdr;
        struct xfrm_state *x;
        __be32 spi;

        switch (protocol) {
        case IPPROTO_ESP:
                spi = ((const struct ip_esp_hdr *)xhdr)->spi;
                break;
        case IPPROTO_AH:
                spi = ((const struct ip_auth_hdr *)xhdr)->spi;
                break;
        case IPPROTO_COMP:
                /* The 16-bit CPI is widened into a network-order SPI. */
                spi = htonl(ntohs(((const struct ip_comp_hdr *)xhdr)->cpi));
                break;
        default:
                return 0;
        }

        if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT)
                return 0;

        x = xfrm_state_lookup(net, skb->mark,
                              (const xfrm_address_t *)&ip6h->daddr,
                              spi, protocol, AF_INET6);
        if (!x)
                return 0;

        if (!xfrmi_lookup(net, x)) {
                xfrm_state_put(x);
                return -1;
        }

        if (type != NDISC_REDIRECT)
                ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
        else
                ip6_redirect(skb, net, skb->dev->ifindex, 0,
                             sock_net_uid(net, NULL));
        xfrm_state_put(x);

        return 0;
}

/*
 * Apply new parameters to @xi. Only if_id may change; a differing link is
 * rejected.
 *
 * Return: 0 on success, -EINVAL if p->link differs from the current link.
 */
static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p)
{
        if (p->link != xi->p.link)
                return -EINVAL;

        xi->p.if_id = p->if_id;
        return 0;
}

/*
 * Re-key @xi in the if_id hash: unlink it, wait with synchronize_net(),
 * apply @p, then link it again under its (possibly new) if_id and notify
 * about the device state change.
 *
 * The interface is re-linked and the notification sent even when
 * xfrmi_change() fails.
 *
 * Return: the xfrmi_change() result.
 */
static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p)
{
        struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id);
        int rc;

        xfrmi_unlink(xfrmn, xi);
        synchronize_net();

        rc = xfrmi_change(xi, p);

        xfrmi_link(xfrmn, xi);
        netdev_state_change(xi->dev);

        return rc;
}

/* .ndo_get_iflink: report the configured underlying link, read with
 * READ_ONCE().
 */
static int xfrmi_get_iflink(const struct net_device *dev)
{
        struct xfrm_if *priv = netdev_priv(dev);
        int iflink = READ_ONCE(priv->p.link);

        return iflink;
}

/* Net device operations of an xfrm interface. The address of this table also
 * serves as the "is this an xfrm interface?" test in xfrmi_decode_session().
 */
static const struct net_device_ops xfrmi_netdev_ops = {
        .ndo_init        = xfrmi_dev_init,
        .ndo_uninit        = xfrmi_dev_uninit,
        .ndo_start_xmit = xfrmi_xmit,
        .ndo_get_stats64 = dev_get_tstats64,
        .ndo_get_iflink = xfrmi_get_iflink,
};

/*
 * rtnl_link_ops .setup: static defaults for a new xfrm interface - an
 * ARPHRD_NONE, IFF_NOARP device with Ethernet-sized default MTU, per-cpu
 * tstats, dst kept on transmit and an all-ones broadcast address.
 */
static void xfrmi_dev_setup(struct net_device *dev)
{
        /* Operations and link-layer identity. */
        dev->netdev_ops = &xfrmi_netdev_ops;
        dev->header_ops = &ip_tunnel_header_ops;
        dev->type = ARPHRD_NONE;
        dev->flags = IFF_NOARP;

        /* MTU default and limits. */
        dev->mtu = ETH_DATA_LEN;
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = IP_MAX_MTU;

        /* Teardown and statistics handling. */
        dev->priv_destructor = xfrmi_dev_free;
        dev->needs_free_netdev = true;
        dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;

        netif_keep_dst(dev);
        eth_broadcast_addr(dev->broadcast);
}

/* Feature bits that xfrmi_dev_init() sets in both dev->features and
 * dev->hw_features.
 */
#define XFRMI_FEATURES (NETIF_F_SG |                \
                        NETIF_F_FRAGLIST |        \
                        NETIF_F_GSO_SOFTWARE |        \
                        NETIF_F_HW_CSUM)

/*
 * .ndo_init: per-device initialisation run at registration.
 *
 * Sets up GRO cells and feature flags. When the configured link resolves to
 * a device in xi->net, headroom/tailroom are copied from it and the hardware
 * and broadcast addresses are inherited where still all-zero; otherwise a
 * random hardware address and an all-ones broadcast address are assigned.
 *
 * Return: 0 on success or the gro_cells_init() error.
 */
static int xfrmi_dev_init(struct net_device *dev)
{
        struct xfrm_if *xi = netdev_priv(dev);
        struct net_device *lower = __dev_get_by_index(xi->net, xi->p.link);
        int rc;

        rc = gro_cells_init(&xi->gro_cells, dev);
        if (rc)
                return rc;

        dev->features |= NETIF_F_LLTX | XFRMI_FEATURES;
        dev->hw_features |= XFRMI_FEATURES;

        if (!lower) {
                eth_hw_addr_random(dev);
                eth_broadcast_addr(dev->broadcast);
                return 0;
        }

        dev->needed_headroom = lower->needed_headroom;
        dev->needed_tailroom = lower->needed_tailroom;

        if (is_zero_ether_addr(dev->dev_addr))
                eth_hw_addr_inherit(dev, lower);
        if (is_zero_ether_addr(dev->broadcast))
                memcpy(dev->broadcast, lower->broadcast, dev->addr_len);

        return 0;
}

/* rtnl_link_ops .validate: accepts every request (always returns 0); the
 * parameter checks live in xfrmi_newlink() and xfrmi_changelink().
 */
static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[],
                         struct netlink_ext_ack *extack)
{
        return 0;
}

/*
 * Translate IFLA_XFRM_* attributes into @parms. @parms is zeroed first, so
 * any attribute that is absent (or a NULL @data) leaves its field at
 * zero/false.
 */
static void xfrmi_netlink_parms(struct nlattr *data[],
                               struct xfrm_if_parms *parms)
{
        const struct nlattr *attr;

        memset(parms, 0, sizeof(*parms));
        if (!data)
                return;

        attr = data[IFLA_XFRM_LINK];
        if (attr)
                parms->link = nla_get_u32(attr);

        attr = data[IFLA_XFRM_IF_ID];
        if (attr)
                parms->if_id = nla_get_u32(attr);

        /* Flag attribute: its mere presence enables collect_md. */
        if (data[IFLA_XFRM_COLLECT_METADATA])
                parms->collect_md = true;
}

/*
 * rtnl_link_ops .newlink: validate the requested parameters and create the
 * interface.
 *
 * A collect_md interface must have link and if_id both zero and may exist
 * only once per netns. Any other interface needs a non-zero if_id that is
 * not already used in the netns.
 *
 * Return: 0 on success, -EINVAL for invalid parameters, -EEXIST on a
 * conflict, or the xfrmi_create() error.
 */
static int xfrmi_newlink(struct net *src_net, struct net_device *dev,
                        struct nlattr *tb[], struct nlattr *data[],
                        struct netlink_ext_ack *extack)
{
        struct net *net = dev_net(dev);
        struct xfrm_if_parms p = {};
        struct xfrm_if *xi;

        xfrmi_netlink_parms(data, &p);

        if (!p.collect_md) {
                if (!p.if_id) {
                        NL_SET_ERR_MSG(extack, "if_id must be non zero");
                        return -EINVAL;
                }

                if (xfrmi_locate(net, &p))
                        return -EEXIST;
        } else {
                struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);

                if (p.link || p.if_id) {
                        NL_SET_ERR_MSG(extack, "link and if_id must be zero");
                        return -EINVAL;
                }

                if (rtnl_dereference(xfrmn->collect_md_xfrmi))
                        return -EEXIST;
        }

        xi = netdev_priv(dev);
        xi->p = p;
        xi->net = net;
        xi->dev = dev;

        return xfrmi_create(dev);
}

/* rtnl_link_ops .dellink: queue @dev on @head for unregistration. */
static void xfrmi_dellink(struct net_device *dev, struct list_head *head)
{
        unregister_netdevice_queue(dev, head);
}

/*
 * rtnl_link_ops .changelink: change the if_id of an existing interface.
 *
 * collect_md is a creation-time property: the request is rejected both when
 * it asks for collect_md and when @dev itself is the collect_md interface.
 * The latter has to be tested on @dev's own private data. The collect_md
 * interface is never placed in the if_id hash (see xfrmi_create()), so
 * xfrmi_locate() cannot return it and a check on the located entry would
 * never fire. Letting such a request through would make xfrmi_update() link
 * the collect_md interface into the hash while collect_md_xfrmi still points
 * at it, and xfrmi_exit_batch_rtnl() would then queue the same device for
 * unregistration twice.
 *
 * Return: 0 on success; -EINVAL for a zero if_id, a collect_md request or a
 * collect_md device (also propagated from xfrmi_update() when the link
 * differs); -EEXIST when another interface already uses the requested if_id.
 */
static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[],
                           struct nlattr *data[],
                           struct netlink_ext_ack *extack)
{
        struct xfrm_if *xi = netdev_priv(dev);
        struct net *net = xi->net;
        struct xfrm_if_parms p = {};

        xfrmi_netlink_parms(data, &p);
        if (!p.if_id) {
                NL_SET_ERR_MSG(extack, "if_id must be non zero");
                return -EINVAL;
        }

        /* xi still refers to @dev here, so this covers the existing device. */
        if (p.collect_md || xi->p.collect_md) {
                NL_SET_ERR_MSG(extack, "collect_md can't be changed");
                return -EINVAL;
        }

        xi = xfrmi_locate(net, &p);
        if (!xi) {
                /* Requested if_id is unused: re-key @dev itself. */
                xi = netdev_priv(dev);
        } else {
                /* if_id already taken by a different interface. */
                if (xi->dev != dev)
                        return -EEXIST;
        }

        return xfrmi_update(xi, &p);
}

/* rtnl_link_ops .get_size: netlink space needed by xfrmi_fill_info(), sized
 * for all three attributes.
 */
static size_t xfrmi_get_size(const struct net_device *dev)
{
        size_t size = 0;

        size += nla_total_size(4);      /* IFLA_XFRM_LINK */
        size += nla_total_size(4);      /* IFLA_XFRM_IF_ID */
        size += nla_total_size(0);      /* IFLA_XFRM_COLLECT_METADATA */

        return size;
}

/*
 * rtnl_link_ops .fill_info: dump link and if_id, plus the collect_md flag
 * when it is set, as IFLA_XFRM_* attributes into @skb.
 *
 * Return: 0 on success, -EMSGSIZE if an attribute could not be added.
 */
static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
        struct xfrm_if *xi = netdev_priv(dev);
        struct xfrm_if_parms *cfg = &xi->p;

        if (nla_put_u32(skb, IFLA_XFRM_LINK, cfg->link))
                return -EMSGSIZE;

        if (nla_put_u32(skb, IFLA_XFRM_IF_ID, cfg->if_id))
                return -EMSGSIZE;

        if (cfg->collect_md &&
            nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA))
                return -EMSGSIZE;

        return 0;
}

/* rtnl_link_ops .get_link_net: report the netns stored in the interface
 * (xi->net), read with READ_ONCE().
 */
static struct net *xfrmi_get_link_net(const struct net_device *dev)
{
        struct xfrm_if *priv = netdev_priv(dev);
        struct net *link_net = READ_ONCE(priv->net);

        return link_net;
}

/* Netlink attribute policy for IFLA_XFRM_*: link and if_id are u32, collect
 * metadata is a flag.
 * NOTE(review): the strict_start_type entry presumably enables strict
 * validation from IFLA_XFRM_COLLECT_METADATA onwards - confirm against the
 * nla_policy documentation.
 */
static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = {
        [IFLA_XFRM_UNSPEC]                = { .strict_start_type = IFLA_XFRM_COLLECT_METADATA },
        [IFLA_XFRM_LINK]                = { .type = NLA_U32 },
        [IFLA_XFRM_IF_ID]                = { .type = NLA_U32 },
        [IFLA_XFRM_COLLECT_METADATA]        = { .type = NLA_FLAG },
};

/* rtnetlink link type "xfrm"; registered by xfrmi_init() and assigned to each
 * device in xfrmi_create(). The device private area is a struct xfrm_if.
 */
static struct rtnl_link_ops xfrmi_link_ops __read_mostly = {
        .kind                = "xfrm",
        .maxtype        = IFLA_XFRM_MAX,
        .policy                = xfrmi_policy,
        .priv_size        = sizeof(struct xfrm_if),
        .setup                = xfrmi_dev_setup,
        .validate        = xfrmi_validate,
        .newlink        = xfrmi_newlink,
        .dellink        = xfrmi_dellink,
        .changelink        = xfrmi_changelink,
        .get_size        = xfrmi_get_size,
        .fill_info        = xfrmi_fill_info,
        .get_link_net        = xfrmi_get_link_net,
};

/*
 * pernet .exit_batch_rtnl: for every netns on @net_exit_list, queue all of
 * its xfrm interfaces - every entry of every hash chain plus the collect_md
 * interface, if any - on @dev_to_kill for unregistration.
 *
 * Asserts that RTNL is held.
 */
static void __net_exit xfrmi_exit_batch_rtnl(struct list_head *net_exit_list,
                                             struct list_head *dev_to_kill)
{
        struct net *net;

        ASSERT_RTNL();
        list_for_each_entry(net, net_exit_list, exit_list) {
                struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
                struct xfrm_if *xi;
                int bucket;

                for (bucket = 0; bucket < XFRMI_HASH_SIZE; bucket++) {
                        xi = rtnl_dereference(xfrmn->xfrmi[bucket]);
                        while (xi) {
                                unregister_netdevice_queue(xi->dev,
                                                           dev_to_kill);
                                xi = rtnl_dereference(xi->next);
                        }
                }

                xi = rtnl_dereference(xfrmn->collect_md_xfrmi);
                if (xi)
                        unregister_netdevice_queue(xi->dev, dev_to_kill);
        }
}

/* Per-netns registration: reserves a struct xfrmi_net per namespace
 * (reachable via net_generic() with xfrmi_net_id) and tears interfaces down
 * on namespace exit.
 */
static struct pernet_operations xfrmi_net_ops = {
        .exit_batch_rtnl = xfrmi_exit_batch_rtnl,
        .id   = &xfrmi_net_id,
        .size = sizeof(struct xfrmi_net),
};

/* IPv6 ESP handler (registered for IPPROTO_ESP in xfrmi6_init()); receive
 * goes through this file's xfrmi6_rcv()/xfrmi6_input() wrappers.
 */
static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = {
        .handler        =        xfrmi6_rcv,
        .input_handler        =        xfrmi6_input,
        .cb_handler        =        xfrmi_rcv_cb,
        .err_handler        =        xfrmi6_err,
        .priority        =        10,
};

/* IPv6 AH handler (registered for IPPROTO_AH in xfrmi6_init()); uses the
 * generic xfrm6_rcv()/xfrm_input() entry points together with this file's
 * receive callback and error handler.
 */
static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = {
        .handler        =        xfrm6_rcv,
        .input_handler        =        xfrm_input,
        .cb_handler        =        xfrmi_rcv_cb,
        .err_handler        =        xfrmi6_err,
        .priority        =        10,
};

/* IPv6 IPComp handler (registered for IPPROTO_COMP in xfrmi6_init()); uses
 * the generic xfrm6_rcv()/xfrm_input() entry points together with this
 * file's receive callback and error handler.
 */
static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = {
        .handler        =        xfrm6_rcv,
        .input_handler        =        xfrm_input,
        .cb_handler        =        xfrmi_rcv_cb,
        .err_handler        =        xfrmi6_err,
        .priority        =        10,
};

#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
/* IPv6 tunnel .handler: look up the tunnel SPI for the packet's IPv6 source
 * address and pass the packet on to xfrm6_rcv_spi() as IPPROTO_IPV6.
 */
static int xfrmi6_rcv_tunnel(struct sk_buff *skb)
{
        const xfrm_address_t *src =
                (const xfrm_address_t *)&ipv6_hdr(skb)->saddr;
        __be32 spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), src);

        return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL);
}

/* xfrm6 tunnel handler registered for AF_INET6 in xfrmi6_init(). */
static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = {
        .handler        =        xfrmi6_rcv_tunnel,
        .cb_handler        =        xfrmi_rcv_cb,
        .err_handler        =        xfrmi6_err,
        .priority        =        2,
};

/* xfrm6 tunnel handler registered for AF_INET in xfrmi6_init(). */
static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = {
        .handler        =        xfrmi6_rcv_tunnel,
        .cb_handler        =        xfrmi_rcv_cb,
        .err_handler        =        xfrmi6_err,
        .priority        =        2,
};
#endif

/* IPv4 ESP handler (registered for IPPROTO_ESP in xfrmi4_init()); receive
 * goes through this file's xfrmi4_rcv()/xfrmi4_input() wrappers.
 */
static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = {
        .handler        =        xfrmi4_rcv,
        .input_handler        =        xfrmi4_input,
        .cb_handler        =        xfrmi_rcv_cb,
        .err_handler        =        xfrmi4_err,
        .priority        =        10,
};

/* IPv4 AH handler (registered for IPPROTO_AH in xfrmi4_init()); uses the
 * generic xfrm4_rcv()/xfrm_input() entry points together with this file's
 * receive callback and error handler.
 */
static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = {
        .handler        =        xfrm4_rcv,
        .input_handler        =        xfrm_input,
        .cb_handler        =        xfrmi_rcv_cb,
        .err_handler        =        xfrmi4_err,
        .priority        =        10,
};

/* IPv4 IPComp handler (registered for IPPROTO_COMP in xfrmi4_init()); uses
 * the generic xfrm4_rcv()/xfrm_input() entry points together with this
 * file's receive callback and error handler.
 */
static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = {
        .handler        =        xfrm4_rcv,
        .input_handler        =        xfrm_input,
        .cb_handler        =        xfrmi_rcv_cb,
        .err_handler        =        xfrmi4_err,
        .priority        =        10,
};

#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL)
static int xfrmi4_rcv_tunnel(struct sk_buff *skb)
{
        return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr);
}

/* xfrm4 tunnel handler registered for AF_INET in xfrmi4_init(). */
static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = {
        .handler        =        xfrmi4_rcv_tunnel,
        .cb_handler        =        xfrmi_rcv_cb,
        .err_handler        =        xfrmi4_err,
        .priority        =        3,
};

/* xfrm4 tunnel handler registered for AF_INET6 in xfrmi4_init(). */
static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = {
        .handler        =        xfrmi4_rcv_tunnel,
        .cb_handler        =        xfrmi_rcv_cb,
        .err_handler        =        xfrmi4_err,
        .priority        =        2,
};
#endif

/*
 * Register the IPv4 ESP, AH and IPComp protocol handlers and, when
 * CONFIG_INET_XFRM_TUNNEL is reachable, the two IPv4 tunnel handlers.
 *
 * If a registration fails, everything registered before it is deregistered
 * in reverse order via the label ladder at the end.
 *
 * Return: 0 on success or the negative error of the failing registration.
 */
static int __init xfrmi4_init(void)
{
        int err;

        err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP);
        if (err < 0)
                goto xfrm_proto_esp_failed;
        err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH);
        if (err < 0)
                goto xfrm_proto_ah_failed;
        err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP);
        if (err < 0)
                goto xfrm_proto_comp_failed;
#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL)
        err = xfrm4_tunnel_register(&xfrmi_ipip_handler, AF_INET);
        if (err < 0)
                goto xfrm_tunnel_ipip_failed;
        err = xfrm4_tunnel_register(&xfrmi_ipip6_handler, AF_INET6);
        if (err < 0)
                goto xfrm_tunnel_ipip6_failed;
#endif

        return 0;

        /* Each label undoes the step registered just before the one that
         * failed, then falls through to undo the earlier ones.
         */
#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL)
xfrm_tunnel_ipip6_failed:
        xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET);
xfrm_tunnel_ipip_failed:
        xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP);
#endif
xfrm_proto_comp_failed:
        xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH);
xfrm_proto_ah_failed:
        xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP);
xfrm_proto_esp_failed:
        return err;
}

/* Deregister everything xfrmi4_init() registered, in reverse order. */
static void xfrmi4_fini(void)
{
#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL)
        xfrm4_tunnel_deregister(&xfrmi_ipip6_handler, AF_INET6);
        xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET);
#endif
        xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP);
        xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH);
        xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP);
}

/*
 * Register the IPv6 ESP, AH and IPComp protocol handlers and, when
 * CONFIG_INET6_XFRM_TUNNEL is reachable, the two IPv6 tunnel handlers.
 *
 * If a registration fails, everything registered before it is deregistered
 * in reverse order via the label ladder at the end.
 *
 * Return: 0 on success or the negative error of the failing registration.
 */
static int __init xfrmi6_init(void)
{
        int err;

        err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP);
        if (err < 0)
                goto xfrm_proto_esp_failed;
        err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH);
        if (err < 0)
                goto xfrm_proto_ah_failed;
        err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP);
        if (err < 0)
                goto xfrm_proto_comp_failed;
#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
        err = xfrm6_tunnel_register(&xfrmi_ipv6_handler, AF_INET6);
        if (err < 0)
                goto xfrm_tunnel_ipv6_failed;
        err = xfrm6_tunnel_register(&xfrmi_ip6ip_handler, AF_INET);
        if (err < 0)
                goto xfrm_tunnel_ip6ip_failed;
#endif

        return 0;

        /* Each label undoes the step registered just before the one that
         * failed, then falls through to undo the earlier ones.
         */
#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
xfrm_tunnel_ip6ip_failed:
        xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6);
xfrm_tunnel_ipv6_failed:
        xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP);
#endif
xfrm_proto_comp_failed:
        xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH);
xfrm_proto_ah_failed:
        xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP);
xfrm_proto_esp_failed:
        return err;
}

/*
 * Deregister everything xfrmi6_init() registered, in the reverse order:
 * the xfrm6 tunnel handlers (when CONFIG_INET6_XFRM_TUNNEL is reachable),
 * then the IPcomp, AH and ESP xfrm6 protocol handlers.
 */
static void xfrmi6_fini(void)
{
#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
        xfrm6_tunnel_deregister(&xfrmi_ip6ip_handler, AF_INET);
        xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6);
#endif
        xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP);
        xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH);
        xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP);
}

/*
 * Callback table passed to xfrm_if_register_cb() from xfrmi_init(); it
 * supplies xfrmi_decode_session as the decode_session hook.
 */
static const struct xfrm_if_cb xfrm_if_cb = {
        .decode_session =        xfrmi_decode_session,
};

/*
 * Module init: register, in order, the per-netns device ops, the xfrm4 and
 * xfrm6 handlers, the rtnl link ops and the BPF support, then hook up the
 * lwtunnel encap ops and the xfrm_if callback.
 *
 * On failure every step that already succeeded is undone in reverse order,
 * the step that failed is logged through @msg, and the negative error code
 * is returned.
 */
static int __init xfrmi_init(void)
{
        const char *msg;
        int err;

        pr_info("IPsec XFRM device driver\n");

        msg = "tunnel device";
        err = register_pernet_device(&xfrmi_net_ops);
        if (err < 0)
                goto pernet_dev_failed;

        msg = "xfrm4 protocols";
        err = xfrmi4_init();
        if (err < 0)
                goto xfrmi4_failed;

        msg = "xfrm6 protocols";
        err = xfrmi6_init();
        if (err < 0)
                goto xfrmi6_failed;

        msg = "netlink interface";
        err = rtnl_link_register(&xfrmi_link_ops);
        if (err < 0)
                goto rtnl_link_failed;

        /*
         * Update msg for this step too; otherwise a failure here would be
         * reported as "failed to register netlink interface".
         */
        msg = "bpf kfuncs";
        err = register_xfrm_interface_bpf();
        if (err < 0)
                goto kfunc_failed;

        lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM);

        xfrm_if_register_cb(&xfrm_if_cb);

        return err;

        /* Error unwind: undo the successful steps in reverse order. */
kfunc_failed:
        rtnl_link_unregister(&xfrmi_link_ops);
rtnl_link_failed:
        xfrmi6_fini();
xfrmi6_failed:
        xfrmi4_fini();
xfrmi4_failed:
        unregister_pernet_device(&xfrmi_net_ops);
pernet_dev_failed:
        pr_err("xfrmi init: failed to register %s\n", msg);
        return err;
}

/*
 * Module exit: unhook the xfrm_if callback and the lwtunnel encap ops,
 * unregister the rtnl link ops, drop the xfrm4 and xfrm6 handlers and
 * finally unregister the per-netns device ops.
 */
static void __exit xfrmi_fini(void)
{
        xfrm_if_unregister_cb();
        lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM);
        rtnl_link_unregister(&xfrmi_link_ops);
        xfrmi4_fini();
        xfrmi6_fini();
        unregister_pernet_device(&xfrmi_net_ops);
}

/* Module entry/exit hooks, aliases and metadata. */
module_init(xfrmi_init);
module_exit(xfrmi_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("xfrm");
MODULE_ALIAS_NETDEV("xfrm0");
MODULE_AUTHOR("Steffen Klassert");
MODULE_DESCRIPTION("XFRM virtual interface");




































    3 

















    3 




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/* SPDX-License-Identifier: GPL-2.0 */
/* Freezer declarations */

#ifndef FREEZER_H_INCLUDED
#define FREEZER_H_INCLUDED

#include <linux/debug_locks.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>

#ifdef CONFIG_FREEZER
DECLARE_STATIC_KEY_FALSE(freezer_active);

extern bool pm_freezing;                /* PM freezing in effect */
extern bool pm_nosig_freezing;                /* PM nosig freezing in effect */

/*
 * Timeout for stopping processes
 */
extern unsigned int freeze_timeout_msecs;

/*
 * Check if a process has been frozen
 */
extern bool frozen(struct task_struct *p);

extern bool freezing_slow_path(struct task_struct *p);

/*
 * Check if there is a request to freeze a process
 */
static inline bool freezing(struct task_struct *p)
{
        /* With the freezer_active key off the answer is always "no". */
        if (!static_branch_unlikely(&freezer_active))
                return false;

        /* The key is on: let the out-of-line check decide for @p. */
        return freezing_slow_path(p);
}

/* Takes and releases task alloc lock using task_lock() */
extern void __thaw_task(struct task_struct *t);

extern bool __refrigerator(bool check_kthr_stop);
extern int freeze_processes(void);
extern int freeze_kernel_threads(void);
extern void thaw_processes(void);
extern void thaw_kernel_threads(void);

/*
 * Enter the refrigerator if the current task has a pending freeze request.
 *
 * Returns false immediately when freezing(current) is false; otherwise
 * returns the result of __refrigerator(false). May sleep (might_sleep()).
 */
static inline bool try_to_freeze(void)
{
        might_sleep();
        if (likely(!freezing(current)))
                return false;
        /* The held-locks debug check is skipped for PF_NOFREEZE tasks. */
        if (!(current->flags & PF_NOFREEZE))
                debug_check_no_locks_held();
        return __refrigerator(false);
}

extern bool freeze_task(struct task_struct *p);
extern bool set_freezable(void);

#ifdef CONFIG_CGROUP_FREEZER
extern bool cgroup_freezing(struct task_struct *task);
#else /* !CONFIG_CGROUP_FREEZER */
/* CONFIG_CGROUP_FREEZER is off: this stub always reports false. */
static inline bool cgroup_freezing(struct task_struct *task) { return false; }
#endif /* !CONFIG_CGROUP_FREEZER */

#else /* !CONFIG_FREEZER */
/*
 * CONFIG_FREEZER is off: inline stubs for the freezer API.
 * The predicates report false, the thaw helpers do nothing and the
 * freeze_*() entry points return -ENOSYS.
 */
static inline bool frozen(struct task_struct *p) { return false; }
static inline bool freezing(struct task_struct *p) { return false; }
static inline void __thaw_task(struct task_struct *t) {}

static inline bool __refrigerator(bool check_kthr_stop) { return false; }
static inline int freeze_processes(void) { return -ENOSYS; }
static inline int freeze_kernel_threads(void) { return -ENOSYS; }
static inline void thaw_processes(void) {}
static inline void thaw_kernel_threads(void) {}

static inline bool try_to_freeze(void) { return false; }

/*
 * CONFIG_FREEZER is off: stub for set_freezable().
 * Returns bool, like the CONFIG_FREEZER prototype in this header, so callers
 * that use the result build in both configurations. The value is false,
 * matching the try_to_freeze() and __refrigerator() stubs.
 */
static inline bool set_freezable(void) { return false; }

#endif /* !CONFIG_FREEZER */

#endif        /* FREEZER_H_INCLUDED */





































    2 








    3 
    2 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
#ifndef _LINUX_RSEQ_H
#define _LINUX_RSEQ_H

#ifdef CONFIG_RSEQ

#include <linux/preempt.h>
#include <linux/sched.h>

/*
 * Bit positions used in the per-task rseq event mask.
 *
 * Each position is defined as the matching RSEQ_CS_FLAG_NO_RESTART_ON_*_BIT
 * of the user-space ABI enum rseq_cs_flags, so the event mask can be checked
 * directly against those flags.
 */
enum rseq_event_mask_bits {
        RSEQ_EVENT_PREEMPT_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
        RSEQ_EVENT_SIGNAL_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
        RSEQ_EVENT_MIGRATE_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
};

/* Mask form (1U << bit) of each enum rseq_event_mask_bits position. */
enum rseq_event_mask {
        RSEQ_EVENT_PREEMPT        = (1U << RSEQ_EVENT_PREEMPT_BIT),
        RSEQ_EVENT_SIGNAL        = (1U << RSEQ_EVENT_SIGNAL_BIT),
        RSEQ_EVENT_MIGRATE        = (1U << RSEQ_EVENT_MIGRATE_BIT),
};

/* Flag @t with TIF_NOTIFY_RESUME, but only if @t->rseq is set. */
static inline void rseq_set_notify_resume(struct task_struct *t)
{
        if (!t->rseq)
                return;

        set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
}

void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);

/*
 * Call __rseq_handle_notify_resume() for the current task if current->rseq
 * is set; otherwise do nothing.
 */
static inline void rseq_handle_notify_resume(struct ksignal *ksig,
                                             struct pt_regs *regs)
{
        if (!current->rseq)
                return;

        __rseq_handle_notify_resume(ksig, regs);
}

/*
 * Record a signal-delivery event for the current task and run the rseq
 * notify-resume handling for it.
 */
static inline void rseq_signal_deliver(struct ksignal *ksig,
                                       struct pt_regs *regs)
{
        /*
         * __set_bit() is the non-atomic variant, hence the preempt-disabled
         * section. NOTE(review): presumably this keeps the update from
         * racing with rseq_preempt()/rseq_migrate() on the same mask -
         * confirm.
         */
        preempt_disable();
        __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
        preempt_enable();
        rseq_handle_notify_resume(ksig, regs);
}

/*
 * Record a preemption event in @t->rseq_event_mask and request
 * notify-resume handling for @t.
 *
 * rseq_preempt() requires preemption to be disabled.
 */
static inline void rseq_preempt(struct task_struct *t)
{
        __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
        rseq_set_notify_resume(t);
}

/*
 * Record a migration event in @t->rseq_event_mask and request
 * notify-resume handling for @t.
 *
 * rseq_migrate() requires preemption to be disabled.
 */
static inline void rseq_migrate(struct task_struct *t)
{
        __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
        rseq_set_notify_resume(t);
}

/*
 * Set up the rseq state of a newly created task @t.
 *
 * Without CLONE_VM the child copies the rseq pointer, length, signature and
 * event mask of the current task. With CLONE_VM the child starts with rseq
 * unregistered (all four fields cleared).
 */
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
        if (!(clone_flags & CLONE_VM)) {
                /* Inherit the registration of the current task. */
                t->rseq = current->rseq;
                t->rseq_len = current->rseq_len;
                t->rseq_sig = current->rseq_sig;
                t->rseq_event_mask = current->rseq_event_mask;
                return;
        }

        /* CLONE_VM: start with no registration. */
        t->rseq = NULL;
        t->rseq_len = 0;
        t->rseq_sig = 0;
        t->rseq_event_mask = 0;
}

/*
 * Clear the rseq registration of @t: pointer, length, signature and
 * pending event mask are all reset.
 */
static inline void rseq_execve(struct task_struct *t)
{
        t->rseq = NULL;
        t->rseq_len = 0;
        t->rseq_sig = 0;
        t->rseq_event_mask = 0;
}

#else

/* CONFIG_RSEQ is off: every rseq hook below is an empty inline function. */
static inline void rseq_set_notify_resume(struct task_struct *t)
{
}
static inline void rseq_handle_notify_resume(struct ksignal *ksig,
                                             struct pt_regs *regs)
{
}
static inline void rseq_signal_deliver(struct ksignal *ksig,
                                       struct pt_regs *regs)
{
}
static inline void rseq_preempt(struct task_struct *t)
{
}
static inline void rseq_migrate(struct task_struct *t)
{
}
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
}
static inline void rseq_execve(struct task_struct *t)
{
}

#endif

#ifdef CONFIG_DEBUG_RSEQ

void rseq_syscall(struct pt_regs *regs);

#else

/* CONFIG_DEBUG_RSEQ is off: rseq_syscall() is an empty inline function. */
static inline void rseq_syscall(struct pt_regs *regs)
{
}

#endif

#endif /* _LINUX_RSEQ_H */















































































































































































































































































































































































































































































































































































































































































































    2 














    2 














































































































































































































































































    2 











































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Linux INET6 implementation
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 */

#ifndef _NET_IPV6_H
#define _NET_IPV6_H

#include <linux/ipv6.h>
#include <linux/hardirq.h>
#include <linux/jhash.h>
#include <linux/refcount.h>
#include <linux/jump_label_ratelimit.h>
#include <net/if_inet6.h>
#include <net/flow.h>
#include <net/flow_dissector.h>
#include <net/inet_dscp.h>
#include <net/snmp.h>
#include <net/netns/hash.h>

struct ip_tunnel_info;

#define SIN6_LEN_RFC2133        24

#define IPV6_MAXPLEN                65535

/*
 *        NextHeader field of IPv6 header
 */

#define NEXTHDR_HOP                0        /* Hop-by-hop option header. */
#define NEXTHDR_IPV4                4        /* IPv4 in IPv6 */
#define NEXTHDR_TCP                6        /* TCP segment. */
#define NEXTHDR_UDP                17        /* UDP message. */
#define NEXTHDR_IPV6                41        /* IPv6 in IPv6 */
#define NEXTHDR_ROUTING                43        /* Routing header. */
#define NEXTHDR_FRAGMENT        44        /* Fragmentation/reassembly header. */
#define NEXTHDR_GRE                47        /* GRE header. */
#define NEXTHDR_ESP                50        /* Encapsulating security payload. */
#define NEXTHDR_AUTH                51        /* Authentication header. */
#define NEXTHDR_ICMP                58        /* ICMP for IPv6. */
#define NEXTHDR_NONE                59        /* No next header */
#define NEXTHDR_DEST                60        /* Destination options header. */
#define NEXTHDR_SCTP                132        /* SCTP message. */
#define NEXTHDR_MOBILITY        135        /* Mobility header. */

#define NEXTHDR_MAX                255

#define IPV6_DEFAULT_HOPLIMIT   64
#define IPV6_DEFAULT_MCASTHOPS        1

/* Limits on Hop-by-Hop and Destination options.
 *
 * Per RFC8200 there is no limit on the maximum number or lengths of options in
 * Hop-by-Hop or Destination options other than the packet must fit in an MTU.
 * We allow configurable limits in order to mitigate potential denial of
 * service attacks.
 *
 * There are three limits that may be set:
 *   - Limit the number of options in a Hop-by-Hop or Destination options
 *     extension header
 *   - Limit the byte length of a Hop-by-Hop or Destination options extension
 *     header
 *   - Disallow unknown options
 *
 * The limits are expressed in corresponding sysctls:
 *
 * ipv6.sysctl.max_dst_opts_cnt
 * ipv6.sysctl.max_hbh_opts_cnt
 * ipv6.sysctl.max_dst_opts_len
 * ipv6.sysctl.max_hbh_opts_len
 *
 * max_*_opts_cnt is the number of TLVs that are allowed for Destination
 * options or Hop-by-Hop options. If the number is less than zero then unknown
 * TLVs are disallowed and the number of known options that are allowed is the
 * absolute value. Setting the value to INT_MAX indicates no limit.
 *
 * max_*_opts_len is the length limit in bytes of a Destination or
 * Hop-by-Hop options extension header. Setting the value to INT_MAX
 * indicates no length limit.
 *
 * If a limit is exceeded when processing an extension header the packet is
 * silently discarded.
 */

/* Default limits for Hop-by-Hop and Destination options */
#define IP6_DEFAULT_MAX_DST_OPTS_CNT         8
#define IP6_DEFAULT_MAX_HBH_OPTS_CNT         8
#define IP6_DEFAULT_MAX_DST_OPTS_LEN         INT_MAX /* No limit */
#define IP6_DEFAULT_MAX_HBH_OPTS_LEN         INT_MAX /* No limit */

/*
 *        Addr type
 *        
 *        type        -        unicast | multicast
 *        scope        -        local        | site            | global
 *        v4        -        compat
 *        v4mapped
 *        any
 *        loopback
 */

#define IPV6_ADDR_ANY                0x0000U

#define IPV6_ADDR_UNICAST        0x0001U
#define IPV6_ADDR_MULTICAST        0x0002U

#define IPV6_ADDR_LOOPBACK        0x0010U
#define IPV6_ADDR_LINKLOCAL        0x0020U
#define IPV6_ADDR_SITELOCAL        0x0040U

#define IPV6_ADDR_COMPATv4        0x0080U

#define IPV6_ADDR_SCOPE_MASK        0x00f0U

#define IPV6_ADDR_MAPPED        0x1000U

/*
 *        Addr scopes
 */
#define IPV6_ADDR_MC_SCOPE(a)        \
        ((a)->s6_addr[1] & 0x0f)        /* nonstandard */
#define __IPV6_ADDR_SCOPE_INVALID        -1
#define IPV6_ADDR_SCOPE_NODELOCAL        0x01
#define IPV6_ADDR_SCOPE_LINKLOCAL        0x02
#define IPV6_ADDR_SCOPE_SITELOCAL        0x05
#define IPV6_ADDR_SCOPE_ORGLOCAL        0x08
#define IPV6_ADDR_SCOPE_GLOBAL                0x0e

/*
 *        Addr flags
 */
#define IPV6_ADDR_MC_FLAG_TRANSIENT(a)        \
        ((a)->s6_addr[1] & 0x10)
#define IPV6_ADDR_MC_FLAG_PREFIX(a)        \
        ((a)->s6_addr[1] & 0x20)
#define IPV6_ADDR_MC_FLAG_RENDEZVOUS(a)        \
        ((a)->s6_addr[1] & 0x40)

/*
 *        fragmentation header
 */

/*
 * Layout of the fragmentation header; the fixed-width and big-endian field
 * types are part of the layout and must not be reordered.
 */
struct frag_hdr {
        __u8        nexthdr;
        __u8        reserved;
        /* NOTE(review): presumably decoded with IP6_OFFSET / IP6_MF - confirm */
        __be16        frag_off;
        __be32        identification;
};

/*
 * Hop-by-Hop header carrying the Jumbo Payload option, as described in
 * RFC 2675, section 2.
 */
struct hop_jumbo_hdr {
        u8        nexthdr;
        u8        hdrlen;
        u8        tlv_type;        /* IPV6_TLV_JUMBO, 0xC2 */
        u8        tlv_len;        /* 4 */
        __be32        jumbo_payload_len;
};

#define        IP6_MF                0x0001
#define        IP6_OFFSET        0xFFF8

/*
 * Iterator state shared by ip6_fraglist_init(), ip6_fraglist_prepare() and
 * ip6_fraglist_next().
 */
struct ip6_fraglist_iter {
        struct ipv6hdr        *tmp_hdr;
        struct sk_buff        *frag;        /* skb handed out by the next ip6_fraglist_next() */
        int                offset;
        unsigned int        hlen;
        __be32                frag_id;
        u8                nexthdr;
};

int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
                      u8 nexthdr, __be32 frag_id,
                      struct ip6_fraglist_iter *iter);
void ip6_fraglist_prepare(struct sk_buff *skb, struct ip6_fraglist_iter *iter);

/*
 * Return the skb currently held in @iter->frag and advance the iterator to
 * that skb's ->next. The returned skb is marked as not being on a list.
 *
 * Dereferences @iter->frag without a NULL check, so the caller must only
 * call this while @iter->frag is non-NULL.
 */
static inline struct sk_buff *ip6_fraglist_next(struct ip6_fraglist_iter *iter)
{
        struct sk_buff *skb = iter->frag;

        /* Advance first: skb_mark_not_on_list() is applied to skb afterwards. */
        iter->frag = skb->next;
        skb_mark_not_on_list(skb);

        return skb;
}

/*
 * Fragmentation state passed to ip6_frag_init() and ip6_frag_next().
 * NOTE(review): prevhdr, hlen, mtu, nexthdr and frag_id share names with
 * ip6_frag_init() parameters and are presumably filled from them - confirm.
 */
struct ip6_frag_state {
        u8                *prevhdr;
        unsigned int        hlen;
        unsigned int        mtu;
        unsigned int        left;
        int                offset;
        int                ptr;
        int                hroom;
        int                troom;
        __be32                frag_id;
        u8                nexthdr;
};

void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
                   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
                   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state);
struct sk_buff *ip6_frag_next(struct sk_buff *skb,
                              struct ip6_frag_state *state);

#define IP6_REPLY_MARK(net, mark) \
        ((net)->ipv6.sysctl.fwmark_reflect ? (mark) : 0)

#include <net/sock.h>

/* sysctls */
extern int sysctl_mld_max_msf;
extern int sysctl_mld_qrv;

/*
 * Increment @field in the @statname MIB of @net and, when @idev is non-NULL,
 * in the per-device @statname stats as well. @mod is pasted in front of
 * SNMP_INC_STATS64 (empty or "__" in the users below). @idev is evaluated
 * once.
 */
#define _DEVINC(net, statname, mod, idev, field)                        \
({                                                                        \
        struct inet6_dev *_idev = (idev);                                \
        if (likely(_idev != NULL))                                        \
                mod##SNMP_INC_STATS64((_idev)->stats.statname, (field));\
        mod##SNMP_INC_STATS64((net)->mib.statname##_statistics, (field));\
})

/* per device counters are atomic_long_t */
#define _DEVINCATOMIC(net, statname, mod, idev, field)                        \
({                                                                        \
        struct inet6_dev *_idev = (idev);                                \
        if (likely(_idev != NULL))                                        \
                SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \
        mod##SNMP_INC_STATS((net)->mib.statname##_statistics, (field));\
})

/* per device and per net counters are atomic_long_t */
#define _DEVINC_ATOMIC_ATOMIC(net, statname, idev, field)                \
({                                                                        \
        struct inet6_dev *_idev = (idev);                                \
        if (likely(_idev != NULL))                                        \
                SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \
        SNMP_INC_STATS_ATOMIC_LONG((net)->mib.statname##_statistics, (field));\
})

/*
 * Add @val to @field in the @statname MIB of @net and, when @idev is
 * non-NULL, in the per-device @statname stats as well.
 */
#define _DEVADD(net, statname, mod, idev, field, val)                        \
({                                                                        \
        struct inet6_dev *_idev = (idev);                                \
        if (likely(_idev != NULL))                                        \
                mod##SNMP_ADD_STATS((_idev)->stats.statname, (field), (val)); \
        mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, (field), (val));\
})

/*
 * Same per-device plus per-net pattern, using SNMP_UPD_PO_STATS.
 * NOTE(review): @field is passed unparenthesized here, unlike in the macros
 * above - presumably SNMP_UPD_PO_STATS token-pastes it; confirm before
 * adding parentheses.
 */
#define _DEVUPD(net, statname, mod, idev, field, val)                        \
({                                                                        \
        struct inet6_dev *_idev = (idev);                                \
        if (likely(_idev != NULL))                                        \
                mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, (val)); \
        mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\
})

/* MIBs */

/*
 * ipv6 and icmpv6 counter helpers built on _DEVINC/_DEVADD/_DEVUPD/
 * _DEVINCATOMIC. The "__"-prefixed variants pass "__" as the mod argument,
 * selecting the "__"-prefixed SNMP_* macro; the plain variants pass an
 * empty mod.
 */
#define IP6_INC_STATS(net, idev,field)                \
                _DEVINC(net, ipv6, , idev, field)
#define __IP6_INC_STATS(net, idev,field)        \
                _DEVINC(net, ipv6, __, idev, field)
#define IP6_ADD_STATS(net, idev,field,val)        \
                _DEVADD(net, ipv6, , idev, field, val)
#define __IP6_ADD_STATS(net, idev,field,val)        \
                _DEVADD(net, ipv6, __, idev, field, val)
#define IP6_UPD_PO_STATS(net, idev,field,val)   \
                _DEVUPD(net, ipv6, , idev, field, val)
#define __IP6_UPD_PO_STATS(net, idev,field,val)   \
                _DEVUPD(net, ipv6, __, idev, field, val)
#define ICMP6_INC_STATS(net, idev, field)        \
                _DEVINCATOMIC(net, icmpv6, , idev, field)
#define __ICMP6_INC_STATS(net, idev, field)        \
                _DEVINCATOMIC(net, icmpv6, __, idev, field)

/*
 * Per-message-type ICMPv6 counters in the icmpv6msg MIB. Incoming messages
 * are counted at index @field, outgoing ones at index @field + 256.
 *
 * @field is parenthesized before the offset is added so that an argument
 * containing a lower-precedence operator (e.g. "a & b") is not regrouped
 * with the "+ 256".
 */
#define ICMP6MSGOUT_INC_STATS(net, idev, field)                \
        _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, (field) + 256)
#define ICMP6MSGIN_INC_STATS(net, idev, field)        \
        _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, (field))

/*
 * Node of a singly linked list (via ->next) whose head is the global
 * ip6_ra_chain pointer declared below, alongside ip6_ra_lock.
 */
struct ip6_ra_chain {
        struct ip6_ra_chain        *next;
        struct sock                *sk;
        int                        sel;        /* NOTE(review): presumably the sel passed to ip6_ra_control() - confirm */
        void                        (*destructor)(struct sock *);
};

extern struct ip6_ra_chain        *ip6_ra_chain;
extern rwlock_t ip6_ra_lock;

/*
 * This structure is prepared by the protocol when parsing ancillary data
 * and is then passed to IPv6.
 *
 * It is reference counted through ->refcnt (see txopt_get()/txopt_put())
 * and freed with kfree_rcu() through ->rcu.
 */

struct ipv6_txoptions {
        refcount_t                refcnt;
        /* Length of this structure */
        int                        tot_len;

        /* length of extension headers   */

        __u16                        opt_flen;        /* after fragment hdr */
        __u16                        opt_nflen;        /* before fragment hdr */

        struct ipv6_opt_hdr        *hopopt;
        struct ipv6_opt_hdr        *dst0opt;
        struct ipv6_rt_hdr        *srcrt;        /* Routing Header */
        struct ipv6_opt_hdr        *dst1opt;
        struct rcu_head                rcu;
        /* Option buffer, as read by IPV6_PKTOPTIONS, starts here. */
};

/*
 * flowlabel_reflect sysctl values.
 * The values are distinct single bits (1, 2, 4), so they can be combined.
 */
enum flowlabel_reflect {
        FLOWLABEL_REFLECT_ESTABLISHED                = 1,
        FLOWLABEL_REFLECT_TCP_RESET                = 2,
        FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES        = 4,
};

/*
 * One flow label entry. Entries are chained through the RCU-annotated
 * ->next pointer; ->users is the count dropped by fl6_sock_release().
 */
struct ip6_flowlabel {
        struct ip6_flowlabel __rcu *next;
        __be32                        label;
        atomic_t                users;
        struct in6_addr                dst;
        struct ipv6_txoptions        *opt;
        unsigned long                linger;
        struct rcu_head                rcu;
        u8                        share;
        /* NOTE(review): which member is valid presumably depends on ->share - confirm */
        union {
                struct pid *pid;
                kuid_t uid;
        } owner;
        unsigned long                lastuse;
        unsigned long                expires;
        struct net                *fl_net;
};

#define IPV6_FLOWINFO_MASK                cpu_to_be32(0x0FFFFFFF)
#define IPV6_FLOWLABEL_MASK                cpu_to_be32(0x000FFFFF)
#define IPV6_FLOWLABEL_STATELESS_FLAG        cpu_to_be32(0x00080000)

#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK)
#define IPV6_TCLASS_SHIFT        20

/* RCU-chained list node (via ->next) pointing at one struct ip6_flowlabel. */
struct ipv6_fl_socklist {
        struct ipv6_fl_socklist        __rcu        *next;
        struct ip6_flowlabel                *fl;
        struct rcu_head                        rcu;
};

/*
 * Per-call IPv6 transmit parameters. Initialize with ipcm6_init() or
 * ipcm6_init_sk(); both use -1 for hlimit (and ipcm6_init() also for tclass
 * and dontfrag). NOTE(review): -1 presumably means "not set" - confirm.
 */
struct ipcm6_cookie {
        struct sockcm_cookie sockc;
        __s16 hlimit;
        __s16 tclass;
        __u16 gso_size;
        __s8  dontfrag;
        struct ipv6_txoptions *opt;
};

/*
 * Reset @ipc6: hlimit, tclass and dontfrag become -1; every member not named
 * in the initializer (sockc, gso_size, opt) is zero-initialized by the
 * compound literal.
 */
static inline void ipcm6_init(struct ipcm6_cookie *ipc6)
{
        *ipc6 = (struct ipcm6_cookie) {
                .hlimit = -1,
                .tclass = -1,
                .dontfrag = -1,
        };
}

/*
 * Reset @ipc6 with defaults taken from socket @sk: tclass comes from
 * inet6_sk(sk)->tclass and dontfrag from the socket's DONTFRAG bit; hlimit
 * is -1. Members not named in the initializer are zero-initialized.
 */
static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6,
                                 const struct sock *sk)
{
        *ipc6 = (struct ipcm6_cookie) {
                .hlimit = -1,
                .tclass = inet6_sk(sk)->tclass,
                .dontfrag = inet6_test_bit(DONTFRAG, sk),
        };
}

/*
 * Take a reference on the tx options attached to @np.
 *
 * Returns @np->opt with its refcount incremented, or NULL when there are no
 * options or their refcount has already dropped to zero. Release the
 * reference with txopt_put().
 */
static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np)
{
        struct ipv6_txoptions *opt;

        rcu_read_lock();
        opt = rcu_dereference(np->opt);
        if (opt) {
                /* A zero refcount means the options are already being freed. */
                if (!refcount_inc_not_zero(&opt->refcnt))
                        opt = NULL;
                else
                        /* The reference now protects opt, not the RCU section. */
                        opt = rcu_pointer_handoff(opt);
        }
        rcu_read_unlock();
        return opt;
}

/*
 * Drop one reference on @opt (NULL is accepted and ignored). When the last
 * reference goes away the structure is freed with kfree_rcu().
 */
static inline void txopt_put(struct ipv6_txoptions *opt)
{
        if (!opt)
                return;

        if (refcount_dec_and_test(&opt->refcnt))
                kfree_rcu(opt, rcu);
}

#if IS_ENABLED(CONFIG_IPV6)
struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label);

extern struct static_key_false_deferred ipv6_flowlabel_exclusive;
/*
 * Look up @label for socket @sk.
 *
 * Returns NULL without any lookup unless the ipv6_flowlabel_exclusive key is
 * enabled and the socket's netns has flowlabel_has_excl set. Otherwise
 * returns the result of __fl6_sock_lookup(), or ERR_PTR(-ENOENT) when that
 * lookup finds nothing.
 */
static inline struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk,
                                                    __be32 label)
{
        struct ip6_flowlabel *fl;

        if (!static_branch_unlikely(&ipv6_flowlabel_exclusive.key))
                return NULL;

        if (!READ_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl))
                return NULL;

        fl = __fl6_sock_lookup(sk, label);
        return fl ? fl : ERR_PTR(-ENOENT);
}
#endif

struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space,
                                         struct ip6_flowlabel *fl,
                                         struct ipv6_txoptions *fopt);
void fl6_free_socklist(struct sock *sk);
int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen);
int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
                           int flags);
int ip6_flowlabel_init(void);
void ip6_flowlabel_cleanup(void);
bool ip6_autoflowlabel(struct net *net, const struct sock *sk);

/* Decrement the users count of @fl; a NULL @fl is a no-op. */
static inline void fl6_sock_release(struct ip6_flowlabel *fl)
{
        if (!fl)
                return;

        atomic_dec(&fl->users);
}

enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type,
                                   u8 code, __be32 info);

void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
                                struct icmp6hdr *thdr, int len);

int ip6_ra_control(struct sock *sk, int sel);

int ipv6_parse_hopopts(struct sk_buff *skb);

struct ipv6_txoptions *ipv6_dup_options(struct sock *sk,
                                        struct ipv6_txoptions *opt);
struct ipv6_txoptions *ipv6_renew_options(struct sock *sk,
                                          struct ipv6_txoptions *opt,
                                          int newtype,
                                          struct ipv6_opt_hdr *newopt);
struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space,
                                            struct ipv6_txoptions *opt);

/* NULL-tolerant front end for __ipv6_fixup_options(): a NULL @opt yields
 * NULL without calling the helper, anything else is passed straight through.
 */
static inline struct ipv6_txoptions *
ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt)
{
        return opt ? __ipv6_fixup_options(opt_space, opt) : NULL;
}

bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
                       const struct inet6_skb_parm *opt);
struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
                                           struct ipv6_txoptions *opt);

/* BIG TCP helper: detect a jumbo Hop-by-Hop header in front of TCP.
 * Only the layout where the hop_jumbo_hdr sits directly behind the IPv6
 * header, with both headers present in skb->head, is recognised.
 * Returns IPPROTO_TCP for such a packet and 0 for everything else.
 */
static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb)
{
        const struct ipv6hdr *ip6h;
        const struct hop_jumbo_hdr *hop;

        /* only packets above the legacy GRO size are examined */
        if (likely(skb->len <= GRO_LEGACY_MAX_SIZE))
                return 0;

        if (skb->protocol != htons(ETH_P_IPV6))
                return 0;

        /* IPv6 header plus jumbo header must fit in the linear area */
        if (skb_headlen(skb) < skb_network_offset(skb) +
                               sizeof(struct ipv6hdr) +
                               sizeof(struct hop_jumbo_hdr))
                return 0;

        ip6h = ipv6_hdr(skb);
        if (ip6h->nexthdr != NEXTHDR_HOP)
                return 0;

        hop = (const struct hop_jumbo_hdr *)(ip6h + 1);
        if (hop->tlv_type == IPV6_TLV_JUMBO && hop->hdrlen == 0 &&
            hop->nexthdr == IPPROTO_TCP)
                return hop->nexthdr;

        return 0;
}

/* Strip the BIG TCP jumbo Hop-by-Hop header from @skb, if there is one.
 *
 * Return 0 if HBH header is successfully removed
 * Or if HBH removal is unnecessary (packet is not big TCP)
 * Return error (-1, when skb_cow_head() fails) to indicate dropping the packet
 */
static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb)
{
        const int hophdr_len = sizeof(struct hop_jumbo_hdr);
        int nexthdr = ipv6_has_hopopt_jumbo(skb);
        struct ipv6hdr *h6;

        /* 0 means ipv6_has_hopopt_jumbo() found no jumbo header: nothing to do */
        if (!nexthdr)
                return 0;

        /* the headers are about to be rewritten in place */
        if (skb_cow_head(skb, 0))
                return -1;

        /* Remove the HBH header.
         * Layout: [Ethernet header][IPv6 header][HBH][L4 Header]
         *
         * Everything from the MAC header up to the end of the IPv6 header
         * is moved forward by hophdr_len bytes, overwriting the HBH header.
         */
        memmove(skb_mac_header(skb) + hophdr_len, skb_mac_header(skb),
                skb_network_header(skb) - skb_mac_header(skb) +
                sizeof(struct ipv6hdr));

        /* drop the now-stale leading bytes and re-point the header offsets */
        __skb_pull(skb, hophdr_len);
        skb->network_header += hophdr_len;
        skb->mac_header += hophdr_len;

        /* the IPv6 header now leads directly to the protocol the HBH announced */
        h6 = ipv6_hdr(skb);
        h6->nexthdr = nexthdr;

        return 0;
}

/* Report whether Router Advertisements are accepted on @idev.
 *
 * With forwarding enabled, RAs are only accepted in the special hybrid
 * mode (accept_ra == 2); without forwarding any non-zero accept_ra
 * setting accepts them.
 */
static inline bool ipv6_accept_ra(const struct inet6_dev *idev)
{
        s32 accept_ra = READ_ONCE(idev->cnf.accept_ra);

        if (READ_ONCE(idev->cnf.forwarding))
                return accept_ra == 2;

        return accept_ra;
}

#define IPV6_FRAG_HIGH_THRESH        (4 * 1024*1024)        /* 4194304 */
#define IPV6_FRAG_LOW_THRESH        (3 * 1024*1024)        /* 3145728 */
#define IPV6_FRAG_TIMEOUT        (60 * HZ)        /* 60 seconds */

int __ipv6_addr_type(const struct in6_addr *addr);
/* Low 16 bits of __ipv6_addr_type(@addr). */
static inline int ipv6_addr_type(const struct in6_addr *addr)
{
        int full_type = __ipv6_addr_type(addr);

        return full_type & 0xffff;
}

/* The IPV6_ADDR_SCOPE_MASK bits of __ipv6_addr_type(@addr). */
static inline int ipv6_addr_scope(const struct in6_addr *addr)
{
        int full_type = __ipv6_addr_type(addr);

        return full_type & IPV6_ADDR_SCOPE_MASK;
}

/* Extract the source scope from a full address @type value: the bits
 * above the low 16, or __IPV6_ADDR_SCOPE_INVALID for IPV6_ADDR_ANY.
 */
static inline int __ipv6_addr_src_scope(int type)
{
        if (type == IPV6_ADDR_ANY)
                return __IPV6_ADDR_SCOPE_INVALID;

        return type >> 16;
}

/* Source scope of @addr, derived from its full __ipv6_addr_type() value. */
static inline int ipv6_addr_src_scope(const struct in6_addr *addr)
{
        int full_type = __ipv6_addr_type(addr);

        return __ipv6_addr_src_scope(full_type);
}

/* True when an address of the given @type needs a scope id: either the
 * LINKLOCAL bit is set, or it is MULTICAST with a LOOPBACK or LINKLOCAL
 * bit set as well.
 */
static inline bool __ipv6_addr_needs_scope_id(int type)
{
        int scoped_mcast = (type & IPV6_ADDR_MULTICAST) &&
                           (type & (IPV6_ADDR_LOOPBACK | IPV6_ADDR_LINKLOCAL));

        return (type & IPV6_ADDR_LINKLOCAL) || scoped_mcast;
}

/* Scope id to use for @addr: @iface when the address type needs one
 * (see __ipv6_addr_needs_scope_id()), 0 otherwise.
 */
static inline __u32 ipv6_iface_scope_id(const struct in6_addr *addr, int iface)
{
        if (__ipv6_addr_needs_scope_id(__ipv6_addr_type(addr)))
                return iface;

        return 0;
}

/* Byte-wise comparison of two addresses with memcmp() semantics:
 * 0 when equal, otherwise the sign of the first differing byte.
 */
static inline int ipv6_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2)
{
        return memcmp(a1, a2, sizeof(*a1));
}

/* Compare @a1 and @a2 under mask @m.
 * Returns true when the two addresses DIFFER in at least one bit selected
 * by the mask, false when they agree on every masked bit.
 */
static inline bool
ipv6_masked_addr_cmp(const struct in6_addr *a1, const struct in6_addr *m,
                     const struct in6_addr *a2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *lhs = (const unsigned long *)a1;
        const unsigned long *mask = (const unsigned long *)m;
        const unsigned long *rhs = (const unsigned long *)a2;
        unsigned long diff;

        diff = (lhs[0] ^ rhs[0]) & mask[0];
        diff |= (lhs[1] ^ rhs[1]) & mask[1];

        return diff != 0;
#else
        return (((a1->s6_addr32[0] ^ a2->s6_addr32[0]) & m->s6_addr32[0]) |
                ((a1->s6_addr32[1] ^ a2->s6_addr32[1]) & m->s6_addr32[1]) |
                ((a1->s6_addr32[2] ^ a2->s6_addr32[2]) & m->s6_addr32[2]) |
                ((a1->s6_addr32[3] ^ a2->s6_addr32[3]) & m->s6_addr32[3])) != 0;
#endif
}

/* Store in @pfx the first @plen bits of @addr, with all remaining bits
 * cleared.
 */
static inline void ipv6_addr_prefix(struct in6_addr *pfx,
                                    const struct in6_addr *addr,
                                    int plen)
{
        /* caller must guarantee 0 <= plen <= 128 */
        int whole = plen >> 3;  /* bytes fully covered by the prefix */
        int rem = plen & 0x7;   /* prefix bits in the following byte */

        memset(pfx->s6_addr, 0, sizeof(pfx->s6_addr));
        memcpy(pfx->s6_addr, addr, whole);
        if (rem)
                pfx->s6_addr[whole] = addr->s6_addr[whole] & (0xff00 >> rem);
}

/* Overwrite the first @plen bits of @addr with those of @pfx; bits of
 * @addr beyond the prefix are left as they were.
 */
static inline void ipv6_addr_prefix_copy(struct in6_addr *addr,
                                         const struct in6_addr *pfx,
                                         int plen)
{
        /* caller must guarantee 0 <= plen <= 128 */
        int whole = plen >> 3;  /* bytes fully covered by the prefix */
        int rem = plen & 0x7;   /* prefix bits in the following byte */

        memcpy(addr->s6_addr, pfx, whole);
        if (rem) {
                const int keep = 0xff00 >> rem;

                /* clear the prefix bits of the partial byte, then merge */
                addr->s6_addr[whole] &= ~keep;
                addr->s6_addr[whole] |= pfx->s6_addr[whole] & keep;
        }
}

/* Store two 32-bit words at @addr: @wh goes to addr[0], @wl to addr[1].
 *
 * On 64-bit builds with efficient unaligned access, and only when both
 * words are compile-time constants, the pair is written with a single
 * 64-bit store instead of two 32-bit stores.
 */
static inline void __ipv6_addr_set_half(__be32 *addr,
                                        __be32 wh, __be32 wl)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
#if defined(__BIG_ENDIAN)
        if (__builtin_constant_p(wh) && __builtin_constant_p(wl)) {
                /* big endian: the upper half of the u64 is stored first, so wh goes there */
                *(__force u64 *)addr = ((__force u64)(wh) << 32 | (__force u64)(wl));
                return;
        }
#elif defined(__LITTLE_ENDIAN)
        if (__builtin_constant_p(wl) && __builtin_constant_p(wh)) {
                /* little endian: the lower half of the u64 is stored first, so wh goes there */
                *(__force u64 *)addr = ((__force u64)(wl) << 32 | (__force u64)(wh));
                return;
        }
#endif
#endif
        /* generic path: two plain 32-bit stores */
        addr[0] = wh;
        addr[1] = wl;
}

/* Fill @addr from four 32-bit words: @w1 is the first word in memory,
 * @w4 the last.  The two halves are written via __ipv6_addr_set_half().
 */
static inline void ipv6_addr_set(struct in6_addr *addr,
                                     __be32 w1, __be32 w2,
                                     __be32 w3, __be32 w4)
{
        __be32 *words = addr->s6_addr32;

        __ipv6_addr_set_half(words, w1, w2);
        __ipv6_addr_set_half(words + 2, w3, w4);
}

/* True when @a1 and @a2 hold the same 128-bit address.  The comparison
 * is done branch-free, XOR-ing either two longs or four 32-bit words.
 */
static inline bool ipv6_addr_equal(const struct in6_addr *a1,
                                   const struct in6_addr *a2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *lhs = (const unsigned long *)a1;
        const unsigned long *rhs = (const unsigned long *)a2;
        unsigned long diff = (lhs[0] ^ rhs[0]) | (lhs[1] ^ rhs[1]);

        return diff == 0UL;
#else
        return !((a1->s6_addr32[0] ^ a2->s6_addr32[0]) |
                 (a1->s6_addr32[1] ^ a2->s6_addr32[1]) |
                 (a1->s6_addr32[2] ^ a2->s6_addr32[2]) |
                 (a1->s6_addr32[3] ^ a2->s6_addr32[3]));
#endif
}

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
/* Compare the leading @len bits of the big-endian 64-bit values at @a1
 * and @a2.  A @len of 0 always matches; callers pass @len below 64.
 */
static inline bool __ipv6_prefix_equal64_half(const __be64 *a1,
                                              const __be64 *a2,
                                              unsigned int len)
{
        __be64 mask;

        if (!len)
                return true;

        mask = cpu_to_be64((~0UL) << (64 - len));
        return !((*a1 ^ *a2) & mask);
}

/* True when @addr1 and @addr2 agree on their first @prefixlen bits
 * (64-bit implementation, working on the two halves of the address).
 */
static inline bool ipv6_prefix_equal(const struct in6_addr *addr1,
                                     const struct in6_addr *addr2,
                                     unsigned int prefixlen)
{
        const __be64 *lhs = (const __be64 *)addr1;
        const __be64 *rhs = (const __be64 *)addr2;

        /* prefix ends inside the first half */
        if (prefixlen < 64)
                return __ipv6_prefix_equal64_half(lhs, rhs, prefixlen);

        /* first half must match entirely, remainder lives in the second */
        if (lhs[0] != rhs[0])
                return false;

        return __ipv6_prefix_equal64_half(lhs + 1, rhs + 1, prefixlen - 64);
}
#else
/* True when @addr1 and @addr2 agree on their first @prefixlen bits
 * (generic implementation, working on 32-bit words).
 */
static inline bool ipv6_prefix_equal(const struct in6_addr *addr1,
                                     const struct in6_addr *addr2,
                                     unsigned int prefixlen)
{
        const __be32 *lhs = addr1->s6_addr32;
        const __be32 *rhs = addr2->s6_addr32;
        unsigned int full_words = prefixlen >> 5;
        unsigned int rem_bits = prefixlen & 0x1f;

        /* 32-bit words that lie completely inside the prefix */
        if (full_words && memcmp(lhs, rhs, full_words << 2))
                return false;

        /* no partially covered word left to look at */
        if (!rem_bits)
                return true;

        /* leading bits of the partially covered word */
        return !((lhs[full_words] ^ rhs[full_words]) &
                 htonl((0xffffffff) << (32 - rem_bits)));
}
#endif

/* True when every bit of @a is zero. */
static inline bool ipv6_addr_any(const struct in6_addr *a)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *halves = (const unsigned long *)a;

        return !(halves[0] | halves[1]);
#else
        return !(a->s6_addr32[0] | a->s6_addr32[1] |
                 a->s6_addr32[2] | a->s6_addr32[3]);
#endif
}

static inline u32 ipv6_addr_hash(const struct in6_addr *a)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const unsigned long *ul = (const unsigned long *)a;
        unsigned long x = ul[0] ^ ul[1];

        return (u32)(x ^ (x >> 32));
#else
        return (__force u32)(a->s6_addr32[0] ^ a->s6_addr32[1] ^
                             a->s6_addr32[2] ^ a->s6_addr32[3]);
#endif
}

/* more secured version of ipv6_addr_hash() */
static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 initval)
{
        return jhash2((__force const u32 *)a->s6_addr32,
                      ARRAY_SIZE(a->s6_addr32), initval);
}

/* True when @a is all zero except for a big-endian 1 in its last word. */
static inline bool ipv6_addr_loopback(const struct in6_addr *a)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const __be64 *halves = (const __be64 *)a;

        return !(halves[0] | (halves[1] ^ cpu_to_be64(1)));
#else
        return !(a->s6_addr32[0] | a->s6_addr32[1] |
                 a->s6_addr32[2] | (a->s6_addr32[3] ^ cpu_to_be32(1)));
#endif
}

/*
 * True when the first two 32-bit words of @a are zero and the third one
 * is big-endian 0x0000ffff.
 *
 * The __force casts to unsigned long exist only to keep sparse quiet:
 * the endian-annotated types have a fixed size on every architecture.
 */
static inline bool ipv6_addr_v4mapped(const struct in6_addr *a)
{
        unsigned long upper;

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        upper = *(unsigned long *)a;
#else
        upper = (__force unsigned long)(a->s6_addr32[0] | a->s6_addr32[1]);
#endif

        return (upper |
                (__force unsigned long)(a->s6_addr32[2] ^
                                        cpu_to_be32(0x0000ffff))) == 0UL;
}

/* True when @a passes ipv6_addr_v4mapped() and its last 32-bit word is
 * an IPv4 loopback address.
 */
static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a)
{
        if (!ipv6_addr_v4mapped(a))
                return false;

        return ipv4_is_loopback(a->s6_addr32[3]);
}

/* Hash an (address, port) pair, mixed with the per-netns hash seed.
 *
 * The all-zero address hashes as the single word 0, a v4-mapped address
 * hashes only its last 32-bit word, any other address hashes all four
 * words.  @port is XOR-ed into the result.
 */
static inline u32 ipv6_portaddr_hash(const struct net *net,
                                     const struct in6_addr *addr6,
                                     unsigned int port)
{
        unsigned int mix = net_hash_mix(net);

        if (ipv6_addr_any(addr6))
                return jhash_1word(0, mix) ^ port;

        if (ipv6_addr_v4mapped(addr6))
                return jhash_1word((__force u32)addr6->s6_addr32[3], mix) ^ port;

        return jhash2((__force u32 *)addr6->s6_addr32, 4, mix) ^ port;
}

/*
 * Check for a RFC 4843 ORCHID address
 * (Overlay Routable Cryptographic Hash Identifiers):
 * the leading 28 bits of @a must equal 0x2001001.
 */
static inline bool ipv6_addr_orchid(const struct in6_addr *a)
{
        return !((a->s6_addr32[0] & htonl(0xfffffff0)) ^ htonl(0x20010010));
}

/* True when the leading 8 bits of @addr are all ones (ff00::/8). */
static inline bool ipv6_addr_is_multicast(const struct in6_addr *addr)
{
        return !((addr->s6_addr32[0] & htonl(0xFF000000)) ^ htonl(0xFF000000));
}

/* Build the v4-mapped form of IPv4 address @addr in @v4mapped:
 * two zero words, then big-endian 0x0000FFFF, then @addr itself.
 */
static inline void ipv6_addr_set_v4mapped(const __be32 addr,
                                          struct in6_addr *v4mapped)
{
        ipv6_addr_set(v4mapped, 0, 0, htonl(0x0000FFFF), addr);
}

/*
 * Find the first bit in which two addresses differ, scanning 32-bit
 * words from the start.  @addrlen is in bytes and must be a multiple
 * of 4.  Returns the bit index counted from the first bit of the
 * address, or the total number of bits when no difference is found.
 */
static inline int __ipv6_addr_diff32(const void *token1, const void *token2, int addrlen)
{
        const __be32 *lhs = token1, *rhs = token2;
        int w;

        addrlen >>= 2;  /* bytes -> 32-bit words */

        for (w = 0; w < addrlen; w++) {
                __be32 delta = lhs[w] ^ rhs[w];

                if (!delta)
                        continue;

                return w * 32 + 31 - __fls(ntohl(delta));
        }

        /*
         *        Getting here means the two addresses are equal.
         *        That does happen, exactly in cases such as
         *
         *        ip route add 1111::/128 via ...
         *        ip route add 1111::/64 via ...
         *
         *        Ideally, this function should stop comparison
         *        at prefix length. It does not, but it is still OK,
         *        if returned value is greater than prefix length.
         *                                        --ANK (980803)
         */
        return addrlen << 5;
}

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
/* 64-bit variant of __ipv6_addr_diff32(): scans 64-bit words instead.
 * @addrlen is in bytes; the caller in this header only passes multiples
 * of 8.  Returns the index of the first differing bit, or the total
 * number of bits scanned when there is no difference.
 */
static inline int __ipv6_addr_diff64(const void *token1, const void *token2, int addrlen)
{
        const __be64 *lhs = token1, *rhs = token2;
        int w;

        addrlen >>= 3;  /* bytes -> 64-bit words */

        for (w = 0; w < addrlen; w++) {
                __be64 delta = lhs[w] ^ rhs[w];

                if (!delta)
                        continue;

                return w * 64 + 63 - __fls(be64_to_cpu(delta));
        }

        return addrlen << 6;
}
#endif

/* First differing bit between two @addrlen-byte tokens.
 *
 * Uses the 64-bit scanner when it is compiled in and @addrlen is a
 * compile-time constant multiple of 8; otherwise the 32-bit scanner.
 */
static inline int __ipv6_addr_diff(const void *token1, const void *token2, int addrlen)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        if (__builtin_constant_p(addrlen) && (addrlen & 7) == 0)
                return __ipv6_addr_diff64(token1, token2, addrlen);
#endif

        return __ipv6_addr_diff32(token1, token2, addrlen);
}

static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_addr *a2)
{
        return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr));
}

__be32 ipv6_select_ident(struct net *net,
                         const struct in6_addr *daddr,
                         const struct in6_addr *saddr);
__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb);

int ip6_dst_hoplimit(struct dst_entry *dst);

/* Pick the hop limit for a flow.
 *
 * The socket setting is used first: mcast_hops for a multicast
 * destination, hop_limit otherwise.  A negative socket value falls back
 * to ip6_dst_hoplimit(@dst).
 */
static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6,
                                      struct dst_entry *dst)
{
        int hlimit = ipv6_addr_is_multicast(&fl6->daddr) ?
                     READ_ONCE(np->mcast_hops) :
                     READ_ONCE(np->hop_limit);

        return hlimit < 0 ? ip6_dst_hoplimit(dst) : hlimit;
}

/* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store
 * Equivalent to :        flow->v6addrs.src = iph->saddr;
 *                        flow->v6addrs.dst = iph->daddr;
 *
 * Both addresses are copied with one memcpy(); afterwards the flow's
 * control.addr_type is set to FLOW_DISSECTOR_KEY_IPV6_ADDRS.
 */
static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow,
                                            const struct ipv6hdr *iph)
{
        /* the single memcpy() below relies on dst directly following src */
        BUILD_BUG_ON(offsetof(typeof(flow->addrs), v6addrs.dst) !=
                     offsetof(typeof(flow->addrs), v6addrs.src) +
                     sizeof(flow->addrs.v6addrs.src));
        memcpy(&flow->addrs.v6addrs, &iph->addrs, sizeof(flow->addrs.v6addrs));
        flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}

#if IS_ENABLED(CONFIG_IPV6)

/* True when any of these holds: the netns ip_nonlocal_bind sysctl is
 * non-zero, or @inet has the FREEBIND or the TRANSPARENT flag set.
 */
static inline bool ipv6_can_nonlocal_bind(struct net *net,
                                          struct inet_sock *inet)
{
        if (net->ipv6.sysctl.ip_nonlocal_bind)
                return true;

        if (test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags))
                return true;

        return test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags);
}

/* Sysctl settings for net ipv6.auto_flowlabels */
#define IP6_AUTO_FLOW_LABEL_OFF                0
#define IP6_AUTO_FLOW_LABEL_OPTOUT        1
#define IP6_AUTO_FLOW_LABEL_OPTIN        2
#define IP6_AUTO_FLOW_LABEL_FORCED        3

#define IP6_AUTO_FLOW_LABEL_MAX                IP6_AUTO_FLOW_LABEL_FORCED

#define IP6_DEFAULT_AUTO_FLOW_LABELS        IP6_AUTO_FLOW_LABEL_OPTOUT

/* Choose the flow label to put on an outgoing packet.
 *
 * A non-zero label in @flowlabel is returned as is (masked to the label
 * bits).  Otherwise a label is derived from the skb flow hash, unless the
 * auto_flowlabels sysctl is OFF, or @autolabel is false and the sysctl is
 * not FORCED; in those cases the (zero) label is returned.
 */
static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
                                        __be32 flowlabel, bool autolabel,
                                        struct flowi6 *fl6)
{
        u32 hash;

        /* @flowlabel may include more than a flow label, eg, the traffic class.
         * Here we want only the flow label value.
         */
        flowlabel &= IPV6_FLOWLABEL_MASK;

        /* an explicit label always wins */
        if (flowlabel)
                return flowlabel;

        if (net->ipv6.sysctl.auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF)
                return flowlabel;

        if (!autolabel &&
            net->ipv6.sysctl.auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED)
                return flowlabel;

        /* Since this is being sent on the wire obfuscate hash a bit
         * to minimize possibility that any useful information to an
         * attacker is leaked. Only lower 20 bits are relevant.
         */
        hash = rol32(skb_get_hash_flowi6(skb, fl6), 16);

        flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
        if (net->ipv6.sysctl.flowlabel_state_ranges)
                flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG;

        return flowlabel;
}

/* Map the netns auto_flowlabels sysctl to a 0/1 default:
 * 1 for OPTOUT and FORCED, 0 for OFF, OPTIN and any other value.
 */
static inline int ip6_default_np_autolabel(struct net *net)
{
        switch (net->ipv6.sysctl.auto_flowlabels) {
        case IP6_AUTO_FLOW_LABEL_OPTOUT:
        case IP6_AUTO_FLOW_LABEL_FORCED:
                return 1;
        default:
                /* covers IP6_AUTO_FLOW_LABEL_OFF and IP6_AUTO_FLOW_LABEL_OPTIN */
                return 0;
        }
}
#else
/* Stub for builds where IS_ENABLED(CONFIG_IPV6) is false: no label is
 * generated, @flowlabel is handed back unchanged.
 */
static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
                                        __be32 flowlabel, bool autolabel,
                                        struct flowi6 *fl6)
{
        return flowlabel;
}
/* Stub for builds where IS_ENABLED(CONFIG_IPV6) is false: always 0. */
static inline int ip6_default_np_autolabel(struct net *net)
{
        return 0;
}
#endif

#if IS_ENABLED(CONFIG_IPV6)
/* Current value of the netns multipath_hash_policy sysctl. */
static inline int ip6_multipath_hash_policy(const struct net *net)
{
        return net->ipv6.sysctl.multipath_hash_policy;
}
/* Current value of the netns multipath_hash_fields sysctl. */
static inline u32 ip6_multipath_hash_fields(const struct net *net)
{
        return net->ipv6.sysctl.multipath_hash_fields;
}
#else
/* Stubs used when IPv6 is not enabled: both accessors report 0. */
static inline int ip6_multipath_hash_policy(const struct net *net)
{
        return 0;
}
static inline u32 ip6_multipath_hash_fields(const struct net *net)
{
        return 0;
}
#endif

/*
 *        Header manipulation
 */
/* Write the first 32-bit word of the IPv6 header @hdr: version nibble 6,
 * @tclass shifted into place, OR-ed with the network-order @flowlabel.
 */
static inline void ip6_flow_hdr(struct ipv6hdr *hdr, unsigned int tclass,
                                __be32 flowlabel)
{
        __be32 *first_word = (__be32 *)hdr;

        *first_word = htonl(0x60000000 | (tclass << 20)) | flowlabel;
}

static inline __be32 ip6_flowinfo(const struct ipv6hdr *hdr)
{
        return *(__be32 *)hdr & IPV6_FLOWINFO_MASK;
}

static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr)
{
        return *(__be32 *)hdr & IPV6_FLOWLABEL_MASK;
}

/* Extract the traffic class from a network-order @flowinfo word. */
static inline u8 ip6_tclass(__be32 flowinfo)
{
        __be32 tclass_bits = flowinfo & IPV6_TCLASS_MASK;

        return ntohl(tclass_bits) >> IPV6_TCLASS_SHIFT;
}

/* DSCP derived from the traffic class carried in @flowinfo. */
static inline dscp_t ip6_dscp(__be32 flowinfo)
{
        dscp_t dscp = inet_dsfield_to_dscp(ip6_tclass(flowinfo));

        return dscp;
}

/* Combine a host-order @tclass with a network-order @flowlabel into a
 * network-order flowinfo word.
 */
static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel)
{
        __be32 tclass_bits = htonl(tclass << IPV6_TCLASS_SHIFT);

        return tclass_bits | flowlabel;
}

/* The IPV6_FLOWLABEL_MASK bits of @fl6->flowlabel. */
static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6)
{
        __be32 label = fl6->flowlabel;

        return label & IPV6_FLOWLABEL_MASK;
}

/*
 *        Prototypes exported by ipv6
 */

/*
 *        rcv function (called from netdevice level)
 */

int ipv6_rcv(struct sk_buff *skb, struct net_device *dev,
             struct packet_type *pt, struct net_device *orig_dev);
void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
                   struct net_device *orig_dev);

int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb);

/*
 *        upper-layer output functions
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority);

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr);

int ip6_append_data(struct sock *sk,
                    int getfrag(void *from, char *to, int offset, int len,
                                int odd, struct sk_buff *skb),
                    void *from, size_t length, int transhdrlen,
                    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
                    struct rt6_info *rt, unsigned int flags);

int ip6_push_pending_frames(struct sock *sk);

void ip6_flush_pending_frames(struct sock *sk);

int ip6_send_skb(struct sk_buff *skb);

struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue,
                               struct inet_cork_full *cork,
                               struct inet6_cork *v6_cork);
struct sk_buff *ip6_make_skb(struct sock *sk,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, size_t length, int transhdrlen,
                             struct ipcm6_cookie *ipc6,
                             struct rt6_info *rt, unsigned int flags,
                             struct inet_cork_full *cork);

/* Run __ip6_make_skb() on the socket's own write queue, using the cork
 * state kept in its inet and inet6 socket areas.
 */
static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
{
        struct inet_cork_full *cork = &inet_sk(sk)->cork;
        struct inet6_cork *v6_cork = &inet6_sk(sk)->cork;

        return __ip6_make_skb(sk, &sk->sk_write_queue, cork, v6_cork);
}

int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6);
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst);
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool connected);
struct dst_entry *ip6_blackhole_route(struct net *net,
                                      struct dst_entry *orig_dst);

/*
 *        skb processing functions
 */

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip6_forward(struct sk_buff *skb);
int ip6_input(struct sk_buff *skb);
int ip6_mc_input(struct sk_buff *skb);
void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
                              bool have_final);

int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);

/*
 *        Extension header (options) processing
 */

void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
                          u8 *proto, struct in6_addr **daddr_p,
                          struct in6_addr *saddr);
void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
                         u8 *proto);

int ipv6_skip_exthdr(const struct sk_buff *, int start, u8 *nexthdrp,
                     __be16 *frag_offp);

bool ipv6_ext_hdr(u8 nexthdr);

/* Flag bits; each enumerator occupies its own bit so values can be OR-ed.
 * NOTE(review): presumably the values exchanged through the fragflg
 * argument of ipv6_find_hdr() - confirm against its implementation.
 */
enum {
        IP6_FH_F_FRAG                = (1 << 0),
        IP6_FH_F_AUTH                = (1 << 1),
        IP6_FH_F_SKIP_RH        = (1 << 2),
};

/* find specified header and get offset to it */
int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target,
                  unsigned short *fragoff, int *fragflg);

int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type);

struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
                                const struct ipv6_txoptions *opt,
                                struct in6_addr *orig);

/*
 *        socket options (ipv6_sockglue.c)
 */
DECLARE_STATIC_KEY_FALSE(ip6_min_hopcount);

int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                       unsigned int optlen);
int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
                    unsigned int optlen);
int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
                       sockptr_t optval, sockptr_t optlen);
int ipv6_getsockopt(struct sock *sk, int level, int optname,
                    char __user *optval, int __user *optlen);

int __ip6_datagram_connect(struct sock *sk, struct sockaddr *addr,
                           int addr_len);
int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len);
int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr,
                                 int addr_len);
int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr);
void ip6_datagram_release_cb(struct sock *sk);

int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
                    int *addr_len);
int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len,
                     int *addr_len);
void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port,
                     u32 info, u8 *payload);
void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info);
void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu);

void inet6_cleanup_sock(struct sock *sk);
void inet6_sock_destruct(struct sock *sk);
int inet6_release(struct socket *sock);
int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
                  int peer);
int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
int inet6_compat_ioctl(struct socket *sock, unsigned int cmd,
                unsigned long arg);

int inet6_hash_connect(struct inet_timewait_death_row *death_row,
                              struct sock *sk);
int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                  int flags);

/*
 * reassembly.c
 */
extern const struct proto_ops inet6_stream_ops;
extern const struct proto_ops inet6_dgram_ops;
extern const struct proto_ops inet6_sockraw_ops;

struct group_source_req;
struct group_filter;

int ip6_mc_source(int add, int omode, struct sock *sk,
                  struct group_source_req *pgsr);
int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf,
                  struct sockaddr_storage *list);
int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
                  sockptr_t optval, size_t ss_offset);

#ifdef CONFIG_PROC_FS
int ac6_proc_init(struct net *net);
void ac6_proc_exit(struct net *net);
int raw6_proc_init(void);
void raw6_proc_exit(void);
int tcp6_proc_init(struct net *net);
void tcp6_proc_exit(struct net *net);
int udp6_proc_init(struct net *net);
void udp6_proc_exit(struct net *net);
int udplite6_proc_init(void);
void udplite6_proc_exit(void);
int ipv6_misc_proc_init(void);
void ipv6_misc_proc_exit(void);
int snmp6_register_dev(struct inet6_dev *idev);
int snmp6_unregister_dev(struct inet6_dev *idev);

#else
/* !CONFIG_PROC_FS stubs: these calls do nothing; the int-returning ones report 0. */
static inline int ac6_proc_init(struct net *net) { return 0; }
static inline void ac6_proc_exit(struct net *net) { }
static inline int snmp6_register_dev(struct inet6_dev *idev) { return 0; }
static inline int snmp6_unregister_dev(struct inet6_dev *idev) { return 0; }
#endif

#ifdef CONFIG_SYSCTL
struct ctl_table *ipv6_icmp_sysctl_init(struct net *net);
size_t ipv6_icmp_sysctl_table_size(void);
struct ctl_table *ipv6_route_sysctl_init(struct net *net);
size_t ipv6_route_sysctl_table_size(struct net *net);
int ipv6_sysctl_register(void);
void ipv6_sysctl_unregister(void);
#endif

int ipv6_sock_mc_join(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex,
                          const struct in6_addr *addr, unsigned int mode);
int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);

/* Mark @sk as IPv6-only.
 *
 * Returns -EINVAL when the socket already has a non-zero inet_num,
 * 0 otherwise.  The flag is set with the socket lock held.
 */
static inline int ip6_sock_set_v6only(struct sock *sk)
{
        /* refuse once inet_num is set */
        if (inet_sk(sk)->inet_num)
                return -EINVAL;
        lock_sock(sk);
        sk->sk_ipv6only = true;
        release_sock(sk);
        return 0;
}

/* Set the RECVERR6 flag bit on @sk via inet6_set_bit(). */
static inline void ip6_sock_set_recverr(struct sock *sk)
{
        inet6_set_bit(RECVERR6, sk);
}

#define IPV6_PREFER_SRC_MASK (IPV6_PREFER_SRC_TMP | IPV6_PREFER_SRC_PUBLIC | \
                              IPV6_PREFER_SRC_COA)

/* Fold the IPV6_PREFER_SRC_* request @val into the socket's srcprefs.
 *
 * @val is checked in three groups (PUBLIC/TMP/PUBTMP_DEFAULT, HOME/COA,
 * CGA/NONCGA).  Within a group only "nothing" or exactly one of the
 * listed values is accepted; anything else returns -EINVAL before
 * srcprefs is touched.  Returns 0 on success.
 */
static inline int ip6_sock_set_addr_preferences(struct sock *sk, int val)
{
        unsigned int prefmask = ~IPV6_PREFER_SRC_MASK;
        unsigned int pref = 0;
        int group;

        /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */
        group = val & (IPV6_PREFER_SRC_PUBLIC |
                       IPV6_PREFER_SRC_TMP |
                       IPV6_PREFER_SRC_PUBTMP_DEFAULT);
        if (group == IPV6_PREFER_SRC_PUBLIC ||
            group == IPV6_PREFER_SRC_TMP) {
                /* explicit choice: record it, replacing the old PUBLIC/TMP bits */
                pref |= group;
                prefmask &= ~(IPV6_PREFER_SRC_PUBLIC |
                              IPV6_PREFER_SRC_TMP);
        } else if (group == IPV6_PREFER_SRC_PUBTMP_DEFAULT) {
                /* back to default: just clear the old PUBLIC/TMP bits */
                prefmask &= ~(IPV6_PREFER_SRC_PUBLIC |
                              IPV6_PREFER_SRC_TMP);
        } else if (group) {
                return -EINVAL;
        }

        /* check HOME/COA conflicts */
        group = val & (IPV6_PREFER_SRC_HOME | IPV6_PREFER_SRC_COA);
        if (group == IPV6_PREFER_SRC_HOME)
                prefmask &= ~IPV6_PREFER_SRC_COA;
        else if (group == IPV6_PREFER_SRC_COA)
                pref |= IPV6_PREFER_SRC_COA;
        else if (group)
                return -EINVAL;

        /* check CGA/NONCGA conflicts */
        group = val & (IPV6_PREFER_SRC_CGA | IPV6_PREFER_SRC_NONCGA);
        if (group &&
            group != IPV6_PREFER_SRC_CGA &&
            group != IPV6_PREFER_SRC_NONCGA)
                return -EINVAL;

        WRITE_ONCE(inet6_sk(sk)->srcprefs,
                   (READ_ONCE(inet6_sk(sk)->srcprefs) & prefmask) | pref);
        return 0;
}

/*
 * Turn on the rxopt.bits.rxinfo flag of @sk's inet6 state.  The write is
 * done with the socket locked via lock_sock()/release_sock().
 * NOTE(review): presumably the in-kernel equivalent of the IPV6_RECVPKTINFO
 * socket option -- confirm against the setsockopt path.
 */
static inline void ip6_sock_set_recvpktinfo(struct sock *sk)
{
        lock_sock(sk);
        inet6_sk(sk)->rxopt.bits.rxinfo = true;
        release_sock(sk);
}

#endif /* _NET_IPV6_H */








































































































    1 








































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * fs/kernfs/kernfs-internal.h - kernfs internal header file
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de>
 */

#ifndef __KERNFS_INTERNAL_H
#define __KERNFS_INTERNAL_H

#include <linux/lockdep.h>
#include <linux/fs.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/xattr.h>

#include <linux/kernfs.h>
#include <linux/fs_context.h>

/*
 * Attribute record used by kernfs: owner ids, the three inode timestamps
 * and extended-attribute state.
 * NOTE(review): presumably hangs off a kernfs_node and is allocated on
 * demand from kernfs_iattrs_cache -- confirm in inode.c.
 */
struct kernfs_iattrs {
        kuid_t                        ia_uid;
        kgid_t                        ia_gid;
        struct timespec64        ia_atime;
        struct timespec64        ia_mtime;
        struct timespec64        ia_ctime;

        struct simple_xattrs        xattrs;
        /* presumably count / total size of user.* xattrs -- TODO confirm */
        atomic_t                nr_user_xattrs;
        atomic_t                user_xattr_size;
};

/*
 * Per-hierarchy kernfs state.  kernfs_root() returns the instance a given
 * kernfs_node belongs to (via the node's dir.root pointer).
 */
struct kernfs_root {
        /* published fields */
        struct kernfs_node        *kn;
        unsigned int                flags;        /* KERNFS_ROOT_* flags */

        /* private fields, do not use outside kernfs proper */
        struct idr                ino_idr;
        u32                        last_id_lowbits;
        u32                        id_highbits;
        struct kernfs_syscall_ops *syscall_ops;

        /*
         * list of kernfs_super_info of this root, protected by kernfs_rwsem
         *
         * NOTE(review): this struct also carries a dedicated
         * kernfs_supers_rwsem; confirm which lock actually guards this
         * list and update the sentence above if it is stale.
         */
        struct list_head        supers;

        wait_queue_head_t        deactivate_waitq;
        struct rw_semaphore        kernfs_rwsem;
        struct rw_semaphore        kernfs_iattr_rwsem;
        struct rw_semaphore        kernfs_supers_rwsem;

        struct rcu_head                rcu;
};

/*
 * +1 to avoid triggering overflow warning when negating it: -INT_MIN is not
 * representable in an int, whereas -(INT_MIN + 1) is exactly INT_MAX.
 */
#define KN_DEACTIVATED_BIAS                (INT_MIN + 1)

/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */

/**
 * kernfs_root - look up the kernfs_root that owns a kernfs_node
 * @kn: kernfs_node of interest
 *
 * The root pointer is read from a directory node's dir.root.  When @kn has
 * a parent, the parent is used as that directory; otherwise @kn itself is.
 *
 * Return: the kernfs_root @kn belongs to.
 */
static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
{
        /* a parent, when present, is always a dir; a parentless @kn is one */
        struct kernfs_node *dir_kn = kn->parent ? kn->parent : kn;

        return dir_kn->dir.root;
}

/*
 * mount.c
 */
/*
 * Per-super_block kernfs data.  The kernfs_info() macro retrieves it from a
 * super_block's s_fs_info pointer.
 */
struct kernfs_super_info {
        struct super_block        *sb;

        /*
         * The root associated with this super_block.  Each super_block is
         * identified by the root and ns it's associated with.
         */
        struct kernfs_root        *root;

        /*
         * Each sb is associated with one namespace tag, currently the
         * network namespace of the task which mounted this kernfs
         * instance.  If multiple tags become necessary, make the following
         * an array and compare kernfs_node tag against every entry.
         */
        const void                *ns;

        /*
         * anchored at kernfs_root->supers, protected by kernfs_rwsem
         *
         * NOTE(review): struct kernfs_root also has a kernfs_supers_rwsem;
         * confirm which lock guards this list linkage.
         */
        struct list_head        node;
};
/*
 * kernfs_info - fetch the kernfs_super_info stored in a super_block
 * @SB: pointer expression whose target has an s_fs_info member
 *
 * @SB is parenthesized in the expansion so that any pointer expression
 * (e.g. "sb + 1" or "&sb_obj"), not just a plain identifier, binds to the
 * "->" operator as the caller intended.
 */
#define kernfs_info(SB) ((struct kernfs_super_info *)((SB)->s_fs_info))

/*
 * Map a dentry to the kernfs_node stored in its inode's i_private.
 *
 * Return: the kernfs_node, or NULL when d_really_is_negative() reports
 * that @dentry has no inode.
 */
static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
{
        return d_really_is_negative(dentry) ? NULL
                                            : d_inode(dentry)->i_private;
}

/*
 * Record @parent's current directory revision (dir.rev) in @dentry->d_time
 * so that kernfs_dir_changed() can later compare the two.
 */
static inline void kernfs_set_rev(struct kernfs_node *parent,
                                  struct dentry *dentry)
{
        dentry->d_time = parent->dir.rev;
}

/*
 * Advance @parent's directory revision counter by one.  The increment is a
 * plain (non-atomic) ++.
 * NOTE(review): callers presumably serialize this under a kernfs lock --
 * confirm at the call sites.
 */
static inline void kernfs_inc_rev(struct kernfs_node *parent)
{
        parent->dir.rev++;
}

/*
 * Tell whether @parent's directory revision has moved on since the value
 * stored in @dentry->d_time.
 *
 * Return: true if the two revisions differ, false if they are equal.
 */
static inline bool kernfs_dir_changed(struct kernfs_node *parent,
                                      struct dentry *dentry)
{
        return parent->dir.rev != dentry->d_time;
}

extern const struct super_operations kernfs_sops;
extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;

/*
 * inode.c
 */
extern const struct xattr_handler * const kernfs_xattr_handlers[];
void kernfs_evict_inode(struct inode *inode);
int kernfs_iop_permission(struct mnt_idmap *idmap,
                          struct inode *inode, int mask);
int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                       struct iattr *iattr);
int kernfs_iop_getattr(struct mnt_idmap *idmap,
                       const struct path *path, struct kstat *stat,
                       u32 request_mask, unsigned int query_flags);
ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);

/*
 * dir.c
 */
extern const struct dentry_operations kernfs_dops;
extern const struct file_operations kernfs_dir_fops;
extern const struct inode_operations kernfs_dir_iops;

struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
void kernfs_put_active(struct kernfs_node *kn);
int kernfs_add_one(struct kernfs_node *kn);
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
                                    const char *name, umode_t mode,
                                    kuid_t uid, kgid_t gid,
                                    unsigned flags);

/*
 * file.c
 */
extern const struct file_operations kernfs_file_fops;

bool kernfs_should_drain_open_files(struct kernfs_node *kn);
void kernfs_drain_open_files(struct kernfs_node *kn);

/*
 * symlink.c
 */
extern const struct inode_operations kernfs_symlink_iops;

/*
 * kernfs locks
 */
extern struct kernfs_global_locks *kernfs_locks;
#endif        /* __KERNFS_INTERNAL_H */






















































































    3 





































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * descriptor table internals; you almost certainly want file.h instead.
 */

#ifndef __LINUX_FDTABLE_H
#define __LINUX_FDTABLE_H

#include <linux/posix_types.h>
#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/nospec.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/fs.h>

#include <linux/atomic.h>

/*
 * The default fd array needs to be at least BITS_PER_LONG,
 * as this is the granularity returned by copy_fdset().
 *
 * NR_OPEN_DEFAULT also sizes the fd_array[] embedded in struct files_struct.
 * NR_OPEN_MAX is an unsigned int with every bit set.
 */
#define NR_OPEN_DEFAULT BITS_PER_LONG
#define NR_OPEN_MAX ~0U

/*
 * One descriptor table: the array of struct file pointers plus the bitmaps
 * that go with it.  Reached through files_struct->fdt via files_fdtable().
 */
struct fdtable {
        /* bound used by files_lookup_fd_raw() when indexing fd[] */
        unsigned int max_fds;
        struct file __rcu **fd;      /* current fd array */
        /* bitmap indexed by fd; tested by close_on_exec() */
        unsigned long *close_on_exec;
        /* presumably bitmaps of in-use fds and of full open_fds words -- TODO confirm */
        unsigned long *open_fds;
        unsigned long *full_fds_bits;
        struct rcu_head rcu;
};

/*
 * Open file table structure
 *
 * Holds an RCU-managed pointer to the current fdtable (fdt) together with
 * an embedded fdtable and fixed-size initial arrays.
 * NOTE(review): the *_init[] arrays and fd_array[] presumably back the
 * embedded fdtab until the table is grown -- confirm in fs/file.c.
 */
struct files_struct {
  /*
   * read mostly part
   */
        atomic_t count;
        bool resize_in_progress;
        wait_queue_head_t resize_wait;

        /* dereference through files_fdtable(), not directly */
        struct fdtable __rcu *fdt;
        struct fdtable fdtab;
  /*
   * written part on a separate cache line in SMP
   */
        /* the lock that files_fdtable()'s lockdep condition checks for */
        spinlock_t file_lock ____cacheline_aligned_in_smp;
        unsigned int next_fd;
        unsigned long close_on_exec_init[1];
        unsigned long open_fds_init[1];
        unsigned long full_fds_bits_init[1];
        struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};

struct file_operations;
struct vfsmount;
struct dentry;

/*
 * rcu_dereference_check_fdtable - RCU-dereference @fdtfd, passing
 * "@files->file_lock is held" to rcu_dereference_check() as the lockdep
 * condition.
 */
#define rcu_dereference_check_fdtable(files, fdtfd) \
        rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock))

/*
 * files_fdtable - fetch the current fdtable of @files through
 * rcu_dereference_check_fdtable().
 */
#define files_fdtable(files) \
        rcu_dereference_check_fdtable((files), (files)->fdt)

/*
 * files_lookup_fd_raw - look up the struct file for @fd without any lockdep
 * checking.
 * @files: file table to search
 * @fd:    descriptor number
 *
 * The caller must ensure that fd table isn't shared or hold rcu or file lock
 *
 * The bounds check is done with a mask from array_index_mask_nospec() rather
 * than a branch: both the array index and the loaded pointer are AND-ed
 * with the mask.
 *
 * Return: the pointer stored in the table slot for an in-range @fd, or NULL
 * for an @fd that is not below fdt->max_fds.
 */
static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd)
{
        struct fdtable *fdt = rcu_dereference_raw(files->fdt);
        unsigned long mask = array_index_mask_nospec(fd, fdt->max_fds);
        struct file *needs_masking;

        /*
         * 'mask' is zero for an out-of-bounds fd, all ones for ok.
         * 'fd&mask' is 'fd' for ok, or 0 for out of bounds.
         *
         * Accessing fdt->fd[0] is ok, but needs masking of the result.
         */
        needs_masking = rcu_dereference_raw(fdt->fd[fd&mask]);
        /* out-of-bounds: mask == 0 turns whatever slot 0 held into NULL */
        return (struct file *)(mask & (unsigned long)needs_masking);
}

/*
 * files_lookup_fd_locked - files_lookup_fd_raw() for callers that hold
 * @files->file_lock.
 *
 * Emits an RCU lockdep warning when file_lock is not held, then performs
 * the raw lookup regardless.
 *
 * Return: whatever files_lookup_fd_raw() returns for @fd.
 */
static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd)
{
        RCU_LOCKDEP_WARN(!lockdep_is_held(&files->file_lock),
                           "suspicious rcu_dereference_check() usage");
        return files_lookup_fd_raw(files, fd);
}

struct file *lookup_fdget_rcu(unsigned int fd);
struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd);
struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd);

static inline bool close_on_exec(unsigned int fd, const struct files_struct *files)
{
        return test_bit(fd, files_fdtable(files)->close_on_exec);
}

struct task_struct;

void put_files_struct(struct files_struct *fs);
int unshare_files(void);
struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy;
void do_close_on_exec(struct files_struct *);
int iterate_fd(struct files_struct *, unsigned,
                int (*)(const void *, struct file *, unsigned),
                const void *);

extern int close_fd(unsigned int fd);
extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
extern struct file *file_close_fd(unsigned int fd);
extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
                      struct files_struct **new_fdp);

extern struct kmem_cache *files_cachep;

#endif /* __LINUX_FDTABLE_H */




























































    1 


























































































    1 











































































































































    1 













    1 



























    1 





    1 























    1 


























































    1 


























    1 














    1 























    1 

















    1 




















    1 





























































































































































































    1 














    1 














    1 



    1 

































































































































































































































































































































































































































    1 


















    1 







    1 

    1 
















    1 
















    1 




















    1 























































































































































































































































    1 






















    1 








    1 


















































































































































































































































































































    1 
























    1 



    1 




































    1 

























    1 







    1 






























    1 








































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/transaction.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Generic filesystem transaction handling code; part of the ext2fs
 * journaling system.
 *
 * This file manages transactions (compound commits managed by the
 * journaling code) and handles (individual atomic operations by the
 * filesystem).
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/backing-dev.h>
#include <linux/bug.h>
#include <linux/module.h>
#include <linux/sched/mm.h>

#include <trace/events/jbd2.h>

static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
static void __jbd2_journal_unfile_buffer(struct journal_head *jh);

static struct kmem_cache *transaction_cache;
/*
 * Create the "jbd2_transaction_s" slab cache, sized for transaction_t, and
 * store it in transaction_cache.  Asserts that no cache exists yet.
 *
 * Return: 0 on success, -ENOMEM (after logging at emergency level) when the
 * cache cannot be created.
 */
int __init jbd2_journal_init_transaction_cache(void)
{
        J_ASSERT(!transaction_cache);

        transaction_cache = kmem_cache_create("jbd2_transaction_s",
                                              sizeof(transaction_t), 0,
                                              SLAB_HWCACHE_ALIGN | SLAB_TEMPORARY,
                                              NULL);
        if (transaction_cache)
                return 0;

        pr_emerg("JBD2: failed to create transaction cache\n");
        return -ENOMEM;
}

/*
 * Destroy the transaction slab cache and reset the pointer to NULL, which
 * satisfies the assertion in jbd2_journal_init_transaction_cache() should
 * the cache be created again.
 */
void jbd2_journal_destroy_transaction_cache(void)
{
        kmem_cache_destroy(transaction_cache);
        transaction_cache = NULL;
}

/*
 * Hand @transaction back to the transaction slab cache.  Pointers for which
 * ZERO_OR_NULL_PTR() is true are ignored.
 */
void jbd2_journal_free_transaction(transaction_t *transaction)
{
        if (!unlikely(ZERO_OR_NULL_PTR(transaction)))
                kmem_cache_free(transaction_cache, transaction);
}

/*
 * Base number of descriptor blocks reserved for each transaction.
 *
 * Works out how many block tags fit into one journal block once the journal
 * header, a 16-byte UUID, the optional checksum tail and 16 bytes of slack
 * are taken off, then returns the number of such blocks needed for
 * j_max_transaction_buffers tags, plus one.
 */
static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
{
        int room = journal->j_blocksize - sizeof(journal_header_t);
        int per_block;

        /* the UUID takes 16 bytes */
        room -= 16;
        /* v2/v3 checksums append a block tail */
        if (jbd2_journal_has_csum_v2or3(journal))
                room -= sizeof(struct jbd2_journal_block_tail);
        /* commit code leaves 16 bytes of slack at the end of the block */
        room -= 16;

        per_block = room / journal_tag_bytes(journal);

        /*
         * Revoke descriptors are accounted separately, so only the commit
         * block (the leading 1) and the ordinary descriptor blocks are
         * reserved here.
         */
        return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers,
                                per_block);
}

/*
 * jbd2_get_transaction: set up a new running transaction.
 *
 * The caller supplies @transaction; this function only initialises it.
 * It is put in T_RUNNING state, given the next transaction sequence number,
 * and installed as journal->j_running_transaction (which must be NULL on
 * entry: we only make a new transaction once we have started to commit the
 * old one).  The journal's commit timer is armed for the new transaction's
 * expiry time.
 *
 * Preconditions:
 *        The journal MUST be locked.  We don't perform atomic mallocs on the
 *        new transaction and we can't block without protecting against other
 *        processes trying to touch the journal while it is in transition.
 *
 */

static void jbd2_get_transaction(journal_t *journal,
                                transaction_t *transaction)
{
        transaction->t_journal = journal;
        transaction->t_state = T_RUNNING;
        transaction->t_start_time = ktime_get();
        /* consume the next tid from the journal's sequence counter */
        transaction->t_tid = journal->j_transaction_sequence++;
        transaction->t_expires = jiffies + journal->j_commit_interval;
        atomic_set(&transaction->t_updates, 0);
        /* start with the base descriptor blocks plus the journal's reserved credits */
        atomic_set(&transaction->t_outstanding_credits,
                   jbd2_descriptor_blocks_per_trans(journal) +
                   atomic_read(&journal->j_reserved_credits));
        atomic_set(&transaction->t_outstanding_revokes, 0);
        atomic_set(&transaction->t_handle_count, 0);
        INIT_LIST_HEAD(&transaction->t_inode_list);
        INIT_LIST_HEAD(&transaction->t_private_list);

        /* Set up the commit timer for the new transaction. */
        journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
        add_timer(&journal->j_commit_timer);

        J_ASSERT(journal->j_running_transaction == NULL);
        journal->j_running_transaction = transaction;
        /* wait statistics start from zero; t_start is what update_t_max_wait() compares against */
        transaction->t_max_wait = 0;
        transaction->t_start = jiffies;
        transaction->t_requested = 0;
}

/*
 * Handle management.
 *
 * A handle_t is an object which represents a single atomic update to a
 * filesystem, and which tracks all of the modifications which form part
 * of that one update.
 */

/*
 * Update transaction's maximum wait time, if debugging is enabled.
 *
 * t_max_wait is carefully updated here with use of atomic compare exchange.
 * Note that there could be multiplre threads trying to do this simultaneously
 * hence using cmpxchg to avoid any use of locks in this case.
 * With this t_max_wait can be updated w/o enabling jbd2_journal_enable_debug.
 */
static inline void update_t_max_wait(transaction_t *transaction,
                                     unsigned long ts)
{
        unsigned long waited, seen;

        /* Nothing to record unless the transaction started after @ts. */
        if (!time_after(transaction->t_start, ts))
                return;

        waited = jbd2_time_diff(ts, transaction->t_start);

        /*
         * Lockless maximum: keep trying to install our value for as long as
         * the value we last observed is smaller than ours.
         */
        for (seen = READ_ONCE(transaction->t_max_wait); seen < waited; )
                seen = cmpxchg(&transaction->t_max_wait, seen, waited);
}

/*
 * Wait until running transaction passes to T_FLUSH state and new transaction
 * can thus be started. Also starts the commit if needed. The function expects
 * running transaction to exist and releases j_state_lock.
 */
static void wait_transaction_locked(journal_t *journal)
        __releases(journal->j_state_lock)
{
        DEFINE_WAIT(waiter);
        tid_t running_tid = journal->j_running_transaction->t_tid;
        int commit_requested;

        /* Get on the wait queue while j_state_lock is still held. */
        prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &waiter,
                        TASK_UNINTERRUPTIBLE);
        commit_requested = tid_geq(journal->j_commit_request, running_tid);
        read_unlock(&journal->j_state_lock);

        /* Nobody asked for a commit of this tid yet, so ask for it now. */
        if (!commit_requested)
                jbd2_log_start_commit(journal, running_tid);

        jbd2_might_wait_for_commit(journal);
        schedule();
        finish_wait(&journal->j_wait_transaction_locked, &waiter);
}

/*
 * Wait until running transaction transitions from T_SWITCH to T_FLUSH
 * state and new transaction can thus be started. The function releases
 * j_state_lock.
 */
static void wait_transaction_switching(journal_t *journal)
        __releases(journal->j_state_lock)
{
        DEFINE_WAIT(waiter);
        transaction_t *running = journal->j_running_transaction;

        /* Only valid while a running transaction sits in T_SWITCH. */
        if (WARN_ON(!running || running->t_state != T_SWITCH)) {
                read_unlock(&journal->j_state_lock);
                return;
        }

        prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &waiter,
                        TASK_UNINTERRUPTIBLE);
        read_unlock(&journal->j_state_lock);

        /*
         * jbd2_might_wait_for_commit() is deliberately not called: in
         * T_SWITCH state nothing waits for outstanding handles anymore, and
         * the handling of reserved handles relies on that for correctness.
         */
        schedule();
        finish_wait(&journal->j_wait_transaction_locked, &waiter);
}

/*
 * Give @blocks credits back to the journal-wide reserved-credits counter and
 * wake up anybody sleeping on j_wait_reserved.
 */
static void sub_reserved_credits(journal_t *journal, int blocks)
{
        atomic_sub(blocks, &journal->j_reserved_credits);
        /* Sleepers on this queue wait on conditions over j_reserved_credits. */
        wake_up(&journal->j_wait_reserved);
}

/*
 * Wait until we can add credits for handle to the running transaction.  Called
 * with j_state_lock held for reading. Returns 0 if handle joined the running
 * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
 * caller must retry.
 *
 * Note: because j_state_lock may be dropped depending on the return
 * value, we need to fake out sparse so it doesn't complain about a
 * locking imbalance.  Callers of add_transaction_credits will need to
 * make a similar accommodation.
 */
static int add_transaction_credits(journal_t *journal, int blocks,
                                   int rsv_blocks)
__must_hold(&journal->j_state_lock)
{
        transaction_t *t = journal->j_running_transaction;
        int needed;
        /* Everything charged to the transaction: own credits + reservation. */
        int total = blocks + rsv_blocks;

        /*
         * If the current transaction is locked down for commit, wait
         * for the lock to be released.
         */
        if (t->t_state != T_RUNNING) {
                WARN_ON_ONCE(t->t_state >= T_FLUSH);
                /* Releases j_state_lock and sleeps; caller has to retry. */
                wait_transaction_locked(journal);
                __acquire(&journal->j_state_lock); /* fake out sparse */
                return 1;
        }

        /*
         * If there is not enough space left in the log to write all
         * potential buffers requested by this operation, we need to
         * stall pending a log checkpoint to free some more log space.
         *
         * The credits are charged optimistically first; every failure
         * path below subtracts them again before sleeping.
         */
        needed = atomic_add_return(total, &t->t_outstanding_credits);
        if (needed > journal->j_max_transaction_buffers) {
                /*
                 * If the current transaction is already too large,
                 * then start to commit it: we can then go back and
                 * attach this handle to a new transaction.
                 */
                atomic_sub(total, &t->t_outstanding_credits);

                /*
                 * Is the number of reserved credits in the current transaction too
                 * big to fit this handle? Wait until reserved credits are freed.
                 */
                if (atomic_read(&journal->j_reserved_credits) + total >
                    journal->j_max_transaction_buffers) {
                        read_unlock(&journal->j_state_lock);
                        jbd2_might_wait_for_commit(journal);
                        /* Woken by sub_reserved_credits() via j_wait_reserved. */
                        wait_event(journal->j_wait_reserved,
                                   atomic_read(&journal->j_reserved_credits) + total <=
                                   journal->j_max_transaction_buffers);
                        __acquire(&journal->j_state_lock); /* fake out sparse */
                        return 1;
                }

                /* Releases j_state_lock, may start the commit, then sleeps. */
                wait_transaction_locked(journal);
                __acquire(&journal->j_state_lock); /* fake out sparse */
                return 1;
        }

        /*
         * The commit code assumes that it can get enough log space
         * without forcing a checkpoint.  This is *critical* for
         * correctness: a checkpoint of a buffer which is also
         * associated with a committing transaction creates a deadlock,
         * so commit simply cannot force through checkpoints.
         *
         * We must therefore ensure the necessary space in the journal
         * *before* starting to dirty potentially checkpointed buffers
         * in the new transaction.
         */
        if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) {
                atomic_sub(total, &t->t_outstanding_credits);
                read_unlock(&journal->j_state_lock);
                jbd2_might_wait_for_commit(journal);
                /*
                 * Swap the read lock for the write lock and re-check, since
                 * the space may have changed while no lock was held.
                 */
                write_lock(&journal->j_state_lock);
                if (jbd2_log_space_left(journal) <
                                        journal->j_max_transaction_buffers)
                        __jbd2_log_wait_for_space(journal);
                write_unlock(&journal->j_state_lock);
                __acquire(&journal->j_state_lock); /* fake out sparse */
                return 1;
        }

        /* No reservation? We are done... */
        if (!rsv_blocks)
                return 0;

        needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
        /* We allow at most half of a transaction to be reserved */
        if (needed > journal->j_max_transaction_buffers / 2) {
                /* Undo both the reservation and the transaction charge. */
                sub_reserved_credits(journal, rsv_blocks);
                atomic_sub(total, &t->t_outstanding_credits);
                read_unlock(&journal->j_state_lock);
                jbd2_might_wait_for_commit(journal);
                wait_event(journal->j_wait_reserved,
                         atomic_read(&journal->j_reserved_credits) + rsv_blocks
                         <= journal->j_max_transaction_buffers / 2);
                __acquire(&journal->j_state_lock); /* fake out sparse */
                return 1;
        }
        return 0;
}

/*
 * start_this_handle: Given a handle, deal with any locking or stalling
 * needed to make sure that there is enough journal space for the handle
 * to begin.  Attach the handle to a transaction and set up the
 * transaction's buffer credits.
 */

static int start_this_handle(journal_t *journal, handle_t *handle,
                             gfp_t gfp_mask)
{
        transaction_t        *transaction, *new_transaction = NULL;
        int                blocks = handle->h_total_credits;
        int                rsv_blocks = 0;
        /* Entry time; used for the t_max_wait statistic once we have joined. */
        unsigned long ts = jiffies;

        /*
         * Returns 0 on success, -ENOSPC if the request exceeds the
         * transaction size limits, -ENOMEM if a new transaction could not
         * be allocated, and -EROFS if the journal is aborted or carries an
         * unacknowledged error.
         */

        if (handle->h_rsv_handle)
                rsv_blocks = handle->h_rsv_handle->h_total_credits;

        /*
         * Limit the number of reserved credits to 1/2 of maximum transaction
         * size and limit the number of total credits to not exceed maximum
         * transaction size per operation.
         */
        if ((rsv_blocks > journal->j_max_transaction_buffers / 2) ||
            (rsv_blocks + blocks > journal->j_max_transaction_buffers)) {
                printk(KERN_ERR "JBD2: %s wants too many credits "
                       "credits:%d rsv_credits:%d max:%d\n",
                       current->comm, blocks, rsv_blocks,
                       journal->j_max_transaction_buffers);
                WARN_ON(1);
                return -ENOSPC;
        }

alloc_transaction:
        /*
         * This check is racy but it is just an optimization of allocating new
         * transaction early if there are high chances we'll need it. If we
         * guess wrong, we'll retry or free unused transaction.
         */
        if (!data_race(journal->j_running_transaction)) {
                /*
                 * If __GFP_FS is not present, then we may be being called from
                 * inside the fs writeback layer, so we MUST NOT fail.
                 */
                if ((gfp_mask & __GFP_FS) == 0)
                        gfp_mask |= __GFP_NOFAIL;
                new_transaction = kmem_cache_zalloc(transaction_cache,
                                                    gfp_mask);
                if (!new_transaction)
                        return -ENOMEM;
        }

        jbd2_debug(3, "New handle %p going live.\n", handle);

        /*
         * We need to hold j_state_lock until t_updates has been incremented,
         * for proper journal barrier handling
         */
repeat:
        read_lock(&journal->j_state_lock);
        BUG_ON(journal->j_flags & JBD2_UNMOUNT);
        if (is_journal_aborted(journal) ||
            (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
                read_unlock(&journal->j_state_lock);
                /* new_transaction may still be NULL here. */
                jbd2_journal_free_transaction(new_transaction);
                return -EROFS;
        }

        /*
         * Wait on the journal's transaction barrier if necessary. Specifically
         * we allow reserved handles to proceed because otherwise commit could
         * deadlock on page writeback not being able to complete.
         */
        if (!handle->h_reserved && journal->j_barrier_count) {
                read_unlock(&journal->j_state_lock);
                wait_event(journal->j_wait_transaction_locked,
                                journal->j_barrier_count == 0);
                goto repeat;
        }

        if (!journal->j_running_transaction) {
                read_unlock(&journal->j_state_lock);
                /* The early allocation guess was wrong; allocate now. */
                if (!new_transaction)
                        goto alloc_transaction;
                /*
                 * Installing a running transaction needs the write lock.
                 * Both conditions are re-checked because the lock was
                 * dropped in between.
                 */
                write_lock(&journal->j_state_lock);
                if (!journal->j_running_transaction &&
                    (handle->h_reserved || !journal->j_barrier_count)) {
                        jbd2_get_transaction(journal, new_transaction);
                        /* Now owned by the journal; must not be freed below. */
                        new_transaction = NULL;
                }
                write_unlock(&journal->j_state_lock);
                goto repeat;
        }

        transaction = journal->j_running_transaction;

        if (!handle->h_reserved) {
                /* We may have dropped j_state_lock - restart in that case */
                if (add_transaction_credits(journal, blocks, rsv_blocks)) {
                        /*
                         * add_transaction_credits releases
                         * j_state_lock on a non-zero return
                         */
                        __release(&journal->j_state_lock);
                        goto repeat;
                }
        } else {
                /*
                 * We have handle reserved so we are allowed to join T_LOCKED
                 * transaction and we don't have to check for transaction size
                 * and journal space. But we still have to wait while running
                 * transaction is being switched to a committing one as it
                 * won't wait for any handles anymore.
                 */
                if (transaction->t_state == T_SWITCH) {
                        /* Releases j_state_lock before sleeping. */
                        wait_transaction_switching(journal);
                        goto repeat;
                }
                /* The handle stops being a reservation and becomes a normal one. */
                sub_reserved_credits(journal, blocks);
                handle->h_reserved = 0;
        }

        /* OK, account for the buffers that this operation expects to
         * use and add the handle to the running transaction.
         */
        update_t_max_wait(transaction, ts);
        handle->h_transaction = transaction;
        handle->h_requested_credits = blocks;
        handle->h_revoke_credits_requested = handle->h_revoke_credits;
        handle->h_start_jiffies = jiffies;
        atomic_inc(&transaction->t_updates);
        atomic_inc(&transaction->t_handle_count);
        jbd2_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
                  handle, blocks,
                  atomic_read(&transaction->t_outstanding_credits),
                  jbd2_log_space_left(journal));
        read_unlock(&journal->j_state_lock);
        current->journal_info = handle;

        rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_);
        /* Drop the speculative allocation if it was never installed. */
        jbd2_journal_free_transaction(new_transaction);
        /*
         * Ensure that no allocations done while the transaction is open are
         * going to recurse back to the fs layer.
         */
        handle->saved_alloc_context = memalloc_nofs_save();
        return 0;
}

/*
 * Allocate a handle with GFP_NOFS and give it @nblocks total credits and a
 * reference count of one.  Returns NULL if the allocation fails.
 */
static handle_t *new_handle(int nblocks)
{
        handle_t *h = jbd2_alloc_handle(GFP_NOFS);

        if (h) {
                h->h_total_credits = nblocks;
                h->h_ref = 1;
        }

        return h;
}

/*
 * Start a handle on @journal.  If the task already has a current handle, only
 * its reference count is bumped and it is returned.  Otherwise a new handle
 * is allocated with @nblocks credits plus the blocks needed for
 * @revoke_records, optionally paired with a reserved handle of @rsv_blocks
 * credits, and attached via start_this_handle().
 *
 * Returns the handle, or an ERR_PTR(): -EROFS when @journal is NULL, -ENOMEM
 * on allocation failure, or the error from start_this_handle().
 */
handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
                              int revoke_records, gfp_t gfp_mask,
                              unsigned int type, unsigned int line_no)
{
        handle_t *handle = journal_current_handle();
        handle_t *rsv_handle;
        int err;

        if (!journal)
                return ERR_PTR(-EROFS);

        /* Nested start: reuse the handle the task already holds. */
        if (handle) {
                J_ASSERT(handle->h_transaction->t_journal == journal);
                handle->h_ref++;
                return handle;
        }

        /* Revoke records consume descriptor blocks; budget for them too. */
        nblocks += DIV_ROUND_UP(revoke_records,
                                journal->j_revoke_records_per_block);

        handle = new_handle(nblocks);
        if (!handle)
                return ERR_PTR(-ENOMEM);

        if (rsv_blocks) {
                rsv_handle = new_handle(rsv_blocks);
                if (!rsv_handle) {
                        err = -ENOMEM;
                        goto out_free_handle;
                }
                rsv_handle->h_reserved = 1;
                rsv_handle->h_journal = journal;
                handle->h_rsv_handle = rsv_handle;
        }
        handle->h_revoke_credits = revoke_records;

        err = start_this_handle(journal, handle, gfp_mask);
        if (err < 0)
                goto out_free_rsv;

        handle->h_type = type;
        handle->h_line_no = line_no;
        trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
                                handle->h_transaction->t_tid, type,
                                line_no, nblocks);

        return handle;

out_free_rsv:
        if (handle->h_rsv_handle)
                jbd2_free_handle(handle->h_rsv_handle);
out_free_handle:
        jbd2_free_handle(handle);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(jbd2__journal_start);


/**
 * jbd2_journal_start() - Obtain a new handle.
 * @journal: Journal to start transaction on.
 * @nblocks: number of block buffers we might modify
 *
 * We make sure that the transaction can guarantee at least nblocks of
 * modified buffers in the log.  We block until the log can guarantee
 * that much space. Additionally, if rsv_blocks > 0, we also create another
 * handle with rsv_blocks reserved blocks in the journal. This handle is
 * stored in h_rsv_handle. It is not attached to any particular transaction
 * and thus doesn't block transaction commit. If the caller uses this reserved
 * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
 * on the parent handle will dispose the reserved one. Reserved handle has to
 * be converted to a normal handle using jbd2_journal_start_reserved() before
 * it can be used.
 *
 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
 * on failure.
 */
handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
{
        /*
         * Plain start: no reserved blocks, no revoke records, GFP_NOFS
         * allocations, and zero for the handle type / line number.
         */
        return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0);
}
EXPORT_SYMBOL(jbd2_journal_start);

/*
 * Give back the credits held by reserved @handle: always to the journal's
 * reserved-credits counter, and also to transaction @t when one is passed.
 * Warns if @handle is not marked reserved.
 */
static void __jbd2_journal_unreserve_handle(handle_t *handle, transaction_t *t)
{
        journal_t *journal = handle->h_journal;
        int credits;

        WARN_ON(!handle->h_reserved);

        credits = handle->h_total_credits;
        sub_reserved_credits(journal, credits);
        if (!t)
                return;

        atomic_sub(credits, &t->t_outstanding_credits);
}

/*
 * Dispose of a handle that is still reserved: return its credits to the
 * journal (and to the running transaction, if there is one) and free it.
 * The journal is taken from handle->h_journal.
 */
void jbd2_journal_free_reserved(handle_t *handle)
{
        journal_t *journal = handle->h_journal;

        /* Get j_state_lock to pin running transaction if it exists */
        read_lock(&journal->j_state_lock);
        /* j_running_transaction may be NULL; the helper copes with that. */
        __jbd2_journal_unreserve_handle(handle, journal->j_running_transaction);
        read_unlock(&journal->j_state_lock);
        jbd2_free_handle(handle);
}
EXPORT_SYMBOL(jbd2_journal_free_reserved);

/**
 * jbd2_journal_start_reserved() - start reserved handle
 * @handle: handle to start
 * @type: for handle statistics
 * @line_no: for handle statistics
 *
 * Start handle that has been previously reserved with jbd2_journal_reserve().
 * This attaches @handle to the running transaction (or creates one if there's
 * no transaction running). Unlike jbd2_journal_start() this function cannot
 * block on journal commit, checkpointing, or similar stuff. It can block on
 * memory allocation or frozen journal though.
 *
 * Return 0 on success, non-zero on error - handle is freed in that case.
 */
int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
                                unsigned int line_no)
{
        journal_t *journal = handle->h_journal;
        int ret;

        /* Someone passed in normal handle? Just stop it. */
        if (WARN_ON(!handle->h_reserved)) {
                jbd2_journal_stop(handle);
                return -EIO;
        }

        /*
         * Mixing reserved and unreserved handles is of questionable use and
         * nobody has needed it so far, so refuse when the task already has
         * a handle running.
         */
        if (WARN_ON(current->journal_info)) {
                ret = -EIO;
                goto out_free;
        }

        handle->h_journal = NULL;
        /*
         * GFP_NOFS: callers are likely in writeback or similarly
         * constrained call sites.
         */
        ret = start_this_handle(journal, handle, GFP_NOFS);
        if (ret < 0) {
                /* The free path below looks the journal up via h_journal. */
                handle->h_journal = journal;
                goto out_free;
        }

        handle->h_type = type;
        handle->h_line_no = line_no;
        trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
                                handle->h_transaction->t_tid, type,
                                line_no, handle->h_total_credits);
        return 0;

out_free:
        jbd2_journal_free_reserved(handle);
        return ret;
}
EXPORT_SYMBOL(jbd2_journal_start_reserved);

/**
 * jbd2_journal_extend() - extend buffer credits.
 * @handle:  handle to 'extend'
 * @nblocks: nr blocks to try to extend by.
 * @revoke_records: number of revoke records to try to extend by.
 *
 * Some transactions, such as large extends and truncates, can be done
 * atomically all at once or in several stages.  The operation requests
 * a credit for a number of buffer modifications in advance, but can
 * extend its credit if it needs more.
 *
 * jbd2_journal_extend tries to give the running handle more buffer credits.
 * It does not guarantee that allocation - this is a best-effort only.
 * The calling process MUST be able to deal cleanly with a failure to
 * extend here.
 *
 * Return 0 on success, non-zero on failure.
 *
 * return code < 0 implies an error
 * return code > 0 implies normal transaction-full status.
 */
int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        int extra_revoke_blocks;
        int new_total;
        int result = 1;        /* "transaction full" unless we succeed */

        if (is_handle_aborted(handle))
                return -EROFS;
        journal = transaction->t_journal;

        read_lock(&journal->j_state_lock);

        /* Don't extend a locked-down transaction! */
        if (transaction->t_state != T_RUNNING) {
                jbd2_debug(3, "denied handle %p %d blocks: transaction not running\n",
                          handle, nblocks);
                goto unlock;
        }

        /*
         * Extra descriptor blocks needed for the additional revoke records:
         * blocks for the new requested total minus blocks for the old one.
         */
        extra_revoke_blocks =
                DIV_ROUND_UP(handle->h_revoke_credits_requested + revoke_records,
                             journal->j_revoke_records_per_block) -
                DIV_ROUND_UP(handle->h_revoke_credits_requested,
                             journal->j_revoke_records_per_block);
        nblocks += extra_revoke_blocks;

        /* Charge first, then back out if the transaction would overflow. */
        new_total = atomic_add_return(nblocks,
                                      &transaction->t_outstanding_credits);
        if (new_total > journal->j_max_transaction_buffers) {
                jbd2_debug(3, "denied handle %p %d blocks: transaction too large\n",
                          handle, nblocks);
                atomic_sub(nblocks, &transaction->t_outstanding_credits);
                goto unlock;
        }

        trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
                                 transaction->t_tid,
                                 handle->h_type, handle->h_line_no,
                                 handle->h_total_credits,
                                 nblocks);

        handle->h_total_credits += nblocks;
        handle->h_requested_credits += nblocks;
        handle->h_revoke_credits += revoke_records;
        handle->h_revoke_credits_requested += revoke_records;
        result = 0;

        jbd2_debug(3, "extended handle %p by %d\n", handle, nblocks);
unlock:
        read_unlock(&journal->j_state_lock);
        return result;
}

/*
 * Detach @handle from its transaction: clear the task's journal_info, return
 * the handle's remaining credits (and those of an attached reserved handle)
 * to the transaction, drop the t_updates reference and restore the saved
 * allocation context.
 */
static void stop_this_handle(handle_t *handle)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
        int used_revokes;

        J_ASSERT(journal_current_handle() == handle);
        J_ASSERT(atomic_read(&transaction->t_updates) > 0);
        current->journal_info = NULL;

        /*
         * Keep back credits for the revoke descriptor blocks.  Only the
         * descriptor blocks the transaction will really need are counted,
         * because long runs of transactions with few revokes are common.
         */
        used_revokes = handle->h_revoke_credits_requested -
                       handle->h_revoke_credits;
        if (used_revokes) {
                int per_block = journal->j_revoke_records_per_block;
                int total_revokes;
                int blocks_before, blocks_after;

                WARN_ON_ONCE(DIV_ROUND_UP(used_revokes, per_block)
                                > handle->h_total_credits);
                total_revokes = atomic_add_return(used_revokes,
                                &transaction->t_outstanding_revokes);
                /* Descriptor blocks this handle pushed the transaction over. */
                blocks_after = DIV_ROUND_UP(total_revokes, per_block);
                blocks_before = DIV_ROUND_UP(total_revokes - used_revokes,
                                             per_block);
                handle->h_total_credits -= blocks_after - blocks_before;
        }

        atomic_sub(handle->h_total_credits,
                   &transaction->t_outstanding_credits);

        if (handle->h_rsv_handle)
                __jbd2_journal_unreserve_handle(handle->h_rsv_handle,
                                                transaction);

        /* Last update gone: let waiters on j_wait_updates know. */
        if (atomic_dec_and_test(&transaction->t_updates))
                wake_up(&journal->j_wait_updates);

        rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
        /*
         * The GFP_NOFS scope ends here, so the original allocation context
         * can be restored.
         */
        memalloc_nofs_restore(handle->saved_alloc_context);
}

/**
 * jbd2__journal_restart() - restart a handle.
 * @handle:  handle to restart
 * @nblocks: nr credits requested
 * @revoke_records: number of revoke record credits requested
 * @gfp_mask: memory allocation flags (for start_this_handle)
 *
 * Restart a handle for a multi-transaction filesystem
 * operation.
 *
 * If the jbd2_journal_extend() call above fails to grant new buffer credits
 * to a running handle, a call to jbd2_journal_restart will commit the
 * handle's transaction so far and reattach the handle to a new
 * transaction capable of guaranteeing the requested number of
 * credits. We preserve reserved handle if there's any attached to the
 * passed in handle.
 */
int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
                          gfp_t gfp_mask)
{
        transaction_t *old_transaction = handle->h_transaction;
        journal_t *journal;
        tid_t old_tid;
        int commit_requested;
        int err;

        /* After an abort of any type, do not actually restart anything. */
        if (is_handle_aborted(handle))
                return 0;
        journal = old_transaction->t_journal;
        old_tid = old_transaction->t_tid;

        /*
         * Step one: unlink the handle from its current transaction and get
         * the commit of that transaction going.
         */
        jbd2_debug(2, "restarting handle %p\n", handle);
        stop_this_handle(handle);
        handle->h_transaction = NULL;

        /*
         * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can
         * get rid of pointless j_state_lock traffic like this.
         */
        read_lock(&journal->j_state_lock);
        commit_requested = tid_geq(journal->j_commit_request, old_tid);
        read_unlock(&journal->j_state_lock);
        if (!commit_requested)
                jbd2_log_start_commit(journal, old_tid);

        /* Step two: re-arm the handle's credits and attach it again. */
        handle->h_total_credits = nblocks +
                DIV_ROUND_UP(revoke_records,
                             journal->j_revoke_records_per_block);
        handle->h_revoke_credits = revoke_records;
        err = start_this_handle(journal, handle, gfp_mask);

        /* On failure there is no transaction, so tid 0 is traced. */
        trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev,
                                 err ? 0 : handle->h_transaction->t_tid,
                                 handle->h_type, handle->h_line_no,
                                 handle->h_total_credits);
        return err;
}
EXPORT_SYMBOL(jbd2__journal_restart);


/*
 * Convenience form of jbd2__journal_restart(): no revoke record credits and
 * GFP_NOFS allocations.  Returns whatever jbd2__journal_restart() returns.
 */
int jbd2_journal_restart(handle_t *handle, int nblocks)
{
        return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS);
}
EXPORT_SYMBOL(jbd2_journal_restart);

/*
 * Waits for any outstanding t_updates to finish.
 * This is called with write j_state_lock held.
 */
void jbd2_journal_wait_updates(journal_t *journal)
{
        DEFINE_WAIT(waiter);
        transaction_t *running;

        /*
         * j_running_transaction is fetched afresh on every pass.  The
         * running transaction can be freed under us when it gets committed
         * (jbd2_journal_commit_transaction() ->
         * jbd2_journal_free_transaction()), which can only happen in the
         * window where j_state_lock is released around schedule().  Reusing
         * a pointer from before that window could be a use-after-free.
         */
        while ((running = journal->j_running_transaction) != NULL) {
                prepare_to_wait(&journal->j_wait_updates, &waiter,
                                TASK_UNINTERRUPTIBLE);

                /* No updates outstanding: nothing left to wait for. */
                if (atomic_read(&running->t_updates) == 0) {
                        finish_wait(&journal->j_wait_updates, &waiter);
                        break;
                }

                write_unlock(&journal->j_state_lock);
                schedule();
                finish_wait(&journal->j_wait_updates, &waiter);
                write_lock(&journal->j_state_lock);
        }
}

/**
 * jbd2_journal_lock_updates () - establish a transaction barrier.
 * @journal:  Journal to establish a barrier on.
 *
 * This locks out any further updates from being started, and blocks
 * until all existing updates have completed, returning only once the
 * journal is in a quiescent state with no updates running.
 *
 * The journal lock should not be held on entry.
 */
void jbd2_journal_lock_updates(journal_t *journal)
{
        jbd2_might_wait_for_commit(journal);

        write_lock(&journal->j_state_lock);
        /*
         * Raise the barrier first: start_this_handle() makes non-reserved
         * handles wait for as long as j_barrier_count is non-zero.
         */
        ++journal->j_barrier_count;

        /* Wait until there are no reserved handles */
        if (atomic_read(&journal->j_reserved_credits)) {
                /* Sleep without j_state_lock, then take it again. */
                write_unlock(&journal->j_state_lock);
                wait_event(journal->j_wait_reserved,
                           atomic_read(&journal->j_reserved_credits) == 0);
                write_lock(&journal->j_state_lock);
        }

        /* Wait until there are no running t_updates */
        jbd2_journal_wait_updates(journal);

        write_unlock(&journal->j_state_lock);

        /*
         * We have now established a barrier against other normal updates, but
         * we also need to barrier against other jbd2_journal_lock_updates() calls
         * to make sure that we serialise special journal-locked operations
         * too.
         */
        mutex_lock(&journal->j_barrier);
}

/**
 * jbd2_journal_unlock_updates () - release barrier
 * @journal:  Journal to release the barrier on.
 *
 * Release a transaction barrier obtained with jbd2_journal_lock_updates().
 *
 * Should be called without the journal lock held.
 */
void jbd2_journal_unlock_updates (journal_t *journal)
{
        /* Must pair with an earlier jbd2_journal_lock_updates(). */
        J_ASSERT(journal->j_barrier_count != 0);

        /* Release in the reverse order of jbd2_journal_lock_updates(). */
        mutex_unlock(&journal->j_barrier);
        write_lock(&journal->j_state_lock);
        --journal->j_barrier_count;
        write_unlock(&journal->j_state_lock);
        /*
         * Handle starters blocked on the barrier sleep on this queue until
         * j_barrier_count reaches zero; wake all of them to re-check.
         */
        wake_up_all(&journal->j_wait_transaction_locked);
}

/*
 * Log a KERN_WARNING naming the device and block number of a metadata buffer
 * that was found dirty.
 */
static void warn_dirty_buffer(struct buffer_head *bh)
{
        printk(KERN_WARNING
               "JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). "
               "There's a risk of filesystem corruption in case of system "
               "crash.\n",
               /* b_blocknr is cast to match the %llu conversion */
               bh->b_bdev, (unsigned long long)bh->b_blocknr);
}

/*
 * Snapshot the buffer behind @jh into jh->b_frozen_data, firing the frozen
 * trigger on the mapped data right before the copy is taken.
 */
static void jbd2_freeze_jh_data(struct journal_head *jh)
{
        struct buffer_head *bh = jh2bh(jh);
        char *mapped;

        J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");

        mapped = kmap_local_folio(bh->b_folio, bh_offset(bh));
        /* The trigger sees the live data just before it is copied out. */
        jbd2_buffer_frozen_trigger(jh, mapped, jh->b_triggers);
        memcpy(jh->b_frozen_data, mapped, bh->b_size);
        kunmap_local(mapped);

        /*
         * The frozen copy is saved; record the triggers that go with it.
         */
        jh->b_frozen_triggers = jh->b_triggers;
}

/*
 * If the buffer is already part of the current transaction, then there
 * is nothing we need to do.  If it is already part of a prior
 * transaction which we are still committing to disk, then we need to
 * make sure that we do not overwrite the old copy: we do copy-out to
 * preserve the copy going to disk.  We also account the buffer against
 * the handle's metadata buffer credits (unless the buffer is already
 * part of the transaction, that is).
 *
 * NOTE(review): no credit accounting is visible in this function;
 * handle->h_total_credits is decremented in jbd2_journal_dirty_metadata().
 * The sentence about credits above looks stale -- confirm.
 *
 * @handle:     handle whose transaction (handle->h_transaction) wants to
 *              modify the buffer
 * @jh:         journal head of the buffer
 * @force_copy: when non-zero, generate b_frozen_data even if the buffer is
 *              not on the BJ_Metadata list (jbd2_journal_get_undo_access()
 *              passes 1, jbd2_journal_get_write_access() passes 0)
 *
 * Returns 0 on success, or -EROFS if the handle has been aborted.
 */
static int
do_get_write_access(handle_t *handle, struct journal_head *jh,
                        int force_copy)
{
        struct buffer_head *bh;
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        int error;
        /* Copy-out buffer allocated with the locks dropped; NULL if unused. */
        char *frozen_buffer = NULL;
        unsigned long start_lock, time_lock;

        journal = transaction->t_journal;

        jbd2_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);

        JBUFFER_TRACE(jh, "entry");
        /*
         * We come back here after sleeping on BH_Shadow and after allocating
         * frozen_buffer.  Both paths dropped b_state_lock (and the buffer
         * lock), so all of the state below is re-examined from scratch.
         */
repeat:
        bh = jh2bh(jh);

        /* @@@ Need to check for errors here at some point. */

         start_lock = jiffies;
        lock_buffer(bh);
        spin_lock(&jh->b_state_lock);

        /* If it takes too long to lock the buffer, trace it */
        time_lock = jbd2_time_diff(start_lock, jiffies);
        if (time_lock > HZ/10)
                trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
                        jiffies_to_msecs(time_lock));

        /* We now hold the buffer lock so it is safe to query the buffer
         * state.  Is the buffer dirty?
         *
         * If so, there are two possibilities.  The buffer may be
         * non-journaled, and undergoing a quite legitimate writeback.
         * Otherwise, it is journaled, and we don't expect dirty buffers
         * in that state (the buffers should be marked JBD_Dirty
         * instead.)  So either the IO is being done under our own
         * control and this is a bug, or it's a third party IO such as
         * dump(8) (which may leave the buffer scheduled for read ---
         * ie. locked but not dirty) or tune2fs (which may actually have
         * the buffer dirtied, ugh.)  */

        if (buffer_dirty(bh) && jh->b_transaction) {
                warn_dirty_buffer(bh);
                /*
                 * We need to clean the dirty flag and we must do it under the
                 * buffer lock to be sure we don't race with running write-out.
                 */
                JBUFFER_TRACE(jh, "Journalling dirty buffer");
                clear_buffer_dirty(bh);
                /*
                 * The buffer is going to be added to BJ_Reserved list now and
                 * nothing guarantees jbd2_journal_dirty_metadata() will be
                 * ever called for it. So we need to set jbddirty bit here to
                 * make sure the buffer is dirtied and written out when the
                 * journaling machinery is done with it.
                 */
                set_buffer_jbddirty(bh);
        }

        /* An aborted handle gets -EROFS; both locks are released first. */
        error = -EROFS;
        if (is_handle_aborted(handle)) {
                spin_unlock(&jh->b_state_lock);
                unlock_buffer(bh);
                goto out;
        }
        error = 0;

        /*
         * The buffer is already part of this transaction if b_transaction or
         * b_next_transaction points to it
         */
        if (jh->b_transaction == transaction ||
            jh->b_next_transaction == transaction) {
                unlock_buffer(bh);
                goto done;
        }

        /*
         * this is the first time this transaction is touching this buffer,
         * reset the modified flag
         */
        jh->b_modified = 0;

        /*
         * If the buffer is not journaled right now, we need to make sure it
         * doesn't get written to disk before the caller actually commits the
         * new data
         */
        if (!jh->b_transaction) {
                JBUFFER_TRACE(jh, "no transaction");
                J_ASSERT_JH(jh, !jh->b_next_transaction);
                JBUFFER_TRACE(jh, "file as BJ_Reserved");
                /*
                 * Make sure all stores to jh (b_modified, b_frozen_data) are
                 * visible before attaching it to the running transaction.
                 * Paired with barrier in jbd2_write_access_granted()
                 */
                smp_wmb();
                spin_lock(&journal->j_list_lock);
                if (test_clear_buffer_dirty(bh)) {
                        /*
                         * Execute buffer dirty clearing and jh->b_transaction
                         * assignment under journal->j_list_lock locked to
                         * prevent bh being removed from checkpoint list if
                         * the buffer is in an intermediate state (not dirty
                         * and jh->b_transaction is NULL).
                         */
                        JBUFFER_TRACE(jh, "Journalling dirty buffer");
                        set_buffer_jbddirty(bh);
                }
                __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
                spin_unlock(&journal->j_list_lock);
                unlock_buffer(bh);
                goto done;
        }
        /* From here on only b_state_lock is held, not the buffer lock. */
        unlock_buffer(bh);

        /*
         * If there is already a copy-out version of this buffer, then we don't
         * need to make another one
         */
        if (jh->b_frozen_data) {
                JBUFFER_TRACE(jh, "has frozen data");
                J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
                goto attach_next;
        }

        JBUFFER_TRACE(jh, "owned by older transaction");
        J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
        J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction);

        /*
         * There is one case we have to be very careful about.  If the
         * committing transaction is currently writing this buffer out to disk
         * and has NOT made a copy-out, then we cannot modify the buffer
         * contents at all right now.  The essence of copy-out is that it is
         * the extra copy, not the primary copy, which gets journaled.  If the
         * primary copy is already going to disk then we cannot do copy-out
         * here.
         */
        if (buffer_shadow(bh)) {
                JBUFFER_TRACE(jh, "on shadow: sleep");
                spin_unlock(&jh->b_state_lock);
                /* Sleep until BH_Shadow is cleared, then start over. */
                wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
                goto repeat;
        }

        /*
         * Only do the copy if the currently-owning transaction still needs it.
         * If buffer isn't on BJ_Metadata list, the committing transaction is
         * past that stage (here we use the fact that BH_Shadow is set under
         * bh_state lock together with refiling to BJ_Shadow list and at this
         * point we know the buffer doesn't have BH_Shadow set).
         *
         * Subtle point, though: if this is a get_undo_access, then we will be
         * relying on the frozen_data to contain the new value of the
         * committed_data record after the transaction, so we HAVE to force the
         * frozen_data copy in that case.
         */
        if (jh->b_jlist == BJ_Metadata || force_copy) {
                JBUFFER_TRACE(jh, "generate frozen data");
                if (!frozen_buffer) {
                        JBUFFER_TRACE(jh, "allocate memory for buffer");
                        /*
                         * Allocate with b_state_lock dropped, then restart so
                         * that the buffer state is re-validated under the
                         * locks before the new buffer is used.
                         */
                        spin_unlock(&jh->b_state_lock);
                        frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
                                                   GFP_NOFS | __GFP_NOFAIL);
                        goto repeat;
                }
                /* Ownership of the allocation moves to the journal head. */
                jh->b_frozen_data = frozen_buffer;
                frozen_buffer = NULL;
                jbd2_freeze_jh_data(jh);
        }
attach_next:
        /*
         * Make sure all stores to jh (b_modified, b_frozen_data) are visible
         * before attaching it to the running transaction. Paired with barrier
         * in jbd2_write_access_granted()
         */
        smp_wmb();
        jh->b_next_transaction = transaction;

done:
        spin_unlock(&jh->b_state_lock);

        /*
         * If we are about to journal a buffer, then any revoke pending on it is
         * no longer valid
         */
        jbd2_journal_cancel_revoke(handle, jh);

out:
        /*
         * frozen_buffer is still set here only if it was allocated on an
         * earlier pass but a later pass did not install it in the jh.
         */
        if (unlikely(frozen_buffer))        /* It's usually NULL */
                jbd2_free(frozen_buffer, bh->b_size);

        JBUFFER_TRACE(jh, "exit");
        return error;
}

/*
 * Fast, lockless check whether @bh is already attached to the transaction of
 * @handle.  With @undo set, the journal head must additionally carry
 * b_committed_data.  Returns true only when every check passes; a dirty
 * buffer always yields false.
 */
static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
                                                        bool undo)
{
        struct journal_head *jh;
        bool granted = false;

        /* Dirty buffers require special handling... */
        if (buffer_dirty(bh))
                return false;

        /*
         * RCU protects us from dereferencing freed pages. So the checks we do
         * are guaranteed not to oops. However the jh slab object can get freed
         * & reallocated while we work with it. So we have to be careful. When
         * we see jh attached to the running transaction, we know it must stay
         * so until the transaction is committed. Thus jh won't be freed and
         * will be attached to the same bh while we run.  However it can
         * happen jh gets freed, reallocated, and attached to the transaction
         * just after we get pointer to it from bh. So we have to be careful
         * and recheck jh still belongs to our bh before we return success.
         */
        rcu_read_lock();
        if (buffer_jbd(bh)) {
                /*
                 * This should be bh2jh() but that doesn't work with inline
                 * functions
                 */
                jh = READ_ONCE(bh->b_private);
                /*
                 * Proceed only if there is a journal head, it has committed
                 * data when undo access is requested, and it is attached to
                 * our transaction either directly or as the next transaction.
                 */
                if (jh &&
                    (!undo || jh->b_committed_data) &&
                    (READ_ONCE(jh->b_transaction) == handle->h_transaction ||
                     READ_ONCE(jh->b_next_transaction) == handle->h_transaction)) {
                        /*
                         * There are two reasons for the barrier here:
                         * 1) Make sure to fetch b_bh after we did previous
                         * checks so that we detect when jh went through free,
                         * realloc, attach to transaction while we were
                         * checking. Paired with implicit barrier in that path.
                         * 2) So that access to bh done after
                         * jbd2_write_access_granted() doesn't get reordered
                         * and see inconsistent state of concurrent
                         * do_get_write_access().
                         */
                        smp_mb();
                        if (likely(jh->b_bh == bh))
                                granted = true;
                }
        }
        rcu_read_unlock();
        return granted;
}

/**
 * jbd2_journal_get_write_access() - notify intent to modify a buffer
 *                                     for metadata (not data) update.
 * @handle: transaction to add buffer modifications to
 * @bh:     bh to be used for metadata writes
 *
 * In full data journalling mode the buffer may be of type BJ_AsyncData,
 * because we're ``write()ing`` a buffer which is also part of a shared mapping.
 *
 * Returns: 0 on success, -EROFS if @handle has been aborted, -EIO if the
 * filesystem device has a write error (the journal is aborted in that case),
 * or the error code returned by do_get_write_access().
 */
int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
{
        journal_t *journal;
        struct journal_head *jh;
        int err;

        if (is_handle_aborted(handle))
                return -EROFS;

        journal = handle->h_transaction->t_journal;
        if (jbd2_check_fs_dev_write_error(journal)) {
                /*
                 * Writeback errors on the fs device mean that background
                 * async writeout of metadata buffers may have failed.  We
                 * could then read stale data from disk and write it out
                 * again, leaving the on-disk filesystem inconsistent.
                 * Aborting the journal prevents that.
                 */
                jbd2_journal_abort(journal, -EIO);
                return -EIO;
        }

        /* Fast path: the buffer is already attached to our transaction. */
        if (jbd2_write_access_granted(handle, bh, false))
                return 0;

        jh = jbd2_journal_add_journal_head(bh);
        /*
         * We do not want to get caught playing with fields which the log
         * thread also manipulates.  Make sure that the buffer completes any
         * outstanding IO before proceeding.
         */
        err = do_get_write_access(handle, jh, 0);
        jbd2_journal_put_journal_head(jh);

        return err;
}


/*
 * When the user wants to journal a newly created buffer_head
 * (ie. getblk() returned a new buffer and we are going to populate it
 * manually rather than reading off disk), then we need to keep the
 * buffer_head locked until it has been completely filled with new
 * data.  In this case, we should be able to make the assertion that
 * the bh is not already part of an existing transaction.
 *
 * The buffer should already be locked by the caller by this point.
 * There is no lock ranking violation: it was a newly created,
 * unlocked buffer beforehand. */

/**
 * jbd2_journal_get_create_access() - notify intent to use newly created bh
 * @handle: transaction to add the new buffer to
 * @bh: new buffer.
 *
 * Call this if you create a new bh.
 *
 * Returns: 0 on success, -EROFS if @handle has been aborted.
 */
int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
{
        transaction_t *transaction = handle->h_transaction;
        struct journal_head *jh = jbd2_journal_add_journal_head(bh);
        journal_t *journal;
        int err = 0;

        jbd2_debug(5, "journal_head %p\n", jh);
        if (is_handle_aborted(handle)) {
                err = -EROFS;
                goto out;
        }
        journal = transaction->t_journal;

        JBUFFER_TRACE(jh, "entry");
        /*
         * Pre-zeroing in the filesystem's new_block code may already have
         * attached the buffer to this transaction.  The buffer may also sit
         * on the lists of the previous, committing transaction, but then it
         * HAS to be in Forget state: that transaction must have deleted the
         * buffer for it to be reused here.
         */
        spin_lock(&jh->b_state_lock);
        J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
                jh->b_transaction == NULL ||
                (jh->b_transaction == journal->j_committing_transaction &&
                          jh->b_jlist == BJ_Forget)));

        J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
        J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));

        if (!jh->b_transaction) {
                /*
                 * A previous jbd2_journal_forget() could have left the
                 * jbddirty bit set because the buffer was being committed.
                 * When that commit finished, the buffer was filed for
                 * checkpointing and marked dirty.  We are reallocating the
                 * buffer now, so the transaction that freed it must have
                 * committed and clearing the dirty bit is safe.
                 */
                clear_buffer_dirty(jh2bh(jh));
                /* This transaction touches the buffer for the first time. */
                jh->b_modified = 0;

                JBUFFER_TRACE(jh, "file as BJ_Reserved");
                spin_lock(&journal->j_list_lock);
                __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
                spin_unlock(&journal->j_list_lock);
        } else if (jh->b_transaction == journal->j_committing_transaction) {
                /* This transaction touches the buffer for the first time. */
                jh->b_modified = 0;

                JBUFFER_TRACE(jh, "set next transaction");
                spin_lock(&journal->j_list_lock);
                jh->b_next_transaction = transaction;
                spin_unlock(&journal->j_list_lock);
        }
        spin_unlock(&jh->b_state_lock);

        /*
         * akpm: I added this.  ext3_alloc_branch can pick up new indirect
         * blocks which contain freed but then revoked metadata.  The revoke
         * has to be cancelled in case the block ends up being freed yet again
         * and then reallocated as data - that would cause a second revoke,
         * which hits an assertion error.
         */
        JBUFFER_TRACE(jh, "cancelling revoke");
        jbd2_journal_cancel_revoke(handle, jh);
out:
        jbd2_journal_put_journal_head(jh);
        return err;
}

/**
 * jbd2_journal_get_undo_access() -  Notify intent to modify metadata with
 *     non-rewindable consequences
 * @handle: transaction
 * @bh: buffer to undo
 *
 * Sometimes there is a need to distinguish between metadata which has
 * been committed to disk and that which has not.  The ext3fs code uses
 * this for freeing and allocating space, we have to make sure that we
 * do not reuse freed space until the deallocation has been committed,
 * since if we overwrote that space we would make the delete
 * un-rewindable in case of a crash.
 *
 * To deal with that, jbd2_journal_get_undo_access requests write access to a
 * buffer for parts of non-rewindable operations such as delete
 * operations on the bitmaps.  The journaling code must keep a copy of
 * the buffer's contents prior to the undo_access call until such time
 * as we know that the buffer has definitely been committed to disk.
 *
 * We never need to know which transaction the committed data is part
 * of, buffers touched here are guaranteed to be dirtied later and so
 * will be committed to a new transaction in due course, at which point
 * we can discard the old committed data pointer.
 *
 * Returns error number or 0 on success.
 */
int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
{
        struct journal_head *jh;
        char *committed_data = NULL;
        int err;

        if (is_handle_aborted(handle))
                return -EROFS;

        /* Fast path: already attached and the committed copy exists. */
        if (jbd2_write_access_granted(handle, bh, true))
                return 0;

        jh = jbd2_journal_add_journal_head(bh);
        JBUFFER_TRACE(jh, "entry");

        /*
         * Write access comes first --- it can drop the journal lock, and
         * obtaining the committed_data has to be atomic with respect to
         * completion of any outstanding commits.
         */
        err = do_get_write_access(handle, jh, 1);
        if (err)
                goto out;

        for (;;) {
                /* The allocation is done before b_state_lock is taken. */
                if (!jh->b_committed_data)
                        committed_data = jbd2_alloc(jh2bh(jh)->b_size,
                                                    GFP_NOFS|__GFP_NOFAIL);

                spin_lock(&jh->b_state_lock);
                if (jh->b_committed_data)
                        break;

                JBUFFER_TRACE(jh, "generate b_committed data");
                if (committed_data) {
                        /*
                         * Copy out the current buffer contents into the
                         * preserved, committed copy.
                         */
                        jh->b_committed_data = committed_data;
                        committed_data = NULL;
                        memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
                        break;
                }

                /*
                 * b_committed_data was set at the unlocked check but is gone
                 * now and we have nothing allocated: unlock and try again.
                 */
                spin_unlock(&jh->b_state_lock);
        }
        spin_unlock(&jh->b_state_lock);
out:
        jbd2_journal_put_journal_head(jh);
        /* Free the allocation if it ended up not being installed. */
        if (unlikely(committed_data))
                jbd2_free(committed_data, bh->b_size);
        return err;
}

/**
 * jbd2_journal_set_triggers() - Add triggers for commit writeout
 * @bh: buffer to trigger on
 * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
 *
 * Store @type as the trigger set of the journal_head attached to @bh.  This
 * is always safe, because triggers for a committing buffer will be saved
 * off, and triggers for a running transaction will match the buffer in that
 * transaction.
 *
 * Pass NULL as @type to clear the triggers.  If @bh has no journal_head the
 * function warns once and does nothing.
 */
void jbd2_journal_set_triggers(struct buffer_head *bh,
                               struct jbd2_buffer_trigger_type *type)
{
        struct journal_head *jh;

        jh = jbd2_journal_grab_journal_head(bh);
        if (WARN_ON_ONCE(!jh))
                return;

        jh->b_triggers = type;

        /* Balance the jbd2_journal_grab_journal_head() call above. */
        jbd2_journal_put_journal_head(jh);
}

/*
 * Invoke the t_frozen callback of @triggers, if there is one, for the buffer
 * behind @jh.  The callback receives @triggers, the buffer head,
 * @mapped_data and the buffer size.  Nothing happens when @triggers is NULL
 * or has no t_frozen callback.
 */
void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
                                struct jbd2_buffer_trigger_type *triggers)
{
        struct buffer_head *bh = jh2bh(jh);

        if (triggers && triggers->t_frozen)
                triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
}

/*
 * Invoke the t_abort callback of @triggers, if there is one, for the buffer
 * behind @jh.  Nothing happens when @triggers is NULL or has no t_abort
 * callback.
 */
void jbd2_buffer_abort_trigger(struct journal_head *jh,
                               struct jbd2_buffer_trigger_type *triggers)
{
        if (triggers && triggers->t_abort)
                triggers->t_abort(triggers, jh2bh(jh));
}

/**
 * jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
 * @handle: transaction to add buffer to.
 * @bh: buffer to mark
 *
 * mark dirty metadata which needs to be journaled as part of the current
 * transaction.
 *
 * The buffer must have previously had jbd2_journal_get_write_access()
 * called so that it has a valid journal_head attached to the buffer
 * head.
 *
 * The buffer is placed on the transaction's metadata list and is marked
 * as belonging to the transaction.
 *
 * Returns error number or 0 on success.  The errors produced here are:
 * -EUCLEAN if @bh has no journal_head attached, -EROFS if the handle has
 * been aborted, -ENOSPC if the handle has no buffer credits left, and
 * -EINVAL if the journal_head's transaction pointers are inconsistent with
 * the journal's running/committing transactions.
 *
 * Special care needs to be taken if the buffer already belongs to the
 * current committing transaction (in which case we should have frozen
 * data present for that commit).  In that case, we don't relink the
 * buffer: that only gets done when the old transaction finally
 * completes its commit.
 */
int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        struct journal_head *jh;
        int ret = 0;

        /* Without a journal_head the buffer was never given write access. */
        if (!buffer_jbd(bh))
                return -EUCLEAN;

        /*
         * We don't grab jh reference here since the buffer must be part
         * of the running transaction.
         */
        jh = bh2jh(bh);
        jbd2_debug(5, "journal_head %p\n", jh);
        JBUFFER_TRACE(jh, "entry");

        /*
         * This and the following assertions are unreliable since we may see jh
         * in inconsistent state unless we grab bh_state lock. But this is
         * crucial to catch bugs so let's do a reliable check until the
         * lockless handling is fully proven.
         *
         * The unlocked test is only a hint: when it fires, the same condition
         * is re-checked (and asserted) with b_state_lock held.
         */
        if (data_race(jh->b_transaction != transaction &&
            jh->b_next_transaction != transaction)) {
                spin_lock(&jh->b_state_lock);
                J_ASSERT_JH(jh, jh->b_transaction == transaction ||
                                jh->b_next_transaction == transaction);
                spin_unlock(&jh->b_state_lock);
        }
        /*
         * b_modified == 1: the buffer has already been marked as modified,
         * so after the sanity check below there is nothing left to do.
         */
        if (jh->b_modified == 1) {
                /* If it's in our transaction it must be in BJ_Metadata list. */
                if (data_race(jh->b_transaction == transaction &&
                    jh->b_jlist != BJ_Metadata)) {
                        spin_lock(&jh->b_state_lock);
                        if (jh->b_transaction == transaction &&
                            jh->b_jlist != BJ_Metadata)
                                pr_err("JBD2: assertion failure: h_type=%u "
                                       "h_line_no=%u block_no=%llu jlist=%u\n",
                                       handle->h_type, handle->h_line_no,
                                       (unsigned long long) bh->b_blocknr,
                                       jh->b_jlist);
                        J_ASSERT_JH(jh, jh->b_transaction != transaction ||
                                        jh->b_jlist == BJ_Metadata);
                        spin_unlock(&jh->b_state_lock);
                }
                goto out;
        }

        journal = transaction->t_journal;
        spin_lock(&jh->b_state_lock);

        if (is_handle_aborted(handle)) {
                /*
                 * Check journal aborting with @jh->b_state_lock locked,
                 * since 'jh->b_transaction' could be replaced with
                 * 'jh->b_next_transaction' during old transaction
                 * committing if journal aborted, which may fail
                 * assertion on 'jh->b_frozen_data == NULL'.
                 */
                ret = -EROFS;
                goto out_unlock_bh;
        }

        if (jh->b_modified == 0) {
                /*
                 * This buffer is being modified and is becoming part of
                 * the transaction.  This needs to be done once per
                 * transaction -bzzz
                 *
                 * One buffer credit of the handle is consumed here; with no
                 * credits left we warn once and fail with -ENOSPC.
                 */
                if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) {
                        ret = -ENOSPC;
                        goto out_unlock_bh;
                }
                jh->b_modified = 1;
                handle->h_total_credits--;
        }

        /*
         * fastpath, to avoid expensive locking.  If this buffer is already
         * on the running transaction's metadata list there is nothing to do.
         * Nobody can take it off again because there is a handle open.
         * I _think_ we're OK here with SMP barriers - a mistaken decision will
         * result in this test being false, so we go in and take the locks.
         *
         * NOTE(review): b_state_lock is already held at this point, so the
         * only lock this fast path avoids is journal->j_list_lock taken
         * further down.
         */
        if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
                JBUFFER_TRACE(jh, "fastpath");
                if (unlikely(jh->b_transaction !=
                             journal->j_running_transaction)) {
                        printk(KERN_ERR "JBD2: %s: "
                               "jh->b_transaction (%llu, %p, %u) != "
                               "journal->j_running_transaction (%p, %u)\n",
                               journal->j_devname,
                               (unsigned long long) bh->b_blocknr,
                               jh->b_transaction,
                               jh->b_transaction ? jh->b_transaction->t_tid : 0,
                               journal->j_running_transaction,
                               journal->j_running_transaction ?
                               journal->j_running_transaction->t_tid : 0);
                        ret = -EINVAL;
                }
                goto out_unlock_bh;
        }

        set_buffer_jbddirty(bh);

        /*
         * Metadata already on the current transaction list doesn't
         * need to be filed.  Metadata on another transaction's list must
         * be committing, and will be refiled once the commit completes:
         * leave it alone for now.
         */
        if (jh->b_transaction != transaction) {
                JBUFFER_TRACE(jh, "already on other transaction");
                /*
                 * The only legal state here is: owned by the committing
                 * transaction with our transaction queued as the next one.
                 */
                if (unlikely(((jh->b_transaction !=
                               journal->j_committing_transaction)) ||
                             (jh->b_next_transaction != transaction))) {
                        printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
                               "bad jh for block %llu: "
                               "transaction (%p, %u), "
                               "jh->b_transaction (%p, %u), "
                               "jh->b_next_transaction (%p, %u), jlist %u\n",
                               journal->j_devname,
                               (unsigned long long) bh->b_blocknr,
                               transaction, transaction->t_tid,
                               jh->b_transaction,
                               jh->b_transaction ?
                               jh->b_transaction->t_tid : 0,
                               jh->b_next_transaction,
                               jh->b_next_transaction ?
                               jh->b_next_transaction->t_tid : 0,
                               jh->b_jlist);
                        WARN_ON(1);
                        ret = -EINVAL;
                }
                /* And this case is illegal: we can't reuse another
                 * transaction's data buffer, ever. */
                goto out_unlock_bh;
        }

        /* That test should have eliminated the following case: */
        J_ASSERT_JH(jh, jh->b_frozen_data == NULL);

        JBUFFER_TRACE(jh, "file as BJ_Metadata");
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
        spin_unlock(&journal->j_list_lock);
out_unlock_bh:
        spin_unlock(&jh->b_state_lock);
out:
        JBUFFER_TRACE(jh, "exit");
        return ret;
}

/**
 * jbd2_journal_forget() - bforget() for potentially-journaled buffers.
 * @handle: transaction handle
 * @bh:     bh to 'forget'
 *
 * We can only do the bforget if there are no commits pending against the
 * buffer.  If the buffer is dirty in the current running transaction we
 * can safely unlink it.
 *
 * bh may not be a journalled buffer at all - it may be a non-JBD
 * buffer which came off the hashtable.  Check for this.
 *
 * Decrements bh->b_count by one, except when the handle has been aborted:
 * in that case the function returns -EROFS right away without touching @bh.
 *
 * NOTE(review): this comment used to say the call is allowed even if the
 * handle has aborted (as part of the caller's cleanup after an abort), but
 * the code below rejects aborted handles with -EROFS.
 *
 * Returns: 0 on success, -EROFS if the handle has been aborted, or -EIO if
 * the buffer unexpectedly has b_committed_data attached.
 */
int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        struct journal_head *jh;
        /* Set when a buffer credit should be handed back to the handle. */
        int drop_reserve = 0;
        int err = 0;
        int was_modified = 0;

        if (is_handle_aborted(handle))
                return -EROFS;
        journal = transaction->t_journal;

        BUFFER_TRACE(bh, "entry");

        /* No journal_head: not a JBD buffer, so a plain bforget will do. */
        jh = jbd2_journal_grab_journal_head(bh);
        if (!jh) {
                __bforget(bh);
                return 0;
        }

        spin_lock(&jh->b_state_lock);

        /* Critical error: attempting to delete a bitmap buffer, maybe?
         * Don't do any jbd operations, and return an error. */
        if (!J_EXPECT_JH(jh, !jh->b_committed_data,
                         "inconsistent data on disk")) {
                err = -EIO;
                goto drop;
        }

        /* keep track of whether or not this transaction modified us */
        was_modified = jh->b_modified;

        /*
         * The buffer's going from the transaction, we must drop
         * all references -bzzz
         */
        jh->b_modified = 0;

        if (jh->b_transaction == transaction) {
                J_ASSERT_JH(jh, !jh->b_frozen_data);

                /* If we are forgetting a buffer which is already part
                 * of this transaction, then we can just drop it from
                 * the transaction immediately. */
                clear_buffer_dirty(bh);
                clear_buffer_jbddirty(bh);

                JBUFFER_TRACE(jh, "belongs to current transaction: unfile");

                /*
                 * we only want to drop a reference if this transaction
                 * modified the buffer
                 */
                if (was_modified)
                        drop_reserve = 1;

                /*
                 * We are no longer going to journal this buffer.
                 * However, the commit of this transaction is still
                 * important to the buffer: the delete that we are now
                 * processing might obsolete an old log entry, so by
                 * committing, we can satisfy the buffer's checkpoint.
                 *
                 * So, if we have a checkpoint on the buffer, we should
                 * now refile the buffer on our BJ_Forget list so that
                 * we know to remove the checkpoint after we commit.
                 */

                spin_lock(&journal->j_list_lock);
                if (jh->b_cp_transaction) {
                        __jbd2_journal_temp_unlink_buffer(jh);
                        __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
                } else {
                        __jbd2_journal_unfile_buffer(jh);
                        /*
                         * NOTE(review): presumably drops the journal_head
                         * reference that was held for the transaction list
                         * the buffer was just unfiled from -- confirm.
                         */
                        jbd2_journal_put_journal_head(jh);
                }
                spin_unlock(&journal->j_list_lock);
        } else if (jh->b_transaction) {
                J_ASSERT_JH(jh, (jh->b_transaction ==
                                 journal->j_committing_transaction));
                /* However, if the buffer is still owned by a prior
                 * (committing) transaction, we can't drop it yet... */
                JBUFFER_TRACE(jh, "belongs to older transaction");
                /* ... but we CAN drop it from the new transaction by
                 * marking the buffer as freed and setting b_next_transaction
                 * to the new transaction, so that not only does the commit
                 * code know it should clear dirty bits when it is done with
                 * the buffer, but also the buffer can be checkpointed only
                 * after the new transaction commits. */

                set_buffer_freed(bh);

                if (!jh->b_next_transaction) {
                        spin_lock(&journal->j_list_lock);
                        jh->b_next_transaction = transaction;
                        spin_unlock(&journal->j_list_lock);
                } else {
                        J_ASSERT(jh->b_next_transaction == transaction);

                        /*
                         * only drop a reference if this transaction modified
                         * the buffer
                         */
                        if (was_modified)
                                drop_reserve = 1;
                }
        } else {
                /*
                 * Finally, if the buffer does not belong to any
                 * transaction, we can just drop it now if it has no
                 * checkpoint.
                 */
                spin_lock(&journal->j_list_lock);
                if (!jh->b_cp_transaction) {
                        JBUFFER_TRACE(jh, "belongs to none transaction");
                        spin_unlock(&journal->j_list_lock);
                        goto drop;
                }

                /*
                 * Otherwise, if the buffer has been written to disk,
                 * it is safe to remove the checkpoint and drop it.
                 */
                if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
                        spin_unlock(&journal->j_list_lock);
                        goto drop;
                }

                /*
                 * The buffer is still not written to disk, we should
                 * attach this buffer to current transaction so that the
                 * buffer can be checkpointed only after the current
                 * transaction commits.
                 */
                clear_buffer_dirty(bh);
                __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
                spin_unlock(&journal->j_list_lock);
        }
drop:
        /* Release the buffer reference while b_state_lock is still held. */
        __brelse(bh);
        spin_unlock(&jh->b_state_lock);
        /* Balance the jbd2_journal_grab_journal_head() call above. */
        jbd2_journal_put_journal_head(jh);
        if (drop_reserve) {
                /* no need to reserve log space for this block -bzzz */
                handle->h_total_credits++;
        }
        return err;
}

/**
 * jbd2_journal_stop() - complete a transaction
 * @handle: transaction to complete.
 *
 * All done for a particular handle.
 *
 * There is not much action needed here.  We just return any remaining
 * buffer credits to the transaction and remove the handle.  The only
 * complication is that we need to start a commit operation if the
 * filesystem is marked for synchronous update.
 *
 * jbd2_journal_stop itself will not usually return an error, but it may
 * do so in unusual circumstances.  In particular, expect it to
 * return -EIO if a jbd2_journal_abort has been executed since the
 * transaction began.
 */
int jbd2_journal_stop(handle_t *handle)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        int err = 0, wait_for_commit = 0;
        tid_t tid;
        pid_t pid;

        /*
         * Handles are reference counted: only the call that drops the last
         * reference performs the work below.  Earlier calls merely report
         * whether the handle has been aborted.
         */
        if (--handle->h_ref > 0) {
                jbd2_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
                                                 handle->h_ref);
                if (is_handle_aborted(handle))
                        return -EIO;
                return 0;
        }
        if (!transaction) {
                /*
                 * Handle is already detached from the transaction so there is
                 * nothing to do other than free the handle.
                 */
                memalloc_nofs_restore(handle->saved_alloc_context);
                goto free_and_exit;
        }
        /*
         * Cache the journal and tid now: they are still needed after
         * stop_this_handle(), at which point the transaction pointer must no
         * longer be dereferenced (see the comment before that call).
         */
        journal = transaction->t_journal;
        tid = transaction->t_tid;

        if (is_handle_aborted(handle))
                err = -EIO;

        jbd2_debug(4, "Handle %p going down\n", handle);
        trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
                                tid, handle->h_type, handle->h_line_no,
                                jiffies - handle->h_start_jiffies,
                                handle->h_sync, handle->h_requested_credits,
                                (handle->h_requested_credits -
                                 handle->h_total_credits));

        /*
         * Implement synchronous transaction batching.  If the handle
         * was synchronous, don't force a commit immediately.  Let's
         * yield and let another thread piggyback onto this
         * transaction.  Keep doing that while new threads continue to
         * arrive.  It doesn't cost much - we're about to run a commit
         * and sleep on IO anyway.  Speeds up many-threaded, many-dir
         * operations by 30x or more...
         *
         * We try and optimize the sleep time against what the
         * underlying disk can do, instead of having a static sleep
         * time.  This is useful for the case where our storage is so
         * fast that it is more optimal to go ahead and force a flush
         * and wait for the transaction to be committed than it is to
         * wait for an arbitrary amount of time for new writers to
         * join the transaction.  We achieve this by measuring how
         * long it takes to commit a transaction, and compare it with
         * how long this transaction has been running, and if run time
         * < commit time then we sleep for the delta and commit.  This
         * greatly helps super fast disks that would see slowdowns as
         * more threads started doing fsyncs.
         *
         * But don't do this if this process was the most recent one
         * to perform a synchronous write.  We do this to detect the
         * case where a single process is doing a stream of sync
         * writes.  No point in waiting for joiners in that case.
         *
         * Setting max_batch_time to 0 disables this completely.
         */
        pid = current->pid;
        if (handle->h_sync && journal->j_last_sync_writer != pid &&
            journal->j_max_batch_time) {
                u64 commit_time, trans_time;

                journal->j_last_sync_writer = pid;

                read_lock(&journal->j_state_lock);
                commit_time = journal->j_average_commit_time;
                read_unlock(&journal->j_state_lock);

                /* Nanoseconds this transaction has been running so far. */
                trans_time = ktime_to_ns(ktime_sub(ktime_get(),
                                                   transaction->t_start_time));

                /*
                 * Clamp the average commit time to the configured
                 * [min, max] batch window.  The factor of 1000 brings the
                 * batch times to the nanosecond scale of trans_time --
                 * presumably they are stored in microseconds; TODO confirm
                 * against the journal_t definition.
                 */
                commit_time = max_t(u64, commit_time,
                                    1000*journal->j_min_batch_time);
                commit_time = min_t(u64, commit_time,
                                    1000*journal->j_max_batch_time);

                if (trans_time < commit_time) {
                        /*
                         * NOTE(review): the timeout is "now + commit_time",
                         * i.e. the full clamped commit time, not the
                         * (commit_time - trans_time) delta that the comment
                         * above describes.
                         */
                        ktime_t expires = ktime_add_ns(ktime_get(),
                                                       commit_time);
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
                }
        }

        if (handle->h_sync)
                transaction->t_synchronous_commit = 1;

        /*
         * If the handle is marked SYNC, we need to set another commit
         * going!  We also want to force a commit if the transaction is too
         * old now.
         */
        if (handle->h_sync ||
            time_after_eq(jiffies, transaction->t_expires)) {
                /* Do this even for aborted journals: an abort still
                 * completes the commit thread, it just doesn't write
                 * anything to disk. */

                jbd2_debug(2, "transaction too old, requesting commit for "
                                        "handle %p\n", handle);
                /* This is non-blocking */
                jbd2_log_start_commit(journal, tid);

                /*
                 * Special case: JBD2_SYNC synchronous updates require us
                 * to wait for the commit to complete.
                 * Tasks running with PF_MEMALLOC set never wait here.
                 */
                if (handle->h_sync && !(current->flags & PF_MEMALLOC))
                        wait_for_commit = 1;
        }

        /*
         * Once stop_this_handle() drops t_updates, the transaction could start
         * committing on us and eventually disappear.  So we must not
         * dereference transaction pointer again after calling
         * stop_this_handle().
         */
        stop_this_handle(handle);

        /*
         * NOTE(review): the result of the wait replaces any -EIO recorded
         * above for an aborted handle.
         */
        if (wait_for_commit)
                err = jbd2_log_wait_commit(journal, tid);

free_and_exit:
        /* Release the attached reserved handle (if any), then the handle. */
        if (handle->h_rsv_handle)
                jbd2_free_handle(handle->h_rsv_handle);
        jbd2_free_handle(handle);
        return err;
}

/*
 *
 * List management code snippets: various functions for manipulating the
 * transaction buffer lists.
 *
 */

/*
 * Append a buffer to a transaction list, given the transaction's list head
 * pointer.
 *
 * j_list_lock is held.
 *
 * jh->b_state_lock is held.
 */

static inline void
__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
{
        if (!*list) {
                jh->b_tnext = jh->b_tprev = jh;
                *list = jh;
        } else {
                /* Insert at the tail of the list to preserve order */
                struct journal_head *first = *list, *last = first->b_tprev;
                jh->b_tprev = last;
                jh->b_tnext = first;
                last->b_tnext = first->b_tprev = jh;
        }
}

/*
 * Remove a buffer from a transaction list, given the transaction's list
 * head pointer.
 *
 * Called with j_list_lock held, and the journal may not be locked.
 *
 * jh->b_state_lock is held.
 */

static inline void
__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
{
        struct journal_head *before = jh->b_tprev;
        struct journal_head *after = jh->b_tnext;

        /*
         * If jh is the list head, advance the head to its successor; when
         * jh is its own successor it was the only element and the list
         * becomes empty.
         */
        if (*list == jh)
                *list = (after == jh) ? NULL : after;

        /* Unlink jh by joining its two neighbours to each other. */
        before->b_tnext = after;
        after->b_tprev = before;
}

/*
 * Remove a buffer from the appropriate transaction list.
 *
 * The buffer stays associated with its transaction (jh->b_transaction is
 * left untouched); only its list membership is dropped and b_jlist is
 * reset to BJ_None.  A buffer that is already on BJ_None is left alone.
 *
 * Note that this function can *change* the value of
 * jh->b_transaction's t_buffers, t_forget, t_shadow_list or
 * t_reserved_list.  If the caller is holding onto a copy of one of these
 * pointers, it could go bad.  Generally the caller needs to re-read the
 * pointer from the transaction_t.
 *
 * Called under j_list_lock.  jh->b_state_lock must be held as well
 * (asserted below).
 */
static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
{
        struct journal_head **list = NULL;
        transaction_t *transaction;
        struct buffer_head *bh = jh2bh(jh);

        lockdep_assert_held(&jh->b_state_lock);
        transaction = jh->b_transaction;
        if (transaction)
                assert_spin_locked(&transaction->t_journal->j_list_lock);

        /* A buffer filed on a real list must belong to a transaction. */
        J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
        if (jh->b_jlist != BJ_None)
                J_ASSERT_JH(jh, transaction != NULL);

        /* Pick the list head of the transaction that currently holds jh. */
        switch (jh->b_jlist) {
        case BJ_None:
                return;
        case BJ_Metadata:
                /* The metadata list additionally keeps a buffer count. */
                transaction->t_nr_buffers--;
                J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
                list = &transaction->t_buffers;
                break;
        case BJ_Forget:
                list = &transaction->t_forget;
                break;
        case BJ_Shadow:
                list = &transaction->t_shadow_list;
                break;
        case BJ_Reserved:
                list = &transaction->t_reserved_list;
                break;
        }

        __blist_del_buffer(list, jh);
        jh->b_jlist = BJ_None;
        /*
         * On an aborted journal the jbddirty bit is simply discarded.
         * Otherwise a jbddirty buffer is turned into an ordinary dirty
         * buffer.
         */
        if (transaction && is_journal_aborted(transaction->t_journal))
                clear_buffer_jbddirty(bh);
        else if (test_clear_buffer_jbddirty(bh))
                mark_buffer_dirty(bh);        /* Expose it to the VM */
}

/*
 * Remove buffer from all transactions. The caller is responsible for dropping
 * the jh reference that belonged to the transaction.
 *
 * The buffer must currently belong to a transaction and must not be queued
 * for a following one; both conditions are asserted.
 *
 * Called with jh->b_state_lock and j_list_lock held.
 */
static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
{
        J_ASSERT_JH(jh, jh->b_transaction != NULL);
        J_ASSERT_JH(jh, jh->b_next_transaction == NULL);

        /* Take jh off its list, then sever the link to the transaction. */
        __jbd2_journal_temp_unlink_buffer(jh);
        jh->b_transaction = NULL;
}

/*
 * Locked wrapper around __jbd2_journal_unfile_buffer(): detaches @jh from
 * its transaction and drops the journal_head reference the transaction
 * held.
 */
void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
{
        struct buffer_head *pinned_bh = jh2bh(jh);

        /*
         * Hold our own buffer_head reference so the buffer cannot be freed
         * before the locks below have been released.
         */
        get_bh(pinned_bh);

        spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_unfile_buffer(jh);
        spin_unlock(&journal->j_list_lock);
        spin_unlock(&jh->b_state_lock);

        /* Drop the transaction's jh reference, then our temporary bh pin. */
        jbd2_journal_put_journal_head(jh);
        __brelse(pinned_bh);
}

/**
 * jbd2_journal_try_to_free_buffers() - try to free page buffers.
 * @journal: journal for operation
 * @folio: Folio to detach data from.
 *
 * For all the buffers on this page,
 * if they are fully written out ordered data, move them onto BUF_CLEAN
 * so try_to_free_buffers() can reap them.
 *
 * This function returns non-zero if we wish try_to_free_buffers()
 * to be called. We do this if the page is releasable by try_to_free_buffers().
 * We also do it if the page has locked or dirty buffers and the caller wants
 * us to perform sync or async writeout.
 *
 * This complicates JBD locking somewhat.  We aren't protected by the
 * BKL here.  We wish to remove the buffer from its committing or
 * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
 *
 * This may *change* the value of transaction_t->t_datalist, so anyone
 * who looks at t_datalist needs to lock against this function.
 *
 * Even worse, someone may be doing a jbd2_journal_dirty_data on this
 * buffer.  So we need to lock against that.  jbd2_journal_dirty_data()
 * will come out of the lock with the buffer dirty, which makes it
 * ineligible for release here.
 *
 * Who else is affected by this?  hmm...  Really the only contender
 * is do_get_write_access() - it could be looking at the buffer while
 * journal_try_to_free_buffer() is changing its state.  But that
 * cannot happen because we never reallocate freed data as metadata
 * while the data is part of a transaction.  Yes?
 *
 * Return false on failure, true on success
 */
bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio)
{
        struct buffer_head *first;
        struct buffer_head *cur;

        J_ASSERT(folio_test_locked(folio));

        first = folio_buffers(folio);
        cur = first;
        do {
                /*
                 * Take a private journal_head reference for the duration of
                 * the inspection; buffers without a journal_head need no
                 * journal-side work at all.
                 */
                struct journal_head *jh = jbd2_journal_grab_journal_head(cur);

                if (jh) {
                        spin_lock(&jh->b_state_lock);
                        if (!jh->b_transaction && !jh->b_next_transaction) {
                                spin_lock(&journal->j_list_lock);
                                /*
                                 * Not owned by any transaction: try to drop
                                 * a remaining checkpoint link.
                                 */
                                if (jh->b_cp_transaction != NULL)
                                        jbd2_journal_try_remove_checkpoint(jh);
                                spin_unlock(&journal->j_list_lock);
                        }
                        spin_unlock(&jh->b_state_lock);
                        jbd2_journal_put_journal_head(jh);

                        /* Journal still has a hold on it: folio is busy. */
                        if (buffer_jbd(cur))
                                return false;
                }

                cur = cur->b_this_page;
        } while (cur != first);

        /* No buffer is pinned by the journal; let the generic code decide. */
        return try_to_free_buffers(folio);
}

/*
 * This buffer is no longer needed.  If it is on an older transaction's
 * checkpoint list we need to record it on this transaction's forget list
 * to pin this buffer (and hence its checkpointing transaction) down until
 * this transaction commits.  If the buffer isn't on a checkpoint list, we
 * release it.
 * Returns non-zero if JBD no longer has an interest in the buffer.
 *
 * Called under j_list_lock.
 *
 * Called under jh->b_state_lock.
 */
static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
{
        struct buffer_head *bh = jh2bh(jh);

        if (!jh->b_cp_transaction) {
                /*
                 * No checkpoint list involved: detach the buffer entirely
                 * and drop the journal_head reference of the transaction.
                 */
                JBUFFER_TRACE(jh, "on running transaction");
                __jbd2_journal_unfile_buffer(jh);
                jbd2_journal_put_journal_head(jh);
                return 1;
        }

        /*
         * Still on a checkpoint list: park the buffer on the forget list of
         * @transaction instead of releasing it.
         */
        JBUFFER_TRACE(jh, "on running+cp transaction");
        __jbd2_journal_temp_unlink_buffer(jh);
        /*
         * The buffer is not going to be written any more; clear the dirty
         * bit so the checks in __jbd2_journal_file_buffer() are not confused.
         */
        clear_buffer_dirty(bh);
        __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
        return 0;
}

/*
 * jbd2_journal_invalidate_folio
 *
 * This code is tricky.  It has a number of cases to deal with.
 *
 * There are two invariants which this code relies on:
 *
 * i_size must be updated on disk before we start calling invalidate_folio
 * on the data.
 *
 *  This is done in ext3 by defining an ext3_setattr method which
 *  updates i_size before truncate gets going.  By maintaining this
 *  invariant, we can be sure that it is safe to throw away any buffers
 *  attached to the current transaction: once the transaction commits,
 *  we know that the data will not be needed.
 *
 *  Note however that we can *not* throw away data belonging to the
 *  previous, committing transaction!
 *
 * Any disk blocks which *are* part of the previous, committing
 * transaction (and which therefore cannot be discarded immediately) are
 * not going to be reused in the new running transaction
 *
 *  The bitmap committed_data images guarantee this: any block which is
 *  allocated in one transaction and removed in the next will be marked
 *  as in-use in the committed_data bitmap, so cannot be reused until
 *  the next transaction to delete the block commits.  This means that
 *  leaving committing buffers dirty is quite safe: the disk blocks
 *  cannot be reallocated to a different file and so buffer aliasing is
 *  not possible.
 *
 *
 * The above applies mainly to ordered data mode.  In writeback mode we
 * don't make guarantees about the order in which data hits disk --- in
 * particular we don't guarantee that new dirty data is flushed before
 * transaction commit --- so it is always safe just to discard data
 * immediately in that mode.  --sct
 */

/*
 * The journal_unmap_buffer helper function returns zero if the buffer
 * concerned remains pinned as an anonymous buffer belonging to an older
 * transaction.
 *
 * Return values:
 *   1       - the buffer was zapped and the journal no longer holds it
 *   0       - the buffer is still held by a transaction (it was left on the
 *             committing transaction, moved to a forget list, or had
 *             already been zapped while committing)
 *   -EBUSY  - the buffer is on the committing transaction, @partial_page is
 *             set and the buffer has not been zapped yet
 *
 * We're outside-transaction here.  Either or both of j_running_transaction
 * and j_committing_transaction may be NULL.
 */
static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
                                int partial_page)
{
        transaction_t *transaction;
        struct journal_head *jh;
        int may_free = 1;

        BUFFER_TRACE(bh, "entry");

        /*
         * It is safe to proceed here without the j_list_lock because the
         * buffers cannot be stolen by try_to_free_buffers as long as we are
         * holding the page lock. --sct
         */

        /* No journal_head attached: only the buffer state has to be reset. */
        jh = jbd2_journal_grab_journal_head(bh);
        if (!jh)
                goto zap_buffer_unlocked;

        /* OK, we have data buffer in journaled mode */
        write_lock(&journal->j_state_lock);
        spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);

        /*
         * We cannot remove the buffer from checkpoint lists until the
         * transaction adding inode to orphan list (let's call it T)
         * is committed.  Otherwise if the transaction changing the
         * buffer would be cleaned from the journal before T is
         * committed, a crash will cause that the correct contents of
         * the buffer will be lost.  On the other hand we have to
         * clear the buffer dirty bit at latest at the moment when the
         * transaction marking the buffer as freed in the filesystem
         * structures is committed because from that moment on the
         * block can be reallocated and used by a different page.
         * Since the block hasn't been freed yet but the inode has
         * already been added to orphan list, it is safe for us to add
         * the buffer to BJ_Forget list of the newest transaction.
         *
         * Also we have to clear buffer_mapped flag of a truncated buffer
         * because the buffer_head may be attached to the page straddling
         * i_size (can happen only when blocksize < pagesize) and thus the
         * buffer_head can be reused when the file is extended again. So we end
         * up keeping around invalidated buffers attached to transactions'
         * BJ_Forget list just to stop checkpointing code from cleaning up
         * the transaction this buffer was modified in.
         */
        transaction = jh->b_transaction;
        if (transaction == NULL) {
                /* First case: not on any transaction.  If it
                 * has no checkpoint link, then we can zap it:
                 * it's a writeback-mode buffer so we don't care
                 * if it hits disk safely. */
                if (!jh->b_cp_transaction) {
                        JBUFFER_TRACE(jh, "not on any transaction: zap");
                        goto zap_buffer;
                }

                if (!buffer_dirty(bh)) {
                        /* bdflush has written it.  We can drop it now */
                        __jbd2_journal_remove_checkpoint(jh);
                        goto zap_buffer;
                }

                /* OK, it must be in the journal but still not
                 * written fully to disk: it's metadata or
                 * journaled data... */

                if (journal->j_running_transaction) {
                        /* ... and once the current transaction has
                         * committed, the buffer won't be needed any
                         * longer. */
                        JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
                        may_free = __dispose_buffer(jh,
                                        journal->j_running_transaction);
                        goto zap_buffer;
                } else {
                        /* There is no currently-running transaction. So the
                         * orphan record which we wrote for this file must have
                         * passed into commit.  We must attach this buffer to
                         * the committing transaction, if it exists. */
                        if (journal->j_committing_transaction) {
                                JBUFFER_TRACE(jh, "give to committing trans");
                                may_free = __dispose_buffer(jh,
                                        journal->j_committing_transaction);
                                goto zap_buffer;
                        } else {
                                /* The orphan record's transaction has
                                 * committed.  We can cleanse this buffer */
                                clear_buffer_jbddirty(bh);
                                __jbd2_journal_remove_checkpoint(jh);
                                goto zap_buffer;
                        }
                }
        } else if (transaction == journal->j_committing_transaction) {
                JBUFFER_TRACE(jh, "on committing transaction");
                /*
                 * The buffer is committing, we simply cannot touch
                 * it. If the page is straddling i_size we have to wait
                 * for commit and try again.
                 */
                if (partial_page) {
                        spin_unlock(&journal->j_list_lock);
                        spin_unlock(&jh->b_state_lock);
                        write_unlock(&journal->j_state_lock);
                        jbd2_journal_put_journal_head(jh);
                        /* Already zapped buffer? Nothing to do... */
                        if (!bh->b_bdev)
                                return 0;
                        return -EBUSY;
                }
                /*
                 * OK, buffer won't be reachable after truncate. We just clear
                 * b_modified to not confuse transaction credit accounting, and
                 * set b_next_transaction to the running transaction (if there
                 * is one) and mark buffer as freed so that commit code knows
                 * it should clear dirty bits when it is done with the buffer.
                 */
                set_buffer_freed(bh);
                if (journal->j_running_transaction && buffer_jbddirty(bh))
                        jh->b_next_transaction = journal->j_running_transaction;
                jh->b_modified = 0;
                spin_unlock(&journal->j_list_lock);
                spin_unlock(&jh->b_state_lock);
                write_unlock(&journal->j_state_lock);
                jbd2_journal_put_journal_head(jh);
                /* The buffer state is left as is; zapping is skipped. */
                return 0;
        } else {
                /* Good, the buffer belongs to the running transaction.
                 * We are writing our own transaction's data, not any
                 * previous one's, so it is safe to throw it away
                 * (remember that we expect the filesystem to have set
                 * i_size already for this truncate so recovery will not
                 * expose the disk blocks we are discarding here.) */
                J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
                JBUFFER_TRACE(jh, "on running transaction");
                may_free = __dispose_buffer(jh, transaction);
        }

zap_buffer:
        /*
         * This is tricky. Although the buffer is truncated, it may be reused
         * if blocksize < pagesize and it is attached to the page straddling
         * EOF. Since the buffer might have been added to BJ_Forget list of the
         * running transaction, journal_get_write_access() won't clear
         * b_modified and credit accounting gets confused. So clear b_modified
         * here.
         */
        jh->b_modified = 0;
        spin_unlock(&journal->j_list_lock);
        spin_unlock(&jh->b_state_lock);
        write_unlock(&journal->j_state_lock);
        jbd2_journal_put_journal_head(jh);
zap_buffer_unlocked:
        /*
         * Reset the buffer_head to an unmapped, clean state and forget its
         * block device; a NULL b_bdev is what the committing-transaction
         * path above tests to recognise an already zapped buffer.
         */
        clear_buffer_dirty(bh);
        J_ASSERT_BH(bh, !buffer_jbddirty(bh));
        clear_buffer_mapped(bh);
        clear_buffer_req(bh);
        clear_buffer_new(bh);
        clear_buffer_delay(bh);
        clear_buffer_unwritten(bh);
        bh->b_bdev = NULL;
        return may_free;
}

/**
 * jbd2_journal_invalidate_folio()
 * @journal: journal to use for flush...
 * @folio:    folio to flush
 * @offset:  start of the range to invalidate
 * @length:  length of the range to invalidate
 *
 * Reap page buffers containing data in the specified range of the folio.
 * Can return -EBUSY if buffers are part of the committing transaction and
 * the page is straddling i_size. Caller then has to wait for current commit
 * and try again.
 */
int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio,
                                size_t offset, size_t length)
{
        struct buffer_head *head, *bh;
        unsigned int stop = offset + length;
        unsigned int curr_off = 0;
        /* Anything short of the whole folio counts as a partial page. */
        int partial_page = (offset || length < folio_size(folio));
        int may_free = 1;

        if (!folio_test_locked(folio))
                BUG();

        /* A folio without buffers has nothing for the journal to release. */
        head = folio_buffers(folio);
        if (!head)
                return 0;

        BUG_ON(stop > folio_size(folio) || stop < length);

        /*
         * Lists other than the plain data lists may get touched here
         * (notably in journaled data mode), so every buffer is handled under
         * its own buffer lock.
         */
        bh = head;
        do {
                /* Fetch the successor first; bh is unmapped further down. */
                struct buffer_head *next = bh->b_this_page;
                unsigned int next_off = curr_off + bh->b_size;

                /* This buffer reaches past the range end: we are finished. */
                if (next_off > stop)
                        return 0;

                /* Only buffers that start at or after @offset are unmapped. */
                if (curr_off >= offset) {
                        int ret;

                        lock_buffer(bh);
                        ret = journal_unmap_buffer(journal, bh, partial_page);
                        unlock_buffer(bh);
                        if (ret < 0)
                                return ret;
                        may_free = may_free & ret;
                }

                curr_off = next_off;
                bh = next;
        } while (bh != head);

        /*
         * When the whole folio was invalidated and no buffer stayed pinned,
         * try to drop the buffers altogether.
         */
        if (!partial_page && may_free && try_to_free_buffers(folio))
                J_ASSERT(!folio_buffers(folio));

        return 0;
}

/*
 * File a buffer on the given transaction list.
 *
 * @jh:          journal head of the buffer to file
 * @transaction: transaction that is to own the buffer; jh must either be
 *               unowned or already belong to this transaction (asserted)
 * @jlist:       target list (BJ_None, BJ_Metadata, BJ_Forget, BJ_Shadow or
 *               BJ_Reserved)
 *
 * Nothing is done when jh already sits on @jlist of a transaction.  A jh
 * that had no transaction gets an extra journal_head reference taken on
 * behalf of @transaction.
 *
 * Called with jh->b_state_lock and the journal's j_list_lock held (both
 * are asserted).
 */
void __jbd2_journal_file_buffer(struct journal_head *jh,
                        transaction_t *transaction, int jlist)
{
        struct journal_head **list = NULL;
        int was_dirty = 0;
        struct buffer_head *bh = jh2bh(jh);

        lockdep_assert_held(&jh->b_state_lock);
        assert_spin_locked(&transaction->t_journal->j_list_lock);

        J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
        J_ASSERT_JH(jh, jh->b_transaction == transaction ||
                                jh->b_transaction == NULL);

        /* Already on the requested list: nothing to do. */
        if (jh->b_transaction && jh->b_jlist == jlist)
                return;

        if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
            jlist == BJ_Shadow || jlist == BJ_Forget) {
                /*
                 * For metadata buffers, we track dirty bit in buffer_jbddirty
                 * instead of buffer_dirty. We should not see a dirty bit set
                 * here because we clear it in do_get_write_access but e.g.
                 * tune2fs can modify the sb and set the dirty bit at any time
                 * so we try to gracefully handle that.
                 */
                if (buffer_dirty(bh))
                        warn_dirty_buffer(bh);
                /*
                 * Clear whichever dirty bit is set and remember it, so that
                 * jbddirty can be set again once the buffer has been filed.
                 */
                if (test_clear_buffer_dirty(bh) ||
                    test_clear_buffer_jbddirty(bh))
                        was_dirty = 1;
        }

        /*
         * A buffer that already belongs to the transaction is only moved
         * between lists; a previously unowned one gets a new journal_head
         * reference.
         */
        if (jh->b_transaction)
                __jbd2_journal_temp_unlink_buffer(jh);
        else
                jbd2_journal_grab_journal_head(bh);
        jh->b_transaction = transaction;

        switch (jlist) {
        case BJ_None:
                /* Owned by the transaction but not put on any list. */
                J_ASSERT_JH(jh, !jh->b_committed_data);
                J_ASSERT_JH(jh, !jh->b_frozen_data);
                return;
        case BJ_Metadata:
                /* The metadata list additionally keeps a buffer count. */
                transaction->t_nr_buffers++;
                list = &transaction->t_buffers;
                break;
        case BJ_Forget:
                list = &transaction->t_forget;
                break;
        case BJ_Shadow:
                list = &transaction->t_shadow_list;
                break;
        case BJ_Reserved:
                list = &transaction->t_reserved_list;
                break;
        }

        __blist_add_buffer(list, jh);
        jh->b_jlist = jlist;

        if (was_dirty)
                set_buffer_jbddirty(bh);
}

/*
 * Locked wrapper around __jbd2_journal_file_buffer(): takes the buffer
 * state lock and the journal's list lock, files @jh on @jlist of
 * @transaction and releases both locks again.
 */
void jbd2_journal_file_buffer(struct journal_head *jh,
                                transaction_t *transaction, int jlist)
{
        journal_t *journal = transaction->t_journal;

        spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);

        __jbd2_journal_file_buffer(jh, transaction, jlist);

        spin_unlock(&journal->j_list_lock);
        spin_unlock(&jh->b_state_lock);
}

/*
 * Remove a buffer from its current buffer list in preparation for
 * dropping it from its current transaction entirely.  If the buffer has
 * already started to be used by a subsequent transaction, refile the
 * buffer on that transaction's metadata list.
 *
 * Called under j_list_lock
 * Called under jh->b_state_lock
 *
 * When this function returns true, there's no next transaction to refile to
 * and the caller has to drop jh reference through
 * jbd2_journal_put_journal_head().
 */
bool __jbd2_journal_refile_buffer(struct journal_head *jh)
{
        int was_dirty, jlist;
        struct buffer_head *bh = jh2bh(jh);

        lockdep_assert_held(&jh->b_state_lock);
        if (jh->b_transaction)
                assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);

        /* If the buffer is now unused, just drop it. */
        if (jh->b_next_transaction == NULL) {
                __jbd2_journal_unfile_buffer(jh);
                return true;
        }

        /*
         * It has been modified by a later transaction: add it to the new
         * transaction's metadata list.
         */

        /*
         * Save and clear jbddirty before unlinking: the temp unlink would
         * otherwise turn a set jbddirty bit into an ordinary dirty bit.
         * The saved state is restored at the end.
         */
        was_dirty = test_clear_buffer_jbddirty(bh);
        __jbd2_journal_temp_unlink_buffer(jh);

        /*
         * b_transaction must be set, otherwise the new b_transaction won't
         * be holding jh reference
         */
        J_ASSERT_JH(jh, jh->b_transaction != NULL);

        /*
         * We set b_transaction here because b_next_transaction will inherit
         * our jh reference and thus __jbd2_journal_file_buffer() must not
         * take a new one.
         */
        WRITE_ONCE(jh->b_transaction, jh->b_next_transaction);
        WRITE_ONCE(jh->b_next_transaction, NULL);
        /*
         * Choose the target list: freed buffers go to the forget list,
         * modified ones to the metadata list, all others to the reserved
         * list.
         */
        if (buffer_freed(bh))
                jlist = BJ_Forget;
        else if (jh->b_modified)
                jlist = BJ_Metadata;
        else
                jlist = BJ_Reserved;
        __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
        /* The transaction taking over the buffer has to be the running one. */
        J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);

        if (was_dirty)
                set_buffer_jbddirty(bh);
        return false;
}

/*
 * __jbd2_journal_refile_buffer() with necessary locking added. We take our
 * bh reference so that we can safely unlock bh.
 *
 * The jh and bh may be freed by this call.
 */
void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
{
        bool need_put;

        spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);
        need_put = __jbd2_journal_refile_buffer(jh);
        spin_unlock(&jh->b_state_lock);
        spin_unlock(&journal->j_list_lock);

        /*
         * A true result means there was no next transaction to refile to,
         * so the journal_head reference has to be dropped by us -- only
         * after both locks have been released.
         */
        if (!need_put)
                return;

        jbd2_journal_put_journal_head(jh);
}

/*
 * File @jinode on the inode list of the transaction that @handle belongs
 * to, merging @flags and the byte range [@start_byte, @end_byte] into it.
 *
 * Returns -EROFS when the handle is aborted, 0 otherwise.
 */
static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
                unsigned long flags, loff_t start_byte, loff_t end_byte)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;

        if (is_handle_aborted(handle))
                return -EROFS;
        journal = transaction->t_journal;

        jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
                        transaction->t_tid);

        spin_lock(&journal->j_list_lock);
        jinode->i_flags |= flags;

        /*
         * With no end recorded yet (i_dirty_end == 0) the new range is
         * stored as is; otherwise the existing range is widened to cover it.
         */
        if (!jinode->i_dirty_end) {
                jinode->i_dirty_start = start_byte;
                jinode->i_dirty_end = end_byte;
        } else {
                jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte);
                jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte);
        }

        /* Only do the filing when the inode is not yet attached to us. */
        if (jinode->i_transaction != transaction &&
            jinode->i_next_transaction != transaction) {
                /*
                 * The flag is only ever set to 1, so testing it first is
                 * safe; as it is likely set already, the test avoids
                 * needless cacheline bouncing.
                 */
                if (!transaction->t_need_data_flush)
                        transaction->t_need_data_flush = 1;

                if (jinode->i_transaction) {
                        /*
                         * Already on another transaction's list, which
                         * should be the committing one: queue ourselves
                         * as the next owner.
                         */
                        J_ASSERT(jinode->i_next_transaction == NULL);
                        J_ASSERT(jinode->i_transaction ==
                                        journal->j_committing_transaction);
                        jinode->i_next_transaction = transaction;
                } else {
                        /* Not on any transaction list: take it directly. */
                        J_ASSERT(!jinode->i_next_transaction);
                        jinode->i_transaction = transaction;
                        list_add(&jinode->i_list, &transaction->t_inode_list);
                }
        }
        spin_unlock(&journal->j_list_lock);

        return 0;
}

/*
 * File @jinode with JI_WRITE_DATA | JI_WAIT_DATA for the @length bytes
 * starting at @start_byte.  The range handed on is inclusive, so its last
 * byte is start_byte + length - 1.  Returns what jbd2_journal_file_inode()
 * returns.
 */
int jbd2_journal_inode_ranged_write(handle_t *handle,
                struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
{
        loff_t end_byte = start_byte + length - 1;

        return jbd2_journal_file_inode(handle, jinode,
                        JI_WRITE_DATA | JI_WAIT_DATA, start_byte, end_byte);
}

/*
 * File @jinode with JI_WAIT_DATA only for the @length bytes starting at
 * @start_byte.  The range handed on is inclusive, so its last byte is
 * start_byte + length - 1.  Returns what jbd2_journal_file_inode() returns.
 */
int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *jinode,
                loff_t start_byte, loff_t length)
{
        loff_t end_byte = start_byte + length - 1;

        return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA,
                        start_byte, end_byte);
}

/*
 * File truncate and transaction commit interact with each other in a
 * non-trivial way.  If a transaction writing data block A is
 * committing, we cannot discard the data by truncate until we have
 * written them.  Otherwise if we crashed after the transaction with
 * write has committed but before the transaction with truncate has
 * committed, we could see stale data in block A.  This function is a
 * helper to solve this problem.  It starts writeout of the truncated
 * part in case it is in the committing transaction.
 *
 * Filesystem code must call this function when inode is journaled in
 * ordered mode before truncation happens and after the inode has been
 * placed on orphan list with the new inode size. The second condition
 * avoids the race that someone writes new data and we start
 * committing the transaction after this function has been called but
 * before a transaction for truncate is started (and furthermore it
 * allows us to optimize the case where the addition to orphan list
 * happens in the same transaction as write --- we don't have to write
 * any data in such case).
 */
int jbd2_journal_begin_ordered_truncate(journal_t *journal,
                                        struct jbd2_inode *jinode,
                                        loff_t new_size)
{
        transaction_t *committing, *owner;
        int err;

        /* Unlocked quick check: inode is on no transaction, nothing to do. */
        if (!jinode->i_transaction)
                return 0;

        /*
         * The locks only force reading of recent values; it is enough that
         * the transaction was not committing before we started a
         * transaction adding the inode to the orphan list.
         */
        read_lock(&journal->j_state_lock);
        committing = journal->j_committing_transaction;
        read_unlock(&journal->j_state_lock);

        spin_lock(&journal->j_list_lock);
        owner = jinode->i_transaction;
        spin_unlock(&journal->j_list_lock);

        if (owner != committing)
                return 0;

        /* Inode sits in the committing transaction: start the writeout. */
        err = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
                new_size, LLONG_MAX);
        if (err)
                jbd2_journal_abort(journal, err);
        return err;
}



































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_H
#define _ASM_X86_UACCESS_H
/*
 * User space memory access functions
 */
#include <linux/compiler.h>
#include <linux/instrumented.h>
#include <linux/kasan-checks.h>
#include <linux/mm_types.h>
#include <linux/string.h>
#include <linux/mmap_lock.h>
#include <asm/asm.h>
#include <asm/page.h>
#include <asm/smap.h>
#include <asm/extable.h>
#include <asm/tlbflush.h>

#ifdef CONFIG_X86_32
# include <asm/uaccess_32.h>
#else
# include <asm/uaccess_64.h>
#endif

#include <asm-generic/access_ok.h>

/*
 * Out-of-line get_user helpers, one per access size (1, 2, 4 and 8 bytes),
 * in a plain and a "nocheck" flavour.  They are not called as ordinary C
 * functions: do_get_user_call() emits "call __<fn>_<size>" from inline asm,
 * so the (void) prototypes do not describe how arguments are passed.
 *
 * __get_user_bad() is what __get_user_size() falls back to for a size it
 * does not handle.
 */
extern int __get_user_1(void);
extern int __get_user_2(void);
extern int __get_user_4(void);
extern int __get_user_8(void);
extern int __get_user_nocheck_1(void);
extern int __get_user_nocheck_2(void);
extern int __get_user_nocheck_4(void);
extern int __get_user_nocheck_8(void);
extern int __get_user_bad(void);

/*
 * Bracket a user-access region: __uaccess_begin() expands to stac() and
 * __uaccess_end() to clac().  The _nospec variant additionally issues
 * barrier_nospec() right after stac().
 */
#define __uaccess_begin() stac()
#define __uaccess_end()   clac()
#define __uaccess_begin_nospec()        \
({                                        \
        stac();                                \
        barrier_nospec();                \
})

/*
 * This is the smallest unsigned integer type that can fit a value
 * (up to 'long long')
 */
#define __inttype(x) __typeof__(                \
        __typefits(x,char,                        \
          __typefits(x,short,                        \
            __typefits(x,int,                        \
              __typefits(x,long,0ULL)))))

/*
 * __typefits(x, type, not): yields (unsigned type)0 when sizeof(x) is no
 * larger than sizeof(type), and the fallback expression 'not' otherwise.
 * __inttype() nests it from char up to long, with 0ULL as the last resort.
 */
#define __typefits(x,type,not) \
        __builtin_choose_expr(sizeof(x)<=sizeof(type),(unsigned type)0,not)

/*
 * This is used for both get_user() and __get_user() to expand to
 * the proper special function call that has odd calling conventions
 * due to returning both a value and an error, and that depends on
 * the size of the pointer passed in.
 *
 * Careful: we have to cast the result to the type of the pointer
 * for sign reasons.
 *
 * The use of _ASM_DX as the register specifier is a bit of a
 * simplification, as gcc only cares about it as the starting point
 * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits
 * (%ecx being the next register in gcc's x86 register sequence), and
 * %rdx on 64 bits.
 *
 * Clang/LLVM cares about the size of the register, but still wants
 * the base register for something that ends up being a pair.
 *
 * Operands: the status comes back in the "a" register (__ret_gu), the
 * fetched value in __val_gu, which is pinned to _ASM_DX, and ptr is
 * passed in through the same register as output operand 0.  The helper
 * that gets called is selected by pasting sizeof(*(ptr)) onto the name.
 * The macro evaluates to __ret_gu, hinted as normally being 0.
 */
#define do_get_user_call(fn,x,ptr)                                        \
({                                                                        \
        int __ret_gu;                                                        \
        register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX);                \
        __chk_user_ptr(ptr);                                                \
        asm volatile("call __" #fn "_%c[size]"                                \
                     : "=a" (__ret_gu), "=r" (__val_gu),                \
                        ASM_CALL_CONSTRAINT                                \
                     : "0" (ptr), [size] "i" (sizeof(*(ptr))));                \
        instrument_get_user(__val_gu);                                        \
        (x) = (__force __typeof__(*(ptr))) __val_gu;                        \
        __builtin_expect(__ret_gu, 0);                                        \
})

/**
 * get_user - Get a simple variable from user space.
 * @x:   Variable to store result.
 * @ptr: Source address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple variable from user space to kernel
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and the result of
 * dereferencing @ptr must be assignable to @x without a cast.
 *
 * The expansion is might_fault() followed by do_get_user_call() on the
 * get_user helper family.
 *
 * Return: zero on success, or -EFAULT on error.
 * On error, the variable @x is set to zero.
 */
#define get_user(x,ptr) ({ might_fault(); do_get_user_call(get_user,x,ptr); })

/**
 * __get_user - Get a simple variable from user space, with less checking.
 * @x:   Variable to store result.
 * @ptr: Source address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple variable from user space to kernel
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and the result of
 * dereferencing @ptr must be assignable to @x without a cast.
 *
 * Caller must check the pointer with access_ok() before calling this
 * function.
 *
 * Unlike get_user(), the expansion is do_get_user_call() on the
 * get_user_nocheck helper family, with no preceding might_fault().
 *
 * Return: zero on success, or -EFAULT on error.
 * On error, the variable @x is set to zero.
 */
#define __get_user(x,ptr) do_get_user_call(get_user_nocheck,x,ptr)


/*
 * Store a 64-bit value to user space.  On 32-bit the value is bound with
 * the "A" constraint and written as two 32-bit moves (%eax to offset 0,
 * %edx to offset 4 of the address), each with its own exception-table
 * entry targeting @label.  On 64-bit it is a single __put_user_goto() with
 * the "q" instruction suffix and the "er" value constraint.
 */
#ifdef CONFIG_X86_32
#define __put_user_goto_u64(x, addr, label)                        \
        asm goto("\n"                                        \
                     "1:        movl %%eax,0(%1)\n"                \
                     "2:        movl %%edx,4(%1)\n"                \
                     _ASM_EXTABLE_UA(1b, %l2)                        \
                     _ASM_EXTABLE_UA(2b, %l2)                        \
                     : : "A" (x), "r" (addr)                        \
                     : : label)

#else
#define __put_user_goto_u64(x, ptr, label) \
        __put_user_goto(x, ptr, "q", "er", label)
#endif

/* Called from the default case of __put_user_size() for unhandled sizes. */
extern void __put_user_bad(void);

/*
 * Strange magic calling convention: pointer in %ecx,
 * value in %eax(:%edx), return value in %ecx. clobbers %rbx
 *
 * These helpers are reached through the inline asm "call" emitted by
 * do_put_user_call(), one per access size, in a plain and a "nocheck"
 * flavour; the (void) prototypes do not describe the argument passing.
 */
extern void __put_user_1(void);
extern void __put_user_2(void);
extern void __put_user_4(void);
extern void __put_user_8(void);
extern void __put_user_nocheck_1(void);
extern void __put_user_nocheck_2(void);
extern void __put_user_nocheck_4(void);
extern void __put_user_nocheck_8(void);

/*
 * ptr must be evaluated and assigned to the temporary __ptr_pu before
 * the assignment of x to __val_pu, to avoid any function calls
 * involved in the ptr expression (possibly implicitly generated due
 * to KASAN) from clobbering %ax.
 *
 * Operands: the pointer goes in and the status (__ret_pu) comes back
 * through the "c" register, the value is held in __val_pu, which is
 * pinned to _ASM_AX, and "ebx" is listed as clobbered.  The helper that
 * gets called is selected by pasting sizeof(*(ptr)) onto the name.
 * instrument_put_user() runs after the call, and the macro evaluates to
 * __ret_pu, hinted as normally being 0.
 */
#define do_put_user_call(fn,x,ptr)                                        \
({                                                                        \
        int __ret_pu;                                                        \
        void __user *__ptr_pu;                                                \
        register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX);                \
        __typeof__(*(ptr)) __x = (x); /* eval x once */                        \
        __typeof__(ptr) __ptr = (ptr); /* eval ptr once */                \
        __chk_user_ptr(__ptr);                                                \
        __ptr_pu = __ptr;                                                \
        __val_pu = __x;                                                        \
        asm volatile("call __" #fn "_%c[size]"                                \
                     : "=c" (__ret_pu),                                        \
                        ASM_CALL_CONSTRAINT                                \
                     : "0" (__ptr_pu),                                        \
                       "r" (__val_pu),                                        \
                       [size] "i" (sizeof(*(ptr)))                        \
                     :"ebx");                                                \
        instrument_put_user(__x, __ptr, sizeof(*(ptr)));                \
        __builtin_expect(__ret_pu, 0);                                        \
})

/**
 * put_user - Write a simple value into user space.
 * @x:   Value to copy to user space.
 * @ptr: Destination address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple value from kernel space to user
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and @x must be assignable
 * to the result of dereferencing @ptr.
 *
 * The expansion is might_fault() followed by do_put_user_call() on the
 * put_user helper family.
 *
 * Return: zero on success, or -EFAULT on error.
 */
#define put_user(x, ptr) ({ might_fault(); do_put_user_call(put_user,x,ptr); })

/**
 * __put_user - Write a simple value into user space, with less checking.
 * @x:   Value to copy to user space.
 * @ptr: Destination address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple value from kernel space to user
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and @x must be assignable
 * to the result of dereferencing @ptr.
 *
 * Caller must check the pointer with access_ok() before calling this
 * function.
 *
 * Unlike put_user(), the expansion is do_put_user_call() on the
 * put_user_nocheck helper family, with no preceding might_fault().
 *
 * Return: zero on success, or -EFAULT on error.
 */
#define __put_user(x, ptr) do_put_user_call(put_user_nocheck,x,ptr)

/*
 * Store @x through user pointer @ptr with an access of @size bytes.  Both
 * arguments are evaluated exactly once.  Sizes 1, 2 and 4 are dispatched
 * to __put_user_goto() with the matching instruction suffix, size 8 to
 * __put_user_goto_u64(), and any other size to __put_user_bad().  @label
 * is handed through to the store helper.
 * NOTE(review): presumably the fault target, as in the 32-bit
 * __put_user_goto_u64(); __put_user_goto() itself is defined outside this
 * view, so confirm there.
 * instrument_put_user() is invoked after the switch.
 */
#define __put_user_size(x, ptr, size, label)                                \
do {                                                                        \
        __typeof__(*(ptr)) __x = (x); /* eval x once */                        \
        __typeof__(ptr) __ptr = (ptr); /* eval ptr once */                \
        __chk_user_ptr(__ptr);                                                \
        switch (size) {                                                        \
        case 1:                                                                \
                __put_user_goto(__x, __ptr, "b", "iq", label);                \
                break;                                                        \
        case 2:                                                                \
                __put_user_goto(__x, __ptr, "w", "ir", label);                \
                break;                                                        \
        case 4:                                                                \
                __put_user_goto(__x, __ptr, "l", "ir", label);                \
                break;                                                        \
        case 8:                                                                \
                __put_user_goto_u64(__x, __ptr, label);                        \
                break;                                                        \
        default:                                                        \
                __put_user_bad();                                        \
        }                                                                \
        instrument_put_user(__x, __ptr, size);                                \
} while (0)

#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * 64-bit user read for the asm-goto-output flavour.  32-bit kernels read
 * two consecutive 32-bit words (the one at ptr first, then the one after
 * it) and combine them as (high << 32) | low; other kernels issue a single
 * "q"-sized __get_user_asm().  @label is passed through to __get_user_asm().
 */
#ifdef CONFIG_X86_32
#define __get_user_asm_u64(x, ptr, label) do {                                \
        unsigned int __gu_low, __gu_high;                                \
        const unsigned int __user *__gu_ptr;                                \
        __gu_ptr = (const void __user *)(ptr);                                \
        __get_user_asm(__gu_low, __gu_ptr, "l", "=r", label);                \
        __get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label);        \
        (x) = ((unsigned long long)__gu_high << 32) | __gu_low;                \
} while (0)
#else
#define __get_user_asm_u64(x, ptr, label)                                \
        __get_user_asm(x, ptr, "q", "=r", label)
#endif

/*
 * Read @size bytes from user pointer @ptr into @x (asm-goto-output
 * flavour).  Size 1 is read into an unsigned char temporary with the "=q"
 * constraint and then assigned to @x; sizes 2 and 4 read straight into @x
 * with "=r"; size 8 uses __get_user_asm_u64(); any other size assigns the
 * result of __get_user_bad().  @label is passed through to the asm
 * helpers, and instrument_get_user() is invoked after the switch.
 */
#define __get_user_size(x, ptr, size, label)                                \
do {                                                                        \
        __chk_user_ptr(ptr);                                                \
        switch (size) {                                                        \
        case 1:        {                                                        \
                unsigned char x_u8__;                                        \
                __get_user_asm(x_u8__, ptr, "b", "=q", label);                \
                (x) = x_u8__;                                                \
                break;                                                        \
        }                                                                \
        case 2:                                                                \
                __get_user_asm(x, ptr, "w", "=r", label);                \
                break;                                                        \
        case 4:                                                                \
                __get_user_asm(x, ptr, "l", "=r", label);                \
                break;                                                        \
        case 8:                                                                \
                __get_user_asm_u64(x, ptr, label);                        \
                break;                                                        \
        default:                                                        \
                (x) = __get_user_bad();                                        \
        }                                                                \
        instrument_get_user(x);                                                \
} while (0)

/*
 * Single "mov" from user memory into @x (asm-goto-output flavour).
 * @itype is the instruction suffix, @ltype the output constraint for @x,
 * and @addr is presented to the asm as an "m" operand through __m().  An
 * exception-table entry for the load targets @label.
 */
#define __get_user_asm(x, addr, itype, ltype, label)                        \
        asm_goto_output("\n"                                                \
                     "1:        mov"itype" %[umem],%[output]\n"                \
                     _ASM_EXTABLE_UA(1b, %l2)                                \
                     : [output] ltype(x)                                \
                     : [umem] "m" (__m(addr))                                \
                     : : label)

#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * 64-bit user read for compilers without asm goto outputs.  On 32-bit two
 * movl instructions load the low word into %eax and the high word into
 * %edx, with the result bound to @x through the early-clobber "=&A"
 * constraint.  Both loads have an exception-table entry of type
 * EX_TYPE_EFAULT_REG | EX_FLAG_CLEAR_AX_DX that names the @retval register
 * and resumes at label 3; @retval is both an input ("0") and an output.
 * Other kernels issue a single "q"-sized __get_user_asm().
 */
#ifdef CONFIG_X86_32
#define __get_user_asm_u64(x, ptr, retval)                                \
({                                                                        \
        __typeof__(ptr) __ptr = (ptr);                                        \
        asm volatile("\n"                                                \
                     "1:        movl %[lowbits],%%eax\n"                \
                     "2:        movl %[highbits],%%edx\n"                \
                     "3:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG |        \
                                           EX_FLAG_CLEAR_AX_DX,                \
                                           %[errout])                        \
                     _ASM_EXTABLE_TYPE_REG(2b, 3b, EX_TYPE_EFAULT_REG |        \
                                           EX_FLAG_CLEAR_AX_DX,                \
                                           %[errout])                        \
                     : [errout] "=r" (retval),                                \
                       [output] "=&A"(x)                                \
                     : [lowbits] "m" (__m(__ptr)),                        \
                       [highbits] "m" __m(((u32 __user *)(__ptr)) + 1),        \
                       "0" (retval));                                        \
})

#else
#define __get_user_asm_u64(x, ptr, retval) \
         __get_user_asm(x, ptr, retval, "q")
#endif

/*
 * Read @size bytes from user pointer @ptr into @x for compilers without
 * asm goto outputs.  @retval is set to 0 up front and then handed to the
 * asm helper as its error operand.  Size 1 is read into an unsigned char
 * temporary and then assigned to @x; sizes 2 and 4 read straight into @x;
 * size 8 uses __get_user_asm_u64(); any other size assigns the result of
 * __get_user_bad().
 */
#define __get_user_size(x, ptr, size, retval)                                \
do {                                                                        \
        unsigned char x_u8__;                                                \
                                                                        \
        retval = 0;                                                        \
        __chk_user_ptr(ptr);                                                \
        switch (size) {                                                        \
        case 1:                                                                \
                __get_user_asm(x_u8__, ptr, retval, "b");                \
                (x) = x_u8__;                                                \
                break;                                                        \
        case 2:                                                                \
                __get_user_asm(x, ptr, retval, "w");                        \
                break;                                                        \
        case 4:                                                                \
                __get_user_asm(x, ptr, retval, "l");                        \
                break;                                                        \
        case 8:                                                                \
                __get_user_asm_u64(x, ptr, retval);                        \
                break;                                                        \
        default:                                                        \
                (x) = __get_user_bad();                                        \
        }                                                                \
} while (0)

/*
 * Single "mov" from user memory for compilers without asm goto outputs.
 * @itype is the instruction suffix; the loaded value is bound to @x with
 * the "=a" constraint and @addr is presented as an "m" operand through
 * __m().  The load has an exception-table entry of type
 * EX_TYPE_EFAULT_REG | EX_FLAG_CLEAR_AX that names the @err register and
 * resumes at label 2; @err is both an input ("0") and an output.
 */
#define __get_user_asm(x, addr, err, itype)                                \
        asm volatile("\n"                                                \
                     "1:        mov"itype" %[umem],%[output]\n"                \
                     "2:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG | \
                                           EX_FLAG_CLEAR_AX,                \
                                           %[errout])                        \
                     : [errout] "=r" (err),                                \
                       [output] "=a" (x)                                \
                     : [umem] "m" (__m(addr)),                                \
                       "0" (err))

#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT

#ifdef CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
/*
 * LOCK-prefixed cmpxchg on *_ptr.  @itype is the instruction suffix and
 * @ltype the constraint for the new value.  The expected value is read
 * from *_pold into the "a" register operand; the outcome is taken from
 * the Z flag output.  When the compare fails, the updated "a" operand is
 * written back through @_pold.  The instruction has an exception-table
 * entry targeting @label.  Evaluates to the success flag.
 */
#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label)        ({ \
        bool success;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm_goto_output("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
                     _ASM_EXTABLE_UA(1b, %l[label])                        \
                     : CC_OUT(z) (success),                                \
                       [ptr] "+m" (*_ptr),                                \
                       [old] "+a" (__old)                                \
                     : [new] ltype (__new)                                \
                     : "memory"                                                \
                     : label);                                                \
        if (unlikely(!success))                                                \
                *_old = __old;                                                \
        likely(success);                                        })

/*
 * 32-bit only: 64-bit compare-and-exchange on *_ptr via LOCK-prefixed
 * cmpxchg8b.  The expected value is bound with the "+A" constraint, and
 * the new value is split into its low 32 bits ("b") and high 32 bits
 * ("c").  The outcome is taken from the Z flag output; when the compare
 * fails, the updated expected-value operand is written back through
 * @_pold.  The instruction has an exception-table entry targeting @label.
 * Evaluates to the success flag.
 */
#ifdef CONFIG_X86_32
#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)        ({        \
        bool success;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm_goto_output("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"                \
                     _ASM_EXTABLE_UA(1b, %l[label])                        \
                     : CC_OUT(z) (success),                                \
                       "+A" (__old),                                        \
                       [ptr] "+m" (*_ptr)                                \
                     : "b" ((u32)__new),                                \
                       "c" ((u32)((u64)__new >> 32))                        \
                     : "memory"                                                \
                     : label);                                                \
        if (unlikely(!success))                                                \
                *_old = __old;                                                \
        likely(success);                                        })
#endif // CONFIG_X86_32
#else  // !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
/*
 * Variant for compilers without asm goto tied outputs.  The cmpxchg has
 * an exception-table entry of type EX_TYPE_EFAULT_REG that names the
 * __err register (initialised to 0) and resumes at label 2; the macro
 * itself jumps to @label when __err ends up non-zero.  Otherwise the
 * outcome is taken from the Z flag output, and when the compare fails the
 * updated "a" operand is written back through @_pold.  Evaluates to the
 * success flag.
 */
#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label)        ({ \
        int __err = 0;                                                        \
        bool success;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm volatile("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
                     CC_SET(z)                                                \
                     "2:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG,        \
                                           %[errout])                        \
                     : CC_OUT(z) (success),                                \
                       [errout] "+r" (__err),                                \
                       [ptr] "+m" (*_ptr),                                \
                       [old] "+a" (__old)                                \
                     : [new] ltype (__new)                                \
                     : "memory");                                        \
        if (unlikely(__err))                                                \
                goto label;                                                \
        if (unlikely(!success))                                                \
                *_old = __old;                                                \
        likely(success);                                        })

#ifdef CONFIG_X86_32
/*
 * Unlike the normal CMPXCHG, use output GPR for both success/fail and error.
 * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are
 * hardcoded by CMPXCHG8B, leaving only ESI and EDI.  If the compiler uses
 * both ESI and EDI for the memory operand, compilation will fail if the error
 * is an input+output as there will be no register available for input.
 *
 * After a cmpxchg8b that did not fault, __result is cleared and its low
 * byte is then set from the Z flag, so it is 1 on success and 0 when the
 * compare failed.  A negative __result sends control to @label.
 * NOTE(review): presumably the EX_TYPE_EFAULT_REG fixup is what writes
 * the negative value into the register - confirm against the extable code.
 * When the compare fails, the updated expected value is written back
 * through @_pold.  Evaluates to __result.
 */
#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)        ({        \
        int __result;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm volatile("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"                \
                     "mov $0, %[result]\n\t"                                \
                     "setz %b[result]\n"                                \
                     "2:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG,        \
                                           %[result])                        \
                     : [result] "=q" (__result),                        \
                       "+A" (__old),                                        \
                       [ptr] "+m" (*_ptr)                                \
                     : "b" ((u32)__new),                                \
                       "c" ((u32)((u64)__new >> 32))                        \
                     : "memory", "cc");                                        \
        if (unlikely(__result < 0))                                        \
                goto label;                                                \
        if (unlikely(!__result))                                        \
                *_old = __old;                                                \
        likely(__result);                                        })
#endif // CONFIG_X86_32
#endif // CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT

/* FIXME: this hack is definitely wrong -AK */
/*
 * __m(x) reinterprets the user pointer @x as a pointer to a struct
 * __large_struct (100 unsigned longs) and dereferences it, yielding an lvalue
 * that can be used as an asm "m" operand, as __put_user_goto() does.
 * NOTE(review): presumably the oversized dummy type keeps the compiler from
 * assuming only a small object is accessed - confirm.
 */
struct __large_struct { unsigned long buf[100]; };
#define __m(x) (*(struct __large_struct __user *)(x))

/*
 * Tell gcc we read from memory instead of writing: this is because
 * we do not write to any memory gcc knows about, so there are no
 * aliasing issues.
 *
 * __put_user_goto - emit a single "mov<itype>" of @x to the user address @addr
 * @x:     value to store, operand %0.
 * @addr:  user address; wrapped in __m() and passed as an "m" *input*
 *         operand (%1), per the note above.
 * @itype: instruction suffix string pasted after "mov".
 * @ltype: constraint string applied to @x.
 * @label: C label (%l2) named in the _ASM_EXTABLE_UA() entry for the
 *         instruction at 1:.
 *
 * The asm has no outputs; it is an "asm goto" whose only label is @label.
 */
#define __put_user_goto(x, addr, itype, ltype, label)                        \
        asm goto("\n"                                                        \
                "1:        mov"itype" %0,%1\n"                                \
                _ASM_EXTABLE_UA(1b, %l2)                                \
                : : ltype(x), "m" (__m(addr))                                \
                : : label)

/*
 * Out-of-line user-memory helpers; only their prototypes are visible here.
 * NOTE(review): the return-value conventions (bytes left uncopied, string
 * length, or negative errno) are defined by the implementations - confirm
 * there before relying on them.
 */
extern unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
/* The results of the two string helpers must not be ignored (__must_check). */
extern __must_check long
strncpy_from_user(char *dst, const char __user *src, long count);

extern __must_check long strnlen_user(const char __user *str, long n);

#ifdef CONFIG_ARCH_HAS_COPY_MC
/*
 * Copy helpers that are only declared when CONFIG_ARCH_HAS_COPY_MC is set.
 * Their results must be checked (__must_check).
 * NOTE(review): "mc" presumably refers to machine-check-tolerant copying and
 * the return value to the number of bytes not copied - confirm.
 */
unsigned long __must_check
copy_mc_to_kernel(void *to, const void *from, unsigned len);
/*
 * Self-referential define: makes "#ifdef copy_mc_to_kernel" true.
 * NOTE(review): presumably so generic code can supply a fallback when the
 * architecture does not provide this helper - confirm.
 */
#define copy_mc_to_kernel copy_mc_to_kernel

unsigned long __must_check
copy_mc_to_user(void __user *to, const void *from, unsigned len);
#endif

/*
 * movsl can be slow when source and dest are not both 8-byte aligned
 *
 * With CONFIG_X86_INTEL_USERCOPY a single global object, movsl_mask, with one
 * int member "mask" is declared here (defined elsewhere) and aligned with
 * ____cacheline_aligned_in_smp.
 * NOTE(review): how the mask is applied to decide for or against movsl is not
 * visible in this header - see the users of movsl_mask.mask.
 */
#ifdef CONFIG_X86_INTEL_USERCOPY
extern struct movsl_mask {
        int mask;
} ____cacheline_aligned_in_smp movsl_mask;
#endif

#define ARCH_HAS_NOCACHE_UACCESS 1

/*
 * The "unsafe" user accessors are not dangerous by themselves; the name is a
 * loud reminder of their calling contract: the range must have passed
 * access_ok(), and every access must sit between user_access_begin() and
 * user_access_end().
 *
 * user_access_begin() performs the access_ok() check on [@ptr, @ptr + @len)
 * and, only when it passes, opens the access window with
 * __uaccess_begin_nospec().  It returns non-zero when the window was opened
 * and zero when the range was rejected (in which case nothing was opened).
 */
static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len)
{
        if (likely(access_ok(ptr, len))) {
                __uaccess_begin_nospec();
                return 1;
        }

        return 0;
}
/*
 * Function-like macro with the same name as the inline function above.  A
 * macro is not re-expanded inside its own expansion, so this still calls the
 * function, while making "user_access_begin" visible to #ifdef tests.
 */
#define user_access_begin(a,b)        user_access_begin(a,b)
/* Counterpart of user_access_begin(); expands to __uaccess_end(). */
#define user_access_end()        __uaccess_end()

/* Thin wrappers around smap_save() and smap_restore(). */
#define user_access_save()        smap_save()
#define user_access_restore(x)        smap_restore(x)

/*
 * unsafe_put_user - store @x through the user pointer @ptr
 * @x is cast to the pointed-to type of @ptr and the access size is
 * sizeof(*(ptr)); @label is handed to __put_user_size().
 * NOTE(review): the jump to @label on a fault is implemented by
 * __put_user_size(), which is not visible here - confirm.
 */
#define unsafe_put_user(x, ptr, label)        \
        __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label)

/*
 * unsafe_get_user - load *@ptr from user space into @x
 * @x:         lvalue receiving the loaded value.
 * @ptr:       user pointer; sizeof(*(ptr)) is the access size.
 * @err_label: label used for the failure path.
 *
 * The value is first read into an __inttype(*(ptr)) temporary and then
 * force-cast to the pointed-to type of @ptr.
 *
 * Two variants, selected by CONFIG_CC_HAS_ASM_GOTO_OUTPUT:
 *  - with asm-goto outputs, @err_label is handed straight to
 *    __get_user_size();
 *  - otherwise __get_user_size() reports through the local int __gu_err and
 *    this macro jumps to @err_label itself when it is non-zero.  In this
 *    variant @x is assigned before the error check, i.e. also on failure.
 */
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define unsafe_get_user(x, ptr, err_label)                                        \
do {                                                                                \
        __inttype(*(ptr)) __gu_val;                                                \
        __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), err_label);                \
        (x) = (__force __typeof__(*(ptr)))__gu_val;                                \
} while (0)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define unsafe_get_user(x, ptr, err_label)                                        \
do {                                                                                \
        int __gu_err;                                                                \
        __inttype(*(ptr)) __gu_val;                                                \
        __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err);                \
        (x) = (__force __typeof__(*(ptr)))__gu_val;                                \
        if (unlikely(__gu_err)) goto err_label;                                        \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * Called from the default: case of unsafe_try_cmpxchg_user(), i.e. for any
 * operand size other than 1, 2, 4 or 8 bytes.
 * NOTE(review): presumably never defined, so that an unsupported size fails
 * at link time - confirm.
 */
extern void __try_cmpxchg_user_wrong_size(void);

/*
 * When not building for X86_32, the 8-byte variant is simply the generic
 * helper with the "q" instruction suffix and an "r" constraint for the new
 * value.
 */
#ifndef CONFIG_X86_32
#define __try_cmpxchg64_user_asm(_ptr, _oldp, _nval, _label)                \
        __try_cmpxchg_user_asm("q", "r", (_ptr), (_oldp), (_nval), _label)
#endif

/*
 * Force the pointer to u<size> to match the size expected by the asm helper.
 * clang/LLVM compiles all cases and only discards the unused paths after
 * processing errors, which breaks i386 if the pointer is an 8-byte value.
 *
 * unsafe_try_cmpxchg_user - size-dispatching compare-and-exchange on user memory
 * @_ptr:   user pointer (checked with __chk_user_ptr()); sizeof(*(_ptr))
 *          selects the 1-, 2-, 4- or 8-byte helper.
 * @_oldp:  pointer to the expected value, passed through to the helper.
 * @_nval:  new value, passed through to the helper.
 * @_label: label passed through to the helper as its fault target.
 *
 * Evaluates to the bool produced by the selected helper.  For any other size
 * __try_cmpxchg_user_wrong_size() is referenced and __ret is left unassigned.
 * This macro does not open or close the user-access window itself; see
 * __try_cmpxchg_user() for a wrapper that does.
 */
#define unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({                        \
        bool __ret;                                                                \
        __chk_user_ptr(_ptr);                                                        \
        switch (sizeof(*(_ptr))) {                                                \
        case 1:        __ret = __try_cmpxchg_user_asm("b", "q",                        \
                                               (__force u8 *)(_ptr), (_oldp),        \
                                               (_nval), _label);                \
                break;                                                                \
        case 2:        __ret = __try_cmpxchg_user_asm("w", "r",                        \
                                               (__force u16 *)(_ptr), (_oldp),        \
                                               (_nval), _label);                \
                break;                                                                \
        case 4:        __ret = __try_cmpxchg_user_asm("l", "r",                        \
                                               (__force u32 *)(_ptr), (_oldp),        \
                                               (_nval), _label);                \
                break;                                                                \
        case 8:        __ret = __try_cmpxchg64_user_asm((__force u64 *)(_ptr), (_oldp),\
                                                 (_nval), _label);                \
                break;                                                                \
        default: __try_cmpxchg_user_wrong_size();                                \
        }                                                                        \
        __ret;                                                })

/* "Returns" 0 on success, 1 on failure, -EFAULT if the access faults. */
#define __try_cmpxchg_user(_ptr, _oldp, _nval, _label)        ({                \
        int __ret = -EFAULT;                                                \
        __uaccess_begin_nospec();                                        \
        __ret = !unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label);        \
_label:                                                                        \
        __uaccess_end();                                                \
        __ret;                                                                \
                                                        })

/*
 * We want the unsafe accessors to always be inlined and use
 * the error labels - thus the macro games.
 *
 * unsafe_copy_loop() stores sizeof(type)-sized units with unsafe_put_user()
 * for as long as at least sizeof(type) bytes remain.  It advances @dst and
 * @src and decrements @len in place, so all three must be modifiable lvalues.
 */
#define unsafe_copy_loop(dst, src, len, type, label)                                \
        while (len >= sizeof(type)) {                                                \
                unsafe_put_user(*(type *)(src),(type __user *)(dst),label);        \
                dst += sizeof(type);                                                \
                src += sizeof(type);                                                \
                len -= sizeof(type);                                                \
        }

/*
 * unsafe_copy_to_user - copy @_len bytes from kernel @_src to user @_dst
 *
 * The arguments are evaluated once into locals.  The copy then proceeds in
 * u64 units while 8 or more bytes remain, followed by the u32, u16 and u8
 * loops for the tail (each of which can run at most once, since the previous
 * loop leaves fewer bytes than its own unit size).  @label is passed to every
 * unsafe_put_user() call.
 */
#define unsafe_copy_to_user(_dst,_src,_len,label)                        \
do {                                                                        \
        char __user *__ucu_dst = (_dst);                                \
        const char *__ucu_src = (_src);                                        \
        size_t __ucu_len = (_len);                                        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label);        \
} while (0)

/*
 * __get_kernel_nofault - read one @type-sized value from @src into @dst
 * @dst:       destination, accessed as *(type *)(dst).
 * @src:       source address; force-cast to a __user pointer so that the
 *             user-access primitive __get_user_size() can be reused.
 * @type:      C type of the value; sizeof(type) is the access size.
 * @err_label: label used for the failure path.
 *
 * With CONFIG_CC_HAS_ASM_GOTO_OUTPUT the label is handed directly to
 * __get_user_size(); otherwise the error is collected in the local int
 * __kr_err and this macro jumps to @err_label when it is non-zero.
 */
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define __get_kernel_nofault(dst, src, type, err_label)                        \
        __get_user_size(*((type *)(dst)), (__force type __user *)(src),        \
                        sizeof(type), err_label)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
#define __get_kernel_nofault(dst, src, type, err_label)                        \
do {                                                                        \
        int __kr_err;                                                        \
                                                                        \
        __get_user_size(*((type *)(dst)), (__force type __user *)(src),        \
                        sizeof(type), __kr_err);                        \
        if (unlikely(__kr_err))                                                \
                goto err_label;                                                \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * __put_kernel_nofault - write one @type-sized value from @src to @dst
 * Mirror image of __get_kernel_nofault(): the value *(type *)(src) is stored
 * through @dst, which is force-cast to a __user pointer for
 * __put_user_size(); @err_label is passed through as its label argument.
 */
#define __put_kernel_nofault(dst, src, type, err_label)                        \
        __put_user_size(*((type *)(src)), (__force type __user *)(dst),        \
                        sizeof(type), err_label)

#endif /* _ASM_X86_UACCESS_H */

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
// SPDX-License-Identifier: GPL-2.0
/*
 * device.h - generic, centralized driver model
 *
 * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de>
 * Copyright (c) 2008-2009 Novell Inc.
 *
 * See Documentation/driver-api/driver-model/ for more information.
 */

#ifndef _DEVICE_H_
#define _DEVICE_H_

#include <linux/dev_printk.h>
#include <linux/energy_model.h>
#include <linux/ioport.h>
#include <linux/kobject.h>
#include <linux/klist.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/mutex.h>
#include <linux/pm.h>
#include <linux/atomic.h>
#include <linux/uidgid.h>
#include <linux/gfp.h>
#include <linux/overflow.h>
#include <linux/device/bus.h>
#include <linux/device/class.h>
#include <linux/device/driver.h>
#include <linux/cleanup.h>
#include <asm/device.h>

struct device;
struct device_private;
struct device_driver;
struct driver_private;
struct module;
struct class;
struct subsys_private;
struct device_node;
struct fwnode_handle;
struct iommu_group;
struct dev_pin_info;
struct dev_iommu;
struct msi_device_data;

/**
 * struct subsys_interface - interfaces to device functions
 * @name:       name of the device function
 * @subsys:     subsystem of the devices to attach to
 * @node:       the list of functions registered at the subsystem
 * @add_dev:    device hookup to device function handler; receives the device
 *              and this interface and returns an int status
 * @remove_dev: device hookup to device function handler; receives the device
 *              and this interface and returns nothing.  NOTE(review):
 *              presumably the teardown counterpart of @add_dev - confirm.
 *
 * Simple interfaces attached to a subsystem. Multiple interfaces can
 * attach to a subsystem and its devices. Unlike drivers, they do not
 * exclusively claim or control devices. Interfaces usually represent
 * a specific functionality of a subsystem/class of devices.
 */
struct subsys_interface {
        const char *name;
        const struct bus_type *subsys;
        struct list_head node;
        int (*add_dev)(struct device *dev, struct subsys_interface *sif);
        void (*remove_dev)(struct device *dev, struct subsys_interface *sif);
};

/*
 * Register / unregister a struct subsys_interface.  Registration reports an
 * int status; unregistration returns nothing.
 */
int subsys_interface_register(struct subsys_interface *sif);
void subsys_interface_unregister(struct subsys_interface *sif);

/*
 * Register the bus_type @subsys together with an array of attribute-group
 * pointers; both return an int status.
 * NOTE(review): the "system" and "virtual" variants presumably differ in
 * where the subsystem is rooted in sysfs - confirm in the implementation.
 */
int subsys_system_register(const struct bus_type *subsys,
                           const struct attribute_group **groups);
int subsys_virtual_register(const struct bus_type *subsys,
                            const struct attribute_group **groups);

/*
 * The type of device, "struct device" is embedded in. A class
 * or bus can contain devices of different types
 * like "partitions" and "disks", "mouse" and "event".
 * This identifies the device type and carries type-specific
 * information, equivalent to the kobj_type of a kobject.
 * If "name" is specified, the uevent will contain it in
 * the DEVTYPE variable.
 *
 * Members:
 *   name    - type name, reported as DEVTYPE (see above).
 *   groups  - array of attribute-group pointers for this type.
 *   uevent  - callback given the device and a kobj_uevent_env; returns int.
 *   devnode - callback returning a char * that can also report a mode, uid
 *             and gid through its pointer arguments.
 *   release - callback taking the device; returns nothing.
 *   pm      - dev_pm_ops for this type.
 * NOTE(review): when the driver core invokes each callback is not visible in
 * this header - see the core implementation.
 */
struct device_type {
        const char *name;
        const struct attribute_group **groups;
        int (*uevent)(const struct device *dev, struct kobj_uevent_env *env);
        char *(*devnode)(const struct device *dev, umode_t *mode,
                         kuid_t *uid, kgid_t *gid);
        void (*release)(struct device *dev);

        const struct dev_pm_ops *pm;
};

/**
 * struct device_attribute - Interface for exporting device attributes.
 * @attr: sysfs attribute definition.
 * @show: Show handler. Receives the device, this attribute and a char
 *        buffer to fill; returns an ssize_t.
 * @store: Store handler. Receives the device, this attribute, a const char
 *         buffer and a size_t @count; returns an ssize_t.
 *
 * Instances are normally defined with the DEVICE_ATTR*() macros further
 * down in this file, which name the variable dev_attr_<name>.
 */
struct device_attribute {
        struct attribute        attr;
        ssize_t (*show)(struct device *dev, struct device_attribute *attr,
                        char *buf);
        ssize_t (*store)(struct device *dev, struct device_attribute *attr,
                         const char *buf, size_t count);
};

/**
 * struct dev_ext_attribute - Exported device attribute with extra context.
 * @attr: Exported device attribute.
 * @var: Pointer to context. DEVICE_ULONG_ATTR(), DEVICE_INT_ATTR() and
 *       DEVICE_BOOL_ATTR() store the address of the backing variable here;
 *       DEVICE_STRING_ATTR_RO() stores the string argument itself.
 */
struct dev_ext_attribute {
        struct device_attribute attr;
        void *var;
};

/*
 * Ready-made show/store handlers.  They are the handlers that the
 * DEVICE_ULONG_ATTR(), DEVICE_INT_ATTR(), DEVICE_BOOL_ATTR() and
 * DEVICE_STRING_ATTR_RO() macros install, so @attr is expected to be the
 * device_attribute embedded in a struct dev_ext_attribute.  There is no
 * string store handler: only device_show_string() exists.
 */
ssize_t device_show_ulong(struct device *dev, struct device_attribute *attr,
                          char *buf);
ssize_t device_store_ulong(struct device *dev, struct device_attribute *attr,
                           const char *buf, size_t count);
ssize_t device_show_int(struct device *dev, struct device_attribute *attr,
                        char *buf);
ssize_t device_store_int(struct device *dev, struct device_attribute *attr,
                         const char *buf, size_t count);
ssize_t device_show_bool(struct device *dev, struct device_attribute *attr,
                        char *buf);
ssize_t device_store_bool(struct device *dev, struct device_attribute *attr,
                         const char *buf, size_t count);
ssize_t device_show_string(struct device *dev, struct device_attribute *attr,
                           char *buf);

/**
 * DEVICE_ATTR - Define a device attribute.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_show: Show handler. Optional, but mandatory if attribute is readable.
 * @_store: Store handler. Optional, but mandatory if attribute is writable.
 *
 * Convenience macro for defining a struct device_attribute.
 *
 * For example, ``DEVICE_ATTR(foo, 0644, foo_show, foo_store);`` expands to:
 *
 * .. code-block:: c
 *
 *        struct device_attribute dev_attr_foo = {
 *                .attr        = { .name = "foo", .mode = 0644 },
 *                .show        = foo_show,
 *                .store        = foo_store,
 *        };
 *
 * The expansion carries no storage-class specifier, so the caller decides the
 * linkage, e.g. by writing ``static DEVICE_ATTR(...)``.
 */
#define DEVICE_ATTR(_name, _mode, _show, _store) \
        struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store)

/**
 * DEVICE_ATTR_PREALLOC - Define a preallocated device attribute.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_show: Show handler. Optional, but mandatory if attribute is readable.
 * @_store: Store handler. Optional, but mandatory if attribute is writable.
 *
 * Like DEVICE_ATTR(), but ``SYSFS_PREALLOC`` is set on @_mode.
 */
#define DEVICE_ATTR_PREALLOC(_name, _mode, _show, _store) \
        struct device_attribute dev_attr_##_name = \
                __ATTR_PREALLOC(_name, _mode, _show, _store)

/**
 * DEVICE_ATTR_RW - Define a read-write device attribute.
 * @_name: Attribute name.
 *
 * Like DEVICE_ATTR(), but @_mode is 0644, @_show is <_name>_show,
 * and @_store is <_name>_store.
 *
 * The variable defined is ``dev_attr_<_name>``, as for every DEVICE_ATTR_*()
 * variant in this group.
 */
#define DEVICE_ATTR_RW(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_RW(_name)

/**
 * DEVICE_ATTR_ADMIN_RW - Define an admin-only read-write device attribute.
 * @_name: Attribute name.
 *
 * Like DEVICE_ATTR_RW(), but @_mode is 0600.
 */
#define DEVICE_ATTR_ADMIN_RW(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_RW_MODE(_name, 0600)

/**
 * DEVICE_ATTR_RO - Define a readable device attribute.
 * @_name: Attribute name.
 *
 * Like DEVICE_ATTR(), but @_mode is 0444 and @_show is <_name>_show.
 */
#define DEVICE_ATTR_RO(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_RO(_name)

/**
 * DEVICE_ATTR_ADMIN_RO - Define an admin-only readable device attribute.
 * @_name: Attribute name.
 *
 * Like DEVICE_ATTR_RO(), but @_mode is 0400.
 */
#define DEVICE_ATTR_ADMIN_RO(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_RO_MODE(_name, 0400)

/**
 * DEVICE_ATTR_WO - Define an admin-only writable device attribute.
 * @_name: Attribute name.
 *
 * Like DEVICE_ATTR(), but @_mode is 0200 and @_store is <_name>_store.
 */
#define DEVICE_ATTR_WO(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_WO(_name)

/**
 * DEVICE_ULONG_ATTR - Define a device attribute backed by an unsigned long.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_var: Identifier of unsigned long.
 *
 * Like DEVICE_ATTR(), but @_show and @_store are automatically provided
 * such that reads and writes to the attribute from userspace affect @_var.
 *
 * Note that the variable defined, ``dev_attr_<_name>``, is a
 * struct dev_ext_attribute rather than a struct device_attribute; the
 * embedded device attribute is its ``.attr`` member and the address of @_var
 * is stored in ``.var``.
 */
#define DEVICE_ULONG_ATTR(_name, _mode, _var) \
        struct dev_ext_attribute dev_attr_##_name = \
                { __ATTR(_name, _mode, device_show_ulong, device_store_ulong), &(_var) }

/**
 * DEVICE_INT_ATTR - Define a device attribute backed by an int.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_var: Identifier of int.
 *
 * Like DEVICE_ULONG_ATTR(), but @_var is an int.
 */
#define DEVICE_INT_ATTR(_name, _mode, _var) \
        struct dev_ext_attribute dev_attr_##_name = \
                { __ATTR(_name, _mode, device_show_int, device_store_int), &(_var) }

/**
 * DEVICE_BOOL_ATTR - Define a device attribute backed by a bool.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_var: Identifier of bool.
 *
 * Like DEVICE_ULONG_ATTR(), but @_var is a bool.
 */
#define DEVICE_BOOL_ATTR(_name, _mode, _var) \
        struct dev_ext_attribute dev_attr_##_name = \
                { __ATTR(_name, _mode, device_show_bool, device_store_bool), &(_var) }

/**
 * DEVICE_STRING_ATTR_RO - Define a device attribute backed by a r/o string.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_var: Identifier of string.
 *
 * Like DEVICE_ULONG_ATTR(), but @_var is a string. Because the length of the
 * string allocation is unknown, the attribute must be read-only.
 *
 * To enforce that, all write permission bits (0222) are cleared from @_mode
 * and no store handler is installed.  Unlike the numeric variants, @_var
 * itself (not its address) is stored as the context pointer.
 */
#define DEVICE_STRING_ATTR_RO(_name, _mode, _var) \
        struct dev_ext_attribute dev_attr_##_name = \
                { __ATTR(_name, (_mode) & ~0222, device_show_string, NULL), (_var) }

/**
 * DEVICE_ATTR_IGNORE_LOCKDEP - Define a device attribute via __ATTR_IGNORE_LOCKDEP().
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_show: Show handler.
 * @_store: Store handler.
 *
 * Like DEVICE_ATTR(), but the initializer is __ATTR_IGNORE_LOCKDEP() instead
 * of __ATTR().  NOTE(review): presumably this exempts the attribute from
 * lockdep tracking - confirm against the definition of that macro.
 */
#define DEVICE_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \
        struct device_attribute dev_attr_##_name =                \
                __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store)

/*
 * Create / remove the sysfs file described by a device_attribute (or, for
 * the *_bin_file variants, by a bin_attribute) on a device.
 *
 * Only device_create_bin_file() is marked __must_check here;
 * device_create_file() also returns an int status.
 * NOTE(review): device_remove_file_self() returns a bool whose meaning is not
 * visible in this header - see its implementation.
 */
int device_create_file(struct device *device,
                       const struct device_attribute *entry);
void device_remove_file(struct device *dev,
                        const struct device_attribute *attr);
bool device_remove_file_self(struct device *dev,
                             const struct device_attribute *attr);
int __must_check device_create_bin_file(struct device *dev,
                                        const struct bin_attribute *attr);
void device_remove_bin_file(struct device *dev,
                            const struct bin_attribute *attr);

/* device resource management */
/*
 * dr_release_t: callback taking the device and the resource pointer @res.
 * dr_match_t:   predicate taking the device, the resource pointer and the
 *               caller-supplied @match_data; returns an int.
 */
typedef void (*dr_release_t)(struct device *dev, void *res);
typedef int (*dr_match_t)(struct device *dev, void *res, void *match_data);

/*
 * Low-level allocator behind devres_alloc() and devres_alloc_node().  Both
 * wrappers pass the stringified name of the release function as @name;
 * devres_alloc() additionally fixes @nid to NUMA_NO_NODE.
 */
void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp,
                          int nid, const char *name) __malloc;
#define devres_alloc(release, size, gfp) \
        __devres_alloc_node(release, size, gfp, NUMA_NO_NODE, #release)
#define devres_alloc_node(release, size, gfp, nid) \
        __devres_alloc_node(release, size, gfp, nid, #release)

/*
 * The functions below select resources of @dev through a (@release, @match,
 * @match_data) triple, except devres_free() and devres_add(), which take the
 * resource pointer directly, and devres_get(), which takes a new resource
 * pointer together with a match.
 * NOTE(review): the exact matching rules and the return conventions (pointer
 * vs. int status) are defined by the implementation - confirm there.
 */
void devres_for_each_res(struct device *dev, dr_release_t release,
                         dr_match_t match, void *match_data,
                         void (*fn)(struct device *, void *, void *),
                         void *data);
void devres_free(void *res);
void devres_add(struct device *dev, void *res);
void *devres_find(struct device *dev, dr_release_t release,
                  dr_match_t match, void *match_data);
void *devres_get(struct device *dev, void *new_res,
                 dr_match_t match, void *match_data);
void *devres_remove(struct device *dev, dr_release_t release,
                    dr_match_t match, void *match_data);
int devres_destroy(struct device *dev, dr_release_t release,
                   dr_match_t match, void *match_data);
int devres_release(struct device *dev, dr_release_t release,
                   dr_match_t match, void *match_data);

/*
 * devres group
 *
 * Groups are identified by the opaque @id pointer.  The result of
 * devres_open_group() must be checked (__must_check).
 * NOTE(review): the meaning of the int returned by devres_release_group() is
 * not visible in this header - confirm in the implementation.
 */
void * __must_check devres_open_group(struct device *dev, void *id, gfp_t gfp);
void devres_close_group(struct device *dev, void *id);
void devres_remove_group(struct device *dev, void *id);
int devres_release_group(struct device *dev, void *id);

/*
 * managed devm_k.alloc/kfree for device drivers
 *
 * devm_kmalloc() and devm_krealloc() are annotated so that the compiler
 * knows which argument carries the allocation size (argument 2 and 3
 * respectively); the result of devm_krealloc() must be checked.  The two
 * *asprintf variants are printf-style (format string is argument 3) and are
 * marked __malloc.
 */
void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __alloc_size(2);
void *devm_krealloc(struct device *dev, void *ptr, size_t size,
                    gfp_t gfp) __must_check __realloc_size(3);
__printf(3, 0) char *devm_kvasprintf(struct device *dev, gfp_t gfp,
                                     const char *fmt, va_list ap) __malloc;
__printf(3, 4) char *devm_kasprintf(struct device *dev, gfp_t gfp,
                                    const char *fmt, ...) __malloc;
/* devm_kmalloc() with __GFP_ZERO added to the caller's @gfp flags. */
static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp)
{
        const gfp_t zeroing_gfp = gfp | __GFP_ZERO;

        return devm_kmalloc(dev, size, zeroing_gfp);
}
/*
 * devm_kmalloc() for an array of @n elements of @size bytes each.
 * Returns NULL without allocating when @n * @size overflows size_t.
 */
static inline void *devm_kmalloc_array(struct device *dev,
                                       size_t n, size_t size, gfp_t flags)
{
        size_t total;

        if (likely(!check_mul_overflow(n, size, &total)))
                return devm_kmalloc(dev, total, flags);

        return NULL;
}
/* devm_kmalloc_array() with __GFP_ZERO added to the caller's @flags. */
static inline void *devm_kcalloc(struct device *dev,
                                 size_t n, size_t size, gfp_t flags)
{
        const gfp_t zeroing_flags = flags | __GFP_ZERO;

        return devm_kmalloc_array(dev, n, size, zeroing_flags);
}
/*
 * devm_krealloc() of @p to hold @new_n elements of @new_size bytes each.
 * Returns NULL without calling devm_krealloc() when @new_n * @new_size
 * overflows size_t.
 */
static inline __realloc_size(3, 4) void * __must_check
devm_krealloc_array(struct device *dev, void *p, size_t new_n, size_t new_size, gfp_t flags)
{
        size_t total;

        if (likely(!check_mul_overflow(new_n, new_size, &total)))
                return devm_krealloc(dev, p, total, flags);

        return NULL;
}

/*
 * Further managed-memory helpers, all tied to @dev.
 * NOTE(review): devm_kstrdup_const() returns a const pointer, presumably
 * because it may hand back @s itself rather than a copy - confirm in the
 * implementation.
 */
void devm_kfree(struct device *dev, const void *p);
char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc;
const char *devm_kstrdup_const(struct device *dev, const char *s, gfp_t gfp);
/* The size of the returned object is given by argument 3 (@len). */
void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp)
        __realloc_size(3);

/*
 * Managed page allocation: devm_get_free_pages() returns the address as an
 * unsigned long, which is what devm_free_pages() takes back.
 */
unsigned long devm_get_free_pages(struct device *dev,
                                  gfp_t gfp_mask, unsigned int order);
void devm_free_pages(struct device *dev, unsigned long addr);

/*
 * Managed I/O-memory mapping helpers.  With CONFIG_HAS_IOMEM they are real
 * functions implemented elsewhere; without it, the inline stubs below keep
 * callers compiling and always fail with ERR_PTR(-EINVAL), ignoring all of
 * their arguments.
 */
#ifdef CONFIG_HAS_IOMEM
void __iomem *devm_ioremap_resource(struct device *dev,
                                    const struct resource *res);
void __iomem *devm_ioremap_resource_wc(struct device *dev,
                                       const struct resource *res);

void __iomem *devm_of_iomap(struct device *dev,
                            struct device_node *node, int index,
                            resource_size_t *size);
#else

/* Stub: always returns ERR_PTR(-EINVAL). */
static inline
void __iomem *devm_ioremap_resource(struct device *dev,
                                    const struct resource *res)
{
        return ERR_PTR(-EINVAL);
}

/* Stub: always returns ERR_PTR(-EINVAL). */
static inline
void __iomem *devm_ioremap_resource_wc(struct device *dev,
                                       const struct resource *res)
{
        return ERR_PTR(-EINVAL);
}

/* Stub: always returns ERR_PTR(-EINVAL); @size is not written. */
static inline
void __iomem *devm_of_iomap(struct device *dev,
                            struct device_node *node, int index,
                            resource_size_t *size)
{
        return ERR_PTR(-EINVAL);
}

#endif

/* Add a custom action to, or remove one from, the devres stack. */
void devm_remove_action(struct device *dev, void (*action)(void *), void *data);
void devm_release_action(struct device *dev, void (*action)(void *), void *data);

int __devm_add_action(struct device *dev, void (*action)(void *), void *data, const char *name);
/* Passes the stringified @action expression as the @name argument. */
#define devm_add_action(dev, action, data) \
        __devm_add_action(dev, action, data, #action)

/*
 * Register @action through __devm_add_action(). If registration fails,
 * @action(@data) is invoked immediately and the error code is returned;
 * on success the function returns 0.
 */
static inline int __devm_add_action_or_reset(struct device *dev, void (*action)(void *),
                                             void *data, const char *name)
{
        int err = __devm_add_action(dev, action, data, name);

        if (!err)
                return 0;

        /* Registration failed: run the action now. */
        action(data);
        return err;
}
/* Same as above, using the stringified @action expression as the name. */
#define devm_add_action_or_reset(dev, action, data) \
        __devm_add_action_or_reset(dev, action, data, #action)

/**
 * devm_alloc_percpu - Resource-managed alloc_percpu
 * @dev: Device to allocate per-cpu memory for
 * @type: Type to allocate per-cpu memory for
 *
 * Managed alloc_percpu. Per-cpu memory allocated with this function is
 * automatically freed on driver detach.
 *
 * The macro forwards sizeof(@type) and __alignof__(@type) to
 * __devm_alloc_percpu() and casts the result to a __percpu pointer to @type.
 *
 * RETURNS:
 * Pointer to allocated memory on success, NULL on failure.
 */
#define devm_alloc_percpu(dev, type)      \
        ((typeof(type) __percpu *)__devm_alloc_percpu((dev), sizeof(type), \
                                                      __alignof__(type)))

/* Out-of-line backend that devm_alloc_percpu() expands to, and the free prototype. */
void __percpu *__devm_alloc_percpu(struct device *dev, size_t size,
                                   size_t align);
void devm_free_percpu(struct device *dev, void __percpu *pdata);

/* Per-device DMA parameters, referenced from struct device via ->dma_parms. */
struct device_dma_parameters {
        /*
         * A low-level driver may set these to teach IOMMU code about
         * scatter-gather (sg) limitations.
         */
        unsigned int max_segment_size;
        unsigned int min_align_mask;
        unsigned long segment_boundary_mask;
};

/**
 * enum device_link_state - Device link states.
 * @DL_STATE_NONE: The presence of the drivers is not being tracked.
 * @DL_STATE_DORMANT: None of the supplier/consumer drivers is present.
 * @DL_STATE_AVAILABLE: The supplier driver is present, but the consumer is not.
 * @DL_STATE_CONSUMER_PROBE: The consumer is probing (supplier driver present).
 * @DL_STATE_ACTIVE: Both the supplier and consumer drivers are present.
 * @DL_STATE_SUPPLIER_UNBIND: The supplier driver is unbinding.
 *
 * DL_STATE_NONE is -1; the remaining states count up from DL_STATE_DORMANT,
 * which is 0.
 */
enum device_link_state {
        DL_STATE_NONE = -1,
        DL_STATE_DORMANT = 0,
        DL_STATE_AVAILABLE,
        DL_STATE_CONSUMER_PROBE,
        DL_STATE_ACTIVE,
        DL_STATE_SUPPLIER_UNBIND,
};

/*
 * Device link flags.
 *
 * STATELESS: The core will not remove this link automatically.
 * AUTOREMOVE_CONSUMER: Remove the link automatically on consumer driver unbind.
 * PM_RUNTIME: If set, the runtime PM framework will use this link.
 * RPM_ACTIVE: Run pm_runtime_get_sync() on the supplier during link creation.
 * AUTOREMOVE_SUPPLIER: Remove the link automatically on supplier driver unbind.
 * AUTOPROBE_CONSUMER: Probe consumer driver automatically after supplier binds.
 * MANAGED: The core tracks presence of supplier/consumer drivers (internal).
 * SYNC_STATE_ONLY: Link only affects sync_state() behavior.
 * INFERRED: Inferred from data (eg: firmware) and not from driver actions.
 * CYCLE: NOTE(review): not described in this header; presumably marks a link
 *        that is part of a dependency cycle - confirm against the driver core.
 *
 * Each flag occupies its own bit, BIT(0) through BIT(9).
 */
#define DL_FLAG_STATELESS                BIT(0)
#define DL_FLAG_AUTOREMOVE_CONSUMER        BIT(1)
#define DL_FLAG_PM_RUNTIME                BIT(2)
#define DL_FLAG_RPM_ACTIVE                BIT(3)
#define DL_FLAG_AUTOREMOVE_SUPPLIER        BIT(4)
#define DL_FLAG_AUTOPROBE_CONSUMER        BIT(5)
#define DL_FLAG_MANAGED                        BIT(6)
#define DL_FLAG_SYNC_STATE_ONLY                BIT(7)
#define DL_FLAG_INFERRED                BIT(8)
#define DL_FLAG_CYCLE                        BIT(9)

/**
 * enum dl_dev_state - Device driver presence tracking information.
 * @DL_DEV_NO_DRIVER: There is no driver attached to the device.
 * @DL_DEV_PROBING: A driver is probing.
 * @DL_DEV_DRIVER_BOUND: The driver has been bound to the device.
 * @DL_DEV_UNBINDING: The driver is unbinding from the device.
 *
 * DL_DEV_NO_DRIVER is explicitly 0; the other states follow in declaration
 * order.
 */
enum dl_dev_state {
        DL_DEV_NO_DRIVER = 0,
        DL_DEV_PROBING,
        DL_DEV_DRIVER_BOUND,
        DL_DEV_UNBINDING,
};

/**
 * enum device_removable - Whether the device is removable. The criteria for a
 * device to be classified as removable are determined by its subsystem or bus.
 * @DEVICE_REMOVABLE_NOT_SUPPORTED: This attribute is not supported for this
 *                                    device (default).
 * @DEVICE_REMOVABLE_UNKNOWN: Device location is unknown.
 * @DEVICE_FIXED: Device is not removable by the user.
 * @DEVICE_REMOVABLE: Device is removable by the user.
 */
enum device_removable {
        DEVICE_REMOVABLE_NOT_SUPPORTED = 0, /* must be 0 */
        DEVICE_REMOVABLE_UNKNOWN,
        DEVICE_FIXED,
        DEVICE_REMOVABLE,
};

/**
 * struct dev_links_info - Device data related to device links.
 * @suppliers: List of links to supplier devices.
 * @consumers: List of links to consumer devices.
 * @defer_sync: Hook to global list of devices that have deferred sync_state.
 * @status: Driver status information, as an &enum dl_dev_state value.
 */
struct dev_links_info {
        struct list_head suppliers;
        struct list_head consumers;
        struct list_head defer_sync;
        enum dl_dev_state status;
};

/**
 * struct dev_msi_info - Device data related to MSI
 * @domain:        The MSI interrupt domain associated to the device
 * @data:        Pointer to MSI device data
 *
 * Both members exist only when CONFIG_GENERIC_MSI_IRQ is set; otherwise the
 * structure has no members.
 */
struct dev_msi_info {
#ifdef CONFIG_GENERIC_MSI_IRQ
        struct irq_domain        *domain;
        struct msi_device_data        *data;
#endif /* CONFIG_GENERIC_MSI_IRQ */
};

/**
 * enum device_physical_location_panel - Describes which panel surface of the
 * system's housing the device connection point resides on.
 * @DEVICE_PANEL_TOP: Device connection point is on the top panel.
 * @DEVICE_PANEL_BOTTOM: Device connection point is on the bottom panel.
 * @DEVICE_PANEL_LEFT: Device connection point is on the left panel.
 * @DEVICE_PANEL_RIGHT: Device connection point is on the right panel.
 * @DEVICE_PANEL_FRONT: Device connection point is on the front panel.
 * @DEVICE_PANEL_BACK: Device connection point is on the back panel.
 * @DEVICE_PANEL_UNKNOWN: The panel with device connection point is unknown.
 *
 * No explicit values are assigned, so numbering starts at 0 with
 * DEVICE_PANEL_TOP and follows declaration order.
 */
enum device_physical_location_panel {
        DEVICE_PANEL_TOP,
        DEVICE_PANEL_BOTTOM,
        DEVICE_PANEL_LEFT,
        DEVICE_PANEL_RIGHT,
        DEVICE_PANEL_FRONT,
        DEVICE_PANEL_BACK,
        DEVICE_PANEL_UNKNOWN,
};

/**
 * enum device_physical_location_vertical_position - Describes vertical
 * position of the device connection point on the panel surface.
 * @DEVICE_VERT_POS_UPPER: Device connection point is at upper part of panel.
 * @DEVICE_VERT_POS_CENTER: Device connection point is at center part of panel.
 * @DEVICE_VERT_POS_LOWER: Device connection point is at lower part of panel.
 *
 * No explicit values are assigned; numbering starts at 0 in declaration order.
 */
enum device_physical_location_vertical_position {
        DEVICE_VERT_POS_UPPER,
        DEVICE_VERT_POS_CENTER,
        DEVICE_VERT_POS_LOWER,
};

/**
 * enum device_physical_location_horizontal_position - Describes horizontal
 * position of the device connection point on the panel surface.
 * @DEVICE_HORI_POS_LEFT: Device connection point is at left part of panel.
 * @DEVICE_HORI_POS_CENTER: Device connection point is at center part of panel.
 * @DEVICE_HORI_POS_RIGHT: Device connection point is at right part of panel.
 *
 * No explicit values are assigned; numbering starts at 0 in declaration order.
 */
enum device_physical_location_horizontal_position {
        DEVICE_HORI_POS_LEFT,
        DEVICE_HORI_POS_CENTER,
        DEVICE_HORI_POS_RIGHT,
};

/**
 * struct device_physical_location - Device data related to physical location
 * of the device connection point.
 * @panel: Panel surface of the system's housing that the device connection
 *         point resides on.
 * @vertical_position: Vertical position of the device connection point within
 *                     the panel.
 * @horizontal_position: Horizontal position of the device connection point
 *                       within the panel.
 * @dock: Set if the device connection point resides in a docking station or
 *        port replicator.
 * @lid: Set if this device connection point resides on the lid of a laptop
 *       system.
 */
struct device_physical_location {
        enum device_physical_location_panel panel;
        enum device_physical_location_vertical_position vertical_position;
        enum device_physical_location_horizontal_position horizontal_position;
        bool dock;
        bool lid;
};

/**
 * struct device - The basic device structure
 * @parent:        The device's "parent" device, the device to which it is attached.
 *                 In most cases, a parent device is some sort of bus or host
 *                 controller. If parent is NULL, the device is a top-level device,
 *                 which is not usually what you want.
 * @p:                Holds the private data of the driver core portions of the device.
 *                 See the comment of the struct device_private for detail.
 * @kobj:        A top-level, abstract class from which other classes are derived.
 * @init_name:        Initial name of the device.
 * @type:        The type of device.
 *                 This identifies the device type and carries type-specific
 *                 information.
 * @mutex:        Mutex to synchronize calls to its driver.
 * @bus:        Type of bus device is on.
 * @driver:        Which driver has allocated this
 * @platform_data: Platform data specific to the device.
 *                 Example: For devices on custom boards, as typical of embedded
 *                 and SOC based hardware, Linux often uses platform_data to point
 *                 to board-specific structures describing devices and how they
 *                 are wired.  That can include what ports are available, chip
 *                 variants, which GPIO pins act in what additional roles, and so
 *                 on.  This shrinks the "Board Support Packages" (BSPs) and
 *                 minimizes board-specific #ifdefs in drivers.
 * @driver_data: Private pointer for driver specific info.
 * @links:        Links to suppliers and consumers of this device.
 * @power:        For device power management.
 *                See Documentation/driver-api/pm/devices.rst for details.
 * @pm_domain:        Provide callbacks that are executed during system suspend,
 *                 hibernation, system resume and during runtime PM transitions
 *                 along with subsystem-level and driver-level callbacks.
 * @em_pd:        device's energy model performance domain
 * @pins:        For device pin management.
 *                See Documentation/driver-api/pin-control.rst for details.
 * @msi:        MSI related data
 * @numa_node:        NUMA node this device is close to.
 * @dma_ops:    DMA mapping operations for this device.
 * @dma_mask:        Dma mask (if dma'ble device).
 * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all
 *                 hardware supports 64-bit addresses for consistent allocations
 *                 such as descriptors.
 * @bus_dma_limit: Limit of an upstream bridge or bus which imposes a smaller
 *                DMA limit than the device itself supports.
 * @dma_range_map: map for DMA memory ranges relative to that of RAM
 * @dma_parms:        A low level driver may set these to teach IOMMU code about
 *                 segment limitations.
 * @dma_pools:        Dma pools (if dma'ble device).
 * @dma_mem:        Internal for coherent mem override.
 * @cma_area:        Contiguous memory area for dma allocations
 * @dma_io_tlb_mem: Software IO TLB allocator.  Not for driver use.
 * @dma_io_tlb_pools:        List of transient swiotlb memory pools.
 * @dma_io_tlb_lock:        Protects changes to the list of active pools.
 * @dma_uses_io_tlb: %true if device has used the software IO TLB.
 * @archdata:        For arch-specific additions.
 * @of_node:        Associated device tree node.
 * @fwnode:        Associated device node supplied by platform firmware.
 * @devt:        For creating the sysfs "dev".
 * @id:                device instance
 * @devres_lock: Spinlock to protect the resource of the device.
 * @devres_head: The resources list of the device.
 * @class:        The class of the device.
 * @groups:        Optional attribute groups.
 * @release:        Callback to free the device after all references have
 *                 gone away. This should be set by the allocator of the
 *                 device (i.e. the bus driver that discovered the device).
 * @iommu_group: IOMMU group the device belongs to.
 * @iommu:        Per device generic IOMMU runtime data
 * @physical_location: Describes physical location of the device connection
 *                point in the system housing.
 * @removable:  Whether the device can be removed from the system. This
 *              should be set by the subsystem / bus driver that discovered
 *              the device.
 *
 * @offline_disabled: If set, the device is permanently online.
 * @offline:        Set after successful invocation of bus type's .offline().
 * @of_node_reused: Set if the device-tree node is shared with an ancestor
 *              device.
 * @state_synced: The hardware state of this device has been synced to match
 *                  the software state of this device by calling the driver/bus
 *                  sync_state() callback.
 * @can_match:        The device has matched with a driver at least once or it is in
 *                a bus (like AMBA) which can't check for matching drivers until
 *                other devices probe successfully.
 * @dma_coherent: this particular device is dma coherent, even if the
 *                architecture supports non-coherent devices.
 * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the
 *                streaming DMA operations (->map_* / ->unmap_* / ->sync_*),
 *                and optionally (if the coherent mask is large enough) also
 *                for dma allocations.  This flag is managed by the dma ops
 *                instance from ->dma_supported.
 * @dma_skip_sync: DMA sync operations can be skipped for coherent buffers.
 *
 * At the lowest level, every device in a Linux system is represented by an
 * instance of struct device. The device structure contains the information
 * that the device model core needs to model the system. Most subsystems,
 * however, track additional information about the devices they host. As a
 * result, it is rare for devices to be represented by bare device structures;
 * instead, that structure, like kobject structures, is usually embedded within
 * a higher-level representation of the device.
 */
struct device {
        struct kobject kobj;
        struct device                *parent;

        struct device_private        *p;

        const char                *init_name; /* initial name of the device */
        const struct device_type *type;

        const struct bus_type        *bus;        /* type of bus device is on */
        struct device_driver *driver;        /* which driver has allocated this
                                           device */
        void                *platform_data;        /* Platform specific data, device
                                           core doesn't touch it */
        void                *driver_data;        /* Driver data, set and get with
                                           dev_set_drvdata/dev_get_drvdata */
        struct mutex                mutex;        /* mutex to synchronize calls to
                                         * its driver.
                                         */

        struct dev_links_info        links;
        struct dev_pm_info        power;
        struct dev_pm_domain        *pm_domain;

#ifdef CONFIG_ENERGY_MODEL
        struct em_perf_domain        *em_pd;
#endif /* CONFIG_ENERGY_MODEL */

#ifdef CONFIG_PINCTRL
        struct dev_pin_info        *pins;
#endif /* CONFIG_PINCTRL */
        struct dev_msi_info        msi;
#ifdef CONFIG_DMA_OPS
        const struct dma_map_ops *dma_ops;
#endif /* CONFIG_DMA_OPS */
        u64                *dma_mask;        /* dma mask (if dma'able device) */
        u64                coherent_dma_mask;/* Like dma_mask, but for
                                             alloc_coherent mappings as
                                             not all hardware supports
                                             64 bit addresses for consistent
                                             allocations such as descriptors. */
        u64                bus_dma_limit;        /* upstream dma constraint */
        const struct bus_dma_region *dma_range_map;

        struct device_dma_parameters *dma_parms;

        struct list_head        dma_pools;        /* dma pools (if dma'ble) */

#ifdef CONFIG_DMA_DECLARE_COHERENT
        struct dma_coherent_mem        *dma_mem; /* internal for coherent mem
                                             override */
#endif /* CONFIG_DMA_DECLARE_COHERENT */
#ifdef CONFIG_DMA_CMA
        struct cma *cma_area;                /* contiguous memory area for dma
                                           allocations */
#endif /* CONFIG_DMA_CMA */
#ifdef CONFIG_SWIOTLB
        struct io_tlb_mem *dma_io_tlb_mem;
#endif /* CONFIG_SWIOTLB */
#ifdef CONFIG_SWIOTLB_DYNAMIC
        struct list_head dma_io_tlb_pools;
        spinlock_t dma_io_tlb_lock;
        bool dma_uses_io_tlb;
#endif /* CONFIG_SWIOTLB_DYNAMIC */
        /* arch specific additions */
        struct dev_archdata        archdata;

        struct device_node        *of_node; /* associated device tree node */
        struct fwnode_handle        *fwnode; /* firmware device node */

#ifdef CONFIG_NUMA
        int                numa_node;        /* NUMA node this device is close to */
#endif /* CONFIG_NUMA */
        dev_t                        devt;        /* dev_t, creates the sysfs "dev" */
        u32                        id;        /* device instance */

        spinlock_t                devres_lock;
        struct list_head        devres_head;

        const struct class        *class;
        const struct attribute_group **groups;        /* optional groups */

        void        (*release)(struct device *dev);
        struct iommu_group        *iommu_group;
        struct dev_iommu        *iommu;

        struct device_physical_location *physical_location;

        enum device_removable        removable;

        /* Single-bit flags; some are only present under the configs below. */
        bool                        offline_disabled:1;
        bool                        offline:1;
        bool                        of_node_reused:1;
        bool                        state_synced:1;
        bool                        can_match:1;
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
        bool                        dma_coherent:1;
#endif /* CONFIG_ARCH_HAS_SYNC_DMA_FOR_{DEVICE,CPU,CPU_ALL} */
#ifdef CONFIG_DMA_OPS_BYPASS
        bool                        dma_ops_bypass : 1;
#endif /* CONFIG_DMA_OPS_BYPASS */
#ifdef CONFIG_DMA_NEED_SYNC
        bool                        dma_skip_sync:1;
#endif /* CONFIG_DMA_NEED_SYNC */
};

/**
 * struct device_link - Device link representation.
 * @supplier: The device on the supplier end of the link.
 * @s_node: Hook to the supplier device's list of links to consumers.
 * @consumer: The device on the consumer end of the link.
 * @c_node: Hook to the consumer device's list of links to suppliers.
 * @link_dev: Device used to expose link details in sysfs (embedded by value,
 *            not a pointer).
 * @status: The state of the link (with respect to the presence of drivers).
 * @flags: Link flags.
 * @rpm_active: Whether or not the consumer device is runtime-PM-active.
 * @kref: Count repeated addition of the same link.
 * @rm_work: Work structure used for removing the link.
 * @supplier_preactivated: Supplier has been made active before consumer probe.
 */
struct device_link {
        struct device *supplier;
        struct list_head s_node;
        struct device *consumer;
        struct list_head c_node;
        struct device link_dev;
        enum device_link_state status;
        u32 flags;
        refcount_t rpm_active;
        struct kref kref;
        struct work_struct rm_work;
        bool supplier_preactivated; /* Owned by consumer probe. */
};

/* Map a pointer to the embedded ->kobj back to its containing struct device. */
#define kobj_to_dev(__kobj)        container_of_const(__kobj, struct device, kobj)

/**
 * device_iommu_mapped - Returns true when the device DMA is translated
 *                         by an IOMMU
 * @dev: Device to perform the check on
 *
 * The check performed here is whether @dev->iommu_group is non-NULL.
 */
static inline bool device_iommu_mapped(struct device *dev)
{
        return dev->iommu_group ? true : false;
}

/* Get the wakeup routines, which depend on struct device */
#include <linux/pm_wakeup.h>

/**
 * dev_name - Return a device's name.
 * @dev: Device with name to get.
 *
 * While @dev->init_name is set it takes precedence; otherwise the name is
 * taken from the embedded kobject.
 *
 * Return: The kobject name of the device, or its initial name if unavailable.
 */
static inline const char *dev_name(const struct device *dev)
{
        const char *early_name = dev->init_name;

        /* Use the init name until the kobject becomes available */
        return early_name ? early_name : kobject_name(&dev->kobj);
}

/**
 * dev_bus_name - Return a device's bus/class name, if at all possible
 * @dev: struct device to get the bus/class name of
 *
 * The bus name is preferred; the class name is used only when the device has
 * no bus. A device with neither yields an empty string, never NULL.
 */
static inline const char *dev_bus_name(const struct device *dev)
{
        if (dev->bus)
                return dev->bus->name;

        if (dev->class)
                return dev->class->name;

        return "";
}

/* printf-style: @name is the format string (argument 2), varargs start at 3. */
__printf(2, 3) int dev_set_name(struct device *dev, const char *name, ...);

/*
 * NUMA node accessors. With CONFIG_NUMA the node id is kept in
 * dev->numa_node; without it the getter always reports NUMA_NO_NODE and the
 * setter does nothing.
 */
#ifdef CONFIG_NUMA
static inline int dev_to_node(struct device *dev)
{
        return dev->numa_node;
}
static inline void set_dev_node(struct device *dev, int node)
{
        dev->numa_node = node;
}
#else
static inline int dev_to_node(struct device *dev)
{
        return NUMA_NO_NODE;
}
static inline void set_dev_node(struct device *dev, int node)
{
}
#endif /* CONFIG_NUMA */

/*
 * Return the MSI irq domain stored in @dev->msi, or NULL when the kernel is
 * built without CONFIG_GENERIC_MSI_IRQ (the field does not exist then).
 */
static inline struct irq_domain *dev_get_msi_domain(const struct device *dev)
{
        struct irq_domain *domain = NULL;

#ifdef CONFIG_GENERIC_MSI_IRQ
        domain = dev->msi.domain;
#endif
        return domain;
}

/*
 * Store @d as the MSI irq domain of @dev. Without CONFIG_GENERIC_MSI_IRQ the
 * body is empty and the call has no effect.
 */
static inline void dev_set_msi_domain(struct device *dev, struct irq_domain *d)
{
#ifdef CONFIG_GENERIC_MSI_IRQ
        dev->msi.domain = d;
#endif
}

/* Return the driver-private pointer stored in @dev->driver_data. */
static inline void *dev_get_drvdata(const struct device *dev)
{
        return dev->driver_data;
}

/* Store @data in @dev->driver_data; no other state is modified here. */
static inline void dev_set_drvdata(struct device *dev, void *data)
{
        dev->driver_data = data;
}

/* Return @dev->power.subsys_data, tolerating a NULL @dev (yields NULL). */
static inline struct pm_subsys_data *dev_to_psd(struct device *dev)
{
        if (!dev)
                return NULL;

        return dev->power.subsys_data;
}

/* Read the uevent_suppress flag of the device's embedded kobject. */
static inline unsigned int dev_get_uevent_suppress(const struct device *dev)
{
        return dev->kobj.uevent_suppress;
}

/* Write @val to the uevent_suppress flag of the device's embedded kobject. */
static inline void dev_set_uevent_suppress(struct device *dev, int val)
{
        dev->kobj.uevent_suppress = val;
}

/* Report the state_in_sysfs flag of the device's embedded kobject. */
static inline int device_is_registered(struct device *dev)
{
        return dev->kobj.state_in_sysfs;
}

/*
 * Set @dev->power.async_suspend. The flag is left untouched while
 * power.is_prepared is set.
 */
static inline void device_enable_async_suspend(struct device *dev)
{
        if (dev->power.is_prepared)
                return;

        dev->power.async_suspend = true;
}

/*
 * Clear @dev->power.async_suspend. The flag is left untouched while
 * power.is_prepared is set.
 */
static inline void device_disable_async_suspend(struct device *dev)
{
        if (dev->power.is_prepared)
                return;

        dev->power.async_suspend = false;
}

/* Report whether @dev->power.async_suspend is currently set. */
static inline bool device_async_suspend_enabled(struct device *dev)
{
        return dev->power.async_suspend ? true : false;
}

/* Report the power.no_pm flag of @dev. */
static inline bool device_pm_not_required(struct device *dev)
{
        return dev->power.no_pm;
}

/* Set the power.no_pm flag of @dev; this header offers no way to clear it. */
static inline void device_set_pm_not_required(struct device *dev)
{
        dev->power.no_pm = true;
}

/*
 * Store @val in @dev->power.syscore. Without CONFIG_PM_SLEEP the body is
 * empty and the call has no effect.
 */
static inline void dev_pm_syscore_device(struct device *dev, bool val)
{
#ifdef CONFIG_PM_SLEEP
        dev->power.syscore = val;
#endif
}

/* Overwrite (not OR into) @dev->power.driver_flags with @flags. */
static inline void dev_pm_set_driver_flags(struct device *dev, u32 flags)
{
        dev->power.driver_flags = flags;
}

/* True when at least one bit of @flags is set in @dev->power.driver_flags. */
static inline bool dev_pm_test_driver_flags(struct device *dev, u32 flags)
{
        return (dev->power.driver_flags & flags) != 0;
}

/*
 * Thin wrappers around the mutex API operating on @dev->mutex.
 */

/* Acquire @dev->mutex via mutex_lock(). */
static inline void device_lock(struct device *dev)
{
        mutex_lock(&dev->mutex);
}

/* Acquire @dev->mutex; returns whatever mutex_lock_interruptible() returns. */
static inline int device_lock_interruptible(struct device *dev)
{
        return mutex_lock_interruptible(&dev->mutex);
}

/* Try to acquire @dev->mutex; returns whatever mutex_trylock() returns. */
static inline int device_trylock(struct device *dev)
{
        return mutex_trylock(&dev->mutex);
}

/* Release @dev->mutex via mutex_unlock(). */
static inline void device_unlock(struct device *dev)
{
        mutex_unlock(&dev->mutex);
}

/*
 * Declares a "device" guard for struct device pointers, built from
 * device_lock(_T) and device_unlock(_T).
 */
DEFINE_GUARD(device, struct device *, device_lock(_T), device_unlock(_T))

/* Lockdep assertion that @dev->mutex is held. */
static inline void device_lock_assert(struct device *dev)
{
        lockdep_assert_held(&dev->mutex);
}

/*
 * Return @dev->of_node, or NULL when CONFIG_OF is disabled or @dev itself
 * is NULL.
 */
static inline struct device_node *dev_of_node(struct device *dev)
{
        if (IS_ENABLED(CONFIG_OF) && dev)
                return dev->of_node;

        return NULL;
}

/*
 * True when either the bound driver or the bus of @dev provides a
 * sync_state() callback. A NULL @dev yields false.
 */
static inline bool dev_has_sync_state(struct device *dev)
{
        bool via_driver, via_bus;

        if (!dev)
                return false;

        via_driver = dev->driver && dev->driver->sync_state;
        via_bus = dev->bus && dev->bus->sync_state;

        return via_driver || via_bus;
}

/* Store @removable in @dev->removable. */
static inline void dev_set_removable(struct device *dev,
                                     enum device_removable removable)
{
        dev->removable = removable;
}

/* True only for DEVICE_REMOVABLE; unknown, fixed and unsupported are false. */
static inline bool dev_is_removable(struct device *dev)
{
        return dev->removable == DEVICE_REMOVABLE;
}

/* True for every state except DEVICE_REMOVABLE_NOT_SUPPORTED. */
static inline bool dev_removable_is_valid(struct device *dev)
{
        return dev->removable != DEVICE_REMOVABLE_NOT_SUPPORTED;
}

/*
 * High level routines for use by the bus drivers
 */
int __must_check device_register(struct device *dev);
void device_unregister(struct device *dev);
void device_initialize(struct device *dev);
int __must_check device_add(struct device *dev);
void device_del(struct device *dev);

/* Cleanup helper "device_del": calls device_del() only for a non-NULL pointer. */
DEFINE_FREE(device_del, struct device *, if (_T) device_del(_T))

/* Child iteration/lookup helpers taking a callback and an opaque @data pointer. */
int device_for_each_child(struct device *dev, void *data,
                          int (*fn)(struct device *dev, void *data));
int device_for_each_child_reverse(struct device *dev, void *data,
                                  int (*fn)(struct device *dev, void *data));
struct device *device_find_child(struct device *dev, void *data,
                                 int (*match)(struct device *dev, void *data));
struct device *device_find_child_by_name(struct device *parent,
                                         const char *name);
struct device *device_find_any_child(struct device *parent);

/* Rename, re-parent and ownership-change entry points (prototypes only). */
int device_rename(struct device *dev, const char *new_name);
int device_move(struct device *dev, struct device *new_parent,
                enum dpm_order dpm_order);
int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid);

static inline bool device_supports_offline(struct device *dev)
{
        return dev->bus && dev->bus->offline && dev->bus->online;
}

/*
 * Helper for device_lock_set_class(): evaluates @dev once into a local and
 * passes the dep_map of its mutex to lock_set_class() together with @name
 * and @key.
 */
#define __device_lock_set_class(dev, name, key)                        \
do {                                                                   \
        struct device *__d2 __maybe_unused = dev;                      \
        lock_set_class(&__d2->mutex.dep_map, name, key, 0, _THIS_IP_); \
} while (0)

/**
 * device_lock_set_class - Specify a temporary lock class while a device
 *                           is attached to a driver
 * @dev: device to modify
 * @key: lock class key data
 *
 * This must be called with the device_lock() already held, for example
 * from driver ->probe(). Take care to only override the default
 * lockdep_no_validate class.
 *
 * With CONFIG_LOCKDEP the macro additionally warns once, through
 * dev_WARN_ONCE(), when the mutex does not currently match
 * __lockdep_no_validate__. The stringified @key is used as the class name.
 */
#ifdef CONFIG_LOCKDEP
#define device_lock_set_class(dev, key)                                    \
do {                                                                       \
        struct device *__d = dev;                                          \
        dev_WARN_ONCE(__d, !lockdep_match_class(&__d->mutex,               \
                                                &__lockdep_no_validate__), \
                 "overriding existing custom lock class\n");               \
        __device_lock_set_class(__d, #key, key);                           \
} while (0)
#else
#define device_lock_set_class(dev, key) __device_lock_set_class(dev, #key, key)
#endif /* CONFIG_LOCKDEP */

/**
 * device_lock_reset_class - Return a device to the default lockdep novalidate state
 * @dev: device to modify
 *
 * This must be called with the device_lock() already held, for example
 * from driver ->remove().
 *
 * Expands to lock_set_novalidate_class() on the dep_map of @dev's mutex,
 * using "&dev->mutex" as the class name.
 */
#define device_lock_reset_class(dev) \
do { \
        struct device *__d __maybe_unused = dev;                       \
        lock_set_novalidate_class(&__d->mutex.dep_map, "&dev->mutex",  \
                                  _THIS_IP_);                          \
} while (0)

/* Device hotplug locking and online/offline entry points (prototypes only). */
void lock_device_hotplug(void);
void unlock_device_hotplug(void);
int lock_device_hotplug_sysfs(void);
int device_offline(struct device *dev);
int device_online(struct device *dev);
/* Firmware-node / OF-node assignment helpers (prototypes only). */
void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
void device_set_of_node_from_dev(struct device *dev, const struct device *dev2);
void device_set_node(struct device *dev, struct fwnode_handle *fwnode);

static inline int dev_num_vf(struct device *dev)
{
        if (dev->bus && dev->bus->num_vf)
                return dev->bus->num_vf(dev);
        return 0;
}

/*
 * Root device objects for grouping under /sys/devices
 */
struct device *__root_device_register(const char *name, struct module *owner);

/*
 * This is a macro to avoid include problems with THIS_MODULE; it passes the
 * calling module as @owner.
 */
#define root_device_register(name) \
        __root_device_register(name, THIS_MODULE)

void root_device_unregister(struct device *root);

/* Return the @dev->platform_data pointer unchanged. */
static inline void *dev_get_platdata(const struct device *dev)
{
        return dev->platform_data;
}

/*
 * Manual binding of a device to driver. See drivers/base/bus.c
 * for information on use.
 *
 * The int-returning functions are marked __must_check, so callers have to
 * consume their result.
 */
int __must_check device_driver_attach(struct device_driver *drv,
                                      struct device *dev);
int __must_check device_bind_driver(struct device *dev);
void device_release_driver(struct device *dev);
int  __must_check device_attach(struct device *dev);
int __must_check driver_attach(struct device_driver *drv);
void device_initial_probe(struct device *dev);
int __must_check device_reprobe(struct device *dev);

bool device_is_bound(struct device *dev);

/*
 * Easy functions for dynamically creating devices on the fly
 *
 * Both creators are printf-style: @fmt is the format string and the varargs
 * follow it (see the __printf() annotations).
 */
__printf(5, 6) struct device *
device_create(const struct class *cls, struct device *parent, dev_t devt,
              void *drvdata, const char *fmt, ...);
__printf(6, 7) struct device *
device_create_with_groups(const struct class *cls, struct device *parent, dev_t devt,
                          void *drvdata, const struct attribute_group **groups,
                          const char *fmt, ...);
void device_destroy(const struct class *cls, dev_t devt);

/* Attribute group add/remove on arrays of groups (prototypes only). */
int __must_check device_add_groups(struct device *dev,
                                   const struct attribute_group **groups);
void device_remove_groups(struct device *dev,
                          const struct attribute_group **groups);

/*
 * Add a single attribute group to @dev by wrapping @grp in a NULL-terminated
 * one-element array and handing it to device_add_groups().
 */
static inline int __must_check device_add_group(struct device *dev,
                                        const struct attribute_group *grp)
{
        const struct attribute_group *single[2];

        single[0] = grp;
        single[1] = NULL;        /* list terminator */

        return device_add_groups(dev, single);
}

/*
 * Remove a single attribute group from @dev by wrapping @grp in a
 * NULL-terminated one-element array and handing it to
 * device_remove_groups().
 */
static inline void device_remove_group(struct device *dev,
                                       const struct attribute_group *grp)
{
        const struct attribute_group *groups[] = { grp, NULL };

        /*
         * Plain call, no "return": ISO C forbids a return statement with an
         * expression in a function whose return type is void.
         */
        device_remove_groups(dev, groups);
}

int __must_check devm_device_add_group(struct device *dev,
                                       const struct attribute_group *grp);

/*
 * get_device - atomically increment the reference count for the device.
 *
 */
struct device *get_device(struct device *dev);
void put_device(struct device *dev);

DEFINE_FREE(put_device, struct device *, if (_T) put_device(_T))

bool kill_device(struct device *dev);

#ifdef CONFIG_DEVTMPFS
int devtmpfs_mount(void);
#else
/* Without CONFIG_DEVTMPFS this is an inline stub that always returns 0. */
static inline int devtmpfs_mount(void) { return 0; }
#endif

/* drivers/base/power/shutdown.c */
void device_shutdown(void);

/* debugging and troubleshooting/diagnostic helpers. */
const char *dev_driver_string(const struct device *dev);

/* Device links interface. */
struct device_link *device_link_add(struct device *consumer,
                                    struct device *supplier, u32 flags);
void device_link_del(struct device_link *link);
void device_link_remove(void *consumer, struct device *supplier);
void device_links_supplier_sync_state_pause(void);
void device_links_supplier_sync_state_resume(void);
void device_link_wait_removal(void);

/*
 * Create alias, so I can be autoloaded.
 *
 * MODULE_ALIAS_CHARDEV(major, minor) expands to a MODULE_ALIAS() of the
 * string "char-major-<major>-<minor>"; MODULE_ALIAS_CHARDEV_MAJOR(major)
 * uses "char-major-<major>-*" instead.  The numbers are turned into text
 * with __stringify().
 */
#define MODULE_ALIAS_CHARDEV(major,minor) \
        MODULE_ALIAS("char-major-" __stringify(major) "-" __stringify(minor))
#define MODULE_ALIAS_CHARDEV_MAJOR(major) \
        MODULE_ALIAS("char-major-" __stringify(major) "-*")

#endif /* _DEVICE_H_ */














































































































































































































































































































































































































































    1 
















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FILELOCK_H
#define _LINUX_FILELOCK_H

#include <linux/fs.h>

/*
 * FL_* flag values.  Each one is a distinct power of two, so they can be
 * OR-ed together (FL_CLOSE_POSIX below is such a combination).
 */
#define FL_POSIX        1
#define FL_FLOCK        2
#define FL_DELEG        4        /* NFSv4 delegation */
#define FL_ACCESS        8        /* not trying to lock, just looking */
#define FL_EXISTS        16        /* when unlocking, test for existence */
#define FL_LEASE        32        /* lease held on this file */
#define FL_CLOSE        64        /* unlock on close */
#define FL_SLEEP        128        /* A blocking lock */
#define FL_DOWNGRADE_PENDING        256 /* Lease is being downgraded */
#define FL_UNLOCK_PENDING        512 /* Lease is being broken */
#define FL_OFDLCK        1024        /* lock is "owned" by struct file */
#define FL_LAYOUT        2048        /* outstanding pNFS layout */
#define FL_RECLAIM        4096        /* reclaiming from a rebooted server */

#define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)

/*
 * Special return value from posix_lock_file() and vfs_lock_file() for
 * asynchronous locking.
 */
#define FILE_LOCK_DEFERRED 1

struct file_lock;
struct file_lease;

/*
 * Callback table that struct file_lock refers to through its fl_ops member
 * ("Callbacks for filesystems").
 */
struct file_lock_operations {
        void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
        void (*fl_release_private)(struct file_lock *);
};

/*
 * Callback table that struct file_lock refers to through its fl_lmops
 * member ("Callbacks for lockmanagers").
 */
struct lock_manager_operations {
        void *lm_mod_owner;
        fl_owner_t (*lm_get_owner)(fl_owner_t);
        void (*lm_put_owner)(fl_owner_t);
        void (*lm_notify)(struct file_lock *);        /* unblock callback */
        int (*lm_grant)(struct file_lock *, int);
        bool (*lm_lock_expirable)(struct file_lock *cfl);
        void (*lm_expire_lock)(void);
};

/*
 * Callback table that struct file_lease refers to through its fl_lmops
 * member ("Callbacks for lease managers").
 */
struct lease_manager_operations {
        bool (*lm_break)(struct file_lease *);
        int (*lm_change)(struct file_lease *, int, struct list_head *);
        void (*lm_setup)(struct file_lease *, void **);
        bool (*lm_breaker_owns_lease)(struct file_lease *);
};

/*
 * Handle passed to locks_start_grace() and locks_end_grace(), which are
 * declared right after this structure.
 */
struct lock_manager {
        struct list_head list;
        /*
         * NFSv4 and up also want opens blocked during the grace period;
         * NLM doesn't care:
         */
        bool block_opens;
};

struct net;
void locks_start_grace(struct net *, struct lock_manager *);
void locks_end_grace(struct lock_manager *);
bool locks_in_grace(struct net *);
bool opens_in_grace(struct net *);

/*
 * struct file_lock has a union that some filesystems use to track
 * their own private info. The NFS side of things is defined here:
 */
#include <linux/nfs_fs_i.h>

/*
 * struct file_lock represents a generic "file lock". It's used to represent
 * POSIX byte range locks, BSD (flock) locks, and leases. It's important to
 * note that the same struct is used to represent both a request for a lock and
 * the lock itself, but the same object is never used for both.
 *
 * FIXME: should we create a separate "struct lock_request" to help distinguish
 * these two uses?
 *
 * The various i_flctx lists are ordered by:
 *
 * 1) lock owner
 * 2) lock range start
 * 3) lock range end
 *
 * Obviously, the last two criteria only matter for POSIX locks.
 */

/*
 * Fields shared by struct file_lock and struct file_lease; both embed one
 * of these as their first declared member, named "c".
 */
struct file_lock_core {
        struct file_lock_core *flc_blocker;        /* The lock that is blocking us */
        struct list_head flc_list;        /* link into file_lock_context */
        struct hlist_node flc_link;        /* node in global lists */
        struct list_head flc_blocked_requests;        /* list of requests with
                                                 * ->flc_blocker pointing here
                                                 */
        struct list_head flc_blocked_member;        /* node in
                                                 * ->flc_blocker->flc_blocked_requests
                                                 */
        fl_owner_t flc_owner;
        unsigned int flc_flags;
        unsigned char flc_type;                /* compared with F_UNLCK/F_RDLCK/F_WRLCK by lock_is_*() */
        pid_t flc_pid;
        int flc_link_cpu;                /* what cpu's list is this on? */
        wait_queue_head_t flc_wait;        /* woken by locks_wake_up() */
        struct file *flc_file;
};

/*
 * A file lock: the common core plus a byte range (fl_start/fl_end), two
 * callback tables and a union of filesystem-private data.
 */
struct file_lock {
        struct file_lock_core c;
        loff_t fl_start;
        loff_t fl_end;

        const struct file_lock_operations *fl_ops;        /* Callbacks for filesystems */
        const struct lock_manager_operations *fl_lmops;        /* Callbacks for lockmanagers */
        /* Per-filesystem private data; the members share storage. */
        union {
                struct nfs_lock_info        nfs_fl;
                struct nfs4_lock_info        nfs4_fl;
                struct {
                        struct list_head link;        /* link in AFS vnode's pending_locks list */
                        int state;                /* state of grant or error if -ve */
                        unsigned int        debug_id;
                } afs;
                struct {
                        struct inode *inode;
                } ceph;
        } fl_u;
} __randomize_layout;

/*
 * A lease: the common core plus lease-break bookkeeping and the lease
 * manager callback table.
 */
struct file_lease {
        struct file_lock_core c;
        struct fasync_struct *        fl_fasync; /* for lease break notifications */
        /* for lease breaks: */
        unsigned long fl_break_time;
        unsigned long fl_downgrade_time;
        const struct lease_manager_operations *fl_lmops; /* Callbacks for lease managers */
} __randomize_layout;

/*
 * Lock state reached through inode->i_flctx: a spinlock and three list
 * heads.  break_lease(), break_deleg() and break_layout() test flc_lease
 * for emptiness; by their names flc_flock and flc_posix presumably hold
 * flock and POSIX locks respectively (not shown in this header).
 */
struct file_lock_context {
        spinlock_t                flc_lock;
        struct list_head        flc_flock;
        struct list_head        flc_posix;
        struct list_head        flc_lease;
};

#ifdef CONFIG_FILE_LOCKING
int fcntl_getlk(struct file *, unsigned int, struct flock *);
int fcntl_setlk(unsigned int, struct file *, unsigned int,
                        struct flock *);

#if BITS_PER_LONG == 32
int fcntl_getlk64(struct file *, unsigned int, struct flock64 *);
int fcntl_setlk64(unsigned int, struct file *, unsigned int,
                        struct flock64 *);
#endif

int fcntl_setlease(unsigned int fd, struct file *filp, int arg);
int fcntl_getlease(struct file *filp);

/* True when the type stored in the lock's core is F_UNLCK. */
static inline bool lock_is_unlock(struct file_lock *fl)
{
        const unsigned char type = fl->c.flc_type;

        return type == F_UNLCK;
}

/* True when the type stored in the lock's core is F_RDLCK. */
static inline bool lock_is_read(struct file_lock *fl)
{
        const unsigned char type = fl->c.flc_type;

        return type == F_RDLCK;
}

/* True when the type stored in the lock's core is F_WRLCK. */
static inline bool lock_is_write(struct file_lock *fl)
{
        const unsigned char type = fl->c.flc_type;

        return type == F_WRLCK;
}

static inline void locks_wake_up(struct file_lock *fl)
{
        wake_up(&fl->c.flc_wait);
}

/* fs/locks.c */
void locks_free_lock_context(struct inode *inode);
void locks_free_lock(struct file_lock *fl);
void locks_init_lock(struct file_lock *);
struct file_lock *locks_alloc_lock(void);
void locks_copy_lock(struct file_lock *, struct file_lock *);
void locks_copy_conflock(struct file_lock *, struct file_lock *);
void locks_remove_posix(struct file *, fl_owner_t);
void locks_remove_file(struct file *);
void locks_release_private(struct file_lock *);
void posix_test_lock(struct file *, struct file_lock *);
int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
int locks_delete_block(struct file_lock *);
int vfs_test_lock(struct file *, struct file_lock *);
int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
bool vfs_inode_has_locks(struct inode *inode);
int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl);

void locks_init_lease(struct file_lease *);
void locks_free_lease(struct file_lease *fl);
struct file_lease *locks_alloc_lease(void);
int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
void lease_get_mtime(struct inode *, struct timespec64 *time);
int generic_setlease(struct file *, int, struct file_lease **, void **priv);
int kernel_setlease(struct file *, int, struct file_lease **, void **);
int vfs_setlease(struct file *, int, struct file_lease **, void **);
int lease_modify(struct file_lease *, int, struct list_head *);

struct notifier_block;
int lease_register_notifier(struct notifier_block *);
void lease_unregister_notifier(struct notifier_block *);

struct files_struct;
void show_fd_locks(struct seq_file *f,
                         struct file *filp, struct files_struct *files);
bool locks_owner_has_blockers(struct file_lock_context *flctx,
                        fl_owner_t owner);

/*
 * Return the inode's i_flctx pointer, read with smp_load_acquire().
 * The result is whatever is stored there; this helper does not test it.
 */
static inline struct file_lock_context *
locks_inode_context(const struct inode *inode)
{
        return smp_load_acquire(&inode->i_flctx);
}

#else /* !CONFIG_FILE_LOCKING */
/*
 * Stub used when CONFIG_FILE_LOCKING is disabled: always returns -EINVAL.
 *
 * @user is a plain pointer (no __user annotation) so that the parameter
 * types agree with the CONFIG_FILE_LOCKING prototype of fcntl_getlk() and
 * with the fcntl_getlk64() stub.
 */
static inline int fcntl_getlk(struct file *file, unsigned int cmd,
                              struct flock *user)
{
        return -EINVAL;
}

/*
 * Stub used when CONFIG_FILE_LOCKING is disabled: always returns -EACCES.
 *
 * @user is a plain pointer (no __user annotation) so that the parameter
 * types agree with the CONFIG_FILE_LOCKING prototype of fcntl_setlk() and
 * with the fcntl_setlk64() stub.
 */
static inline int fcntl_setlk(unsigned int fd, struct file *file,
                              unsigned int cmd, struct flock *user)
{
        return -EACCES;
}

/* The 64-bit flock variants are only provided when BITS_PER_LONG == 32. */
#if BITS_PER_LONG == 32
/* !CONFIG_FILE_LOCKING stub: always -EINVAL. */
static inline int fcntl_getlk64(struct file *file, unsigned int cmd,
                                struct flock64 *user)
{
        return -EINVAL;
}

/* !CONFIG_FILE_LOCKING stub: always -EACCES. */
static inline int fcntl_setlk64(unsigned int fd, struct file *file,
                                unsigned int cmd, struct flock64 *user)
{
        return -EACCES;
}
#endif
/* !CONFIG_FILE_LOCKING stub: always -EINVAL. */
static inline int fcntl_setlease(unsigned int fd, struct file *filp, int arg)
{
        return -EINVAL;
}

/* !CONFIG_FILE_LOCKING stub: always reports F_UNLCK. */
static inline int fcntl_getlease(struct file *filp)
{
        return F_UNLCK;
}

/*
 * !CONFIG_FILE_LOCKING stubs: the three type predicates always return
 * false and locks_wake_up() does nothing.
 */
static inline bool lock_is_unlock(struct file_lock *fl)
{
        return false;
}

static inline bool lock_is_read(struct file_lock *fl)
{
        return false;
}

static inline bool lock_is_write(struct file_lock *fl)
{
        return false;
}

static inline void locks_wake_up(struct file_lock *fl)
{
}

/*
 * !CONFIG_FILE_LOCKING stubs with no return value: every function in this
 * run has an empty effect and leaves its arguments untouched.
 */
static inline void
locks_free_lock_context(struct inode *inode)
{
}

static inline void locks_init_lock(struct file_lock *fl)
{
        return;
}

static inline void locks_init_lease(struct file_lease *fl)
{
        return;
}

static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
{
        return;
}

static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
        return;
}

static inline void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
        return;
}

static inline void locks_remove_file(struct file *filp)
{
        return;
}

/* Note: @fl is not modified by this stub. */
static inline void posix_test_lock(struct file *filp, struct file_lock *fl)
{
        return;
}

/*
 * !CONFIG_FILE_LOCKING stubs for the lock-taking and lock-testing entry
 * points.  Each returns a fixed value and ignores its arguments.
 */

/* Always -ENOLCK. */
static inline int posix_lock_file(struct file *filp, struct file_lock *fl,
                                  struct file_lock *conflock)
{
        return -ENOLCK;
}

/* Always -ENOENT. */
static inline int locks_delete_block(struct file_lock *waiter)
{
        return -ENOENT;
}

/* Always 0; @fl is not modified. */
static inline int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
        return 0;
}

/* Always -ENOLCK. */
static inline int vfs_lock_file(struct file *filp, unsigned int cmd,
                                struct file_lock *fl, struct file_lock *conf)
{
        return -ENOLCK;
}

/* Always 0. */
static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
        return 0;
}

/* Always false. */
static inline bool vfs_inode_has_locks(struct inode *inode)
{
        return false;
}

/* Always -ENOLCK; locks_lock_file_wait() forwards to this. */
static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        return -ENOLCK;
}

/*
 * !CONFIG_FILE_LOCKING stubs for the lease API.  Each returns a fixed value
 * (or nothing) and ignores its arguments.
 */

/* Always 0. */
static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
        return 0;
}

/* Does nothing; @time is left as the caller passed it. */
static inline void lease_get_mtime(struct inode *inode,
                                   struct timespec64 *time)
{
        return;
}

/* Always -EINVAL. */
static inline int generic_setlease(struct file *filp, int arg,
                                    struct file_lease **flp, void **priv)
{
        return -EINVAL;
}

/* Always -EINVAL. */
static inline int kernel_setlease(struct file *filp, int arg,
                               struct file_lease **lease, void **priv)
{
        return -EINVAL;
}

/* Always -EINVAL. */
static inline int vfs_setlease(struct file *filp, int arg,
                               struct file_lease **lease, void **priv)
{
        return -EINVAL;
}

/* Always -EINVAL. */
static inline int lease_modify(struct file_lease *fl, int arg,
                               struct list_head *dispose)
{
        return -EINVAL;
}

/*
 * !CONFIG_FILE_LOCKING stubs: show_fd_locks() emits nothing,
 * locks_owner_has_blockers() is always false and locks_inode_context()
 * always yields NULL.
 */
struct files_struct;
static inline void show_fd_locks(struct seq_file *f,
                        struct file *filp, struct files_struct *files) {}
static inline bool locks_owner_has_blockers(struct file_lock_context *flctx,
                        fl_owner_t owner)
{
        return false;
}

static inline struct file_lock_context *
locks_inode_context(const struct inode *inode)
{
        return NULL;
}

#endif /* !CONFIG_FILE_LOCKING */

/*
 * for walking lists of file_locks linked by c.flc_list
 *
 * Expands to list_for_each_entry() with _fl as the cursor and _head as the
 * list head.
 */
#define for_each_file_lock(_fl, _head)        list_for_each_entry(_fl, _head, c.flc_list)

/*
 * File-based convenience wrapper: resolves @filp to its inode with
 * file_inode() and returns whatever locks_lock_inode_wait() returns for it.
 */
static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
{
        struct inode *inode = file_inode(filp);

        return locks_lock_inode_wait(inode, fl);
}

#ifdef CONFIG_FILE_LOCKING
/*
 * break_lease - break leases on @inode if its lease list is non-empty.
 *
 * Returns 0 when the inode has no lock context or an empty lease list,
 * otherwise the result of __break_lease(inode, mode, FL_LEASE).
 */
static inline int break_lease(struct inode *inode, unsigned int mode)
{
        /*
         * The emptiness test below is done without a lock.  The full barrier
         * makes sure any refcounts already taken are visible before
         * i_flctx->flc_lease is examined; otherwise this could race with a
         * task that is installing a new lease on the file.
         */
        smp_mb();
        if (!inode->i_flctx)
                return 0;
        if (list_empty_careful(&inode->i_flctx->flc_lease))
                return 0;

        return __break_lease(inode, mode, FL_LEASE);
}

/*
 * break_deleg - break delegations on @inode if its lease list is non-empty.
 *
 * Returns 0 when the inode has no lock context or an empty lease list,
 * otherwise the result of __break_lease(inode, mode, FL_DELEG).
 */
static inline int break_deleg(struct inode *inode, unsigned int mode)
{
        /*
         * The emptiness test below is done without a lock.  The full barrier
         * makes sure any refcounts already taken are visible before
         * i_flctx->flc_lease is examined; otherwise this could race with a
         * task that is installing a new lease on the file.
         */
        smp_mb();
        if (!inode->i_flctx)
                return 0;
        if (list_empty_careful(&inode->i_flctx->flc_lease))
                return 0;

        return __break_lease(inode, mode, FL_DELEG);
}

/*
 * try_break_deleg - non-blocking attempt to break a delegation.
 *
 * Calls break_deleg() with O_WRONLY|O_NONBLOCK.  If that returns
 * -EWOULDBLOCK and the caller supplied @delegated_inode, the inode is stored
 * there and a reference is taken with ihold().  The break_deleg() result is
 * returned in every case.
 */
static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
{
        int ret = break_deleg(inode, O_WRONLY|O_NONBLOCK);

        if (ret != -EWOULDBLOCK || !delegated_inode)
                return ret;

        *delegated_inode = inode;
        ihold(inode);
        return ret;
}

/*
 * break_deleg_wait - blocking delegation break on a previously stored inode.
 *
 * Calls break_deleg() with O_WRONLY on *@delegated_inode, then drops the
 * inode with iput() and clears *@delegated_inode.  Returns the
 * break_deleg() result.
 */
static inline int break_deleg_wait(struct inode **delegated_inode)
{
        int ret = break_deleg(*delegated_inode, O_WRONLY);

        iput(*delegated_inode);
        *delegated_inode = NULL;

        return ret;
}

/*
 * break_layout - break layouts on @inode if its lease list is non-empty.
 *
 * @wait selects the mode handed to __break_lease(): O_WRONLY when true,
 * O_WRONLY | O_NONBLOCK when false.  Returns 0 when the inode has no lock
 * context or an empty lease list, otherwise the __break_lease() result for
 * FL_LAYOUT.
 */
static inline int break_layout(struct inode *inode, bool wait)
{
        /* Full barrier ahead of the lockless look at the lease list. */
        smp_mb();
        if (!inode->i_flctx)
                return 0;
        if (list_empty_careful(&inode->i_flctx->flc_lease))
                return 0;

        if (wait)
                return __break_lease(inode, O_WRONLY, FL_LAYOUT);

        return __break_lease(inode, O_WRONLY | O_NONBLOCK, FL_LAYOUT);
}

#else /* !CONFIG_FILE_LOCKING */
/*
 * !CONFIG_FILE_LOCKING stubs: the break helpers return 0 without doing
 * anything, except break_deleg_wait(), which calls BUG().
 */
static inline int break_lease(struct inode *inode, unsigned int mode)
{
        return 0;
}

static inline int break_deleg(struct inode *inode, unsigned int mode)
{
        return 0;
}

/* Never stores into @delegated_inode. */
static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
{
        return 0;
}

/*
 * The try_break_deleg() stub above never hands out an inode, so reaching
 * this function is treated as a bug.
 */
static inline int break_deleg_wait(struct inode **delegated_inode)
{
        BUG();
        return 0;
}

static inline int break_layout(struct inode *inode, bool wait)
{
        return 0;
}

#endif /* CONFIG_FILE_LOCKING */

#endif /* _LINUX_FILELOCK_H */




















































































































































    1 











    1 
    1 




    1 
















    1 

    1 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
// SPDX-License-Identifier: GPL-2.0
#include <linux/spinlock.h>
#include <linux/task_work.h>
#include <linux/resume_user_mode.h>

static struct callback_head work_exited; /* all we need is ->next == NULL */

/**
 * task_work_add - ask the @task to execute @work->func()
 * @task: the task which should run the callback
 * @work: the callback to run
 * @notify: how to notify the targeted task
 *
 * Queue @work for task_work_run() below and notify the @task if @notify
 * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI.
 *
 * @TWA_SIGNAL works like signals, in that it will interrupt the targeted
 * task and run the task_work, regardless of whether the task is currently
 * running in the kernel or userspace.
 * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a
 * reschedule IPI to force the targeted task to reschedule and run task_work.
 * This can be advantageous if there's no strict requirement that the
 * task_work be run as soon as possible, just whenever the task enters the
 * kernel anyway.
 * @TWA_RESUME work is run only when the task exits the kernel and returns to
 * user mode, or before entering guest mode.
 *
 * Fails if the @task is exiting/exited and thus it can't process this @work.
 * Otherwise @work->func() will be called when the @task goes through one of
 * the aforementioned transitions, or exits.
 *
 * If the targeted task is exiting, then an error is returned and the work item
 * is not queued. It's up to the caller to arrange for an alternative mechanism
 * in that case.
 *
 * Note: there is no ordering guarantee on works queued here. The task_work
 * list is LIFO.
 *
 * RETURNS:
 * 0 if succeeds or -ESRCH.
 */
int task_work_add(struct task_struct *task, struct callback_head *work,
                  enum task_work_notify_mode notify)
{
        struct callback_head *head;

        /* record the work call stack in order to print it in KASAN reports */
        kasan_record_aux_stack(work);

        /*
         * Lockless push: link @work in front of the current head and try to
         * install it as the new head, retrying until try_cmpxchg() succeeds.
         * A head equal to &work_exited is the marker task_work_run() stores
         * for an exiting task with an empty list; nothing may be queued then.
         */
        head = READ_ONCE(task->task_works);
        do {
                if (unlikely(head == &work_exited))
                        return -ESRCH;
                work->next = head;
        } while (!try_cmpxchg(&task->task_works, &head, work));

        switch (notify) {
        case TWA_NONE:
                break;
        case TWA_RESUME:
                set_notify_resume(task);
                break;
        case TWA_SIGNAL:
                set_notify_signal(task);
                break;
        case TWA_SIGNAL_NO_IPI:
                __set_notify_signal(task);
                break;
        default:
                /*
                 * Unknown mode: warn once.  The work is already queued at
                 * this point and 0 is still returned below.
                 */
                WARN_ON_ONCE(1);
                break;
        }

        return 0;
}

/**
 * task_work_cancel_match - cancel a pending work added by task_work_add()
 * @task: the task which should execute the work
 * @match: match function to call
 * @data: data to be passed in to match function
 *
 * Walks @task's list from its head while holding @task->pi_lock and unlinks
 * the first entry for which @match returns true.
 *
 * RETURNS:
 * The found work or NULL if not found.
 */
struct callback_head *
task_work_cancel_match(struct task_struct *task,
                       bool (*match)(struct callback_head *, void *data),
                       void *data)
{
        /* pprev always points at the link that currently leads to @work. */
        struct callback_head **pprev = &task->task_works;
        struct callback_head *work;
        unsigned long flags;

        /* Fast path, taken without pi_lock, when nothing is reported pending. */
        if (likely(!task_work_pending(task)))
                return NULL;
        /*
         * If cmpxchg() fails we continue without updating pprev.
         * Either we raced with task_work_add() which added the
         * new entry before this work, we will find it again. Or
         * we raced with task_work_run(), *pprev == NULL/exited.
         */
        raw_spin_lock_irqsave(&task->pi_lock, flags);
        work = READ_ONCE(*pprev);
        while (work) {
                if (!match(work, data)) {
                        /* No match: step to the next link. */
                        pprev = &work->next;
                        work = READ_ONCE(*pprev);
                } else if (try_cmpxchg(pprev, &work, work->next))
                        /* Match unlinked: *pprev now skips over @work. */
                        break;
        }
        raw_spin_unlock_irqrestore(&task->pi_lock, flags);

        /* NULL here means the walk ran off the end without unlinking anything. */
        return work;
}

/*
 * Match helper for task_work_cancel(): a work item matches when its ->func
 * pointer compares equal to @data.
 */
static bool task_work_func_match(struct callback_head *cb, void *data)
{
        bool same_func = (cb->func == data);

        return same_func;
}

/**
 * task_work_cancel - cancel a pending work added by task_work_add()
 * @task: the task which should execute the work
 * @func: identifies the work to remove
 *
 * Thin wrapper around task_work_cancel_match() that selects the entry to
 * remove by comparing each entry's ->func with @func; the most recently
 * queued matching entry is the one removed.
 *
 * RETURNS:
 * The found work or NULL if not found.
 */
struct callback_head *
task_work_cancel(struct task_struct *task, task_work_func_t func)
{
        struct callback_head *found;

        found = task_work_cancel_match(task, task_work_func_match, func);
        return found;
}

/**
 * task_work_run - execute the works added by task_work_add()
 *
 * Flush the pending works. Should be used by the core kernel code.
 * Called before the task returns to the user-mode or stops, or when
 * it exits. In the latter case task_work_add() can no longer add the
 * new work after task_work_run() returns.
 *
 * Operates on the current task. The outer loop repeats until the list is
 * found empty, so works queued by a callback are run as well.
 */
void task_work_run(void)
{
        struct task_struct *task = current;
        struct callback_head *work, *head, *next;

        for (;;) {
                /*
                 * work->func() can do task_work_add(), do not set
                 * work_exited unless the list is empty.
                 */
                work = READ_ONCE(task->task_works);
                do {
                        /*
                         * A non-empty list is replaced by NULL, i.e. the
                         * whole list is detached at once.  An empty list is
                         * replaced by &work_exited when PF_EXITING is set,
                         * and left untouched otherwise.
                         */
                        head = NULL;
                        if (!work) {
                                if (task->flags & PF_EXITING)
                                        head = &work_exited;
                                else
                                        break;
                        }
                } while (!try_cmpxchg(&task->task_works, &work, head));

                /* Nothing was detached: done. */
                if (!work)
                        break;
                /*
                 * Synchronize with task_work_cancel(). It can not remove
                 * the first entry == work, cmpxchg(task_works) must fail.
                 * But it can remove another entry from the ->next list.
                 */
                raw_spin_lock_irq(&task->pi_lock);
                raw_spin_unlock_irq(&task->pi_lock);

                /*
                 * Run the detached entries in list order.  ->next is read
                 * before the callback is invoked, so @work is not touched
                 * again once its callback has run.
                 */
                do {
                        next = work->next;
                        work->func(work);
                        work = next;
                        cond_resched();
                } while (work);
        }
}



















































    3 











    2 
























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NF_NAT_H
#define _NF_NAT_H

#include <linux/list.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter/nf_conntrack_pptp.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <uapi/linux/netfilter/nf_nat.h>

/*
 * Selects source or destination manipulation.  SRC has the value 0 and DST
 * the value 1; nf_nat_initialized() pairs them with IPS_SRC_NAT_DONE and
 * IPS_DST_NAT_DONE respectively.
 */
enum nf_nat_manip_type {
        NF_NAT_MANIP_SRC,
        NF_NAT_MANIP_DST
};

/*
 * SRC manip occurs POST_ROUTING or LOCAL_IN.
 *
 * The expression evaluates to 0 for those two hooks and to 1 for any other
 * hook number, matching NF_NAT_MANIP_SRC (0) and NF_NAT_MANIP_DST (1).
 */
#define HOOK2MANIP(hooknum) ((hooknum) != NF_INET_POST_ROUTING && \
                             (hooknum) != NF_INET_LOCAL_IN)

/*
 * per conntrack: nat application helper private data
 *
 * The only member, nat_pptp_info, exists when CONFIG_NF_NAT_PPTP is
 * enabled; otherwise the union has no members.
 */
union nf_conntrack_nat_help {
        /* insert nat helper private data here */
#if IS_ENABLED(CONFIG_NF_NAT_PPTP)
        struct nf_nat_pptp nat_pptp_info;
#endif
};

/* The structure embedded in the conntrack structure. */
struct nf_conn_nat {
        union nf_conntrack_nat_help help;
#if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE)
        /*
         * Compared with the output device's ifindex in nf_nat_oif_changed();
         * a value of 0 makes that check report "not changed".
         */
        int masq_index;
#endif
};

/* Set up the info structure to map into this range. */
unsigned int nf_nat_setup_info(struct nf_conn *ct,
                               const struct nf_nat_range2 *range,
                               enum nf_nat_manip_type maniptype);

extern unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct,
                                              unsigned int hooknum);

struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct);

/*
 * Look up the NAT extension of @ct.  With CONFIG_NF_NAT enabled this is the
 * result of nf_ct_ext_find(ct, NF_CT_EXT_NAT); otherwise it is always NULL.
 */
static inline struct nf_conn_nat *nfct_nat(const struct nf_conn *ct)
{
        struct nf_conn_nat *nat = NULL;

#if IS_ENABLED(CONFIG_NF_NAT)
        nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
#endif
        return nat;
}

/*
 * Report whether the recorded masquerade interface differs from @out.
 *
 * With CONFIG_NF_NAT_MASQUERADE enabled this is true only when @nat is
 * non-NULL with a non-zero masq_index, @hooknum is NF_INET_POST_ROUTING,
 * @ctinfo maps to IP_CT_DIR_ORIGINAL and masq_index is not @out's ifindex.
 * Without that option the answer is always false.
 */
static inline bool nf_nat_oif_changed(unsigned int hooknum,
                                      enum ip_conntrack_info ctinfo,
                                      struct nf_conn_nat *nat,
                                      const struct net_device *out)
{
#if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE)
        if (!nat || !nat->masq_index)
                return false;
        if (hooknum != NF_INET_POST_ROUTING)
                return false;
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return false;

        return nat->masq_index != out->ifindex;
#else
        return false;
#endif
}

int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
                       const struct nf_hook_ops *nat_ops, unsigned int ops_count);
void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
                          unsigned int ops_count);

unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                           unsigned int hooknum, struct sk_buff *skb);

unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct,
                              enum nf_nat_manip_type mtype,
                              enum ip_conntrack_dir dir);
void nf_nat_csum_recalc(struct sk_buff *skb,
                        u8 nfproto, u8 proto, void *data, __sum16 *check,
                        int datalen, int oldlen);

int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct,
                                  enum ip_conntrack_info ctinfo,
                                  unsigned int hooknum);

int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct,
                                    enum ip_conntrack_info ctinfo,
                                    unsigned int hooknum, unsigned int hdrlen);

int nf_nat_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops);
void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops);

int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops);
void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops);

int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops);
void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops);

unsigned int
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
               const struct nf_hook_state *state);

int nf_ct_nat(struct sk_buff *skb, struct nf_conn *ct,
              enum ip_conntrack_info ctinfo, int *action,
              const struct nf_nat_range2 *range, bool commit);

/*
 * Test the "NAT done" status bit for @manip on @ct: IPS_SRC_NAT_DONE for
 * NF_NAT_MANIP_SRC, IPS_DST_NAT_DONE for anything else.  The masked status
 * value itself is returned (zero when the bit is clear), not a 0/1 flag.
 */
static inline int nf_nat_initialized(const struct nf_conn *ct,
                                     enum nf_nat_manip_type manip)
{
        if (manip != NF_NAT_MANIP_SRC)
                return ct->status & IPS_DST_NAT_DONE;

        return ct->status & IPS_SRC_NAT_DONE;
}
#endif












































    1 



    1 
















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/lib/kasprintf.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/stdarg.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/string.h>

/*
 * Simplified asprintf.
 *
 * Formats @fmt/@ap twice: a first vsnprintf() pass into a NULL buffer, run
 * on a copy of @ap, yields the length; a buffer of that length plus one is
 * then obtained from kmalloc_track_caller() and filled by a second pass.
 * Returns the buffer, or NULL when the allocation fails.  A mismatch
 * between the two lengths triggers a WARN but the buffer is still returned.
 */
char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
{
        unsigned int needed, written;
        va_list probe;
        char *buf;

        /* Sizing pass on a private copy so @ap stays usable below. */
        va_copy(probe, ap);
        needed = vsnprintf(NULL, 0, fmt, probe);
        va_end(probe);

        buf = kmalloc_track_caller(needed+1, gfp);
        if (buf == NULL)
                return NULL;

        written = vsnprintf(buf, needed+1, fmt, ap);
        WARN(needed != written, "different return values (%u and %u) from vsnprintf(\"%s\", ...)",
             needed, written, fmt);

        return buf;
}
EXPORT_SYMBOL(kvasprintf);

/*
 * Variant of kvasprintf() that can avoid formatting altogether:
 *  - a @fmt without any '%' is passed straight to kstrdup_const();
 *  - a @fmt that is exactly "%s" passes its single string argument to
 *    kstrdup_const();
 *  - everything else goes through kvasprintf().
 * When the string handed to kstrdup_const() lives in rodata this saves an
 * allocation and a copy.  Whatever path was taken, release the result with
 * kfree_const().
 */
const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list ap)
{
        const char *only_arg;

        if (strchr(fmt, '%') == NULL)
                return kstrdup_const(fmt, gfp);

        if (strcmp(fmt, "%s") != 0)
                return kvasprintf(gfp, fmt, ap);

        only_arg = va_arg(ap, const char*);
        return kstrdup_const(only_arg, gfp);
}
EXPORT_SYMBOL(kvasprintf_const);

/*
 * Variadic front end for kvasprintf(): format @fmt and its arguments into a
 * newly allocated string using the allocation flags @gfp.
 *
 * Returns whatever kvasprintf() returns, i.e. NULL on allocation failure.
 */
char *kasprintf(gfp_t gfp, const char *fmt, ...)
{
        va_list args;
        char *str;

        va_start(args, fmt);
        str = kvasprintf(gfp, fmt, args);
        va_end(args);

        return str;
}
EXPORT_SYMBOL(kasprintf);

















































































































    1 



    1 







































    1 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/export.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/fs.h>
#include <linux/path.h>
#include <linux/slab.h>
#include <linux/fs_struct.h>
#include "internal.h"

/*
 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
 * It can block.
 */
void set_fs_root(struct fs_struct *fs, const struct path *path)
{
        struct path old_root;

        path_get(path);
        spin_lock(&fs->lock);
        write_seqcount_begin(&fs->seq);
        old_root = fs->root;
        fs->root = *path;
        write_seqcount_end(&fs->seq);
        spin_unlock(&fs->lock);
        if (old_root.dentry)
                path_put(&old_root);
}

/*
 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
 * It can block.
 */
void set_fs_pwd(struct fs_struct *fs, const struct path *path)
{
        struct path old_pwd;

        path_get(path);
        spin_lock(&fs->lock);
        write_seqcount_begin(&fs->seq);
        old_pwd = fs->pwd;
        fs->pwd = *path;
        write_seqcount_end(&fs->seq);
        spin_unlock(&fs->lock);

        if (old_pwd.dentry)
                path_put(&old_pwd);
}

/*
 * If *@p refers to the same dentry and mount as @old, overwrite it with @new.
 * Returns 1 when a replacement was made and 0 otherwise.  No references are
 * taken or dropped here; that is left to the caller.
 */
static inline int replace_path(struct path *p, const struct path *old, const struct path *new)
{
        if (unlikely(p->dentry == old->dentry && p->mnt == old->mnt)) {
                *p = *new;
                return 1;
        }
        return 0;
}

/*
 * Visit every thread and, in each fs_struct whose root and/or pwd equals
 * @old_root, replace that entry with @new_root.
 *
 * One reference on @new_root is taken per replaced entry while fs->lock is
 * still held.  The matching references on @old_root are only dropped at the
 * very end, after tasklist_lock has been released.
 */
void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
{
        struct task_struct *g, *p;
        struct fs_struct *fs;
        int replaced = 0;

        read_lock(&tasklist_lock);
        for_each_process_thread(g, p) {
                task_lock(p);
                fs = p->fs;
                if (fs) {
                        int matched;

                        spin_lock(&fs->lock);
                        write_seqcount_begin(&fs->seq);
                        matched = replace_path(&fs->root, old_root, new_root);
                        matched += replace_path(&fs->pwd, old_root, new_root);
                        write_seqcount_end(&fs->seq);
                        /* Each switched entry now pins new_root. */
                        for (; matched > 0; matched--) {
                                replaced++;
                                path_get(new_root);
                        }
                        spin_unlock(&fs->lock);
                }
                task_unlock(p);
        }
        read_unlock(&tasklist_lock);

        /* Release what the switched entries used to hold on old_root. */
        for (; replaced > 0; replaced--)
                path_put(old_root);
}

/*
 * Destroy @fs: drop the references it holds on its root and pwd paths, then
 * return the structure to the fs_cachep slab cache.  No locking or user-count
 * check is done here; callers in this file only get here once fs->users has
 * dropped to zero.
 */
void free_fs_struct(struct fs_struct *fs)
{
        path_put(&fs->root);
        path_put(&fs->pwd);
        kmem_cache_free(fs_cachep, fs);
}

/*
 * Detach @tsk from its fs_struct.  With the task lock and fs->lock held,
 * tsk->fs is cleared and the user count is decremented; if this was the last
 * user the fs_struct is freed after both locks have been dropped.
 * Does nothing when @tsk has no fs_struct.
 */
void exit_fs(struct task_struct *tsk)
{
        struct fs_struct *fs = tsk->fs;
        int last_user;

        if (!fs)
                return;

        task_lock(tsk);
        spin_lock(&fs->lock);
        tsk->fs = NULL;
        last_user = (--fs->users == 0);
        spin_unlock(&fs->lock);
        task_unlock(tsk);

        if (last_user)
                free_fs_struct(fs);
}

struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
        struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
        /* We don't need to lock fs - think why ;-) */
        if (fs) {
                fs->users = 1;
                fs->in_exec = 0;
                spin_lock_init(&fs->lock);
                seqcount_spinlock_init(&fs->seq, &fs->lock);
                fs->umask = old->umask;

                spin_lock(&old->lock);
                fs->root = old->root;
                path_get(&fs->root);
                fs->pwd = old->pwd;
                path_get(&fs->pwd);
                spin_unlock(&old->lock);
        }
        return fs;
}

/*
 * Give the current task a private copy of its fs_struct.
 *
 * The copy is made first; then, under the task lock and the old fs->lock,
 * the old structure loses one user and current->fs is pointed at the copy.
 * The old structure is freed if that was its last user.
 *
 * Returns 0 on success or -ENOMEM if the copy could not be allocated.
 */
int unshare_fs_struct(void)
{
        struct fs_struct *shared = current->fs;
        struct fs_struct *private = copy_fs_struct(shared);
        int last_user;

        if (!private)
                return -ENOMEM;

        task_lock(current);
        spin_lock(&shared->lock);
        last_user = (--shared->users == 0);
        current->fs = private;
        spin_unlock(&shared->lock);
        task_unlock(current);

        if (last_user)
                free_fs_struct(shared);

        return 0;
}
EXPORT_SYMBOL_GPL(unshare_fs_struct);

/*
 * Return the umask stored in the current task's fs_struct.
 * current->fs is dereferenced without a NULL check and without taking
 * fs->lock.
 */
int current_umask(void)
{
        return current->fs->umask;
}
EXPORT_SYMBOL(current_umask);

/*
 * Statically initialized fs_struct; to be mentioned only in INIT_TASK.
 * Starts with one user, an unlocked spinlock, a zeroed sequence counter tied
 * to that lock, and a umask of 0022.  Root and pwd are not set here.
 */
struct fs_struct init_fs = {
        .users                = 1,
        .lock                = __SPIN_LOCK_UNLOCKED(init_fs.lock),
        .seq                = SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock),
        .umask                = 0022,
};











































































































































































































































    1 




    1 



    1 

















































































































































































































    1 



    1 













































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Pluggable TCP congestion control support and newReno
 * congestion control.
 * Based on ideas from I/O scheduler support and Web100.
 *
 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <net/tcp.h>
#include <trace/events/tcp.h>

/* List of registered congestion control algorithms.  Additions and removals
 * in this file happen with tcp_cong_list_lock held and use the _rcu list
 * helpers; lookups walk the list with list_for_each_entry_rcu().
 */
static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);

/* Look up a registered algorithm by exact @name.  Plain linear scan of
 * tcp_cong_list (few entries are expected).  Returns the matching ops or
 * NULL if there is none.
 */
struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
        struct tcp_congestion_ops *pos, *found = NULL;

        list_for_each_entry_rcu(pos, &tcp_cong_list, list) {
                if (!strcmp(pos->name, name)) {
                        found = pos;
                        break;
                }
        }

        return found;
}

void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        trace_tcp_cong_state_set(sk, ca_state);

        if (icsk->icsk_ca_ops->set_state)
                icsk->icsk_ca_ops->set_state(sk, ca_state);
        icsk->icsk_ca_state = ca_state;
}

/* Find algorithm @name, trying to load module "tcp_<name>" if it is not
 * registered yet.  The load is only attempted with CONFIG_MODULES and when
 * the caller has CAP_NET_ADMIN.
 *
 * Must be called with rcu lock held.  Note that the RCU read lock is dropped
 * and re-taken around request_module().
 */
static struct tcp_congestion_ops *tcp_ca_find_autoload(const char *name)
{
        struct tcp_congestion_ops *ca = tcp_ca_find(name);

#ifdef CONFIG_MODULES
        if (ca || !capable(CAP_NET_ADMIN))
                return ca;

        rcu_read_unlock();
        request_module("tcp_%s", name);
        rcu_read_lock();
        ca = tcp_ca_find(name);
#endif
        return ca;
}

/* Look up a registered algorithm by its hash @key.  Linear scan of
 * tcp_cong_list; returns the first entry whose key matches, or NULL.
 */
struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
{
        struct tcp_congestion_ops *pos, *found = NULL;

        list_for_each_entry_rcu(pos, &tcp_cong_list, list) {
                if (pos->key == key) {
                        found = pos;
                        break;
                }
        }

        return found;
}

/* Check that @ca provides the mandatory callbacks: ->ssthresh, ->undo_cwnd,
 * and at least one of ->cong_avoid / ->cong_control.
 *
 * Returns 0 if it does; otherwise logs an error and returns -EINVAL.
 */
int tcp_validate_congestion_control(struct tcp_congestion_ops *ca)
{
        if (ca->ssthresh && ca->undo_cwnd &&
            (ca->cong_avoid || ca->cong_control))
                return 0;

        pr_err("%s does not implement required ops\n", ca->name);
        return -EINVAL;
}

/* Add congestion control algorithm @ca to the list of available ones.
 *
 * @ca is validated first, then its key is computed as a jhash over the whole
 * name array.  Registration is refused with -EEXIST when the key equals
 * TCP_CA_UNSPEC or is already present in the list.
 *
 * Returns 0 on success or a negative errno.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
        int err = tcp_validate_congestion_control(ca);

        if (err)
                return err;

        ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));

        spin_lock(&tcp_cong_list_lock);
        if (ca->key != TCP_CA_UNSPEC && !tcp_ca_find_key(ca->key)) {
                list_add_tail_rcu(&ca->list, &tcp_cong_list);
                pr_debug("%s registered\n", ca->name);
        } else {
                pr_notice("%s already registered or non-unique key\n",
                          ca->name);
                err = -EEXIST;
        }
        spin_unlock(&tcp_cong_list_lock);

        return err;
}
EXPORT_SYMBOL_GPL(tcp_register_congestion_control);

/*
 * Remove congestion control algorithm, called from
 * the module's remove function.  Module ref counts are used
 * to ensure that this can't be done till all sockets using
 * that method are closed.
 *
 * @ca is unlinked from tcp_cong_list under tcp_cong_list_lock; the function
 * then blocks in synchronize_rcu() before returning.
 */
void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
{
        spin_lock(&tcp_cong_list_lock);
        list_del_rcu(&ca->list);
        spin_unlock(&tcp_cong_list_lock);

        /* Wait for outstanding readers to complete before the
         * module gets removed entirely.
         *
         * A try_module_get() should fail by now as our module is
         * in "going" state since no refs are held anymore and
         * module_exit() handler being called.
         */
        synchronize_rcu();
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);

/* Swap the registered algorithm @old_ca for @ca.
 *
 * @ca must carry the same name as the entry currently registered under
 * old_ca->key, and that entry must be @old_ca itself; otherwise -EINVAL is
 * returned and the list is left alone.  On success the function waits for an
 * RCU grace period before returning 0.
 */
int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca)
{
        struct tcp_congestion_ops *existing;
        int ret;

        ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));

        spin_lock(&tcp_cong_list_lock);
        existing = tcp_ca_find_key(old_ca->key);
        if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) {
                pr_notice("%s not registered or non-unique key\n",
                          ca->name);
                ret = -EINVAL;
        } else if (existing != old_ca) {
                pr_notice("invalid old congestion control algorithm to replace\n");
                ret = -EINVAL;
        } else {
                /* Link the replacement in before unlinking the old entry so
                 * that an implementation stays available throughout.
                 */
                list_add_tail_rcu(&ca->list, &tcp_cong_list);
                list_del_rcu(&existing->list);
                pr_debug("%s updated\n", ca->name);
                ret = 0;
        }
        spin_unlock(&tcp_cong_list_lock);

        if (ret)
                return ret;

        /* Wait for outstanding readers to complete before the
         * module or struct_ops gets removed entirely.
         */
        synchronize_rcu();

        return 0;
}

/* Translate algorithm @name into its key, auto-loading the module if needed.
 *
 * On a hit, *@ecn_ca is set according to whether the algorithm has
 * TCP_CONG_NEEDS_ECN set.  On a miss TCP_CA_UNSPEC is returned and *@ecn_ca
 * is left untouched.  May sleep.
 */
u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
{
        const struct tcp_congestion_ops *ca;
        u32 key;

        might_sleep();

        rcu_read_lock();
        ca = tcp_ca_find_autoload(name);
        if (!ca) {
                key = TCP_CA_UNSPEC;
        } else {
                key = ca->key;
                *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
        }
        rcu_read_unlock();

        return key;
}

/* Copy the name of the algorithm registered under @key into @buffer.
 *
 * @buffer must provide room for TCP_CA_NAME_MAX bytes.  The copy is done
 * with strscpy_pad(), so the result is always NUL-terminated and the rest of
 * the buffer is zero-filled (strncpy() gave the padding but could leave the
 * string unterminated when the name filled the whole array).
 *
 * Returns @buffer on success, or NULL if no algorithm has that key, in which
 * case @buffer is not written.
 */
char *tcp_ca_get_name_by_key(u32 key, char *buffer)
{
        const struct tcp_congestion_ops *ca;
        char *ret = NULL;

        rcu_read_lock();
        ca = tcp_ca_find_key(key);
        if (ca) {
                strscpy_pad(buffer, ca->name, TCP_CA_NAME_MAX);
                ret = buffer;
        }
        rcu_read_unlock();

        return ret;
}

/* Attach the netns default congestion control to @sk.
 *
 * If no reference can be taken on the default algorithm, tcp_reno is used
 * instead.  The per-socket private area is cleared and the socket's ECN
 * transmit setting is switched on or off according to TCP_CONG_NEEDS_ECN.
 */
void tcp_assign_congestion_control(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct net *net = sock_net(sk);
        const struct tcp_congestion_ops *ca;

        rcu_read_lock();
        ca = rcu_dereference(net->ipv4.tcp_congestion_control);
        if (unlikely(!bpf_try_module_get(ca, ca->owner)))
                ca = &tcp_reno;
        icsk->icsk_ca_ops = ca;
        rcu_read_unlock();

        memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
        if (!(ca->flags & TCP_CONG_NEEDS_ECN))
                INET_ECN_dontxmit(sk);
        else
                INET_ECN_xmit(sk);
}

/* Initialize the congestion control attached to @sk: reset prior_ssthresh,
 * run the algorithm's optional ->init() hook, update the ECN transmit
 * setting from tcp_ca_needs_ecn(), and mark the socket as initialized.
 */
void tcp_init_congestion_control(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        tcp_sk(sk)->prior_ssthresh = 0;
        if (icsk->icsk_ca_ops->init)
                icsk->icsk_ca_ops->init(sk);

        if (!tcp_ca_needs_ecn(sk))
                INET_ECN_dontxmit(sk);
        else
                INET_ECN_xmit(sk);

        icsk->icsk_ca_initialized = 1;
}

/* Switch @sk over to algorithm @ca.
 *
 * The current algorithm is cleaned up first, then @ca is installed, the
 * setsockopt flag is set, the private area is cleared and the ECN transmit
 * setting is updated.  The new algorithm is initialized right away unless
 * the socket is in the CLOSE or LISTEN state.
 */
static void tcp_reinit_congestion_control(struct sock *sk,
                                          const struct tcp_congestion_ops *ca)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        tcp_cleanup_congestion_control(sk);
        icsk->icsk_ca_ops = ca;
        icsk->icsk_ca_setsockopt = 1;
        memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));

        if (!(ca->flags & TCP_CONG_NEEDS_ECN))
                INET_ECN_dontxmit(sk);
        else
                INET_ECN_xmit(sk);

        if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
                return;

        tcp_init_congestion_control(sk);
}

/* Manage refcounts on socket close.
 *
 * Runs the attached algorithm's optional ->release() hook for @sk, then
 * drops the reference on the ops via bpf_module_put().  icsk_ca_ops itself
 * is left pointing at the old ops.
 */
void tcp_cleanup_congestion_control(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        if (icsk->icsk_ca_ops->release)
                icsk->icsk_ca_ops->release(sk);
        bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
}

/* Used by sysctl to change default congestion control.
 *
 * Looks up @name (auto-loading its module if necessary), takes a reference
 * on it and installs it as the default for @net, dropping the reference held
 * on the previous default.  The newly installed algorithm is also marked
 * TCP_CONG_NON_RESTRICTED.
 *
 * Returns 0 on success, -ENOENT if the algorithm is unknown, -EBUSY if no
 * reference could be taken, or -EPERM if a netns other than init_net asks
 * for a restricted algorithm.
 */
int tcp_set_default_congestion_control(struct net *net, const char *name)
{
        struct tcp_congestion_ops *ca;
        const struct tcp_congestion_ops *prev;
        int ret;

        rcu_read_lock();
        ca = tcp_ca_find_autoload(name);
        if (!ca) {
                ret = -ENOENT;
        } else if (!bpf_try_module_get(ca, ca->owner)) {
                ret = -EBUSY;
        } else if (!net_eq(net, &init_net) &&
                        !(ca->flags & TCP_CONG_NON_RESTRICTED)) {
                /* Only init netns can set default to a restricted algorithm.
                 * The reference taken just above will not be stored anywhere
                 * on this path, so give it back rather than leak it.
                 */
                bpf_module_put(ca, ca->owner);
                ret = -EPERM;
        } else {
                prev = xchg(&net->ipv4.tcp_congestion_control, ca);
                if (prev)
                        bpf_module_put(prev, prev->owner);

                ca->flags |= TCP_CONG_NON_RESTRICTED;
                ret = 0;
        }
        rcu_read_unlock();

        return ret;
}

/* Set default value from kernel configuration at bootup.
 *
 * Runs as a late initcall and installs CONFIG_DEFAULT_TCP_CONG as the
 * default for init_net; the result of
 * tcp_set_default_congestion_control() is returned unchanged.
 */
static int __init tcp_congestion_default(void)
{
        return tcp_set_default_congestion_control(&init_net,
                                                  CONFIG_DEFAULT_TCP_CONG);
}
late_initcall(tcp_congestion_default);

/* Build string with list of available congestion control values.
 *
 * Writes the names of all registered algorithms into @buf, separated by
 * single spaces, using at most @maxlen bytes.  Output stops (with a one-time
 * warning) as soon as the buffer is full.
 */
void tcp_get_available_congestion_control(char *buf, size_t maxlen)
{
        struct tcp_congestion_ops *ca;
        size_t offs = 0;

        /* Start from an empty string so the result is well defined even if
         * the list turns out to be empty, as
         * tcp_get_allowed_congestion_control() already does.
         */
        if (maxlen)
                *buf = '\0';

        rcu_read_lock();
        list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
                offs += snprintf(buf + offs, maxlen - offs,
                                 "%s%s",
                                 offs == 0 ? "" : " ", ca->name);

                if (WARN_ON_ONCE(offs >= maxlen))
                        break;
        }
        rcu_read_unlock();
}

/* Get current default congestion control.
 *
 * Copies the name of @net's default algorithm into @name, which must have
 * room for TCP_CA_NAME_MAX bytes.  strscpy_pad() is used so the result is
 * always NUL-terminated and the remainder of the buffer is zero-filled
 * (strncpy() padded too, but did not guarantee termination).
 */
void tcp_get_default_congestion_control(struct net *net, char *name)
{
        const struct tcp_congestion_ops *ca;

        rcu_read_lock();
        ca = rcu_dereference(net->ipv4.tcp_congestion_control);
        strscpy_pad(name, ca->name, TCP_CA_NAME_MAX);
        rcu_read_unlock();
}

/* Build list of non-restricted congestion control values.
 *
 * @buf starts out as an empty string; the names of all algorithms flagged
 * TCP_CONG_NON_RESTRICTED are then appended, separated by single spaces,
 * within @maxlen bytes.  Output stops (with a one-time warning) once the
 * buffer is full.
 */
void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
{
        struct tcp_congestion_ops *ca;
        size_t offs = 0;

        *buf = '\0';
        rcu_read_lock();
        list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
                const char *sep;

                if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
                        continue;

                /* No separator in front of the first name. */
                sep = offs ? " " : "";
                offs += snprintf(buf + offs, maxlen - offs, "%s%s",
                                 sep, ca->name);

                if (WARN_ON_ONCE(offs >= maxlen))
                        break;
        }
        rcu_read_unlock();
}

/* Change list of non-restricted congestion control.
 *
 * @val is a space separated list of algorithm names.  Everything happens
 * under tcp_cong_list_lock in three passes: check (on a private copy) that
 * every name is registered, clear TCP_CONG_NON_RESTRICTED on all entries,
 * then set it on the named ones.  The last pass tokenizes @val in place, so
 * the caller's buffer is modified.
 *
 * Returns 0 on success, -ENOMEM if the copy cannot be allocated, or -ENOENT
 * if a name is unknown (in which case no flags are changed).
 */
int tcp_set_allowed_congestion_control(char *val)
{
        struct tcp_congestion_ops *ca;
        char *scratch, *cursor, *name;
        int ret = 0;

        scratch = kstrdup(val, GFP_USER);
        if (!scratch)
                return -ENOMEM;
        cursor = scratch;

        spin_lock(&tcp_cong_list_lock);

        /* pass 1: reject the whole request if any name is unknown */
        while ((name = strsep(&cursor, " ")) && *name) {
                if (!tcp_ca_find(name)) {
                        ret = -ENOENT;
                        goto out;
                }
        }

        /* pass 2: drop the flag everywhere */
        list_for_each_entry_rcu(ca, &tcp_cong_list, list)
                ca->flags &= ~TCP_CONG_NON_RESTRICTED;

        /* pass 3: set it again on the requested algorithms */
        while ((name = strsep(&val, " ")) && *name) {
                ca = tcp_ca_find(name);
                if (WARN_ON(!ca))
                        continue;
                ca->flags |= TCP_CONG_NON_RESTRICTED;
        }
out:
        spin_unlock(&tcp_cong_list_lock);
        kfree(scratch);

        return ret;
}

/* Change congestion control for socket.
 *
 * @load selects whether a missing algorithm may be auto-loaded
 * (tcp_ca_find_autoload()) or only looked up (tcp_ca_find()).  If load is
 * false, it is the responsibility of the caller to call
 * tcp_init_congestion_control or tcp_reinit_congestion_control (if the
 * current congestion control was already initialized).
 *
 * Returns 0 on success (including when @name is already the socket's
 * algorithm), -EPERM if the choice is locked by the route or the algorithm
 * is restricted and @cap_net_admin is false, -ENOENT if @name is unknown,
 * or -EBUSY if no reference could be taken on the algorithm.
 */
int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
                               bool cap_net_admin)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        const struct tcp_congestion_ops *ca;
        int err = 0;

        if (icsk->icsk_ca_dst_locked)
                return -EPERM;

        rcu_read_lock();
        ca = load ? tcp_ca_find_autoload(name) : tcp_ca_find(name);

        if (ca == icsk->icsk_ca_ops) {
                /* Same algorithm as now: only record the explicit request. */
                icsk->icsk_ca_setsockopt = 1;
        } else if (!ca) {
                err = -ENOENT;
        } else if (!(ca->flags & TCP_CONG_NON_RESTRICTED) && !cap_net_admin) {
                err = -EPERM;
        } else if (!bpf_try_module_get(ca, ca->owner)) {
                err = -EBUSY;
        } else {
                tcp_reinit_congestion_control(sk, ca);
        }
        rcu_read_unlock();

        return err;
}

/* Slow start is used when congestion window is no greater than the slow start
 * threshold. We base on RFC2581 and also handle stretch ACKs properly.
 * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
 * something better;) a packet is only considered (s)acked in its entirety to
 * defend the ACK attacks described in the RFC. Slow start processes a stretch
 * ACK of degree N as if N acks of degree 1 are received back to back except
 * ABC caps N to 2.
 *
 * The window grows by @acked but not beyond snd_ssthresh, and the stored
 * value is additionally limited to snd_cwnd_clamp.  The return value is the
 * part of @acked that was not needed to reach ssthresh, for the caller to
 * spend in congestion avoidance.
 */
__bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
        u32 old_cwnd = tcp_snd_cwnd(tp);
        u32 target = min(old_cwnd + acked, tp->snd_ssthresh);
        u32 leftover = acked - (target - old_cwnd);

        tcp_snd_cwnd_set(tp, min(target, tp->snd_cwnd_clamp));

        return leftover;
}
EXPORT_SYMBOL_GPL(tcp_slow_start);

/* Additive increase: in theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or
 * alternative w), for every packet that was ACKed.
 *
 * ACKed packets are accumulated in snd_cwnd_cnt; every full @w of them is
 * traded for one segment of window.  The window is finally limited to
 * snd_cwnd_clamp.
 */
__bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
{
        /* Credit that already reaches w before this ACK (gathered while w
         * was larger) is cashed in for one segment only.
         */
        if (tp->snd_cwnd_cnt >= w) {
                tp->snd_cwnd_cnt = 0;
                tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
        }

        tp->snd_cwnd_cnt += acked;
        if (tp->snd_cwnd_cnt >= w) {
                u32 whole = tp->snd_cwnd_cnt / w;

                /* Keep only the remainder as credit for later ACKs. */
                tp->snd_cwnd_cnt %= w;
                tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + whole);
        }
        tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp));
}
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);

/*
 * TCP Reno congestion control
 * This is special case used for fallback as well.
 */
/* Jacobson's slow start and congestion avoidance (SIGCOMM '88, p. 328).
 *
 * Nothing is done unless the connection is cwnd limited.  While in slow
 * start, @acked is fed to tcp_slow_start() first and only what it hands back
 * is passed on to additive increase; if nothing is left the function stops
 * there.
 */
__bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
        struct tcp_sock *tp = tcp_sk(sk);
        u32 remaining = acked;

        if (!tcp_is_cwnd_limited(sk))
                return;

        /* Below ssthresh: grow quickly. */
        if (tcp_in_slow_start(tp)) {
                remaining = tcp_slow_start(tp, acked);
                if (remaining == 0)
                        return;
        }

        /* At or above ssthresh: grow by roughly one segment per window. */
        tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), remaining);
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);

/* Reno's slow start threshold: half of the current congestion window, but
 * never less than 2.
 */
__bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        u32 half = tcp_snd_cwnd(tp) / 2U;

        return max(half, 2U);
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);

/* Window to use when a reduction is undone: the larger of the current
 * congestion window and the saved prior_cwnd.
 */
__bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        u32 current_cwnd = tcp_snd_cwnd(tp);

        return max(current_cwnd, tp->prior_cwnd);
}
EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);

/* The built-in "reno" algorithm.  It is flagged TCP_CONG_NON_RESTRICTED and
 * wires up only ->ssthresh, ->cong_avoid and ->undo_cwnd.
 * tcp_assign_congestion_control() falls back to it when no reference can be
 * taken on the configured default.
 */
struct tcp_congestion_ops tcp_reno = {
        .flags                = TCP_CONG_NON_RESTRICTED,
        .name                = "reno",
        .owner                = THIS_MODULE,
        .ssthresh        = tcp_reno_ssthresh,
        .cong_avoid        = tcp_reno_cong_avoid,
        .undo_cwnd        = tcp_reno_undo_cwnd,
};











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   15 










































   15 
   15 

























































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
// SPDX-License-Identifier: GPL-2.0-only
/*
 * xsave/xrstor support.
 *
 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
 */
#include <linux/bitops.h>
#include <linux/compat.h>
#include <linux/cpu.h>
#include <linux/mman.h>
#include <linux/nospec.h>
#include <linux/pkeys.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>

#include <asm/fpu/api.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/xcr.h>

#include <asm/tlbflush.h>
#include <asm/prctl.h>
#include <asm/elf.h>

#include "context.h"
#include "internal.h"
#include "legacy.h"
#include "xstate.h"

/*
 * Iterate 'bit' over the bits set in 'mask', starting the scan at
 * FIRST_EXTENDED_XFEATURE.
 *
 * 'mask' must be an lvalue: its address is taken and handed to
 * for_each_set_bit_from() as an unsigned long bitmap of
 * 8 * sizeof(mask) bits.
 *
 * NOTE: this expands to an assignment statement followed by a separate
 * loop header, so it must never be used as the unbraced body of an
 * if/else or of another loop.
 */
#define for_each_extended_xfeature(bit, mask)                                \
        (bit) = FIRST_EXTENDED_XFEATURE;                                \
        for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))

/*
 * Human-readable xfeature names, indexed by xfeature number
 * (cpu_has_xfeatures() and XCHECK_SZ() look entries up that way).
 * cpu_has_xfeatures() falls back to the last entry for feature numbers
 * beyond the end of this table.
 *
 * Although we spell it out in here, the Processor Trace
 * xfeature is completely unused.  We use other mechanisms
 * to save/restore PT state in Linux.
 *
 * NOTE(review): the two neighbouring AVX-512 register entries read
 * "AVX-512 Hi256" and "AVX-512 ZMM_Hi256", while the feature numbers used
 * elsewhere in this file are XFEATURE_ZMM_Hi256 and XFEATURE_Hi16_ZMM.
 * Presumably the second label is meant to describe Hi16_ZMM - confirm
 * against the xfeature enum before relying on these strings.
 */
static const char *xfeature_names[] =
{
        "x87 floating point registers",
        "SSE registers",
        "AVX registers",
        "MPX bounds registers",
        "MPX CSR",
        "AVX-512 opmask",
        "AVX-512 Hi256",
        "AVX-512 ZMM_Hi256",
        "Processor Trace (unused)",
        "Protection Keys User registers",
        "PASID state",
        "Control-flow User registers",
        "Control-flow Kernel registers (unused)",
        "unknown xstate feature",
        "unknown xstate feature",
        "unknown xstate feature",
        "unknown xstate feature",
        "AMX Tile config",
        "AMX Tile data",
        "unknown xstate feature",
};

/*
 * For each xfeature number, the X86_FEATURE_* flag that has to be set for
 * the xfeature to stay enabled. fpu__init_system_xstate() walks this
 * table and clears every xfeature bit whose flag is not present on the
 * boot CPU. Slots without an initializer are zero; since X86_FEATURE_FPU
 * is 0 as well, that loop special-cases XFEATURE_FP.
 */
static unsigned short xsave_cpuid_features[] __initdata = {
        [XFEATURE_FP]                                = X86_FEATURE_FPU,
        [XFEATURE_SSE]                                = X86_FEATURE_XMM,
        [XFEATURE_YMM]                                = X86_FEATURE_AVX,
        [XFEATURE_BNDREGS]                        = X86_FEATURE_MPX,
        [XFEATURE_BNDCSR]                        = X86_FEATURE_MPX,
        [XFEATURE_OPMASK]                        = X86_FEATURE_AVX512F,
        [XFEATURE_ZMM_Hi256]                        = X86_FEATURE_AVX512F,
        [XFEATURE_Hi16_ZMM]                        = X86_FEATURE_AVX512F,
        [XFEATURE_PT_UNIMPLEMENTED_SO_FAR]        = X86_FEATURE_INTEL_PT,
        [XFEATURE_PKRU]                                = X86_FEATURE_OSPKE,
        [XFEATURE_PASID]                        = X86_FEATURE_ENQCMD,
        [XFEATURE_CET_USER]                        = X86_FEATURE_SHSTK,
        [XFEATURE_XTILE_CFG]                        = X86_FEATURE_AMX_TILE,
        [XFEATURE_XTILE_DATA]                        = X86_FEATURE_AMX_TILE,
};

/*
 * Per-xfeature caches, indexed by xfeature number and filled in by
 * setup_xstate_cache():
 *
 *  xstate_offsets[] - offset of the component as reported in CPUID EBX;
 *                     stays at -1 for supervisor features and for
 *                     features that were never enumerated
 *  xstate_sizes[]   - size of the component as reported in CPUID EAX;
 *                     -1 until enumerated
 *  xstate_flags[]   - raw CPUID ECX value, tested with the
 *                     XSTATE_FLAG_* bits below
 */
static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
        { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
        { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;

/* Bits in xstate_flags[]: see xfeature_is_supervisor() / xfeature_is_aligned64() */
#define XSTATE_FLAG_SUPERVISOR        BIT(0)
#define XSTATE_FLAG_ALIGNED64        BIT(1)

/*
 * Return whether the system supports a given set of xfeatures.
 *
 * @xfeatures_needed:        mask of the xfeature bits the caller requires
 * @feature_name:        optional. When non-NULL, it receives the name of the
 *                        most advanced (highest numbered) missing feature, or,
 *                        if nothing is missing, of the most advanced
 *                        requested feature.
 *
 * Returns 1 when every requested bit is set in
 * fpu_kernel_cfg.max_features, 0 otherwise.
 */
int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
{
        u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;

        if (unlikely(feature_name)) {
                long xfeature_idx, max_idx;
                u64 xfeatures_print;
                /*
                 * So we use FLS here to be able to print the most advanced
                 * feature that was requested but is missing. So if a driver
                 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
                 * missing AVX feature - this is the most informative message
                 * to users:
                 */
                if (xfeatures_missing)
                        xfeatures_print = xfeatures_missing;
                else
                        xfeatures_print = xfeatures_needed;

                xfeature_idx = fls64(xfeatures_print)-1;
                max_idx = ARRAY_SIZE(xfeature_names)-1;

                /*
                 * Feature numbers past the end of the table map to its
                 * last entry. An empty mask yields fls64(0) - 1 == -1,
                 * which must not be used as an array index either, so
                 * send it to the same fallback entry.
                 */
                if (xfeature_idx < 0 || xfeature_idx > max_idx)
                        xfeature_idx = max_idx;

                *feature_name = xfeature_names[xfeature_idx];
        }

        if (xfeatures_missing)
                return 0;

        return 1;
}
EXPORT_SYMBOL_GPL(cpu_has_xfeatures);

/* True when the cached flags of @xfeature_nr have XSTATE_FLAG_ALIGNED64 set. */
static bool xfeature_is_aligned64(int xfeature_nr)
{
        unsigned int flags = xstate_flags[xfeature_nr];

        return !!(flags & XSTATE_FLAG_ALIGNED64);
}

/* True when the cached flags of @xfeature_nr have XSTATE_FLAG_SUPERVISOR set. */
static bool xfeature_is_supervisor(int xfeature_nr)
{
        unsigned int flags = xstate_flags[xfeature_nr];

        return !!(flags & XSTATE_FLAG_SUPERVISOR);
}

/*
 * Return the offset of @xfeature inside an xsave buffer whose set of
 * components is described by @xcomp_bv.
 */
static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
{
        unsigned int offset, nr;

        /* Without compaction the cached fixed offsets apply ... */
        if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED))
                return xstate_offsets[xfeature];

        /* ... and the legacy features (FP, SSE) always use them. */
        if (xfeature <= XFEATURE_SSE)
                return xstate_offsets[xfeature];

        /*
         * In the compacted format the position of a component depends on
         * which components precede it, i.e. on the xcomp_bv header field:
         * walk the extended features in @xcomp_bv and accumulate their
         * sizes until @xfeature is reached.
         */
        offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
        for_each_extended_xfeature(nr, xcomp_bv) {
                if (xfeature_is_aligned64(nr))
                        offset = ALIGN(offset, 64);

                if (nr != xfeature)
                        offset += xstate_sizes[nr];
                else
                        break;
        }

        return offset;
}

/*
 * Turn on the extended processor state save/restore feature on the
 * current CPU. Called once per CPU onlining.
 */
void fpu__init_cpu_xstate(void)
{
        if (!boot_cpu_has(X86_FEATURE_XSAVE))
                return;

        if (!fpu_kernel_cfg.max_features)
                return;

        cr4_set_bits(X86_CR4_OSXSAVE);

        /*
         * The XFD write has to come after the CR4 setup and before
         * xsetbv() to allow KVM lazy passthrough. It is done independent
         * of the dynamic state static key because that key does not work
         * on the boot CPU. Writing here also wipes any stale state out of
         * XFD and resets the per CPU xfd cache.
         */
        if (cpu_feature_enabled(X86_FEATURE_XFD))
                xfd_set_state(init_fpstate.xfd);

        /*
         * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) selects the user features
         * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user
         * states can be set here.
         */
        xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);

        /* Supervisor states exist only with XSAVES; nothing more to do without it. */
        if (!boot_cpu_has(X86_FEATURE_XSAVES))
                return;

        /* MSR_IA32_XSS selects the supervisor states managed by XSAVES. */
        wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
                             xfeatures_mask_independent());
}

/* True when @xfeature is part of fpu_kernel_cfg.max_features. */
static bool xfeature_enabled(enum xfeature xfeature)
{
        return !!(fpu_kernel_cfg.max_features & BIT_ULL(xfeature));
}

/*
 * Fill the xstate_offsets[], xstate_sizes[] and xstate_flags[] caches
 * describing the XSAVE state memory layout.
 */
static void __init setup_xstate_cache(void)
{
        /* Offset tracking starts at the beginning of the "extended state" */
        unsigned int prev_offset = offsetof(struct xregs_state,
                                            extended_state_area);
        u32 size, offset, flags, unused, nr;

        /*
         * FP and SSE are legacy states: they live at fixed offsets in the
         * xsave area, in compacted as well as in standard form.
         */
        xstate_offsets[XFEATURE_FP] = 0;
        xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, xmm_space);

        xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP];
        xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space);

        for_each_extended_xfeature(nr, fpu_kernel_cfg.max_features) {
                cpuid_count(XSTATE_CPUID, nr, &size, &offset, &flags, &unused);

                xstate_sizes[nr] = size;
                xstate_flags[nr] = flags;

                /*
                 * For a supervisor state the offset reported in EBX is
                 * invalid, so its cached offset stays at -1.
                 */
                if (!xfeature_is_supervisor(nr)) {
                        xstate_offsets[nr] = offset;

                        /*
                         * The xstate size checks assume that the
                         * highest-numbered xstate feature has the highest
                         * offset in the buffer. Make sure that holds.
                         */
                        WARN_ONCE(prev_offset > xstate_offsets[nr],
                                  "x86/fpu: misordered xstate at %d\n", prev_offset);

                        prev_offset = xstate_offsets[nr];
                }
        }
}

/* Log @xstate_mask together with its feature name if the system supports it. */
static void __init print_xstate_feature(u64 xstate_mask)
{
        const char *name;

        if (!cpu_has_xfeatures(xstate_mask, &name))
                return;

        pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, name);
}

/*
 * Print every xstate feature from the list below that the system
 * supports, in list order:
 */
static void __init print_xstate_features(void)
{
        const u64 masks[] = {
                XFEATURE_MASK_FP,
                XFEATURE_MASK_SSE,
                XFEATURE_MASK_YMM,
                XFEATURE_MASK_BNDREGS,
                XFEATURE_MASK_BNDCSR,
                XFEATURE_MASK_OPMASK,
                XFEATURE_MASK_ZMM_Hi256,
                XFEATURE_MASK_Hi16_ZMM,
                XFEATURE_MASK_PKRU,
                XFEATURE_MASK_PASID,
                XFEATURE_MASK_CET_USER,
                XFEATURE_MASK_XTILE_CFG,
                XFEATURE_MASK_XTILE_DATA,
        };
        unsigned int idx;

        for (idx = 0; idx < ARRAY_SIZE(masks); idx++)
                print_xstate_feature(masks[idx]);
}

/*
 * Warn when @nr is not a valid extended xfeature number, i.e. when it
 * lies outside [FIRST_EXTENDED_XFEATURE, XFEATURE_MAX).
 *
 * This check is important because it is easy to get XSTATE_*
 * confused with XSTATE_BIT_*.
 *
 * @nr is evaluated twice, so it must not have side effects.
 */
#define CHECK_XFEATURE(nr) do {                                \
        WARN_ON((nr) < FIRST_EXTENDED_XFEATURE);        \
        WARN_ON((nr) >= XFEATURE_MAX);                        \
} while (0)

/*
 * Log offset and size of every extended xstate component in
 * fpu_kernel_cfg.max_features
 */
static void __init print_xstate_offset_size(void)
{
        int nr;

        for_each_extended_xfeature(nr, fpu_kernel_cfg.max_features) {
                unsigned int offset = xfeature_get_offset(fpu_kernel_cfg.max_features, nr);
                unsigned int size = xstate_sizes[nr];

                pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
                        nr, offset, nr, size);
        }
}

/*
 * Boot-time only restore of @xstate: used while the x86 caps are not set
 * up yet and alternatives can not be used.
 */
static __init void os_xrstor_booting(struct xregs_state *xstate)
{
        u64 features = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
        /*
         * The operand names below are kept as they are: XSTATE_OP() is a
         * macro defined outside this file and may depend on them.
         */
        u32 lmask = features;
        u32 hmask = features >> 32;
        int err;

        if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
                XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
        else
                XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);

        /*
         * Copying from a kernel buffer should never fault, and the FPU
         * state set up at boot time should be valid.
         */
        WARN_ON_FPU(err);
}

/*
 * All supported features have either init state all zeros or are
 * handled in setup_init_fpu_buf() individually. This is an explicit
 * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
 * newly added supported features at build time and make people
 * actually look at the init state for the new feature.
 *
 * The BUILD_BUG_ON() in setup_init_fpu_buf() compares this list against
 * XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED.
 */
#define XFEATURES_INIT_FPSTATE_HANDLED                \
        (XFEATURE_MASK_FP |                        \
         XFEATURE_MASK_SSE |                        \
         XFEATURE_MASK_YMM |                        \
         XFEATURE_MASK_OPMASK |                        \
         XFEATURE_MASK_ZMM_Hi256 |                \
         XFEATURE_MASK_Hi16_ZMM         |                \
         XFEATURE_MASK_PKRU |                        \
         XFEATURE_MASK_BNDREGS |                \
         XFEATURE_MASK_BNDCSR |                        \
         XFEATURE_MASK_PASID |                        \
         XFEATURE_MASK_CET_USER |                \
         XFEATURE_MASK_XTILE)

/*
 * Build the xstate image that represents the init state (init_fpstate)
 */
static void __init setup_init_fpu_buf(void)
{
        struct xregs_state *xsave = &init_fpstate.regs.xsave;

        /* Every supported feature must appear in the explicitly handled list */
        BUILD_BUG_ON(XFEATURES_INIT_FPSTATE_HANDLED !=
                     (XFEATURE_MASK_USER_SUPPORTED |
                      XFEATURE_MASK_SUPERVISOR_SUPPORTED));

        if (!boot_cpu_has(X86_FEATURE_XSAVE))
                return;

        print_xstate_features();

        xstate_init_xcomp_bv(xsave, init_fpstate.xfeatures);

        /*
         * Restore with header.xfeatures being 0x0, which puts all the
         * features into their init state.
         */
        os_xrstor_booting(xsave);

        /*
         * With all components in init state, read the state back so that
         * init_fpstate holds every non-zero init value. Only XSAVE can do
         * that: XSAVEOPT and XSAVEC/S use the init optimization, which
         * skips writing data for components in init state.
         *
         * Using XSAVE would require reshuffling the data when XSAVEC/S is
         * available, because those use xstate compaction. That would be a
         * pointless exercise: apart from the legacy components (FP and
         * SSE), most components have an all zeros init state, and the
         * legacy ones can be saved with FXSAVE into the legacy area.
         * Whoever adds a new feature has to make sure its init state is
         * all zeroes, or else add the necessary handling here.
         */
        fxsave(&init_fpstate.regs.fxsave);
}

/*
 * Return the size of xstate component @xfeature_nr as reported in EAX of
 * CPUID leaf XSTATE_CPUID, sub-leaf @xfeature_nr. An out-of-range number
 * only triggers a warning; the CPUID query is issued regardless.
 */
int xfeature_size(int xfeature_nr)
{
        u32 size, unused_b, unused_c, unused_d;

        CHECK_XFEATURE(xfeature_nr);

        cpuid_count(XSTATE_CPUID, xfeature_nr, &size, &unused_b, &unused_c, &unused_d);

        return size;
}

/*
 * Validate an xstate header supplied by userspace (ptrace or sigreturn).
 *
 * Returns 0 when the header is acceptable and -EINVAL when
 *  - a feature outside fpstate->user_xfeatures (unknown or supervisor)
 *    is set,
 *  - xcomp_bv is non-zero (userspace must use the uncompacted format), or
 *  - any reserved byte is non-zero.
 */
static int validate_user_xstate_header(const struct xstate_header *hdr,
                                       struct fpstate *fpstate)
{
        /*
         * Should 'reserved' ever be shrunken to make room for a new field,
         * that new field needs validation in here!
         */
        BUILD_BUG_ON(sizeof(hdr->reserved) != 48);

        if ((hdr->xfeatures & ~fpstate->user_xfeatures) ||
            hdr->xcomp_bv ||
            memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
                return -EINVAL;

        return 0;
}

/*
 * Dump the sub-leaves of CPUID leaf XSTATE_CPUID to the log. Only the
 * first call prints anything; later calls return immediately.
 */
static void __init __xstate_dump_leaves(void)
{
        static bool dumped;
        u32 eax, ebx, ecx, edx;
        int leaf;

        if (dumped)
                return;
        dumped = true;

        /*
         * Go a few leaves beyond the ones we support, just in case there
         * are some goodies up there
         */
        for (leaf = 0; leaf < XFEATURE_MAX + 10; leaf++) {
                cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
                pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
                        XSTATE_CPUID, leaf, eax, ebx, ecx, edx);
        }
}

/*
 * WARN_ONCE() with an "XSAVE consistency problem: " prefix. Whenever
 * WARN_ONCE() reports the condition @x as true, __xstate_dump_leaves()
 * is called as well to dump the xstate CPUID leaves.
 */
#define XSTATE_WARN_ON(x, fmt, ...) do {                                        \
        if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) {        \
                __xstate_dump_leaves();                                                \
        }                                                                        \
} while (0)

/*
 * Compare the CPU-reported size @sz of xfeature @nr with
 * sizeof(@__struct). A mismatch produces a one-time warning plus a dump
 * of the xstate CPUID leaves, but the expression always evaluates to
 * true - a mismatch is reported, not treated as a failure.
 */
#define XCHECK_SZ(sz, nr, __struct) ({                                        \
        if (WARN_ONCE(sz != sizeof(__struct),                                \
            "[%s]: struct is %zu bytes, cpu state %d bytes\n",                \
            xfeature_names[nr], sizeof(__struct), sz)) {                \
                __xstate_dump_leaves();                                        \
        }                                                                \
        true;                                                                \
})


/**
 * check_xtile_data_against_struct - Check tile data state size.
 * @size:        The tile data state size
 *
 * Calculate the state size by multiplying the single tile size which is
 * recorded in a C struct, and the number of tiles that the CPU informs.
 * Compare the provided size with the calculation.
 *
 * Every palette enumerated by the CPU must report a per-tile size equal
 * to sizeof(struct xtile_data); the largest tile count over all palettes
 * is used for the calculation. On a mismatch an error is logged and the
 * xstate CPUID leaves are dumped.
 *
 * Returns:        0 on success, -EINVAL on mismatch.
 */
static int __init check_xtile_data_against_struct(int size)
{
        u32 max_palid, palid, state_size;
        u32 eax, ebx, ecx, edx;
        u16 max_tile;

        /*
         * Check the maximum palette id:
         *   eax: the highest numbered palette subleaf.
         */
        cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);

        /*
         * Cross-check each tile size and find the maximum number of
         * supported tiles.
         */
        for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
                u16 tile_size, max;

                /*
                 * Check the tile size info:
                 *   eax[31:16]:  bytes per tile
                 *   ebx[31:16]:  the max names (or max number of tiles)
                 *
                 * ecx and edx are not used, but each output gets its own
                 * variable.
                 */
                cpuid_count(TILE_CPUID, palid, &eax, &ebx, &ecx, &edx);
                tile_size = eax >> 16;
                max = ebx >> 16;

                if (tile_size != sizeof(struct xtile_data)) {
                        pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
                               __stringify(XFEATURE_XTILE_DATA),
                               sizeof(struct xtile_data), tile_size);
                        __xstate_dump_leaves();
                        return -EINVAL;
                }

                if (max > max_tile)
                        max_tile = max;
        }

        state_size = sizeof(struct xtile_data) * max_tile;
        if (size != state_size) {
                pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
                       __stringify(XFEATURE_XTILE_DATA), state_size, size);
                __xstate_dump_leaves();
                return -EINVAL;
        }
        return 0;
}

/*
 * Each 'xstate' has a C struct as its software representation. Verify
 * that the struct for feature @nr agrees with the size the CPU reports
 * for that state.
 *
 * Returns false only when there is no structure for @nr; size
 * mismatches are reported by the individual checks but still yield true.
 */
static bool __init check_xstate_against_struct(int nr)
{
        /* The size of the state according to the CPU */
        int cpu_size = xfeature_size(nr);

        /* Pair the CPU state with its software structure */
        switch (nr) {
        case XFEATURE_YMM:
                return XCHECK_SZ(cpu_size, nr, struct ymmh_struct);
        case XFEATURE_BNDREGS:
                return XCHECK_SZ(cpu_size, nr, struct mpx_bndreg_state);
        case XFEATURE_BNDCSR:
                return XCHECK_SZ(cpu_size, nr, struct mpx_bndcsr_state);
        case XFEATURE_OPMASK:
                return XCHECK_SZ(cpu_size, nr, struct avx_512_opmask_state);
        case XFEATURE_ZMM_Hi256:
                return XCHECK_SZ(cpu_size, nr, struct avx_512_zmm_uppers_state);
        case XFEATURE_Hi16_ZMM:
                return XCHECK_SZ(cpu_size, nr, struct avx_512_hi16_state);
        case XFEATURE_PKRU:
                return XCHECK_SZ(cpu_size, nr, struct pkru_state);
        case XFEATURE_PASID:
                return XCHECK_SZ(cpu_size, nr, struct ia32_pasid_state);
        case XFEATURE_XTILE_CFG:
                return XCHECK_SZ(cpu_size, nr, struct xtile_cfg);
        case XFEATURE_CET_USER:
                return XCHECK_SZ(cpu_size, nr, struct cet_user_state);
        case XFEATURE_XTILE_DATA:
                /* Logs problems itself; its return value is not used here */
                check_xtile_data_against_struct(cpu_size);
                return true;
        default:
                XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
                return false;
        }
}

/*
 * Calculate the size of an xsave buffer that holds the features in
 * @xfeatures.
 *
 * @xfeatures:        mask of the xfeatures the buffer has to hold
 * @compacted:        true to calculate for the compacted format, false for
 *                the standard format
 *
 * The size is the end of the topmost (highest numbered) feature in the
 * mask. A mask with nothing above SSE needs only
 * sizeof(struct xregs_state).
 */
static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
{
        unsigned int topmost, offset;

        /*
         * An empty mask has no topmost feature: fls64(0) - 1 would wrap
         * around and be used as an out-of-bounds index into the caches
         * below. Treat it like a legacy-only mask.
         */
        if (!xfeatures)
                return sizeof(struct xregs_state);

        topmost = fls64(xfeatures) -  1;
        if (topmost <= XFEATURE_SSE)
                return sizeof(struct xregs_state);

        /* Look at the caches only once topmost is known to be usable */
        if (compacted)
                offset = xfeature_get_offset(xfeatures, topmost);
        else
                offset = xstate_offsets[topmost];

        return offset + xstate_sizes[topmost];
}

/*
 * Double-check what the CPU told us about the required size of the XSAVE
 * buffer by recalculating it, to be safe.
 *
 * Independent XSAVE features allocate their own buffers and are not
 * covered by these checks. Only the size of the buffer for task->fpu
 * is checked here.
 *
 * Returns true when the recalculated size equals @kernel_size and no
 * per-feature check failed.
 */
static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
{
        bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
        bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
        unsigned int calculated;
        int nr;

        for_each_extended_xfeature(nr, fpu_kernel_cfg.max_features) {
                if (!check_xstate_against_struct(nr))
                        return false;

                /* Only XSAVES can manage supervisor state components. */
                if (xsaves || !xfeature_is_supervisor(nr))
                        continue;

                XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", nr);
                return false;
        }

        calculated = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
        XSTATE_WARN_ON(calculated != kernel_size,
                       "size %u != kernel_size %u\n", calculated, kernel_size);

        return calculated == kernel_size;
}

/*
 * Return the total size of the xstates enabled in XCR0 | IA32_XSS.
 *
 * Mind the SDM's wording: "sub-function 0" only enumerates the size of
 * the *user* states. A buffer sized from it and then used with 'XSAVES'
 * could overflow, because 'XSAVES' saves system states too.
 *
 * Compaction is taken into account as well, so the result is also valid
 * for XSAVEC.
 */
static unsigned int __init get_compacted_size(void)
{
        unsigned int unused_a, size, unused_c, unused_d;

        /*
         * CPUID function 0DH, sub-function 1:
         *    EBX enumerates the size (in bytes) required by the XSAVES
         *    instruction for an XSAVE area containing all the state
         *    components corresponding to bits currently set in
         *    XCR0 | IA32_XSS.
         *
         * If XSAVES is not available but XSAVEC is (virt), there are no
         * supervisor states, yet XSAVEC still uses the compacted format.
         */
        cpuid_count(XSTATE_CPUID, 1, &unused_a, &size, &unused_c, &unused_d);

        return size;
}

/*
 * Return the total size of the enabled xstates, leaving out the
 * independent supervisor features.
 */
static unsigned int __init get_xsave_compacted_size(void)
{
        u64 independent = xfeatures_mask_independent();
        unsigned int task_size;

        /* No independent features: the enumerated size is already right. */
        if (!independent)
                return get_compacted_size();

        /* Take the independent features out of IA32_XSS ... */
        wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());

        /*
         * ... so that the hardware reports the size required for the
         * task->fpu buffer ...
         */
        task_size = get_compacted_size();

        /* ... and put them back so XSAVES will work on them again. */
        wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | independent);

        return task_size;
}

/* Return the uncompacted XSAVE area size for the user states enabled in XCR0. */
static unsigned int __init get_xsave_size_user(void)
{
        unsigned int unused_a, size, unused_c, unused_d;

        /*
         * CPUID function 0DH, sub-function 0:
         *    EBX enumerates the size (in bytes) required by the XSAVE
         *    instruction for an XSAVE area containing all the *user*
         *    state components corresponding to bits currently set in
         *    XCR0.
         */
        cpuid_count(XSTATE_CPUID, 0, &unused_a, &size, &unused_c, &unused_d);

        return size;
}

/*
 * Recompute the context sizes for the enabled features and store them in
 * fpu_kernel_cfg and fpu_user_cfg.
 *
 * Returns 0 on success, or -EINVAL (with both configs left untouched)
 * when the kernel size fails the paranoid validation.
 */
static int __init init_xstate_size(void)
{
        bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
        unsigned int user_size, kernel_size, kernel_default_size;

        /* User space always sees the uncompacted size */
        user_size = get_xsave_size_user();

        /*
         * With XSAVES the kernel size includes supervisor states and the
         * compacted format is used. XSAVEC uses the compacted format too,
         * but does not save supervisor states.
         *
         * XSAVE[OPT] do not support supervisor states, which makes kernel
         * and user size identical.
         */
        kernel_size = compacted ? get_xsave_compacted_size() : user_size;

        kernel_default_size =
                xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);

        if (!paranoid_xstate_size_valid(kernel_size))
                return -EINVAL;

        fpu_kernel_cfg.max_size = kernel_size;
        fpu_kernel_cfg.default_size = kernel_default_size;

        fpu_user_cfg.max_size = user_size;
        fpu_user_cfg.default_size =
                xstate_calculate_size(fpu_user_cfg.default_features, false);

        return 0;
}

/*
 * The XSAVE hardware was enabled, but something went wrong and it can
 * not be used. Turn it off again and fall back to @legacy_size for all
 * buffer sizes.
 */
static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
{
        fpu_kernel_cfg.max_features = 0;
        cr4_clear_bits(X86_CR4_OSXSAVE);
        setup_clear_cpu_cap(X86_FEATURE_XSAVE);

        /* Kernel and user buffers all go back to the legacy size. */
        fpu_user_cfg.default_size = legacy_size;
        fpu_user_cfg.max_size = legacy_size;
        fpu_kernel_cfg.default_size = legacy_size;
        fpu_kernel_cfg.max_size = legacy_size;

        /*
         * Keep the static branch which enables writes to the XFD MSR
         * from being enabled.
         */
        init_fpstate.xfd = 0;

        fpstate_reset(&current->thread.fpu);
}

/*
 * Enable and initialize the xsave feature.
 * Called once per system bootup.
 *
 * @legacy_size is only used on the failure path: it is handed to
 * fpu__init_disable_system_xstate() so the size configuration can be
 * restored when XSAVE has to be turned off again.
 *
 * The function returns early, without touching the configuration, when
 * there is no FPU, no XSAVE support or the CPUID level is too low.
 */
void __init fpu__init_system_xstate(unsigned int legacy_size)
{
        unsigned int eax, ebx, ecx, edx;
        u64 xfeatures;
        int err;
        int i;

        if (!boot_cpu_has(X86_FEATURE_FPU)) {
                pr_info("x86/fpu: No FPU detected\n");
                return;
        }

        if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
                pr_info("x86/fpu: x87 FPU will use %s\n",
                        boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
                return;
        }

        if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
                WARN_ON_FPU(1);
                return;
        }

        /*
         * Find user xstates supported by the processor.
         */
        cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
        fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);

        /*
         * Find supervisor xstates supported by the processor.
         */
        cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
        fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);

        if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
                /*
                 * This indicates that something really unexpected happened
                 * with the enumeration.  Disable XSAVE and try to continue
                 * booting without it.  This is too early to BUG().
                 */
                pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
                       fpu_kernel_cfg.max_features);
                goto out_disable;
        }

        /*
         * Clear XSAVE features that are disabled in the normal CPUID.
         */
        for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
                unsigned short cid = xsave_cpuid_features[i];

                /*
                 * Careful: X86_FEATURE_FPU is 0! A zero entry therefore
                 * only means "no CPUID feature" for indices other than
                 * XFEATURE_FP.
                 */
                if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
                        fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
        }

        /* Without XFD the dynamically enabled user features are unavailable */
        if (!cpu_feature_enabled(X86_FEATURE_XFD))
                fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;

        /* Supervisor features are only kept when XSAVES is enabled */
        if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
                fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
        else
                fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
                                        XFEATURE_MASK_SUPERVISOR_SUPPORTED;

        /* The user configuration is the kernel one minus supervisor features */
        fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
        fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;

        /* Clean out dynamic features from default */
        fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
        fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;

        fpu_user_cfg.default_features = fpu_user_cfg.max_features;
        fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;

        /* Store it for paranoia check at the end */
        xfeatures = fpu_kernel_cfg.max_features;

        /*
         * Initialize the default XFD state in init_fpstate and enable the
         * dynamic sizing mechanism if dynamic states are available.  The
         * static key cannot be enabled here because this runs before
         * jump_label_init(). This is delayed to an initcall.
         */
        init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;

        /* Set up compaction feature bit */
        if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
            cpu_feature_enabled(X86_FEATURE_XSAVES))
                setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);

        /* Enable xstate instructions to be able to continue with initialization: */
        fpu__init_cpu_xstate();

        /* Cache size, offset and flags for initialization */
        setup_xstate_cache();

        err = init_xstate_size();
        if (err)
                goto out_disable;

        /* Reset the state for the current task */
        fpstate_reset(&current->thread.fpu);

        /*
         * Update info used for ptrace frames; use standard-format size and no
         * supervisor xstates:
         */
        update_regset_xstate_info(fpu_user_cfg.max_size,
                                  fpu_user_cfg.max_features);

        /*
         * init_fpstate excludes dynamic states as they are large but init
         * state is zero.
         */
        init_fpstate.size                = fpu_kernel_cfg.default_size;
        init_fpstate.xfeatures                = fpu_kernel_cfg.default_features;

        /* The default state must fit into the init_fpstate register buffer */
        if (init_fpstate.size > sizeof(init_fpstate.regs)) {
                pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n",
                        sizeof(init_fpstate.regs), init_fpstate.size);
                goto out_disable;
        }

        setup_init_fpu_buf();

        /*
         * Paranoia check whether something in the setup modified the
         * xfeatures mask.
         */
        if (xfeatures != fpu_kernel_cfg.max_features) {
                pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
                       xfeatures, fpu_kernel_cfg.max_features);
                goto out_disable;
        }

        /*
         * CPU capabilities initialization runs before FPU init. So
         * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
         * functional, set the feature bit so depending code works.
         */
        setup_force_cpu_cap(X86_FEATURE_OSXSAVE);

        print_xstate_offset_size();
        pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
                fpu_kernel_cfg.max_features,
                fpu_kernel_cfg.max_size,
                boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
        return;

out_disable:
        /* something went wrong, try to boot without any XSAVE support */
        fpu__init_disable_system_xstate(legacy_size);
}

/*
 * Restore minimal FPU state after suspend:
 */
void fpu__resume_cpu(void)
{
        /*
         * Restore XCR0 on xsave capable CPUs:
         */
        if (cpu_feature_enabled(X86_FEATURE_XSAVE))
                xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);

        /*
         * Restore IA32_XSS. The same CPUID bit enumerates support
         * of XSAVES and MSR_IA32_XSS.
         */
        if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
                wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()  |
                                     xfeatures_mask_independent());
        }

        if (fpu_state_size_dynamic())
                wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
}

/*
 * Translate an xstate feature nr into the location of that feature's
 * state inside the xsave buffer.  Callers should ensure that the buffer
 * is valid.
 *
 * Returns NULL, after a one-time warning, when the feature is not
 * enabled or when the compacted format is in use and the feature's bit
 * is missing from the buffer's XCOMP_BV field.
 */
static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
{
        u64 comp_mask = xsave->header.xcomp_bv;
        void *base = xsave;

        if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
                return NULL;

        if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED) &&
            WARN_ON_ONCE(!(comp_mask & BIT_ULL(xfeature_nr))))
                return NULL;

        return base + xfeature_get_offset(comp_mask, xfeature_nr);
}

/*
 * Look up the address of one state component inside an xsave area.
 *
 * This is the API to obtain an xstate address for an xsave area in
 * either standard or compacted format.
 *
 * Inputs:
 *        xsave: the storage area holding the FPU data
 *        xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
 *        XFEATURE_SSE, etc...)
 * Output:
 *        address of the state in the xsave area, or NULL if the
 *        field is not present in the xsave buffer.
 */
void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
{
        /* Without XSAVE there is no xsave state to look into */
        if (!boot_cpu_has(X86_FEATURE_XSAVE))
                return NULL;

        /* Asking for a feature which was never enabled is a caller bug */
        if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
                return NULL;

        /*
         * The header bit tells whether the last 'xsave*' instruction
         * stored this component.  When the bit is clear, the buffer may
         * still hold an old value of the field: either the last 'xsave'
         * did not request the feature (unlikely) or the "init
         * optimization" caused it to not be saved.  Only hand out the
         * address when the bit is set.
         */
        if (xsave->header.xfeatures & BIT_ULL(xfeature_nr))
                return __raw_xsave_addr(xsave, xfeature_nr);

        return NULL;
}
EXPORT_SYMBOL_GPL(get_xsave_addr);

#ifdef CONFIG_ARCH_HAS_PKEYS

/*
 * This will go out and modify PKRU register to set the access
 * rights for @pkey to @init_val.
 *
 * @tsk:        not referenced by this implementation
 * @pkey:        protection key index; must be in [0, arch_max_pkey())
 * @init_val:        combination of PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE
 *
 * Returns 0 on success and -EINVAL when OSPKE is not enabled or when
 * @pkey is out of range.
 */
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
                              unsigned long init_val)
{
        u32 old_pkru, new_pkru_bits = 0;
        int pkey_shift;

        /*
         * This check implies XSAVE support.  OSPKE only gets
         * set if we enable XSAVE and we enable PKU in XCR0.
         */
        if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
                return -EINVAL;

        /*
         * This code should only be called with valid 'pkey'
         * values originating from in-kernel users.  Complain
         * if a bad value is observed.
         *
         * @pkey is signed: a negative value would pass an upper-bound
         * only check and then be used as a negative shift count below,
         * which is undefined behaviour. Reject it here as well.
         */
        if (WARN_ON_ONCE(pkey < 0 || pkey >= arch_max_pkey()))
                return -EINVAL;

        /* Set the bits we need in PKRU:  */
        if (init_val & PKEY_DISABLE_ACCESS)
                new_pkru_bits |= PKRU_AD_BIT;
        if (init_val & PKEY_DISABLE_WRITE)
                new_pkru_bits |= PKRU_WD_BIT;

        /* Shift the bits in to the correct place in PKRU for pkey: */
        pkey_shift = pkey * PKRU_BITS_PER_PKEY;
        new_pkru_bits <<= pkey_shift;

        /* Get old PKRU and mask off any old bits in place: */
        old_pkru = read_pkru();
        old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);

        /* Write old part along with new part: */
        write_pkru(old_pkru | new_pkru_bits);

        return 0;
}
#endif /* CONFIG_ARCH_HAS_PKEYS */

/*
 * Append @size bytes to @to, taken from @xstate when @from_xstate is
 * true and from @init_xstate otherwise.
 */
static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
                         void *init_xstate, unsigned int size)
{
        void *src = init_xstate;

        if (from_xstate)
                src = xstate;

        membuf_write(to, src, size);
}

/**
 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
 * @to:                membuf descriptor
 * @fpstate:        The fpstate buffer from which to copy
 * @xfeatures:        The mask of xfeatures to save (XSAVE mode only)
 * @pkru_val:        The PKRU value to store in the PKRU component
 * @copy_mode:        The requested copy mode
 *
 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
 * format, i.e. from the kernel internal hardware dependent storage format
 * to the requested @copy_mode. UABI XSTATE is always uncompacted!
 *
 * It supports partial copy but @to.pos always starts from zero.
 *
 * Whatever space is left in @to after the copy is zeroed.
 */
void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
                               u64 xfeatures, u32 pkru_val,
                               enum xstate_copy_mode copy_mode)
{
        const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
        struct xregs_state *xinit = &init_fpstate.regs.xsave;
        struct xregs_state *xsave = &fpstate->regs.xsave;
        struct xstate_header header;
        unsigned int zerofrom;
        u64 mask;
        int i;

        /*
         * Build the header which is handed out from scratch: everything
         * except the xfeatures field stays zero.
         */
        memset(&header, 0, sizeof(header));
        header.xfeatures = xsave->header.xfeatures;

        /* Mask out the feature bits depending on copy mode */
        switch (copy_mode) {
        case XSTATE_COPY_FP:
                header.xfeatures &= XFEATURE_MASK_FP;
                break;

        case XSTATE_COPY_FX:
                header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
                break;

        case XSTATE_COPY_XSAVE:
                header.xfeatures &= fpstate->user_xfeatures & xfeatures;
                break;
        }

        /*
         * The legacy area is emitted piecewise. For each piece the data
         * comes from @fpstate when the governing feature bit is set in
         * the header and from init_fpstate otherwise.
         */

        /* Copy FP state up to MXCSR */
        copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
                     &xinit->i387, off_mxcsr);

        /* Copy MXCSR when SSE or YMM are set in the feature mask */
        copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
                     &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
                     MXCSR_AND_FLAGS_SIZE);

        /* Copy the remaining FP state */
        copy_feature(header.xfeatures & XFEATURE_MASK_FP,
                     &to, &xsave->i387.st_space, &xinit->i387.st_space,
                     sizeof(xsave->i387.st_space));

        /* Copy the SSE state - shared with YMM, but independently managed */
        copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
                     &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
                     sizeof(xsave->i387.xmm_space));

        /* The FP and FX modes end after the legacy area */
        if (copy_mode != XSTATE_COPY_XSAVE)
                goto out;

        /* Zero the padding area */
        membuf_zero(&to, sizeof(xsave->i387.padding));

        /* Copy xsave->i387.sw_reserved */
        membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));

        /* Copy the user space relevant state of @xsave->header */
        membuf_write(&to, &header, sizeof(header));

        zerofrom = offsetof(struct xregs_state, extended_state_area);

        /*
         * This 'mask' indicates which states to copy from fpstate.
         * Those extended states that are not present in fpstate are
         * either disabled or initialized:
         *
         * In non-compacted format, disabled features still occupy
         * state space but there is no state to copy from in the
         * compacted init_fpstate. The gap tracking will zero these
         * states.
         *
         * The extended features have an all zeroes init state. Thus,
         * remove them from 'mask' to zero those features in the user
         * buffer instead of retrieving them from init_fpstate.
         */
        mask = header.xfeatures;

        for_each_extended_xfeature(i, mask) {
                /*
                 * If there was a feature or alignment gap, zero the space
                 * in the destination buffer.
                 */
                if (zerofrom < xstate_offsets[i])
                        membuf_zero(&to, xstate_offsets[i] - zerofrom);

                if (i == XFEATURE_PKRU) {
                        struct pkru_state pkru = {0};
                        /*
                         * PKRU is not necessarily up to date in the
                         * XSAVE buffer. Use the provided value.
                         */
                        pkru.pkru = pkru_val;
                        membuf_write(&to, &pkru, sizeof(pkru));
                } else {
                        membuf_write(&to,
                                     __raw_xsave_addr(xsave, i),
                                     xstate_sizes[i]);
                }
                /*
                 * Keep track of the last copied state in the non-compacted
                 * target buffer for gap zeroing.
                 */
                zerofrom = xstate_offsets[i] + xstate_sizes[i];
        }

out:
        /* Zero the remainder of the destination buffer, if any */
        if (to.left)
                membuf_zero(&to, to.left);
}

/**
 * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
 * @to:                membuf descriptor
 * @tsk:        The task from which to copy the saved xstate
 * @copy_mode:        The requested copy mode
 *
 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
 * format, i.e. from the kernel internal hardware dependent storage format
 * to the requested @mode. UABI XSTATE is always uncompacted!
 *
 * It supports partial copy but @to.pos always starts from zero.
 */
void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
                             enum xstate_copy_mode copy_mode)
{
        __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
                                  tsk->thread.fpu.fpstate->user_xfeatures,
                                  tsk->thread.pkru, copy_mode);
}

/*
 * Fetch @size bytes at @offset of the source buffer into @dst. A
 * non-NULL @kbuf is used as a kernel source; otherwise the bytes are
 * read from the user space buffer @ubuf.
 *
 * Returns 0 on success, -EFAULT when the user space read fails.
 */
static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
                            const void *kbuf, const void __user *ubuf)
{
        if (!kbuf)
                return copy_from_user(dst, ubuf + offset, size) ? -EFAULT : 0;

        memcpy(dst, kbuf + offset, size);
        return 0;
}


/**
 * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
 * @fpstate:        The fpstate buffer to copy to
 * @kbuf:        The UABI format buffer, if it comes from the kernel
 * @ubuf:        The UABI format buffer, if it comes from userspace
 * @pkru:        The location to write the PKRU value to
 *
 * Converts from the UABI format into the kernel internal hardware
 * dependent format.
 *
 * This function ultimately has three different callers with distinct PKRU
 * behavior.
 * 1.        When called from sigreturn the PKRU register will be restored from
 *        @fpstate via an XRSTOR. Correctly copying the UABI format buffer to
 *        @fpstate is sufficient to cover this case, but the caller will also
 *        pass a pointer to the thread_struct's pkru field in @pkru and updating
 *        it is harmless.
 * 2.        When called from ptrace the PKRU register will be restored from the
 *        thread_struct's pkru field. A pointer to that is passed in @pkru.
 *        The kernel will restore it manually, so the XRSTOR behavior that resets
 *        the PKRU register to the hardware init value (0) if the corresponding
 *        xfeatures bit is not set is emulated here.
 * 3.        When called from KVM the PKRU register will be restored from the vcpu's
 *        pkru field. A pointer to that is passed in @pkru. KVM hasn't used
 *        XRSTOR and hasn't had the PKRU resetting behavior described above. To
 *        preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
 *        bit is not set.
 *
 * Note that @pkru is dereferenced without a NULL check when the PKRU bit
 * is set in the incoming header, so NULL is only acceptable when that bit
 * is clear.
 *
 * Return: 0 on success, -EFAULT when reading from the source buffer fails,
 * -EINVAL when the header or the MXCSR value does not pass validation.
 */
static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
                               const void __user *ubuf, u32 *pkru)
{
        struct xregs_state *xsave = &fpstate->regs.xsave;
        unsigned int offset, size;
        struct xstate_header hdr;
        u64 mask;
        int i;

        /* Fetch and validate the header before touching any state */
        offset = offsetof(struct xregs_state, header);
        if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
                return -EFAULT;

        if (validate_user_xstate_header(&hdr, fpstate))
                return -EINVAL;

        /* Validate MXCSR when any of the related features is in use */
        mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
        if (hdr.xfeatures & mask) {
                /* mxcsr[0] is MXCSR, mxcsr[1] is MXCSR_MASK */
                u32 mxcsr[2];

                offset = offsetof(struct fxregs_state, mxcsr);
                if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
                        return -EFAULT;

                /* Reserved bits in MXCSR must be zero. */
                if (mxcsr[0] & ~mxcsr_feature_mask)
                        return -EINVAL;

                /* SSE and YMM require MXCSR even when FP is not in use. */
                if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
                        xsave->i387.mxcsr = mxcsr[0];
                        xsave->i387.mxcsr_mask = mxcsr[1];
                }
        }

        /*
         * Copy every component which is flagged in the incoming header
         * from its UABI offset to its location in the kernel buffer.
         */
        for (i = 0; i < XFEATURE_MAX; i++) {
                mask = BIT_ULL(i);

                if (hdr.xfeatures & mask) {
                        void *dst = __raw_xsave_addr(xsave, i);

                        offset = xstate_offsets[i];
                        size = xstate_sizes[i];

                        if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
                                return -EFAULT;
                }
        }

        if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
                struct pkru_state *xpkru;

                /* Hand the PKRU value which was just copied in to the caller */
                xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
                *pkru = xpkru->pkru;
        } else {
                /*
                 * KVM may pass NULL here to indicate that it does not need
                 * PKRU updated.
                 */
                if (pkru)
                        *pkru = 0;
        }

        /*
         * The state that came in from userspace was user-state only.
         * Mask all the user states out of 'xfeatures':
         */
        xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;

        /*
         * Add back in the features that came in from userspace:
         */
        xsave->header.xfeatures |= hdr.xfeatures;

        return 0;
}

/*
 * Kernel buffer variant of the UABI to xstate conversion: takes a ptrace
 * standard-format image in @kbuf, converts it to kernel XSAVE[S] format
 * and stores it in @fpstate. Used by ptrace and KVM.
 */
int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
{
        /* No user space buffer is supplied on this path */
        const void __user *no_ubuf = NULL;

        return copy_uabi_to_xstate(fpstate, kbuf, no_ubuf, pkru);
}

/*
 * User buffer variant of the UABI to xstate conversion: takes a
 * sigreturn standard-format image from the user-space buffer @ubuf,
 * converts it to kernel XSAVE[S] format and stores it in the fpstate of
 * @tsk. This is called from the sigreturn() and rt_sigreturn() system
 * calls.
 */
int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
                                      const void __user *ubuf)
{
        struct fpstate *fps = tsk->thread.fpu.fpstate;
        u32 *pkru = &tsk->thread.pkru;

        /* No kernel buffer is supplied on this path */
        return copy_uabi_to_xstate(fps, NULL, ubuf, pkru);
}

/*
 * Check that @mask is a non-empty subset of the independent features.
 * A violation triggers a one-time warning and yields false.
 */
static bool validate_independent_components(u64 mask)
{
        u64 not_independent;

        if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
                return false;

        /* Every bit outside of the independent feature set is forbidden */
        not_independent = ~xfeatures_mask_independent();

        return !WARN_ON_ONCE(!mask || mask & not_independent);
}

/**
 * xsaves - Save selected components to a kernel xstate buffer
 * @xstate:        Pointer to the buffer
 * @mask:        Feature mask to select the components to save
 *
 * Requirements on @xstate: 64 byte alignment and correct initialization,
 * because XSAVES does not write the full xstate header. Zero the buffer
 * before its first use; otherwise a consecutive XRSTORS from that buffer
 * can #GP.
 *
 * @mask must be a subset of the independent features. When it is not,
 * nothing is saved.
 */
void xsaves(struct xregs_state *xstate, u64 mask)
{
        u32 lo, hi;
        int err;

        if (!validate_independent_components(mask))
                return;

        /* The instruction takes the mask as two 32-bit halves */
        lo = (u32)mask;
        hi = (u32)(mask >> 32);

        XSTATE_OP(XSAVES, xstate, lo, hi, err);
        WARN_ON_ONCE(err);
}

/**
 * xrstors - Restore selected components from a kernel xstate buffer
 * @xstate:        Pointer to the buffer
 * @mask:        Feature mask to select the components to restore
 *
 * Requirements on @xstate: 64 byte alignment and correct initialization;
 * otherwise XRSTORS from that buffer can #GP.
 *
 * The intended use is to restore state which was stored into @xstate by
 * xsaves() before.
 *
 * @mask must be a subset of the independent features. When it is not,
 * nothing is restored.
 */
void xrstors(struct xregs_state *xstate, u64 mask)
{
        u32 lo, hi;
        int err;

        if (!validate_independent_components(mask))
                return;

        /* The instruction takes the mask as two 32-bit halves */
        lo = (u32)mask;
        hi = (u32)(mask >> 32);

        XSTATE_OP(XRSTORS, xstate, lo, hi, err);
        WARN_ON_ONCE(err);
}

#if IS_ENABLED(CONFIG_KVM)
/*
 * Zero the data of state component @xfeature in @fps. Nothing happens
 * when get_xsave_addr() finds no data for the component in the buffer.
 */
void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
{
        void *component = get_xsave_addr(&fps->regs.xsave, xfeature);

        if (!component)
                return;

        memset(component, 0, xstate_sizes[xfeature]);
}
EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
#endif

#ifdef CONFIG_X86_64

#ifdef CONFIG_X86_DEBUG_FPU
/*
 * Check whether a subsequent XSAVE* or XRSTOR* instruction with
 * RFBM=@mask can safely operate on the @fpstate buffer. @rstor tells
 * whether the operation is a restore.
 */
static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
{
        u64 cpu_xfd = __this_cpu_read(xfd_state);

        /* The per CPU XFD state agrees with the buffer: nothing to check */
        if (cpu_xfd == fpstate->xfd)
                return true;

        /*
         * The XFD MSR does not match fpstate->xfd. That's invalid when
         * the passed in fpstate is current's fpstate.
         */
        if (current->thread.fpu.fpstate->xfd == fpstate->xfd)
                return false;

        /*
         * XRSTOR(S) from init_fpstate are always correct as it will just
         * bring all components into init state and not read from the
         * buffer. XSAVE(S) raises #PF after init.
         */
        if (fpstate == &init_fpstate)
                return rstor;

        /*
         * What is left:
         *   XSAVE(S): clone(), fpu_swap_kvm_fpstate()
         *   XRSTOR(S): fpu_swap_kvm_fpstate()
         *
         * Two groups of components are harmless and get dropped from
         * @mask:
         *  - XFD-disabled components, because no XSAVE/XRSTOR instruction
         *    (except XSAVE itself) touches their buffer area
         *  - components which are valid in fpstate, because they have
         *    space allocated in fpstate
         *
         * Anything which remains might be written by XSAVE/XRSTOR, so
         * validation fails if a bit is left over.
         */
        return !(mask & ~cpu_xfd & ~fpstate->xfeatures);
}

/*
 * Warn once when xstate_op_valid() rejects the combination of @fpstate,
 * @mask and @rstor.
 */
void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
{
        bool valid = xstate_op_valid(fpstate, mask, rstor);

        WARN_ON_ONCE(!valid);
}
#endif /* CONFIG_X86_DEBUG_FPU */

/*
 * Initcall which switches on dynamic state sizing when init_fpstate.xfd
 * has bits set, i.e. when dynamic features are available. Always
 * returns 0.
 */
static int __init xfd_update_static_branch(void)
{
        /* No dynamic features: leave the static branch alone */
        if (!init_fpstate.xfd)
                return 0;

        static_branch_enable(&__fpu_state_size_dynamic);
        return 0;
}
arch_initcall(xfd_update_static_branch)

/*
 * Release the fpstate buffer of @fpu with vfree(), unless there is none
 * or it is the __fpstate which is embedded in @fpu itself.
 */
void fpstate_free(struct fpu *fpu)
{
        struct fpstate *fps = fpu->fpstate;

        if (!fps || fps == &fpu->__fpstate)
                return;

        vfree(fps);
}

/**
 * fpstate_realloc - Reallocate struct fpstate for the requested new features
 *
 * @xfeatures:        A bitmap of xstate features which extend the enabled features
 *                of that task
 * @ksize:        The required size for the kernel buffer
 * @usize:        The required size for user space buffers
 * @guest_fpu:        Pointer to a guest FPU container. NULL for host allocations
 *
 * Operates on the FPU of the current task.
 *
 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
 * with large states are likely to live longer.
 *
 * Returns: 0 on success, -ENOMEM on allocation error.
 */
static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
                           unsigned int usize, struct fpu_guest *guest_fpu)
{
        struct fpu *fpu = &current->thread.fpu;
        struct fpstate *curfps, *newfps = NULL;
        unsigned int fpsize;
        bool in_use;

        /*
         * The allocation covers the part of struct fpstate in front of
         * the register area, rounded up to 64 bytes, plus @ksize bytes.
         */
        fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);

        newfps = vzalloc(fpsize);
        if (!newfps)
                return -ENOMEM;
        newfps->size = ksize;
        newfps->user_size = usize;
        newfps->is_valloc = true;

        /*
         * When a guest FPU is supplied, use @guest_fpu->fpstate
         * as reference independent whether it is in use or not.
         */
        curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;

        /* Determine whether @curfps is the active fpstate */
        in_use = fpu->fpstate == curfps;

        /* Guest buffers inherit the guest related flags of the old buffer */
        if (guest_fpu) {
                newfps->is_guest = true;
                newfps->is_confidential = curfps->is_confidential;
                newfps->in_use = curfps->in_use;
                guest_fpu->xfeatures |= xfeatures;
                guest_fpu->uabi_size = usize;
        }

        fpregs_lock();
        /*
         * If @curfps is in use, ensure that the current state is in the
         * registers before swapping fpstate as that might invalidate it
         * due to layout changes.
         */
        if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
                fpregs_restore_userregs();

        /*
         * The new buffer carries the old features plus the requested
         * ones; the requested ones are cleared from the XFD value.
         */
        newfps->xfeatures = curfps->xfeatures | xfeatures;
        newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
        newfps->xfd = curfps->xfd & ~xfeatures;

        /* Do the final updates within the locked region */
        xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);

        if (guest_fpu) {
                guest_fpu->fpstate = newfps;
                /* If curfps is active, update the FPU fpstate pointer */
                if (in_use)
                        fpu->fpstate = newfps;
        } else {
                fpu->fpstate = newfps;
        }

        if (in_use)
                xfd_update_state(fpu->fpstate);
        fpregs_unlock();

        /*
         * Only free valloc'ed state.
         * NOTE(review): @curfps has already been dereferenced above, so
         * the NULL check here looks purely defensive - confirm.
         */
        if (curfps && curfps->is_valloc)
                vfree(curfps);

        return 0;
}

/*
 * Check whether the alternate signal stacks of all threads in current's
 * thread group are large enough for a signal frame with a user xstate
 * area of @usize bytes. Threads with a sas_ss_size of zero are skipped.
 *
 * Must be called with current's sighand lock held.
 *
 * Returns 0 when all stacks fit, -ENOSPC otherwise.
 */
static int validate_sigaltstack(unsigned int usize)
{
        struct task_struct *leader = current->group_leader;
        unsigned long needed = get_sigframe_size();
        struct task_struct *t;

        lockdep_assert_held(&current->sighand->siglock);

        /*
         * get_sigframe_size() is based on fpu_user_cfg.max_size, so
         * swap that share for the size which is asked about.
         */
        needed = needed - fpu_user_cfg.max_size + usize;

        for_each_thread(leader, t) {
                if (!t->sas_ss_size)
                        continue;
                if (t->sas_ss_size < needed)
                        return -ENOSPC;
        }

        return 0;
}

/*
 * Extend the permitted feature set of current's thread group by
 * @requested and record the resulting buffer sizes. @guest selects the
 * guest permissions instead of the host ones.
 *
 * The caller holds current's sighand lock.
 *
 * Returns 0 on success or when everything is permitted already, and the
 * error of validate_sigaltstack() when a host request does not fit.
 */
static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
{
        /*
         * This deliberately does not exclude !XSAVES as we still might
         * decide to optionally context switch XCR0 or talk the silicon
         * vendors into extending XFD for the pre AMX states, especially
         * AVX512.
         */
        bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
        struct fpu *leader_fpu = &current->group_leader->thread.fpu;
        unsigned int ksize, usize;
        struct fpu_state_perm *perm;
        u64 kmask, umask;
        int ret;

        /* Nothing to do when the request is fully enabled already */
        if ((permitted & requested) == requested)
                return 0;

        /*
         * Kernel state size: old plus new features and, on the host,
         * the supervisor states on top.
         */
        kmask = permitted | requested;
        if (!guest)
                kmask |= xfeatures_mask_supervisor();
        ksize = xstate_calculate_size(kmask, compacted);

        /* User state size: user features only, never compacted */
        umask = kmask & XFEATURE_MASK_USER_SUPPORTED;
        usize = xstate_calculate_size(umask, false);

        if (!guest) {
                ret = validate_sigaltstack(usize);
                if (ret)
                        return ret;
        }

        if (guest)
                perm = &leader_fpu->guest_perm;
        else
                perm = &leader_fpu->perm;

        /* Pairs with the READ_ONCE() in xstate_get_group_perm() */
        WRITE_ONCE(perm->__state_perm, umask);
        /* Protected by sighand lock */
        perm->__state_size = ksize;
        perm->__user_state_size = usize;

        return 0;
}

/*
 * Permissions array to map facilities with more than one component.
 *
 * The array is indexed by xfeature number and an entry holds the feature
 * mask which is requested for that index. Entries which are not listed
 * explicitly are zero.
 */
static const u64 xstate_prctl_req[XFEATURE_MAX] = {
        [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
};

/*
 * Request permission for the xstate facility @idx on behalf of the current
 * thread group, either for the host (@guest == false) or for guest use.
 *
 * Return: 0 if the facility is (now) permitted, -EINVAL if @idx is out of
 * range, -EOPNOTSUPP if the facility cannot be requested or is not
 * supported, -EBUSY if the guest permissions are locked already, or the
 * error code of __xstate_request_perm().
 */
static int xstate_request_perm(unsigned long idx, bool guest)
{
        u64 granted, wanted;
        int err;

        if (idx >= XFEATURE_MAX)
                return -EINVAL;

        /*
         * Translate the facility number into its feature mask, which can
         * cover more than one xstate component. The index is sanitized
         * with array_index_nospec() before it is used for the lookup.
         */
        idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
        wanted = xstate_prctl_req[idx];

        /* Not a requestable facility, or not part of the supported features */
        if (!wanted || (fpu_user_cfg.max_features & wanted) != wanted)
                return -EOPNOTSUPP;

        /* Lockless quick check */
        granted = xstate_get_group_perm(guest);
        if (!(wanted & ~granted))
                return 0;

        /* Protect against concurrent modifications */
        spin_lock_irq(&current->sighand->siglock);
        granted = xstate_get_group_perm(guest);

        /* First vCPU allocation locks the permissions. */
        err = -EBUSY;
        if (!guest || !(granted & FPU_GUEST_PERM_LOCKED))
                err = __xstate_request_perm(granted, wanted, guest);
        spin_unlock_irq(&current->sighand->siglock);

        return err;
}

/*
 * Handle an XFD event for the dynamic feature(s) flagged in @xfd_err.
 *
 * @xfd_err:    the XFD error value; only the XFEATURE_MASK_USER_DYNAMIC
 *              bits are considered
 * @guest_fpu:  guest FPU container, or NULL when handling a host event
 *
 * Checks the group permissions (guest permissions when @guest_fpu is set)
 * and, if the features are permitted, reallocates the fpstate with the
 * buffer sizes recorded in the permission structure.
 *
 * Return: 0 on success or when @xfd_err carries no dynamic feature bit,
 * -EPERM if the feature is not permitted, -EFAULT if the reallocation
 * failed.
 */
int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
{
        u64 dyn_features = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
        unsigned int kernel_size, user_size;
        struct fpu_state_perm *perm;
        struct fpu *leader_fpu;

        if (!dyn_features) {
                /* Only complain about bogus events on the host */
                if (!guest_fpu)
                        pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
                return 0;
        }

        /* Protect against concurrent modifications */
        spin_lock_irq(&current->sighand->siglock);

        /* If not permitted let it die */
        if (dyn_features & ~xstate_get_group_perm(guest_fpu != NULL)) {
                spin_unlock_irq(&current->sighand->siglock);
                return -EPERM;
        }

        leader_fpu = &current->group_leader->thread.fpu;
        if (guest_fpu)
                perm = &leader_fpu->guest_perm;
        else
                perm = &leader_fpu->perm;
        kernel_size = perm->__state_size;
        user_size = perm->__user_state_size;

        /*
         * The feature is permitted. State size is sufficient.  Dropping
         * the lock is safe here even if more features are added from
         * another task, the retrieved buffer sizes are valid for the
         * currently requested feature(s).
         */
        spin_unlock_irq(&current->sighand->siglock);

        /*
         * Try to allocate a new fpstate. If that fails there is no way
         * out.
         */
        if (fpstate_realloc(dyn_features, kernel_size, user_size, guest_fpu))
                return -EFAULT;
        return 0;
}

/*
 * Host variant of __xfd_enable_feature(): passes a NULL guest_fpu, so the
 * host permissions are used, and returns that function's result unchanged.
 */
int xfd_enable_feature(u64 xfd_err)
{
        return __xfd_enable_feature(xfd_err, NULL);
}

#else /* CONFIG_X86_64 */
/*
 * Stub for builds without CONFIG_X86_64: permission requests are not
 * available and always fail with -EPERM.
 */
static inline int xstate_request_perm(unsigned long idx, bool guest)
{
        return -EPERM;
}
#endif  /* !CONFIG_X86_64 */

/*
 * Exported (GPL only) accessor which returns the result of
 * xstate_get_group_perm() for the guest permission set.
 */
u64 xstate_get_guest_group_perm(void)
{
        return xstate_get_group_perm(true);
}
EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);

/**
 * fpu_xstate_prctl - xstate permission operations
 * @option:        A subfunction of arch_prctl()
 * @arg2:        option argument
 * Return:        0 if successful; otherwise, an error code
 *
 * Option arguments:
 *
 * ARCH_GET_XCOMP_SUPP:       Pointer to user space u64 to store the info
 * ARCH_GET_XCOMP_PERM:       Pointer to user space u64 to store the info
 * ARCH_GET_XCOMP_GUEST_PERM: Pointer to user space u64 to store the info
 * ARCH_REQ_XCOMP_PERM:       Facility number requested
 * ARCH_REQ_XCOMP_GUEST_PERM: Facility number requested for guest use
 *
 * For facilities which require more than one XSTATE component, the request
 * must be the highest state component number related to that facility,
 * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
 * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
 *
 * Unknown options fail with -EINVAL. The request options fail with
 * -EOPNOTSUPP when CONFIG_X86_64 is not enabled.
 */
long fpu_xstate_prctl(int option, unsigned long arg2)
{
        u64 __user *uptr = (u64 __user *)arg2;
        bool guest = false;
        u64 features;

        switch (option) {
        case ARCH_GET_XCOMP_SUPP:
                features = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
                break;

        case ARCH_GET_XCOMP_PERM:
                /*
                 * Lockless snapshot: taking the lock would not help, as the
                 * permissions can change right after dropping it anyway.
                 */
                features = xstate_get_host_group_perm() & XFEATURE_MASK_USER_SUPPORTED;
                break;

        case ARCH_GET_XCOMP_GUEST_PERM:
                features = xstate_get_guest_group_perm() & XFEATURE_MASK_USER_SUPPORTED;
                break;

        case ARCH_REQ_XCOMP_GUEST_PERM:
                guest = true;
                fallthrough;

        case ARCH_REQ_XCOMP_PERM:
                if (!IS_ENABLED(CONFIG_X86_64))
                        return -EOPNOTSUPP;

                /* For the request options @arg2 is the facility number */
                return xstate_request_perm(arg2, guest);

        default:
                return -EINVAL;
        }

        /* All query options report a feature mask to user space */
        return put_user(features, uptr);
}

#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
 * Emit the "AVX512_elapsed_ms" line for @task into @m: the time in
 * milliseconds since the recorded AVX512 timestamp of the task, or -1 if
 * no timestamp has been recorded.
 */
static void avx512_status(struct seq_file *m, struct task_struct *task)
{
        unsigned long last_use = READ_ONCE(task->thread.fpu.avx512_timestamp);
        /* Report -1 if no AVX512 usage */
        long elapsed = -1;

        if (last_use) {
                elapsed = (long)(jiffies - last_use);
                /* Cap to LONG_MAX if time difference > LONG_MAX */
                if (elapsed < 0)
                        elapsed = LONG_MAX;
                elapsed = jiffies_to_msecs(elapsed);
        }

        seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", elapsed);
        seq_putc(m, '\n');
}

/*
 * Report architecture specific information for @task into @m.
 *
 * Currently this is only the AVX512 usage line written by avx512_status().
 * @ns and @pid are not used by this implementation. Always returns 0.
 */
int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task)
{
        /*
         * Report AVX512 state if the processor and build option supported.
         */
        if (cpu_feature_enabled(X86_FEATURE_AVX512F))
                avx512_status(m, task);

        return 0;
}
#endif /* CONFIG_PROC_PID_ARCH_STATUS */













































































































    1 




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_NULLS_H
#define _LINUX_LIST_NULLS_H

#include <linux/poison.h>
#include <linux/const.h>

/*
 * Special version of lists, where end of list is not a NULL pointer,
 * but a 'nulls' marker, which can have many different values.
 * (up to 2^31 different values guaranteed on all platforms)
 *
 * In the standard hlist, termination of a list is the NULL pointer.
 * In this special 'nulls' variant, we use the fact that objects stored in
 * a list are aligned on a word (4 or 8 bytes alignment).
 * We therefore use the least significant bit of 'ptr' :
 * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1)
 * Set to 0 : This is a pointer to some object (ptr)
 */

/*
 * Head of a 'nulls' hlist. @first either points to the first node or, for
 * an empty list, holds a nulls marker (least significant bit set).
 */
struct hlist_nulls_head {
        struct hlist_nulls_node *first;
};

/*
 * Node of a 'nulls' hlist, embedded in the objects which are stored on
 * the list.
 */
struct hlist_nulls_node {
        /* Next node, or the nulls marker at the end of the chain */
        struct hlist_nulls_node *next;
        /* Address of the pointer which points to this node */
        struct hlist_nulls_node **pprev;
};
/*
 * Encode @value as a 'nulls' end-of-list marker: the value is shifted left
 * by one bit and the least significant bit is set. @value is parenthesized
 * so that the cast applies to the whole argument expression.
 */
#define NULLS_MARKER(value) (1UL | (((long)(value)) << 1))
/* Initialize the list head @ptr as empty, terminated by the marker for @nulls */
#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
        ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))

/* Map a node pointer back to the structure of @type embedding it as @member */
#define hlist_nulls_entry(ptr, type, member) container_of(ptr, type, member)

/*
 * Like hlist_nulls_entry(), but evaluates to NULL when @ptr is a nulls
 * marker rather than a node. @ptr is evaluated only once.
 */
#define hlist_nulls_entry_safe(ptr, type, member) \
        ({ \
                typeof(ptr) ____pos = (ptr); \
                is_a_nulls(____pos) ? NULL : \
                        hlist_nulls_entry(____pos, type, member); \
        })
/**
 * is_a_nulls - Test if a ptr is a nulls
 * @ptr: ptr to be tested
 *
 * Return: 1 if the least significant bit of @ptr is set, i.e. @ptr is a
 * 'nulls' end-of-list marker, 0 otherwise.
 */
static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
{
        unsigned long bits = (unsigned long)ptr;

        return bits & 1;
}

/**
 * get_nulls_value - Get the 'nulls' value of the end of chain
 * @ptr: end of chain
 *
 * Should be called only if is_a_nulls(ptr);
 *
 * Return: the marker with its least significant (tag) bit shifted out.
 */
static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
{
        unsigned long marker = (unsigned long)ptr;

        return marker >> 1;
}

/**
 * hlist_nulls_unhashed - Has node been removed and reinitialized?
 * @h: Node to be checked
 *
 * Note that not all removal functions will leave a node in unhashed state.
 * For example, hlist_del_init_rcu() leaves the node in unhashed state,
 * but hlist_nulls_del() does not.
 *
 * Return: nonzero if the pprev pointer of @h is NULL, 0 otherwise.
 */
static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
{
        return h->pprev == NULL;
}

/**
 * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized?
 * @h: Node to be checked
 *
 * Note that not all removal functions will leave a node in unhashed state.
 * For example, hlist_del_init_rcu() leaves the node in unhashed state,
 * but hlist_nulls_del() does not.  Unlike hlist_nulls_unhashed(), this
 * function may be used locklessly: the pprev pointer is sampled with
 * READ_ONCE().
 *
 * Return: nonzero if the pprev pointer of @h is NULL, 0 otherwise.
 */
static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h)
{
        return READ_ONCE(h->pprev) == NULL;
}

/*
 * Test whether the list @h is empty, i.e. whether its first pointer is a
 * nulls marker. The pointer is sampled with READ_ONCE().
 */
static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
{
        const struct hlist_nulls_node *first = READ_ONCE(h->first);

        return is_a_nulls(first);
}

/*
 * Insert node @n at the front of the list @h.
 */
static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
                                        struct hlist_nulls_head *h)
{
        struct hlist_nulls_node *old_first = h->first;

        /* Link the new node in front of the previous first entry */
        n->next = old_first;
        WRITE_ONCE(n->pprev, &h->first);
        h->first = n;

        /* A nulls marker is not a node: there is no pprev to update */
        if (is_a_nulls(old_first))
                return;
        WRITE_ONCE(old_first->pprev, &n->next);
}

/*
 * Unlink @n from its list: whatever pointed to @n is redirected to its
 * successor. The pointers stored in @n itself are left untouched.
 */
static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
{
        struct hlist_nulls_node *successor = n->next;
        struct hlist_nulls_node **link = n->pprev;

        WRITE_ONCE(*link, successor);

        /* A nulls marker is not a node: there is no pprev to update */
        if (is_a_nulls(successor))
                return;
        WRITE_ONCE(successor->pprev, link);
}

/*
 * Unlink @n from its list and poison its pprev pointer with LIST_POISON2.
 * As pprev is not set to NULL, the node does not read as unhashed
 * afterwards. The next pointer of @n is left as it was.
 */
static inline void hlist_nulls_del(struct hlist_nulls_node *n)
{
        __hlist_nulls_del(n);
        WRITE_ONCE(n->pprev, LIST_POISON2);
}

/**
 * hlist_nulls_for_each_entry        - iterate over list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_nulls_node to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_nulls_node within the struct.
 *
 * The loop ends when @pos becomes a nulls marker. @pos then still holds
 * that marker, while @tpos is only assigned for real nodes.
 */
#define hlist_nulls_for_each_entry(tpos, pos, head, member)                       \
        for (pos = (head)->first;                                               \
             (!is_a_nulls(pos)) &&                                               \
                ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
             pos = pos->next)

/**
 * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_nulls_node to use as a loop cursor.
 * @member:        the name of the hlist_nulls_node within the struct.
 *
 * @pos is not initialized by the macro: iteration starts at the node it
 * currently refers to and ends when @pos becomes a nulls marker.
 */
#define hlist_nulls_for_each_entry_from(tpos, pos, member)        \
        for (; (!is_a_nulls(pos)) &&                                 \
                ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
             pos = pos->next)

#endif



































































    4 













    2 





    2 


























































    2 



    2 
    2 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _LINUX_RCUREF_H
#define _LINUX_RCUREF_H

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/limits.h>
#include <linux/lockdep.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>

/*
 * Raw values of the reference counter.
 *
 * The counter is stored with an offset of one: rcuref_init() stores
 * 'cnt - 1', so RCUREF_ONEREF (0) stands for a single held reference.
 * rcuref_read() reports zero references for raw values at or above
 * RCUREF_RELEASED.
 */
#define RCUREF_ONEREF                0x00000000U
#define RCUREF_MAXREF                0x7FFFFFFFU
#define RCUREF_SATURATED        0xA0000000U
#define RCUREF_RELEASED                0xC0000000U
#define RCUREF_DEAD                0xE0000000U
#define RCUREF_NOREF                0xFFFFFFFFU

/**
 * rcuref_init - Initialize a rcuref reference count with the given reference count
 * @ref:        Pointer to the reference count
 * @cnt:        The initial reference count typically '1'
 */
static inline void rcuref_init(rcuref_t *ref, unsigned int cnt)
{
        /* The counter is stored with an offset of one */
        unsigned int raw = cnt - 1;

        atomic_set(&ref->refcnt, raw);
}

/**
 * rcuref_read - Read the number of held reference counts of a rcuref
 * @ref:        Pointer to the reference count
 *
 * Return: The number of held references (0 ... N)
 */
static inline unsigned int rcuref_read(rcuref_t *ref)
{
        unsigned int raw = atomic_read(&ref->refcnt);

        /* Return 0 if within the DEAD zone. */
        if (raw >= RCUREF_RELEASED)
                return 0;

        /* Undo the offset of one used for the stored value */
        return raw + 1;
}

extern __must_check bool rcuref_get_slowpath(rcuref_t *ref);

/**
 * rcuref_get - Acquire one reference on a rcuref reference count
 * @ref:        Pointer to the reference count
 *
 * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See documentation in lib/rcuref.c
 *
 * Return:
 *        False if the attempt to acquire a reference failed. This happens
 *        when the last reference has been put already
 *
 *        True if a reference was successfully acquired
 */
static inline __must_check bool rcuref_get(rcuref_t *ref)
{
        /*
         * Unconditionally increase the reference count. The saturation and
         * dead zones provide enough tolerance for this. Only a negative
         * result, i.e. a value inside those zones, needs the slowpath.
         */
        if (unlikely(atomic_add_negative_relaxed(1, &ref->refcnt)))
                return rcuref_get_slowpath(ref);

        return true;
}

extern __must_check bool rcuref_put_slowpath(rcuref_t *ref);

/*
 * Internal helper. Do not invoke directly.
 *
 * Drops one reference. Lockdep complains when this is called while
 * preemptible and outside of a RCU read side critical section.
 */
static __always_inline __must_check bool __rcuref_put(rcuref_t *ref)
{
        RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(),
                         "suspicious rcuref_put_rcusafe() usage");

        /*
         * Unconditionally decrease the reference count. The saturation and
         * dead zones provide enough tolerance for this. A negative result
         * means either the last reference drop or a value inside the
         * saturation and dead zones, which the slowpath sorts out.
         */
        if (unlikely(atomic_add_negative_release(-1, &ref->refcnt)))
                return rcuref_put_slowpath(ref);

        return false;
}

/**
 * rcuref_put_rcusafe -- Release one reference for a rcuref reference count RCU safe
 * @ref:        Pointer to the reference count
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before, and provides an acquire ordering on success such that free()
 * must come after.
 *
 * Can be invoked from contexts, which guarantee that no grace period can
 * happen which would free the object concurrently if the decrement drops
 * the last reference and the slowpath races against a concurrent get() and
 * put() pair. rcu_read_lock()'ed and atomic contexts qualify.
 *
 * Return:
 *        True if this was the last reference with no future references
 *        possible. This signals the caller that it can safely release the
 *        object which is protected by the reference counter.
 *
 *        False if there are still active references or the put() raced
 *        with a concurrent get()/put() pair. Caller is not allowed to
 *        release the protected object.
 */
static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref)
{
        /*
         * Plain call of the internal helper, which carries the lockdep
         * check for the context requirement described above.
         */
        return __rcuref_put(ref);
}

/**
 * rcuref_put -- Release one reference for a rcuref reference count
 * @ref:        Pointer to the reference count
 *
 * Can be invoked from any context.
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before, and provides an acquire ordering on success such that free()
 * must come after.
 *
 * Return:
 *
 *        True if this was the last reference with no future references
 *        possible. This signals the caller that it can safely schedule the
 *        object, which is protected by the reference counter, for
 *        deconstruction.
 *
 *        False if there are still active references or the put() raced
 *        with a concurrent get()/put() pair. Caller is not allowed to
 *        deconstruct the protected object.
 */
static inline __must_check bool rcuref_put(rcuref_t *ref)
{
        bool last_ref;

        /* The helper must not run preemptible outside of RCU read side */
        preempt_disable();
        last_ref = __rcuref_put(ref);
        preempt_enable();

        return last_ref;
}

#endif









































































































































































































































































































































































































































































































































































































































































    1 



    1 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 - 2021, 2023 Intel Corporation
 */
#include <net/cfg80211.h>
#include "core.h"
#include "nl80211.h"
#include "rdev-ops.h"

static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev,
                          struct nlattr *ftmreq,
                          struct cfg80211_pmsr_request_peer *out,
                          struct genl_info *info)
{
        const struct cfg80211_pmsr_capabilities *capa = rdev->wiphy.pmsr_capa;
        struct nlattr *tb[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1];
        u32 preamble = NL80211_PREAMBLE_DMG; /* only optional in DMG */

        /* validate existing data */
        if (!(rdev->wiphy.pmsr_capa->ftm.bandwidths & BIT(out->chandef.width))) {
                NL_SET_ERR_MSG(info->extack, "FTM: unsupported bandwidth");
                return -EINVAL;
        }

        /* no validation needed - was already done via nested policy */
        nla_parse_nested_deprecated(tb, NL80211_PMSR_FTM_REQ_ATTR_MAX, ftmreq,
                                    NULL, NULL);

        if (tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE])
                preamble = nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]);

        /* set up values - struct is 0-initialized */
        out->ftm.requested = true;

        switch (out->chandef.chan->band) {
        case NL80211_BAND_60GHZ:
                /* optional */
                break;
        default:
                if (!tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE]) {
                        NL_SET_ERR_MSG(info->extack,
                                       "FTM: must specify preamble");
                        return -EINVAL;
                }
        }

        if (!(capa->ftm.preambles & BIT(preamble))) {
                NL_SET_ERR_MSG_ATTR(info->extack,
                                    tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE],
                                    "FTM: invalid preamble");
                return -EINVAL;
        }

        out->ftm.preamble = preamble;

        out->ftm.burst_period = 0;
        if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD])
                out->ftm.burst_period =
                        nla_get_u16(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_PERIOD]);

        out->ftm.asap = !!tb[NL80211_PMSR_FTM_REQ_ATTR_ASAP];
        if (out->ftm.asap && !capa->ftm.asap) {
                NL_SET_ERR_MSG_ATTR(info->extack,
                                    tb[NL80211_PMSR_FTM_REQ_ATTR_ASAP],
                                    "FTM: ASAP mode not supported");
                return -EINVAL;
        }

        if (!out->ftm.asap && !capa->ftm.non_asap) {
                NL_SET_ERR_MSG(info->extack,
                               "FTM: non-ASAP mode not supported");
                return -EINVAL;
        }

        out->ftm.num_bursts_exp = 0;
        if (tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP])
                out->ftm.num_bursts_exp =
                        nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP]);

        if (capa->ftm.max_bursts_exponent >= 0 &&
            out->ftm.num_bursts_exp > capa->ftm.max_bursts_exponent) {
                NL_SET_ERR_MSG_ATTR(info->extack,
                                    tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_BURSTS_EXP],
                                    "FTM: max NUM_BURSTS_EXP must be set lower than the device limit");
                return -EINVAL;
        }

        out->ftm.burst_duration = 15;
        if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION])
                out->ftm.burst_duration =
                        nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]);

        out->ftm.ftms_per_burst = 0;
        if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST])
                out->ftm.ftms_per_burst =
                        nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]);

        if (capa->ftm.max_ftms_per_burst &&
            (out->ftm.ftms_per_burst > capa->ftm.max_ftms_per_burst ||
             out->ftm.ftms_per_burst == 0)) {
                NL_SET_ERR_MSG_ATTR(info->extack,
                                    tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST],
                                    "FTM: FTMs per burst must be set lower than the device limit but non-zero");
                return -EINVAL;
        }

        out->ftm.ftmr_retries = 3;
        if (tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES])
                out->ftm.ftmr_retries =
                        nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES]);

        out->ftm.request_lci = !!tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI];
        if (out->ftm.request_lci && !capa->ftm.request_lci) {
                NL_SET_ERR_MSG_ATTR(info->extack,
                                    tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI],
                                    "FTM: LCI request not supported");
        }

        out->ftm.request_civicloc =
                !!tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC];
        if (out->ftm.request_civicloc && !capa->ftm.request_civicloc) {
                NL_SET_ERR_MSG_ATTR(info->extack,
                                    tb[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC],
                            "FTM: civic location request not supported");
        }

        out->ftm.trigger_based =
                !!tb[NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED];
        if (out->ftm.trigger_based && !capa->ftm.trigger_based) {
                NL_SET_ERR_MSG_ATTR(info->extack,
                                    tb[NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED],
                                    "FTM: trigger based ranging is not supported");
                return -EINVAL;
        }

        out->ftm.non_trigger_based =
                !!tb[NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED];
        if (out->ftm.non_trigger_based && !capa->ftm.non_trigger_based) {
                NL_SET_ERR_MSG_ATTR(info->extack,
                                    tb[NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED],
                                    "FTM: trigger based ranging is not supported");
                return -EINVAL;
        }

        if (out->ftm.trigger_based && out->ftm.non_trigger_based) {
                NL_SET_ERR_MSG(info->extack,
                               "FTM: can't set both trigger based and non trigger based");
                return -EINVAL;
        }

        if ((out->ftm.trigger_based || out->ftm.non_trigger_based) &&
            out->ftm.preamble != NL80211_PREAMBLE_HE) {
                NL_SET_ERR_MSG_ATTR(info->extack,
                                    tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE],
                                    "FTM: non EDCA based ranging must use HE preamble");
                return -EINVAL;
        }

        out->ftm.lmr_feedback =
                !!tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK];
        if (!out->ftm.trigger_based && !out->ftm.non_trigger_based &&
            out->ftm.lmr_feedback) {
                NL_SET_ERR_MSG_ATTR(info->extack,
                                    tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK],
                                    "FTM: LMR feedback set for EDCA based ranging");
                return -EINVAL;
        }

        if (tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR]) {
                if (!out->ftm.non_trigger_based && !out->ftm.trigger_based) {
                        NL_SET_ERR_MSG_ATTR(info->extack,
                                            tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR],
                                            "FTM: BSS color set for EDCA based ranging");
                        return -EINVAL;
                }

                out->ftm.bss_color =
                        nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR]);
        }

        return 0;
}

/*
 * Parse the description of one peer of a peer measurement request.
 *
 * @rdev:       the registered device the request is for
 * @peer:       the nested attribute describing the peer
 * @out:        the peer entry to fill
 * @info:       the netlink request; info->attrs is overwritten while the
 *              channel definition of the peer is parsed
 *
 * Return: 0 on success, otherwise a negative error code. Parsing of the
 * request data stops at the first entry which fails, so that a later valid
 * entry cannot hide the failure.
 */
static int pmsr_parse_peer(struct cfg80211_registered_device *rdev,
                           struct nlattr *peer,
                           struct cfg80211_pmsr_request_peer *out,
                           struct genl_info *info)
{
        struct nlattr *tb[NL80211_PMSR_PEER_ATTR_MAX + 1];
        struct nlattr *req[NL80211_PMSR_REQ_ATTR_MAX + 1];
        struct nlattr *treq;
        int err, rem;

        /* no validation needed - was already done via nested policy */
        nla_parse_nested_deprecated(tb, NL80211_PMSR_PEER_ATTR_MAX, peer,
                                    NULL, NULL);

        if (!tb[NL80211_PMSR_PEER_ATTR_ADDR] ||
            !tb[NL80211_PMSR_PEER_ATTR_CHAN] ||
            !tb[NL80211_PMSR_PEER_ATTR_REQ]) {
                NL_SET_ERR_MSG_ATTR(info->extack, peer,
                                    "insufficient peer data");
                return -EINVAL;
        }

        memcpy(out->addr, nla_data(tb[NL80211_PMSR_PEER_ATTR_ADDR]), ETH_ALEN);

        /* reuse info->attrs */
        memset(info->attrs, 0, sizeof(*info->attrs) * (NL80211_ATTR_MAX + 1));
        err = nla_parse_nested_deprecated(info->attrs, NL80211_ATTR_MAX,
                                          tb[NL80211_PMSR_PEER_ATTR_CHAN],
                                          NULL, info->extack);
        if (err)
                return err;

        err = nl80211_parse_chandef(rdev, info, &out->chandef);
        if (err)
                return err;

        /* no validation needed - was already done via nested policy */
        nla_parse_nested_deprecated(req, NL80211_PMSR_REQ_ATTR_MAX,
                                    tb[NL80211_PMSR_PEER_ATTR_REQ], NULL,
                                    NULL);

        if (!req[NL80211_PMSR_REQ_ATTR_DATA]) {
                NL_SET_ERR_MSG_ATTR(info->extack,
                                    tb[NL80211_PMSR_PEER_ATTR_REQ],
                                    "missing request type/data");
                return -EINVAL;
        }

        if (req[NL80211_PMSR_REQ_ATTR_GET_AP_TSF])
                out->report_ap_tsf = true;

        if (out->report_ap_tsf && !rdev->wiphy.pmsr_capa->report_ap_tsf) {
                NL_SET_ERR_MSG_ATTR(info->extack,
                                    req[NL80211_PMSR_REQ_ATTR_GET_AP_TSF],
                                    "reporting AP TSF is not supported");
                return -EINVAL;
        }

        nla_for_each_nested(treq, req[NL80211_PMSR_REQ_ATTR_DATA], rem) {
                switch (nla_type(treq)) {
                case NL80211_PMSR_TYPE_FTM:
                        err = pmsr_parse_ftm(rdev, treq, out, info);
                        break;
                default:
                        NL_SET_ERR_MSG_ATTR(info->extack, treq,
                                            "unsupported measurement type");
                        err = -EINVAL;
                        break;
                }

                /* Stop at the first failure so a later entry cannot mask it */
                if (err)
                        return err;
        }

        return 0;
}

/*
 * Start a peer measurement session requested via netlink.
 *
 * Locates the NL80211_PMSR_ATTR_PEERS list nested inside
 * NL80211_ATTR_PEER_MEASUREMENTS, rejects it when it holds more entries than
 * the device's max_peers, then allocates a request sized for that many peers,
 * fills in the timeout and MAC address/mask, parses every peer and hands the
 * request to the driver.  On success the request is appended to
 * wdev->pmsr_list and its cookie is reported through the extack.
 *
 * Returns 0 on success, -EOPNOTSUPP when the wiphy has no pmsr capabilities,
 * -EINVAL for missing or unsupported attributes, -ENOMEM when the request
 * cannot be allocated, or the error returned by a parsing helper or by
 * rdev_start_pmsr().
 */
int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info)
{
        struct cfg80211_registered_device *rdev = info->user_ptr[0];
        struct wireless_dev *wdev = info->user_ptr[1];
        struct nlattr *pmsr_attr = info->attrs[NL80211_ATTR_PEER_MEASUREMENTS];
        struct nlattr *peer_list, *cur;
        struct cfg80211_pmsr_request *req;
        int n_peers = 0, next = 0;
        int remaining, ret;

        if (!rdev->wiphy.pmsr_capa)
                return -EOPNOTSUPP;
        if (!pmsr_attr)
                return -EINVAL;

        peer_list = nla_find(nla_data(pmsr_attr), nla_len(pmsr_attr),
                             NL80211_PMSR_ATTR_PEERS);
        if (!peer_list)
                return -EINVAL;

        /* First pass: only count the peers so the request can be sized. */
        nla_for_each_nested(cur, peer_list, remaining) {
                if (++n_peers > rdev->wiphy.pmsr_capa->max_peers) {
                        NL_SET_ERR_MSG_ATTR(info->extack, cur,
                                            "Too many peers used");
                        return -EINVAL;
                }
        }

        req = kzalloc(struct_size(req, peers, n_peers), GFP_KERNEL);
        if (!req)
                return -ENOMEM;
        req->n_peers = n_peers;

        if (info->attrs[NL80211_ATTR_TIMEOUT])
                req->timeout = nla_get_u32(info->attrs[NL80211_ATTR_TIMEOUT]);

        if (!info->attrs[NL80211_ATTR_MAC]) {
                /* No address supplied: use the wdev's own, fully masked. */
                memcpy(req->mac_addr, wdev_address(wdev), ETH_ALEN);
                eth_broadcast_addr(req->mac_addr_mask);
        } else if (!rdev->wiphy.pmsr_capa->randomize_mac_addr) {
                NL_SET_ERR_MSG_ATTR(info->extack,
                                    info->attrs[NL80211_ATTR_MAC],
                                    "device cannot randomize MAC address");
                ret = -EINVAL;
                goto free_req;
        } else {
                ret = nl80211_parse_random_mac(info->attrs, req->mac_addr,
                                               req->mac_addr_mask);
                if (ret)
                        goto free_req;
        }

        /* Second pass: fill in one request slot per peer. */
        nla_for_each_nested(cur, peer_list, remaining) {
                /* NB: this reuses info->attrs, but we no longer need it */
                ret = pmsr_parse_peer(rdev, cur, &req->peers[next], info);
                if (ret)
                        goto free_req;
                next++;
        }

        req->cookie = cfg80211_assign_cookie(rdev);
        req->nl_portid = info->snd_portid;

        ret = rdev_start_pmsr(rdev, wdev, req);
        if (ret)
                goto free_req;

        list_add_tail(&req->list, &wdev->pmsr_list);

        nl_set_extack_cookie_u64(info->extack, req->cookie);
        return 0;

free_req:
        kfree(req);
        return ret;
}

/*
 * Signal completion of a peer measurement request.
 *
 * Tries to unicast an NL80211_CMD_PEER_MEASUREMENT_COMPLETE message carrying
 * the wiphy index, wdev id and request cookie to req->nl_portid.  Whether or
 * not the message could be built, the request is then removed from
 * wdev->pmsr_list (under pmsr_lock) and freed, but only if it is still on
 * that list.
 */
void cfg80211_pmsr_complete(struct wireless_dev *wdev,
                            struct cfg80211_pmsr_request *req,
                            gfp_t gfp)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct cfg80211_pmsr_request *cur, *next, *victim = NULL;
        struct sk_buff *msg;
        void *hdr;

        trace_cfg80211_pmsr_complete(wdev->wiphy, wdev, req->cookie);

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        if (msg) {
                hdr = nl80211hdr_put(msg, 0, 0, 0,
                                     NL80211_CMD_PEER_MEASUREMENT_COMPLETE);
                /* Short-circuiting stops at the first step that fails. */
                if (hdr &&
                    !nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) &&
                    !nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
                                       NL80211_ATTR_PAD) &&
                    !nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, req->cookie,
                                       NL80211_ATTR_PAD)) {
                        genlmsg_end(msg, hdr);
                        genlmsg_unicast(wiphy_net(wdev->wiphy), msg,
                                        req->nl_portid);
                } else {
                        nlmsg_free(msg);
                }
        }

        spin_lock_bh(&wdev->pmsr_lock);
        /*
         * cfg80211_pmsr_process_abort() may have already moved this request
         * to the free list, and will free it later. In this case, don't free
         * it here.
         */
        list_for_each_entry_safe(cur, next, &wdev->pmsr_list, list) {
                if (cur != req)
                        continue;
                list_del(&req->list);
                victim = req;
                break;
        }
        spin_unlock_bh(&wdev->pmsr_lock);
        kfree(victim);
}
EXPORT_SYMBOL_GPL(cfg80211_pmsr_complete);

/*
 * Append the FTM-specific result attributes for @res to @msg.
 *
 * For a result whose status is NL80211_PMSR_STATUS_FAILURE only the failure
 * reason is emitted, plus the busy retry time when the reason is
 * NL80211_PMSR_FTM_FAILURE_PEER_BUSY and that time is non-zero.  Otherwise
 * the measurement values are emitted; fields that have a matching "_valid"
 * flag are only added when that flag is set.
 *
 * Returns 0 on success or -ENOSPC as soon as any attribute cannot be added.
 */
static int nl80211_pmsr_send_ftm_res(struct sk_buff *msg,
                                     struct cfg80211_pmsr_result *res)
{
        if (res->status == NL80211_PMSR_STATUS_FAILURE) {
                if (nla_put_u32(msg, NL80211_PMSR_FTM_RESP_ATTR_FAIL_REASON,
                                res->ftm.failure_reason))
                        goto error;

                if (res->ftm.failure_reason ==
                        NL80211_PMSR_FTM_FAILURE_PEER_BUSY &&
                    res->ftm.busy_retry_time &&
                    nla_put_u32(msg, NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME,
                                res->ftm.busy_retry_time))
                        goto error;

                return 0;
        }

/* PUT: unconditionally add res->ftm.<val> as attribute <attr> of type <tp>. */
#define PUT(tp, attr, val)                                                \
        do {                                                                \
                if (nla_put_##tp(msg,                                        \
                                 NL80211_PMSR_FTM_RESP_ATTR_##attr,        \
                                 res->ftm.val))                                \
                        goto error;                                        \
        } while (0)

/* PUTOPT: like PUT, but only when res->ftm.<val>_valid is set. */
#define PUTOPT(tp, attr, val)                                                \
        do {                                                                \
                if (res->ftm.val##_valid)                                \
                        PUT(tp, attr, val);                                \
        } while (0)

/* PUT_U64: add a 64-bit value, using the FTM response PAD attribute. */
#define PUT_U64(attr, val)                                                \
        do {                                                                \
                if (nla_put_u64_64bit(msg,                                \
                                      NL80211_PMSR_FTM_RESP_ATTR_##attr,\
                                      res->ftm.val,                        \
                                      NL80211_PMSR_FTM_RESP_ATTR_PAD))        \
                        goto error;                                        \
        } while (0)

/* PUTOPT_U64: like PUT_U64, but only when res->ftm.<val>_valid is set. */
#define PUTOPT_U64(attr, val)                                                \
        do {                                                                \
                if (res->ftm.val##_valid)                                \
                        PUT_U64(attr, val);                                \
        } while (0)

        /* A negative burst index is not reported. */
        if (res->ftm.burst_index >= 0)
                PUT(u32, BURST_INDEX, burst_index);
        PUTOPT(u32, NUM_FTMR_ATTEMPTS, num_ftmr_attempts);
        PUTOPT(u32, NUM_FTMR_SUCCESSES, num_ftmr_successes);
        PUT(u8, NUM_BURSTS_EXP, num_bursts_exp);
        PUT(u8, BURST_DURATION, burst_duration);
        PUT(u8, FTMS_PER_BURST, ftms_per_burst);
        PUTOPT(s32, RSSI_AVG, rssi_avg);
        PUTOPT(s32, RSSI_SPREAD, rssi_spread);
        /*
         * Note the negation: a zero return from nl80211_put_sta_rate() is
         * treated as failure here, unlike the nla_put*() helpers above.
         */
        if (res->ftm.tx_rate_valid &&
            !nl80211_put_sta_rate(msg, &res->ftm.tx_rate,
                                  NL80211_PMSR_FTM_RESP_ATTR_TX_RATE))
                goto error;
        if (res->ftm.rx_rate_valid &&
            !nl80211_put_sta_rate(msg, &res->ftm.rx_rate,
                                  NL80211_PMSR_FTM_RESP_ATTR_RX_RATE))
                goto error;
        PUTOPT_U64(RTT_AVG, rtt_avg);
        PUTOPT_U64(RTT_VARIANCE, rtt_variance);
        PUTOPT_U64(RTT_SPREAD, rtt_spread);
        PUTOPT_U64(DIST_AVG, dist_avg);
        PUTOPT_U64(DIST_VARIANCE, dist_variance);
        PUTOPT_U64(DIST_SPREAD, dist_spread);
        /* LCI and civic location are only added when present and non-empty. */
        if (res->ftm.lci && res->ftm.lci_len &&
            nla_put(msg, NL80211_PMSR_FTM_RESP_ATTR_LCI,
                    res->ftm.lci_len, res->ftm.lci))
                goto error;
        if (res->ftm.civicloc && res->ftm.civicloc_len &&
            nla_put(msg, NL80211_PMSR_FTM_RESP_ATTR_CIVICLOC,
                    res->ftm.civicloc_len, res->ftm.civicloc))
                goto error;
/* The helper macros are local to this function. */
#undef PUT
#undef PUTOPT
#undef PUT_U64
#undef PUTOPT_U64

        return 0;
error:
        return -ENOSPC;
}

/*
 * Serialize one measurement result into @msg.
 *
 * The nesting produced is:
 *   NL80211_ATTR_PEER_MEASUREMENTS
 *     NL80211_PMSR_ATTR_PEERS
 *       1 (single peer entry)
 *         NL80211_PMSR_PEER_ATTR_ADDR
 *         NL80211_PMSR_PEER_ATTR_RESP
 *           status, host time, optional AP TSF, optional FINAL flag
 *           NL80211_PMSR_RESP_ATTR_DATA
 *             <res->type> holding the type-specific attributes
 *
 * Returns 0 on success, -ENOSPC when anything fails to fit.  Open nests are
 * not cancelled on failure.
 */
static int nl80211_pmsr_send_result(struct sk_buff *msg,
                                    struct cfg80211_pmsr_result *res)
{
        struct nlattr *nest_pmsr, *nest_peers, *nest_peer;
        struct nlattr *nest_resp, *nest_data, *nest_type;

        nest_pmsr = nla_nest_start_noflag(msg, NL80211_ATTR_PEER_MEASUREMENTS);
        if (!nest_pmsr)
                return -ENOSPC;

        nest_peers = nla_nest_start_noflag(msg, NL80211_PMSR_ATTR_PEERS);
        if (!nest_peers)
                return -ENOSPC;

        nest_peer = nla_nest_start_noflag(msg, 1);
        if (!nest_peer)
                return -ENOSPC;

        if (nla_put(msg, NL80211_PMSR_PEER_ATTR_ADDR, ETH_ALEN, res->addr))
                return -ENOSPC;

        nest_resp = nla_nest_start_noflag(msg, NL80211_PMSR_PEER_ATTR_RESP);
        if (!nest_resp)
                return -ENOSPC;

        if (nla_put_u32(msg, NL80211_PMSR_RESP_ATTR_STATUS, res->status))
                return -ENOSPC;

        if (nla_put_u64_64bit(msg, NL80211_PMSR_RESP_ATTR_HOST_TIME,
                              res->host_time, NL80211_PMSR_RESP_ATTR_PAD))
                return -ENOSPC;

        if (res->ap_tsf_valid) {
                if (nla_put_u64_64bit(msg, NL80211_PMSR_RESP_ATTR_AP_TSF,
                                      res->ap_tsf,
                                      NL80211_PMSR_RESP_ATTR_PAD))
                        return -ENOSPC;
        }

        if (res->final) {
                if (nla_put_flag(msg, NL80211_PMSR_RESP_ATTR_FINAL))
                        return -ENOSPC;
        }

        nest_data = nla_nest_start_noflag(msg, NL80211_PMSR_RESP_ATTR_DATA);
        if (!nest_data)
                return -ENOSPC;

        nest_type = nla_nest_start_noflag(msg, res->type);
        if (!nest_type)
                return -ENOSPC;

        /* FTM is the only type handled; any other type only warns. */
        if (res->type == NL80211_PMSR_TYPE_FTM) {
                if (nl80211_pmsr_send_ftm_res(msg, res))
                        return -ENOSPC;
        } else {
                WARN_ON(1);
        }

        /* Close the nests innermost first. */
        nla_nest_end(msg, nest_type);
        nla_nest_end(msg, nest_data);
        nla_nest_end(msg, nest_resp);
        nla_nest_end(msg, nest_peer);
        nla_nest_end(msg, nest_peers);
        nla_nest_end(msg, nest_pmsr);

        return 0;
}

/*
 * Report one peer measurement result to the requesting netlink socket.
 *
 * Builds an NL80211_CMD_PEER_MEASUREMENT_RESULT message carrying the wiphy
 * index, wdev id, request cookie and the serialized @result, and unicasts it
 * to req->nl_portid.  @gfp is used for the message allocation.  Nothing is
 * returned: when the message cannot be allocated or built, the result is
 * dropped (with a rate-limited error log if the result did not fit).
 */
void cfg80211_pmsr_report(struct wireless_dev *wdev,
                          struct cfg80211_pmsr_request *req,
                          struct cfg80211_pmsr_result *result,
                          gfp_t gfp)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct sk_buff *msg;
        void *hdr;
        int err;

        trace_cfg80211_pmsr_report(wdev->wiphy, wdev, req->cookie,
                                   result->addr);

        /*
         * Currently, only variable items are LCI and civic location,
         * both of which are reasonably short so we don't need to
         * worry about them here for the allocation.
         */
        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
        if (!msg)
                return;

        hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_PEER_MEASUREMENT_RESULT);
        if (!hdr)
                goto free;

        if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
            nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
                              NL80211_ATTR_PAD))
                goto free;

        if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, req->cookie,
                              NL80211_ATTR_PAD))
                goto free;

        err = nl80211_pmsr_send_result(msg, result);
        if (err) {
                /*
                 * Terminate the line explicitly so that a later pr_cont()
                 * from unrelated code cannot be appended to this record.
                 */
                pr_err_ratelimited("peer measurement result: message didn't fit!\n");
                goto free;
        }

        genlmsg_end(msg, hdr);
        genlmsg_unicast(wiphy_net(wdev->wiphy), msg, req->nl_portid);
        return;
free:
        nlmsg_free(msg);
}
EXPORT_SYMBOL_GPL(cfg80211_pmsr_report);

static void cfg80211_pmsr_process_abort(struct wireless_dev *wdev)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct cfg80211_pmsr_request *req, *tmp;
        LIST_HEAD(free_list);

        lockdep_assert_wiphy(wdev->wiphy);

        spin_lock_bh(&wdev->pmsr_lock);
        list_for_each_entry_safe(req, tmp, &wdev->pmsr_list, list) {
                if (req->nl_portid)
                        continue;
                list_move_tail(&req->list, &free_list);
        }
        spin_unlock_bh(&wdev->pmsr_lock);

        list_for_each_entry_safe(req, tmp, &free_list, list) {
                rdev_abort_pmsr(rdev, wdev, req);

                kfree(req);
        }
}

/*
 * Work handler for wdev->pmsr_free_wk: takes the wiphy lock and aborts the
 * requests that have been marked with a zero nl_portid.
 */
void cfg80211_pmsr_free_wk(struct work_struct *work)
{
        struct wireless_dev *wdev;

        wdev = container_of(work, struct wireless_dev, pmsr_free_wk);

        wiphy_lock(wdev->wiphy);
        cfg80211_pmsr_process_abort(wdev);
        wiphy_unlock(wdev->wiphy);
}

void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev)
{
        struct cfg80211_pmsr_request *req;
        bool found = false;

        spin_lock_bh(&wdev->pmsr_lock);
        list_for_each_entry(req, &wdev->pmsr_list, list) {
                found = true;
                req->nl_portid = 0;
        }
        spin_unlock_bh(&wdev->pmsr_lock);

        if (found)
                cfg80211_pmsr_process_abort(wdev);

        WARN_ON(!list_empty(&wdev->pmsr_list));
}

/*
 * Release all measurement requests of @wdev owned by netlink port @portid.
 *
 * Matching requests get their nl_portid cleared and wdev->pmsr_free_wk is
 * scheduled for each match; the abort and free are left to that work item.
 */
void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid)
{
        struct cfg80211_pmsr_request *entry;

        spin_lock_bh(&wdev->pmsr_lock);
        list_for_each_entry(entry, &wdev->pmsr_list, list) {
                if (entry->nl_portid != portid)
                        continue;

                entry->nl_portid = 0;
                schedule_work(&wdev->pmsr_free_wk);
        }
        spin_unlock_bh(&wdev->pmsr_lock);
}































































































































    1 






    1 






    1 























































































































































































































































































































































































































































































































































































































































































































    1 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM writeback

#if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_WRITEBACK_H

#include <linux/tracepoint.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>

/*
 * Render an inode state/flags bitmask as a "|"-separated list of I_* flag
 * names using __print_flags().  Used by the TP_printk() of the dirty inode
 * event class below for both its state and flags fields.
 */
#define show_inode_state(state)                                        \
        __print_flags(state, "|",                                \
                {I_DIRTY_SYNC,                "I_DIRTY_SYNC"},        \
                {I_DIRTY_DATASYNC,        "I_DIRTY_DATASYNC"},        \
                {I_DIRTY_PAGES,                "I_DIRTY_PAGES"},        \
                {I_NEW,                        "I_NEW"},                \
                {I_WILL_FREE,                "I_WILL_FREE"},                \
                {I_FREEING,                "I_FREEING"},                \
                {I_CLEAR,                "I_CLEAR"},                \
                {I_SYNC,                "I_SYNC"},                \
                {I_DIRTY_TIME,                "I_DIRTY_TIME"},        \
                {I_REFERENCED,                "I_REFERENCED"}                \
        )

/*
 * enums need to be exported to user space
 *
 * First pass: EM() and EMe() both expand to TRACE_DEFINE_ENUM(a), so the
 * bare WB_WORK_REASON invocation below emits one TRACE_DEFINE_ENUM() per
 * writeback reason.  The string argument is ignored in this pass.
 */
#undef EM
#undef EMe
#define EM(a,b)         TRACE_DEFINE_ENUM(a);
#define EMe(a,b)        TRACE_DEFINE_ENUM(a);

/*
 * Table of writeback reasons and their printable names.  EMe() marks the
 * last entry, which gets no trailing comma in the second pass.
 */
#define WB_WORK_REASON                                                        \
        EM( WB_REASON_BACKGROUND,                "background")                \
        EM( WB_REASON_VMSCAN,                        "vmscan")                \
        EM( WB_REASON_SYNC,                        "sync")                        \
        EM( WB_REASON_PERIODIC,                        "periodic")                \
        EM( WB_REASON_LAPTOP_TIMER,                "laptop_timer")                \
        EM( WB_REASON_FS_FREE_SPACE,                "fs_free_space")        \
        EM( WB_REASON_FORKER_THREAD,                "forker_thread")        \
        EMe(WB_REASON_FOREIGN_FLUSH,                "foreign_flush")

WB_WORK_REASON

/*
 * Now redefine the EM() and EMe() macros to map the enums to the strings
 * that will be printed in the output.
 */
#undef EM
#undef EMe
#define EM(a,b)                { a, b },
#define EMe(a,b)        { a, b }

struct wb_writeback_work;

/*
 * Event class for per-folio writeback events.
 *
 * Records the backing device name (32 bytes, padded; taken from the bdi of
 * mapping->host, or from bdi_dev_name(NULL) when @mapping is NULL), the host
 * inode number (0 when @mapping or its host is NULL) and folio->index.
 */
DECLARE_EVENT_CLASS(writeback_folio_template,

        TP_PROTO(struct folio *folio, struct address_space *mapping),

        TP_ARGS(folio, mapping),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(pgoff_t, index)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(mapping ? inode_to_bdi(mapping->host) :
                                         NULL), 32);
                __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0;
                __entry->index = folio->index;
        ),

        TP_printk("bdi %s: ino=%lu index=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                __entry->index
        )
);

/* writeback_dirty_folio: instance of the writeback_folio_template class. */
DEFINE_EVENT(writeback_folio_template, writeback_dirty_folio,

        TP_PROTO(struct folio *folio, struct address_space *mapping),

        TP_ARGS(folio, mapping)
);

/* folio_wait_writeback: instance of the writeback_folio_template class. */
DEFINE_EVENT(writeback_folio_template, folio_wait_writeback,

        TP_PROTO(struct folio *folio, struct address_space *mapping),

        TP_ARGS(folio, mapping)
);

/*
 * Event class for inode dirtying events.
 *
 * Records the backing device name of the inode's bdi (32 bytes, padded),
 * inode->i_ino, inode->i_state and the @flags argument.  Both state and
 * flags are printed symbolically through show_inode_state().
 */
DECLARE_EVENT_CLASS(writeback_dirty_inode_template,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, flags)
        ),

        TP_fast_assign(
                struct backing_dev_info *bdi = inode_to_bdi(inode);

                /* may be called for files on pseudo FSes w/ unregistered bdi */
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->flags                = flags;
        ),

        TP_printk("bdi %s: ino=%lu state=%s flags=%s",
                __entry->name,
                (unsigned long)__entry->ino,
                show_inode_state(__entry->state),
                show_inode_state(__entry->flags)
        )
);

/* writeback_mark_inode_dirty: instance of writeback_dirty_inode_template. */
DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

/* writeback_dirty_inode_start: instance of writeback_dirty_inode_template. */
DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

/* writeback_dirty_inode: instance of writeback_dirty_inode_template. */
DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

#ifdef CREATE_TRACE_POINTS
#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * CONFIG_CGROUP_WRITEBACK variant: return the inode number of the cgroup
 * behind the writeback context's memcg css.
 */
static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb)
{
        return cgroup_ino(wb->memcg_css->cgroup);
}

/*
 * CONFIG_CGROUP_WRITEBACK variant: return the cgroup inode number of the
 * writeback context attached to @wbc, or 1 when @wbc has none attached.
 */
static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc)
{
        if (!wbc->wb)
                return 1;

        return __trace_wb_assign_cgroup(wbc->wb);
}
#else        /* CONFIG_CGROUP_WRITEBACK */

/* Variant without CONFIG_CGROUP_WRITEBACK: always reports cgroup inode 1. */
static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb)
{
        return 1;
}

/* Variant without CONFIG_CGROUP_WRITEBACK: always reports cgroup inode 1. */
static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc)
{
        return 1;
}

#endif        /* CONFIG_CGROUP_WRITEBACK */
#endif        /* CREATE_TRACE_POINTS */

#ifdef CONFIG_CGROUP_WRITEBACK
/*
 * inode_foreign_history: records the bdi name of the inode, its inode
 * number, the cgroup inode number derived from @wbc and the @history value
 * (printed in hex).
 */
TRACE_EVENT(inode_foreign_history,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc,
                 unsigned int history),

        TP_ARGS(inode, wbc, history),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                ino)
                __field(ino_t,                cgroup_ino)
                __field(unsigned int,        history)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
                __entry->history        = history;
        ),

        TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x",
                __entry->name,
                (unsigned long)__entry->ino,
                (unsigned long)__entry->cgroup_ino,
                __entry->history
        )
);

/*
 * inode_switch_wbs: records the bdi name (taken from @old_wb), the inode
 * number and the cgroup inode numbers of both @old_wb and @new_wb.
 */
TRACE_EVENT(inode_switch_wbs,

        TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,
                 struct bdi_writeback *new_wb),

        TP_ARGS(inode, old_wb, new_wb),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                ino)
                __field(ino_t,                old_cgroup_ino)
                __field(ino_t,                new_cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32);
                __entry->ino                = inode->i_ino;
                __entry->old_cgroup_ino        = __trace_wb_assign_cgroup(old_wb);
                __entry->new_cgroup_ino        = __trace_wb_assign_cgroup(new_wb);
        ),

        TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                (unsigned long)__entry->old_cgroup_ino,
                (unsigned long)__entry->new_cgroup_ino
        )
);

/*
 * track_foreign_dirty: records the bdi name and id of @wb, the inode number
 * of the folio's mapping host (0 when the folio has no mapping), the id of
 * wb's memcg css, wb's cgroup inode number and the cgroup inode number of
 * the folio's own memcg.
 */
TRACE_EVENT(track_foreign_dirty,

        TP_PROTO(struct folio *folio, struct bdi_writeback *wb),

        TP_ARGS(folio, wb),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(u64,                bdi_id)
                __field(ino_t,                ino)
                __field(unsigned int,        memcg_id)
                __field(ino_t,                cgroup_ino)
                __field(ino_t,                page_cgroup_ino)
        ),

        TP_fast_assign(
                struct address_space *mapping = folio_mapping(folio);
                struct inode *inode = mapping ? mapping->host : NULL;

                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->bdi_id                = wb->bdi->id;
                __entry->ino                = inode ? inode->i_ino : 0;
                __entry->memcg_id        = wb->memcg_css->id;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
                __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup);
        ),

        TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu",
                __entry->name,
                __entry->bdi_id,
                (unsigned long)__entry->ino,
                __entry->memcg_id,
                (unsigned long)__entry->cgroup_ino,
                (unsigned long)__entry->page_cgroup_ino
        )
);

/*
 * flush_foreign: records the bdi name and cgroup inode number of @wb along
 * with the foreign bdi id and foreign memcg id passed in by the caller.
 */
TRACE_EVENT(flush_foreign,

        TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id,
                 unsigned int frn_memcg_id),

        TP_ARGS(wb, frn_bdi_id, frn_memcg_id),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                cgroup_ino)
                __field(unsigned int,        frn_bdi_id)
                __field(unsigned int,        frn_memcg_id)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
                __entry->frn_bdi_id        = frn_bdi_id;
                __entry->frn_memcg_id        = frn_memcg_id;
        ),

        TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u",
                __entry->name,
                (unsigned long)__entry->cgroup_ino,
                __entry->frn_bdi_id,
                __entry->frn_memcg_id
        )
);
#endif

/*
 * Event class for inode write events: records the bdi name of the inode,
 * its inode number, wbc->sync_mode and the cgroup inode number derived
 * from @wbc.
 */
DECLARE_EVENT_CLASS(writeback_write_inode_template,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(int, sync_mode)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->sync_mode        = wbc->sync_mode;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                __entry->sync_mode,
                (unsigned long)__entry->cgroup_ino
        )
);

/* writeback_write_inode_start: instance of writeback_write_inode_template. */
DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc)
);

/* writeback_write_inode: instance of writeback_write_inode_template. */
DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc)
);

/*
 * Event class describing one writeback work item on a bdi_writeback.
 *
 * Records the bdi name, the work's nr_pages, the superblock device number
 * (0 when work->sb is NULL), sync_mode, the for_kupdate / range_cyclic /
 * for_background flags, the reason (printed by name via WB_WORK_REASON)
 * and the cgroup inode number of @wb.
 */
DECLARE_EVENT_CLASS(writeback_work_class,
        TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work),
        TP_ARGS(wb, work),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(long, nr_pages)
                __field(dev_t, sb_dev)
                __field(int, sync_mode)
                __field(int, for_kupdate)
                __field(int, range_cyclic)
                __field(int, for_background)
                __field(int, reason)
                __field(ino_t, cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->nr_pages = work->nr_pages;
                __entry->sb_dev = work->sb ? work->sb->s_dev : 0;
                __entry->sync_mode = work->sync_mode;
                __entry->for_kupdate = work->for_kupdate;
                __entry->range_cyclic = work->range_cyclic;
                __entry->for_background        = work->for_background;
                __entry->reason = work->reason;
                __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
                  "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu",
                  __entry->name,
                  MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
                  __entry->nr_pages,
                  __entry->sync_mode,
                  __entry->for_kupdate,
                  __entry->range_cyclic,
                  __entry->for_background,
                  __print_symbolic(__entry->reason, WB_WORK_REASON),
                  (unsigned long)__entry->cgroup_ino
        )
);
/*
 * Shorthand that defines one event of writeback_work_class called @name,
 * followed by the five events built from it.
 */
#define DEFINE_WRITEBACK_WORK_EVENT(name) \
DEFINE_EVENT(writeback_work_class, name, \
        TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \
        TP_ARGS(wb, work))
DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
DEFINE_WRITEBACK_WORK_EVENT(writeback_written);
DEFINE_WRITEBACK_WORK_EVENT(writeback_wait);

/*
 * writeback_pages_written: records a single page count (@pages_written) and
 * prints it as a bare signed number.
 */
TRACE_EVENT(writeback_pages_written,
        TP_PROTO(long pages_written),
        TP_ARGS(pages_written),
        TP_STRUCT__entry(
                __field(long,                pages)
        ),
        TP_fast_assign(
                __entry->pages                = pages_written;
        ),
        TP_printk("%ld", __entry->pages)
);

/*
 * Event class for tracepoints that only identify a bdi_writeback: stores the
 * backing device name (32-byte array) and the value returned by
 * __trace_wb_assign_cgroup(wb).
 */
DECLARE_EVENT_CLASS(writeback_class,
        TP_PROTO(struct bdi_writeback *wb),
        TP_ARGS(wb),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->cgroup_ino
        )
);
/*
 * Shorthand that defines one event of writeback_class called @name; used
 * below for writeback_wake_background.
 */
#define DEFINE_WRITEBACK_EVENT(name) \
DEFINE_EVENT(writeback_class, name, \
        TP_PROTO(struct bdi_writeback *wb), \
        TP_ARGS(wb))

DEFINE_WRITEBACK_EVENT(writeback_wake_background);

/*
 * writeback_bdi_register: records only the device name of @bdi, copied into
 * a 32-byte array.
 */
TRACE_EVENT(writeback_bdi_register,
        TP_PROTO(struct backing_dev_info *bdi),
        TP_ARGS(bdi),
        TP_STRUCT__entry(
                __array(char, name, 32)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
        ),
        TP_printk("bdi %s",
                __entry->name
        )
);

/*
 * Event class for tracepoints that dump a writeback_control.
 *
 * Recorded per event:
 *   name          - bdi_dev_name(bdi), copied into a 32-byte array
 *   nr_to_write, pages_skipped, sync_mode, for_kupdate, for_background,
 *   for_reclaim, range_cyclic
 *                 - copied verbatim from @wbc
 *   range_start, range_end
 *                 - wbc->range_start / wbc->range_end cast to long and
 *                   printed in hex
 *   cgroup_ino    - value returned by __trace_wbc_assign_cgroup(wbc)
 */
DECLARE_EVENT_CLASS(wbc_class,
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
        TP_ARGS(wbc, bdi),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(long, nr_to_write)
                __field(long, pages_skipped)
                __field(int, sync_mode)
                __field(int, for_kupdate)
                __field(int, for_background)
                __field(int, for_reclaim)
                __field(int, range_cyclic)
                __field(long, range_start)
                __field(long, range_end)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
                __entry->nr_to_write        = wbc->nr_to_write;
                __entry->pages_skipped        = wbc->pages_skipped;
                __entry->sync_mode        = wbc->sync_mode;
                __entry->for_kupdate        = wbc->for_kupdate;
                __entry->for_background        = wbc->for_background;
                __entry->for_reclaim        = wbc->for_reclaim;
                __entry->range_cyclic        = wbc->range_cyclic;
                __entry->range_start        = (long)wbc->range_start;
                __entry->range_end        = (long)wbc->range_end;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
                "bgrd=%d reclm=%d cyclic=%d "
                "start=0x%lx end=0x%lx cgroup_ino=%lu",
                __entry->name,
                __entry->nr_to_write,
                __entry->pages_skipped,
                __entry->sync_mode,
                __entry->for_kupdate,
                __entry->for_background,
                __entry->for_reclaim,
                __entry->range_cyclic,
                __entry->range_start,
                __entry->range_end,
                (unsigned long)__entry->cgroup_ino
        )
);

/*
 * Shorthand that defines one event of wbc_class called @name; used below for
 * wbc_writepage.
 */
#define DEFINE_WBC_EVENT(name) \
DEFINE_EVENT(wbc_class, name, \
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \
        TP_ARGS(wbc, bdi))
DEFINE_WBC_EVENT(wbc_writepage);

/*
 * writeback_queue_io: records the outcome of queueing inodes for I/O.
 *
 * Recorded per event:
 *   name       - bdi_dev_name(wb->bdi), copied into a 32-byte array
 *   older      - @dirtied_before as passed in (jiffies)
 *   age        - (jiffies - @dirtied_before) converted to milliseconds
 *   moved      - @moved as passed in (printed as "enqueue")
 *   reason     - work->reason, printed through WB_WORK_REASON
 *   cgroup_ino - value returned by __trace_wb_assign_cgroup(wb)
 */
TRACE_EVENT(writeback_queue_io,
        TP_PROTO(struct bdi_writeback *wb,
                 struct wb_writeback_work *work,
                 unsigned long dirtied_before,
                 int moved),
        TP_ARGS(wb, work, dirtied_before, moved),
        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(unsigned long,        older)
                __field(long,                age)
                __field(int,                moved)
                __field(int,                reason)
                __field(ino_t,                cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->older        = dirtied_before;
                __entry->age        = (jiffies - dirtied_before) * 1000 / HZ;
                __entry->moved        = moved;
                __entry->reason        = work->reason;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu",
                __entry->name,
                __entry->older,        /* dirtied_before in jiffies */
                __entry->age,        /* dirtied_before in relative milliseconds */
                __entry->moved,
                __print_symbolic(__entry->reason, WB_WORK_REASON),
                (unsigned long)__entry->cgroup_ino
        )
);

/*
 * global_dirty_state: snapshot of the global dirty/writeback counters.
 *
 * The two thresholds are supplied by the caller; the remaining fields are
 * sampled when the event is recorded: global_node_page_state() for
 * NR_FILE_DIRTY, NR_WRITEBACK, NR_DIRTIED and NR_WRITTEN, and
 * global_wb_domain.dirty_limit for the limit.
 */
TRACE_EVENT(global_dirty_state,

        TP_PROTO(unsigned long background_thresh,
                 unsigned long dirty_thresh
        ),

        TP_ARGS(background_thresh,
                dirty_thresh
        ),

        TP_STRUCT__entry(
                __field(unsigned long,        nr_dirty)
                __field(unsigned long,        nr_writeback)
                __field(unsigned long,        background_thresh)
                __field(unsigned long,        dirty_thresh)
                __field(unsigned long,        dirty_limit)
                __field(unsigned long,        nr_dirtied)
                __field(unsigned long,        nr_written)
        ),

        TP_fast_assign(
                __entry->nr_dirty        = global_node_page_state(NR_FILE_DIRTY);
                __entry->nr_writeback        = global_node_page_state(NR_WRITEBACK);
                __entry->nr_dirtied        = global_node_page_state(NR_DIRTIED);
                __entry->nr_written        = global_node_page_state(NR_WRITTEN);
                __entry->background_thresh = background_thresh;
                __entry->dirty_thresh        = dirty_thresh;
                __entry->dirty_limit        = global_wb_domain.dirty_limit;
        ),

        TP_printk("dirty=%lu writeback=%lu "
                  "bg_thresh=%lu thresh=%lu limit=%lu "
                  "dirtied=%lu written=%lu",
                  __entry->nr_dirty,
                  __entry->nr_writeback,
                  __entry->background_thresh,
                  __entry->dirty_thresh,
                  __entry->dirty_limit,
                  __entry->nr_dirtied,
                  __entry->nr_written
        )
);

/*
 * Scale @x by 2^(PAGE_SHIFT - 10). Presumably converts a page-based rate to
 * units of 1024 bytes (assumes PAGE_SHIFT is log2 of the page size and is
 * >= 10) -- confirm against the callers below.
 */
#define KBps(x)                        ((x) << (PAGE_SHIFT - 10))

/*
 * bdi_dirty_ratelimit: records the bandwidth and ratelimit figures of one
 * bdi_writeback.
 *
 * Every numeric field except cgroup_ino is passed through KBps() before it
 * is stored. write_bw, avg_write_bw, dirty_ratelimit and
 * balanced_dirty_ratelimit come from @wb; dirty_rate and task_ratelimit are
 * the caller-supplied arguments.
 */
TRACE_EVENT(bdi_dirty_ratelimit,

        TP_PROTO(struct bdi_writeback *wb,
                 unsigned long dirty_rate,
                 unsigned long task_ratelimit),

        TP_ARGS(wb, dirty_rate, task_ratelimit),

        TP_STRUCT__entry(
                __array(char,                bdi, 32)
                __field(unsigned long,        write_bw)
                __field(unsigned long,        avg_write_bw)
                __field(unsigned long,        dirty_rate)
                __field(unsigned long,        dirty_ratelimit)
                __field(unsigned long,        task_ratelimit)
                __field(unsigned long,        balanced_dirty_ratelimit)
                __field(ino_t,                cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);
                __entry->write_bw        = KBps(wb->write_bandwidth);
                __entry->avg_write_bw        = KBps(wb->avg_write_bandwidth);
                __entry->dirty_rate        = KBps(dirty_rate);
                __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit);
                __entry->task_ratelimit        = KBps(task_ratelimit);
                __entry->balanced_dirty_ratelimit =
                                        KBps(wb->balanced_dirty_ratelimit);
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),

        TP_printk("bdi %s: "
                  "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
                  "balanced_dirty_ratelimit=%lu cgroup_ino=%lu",
                  __entry->bdi,
                  __entry->write_bw,                /* write bandwidth */
                  __entry->avg_write_bw,        /* avg write bandwidth */
                  __entry->dirty_rate,                /* bdi dirty rate */
                  __entry->dirty_ratelimit,        /* base ratelimit */
                  __entry->task_ratelimit, /* ratelimit with position control */
                  __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
                  (unsigned long)__entry->cgroup_ino
        )
);

/*
 * balance_dirty_pages: records the state seen by one dirty-throttling
 * decision.
 *
 * Derived values computed at record time:
 *   freerun      = (thresh + bg_thresh) / 2            (local, not stored)
 *   limit        = global_wb_domain.dirty_limit
 *   setpoint     = (limit + freerun) / 2
 *   bdi_setpoint = setpoint * bdi_thresh / (thresh + 1)
 *                  (the +1 keeps the divisor non-zero)
 *   dirty_ratelimit, task_ratelimit are passed through KBps()
 *   period, pause, paused, think are converted from jiffies to
 *                  milliseconds with "* 1000 / HZ"
 *   think        is 0 when current->dirty_paused_when is 0
 *   dirtied_pause is read from current->nr_dirtied_pause
 *
 * Note that @dirtied is an unsigned long argument stored in an unsigned int
 * field.
 */
TRACE_EVENT(balance_dirty_pages,

        TP_PROTO(struct bdi_writeback *wb,
                 unsigned long thresh,
                 unsigned long bg_thresh,
                 unsigned long dirty,
                 unsigned long bdi_thresh,
                 unsigned long bdi_dirty,
                 unsigned long dirty_ratelimit,
                 unsigned long task_ratelimit,
                 unsigned long dirtied,
                 unsigned long period,
                 long pause,
                 unsigned long start_time),

        TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
                dirty_ratelimit, task_ratelimit,
                dirtied, period, pause, start_time),

        TP_STRUCT__entry(
                __array(         char,        bdi, 32)
                __field(unsigned long,        limit)
                __field(unsigned long,        setpoint)
                __field(unsigned long,        dirty)
                __field(unsigned long,        bdi_setpoint)
                __field(unsigned long,        bdi_dirty)
                __field(unsigned long,        dirty_ratelimit)
                __field(unsigned long,        task_ratelimit)
                __field(unsigned int,        dirtied)
                __field(unsigned int,        dirtied_pause)
                __field(unsigned long,        paused)
                __field(         long,        pause)
                __field(unsigned long,        period)
                __field(         long,        think)
                __field(ino_t,                cgroup_ino)
        ),

        TP_fast_assign(
                unsigned long freerun = (thresh + bg_thresh) / 2;
                strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);

                __entry->limit                = global_wb_domain.dirty_limit;
                __entry->setpoint        = (global_wb_domain.dirty_limit +
                                                freerun) / 2;
                __entry->dirty                = dirty;
                __entry->bdi_setpoint        = __entry->setpoint *
                                                bdi_thresh / (thresh + 1);
                __entry->bdi_dirty        = bdi_dirty;
                __entry->dirty_ratelimit = KBps(dirty_ratelimit);
                __entry->task_ratelimit        = KBps(task_ratelimit);
                __entry->dirtied        = dirtied;
                __entry->dirtied_pause        = current->nr_dirtied_pause;
                __entry->think                = current->dirty_paused_when == 0 ? 0 :
                         (long)(jiffies - current->dirty_paused_when) * 1000/HZ;
                __entry->period                = period * 1000 / HZ;
                __entry->pause                = pause * 1000 / HZ;
                __entry->paused                = (jiffies - start_time) * 1000 / HZ;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),


        TP_printk("bdi %s: "
                  "limit=%lu setpoint=%lu dirty=%lu "
                  "bdi_setpoint=%lu bdi_dirty=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
                  "dirtied=%u dirtied_pause=%u "
                  "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu",
                  __entry->bdi,
                  __entry->limit,
                  __entry->setpoint,
                  __entry->dirty,
                  __entry->bdi_setpoint,
                  __entry->bdi_dirty,
                  __entry->dirty_ratelimit,
                  __entry->task_ratelimit,
                  __entry->dirtied,
                  __entry->dirtied_pause,
                  __entry->paused,        /* ms */
                  __entry->pause,        /* ms */
                  __entry->period,        /* ms */
                  __entry->think,        /* ms */
                  (unsigned long)__entry->cgroup_ino
          )
);

/*
 * writeback_sb_inodes_requeue: records an inode's number, i_state and
 * dirtied_when together with its backing device name and the value returned
 * by __trace_wb_assign_cgroup(inode_to_wb(inode)).
 *
 * NOTE(review): "age" is not stored in the entry; it is computed inside
 * TP_printk from jiffies, so it presumably reflects the time at which the
 * record is formatted rather than when it was captured -- confirm.
 */
TRACE_EVENT(writeback_sb_inodes_requeue,

        TP_PROTO(struct inode *inode),
        TP_ARGS(inode),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, dirtied_when)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->dirtied_when        = inode->dirtied_when;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(inode_to_wb(inode));
        ),

        TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->ino,
                  show_inode_state(__entry->state),
                  __entry->dirtied_when,
                  (jiffies - __entry->dirtied_when) / HZ,
                  (unsigned long)__entry->cgroup_ino
        )
);

/*
 * Event class for tracepoints describing writeback of a single inode.
 *
 * Recorded per event:
 *   name            - bdi_dev_name(inode_to_bdi(inode)), 32-byte array
 *   ino, state, dirtied_when
 *                   - copied from @inode
 *   writeback_index - inode->i_mapping->writeback_index
 *   nr_to_write     - the @nr_to_write argument
 *   wrote           - @nr_to_write minus wbc->nr_to_write
 *   cgroup_ino      - value returned by __trace_wbc_assign_cgroup(wbc)
 *
 * NOTE(review): "age" is computed inside TP_printk from jiffies rather than
 * stored, so it presumably reflects formatting time -- confirm.
 */
DECLARE_EVENT_CLASS(writeback_single_inode_template,

        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write
        ),

        TP_ARGS(inode, wbc, nr_to_write),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, dirtied_when)
                __field(unsigned long, writeback_index)
                __field(long, nr_to_write)
                __field(unsigned long, wrote)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->dirtied_when        = inode->dirtied_when;
                __entry->writeback_index = inode->i_mapping->writeback_index;
                __entry->nr_to_write        = nr_to_write;
                __entry->wrote                = nr_to_write - wbc->nr_to_write;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
                  "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->ino,
                  show_inode_state(__entry->state),
                  __entry->dirtied_when,
                  (jiffies - __entry->dirtied_when) / HZ,
                  __entry->writeback_index,
                  __entry->nr_to_write,
                  __entry->wrote,
                  (unsigned long)__entry->cgroup_ino
        )
);

/* writeback_single_inode_start: instance of writeback_single_inode_template. */
DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start,
        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write),
        TP_ARGS(inode, wbc, nr_to_write)
);

/* writeback_single_inode: instance of writeback_single_inode_template. */
DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write),
        TP_ARGS(inode, wbc, nr_to_write)
);

/*
 * Event class for tracepoints that describe a single inode: stores the
 * superblock device number (inode->i_sb->s_dev), inode number, i_state,
 * i_mode (as __u16, printed in octal) and dirtied_when.
 */
DECLARE_EVENT_CLASS(writeback_inode_template,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(unsigned long,        state                        )
                __field(        __u16, mode                        )
                __field(unsigned long, dirtied_when                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->state        = inode->i_state;
                __entry->mode        = inode->i_mode;
                __entry->dirtied_when = inode->dirtied_when;
        ),

        TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long)__entry->ino, __entry->dirtied_when,
                  show_inode_state(__entry->state), __entry->mode)
);

/* writeback_lazytime: instance of writeback_inode_template. */
DEFINE_EVENT(writeback_inode_template, writeback_lazytime,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/* writeback_lazytime_iput: instance of writeback_inode_template. */
DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/* writeback_dirty_inode_enqueue: instance of writeback_inode_template. */
DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * Inode writeback list tracking.
 */

/* sb_mark_inode_writeback: instance of writeback_inode_template. */
DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode)
);

/* sb_clear_inode_writeback: instance of writeback_inode_template. */
DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode)
);

#endif /* _TRACE_WRITEBACK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
































































    1 















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SYNC_CORE_H
#define _ASM_X86_SYNC_CORE_H

#include <linux/preempt.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
#include <asm/special_insns.h>

#ifdef CONFIG_X86_32
/*
 * 32-bit variant: build an IRET frame by hand (EFLAGS, the current %cs and
 * the address of local label 1) and execute IRET, which resumes at label 1
 * immediately after the instruction.
 *
 * NOTE(review): ASM_CALL_CONSTRAINT is defined elsewhere; presumably it ties
 * the asm to the stack pointer because the sequence pushes onto the stack --
 * confirm.
 */
static inline void iret_to_self(void)
{
        asm volatile (
                "pushfl\n\t"
                "pushl %%cs\n\t"
                "pushl $1f\n\t"
                "iret\n\t"
                "1:"
                : ASM_CALL_CONSTRAINT : : "memory");
}
#else
/*
 * 64-bit variant: the IRETQ frame additionally carries SS and RSP.
 *
 * Frame construction, in push order:
 *   - %ss, staged through the scratch register @tmp
 *   - %rsp, then 8 is added to the stored copy ("addq $8, (%rsp)") to
 *     compensate for the %ss slot that had already been pushed
 *   - RFLAGS
 *   - %cs, staged through @tmp
 *   - the address of local label 1
 * IRETQ then resumes at label 1.
 *
 * @tmp is an early-clobber output; "cc" is listed because ADDQ modifies the
 * flags.
 */
static inline void iret_to_self(void)
{
        unsigned int tmp;

        asm volatile (
                "mov %%ss, %0\n\t"
                "pushq %q0\n\t"
                "pushq %%rsp\n\t"
                "addq $8, (%%rsp)\n\t"
                "pushfq\n\t"
                "mov %%cs, %0\n\t"
                "pushq %q0\n\t"
                "pushq $1f\n\t"
                "iretq\n\t"
                "1:"
                : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory");
}
#endif /* CONFIG_X86_32 */

/*
 * This function forces the icache and prefetched instruction stream to
 * catch up with reality in two very specific cases:
 *
 *  a) Text was modified using one virtual address and is about to be executed
 *     from the same physical page at a different virtual address.
 *
 *  b) Text was modified on a different CPU, may subsequently be
 *     executed on this CPU, and you want to make sure the new version
 *     gets executed.  This generally means you're calling this in an IPI.
 *
 * If you're calling this for a different reason, you're probably doing
 * it wrong.
 *
 * Like all of Linux's memory ordering operations, this is a
 * compiler barrier as well.
 */
static inline void sync_core(void)
{
        if (static_cpu_has(X86_FEATURE_SERIALIZE)) {
                /*
                 * SERIALIZE is the most straightforward way to do this,
                 * but not every CPU has it.
                 */
                serialize();
        } else {
                /*
                 * On all other processors there are quite a few ways to do
                 * this.  IRET-to-self is nice because it works on every
                 * CPU, at any CPL (so it's compatible with
                 * paravirtualization), and it never exits to a hypervisor.
                 * The only downsides are that it's a bit slow (it seems to
                 * be a bit more than 2x slower than the fastest options)
                 * and that it unmasks NMIs.  The "push %cs" is needed,
                 * because in paravirtual environments __KERNEL_CS may not
                 * be a valid CS value when we do IRET directly.
                 *
                 * Should NMI unmasking or performance ever become a
                 * problem, the next best option appears to be MOV-to-CR2
                 * and an unconditional jump.  That sequence also works on
                 * all CPUs, but it will fault at CPL3 (i.e. Xen PV).
                 *
                 * CPUID is the conventional way, but it's nasty: it doesn't
                 * exist on some 486-like CPUs, and it usually exits to a
                 * hypervisor.
                 */
                iret_to_self();
        }
}

/*
 * Ensure that a core serializing instruction is issued before returning
 * to user-mode. x86 implements return to user-space through sysexit,
 * sysretl, and sysretq, which are not core serializing.
 */
static inline void sync_core_before_usermode(void)
{
        /*
         * With PTI, we unconditionally serialize before running user code,
         * so there is nothing to do in that case.
         *
         * Otherwise, even if we're in an interrupt, we might reschedule
         * before returning, in which case we could switch to a different
         * thread in the same mm and return using SYSRET or SYSEXIT.
         * Instead of trying to keep track of our need to sync the core,
         * just sync right away.
         */
        if (!static_cpu_has(X86_FEATURE_PTI))
                sync_core();
}

#endif /* _ASM_X86_SYNC_CORE_H */











    1 






    1 






























































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
// SPDX-License-Identifier: GPL-2.0
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/if_vlan.h>
#include <linux/netpoll.h>
#include <linux/export.h>
#include <net/gro.h>
#include "vlan.h"

/*
 * Redirect a VLAN-tagged skb to the VLAN device registered for its tag.
 *
 * @skbp: in/out pointer to the skb. It is rewritten whenever the skb is
 *        replaced (skb_share_check(), vlan_insert_inner_tag()) and is set to
 *        NULL when the skb is freed here or one of those helpers returns
 *        NULL.
 *
 * Returns true when the skb now belongs to a VLAN device (skb->dev,
 * priority and per-CPU rx statistics updated, hwaccel tag cleared).
 * Returns false when no matching VLAN device exists (skb untouched) or
 * when the skb was dropped or could not be processed.
 */
bool vlan_do_receive(struct sk_buff **skbp)
{
        struct sk_buff *skb = *skbp;
        __be16 vlan_proto = skb->vlan_proto;
        u16 vlan_id = skb_vlan_tag_get_id(skb);
        struct net_device *vlan_dev;
        struct vlan_pcpu_stats *rx_stats;

        /* No VLAN device for this (proto, id): leave the skb alone. */
        vlan_dev = vlan_find_dev(skb->dev, vlan_proto, vlan_id);
        if (!vlan_dev)
                return false;

        /* The skb is modified below; keep *skbp in sync with the result. */
        skb = *skbp = skb_share_check(skb, GFP_ATOMIC);
        if (unlikely(!skb))
                return false;

        /* Drop traffic for a VLAN device that is not up. */
        if (unlikely(!(vlan_dev->flags & IFF_UP))) {
                kfree_skb(skb);
                *skbp = NULL;
                return false;
        }

        skb->dev = vlan_dev;
        if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) {
                /* Our lower layer thinks this is not local, let's make sure.
                 * This allows the VLAN to have a different MAC than the
                 * underlying device, and still route correctly. */
                if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, vlan_dev->dev_addr))
                        skb->pkt_type = PACKET_HOST;
        }

        /*
         * Without VLAN_FLAG_REORDER_HDR (and when the VLAN device is neither
         * a macvlan port nor a bridge port) the tag is written back into the
         * packet data.
         */
        if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR) &&
            !netif_is_macvlan_port(vlan_dev) &&
            !netif_is_bridge_port(vlan_dev)) {
                unsigned int offset = skb->data - skb_mac_header(skb);

                /*
                 * vlan_insert_tag expect skb->data pointing to mac header.
                 * So change skb->data before calling it and change back to
                 * original position later
                 */
                skb_push(skb, offset);
                skb = *skbp = vlan_insert_inner_tag(skb, skb->vlan_proto,
                                                    skb->vlan_tci, skb->mac_len);
                if (!skb)
                        return false;
                /* Skip the original offset plus the newly inserted tag. */
                skb_pull(skb, offset + VLAN_HLEN);
                skb_reset_mac_len(skb);
        }

        /* Priority is derived from the tag, so set it before clearing it. */
        skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci);
        __vlan_hwaccel_clear_tag(skb);

        rx_stats = this_cpu_ptr(vlan_dev_priv(vlan_dev)->vlan_pcpu_stats);

        /* Account the packet in this CPU's rx counters. */
        u64_stats_update_begin(&rx_stats->syncp);
        u64_stats_inc(&rx_stats->rx_packets);
        u64_stats_add(&rx_stats->rx_bytes, skb->len);
        if (skb->pkt_type == PACKET_MULTICAST)
                u64_stats_inc(&rx_stats->rx_multicast);
        u64_stats_update_end(&rx_stats->syncp);

        return true;
}

/*
 * Look up the VLAN device for (@vlan_proto, @vlan_id) on @dev, falling back
 * to @dev's master upper device when @dev itself carries no vlan_info.
 * Returns NULL when nothing is found.
 *
 * Must be invoked with rcu_read_lock.
 */
struct net_device *__vlan_find_dev_deep_rcu(struct net_device *dev,
                                        __be16 vlan_proto, u16 vlan_id)
{
        struct vlan_info *info = rcu_dereference(dev->vlan_info);
        struct net_device *master;

        if (info)
                return vlan_group_get_device(&info->grp, vlan_proto, vlan_id);

        /*
         * Lower devices of master uppers (bonding, team) do not have
         * grp assigned to themselves. Grp is assigned to upper device
         * instead.
         */
        master = netdev_master_upper_dev_get_rcu(dev);
        if (!master)
                return NULL;

        return __vlan_find_dev_deep_rcu(master, vlan_proto, vlan_id);
}
EXPORT_SYMBOL(__vlan_find_dev_deep_rcu);

/*
 * Follow the real_dev chain starting at the VLAN device @dev until a device
 * that is not itself a VLAN device is reached, and return that device.
 */
struct net_device *vlan_dev_real_dev(const struct net_device *dev)
{
        const struct net_device *cur = dev;
        struct net_device *lower;

        do {
                lower = vlan_dev_priv(cur)->real_dev;
                cur = lower;
        } while (is_vlan_dev(lower));

        return lower;
}
EXPORT_SYMBOL(vlan_dev_real_dev);

/* Return the VLAN id stored in the private data of VLAN device @dev. */
u16 vlan_dev_vlan_id(const struct net_device *dev)
{
        u16 vid = vlan_dev_priv(dev)->vlan_id;

        return vid;
}
EXPORT_SYMBOL(vlan_dev_vlan_id);

/* Return the VLAN protocol stored in the private data of VLAN device @dev. */
__be16 vlan_dev_vlan_proto(const struct net_device *dev)
{
        __be16 proto = vlan_dev_priv(dev)->vlan_proto;

        return proto;
}
EXPORT_SYMBOL(vlan_dev_vlan_proto);

/*
 * vlan info and vid list
 */

/*
 * Free every per-protocol, per-part device array of @grp. The group
 * structure itself is not freed here.
 */
static void vlan_group_free(struct vlan_group *grp)
{
        int proto_idx;

        for (proto_idx = 0; proto_idx < VLAN_PROTO_NUM; proto_idx++) {
                int part;

                for (part = 0; part < VLAN_GROUP_ARRAY_SPLIT_PARTS; part++)
                        kfree(grp->vlan_devices_arrays[proto_idx][part]);
        }
}

/* Release the device arrays of the embedded group, then @vlan_info itself. */
static void vlan_info_free(struct vlan_info *vlan_info)
{
        struct vlan_group *grp = &vlan_info->grp;

        vlan_group_free(grp);
        kfree(vlan_info);
}

/* RCU callback: free the vlan_info that embeds @rcu. */
static void vlan_info_rcu_free(struct rcu_head *rcu)
{
        struct vlan_info *vlan_info = container_of(rcu, struct vlan_info, rcu);

        vlan_info_free(vlan_info);
}

/*
 * Allocate a zeroed vlan_info bound to @dev with an empty vid list.
 * Returns NULL on allocation failure.
 */
static struct vlan_info *vlan_info_alloc(struct net_device *dev)
{
        struct vlan_info *vlan_info = kzalloc(sizeof(*vlan_info), GFP_KERNEL);

        if (vlan_info) {
                vlan_info->real_dev = dev;
                INIT_LIST_HEAD(&vlan_info->vid_list);
        }

        return vlan_info;
}

/*
 * One (protocol, VLAN id) entry kept on a vlan_info's vid_list.
 */
struct vlan_vid_info {
        struct list_head list;  /* linkage on vlan_info->vid_list */
        __be16 proto;           /* VLAN protocol, compared against htons(ETH_P_*) values */
        u16 vid;                /* VLAN id */
        int refcount;           /* presumably the number of users of this vid -- confirm */
};

/*
 * Report whether @dev advertises the VLAN filter feature matching @proto:
 * NETIF_F_HW_VLAN_CTAG_FILTER for 802.1Q, NETIF_F_HW_VLAN_STAG_FILTER for
 * 802.1ad. Any other protocol yields false.
 */
static bool vlan_hw_filter_capable(const struct net_device *dev, __be16 proto)
{
        if (proto == htons(ETH_P_8021Q))
                return !!(dev->features & NETIF_F_HW_VLAN_CTAG_FILTER);

        if (proto == htons(ETH_P_8021AD))
                return !!(dev->features & NETIF_F_HW_VLAN_STAG_FILTER);

        return false;
}

/*
 * Search @vlan_info's vid list for the entry matching both @proto and @vid.
 * Returns the entry, or NULL when there is none.
 */
static struct vlan_vid_info *vlan_vid_info_get(struct vlan_info *vlan_info,
                                               __be16 proto, u16 vid)
{
        struct vlan_vid_info *entry;

        list_for_each_entry(entry, &vlan_info->vid_list, list) {
                if (entry->proto != proto)
                        continue;
                if (entry->vid != vid)
                        continue;
                return entry;
        }

        return NULL;
}

/*
 * Allocate a zeroed vid entry for (@proto, @vid). Returns NULL on allocation
 * failure. The entry is not linked into any list here.
 */
static struct vlan_vid_info *vlan_vid_info_alloc(__be16 proto, u16 vid)
{
        struct vlan_vid_info *entry = kzalloc(sizeof(*entry), GFP_KERNEL);

        if (entry) {
                entry->proto = proto;
                entry->vid = vid;
        }

        return entry;
}

/*
 * Ask the driver of @dev to add (@proto, @vid) to its receive filter.
 *
 * Returns 0 without calling the driver when @dev has no hardware filter
 * feature for @proto, -ENODEV when the device is not present, and the
 * result of ->ndo_vlan_rx_add_vid() otherwise.
 */
static int vlan_add_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid)
{
        if (!vlan_hw_filter_capable(dev, proto))
                return 0;

        if (!netif_device_present(dev))
                return -ENODEV;

        return dev->netdev_ops->ndo_vlan_rx_add_vid(dev, proto, vid);
}

/*
 * Ask the driver of @dev to remove (@proto, @vid) from its receive filter.
 *
 * Returns 0 without calling the driver when @dev has no hardware filter
 * feature for @proto, -ENODEV when the device is not present, and the
 * result of ->ndo_vlan_rx_kill_vid() otherwise.
 */
static int vlan_kill_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid)
{
        if (!vlan_hw_filter_capable(dev, proto))
                return 0;

        if (!netif_device_present(dev))
                return -ENODEV;

        return dev->netdev_ops->ndo_vlan_rx_kill_vid(dev, proto, vid);
}

/*
 * Invoke @action for every entry on @dev's VID list.
 *
 * For each entry the device registered in the group for that
 * (protocol, VID) is looked up with vlan_group_get_device() and handed to
 * @action together with the VID and @arg.  Iteration stops at the first
 * non-zero return value, which is passed back to the caller.
 *
 * Must be called with RTNL held.  Returns 0 when @dev has no vlan_info or
 * when every callback returned 0.
 */
int vlan_for_each(struct net_device *dev,
                  int (*action)(struct net_device *dev, int vid, void *arg),
                  void *arg)
{
        struct vlan_vid_info *entry;
        struct vlan_info *info;

        ASSERT_RTNL();

        info = rtnl_dereference(dev->vlan_info);
        if (!info)
                return 0;

        list_for_each_entry(entry, &info->vid_list, list) {
                struct net_device *vdev;
                int ret;

                vdev = vlan_group_get_device(&info->grp, entry->proto,
                                             entry->vid);
                ret = action(vdev, entry->vid, arg);
                if (ret)
                        return ret;
        }

        return 0;
}
EXPORT_SYMBOL(vlan_for_each);

/*
 * Push every VID of @vlan_info that uses @proto into the receive filter
 * of the underlying device.
 *
 * If adding one VID fails, the VIDs of the same protocol that precede it
 * on the list are removed from the filter again and the error is
 * returned.  Returns 0 on success.
 */
int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto)
{
        struct net_device *real_dev = vlan_info->real_dev;
        struct vlan_vid_info *entry;
        int err;

        list_for_each_entry(entry, &vlan_info->vid_list, list) {
                if (entry->proto != proto)
                        continue;

                err = vlan_add_rx_filter_info(real_dev, proto, entry->vid);
                if (err)
                        goto rollback;
        }

        return 0;

rollback:
        /* Walk backwards from the entry that failed, excluding it. */
        list_for_each_entry_continue_reverse(entry,
                                             &vlan_info->vid_list, list) {
                if (entry->proto != proto)
                        continue;

                vlan_kill_rx_filter_info(real_dev, proto, entry->vid);
        }

        return err;
}
EXPORT_SYMBOL(vlan_filter_push_vids);

/*
 * Remove every VID of @vlan_info that uses @proto from the receive filter
 * of the underlying device.  Errors from the individual removals are
 * ignored.
 */
void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto)
{
        struct vlan_vid_info *entry;

        list_for_each_entry(entry, &vlan_info->vid_list, list) {
                if (entry->proto != proto)
                        continue;

                vlan_kill_rx_filter_info(vlan_info->real_dev, entry->proto,
                                         entry->vid);
        }
}
EXPORT_SYMBOL(vlan_filter_drop_vids);

/*
 * Create a new VID entry for (@proto, @vid), program it into the receive
 * filter of the underlying device and link it into @vlan_info.
 *
 * On success the entry is stored in *@pvid_info (its refcount is still
 * zero) and 0 is returned.  On failure nothing is linked, *@pvid_info is
 * left untouched, and -ENOMEM or the filter error is returned.
 */
static int __vlan_vid_add(struct vlan_info *vlan_info, __be16 proto, u16 vid,
                          struct vlan_vid_info **pvid_info)
{
        struct net_device *real_dev = vlan_info->real_dev;
        struct vlan_vid_info *entry;
        int err;

        entry = vlan_vid_info_alloc(proto, vid);
        if (!entry)
                return -ENOMEM;

        err = vlan_add_rx_filter_info(real_dev, proto, vid);
        if (err)
                goto free_entry;

        list_add(&entry->list, &vlan_info->vid_list);
        vlan_info->nr_vids++;
        *pvid_info = entry;
        return 0;

free_entry:
        kfree(entry);
        return err;
}

/*
 * Take a reference on (@proto, @vid) for @dev.
 *
 * A vlan_info is allocated when @dev has none, and a VID entry is created
 * (and pushed to the device's receive filter) when the pair is not yet
 * known.  A freshly allocated vlan_info is only published on @dev once
 * the whole operation has succeeded; on failure it is freed again.
 *
 * Must be called with RTNL held.  Returns 0 on success, -ENOMEM or the
 * error from __vlan_vid_add() otherwise.
 */
int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid)
{
        struct vlan_vid_info *entry;
        struct vlan_info *info;
        bool fresh_info;
        int err;

        ASSERT_RTNL();

        info = rtnl_dereference(dev->vlan_info);
        fresh_info = !info;
        if (fresh_info) {
                info = vlan_info_alloc(dev);
                if (!info)
                        return -ENOMEM;
        }

        entry = vlan_vid_info_get(info, proto, vid);
        if (!entry) {
                err = __vlan_vid_add(info, proto, vid, &entry);
                if (err) {
                        /* Only discard what this call allocated. */
                        if (fresh_info)
                                kfree(info);
                        return err;
                }
        }
        entry->refcount++;

        if (fresh_info)
                rcu_assign_pointer(dev->vlan_info, info);

        return 0;
}
EXPORT_SYMBOL(vlan_vid_add);

/*
 * Remove @vid_info from the receive filter of the underlying device,
 * unlink it from @vlan_info and free it.
 *
 * A failure to remove the VID from the filter is only reported with a
 * warning (and not at all while the device is unregistering); the entry
 * is dropped regardless.
 */
static void __vlan_vid_del(struct vlan_info *vlan_info,
                           struct vlan_vid_info *vid_info)
{
        struct net_device *dev = vlan_info->real_dev;
        __be16 proto = vid_info->proto;
        u16 vid = vid_info->vid;
        int err;

        err = vlan_kill_rx_filter_info(dev, proto, vid);
        /*
         * proto is kept in network byte order; convert it so the message
         * shows the ethertype (e.g. 8100) on little-endian hosts as well.
         */
        if (err && dev->reg_state != NETREG_UNREGISTERING)
                netdev_warn(dev, "failed to kill vid %04x/%d\n",
                            ntohs(proto), vid);

        list_del(&vid_info->list);
        kfree(vid_info);
        vlan_info->nr_vids--;
}

/*
 * Drop one reference on (@proto, @vid) for @dev.
 *
 * When the last reference goes away the VID entry is deleted, and when
 * that leaves the vlan_info without any VID, the vlan_info is detached
 * from @dev and freed after an RCU grace period.  Unknown pairs and
 * devices without a vlan_info are silently ignored.
 *
 * Must be called with RTNL held.
 */
void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid)
{
        struct vlan_vid_info *entry;
        struct vlan_info *info;

        ASSERT_RTNL();

        info = rtnl_dereference(dev->vlan_info);
        if (!info)
                return;

        entry = vlan_vid_info_get(info, proto, vid);
        if (!entry)
                return;

        if (--entry->refcount != 0)
                return;

        __vlan_vid_del(info, entry);
        if (info->nr_vids != 0)
                return;

        RCU_INIT_POINTER(dev->vlan_info, NULL);
        call_rcu(&info->rcu, vlan_info_rcu_free);
}
EXPORT_SYMBOL(vlan_vid_del);

/*
 * Add to @dev every VID known on @by_dev whose protocol @by_dev can
 * filter in hardware.
 *
 * If one addition fails, the VIDs added for the preceding list entries
 * are removed from @dev again and the error is returned.
 *
 * Must be called with RTNL held.  Returns 0 on success or when @by_dev
 * has no vlan_info.
 */
int vlan_vids_add_by_dev(struct net_device *dev,
                         const struct net_device *by_dev)
{
        struct vlan_vid_info *entry;
        struct vlan_info *by_info;
        int err;

        ASSERT_RTNL();

        by_info = rtnl_dereference(by_dev->vlan_info);
        if (!by_info)
                return 0;

        list_for_each_entry(entry, &by_info->vid_list, list) {
                if (vlan_hw_filter_capable(by_dev, entry->proto)) {
                        err = vlan_vid_add(dev, entry->proto, entry->vid);
                        if (err)
                                goto rollback;
                }
        }

        return 0;

rollback:
        /* Walk backwards from the entry that failed, excluding it. */
        list_for_each_entry_continue_reverse(entry, &by_info->vid_list,
                                             list) {
                if (vlan_hw_filter_capable(by_dev, entry->proto))
                        vlan_vid_del(dev, entry->proto, entry->vid);
        }

        return err;
}
EXPORT_SYMBOL(vlan_vids_add_by_dev);

/*
 * Drop from @dev one reference on every VID known on @by_dev whose
 * protocol @by_dev can filter in hardware.
 *
 * Must be called with RTNL held.  Does nothing when @by_dev has no
 * vlan_info.
 */
void vlan_vids_del_by_dev(struct net_device *dev,
                          const struct net_device *by_dev)
{
        struct vlan_vid_info *entry;
        struct vlan_info *by_info;

        ASSERT_RTNL();

        by_info = rtnl_dereference(by_dev->vlan_info);
        if (!by_info)
                return;

        list_for_each_entry(entry, &by_info->vid_list, list) {
                if (vlan_hw_filter_capable(by_dev, entry->proto))
                        vlan_vid_del(dev, entry->proto, entry->vid);
        }
}
EXPORT_SYMBOL(vlan_vids_del_by_dev);

/*
 * Report whether @dev has a vlan_info whose group counts at least one
 * VLAN device (grp.nr_vlan_devs is non-zero).
 *
 * Must be called with RTNL held.
 */
bool vlan_uses_dev(const struct net_device *dev)
{
        struct vlan_info *info;

        ASSERT_RTNL();

        info = rtnl_dereference(dev->vlan_info);

        return info && info->grp.nr_vlan_devs != 0;
}
EXPORT_SYMBOL(vlan_uses_dev);

/*
 * GRO receive handler registered for the 802.1Q and 802.1AD ethertypes
 * (see vlan_packet_offloads).
 *
 * Reads the VLAN header at the current GRO offset of @skb, clears
 * same_flow on every packet in @head whose VLAN header differs, then
 * pulls the VLAN header and passes @skb on to the gro_receive handler of
 * the encapsulated protocol.
 *
 * Returns whatever the inner handler returned, or NULL when the header
 * could not be obtained or no handler exists for the inner protocol; in
 * those two cases skb_gro_flush_final() is called with flush set to 1.
 */
static struct sk_buff *vlan_gro_receive(struct list_head *head,
                                        struct sk_buff *skb)
{
        const struct packet_offload *ptype;
        unsigned int hlen, off_vlan;
        struct sk_buff *pp = NULL;
        struct vlan_hdr *vhdr;
        struct sk_buff *p;
        __be16 type;
        int flush = 1;

        /* hlen is the offset just past the VLAN header. */
        off_vlan = skb_gro_offset(skb);
        hlen = off_vlan + sizeof(*vhdr);
        vhdr = skb_gro_header(skb, hlen, off_vlan);
        if (unlikely(!vhdr))
                goto out;

        /* Record where the encapsulated header starts for this encap level. */
        NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = hlen;

        type = vhdr->h_vlan_encapsulated_proto;

        ptype = gro_find_receive_by_type(type);
        if (!ptype)
                goto out;

        flush = 0;

        list_for_each_entry(p, head, list) {
                struct vlan_hdr *vhdr2;

                /* Already ruled out by an outer layer; nothing to compare. */
                if (!NAPI_GRO_CB(p)->same_flow)
                        continue;

                /*
                 * A non-zero result from compare_vlan_header() takes the
                 * held packet out of the flow (presumably it signals a
                 * mismatch - confirm against its definition).
                 */
                vhdr2 = (struct vlan_hdr *)(p->data + off_vlan);
                if (compare_vlan_header(vhdr, vhdr2))
                        NAPI_GRO_CB(p)->same_flow = 0;
        }

        /* Step over the VLAN header before handing off to the inner handler. */
        skb_gro_pull(skb, sizeof(*vhdr));
        skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr));

        pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive,
                                            ipv6_gro_receive, inet_gro_receive,
                                            head, skb);

out:
        skb_gro_flush_final(skb, pp, flush);

        return pp;
}

/*
 * GRO complete handler registered for the 802.1Q and 802.1AD ethertypes.
 *
 * Reads the encapsulated protocol from the VLAN header located @nhoff
 * bytes into @skb and forwards to that protocol's gro_complete handler,
 * with the offset advanced past the VLAN header.
 *
 * Returns the inner handler's result, or -ENOENT when no handler is
 * registered for the encapsulated protocol.
 */
static int vlan_gro_complete(struct sk_buff *skb, int nhoff)
{
        struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff);
        __be16 type = vhdr->h_vlan_encapsulated_proto;
        struct packet_offload *ptype;

        ptype = gro_find_complete_by_type(type);
        if (!ptype)
                return -ENOENT;

        return INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
                                  ipv6_gro_complete, inet_gro_complete,
                                  skb, nhoff + sizeof(*vhdr));
}

/*
 * Offload descriptors registered by vlan_offload_init(): one for each
 * VLAN ethertype, both using the same GRO receive/complete handlers and
 * the same priority.
 */
static struct packet_offload vlan_packet_offloads[] __read_mostly = {
        /* 802.1Q */
        {
                .type = cpu_to_be16(ETH_P_8021Q),
                .priority = 10,
                .callbacks = {
                        .gro_receive = vlan_gro_receive,
                        .gro_complete = vlan_gro_complete,
                },
        },
        /* 802.1AD */
        {
                .type = cpu_to_be16(ETH_P_8021AD),
                .priority = 10,
                .callbacks = {
                        .gro_receive = vlan_gro_receive,
                        .gro_complete = vlan_gro_complete,
                },
        },
};

/*
 * Register every entry of vlan_packet_offloads with dev_add_offload().
 * Runs as an fs_initcall and always returns 0.
 */
static int __init vlan_offload_init(void)
{
        struct packet_offload *po = vlan_packet_offloads;
        struct packet_offload *end = po + ARRAY_SIZE(vlan_packet_offloads);

        for (; po < end; po++)
                dev_add_offload(po);

        return 0;
}

fs_initcall(vlan_offload_init);




































































































































































































































































































































































    8 






    3 







    6 









    8 









    2 
    2 




    2 



























































    3 
















    1 

    2 


    3 













    1 
    2 




    1 
    1 






































































    3 














    3 

    1 

    2 


    1 
    2 



    3 










    2 





    2 




    2 

    2 




    2 





    1 





    1 
    1 



    1 
    1 
    2 





    1 



















    1 





    1 


    1 
























    1 





    1 


    1 












    1 



    1 
    1 


    1 






    1 













    1 



    2 












    2 



































































































































    1 



























    2 

    2 
    2 






    2 

















    1 






    1 

    1 



    1 
    1 







    1 






    1 




    1 
    1 


    1 







    1 





    1 




    1 

    1 



    1 
























    1 


    1 



















    1 


    1 





    1 





    1 





    1 




    1 














    1 




    1 








































































































    1 



















    1 



























    1 

























    1 








    1 
    1 













    1 

    1 























    1 













    1 
























































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/read_write.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/sched/xacct.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/fs.h>
#include "internal.h"

#include <linux/uaccess.h>
#include <asm/unistd.h>

/*
 * Exported file_operations table built only from generic helpers:
 * seeking, ->read_iter, a read-only mmap and splice reads.  No write-side
 * methods are set.
 */
const struct file_operations generic_ro_fops = {
        .llseek                = generic_file_llseek,
        .read_iter        = generic_file_read_iter,
        .mmap                = generic_file_readonly_mmap,
        .splice_read        = filemap_splice_read,
};

EXPORT_SYMBOL(generic_ro_fops);

/* True when @file has FMODE_UNSIGNED_OFFSET set in its f_mode. */
static inline bool unsigned_offsets(struct file *file)
{
        return (file->f_mode & FMODE_UNSIGNED_OFFSET) != 0;
}

/**
 * vfs_setpos - validate and store a new file offset for lseek
 * @file:        file structure in question
 * @offset:        file offset to seek to
 * @maxsize:        maximum file size
 *
 * Low-level helper for filesystems.  @offset is rejected when it exceeds
 * @maxsize, or when it is negative and @file does not use unsigned
 * offsets.  Otherwise it becomes the new file position; f_pos and
 * f_version are only written when the position actually changes.
 *
 * Return: @offset on success, -EINVAL if @offset is not acceptable.
 */
loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
{
        if (offset > maxsize)
                return -EINVAL;
        if (offset < 0 && !unsigned_offsets(file))
                return -EINVAL;

        /* Nothing to store when the position is unchanged. */
        if (file->f_pos == offset)
                return offset;

        file->f_pos = offset;
        file->f_version = 0;
        return offset;
}
EXPORT_SYMBOL(vfs_setpos);

/**
 * generic_file_llseek_size - generic llseek implementation for regular files
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 * @maxsize:        max size of this file in file system
 * @eof:        offset used for SEEK_END position
 *
 * This is a variant of generic_file_llseek that allows passing in a custom
 * maximum file size and a custom EOF position, for e.g. hashed directories
 *
 * Synchronization:
 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
 * read/writes behave like SEEK_SET against seeks.
 *
 * Return: the resulting file offset on success, -ENXIO for SEEK_DATA or
 * SEEK_HOLE when @offset is at or beyond @eof, or -EINVAL from
 * vfs_setpos() when the resulting offset is not acceptable.
 */
loff_t
generic_file_llseek_size(struct file *file, loff_t offset, int whence,
                loff_t maxsize, loff_t eof)
{
        /*
         * SEEK_SET has no case of its own: like any whence value not
         * listed below, it leaves @offset untouched and goes straight to
         * the final vfs_setpos().
         */
        switch (whence) {
        case SEEK_END:
                offset += eof;
                break;
        case SEEK_CUR:
                /*
                 * Here we special-case the lseek(fd, 0, SEEK_CUR)
                 * position-querying operation.  Avoid rewriting the "same"
                 * f_pos value back to the file because a concurrent read(),
                 * write() or lseek() might have altered it
                 */
                if (offset == 0)
                        return file->f_pos;
                /*
                 * f_lock protects against read/modify/write race with other
                 * SEEK_CURs. Note that parallel writes and reads behave
                 * like SEEK_SET.
                 */
                spin_lock(&file->f_lock);
                offset = vfs_setpos(file, file->f_pos + offset, maxsize);
                spin_unlock(&file->f_lock);
                return offset;
        case SEEK_DATA:
                /*
                 * In the generic case the entire file is data, so as long as
                 * offset isn't at the end of the file then the offset is data.
                 * The unsigned comparison also rejects negative offsets.
                 */
                if ((unsigned long long)offset >= eof)
                        return -ENXIO;
                break;
        case SEEK_HOLE:
                /*
                 * There is a virtual hole at the end of the file, so as long as
                 * offset isn't i_size or larger, return i_size.
                 * The unsigned comparison also rejects negative offsets.
                 */
                if ((unsigned long long)offset >= eof)
                        return -ENXIO;
                offset = eof;
                break;
        }

        return vfs_setpos(file, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);

/**
 * generic_file_llseek - generic llseek implementation for regular files
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 *
 * Generic ->llseek implementation usable by all normal local
 * filesystems.  It delegates to generic_file_llseek_size(), using the
 * superblock's s_maxbytes as the maximum offset and the current inode
 * size as the end-of-file position.
 */
loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file->f_mapping->host;
        loff_t maxsize = inode->i_sb->s_maxbytes;
        loff_t eof = i_size_read(inode);

        return generic_file_llseek_size(file, offset, whence, maxsize, eof);
}
EXPORT_SYMBOL(generic_file_llseek);

/**
 * fixed_size_llseek - llseek implementation for fixed-sized devices
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 * @size:        size of the file
 *
 * Only SEEK_SET, SEEK_CUR and SEEK_END are accepted; every other @whence
 * yields -EINVAL.  @size serves both as the maximum offset and as the
 * end-of-file position.
 */
loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
{
        if (whence != SEEK_SET && whence != SEEK_CUR && whence != SEEK_END)
                return -EINVAL;

        return generic_file_llseek_size(file, offset, whence, size, size);
}
EXPORT_SYMBOL(fixed_size_llseek);

/**
 * no_seek_end_llseek - llseek implementation that rejects SEEK_END
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 *
 * Only SEEK_SET and SEEK_CUR are accepted; every other @whence yields
 * -EINVAL.  Offsets up to OFFSET_MAX are allowed.
 */
loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
{
        if (whence != SEEK_SET && whence != SEEK_CUR)
                return -EINVAL;

        return generic_file_llseek_size(file, offset, whence, OFFSET_MAX, 0);
}
EXPORT_SYMBOL(no_seek_end_llseek);

/**
 * no_seek_end_llseek_size - size-limited llseek that rejects SEEK_END
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 * @size:        maximal offset allowed
 *
 * Only SEEK_SET and SEEK_CUR are accepted; every other @whence yields
 * -EINVAL.  Resulting offsets larger than @size are rejected.
 */
loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
{
        if (whence != SEEK_SET && whence != SEEK_CUR)
                return -EINVAL;

        return generic_file_llseek_size(file, offset, whence, size, 0);
}
EXPORT_SYMBOL(no_seek_end_llseek_size);

/**
 * noop_llseek - No Operation Performed llseek implementation
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 *
 * ->llseek implementation for the rare case where userspace expects a
 * seek to succeed although the (device) file cannot actually seek.  Use
 * it instead of falling back to the default ->llseek implementation.
 * @offset and @whence are ignored and the file position is not modified.
 *
 * Return: the current file position.
 */
loff_t noop_llseek(struct file *file, loff_t offset, int whence)
{
        loff_t pos = file->f_pos;

        return pos;
}
EXPORT_SYMBOL(noop_llseek);

/**
 * default_llseek - llseek implementation serialized by the inode lock
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 *
 * Computes the new position for @whence with the inode lock held and
 * stores it in the file.  SEEK_SET, like any @whence value without a
 * case below, uses @offset as given.  The file size is always sampled
 * with i_size_read().
 *
 * Return: the resulting file offset on success, -ENXIO for SEEK_DATA or
 * SEEK_HOLE when @offset is at or beyond the file size, or -EINVAL when
 * the resulting offset is negative and the file does not use unsigned
 * offsets.
 */
loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file_inode(file);
        loff_t retval;
        loff_t size;

        inode_lock(inode);
        switch (whence) {
                case SEEK_END:
                        offset += i_size_read(inode);
                        break;
                case SEEK_CUR:
                        /* Pure position query: report f_pos, store nothing. */
                        if (offset == 0) {
                                retval = file->f_pos;
                                goto out;
                        }
                        offset += file->f_pos;
                        break;
                case SEEK_DATA:
                        /*
                         * In the generic case the entire file is data, so as
                         * long as offset isn't at the end of the file then the
                         * offset is data.
                         */
                        if (offset >= i_size_read(inode)) {
                                retval = -ENXIO;
                                goto out;
                        }
                        break;
                case SEEK_HOLE:
                        /*
                         * There is a virtual hole at the end of the file, so
                         * as long as offset isn't i_size or larger, return
                         * i_size.  Sample the size once so the check and the
                         * result agree.
                         */
                        size = i_size_read(inode);
                        if (offset >= size) {
                                retval = -ENXIO;
                                goto out;
                        }
                        offset = size;
                        break;
        }
        retval = -EINVAL;
        if (offset >= 0 || unsigned_offsets(file)) {
                if (offset != file->f_pos) {
                        file->f_pos = offset;
                        file->f_version = 0;
                }
                retval = offset;
        }
out:
        inode_unlock(inode);
        return retval;
}
EXPORT_SYMBOL(default_llseek);

/*
 * Seek on @file through its ->llseek method.
 *
 * Returns -ESPIPE when the file does not have FMODE_LSEEK set, otherwise
 * whatever ->llseek returns.
 */
loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
        if (file->f_mode & FMODE_LSEEK)
                return file->f_op->llseek(file, offset, whence);

        return -ESPIPE;
}
EXPORT_SYMBOL(vfs_llseek);

/*
 * Common body of the lseek system calls.
 *
 * Returns the new offset, -EBADF when @fd does not refer to an open
 * file, -EINVAL when @whence is larger than SEEK_MAX, -EOVERFLOW when
 * the result does not fit in off_t, or the error from vfs_llseek().
 */
static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
        struct fd f = fdget_pos(fd);
        off_t retval = -EINVAL;
        loff_t res;

        if (!f.file)
                return -EBADF;

        if (whence > SEEK_MAX)
                goto out;

        res = vfs_llseek(f.file, offset, whence);
        retval = res;
        /* LFS: should only happen on 32 bit platforms */
        if ((loff_t)retval != res)
                retval = -EOVERFLOW;
out:
        fdput_pos(f);
        return retval;
}

/* lseek system call: thin wrapper that forwards to ksys_lseek(). */
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
        return ksys_lseek(fd, offset, whence);
}

#ifdef CONFIG_COMPAT
/*
 * Compat lseek system call: takes the offset as compat_off_t and
 * forwards to ksys_lseek().
 */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
        return ksys_lseek(fd, offset, whence);
}
#endif

#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
        defined(__ARCH_WANT_SYS_LLSEEK)
/*
 * llseek system call: seek with an offset passed as two halves.
 *
 * The offset is assembled as (@offset_high << 32) | @offset_low.  On
 * success the resulting position is copied to the user buffer @result
 * and 0 is returned.
 *
 * Returns -EBADF when @fd does not refer to an open file, -EINVAL when
 * @whence is larger than SEEK_MAX, -EFAULT when @result cannot be
 * written, or the negative value returned by vfs_llseek().
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
                unsigned long, offset_low, loff_t __user *, result,
                unsigned int, whence)
{
        int retval;
        struct fd f = fdget_pos(fd);
        loff_t offset;

        if (!f.file)
                return -EBADF;

        retval = -EINVAL;
        if (whence > SEEK_MAX)
                goto out_putf;

        offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
                        whence);

        /* A negative result is returned as is; otherwise report via @result. */
        retval = (int)offset;
        if (offset >= 0) {
                retval = -EFAULT;
                if (!copy_to_user(result, &offset, sizeof(offset)))
                        retval = 0;
        }
out_putf:
        fdput_pos(f);
        return retval;
}
#endif

/*
 * rw_verify_area - sanity and permission checks before a read or write
 * @read_write: READ selects MAY_READ, anything else selects MAY_WRITE
 * @file:       file the access targets
 * @ppos:       starting position, or NULL to skip the position checks
 * @count:      number of bytes requested
 *
 * Returns -EINVAL when @count is negative once viewed as ssize_t, -EINVAL
 * or -EOVERFLOW from the position checks below, a non-zero result of
 * security_file_permission(), or finally the result of
 * fsnotify_file_area_perm().
 */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
        int mask = read_write == READ ? MAY_READ : MAY_WRITE;
        int ret;

        if (unlikely((ssize_t) count < 0))
                return -EINVAL;

        if (ppos) {
                loff_t pos = *ppos;

                if (unlikely(pos < 0)) {
                        /* negative positions are only valid with unsigned offsets */
                        if (!unsigned_offsets(file))
                                return -EINVAL;
                        if (count >= -pos) /* both values are in 0..LLONG_MAX */
                                return -EOVERFLOW;
                } else if (unlikely((loff_t) (pos + count) < 0)) {
                        /* pos + count wrapped into the negative loff_t range */
                        if (!unsigned_offsets(file))
                                return -EINVAL;
                }
        }

        ret = security_file_permission(file, mask);
        if (ret)
                return ret;

        return fsnotify_file_area_perm(file, mask, ppos, count);
}
EXPORT_SYMBOL(rw_verify_area);

/*
 * Synchronous read of @len bytes into the user buffer @buf via ->read_iter.
 *
 * The kiocb starts at *@ppos (or 0 when @ppos is NULL); afterwards the
 * kiocb position is written back through @ppos regardless of the result.
 * ->read_iter must not return -EIOCBQUEUED here.
 */
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
        struct iov_iter to;
        struct kiocb iocb;
        ssize_t nread;

        init_sync_kiocb(&iocb, filp);
        if (ppos)
                iocb.ki_pos = *ppos;
        else
                iocb.ki_pos = 0;
        iov_iter_ubuf(&to, ITER_DEST, buf, len);

        nread = filp->f_op->read_iter(&iocb, &to);
        BUG_ON(nread == -EIOCBQUEUED);
        if (ppos)
                *ppos = iocb.ki_pos;

        return nread;
}

/*
 * Emit a rate-limited warning that the in-kernel operation named by @op is
 * not supported for @file, identifying the current task by pid and comm.
 * Always returns -EINVAL so callers can return the result directly.
 */
static int warn_unsupported(struct file *file, const char *op)
{
        pr_warn_ratelimited(
                "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
                op, file, current->pid, current->comm);
        return -EINVAL;
}

/*
 * __kernel_read - read into a kernel buffer through ->read_iter
 * @file:  file to read from; must have FMODE_READ and FMODE_CAN_READ
 * @buf:   kernel buffer receiving the data
 * @count: bytes requested; clamped to MAX_RW_COUNT
 * @pos:   in/out file position, or NULL to start at 0
 *
 * Does not call rw_verify_area(); kernel_read() does that before calling
 * here.  *@pos is only updated, and access accounting only done, when
 * more than zero bytes were read.  Returns the ->read_iter result or
 * -EINVAL when the file cannot be read this way.
 */
ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
        struct kvec vec = {
                .iov_base       = buf,
                .iov_len        = min_t(size_t, count, MAX_RW_COUNT),
        };
        const struct file_operations *fops;
        struct iov_iter to;
        struct kiocb iocb;
        ssize_t nread;

        if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)) ||
            !(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;

        /*
         * Refuse files without ->read_iter, and also files that wire up
         * both ->read_iter and ->read, as that implies very convoluted
         * semantics.
         */
        fops = file->f_op;
        if (unlikely(!fops->read_iter || fops->read))
                return warn_unsupported(file, "read");

        init_sync_kiocb(&iocb, file);
        if (pos)
                iocb.ki_pos = *pos;
        else
                iocb.ki_pos = 0;
        iov_iter_kvec(&to, ITER_DEST, &vec, 1, vec.iov_len);

        nread = fops->read_iter(&iocb, &to);
        if (nread > 0) {
                if (pos)
                        *pos = iocb.ki_pos;
                fsnotify_access(file);
                add_rchar(current, nread);
        }
        inc_syscr(current);

        return nread;
}

/*
 * kernel_read - verified read into a kernel buffer
 *
 * Runs rw_verify_area() for a READ of @count bytes at @pos and, when that
 * reports no error, hands the request to __kernel_read().
 */
ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
        ssize_t err = rw_verify_area(READ, file, pos, count);

        return err ? err : __kernel_read(file, buf, count, pos);
}
EXPORT_SYMBOL(kernel_read);

/*
 * vfs_read - read from a file into a user buffer
 * @file:  file to read from
 * @buf:   user buffer receiving the data
 * @count: bytes requested; clamped to MAX_RW_COUNT after verification
 * @pos:   file position passed to the underlying method
 *
 * Prefers ->read, falls back to ->read_iter via new_sync_read(), and
 * returns -EINVAL when neither exists.  Other errors: -EBADF without
 * FMODE_READ, -EINVAL without FMODE_CAN_READ, -EFAULT when access_ok()
 * rejects the buffer, or whatever rw_verify_area() reports.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
        ssize_t nread;

        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;
        if (unlikely(!access_ok(buf, count)))
                return -EFAULT;

        nread = rw_verify_area(READ, file, pos, count);
        if (nread)
                return nread;

        count = min_t(size_t, count, MAX_RW_COUNT);

        if (file->f_op->read)
                nread = file->f_op->read(file, buf, count, pos);
        else if (file->f_op->read_iter)
                nread = new_sync_read(file, buf, count, pos);
        else
                nread = -EINVAL;

        /* account only for reads that actually returned data */
        if (nread > 0) {
                fsnotify_access(file);
                add_rchar(current, nread);
        }
        inc_syscr(current);

        return nread;
}

/*
 * Synchronous write of @len bytes from the user buffer @buf via
 * ->write_iter.
 *
 * The kiocb starts at *@ppos (or 0 when @ppos is NULL).  Unlike the read
 * counterpart, the position is written back only when more than zero bytes
 * were written.  ->write_iter must not return -EIOCBQUEUED here.
 */
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
        struct iov_iter from;
        struct kiocb iocb;
        ssize_t written;

        init_sync_kiocb(&iocb, filp);
        if (ppos)
                iocb.ki_pos = *ppos;
        else
                iocb.ki_pos = 0;
        iov_iter_ubuf(&from, ITER_SOURCE, (void __user *)buf, len);

        written = filp->f_op->write_iter(&iocb, &from);
        BUG_ON(written == -EIOCBQUEUED);
        if (ppos && written > 0)
                *ppos = iocb.ki_pos;

        return written;
}

/*
 * __kernel_write_iter - write an iov_iter to a file through ->write_iter
 * @file: file to write to; must have FMODE_WRITE and FMODE_CAN_WRITE
 * @from: source iterator
 * @pos:  in/out file position, or NULL to start at 0
 *
 * Caller is responsible for file_start_write/file_end_write.
 *
 * *@pos is only updated, and modify accounting only done, when more than
 * zero bytes were written.  Returns the ->write_iter result, -EBADF
 * without FMODE_WRITE, or -EINVAL when the file cannot be written this way.
 */
ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
{
        const struct file_operations *fops;
        struct kiocb iocb;
        ssize_t written;

        if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;

        /*
         * Refuse files without ->write_iter, and also files that wire up
         * both ->write_iter and ->write, as that implies very convoluted
         * semantics.
         */
        fops = file->f_op;
        if (unlikely(!fops->write_iter || fops->write))
                return warn_unsupported(file, "write");

        init_sync_kiocb(&iocb, file);
        if (pos)
                iocb.ki_pos = *pos;
        else
                iocb.ki_pos = 0;

        written = fops->write_iter(&iocb, from);
        if (written > 0) {
                if (pos)
                        *pos = iocb.ki_pos;
                fsnotify_modify(file);
                add_wchar(current, written);
        }
        inc_syscw(current);

        return written;
}

/*
 * __kernel_write - write a kernel buffer to a file
 *
 * Wraps @buf (at most MAX_RW_COUNT bytes of it) in a single-segment kvec
 * iterator and passes it to __kernel_write_iter().
 *
 * Caller is responsible for file_start_write/file_end_write.
 */
ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
{
        const size_t len = min_t(size_t, count, MAX_RW_COUNT);
        struct kvec vec = {
                .iov_base       = (void *)buf,
                .iov_len        = len,
        };
        struct iov_iter from;

        iov_iter_kvec(&from, ITER_SOURCE, &vec, 1, vec.iov_len);

        return __kernel_write_iter(file, &from, pos);
}
/*
 * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
 * but autofs is one of the few internal kernel users that actually
 * wants this _and_ can be built as a module. So we need to export
 * this symbol for autofs, even though it really isn't appropriate
 * for any other kernel modules.
 */
EXPORT_SYMBOL_GPL(__kernel_write);

/*
 * kernel_write - verified write of a kernel buffer
 *
 * Runs rw_verify_area() for a WRITE of @count bytes at @pos; when that
 * reports no error, performs __kernel_write() bracketed by
 * file_start_write()/file_end_write().
 */
ssize_t kernel_write(struct file *file, const void *buf, size_t count,
                            loff_t *pos)
{
        ssize_t ret = rw_verify_area(WRITE, file, pos, count);

        if (!ret) {
                file_start_write(file);
                ret = __kernel_write(file, buf, count, pos);
                file_end_write(file);
        }

        return ret;
}
EXPORT_SYMBOL(kernel_write);

/*
 * vfs_write - write from a user buffer to a file
 * @file:  file to write to
 * @buf:   user buffer holding the data
 * @count: bytes requested; clamped to MAX_RW_COUNT after verification
 * @pos:   file position passed to the underlying method
 *
 * Prefers ->write, falls back to ->write_iter via new_sync_write(), and
 * returns -EINVAL when neither exists.  The method call and the
 * accounting run between file_start_write() and file_end_write().
 * Other errors: -EBADF without FMODE_WRITE, -EINVAL without
 * FMODE_CAN_WRITE, -EFAULT when access_ok() rejects the buffer, or
 * whatever rw_verify_area() reports.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
        ssize_t written;

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;
        if (unlikely(!access_ok(buf, count)))
                return -EFAULT;

        written = rw_verify_area(WRITE, file, pos, count);
        if (written)
                return written;

        count = min_t(size_t, count, MAX_RW_COUNT);

        file_start_write(file);

        if (file->f_op->write)
                written = file->f_op->write(file, buf, count, pos);
        else if (file->f_op->write_iter)
                written = new_sync_write(file, buf, count, pos);
        else
                written = -EINVAL;

        /* account only for writes that actually stored data */
        if (written > 0) {
                fsnotify_modify(file);
                add_wchar(current, written);
        }
        inc_syscw(current);

        file_end_write(file);

        return written;
}

/*
 * file_ppos - position pointer to use for @file
 *
 * Returns NULL for stream files (FMODE_STREAM set), otherwise the address
 * of the file's own f_pos.
 */
static inline loff_t *file_ppos(struct file *file)
{
        if (file->f_mode & FMODE_STREAM)
                return NULL;

        return &file->f_pos;
}

/*
 * ksys_read - read(2) implementation on a file descriptor
 *
 * Works on a local copy of the file position (when the file has one, see
 * file_ppos()) and stores it back to f_pos only when vfs_read() did not
 * fail.  Returns -EBADF when @fd does not resolve to a file, otherwise the
 * vfs_read() result.
 */
ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
        struct fd f = fdget_pos(fd);
        loff_t pos, *ppos;
        ssize_t ret;

        if (!f.file)
                return -EBADF;

        ppos = file_ppos(f.file);
        if (ppos) {
                pos = *ppos;
                ppos = &pos;
        }

        ret = vfs_read(f.file, buf, count, ppos);
        if (ppos && ret >= 0)
                f.file->f_pos = pos;

        fdput_pos(f);
        return ret;
}

/* read system call: forwards its arguments unchanged to ksys_read(). */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
        return ksys_read(fd, buf, count);
}

/*
 * ksys_write - write(2) implementation on a file descriptor
 *
 * Works on a local copy of the file position (when the file has one, see
 * file_ppos()) and stores it back to f_pos only when vfs_write() did not
 * fail.  Returns -EBADF when @fd does not resolve to a file, otherwise the
 * vfs_write() result.
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
        struct fd f = fdget_pos(fd);
        loff_t pos, *ppos;
        ssize_t ret;

        if (!f.file)
                return -EBADF;

        ppos = file_ppos(f.file);
        if (ppos) {
                pos = *ppos;
                ppos = &pos;
        }

        ret = vfs_write(f.file, buf, count, ppos);
        if (ppos && ret >= 0)
                f.file->f_pos = pos;

        fdput_pos(f);
        return ret;
}

/* write system call: forwards its arguments unchanged to ksys_write(). */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
                size_t, count)
{
        return ksys_write(fd, buf, count);
}

/*
 * ksys_pread64 - positional read on a file descriptor
 *
 * Reads at the caller-supplied @pos; the file's own f_pos is not touched
 * here.  Returns -EINVAL for a negative @pos, -EBADF when @fd does not
 * resolve to a file, -ESPIPE when the file lacks FMODE_PREAD, otherwise
 * the vfs_read() result.
 */
ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
                     loff_t pos)
{
        struct fd f;
        ssize_t ret;

        if (pos < 0)
                return -EINVAL;

        f = fdget(fd);
        if (!f.file)
                return -EBADF;

        if (f.file->f_mode & FMODE_PREAD)
                ret = vfs_read(f.file, buf, count, &pos);
        else
                ret = -ESPIPE;

        fdput(f);
        return ret;
}

/* pread64 system call: forwards its arguments unchanged to ksys_pread64(). */
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
                        size_t, count, loff_t, pos)
{
        return ksys_pread64(fd, buf, count, pos);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64)
/*
 * Compat pread64 system call: the position is declared through
 * compat_arg_u64_dual() and recombined with compat_arg_u64_glue() before
 * being handed to ksys_pread64().
 */
COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
                       size_t, count, compat_arg_u64_dual(pos))
{
        return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos));
}
#endif

/*
 * ksys_pwrite64 - positional write on a file descriptor
 *
 * Writes at the caller-supplied @pos; the file's own f_pos is not touched
 * here.  Returns -EINVAL for a negative @pos, -EBADF when @fd does not
 * resolve to a file, -ESPIPE when the file lacks FMODE_PWRITE, otherwise
 * the vfs_write() result.
 */
ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
                      size_t count, loff_t pos)
{
        struct fd f;
        ssize_t ret;

        if (pos < 0)
                return -EINVAL;

        f = fdget(fd);
        if (!f.file)
                return -EBADF;

        if (f.file->f_mode & FMODE_PWRITE)
                ret = vfs_write(f.file, buf, count, &pos);
        else
                ret = -ESPIPE;

        fdput(f);
        return ret;
}

/* pwrite64 system call: forwards its arguments unchanged to ksys_pwrite64(). */
SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
                         size_t, count, loff_t, pos)
{
        return ksys_pwrite64(fd, buf, count, pos);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64)
/*
 * Compat pwrite64 system call: the position is declared through
 * compat_arg_u64_dual() and recombined with compat_arg_u64_glue() before
 * being handed to ksys_pwrite64().
 */
COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf,
                       size_t, count, compat_arg_u64_dual(pos))
{
        return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos));
}
#endif

/*
 * Run one synchronous ->read_iter (type == READ) or ->write_iter (any
 * other type) call on @iter.
 *
 * @flags are applied to the kiocb with kiocb_set_rw_flags(); a non-zero
 * result from that is returned immediately.  The kiocb starts at *@ppos
 * (or 0 when @ppos is NULL) and its final position is written back through
 * @ppos.  The method must not return -EIOCBQUEUED.
 */
static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
                loff_t *ppos, int type, rwf_t flags)
{
        struct kiocb iocb;
        ssize_t res;

        init_sync_kiocb(&iocb, filp);
        res = kiocb_set_rw_flags(&iocb, flags);
        if (res)
                return res;

        if (ppos)
                iocb.ki_pos = *ppos;
        else
                iocb.ki_pos = 0;

        res = (type == READ) ? filp->f_op->read_iter(&iocb, iter)
                             : filp->f_op->write_iter(&iocb, iter);
        BUG_ON(res == -EIOCBQUEUED);

        if (ppos)
                *ppos = iocb.ki_pos;

        return res;
}

/*
 * Do vectored I/O by hand: call ->read (type == READ) or ->write once per
 * iterator segment until the iterator is drained.
 *
 * Only RWF_HIPRI is tolerated in @flags; anything else yields -EOPNOTSUPP.
 * Stops early on a short transfer.  On an error the bytes already
 * transferred are returned if there are any, otherwise the error itself.
 */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
                loff_t *ppos, int type, rwf_t flags)
{
        ssize_t total = 0;

        if (flags & ~RWF_HIPRI)
                return -EOPNOTSUPP;

        while (iov_iter_count(iter)) {
                void __user *base = iter_iov_addr(iter);
                size_t seg_len = iter_iov_len(iter);
                ssize_t nr;

                if (type == READ)
                        nr = filp->f_op->read(filp, base, seg_len, ppos);
                else
                        nr = filp->f_op->write(filp, base, seg_len, ppos);

                /* report the error only if nothing was transferred before */
                if (nr < 0)
                        return total ? total : nr;

                total += nr;
                if (nr != seg_len)
                        break;
                iov_iter_advance(iter, nr);
        }

        return total;
}

/*
 * vfs_iocb_iter_read - read into @iter using a caller-provided kiocb
 *
 * Returns -EINVAL without ->read_iter or FMODE_CAN_READ, -EBADF without
 * FMODE_READ, a negative rw_verify_area() result, or the ->read_iter
 * result.  An empty iterator skips the verification and the method call
 * and returns 0.  fsnotify_access() fires for every non-negative result,
 * including the empty case.
 */
ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
                           struct iov_iter *iter)
{
        ssize_t ret = 0;
        size_t len;

        if (!file->f_op->read_iter)
                return -EINVAL;
        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;

        len = iov_iter_count(iter);
        if (len) {
                ret = rw_verify_area(READ, file, &iocb->ki_pos, len);
                if (ret < 0)
                        return ret;

                ret = file->f_op->read_iter(iocb, iter);
        }

        if (ret >= 0)
                fsnotify_access(file);
        return ret;
}
EXPORT_SYMBOL(vfs_iocb_iter_read);

/*
 * vfs_iter_read - synchronous read into @iter at *@ppos
 *
 * Returns -EINVAL without ->read_iter or FMODE_CAN_READ, -EBADF without
 * FMODE_READ, a negative rw_verify_area() result, or the
 * do_iter_readv_writev() result.  An empty iterator skips the
 * verification and the read and returns 0.  fsnotify_access() fires for
 * every non-negative result, including the empty case.
 */
ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
                      rwf_t flags)
{
        ssize_t ret = 0;
        size_t len;

        if (!file->f_op->read_iter)
                return -EINVAL;
        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;

        len = iov_iter_count(iter);
        if (len) {
                ret = rw_verify_area(READ, file, ppos, len);
                if (ret < 0)
                        return ret;

                ret = do_iter_readv_writev(file, iter, ppos, READ, flags);
        }

        if (ret >= 0)
                fsnotify_access(file);
        return ret;
}
EXPORT_SYMBOL(vfs_iter_read);

/*
 * vfs_iocb_iter_write - write @iter using a caller-provided kiocb
 *
 * Caller is responsible for calling kiocb_end_write() on completion
 * if async iocb was queued.
 *
 * Returns -EINVAL without ->write_iter or FMODE_CAN_WRITE, -EBADF without
 * FMODE_WRITE, 0 for an empty iterator, a negative rw_verify_area()
 * result, or the ->write_iter result.
 */
ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
                            struct iov_iter *iter)
{
        ssize_t ret;
        size_t len;

        if (!file->f_op->write_iter)
                return -EINVAL;
        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;

        len = iov_iter_count(iter);
        if (!len)
                return 0;

        ret = rw_verify_area(WRITE, file, &iocb->ki_pos, len);
        if (ret < 0)
                return ret;

        kiocb_start_write(iocb);
        ret = file->f_op->write_iter(iocb, iter);

        /* a queued iocb is ended later by the caller, see above */
        if (ret == -EIOCBQUEUED)
                return ret;

        kiocb_end_write(iocb);
        if (ret > 0)
                fsnotify_modify(file);

        return ret;
}
EXPORT_SYMBOL(vfs_iocb_iter_write);

/*
 * vfs_iter_write - synchronous write of @iter at *@ppos
 *
 * Returns -EBADF without FMODE_WRITE, -EINVAL without FMODE_CAN_WRITE or
 * ->write_iter, 0 for an empty iterator, a negative rw_verify_area()
 * result, or the do_iter_readv_writev() result.  The write itself runs
 * between file_start_write() and file_end_write().
 */
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
                       rwf_t flags)
{
        ssize_t written;
        size_t len;

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;
        if (!file->f_op->write_iter)
                return -EINVAL;

        len = iov_iter_count(iter);
        if (!len)
                return 0;

        written = rw_verify_area(WRITE, file, ppos, len);
        if (written < 0)
                return written;

        file_start_write(file);
        written = do_iter_readv_writev(file, iter, ppos, WRITE, flags);
        if (written > 0)
                fsnotify_modify(file);
        file_end_write(file);

        return written;
}
EXPORT_SYMBOL(vfs_iter_write);

/*
 * vfs_readv - vectored read from a user iovec array
 *
 * Imports @vlen entries from @vec, verifies the area and reads through
 * ->read_iter when the file has it, otherwise segment by segment via
 * do_loop_readv_writev().  The iovec pointer handed back by import_iovec()
 * is passed to kfree() on every path after a successful import.
 * fsnotify_access() fires for every non-negative result.
 *
 * Returns -EBADF without FMODE_READ, -EINVAL without FMODE_CAN_READ, or a
 * negative result from import_iovec()/rw_verify_area()/the read itself.
 */
static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
                         unsigned long vlen, loff_t *pos, rwf_t flags)
{
        struct iovec stack_iov[UIO_FASTIOV];
        struct iovec *iov = stack_iov;
        struct iov_iter to;
        ssize_t ret;
        size_t len;

        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;

        ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(stack_iov), &iov,
                           &to);
        if (ret < 0)
                return ret;

        len = iov_iter_count(&to);
        if (len) {
                ret = rw_verify_area(READ, file, pos, len);
                if (ret >= 0) {
                        if (file->f_op->read_iter)
                                ret = do_iter_readv_writev(file, &to, pos,
                                                           READ, flags);
                        else
                                ret = do_loop_readv_writev(file, &to, pos,
                                                           READ, flags);
                }
        }

        if (ret >= 0)
                fsnotify_access(file);
        kfree(iov);
        return ret;
}

/*
 * vfs_writev - vectored write from a user iovec array
 *
 * Imports @vlen entries from @vec, verifies the area and writes through
 * ->write_iter when the file has it, otherwise segment by segment via
 * do_loop_readv_writev().  The write runs between file_start_write() and
 * file_end_write().  The iovec pointer handed back by import_iovec() is
 * passed to kfree() on every path after a successful import.
 *
 * Returns -EBADF without FMODE_WRITE, -EINVAL without FMODE_CAN_WRITE, or
 * a negative result from import_iovec()/rw_verify_area()/the write itself.
 */
static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
                          unsigned long vlen, loff_t *pos, rwf_t flags)
{
        struct iovec stack_iov[UIO_FASTIOV];
        struct iovec *iov = stack_iov;
        struct iov_iter from;
        ssize_t ret;
        size_t len;

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;

        ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(stack_iov), &iov,
                           &from);
        if (ret < 0)
                return ret;

        len = iov_iter_count(&from);
        if (len) {
                ret = rw_verify_area(WRITE, file, pos, len);
                if (ret >= 0) {
                        file_start_write(file);
                        if (file->f_op->write_iter)
                                ret = do_iter_readv_writev(file, &from, pos,
                                                           WRITE, flags);
                        else
                                ret = do_loop_readv_writev(file, &from, pos,
                                                           WRITE, flags);
                        if (ret > 0)
                                fsnotify_modify(file);
                        file_end_write(file);
                }
        }

        kfree(iov);
        return ret;
}

/*
 * do_readv - vectored read at the file's current position
 *
 * Works on a local copy of the file position (when the file has one) and
 * stores it back to f_pos only when vfs_readv() did not fail.  The read
 * syscall counter is bumped on every call, even for a bad descriptor; the
 * byte counter only when data was read.
 */
static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
                        unsigned long vlen, rwf_t flags)
{
        struct fd f = fdget_pos(fd);
        ssize_t ret = -EBADF;
        loff_t pos, *ppos;

        if (!f.file)
                goto account;

        ppos = file_ppos(f.file);
        if (ppos) {
                pos = *ppos;
                ppos = &pos;
        }

        ret = vfs_readv(f.file, vec, vlen, ppos, flags);
        if (ppos && ret >= 0)
                f.file->f_pos = pos;
        fdput_pos(f);

        if (ret > 0)
                add_rchar(current, ret);
account:
        inc_syscr(current);
        return ret;
}

/*
 * do_writev - vectored write at the file's current position
 *
 * Works on a local copy of the file position (when the file has one) and
 * stores it back to f_pos only when vfs_writev() did not fail.  The write
 * syscall counter is bumped on every call, even for a bad descriptor; the
 * byte counter only when data was written.
 */
static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
                         unsigned long vlen, rwf_t flags)
{
        struct fd f = fdget_pos(fd);
        ssize_t ret = -EBADF;
        loff_t pos, *ppos;

        if (!f.file)
                goto account;

        ppos = file_ppos(f.file);
        if (ppos) {
                pos = *ppos;
                ppos = &pos;
        }

        ret = vfs_writev(f.file, vec, vlen, ppos, flags);
        if (ppos && ret >= 0)
                f.file->f_pos = pos;
        fdput_pos(f);

        if (ret > 0)
                add_wchar(current, ret);
account:
        inc_syscw(current);
        return ret;
}

static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
{
#define HALF_LONG_BITS (BITS_PER_LONG / 2)
        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}

/*
 * do_preadv - vectored read at an explicit position
 *
 * Returns -EINVAL for a negative @pos (before any accounting), -EBADF when
 * @fd does not resolve to a file, -ESPIPE when the file lacks FMODE_PREAD,
 * otherwise the vfs_readv() result.  Once @pos is accepted the read
 * syscall counter is bumped regardless of the outcome; the byte counter
 * only when data was read.
 */
static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
                         unsigned long vlen, loff_t pos, rwf_t flags)
{
        ssize_t ret = -EBADF;
        struct fd f;

        if (pos < 0)
                return -EINVAL;

        f = fdget(fd);
        if (!f.file)
                goto account;

        if (f.file->f_mode & FMODE_PREAD)
                ret = vfs_readv(f.file, vec, vlen, &pos, flags);
        else
                ret = -ESPIPE;
        fdput(f);

        if (ret > 0)
                add_rchar(current, ret);
account:
        inc_syscr(current);
        return ret;
}

/*
 * do_pwritev - vectored write at an explicit position
 *
 * Returns -EINVAL for a negative @pos (before any accounting), -EBADF when
 * @fd does not resolve to a file, -ESPIPE when the file lacks
 * FMODE_PWRITE, otherwise the vfs_writev() result.  Once @pos is accepted
 * the write syscall counter is bumped regardless of the outcome; the byte
 * counter only when data was written.
 */
static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
                          unsigned long vlen, loff_t pos, rwf_t flags)
{
        ssize_t ret = -EBADF;
        struct fd f;

        if (pos < 0)
                return -EINVAL;

        f = fdget(fd);
        if (!f.file)
                goto account;

        if (f.file->f_mode & FMODE_PWRITE)
                ret = vfs_writev(f.file, vec, vlen, &pos, flags);
        else
                ret = -ESPIPE;
        fdput(f);

        if (ret > 0)
                add_wchar(current, ret);
account:
        inc_syscw(current);
        return ret;
}

/* readv system call: do_readv() with no per-call flags. */
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen)
{
        return do_readv(fd, vec, vlen, 0);
}

/* writev system call: do_writev() with no per-call flags. */
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen)
{
        return do_writev(fd, vec, vlen, 0);
}

/*
 * preadv system call: the position arrives as two unsigned longs
 * (@pos_l, @pos_h) that pos_from_hilo() combines; no per-call flags.
 */
SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
        return do_preadv(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}

/*
 * preadv2 system call: like preadv but with per-call @flags.  A combined
 * position of -1 selects do_readv(), i.e. the file's current position,
 * instead of a positional read.
 */
SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
                rwf_t, flags)
{
        const loff_t offset = pos_from_hilo(pos_h, pos_l);

        if (offset != -1)
                return do_preadv(fd, vec, vlen, offset, flags);

        return do_readv(fd, vec, vlen, flags);
}

/*
 * pwritev system call: the position arrives as two unsigned longs
 * (@pos_l, @pos_h) that pos_from_hilo() combines; no per-call flags.
 */
SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
        return do_pwritev(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}

/*
 * pwritev2 system call: like pwritev but with per-call @flags.  A combined
 * position of -1 selects do_writev(), i.e. the file's current position,
 * instead of a positional write.
 */
SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
                rwf_t, flags)
{
        const loff_t offset = pos_from_hilo(pos_h, pos_l);

        if (offset != -1)
                return do_pwritev(fd, vec, vlen, offset, flags);

        return do_writev(fd, vec, vlen, flags);
}

/*
 * Various compat syscalls.  Note that they all pretend to take a native
 * iovec - import_iovec will properly treat those as compat_iovecs based on
 * in_compat_syscall().
 */
#ifdef CONFIG_COMPAT
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/*
 * Compat preadv64 system call: receives the position as a single loff_t
 * and calls do_preadv() with no per-call flags.
 */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
                const struct iovec __user *, vec,
                unsigned long, vlen, loff_t, pos)
{
        return do_preadv(fd, vec, vlen, pos, 0);
}
#endif

/*
 * Compat preadv system call: builds the 64-bit position from two u32
 * halves and calls do_preadv() with no per-call flags.
 */
COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
                const struct iovec __user *, vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
        const loff_t offset = ((loff_t)pos_high << 32) | pos_low;

        return do_preadv(fd, vec, vlen, offset, 0);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
/*
 * Compat preadv64v2 system call: position as a single loff_t plus per-call
 * @flags.  A position of -1 selects do_readv() instead of do_preadv().
 */
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
                const struct iovec __user *, vec,
                unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
        if (pos != -1)
                return do_preadv(fd, vec, vlen, pos, flags);

        return do_readv(fd, vec, vlen, flags);
}
#endif

/*
 * Compat preadv2 system call: builds the 64-bit position from two u32
 * halves.  A combined position of -1 selects do_readv() instead of
 * do_preadv().
 */
COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
                const struct iovec __user *, vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
                rwf_t, flags)
{
        const loff_t offset = ((loff_t)pos_high << 32) | pos_low;

        if (offset != -1)
                return do_preadv(fd, vec, vlen, offset, flags);

        return do_readv(fd, vec, vlen, flags);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/*
 * Compat pwritev64 system call: receives the position as a single loff_t
 * and calls do_pwritev() with no per-call flags.
 */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
                const struct iovec __user *, vec,
                unsigned long, vlen, loff_t, pos)
{
        return do_pwritev(fd, vec, vlen, pos, 0);
}
#endif

/*
 * Compat pwritev system call: builds the 64-bit position from two u32
 * halves and calls do_pwritev() with no per-call flags.
 */
COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
                const struct iovec __user *,vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
        const loff_t offset = ((loff_t)pos_high << 32) | pos_low;

        return do_pwritev(fd, vec, vlen, offset, 0);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
/*
 * Compat pwritev64v2 system call: position as a single loff_t plus
 * per-call @flags.  A position of -1 selects do_writev() instead of
 * do_pwritev().
 */
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
                const struct iovec __user *, vec,
                unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
        if (pos != -1)
                return do_pwritev(fd, vec, vlen, pos, flags);

        return do_writev(fd, vec, vlen, flags);
}
#endif

/*
 * Compat pwritev2 system call: builds the 64-bit position from two u32
 * halves.  A combined position of -1 selects do_writev() instead of
 * do_pwritev().
 */
COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
                const struct iovec __user *,vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
{
        const loff_t offset = ((loff_t)pos_high << 32) | pos_low;

        if (offset != -1)
                return do_pwritev(fd, vec, vlen, offset, flags);

        return do_writev(fd, vec, vlen, flags);
}
#endif /* CONFIG_COMPAT */

/*
 * do_sendfile - common implementation behind the sendfile system calls
 * @out_fd: descriptor to write to; its file must have FMODE_WRITE
 * @in_fd:  descriptor to read from; its file must have FMODE_READ
 * @ppos:   if non-NULL, explicit input offset (the input file must then
 *          have FMODE_PREAD); if NULL, the input file's f_pos is used.
 *          Whichever is in use is updated when data was transferred.
 * @count:  bytes requested; clamped to MAX_RW_COUNT and further to what
 *          fits below @max
 * @max:    upper bound for the input offset; 0 selects the smaller
 *          s_maxbytes of the two files' superblocks
 *
 * Data goes through splice_file_to_pipe() when get_pipe_info() yields a
 * pipe for the output file, otherwise through do_splice_direct() after a
 * WRITE rw_verify_area() check on the output.
 *
 * Returns the splice helper's result, or -EBADF, -ESPIPE, -EOVERFLOW, or
 * a negative rw_verify_area() result.
 */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
                           size_t count, loff_t max)
{
        struct fd in, out;
        struct inode *in_inode, *out_inode;
        struct pipe_inode_info *opipe;
        loff_t pos;
        loff_t out_pos;
        ssize_t retval;
        int fl;

        /*
         * Get input file, and verify that it is ok..
         */
        retval = -EBADF;
        in = fdget(in_fd);
        if (!in.file)
                goto out;
        if (!(in.file->f_mode & FMODE_READ))
                goto fput_in;
        /* retval is pre-seeded so the FMODE_PREAD failure below yields -ESPIPE */
        retval = -ESPIPE;
        if (!ppos) {
                pos = in.file->f_pos;
        } else {
                pos = *ppos;
                if (!(in.file->f_mode & FMODE_PREAD))
                        goto fput_in;
        }
        retval = rw_verify_area(READ, in.file, &pos, count);
        if (retval < 0)
                goto fput_in;
        if (count > MAX_RW_COUNT)
                count =  MAX_RW_COUNT;

        /*
         * Get output file, and verify that it is ok..
         */
        retval = -EBADF;
        out = fdget(out_fd);
        if (!out.file)
                goto fput_in;
        if (!(out.file->f_mode & FMODE_WRITE))
                goto fput_out;
        in_inode = file_inode(in.file);
        out_inode = file_inode(out.file);
        out_pos = out.file->f_pos;

        if (!max)
                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);

        /* fail if already at/after max, otherwise shorten the transfer to fit */
        if (unlikely(pos + count > max)) {
                retval = -EOVERFLOW;
                if (pos >= max)
                        goto fput_out;
                count = max - pos;
        }

        fl = 0;
#if 0
        /*
         * We need to debate whether we can enable this or not. The
         * man page documents EAGAIN return for the output at least,
         * and the application is arguably buggy if it doesn't expect
         * EAGAIN on a non-blocking file descriptor.
         */
        if (in.file->f_flags & O_NONBLOCK)
                fl = SPLICE_F_NONBLOCK;
#endif
        opipe = get_pipe_info(out.file, true);
        if (!opipe) {
                retval = rw_verify_area(WRITE, out.file, &out_pos, count);
                if (retval < 0)
                        goto fput_out;
                retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
                                          count, fl);
        } else {
                if (out.file->f_flags & O_NONBLOCK)
                        fl |= SPLICE_F_NONBLOCK;

                retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
        }

        /* positions and accounting are only updated when data actually moved */
        if (retval > 0) {
                add_rchar(current, retval);
                add_wchar(current, retval);
                fsnotify_access(in.file);
                fsnotify_modify(out.file);
                out.file->f_pos = out_pos;
                if (ppos)
                        *ppos = pos;
                else
                        in.file->f_pos = pos;
        }

        inc_syscr(current);
        inc_syscw(current);
        /* an input position beyond max overrides the result with -EOVERFLOW */
        if (pos > max)
                retval = -EOVERFLOW;

fput_out:
        fdput(out);
fput_in:
        fdput(in);
out:
        return retval;
}

/*
 * sendfile system call with an off_t offset.
 *
 * Without @offset the input file's own position is used and no limit is
 * passed (max == 0).  With @offset the value is read from user space, the
 * transfer is bounded by MAX_NON_LFS, and the resulting position is
 * written back to @offset; a failure of either user access yields -EFAULT.
 */
SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
{
        loff_t pos;
        off_t off;
        ssize_t ret;

        if (!offset)
                return do_sendfile(out_fd, in_fd, NULL, count, 0);

        if (unlikely(get_user(off, offset)))
                return -EFAULT;

        pos = off;
        ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);

        /* the updated position is reported back even if the transfer failed */
        if (unlikely(put_user(pos, offset)))
                return -EFAULT;

        return ret;
}

/*
 * sendfile64 system call with a loff_t offset.
 *
 * Without @offset the input file's own position is used.  With @offset the
 * value is copied in from user space and the resulting position is written
 * back to @offset; a failure of either user access yields -EFAULT.  No
 * explicit limit is passed to do_sendfile() (max == 0) in either case.
 */
SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
{
        loff_t pos;
        ssize_t ret;

        if (!offset)
                return do_sendfile(out_fd, in_fd, NULL, count, 0);

        if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
                return -EFAULT;

        ret = do_sendfile(out_fd, in_fd, &pos, count, 0);

        /* the updated position is reported back even if the transfer failed */
        if (unlikely(put_user(pos, offset)))
                return -EFAULT;

        return ret;
}

#ifdef CONFIG_COMPAT
/*
 * Compat sendfile system call with a compat_off_t offset.
 *
 * Without @offset the input file's own position is used and no limit is
 * passed (max == 0).  With @offset the value is read from user space, the
 * transfer is bounded by MAX_NON_LFS, and the resulting position is
 * written back to @offset; a failure of either user access yields -EFAULT.
 */
COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
                compat_off_t __user *, offset, compat_size_t, count)
{
        loff_t pos;
        off_t off;
        ssize_t ret;

        if (!offset)
                return do_sendfile(out_fd, in_fd, NULL, count, 0);

        if (unlikely(get_user(off, offset)))
                return -EFAULT;

        pos = off;
        ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);

        /* the updated position is reported back even if the transfer failed */
        if (unlikely(put_user(pos, offset)))
                return -EFAULT;

        return ret;
}

/*
 * Compat sendfile64 system call with a compat_loff_t offset.
 *
 * Without @offset the input file's own position is used.  With @offset the
 * value is copied in from user space and the resulting position is written
 * back to @offset; a failure of either user access yields -EFAULT.  No
 * explicit limit is passed to do_sendfile() (max == 0) in either case.
 */
COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
                compat_loff_t __user *, offset, compat_size_t, count)
{
        loff_t pos;
        ssize_t ret;

        if (!offset)
                return do_sendfile(out_fd, in_fd, NULL, count, 0);

        if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
                return -EFAULT;

        ret = do_sendfile(out_fd, in_fd, &pos, count, 0);

        /* the updated position is reported back even if the transfer failed */
        if (unlikely(put_user(pos, offset)))
                return -EFAULT;

        return ret;
}
#endif

/*
 * Validate a file-to-file copy request before any data is moved.
 *
 * @req_count is in/out: on success it holds the (possibly reduced) number
 * of bytes that may be copied.
 *
 * Returns 0 when the copy may proceed, otherwise the negative errno that
 * the caller should hand back.
 */
static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
                                    struct file *file_out, loff_t pos_out,
                                    size_t *req_count, unsigned int flags)
{
        struct inode *inode_in = file_inode(file_in);
        struct inode *inode_out = file_inode(file_out);
        uint64_t nbytes = *req_count;
        loff_t isize;
        int err;

        err = generic_file_rw_checks(file_in, file_out);
        if (err)
                return err;

        /*
         * Some filesystems are allowed to handle cross-sb copies, but
         * handing a file of the wrong filesystem type to a filesystem
         * driver can make it dereference the wrong kind of ->private_data,
         * so that is avoided until there is a good reason to allow it.
         *
         * nfs and cifs define several file_system_type structures and
         * several sets of file_operations, but they all share one
         * ->copy_file_range() function pointer, hence the pointer compare.
         *
         * A COPY_FILE_SPLICE request skips this test entirely: cross-sb
         * splice is allowed.
         */
        if (!(flags & COPY_FILE_SPLICE)) {
                if (file_out->f_op->copy_file_range) {
                        if (file_in->f_op->copy_file_range !=
                            file_out->f_op->copy_file_range)
                                return -EXDEV;
                } else if (inode_in->i_sb != inode_out->i_sb) {
                        return -EXDEV;
                }
        }

        /* Certain kinds of inodes must not be touched. */
        if (IS_IMMUTABLE(inode_out))
                return -EPERM;

        if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
                return -ETXTBSY;

        /* Reject ranges whose end offset would wrap. */
        if (pos_in + nbytes < pos_in || pos_out + nbytes < pos_out)
                return -EOVERFLOW;

        /* Clamp the copy so it stops at the source's EOF. */
        isize = i_size_read(inode_in);
        nbytes = (pos_in >= isize) ? 0 : min(nbytes, isize - (uint64_t)pos_in);

        err = generic_write_check_limits(file_out, pos_out, &nbytes);
        if (err)
                return err;

        /* Overlapping source and destination in one file is not allowed. */
        if (inode_in == inode_out) {
                if (pos_out + nbytes > pos_in && pos_out < pos_in + nbytes)
                        return -EINVAL;
        }

        *req_count = nbytes;
        return 0;
}

/*
 * copy_file_range() differs from regular file read and write in that it
 * specifically allows return partial success.  When it does so is up to
 * the copy_file_range method.
 *
 * @flags may only contain COPY_FILE_SPLICE; anything else yields -EINVAL.
 * @len is first trimmed by generic_copy_file_checks() and the range is then
 * verified with rw_verify_area() for both files; a resulting length of zero
 * returns 0 without copying.
 *
 * Returns the value produced by the chosen copy method (->copy_file_range(),
 * ->remap_file_range() or do_splice_direct()), or a negative errno from the
 * checks above.
 */
ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
                            struct file *file_out, loff_t pos_out,
                            size_t len, unsigned int flags)
{
        ssize_t ret;
        bool splice = flags & COPY_FILE_SPLICE;
        bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb;

        if (flags & ~COPY_FILE_SPLICE)
                return -EINVAL;

        /* May shorten len (EOF, write limits) or reject the request. */
        ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
                                       flags);
        if (unlikely(ret))
                return ret;

        ret = rw_verify_area(READ, file_in, &pos_in, len);
        if (unlikely(ret))
                return ret;

        ret = rw_verify_area(WRITE, file_out, &pos_out, len);
        if (unlikely(ret))
                return ret;

        if (len == 0)
                return 0;

        /* Paired with file_end_write() below; the splice path runs outside it. */
        file_start_write(file_out);

        /*
         * Cloning is supported by more file systems, so we implement copy on
         * same sb using clone, but for filesystems where both clone and copy
         * are supported (e.g. nfs,cifs), we only call the copy method.
         */
        if (!splice && file_out->f_op->copy_file_range) {
                ret = file_out->f_op->copy_file_range(file_in, pos_in,
                                                      file_out, pos_out,
                                                      len, flags);
        } else if (!splice && file_in->f_op->remap_file_range && samesb) {
                ret = file_in->f_op->remap_file_range(file_in, pos_in,
                                file_out, pos_out,
                                min_t(loff_t, MAX_RW_COUNT, len),
                                REMAP_FILE_CAN_SHORTEN);
                /* fallback to splice */
                if (ret <= 0)
                        splice = true;
        } else if (samesb) {
                /* Fallback to splice for same sb copy for backward compat */
                splice = true;
        }

        file_end_write(file_out);

        /* A copy or clone method already produced ret: skip the splice. */
        if (!splice)
                goto done;

        /*
         * We can get here for same sb copy of filesystems that do not implement
         * ->copy_file_range() in case filesystem does not support clone or in
         * case filesystem supports clone but rejected the clone request (e.g.
         * because it was not block aligned).
         *
         * In both cases, fall back to kernel copy so we are able to maintain a
         * consistent story about which filesystems support copy_file_range()
         * and which filesystems do not, that will allow userspace tools to
         * make consistent decisions w.r.t using copy_file_range().
         *
         * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE
         * for server-side-copy between any two sb.
         *
         * In any case, we call do_splice_direct() and not splice_file_range(),
         * without file_start_write() held, to avoid possible deadlocks related
         * to splicing from input file, while file_start_write() is held on
         * the output file on a different sb.
         */
        ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
                               min_t(size_t, len, MAX_RW_COUNT), 0);
done:
        /* Notifications and byte accounting only when bytes were copied. */
        if (ret > 0) {
                fsnotify_access(file_in);
                add_rchar(current, ret);
                fsnotify_modify(file_out);
                add_wchar(current, ret);
        }

        /* The syscall counters are bumped whatever the outcome. */
        inc_syscr(current);
        inc_syscw(current);

        return ret;
}
EXPORT_SYMBOL(vfs_copy_file_range);

/*
 * copy_file_range(2) entry point.
 *
 * Each of @off_in / @off_out is optional: when given, the position is read
 * from (and, after a successful copy, written back to) user memory; when
 * NULL, the file's own f_pos is used and advanced instead.  Any non-zero
 * @flags is rejected with -EINVAL.
 */
SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
                int, fd_out, loff_t __user *, off_out,
                size_t, len, unsigned int, flags)
{
        struct fd f_in;
        struct fd f_out;
        loff_t pos_in;
        loff_t pos_out;
        ssize_t ret;

        f_in = fdget(fd_in);
        if (!f_in.file)
                return -EBADF;

        f_out = fdget(fd_out);
        if (!f_out.file) {
                ret = -EBADF;
                goto put_in;
        }

        /* Starting positions: user-supplied if present, else the file's. */
        if (!off_in) {
                pos_in = f_in.file->f_pos;
        } else if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) {
                ret = -EFAULT;
                goto put_out;
        }

        if (!off_out) {
                pos_out = f_out.file->f_pos;
        } else if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) {
                ret = -EFAULT;
                goto put_out;
        }

        if (flags != 0) {
                ret = -EINVAL;
                goto put_out;
        }

        ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
                                  flags);
        if (ret <= 0)
                goto put_out;

        /* Bytes were copied: advance both positions and publish them. */
        pos_in += ret;
        pos_out += ret;

        if (!off_in)
                f_in.file->f_pos = pos_in;
        else if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
                ret = -EFAULT;

        if (!off_out)
                f_out.file->f_pos = pos_out;
        else if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
                ret = -EFAULT;

put_out:
        fdput(f_out);
put_in:
        fdput(f_in);
        return ret;
}

/*
 * Keep a write inside the RLIMIT_FSIZE limit and inside the maximum file
 * size (s_maxbytes, or MAX_NON_LFS for files opened without O_LARGEFILE).
 *
 * If @pos is below a limit, *@count is shortened so the write ends at that
 * limit.  If @pos is at or beyond a limit, -EFBIG is returned; for the
 * rlimit case SIGXFSZ is sent to the current task first.
 */
int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
{
        struct inode *host = file->f_mapping->host;
        loff_t ceiling = host->i_sb->s_maxbytes;
        loff_t fsize_limit = rlimit(RLIMIT_FSIZE);

        /* Without O_LARGEFILE the non-LFS maximum applies instead. */
        if (!(file->f_flags & O_LARGEFILE))
                ceiling = MAX_NON_LFS;

        if (fsize_limit != RLIM_INFINITY) {
                if (pos >= fsize_limit) {
                        send_sig(SIGXFSZ, current, 0);
                        return -EFBIG;
                }
                *count = min(*count, fsize_limit - pos);
        }

        if (unlikely(pos >= ceiling))
                return -EFBIG;

        *count = min(*count, ceiling - pos);

        return 0;
}
EXPORT_SYMBOL_GPL(generic_write_check_limits);

/*
 * Variant of generic_write_checks() that works on a byte count rather than
 * an iov_iter.  May move iocb->ki_pos (IOCB_APPEND) and shorten *@count.
 * Returns 0 or a negative errno.
 */
int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
{
        struct file *filp = iocb->ki_filp;
        struct inode *host = filp->f_mapping->host;

        if (IS_SWAPFILE(host))
                return -ETXTBSY;

        /* Zero-length writes need no further checking. */
        if (*count == 0)
                return 0;

        /* Appending writes start at the current end of file. */
        if (iocb->ki_flags & IOCB_APPEND)
                iocb->ki_pos = i_size_read(host);

        /*
         * IOCB_NOWAIT is only accepted together with IOCB_DIRECT, or on
         * files whose f_op sets FOP_BUFFER_WASYNC.
         */
        if (iocb->ki_flags & IOCB_NOWAIT) {
                bool nowait_ok = (iocb->ki_flags & IOCB_DIRECT) ||
                                 (filp->f_op->fop_flags & FOP_BUFFER_WASYNC);

                if (!nowait_ok)
                        return -EINVAL;
        }

        return generic_write_check_limits(filp, iocb->ki_pos, count);
}
EXPORT_SYMBOL(generic_write_checks_count);

/*
 * Run the pre-write checks for @iocb against the data described by @from.
 *
 * The iterator is truncated to the byte count the checks allow.  Returns
 * the remaining iterator length, or the negative errno reported by
 * generic_write_checks_count().
 */
ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
        loff_t nbytes = iov_iter_count(from);
        int err = generic_write_checks_count(iocb, &nbytes);

        if (err)
                return err;

        iov_iter_truncate(from, nbytes);
        return iov_iter_count(from);
}
EXPORT_SYMBOL(generic_write_checks);

/*
 * Performs common checks before doing a file copy/clone
 * from @file_in to @file_out.
 */
int generic_file_rw_checks(struct file *file_in, struct file *file_out)
{
        struct inode *inode_in = file_inode(file_in);
        struct inode *inode_out = file_inode(file_out);

        /* Don't copy dirs, pipes, sockets... */
        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
                return -EISDIR;
        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
                return -EINVAL;

        if (!(file_in->f_mode & FMODE_READ) ||
            !(file_out->f_mode & FMODE_WRITE) ||
            (file_out->f_flags & O_APPEND))
                return -EBADF;

        return 0;
}
































































































    1 


    1 
    1 






















































































































































































    5 



    4 





    4 


    3 





    4 







    3 





    3 

















































    3 










    4 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
// SPDX-License-Identifier: GPL-2.0
/*
 *  mm/pgtable-generic.c
 *
 *  Generic pgtable methods declared in linux/pgtable.h
 *
 *  Copyright (C) 2010  Linus Torvalds
 */

#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mm_inline.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none.  Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */

/* Report the bad pgd entry through pgd_ERROR(), then clear it. */
void pgd_clear_bad(pgd_t *pgd)
{
        /* Report first: the entry's value is gone once it is cleared. */
        pgd_ERROR(*pgd);
        pgd_clear(pgd);
}

#ifndef __PAGETABLE_P4D_FOLDED
/* Report the bad p4d entry through p4d_ERROR(), then clear it. */
void p4d_clear_bad(p4d_t *p4d)
{
        /* Report first: the entry's value is gone once it is cleared. */
        p4d_ERROR(*p4d);
        p4d_clear(p4d);
}
#endif

#ifndef __PAGETABLE_PUD_FOLDED
/* Report the bad pud entry through pud_ERROR(), then clear it. */
void pud_clear_bad(pud_t *pud)
{
        /* Report first: the entry's value is gone once it is cleared. */
        pud_ERROR(*pud);
        pud_clear(pud);
}
#endif

/*
 * Note that the pmd variant below can't be stub'ed out just as for p4d/pud
 * above. pmd folding is special and typically pmd_* macros refer to upper
 * level even when folded
 */
/* Report the bad pmd entry through pmd_ERROR(), then clear it. */
void pmd_clear_bad(pmd_t *pmd)
{
        /* Report first: the entry's value is gone once it is cleared. */
        pmd_ERROR(*pmd);
        pmd_clear(pmd);
}

#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
 * Only sets the access flags (dirty, accessed), as well as write
 * permission. Furthermore, we know it always gets set to a "more
 * permissive" setting, which allows most architectures to optimize
 * this. The return value says whether the PTE actually changed, which
 * in turn tells the caller to do things like update_mmu_cache.  This
 * used to be done in the caller, but sparc needs minor faults to
 * force that call on sun4c so this macro was changed slightly.
 *
 * Returns 1 if the PTE was rewritten, 0 if it already equalled @entry.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pte_t *ptep,
                          pte_t entry, int dirty)
{
        /* Nothing to do when the installed PTE already matches. */
        if (pte_same(ptep_get(ptep), entry))
                return 0;

        set_pte_at(vma->vm_mm, address, ptep, entry);
        flush_tlb_fix_spurious_fault(vma, address, ptep);
        return 1;
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
/*
 * Test-and-clear the young bit of the PTE at @address and flush that
 * page's TLB entry if the bit had been set.  Returns the value reported by
 * ptep_test_and_clear_young().
 */
int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep)
{
        int was_young = ptep_test_and_clear_young(vma, address, ptep);

        /* No flush is issued when the bit was not set. */
        if (!was_young)
                return was_young;

        flush_tlb_page(vma, address);
        return was_young;
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
/*
 * Clear the PTE at @address and return its previous value; the page's TLB
 * entry is flushed when pte_accessible() says the old PTE was accessible.
 */
pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
                       pte_t *ptep)
{
        struct mm_struct *mm = vma->vm_mm;
        pte_t old = ptep_get_and_clear(mm, address, ptep);

        if (pte_accessible(mm, old))
                flush_tlb_page(vma, address);

        return old;
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
/*
 * PMD counterpart of ptep_set_access_flags(): install @entry at @pmdp if it
 * differs from the current value and flush the huge-page TLB range.
 * @address must be HPAGE_PMD aligned.  Returns 1 if the PMD was rewritten,
 * 0 otherwise.
 */
int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        int unchanged = pmd_same(*pmdp, entry);

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        if (unchanged)
                return 0;

        set_pmd_at(vma->vm_mm, address, pmdp, entry);
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return 1;
}
#endif

#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
/*
 * Test-and-clear the young bit of the PMD at @address (HPAGE_PMD aligned)
 * and flush the huge-page TLB range if the bit had been set.  Returns the
 * value reported by pmdp_test_and_clear_young().
 */
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        int was_young;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        was_young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (!was_young)
                return was_young;

        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return was_young;
}
#endif

#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
/*
 * Clear the PMD at @address (HPAGE_PMD aligned), flush the huge-page TLB
 * range unconditionally and return the previous PMD value.  A present PMD
 * must be either trans-huge or devmap.
 */
pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
                            pmd_t *pmdp)
{
        pmd_t old;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(pmd_present(*pmdp) &&
                  !(pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)));

        old = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return old;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * Clear the PUD at @address (HPAGE_PUD aligned), flush the matching TLB
 * range and return the previous PUD value.  The PUD must be trans-huge or
 * devmap.
 */
pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
                            pud_t *pudp)
{
        pud_t old;

        VM_BUG_ON(address & ~HPAGE_PUD_MASK);
        VM_BUG_ON(!(pud_trans_huge(*pudp) || pud_devmap(*pudp)));

        old = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
        flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);

        return old;
}
#endif
#endif

#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
/*
 * Stash @pgtable in the pmd_huge_pte() slot for @pmdp.  The caller must
 * hold the pmd lock.  Previously deposited tables stay chained to the new
 * one through their lru list heads.
 */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
{
        pgtable_t deposited;

        assert_spin_locked(pmd_lockptr(mm, pmdp));

        deposited = pmd_huge_pte(mm, pmdp);
        if (deposited)
                /* Link the new table in right after the current one. */
                list_add(&pgtable->lru, &deposited->lru);
        else
                /* First deposit: start a fresh, empty list. */
                INIT_LIST_HEAD(&pgtable->lru);

        pmd_huge_pte(mm, pmdp) = pgtable;
}
#endif

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
/*
 * Take back the page table currently held in the pmd_huge_pte() slot for
 * @pmdp; the next table on its lru list (if any) becomes the new occupant.
 * The caller must hold the pmd lock.
 *
 * There is no "address" argument, so page coloring is lost on some archs.
 */
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
        pgtable_t taken;
        pgtable_t next;

        assert_spin_locked(pmd_lockptr(mm, pmdp));

        taken = pmd_huge_pte(mm, pmdp);
        next = list_first_entry_or_null(&taken->lru, struct page, lru);
        pmd_huge_pte(mm, pmdp) = next;

        /* Only unlink when something else was chained behind it. */
        if (next)
                list_del(&taken->lru);

        return taken;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE
/*
 * Replace the PMD at @pmdp with its pmd_mkinvalid() form, flush the
 * huge-page TLB range and return the previous PMD value.  Warns once (with
 * VM debugging) if the PMD is not present.
 */
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                     pmd_t *pmdp)
{
        pmd_t old;

        VM_WARN_ON_ONCE(!pmd_present(*pmdp));

        old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return old;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD
/*
 * Generic fallback: after the same present-PMD sanity warning, simply
 * defers to pmdp_invalidate() and returns what it returns.
 */
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
                         pmd_t *pmdp)
{
        pmd_t old;

        VM_WARN_ON_ONCE(!pmd_present(*pmdp));

        old = pmdp_invalidate(vma, address, pmdp);
        return old;
}
#endif

#ifndef pmdp_collapse_flush
/*
 * Clear a non-huge PMD at @address (HPAGE_PMD aligned), flush the TLB for
 * the range it covered and return the old PMD value.
 */
pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp)
{
        pmd_t old;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(pmd_trans_huge(*pmdp));

        /*
         * The pmd and hugepage pte formats are the same, so the huge
         * get-and-clear helper can be used here as well.
         */
        old = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);

        /* Collapse shoots down ptes, not a pmd: plain range flush. */
        flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return old;
}
#endif

/* arch define pte_free_defer in asm/pgalloc.h for its own implementation */
#ifndef pte_free_defer
/* RCU callback: recover the page from its rcu_head and pte_free() it. */
static void pte_free_now(struct rcu_head *head)
{
        struct page *page = container_of(head, struct page, rcu_head);

        pte_free(NULL /* mm not passed and not used */, (pgtable_t)page);
}

/*
 * Queue @pgtable for freeing via call_rcu(); pte_free_now() does the actual
 * pte_free() from the RCU callback.  @mm is not used here.
 */
void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
        struct page *page = pgtable;

        call_rcu(&page->rcu_head, pte_free_now);
}
#endif /* pte_free_defer */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \
        (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU))
/*
 * See the comment above ptep_get_lockless() in include/linux/pgtable.h:
 * the barriers in pmdp_get_lockless() cannot guarantee that the value in
 * pmd_high actually belongs with the value in pmd_low; but holding interrupts
 * off blocks the TLB flush between present updates, which guarantees that a
 * successful __pte_offset_map() points to a page from matched halves.
 */
/* Disable local interrupts; returns the saved flags for the _end() call. */
static unsigned long pmdp_get_lockless_start(void)
{
        unsigned long irqflags;

        local_irq_save(irqflags);
        return irqflags;
}
/* Restore the interrupt state saved by pmdp_get_lockless_start(). */
static void pmdp_get_lockless_end(unsigned long irqflags)
{
        local_irq_restore(irqflags);
}
#else
/* Other configurations: both helpers are no-ops and the flags value is 0. */
static unsigned long pmdp_get_lockless_start(void) { return 0; }
static void pmdp_get_lockless_end(unsigned long irqflags) { }
#endif

/*
 * Map the pte for @addr under @pmd without taking the page table lock.
 *
 * The pmd value that was read is stored in *@pmdvalp when @pmdvalp is
 * non-NULL, on success and on failure alike.  On success the pte pointer is
 * returned with rcu_read_lock() still held; on failure NULL is returned and
 * the RCU read lock has been dropped.
 */
pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
{
        unsigned long irqflags;
        pmd_t pmdval;

        rcu_read_lock();
        /* Bracket the lockless pmd read with the start/end helpers. */
        irqflags = pmdp_get_lockless_start();
        pmdval = pmdp_get_lockless(pmd);
        pmdp_get_lockless_end(irqflags);

        if (pmdvalp)
                *pmdvalp = pmdval;
        /* Nothing to map: empty pmd or a migration entry. */
        if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
                goto nomap;
        /* Huge or devmap pmd: there is no pte table underneath. */
        if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval)))
                goto nomap;
        /* Bad pmd: report and clear it, then fail like the cases above. */
        if (unlikely(pmd_bad(pmdval))) {
                pmd_clear_bad(pmd);
                goto nomap;
        }
        /* Map from the snapshot that was read, not by re-reading *pmd. */
        return __pte_map(&pmdval, addr);
nomap:
        rcu_read_unlock();
        return NULL;
}

/*
 * Map the pte for @addr like __pte_offset_map() and, on success, also
 * report in *@ptlp the spinlock that belongs to the pmd value that was
 * actually read.  The lock is not taken.  On failure NULL is returned and
 * *@ptlp is left untouched.
 */
pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
                             unsigned long addr, spinlock_t **ptlp)
{
        pmd_t snapshot;
        pte_t *pte = __pte_offset_map(pmd, addr, &snapshot);

        if (unlikely(!pte))
                return NULL;

        *ptlp = pte_lockptr(mm, &snapshot);
        return pte;
}

/*
 * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation
 * __pte_offset_map_lock() below, is usually called with the pmd pointer for
 * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while
 * holding mmap_lock or vma lock for read or for write; or in truncate or rmap
 * context, while holding file's i_mmap_lock or anon_vma lock for read (or for
 * write). In a few cases, it may be used with pmd pointing to a pmd_t already
 * copied to or constructed on the stack.
 *
 * When successful, it returns the pte pointer for addr, with its page table
 * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent
 * modification by software, with a pointer to that spinlock in ptlp (in some
 * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's
 * struct page).  pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards.
 *
 * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no
 * page table at *pmd: if, for example, the page table has just been removed,
 * or replaced by the huge pmd of a THP.  (When successful, *pmd is rechecked
 * after acquiring the ptlock, and retried internally if it changed: so that a
 * page table can be safely removed or replaced by THP while holding its lock.)
 *
 * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above,
 * just returns the pte pointer for addr, its page table kmapped if necessary;
 * or NULL if there is no page table at *pmd.  It does not attempt to lock the
 * page table, so cannot normally be used when the page table is to be updated,
 * or when entries read must be stable.  But it does take rcu_read_lock(): so
 * that even when page table is racily removed, it remains a valid though empty
 * and disconnected table.  Until pte_unmap(pte) unmaps and rcu_read_unlock()s
 * afterwards.
 *
 * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
 * but when successful, it also outputs a pointer to the spinlock in ptlp - as
 * pte_offset_map_lock() does, but in this case without locking it.  This helps
 * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
 * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock
 * pointer for the page table that it returns.  In principle, the caller should
 * recheck *pmd once the lock is taken; in practice, no callsite needs that -
 * either the mmap_lock for write, or pte_same() check on contents, is enough.
 *
 * Note that free_pgtables(), used after unmapping detached vmas, or when
 * exiting the whole mm, does not take page table lock before freeing a page
 * table, and may not use RCU at all: "outsiders" like khugepaged should avoid
 * pte_offset_map() and co once the vma is detached from mm or mm_users is zero.
 */
pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
                             unsigned long addr, spinlock_t **ptlp)
{
        for (;;) {
                pmd_t snapshot;
                spinlock_t *ptl;
                pte_t *pte;

                pte = __pte_offset_map(pmd, addr, &snapshot);
                if (unlikely(!pte))
                        return pte;     /* no page table: *ptlp untouched */

                ptl = pte_lockptr(mm, &snapshot);
                spin_lock(ptl);

                /* Recheck under the lock that *pmd is still what was mapped. */
                if (likely(pmd_same(snapshot, pmdp_get_lockless(pmd)))) {
                        *ptlp = ptl;
                        return pte;
                }

                /* *pmd changed in between: drop everything and try again. */
                pte_unmap_unlock(pte, ptl);
        }
}












































































































































































































































































    1 

















    1 

    1 
















    1 


























































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/balloc.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/time.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "mballoc.h"

#include <trace/events/ext4.h>
#include <kunit/static_stub.h>

static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
                                            ext4_group_t block_group);
/*
 * balloc.c contains the blocks allocation and deallocation routines
 */

/*
 * Return the number of the block group that contains @block.
 *
 * With the STD_GROUP_SIZE mount option set the group is obtained with a
 * shift; otherwise ext4_get_group_no_and_offset() does the division.
 */
ext4_group_t ext4_get_group_number(struct super_block *sb,
                                   ext4_fsblk_t block)
{
        ext4_group_t group;

        if (!test_opt2(sb, STD_GROUP_SIZE)) {
                ext4_get_group_no_and_offset(sb, block, &group, NULL);
                return group;
        }

        group = (block - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >>
                (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
        return group;
}

/*
 * Split block number @blocknr into its block group number and its offset
 * within that group's block/cluster allocation bitmap.  Either output
 * pointer may be NULL if the caller does not need that value.
 */
void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
                ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
{
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
        ext4_grpblk_t cluster_off;

        /* Make the block number relative to the first data block. */
        blocknr -= le32_to_cpu(es->s_first_data_block);

        /*
         * do_div() leaves the quotient (the group number) in blocknr and
         * returns the remainder, which is then shifted by s_cluster_bits.
         */
        cluster_off = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
                EXT4_SB(sb)->s_cluster_bits;

        if (blockgrpp)
                *blockgrpp = blocknr;
        if (offsetp)
                *offsetp = cluster_off;
}

/*
 * Tell whether 'block' belongs to 'block_group': returns 1 when it does
 * and 0 when it does not.
 */
static inline int ext4_block_in_group(struct super_block *sb,
                                      ext4_fsblk_t block,
                                      ext4_group_t block_group)
{
        return ext4_get_group_number(sb, block) == block_group;
}

/*
 * Return the number of clusters used for file system metadata; this
 * represents the overhead needed by the file system.
 *
 * The count is assembled from three parts:
 *   - the base metadata clusters reported by ext4_num_base_meta_clusters(),
 *   - the clusters of the inode table that fall inside this group,
 *   - the block bitmap and inode bitmap clusters, each counted only when
 *     the bitmap block lies in this group and its cluster has not already
 *     been counted by one of the parts above.
 */
static unsigned ext4_num_overhead_clusters(struct super_block *sb,
                                           ext4_group_t block_group,
                                           struct ext4_group_desc *gdp)
{
        unsigned base_clusters, num_clusters;
        /* -1 marks "no cluster recorded"; see the inode table comment below */
        int block_cluster = -1, inode_cluster;
        int itbl_cluster_start = -1, itbl_cluster_end = -1;
        /* First block of the group and last block of a full-sized group */
        ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
        ext4_fsblk_t end = start + EXT4_BLOCKS_PER_GROUP(sb) - 1;
        ext4_fsblk_t itbl_blk_start, itbl_blk_end;
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* This is the number of clusters used by the superblock,
         * block group descriptors, and reserved block group
         * descriptor blocks */
        base_clusters = ext4_num_base_meta_clusters(sb, block_group);
        num_clusters = base_clusters;

        /*
         * Account and record inode table clusters if any cluster
         * is in the block group, or inode table cluster range is
         * [-1, -1] and won't overlap with block/inode bitmap cluster
         * accounted below.
         */
        itbl_blk_start = ext4_inode_table(sb, gdp);
        itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1;
        if (itbl_blk_start <= end && itbl_blk_end >= start) {
                /* Clamp the inode table range to the blocks of this group */
                itbl_blk_start = max(itbl_blk_start, start);
                itbl_blk_end = min(itbl_blk_end, end);

                /* Convert to cluster indexes relative to the group start */
                itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start);
                itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start);

                num_clusters += itbl_cluster_end - itbl_cluster_start + 1;
                /* check if border cluster is overlapped */
                if (itbl_cluster_start == base_clusters - 1)
                        num_clusters--;
        }

        /*
         * For the allocation bitmaps, we first need to check to see
         * if the block is in the block group.  If it is, then check
         * to see if the cluster is already accounted for in the clusters
         * used for the base metadata cluster and inode tables cluster.
         * Normally all of these blocks are contiguous, so the special
         * case handling shouldn't be necessary except for *very*
         * unusual file system layouts.
         */
        if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
                block_cluster = EXT4_B2C(sbi,
                                         ext4_block_bitmap(sb, gdp) - start);
                /* Count it only if outside both the base and itable ranges */
                if (block_cluster >= base_clusters &&
                    (block_cluster < itbl_cluster_start ||
                    block_cluster > itbl_cluster_end))
                        num_clusters++;
        }

        if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
                inode_cluster = EXT4_B2C(sbi,
                                         ext4_inode_bitmap(sb, gdp) - start);
                /*
                 * Additional check if inode bitmap is in just accounted
                 * block_cluster
                 */
                if (inode_cluster != block_cluster &&
                    inode_cluster >= base_clusters &&
                    (inode_cluster < itbl_cluster_start ||
                    inode_cluster > itbl_cluster_end))
                        num_clusters++;
        }

        return num_clusters;
}

/*
 * Return the number of clusters covered by 'block_group'.  Every group
 * except the last spans EXT4_BLOCKS_PER_GROUP() blocks; the last group
 * spans whatever remains up to the filesystem's block count.
 */
static unsigned int num_clusters_in_group(struct super_block *sb,
                                          ext4_group_t block_group)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned int nr_blocks = EXT4_BLOCKS_PER_GROUP(sb);

        /*
         * mke2fs always initializes the first and last group, but another
         * tool might not have, so the size of the last group is derived
         * from the block count to get the free blocks right.
         */
        if (block_group == ext4_get_groups_count(sb) - 1)
                nr_blocks = ext4_blocks_count(sbi->s_es) -
                        ext4_group_first_block_no(sb, block_group);

        return EXT4_NUM_B2C(sbi, nr_blocks);
}

/*
 * Fill in the contents of a block bitmap that has not been initialized.
 *
 * The buffer is zeroed, then bits are set for the base metadata clusters,
 * for the block bitmap, inode bitmap and inode table blocks that lie in
 * this group, and finally for the padding past the group's last cluster.
 * The caller must hold the buffer lock.
 *
 * Returns 0 on success, -EFSBADCRC when the group descriptor checksum does
 * not verify (the group's bitmaps are then marked corrupted), or
 * -EFSCORRUPTED when the base metadata cluster count does not fit in the
 * buffer.
 */
static int ext4_init_block_bitmap(struct super_block *sb,
                                   struct buffer_head *bh,
                                   ext4_group_t block_group,
                                   struct ext4_group_desc *gdp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t group_first, blk, itbl_end;
        unsigned int nr_base, i;

        ASSERT(buffer_locked(bh));

        if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT |
                                        EXT4_GROUP_INFO_IBITMAP_CORRUPT);
                return -EFSBADCRC;
        }
        memset(bh->b_data, 0, sb->s_blocksize);

        nr_base = ext4_num_base_meta_clusters(sb, block_group);
        if ((nr_base >> 3) >= bh->b_size)
                return -EFSCORRUPTED;

        /* The base metadata occupies the first nr_base clusters */
        i = 0;
        while (i < nr_base) {
                ext4_set_bit(i, bh->b_data);
                i++;
        }

        group_first = ext4_group_first_block_no(sb, block_group);

        /* Block bitmap block, if it lives in this group */
        blk = ext4_block_bitmap(sb, gdp);
        if (ext4_block_in_group(sb, blk, block_group))
                ext4_set_bit(EXT4_B2C(sbi, blk - group_first), bh->b_data);

        /* Inode bitmap block, if it lives in this group */
        blk = ext4_inode_bitmap(sb, gdp);
        if (ext4_block_in_group(sb, blk, block_group))
                ext4_set_bit(EXT4_B2C(sbi, blk - group_first), bh->b_data);

        /* Every inode table block that lives in this group */
        blk = ext4_inode_table(sb, gdp);
        itbl_end = blk + sbi->s_itb_per_group;
        while (blk < itbl_end) {
                if (ext4_block_in_group(sb, blk, block_group))
                        ext4_set_bit(EXT4_B2C(sbi, blk - group_first),
                                     bh->b_data);
                blk++;
        }

        /*
         * The bitmap holds blocksize * 8 bits; when the group has fewer
         * clusters than that, mark the remaining bits as set.
         */
        ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
                             sb->s_blocksize * 8, bh->b_data);
        return 0;
}

/*
 * Return the number of free clusters in a block group whose block bitmap
 * is uninitialized.  The bits cannot be counted in that case, so the
 * result is the group's cluster count minus its metadata overhead.
 */
unsigned ext4_free_clusters_after_init(struct super_block *sb,
                                       ext4_group_t block_group,
                                       struct ext4_group_desc *gdp)
{
        unsigned int total = num_clusters_in_group(sb, block_group);
        unsigned int overhead = ext4_num_overhead_clusters(sb, block_group,
                                                           gdp);

        return total - overhead;
}

/*
 * The free blocks are managed by bitmaps.  A file system contains several
 * block groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
 * block for inodes, N blocks for the inode table and data blocks.
 *
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free blocks count in the group.  The descriptors are loaded in memory
 * when a file system is mounted (see ext4_fill_super).
 */

/**
 * ext4_get_group_desc() -- look up the descriptor of a block group
 * @sb:                 super block
 * @block_group:        block group whose descriptor is wanted
 * @bh:                 if not NULL, receives the buffer head of the block
 *                      holding the descriptor
 *
 * Return: pointer to the descriptor inside the descriptor block's data, or
 * NULL (after reporting an ext4 error) when @block_group is out of range
 * or the descriptor block is not loaded.
 */
struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
                                             ext4_group_t block_group,
                                             struct buffer_head **bh)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        struct buffer_head *gdb_bh;
        unsigned int gdb_idx;
        unsigned int slot;

        KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc,
                                   sb, block_group, bh);

        if (block_group >= ngroups) {
                ext4_error(sb, "block_group >= groups_count - block_group = %u,"
                           " groups_count = %u", block_group, ngroups);
                return NULL;
        }

        /* Which descriptor block, and which slot inside that block */
        gdb_idx = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
        slot = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);

        /*
         * sbi_array_rcu_deref returns with rcu unlocked.  That is fine
         * here: the array pointer is not dereferenced again, and going by
         * add_new_gdb() only the array pointer is replaced, not the value
         * read from it, so the buffer head stays valid.
         */
        gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_idx);
        if (!gdb_bh) {
                ext4_error(sb, "Group descriptor not loaded - "
                           "block_group = %u, group_desc = %u, desc = %u",
                           block_group, gdb_idx, slot);
                return NULL;
        }

        if (bh)
                *bh = gdb_bh;

        return (struct ext4_group_desc *)((__u8 *)gdb_bh->b_data +
                                          slot * EXT4_DESC_SIZE(sb));
}

/*
 * Check that every bit past the last cluster of the group is set in the
 * block bitmap.  Returns the index of the first clear padding bit, or 0
 * when the padding is fully set or the bitmap has no padding at all.
 */
static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb,
                                                    ext4_group_t block_group,
                                                    struct buffer_head *bh)
{
        unsigned long nbits = sb->s_blocksize * 8;
        unsigned int first_pad = num_clusters_in_group(sb, block_group);
        ext4_grpblk_t zero_bit;

        /* The group fills the whole bitmap: nothing to check */
        if (nbits <= first_pad)
                return 0;

        zero_bit = ext4_find_next_zero_bit(bh->b_data, nbits, first_pad);
        if (zero_bit < nbits)
                return zero_bit;

        return 0;
}

/*
 * Look up the in-memory group info of 'group' in the two-level
 * s_group_info array.  Returns NULL when 'group' is not below
 * s_groups_count.
 */
struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
                                            ext4_group_t group)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_info **row;
        long row_idx, col_idx;

        if (unlikely(group >= sbi->s_groups_count))
                return NULL;

        row_idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
        col_idx = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
        row = sbi_array_rcu_deref(sbi, s_group_info, row_idx);

        return row[col_idx];
}

/*
 * Return the block number which was discovered to be invalid, or 0 if
 * the block bitmap is valid.
 *
 * The check verifies that the group's own metadata is marked in use in
 * its block bitmap: the block bitmap block, the inode bitmap block and
 * every block of the inode table must lie inside the group and have their
 * bit set in 'bh'.  With the flex_bg feature the check is skipped and 0
 * is returned.
 */
static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
                                            struct ext4_group_desc *desc,
                                            ext4_group_t block_group,
                                            struct buffer_head *bh)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_grpblk_t offset;
        ext4_grpblk_t next_zero_bit;
        ext4_grpblk_t max_bit = EXT4_CLUSTERS_PER_GROUP(sb);
        ext4_fsblk_t blk;
        ext4_fsblk_t group_first_block;

        if (ext4_has_feature_flex_bg(sb)) {
                /* with FLEX_BG, the inode/block bitmaps and itable
                 * blocks may not be in the group at all
                 * so the bitmap validation will be skipped for those groups
                 * or it has to also read the block group where the bitmaps
                 * are located to verify they are set.
                 */
                return 0;
        }
        group_first_block = ext4_group_first_block_no(sb, block_group);

        /* check whether block bitmap block number is set */
        blk = ext4_block_bitmap(sb, desc);
        /* offset is signed, so a block before the group shows up as < 0 */
        offset = blk - group_first_block;
        if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit ||
            !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
                /* bad block bitmap */
                return blk;

        /* check whether the inode bitmap block number is set */
        blk = ext4_inode_bitmap(sb, desc);
        offset = blk - group_first_block;
        if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit ||
            !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
                /* bad block bitmap: inode bitmap block is not marked in use */
                return blk;

        /* check whether the inode table block number is set */
        blk = ext4_inode_table(sb, desc);
        offset = blk - group_first_block;
        /* both ends of the inode table must fall inside the group */
        if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit ||
            EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) >= max_bit)
                return blk;
        /* look for a clear bit anywhere in the inode table's cluster range */
        next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
                        EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1,
                        EXT4_B2C(sbi, offset));
        if (next_zero_bit <
            EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1)
                /* bad bitmap for inode tables */
                return blk;
        return 0;
}

/*
 * Validate a block bitmap buffer once and remember the result in the
 * buffer's verified flag.
 *
 * Under the group lock this checks the bitmap checksum, the metadata bits
 * (ext4_valid_block_bitmap()) and the padding bits
 * (ext4_valid_block_bitmap_padding()).  On any failure the group lock is
 * dropped, an ext4 error is reported and the group's block bitmap is
 * marked corrupted.
 *
 * Returns 0 when the bitmap is (or already was) verified or when fast
 * commit replay is in progress, -EFSBADCRC on a checksum mismatch, and
 * -EFSCORRUPTED when the group info is missing, the bitmap is already
 * flagged corrupt, or a content check fails.
 */
static int ext4_validate_block_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *desc,
                                      ext4_group_t block_group,
                                      struct buffer_head *bh)
{
        ext4_fsblk_t        blk;
        struct ext4_group_info *grp;

        /* No validation while fast commit replay is running */
        if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
                return 0;

        grp = ext4_get_group_info(sb, block_group);

        /* Unlocked fast path: already verified */
        if (buffer_verified(bh))
                return 0;
        if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
                return -EFSCORRUPTED;

        ext4_lock_group(sb, block_group);
        /* Re-check under the lock in case the buffer was verified meanwhile */
        if (buffer_verified(bh))
                goto verified;
        if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) ||
                     ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) {
                ext4_unlock_group(sb, block_group);
                ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                return -EFSBADCRC;
        }
        /* Non-zero blk is the metadata block found to be inconsistent */
        blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
        if (unlikely(blk != 0)) {
                ext4_unlock_group(sb, block_group);
                ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
                           block_group, blk);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                return -EFSCORRUPTED;
        }
        /* Non-zero blk is the index of the first clear padding bit */
        blk = ext4_valid_block_bitmap_padding(sb, block_group, bh);
        if (unlikely(blk != 0)) {
                ext4_unlock_group(sb, block_group);
                ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set",
                           block_group, blk);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                                 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                return -EFSCORRUPTED;
        }
        set_buffer_verified(bh);
verified:
        ext4_unlock_group(sb, block_group);
        return 0;
}

/**
 * ext4_read_block_bitmap_nowait()
 * @sb:                        super block
 * @block_group:        given block group
 * @ignore_locked:        ignore locked buffers
 *
 * Get the block bitmap buffer for a given block_group.  If the bitmap is
 * already up to date it is validated here; if the group is flagged
 * EXT4_BG_BLOCK_UNINIT the bitmap is built in memory; otherwise a read is
 * submitted and the buffer is returned without waiting for it.  In that
 * last case the buffer is marked new, and ext4_wait_block_bitmap() waits
 * for the read and validates the result.
 *
 * Return buffer_head on success, an ERR_PTR in case of failure, or NULL
 * when @ignore_locked is set and the buffer is currently locked.
 */
struct buffer_head *
ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
                              bool ignore_locked)
{
        struct ext4_group_desc *desc;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *bh;
        ext4_fsblk_t bitmap_blk;
        int err;

        KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait,
                                   sb, block_group, ignore_locked);

        desc = ext4_get_group_desc(sb, block_group, NULL);
        if (!desc)
                return ERR_PTR(-EFSCORRUPTED);
        bitmap_blk = ext4_block_bitmap(sb, desc);
        /* The bitmap block must lie past the first data block and inside the fs */
        if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
            (bitmap_blk >= ext4_blocks_count(sbi->s_es))) {
                ext4_error(sb, "Invalid block bitmap block %llu in "
                           "block_group %u", bitmap_blk, block_group);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                return ERR_PTR(-EFSCORRUPTED);
        }
        bh = sb_getblk(sb, bitmap_blk);
        if (unlikely(!bh)) {
                ext4_warning(sb, "Cannot get buffer for block bitmap - "
                             "block_group = %u, block_bitmap = %llu",
                             block_group, bitmap_blk);
                return ERR_PTR(-ENOMEM);
        }

        if (ignore_locked && buffer_locked(bh)) {
                /* buffer under IO already, return if called for prefetching */
                put_bh(bh);
                return NULL;
        }

        if (bitmap_uptodate(bh))
                goto verify;

        lock_buffer(bh);
        /* Re-check now that the buffer lock is held */
        if (bitmap_uptodate(bh)) {
                unlock_buffer(bh);
                goto verify;
        }
        ext4_lock_group(sb, block_group);
        if (ext4_has_group_desc_csum(sb) &&
            (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
                /* Group 0 is never expected to carry the uninit flag */
                if (block_group == 0) {
                        ext4_unlock_group(sb, block_group);
                        unlock_buffer(bh);
                        ext4_error(sb, "Block bitmap for bg 0 marked "
                                   "uninitialized");
                        err = -EFSCORRUPTED;
                        goto out;
                }
                /* Build the bitmap in memory instead of reading it */
                err = ext4_init_block_bitmap(sb, bh, block_group, desc);
                if (err) {
                        ext4_unlock_group(sb, block_group);
                        unlock_buffer(bh);
                        ext4_error(sb, "Failed to init block bitmap for group "
                                   "%u: %d", block_group, err);
                        goto out;
                }
                /* A freshly built bitmap is marked verified without further checks */
                set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
                set_buffer_verified(bh);
                ext4_unlock_group(sb, block_group);
                unlock_buffer(bh);
                return bh;
        }
        ext4_unlock_group(sb, block_group);
        if (buffer_uptodate(bh)) {
                /*
                 * if not uninit if bh is uptodate,
                 * bitmap is also uptodate
                 */
                set_bitmap_uptodate(bh);
                unlock_buffer(bh);
                goto verify;
        }
        /*
         * submit the buffer_head for reading
         */
        /* buffer_new tells ext4_wait_block_bitmap() that a read is pending */
        set_buffer_new(bh);
        trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked);
        ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO |
                            (ignore_locked ? REQ_RAHEAD : 0),
                            ext4_end_bitmap_read);
        return bh;
verify:
        err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
        if (err)
                goto out;
        return bh;
out:
        /* Drop the reference taken by sb_getblk() */
        put_bh(bh);
        return ERR_PTR(err);
}

/*
 * Wait for a pending block bitmap read on 'bh' and validate the result.
 * A buffer that is not marked new has no read pending and is accepted
 * as is.
 *
 * Returns 0 on success, -errno on error.
 */
int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
                           struct buffer_head *bh)
{
        struct ext4_group_desc *gdp;

        KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap,
                                   sb, block_group, bh);

        if (!buffer_new(bh))
                return 0;

        gdp = ext4_get_group_desc(sb, block_group, NULL);
        if (gdp == NULL)
                return -EFSCORRUPTED;

        wait_on_buffer(bh);
        ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
        if (buffer_uptodate(bh)) {
                clear_buffer_new(bh);
                /* Panic or remount fs read-only if block bitmap is invalid */
                return ext4_validate_block_bitmap(sb, gdp, block_group, bh);
        }

        /* The read did not bring the buffer up to date */
        ext4_error_err(sb, EIO, "Cannot read block bitmap - "
                       "block_group = %u, block_bitmap = %llu",
                       block_group, (unsigned long long) bh->b_blocknr);
        ext4_mark_group_bitmap_corrupted(sb, block_group,
                                EXT4_GROUP_INFO_BBITMAP_CORRUPT);
        return -EIO;
}

/*
 * Read the block bitmap of 'block_group' and wait until it is usable.
 * Returns the buffer head, or an ERR_PTR when starting or completing the
 * read fails; on a failed wait the buffer reference is dropped.
 */
struct buffer_head *
ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
        struct buffer_head *bitmap_bh;
        int rc;

        bitmap_bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
        if (IS_ERR(bitmap_bh))
                return bitmap_bh;

        rc = ext4_wait_block_bitmap(sb, block_group, bitmap_bh);
        if (!rc)
                return bitmap_bh;

        put_bh(bitmap_bh);
        return ERR_PTR(rc);
}

/**
 * ext4_has_free_clusters()
 * @sbi:        in-core super block structure.
 * @nclusters:        number of needed clusters
 * @flags:        flags from ext4_mb_new_blocks()
 *
 * Check if filesystem has nclusters free & available for allocation.
 * The request is checked against three progressively smaller reserves:
 * first keeping both the root-reserved and s_resv_clusters pools intact,
 * then (for privileged callers or EXT4_MB_USE_ROOT_BLOCKS) keeping only
 * s_resv_clusters intact, and finally (EXT4_MB_USE_RESERVED) keeping
 * nothing in reserve.
 *
 * On success return 1, return 0 on failure.
 */
static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
                                  s64 nclusters, unsigned int flags)
{
        s64 free_clusters, dirty_clusters, rsv, resv_clusters;
        struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
        struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;

        /* Start from the cheap per-cpu counter reads */
        free_clusters  = percpu_counter_read_positive(fcc);
        dirty_clusters = percpu_counter_read_positive(dcc);
        resv_clusters = atomic64_read(&sbi->s_resv_clusters);

        /*
         * r_blocks_count should always be multiple of the cluster ratio so
         * we are safe to do a plain bit shift only.
         */
        rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
              resv_clusters;

        /* Close to the limit: redo the reads with the summing variants */
        if (free_clusters - (nclusters + rsv + dirty_clusters) <
                                        EXT4_FREECLUSTERS_WATERMARK) {
                free_clusters  = percpu_counter_sum_positive(fcc);
                dirty_clusters = percpu_counter_sum_positive(dcc);
        }
        /* Check whether we have space after accounting for current
         * dirty clusters & root reserved clusters.
         */
        if (free_clusters >= (rsv + nclusters + dirty_clusters))
                return 1;

        /* Hm, nope.  Are (enough) root reserved clusters available? */
        if (uid_eq(sbi->s_resuid, current_fsuid()) ||
            (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
            capable(CAP_SYS_RESOURCE) ||
            (flags & EXT4_MB_USE_ROOT_BLOCKS)) {

                /* Only s_resv_clusters is held back here */
                if (free_clusters >= (nclusters + dirty_clusters +
                                      resv_clusters))
                        return 1;
        }
        /* No free blocks. Let's see if we can dip into reserved pool */
        if (flags & EXT4_MB_USE_RESERVED) {
                if (free_clusters >= (nclusters + dirty_clusters))
                        return 1;
        }

        return 0;
}

/*
 * Reserve 'nclusters' clusters by adding them to the dirty clusters
 * counter.  Returns 0 when ext4_has_free_clusters() accepts the request
 * and -ENOSPC otherwise.
 */
int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
                             s64 nclusters, unsigned int flags)
{
        if (!ext4_has_free_clusters(sbi, nclusters, flags))
                return -ENOSPC;

        percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
        return 0;
}

/**
 * ext4_should_retry_alloc() - check if a block allocation should be retried
 * @sb:                 superblock
 * @retries:            number of retry attempts made so far; incremented
 *                      by this call
 *
 * ext4_should_retry_alloc() is called when ENOSPC is returned while
 * attempting to allocate blocks.  If freed blocks are pending, a journal
 * commit is forced and the caller is told to retry.  If nothing is
 * pending, the discard work is flushed (when the DISCARD option is set)
 * and the answer depends on whether one cluster is now available.
 *
 * Return: non-zero if the allocation should be retried, 0 if there is no
 * journal, the retry limit of 3 has been exceeded, or no space was found.
 */
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        if (!sbi->s_journal)
                return 0;

        *retries += 1;
        if (*retries > 3) {
                percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit);
                return 0;
        }

        smp_mb();
        if (sbi->s_mb_free_pending != 0) {
                /*
                 * it's possible we've just missed a transaction commit
                 * here, so ignore the returned status
                 */
                ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id);
                (void) jbd2_journal_force_commit_nested(sbi->s_journal);
                return 1;
        }

        /*
         * if there's no indication that blocks are about to be freed it's
         * possible we just missed a transaction commit that did so
         */
        if (test_opt(sb, DISCARD)) {
                atomic_inc(&sbi->s_retry_alloc_pending);
                flush_work(&sbi->s_discard_work);
                atomic_dec(&sbi->s_retry_alloc_pending);
        }
        return ext4_has_free_clusters(sbi, 1, 0);
}

/**
 * ext4_new_meta_blocks() -- allocate blocks for metadata (indexing) blocks
 * @handle:             handle to this transaction
 * @inode:              file inode
 * @goal:               given target block (filesystem wide)
 * @flags:              allocation flags, passed on in the allocation request
 * @count:              in: requested length, out: length actually
 *                      allocated; may be NULL, in which case a length of
 *                      one is requested
 * @errp:               receives the error code
 *
 * Return: 1st allocated block number on success.  The allocated length is
 * stored in *@count and the error code in *@errp.
 */
ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                                  ext4_fsblk_t goal, unsigned int flags,
                                  unsigned long *count, int *errp)
{
        struct ext4_allocation_request ar;
        ext4_fsblk_t first_blk;

        /* Describe the request; fields not set below stay zero */
        memset(&ar, 0, sizeof(ar));
        ar.inode = inode;
        ar.goal = goal;
        ar.flags = flags;
        if (count)
                ar.len = *count;
        else
                ar.len = 1;

        first_blk = ext4_mb_new_blocks(handle, &ar, errp);
        if (count)
                *count = ar.len;

        if (*errp || !(flags & EXT4_MB_DELALLOC_RESERVED))
                return first_blk;

        /*
         * Account for the allocated meta blocks.  We will never
         * fail EDQUOT for metadata, but we do account for it.
         */
        dquot_alloc_block_nofail(inode,
                        EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
        return first_blk;
}

/**
 * ext4_count_free_clusters() -- count filesystem free clusters
 * @sb:                superblock
 *
 * Adds up the number of free clusters from each block group, as stored in
 * the group descriptors.  Groups without a descriptor and groups whose
 * block bitmap is flagged corrupt are left out of the descriptor sum.
 *
 * With EXT4FS_DEBUG defined, each group's bitmap is also read and its free
 * bits counted; the per-group and total figures are printed and the
 * bitmap-derived total is returned instead of the descriptor sum.
 */
ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
{
        ext4_fsblk_t desc_count;
        struct ext4_group_desc *gdp;
        ext4_group_t i;
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        struct ext4_group_info *grp;
#ifdef EXT4FS_DEBUG
        struct ext4_super_block *es;
        ext4_fsblk_t bitmap_count;
        unsigned int x;
        struct buffer_head *bitmap_bh = NULL;

        es = EXT4_SB(sb)->s_es;
        desc_count = 0;
        bitmap_count = 0;
        gdp = NULL;

        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
                grp = NULL;
                /* Group info is only consulted when s_group_info is set */
                if (EXT4_SB(sb)->s_group_info)
                        grp = ext4_get_group_info(sb, i);
                if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
                        desc_count += ext4_free_group_clusters(sb, gdp);
                /* Release the previous group's bitmap before reading the next */
                brelse(bitmap_bh);
                bitmap_bh = ext4_read_block_bitmap(sb, i);
                if (IS_ERR(bitmap_bh)) {
                        bitmap_bh = NULL;
                        continue;
                }

                x = ext4_count_free(bitmap_bh->b_data,
                                    EXT4_CLUSTERS_PER_GROUP(sb) / 8);
                printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
                        i, ext4_free_group_clusters(sb, gdp), x);
                bitmap_count += x;
        }
        brelse(bitmap_bh);
        printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
               ", computed = %llu, %llu\n",
               EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
               desc_count, bitmap_count);
        return bitmap_count;
#else
        desc_count = 0;
        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
                grp = NULL;
                /* Group info is only consulted when s_group_info is set */
                if (EXT4_SB(sb)->s_group_info)
                        grp = ext4_get_group_info(sb, i);
                if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
                        desc_count += ext4_free_group_clusters(sb, gdp);
        }

        return desc_count;
#endif
}

/*
 * Return 1 when 'a' is a power of 'b' with exponent >= 1 (b, b^2, ...),
 * and 0 otherwise.
 */
static inline int test_root(ext4_group_t a, int b)
{
        /* Strip factors of b for as long as a is still larger than b */
        for (; a > b; a /= b) {
                if (a % b)
                        return 0;
        }
        return a == b;
}

/**
 *        ext4_bg_has_super - number of blocks used by the superblock in group
 *        @sb: superblock for filesystem
 *        @group: group number to check
 *
 *        Return the number of blocks used by the superblock (primary or backup)
 *        in this group.  Currently this will be only 0 or 1.
 *
 *        Group 0 always has one.  With sparse_super2 only the groups listed
 *        in s_backup_bgs have a backup.  Without sparse_super every group
 *        has one.  With sparse_super, group 1 and the groups whose number
 *        is a power of 3, 5 or 7 have one.
 */
int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
{
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;

        if (group == 0)
                return 1;

        if (ext4_has_feature_sparse_super2(sb))
                return group == le32_to_cpu(es->s_backup_bgs[0]) ||
                       group == le32_to_cpu(es->s_backup_bgs[1]);

        if (group <= 1 || !ext4_has_feature_sparse_super(sb))
                return 1;

        /* Powers of 3, 5 and 7 are all odd, so even groups are ruled out */
        if ((group & 1) == 0)
                return 0;

        return test_root(group, 3) || test_root(group, 5) ||
               test_root(group, 7);
}

/*
 * Number of group descriptor blocks in 'group' under the meta_bg layout:
 * 1 for the first, second and last group of the group's metagroup, and 0
 * for every other group.
 */
static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
                                        ext4_group_t group)
{
        unsigned long meta_idx = group / EXT4_DESC_PER_BLOCK(sb);
        ext4_group_t meta_first = meta_idx * EXT4_DESC_PER_BLOCK(sb);
        ext4_group_t meta_last = meta_first + EXT4_DESC_PER_BLOCK(sb) - 1;

        return group == meta_first || group == meta_first + 1 ||
               group == meta_last;
}

/*
 * Descriptor-block count for @group in the non-meta_bg layout.  Groups
 * without a superblock copy hold none.  Otherwise the count is
 * s_first_meta_bg when the meta_bg feature is set, and s_gdb_count when
 * it is not.
 */
static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
                                        ext4_group_t group)
{
        if (!ext4_bg_has_super(sb, group))
                return 0;

        if (!ext4_has_feature_meta_bg(sb))
                return EXT4_SB(sb)->s_gdb_count;

        return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
}

/**
 *        ext4_bg_num_gdb - group descriptor blocks stored in a group
 *        @sb: superblock for filesystem
 *        @group: group number to check
 *
 *        Return how many blocks of the group descriptor table (primary or
 *        backup) live in @group.  Groups whose meta group index is at or
 *        beyond s_first_meta_bg on a meta_bg filesystem use the meta_bg
 *        rule; all others use the classic rule.
 */
unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
{
        unsigned long first_meta_bg =
                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
        unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);

        if (ext4_has_feature_meta_bg(sb) && metagroup >= first_meta_bg)
                return ext4_bg_num_gdb_meta(sb, group);

        return ext4_bg_num_gdb_nometa(sb, group);
}

/*
 * Count the filesystem metadata blocks at the start of @block_group:
 * the superblock copy, the group descriptor blocks and, in the
 * non-meta_bg region, the reserved gdt blocks.
 */
unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
                                       ext4_group_t block_group)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned count;

        /* Start with the superblock copy, if this group has one. */
        count = ext4_bg_has_super(sb, block_group);

        /* Groups inside the META_BG region only add their descriptor block. */
        if (ext4_has_feature_meta_bg(sb) &&
            block_group >= le32_to_cpu(sbi->s_es->s_first_meta_bg) *
                           sbi->s_desc_per_block) {
                count += ext4_bg_num_gdb_meta(sb, block_group);
                return count;
        }

        /* Classic layout: gdt backups exist only next to a superblock copy. */
        if (!count)
                return count;

        count += ext4_bg_num_gdb_nometa(sb, block_group);
        count += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
        return count;
}

/*
 * Same as ext4_num_base_meta_blocks(), with the block count converted
 * by EXT4_NUM_B2C().
 */
static unsigned int ext4_num_base_meta_clusters(struct super_block *sb,
                                                ext4_group_t block_group)
{
        unsigned int base_blocks = ext4_num_base_meta_blocks(sb, block_group);

        return EXT4_NUM_B2C(EXT4_SB(sb), base_blocks);
}

/**
 *        ext4_inode_to_goal_block - return a hint for block allocation
 *        @inode: inode for block allocation
 *
 *        Return the ideal location to start allocating blocks for a
 *        newly created inode.
 */
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        ext4_group_t block_group;
        ext4_grpblk_t colour;
        int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
        ext4_fsblk_t bg_start;
        ext4_fsblk_t last_block;

        block_group = ei->i_block_group;
        if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
                /*
                 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
                 * block groups per flexgroup, reserve the first block
                 * group for directories and special files.  Regular
                 * files will start at the second block group.  This
                 * tends to speed up directory access and improves
                 * fsck times.
                 */
                block_group &= ~(flex_size-1);
                if (S_ISREG(inode->i_mode))
                        block_group++;
        }
        bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

        /*
         * If we are doing delayed allocation, we don't need take
         * colour into account.
         */
        if (test_opt(inode->i_sb, DELALLOC))
                return bg_start;

        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
                colour = (task_pid_nr(current) % 16) *
                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
        else
                colour = (task_pid_nr(current) % 16) *
                        ((last_block - bg_start) / 16);
        return bg_start + colour;
}





















































































































































































































































































    1 




















    1 



















































    1 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/kernel/ptrace.c
 *
 * (C) Copyright 1999 Linus Torvalds
 *
 * Common interfaces for "ptrace()" which we do not want
 * to continually duplicate across every architecture.
 */

#include <linux/capability.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/signal.h>
#include <linux/uio.h>
#include <linux/audit.h>
#include <linux/pid_namespace.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/regset.h>
#include <linux/hw_breakpoint.h>
#include <linux/cn_proc.h>
#include <linux/compat.h>
#include <linux/sched/signal.h>
#include <linux/minmax.h>
#include <linux/syscall_user_dispatch.h>

#include <asm/syscall.h>        /* for syscall_get_* */

/*
 * Read or write another process' address space on behalf of its tracer.
 * The source/target buffer must be in kernel space.  The page tables are
 * not walked here; the work is delegated to access_remote_vm().
 *
 * Returns what access_remote_vm() returns, or 0 when @tsk has no mm or
 * the access is refused.
 */
int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
                     void *buf, int len, unsigned int gup_flags)
{
        struct mm_struct *mm = get_task_mm(tsk);
        int done = 0;

        if (!mm)
                return 0;

        /*
         * Only the current tracer of @tsk may proceed, and a target whose
         * mm is not SUID_DUMP_USER additionally needs ptracer_capable().
         */
        if (tsk->ptrace &&
            (current == tsk->parent) &&
            ((get_dumpable(mm) == SUID_DUMP_USER) ||
             ptracer_capable(tsk, mm->user_ns)))
                done = access_remote_vm(mm, addr, buf, len, gup_flags);

        mmput(mm);
        return done;
}


/*
 * Record @new_parent as the tracer of @child.
 *
 * @child must not already be on a tracer's list (BUG otherwise).  It is
 * added to @new_parent->ptraced, its ->parent is pointed at @new_parent,
 * and the result of get_cred(@ptracer_cred) is stored in
 * @child->ptracer_cred.
 */
void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
                   const struct cred *ptracer_cred)
{
        BUG_ON(!list_empty(&child->ptrace_entry));
        list_add(&child->ptrace_entry, &new_parent->ptraced);
        child->parent = new_parent;
        child->ptracer_cred = get_cred(ptracer_cred);
}

/*
 * Start ptracing @child: @new_parent (the debugger) becomes its parent
 * and @child is moved onto the ptrace list, using the credentials of
 * the current task as the tracer credentials.
 *
 * Must be called with the tasklist lock write-held.
 */
static void ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
        const struct cred *tracer_cred = current_cred();

        __ptrace_link(child, new_parent, tracer_cred);
}

/**
 * __ptrace_unlink - unlink ptracee and restore its execution state
 * @child: ptracee to be unlinked
 *
 * Remove @child from the ptrace list, move it back to the original parent,
 * and restore the execution state so that it conforms to the group stop
 * state.
 *
 * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer
 * exiting.  For PTRACE_DETACH, unless the ptracee has been killed between
 * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED.
 * If the ptracer is exiting, the ptracee can be in any state.
 *
 * After detach, the ptracee should be in a state which conforms to the
 * group stop.  If the group is stopped or in the process of stopping, the
 * ptracee should be put into TASK_STOPPED; otherwise, it should be woken
 * up from TASK_TRACED.
 *
 * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED,
 * it goes through TRACED -> RUNNING -> STOPPED transition which is similar
 * to but in the opposite direction of what happens while attaching to a
 * stopped task.  However, in this direction, the intermediate RUNNING
 * state is not hidden even from the current ptracer and if it immediately
 * re-attaches and performs a WNOHANG wait(2), it may fail.
 *
 * CONTEXT:
 * write_lock_irq(tasklist_lock)
 */
void __ptrace_unlink(struct task_struct *child)
{
        const struct cred *old_cred;
        BUG_ON(!child->ptrace);

        /* Drop the syscall-trace (and, where available, emulation) work bits. */
        clear_task_syscall_work(child, SYSCALL_TRACE);
#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
        clear_task_syscall_work(child, SYSCALL_EMU);
#endif

        /*
         * Give the task back to its real parent, take it off the tracer's
         * list, and release the tracer credentials stored at link time.
         * The pointer is cleared before put_cred() is called on it.
         */
        child->parent = child->real_parent;
        list_del_init(&child->ptrace_entry);
        old_cred = child->ptracer_cred;
        child->ptracer_cred = NULL;
        put_cred(old_cred);

        /* Everything below manipulates ->ptrace/->jobctl under siglock. */
        spin_lock(&child->sighand->siglock);
        child->ptrace = 0;
        /*
         * Clear all pending traps and TRAPPING.  TRAPPING should be
         * cleared regardless of JOBCTL_STOP_PENDING.  Do it explicitly.
         */
        task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK);
        task_clear_jobctl_trapping(child);

        /*
         * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and
         * @child isn't dead.
         */
        if (!(child->flags & PF_EXITING) &&
            (child->signal->flags & SIGNAL_STOP_STOPPED ||
             child->signal->group_stop_count))
                child->jobctl |= JOBCTL_STOP_PENDING;

        /*
         * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
         * @child in the butt.  Note that @resume should be used iff @child
         * is in TASK_TRACED; otherwise, we might unduly disrupt
         * TASK_KILLABLE sleeps.
         */
        if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
                ptrace_signal_wake_up(child, true);

        spin_unlock(&child->sighand->siglock);
}

/*
 * Report whether @task is stopped with a PTRACE_EVENT_EXEC exit code
 * while its ptrace_message no longer equals its pid.
 */
static bool looks_like_a_spurious_pid(struct task_struct *task)
{
        const int exec_stop_code = (PTRACE_EVENT_EXEC << 8) | SIGTRAP;

        /*
         * A mismatch means the tracee changed its pid but the
         * PTRACE_EVENT_EXEC event was not wait()'ed; most probably the
         * debugger targets the old leader which was destroyed in
         * de_thread().
         */
        return task->exit_code == exec_stop_code &&
               task_pid_vnr(task) != task->ptrace_message;
}

/*
 * Make sure nothing, not even SIGKILL, can wake the tracee up.
 *
 * The task is put into this state for the duration of a ptrace
 * operation so that the operation cannot be interrupted.  Returns true
 * if JOBCTL_PTRACE_FROZEN was set on @task, false otherwise.
 */
static bool ptrace_freeze_traced(struct task_struct *task)
{
        bool frozen;

        /* Lockless, nobody but us can set this flag */
        if (task->jobctl & JOBCTL_LISTENING)
                return false;

        spin_lock_irq(&task->sighand->siglock);
        frozen = task_is_traced(task) &&
                 !looks_like_a_spurious_pid(task) &&
                 !__fatal_signal_pending(task);
        if (frozen)
                task->jobctl |= JOBCTL_PTRACE_FROZEN;
        spin_unlock_irq(&task->sighand->siglock);

        return frozen;
}

/*
 * Undo ptrace_freeze_traced(): clear JOBCTL_PTRACE_FROZEN and, if a
 * fatal signal is pending, also clear JOBCTL_TRACED and wake the task.
 */
static void ptrace_unfreeze_traced(struct task_struct *task)
{
        unsigned long flags;

        /*
         * The child may already be awake and may have cleared
         * JOBCTL_PTRACE_FROZEN itself (see ptrace_resume).  It will not
         * set JOBCTL_PTRACE_FROZEN or enter __TASK_TRACED anew.
         */
        if (!lock_task_sighand(task, &flags))
                return;

        task->jobctl &= ~JOBCTL_PTRACE_FROZEN;
        if (__fatal_signal_pending(task)) {
                task->jobctl &= ~JOBCTL_TRACED;
                wake_up_state(task, __TASK_TRACED);
        }

        unlock_task_sighand(task, &flags);
}

/**
 * ptrace_check_attach - verify that a ptracee can take a ptrace request
 * @child: ptracee to check for
 * @ignore_state: don't check whether @child is currently %TASK_TRACED
 *
 * Confirm that @child is ptraced by %current and ready for further
 * ptrace operations.  With @ignore_state %false, @child must also be in
 * %TASK_TRACED state, and on return it is guaranteed to be traced and
 * not executing.  With @ignore_state %true, @child may be in any state.
 *
 * CONTEXT:
 * Grabs and releases tasklist_lock and @child->sighand->siglock.
 *
 * RETURNS:
 * 0 on success, -ESRCH if %child is not ready.
 */
static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
{
        bool ready = false;

        /*
         * Both checks are made under the read lock to close a possible
         * race where someone else was tracing our child and detached
         * between them.  Once the locked check passes, this is our
         * traced child, and only we can change that.
         */
        read_lock(&tasklist_lock);
        if (child->ptrace && child->parent == current) {
                /*
                 * child->sighand can't be NULL, release_task()
                 * does ptrace_unlink() before __exit_signal().
                 */
                ready = ignore_state || ptrace_freeze_traced(child);
        }
        read_unlock(&tasklist_lock);

        if (!ready)
                return -ESRCH;
        if (ignore_state)
                return 0;

        /* The frozen tracee must also have stopped running. */
        if (WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED|TASK_FROZEN)))
                return -ESRCH;

        return 0;
}

/*
 * Check CAP_SYS_PTRACE in @ns, using the non-auditing variant when
 * @mode carries PTRACE_MODE_NOAUDIT.
 */
static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
{
        const bool audited = !(mode & PTRACE_MODE_NOAUDIT);

        if (audited)
                return ns_capable(ns, CAP_SYS_PTRACE);

        return ns_capable_noaudit(ns, CAP_SYS_PTRACE);
}

/*
 * Decide whether current may access @task under ptrace access @mode.
 *
 * @mode must contain exactly one of PTRACE_MODE_FSCREDS and
 * PTRACE_MODE_REALCREDS, which selects the caller ids used for the
 * comparison; PTRACE_MODE_NOAUDIT is honoured by ptrace_has_cap().
 *
 * Returns 0 on success, -errno on denial.
 */
static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
{
        const struct cred *cred = current_cred(), *tcred;
        struct mm_struct *mm;
        kuid_t caller_uid;
        kgid_t caller_gid;

        /* Both flags set, or neither: the caller passed a malformed mode. */
        if (!(mode & PTRACE_MODE_FSCREDS) == !(mode & PTRACE_MODE_REALCREDS)) {
                WARN(1, "denying ptrace access check without PTRACE_MODE_*CREDS\n");
                return -EPERM;
        }

        /* May we inspect the given task?
         * This check is used both for attaching with ptrace
         * and for allowing access to sensitive information in /proc.
         *
         * ptrace_attach denies several cases that /proc allows
         * because setting up the necessary parent/child relationship
         * or halting the specified task is impossible.
         */

        /* Don't let security modules deny introspection */
        if (same_thread_group(task, current))
                return 0;
        rcu_read_lock();
        if (mode & PTRACE_MODE_FSCREDS) {
                caller_uid = cred->fsuid;
                caller_gid = cred->fsgid;
        } else {
                /*
                 * Using the euid would make more sense here, but something
                 * in userland might rely on the old behavior, and this
                 * shouldn't be a security problem since
                 * PTRACE_MODE_REALCREDS implies that the caller explicitly
                 * used a syscall that requests access to another process
                 * (and not a filesystem syscall to procfs).
                 */
                caller_uid = cred->uid;
                caller_gid = cred->gid;
        }
        /*
         * The caller's ids must match the target's effective, saved and
         * real uid and gid; failing that, CAP_SYS_PTRACE in the target's
         * user namespace is required.
         */
        tcred = __task_cred(task);
        if (uid_eq(caller_uid, tcred->euid) &&
            uid_eq(caller_uid, tcred->suid) &&
            uid_eq(caller_uid, tcred->uid)  &&
            gid_eq(caller_gid, tcred->egid) &&
            gid_eq(caller_gid, tcred->sgid) &&
            gid_eq(caller_gid, tcred->gid))
                goto ok;
        if (ptrace_has_cap(tcred->user_ns, mode))
                goto ok;
        rcu_read_unlock();
        return -EPERM;
ok:
        rcu_read_unlock();
        /*
         * If a task drops privileges and becomes nondumpable (through a syscall
         * like setresuid()) while we are trying to access it, we must ensure
         * that the dumpability is read after the credentials; otherwise,
         * we may be able to attach to a task that we shouldn't be able to
         * attach to (as if the task had dropped privileges without becoming
         * nondumpable).
         * Pairs with a write barrier in commit_creds().
         */
        smp_rmb();
        /*
         * A target with an mm that is not SUID_DUMP_USER additionally
         * requires CAP_SYS_PTRACE in mm->user_ns.  A task without an mm
         * skips this check.
         */
        mm = task->mm;
        if (mm &&
            ((get_dumpable(mm) != SUID_DUMP_USER) &&
             !ptrace_has_cap(mm->user_ns, mode)))
            return -EPERM;

        /* The security module hook has the final say. */
        return security_ptrace_access_check(task, mode);
}

/*
 * Locked wrapper around __ptrace_may_access(): takes task_lock() on
 * @task for the check and returns true when access is granted.
 */
bool ptrace_may_access(struct task_struct *task, unsigned int mode)
{
        bool allowed;

        task_lock(task);
        allowed = (__ptrace_may_access(task, mode) == 0);
        task_unlock(task);

        return allowed;
}

/*
 * Validate a PTRACE_O_* option word.
 *
 * Returns 0 if @data is acceptable, -EINVAL for bits outside
 * PTRACE_O_MASK or for PTRACE_O_SUSPEND_SECCOMP on a kernel built
 * without CONFIG_CHECKPOINT_RESTORE or CONFIG_SECCOMP, and -EPERM when
 * the caller may not use PTRACE_O_SUSPEND_SECCOMP.
 */
static int check_ptrace_options(unsigned long data)
{
        if (data & ~(unsigned long)PTRACE_O_MASK)
                return -EINVAL;

        /* Only the seccomp-suspend option needs further scrutiny. */
        if (!unlikely(data & PTRACE_O_SUSPEND_SECCOMP))
                return 0;

        if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
            !IS_ENABLED(CONFIG_SECCOMP))
                return -EINVAL;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        /* The requester must not itself run under, or have suspended, seccomp. */
        if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
            current->ptrace & PT_SUSPEND_SECCOMP)
                return -EPERM;

        return 0;
}

/*
 * Bring a freshly attached @task into the stop state its tracer
 * expects.  Runs entirely under @task's siglock, which the guard
 * releases on every return path.
 */
static inline void ptrace_set_stopped(struct task_struct *task, bool seize)
{
        guard(spinlock)(&task->sighand->siglock);

        /* SEIZE doesn't trap tracee on attach */
        if (!seize)
                send_signal_locked(SIGSTOP, SEND_SIG_PRIV, task, PIDTYPE_PID);

        /*
         * This task_is_stopped() test is safe as both transitions in
         * and out of STOPPED are protected by siglock.
         */
        if (!task_is_stopped(task))
                return;

        /*
         * The task is already STOPPED: set JOBCTL_TRAP_STOP and
         * TRAPPING, then kick it so that it transits to TRACED.
         * TRAPPING is cleared once the child completes the transition,
         * or when any event that clears the group stop states happens.
         * ptrace_attach() waits on JOBCTL_TRAPPING_BIT after this
         * function returns.
         *
         * This hides the STOPPED -> RUNNING -> TRACED transition from
         * the attaching thread, but a different thread in the same
         * group can still observe the transient RUNNING state.  IOW, if
         * another thread's WNOHANG wait(2) on the stopped tracee races
         * against ATTACH, the wait(2) may fail due to the transient
         * RUNNING.
         */
        if (!task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
                return;

        task->jobctl &= ~JOBCTL_STOPPED;
        signal_wake_up_state(task, __TASK_STOPPED);
}

/*
 * Make current the tracer of @task.
 *
 * @request: PTRACE_SEIZE selects seize semantics; any other value is
 *           handled as a plain attach.
 * @addr:    must be 0 for PTRACE_SEIZE; not examined otherwise.
 * @flags:   PTRACE_O_* options for PTRACE_SEIZE; ignored (overwritten
 *           with PT_PTRACED) otherwise.
 *
 * Returns 0 on success.  Fails with -EIO for a bad @addr or unknown
 * option bits on seize, with the error from check_ptrace_options() or
 * __ptrace_may_access(), with -EPERM for kernel threads, tasks in the
 * caller's own thread group, exiting tasks and tasks that are already
 * traced, and with -ERESTARTNOINTR when the interruptible acquisition
 * of cred_guard_mutex does not succeed.
 */
static int ptrace_attach(struct task_struct *task, long request,
                         unsigned long addr,
                         unsigned long flags)
{
        bool seize = (request == PTRACE_SEIZE);
        int retval;

        if (seize) {
                if (addr != 0)
                        return -EIO;
                /*
                 * This duplicates the check in check_ptrace_options() because
                 * ptrace_attach() and ptrace_setoptions() have historically
                 * used different error codes for unknown ptrace options.
                 */
                if (flags & ~(unsigned long)PTRACE_O_MASK)
                        return -EIO;

                retval = check_ptrace_options(flags);
                if (retval)
                        return retval;
                /* From here on @flags holds the value to store in task->ptrace. */
                flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
        } else {
                flags = PT_PTRACED;
        }

        audit_ptrace(task);

        if (unlikely(task->flags & PF_KTHREAD))
                return -EPERM;
        if (same_thread_group(task, current))
                return -EPERM;

        /*
         * Protect exec's credential calculations against our interference;
         * SUID, SGID and LSM creds get determined differently
         * under ptrace.
         */
        scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR,
                           &task->signal->cred_guard_mutex) {

                /* Permission check runs under task_lock(). */
                scoped_guard (task_lock, task) {
                        retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
                        if (retval)
                                return retval;
                }

                /* Linking requires tasklist_lock held for writing. */
                scoped_guard (write_lock_irq, &tasklist_lock) {
                        if (unlikely(task->exit_state))
                                return -EPERM;
                        if (task->ptrace)
                                return -EPERM;

                        task->ptrace = flags;
                        ptrace_link(task, current);
                        ptrace_set_stopped(task, seize);
                }
        }

        /*
         * We do not bother to change retval or clear JOBCTL_TRAPPING
         * if wait_on_bit() was interrupted by SIGKILL. The tracer will
         * not return to user-mode, it will exit and clear this bit in
         * __ptrace_unlink() if it wasn't already cleared by the tracee;
         * and until then nobody can ptrace this task.
         */
        wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
        proc_ptrace_connector(task, PTRACE_ATTACH);

        return 0;
}

/**
 * ptrace_traceme  --  helper for PTRACE_TRACEME
 *
 * Runs the checks and, when they pass, sets PT_PTRACED on the current
 * task and links it to its real parent.
 * Should be used by all ptrace implementations for PTRACE_TRACEME.
 *
 * Returns -EPERM if the caller is already traced, otherwise the result
 * of security_ptrace_traceme().
 */
static int ptrace_traceme(void)
{
        int ret = -EPERM;

        write_lock_irq(&tasklist_lock);

        /* Are we already being traced? */
        if (current->ptrace)
                goto out;

        ret = security_ptrace_traceme(current->parent);
        /*
         * PF_EXITING is checked so that ->real_parent has not already
         * passed exit_ptrace().  If it has, no error is reported; we
         * pretend ->real_parent untraces us right after return.
         */
        if (ret || (current->real_parent->flags & PF_EXITING))
                goto out;

        current->ptrace = PT_PTRACED;
        ptrace_link(current, current->real_parent);
out:
        write_unlock_irq(&tasklist_lock);

        return ret;
}

/*
 * Called with irqs disabled.  Returns true if children should reap
 * themselves, i.e. SIGCHLD is set to SIG_IGN or SA_NOCLDWAIT is set.
 */
static int ignoring_children(struct sighand_struct *sigh)
{
        int self_reap = 1;

        spin_lock(&sigh->siglock);
        if (sigh->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
            !(sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))
                self_reap = 0;
        spin_unlock(&sigh->siglock);

        return self_reap;
}

/*
 * Called with tasklist_lock held for writing.
 * Unlink the traced task @p from @tracer and, if it was a traced zombie,
 * clean it up.  Returns true if the caller must reap it with
 * release_task().  (release_task() can't be called from here because
 * tasklist_lock is already held.)
 *
 * For a zombie, being attached prevented the normal parent notification
 * or self-reaping, so the notification that would have happened earlier
 * is done now.  If the task should reap itself, true is returned.
 *
 * If the zombie is our own child there is nothing to notify.  But if
 * our normal children self-reap, ptrace kept this one from doing so and
 * it must be reaped now; in that case sub-threads sleeping in do_wait()
 * must be woken up too.
 */
static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
{
        bool reap = false;

        __ptrace_unlink(p);

        if (p->exit_state != EXIT_ZOMBIE)
                return false;

        if (!thread_group_leader(p)) {
                reap = true;
        } else if (thread_group_empty(p)) {
                if (!same_thread_group(p->real_parent, tracer)) {
                        reap = do_notify_parent(p, p->exit_signal);
                } else if (ignoring_children(tracer->sighand)) {
                        __wake_up_parent(p, tracer);
                        reap = true;
                }
        }

        /* Mark it as in the process of being reaped. */
        if (reap)
                p->exit_state = EXIT_DEAD;

        return reap;
}

/*
 * Detach the current task from @child (PTRACE_DETACH).
 *
 * @child: the tracee to detach from.
 * @data:  value stored in child->exit_code; must pass valid_signal().
 *
 * Returns -EIO if @data is not a valid signal number, 0 otherwise.
 */
static int ptrace_detach(struct task_struct *child, unsigned int data)
{
        if (!valid_signal(data))
                return -EIO;

        /* Architecture-specific hardware disable .. */
        ptrace_disable(child);

        write_lock_irq(&tasklist_lock);
        /*
         * We rely on ptrace_freeze_traced(). It can't be killed and
         * untraced by another thread, it can't be a zombie.
         */
        WARN_ON(!child->ptrace || child->exit_state);
        /*
         * tasklist_lock avoids the race with wait_task_stopped(), see
         * the comment in ptrace_resume().
         */
        child->exit_code = data;
        /*
         * The return value is ignored: __ptrace_detach() only returns true
         * for a zombie, which the WARN_ON above asserts cannot be the case.
         */
        __ptrace_detach(current, child);
        write_unlock_irq(&tasklist_lock);

        /* Report the detach through the proc connector, outside the lock. */
        proc_ptrace_connector(child, PTRACE_DETACH);

        return 0;
}

/*
 * Detach every task on @tracer's ptraced list.  Called with tasklist_lock
 * held for writing.
 *
 * Tracees attached with PT_EXITKILL are sent SIGKILL first.  Tracees that
 * __ptrace_detach() reports as needing release_task() are collected on
 * @dead through their ptrace_entry link.
 */
void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
{
        struct task_struct *tracee, *next;

        list_for_each_entry_safe(tracee, next, &tracer->ptraced, ptrace_entry) {
                bool needs_reap;

                if (unlikely(tracee->ptrace & PT_EXITKILL))
                        send_sig_info(SIGKILL, SEND_SIG_PRIV, tracee);

                needs_reap = __ptrace_detach(tracer, tracee);
                if (!needs_reap)
                        continue;

                list_add(&tracee->ptrace_entry, dead);
        }
}

/*
 * Copy up to @len bytes from address @src in @tsk's address space to the
 * user buffer @dst, bouncing through a 128-byte on-stack buffer.
 *
 * Returns the number of bytes copied.  If the very first access to the
 * tracee's memory transfers nothing, -EIO is returned; a later short
 * access simply ends the copy.  A fault while writing to @dst returns
 * -EFAULT.
 */
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
{
        int total = 0;

        while (len > 0) {
                char chunk[128];
                int want = len;
                int got;

                /* Never ask for more than the bounce buffer can hold. */
                if (want > sizeof(chunk))
                        want = sizeof(chunk);

                got = ptrace_access_vm(tsk, src, chunk, want, FOLL_FORCE);
                if (!got)
                        return total ? total : -EIO;

                if (copy_to_user(dst, chunk, got))
                        return -EFAULT;

                total += got;
                src += got;
                dst += got;
                len -= got;
        }

        return total;
}

/*
 * Copy up to @len bytes from the user buffer @src to address @dst in
 * @tsk's address space, bouncing through a 128-byte on-stack buffer.
 *
 * Returns the number of bytes written.  If the very first access to the
 * tracee's memory transfers nothing, -EIO is returned; a later short
 * access simply ends the copy.  A fault while reading from @src returns
 * -EFAULT.
 */
int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len)
{
        int total = 0;

        while (len > 0) {
                char chunk[128];
                int want = len;
                int done;

                /* Never fetch more than the bounce buffer can hold. */
                if (want > sizeof(chunk))
                        want = sizeof(chunk);

                if (copy_from_user(chunk, src, want))
                        return -EFAULT;

                done = ptrace_access_vm(tsk, dst, chunk, want,
                                        FOLL_FORCE | FOLL_WRITE);
                if (!done)
                        return total ? total : -EIO;

                total += done;
                src += done;
                dst += done;
                len -= done;
        }

        return total;
}

/*
 * Replace the PTRACE_O_* option bits kept in child->ptrace with @data.
 *
 * Returns whatever check_ptrace_options() reports if it rejects @data,
 * 0 on success.
 */
static int ptrace_setoptions(struct task_struct *child, unsigned long data)
{
        unsigned updated;
        int err = check_ptrace_options(data);

        if (err)
                return err;

        /*
         * Build the new value in a local and store it once, so that no
         * intermediate state with all options cleared is ever visible.
         */
        updated = child->ptrace & ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
        updated |= data << PT_OPT_FLAG_SHIFT;
        child->ptrace = updated;

        return 0;
}

/*
 * Copy @child's last_siginfo into @info under the sighand lock.
 *
 * Returns 0 on success, -ESRCH if the sighand lock cannot be taken and
 * -EINVAL if @child has no last_siginfo.
 */
static int ptrace_getsiginfo(struct task_struct *child, kernel_siginfo_t *info)
{
        unsigned long irqflags;
        int rc;

        if (!lock_task_sighand(child, &irqflags))
                return -ESRCH;

        if (likely(child->last_siginfo != NULL)) {
                copy_siginfo(info, child->last_siginfo);
                rc = 0;
        } else {
                rc = -EINVAL;
        }

        unlock_task_sighand(child, &irqflags);
        return rc;
}

/*
 * Overwrite @child's last_siginfo with @info under the sighand lock.
 *
 * Returns 0 on success, -ESRCH if the sighand lock cannot be taken and
 * -EINVAL if @child has no last_siginfo to overwrite.
 */
static int ptrace_setsiginfo(struct task_struct *child, const kernel_siginfo_t *info)
{
        unsigned long irqflags;
        int rc;

        if (!lock_task_sighand(child, &irqflags))
                return -ESRCH;

        if (likely(child->last_siginfo != NULL)) {
                copy_siginfo(child->last_siginfo, info);
                rc = 0;
        } else {
                rc = -EINVAL;
        }

        unlock_task_sighand(child, &irqflags);
        return rc;
}

/*
 * PTRACE_PEEKSIGINFO: copy siginfo entries queued for @child to user space.
 * Entries are only read from the pending list, never removed.
 *
 * @child: the tracee whose pending signals are inspected.
 * @addr:  user pointer to a struct ptrace_peeksiginfo_args (flags, off, nr).
 * @data:  user buffer that receives the siginfo entries back to back.
 *
 * Returns the number of entries copied if at least one was copied (even if
 * a later copy faulted).  Otherwise returns 0 when nothing was found,
 * -EFAULT if the arguments or the first entry could not be copied, or
 * -EINVAL for unknown flags or a negative count.
 */
static int ptrace_peek_siginfo(struct task_struct *child,
                                unsigned long addr,
                                unsigned long data)
{
        struct ptrace_peeksiginfo_args arg;
        struct sigpending *pending;
        struct sigqueue *q;
        int ret, i;

        ret = copy_from_user(&arg, (void __user *) addr,
                                sizeof(struct ptrace_peeksiginfo_args));
        if (ret)
                return -EFAULT;

        if (arg.flags & ~PTRACE_PEEKSIGINFO_SHARED)
                return -EINVAL; /* unknown flags */

        if (arg.nr < 0)
                return -EINVAL;

        /* Ensure arg.off fits in an unsigned long */
        if (arg.off > ULONG_MAX)
                return 0;

        /* Pick the process-wide shared queue or the per-task queue. */
        if (arg.flags & PTRACE_PEEKSIGINFO_SHARED)
                pending = &child->signal->shared_pending;
        else
                pending = &child->pending;

        /* i only advances after an entry has been copied out successfully. */
        for (i = 0; i < arg.nr; ) {
                kernel_siginfo_t info;
                unsigned long off = arg.off + i;
                bool found = false;

                /*
                 * Walk to the entry at position 'off' and snapshot it on the
                 * stack, so siglock is not held across the copy to user.
                 */
                spin_lock_irq(&child->sighand->siglock);
                list_for_each_entry(q, &pending->list, list) {
                        if (!off--) {
                                found = true;
                                copy_siginfo(&info, &q->info);
                                break;
                        }
                }
                spin_unlock_irq(&child->sighand->siglock);

                if (!found) /* beyond the end of the list */
                        break;

#ifdef CONFIG_COMPAT
                if (unlikely(in_compat_syscall())) {
                        compat_siginfo_t __user *uinfo = compat_ptr(data);

                        if (copy_siginfo_to_user32(uinfo, &info)) {
                                ret = -EFAULT;
                                break;
                        }

                } else
#endif
                {
                        siginfo_t __user *uinfo = (siginfo_t __user *) data;

                        if (copy_siginfo_to_user(uinfo, &info)) {
                                ret = -EFAULT;
                                break;
                        }
                }

                /*
                 * The native siginfo_t size is used as the stride on both
                 * paths.
                 * NOTE(review): presumably compat_siginfo_t has the same
                 * size - confirm.
                 */
                data += sizeof(siginfo_t);
                i++;

                /* Stop early, keeping what was copied, if we got a signal. */
                if (signal_pending(current))
                        break;

                cond_resched();
        }

        /* A partial result takes precedence over a late error. */
        if (i > 0)
                return i;

        return ret;
}

#ifdef CONFIG_RSEQ
/*
 * PTRACE_GET_RSEQ_CONFIGURATION: report @task's rseq pointer, length and
 * signature to the user buffer @data.
 *
 * At most @size bytes are written.  The return value is always the full
 * size of struct ptrace_rseq_configuration, or -EFAULT if the copy to
 * user space fails.
 */
static long ptrace_get_rseq_configuration(struct task_struct *task,
                                          unsigned long size, void __user *data)
{
        struct ptrace_rseq_configuration cfg = {
                .rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
                .rseq_abi_size = task->rseq_len,
                .signature = task->rseq_sig,
                .flags = 0,
        };
        unsigned long nbytes = sizeof(cfg);

        /* Truncate to the caller's buffer if it is smaller than the struct. */
        if (size < nbytes)
                nbytes = size;

        if (copy_to_user(data, &cfg, nbytes))
                return -EFAULT;

        return sizeof(cfg);
}
#endif

/*
 * Request classifiers used by ptrace_resume().  When the corresponding
 * request macro is not defined, the classifier expands to the constant 0
 * so the dependent branch can never be taken.
 */
#define is_singlestep(request)                ((request) == PTRACE_SINGLESTEP)

#ifdef PTRACE_SINGLEBLOCK
#define is_singleblock(request)                ((request) == PTRACE_SINGLEBLOCK)
#else
#define is_singleblock(request)                0
#endif

/* Keyed on PTRACE_SYSEMU being defined, not on PTRACE_SYSEMU_SINGLESTEP. */
#ifdef PTRACE_SYSEMU
#define is_sysemu_singlestep(request)        ((request) == PTRACE_SYSEMU_SINGLESTEP)
#else
#define is_sysemu_singlestep(request)        0
#endif

/*
 * Common handler for the requests that wake the tracee up again; see the
 * case list that calls this in ptrace_request() (PTRACE_CONT,
 * PTRACE_SYSCALL, PTRACE_SINGLESTEP and, where defined, the SINGLEBLOCK
 * and SYSEMU variants).
 *
 * @child:   the tracee to resume.
 * @request: the ptrace request; selects syscall tracing/emulation and the
 *           stepping mode.
 * @data:    value stored in child->exit_code; must pass valid_signal().
 *
 * Returns -EIO if @data is not a valid signal number or if the requested
 * stepping mode is not available, 0 otherwise.  Note that the syscall work
 * flags are updated before the stepping checks, so they are already
 * changed when a stepping check returns -EIO.
 */
static int ptrace_resume(struct task_struct *child, long request,
                         unsigned long data)
{
        if (!valid_signal(data))
                return -EIO;

        /* Syscall tracing is on only for PTRACE_SYSCALL, off otherwise. */
        if (request == PTRACE_SYSCALL)
                set_task_syscall_work(child, SYSCALL_TRACE);
        else
                clear_task_syscall_work(child, SYSCALL_TRACE);

#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
        /* Likewise, syscall emulation is on only for the SYSEMU requests. */
        if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP)
                set_task_syscall_work(child, SYSCALL_EMU);
        else
                clear_task_syscall_work(child, SYSCALL_EMU);
#endif

        /* Select block-step, single-step, or no stepping at all. */
        if (is_singleblock(request)) {
                if (unlikely(!arch_has_block_step()))
                        return -EIO;
                user_enable_block_step(child);
        } else if (is_singlestep(request) || is_sysemu_singlestep(request)) {
                if (unlikely(!arch_has_single_step()))
                        return -EIO;
                user_enable_single_step(child);
        } else {
                user_disable_single_step(child);
        }

        /*
         * Change ->exit_code and ->state under siglock to avoid the race
         * with wait_task_stopped() in between; a non-zero ->exit_code will
         * wrongly look like another report from tracee.
         *
         * Note that we need siglock even if ->exit_code == data and/or this
         * status was not reported yet, the new status must not be cleared by
         * wait_task_stopped() after resume.
         */
        spin_lock_irq(&child->sighand->siglock);
        child->exit_code = data;
        child->jobctl &= ~JOBCTL_TRACED;
        wake_up_state(child, __TASK_TRACED);
        spin_unlock_irq(&child->sighand->siglock);

        return 0;
}

#ifdef CONFIG_HAVE_ARCH_TRACEHOOK

/*
 * Look up the regset in @view whose core_note_type equals @type.
 *
 * Returns a pointer to the first matching entry of view->regsets, or NULL
 * if none of the view->n entries matches.
 */
static const struct user_regset *
find_regset(const struct user_regset_view *view, unsigned int type)
{
        int idx = 0;

        while (idx < view->n) {
                const struct user_regset *candidate = &view->regsets[idx];

                if (candidate->core_note_type == type)
                        return candidate;
                idx++;
        }

        return NULL;
}

static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
                         struct iovec *kiov)
{
        const struct user_regset_view *view = task_user_regset_view(task);
        const struct user_regset *regset = find_regset(view, type);
        int regset_no;

        if (!regset || (kiov->iov_len % regset->size) != 0)
                return -EINVAL;

        regset_no = regset - view->regsets;
        kiov->iov_len = min(kiov->iov_len,
                            (__kernel_size_t) (regset->n * regset->size));

        if (req == PTRACE_GETREGSET)
                return copy_regset_to_user(task, view, regset_no, 0,
                                           kiov->iov_len, kiov->iov_base);
        else
                return copy_regset_from_user(task, view, regset_no, 0,
                                             kiov->iov_len, kiov->iov_base);
}

/*
 * This is declared in linux/regset.h and defined in machine-dependent
 * code.  We put the export here, near the primary machine-neutral use,
 * to ensure no machine forgets it.
 */
EXPORT_SYMBOL_GPL(task_user_regset_view);

/*
 * Fill @info for a syscall-entry stop: the op code, the syscall number and
 * the syscall arguments taken from @regs.
 *
 * Returns the number of bytes of struct ptrace_syscall_info that are
 * meaningful for this op, i.e. the offset of the end of entry.args.
 */
static unsigned long
ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs,
                              struct ptrace_syscall_info *info)
{
        unsigned long raw_args[ARRAY_SIZE(info->entry.args)];
        unsigned int idx;

        info->op = PTRACE_SYSCALL_INFO_ENTRY;
        info->entry.nr = syscall_get_nr(child, regs);

        /* Fetch into unsigned long first, then widen into the info fields. */
        syscall_get_arguments(child, regs, raw_args);
        for (idx = 0; idx < ARRAY_SIZE(raw_args); idx++)
                info->entry.args[idx] = raw_args[idx];

        /* args is the last field in struct ptrace_syscall_info.entry */
        return offsetofend(struct ptrace_syscall_info, entry.args);
}

/*
 * Fill @info for a seccomp stop: everything an entry stop reports, plus
 * the tracee's ptrace_message as seccomp.ret_data.
 *
 * Returns the number of bytes of struct ptrace_syscall_info that are
 * meaningful for this op, i.e. the offset of the end of seccomp.ret_data.
 */
static unsigned long
ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs,
                                struct ptrace_syscall_info *info)
{
        /* ret_data is the last field in struct ptrace_syscall_info.seccomp */
        const unsigned long used =
                offsetofend(struct ptrace_syscall_info, seccomp.ret_data);

        /*
         * struct ptrace_syscall_info.entry is currently a subset of
         * struct ptrace_syscall_info.seccomp, so let the entry helper
         * initialize the shared part and override the op afterwards.
         * This can be reconsidered if the two layouts ever diverge
         * significantly.
         */
        ptrace_get_syscall_info_entry(child, regs, info);
        info->seccomp.ret_data = child->ptrace_message;
        info->op = PTRACE_SYSCALL_INFO_SECCOMP;

        return used;
}

/*
 * Fill @info for a syscall-exit stop.
 *
 * If syscall_get_error() reports a non-zero error, that value becomes
 * exit.rval and exit.is_error is set; otherwise exit.rval is the syscall
 * return value and exit.is_error is clear.
 *
 * Returns the number of bytes of struct ptrace_syscall_info that are
 * meaningful for this op, i.e. the offset of the end of exit.is_error.
 */
static unsigned long
ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs,
                             struct ptrace_syscall_info *info)
{
        info->op = PTRACE_SYSCALL_INFO_EXIT;

        info->exit.rval = syscall_get_error(child, regs);
        if (info->exit.rval) {
                info->exit.is_error = 1;
        } else {
                info->exit.is_error = 0;
                info->exit.rval = syscall_get_return_value(child, regs);
        }

        /* is_error is the last field in struct ptrace_syscall_info.exit */
        return offsetofend(struct ptrace_syscall_info, exit.is_error);
}

/*
 * PTRACE_GET_SYSCALL_INFO: describe the stop @child is currently in.
 *
 * @child:     the stopped tracee.
 * @user_size: size of the user buffer; at most this many bytes are written.
 * @datavp:    user buffer receiving a struct ptrace_syscall_info.
 *
 * The arch, instruction pointer and stack pointer are always reported.
 * The op-specific part is filled in only for syscall-entry, syscall-exit
 * and seccomp stops; any other stop is reported as PTRACE_SYSCALL_INFO_NONE.
 *
 * Returns the number of meaningful bytes for the reported op (which may
 * exceed @user_size), or -EFAULT if the copy to user space fails.
 */
static int
ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
                        void __user *datavp)
{
        struct pt_regs *regs = task_pt_regs(child);
        struct ptrace_syscall_info info = {
                .op = PTRACE_SYSCALL_INFO_NONE,
                .arch = syscall_get_arch(child),
                .instruction_pointer = instruction_pointer(regs),
                .stack_pointer = user_stack_pointer(regs),
        };
        unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry);
        unsigned long write_size;
        int stop_code;

        /*
         * This does not need lock_task_sighand() to access
         * child->last_siginfo because ptrace_freeze_traced()
         * called earlier by ptrace_check_attach() ensures that
         * the tracee cannot go away and clear its last_siginfo.
         */
        stop_code = child->last_siginfo ? child->last_siginfo->si_code : 0;

        if (stop_code == (SIGTRAP | 0x80)) {
                /* Syscall stop: ptrace_message says entry or exit. */
                switch (child->ptrace_message) {
                case PTRACE_EVENTMSG_SYSCALL_ENTRY:
                        actual_size = ptrace_get_syscall_info_entry(child, regs,
                                                                    &info);
                        break;
                case PTRACE_EVENTMSG_SYSCALL_EXIT:
                        actual_size = ptrace_get_syscall_info_exit(child, regs,
                                                                   &info);
                        break;
                }
        } else if (stop_code == (SIGTRAP | (PTRACE_EVENT_SECCOMP << 8))) {
                actual_size = ptrace_get_syscall_info_seccomp(child, regs,
                                                              &info);
        }

        write_size = min(actual_size, user_size);
        if (copy_to_user(datavp, &info, write_size))
                return -EFAULT;

        return actual_size;
}
#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */

/*
 * Dispatch a ptrace request on @child to its handler.
 *
 * @child:   the tracee the request operates on.
 * @request: the PTRACE_* request code.
 * @addr:    request-specific argument (an address, a size or a selector).
 * @data:    request-specific argument (a value or a user pointer).
 *
 * Returns the handler's result.  Requests that are not recognised here
 * return -EIO, the initial value of 'ret'.
 *
 * NOTE(review): presumably reached from arch_ptrace() for requests the
 * architecture does not handle itself - confirm against the callers; within
 * this file it is called from the default case of compat_ptrace_request().
 */
int ptrace_request(struct task_struct *child, long request,
                   unsigned long addr, unsigned long data)
{
        bool seized = child->ptrace & PT_SEIZED;
        int ret = -EIO;
        kernel_siginfo_t siginfo, *si;
        /* @data viewed as a generic user pointer and as a user word pointer. */
        void __user *datavp = (void __user *) data;
        unsigned long __user *datalp = datavp;
        unsigned long flags;

        switch (request) {
        case PTRACE_PEEKTEXT:
        case PTRACE_PEEKDATA:
                return generic_ptrace_peekdata(child, addr, data);
        case PTRACE_POKETEXT:
        case PTRACE_POKEDATA:
                return generic_ptrace_pokedata(child, addr, data);

#ifdef PTRACE_OLDSETOPTIONS
        case PTRACE_OLDSETOPTIONS:
#endif
        case PTRACE_SETOPTIONS:
                ret = ptrace_setoptions(child, data);
                break;
        case PTRACE_GETEVENTMSG:
                ret = put_user(child->ptrace_message, datalp);
                break;

        case PTRACE_PEEKSIGINFO:
                ret = ptrace_peek_siginfo(child, addr, data);
                break;

        case PTRACE_GETSIGINFO:
                /* Snapshot into a kernel copy first, then copy out. */
                ret = ptrace_getsiginfo(child, &siginfo);
                if (!ret)
                        ret = copy_siginfo_to_user(datavp, &siginfo);
                break;

        case PTRACE_SETSIGINFO:
                ret = copy_siginfo_from_user(&siginfo, datavp);
                if (!ret)
                        ret = ptrace_setsiginfo(child, &siginfo);
                break;

        case PTRACE_GETSIGMASK: {
                sigset_t *mask;

                /* @addr carries the caller's idea of sizeof(sigset_t). */
                if (addr != sizeof(sigset_t)) {
                        ret = -EINVAL;
                        break;
                }

                /* Report the saved mask if one is waiting to be restored. */
                if (test_tsk_restore_sigmask(child))
                        mask = &child->saved_sigmask;
                else
                        mask = &child->blocked;

                if (copy_to_user(datavp, mask, sizeof(sigset_t)))
                        ret = -EFAULT;
                else
                        ret = 0;

                break;
        }

        case PTRACE_SETSIGMASK: {
                sigset_t new_set;

                /* @addr carries the caller's idea of sizeof(sigset_t). */
                if (addr != sizeof(sigset_t)) {
                        ret = -EINVAL;
                        break;
                }

                if (copy_from_user(&new_set, datavp, sizeof(sigset_t))) {
                        ret = -EFAULT;
                        break;
                }

                /* SIGKILL and SIGSTOP are silently dropped from the new mask. */
                sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));

                /*
                 * Every thread does recalc_sigpending() after resume, so
                 * retarget_shared_pending() and recalc_sigpending() are not
                 * called here.
                 */
                spin_lock_irq(&child->sighand->siglock);
                child->blocked = new_set;
                spin_unlock_irq(&child->sighand->siglock);

                clear_tsk_restore_sigmask(child);

                ret = 0;
                break;
        }

        case PTRACE_INTERRUPT:
                /*
                 * Stop tracee without any side-effect on signal or job
                 * control.  At least one trap is guaranteed to happen
                 * after this request.  If @child is already trapped, the
                 * current trap is not disturbed and another trap will
                 * happen after the current trap is ended with PTRACE_CONT.
                 *
                 * The actual trap might not be PTRACE_EVENT_STOP trap but
                 * the pending condition is cleared regardless.
                 */
                /* Not seized or no sighand: fall out with ret == -EIO. */
                if (unlikely(!seized || !lock_task_sighand(child, &flags)))
                        break;

                /*
                 * INTERRUPT doesn't disturb existing trap sans one
                 * exception.  If ptracer issued LISTEN for the current
                 * STOP, this INTERRUPT should clear LISTEN and re-trap
                 * tracee into STOP.
                 */
                if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
                        ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);

                unlock_task_sighand(child, &flags);
                ret = 0;
                break;

        case PTRACE_LISTEN:
                /*
                 * Listen for events.  Tracee must be in STOP.  It's not
                 * resumed per-se but is not considered to be in TRACED by
                 * wait(2) or ptrace(2).  If an async event (e.g. group
                 * stop state change) happens, tracee will enter STOP trap
                 * again.  Alternatively, ptracer can issue INTERRUPT to
                 * finish listening and re-trap tracee into STOP.
                 */
                /* Not seized or no sighand: fall out with ret == -EIO. */
                if (unlikely(!seized || !lock_task_sighand(child, &flags)))
                        break;

                /* ret stays -EIO unless the tracee is in a PTRACE_EVENT_STOP. */
                si = child->last_siginfo;
                if (likely(si && (si->si_code >> 8) == PTRACE_EVENT_STOP)) {
                        child->jobctl |= JOBCTL_LISTENING;
                        /*
                         * If NOTIFY is set, it means event happened between
                         * start of this trap and now.  Trigger re-trap.
                         */
                        if (child->jobctl & JOBCTL_TRAP_NOTIFY)
                                ptrace_signal_wake_up(child, true);
                        ret = 0;
                }
                unlock_task_sighand(child, &flags);
                break;

        case PTRACE_DETACH:         /* detach a process that was attached. */
                ret = ptrace_detach(child, data);
                break;

#ifdef CONFIG_BINFMT_ELF_FDPIC
        case PTRACE_GETFDPIC: {
                struct mm_struct *mm = get_task_mm(child);
                unsigned long tmp = 0;

                ret = -ESRCH;
                if (!mm)
                        break;

                /* An unknown selector in @addr reports 0 rather than failing. */
                switch (addr) {
                case PTRACE_GETFDPIC_EXEC:
                        tmp = mm->context.exec_fdpic_loadmap;
                        break;
                case PTRACE_GETFDPIC_INTERP:
                        tmp = mm->context.interp_fdpic_loadmap;
                        break;
                default:
                        break;
                }
                mmput(mm);

                ret = put_user(tmp, datalp);
                break;
        }
#endif

        /* All requests that let the tracee run again share one handler. */
        case PTRACE_SINGLESTEP:
#ifdef PTRACE_SINGLEBLOCK
        case PTRACE_SINGLEBLOCK:
#endif
#ifdef PTRACE_SYSEMU
        case PTRACE_SYSEMU:
        case PTRACE_SYSEMU_SINGLESTEP:
#endif
        case PTRACE_SYSCALL:
        case PTRACE_CONT:
                return ptrace_resume(child, request, data);

        case PTRACE_KILL:
                /* Always reports success; the send_sig_info() result is ignored. */
                send_sig_info(SIGKILL, SEND_SIG_NOINFO, child);
                return 0;

#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        case PTRACE_GETREGSET:
        case PTRACE_SETREGSET: {
                struct iovec kiov;
                struct iovec __user *uiov = datavp;

                /* Validate the user iovec once so the __ accessors can be used. */
                if (!access_ok(uiov, sizeof(*uiov)))
                        return -EFAULT;

                if (__get_user(kiov.iov_base, &uiov->iov_base) ||
                    __get_user(kiov.iov_len, &uiov->iov_len))
                        return -EFAULT;

                /* On success, write back the (possibly clamped) length. */
                ret = ptrace_regset(child, request, addr, &kiov);
                if (!ret)
                        ret = __put_user(kiov.iov_len, &uiov->iov_len);
                break;
        }

        case PTRACE_GET_SYSCALL_INFO:
                ret = ptrace_get_syscall_info(child, addr, datavp);
                break;
#endif

        case PTRACE_SECCOMP_GET_FILTER:
                ret = seccomp_get_filter(child, addr, datavp);
                break;

        case PTRACE_SECCOMP_GET_METADATA:
                ret = seccomp_get_metadata(child, addr, datavp);
                break;

#ifdef CONFIG_RSEQ
        case PTRACE_GET_RSEQ_CONFIGURATION:
                ret = ptrace_get_rseq_configuration(child, addr, datavp);
                break;
#endif

        case PTRACE_SET_SYSCALL_USER_DISPATCH_CONFIG:
                ret = syscall_user_dispatch_set_config(child, addr, datavp);
                break;

        case PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG:
                ret = syscall_user_dispatch_get_config(child, addr, datavp);
                break;

        default:
                /* Unknown request: ret is still -EIO. */
                break;
        }

        return ret;
}

/*
 * ptrace(2) system call entry point.
 *
 * PTRACE_TRACEME is handled without looking up a task.  For every other
 * request the task with pid @pid is looked up (-ESRCH if there is none)
 * and a reference is held for the duration of the call.  PTRACE_ATTACH and
 * PTRACE_SEIZE go to ptrace_attach(); everything else must first pass
 * ptrace_check_attach() and is then handed to arch_ptrace().
 */
SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
                unsigned long, data)
{
        struct task_struct *child;
        long ret;

        if (request == PTRACE_TRACEME)
                return ptrace_traceme();

        child = find_get_task_by_vpid(pid);
        if (!child)
                return -ESRCH;

        if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
                ret = ptrace_attach(child, request, addr, data);
        } else {
                /* KILL and INTERRUPT are allowed whatever state the tracee is in. */
                bool ignore_state = request == PTRACE_KILL ||
                                    request == PTRACE_INTERRUPT;

                ret = ptrace_check_attach(child, ignore_state);
                if (ret >= 0) {
                        ret = arch_ptrace(child, request, addr, data);
                        /* After a successful detach there is nothing to unfreeze. */
                        if (ret || request != PTRACE_DETACH)
                                ptrace_unfreeze_traced(child);
                }
        }

        put_task_struct(child);
        return ret;
}

/*
 * Read one unsigned long from address @addr in @tsk's address space and
 * store it at the user address @data.
 *
 * Returns -EIO unless the whole word could be read; otherwise the result
 * of put_user().
 */
int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
                            unsigned long data)
{
        unsigned long word;
        int nread;

        nread = ptrace_access_vm(tsk, addr, &word, sizeof(word), FOLL_FORCE);
        if (nread == sizeof(word))
                return put_user(word, (unsigned long __user *)data);

        return -EIO;
}

/*
 * Write the word @data to address @addr in @tsk's address space.
 *
 * Returns 0 if the whole word was written, -EIO otherwise.
 */
int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
                            unsigned long data)
{
        int nwritten = ptrace_access_vm(tsk, addr, &data, sizeof(data),
                                        FOLL_FORCE | FOLL_WRITE);

        if (nwritten != sizeof(data))
                return -EIO;

        return 0;
}

#if defined CONFIG_COMPAT

/*
 * Compat counterpart of ptrace_request().
 *
 * Handles the requests whose user-visible data uses compat types here
 * (compat_ulong_t words, compat siginfo, struct compat_iovec) and forwards
 * every other request to ptrace_request().
 *
 * @child:   the tracee the request operates on.
 * @request: the PTRACE_* request code.
 * @addr:    request-specific argument.
 * @data:    request-specific argument; converted with compat_ptr() when it
 *           is used as a user pointer.
 *
 * Returns the handler's result.
 */
int compat_ptrace_request(struct task_struct *child, compat_long_t request,
                          compat_ulong_t addr, compat_ulong_t data)
{
        compat_ulong_t __user *datap = compat_ptr(data);
        compat_ulong_t word;
        kernel_siginfo_t siginfo;
        int ret;

        switch (request) {
        case PTRACE_PEEKTEXT:
        case PTRACE_PEEKDATA:
                /* Transfer one compat-sized word; a short read is -EIO. */
                ret = ptrace_access_vm(child, addr, &word, sizeof(word),
                                FOLL_FORCE);
                if (ret != sizeof(word))
                        ret = -EIO;
                else
                        ret = put_user(word, datap);
                break;

        case PTRACE_POKETEXT:
        case PTRACE_POKEDATA:
                /* @data itself is the compat-sized word to write. */
                ret = ptrace_access_vm(child, addr, &data, sizeof(data),
                                FOLL_FORCE | FOLL_WRITE);
                ret = (ret != sizeof(data) ? -EIO : 0);
                break;

        case PTRACE_GETEVENTMSG:
                /* The message is narrowed to compat_ulong_t for the caller. */
                ret = put_user((compat_ulong_t) child->ptrace_message, datap);
                break;

        case PTRACE_GETSIGINFO:
                ret = ptrace_getsiginfo(child, &siginfo);
                if (!ret)
                        ret = copy_siginfo_to_user32(
                                (struct compat_siginfo __user *) datap,
                                &siginfo);
                break;

        case PTRACE_SETSIGINFO:
                ret = copy_siginfo_from_user32(
                        &siginfo, (struct compat_siginfo __user *) datap);
                if (!ret)
                        ret = ptrace_setsiginfo(child, &siginfo);
                break;
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        case PTRACE_GETREGSET:
        case PTRACE_SETREGSET:
        {
                struct iovec kiov;
                struct compat_iovec __user *uiov =
                        (struct compat_iovec __user *) datap;
                compat_uptr_t ptr;
                compat_size_t len;

                /* Validate the user iovec once so the __ accessors can be used. */
                if (!access_ok(uiov, sizeof(*uiov)))
                        return -EFAULT;

                if (__get_user(ptr, &uiov->iov_base) ||
                    __get_user(len, &uiov->iov_len))
                        return -EFAULT;

                /* Widen the compat iovec into a native one for ptrace_regset(). */
                kiov.iov_base = compat_ptr(ptr);
                kiov.iov_len = len;

                /* On success, write back the (possibly clamped) length. */
                ret = ptrace_regset(child, request, addr, &kiov);
                if (!ret)
                        ret = __put_user(kiov.iov_len, &uiov->iov_len);
                break;
        }
#endif

        default:
                /* Nothing compat-specific: use the native handler. */
                ret = ptrace_request(child, request, addr, data);
        }

        return ret;
}

/*
 * Compat ptrace(2) system call entry point.
 *
 * PTRACE_TRACEME is handled without looking up a task.  For every other
 * request the task with pid @pid is looked up (-ESRCH if there is none)
 * and a reference is held for the duration of the call.  PTRACE_ATTACH and
 * PTRACE_SEIZE go to ptrace_attach(); everything else is handed to
 * compat_arch_ptrace() once ptrace_check_attach() has returned 0.
 */
COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid,
                       compat_long_t, addr, compat_long_t, data)
{
        struct task_struct *child;
        long ret;

        if (request == PTRACE_TRACEME)
                return ptrace_traceme();

        child = find_get_task_by_vpid(pid);
        if (!child)
                return -ESRCH;

        if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
                ret = ptrace_attach(child, request, addr, data);
                goto out_put_task_struct;
        }

        /* KILL and INTERRUPT are allowed whatever state the tracee is in. */
        ret = ptrace_check_attach(child, request == PTRACE_KILL ||
                                  request == PTRACE_INTERRUPT);
        if (ret)
                goto out_put_task_struct;

        ret = compat_arch_ptrace(child, request, addr, data);
        /* After a successful detach there is nothing to unfreeze. */
        if (ret || request != PTRACE_DETACH)
                ptrace_unfreeze_traced(child);

 out_put_task_struct:
        put_task_struct(child);
        return ret;
}
#endif        /* CONFIG_COMPAT */





































































































































































































































































































































































































































    3 





    3 





















    2 





















































































































































































































































































































































































    3 
    4 







    6 


    5 












    3 


    3 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   23 


   22 






























   22 



   25 
   23 














   25 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2002 Richard Henderson
 * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
 * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
 */

#define INCLUDE_VERMAGIC

#include <linux/export.h>
#include <linux/extable.h>
#include <linux/moduleloader.h>
#include <linux/module_signature.h>
#include <linux/trace_events.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
#include <linux/buildid.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
#include <linux/kstrtox.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/elf.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/rcupdate.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/moduleparam.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/vermagic.h>
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/device.h>
#include <linux/string.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <linux/set_memory.h>
#include <asm/mmu_context.h>
#include <linux/license.h>
#include <asm/sections.h>
#include <linux/tracepoint.h>
#include <linux/ftrace.h>
#include <linux/livepatch.h>
#include <linux/async.h>
#include <linux/percpu.h>
#include <linux/kmemleak.h>
#include <linux/jump_label.h>
#include <linux/pfn.h>
#include <linux/bsearch.h>
#include <linux/dynamic_debug.h>
#include <linux/audit.h>
#include <linux/cfi.h>
#include <linux/codetag.h>
#include <linux/debugfs.h>
#include <linux/execmem.h>
#include <uapi/linux/module.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/module.h>

/*
 * Mutex protects:
 * 1) List of modules (also safely readable with preempt_disable),
 * 2) module_use links,
 * 3) mod_tree.addr_min/mod_tree.addr_max.
 * (delete and add use RCU list operations).
 */
DEFINE_MUTEX(module_mutex);
/* Head of the module list; entries are struct module, chained through ->list. */
LIST_HEAD(modules);

/*
 * Deferred freeing of init sections in the success case.
 * init_free_wq is the work item and do_free_init() is its handler.
 * init_free_list is the lock-less list declared alongside it; presumably it
 * queues the pending frees -- its users are outside this part of the file.
 */
static void do_free_init(struct work_struct *w);
static DECLARE_WORK(init_free_wq, do_free_init);
static LLIST_HEAD(init_free_list);

/*
 * Global address bounds of module memory, maintained by
 * __mod_update_bounds().
 *
 * The *_min members start at the largest unsigned long so that the first
 * "min < tree->..._min" comparison always lowers them.  A zero-initialized
 * minimum could never be lowered, which would leave that bound pinned at 0.
 * The *_max members rely on zero-initialization for the mirror-image reason.
 */
struct mod_tree_root mod_tree __cacheline_aligned = {
        .addr_min = -1UL,
#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
        .data_addr_min = -1UL,
#endif
};

/*
 * One exported-symbol table as consumed by find_exported_symbol_in_section().
 */
struct symsearch {
        /* Half-open range [start, stop) of entries; looked up with bsearch(). */
        const struct kernel_symbol *start, *stop;
        /* CRC array indexed like the symbol range (may be NULL). */
        const s32 *crcs;
        /* License class of every symbol in this table. */
        enum mod_license license;
};

/*
 * Widen the recorded module address bounds in @tree so that they cover
 * [base, base + size).  The bounds exist to speed up __module_address and
 * are protected by module_mutex.
 *
 * With CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC, core data regions update
 * the separate data_addr_min/data_addr_max pair instead of addr_min/addr_max.
 */
static void __mod_update_bounds(enum mod_mem_type type __maybe_unused, void *base,
                                unsigned int size, struct mod_tree_root *tree)
{
        unsigned long start = (unsigned long)base;
        unsigned long end = start + size;
        unsigned long *lo = &tree->addr_min;
        unsigned long *hi = &tree->addr_max;

#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
        if (mod_mem_type_is_core_data(type)) {
                lo = &tree->data_addr_min;
                hi = &tree->data_addr_max;
        }
#endif
        /* Only store when the bound actually moves. */
        if (start < *lo)
                *lo = start;
        if (end > *hi)
                *hi = end;
}

/* Fold every non-empty memory region of @mod into the global mod_tree bounds. */
static void mod_update_bounds(struct module *mod)
{
        for_each_mod_mem_type(type) {
                struct module_memory *region = &mod->mem[type];

                /* Empty regions contribute nothing to the bounds. */
                if (!region->size)
                        continue;

                __mod_update_bounds(type, region->base, region->size, &mod_tree);
        }
}

/*
 * Block module loading/unloading?  Exposed as the "nomodule" core parameter
 * (type bint, permission bits 0).
 */
int modules_disabled;
core_param(nomodule, modules_disabled, bint, 0);

/* Waiting for a module to finish initializing? */
static DECLARE_WAIT_QUEUE_HEAD(module_wq);

/* Notifier chain managed by register/unregister_module_notifier(). */
static BLOCKING_NOTIFIER_HEAD(module_notify_list);

/**
 * register_module_notifier() - add a callback to the module notifier chain
 * @nb: notifier block to add to module_notify_list
 *
 * Return: the result of blocking_notifier_chain_register().
 */
int register_module_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&module_notify_list, nb);
}
EXPORT_SYMBOL(register_module_notifier);

/**
 * unregister_module_notifier() - remove a callback from the module notifier chain
 * @nb: notifier block to remove from module_notify_list
 *
 * Return: the result of blocking_notifier_chain_unregister().
 */
int unregister_module_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&module_notify_list, nb);
}
EXPORT_SYMBOL(unregister_module_notifier);

/*
 * Stricter variant of try_module_get().
 *
 * Returns 0 when a reference was taken, -EBUSY when @mod is still in
 * MODULE_STATE_COMING, and -ENOENT when try_module_get() refuses.
 * Calling this on a MODULE_STATE_UNFORMED module is a bug.
 */
static inline int strong_try_module_get(struct module *mod)
{
        BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);

        if (mod && mod->state == MODULE_STATE_COMING)
                return -EBUSY;

        return try_module_get(mod) ? 0 : -ENOENT;
}

/*
 * Record a taint twice: globally through add_taint(), and on the module
 * itself by setting bit @flag in mod->taints.
 */
static inline void add_taint_module(struct module *mod, unsigned int flag,
                                    enum lockdep_ok lockdep_ok)
{
        add_taint(flag, lockdep_ok);
        set_bit(flag, &mod->taints);
}

/*
 * A thread that wants to hold a reference to a module only while it
 * is running can call this to safely exit.
 *
 * @mod:  module whose reference is dropped with module_put()
 * @code: value handed to kthread_exit()
 *
 * Does not return (declared __noreturn; ends in kthread_exit()).
 */
void __noreturn __module_put_and_kthread_exit(struct module *mod, long code)
{
        module_put(mod);
        kthread_exit(code);
}
EXPORT_SYMBOL(__module_put_and_kthread_exit);

/*
 * Look up an allocated (SHF_ALLOC) section by name.
 * Returns the section index, or 0 when no such section exists.
 */
static unsigned int find_sec(const struct load_info *info, const char *name)
{
        unsigned int idx;

        /* Index 0 is never a match; it doubles as the "not found" value. */
        for (idx = 1; idx < info->hdr->e_shnum; idx++) {
                Elf_Shdr *sec = &info->sechdrs[idx];

                /* Sections without the alloc bit are deliberately ignored. */
                if (!(sec->sh_flags & SHF_ALLOC))
                        continue;
                if (!strcmp(info->secstrings + sec->sh_name, name))
                        return idx;
        }

        return 0;
}

/*
 * Return the address (sh_addr) of the allocated section called @name,
 * or NULL if there is none.
 */
static void *section_addr(const struct load_info *info, const char *name)
{
        /* A failed lookup yields section 0, whose sh_addr is 0. */
        unsigned int sec = find_sec(info, name);

        return (void *)info->sechdrs[sec].sh_addr;
}

/*
 * Return the address of the allocated section called @name (NULL if it is
 * missing) and store in @num how many @object_size sized objects it holds.
 */
static void *section_objs(const struct load_info *info,
                          const char *name,
                          size_t object_size,
                          unsigned int *num)
{
        /* A failed lookup yields section 0: sh_addr 0 and sh_size 0. */
        const Elf_Shdr *shdr = &info->sechdrs[find_sec(info, name)];

        *num = shdr->sh_size / object_size;
        return (void *)shdr->sh_addr;
}

/*
 * Look up a section by name regardless of its SHF_ALLOC flag.
 * Returns the section index, or 0 when no such section exists.
 */
static unsigned int find_any_sec(const struct load_info *info, const char *name)
{
        unsigned int idx;

        for (idx = 1; idx < info->hdr->e_shnum; idx++) {
                const char *secname = info->secstrings + info->sechdrs[idx].sh_name;

                if (!strcmp(secname, name))
                        return idx;
        }

        return 0;
}

/*
 * Like section_objs(), but the section is looked up with find_any_sec(),
 * i.e. without requiring SHF_ALLOC.  Returns the section address (NULL if
 * missing) and stores the object count in @num.
 */
static __maybe_unused void *any_section_objs(const struct load_info *info,
                                             const char *name,
                                             size_t object_size,
                                             unsigned int *num)
{
        /* A failed lookup yields section 0: sh_addr 0 and sh_size 0. */
        const Elf_Shdr *shdr = &info->sechdrs[find_any_sec(info, name)];

        *num = shdr->sh_size / object_size;
        return (void *)shdr->sh_addr;
}

/*
 * symversion(base, idx): address of entry @idx in the CRC array @base.
 * Evaluates to NULL when @base is NULL or when CONFIG_MODVERSIONS is off.
 * Every use of an argument is parenthesized so that expression arguments
 * (e.g. a conditional expression) expand with the intended precedence.
 */
#ifndef CONFIG_MODVERSIONS
#define symversion(base, idx) NULL
#else
#define symversion(base, idx) (((base) != NULL) ? ((base) + (idx)) : NULL)
#endif

/*
 * Return the name string of an exported symbol.  With
 * CONFIG_HAVE_ARCH_PREL32_RELOCATIONS the entry stores an offset that
 * offset_to_ptr() converts to a pointer; otherwise it stores the pointer.
 */
static const char *kernel_symbol_name(const struct kernel_symbol *sym)
{
#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
        return offset_to_ptr(&sym->name_offset);
#else
        return sym->name;
#endif
}

/*
 * Return the namespace string of an exported symbol.  In the
 * CONFIG_HAVE_ARCH_PREL32_RELOCATIONS layout the result is NULL when the
 * stored offset is zero.
 */
static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)
{
#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
        /* An offset of zero encodes "no namespace". */
        return sym->namespace_offset ?
                offset_to_ptr(&sym->namespace_offset) : NULL;
#else
        return sym->namespace;
#endif
}

/*
 * bsearch() comparison callback: @name is the wanted symbol name (the key),
 * @sym is a struct kernel_symbol table entry.  Returns the strcmp() ordering
 * of the key against the entry's name.
 */
int cmp_name(const void *name, const void *sym)
{
        const char *wanted = name;
        const struct kernel_symbol *entry = sym;

        return strcmp(wanted, kernel_symbol_name(entry));
}

/*
 * Search one symbol table for fsa->name.
 *
 * GPL-only tables are skipped unless fsa->gplok is set.  On a hit the
 * result fields of @fsa (owner, crc, sym, license) are filled in and true
 * is returned; on a miss @fsa is left untouched and false is returned.
 */
static bool find_exported_symbol_in_section(const struct symsearch *syms,
                                            struct module *owner,
                                            struct find_symbol_arg *fsa)
{
        struct kernel_symbol *hit;

        if (syms->license == GPL_ONLY && !fsa->gplok)
                return false;

        hit = bsearch(fsa->name, syms->start, syms->stop - syms->start,
                      sizeof(*hit), cmp_name);
        if (hit == NULL)
                return false;

        fsa->sym = hit;
        fsa->owner = owner;
        fsa->license = syms->license;
        /* The CRC array is indexed by the symbol's position in the table. */
        fsa->crc = symversion(syms->crcs, hit - syms->start);

        return true;
}

/*
 * Look up the exported symbol named fsa->name.
 *
 * The tables delimited by the __start/__stop ksymtab linker symbols are
 * searched first (owner NULL), then the tables of every module on the
 * modules list that is past MODULE_STATE_UNFORMED.  On success the result
 * fields of @fsa are filled in and true is returned.
 *
 * Needs preempt disabled or module_mutex.
 */
bool find_symbol(struct find_symbol_arg *fsa)
{
        static const struct symsearch builtin[] = {
                {
                        .start = __start___ksymtab,
                        .stop = __stop___ksymtab,
                        .crcs = __start___kcrctab,
                        .license = NOT_GPL_ONLY,
                },
                {
                        .start = __start___ksymtab_gpl,
                        .stop = __stop___ksymtab_gpl,
                        .crcs = __start___kcrctab_gpl,
                        .license = GPL_ONLY,
                },
        };
        struct module *mod;
        unsigned int i;

        module_assert_mutex_or_preempt();

        for (i = 0; i < ARRAY_SIZE(builtin); i++) {
                if (find_exported_symbol_in_section(&builtin[i], NULL, fsa))
                        return true;
        }

        list_for_each_entry_rcu(mod, &modules, list,
                                lockdep_is_held(&module_mutex)) {
                struct symsearch tables[] = {
                        {
                                .start = mod->syms,
                                .stop = mod->syms + mod->num_syms,
                                .crcs = mod->crcs,
                                .license = NOT_GPL_ONLY,
                        },
                        {
                                .start = mod->gpl_syms,
                                .stop = mod->gpl_syms + mod->num_gpl_syms,
                                .crcs = mod->gpl_crcs,
                                .license = GPL_ONLY,
                        },
                };

                /* Modules that are not formed yet export nothing. */
                if (mod->state == MODULE_STATE_UNFORMED)
                        continue;

                for (i = 0; i < ARRAY_SIZE(tables); i++) {
                        if (find_exported_symbol_in_section(&tables[i], mod, fsa))
                                return true;
                }
        }

        pr_debug("Failed to find symbol %s\n", fsa->name);
        return false;
}

/*
 * Find a module by name, where @name need not be NUL-terminated: exactly
 * @len bytes are compared and the module's name must be @len long.
 * MODULE_STATE_UNFORMED modules are only considered when @even_unformed.
 * Returns the module, or NULL if there is no match.
 *
 * Must hold module_mutex (or have preemption disabled for read-only access).
 */
struct module *find_module_all(const char *name, size_t len,
                               bool even_unformed)
{
        struct module *mod;

        module_assert_mutex_or_preempt();

        list_for_each_entry_rcu(mod, &modules, list,
                                lockdep_is_held(&module_mutex)) {
                if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
                        continue;
                if (strlen(mod->name) != len)
                        continue;
                if (memcmp(mod->name, name, len) == 0)
                        return mod;
        }

        return NULL;
}

/*
 * Find a module by NUL-terminated @name.  Passes even_unformed == false, so
 * MODULE_STATE_UNFORMED modules are never returned.  Locking requirements
 * are those of find_module_all().
 */
struct module *find_module(const char *name)
{
        return find_module_all(name, strlen(name), false);
}

#ifdef CONFIG_SMP

/* SMP: accessor for the module's per-cpu base pointer (mod->percpu). */
static inline void __percpu *mod_percpu(struct module *mod)
{
        return mod->percpu;
}

/*
 * SMP: allocate the per-cpu area for the module's per-cpu section.
 *
 * The section is taken from info->index.pcpu.  An empty section needs no
 * allocation.  Alignment requests above PAGE_SIZE are warned about and
 * clamped to PAGE_SIZE.  On success mod->percpu and mod->percpu_size are set.
 *
 * Returns 0 on success (including "nothing to do"), -ENOMEM if the
 * allocation fails.
 */
static int percpu_modalloc(struct module *mod, struct load_info *info)
{
        Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
        unsigned long align = pcpusec->sh_addralign;

        if (!pcpusec->sh_size)
                return 0;

        if (align > PAGE_SIZE) {
                /* align is an unsigned long: use the unsigned conversion. */
                pr_warn("%s: per-cpu alignment %lu > %lu\n",
                        mod->name, align, PAGE_SIZE);
                align = PAGE_SIZE;
        }

        mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
        if (!mod->percpu) {
                pr_warn("%s: Could not allocate %lu bytes percpu data\n",
                        mod->name, (unsigned long)pcpusec->sh_size);
                return -ENOMEM;
        }
        mod->percpu_size = pcpusec->sh_size;
        return 0;
}

/* SMP: release the module's per-cpu area (mod->percpu) with free_percpu(). */
static void percpu_modfree(struct module *mod)
{
        free_percpu(mod->percpu);
}

/*
 * SMP: return the index of the allocated ".data..percpu" section, or 0 when
 * find_sec() does not find one.
 */
static unsigned int find_pcpusec(struct load_info *info)
{
        return find_sec(info, ".data..percpu");
}

static void percpu_modcopy(struct module *mod,
                           const void *from, unsigned long size)
{
        int cpu;

        for_each_possible_cpu(cpu)
                memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
}

/*
 * SMP: test whether @addr lies inside any module's per-cpu area, checking
 * the copy of every possible CPU.  Modules in MODULE_STATE_UNFORMED or
 * without per-cpu data are skipped.
 *
 * When @can_addr is non-NULL and a match is found, it receives the matching
 * offset rebased onto the boot CPU's copy of that module's per-cpu area.
 *
 * The module list is walked with preemption disabled.
 * Returns true on a match, false otherwise.
 */
bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
{
        struct module *mod;
        unsigned int cpu;
        bool found = false;

        preempt_disable();

        list_for_each_entry_rcu(mod, &modules, list) {
                if (mod->state == MODULE_STATE_UNFORMED || !mod->percpu_size)
                        continue;

                for_each_possible_cpu(cpu) {
                        void *start = per_cpu_ptr(mod->percpu, cpu);
                        void *va = (void *)addr;

                        if (va < start || va >= start + mod->percpu_size)
                                continue;

                        if (can_addr) {
                                unsigned long off = (unsigned long) (va - start);
                                void *boot_base = per_cpu_ptr(mod->percpu,
                                                              get_boot_cpu_id());

                                *can_addr = off + (unsigned long)boot_base;
                        }
                        found = true;
                        goto out;
                }
        }

out:
        preempt_enable();
        return found;
}

/**
 * is_module_percpu_address() - test whether address is from module static percpu
 * @addr: address to test
 *
 * Test whether @addr belongs to module static percpu area.  This is
 * __is_module_percpu_address() called without a canonical-address output.
 *
 * Return: %true if @addr is from module static percpu area
 */
bool is_module_percpu_address(unsigned long addr)
{
        return __is_module_percpu_address(addr, NULL);
}

#else /* ... !CONFIG_SMP */

/* !CONFIG_SMP: there is no module per-cpu area, so this is always NULL. */
static inline void __percpu *mod_percpu(struct module *mod)
{
        return NULL;
}
static int percpu_modalloc(struct module *mod, struct load_info *info)
{
        /* UP modules shouldn't have this section: ENOMEM isn't quite right */
        if (info->sechdrs[info->index.pcpu].sh_size != 0)
                return -ENOMEM;
        return 0;
}
/* !CONFIG_SMP: nothing was allocated, so there is nothing to free. */
static inline void percpu_modfree(struct module *mod)
{
}
/* !CONFIG_SMP: no per-cpu section is looked up; always reports index 0. */
static unsigned int find_pcpusec(struct load_info *info)
{
        return 0;
}
/*
 * !CONFIG_SMP: nothing to copy.  Any non-zero @size is treated as a bug.
 */
static inline void percpu_modcopy(struct module *mod,
                                  const void *from, unsigned long size)
{
        /* pcpusec should be 0, and size of that section should be 0. */
        BUG_ON(size != 0);
}
/* !CONFIG_SMP: no module per-cpu areas exist, so no address can match. */
bool is_module_percpu_address(unsigned long addr)
{
        return false;
}

/*
 * !CONFIG_SMP: no module per-cpu areas exist, so no address can match.
 * @can_addr is never written.
 */
bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
{
        return false;
}

#endif /* CONFIG_SMP */

/*
 * MODINFO_ATTR(field) - generate a read-only (0444) module attribute named
 * after @field, backed by the string member mod->field.
 *
 * It defines, for the given field:
 *   setup_modinfo_<field>()   - store a kstrdup() copy of the string in
 *                               mod->field (the result is not checked here,
 *                               so a failed allocation leaves it NULL);
 *   show_modinfo_<field>()    - print the string plus a newline into the
 *                               PAGE_SIZE-bounded output buffer;
 *   modinfo_<field>_exists()  - non-zero when mod->field is set;
 *   free_modinfo_<field>()    - kfree() the copy and reset the member to NULL;
 * and the struct module_attribute modinfo_<field> tying them together.
 */
#define MODINFO_ATTR(field)        \
static void setup_modinfo_##field(struct module *mod, const char *s)  \
{                                                                     \
        mod->field = kstrdup(s, GFP_KERNEL);                          \
}                                                                     \
static ssize_t show_modinfo_##field(struct module_attribute *mattr,   \
                        struct module_kobject *mk, char *buffer)      \
{                                                                     \
        return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field);  \
}                                                                     \
static int modinfo_##field##_exists(struct module *mod)               \
{                                                                     \
        return mod->field != NULL;                                    \
}                                                                     \
static void free_modinfo_##field(struct module *mod)                  \
{                                                                     \
        kfree(mod->field);                                            \
        mod->field = NULL;                                            \
}                                                                     \
static struct module_attribute modinfo_##field = {                    \
        .attr = { .name = __stringify(field), .mode = 0444 },         \
        .show = show_modinfo_##field,                                 \
        .setup = setup_modinfo_##field,                               \
        .test = modinfo_##field##_exists,                             \
        .free = free_modinfo_##field,                                 \
};

MODINFO_ATTR(version);
MODINFO_ATTR(srcversion);

/*
 * Name and taint-flag string of the most recently unloaded module, kept for
 * diagnostic purposes.  Filled in by the delete_module() syscall just
 * before the module is freed.
 */
static struct {
        char name[MODULE_NAME_LEN + 1];
        char taints[MODULE_FLAGS_BUF_SIZE];
} last_unloaded_module;

#ifdef CONFIG_MODULE_UNLOAD

EXPORT_TRACEPOINT_SYMBOL(module_get);

/*
 * MODULE_REF_BASE is the base reference count held by the module loader
 * itself; module_refcount() subtracts it before reporting a count.
 */
#define MODULE_REF_BASE        1

/*
 * Set up the unload bookkeeping of a freshly loaded module: empty usage
 * lists and a reference count of MODULE_REF_BASE plus one extra reference
 * that is held for the duration of initialization.  Always returns 0.
 */
static int module_unload_init(struct module *mod)
{
        /* Nobody uses us and we use nobody yet. */
        INIT_LIST_HEAD(&mod->target_list);
        INIT_LIST_HEAD(&mod->source_list);

        /*
         * Start from MODULE_REF_BASE (refcnt == 0 means module is going),
         * then pin the module while it initializes.
         */
        atomic_set(&mod->refcnt, MODULE_REF_BASE);
        atomic_inc(&mod->refcnt);

        return 0;
}

/*
 * Does a already use b?
 *
 * Walks b's source_list looking for an entry whose source is @a.
 * Returns 1 if one exists, 0 (after a debug message) otherwise.
 */
static int already_uses(struct module *a, struct module *b)
{
        struct module_use *entry;
        int found = 0;

        list_for_each_entry(entry, &b->source_list, source_list) {
                if (entry->source == a) {
                        found = 1;
                        break;
                }
        }

        if (!found)
                pr_debug("%s does not use %s!\n", a->name, b->name);
        return found;
}

/*
 * Module a uses b
 *  - we add 'a' as a "source", 'b' as a "target" of module use
 *  - the module_use is added to the list of 'b' sources (so
 *    'b' can walk the list to see who sourced them), and of 'a'
 *    targets (so 'a' can see what modules it targets).
 */
static int add_module_usage(struct module *a, struct module *b)
{
        struct module_use *use;

        pr_debug("Allocating new usage for %s.\n", a->name);
        use = kmalloc(sizeof(*use), GFP_ATOMIC);
        if (!use)
                return -ENOMEM;

        use->source = a;
        use->target = b;
        list_add(&use->source_list, &b->source_list);
        list_add(&use->target_list, &a->target_list);
        return 0;
}

/*
 * Module a uses b: caller must hold module_mutex.
 *
 * Takes a reference on @b and records the dependency.  A NULL @b or an
 * already recorded usage is a no-op.  Returns 0 on success, otherwise the
 * error from strong_try_module_get() or add_module_usage(); in the latter
 * case the reference just taken is dropped again.
 */
static int ref_module(struct module *a, struct module *b)
{
        int ret;

        if (!b)
                return 0;
        if (already_uses(a, b))
                return 0;

        /* If module isn't available, we fail. */
        ret = strong_try_module_get(b);
        if (ret != 0)
                return ret;

        ret = add_module_usage(a, b);
        if (ret != 0)
                module_put(b);

        return ret;
}

/* Clear the unload stuff of the module. */
static void module_unload_free(struct module *mod)
{
        struct module_use *use, *tmp;

        mutex_lock(&module_mutex);
        list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
                struct module *i = use->target;
                pr_debug("%s unusing %s\n", mod->name, i->name);
                module_put(i);
                list_del(&use->source_list);
                list_del(&use->target_list);
                kfree(use);
        }
        mutex_unlock(&module_mutex);
}

#ifdef CONFIG_MODULE_FORCE_UNLOAD
/*
 * Forced unload is requested through O_TRUNC in @flags.  When it is set the
 * kernel is tainted with TAINT_FORCED_RMMOD and the (non-zero) flag bit is
 * returned; otherwise 0 is returned and nothing else happens.
 */
static inline int try_force_unload(unsigned int flags)
{
        int forced = flags & O_TRUNC;

        if (!forced)
                return 0;

        add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
        return forced;
}
#else
/* !CONFIG_MODULE_FORCE_UNLOAD variant: forcing is never permitted. */
static inline int try_force_unload(unsigned int flags)
{
        return 0;
}
#endif /* CONFIG_MODULE_FORCE_UNLOAD */

/*
 * Try to release refcount of module, 0 means success.
 *
 * Drops the MODULE_REF_BASE reference set at load time.  If the count does
 * not reach zero, the base reference is added back via atomic_add_unless()
 * with 0 as the excluded value (i.e. only if the count has not meanwhile
 * dropped to zero), and that call's result becomes the return value.
 */
static int try_release_module_ref(struct module *mod)
{
        int ret;

        /* Try to decrement refcnt which we set at loading */
        ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
        /* A negative count means the base reference was already gone. */
        BUG_ON(ret < 0);
        if (ret)
                /* Someone can put this right now, recover with checking */
                ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);

        return ret;
}

/*
 * Move @mod to MODULE_STATE_GOING if it can be stopped.
 *
 * When the base reference cannot be released, *@forced is set from
 * try_force_unload(@flags); without forcing the call fails with
 * -EWOULDBLOCK and the state is left untouched.  Returns 0 on success.
 */
static int try_stop_module(struct module *mod, int flags, int *forced)
{
        /* If it's not unused, quit unless we're forcing. */
        if (try_release_module_ref(mod) != 0) {
                int force = try_force_unload(flags);

                *forced = force;
                if (force == 0)
                        return -EWOULDBLOCK;
        }

        /* Mark it as dying. */
        mod->state = MODULE_STATE_GOING;
        return 0;
}

/**
 * module_refcount() - return the refcount or -1 if unloading
 * @mod:        the module we're checking
 *
 * The value is the raw reference counter minus MODULE_REF_BASE, so a raw
 * counter of zero (module is going) is reported as -1.
 *
 * Return:
 *        -1 if the module is in the process of unloading
 *        otherwise the number of references in the kernel to the module
 */
int module_refcount(struct module *mod)
{
        return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
}
EXPORT_SYMBOL(module_refcount);

/* This exists whether we can unload or not */
static void free_module(struct module *mod);

/*
 * delete_module(2) - unload the module named by @name_user.
 *
 * @flags: O_TRUNC requests a forced unload (honoured only when
 *         try_force_unload() allows it).
 *
 * Returns 0 on success, or:
 *   -EPERM        caller lacks CAP_SYS_MODULE or modules are disabled
 *   -EFAULT       the name could not be copied from user space
 *   -ENOENT       no such module (including names too long to be one)
 *   -EINTR        interrupted while waiting for module_mutex
 *   -EWOULDBLOCK  other modules depend on it, or it is still in use
 *   -EBUSY        module is not live, or has init but no exit function
 */
SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
                unsigned int, flags)
{
        struct module *mod;
        char name[MODULE_NAME_LEN];
        char buf[MODULE_FLAGS_BUF_SIZE];
        int ret, forced = 0;
        long len;

        if (!capable(CAP_SYS_MODULE) || modules_disabled)
                return -EPERM;

        /*
         * Copy with the full buffer size so that an over-long name can be
         * detected instead of being silently truncated (a truncated name
         * could match, and unload, a different module).
         */
        len = strncpy_from_user(name, name_user, MODULE_NAME_LEN);
        if (len < 0)
                return -EFAULT;
        /* No NUL within the buffer: too long to name any module. */
        if (len == MODULE_NAME_LEN)
                return -ENOENT;
        /* len < MODULE_NAME_LEN: the terminating NUL was copied as well. */

        audit_log_kern_module(name);

        if (mutex_lock_interruptible(&module_mutex) != 0)
                return -EINTR;

        mod = find_module(name);
        if (!mod) {
                ret = -ENOENT;
                goto out;
        }

        if (!list_empty(&mod->source_list)) {
                /* Other modules depend on us: get rid of them first. */
                ret = -EWOULDBLOCK;
                goto out;
        }

        /* Doing init or already dying? */
        if (mod->state != MODULE_STATE_LIVE) {
                /* FIXME: if (force), slam module count damn the torpedoes */
                pr_debug("%s already dying\n", mod->name);
                ret = -EBUSY;
                goto out;
        }

        /* If it has an init func, it must have an exit func to unload */
        if (mod->init && !mod->exit) {
                forced = try_force_unload(flags);
                if (!forced) {
                        /* This module can't be removed */
                        ret = -EBUSY;
                        goto out;
                }
        }

        ret = try_stop_module(mod, flags, &forced);
        if (ret != 0)
                goto out;

        /* The module is now marked GOING; the rest runs without the mutex. */
        mutex_unlock(&module_mutex);
        /* Final destruction now no one is using it. */
        if (mod->exit != NULL)
                mod->exit();
        blocking_notifier_call_chain(&module_notify_list,
                                     MODULE_STATE_GOING, mod);
        klp_module_going(mod);
        ftrace_release_mod(mod);

        async_synchronize_full();

        /* Store the name and taints of the last unloaded module for diagnostic purposes */
        strscpy(last_unloaded_module.name, mod->name, sizeof(last_unloaded_module.name));
        strscpy(last_unloaded_module.taints, module_flags(mod, buf, false), sizeof(last_unloaded_module.taints));

        free_module(mod);
        /* someone could wait for the module in add_unformed_module() */
        wake_up_all(&module_wq);
        return 0;
out:
        mutex_unlock(&module_mutex);
        return ret;
}

void __symbol_put(const char *symbol)
{
        struct find_symbol_arg fsa = {
                .name        = symbol,
                .gplok        = true,
        };

        preempt_disable();
        BUG_ON(!find_symbol(&fsa));
        module_put(fsa.owner);
        preempt_enable();
}
EXPORT_SYMBOL(__symbol_put);

/*
 * Drop a reference on the module whose text contains @addr.
 *
 * Note this assumes addr is a function, which it currently always is.
 * Addresses in core kernel text are ignored; any other address must belong
 * to a module's text, otherwise this is a BUG.
 */
void symbol_put_addr(void *addr)
{
        unsigned long text = (unsigned long)dereference_function_descriptor(addr);
        struct module *owner;

        if (core_kernel_text(text))
                return;

        /*
         * Even though we hold a reference on the module; we still need to
         * disable preemption in order to safely traverse the data structure.
         */
        preempt_disable();
        owner = __module_text_address(text);
        BUG_ON(!owner);
        module_put(owner);
        preempt_enable();
}
EXPORT_SYMBOL_GPL(symbol_put_addr);

/*
 * "refcnt" attribute: prints module_refcount() of the module as a signed
 * decimal followed by a newline.  Returns the number of bytes written.
 */
static ssize_t show_refcnt(struct module_attribute *mattr,
                           struct module_kobject *mk, char *buffer)
{
        return sprintf(buffer, "%i\n", module_refcount(mk->mod));
}

/* Read-only (0444) attribute; no store handler. */
static struct module_attribute modinfo_refcnt =
        __ATTR(refcnt, 0444, show_refcnt, NULL);

/*
 * Unconditionally take a reference on @module and emit the module_get
 * tracepoint.  A NULL @module is silently ignored.
 */
void __module_get(struct module *module)
{
        if (!module)
                return;

        atomic_inc(&module->refcnt);
        trace_module_get(module, _RET_IP_);
}
EXPORT_SYMBOL(__module_get);

/*
 * Try to take a reference on @module.
 *
 * Succeeds trivially for a NULL @module.  Otherwise the module must be
 * live and its reference count must be non-zero for the increment to
 * happen; on success the module_get tracepoint fires.
 *
 * Returns true if the reference was obtained (or @module is NULL).
 */
bool try_module_get(struct module *module)
{
        if (!module)
                return true;

        /* Note: here, we can fail to get a reference */
        if (unlikely(!module_is_live(module) ||
                     !atomic_inc_not_zero(&module->refcnt)))
                return false;

        trace_module_get(module, _RET_IP_);
        return true;
}
EXPORT_SYMBOL(try_module_get);

/*
 * Drop a reference on @module (NULL is ignored).  The counter is only
 * decremented while positive; a negative result triggers a warning.  The
 * module_put tracepoint fires in either case.
 */
void module_put(struct module *module)
{
        int remaining;

        if (!module)
                return;

        remaining = atomic_dec_if_positive(&module->refcnt);
        WARN_ON(remaining < 0);        /* Failed to put refcount */
        trace_module_put(module, _RET_IP_);
}
EXPORT_SYMBOL(module_put);

#else /* !CONFIG_MODULE_UNLOAD */
/* !CONFIG_MODULE_UNLOAD variant: no usage records are kept, nothing to free. */
static inline void module_unload_free(struct module *mod)
{
}

/*
 * !CONFIG_MODULE_UNLOAD variant: no usage lists are maintained, so this only
 * forwards to strong_try_module_get(@b) and returns its result.
 */
static int ref_module(struct module *a, struct module *b)
{
        return strong_try_module_get(b);
}

/* !CONFIG_MODULE_UNLOAD variant: nothing to initialize; always returns 0. */
static inline int module_unload_init(struct module *mod)
{
        return 0;
}
#endif /* CONFIG_MODULE_UNLOAD */

/*
 * Write one character into @buf for every taint bit set in @taints whose
 * taint_flags[] entry is marked as applying to modules.
 *
 * No NUL terminator is written.  Returns the number of characters stored.
 */
size_t module_flags_taint(unsigned long taints, char *buf)
{
        size_t len = 0;
        int bit;

        for (bit = 0; bit < TAINT_FLAGS_COUNT; bit++) {
                if (!taint_flags[bit].module)
                        continue;
                if (!test_bit(bit, &taints))
                        continue;

                buf[len] = taint_flags[bit].c_true;
                len++;
        }

        return len;
}

static ssize_t show_initstate(struct module_attribute *mattr,
                              struct module_kobject *mk, char *buffer)
{
        const char *state = "unknown";

        switch (mk->mod->state) {
        case MODULE_STATE_LIVE:
                state = "live";
                break;
        case MODULE_STATE_COMING:
                state = "coming";
                break;
        case MODULE_STATE_GOING:
                state = "going";
                break;
        default:
                BUG();
        }
        return sprintf(buffer, "%s\n", state);
}

static struct module_attribute modinfo_initstate =
        __ATTR(initstate, 0444, show_initstate, NULL);

/*
 * "uevent" attribute (write-only): hands the written text to
 * kobject_synth_uevent() for the module's kobject.  Returns that call's
 * error if it fails, otherwise @count.
 */
static ssize_t store_uevent(struct module_attribute *mattr,
                            struct module_kobject *mk,
                            const char *buffer, size_t count)
{
        int err = kobject_synth_uevent(&mk->kobj, buffer, count);

        if (err)
                return err;
        return count;
}

struct module_attribute module_uevent =
        __ATTR(uevent, 0200, NULL, store_uevent);

static ssize_t show_coresize(struct module_attribute *mattr,
                             struct module_kobject *mk, char *buffer)
{
        unsigned int size = mk->mod->mem[MOD_TEXT].size;

        if (!IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC)) {
                for_class_mod_mem_type(type, core_data)
                        size += mk->mod->mem[type].size;
        }
        return sprintf(buffer, "%u\n", size);
}

static struct module_attribute modinfo_coresize =
        __ATTR(coresize, 0444, show_coresize, NULL);

#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
static ssize_t show_datasize(struct module_attribute *mattr,
                             struct module_kobject *mk, char *buffer)
{
        unsigned int size = 0;

        for_class_mod_mem_type(type, core_data)
                size += mk->mod->mem[type].size;
        return sprintf(buffer, "%u\n", size);
}

static struct module_attribute modinfo_datasize =
        __ATTR(datasize, 0444, show_datasize, NULL);
#endif

static ssize_t show_initsize(struct module_attribute *mattr,
                             struct module_kobject *mk, char *buffer)
{
        unsigned int size = 0;

        for_class_mod_mem_type(type, init)
                size += mk->mod->mem[type].size;
        return sprintf(buffer, "%u\n", size);
}

static struct module_attribute modinfo_initsize =
        __ATTR(initsize, 0444, show_initsize, NULL);

/*
 * "taint" attribute: writes the module's taint characters (as produced by
 * module_flags_taint()) followed by a newline.  Returns the byte count
 * including that newline.
 */
static ssize_t show_taint(struct module_attribute *mattr,
                          struct module_kobject *mk, char *buffer)
{
        size_t len = module_flags_taint(mk->mod->taints, buffer);

        buffer[len] = '\n';
        return len + 1;
}

static struct module_attribute modinfo_taint =
        __ATTR(taint, 0444, show_taint, NULL);

/*
 * NULL-terminated table of the module attributes defined above.  The
 * "datasize" and "refcnt" entries are only present when the corresponding
 * config option is enabled.
 */
struct module_attribute *modinfo_attrs[] = {
        &module_uevent,
        &modinfo_version,
        &modinfo_srcversion,
        &modinfo_initstate,
        &modinfo_coresize,
#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
        &modinfo_datasize,
#endif
        &modinfo_initsize,
        &modinfo_taint,
#ifdef CONFIG_MODULE_UNLOAD
        &modinfo_refcnt,
#endif
        NULL,
};

/* Number of slots in modinfo_attrs[], including the NULL terminator. */
size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs);

/* File-local copy of VERMAGIC_STRING. */
static const char vermagic[] = VERMAGIC_STRING;

/*
 * Decide whether a module that failed a load-time check may be loaded
 * anyway.
 *
 * Without CONFIG_MODULE_FORCE_LOAD this always fails with -ENOEXEC.  With
 * it, the module is tainted with TAINT_FORCED_MODULE and 0 is returned; a
 * warning quoting @reason is printed unless that taint was already set.
 */
int try_to_force_load(struct module *mod, const char *reason)
{
#ifndef CONFIG_MODULE_FORCE_LOAD
        return -ENOEXEC;
#else
        int first_forced_load = !test_taint(TAINT_FORCED_MODULE);

        if (first_forced_load)
                pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
        add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
        return 0;
#endif
}

/*
 * Parse tag=value strings from .modinfo section.
 *
 * Advance from the entry at @string to the start of the next one: first
 * over the remaining non-NUL characters, then over the NUL padding.
 * *@secsize is the number of bytes left and is decremented for every byte
 * stepped over; NULL is returned as soon as that budget is used up.
 */
char *module_next_tag_pair(char *string, unsigned long *secsize)
{
        char *pos = string;
        unsigned long left;

        /* Skip non-zero chars */
        while (*pos != '\0') {
                pos++;
                left = *secsize;
                *secsize = left - 1;
                if (left <= 1)
                        return NULL;
        }

        /* Skip any zero padding. */
        while (*pos == '\0') {
                pos++;
                left = *secsize;
                *secsize = left - 1;
                if (left <= 1)
                        return NULL;
        }

        return pos;
}

/*
 * Find the next "<tag>=<value>" entry in the .modinfo section.
 *
 * @prev is either NULL (search from the start of the section) or a pointer
 * into the section, in which case the search resumes with the entry that
 * follows it.  Returns a pointer to the value (just past the '='), or NULL
 * if there is no further matching entry.
 */
static char *get_next_modinfo(const struct load_info *info, const char *tag,
                              char *prev)
{
        Elf_Shdr *infosec = &info->sechdrs[info->index.info];
        /*
         * get_modinfo() calls made before rewrite_section_headers()
         * must use sh_offset, as sh_addr isn't set!
         */
        char *section = (char *)info->hdr + infosec->sh_offset;
        unsigned long remaining = infosec->sh_size;
        unsigned int taglen = strlen(tag);
        char *entry = section;

        if (prev) {
                /* Account for the bytes that precede @prev. */
                remaining -= prev - section;
                entry = module_next_tag_pair(prev, &remaining);
        }

        while (entry) {
                if (!strncmp(entry, tag, taglen) && entry[taglen] == '=')
                        return entry + taglen + 1;
                entry = module_next_tag_pair(entry, &remaining);
        }

        return NULL;
}

/*
 * Return the value of the first "<tag>=" entry in the .modinfo section, or
 * NULL if there is none (get_next_modinfo() with no previous entry).
 */
static char *get_modinfo(const struct load_info *info, const char *tag)
{
        return get_next_modinfo(info, tag, NULL);
}

/*
 * Check that @mod imports the namespace that exported symbol @sym lives in.
 *
 * Symbols without a (non-empty) namespace always pass.  Otherwise the
 * module's "import_ns" modinfo entries are compared against the symbol's
 * namespace.  A missing import is reported; it is an error (-EINVAL)
 * unless CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS is set, in which
 * case only a warning is printed.
 *
 * Returns 0 if the symbol may be used, -EINVAL otherwise.
 */
static int verify_namespace_is_imported(const struct load_info *info,
                                        const struct kernel_symbol *sym,
                                        struct module *mod)
{
        const char *namespace;
        char *imported_namespace;

        namespace = kernel_symbol_namespace(sym);
        if (namespace && namespace[0]) {
                for_each_modinfo_entry(imported_namespace, info, "import_ns") {
                        if (strcmp(namespace, imported_namespace) == 0)
                                return 0;
                }
                /* Same message either way; only the log level differs. */
#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
                pr_warn(
#else
                pr_err(
#endif
                        "%s: module uses symbol (%s) from namespace %s, but does not import it.\n",
                        mod->name, kernel_symbol_name(sym), namespace);
#ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
                return -EINVAL;
#endif
        }
        return 0;
}

/*
 * Propagate the proprietary taint from the module that owns a symbol to the
 * module using it.
 *
 * Nothing happens if there is no owning module or it is not tainted
 * TAINT_PROPRIETARY_MODULE.  A user that already relies on GPL-only symbols
 * is refused (returns false).  Otherwise @mod inherits the taint, with a
 * warning the first time, and true is returned.
 */
static bool inherit_taint(struct module *mod, struct module *owner, const char *name)
{
        if (!owner)
                return true;
        if (!test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints))
                return true;

        if (mod->using_gplonly_symbols) {
                pr_err("%s: module using GPL-only symbols uses symbols %s from proprietary module %s.\n",
                        mod->name, name, owner->name);
                return false;
        }

        /* Already tainted: nothing more to record. */
        if (test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints))
                return true;

        pr_warn("%s: module uses symbols %s from proprietary module %s, inheriting taint.\n",
                mod->name, name, owner->name);
        set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints);
        return true;
}

/*
 * Resolve a symbol for this module.  I.e. if we find one, record usage.
 *
 * @mod:       module being loaded
 * @info:      load information (used for version and namespace checks)
 * @name:      name of the undefined symbol
 * @ownername: buffer of at least MODULE_NAME_LEN bytes; receives the name
 *             of the symbol's owner whenever the symbol was found
 *
 * Runs under module_mutex.  Returns the kernel_symbol on success, NULL if
 * the symbol does not exist (or taint inheritance was refused), or an
 * ERR_PTR() if the version check, the namespace check or taking the
 * reference on the owner failed.
 */
static const struct kernel_symbol *resolve_symbol(struct module *mod,
                                                  const struct load_info *info,
                                                  const char *name,
                                                  char ownername[])
{
        struct find_symbol_arg fsa = {
                .name        = name,
                .gplok        = !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)),
                .warn        = true,
        };
        int err;

        /*
         * The module_mutex should not be a heavily contended lock;
         * if we get the occasional sleep here, we'll go an extra iteration
         * in the wait_event_interruptible(), which is harmless.
         */
        sched_annotate_sleep();
        mutex_lock(&module_mutex);
        if (!find_symbol(&fsa))
                goto unlock;

        if (fsa.license == GPL_ONLY)
                mod->using_gplonly_symbols = true;

        if (!inherit_taint(mod, fsa.owner, name)) {
                fsa.sym = NULL;
                goto getname;
        }

        if (!check_version(info, name, mod, fsa.crc)) {
                fsa.sym = ERR_PTR(-EINVAL);
                goto getname;
        }

        err = verify_namespace_is_imported(info, fsa.sym, mod);
        if (err) {
                fsa.sym = ERR_PTR(err);
                goto getname;
        }

        err = ref_module(mod, fsa.owner);
        if (err) {
                fsa.sym = ERR_PTR(err);
                goto getname;
        }

getname:
        /*
         * We must make copy under the lock if we failed to get ref.
         * strscpy() always NUL-terminates, unlike the deprecated strncpy().
         */
        strscpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN);
unlock:
        mutex_unlock(&module_mutex);
        return fsa.sym;
}

/*
 * Resolve @name for @mod, waiting on module_wq for as long as
 * resolve_symbol() keeps failing with -EBUSY.  The wait is interruptible
 * and limited to 30 seconds; if it ends without the condition becoming
 * true, a warning naming the owning module is printed.
 *
 * Returns the result of the last resolve_symbol() call (a symbol, NULL or
 * an ERR_PTR()).
 */
static const struct kernel_symbol *
resolve_symbol_wait(struct module *mod,
                    const struct load_info *info,
                    const char *name)
{
        const struct kernel_symbol *ksym;
        char owner[MODULE_NAME_LEN];

        /* ksym is (re)assigned each time the wait condition is evaluated. */
        if (wait_event_interruptible_timeout(module_wq,
                        !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
                        || PTR_ERR(ksym) != -EBUSY,
                                             30 * HZ) <= 0) {
                pr_warn("%s: gave up waiting for init of module %s.\n",
                        mod->name, owner);
        }
        return ksym;
}

/*
 * Default (weak) arch-specific cleanup hook: does nothing.  Called from
 * free_module().
 */
void __weak module_arch_cleanup(struct module *mod)
{
}

/* Default (weak) arch hook for freeing init-related state: does nothing. */
void __weak module_arch_freeing_init(struct module *mod)
{
}

static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
{
        unsigned int size = PAGE_ALIGN(mod->mem[type].size);
        enum execmem_type execmem_type;
        void *ptr;

        mod->mem[type].size = size;

        if (mod_mem_type_is_data(type))
                execmem_type = EXECMEM_MODULE_DATA;
        else
                execmem_type = EXECMEM_MODULE_TEXT;

        ptr = execmem_alloc(execmem_type, size);
        if (!ptr)
                return -ENOMEM;

        /*
         * The pointer to these blocks of memory are stored on the module
         * structure and we keep that around so long as the module is
         * around. We only free that memory when we unload the module.
         * Just mark them as not being a leak then. The .init* ELF
         * sections *do* get freed after boot so we *could* treat them
         * slightly differently with kmemleak_ignore() and only grey
         * them out as they work as typical memory allocations which
         * *do* eventually get freed, but let's just keep things simple
         * and avoid *any* false positives.
         */
        kmemleak_not_leak(ptr);

        memset(ptr, 0, size);
        mod->mem[type].base = ptr;

        return 0;
}

/*
 * Free the memory region @type of @mod.  When @unload_codetags is false,
 * core data regions are deliberately left allocated.
 */
static void module_memory_free(struct module *mod, enum mod_mem_type type,
                               bool unload_codetags)
{
        if (mod_mem_type_is_core_data(type) && !unload_codetags)
                return;

        execmem_free(mod->mem[type].base);
}

/*
 * Release all memory regions of @mod.  Every region has its lockdep key
 * range freed first; regions other than MOD_DATA are only freed when their
 * size is non-zero.  MOD_DATA is handled last and unconditionally.
 */
static void free_mod_mem(struct module *mod, bool unload_codetags)
{
        struct module_memory *region;

        for_each_mod_mem_type(type) {
                if (type == MOD_DATA)
                        continue;

                region = &mod->mem[type];

                /* Free lock-classes; relies on the preceding sync_rcu(). */
                lockdep_free_key_range(region->base, region->size);
                if (region->size != 0)
                        module_memory_free(mod, type, unload_codetags);
        }

        /* MOD_DATA hosts mod, so free it at last */
        region = &mod->mem[MOD_DATA];
        lockdep_free_key_range(region->base, region->size);
        module_memory_free(mod, MOD_DATA, unload_codetags);
}

/*
 * Free a module, remove from lists, etc.
 *
 * Teardown order: code tags and sysfs entries first, then the module is
 * marked UNFORMED while it is still on the module list, arch/unload/param
 * state is released, the module is unlinked (with an RCU grace period
 * before its list nodes may be reused) and finally its memory is freed.
 */
static void free_module(struct module *mod)
{
        bool unload_codetags;

        trace_module_free(mod);

        /* The result also decides whether core data gets freed at the end. */
        unload_codetags = codetag_unload_module(mod);
        if (!unload_codetags)
                pr_warn("%s: memory allocation(s) from the module still alive, cannot unload cleanly\n",
                        mod->name);

        mod_sysfs_teardown(mod);

        /*
         * We leave it in list to prevent duplicate loads, but make sure
         * that no one uses it while it's being deconstructed.
         */
        mutex_lock(&module_mutex);
        mod->state = MODULE_STATE_UNFORMED;
        mutex_unlock(&module_mutex);

        /* Arch-specific cleanup. */
        module_arch_cleanup(mod);

        /* Module unload stuff */
        module_unload_free(mod);

        /* Free any allocated parameters. */
        destroy_params(mod->kp, mod->num_kp);

        if (is_livepatch_module(mod))
                free_module_elf(mod);

        /* Now we can delete it from the lists */
        mutex_lock(&module_mutex);
        /* Unlink carefully: kallsyms could be walking list. */
        list_del_rcu(&mod->list);
        mod_tree_remove(mod);
        /* Remove this module from bug list, this uses list_del_rcu */
        module_bug_cleanup(mod);
        /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
        synchronize_rcu();
        if (try_add_tainted_module(mod))
                pr_err("%s: adding tainted module to the unloaded tainted modules list failed.\n",
                       mod->name);
        mutex_unlock(&module_mutex);

        /* This may be empty, but that's OK */
        module_arch_freeing_init(mod);
        kfree(mod->args);
        percpu_modfree(mod);

        /* Must be last: @mod itself lives in one of these regions. */
        free_mod_mem(mod, unload_codetags);
}

void *__symbol_get(const char *symbol)
{
        struct find_symbol_arg fsa = {
                .name        = symbol,
                .gplok        = true,
                .warn        = true,
        };

        preempt_disable();
        if (!find_symbol(&fsa))
                goto fail;
        if (fsa.license != GPL_ONLY) {
                pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n",
                        symbol);
                goto fail;
        }
        if (strong_try_module_get(fsa.owner))
                goto fail;
        preempt_enable();
        return (void *)kernel_symbol_value(fsa.sym);
fail:
        preempt_enable();
        return NULL;
}
EXPORT_SYMBOL_GPL(__symbol_get);

/*
 * Ensure that an exported symbol [global namespace] does not already exist
 * in the kernel or in some other module's exported symbol table.
 *
 * Both the plain and the GPL symbol tables of @mod are checked.  Returns 0
 * if no export clashes, or -ENOEXEC (after logging the first duplicate and
 * its owner) otherwise.
 *
 * You must hold the module_mutex.
 */
static int verify_exported_symbols(struct module *mod)
{
        struct {
                const struct kernel_symbol *sym;
                unsigned int num;
        } tables[] = {
                { mod->syms, mod->num_syms },
                { mod->gpl_syms, mod->num_gpl_syms },
        };
        unsigned int t;

        for (t = 0; t < ARRAY_SIZE(tables); t++) {
                const struct kernel_symbol *cur = tables[t].sym;
                const struct kernel_symbol *end = cur + tables[t].num;

                for (; cur < end; cur++) {
                        struct find_symbol_arg fsa = {
                                .name        = kernel_symbol_name(cur),
                                .gplok        = true,
                        };

                        if (!find_symbol(&fsa))
                                continue;

                        pr_err("%s: exports duplicate symbol %s (owned by %s)\n",
                               mod->name, kernel_symbol_name(cur),
                               module_name(fsa.owner));
                        return -ENOEXEC;
                }
        }

        return 0;
}

/*
 * Tell whether an unresolved symbol may be silently ignored.
 *
 * Only "_GLOBAL_OFFSET_TABLE_" on EM_386 / EM_X86_64 qualifies.
 */
static bool ignore_undef_symbol(Elf_Half emachine, const char *name)
{
        bool is_x86 = (emachine == EM_386) || (emachine == EM_X86_64);

        /*
         * On x86, PIC code and Clang non-PIC code may have call foo@PLT. GNU as
         * before 2.37 produces an unreferenced _GLOBAL_OFFSET_TABLE_ on x86-64.
         * i386 has a similar problem but may not deserve a fix.
         *
         * If we ever have to ignore many symbols, consider refactoring the code to
         * only warn if referenced by a relocation.
         */
        if (!is_x86)
                return false;

        return strcmp(name, "_GLOBAL_OFFSET_TABLE_") == 0;
}

/*
 * Change all symbols so that st_value encodes the pointer directly.
 *
 * Walks the module's symbol table (skipping entry 0) and, depending on the
 * section index of each symbol:
 *  - rejects unexpected common symbols,
 *  - leaves absolute and livepatch symbols alone,
 *  - resolves undefined symbols against exported kernel/module symbols,
 *  - rebases everything else on its section's (or the percpu) base address.
 *
 * The loop does not stop at the first problem: every failure is reported
 * and the error of the last one is returned.  Returns 0 if all symbols
 * were handled.
 */
static int simplify_symbols(struct module *mod, const struct load_info *info)
{
        Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
        Elf_Sym *sym = (void *)symsec->sh_addr;
        unsigned long secbase;
        unsigned int i;
        int ret = 0;
        const struct kernel_symbol *ksym;

        for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
                const char *name = info->strtab + sym[i].st_name;

                switch (sym[i].st_shndx) {
                case SHN_COMMON:
                        /* Ignore common symbols */
                        if (!strncmp(name, "__gnu_lto", 9))
                                break;

                        /*
                         * We compiled with -fno-common.  These are not
                         * supposed to happen.
                         */
                        pr_debug("Common symbol: %s\n", name);
                        pr_warn("%s: please compile with -fno-common\n",
                               mod->name);
                        ret = -ENOEXEC;
                        break;

                case SHN_ABS:
                        /* Don't need to do anything */
                        pr_debug("Absolute symbol: 0x%08lx %s\n",
                                 (long)sym[i].st_value, name);
                        break;

                case SHN_LIVEPATCH:
                        /* Livepatch symbols are resolved by livepatch */
                        break;

                case SHN_UNDEF:
                        /* ksym is a symbol, NULL (not found) or an ERR_PTR(). */
                        ksym = resolve_symbol_wait(mod, info, name);
                        /* Ok if resolved.  */
                        if (ksym && !IS_ERR(ksym)) {
                                sym[i].st_value = kernel_symbol_value(ksym);
                                break;
                        }

                        /* Ok if weak or ignored.  */
                        if (!ksym &&
                            (ELF_ST_BIND(sym[i].st_info) == STB_WEAK ||
                             ignore_undef_symbol(info->hdr->e_machine, name)))
                                break;

                        /* NULL ksym (plain "not found") maps to -ENOENT. */
                        ret = PTR_ERR(ksym) ?: -ENOENT;
                        pr_warn("%s: Unknown symbol %s (err %d)\n",
                                mod->name, name, ret);
                        break;

                default:
                        /* Divert to percpu allocation if a percpu var. */
                        if (sym[i].st_shndx == info->index.pcpu)
                                secbase = (unsigned long)mod_percpu(mod);
                        else
                                secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
                        sym[i].st_value += secbase;
                        break;
                }
        }

        return ret;
}

static int apply_relocations(struct module *mod, const struct load_info *info)
{
        unsigned int i;
        int err = 0;

        /* Now do relocations. */
        for (i = 1; i < info->hdr->e_shnum; i++) {
                unsigned int infosec = info->sechdrs[i].sh_info;

                /* Not a valid relocation section? */
                if (infosec >= info->hdr->e_shnum)
                        continue;

                /* Don't bother with non-allocated sections */
                if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
                        continue;

                if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
                        err = klp_apply_section_relocs(mod, info->sechdrs,
                                                       info->secstrings,
                                                       info->strtab,
                                                       info->index.sym, i,
                                                       NULL);
                else if (info->sechdrs[i].sh_type == SHT_REL)
                        err = apply_relocate(info->sechdrs, info->strtab,
                                             info->index.sym, i, mod);
                else if (info->sechdrs[i].sh_type == SHT_RELA)
                        err = apply_relocate_add(info->sechdrs, info->strtab,
                                                 info->index.sym, i, mod);
                if (err < 0)
                        break;
        }
        return err;
}

/*
 * Additional bytes needed by arch in front of individual sections.
 *
 * Weak default; module_get_offset_and_type() adds the returned value to the
 * region size before placing the section.
 */
unsigned int __weak arch_mod_section_prepend(struct module *mod,
                                             unsigned int section)
{
        /* default implementation just returns zero */
        return 0;
}

/*
 * Reserve room for @sechdr at the end of the @type memory region of @mod.
 *
 * The region size is first grown by arch_mod_section_prepend(), then rounded
 * up to the section alignment (an sh_addralign of 0 is treated as 1) to
 * obtain the section's offset, and finally extended by sh_size.
 *
 * Returns that offset with @type folded into the bits selected by
 * SH_ENTSIZE_TYPE_MASK and SH_ENTSIZE_TYPE_SHIFT.  Warns once if the offset
 * itself already uses any of those bits.
 */
long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
                                Elf_Shdr *sechdr, unsigned int section)
{
        long type_bits = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK) << SH_ENTSIZE_TYPE_SHIFT;
        long placed;

        mod->mem[type].size += arch_mod_section_prepend(mod, section);

        placed = ALIGN(mod->mem[type].size,
                       sechdr->sh_addralign ? sechdr->sh_addralign : 1);
        mod->mem[type].size = placed + sechdr->sh_size;

        /* The offset and the encoded type must not overlap. */
        WARN_ON_ONCE(placed & type_bits);

        return placed | type_bits;
}

/*
 * Tell whether the section called @sname is laid out with the init
 * sections.
 *
 * That is the case for everything module_init_section() accepts and, when
 * CONFIG_MODULE_UNLOAD is not set, also for everything
 * module_exit_section() accepts.
 */
bool module_init_layout_section(const char *sname)
{
        bool exit_as_init = false;

#ifndef CONFIG_MODULE_UNLOAD
        exit_as_init = module_exit_section(sname);
#endif

        return exit_as_init || module_init_section(sname);
}

/*
 * Place the not-yet-placed sections of one kind (init when @is_init is
 * true, core otherwise) into the memory regions of @mod.
 *
 * Sections are picked in several passes, one per row of masks[]; within a
 * pass they are taken in section-header order.  For each picked section
 * sh_entsize is overwritten with the result of
 * module_get_offset_and_type().
 */
static void __layout_sections(struct module *mod, struct load_info *info, bool is_init)
{
        /*
         * Per pass: { flags that must all be set, flags that must all be
         * clear }.
         *
         * NOTE: all executable code must be the first section
         * in this array; otherwise modify the text_size
         * finder in the two loops below
         */
        static const unsigned long masks[][2] = {
                { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
                { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
                { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
                { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
                { ARCH_SHF_SMALL | SHF_ALLOC, 0 }
        };
        /* Memory type used by each pass, for core and for init sections. */
        static const int core_m_to_mem_type[] = {
                MOD_TEXT,
                MOD_RODATA,
                MOD_RO_AFTER_INIT,
                MOD_DATA,
                MOD_DATA,
        };
        static const int init_m_to_mem_type[] = {
                MOD_INIT_TEXT,
                MOD_INIT_RODATA,
                MOD_INVALID,
                MOD_INIT_DATA,
                MOD_INIT_DATA,
        };
        const int *pass_to_mem_type = is_init ? init_m_to_mem_type
                                              : core_m_to_mem_type;
        unsigned int pass, idx;

        for (pass = 0; pass < ARRAY_SIZE(masks); ++pass) {
                const unsigned long required = masks[pass][0];
                const unsigned long forbidden = masks[pass][1];
                enum mod_mem_type type = pass_to_mem_type[pass];

                for (idx = 0; idx < info->hdr->e_shnum; ++idx) {
                        Elf_Shdr *s = &info->sechdrs[idx];
                        const char *sname = info->secstrings + s->sh_name;

                        /* Flags do not match this pass. */
                        if ((s->sh_flags & required) != required)
                                continue;
                        if (s->sh_flags & forbidden)
                                continue;

                        /* sh_entsize no longer holds the ~0UL placeholder. */
                        if (s->sh_entsize != ~0UL)
                                continue;

                        /* Belongs to the other (init vs. core) layout. */
                        if (is_init != module_init_layout_section(sname))
                                continue;

                        /* This pass has no memory type for this layout. */
                        if (WARN_ON_ONCE(type == MOD_INVALID))
                                continue;

                        s->sh_entsize = module_get_offset_and_type(mod, type, s, idx);
                        pr_debug("\t%s\n", sname);
                }
        }
}

/*
 * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
 * might -- code, read-only data, read-write data, small data.
 *
 * Every sh_entsize is first reset to ~0UL, which __layout_sections() treats
 * as "not placed yet".  The core pass and then the init pass tally the
 * region sizes and replace that marker, for each section they pick, with
 * the value returned by module_get_offset_and_type().
 */
static void layout_sections(struct module *mod, struct load_info *info)
{
        unsigned int sec = 0;

        while (sec < info->hdr->e_shnum)
                info->sechdrs[sec++].sh_entsize = ~0UL;

        pr_debug("Core section allocation order for %s:\n", mod->name);
        __layout_sections(mod, info, false);

        pr_debug("Init section allocation order for %s:\n", mod->name);
        __layout_sections(mod, info, true);
}

/*
 * Taint the kernel with TAINT_PROPRIETARY_MODULE when @license is not
 * accepted by license_is_gpl_compatible().  A NULL @license is treated as
 * "unspecified".  The warning is only printed if that taint was not already
 * set.
 */
static void module_license_taint_check(struct module *mod, const char *license)
{
        const char *effective = license ? license : "unspecified";

        if (license_is_gpl_compatible(effective))
                return;

        if (!test_taint(TAINT_PROPRIETARY_MODULE))
                pr_warn("%s: module license '%s' taints kernel.\n",
                        mod->name, effective);

        add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
                         LOCKDEP_NOW_UNRELIABLE);
}

/*
 * Run the ->setup() hook of every entry in the NULL-terminated
 * modinfo_attrs[] table that has one, passing it the modinfo value found
 * under the attribute's name.
 */
static void setup_modinfo(struct module *mod, struct load_info *info)
{
        struct module_attribute *attr;
        int idx = 0;

        while ((attr = modinfo_attrs[idx++])) {
                if (!attr->setup)
                        continue;

                attr->setup(mod, get_modinfo(info, attr->attr.name));
        }
}

/*
 * Run the ->free() hook of every entry in the NULL-terminated
 * modinfo_attrs[] table that has one.
 */
static void free_modinfo(struct module *mod)
{
        struct module_attribute *attr;
        int idx = 0;

        while ((attr = modinfo_attrs[idx++])) {
                if (!attr->free)
                        continue;

                attr->free(mod);
        }
}

/*
 * Weak default: a section counts as an init section when its name starts
 * with ".init".
 */
bool __weak module_init_section(const char *name)
{
        static const char init_prefix[] = ".init";

        return strstarts(name, init_prefix);
}

/*
 * Weak default: a section counts as an exit section when its name starts
 * with ".exit".
 */
bool __weak module_exit_section(const char *name)
{
        static const char exit_prefix[] = ".exit";

        return strstarts(name, exit_prefix);
}

/*
 * Check that the file range described by @shdr, sh_size bytes starting at
 * sh_offset, neither wraps around nor extends past the info->len bytes of
 * the module image.
 *
 * Returns 0 when the range is acceptable and -ENOEXEC otherwise.
 */
static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
{
#ifdef CONFIG_64BIT
        unsigned long long secend;
#else
        unsigned long secend;
#endif

        secend = shdr->sh_offset + shdr->sh_size;

        /* The sum wrapped around. */
        if (secend < shdr->sh_offset)
                return -ENOEXEC;

        /* The section ends beyond the image. */
        if (secend > info->len)
                return -ENOEXEC;

        return 0;
}

/*
 * Check userspace passed ELF module against our expectations, and cache
 * useful variables for further processing as we go.
 *
 * This does basic validity checks against section offsets and sizes, the
 * section name string table, and the indices used for it (sh_name).
 *
 * Since we're already checking the ELF sections we also cache useful
 * variables which will be used later for our convenience:
 *
 *         o pointers to section headers
 *         o cache the modinfo symbol section
 *         o cache the string symbol section
 *         o cache the module section
 *
 * As a last step we set info->mod to the temporary copy of the module in
 * info->hdr. The final one will be allocated in move_module(). Any
 * modifications we make to our copy of the module will be carried over
 * to the final minted module.
 *
 * @flags: MODULE_INIT_IGNORE_MODVERSIONS makes us pretend that there is no
 *         __versions section.
 *
 * Returns 0 on success and -ENOEXEC if the image fails any of the checks.
 */
static int elf_validity_cache_copy(struct load_info *info, int flags)
{
        unsigned int i;
        Elf_Shdr *shdr, *strhdr;
        int err;
        unsigned int num_mod_secs = 0, mod_idx;
        unsigned int num_info_secs = 0, info_idx;
        unsigned int num_sym_secs = 0, sym_idx;

        if (info->len < sizeof(*(info->hdr))) {
                pr_err("Invalid ELF header len %lu\n", info->len);
                goto no_exec;
        }

        if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0) {
                pr_err("Invalid ELF header magic: != %s\n", ELFMAG);
                goto no_exec;
        }
        if (info->hdr->e_type != ET_REL) {
                pr_err("Invalid ELF header type: %u != %u\n",
                       info->hdr->e_type, ET_REL);
                goto no_exec;
        }
        if (!elf_check_arch(info->hdr)) {
                pr_err("Invalid architecture in ELF header: %u\n",
                       info->hdr->e_machine);
                goto no_exec;
        }
        if (!module_elf_check_arch(info->hdr)) {
                pr_err("Invalid module architecture in ELF header: %u\n",
                       info->hdr->e_machine);
                goto no_exec;
        }
        if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) {
                pr_err("Invalid ELF section header size\n");
                goto no_exec;
        }

        /*
         * e_shnum is 16 bits, and sizeof(Elf_Shdr) is
         * known and small. So e_shnum * sizeof(Elf_Shdr)
         * will not overflow unsigned long on any platform.
         */
        if (info->hdr->e_shoff >= info->len
            || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
                info->len - info->hdr->e_shoff)) {
                pr_err("Invalid ELF section header overflow\n");
                goto no_exec;
        }

        info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;

        /*
         * Verify if the section name table index is valid.
         */
        if (info->hdr->e_shstrndx == SHN_UNDEF
            || info->hdr->e_shstrndx >= info->hdr->e_shnum) {
                pr_err("Invalid ELF section name index: %d || e_shstrndx (%d) >= e_shnum (%d)\n",
                       info->hdr->e_shstrndx, info->hdr->e_shstrndx,
                       info->hdr->e_shnum);
                goto no_exec;
        }

        strhdr = &info->sechdrs[info->hdr->e_shstrndx];
        err = validate_section_offset(info, strhdr);
        if (err < 0) {
                pr_err("Invalid ELF section hdr(type %u)\n", strhdr->sh_type);
                return err;
        }

        /*
         * The section name table must be NUL-terminated, as required
         * by the spec. This makes strcmp and pr_* calls that access
         * strings in the section safe.
         */
        info->secstrings = (void *)info->hdr + strhdr->sh_offset;
        if (strhdr->sh_size == 0) {
                pr_err("empty section name table\n");
                goto no_exec;
        }
        if (info->secstrings[strhdr->sh_size - 1] != '\0') {
                pr_err("ELF Spec violation: section name table isn't null terminated\n");
                goto no_exec;
        }

        /*
         * The code assumes that section 0 has a length of zero and
         * an addr of zero, so check for it.
         */
        if (info->sechdrs[0].sh_type != SHT_NULL
            || info->sechdrs[0].sh_size != 0
            || info->sechdrs[0].sh_addr != 0) {
                pr_err("ELF Spec violation: section 0 type(%d)!=SH_NULL or non-zero len or addr\n",
                       info->sechdrs[0].sh_type);
                goto no_exec;
        }

        for (i = 1; i < info->hdr->e_shnum; i++) {
                shdr = &info->sechdrs[i];

                /* An SHT_NULL header carries no meaningful fields at all. */
                if (shdr->sh_type == SHT_NULL)
                        continue;

                /*
                 * The name must start inside the (NUL-terminated) section
                 * name table before anybody dereferences it. This has to
                 * hold for every section, not only SHF_ALLOC ones: the
                 * strcmp() calls below and later users of secstrings look
                 * at names of non-allocated and SHT_NOBITS sections too.
                 */
                if (shdr->sh_name >= strhdr->sh_size) {
                        pr_err("Invalid ELF section name in module (section %u type %u)\n",
                               i, shdr->sh_type);
                        return -ENOEXEC;
                }

                switch (shdr->sh_type) {
                case SHT_NOBITS:
                        /* Occupies no file space: no offset to validate. */
                        continue;
                case SHT_SYMTAB:
                        if (shdr->sh_link == SHN_UNDEF
                            || shdr->sh_link >= info->hdr->e_shnum) {
                                pr_err("Invalid ELF sh_link!=SHN_UNDEF(%d) or (sh_link(%d) >= hdr->e_shnum(%d)\n",
                                       shdr->sh_link, shdr->sh_link,
                                       info->hdr->e_shnum);
                                goto no_exec;
                        }
                        num_sym_secs++;
                        sym_idx = i;
                        fallthrough;
                default:
                        err = validate_section_offset(info, shdr);
                        if (err < 0) {
                                pr_err("Invalid ELF section in module (section %u type %u)\n",
                                        i, shdr->sh_type);
                                return err;
                        }
                        if (strcmp(info->secstrings + shdr->sh_name,
                                   ".gnu.linkonce.this_module") == 0) {
                                num_mod_secs++;
                                mod_idx = i;
                        } else if (strcmp(info->secstrings + shdr->sh_name,
                                   ".modinfo") == 0) {
                                num_info_secs++;
                                info_idx = i;
                        }
                        break;
                }
        }

        if (num_info_secs > 1) {
                pr_err("Only one .modinfo section must exist.\n");
                goto no_exec;
        } else if (num_info_secs == 1) {
                /* Try to find a name early so we can log errors with a module name */
                info->index.info = info_idx;
                info->name = get_modinfo(info, "name");
        }

        if (num_sym_secs != 1) {
                pr_warn("%s: module has no symbols (stripped?)\n",
                        info->name ?: "(missing .modinfo section or name field)");
                goto no_exec;
        }

        /* Sets internal symbols and strings. */
        info->index.sym = sym_idx;
        shdr = &info->sechdrs[sym_idx];
        info->index.str = shdr->sh_link;
        info->strtab = (char *)info->hdr + info->sechdrs[info->index.str].sh_offset;

        /*
         * The ".gnu.linkonce.this_module" ELF section is special. It is
         * what modpost uses to refer to __this_module and lets us rely
         * on THIS_MODULE to point to &__this_module properly. The kernel's
         * modpost declares it on each module's *.mod.c file. If the struct
         * module of the kernel changes a full kernel rebuild is required.
         *
         * We have a few expectations for this special section, the following
         * code validates all this for us:
         *
         *   o Only one section must exist
         *   o We expect the kernel to always have to allocate it: SHF_ALLOC
         *   o The section size must match the kernel's run time's struct module
         *     size
         */
        if (num_mod_secs != 1) {
                pr_err("module %s: Only one .gnu.linkonce.this_module section must exist.\n",
                       info->name ?: "(missing .modinfo section or name field)");
                goto no_exec;
        }

        shdr = &info->sechdrs[mod_idx];

        /*
         * This is already implied on the switch above, however let's be
         * pedantic about it.
         */
        if (shdr->sh_type == SHT_NOBITS) {
                pr_err("module %s: .gnu.linkonce.this_module section must have a size set\n",
                       info->name ?: "(missing .modinfo section or name field)");
                goto no_exec;
        }

        if (!(shdr->sh_flags & SHF_ALLOC)) {
                pr_err("module %s: .gnu.linkonce.this_module must occupy memory during process execution\n",
                       info->name ?: "(missing .modinfo section or name field)");
                goto no_exec;
        }

        if (shdr->sh_size != sizeof(struct module)) {
                pr_err("module %s: .gnu.linkonce.this_module section size must match the kernel's built struct module size at run time\n",
                       info->name ?: "(missing .modinfo section or name field)");
                goto no_exec;
        }

        info->index.mod = mod_idx;

        /* This is temporary: point mod into copy of data. */
        info->mod = (void *)info->hdr + shdr->sh_offset;

        /*
         * If we didn't load the .modinfo 'name' field earlier, fall back to
         * on-disk struct mod 'name' field.
         */
        if (!info->name)
                info->name = info->mod->name;

        if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
                info->index.vers = 0; /* Pretend no __versions section! */
        else
                info->index.vers = find_sec(info, "__versions");

        info->index.pcpu = find_pcpusec(info);

        return 0;

no_exec:
        return -ENOEXEC;
}

#define COPY_CHUNK_SIZE (16*PAGE_SIZE)

/*
 * Copy @len bytes from @usrc to @dst in pieces of at most COPY_CHUNK_SIZE
 * bytes, calling cond_resched() after each piece.  One copy is always
 * attempted, even when @len is 0.
 *
 * Returns 0 once everything is copied, or -EFAULT as soon as
 * copy_from_user() reports a piece that could not be copied in full.
 */
static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
{
        unsigned long chunk;

        for (;;) {
                chunk = min(len, COPY_CHUNK_SIZE);

                if (copy_from_user(dst, usrc, chunk))
                        return -EFAULT;
                cond_resched();

                dst += chunk;
                usrc += chunk;
                len -= chunk;
                if (!len)
                        return 0;
        }
}

/*
 * Handle the "livepatch" modinfo tag.
 *
 * Returns 0 when the module carries no such tag or when
 * set_livepatch_module() accepts it, and -ENOEXEC (after logging an error)
 * when the module is tagged but set_livepatch_module() refuses it.
 */
static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
{
        if (!get_modinfo(info, "livepatch"))
                /* Nothing more to do */
                return 0;

        if (set_livepatch_module(mod))
                return 0;

        /* Terminate the message with a newline like every other one here. */
        pr_err("%s: module is marked as livepatch module, but livepatch support is disabled\n",
               mod->name);
        return -ENOEXEC;
}

/*
 * Warn when retpoline_module_ok() rejects the module, based on whether it
 * carries a "retpoline" modinfo tag.
 */
static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
{
        if (!retpoline_module_ok(get_modinfo(info, "retpoline")))
                pr_warn("%s: loading module not compiled with retpoline compiler.\n",
                        mod->name);
}

/*
 * Copy a module image of @len bytes from user space into a freshly
 * allocated buffer.  Sets info->len and, on success, info->hdr.
 *
 * Returns 0 on success; -ENOEXEC if @len is smaller than the ELF header;
 * -ENOMEM if the buffer cannot be allocated; -EFAULT if the user copy
 * fails; or whatever security_kernel_load_data() /
 * security_kernel_post_load_data() report.  If a failure happens after the
 * allocation, the buffer is freed again before returning.
 */
static int copy_module_from_user(const void __user *umod, unsigned long len,
                                  struct load_info *info)
{
        int err;

        info->len = len;
        if (info->len < sizeof(*(info->hdr)))
                return -ENOEXEC;

        err = security_kernel_load_data(LOADING_MODULE, true);
        if (err)
                return err;

        /* Suck in entire file: we'll want most of it. */
        info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN);
        if (!info->hdr)
                return -ENOMEM;

        if (copy_chunked_from_user(info->hdr, umod, info->len) != 0)
                err = -EFAULT;
        else
                err = security_kernel_post_load_data((char *)info->hdr,
                                                     info->len,
                                                     LOADING_MODULE,
                                                     "init_module");

        if (err)
                vfree(info->hdr);

        return err;
}

/*
 * Release the temporary module image: via module_decompress_cleanup() when
 * @flags has MODULE_INIT_COMPRESSED_FILE set, with vfree() otherwise.
 */
static void free_copy(struct load_info *info, int flags)
{
        if (!(flags & MODULE_INIT_COMPRESSED_FILE)) {
                vfree(info->hdr);
                return;
        }

        module_decompress_cleanup(info);
}

/*
 * Point the sh_addr of every section at its contents inside the temporary
 * image (info->hdr + sh_offset), force section 0 to address 0, and clear
 * SHF_ALLOC on the sections indexed by info->index.vers and
 * info->index.info.
 *
 * @flags is not used.  Always returns 0.
 */
static int rewrite_section_headers(struct load_info *info, int flags)
{
        Elf_Shdr *shdr = info->sechdrs;
        Elf_Shdr *const end = info->sechdrs + info->hdr->e_shnum;

        /* This should always be true, but let's be sure. */
        shdr->sh_addr = 0;

        /* All other sections: address within the temporary image. */
        for (shdr++; shdr < end; shdr++)
                shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;

        /* Track but don't keep modinfo and version sections. */
        info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
        info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;

        return 0;
}

/*
 * Taint the kernel according to what the module's modinfo tags, signature
 * state and name say about it, logging why.
 */
static void module_augment_kernel_taints(struct module *mod, struct load_info *info)
{
        /* Remember the state on entry for the summary warning at the end. */
        const int was_proprietary = test_taint(TAINT_PROPRIETARY_MODULE);

        /* No "intree" tag: out-of-tree module. */
        if (!get_modinfo(info, "intree")) {
                if (!test_taint(TAINT_OOT_MODULE))
                        pr_warn("%s: loading out-of-tree module taints kernel.\n",
                                mod->name);
                add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
        }

        check_modinfo_retpoline(mod, info);

        if (get_modinfo(info, "staging")) {
                add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
                pr_warn("%s: module is from the staging directory, the quality "
                        "is unknown, you have been warned.\n", mod->name);
        }

        if (is_livepatch_module(mod)) {
                add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
                pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
                                mod->name);
        }

        module_license_taint_check(mod, get_modinfo(info, "license"));

        if (get_modinfo(info, "test")) {
                if (!test_taint(TAINT_TEST))
                        pr_warn("%s: loading test module taints kernel.\n",
                                mod->name);
                add_taint_module(mod, TAINT_TEST, LOCKDEP_STILL_OK);
        }

#ifdef CONFIG_MODULE_SIG
        mod->sig_ok = info->sig_ok;
        if (!mod->sig_ok) {
                pr_notice_once("%s: module verification failed: signature "
                               "and/or required key missing - tainting "
                               "kernel\n", mod->name);
                add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
        }
#endif

        /*
         * ndiswrapper is under GPL by itself, but loads proprietary modules.
         * Don't use add_taint_module(), as it would prevent ndiswrapper from
         * using GPL-only symbols it needs.
         */
        if (!strcmp(mod->name, "ndiswrapper"))
                add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);

        /*
         * driverloader was caught wrongly pretending to be under GPL;
         * lve claims to be GPL but upstream won't provide source.
         */
        if (!strcmp(mod->name, "driverloader") || !strcmp(mod->name, "lve"))
                add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
                                 LOCKDEP_NOW_UNRELIABLE);

        /* The proprietary taint was newly set somewhere above. */
        if (!was_proprietary && test_taint(TAINT_PROPRIETARY_MODULE))
                pr_warn("%s: module license taints kernel.\n", mod->name);
}

/*
 * Check the module's "vermagic" modinfo tag and its livepatch tag.
 *
 * With MODULE_INIT_IGNORE_VERMAGIC in @flags the vermagic is treated as
 * absent.  An absent vermagic is only acceptable if try_to_force_load()
 * allows it; a present one must satisfy same_magic().
 *
 * Returns 0 on success, the error from try_to_force_load() or
 * check_modinfo_livepatch(), or -ENOEXEC on a vermagic mismatch.
 */
static int check_modinfo(struct module *mod, struct load_info *info, int flags)
{
        const char *modmagic = get_modinfo(info, "vermagic");
        int err;

        if (flags & MODULE_INIT_IGNORE_VERMAGIC)
                modmagic = NULL;

        if (modmagic) {
                if (!same_magic(modmagic, vermagic, info->index.vers)) {
                        pr_err("%s: version magic '%s' should be '%s'\n",
                               info->name, modmagic, vermagic);
                        return -ENOEXEC;
                }
        } else {
                /* This is allowed: modprobe --force will invalidate it. */
                err = try_to_force_load(mod, "bad vermagic");
                if (err)
                        return err;
        }

        return check_modinfo_livepatch(mod, info);
}

/*
 * Fill in the per-section pointers and counts of @mod from the results of
 * section_objs(), any_section_objs() and section_addr() for a fixed list
 * of section names; most entries depend on a config option.
 *
 * Returns 0, or -EINVAL when CONFIG_CONSTRUCTORS is set and the module has
 * both a .ctors section and an .init_array section.
 */
static int find_module_sections(struct module *mod, struct load_info *info)
{
        /* Parameters and exported symbols. */
        mod->kp = section_objs(info, "__param", sizeof(*mod->kp),
                               &mod->num_kp);
        mod->syms = section_objs(info, "__ksymtab", sizeof(*mod->syms),
                                 &mod->num_syms);
        mod->crcs = section_addr(info, "__kcrctab");
        mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
                                     sizeof(*mod->gpl_syms),
                                     &mod->num_gpl_syms);
        mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");

#ifdef CONFIG_CONSTRUCTORS
        mod->ctors = section_objs(info, ".ctors", sizeof(*mod->ctors),
                                  &mod->num_ctors);
        if (mod->ctors) {
                if (find_sec(info, ".init_array")) {
                        /*
                         * This shouldn't happen with same compiler and
                         * binutils building all parts of the module.
                         */
                        pr_warn("%s: has both .ctors and .init_array.\n",
                               mod->name);
                        return -EINVAL;
                }
        } else {
                mod->ctors = section_objs(info, ".init_array",
                                          sizeof(*mod->ctors),
                                          &mod->num_ctors);
        }
#endif

        mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1,
                                               &mod->noinstr_text_size);

#ifdef CONFIG_TRACEPOINTS
        mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
                                             sizeof(*mod->tracepoints_ptrs),
                                             &mod->num_tracepoints);
#endif

#ifdef CONFIG_TREE_SRCU
        mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs",
                                             sizeof(*mod->srcu_struct_ptrs),
                                             &mod->num_srcu_structs);
#endif

#ifdef CONFIG_BPF_EVENTS
        mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
                                           sizeof(*mod->bpf_raw_events),
                                           &mod->num_bpf_raw_events);
#endif

#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
        mod->btf_data = any_section_objs(info, ".BTF", 1,
                                         &mod->btf_data_size);
#endif

#ifdef CONFIG_JUMP_LABEL
        mod->jump_entries = section_objs(info, "__jump_table",
                                         sizeof(*mod->jump_entries),
                                         &mod->num_jump_entries);
#endif

#ifdef CONFIG_EVENT_TRACING
        mod->trace_events = section_objs(info, "_ftrace_events",
                                         sizeof(*mod->trace_events),
                                         &mod->num_trace_events);
        mod->trace_evals = section_objs(info, "_ftrace_eval_map",
                                        sizeof(*mod->trace_evals),
                                        &mod->num_trace_evals);
#endif

#ifdef CONFIG_TRACING
        mod->trace_bprintk_fmt_start =
                section_objs(info, "__trace_printk_fmt",
                             sizeof(*mod->trace_bprintk_fmt_start),
                             &mod->num_trace_bprintk_fmt);
#endif

#ifdef CONFIG_FTRACE_MCOUNT_RECORD
        /* sechdrs[0].sh_size is always zero */
        mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION,
                                             sizeof(*mod->ftrace_callsites),
                                             &mod->num_ftrace_callsites);
#endif

#ifdef CONFIG_FUNCTION_ERROR_INJECTION
        mod->ei_funcs = section_objs(info, "_error_injection_whitelist",
                                     sizeof(*mod->ei_funcs),
                                     &mod->num_ei_funcs);
#endif

#ifdef CONFIG_KPROBES
        mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1,
                                               &mod->kprobes_text_size);
        mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist",
                                             sizeof(unsigned long),
                                             &mod->num_kprobe_blacklist);
#endif

#ifdef CONFIG_PRINTK_INDEX
        mod->printk_index_start = section_objs(info, ".printk_index",
                                               sizeof(*mod->printk_index_start),
                                               &mod->printk_index_size);
#endif

#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
        mod->static_call_sites = section_objs(info, ".static_call_sites",
                                              sizeof(*mod->static_call_sites),
                                              &mod->num_static_call_sites);
#endif

#if IS_ENABLED(CONFIG_KUNIT)
        mod->kunit_suites = section_objs(info, ".kunit_test_suites",
                                         sizeof(*mod->kunit_suites),
                                         &mod->num_kunit_suites);
        mod->kunit_init_suites = section_objs(info, ".kunit_init_test_suites",
                                              sizeof(*mod->kunit_init_suites),
                                              &mod->num_kunit_init_suites);
#endif

        mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable),
                                    &mod->num_exentries);

        if (section_addr(info, "__obsparm"))
                pr_warn("%s: Ignoring obsolete parameters\n", mod->name);

#ifdef CONFIG_DYNAMIC_DEBUG_CORE
        mod->dyndbg_info.descs = section_objs(info, "__dyndbg",
                                              sizeof(*mod->dyndbg_info.descs),
                                              &mod->dyndbg_info.num_descs);
        mod->dyndbg_info.classes = section_objs(info, "__dyndbg_classes",
                                                sizeof(*mod->dyndbg_info.classes),
                                                &mod->dyndbg_info.num_classes);
#endif

        return 0;
}

/*
 * Allocate the memory regions of @mod and move the module's sections from
 * the temporary image into them.
 *
 * Every memory type with a non-zero size is allocated with
 * module_memory_alloc(); types with size 0 get a NULL base.  Each SHF_ALLOC
 * section is then copied to the region and offset encoded in its
 * sh_entsize (SHT_NOBITS sections only get an address, nothing is copied)
 * and its sh_addr is repointed at the final location.
 *
 * Returns 0 on success, the error from module_memory_alloc() if an
 * allocation fails, or -ENOEXEC if the struct module section has an
 * unexpected size.  On any failure every region set up so far is released
 * again with module_memory_free().
 */
static int move_module(struct module *mod, struct load_info *info)
{
        int i;
        /*
         * Memory types [0, t) have been set up and must be released on
         * failure.  This is advanced after every type handled, so that a
         * failure *after* the allocation loop (the -ENOEXEC case below)
         * unwinds all regions instead of leaking them.
         */
        enum mod_mem_type t = 0;
        int ret = -ENOMEM;

        for_each_mod_mem_type(type) {
                if (!mod->mem[type].size) {
                        mod->mem[type].base = NULL;
                } else {
                        ret = module_memory_alloc(mod, type);
                        if (ret)
                                goto out_free;
                }
                t = type + 1;
        }

        /* Transfer each section which specifies SHF_ALLOC */
        pr_debug("Final section addresses for %s:\n", mod->name);
        for (i = 0; i < info->hdr->e_shnum; i++) {
                void *dest;
                Elf_Shdr *shdr = &info->sechdrs[i];
                enum mod_mem_type type;

                if (!(shdr->sh_flags & SHF_ALLOC))
                        continue;

                /* sh_entsize carries both the memory type and the offset. */
                type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
                dest = mod->mem[type].base + (shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK);

                if (shdr->sh_type != SHT_NOBITS) {
                        /*
                         * Our ELF checker already validated this, but let's
                         * be pedantic and make the goal clearer. We actually
                         * end up copying over all modifications made to the
                         * userspace copy of the entire struct module.
                         */
                        if (i == info->index.mod &&
                           (WARN_ON_ONCE(shdr->sh_size != sizeof(struct module)))) {
                                ret = -ENOEXEC;
                                goto out_free;
                        }
                        memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
                }
                /*
                 * Update the userspace copy's ELF section address to point to
                 * our newly allocated memory as a pure convenience so that
                 * users of info can keep taking advantage and using the newly
                 * minted official memory area.
                 */
                shdr->sh_addr = (unsigned long)dest;
                pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr,
                         (long)shdr->sh_size, info->secstrings + shdr->sh_name);
        }

        return 0;

out_free:
        /*
         * Release in reverse order.  Written so that it does not depend on
         * the signedness the compiler picks for enum mod_mem_type.
         */
        while (t > 0)
                module_memory_free(mod, --t, true);
        return ret;
}

/*
 * With CONFIG_MODVERSIONS, a module that exports symbols (plain or GPL)
 * without the matching CRC table may only be loaded if try_to_force_load()
 * agrees; its result is returned in that case.  Otherwise returns 0.
 */
static int check_export_symbol_versions(struct module *mod)
{
#ifdef CONFIG_MODVERSIONS
        bool syms_unversioned = mod->num_syms && !mod->crcs;
        bool gpl_syms_unversioned = mod->num_gpl_syms && !mod->gpl_crcs;

        if (syms_unversioned || gpl_syms_unversioned)
                return try_to_force_load(mod,
                                         "no versions for exported symbols");
#endif
        return 0;
}

/*
 * Flush the instruction cache over every non-empty memory region of @mod,
 * since we've played with text.  Do it before processing of module
 * parameters, so the module can provide parameter accessor functions of
 * its own.
 */
static void flush_module_icache(const struct module *mod)
{
        for_each_mod_mem_type(type) {
                const struct module_memory *mod_mem = &mod->mem[type];
                unsigned long start;

                if (!mod_mem->size)
                        continue;

                start = (unsigned long)mod_mem->base;
                flush_icache_range(start, start + mod_mem->size);
        }
}

/*
 * Extra check of the ELF header @hdr of a module.
 *
 * This is the weak default, used when no other definition of the symbol is
 * linked in: it accepts every header.
 */
bool __weak module_elf_check_arch(Elf_Ehdr *hdr)
{
        return true;
}

/*
 * Default section fix-up hook: leaves the headers, section table and
 * string table untouched and reports success (0).  The definition is
 * __weak.  layout_and_allocate() calls this first and aborts on a negative
 * return value.
 */
int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
                                     Elf_Shdr *sechdrs,
                                     char *secstrings,
                                     struct module *mod)
{
        return 0;
}

/* module_blacklist is a comma-separated list of module names */
static char *module_blacklist;

/*
 * Return true when @module_name matches one entry of module_blacklist
 * exactly (same length, same bytes).  A NULL or empty list matches nothing.
 */
static bool blacklisted(const char *module_name)
{
        const char *entry = module_blacklist;

        if (!entry)
                return false;

        while (*entry) {
                size_t entry_len = strcspn(entry, ",");

                if (strlen(module_name) == entry_len &&
                    memcmp(module_name, entry, entry_len) == 0)
                        return true;

                /* Move past this entry and the comma that ends it, if any. */
                entry += entry_len;
                if (*entry == ',')
                        entry++;
        }
        return false;
}
core_param(module_blacklist, module_blacklist, charp, 0400);

/*
 * Work out the final layout of the module described by @info, allocate its
 * memory and move it there.
 *
 * Returns the struct module inside the newly allocated memory, or an
 * ERR_PTR() carrying the error of the step that failed.
 */
static struct module *layout_and_allocate(struct load_info *info, int flags)
{
        struct module *mod;
        unsigned int sec;
        int ret;

        /* Allow arches to frob section contents and sizes.  */
        ret = module_frob_arch_sections(info->hdr, info->sechdrs,
                                        info->secstrings, info->mod);
        if (ret < 0)
                return ERR_PTR(ret);

        ret = module_enforce_rwx_sections(info->hdr, info->sechdrs,
                                          info->secstrings, info->mod);
        if (ret < 0)
                return ERR_PTR(ret);

        /* The per-cpu section gets its own special allocation later on. */
        info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;

        /*
         * Tag the ro_after_init section with SHF_RO_AFTER_INIT so that
         * layout_sections() can place it correctly.
         * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
         */
        sec = find_sec(info, ".data..ro_after_init");
        if (sec)
                info->sechdrs[sec].sh_flags |= SHF_RO_AFTER_INIT;

        /*
         * __jump_table is treated as ro_after_init too: its entries are
         * never modified, except for those referring to code in the __init
         * section, which are annotated as such at module load time.
         */
        sec = find_sec(info, "__jump_table");
        if (sec)
                info->sechdrs[sec].sh_flags |= SHF_RO_AFTER_INIT;

        /*
         * Compute total sizes and store offsets in sh_entsize.  This is
         * done generically for now; no architecture appears to need a
         * special case.
         */
        layout_sections(info->mod, info);
        layout_symtab(info->mod, info);

        /* Allocate the final memory and copy the sections into it. */
        ret = move_module(info->mod, info);
        if (ret)
                return ERR_PTR(ret);

        /* The module now lives at its final address: hand that copy back. */
        mod = (void *)info->sechdrs[info->index.mod].sh_addr;
        kmemleak_load_module(mod, info);
        return mod;
}

/*
 * Release the memory of a module whose load failed: its per-cpu area, any
 * arch-specific init-time state, and finally the module memory itself.
 * @info is not used here.
 *
 * mod is no longer valid after this!
 */
static void module_deallocate(struct module *mod, struct load_info *info)
{
        percpu_modfree(mod);
        module_arch_freeing_init(mod);

        free_mod_mem(mod, true);
}

/*
 * Default arch-specific finalize hook: does nothing and returns 0.  The
 * definition is __weak.  post_relocation() calls this as its last step and
 * returns its result.
 */
int __weak module_finalize(const Elf_Ehdr *hdr,
                           const Elf_Shdr *sechdrs,
                           struct module *me)
{
        return 0;
}

static int post_relocation(struct module *mod, const struct load_info *info)
{
        /* Sort exception table now relocations are done. */
        sort_extable(mod->extable, mod->extable + mod->num_exentries);

        /* Copy relocated percpu area over. */
        percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
                       info->sechdrs[info->index.pcpu].sh_size);

        /* Setup kallsyms-specific fields. */
        add_kallsyms(mod, info);

        /* Arch-specific module finalizing. */
        return module_finalize(info->hdr, info->sechdrs, mod);
}

/*
 * Run the module's constructors in table order.  Compiles to nothing
 * without CONFIG_CONSTRUCTORS.
 */
static void do_mod_ctors(struct module *mod)
{
#ifdef CONFIG_CONSTRUCTORS
        unsigned long idx = 0;

        while (idx < mod->num_ctors) {
                mod->ctors[idx]();
                idx++;
        }
#endif
}

/*
 * For freeing module_init on success, in case kallsyms traversing.
 *
 * do_init_module() fills one of these with the base addresses of the
 * module's three init memory regions and queues it on init_free_list;
 * do_free_init() later frees the regions and the record itself.
 */
struct mod_initfree {
        struct llist_node node;         /* link in init_free_list */
        void *init_text;                /* base of MOD_INIT_TEXT */
        void *init_data;                /* base of MOD_INIT_DATA */
        void *init_rodata;              /* base of MOD_INIT_RODATA */
};

/*
 * Work function that frees the init memory of successfully loaded modules.
 *
 * Takes everything currently queued on init_free_list, waits for one RCU
 * grace period, then frees the three init regions of each record followed
 * by the record itself.
 */
static void do_free_init(struct work_struct *w)
{
        struct llist_node *pending = llist_del_all(&init_free_list);
        struct llist_node *cur, *next;

        synchronize_rcu();

        llist_for_each_safe(cur, next, pending) {
                struct mod_initfree *rec =
                        container_of(cur, struct mod_initfree, node);

                execmem_free(rec->init_text);
                execmem_free(rec->init_data);
                execmem_free(rec->init_rodata);
                kfree(rec);
        }
}

/*
 * Flush init_free_wq, the work item do_init_module() schedules to free
 * module init memory.
 */
void flush_module_init_free_work(void)
{
        flush_work(&init_free_wq);
}

/* Redefine the parameter prefix to "module." for the parameter below. */
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "module."
/*
 * Default value for module->async_probe_requested: load_module() copies it
 * into each module before that module's arguments are parsed.
 */
static bool async_probe;
module_param(async_probe, bool, 0644);

/*
 * This is where the real work happens.
 *
 * Runs the module's constructors and init routine, moves the module to
 * MODULE_STATE_LIVE, announces it, and queues its init memory for freeing.
 *
 * Returns 0 on success.  On failure (allocation failure or a negative
 * return from the module's init routine) the module is torn down with
 * free_module() and the negative error is returned; @mod must not be used
 * afterwards.  A positive return from the init routine is only warned
 * about.
 *
 * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
 * helper command 'lx-symbols'.
 */
static noinline int do_init_module(struct module *mod)
{
        int ret = 0;
        struct mod_initfree *freeinit;
#if defined(CONFIG_MODULE_STATS)
        unsigned int text_size = 0, total_size = 0;

        for_each_mod_mem_type(type) {
                const struct module_memory *mod_mem = &mod->mem[type];
                if (mod_mem->size) {
                        total_size += mod_mem->size;
                        if (type == MOD_TEXT || type == MOD_INIT_TEXT)
                                text_size += mod_mem->size;
                }
        }
#endif

        /*
         * Allocate the record for freeing init memory up front, while
         * failing is still harmless, and capture the init region bases
         * before they are cleared below.
         */
        freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
        if (!freeinit) {
                ret = -ENOMEM;
                goto fail;
        }
        freeinit->init_text = mod->mem[MOD_INIT_TEXT].base;
        freeinit->init_data = mod->mem[MOD_INIT_DATA].base;
        freeinit->init_rodata = mod->mem[MOD_INIT_RODATA].base;

        do_mod_ctors(mod);
        /* Start the module */
        if (mod->init != NULL)
                ret = do_one_initcall(mod->init);
        if (ret < 0) {
                goto fail_free_freeinit;
        }
        if (ret > 0) {
                pr_warn("%s: '%s'->init suspiciously returned %d, it should "
                        "follow 0/-E convention\n"
                        "%s: loading module anyway...\n",
                        __func__, mod->name, ret, __func__);
                dump_stack();
        }

        /* Now it's a first class citizen! */
        mod->state = MODULE_STATE_LIVE;
        blocking_notifier_call_chain(&module_notify_list,
                                     MODULE_STATE_LIVE, mod);

        /* Delay uevent until module has finished its init routine */
        kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);

        /*
         * We need to finish all async code before the module init sequence
         * is done. This has potential to deadlock if synchronous module
         * loading is requested from async (which is not allowed!).
         *
         * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous
         * request_module() from async workers") for more details.
         */
        if (!mod->async_probe_requested)
                async_synchronize_full();

        ftrace_free_mem(mod, mod->mem[MOD_INIT_TEXT].base,
                        mod->mem[MOD_INIT_TEXT].base + mod->mem[MOD_INIT_TEXT].size);
        mutex_lock(&module_mutex);
        /* Drop initial reference. */
        module_put(mod);
        trim_init_extable(mod);
#ifdef CONFIG_KALLSYMS
        /* Switch to core kallsyms now init is done: kallsyms may be walking! */
        rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
#endif
        /*
         * The init routine has succeeded, the module is LIVE and its
         * initial reference is gone, so it is too late to cancel the load:
         * going through the failure path here would drop the reference a
         * second time and free a module that is already in use.  If the
         * protection cannot be applied, all that is left is to tell the
         * user.
         */
        ret = module_enable_rodata_ro(mod, true);
        if (ret)
                pr_warn("%s: module_enable_rodata_ro() returned %d, "
                        "ro_after_init data might still be writable\n",
                        mod->name, ret);

        mod_tree_remove_init(mod);
        module_arch_freeing_init(mod);
        for_class_mod_mem_type(type, init) {
                mod->mem[type].base = NULL;
                mod->mem[type].size = 0;
        }

#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
        /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */
        mod->btf_data = NULL;
#endif
        /*
         * We want to free module_init, but be aware that kallsyms may be
         * walking this with preempt disabled.  In all the failure paths, we
         * call synchronize_rcu(), but we don't want to slow down the success
         * path. execmem_free() cannot be called in an interrupt, so do the
         * work and call synchronize_rcu() in a work queue.
         *
         * Note that execmem_alloc() on most architectures creates W+X page
         * mappings which won't be cleaned up until do_free_init() runs.  Any
         * code such as mark_rodata_ro() which depends on those mappings to
         * be cleaned up needs to sync with the queued work by invoking
         * flush_module_init_free_work().
         */
        if (llist_add(&freeinit->node, &init_free_list))
                schedule_work(&init_free_wq);

        mutex_unlock(&module_mutex);
        wake_up_all(&module_wq);

        mod_stat_add_long(text_size, &total_text_size);
        mod_stat_add_long(total_size, &total_mod_size);

        mod_stat_inc(&modcount);

        return 0;

fail_free_freeinit:
        kfree(freeinit);
fail:
        /* Try to protect us from buggy refcounters. */
        mod->state = MODULE_STATE_GOING;
        synchronize_rcu();
        module_put(mod);
        blocking_notifier_call_chain(&module_notify_list,
                                     MODULE_STATE_GOING, mod);
        klp_module_going(mod);
        ftrace_release_mod(mod);
        free_module(mod);
        wake_up_all(&module_wq);

        return ret;
}

static int may_init_module(void)
{
        if (!capable(CAP_SYS_MODULE) || modules_disabled)
                return -EPERM;

        return 0;
}

/*
 * Is the module called @name done loading?  No locks held on entry.
 *
 * "Done" means that no such module exists any more, or that it has reached
 * MODULE_STATE_LIVE or MODULE_STATE_GOING.
 */
static bool finished_loading(const char *name)
{
        struct module *mod;
        bool done = true;

        /*
         * module_mutex should rarely be contended; if we do sleep here now
         * and then, wait_event_interruptible() merely goes around one more
         * time, which is harmless.
         */
        sched_annotate_sleep();
        mutex_lock(&module_mutex);
        mod = find_module_all(name, strlen(name), true);
        if (mod)
                done = mod->state == MODULE_STATE_LIVE ||
                       mod->state == MODULE_STATE_GOING;
        mutex_unlock(&module_mutex);

        return done;
}

/*
 * Check whether a module called @name is already known, waiting for a load
 * that is still in progress to finish first.
 *
 * Must be called with module_mutex held.  The mutex is dropped while
 * waiting and taken again before returning, so the caller still holds it
 * on return.
 *
 * Returns 0 if no such module exists, the error from the interrupted wait,
 * -EEXIST if the module is live after any wait, and -EBUSY otherwise.
 * @reason is recorded through try_add_failed_module() whenever a module of
 * that name was found.
 */
static int module_patient_check_exists(const char *name,
                                       enum fail_dup_mod_reason reason)
{
        struct module *old;
        int err = 0;

        old = find_module_all(name, strlen(name), true);
        if (old == NULL)
                return 0;

        if (old->state == MODULE_STATE_COMING ||
            old->state == MODULE_STATE_UNFORMED) {
                /* Wait in case it fails to load. */
                mutex_unlock(&module_mutex);
                err = wait_event_interruptible(module_wq,
                                       finished_loading(name));
                mutex_lock(&module_mutex);
                if (err)
                        return err;

                /* The module might have gone in the meantime. */
                old = find_module_all(name, strlen(name), true);
        }

        if (try_add_failed_module(name, reason))
                pr_warn("Could not add fail-tracking for module: %s\n", name);

        /*
         * We are here only when the same module was being loaded. Do
         * not try to load it again right now. It prevents long delays
         * caused by serialized module load failures. It might happen
         * when more devices of the same type trigger load of
         * a particular module.
         */
        if (old && old->state == MODULE_STATE_LIVE)
                return -EEXIST;
        return -EBUSY;
}

/*
 * Put the module on the list early, in MODULE_STATE_UNFORMED, so that we
 * know it is unique before committing too many resources to it -- in
 * particular to avoid temporary percpu memory exhaustion.
 *
 * Returns 0 once the module is linked in, or the error reported by
 * module_patient_check_exists() when a module of that name is in the way.
 */
static int add_unformed_module(struct module *mod)
{
        int ret;

        mod->state = MODULE_STATE_UNFORMED;

        mutex_lock(&module_mutex);
        ret = module_patient_check_exists(mod->name, FAIL_DUP_MOD_LOAD);
        if (!ret) {
                mod_update_bounds(mod);
                list_add_rcu(&mod->list, &modules);
                mod_tree_insert(mod);
        }
        mutex_unlock(&module_mutex);

        return ret;
}

/*
 * Final step of forming a module, done entirely under module_mutex: verify
 * its exported symbols, finalize the bug and CFI data, apply the memory
 * protections, and move the module to MODULE_STATE_COMING.
 *
 * Returns 0 on success.  On failure the error of the failing step is
 * returned and the state is left unchanged; if a protection step failed,
 * the bug-table registration is undone first.
 */
static int complete_formation(struct module *mod, struct load_info *info)
{
        int err;

        mutex_lock(&module_mutex);

        /* Find duplicate symbols (must be called under lock). */
        err = verify_exported_symbols(mod);
        if (err < 0)
                goto out;

        /* These rely on module_mutex for list integrity. */
        module_bug_finalize(info->hdr, info->sechdrs, mod);
        module_cfi_finalize(info->hdr, info->sechdrs, mod);

        err = module_enable_rodata_ro(mod, false);
        if (err)
                goto out_strict_rwx;
        err = module_enable_data_nx(mod);
        if (err)
                goto out_strict_rwx;
        err = module_enable_text_rox(mod);
        if (err)
                goto out_strict_rwx;

        /*
         * Mark state as coming so strong_try_module_get() ignores us,
         * but kallsyms etc. can see us.
         */
        mod->state = MODULE_STATE_COMING;
        mutex_unlock(&module_mutex);

        return 0;

out_strict_rwx:
        /* Undo module_bug_finalize(); still under module_mutex here. */
        module_bug_cleanup(mod);
out:
        mutex_unlock(&module_mutex);
        return err;
}

/*
 * Announce a module that has reached MODULE_STATE_COMING: enable ftrace
 * for it, tell livepatch, then run the module notifier chain.
 *
 * Returns 0 on success or a negative errno.  If the notifier chain fails,
 * the livepatch "coming" step is undone before returning.
 */
static int prepare_coming_module(struct module *mod)
{
        int ret;

        ftrace_module_enable(mod);

        ret = klp_module_coming(mod);
        if (ret)
                return ret;

        ret = notifier_to_errno(
                blocking_notifier_call_chain_robust(&module_notify_list,
                                                    MODULE_STATE_COMING,
                                                    MODULE_STATE_GOING, mod));
        if (ret)
                klp_module_going(mod);

        return ret;
}

/*
 * parse_args() callback for parameters the module itself does not declare.
 * @arg is the struct module being loaded.
 *
 * "async_probe" sets mod->async_probe_requested (to true when the value
 * does not parse as a boolean).  Everything else is offered to dynamic
 * debug; if that does not return 0, a warning is printed.  Always returns
 * 0, so an unknown parameter never fails the load.
 */
static int unknown_module_param_cb(char *param, char *val, const char *modname,
                                   void *arg)
{
        struct module *mod = arg;

        if (!strcmp(param, "async_probe")) {
                if (kstrtobool(val, &mod->async_probe_requested))
                        mod->async_probe_requested = true;
                return 0;
        }

        /* Check for magic 'dyndbg' arg */
        if (ddebug_dyndbg_module_param_cb(param, val, modname) != 0)
                pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);

        return 0;
}

/*
 * Early checks on the module while it is still in the temporary copy; this
 * doesn't do any allocation.
 *
 * Returns 0 when loading may go on, -EPERM for a blacklisted module,
 * -ENOEXEC when the module struct version check fails, or the error of
 * whichever other check failed.
 */
static int early_mod_check(struct load_info *info, int flags)
{
        int ret;

        /*
         * The module name is known to be correct by now, so this is the
         * place to check it against the blacklist.
         */
        if (blacklisted(info->name)) {
                pr_err("Module %s is blacklisted\n", info->name);
                return -EPERM;
        }

        ret = rewrite_section_headers(info, flags);
        if (ret)
                return ret;

        /* Check the module struct version before the module is used. */
        if (!check_modstruct_version(info, info->mod))
                return -ENOEXEC;

        ret = check_modinfo(info->mod, info, flags);
        if (ret)
                return ret;

        /* Finally, refuse a module that is already loaded or loading. */
        mutex_lock(&module_mutex);
        ret = module_patient_check_exists(info->mod->name,
                                          FAIL_DUP_MOD_BECOMING);
        mutex_unlock(&module_mutex);

        return ret;
}

/*
 * Allocate and load the module: note that size of section 0 is always
 * zero, and we rely on this for optional sections.
 *
 * @info describes the temporary copy of the module image, @uargs is the
 * user-space argument string, and @flags are the load flags passed on to
 * the individual checks.
 *
 * On success the temporary copy is freed and the result of
 * do_init_module() is returned.  On failure the labels at the bottom unwind
 * whatever had been set up, in reverse order, the temporary copy is freed,
 * and the error is returned.
 */
static int load_module(struct load_info *info, const char __user *uargs,
                       int flags)
{
        struct module *mod;
        bool module_allocated = false;
        long err = 0;
        char *after_dashes;

        /*
         * Do the signature check (if any) first. All that
         * the signature check needs is info->len, it does
         * not need any of the section info. That can be
         * set up later. This will minimize the chances
         * of a corrupt module causing problems before
         * we even get to the signature check.
         *
         * The check will also adjust info->len by stripping
         * off the sig length at the end of the module, making
         * checks against info->len more correct.
         */
        err = module_sig_check(info, flags);
        if (err)
                goto free_copy;

        /*
         * Do basic sanity checks against the ELF header and
         * sections. Cache useful sections and set the
         * info->mod to the userspace passed struct module.
         */
        err = elf_validity_cache_copy(info, flags);
        if (err)
                goto free_copy;

        err = early_mod_check(info, flags);
        if (err)
                goto free_copy;

        /* Figure out module layout, and allocate all the memory. */
        mod = layout_and_allocate(info, flags);
        if (IS_ERR(mod)) {
                err = PTR_ERR(mod);
                goto free_copy;
        }

        /* From here on a failure is counted as "invalid", not "becoming". */
        module_allocated = true;

        audit_log_kern_module(mod->name);

        /* Reserve our place in the list. */
        err = add_unformed_module(mod);
        if (err)
                goto free_module;

        /*
         * We are tainting your kernel if your module gets into
         * the modules linked list somehow.
         */
        module_augment_kernel_taints(mod, info);

        /* To avoid stressing percpu allocator, do this once we're unique. */
        err = percpu_modalloc(mod, info);
        if (err)
                goto unlink_mod;

        /* Now module is in final location, initialize linked lists, etc. */
        err = module_unload_init(mod);
        if (err)
                goto unlink_mod;

        init_param_lock(mod);

        /*
         * Now we've got everything in the final locations, we can
         * find optional sections.
         */
        err = find_module_sections(mod, info);
        if (err)
                goto free_unload;

        err = check_export_symbol_versions(mod);
        if (err)
                goto free_unload;

        /* Set up MODINFO_ATTR fields */
        setup_modinfo(mod, info);

        /* Fix up syms, so that st_value is a pointer to location. */
        err = simplify_symbols(mod, info);
        if (err < 0)
                goto free_modinfo;

        err = apply_relocations(mod, info);
        if (err < 0)
                goto free_modinfo;

        err = post_relocation(mod, info);
        if (err < 0)
                goto free_modinfo;

        flush_module_icache(mod);

        /* Now copy in args */
        mod->args = strndup_user(uargs, ~0UL >> 1);
        if (IS_ERR(mod->args)) {
                err = PTR_ERR(mod->args);
                goto free_arch_cleanup;
        }

        init_build_id(mod, info);

        /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
        ftrace_module_init(mod);

        /* Finally it's fully formed, ready to start executing. */
        err = complete_formation(mod, info);
        if (err)
                goto ddebug_cleanup;

        err = prepare_coming_module(mod);
        if (err)
                goto bug_cleanup;

        /* Start from the global default; parsing the args may override it. */
        mod->async_probe_requested = async_probe;

        /* Module is ready to execute: parsing args may do that. */
        after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
                                  -32768, 32767, mod,
                                  unknown_module_param_cb);
        if (IS_ERR(after_dashes)) {
                err = PTR_ERR(after_dashes);
                goto coming_cleanup;
        } else if (after_dashes) {
                pr_warn("%s: parameters '%s' after `--' ignored\n",
                       mod->name, after_dashes);
        }

        /* Link in to sysfs. */
        err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
        if (err < 0)
                goto coming_cleanup;

        if (is_livepatch_module(mod)) {
                err = copy_module_elf(mod, info);
                if (err < 0)
                        goto sysfs_cleanup;
        }

        /* Get rid of temporary copy. */
        free_copy(info, flags);

        codetag_load_module(mod);

        /* Done! */
        trace_module_load(mod);

        return do_init_module(mod);

        /* Error unwinding: each label undoes one stage and falls through. */
 sysfs_cleanup:
        mod_sysfs_teardown(mod);
 coming_cleanup:
        mod->state = MODULE_STATE_GOING;
        destroy_params(mod->kp, mod->num_kp);
        blocking_notifier_call_chain(&module_notify_list,
                                     MODULE_STATE_GOING, mod);
        klp_module_going(mod);
 bug_cleanup:
        mod->state = MODULE_STATE_GOING;
        /* module_bug_cleanup needs module_mutex protection */
        mutex_lock(&module_mutex);
        module_bug_cleanup(mod);
        mutex_unlock(&module_mutex);

 ddebug_cleanup:
        ftrace_release_mod(mod);
        synchronize_rcu();
        kfree(mod->args);
 free_arch_cleanup:
        module_arch_cleanup(mod);
 free_modinfo:
        free_modinfo(mod);
 free_unload:
        module_unload_free(mod);
 unlink_mod:
        mutex_lock(&module_mutex);
        /* Unlink carefully: kallsyms could be walking list. */
        list_del_rcu(&mod->list);
        mod_tree_remove(mod);
        wake_up_all(&module_wq);
        /* Wait for RCU-sched synchronizing before releasing mod->list. */
        synchronize_rcu();
        mutex_unlock(&module_mutex);
 free_module:
        mod_stat_bump_invalid(info, flags);
        /* Free lock-classes; relies on the preceding sync_rcu() */
        for_class_mod_mem_type(type, core_data) {
                lockdep_free_key_range(mod->mem[type].base,
                                       mod->mem[type].size);
        }

        module_deallocate(mod, info);
 free_copy:
        /*
         * The info->len is always set. We distinguish between
         * failures once the proper module was allocated and
         * before that.
         */
        if (!module_allocated)
                mod_stat_bump_becoming(info, flags);
        free_copy(info, flags);
        return err;
}

/*
 * init_module() system call: load a module from the @len bytes of image at
 * user address @umod, with the argument string @uargs.
 *
 * Returns the permission error, the error from copying the image in (which
 * is also counted in the module statistics), or the result of
 * load_module().
 */
SYSCALL_DEFINE3(init_module, void __user *, umod,
                unsigned long, len, const char __user *, uargs)
{
        struct load_info info = { };
        int ret = may_init_module();

        if (ret)
                return ret;

        pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
               umod, len, uargs);

        ret = copy_module_from_user(umod, len, &info);
        if (!ret)
                return load_module(&info, uargs, 0);

        /* The image could not be read: account for the failure. */
        mod_stat_inc(&failed_kreads);
        mod_stat_add_long(len, &invalid_kread_bytes);
        return ret;
}

/*
 * One in-flight request keyed by @cookie.  idempotent() initializes the
 * entry and adds it to idem_hash; idempotent_complete() removes every entry
 * with the same cookie, stores the result in @ret and signals @complete.
 */
struct idempotent {
        const void *cookie;             /* key; compared by pointer value */
        struct hlist_node entry;        /* link in an idem_hash bucket */
        struct completion complete;     /* signalled by idempotent_complete() */
        int ret;                        /* result filled in on completion */
};

/*
 * Hash table of in-flight struct idempotent entries, with
 * 1 << IDEM_HASH_BITS buckets indexed by hash_ptr() of the cookie.
 * idempotent() and idempotent_complete() take idem_lock around every
 * access to it.
 */
#define IDEM_HASH_BITS 8
static struct hlist_head idem_hash[1 << IDEM_HASH_BITS];
static DEFINE_SPINLOCK(idem_lock);

static bool idempotent(struct idempotent *u, const void *cookie)
{
        int hash = hash_ptr(cookie, IDEM_HASH_BITS);
        struct hlist_head *head = idem_hash + hash;
        struct idempotent *existing;
        bool first;

        u->ret = 0;
        u->cookie = cookie;
        init_completion(&u->complete);

        spin_lock(&idem_lock);
        first = true;
        hlist_for_each_entry(existing, head, entry) {
                if (existing->cookie != cookie)
                        continue;
                first = false;
                break;
        }
        hlist_add_head(&u->entry, idem_hash + hash);
        spin_unlock(&idem_lock);

        return !first;
}

/*
 * Called by the request that was first on the list for its cookie, once it
 * has carried out the operation.  Every entry with that cookie -- our own
 * included -- is taken off the list, given the return value @ret, and
 * completed.
 *
 * Returns @ret unchanged.
 */
static int idempotent_complete(struct idempotent *u, int ret)
{
        const void *cookie = u->cookie;
        int hash = hash_ptr(cookie, IDEM_HASH_BITS);
        struct hlist_head *bucket = idem_hash + hash;
        struct idempotent *waiter;
        struct hlist_node *tmp;

        spin_lock(&idem_lock);
        hlist_for_each_entry_safe(waiter, tmp, bucket, entry) {
                if (waiter->cookie == cookie) {
                        hlist_del(&waiter->entry);
                        waiter->ret = ret;
                        complete(&waiter->complete);
                }
        }
        spin_unlock(&idem_lock);

        return ret;
}

/*
 * Read a module image from file @f and load it.
 *
 * With MODULE_INIT_COMPRESSED_FILE the file contents are decompressed into
 * @info first and the compressed buffer is freed; otherwise the buffer
 * read from the file is used as the image directly.
 *
 * Returns the read or decompression error (both are counted in the module
 * statistics), or the result of load_module().
 */
static int init_module_from_file(struct file *f, const char __user * uargs, int flags)
{
        struct load_info info = { };
        void *buf = NULL;
        int len;
        int err;

        len = kernel_read_file(f, 0, &buf, INT_MAX, NULL, READING_MODULE);
        if (len < 0) {
                mod_stat_inc(&failed_kreads);
                return len;
        }

        if (!(flags & MODULE_INIT_COMPRESSED_FILE)) {
                info.hdr = buf;
                info.len = len;
                return load_module(&info, uargs, flags);
        }

        err = module_decompress(&info, buf, len);
        vfree(buf); /* compressed data is no longer needed */
        if (err) {
                mod_stat_inc(&failed_decompress);
                mod_stat_add_long(len, &invalid_decompress_bytes);
                return err;
        }

        return load_module(&info, uargs, flags);
}

/*
 * Load a module from file @f, sharing the work between concurrent requests
 * for the same inode: the first request performs the load and completes
 * the others, which only wait for it and return its result.
 *
 * Returns -EBADF when @f is NULL or not opened for reading.
 */
static int idempotent_init_module(struct file *f, const char __user * uargs, int flags)
{
        struct idempotent idem;
        int ret;

        if (!f || !(f->f_mode & FMODE_READ))
                return -EBADF;

        if (!idempotent(&idem, file_inode(f))) {
                /* We are first: do the load and complete everybody. */
                ret = init_module_from_file(f, uargs, flags);
                return idempotent_complete(&idem, ret);
        }

        /* Somebody else is already doing it: wait for their result. */
        wait_for_completion(&idem.complete);
        return idem.ret;
}

/*
 * finit_module() system call: load a module from the open file @fd with
 * the argument string @uargs.
 *
 * Returns the permission error, -EINVAL when @flags has a bit set other
 * than the three MODULE_INIT_* flags accepted here, or the result of
 * idempotent_init_module().
 */
SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
{
        const int known_flags = MODULE_INIT_IGNORE_MODVERSIONS
                                |MODULE_INIT_IGNORE_VERMAGIC
                                |MODULE_INIT_COMPRESSED_FILE;
        struct fd f;
        int ret;

        ret = may_init_module();
        if (ret)
                return ret;

        pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);

        if (flags & ~known_flags)
                return -EINVAL;

        f = fdget(fd);
        ret = idempotent_init_module(f.file, uargs, flags);
        fdput(f);

        return ret;
}

/*
 * Format the flags of @mod into @buf as a NUL-terminated string and return
 * @buf.
 *
 * The string is empty unless the module is tainted, or @show_state is set
 * and the module is going or coming.  Otherwise it is "(", the taint
 * characters, then -- only with @show_state -- "-" for a module being
 * unloaded or "+" for one being loaded, and ")".
 *
 * Must not be called on a MODULE_STATE_UNFORMED module.
 *
 * Keep in sync with MODULE_FLAGS_BUF_SIZE !!!
 */
char *module_flags(struct module *mod, char *buf, bool show_state)
{
        char *cur = buf;

        BUG_ON(mod->state == MODULE_STATE_UNFORMED);

        if ((mod->taints || show_state) &&
            (mod->taints ||
             mod->state == MODULE_STATE_GOING ||
             mod->state == MODULE_STATE_COMING)) {
                *cur++ = '(';
                cur += module_flags_taint(mod->taints, cur);
                /* Show a - for module-is-being-unloaded */
                if (mod->state == MODULE_STATE_GOING && show_state)
                        *cur++ = '-';
                /* Show a + for module-is-being-loaded */
                if (mod->state == MODULE_STATE_COMING && show_state)
                        *cur++ = '+';
                *cur++ = ')';
        }
        *cur = '\0';

        return buf;
}

/*
 * Given an address, look for it in the exception table of the module that
 * contains it.  Returns the matching entry, or NULL when no module
 * contains @addr, the module has no exception entries, or none matches.
 */
const struct exception_table_entry *search_module_extables(unsigned long addr)
{
        const struct exception_table_entry *entry = NULL;
        struct module *mod;

        preempt_disable();
        mod = __module_address(addr);
        if (mod && mod->num_exentries)
                entry = search_extable(mod->extable,
                                       mod->num_exentries,
                                       addr);
        preempt_enable();

        /*
         * If an entry was found we are executing inside that module right
         * now, so it cannot be unloaded and no refcount is needed.
         */
        return entry;
}

/**
 * is_module_address() - is this address inside a module?
 * @addr: the address to check.
 *
 * See is_module_text_address() if you simply want to see if the address
 * is code (not data).
 */
bool is_module_address(unsigned long addr)
{
        struct module *owner;

        /* The lookup requires preemption to be off (or the module mutex). */
        preempt_disable();
        owner = __module_address(addr);
        preempt_enable();

        return owner != NULL;
}

/**
 * __module_address() - get the module which contains an address.
 * @addr: the address.
 *
 * Must be called with preempt disabled or module mutex held so that
 * module doesn't get freed during this.
 */
struct module *__module_address(unsigned long addr)
{
        struct module *mod;

        if (addr >= mod_tree.addr_min && addr <= mod_tree.addr_max)
                goto lookup;

#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
        if (addr >= mod_tree.data_addr_min && addr <= mod_tree.data_addr_max)
                goto lookup;
#endif

        return NULL;

lookup:
        module_assert_mutex_or_preempt();

        mod = mod_find(addr, &mod_tree);
        if (mod) {
                BUG_ON(!within_module(addr, mod));
                if (mod->state == MODULE_STATE_UNFORMED)
                        mod = NULL;
        }
        return mod;
}

/**
 * is_module_text_address() - is this address inside module code?
 * @addr: the address to check.
 *
 * See is_module_address() if you simply want to see if the address is
 * anywhere in a module.  See kernel_text_address() for testing if an
 * address corresponds to kernel or module code.
 */
bool is_module_text_address(unsigned long addr)
{
        struct module *owner;

        /* The lookup requires preemption to be off (or the module mutex). */
        preempt_disable();
        owner = __module_text_address(addr);
        preempt_enable();

        return owner != NULL;
}

/**
 * __module_text_address() - get the module whose code contains an address.
 * @addr: the address.
 *
 * Must be called with preempt disabled or module mutex held so that
 * module doesn't get freed during this.
 *
 * Return: the module whose MOD_TEXT or MOD_INIT_TEXT memory contains
 * @addr, or NULL.
 */
struct module *__module_text_address(unsigned long addr)
{
        struct module *mod = __module_address(addr);

        if (!mod)
                return NULL;

        /* Only an address within one of the text regions counts. */
        if (within_module_mem_type(addr, mod, MOD_TEXT) ||
            within_module_mem_type(addr, mod, MOD_INIT_TEXT))
                return mod;

        return NULL;
}

/*
 * Print the list of loaded modules with their flags, the unloaded tainted
 * modules, and the last unloaded module if there is one.
 *
 * Don't grab lock, we're oopsing.
 */
void print_modules(void)
{
        char flags_buf[MODULE_FLAGS_BUF_SIZE];
        struct module *m;

        printk(KERN_DEFAULT "Modules linked in:");

        /* Most callers already run with preemption off; make sure of it. */
        preempt_disable();
        list_for_each_entry_rcu(m, &modules, list) {
                /* Modules that are not fully formed yet are not listed. */
                if (m->state != MODULE_STATE_UNFORMED) {
                        pr_cont(" %s%s", m->name,
                                module_flags(m, flags_buf, true));
                }
        }

        print_unloaded_tainted_modules();
        preempt_enable();

        if (last_unloaded_module.name[0]) {
                pr_cont(" [last unloaded: %s%s]", last_unloaded_module.name,
                        last_unloaded_module.taints);
        }
        pr_cont("\n");
}

#ifdef CONFIG_MODULE_DEBUGFS
struct dentry *mod_debugfs_root;

static int module_debugfs_init(void)
{
        mod_debugfs_root = debugfs_create_dir("modules", NULL);
        return 0;
}
module_init(module_debugfs_init);
#endif





























































































    2 




    2 




    1 


    1 

    1 





    3 






    1 
    1 



    2 










    1 



    1 


    1 
    1 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
// SPDX-License-Identifier: GPL-2.0-or-later
/* Structure dynamic extension infrastructure
 * Copyright (C) 2004 Rusty Russell IBM Corporation
 * Copyright (C) 2007 Netfilter Core Team <coreteam@netfilter.org>
 * Copyright (C) 2007 USAGI/WIDE Project <http://www.linux-ipv6.org>
 */
#include <linux/kernel.h>
#include <linux/kmemleak.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/skbuff.h>
#include <net/netfilter/nf_conntrack_extend.h>

#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_conntrack_act_ct.h>
#include <net/netfilter/nf_nat.h>

/*
 * Minimum number of bytes nf_ct_ext_add() asks krealloc() for, even when the
 * extension area needed so far is smaller.
 */
#define NF_CT_EXT_PREALLOC        128u /* conntrack events are on by default */

/*
 * Global extension generation counter.  It starts at 1, is copied into a
 * newly allocated extension area by nf_ct_ext_add(), is compared against by
 * __nf_ct_ext_find() and is advanced by nf_ct_ext_bump_genid().  A per-area
 * gen_id of 0 is accepted by __nf_ct_ext_find() regardless of this counter.
 */
atomic_t nf_conntrack_ext_genid __read_mostly = ATOMIC_INIT(1);

/*
 * Size in bytes of each extension's private data, indexed by
 * enum nf_ct_ext_id.  nf_ct_ext_add() uses it to size the area appended for
 * a new extension.  Entries are u8, so every structure listed here must be
 * smaller than 256 bytes.  Extensions guarded by a config option only get
 * an initializer when that option is enabled.
 */
static const u8 nf_ct_ext_type_len[NF_CT_EXT_NUM] = {
        [NF_CT_EXT_HELPER] = sizeof(struct nf_conn_help),
#if IS_ENABLED(CONFIG_NF_NAT)
        [NF_CT_EXT_NAT] = sizeof(struct nf_conn_nat),
#endif
        [NF_CT_EXT_SEQADJ] = sizeof(struct nf_conn_seqadj),
        [NF_CT_EXT_ACCT] = sizeof(struct nf_conn_acct),
#ifdef CONFIG_NF_CONNTRACK_EVENTS
        [NF_CT_EXT_ECACHE] = sizeof(struct nf_conntrack_ecache),
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
        [NF_CT_EXT_TSTAMP] = sizeof(struct nf_conn_tstamp),
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
        [NF_CT_EXT_TIMEOUT] = sizeof(struct nf_conn_timeout),
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
        [NF_CT_EXT_LABELS] = sizeof(struct nf_conn_labels),
#endif
#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
        [NF_CT_EXT_SYNPROXY] = sizeof(struct nf_conn_synproxy),
#endif
#if IS_ENABLED(CONFIG_NET_ACT_CT)
        [NF_CT_EXT_ACT_CT] = sizeof(struct nf_conn_act_ct_ext),
#endif
};

/*
 * Sum of sizeof(struct nf_ct_ext) and the size of every extension structure
 * enabled in this configuration.  nf_ct_ext_add() feeds the result to
 * BUILD_BUG_ON() to check that it fits the u8 offset/length fields, so the
 * function is kept as one constant expression.
 *
 * The BUILD_BUG_ON() below fails the build once NF_CT_EXT_NUM exceeds 10, as
 * a reminder to extend the sum when a new extension type is introduced.
 */
static __always_inline unsigned int total_extension_size(void)
{
        /* remember to add new extensions below */
        BUILD_BUG_ON(NF_CT_EXT_NUM > 10);

        return sizeof(struct nf_ct_ext) +
               sizeof(struct nf_conn_help)
#if IS_ENABLED(CONFIG_NF_NAT)
                + sizeof(struct nf_conn_nat)
#endif
                + sizeof(struct nf_conn_seqadj)
                + sizeof(struct nf_conn_acct)
#ifdef CONFIG_NF_CONNTRACK_EVENTS
                + sizeof(struct nf_conntrack_ecache)
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
                + sizeof(struct nf_conn_tstamp)
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
                + sizeof(struct nf_conn_timeout)
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
                + sizeof(struct nf_conn_labels)
#endif
#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
                + sizeof(struct nf_conn_synproxy)
#endif
#if IS_ENABLED(CONFIG_NET_ACT_CT)
                + sizeof(struct nf_conn_act_ct_ext)
#endif
        ;
}

/*
 * nf_ct_ext_add - append the data area for extension @id to @ct.
 * @ct:  conntrack to extend; a WARN is raised if it is already confirmed
 * @id:  extension type, used to index nf_ct_ext_type_len[]
 * @gfp: allocation flags handed to krealloc()
 *
 * The extension blob is grown (or allocated on first use, with at least
 * NF_CT_EXT_PREALLOC bytes), the new area is placed at the next offset
 * aligned for struct nf_ct_ext and zero-filled, and its offset is recorded.
 * A freshly allocated blob also gets its offset table cleared and the
 * current global generation id stamped in.
 *
 * Returns a pointer to the new zeroed area, or NULL if the extension is
 * already present or the allocation failed.
 */
void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
{
        unsigned int area_off, area_end, used;
        struct nf_ct_ext *ext;

        /* Conntrack must not be confirmed to avoid races on reallocation. */
        WARN_ON(nf_ct_is_confirmed(ct));

        /* struct nf_ct_ext uses u8 to store offsets/size */
        BUILD_BUG_ON(total_extension_size() > 255u);

        if (!ct->ext) {
                /* Nothing allocated yet: only the header will be in use. */
                used = sizeof(*ext);
        } else if (__nf_ct_ext_exist(ct->ext, id)) {
                return NULL;
        } else {
                used = ct->ext->len;
        }

        area_off = ALIGN(used, __alignof__(struct nf_ct_ext));
        area_end = area_off + nf_ct_ext_type_len[id];

        ext = krealloc(ct->ext, max(area_end, NF_CT_EXT_PREALLOC), gfp);
        if (!ext)
                return NULL;

        /* ct->ext still holds the old pointer: NULL means first allocation. */
        if (!ct->ext) {
                memset(ext->offset, 0, sizeof(ext->offset));
                ext->gen_id = atomic_read(&nf_conntrack_ext_genid);
        }

        ext->offset[id] = area_off;
        ext->len = area_end;
        memset((void *)ext + area_off, 0, area_end - area_off);

        ct->ext = ext;
        return (void *)ext + area_off;
}
EXPORT_SYMBOL(nf_ct_ext_add);

/*
 * __nf_ct_ext_find - look up the data area of extension @id in @ext.
 * @ext: extension blob to search
 * @id:  extension type
 *
 * Use nf_ct_ext_find wrapper. This is only useful for unconfirmed entries.
 *
 * The blob's generation id is sampled once with READ_ONCE() and that single
 * snapshot is used for both checks, so the decision cannot be split across
 * two different values of ext->gen_id.  A generation id of 0 is always
 * accepted; otherwise it must equal the current global generation.
 *
 * Returns a pointer to the extension data, or NULL if the extension is not
 * present or the blob's generation does not match.
 */
void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id)
{
        unsigned int gen_id = atomic_read(&nf_conntrack_ext_genid);
        unsigned int this_id = READ_ONCE(ext->gen_id);

        if (!__nf_ct_ext_exist(ext, id))
                return NULL;

        if (this_id == 0 || this_id == gen_id)
                return (void *)ext + ext->offset[id];

        return NULL;
}
EXPORT_SYMBOL(__nf_ct_ext_find);

/*
 * Advance the global extension generation id, restarting at 1 when the
 * increment lands on UINT_MAX, then sleep before returning.
 *
 * NOTE(review): msleep() takes milliseconds while HZ is ticks per second, so
 * the length of this sleep varies with the configured HZ - confirm that is
 * what is wanted.
 */
void nf_ct_ext_bump_genid(void)
{
        unsigned int next = atomic_inc_return(&nf_conntrack_ext_genid);

        if (next == UINT_MAX)
                atomic_set(&nf_conntrack_ext_genid, 1);

        msleep(HZ);
}















































































    2 
    2 
    1 
    1 
    1 








    2 


















    2 




    2 








    2 
    2 

    2 
    2 





































    1 
    1 
    1 

    1 












    1 
    1 
















    1 
    1 



    1 




    1 
    1 


    1 


    1 


    1 



    1 





    3 
    4 


















































































































    1 

    1 






































































    1 


    1 




    1 







    1 


























    1 

    1 








































    4 


    3 

    4 


    4 

    4 


    4 
    3 

    4 












    2 
    4 




    3 













































































































    1 
    1 



    1 

    1 

    1 









    1 



    1 



    1 


    1 








































































    1 
    1 

    1 


















































































































































































































    1 




    1 
    1 
    1 
















    1 











    1 





    1 



    1 
    1 
    1 





    1 






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/admin-guide/sysctl/vm.rst.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
#include <linux/local_lock.h>
#include <linux/buffer_head.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/*
 * How many pages do we try to swap or page in/out together? As a power of 2,
 * i.e. the value stored here is an exponent rather than a page count.
 */
int page_cluster;
/* Presumably the largest value page_cluster may be set to - confirm at the user. */
const int page_cluster_max = 31;

/*
 * Protecting only lru_rotate.fbatch which requires disabling interrupts.
 *
 * Per-CPU batch of folios queued by folio_rotate_reclaimable(), which takes
 * @lock with local_lock_irqsave() around every access to @fbatch.
 */
struct lru_rotate {
        local_lock_t lock;
        struct folio_batch fbatch;
};
static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
        .lock = INIT_LOCAL_LOCK(lock),
};

/*
 * The following folio batches are grouped together because they are protected
 * by disabling preemption (and interrupts remain enabled).
 *
 * One instance exists per CPU; users take @lock with local_lock() before
 * touching a batch.  @lru_add is filled by folio_add_lru() and @activate
 * (SMP builds only) by folio_activate().
 */
struct cpu_fbatches {
        local_lock_t lock;
        struct folio_batch lru_add;
        struct folio_batch lru_deactivate_file;
        struct folio_batch lru_deactivate;
        struct folio_batch lru_lazyfree;
#ifdef CONFIG_SMP
        struct folio_batch activate;
#endif
};
static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
        .lock = INIT_LOCAL_LOCK(lock),
};

/*
 * Detach @folio from LRU and mlock bookkeeping.
 *
 * If the folio is on an LRU, the matching lruvec lock is (re)taken through
 * @lruvecp/@flagsp, the folio is removed from its list and its LRU flags are
 * cleared.  The lock is left held; the caller unlocks whatever *@lruvecp
 * ends up pointing at.  A folio still marked mlocked has that flag cleared
 * and the NR_MLOCK / UNEVICTABLE_PGCLEARED counters adjusted.
 */
static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp,
                unsigned long *flagsp)
{
        long nr;

        if (folio_test_lru(folio)) {
                folio_lruvec_relock_irqsave(folio, lruvecp, flagsp);
                lruvec_del_folio(*lruvecp, folio);
                __folio_clear_lru_flags(folio);
        }

        /*
         * In rare cases, when truncation or holepunching raced with
         * munlock after VM_LOCKED was cleared, Mlocked may still be
         * found set here.  This does not indicate a problem, unless
         * "unevictable_pgs_cleared" appears worryingly large.
         */
        if (likely(!folio_test_mlocked(folio)))
                return;

        nr = folio_nr_pages(folio);
        __folio_clear_mlocked(folio);
        zone_stat_mod_folio(folio, NR_MLOCK, -nr);
        count_vm_events(UNEVICTABLE_PGCLEARED, nr);
}

/*
 * Single-folio wrapper around __page_cache_release() that drops the lruvec
 * lock again if the helper had to take it.
 *
 * This path almost never happens for VM activity - pages are normally freed
 * in batches.  But it gets used by networking - and for compound pages.
 */
static void page_cache_release(struct folio *folio)
{
        struct lruvec *locked = NULL;
        unsigned long irq_flags;

        __page_cache_release(folio, &locked, &irq_flags);
        if (!locked)
                return;

        unlock_page_lruvec_irqrestore(locked, irq_flags);
}

/*
 * Release @folio.  Zone-device and hugetlb folios are handed to their own
 * free routines.  Every other folio is taken off the LRU, has its
 * large-rmappable state undone where applicable, is uncharged from its
 * memcg and is finally given to free_unref_page().
 */
void __folio_put(struct folio *folio)
{
        if (unlikely(folio_is_zone_device(folio))) {
                free_zone_device_folio(folio);
                return;
        }

        if (folio_test_hugetlb(folio)) {
                free_huge_folio(folio);
                return;
        }

        page_cache_release(folio);

        if (folio_test_large(folio) && folio_test_large_rmappable(folio))
                folio_undo_large_rmappable(folio);

        mem_cgroup_uncharge(folio);
        free_unref_page(&folio->page, folio_order(folio));
}
EXPORT_SYMBOL(__folio_put);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page.lru.
 */
void put_pages_list(struct list_head *pages)
{
        struct folio_batch fbatch;
        struct folio *folio, *next;

        folio_batch_init(&fbatch);
        list_for_each_entry_safe(folio, next, pages, lru) {
                if (!folio_put_testzero(folio))
                        continue;
                if (folio_test_hugetlb(folio)) {
                        free_huge_folio(folio);
                        continue;
                }
                /* LRU flag must be clear because it's passed using the lru */
                if (folio_batch_add(&fbatch, folio) > 0)
                        continue;
                free_unref_folios(&fbatch);
        }

        if (fbatch.nr)
                free_unref_folios(&fbatch);
        INIT_LIST_HEAD(pages);
}
EXPORT_SYMBOL(put_pages_list);

/*
 * Per-folio callback run by folio_batch_move_lru() after it has taken the
 * lock of the folio's lruvec.
 */
typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio);

/*
 * move_fn_t callback that puts @folio on @lruvec.
 *
 * Any stale unevictable flag is cleared first, then evictability is
 * re-evaluated: a folio that is not evictable is marked unevictable (and
 * loses its active flag), an evictable one is added as is.  The
 * UNEVICTABLE_PGCULLED / UNEVICTABLE_PGRESCUED events are counted whenever
 * the unevictable state changed compared to what the folio carried on entry.
 */
static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
{
        int had_unevictable = folio_test_clear_unevictable(folio);
        long nr = folio_nr_pages(folio);

        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

        /*
         * Is an smp_mb__after_atomic() still required here, before
         * folio_evictable() tests the mlocked flag, to rule out the possibility
         * of stranding an evictable folio on an unevictable LRU?  I think
         * not, because __munlock_folio() only clears the mlocked flag
         * while the LRU lock is held.
         *
         * (That is not true of __page_cache_release(), and not necessarily
         * true of folios_put(): but those only clear the mlocked flag after
         * folio_put_testzero() has excluded any other users of the folio.)
         */
        if (!folio_evictable(folio)) {
                folio_clear_active(folio);
                folio_set_unevictable(folio);
                /*
                 * folio->mlock_count = !!folio_test_mlocked(folio)?
                 * But that leaves __mlock_folio() in doubt whether another
                 * actor has already counted the mlock or not.  Err on the
                 * safe side, underestimate, let page reclaim fix it, rather
                 * than leaving a page on the unevictable LRU indefinitely.
                 */
                folio->mlock_count = 0;
                if (!had_unevictable)
                        __count_vm_events(UNEVICTABLE_PGCULLED, nr);
        } else if (had_unevictable) {
                __count_vm_events(UNEVICTABLE_PGRESCUED, nr);
        }

        lruvec_add_folio(lruvec, folio);
        trace_mm_lru_insertion(folio);
}

/*
 * Apply @move_fn to every folio in @fbatch under the folio's lruvec lock,
 * then drop the batch's folio references with folios_put().
 *
 * For every callback other than lru_add_fn the folio's LRU flag is
 * test-and-cleared first, and folios that did not have it set are skipped.
 * The LRU flag is set once the callback has run.
 */
static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
{
        struct lruvec *locked = NULL;
        unsigned long irq_flags = 0;
        int idx;

        for (idx = 0; idx < folio_batch_count(fbatch); idx++) {
                struct folio *folio = fbatch->folios[idx];

                /* block memcg migration while the folio moves between lru */
                if (move_fn == lru_add_fn || folio_test_clear_lru(folio)) {
                        folio_lruvec_relock_irqsave(folio, &locked, &irq_flags);
                        move_fn(locked, folio);

                        folio_set_lru(folio);
                }
        }

        if (locked)
                unlock_page_lruvec_irqrestore(locked, irq_flags);
        folios_put(fbatch);
}

/*
 * Queue @folio on @fbatch.  The batch is processed with @move_fn right away
 * if folio_batch_add() reports no room left, if @folio is a large folio, or
 * if the LRU cache is disabled; otherwise the folio just stays queued.
 */
static void folio_batch_add_and_move(struct folio_batch *fbatch,
                struct folio *folio, move_fn_t move_fn)
{
        bool keep_queued = folio_batch_add(fbatch, folio) &&
                           !folio_test_large(folio) &&
                           !lru_cache_disabled();

        if (!keep_queued)
                folio_batch_move_lru(fbatch, move_fn);
}

/*
 * move_fn_t callback: re-queue @folio at the tail of its list with the
 * active flag cleared and count PGROTATED.  Unevictable folios are left
 * untouched.
 */
static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio)
{
        if (folio_test_unevictable(folio))
                return;

        lruvec_del_folio(lruvec, folio);
        folio_clear_active(folio);
        lruvec_add_folio_tail(lruvec, folio);
        __count_vm_events(PGROTATED, folio_nr_pages(folio));
}

/*
 * Writeback is about to end against a folio which has been marked for
 * immediate reclaim.  If it still appears to be reclaimable, move it
 * to the tail of the inactive list.
 *
 * Nothing is done for a folio that is locked, dirty, unevictable or not on
 * an LRU.  Otherwise a reference is taken and the folio is queued on this
 * CPU's lru_rotate batch for lru_move_tail_fn().
 *
 * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races.
 */
void folio_rotate_reclaimable(struct folio *folio)
{
        struct folio_batch *rotate_batch;
        unsigned long irq_flags;

        if (folio_test_locked(folio) || folio_test_dirty(folio) ||
            folio_test_unevictable(folio) || !folio_test_lru(folio))
                return;

        folio_get(folio);
        local_lock_irqsave(&lru_rotate.lock, irq_flags);
        rotate_batch = this_cpu_ptr(&lru_rotate.fbatch);
        folio_batch_add_and_move(rotate_batch, folio, lru_move_tail_fn);
        local_unlock_irqrestore(&lru_rotate.lock, irq_flags);
}

/*
 * lru_note_cost - charge reclaim cost to @lruvec and all of its ancestors.
 * @lruvec:     lruvec the cost originated in
 * @file:       true to charge file_cost, false to charge anon_cost
 * @nr_io:      number of IO events, weighted by SWAP_CLUSTER_MAX
 * @nr_rotated: number of rotations, weighted by 1
 *
 * The same cost is added at every level, walking upwards with
 * parent_lruvec().  At each level both cost counters are halved once their
 * sum exceeds a quarter of the lruvec's total LRU size.
 */
void lru_note_cost(struct lruvec *lruvec, bool file,
                   unsigned int nr_io, unsigned int nr_rotated)
{
        /*
         * Reflect the relative cost of incurring IO and spending CPU
         * time on rotations. This doesn't attempt to make a precise
         * comparison, it just says: if reloads are about comparable
         * between the LRU lists, or rotations are overwhelmingly
         * different between them, adjust scan balance for CPU work.
         */
        unsigned long cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;

        for (;;) {
                unsigned long lrusize;

                /*
                 * Taking lruvec->lru_lock is safe here, because the lruvec
                 * is either
                 * 1) pinned by reclaim, or
                 * 2) derived from a pre-LRU page during refault (which also
                 *    holds the rcu lock, so would be safe even if the page
                 *    was on the LRU and could move simultaneously to a new
                 *    lruvec).
                 */
                spin_lock_irq(&lruvec->lru_lock);

                /* Record cost event */
                if (file)
                        lruvec->file_cost += cost;
                else
                        lruvec->anon_cost += cost;

                /*
                 * Decay previous events
                 *
                 * Because workloads change over time (and to avoid
                 * overflow) we keep these statistics as a floating
                 * average, which ends up weighing recent refaults
                 * more than old ones.
                 */
                lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
                          lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
                          lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
                          lruvec_page_state(lruvec, NR_ACTIVE_FILE);

                if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
                        lruvec->file_cost /= 2;
                        lruvec->anon_cost /= 2;
                }

                spin_unlock_irq(&lruvec->lru_lock);

                /* Continue with the parent until the top is reached. */
                lruvec = parent_lruvec(lruvec);
                if (!lruvec)
                        break;
        }
}

void lru_note_cost_refault(struct folio *folio)
{
        lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
                      folio_nr_pages(folio), 0);
}

/*
 * move_fn_t callback: set the active flag on @folio and re-add it to
 * @lruvec, counting PGACTIVATE globally and for the lruvec's memcg.
 * Folios that are already active or are unevictable are left untouched.
 */
static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio)
{
        long nr;

        if (folio_test_active(folio) || folio_test_unevictable(folio))
                return;

        nr = folio_nr_pages(folio);

        lruvec_del_folio(lruvec, folio);
        folio_set_active(folio);
        lruvec_add_folio(lruvec, folio);
        trace_mm_lru_activate(folio);

        __count_vm_events(PGACTIVATE, nr);
        __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr);
}

#ifdef CONFIG_SMP
/* Process @cpu's pending activation batch, if it holds any folios. */
static void folio_activate_drain(int cpu)
{
        struct folio_batch *pending = &per_cpu(cpu_fbatches.activate, cpu);

        if (!folio_batch_count(pending))
                return;

        folio_batch_move_lru(pending, folio_activate_fn);
}

/*
 * SMP variant: take a reference on @folio and queue it on this CPU's
 * activation batch.  Nothing is done unless the folio is on an LRU, not yet
 * active and not unevictable.
 */
void folio_activate(struct folio *folio)
{
        struct folio_batch *activate_batch;

        if (!folio_test_lru(folio) || folio_test_active(folio) ||
            folio_test_unevictable(folio))
                return;

        folio_get(folio);
        local_lock(&cpu_fbatches.lock);
        activate_batch = this_cpu_ptr(&cpu_fbatches.activate);
        folio_batch_add_and_move(activate_batch, folio, folio_activate_fn);
        local_unlock(&cpu_fbatches.lock);
}

#else
/* No activation batch exists without CONFIG_SMP, so there is nothing to do. */
static inline void folio_activate_drain(int cpu)
{
}

/*
 * !SMP variant: activate @folio directly under its lruvec lock.  The LRU
 * flag is test-and-cleared first and set again afterwards; a folio that did
 * not have it set is skipped.
 */
void folio_activate(struct folio *folio)
{
        struct lruvec *locked;

        if (!folio_test_clear_lru(folio))
                return;

        locked = folio_lruvec_lock_irq(folio);
        folio_activate_fn(locked, folio);
        unlock_page_lruvec_irq(locked);
        folio_set_lru(folio);
}
#endif

/*
 * If @folio is sitting in this CPU's lru_add batch, mark it active there.
 * A folio that is not found in the local batch is left unchanged.
 */
static void __lru_cache_activate_folio(struct folio *folio)
{
        struct folio_batch *add_batch;
        int pos;

        local_lock(&cpu_fbatches.lock);
        add_batch = this_cpu_ptr(&cpu_fbatches.lru_add);

        /*
         * Search backwards on the optimistic assumption that the folio being
         * activated has just been added to this batch. Note that only
         * the local batch is examined as a !LRU folio could be in the
         * process of being released, reclaimed, migrated or on a remote
         * batch that is currently being drained. Furthermore, marking
         * a remote batch's folio active potentially hits a race where
         * a folio is marked active just after it is added to the inactive
         * list causing accounting errors and BUG_ON checks to trigger.
         */
        pos = folio_batch_count(add_batch);
        while (pos-- > 0) {
                if (add_batch->folios[pos] != folio)
                        continue;

                folio_set_active(folio);
                break;
        }

        local_unlock(&cpu_fbatches.lock);
}

#ifdef CONFIG_LRU_GEN
/*
 * Record one more access to @folio for the multi-gen LRU.
 *
 * Unevictable folios are ignored.  Otherwise each call performs exactly one
 * step: set the referenced flag if it is clear, else set the workingset flag
 * if it is clear, else increment the counter held in the LRU_REFS_MASK bits
 * of folio->flags, stopping once that field is saturated.
 */
static void folio_inc_refs(struct folio *folio)
{
        /* Snapshot of the flags word; starting value for the cmpxchg loop. */
        unsigned long new_flags, old_flags = READ_ONCE(folio->flags);

        if (folio_test_unevictable(folio))
                return;

        if (!folio_test_referenced(folio)) {
                folio_set_referenced(folio);
                return;
        }

        if (!folio_test_workingset(folio)) {
                folio_set_workingset(folio);
                return;
        }

        /* see the comment on MAX_NR_TIERS */
        do {
                /* Isolate the refs field; all bits set means it is full. */
                new_flags = old_flags & LRU_REFS_MASK;
                if (new_flags == LRU_REFS_MASK)
                        break;

                /* Add one unit to the field, then merge the other bits back. */
                new_flags += BIT(LRU_REFS_PGOFF);
                new_flags |= old_flags & ~LRU_REFS_MASK;
        } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
}
#else
/* Empty stub used when CONFIG_LRU_GEN is not set. */
static void folio_inc_refs(struct folio *folio)
{
}
#endif /* CONFIG_LRU_GEN */

/**
 * folio_mark_accessed - Mark a folio as having seen activity.
 * @folio: The folio to mark.
 *
 * This function will perform one of the following transitions:
 *
 * * inactive,unreferenced        ->        inactive,referenced
 * * inactive,referenced        ->        active,unreferenced
 * * active,unreferenced        ->        active,referenced
 *
 * With the multi-gen LRU enabled the access is recorded through
 * folio_inc_refs() instead and nothing else is done.
 *
 * When a newly allocated folio is not yet visible, so safe for non-atomic ops,
 * __folio_set_referenced() may be substituted for folio_mark_accessed().
 */
void folio_mark_accessed(struct folio *folio)
{
        if (lru_gen_enabled()) {
                folio_inc_refs(folio);
                return;
        }

        if (!folio_test_referenced(folio)) {
                folio_set_referenced(folio);
        } else if (!folio_test_unevictable(folio) &&
                   !folio_test_active(folio)) {
                /*
                 * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
                 * this list is never rotated or maintained, so marking an
                 * unevictable page accessed has no effect; only evictable,
                 * inactive folios get here.
                 *
                 * If the folio is on the LRU, queue it for activation via
                 * cpu_fbatches.activate. Otherwise, assume the folio is in a
                 * folio_batch, mark it active and it'll be moved to the active
                 * LRU on the next drain.
                 */
                if (folio_test_lru(folio))
                        folio_activate(folio);
                else
                        __lru_cache_activate_folio(folio);

                folio_clear_referenced(folio);
                workingset_activation(folio);
        }

        if (folio_test_idle(folio))
                folio_clear_idle(folio);
}
EXPORT_SYMBOL(folio_mark_accessed);

/**
 * folio_add_lru - Add a folio to an LRU list.
 * @folio: The folio to be added to the LRU.
 *
 * Queue the folio for addition to the LRU. The decision on whether
 * to add the page to the [in]active [file|anon] list is deferred until the
 * folio_batch is drained. This gives the caller of folio_add_lru() a chance
 * to have the folio added to the active list using folio_mark_accessed().
 *
 * A reference is taken on @folio for the time it sits in the per-CPU batch.
 */
void folio_add_lru(struct folio *folio)
{
        struct folio_batch *add_batch;

        VM_BUG_ON_FOLIO(folio_test_active(folio) &&
                        folio_test_unevictable(folio), folio);
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

        /* see the comment in lru_gen_add_folio() */
        if (lru_gen_enabled() && !folio_test_unevictable(folio)) {
                if (lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
                        folio_set_active(folio);
        }

        folio_get(folio);
        local_lock(&cpu_fbatches.lock);
        add_batch = this_cpu_ptr(&cpu_fbatches.lru_add);
        folio_batch_add_and_move(add_batch, folio, lru_add_fn);
        local_unlock(&cpu_fbatches.lock);
}
EXPORT_SYMBOL(folio_add_lru);

/**
 * folio_add_lru_vma() - Add a folio to the appropriate LRU list for this VMA.
 * @folio: The folio to be added to the LRU.
 * @vma: VMA in which the folio is mapped.
 *
 * If the VMA is mlocked (VM_LOCKED set and no VM_SPECIAL bit set), @folio is
 * added to the unevictable list.
 * Otherwise, it is treated the same way as folio_add_lru().
 */
void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma)
{
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

        if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
                folio_add_lru(folio);
                return;
        }

        mlock_new_folio(folio);
}

/*
 * If the folio cannot be invalidated, it is moved to the
 * inactive list to speed up its reclaim.  It is moved to the
 * head of the list, rather than the tail, to give the flusher
 * threads some time to write it out, as this is much more
 * effective than the single-page writeout from reclaim.
 *
 * If the folio isn't mapped and dirty/writeback, the folio
 * could be reclaimed asap using the reclaim flag.
 *
 * 1. active, mapped folio -> none
 * 2. active, dirty/writeback folio -> inactive, head, reclaim
 * 3. inactive, mapped folio -> none
 * 4. inactive, dirty/writeback folio -> inactive, head, reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In 4, it moves to the head of the inactive list so the folio is
 * written out by flusher threads as this is much more efficient
 * than the single-page writeout from reclaim.
 */
static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio)
{
        /* Snapshot the active flag now; it is cleared further down. */
        bool was_active = folio_test_active(folio);
        long nr = folio_nr_pages(folio);

        /* Skip unevictable folios and folios some process still maps. */
        if (folio_test_unevictable(folio) || folio_mapped(folio))
                return;

        /* Remove from the lruvec, then drop the active/referenced flags. */
        lruvec_del_folio(lruvec, folio);
        folio_clear_active(folio);
        folio_clear_referenced(folio);

        if (!folio_test_writeback(folio) && !folio_test_dirty(folio)) {
                /*
                 * Clean folio (its writeback ended while it sat in the
                 * batch): queue it at the tail of the inactive list.
                 */
                lruvec_add_folio_tail(lruvec, folio);
                __count_vm_events(PGROTATED, nr);
        } else {
                /*
                 * Dirty or under writeback: re-add with lruvec_add_folio()
                 * and flag the folio for reclaim.  Setting the reclaim
                 * flag can race with folio_end_writeback() and confuse
                 * readahead, but the window is really small and the
                 * outcome is not critical.
                 */
                lruvec_add_folio(lruvec, folio);
                folio_set_reclaim(folio);
        }

        /* Only a folio that was active counts as a deactivation. */
        if (!was_active)
                return;

        __count_vm_events(PGDEACTIVATE, nr);
        __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr);
}

/*
 * Batch callback: clear the active and referenced flags of @folio and
 * re-add it to @lruvec, accounting the move as PGDEACTIVATE.
 */
static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio)
{
        long nr;

        /* Unevictable folios are never touched. */
        if (folio_test_unevictable(folio))
                return;

        /* Proceed only for an active folio, or whenever lru_gen is enabled. */
        if (!folio_test_active(folio) && !lru_gen_enabled())
                return;

        nr = folio_nr_pages(folio);

        lruvec_del_folio(lruvec, folio);
        folio_clear_active(folio);
        folio_clear_referenced(folio);
        lruvec_add_folio(lruvec, folio);

        __count_vm_events(PGDEACTIVATE, nr);
        __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr);
}

/*
 * Batch callback: turn an anonymous, swap-backed folio that is neither in
 * the swap cache nor unevictable into a lazyfree folio.
 */
static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio)
{
        long nr;

        /* Any folio that does not meet all four conditions is left as is. */
        if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) ||
            folio_test_swapcache(folio) || folio_test_unevictable(folio))
                return;

        nr = folio_nr_pages(folio);

        lruvec_del_folio(lruvec, folio);
        folio_clear_active(folio);
        folio_clear_referenced(folio);

        /*
         * Lazyfree folios are clean anonymous folios.  Clearing the
         * swapbacked flag is what tells them apart from normal anonymous
         * folios.
         */
        folio_clear_swapbacked(folio);
        lruvec_add_folio(lruvec, folio);

        __count_vm_events(PGLAZYFREE, nr);
        __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr);
}

/*
 * Drain pages out of the cpu's folio_batch.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
void lru_add_drain_cpu(int cpu)
{
        struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
        struct folio_batch *fbatch = &fbatches->lru_add;

        /* Each non-empty batch of @cpu is flushed with its own move function. */
        if (folio_batch_count(fbatch))
                folio_batch_move_lru(fbatch, lru_add_fn);

        /* The rotate batch lives in lru_rotate rather than in cpu_fbatches. */
        fbatch = &per_cpu(lru_rotate.fbatch, cpu);
        /* Disabling interrupts below acts as a compiler barrier. */
        if (data_race(folio_batch_count(fbatch))) {
                unsigned long flags;

                /* No harm done if a racing interrupt already did this */
                local_lock_irqsave(&lru_rotate.lock, flags);
                folio_batch_move_lru(fbatch, lru_move_tail_fn);
                local_unlock_irqrestore(&lru_rotate.lock, flags);
        }

        fbatch = &fbatches->lru_deactivate_file;
        if (folio_batch_count(fbatch))
                folio_batch_move_lru(fbatch, lru_deactivate_file_fn);

        fbatch = &fbatches->lru_deactivate;
        if (folio_batch_count(fbatch))
                folio_batch_move_lru(fbatch, lru_deactivate_fn);

        fbatch = &fbatches->lru_lazyfree;
        if (folio_batch_count(fbatch))
                folio_batch_move_lru(fbatch, lru_lazyfree_fn);

        /*
         * folio_activate_drain() is defined outside this chunk; presumably
         * it flushes the "activate" batch of @cpu -- TODO confirm.
         */
        folio_activate_drain(cpu);
}

/**
 * deactivate_file_folio() - Deactivate a file folio.
 * @folio: Folio to deactivate.
 *
 * This function hints to the VM that @folio is a good reclaim candidate,
 * for example if its invalidation fails due to the folio being dirty
 * or under writeback.
 *
 * Context: Caller holds a reference on the folio.
 */
void deactivate_file_folio(struct folio *folio)
{
        /* Deactivating an unevictable folio will not accelerate reclaim */
        if (!folio_test_unevictable(folio)) {
                struct folio_batch *batch;

                /* Extra reference taken for the folio's stay in the batch. */
                folio_get(folio);

                /* Queue on this CPU's lru_deactivate_file batch. */
                local_lock(&cpu_fbatches.lock);
                batch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file);
                folio_batch_add_and_move(batch, folio, lru_deactivate_file_fn);
                local_unlock(&cpu_fbatches.lock);
        }
}

/*
 * folio_deactivate - deactivate a folio
 * @folio: folio to deactivate
 *
 * folio_deactivate() moves @folio to the inactive list if @folio was on the
 * active list and was not unevictable. This is done to accelerate the
 * reclaim of @folio.
 */
void folio_deactivate(struct folio *folio)
{
        struct folio_batch *batch;

        /* Only folios on the LRU that are not unevictable qualify. */
        if (!folio_test_lru(folio) || folio_test_unevictable(folio))
                return;

        /* Needs an active folio, unless lru_gen is enabled. */
        if (!folio_test_active(folio) && !lru_gen_enabled())
                return;

        /* Extra reference taken for the folio's stay in the batch. */
        folio_get(folio);

        /* Queue on this CPU's lru_deactivate batch. */
        local_lock(&cpu_fbatches.lock);
        batch = this_cpu_ptr(&cpu_fbatches.lru_deactivate);
        folio_batch_add_and_move(batch, folio, lru_deactivate_fn);
        local_unlock(&cpu_fbatches.lock);
}

/**
 * folio_mark_lazyfree - make an anon folio lazyfree
 * @folio: folio to deactivate
 *
 * folio_mark_lazyfree() moves @folio to the inactive file list.
 * This is done to accelerate the reclaim of @folio.
 */
void folio_mark_lazyfree(struct folio *folio)
{
        struct folio_batch *batch;

        /*
         * Candidates are on the LRU, anonymous and swap-backed, and are
         * neither in the swap cache nor unevictable.
         */
        if (!folio_test_lru(folio) || !folio_test_anon(folio) ||
            !folio_test_swapbacked(folio) || folio_test_swapcache(folio) ||
            folio_test_unevictable(folio))
                return;

        /* Extra reference taken for the folio's stay in the batch. */
        folio_get(folio);

        /* Queue on this CPU's lru_lazyfree batch. */
        local_lock(&cpu_fbatches.lock);
        batch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree);
        folio_batch_add_and_move(batch, folio, lru_lazyfree_fn);
        local_unlock(&cpu_fbatches.lock);
}

/*
 * Drain the calling CPU's folio batches under cpu_fbatches.lock, then call
 * mlock_drain_local() once the lock has been released.
 */
void lru_add_drain(void)
{
        int this_cpu;

        local_lock(&cpu_fbatches.lock);
        this_cpu = smp_processor_id();
        lru_add_drain_cpu(this_cpu);
        local_unlock(&cpu_fbatches.lock);

        mlock_drain_local();
}

/*
 * In the SMP case this is called from per-cpu workqueue context, so
 * lru_add_drain_cpu() and invalidate_bh_lrus_cpu() run on the same cpu.
 * The !SMP case is not a problem either: there is only one core and the
 * locks disable preemption.
 */
static void lru_add_and_bh_lrus_drain(void)
{
        int this_cpu;

        /* Folio batches are drained with cpu_fbatches.lock held. */
        local_lock(&cpu_fbatches.lock);
        this_cpu = smp_processor_id();
        lru_add_drain_cpu(this_cpu);
        local_unlock(&cpu_fbatches.lock);

        /* The remaining two drains run after the lock is released. */
        invalidate_bh_lrus_cpu();
        mlock_drain_local();
}

/*
 * Like lru_add_drain(), but also calls drain_local_pages() for @zone while
 * cpu_fbatches.lock is still held.
 */
void lru_add_drain_cpu_zone(struct zone *zone)
{
        int this_cpu;

        local_lock(&cpu_fbatches.lock);
        this_cpu = smp_processor_id();
        lru_add_drain_cpu(this_cpu);
        drain_local_pages(zone);
        local_unlock(&cpu_fbatches.lock);

        mlock_drain_local();
}

#ifdef CONFIG_SMP

/* One work item per CPU, queued by __lru_add_drain_all() on mm_percpu_wq. */
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

/*
 * Work function for lru_add_drain_work: drains the batches of the CPU it
 * runs on.  The work_struct argument itself is not used.
 */
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
        lru_add_and_bh_lrus_drain();
}

/*
 * Report whether @cpu has anything pending in its per-CPU folio batches,
 * its mlock batch or its buffer-head LRU.
 */
static bool cpu_needs_drain(unsigned int cpu)
{
        struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);

        /* Check these in order of likelihood that they're not zero */
        if (folio_batch_count(&fbatches->lru_add))
                return true;
        if (data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))))
                return true;
        if (folio_batch_count(&fbatches->lru_deactivate_file))
                return true;
        if (folio_batch_count(&fbatches->lru_deactivate))
                return true;
        if (folio_batch_count(&fbatches->lru_lazyfree))
                return true;
        if (folio_batch_count(&fbatches->activate))
                return true;
        if (need_mlock_drain(cpu))
                return true;

        return has_bh_in_lru(cpu, NULL);
}

/*
 * Doesn't need any cpu hotplug locking because we do rely on per-cpu
 * kworkers being shut down before our page_alloc_cpu_dead callback is
 * executed on the offlined cpu.
 * Calling this function with cpu hotplug locks held can actually lead
 * to obscure indirect dependencies via WQ context.
 */
static inline void __lru_add_drain_all(bool force_all_cpus)
{
        /*
         * @force_all_cpus: when true, the early exit at (C) is never taken,
         * even if another caller already advanced the generation.  Work is
         * still only queued on CPUs for which cpu_needs_drain() is true.
         */

        /*
         * lru_drain_gen - Global pages generation number
         *
         * (A) Definition: global lru_drain_gen = x implies that all generations
         *     0 < n <= x are already *scheduled* for draining.
         *
         * This is an optimization for the highly-contended use case where a
         * user space workload keeps constantly generating a flow of pages for
         * each CPU.
         */
        static unsigned int lru_drain_gen;
        /* CPUs on which work was queued by the current pass. */
        static struct cpumask has_work;
        /* Serializes passes; also protects has_work and the per-CPU work items. */
        static DEFINE_MUTEX(lock);
        unsigned cpu, this_gen;

        /*
         * Make sure nobody triggers this path before mm_percpu_wq is fully
         * initialized.
         */
        if (WARN_ON(!mm_percpu_wq))
                return;

        /*
         * Guarantee folio_batch counter stores visible by this CPU
         * are visible to other CPUs before loading the current drain
         * generation.
         */
        smp_mb();

        /*
         * (B) Locally cache global LRU draining generation number
         *
         * The read barrier ensures that the counter is loaded before the mutex
         * is taken. It pairs with smp_mb() inside the mutex critical section
         * at (D).
         */
        this_gen = smp_load_acquire(&lru_drain_gen);

        mutex_lock(&lock);

        /*
         * (C) Exit the draining operation if a newer generation, from another
         * lru_add_drain_all(), was already scheduled for draining. Check (A).
         */
        if (unlikely(this_gen != lru_drain_gen && !force_all_cpus))
                goto done;

        /*
         * (D) Increment global generation number
         *
         * Pairs with smp_load_acquire() at (B), outside of the critical
         * section. Use a full memory barrier to guarantee that the
         * new global drain generation number is stored before loading
         * folio_batch counters.
         *
         * This pairing must be done here, before the for_each_online_cpu loop
         * below which drains the page vectors.
         *
         * Let x, y, and z represent some system CPU numbers, where x < y < z.
         * Assume CPU #z is in the middle of the for_each_online_cpu loop
         * below and has already reached CPU #y's per-cpu data. CPU #x comes
         * along, adds some pages to its per-cpu vectors, then calls
         * lru_add_drain_all().
         *
         * If the paired barrier is done at any later step, e.g. after the
         * loop, CPU #x will just exit at (C) and miss flushing out all of its
         * added pages.
         */
        WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
        smp_mb();

        /*
         * Queue the drain work on every online CPU that has something
         * pending, and record those CPUs in has_work.
         */
        cpumask_clear(&has_work);
        for_each_online_cpu(cpu) {
                struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

                if (cpu_needs_drain(cpu)) {
                        INIT_WORK(work, lru_add_drain_per_cpu);
                        queue_work_on(cpu, mm_percpu_wq, work);
                        __cpumask_set_cpu(cpu, &has_work);
                }
        }

        /* Wait for each queued work item before the mutex is released. */
        for_each_cpu(cpu, &has_work)
                flush_work(&per_cpu(lru_add_drain_work, cpu));

done:
        mutex_unlock(&lock);
}

/*
 * SMP: run a drain pass over the online CPUs.  Passing false allows the
 * pass to be skipped when another caller has already scheduled a newer
 * drain generation.
 */
void lru_add_drain_all(void)
{
        __lru_add_drain_all(false);
}
#else
/* !SMP: draining the local CPU is all there is to do. */
void lru_add_drain_all(void)
{
        lru_add_drain();
}
#endif /* CONFIG_SMP */

/*
 * Incremented by lru_cache_disable().  The readers and the matching
 * decrement are not in this part of the file.
 */
atomic_t lru_disable_count = ATOMIC_INIT(0);

/*
 * lru_cache_disable() needs to be called before we start compiling
 * a list of pages to be migrated using isolate_lru_page().
 * It drains the pages held in the LRU caches and then keeps the caches
 * disabled on all cpus until lru_cache_enable() is called.
 *
 * Must be paired with a call to lru_cache_enable().
 */
void lru_cache_disable(void)
{
        atomic_inc(&lru_disable_count);
        /*
         * Readers of lru_disable_count are protected by either disabling
         * preemption or rcu_read_lock:
         *
         * preempt_disable, local_irq_disable  [bh_lru_lock()]
         * rcu_read_lock                       [rt_spin_lock CONFIG_PREEMPT_RT]
         * preempt_disable                     [local_lock !CONFIG_PREEMPT_RT]
         *
         * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on
         * preempt_disable() regions of code. So any CPU which sees
         * lru_disable_count = 0 will have exited the critical
         * section when synchronize_rcu() returns.
         */
        synchronize_rcu_expedited();
        /*
         * Drain what is already batched.  The SMP variant passes true so
         * that the drain is not skipped on account of a concurrent caller.
         */
#ifdef CONFIG_SMP
        __lru_add_drain_all(true);
#else
        lru_add_and_bh_lrus_drain();
#endif
}

/**
 * folios_put_refs - Reduce the reference count on a batch of folios.
 * @folios: The folios.
 * @refs: The number of refs to subtract from each folio.
 *
 * Like folio_put(), but for a batch of folios.  This is more efficient
 * than writing the loop yourself as it will optimise the locks which need
 * to be taken if the folios are freed.  The folios batch is returned
 * empty and ready to be reused for another batch; there is no need
 * to reinitialise it.  If @refs is NULL, we subtract one from each
 * folio refcount.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
{
        /*
         * i walks the input batch.  j counts the folios whose refcount hit
         * zero here and that still have to be uncharged and freed; they are
         * compacted to the front of the batch as the loop goes.
         */
        int i, j;
        /* lruvec lock currently held, if any; passed on to __page_cache_release(). */
        struct lruvec *lruvec = NULL;
        unsigned long flags = 0;

        for (i = 0, j = 0; i < folios->nr; i++) {
                struct folio *folio = folios->folios[i];
                unsigned int nr_refs = refs ? refs[i] : 1;

                /* The huge zero folio is skipped without touching its refcount. */
                if (is_huge_zero_folio(folio))
                        continue;

                if (folio_is_zone_device(folio)) {
                        /* Release any held lruvec lock before the zone-device path. */
                        if (lruvec) {
                                unlock_page_lruvec_irqrestore(lruvec, flags);
                                lruvec = NULL;
                        }
                        if (put_devmap_managed_folio_refs(folio, nr_refs))
                                continue;
                        if (folio_ref_sub_and_test(folio, nr_refs))
                                free_zone_device_folio(folio);
                        continue;
                }

                /* Other references remain: nothing to free for this folio. */
                if (!folio_ref_sub_and_test(folio, nr_refs))
                        continue;

                /* hugetlb has its own memcg */
                if (folio_test_hugetlb(folio)) {
                        /* Release any held lruvec lock before free_huge_folio(). */
                        if (lruvec) {
                                unlock_page_lruvec_irqrestore(lruvec, flags);
                                lruvec = NULL;
                        }
                        free_huge_folio(folio);
                        continue;
                }
                if (folio_test_large(folio) &&
                    folio_test_large_rmappable(folio))
                        folio_undo_large_rmappable(folio);

                __page_cache_release(folio, &lruvec, &flags);

                /* Keep this folio for the bulk free, closing any gap left by skipped entries. */
                if (j != i)
                        folios->folios[j] = folio;
                j++;
        }
        if (lruvec)
                unlock_page_lruvec_irqrestore(lruvec, flags);
        /* Nothing left to free: just reset the batch for reuse. */
        if (!j) {
                folio_batch_reinit(folios);
                return;
        }

        /* Entries [0, j) are the folios to uncharge and free. */
        folios->nr = j;
        mem_cgroup_uncharge_folios(folios);
        free_unref_folios(folios);
}
EXPORT_SYMBOL(folios_put_refs);

/**
 * release_pages - batched put_page()
 * @arg: array of pages to release
 * @nr: number of pages
 *
 * Decrement the reference count on all the pages in @arg.  If it
 * fell to zero, remove the page from the LRU and free it.
 *
 * Note that the argument can be an array of pages, encoded pages,
 * or folio pointers. We ignore any encoded bits, and turn any of
 * them into just a folio that gets free'd.
 */
void release_pages(release_pages_arg arg, int nr)
{
        struct folio_batch fbatch;
        /*
         * Reference count to drop for each folio, indexed in step with
         * fbatch.  Declared unsigned int so that it matches the @refs
         * parameter of folios_put_refs() exactly.
         */
        unsigned int refs[PAGEVEC_SIZE];
        struct encoded_page **encoded = arg.encoded_pages;
        int i;

        folio_batch_init(&fbatch);
        for (i = 0; i < nr; i++) {
                /* Turn any of the argument types into a folio */
                struct folio *folio = page_folio(encoded_page_ptr(encoded[i]));

                /*
                 * One reference by default.  When the entry is flagged
                 * with ENCODED_PAGE_BIT_NR_PAGES_NEXT, the following array
                 * slot is consumed as well and supplies the count.
                 */
                refs[fbatch.nr] = 1;
                if (unlikely(encoded_page_flags(encoded[i]) &
                             ENCODED_PAGE_BIT_NR_PAGES_NEXT))
                        refs[fbatch.nr] = encoded_nr_pages(encoded[++i]);

                /*
                 * A non-positive return from folio_batch_add() triggers a
                 * flush; folios_put_refs() hands the batch back empty.
                 */
                if (folio_batch_add(&fbatch, folio) > 0)
                        continue;
                folios_put_refs(&fbatch, refs);
        }

        /* Flush whatever is left in a partially filled batch. */
        if (fbatch.nr)
                folios_put_refs(&fbatch, refs);
}
EXPORT_SYMBOL(release_pages);

/*
 * The folios which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those folios may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __folio_batch_release() will drain those queues here.
 * folio_batch_move_lru() calls folios_put() directly to avoid
 * mutual recursion.
 */
void __folio_batch_release(struct folio_batch *fbatch)
{
        /*
         * Drain the local per-CPU batches at most once per batch:
         * percpu_pvec_drained records that it has already been done.
         */
        if (!fbatch->percpu_pvec_drained) {
                lru_add_drain();
                fbatch->percpu_pvec_drained = true;
        }
        /* Drop the references held on the folios in the batch. */
        folios_put(fbatch);
}
EXPORT_SYMBOL(__folio_batch_release);

/**
 * folio_batch_remove_exceptionals() - Prune non-folios from a batch.
 * @fbatch: The batch to prune
 *
 * find_get_entries() fills a batch with both folios and shadow/swap/DAX
 * entries.  This function prunes all the non-folio entries from @fbatch
 * without leaving holes, so that it can be passed on to folio-only batch
 * operations.
 */
void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
{
        unsigned int total = folio_batch_count(fbatch);
        unsigned int src;
        unsigned int kept = 0;

        /*
         * In-place compaction: every entry that is not an xarray value
         * entry is copied down to the next free slot, preserving order.
         */
        for (src = 0; src < total; src++) {
                struct folio *entry = fbatch->folios[src];

                if (xa_is_value(entry))
                        continue;

                fbatch->folios[kept] = entry;
                kept++;
        }

        /* The batch now holds only the entries that were kept. */
        fbatch->nr = kept;
}

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
        /* Total RAM expressed in megabytes. */
        unsigned long ram_mb = totalram_pages() >> (20 - PAGE_SHIFT);

        /*
         * Use a smaller cluster for small-memory machines (under 16MB).
         *
         * Right now other parts of the system means that we
         * _really_ don't want to cluster much more than this.
         */
        page_cluster = (ram_mb < 16) ? 2 : 3;
}












































































































































































































































































































































































    2 

















    2 
    2 












































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/kernel/capability.c
 *
 * Copyright (C) 1997  Andrew Main <zefram@fysh.org>
 *
 * Integrated into 2.1.97+,  Andrew G. Morgan <morgan@kernel.org>
 * 30 May 2002:        Cleanup, Robert M. Love <rml@tech9.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/uaccess.h>

/* Set by default; cleared when "no_file_caps" is given at boot. */
int file_caps_enabled = 1;

/* __setup() handler for "no_file_caps"; the argument string is ignored. */
static int __init file_caps_disable(char *str)
{
        file_caps_enabled = 0;
        return 1;
}
__setup("no_file_caps", file_caps_disable);

#ifdef CONFIG_MULTIUSER
/*
 * More recent versions of libcap are available from:
 *
 *   http://www.kernel.org/pub/linux/libs/security/linux-privs/
 */

/*
 * Log a note naming the current task as a user of the 32-bit (version 1)
 * capability interface.  pr_info_once() limits this to a single message.
 */
static void warn_legacy_capability_use(void)
{
        /* Scratch buffer sized to hold current->comm. */
        char name[sizeof(current->comm)];

        pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
                     get_task_comm(name, current));
}

/*
 * Version 2 capabilities worked fine, but the linux/capability.h file
 * that accompanied their introduction encouraged their use without
 * the necessary user-space source code changes. As such, we have
 * created a version 3 with equivalent functionality to version 2, but
 * with a header change to protect legacy source code from using
 * version 2 when it wanted to use version 1. If your system has code
 * that trips the following warning, it is using version 2 specific
 * capabilities and may be doing so insecurely.
 *
 * The remedy is to either upgrade your version of libcap (to 2.10+,
 * if the application is linked against it), or recompile your
 * application with modern kernel headers and this warning will go
 * away.
 */

/*
 * Log a note naming the current task as a user of the deprecated version 2
 * capability interface.  pr_info_once() limits this to a single message.
 */
static void warn_deprecated_v2(void)
{
        /* Scratch buffer sized to hold current->comm. */
        char name[sizeof(current->comm)];

        pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
                     get_task_comm(name, current));
}

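/*
 * Version check. On success, return 0 and store the number of u32s in
 * each capability flag array in *tocopy. On error, return a negative
 * value; for an unknown version the kernel's preferred version is first
 * written back to the user's header.
 */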
static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
{
        __u32 version;

        if (get_user(version, &header->version))
                return -EFAULT;

        /* Version 1: legacy 32-bit layout. */
        if (version == _LINUX_CAPABILITY_VERSION_1) {
                warn_legacy_capability_use();
                *tocopy = _LINUX_CAPABILITY_U32S_1;
                return 0;
        }

        /* Versions 2 and 3 share a layout; only v2 draws a warning. */
        if (version == _LINUX_CAPABILITY_VERSION_2 ||
            version == _LINUX_CAPABILITY_VERSION_3) {
                if (version == _LINUX_CAPABILITY_VERSION_2)
                        warn_deprecated_v2();
                *tocopy = _LINUX_CAPABILITY_U32S_3;
                return 0;
        }

        /*
         * Unknown version: write the kernel's version back into the
         * user's header, then reject the call.
         */
        if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version))
                return -EFAULT;

        return -EINVAL;
}

/*
 * The only thing that can change the capabilities of the current
 * process is the current process. As such, we can't be in this code
 * at the same time as we are in the process of setting capabilities
 * in this process. The net result is that we can limit our use of
 * locks to when we are reading the caps of another process.
 */
static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
                                     kernel_cap_t *pIp, kernel_cap_t *pPp)
{
        const struct task_struct *target;
        int ret;

        /* A pid of 0, or our own pid, means the current task: no lookup. */
        if (!pid || pid == task_pid_vnr(current))
                return security_capget(current, pEp, pIp, pPp);

        /* Another task: look it up and query it inside an RCU read section. */
        rcu_read_lock();
        target = find_task_by_vpid(pid);
        ret = target ? security_capget(target, pEp, pIp, pPp) : -ESRCH;
        rcu_read_unlock();

        return ret;
}

/**
 * sys_capget - get the capabilities of a given process.
 * @header: pointer to struct that contains capability version and
 *        target pid data
 * @dataptr: pointer to struct that contains the effective, permitted,
 *        and inheritable capabilities that are returned
 *
 * Returns 0 on success and < 0 on error.
 */
SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
{
        struct __user_cap_data_struct kdata[2];
        kernel_cap_t pE, pI, pP;
        unsigned tocopy;
        pid_t pid;
        int ret;

        ret = cap_validate_magic(header, &tocopy);

        /*
         * With a NULL data pointer nothing is copied out: the call only
         * reports the outcome of the version check, and -EINVAL (unknown
         * version) is reported as success.
         */
        if (dataptr == NULL)
                return (ret == -EINVAL) ? 0 : ret;
        if (ret != 0)
                return ret;

        if (get_user(pid, &header->pid))
                return -EFAULT;

        if (pid < 0)
                return -EINVAL;

        ret = cap_get_target_pid(pid, &pE, &pI, &pP);
        if (ret)
                return ret;

        /*
         * Annoying legacy format: each 64-bit capability set is exposed
         * as two 32-bit fields, low half in element 0 and high half in
         * element 1.
         */
        kdata[0].effective   = pE.val;
        kdata[1].effective   = pE.val >> 32;
        kdata[0].permitted   = pP.val;
        kdata[1].permitted   = pP.val >> 32;
        kdata[0].inheritable = pI.val;
        kdata[1].inheritable = pI.val >> 32;

        /*
         * When tocopy < _KERNEL_CAPABILITY_U32S the upper capabilities
         * are silently dropped here.  Older libcap implementations
         * therefore implicitly drop the upper capability bits when they
         * perform a capget/modify/capset sequence.
         *
         * This is considered fail-safe behavior; upgrading the
         * application to a newer libcap gives access to the newer
         * capabilities.
         *
         * Returning an error (-ERANGE) instead would make legacy
         * applications fail unexpectedly: the capget/modify/capset
         * sequence would abort before any modification is attempted.
         */
        if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0])))
                return -EFAULT;

        return 0;
}

/*
 * Build a kernel_cap_t from the two 32-bit halves of the legacy user
 * format, keeping only the bits present in CAP_VALID_MASK.
 */
static kernel_cap_t mk_kernel_cap(u32 low, u32 high)
{
        u64 combined = low | ((u64)high << 32);

        return (kernel_cap_t) { combined & CAP_VALID_MASK };
}

/**
 * sys_capset - set capabilities for a process or (*) a group of processes
 * @header: pointer to struct that contains capability version and
 *        target pid data
 * @data: pointer to struct that contains the effective, permitted,
 *        and inheritable capabilities
 *
 * Set capabilities for the current process only.  The ability to set the
 * capabilities of any other process(es) has been deprecated and removed.
 *
 * The restrictions on setting capabilities are specified as:
 *
 * I: any raised capabilities must be a subset of the old permitted
 * P: any raised capabilities must be a subset of the old permitted
 * E: must be set to a subset of new permitted
 *
 * Returns 0 on success and < 0 on error.
 */
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
        /* Zero-filled so that halves the caller does not supply read as 0. */
        struct __user_cap_data_struct kdata[2] = { { 0, }, };
        kernel_cap_t inheritable, permitted, effective;
        unsigned tocopy, copybytes;
        struct cred *new;
        pid_t pid;
        int ret;

        ret = cap_validate_magic(header, &tocopy);
        if (ret != 0)
                return ret;

        if (get_user(pid, &header->pid))
                return -EFAULT;

        /* may only affect current now */
        if (pid != 0 && pid != task_pid_vnr(current))
                return -EPERM;

        /* Never copy in more than kdata can hold. */
        copybytes = tocopy * sizeof(struct __user_cap_data_struct);
        if (copybytes > sizeof(kdata))
                return -EFAULT;

        if (copy_from_user(&kdata, data, copybytes))
                return -EFAULT;

        /* Recombine the low/high 32-bit halves of each set. */
        effective   = mk_kernel_cap(kdata[0].effective,   kdata[1].effective);
        permitted   = mk_kernel_cap(kdata[0].permitted,   kdata[1].permitted);
        inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable);

        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        ret = security_capset(new, current_cred(),
                              &effective, &inheritable, &permitted);
        if (ret < 0) {
                /* Rejected: discard the prepared credentials. */
                abort_creds(new);
                return ret;
        }

        audit_log_capset(new, current_cred());

        return commit_creds(new);
}

/**
 * has_ns_capability - Does a task have a capability in a specific user ns
 * @t: The task in question
 * @ns: target user namespace
 * @cap: The capability to be tested for
 *
 * Return true if the specified task has the given superior capability
 * currently in effect to the specified user namespace, false if not.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_ns_capability(struct task_struct *t,
                       struct user_namespace *ns, int cap)
{
        bool granted;

        /* The task's credentials are only looked at under rcu_read_lock(). */
        rcu_read_lock();
        granted = (security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE) == 0);
        rcu_read_unlock();

        return granted;
}

/**
 * has_capability - Does a task have a capability in init_user_ns
 * @t: The task in question
 * @cap: The capability to be tested for
 *
 * Return true if the specified task has the given superior capability
 * currently in effect to the initial user namespace, false if not.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_capability(struct task_struct *t, int cap)
{
        /* Same check as has_ns_capability(), fixed to init_user_ns. */
        return has_ns_capability(t, &init_user_ns, cap);
}
EXPORT_SYMBOL(has_capability);

/**
 * has_ns_capability_noaudit - Test whether a task holds a capability in a
 * user ns, asking the security layer not to audit the check.
 * @t: The task to examine
 * @ns: The user namespace the capability is checked against
 * @cap: The capability to test for
 *
 * Return true when security_capable() reports success (0) for @t's
 * credentials, @ns and @cap; return false otherwise.  The check is made
 * with CAP_OPT_NOAUDIT.
 *
 * PF_SUPERPRIV is not set on the task by this helper.
 */
bool has_ns_capability_noaudit(struct task_struct *t,
                               struct user_namespace *ns, int cap)
{
        int rc;

        /* The task's credentials are only looked at inside the RCU section. */
        rcu_read_lock();
        rc = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT);
        rcu_read_unlock();

        return !rc;
}

/**
 * has_capability_noaudit - Test whether a task holds a capability in the
 * initial user ns, without auditing the check
 * @t: The task to examine
 * @cap: The capability to test for
 *
 * Shorthand for has_ns_capability_noaudit() with init_user_ns as the
 * target: true if @t has @cap in effect there, false if not.
 *
 * PF_SUPERPRIV is not set on the task by this helper.
 */
bool has_capability_noaudit(struct task_struct *t, int cap)
{
        struct user_namespace *init_ns = &init_user_ns;

        return has_ns_capability_noaudit(t, init_ns, cap);
}
EXPORT_SYMBOL(has_capability_noaudit);

/*
 * Shared implementation of the ns_capable*() family: check @cap for the
 * current task's credentials in @ns, passing @opts through to
 * security_capable().  On success PF_SUPERPRIV is set on current and true
 * is returned; otherwise false is returned.
 */
static bool ns_capable_common(struct user_namespace *ns,
                              int cap,
                              unsigned int opts)
{
        /* An out-of-range capability number is reported and then hits BUG(). */
        if (unlikely(!cap_valid(cap))) {
                pr_crit("capable() called with invalid cap=%u\n", cap);
                BUG();
        }

        if (security_capable(current_cred(), ns, cap, opts) != 0)
                return false;

        current->flags |= PF_SUPERPRIV;
        return true;
}

/**
 * ns_capable - Check whether the current task has a capability in effect
 * @ns:  The user namespace the capability is wanted in
 * @cap: The capability to test for
 *
 * Return true if the current task has @cap available for use in @ns,
 * false if not.
 *
 * On success PF_SUPERPRIV is set on the task, on the assumption that the
 * capability is about to be used.
 */
bool ns_capable(struct user_namespace *ns, int cap)
{
        const unsigned int opts = CAP_OPT_NONE;

        return ns_capable_common(ns, cap, opts);
}
EXPORT_SYMBOL(ns_capable);

/**
 * ns_capable_noaudit - Check whether the current task has a capability in
 * effect, without auditing the check
 * @ns:  The user namespace the capability is wanted in
 * @cap: The capability to test for
 *
 * Return true if the current task has @cap available for use in @ns,
 * false if not.  The check is made with CAP_OPT_NOAUDIT.
 *
 * On success PF_SUPERPRIV is set on the task, on the assumption that the
 * capability is about to be used.
 */
bool ns_capable_noaudit(struct user_namespace *ns, int cap)
{
        const unsigned int opts = CAP_OPT_NOAUDIT;

        return ns_capable_common(ns, cap, opts);
}
EXPORT_SYMBOL(ns_capable_noaudit);

/**
 * ns_capable_setid - Check whether the current task has a capability in
 * effect, flagging that the check comes from a setid or setgroups syscall.
 * @ns:  The user namespace the capability is wanted in
 * @cap: The capability to test for
 *
 * Return true if the current task has @cap available for use in @ns,
 * false if not.  The check is made with CAP_OPT_INSETID.
 *
 * On success PF_SUPERPRIV is set on the task, on the assumption that the
 * capability is about to be used.
 */
bool ns_capable_setid(struct user_namespace *ns, int cap)
{
        const unsigned int opts = CAP_OPT_INSETID;

        return ns_capable_common(ns, cap, opts);
}
EXPORT_SYMBOL(ns_capable_setid);

/**
 * capable - Check whether the current task has a capability in effect
 * @cap: The capability to test for
 *
 * Shorthand for ns_capable() against the initial user namespace: true if
 * the current task has @cap available for use there, false if not.
 *
 * On success PF_SUPERPRIV is set on the task, on the assumption that the
 * capability is about to be used.
 */
bool capable(int cap)
{
        struct user_namespace *init_ns = &init_user_ns;

        return ns_capable(init_ns, cap);
}
EXPORT_SYMBOL(capable);
#endif /* CONFIG_MULTIUSER */

/**
 * file_ns_capable - Check a capability against the credentials saved in a file
 * @file:  The file whose f_cred is checked
 * @ns:  The user namespace the capability is wanted in
 * @cap: The capability to test for
 *
 * Return true if @file->f_cred (the opener's credentials) carries @cap in
 * @ns.  An invalid @cap triggers a one-time warning and yields false.
 *
 * PF_SUPERPRIV is not set here because the caller may not actually be
 * privileged.
 */
bool file_ns_capable(const struct file *file, struct user_namespace *ns,
                     int cap)
{
        if (WARN_ON_ONCE(!cap_valid(cap)))
                return false;

        return security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0;
}
EXPORT_SYMBOL(file_ns_capable);

/**
 * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
 * @ns: The user namespace in question
 * @idmap: idmap of the mount @inode was found from
 * @inode: The inode in question
 *
 * Return true only if both the inode's uid and its gid, translated through
 * @idmap, have a mapping in @ns.
 */
bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
                                 struct mnt_idmap *idmap,
                                 const struct inode *inode)
{
        /* The gid is only examined once the uid is known to be mapped. */
        if (!vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)))
                return false;

        return vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode));
}

/**
 * capable_wrt_inode_uidgid - Check a capability and the inode's uid/gid mapping
 * @idmap: idmap of the mount @inode was found from
 * @inode: The inode in question
 * @cap: The capability in question
 *
 * Return true if the current task has @cap in its own user namespace
 * (checked with ns_capable()) and @inode's uid and gid are both mapped
 * into that namespace.
 */
bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap,
                              const struct inode *inode, int cap)
{
        struct user_namespace *user_ns = current_user_ns();

        /* The capability is tested first; the mapping only if it is held. */
        if (!ns_capable(user_ns, cap))
                return false;

        return privileged_wrt_inode_uidgid(user_ns, idmap, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);

/**
 * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
 * @tsk: The task that may be ptraced
 * @ns: The user namespace to search for CAP_SYS_PTRACE in
 *
 * Return true if @tsk has no ptracer credentials recorded, or if the
 * recorded ptracer credentials carry CAP_SYS_PTRACE in @ns.  The check is
 * made with CAP_OPT_NOAUDIT.
 */
bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
{
        const struct cred *tracer_cred;
        bool allowed = true;  /* An absent tracer adds no restrictions */

        rcu_read_lock();
        tracer_cred = rcu_dereference(tsk->ptracer_cred);
        if (tracer_cred)
                allowed = security_capable(tracer_cred, ns, CAP_SYS_PTRACE,
                                           CAP_OPT_NOAUDIT) == 0;
        rcu_read_unlock();

        return allowed;
}




























    3 







    2 




    1 







    1 



































































































































































































































































































































































    3 


    3 

































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
// SPDX-License-Identifier: GPL-2.0
/*
 * trace context switch
 *
 * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
 *
 */
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <linux/kmemleak.h>
#include <linux/ftrace.h>
#include <trace/events/sched.h>

#include "trace.h"

/*
 * Selectors handed to tracing_start_sched_switch()/tracing_stop_sched_switch().
 * The probes below also combine them into the flags word they pass to
 * tracing_record_taskinfo_sched_switch().
 */
#define RECORD_CMDLINE        1
#define RECORD_TGID        2

/* Outstanding start requests for comm recording and for tgid recording. */
static int                sched_cmdline_ref;
static int                sched_tgid_ref;
/* Held while the counters above change and the probes are (un)registered. */
static DEFINE_MUTEX(sched_register_mutex);

/*
 * sched_switch tracepoint probe: record task info for @prev and @next,
 * limited to the kinds of info (comm, tgid) that currently have users.
 */
static void
probe_sched_switch(void *ignore, bool preempt,
                   struct task_struct *prev, struct task_struct *next,
                   unsigned int prev_state)
{
        int flags = 0;

        if (sched_tgid_ref)
                flags |= RECORD_TGID;
        if (sched_cmdline_ref)
                flags |= RECORD_CMDLINE;

        /* Nothing is being recorded at the moment. */
        if (flags)
                tracing_record_taskinfo_sched_switch(prev, next, flags);
}

/*
 * Wakeup tracepoint probe: record task info for the current task and for
 * @wakee, limited to the kinds of info (comm, tgid) that currently have
 * users.
 */
static void
probe_sched_wakeup(void *ignore, struct task_struct *wakee)
{
        int flags = 0;

        if (sched_tgid_ref)
                flags |= RECORD_TGID;
        if (sched_cmdline_ref)
                flags |= RECORD_CMDLINE;

        /* Nothing is being recorded at the moment. */
        if (flags)
                tracing_record_taskinfo_sched_switch(current, wakee, flags);
}

/*
 * Attach the probes to the sched_wakeup, sched_wakeup_new and sched_switch
 * tracepoints, in that order.  If one registration fails, the ones that
 * already succeeded are undone and the error is returned; 0 on success.
 */
static int tracing_sched_register(void)
{
        int err;

        err = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
        if (err) {
                pr_info("wakeup trace: Couldn't activate tracepoint"
                        " probe to kernel_sched_wakeup\n");
                return err;
        }

        err = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
        if (err) {
                pr_info("wakeup trace: Couldn't activate tracepoint"
                        " probe to kernel_sched_wakeup_new\n");
                goto undo_wakeup;
        }

        err = register_trace_sched_switch(probe_sched_switch, NULL);
        if (!err)
                return 0;

        pr_info("sched trace: Couldn't activate tracepoint"
                " probe to kernel_sched_switch\n");
        unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
undo_wakeup:
        unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
        return err;
}

/*
 * Detach the three probes attached by tracing_sched_register(), in the
 * reverse of the order in which they were registered.
 */
static void tracing_sched_unregister(void)
{
        unregister_trace_sched_switch(probe_sched_switch, NULL);
        unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
        unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
}

static void tracing_start_sched_switch(int ops)
{
        bool sched_register;

        mutex_lock(&sched_register_mutex);
        sched_register = (!sched_cmdline_ref && !sched_tgid_ref);

        switch (ops) {
        case RECORD_CMDLINE:
                sched_cmdline_ref++;
                break;

        case RECORD_TGID:
                sched_tgid_ref++;
                break;
        }

        if (sched_register && (sched_cmdline_ref || sched_tgid_ref))
                tracing_sched_register();
        mutex_unlock(&sched_register_mutex);
}

/*
 * Drop a reference on the kind of recording selected by @ops
 * (RECORD_CMDLINE or RECORD_TGID; other values change no counter).  The
 * probes are unregistered whenever both counters are zero afterwards.
 */
static void tracing_stop_sched_switch(int ops)
{
        mutex_lock(&sched_register_mutex);

        if (ops == RECORD_CMDLINE)
                sched_cmdline_ref--;
        else if (ops == RECORD_TGID)
                sched_tgid_ref--;

        if (!(sched_cmdline_ref || sched_tgid_ref))
                tracing_sched_unregister();
        mutex_unlock(&sched_register_mutex);
}

/* Take a reference on comm (cmdline) recording. */
void tracing_start_cmdline_record(void)
{
        const int what = RECORD_CMDLINE;

        tracing_start_sched_switch(what);
}

/* Drop a reference on comm (cmdline) recording. */
void tracing_stop_cmdline_record(void)
{
        const int what = RECORD_CMDLINE;

        tracing_stop_sched_switch(what);
}

/* Take a reference on tgid recording. */
void tracing_start_tgid_record(void)
{
        const int what = RECORD_TGID;

        tracing_start_sched_switch(what);
}

/* Drop a reference on tgid recording. */
void tracing_stop_tgid_record(void)
{
        const int what = RECORD_TGID;

        tracing_stop_sched_switch(what);
}

/*
 * The tgid_map array maps from pid to tgid; i.e. the value stored at index i
 * is the tgid last observed corresponding to pid=i.
 */
static int *tgid_map;

/* The maximum valid index into tgid_map. */
static size_t tgid_map_max;

/* Number of entries requested when trace_create_savedcmd() builds the buffer. */
#define SAVED_CMDLINES_DEFAULT 128
/* Marker for an unused slot in map_pid_to_cmdline[] and map_cmdline_to_pid[]. */
#define NO_CMDLINE_MAP UINT_MAX
/*
 * Preemption must be disabled before acquiring trace_cmdline_lock.
 * The various trace_arrays' max_lock must be acquired in a context
 * where interrupt is disabled.
 */
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
/*
 * Saved task comm strings together with the lookup tables used to find
 * them.  Header, strings and map_cmdline_to_pid[] share one allocation
 * (see allocate_cmdlines_buffer()).
 */
struct saved_cmdlines_buffer {
        /* Slot index for (pid & (PID_MAX_DEFAULT - 1)), or NO_CMDLINE_MAP. */
        unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
        /* Full pid owning each slot; points just past saved_cmdlines[]. */
        unsigned *map_cmdline_to_pid;
        /* Number of slots in saved_cmdlines[] and map_cmdline_to_pid[]. */
        unsigned cmdline_num;
        /* Slot most recently handed out by trace_save_cmdline(). */
        int cmdline_idx;
        /* cmdline_num strings of TASK_COMM_LEN bytes each. */
        char saved_cmdlines[];
};
/* Buffer in use; swapped under trace_cmdline_lock on resize. */
static struct saved_cmdlines_buffer *savedcmd;

/*
 * Bytes consumed per saved entry: one TASK_COMM_LEN comm string plus one
 * map_cmdline_to_pid[] element.  @s appears only inside sizeof.
 */
#define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s)                        \
        (TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0]))

/* Return the start of the TASK_COMM_LEN-byte comm slot numbered @idx. */
static inline char *get_saved_cmdlines(int idx)
{
        return savedcmd->saved_cmdlines + idx * TASK_COMM_LEN;
}

/*
 * Store @cmdline in comm slot @idx.
 *
 * strscpy() is used instead of strncpy() so the stored string is always
 * NUL-terminated within its TASK_COMM_LEN-byte slot, even if @cmdline is
 * longer than the slot (it is then truncated).  The truncation return
 * value is deliberately ignored: a shortened comm is still useful.
 */
static inline void set_cmdline(int idx, const char *cmdline)
{
        strscpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
}

/*
 * Release a buffer obtained from allocate_cmdlines_buffer().
 *
 * The page order is recomputed with the same per-entry size the allocator
 * uses (comm string plus map_cmdline_to_pid[] element), so the free path
 * mirrors the allocation path rather than depending on a different formula
 * that rounds to the same order.
 */
static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
{
        int order = get_order(sizeof(*s) +
                              s->cmdline_num * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s));

        kmemleak_free(s);
        free_pages((unsigned long)s, order);
}

/*
 * Allocate and initialise a saved_cmdlines_buffer with room for at least
 * @val entries.  The entry count is then raised to whatever fits in the
 * pages actually allocated.  Returns NULL if the pages cannot be obtained.
 */
static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
{
        struct saved_cmdlines_buffer *buf;
        struct page *pg;
        int wanted, alloc_size;
        int order;

        /* Header plus one comm string and one pid element per entry. */
        wanted = sizeof(*buf) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(buf);
        order = get_order(wanted);
        alloc_size = 1 << (order + PAGE_SHIFT);

        pg = alloc_pages(GFP_KERNEL, order);
        if (!pg)
                return NULL;

        buf = page_address(pg);
        kmemleak_alloc(buf, alloc_size, 1, GFP_KERNEL);
        memset(buf, 0, sizeof(*buf));

        /* Use the whole allocation, not just the part that was asked for. */
        val = (alloc_size - sizeof(*buf)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(buf);
        buf->cmdline_num = val;

        /* The pid table sits directly behind the comm strings. */
        buf->map_cmdline_to_pid =
                (unsigned *)&buf->saved_cmdlines[val * TASK_COMM_LEN];
        buf->cmdline_idx = 0;

        /* Every byte set to 0xff makes each unsigned entry NO_CMDLINE_MAP. */
        memset(&buf->map_pid_to_cmdline, NO_CMDLINE_MAP,
               sizeof(buf->map_pid_to_cmdline));
        memset(buf->map_cmdline_to_pid, NO_CMDLINE_MAP,
               val * sizeof(*buf->map_cmdline_to_pid));

        return buf;
}

/*
 * Create the initial saved-cmdlines buffer sized for
 * SAVED_CMDLINES_DEFAULT entries.  Returns 0 on success, -ENOMEM if the
 * allocation fails.
 */
int trace_create_savedcmd(void)
{
        savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);
        if (!savedcmd)
                return -ENOMEM;

        return 0;
}

/*
 * Remember @tsk's comm in the saved-cmdlines buffer.
 *
 * Returns 1 if the comm was stored (or @tsk is the idle task, pid 0, which
 * is treated as stored) and 0 if trace_cmdline_lock could not be taken.
 */
int trace_save_cmdline(struct task_struct *tsk)
{
        unsigned hashed_pid, slot;

        /* treat recording of idle task as a success */
        if (!tsk->pid)
                return 1;

        hashed_pid = tsk->pid & (PID_MAX_DEFAULT - 1);

        /*
         * Missing the lock is acceptable: rather than spin or disable
         * interrupts, give up and let a later call record the comm.
         *
         * This is called within the scheduler and wake up, so interrupts
         * had better been disabled and run queue lock been held.
         */
        lockdep_assert_preemption_disabled();
        if (!arch_spin_trylock(&trace_cmdline_lock))
                return 0;

        slot = savedcmd->map_pid_to_cmdline[hashed_pid];
        if (slot == NO_CMDLINE_MAP) {
                /* No slot yet for this pid: take the next one in the ring. */
                slot = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;

                savedcmd->map_pid_to_cmdline[hashed_pid] = slot;
                savedcmd->cmdline_idx = slot;
        }

        savedcmd->map_cmdline_to_pid[slot] = tsk->pid;
        set_cmdline(slot, tsk->comm);

        arch_spin_unlock(&trace_cmdline_lock);

        return 1;
}

/*
 * Copy the saved comm for @pid into @comm.  Placeholders are used when
 * there is nothing to copy: "<idle>" for pid 0, "<XXX>" for a negative pid
 * (which also warns once), and "<...>" when no slot is currently owned by
 * @pid.  No lock is taken here; the callers in this file hold
 * trace_cmdline_lock.
 */
static void __trace_find_cmdline(int pid, char comm[])
{
        unsigned slot;
        int owner;

        if (!pid) {
                strcpy(comm, "<idle>");
                return;
        }

        if (WARN_ON_ONCE(pid < 0)) {
                strcpy(comm, "<XXX>");
                return;
        }

        slot = savedcmd->map_pid_to_cmdline[pid & (PID_MAX_DEFAULT - 1)];
        if (slot == NO_CMDLINE_MAP)
                goto unknown;

        /* The slot may have been reused by a pid with the same low bits. */
        owner = savedcmd->map_cmdline_to_pid[slot];
        if (owner != pid)
                goto unknown;

        strscpy(comm, get_saved_cmdlines(slot), TASK_COMM_LEN);
        return;

unknown:
        strcpy(comm, "<...>");
}

/*
 * Look up the saved comm for @pid and copy it (or a placeholder) into
 * @comm.  Wraps __trace_find_cmdline() with preemption disabled and
 * trace_cmdline_lock held.
 */
void trace_find_cmdline(int pid, char comm[])
{
        /* Preemption must be off before trace_cmdline_lock is taken. */
        preempt_disable();
        arch_spin_lock(&trace_cmdline_lock);

        __trace_find_cmdline(pid, comm);

        arch_spin_unlock(&trace_cmdline_lock);
        preempt_enable();
}

/*
 * Return the address of the tgid_map entry for @pid, or NULL when no map
 * has been allocated or @pid is beyond tgid_map_max.
 */
static int *trace_find_tgid_ptr(int pid)
{
        /*
         * Pairs with the smp_store_release() in trace_alloc_tgid_map() so
         * that seeing a non-NULL tgid_map implies seeing the matching
         * tgid_map_max.
         */
        int *map = smp_load_acquire(&tgid_map);

        if (unlikely(!map))
                return NULL;
        if (unlikely(pid > tgid_map_max))
                return NULL;

        return map + pid;
}

/* Return the tgid recorded for @pid, or 0 when no map entry exists for it. */
int trace_find_tgid(int pid)
{
        int *slot = trace_find_tgid_ptr(pid);

        if (!slot)
                return 0;

        return *slot;
}

/*
 * Record @tsk's tgid in tgid_map.  Returns 1 if it was stored (or @tsk is
 * the idle task, pid 0) and 0 when there is no map entry for its pid.
 */
static int trace_save_tgid(struct task_struct *tsk)
{
        int *slot;

        /* treat recording of idle task as a success */
        if (!tsk->pid)
                return 1;

        slot = trace_find_tgid_ptr(tsk->pid);
        if (slot) {
                *slot = tsk->tgid;
                return 1;
        }

        return 0;
}

/*
 * Decide whether recording can be skipped: true when @flags requests
 * neither comm nor tgid, or when this CPU's trace_taskinfo_save is clear.
 */
static bool tracing_record_taskinfo_skip(int flags)
{
        if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
                return true;

        return !__this_cpu_read(trace_taskinfo_save);
}

/**
 * tracing_record_taskinfo - record the task info of a task
 *
 * @task:  task to record
 * @flags: TRACE_RECORD_CMDLINE for recording comm
 *         TRACE_RECORD_TGID for recording tgid
 *
 * Every kind of info requested in @flags is attempted, even if an earlier
 * one failed.  This CPU's trace_taskinfo_save is cleared only when all of
 * them were recorded, so a failure is retried on a later call.
 */
void tracing_record_taskinfo(struct task_struct *task, int flags)
{
        bool all_saved = true;

        if (tracing_record_taskinfo_skip(flags))
                return;

        if ((flags & TRACE_RECORD_CMDLINE) && !trace_save_cmdline(task))
                all_saved = false;
        if ((flags & TRACE_RECORD_TGID) && !trace_save_tgid(task))
                all_saved = false;

        /* Leave the per-CPU flag set if anything still needs recording. */
        if (all_saved)
                __this_cpu_write(trace_taskinfo_save, false);
}

/**
 * tracing_record_taskinfo_sched_switch - record task info for sched_switch
 *
 * @prev: previous task during sched_switch
 * @next: next task during sched_switch
 * @flags: TRACE_RECORD_CMDLINE for recording comm
 *         TRACE_RECORD_TGID for recording tgid
 *
 * Every kind of info requested in @flags is attempted for both tasks, even
 * if an earlier attempt failed.  This CPU's trace_taskinfo_save is cleared
 * only when all of them were recorded, so a failure is retried later.
 */
void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
                                          struct task_struct *next, int flags)
{
        bool all_saved = true;

        if (tracing_record_taskinfo_skip(flags))
                return;

        if ((flags & TRACE_RECORD_CMDLINE) && !trace_save_cmdline(prev))
                all_saved = false;
        if ((flags & TRACE_RECORD_CMDLINE) && !trace_save_cmdline(next))
                all_saved = false;
        if ((flags & TRACE_RECORD_TGID) && !trace_save_tgid(prev))
                all_saved = false;
        if ((flags & TRACE_RECORD_TGID) && !trace_save_tgid(next))
                all_saved = false;

        /* Leave the per-CPU flag set if anything still needs recording. */
        if (all_saved)
                __this_cpu_write(trace_taskinfo_save, false);
}

/* Helpers to record a specific task information */
void tracing_record_cmdline(struct task_struct *task)
{
        /* Only the comm is requested. */
        const int flags = TRACE_RECORD_CMDLINE;

        tracing_record_taskinfo(task, flags);
}

void tracing_record_tgid(struct task_struct *task)
{
        /* Only the tgid is requested. */
        const int flags = TRACE_RECORD_TGID;

        tracing_record_taskinfo(task, flags);
}

/*
 * Allocate the pid -> tgid map if it does not exist yet, sized for pids
 * 0..pid_max as read at the time of the call.  Returns 0 if the map exists
 * or was created, -ENOMEM if the allocation fails.
 */
int trace_alloc_tgid_map(void)
{
        int *new_map;

        if (tgid_map)
                return 0;

        tgid_map_max = pid_max;
        new_map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
                           GFP_KERNEL);
        if (!new_map)
                return -ENOMEM;

        /*
         * Publish the map only after tgid_map_max is set.  Pairs with the
         * smp_load_acquire() in trace_find_tgid_ptr(), so a reader that
         * sees this map also sees the tgid_map_max that belongs to it.
         */
        smp_store_release(&tgid_map, new_map);
        return 0;
}

/*
 * seq_file ->next: advance *pos and return the tgid_map entry for that
 * pid, or NULL once the position is past the end of the map.
 */
static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
{
        *pos += 1;

        return trace_find_tgid_ptr(*pos);
}

/*
 * seq_file ->start: treat *pos as a pid and return its tgid_map entry, or
 * NULL when there is no map or the pid is out of range.
 */
static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
{
        return trace_find_tgid_ptr(*pos);
}

/* seq_file ->stop: saved_tgids_start() takes no lock, so nothing to undo. */
static void saved_tgids_stop(struct seq_file *m, void *v)
{
}

/*
 * seq_file ->show: print "<pid> <tgid>" for the tgid_map entry @v.  The pid
 * is the entry's index in tgid_map.  Entries holding 0 are skipped.
 */
static int saved_tgids_show(struct seq_file *m, void *v)
{
        const int *slot = v;
        int tgid = *slot;

        if (!tgid)
                return SEQ_SKIP;

        seq_printf(m, "%d %d\n", (int)(slot - tgid_map), tgid);
        return 0;
}

/* seq_file iterator handed to seq_open() by tracing_saved_tgids_open(). */
static const struct seq_operations tracing_saved_tgids_seq_ops = {
        .start                = saved_tgids_start,
        .stop                = saved_tgids_stop,
        .next                = saved_tgids_next,
        .show                = saved_tgids_show,
};

/*
 * ->open for the saved-tgids file: run tracing_check_open_get_tr(NULL)
 * and, if it returns 0, set up the seq_file iterator over tgid_map.
 */
static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
{
        int err = tracing_check_open_get_tr(NULL);

        if (err)
                return err;

        return seq_open(filp, &tracing_saved_tgids_seq_ops);
}


/*
 * File operations for the saved-tgids listing: opened through
 * tracing_saved_tgids_open(), then served by the generic seq_file
 * read/llseek/release helpers.
 */
const struct file_operations tracing_saved_tgids_fops = {
        .open                = tracing_saved_tgids_open,
        .read                = seq_read,
        .llseek                = seq_lseek,
        .release        = seq_release,
};

/*
 * seq_file ->next: return the next map_cmdline_to_pid[] entry at or after
 * the cursor that holds a pid, or NULL when the table is exhausted.
 */
static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
{
        unsigned int *slot = v;
        unsigned int *end;

        /* Step past @v unless both *pos and m->count are still zero. */
        if (*pos || m->count)
                slot++;

        (*pos)++;

        end = &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
        while (slot < end) {
                /* Unused slots hold NO_CMDLINE_MAP (all bits set). */
                if (*slot != NO_CMDLINE_MAP)
                        return slot;
                slot++;
        }

        return NULL;
}

/*
 * seq_file ->start: disable preemption, take trace_cmdline_lock and walk
 * from the first slot to the entry at position *pos.  Returns that entry,
 * or NULL if the table ends first.  The lock is left held on every return
 * path; the matching release is in saved_cmdlines_stop().
 */
static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
{
        void *cursor;
        loff_t idx;

        preempt_disable();
        arch_spin_lock(&trace_cmdline_lock);

        cursor = &savedcmd->map_cmdline_to_pid[0];
        for (idx = 0; idx <= *pos; ) {
                cursor = saved_cmdlines_next(m, cursor, &idx);
                if (!cursor)
                        break;
        }

        return cursor;
}

/*
 * seq_file ->stop: release trace_cmdline_lock and re-enable preemption,
 * undoing what saved_cmdlines_start() did, in reverse order.
 */
static void saved_cmdlines_stop(struct seq_file *m, void *v)
{
        arch_spin_unlock(&trace_cmdline_lock);
        preempt_enable();
}

/*
 * seq_file ->show: print "<pid> <comm>" for the map_cmdline_to_pid[] entry
 * @v.  Takes no lock itself; saved_cmdlines_start() acquires
 * trace_cmdline_lock and saved_cmdlines_stop() releases it.
 */
static int saved_cmdlines_show(struct seq_file *m, void *v)
{
        char comm[TASK_COMM_LEN];
        unsigned int *entry = v;

        __trace_find_cmdline(*entry, comm);
        seq_printf(m, "%d %s\n", *entry, comm);
        return 0;
}

/*
 * seq_file iterator handed to seq_open() by tracing_saved_cmdlines_open().
 * ->start takes trace_cmdline_lock and ->stop releases it.
 */
static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
        .start                = saved_cmdlines_start,
        .next                = saved_cmdlines_next,
        .stop                = saved_cmdlines_stop,
        .show                = saved_cmdlines_show,
};

/*
 * ->open for the saved-cmdlines file: run tracing_check_open_get_tr(NULL)
 * and, if it returns 0, set up the seq_file iterator over the saved comms.
 */
static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
{
        int err = tracing_check_open_get_tr(NULL);

        if (err)
                return err;

        return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
}

/*
 * File operations for the saved-cmdlines listing: opened through
 * tracing_saved_cmdlines_open(), then served by the generic seq_file
 * read/llseek/release helpers.
 */
const struct file_operations tracing_saved_cmdlines_fops = {
        .open                = tracing_saved_cmdlines_open,
        .read                = seq_read,
        .llseek                = seq_lseek,
        .release        = seq_release,
};

/*
 * ->read for the saved-cmdlines size file: report the current number of
 * slots (savedcmd->cmdline_num) as a decimal line.  The count is sampled
 * under trace_cmdline_lock and copied to user space after it is dropped.
 */
static ssize_t
tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
                                 size_t cnt, loff_t *ppos)
{
        char tmp[64];
        int len;

        preempt_disable();
        arch_spin_lock(&trace_cmdline_lock);
        len = scnprintf(tmp, sizeof(tmp), "%u\n", savedcmd->cmdline_num);
        arch_spin_unlock(&trace_cmdline_lock);
        preempt_enable();

        return simple_read_from_buffer(ubuf, cnt, ppos, tmp, len);
}

/* Free the buffer savedcmd points at; savedcmd itself is left unchanged. */
void trace_free_saved_cmdlines_buffer(void)
{
        struct saved_cmdlines_buffer *active = savedcmd;

        free_saved_cmdlines_buffer(active);
}

static int tracing_resize_saved_cmdlines(unsigned int val)
{
        struct saved_cmdlines_buffer *s, *savedcmd_temp;

        s = allocate_cmdlines_buffer(val);
        if (!s)
                return -ENOMEM;

        preempt_disable();
        arch_spin_lock(&trace_cmdline_lock);
        savedcmd_temp = savedcmd;
        savedcmd = s;
        arch_spin_unlock(&trace_cmdline_lock);
        preempt_enable();
        free_saved_cmdlines_buffer(savedcmd_temp);

        return 0;
}

/*
 * ->write for the saved-cmdlines size file: parse a decimal entry count
 * from user space and resize the buffer to it.  Returns @cnt on success,
 * the parse error, -EINVAL for a count outside 1..PID_MAX_DEFAULT, or the
 * error from the resize.
 */
static ssize_t
tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
                                  size_t cnt, loff_t *ppos)
{
        unsigned long entries;
        int err;

        err = kstrtoul_from_user(ubuf, cnt, 10, &entries);
        if (err)
                return err;

        /* Accept only 1..PID_MAX_DEFAULT entries. */
        if (entries == 0 || entries > PID_MAX_DEFAULT)
                return -EINVAL;

        err = tracing_resize_saved_cmdlines((unsigned int)entries);
        if (err < 0)
                return err;

        *ppos += cnt;

        return cnt;
}

/*
 * File operations for the saved-cmdlines size file: reading reports the
 * current slot count, writing resizes the buffer.
 */
const struct file_operations tracing_saved_cmdlines_size_fops = {
        .open                = tracing_open_generic,
        .read                = tracing_saved_cmdlines_size_read,
        .write                = tracing_saved_cmdlines_size_write,
};











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __NET_CFG80211_H
#define __NET_CFG80211_H
/*
 * 802.11 device and configuration interface
 *
 * Copyright 2006-2010        Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014 Intel Mobile Communications GmbH
 * Copyright 2015-2017        Intel Deutschland GmbH
 * Copyright (C) 2018-2024 Intel Corporation
 */

#include <linux/ethtool.h>
#include <uapi/linux/rfkill.h>
#include <linux/netdevice.h>
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/bug.h>
#include <linux/netlink.h>
#include <linux/skbuff.h>
#include <linux/nl80211.h>
#include <linux/if_ether.h>
#include <linux/ieee80211.h>
#include <linux/net.h>
#include <linux/rfkill.h>
#include <net/regulatory.h>

/**
 * DOC: Introduction
 *
 * cfg80211 is the configuration API for 802.11 devices in Linux. It bridges
 * userspace and drivers, and offers some utility functionality associated
 * with 802.11. cfg80211 must, directly or indirectly via mac80211, be used
 * by all modern wireless drivers in Linux, so that they offer a consistent
 * API through nl80211. For backward compatibility, cfg80211 also offers
 * wireless extensions to userspace, but hides them from drivers completely.
 *
 * Additionally, cfg80211 contains code to help enforce regulatory spectrum
 * use restrictions.
 */


/**
 * DOC: Device registration
 *
 * In order for a driver to use cfg80211, it must register the hardware device
 * with cfg80211. This happens through a number of hardware capability structs
 * described below.
 *
 * The fundamental structure for each device is the 'wiphy', of which each
 * instance describes a physical wireless device connected to the system. Each
 * such wiphy can have zero, one, or many virtual interfaces associated with
 * it, which need to be identified as such by pointing the network interface's
 * @ieee80211_ptr pointer to a &struct wireless_dev which further describes
 * the wireless part of the interface. Normally this struct is embedded in the
 * network interface's private data area. Drivers can optionally allow creating
 * or destroying virtual interfaces on the fly, but without at least one
 * interface, or the ability to create some, the wireless device isn't useful.
 *
 * Each wiphy structure contains device capability information, and also has
 * a pointer to the various operations the driver offers. The definitions and
 * structures here describe these capabilities in detail.
 */

struct wiphy;

/*
 * wireless hardware capability structures
 */

/**
 * enum ieee80211_channel_flags - channel flags
 *
 * Per-channel flag bits, set by the regulatory control code.
 *
 * @IEEE80211_CHAN_DISABLED: The channel is disabled.
 * @IEEE80211_CHAN_NO_IR: Initiating radiation is not allowed; this
 *        includes sending probe requests and beaconing.
 * @IEEE80211_CHAN_PSD: A power spectral density (in dBm) is set for
 *        this channel.
 * @IEEE80211_CHAN_RADAR: Radar detection is required on this channel.
 * @IEEE80211_CHAN_NO_HT40PLUS: An extension channel above this channel
 *        is not permitted.
 * @IEEE80211_CHAN_NO_HT40MINUS: An extension channel below this channel
 *        is not permitted.
 * @IEEE80211_CHAN_NO_OFDM: OFDM is not allowed on this channel.
 * @IEEE80211_CHAN_NO_80MHZ: If the driver supports 80 MHz on the band,
 *        an 80 MHz channel cannot use this channel as the control
 *        channel or as any of the secondary channels. The restriction
 *        may come from the driver or from regulatory bandwidth limits.
 * @IEEE80211_CHAN_NO_160MHZ: If the driver supports 160 MHz on the band,
 *        a 160 MHz channel cannot use this channel as the control
 *        channel or as any of the secondary channels. The restriction
 *        may come from the driver or from regulatory bandwidth limits.
 * @IEEE80211_CHAN_INDOOR_ONLY: see %NL80211_FREQUENCY_ATTR_INDOOR_ONLY
 * @IEEE80211_CHAN_IR_CONCURRENT: see %NL80211_FREQUENCY_ATTR_IR_CONCURRENT
 * @IEEE80211_CHAN_NO_20MHZ: 20 MHz bandwidth is not permitted
 *        on this channel.
 * @IEEE80211_CHAN_NO_10MHZ: 10 MHz bandwidth is not permitted
 *        on this channel.
 * @IEEE80211_CHAN_NO_HE: HE operation is not permitted on this channel.
 * @IEEE80211_CHAN_1MHZ: 1 MHz bandwidth is permitted
 *        on this channel.
 * @IEEE80211_CHAN_2MHZ: 2 MHz bandwidth is permitted
 *        on this channel.
 * @IEEE80211_CHAN_4MHZ: 4 MHz bandwidth is permitted
 *        on this channel.
 * @IEEE80211_CHAN_8MHZ: 8 MHz bandwidth is permitted
 *        on this channel.
 * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted
 *        on this channel.
 * @IEEE80211_CHAN_NO_320MHZ: If the driver supports 320 MHz on the band,
 *        a 320 MHz channel cannot use this channel as the control
 *        channel or as any of the secondary channels. The restriction
 *        may come from the driver or from regulatory bandwidth limits.
 * @IEEE80211_CHAN_NO_EHT: EHT operation is not permitted on this channel.
 * @IEEE80211_CHAN_DFS_CONCURRENT: See %NL80211_RRF_DFS_CONCURRENT
 * @IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT: Client connection with VLP AP
 *        not permitted using this channel
 * @IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT: Client connection with AFC AP
 *        not permitted using this channel
 * @IEEE80211_CHAN_CAN_MONITOR: The channel can be used for monitor
 *        mode even in the presence of other (regulatory) restrictions,
 *        even if it is otherwise disabled.
 */
enum ieee80211_channel_flags {
        IEEE80211_CHAN_DISABLED           = 1 << 0,
        IEEE80211_CHAN_NO_IR              = 1 << 1,
        IEEE80211_CHAN_PSD                = 1 << 2,
        IEEE80211_CHAN_RADAR              = 1 << 3,
        IEEE80211_CHAN_NO_HT40PLUS        = 1 << 4,
        IEEE80211_CHAN_NO_HT40MINUS       = 1 << 5,
        IEEE80211_CHAN_NO_OFDM            = 1 << 6,
        IEEE80211_CHAN_NO_80MHZ           = 1 << 7,
        IEEE80211_CHAN_NO_160MHZ          = 1 << 8,
        IEEE80211_CHAN_INDOOR_ONLY        = 1 << 9,
        IEEE80211_CHAN_IR_CONCURRENT      = 1 << 10,
        IEEE80211_CHAN_NO_20MHZ           = 1 << 11,
        IEEE80211_CHAN_NO_10MHZ           = 1 << 12,
        IEEE80211_CHAN_NO_HE              = 1 << 13,
        IEEE80211_CHAN_1MHZ               = 1 << 14,
        IEEE80211_CHAN_2MHZ               = 1 << 15,
        IEEE80211_CHAN_4MHZ               = 1 << 16,
        IEEE80211_CHAN_8MHZ               = 1 << 17,
        IEEE80211_CHAN_16MHZ              = 1 << 18,
        IEEE80211_CHAN_NO_320MHZ          = 1 << 19,
        IEEE80211_CHAN_NO_EHT             = 1 << 20,
        IEEE80211_CHAN_DFS_CONCURRENT     = 1 << 21,
        IEEE80211_CHAN_NO_6GHZ_VLP_CLIENT = 1 << 22,
        IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT = 1 << 23,
        IEEE80211_CHAN_CAN_MONITOR        = 1 << 24,
};

/* Mask of both HT40 restrictions: no extension channel above or below. */
#define IEEE80211_CHAN_NO_HT40 (IEEE80211_CHAN_NO_HT40PLUS | IEEE80211_CHAN_NO_HT40MINUS)

/* Minimum DFS CAC time: 60 seconds, expressed in milliseconds. */
#define IEEE80211_DFS_MIN_CAC_TIME_MS                (60 * 1000)
/* Minimum DFS NOP time: 30 minutes, expressed in milliseconds. */
#define IEEE80211_DFS_MIN_NOP_TIME_MS                (30 * 60 * 1000)

/**
 * struct ieee80211_channel - channel definition
 *
 * Describes a single channel for use with cfg80211.
 *
 * @center_freq: center frequency in MHz
 * @freq_offset: offset from @center_freq, in kHz
 * @hw_value: hardware-specific value for the channel
 * @flags: channel flags from &enum ieee80211_channel_flags.
 * @orig_flags: channel flags at registration time, used by regulatory
 *        code to support devices with additional restrictions
 * @band: band this channel belongs to.
 * @max_antenna_gain: maximum antenna gain in dBi
 * @max_power: maximum transmission power (in dBm)
 * @max_reg_power: maximum regulatory transmission power (in dBm)
 * @beacon_found: helper for the regulatory code, indicating that a beacon
 *        has been found on this channel. Use regulatory_hint_found_beacon()
 *        to enable this; it is only useful on the 5 GHz band.
 * @orig_mag: internal use
 * @orig_mpwr: internal use
 * @dfs_state: current state of this channel. Only relevant if radar is
 *        required on this channel.
 * @dfs_state_entered: timestamp (jiffies) when the dfs state was entered.
 * @dfs_cac_ms: DFS CAC time in milliseconds; valid for DFS channels.
 * @psd: power spectral density (in dBm)
 */
struct ieee80211_channel {
        enum nl80211_band band;
        u32 center_freq;
        u16 freq_offset;
        u16 hw_value;
        u32 flags;
        int max_antenna_gain;
        int max_power;
        int max_reg_power;
        bool beacon_found;
        u32 orig_flags;
        int orig_mag;
        int orig_mpwr;
        enum nl80211_dfs_state dfs_state;
        unsigned long dfs_state_entered;
        unsigned int dfs_cac_ms;
        s8 psd;
};

/**
 * enum ieee80211_rate_flags - rate flags
 *
 * Hardware/specification flags for rates. They are structured so that
 * the same bitrate structure can be used for different bands/PHY modes.
 *
 * @IEEE80211_RATE_SHORT_PREAMBLE: The hardware can send with short
 *        preamble on this bitrate; only relevant in the 2.4 GHz band
 *        and with CCK rates.
 * @IEEE80211_RATE_MANDATORY_A: Mandatory rate when used with 802.11a
 *        (on the 5 GHz band); filled in by the core code when
 *        registering the wiphy.
 * @IEEE80211_RATE_MANDATORY_B: Mandatory rate when used with 802.11b
 *        (on the 2.4 GHz band); filled in by the core code when
 *        registering the wiphy.
 * @IEEE80211_RATE_MANDATORY_G: Mandatory rate when used with 802.11g
 *        (on the 2.4 GHz band); filled in by the core code when
 *        registering the wiphy.
 * @IEEE80211_RATE_ERP_G: This is an ERP rate in 802.11g mode.
 * @IEEE80211_RATE_SUPPORTS_5MHZ: Rate can be used in 5 MHz mode
 * @IEEE80211_RATE_SUPPORTS_10MHZ: Rate can be used in 10 MHz mode
 */
enum ieee80211_rate_flags {
        IEEE80211_RATE_SHORT_PREAMBLE = 1 << 0,
        IEEE80211_RATE_MANDATORY_A    = 1 << 1,
        IEEE80211_RATE_MANDATORY_B    = 1 << 2,
        IEEE80211_RATE_MANDATORY_G    = 1 << 3,
        IEEE80211_RATE_ERP_G          = 1 << 4,
        IEEE80211_RATE_SUPPORTS_5MHZ  = 1 << 5,
        IEEE80211_RATE_SUPPORTS_10MHZ = 1 << 6,
};

/**
 * enum ieee80211_bss_type - BSS type filter
 *
 * @IEEE80211_BSS_TYPE_ESS: match an infrastructure BSS
 * @IEEE80211_BSS_TYPE_PBSS: match a personal BSS
 * @IEEE80211_BSS_TYPE_IBSS: match an independent BSS
 * @IEEE80211_BSS_TYPE_MBSS: match a mesh BSS
 * @IEEE80211_BSS_TYPE_ANY: wildcard value that matches any BSS type
 */
enum ieee80211_bss_type {
        IEEE80211_BSS_TYPE_ESS,
        IEEE80211_BSS_TYPE_PBSS,
        IEEE80211_BSS_TYPE_IBSS,
        IEEE80211_BSS_TYPE_MBSS,
        IEEE80211_BSS_TYPE_ANY,
};

/**
 * enum ieee80211_privacy - BSS privacy filter
 *
 * @IEEE80211_PRIVACY_ON: the privacy bit is set
 * @IEEE80211_PRIVACY_OFF: the privacy bit is clear
 * @IEEE80211_PRIVACY_ANY: wildcard value that matches any privacy setting
 */
enum ieee80211_privacy {
        IEEE80211_PRIVACY_ON,
        IEEE80211_PRIVACY_OFF,
        IEEE80211_PRIVACY_ANY,
};

/* Map a truth value to IEEE80211_PRIVACY_ON (non-zero) or _OFF (zero). */
#define IEEE80211_PRIVACY(x) ((x) ? IEEE80211_PRIVACY_ON : IEEE80211_PRIVACY_OFF)

/**
 * struct ieee80211_rate - bitrate definition
 *
 * Describes one bitrate that an 802.11 PHY can operate with.
 * @hw_value and @hw_value_short are only for driver use when
 * pointers to this structure are passed around.
 *
 * @flags: rate-specific flags from &enum ieee80211_rate_flags
 * @bitrate: bitrate in units of 100 kbps
 * @hw_value: driver/hardware value for this rate
 * @hw_value_short: driver/hardware value for this rate when
 *        short preamble is used
 */
struct ieee80211_rate {
        u32 flags;
        u16 bitrate;
        u16 hw_value;
        u16 hw_value_short;
};

/**
 * struct ieee80211_he_obss_pd - AP settings for spatial reuse
 *
 * @enable: whether the feature is enabled.
 * @sr_ctrl: The SR Control field of SRP element.
 * @non_srg_max_offset: non-SRG maximum tx power offset
 * @min_offset: minimal tx power offset an associated station shall use
 * @max_offset: maximum tx power offset an associated station shall use
 * @bss_color_bitmap: 8-byte bitmap that indicates the BSS color values
 *        used by members of the SRG
 * @partial_bssid_bitmap: 8-byte bitmap that indicates the partial BSSID
 *        values used by members of the SRG
 */
struct ieee80211_he_obss_pd {
        bool enable;
        u8 sr_ctrl;
        u8 non_srg_max_offset;
        u8 min_offset;
        u8 max_offset;
        u8 bss_color_bitmap[8];
        u8 partial_bssid_bitmap[8];
};

/**
 * struct cfg80211_he_bss_color - AP settings for BSS coloring
 *
 * @color: the current color.
 * @enabled: whether HE BSS color is used.
 * @partial: define the AID equation.
 */
struct cfg80211_he_bss_color {
        u8 color;
        bool enabled;
        bool partial;
};

/**
 * struct ieee80211_sta_ht_cap - STA's HT capabilities
 *
 * This structure describes the most essential parameters needed
 * to describe 802.11n HT capabilities for a STA.
 *
 * @ht_supported: is HT supported by the STA
 * @cap: HT capabilities map as described in 802.11n spec
 *        (built from the IEEE80211_HT_CAP_ definitions)
 * @ampdu_factor: Maximum A-MPDU length factor
 * @ampdu_density: Minimum A-MPDU spacing
 * @mcs: Supported MCS rates
 */
struct ieee80211_sta_ht_cap {
        u16 cap; /* use IEEE80211_HT_CAP_ */
        bool ht_supported;
        u8 ampdu_factor;
        u8 ampdu_density;
        struct ieee80211_mcs_info mcs;
};

/**
 * struct ieee80211_sta_vht_cap - STA's VHT capabilities
 *
 * This structure describes the most essential parameters needed
 * to describe 802.11ac VHT capabilities for a STA.
 *
 * @vht_supported: is VHT supported by the STA
 * @cap: VHT capabilities map as described in 802.11ac spec
 *        (built from the IEEE80211_VHT_CAP_ definitions)
 * @vht_mcs: Supported VHT MCS rates
 */
struct ieee80211_sta_vht_cap {
        bool vht_supported;
        u32 cap; /* use IEEE80211_VHT_CAP_ */
        struct ieee80211_vht_mcs_info vht_mcs;
};

/* Size, in bytes, of the ppe_thres buffer in struct ieee80211_sta_he_cap */
#define IEEE80211_HE_PPE_THRES_MAX_LEN                25

/**
 * struct ieee80211_sta_he_cap - STA's HE capabilities
 *
 * This structure describes the most essential parameters needed
 * to describe 802.11ax HE capabilities for a STA.
 *
 * @has_he: true iff HE data is valid.
 * @he_cap_elem: Fixed portion of the HE capabilities element.
 * @he_mcs_nss_supp: The supported NSS/MCS combinations.
 * @ppe_thres: Holds the PPE Thresholds data (a buffer of
 *        %IEEE80211_HE_PPE_THRES_MAX_LEN bytes).
 */
struct ieee80211_sta_he_cap {
        bool has_he;
        struct ieee80211_he_cap_elem he_cap_elem;
        struct ieee80211_he_mcs_nss_supp he_mcs_nss_supp;
        u8 ppe_thres[IEEE80211_HE_PPE_THRES_MAX_LEN];
};

/**
 * struct ieee80211_eht_mcs_nss_supp - EHT max supported NSS per MCS
 *
 * See P802.11be_D1.3 Table 9-401k - "Subfields of the Supported EHT-MCS
 * and NSS Set field"
 *
 * The 20 MHz-only layout (@only_20mhz) and the per-bandwidth layout (@bw)
 * overlay the same storage through an anonymous union.
 *
 * @only_20mhz: MCS/NSS support for 20 MHz-only STA.
 * @bw: MCS/NSS support for 80, 160 and 320 MHz
 * @bw._80: MCS/NSS support for BW <= 80 MHz
 * @bw._160: MCS/NSS support for BW = 160 MHz
 * @bw._320: MCS/NSS support for BW = 320 MHz
 */
struct ieee80211_eht_mcs_nss_supp {
        union {
                struct ieee80211_eht_mcs_nss_supp_20mhz_only only_20mhz;
                struct {
                        struct ieee80211_eht_mcs_nss_supp_bw _80;
                        struct ieee80211_eht_mcs_nss_supp_bw _160;
                        struct ieee80211_eht_mcs_nss_supp_bw _320;
                } __packed bw;
        } __packed;
} __packed;

/* Size, in bytes, of the eht_ppe_thres buffer in struct ieee80211_sta_eht_cap */
#define IEEE80211_EHT_PPE_THRES_MAX_LEN                32

/**
 * struct ieee80211_sta_eht_cap - STA's EHT capabilities
 *
 * This structure describes the most essential parameters needed
 * to describe 802.11be EHT capabilities for a STA.
 *
 * @has_eht: true iff EHT data is valid.
 * @eht_cap_elem: Fixed portion of the EHT capabilities element.
 * @eht_mcs_nss_supp: The supported NSS/MCS combinations.
 * @eht_ppe_thres: Holds the PPE Thresholds data (a buffer of
 *        %IEEE80211_EHT_PPE_THRES_MAX_LEN bytes).
 */
struct ieee80211_sta_eht_cap {
        bool has_eht;
        struct ieee80211_eht_cap_elem_fixed eht_cap_elem;
        struct ieee80211_eht_mcs_nss_supp eht_mcs_nss_supp;
        u8 eht_ppe_thres[IEEE80211_EHT_PPE_THRES_MAX_LEN];
};

/* sparse defines __CHECKER__; see Documentation/dev-tools/sparse.rst */
#ifdef __CHECKER__
/*
 * This is used to mark the sband->iftype_data pointer which is supposed
 * to be an array with special access semantics (per iftype), but a lot
 * of code got it wrong in the past, so with this marking sparse will be
 * noisy when the pointer is used directly.
 *
 * When not building under sparse, __iftd expands to nothing.
 */
# define __iftd                __attribute__((noderef, address_space(__iftype_data)))
#else
# define __iftd
#endif /* __CHECKER__ */

/**
 * struct ieee80211_sband_iftype_data - sband data per interface type
 *
 * This structure encapsulates sband data that is relevant for the
 * interface types defined in @types_mask.  Each type in the
 * @types_mask must be unique across all instances of iftype_data.
 *
 * @types_mask: interface types mask (one bit per interface type)
 * @he_cap: holds the HE capabilities
 * @he_6ghz_capa: HE 6 GHz capabilities, must be filled in for a
 *        6 GHz band channel (0 may be a valid value).
 * @eht_cap: holds the EHT capabilities
 * @vendor_elems: vendor element(s) to advertise
 * @vendor_elems.data: vendor element(s) data
 * @vendor_elems.len: vendor element(s) length
 */
struct ieee80211_sband_iftype_data {
        u16 types_mask;
        struct ieee80211_sta_he_cap he_cap;
        struct ieee80211_he_6ghz_capa he_6ghz_capa;
        struct ieee80211_sta_eht_cap eht_cap;
        struct {
                const u8 *data;
                unsigned int len;
        } vendor_elems;
};

/**
 * enum ieee80211_edmg_bw_config - allowed channel bandwidth configurations
 *
 * Each enumerator's numeric value equals the number in its name (4..15).
 *
 * @IEEE80211_EDMG_BW_CONFIG_4: 2.16GHz
 * @IEEE80211_EDMG_BW_CONFIG_5: 2.16GHz and 4.32GHz
 * @IEEE80211_EDMG_BW_CONFIG_6: 2.16GHz, 4.32GHz and 6.48GHz
 * @IEEE80211_EDMG_BW_CONFIG_7: 2.16GHz, 4.32GHz, 6.48GHz and 8.64GHz
 * @IEEE80211_EDMG_BW_CONFIG_8: 2.16GHz and 2.16GHz + 2.16GHz
 * @IEEE80211_EDMG_BW_CONFIG_9: 2.16GHz, 4.32GHz and 2.16GHz + 2.16GHz
 * @IEEE80211_EDMG_BW_CONFIG_10: 2.16GHz, 4.32GHz, 6.48GHz and
 *        2.16GHz + 2.16GHz
 * @IEEE80211_EDMG_BW_CONFIG_11: 2.16GHz, 4.32GHz, 6.48GHz, 8.64GHz and
 *        2.16GHz + 2.16GHz
 * @IEEE80211_EDMG_BW_CONFIG_12: 2.16GHz, 2.16GHz + 2.16GHz and
 *        4.32GHz + 4.32GHz
 * @IEEE80211_EDMG_BW_CONFIG_13: 2.16GHz, 4.32GHz, 2.16GHz + 2.16GHz and
 *        4.32GHz + 4.32GHz
 * @IEEE80211_EDMG_BW_CONFIG_14: 2.16GHz, 4.32GHz, 6.48GHz, 2.16GHz + 2.16GHz
 *        and 4.32GHz + 4.32GHz
 * @IEEE80211_EDMG_BW_CONFIG_15: 2.16GHz, 4.32GHz, 6.48GHz, 8.64GHz,
 *        2.16GHz + 2.16GHz and 4.32GHz + 4.32GHz
 */
enum ieee80211_edmg_bw_config {
        IEEE80211_EDMG_BW_CONFIG_4        = 4,
        IEEE80211_EDMG_BW_CONFIG_5        = 5,
        IEEE80211_EDMG_BW_CONFIG_6        = 6,
        IEEE80211_EDMG_BW_CONFIG_7        = 7,
        IEEE80211_EDMG_BW_CONFIG_8        = 8,
        IEEE80211_EDMG_BW_CONFIG_9        = 9,
        IEEE80211_EDMG_BW_CONFIG_10        = 10,
        IEEE80211_EDMG_BW_CONFIG_11        = 11,
        IEEE80211_EDMG_BW_CONFIG_12        = 12,
        IEEE80211_EDMG_BW_CONFIG_13        = 13,
        IEEE80211_EDMG_BW_CONFIG_14        = 14,
        IEEE80211_EDMG_BW_CONFIG_15        = 15,
};

/**
 * struct ieee80211_edmg - EDMG configuration
 *
 * This structure describes the most essential parameters needed
 * to describe an 802.11ay EDMG configuration.
 *
 * @channels: bitmap that indicates the 2.16 GHz channel(s)
 *        that are allowed to be used for transmissions.
 *        Bit 0 indicates channel 1, bit 1 indicates channel 2, etc.
 *        Set to 0 to indicate that EDMG is not supported.
 * @bw_config: Channel BW Configuration subfield, which encodes
 *        the allowed channel bandwidth configurations
 */
struct ieee80211_edmg {
        u8 channels;
        enum ieee80211_edmg_bw_config bw_config;
};

/**
 * struct ieee80211_sta_s1g_cap - STA's S1G capabilities
 *
 * This structure describes the most essential parameters needed
 * to describe 802.11ah S1G capabilities for a STA.
 *
 * @s1g: is STA an S1G STA
 * @cap: S1G capabilities information (10 bytes, built from the
 *        S1G_CAPAB_ definitions)
 * @nss_mcs: Supported NSS MCS set (5 bytes)
 */
struct ieee80211_sta_s1g_cap {
        bool s1g;
        u8 cap[10]; /* use S1G_CAPAB_ */
        u8 nss_mcs[5];
};

/**
 * struct ieee80211_supported_band - frequency band definition
 *
 * This structure describes a frequency band a wiphy
 * is able to operate in.
 *
 * @channels: Array of channels the hardware can operate with
 *        in this band.
 * @band: the band this structure represents
 * @n_channels: Number of channels in @channels
 * @bitrates: Array of bitrates the hardware can operate with
 *        in this band. Must be sorted to give a valid "supported
 *        rates" IE, i.e. CCK rates first, then OFDM.
 * @n_bitrates: Number of bitrates in @bitrates
 * @ht_cap: HT capabilities in this band
 * @vht_cap: VHT capabilities in this band
 * @s1g_cap: S1G capabilities in this band (S1G band only, of course)
 * @edmg_cap: EDMG capabilities in this band
 * @n_iftype_data: number of iftype data entries
 * @iftype_data: interface type data entries.  Note that the bits in
 *        @types_mask inside this structure cannot overlap (i.e. only
 *        one occurrence of each type is allowed across all instances of
 *        iftype_data).
 */
struct ieee80211_supported_band {
        struct ieee80211_channel *channels;
        struct ieee80211_rate *bitrates;
        enum nl80211_band band;
        int n_channels;
        int n_bitrates;
        struct ieee80211_sta_ht_cap ht_cap;
        struct ieee80211_sta_vht_cap vht_cap;
        struct ieee80211_sta_s1g_cap s1g_cap;
        struct ieee80211_edmg edmg_cap;
        u16 n_iftype_data;
        const struct ieee80211_sband_iftype_data __iftd *iftype_data;
};

/**
 * _ieee80211_set_sband_iftype_data - set sband iftype data array
 * @sband: the sband to initialize
 * @iftd: the iftype data array pointer
 * @n_iftd: the length of the iftype data array
 *
 * Store the iftype data array and its length in @sband. Use this where
 * the length cannot be derived from the ARRAY_SIZE() of the argument;
 * wherever it can, prefer ieee80211_set_sband_iftype_data().
 */
static inline void
_ieee80211_set_sband_iftype_data(struct ieee80211_supported_band *sband,
                                 const struct ieee80211_sband_iftype_data *iftd,
                                 u16 n_iftd)
{
        /* the plain pointer argument has to be given the __iftd marking */
        const struct ieee80211_sband_iftype_data __iftd *marked =
                (const void __iftd __force *)iftd;

        sband->iftype_data = marked;
        sband->n_iftype_data = n_iftd;
}

/**
 * ieee80211_set_sband_iftype_data - set sband iftype data array
 * @sband: the sband to initialize
 * @iftd: the iftype data array; this must be an actual array (not a
 *        pointer), because its length is taken with ARRAY_SIZE()
 */
#define ieee80211_set_sband_iftype_data(sband, iftd)        \
        _ieee80211_set_sband_iftype_data(sband, iftd, ARRAY_SIZE(iftd))

/**
 * for_each_sband_iftype_data - iterate sband iftype data entries
 * @sband: the sband whose iftype_data array to iterate
 * @i: iterator counter; runs from 0 up to (sband)->n_iftype_data
 * @iftd: iftype data pointer to set; it receives the address of entry @i
 *        with the __iftd marking cast away
 *
 * Note that @iftd is reassigned on every increment of @i, so once the loop
 * has run to completion it holds the address one past the last entry.
 */
#define for_each_sband_iftype_data(sband, i, iftd)                                \
        for (i = 0, iftd = (const void __force *)&(sband)->iftype_data[i];        \
             i < (sband)->n_iftype_data;                                        \
             i++, iftd = (const void __force *)&(sband)->iftype_data[i])

/**
 * ieee80211_get_sband_iftype_data - return sband data for a given iftype
 * @sband: the sband to search for the STA on
 * @iftype: enum nl80211_iftype
 *
 * Walks the iftype data entries of @sband and picks the first one whose
 * types_mask has the bit for @iftype set.
 *
 * Return: pointer to struct ieee80211_sband_iftype_data, or NULL if none found
 */
static inline const struct ieee80211_sband_iftype_data *
ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband,
                                u8 iftype)
{
        const struct ieee80211_sband_iftype_data *entry;
        int idx;

        /*
         * NOTE(review): ">=" also rejects the value NL80211_IFTYPE_MAX
         * itself - confirm that this is intended.
         */
        if (WARN_ON(iftype >= NL80211_IFTYPE_MAX))
                return NULL;

        /* AP_VLAN is looked up through the entry covering AP */
        if (iftype == NL80211_IFTYPE_AP_VLAN)
                iftype = NL80211_IFTYPE_AP;

        for_each_sband_iftype_data(sband, idx, entry) {
                if (!(entry->types_mask & BIT(iftype)))
                        continue;

                return entry;
        }

        return NULL;
}

/**
 * ieee80211_get_he_iftype_cap - return HE capabilities for an sband's iftype
 * @sband: the sband to search for the iftype on
 * @iftype: enum nl80211_iftype
 *
 * Return: pointer to the struct ieee80211_sta_he_cap, or NULL if there is no
 * iftype data for @iftype or its HE data is not marked valid
 */
static inline const struct ieee80211_sta_he_cap *
ieee80211_get_he_iftype_cap(const struct ieee80211_supported_band *sband,
                            u8 iftype)
{
        const struct ieee80211_sband_iftype_data *iftd;

        iftd = ieee80211_get_sband_iftype_data(sband, iftype);
        if (!iftd || !iftd->he_cap.has_he)
                return NULL;

        return &iftd->he_cap;
}

/**
 * ieee80211_get_he_6ghz_capa - return HE 6 GHz capabilities
 * @sband: the sband to search for the STA on
 * @iftype: the iftype to search for
 *
 * Return: the 6GHz capabilities, or 0 (after a warning) when there is no
 * iftype data for @iftype or its HE data is not marked valid
 */
static inline __le16
ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband,
                           enum nl80211_iftype iftype)
{
        const struct ieee80211_sband_iftype_data *iftd;

        iftd = ieee80211_get_sband_iftype_data(sband, iftype);
        if (WARN_ON(!(iftd && iftd->he_cap.has_he)))
                return 0;

        return iftd->he_6ghz_capa.capa;
}

/**
 * ieee80211_get_eht_iftype_cap - return EHT capabilities for an sband's iftype
 * @sband: the sband to search for the iftype on
 * @iftype: enum nl80211_iftype
 *
 * Return: pointer to the struct ieee80211_sta_eht_cap, or NULL if there is no
 * iftype data for @iftype or its EHT data is not marked valid
 */
static inline const struct ieee80211_sta_eht_cap *
ieee80211_get_eht_iftype_cap(const struct ieee80211_supported_band *sband,
                             enum nl80211_iftype iftype)
{
        const struct ieee80211_sband_iftype_data *iftd;

        iftd = ieee80211_get_sband_iftype_data(sband, iftype);
        if (!iftd || !iftd->eht_cap.has_eht)
                return NULL;

        return &iftd->eht_cap;
}

/**
 * wiphy_read_of_freq_limits - read frequency limits from device tree
 *
 * @wiphy: the wireless device to get extra limits for
 *
 * Some devices may have extra limitations specified in DT. This may be useful
 * for chipsets that normally support more bands but are limited due to board
 * design (e.g. by antennas or external power amplifier).
 *
 * This function reads info from DT and uses it to *modify* channels (disable
 * unavailable ones). It's usually a *bad* idea to use it in drivers with
 * shared channel data as DT limitations are device specific. You should make
 * sure to call it only if channels in wiphy are copied and can be modified
 * without affecting other devices.
 *
 * As this function accesses the device node, it has to be called after
 * set_wiphy_dev. It also modifies channels, so they have to be set first.
 * If using this helper, call it before wiphy_register().
 */
#ifdef CONFIG_OF
void wiphy_read_of_freq_limits(struct wiphy *wiphy);
#else /* CONFIG_OF */
static inline void wiphy_read_of_freq_limits(struct wiphy *wiphy)
{
        /* built without CONFIG_OF: this stub does nothing */
}
#endif /* !CONFIG_OF */


/*
 * Wireless hardware/device configuration structures and methods
 */

/**
 * DOC: Actions and configuration
 *
 * Each wireless device and each virtual interface offer a set of configuration
 * operations and other actions that are invoked by userspace. Each of these
 * actions is described in the operations structure, and the parameters these
 * operations use are described separately.
 *
 * Additionally, some operations are asynchronous and expect to get status
 * information via some functions that drivers need to call.
 *
 * Scanning and BSS list handling with its associated functionality is described
 * in a separate chapter.
 */

/* Sum of the membership length and the user position length */
#define VHT_MUMIMO_GROUPS_DATA_LEN \
        (WLAN_MEMBERSHIP_LEN + WLAN_USER_POSITION_LEN)

/**
 * struct vif_params - describes virtual interface parameters
 * @flags: monitor interface flags, unchanged if 0, otherwise
 *        %MONITOR_FLAG_CHANGED will be set
 * @use_4addr: use 4-address frames
 * @macaddr: address to use for this virtual interface.
 *        If this parameter is set to the zero address, the driver may
 *        determine the address as needed.
 *        This feature is only fully supported by drivers that enable the
 *        %NL80211_FEATURE_MAC_ON_CREATE flag.  Others may support creating
 *        only p2p devices with specified MAC.
 * @vht_mumimo_groups: MU-MIMO groupID, used for monitoring MU-MIMO packets
 *        belonging to that MU-MIMO groupID; %NULL if not changed
 * @vht_mumimo_follow_addr: MU-MIMO follow address, used for monitoring
 *        MU-MIMO packets going to the specified station; %NULL if not changed
 */
struct vif_params {
        u32 flags;
        int use_4addr;
        u8 macaddr[ETH_ALEN];
        const u8 *vht_mumimo_groups;
        const u8 *vht_mumimo_follow_addr;
};

/**
 * struct key_params - key information
 *
 * Information about a key
 *
 * @key: key material
 * @key_len: length of key material
 * @cipher: cipher suite selector
 * @seq: sequence counter (IV/PN) for TKIP and CCMP keys, only used
 *        with the get_key() callback, must be in little endian,
 *        length given by @seq_len.
 * @seq_len: length of @seq.
 * @vlan_id: VLAN ID for VLAN group key (if nonzero)
 * @mode: key install mode (RX_TX, NO_TX or SET_TX)
 */
struct key_params {
        const u8 *key;
        const u8 *seq;
        int key_len;
        int seq_len;
        u16 vlan_id;
        u32 cipher;
        enum nl80211_key_mode mode;
};

/**
 * struct cfg80211_chan_def - channel definition
 * @chan: the (control) channel
 * @width: channel width
 * @center_freq1: center frequency of first segment
 * @center_freq2: center frequency of second segment
 *        (only with 80+80 MHz)
 * @edmg: define the EDMG channels configuration.
 *        If edmg is requested (i.e. the .channels member is non-zero),
 *        chan will define the primary channel and all other
 *        parameters are ignored.
 * @freq1_offset: offset from @center_freq1, in kHz
 * @punctured: mask of the punctured 20 MHz subchannels, with
 *        bits turned on being disabled (punctured); numbered
 *        from lower to higher frequency (like in the spec)
 */
struct cfg80211_chan_def {
        struct ieee80211_channel *chan;
        enum nl80211_chan_width width;
        u32 center_freq1;
        u32 center_freq2;
        struct ieee80211_edmg edmg;
        u16 freq1_offset;
        u16 punctured;
};

/*
 * cfg80211_bitrate_mask - masks for bitrate control
 *
 * The control array holds NUM_NL80211_BANDS entries (presumably indexed by
 * enum nl80211_band - confirm). Each entry carries a legacy rate mask, the
 * HT, VHT and HE MCS masks, and the GI / HE GI / HE LTF selections.
 */
struct cfg80211_bitrate_mask {
        struct {
                u32 legacy;
                u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN];
                u16 vht_mcs[NL80211_VHT_NSS_MAX];
                u16 he_mcs[NL80211_HE_NSS_MAX];
                enum nl80211_txrate_gi gi;
                enum nl80211_he_gi he_gi;
                enum nl80211_he_ltf he_ltf;
        } control[NUM_NL80211_BANDS];
};


/**
 * struct cfg80211_tid_cfg - TID specific configuration
 * @config_override: Flag to notify driver to reset TID configuration
 *        of the peer.
 * @tids: bitmap of TIDs to modify
 * @mask: bitmap of attributes indicating which parameter changed,
 *        similar to &nl80211_tid_config_supp.
 * @noack: noack configuration value for the TID
 * @retry_long: retry count value
 * @retry_short: retry count value
 * @ampdu: Enable/Disable MPDU aggregation
 * @rtscts: Enable/Disable RTS/CTS
 * @amsdu: Enable/Disable MSDU aggregation
 * @txrate_type: Tx bitrate mask type
 * @txrate_mask: Tx bitrate mask to be applied for the TID
 */
struct cfg80211_tid_cfg {
        bool config_override;
        u8 tids;
        u64 mask;
        enum nl80211_tid_config noack;
        u8 retry_long, retry_short;
        enum nl80211_tid_config ampdu;
        enum nl80211_tid_config rtscts;
        enum nl80211_tid_config amsdu;
        enum nl80211_tx_rate_setting txrate_type;
        struct cfg80211_bitrate_mask txrate_mask;
};

/**
 * struct cfg80211_tid_config - TID configuration
 * @peer: Station's MAC address
 * @n_tid_conf: Number of TID specific configurations to be applied
 * @tid_conf: Configuration change info; flexible array holding
 *        @n_tid_conf entries
 */
struct cfg80211_tid_config {
        const u8 *peer;
        u32 n_tid_conf;
        struct cfg80211_tid_cfg tid_conf[] __counted_by(n_tid_conf);
};

/**
 * struct cfg80211_fils_aad - FILS AAD data
 * @macaddr: STA MAC address
 * @kek: FILS KEK
 * @kek_len: length of @kek
 * @snonce: STA Nonce
 * @anonce: AP Nonce
 */
struct cfg80211_fils_aad {
        const u8 *macaddr;
        const u8 *kek;
        u8 kek_len;
        const u8 *snonce;
        const u8 *anonce;
};

/**
 * struct cfg80211_set_hw_timestamp - enable/disable HW timestamping
 * @macaddr: peer MAC address. %NULL to enable/disable HW timestamping for
 *        all addresses.
 * @enable: if set, enable HW timestamping for the specified MAC address.
 *        Otherwise, disable HW timestamping for the specified MAC address.
 */
struct cfg80211_set_hw_timestamp {
        const u8 *macaddr;
        bool enable;
};

/**
 * cfg80211_get_chandef_type - return old channel type from chandef
 * @chandef: the channel definition
 *
 * Return: The old channel type (NOHT, HT20, HT40+/-) from a given
 * chandef, which must have a bandwidth allowing this conversion.
 */
static inline enum nl80211_channel_type
cfg80211_get_chandef_type(const struct cfg80211_chan_def *chandef)
{
        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_20_NOHT:
                return NL80211_CHAN_NO_HT;
        case NL80211_CHAN_WIDTH_20:
                return NL80211_CHAN_HT20;
        case NL80211_CHAN_WIDTH_40:
                if (chandef->center_freq1 > chandef->chan->center_freq)
                        return NL80211_CHAN_HT40PLUS;
                return NL80211_CHAN_HT40MINUS;
        default:
                WARN_ON(1);
                return NL80211_CHAN_NO_HT;
        }
}

/**
 * cfg80211_chandef_create - create channel definition using channel type
 * @chandef: the channel definition struct to fill
 * @channel: the control channel
 * @chantype: the channel type (from &enum nl80211_channel_type)
 *
 * Given a channel type, create a channel definition.
 */
void cfg80211_chandef_create(struct cfg80211_chan_def *chandef,
                             struct ieee80211_channel *channel,
                             enum nl80211_channel_type chantype);

/**
 * cfg80211_chandef_identical - check if two channel definitions are identical
 * @chandef1: first channel definition
 * @chandef2: second channel definition
 *
 * Return: %true if the channels defined by the channel definitions are
 * identical, %false otherwise.
 */
static inline bool
cfg80211_chandef_identical(const struct cfg80211_chan_def *chandef1,
                           const struct cfg80211_chan_def *chandef2)
{
        return (chandef1->chan == chandef2->chan &&
                chandef1->width == chandef2->width &&
                chandef1->center_freq1 == chandef2->center_freq1 &&
                chandef1->freq1_offset == chandef2->freq1_offset &&
                chandef1->center_freq2 == chandef2->center_freq2 &&
                chandef1->punctured == chandef2->punctured);
}

/**
 * cfg80211_chandef_is_edmg - check if chandef represents an EDMG channel
 *
 * @chandef: the channel definition
 *
 * Return: %true if EDMG defined, %false otherwise.
 */
static inline bool
cfg80211_chandef_is_edmg(const struct cfg80211_chan_def *chandef)
{
        return chandef->edmg.channels || chandef->edmg.bw_config;
}

/**
 * cfg80211_chandef_compatible - check if two channel definitions are compatible
 * @chandef1: first channel definition
 * @chandef2: second channel definition
 *
 * Return: %NULL if the given channel definitions are incompatible,
 * @chandef1 or @chandef2 otherwise.
 */
const struct cfg80211_chan_def *
cfg80211_chandef_compatible(const struct cfg80211_chan_def *chandef1,
                            const struct cfg80211_chan_def *chandef2);

/**
 * nl80211_chan_width_to_mhz - get the channel width in MHz
 * @chan_width: the channel width from &enum nl80211_chan_width
 *
 * Return: channel width in MHz if the chan_width from &enum nl80211_chan_width
 * is valid, -1 otherwise.
 */
int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width);

/**
 * cfg80211_chandef_valid - check if a channel definition is valid
 * @chandef: the channel definition to check
 *
 * Return: %true if the channel definition is valid, %false otherwise.
 */
bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef);

/**
 * cfg80211_chandef_usable - check if secondary channels can be used
 * @wiphy: the wiphy to validate against
 * @chandef: the channel definition to check
 * @prohibited_flags: the regulatory channel flags that must not be set
 *
 * Return: %true if secondary channels are usable, %false otherwise.
 */
bool cfg80211_chandef_usable(struct wiphy *wiphy,
                             const struct cfg80211_chan_def *chandef,
                             u32 prohibited_flags);

/**
 * cfg80211_chandef_dfs_required - checks if radar detection is required
 * @wiphy: the wiphy to validate against
 * @chandef: the channel definition to check
 * @iftype: the interface type as specified in &enum nl80211_iftype
 *
 * Return: 1 if radar detection is required, 0 if it is not, < 0 on error
 */
int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
                                  const struct cfg80211_chan_def *chandef,
                                  enum nl80211_iftype iftype);

/**
 * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable and we
 *                                 can/need start CAC on such channel
 * @wiphy: the wiphy to validate against
 * @chandef: the channel definition to check
 *
 * Return: %true if all channels are available and at least one channel
 * requires CAC (NL80211_DFS_USABLE)
 */
bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy,
                                 const struct cfg80211_chan_def *chandef);

/**
 * cfg80211_chandef_dfs_cac_time - get the DFS CAC time (in ms) for given
 *                                   channel definition
 * @wiphy: the wiphy to validate against
 * @chandef: the channel definition to check
 *
 * Return: DFS CAC time (in ms) which applies for this channel definition
 */
unsigned int
cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy,
                              const struct cfg80211_chan_def *chandef);

/**
 * cfg80211_chandef_primary - calculate primary 40/80/160 MHz freq
 * @chandef: chandef to calculate for
 * @primary_chan_width: primary channel width to calculate center for
 * @punctured: punctured sub-channel bitmap, will be recalculated
 *        according to the new bandwidth, can be %NULL
 *
 * Return: the primary 40/80/160 MHz channel center frequency, or -1
 *        for errors, updating the punctured bitmap
 */
int cfg80211_chandef_primary(const struct cfg80211_chan_def *chandef,
                             enum nl80211_chan_width primary_chan_width,
                             u16 *punctured);

/**
 * nl80211_send_chandef - sends the channel definition.
 * @msg: the msg to send channel definition
 * @chandef: the channel definition to send
 *
 * Return: 0 if sent the channel definition to msg, < 0 on error
 */
int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef);

/**
 * ieee80211_chanwidth_rate_flags - return rate flags for channel width
 * @width: the channel width of the channel
 *
 * In some channel types, not all rates may be used - for example CCK
 * rates may not be used in 5/10 MHz channels. Only the 5 MHz and 10 MHz
 * widths map to a flag; every other width yields 0.
 *
 * Return: rate flags which apply for this channel width
 */
static inline enum ieee80211_rate_flags
ieee80211_chanwidth_rate_flags(enum nl80211_chan_width width)
{
        if (width == NL80211_CHAN_WIDTH_5)
                return IEEE80211_RATE_SUPPORTS_5MHZ;

        if (width == NL80211_CHAN_WIDTH_10)
                return IEEE80211_RATE_SUPPORTS_10MHZ;

        return 0;
}

/**
 * ieee80211_chandef_rate_flags - returns rate flags for a channel
 * @chandef: channel definition for the channel
 *
 * See ieee80211_chanwidth_rate_flags().
 *
 * Returns: rate flags which apply for this channel
 */
static inline enum ieee80211_rate_flags
ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef)
{
        return ieee80211_chanwidth_rate_flags(chandef->width);
}

/**
 * ieee80211_chandef_max_power - maximum transmission power for the chandef
 *
 * In some regulations, the transmit power may depend on the configured channel
 * bandwidth which may be defined as dBm/MHz. This function returns the actual
 * max_power for non-standard (20 MHz) channels.
 *
 * @chandef: channel definition for the channel
 *
 * Returns: maximum allowed transmission power in dBm for the chandef
 */
static inline int
ieee80211_chandef_max_power(struct cfg80211_chan_def *chandef)
{
        switch (chandef->width) {
        case NL80211_CHAN_WIDTH_5:
                return min(chandef->chan->max_reg_power - 6,
                           chandef->chan->max_power);
        case NL80211_CHAN_WIDTH_10:
                return min(chandef->chan->max_reg_power - 3,
                           chandef->chan->max_power);
        default:
                break;
        }
        return chandef->chan->max_power;
}

/**
 * cfg80211_any_usable_channels - check for usable channels
 * @wiphy: the wiphy to check for
 * @band_mask: which bands to check on
 * @prohibited_flags: which channels to not consider usable,
 *        %IEEE80211_CHAN_DISABLED is always taken into account
 *
 * Return: %true if usable channels were found, %false otherwise
 */
bool cfg80211_any_usable_channels(struct wiphy *wiphy,
                                  unsigned long band_mask,
                                  u32 prohibited_flags);

/**
 * enum survey_info_flags - survey information flags
 *
 * @SURVEY_INFO_NOISE_DBM: noise (in dBm) was filled in
 * @SURVEY_INFO_IN_USE: channel is currently being used
 * @SURVEY_INFO_TIME: active time (in ms) was filled in
 * @SURVEY_INFO_TIME_BUSY: busy time was filled in
 * @SURVEY_INFO_TIME_EXT_BUSY: extension channel busy time was filled in
 * @SURVEY_INFO_TIME_RX: receive time was filled in
 * @SURVEY_INFO_TIME_TX: transmit time was filled in
 * @SURVEY_INFO_TIME_SCAN: scan time was filled in
 * @SURVEY_INFO_TIME_BSS_RX: local BSS receive time was filled in
 *
 * Used by the driver to indicate which info in &struct survey_info
 * it has filled in during get_survey(). Each flag is a single bit, so
 * the flags can be combined in the @filled member of that structure.
 */
enum survey_info_flags {
        SURVEY_INFO_NOISE_DBM                = BIT(0),
        SURVEY_INFO_IN_USE                = BIT(1),
        SURVEY_INFO_TIME                = BIT(2),
        SURVEY_INFO_TIME_BUSY                = BIT(3),
        SURVEY_INFO_TIME_EXT_BUSY        = BIT(4),
        SURVEY_INFO_TIME_RX                = BIT(5),
        SURVEY_INFO_TIME_TX                = BIT(6),
        SURVEY_INFO_TIME_SCAN                = BIT(7),
        SURVEY_INFO_TIME_BSS_RX                = BIT(8),
};

/**
 * struct survey_info - channel survey response
 *
 * @channel: the channel this survey record reports, may be %NULL for a single
 *        record to report global statistics
 * @filled: bitflag of flags from &enum survey_info_flags
 * @noise: channel noise in dBm. This and all following fields are
 *        optional
 * @time: amount of time in ms the radio was turned on (on the channel)
 * @time_busy: amount of time the primary channel was sensed busy
 * @time_ext_busy: amount of time the extension channel was sensed busy
 * @time_rx: amount of time the radio spent receiving data
 * @time_tx: amount of time the radio spent transmitting data
 * @time_scan: amount of time the radio spent for scanning
 * @time_bss_rx: amount of time the radio spent receiving data on a local BSS
 *
 * Used by dump_survey() to report back per-channel survey information.
 *
 * This structure can later be expanded with things like
 * channel duty cycle etc.
 */
struct survey_info {
        struct ieee80211_channel *channel;
        u64 time;
        u64 time_busy;
        u64 time_ext_busy;
        u64 time_rx;
        u64 time_tx;
        u64 time_scan;
        u64 time_bss_rx;
        u32 filled;
        s8 noise;
};

/* Number of entries in the akm_suites[] array of struct cfg80211_crypto_settings. */
#define CFG80211_MAX_NUM_AKM_SUITES        10

/**
 * struct cfg80211_crypto_settings - Crypto settings
 * @wpa_versions: indicates which, if any, WPA versions are enabled
 *        (from enum nl80211_wpa_versions)
 * @cipher_group: group key cipher suite (or 0 if unset)
 * @n_ciphers_pairwise: number of AP supported unicast ciphers
 * @ciphers_pairwise: unicast key cipher suites (the array has room for
 *        %NL80211_MAX_NR_CIPHER_SUITES entries)
 * @n_akm_suites: number of AKM suites
 * @akm_suites: AKM suites (the array has room for
 *        %CFG80211_MAX_NUM_AKM_SUITES entries)
 * @control_port: Whether user space controls IEEE 802.1X port, i.e.,
 *        sets/clears %NL80211_STA_FLAG_AUTHORIZED. If true, the driver is
 *        required to assume that the port is unauthorized until authorized by
 *        user space. Otherwise, port is marked authorized by default.
 * @control_port_ethertype: the control port protocol that should be
 *        allowed through even on unauthorized ports
 * @control_port_no_encrypt: TRUE to prevent encryption of control port
 *        protocol frames.
 * @control_port_over_nl80211: TRUE if userspace expects to exchange control
 *        port frames over NL80211 instead of the network interface.
 * @control_port_no_preauth: disables pre-auth rx over the nl80211 control
 *        port for mac80211
 * @psk: PSK (for devices supporting 4-way-handshake offload)
 * @sae_pwd: password for SAE authentication (for devices supporting SAE
 *        offload)
 * @sae_pwd_len: length of SAE password (for devices supporting SAE offload)
 * @sae_pwe: The mechanisms allowed for SAE PWE derivation:
 *
 *        NL80211_SAE_PWE_UNSPECIFIED
 *          Not-specified, used to indicate userspace did not specify any
 *          preference. The driver should follow its internal policy in
 *          such a scenario.
 *
 *        NL80211_SAE_PWE_HUNT_AND_PECK
 *          Allow hunting-and-pecking loop only
 *
 *        NL80211_SAE_PWE_HASH_TO_ELEMENT
 *          Allow hash-to-element only
 *
 *        NL80211_SAE_PWE_BOTH
 *          Allow either hunting-and-pecking loop or hash-to-element
 */
struct cfg80211_crypto_settings {
        u32 wpa_versions;
        u32 cipher_group;
        int n_ciphers_pairwise;
        u32 ciphers_pairwise[NL80211_MAX_NR_CIPHER_SUITES];
        int n_akm_suites;
        u32 akm_suites[CFG80211_MAX_NUM_AKM_SUITES];
        bool control_port;
        __be16 control_port_ethertype;
        bool control_port_no_encrypt;
        bool control_port_over_nl80211;
        bool control_port_no_preauth;
        const u8 *psk;
        const u8 *sae_pwd;
        u8 sae_pwd_len;
        enum nl80211_sae_pwe_mechanism sae_pwe;
};

/**
 * struct cfg80211_mbssid_config - AP settings for multiple BSSID (MBSSID)
 *
 * @tx_wdev: pointer to the transmitted interface in the MBSSID set
 * @index: index of this AP in the multiple BSSID group.
 * @ema: set to true if the beacons should be sent out in EMA mode.
 */
struct cfg80211_mbssid_config {
        struct wireless_dev *tx_wdev;
        u8 index;
        bool ema;
};

/**
 * struct cfg80211_mbssid_elems - Multiple BSSID elements
 *
 * @cnt: Number of elements in array @elem.
 *
 * @elem: Array of multiple BSSID element(s) to be added into Beacon frames.
 *        Flexible array whose length is given by @cnt.
 * @elem.data: Data for multiple BSSID elements.
 * @elem.len: Length of data.
 */
struct cfg80211_mbssid_elems {
        u8 cnt;
        struct {
                const u8 *data;
                size_t len;
        } elem[] __counted_by(cnt);
};

/**
 * struct cfg80211_rnr_elems - Reduced neighbor report (RNR) elements
 *
 * @cnt: Number of elements in array @elem.
 *
 * @elem: Array of RNR element(s) to be added into Beacon frames.
 *        Flexible array whose length is given by @cnt.
 * @elem.data: Data for RNR elements.
 * @elem.len: Length of data.
 */
struct cfg80211_rnr_elems {
        u8 cnt;
        struct {
                const u8 *data;
                size_t len;
        } elem[] __counted_by(cnt);
};

/**
 * struct cfg80211_beacon_data - beacon data
 * @link_id: the link ID for the AP MLD link sending this beacon
 * @head: head portion of beacon (before TIM IE)
 *        or %NULL if not changed
 * @tail: tail portion of beacon (after TIM IE)
 *        or %NULL if not changed
 * @head_len: length of @head
 * @tail_len: length of @tail
 * @beacon_ies: extra information element(s) to add into Beacon frames or %NULL
 * @beacon_ies_len: length of @beacon_ies in octets
 * @proberesp_ies: extra information element(s) to add into Probe Response
 *        frames or %NULL
 * @proberesp_ies_len: length of @proberesp_ies in octets
 * @assocresp_ies: extra information element(s) to add into (Re)Association
 *        Response frames or %NULL
 * @assocresp_ies_len: length of @assocresp_ies in octets
 * @probe_resp_len: length of probe response template (@probe_resp)
 * @probe_resp: probe response template (AP mode only)
 * @mbssid_ies: multiple BSSID elements
 * @rnr_ies: reduced neighbor report elements
 * @ftm_responder: enable FTM responder functionality; -1 for no change
 *        (which also implies no change in LCI/civic location data)
 * @lci: Measurement Report element content, starting with Measurement Token
 *        (measurement type 8)
 * @civicloc: Measurement Report element content, starting with Measurement
 *        Token (measurement type 11)
 * @lci_len: LCI data length
 * @civicloc_len: Civic location data length
 * @he_bss_color: BSS Color settings
 * @he_bss_color_valid: indicates whether the BSS color
 *        attribute is present in beacon data or not.
 */
struct cfg80211_beacon_data {
        unsigned int link_id;

        const u8 *head, *tail;
        const u8 *beacon_ies;
        const u8 *proberesp_ies;
        const u8 *assocresp_ies;
        const u8 *probe_resp;
        const u8 *lci;
        const u8 *civicloc;
        struct cfg80211_mbssid_elems *mbssid_ies;
        struct cfg80211_rnr_elems *rnr_ies;
        s8 ftm_responder;

        size_t head_len, tail_len;
        size_t beacon_ies_len;
        size_t proberesp_ies_len;
        size_t assocresp_ies_len;
        size_t probe_resp_len;
        size_t lci_len;
        size_t civicloc_len;
        struct cfg80211_he_bss_color he_bss_color;
        bool he_bss_color_valid;
};

/**
 * struct mac_address - a single MAC address
 *
 * @addr: the address octets (%ETH_ALEN bytes)
 */
struct mac_address {
        u8 addr[ETH_ALEN];
};

/**
 * struct cfg80211_acl_data - Access control list data
 *
 * @acl_policy: ACL policy to be applied on the station's
 *        entry specified by mac_addr
 * @n_acl_entries: Number of MAC address entries passed
 * @mac_addrs: List of MAC addresses of stations to be used for ACL.
 *        Flexible array whose length is given by @n_acl_entries; it must
 *        remain the last member.
 */
struct cfg80211_acl_data {
        enum nl80211_acl_policy acl_policy;
        int n_acl_entries;

        /* Keep it last */
        struct mac_address mac_addrs[] __counted_by(n_acl_entries);
};

/**
 * struct cfg80211_fils_discovery - FILS discovery parameters from
 * IEEE Std 802.11ai-2016, Annex C.3 MIB detail.
 *
 * @update: Set to true if the feature configuration should be updated.
 * @min_interval: Minimum packet interval in TUs (0 - 10000)
 * @max_interval: Maximum packet interval in TUs (0 - 10000)
 * @tmpl_len: Length of the template in @tmpl
 * @tmpl: Template data for FILS discovery frame including the action
 *        frame headers.
 */
struct cfg80211_fils_discovery {
        bool update;
        u32 min_interval;
        u32 max_interval;
        size_t tmpl_len;
        const u8 *tmpl;
};

/**
 * struct cfg80211_unsol_bcast_probe_resp - Unsolicited broadcast probe
 *        response parameters in 6 GHz.
 *
 * @update: Set to true if the feature configuration should be updated.
 * @interval: Packet interval in TUs. Maximum allowed is 20 TU, as mentioned
 *        in IEEE P802.11ax/D6.0 26.17.2.3.2 - AP behavior for fast passive
 *        scanning
 * @tmpl_len: Length of the template in @tmpl
 * @tmpl: Template data for probe response
 */
struct cfg80211_unsol_bcast_probe_resp {
        bool update;
        u32 interval;
        size_t tmpl_len;
        const u8 *tmpl;
};

/**
 * struct cfg80211_ap_settings - AP configuration
 *
 * Used to configure an AP interface.
 *
 * @chandef: defines the channel to use
 * @beacon: beacon data
 * @beacon_interval: beacon interval
 * @dtim_period: DTIM period
 * @ssid: SSID to be used in the BSS (note: may be %NULL if not provided from
 *        user space)
 * @ssid_len: length of @ssid
 * @hidden_ssid: whether to hide the SSID in Beacon/Probe Response frames
 * @crypto: crypto settings
 * @privacy: the BSS uses privacy
 * @auth_type: Authentication type (algorithm)
 * @smps_mode: SMPS mode
 * @inactivity_timeout: time in seconds to determine station's inactivity.
 * @p2p_ctwindow: P2P CT Window
 * @p2p_opp_ps: P2P opportunistic PS
 * @acl: ACL configuration used by the drivers which have support for
 *        MAC address based access control
 * @pbss: If set, start as a PCP instead of AP. Relevant for DMG
 *        networks.
 * @beacon_rate: bitrate to be used for beacons
 * @ht_cap: HT capabilities (or %NULL if HT isn't enabled)
 * @vht_cap: VHT capabilities (or %NULL if VHT isn't enabled)
 * @he_cap: HE capabilities (or %NULL if HE isn't enabled)
 * @eht_cap: EHT capabilities (or %NULL if EHT isn't enabled)
 * @eht_oper: EHT operation IE (or %NULL if EHT isn't enabled)
 * @ht_required: stations must support HT
 * @vht_required: stations must support VHT
 * @twt_responder: Enable Target Wake Time (TWT) responder
 * @he_required: stations must support HE
 * @sae_h2e_required: stations must support direct H2E technique in SAE
 * @flags: flags, as defined in &enum nl80211_ap_settings_flags
 * @he_obss_pd: OBSS Packet Detection settings
 * @he_oper: HE operation IE (or %NULL if HE isn't enabled)
 * @fils_discovery: FILS discovery transmission parameters
 * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
 * @mbssid_config: AP settings for multiple bssid
 */
struct cfg80211_ap_settings {
        struct cfg80211_chan_def chandef;

        struct cfg80211_beacon_data beacon;

        int beacon_interval, dtim_period;
        const u8 *ssid;
        size_t ssid_len;
        enum nl80211_hidden_ssid hidden_ssid;
        struct cfg80211_crypto_settings crypto;
        bool privacy;
        enum nl80211_auth_type auth_type;
        enum nl80211_smps_mode smps_mode;
        int inactivity_timeout;
        u8 p2p_ctwindow;
        bool p2p_opp_ps;
        const struct cfg80211_acl_data *acl;
        bool pbss;
        struct cfg80211_bitrate_mask beacon_rate;

        const struct ieee80211_ht_cap *ht_cap;
        const struct ieee80211_vht_cap *vht_cap;
        const struct ieee80211_he_cap_elem *he_cap;
        const struct ieee80211_he_operation *he_oper;
        const struct ieee80211_eht_cap_elem *eht_cap;
        const struct ieee80211_eht_operation *eht_oper;
        bool ht_required, vht_required, he_required, sae_h2e_required;
        bool twt_responder;
        u32 flags;
        struct ieee80211_he_obss_pd he_obss_pd;
        struct cfg80211_fils_discovery fils_discovery;
        struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
        struct cfg80211_mbssid_config mbssid_config;
};


/**
 * struct cfg80211_ap_update - AP configuration update
 *
 * Subset of &struct cfg80211_ap_settings, used for updating an AP that is
 * already running.
 *
 * @beacon: beacon data
 * @fils_discovery: FILS discovery transmission parameters
 * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
 */
struct cfg80211_ap_update {
        struct cfg80211_beacon_data beacon;
        struct cfg80211_fils_discovery fils_discovery;
        struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
};

/**
 * struct cfg80211_csa_settings - channel switch settings
 *
 * Used for channel switch
 *
 * @chandef: defines the channel to use after the switch
 * @beacon_csa: beacon data while performing the switch
 * @counter_offsets_beacon: offsets of the counters within the beacon (tail)
 * @counter_offsets_presp: offsets of the counters within the probe response
 * @n_counter_offsets_beacon: number of CSA counters in the beacon (tail)
 * @n_counter_offsets_presp: number of CSA counters in the probe response
 * @beacon_after: beacon data to be used on the new channel
 * @radar_required: whether radar detection is required on the new channel
 * @block_tx: whether transmissions should be blocked while changing
 * @count: number of beacons until switch
 * @link_id: defines the link on which channel switch is expected during
 *        MLO. 0 in case of non-MLO.
 */
struct cfg80211_csa_settings {
        struct cfg80211_chan_def chandef;
        struct cfg80211_beacon_data beacon_csa;
        const u16 *counter_offsets_beacon;
        const u16 *counter_offsets_presp;
        unsigned int n_counter_offsets_beacon;
        unsigned int n_counter_offsets_presp;
        struct cfg80211_beacon_data beacon_after;
        bool radar_required;
        bool block_tx;
        u8 count;
        u8 link_id;
};

/**
 * struct cfg80211_color_change_settings - color change settings
 *
 * Used for bss color change
 *
 * @beacon_color_change: beacon data while performing the color countdown
 * @counter_offset_beacon: offset of the counter within the beacon (tail)
 * @counter_offset_presp: offset of the counter within the probe response
 * @beacon_next: beacon data to be used after the color change
 * @count: number of beacons until the color change
 * @color: the color used after the change
 * @link_id: defines the link on which color change is expected during MLO.
 *        0 in case of non-MLO.
 */
struct cfg80211_color_change_settings {
        struct cfg80211_beacon_data beacon_color_change;
        u16 counter_offset_beacon;
        u16 counter_offset_presp;
        struct cfg80211_beacon_data beacon_next;
        u8 count;
        u8 color;
        u8 link_id;
};

/**
 * struct iface_combination_params - input parameters for interface combinations
 *
 * Used to pass interface combination parameters
 *
 * @num_different_channels: the number of different channels we want
 *        to use for verification
 * @radar_detect: a bitmap where each bit corresponds to a channel
 *        width where radar detection is needed, as in the definition of
 *        &struct ieee80211_iface_combination.@radar_detect_widths
 * @iftype_num: array with the number of interfaces of each interface
 *        type.  The index is the interface type as specified in &enum
 *        nl80211_iftype; the array has %NUM_NL80211_IFTYPES entries.
 * @new_beacon_int: set this to the beacon interval of a new interface
 *        that's not operating yet, if such is to be checked as part of
 *        the verification
 */
struct iface_combination_params {
        int num_different_channels;
        u8 radar_detect;
        int iftype_num[NUM_NL80211_IFTYPES];
        u32 new_beacon_int;
};

/**
 * enum station_parameters_apply_mask - station parameter values to apply
 * @STATION_PARAM_APPLY_UAPSD: apply new uAPSD parameters (uapsd_queues, max_sp)
 * @STATION_PARAM_APPLY_CAPABILITY: apply new capability
 * @STATION_PARAM_APPLY_PLINK_STATE: apply new plink state
 *
 * Not all station parameters have in-band "no change" signalling;
 * for those that don't, these flags are used.
 */
enum station_parameters_apply_mask {
        STATION_PARAM_APPLY_UAPSD = BIT(0),
        STATION_PARAM_APPLY_CAPABILITY = BIT(1),
        STATION_PARAM_APPLY_PLINK_STATE = BIT(2),
};

/**
 * struct sta_txpwr - station txpower configuration
 *
 * Used to configure txpower for station.
 *
 * @power: tx power (in dBm) to be used for sending data traffic. If tx power
 *        is not provided, the default per-interface tx power setting is used
 *        instead. The driver should pick the lowest tx power, either the
 *        per-interface or the per-station one.
 * @type: if the TPC @type is NL80211_TX_POWER_LIMITED, the tx power will be
 *        less than or equal to the value specified from userspace, whereas
 *        if the TPC @type is NL80211_TX_POWER_AUTOMATIC, it indicates the
 *        default tx power. NL80211_TX_POWER_FIXED is not a valid
 *        configuration option for per peer TPC.
 */
struct sta_txpwr {
        s16 power;
        enum nl80211_tx_power_setting type;
};

/**
 * struct link_station_parameters - link station parameters
 *
 * Used to create a new link station or to change an existing one.
 *
 * @mld_mac: MAC address of the station
 * @link_id: the link id (-1 for non-MLD station)
 * @link_mac: MAC address of the link
 * @supported_rates: supported rates in IEEE 802.11 format
 *        (or %NULL for no change)
 * @supported_rates_len: number of supported rates
 * @ht_capa: HT capabilities of station
 * @vht_capa: VHT capabilities of station
 * @opmode_notif: operating mode field from Operating Mode Notification
 * @opmode_notif_used: information if operating mode field is used
 * @he_capa: HE capabilities of station
 * @he_capa_len: the length of the HE capabilities
 * @txpwr: transmit power for an associated station
 * @txpwr_set: txpwr field is set
 * @he_6ghz_capa: HE 6 GHz Band capabilities of station
 * @eht_capa: EHT capabilities of station
 * @eht_capa_len: the length of the EHT capabilities
 */
struct link_station_parameters {
        const u8 *mld_mac;
        int link_id;
        const u8 *link_mac;
        const u8 *supported_rates;
        u8 supported_rates_len;
        const struct ieee80211_ht_cap *ht_capa;
        const struct ieee80211_vht_cap *vht_capa;
        u8 opmode_notif;
        bool opmode_notif_used;
        const struct ieee80211_he_cap_elem *he_capa;
        u8 he_capa_len;
        struct sta_txpwr txpwr;
        bool txpwr_set;
        const struct ieee80211_he_6ghz_capa *he_6ghz_capa;
        const struct ieee80211_eht_cap_elem *eht_capa;
        u8 eht_capa_len;
};

/**
 * struct link_station_del_parameters - link station deletion parameters
 *
 * Used to delete a link station entry (or all stations).
 *
 * @mld_mac: MAC address of the station
 * @link_id: the link id (unsigned here, unlike the link_id member of
 *        &struct link_station_parameters)
 */
struct link_station_del_parameters {
        const u8 *mld_mac;
        u32 link_id;
};

/**
 * struct cfg80211_ttlm_params - TID to link mapping parameters
 *
 * Used for setting a TID to link mapping.
 *
 * @dlink: Downlink TID to link mapping, as defined in section 9.4.2.314
 *     (TID-To-Link Mapping element) in Draft P802.11be_D4.0.
 * @ulink: Uplink TID to link mapping, as defined in section 9.4.2.314
 *     (TID-To-Link Mapping element) in Draft P802.11be_D4.0.
 */
struct cfg80211_ttlm_params {
        u16 dlink[8];
        u16 ulink[8];
};

/**
 * struct station_parameters - station parameters
 *
 * Used to create a new station or to change an existing one.
 *
 * @vlan: vlan interface station should belong to
 * @sta_flags_mask: station flags that changed
 *        (bitmask of BIT(%NL80211_STA_FLAG_...))
 * @sta_flags_set: station flags values
 *        (bitmask of BIT(%NL80211_STA_FLAG_...))
 * @listen_interval: listen interval or -1 for no change
 * @aid: AID or zero for no change
 * @vlan_id: VLAN ID for station (if nonzero)
 * @peer_aid: mesh peer AID or zero for no change
 * @plink_action: plink action to take
 * @plink_state: set the peer link state for a station
 * @uapsd_queues: bitmap of queues configured for uapsd. same format
 *        as the AC bitmap in the QoS info field
 * @max_sp: max Service Period. same format as the MAX_SP in the
 *        QoS info field (but already shifted down)
 * @sta_modify_mask: bitmap indicating which parameters changed
 *        (for those that don't have a natural "no change" value),
 *        see &enum station_parameters_apply_mask
 * @local_pm: local link-specific mesh power save mode (no change when set
 *        to unknown)
 * @capability: station capability
 * @ext_capab: extended capabilities of the station
 * @ext_capab_len: number of extended capabilities
 * @supported_channels: supported channels in IEEE 802.11 format
 * @supported_channels_len: number of supported channels
 * @supported_oper_classes: supported oper classes in IEEE 802.11 format
 * @supported_oper_classes_len: number of supported operating classes
 * @support_p2p_ps: information if station supports P2P PS mechanism
 * @airtime_weight: airtime scheduler weight for this station
 * @link_sta_params: link related params.
 */
struct station_parameters {
        struct net_device *vlan;
        u32 sta_flags_mask, sta_flags_set;
        u32 sta_modify_mask;
        int listen_interval;
        u16 aid;
        u16 vlan_id;
        u16 peer_aid;
        u8 plink_action;
        u8 plink_state;
        u8 uapsd_queues;
        u8 max_sp;
        enum nl80211_mesh_power_mode local_pm;
        u16 capability;
        const u8 *ext_capab;
        u8 ext_capab_len;
        const u8 *supported_channels;
        u8 supported_channels_len;
        const u8 *supported_oper_classes;
        u8 supported_oper_classes_len;
        int support_p2p_ps;
        u16 airtime_weight;
        struct link_station_parameters link_sta_params;
};

/**
 * struct station_del_parameters - station deletion parameters
 *
 * Used to delete a station entry (or all stations).
 *
 * @mac: MAC address of the station to remove or %NULL to remove all stations
 * @subtype: Management frame subtype to use for indicating removal
 *        (10 = Disassociation, 12 = Deauthentication)
 * @reason_code: Reason code for the Disassociation/Deauthentication frame
 * @link_id: Link ID indicating a link that stations to be flushed must be
 *        using; valid only for MLO, but can also be -1 for MLO to really
 *        remove all stations.
 */
struct station_del_parameters {
        const u8 *mac;
        u8 subtype;
        u16 reason_code;
        int link_id;
};

/**
 * enum cfg80211_station_type - the type of station being modified
 * @CFG80211_STA_AP_CLIENT: client of an AP interface
 * @CFG80211_STA_AP_CLIENT_UNASSOC: client of an AP interface that is still
 *        unassociated (updating properties for this type of client is
 *        permitted)
 * @CFG80211_STA_AP_MLME_CLIENT: client of an AP interface that has
 *        the AP MLME in the device
 * @CFG80211_STA_AP_STA: AP station on managed interface
 * @CFG80211_STA_IBSS: IBSS station
 * @CFG80211_STA_TDLS_PEER_SETUP: TDLS peer on managed interface (dummy entry
 *        while TDLS setup is in progress, it moves out of this state when
 *        being marked authorized; use this only if TDLS with external setup is
 *        supported/used)
 * @CFG80211_STA_TDLS_PEER_ACTIVE: TDLS peer on managed interface (active
 *        entry that is operating, has been marked authorized by userspace)
 * @CFG80211_STA_MESH_PEER_KERNEL: peer on mesh interface (kernel managed)
 * @CFG80211_STA_MESH_PEER_USER: peer on mesh interface (user managed)
 */
enum cfg80211_station_type {
        CFG80211_STA_AP_CLIENT,
        CFG80211_STA_AP_CLIENT_UNASSOC,
        CFG80211_STA_AP_MLME_CLIENT,
        CFG80211_STA_AP_STA,
        CFG80211_STA_IBSS,
        CFG80211_STA_TDLS_PEER_SETUP,
        CFG80211_STA_TDLS_PEER_ACTIVE,
        CFG80211_STA_MESH_PEER_KERNEL,
        CFG80211_STA_MESH_PEER_USER,
};

/**
 * cfg80211_check_station_change - validate parameter changes
 * @wiphy: the wiphy this operates on
 * @params: the new parameters for a station
 * @statype: the type of station being modified
 *
 * Utility function for the @change_station driver method. Call this function
 * with the appropriate station type after looking up the station (and checking
 * that it exists). It will verify whether the station change is acceptable.
 *
 * Return: 0 if the change is acceptable, otherwise an error code. Note that
 * it may modify the parameters for backward compatibility reasons, so don't
 * use them before calling this.
 */
int cfg80211_check_station_change(struct wiphy *wiphy,
                                  struct station_parameters *params,
                                  enum cfg80211_station_type statype);

/**
 * enum rate_info_flags - bitrate info flags
 *
 * Used by the driver to indicate the specific rate transmission
 * type, i.e. which kind of MCS (HT, VHT, DMG/EDMG, HE, EHT or S1G) is
 * reported and whether the short guard interval is used.
 *
 * @RATE_INFO_FLAGS_MCS: mcs field filled with HT MCS
 * @RATE_INFO_FLAGS_VHT_MCS: mcs field filled with VHT MCS
 * @RATE_INFO_FLAGS_SHORT_GI: 400ns guard interval
 * @RATE_INFO_FLAGS_DMG: 60GHz MCS
 * @RATE_INFO_FLAGS_HE_MCS: HE MCS information
 * @RATE_INFO_FLAGS_EDMG: 60GHz MCS in EDMG mode
 * @RATE_INFO_FLAGS_EXTENDED_SC_DMG: 60GHz extended SC MCS
 * @RATE_INFO_FLAGS_EHT_MCS: EHT MCS information
 * @RATE_INFO_FLAGS_S1G_MCS: MCS field filled with S1G MCS
 */
enum rate_info_flags {
        RATE_INFO_FLAGS_MCS                        = BIT(0),
        RATE_INFO_FLAGS_VHT_MCS                        = BIT(1),
        RATE_INFO_FLAGS_SHORT_GI                = BIT(2),
        RATE_INFO_FLAGS_DMG                        = BIT(3),
        RATE_INFO_FLAGS_HE_MCS                        = BIT(4),
        RATE_INFO_FLAGS_EDMG                        = BIT(5),
        RATE_INFO_FLAGS_EXTENDED_SC_DMG                = BIT(6),
        RATE_INFO_FLAGS_EHT_MCS                        = BIT(7),
        RATE_INFO_FLAGS_S1G_MCS                        = BIT(8),
};

/**
 * enum rate_info_bw - rate bandwidth information
 *
 * Used by the driver to indicate the rate bandwidth.
 *
 * Note that the enumerators are not ordered by width: %RATE_INFO_BW_20
 * comes first and therefore has the value 0.
 *
 * @RATE_INFO_BW_5: 5 MHz bandwidth
 * @RATE_INFO_BW_10: 10 MHz bandwidth
 * @RATE_INFO_BW_20: 20 MHz bandwidth
 * @RATE_INFO_BW_40: 40 MHz bandwidth
 * @RATE_INFO_BW_80: 80 MHz bandwidth
 * @RATE_INFO_BW_160: 160 MHz bandwidth
 * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation
 * @RATE_INFO_BW_320: 320 MHz bandwidth
 * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT RU allocation
 * @RATE_INFO_BW_1: 1 MHz bandwidth
 * @RATE_INFO_BW_2: 2 MHz bandwidth
 * @RATE_INFO_BW_4: 4 MHz bandwidth
 * @RATE_INFO_BW_8: 8 MHz bandwidth
 * @RATE_INFO_BW_16: 16 MHz bandwidth
 */
enum rate_info_bw {
        RATE_INFO_BW_20 = 0,
        RATE_INFO_BW_5,
        RATE_INFO_BW_10,
        RATE_INFO_BW_40,
        RATE_INFO_BW_80,
        RATE_INFO_BW_160,
        RATE_INFO_BW_HE_RU,
        RATE_INFO_BW_320,
        RATE_INFO_BW_EHT_RU,
        RATE_INFO_BW_1,
        RATE_INFO_BW_2,
        RATE_INFO_BW_4,
        RATE_INFO_BW_8,
        RATE_INFO_BW_16,
};

/**
 * struct rate_info - bitrate information
 *
 * Information about a receiving or transmitting bitrate
 *
 * @flags: bitflag of flags from &enum rate_info_flags
 * @legacy: bitrate in 100kbit/s for 802.11abg
 * @mcs: MCS index if struct describes an HT/VHT/HE/EHT/S1G rate
 * @nss: number of streams (VHT & HE only)
 * @bw: bandwidth (from &enum rate_info_bw)
 * @he_gi: HE guard interval (from &enum nl80211_he_gi)
 * @he_dcm: HE DCM value
 * @he_ru_alloc: HE RU allocation (from &enum nl80211_he_ru_alloc,
 *        only valid if bw is %RATE_INFO_BW_HE_RU)
 * @n_bonded_ch: In case of EDMG the number of bonded channels (1-4)
 * @eht_gi: EHT guard interval (from &enum nl80211_eht_gi)
 * @eht_ru_alloc: EHT RU allocation (from &enum nl80211_eht_ru_alloc,
 *        only valid if bw is %RATE_INFO_BW_EHT_RU)
 */
struct rate_info {
        u16 flags;
        u16 legacy;
        u8 mcs;
        u8 nss;
        u8 bw;
        u8 he_gi;
        u8 he_dcm;
        u8 he_ru_alloc;
        u8 n_bonded_ch;
        u8 eht_gi;
        u8 eht_ru_alloc;
};

/**
 * enum bss_param_flags - BSS parameter flags
 *
 * Bit values for the flags member of &struct sta_bss_parameters, each
 * indicating whether the corresponding BSS feature is enabled.
 *
 * @BSS_PARAM_FLAGS_CTS_PROT: whether CTS protection is enabled
 * @BSS_PARAM_FLAGS_SHORT_PREAMBLE: whether short preamble is enabled
 * @BSS_PARAM_FLAGS_SHORT_SLOT_TIME: whether short slot time is enabled
 */
enum bss_param_flags {
        BSS_PARAM_FLAGS_CTS_PROT        = 1<<0,
        BSS_PARAM_FLAGS_SHORT_PREAMBLE        = 1<<1,
        BSS_PARAM_FLAGS_SHORT_SLOT_TIME        = 1<<2,
};

/**
 * struct sta_bss_parameters - BSS parameters for the attached station
 *
 * Information about the currently associated BSS
 *
 * @flags: bitmask of flags from &enum bss_param_flags
 * @dtim_period: DTIM period for the BSS
 * @beacon_interval: beacon interval
 */
struct sta_bss_parameters {
        u8 flags;
        u8 dtim_period;
        u16 beacon_interval;
};

/**
 * struct cfg80211_txq_stats - TXQ statistics for this TID
 *
 * All members, including the byte and packet counters, are 32 bits wide.
 *
 * @filled: bitmap of flags using the bits of &enum nl80211_txq_stats to
 *        indicate the relevant values in this struct are filled
 * @backlog_bytes: total number of bytes currently backlogged
 * @backlog_packets: total number of packets currently backlogged
 * @flows: number of new flows seen
 * @drops: total number of packets dropped
 * @ecn_marks: total number of packets marked with ECN CE
 * @overlimit: number of drops due to queue space overflow
 * @overmemory: number of drops due to memory limit overflow
 * @collisions: number of hash collisions
 * @tx_bytes: total number of bytes dequeued
 * @tx_packets: total number of packets dequeued
 * @max_flows: maximum number of flows supported
 */
struct cfg80211_txq_stats {
        u32 filled;
        u32 backlog_bytes;
        u32 backlog_packets;
        u32 flows;
        u32 drops;
        u32 ecn_marks;
        u32 overlimit;
        u32 overmemory;
        u32 collisions;
        u32 tx_bytes;
        u32 tx_packets;
        u32 max_flows;
};

/**
 * struct cfg80211_tid_stats - per-TID statistics
 * @filled: bitmap of flags using the bits of &enum nl80211_tid_stats to
 *        indicate the relevant values in this struct are filled
 * @rx_msdu: number of received MSDUs
 * @tx_msdu: number of (attempted) transmitted MSDUs
 * @tx_msdu_retries: number of retries (not counting the first) for
 *        transmitted MSDUs
 * @tx_msdu_failed: number of failed transmitted MSDUs
 * @txq_stats: TXQ statistics (see &struct cfg80211_txq_stats)
 */
struct cfg80211_tid_stats {
        u32 filled;
        u64 rx_msdu;
        u64 tx_msdu;
        u64 tx_msdu_retries;
        u64 tx_msdu_failed;
        struct cfg80211_txq_stats txq_stats;
};

/* Number of entries in the per-chain signal arrays of struct station_info */
#define IEEE80211_MAX_CHAINS        4

/**
 * struct station_info - station information
 *
 * Station information filled by driver for get_station() and dump_station.
 *
 * @filled: bitflag of flags using the bits of &enum nl80211_sta_info to
 *        indicate the relevant values in this struct for them
 * @connected_time: time (in seconds) since the station was last connected
 * @inactive_time: time since last station activity (tx/rx) in milliseconds
 * @assoc_at: boottime (ns) of the last association
 * @rx_bytes: bytes (size of MPDUs) received from this station
 * @tx_bytes: bytes (size of MPDUs) transmitted to this station
 * @llid: mesh local link id
 * @plid: mesh peer link id
 * @plink_state: mesh peer link state
 * @signal: The signal strength, type depends on the wiphy's signal_type.
 *        For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_.
 * @signal_avg: Average signal strength, type depends on the wiphy's signal_type.
 *        For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_.
 * @chains: bitmask for filled values in @chain_signal, @chain_signal_avg
 * @chain_signal: per-chain signal strength of last received packet in dBm
 * @chain_signal_avg: per-chain signal strength average in dBm
 * @txrate: current unicast bitrate from this station
 * @rxrate: current unicast bitrate to this station
 * @rx_packets: packets (MSDUs & MMPDUs) received from this station
 * @tx_packets: packets (MSDUs & MMPDUs) transmitted to this station
 * @tx_retries: cumulative retry counts (MPDUs)
 * @tx_failed: number of failed transmissions (MPDUs) (retries exceeded, no ACK)
 * @rx_dropped_misc: Dropped for unspecified reason.
 * @bss_param: current BSS parameters
 * @generation: generation number for nl80211 dumps.
 *        This number should increase every time the list of stations
 *        changes, i.e. when a station is added or removed, so that
 *        userspace can tell whether it got a consistent snapshot.
 * @assoc_req_ies: IEs from (Re)Association Request.
 *        This is used only when in AP mode with drivers that do not use
 *        user space MLME/SME implementation. The information is provided for
 *        the cfg80211_new_sta() calls to notify user space of the IEs.
 * @assoc_req_ies_len: Length of @assoc_req_ies buffer in octets.
 * @sta_flags: station flags mask & values
 * @beacon_loss_count: Number of times beacon loss event has triggered.
 * @t_offset: Time offset of the station relative to this host.
 * @local_pm: local mesh STA power save mode
 * @peer_pm: peer mesh STA power save mode
 * @nonpeer_pm: non-peer mesh STA power save mode
 * @expected_throughput: expected throughput in kbps (including 802.11 headers)
 *        towards this station.
 * @rx_beacon: number of beacons received from this peer
 * @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received
 *        from this peer
 * @connected_to_gate: true if mesh STA has a path to mesh gate
 * @rx_duration: aggregate PPDU duration (usecs) for all the frames from a peer
 * @tx_duration: aggregate PPDU duration (usecs) for all the frames to a peer
 * @airtime_weight: current airtime scheduling weight
 * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last
 *        (IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs.
 *        Note that this doesn't use the @filled bit, but is used if non-NULL.
 * @ack_signal: signal strength (in dBm) of the last ACK frame.
 * @avg_ack_signal: average RSSI value of the ACK packets for the number of
 *        MSDUs that have been sent.
 * @rx_mpdu_count: number of MPDUs received from this station
 * @fcs_err_count: number of packets (MPDUs) received from this station with
 *        an FCS error. This counter should be incremented only when TA of the
 *        received packet with an FCS error matches the peer MAC address.
 * @airtime_link_metric: mesh airtime link metric.
 * @connected_to_as: true if mesh STA has a path to authentication server
 * @mlo_params_valid: Indicates @assoc_link_id and @mld_addr fields are filled
 *        by driver. Drivers use this only in cfg80211_new_sta() calls when AP
 *        MLD's MLME/SME is offloaded to the driver. Drivers won't fill this
 *        information in cfg80211_del_sta_sinfo(), get_station() and
 *        dump_station() callbacks.
 * @assoc_link_id: Indicates MLO link ID of the AP, with which the station
 *        completed (re)association. This information is filled for both MLO
 *        and non-MLO STA connections when the AP is affiliated with an MLD.
 * @mld_addr: For MLO STA connection, filled with MLD address of the station.
 *        For non-MLO STA connection, filled with all zeros.
 * @assoc_resp_ies: IEs from (Re)Association Response.
 *        This is used only when in AP mode with drivers that do not use user
 *        space MLME/SME implementation. The information is provided only for the
 *        cfg80211_new_sta() calls to notify user space of the IEs. Drivers won't
 *        fill this information in cfg80211_del_sta_sinfo(), get_station() and
 *        dump_station() callbacks. User space needs this information to determine
 *        the accepted and rejected affiliated links of the connected station.
 * @assoc_resp_ies_len: Length of @assoc_resp_ies buffer in octets.
 */
struct station_info {
        u64 filled;
        u32 connected_time;
        u32 inactive_time;
        u64 assoc_at;
        u64 rx_bytes;
        u64 tx_bytes;
        u16 llid;
        u16 plid;
        u8 plink_state;
        s8 signal;
        s8 signal_avg;

        u8 chains;
        s8 chain_signal[IEEE80211_MAX_CHAINS];
        s8 chain_signal_avg[IEEE80211_MAX_CHAINS];

        struct rate_info txrate;
        struct rate_info rxrate;
        u32 rx_packets;
        u32 tx_packets;
        u32 tx_retries;
        u32 tx_failed;
        u32 rx_dropped_misc;
        struct sta_bss_parameters bss_param;
        struct nl80211_sta_flag_update sta_flags;

        int generation;

        const u8 *assoc_req_ies;
        size_t assoc_req_ies_len;

        u32 beacon_loss_count;
        s64 t_offset;
        enum nl80211_mesh_power_mode local_pm;
        enum nl80211_mesh_power_mode peer_pm;
        enum nl80211_mesh_power_mode nonpeer_pm;

        u32 expected_throughput;

        u64 tx_duration;
        u64 rx_duration;
        u64 rx_beacon;
        /*
         * NOTE(review): documented as dBm but stored unsigned, unlike the
         * s8 signal members above - confirm how the value is encoded.
         */
        u8 rx_beacon_signal_avg;
        u8 connected_to_gate;

        struct cfg80211_tid_stats *pertid;
        s8 ack_signal;
        s8 avg_ack_signal;

        u16 airtime_weight;

        u32 rx_mpdu_count;
        u32 fcs_err_count;

        u32 airtime_link_metric;

        u8 connected_to_as;

        bool mlo_params_valid;
        u8 assoc_link_id;
        u8 mld_addr[ETH_ALEN] __aligned(2);
        const u8 *assoc_resp_ies;
        size_t assoc_resp_ies_len;
};

/**
 * struct cfg80211_sar_sub_specs - sub specs limit
 * @power: power limitation in units of 0.25 dBm
 * @freq_range_index: index of the frequency range the power limitation
 *        applies to (presumably an index into the @freq_ranges array of
 *        &struct cfg80211_sar_capa - TODO confirm)
 */
struct cfg80211_sar_sub_specs {
        s32 power;
        u32 freq_range_index;
};

/**
 * struct cfg80211_sar_specs - SAR limit specs
 * @type: it's set with power in 0.25 dBm or other types
 * @num_sub_specs: number of SAR sub specs, i.e. the element count of
 *        @sub_specs
 * @sub_specs: flexible array holding the SAR sub specs; must remain the
 *        last member
 */
struct cfg80211_sar_specs {
        enum nl80211_sar_type type;
        u32 num_sub_specs;
        struct cfg80211_sar_sub_specs sub_specs[] __counted_by(num_sub_specs);
};


/**
 * struct cfg80211_sar_freq_ranges - SAR frequency ranges
 * @start_freq:  start range edge frequency
 * @end_freq:    end range edge frequency
 *
 * Describes a single range; &struct cfg80211_sar_capa points to these
 * through its @freq_ranges member.
 */
struct cfg80211_sar_freq_ranges {
        u32 start_freq;
        u32 end_freq;
};

/**
 * struct cfg80211_sar_capa - SAR limit capability
 * @type: it's set via power in 0.25 dBm or other types
 * @num_freq_ranges: number of frequency ranges
 * @freq_ranges: pointer to the frequency ranges (read-only through this
 *        struct; presumably @num_freq_ranges entries - TODO confirm)
 *
 * Note: WLAN driver may append new ranges or split an existing
 * range to small ones and then append them.
 */
struct cfg80211_sar_capa {
        enum nl80211_sar_type type;
        u32 num_freq_ranges;
        const struct cfg80211_sar_freq_ranges *freq_ranges;
};

#if IS_ENABLED(CONFIG_CFG80211)
/**
 * cfg80211_get_station - retrieve information about a given station
 * @dev: the device where the station is supposed to be connected to
 * @mac_addr: the mac address of the station of interest
 * @sinfo: pointer to the structure to fill with the information
 *
 * Return: 0 on success, in which case @sinfo is filled with the available
 * information; otherwise a negative error code, in which case the content of
 * @sinfo has to be considered undefined.
 */
int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr,
                         struct station_info *sinfo);
#else
/*
 * Stub used when CONFIG_CFG80211 is not enabled: it ignores its arguments,
 * leaves @sinfo untouched and always returns -ENOENT.
 */
static inline int cfg80211_get_station(struct net_device *dev,
                                       const u8 *mac_addr,
                                       struct station_info *sinfo)
{
        return -ENOENT;
}
#endif

/**
 * enum monitor_flags - monitor flags
 *
 * Configuration flags for monitor interfaces. Every value is a single bit
 * whose position is the matching nl80211 monitor flag, so these must stay in
 * step with the nl80211 flags.
 *
 * @MONITOR_FLAG_CHANGED: set if the flags were changed
 * @MONITOR_FLAG_FCSFAIL: pass frames with bad FCS
 * @MONITOR_FLAG_PLCPFAIL: pass frames with bad PLCP
 * @MONITOR_FLAG_CONTROL: pass control frames
 * @MONITOR_FLAG_OTHER_BSS: disable BSSID filtering
 * @MONITOR_FLAG_COOK_FRAMES: report frames after processing
 * @MONITOR_FLAG_ACTIVE: active monitor, ACKs frames on its MAC address
 */
enum monitor_flags {
        MONITOR_FLAG_CHANGED     = (1 << __NL80211_MNTR_FLAG_INVALID),
        MONITOR_FLAG_FCSFAIL     = (1 << NL80211_MNTR_FLAG_FCSFAIL),
        MONITOR_FLAG_PLCPFAIL    = (1 << NL80211_MNTR_FLAG_PLCPFAIL),
        MONITOR_FLAG_CONTROL     = (1 << NL80211_MNTR_FLAG_CONTROL),
        MONITOR_FLAG_OTHER_BSS   = (1 << NL80211_MNTR_FLAG_OTHER_BSS),
        MONITOR_FLAG_COOK_FRAMES = (1 << NL80211_MNTR_FLAG_COOK_FRAMES),
        MONITOR_FLAG_ACTIVE      = (1 << NL80211_MNTR_FLAG_ACTIVE),
};

/**
 * enum mpath_info_flags -  mesh path information flags
 *
 * Used by the driver to indicate which info in &struct mpath_info it has filled
 * in during get_mpath() or dump_mpath().
 *
 * @MPATH_INFO_FRAME_QLEN: @frame_qlen filled
 * @MPATH_INFO_SN: @sn filled
 * @MPATH_INFO_METRIC: @metric filled
 * @MPATH_INFO_EXPTIME: @exptime filled
 * @MPATH_INFO_DISCOVERY_TIMEOUT: @discovery_timeout filled
 * @MPATH_INFO_DISCOVERY_RETRIES: @discovery_retries filled
 * @MPATH_INFO_FLAGS: @flags filled
 * @MPATH_INFO_HOP_COUNT: @hop_count filled
 * @MPATH_INFO_PATH_CHANGE: @path_change_count filled
 */
enum mpath_info_flags {
        MPATH_INFO_FRAME_QLEN                = BIT(0),
        MPATH_INFO_SN                        = BIT(1),
        MPATH_INFO_METRIC                = BIT(2),
        MPATH_INFO_EXPTIME                = BIT(3),
        MPATH_INFO_DISCOVERY_TIMEOUT        = BIT(4),
        MPATH_INFO_DISCOVERY_RETRIES        = BIT(5),
        MPATH_INFO_FLAGS                = BIT(6),
        MPATH_INFO_HOP_COUNT                = BIT(7),
        MPATH_INFO_PATH_CHANGE                = BIT(8),
};

/**
 * struct mpath_info - mesh path information
 *
 * Mesh path information filled by driver for get_mpath() and dump_mpath().
 * Which members carry data is indicated by the bits set in @filled.
 *
 * @filled: bitfield of flags from &enum mpath_info_flags
 * @frame_qlen: number of queued frames for this destination
 * @sn: target sequence number
 * @metric: metric (cost) of this mesh path
 * @exptime: expiration time for the mesh path from now, in msecs
 * @flags: mesh path flags from &enum mesh_path_flags
 * @discovery_timeout: total mesh path discovery timeout, in msecs
 * @discovery_retries: mesh path discovery retries
 * @generation: generation number for nl80211 dumps.
 *        This number should increase every time the list of mesh paths
 *        changes, i.e. when a station is added or removed, so that
 *        userspace can tell whether it got a consistent snapshot.
 * @hop_count: hops to destination
 * @path_change_count: total number of path changes to destination
 */
struct mpath_info {
        u32 filled;
        u32 frame_qlen;
        u32 sn;
        u32 metric;
        u32 exptime;
        u32 discovery_timeout;
        u8 discovery_retries;
        u8 flags;
        u8 hop_count;
        u32 path_change_count;

        int generation;
};

/**
 * struct bss_parameters - BSS parameters
 *
 * Used to change BSS parameters (mainly for AP mode). Most members are signed
 * so that -1 can be used to mean "do not change", as listed per member below.
 *
 * @link_id: link_id or -1 for non-MLD
 * @use_cts_prot: Whether to use CTS protection
 *        (0 = no, 1 = yes, -1 = do not change)
 * @use_short_preamble: Whether the use of short preambles is allowed
 *        (0 = no, 1 = yes, -1 = do not change)
 * @use_short_slot_time: Whether the use of short slot time is allowed
 *        (0 = no, 1 = yes, -1 = do not change)
 * @basic_rates: basic rates in IEEE 802.11 format
 *        (or NULL for no change)
 * @basic_rates_len: number of basic rates
 * @ap_isolate: do not forward packets between connected stations
 *        (0 = no, 1 = yes, -1 = do not change)
 * @ht_opmode: HT Operation mode
 *        (u16 = opmode, -1 = do not change)
 * @p2p_ctwindow: P2P CT Window (-1 = no change)
 * @p2p_opp_ps: P2P opportunistic PS (-1 = no change)
 */
struct bss_parameters {
        int link_id;
        int use_cts_prot;
        int use_short_preamble;
        int use_short_slot_time;
        const u8 *basic_rates;
        u8 basic_rates_len;
        int ap_isolate;
        int ht_opmode;
        s8 p2p_ctwindow, p2p_opp_ps;
};

/**
 * struct mesh_config - 802.11s mesh configuration
 *
 * These parameters can be changed while the mesh is active.
 *
 * NOTE(review): the descriptions of @dot11MeshConfirmTimeout and
 * @dot11MeshHoldingTimeout below look shifted (the former repeats the
 * retry-timeout text, the latter speaks of a "confirm timeout"); verify them
 * against the nl80211 mesh configuration documentation.
 *
 * @dot11MeshRetryTimeout: the initial retry timeout in millisecond units used
 *        by the Mesh Peering Open message
 * @dot11MeshConfirmTimeout: the initial retry timeout in millisecond units
 *        used by the Mesh Peering Open message
 * @dot11MeshHoldingTimeout: the confirm timeout in millisecond units used by
 *        the mesh peering management to close a mesh peering
 * @dot11MeshMaxPeerLinks: the maximum number of peer links allowed on this
 *        mesh interface
 * @dot11MeshMaxRetries: the maximum number of peer link open retries that can
 *        be sent to establish a new peer link instance in a mesh
 * @dot11MeshTTL: the value of TTL field set at a source mesh STA
 * @element_ttl: the value of TTL field set at a mesh STA for path selection
 *        elements
 * @auto_open_plinks: whether we should automatically open peer links when we
 *        detect compatible mesh peers
 * @dot11MeshNbrOffsetMaxNeighbor: the maximum number of neighbors to
 *        synchronize to for 11s default synchronization method
 * @dot11MeshHWMPmaxPREQretries: the number of action frames containing a PREQ
 *        that an originator mesh STA can send to a particular path target
 * @path_refresh_time: how frequently to refresh mesh paths in milliseconds
 * @min_discovery_timeout: the minimum length of time to wait until giving up on
 *        a path discovery in milliseconds
 * @dot11MeshHWMPactivePathTimeout: the time (in TUs) for which mesh STAs
 *        receiving a PREQ shall consider the forwarding information from the
 *        root to be valid. (TU = time unit)
 * @dot11MeshHWMPpreqMinInterval: the minimum interval of time (in TUs) during
 *        which a mesh STA can send only one action frame containing a PREQ
 *        element
 * @dot11MeshHWMPperrMinInterval: the minimum interval of time (in TUs) during
 *        which a mesh STA can send only one Action frame containing a PERR
 *        element
 * @dot11MeshHWMPnetDiameterTraversalTime: the interval of time (in TUs) that
 *        it takes for an HWMP information element to propagate across the mesh
 * @dot11MeshHWMPRootMode: the configuration of a mesh STA as root mesh STA
 * @dot11MeshHWMPRannInterval: the interval of time (in TUs) between root
 *        announcements are transmitted
 * @dot11MeshGateAnnouncementProtocol: whether to advertise that this mesh
 *        station has access to a broader network beyond the MBSS. (This is
 *        misnamed in draft 12.0: dot11MeshGateAnnouncementProtocol set to true
 *        only means that the station will announce to others that it is a mesh
 *        gate, but not necessarily using the gate announcement protocol. Still
 *        keeping the same nomenclature to be in sync with the spec)
 * @dot11MeshForwarding: whether the Mesh STA is forwarding or non-forwarding
 *        entity (default is TRUE - forwarding entity)
 * @rssi_threshold: the threshold for average signal strength of candidate
 *        station to establish a peer link
 * @ht_opmode: mesh HT protection mode
 *
 * @dot11MeshHWMPactivePathToRootTimeout: The time (in TUs) for which mesh STAs
 *        receiving a proactive PREQ shall consider the forwarding information to
 *        the root mesh STA to be valid.
 *
 * @dot11MeshHWMProotInterval: The interval of time (in TUs) between proactive
 *        PREQs are transmitted.
 * @dot11MeshHWMPconfirmationInterval: The minimum interval of time (in TUs)
 *        during which a mesh STA can send only one Action frame containing
 *        a PREQ element for root path confirmation.
 * @power_mode: The default mesh power save mode which will be the initial
 *        setting for new peer links.
 * @dot11MeshAwakeWindowDuration: The duration in TUs the STA will remain awake
 *        after transmitting its beacon.
 * @plink_timeout: If no tx activity is seen from a STA we've established
 *        peering with for longer than this time (in seconds), then remove it
 *        from the STA's list of peers.  Default is 30 minutes.
 * @dot11MeshConnectedToAuthServer: if set to true then this mesh STA
 *        will advertise that it is connected to an authentication server
 *        in the mesh formation field.
 * @dot11MeshConnectedToMeshGate: if set to true, advertise that this STA is
 *      connected to a mesh gate in mesh formation info.  If false, the
 *      value in mesh formation is determined by the presence of root paths
 *      in the mesh path table
 * @dot11MeshNolearn: Try to avoid multi-hop path discovery (e.g. PREQ/PREP
 *      for HWMP) if the destination is a direct neighbor. Note that this might
 *      not be the optimal decision as a multi-hop route might be better. So
 *      if using this setting you will likely also want to disable
 *      dot11MeshForwarding and use another mesh routing protocol on top.
 */
struct mesh_config {
        u16 dot11MeshRetryTimeout;
        u16 dot11MeshConfirmTimeout;
        u16 dot11MeshHoldingTimeout;
        u16 dot11MeshMaxPeerLinks;
        u8 dot11MeshMaxRetries;
        u8 dot11MeshTTL;
        u8 element_ttl;
        bool auto_open_plinks;
        u32 dot11MeshNbrOffsetMaxNeighbor;
        u8 dot11MeshHWMPmaxPREQretries;
        u32 path_refresh_time;
        u16 min_discovery_timeout;
        u32 dot11MeshHWMPactivePathTimeout;
        u16 dot11MeshHWMPpreqMinInterval;
        u16 dot11MeshHWMPperrMinInterval;
        u16 dot11MeshHWMPnetDiameterTraversalTime;
        u8 dot11MeshHWMPRootMode;
        bool dot11MeshConnectedToMeshGate;
        bool dot11MeshConnectedToAuthServer;
        u16 dot11MeshHWMPRannInterval;
        bool dot11MeshGateAnnouncementProtocol;
        bool dot11MeshForwarding;
        s32 rssi_threshold;
        u16 ht_opmode;
        u32 dot11MeshHWMPactivePathToRootTimeout;
        u16 dot11MeshHWMProotInterval;
        u16 dot11MeshHWMPconfirmationInterval;
        enum nl80211_mesh_power_mode power_mode;
        u16 dot11MeshAwakeWindowDuration;
        u32 plink_timeout;
        bool dot11MeshNolearn;
};

/**
 * struct mesh_setup - 802.11s mesh setup configuration
 * @chandef: defines the channel to use
 * @mesh_id: the mesh ID
 * @mesh_id_len: length of the mesh ID, at least 1 and at most 32 bytes
 * @sync_method: which synchronization method to use
 * @path_sel_proto: which path selection protocol to use
 * @path_metric: which metric to use
 * @auth_id: which authentication method this mesh is using
 * @ie: vendor information elements (optional)
 * @ie_len: length of vendor information elements
 * @is_authenticated: this mesh requires authentication
 * @is_secure: this mesh uses security
 * @user_mpm: userspace handles all MPM functions
 * @dtim_period: DTIM period to use
 * @beacon_interval: beacon interval to use
 * @mcast_rate: multicast rate for Mesh Node, one entry per band
 *        [6Mbps is the default for 802.11a]
 * @basic_rates: basic rates to use when creating the mesh
 * @beacon_rate: bitrate to be used for beacons
 * @userspace_handles_dfs: whether user space controls DFS operation, i.e.
 *        changes the channel when a radar is detected. This is required
 *        to operate on DFS channels.
 * @control_port_over_nl80211: TRUE if userspace expects to exchange control
 *        port frames over NL80211 instead of the network interface.
 *
 * These parameters are fixed when the mesh is created.
 */
struct mesh_setup {
        struct cfg80211_chan_def chandef;
        const u8 *mesh_id;
        u8 mesh_id_len;
        u8 sync_method;
        u8 path_sel_proto;
        u8 path_metric;
        u8 auth_id;
        const u8 *ie;
        u8 ie_len;
        bool is_authenticated;
        bool is_secure;
        bool user_mpm;
        u8 dtim_period;
        u16 beacon_interval;
        int mcast_rate[NUM_NL80211_BANDS];
        u32 basic_rates;
        struct cfg80211_bitrate_mask beacon_rate;
        bool userspace_handles_dfs;
        bool control_port_over_nl80211;
};

/**
 * struct ocb_setup - 802.11p OCB mode setup configuration
 * @chandef: defines the channel to use
 *
 * These parameters are fixed when connecting to the network. The channel
 * definition is currently the only one.
 */
struct ocb_setup {
        struct cfg80211_chan_def chandef;
};

/**
 * struct ieee80211_txq_params - TX queue parameters
 * @ac: AC identifier, a value of &enum nl80211_ac
 * @txop: Maximum burst time in units of 32 usecs, 0 meaning disabled
 * @cwmin: Minimum contention window [a value of the form 2^n-1 in the range
 *        1..32767]
 * @cwmax: Maximum contention window [a value of the form 2^n-1 in the range
 *        1..32767]
 * @aifs: Arbitration interframe space [0..255]
 * @link_id: link_id or -1 for non-MLD
 */
struct ieee80211_txq_params {
        enum nl80211_ac ac;
        u16 txop;
        u16 cwmin;
        u16 cwmax;
        u8 aifs;
        int link_id;
};

/**
 * DOC: Scanning and BSS list handling
 *
 * The scanning process itself is fairly simple, but cfg80211 offers quite
 * a bit of helper functionality. To start a scan, the scan operation will
 * be invoked with a scan definition. This scan definition contains the
 * channels to scan, and the SSIDs to send probe requests for (including the
 * wildcard, if desired). A passive scan is indicated by having no SSIDs to
 * probe. Additionally, a scan request may contain extra information elements
 * that should be added to the probe request. The IEs are guaranteed to be
 * well-formed, and will not exceed the maximum length the driver advertised
 * in the wiphy structure.
 *
 * When scanning finds a BSS, cfg80211 needs to be notified of that, because
 * it is responsible for maintaining the BSS list; the driver should not
 * maintain a list itself. For this notification, various functions exist.
 *
 * Since drivers do not maintain a BSS list, there are also a number of
 * functions to search for a BSS and obtain information about it from the
 * BSS structure cfg80211 maintains. The BSS list is also made available
 * to userspace.
 */

/**
 * struct cfg80211_ssid - SSID description
 * @ssid: the SSID, held in a buffer of IEEE80211_MAX_SSID_LEN octets
 * @ssid_len: length of the SSID stored in @ssid
 */
struct cfg80211_ssid {
        u8 ssid[IEEE80211_MAX_SSID_LEN];
        u8 ssid_len;
};

/**
 * struct cfg80211_scan_info - information about completed scan
 * @scan_start_tsf: scan start time in terms of the TSF of the BSS that the
 *        wireless device that requested the scan is connected to. If this
 *        information is not available, this field is left zero.
 * @tsf_bssid: the BSSID according to which @scan_start_tsf is set.
 * @aborted: set to true if the scan was aborted for any reason,
 *        userspace will be notified of that
 */
struct cfg80211_scan_info {
        u64 scan_start_tsf;
        u8 tsf_bssid[ETH_ALEN] __aligned(2);
        bool aborted;
};

/**
 * struct cfg80211_scan_6ghz_params - relevant for 6 GHz only
 *
 * @short_ssid: short SSID to scan for
 * @bssid: BSSID to scan for
 * @channel_idx: index, within the channel array of the scan request, of the
 *        channel which the above info is relevant to
 * @unsolicited_probe: the AP transmits unsolicited probe response every 20 TU
 * @short_ssid_valid: @short_ssid is valid and can be used
 * @psc_no_listen: when set, and the channel is a PSC channel, no need to wait
 *        20 TUs before starting to send probe requests.
 * @psd_20: The AP's 20 MHz PSD value.
 */
struct cfg80211_scan_6ghz_params {
        u32 short_ssid;
        u32 channel_idx;
        u8 bssid[ETH_ALEN];
        bool unsolicited_probe;
        bool short_ssid_valid;
        bool psc_no_listen;
        s8 psd_20;
};

/**
 * struct cfg80211_scan_request - scan request description
 *
 * @ssids: SSIDs to scan for (active scan only)
 * @n_ssids: number of SSIDs
 * @channels: channels to scan on; flexible array with @n_channels entries
 *        that must remain the last member
 * @n_channels: total number of channels to scan
 * @ie: optional information element(s) to add into Probe Request or %NULL
 * @ie_len: length of ie in octets
 * @duration: how long to listen on each channel, in TUs. If
 *        @duration_mandatory is not set, this is the maximum dwell time and
 *        the actual dwell time may be shorter.
 * @duration_mandatory: if set, the scan duration must be as specified by the
 *        @duration field.
 * @flags: control flags from &enum nl80211_scan_flags
 * @rates: bitmap of rates to advertise for each band
 * @wiphy: the wiphy this was for
 * @scan_start: time (in jiffies) when the scan started
 * @wdev: the wireless device to scan for
 * @info: (internal) information about completed scan
 * @notified: (internal) scan request was notified as done or aborted
 * @no_cck: used to send probe requests at non CCK rate in 2GHz band
 * @mac_addr: MAC address used with randomisation
 * @mac_addr_mask: MAC address mask used with randomisation, bits that
 *        are 0 in the mask should be randomised, bits that are 1 should
 *        be taken from the @mac_addr
 * @scan_6ghz: relevant for split scan request only,
 *        true if this is the second scan request
 * @n_6ghz_params: number of 6 GHz params
 * @scan_6ghz_params: 6 GHz params
 * @bssid: BSSID to scan for (most commonly, the wildcard BSSID)
 * @tsf_report_link_id: for MLO, indicates the link ID of the BSS that should be
 *      used for TSF reporting. Can be set to -1 to indicate no preference.
 */
struct cfg80211_scan_request {
        struct cfg80211_ssid *ssids;
        int n_ssids;
        u32 n_channels;
        const u8 *ie;
        size_t ie_len;
        u16 duration;
        bool duration_mandatory;
        u32 flags;

        u32 rates[NUM_NL80211_BANDS];

        struct wireless_dev *wdev;

        u8 mac_addr[ETH_ALEN] __aligned(2);
        u8 mac_addr_mask[ETH_ALEN] __aligned(2);
        u8 bssid[ETH_ALEN] __aligned(2);

        /* internal */
        struct wiphy *wiphy;
        unsigned long scan_start;
        struct cfg80211_scan_info info;
        bool notified;
        bool no_cck;
        bool scan_6ghz;
        u32 n_6ghz_params;
        struct cfg80211_scan_6ghz_params *scan_6ghz_params;
        s8 tsf_report_link_id;

        /* keep last: flexible array member counted by n_channels */
        struct ieee80211_channel *channels[] __counted_by(n_channels);
};

/**
 * get_random_mask_addr - build an address that is random outside a mask
 * @buf: output buffer; ETH_ALEN octets are written
 * @addr: address providing the octet bits that are kept
 * @mask: bits set to 1 are copied from @addr, bits set to 0 keep the value
 *        produced by get_random_bytes()
 */
static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask)
{
        int idx = 0;

        /* Start from fully random octets, then overlay the masked bits. */
        get_random_bytes(buf, ETH_ALEN);
        while (idx < ETH_ALEN) {
                buf[idx] = (buf[idx] & ~mask[idx]) | (addr[idx] & mask[idx]);
                idx++;
        }
}

/**
 * struct cfg80211_match_set - sets of attributes to match
 *
 * @ssid: SSID to be matched; may be zero-length in case of BSSID match
 *        or no match (RSSI only)
 * @bssid: BSSID to be matched; may be all-zero BSSID in case of SSID match
 *        or no match (RSSI only)
 * @rssi_thold: don't report scan results below this threshold (signed 32-bit
 *        value in dBm)
 */
struct cfg80211_match_set {
        struct cfg80211_ssid ssid;
        u8 bssid[ETH_ALEN];
        s32 rssi_thold;
};

/**
 * struct cfg80211_sched_scan_plan - scan plan for scheduled scan
 *
 * @interval: interval between scheduled scan iterations, in seconds.
 * @iterations: number of scan iterations in this scan plan. Zero means
 *        infinite loop.
 *        The last scan plan will always have this parameter set to zero,
 *        all other scan plans will have a finite number of iterations.
 */
struct cfg80211_sched_scan_plan {
        u32 interval;
        u32 iterations;
};

/**
 * struct cfg80211_bss_select_adjust - BSS selection with RSSI adjustment.
 *
 * @band: band of BSS which should match for RSSI level adjustment.
 * @delta: value of RSSI level adjustment (signed, so it may be negative).
 */
struct cfg80211_bss_select_adjust {
        enum nl80211_band band;
        s8 delta;
};

/**
 * struct cfg80211_sched_scan_request - scheduled scan request description
 *
 * @reqid: identifies this request.
 * @ssids: SSIDs to scan for (passed in the probe_reqs in active scans)
 * @n_ssids: number of SSIDs
 * @n_channels: total number of channels to scan
 * @ie: optional information element(s) to add into Probe Request or %NULL
 * @ie_len: length of ie in octets
 * @flags: control flags from &enum nl80211_scan_flags
 * @match_sets: sets of parameters to be matched for a scan result
 *        entry to be considered valid and to be passed to the host
 *        (others are filtered out).
 *        If omitted, all results are passed.
 * @n_match_sets: number of match sets
 * @report_results: indicates that results were reported for this request
 * @wiphy: the wiphy this was for
 * @dev: the interface
 * @scan_start: start time of the scheduled scan
 * @channels: channels to scan
 * @min_rssi_thold: for drivers only supporting a single threshold, this
 *        contains the minimum over all matchsets
 * @mac_addr: MAC address used with randomisation
 * @mac_addr_mask: MAC address mask used with randomisation, bits that
 *        are 0 in the mask should be randomised, bits that are 1 should
 *        be taken from the @mac_addr
 * @scan_plans: scan plans to be executed in this scheduled scan. Lowest
 *        index must be executed first.
 * @n_scan_plans: number of scan plans, at least 1.
 * @rcu_head: RCU callback used to free the struct
 * @owner_nlportid: netlink portid of owner (if this should is a request
 *        owned by a particular socket)
 * @nl_owner_dead: netlink owner socket was closed - this request be freed
 * @list: for keeping list of requests.
 * @delay: delay in seconds to use before starting the first scan
 *        cycle.  The driver may ignore this parameter and start
 *        immediately (or at any other time), if this feature is not
 *        supported.
 * @relative_rssi_set: Indicates whether @relative_rssi is set or not.
 * @relative_rssi: Relative RSSI threshold in dB to restrict scan result
 *        reporting in connected state to cases where a matching BSS is determined
 *        to have better or slightly worse RSSI than the current connected BSS.
 *        The relative RSSI threshold values are ignored in disconnected state.
 * @rssi_adjust: delta dB of RSSI preference to be given to the BSSs that belong
 *        to the specified band while deciding whether a better BSS is reported
 *        using @relative_rssi. If delta is a negative number, the BSSs that
 *        belong to the specified band will be penalized by delta dB in relative
 *        comparisons.
 */
struct cfg80211_sched_scan_request {
        u64 reqid;
        struct cfg80211_ssid *ssids;
        int n_ssids;
        u32 n_channels;         /* element count of the trailing channels[] */
        const u8 *ie;
        size_t ie_len;
        u32 flags;
        struct cfg80211_match_set *match_sets;
        int n_match_sets;
        s32 min_rssi_thold;
        u32 delay;              /* seconds before the first scan cycle */
        struct cfg80211_sched_scan_plan *scan_plans;
        int n_scan_plans;

        /* MAC randomisation: mask bits that are 0 are randomised */
        u8 mac_addr[ETH_ALEN] __aligned(2);
        u8 mac_addr_mask[ETH_ALEN] __aligned(2);

        bool relative_rssi_set; /* relative_rssi is only meaningful if set */
        s8 relative_rssi;
        struct cfg80211_bss_select_adjust rssi_adjust;

        /* internal */
        struct wiphy *wiphy;
        struct net_device *dev;
        unsigned long scan_start;
        bool report_results;
        struct rcu_head rcu_head;
        u32 owner_nlportid;
        bool nl_owner_dead;
        struct list_head list;

        /* keep last: flexible array, counted by n_channels */
        struct ieee80211_channel *channels[] __counted_by(n_channels);
};

/**
 * enum cfg80211_signal_type - how a signal strength value is expressed
 *
 * @CFG80211_SIGNAL_TYPE_NONE: no signal strength information available
 * @CFG80211_SIGNAL_TYPE_MBM: signal strength in mBm (100*dBm)
 * @CFG80211_SIGNAL_TYPE_UNSPEC: signal strength, increasing from 0 through 100
 */
enum cfg80211_signal_type {
        CFG80211_SIGNAL_TYPE_NONE       = 0,
        CFG80211_SIGNAL_TYPE_MBM        = 1,
        CFG80211_SIGNAL_TYPE_UNSPEC     = 2,
};

/**
 * struct cfg80211_inform_bss - BSS inform data
 * @chan: channel the frame was received on
 * @signal: signal strength value, interpreted according to the wiphy's
 *        signal type
 * @boottime_ns: timestamp (CLOCK_BOOTTIME) when the information was
 *        received; should match the time when the frame was actually
 *        received by the device (not just by the host, in case it was
 *        buffered on the device) and be accurate to about 10ms.
 *        If the frame isn't buffered, just passing the return value of
 *        ktime_get_boottime_ns() is likely appropriate.
 * @parent_tsf: the time at the start of reception of the first octet of the
 *        timestamp field of the frame. The time is the TSF of the BSS
 *        specified by @parent_bssid.
 * @parent_bssid: the BSS according to which @parent_tsf is set. This is set
 *        to the BSS that requested the scan in which the beacon/probe was
 *        received.
 * @chains: bitmask for filled values in @chain_signal.
 * @chain_signal: per-chain signal strength of last received BSS in dBm.
 * @restrict_use: restrict usage; if not set, assume @use_for is
 *        %NL80211_BSS_USE_FOR_NORMAL.
 * @use_for: bitmap of possible usage for this BSS, see
 *        &enum nl80211_bss_use_for
 * @cannot_use_reasons: the reasons (bitmap) for not being able to connect,
 *        if @restrict_use is set and @use_for is zero (empty); may be 0 for
 *        unspecified reasons; see &enum nl80211_bss_cannot_use_reasons
 * @drv_data: Data to be passed through to @inform_bss
 */
struct cfg80211_inform_bss {
        struct ieee80211_channel *chan;
        s32 signal;
        u64 boottime_ns;
        u64 parent_tsf;
        u8 parent_bssid[ETH_ALEN] __aligned(2);
        u8 chains;
        s8 chain_signal[IEEE80211_MAX_CHAINS];

        /* 1 + 7 bits: both bit-fields are declared back to back as u8 */
        u8 restrict_use:1;
        u8 use_for:7;
        u8 cannot_use_reasons;

        void *drv_data;
};

/**
 * struct cfg80211_bss_ies - BSS entry IE data
 * @tsf: TSF contained in the frame that carried these IEs
 * @rcu_head: internal use, for freeing
 * @len: length of the IEs stored in @data
 * @from_beacon: these IEs are known to come from a beacon
 * @data: IE data
 */
struct cfg80211_bss_ies {
        u64 tsf;
        struct rcu_head rcu_head;
        int len;
        bool from_beacon;
        /* flexible array member, must stay last */
        u8 data[];
};

/**
 * struct cfg80211_bss - BSS description
 *
 * This structure describes a BSS (which may also be a mesh network)
 * for use in scan results and similar.
 *
 * @channel: channel this BSS is on
 * @bssid: BSSID of the BSS
 * @beacon_interval: the beacon interval as from the frame
 * @capability: the capability field in host byte order
 * @ies: the information elements (Note that there is no guarantee that these
 *        are well-formed!); this is a pointer to either the beacon_ies or
 *        proberesp_ies depending on whether Probe Response frame has been
 *        received. It is always non-%NULL.
 * @beacon_ies: the information elements from the last Beacon frame
 *        (implementation note: if @hidden_beacon_bss is set this struct doesn't
 *        own the beacon_ies, but they're just pointers to the ones from the
 *        @hidden_beacon_bss struct)
 * @proberesp_ies: the information elements from the last Probe Response frame
 * @proberesp_ecsa_stuck: ECSA element is stuck in the Probe Response frame,
 *        cannot rely on it having valid data
 * @hidden_beacon_bss: in case this BSS struct represents a probe response from
 *        a BSS that hides the SSID in its beacon, this points to the BSS struct
 *        that holds the beacon data. @beacon_ies is still valid, of course, and
 *        points to the same data as hidden_beacon_bss->beacon_ies in that case.
 * @transmitted_bss: pointer to the transmitted BSS, if this is a
 *        non-transmitted one (multi-BSSID support)
 * @nontrans_list: list of non-transmitted BSS, if this is a transmitted one
 *        (multi-BSSID support)
 * @signal: signal strength value (type depends on the wiphy's signal_type)
 * @chains: bitmask for filled values in @chain_signal.
 * @chain_signal: per-chain signal strength of last received BSS in dBm.
 * @bssid_index: index in the multiple BSS set
 * @max_bssid_indicator: max number of members in the BSS set
 * @use_for: bitmap of possible usage for this BSS, see
 *        &enum nl80211_bss_use_for
 * @cannot_use_reasons: the reasons (bitmap) for not being able to connect,
 *        if restrict_use is set and @use_for is zero (empty); may be 0 for
 *        unspecified reasons; see &enum nl80211_bss_cannot_use_reasons.
 *        Note that restrict_use is not a member of this struct; it is a
 *        member of &struct cfg80211_inform_bss.
 * @priv: private area for driver use, has at least wiphy->bss_priv_size bytes
 */
struct cfg80211_bss {
        struct ieee80211_channel *channel;

        /* RCU-annotated IE pointers; see @ies/@beacon_ies/@proberesp_ies */
        const struct cfg80211_bss_ies __rcu *ies;
        const struct cfg80211_bss_ies __rcu *beacon_ies;
        const struct cfg80211_bss_ies __rcu *proberesp_ies;

        struct cfg80211_bss *hidden_beacon_bss;
        struct cfg80211_bss *transmitted_bss;
        struct list_head nontrans_list;

        s32 signal;

        u16 beacon_interval;
        u16 capability;

        u8 bssid[ETH_ALEN];
        u8 chains;
        s8 chain_signal[IEEE80211_MAX_CHAINS];

        u8 proberesp_ecsa_stuck:1;

        u8 bssid_index;
        u8 max_bssid_indicator;

        u8 use_for;
        u8 cannot_use_reasons;

        /* flexible driver-private area, aligned to pointer size; keep last */
        u8 priv[] __aligned(sizeof(void *));
};

/**
 * ieee80211_bss_get_elem - find element with given ID
 * @bss: the bss to search
 * @id: the element ID
 *
 * Note that the return value is an RCU-protected pointer, so
 * rcu_read_lock() must be held when calling this function.
 * Return: the element with the given ID, or %NULL if not found.
 */
const struct element *ieee80211_bss_get_elem(struct cfg80211_bss *bss, u8 id);

/**
 * ieee80211_bss_get_ie - find IE with given ID
 * @bss: the bss to search
 * @id: the element ID
 *
 * Byte-pointer flavour of ieee80211_bss_get_elem(): the element pointer
 * returned by that function is handed back as a const u8 pointer.
 *
 * Note that the return value is an RCU-protected pointer, so
 * rcu_read_lock() must be held when calling this function.
 * Return: %NULL if not found.
 */
static inline const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 id)
{
        const struct element *elem = ieee80211_bss_get_elem(bss, id);

        return (const void *)elem;
}


/**
 * struct cfg80211_auth_request - Authentication request data
 *
 * This structure provides information needed to complete IEEE 802.11
 * authentication.
 *
 * @bss: The BSS to authenticate with, the callee must obtain a reference
 *        to it if it needs to keep it.
 * @auth_type: Authentication type (algorithm)
 * @ie: Extra IEs to add to Authentication frame or %NULL
 * @ie_len: Length of ie buffer in octets
 * @key_len: length of WEP key for shared key authentication
 * @key_idx: index of WEP key for shared key authentication
 * @key: WEP key for shared key authentication
 * @auth_data: Fields and elements in Authentication frames. This contains
 *        the authentication frame body (non-IE and IE data), excluding the
 *        Authentication algorithm number, i.e., starting at the Authentication
 *        transaction sequence number field.
 * @auth_data_len: Length of auth_data buffer in octets
 * @link_id: if >= 0, indicates authentication should be done as an MLD,
 *        the interface address is included as the MLD address and the
 *        necessary link (with the given link_id) will be created (and
 *        given an MLD address) by the driver
 * @ap_mld_addr: AP MLD address in case of authentication request with
 *        an AP MLD, valid iff @link_id >= 0
 */
struct cfg80211_auth_request {
        struct cfg80211_bss *bss;
        const u8 *ie;
        size_t ie_len;
        enum nl80211_auth_type auth_type;
        /* WEP key material, used for shared key authentication */
        const u8 *key;
        u8 key_len;
        s8 key_idx;
        const u8 *auth_data;
        size_t auth_data_len;
        /* signed: a negative link_id means this is not an MLD request */
        s8 link_id;
        const u8 *ap_mld_addr;
};

/**
 * struct cfg80211_assoc_link - per-link information for MLO association
 * @bss: the BSS pointer, see also &struct cfg80211_assoc_request::bss;
 *        if this is %NULL for a link, that link is not requested
 * @elems: extra elements for the per-STA profile for this link
 * @elems_len: length of the elements
 * @disabled: If set this link should be included during association etc. but it
 *        should not be used until enabled by the AP MLD.
 * @error: per-link error code, must be <= 0. If there is an error, then the
 *        operation as a whole must fail.
 */
struct cfg80211_assoc_link {
        struct cfg80211_bss *bss;       /* NULL: link not requested */
        const u8 *elems;
        size_t elems_len;
        bool disabled;
        int error;                      /* must be <= 0 */
};

/**
 * enum cfg80211_assoc_req_flags - Over-ride default behaviour in association.
 *
 * Every value occupies its own bit, so several flags can be combined in a
 * single flags word.
 *
 * @ASSOC_REQ_DISABLE_HT: Disable HT (802.11n)
 * @ASSOC_REQ_DISABLE_VHT: Disable VHT
 * @ASSOC_REQ_USE_RRM: Declare RRM capability in this association
 * @CONNECT_REQ_EXTERNAL_AUTH_SUPPORT: User space indicates external
 *        authentication capability. Drivers can offload authentication to
 *        userspace if this flag is set. Only applicable for cfg80211_connect()
 *        request (connect callback).
 * @ASSOC_REQ_DISABLE_HE: Disable HE
 * @ASSOC_REQ_DISABLE_EHT: Disable EHT
 * @CONNECT_REQ_MLO_SUPPORT: Userspace indicates support for handling MLD links.
 *        Drivers shall disable MLO features for the current association if this
 *        flag is not set.
 * @ASSOC_REQ_SPP_AMSDU: SPP A-MSDUs will be used on this connection (if any)
 */
enum cfg80211_assoc_req_flags {
        ASSOC_REQ_DISABLE_HT                        = BIT(0),
        ASSOC_REQ_DISABLE_VHT                        = BIT(1),
        ASSOC_REQ_USE_RRM                        = BIT(2),
        CONNECT_REQ_EXTERNAL_AUTH_SUPPORT        = BIT(3),
        ASSOC_REQ_DISABLE_HE                        = BIT(4),
        ASSOC_REQ_DISABLE_EHT                        = BIT(5),
        CONNECT_REQ_MLO_SUPPORT                        = BIT(6),
        ASSOC_REQ_SPP_AMSDU                        = BIT(7),
};

/**
 * struct cfg80211_assoc_request - (Re)Association request data
 *
 * This structure provides information needed to complete IEEE 802.11
 * (re)association.
 * @bss: The BSS to associate with. If the call is successful the driver is
 *        given a reference that it must give back to cfg80211_send_rx_assoc()
 *        or to cfg80211_assoc_timeout(). To ensure proper refcounting, new
 *        association requests while already associating must be rejected.
 *        This also applies to the @links.bss parameter, which is used instead
 *        of this one (it is %NULL) for MLO associations.
 * @ie: Extra IEs to add to (Re)Association Request frame or %NULL
 * @ie_len: Length of ie buffer in octets
 * @use_mfp: Use management frame protection (IEEE 802.11w) in this association
 * @crypto: crypto settings
 * @prev_bssid: previous BSSID, if not %NULL use reassociate frame. This is used
 *        to indicate a request to reassociate within the ESS instead of a
 *        request to do the initial association with the ESS. When included,
 *        this is set to the BSSID of the current association, i.e., to the
 *        value that is included in the Current AP address field of the
 *        Reassociation Request frame.
 * @flags: See &enum cfg80211_assoc_req_flags
 * @ht_capa: HT Capabilities over-rides.  Values set in ht_capa_mask
 *        will be used in ht_capa.  Un-supported values will be ignored.
 * @ht_capa_mask: The bits of ht_capa which are to be used.
 * @vht_capa: VHT capability override
 * @vht_capa_mask: VHT capability mask indicating which fields to use
 * @fils_kek: FILS KEK for protecting (Re)Association Request/Response frame or
 *        %NULL if FILS is not used.
 * @fils_kek_len: Length of fils_kek in octets
 * @fils_nonces: FILS nonces (part of AAD) for protecting (Re)Association
 *        Request/Response frame or %NULL if FILS is not used. This field starts
 *        with 16 octets of STA Nonce followed by 16 octets of AP Nonce.
 * @s1g_capa: S1G capability override
 * @s1g_capa_mask: S1G capability override mask
 * @links: per-link information for MLO connections
 * @link_id: >= 0 for MLO connections, where links are given, and indicates
 *        the link on which the association request should be sent
 * @ap_mld_addr: AP MLD address in case of MLO association request,
 *        valid iff @link_id >= 0
 */
struct cfg80211_assoc_request {
        struct cfg80211_bss *bss;
        const u8 *ie;
        const u8 *prev_bssid;
        size_t ie_len;
        struct cfg80211_crypto_settings crypto;
        bool use_mfp;
        u32 flags;

        /* capability overrides, each paired with its mask */
        struct ieee80211_ht_cap ht_capa;
        struct ieee80211_ht_cap ht_capa_mask;
        struct ieee80211_vht_cap vht_capa;
        struct ieee80211_vht_cap vht_capa_mask;

        /* FILS material, %NULL when FILS is not used */
        const u8 *fils_kek;
        size_t fils_kek_len;
        const u8 *fils_nonces;

        struct ieee80211_s1g_cap s1g_capa;
        struct ieee80211_s1g_cap s1g_capa_mask;

        /* MLO: per-link data, AP MLD address and the link to send on */
        struct cfg80211_assoc_link links[IEEE80211_MLD_MAX_NUM_LINKS];
        const u8 *ap_mld_addr;
        s8 link_id;
};

/**
 * struct cfg80211_deauth_request - Deauthentication request data
 *
 * This structure provides information needed to complete IEEE 802.11
 * deauthentication.
 *
 * @bssid: the BSSID or AP MLD address to deauthenticate from
 * @ie: Extra IEs to add to Deauthentication frame or %NULL
 * @ie_len: Length of ie buffer in octets
 * @reason_code: The reason code for the deauthentication
 * @local_state_change: if set, change local state only and
 *        do not send a deauth frame
 */
struct cfg80211_deauth_request {
        const u8 *bssid;
        const u8 *ie;
        size_t ie_len;
        u16 reason_code;
        bool local_state_change;        /* true: no frame goes on the air */
};

/**
 * struct cfg80211_disassoc_request - Disassociation request data
 *
 * This structure provides information needed to complete IEEE 802.11
 * disassociation.
 *
 * @ap_addr: the BSSID or AP MLD address to disassociate from
 * @ie: Extra IEs to add to Disassociation frame or %NULL
 * @ie_len: Length of ie buffer in octets
 * @reason_code: The reason code for the disassociation
 * @local_state_change: This is a request for a local state only, i.e., no
 *        Disassociation frame is to be transmitted.
 */
struct cfg80211_disassoc_request {
        const u8 *ap_addr;
        const u8 *ie;
        size_t ie_len;
        u16 reason_code;
        bool local_state_change;        /* true: no frame is transmitted */
};

/**
 * struct cfg80211_ibss_params - IBSS parameters
 *
 * This structure defines the IBSS parameters for the join_ibss()
 * method.
 *
 * @ssid: The SSID, will always be non-null.
 * @ssid_len: The length of the SSID, will always be non-zero.
 * @bssid: Fixed BSSID requested, may be %NULL; if set do not
 *        search for IBSSs with a different BSSID.
 * @chandef: defines the channel to use if no other IBSS to join can be found
 * @channel_fixed: The channel should be fixed -- do not search for
 *        IBSSs to join on other channels.
 * @ie: information element(s) to include in the beacon
 * @ie_len: length of @ie
 * @beacon_interval: beacon interval to use
 * @privacy: this is a protected network, keys will be configured
 *        after joining
 * @control_port: whether user space controls IEEE 802.1X port, i.e.,
 *        sets/clears %NL80211_STA_FLAG_AUTHORIZED. If true, the driver is
 *        required to assume that the port is unauthorized until authorized by
 *        user space. Otherwise, port is marked authorized by default.
 * @control_port_over_nl80211: TRUE if userspace expects to exchange control
 *        port frames over NL80211 instead of the network interface.
 * @userspace_handles_dfs: whether user space controls DFS operation, i.e.
 *        changes the channel when a radar is detected. This is required
 *        to operate on DFS channels.
 * @basic_rates: bitmap of basic rates to use when creating the IBSS
 * @mcast_rate: per-band multicast rate index + 1 (0: disabled)
 * @ht_capa: HT Capabilities over-rides.  Values set in ht_capa_mask
 *        will be used in ht_capa.  Un-supported values will be ignored.
 * @ht_capa_mask: The bits of ht_capa which are to be used.
 * @wep_keys: static WEP keys, if not NULL points to an array of
 *        CFG80211_MAX_WEP_KEYS WEP keys
 * @wep_tx_key: key index (0..3) of the default TX static WEP key
 */
struct cfg80211_ibss_params {
        const u8 *ssid;
        const u8 *bssid;
        struct cfg80211_chan_def chandef;
        const u8 *ie;
        /* both lengths are stored as u8 */
        u8 ssid_len;
        u8 ie_len;
        u16 beacon_interval;
        u32 basic_rates;

        bool channel_fixed;
        bool privacy;
        bool control_port;
        bool control_port_over_nl80211;
        bool userspace_handles_dfs;

        int mcast_rate[NUM_NL80211_BANDS];      /* rate index + 1, 0 = off */
        struct ieee80211_ht_cap ht_capa;
        struct ieee80211_ht_cap ht_capa_mask;
        struct key_params *wep_keys;
        int wep_tx_key;
};

/**
 * struct cfg80211_bss_selection - connection parameters for BSS selection.
 *
 * @behaviour: requested BSS selection behaviour.
 * @param: parameters for the requested behaviour.
 * @param.band_pref: preferred band for %NL80211_BSS_SELECT_ATTR_BAND_PREF.
 * @param.adjust: parameters for %NL80211_BSS_SELECT_ATTR_RSSI_ADJUST.
 */
struct cfg80211_bss_selection {
        enum nl80211_bss_select_attr behaviour;
        /* union: which member applies depends on the selected behaviour */
        union {
                enum nl80211_band band_pref;
                struct cfg80211_bss_select_adjust adjust;
        } param;
};

/**
 * struct cfg80211_connect_params - Connection parameters
 *
 * This structure provides information needed to complete IEEE 802.11
 * authentication and association.
 *
 * @channel: The channel to use or %NULL if not specified (auto-select based
 *        on scan results)
 * @channel_hint: The channel of the recommended BSS for initial connection or
 *        %NULL if not specified
 * @bssid: The AP BSSID or %NULL if not specified (auto-select based on scan
 *        results)
 * @bssid_hint: The recommended AP BSSID for initial connection to the BSS or
 *        %NULL if not specified. Unlike the @bssid parameter, the driver is
 *        allowed to ignore this @bssid_hint if it has knowledge of a better BSS
 *        to use.
 * @ssid: SSID
 * @ssid_len: Length of ssid in octets
 * @auth_type: Authentication type (algorithm)
 * @ie: IEs for association request
 * @ie_len: Length of @ie in octets
 * @privacy: indicates whether privacy-enabled APs should be used
 * @mfp: indicate whether management frame protection is used
 * @crypto: crypto settings
 * @key_len: length of WEP key for shared key authentication
 * @key_idx: index of WEP key for shared key authentication
 * @key: WEP key for shared key authentication
 * @flags: See &enum cfg80211_assoc_req_flags
 * @bg_scan_period: Background scan period in seconds
 *        or -1 to indicate that default value is to be used.
 * @ht_capa: HT Capabilities over-rides.  Values set in ht_capa_mask
 *        will be used in ht_capa.  Un-supported values will be ignored.
 * @ht_capa_mask: The bits of ht_capa which are to be used.
 * @vht_capa: VHT Capability overrides
 * @vht_capa_mask: The bits of vht_capa which are to be used.
 * @pbss: if set, connect to a PCP instead of AP. Valid for DMG
 *        networks.
 * @bss_select: criteria to be used for BSS selection.
 * @prev_bssid: previous BSSID, if not %NULL use reassociate frame. This is used
 *        to indicate a request to reassociate within the ESS instead of a
 *        request to do the initial association with the ESS. When included,
 *        this is set to the BSSID of the current association, i.e., to the
 *        value that is included in the Current AP address field of the
 *        Reassociation Request frame.
 * @fils_erp_username: EAP re-authentication protocol (ERP) username part of the
 *        NAI or %NULL if not specified. This is used to construct FILS wrapped
 *        data IE.
 * @fils_erp_username_len: Length of @fils_erp_username in octets.
 * @fils_erp_realm: EAP re-authentication protocol (ERP) realm part of NAI or
 *        %NULL if not specified. This specifies the domain name of ER server and
 *        is used to construct FILS wrapped data IE.
 * @fils_erp_realm_len: Length of @fils_erp_realm in octets.
 * @fils_erp_next_seq_num: The next sequence number to use in the FILS ERP
 *        messages. This is also used to construct FILS wrapped data IE.
 * @fils_erp_rrk: ERP re-authentication Root Key (rRK) used to derive additional
 *        keys in FILS or %NULL if not specified.
 * @fils_erp_rrk_len: Length of @fils_erp_rrk in octets.
 * @want_1x: indicates user-space supports and wants to use 802.1X driver
 *        offload of 4-way handshake.
 * @edmg: define the EDMG channels.
 *        This may specify multiple channels and bonding options for the driver
 *        to choose from, based on BSS configuration.
 */
struct cfg80211_connect_params {
        /* target selection; every pointer here may be %NULL (unspecified) */
        struct ieee80211_channel *channel;
        struct ieee80211_channel *channel_hint;
        const u8 *bssid;
        const u8 *bssid_hint;

        const u8 *ssid;
        size_t ssid_len;
        enum nl80211_auth_type auth_type;
        const u8 *ie;
        size_t ie_len;
        bool privacy;
        enum nl80211_mfp mfp;
        struct cfg80211_crypto_settings crypto;

        /* WEP key for shared key authentication */
        const u8 *key;
        u8 key_len;
        u8 key_idx;

        u32 flags;
        int bg_scan_period;     /* seconds, -1 selects the default */
        struct ieee80211_ht_cap ht_capa;
        struct ieee80211_ht_cap ht_capa_mask;
        struct ieee80211_vht_cap vht_capa;
        struct ieee80211_vht_cap vht_capa_mask;
        bool pbss;
        struct cfg80211_bss_selection bss_select;
        const u8 *prev_bssid;

        /* FILS ERP parameters; pointers are %NULL if not specified */
        const u8 *fils_erp_username;
        size_t fils_erp_username_len;
        const u8 *fils_erp_realm;
        size_t fils_erp_realm_len;
        u16 fils_erp_next_seq_num;
        const u8 *fils_erp_rrk;
        size_t fils_erp_rrk_len;

        bool want_1x;
        struct ieee80211_edmg edmg;
};

/**
 * enum cfg80211_connect_params_changed - Connection parameters being updated
 *
 * This enum provides information of all connect parameters that
 * have to be updated as part of update_connect_params() call.
 * Every value occupies its own bit, so several can be combined.
 *
 * @UPDATE_ASSOC_IES: Indicates whether association request IEs are updated
 * @UPDATE_FILS_ERP_INFO: Indicates that FILS connection parameters (realm,
 *        username, erp sequence number and rrk) are updated
 * @UPDATE_AUTH_TYPE: Indicates that authentication type is updated
 */
enum cfg80211_connect_params_changed {
        UPDATE_ASSOC_IES                = BIT(0),
        UPDATE_FILS_ERP_INFO                = BIT(1),
        UPDATE_AUTH_TYPE                = BIT(2),
};

/**
 * enum wiphy_params_flags - set_wiphy_params bitfield values
 *
 * One bit per parameter; the values are consecutive powers of two starting
 * at bit 0.
 *
 * @WIPHY_PARAM_RETRY_SHORT: wiphy->retry_short has changed
 * @WIPHY_PARAM_RETRY_LONG: wiphy->retry_long has changed
 * @WIPHY_PARAM_FRAG_THRESHOLD: wiphy->frag_threshold has changed
 * @WIPHY_PARAM_RTS_THRESHOLD: wiphy->rts_threshold has changed
 * @WIPHY_PARAM_COVERAGE_CLASS: coverage class changed
 * @WIPHY_PARAM_DYN_ACK: dynack has been enabled
 * @WIPHY_PARAM_TXQ_LIMIT: TXQ packet limit has been changed
 * @WIPHY_PARAM_TXQ_MEMORY_LIMIT: TXQ memory limit has been changed
 * @WIPHY_PARAM_TXQ_QUANTUM: TXQ scheduler quantum
 */
enum wiphy_params_flags {
        WIPHY_PARAM_RETRY_SHORT         = 0x001,        /* bit 0 */
        WIPHY_PARAM_RETRY_LONG          = 0x002,        /* bit 1 */
        WIPHY_PARAM_FRAG_THRESHOLD      = 0x004,        /* bit 2 */
        WIPHY_PARAM_RTS_THRESHOLD       = 0x008,        /* bit 3 */
        WIPHY_PARAM_COVERAGE_CLASS      = 0x010,        /* bit 4 */
        WIPHY_PARAM_DYN_ACK             = 0x020,        /* bit 5 */
        WIPHY_PARAM_TXQ_LIMIT           = 0x040,        /* bit 6 */
        WIPHY_PARAM_TXQ_MEMORY_LIMIT    = 0x080,        /* bit 7 */
        WIPHY_PARAM_TXQ_QUANTUM         = 0x100,        /* bit 8 */
};

/*
 * Default airtime weight.
 * NOTE(review): the users of this value are not visible in this part of the
 * file; confirm what the weight is applied to.
 */
#define IEEE80211_DEFAULT_AIRTIME_WEIGHT        256

/*
 * The per TXQ device queue limit in airtime.
 * NOTE(review): _L and _H presumably name the lower and the higher limit
 * (the threshold comment below speaks of switching "to lower queue limit");
 * the unit of these numbers is not stated here - confirm before relying on it.
 */
#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L        5000
#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H        12000

/* The per interface airtime threshold to switch to lower queue limit */
#define IEEE80211_AQL_THRESHOLD                        24000

/**
 * struct cfg80211_pmksa - PMK Security Association
 *
 * This structure is passed to the set/del_pmksa() method for PMKSA
 * caching.
 *
 * @bssid: The AP's BSSID (may be %NULL).
 * @pmkid: The identifier to refer to a PMKSA.
 * @pmk: The PMK for the PMKSA identified by @pmkid. This is used for key
 *        derivation by a FILS STA. Otherwise, %NULL.
 * @pmk_len: Length of the @pmk. The length of @pmk can differ depending on
 *        the hash algorithm used to generate this.
 * @ssid: SSID to specify the ESS within which a PMKSA is valid when using FILS
 *        cache identifier (may be %NULL).
 * @ssid_len: Length of the @ssid in octets.
 * @cache_id: 2-octet cache identifier advertized by a FILS AP identifying the
 *        scope of PMKSA. This is valid only if @ssid_len is non-zero (may be
 *        %NULL).
 * @pmk_lifetime: Maximum lifetime for PMKSA in seconds
 *        (dot11RSNAConfigPMKLifetime) or 0 if not specified.
 *        The configured PMKSA must not be used for PMKSA caching after
 *        expiration and any keys derived from this PMK become invalid on
 *        expiration, i.e., the current association must be dropped if the PMK
 *        used for it expires.
 * @pmk_reauth_threshold: Threshold time for reauthentication (percentage of
 *        PMK lifetime, dot11RSNAConfigPMKReauthThreshold) or 0 if not specified.
 *        Drivers are expected to trigger a full authentication instead of using
 *        this PMKSA for caching when reassociating to a new BSS after this
 *        threshold to generate a new PMK before the current one expires.
 */
struct cfg80211_pmksa {
        const u8 *bssid;
        const u8 *pmkid;
        const u8 *pmk;
        size_t pmk_len;
        const u8 *ssid;
        size_t ssid_len;
        const u8 *cache_id;             /* only valid if ssid_len != 0 */
        u32 pmk_lifetime;               /* seconds, 0 = not specified */
        u8 pmk_reauth_threshold;        /* percent, 0 = not specified */
};

/**
 * struct cfg80211_pkt_pattern - packet pattern
 * @mask: bitmask where to match pattern and where to ignore bytes,
 *        one bit per byte, in same format as nl80211
 * @pattern: bytes to match where bitmask is 1
 * @pattern_len: length of pattern (in bytes)
 * @pkt_offset: packet offset (in bytes)
 *
 * Internal note: @mask and @pattern are allocated in one chunk of
 * memory, free @mask only!
 */
struct cfg80211_pkt_pattern {
        const u8 *mask;         /* owns the shared allocation: free this one */
        const u8 *pattern;      /* same chunk as mask: never free separately */
        int pattern_len;
        int pkt_offset;
};

/**
 * struct cfg80211_wowlan_tcp - TCP connection parameters
 *
 * @sock: (internal) socket for source port allocation
 * @src: source IP address
 * @dst: destination IP address
 * @dst_mac: destination MAC address
 * @src_port: source port
 * @dst_port: destination port
 * @payload_len: data payload length
 * @payload: data payload buffer
 * @payload_seq: payload sequence stamping configuration
 * @data_interval: interval at which to send data packets
 * @wake_len: wakeup payload match length
 * @wake_data: wakeup payload match data
 * @wake_mask: wakeup payload match mask
 * @tokens_size: length of the tokens buffer
 * @payload_tok: payload token usage configuration
 */
struct cfg80211_wowlan_tcp {
        struct socket *sock;

        /* connection endpoints */
        __be32 src;
        __be32 dst;
        u16 src_port;
        u16 dst_port;
        u8 dst_mac[ETH_ALEN];

        /* data payload */
        int payload_len;
        const u8 *payload;
        struct nl80211_wowlan_tcp_data_seq payload_seq;
        u32 data_interval;

        /* wakeup payload matching */
        u32 wake_len;
        const u8 *wake_data;
        const u8 *wake_mask;

        u32 tokens_size;
        /* must be last, variable member */
        struct nl80211_wowlan_tcp_data_token payload_tok;
};

/**
 * struct cfg80211_wowlan - Wake on Wireless-LAN support info
 *
 * This structure defines the enabled WoWLAN triggers for the device.
 * @any: wake up on any activity -- special trigger if device continues
 *        operating as normal during suspend
 * @disconnect: wake up if getting disconnected
 * @magic_pkt: wake up on receiving magic packet
 * @patterns: wake up on receiving packet matching a pattern
 * @n_patterns: number of patterns
 * @gtk_rekey_failure: wake up on GTK rekey failure
 * @eap_identity_req: wake up on EAP identity request packet
 * @four_way_handshake: wake up on 4-way handshake
 * @rfkill_release: wake up when rfkill is released
 * @tcp: TCP connection establishment/wakeup parameters, see nl80211.h.
 *        NULL if not configured.
 * @nd_config: configuration for the scan to be used for net detect wake.
 */
struct cfg80211_wowlan {
        /* simple on/off triggers */
        bool any;
        bool disconnect;
        bool magic_pkt;
        bool gtk_rekey_failure;
        bool eap_identity_req;
        bool four_way_handshake;
        bool rfkill_release;

        /* triggers that carry their own configuration */
        struct cfg80211_pkt_pattern *patterns;
        struct cfg80211_wowlan_tcp *tcp;
        int n_patterns;
        struct cfg80211_sched_scan_request *nd_config;
};

/**
 * struct cfg80211_coalesce_rules - Coalesce rule parameters
 *
 * This structure defines coalesce rule for the device.
 * @delay: maximum coalescing delay in msecs.
 * @condition: condition for packet coalescence.
 *        see &enum nl80211_coalesce_condition.
 * @patterns: array of packet patterns
 * @n_patterns: number of patterns in @patterns
 */
struct cfg80211_coalesce_rules {
        int delay;                      /* msecs */
        enum nl80211_coalesce_condition condition;
        struct cfg80211_pkt_pattern *patterns;
        int n_patterns;
};

/**
 * struct cfg80211_coalesce - Packet coalescing settings
 *
 * This structure defines coalescing settings.
 * @rules: array of coalesce rules
 * @n_rules: number of rules in @rules
 */
struct cfg80211_coalesce {
        int n_rules;
        /* flexible array, counted by n_rules; must stay last */
        struct cfg80211_coalesce_rules rules[] __counted_by(n_rules);
};

/**
 * struct cfg80211_wowlan_nd_match - information about the match
 *
 * @ssid: SSID of the match that triggered the wake up
 * @n_channels: Number of channels where the match occurred.  This
 *        value may be zero if the driver can't report the channels.
 * @channels: center frequencies of the channels where a match
 *        occurred (in MHz)
 */
struct cfg80211_wowlan_nd_match {
        struct cfg80211_ssid ssid;
        int n_channels;
        /* flexible array of frequencies (MHz), counted by n_channels */
        u32 channels[] __counted_by(n_channels);
};

/**
 * struct cfg80211_wowlan_nd_info - net detect wake up information
 *
 * @n_matches: Number of match information instances provided in
 *        @matches.  This value may be zero if the driver can't provide
 *        match information.
 * @matches: Array of pointers to matches containing information about
 *        the matches that triggered the wake up.
 */
struct cfg80211_wowlan_nd_info {
        int n_matches;
        /* flexible array of pointers, counted by n_matches; must stay last */
        struct cfg80211_wowlan_nd_match *matches[] __counted_by(n_matches);
};

/**
 * struct cfg80211_wowlan_wakeup - wakeup report
 * @disconnect: wakeup was caused by a disconnection
 * @magic_pkt: wakeup was caused by reception of a magic packet
 * @gtk_rekey_failure: wakeup was caused by a GTK rekey failure
 * @eap_identity_req: wakeup was caused by an EAP identity request packet
 * @four_way_handshake: wakeup was caused by a 4-way handshake
 * @rfkill_release: wakeup was caused by rfkill being released
 * @packet_80211: Indicates which kind of frame @packet holds.  For
 *        pattern match, magic packet and other data frame triggers an
 *        802.3 frame should be reported, for disconnect due to deauth
 *        an 802.11 frame.
 * @tcp_match: a TCP wakeup packet was received
 * @tcp_connlost: the TCP connection was lost or failed to establish
 * @tcp_nomoretokens: the TCP data ran out of tokens
 * @unprot_deauth_disassoc: wakeup was caused by an unprotected deauth or
 *        disassoc frame (in MFP).
 * @pattern_idx: index of the pattern that caused the wakeup, -1 if the
 *        wakeup was not due to a pattern
 * @packet_present_len: length of the copied wakeup packet data
 * @packet_len: original length of the wakeup packet
 * @packet: The packet that caused the wakeup, if any.
 * @net_detect: if not %NULL, the wakeup was caused by net detect
 */
struct cfg80211_wowlan_wakeup {
        bool disconnect;
        bool magic_pkt;
        bool gtk_rekey_failure;
        bool eap_identity_req;
        bool four_way_handshake;
        bool rfkill_release;
        bool packet_80211;
        bool tcp_match;
        bool tcp_connlost;
        bool tcp_nomoretokens;
        bool unprot_deauth_disassoc;
        s32 pattern_idx;
        u32 packet_present_len;
        u32 packet_len;
        const void *packet;
        struct cfg80211_wowlan_nd_info *net_detect;
};

/**
 * struct cfg80211_gtk_rekey_data - rekey data
 * @kek: key encryption key, @kek_len bytes long
 * @kck: key confirmation key, @kck_len bytes long
 * @replay_ctr: replay counter, NL80211_REPLAY_CTR_LEN bytes long
 * @akm: akm (oui, id)
 * @kek_len: length of @kek
 * @kck_len: length of @kck
 */
struct cfg80211_gtk_rekey_data {
        const u8 *kek;
        const u8 *kck;
        const u8 *replay_ctr;
        u32 akm;
        u8 kek_len;
        u8 kck_len;
};

/**
 * struct cfg80211_update_ft_ies_params - FT IE Information
 *
 * This structure provides the information needed to update the fast
 * transition IEs.
 *
 * @md: The Mobility Domain ID, 2 Octet value
 * @ie: Fast Transition IEs
 * @ie_len: Length of @ie in octets
 */
struct cfg80211_update_ft_ies_params {
        u16 md;
        const u8 *ie;
        size_t ie_len;
};

/**
 * struct cfg80211_mgmt_tx_params - mgmt tx parameters
 *
 * This structure provides the information needed to transmit a mgmt frame.
 *
 * @chan: channel to use
 * @offchan: indicates whether off channel operation is required
 * @wait: duration for ROC
 * @buf: buffer to transmit
 * @len: length of @buf
 * @no_cck: don't use cck rates for this frame
 * @dont_wait_for_ack: tells the low level not to wait for an ack
 * @n_csa_offsets: length of the @csa_offsets array
 * @csa_offsets: array of all the csa offsets in the frame
 * @link_id: for MLO, the link ID to transmit on, -1 if not given; note
 *        that the link ID isn't validated (much), it's in range but the
 *        link might not exist (or be used by the receiver STA)
 */
struct cfg80211_mgmt_tx_params {
        struct ieee80211_channel *chan;
        bool offchan;
        unsigned int wait;
        const u8 *buf;
        size_t len;
        bool no_cck;
        bool dont_wait_for_ack;
        int n_csa_offsets;
        const u16 *csa_offsets;
        int link_id;
};

/**
 * struct cfg80211_dscp_exception - DSCP exception
 *
 * A single DSCP value that is mapped to a user priority outside of the
 * user priority range definition.
 *
 * @dscp: DSCP value that does not adhere to the user priority range definition
 * @up: user priority value to which the corresponding DSCP value belongs
 */
struct cfg80211_dscp_exception {
        u8 dscp;
        u8 up;
};

/**
 * struct cfg80211_dscp_range - DSCP range definition for user priority
 *
 * Both bounds of the range are inclusive.
 *
 * @low: lowest DSCP value of this user priority range, inclusive
 * @high: highest DSCP value of this user priority range, inclusive
 */
struct cfg80211_dscp_range {
        u8 low;
        u8 high;
};

/*
 * QoS Map Set element length defined in IEEE Std 802.11-2012, 8.4.2.97
 *
 * IEEE80211_QOS_MAP_MAX_EX is the maximum number of DSCP exceptions.
 * IEEE80211_QOS_MAP_LEN_MIN is the element length without any exception
 * (presumably the eight two-octet DSCP ranges - confirm against the
 * standard), and IEEE80211_QOS_MAP_LEN_MAX adds two octets for each of
 * the IEEE80211_QOS_MAP_MAX_EX possible exceptions.
 */
#define IEEE80211_QOS_MAP_MAX_EX        21
#define IEEE80211_QOS_MAP_LEN_MIN        16
#define IEEE80211_QOS_MAP_LEN_MAX \
        (IEEE80211_QOS_MAP_LEN_MIN + 2 * IEEE80211_QOS_MAP_MAX_EX)

/**
 * struct cfg80211_qos_map - QoS Map Information
 *
 * This struct defines the Interworking QoS map setting for DSCP values.
 *
 * @num_des: number of DSCP exceptions (0..21)
 * @dscp_exception: optionally, up to a maximum of 21
 *        (%IEEE80211_QOS_MAP_MAX_EX) DSCP exceptions from the user
 *        priority DSCP range definition
 * @up: DSCP range definition for a particular user priority
 */
struct cfg80211_qos_map {
        u8 num_des;
        struct cfg80211_dscp_exception dscp_exception[IEEE80211_QOS_MAP_MAX_EX];
        struct cfg80211_dscp_range up[8];
};

/**
 * struct cfg80211_nan_conf - NAN configuration
 *
 * This struct defines the NAN configuration parameters.
 *
 * @master_pref: master preference (1 - 255)
 * @bands: operating bands, a bitmap of &enum nl80211_band values.
 *        For instance, for NL80211_BAND_2GHZ, bit 0 would be set
 *        (i.e. BIT(NL80211_BAND_2GHZ)).
 */
struct cfg80211_nan_conf {
        u8 master_pref;
        u8 bands;
};

/**
 * enum cfg80211_nan_conf_changes - indicates changed fields in NAN
 * configuration
 *
 * Each value is a distinct bit, so several changes can be combined in
 * one bitmask.
 *
 * @CFG80211_NAN_CONF_CHANGED_PREF: master preference
 * @CFG80211_NAN_CONF_CHANGED_BANDS: operating bands
 */
enum cfg80211_nan_conf_changes {
        CFG80211_NAN_CONF_CHANGED_PREF = BIT(0),
        CFG80211_NAN_CONF_CHANGED_BANDS = BIT(1),
};

/**
 * struct cfg80211_nan_func_filter - a NAN function Rx / Tx filter
 *
 * @filter: the content of the filter
 * @len: the length of @filter
 */
struct cfg80211_nan_func_filter {
        const u8 *filter;
        u8 len;
};

/**
 * struct cfg80211_nan_func - a NAN function
 *
 * @type: &enum nl80211_nan_function_type
 * @service_id: the service ID of the function
 * @publish_type: &enum nl80211_nan_publish_type
 * @close_range: if true, the range should be limited. Threshold is
 *        implementation specific.
 * @publish_bcast: if true, the solicited publish should be broadcasted
 * @subscribe_active: if true, the subscribe is active
 * @followup_id: the instance ID for follow up
 * @followup_reqid: the requester instance ID for follow up
 * @followup_dest: MAC address of the recipient of the follow up
 * @ttl: time to live counter in DW.
 * @serv_spec_info: Service Specific Info
 * @serv_spec_info_len: Service Specific Info length
 * @srf_include: if true, SRF is inclusive
 * @srf_bf: Bloom Filter
 * @srf_bf_len: Bloom Filter length
 * @srf_bf_idx: Bloom Filter index
 * @srf_macs: SRF MAC addresses
 * @srf_num_macs: number of MAC addresses in SRF
 * @rx_filters: rx filters that are matched with corresponding peer's tx_filter
 * @tx_filters: filters that should be transmitted in the SDF.
 * @num_rx_filters: length of @rx_filters.
 * @num_tx_filters: length of @tx_filters.
 * @instance_id: driver allocated id of the function.
 * @cookie: unique NAN function identifier.
 */
struct cfg80211_nan_func {
        enum nl80211_nan_function_type type;
        u8 service_id[NL80211_NAN_FUNC_SERVICE_ID_LEN];
        u8 publish_type;
        bool close_range;
        bool publish_bcast;
        bool subscribe_active;
        u8 followup_id;
        u8 followup_reqid;
        struct mac_address followup_dest;
        u32 ttl;
        const u8 *serv_spec_info;
        u8 serv_spec_info_len;
        bool srf_include;
        const u8 *srf_bf;
        u8 srf_bf_len;
        u8 srf_bf_idx;
        struct mac_address *srf_macs;
        int srf_num_macs;
        struct cfg80211_nan_func_filter *rx_filters;
        struct cfg80211_nan_func_filter *tx_filters;
        u8 num_tx_filters;
        u8 num_rx_filters;
        u8 instance_id;
        u64 cookie;
};

/**
 * struct cfg80211_pmk_conf - PMK configuration
 *
 * @aa: authenticator address
 * @pmk_len: length of @pmk in bytes.
 * @pmk: the PMK material
 * @pmk_r0_name: PMK-R0 Name. %NULL if not applicable (i.e., the PMK
 *        is not PMK-R0). When @pmk_r0_name is not %NULL, the @pmk field
 *        holds PMK-R0.
 */
struct cfg80211_pmk_conf {
        const u8 *aa;
        u8 pmk_len;
        const u8 *pmk;
        const u8 *pmk_r0_name;
};

/**
 * struct cfg80211_external_auth_params - Trigger External authentication.
 *
 * Commonly used across the external auth request and event interfaces.
 *
 * @action: action type / trigger for external authentication. Only significant
 *        for the authentication request event interface (driver to user space).
 * @bssid: BSSID of the peer with which the authentication has
 *        to happen. Used by both the authentication request event and
 *        authentication response command interface.
 * @ssid: SSID of the AP.  Used by both the authentication request event and
 *        authentication response command interface.
 * @key_mgmt_suite: AKM suite of the respective authentication. Used by the
 *        authentication request event interface.
 * @status: status code, %WLAN_STATUS_SUCCESS for successful authentication,
 *        use %WLAN_STATUS_UNSPECIFIED_FAILURE if user space cannot give you
 *        the real status code for failures. Used only for the authentication
 *        response command interface (user space to driver).
 * @pmkid: The identifier used to refer to a PMKSA.
 * @mld_addr: MLD address of the peer. Used by the authentication request event
 *        interface. The driver indicates this to enable MLO during the
 *        authentication offload to user space. The driver shall look at the
 *        %NL80211_ATTR_MLO_SUPPORT flag capability in NL80211_CMD_CONNECT to
 *        know whether user space supports enabling MLO during the
 *        authentication offload.
 *        User space should use the address of the interface (on which the
 *        authentication request event was reported) as its own MLD address.
 *        User space and the driver should use MLD addresses in the RA, TA and
 *        BSSID fields of authentication frames sent or received via cfg80211.
 *        The driver translates the MLD addresses to/from link addresses based
 *        on the link chosen for the authentication.
 */
struct cfg80211_external_auth_params {
        enum nl80211_external_auth_action action;
        u8 bssid[ETH_ALEN] __aligned(2);
        struct cfg80211_ssid ssid;
        unsigned int key_mgmt_suite;
        u16 status;
        const u8 *pmkid;
        u8 mld_addr[ETH_ALEN] __aligned(2);
};

/**
 * struct cfg80211_ftm_responder_stats - FTM responder statistics
 *
 * @filled: bitmap of flags, using the bits of &enum nl80211_ftm_stats, that
 *        indicates which values in this struct are relevant
 * @success_num: number of FTM sessions in which all frames were successfully
 *        answered
 * @partial_num: number of FTM sessions in which part of the frames were
 *        successfully answered
 * @failed_num: number of failed FTM sessions
 * @asap_num: number of ASAP FTM sessions
 * @non_asap_num: number of non-ASAP FTM sessions
 * @total_duration_ms: total duration of all sessions - gives an indication
 *        of how much time the responder was busy
 * @unknown_triggers_num: number of unknown FTM triggers - triggers from
 *        initiators that didn't successfully finish the negotiation phase
 *        with the responder
 * @reschedule_requests_num: number of FTM reschedule requests - the initiator
 *        asks for a new scheduling although it already has a scheduled FTM
 *        slot
 * @out_of_window_triggers_num: total FTM triggers out of scheduled window
 */
struct cfg80211_ftm_responder_stats {
        u32 filled;
        u32 success_num;
        u32 partial_num;
        u32 failed_num;
        u32 asap_num;
        u32 non_asap_num;
        u64 total_duration_ms;
        u32 unknown_triggers_num;
        u32 reschedule_requests_num;
        u32 out_of_window_triggers_num;
};

/**
 * struct cfg80211_pmsr_ftm_result - FTM result
 * @lci: LCI data (may be %NULL)
 * @civicloc: civic location data (may be %NULL)
 * @lci_len: length of the LCI information (if present)
 * @civicloc_len: length of the civic location information (if present)
 * @failure_reason: a more precise reason than just "failure", given if
 *        this measurement failed (PMSR status is
 *        %NL80211_PMSR_STATUS_FAILURE)
 * @num_ftmr_attempts: number of FTM request frames transmitted
 * @num_ftmr_successes: number of FTM request frames acked
 * @burst_index: when reporting partial results, the index in
 *        [0 .. num_bursts-1] of the burst that is being reported
 * @busy_retry_time: if @failure_reason is
 *        %NL80211_PMSR_FTM_FAILURE_PEER_BUSY, fill this to indicate in
 *        how many seconds a retry is deemed possible by the responder
 * @num_bursts_exp: actual number of bursts exponent negotiated
 * @burst_duration: actual burst duration negotiated
 * @ftms_per_burst: actual FTMs per burst negotiated
 * @rssi_avg: average RSSI over the FTM action frames reported
 * @rssi_spread: spread of the RSSI over the FTM action frames reported
 * @tx_rate: bitrate for the transmitted FTM action frame response
 * @rx_rate: bitrate of the received FTM action frame
 * @rtt_avg: average of the RTTs measured (must have either this or
 *        @dist_avg)
 * @rtt_variance: variance of the RTTs measured (note that the standard
 *        deviation is the square root of the variance)
 * @rtt_spread: spread of the RTTs measured
 * @dist_avg: average of the distances (mm) measured
 *        (must have either this or @rtt_avg)
 * @dist_variance: variance of the distances measured (see also
 *        @rtt_variance)
 * @dist_spread: spread of the distances measured (see also @rtt_spread)
 * @num_ftmr_attempts_valid: @num_ftmr_attempts is valid
 * @num_ftmr_successes_valid: @num_ftmr_successes is valid
 * @rssi_avg_valid: @rssi_avg is valid
 * @rssi_spread_valid: @rssi_spread is valid
 * @tx_rate_valid: @tx_rate is valid
 * @rx_rate_valid: @rx_rate is valid
 * @rtt_avg_valid: @rtt_avg is valid
 * @rtt_variance_valid: @rtt_variance is valid
 * @rtt_spread_valid: @rtt_spread is valid
 * @dist_avg_valid: @dist_avg is valid
 * @dist_variance_valid: @dist_variance is valid
 * @dist_spread_valid: @dist_spread is valid
 */
struct cfg80211_pmsr_ftm_result {
        const u8 *lci;
        const u8 *civicloc;
        unsigned int lci_len;
        unsigned int civicloc_len;
        enum nl80211_peer_measurement_ftm_failure_reasons failure_reason;
        u32 num_ftmr_attempts;
        u32 num_ftmr_successes;
        s16 burst_index;
        u8 busy_retry_time;
        u8 num_bursts_exp;
        u8 burst_duration;
        u8 ftms_per_burst;
        s32 rssi_avg;
        s32 rssi_spread;
        struct rate_info tx_rate;
        struct rate_info rx_rate;
        s64 rtt_avg;
        s64 rtt_variance;
        s64 rtt_spread;
        s64 dist_avg;
        s64 dist_variance;
        s64 dist_spread;

        /* one validity bit per optional value above */
        u16 num_ftmr_attempts_valid:1;
        u16 num_ftmr_successes_valid:1;
        u16 rssi_avg_valid:1;
        u16 rssi_spread_valid:1;
        u16 tx_rate_valid:1;
        u16 rx_rate_valid:1;
        u16 rtt_avg_valid:1;
        u16 rtt_variance_valid:1;
        u16 rtt_spread_valid:1;
        u16 dist_avg_valid:1;
        u16 dist_variance_valid:1;
        u16 dist_spread_valid:1;
};

/**
 * struct cfg80211_pmsr_result - peer measurement result
 * @host_time: host time (use ktime_get_boottime(), adjusted to the time
 *        when the measurement was made)
 * @ap_tsf: AP's TSF at measurement time
 * @status: status of the measurement
 * @addr: address of the peer
 * @final: if reporting partial results, mark this as the last one; if not
 *        reporting partial results, always set this flag
 * @ap_tsf_valid: indicates that the @ap_tsf value is valid
 * @type: type of the measurement reported; note that only one type can be
 *        reported at a time, but multiple results can be reported
 *        separately and they're all aggregated for userspace.
 * @ftm: FTM result
 */
struct cfg80211_pmsr_result {
        u64 host_time;
        u64 ap_tsf;
        enum nl80211_peer_measurement_status status;

        u8 addr[ETH_ALEN];

        u8 final:1;
        u8 ap_tsf_valid:1;

        enum nl80211_peer_measurement_type type;

        union {
                struct cfg80211_pmsr_ftm_result ftm;
        };
};

/**
 * struct cfg80211_pmsr_ftm_request_peer - FTM request data
 * @preamble: frame preamble to use
 * @burst_period: burst period to use
 * @requested: indicates that FTM is requested
 * @asap: indicates that ASAP mode is to be used
 * @request_lci: request LCI information
 * @request_civicloc: request civic location information
 * @trigger_based: use trigger based ranging for the measurement.
 *        If neither @trigger_based nor @non_trigger_based is set,
 *        EDCA based ranging will be used.
 * @non_trigger_based: use non trigger based ranging for the measurement.
 *        If neither @trigger_based nor @non_trigger_based is set,
 *        EDCA based ranging will be used.
 * @lmr_feedback: negotiate for I2R LMR feedback. Only valid if either
 *        @trigger_based or @non_trigger_based is set.
 * @num_bursts_exp: number of bursts exponent
 * @burst_duration: burst duration
 * @ftms_per_burst: number of FTMs per burst
 * @ftmr_retries: number of retries for the FTM request
 * @bss_color: the bss color of the responder. Optional. Set to zero to
 *        indicate the driver should set the BSS color. Only valid if
 *        @non_trigger_based or @trigger_based is set.
 *
 * See also nl80211 for the respective attribute documentation.
 */
struct cfg80211_pmsr_ftm_request_peer {
        enum nl80211_preamble preamble;
        u16 burst_period;
        u8 requested:1;
        u8 asap:1;
        u8 request_lci:1;
        u8 request_civicloc:1;
        u8 trigger_based:1;
        u8 non_trigger_based:1;
        u8 lmr_feedback:1;
        u8 num_bursts_exp;
        u8 burst_duration;
        u8 ftms_per_burst;
        u8 ftmr_retries;
        u8 bss_color;
};

/**
 * struct cfg80211_pmsr_request_peer - peer data for a peer measurement request
 * @addr: MAC address of the peer
 * @chandef: channel to use
 * @report_ap_tsf: report the associated AP's TSF
 * @ftm: FTM data, see &struct cfg80211_pmsr_ftm_request_peer
 */
struct cfg80211_pmsr_request_peer {
        u8 addr[ETH_ALEN];
        struct cfg80211_chan_def chandef;
        u8 report_ap_tsf:1;
        struct cfg80211_pmsr_ftm_request_peer ftm;
};

/**
 * struct cfg80211_pmsr_request - peer measurement request
 * @cookie: cookie, set by cfg80211
 * @nl_portid: netlink portid - used by cfg80211
 * @drv_data: driver data for this request, if required for aborting,
 *        not otherwise freed or anything by cfg80211
 * @mac_addr: MAC address used for (randomised) request
 * @mac_addr_mask: MAC address mask used for randomisation, bits that
 *        are 0 in the mask should be randomised, bits that are 1 should
 *        be taken from the @mac_addr
 * @list: used by cfg80211 to hold on to the request
 * @timeout: timeout (in milliseconds) for the whole operation, if
 *        zero it means there's no timeout
 * @n_peers: number of peers to do measurements with, i.e. the number
 *        of entries in @peers
 * @peers: per-peer measurement request data, see
 *        &struct cfg80211_pmsr_request_peer
 */
struct cfg80211_pmsr_request {
        u64 cookie;
        void *drv_data;
        u32 n_peers;
        u32 nl_portid;

        u32 timeout;

        u8 mac_addr[ETH_ALEN] __aligned(2);
        u8 mac_addr_mask[ETH_ALEN] __aligned(2);

        struct list_head list;

        struct cfg80211_pmsr_request_peer peers[] __counted_by(n_peers);
};

/**
 * struct cfg80211_update_owe_info - OWE Information
 *
 * This structure provides information needed for the drivers to offload OWE
 * (Opportunistic Wireless Encryption) processing to the user space.
 *
 * Commonly used across update_owe_info request and event interfaces.
 *
 * @peer: MAC address of the peer device for which the OWE processing
 *        has to be done.
 * @status: status code, %WLAN_STATUS_SUCCESS for successful OWE info
 *        processing, use %WLAN_STATUS_UNSPECIFIED_FAILURE if user space
 *        cannot give you the real status code for failures. Used only for
 *        OWE update request command interface (user space to driver).
 * @ie: IEs obtained from the peer or constructed by the user space. These are
 *        the IEs of the remote peer in the event from the host driver and
 *        the constructed IEs by the user space in the request interface.
 * @ie_len: Length of @ie in octets.
 * @assoc_link_id: MLO link ID of the AP with which (re)association is
 *        requested by the peer. This will be filled by the driver for both
 *        MLO and non-MLO station connections when the AP is affiliated with
 *        an MLD. For non-MLD AP mode, it will be -1. Used only with the OWE
 *        update event (driver to user space).
 * @peer_mld_addr: For MLO connection, MLD address of the peer. For non-MLO
 *        connection, it will be all zeros. This is applicable only when
 *        @assoc_link_id is not -1, i.e., the AP is affiliated with an MLD.
 *        Used only with the OWE update event (driver to user space).
 */
struct cfg80211_update_owe_info {
        u8 peer[ETH_ALEN] __aligned(2);
        u16 status;
        const u8 *ie;
        size_t ie_len;
        int assoc_link_id;
        u8 peer_mld_addr[ETH_ALEN] __aligned(2);
};

/**
 * struct mgmt_frame_regs - management frame registrations data
 * @global_stypes: bitmap of the management frame subtypes that are
 *        registered for the entire device
 * @interface_stypes: bitmap of the management frame subtypes that are
 *        registered for the given interface
 * @global_mcast_stypes: subtypes for which mcast RX is needed globally
 * @interface_mcast_stypes: subtypes for which mcast RX is needed on
 *        this interface
 */
struct mgmt_frame_regs {
        u32 global_stypes;
        u32 interface_stypes;
        u32 global_mcast_stypes;
        u32 interface_mcast_stypes;
};

/**
 * struct cfg80211_ops - backend description for wireless configuration
 *
 * This struct is registered by fullmac card drivers and/or wireless stacks
 * in order to handle configuration requests on their interfaces.
 *
 * All callbacks except where otherwise noted should return 0
 * on success or a negative error code.
 *
 * All operations are invoked with the wiphy mutex held. The RTNL may be
 * held in addition (due to wireless extensions) but this cannot be relied
 * upon except in cases where documented below. Note that due to ordering,
 * the RTNL also cannot be acquired in any handlers.
 *
 * @suspend: wiphy device needs to be suspended. The variable @wow will
 *        be %NULL or contain the enabled Wake-on-Wireless triggers that are
 *        configured for the device.
 * @resume: wiphy device needs to be resumed
 * @set_wakeup: Called when WoWLAN is enabled/disabled, use this callback
 *        to call device_set_wakeup_enable() to enable/disable wakeup from
 *        the device.
 *
 * @add_virtual_intf: create a new virtual interface with the given name,
 *        must set the struct wireless_dev's iftype. Beware: You must create
 *        the new netdev in the wiphy's network namespace! Returns the struct
 *        wireless_dev, or an ERR_PTR. For P2P device wdevs, the driver must
 *        also set the address member in the wdev.
 *        This additionally holds the RTNL to be able to do netdev changes.
 *
 * @del_virtual_intf: remove the virtual interface
 *        This additionally holds the RTNL to be able to do netdev changes.
 *
 * @change_virtual_intf: change type/configuration of virtual interface,
 *        keep the struct wireless_dev's iftype updated.
 *        This additionally holds the RTNL to be able to do netdev changes.
 *
 * @add_intf_link: Add a new MLO link to the given interface. Note that
 *        the wdev->link[] data structure has been updated, so the new link
 *        address is available.
 * @del_intf_link: Remove an MLO link from the given interface.
 *
 * @add_key: add a key with the given parameters. @mac_addr will be %NULL
 *        when adding a group key. @link_id will be -1 for non-MLO connection.
 *        For MLO connection, @link_id will be >= 0 for group key and -1 for
 *        pairwise key, @mac_addr will be peer's MLD address for MLO pairwise key.
 *
 * @get_key: get information about the key with the given parameters.
 *        @mac_addr will be %NULL when requesting information for a group
 *        key. All pointers given to the @callback function need not be valid
 *        after it returns. This function should return an error if it is
 *        not possible to retrieve the key, -ENOENT if it doesn't exist.
 *        @link_id will be -1 for non-MLO connection. For MLO connection,
 *        @link_id will be >= 0 for group key and -1 for pairwise key, @mac_addr
 *        will be peer's MLD address for MLO pairwise key.
 *
 * @del_key: remove a key given the @mac_addr (%NULL for a group key)
 *        and @key_index, return -ENOENT if the key doesn't exist. @link_id will
 *        be -1 for non-MLO connection. For MLO connection, @link_id will be >= 0
 *        for group key and -1 for pairwise key, @mac_addr will be peer's MLD
 *        address for MLO pairwise key.
 *
 * @set_default_key: set the default key on an interface. @link_id will be >= 0
 *        for MLO connection and -1 for non-MLO connection.
 *
 * @set_default_mgmt_key: set the default management frame key on an interface.
 *        @link_id will be >= 0 for MLO connection and -1 for non-MLO connection.
 *
 * @set_default_beacon_key: set the default Beacon frame key on an interface.
 *        @link_id will be >= 0 for MLO connection and -1 for non-MLO connection.
 *
 * @set_rekey_data: give the data necessary for GTK rekeying to the driver
 *
 * @start_ap: Start acting in AP mode defined by the parameters.
 * @change_beacon: Change the beacon parameters for an access point mode
 *        interface. This should reject the call when AP mode wasn't started.
 * @stop_ap: Stop being an AP, including stopping beaconing.
 *
 * @add_station: Add a new station.
 * @del_station: Remove a station
 * @change_station: Modify a given station. Note that flags changes are not much
 *        validated in cfg80211, in particular the auth/assoc/authorized flags
 *        might come to the driver in invalid combinations -- make sure to check
 *        them, also against the existing state! Drivers must call
 *        cfg80211_check_station_change() to validate the information.
 * @get_station: get station information for the station identified by @mac
 * @dump_station: dump station callback -- resume dump at index @idx
 *
 * @add_mpath: add a fixed mesh path
 * @del_mpath: delete a given mesh path
 * @change_mpath: change a given mesh path
 * @get_mpath: get a mesh path for the given parameters
 * @dump_mpath: dump mesh path callback -- resume dump at index @idx
 * @get_mpp: get a mesh proxy path for the given parameters
 * @dump_mpp: dump mesh proxy path callback -- resume dump at index @idx
 * @join_mesh: join the mesh network with the specified parameters
 *        (invoked with the wireless_dev mutex held)
 * @leave_mesh: leave the current mesh network
 *        (invoked with the wireless_dev mutex held)
 *
 * @get_mesh_config: Get the current mesh configuration
 *
 * @update_mesh_config: Update mesh parameters on a running mesh.
 *        The mask is a bitfield which tells us which parameters to
 *        set, and which to leave alone.
 *
 * @change_bss: Modify parameters for a given BSS.
 *
 * @inform_bss: Called by cfg80211 while being informed about new BSS data
 *        for every BSS found within the reported data or frame. This is called
 *        from within the cfg80211 inform_bss handlers while holding the bss_lock.
 *        The data parameter is passed through from drv_data inside
 *        struct cfg80211_inform_bss.
 *        The new IE data for the BSS is explicitly passed.
 *
 * @set_txq_params: Set TX queue parameters
 *
 * @libertas_set_mesh_channel: Only for backward compatibility for libertas,
 *        as it doesn't implement join_mesh and needs to set the channel to
 *        join the mesh instead.
 *
 * @set_monitor_channel: Set the monitor mode channel for the device. If other
 *        interfaces are active this callback should reject the configuration.
 *        If no interfaces are active or the device is down, the channel should
 *        be stored for when a monitor interface becomes active.
 *
 * @scan: Request to do a scan. If returning zero, the scan request is given
 *        to the driver, and will be valid until passed to cfg80211_scan_done().
 *        For scan results, call cfg80211_inform_bss(); you can call this outside
 *        the scan/scan_done bracket too.
 * @abort_scan: Tell the driver to abort an ongoing scan. The driver shall
 *        indicate the status of the scan through cfg80211_scan_done().
 *
 * @auth: Request to authenticate with the specified peer
 *        (invoked with the wireless_dev mutex held)
 * @assoc: Request to (re)associate with the specified peer
 *        (invoked with the wireless_dev mutex held)
 * @deauth: Request to deauthenticate from the specified peer
 *        (invoked with the wireless_dev mutex held)
 * @disassoc: Request to disassociate from the specified peer
 *        (invoked with the wireless_dev mutex held)
 *
 * @connect: Connect to the ESS with the specified parameters. When connected,
 *        call cfg80211_connect_result()/cfg80211_connect_bss() with status code
 *        %WLAN_STATUS_SUCCESS. If the connection fails for some reason, call
 *        cfg80211_connect_result()/cfg80211_connect_bss() with the status code
 *        from the AP or cfg80211_connect_timeout() if no frame with status code
 *        was received.
 *        The driver is allowed to roam to other BSSes within the ESS when the
 *        other BSS matches the connect parameters. When such roaming is initiated
 *        by the driver, the driver is expected to verify that the target matches
 *        the configured security parameters and to use Reassociation Request
 *        frame instead of Association Request frame.
 *        The connect function can also be used to request the driver to perform a
 *        specific roam when connected to an ESS. In that case, the prev_bssid
 *        parameter is set to the BSSID of the currently associated BSS as an
 *        indication of requesting reassociation.
 *        In both the driver-initiated and new connect() call initiated roaming
 *        cases, the result of roaming is indicated with a call to
 *        cfg80211_roamed(). (invoked with the wireless_dev mutex held)
 * @update_connect_params: Update the connect parameters while connected to a
 *        BSS. The updated parameters can be used by driver/firmware for
 *        subsequent BSS selection (roaming) decisions and to form the
 *        Authentication/(Re)Association Request frames. This call does not
 *        request an immediate disassociation or reassociation with the current
 *        BSS, i.e., this impacts only subsequent (re)associations. The bits in
 *        changed are defined in &enum cfg80211_connect_params_changed.
 *        (invoked with the wireless_dev mutex held)
 * @disconnect: Disconnect from the BSS/ESS or stop connection attempts if
 *      connection is in progress. Once done, call cfg80211_disconnected() in
 *      case connection was already established (invoked with the
 *      wireless_dev mutex held), otherwise call cfg80211_connect_timeout().
 *
 * @join_ibss: Join the specified IBSS (or create if necessary). Once done, call
 *        cfg80211_ibss_joined(), also call that function when changing BSSID due
 *        to a merge.
 *        (invoked with the wireless_dev mutex held)
 * @leave_ibss: Leave the IBSS.
 *        (invoked with the wireless_dev mutex held)
 *
 * @set_mcast_rate: Set the specified multicast rate (only if vif is in ADHOC or
 *        MESH mode)
 *
 * @set_wiphy_params: Notify that wiphy parameters have changed;
 *        @changed bitfield (see &enum wiphy_params_flags) describes which values
 *        have changed. The actual parameter values are available in
 *        struct wiphy. If returning an error, no value should be changed.
 *
 * @set_tx_power: set the transmit power according to the parameters,
 *        the power passed is in mBm, to get dBm use MBM_TO_DBM(). The
 *        wdev may be %NULL if power was set for the wiphy, and will
 *        always be %NULL unless the driver supports per-vif TX power
 *        (as advertised by the nl80211 feature flag.)
 * @get_tx_power: store the current TX power into the dbm variable;
 *        return 0 if successful
 *
 * @rfkill_poll: polls the hw rfkill line, use cfg80211 reporting
 *        functions to adjust rfkill hw state
 *
 * @dump_survey: get site survey information.
 *
 * @remain_on_channel: Request the driver to remain awake on the specified
 *        channel for the specified duration to complete an off-channel
 *        operation (e.g., public action frame exchange). When the driver is
 *        ready on the requested channel, it must indicate this with an event
 *        notification by calling cfg80211_ready_on_channel().
 * @cancel_remain_on_channel: Cancel an on-going remain-on-channel operation.
 *        This allows the operation to be terminated prior to timeout based on
 *        the duration value.
 * @mgmt_tx: Transmit a management frame.
 * @mgmt_tx_cancel_wait: Cancel the wait time from transmitting a management
 *        frame on another channel
 *
 * @testmode_cmd: run a test mode command; @wdev may be %NULL
 * @testmode_dump: Implement a test mode dump. The cb->args[2] and up may be
 *        used by the function, but 0 and 1 must not be touched. Additionally,
 *        return error codes other than -ENOBUFS and -ENOENT will terminate the
 *        dump and return to userspace with an error, so be careful. If any data
 *        was passed in from userspace then the data/len arguments will be present
 *        and point to the data contained in %NL80211_ATTR_TESTDATA.
 *
 * @set_bitrate_mask: set the bitrate mask configuration
 *
 * @set_pmksa: Cache a PMKID for a BSSID. This is mostly useful for fullmac
 *        devices running firmwares capable of generating the (re) association
 *        RSN IE. It allows for faster roaming between WPA2 BSSIDs.
 * @del_pmksa: Delete a cached PMKID.
 * @flush_pmksa: Flush all cached PMKIDs.
 * @set_power_mgmt: Configure WLAN power management. A timeout value of -1
 *        allows the driver to adjust the dynamic ps timeout value.
 * @set_cqm_rssi_config: Configure connection quality monitor RSSI threshold.
 *        After configuration, the driver should (soon) send an event indicating
 *        the current level is above/below the configured threshold; this may
 *        need some care when the configuration is changed (without first being
 *        disabled.)
 * @set_cqm_rssi_range_config: Configure two RSSI thresholds in the
 *        connection quality monitor.  An event is to be sent only when the
 *        signal level is found to be outside the two values.  The driver should
 *        set %NL80211_EXT_FEATURE_CQM_RSSI_LIST if this method is implemented.
 *        If it is provided then there's no point providing @set_cqm_rssi_config.
 * @set_cqm_txe_config: Configure connection quality monitor TX error
 *        thresholds.
 * @sched_scan_start: Tell the driver to start a scheduled scan.
 * @sched_scan_stop: Tell the driver to stop an ongoing scheduled scan with
 *        given request id. This call must stop the scheduled scan and be ready
 *        for starting a new one before it returns, i.e. @sched_scan_start may be
 *        called immediately after that again and should not fail in that case.
 *        The driver should not call cfg80211_sched_scan_stopped() for a requested
 *        stop (when this method returns 0).
 *
 * @update_mgmt_frame_registrations: Notify the driver that management frame
 *        registrations were updated. The callback is allowed to sleep.
 *
 * @set_antenna: Set antenna configuration (tx_ant, rx_ant) on the device.
 *        Parameters are bitmaps of allowed antennas to use for TX/RX. Drivers may
 *        reject TX/RX mask combinations they cannot support by returning -EINVAL
 *        (also see nl80211.h @NL80211_ATTR_WIPHY_ANTENNA_TX).
 *
 * @get_antenna: Get current antenna configuration from device (tx_ant, rx_ant).
 *
 * @tdls_mgmt: Transmit a TDLS management frame.
 * @tdls_oper: Perform a high-level TDLS operation (e.g. TDLS link setup).
 *
 * @probe_client: probe an associated client, must return a cookie that it
 *        later passes to cfg80211_probe_status().
 *
 * @set_noack_map: Set the NoAck Map for the TIDs.
 *
 * @get_channel: Get the current operating channel for the virtual interface.
 *        For monitor interfaces, it should return %NULL unless there's a single
 *        current monitoring channel.
 *
 * @start_p2p_device: Start the given P2P device.
 * @stop_p2p_device: Stop the given P2P device.
 *
 * @set_mac_acl: Sets MAC address control list in AP and P2P GO mode.
 *        Parameters include ACL policy, an array of MAC address of stations
 *        and the number of MAC addresses. If there is already a list in driver
 *        this new list replaces the existing one. Driver has to clear its ACL
 *        when number of MAC addresses entries is passed as 0. Drivers which
 *        advertise the support for MAC based ACL have to implement this callback.
 *
 * @start_radar_detection: Start radar detection in the driver.
 *
 * @end_cac: End running CAC, probably because a related CAC
 *        was finished on another phy.
 *
 * @update_ft_ies: Provide updated Fast BSS Transition information to the
 *        driver. If the SME is in the driver/firmware, this information can be
 *        used in building Authentication and Reassociation Request frames.
 *
 * @crit_proto_start: Indicates a critical protocol needs more link reliability
 *        for a given duration (milliseconds). The protocol is provided so the
 *        driver can take the most appropriate actions.
 * @crit_proto_stop: Indicates critical protocol no longer needs increased link
 *        reliability. This operation can not fail.
 * @set_coalesce: Set coalesce parameters.
 *
 * @channel_switch: initiate channel-switch procedure (with CSA). Driver is
 *        responsible for verifying if the switch is possible. Since this is
 *        inherently tricky, the driver may decide to disconnect an interface
 *        later with cfg80211_stop_iface(). This doesn't mean the driver can
 *        accept everything. It should do its best to verify requests and
 *        reject them as soon as possible.
 *
 * @set_qos_map: Set QoS mapping information to the driver
 *
 * @set_ap_chanwidth: Set the AP (including P2P GO) mode channel width for the
 *        given interface. This is used e.g. for dynamic HT 20/40 MHz channel
 *        width changes during the lifetime of the BSS.
 *
 * @add_tx_ts: validate (if admitted_time is 0) or add a TX TS to the device
 *        with the given parameters; action frame exchange has been handled by
 *        userspace so this just has to modify the TX path to take the TS into
 *        account.
 *        If the admitted time is 0 just validate the parameters to make sure
 *        the session can be created at all; it is valid to just always return
 *        success for that but that may result in inefficient behaviour (handshake
 *        with the peer followed by immediate teardown when the addition is later
 *        rejected)
 * @del_tx_ts: remove an existing TX TS
 *
 * @join_ocb: join the OCB network with the specified parameters
 *        (invoked with the wireless_dev mutex held)
 * @leave_ocb: leave the current OCB network
 *        (invoked with the wireless_dev mutex held)
 *
 * @tdls_channel_switch: Start channel-switching with a TDLS peer. The driver
 *        is responsible for continually initiating channel-switching operations
 *        and returning to the base channel for communication with the AP.
 * @tdls_cancel_channel_switch: Stop channel-switching with a TDLS peer. Both
 *        peers must be on the base channel when the call completes.
 * @start_nan: Start the NAN interface.
 * @stop_nan: Stop the NAN interface.
 * @add_nan_func: Add a NAN function. Returns negative value on failure.
 *        On success @nan_func ownership is transferred to the driver and
 *        it may access it outside of the scope of this function. The driver
 *        should free the @nan_func when no longer needed by calling
 *        cfg80211_free_nan_func().
 *        On success the driver should assign an instance_id in the
 *        provided @nan_func.
 * @del_nan_func: Delete a NAN function.
 * @nan_change_conf: changes NAN configuration. The changed parameters must
 *        be specified in @changes (using &enum cfg80211_nan_conf_changes);
 *        All other parameters must be ignored.
 *
 * @set_multicast_to_unicast: configure multicast to unicast conversion for BSS
 *
 * @get_txq_stats: Get TXQ stats for interface or phy. If wdev is %NULL, this
 *      function should return phy stats, and interface stats otherwise.
 *
 * @set_pmk: configure the PMK to be used for offloaded 802.1X 4-Way handshake.
 *        If not deleted through @del_pmk the PMK remains valid until disconnect
 *        upon which the driver should clear it.
 *        (invoked with the wireless_dev mutex held)
 * @del_pmk: delete the previously configured PMK for the given authenticator.
 *        (invoked with the wireless_dev mutex held)
 *
 * @external_auth: indicates result of offloaded authentication processing from
 *     user space
 *
 * @tx_control_port: TX a control port frame (EAPoL).  The noencrypt parameter
 *        tells the driver that the frame should not be encrypted.
 *
 * @get_ftm_responder_stats: Retrieve FTM responder statistics, if available.
 *        Statistics should be cumulative, currently no way to reset is provided.
 * @start_pmsr: start peer measurement (e.g. FTM)
 * @abort_pmsr: abort peer measurement
 *
 * @update_owe_info: Provide updated OWE info to driver. Driver implementing SME
 *        but offloading OWE processing to the user space will get the updated
 *        DH IE through this interface.
 *
 * @probe_mesh_link: Probe direct Mesh peer's link quality by sending data frame
 *        and overrule HWMP path selection algorithm.
 * @set_tid_config: TID specific configuration, this can be peer or BSS specific
 *        This callback may sleep.
 * @reset_tid_config: Reset TID specific configuration for the peer, for the
 *        given TIDs. This callback may sleep.
 *
 * @set_sar_specs: Update the SAR (TX power) settings.
 *
 * @color_change: Initiate a color change.
 *
 * @set_fils_aad: Set FILS AAD data to the AP driver so that the driver can use
 *        those to decrypt (Re)Association Request and encrypt (Re)Association
 *        Response frame.
 *
 * @set_radar_background: Configure dedicated offchannel chain available for
 *        radar/CAC detection on some hw. This chain can't be used to transmit
 *        or receive frames and it is bound to a running wdev.
 *        Background radar/CAC detection allows avoiding the CAC downtime
 *        switching to a different channel during CAC detection on the selected
 *        radar channel.
 *        The caller is expected to set chandef pointer to NULL in order to
 *        disable background CAC/radar detection.
 * @add_link_station: Add a link to a station.
 * @mod_link_station: Modify a link of a station.
 * @del_link_station: Remove a link of a station.
 *
 * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames.
 * @set_ttlm: set the TID to link mapping.
 */
struct cfg80211_ops {
        int        (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
        int        (*resume)(struct wiphy *wiphy);
        void        (*set_wakeup)(struct wiphy *wiphy, bool enabled);

        struct wireless_dev * (*add_virtual_intf)(struct wiphy *wiphy,
                                                  const char *name,
                                                  unsigned char name_assign_type,
                                                  enum nl80211_iftype type,
                                                  struct vif_params *params);
        int        (*del_virtual_intf)(struct wiphy *wiphy,
                                    struct wireless_dev *wdev);
        int        (*change_virtual_intf)(struct wiphy *wiphy,
                                       struct net_device *dev,
                                       enum nl80211_iftype type,
                                       struct vif_params *params);

        int        (*add_intf_link)(struct wiphy *wiphy,
                                 struct wireless_dev *wdev,
                                 unsigned int link_id);
        void        (*del_intf_link)(struct wiphy *wiphy,
                                 struct wireless_dev *wdev,
                                 unsigned int link_id);

        int        (*add_key)(struct wiphy *wiphy, struct net_device *netdev,
                           int link_id, u8 key_index, bool pairwise,
                           const u8 *mac_addr, struct key_params *params);
        int        (*get_key)(struct wiphy *wiphy, struct net_device *netdev,
                           int link_id, u8 key_index, bool pairwise,
                           const u8 *mac_addr, void *cookie,
                           void (*callback)(void *cookie, struct key_params*));
        int        (*del_key)(struct wiphy *wiphy, struct net_device *netdev,
                           int link_id, u8 key_index, bool pairwise,
                           const u8 *mac_addr);
        int        (*set_default_key)(struct wiphy *wiphy,
                                   struct net_device *netdev, int link_id,
                                   u8 key_index, bool unicast, bool multicast);
        int        (*set_default_mgmt_key)(struct wiphy *wiphy,
                                        struct net_device *netdev, int link_id,
                                        u8 key_index);
        int        (*set_default_beacon_key)(struct wiphy *wiphy,
                                          struct net_device *netdev,
                                          int link_id,
                                          u8 key_index);

        int        (*start_ap)(struct wiphy *wiphy, struct net_device *dev,
                            struct cfg80211_ap_settings *settings);
        int        (*change_beacon)(struct wiphy *wiphy, struct net_device *dev,
                                 struct cfg80211_ap_update *info);
        int        (*stop_ap)(struct wiphy *wiphy, struct net_device *dev,
                           unsigned int link_id);


        int        (*add_station)(struct wiphy *wiphy, struct net_device *dev,
                               const u8 *mac,
                               struct station_parameters *params);
        int        (*del_station)(struct wiphy *wiphy, struct net_device *dev,
                               struct station_del_parameters *params);
        int        (*change_station)(struct wiphy *wiphy, struct net_device *dev,
                                  const u8 *mac,
                                  struct station_parameters *params);
        int        (*get_station)(struct wiphy *wiphy, struct net_device *dev,
                               const u8 *mac, struct station_info *sinfo);
        int        (*dump_station)(struct wiphy *wiphy, struct net_device *dev,
                                int idx, u8 *mac, struct station_info *sinfo);

        int        (*add_mpath)(struct wiphy *wiphy, struct net_device *dev,
                               const u8 *dst, const u8 *next_hop);
        int        (*del_mpath)(struct wiphy *wiphy, struct net_device *dev,
                               const u8 *dst);
        int        (*change_mpath)(struct wiphy *wiphy, struct net_device *dev,
                                  const u8 *dst, const u8 *next_hop);
        int        (*get_mpath)(struct wiphy *wiphy, struct net_device *dev,
                             u8 *dst, u8 *next_hop, struct mpath_info *pinfo);
        int        (*dump_mpath)(struct wiphy *wiphy, struct net_device *dev,
                              int idx, u8 *dst, u8 *next_hop,
                              struct mpath_info *pinfo);
        int        (*get_mpp)(struct wiphy *wiphy, struct net_device *dev,
                           u8 *dst, u8 *mpp, struct mpath_info *pinfo);
        int        (*dump_mpp)(struct wiphy *wiphy, struct net_device *dev,
                            int idx, u8 *dst, u8 *mpp,
                            struct mpath_info *pinfo);
        int        (*get_mesh_config)(struct wiphy *wiphy,
                                struct net_device *dev,
                                struct mesh_config *conf);
        int        (*update_mesh_config)(struct wiphy *wiphy,
                                      struct net_device *dev, u32 mask,
                                      const struct mesh_config *nconf);
        int        (*join_mesh)(struct wiphy *wiphy, struct net_device *dev,
                             const struct mesh_config *conf,
                             const struct mesh_setup *setup);
        int        (*leave_mesh)(struct wiphy *wiphy, struct net_device *dev);

        int        (*join_ocb)(struct wiphy *wiphy, struct net_device *dev,
                            struct ocb_setup *setup);
        int        (*leave_ocb)(struct wiphy *wiphy, struct net_device *dev);

        int        (*change_bss)(struct wiphy *wiphy, struct net_device *dev,
                              struct bss_parameters *params);

        void        (*inform_bss)(struct wiphy *wiphy, struct cfg80211_bss *bss,
                              const struct cfg80211_bss_ies *ies, void *data);

        int        (*set_txq_params)(struct wiphy *wiphy, struct net_device *dev,
                                  struct ieee80211_txq_params *params);

        int        (*libertas_set_mesh_channel)(struct wiphy *wiphy,
                                             struct net_device *dev,
                                             struct ieee80211_channel *chan);

        int        (*set_monitor_channel)(struct wiphy *wiphy,
                                       struct cfg80211_chan_def *chandef);

        int        (*scan)(struct wiphy *wiphy,
                        struct cfg80211_scan_request *request);
        void        (*abort_scan)(struct wiphy *wiphy, struct wireless_dev *wdev);

        int        (*auth)(struct wiphy *wiphy, struct net_device *dev,
                        struct cfg80211_auth_request *req);
        int        (*assoc)(struct wiphy *wiphy, struct net_device *dev,
                         struct cfg80211_assoc_request *req);
        int        (*deauth)(struct wiphy *wiphy, struct net_device *dev,
                          struct cfg80211_deauth_request *req);
        int        (*disassoc)(struct wiphy *wiphy, struct net_device *dev,
                            struct cfg80211_disassoc_request *req);

        int        (*connect)(struct wiphy *wiphy, struct net_device *dev,
                           struct cfg80211_connect_params *sme);
        int        (*update_connect_params)(struct wiphy *wiphy,
                                         struct net_device *dev,
                                         struct cfg80211_connect_params *sme,
                                         u32 changed);
        int        (*disconnect)(struct wiphy *wiphy, struct net_device *dev,
                              u16 reason_code);

        int        (*join_ibss)(struct wiphy *wiphy, struct net_device *dev,
                             struct cfg80211_ibss_params *params);
        int        (*leave_ibss)(struct wiphy *wiphy, struct net_device *dev);

        int        (*set_mcast_rate)(struct wiphy *wiphy, struct net_device *dev,
                                  int rate[NUM_NL80211_BANDS]);

        int        (*set_wiphy_params)(struct wiphy *wiphy, u32 changed);

        int        (*set_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev,
                                enum nl80211_tx_power_setting type, int mbm);
        int        (*get_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev,
                                int *dbm);

        void        (*rfkill_poll)(struct wiphy *wiphy);

#ifdef CONFIG_NL80211_TESTMODE
        int        (*testmode_cmd)(struct wiphy *wiphy, struct wireless_dev *wdev,
                                void *data, int len);
        int        (*testmode_dump)(struct wiphy *wiphy, struct sk_buff *skb,
                                 struct netlink_callback *cb,
                                 void *data, int len);
#endif

        int        (*set_bitrate_mask)(struct wiphy *wiphy,
                                    struct net_device *dev,
                                    unsigned int link_id,
                                    const u8 *peer,
                                    const struct cfg80211_bitrate_mask *mask);

        int        (*dump_survey)(struct wiphy *wiphy, struct net_device *netdev,
                        int idx, struct survey_info *info);

        int        (*set_pmksa)(struct wiphy *wiphy, struct net_device *netdev,
                             struct cfg80211_pmksa *pmksa);
        int        (*del_pmksa)(struct wiphy *wiphy, struct net_device *netdev,
                             struct cfg80211_pmksa *pmksa);
        int        (*flush_pmksa)(struct wiphy *wiphy, struct net_device *netdev);

        int        (*remain_on_channel)(struct wiphy *wiphy,
                                     struct wireless_dev *wdev,
                                     struct ieee80211_channel *chan,
                                     unsigned int duration,
                                     u64 *cookie);
        int        (*cancel_remain_on_channel)(struct wiphy *wiphy,
                                            struct wireless_dev *wdev,
                                            u64 cookie);

        int        (*mgmt_tx)(struct wiphy *wiphy, struct wireless_dev *wdev,
                           struct cfg80211_mgmt_tx_params *params,
                           u64 *cookie);
        int        (*mgmt_tx_cancel_wait)(struct wiphy *wiphy,
                                       struct wireless_dev *wdev,
                                       u64 cookie);

        int        (*set_power_mgmt)(struct wiphy *wiphy, struct net_device *dev,
                                  bool enabled, int timeout);

        int        (*set_cqm_rssi_config)(struct wiphy *wiphy,
                                       struct net_device *dev,
                                       s32 rssi_thold, u32 rssi_hyst);

        int        (*set_cqm_rssi_range_config)(struct wiphy *wiphy,
                                             struct net_device *dev,
                                             s32 rssi_low, s32 rssi_high);

        int        (*set_cqm_txe_config)(struct wiphy *wiphy,
                                      struct net_device *dev,
                                      u32 rate, u32 pkts, u32 intvl);

        void        (*update_mgmt_frame_registrations)(struct wiphy *wiphy,
                                                   struct wireless_dev *wdev,
                                                   struct mgmt_frame_regs *upd);

        int        (*set_antenna)(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant);
        int        (*get_antenna)(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant);

        int        (*sched_scan_start)(struct wiphy *wiphy,
                                struct net_device *dev,
                                struct cfg80211_sched_scan_request *request);
        int        (*sched_scan_stop)(struct wiphy *wiphy, struct net_device *dev,
                                   u64 reqid);

        int        (*set_rekey_data)(struct wiphy *wiphy, struct net_device *dev,
                                  struct cfg80211_gtk_rekey_data *data);

        int        (*tdls_mgmt)(struct wiphy *wiphy, struct net_device *dev,
                             const u8 *peer, int link_id,
                             u8 action_code, u8 dialog_token, u16 status_code,
                             u32 peer_capability, bool initiator,
                             const u8 *buf, size_t len);
        int        (*tdls_oper)(struct wiphy *wiphy, struct net_device *dev,
                             const u8 *peer, enum nl80211_tdls_operation oper);

        int        (*probe_client)(struct wiphy *wiphy, struct net_device *dev,
                                const u8 *peer, u64 *cookie);

        int        (*set_noack_map)(struct wiphy *wiphy,
                                  struct net_device *dev,
                                  u16 noack_map);

        int        (*get_channel)(struct wiphy *wiphy,
                               struct wireless_dev *wdev,
                               unsigned int link_id,
                               struct cfg80211_chan_def *chandef);

        int        (*start_p2p_device)(struct wiphy *wiphy,
                                    struct wireless_dev *wdev);
        void        (*stop_p2p_device)(struct wiphy *wiphy,
                                   struct wireless_dev *wdev);

        int        (*set_mac_acl)(struct wiphy *wiphy, struct net_device *dev,
                               const struct cfg80211_acl_data *params);

        int        (*start_radar_detection)(struct wiphy *wiphy,
                                         struct net_device *dev,
                                         struct cfg80211_chan_def *chandef,
                                         u32 cac_time_ms);
        void        (*end_cac)(struct wiphy *wiphy,
                                struct net_device *dev);
        int        (*update_ft_ies)(struct wiphy *wiphy, struct net_device *dev,
                                 struct cfg80211_update_ft_ies_params *ftie);
        int        (*crit_proto_start)(struct wiphy *wiphy,
                                    struct wireless_dev *wdev,
                                    enum nl80211_crit_proto_id protocol,
                                    u16 duration);
        void        (*crit_proto_stop)(struct wiphy *wiphy,
                                   struct wireless_dev *wdev);
        int        (*set_coalesce)(struct wiphy *wiphy,
                                struct cfg80211_coalesce *coalesce);

        int        (*channel_switch)(struct wiphy *wiphy,
                                  struct net_device *dev,
                                  struct cfg80211_csa_settings *params);

        int     (*set_qos_map)(struct wiphy *wiphy,
                               struct net_device *dev,
                               struct cfg80211_qos_map *qos_map);

        int        (*set_ap_chanwidth)(struct wiphy *wiphy, struct net_device *dev,
                                    unsigned int link_id,
                                    struct cfg80211_chan_def *chandef);

        int        (*add_tx_ts)(struct wiphy *wiphy, struct net_device *dev,
                             u8 tsid, const u8 *peer, u8 user_prio,
                             u16 admitted_time);
        int        (*del_tx_ts)(struct wiphy *wiphy, struct net_device *dev,
                             u8 tsid, const u8 *peer);

        int        (*tdls_channel_switch)(struct wiphy *wiphy,
                                       struct net_device *dev,
                                       const u8 *addr, u8 oper_class,
                                       struct cfg80211_chan_def *chandef);
        void        (*tdls_cancel_channel_switch)(struct wiphy *wiphy,
                                              struct net_device *dev,
                                              const u8 *addr);
        int        (*start_nan)(struct wiphy *wiphy, struct wireless_dev *wdev,
                             struct cfg80211_nan_conf *conf);
        void        (*stop_nan)(struct wiphy *wiphy, struct wireless_dev *wdev);
        int        (*add_nan_func)(struct wiphy *wiphy, struct wireless_dev *wdev,
                                struct cfg80211_nan_func *nan_func);
        void        (*del_nan_func)(struct wiphy *wiphy, struct wireless_dev *wdev,
                               u64 cookie);
        int        (*nan_change_conf)(struct wiphy *wiphy,
                                   struct wireless_dev *wdev,
                                   struct cfg80211_nan_conf *conf,
                                   u32 changes);

        int        (*set_multicast_to_unicast)(struct wiphy *wiphy,
                                            struct net_device *dev,
                                            const bool enabled);

        int        (*get_txq_stats)(struct wiphy *wiphy,
                                 struct wireless_dev *wdev,
                                 struct cfg80211_txq_stats *txqstats);

        int        (*set_pmk)(struct wiphy *wiphy, struct net_device *dev,
                           const struct cfg80211_pmk_conf *conf);
        int        (*del_pmk)(struct wiphy *wiphy, struct net_device *dev,
                           const u8 *aa);
        int     (*external_auth)(struct wiphy *wiphy, struct net_device *dev,
                                 struct cfg80211_external_auth_params *params);

        int        (*tx_control_port)(struct wiphy *wiphy,
                                   struct net_device *dev,
                                   const u8 *buf, size_t len,
                                   const u8 *dest, const __be16 proto,
                                   const bool noencrypt, int link_id,
                                   u64 *cookie);

        int        (*get_ftm_responder_stats)(struct wiphy *wiphy,
                                struct net_device *dev,
                                struct cfg80211_ftm_responder_stats *ftm_stats);

        int        (*start_pmsr)(struct wiphy *wiphy, struct wireless_dev *wdev,
                              struct cfg80211_pmsr_request *request);
        void        (*abort_pmsr)(struct wiphy *wiphy, struct wireless_dev *wdev,
                              struct cfg80211_pmsr_request *request);
        int        (*update_owe_info)(struct wiphy *wiphy, struct net_device *dev,
                                   struct cfg80211_update_owe_info *owe_info);
        int        (*probe_mesh_link)(struct wiphy *wiphy, struct net_device *dev,
                                   const u8 *buf, size_t len);
        int     (*set_tid_config)(struct wiphy *wiphy, struct net_device *dev,
                                  struct cfg80211_tid_config *tid_conf);
        int        (*reset_tid_config)(struct wiphy *wiphy, struct net_device *dev,
                                    const u8 *peer, u8 tids);
        int        (*set_sar_specs)(struct wiphy *wiphy,
                                 struct cfg80211_sar_specs *sar);
        int        (*color_change)(struct wiphy *wiphy,
                                struct net_device *dev,
                                struct cfg80211_color_change_settings *params);
        int     (*set_fils_aad)(struct wiphy *wiphy, struct net_device *dev,
                                struct cfg80211_fils_aad *fils_aad);
        int        (*set_radar_background)(struct wiphy *wiphy,
                                        struct cfg80211_chan_def *chandef);
        int        (*add_link_station)(struct wiphy *wiphy, struct net_device *dev,
                                    struct link_station_parameters *params);
        int        (*mod_link_station)(struct wiphy *wiphy, struct net_device *dev,
                                    struct link_station_parameters *params);
        int        (*del_link_station)(struct wiphy *wiphy, struct net_device *dev,
                                    struct link_station_del_parameters *params);
        int        (*set_hw_timestamp)(struct wiphy *wiphy, struct net_device *dev,
                                    struct cfg80211_set_hw_timestamp *hwts);
        int        (*set_ttlm)(struct wiphy *wiphy, struct net_device *dev,
                            struct cfg80211_ttlm_params *params);
};

/*
 * wireless hardware and networking interfaces structures
 * and registration/helper functions
 */

/**
 * enum wiphy_flags - wiphy capability flags
 *
 * @WIPHY_FLAG_SPLIT_SCAN_6GHZ: if set to true, the scan request will be split
 *         into two, first for legacy bands and second for 6 GHz.
 * @WIPHY_FLAG_NETNS_OK: if not set, do not allow changing the netns of this
 *        wiphy at all
 * @WIPHY_FLAG_PS_ON_BY_DEFAULT: if set to true, powersave will be enabled
 *        by default -- this flag will be set depending on the kernel's default
 *        on wiphy_new(), but can be changed by the driver if it has a good
 *        reason to override the default
 * @WIPHY_FLAG_4ADDR_AP: supports 4addr mode even on AP (with a single station
 *        on a VLAN interface). This flag also serves an extra purpose of
 *        supporting 4ADDR AP mode on devices which do not support AP/VLAN iftype.
 * @WIPHY_FLAG_4ADDR_STATION: supports 4addr mode even as a station
 * @WIPHY_FLAG_CONTROL_PORT_PROTOCOL: This device supports setting the
 *        control port protocol ethertype. The device also honours the
 *        control_port_no_encrypt flag.
 * @WIPHY_FLAG_IBSS_RSN: The device supports IBSS RSN.
 * @WIPHY_FLAG_MESH_AUTH: The device supports mesh authentication by routing
 *        auth frames to userspace. See @NL80211_MESH_SETUP_USERSPACE_AUTH.
 * @WIPHY_FLAG_SUPPORTS_FW_ROAM: The device supports roaming feature in the
 *        firmware.
 * @WIPHY_FLAG_AP_UAPSD: The device supports uapsd on AP.
 * @WIPHY_FLAG_SUPPORTS_TDLS: The device supports TDLS (802.11z) operation.
 * @WIPHY_FLAG_TDLS_EXTERNAL_SETUP: The device does not handle TDLS (802.11z)
 *        link setup/discovery operations internally. Setup, discovery and
 *        teardown packets should be sent through the @NL80211_CMD_TDLS_MGMT
 *        command. When this flag is not set, @NL80211_CMD_TDLS_OPER should be
 *        used for asking the driver/firmware to perform a TDLS operation.
 * @WIPHY_FLAG_HAVE_AP_SME: device integrates AP SME
 * @WIPHY_FLAG_REPORTS_OBSS: the device will report beacons from other BSSes
 *        when there are virtual interfaces in AP mode by calling
 *        cfg80211_report_obss_beacon().
 * @WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD: When operating as an AP, the device
 *        responds to probe-requests in hardware.
 * @WIPHY_FLAG_OFFCHAN_TX: Device supports direct off-channel TX.
 * @WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL: Device supports remain-on-channel call.
 * @WIPHY_FLAG_SUPPORTS_5_10_MHZ: Device supports 5 MHz and 10 MHz channels.
 * @WIPHY_FLAG_HAS_CHANNEL_SWITCH: Device supports channel switch in
 *        beaconing mode (AP, IBSS, Mesh, ...).
 * @WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK: The device supports bigger kek and kck keys
 * @WIPHY_FLAG_SUPPORTS_MLO: This is a temporary flag gating the MLO APIs,
 *        in order to not have them reachable in normal drivers, until we have
 *        complete feature/interface combinations/etc. advertisement. No driver
 *        should set this flag for now.
 * @WIPHY_FLAG_SUPPORTS_EXT_KCK_32: The device supports 32-byte KCK keys.
 * @WIPHY_FLAG_NOTIFY_REGDOM_BY_DRIVER: The device could handle reg notify for
 *        NL80211_REGDOM_SET_BY_DRIVER.
 * @WIPHY_FLAG_CHANNEL_CHANGE_ON_BEACON: if the driver sets this flag,
 *        reg_call_notifier() is called to update channels on beacon hints.
 * @WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY: support connection to non-primary link
 *        of an NSTR mobile AP MLD.
 * @WIPHY_FLAG_DISABLE_WEXT: disable wireless extensions for this device
 *
 * Every flag is a distinct single-bit value (BIT(0) through BIT(25)), so any
 * number of them can be OR-ed together into one flags word. The order of the
 * descriptions above does not follow the bit order of the enumerators below.
 */
enum wiphy_flags {
        WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK                = BIT(0),
        WIPHY_FLAG_SUPPORTS_MLO                        = BIT(1),
        WIPHY_FLAG_SPLIT_SCAN_6GHZ                = BIT(2),
        WIPHY_FLAG_NETNS_OK                        = BIT(3),
        WIPHY_FLAG_PS_ON_BY_DEFAULT                = BIT(4),
        WIPHY_FLAG_4ADDR_AP                        = BIT(5),
        WIPHY_FLAG_4ADDR_STATION                = BIT(6),
        WIPHY_FLAG_CONTROL_PORT_PROTOCOL        = BIT(7),
        WIPHY_FLAG_IBSS_RSN                        = BIT(8),
        WIPHY_FLAG_DISABLE_WEXT                        = BIT(9),
        WIPHY_FLAG_MESH_AUTH                        = BIT(10),
        WIPHY_FLAG_SUPPORTS_EXT_KCK_32          = BIT(11),
        WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY        = BIT(12),
        WIPHY_FLAG_SUPPORTS_FW_ROAM                = BIT(13),
        WIPHY_FLAG_AP_UAPSD                        = BIT(14),
        WIPHY_FLAG_SUPPORTS_TDLS                = BIT(15),
        WIPHY_FLAG_TDLS_EXTERNAL_SETUP                = BIT(16),
        WIPHY_FLAG_HAVE_AP_SME                        = BIT(17),
        WIPHY_FLAG_REPORTS_OBSS                        = BIT(18),
        WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD        = BIT(19),
        WIPHY_FLAG_OFFCHAN_TX                        = BIT(20),
        WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL        = BIT(21),
        WIPHY_FLAG_SUPPORTS_5_10_MHZ                = BIT(22),
        WIPHY_FLAG_HAS_CHANNEL_SWITCH                = BIT(23),
        WIPHY_FLAG_NOTIFY_REGDOM_BY_DRIVER        = BIT(24),
        WIPHY_FLAG_CHANNEL_CHANGE_ON_BEACON     = BIT(25),
};

/**
 * struct ieee80211_iface_limit - limit on certain interface types
 * @max: maximum number of interfaces of these types
 * @types: bitmask of the interface types this limit covers, one bit per
 *        type, e.g. BIT(NL80211_IFTYPE_AP) | BIT(NL80211_IFTYPE_P2P_GO) as in
 *        the examples given for &struct ieee80211_iface_combination
 */
struct ieee80211_iface_limit {
        u16 max;
        u16 types;
};

/**
 * struct ieee80211_iface_combination - possible interface combination
 *
 * With this structure the driver can describe which interface
 * combinations it supports concurrently.
 *
 * Examples:
 *
 * 1. Allow #STA <= 1, #AP <= 1, matching BI, channels = 1, 2 total:
 *
 *    .. code-block:: c
 *
 *        struct ieee80211_iface_limit limits1[] = {
 *                { .max = 1, .types = BIT(NL80211_IFTYPE_STATION), },
 *                { .max = 1, .types = BIT(NL80211_IFTYPE_AP), },
 *        };
 *        struct ieee80211_iface_combination combination1 = {
 *                .limits = limits1,
 *                .n_limits = ARRAY_SIZE(limits1),
 *                .max_interfaces = 2,
 *                .num_different_channels = 1,
 *                .beacon_int_infra_match = true,
 *        };
 *
 *
 * 2. Allow #{AP, P2P-GO} <= 8, channels = 1, 8 total:
 *
 *    .. code-block:: c
 *
 *        struct ieee80211_iface_limit limits2[] = {
 *                { .max = 8, .types = BIT(NL80211_IFTYPE_AP) |
 *                                     BIT(NL80211_IFTYPE_P2P_GO), },
 *        };
 *        struct ieee80211_iface_combination combination2 = {
 *                .limits = limits2,
 *                .n_limits = ARRAY_SIZE(limits2),
 *                .max_interfaces = 8,
 *                .num_different_channels = 1,
 *        };
 *
 *
 * 3. Allow #STA <= 1, #{P2P-client,P2P-GO} <= 3 on two channels, 4 total.
 *
 *    This allows for an infrastructure connection and three P2P connections.
 *
 *    .. code-block:: c
 *
 *        struct ieee80211_iface_limit limits3[] = {
 *                { .max = 1, .types = BIT(NL80211_IFTYPE_STATION), },
 *                { .max = 3, .types = BIT(NL80211_IFTYPE_P2P_GO) |
 *                                     BIT(NL80211_IFTYPE_P2P_CLIENT), },
 *        };
 *        struct ieee80211_iface_combination combination3 = {
 *                .limits = limits3,
 *                .n_limits = ARRAY_SIZE(limits3),
 *                .max_interfaces = 4,
 *                .num_different_channels = 2,
 *        };
 *
 */
struct ieee80211_iface_combination {
        /**
         * @limits:
         * limits for the given interface types; an array with @n_limits
         * entries
         */
        const struct ieee80211_iface_limit *limits;

        /**
         * @num_different_channels:
         * can use up to this many different channels
         */
        u32 num_different_channels;

        /**
         * @max_interfaces:
         * maximum number of interfaces in total allowed in this group
         */
        u16 max_interfaces;

        /**
         * @n_limits:
         * number of limitations, i.e. number of entries in @limits
         */
        u8 n_limits;

        /**
         * @beacon_int_infra_match:
         * In this combination, the beacon intervals between infrastructure
         * and AP types must match. This is required only in special cases.
         */
        bool beacon_int_infra_match;

        /**
         * @radar_detect_widths:
         * bitmap of channel widths supported for radar detection
         */
        u8 radar_detect_widths;

        /**
         * @radar_detect_regions:
         * bitmap of regions supported for radar detection
         */
        u8 radar_detect_regions;

        /**
         * @beacon_int_min_gcd:
         * This interface combination supports different beacon intervals.
         *
         * = 0
         *   all beacon intervals for the different interfaces must be the
         *   same.
         * > 0
         *   any beacon interval for the interface part of this combination AND
         *   GCD of all beacon intervals from beaconing interfaces of this
         *   combination must be greater than or equal to this value.
         */
        u32 beacon_int_min_gcd;
};

/**
 * struct ieee80211_txrx_stypes - pair of 16-bit TX and RX values
 * @tx: value for the transmit direction
 * @rx: value for the receive direction
 *
 * NOTE(review): going by the name, both members are presumably bitmaps of
 * frame subtypes (one bit per subtype) - confirm against the users of this
 * structure.
 */
struct ieee80211_txrx_stypes {
        u16 tx;
        u16 rx;
};

/**
 * enum wiphy_wowlan_support_flags - WoWLAN support flags
 * @WIPHY_WOWLAN_ANY: supports wakeup for the special "any"
 *        trigger that keeps the device operating as-is and
 *        wakes up the host on any activity, for example a
 *        received packet that passed filtering; note that the
 *        packet should be preserved in that case
 * @WIPHY_WOWLAN_MAGIC_PKT: supports wakeup on magic packet
 *        (see nl80211.h)
 * @WIPHY_WOWLAN_DISCONNECT: supports wakeup on disconnect
 * @WIPHY_WOWLAN_SUPPORTS_GTK_REKEY: supports GTK rekeying while asleep
 * @WIPHY_WOWLAN_GTK_REKEY_FAILURE: supports wakeup on GTK rekey failure
 * @WIPHY_WOWLAN_EAP_IDENTITY_REQ: supports wakeup on EAP identity request
 * @WIPHY_WOWLAN_4WAY_HANDSHAKE: supports wakeup on 4-way handshake failure
 * @WIPHY_WOWLAN_RFKILL_RELEASE: supports wakeup on RF-kill release
 * @WIPHY_WOWLAN_NET_DETECT: supports wakeup on network detection
 *
 * Each flag is a distinct single-bit value (BIT(0) through BIT(8)).
 * &struct wiphy_wowlan_support.@flags refers to this enum.
 */
enum wiphy_wowlan_support_flags {
        WIPHY_WOWLAN_ANY                = BIT(0),
        WIPHY_WOWLAN_MAGIC_PKT                = BIT(1),
        WIPHY_WOWLAN_DISCONNECT                = BIT(2),
        WIPHY_WOWLAN_SUPPORTS_GTK_REKEY        = BIT(3),
        WIPHY_WOWLAN_GTK_REKEY_FAILURE        = BIT(4),
        WIPHY_WOWLAN_EAP_IDENTITY_REQ        = BIT(5),
        WIPHY_WOWLAN_4WAY_HANDSHAKE        = BIT(6),
        WIPHY_WOWLAN_RFKILL_RELEASE        = BIT(7),
        WIPHY_WOWLAN_NET_DETECT                = BIT(8),
};

/**
 * struct wiphy_wowlan_tcp_support - WoWLAN TCP wakeup support data
 * @tok: pointer to a &struct nl80211_wowlan_tcp_data_token_feature
 *        NOTE(review): presumably describes the supported payload token
 *        feature and may be %NULL if unsupported - confirm
 * @data_payload_max: NOTE(review): presumably the maximum length of the data
 *        payload - confirm units against nl80211.h
 * @data_interval_max: NOTE(review): presumably the maximum interval between
 *        data packets - confirm units against nl80211.h
 * @wake_payload_max: NOTE(review): presumably the maximum length of the
 *        wakeup payload - confirm units against nl80211.h
 * @seq: NOTE(review): presumably whether sequence-number handling is
 *        supported - confirm
 *
 * Referenced from &struct wiphy_wowlan_support.@tcp.
 */
struct wiphy_wowlan_tcp_support {
        const struct nl80211_wowlan_tcp_data_token_feature *tok;
        u32 data_payload_max;
        u32 data_interval_max;
        u32 wake_payload_max;
        bool seq;
};

/**
 * struct wiphy_wowlan_support - WoWLAN support data
 * @flags: supported features, see &enum wiphy_wowlan_support_flags (each
 *        enumerator is a single bit)
 * @n_patterns: number of supported wakeup patterns
 *        (see nl80211.h for the pattern definition)
 * @pattern_max_len: maximum length of each pattern
 * @pattern_min_len: minimum length of each pattern
 * @max_pkt_offset: maximum Rx packet offset
 * @max_nd_match_sets: maximum number of matchsets for net-detect,
 *        similar, but not necessarily identical, to max_match_sets for
 *        scheduled scans.
 *        See &struct cfg80211_sched_scan_request.@match_sets for more
 *        details.
 * @tcp: TCP wakeup support information, see
 *        &struct wiphy_wowlan_tcp_support
 */
struct wiphy_wowlan_support {
        u32 flags;
        int n_patterns;
        int pattern_max_len;
        int pattern_min_len;
        int max_pkt_offset;
        int max_nd_match_sets;
        const struct wiphy_wowlan_tcp_support *tcp;
};

/**
 * struct wiphy_coalesce_support - coalesce support data
 * @n_rules: maximum number of coalesce rules
 * @max_delay: maximum supported coalescing delay in msecs
 * @n_patterns: number of supported patterns in a rule
 *        (see nl80211.h for the pattern definition)
 * @pattern_max_len: maximum length of each pattern
 * @pattern_min_len: minimum length of each pattern
 * @max_pkt_offset: maximum Rx packet offset
 *
 * @pattern_max_len, @pattern_min_len and @max_pkt_offset are documented the
 * same way as the members of the same name in &struct wiphy_wowlan_support.
 */
struct wiphy_coalesce_support {
        int n_rules;
        int max_delay;
        int n_patterns;
        int pattern_max_len;
        int pattern_min_len;
        int max_pkt_offset;
};

/**
 * enum wiphy_vendor_command_flags - validation flags for vendor commands
 * @WIPHY_VENDOR_CMD_NEED_WDEV: vendor command requires wdev
 * @WIPHY_VENDOR_CMD_NEED_NETDEV: vendor command requires netdev
 * @WIPHY_VENDOR_CMD_NEED_RUNNING: interface/wdev must be up & running
 *        (must be combined with %WIPHY_VENDOR_CMD_NEED_WDEV or
 *        %WIPHY_VENDOR_CMD_NEED_NETDEV)
 *
 * These values are used in &struct wiphy_vendor_command.@flags.
 */
enum wiphy_vendor_command_flags {
        WIPHY_VENDOR_CMD_NEED_WDEV = BIT(0),
        WIPHY_VENDOR_CMD_NEED_NETDEV = BIT(1),
        WIPHY_VENDOR_CMD_NEED_RUNNING = BIT(2),
};

/**
 * enum wiphy_opmode_flag - Station's ht/vht operation mode information flags
 *
 * @STA_OPMODE_MAX_BW_CHANGED: Max Bandwidth changed
 * @STA_OPMODE_SMPS_MODE_CHANGED: SMPS mode changed
 * @STA_OPMODE_N_SS_CHANGED: max N_SS (number of spatial streams) changed
 *
 * Each flag is a distinct single-bit value. These values are used in
 * &struct sta_opmode_info.@changed.
 */
enum wiphy_opmode_flag {
        STA_OPMODE_MAX_BW_CHANGED        = BIT(0),
        STA_OPMODE_SMPS_MODE_CHANGED        = BIT(1),
        STA_OPMODE_N_SS_CHANGED                = BIT(2),
};

/**
 * struct sta_opmode_info - Station's ht/vht operation mode information
 * @changed: contains value(s) from &enum wiphy_opmode_flag; each enumerator
 *        is a single bit, so several can be combined
 * @smps_mode: New SMPS mode value from &enum nl80211_smps_mode of a station
 * @bw: new max bandwidth value from &enum nl80211_chan_width of a station
 * @rx_nss: new rx_nss value of a station
 */
struct sta_opmode_info {
        u32 changed;
        enum nl80211_smps_mode smps_mode;
        enum nl80211_chan_width bw;
        u8 rx_nss;
};

/*
 * Sentinel for &struct wiphy_vendor_command.@policy: the errno value -ENODATA
 * cast to a policy pointer, i.e. not the address of a real policy table. Per
 * the @policy documentation it is used when no policy can be given and the
 * attribute is just raw data (e.g. a firmware command).
 */
#define VENDOR_CMD_RAW_DATA ((const struct nla_policy *)(long)(-ENODATA))

/**
 * struct wiphy_vendor_command - vendor command definition
 * @info: vendor command identifying information, as used in nl80211
 * @flags: flags, see &enum wiphy_vendor_command_flags
 * @doit: callback for the operation, note that wdev is %NULL if the
 *        flags didn't ask for a wdev and non-%NULL otherwise; the data
 *        pointer may be %NULL if userspace provided no data at all
 * @dumpit: dump callback, for transferring bigger/multiple items. The
 *        @storage points to cb->args[5], ie. is preserved over the multiple
 *        dumpit calls.
 * @policy: policy pointer for attributes within %NL80211_ATTR_VENDOR_DATA.
 *        Set this to %VENDOR_CMD_RAW_DATA if no policy can be given and the
 *        attribute is just raw data (e.g. a firmware command).
 * @maxattr: highest attribute number in policy
 *
 * It's recommended to not have the same sub command with both @doit and
 * @dumpit, so that userspace can assume certain ones are get and others
 * are used with dump requests.
 */
struct wiphy_vendor_command {
        struct nl80211_vendor_cmd_info info;
        u32 flags;
        int (*doit)(struct wiphy *wiphy, struct wireless_dev *wdev,
                    const void *data, int data_len);
        int (*dumpit)(struct wiphy *wiphy, struct wireless_dev *wdev,
                      struct sk_buff *skb, const void *data, int data_len,
                      unsigned long *storage);
        const struct nla_policy *policy;
        unsigned int maxattr;
};

/**
 * struct wiphy_iftype_ext_capab - extended capabilities per interface type
 * @iftype: interface type (&enum nl80211_iftype) this entry applies to
 * @extended_capabilities: extended capabilities supported by the driver,
 *        additional capabilities might be supported by userspace; these are the
 *        802.11 extended capabilities ("Extended Capabilities element") and are
 *        in the same format as in the information element. See IEEE Std
 *        802.11-2012 8.4.2.29 for the defined fields.
 * @extended_capabilities_mask: mask of the valid values
 * @extended_capabilities_len: length of the extended capabilities
 *        NOTE(review): presumably counted in bytes and shared by
 *        @extended_capabilities_mask - confirm
 * @eml_capabilities: EML capabilities (for MLO)
 * @mld_capa_and_ops: MLD capabilities and operations (for MLO)
 */
struct wiphy_iftype_ext_capab {
        enum nl80211_iftype iftype;
        const u8 *extended_capabilities;
        const u8 *extended_capabilities_mask;
        u8 extended_capabilities_len;
        u16 eml_capabilities;
        u16 mld_capa_and_ops;
};

/**
 * cfg80211_get_iftype_ext_capa - lookup interface type extended capability
 * @wiphy: the wiphy to look up from
 * @type: the interface type to look up
 *
 * Return: Pointer to the &struct wiphy_iftype_ext_capab for the given
 * interface @type; may be %NULL, so callers must check the result before
 * dereferencing it.
 */
const struct wiphy_iftype_ext_capab *
cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type);

/**
 * struct cfg80211_pmsr_capabilities - cfg80211 peer measurement capabilities
 * @max_peers: maximum number of peers in a single measurement
 * @report_ap_tsf: can report assoc AP's TSF for radio resource measurement
 * @randomize_mac_addr: can randomize MAC address for measurement
 * @ftm: FTM measurement data
 * @ftm.supported: FTM measurement is supported
 * @ftm.asap: ASAP-mode is supported
 * @ftm.non_asap: non-ASAP-mode is supported
 * @ftm.request_lci: can request LCI data
 * @ftm.request_civicloc: can request civic location data
 * @ftm.preambles: bitmap of preambles supported (&enum nl80211_preamble)
 * @ftm.bandwidths: bitmap of bandwidths supported (&enum nl80211_chan_width)
 * @ftm.max_bursts_exponent: maximum burst exponent supported
 *        (set to -1 if not limited; note that setting this will necessarily
 *        forbid using the value 15 to let the responder pick)
 * @ftm.max_ftms_per_burst: maximum FTMs per burst supported (set to 0 if
 *        not limited)
 * @ftm.trigger_based: trigger based ranging measurement is supported
 * @ftm.non_trigger_based: non trigger based ranging measurement is supported
 *
 * The yes/no capabilities (@report_ap_tsf, @randomize_mac_addr and the
 * @ftm.supported ... @ftm.non_trigger_based group) are stored as single-bit
 * bitfields.
 */
struct cfg80211_pmsr_capabilities {
        unsigned int max_peers;
        u8 report_ap_tsf:1,
           randomize_mac_addr:1;

        struct {
                u32 preambles;
                u32 bandwidths;
                s8 max_bursts_exponent;
                u8 max_ftms_per_burst;
                u8 supported:1,
                   asap:1,
                   non_asap:1,
                   request_lci:1,
                   request_civicloc:1,
                   trigger_based:1,
                   non_trigger_based:1;
        } ftm;
};

/**
 * struct wiphy_iftype_akm_suites - This structure encapsulates supported akm
 * suites for interface types defined in @iftypes_mask. Each type in the
 * @iftypes_mask must be unique across all instances of iftype_akm_suites.
 *
 * @iftypes_mask: bitmask of interface types
 * @akm_suites: points to an array of supported akm suites
 * @n_akm_suites: number of supported AKM suites, i.e. number of entries in
 *        the array @akm_suites points to
 */
struct wiphy_iftype_akm_suites {
        u16 iftypes_mask;
        const u32 *akm_suites;
        int n_akm_suites;
};

/*
 * Special value for &wiphy.hw_timestamp_max_peers: the driver supports
 * enabling HW timestamping for all peers (no need to specify a MAC address).
 */
#define CFG80211_HW_TIMESTAMP_ALL_PEERS        0xffff

/**
 * struct wiphy - wireless hardware description
 * @mtx: mutex for the data (structures) of this device
 * @reg_notifier: the driver's regulatory notification callback,
 *        note that if your driver uses wiphy_apply_custom_regulatory()
 *        the reg_notifier's request can be passed as NULL
 * @regd: the driver's regulatory domain, if one was requested via
 *        the regulatory_hint() API. This can be used by the driver
 *        on the reg_notifier() if it chooses to ignore future
 *        regulatory domain changes caused by other drivers.
 * @signal_type: signal type reported in &struct cfg80211_bss.
 * @cipher_suites: supported cipher suites
 * @n_cipher_suites: number of supported cipher suites
 * @akm_suites: supported AKM suites. These are the default AKMs supported if
 *        the supported AKMs are not advertised for a specific interface type
 *        in iftype_akm_suites.
 * @n_akm_suites: number of supported AKM suites
 * @iftype_akm_suites: array of supported akm suites info per interface type.
 *        Note that the bits in @iftypes_mask inside this structure cannot
 *        overlap (i.e. only one occurrence of each type is allowed across all
 *        instances of iftype_akm_suites).
 * @num_iftype_akm_suites: number of interface types for which supported akm
 *        suites are specified separately.
 * @retry_short: Retry limit for short frames (dot11ShortRetryLimit)
 * @retry_long: Retry limit for long frames (dot11LongRetryLimit)
 * @frag_threshold: Fragmentation threshold (dot11FragmentationThreshold);
 *        -1 = fragmentation disabled, only odd values >= 256 used
 * @rts_threshold: RTS threshold (dot11RTSThreshold); -1 = RTS/CTS disabled
 * @_net: the network namespace this wiphy currently lives in
 * @perm_addr: permanent MAC address of this device
 * @addr_mask: If the device supports multiple MAC addresses by masking,
 *        set this to a mask with variable bits set to 1, e.g. if the last
 *        four bits are variable then set it to 00-00-00-00-00-0f. The actual
 *        variable bits shall be determined by the interfaces added, with
 *        interfaces not matching the mask being rejected to be brought up.
 * @n_addresses: number of addresses in @addresses.
 * @addresses: If the device has more than one address, set this pointer
 *        to a list of addresses (6 bytes each). The first one will be used
 *        by default for perm_addr. In this case, the mask should be set to
 *        all-zeroes. In this case it is assumed that the device can handle
 *        the same number of arbitrary MAC addresses.
 * @registered: protects ->resume and ->suspend sysfs callbacks against
 *        the hardware being unregistered
 * @debugfsdir: debugfs directory used for this wiphy (ieee80211/<wiphyname>).
 *        It will be renamed automatically on wiphy renames
 * @dev: (virtual) struct device for this wiphy. The item in
 *        /sys/class/ieee80211/ points to this. You need to use
 *        set_wiphy_dev() (see below).
 * @wext: wireless extension handlers
 * @priv: driver private data (sized according to wiphy_new() parameter)
 * @interface_modes: bitmask of interfaces types valid for this wiphy,
 *        must be set by driver
 * @iface_combinations: Valid interface combinations array, should not
 *        list single interface types.
 * @n_iface_combinations: number of entries in @iface_combinations array.
 * @software_iftypes: bitmask of software interface types, these are not
 *        subject to any restrictions since they are purely managed in SW.
 * @flags: wiphy flags, see &enum wiphy_flags
 * @regulatory_flags: wiphy regulatory flags, see
 *        &enum ieee80211_regulatory_flags
 * @features: features advertised to nl80211, see &enum nl80211_feature_flags.
 * @ext_features: extended features advertised to nl80211, see
 *        &enum nl80211_ext_feature_index.
 * @bss_priv_size: each BSS struct has private data allocated with it,
 *        this variable determines its size
 * @max_scan_ssids: maximum number of SSIDs the device can scan for in
 *        any given scan
 * @max_sched_scan_reqs: maximum number of scheduled scan requests that
 *        the device can run concurrently.
 * @max_sched_scan_ssids: maximum number of SSIDs the device can scan
 *        for in any given scheduled scan
 * @max_match_sets: maximum number of match sets the device can handle
 *        when performing a scheduled scan, 0 if filtering is not
 *        supported.
 * @max_scan_ie_len: maximum length of user-controlled IEs device can
 *        add to probe request frames transmitted during a scan, must not
 *        include fixed IEs like supported rates
 * @max_sched_scan_ie_len: same as max_scan_ie_len, but for scheduled
 *        scans
 * @max_sched_scan_plans: maximum number of scan plans (scan interval and number
 *        of iterations) for scheduled scan supported by the device.
 * @max_sched_scan_plan_interval: maximum interval (in seconds) for a
 *        single scan plan supported by the device.
 * @max_sched_scan_plan_iterations: maximum number of iterations for a single
 *        scan plan supported by the device.
 * @coverage_class: current coverage class
 * @fw_version: firmware version for ethtool reporting
 * @hw_version: hardware version for ethtool reporting
 * @max_num_pmkids: maximum number of PMKIDs supported by device
 * @privid: a pointer that drivers can use to identify if an arbitrary
 *        wiphy is theirs, e.g. in global notifiers
 * @bands: information about bands/channels supported by this device
 *
 * @mgmt_stypes: bitmasks of frame subtypes that can be subscribed to or
 *        transmitted through nl80211, points to an array indexed by interface
 *        type
 *
 * @available_antennas_tx: bitmap of antennas which are available to be
 *        configured as TX antennas. Antenna configuration commands will be
 *        rejected unless this or @available_antennas_rx is set.
 *
 * @available_antennas_rx: bitmap of antennas which are available to be
 *        configured as RX antennas. Antenna configuration commands will be
 *        rejected unless this or @available_antennas_tx is set.
 *
 * @probe_resp_offload:
 *         Bitmap of supported protocols for probe response offloading.
 *         See &enum nl80211_probe_resp_offload_support_attr. Only valid
 *         when the wiphy flag @WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD is set.
 *
 * @max_remain_on_channel_duration: Maximum time a remain-on-channel operation
 *        may request, if implemented.
 *
 * @wowlan: WoWLAN support information
 * @wowlan_config: current WoWLAN configuration; this should usually not be
 *        used since access to it is necessarily racy, use the parameter passed
 *        to the suspend() operation instead.
 *
 * @ap_sme_capa: AP SME capabilities, flags from &enum nl80211_ap_sme_features.
 * @ht_capa_mod_mask:  Specify what ht_cap values can be over-ridden.
 *        If null, then none can be over-ridden.
 * @vht_capa_mod_mask:  Specify what VHT capabilities can be over-ridden.
 *        If null, then none can be over-ridden.
 *
 * @wdev_list: the list of associated (virtual) interfaces; this list must
 *        not be modified by the driver, but can be read with RTNL/RCU protection.
 *
 * @max_acl_mac_addrs: Maximum number of MAC addresses that the device
 *        supports for ACL.
 *
 * @extended_capabilities: extended capabilities supported by the driver,
 *        additional capabilities might be supported by userspace; these are
 *        the 802.11 extended capabilities ("Extended Capabilities element")
 *        and are in the same format as in the information element. See
 *        802.11-2012 8.4.2.29 for the defined fields. These are the default
 *        extended capabilities to be used if the capabilities are not specified
 *        for a specific interface type in iftype_ext_capab.
 * @extended_capabilities_mask: mask of the valid values
 * @extended_capabilities_len: length of the extended capabilities
 * @iftype_ext_capab: array of extended capabilities per interface type
 * @num_iftype_ext_capab: number of interface types for which extended
 *        capabilities are specified separately.
 * @coalesce: packet coalescing support information
 *
 * @vendor_commands: array of vendor commands supported by the hardware
 * @n_vendor_commands: number of vendor commands
 * @vendor_events: array of vendor events supported by the hardware
 * @n_vendor_events: number of vendor events
 *
 * @max_ap_assoc_sta: maximum number of associated stations supported in AP mode
 *        (including P2P GO) or 0 to indicate no such limit is advertised. The
 *        driver is allowed to advertise a theoretical limit that it can reach in
 *        some cases, but may not always reach.
 *
 * @max_num_csa_counters: Number of supported csa_counters in beacons
 *        and probe responses.  This value should be set if the driver
 *        wishes to limit the number of csa counters. Default (0) means
 *        infinite.
 * @bss_select_support: bitmask indicating the BSS selection criteria supported
 *        by the driver in the .connect() callback. The bit position maps to the
 *        attribute indices defined in &enum nl80211_bss_select_attr.
 *
 * @nan_supported_bands: bands supported by the device in NAN mode, a
 *        bitmap of &enum nl80211_band values.  For instance, for
 *        NL80211_BAND_2GHZ, bit 0 would be set
 *        (i.e. BIT(NL80211_BAND_2GHZ)).
 *
 * @txq_limit: configuration of internal TX queue frame limit
 * @txq_memory_limit: configuration of internal TX queue memory limit
 * @txq_quantum: configuration of internal TX queue scheduler quantum
 *
 * @tx_queue_len: allow setting transmit queue len for drivers not using
 *        wake_tx_queue
 *
 * @support_mbssid: can HW support association with nontransmitted AP
 * @support_only_he_mbssid: don't parse MBSSID elements if it is not
 *        HE AP, in order to avoid compatibility issues.
 *        @support_mbssid must be set for this to have any effect.
 *
 * @pmsr_capa: peer measurement capabilities
 *
 * @tid_config_support: describes the per-TID config support that the
 *        device has
 * @tid_config_support.vif: bitmap of attributes (configurations)
 *        supported by the driver for each vif
 * @tid_config_support.peer: bitmap of attributes (configurations)
 *        supported by the driver for each peer
 * @tid_config_support.max_retry: maximum supported retry count for
 *        long/short retry configuration
 *
 * @max_data_retry_count: maximum supported per TID retry count for
 *        configuration through the %NL80211_TID_CONFIG_ATTR_RETRY_SHORT and
 *        %NL80211_TID_CONFIG_ATTR_RETRY_LONG attributes
 * @sar_capa: SAR control capabilities
 * @rfkill: a pointer to the rfkill structure
 *
 * @mbssid_max_interfaces: maximum number of interfaces supported by the driver
 *        in a multiple BSSID set. This field must be set to a non-zero value
 *        by the driver to advertise MBSSID support.
 * @ema_max_profile_periodicity: maximum profile periodicity supported by
 *        the driver. Setting this field to a non-zero value indicates that the
 *        driver supports enhanced multi-BSSID advertisements (EMA AP).
 * @max_num_akm_suites: maximum number of AKM suites allowed for
 *        configuration through %NL80211_CMD_CONNECT, %NL80211_CMD_ASSOCIATE and
 *        %NL80211_CMD_START_AP. Set to NL80211_MAX_NR_AKM_SUITES if not set by
 *        driver. If set by driver minimum allowed value is
 *        NL80211_MAX_NR_AKM_SUITES in order to avoid compatibility issues with
 *        legacy userspace and maximum allowed value is
 *        CFG80211_MAX_NUM_AKM_SUITES.
 *
 * @hw_timestamp_max_peers: maximum number of peers that the driver supports
 *        enabling HW timestamping for concurrently. Setting this field to a
 *        non-zero value indicates that the driver supports HW timestamping.
 *        A value of %CFG80211_HW_TIMESTAMP_ALL_PEERS indicates the driver
 *        supports enabling HW timestamping for all peers (i.e. no need to
 *        specify a mac address).
 */
struct wiphy {
        /* the wiphy mutex; taken/released by wiphy_lock()/wiphy_unlock() */
        struct mutex mtx;

        /* assign these fields before you register the wiphy */

        u8 perm_addr[ETH_ALEN];
        u8 addr_mask[ETH_ALEN];

        struct mac_address *addresses;

        const struct ieee80211_txrx_stypes *mgmt_stypes;

        const struct ieee80211_iface_combination *iface_combinations;
        int n_iface_combinations;
        u16 software_iftypes;

        u16 n_addresses;

        /* Supported interface modes, OR together BIT(NL80211_IFTYPE_...) */
        u16 interface_modes;

        u16 max_acl_mac_addrs;

        u32 flags, regulatory_flags, features;
        /* bitmap sized for NUM_NL80211_EXT_FEATURES bits, rounded up to bytes */
        u8 ext_features[DIV_ROUND_UP(NUM_NL80211_EXT_FEATURES, 8)];

        u32 ap_sme_capa;

        enum cfg80211_signal_type signal_type;

        int bss_priv_size;
        u8 max_scan_ssids;
        u8 max_sched_scan_reqs;
        u8 max_sched_scan_ssids;
        u8 max_match_sets;
        u16 max_scan_ie_len;
        u16 max_sched_scan_ie_len;
        u32 max_sched_scan_plans;
        u32 max_sched_scan_plan_interval;
        u32 max_sched_scan_plan_iterations;

        int n_cipher_suites;
        const u32 *cipher_suites;

        int n_akm_suites;
        const u32 *akm_suites;

        const struct wiphy_iftype_akm_suites *iftype_akm_suites;
        unsigned int num_iftype_akm_suites;

        u8 retry_short;
        u8 retry_long;
        /*
         * The "-1 = disabled" value documented for the two thresholds is
         * stored in unsigned fields, i.e. it reads back as (u32)-1.
         */
        u32 frag_threshold;
        u32 rts_threshold;
        u8 coverage_class;

        char fw_version[ETHTOOL_FWVERS_LEN];
        u32 hw_version;

        /* the WoWLAN members only exist when CONFIG_PM is enabled */
#ifdef CONFIG_PM
        const struct wiphy_wowlan_support *wowlan;
        struct cfg80211_wowlan *wowlan_config;
#endif

        u16 max_remain_on_channel_duration;

        u8 max_num_pmkids;

        u32 available_antennas_tx;
        u32 available_antennas_rx;

        u32 probe_resp_offload;

        const u8 *extended_capabilities, *extended_capabilities_mask;
        u8 extended_capabilities_len;

        const struct wiphy_iftype_ext_capab *iftype_ext_capab;
        unsigned int num_iftype_ext_capab;

        const void *privid;

        /* one slot per band; presumably indexed by &enum nl80211_band - TODO confirm */
        struct ieee80211_supported_band *bands[NUM_NL80211_BANDS];

        void (*reg_notifier)(struct wiphy *wiphy,
                             struct regulatory_request *request);

        /* fields below are read-only, assigned by cfg80211 */

        const struct ieee80211_regdomain __rcu *regd;

        /* set_wiphy_dev()/wiphy_dev() access dev.parent, wiphy_name() its name */
        struct device dev;

        bool registered;

        struct dentry *debugfsdir;

        const struct ieee80211_ht_cap *ht_capa_mod_mask;
        const struct ieee80211_vht_cap *vht_capa_mod_mask;

        struct list_head wdev_list;

        /* accessed through wiphy_net()/wiphy_net_set() */
        possible_net_t _net;

        /* the wext handlers only exist when CONFIG_CFG80211_WEXT is enabled */
#ifdef CONFIG_CFG80211_WEXT
        const struct iw_handler_def *wext;
#endif

        const struct wiphy_coalesce_support *coalesce;

        const struct wiphy_vendor_command *vendor_commands;
        const struct nl80211_vendor_cmd_info *vendor_events;
        int n_vendor_commands, n_vendor_events;

        u16 max_ap_assoc_sta;

        u8 max_num_csa_counters;

        u32 bss_select_support;

        u8 nan_supported_bands;

        u32 txq_limit;
        u32 txq_memory_limit;
        u32 txq_quantum;

        unsigned long tx_queue_len;

        u8 support_mbssid:1,
           support_only_he_mbssid:1;

        const struct cfg80211_pmsr_capabilities *pmsr_capa;

        struct {
                u64 peer, vif;
                u8 max_retry;
        } tid_config_support;

        u8 max_data_retry_count;

        const struct cfg80211_sar_capa *sar_capa;

        struct rfkill *rfkill;

        u8 mbssid_max_interfaces;
        u8 ema_max_profile_periodicity;
        u16 max_num_akm_suites;

        /* CFG80211_HW_TIMESTAMP_ALL_PEERS here means "all peers" */
        u16 hw_timestamp_max_peers;

        /*
         * Driver private area, returned by wiphy_priv(). This is a flexible
         * array member and therefore has to stay the last member.
         */
        char priv[] __aligned(NETDEV_ALIGN);
};

/**
 * wiphy_net - get the network namespace of a wiphy
 *
 * @wiphy: The wiphy whose network namespace to look up
 * Return: The result of read_pnet() on the @_net member of @wiphy.
 */
static inline struct net *wiphy_net(struct wiphy *wiphy)
{
        return read_pnet(&wiphy->_net);
}

/**
 * wiphy_net_set - set the network namespace of a wiphy
 *
 * @wiphy: The wiphy whose @_net member to update
 * @net: The network namespace to store via write_pnet()
 */
static inline void wiphy_net_set(struct wiphy *wiphy, struct net *net)
{
        write_pnet(&wiphy->_net, net);
}

/**
 * wiphy_priv - access the driver private area of a wiphy
 *
 * @wiphy: the wiphy whose private area is wanted; a NULL pointer
 *        triggers BUG_ON()
 * Return: The address of the @priv member of @wiphy.
 */
static inline void *wiphy_priv(struct wiphy *wiphy)
{
        void *area;

        BUG_ON(!wiphy);
        area = &wiphy->priv;

        return area;
}

/**
 * priv_to_wiphy - map a private area pointer back to its wiphy
 *
 * @priv: a pointer previously returned by wiphy_priv(); a NULL pointer
 *        triggers BUG_ON()
 * Return: The wiphy whose @priv member @priv points to.
 */
static inline struct wiphy *priv_to_wiphy(void *priv)
{
        struct wiphy *owner;

        BUG_ON(!priv);
        owner = container_of(priv, struct wiphy, priv);

        return owner;
}

/**
 * set_wiphy_dev - set device pointer for wiphy
 *
 * @wiphy: The wiphy whose device to bind
 * @dev: The device to parent it to
 */
static inline void set_wiphy_dev(struct wiphy *wiphy, struct device *dev)
{
        wiphy->dev.parent = dev;
}

/**
 * wiphy_dev - get the parent device of a wiphy
 *
 * @wiphy: The wiphy whose device struct to look up
 * Return: The parent of the wiphy's embedded device, i.e. the pointer
 * stored by set_wiphy_dev().
 */
static inline struct device *wiphy_dev(struct wiphy *wiphy)
{
        struct device *parent = wiphy->dev.parent;

        return parent;
}

/**
 * wiphy_name - get the name of a wiphy
 *
 * @wiphy: The wiphy whose name to return
 * Return: The dev_name() of the device embedded in @wiphy.
 */
static inline const char *wiphy_name(const struct wiphy *wiphy)
{
        const char *name = dev_name(&wiphy->dev);

        return name;
}

/**
 * wiphy_new_nm - create a new wiphy for use with cfg80211
 *
 * @ops: The configuration operations for this device
 * @sizeof_priv: The size of the private area to allocate
 * @requested_name: Request a particular name.
 *        NULL is valid value, and means use the default phy%d naming.
 *
 * Create a new wiphy and associate the given operations with it.
 * @sizeof_priv bytes are allocated for private use.
 *
 * wiphy_new() is a convenience wrapper that passes %NULL for
 * @requested_name.
 *
 * Return: A pointer to the new wiphy. This pointer must be assigned to
 * the wiphy member of the &struct wireless_dev that each netdev's
 * ieee80211_ptr refers to, for proper operation.
 */
struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv,
                           const char *requested_name);

/**
 * wiphy_new - create a new wiphy with the default name
 *
 * @ops: The configuration operations for this device
 * @sizeof_priv: The size of the private area to allocate
 *
 * Shorthand for wiphy_new_nm() with no requested name: creates a new
 * wiphy, associates the given operations with it and has @sizeof_priv
 * bytes allocated for private use.
 *
 * Return: A pointer to the new wiphy. This pointer must be assigned to
 * the wiphy member of the &struct wireless_dev that each netdev's
 * ieee80211_ptr refers to, for proper operation.
 */
static inline struct wiphy *wiphy_new(const struct cfg80211_ops *ops,
                                      int sizeof_priv)
{
        /* no particular name requested */
        const char *default_name = NULL;

        return wiphy_new_nm(ops, sizeof_priv, default_name);
}

/**
 * wiphy_register - register a wiphy with cfg80211
 *
 * @wiphy: The wiphy to register.
 *
 * Return: A non-negative wiphy index or a negative error code.
 */
int wiphy_register(struct wiphy *wiphy);

/*
 * lockdep_assert_wiphy - lockdep assertion that the wiphy mutex
 * (wiphy->mtx) is held.
 *
 * This is a define for better error reporting (file/line).
 */
#define lockdep_assert_wiphy(wiphy) lockdep_assert_held(&(wiphy)->mtx)

/**
 * rcu_dereference_wiphy - rcu_dereference with debug checking
 * @wiphy: the wiphy to check the locking on
 * @p: The pointer to read, prior to dereferencing
 *
 * Do an rcu_dereference(p), but check caller either holds rcu_read_lock()
 * or the wiphy mutex. Note: Please prefer wiphy_dereference() or
 * rcu_dereference().
 *
 * @wiphy is parenthesized in the expansion so that any pointer expression
 * (e.g. "base + 1" or "&array[i]") can be passed safely.
 */
#define rcu_dereference_wiphy(wiphy, p)                                \
        rcu_dereference_check(p, lockdep_is_held(&(wiphy)->mtx))

/**
 * wiphy_dereference - fetch RCU pointer when updates are prevented by wiphy mtx
 * @wiphy: the wiphy to check the locking on
 * @p: The pointer to read, prior to dereferencing
 *
 * Return the value of the specified RCU-protected pointer, but omit the
 * READ_ONCE(), because caller holds the wiphy mutex used for updates.
 *
 * @wiphy is parenthesized in the expansion so that any pointer expression
 * (e.g. "base + 1" or "&array[i]") can be passed safely.
 */
#define wiphy_dereference(wiphy, p)                                \
        rcu_dereference_protected(p, lockdep_is_held(&(wiphy)->mtx))

/**
 * get_wiphy_regdom - get custom regdomain for the given wiphy
 * @wiphy: the wiphy to get the regdomain from
 *
 * Context: Requires any of RTNL, wiphy mutex or RCU protection.
 *
 * Return: pointer to the regulatory domain associated with the wiphy
 */
const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy);

/**
 * wiphy_unregister - deregister a wiphy from cfg80211
 *
 * @wiphy: The wiphy to unregister.
 *
 * After this call, no more requests can be made with this priv
 * pointer, but the call may sleep to wait for an outstanding
 * request that is being handled.
 */
void wiphy_unregister(struct wiphy *wiphy);

/**
 * wiphy_free - free wiphy
 *
 * @wiphy: The wiphy to free
 */
void wiphy_free(struct wiphy *wiphy);

/* internal structs */
struct cfg80211_conn;
struct cfg80211_internal_bss;
struct cfg80211_cached_keys;
struct cfg80211_cqm_config;

/**
 * wiphy_lock - lock the wiphy
 * @wiphy: the wiphy to lock
 *
 * This is needed around registering and unregistering netdevs that
 * aren't created through cfg80211 calls, since that requires locking
 * in cfg80211 when the notifier is called, but that cannot
 * differentiate which way it's called.
 *
 * It can also be used by drivers for their own purposes.
 *
 * When cfg80211 ops are called, the wiphy is already locked.
 *
 * Note that this makes sure that no workers that have been queued
 * with wiphy_work_queue() are running.
 */
static inline void wiphy_lock(struct wiphy *wiphy)
        __acquires(&wiphy->mtx)
{
        mutex_lock(&wiphy->mtx);
        /* annotation for static lock-context checking (sparse) */
        __acquire(&wiphy->mtx);
}

/**
 * wiphy_unlock - unlock the wiphy again
 * @wiphy: the wiphy to unlock
 *
 * Counterpart of wiphy_lock(): unlocks the mutex @wiphy->mtx.
 */
static inline void wiphy_unlock(struct wiphy *wiphy)
        __releases(&wiphy->mtx)
{
        /* annotation for static lock-context checking (sparse) */
        __release(&wiphy->mtx);
        mutex_unlock(&wiphy->mtx);
}

struct wiphy_work;
/* type of the function stored in a wiphy_work: takes a wiphy and the work */
typedef void (*wiphy_work_func_t)(struct wiphy *, struct wiphy_work *);

/**
 * struct wiphy_work - work item for use with wiphy_work_queue()
 * @entry: list linkage, initialised by wiphy_work_init()
 * @func: the function stored by wiphy_work_init()
 */
struct wiphy_work {
        struct list_head entry;
        wiphy_work_func_t func;
};

/**
 * wiphy_work_init - initialise a wiphy work item
 * @work: the work item to initialise
 * @func: the function to store in @work
 *
 * Stores @func in the work item and initialises its list linkage.
 */
static inline void wiphy_work_init(struct wiphy_work *work,
                                   wiphy_work_func_t func)
{
        work->func = func;
        INIT_LIST_HEAD(&work->entry);
}

/**
 * wiphy_work_queue - queue work for the wiphy
 * @wiphy: the wiphy to queue for
 * @work: the work item
 *
 * This is useful for work that must be done asynchronously, and work
 * queued here has the special property that the wiphy mutex will be
 * held as if wiphy_lock() was called, and that it cannot be running
 * after wiphy_lock() was called. Therefore, wiphy_work_cancel() can
 * use just cancel_work() instead of cancel_work_sync(), it requires
 * being in a section protected by wiphy_lock().
 */
void wiphy_work_queue(struct wiphy *wiphy, struct wiphy_work *work);

/**
 * wiphy_work_cancel - cancel previously queued work
 * @wiphy: the wiphy, for debug purposes
 * @work: the work to cancel
 *
 * Cancel the work *without* waiting for it, this assumes being
 * called under the wiphy mutex acquired by wiphy_lock().
 */
void wiphy_work_cancel(struct wiphy *wiphy, struct wiphy_work *work);

/**
 * wiphy_work_flush - flush previously queued work
 * @wiphy: the wiphy, for debug purposes
 * @work: the work to flush, this can be %NULL to flush all work
 *
 * Flush the work (i.e. run it if pending). This must be called
 * under the wiphy mutex acquired by wiphy_lock().
 */
void wiphy_work_flush(struct wiphy *wiphy, struct wiphy_work *work);

/**
 * struct wiphy_delayed_work - wiphy work item combined with a timer
 * @work: the embedded work item, set up by wiphy_delayed_work_init()
 * @wiphy: wiphy pointer; not touched by wiphy_delayed_work_init()
 *        (presumably filled in when the work is queued - TODO confirm)
 * @timer: timer set up by wiphy_delayed_work_init() with
 *        wiphy_delayed_work_timer() as its callback
 */
struct wiphy_delayed_work {
        struct wiphy_work work;
        struct wiphy *wiphy;
        struct timer_list timer;
};

/*
 * Timer callback that wiphy_delayed_work_init() installs on the timer of a
 * struct wiphy_delayed_work.
 */
void wiphy_delayed_work_timer(struct timer_list *t);

/**
 * wiphy_delayed_work_init - initialise a delayed wiphy work item
 * @dwork: the delayed work item to initialise
 * @func: the function to store in the embedded work item
 *
 * Sets up the timer with wiphy_delayed_work_timer() as callback and
 * initialises the embedded work item via wiphy_work_init().
 */
static inline void wiphy_delayed_work_init(struct wiphy_delayed_work *dwork,
                                           wiphy_work_func_t func)
{
        struct wiphy_work *inner = &dwork->work;

        timer_setup(&dwork->timer, wiphy_delayed_work_timer, 0);
        wiphy_work_init(inner, func);
}

/**
 * wiphy_delayed_work_queue - queue delayed work for the wiphy
 * @wiphy: the wiphy to queue for
 * @dwork: the delayable worker
 * @delay: number of jiffies to wait before queueing
 *
 * This is useful for work that must be done asynchronously, and work
 * queued here has the special property that the wiphy mutex will be
 * held as if wiphy_lock() was called, and that it cannot be running
 * after wiphy_lock() was called. Therefore, wiphy_delayed_work_cancel()
 * can cancel the work without waiting for it, it requires
 * being in a section protected by wiphy_lock().
 */
void wiphy_delayed_work_queue(struct wiphy *wiphy,
                              struct wiphy_delayed_work *dwork,
                              unsigned long delay);

/**
 * wiphy_delayed_work_cancel - cancel previously queued delayed work
 * @wiphy: the wiphy, for debug purposes
 * @dwork: the delayed work to cancel
 *
 * Cancel the work *without* waiting for it, this assumes being
 * called under the wiphy mutex acquired by wiphy_lock().
 */
void wiphy_delayed_work_cancel(struct wiphy *wiphy,
                               struct wiphy_delayed_work *dwork);

/**
 * wiphy_delayed_work_flush - flush previously queued delayed work
 * @wiphy: the wiphy, for debug purposes
 * @dwork: the delayed work to flush
 *
 * Flush the work (i.e. run it if pending). This must be called
 * under the wiphy mutex acquired by wiphy_lock().
 */
void wiphy_delayed_work_flush(struct wiphy *wiphy,
                              struct wiphy_delayed_work *dwork);

/**
 * struct wireless_dev - wireless device state
 *
 * For netdevs, this structure must be allocated by the driver
 * that uses the ieee80211_ptr field in struct net_device (this
 * is intentional so it can be allocated along with the netdev.)
 * It need not be registered then as netdev registration will
 * be intercepted by cfg80211 to see the new wireless device,
 * however, drivers must lock the wiphy before registering or
 * unregistering netdevs if they pre-create any netdevs (in ops
 * called from cfg80211, the wiphy is already locked.)
 *
 * For non-netdev uses, it must also be allocated by the driver
 * in response to the cfg80211 callbacks that require it, as
 * there's no netdev registration in that case it may not be
 * allocated outside of callback operations that return it.
 *
 * @wiphy: pointer to hardware description
 * @iftype: interface type
 * @registered: is this wdev already registered with cfg80211
 * @registering: indicates we're doing registration under wiphy lock
 *        for the notifier
 * @list: (private) Used to collect the interfaces
 * @netdev: (private) Used to reference back to the netdev, may be %NULL
 * @identifier: (private) Identifier used in nl80211 to identify this
 *        wireless device if it has no netdev
 * @u: union containing data specific to @iftype
 * @connected: indicates if connected or not (STA mode)
 * @wext: (private) Used by the internal wireless extensions compat code
 * @wext.ibss: (private) IBSS data part of wext handling
 * @wext.connect: (private) connection handling data
 * @wext.keys: (private) (WEP) key data
 * @wext.ie: (private) extra elements for association
 * @wext.ie_len: (private) length of extra elements
 * @wext.bssid: (private) selected network BSSID
 * @wext.ssid: (private) selected network SSID
 * @wext.default_key: (private) selected default key index
 * @wext.default_mgmt_key: (private) selected default management key index
 * @wext.prev_bssid: (private) previous BSSID for reassociation
 * @wext.prev_bssid_valid: (private) previous BSSID validity
 * @use_4addr: indicates 4addr mode is used on this interface, must be
 *        set by driver (if supported) on add_interface BEFORE registering the
 *        netdev and may otherwise be used by driver read-only, will be updated
 *        by cfg80211 on change_interface
 * @mgmt_registrations: list of registrations for management frames
 * @mgmt_registrations_need_update: mgmt registrations were updated,
 *        need to propagate the update to the driver
 * @address: The address for this device, valid only if @netdev is %NULL
 * @is_running: true if this is a non-netdev device that has been started, e.g.
 *        the P2P Device.
 * @cac_started: true if DFS channel availability check has been started
 * @cac_start_time: timestamp (jiffies) when the dfs state was entered.
 * @cac_time_ms: CAC time in ms
 * @ps: powersave mode is enabled
 * @ps_timeout: dynamic powersave timeout
 * @ap_unexpected_nlportid: (private) netlink port ID of application
 *        registered for unexpected class 3 frames (AP mode)
 * @conn: (private) cfg80211 software SME connection state machine data
 * @connect_keys: (private) keys to set after connection is established
 * @conn_bss_type: connecting/connected BSS type
 * @conn_owner_nlportid: (private) connection owner socket port ID
 * @disconnect_wk: (private) auto-disconnect work
 * @disconnect_bssid: (private) the BSSID to use for auto-disconnect
 * @event_list: (private) list for internal event processing
 * @event_lock: (private) lock for event list
 * @owner_nlportid: (private) owner socket port ID
 * @nl_owner_dead: (private) owner socket went away
 * @cqm_rssi_work: (private) CQM RSSI reporting work
 * @cqm_config: (private) nl80211 RSSI monitor state
 * @pmsr_list: (private) peer measurement requests
 * @pmsr_lock: (private) peer measurements requests/results lock
 * @pmsr_free_wk: (private) peer measurements cleanup work
 * @unprot_beacon_reported: (private) timestamp of last
 *        unprotected beacon report
 * @links: array of %IEEE80211_MLD_MAX_NUM_LINKS elements containing @addr,
 *        @ap and @client for each link
 * @valid_links: bitmap describing what elements of @links are valid
 */
struct wireless_dev {
        struct wiphy *wiphy;
        enum nl80211_iftype iftype;

        /* the remainder of this struct should be private to cfg80211 */
        struct list_head list;
        /* may be NULL (wireless device without a netdev) */
        struct net_device *netdev;

        /* identifies this wireless device in nl80211 if it has no netdev */
        u32 identifier;

        struct list_head mgmt_registrations;
        u8 mgmt_registrations_need_update:1;

        bool use_4addr, is_running, registered, registering;

        /* only valid if netdev is NULL */
        u8 address[ETH_ALEN] __aligned(sizeof(u16));

        /* currently used for IBSS and SME - might be rearranged later */
        struct cfg80211_conn *conn;
        struct cfg80211_cached_keys *connect_keys;
        enum ieee80211_bss_type conn_bss_type;
        u32 conn_owner_nlportid;

        struct work_struct disconnect_wk;
        u8 disconnect_bssid[ETH_ALEN];

        struct list_head event_list;
        /* lock for event_list */
        spinlock_t event_lock;

        u8 connected:1;

        bool ps;
        int ps_timeout;

        u32 ap_unexpected_nlportid;

        u32 owner_nlportid;
        bool nl_owner_dead;

        /* FIXME: need to rework radar detection for MLO */
        bool cac_started;
        unsigned long cac_start_time;
        unsigned int cac_time_ms;

#ifdef CONFIG_CFG80211_WEXT
        /* wext data */
        struct {
                struct cfg80211_ibss_params ibss;
                struct cfg80211_connect_params connect;
                struct cfg80211_cached_keys *keys;
                const u8 *ie;
                size_t ie_len;
                u8 bssid[ETH_ALEN];
                u8 prev_bssid[ETH_ALEN];
                u8 ssid[IEEE80211_MAX_SSID_LEN];
                s8 default_key, default_mgmt_key;
                bool prev_bssid_valid;
        } wext;
#endif

        struct wiphy_work cqm_rssi_work;
        struct cfg80211_cqm_config __rcu *cqm_config;

        struct list_head pmsr_list;
        /* lock for peer measurement requests/results */
        spinlock_t pmsr_lock;
        struct work_struct pmsr_free_wk;

        unsigned long unprot_beacon_reported;

        /* data specific to iftype */
        union {
                struct {
                        u8 connected_addr[ETH_ALEN] __aligned(2);
                        u8 ssid[IEEE80211_MAX_SSID_LEN];
                        u8 ssid_len;
                } client;
                struct {
                        int beacon_interval;
                        struct cfg80211_chan_def preset_chandef;
                        struct cfg80211_chan_def chandef;
                        u8 id[IEEE80211_MAX_MESH_ID_LEN];
                        u8 id_len, id_up_len;
                } mesh;
                struct {
                        struct cfg80211_chan_def preset_chandef;
                        u8 ssid[IEEE80211_MAX_SSID_LEN];
                        u8 ssid_len;
                } ap;
                struct {
                        struct cfg80211_internal_bss *current_bss;
                        struct cfg80211_chan_def chandef;
                        int beacon_interval;
                        u8 ssid[IEEE80211_MAX_SSID_LEN];
                        u8 ssid_len;
                } ibss;
                struct {
                        struct cfg80211_chan_def chandef;
                } ocb;
        } u;

        /* per-link data; valid_links tells which elements are valid */
        struct {
                u8 addr[ETH_ALEN] __aligned(2);
                union {
                        struct {
                                unsigned int beacon_interval;
                                struct cfg80211_chan_def chandef;
                        } ap;
                        struct {
                                struct cfg80211_internal_bss *current_bss;
                        } client;
                };
        } links[IEEE80211_MLD_MAX_NUM_LINKS];
        u16 valid_links;
};

static inline const u8 *wdev_address(struct wireless_dev *wdev)
{
        if (wdev->netdev)
                return wdev->netdev->dev_addr;
        return wdev->address;
}

/*
 * Tell whether @wdev is running: defer to netif_running() when a netdev is
 * attached, otherwise report the wdev's own is_running flag.
 */
static inline bool wdev_running(struct wireless_dev *wdev)
{
        struct net_device *ndev = wdev->netdev;

        return ndev ? netif_running(ndev) : wdev->is_running;
}

/**
 * wdev_priv - get the wiphy private area behind a wireless_dev
 *
 * @wdev: The wireless device to look up; must not be %NULL (a %NULL
 *        pointer triggers BUG_ON())
 * Return: The result of wiphy_priv() for the wiphy @wdev belongs to.
 */
static inline void *wdev_priv(struct wireless_dev *wdev)
{
        struct wiphy *wiphy;

        BUG_ON(!wdev);
        wiphy = wdev->wiphy;

        return wiphy_priv(wiphy);
}

/**
 * wdev_chandef - return chandef pointer from wireless_dev
 * @wdev: the wdev
 * @link_id: the link ID for MLO
 *
 * Return: The chandef depending on the mode, or %NULL.
 */
struct cfg80211_chan_def *wdev_chandef(struct wireless_dev *wdev,
                                       unsigned int link_id);

/**
 * WARN_INVALID_LINK_ID - warn if a link ID cannot be valid for a wdev
 * @wdev: the wireless device the link ID is used with
 * @link_id: the link ID to check
 *
 * Warns when a non-zero @link_id is used while @wdev has no valid links,
 * and when @wdev has valid links but @link_id is not one of them. A
 * @link_id beyond the size of the links array is always reported as
 * invalid.
 */
static inline void WARN_INVALID_LINK_ID(struct wireless_dev *wdev,
                                        unsigned int link_id)
{
        /* without any valid links only link ID 0 makes sense */
        WARN_ON(link_id && !wdev->valid_links);

        /*
         * Range-check before building the mask: BIT() with a shift count
         * of BITS_PER_LONG or more is undefined behaviour, and such an ID
         * can never be set in the 16-bit valid_links bitmap anyway.
         */
        WARN_ON(wdev->valid_links &&
                (link_id >= ARRAY_SIZE(wdev->links) ||
                 !(wdev->valid_links & BIT(link_id))));
}

/*
 * for_each_valid_link - iterate over the valid links of @link_info
 * @link_info: pointer to a structure with a valid_links bitmap and a
 *        links array
 * @link_id: lvalue used as the iteration variable
 *
 * If valid_links is zero the body runs exactly once with @link_id == 0,
 * otherwise it runs once for every bit set in valid_links.
 *
 * The filter is written as "if (!cond) {} else" so that an "else" placed
 * after the loop body by the caller cannot bind to the macro's hidden "if".
 */
#define for_each_valid_link(link_info, link_id)                        \
        for ((link_id) = 0;                                        \
             (link_id) < ((link_info)->valid_links ?                \
                          ARRAY_SIZE((link_info)->links) : 1);        \
             (link_id)++)                                        \
                if ((link_info)->valid_links &&                        \
                    !((link_info)->valid_links & BIT(link_id))) {        \
                } else

/**
 * DOC: Utility functions
 *
 * cfg80211 offers a number of utility functions that can be useful.
 */

/**
 * ieee80211_channel_equal - compare two struct ieee80211_channel
 *
 * @a: 1st struct ieee80211_channel
 * @b: 2nd struct ieee80211_channel
 * Return: true if both the center frequency and the frequency offset
 * of @a and @b are equal
 */
static inline bool
ieee80211_channel_equal(struct ieee80211_channel *a,
                        struct ieee80211_channel *b)
{
        if (a->center_freq != b->center_freq)
                return false;

        return a->freq_offset == b->freq_offset;
}

/**
 * ieee80211_channel_to_khz - get the frequency of a channel in KHz
 * @chan: struct ieee80211_channel to convert
 * Return: The center frequency converted with MHZ_TO_KHZ() plus the
 * channel's frequency offset (in KHz)
 */
static inline u32
ieee80211_channel_to_khz(const struct ieee80211_channel *chan)
{
        u32 khz = MHZ_TO_KHZ(chan->center_freq);

        return khz + chan->freq_offset;
}

/**
 * ieee80211_s1g_channel_width - get allowed channel width from @chan
 *
 * Only allowed for band NL80211_BAND_S1GHZ
 * @chan: channel
 * Return: The allowed channel width for this center_freq
 */
enum nl80211_chan_width
ieee80211_s1g_channel_width(const struct ieee80211_channel *chan);

/**
 * ieee80211_channel_to_freq_khz - convert channel number to frequency
 * @chan: channel number
 * @band: band, necessary due to channel number overlap
 * Return: The corresponding frequency (in KHz), or 0 if the conversion failed.
 */
u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band);

/**
 * ieee80211_channel_to_frequency - convert channel number to frequency
 * @chan: channel number
 * @band: band, necessary due to channel number overlap
 * Return: The corresponding frequency (in MHz), or 0 if the conversion failed.
 */
static inline int
ieee80211_channel_to_frequency(int chan, enum nl80211_band band)
{
        return KHZ_TO_MHZ(ieee80211_channel_to_freq_khz(chan, band));
}

/**
 * ieee80211_freq_khz_to_channel - convert frequency to channel number
 * @freq: center frequency in KHz
 * Return: The corresponding channel, or 0 if the conversion failed.
 */
int ieee80211_freq_khz_to_channel(u32 freq);

/**
 * ieee80211_frequency_to_channel - convert frequency to channel number
 * @freq: center frequency in MHz
 * Return: The corresponding channel, or 0 if the conversion failed.
 */
static inline int
ieee80211_frequency_to_channel(int freq)
{
        return ieee80211_freq_khz_to_channel(MHZ_TO_KHZ(freq));
}

/**
 * ieee80211_get_channel_khz - get channel struct from wiphy for specified
 * frequency
 * @wiphy: the struct wiphy to get the channel for
 * @freq: the center frequency (in KHz) of the channel
 * Return: The channel struct from @wiphy at @freq.
 */
struct ieee80211_channel *
ieee80211_get_channel_khz(struct wiphy *wiphy, u32 freq);

/**
 * ieee80211_get_channel - get channel struct from wiphy for specified frequency
 *
 * @wiphy: the struct wiphy to get the channel for
 * @freq: the center frequency (in MHz) of the channel
 * Return: The channel struct from @wiphy at @freq.
 */
static inline struct ieee80211_channel *
ieee80211_get_channel(struct wiphy *wiphy, int freq)
{
        return ieee80211_get_channel_khz(wiphy, MHZ_TO_KHZ(freq));
}

/**
 * cfg80211_channel_is_psc - Check if the channel is a 6 GHz PSC
 * @chan: control channel to check
 *
 * The Preferred Scanning Channels (PSC) are defined in
 * Draft IEEE P802.11ax/D5.0, 26.17.2.3.3
 *
 * A channel qualifies when it is in the 6 GHz band and its channel number
 * modulo 16 equals 5.
 *
 * Return: %true if channel is a PSC, %false otherwise
 */
static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan)
{
        /* the channel number is only looked up for 6 GHz channels */
        return chan->band == NL80211_BAND_6GHZ &&
               ieee80211_frequency_to_channel(chan->center_freq) % 16 == 5;
}

/**
 * ieee80211_get_response_rate - get basic rate for a given rate
 *
 * @sband: the band to look for rates in
 * @basic_rates: bitmap of basic rates
 * @bitrate: the bitrate for which to find the basic rate
 *
 * Return: The basic rate corresponding to a given bitrate, that
 * is the next lower bitrate contained in the basic rate map,
 * which is, for this function, given as a bitmap of indices of
 * rates in the band's bitrate table.
 */
const struct ieee80211_rate *
ieee80211_get_response_rate(struct ieee80211_supported_band *sband,
                            u32 basic_rates, int bitrate);

/**
 * ieee80211_mandatory_rates - get mandatory rates for a given band
 * @sband: the band to look for rates in
 *
 * Return: a bitmap of the mandatory rates for the given band, bits
 * are set according to the rate position in the bitrates array.
 */
u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband);

/*
 * Radiotap parsing functions -- for controlled injection support
 *
 * Implemented in net/wireless/radiotap.c
 * Documentation in Documentation/networking/radiotap-headers.rst
 */

/*
 * Alignment and size of one radiotap field, packed into a single byte
 * (4 bits each, so both values are limited to 0..15).
 * NOTE(review): units are presumably bytes - confirm in radiotap.c.
 */
struct radiotap_align_size {
        uint8_t align:4, size:4;
};

/*
 * Description of one radiotap namespace.
 * NOTE(review): field meanings below are inferred from the names and
 * types only - confirm against net/wireless/radiotap.c.
 */
struct ieee80211_radiotap_namespace {
        /* table of per-field align/size entries */
        const struct radiotap_align_size *align_size;
        /* presumably the number of entries in align_size */
        int n_bits;
        /* presumably the vendor OUI and sub-namespace identifying this namespace */
        uint32_t oui;
        uint8_t subns;
};

/*
 * Set of vendor namespace definitions, passed to
 * ieee80211_radiotap_iterator_init() as its vns argument.
 */
struct ieee80211_radiotap_vendor_namespaces {
        /* namespace definitions; presumably an array of n_ns entries */
        const struct ieee80211_radiotap_namespace *ns;
        int n_ns;
};

/**
 * struct ieee80211_radiotap_iterator - tracks walk thru present radiotap args
 * @this_arg_index: index of current arg, valid after each successful call
 *        to ieee80211_radiotap_iterator_next()
 * @this_arg: pointer to current radiotap arg; it is valid after each
 *        call to ieee80211_radiotap_iterator_next() but also after
 *        ieee80211_radiotap_iterator_init() where it will point to
 *        the beginning of the actual data portion
 * @this_arg_size: length of the current arg, for convenience
 * @current_namespace: pointer to the current namespace definition
 *        (or internally %NULL if the current namespace is unknown)
 * @is_radiotap_ns: indicates whether the current namespace is the default
 *        radiotap namespace or not
 *
 * @_rtheader: pointer to the radiotap header we are walking through
 * @_max_length: length of radiotap header in cpu byte ordering
 * @_arg_index: next argument index
 * @_arg: next argument pointer
 * @_next_bitmap: internal pointer to next present u32
 * @_bitmap_shifter: internal shifter for curr u32 bitmap, b0 set == arg present
 * @_vns: vendor namespace definitions
 * @_next_ns_data: beginning of the next namespace's data
 * @_reset_on_ext: internal; reset the arg index to 0 when going to the
 *        next bitmap word
 *
 * Describes the radiotap parser state. Fields prefixed with an underscore
 * must not be used by users of the parser, only by the parser internally.
 */

/*
 * Fields whose names start with an underscore are parser-internal state;
 * users of the iterator may only rely on the unprefixed fields.
 */
struct ieee80211_radiotap_iterator {
        struct ieee80211_radiotap_header *_rtheader;
        const struct ieee80211_radiotap_vendor_namespaces *_vns;
        const struct ieee80211_radiotap_namespace *current_namespace;

        unsigned char *_arg, *_next_ns_data;
        __le32 *_next_bitmap;

        /* current argument, for users of the iterator */
        unsigned char *this_arg;
        int this_arg_index;
        int this_arg_size;

        int is_radiotap_ns;

        int _max_length;
        int _arg_index;
        uint32_t _bitmap_shifter;
        int _reset_on_ext;
};

int
ieee80211_radiotap_iterator_init(struct ieee80211_radiotap_iterator *iterator,
                                 struct ieee80211_radiotap_header *radiotap_header,
                                 int max_length,
                                 const struct ieee80211_radiotap_vendor_namespaces *vns);

int
ieee80211_radiotap_iterator_next(struct ieee80211_radiotap_iterator *iterator);


extern const unsigned char rfc1042_header[6];
extern const unsigned char bridge_tunnel_header[6];

/**
 * ieee80211_get_hdrlen_from_skb - get header length from data
 *
 * @skb: the frame
 *
 * Given an skb with a raw 802.11 header at the data pointer this function
 * returns the 802.11 header length.
 *
 * Return: The 802.11 header length in bytes (not including encryption
 * headers). Or 0 if the data in the sk_buff is too short to contain a valid
 * 802.11 header.
 */
unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb);

/**
 * ieee80211_hdrlen - get header length in bytes from frame control
 * @fc: frame control field in little-endian format
 * Return: The header length in bytes.
 */
unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc);

/**
 * ieee80211_get_mesh_hdrlen - get mesh extension header length
 * @meshhdr: the mesh extension header, only the flags field
 *        (first byte) will be accessed
 * Return: The length of the extension header, which is always at
 * least 6 bytes and at most 18 if address 5 and 6 are present.
 */
unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr);

/**
 * DOC: Data path helpers
 *
 * In addition to generic utilities, cfg80211 also offers
 * functions that help implement the data path for devices
 * that do not do the 802.11/802.3 conversion on the device.
 */

/**
 * ieee80211_data_to_8023_exthdr - convert an 802.11 data frame to 802.3
 * @skb: the 802.11 data frame
 * @ehdr: pointer to a &struct ethhdr that will get the header, instead
 *        of it being pushed into the SKB
 * @addr: the device MAC address
 * @iftype: the virtual interface type
 * @data_offset: offset of payload after the 802.11 header
 * @is_amsdu: true if the 802.11 header is A-MSDU
 * Return: 0 on success. Non-zero on error.
 */
int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
                                  const u8 *addr, enum nl80211_iftype iftype,
                                  u8 data_offset, bool is_amsdu);

/**
 * ieee80211_data_to_8023 - convert an 802.11 data frame to 802.3
 * @skb: the 802.11 data frame
 * @addr: the device MAC address
 * @iftype: the virtual interface type
 * Return: 0 on success. Non-zero on error.
 */
static inline int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
                                         enum nl80211_iftype iftype)
{
        /*
         * Defaults for the extended variant: no separate ethhdr (so the
         * header goes into the SKB), no extra payload offset and not an
         * A-MSDU.
         */
        struct ethhdr *ehdr = NULL;
        const u8 data_offset = 0;
        const bool is_amsdu = false;

        return ieee80211_data_to_8023_exthdr(skb, ehdr, addr, iftype,
                                             data_offset, is_amsdu);
}

/**
 * ieee80211_is_valid_amsdu - check if subframe lengths of an A-MSDU are valid
 *
 * This is used to detect non-standard A-MSDU frames, e.g. the ones generated
 * by ath10k and ath11k, where the subframe length includes the length of the
 * mesh control field.
 *
 * @skb: The input A-MSDU frame without any headers.
 * @mesh_hdr: the type of mesh header to test
 *        0: non-mesh A-MSDU length field
 *        1: big-endian mesh A-MSDU length field
 *        2: little-endian mesh A-MSDU length field
 * Returns: true if subframe header lengths are valid for the @mesh_hdr mode
 */
bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr);

/**
 * ieee80211_amsdu_to_8023s - decode an IEEE 802.11n A-MSDU frame
 *
 * Decode an IEEE 802.11 A-MSDU and convert it to a list of 802.3 frames.
 * The @list will be empty if the decode fails. The @skb must be fully
 * header-less before being passed in here; it is freed in this function.
 *
 * @skb: The input A-MSDU frame without any headers.
 * @list: The output list of 802.3 frames. It must be allocated and
 *        initialized by the caller.
 * @addr: The device MAC address.
 * @iftype: The device interface type.
 * @extra_headroom: The hardware extra headroom for SKBs in the @list.
 * @check_da: DA to check in the inner ethernet header, or NULL
 * @check_sa: SA to check in the inner ethernet header, or NULL
 * @mesh_control: see mesh_hdr in ieee80211_is_valid_amsdu
 */
void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
                              const u8 *addr, enum nl80211_iftype iftype,
                              const unsigned int extra_headroom,
                              const u8 *check_da, const u8 *check_sa,
                              u8 mesh_control);

/**
 * ieee80211_get_8023_tunnel_proto - get RFC1042 or bridge tunnel encap protocol
 *
 * Check for RFC1042 or bridge tunnel header and fetch the encapsulated
 * protocol.
 *
 * @hdr: pointer to the MSDU payload
 * @proto: destination pointer to store the protocol
 * Return: true if encapsulation was found
 */
bool ieee80211_get_8023_tunnel_proto(const void *hdr, __be16 *proto);

/**
 * ieee80211_strip_8023_mesh_hdr - strip mesh header from converted 802.3 frames
 *
 * Strip the mesh header, which was left in by ieee80211_data_to_8023 as part
 * of the MSDU data. Also move any source/destination addresses from the mesh
 * header to the ethernet header (if present).
 *
 * @skb: The 802.3 frame with embedded mesh header
 *
 * Return: 0 on success. Non-zero on error.
 */
int ieee80211_strip_8023_mesh_hdr(struct sk_buff *skb);

/**
 * cfg80211_classify8021d - determine the 802.1p/1d tag for a data frame
 * @skb: the data frame
 * @qos_map: Interworking QoS mapping or %NULL if not in use
 * Return: The 802.1p/1d tag.
 */
unsigned int cfg80211_classify8021d(struct sk_buff *skb,
                                    struct cfg80211_qos_map *qos_map);

/**
 * cfg80211_find_elem_match - match information element and byte array in data
 *
 * @eid: element ID
 * @ies: data consisting of IEs
 * @len: length of data
 * @match: byte array to match
 * @match_len: number of bytes in the match array
 * @match_offset: offset in the IE data where the byte array should match.
 *        Note the difference to cfg80211_find_ie_match() which considers
 *        the offset to start from the element ID byte, but here we take
 *        the data portion instead.
 *
 * Return: %NULL if the element ID could not be found or if
 * the element is invalid (claims to be longer than the given
 * data) or if the byte array doesn't match; otherwise return the
 * requested element struct.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data and being large enough for the
 * byte array to match.
 */
const struct element *
cfg80211_find_elem_match(u8 eid, const u8 *ies, unsigned int len,
                         const u8 *match, unsigned int match_len,
                         unsigned int match_offset);

/**
 * cfg80211_find_ie_match - match information element and byte array in data
 *
 * @eid: element ID
 * @ies: data consisting of IEs
 * @len: length of data
 * @match: byte array to match
 * @match_len: number of bytes in the match array
 * @match_offset: offset in the IE where the byte array should match.
 *        If match_len is zero, this must also be set to zero.
 *        Otherwise this must be set to 2 or more, because the first
 *        byte is the element id, which is already compared to eid, and
 *        the second byte is the IE length.
 *
 * Return: %NULL if the element ID could not be found or if
 * the element is invalid (claims to be longer than the given
 * data) or if the byte array doesn't match, or a pointer to the first
 * byte of the requested element, that is the byte containing the
 * element ID.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data and being large enough for the
 * byte array to match.
 */
static inline const u8 *
cfg80211_find_ie_match(u8 eid, const u8 *ies, unsigned int len,
                       const u8 *match, unsigned int match_len,
                       unsigned int match_offset)
{
        const struct element *elem;
        unsigned int data_offset = 0;

        /*
         * The offset counts from the element ID byte, so anything below 2
         * would land on the ID/length bytes. The only exception is "no
         * match at all", which requires both length and offset to be zero.
         */
        if (WARN_ON((match_len && match_offset < 2) ||
                    (!match_len && match_offset)))
                return NULL;

        /* cfg80211_find_elem_match() counts from the start of the data */
        if (match_offset)
                data_offset = match_offset - 2;

        elem = cfg80211_find_elem_match(eid, ies, len, match, match_len,
                                        data_offset);

        return (const void *)elem;
}

/**
 * cfg80211_find_elem - find information element in data
 *
 * @eid: element ID
 * @ies: data consisting of IEs
 * @len: length of data
 *
 * Calls cfg80211_find_elem_match() without a byte array to match, so
 * only the element ID is compared.
 *
 * Return: %NULL if the element ID could not be found or if
 * the element is invalid (claims to be longer than the given
 * data); otherwise return the requested element struct.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data.
 */
static inline const struct element *
cfg80211_find_elem(u8 eid, const u8 *ies, int len)
{
        const struct element *elem;

        elem = cfg80211_find_elem_match(eid, ies, len, NULL, 0, 0);

        return elem;
}

/**
 * cfg80211_find_ie - find information element in data
 *
 * @eid: element ID
 * @ies: data consisting of IEs
 * @len: length of data
 *
 * Return: %NULL if the element ID could not be found or if
 * the element is invalid (claims to be longer than the given
 * data), or a pointer to the first byte of the requested
 * element, that is the byte containing the element ID.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data.
 */
static inline const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len)
{
        /* no byte array to match: match length and offset are both zero */
        const u8 *ie = cfg80211_find_ie_match(eid, ies, len, NULL, 0, 0);

        return ie;
}

/**
 * cfg80211_find_ext_elem - find information element with EID Extension in data
 *
 * @ext_eid: element ID Extension
 * @ies: data consisting of IEs
 * @len: length of data
 *
 * Looks for a %WLAN_EID_EXTENSION element whose first data byte equals
 * @ext_eid.
 *
 * Return: %NULL if the extended element could not be found or if
 * the element is invalid (claims to be longer than the given
 * data); otherwise return the requested element struct.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data.
 */
static inline const struct element *
cfg80211_find_ext_elem(u8 ext_eid, const u8 *ies, int len)
{
        /* the extension ID is matched as one byte at data offset 0 */
        const u8 *match = &ext_eid;

        return cfg80211_find_elem_match(WLAN_EID_EXTENSION, ies, len,
                                        match, 1, 0);
}

/**
 * cfg80211_find_ext_ie - find information element with EID Extension in data
 *
 * @ext_eid: element ID Extension
 * @ies: data consisting of IEs
 * @len: length of data
 *
 * Return: %NULL if the extended element ID could not be found or if
 * the element is invalid (claims to be longer than the given
 * data), or a pointer to the first byte of the requested
 * element, that is the byte containing the element ID.
 *
 * Note: There are no checks on the element length other than
 * having to fit into the given data.
 */
static inline const u8 *cfg80211_find_ext_ie(u8 ext_eid, const u8 *ies, int len)
{
        /*
         * Match one byte at offset 2 from the element ID byte, i.e. the
         * byte right after the element ID and length bytes.
         */
        const u8 *match = &ext_eid;

        return cfg80211_find_ie_match(WLAN_EID_EXTENSION, ies, len,
                                      match, 1, 2);
}

/**
 * cfg80211_find_vendor_elem - find vendor specific information element in data
 *
 * @oui: vendor OUI
 * @oui_type: vendor-specific OUI type (must be < 0xff), negative means any
 * @ies: data consisting of IEs
 * @len: length of data
 *
 * Return: %NULL if the vendor specific element ID could not be found or if the
 * element is invalid (claims to be longer than the given data); otherwise
 * return the element structure for the requested element.
 *
 * Note: There are no checks on the element length other than having to fit into
 * the given data.
 */
const struct element *cfg80211_find_vendor_elem(unsigned int oui, int oui_type,
                                                const u8 *ies,
                                                unsigned int len);

/**
 * cfg80211_find_vendor_ie - find vendor specific information element in data
 *
 * @oui: vendor OUI
 * @oui_type: vendor-specific OUI type (must be < 0xff), negative means any
 * @ies: data consisting of IEs
 * @len: length of data
 *
 * Return: %NULL if the vendor specific element ID could not be found or if the
 * element is invalid (claims to be longer than the given data), or a pointer to
 * the first byte of the requested element, that is the byte containing the
 * element ID.
 *
 * Note: There are no checks on the element length other than having to fit into
 * the given data.
 */
static inline const u8 *
cfg80211_find_vendor_ie(unsigned int oui, int oui_type,
                        const u8 *ies, unsigned int len)
{
        const struct element *elem;

        elem = cfg80211_find_vendor_elem(oui, oui_type, ies, len);

        /* hand the element back as a pointer to its first byte */
        return (const void *)elem;
}

/**
 * enum cfg80211_rnr_iter_ret - reduced neighbor report iteration state
 * @RNR_ITER_CONTINUE: continue iterating with the next entry
 * @RNR_ITER_BREAK: break iteration and return success
 * @RNR_ITER_ERROR: break iteration and return error
 */
enum cfg80211_rnr_iter_ret {
        /* keep going with the next entry */
        RNR_ITER_CONTINUE,
        /* stop iterating; cfg80211_iter_rnr() reports success */
        RNR_ITER_BREAK,
        /* stop iterating; cfg80211_iter_rnr() reports an error */
        RNR_ITER_ERROR,
};

/**
 * cfg80211_iter_rnr - iterate reduced neighbor report entries
 * @elems: the frame elements to iterate RNR elements and then
 *        their entries in
 * @elems_len: length of the elements
 * @iter: iteration function, see also &enum cfg80211_rnr_iter_ret
 *        for the return value
 * @iter_data: additional data passed to the iteration function
 * Return: %true on success (after successfully iterating all entries
 *        or if the iteration function returned %RNR_ITER_BREAK),
 *        %false on error (iteration function returned %RNR_ITER_ERROR
 *        or elements were malformed.)
 */
bool cfg80211_iter_rnr(const u8 *elems, size_t elems_len,
                       enum cfg80211_rnr_iter_ret
                       (*iter)(void *data, u8 type,
                               const struct ieee80211_neighbor_ap_info *info,
                               const u8 *tbtt_info, u8 tbtt_info_len),
                       void *iter_data);

/**
 * cfg80211_defragment_element - Defrag the given element data into a buffer
 *
 * @elem: the element to defragment
 * @ies: elements where @elem is contained
 * @ieslen: length of @ies
 * @data: buffer to store element data, or %NULL to just determine size
 * @data_len: length of @data, or 0
 * @frag_id: the element ID of fragments
 *
 * Return: length of @data, or -EINVAL on error
 *
 * Copy out all data from an element that may be fragmented into @data, while
 * skipping all headers.
 *
 * The function uses memmove() internally. It is acceptable to defragment an
 * element in-place.
 */
ssize_t cfg80211_defragment_element(const struct element *elem, const u8 *ies,
                                    size_t ieslen, u8 *data, size_t data_len,
                                    u8 frag_id);

/**
 * cfg80211_send_layer2_update - send layer 2 update frame
 *
 * @dev: network device
 * @addr: STA MAC address
 *
 * Wireless drivers can use this function to update forwarding tables in bridge
 * devices upon STA association.
 */
void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr);

/**
 * DOC: Regulatory enforcement infrastructure
 *
 * TODO
 */

/**
 * regulatory_hint - driver hint to the wireless core a regulatory domain
 * @wiphy: the wireless device giving the hint (used only for reporting
 *        conflicts)
 * @alpha2: the ISO/IEC 3166 alpha2 the driver claims its regulatory domain
 *        should be in. If @rd is set this should be NULL. Note that if you
 *        set this to NULL you should still set rd->alpha2 to some accepted
 *        alpha2.
 *
 * Wireless drivers can use this function to hint to the wireless core
 * what it believes should be the current regulatory domain by
 * giving it an ISO/IEC 3166 alpha2 country code it knows its regulatory
 * domain should be in or by providing a completely built regulatory domain.
 * If the driver provides an ISO/IEC 3166 alpha2 userspace will be queried
 * for a regulatory domain structure for the respective country.
 *
 * The wiphy must have been registered to cfg80211 prior to this call.
 * For cfg80211 drivers this means you must first use wiphy_register(),
 * for mac80211 drivers you must first use ieee80211_register_hw().
 *
 * Drivers should check the return value, it's possible you can get
 * an -ENOMEM.
 *
 * Return: 0 on success. -ENOMEM.
 */
int regulatory_hint(struct wiphy *wiphy, const char *alpha2);

/**
 * regulatory_set_wiphy_regd - set regdom info for self managed drivers
 * @wiphy: the wireless device we want to process the regulatory domain on
 * @rd: the regulatory domain information to use for this wiphy
 *
 * Set the regulatory domain information for self-managed wiphys, only they
 * may use this function. See %REGULATORY_WIPHY_SELF_MANAGED for more
 * information.
 *
 * Return: 0 on success. -EINVAL, -EPERM
 */
int regulatory_set_wiphy_regd(struct wiphy *wiphy,
                              struct ieee80211_regdomain *rd);

/**
 * regulatory_set_wiphy_regd_sync - set regdom for self-managed drivers
 * @wiphy: the wireless device we want to process the regulatory domain on
 * @rd: the regulatory domain information to use for this wiphy
 *
 * This function requires the RTNL and the wiphy mutex to be held and
 * applies the new regdomain synchronously to this wiphy. For more details
 * see regulatory_set_wiphy_regd().
 *
 * Return: 0 on success. -EINVAL, -EPERM
 */
int regulatory_set_wiphy_regd_sync(struct wiphy *wiphy,
                                   struct ieee80211_regdomain *rd);

/**
 * wiphy_apply_custom_regulatory - apply a custom driver regulatory domain
 * @wiphy: the wireless device we want to process the regulatory domain on
 * @regd: the custom regulatory domain to use for this wiphy
 *
 * Drivers can sometimes have custom regulatory domains which do not apply
 * to a specific country. Drivers can use this to apply such custom regulatory
 * domains. This routine must be called prior to wiphy registration. The
 * custom regulatory domain will be trusted completely and as such previous
 * default channel settings will be disregarded. If no rule is found for a
 * channel on the regulatory domain the channel will be disabled.
 * Drivers using this for a wiphy should also set the wiphy flag
 * REGULATORY_CUSTOM_REG or cfg80211 will set it for the wiphy
 * that called this helper.
 */
void wiphy_apply_custom_regulatory(struct wiphy *wiphy,
                                   const struct ieee80211_regdomain *regd);

/**
 * freq_reg_info - get regulatory information for the given frequency
 * @wiphy: the wiphy for which we want to process this rule
 * @center_freq: Frequency in KHz for which we want regulatory information
 *
 * Use this function to get the regulatory rule for a specific frequency on
 * a given wireless device. If the device has a specific regulatory domain
 * it wants to follow we respect that unless a country IE has been received
 * and processed already.
 *
 * Return: A valid pointer, or, when an error occurs, for example if no rule
 * can be found, the return value is encoded using ERR_PTR(). Use IS_ERR() to
 * check and PTR_ERR() to obtain the numeric return value. The numeric return
 * value will be -ERANGE if we determine the given center_freq does not even
 * have a regulatory rule for a frequency range in the center_freq's band.
 * See freq_in_rule_band() for our current definition of a band -- this is
 * purely subjective and right now it's 802.11 specific.
 */
const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy,
                                               u32 center_freq);

/**
 * reg_initiator_name - map regulatory request initiator enum to name
 * @initiator: the regulatory request initiator
 *
 * You can use this to map the regulatory request initiator enum to a
 * proper string representation.
 *
 * Return: pointer to string representation of the initiator
 */
const char *reg_initiator_name(enum nl80211_reg_initiator initiator);

/**
 * regulatory_pre_cac_allowed - check if pre-CAC allowed in the current regdom
 * @wiphy: wiphy for which pre-CAC capability is checked.
 *
 * Pre-CAC is allowed only in some regdomains (notably ETSI).
 *
 * Return: %true if allowed, %false otherwise
 */
bool regulatory_pre_cac_allowed(struct wiphy *wiphy);

/**
 * DOC: Internal regulatory db functions
 *
 */

/**
 * reg_query_regdb_wmm - Query internal regulatory db for wmm rule
 * Regulatory self-managed driver can use it to proactively
 *
 * @alpha2: the ISO/IEC 3166 alpha2 wmm rule to be queried.
 * @freq: the frequency (in MHz) to be queried.
 * @rule: pointer to store the wmm rule from the regulatory db.
 *
 * Self-managed wireless drivers can use this function to query
 * the internal regulatory database to check whether the given
 * ISO/IEC 3166 alpha2 country and freq have wmm rule limitations.
 *
 * Drivers should check the return value, it's possible you can get
 * an -ENODATA.
 *
 * Return: 0 on success. -ENODATA.
 */
int reg_query_regdb_wmm(char *alpha2, int freq,
                        struct ieee80211_reg_rule *rule);

/*
 * callbacks for asynchronous cfg80211 methods, notification
 * functions and BSS handling helpers
 */

/**
 * cfg80211_scan_done - notify that scan finished
 *
 * @request: the corresponding scan request
 * @info: information about the completed scan
 */
void cfg80211_scan_done(struct cfg80211_scan_request *request,
                        struct cfg80211_scan_info *info);

/**
 * cfg80211_sched_scan_results - notify that new scan results are available
 *
 * @wiphy: the wiphy which got scheduled scan results
 * @reqid: identifier for the related scheduled scan request
 */
void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid);

/**
 * cfg80211_sched_scan_stopped - notify that the scheduled scan has stopped
 *
 * @wiphy: the wiphy on which the scheduled scan stopped
 * @reqid: identifier for the related scheduled scan request
 *
 * The driver can call this function to inform cfg80211 that the
 * scheduled scan had to be stopped, for whatever reason.  The driver
 * is then called back via the sched_scan_stop operation when done.
 */
void cfg80211_sched_scan_stopped(struct wiphy *wiphy, u64 reqid);

/**
 * cfg80211_sched_scan_stopped_locked - notify that the scheduled scan has stopped
 *
 * @wiphy: the wiphy on which the scheduled scan stopped
 * @reqid: identifier for the related scheduled scan request
 *
 * The driver can call this function to inform cfg80211 that the
 * scheduled scan had to be stopped, for whatever reason.  The driver
 * is then called back via the sched_scan_stop operation when done.
 * This function should be called with the wiphy mutex held.
 */
void cfg80211_sched_scan_stopped_locked(struct wiphy *wiphy, u64 reqid);

/**
 * cfg80211_inform_bss_frame_data - inform cfg80211 of a received BSS frame
 * @wiphy: the wiphy reporting the BSS
 * @data: the BSS metadata
 * @mgmt: the management frame (probe response or beacon)
 * @len: length of the management frame
 * @gfp: context flags
 *
 * This informs cfg80211 that BSS information was found and
 * the BSS should be updated/added.
 *
 * Return: A referenced struct, must be released with cfg80211_put_bss()!
 * Or %NULL on error.
 */
struct cfg80211_bss * __must_check
cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
                               struct cfg80211_inform_bss *data,
                               struct ieee80211_mgmt *mgmt, size_t len,
                               gfp_t gfp);

/**
 * cfg80211_inform_bss_frame - inform cfg80211 of a received BSS frame
 * @wiphy: the wiphy reporting the BSS
 * @rx_channel: channel placed in the chan member of the BSS metadata
 * @mgmt: the management frame (probe response or beacon)
 * @len: length of the management frame
 * @signal: value placed in the signal member of the BSS metadata
 * @gfp: context flags
 *
 * Shorthand for cfg80211_inform_bss_frame_data(): the
 * &struct cfg80211_inform_bss is built here from @rx_channel and @signal,
 * with every other member left zero-initialized.
 *
 * Return: the result of cfg80211_inform_bss_frame_data(), i.e. a referenced
 * struct that must be released with cfg80211_put_bss(), or %NULL on error.
 */
static inline struct cfg80211_bss * __must_check
cfg80211_inform_bss_frame(struct wiphy *wiphy,
                          struct ieee80211_channel *rx_channel,
                          struct ieee80211_mgmt *mgmt, size_t len,
                          s32 signal, gfp_t gfp)
{
        struct cfg80211_inform_bss bss_meta = {
                .signal = signal,
                .chan = rx_channel,
        };

        return cfg80211_inform_bss_frame_data(wiphy, &bss_meta, mgmt, len,
                                              gfp);
}

/**
 * cfg80211_gen_new_bssid - generate a nontransmitted BSSID for multi-BSSID
 * @bssid: transmitter BSSID
 * @max_bssid: max BSSID indicator, taken from Multiple BSSID element
 * @mbssid_index: BSSID index, taken from Multiple BSSID index element
 * @new_bssid: calculated nontransmitted BSSID
 *
 * The low @max_bssid bits of @bssid are replaced by their sum with
 * @mbssid_index, wrapped to @max_bssid bits; the remaining high bits are
 * copied unchanged.
 *
 * A @max_bssid of 0 leaves no variable bits, so @new_bssid equals @bssid.
 * A @max_bssid larger than the BSSID width (ETH_ALEN * 8 bits) is treated
 * as that width. Both cases would otherwise hand GENMASK_ULL() an
 * out-of-range bit position.
 */
static inline void cfg80211_gen_new_bssid(const u8 *bssid, u8 max_bssid,
                                          u8 mbssid_index, u8 *new_bssid)
{
        u64 bssid_u64 = ether_addr_to_u64(bssid);
        int nbits = max_bssid;
        u64 mask;
        u64 new_bssid_u64;

        /* a BSSID only has ETH_ALEN * 8 bits that could ever vary */
        if (nbits > ETH_ALEN * 8)
                nbits = ETH_ALEN * 8;

        /* GENMASK_ULL(-1, 0) is not a valid way to spell an empty mask */
        mask = nbits ? GENMASK_ULL(nbits - 1, 0) : 0;

        /* keep the fixed high-order part of the transmitter BSSID ... */
        new_bssid_u64 = bssid_u64 & ~mask;

        /* ... and add the index into the variable low-order part, wrapping */
        new_bssid_u64 |= ((bssid_u64 & mask) + mbssid_index) & mask;

        u64_to_ether_addr(new_bssid_u64, new_bssid);
}

/**
 * cfg80211_is_element_inherited - returns if element ID should be inherited
 * @element: element to check
 * @non_inherit_element: non inheritance element
 *
 * Return: %true if should be inherited, %false otherwise
 */
bool cfg80211_is_element_inherited(const struct element *element,
                                   const struct element *non_inherit_element);

/**
 * cfg80211_merge_profile - merges a MBSSID profile if it is split between IEs
 * @ie: ies
 * @ielen: length of IEs
 * @mbssid_elem: current MBSSID element
 * @sub_elem: current MBSSID subelement (profile)
 * @merged_ie: location of the merged profile
 * @max_copy_len: max merged profile length
 *
 * Return: the number of bytes merged
 */
size_t cfg80211_merge_profile(const u8 *ie, size_t ielen,
                              const struct element *mbssid_elem,
                              const struct element *sub_elem,
                              u8 *merged_ie, size_t max_copy_len);

/**
 * enum cfg80211_bss_frame_type - frame type that the BSS data came from
 * @CFG80211_BSS_FTYPE_UNKNOWN: driver doesn't know whether the data is
 *        from a beacon or probe response
 * @CFG80211_BSS_FTYPE_BEACON: data comes from a beacon
 * @CFG80211_BSS_FTYPE_PRESP: data comes from a probe response
 * @CFG80211_BSS_FTYPE_S1G_BEACON: data comes from an S1G beacon
 *
 * This is the type of the @ftype argument of cfg80211_inform_bss_data()
 * and cfg80211_inform_bss().
 */
enum cfg80211_bss_frame_type {
        CFG80211_BSS_FTYPE_UNKNOWN,
        CFG80211_BSS_FTYPE_BEACON,
        CFG80211_BSS_FTYPE_PRESP,
        CFG80211_BSS_FTYPE_S1G_BEACON,
};

/**
 * cfg80211_get_ies_channel_number - returns the channel number from ies
 * @ie: IEs
 * @ielen: length of IEs
 * @band: enum nl80211_band of the channel
 *
 * Return: the channel number, or -1 if none could be determined.
 */
int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen,
                                    enum nl80211_band band);

/**
 * cfg80211_ssid_eq - compare two SSIDs
 * @a: first SSID
 * @b: second SSID
 *
 * Passing %NULL for either argument triggers a WARN_ON() and is reported
 * as "not equal".
 *
 * Return: %true if both SSIDs have the same length and the same contents,
 * %false otherwise.
 */
static inline bool
cfg80211_ssid_eq(struct cfg80211_ssid *a, struct cfg80211_ssid *b)
{
        if (WARN_ON(!a || !b))
                return false;

        /* only compare the bytes once the lengths are known to agree */
        return a->ssid_len == b->ssid_len &&
               !memcmp(a->ssid, b->ssid, a->ssid_len);
}

/**
 * cfg80211_inform_bss_data - inform cfg80211 of a new BSS
 *
 * @wiphy: the wiphy reporting the BSS
 * @data: the BSS metadata
 * @ftype: frame type (if known)
 * @bssid: the BSSID of the BSS
 * @tsf: the TSF sent by the peer in the beacon/probe response (or 0)
 * @capability: the capability field sent by the peer
 * @beacon_interval: the beacon interval announced by the peer
 * @ie: additional IEs sent by the peer
 * @ielen: length of the additional IEs
 * @gfp: context flags
 *
 * This informs cfg80211 that BSS information was found and
 * the BSS should be updated/added.
 *
 * Return: A referenced struct, must be released with cfg80211_put_bss()!
 * Or %NULL on error.
 */
struct cfg80211_bss * __must_check
cfg80211_inform_bss_data(struct wiphy *wiphy,
                         struct cfg80211_inform_bss *data,
                         enum cfg80211_bss_frame_type ftype,
                         const u8 *bssid, u64 tsf, u16 capability,
                         u16 beacon_interval, const u8 *ie, size_t ielen,
                         gfp_t gfp);

/**
 * cfg80211_inform_bss - inform cfg80211 of a new BSS
 * @wiphy: the wiphy reporting the BSS
 * @rx_channel: channel placed in the chan member of the BSS metadata
 * @ftype: frame type (if known)
 * @bssid: the BSSID of the BSS
 * @tsf: the TSF sent by the peer in the beacon/probe response (or 0)
 * @capability: the capability field sent by the peer
 * @beacon_interval: the beacon interval announced by the peer
 * @ie: additional IEs sent by the peer
 * @ielen: length of the additional IEs
 * @signal: value placed in the signal member of the BSS metadata
 * @gfp: context flags
 *
 * Shorthand for cfg80211_inform_bss_data(): the
 * &struct cfg80211_inform_bss is built here from @rx_channel and @signal,
 * with every other member left zero-initialized.
 *
 * Return: the result of cfg80211_inform_bss_data(), i.e. a referenced
 * struct that must be released with cfg80211_put_bss(), or %NULL on error.
 */
static inline struct cfg80211_bss * __must_check
cfg80211_inform_bss(struct wiphy *wiphy,
                    struct ieee80211_channel *rx_channel,
                    enum cfg80211_bss_frame_type ftype,
                    const u8 *bssid, u64 tsf, u16 capability,
                    u16 beacon_interval, const u8 *ie, size_t ielen,
                    s32 signal, gfp_t gfp)
{
        struct cfg80211_inform_bss bss_meta = {
                .signal = signal,
                .chan = rx_channel,
        };

        return cfg80211_inform_bss_data(wiphy, &bss_meta, ftype, bssid,
                                        tsf, capability, beacon_interval,
                                        ie, ielen, gfp);
}

/**
 * __cfg80211_get_bss - get a BSS reference
 * @wiphy: the wiphy this BSS struct belongs to
 * @channel: the channel to search on (or %NULL)
 * @bssid: the desired BSSID (or %NULL)
 * @ssid: the desired SSID (or %NULL)
 * @ssid_len: length of the SSID (or 0)
 * @bss_type: type of BSS, see &enum ieee80211_bss_type
 * @privacy: privacy filter, see &enum ieee80211_privacy
 * @use_for: indicates which use is intended
 *
 * Return: Reference-counted BSS on success. %NULL on error.
 */
struct cfg80211_bss *__cfg80211_get_bss(struct wiphy *wiphy,
                                        struct ieee80211_channel *channel,
                                        const u8 *bssid,
                                        const u8 *ssid, size_t ssid_len,
                                        enum ieee80211_bss_type bss_type,
                                        enum ieee80211_privacy privacy,
                                        u32 use_for);

/**
 * cfg80211_get_bss - get a BSS reference
 * @wiphy: the wiphy this BSS struct belongs to
 * @channel: the channel to search on (or %NULL)
 * @bssid: the desired BSSID (or %NULL)
 * @ssid: the desired SSID (or %NULL)
 * @ssid_len: length of the SSID (or 0)
 * @bss_type: type of BSS, see &enum ieee80211_bss_type
 * @privacy: privacy filter, see &enum ieee80211_privacy
 *
 * Same lookup as __cfg80211_get_bss(), with the intended use fixed to
 * regular usage, %NL80211_BSS_USE_FOR_NORMAL.
 *
 * Return: Reference-counted BSS on success. %NULL on error.
 */
static inline struct cfg80211_bss *
cfg80211_get_bss(struct wiphy *wiphy, struct ieee80211_channel *channel,
                 const u8 *bssid, const u8 *ssid, size_t ssid_len,
                 enum ieee80211_bss_type bss_type,
                 enum ieee80211_privacy privacy)
{
        const u32 use_for = NL80211_BSS_USE_FOR_NORMAL;

        return __cfg80211_get_bss(wiphy, channel, bssid, ssid, ssid_len,
                                  bss_type, privacy, use_for);
}

/**
 * cfg80211_get_ibss - get a reference to an IBSS-type BSS
 * @wiphy: the wiphy this BSS struct belongs to
 * @channel: the channel to search on (or %NULL)
 * @ssid: the desired SSID (or %NULL)
 * @ssid_len: length of the SSID (or 0)
 *
 * Shorthand for cfg80211_get_bss() with no BSSID filter (%NULL), BSS type
 * %IEEE80211_BSS_TYPE_IBSS and privacy filter %IEEE80211_PRIVACY_ANY.
 *
 * Return: the result of cfg80211_get_bss(): reference-counted BSS on
 * success, %NULL on error.
 */
static inline struct cfg80211_bss *
cfg80211_get_ibss(struct wiphy *wiphy,
                  struct ieee80211_channel *channel,
                  const u8 *ssid, size_t ssid_len)
{
        const u8 *any_bssid = NULL;

        return cfg80211_get_bss(wiphy, channel, any_bssid, ssid, ssid_len,
                                IEEE80211_BSS_TYPE_IBSS,
                                IEEE80211_PRIVACY_ANY);
}

/**
 * cfg80211_ref_bss - reference BSS struct
 * @wiphy: the wiphy this BSS struct belongs to
 * @bss: the BSS struct to reference
 *
 * Increments the refcount of the given BSS struct.
 */
void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *bss);

/**
 * cfg80211_put_bss - unref BSS struct
 * @wiphy: the wiphy this BSS struct belongs to
 * @bss: the BSS struct
 *
 * Decrements the refcount of the given BSS struct.
 */
void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *bss);

/**
 * cfg80211_unlink_bss - unlink BSS from internal data structures
 * @wiphy: the wiphy
 * @bss: the bss to remove
 *
 * This function removes the given BSS from the internal data structures
 * thereby making it no longer show up in scan results etc. Use this
 * function when you detect a BSS is gone. Normally BSSes will also time
 * out, so it is not necessary to use this function at all.
 */
void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *bss);

/**
 * cfg80211_bss_iter - iterate all BSS entries
 *
 * This function iterates over the BSS entries associated with the given wiphy
 * and calls the callback for the iterated BSS. The iterator function is not
 * allowed to call functions that might modify the internal state of the BSS DB.
 *
 * @wiphy: the wiphy
 * @chandef: if given, the iterator function will be called only if the channel
 *     of the currently iterated BSS is a subset of the given channel.
 * @iter: the iterator function to call
 * @iter_data: an argument to the iterator function
 */
void cfg80211_bss_iter(struct wiphy *wiphy,
                       struct cfg80211_chan_def *chandef,
                       void (*iter)(struct wiphy *wiphy,
                                    struct cfg80211_bss *bss,
                                    void *data),
                       void *iter_data);

/**
 * cfg80211_rx_mlme_mgmt - notification of processed MLME management frame
 * @dev: network device
 * @buf: authentication frame (header + body)
 * @len: length of the frame data
 *
 * This function is called whenever an authentication, disassociation or
 * deauthentication frame has been received and processed in station mode.
 * After being asked to authenticate via cfg80211_ops::auth() the driver must
 * call either this function or cfg80211_auth_timeout().
 * After being asked to associate via cfg80211_ops::assoc() the driver must
 * call either this function or cfg80211_auth_timeout().
 * While connected, the driver must call this for received and processed
 * disassociation and deauthentication frames. If the frame couldn't be used
 * because it was unprotected, the driver must call the function
 * cfg80211_rx_unprot_mlme_mgmt() instead.
 *
 * This function may sleep. The caller must hold the corresponding wdev's mutex.
 */
void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len);

/**
 * cfg80211_auth_timeout - notification of timed out authentication
 * @dev: network device
 * @addr: The MAC address of the device with which the authentication timed out
 *
 * This function may sleep. The caller must hold the corresponding wdev's
 * mutex.
 */
void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr);

/**
 * struct cfg80211_rx_assoc_resp_data - association response data
 * @buf: (Re)Association Response frame (header + body)
 * @len: length of the frame data
 * @uapsd_queues: bitmap of queues configured for uapsd. Same format
 *        as the AC bitmap in the QoS info field
 * @req_ies: information elements from the (Re)Association Request frame
 * @req_ies_len: length of req_ies data
 * @ap_mld_addr: AP MLD address (in case of MLO)
 * @links: per-link information indexed by link ID, use links[0] for
 *        non-MLO connections
 * @links.addr: address for this link, %ETH_ALEN octets, 2-byte aligned
 *        (presumably the link's MAC address -- TODO confirm which side
 *        of the link it identifies)
 * @links.bss: the BSS that association was requested with, ownership of the
 *      pointer moves to cfg80211 in the call to cfg80211_rx_assoc_resp()
 * @links.status: Set this (along with a BSS pointer) for links that
 *        were rejected by the AP.
 *
 * Passed by pointer as the @data argument of cfg80211_rx_assoc_resp().
 */
struct cfg80211_rx_assoc_resp_data {
        const u8 *buf;
        size_t len;
        const u8 *req_ies;
        size_t req_ies_len;
        int uapsd_queues;
        const u8 *ap_mld_addr;
        struct {
                u8 addr[ETH_ALEN] __aligned(2);
                struct cfg80211_bss *bss;
                u16 status;
        } links[IEEE80211_MLD_MAX_NUM_LINKS];
};

/**
 * cfg80211_rx_assoc_resp - notification of processed association response
 * @dev: network device
 * @data: association response data, &struct cfg80211_rx_assoc_resp_data
 *
 * After being asked to associate via cfg80211_ops::assoc() the driver must
 * call either this function or cfg80211_assoc_failure().
 *
 * This function may sleep. The caller must hold the corresponding wdev's mutex.
 */
void cfg80211_rx_assoc_resp(struct net_device *dev,
                            const struct cfg80211_rx_assoc_resp_data *data);

/**
 * struct cfg80211_assoc_failure - association failure data
 * @ap_mld_addr: AP MLD address, or %NULL
 * @bss: list of BSSes, must use entry 0 for non-MLO connections
 *        (@ap_mld_addr is %NULL)
 * @timeout: indicates the association failed due to timeout, otherwise
 *        the association was abandoned for a reason reported through some
 *        other API (e.g. deauth RX)
 *
 * Passed by pointer as the @data argument of the cfg80211_assoc_failure()
 * function, which shares this structure's name. @bss has one slot per
 * possible link (%IEEE80211_MLD_MAX_NUM_LINKS entries).
 */
struct cfg80211_assoc_failure {
        const u8 *ap_mld_addr;
        struct cfg80211_bss *bss[IEEE80211_MLD_MAX_NUM_LINKS];
        bool timeout;
};

/**
 * cfg80211_assoc_failure - notification of association failure
 * @dev: network device
 * @data: data describing the association failure
 *
 * This function may sleep. The caller must hold the corresponding wdev's mutex.
 */
void cfg80211_assoc_failure(struct net_device *dev,
                            struct cfg80211_assoc_failure *data);

/**
 * cfg80211_tx_mlme_mgmt - notification of transmitted deauth/disassoc frame
 * @dev: network device
 * @buf: 802.11 frame (header + body)
 * @len: length of the frame data
 * @reconnect: immediate reconnect is desired (include the nl80211 attribute)
 *
 * This function is called whenever deauthentication has been processed in
 * station mode. This includes both received deauthentication frames and
 * locally generated ones. This function may sleep. The caller must hold the
 * corresponding wdev's mutex.
 */
void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len,
                           bool reconnect);

/**
 * cfg80211_rx_unprot_mlme_mgmt - notification of unprotected mlme mgmt frame
 * @dev: network device
 * @buf: received management frame (header + body)
 * @len: length of the frame data
 *
 * This function is called whenever a received deauthentication or disassoc
 * frame has been dropped in station mode because of MFP being used but the
 * frame was not protected. This is also used to notify reception of a Beacon
 * frame that was dropped because it did not include a valid MME MIC while
 * beacon protection was enabled (BIGTK configured in station mode).
 *
 * This function may sleep.
 */
void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev,
                                  const u8 *buf, size_t len);

/**
 * cfg80211_michael_mic_failure - notification of Michael MIC failure (TKIP)
 * @dev: network device
 * @addr: The source MAC address of the frame
 * @key_type: The key type that the received frame used
 * @key_id: Key identifier (0..3). Can be -1 if missing.
 * @tsc: The TSC value of the frame that generated the MIC failure (6 octets)
 * @gfp: allocation flags
 *
 * This function is called whenever the local MAC detects a MIC failure in a
 * received frame. This matches with MLME-MICHAELMICFAILURE.indication()
 * primitive.
 */
void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr,
                                  enum nl80211_key_type key_type, int key_id,
                                  const u8 *tsc, gfp_t gfp);

/**
 * cfg80211_ibss_joined - notify cfg80211 that device joined an IBSS
 *
 * @dev: network device
 * @bssid: the BSSID of the IBSS joined
 * @channel: the channel of the IBSS joined
 * @gfp: allocation flags
 *
 * This function notifies cfg80211 that the device joined an IBSS or
 * switched to a different BSSID. Before this function can be called,
 * either a beacon has to have been received from the IBSS, or one of
 * the cfg80211_inform_bss{,_frame} functions must have been called
 * with the locally generated beacon -- this guarantees that there is
 * always a scan result for this IBSS. cfg80211 will handle the rest.
 */
void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid,
                          struct ieee80211_channel *channel, gfp_t gfp);

/**
 * cfg80211_notify_new_peer_candidate - notify cfg80211 of a new mesh peer
 *                                         candidate
 *
 * @dev: network device
 * @macaddr: the MAC address of the new candidate
 * @ie: information elements advertised by the peer candidate
 * @ie_len: length of the information elements buffer
 * @sig_dbm: signal level in dBm
 * @gfp: allocation flags
 *
 * This function notifies cfg80211 that the mesh peer candidate has been
 * detected, most likely via a beacon or, less likely, via a probe response.
 * cfg80211 then sends a notification to userspace.
 */
void cfg80211_notify_new_peer_candidate(struct net_device *dev,
                const u8 *macaddr, const u8 *ie, u8 ie_len,
                int sig_dbm, gfp_t gfp);

/**
 * DOC: RFkill integration
 *
 * RFkill integration in cfg80211 is almost invisible to drivers,
 * as cfg80211 automatically registers an rfkill instance for each
 * wireless device it knows about. Soft kill is also translated
 * into disconnecting and turning all interfaces off. Drivers are
 * expected to turn off the device when all interfaces are down.
 *
 * However, devices may have a hard RFkill line, in which case they
 * also need to interact with the rfkill subsystem, via cfg80211.
 * They can do this with a few helper functions documented here.
 */

/**
 * wiphy_rfkill_set_hw_state_reason - notify cfg80211 about hw block state
 * @wiphy: the wiphy
 * @blocked: block status
 * @reason: one of reasons in &enum rfkill_hard_block_reasons
 */
void wiphy_rfkill_set_hw_state_reason(struct wiphy *wiphy, bool blocked,
                                      enum rfkill_hard_block_reasons reason);

/**
 * wiphy_rfkill_set_hw_state - notify cfg80211 about hw block state
 * @wiphy: the wiphy
 * @blocked: block status
 *
 * Shorthand for wiphy_rfkill_set_hw_state_reason() with the reason fixed
 * to %RFKILL_HARD_BLOCK_SIGNAL.
 */
static inline void wiphy_rfkill_set_hw_state(struct wiphy *wiphy, bool blocked)
{
        wiphy_rfkill_set_hw_state_reason(wiphy, blocked,
                                         RFKILL_HARD_BLOCK_SIGNAL);
}

/**
 * wiphy_rfkill_start_polling - start polling rfkill
 * @wiphy: the wiphy
 */
void wiphy_rfkill_start_polling(struct wiphy *wiphy);

/**
 * wiphy_rfkill_stop_polling - stop polling rfkill
 * @wiphy: the wiphy
 *
 * Calls rfkill_pause_polling() on the rfkill instance referenced by
 * @wiphy->rfkill.
 */
static inline void wiphy_rfkill_stop_polling(struct wiphy *wiphy)
{
        rfkill_pause_polling(wiphy->rfkill);
}

/**
 * DOC: Vendor commands
 *
 * Occasionally, there are special protocol or firmware features that
 * can't be implemented very openly. For this and similar cases, the
 * vendor command functionality allows implementing the features with
 * (typically closed-source) userspace and firmware, using nl80211 as
 * the configuration mechanism.
 *
 * A driver supporting vendor commands must register them as an array
 * in struct wiphy, with handlers for each one. Each command has an
 * OUI and sub command ID to identify it.
 *
 * Note that this feature should not be (ab)used to implement protocol
 * features that could openly be shared across drivers. In particular,
 * it must never be required to use vendor commands to implement any
 * "normal" functionality that higher-level userspace like connection
 * managers etc. need.
 */

struct sk_buff *__cfg80211_alloc_reply_skb(struct wiphy *wiphy,
                                           enum nl80211_commands cmd,
                                           enum nl80211_attrs attr,
                                           int approxlen);

struct sk_buff *__cfg80211_alloc_event_skb(struct wiphy *wiphy,
                                           struct wireless_dev *wdev,
                                           enum nl80211_commands cmd,
                                           enum nl80211_attrs attr,
                                           unsigned int portid,
                                           int vendor_event_idx,
                                           int approxlen, gfp_t gfp);

void __cfg80211_send_event_skb(struct sk_buff *skb, gfp_t gfp);

/**
 * cfg80211_vendor_cmd_alloc_reply_skb - allocate vendor command reply
 * @wiphy: the wiphy
 * @approxlen: an upper bound of the length of the data that will
 *        be put into the skb
 *
 * This function allocates and pre-fills an skb for a reply to
 * a vendor command. Since it is intended for a reply, calling
 * it outside of a vendor command's doit() operation is invalid.
 *
 * The returned skb is pre-filled with some identifying data in
 * a way that any data that is put into the skb (with skb_put(),
 * nla_put() or similar) will end up being within the
 * %NL80211_ATTR_VENDOR_DATA attribute, so all that needs to be done
 * with the skb is adding data for the corresponding userspace tool
 * which can then read that data out of the vendor data attribute.
 * You must not modify the skb in any other way.
 *
 * When done, call cfg80211_vendor_cmd_reply() with the skb and return
 * its error code as the result of the doit() operation.
 *
 * This is a thin wrapper around __cfg80211_alloc_reply_skb() that selects
 * %NL80211_CMD_VENDOR and %NL80211_ATTR_VENDOR_DATA.
 *
 * Return: An allocated and pre-filled skb. %NULL if any errors happen.
 */
static inline struct sk_buff *
cfg80211_vendor_cmd_alloc_reply_skb(struct wiphy *wiphy, int approxlen)
{
        return __cfg80211_alloc_reply_skb(wiphy, NL80211_CMD_VENDOR,
                                          NL80211_ATTR_VENDOR_DATA, approxlen);
}

/**
 * cfg80211_vendor_cmd_reply - send the reply skb
 * @skb: The skb, must have been allocated with
 *        cfg80211_vendor_cmd_alloc_reply_skb()
 *
 * Since calling this function will usually be the last thing
 * before returning from the vendor command doit() you should
 * return the error code.  Note that this function consumes the
 * skb regardless of the return value.
 *
 * Return: An error code or 0 on success.
 */
int cfg80211_vendor_cmd_reply(struct sk_buff *skb);

/**
 * cfg80211_vendor_cmd_get_sender - get the current sender netlink ID
 * @wiphy: the wiphy
 *
 * Return: the current netlink port ID in a vendor command handler.
 *
 * Context: May only be called from a vendor command handler
 */
unsigned int cfg80211_vendor_cmd_get_sender(struct wiphy *wiphy);

/**
 * cfg80211_vendor_event_alloc - allocate vendor-specific event skb
 * @wiphy: the wiphy
 * @wdev: the wireless device
 * @approxlen: an upper bound of the length of the data that will
 *        be put into the skb
 * @event_idx: index of the vendor event in the wiphy's vendor_events
 * @gfp: allocation flags
 *
 * This function allocates and pre-fills an skb for an event on the
 * vendor-specific multicast group.
 *
 * If wdev != NULL, both the ifindex and identifier of the specified
 * wireless device are added to the event message before the vendor data
 * attribute.
 *
 * When done filling the skb, call cfg80211_vendor_event() with the
 * skb to send the event.
 *
 * Note that @approxlen comes before @event_idx here, whereas
 * __cfg80211_alloc_event_skb() takes the event index first.
 *
 * Return: An allocated and pre-filled skb. %NULL if any errors happen.
 */
static inline struct sk_buff *
cfg80211_vendor_event_alloc(struct wiphy *wiphy, struct wireless_dev *wdev,
                             int approxlen, int event_idx, gfp_t gfp)
{
        /* portid is 0 here; the unicast variant passes a real port ID */
        return __cfg80211_alloc_event_skb(wiphy, wdev, NL80211_CMD_VENDOR,
                                          NL80211_ATTR_VENDOR_DATA,
                                          0, event_idx, approxlen, gfp);
}

/**
 * cfg80211_vendor_event_alloc_ucast - alloc unicast vendor-specific event skb
 * @wiphy: the wiphy
 * @wdev: the wireless device
 * @portid: port ID of the receiver
 * @approxlen: an upper bound of the length of the data that will
 *        be put into the skb
 * @event_idx: index of the vendor event in the wiphy's vendor_events
 * @gfp: allocation flags
 *
 * This function allocates and pre-fills an skb for an event to send to
 * a specific (userland) socket. This socket would previously have been
 * obtained by cfg80211_vendor_cmd_get_sender(), and the caller MUST take
 * care to register a netlink notifier to see when the socket closes.
 *
 * If wdev != NULL, both the ifindex and identifier of the specified
 * wireless device are added to the event message before the vendor data
 * attribute.
 *
 * When done filling the skb, call cfg80211_vendor_event() with the
 * skb to send the event.
 *
 * Note that @approxlen comes before @event_idx here, whereas
 * __cfg80211_alloc_event_skb() takes the event index first.
 *
 * Return: An allocated and pre-filled skb. %NULL if any errors happen.
 */
static inline struct sk_buff *
cfg80211_vendor_event_alloc_ucast(struct wiphy *wiphy,
                                  struct wireless_dev *wdev,
                                  unsigned int portid, int approxlen,
                                  int event_idx, gfp_t gfp)
{
        return __cfg80211_alloc_event_skb(wiphy, wdev, NL80211_CMD_VENDOR,
                                          NL80211_ATTR_VENDOR_DATA,
                                          portid, event_idx, approxlen, gfp);
}

/**
 * cfg80211_vendor_event - send the event
 * @skb: The skb, must have been allocated with cfg80211_vendor_event_alloc()
 *        or cfg80211_vendor_event_alloc_ucast()
 * @gfp: allocation flags
 *
 * This function sends the given @skb, which must have been allocated
 * by cfg80211_vendor_event_alloc() or cfg80211_vendor_event_alloc_ucast(),
 * as an event. It always consumes it.
 *
 * The skb is handed to __cfg80211_send_event_skb() unchanged.
 */
static inline void cfg80211_vendor_event(struct sk_buff *skb, gfp_t gfp)
{
        __cfg80211_send_event_skb(skb, gfp);
}

#ifdef CONFIG_NL80211_TESTMODE
/**
 * DOC: Test mode
 *
 * Test mode is a set of utility functions to allow drivers to
 * interact with driver-specific tools to aid, for instance,
 * factory programming.
 *
 * This chapter describes how drivers interact with it. For more
 * information see the nl80211 book's chapter on it.
 */

/**
 * cfg80211_testmode_alloc_reply_skb - allocate testmode reply
 * @wiphy: the wiphy
 * @approxlen: an upper bound of the length of the data that will
 *        be put into the skb
 *
 * This function allocates and pre-fills an skb for a reply to
 * the testmode command. Since it is intended for a reply, calling
 * it outside of the @testmode_cmd operation is invalid.
 *
 * The returned skb is pre-filled with the wiphy index and set up in
 * a way that any data that is put into the skb (with skb_put(),
 * nla_put() or similar) will end up being within the
 * %NL80211_ATTR_TESTDATA attribute, so all that needs to be done
 * with the skb is adding data for the corresponding userspace tool
 * which can then read that data out of the testdata attribute. You
 * must not modify the skb in any other way.
 *
 * When done, call cfg80211_testmode_reply() with the skb and return
 * its error code as the result of the @testmode_cmd operation.
 *
 * This is a thin wrapper around __cfg80211_alloc_reply_skb() that selects
 * %NL80211_CMD_TESTMODE and %NL80211_ATTR_TESTDATA.
 *
 * Return: An allocated and pre-filled skb. %NULL if any errors happen.
 */
static inline struct sk_buff *
cfg80211_testmode_alloc_reply_skb(struct wiphy *wiphy, int approxlen)
{
        return __cfg80211_alloc_reply_skb(wiphy, NL80211_CMD_TESTMODE,
                                          NL80211_ATTR_TESTDATA, approxlen);
}

/**
 * cfg80211_testmode_reply - send the reply skb
 * @skb: The skb, must have been allocated with
 *        cfg80211_testmode_alloc_reply_skb()
 *
 * Since calling this function will usually be the last thing
 * before returning from the @testmode_cmd you should return
 * the error code.  Note that this function consumes the skb
 * regardless of the return value.
 *
 * The reply is sent through cfg80211_vendor_cmd_reply(); testmode has no
 * separate reply path.
 *
 * Return: An error code or 0 on success.
 */
static inline int cfg80211_testmode_reply(struct sk_buff *skb)
{
        return cfg80211_vendor_cmd_reply(skb);
}

/**
 * cfg80211_testmode_alloc_event_skb - allocate testmode event
 * @wiphy: the wiphy
 * @approxlen: an upper bound of the length of the data that will
 *        be put into the skb
 * @gfp: allocation flags
 *
 * This function allocates and pre-fills an skb for an event on the
 * testmode multicast group.
 *
 * The returned skb is set up in the same way as with
 * cfg80211_testmode_alloc_reply_skb() but prepared for an event. As
 * there, you should simply add data to it that will then end up in the
 * %NL80211_ATTR_TESTDATA attribute. Again, you must not modify the skb
 * in any other way.
 *
 * When done filling the skb, call cfg80211_testmode_event() with the
 * skb to send the event.
 *
 * Return: An allocated and pre-filled skb. %NULL if any errors happen.
 */
static inline struct sk_buff *
cfg80211_testmode_alloc_event_skb(struct wiphy *wiphy, int approxlen, gfp_t gfp)
{
        /* no wdev (NULL), portid 0, and -1 as the vendor event index */
        return __cfg80211_alloc_event_skb(wiphy, NULL, NL80211_CMD_TESTMODE,
                                          NL80211_ATTR_TESTDATA, 0, -1,
                                          approxlen, gfp);
}

/**
 * cfg80211_testmode_event - send the event
 * @skb: The skb, must have been allocated with
 *        cfg80211_testmode_alloc_event_skb()
 * @gfp: allocation flags
 *
 * This function sends the given @skb, which must have been allocated
 * by cfg80211_testmode_alloc_event_skb(), as an event. It always
 * consumes it.
 *
 * The skb is handed to __cfg80211_send_event_skb() unchanged.
 */
static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp)
{
        __cfg80211_send_event_skb(skb, gfp);
}

/*
 * Designated-initializer helpers for the testmode_cmd and testmode_dump
 * members. The trailing comma is part of the expansion, so a use site
 * writes no comma of its own and still compiles when the macros expand
 * to nothing.
 */
#define CFG80211_TESTMODE_CMD(cmd)        .testmode_cmd = (cmd),
#define CFG80211_TESTMODE_DUMP(cmd)        .testmode_dump = (cmd),
#else
/* CONFIG_NL80211_TESTMODE is off: both helpers expand to nothing. */
#define CFG80211_TESTMODE_CMD(cmd)
#define CFG80211_TESTMODE_DUMP(cmd)
#endif

/**
 * struct cfg80211_fils_resp_params - FILS connection response params
 * @kek: KEK derived from a successful FILS connection (may be %NULL)
 * @kek_len: Length of @kek in octets
 * @update_erp_next_seq_num: Boolean value to specify whether the value in
 *        @erp_next_seq_num is valid.
 * @erp_next_seq_num: The next sequence number to use in ERP message in
 *        FILS Authentication. This value should be specified irrespective of the
 *        status for a FILS connection.
 * @pmk: A new PMK if derived from a successful FILS connection (may be %NULL).
 * @pmk_len: Length of @pmk in octets
 * @pmkid: A new PMKID if derived from a successful FILS connection or the PMKID
 *        used for this FILS connection (may be %NULL).
 */
struct cfg80211_fils_resp_params {
        const u8 *kek;
        size_t kek_len;
        bool update_erp_next_seq_num;
        u16 erp_next_seq_num;
        const u8 *pmk;
        size_t pmk_len;
        const u8 *pmkid;
};

/**
 * struct cfg80211_connect_resp_params - Connection response params
 * @status: Status code, %WLAN_STATUS_SUCCESS for successful connection, use
 *        %WLAN_STATUS_UNSPECIFIED_FAILURE if your device cannot give you
 *        the real status code for failures. If this call is used to report a
 *        failure due to a timeout (e.g., not receiving an Authentication frame
 *        from the AP) instead of an explicit rejection by the AP, -1 is used to
 *        indicate that this is a failure, but without a status code.
 *        @timeout_reason is used to report the reason for the timeout in that
 *        case.
 * @req_ie: Association request IEs (may be %NULL)
 * @req_ie_len: Association request IEs length
 * @resp_ie: Association response IEs (may be %NULL)
 * @resp_ie_len: Association response IEs length
 * @fils: FILS connection response parameters.
 * @timeout_reason: Reason for connection timeout. This is used when the
 *        connection fails due to a timeout instead of an explicit rejection from
 *        the AP. %NL80211_TIMEOUT_UNSPECIFIED is used when the timeout reason is
 *        not known. This value is used only if @status < 0 to indicate that the
 *        failure is due to a timeout and not due to explicit rejection by the AP.
 *        This value is ignored in other cases (@status >= 0).
 * @valid_links: For MLO connection, BIT mask of the valid link ids. Otherwise
 *        zero.
 * @ap_mld_addr: For MLO connection, MLD address of the AP. Otherwise %NULL.
 * @links: For MLO connection, contains link info for the valid links indicated
 *        using @valid_links. For non-MLO connection, links[0] contains the
 *        connected AP info.
 * @links.addr: For MLO connection, MAC address of the STA link. Otherwise
 *        %NULL.
 * @links.bssid: For MLO connection, MAC address of the AP link. For non-MLO
 *        connection, links[0].bssid points to the BSSID of the AP (may be %NULL).
 * @links.bss: For MLO connection, entry of bss to which STA link is connected.
 *        For non-MLO connection, links[0].bss points to entry of bss to which STA
 *        is connected. It can be obtained through cfg80211_get_bss() (may be
 *        %NULL). It is recommended to store the bss from the connect_request and
 *        hold a reference to it and return through this param to avoid a warning
 *        if the bss is expired during the connection, esp. for those drivers
 *        implementing connect op. Only one parameter among @links.bssid and
 *        @links.bss needs to be specified.
 * @links.status: per-link status code, to report a status code that's not
 *        %WLAN_STATUS_SUCCESS for a given link, it must also be in the
 *        @valid_links bitmap and may have a BSS pointer (which is then released)
 */
struct cfg80211_connect_resp_params {
        int status;
        const u8 *req_ie;
        size_t req_ie_len;
        const u8 *resp_ie;
        size_t resp_ie_len;
        struct cfg80211_fils_resp_params fils;
        enum nl80211_timeout_reason timeout_reason;

        const u8 *ap_mld_addr;
        u16 valid_links;
        struct {
                const u8 *addr;
                const u8 *bssid;
                struct cfg80211_bss *bss;
                u16 status;
        } links[IEEE80211_MLD_MAX_NUM_LINKS];
};

/**
 * cfg80211_connect_done - notify cfg80211 of connection result
 *
 * @dev: network device
 * @params: connection response parameters, see
 *        &struct cfg80211_connect_resp_params
 * @gfp: allocation flags
 *
 * This function should be called by the underlying driver once execution of
 * the connection request from connect() has been completed. This is similar to
 * cfg80211_connect_bss(), but takes a structure pointer for connection response
 * parameters. Only one of the functions among cfg80211_connect_bss(),
 * cfg80211_connect_result(), cfg80211_connect_timeout(),
 * and cfg80211_connect_done() should be called.
 */
void cfg80211_connect_done(struct net_device *dev,
                           struct cfg80211_connect_resp_params *params,
                           gfp_t gfp);

/**
 * cfg80211_connect_bss - notify cfg80211 of connection result
 *
 * @dev: network device
 * @bssid: the BSSID of the AP
 * @bss: Entry of bss to which STA got connected to, can be obtained through
 *        cfg80211_get_bss() (may be %NULL). But it is recommended to store the
 *        bss from the connect_request and hold a reference to it and return
 *        through this param to avoid a warning if the bss is expired during the
 *        connection, esp. for those drivers implementing connect op.
 *        Only one parameter among @bssid and @bss needs to be specified.
 * @req_ie: association request IEs (may be %NULL)
 * @req_ie_len: association request IEs length
 * @resp_ie: association response IEs (may be %NULL)
 * @resp_ie_len: assoc response IEs length
 * @status: status code, %WLAN_STATUS_SUCCESS for successful connection, use
 *        %WLAN_STATUS_UNSPECIFIED_FAILURE if your device cannot give you
 *        the real status code for failures. If this call is used to report a
 *        failure due to a timeout (e.g., not receiving an Authentication frame
 *        from the AP) instead of an explicit rejection by the AP, -1 is used to
 *        indicate that this is a failure, but without a status code.
 *        @timeout_reason is used to report the reason for the timeout in that
 *        case.
 * @gfp: allocation flags
 * @timeout_reason: reason for connection timeout. This is used when the
 *        connection fails due to a timeout instead of an explicit rejection from
 *        the AP. %NL80211_TIMEOUT_UNSPECIFIED is used when the timeout reason is
 *        not known. This value is used only if @status < 0 to indicate that the
 *        failure is due to a timeout and not due to explicit rejection by the AP.
 *        This value is ignored in other cases (@status >= 0).
 *
 * It should be called by the underlying driver once execution of the connection
 * request from connect() has been completed. This is similar to
 * cfg80211_connect_result(), but with the option of identifying the exact bss
 * entry for the connection. Only one of the functions among
 * cfg80211_connect_bss(), cfg80211_connect_result(),
 * cfg80211_connect_timeout(), and cfg80211_connect_done() should be called.
 */
static inline void
cfg80211_connect_bss(struct net_device *dev, const u8 *bssid,
                     struct cfg80211_bss *bss, const u8 *req_ie,
                     size_t req_ie_len, const u8 *resp_ie,
                     size_t resp_ie_len, int status, gfp_t gfp,
                     enum nl80211_timeout_reason timeout_reason)
{
        struct cfg80211_connect_resp_params resp;

        /* Start from an all-zero block so fields not set below stay 0/NULL. */
        memset(&resp, 0, sizeof(resp));

        /* Outcome of the connection attempt. */
        resp.status = status;
        resp.timeout_reason = timeout_reason;

        /* Association request and response IEs. */
        resp.req_ie = req_ie;
        resp.req_ie_len = req_ie_len;
        resp.resp_ie = resp_ie;
        resp.resp_ie_len = resp_ie_len;

        /* The AP is described through the first link entry. */
        resp.links[0].bssid = bssid;
        resp.links[0].bss = bss;

        cfg80211_connect_done(dev, &resp, gfp);
}

/**
 * cfg80211_connect_result - notify cfg80211 of connection result
 *
 * @dev: network device
 * @bssid: the BSSID of the AP
 * @req_ie: association request IEs (may be %NULL)
 * @req_ie_len: association request IEs length
 * @resp_ie: association response IEs (may be %NULL)
 * @resp_ie_len: assoc response IEs length
 * @status: status code, %WLAN_STATUS_SUCCESS for successful connection, use
 *        %WLAN_STATUS_UNSPECIFIED_FAILURE if your device cannot give you
 *        the real status code for failures.
 * @gfp: allocation flags
 *
 * It should be called by the underlying driver once execution of the connection
 * request from connect() has been completed. This is similar to
 * cfg80211_connect_bss() which allows the exact bss entry to be specified. Only
 * one of the functions among cfg80211_connect_bss(), cfg80211_connect_result(),
 * cfg80211_connect_timeout(), and cfg80211_connect_done() should be called.
 */
static inline void
cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
                        const u8 *req_ie, size_t req_ie_len,
                        const u8 *resp_ie, size_t resp_ie_len,
                        u16 status, gfp_t gfp)
{
        /* This variant supplies no bss entry; only the BSSID identifies the AP. */
        struct cfg80211_bss *no_bss = NULL;

        cfg80211_connect_bss(dev, bssid, no_bss,
                             req_ie, req_ie_len,
                             resp_ie, resp_ie_len,
                             status, gfp, NL80211_TIMEOUT_UNSPECIFIED);
}

/**
 * cfg80211_connect_timeout - notify cfg80211 of connection timeout
 *
 * @dev: network device
 * @bssid: the BSSID of the AP
 * @req_ie: association request IEs (may be %NULL)
 * @req_ie_len: association request IEs length
 * @gfp: allocation flags
 * @timeout_reason: reason for connection timeout.
 *
 * It should be called by the underlying driver whenever connect() has failed
 * in a sequence where no explicit authentication/association rejection was
 * received from the AP. This could happen, e.g., due to not being able to send
 * out the Authentication or Association Request frame or timing out while
 * waiting for the response. Only one of the functions among
 * cfg80211_connect_bss(), cfg80211_connect_result(),
 * cfg80211_connect_timeout(), and cfg80211_connect_done() should be called.
 */
static inline void
cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid,
                         const u8 *req_ie, size_t req_ie_len, gfp_t gfp,
                         enum nl80211_timeout_reason timeout_reason)
{
        /* A status of -1 reports a failure that carries no status code. */
        const int no_status = -1;

        /* No bss entry and no association response IEs are passed on. */
        cfg80211_connect_bss(dev, bssid, NULL,
                             req_ie, req_ie_len,
                             NULL, 0,
                             no_status, gfp, timeout_reason);
}

/**
 * struct cfg80211_roam_info - driver initiated roaming information
 *
 * @req_ie: association request IEs (may be %NULL)
 * @req_ie_len: association request IEs length
 * @resp_ie: association response IEs (may be %NULL)
 * @resp_ie_len: assoc response IEs length
 * @fils: FILS related roaming information.
 * @valid_links: For MLO roaming, BIT mask of the new valid links is set.
 *        Otherwise zero.
 * @ap_mld_addr: For MLO roaming, MLD address of the new AP. Otherwise %NULL.
 * @links: For MLO roaming, contains new link info for the valid links set in
 *        @valid_links. For non-MLO roaming, links[0] contains the new AP info.
 * @links.addr: For MLO roaming, MAC address of the STA link. Otherwise %NULL.
 * @links.bssid: For MLO roaming, MAC address of the new AP link. For non-MLO
 *        roaming, links[0].bssid points to the BSSID of the new AP. May be
 *        %NULL if @links.bss is set.
 * @links.channel: the channel of the new AP.
 * @links.bss: For MLO roaming, entry of new bss to which STA link got
 *        roamed. For non-MLO roaming, links[0].bss points to entry of bss to
 *        which STA got roamed (may be %NULL if @links.bssid is set)
 */
struct cfg80211_roam_info {
        const u8 *req_ie;
        size_t req_ie_len;
        const u8 *resp_ie;
        size_t resp_ie_len;
        struct cfg80211_fils_resp_params fils;

        const u8 *ap_mld_addr;
        u16 valid_links;
        struct {
                const u8 *addr;
                const u8 *bssid;
                struct ieee80211_channel *channel;
                struct cfg80211_bss *bss;
        } links[IEEE80211_MLD_MAX_NUM_LINKS];
};

/**
 * cfg80211_roamed - notify cfg80211 of roaming
 *
 * @dev: network device
 * @info: information about the new BSS, see &struct cfg80211_roam_info.
 * @gfp: allocation flags
 *
 * This function may be called with the driver passing either the BSSID of the
 * new AP or passing the bss entry to avoid a race in timeout of the bss entry.
 * It should be called by the underlying driver whenever it roamed from one AP
 * to another while connected. Drivers which have roaming implemented in
 * firmware should pass the bss entry to avoid a race in bss entry timeout where
 * the bss entry of the new AP is seen in the driver, but gets timed out by the
 * time it is accessed in __cfg80211_roamed() due to delay in scheduling
 * rdev->event_work. In case of any failures, the reference is released
 * either in cfg80211_roamed() or in __cfg80211_roamed(). Otherwise, it will be
 * released while disconnecting from the current bss.
 */
void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
                     gfp_t gfp);

/**
 * cfg80211_port_authorized - notify cfg80211 of successful security association
 *
 * @dev: network device
 * @peer_addr: BSSID of the AP/P2P GO in case of STA/GC or STA/GC MAC address
 *        in case of AP/P2P GO
 * @td_bitmap: transition disable policy
 * @td_bitmap_len: Length of transition disable policy
 * @gfp: allocation flags
 *
 * This function should be called by a driver that supports 4 way handshake
 * offload after a security association was successfully established (i.e.,
 * the 4 way handshake was completed successfully). The call to this function
 * should be preceded with a call to cfg80211_connect_result(),
 * cfg80211_connect_done(), cfg80211_connect_bss() or cfg80211_roamed() to
 * indicate the 802.11 association.
 * This function can also be called by AP/P2P GO driver that supports
 * authentication offload. In this case the @peer_addr passed is that of
 * associated STA/GC.
 */
void cfg80211_port_authorized(struct net_device *dev, const u8 *peer_addr,
                              const u8* td_bitmap, u8 td_bitmap_len, gfp_t gfp);

/**
 * cfg80211_disconnected - notify cfg80211 that connection was dropped
 *
 * @dev: network device
 * @reason: reason code for the disconnection, set it to 0 if unknown
 * @ie: information elements of the deauth/disassoc frame (may be %NULL)
 * @ie_len: length of IEs
 * @locally_generated: disconnection was requested locally
 * @gfp: allocation flags
 *
 * After it calls this function, the driver should enter an idle state
 * and not try to connect to any AP any more.
 */
void cfg80211_disconnected(struct net_device *dev, u16 reason,
                           const u8 *ie, size_t ie_len,
                           bool locally_generated, gfp_t gfp);

/**
 * cfg80211_ready_on_channel - notification of remain_on_channel start
 * @wdev: wireless device
 * @cookie: the request cookie
 * @chan: The current channel (from remain_on_channel request)
 * @duration: Duration in milliseconds that the driver intends to remain on the
 *        channel
 * @gfp: allocation flags
 */
void cfg80211_ready_on_channel(struct wireless_dev *wdev, u64 cookie,
                               struct ieee80211_channel *chan,
                               unsigned int duration, gfp_t gfp);

/**
 * cfg80211_remain_on_channel_expired - remain_on_channel duration expired
 * @wdev: wireless device
 * @cookie: the request cookie
 * @chan: The current channel (from remain_on_channel request)
 * @gfp: allocation flags
 */
void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie,
                                        struct ieee80211_channel *chan,
                                        gfp_t gfp);

/**
 * cfg80211_tx_mgmt_expired - tx_mgmt duration expired
 * @wdev: wireless device
 * @cookie: the request cookie
 * @chan: The current channel (from tx_mgmt request)
 * @gfp: allocation flags
 */
void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie,
                              struct ieee80211_channel *chan, gfp_t gfp);

/**
 * cfg80211_sinfo_alloc_tid_stats - allocate per-tid statistics.
 *
 * @sinfo: the station information
 * @gfp: allocation flags
 *
 * Return: 0 on success, non-zero on error.
 */
int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp);

/**
 * cfg80211_sinfo_release_content - release contents of station info
 * @sinfo: the station information
 *
 * Releases any potentially allocated sub-information of the station
 * information, but not the struct itself (since it's typically on
 * the stack.)
 */
static inline void cfg80211_sinfo_release_content(struct station_info *sinfo)
{
        /* pertid is the only member released here; the struct itself is kept. */
        const void *tid_stats = sinfo->pertid;

        kfree(tid_stats);
}

/**
 * cfg80211_new_sta - notify userspace about station
 *
 * @dev: the netdev
 * @mac_addr: the station's address
 * @sinfo: the station information
 * @gfp: allocation flags
 */
void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,
                      struct station_info *sinfo, gfp_t gfp);

/**
 * cfg80211_del_sta_sinfo - notify userspace about deletion of a station
 * @dev: the netdev
 * @mac_addr: the station's address. For MLD station, MLD address is used.
 * @sinfo: the station information/statistics (may be %NULL, as passed by
 *        cfg80211_del_sta())
 * @gfp: allocation flags
 */
void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr,
                            struct station_info *sinfo, gfp_t gfp);

/**
 * cfg80211_del_sta - notify userspace about deletion of a station
 *
 * @dev: the netdev
 * @mac_addr: the station's address. For MLD station, MLD address is used.
 * @gfp: allocation flags
 */
static inline void cfg80211_del_sta(struct net_device *dev,
                                    const u8 *mac_addr, gfp_t gfp)
{
        /* This shorthand reports the deletion without station statistics. */
        struct station_info *no_sinfo = NULL;

        cfg80211_del_sta_sinfo(dev, mac_addr, no_sinfo, gfp);
}

/**
 * cfg80211_conn_failed - connection request failed notification
 *
 * @dev: the netdev
 * @mac_addr: the station's address
 * @reason: the reason for connection failure
 * @gfp: allocation flags
 *
 * This function is called whenever a station tries to connect to an AP
 * and cannot do so because the AP has rejected the connection for some
 * reason.
 *
 * The reason for connection failure can be any of the values from
 * &enum nl80211_connect_failed_reason.
 */
void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
                          enum nl80211_connect_failed_reason reason,
                          gfp_t gfp);

/**
 * struct cfg80211_rx_info - received management frame info
 *
 * @freq: Frequency on which the frame was received in kHz
 * @sig_dbm: signal strength in dBm, or 0 if unknown
 * @have_link_id: indicates the frame was received on a link of
 *        an MLD, i.e. the @link_id field is valid
 * @link_id: the ID of the link the frame was received on
 * @buf: Management frame (header + body)
 * @len: length of the frame data
 * @flags: flags, as defined in &enum nl80211_rxmgmt_flags
 * @rx_tstamp: Hardware timestamp of frame RX in nanoseconds
 * @ack_tstamp: Hardware timestamp of ack TX in nanoseconds
 */
struct cfg80211_rx_info {
        int freq;
        int sig_dbm;
        bool have_link_id;
        u8 link_id;
        const u8 *buf;
        size_t len;
        u32 flags;
        u64 rx_tstamp;
        u64 ack_tstamp;
};

/**
 * cfg80211_rx_mgmt_ext - management frame notification with extended info
 * @wdev: wireless device receiving the frame
 * @info: RX info as defined in &struct cfg80211_rx_info
 *
 * This function is called whenever an Action frame is received for a station
 * mode interface, but is not processed in kernel.
 *
 * Return: %true if a user space application has registered for this frame.
 * For action frames, that makes it responsible for rejecting unrecognized
 * action frames; %false otherwise, in which case for action frames the
 * driver is responsible for rejecting the frame.
 */
bool cfg80211_rx_mgmt_ext(struct wireless_dev *wdev,
                          struct cfg80211_rx_info *info);

/**
 * cfg80211_rx_mgmt_khz - notification of received, unprocessed management frame
 * @wdev: wireless device receiving the frame
 * @freq: Frequency on which the frame was received in kHz
 * @sig_dbm: signal strength in dBm, or 0 if unknown
 * @buf: Management frame (header + body)
 * @len: length of the frame data
 * @flags: flags, as defined in enum nl80211_rxmgmt_flags
 *
 * This function is called whenever an Action frame is received for a station
 * mode interface, but is not processed in kernel.
 *
 * Return: %true if a user space application has registered for this frame.
 * For action frames, that makes it responsible for rejecting unrecognized
 * action frames; %false otherwise, in which case for action frames the
 * driver is responsible for rejecting the frame.
 */
static inline bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq,
                                        int sig_dbm, const u8 *buf, size_t len,
                                        u32 flags)
{
        /* Members not assigned below stay zero. */
        struct cfg80211_rx_info rx = { 0 };

        rx.freq = freq;
        rx.sig_dbm = sig_dbm;
        rx.buf = buf;
        rx.len = len;
        rx.flags = flags;

        return cfg80211_rx_mgmt_ext(wdev, &rx);
}

/**
 * cfg80211_rx_mgmt - notification of received, unprocessed management frame
 * @wdev: wireless device receiving the frame
 * @freq: Frequency on which the frame was received in MHz
 * @sig_dbm: signal strength in dBm, or 0 if unknown
 * @buf: Management frame (header + body)
 * @len: length of the frame data
 * @flags: flags, as defined in enum nl80211_rxmgmt_flags
 *
 * This function is called whenever an Action frame is received for a station
 * mode interface, but is not processed in kernel.
 *
 * Return: %true if a user space application has registered for this frame.
 * For action frames, that makes it responsible for rejecting unrecognized
 * action frames; %false otherwise, in which case for action frames the
 * driver is responsible for rejecting the frame.
 */
static inline bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq,
                                    int sig_dbm, const u8 *buf, size_t len,
                                    u32 flags)
{
        /* Members not assigned below stay zero. */
        struct cfg80211_rx_info rx = { 0 };

        /* The caller passes MHz, while the info structure carries kHz. */
        rx.freq = MHZ_TO_KHZ(freq);
        rx.sig_dbm = sig_dbm;
        rx.buf = buf;
        rx.len = len;
        rx.flags = flags;

        return cfg80211_rx_mgmt_ext(wdev, &rx);
}

/**
 * struct cfg80211_tx_status - TX status for management frame information
 *
 * @cookie: Cookie returned by cfg80211_ops::mgmt_tx()
 * @tx_tstamp: hardware TX timestamp in nanoseconds
 * @ack_tstamp: hardware ack RX timestamp in nanoseconds
 * @buf: Management frame (header + body)
 * @len: length of the frame data
 * @ack: Whether frame was acknowledged
 *
 * Note that cfg80211_mgmt_tx_status() only fills in @cookie, @buf, @len
 * and @ack; both timestamps are left zero-initialized in that case.
 */
struct cfg80211_tx_status {
        u64 cookie;
        u64 tx_tstamp;
        u64 ack_tstamp;
        const u8 *buf;
        size_t len;
        bool ack;
};

/**
 * cfg80211_mgmt_tx_status_ext - TX status notification with extended info
 * @wdev: wireless device receiving the frame
 * @status: TX status data, see &struct cfg80211_tx_status
 * @gfp: context flags
 *
 * This function is called whenever a management frame was requested to be
 * transmitted with cfg80211_ops::mgmt_tx() to report the TX status of the
 * transmission attempt with extended info.
 */
void cfg80211_mgmt_tx_status_ext(struct wireless_dev *wdev,
                                 struct cfg80211_tx_status *status, gfp_t gfp);

/**
 * cfg80211_mgmt_tx_status - notification of TX status for management frame
 * @wdev: wireless device receiving the frame
 * @cookie: Cookie returned by cfg80211_ops::mgmt_tx()
 * @buf: Management frame (header + body)
 * @len: length of the frame data
 * @ack: Whether frame was acknowledged
 * @gfp: context flags
 *
 * This function is called whenever a management frame was requested to be
 * transmitted with cfg80211_ops::mgmt_tx() to report the TX status of the
 * transmission attempt.
 */
static inline void cfg80211_mgmt_tx_status(struct wireless_dev *wdev,
                                           u64 cookie, const u8 *buf,
                                           size_t len, bool ack, gfp_t gfp)
{
        /* The timestamp members are not set here and stay zero. */
        struct cfg80211_tx_status tx = { 0 };

        tx.cookie = cookie;
        tx.buf = buf;
        tx.len = len;
        tx.ack = ack;

        cfg80211_mgmt_tx_status_ext(wdev, &tx, gfp);
}

/**
 * cfg80211_control_port_tx_status - notification of TX status for control
 *                                   port frames
 * @wdev: wireless device receiving the frame
 * @cookie: Cookie returned by cfg80211_ops::tx_control_port()
 * @buf: Data frame (header + body)
 * @len: length of the frame data
 * @ack: Whether frame was acknowledged
 * @gfp: context flags
 *
 * This function is called whenever a control port frame was requested to be
 * transmitted with cfg80211_ops::tx_control_port() to report the TX status of
 * the transmission attempt.
 */
void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie,
                                     const u8 *buf, size_t len, bool ack,
                                     gfp_t gfp);

/**
 * cfg80211_rx_control_port - notification about a received control port frame
 * @dev: The device the frame matched to
 * @skb: The skb with the control port frame.  It is assumed that the skb
 *        is 802.3 formatted (with 802.3 header).  The skb can be non-linear.
 *        This function does not take ownership of the skb, so the caller is
 *        responsible for any cleanup.  The caller must also ensure that
 *        skb->protocol is set appropriately.
 * @unencrypted: Whether the frame was received unencrypted
 * @link_id: the link the frame was received on, -1 if not applicable or unknown
 *
 * This function is used to inform userspace about a received control port
 * frame.  It should only be used if userspace indicated it wants to receive
 * control port frames over nl80211.
 *
 * The frame is the data portion of the 802.3 or 802.11 data frame with all
 * network layer headers removed (e.g. the raw EAPoL frame).
 *
 * Return: %true if the frame was passed to userspace
 */
bool cfg80211_rx_control_port(struct net_device *dev, struct sk_buff *skb,
                              bool unencrypted, int link_id);

/**
 * cfg80211_cqm_rssi_notify - connection quality monitoring RSSI event
 * @dev: network device
 * @rssi_event: the triggered RSSI event
 * @rssi_level: new RSSI level value or 0 if not available
 * @gfp: context flags
 *
 * This function is called when a configured connection quality monitoring
 * RSSI threshold reached event occurs.
 */
void cfg80211_cqm_rssi_notify(struct net_device *dev,
                              enum nl80211_cqm_rssi_threshold_event rssi_event,
                              s32 rssi_level, gfp_t gfp);

/**
 * cfg80211_cqm_pktloss_notify - notify userspace about packet loss to peer
 * @dev: network device
 * @peer: peer's MAC address
 * @num_packets: how many packets were lost -- should be a fixed threshold
 *        but probably no less than maybe 50, or maybe a throughput dependent
 *        threshold (to account for temporary interference)
 * @gfp: context flags
 */
void cfg80211_cqm_pktloss_notify(struct net_device *dev,
                                 const u8 *peer, u32 num_packets, gfp_t gfp);

/**
 * cfg80211_cqm_txe_notify - TX error rate event
 * @dev: network device
 * @peer: peer's MAC address
 * @num_packets: how many packets were lost
 * @rate: % of packets which failed transmission
 * @intvl: interval (in s) over which the TX failure threshold was breached.
 * @gfp: context flags
 *
 * Notify userspace when configured % TX failures over number of packets in a
 * given interval is exceeded.
 */
void cfg80211_cqm_txe_notify(struct net_device *dev, const u8 *peer,
                             u32 num_packets, u32 rate, u32 intvl, gfp_t gfp);

/**
 * cfg80211_cqm_beacon_loss_notify - beacon loss event
 * @dev: network device
 * @gfp: context flags
 *
 * Notify userspace about beacon loss from the connected AP.
 */
void cfg80211_cqm_beacon_loss_notify(struct net_device *dev, gfp_t gfp);

/**
 * __cfg80211_radar_event - radar detection event
 * @wiphy: the wiphy
 * @chandef: chandef for the current channel
 * @offchan: the radar has been detected on the offchannel chain
 * @gfp: context flags
 *
 * This function is called when a radar is detected on the current channel.
 * Drivers use the cfg80211_radar_event() and
 * cfg80211_background_radar_event() wrappers, which pass %false and %true
 * for @offchan respectively.
 */
void __cfg80211_radar_event(struct wiphy *wiphy,
                            struct cfg80211_chan_def *chandef,
                            bool offchan, gfp_t gfp);

/* Report a radar event that was not detected on the offchannel chain. */
static inline void
cfg80211_radar_event(struct wiphy *wiphy,
                     struct cfg80211_chan_def *chandef,
                     gfp_t gfp)
{
        const bool offchan = false;

        __cfg80211_radar_event(wiphy, chandef, offchan, gfp);
}

/* Report a radar event that was detected on the offchannel chain. */
static inline void
cfg80211_background_radar_event(struct wiphy *wiphy,
                                struct cfg80211_chan_def *chandef,
                                gfp_t gfp)
{
        const bool offchan = true;

        __cfg80211_radar_event(wiphy, chandef, offchan, gfp);
}

/**
 * cfg80211_sta_opmode_change_notify - STA's ht/vht operation mode change event
 * @dev: network device
 * @mac: MAC address of the station whose opmode was modified
 * @sta_opmode: station's current opmode value
 * @gfp: context flags
 *
 * The driver should call this function when a station's opmode is modified
 * via an action frame.
 */
void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac,
                                       struct sta_opmode_info *sta_opmode,
                                       gfp_t gfp);

/**
 * cfg80211_cac_event - Channel availability check (CAC) event
 * @netdev: network device
 * @chandef: chandef for the current channel
 * @event: type of event
 * @gfp: context flags
 *
 * This function is called when a Channel availability check (CAC) is finished
 * or aborted. This must be called to notify the completion of a CAC process,
 * also by full-MAC drivers.
 */
void cfg80211_cac_event(struct net_device *netdev,
                        const struct cfg80211_chan_def *chandef,
                        enum nl80211_radar_event event, gfp_t gfp);

/**
 * cfg80211_background_cac_abort - Channel Availability Check offchan abort event
 * @wiphy: the wiphy
 *
 * This function is called by the driver when a Channel Availability Check
 * (CAC) is aborted by an offchannel dedicated chain.
 */
void cfg80211_background_cac_abort(struct wiphy *wiphy);

/**
 * cfg80211_gtk_rekey_notify - notify userspace about driver rekeying
 * @dev: network device
 * @bssid: BSSID of AP (to avoid races)
 * @replay_ctr: new replay counter
 * @gfp: allocation flags
 */
void cfg80211_gtk_rekey_notify(struct net_device *dev, const u8 *bssid,
                               const u8 *replay_ctr, gfp_t gfp);

/**
 * cfg80211_pmksa_candidate_notify - notify about PMKSA caching candidate
 * @dev: network device
 * @index: candidate index (the smaller the index, the higher the priority)
 * @bssid: BSSID of AP
 * @preauth: Whether AP advertises support for RSN pre-authentication
 * @gfp: allocation flags
 */
void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index,
                                     const u8 *bssid, bool preauth, gfp_t gfp);

/**
 * cfg80211_rx_spurious_frame - inform userspace about a spurious frame
 * @dev: The device the frame matched to
 * @addr: the transmitter address
 * @gfp: context flags
 *
 * This function is used in AP mode (only!) to inform userspace that
 * a spurious class 3 frame was received, to be able to deauth the
 * sender.
 *
 * Return: %true if the frame was passed to userspace (or this failed
 * for a reason other than not having a subscription.)
 */
bool cfg80211_rx_spurious_frame(struct net_device *dev,
                                const u8 *addr, gfp_t gfp);

/**
 * cfg80211_rx_unexpected_4addr_frame - inform about unexpected WDS frame
 * @dev: The device the frame matched to
 * @addr: the transmitter address
 * @gfp: context flags
 *
 * This function is used in AP mode (only!) to inform userspace that
 * an associated station sent a 4addr frame but that wasn't expected.
 * It is allowed and desirable to send this event only once for each
 * station to avoid event flooding.
 *
 * Return: %true if the frame was passed to userspace (or this failed
 * for a reason other than not having a subscription.)
 */
bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev,
                                        const u8 *addr, gfp_t gfp);

/**
 * cfg80211_probe_status - notify userspace about probe status
 * @dev: the device the probe was sent on
 * @addr: the address of the peer
 * @cookie: the cookie filled in @probe_client previously
 * @acked: indicates whether probe was acked or not
 * @ack_signal: signal strength (in dBm) of the ACK frame.
 * @is_valid_ack_signal: indicates the ack_signal is valid or not.
 * @gfp: allocation flags
 */
void cfg80211_probe_status(struct net_device *dev, const u8 *addr,
                           u64 cookie, bool acked, s32 ack_signal,
                           bool is_valid_ack_signal, gfp_t gfp);

/**
 * cfg80211_report_obss_beacon_khz - report beacon from other APs
 * @wiphy: The wiphy that received the beacon
 * @frame: the frame
 * @len: length of the frame
 * @freq: frequency the frame was received on in KHz
 * @sig_dbm: signal strength in dBm, or 0 if unknown
 *
 * Use this function to report to userspace when a beacon was
 * received. It is not useful to call this when there is no
 * netdev that is in AP/GO mode.
 */
void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame,
                                     size_t len, int freq, int sig_dbm);

/**
 * cfg80211_report_obss_beacon - report beacon from other APs
 * @wiphy: The wiphy that received the beacon
 * @frame: the frame
 * @len: length of the frame
 * @freq: frequency the frame was received on in MHz
 * @sig_dbm: signal strength in dBm, or 0 if unknown
 *
 * MHz flavour of cfg80211_report_obss_beacon_khz(): @freq is converted
 * with MHZ_TO_KHZ() and all other arguments are forwarded unchanged.
 *
 * Use this function to report to userspace when a beacon was
 * received. It is not useful to call this when there is no
 * netdev that is in AP/GO mode.
 */
static inline void cfg80211_report_obss_beacon(struct wiphy *wiphy,
                                               const u8 *frame, size_t len,
                                               int freq, int sig_dbm)
{
        int freq_khz = MHZ_TO_KHZ(freq);

        cfg80211_report_obss_beacon_khz(wiphy, frame, len, freq_khz, sig_dbm);
}

/**
 * cfg80211_reg_can_beacon - check if beaconing is allowed
 * @wiphy: the wiphy
 * @chandef: the channel definition
 * @iftype: interface type
 *
 * Return: %true if there is no secondary channel or the secondary channel(s)
 * can be used for beaconing (i.e. is not a radar channel etc.)
 */
bool cfg80211_reg_can_beacon(struct wiphy *wiphy,
                             struct cfg80211_chan_def *chandef,
                             enum nl80211_iftype iftype);

/**
 * cfg80211_reg_can_beacon_relax - check if beaconing is allowed with relaxation
 * @wiphy: the wiphy
 * @chandef: the channel definition
 * @iftype: interface type
 *
 * Return: %true if there is no secondary channel or the secondary channel(s)
 * can be used for beaconing (i.e. is not a radar channel etc.). This version
 * also checks if IR-relaxation conditions apply, to allow beaconing under
 * more permissive conditions.
 *
 * Context: Requires the wiphy mutex to be held.
 */
bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy,
                                   struct cfg80211_chan_def *chandef,
                                   enum nl80211_iftype iftype);

/**
 * cfg80211_ch_switch_notify - update wdev channel and notify userspace
 * @dev: the device which switched channels
 * @chandef: the new channel definition
 * @link_id: the link ID for MLO, must be 0 for non-MLO
 *
 * Caller must hold wiphy mutex, therefore must only be called from sleepable
 * driver context!
 */
void cfg80211_ch_switch_notify(struct net_device *dev,
                               struct cfg80211_chan_def *chandef,
                               unsigned int link_id);

/**
 * cfg80211_ch_switch_started_notify - notify channel switch start
 * @dev: the device on which the channel switch started
 * @chandef: the future channel definition
 * @link_id: the link ID for MLO, must be 0 for non-MLO
 * @count: the number of TBTTs until the channel switch happens
 * @quiet: whether or not immediate quiet was requested by the AP
 *
 * Inform the userspace about the channel switch that has just
 * started, so that it can take appropriate actions (eg. starting
 * channel switch on other vifs), if necessary.
 */
void cfg80211_ch_switch_started_notify(struct net_device *dev,
                                       struct cfg80211_chan_def *chandef,
                                       unsigned int link_id, u8 count,
                                       bool quiet);

/**
 * ieee80211_operating_class_to_band - convert operating class to band
 *
 * @operating_class: the operating class to convert
 * @band: band pointer to fill
 *
 * Return: %true if the conversion was successful, %false otherwise.
 */
bool ieee80211_operating_class_to_band(u8 operating_class,
                                       enum nl80211_band *band);

/**
 * ieee80211_operating_class_to_chandef - convert operating class to chandef
 *
 * @operating_class: the operating class to convert
 * @chan: the ieee80211_channel to convert
 * @chandef: a pointer to the resulting chandef
 *
 * Return: %true if the conversion was successful, %false otherwise.
 */
bool ieee80211_operating_class_to_chandef(u8 operating_class,
                                          struct ieee80211_channel *chan,
                                          struct cfg80211_chan_def *chandef);

/**
 * ieee80211_chandef_to_operating_class - convert chandef to operating class
 *
 * @chandef: the chandef to convert
 * @op_class: a pointer to the resulting operating class
 *
 * Return: %true if the conversion was successful, %false otherwise.
 */
bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
                                          u8 *op_class);

/**
 * ieee80211_chandef_to_khz - convert chandef to frequency in KHz
 *
 * @chandef: the chandef to convert
 *
 * The result is @chandef->center_freq1 scaled by MHZ_TO_KHZ() with
 * @chandef->freq1_offset added on top.
 *
 * Return: the center frequency of chandef (1st segment) in KHz.
 */
static inline u32
ieee80211_chandef_to_khz(const struct cfg80211_chan_def *chandef)
{
        u32 khz = MHZ_TO_KHZ(chandef->center_freq1);

        khz += chandef->freq1_offset;

        return khz;
}

/**
 * cfg80211_tdls_oper_request - request userspace to perform TDLS operation
 * @dev: the device on which the operation is requested
 * @peer: the MAC address of the peer device
 * @oper: the requested TDLS operation (NL80211_TDLS_SETUP or
 *        NL80211_TDLS_TEARDOWN)
 * @reason_code: the reason code for teardown request
 * @gfp: allocation flags
 *
 * This function is used to request userspace to perform TDLS operation that
 * requires knowledge of keys, i.e., link setup or teardown when the AP
 * connection uses encryption. This is optional mechanism for the driver to use
 * if it can automatically determine when a TDLS link could be useful (e.g.,
 * based on traffic and signal strength for a peer).
 */
void cfg80211_tdls_oper_request(struct net_device *dev, const u8 *peer,
                                enum nl80211_tdls_operation oper,
                                u16 reason_code, gfp_t gfp);

/**
 * cfg80211_calculate_bitrate - calculate actual bitrate (in 100Kbps units)
 * @rate: given rate_info to calculate bitrate from
 *
 * Return: calculated bitrate
 */
u32 cfg80211_calculate_bitrate(struct rate_info *rate);

/**
 * cfg80211_unregister_wdev - remove the given wdev
 * @wdev: struct wireless_dev to remove
 *
 * This function removes the device so it can no longer be used. It is necessary
 * to call this function even when cfg80211 requests the removal of the device
 * by calling the del_virtual_intf() callback. The function must also be called
 * when the driver wishes to unregister the wdev, e.g. when the hardware device
 * is unbound from the driver.
 *
 * Context: Requires the RTNL and wiphy mutex to be held.
 */
void cfg80211_unregister_wdev(struct wireless_dev *wdev);

/**
 * cfg80211_register_netdevice - register the given netdev
 * @dev: the netdev to register
 *
 * Note: In contexts coming from cfg80211 callbacks, you must call this rather
 * than register_netdevice(), register_netdev() is impossible as the RTNL is
 * held. Otherwise, both register_netdevice() and register_netdev() are usable
 * instead as well.
 *
 * Context: Requires the RTNL and wiphy mutex to be held.
 *
 * Return: 0 on success. Non-zero on error.
 */
int cfg80211_register_netdevice(struct net_device *dev);

/**
 * cfg80211_unregister_netdevice - unregister the given netdev
 * @dev: the netdev to unregister
 *
 * Hands the wireless device attached to @dev (dev->ieee80211_ptr) to
 * cfg80211_unregister_wdev(). When CONFIG_CFG80211 is not enabled the
 * body is compiled out and this function does nothing.
 *
 * Note: In contexts coming from cfg80211 callbacks, you must call this rather
 * than unregister_netdevice(), unregister_netdev() is impossible as the RTNL
 * is held. Otherwise, both unregister_netdevice() and unregister_netdev() are
 * usable instead as well.
 *
 * Context: Requires the RTNL and wiphy mutex to be held.
 */
static inline void cfg80211_unregister_netdevice(struct net_device *dev)
{
#if IS_ENABLED(CONFIG_CFG80211)
        struct wireless_dev *wdev = dev->ieee80211_ptr;

        cfg80211_unregister_wdev(wdev);
#endif
}

/**
 * struct cfg80211_ft_event_params - FT Information Elements
 * @ies: buffer holding the FT IEs
 * @ies_len: length of the FT IE buffer (@ies) in bytes
 * @target_ap: target AP's MAC address
 * @ric_ies: buffer holding the RIC IEs
 * @ric_ies_len: length of the RIC IE buffer (@ric_ies) in bytes
 *
 * Parameter block handed to cfg80211_ft_event(), which notifies userspace
 * about the FT IE and RIC IE.
 */
struct cfg80211_ft_event_params {
        const u8 *ies;
        size_t ies_len;
        const u8 *target_ap;
        const u8 *ric_ies;
        size_t ric_ies_len;
};

/**
 * cfg80211_ft_event - notify userspace about FT IE and RIC IE
 * @netdev: network device
 * @ft_event: IE information
 */
void cfg80211_ft_event(struct net_device *netdev,
                       struct cfg80211_ft_event_params *ft_event);

/**
 * cfg80211_get_p2p_attr - find and copy a P2P attribute from IE buffer
 * @ies: the input IE buffer
 * @len: the input length
 * @attr: the attribute ID to find
 * @buf: output buffer, can be %NULL if the data isn't needed, e.g.
 *        if the function is only called to get the needed buffer size
 * @bufsize: size of the output buffer
 *
 * The function finds a given P2P attribute in the (vendor) IEs and
 * copies its contents to the given buffer.
 *
 * Return: A negative error code (-%EILSEQ or -%ENOENT) if the data is
 * malformed or the attribute can't be found (respectively), or the
 * length of the found attribute (which can be zero).
 */
int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len,
                          enum ieee80211_p2p_attr_id attr,
                          u8 *buf, unsigned int bufsize);

/**
 * ieee80211_ie_split_ric - split an IE buffer according to ordering (with RIC)
 * @ies: the IE buffer
 * @ielen: the length of the IE buffer
 * @ids: an array with element IDs that are allowed before
 *        the split. A WLAN_EID_EXTENSION value means that the next
 *        EID in the list is a sub-element of the EXTENSION IE.
 * @n_ids: the size of the element ID array
 * @after_ric: array of IE types that come after the RIC element
 * @n_after_ric: size of the @after_ric array
 * @offset: offset where to start splitting in the buffer
 *
 * This function splits an IE buffer by updating the @offset
 * variable to point to the location where the buffer should be
 * split.
 *
 * It assumes that the given IE buffer is well-formed, this
 * has to be guaranteed by the caller!
 *
 * It also assumes that the IEs in the buffer are ordered
 * correctly, if not the result of using this function will not
 * be ordered correctly either, i.e. it does no reordering.
 *
 * Return: The offset where the next part of the buffer starts, which
 * may be @ielen if the entire (remainder) of the buffer should be
 * used.
 */
size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
                              const u8 *ids, int n_ids,
                              const u8 *after_ric, int n_after_ric,
                              size_t offset);

/**
 * ieee80211_ie_split - split an IE buffer according to ordering
 * @ies: the IE buffer
 * @ielen: the length of the IE buffer
 * @ids: an array with element IDs that are allowed before
 *        the split. A WLAN_EID_EXTENSION value means that the next
 *        EID in the list is a sub-element of the EXTENSION IE.
 * @n_ids: the size of the element ID array
 * @offset: offset where to start splitting in the buffer
 *
 * Variant of ieee80211_ie_split_ric() without RIC handling: a %NULL
 * after-RIC array of size 0 is passed and every other argument is
 * forwarded unchanged. The location where the buffer should be split
 * is reported through the return value.
 *
 * The given IE buffer is assumed to be well-formed; the caller has to
 * guarantee this!
 *
 * The IEs in the buffer are also assumed to be ordered correctly. No
 * reordering is done, so an incorrectly ordered buffer gives a result
 * that is not ordered correctly either.
 *
 * Return: The offset where the next part of the buffer starts, which
 * may be @ielen if the entire (remainder) of the buffer should be
 * used.
 */
static inline size_t ieee80211_ie_split(const u8 *ies, size_t ielen,
                                        const u8 *ids, int n_ids, size_t offset)
{
        const u8 *no_after_ric = NULL;

        return ieee80211_ie_split_ric(ies, ielen, ids, n_ids,
                                      no_after_ric, 0, offset);
}

/**
 * ieee80211_fragment_element - fragment the last element in skb
 * @skb: The skbuf that the element was added to
 * @len_pos: Pointer to length of the element to fragment
 * @frag_id: The element ID to use for fragments
 *
 * This function fragments all data after @len_pos, adding fragmentation
 * elements with the given ID as appropriate. The SKB will grow in size
 * accordingly.
 */
void ieee80211_fragment_element(struct sk_buff *skb, u8 *len_pos, u8 frag_id);

/**
 * cfg80211_report_wowlan_wakeup - report wakeup from WoWLAN
 * @wdev: the wireless device reporting the wakeup
 * @wakeup: the wakeup report
 * @gfp: allocation flags
 *
 * This function reports that the given device woke up. If it
 * caused the wakeup, report the reason(s), otherwise you may
 * pass %NULL as the @wakeup parameter to advertise that something
 * else caused the wakeup.
 */
void cfg80211_report_wowlan_wakeup(struct wireless_dev *wdev,
                                   struct cfg80211_wowlan_wakeup *wakeup,
                                   gfp_t gfp);

/**
 * cfg80211_crit_proto_stopped() - indicate critical protocol stopped by driver.
 *
 * @wdev: the wireless device for which critical protocol is stopped.
 * @gfp: allocation flags
 *
 * This function can be called by the driver to indicate it has reverted
 * operation back to normal. One reason could be that the duration given
 * by .crit_proto_start() has expired.
 */
void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp);

/**
 * ieee80211_get_num_supported_channels - get number of channels device has
 * @wiphy: the wiphy
 *
 * Return: the number of channels supported by the device.
 */
unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy);

/**
 * cfg80211_check_combinations - check interface combinations
 *
 * @wiphy: the wiphy
 * @params: the interface combinations parameter
 *
 * This function can be called by the driver to check whether a
 * combination of interfaces and their types are allowed according to
 * the interface combinations.
 *
 * Return: 0 if combinations are allowed. Non-zero on error.
 */
int cfg80211_check_combinations(struct wiphy *wiphy,
                                struct iface_combination_params *params);

/**
 * cfg80211_iter_combinations - iterate over matching combinations
 *
 * @wiphy: the wiphy
 * @params: the interface combinations parameter
 * @iter: function to call for each matching combination
 * @data: pointer to pass to iter function
 *
 * This function can be called by the driver to check what possible
 * combinations it fits in at a given moment, e.g. for channel switching
 * purposes.
 *
 * Return: 0 on success. Non-zero on error.
 */
int cfg80211_iter_combinations(struct wiphy *wiphy,
                               struct iface_combination_params *params,
                               void (*iter)(const struct ieee80211_iface_combination *c,
                                            void *data),
                               void *data);

/**
 * cfg80211_stop_iface - trigger interface disconnection
 *
 * @wiphy: the wiphy
 * @wdev: wireless device
 * @gfp: context flags
 *
 * Trigger interface to be stopped as if AP was stopped, IBSS/mesh left, STA
 * disconnected.
 *
 * Note: This doesn't need any locks and is asynchronous.
 */
void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev,
                         gfp_t gfp);

/**
 * cfg80211_shutdown_all_interfaces - shut down all interfaces for a wiphy
 * @wiphy: the wiphy to shut down
 *
 * This function shuts down all interfaces belonging to this wiphy by
 * calling dev_close() (and treating non-netdev interfaces as needed).
 * It shouldn't really be used unless there are some fatal device errors
 * that really can't be recovered in any other way.
 *
 * Callers must hold the RTNL and be able to deal with callbacks into
 * the driver while the function is running.
 */
void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy);

/**
 * wiphy_ext_feature_set - set the extended feature flag
 *
 * @wiphy: the wiphy to modify.
 * @ftidx: extended feature bit index.
 *
 * The extended features are flagged in multiple bytes (see
 * &struct wiphy.@ext_features): the flag lives in byte @ftidx / 8,
 * at bit position @ftidx % 8 within that byte.
 */
static inline void wiphy_ext_feature_set(struct wiphy *wiphy,
                                         enum nl80211_ext_feature_index ftidx)
{
        wiphy->ext_features[ftidx / 8] |= BIT(ftidx % 8);
}

/**
 * wiphy_ext_feature_isset - check the extended feature flag
 *
 * @wiphy: the wiphy to check.
 * @ftidx: extended feature bit index.
 *
 * The extended features are flagged in multiple bytes (see
 * &struct wiphy.@ext_features): the flag lives in byte @ftidx / 8,
 * at bit position @ftidx % 8 within that byte.
 *
 * Return: %true if extended feature flag is set, %false otherwise
 */
static inline bool
wiphy_ext_feature_isset(struct wiphy *wiphy,
                        enum nl80211_ext_feature_index ftidx)
{
        return !!(wiphy->ext_features[ftidx / 8] & BIT(ftidx % 8));
}

/**
 * cfg80211_free_nan_func - free NAN function
 * @f: NAN function that should be freed
 *
 * Frees the NAN function and all its allocated members.
 */
void cfg80211_free_nan_func(struct cfg80211_nan_func *f);

/**
 * struct cfg80211_nan_match_params - NAN match parameters
 * @type: the type of the function that triggered a match. If it is
 *         %NL80211_NAN_FUNC_SUBSCRIBE it means that we replied to a subscriber.
 *         If it is %NL80211_NAN_FUNC_PUBLISH, it means that we got a discovery
 *         result.
 *         If it is %NL80211_NAN_FUNC_FOLLOW_UP, we received a follow up.
 * @inst_id: the local instance id
 * @peer_inst_id: the instance id of the peer's function
 * @addr: the MAC address of the peer
 * @info_len: the length of @info
 * @info: the Service Specific Info from the peer (if any)
 * @cookie: unique identifier of the corresponding function
 *
 * Parameter block passed to cfg80211_nan_match() to report a match for a
 * NAN function.
 */
struct cfg80211_nan_match_params {
        enum nl80211_nan_function_type type;
        u8 inst_id;
        u8 peer_inst_id;
        const u8 *addr;
        u8 info_len;
        const u8 *info;
        u64 cookie;
};

/**
 * cfg80211_nan_match - report a match for a NAN function.
 * @wdev: the wireless device reporting the match
 * @match: match notification parameters
 * @gfp: allocation flags
 *
 * This function reports that a NAN function had a match. This
 * can be a subscribe that had a match or a solicited publish that
 * was sent. It can also be a follow up that was received.
 */
void cfg80211_nan_match(struct wireless_dev *wdev,
                        struct cfg80211_nan_match_params *match, gfp_t gfp);

/**
 * cfg80211_nan_func_terminated - notify about NAN function termination.
 *
 * @wdev: the wireless device reporting the match
 * @inst_id: the local instance id
 * @reason: termination reason (one of the NL80211_NAN_FUNC_TERM_REASON_*)
 * @cookie: unique NAN function identifier
 * @gfp: allocation flags
 *
 * This function reports that a NAN function is terminated.
 */
void cfg80211_nan_func_terminated(struct wireless_dev *wdev,
                                  u8 inst_id,
                                  enum nl80211_nan_func_term_reason reason,
                                  u64 cookie, gfp_t gfp);

/* ethtool helper */
void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info);

/**
 * cfg80211_external_auth_request - userspace request for authentication
 * @netdev: network device
 * @params: External authentication parameters
 * @gfp: allocation flags
 * Returns: 0 on success, < 0 on error
 */
int cfg80211_external_auth_request(struct net_device *netdev,
                                   struct cfg80211_external_auth_params *params,
                                   gfp_t gfp);

/**
 * cfg80211_pmsr_report - report peer measurement result data
 * @wdev: the wireless device reporting the measurement
 * @req: the original measurement request
 * @result: the result data
 * @gfp: allocation flags
 */
void cfg80211_pmsr_report(struct wireless_dev *wdev,
                          struct cfg80211_pmsr_request *req,
                          struct cfg80211_pmsr_result *result,
                          gfp_t gfp);

/**
 * cfg80211_pmsr_complete - report peer measurement completed
 * @wdev: the wireless device reporting the measurement
 * @req: the original measurement request
 * @gfp: allocation flags
 *
 * Report that the entire measurement completed, after this
 * the request pointer will no longer be valid.
 */
void cfg80211_pmsr_complete(struct wireless_dev *wdev,
                            struct cfg80211_pmsr_request *req,
                            gfp_t gfp);

/**
 * cfg80211_iftype_allowed - check whether the interface can be allowed
 * @wiphy: the wiphy
 * @iftype: interface type
 * @is_4addr: use_4addr flag, must be '0' when check_swif is '1'
 * @check_swif: check iftype against software interfaces
 *
 * Check whether the interface is allowed to operate; additionally, this API
 * can be used to check iftype against the software interfaces when
 * check_swif is '1'.
 *
 * Return: %true if allowed, %false otherwise
 */
bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype,
                             bool is_4addr, u8 check_swif);


/**
 * cfg80211_assoc_comeback - notification of association that was
 * temporarily rejected with a comeback
 * @netdev: network device
 * @ap_addr: AP (MLD) address that rejected the association
 * @timeout: timeout interval value in TUs.
 *
 * This function may sleep. The caller must hold the corresponding wdev's mutex.
 */
void cfg80211_assoc_comeback(struct net_device *netdev,
                             const u8 *ap_addr, u32 timeout);

/* Logging, debugging and troubleshooting/diagnostic helpers. */

/*
 * wiphy_printk helpers, similar to dev_printk
 *
 * Every helper forwards its format and arguments to the dev_*() call of
 * the same level, using the wiphy's dev member (&(wiphy)->dev) as the
 * device to log against.
 */

#define wiphy_printk(level, wiphy, format, ...)                        \
        dev_printk(level, &(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_emerg(wiphy, format, ...)                                \
        dev_emerg(&(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_alert(wiphy, format, ...)                                \
        dev_alert(&(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_crit(wiphy, format, ...)                                \
        dev_crit(&(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_err(wiphy, format, ...)                                \
        dev_err(&(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_warn(wiphy, format, ...)                                \
        dev_warn(&(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_notice(wiphy, format, ...)                        \
        dev_notice(&(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_info(wiphy, format, ...)                                \
        dev_info(&(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_info_once(wiphy, format, ...)                        \
        dev_info_once(&(wiphy)->dev, format, ##__VA_ARGS__)

/* Rate-limited variants of wiphy_err() and wiphy_warn() */
#define wiphy_err_ratelimited(wiphy, format, ...)                \
        dev_err_ratelimited(&(wiphy)->dev, format, ##__VA_ARGS__)
#define wiphy_warn_ratelimited(wiphy, format, ...)                \
        dev_warn_ratelimited(&(wiphy)->dev, format, ##__VA_ARGS__)

/* Goes through wiphy_printk() at KERN_DEBUG level */
#define wiphy_debug(wiphy, format, ...)                                \
        wiphy_printk(KERN_DEBUG, wiphy, format, ##__VA_ARGS__)

/* Goes through dev_dbg() */
#define wiphy_dbg(wiphy, format, ...)                                \
        dev_dbg(&(wiphy)->dev, format, ##__VA_ARGS__)

/*
 * wiphy_vdbg() is wiphy_dbg() when VERBOSE_DEBUG is defined. Otherwise
 * the print call sits behind a constant-false condition, so it is never
 * executed, and the whole macro evaluates to 0.
 */
#ifdef VERBOSE_DEBUG
#define wiphy_vdbg        wiphy_dbg
#else
#define wiphy_vdbg(wiphy, format, ...)                                        \
({                                                                        \
        if (0)                                                                \
                wiphy_printk(KERN_DEBUG, wiphy, format, ##__VA_ARGS__);        \
        0;                                                                \
})
#endif

/*
 * wiphy_WARN() acts like wiphy_printk(), but with the key difference
 * of using a WARN/WARN_ON to get the message out, including the
 * file/line information and a backtrace.
 *
 * The message is prefixed with "wiphy: <name>" taken from wiphy_name().
 *
 * The expansion carries no trailing semicolon; the caller supplies it.
 * This keeps the macro usable as the unbraced body of an if/else and
 * allows the value of WARN() to be used in an expression.
 */
#define wiphy_WARN(wiphy, format, args...)                        \
        WARN(1, "wiphy: %s\n" format, wiphy_name(wiphy), ##args)

/**
 * cfg80211_update_owe_info_event - Notify the peer's OWE info to user space
 * @netdev: network device
 * @owe_info: peer's owe info
 * @gfp: allocation flags
 */
void cfg80211_update_owe_info_event(struct net_device *netdev,
                                    struct cfg80211_update_owe_info *owe_info,
                                    gfp_t gfp);

/**
 * cfg80211_bss_flush - resets all the scan entries
 * @wiphy: the wiphy
 */
void cfg80211_bss_flush(struct wiphy *wiphy);

/**
 * cfg80211_bss_color_notify - notify about bss color event
 * @dev: network device
 * @cmd: the actual event we want to notify
 * @count: the number of TBTTs until the color change happens
 * @color_bitmap: representations of the colors that the local BSS is aware of
 * @link_id: valid link_id in case of MLO or 0 for non-MLO.
 *
 * Return: 0 on success. Non-zero on error.
 */
int cfg80211_bss_color_notify(struct net_device *dev,
                              enum nl80211_commands cmd, u8 count,
                              u64 color_bitmap, u8 link_id);

/**
 * cfg80211_obss_color_collision_notify - notify about bss color collision
 * @dev: network device
 * @color_bitmap: representations of the colors that the local BSS is aware of
 * @link_id: valid link_id in case of MLO or 0 for non-MLO.
 *
 * Calls cfg80211_bss_color_notify() with %NL80211_CMD_OBSS_COLOR_COLLISION
 * and a count of 0.
 *
 * Return: 0 on success. Non-zero on error.
 */
static inline int cfg80211_obss_color_collision_notify(struct net_device *dev,
                                                       u64 color_bitmap,
                                                       u8 link_id)
{
        const u8 count = 0;

        return cfg80211_bss_color_notify(dev, NL80211_CMD_OBSS_COLOR_COLLISION,
                                         count, color_bitmap, link_id);
}

/**
 * cfg80211_color_change_started_notify - notify color change start
 * @dev: the device on which the color is switched
 * @count: the number of TBTTs until the color change happens
 * @link_id: valid link_id in case of MLO or 0 for non-MLO.
 *
 * Inform the userspace about the color change that has started. This
 * calls cfg80211_bss_color_notify() with %NL80211_CMD_COLOR_CHANGE_STARTED
 * and a color bitmap of 0.
 *
 * Return: 0 on success. Non-zero on error.
 */
static inline int cfg80211_color_change_started_notify(struct net_device *dev,
                                                       u8 count, u8 link_id)
{
        const u64 color_bitmap = 0;

        return cfg80211_bss_color_notify(dev, NL80211_CMD_COLOR_CHANGE_STARTED,
                                         count, color_bitmap, link_id);
}

/**
 * cfg80211_color_change_aborted_notify - notify color change abort
 * @dev: the device on which the color is switched
 * @link_id: valid link_id in case of MLO or 0 for non-MLO.
 *
 * Inform the userspace about the color change that has aborted. This
 * calls cfg80211_bss_color_notify() with %NL80211_CMD_COLOR_CHANGE_ABORTED
 * and both the count and the color bitmap set to 0.
 *
 * Return: 0 on success. Non-zero on error.
 */
static inline int cfg80211_color_change_aborted_notify(struct net_device *dev,
                                                       u8 link_id)
{
        int ret;

        ret = cfg80211_bss_color_notify(dev, NL80211_CMD_COLOR_CHANGE_ABORTED,
                                        0, 0, link_id);

        return ret;
}

/**
 * cfg80211_color_change_notify - notify color change completion
 * @dev: the device on which the color was switched
 * @link_id: valid link_id in case of MLO or 0 for non-MLO.
 *
 * Inform the userspace about the color change that has completed. This
 * calls cfg80211_bss_color_notify() with
 * %NL80211_CMD_COLOR_CHANGE_COMPLETED and both the count and the color
 * bitmap set to 0.
 *
 * Return: 0 on success. Non-zero on error.
 */
static inline int cfg80211_color_change_notify(struct net_device *dev,
                                               u8 link_id)
{
        const enum nl80211_commands cmd = NL80211_CMD_COLOR_CHANGE_COMPLETED;

        return cfg80211_bss_color_notify(dev, cmd, 0, 0, link_id);
}

/**
 * cfg80211_links_removed - Notify about removed STA MLD setup links.
 * @dev: network device.
 * @link_mask: BIT mask of removed STA MLD setup link IDs.
 *
 * Inform cfg80211 and the userspace about removed STA MLD setup links due to
 * AP MLD removing the corresponding affiliated APs with Multi-Link
 * reconfiguration. Note that it's not valid to remove all links, in this
 * case disconnect instead.
 * Also note that the wdev mutex must be held.
 */
void cfg80211_links_removed(struct net_device *dev, u16 link_mask);

/**
 * cfg80211_schedule_channels_check - schedule regulatory check if needed
 * @wdev: the wireless device to check
 *
 * In case the device supports NO_IR or DFS relaxations, schedule regulatory
 * channels check, as previous concurrent operation conditions may not
 * hold anymore.
 */
void cfg80211_schedule_channels_check(struct wireless_dev *wdev);

#ifdef CONFIG_CFG80211_DEBUGFS
/**
 * wiphy_locked_debugfs_read - do a locked read in debugfs
 * @wiphy: the wiphy to use
 * @file: the file being read
 * @buf: the buffer to fill and then read from
 * @bufsize: size of the buffer
 * @userbuf: the user buffer to copy to
 * @count: read count
 * @ppos: read position
 * @handler: the read handler to call (under wiphy lock)
 * @data: additional data to pass to the read handler
 *
 * Return: the number of characters read, or a negative errno
 */
ssize_t wiphy_locked_debugfs_read(struct wiphy *wiphy, struct file *file,
                                  char *buf, size_t bufsize,
                                  char __user *userbuf, size_t count,
                                  loff_t *ppos,
                                  ssize_t (*handler)(struct wiphy *wiphy,
                                                     struct file *file,
                                                     char *buf,
                                                     size_t bufsize,
                                                     void *data),
                                  void *data);

/**
 * wiphy_locked_debugfs_write - do a locked write in debugfs
 * @wiphy: the wiphy to use
 * @file: the file being written to
 * @buf: the buffer to copy the user data to
 * @bufsize: size of the buffer
 * @userbuf: the user buffer to copy from
 * @count: write count
 * @handler: the write handler to call (under wiphy lock)
 * @data: additional data to pass to the write handler
 *
 * Return: the number of characters written, or a negative errno
 */
ssize_t wiphy_locked_debugfs_write(struct wiphy *wiphy, struct file *file,
                                   char *buf, size_t bufsize,
                                   const char __user *userbuf, size_t count,
                                   ssize_t (*handler)(struct wiphy *wiphy,
                                                      struct file *file,
                                                      char *buf,
                                                      size_t count,
                                                      void *data),
                                   void *data);
#endif

#endif /* __NET_CFG80211_H */







































    2 



    2 

    2 

    1 

    2 














































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
// SPDX-License-Identifier: GPL-2.0-only
/*
 * File: sysctl.c
 *
 * Phonet /proc/sys/net/phonet interface implementation
 *
 * Copyright (C) 2008 Nokia Corporation.
 *
 * Author: Rémi Denis-Courmont
 */

#include <linux/seqlock.h>
#include <linux/sysctl.h>
#include <linux/errno.h>
#include <linux/init.h>

#include <net/sock.h>
#include <linux/phonet.h>
#include <net/phonet/phonet.h>

/* Initial {min, max} values of local_port_range[] below. */
#define DYNAMIC_PORT_MIN        0x40
#define DYNAMIC_PORT_MAX        0x7f

/* Seqlock taken by set_local_port_range() / phonet_get_local_port_range(). */
static DEFINE_SEQLOCK(local_port_range_lock);
/* Per-element bounds passed to proc_dointvec_minmax() as extra1/extra2. */
static int local_port_range_min[2] = {0, 0};
static int local_port_range_max[2] = {1023, 1023};
/* Current port range: [0] is the minimum, [1] the maximum. */
static int local_port_range[2] = {DYNAMIC_PORT_MIN, DYNAMIC_PORT_MAX};
/* Header returned by register_net_sysctl(); needed to unregister the table. */
static struct ctl_table_header *phonet_table_hrd;

/*
 * Store a new port range. Both elements are written while holding the
 * write side of local_port_range_lock.
 */
static void set_local_port_range(int range[2])
{
        const int lo = range[0];
        const int hi = range[1];

        write_seqlock(&local_port_range_lock);
        local_port_range[0] = lo;
        local_port_range[1] = hi;
        write_sequnlock(&local_port_range_lock);
}

/*
 * Read the current port range under the seqlock read side.
 * @min: where to store the lower bound, or NULL if not wanted
 * @max: where to store the upper bound, or NULL if not wanted
 *
 * The read is retried until read_seqretry() reports no retry is needed.
 */
void phonet_get_local_port_range(int *min, int *max)
{
        unsigned int seq;
        int lo, hi;

        do {
                seq = read_seqbegin(&local_port_range_lock);
                lo = local_port_range[0];
                hi = local_port_range[1];
        } while (read_seqretry(&local_port_range_lock, seq));

        if (min)
                *min = lo;
        if (max)
                *max = hi;
}

/*
 * sysctl handler for "local_port_range".
 *
 * Operates on a private copy of the range so that proc_dointvec_minmax()
 * never touches the live values directly. On a successful write the new
 * pair is rejected with -EINVAL if max < min, otherwise it is published
 * through set_local_port_range().
 *
 * Returns 0 on success or a negative errno.
 */
static int proc_local_port_range(struct ctl_table *table, int write,
                                 void *buffer, size_t *lenp, loff_t *ppos)
{
        int ret;
        int range[2];
        struct ctl_table tmp = {
                .data = &range,
                .maxlen = sizeof(range),
                .mode = table->mode,
                .extra1 = &local_port_range_min,
                .extra2 = &local_port_range_max,
        };

        /*
         * Take the snapshot under the seqlock so that a concurrent
         * set_local_port_range() cannot hand us a torn {min, max} pair.
         */
        phonet_get_local_port_range(&range[0], &range[1]);

        ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);

        if (write && ret == 0) {
                if (range[1] < range[0])
                        ret = -EINVAL;
                else
                        set_local_port_range(range);
        }

        return ret;
}

/*
 * sysctl table registered under "net/phonet" by phonet_sysctl_init().
 * The single entry is served by proc_local_port_range().
 */
static struct ctl_table phonet_table[] = {
        {
                .procname        = "local_port_range",
                .data                = &local_port_range,
                .maxlen                = sizeof(local_port_range),
                .mode                = 0644,
                .proc_handler        = proc_local_port_range,
        },
};

/*
 * Register phonet_table under "net/phonet" for init_net.
 * Returns 0 on success, -ENOMEM if registration yields no header.
 */
int __init phonet_sysctl_init(void)
{
        phonet_table_hrd = register_net_sysctl(&init_net, "net/phonet", phonet_table);
        if (!phonet_table_hrd)
                return -ENOMEM;

        return 0;
}

/* Unregister the sysctl table whose header phonet_sysctl_init() stored. */
void phonet_sysctl_exit(void)
{
        unregister_net_sysctl_table(phonet_table_hrd);
}













































































































    2 

    2 























































































    1 
    1 


























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/ipc/namespace.c
 * Copyright (C) 2006 Pavel Emelyanov <xemul@openvz.org> OpenVZ, SWsoft Inc.
 */

#include <linux/ipc.h>
#include <linux/msg.h>
#include <linux/ipc_namespace.h>
#include <linux/rcupdate.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
#include <linux/sched/task.h>

#include "util.h"

/*
 * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount.
 * free_ipc() is defined further down in this file; free_ipc_work is scheduled
 * from put_ipc_ns() and flushed from create_ipc_ns().
 */
static void free_ipc(struct work_struct *unused);
static DECLARE_WORK(free_ipc_work, free_ipc);

/*
 * Charge one IPC namespace to the current effective uid in @ns.
 * Returns whatever inc_ucount() returns; create_ipc_ns() treats NULL as
 * failure.
 */
static struct ucounts *inc_ipc_namespaces(struct user_namespace *ns)
{
        struct ucounts *charged;

        charged = inc_ucount(ns, current_euid(), UCOUNT_IPC_NAMESPACES);
        return charged;
}

/* Undo the UCOUNT_IPC_NAMESPACES charge taken by inc_ipc_namespaces(). */
static void dec_ipc_namespaces(struct ucounts *ucounts)
{
        dec_ucount(ucounts, UCOUNT_IPC_NAMESPACES);
}

/*
 * create_ipc_ns - allocate and initialise a new IPC namespace
 * @user_ns: user namespace the new namespace is charged to and owned by
 * @old_ns:  not used by this function
 *
 * Returns the new namespace with its reference count set to 1, or an
 * ERR_PTR(): -ENOSPC if the ucount charge fails and no deferred frees are
 * pending, -ENOMEM on allocation or sysctl setup failure, or the error
 * reported by ns_alloc_inum(), mq_init_ns() or msg_init_ns().
 */
static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
                                           struct ipc_namespace *old_ns)
{
        struct ipc_namespace *ns;
        struct ucounts *ucounts;
        int err;

        err = -ENOSPC;
 again:
        ucounts = inc_ipc_namespaces(user_ns);
        if (!ucounts) {
                /*
                 * IPC namespaces are freed asynchronously, by free_ipc_work.
                 * If frees were pending, flush_work will wait, and
                 * return true. Fail the allocation if no frees are pending.
                 */
                if (flush_work(&free_ipc_work))
                        goto again;
                goto fail;
        }

        err = -ENOMEM;
        ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT);
        if (ns == NULL)
                goto fail_dec;

        err = ns_alloc_inum(&ns->ns);
        if (err)
                goto fail_free;
        ns->ns.ops = &ipcns_operations;

        refcount_set(&ns->ns.count, 1);
        ns->user_ns = get_user_ns(user_ns);
        ns->ucounts = ucounts;

        err = mq_init_ns(ns);
        if (err)
                goto fail_put;

        err = -ENOMEM;
        if (!setup_mq_sysctls(ns))
                goto fail_put;

        if (!setup_ipc_sysctls(ns))
                goto fail_mq;

        /*
         * Both sysctl tables are registered at this point, so a failure
         * here must retire them again before the namespace is freed.
         */
        err = msg_init_ns(ns);
        if (err)
                goto fail_ipc;

        sem_init_ns(ns);
        shm_init_ns(ns);

        return ns;

fail_ipc:
        retire_ipc_sysctls(ns);
fail_mq:
        retire_mq_sysctls(ns);

fail_put:
        put_user_ns(ns->user_ns);
        ns_free_inum(&ns->ns);
fail_free:
        kfree(ns);
fail_dec:
        dec_ipc_namespaces(ucounts);
fail:
        return ERR_PTR(err);
}

/*
 * Return the IPC namespace to use for a new task: a freshly created one
 * when CLONE_NEWIPC is set in @flags, otherwise @ns with an extra
 * reference taken through get_ipc_ns().
 */
struct ipc_namespace *copy_ipcs(unsigned long flags,
        struct user_namespace *user_ns, struct ipc_namespace *ns)
{
        if (flags & CLONE_NEWIPC)
                return create_ipc_ns(user_ns, ns);

        return get_ipc_ns(ns);
}

/*
 * free_ipcs - free all ipcs of one type
 * @ns:   the namespace to remove the ipcs from
 * @ids:  the table of ipcs to free
 * @free: the function called to free each individual ipc
 *
 * Called for each kind of ipc when an ipc_namespace exits.
 *
 * Holds @ids->rwsem for writing and walks the idr by increasing id until
 * as many objects as @ids->in_use reported on entry have been passed to
 * @free.
 */
void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
               void (*free)(struct ipc_namespace *, struct kern_ipc_perm *))
{
        struct kern_ipc_perm *perm;
        int next_id;
        int total, in_use;

        down_write(&ids->rwsem);

        /* Snapshot the count; the loop ends after this many objects. */
        in_use = ids->in_use;

        for (total = 0, next_id = 0; total < in_use; next_id++) {
                perm = idr_find(&ids->ipcs_idr, next_id);
                if (perm == NULL)
                        continue;
                /*
                 * @free is entered with the RCU read lock and the object
                 * lock held. Neither is released in this function, so the
                 * callback presumably drops both - confirm against the
                 * callbacks passed in.
                 */
                rcu_read_lock();
                ipc_lock_object(perm);
                free(ns, perm);
                total++;
        }
        up_write(&ids->rwsem);
}

/*
 * Tear down one IPC namespace and free it. Called from free_ipc() after
 * synchronize_rcu().
 */
static void free_ipc_ns(struct ipc_namespace *ns)
{
        /*
         * Caller needs to wait for an RCU grace period to have passed
         * after making the mount point inaccessible to new accesses.
         */
        mntput(ns->mq_mnt);
        sem_exit_ns(ns);
        msg_exit_ns(ns);
        shm_exit_ns(ns);

        /* Retire the sysctl tables set up in create_ipc_ns(). */
        retire_mq_sysctls(ns);
        retire_ipc_sysctls(ns);

        /* Release the ucount charge, user namespace reference and inum. */
        dec_ipc_namespaces(ns->ucounts);
        put_user_ns(ns->user_ns);
        ns_free_inum(&ns->ns);
        kfree(ns);
}

/* Namespaces queued by put_ipc_ns() and waiting to be freed by free_ipc(). */
static LLIST_HEAD(free_ipc_list);
/*
 * Work function behind free_ipc_work: frees every namespace queued on
 * free_ipc_list at the time it runs. @unused is not referenced.
 */
static void free_ipc(struct work_struct *unused)
{
        /* Take the whole pending list in one go. */
        struct llist_node *node = llist_del_all(&free_ipc_list);
        struct ipc_namespace *n, *t;

        /* First pass: mnt_make_shortterm() on each namespace's mq mount. */
        llist_for_each_entry_safe(n, t, node, mnt_llist)
                mnt_make_shortterm(n->mq_mnt);

        /* Wait for any last users to have gone away. */
        synchronize_rcu();

        /* Second pass: one grace period has covered the whole batch. */
        llist_for_each_entry_safe(n, t, node, mnt_llist)
                free_ipc_ns(n);
}

/*
 * put_ipc_ns - drop a reference to an ipc namespace.
 * @ns: the namespace to put
 *
 * If this is the last task in the namespace exiting, and
 * it is dropping the refcount to 0, then it can race with
 * a task in another ipc namespace but in a mounts namespace
 * which has this ipcns's mqueuefs mounted, doing some action
 * with one of the mqueuefs files.  That can raise the refcount.
 * So dropping the refcount, and raising the refcount when
 * accessing it through the VFS, are protected with mq_lock.
 *
 * (Clearly, a task raising the refcount on its own ipc_ns
 * needn't take mq_lock since it can't race with the last task
 * in the ipcns exiting).
 *
 * When the count does reach zero the namespace is queued on
 * free_ipc_list and freed later by free_ipc_work.
 */
void put_ipc_ns(struct ipc_namespace *ns)
{
        /* Not the last reference: nothing else to do. */
        if (!refcount_dec_and_lock(&ns->ns.count, &mq_lock))
                return;

        mq_clear_sbinfo(ns);
        spin_unlock(&mq_lock);

        if (llist_add(&ns->mnt_llist, &free_ipc_list))
                schedule_work(&free_ipc_work);
}

/* Map an embedded ns_common back to its containing struct ipc_namespace. */
static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns)
{
        return container_of(ns, struct ipc_namespace, ns);
}

/*
 * proc_ns_operations ->get hook: take a reference on @task's IPC
 * namespace. Returns its ns_common, or NULL if the task has no nsproxy.
 */
static struct ns_common *ipcns_get(struct task_struct *task)
{
        struct ipc_namespace *ipc = NULL;
        struct nsproxy *proxy;

        /* task->nsproxy is sampled and used under task_lock(). */
        task_lock(task);
        proxy = task->nsproxy;
        if (proxy)
                ipc = get_ipc_ns(proxy->ipc_ns);
        task_unlock(task);

        if (!ipc)
                return NULL;

        return &ipc->ns;
}

/* proc_ns_operations ->put hook: drop the reference held through @ns. */
static void ipcns_put(struct ns_common *ns)
{
        put_ipc_ns(to_ipc_ns(ns));
}

/*
 * proc_ns_operations ->install hook: switch @nsset's nsproxy over to the
 * IPC namespace behind @new.
 *
 * Requires CAP_SYS_ADMIN both in the user namespace owning the target IPC
 * namespace and in the user namespace of @nsset's credentials; returns
 * -EPERM otherwise, 0 on success.
 */
static int ipcns_install(struct nsset *nsset, struct ns_common *new)
{
        struct ipc_namespace *target = to_ipc_ns(new);
        struct nsproxy *proxy = nsset->nsproxy;

        if (!ns_capable(target->user_ns, CAP_SYS_ADMIN))
                return -EPERM;
        if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        /* Drop the old namespace, then install a new reference. */
        put_ipc_ns(proxy->ipc_ns);
        proxy->ipc_ns = get_ipc_ns(target);
        return 0;
}

/*
 * proc_ns_operations ->owner hook: return the user namespace recorded in
 * the IPC namespace. No reference is taken here.
 */
static struct user_namespace *ipcns_owner(struct ns_common *ns)
{
        return to_ipc_ns(ns)->user_ns;
}

/*
 * Namespace operations for IPC namespaces; create_ipc_ns() installs this
 * table as ns->ns.ops.
 */
const struct proc_ns_operations ipcns_operations = {
        .name                = "ipc",
        .type                = CLONE_NEWIPC,
        .get                = ipcns_get,
        .put                = ipcns_put,
        .install        = ipcns_install,
        .owner                = ipcns_owner,
};
































































    2 


    2 
    3 




    2 










































































    8 


    8 






    8 


























    7 







    8 


















    9 









































































    9 





    6 
    2 
    9 
    1 

    8 
    1 




    7 
    1 
















    9 



















    9 








    9 


































    1 
    1 










    3 










    3 




    2 



    3 



    1 
    1 
    2 



    3 

    3 

    2 








































   22 
   21 































    2 

    3 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/file_table.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
 */

#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/eventpoll.h>
#include <linux/rcupdate.h>
#include <linux/mount.h>
#include <linux/capability.h>
#include <linux/cdev.h>
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/task_work.h>
#include <linux/swap.h>
#include <linux/kmemleak.h>

#include <linux/atomic.h>

#include "internal.h"

/* sysctl tunables... */
static struct files_stat_struct files_stat = {
        .max_files = NR_FILE
};

/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __ro_after_init;

/*
 * Count of accounted files: incremented in alloc_empty_file() and
 * decremented in file_free() for files without FMODE_NOACCOUNT.
 */
static struct percpu_counter nr_files __cacheline_aligned_in_smp;

/*
 * Container for backing file with optional user path.
 * Allocated by alloc_empty_backing_file(); file_free() recognises it by
 * FMODE_BACKING in file.f_mode.
 */
struct backing_file {
        struct file file;
        struct path user_path;
};

/*
 * Map a struct file embedded in a struct backing_file back to its
 * container. Nothing is checked here; the callers in this file only use
 * the result for files with FMODE_BACKING set.
 */
static inline struct backing_file *backing_file(struct file *f)
{
        return container_of(f, struct backing_file, file);
}

struct path *backing_file_user_path(struct file *f)
{
        return &backing_file(f)->user_path;
}
EXPORT_SYMBOL_GPL(backing_file_user_path);

/*
 * Release a struct file: security blob, nr_files accounting (unless
 * FMODE_NOACCOUNT), credentials, and finally the memory itself. Backing
 * files also drop their user path and are kfree()d as a whole container;
 * all other files go back to filp_cachep.
 */
static inline void file_free(struct file *f)
{
        security_file_free(f);
        if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
                percpu_counter_dec(&nr_files);
        put_cred(f->f_cred);

        if (likely(!(f->f_mode & FMODE_BACKING))) {
                kmem_cache_free(filp_cachep, f);
                return;
        }

        path_put(backing_file_user_path(f));
        kfree(backing_file(f));
}

/*
 * Return the total number of open files in the system
 *
 * This is the cheap percpu_counter read; alloc_empty_file() falls back to
 * percpu_counter_sum_positive() when it needs a more accurate value.
 */
static long get_nr_files(void)
{
        return percpu_counter_read_positive(&nr_files);
}

/*
 * Return the maximum number of open files in the system
 *
 * Reads files_stat.max_files, the value behind the "file-max" sysctl.
 */
unsigned long get_max_files(void)
{
        return files_stat.max_files;
}
EXPORT_SYMBOL_GPL(get_max_files);

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)

/*
 * Handle nr_files sysctl
 *
 * Refreshes files_stat.nr_files from the per-CPU counter, then lets
 * proc_doulongvec_minmax() handle the access to @table.
 */
static int proc_nr_files(struct ctl_table *table, int write, void *buffer,
                         size_t *lenp, loff_t *ppos)
{
        files_stat.nr_files = get_nr_files();
        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

/* sysctl entries registered under "fs" by init_fs_stat_sysctls(). */
static struct ctl_table fs_stat_sysctls[] = {
        {
                /* Read-only view of the whole files_stat structure. */
                .procname        = "file-nr",
                .data                = &files_stat,
                .maxlen                = sizeof(files_stat),
                .mode                = 0444,
                .proc_handler        = proc_nr_files,
        },
        {
                /* Writable limit consulted by alloc_empty_file(). */
                .procname        = "file-max",
                .data                = &files_stat.max_files,
                .maxlen                = sizeof(files_stat.max_files),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
                .extra1                = SYSCTL_LONG_ZERO,
                .extra2                = SYSCTL_LONG_MAX,
        },
        {
                /* Bounded by sysctl_nr_open_min and sysctl_nr_open_max. */
                .procname        = "nr_open",
                .data                = &sysctl_nr_open,
                .maxlen                = sizeof(unsigned int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = &sysctl_nr_open_min,
                .extra2                = &sysctl_nr_open_max,
        },
};

/*
 * Register fs_stat_sysctls under "fs" and, when CONFIG_BINFMT_MISC is
 * enabled, the "fs/binfmt_misc" sysctl mount point. The mount point
 * header is never unregistered, so it is marked for kmemleak.
 * Always returns 0.
 */
static int __init init_fs_stat_sysctls(void)
{
        struct ctl_table_header *hdr;

        register_sysctl_init("fs", fs_stat_sysctls);
        if (!IS_ENABLED(CONFIG_BINFMT_MISC))
                return 0;

        hdr = register_sysctl_mount_point("fs/binfmt_misc");
        kmemleak_not_leak(hdr);
        return 0;
}
fs_initcall(init_fs_stat_sysctls);
#endif

/*
 * Initialise a freshly zero-allocated struct file.
 * @f:     the file to initialise
 * @flags: open flags; stored in f_flags and used to derive f_mode
 * @cred:  credentials to attach (a reference is taken)
 *
 * Returns 0 on success. If security_file_alloc() fails, the credential
 * reference is dropped again and its error is returned; the caller still
 * owns and must free @f.
 */
static int init_file(struct file *f, int flags, const struct cred *cred)
{
        int error;

        f->f_cred = get_cred(cred);
        error = security_file_alloc(f);
        if (unlikely(error)) {
                put_cred(f->f_cred);
                return error;
        }

        rwlock_init(&f->f_owner.lock);
        spin_lock_init(&f->f_lock);
        mutex_init(&f->f_pos_lock);
        f->f_flags = flags;
        f->f_mode = OPEN_FMODE(flags);
        /* f->f_version: 0 */

        /*
         * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
         * fget-rcu pattern users need to be able to handle spurious
         * refcount bumps we should reinitialize the reused file first.
         */
        atomic_long_set(&f->f_count, 1);
        return 0;
}

/* Find an unused file structure and return a pointer to it.
 * Returns an error pointer if some error happened, e.g. we are over the
 * file structures limit, ran out of memory or the operation is not permitted.
 *
 * Be very careful using this.  You are responsible for
 * getting write access to any mount that you might assign
 * to this filp, if it is opened for write.  If this is not
 * done, you will imbalance the mount's writer count
 * and get a warning at __fput() time.
 */
struct file *alloc_empty_file(int flags, const struct cred *cred)
{
        /* Highest file count already reported; keeps the log message rare. */
        static long old_max;
        struct file *f;
        int error;

        /*
         * Privileged users can go above max_files. For everyone else,
         * percpu_counters are inaccurate, so the cheap read is confirmed
         * with an expensive exact sum before the allocation is refused.
         */
        if (get_nr_files() >= files_stat.max_files &&
            !capable(CAP_SYS_ADMIN) &&
            percpu_counter_sum_positive(&nr_files) >= files_stat.max_files) {
                /* Ran out of filps - report that */
                if (get_nr_files() > old_max) {
                        pr_info("VFS: file-max limit %lu reached\n", get_max_files());
                        old_max = get_nr_files();
                }
                return ERR_PTR(-ENFILE);
        }

        f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
        if (unlikely(!f))
                return ERR_PTR(-ENOMEM);

        error = init_file(f, flags, cred);
        if (unlikely(error)) {
                kmem_cache_free(filp_cachep, f);
                return ERR_PTR(error);
        }

        /* Only successfully initialised files are accounted. */
        percpu_counter_inc(&nr_files);
        return f;
}

/*
 * Variant of alloc_empty_file() that doesn't check and modify nr_files.
 *
 * This is only for kernel internal use, and the allocate file must not be
 * installed into file tables or such.
 */
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
{
        struct file *f;
        int error;

        f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
        if (unlikely(!f))
                return ERR_PTR(-ENOMEM);

        error = init_file(f, flags, cred);
        if (unlikely(error)) {
                kmem_cache_free(filp_cachep, f);
                return ERR_PTR(error);
        }

        f->f_mode |= FMODE_NOACCOUNT;

        return f;
}

/*
 * Variant of alloc_empty_file() that allocates a backing_file container
 * and doesn't check and modify nr_files.
 *
 * This is only for kernel internal use, and the allocate file must not be
 * installed into file tables or such.
 */
struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
{
        struct backing_file *ff;
        int error;

        ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL);
        if (unlikely(!ff))
                return ERR_PTR(-ENOMEM);

        error = init_file(&ff->file, flags, cred);
        if (unlikely(error)) {
                kfree(ff);
                return ERR_PTR(error);
        }

        ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT;
        return &ff->file;
}

/**
 * file_init_path - initialize a 'struct file' based on path
 *
 * @file: the file to set up
 * @path: the (dentry, vfsmount) pair for the new file
 * @fop: the 'struct file_operations' for the new file
 *
 * Copies @path into @file without taking a reference of its own, derives
 * the inode and mapping from the dentry, and sets the FMODE_* capability
 * bits that @fop supports. The file is marked FMODE_OPENED.
 */
static void file_init_path(struct file *file, const struct path *path,
                           const struct file_operations *fop)
{
        file->f_path = *path;
        file->f_inode = path->dentry->d_inode;
        file->f_mapping = path->dentry->d_inode->i_mapping;
        file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
        file->f_sb_err = file_sample_sb_err(file);
        if (fop->llseek)
                file->f_mode |= FMODE_LSEEK;
        /* CAN_READ/CAN_WRITE need both the open mode and a matching method. */
        if ((file->f_mode & FMODE_READ) &&
             likely(fop->read || fop->read_iter))
                file->f_mode |= FMODE_CAN_READ;
        if ((file->f_mode & FMODE_WRITE) &&
             likely(fop->write || fop->write_iter))
                file->f_mode |= FMODE_CAN_WRITE;
        file->f_iocb_flags = iocb_flags(file);
        file->f_mode |= FMODE_OPENED;
        file->f_op = fop;
        /* Files opened for reading only bump the inode's read count. */
        if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
                i_readcount_inc(path->dentry->d_inode);
}

/**
 * alloc_file - allocate and initialize a 'struct file'
 *
 * @path: the (dentry, vfsmount) pair for the new file
 * @flags: O_... flags with which the new file will be opened
 * @fop: the 'struct file_operations' for the new file
 *
 * Allocates an accounted file with the current credentials and, on
 * success, binds it to @path and @fop. Returns the file or the ERR_PTR()
 * from alloc_empty_file().
 */
static struct file *alloc_file(const struct path *path, int flags,
                const struct file_operations *fop)
{
        struct file *file = alloc_empty_file(flags, current_cred());

        if (IS_ERR(file))
                return file;

        file_init_path(file, path, fop);
        return file;
}

/*
 * Build a path for a pseudo file: allocate a dentry named @name on @mnt's
 * superblock, take a reference on @mnt and instantiate the dentry with
 * @inode. Returns 0 on success or -ENOMEM if no dentry could be allocated;
 * in that case only path->dentry has been written (as NULL).
 */
static inline int alloc_path_pseudo(const char *name, struct inode *inode,
                                    struct vfsmount *mnt, struct path *path)
{
        struct qstr this = QSTR_INIT(name, strlen(name));
        struct dentry *dentry = d_alloc_pseudo(mnt->mnt_sb, &this);

        path->dentry = dentry;
        if (!dentry)
                return -ENOMEM;

        path->mnt = mntget(mnt);
        d_instantiate(dentry, inode);
        return 0;
}

/*
 * Allocate an accounted file for @inode on the pseudo filesystem mount
 * @mnt, under a freshly allocated dentry called @name.
 *
 * Returns the new file, or an ERR_PTR() if the path or the file could not
 * be allocated.
 */
struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
                               const char *name, int flags,
                               const struct file_operations *fops)
{
        struct path path;
        struct file *file;
        int ret = alloc_path_pseudo(name, inode, mnt, &path);

        if (ret)
                return ERR_PTR(ret);

        file = alloc_file(&path, flags, fops);
        if (!IS_ERR(file))
                return file;

        /*
         * Take an extra inode reference before dropping the path -
         * presumably so the caller's reference survives path_put()
         * releasing the instantiated dentry; confirm against d_instantiate().
         */
        ihold(inode);
        path_put(&path);
        return file;
}
EXPORT_SYMBOL(alloc_file_pseudo);

/*
 * Same as alloc_file_pseudo(), but the file comes from
 * alloc_empty_file_noaccount() and so is not counted in nr_files.
 *
 * Returns the new file, or an ERR_PTR() if the path or the file could not
 * be allocated.
 */
struct file *alloc_file_pseudo_noaccount(struct inode *inode,
                                         struct vfsmount *mnt, const char *name,
                                         int flags,
                                         const struct file_operations *fops)
{
        struct path path;
        struct file *file;
        int ret = alloc_path_pseudo(name, inode, mnt, &path);

        if (ret)
                return ERR_PTR(ret);

        file = alloc_empty_file_noaccount(flags, current_cred());
        if (!IS_ERR(file)) {
                file_init_path(file, &path, fops);
                return file;
        }

        /* Same error unwinding as in alloc_file_pseudo(). */
        ihold(inode);
        path_put(&path);
        return file;
}
EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount);

/*
 * Allocate a new file on the same path as @base, opened with @flags and
 * using @fops. On success the new file takes its own path reference and
 * shares @base's f_mapping. Returns the file or an ERR_PTR().
 */
struct file *alloc_file_clone(struct file *base, int flags,
                                const struct file_operations *fops)
{
        struct file *f = alloc_file(&base->f_path, flags, fops);

        if (IS_ERR(f))
                return f;

        path_get(&f->f_path);
        f->f_mapping = base->f_mapping;
        return f;
}

/* the real guts of fput() - releasing the last reference to file
 *
 * Files that never reached FMODE_OPENED skip the whole teardown and are
 * only passed to file_free().
 */
static void __fput(struct file *file)
{
        /* Cache these up front; they are used after the release calls. */
        struct dentry *dentry = file->f_path.dentry;
        struct vfsmount *mnt = file->f_path.mnt;
        struct inode *inode = file->f_inode;
        fmode_t mode = file->f_mode;

        if (unlikely(!(file->f_mode & FMODE_OPENED)))
                goto out;

        might_sleep();

        fsnotify_close(file);
        /*
         * The function eventpoll_release() should be the first called
         * in the file cleanup chain.
         */
        eventpoll_release(file);
        locks_remove_file(file);

        security_file_release(file);
        /* Call ->fasync with fd -1 and on == 0 if FASYNC is still set. */
        if (unlikely(file->f_flags & FASYNC)) {
                if (file->f_op->fasync)
                        file->f_op->fasync(-1, file, 0);
        }
        if (file->f_op->release)
                file->f_op->release(inode, file);
        if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
                     !(mode & FMODE_PATH))) {
                cdev_put(inode->i_cdev);
        }
        fops_put(file->f_op);
        put_pid(file->f_owner.pid);
        put_file_access(file);
        dput(dentry);
        if (unlikely(mode & FMODE_NEED_UNMOUNT))
                dissolve_on_fput(mnt);
        mntput(mnt);
out:
        file_free(file);
}

/* Files queued by fput() for a final __fput() from delayed_fput(). */
static LLIST_HEAD(delayed_fput_list);
/*
 * Run __fput() on every file queued on delayed_fput_list at the time of
 * the call. @unused is not referenced; flush_delayed_fput() passes NULL.
 */
static void delayed_fput(struct work_struct *unused)
{
        struct llist_node *node = llist_del_all(&delayed_fput_list);
        struct file *f, *t;

        /* The _safe iterator is needed because __fput() frees the entry. */
        llist_for_each_entry_safe(f, t, node, f_llist)
                __fput(f);
}

/* task_work callback queued by fput(): run __fput() on the owning file. */
static void ____fput(struct callback_head *work)
{
        struct file *file = container_of(work, struct file, f_task_work);

        __fput(file);
}

/*
 * If kernel thread really needs to have the final fput() it has done
 * to complete, call this.  The only user right now is the boot - we
 * *do* need to make sure our writes to binaries on initramfs has
 * not left us with opened struct file waiting for __fput() - execve()
 * won't work without that.  Please, don't add more callers without
 * very good reasons; in particular, never call that with locks
 * held and never call that from a thread that might need to do
 * some work on any kind of umount.
 *
 * Runs delayed_fput() directly in the caller's context, draining
 * delayed_fput_list.
 */
void flush_delayed_fput(void)
{
        delayed_fput(NULL);
}
EXPORT_SYMBOL_GPL(flush_delayed_fput);

/* Delayed work that runs delayed_fput(); scheduled from fput(). */
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);

/*
 * Drop a reference to @file. When the count reaches zero the final
 * __fput() is not run here but deferred: to task work on the current task
 * where possible, otherwise to delayed_fput_work.
 */
void fput(struct file *file)
{
        if (atomic_long_dec_and_test(&file->f_count)) {
                struct task_struct *task = current;

                /*
                 * With neither FMODE_BACKING nor FMODE_OPENED set, the
                 * file is freed right away without deferring.
                 */
                if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
                        file_free(file);
                        return;
                }
                /* Task work is only used outside interrupts and kthreads. */
                if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
                        init_task_work(&file->f_task_work, ____fput);
                        if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
                                return;
                        /*
                         * After this task has run exit_task_work(),
                         * task_work_add() will fail.  Fall through to delayed
                         * fput to avoid leaking *file.
                         */
                }

                /*
                 * Queue the file; the work is scheduled only when
                 * llist_add() returns true (presumably: the list was
                 * empty before - confirm against llist_add()).
                 */
                if (llist_add(&file->f_llist, &delayed_fput_list))
                        schedule_delayed_work(&delayed_fput_work, 1);
        }
}

/*
 * synchronous analog of fput(); for kernel threads that might be needed
 * in some umount() (and thus can't use flush_delayed_fput() without
 * risking deadlocks), need to wait for completion of __fput() and know
 * for this specific struct file it won't involve anything that would
 * need them.  Use only if you really need it - at the very least,
 * don't blindly convert fput() by kernel thread to that.
 *
 * Drops one reference and, if it was the last, runs __fput() directly in
 * the caller's context.
 */
void __fput_sync(struct file *file)
{
        if (!atomic_long_dec_and_test(&file->f_count))
                return;

        __fput(file);
}

EXPORT_SYMBOL(fput);
EXPORT_SYMBOL(__fput_sync);

/*
 * Boot-time setup: create the "filp" slab cache for struct file and
 * initialise the nr_files counter to 0. SLAB_PANIC is passed, so no
 * failure check follows the cache creation.
 */
void __init files_init(void)
{
        filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
                                SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
                                SLAB_PANIC | SLAB_ACCOUNT, NULL);
        percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}

/*
 * One file with associated inode and dcache is very roughly 1K. Per default
 * do not use more than 10% of our memory for files.
 *
 * One and a half times the pages already in use are set aside (capped so
 * that at least one page remains), and files_stat.max_files is never set
 * below NR_FILE.
 */
void __init files_maxfiles_init(void)
{
        unsigned long nr_pages = totalram_pages();
        unsigned long used_pages = nr_pages - nr_free_pages();
        unsigned long memreserve = used_pages * 3/2;
        unsigned long n;

        memreserve = min(memreserve, nr_pages - 1);

        /* Remaining pages converted to KiB, of which one tenth is used. */
        n = (nr_pages - memreserve) * (PAGE_SIZE / 1024);
        n /= 10;

        files_stat.max_files = max_t(unsigned long, n, NR_FILE);
}

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 



    5 































    4 










    4 









































    1 


    9 











    7 





    6 
    4 










































































































































































































































































































































































    1 



    1 
















































































































   11 


























































































    1 










































































































































    1 
    4 
    1 























    1 










































































    1 


















































































































































































































































































































































































    7 
    2 


































































    1 










































    1 





    1 
    1 












    1 




















    1 










































    4 


































    3 













    1 





















































    4 









    5 










    6 




    6 









    6 

    5 













    6 




    3 



























    3 
























































































































































    6 






































    4 
    1 









































    1 









































    1 




    1 











































































































































    1 

    5 



    7 














    3 









    2 


















    1 





    1 



















































































































































































    1 


































































































































































    1 



















    1 





































































































    1 


    1 

    1 



















    2 





















































    1 
































































    1 
































































































































































    3 




    1 























































    4 

    1 



































































































































    1 










    3 


    1 

































































































































































    2 


















    3 
































































































































































































































































































   10 








    3 































    7 

























































    4 































































































    4 
















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Definitions for the 'struct sk_buff' memory handlers.
 *
 *        Authors:
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *                Florian La Roche, <rzsfl@rz.uni-sb.de>
 */

#ifndef _LINUX_SKBUFF_H
#define _LINUX_SKBUFF_H

#include <linux/kernel.h>
#include <linux/compiler.h>
#include <linux/time.h>
#include <linux/bug.h>
#include <linux/bvec.h>
#include <linux/cache.h>
#include <linux/rbtree.h>
#include <linux/socket.h>
#include <linux/refcount.h>

#include <linux/atomic.h>
#include <asm/types.h>
#include <linux/spinlock.h>
#include <net/checksum.h>
#include <linux/rcupdate.h>
#include <linux/dma-mapping.h>
#include <linux/netdev_features.h>
#include <net/flow_dissector.h>
#include <linux/in6.h>
#include <linux/if_packet.h>
#include <linux/llist.h>
#include <net/flow.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <linux/netfilter/nf_conntrack_common.h>
#endif
#include <net/net_debug.h>
#include <net/dropreason-core.h>
#include <net/netmem.h>

/**
 * DOC: skb checksums
 *
 * The interface for checksum offload between the stack and networking drivers
 * is as follows...
 *
 * IP checksum related features
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * Drivers advertise checksum offload capabilities in the features of a device.
 * From the stack's point of view these are capabilities offered by the driver.
 * A driver typically only advertises features that it is capable of offloading
 * to its device.
 *
 * .. flat-table:: Checksum related device features
 *   :widths: 1 10
 *
 *   * - %NETIF_F_HW_CSUM
 *     - The driver (or its device) is able to compute one
 *         IP (one's complement) checksum for any combination
 *         of protocols or protocol layering. The checksum is
 *         computed and set in a packet per the CHECKSUM_PARTIAL
 *         interface (see below).
 *
 *   * - %NETIF_F_IP_CSUM
 *     - Driver (device) is only able to checksum plain
 *         TCP or UDP packets over IPv4. These are specifically
 *         unencapsulated packets of the form IPv4|TCP or
 *         IPv4|UDP where the Protocol field in the IPv4 header
 *         is TCP or UDP. The IPv4 header may contain IP options.
 *         This feature cannot be set in features for a device
 *         with NETIF_F_HW_CSUM also set. This feature is being
 *         DEPRECATED (see below).
 *
 *   * - %NETIF_F_IPV6_CSUM
 *     - Driver (device) is only able to checksum plain
 *         TCP or UDP packets over IPv6. These are specifically
 *         unencapsulated packets of the form IPv6|TCP or
 *         IPv6|UDP where the Next Header field in the IPv6
 *         header is either TCP or UDP. IPv6 extension headers
 *         are not supported with this feature. This feature
 *         cannot be set in features for a device with
 *         NETIF_F_HW_CSUM also set. This feature is being
 *         DEPRECATED (see below).
 *
 *   * - %NETIF_F_RXCSUM
 *     - Driver (device) performs receive checksum offload.
 *         This flag is only used to disable the RX checksum
 *         feature for a device. The stack will accept receive
 *         checksum indication in packets received on a device
 *         regardless of whether NETIF_F_RXCSUM is set.
 *
 * Checksumming of received packets by device
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * Indication of checksum verification is set in &sk_buff.ip_summed.
 * Possible values are:
 *
 * - %CHECKSUM_NONE
 *
 *   Device did not checksum this packet e.g. due to lack of capabilities.
 *   The packet contains a full (though not verified) checksum, but that
 *   checksum is not in skb->csum. Thus, skb->csum is undefined in this case.
 *
 * - %CHECKSUM_UNNECESSARY
 *
 *   The hardware you're dealing with doesn't calculate the full checksum
 *   (as in %CHECKSUM_COMPLETE), but it does parse headers and verify checksums
 *   for specific protocols. For such packets it will set %CHECKSUM_UNNECESSARY
 *   if their checksums are okay. &sk_buff.csum is still undefined in this case
 *   though. A driver or device must never modify the checksum field in the
 *   packet even if checksum is verified.
 *
 *   %CHECKSUM_UNNECESSARY is applicable to following protocols:
 *
 *     - TCP: IPv6 and IPv4.
 *     - UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a
 *       zero UDP checksum for either IPv4 or IPv6, the networking stack
 *       may perform further validation in this case.
 *     - GRE: only if the checksum is present in the header.
 *     - SCTP: indicates the CRC in SCTP header has been validated.
 *     - FCOE: indicates the CRC in FC frame has been validated.
 *
 *   &sk_buff.csum_level indicates the number of consecutive checksums found in
 *   the packet minus one that have been verified as %CHECKSUM_UNNECESSARY.
 *   For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet
 *   and a device is able to verify the checksums for UDP (possibly zero),
 *   GRE (checksum flag is set) and TCP, &sk_buff.csum_level would be set to
 *   two. If the device were only able to verify the UDP checksum and not
 *   GRE, either because it doesn't support GRE checksum or because GRE
 *   checksum is bad, skb->csum_level would be set to zero (TCP checksum is
 *   not considered in this case).
 *
 * - %CHECKSUM_COMPLETE
 *
 *   This is the most generic way. The device supplied checksum of the _whole_
 *   packet as seen by netif_rx() and fills in &sk_buff.csum. This means the
 *   hardware doesn't need to parse L3/L4 headers to implement this.
 *
 *   Notes:
 *
 *   - Even if device supports only some protocols, but is able to produce
 *     skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY.
 *   - CHECKSUM_COMPLETE is not applicable to SCTP and FCoE protocols.
 *
 * - %CHECKSUM_PARTIAL
 *
 *   A checksum is set up to be offloaded to a device as described in the
 *   output description for CHECKSUM_PARTIAL. This may occur on a packet
 *   received directly from another Linux OS, e.g., a virtualized Linux kernel
 *   on the same host, or it may be set in the input path in GRO or remote
 *   checksum offload. For the purposes of checksum verification, the checksum
 *   referred to by skb->csum_start + skb->csum_offset and any preceding
 *   checksums in the packet are considered verified. Any checksums in the
 *   packet that are after the checksum being offloaded are not considered to
 *   be verified.
 *
 * Checksumming on transmit for non-GSO
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * The stack requests checksum offload in the &sk_buff.ip_summed for a packet.
 * Values are:
 *
 * - %CHECKSUM_PARTIAL
 *
 *   The driver is required to checksum the packet as seen by hard_start_xmit()
 *   from &sk_buff.csum_start up to the end, and to record/write the checksum at
 *   offset &sk_buff.csum_start + &sk_buff.csum_offset.
 *   A driver may verify that the
 *   csum_start and csum_offset values are valid values given the length and
 *   offset of the packet, but it should not attempt to validate that the
 *   checksum refers to a legitimate transport layer checksum -- it is the
 *   purview of the stack to validate that csum_start and csum_offset are set
 *   correctly.
 *
 *   When the stack requests checksum offload for a packet, the driver MUST
 *   ensure that the checksum is set correctly. A driver can either offload the
 *   checksum calculation to the device, or call skb_checksum_help (in the case
 *   that the device does not support offload for a particular checksum).
 *
 *   %NETIF_F_IP_CSUM and %NETIF_F_IPV6_CSUM are being deprecated in favor of
 *   %NETIF_F_HW_CSUM. New devices should use %NETIF_F_HW_CSUM to indicate
 *   checksum offload capability.
 *   skb_csum_hwoffload_help() can be called to resolve %CHECKSUM_PARTIAL based
 *   on network device checksumming capabilities: if a packet does not match
 *   them, skb_checksum_help() or skb_crc32c_csum_help() (depending on the
 *   value of &sk_buff.csum_not_inet, see :ref:`crc`) is called to resolve the
 *   checksum.
 *
 * - %CHECKSUM_NONE
 *
 *   The skb was already checksummed by the protocol, or a checksum is not
 *   required.
 *
 * - %CHECKSUM_UNNECESSARY
 *
 *   This has the same meaning as CHECKSUM_NONE for checksum offload on
 *   output.
 *
 * - %CHECKSUM_COMPLETE
 *
 *   Not used in checksum output. If a driver observes a packet with this value
 *   set in skbuff, it should treat the packet as if %CHECKSUM_NONE were set.
 *
 * .. _crc:
 *
 * Non-IP checksum (CRC) offloads
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * .. flat-table::
 *   :widths: 1 10
 *
 *   * - %NETIF_F_SCTP_CRC
 *     - This feature indicates that a device is capable of
 *         offloading the SCTP CRC in a packet. To perform this offload the stack
 *         will set csum_start and csum_offset accordingly, set ip_summed to
 *         %CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication
 *         in the skbuff that the %CHECKSUM_PARTIAL refers to CRC32c.
 *         A driver that supports both IP checksum offload and SCTP CRC32c offload
 *         must verify which offload is configured for a packet by testing the
 *         value of &sk_buff.csum_not_inet; skb_crc32c_csum_help() is provided to
 *         resolve %CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1.
 *
 *   * - %NETIF_F_FCOE_CRC
 *     - This feature indicates that a device is capable of offloading the FCOE
 *         CRC in a packet. To perform this offload the stack will set ip_summed
 *         to %CHECKSUM_PARTIAL and set csum_start and csum_offset
 *         accordingly. Note that there is no indication in the skbuff that the
 *         %CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports
 *         both IP checksum offload and FCOE CRC offload must verify which offload
 *         is configured for a packet, presumably by inspecting packet headers.
 *
 * Checksumming on output with GSO
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * In the case of a GSO packet (skb_is_gso() is true), checksum offload
 * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the
 * gso_type is %SKB_GSO_TCPV4 or %SKB_GSO_TCPV6, TCP checksum offload as
 * part of the GSO operation is implied. If a checksum is being offloaded
 * with GSO then ip_summed is %CHECKSUM_PARTIAL, and both csum_start and
 * csum_offset are set to refer to the outermost checksum being offloaded
 * (two offloaded checksums are possible with UDP encapsulation).
 */

/* Values of &sk_buff.ip_summed. Their meaning on receive and on transmit is
 * described in the "DOC: skb checksums" comment above.
 *
 * Don't change this without changing skb_csum_unnecessary!
 */
#define CHECKSUM_NONE                0
#define CHECKSUM_UNNECESSARY        1
#define CHECKSUM_COMPLETE        2
#define CHECKSUM_PARTIAL        3

/* Maximum value in skb->csum_level. csum_level holds the number of verified
 * consecutive checksums minus one, so this allows up to four of them.
 */
#define SKB_MAX_CSUM_LEVEL        3

/* Align X to SMP_CACHE_BYTES (via ALIGN()). */
#define SKB_DATA_ALIGN(X)        ALIGN(X, SMP_CACHE_BYTES)
/* X minus the aligned size of struct skb_shared_info. */
#define SKB_WITH_OVERHEAD(X)        \
        ((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

/* For X bytes available in skb->head, what is the minimal
 * allocation needed, knowing struct skb_shared_info needs
 * to be aligned.
 */
#define SKB_HEAD_ALIGN(X) (SKB_DATA_ALIGN(X) + \
        SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

/* What remains of a (PAGE_SIZE << ORDER) byte area after taking away X bytes
 * and the aligned struct skb_shared_info.
 */
#define SKB_MAX_ORDER(X, ORDER) \
        SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X))
/* SKB_MAX_ORDER() for a single page (order 0). */
#define SKB_MAX_HEAD(X)                (SKB_MAX_ORDER((X), 0))
/* SKB_MAX_ORDER() for an order-2 area with nothing else subtracted (X = 0). */
#define SKB_MAX_ALLOC                (SKB_MAX_ORDER(0, 2))

/* return minimum truesize of one skb containing X bytes of data:
 * X plus the aligned sizes of struct sk_buff and struct skb_shared_info.
 */
#define SKB_TRUESIZE(X) ((X) +                                                \
                         SKB_DATA_ALIGN(sizeof(struct sk_buff)) +        \
                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

struct ahash_request;
struct net_device;
struct scatterlist;
struct pipe_inode_info;
struct iov_iter;
struct napi_struct;
struct bpf_prog;
union bpf_attr;
struct skb_ext;
struct ts_config;

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/* Bridge netfilter bookkeeping. Only compiled when CONFIG_BRIDGE_NETFILTER
 * is enabled.
 */
struct nf_bridge_info {
        /* one of the BRNF_PROTO_* values, stored in an 8-bit bitfield */
        enum {
                BRNF_PROTO_UNCHANGED,
                BRNF_PROTO_8021Q,
                BRNF_PROTO_PPPOE
        } orig_proto:8;
        /* single-bit state flags */
        u8                        pkt_otherhost:1;
        u8                        in_prerouting:1;
        u8                        bridged_dnat:1;
        u8                        sabotage_in_done:1;
        __u16                        frag_max_size;
        int                        physinif;

        /* always valid & non-NULL from FORWARD on, for physdev match */
        struct net_device        *physoutdev;
        /* The members of this union share storage; the comments below say at
         * which stage each one is in use.
         */
        union {
                /* prerouting: detect dnat in orig/reply direction */
                __be32          ipv4_daddr;
                struct in6_addr ipv6_daddr;

                /* after prerouting + nat detected: store original source
                 * mac since neigh resolution overwrites it, only used while
                 * skb is out in neigh layer.
                 */
                char neigh_header[8];
        };
};
#endif

#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
/* Chain in tc_skb_ext will be used to share the tc chain with
 * ovs recirc_id. It will be set to the current chain by tc
 * and read by ovs to recirc_id.
 *
 * Only compiled when CONFIG_NET_TC_SKB_EXT is enabled.
 */
struct tc_skb_ext {
        /* act_miss_cookie and chain share storage; act_miss below tells
         * whether act_miss_cookie is the member in use.
         */
        union {
                u64 act_miss_cookie;
                __u32 chain;
        };
        __u16 mru;
        __u16 zone;
        u8 post_ct:1;
        u8 post_ct_snat:1;
        u8 post_ct_dnat:1;
        u8 act_miss:1; /* Set if act_miss_cookie is used */
        u8 l2_miss:1; /* Set by bridge upon FDB or MDB miss */
};
#endif

/* Head of a list of sk_buffs. The next/prev pair is declared through
 * struct_group_tagged(), with tag sk_buff_list and member name list.
 */
struct sk_buff_head {
        /* These two members must be first to match sk_buff. */
        struct_group_tagged(sk_buff_list, list,
                struct sk_buff        *next;
                struct sk_buff        *prev;
        );

        /* NOTE(review): presumably the number of skbs on the list - confirm */
        __u32                qlen;
        /* NOTE(review): presumably protects the list and qlen - confirm */
        spinlock_t        lock;
};

struct sk_buff;

/* Fall back to 17 when CONFIG_MAX_SKB_FRAGS is not defined. */
#ifndef CONFIG_MAX_SKB_FRAGS
# define CONFIG_MAX_SKB_FRAGS 17
#endif

/* Number of entries in the skb_shared_info.frags[] array. */
#define MAX_SKB_FRAGS CONFIG_MAX_SKB_FRAGS

/* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to
 * segment using its current segmentation instead.
 */
#define GSO_BY_FRAGS        0xFFFF

/* One fragment of an skb, as stored in skb_shared_info.frags[].
 *
 * @netmem: netmem_ref identifying the memory that backs the fragment
 * @len:    size of the fragment; accessed through skb_frag_size() and the
 *          skb_frag_size_*() helpers
 * @offset: NOTE(review): presumably the offset of the data within @netmem -
 *          confirm
 */
typedef struct skb_frag {
        netmem_ref netmem;
        unsigned int len;
        unsigned int offset;
} skb_frag_t;

/**
 * skb_frag_size() - Read the length of a skb fragment
 * @frag: fragment to query
 *
 * Return: the current value of @frag->len.
 */
static inline unsigned int skb_frag_size(const skb_frag_t *frag)
{
        const unsigned int size = frag->len;

        return size;
}

/**
 * skb_frag_size_set() - Overwrite the length of a skb fragment
 * @frag: fragment to update
 * @size: new length to store in @frag->len
 */
static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size)
{
        unsigned int *len = &frag->len;

        *len = size;
}

/**
 * skb_frag_size_add() - Grow the length of a skb fragment by @delta
 * @frag: fragment to update
 * @delta: amount added to @frag->len
 */
static inline void skb_frag_size_add(skb_frag_t *frag, int delta)
{
        frag->len = frag->len + delta;
}

/**
 * skb_frag_size_sub() - Shrink the length of a skb fragment by @delta
 * @frag: fragment to update
 * @delta: amount taken off @frag->len
 */
static inline void skb_frag_size_sub(skb_frag_t *frag, int delta)
{
        frag->len = frag->len - delta;
}

/**
 * skb_frag_must_loop - Decide whether a fragment has to be walked page by page
 * @p: fragment's page
 *
 * Without CONFIG_HIGHMEM the answer is always false. With CONFIG_HIGHMEM it
 * is true when CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP is enabled or when
 * PageHighMem() reports @p as a high memory page.
 *
 * Return: true if the page must be handled individually, false otherwise.
 */
static inline bool skb_frag_must_loop(struct page *p)
{
#if defined(CONFIG_HIGHMEM)
        return IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || PageHighMem(p);
#else
        return false;
#endif
}

/**
 *        skb_frag_foreach_page - loop over pages in a fragment
 *
 *        @f:                skb frag to operate on
 *        @f_off:                offset from start of f->netmem
 *        @f_len:                length from f_off to loop over
 *        @p:                (temp var) current page
 *        @p_off:                (temp var) offset from start of current page,
 *                                   non-zero only on first page.
 *        @p_len:                (temp var) length in current page,
 *                                   < PAGE_SIZE only on first and last page.
 *        @copied:        (temp var) length so far, excluding current p_len.
 *
 *        A fragment can hold a compound page, in which case per-page
 *        operations, notably kmap_atomic, must be called for each
 *        regular page.
 *
 *        When skb_frag_must_loop() is false for the first page, @p_len is
 *        set to the whole of @f_len and the body runs once.
 *
 *        @f_off and @f_len are expanded several times, so they must be free
 *        of side effects. Both are parenthesized in the expansion so that
 *        any expression may be passed. @p, @p_off, @p_len and @copied are
 *        assigned to and must be lvalues.
 */
#define skb_frag_foreach_page(f, f_off, f_len, p, p_off, p_len, copied)        \
        for (p = skb_frag_page(f) + ((f_off) >> PAGE_SHIFT),                \
             p_off = (f_off) & (PAGE_SIZE - 1),                                \
             p_len = skb_frag_must_loop(p) ?                                \
             min_t(u32, (f_len), PAGE_SIZE - p_off) : (f_len),                \
             copied = 0;                                                \
             copied < (f_len);                                                \
             copied += p_len, p++, p_off = 0,                                \
             p_len = min_t(u32, (f_len) - copied, PAGE_SIZE))

/**
 * struct skb_shared_hwtstamps - hardware time stamps
 * @hwtstamp:                hardware time stamp transformed into duration
 *                        since arbitrary point in time
 * @netdev_data:        address/cookie of network device driver used as
 *                        reference to actual hardware time stamp
 *
 * @hwtstamp and @netdev_data are members of one anonymous union and
 * therefore occupy the same storage.
 *
 * Software time stamps generated by ktime_get_real() are stored in
 * skb->tstamp.
 *
 * hwtstamps can only be compared against other hwtstamps from
 * the same device.
 *
 * This structure is attached to packets as part of the
 * &skb_shared_info. Use skb_hwtstamps() to get a pointer.
 */
struct skb_shared_hwtstamps {
        union {
                ktime_t        hwtstamp;
                void *netdev_data;
        };
};

/* Definitions for tx_flags in struct skb_shared_info.
 * Each value is a single bit; bits 0 to 6 are in use.
 */
enum {
        /* generate hardware time stamp */
        SKBTX_HW_TSTAMP = 1 << 0,

        /* generate software time stamp when queueing packet to NIC */
        SKBTX_SW_TSTAMP = 1 << 1,

        /* device driver is going to provide hardware time stamp */
        SKBTX_IN_PROGRESS = 1 << 2,

        /* generate hardware time stamp based on cycles if supported */
        SKBTX_HW_TSTAMP_USE_CYCLES = 1 << 3,

        /* generate wifi status information (where possible) */
        SKBTX_WIFI_STATUS = 1 << 4,

        /* determine hardware time stamp based on time or cycles */
        SKBTX_HW_TSTAMP_NETDEV = 1 << 5,

        /* generate software time stamp when entering packet scheduling */
        SKBTX_SCHED_TSTAMP = 1 << 6,
};

/* Mask of the two software time stamp requests. */
#define SKBTX_ANY_SW_TSTAMP        (SKBTX_SW_TSTAMP    | \
                                 SKBTX_SCHED_TSTAMP)
/* Mask of SKBTX_HW_TSTAMP, SKBTX_HW_TSTAMP_USE_CYCLES and the software
 * time stamp requests above.
 */
#define SKBTX_ANY_TSTAMP        (SKBTX_HW_TSTAMP | \
                                 SKBTX_HW_TSTAMP_USE_CYCLES | \
                                 SKBTX_ANY_SW_TSTAMP)

/* Definitions for flags in struct skb_shared_info.
 * Each value is a single bit built with BIT().
 */
enum {
        /* use zcopy routines */
        SKBFL_ZEROCOPY_ENABLE = BIT(0),

        /* This indicates at least one fragment might be overwritten
         * (as in vmsplice(), sendfile() ...)
         * If we need to compute a TX checksum, we'll need to copy
         * all frags to avoid possible bad checksum
         */
        SKBFL_SHARED_FRAG = BIT(1),

        /* segment contains only zerocopy data and should not be
         * charged to the kernel memory.
         */
        SKBFL_PURE_ZEROCOPY = BIT(2),

        /* NOTE(review): not documented here; presumably asks that the
         * frags are not orphaned - confirm against the users of this flag
         */
        SKBFL_DONT_ORPHAN = BIT(3),

        /* page references are managed by the ubuf_info, so it's safe to
         * use frags only up until ubuf_info is released
         */
        SKBFL_MANAGED_FRAG_REFS = BIT(4),
};

/* Mask of SKBFL_ZEROCOPY_ENABLE and SKBFL_SHARED_FRAG. */
#define SKBFL_ZEROCOPY_FRAG        (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
/* Mask of every flag defined in the enum above. */
#define SKBFL_ALL_ZEROCOPY        (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \
                                 SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS)

/* Operations attached to a struct ubuf_info through its ops pointer. */
struct ubuf_info_ops {
        /* Completion callback; the bool argument is zerocopy_success. */
        void (*complete)(struct sk_buff *, struct ubuf_info *,
                         bool zerocopy_success);
        /* has to be compatible with skb_zcopy_set() */
        int (*link_skb)(struct sk_buff *skb, struct ubuf_info *uarg);
};

/*
 * The callback (ops->complete) notifies userspace to release buffers when skb
 * DMA is done in lower device, the skb last reference should be 0 when
 * calling this.
 * The zerocopy_success argument is true if zero copy transmit occurred,
 * false on data copy or out of memory error caused by data copy attempt.
 * The ctx field is used to track device context.
 * The desc field is used to track userspace buffer index.
 * Note that ctx and desc are not members of this structure; they are declared
 * in struct ubuf_info_msgzc, which embeds a struct ubuf_info.
 */
struct ubuf_info {
        const struct ubuf_info_ops *ops;
        refcount_t refcnt;
        /* NOTE(review): presumably holds SKBFL_* bits - confirm */
        u8 flags;
};

/* A struct ubuf_info extended with extra state. uarg_to_msgzc() converts a
 * pointer to the embedded ubuf member back into this structure.
 */
struct ubuf_info_msgzc {
        struct ubuf_info ubuf;

        /* Two alternative layouts that share the same storage. */
        union {
                struct {
                        unsigned long desc;
                        void *ctx;
                };
                struct {
                        u32 id;
                        u16 len;
                        u16 zerocopy:1;
                        u32 bytelen;
                };
        };

        /* The type taken by mm_account_pinned_pages() and
         * mm_unaccount_pinned_pages().
         */
        struct mmpin {
                struct user_struct *user;
                unsigned int num_pg;
        } mmp;
};

/* The destructor_arg of the skb's shared info, cast to struct ubuf_info *. */
#define skb_uarg(SKB)        ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
/* Get the enclosing struct ubuf_info_msgzc from a pointer to its ubuf member. */
#define uarg_to_msgzc(ubuf_ptr)        container_of((ubuf_ptr), struct ubuf_info_msgzc, \
                                             ubuf)

/* Helpers operating on a struct mmpin; only declared here, defined elsewhere. */
int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
void mm_unaccount_pinned_pages(struct mmpin *mmp);

/* Preserve some data across TX submission and completion.
 *
 * Note, this state is stored in the driver. Extending the layout
 * might need some special care.
 *
 * This structure is also a member of a union with hwtstamps in
 * struct skb_shared_info, so a change of its size can change the size
 * of that union.
 */
struct xsk_tx_metadata_compl {
        /* NOTE(review): presumably where the TX timestamp gets written - confirm */
        __u64 *tx_timestamp;
};

/* This data is invariant across clones and lives at
 * the end of the header data, ie. at skb->end.
 *
 * Field order matters: see the warnings on dataref and frags below.
 */
struct skb_shared_info {
        /* SKBFL_* bits */
        __u8                flags;
        __u8                meta_len;
        __u8                nr_frags;
        /* SKBTX_* bits */
        __u8                tx_flags;
        /* may be set to GSO_BY_FRAGS */
        unsigned short        gso_size;
        /* Warning: this field is not always filled in (UFO)! */
        unsigned short        gso_segs;
        struct sk_buff        *frag_list;
        /* hwtstamps and xsk_meta share the same storage */
        union {
                struct skb_shared_hwtstamps hwtstamps;
                struct xsk_tx_metadata_compl xsk_meta;
        };
        /* SKB_GSO_* bits */
        unsigned int        gso_type;
        u32                tskey;

        /*
         * Warning : all fields before dataref are cleared in __alloc_skb()
         */
        /* split into two 16-bit halves, see "DOC: dataref and headerless skbs" */
        atomic_t        dataref;
        unsigned int        xdp_frags_size;

        /* Intermediate layers must ensure that destructor_arg
         * remains valid until skb destructor */
        void *                destructor_arg;

        /* must be last field, see pskb_expand_head() */
        skb_frag_t        frags[MAX_SKB_FRAGS];
};

/**
 * DOC: dataref and headerless skbs
 *
 * Transport layers send out clones of payload skbs they hold for
 * retransmissions. To allow lower layers of the stack to prepend their headers
 * we split &skb_shared_info.dataref into two halves.
 * The lower 16 bits count the overall number of references.
 * The higher 16 bits indicate how many of the references are payload-only.
 * skb_header_cloned() checks if skb is allowed to add / write the headers.
 *
 * The creator of the skb (e.g. TCP) marks its skb as &sk_buff.nohdr
 * (via __skb_header_release()). Any clone created from marked skb will get
 * &sk_buff.hdr_len populated with the available headroom.
 * If that clone is the only one in existence, it is able to modify the
 * headroom at will. The sequence of calls inside the transport layer is::
 *
 *  <alloc skb>
 *  skb_reserve()
 *  __skb_header_release()
 *  skb_clone()
 *  // send the clone down the stack
 *
 * This is not a very generic construct and it depends on the transport layers
 * doing the right thing. In practice there's usually only one payload-only skb.
 * Having multiple payload-only skbs with different lengths of hdr_len is not
 * possible. The payload-only skbs should never leave their owner.
 */
/* Width of the lower half of &skb_shared_info.dataref; shifting right by this
 * much reaches the upper half (the payload-only reference count).
 */
#define SKB_DATAREF_SHIFT 16
/* Mask selecting the lower 16 bits (the overall reference count). */
#define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1)


/* skbuff clone status, see the @fclone member of struct sk_buff. */
enum {
        SKB_FCLONE_UNAVAILABLE,        /* skb has no fclone (from head_cache) */
        SKB_FCLONE_ORIG,        /* orig skb (from fclone_cache) */
        SKB_FCLONE_CLONE,        /* companion fclone skb (from fclone_cache) */
};

/* SKB_GSO_* flags used in the gso_type member of struct skb_shared_info.
 * Each value is a single bit; bits 0 to 18 are in use.
 */
enum {
        SKB_GSO_TCPV4 = 1 << 0,

        /* This indicates the skb is from an untrusted source. */
        SKB_GSO_DODGY = 1 << 1,

        /* This indicates the tcp segment has CWR set. */
        SKB_GSO_TCP_ECN = 1 << 2,

        SKB_GSO_TCP_FIXEDID = 1 << 3,

        SKB_GSO_TCPV6 = 1 << 4,

        SKB_GSO_FCOE = 1 << 5,

        SKB_GSO_GRE = 1 << 6,

        SKB_GSO_GRE_CSUM = 1 << 7,

        SKB_GSO_IPXIP4 = 1 << 8,

        SKB_GSO_IPXIP6 = 1 << 9,

        SKB_GSO_UDP_TUNNEL = 1 << 10,

        SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,

        SKB_GSO_PARTIAL = 1 << 12,

        SKB_GSO_TUNNEL_REMCSUM = 1 << 13,

        SKB_GSO_SCTP = 1 << 14,

        SKB_GSO_ESP = 1 << 15,

        SKB_GSO_UDP = 1 << 16,

        SKB_GSO_UDP_L4 = 1 << 17,

        SKB_GSO_FRAGLIST = 1 << 18,
};

/* Defined only when BITS_PER_LONG is larger than 32. */
#if BITS_PER_LONG > 32
#define NET_SKBUFF_DATA_USES_OFFSET 1
#endif

/* sk_buff_data_t is an unsigned int when NET_SKBUFF_DATA_USES_OFFSET is
 * defined and an unsigned char pointer otherwise.
 * NOTE(review): the unsigned int form is presumably an offset from skb->head -
 * confirm.
 */
#ifdef NET_SKBUFF_DATA_USES_OFFSET
typedef unsigned int sk_buff_data_t;
#else
typedef unsigned char *sk_buff_data_t;
#endif

/* Clock identifiers, see the @tstamp_type member of struct sk_buff.
 * __SKB_CLOCK_MAX is an alias for the last identifier, SKB_CLOCK_TAI.
 */
enum skb_tstamp_type {
        SKB_CLOCK_REALTIME,
        SKB_CLOCK_MONOTONIC,
        SKB_CLOCK_TAI,
        __SKB_CLOCK_MAX = SKB_CLOCK_TAI,
};

/**
 * DOC: Basic sk_buff geometry
 *
 * struct sk_buff itself is a metadata structure and does not hold any packet
 * data. All the data is held in associated buffers.
 *
 * &sk_buff.head points to the main "head" buffer. The head buffer is divided
 * into two parts:
 *
 *  - data buffer, containing headers and sometimes payload;
 *    this is the part of the skb operated on by the common helpers
 *    such as skb_put() or skb_pull();
 *  - shared info (struct skb_shared_info) which holds an array of pointers
 *    to read-only data in the (page, offset, length) format.
 *
 * Optionally &skb_shared_info.frag_list may point to another skb.
 *
 * Basic diagram may look like this::
 *
 *                                  ---------------
 *                                 | sk_buff       |
 *                                  ---------------
 *     ,---------------------------  + head
 *    /          ,-----------------  + data
 *   /          /      ,-----------  + tail
 *  |          |      |            , + end
 *  |          |      |           |
 *  v          v      v           v
 *   -----------------------------------------------
 *  | headroom | data |  tailroom | skb_shared_info |
 *   -----------------------------------------------
 *                                 + [page frag]
 *                                 + [page frag]
 *                                 + [page frag]
 *                                 + [page frag]       ---------
 *                                 + frag_list    --> | sk_buff |
 *                                                     ---------
 *
 */

/**
 *        struct sk_buff - socket buffer
 *        @next: Next buffer in list
 *        @prev: Previous buffer in list
 *        @tstamp: Time we arrived/left
 *        @skb_mstamp_ns: (aka @tstamp) earliest departure time; start point
 *                for retransmit timer
 *        @rbnode: RB tree node, alternative to next/prev for netem/tcp
 *        @list: queue head
 *        @ll_node: anchor in an llist (eg socket defer_list)
 *        @sk: Socket we are owned by
 *        @dev: Device we arrived on/are leaving by
 *        @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL
 *        @cb: Control buffer. Free for use by every layer. Put private vars here
 *        @_skb_refdst: destination entry (with norefcount bit)
 *        @len: Length of actual data
 *        @data_len: Data length
 *        @mac_len: Length of link layer header
 *        @hdr_len: writable header length of cloned skb
 *        @csum: Checksum (must include start/offset pair)
 *        @csum_start: Offset from skb->head where checksumming should start
 *        @csum_offset: Offset from csum_start where checksum should be stored
 *        @priority: Packet queueing priority
 *        @ignore_df: allow local fragmentation
 *        @cloned: Head may be cloned (check refcnt to be sure)
 *        @ip_summed: Driver fed us an IP checksum
 *        @nohdr: Payload reference only, must not modify header
 *        @pkt_type: Packet class
 *        @fclone: skbuff clone status
 *        @ipvs_property: skbuff is owned by ipvs
 *        @inner_protocol_type: whether the inner protocol is
 *                ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO
 *        @remcsum_offload: remote checksum offload is enabled
 *        @offload_fwd_mark: Packet was L2-forwarded in hardware
 *        @offload_l3_fwd_mark: Packet was L3-forwarded in hardware
 *        @tc_skip_classify: do not classify packet. set by IFB device
 *        @tc_at_ingress: used within tc_classify to distinguish in/egress
 *        @redirected: packet was redirected by packet classifier
 *        @from_ingress: packet was redirected from the ingress path
 *        @nf_skip_egress: packet shall skip nf egress - see netfilter_netdev.h
 *        @peeked: this packet has been seen already, so stats have been
 *                done for it, don't do them again
 *        @nf_trace: netfilter packet trace flag
 *        @protocol: Packet protocol from driver
 *        @destructor: Destruct function
 *        @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
 *        @_sk_redir: socket redirection information for skmsg
 *        @_nfct: Associated connection, if any (with nfctinfo bits)
 *        @skb_iif: ifindex of device we arrived on
 *        @tc_index: Traffic control index
 *        @hash: the packet hash
 *        @queue_mapping: Queue mapping for multiqueue devices
 *        @head_frag: skb was allocated from page fragments,
 *                not allocated by kmalloc() or vmalloc().
 *        @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
 *        @pp_recycle: mark the packet for recycling instead of freeing (implies
 *                page_pool support on driver)
 *        @active_extensions: active extensions (skb_ext_id types)
 *        @ndisc_nodetype: router type (from link layer)
 *        @ooo_okay: allow the mapping of a socket to a queue to be changed
 *        @l4_hash: indicate hash is a canonical 4-tuple hash over transport
 *                ports.
 *        @sw_hash: indicates hash was computed in software stack
 *        @wifi_acked_valid: wifi_acked was set
 *        @wifi_acked: whether frame was acked on wifi or not
 *        @no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
 *        @encapsulation: indicates the inner headers in the skbuff are valid
 *        @encap_hdr_csum: software checksum is needed
 *        @csum_valid: checksum is already valid
 *        @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL
 *        @csum_complete_sw: checksum was completed by software
 *        @csum_level: indicates the number of consecutive checksums found in
 *                the packet minus one that have been verified as
 *                CHECKSUM_UNNECESSARY (max 3)
 *        @dst_pending_confirm: need to confirm neighbour
 *        @decrypted: Decrypted SKB
 *        @slow_gro: state present at GRO time, slower prepare step required
 *        @tstamp_type: When set, skb->tstamp has the
 *                delivery_time clock base of skb->tstamp.
 *        @napi_id: id of the NAPI struct this skb came from
 *        @sender_cpu: (aka @napi_id) source CPU in XPS
 *        @alloc_cpu: CPU which did the skb allocation.
 *        @secmark: security marking
 *        @mark: Generic packet mark
 *        @reserved_tailroom: (aka @mark) number of bytes of free space available
 *                at the tail of an sk_buff
 *        @vlan_all: vlan fields (proto & tci)
 *        @vlan_proto: vlan encapsulation protocol
 *        @vlan_tci: vlan tag control information
 *        @inner_protocol: Protocol (encapsulation)
 *        @inner_ipproto: (aka @inner_protocol) stores ipproto when
 *                skb->inner_protocol_type == ENCAP_TYPE_IPPROTO;
 *        @inner_transport_header: Inner transport layer header (encapsulation)
 *        @inner_network_header: Network layer header (encapsulation)
 *        @inner_mac_header: Link layer header (encapsulation)
 *        @transport_header: Transport layer header
 *        @network_header: Network layer header
 *        @mac_header: Link layer header
 *        @kcov_handle: KCOV remote handle for remote coverage collection
 *        @tail: Tail pointer
 *        @end: End pointer
 *        @head: Head of buffer
 *        @data: Data head pointer
 *        @truesize: Buffer size
 *        @users: User count - see {datagram,tcp}.c
 *        @extensions: allocated extensions, valid if active_extensions is nonzero
 */

struct sk_buff {
        /* Linkage overlay: the next/prev pointers (plus dev/dev_scratch),
         * rbnode, list and ll_node all share the same storage.
         */
        union {
                struct {
                        /* These two members must be first to match sk_buff_head. */
                        struct sk_buff                *next;
                        struct sk_buff                *prev;

                        union {
                                struct net_device        *dev;
                                /* Some protocols might use this space to store information,
                                 * while device pointer would be NULL.
                                 * UDP receive path is one user.
                                 */
                                unsigned long                dev_scratch;
                        };
                };
                struct rb_node                rbnode; /* used in netem, ip4 defrag, and tcp stack */
                struct list_head        list;
                struct llist_node        ll_node;
        };

        struct sock                *sk;

        /* tstamp and skb_mstamp_ns are two views of the same 64-bit slot. */
        union {
                ktime_t                tstamp;
                u64                skb_mstamp_ns; /* earliest departure time */
        };
        /*
         * This is the control buffer. It is free to use for every
         * layer. Please put your private variables there. If you
         * want to keep them across layers you have to do a skb_clone()
         * first. This is owned by whoever has the skb queued ATM.
         */
        char                        cb[48] __aligned(8);

        /* _skb_refdst/destructor, tcp_tsorted_anchor and (with
         * CONFIG_NET_SOCK_MSG) _sk_redir overlay each other.
         */
        union {
                struct {
                        /* dst pointer with SKB_DST_NOREF kept in the low
                         * order bit; read it through skb_dst().
                         */
                        unsigned long        _skb_refdst;
                        void                (*destructor)(struct sk_buff *skb);
                };
                struct list_head        tcp_tsorted_anchor;
#ifdef CONFIG_NET_SOCK_MSG
                unsigned long                _sk_redir;
#endif
        };

#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        unsigned long                 _nfct;
#endif
        unsigned int                len,
                                data_len;
        __u16                        mac_len,
                                hdr_len;

        /* Following fields are _not_ copied in __copy_skb_header()
         * Note that queue_mapping is here mostly to fill a hole.
         */
        __u16                        queue_mapping;

/* if you move cloned around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define CLONED_MASK        (1 << 7)
#else
#define CLONED_MASK        1
#endif
#define CLONED_OFFSET                offsetof(struct sk_buff, __cloned_offset)

        /* Zero-length marker: CLONED_OFFSET is its offset, i.e. the offset
         * of the bitfield byte that starts with the cloned bit below.
         */
        /* private: */
        __u8                        __cloned_offset[0];
        /* public: */
        __u8                        cloned:1,
                                nohdr:1,
                                fclone:2,
                                peeked:1,
                                head_frag:1,
                                pfmemalloc:1,
                                pp_recycle:1; /* page_pool recycle indicator */
#ifdef CONFIG_SKB_EXTENSIONS
        __u8                        active_extensions;
#endif

        /* Fields enclosed in headers group are copied
         * using a single memcpy() in __copy_skb_header()
         */
        struct_group(headers,

        /* Zero-length marker used by PKT_TYPE_OFFSET to locate the byte
         * that starts with pkt_type.
         */
        /* private: */
        __u8                        __pkt_type_offset[0];
        /* public: */
        __u8                        pkt_type:3; /* see PKT_TYPE_MAX */
        __u8                        ignore_df:1;
        __u8                        dst_pending_confirm:1;
        __u8                        ip_summed:2;
        __u8                        ooo_okay:1;

        /* Zero-length marker used by SKB_BF_MONO_TC_OFFSET to locate the
         * byte that starts with tstamp_type.
         */
        /* private: */
        __u8                        __mono_tc_offset[0];
        /* public: */
        __u8                        tstamp_type:2;        /* See skb_tstamp_type */
#ifdef CONFIG_NET_XGRESS
        __u8                        tc_at_ingress:1;        /* See TC_AT_INGRESS_MASK */
        __u8                        tc_skip_classify:1;
#endif
        __u8                        remcsum_offload:1;
        __u8                        csum_complete_sw:1;
        __u8                        csum_level:2;
        __u8                        inner_protocol_type:1;

        __u8                        l4_hash:1;
        __u8                        sw_hash:1;
#ifdef CONFIG_WIRELESS
        __u8                        wifi_acked_valid:1;
        __u8                        wifi_acked:1;
#endif
        __u8                        no_fcs:1;
        /* Indicates the inner headers are valid in the skbuff. */
        __u8                        encapsulation:1;
        __u8                        encap_hdr_csum:1;
        __u8                        csum_valid:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
        __u8                        ndisc_nodetype:2;
#endif

#if IS_ENABLED(CONFIG_IP_VS)
        __u8                        ipvs_property:1;
#endif
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
        __u8                        nf_trace:1;
#endif
#ifdef CONFIG_NET_SWITCHDEV
        __u8                        offload_fwd_mark:1;
        __u8                        offload_l3_fwd_mark:1;
#endif
        __u8                        redirected:1;
#ifdef CONFIG_NET_REDIRECT
        __u8                        from_ingress:1;
#endif
#ifdef CONFIG_NETFILTER_SKIP_EGRESS
        __u8                        nf_skip_egress:1;
#endif
#ifdef CONFIG_SKB_DECRYPTED
        __u8                        decrypted:1;
#endif
        __u8                        slow_gro:1;
#if IS_ENABLED(CONFIG_IP_SCTP)
        __u8                        csum_not_inet:1;
#endif

#if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
        __u16                        tc_index;        /* traffic control index */
#endif

        /* CPU which did the skb allocation. */
        u16                        alloc_cpu;

        /* csum overlays the csum_start/csum_offset pair. */
        union {
                __wsum                csum;
                struct {
                        __u16        csum_start;
                        __u16        csum_offset;
                };
        };
        __u32                        priority;
        int                        skb_iif;
        __u32                        hash;
        /* vlan_all is the combined 32-bit view of vlan_proto and vlan_tci. */
        union {
                u32                vlan_all;
                struct {
                        __be16        vlan_proto;
                        __u16        vlan_tci;
                };
        };
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
        /* napi_id and sender_cpu alias the same slot. */
        union {
                unsigned int        napi_id;
                unsigned int        sender_cpu;
        };
#endif
#ifdef CONFIG_NETWORK_SECMARK
        __u32                secmark;
#endif

        /* mark and reserved_tailroom alias the same slot. */
        union {
                __u32                mark;
                __u32                reserved_tailroom;
        };

        /* inner_ipproto aliases inner_protocol. */
        union {
                __be16                inner_protocol;
                __u8                inner_ipproto;
        };

        /* Inner (encapsulation) header fields. */
        __u16                        inner_transport_header;
        __u16                        inner_network_header;
        __u16                        inner_mac_header;

        /* Protocol and outer header fields. */
        __be16                        protocol;
        __u16                        transport_header;
        __u16                        network_header;
        __u16                        mac_header;

#ifdef CONFIG_KCOV
        u64                        kcov_handle;
#endif

        ); /* end headers group */

        /* These elements must be at the end, see alloc_skb() for details.  */
        sk_buff_data_t                tail;
        sk_buff_data_t                end;
        unsigned char                *head,
                                *data;
        unsigned int                truesize;
        refcount_t                users;

#ifdef CONFIG_SKB_EXTENSIONS
        /* only usable after checking ->active_extensions != 0 */
        struct skb_ext                *extensions;
#endif
};

/* if you move pkt_type around you also must adapt those constants */
/* PKT_TYPE_MAX is the mask of the 3-bit pkt_type bitfield inside the byte
 * found at PKT_TYPE_OFFSET: the top three bits when __BIG_ENDIAN_BITFIELD
 * is defined, the low three bits otherwise.
 */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX        (7 << 5)
#else
#define PKT_TYPE_MAX        7
#endif
#define PKT_TYPE_OFFSET                offsetof(struct sk_buff, __pkt_type_offset)

/* if you move tc_at_ingress or tstamp_type
 * around, you also must adapt these constants.
 *
 * Both masks apply to the byte found at SKB_BF_MONO_TC_OFFSET:
 * SKB_TSTAMP_TYPE_MASK covers the 2-bit tstamp_type field and
 * TC_AT_INGRESS_MASK the single tc_at_ingress bit declared right after it.
 * SKB_TSTAMP_TYPE_RSHIFT is only defined for __BIG_ENDIAN_BITFIELD.
 */
#ifdef __BIG_ENDIAN_BITFIELD
#define SKB_TSTAMP_TYPE_MASK                (3 << 6)
#define SKB_TSTAMP_TYPE_RSHIFT                (6)
#define TC_AT_INGRESS_MASK                (1 << 5)
#else
#define SKB_TSTAMP_TYPE_MASK                (3)
#define TC_AT_INGRESS_MASK                (1 << 2)
#endif
#define SKB_BF_MONO_TC_OFFSET                offsetof(struct sk_buff, __mono_tc_offset)

#ifdef __KERNEL__
/*
 *        Handling routines are only of interest to the kernel
 */

/* Allocation flag bits. alloc_skb_fclone() passes SKB_ALLOC_FCLONE as the
 * flags argument of __alloc_skb().
 * NOTE(review): SKB_ALLOC_RX and SKB_ALLOC_NAPI are presumably flags for the
 * same argument; their users are not in this part of the file - confirm.
 */
#define SKB_ALLOC_FCLONE        0x01
#define SKB_ALLOC_RX                0x02
#define SKB_ALLOC_NAPI                0x04

/**
 * skb_pfmemalloc - Test if the skb was allocated from PFMEMALLOC reserves
 * @skb: buffer
 *
 * Returns the state of the skb's pfmemalloc flag. The flag being set is
 * hinted to the compiler as the unlikely case.
 */
static inline bool skb_pfmemalloc(const struct sk_buff *skb)
{
        const bool from_reserves = skb->pfmemalloc;

        return unlikely(from_reserves);
}

/*
 * skb might have a dst pointer attached, refcounted or not.
 * _skb_refdst low order bit is set if refcount was _not_ taken
 *
 * SKB_DST_NOREF is that flag bit; SKB_DST_PTRMASK clears it to recover the
 * pointer value. The replacement list is fully parenthesized so that it
 * always expands as a single operand.
 */
#define SKB_DST_NOREF        1UL
#define SKB_DST_PTRMASK        (~(SKB_DST_NOREF))

/**
 * skb_dst - returns skb dst_entry
 * @skb: buffer
 *
 * Strips the SKB_DST_NOREF flag from skb->_skb_refdst and returns what is
 * left as a dst_entry pointer, whether or not a reference was taken.
 */
static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
        struct dst_entry *dst;

        /* A dst stored without a reference is only expected to be looked at
         * from inside an rcu_read_lock (or _bh) section; warn otherwise.
         */
        WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) &&
                !rcu_read_lock_held() &&
                !rcu_read_lock_bh_held());

        dst = (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK);
        return dst;
}

/**
 * skb_dst_set - sets skb dst
 * @skb: buffer
 * @dst: dst entry
 *
 * Stores @dst in the skb, assuming the caller took a reference on it that
 * skb_dst_drop() should release later. A non-NULL @dst also raises the
 * skb's slow_gro flag; a NULL @dst leaves that flag as it was.
 */
static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
{
        if (dst)
                skb->slow_gro = 1;

        skb->_skb_refdst = (unsigned long)dst;
}

/**
 * skb_dst_set_noref - sets skb dst without a reference
 * @skb: buffer
 * @dst: dst entry
 *
 * Stores @dst in the skb with the SKB_DST_NOREF flag set, assuming no
 * reference was taken on it. Warns when neither rcu_read_lock_held() nor
 * rcu_read_lock_bh_held() reports a read-side section. A non-NULL @dst
 * also raises the skb's slow_gro flag.
 */
static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
{
        WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());

        if (dst)
                skb->slow_gro = 1;

        /* Tag the stored value so readers know no reference is held. */
        skb->_skb_refdst = SKB_DST_NOREF | (unsigned long)dst;
}

/**
 * skb_dst_is_noref - Test if skb dst isn't refcounted
 * @skb: buffer
 *
 * Returns true only when the SKB_DST_NOREF flag is set and skb_dst()
 * yields a non-NULL pointer.
 */
static inline bool skb_dst_is_noref(const struct sk_buff *skb)
{
        /* Flag clear: skb_dst() is not consulted at all. */
        if (!(skb->_skb_refdst & SKB_DST_NOREF))
                return false;

        return skb_dst(skb) != NULL;
}

/* When skb->pkt_type is mangled on behalf of user space (applications such
 * as nft, tc, etc), only a conservative subset of the possible pkt_types
 * may be set: every value up to and including PACKET_OTHERHOST.
 */
static inline bool skb_pkt_type_ok(u32 ptype)
{
        return !(ptype > PACKET_OTHERHOST);
}

/**
 * skb_napi_id - Returns the skb's NAPI id
 * @skb: buffer
 *
 * Returns skb->napi_id when CONFIG_NET_RX_BUSY_POLL is enabled and 0
 * otherwise.
 */
static inline unsigned int skb_napi_id(const struct sk_buff *skb)
{
        unsigned int id = 0;

#ifdef CONFIG_NET_RX_BUSY_POLL
        id = skb->napi_id;
#endif
        return id;
}

/* Report the skb's wifi_acked_valid flag; always false when CONFIG_WIRELESS
 * is not enabled (the flag does not exist in that configuration).
 */
static inline bool skb_wifi_acked_valid(const struct sk_buff *skb)
{
        bool valid = false;

#ifdef CONFIG_WIRELESS
        valid = skb->wifi_acked_valid;
#endif
        return valid;
}

/**
 * skb_unref - decrement the skb's reference count
 * @skb: buffer
 *
 * Returns true if we can free the skb: either the caller held the only
 * reference, or this call dropped the last one. Returns false for a NULL
 * @skb and when other references remain.
 */
static inline bool skb_unref(struct sk_buff *skb)
{
        if (unlikely(!skb))
                return false;

        if (likely(refcount_read(&skb->users) == 1)) {
                /* Sole user: no decrement is performed, only a read barrier. */
                smp_rmb();
                return true;
        }

        if (likely(!refcount_dec_and_test(&skb->users)))
                return false;

        return true;
}

/* Drop this skb's share of shinfo->dataref.
 *
 * Returns true when the skb is not cloned, when dataref already equals the
 * amount this skb accounts for, or when subtracting that amount brings
 * dataref to zero. Returns false when the subtraction leaves it non-zero.
 */
static inline bool skb_data_unref(const struct sk_buff *skb,
                                  struct skb_shared_info *shinfo)
{
        int bias = 1;

        if (!skb->cloned)
                return true;

        /* With nohdr set the skb accounts for an extra
         * (1 << SKB_DATAREF_SHIFT) on top of the base count of 1.
         */
        if (skb->nohdr)
                bias += 1 << SKB_DATAREF_SHIFT;

        if (atomic_read(&shinfo->dataref) == bias) {
                smp_rmb();
                return true;
        }

        return atomic_sub_return(bias, &shinfo->dataref) == 0;
}

void __fix_address sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
                                      enum skb_drop_reason reason);

/* Drop @skb for @reason without naming a socket: forwards to
 * sk_skb_reason_drop() with a NULL sk.
 */
static inline void
kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
        struct sock *no_sk = NULL;

        sk_skb_reason_drop(no_sk, skb, reason);
}

/**
 * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason
 * @skb: buffer to free
 *
 * Shorthand for kfree_skb_reason() with SKB_DROP_REASON_NOT_SPECIFIED.
 */
static inline void kfree_skb(struct sk_buff *skb)
{
        kfree_skb_reason(skb,
                         SKB_DROP_REASON_NOT_SPECIFIED);
}

void skb_release_head_state(struct sk_buff *skb);
void kfree_skb_list_reason(struct sk_buff *segs,
                           enum skb_drop_reason reason);
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt);
void skb_tx_error(struct sk_buff *skb);

/* Hand the list @segs to kfree_skb_list_reason() with
 * SKB_DROP_REASON_NOT_SPECIFIED as the reason.
 */
static inline void kfree_skb_list(struct sk_buff *segs)
{
        kfree_skb_list_reason(segs,
                              SKB_DROP_REASON_NOT_SPECIFIED);
}

#ifdef CONFIG_TRACEPOINTS
/* Out-of-line version, used when CONFIG_TRACEPOINTS is enabled. */
void consume_skb(struct sk_buff *skb);
#else
/* Without CONFIG_TRACEPOINTS, consume_skb() simply forwards to kfree_skb().
 * The call is a plain statement: a void function must not return an
 * expression.
 */
static inline void consume_skb(struct sk_buff *skb)
{
        kfree_skb(skb);
}
#endif

void __consume_stateless_skb(struct sk_buff *skb);
void  __kfree_skb(struct sk_buff *skb);

void kfree_skb_partial(struct sk_buff *skb, bool head_stolen);
bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
                      bool *fragstolen, int *delta_truesize);

struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags,
                            int node);
struct sk_buff *__build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb_around(struct sk_buff *skb,
                                 void *data, unsigned int frag_size);
void skb_attempt_defer_free(struct sk_buff *skb);

struct sk_buff *napi_build_skb(void *data, unsigned int frag_size);
struct sk_buff *slab_build_skb(void *data);

/**
 * alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @priority: allocation mask
 *
 * Convenience wrapper: calls __alloc_skb() with no allocation flags and
 * NUMA_NO_NODE as the node.
 */
static inline struct sk_buff *alloc_skb(unsigned int size,
                                        gfp_t priority)
{
        const int no_flags = 0;

        return __alloc_skb(size, priority, no_flags, NUMA_NO_NODE);
}

struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
                                     unsigned long data_len,
                                     int max_page_order,
                                     int *errcode,
                                     gfp_t gfp_mask);
struct sk_buff *alloc_skb_for_msg(struct sk_buff *first);

/* Layout of fast clones : [skb1][skb2][fclone_ref] */
struct sk_buff_fclones {
        /* skb_fclone_busy() recovers this container from a pointer to skb1. */
        struct sk_buff        skb1;

        /* Companion buffer; skb_fclone_busy() inspects skb2.sk. */
        struct sk_buff        skb2;

        /* Reference count; skb_fclone_busy() treats a value above 1 as busy. */
        refcount_t        fclone_ref;
};

/**
 *        skb_fclone_busy - check if fclone is busy
 *        @sk: socket
 *        @skb: buffer
 *
 * Returns true if skb is a fast clone, and its clone is not freed.
 * Because some drivers call skb_orphan() in their ndo_start_xmit(),
 * the clone's socket is also compared against @sk.
 */
static inline bool skb_fclone_busy(const struct sock *sk,
                                   const struct sk_buff *skb)
{
        const struct sk_buff_fclones *fclones =
                container_of(skb, struct sk_buff_fclones, skb1);

        /* fclones is only dereferenced once skb is known to be the
         * SKB_FCLONE_ORIG member of a pair.
         */
        if (skb->fclone != SKB_FCLONE_ORIG)
                return false;

        if (!(refcount_read(&fclones->fclone_ref) > 1))
                return false;

        return READ_ONCE(fclones->skb2.sk) == sk;
}

/**
 * alloc_skb_fclone - allocate a network buffer from fclone cache
 * @size: size to allocate
 * @priority: allocation mask
 *
 * Convenience wrapper: calls __alloc_skb() with SKB_ALLOC_FCLONE as the
 * flags and NUMA_NO_NODE as the node.
 */
static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
                                               gfp_t priority)
{
        const int flags = SKB_ALLOC_FCLONE;

        return __alloc_skb(size, priority, flags, NUMA_NO_NODE);
}

struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
void skb_headers_offset_update(struct sk_buff *skb, int off);
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority);
void skb_copy_header(struct sk_buff *new, const struct sk_buff *old);
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority);
struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
                                   gfp_t gfp_mask, bool fclone);
/* __pskb_copy_fclone() with its fclone argument fixed to false. */
static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom,
                                          gfp_t gfp_mask)
{
        const bool fclone = false;

        return __pskb_copy_fclone(skb, headroom, gfp_mask, fclone);
}

int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask);
struct sk_buff *skb_realloc_headroom(struct sk_buff *skb,
                                     unsigned int headroom);
struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom);
struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom,
                                int newtailroom, gfp_t priority);
int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
                                     int offset, int len);
int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg,
                              int offset, int len);
int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer);
int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error);

/**
 *        skb_pad                        -        zero pad the tail of an skb
 *        @skb: buffer to pad
 *        @pad: space to pad
 *
 *        Makes sure the buffer is followed by a zero-filled padding area.
 *        Network drivers that may DMA or transfer data beyond the buffer
 *        end onto the wire rely on this.
 *
 *        May return error in out of memory cases. The skb is freed on
 *        error: __skb_pad() is called with free_on_error set.
 */
static inline int skb_pad(struct sk_buff *skb, int pad)
{
        const bool free_on_error = true;

        return __skb_pad(skb, pad, free_on_error);
}
/* dev_kfree_skb(a) expands to consume_skb(a). */
#define dev_kfree_skb(a)        consume_skb(a)

int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
                         int offset, size_t size, size_t max_frags);

/* State block handed to skb_prepare_seq_read(), skb_seq_read() and
 * skb_abort_seq_read().
 * NOTE(review): the field meanings are defined by those out-of-line
 * functions and are not visible in this header; see their implementation
 * before relying on any of them.
 */
struct skb_seq_state {
        __u32                lower_offset;
        __u32                upper_offset;
        __u32                frag_idx;
        __u32                stepped_offset;
        struct sk_buff        *root_skb;
        struct sk_buff        *cur_skb;
        __u8                *frag_data;
        __u32                frag_off;
};

void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
                          unsigned int to, struct skb_seq_state *st);
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
                          struct skb_seq_state *st);
void skb_abort_seq_read(struct skb_seq_state *st);

unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
                           unsigned int to, struct ts_config *config);

/*
 * Packet hash types specify the type of hash in skb_set_hash.
 *
 * Hash types refer to the protocol layer addresses which are used to
 * construct a packet's hash. The hashes are used to differentiate or identify
 * flows of the protocol layer for the hash type. Hash types are either
 * layer-2 (L2), layer-3 (L3), or layer-4 (L4).
 *
 * Properties of hashes:
 *
 * 1) Two packets in different flows have different hash values
 * 2) Two packets in the same flow should have the same hash value
 *
 * A hash at a higher layer is considered to be more specific. A driver should
 * set the most specific hash possible.
 *
 * A driver cannot indicate a more specific hash than the layer at which a hash
 * was computed. For instance an L3 hash cannot be set as an L4 hash.
 *
 * A driver may indicate a hash level which is less specific than the
 * actual layer the hash was computed on. For instance, a hash computed
 * at L4 may be considered an L3 hash. This should only be done if the
 * driver can't unambiguously determine that the HW computed the hash at
 * the higher layer. Note that the "should" in the second property above
 * permits this.
 */
enum pkt_hash_types {
        /* Undefined type */
        PKT_HASH_TYPE_NONE = 0,
        /* L2 hash, input: src_MAC, dest_MAC */
        PKT_HASH_TYPE_L2 = 1,
        /* L3 hash, input: src_IP, dst_IP */
        PKT_HASH_TYPE_L3 = 2,
        /* L4 hash, input: src_IP, dst_IP, src_port, dst_port */
        PKT_HASH_TYPE_L4 = 3,
};

/* Forget any recorded hash: clears the sw_hash and l4_hash flags and zeroes
 * skb->hash.
 */
static inline void skb_clear_hash(struct sk_buff *skb)
{
        skb->l4_hash = 0;
        skb->sw_hash = 0;
        skb->hash = 0;
}

/* Clear the recorded hash unless the l4_hash flag is set. */
static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb)
{
        if (skb->l4_hash)
                return;

        skb_clear_hash(skb);
}

/* Record @hash in the skb together with its two qualifiers: @is_sw is
 * stored in sw_hash and @is_l4 in l4_hash.
 */
static inline void
__skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4)
{
        skb->hash = hash;
        skb->sw_hash = is_sw;
        skb->l4_hash = is_l4;
}

/* Used by drivers to set hash from HW: the hash is recorded as not
 * software-computed, and as an L4 hash only when @type is PKT_HASH_TYPE_L4.
 */
static inline void
skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
{
        const bool is_l4 = (type == PKT_HASH_TYPE_L4);

        __skb_set_hash(skb, hash, false, is_l4);
}

/* Record @hash as a software-computed hash (sw_hash set); @is_l4 is passed
 * through to __skb_set_hash().
 */
static inline void
__skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4)
{
        const bool is_sw = true;

        __skb_set_hash(skb, hash, is_sw, is_l4);
}

u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb);

/* __skb_get_hash_symmetric_net() called with a NULL net. */
static inline u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
{
        const struct net *no_net = NULL;

        return __skb_get_hash_symmetric_net(no_net, skb);
}

void __skb_get_hash_net(const struct net *net, struct sk_buff *skb);
u32 skb_get_poff(const struct sk_buff *skb);
u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
                   const struct flow_keys_basic *keys, int hlen);
__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
                            const void *data, int hlen_proto);

/* __skb_flow_get_ports() without an explicit data buffer: data is NULL and
 * hlen_proto is 0.
 */
static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
                                        int thoff, u8 ip_proto)
{
        const void *data = NULL;
        int hlen_proto = 0;

        return __skb_flow_get_ports(skb, thoff, ip_proto, data, hlen_proto);
}

void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
                             const struct flow_dissector_key *key,
                             unsigned int key_count);

struct bpf_flow_dissector;
u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
                     __be16 proto, int nhoff, int hlen, unsigned int flags);

bool __skb_flow_dissect(const struct net *net,
                        const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container, const void *data,
                        __be16 proto, int nhoff, int hlen, unsigned int flags);

/* Dissect @skb with @flow_dissector into @target_container. Forwards to
 * __skb_flow_dissect() with a NULL net, a NULL data pointer and zero for
 * proto, nhoff and hlen.
 */
static inline bool skb_flow_dissect(const struct sk_buff *skb,
                                    struct flow_dissector *flow_dissector,
                                    void *target_container, unsigned int flags)
{
        const void *data = NULL;
        int nhoff = 0;
        int hlen = 0;

        return __skb_flow_dissect(NULL, skb, flow_dissector,
                                  target_container, data, 0, nhoff, hlen,
                                  flags);
}

/* Zero @flow, then dissect @skb into it with flow_keys_dissector. net and
 * data are NULL; proto, nhoff and hlen are 0.
 */
static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
                                              struct flow_keys *flow,
                                              unsigned int flags)
{
        bool dissected;

        memset(flow, 0, sizeof(*flow));
        dissected = __skb_flow_dissect(NULL, skb, &flow_keys_dissector,
                                       flow, NULL, 0, 0, 0, flags);
        return dissected;
}

/* Zero @flow, then dissect into it with flow_keys_basic_dissector. All
 * remaining arguments are handed to __skb_flow_dissect() unchanged.
 */
static inline bool
skb_flow_dissect_flow_keys_basic(const struct net *net,
                                 const struct sk_buff *skb,
                                 struct flow_keys_basic *flow,
                                 const void *data, __be16 proto,
                                 int nhoff, int hlen, unsigned int flags)
{
        bool dissected;

        memset(flow, 0, sizeof(*flow));
        dissected = __skb_flow_dissect(net, skb, &flow_keys_basic_dissector,
                                       flow, data, proto, nhoff, hlen, flags);
        return dissected;
}

void skb_flow_dissect_meta(const struct sk_buff *skb,
                           struct flow_dissector *flow_dissector,
                           void *target_container);

/* Gets a skb connection tracking info, ctinfo map should be a
 * map of mapsize to translate enum ip_conntrack_info states
 * to user states.
 */
void
skb_flow_dissect_ct(const struct sk_buff *skb,
                    struct flow_dissector *flow_dissector,
                    void *target_container,
                    u16 *ctinfo_map, size_t mapsize,
                    bool post_ct, u16 zone);
void
skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
                             struct flow_dissector *flow_dissector,
                             void *target_container);

void skb_flow_dissect_hash(const struct sk_buff *skb,
                           struct flow_dissector *flow_dissector,
                           void *target_container);

/* Return skb->hash. When neither l4_hash nor sw_hash is set,
 * __skb_get_hash_net() is called for @net first.
 */
static inline __u32 skb_get_hash_net(const struct net *net, struct sk_buff *skb)
{
        if (skb->l4_hash || skb->sw_hash)
                return skb->hash;

        __skb_get_hash_net(net, skb);
        return skb->hash;
}

/* Return skb->hash. When neither l4_hash nor sw_hash is set,
 * __skb_get_hash_net() is called with a NULL net first.
 */
static inline __u32 skb_get_hash(struct sk_buff *skb)
{
        if (skb->l4_hash || skb->sw_hash)
                return skb->hash;

        __skb_get_hash_net(NULL, skb);
        return skb->hash;
}

/* Return skb->hash. When neither l4_hash nor sw_hash is set, a hash is
 * first obtained from @fl6 via __get_hash_from_flowi6() and recorded as a
 * software hash, flagged L4 according to flow_keys_have_l4().
 */
static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6)
{
        struct flow_keys keys;
        __u32 hash;

        if (skb->l4_hash || skb->sw_hash)
                return skb->hash;

        hash = __get_hash_from_flowi6(fl6, &keys);
        __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));

        return skb->hash;
}

__u32 skb_get_hash_perturb(const struct sk_buff *skb,
                           const siphash_key_t *perturb);

/* Return skb->hash as stored, without checking the l4_hash/sw_hash flags
 * and without computing anything.
 */
static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
{
        return skb->hash;
}

/* Copy the hash value and both of its qualifier flags (sw_hash, l4_hash)
 * from @from to @to.
 */
static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from)
{
        to->hash = from->hash;
        to->sw_hash = from->sw_hash;
        to->l4_hash = from->l4_hash;
}

/* Compare the decrypted flags of two buffers: returns
 * skb2->decrypted - skb1->decrypted, or 0 when CONFIG_SKB_DECRYPTED is not
 * enabled.
 */
static inline int skb_cmp_decrypted(const struct sk_buff *skb1,
                                    const struct sk_buff *skb2)
{
        int diff = 0;

#ifdef CONFIG_SKB_DECRYPTED
        diff = skb2->decrypted - skb1->decrypted;
#endif
        return diff;
}

/* Report the skb's decrypted flag; always false when CONFIG_SKB_DECRYPTED
 * is not enabled.
 */
static inline bool skb_is_decrypted(const struct sk_buff *skb)
{
        bool decrypted = false;

#ifdef CONFIG_SKB_DECRYPTED
        decrypted = skb->decrypted;
#endif
        return decrypted;
}

/* Propagate the decrypted flag from @from to @to. Does nothing when
 * CONFIG_SKB_DECRYPTED is not enabled.
 */
static inline void skb_copy_decrypted(struct sk_buff *to,
                                      const struct sk_buff *from)
{
#ifdef CONFIG_SKB_DECRYPTED
        const bool decrypted = from->decrypted;

        to->decrypted = decrypted;
#endif
}

#ifdef NET_SKBUFF_DATA_USES_OFFSET
/* In this configuration skb->end holds an offset relative to skb->head. */

/* Address of the end of the buffer: head plus the stored offset. */
static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
{
        unsigned char *base = skb->head;

        return base + skb->end;
}

/* Offset of the end of the buffer from skb->head (the stored value). */
static inline unsigned int skb_end_offset(const struct sk_buff *skb)
{
        return skb->end;
}

/* Set the end of the buffer to @offset bytes past skb->head. */
static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
{
        skb->end = offset;
}
#else
/* In this configuration skb->end holds a pointer. */

/* Address of the end of the buffer (the stored pointer). */
static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
{
        return skb->end;
}

/* Offset of the end of the buffer from skb->head. */
static inline unsigned int skb_end_offset(const struct sk_buff *skb)
{
        return (unsigned int)(skb->end - skb->head);
}

/* Set the end of the buffer to @offset bytes past skb->head. */
static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
{
        unsigned char *base = skb->head;

        skb->end = base + offset;
}
#endif

extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops;

struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
                                       struct ubuf_info *uarg);

void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);

int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
                            struct sk_buff *skb, struct iov_iter *from,
                            size_t length);

/* Datagram flavour of __zerocopy_sg_from_iter(): uses the skb's own socket
 * (skb->sk) and the message's iterator (msg->msg_iter) for @len bytes.
 */
static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
                                          struct msghdr *msg, int len)
{
        struct sock *sk = skb->sk;

        return __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
}

int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
                             struct msghdr *msg, int len,
                             struct ubuf_info *uarg);

/* Internal: skb_shinfo(SKB) views the address returned by
 * skb_end_pointer(SKB) as a struct skb_shared_info pointer.
 */
#define skb_shinfo(SKB)        ((struct skb_shared_info *)(skb_end_pointer(SKB)))

/* Return the address of the hwtstamps member of the skb's shared info. */
static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb)
{
        return &skb_shinfo(skb)->hwtstamps;
}

/* Return skb_uarg(skb) when @skb is non-NULL and its shared info has
 * SKBFL_ZEROCOPY_ENABLE set; NULL otherwise.
 */
static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
{
        if (!skb)
                return NULL;

        if (!(skb_shinfo(skb)->flags & SKBFL_ZEROCOPY_ENABLE))
                return NULL;

        return skb_uarg(skb);
}

/* True when the skb's shared info has SKBFL_PURE_ZEROCOPY set. */
static inline bool skb_zcopy_pure(const struct sk_buff *skb)
{
        return (skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY) != 0;
}

/* True when the skb's shared info has SKBFL_MANAGED_FRAG_REFS set. */
static inline bool skb_zcopy_managed(const struct sk_buff *skb)
{
        return (skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS) != 0;
}

/* True when both buffers agree on skb_zcopy_pure(): either both are pure
 * zerocopy or neither is.
 */
static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
                                       const struct sk_buff *skb2)
{
        const bool pure1 = skb_zcopy_pure(skb1);
        const bool pure2 = skb_zcopy_pure(skb2);

        return pure1 == pure2;
}

/* Take one more reference on @uarg (increments uarg->refcnt). @uarg is
 * dereferenced unconditionally, so it must not be NULL.
 */
static inline void net_zcopy_get(struct ubuf_info *uarg)
{
        refcount_inc(&uarg->refcnt);
}

/* Attach @uarg to the skb's shared info: store it in destructor_arg and OR
 * uarg->flags into the shared-info flags. No reference is taken here.
 */
static inline void skb_zcopy_init(struct sk_buff *skb, struct ubuf_info *uarg)
{
        skb_shinfo(skb)->destructor_arg = uarg;
        skb_shinfo(skb)->flags |= uarg->flags;
}

/* Attach @uarg to @skb unless either is NULL or skb_zcopy() already returns
 * a uarg for the skb.
 *
 * When @have_ref is non-NULL and *@have_ref is true, that reference is
 * consumed (*@have_ref becomes false); otherwise a new reference is taken
 * with net_zcopy_get().
 */
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
                                 bool *have_ref)
{
        if (!skb || !uarg || skb_zcopy(skb))
                return;

        if (unlikely(have_ref && *have_ref))
                *have_ref = false;
        else
                net_zcopy_get(uarg);

        skb_zcopy_init(skb, uarg);
}

/* Store @val in destructor_arg with bit 0 set as a tag, and set
 * SKBFL_ZEROCOPY_FRAG in the shared-info flags.
 */
static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val)
{
        const uintptr_t tagged = (uintptr_t) val | 0x1UL;

        skb_shinfo(skb)->destructor_arg = (void *)tagged;
        skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG;
}

/* True when destructor_arg carries the bit-0 tag written by
 * skb_zcopy_set_nouarg().
 */
static inline bool skb_zcopy_is_nouarg(struct sk_buff *skb)
{
        const uintptr_t arg = (uintptr_t) skb_shinfo(skb)->destructor_arg;

        return (arg & 0x1UL) != 0;
}

/* Return destructor_arg with the bit-0 tag cleared. */
static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb)
{
        const uintptr_t arg = (uintptr_t) skb_shinfo(skb)->destructor_arg;

        return (void *)(arg & ~0x1UL);
}

/* Invoke uarg->ops->complete() with a NULL skb and true as the last
 * argument. A NULL @uarg is ignored.
 */
static inline void net_zcopy_put(struct ubuf_info *uarg)
{
        if (!uarg)
                return;

        uarg->ops->complete(NULL, uarg, true);
}

/* Abort path for a zerocopy uarg. A NULL @uarg is ignored. A uarg whose
 * ops are msg_zerocopy_ubuf_ops goes to msg_zerocopy_put_abort(); any other
 * uarg is released with net_zcopy_put(), but only when @have_uref is true.
 */
static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
        if (!uarg)
                return;

        if (uarg->ops == &msg_zerocopy_ubuf_ops) {
                msg_zerocopy_put_abort(uarg, have_uref);
                return;
        }

        if (have_uref)
                net_zcopy_put(uarg);
}

/* Release a reference on a zerocopy structure.
 *
 * Does nothing when skb_zcopy() returns NULL. Otherwise the uarg's
 * complete() callback is invoked with @zerocopy_success, unless
 * destructor_arg carries the nouarg tag, and every SKBFL_ALL_ZEROCOPY bit
 * is then cleared from the shared-info flags.
 */
static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
{
        struct ubuf_info *uarg = skb_zcopy(skb);

        if (!uarg)
                return;

        if (!skb_zcopy_is_nouarg(skb))
                uarg->ops->complete(skb, uarg, zerocopy_success);

        skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY;
}

void __skb_zcopy_downgrade_managed(struct sk_buff *skb);

/* Call __skb_zcopy_downgrade_managed() when skb_zcopy_managed() is true
 * (hinted as the unlikely case); otherwise do nothing.
 */
static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb)
{
        const bool managed = skb_zcopy_managed(skb);

        if (unlikely(managed))
                __skb_zcopy_downgrade_managed(skb);
}

/* Mark the skb as not being on a list by clearing skb->next. skb->prev is
 * left untouched.
 */
static inline void skb_mark_not_on_list(struct sk_buff *skb)
{
        skb->next = NULL;
}

/* With CONFIG_DEBUG_NET, overwrite @skb->next with SKB_LIST_POISON_NEXT.
 * Without it the function body is empty.
 * NOTE(review): presumably meant to make a stale walk through ->next fault
 * early; confirm against the definition of SKB_LIST_POISON_NEXT.
 */
static inline void skb_poison_list(struct sk_buff *skb)
{
#ifdef CONFIG_DEBUG_NET
        skb->next = SKB_LIST_POISON_NEXT;
#endif
}

/* Iterate through singly-linked GSO fragments of an skb.
 *
 * @first:    first skb of the chain; if NULL the body never runs
 * @skb:      cursor, assigned each element in turn
 * @next_skb: scratch cursor; (skb)->next is loaded into it before the loop
 *            body runs, so the iteration does not read (skb)->next again
 *            after the body has executed
 */
#define skb_list_walk_safe(first, skb, next_skb)                               \
        for ((skb) = (first), (next_skb) = (skb) ? (skb)->next : NULL; (skb);  \
             (skb) = (next_skb), (next_skb) = (skb) ? (skb)->next : NULL)

/* Remove @skb from the list it is linked on through @skb->list, then clear
 * @skb->next via skb_mark_not_on_list().
 */
static inline void skb_list_del_init(struct sk_buff *skb)
{
        __list_del_entry(&skb->list);
        skb_mark_not_on_list(skb);
}

/**
 *        skb_queue_empty - test whether a queue holds no buffers
 *        @list: queue head
 *
 *        The queue is empty when the head's next pointer refers back to
 *        the head itself. Returns non-zero in that case, zero otherwise.
 */
static inline int skb_queue_empty(const struct sk_buff_head *list)
{
        const struct sk_buff *sentinel = (const struct sk_buff *)list;

        return list->next == sentinel;
}

/**
 *        skb_queue_empty_lockless - test whether a queue holds no buffers
 *        @list: queue head
 *
 *        Same test as skb_queue_empty(), but the head's next pointer is
 *        fetched with READ_ONCE() so the helper can be used in lockless
 *        contexts. Returns true if the queue is empty, false otherwise.
 */
static inline bool skb_queue_empty_lockless(const struct sk_buff_head *list)
{
        const struct sk_buff *first = READ_ONCE(list->next);

        return first == (const struct sk_buff *)list;
}


/**
 *        skb_queue_is_last - test whether a buffer is the tail of a queue
 *        @list: queue head
 *        @skb: buffer
 *
 *        Returns true if @skb->next points at the queue head, i.e. @skb is
 *        the last buffer on the list.
 */
static inline bool skb_queue_is_last(const struct sk_buff_head *list,
                                     const struct sk_buff *skb)
{
        const struct sk_buff *sentinel = (const struct sk_buff *)list;

        return skb->next == sentinel;
}

/**
 *        skb_queue_is_first - test whether a buffer is the head of a queue
 *        @list: queue head
 *        @skb: buffer
 *
 *        Returns true if @skb->prev points at the queue head, i.e. @skb is
 *        the first buffer on the list.
 */
static inline bool skb_queue_is_first(const struct sk_buff_head *list,
                                      const struct sk_buff *skb)
{
        const struct sk_buff *sentinel = (const struct sk_buff *)list;

        return skb->prev == sentinel;
}

/**
 *        skb_queue_next - return the buffer following @skb in the queue
 *        @list: queue head
 *        @skb: current buffer
 *
 *        Returns the buffer queued after @skb on @list. The caller must
 *        know that @skb is not the last buffer, i.e. skb_queue_is_last()
 *        is false.
 */
static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list,
                                             const struct sk_buff *skb)
{
        /* Handing back the queue head as if it were a buffer would make
         * the caller dereference garbage, hence the hard BUG_ON when
         * @skb->next is the head (same test as skb_queue_is_last()).
         */
        BUG_ON(skb->next == (const struct sk_buff *)list);

        return skb->next;
}

/**
 *        skb_queue_prev - return the buffer preceding @skb in the queue
 *        @list: queue head
 *        @skb: current buffer
 *
 *        Returns the buffer queued before @skb on @list. The caller must
 *        know that @skb is not the first buffer, i.e. skb_queue_is_first()
 *        is false.
 */
static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list,
                                             const struct sk_buff *skb)
{
        /* Handing back the queue head as if it were a buffer would make
         * the caller dereference garbage, hence the hard BUG_ON when
         * @skb->prev is the head (same test as skb_queue_is_first()).
         */
        BUG_ON(skb->prev == (const struct sk_buff *)list);

        return skb->prev;
}

/**
 *        skb_get - reference buffer
 *        @skb: buffer to reference
 *
 *        Makes another reference to a socket buffer (increments
 *        @skb->users) and returns a pointer to the same buffer.
 */
static inline struct sk_buff *skb_get(struct sk_buff *skb)
{
        refcount_inc(&skb->users);
        return skb;
}

/*
 * If users == 1, we are the only owner and can avoid redundant atomic changes.
 */

/**
 *        skb_cloned - is the buffer a clone
 *        @skb: buffer to check
 *
 *        Returns true if the buffer was generated with skb_clone() and is
 *        one of multiple shared copies of the buffer. Cloned buffers are
 *        shared data so must not be written to under normal circumstances.
 *
 *        Concretely: @skb->cloned must be set and the part of the shared
 *        info's dataref selected by SKB_DATAREF_MASK must differ from 1.
 */
static inline int skb_cloned(const struct sk_buff *skb)
{
        if (!skb->cloned)
                return 0;

        return (atomic_read(&skb_shinfo(skb)->dataref) & SKB_DATAREF_MASK) != 1;
}

/* Give @skb a private head if it is currently cloned.
 *
 * Returns 0 when @skb is not cloned; otherwise returns the result of
 * pskb_expand_head(skb, 0, 0, pri). May sleep when @pri allows blocking.
 */
static inline int skb_unclone(struct sk_buff *skb, gfp_t pri)
{
        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_cloned(skb))
                return 0;

        return pskb_expand_head(skb, 0, 0, pri);
}

/* Variant of skb_unclone() which guarantees that skb->truesize and
 * skb_end_offset() stay unchanged whenever a new skb->head is needed.
 *
 * The guarantee matters because ksize(kmalloc(X)) == ksize(kmalloc(X)) is
 * not assured when various debugging features are in place.
 *
 * Returns 0 when @skb is not cloned, else the result of
 * __skb_unclone_keeptruesize().
 */
int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri);
static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
{
        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_cloned(skb))
                return 0;

        return __skb_unclone_keeptruesize(skb, pri);
}

/**
 *        skb_header_cloned - is the header a clone
 *        @skb: buffer to check
 *
 *        Returns true if modifying the header part of the buffer requires
 *        the data to be copied.
 *
 *        The shared info's dataref is split into its low part (selected
 *        by SKB_DATAREF_MASK) and its high part (shifted down by
 *        SKB_DATAREF_SHIFT); the header counts as cloned unless
 *        low - high equals 1.
 */
static inline int skb_header_cloned(const struct sk_buff *skb)
{
        int dataref, low, high;

        if (!skb->cloned)
                return 0;

        dataref = atomic_read(&skb_shinfo(skb)->dataref);
        low = dataref & SKB_DATAREF_MASK;
        high = dataref >> SKB_DATAREF_SHIFT;

        return (low - high) != 1;
}

/* Give @skb a private head if its header is currently cloned.
 *
 * Returns 0 when skb_header_cloned() is false; otherwise returns the result
 * of pskb_expand_head(skb, 0, 0, pri). May sleep when @pri allows blocking.
 */
static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri)
{
        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_header_cloned(skb))
                return 0;

        return pskb_expand_head(skb, 0, 0, pri);
}

/**
 * __skb_header_release() - allow clones to use the headroom
 * @skb: buffer to operate on
 *
 * Marks @skb as header-less (nohdr = 1) and overwrites the shared info's
 * dataref with 1 in the low part plus 1 in the part starting at bit
 * SKB_DATAREF_SHIFT. The counter is set, not incremented.
 *
 * See "DOC: dataref and headerless skbs".
 */
static inline void __skb_header_release(struct sk_buff *skb)
{
        skb->nohdr = 1;
        atomic_set(&skb_shinfo(skb)->dataref, 1 + (1 << SKB_DATAREF_SHIFT));
}


/**
 *        skb_shared - is the buffer shared
 *        @skb: buffer to check
 *
 *        Returns non-zero if @skb->users is anything other than 1, i.e.
 *        the caller does not hold the only reference to this buffer.
 */
static inline int skb_shared(const struct sk_buff *skb)
{
        if (refcount_read(&skb->users) == 1)
                return 0;

        return 1;
}

/**
 *        skb_share_check - obtain an unshared buffer, cloning if needed
 *        @skb: buffer to check
 *        @pri: priority for memory allocation
 *
 *        An unshared @skb is returned unchanged. A shared @skb is cloned
 *        with skb_clone(); the caller's reference on the original is then
 *        dropped (consume_skb() when the clone succeeded, kfree_skb() when
 *        it failed) and the clone is returned.
 *
 *        When called from interrupt status or with spinlocks held, @pri
 *        must be GFP_ATOMIC.
 *
 *        NULL is returned on a memory allocation failure; the original
 *        reference has been released in that case as well.
 */
static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri)
{
        struct sk_buff *nskb;

        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_shared(skb))
                return skb;

        nskb = skb_clone(skb, pri);
        if (likely(nskb))
                consume_skb(skb);
        else
                kfree_skb(skb);

        return nskb;
}

/*
 *        Copy shared buffers into a new sk_buff. We effectively do COW on
 *        packets to handle cases where we have a local reader and forward
 *        and a couple of other messy ones. The normal one is tcpdumping
 *        a packet that's being forwarded.
 */

/**
 *        skb_unshare - obtain a private copy of a cloned buffer
 *        @skb: buffer to check
 *        @pri: priority for memory allocation
 *
 *        A buffer that is not a clone is returned unchanged. A cloned
 *        buffer is duplicated with skb_copy(); the caller's reference on
 *        the old buffer is then dropped (consume_skb() when the copy
 *        succeeded, kfree_skb() when it failed) and the copy is returned.
 *
 *        When called with a spinlock held or from interrupt state, @pri
 *        must be %GFP_ATOMIC.
 *
 *        %NULL is returned on a memory allocation failure; the original
 *        reference has been released in that case as well.
 */
static inline struct sk_buff *skb_unshare(struct sk_buff *skb,
                                          gfp_t pri)
{
        struct sk_buff *nskb;

        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_cloned(skb))
                return skb;

        nskb = skb_copy(skb, pri);

        /* Either way our reference on the shared buffer goes away. */
        if (likely(nskb))
                consume_skb(skb);
        else
                kfree_skb(skb);

        return nskb;
}

/**
 *        skb_peek - look at the first buffer of an &sk_buff_head
 *        @list_: list to peek at
 *
 *        The buffer stays on the list, so someone else may run off with
 *        it. You _MUST_ hold the appropriate locks or own a private queue
 *        when using this.
 *
 *        Returns %NULL for an empty list or a pointer to the head element.
 *        No reference is taken, so the returned pointer is volatile. Use
 *        with caution.
 */
static inline struct sk_buff *skb_peek(const struct sk_buff_head *list_)
{
        struct sk_buff *first = list_->next;

        return first == (struct sk_buff *)list_ ? NULL : first;
}

/**
 *        __skb_peek - peek at the head of a non-empty &sk_buff_head
 *        @list_: list to peek at
 *
 *        Like skb_peek(), but the caller knows that the list is not empty.
 *        No emptiness check is made: on an empty list the value returned
 *        is whatever @list_->next holds.
 */
static inline struct sk_buff *__skb_peek(const struct sk_buff_head *list_)
{
        return list_->next;
}

/**
 *        skb_peek_next - look at the buffer queued after a given one
 *        @skb: skb to start from
 *        @list_: list to peek at
 *
 *        Returns %NULL when @skb is the last element (its next pointer is
 *        the list head), otherwise a pointer to the next element. No
 *        reference is taken, so the returned pointer is volatile. Use with
 *        caution.
 */
static inline struct sk_buff *skb_peek_next(struct sk_buff *skb,
                const struct sk_buff_head *list_)
{
        struct sk_buff *following = skb->next;

        return following == (struct sk_buff *)list_ ? NULL : following;
}

/**
 *        skb_peek_tail - look at the last buffer of an &sk_buff_head
 *        @list_: list to peek at
 *
 *        The buffer stays on the list, so someone else may run off with
 *        it. You _MUST_ hold the appropriate locks or own a private queue
 *        when using this.
 *
 *        Returns %NULL for an empty list or a pointer to the tail element.
 *        No reference is taken, so the returned pointer is volatile. Use
 *        with caution.
 */
static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_)
{
        /* Single READ_ONCE() load; the value is then compared and returned. */
        struct sk_buff *last = READ_ONCE(list_->prev);

        return last == (struct sk_buff *)list_ ? NULL : last;
}

/**
 *        skb_queue_len        - get queue length
 *        @list_: list to measure
 *
 *        Return the length of an &sk_buff queue, as recorded in the
 *        head's qlen field (the list is not walked).
 */
static inline __u32 skb_queue_len(const struct sk_buff_head *list_)
{
        return list_->qlen;
}

/**
 *        skb_queue_len_lockless        - get queue length
 *        @list_: list to measure
 *
 *        Return the length of an &sk_buff queue.
 *        This variant can be used in lockless contexts: qlen is loaded
 *        with READ_ONCE(), pairing with the WRITE_ONCE() updates of qlen
 *        in __skb_insert() and __skb_unlink().
 */
static inline __u32 skb_queue_len_lockless(const struct sk_buff_head *list_)
{
        return READ_ONCE(list_->qlen);
}

/**
 *        __skb_queue_head_init - initialize non-spinlock portions of sk_buff_head
 *        @list: queue to initialize
 *
 *        Only the list linkage and the queue length of @list are set up:
 *        next and prev point back at the head and qlen becomes 0. The
 *        spinlock is left alone, so this can be used to reset a queue
 *        without reinitializing its lock, or for on-stack sk_buff_head
 *        objects whose spinlock is known to be unused.
 */
static inline void __skb_queue_head_init(struct sk_buff_head *list)
{
        struct sk_buff *self = (struct sk_buff *)list;

        list->next = self;
        list->prev = self;
        list->qlen = 0;
}

/*
 * Fully initialise @list: its spinlock first, then the list linkage and
 * queue length via __skb_queue_head_init().
 *
 * This function creates a split out lock class for each invocation;
 * this is needed for now since a whole lot of users of the skb-queue
 * infrastructure in drivers have different locking usage (in hardirq)
 * than the networking core (in softirq only). In the long run either the
 * network layer or drivers should need annotation to consolidate the
 * main types of usage into 3 classes.
 */
static inline void skb_queue_head_init(struct sk_buff_head *list)
{
        spin_lock_init(&list->lock);
        __skb_queue_head_init(list);
}

/* Initialise @list with skb_queue_head_init(), then assign the lockdep
 * class @class to its spinlock.
 */
static inline void skb_queue_head_init_class(struct sk_buff_head *list,
                struct lock_class_key *class)
{
        skb_queue_head_init(list);
        lockdep_set_class(&list->lock, class);
}

/*
 *        Insert an sk_buff on a list.
 *
 *        The "__skb_xxxx()" functions are the non-atomic ones that
 *        can only be called with interrupts disabled.
 *
 *        @newsk is linked between @prev and @next, and @list->qlen is
 *        incremented. @prev and @next may each be the queue head itself
 *        (cast to struct sk_buff *); their links are written through
 *        struct sk_buff_list.
 */
static inline void __skb_insert(struct sk_buff *newsk,
                                struct sk_buff *prev, struct sk_buff *next,
                                struct sk_buff_head *list)
{
        /* See skb_queue_empty_lockless() and skb_peek_tail()
         * for the opposite READ_ONCE()
         */
        WRITE_ONCE(newsk->next, next);
        WRITE_ONCE(newsk->prev, prev);
        WRITE_ONCE(((struct sk_buff_list *)next)->prev, newsk);
        WRITE_ONCE(((struct sk_buff_list *)prev)->next, newsk);
        WRITE_ONCE(list->qlen, list->qlen + 1);
}

/* Link every buffer of @list between @prev and @next.
 *
 * Only the four boundary pointers are rewritten. The helper adjusts no qlen
 * and does not reinitialise @list; the skb_queue_splice*() wrappers do that.
 * Those wrappers also check skb_queue_empty(@list) before calling, because
 * no emptiness check is made here.
 */
static inline void __skb_queue_splice(const struct sk_buff_head *list,
                                      struct sk_buff *prev,
                                      struct sk_buff *next)
{
        struct sk_buff *first = list->next;
        struct sk_buff *last = list->prev;

        /* Hook the first spliced buffer up behind @prev ... */
        WRITE_ONCE(first->prev, prev);
        WRITE_ONCE(prev->next, first);

        /* ... and the last spliced buffer in front of @next. */
        WRITE_ONCE(last->next, next);
        WRITE_ONCE(next->prev, last);
}

/**
 *        skb_queue_splice - join two skb lists, this is designed for stacks
 *        @list: the new list to add
 *        @head: the place to add it in the first list
 *
 *        The buffers of @list are inserted at the front of @head and
 *        @head->qlen grows by @list->qlen. @list itself is not
 *        reinitialised. An empty @list is a no-op.
 */
static inline void skb_queue_splice(const struct sk_buff_head *list,
                                    struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, (struct sk_buff *) head, head->next);
        head->qlen += list->qlen;
}

/**
 *        skb_queue_splice_init - join two skb lists and reinitialise the emptied list
 *        @list: the new list to add
 *        @head: the place to add it in the first list
 *
 *        The buffers of @list are inserted at the front of @head,
 *        @head->qlen grows by @list->qlen, and @list is then reset with
 *        __skb_queue_head_init(). An empty @list is a no-op.
 */
static inline void skb_queue_splice_init(struct sk_buff_head *list,
                                         struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, (struct sk_buff *) head, head->next);
        head->qlen += list->qlen;
        __skb_queue_head_init(list);
}

/**
 *        skb_queue_splice_tail - join two skb lists, each list being a queue
 *        @list: the new list to add
 *        @head: the place to add it in the first list
 *
 *        The buffers of @list are appended at the tail of @head and
 *        @head->qlen grows by @list->qlen. @list itself is not
 *        reinitialised. An empty @list is a no-op.
 */
static inline void skb_queue_splice_tail(const struct sk_buff_head *list,
                                         struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, head->prev, (struct sk_buff *) head);
        head->qlen += list->qlen;
}

/**
 *        skb_queue_splice_tail_init - join two skb lists and reinitialise the emptied list
 *        @list: the new list to add
 *        @head: the place to add it in the first list
 *
 *        Each of the lists is a queue. The buffers of @list are appended
 *        at the tail of @head, @head->qlen grows by @list->qlen, and
 *        @list is then reset with __skb_queue_head_init(). An empty @list
 *        is a no-op.
 */
static inline void skb_queue_splice_tail_init(struct sk_buff_head *list,
                                              struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, head->prev, (struct sk_buff *) head);
        head->qlen += list->qlen;
        __skb_queue_head_init(list);
}

/**
 *        __skb_queue_after - queue a buffer after a given buffer
 *        @list: list to use
 *        @prev: place after this buffer
 *        @newsk: buffer to queue
 *
 *        Queue a buffer in the middle of a list, directly behind @prev.
 *        @prev may also be the list head cast to a buffer pointer, as
 *        __skb_queue_head() does. This function takes no locks and you
 *        must therefore hold required locks before calling it.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
static inline void __skb_queue_after(struct sk_buff_head *list,
                                     struct sk_buff *prev,
                                     struct sk_buff *newsk)
{
        struct sk_buff *next = ((struct sk_buff_list *)prev)->next;

        __skb_insert(newsk, prev, next, list);
}

void skb_append(struct sk_buff *old, struct sk_buff *newsk,
                struct sk_buff_head *list);

/* Queue @newsk on @list directly in front of @next.
 *
 * @next may also be the list head cast to a buffer pointer, as
 * __skb_queue_tail() does. No locks are taken here.
 */
static inline void __skb_queue_before(struct sk_buff_head *list,
                                      struct sk_buff *next,
                                      struct sk_buff *newsk)
{
        struct sk_buff *prev = ((struct sk_buff_list *)next)->prev;

        __skb_insert(newsk, prev, next, list);
}

/**
 *        __skb_queue_head - queue a buffer at the list head
 *        @list: list to use
 *        @newsk: buffer to queue
 *
 *        Queue a buffer at the start of a list, by inserting it right
 *        after the list head. This function takes no locks and you must
 *        therefore hold required locks before calling it.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
static inline void __skb_queue_head(struct sk_buff_head *list,
                                    struct sk_buff *newsk)
{
        struct sk_buff *head = (struct sk_buff *)list;

        __skb_queue_after(list, head, newsk);
}
void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk);

/**
 *        __skb_queue_tail - queue a buffer at the list tail
 *        @list: list to use
 *        @newsk: buffer to queue
 *
 *        Queue a buffer at the end of a list, by inserting it right
 *        before the list head. This function takes no locks and you must
 *        therefore hold required locks before calling it.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
static inline void __skb_queue_tail(struct sk_buff_head *list,
                                   struct sk_buff *newsk)
{
        struct sk_buff *head = (struct sk_buff *)list;

        __skb_queue_before(list, head, newsk);
}
void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk);

/*
 * Remove an sk_buff from a list. _Must_ be called atomically, and with
 * the list known.
 *
 * __skb_unlink() decrements @list->qlen, clears @skb's own next/prev
 * pointers and links its former neighbours to each other. It takes no
 * locks.
 */
void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list);
static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
        struct sk_buff *next, *prev;

        WRITE_ONCE(list->qlen, list->qlen - 1);
        /* Save both neighbours before @skb's links are cleared. */
        next           = skb->next;
        prev           = skb->prev;
        skb->next  = skb->prev = NULL;
        WRITE_ONCE(next->prev, prev);
        WRITE_ONCE(prev->next, next);
}

/**
 *        __skb_dequeue - remove from the head of the queue
 *        @list: list to dequeue from
 *
 *        Unlinks and returns the first buffer of @list, or returns %NULL
 *        if the list is empty. No locks are taken, so this must only be
 *        used with appropriate locks held.
 */
static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
{
        struct sk_buff *first = skb_peek(list);

        if (!first)
                return NULL;

        __skb_unlink(first, list);
        return first;
}
struct sk_buff *skb_dequeue(struct sk_buff_head *list);

/**
 *        __skb_dequeue_tail - remove from the tail of the queue
 *        @list: list to dequeue from
 *
 *        Unlinks and returns the last buffer of @list, or returns %NULL
 *        if the list is empty. No locks are taken, so this must only be
 *        used with appropriate locks held.
 */
static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list)
{
        struct sk_buff *last = skb_peek_tail(list);

        if (!last)
                return NULL;

        __skb_unlink(last, list);
        return last;
}
struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list);


/* True when @skb->data_len is non-zero. */
static inline bool skb_is_nonlinear(const struct sk_buff *skb)
{
        return skb->data_len != 0;
}

/* Return @skb->len minus @skb->data_len. */
static inline unsigned int skb_headlen(const struct sk_buff *skb)
{
        return skb->len - skb->data_len;
}

/* Sum skb_frag_size() over all nr_frags entries of @skb's frags[] array. */
static inline unsigned int __skb_pagelen(const struct sk_buff *skb)
{
        unsigned int nr = skb_shinfo(skb)->nr_frags;
        unsigned int i, len = 0;

        for (i = 0; i < nr; i++)
                len += skb_frag_size(&skb_shinfo(skb)->frags[i]);

        return len;
}

/* Return skb_headlen() plus the total size of all page fragments
 * (__skb_pagelen()).
 */
static inline unsigned int skb_pagelen(const struct sk_buff *skb)
{
        return skb_headlen(skb) + __skb_pagelen(skb);
}

/* Fill in a fragment descriptor: store @netmem and @off in @frag and set its
 * size to @size. Only the descriptor is written.
 */
static inline void skb_frag_fill_netmem_desc(skb_frag_t *frag,
                                             netmem_ref netmem, int off,
                                             int size)
{
        frag->netmem = netmem;
        frag->offset = off;
        skb_frag_size_set(frag, size);
}

/* struct page flavour of skb_frag_fill_netmem_desc(): @page is converted
 * with page_to_netmem() and the call is forwarded unchanged otherwise.
 */
static inline void skb_frag_fill_page_desc(skb_frag_t *frag,
                                           struct page *page,
                                           int off, int size)
{
        skb_frag_fill_netmem_desc(frag, page_to_netmem(page), off, size);
}

/* Fill fragment slot @i of @shinfo with (@netmem, @off, @size).
 *
 * Only the frags[@i] descriptor is written; nr_frags is left untouched.
 */
static inline void __skb_fill_netmem_desc_noacc(struct skb_shared_info *shinfo,
                                                int i, netmem_ref netmem,
                                                int off, int size)
{
        skb_frag_fill_netmem_desc(&shinfo->frags[i], netmem, off, size);
}

/* struct page flavour of __skb_fill_netmem_desc_noacc(): @page is converted
 * with page_to_netmem() and the call is forwarded unchanged otherwise.
 */
static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo,
                                              int i, struct page *page,
                                              int off, int size)
{
        __skb_fill_netmem_desc_noacc(shinfo, i, page_to_netmem(page), off,
                                     size);
}

/**
 * skb_len_add - adds a number to len fields of skb
 * @skb: buffer to add len to
 * @delta: number of bytes to add
 *
 * The same @delta is added to all three of @skb->len, @skb->data_len and
 * @skb->truesize.
 */
static inline void skb_len_add(struct sk_buff *skb, int delta)
{
        skb->len += delta;
        skb->data_len += delta;
        skb->truesize += delta;
}

/**
 * __skb_fill_netmem_desc - initialise a fragment in an skb
 * @skb: buffer containing fragment to be initialised
 * @i: fragment index to initialise
 * @netmem: the netmem to use for this fragment
 * @off: the offset to the data within @netmem
 * @size: the length of the data
 *
 * Initialises the @i'th fragment of @skb to point to @size bytes at
 * offset @off within @netmem. nr_frags is not updated.
 *
 * Additionally sets @skb->pfmemalloc when the compound head of the page
 * behind @netmem is a pfmemalloc page.
 *
 * Does not take any additional reference on the fragment.
 */
static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i,
                                          netmem_ref netmem, int off, int size)
{
        struct page *page = netmem_to_page(netmem);

        __skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size);

        /* Propagate page pfmemalloc to the skb if we can. The problem is
         * that not all callers have unique ownership of the page but rely
         * on page_is_pfmemalloc doing the right thing(tm).
         */
        page = compound_head(page);
        if (page_is_pfmemalloc(page))
                skb->pfmemalloc = true;
}

/* struct page flavour of __skb_fill_netmem_desc(): @page is converted with
 * page_to_netmem() and the call is forwarded unchanged otherwise.
 */
static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
                                        struct page *page, int off, int size)
{
        __skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size);
}

/* As __skb_fill_netmem_desc(), and additionally sets nr_frags to @i + 1 so
 * that fragment @i becomes the last fragment of @skb.
 */
static inline void skb_fill_netmem_desc(struct sk_buff *skb, int i,
                                        netmem_ref netmem, int off, int size)
{
        __skb_fill_netmem_desc(skb, i, netmem, off, size);
        skb_shinfo(skb)->nr_frags = i + 1;
}

/**
 * skb_fill_page_desc - initialise a paged fragment in an skb
 * @skb: buffer containing fragment to be initialised
 * @i: paged fragment index to initialise
 * @page: the page to use for this fragment
 * @off: the offset to the data within @page
 * @size: the length of the data
 *
 * As per __skb_fill_page_desc() -- initialises the @i'th fragment of
 * @skb to point to @size bytes at offset @off within @page. In
 * addition updates @skb such that @i is the last fragment.
 *
 * Implemented by converting @page with page_to_netmem() and calling
 * skb_fill_netmem_desc().
 *
 * Does not take any additional reference on the fragment.
 */
static inline void skb_fill_page_desc(struct sk_buff *skb, int i,
                                      struct page *page, int off, int size)
{
        skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size);
}

/**
 * skb_fill_page_desc_noacc - initialise a paged fragment in an skb
 * @skb: buffer containing fragment to be initialised
 * @i: paged fragment index to initialise
 * @page: the page to use for this fragment
 * @off: the offset to the data with @page
 * @size: the length of the data
 *
 * Variant of skb_fill_page_desc() which does not deal with
 * pfmemalloc, if page is not owned by us.
 */
static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i,
                                            struct page *page, int off,
                                            int size)
{
        struct skb_shared_info *shinfo = skb_shinfo(skb);

        __skb_fill_page_desc_noacc(shinfo, i, page, off, size);
        shinfo->nr_frags = i + 1;
}

void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
                            int off, int size, unsigned int truesize);

/* struct page flavour of skb_add_rx_frag_netmem(): @page is converted with
 * page_to_netmem() and all other arguments are forwarded unchanged.
 */
static inline void skb_add_rx_frag(struct sk_buff *skb, int i,
                                   struct page *page, int off, int size,
                                   unsigned int truesize)
{
        skb_add_rx_frag_netmem(skb, i, page_to_netmem(page), off, size,
                               truesize);
}

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
                          unsigned int truesize);

/* BUG() if @skb is non-linear, i.e. skb_is_nonlinear() is true. */
#define SKB_LINEAR_ASSERT(skb)  BUG_ON(skb_is_nonlinear(skb))

/* Tail pointer accessors.
 *
 * With NET_SKBUFF_DATA_USES_OFFSET, skb->tail is kept as an offset from
 * skb->head; otherwise it is used directly as a pointer. Both variants
 * expose the same three helpers.
 */
#ifdef NET_SKBUFF_DATA_USES_OFFSET
/* Return the tail as a pointer (head + stored offset). */
static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb)
{
        return skb->head + skb->tail;
}

/* Make the tail coincide with skb->data. */
static inline void skb_reset_tail_pointer(struct sk_buff *skb)
{
        skb->tail = skb->data - skb->head;
}

/* Place the tail @offset bytes past skb->data. */
static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
{
        skb_reset_tail_pointer(skb);
        skb->tail += offset;
}

#else /* NET_SKBUFF_DATA_USES_OFFSET */
/* Return the tail pointer as stored. */
static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb)
{
        return skb->tail;
}

/* Make the tail coincide with skb->data. */
static inline void skb_reset_tail_pointer(struct sk_buff *skb)
{
        skb->tail = skb->data;
}

/* Place the tail @offset bytes past skb->data. */
static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
{
        skb->tail = skb->data + offset;
}

#endif /* NET_SKBUFF_DATA_USES_OFFSET */

/* Debug check that @skb is not empty.
 *
 * With CONFIG_DEBUG_NET: if @skb->len is zero, warn once (message is the
 * function name) and dump the skb through DO_ONCE_LITE(). Without
 * CONFIG_DEBUG_NET the function body is empty.
 */
static inline void skb_assert_len(struct sk_buff *skb)
{
#ifdef CONFIG_DEBUG_NET
        if (WARN_ONCE(!skb->len, "%s\n", __func__))
                DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
#endif /* CONFIG_DEBUG_NET */
}

/*
 *        Add data to an sk_buff
 */
void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len);
void *skb_put(struct sk_buff *skb, unsigned int len);
/* Extend the data area of a linear @skb by @len bytes at the tail.
 *
 * Both skb->tail and skb->len grow by @len. No check against the end of the
 * buffer is made here. BUGs if @skb is non-linear.
 *
 * Returns a pointer to the first newly added byte (the previous tail).
 */
static inline void *__skb_put(struct sk_buff *skb, unsigned int len)
{
        void *old_tail;

        SKB_LINEAR_ASSERT(skb);

        old_tail = skb_tail_pointer(skb);
        skb->len  += len;
        skb->tail += len;

        return old_tail;
}

/* __skb_put() @len bytes and zero them; returns the start of the new area. */
static inline void *__skb_put_zero(struct sk_buff *skb, unsigned int len)
{
        /* memset() hands back its destination, i.e. the old tail. */
        return memset(__skb_put(skb, len), 0, len);
}

/* __skb_put() @len bytes and fill them with a copy of @data; returns the
 * start of the new area.
 */
static inline void *__skb_put_data(struct sk_buff *skb, const void *data,
                                   unsigned int len)
{
        /* memcpy() hands back its destination, i.e. the old tail. */
        return memcpy(__skb_put(skb, len), data, len);
}

/* __skb_put() one byte and store @val in it. */
static inline void __skb_put_u8(struct sk_buff *skb, u8 val)
{
        u8 *dst = __skb_put(skb, 1);

        *dst = val;
}

/* skb_put() @len bytes and zero them; returns the start of the new area. */
static inline void *skb_put_zero(struct sk_buff *skb, unsigned int len)
{
        /* memset() hands back its destination, i.e. what skb_put() returned. */
        return memset(skb_put(skb, len), 0, len);
}

/* skb_put() @len bytes and fill them with a copy of @data; returns the start
 * of the new area.
 */
static inline void *skb_put_data(struct sk_buff *skb, const void *data,
                                 unsigned int len)
{
        /* memcpy() hands back its destination, i.e. what skb_put() returned. */
        return memcpy(skb_put(skb, len), data, len);
}

/* skb_put() one byte and store @val in it. */
static inline void skb_put_u8(struct sk_buff *skb, u8 val)
{
        u8 *dst = skb_put(skb, 1);

        *dst = val;
}

void *skb_push(struct sk_buff *skb, unsigned int len);
/* Prepend @len bytes to @skb: skb->data moves back by @len and skb->len
 * grows by @len. No check against the start of the buffer is made here.
 *
 * Returns the new skb->data.
 */
static inline void *__skb_push(struct sk_buff *skb, unsigned int len)
{
        DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);

        skb->len += len;
        return skb->data -= len;
}

void *skb_pull(struct sk_buff *skb, unsigned int len);
/* Remove @len bytes from the start of @skb: skb->len shrinks by @len and
 * skb->data advances by @len.
 *
 * If the new skb->len would be smaller than skb->data_len, the kernel
 * BUG()s. With CONFIG_DEBUG_NET, skb->len is first restored and the skb is
 * dumped.
 *
 * Returns the new skb->data.
 */
static inline void *__skb_pull(struct sk_buff *skb, unsigned int len)
{
        DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);

        skb->len -= len;
        if (unlikely(skb->len < skb->data_len)) {
#if defined(CONFIG_DEBUG_NET)
                /* Undo the subtraction so the dump shows the original len. */
                skb->len += len;
                pr_err("__skb_pull(len=%u)\n", len);
                skb_dump(KERN_ERR, skb, false);
#endif
                BUG();
        }
        return skb->data += len;
}

/* Checked pull: returns NULL when @len exceeds skb->len, otherwise the result
 * of __skb_pull(skb, len).
 */
static inline void *skb_pull_inline(struct sk_buff *skb, unsigned int len)
{
        if (unlikely(len > skb->len))
                return NULL;

        return __skb_pull(skb, len);
}

void *skb_pull_data(struct sk_buff *skb, size_t len);

void *__pskb_pull_tail(struct sk_buff *skb, int delta);

/* Ensure at least @len bytes are available in the linear part of @skb.
 *
 * Returns SKB_NOT_DROPPED_YET when the head already holds @len bytes or
 * __pskb_pull_tail() managed to pull in the missing ones,
 * SKB_DROP_REASON_PKT_TOO_SMALL when @len exceeds skb->len, and
 * SKB_DROP_REASON_NOMEM when __pskb_pull_tail() failed.
 */
static inline enum skb_drop_reason
pskb_may_pull_reason(struct sk_buff *skb, unsigned int len)
{
        unsigned int headlen = skb_headlen(skb);

        DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);

        if (likely(len <= headlen))
                return SKB_NOT_DROPPED_YET;

        if (unlikely(len > skb->len))
                return SKB_DROP_REASON_PKT_TOO_SMALL;

        /* Only the shortfall beyond the current head is requested. */
        if (unlikely(!__pskb_pull_tail(skb, len - headlen)))
                return SKB_DROP_REASON_NOMEM;

        return SKB_NOT_DROPPED_YET;
}

/* Boolean wrapper around pskb_may_pull_reason(): true exactly when that
 * helper returns SKB_NOT_DROPPED_YET.
 */
static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len)
{
        return pskb_may_pull_reason(skb, len) == SKB_NOT_DROPPED_YET;
}

/* Pull @len bytes off the front of @skb after making sure, through
 * pskb_may_pull(), that they are available in the linear part.
 *
 * Returns the new skb->data, or NULL when pskb_may_pull() fails (skb->len
 * and skb->data are then left as they were by this function).
 */
static inline void *pskb_pull(struct sk_buff *skb, unsigned int len)
{
        if (pskb_may_pull(skb, len)) {
                skb->len -= len;
                return skb->data += len;
        }

        return NULL;
}

void skb_condense(struct sk_buff *skb);

/**
 *        skb_headroom - bytes at buffer head
 *        @skb: buffer to check
 *
 *        Return the number of bytes of free space at the head of an &sk_buff,
 *        computed as the distance from @skb->head to @skb->data.
 */
static inline unsigned int skb_headroom(const struct sk_buff *skb)
{
        return skb->data - skb->head;
}

/**
 *        skb_tailroom - bytes at buffer end
 *        @skb: buffer to check
 *
 *        Return the number of bytes of free space at the tail of an sk_buff,
 *        i.e. the distance from tail to end. A non-linear buffer always
 *        reports 0.
 */
static inline int skb_tailroom(const struct sk_buff *skb)
{
        if (skb_is_nonlinear(skb))
                return 0;

        return skb->end - skb->tail;
}

/**
 *        skb_availroom - bytes at buffer end
 *        @skb: buffer to check
 *
 *        Return the number of bytes of free space at the tail of an sk_buff
 *        allocated by sk_stream_alloc(): the distance from tail to end
 *        minus @skb->reserved_tailroom. A non-linear buffer always
 *        reports 0.
 */
static inline int skb_availroom(const struct sk_buff *skb)
{
        return skb_is_nonlinear(skb) ?
               0 : skb->end - skb->tail - skb->reserved_tailroom;
}

/**
 *        skb_reserve - adjust headroom
 *        @skb: buffer to alter
 *        @len: bytes to move
 *
 *        Increase the headroom of an empty &sk_buff by reducing the tail
 *        room. This is only allowed for an empty buffer.
 *
 *        Both @skb->data and @skb->tail are advanced by @len; no check
 *        that the buffer is empty is performed here.
 */
static inline void skb_reserve(struct sk_buff *skb, int len)
{
        skb->data += len;
        skb->tail += len;
}

/**
 *        skb_tailroom_reserve - adjust reserved_tailroom
 *        @skb: buffer to alter
 *        @mtu: maximum amount of headlen permitted
 *        @needed_tailroom: minimum amount of reserved_tailroom
 *
 *        Choose reserved_tailroom such that the head can grow as far as
 *        possible, but never beyond @mtu, while at least @needed_tailroom
 *        bytes of tailroom stay reserved.
 *        Any required headroom must have been reserved before this
 *        function is called. BUGs if @skb is non-linear.
 */
static inline void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu,
                                        unsigned int needed_tailroom)
{
        int tailroom;

        SKB_LINEAR_ASSERT(skb);

        tailroom = skb_tailroom(skb);
        if (mtu < tailroom - needed_tailroom) {
                /* @mtu is the limit: reserve everything beyond it */
                skb->reserved_tailroom = tailroom - mtu;
        } else {
                /* space is the limit: reserve only what is required */
                skb->reserved_tailroom = needed_tailroom;
        }
}

/* Values stored in skb->inner_protocol_type: they record whether the inner
 * protocol was set by skb_set_inner_protocol() (ENCAP_TYPE_ETHER) or by
 * skb_set_inner_ipproto() (ENCAP_TYPE_IPPROTO).
 */
#define ENCAP_TYPE_ETHER        0
#define ENCAP_TYPE_IPPROTO        1

/* Record @protocol as the inner protocol of @skb and tag the stored value
 * as ENCAP_TYPE_ETHER.
 */
static inline void skb_set_inner_protocol(struct sk_buff *skb,
                                          __be16 protocol)
{
        skb->inner_protocol_type = ENCAP_TYPE_ETHER;
        skb->inner_protocol = protocol;
}

/* Record @ipproto as the inner IP protocol of @skb and tag the stored value
 * as ENCAP_TYPE_IPPROTO.
 */
static inline void skb_set_inner_ipproto(struct sk_buff *skb,
                                         __u8 ipproto)
{
        skb->inner_protocol_type = ENCAP_TYPE_IPPROTO;
        skb->inner_ipproto = ipproto;
}

/* Copy the current (outer) mac, network and transport header offsets into
 * the corresponding inner_* header fields of @skb.
 */
static inline void skb_reset_inner_headers(struct sk_buff *skb)
{
        skb->inner_transport_header = skb->transport_header;
        skb->inner_network_header = skb->network_header;
        skb->inner_mac_header = skb->mac_header;
}

/* Set skb->mac_len to the distance between the mac header offset and the
 * network header offset currently recorded in @skb.
 */
static inline void skb_reset_mac_len(struct sk_buff *skb)
{
        const unsigned int len = skb->network_header - skb->mac_header;

        skb->mac_len = len;
}

/* Return a pointer to the inner transport header, which is stored as an
 * offset from skb->head.
 */
static inline unsigned char *skb_inner_transport_header(const struct sk_buff
                                                        *skb)
{
        return &skb->head[skb->inner_transport_header];
}

/* Return the position of the inner transport header relative to skb->data. */
static inline int skb_inner_transport_offset(const struct sk_buff *skb)
{
        const unsigned char *hdr = skb_inner_transport_header(skb);

        return hdr - skb->data;
}

static inline void skb_reset_inner_transport_header(struct sk_buff *skb)
{
        skb->inner_transport_header = skb->data - skb->head;
}

/* Point the inner transport header at skb->data + @offset: the header is
 * first reset to the current skb->data position and then adjusted by the
 * signed @offset.
 */
static inline void skb_set_inner_transport_header(struct sk_buff *skb,
                                                   const int offset)
{
        skb_reset_inner_transport_header(skb);
        skb->inner_transport_header += offset;
}

/* Return a pointer to the inner network header, which is stored as an
 * offset from skb->head.
 */
static inline unsigned char *skb_inner_network_header(const struct sk_buff *skb)
{
        return &skb->head[skb->inner_network_header];
}

static inline void skb_reset_inner_network_header(struct sk_buff *skb)
{
        skb->inner_network_header = skb->data - skb->head;
}

/* Point the inner network header at skb->data + @offset: the header is
 * first reset to the current skb->data position and then adjusted by the
 * signed @offset.
 */
static inline void skb_set_inner_network_header(struct sk_buff *skb,
                                                const int offset)
{
        skb_reset_inner_network_header(skb);
        skb->inner_network_header += offset;
}

static inline bool skb_inner_network_header_was_set(const struct sk_buff *skb)
{
        return skb->inner_network_header > 0;
}

/* Return a pointer to the inner mac header, which is stored as an offset
 * from skb->head.
 */
static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb)
{
        return &skb->head[skb->inner_mac_header];
}

static inline void skb_reset_inner_mac_header(struct sk_buff *skb)
{
        skb->inner_mac_header = skb->data - skb->head;
}

/* Point the inner mac header at skb->data + @offset: the header is first
 * reset to the current skb->data position and then adjusted by the signed
 * @offset.
 */
static inline void skb_set_inner_mac_header(struct sk_buff *skb,
                                            const int offset)
{
        skb_reset_inner_mac_header(skb);
        skb->inner_mac_header += offset;
}
static inline bool skb_transport_header_was_set(const struct sk_buff *skb)
{
        return skb->transport_header != (typeof(skb->transport_header))~0U;
}

/* Return a pointer to the transport header (stored as an offset from
 * skb->head). DEBUG_NET_WARN_ON_ONCE() flags callers that ask for it
 * while it is still unset.
 */
static inline unsigned char *skb_transport_header(const struct sk_buff *skb)
{
        DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb));

        return &skb->head[skb->transport_header];
}

static inline void skb_reset_transport_header(struct sk_buff *skb)
{
        skb->transport_header = skb->data - skb->head;
}

/* Point the transport header at skb->data + @offset: the header is first
 * reset to the current skb->data position and then adjusted by the signed
 * @offset.
 */
static inline void skb_set_transport_header(struct sk_buff *skb,
                                            const int offset)
{
        skb_reset_transport_header(skb);
        skb->transport_header += offset;
}

/* Return a pointer to the network header, which is stored as an offset
 * from skb->head.
 */
static inline unsigned char *skb_network_header(const struct sk_buff *skb)
{
        return &skb->head[skb->network_header];
}

static inline void skb_reset_network_header(struct sk_buff *skb)
{
        skb->network_header = skb->data - skb->head;
}

/* Point the network header at skb->data + @offset: the header is first
 * reset to the current skb->data position and then adjusted by the signed
 * @offset.
 */
static inline void skb_set_network_header(struct sk_buff *skb, const int offset)
{
        skb_reset_network_header(skb);
        skb->network_header += offset;
}

static inline int skb_mac_header_was_set(const struct sk_buff *skb)
{
        return skb->mac_header != (typeof(skb->mac_header))~0U;
}

/* Return a pointer to the mac header (stored as an offset from skb->head).
 * DEBUG_NET_WARN_ON_ONCE() flags callers that ask for it while it is
 * still unset.
 */
static inline unsigned char *skb_mac_header(const struct sk_buff *skb)
{
        DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb));

        return &skb->head[skb->mac_header];
}

/* Return the position of the mac header relative to skb->data. */
static inline int skb_mac_offset(const struct sk_buff *skb)
{
        const unsigned char *mac = skb_mac_header(skb);

        return mac - skb->data;
}

/* Return the distance from the mac header offset to the network header
 * offset. DEBUG_NET_WARN_ON_ONCE() flags calls made while the mac header
 * is unset.
 */
static inline u32 skb_mac_header_len(const struct sk_buff *skb)
{
        u32 len;

        DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb));
        len = skb->network_header - skb->mac_header;

        return len;
}

static inline void skb_unset_mac_header(struct sk_buff *skb)
{
        skb->mac_header = (typeof(skb->mac_header))~0U;
}

static inline void skb_reset_mac_header(struct sk_buff *skb)
{
        skb->mac_header = skb->data - skb->head;
}

/* Point the mac header at skb->data + @offset: the header is first reset to
 * the current skb->data position and then adjusted by the signed @offset
 * (skb_mac_header_rebuild() passes a negative value, for example).
 */
static inline void skb_set_mac_header(struct sk_buff *skb, const int offset)
{
        skb_reset_mac_header(skb);
        skb->mac_header += offset;
}

/* Make the mac header offset equal to the network header offset, so that
 * both headers refer to the same position in the buffer.
 */
static inline void skb_pop_mac_header(struct sk_buff *skb)
{
        skb->mac_header = skb->network_header;
}

/* If the transport header is not set yet, run the basic flow dissector on
 * @skb and, when it succeeds, use the dissected transport offset
 * (keys.control.thoff) as the transport header. An already-set header is
 * left alone, as is the skb when dissection fails.
 */
static inline void skb_probe_transport_header(struct sk_buff *skb)
{
        struct flow_keys_basic keys;

        if (!skb_transport_header_was_set(skb) &&
            skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
                                             NULL, 0, 0, 0, 0))
                skb_set_transport_header(skb, keys.control.thoff);
}

/* Move the mac header so that it sits skb->mac_len bytes in front of
 * skb->data: the mac header offset is re-pointed there and mac_len bytes
 * are memmove()d from the old location. Does nothing when no mac header
 * is set.
 */
static inline void skb_mac_header_rebuild(struct sk_buff *skb)
{
        const unsigned char *old_mac;

        if (!skb_mac_header_was_set(skb))
                return;

        old_mac = skb_mac_header(skb);
        skb_set_mac_header(skb, -skb->mac_len);
        memmove(skb_mac_header(skb), old_mac, skb->mac_len);
}

/* Relocate the complete mac header so that it ends at the current
 * skb->data position (the current network header).
 * On return skb->data points skb->mac_len bytes into the mac header.
 * @full_mac_len must be the length of the whole mac header.
 * Does nothing when no mac header is set.
 */
static inline void skb_mac_header_rebuild_full(struct sk_buff *skb, u32 full_mac_len)
{
        const unsigned char *old_mac;

        if (!skb_mac_header_was_set(skb))
                return;

        old_mac = skb_mac_header(skb);
        skb_set_mac_header(skb, -full_mac_len);
        memmove(skb_mac_header(skb), old_mac, full_mac_len);
        __skb_push(skb, full_mac_len - skb->mac_len);
}

/* Return skb->csum_start expressed relative to skb->data rather than
 * skb->head, i.e. with the headroom subtracted. The result is negative
 * when the checksum start lies in front of skb->data.
 */
static inline int skb_checksum_start_offset(const struct sk_buff *skb)
{
        const unsigned int headroom = skb_headroom(skb);

        return skb->csum_start - headroom;
}

/* Return a pointer to the checksum start position, skb->csum_start bytes
 * past skb->head.
 */
static inline unsigned char *skb_checksum_start(const struct sk_buff *skb)
{
        return &skb->head[skb->csum_start];
}

/* Return the position of the transport header relative to skb->data. */
static inline int skb_transport_offset(const struct sk_buff *skb)
{
        const unsigned char *th = skb_transport_header(skb);

        return th - skb->data;
}

/* Return the distance from the network header offset to the transport
 * header offset. DEBUG_NET_WARN_ON_ONCE() flags calls made while the
 * transport header is unset.
 */
static inline u32 skb_network_header_len(const struct sk_buff *skb)
{
        u32 len;

        DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb));
        len = skb->transport_header - skb->network_header;

        return len;
}

/* Return the distance from the inner network header offset to the inner
 * transport header offset.
 */
static inline u32 skb_inner_network_header_len(const struct sk_buff *skb)
{
        u32 len = skb->inner_transport_header - skb->inner_network_header;

        return len;
}

/* Return the position of the network header relative to skb->data. */
static inline int skb_network_offset(const struct sk_buff *skb)
{
        const unsigned char *nh = skb_network_header(skb);

        return nh - skb->data;
}

/* Return the position of the inner network header relative to skb->data. */
static inline int skb_inner_network_offset(const struct sk_buff *skb)
{
        const unsigned char *inner_nh = skb_inner_network_header(skb);

        return inner_nh - skb->data;
}

/* pskb_may_pull() variant whose @len is counted from the network header:
 * the network header offset is added to @len before the check is made.
 * Returns whatever pskb_may_pull() returns.
 */
static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len)
{
        const unsigned int need = skb_network_offset(skb) + len;

        return pskb_may_pull(skb, need);
}

/*
 * CPUs often take a performance hit when accessing unaligned memory
 * locations. The actual performance hit varies: it can be small if the
 * hardware handles it, or large if we have to take an exception and fix it
 * up in software.
 *
 * Since an Ethernet header is 14 bytes, network drivers often end up with
 * the IP header at an unaligned offset. The IP header can be aligned by
 * shifting the start of the packet by 2 bytes. Drivers should do this
 * with:
 *
 * skb_reserve(skb, NET_IP_ALIGN);
 *
 * The downside to this alignment of the IP header is that the DMA is now
 * unaligned. On some architectures the cost of an unaligned DMA is high
 * and this cost outweighs the gains made by aligning the IP header.
 *
 * Since this trade-off varies between architectures, we allow NET_IP_ALIGN
 * to be overridden.
 */
#ifndef NET_IP_ALIGN
#define NET_IP_ALIGN        2
#endif

/*
 * The networking layer reserves some headroom in skb data (via
 * dev_alloc_skb). This is used to avoid having to reallocate skb data when
 * the header has to grow. In the default case, if the header has to grow
 * by 32 bytes or less, we avoid the reallocation.
 *
 * Unfortunately this headroom changes the DMA alignment of the resulting
 * network packet. As with NET_IP_ALIGN, this unaligned DMA is expensive
 * on some architectures. An architecture can override this value,
 * perhaps setting it to a cacheline in size (since that will maintain
 * cacheline alignment of the DMA). It must be a power of 2.
 *
 * Various parts of the networking layer expect at least 32 bytes of
 * headroom; you should not reduce this.
 *
 * Using max(32, L1_CACHE_BYTES) makes sense (especially with RPS)
 * to reduce the average number of cache lines touched per packet.
 * get_rps_cpu(), for example, only accesses one 64-byte aligned block:
 * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8)
 */
#ifndef NET_SKB_PAD
#define NET_SKB_PAD        max(32, L1_CACHE_BYTES)
#endif

int ___pskb_trim(struct sk_buff *skb, unsigned int len);

/* Set the length of a linear @skb to @len and move the tail pointer to
 * match. A non-linear skb triggers WARN_ON() and is left untouched.
 */
static inline void __skb_set_length(struct sk_buff *skb, unsigned int len)
{
        if (!WARN_ON(skb_is_nonlinear(skb))) {
                skb->len = len;
                skb_set_tail_pointer(skb, len);
        }
}

/* Trim @skb to @len. This is a plain alias for __skb_set_length(), so the
 * same rule applies: a non-linear skb only produces a warning and is not
 * modified. No check is made here that @len is shorter than skb->len.
 */
static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
{
        __skb_set_length(skb, len);
}

void skb_trim(struct sk_buff *skb, unsigned int len);

/* Trim @skb to @len. A buffer without paged data (skb->data_len == 0) is
 * trimmed in place and 0 is returned; otherwise the work is delegated to
 * ___pskb_trim() and its result is returned.
 */
static inline int __pskb_trim(struct sk_buff *skb, unsigned int len)
{
        if (!skb->data_len) {
                __skb_trim(skb, len);
                return 0;
        }

        return ___pskb_trim(skb, len);
}

/* Trim @skb to @len if it is currently longer. When @len is not smaller
 * than skb->len nothing is done and 0 is returned; otherwise the result of
 * __pskb_trim() is returned.
 */
static inline int pskb_trim(struct sk_buff *skb, unsigned int len)
{
        if (len >= skb->len)
                return 0;

        return __pskb_trim(skb, len);
}

/**
 * pskb_trim_unique - remove end from a paged unique (not cloned) buffer
 * @skb: buffer to alter
 * @len: new length
 *
 * Same operation as pskb_trim(), for callers that know the skb is not
 * cloned, so an out-of-memory error should never occur. Any error
 * returned by pskb_trim() is therefore treated as a bug (BUG_ON()).
 */
static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len)
{
        int err;

        err = pskb_trim(skb, len);
        BUG_ON(err);
}

/* Grow @skb to @len bytes. If the current tailroom cannot hold the extra
 * bytes, the missing amount is requested from pskb_expand_head() with
 * GFP_ATOMIC; its error code is returned on failure. On success the length
 * is set via __skb_set_length() and 0 is returned.
 */
static inline int __skb_grow(struct sk_buff *skb, unsigned int len)
{
        unsigned int grow = len - skb->len;
        int room = skb_tailroom(skb);

        if (room < grow) {
                int err = pskb_expand_head(skb, 0, grow - room, GFP_ATOMIC);

                if (err)
                        return err;
        }
        __skb_set_length(skb, len);
        return 0;
}

/**
 * skb_orphan - orphan a buffer
 * @skb: buffer to orphan
 *
 * If the buffer has an owner, run the owner's destructor and then clear
 * both skb->destructor and skb->sk so the @skb is unowned. The buffer
 * continues to exist but is no longer charged to its former owner.
 * A buffer without a destructor must not have a socket attached
 * (BUG_ON()).
 */
static inline void skb_orphan(struct sk_buff *skb)
{
        if (!skb->destructor) {
                BUG_ON(skb->sk);
                return;
        }

        skb->destructor(skb);
        skb->destructor = NULL;
        skb->sk = NULL;
}

/**
 * skb_orphan_frags - orphan the frags contained in a buffer
 * @skb: buffer to orphan frags from
 * @gfp_mask: allocation mask for replacement pages
 *
 * For each frag in the SKB which needs a destructor (i.e. has an
 * owner) create a copy of that frag and release the original
 * page by calling the destructor.
 *
 * Return: 0 when there is nothing to do (the skb is not zerocopy, or its
 * shared info carries SKBFL_DONT_ORPHAN); otherwise the result of
 * skb_copy_ubufs().
 */
static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
{
        if (likely(!skb_zcopy(skb)))
                return 0;

        return (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN) ?
               0 : skb_copy_ubufs(skb, gfp_mask);
}

/* If the skb might loop back to the rx path its frags must be orphaned even
 * when they are refcounted, so unlike skb_orphan_frags() this variant does
 * not honour SKBFL_DONT_ORPHAN. Returns 0 for a non-zerocopy skb, otherwise
 * the result of skb_copy_ubufs().
 */
static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask)
{
        return likely(!skb_zcopy(skb)) ? 0 : skb_copy_ubufs(skb, gfp_mask);
}

/**
 * __skb_queue_purge_reason - empty a list
 * @list: list to empty
 * @reason: drop reason
 *
 * Dequeue every buffer from an &sk_buff list and free it with
 * kfree_skb_reason(), dropping one reference per buffer. This function
 * does not take the list lock and the caller must hold the relevant locks
 * to use it.
 */
static inline void __skb_queue_purge_reason(struct sk_buff_head *list,
                                            enum skb_drop_reason reason)
{
        struct sk_buff *skb;

        for (skb = __skb_dequeue(list); skb; skb = __skb_dequeue(list))
                kfree_skb_reason(skb, reason);
}

/* Empty @list without taking the list lock: shorthand for
 * __skb_queue_purge_reason() with SKB_DROP_REASON_QUEUE_PURGE as the drop
 * reason. The locking rules of __skb_queue_purge_reason() apply.
 */
static inline void __skb_queue_purge(struct sk_buff_head *list)
{
        __skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE);
}

void skb_queue_purge_reason(struct sk_buff_head *list,
                            enum skb_drop_reason reason);

/* Empty @list: shorthand for skb_queue_purge_reason() with
 * SKB_DROP_REASON_QUEUE_PURGE as the drop reason.
 */
static inline void skb_queue_purge(struct sk_buff_head *list)
{
        skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE);
}

unsigned int skb_rbtree_purge(struct rb_root *root);
void skb_errqueue_purge(struct sk_buff_head *list);

void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask);

/**
 * netdev_alloc_frag - allocate a page fragment
 * @fragsz: fragment size
 *
 * Allocates a frag from a page for receive buffer.
 * Uses GFP_ATOMIC allocations.
 *
 * Thin wrapper around __netdev_alloc_frag_align() that passes an all-ones
 * alignment mask (~0u).
 *
 * Return: whatever __netdev_alloc_frag_align() returns for the request.
 */
static inline void *netdev_alloc_frag(unsigned int fragsz)
{
        return __netdev_alloc_frag_align(fragsz, ~0u);
}

/* Allocate a page fragment of @fragsz bytes with the requested alignment.
 * @align must be a power of two (anything else triggers WARN_ON_ONCE());
 * it is turned into an alignment mask by negation before being handed to
 * __netdev_alloc_frag_align(), whose result is returned.
 */
static inline void *netdev_alloc_frag_align(unsigned int fragsz,
                                            unsigned int align)
{
        unsigned int align_mask;

        WARN_ON_ONCE(!is_power_of_2(align));
        align_mask = -align;

        return __netdev_alloc_frag_align(fragsz, align_mask);
}

struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,
                                   gfp_t gfp_mask);

/**
 *        netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *        @dev: network device to receive on
 *        @length: length to allocate
 *
 *        Allocate a new &sk_buff and assign it a usage count of one. The
 *        buffer has unspecified headroom built in. Users should allocate
 *        the headroom they think they need without accounting for the
 *        built in space. The built in space is used for optimisations.
 *
 *        %NULL is returned if there is no free memory. Although this function
 *        allocates memory it can be called from an interrupt.
 *
 *        Equivalent to __netdev_alloc_skb(@dev, @length, GFP_ATOMIC).
 */
static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev,
                                               unsigned int length)
{
        return __netdev_alloc_skb(dev, length, GFP_ATOMIC);
}

/* legacy helper around __netdev_alloc_skb(): allocates @length bytes with
 * the caller's @gfp_mask and no associated device (NULL is passed as dev).
 */
static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
                                              gfp_t gfp_mask)
{
        return __netdev_alloc_skb(NULL, length, gfp_mask);
}

/* legacy helper around netdev_alloc_skb(): allocates @length bytes with no
 * associated device (NULL is passed as dev).
 */
static inline struct sk_buff *dev_alloc_skb(unsigned int length)
{
        return netdev_alloc_skb(NULL, length);
}


/* Allocate an rx skb with room for @length bytes plus NET_IP_ALIGN, then
 * reserve NET_IP_ALIGN bytes of headroom so that the IP header ends up
 * aligned. The reserve step is skipped when NET_IP_ALIGN is 0 or the
 * allocation failed. Returns the skb, or NULL on allocation failure.
 */
static inline struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev,
                unsigned int length, gfp_t gfp)
{
        struct sk_buff *skb;

        skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp);
        if (!NET_IP_ALIGN || !skb)
                return skb;

        skb_reserve(skb, NET_IP_ALIGN);
        return skb;
}

/* GFP_ATOMIC flavour of __netdev_alloc_skb_ip_align(): same allocation and
 * NET_IP_ALIGN headroom handling, with the allocation mask fixed to
 * GFP_ATOMIC.
 */
static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
                unsigned int length)
{
        return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC);
}

/* Release a page fragment by handing @addr to page_frag_free(). */
static inline void skb_free_frag(void *addr)
{
        page_frag_free(addr);
}

void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask);

/* Allocate a page fragment of @fragsz bytes through
 * __napi_alloc_frag_align(), passing an all-ones alignment mask (~0u).
 * Returns whatever __napi_alloc_frag_align() returns.
 */
static inline void *napi_alloc_frag(unsigned int fragsz)
{
        return __napi_alloc_frag_align(fragsz, ~0u);
}

/* Allocate a page fragment of @fragsz bytes with the requested alignment.
 * @align must be a power of two (anything else triggers WARN_ON_ONCE());
 * it is turned into an alignment mask by negation before being handed to
 * __napi_alloc_frag_align(), whose result is returned.
 */
static inline void *napi_alloc_frag_align(unsigned int fragsz,
                                          unsigned int align)
{
        unsigned int align_mask;

        WARN_ON_ONCE(!is_power_of_2(align));
        align_mask = -align;

        return __napi_alloc_frag_align(fragsz, align_mask);
}

struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int length);
void napi_consume_skb(struct sk_buff *skb, int budget);

void napi_skb_free_stolen_head(struct sk_buff *skb);
void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason);

/**
 * __dev_alloc_pages - allocate page for network Rx
 * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx
 * @order: size of the allocation
 *
 * Allocate a new page, adding __GFP_COMP and __GFP_MEMALLOC to the
 * caller's @gfp_mask, with no NUMA node preference (NUMA_NO_NODE).
 *
 * %NULL is returned if there is no free memory.
 */
static inline struct page *__dev_alloc_pages_noprof(gfp_t gfp_mask,
                                             unsigned int order)
{
        /* Assumptions behind the extra flags:
         * 1.  This is for device Rx, therefore a cold page is preferred.
         * 2.  The expectation is the user wants a compound page.
         * 3.  If requesting an order 0 page it will not be compound
         *     due to the check to see if order has a value in prep_new_page
         * 4.  __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to
         *     code in gfp_to_alloc_flags that should be enforcing this.
         */
        const gfp_t rx_gfp = gfp_mask | __GFP_COMP | __GFP_MEMALLOC;

        return alloc_pages_node_noprof(NUMA_NO_NODE, rx_gfp, order);
}
#define __dev_alloc_pages(...)        alloc_hooks(__dev_alloc_pages_noprof(__VA_ARGS__))

/* Rx page allocation of order @_order via __dev_alloc_pages(), using
 * GFP_ATOMIC | __GFP_NOWARN as the allocation mask.
 */
#define dev_alloc_pages(_order) __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, _order)

/**
 * __dev_alloc_page - allocate a page for network Rx
 * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx
 *
 * Allocate a new page. This is the order-0 case of __dev_alloc_pages():
 * the request is forwarded to __dev_alloc_pages_noprof() with order 0.
 *
 * %NULL is returned if there is no free memory.
 */
static inline struct page *__dev_alloc_page_noprof(gfp_t gfp_mask)
{
        return __dev_alloc_pages_noprof(gfp_mask, 0);
}
#define __dev_alloc_page(...)        alloc_hooks(__dev_alloc_page_noprof(__VA_ARGS__))

/* Order-0 shorthand for dev_alloc_pages(). */
#define dev_alloc_page()        dev_alloc_pages(0)

/**
 * dev_page_is_reusable - check whether a page can be reused for network Rx
 * @page: the page to test
 *
 * A page shouldn't be considered for reusing/recycling if it was allocated
 * under memory pressure (pfmemalloc) or at a distant memory node (its node
 * differs from numa_mem_id()).
 *
 * Returns false if this page should be returned to page allocator, true
 * otherwise.
 */
static inline bool dev_page_is_reusable(const struct page *page)
{
        bool reusable = page_to_nid(page) == numa_mem_id() &&
                        !page_is_pfmemalloc(page);

        return likely(reusable);
}

/**
 * skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page
 * @page: The page that was allocated from skb_alloc_page
 * @skb: The skb that may need pfmemalloc set
 *
 * Sets skb->pfmemalloc when @page is a pfmemalloc page; the flag is never
 * cleared here.
 */
static inline void skb_propagate_pfmemalloc(const struct page *page,
                                            struct sk_buff *skb)
{
        if (!page_is_pfmemalloc(page))
                return;

        skb->pfmemalloc = true;
}

/**
 * skb_frag_off() - Returns the offset of a skb fragment
 * @frag: the paged fragment
 *
 * Return: the value of @frag's offset field.
 */
static inline unsigned int skb_frag_off(const skb_frag_t *frag)
{
        return frag->offset;
}

/**
 * skb_frag_off_add() - Increments the offset of a skb fragment by @delta
 * @frag: skb fragment
 * @delta: value to add (signed, so the offset can also be moved back)
 */
static inline void skb_frag_off_add(skb_frag_t *frag, int delta)
{
        frag->offset = frag->offset + delta;
}

/**
 * skb_frag_off_set() - Sets the offset of a skb fragment
 * @frag: skb fragment
 * @offset: offset of fragment
 *
 * Overwrites @frag's offset field with @offset; nothing else in the
 * fragment is touched.
 */
static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset)
{
        frag->offset = offset;
}

/**
 * skb_frag_off_copy() - Sets the offset of a skb fragment from another fragment
 * @fragto: skb fragment where offset is set
 * @fragfrom: skb fragment offset is copied from
 *
 * Only the offset field is copied; @fragfrom is not modified.
 */
static inline void skb_frag_off_copy(skb_frag_t *fragto,
                                     const skb_frag_t *fragfrom)
{
        fragto->offset = fragfrom->offset;
}

/**
 * skb_frag_page - retrieve the page referred to by a paged fragment
 * @frag: the paged fragment
 *
 * Returns the &struct page associated with @frag, obtained by converting
 * the fragment's netmem reference with netmem_to_page().
 */
static inline struct page *skb_frag_page(const skb_frag_t *frag)
{
        return netmem_to_page(frag->netmem);
}

int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
                    unsigned int headroom);
int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
                         struct bpf_prog *prog);
/**
 * skb_frag_address - gets the address of the data contained in a paged fragment
 * @frag: the paged fragment buffer
 *
 * Returns the address of the data within @frag: the page address plus the
 * fragment offset. The page must already be mapped; no check is made here.
 */
static inline void *skb_frag_address(const skb_frag_t *frag)
{
        void *base = page_address(skb_frag_page(frag));

        return base + skb_frag_off(frag);
}

/**
 * skb_frag_address_safe - gets the address of the data contained in a paged fragment
 * @frag: the paged fragment buffer
 *
 * Returns the address of the data within @frag. Unlike skb_frag_address(),
 * this checks that the page is mapped (page_address() is non-NULL) and
 * returns %NULL otherwise.
 */
static inline void *skb_frag_address_safe(const skb_frag_t *frag)
{
        void *base = page_address(skb_frag_page(frag));

        return unlikely(!base) ? NULL : base + skb_frag_off(frag);
}

/**
 * skb_frag_page_copy() - sets the page in a fragment from another fragment
 * @fragto: skb fragment where page is set
 * @fragfrom: skb fragment page is copied from
 *
 * Copies the netmem reference only. No reference count is taken by this
 * helper itself.
 */
static inline void skb_frag_page_copy(skb_frag_t *fragto,
                                      const skb_frag_t *fragfrom)
{
        fragto->netmem = fragfrom->netmem;
}

bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio);

/**
 * skb_frag_dma_map - maps a paged fragment via the DMA API
 * @dev: the device to map the fragment to
 * @frag: the paged fragment to map
 * @offset: the offset within the fragment (starting at the
 *          fragment's own offset)
 * @size: the number of bytes to map
 * @dir: the direction of the mapping (an enum dma_data_direction value)
 *
 * Maps the page associated with @frag to @dev. The mapping starts at the
 * fragment's own page offset plus @offset.
 *
 * Return: the DMA address produced by dma_map_page().
 */
static inline dma_addr_t skb_frag_dma_map(struct device *dev,
                                          const skb_frag_t *frag,
                                          size_t offset, size_t size,
                                          enum dma_data_direction dir)
{
        struct page *page = skb_frag_page(frag);
        size_t page_off = skb_frag_off(frag) + offset;

        return dma_map_page(dev, page, page_off, size, dir);
}

/* Copy @skb through __pskb_copy(), asking for the same amount of headroom
 * as the original currently has. Returns whatever __pskb_copy() returns.
 */
static inline struct sk_buff *pskb_copy(struct sk_buff *skb,
                                        gfp_t gfp_mask)
{
        return __pskb_copy(skb, skb_headroom(skb), gfp_mask);
}


/* Copy @skb through __pskb_copy_fclone(), asking for the same amount of
 * headroom as the original and passing true as the final argument.
 * NOTE(review): the flag presumably requests a copy that can be cloned
 * cheaply later - confirm against __pskb_copy_fclone().
 */
static inline struct sk_buff *pskb_copy_for_clone(struct sk_buff *skb,
                                                  gfp_t gfp_mask)
{
        return __pskb_copy_fclone(skb, skb_headroom(skb), gfp_mask, true);
}


/**
 * skb_clone_writable - is the header of a clone writable
 * @skb: buffer to check
 * @len: length up to which to write
 *
 * Returns true if modifying the header part of the cloned buffer
 * does not require the data to be copied: the header must not be cloned
 * and headroom + @len must fit within skb->hdr_len.
 */
static inline int skb_clone_writable(const struct sk_buff *skb, unsigned int len)
{
        if (skb_header_cloned(skb))
                return 0;

        return skb_headroom(skb) + len <= skb->hdr_len;
}

/* Make the first @write_len bytes of @skb writable if they are not already.
 * A cloned skb whose header is not clone-writable is un-shared with
 * pskb_expand_head() (GFP_ATOMIC).
 *
 * Returns 0 when nothing had to be done or the expansion succeeded, and
 * 1 (a plain truth value, not an errno) when pskb_expand_head() failed.
 */
static inline int skb_try_make_writable(struct sk_buff *skb,
                                        unsigned int write_len)
{
        if (!skb_cloned(skb))
                return 0;

        if (skb_clone_writable(skb, write_len))
                return 0;

        return pskb_expand_head(skb, 0, 0, GFP_ATOMIC) != 0;
}

/* Common worker for skb_cow() and skb_cow_head(). The head is reallocated
 * with pskb_expand_head() (GFP_ATOMIC) when @skb has less than @headroom
 * bytes of headroom or when @cloned is non-zero; the missing headroom is
 * passed through ALIGN(..., NET_SKB_PAD). Returns 0 when nothing had to be
 * done, otherwise the result of pskb_expand_head().
 */
static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom,
                            int cloned)
{
        const unsigned int avail = skb_headroom(skb);
        int delta = headroom > avail ? headroom - avail : 0;

        if (!delta && !cloned)
                return 0;

        return pskb_expand_head(skb, ALIGN(delta, NET_SKB_PAD), 0,
                                GFP_ATOMIC);
}

/**
 *        skb_cow - copy header of skb when it is required
 *        @skb: buffer to cow
 *        @headroom: needed headroom
 *
 *        If the skb passed lacks sufficient headroom or its data part
 *        is shared, data is reallocated. If reallocation fails, an error
 *        is returned and original skb is not changed.
 *
 *        The result is skb with writable area skb->head...skb->tail
 *        and at least @headroom of space at head.
 *
 *        Implemented as __skb_cow() with skb_cloned() as the "cloned"
 *        condition; returns 0 when no reallocation was needed, otherwise
 *        the result of the reallocation.
 */
static inline int skb_cow(struct sk_buff *skb, unsigned int headroom)
{
        return __skb_cow(skb, headroom, skb_cloned(skb));
}

/**
 *        skb_cow_head - skb_cow but only making the head writable
 *        @skb: buffer to cow
 *        @headroom: needed headroom
 *
 *        This function is identical to skb_cow except that we replace the
 *        skb_cloned check by skb_header_cloned.  It should be used when
 *        you only need to push on some header and do not need to modify
 *        the data.
 *
 *        Returns 0 when no reallocation was needed, otherwise the result
 *        of the reallocation performed by __skb_cow().
 */
static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom)
{
        return __skb_cow(skb, headroom, skb_header_cloned(skb));
}

/**
 * skb_padto - pad an skbuff up to a minimal size
 * @skb: buffer to pad
 * @len: minimal length
 *
 * Pads up a buffer to ensure the trailing bytes exist and are
 * blanked. If the buffer already contains sufficient data it
 * is untouched. Otherwise the missing bytes are requested from skb_pad().
 * Returns zero on success. The skb is freed on error.
 */
static inline int skb_padto(struct sk_buff *skb, unsigned int len)
{
        unsigned int cur = skb->len;

        if (unlikely(cur < len))
                return skb_pad(skb, len - cur);

        return 0;
}

/**
 * __skb_put_padto - increase size and pad an skbuff up to a minimal size
 * @skb: buffer to pad
 * @len: minimal length
 * @free_on_error: free buffer on error
 *
 * Pads up a buffer to ensure the trailing bytes exist and are
 * blanked, then extends skb->len over the padding with __skb_put().
 * If the buffer already contains sufficient data it is untouched.
 * Returns zero on success and -ENOMEM if __skb_pad() fails. The skb is
 * freed on error if @free_on_error is true.
 */
static inline int __must_check __skb_put_padto(struct sk_buff *skb,
                                               unsigned int len,
                                               bool free_on_error)
{
        unsigned int cur = skb->len;
        unsigned int pad;

        if (likely(cur >= len))
                return 0;

        pad = len - cur;
        if (__skb_pad(skb, pad, free_on_error))
                return -ENOMEM;

        __skb_put(skb, pad);
        return 0;
}

/**
 *        skb_put_padto - increase size and pad an skbuff up to a minimal size
 *        @skb: buffer to pad
 *        @len: minimal length
 *
 *        Pads up a buffer to ensure the trailing bytes exist and are
 *        blanked. If the buffer already contains sufficient data it
 *        is untouched. Otherwise it is extended. Returns zero on
 *        success. The skb is freed on error.
 *
 *        This is __skb_put_padto() with @free_on_error fixed to true, so a
 *        failure is reported as -ENOMEM.
 */
static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int len)
{
        return __skb_put_padto(skb, len, true);
}

bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i)
        __must_check;

/* Append @copy bytes taken from the iterator @from to the tail of @skb.
 *
 * When skb->ip_summed is CHECKSUM_NONE the data is checksummed while it is
 * copied and the result is folded into skb->csum at the offset of the old
 * length; otherwise the data is just copied.
 *
 * Returns 0 on success. If the copy fails the skb is trimmed back to its
 * original length and -EFAULT is returned.
 */
static inline int skb_add_data(struct sk_buff *skb,
                               struct iov_iter *from, int copy)
{
        const int start = skb->len;

        if (skb->ip_summed != CHECKSUM_NONE) {
                if (copy_from_iter_full(skb_put(skb, copy), copy, from))
                        return 0;
        } else {
                __wsum csum = 0;

                if (csum_and_copy_from_iter_full(skb_put(skb, copy), copy,
                                                 &csum, from)) {
                        skb->csum = csum_block_add(skb->csum, csum, start);
                        return 0;
                }
        }

        /* the copy failed: undo the skb_put() above */
        __skb_trim(skb, start);
        return -EFAULT;
}

/* Tell whether data located at (@page, @off) can be merged into the frag
 * that precedes index @i, i.e. whether it lives on the same page and starts
 * exactly where frag @i - 1 ends. Always false for a zerocopy skb and when
 * @i is 0 (there is no previous frag).
 */
static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
                                    const struct page *page, int off)
{
        const skb_frag_t *prev;

        if (skb_zcopy(skb) || !i)
                return false;

        prev = &skb_shinfo(skb)->frags[i - 1];

        return page == skb_frag_page(prev) &&
               off == skb_frag_off(prev) + skb_frag_size(prev);
}

/* Pull all paged data (skb->data_len bytes) into the linear area with
 * __pskb_pull_tail(). Returns 0 on success and -ENOMEM when the pull fails.
 */
static inline int __skb_linearize(struct sk_buff *skb)
{
        if (!__pskb_pull_tail(skb, skb->data_len))
                return -ENOMEM;

        return 0;
}

/**
 * skb_linearize - convert paged skb to linear one
 * @skb: buffer to linearize
 *
 * An skb that is already linear is left alone and zero is returned.
 * Otherwise, if there is no free memory -ENOMEM is returned, else zero
 * is returned and the old skb data released.
 */
static inline int skb_linearize(struct sk_buff *skb)
{
        if (!skb_is_nonlinear(skb))
                return 0;

        return __skb_linearize(skb);
}

/**
 * skb_has_shared_frag - can any frag be overwritten
 * @skb: buffer to test
 *
 * Return true if the skb has at least one frag that might be modified
 * by an external entity (as in vmsplice()/sendfile())
 */
static inline bool skb_has_shared_frag(const struct sk_buff *skb)
{
        return skb_is_nonlinear(skb) &&
               skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
}

/**
 * skb_linearize_cow - make sure skb is linear and writable
 * @skb: buffer to process
 *
 * Nothing is done (and zero is returned) when the skb is already linear
 * and not cloned. Otherwise, if there is no free memory -ENOMEM is
 * returned, else zero is returned and the old skb data released.
 */
static inline int skb_linearize_cow(struct sk_buff *skb)
{
        if (skb_is_nonlinear(skb) || skb_cloned(skb))
                return __skb_linearize(skb);

        return 0;
}

static __always_inline void
__skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len,
                     unsigned int off)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->csum = csum_block_sub(skb->csum,
                                           csum_partial(start, len, 0), off);
        else if (skb->ip_summed == CHECKSUM_PARTIAL &&
                 skb_checksum_start_offset(skb) < 0)
                skb->ip_summed = CHECKSUM_NONE;
}

/**
 * skb_postpull_rcsum - update checksum for received skb after pull
 * @skb: buffer to update
 * @start: start of data before pull
 * @len: length of data pulled
 *
 * After doing a pull on a received packet, you need to call this to
 * update the CHECKSUM_COMPLETE checksum, or set ip_summed to
 * CHECKSUM_NONE so that it can be recomputed from scratch.
 *
 * For CHECKSUM_PARTIAL the downgrade to CHECKSUM_NONE only happens when
 * the checksum start has ended up in front of skb->data.
 */
static inline void skb_postpull_rcsum(struct sk_buff *skb,
                                      const void *start, unsigned int len)
{
        switch (skb->ip_summed) {
        case CHECKSUM_COMPLETE:
                skb->csum = wsum_negate(csum_partial(start, len,
                                                     wsum_negate(skb->csum)));
                break;
        case CHECKSUM_PARTIAL:
                if (skb_checksum_start_offset(skb) < 0)
                        skb->ip_summed = CHECKSUM_NONE;
                break;
        default:
                break;
        }
}

/*
 * Add the checksum of @len pushed bytes at @start to skb->csum when the
 * skb is CHECKSUM_COMPLETE; @off is handed to csum_block_add() unchanged.
 */
static __always_inline void
__skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len,
                     unsigned int off)
{
        /* Other ip_summed states carry no running checksum to update. */
        if (skb->ip_summed != CHECKSUM_COMPLETE)
                return;

        skb->csum = csum_block_add(skb->csum,
                                   csum_partial(start, len, 0), off);
}

/**
 *        skb_postpush_rcsum - update checksum for received skb after push
 *        @skb: buffer to update
 *        @start: start of data after push
 *        @len: length of data pushed
 *
 *        After doing a push on a received packet, you need to call this to
 *        update the CHECKSUM_COMPLETE checksum.  This is
 *        __skb_postpush_rcsum() with an offset of zero.
 */
static inline void skb_postpush_rcsum(struct sk_buff *skb,
                                      const void *start, unsigned int len)
{
        __skb_postpush_rcsum(skb, start, len, 0);
}

/* Prototype only; the definition is not part of this section. */
void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len);

/**
 *        skb_push_rcsum - push skb and update receive checksum
 *        @skb: buffer to update
 *        @len: length of data pushed
 *
 *        This function performs an skb_push on the packet and updates
 *        the CHECKSUM_COMPLETE checksum.  It should be used on
 *        receive path processing instead of skb_push unless you know
 *        that the checksum difference is zero (e.g., a valid IP header)
 *        or you are setting ip_summed to CHECKSUM_NONE.
 *
 *        Returns skb->data as it stands after the push.
 */
static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len)
{
        skb_push(skb, len);
        /* The pushed bytes now start at skb->data. */
        skb_postpush_rcsum(skb, skb->data, len);
        return skb->data;
}

int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len);

/**
 *        pskb_trim_rcsum - trim received skb and update checksum
 *        @skb: buffer to trim
 *        @len: new length
 *
 *        This is exactly the same as pskb_trim except that it ensures the
 *        checksum of received packets is still valid after the operation.
 *        It can change skb pointers.
 *
 *        Returns zero without touching the skb when @len is not smaller
 *        than skb->len; otherwise returns the result of
 *        pskb_trim_rcsum_slow().
 */
static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
{
        /* Expected case: the skb is already no longer than @len. */
        return likely(len >= skb->len) ?
               0 : pskb_trim_rcsum_slow(skb, len);
}

/*
 * Trim @skb to @len via __skb_trim().  A CHECKSUM_COMPLETE checksum is
 * not adjusted but discarded: ip_summed is reset to CHECKSUM_NONE first.
 * Always returns zero.
 */
static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->ip_summed = CHECKSUM_NONE;
        __skb_trim(skb, len);
        return 0;
}

/*
 * Grow @skb to @len via __skb_grow() and return its result.  A
 * CHECKSUM_COMPLETE checksum is discarded (ip_summed becomes
 * CHECKSUM_NONE) before growing.
 */
static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->ip_summed = CHECKSUM_NONE;
        return __skb_grow(skb, len);
}

/*
 * rb-tree helpers for sk_buffs linked through their rbnode member.
 * rb_to_skb() maps an rb_node pointer to its containing sk_buff using
 * rb_entry_safe(); the others wrap rb_first/rb_last/rb_next/rb_prev.
 */
#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
#define skb_rb_first(root) rb_to_skb(rb_first(root))
#define skb_rb_last(root)  rb_to_skb(rb_last(root))
#define skb_rb_next(skb)   rb_to_skb(rb_next(&(skb)->rbnode))
#define skb_rb_prev(skb)   rb_to_skb(rb_prev(&(skb)->rbnode))

/*
 * Iteration macros.  The queue walkers stop when the cursor equals the
 * queue head cast to struct sk_buff *; the rbtree walkers stop on NULL.
 * The *_safe variants load the following element into @tmp before each
 * pass of the loop body (presumably so the body may remove @skb).
 * The *_from variants start at the current value of @skb.
 */

/* Walk @queue forwards from its first element. */
#define skb_queue_walk(queue, skb) \
                for (skb = (queue)->next;                                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = skb->next)

/* Forward walk with the next element cached in @tmp. */
#define skb_queue_walk_safe(queue, skb, tmp)                                        \
                for (skb = (queue)->next, tmp = skb->next;                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->next)

/* Continue a forward walk from the current @skb. */
#define skb_queue_walk_from(queue, skb)                                                \
                for (; skb != (struct sk_buff *)(queue);                        \
                     skb = skb->next)

/* Walk the rb-tree at @root starting from skb_rb_first(). */
#define skb_rbtree_walk(skb, root)                                                \
                for (skb = skb_rb_first(root); skb != NULL;                        \
                     skb = skb_rb_next(skb))

/* Continue an rb-tree walk from the current @skb. */
#define skb_rbtree_walk_from(skb)                                                \
                for (; skb != NULL;                                                \
                     skb = skb_rb_next(skb))

/* As above, but @tmp is set to the successor (or NULL) in the condition. */
#define skb_rbtree_walk_from_safe(skb, tmp)                                        \
                for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL);        \
                     skb = tmp)

/* Continue a forward walk from @skb with the next element cached in @tmp. */
#define skb_queue_walk_from_safe(queue, skb, tmp)                                \
                for (tmp = skb->next;                                                \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->next)

/* Walk @queue backwards from its last element. */
#define skb_queue_reverse_walk(queue, skb) \
                for (skb = (queue)->prev;                                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = skb->prev)

/* Backward walk with the previous element cached in @tmp. */
#define skb_queue_reverse_walk_safe(queue, skb, tmp)                                \
                for (skb = (queue)->prev, tmp = skb->prev;                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->prev)

/* Continue a backward walk from @skb with the previous element in @tmp. */
#define skb_queue_reverse_walk_from_safe(queue, skb, tmp)                        \
                for (tmp = skb->prev;                                                \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->prev)

/* True when the shared info of @skb has a non-NULL frag_list. */
static inline bool skb_has_frag_list(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->frag_list != NULL;
}

/* Set the frag_list of @skb to NULL; the previous value is not released here. */
static inline void skb_frag_list_init(struct sk_buff *skb)
{
        skb_shinfo(skb)->frag_list = NULL;
}

/* Iterate @iter over the frag_list of @skb, following ->next until NULL. */
#define skb_walk_frags(skb, iter)        \
        for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next)


/*
 * Datagram receive, poll and copy helpers.  Prototypes only; the
 * definitions are not part of this section.
 */
int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
                                int *err, long *timeo_p,
                                const struct sk_buff *skb);
struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
                                          struct sk_buff_head *queue,
                                          unsigned int flags,
                                          int *off, int *err,
                                          struct sk_buff **last);
struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
                                        struct sk_buff_head *queue,
                                        unsigned int flags, int *off, int *err,
                                        struct sk_buff **last);
struct sk_buff *__skb_recv_datagram(struct sock *sk,
                                    struct sk_buff_head *sk_queue,
                                    unsigned int flags, int *off, int *err);
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err);
__poll_t datagram_poll(struct file *file, struct socket *sock,
                           struct poll_table_struct *wait);
int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
                           struct iov_iter *to, int size);
/*
 * Convenience wrapper: skb_copy_datagram_iter() targeting the iterator
 * embedded in @msg (msg->msg_iter).  Returns that function's result.
 */
static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
                                        struct msghdr *msg, int size)
{
        return skb_copy_datagram_iter(from, offset, &msg->msg_iter, size);
}
/*
 * Out-of-line skb helpers: datagram copy, bit copy/store, splice and send,
 * split/shift/segment, VLAN, Ethernet and MPLS header manipulation.
 * Prototypes only; the definitions are not part of this section.
 */
int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
                                   struct msghdr *msg);
int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
                           struct iov_iter *to, int len,
                           struct ahash_request *hash);
int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
                                 struct iov_iter *from, int len);
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len);
int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len);
__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to,
                              int len);
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
                    struct pipe_inode_info *pipe, unsigned int len,
                    unsigned int flags);
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
                         int len);
int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len);
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
                 int len, int hlen);
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len);
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
void skb_scrub_packet(struct sk_buff *skb, bool xnet);
struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features,
                                 unsigned int offset);
struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len);
int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev);
int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci);
int skb_vlan_pop(struct sk_buff *skb);
int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
int skb_eth_pop(struct sk_buff *skb);
int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
                 const unsigned char *src);
int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
                  int mac_len, bool ethernet);
int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
                 bool ethernet);
int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse);
int skb_mpls_dec_ttl(struct sk_buff *skb);
struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
                             gfp_t gfp);

/*
 * Copy @len bytes from the iterator of @msg into @data.
 * Returns 0 when copy_from_iter_full() succeeds, -EFAULT otherwise.
 */
static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
{
        if (!copy_from_iter_full(data, len, &msg->msg_iter))
                return -EFAULT;

        return 0;
}

/*
 * Copy @len bytes from @data into the iterator of @msg.
 * Returns 0 when copy_to_iter() reports exactly @len bytes copied,
 * -EFAULT otherwise.
 */
static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len)
{
        if (copy_to_iter(data, len, &msg->msg_iter) != len)
                return -EFAULT;

        return 0;
}

/*
 * Pair of callbacks accepted by __skb_checksum() through its @ops
 * argument.
 */
struct skb_checksum_ops {
        /* Takes a memory range and a running checksum, returns a checksum. */
        __wsum (*update)(const void *mem, int len, __wsum wsum);
        /* Takes two checksums plus an offset and length, returns a checksum. */
        __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len);
};

/* Pointer to a checksum ops table, defined elsewhere. */
extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly;

/*
 * Checksum entry points; __skb_checksum() additionally takes an ops table.
 * Prototypes only; the definitions are not part of this section.
 */
__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
                      __wsum csum, const struct skb_checksum_ops *ops);
__wsum skb_checksum(const struct sk_buff *skb, int offset, int len,
                    __wsum csum);

/*
 * Return a pointer to @len bytes at @offset.
 *
 * When the @hlen bytes at @data cover the requested range, a pointer
 * into @data is returned directly.  Otherwise the bytes are copied out
 * of @skb into @buffer with skb_copy_bits() and @buffer is returned.
 * Returns NULL when @skb is NULL or the copy fails.
 */
static inline void * __must_check
__skb_header_pointer(const struct sk_buff *skb, int offset, int len,
                     const void *data, int hlen, void *buffer)
{
        /* Fast path: the request lies entirely within @data. */
        if (likely(hlen - offset >= len))
                return (void *)data + offset;

        if (!skb)
                return NULL;

        if (unlikely(skb_copy_bits(skb, offset, buffer, len) < 0))
                return NULL;

        return buffer;
}

/*
 * __skb_header_pointer() applied to the skb's own linear area:
 * skb->data is the data pointer and skb_headlen() its length.
 * @buffer receives a copy when the range is not covered by that area.
 */
static inline void * __must_check
skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer)
{
        return __skb_header_pointer(skb, offset, len, skb->data,
                                    skb_headlen(skb), buffer);
}

/*
 * Return a pointer to @len bytes at @offset inside the linear data of
 * @skb, or NULL when the linear area does not hold @len bytes past
 * @offset.  No copy is ever made.
 *
 * The head length is held in a signed int so that an @offset beyond it
 * gives a negative remainder and fails the check, instead of wrapping
 * around in unsigned arithmetic and passing it.  This matches the
 * signed comparison in __skb_header_pointer().
 */
static inline void * __must_check
skb_pointer_if_linear(const struct sk_buff *skb, int offset, int len)
{
        int hlen = skb_headlen(skb);

        if (likely(hlen - offset >= len))
                return skb->data + offset;
        return NULL;
}

/**
 *        skb_needs_linearize - check if we need to linearize a given skb
 *                              depending on the given device features.
 *        @skb: socket buffer to check
 *        @features: net device features
 *
 *        Returns false for a linear skb.  For a nonlinear skb, returns
 *        true if either:
 *        1. skb has frag_list and the device doesn't support FRAGLIST, or
 *        2. skb is fragmented and the device does not support SG.
 */
static inline bool skb_needs_linearize(struct sk_buff *skb,
                                       netdev_features_t features)
{
        if (!skb_is_nonlinear(skb))
                return false;

        if (skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST))
                return true;

        return skb_shinfo(skb)->nr_frags && !(features & NETIF_F_SG);
}

/* memcpy() @len bytes from skb->data to @to; no bounds check is done here. */
static inline void skb_copy_from_linear_data(const struct sk_buff *skb,
                                             void *to,
                                             const unsigned int len)
{
        memcpy(to, skb->data, len);
}

/*
 * memcpy() @len bytes from skb->data + @offset to @to; no bounds check
 * is done here.
 */
static inline void skb_copy_from_linear_data_offset(const struct sk_buff *skb,
                                                    const int offset, void *to,
                                                    const unsigned int len)
{
        memcpy(to, skb->data + offset, len);
}

/* memcpy() @len bytes from @from to skb->data; no bounds check is done here. */
static inline void skb_copy_to_linear_data(struct sk_buff *skb,
                                           const void *from,
                                           const unsigned int len)
{
        memcpy(skb->data, from, len);
}

/*
 * memcpy() @len bytes from @from to skb->data + @offset; no bounds check
 * is done here.
 */
static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
                                                  const int offset,
                                                  const void *from,
                                                  const unsigned int len)
{
        memcpy(skb->data + offset, from, len);
}

/* Prototype only; the definition is not part of this section. */
void skb_init(void);

/* Return skb->tstamp as is, without looking at tstamp_type. */
static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
{
        return skb->tstamp;
}

/**
 *        skb_get_timestamp - get timestamp from a skb
 *        @skb: skb to get stamp from
 *        @stamp: pointer to struct __kernel_old_timeval to store stamp in
 *
 *        Converts skb->tstamp with ns_to_kernel_old_timeval() and stores
 *        the result in @stamp.
 */
static inline void skb_get_timestamp(const struct sk_buff *skb,
                                     struct __kernel_old_timeval *stamp)
{
        *stamp = ns_to_kernel_old_timeval(skb->tstamp);
}

/*
 * Store skb->tstamp in @stamp as seconds and microseconds: the timestamp
 * is converted to a timespec64 and its nanoseconds are divided by 1000.
 */
static inline void skb_get_new_timestamp(const struct sk_buff *skb,
                                         struct __kernel_sock_timeval *stamp)
{
        struct timespec64 ts = ktime_to_timespec64(skb->tstamp);

        stamp->tv_sec = ts.tv_sec;
        stamp->tv_usec = ts.tv_nsec / 1000;
}

/*
 * Store skb->tstamp in @stamp as seconds and nanoseconds, going through
 * ktime_to_timespec64().
 */
static inline void skb_get_timestampns(const struct sk_buff *skb,
                                       struct __kernel_old_timespec *stamp)
{
        struct timespec64 ts = ktime_to_timespec64(skb->tstamp);

        stamp->tv_sec = ts.tv_sec;
        stamp->tv_nsec = ts.tv_nsec;
}

/*
 * Same conversion as skb_get_timestampns(), but the result is stored in
 * a struct __kernel_timespec.
 */
static inline void skb_get_new_timestampns(const struct sk_buff *skb,
                                           struct __kernel_timespec *stamp)
{
        struct timespec64 ts = ktime_to_timespec64(skb->tstamp);

        stamp->tv_sec = ts.tv_sec;
        stamp->tv_nsec = ts.tv_nsec;
}

/* Stamp @skb with ktime_get_real() and mark the stamp as SKB_CLOCK_REALTIME. */
static inline void __net_timestamp(struct sk_buff *skb)
{
        skb->tstamp = ktime_get_real();
        skb->tstamp_type = SKB_CLOCK_REALTIME;
}

/* Return ktime_get_real() minus @t. */
static inline ktime_t net_timedelta(ktime_t t)
{
        return ktime_sub(ktime_get_real(), t);
}

/*
 * Store @kt in skb->tstamp together with its clock type.  A zero @kt
 * always gets SKB_CLOCK_REALTIME, whatever @tstamp_type says.
 */
static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt,
                                         u8 tstamp_type)
{
        skb->tstamp = kt;
        skb->tstamp_type = kt ? tstamp_type : SKB_CLOCK_REALTIME;
}

/*
 * Translate @clockid into an SKB_CLOCK_* type and hand it, with @kt, to
 * skb_set_delivery_time().  An unsupported @clockid triggers a one-time
 * warning and is treated as a zero timestamp with SKB_CLOCK_REALTIME.
 */
static inline void skb_set_delivery_type_by_clockid(struct sk_buff *skb,
                                                    ktime_t kt, clockid_t clockid)
{
        u8 tstamp_type;

        switch (clockid) {
        case CLOCK_MONOTONIC:
                tstamp_type = SKB_CLOCK_MONOTONIC;
                break;
        case CLOCK_TAI:
                tstamp_type = SKB_CLOCK_TAI;
                break;
        case CLOCK_REALTIME:
                tstamp_type = SKB_CLOCK_REALTIME;
                break;
        default:
                WARN_ON_ONCE(1);
                kt = 0;
                tstamp_type = SKB_CLOCK_REALTIME;
                break;
        }

        skb_set_delivery_time(skb, kt, tstamp_type);
}

/* Static key tested by the timestamp helpers below via static_branch_unlikely(). */
DECLARE_STATIC_KEY_FALSE(netstamp_needed_key);

/* Used in the ingress path to clear the delivery time.
 * When tstamp_type is set, it is reset to SKB_CLOCK_REALTIME and
 * skb->tstamp becomes the current (rcv) timestamp if
 * netstamp_needed_key is enabled, or zero otherwise.
 */
static inline void skb_clear_delivery_time(struct sk_buff *skb)
{
        if (!skb->tstamp_type)
                return;

        skb->tstamp_type = SKB_CLOCK_REALTIME;

        if (static_branch_unlikely(&netstamp_needed_key))
                skb->tstamp = ktime_get_real();
        else
                skb->tstamp = 0;
}

/* Zero skb->tstamp, but only when tstamp_type is zero. */
static inline void skb_clear_tstamp(struct sk_buff *skb)
{
        if (!skb->tstamp_type)
                skb->tstamp = 0;
}

/* Return skb->tstamp when tstamp_type is zero, and 0 otherwise. */
static inline ktime_t skb_tstamp(const struct sk_buff *skb)
{
        return skb->tstamp_type ? 0 : skb->tstamp;
}

/*
 * Return a timestamp for @skb:
 *  - skb->tstamp, when it is non-zero and not of type SKB_CLOCK_MONOTONIC;
 *  - else ktime_get_real(), when netstamp_needed_key is enabled or @cond
 *    is true;
 *  - else 0.
 */
static inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond)
{
        bool usable = skb->tstamp &&
                      skb->tstamp_type != SKB_CLOCK_MONOTONIC;

        if (usable)
                return skb->tstamp;

        if (static_branch_unlikely(&netstamp_needed_key) || cond)
                return ktime_get_real();

        return 0;
}

/* Return the meta_len stored in the shared info of @skb. */
static inline u8 skb_metadata_len(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->meta_len;
}

/* The metadata end is defined as the MAC header pointer of @skb. */
static inline void *skb_metadata_end(const struct sk_buff *skb)
{
        return skb_mac_header(skb);
}

/*
 * Compare the @meta_len bytes that precede skb_metadata_end() in both
 * skbs.  Returns true when they differ.
 *
 * With efficient unaligned access on a 64-bit build, multiples of 4 up
 * to 32 bytes are compared with word loads; every other length, and
 * every other configuration, uses memcmp().
 */
static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
                                          const struct sk_buff *skb_b,
                                          u8 meta_len)
{
        const void *a = skb_metadata_end(skb_a);
        const void *b = skb_metadata_end(skb_b);
        u64 diffs = 0;

        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
            BITS_PER_LONG != 64)
                goto slow;

        /* Using more efficient variant than plain call to memcmp(). */
        switch (meta_len) {
/* __it() steps the cursor back by one uNN; __it_diff() XORs the two words
 * found there.  Both cursors walk backwards from the metadata end.
 */
#define __it(x, op) (x -= sizeof(u##op))
#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
        /* Multiples of 8: one 64-bit compare per fallthrough step. */
        case 32: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 24: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 16: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case  8: diffs |= __it_diff(a, b, 64);
                break;
        /* 8n + 4: 64-bit compares followed by a final 32-bit compare. */
        case 28: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 20: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 12: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case  4: diffs |= __it_diff(a, b, 32);
                break;
        default:
slow:
                /* Here a and b still point at the metadata end. */
                return memcmp(a - meta_len, b - meta_len, meta_len);
        }
        /* Any set bit in diffs converts to true. */
        return diffs;
}

/*
 * Report whether the metadata of two skbs differs: false when both
 * lengths are zero, true when the lengths are unequal, otherwise the
 * result of __skb_metadata_differs() on the common length.
 */
static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
                                        const struct sk_buff *skb_b)
{
        u8 len_a = skb_metadata_len(skb_a);
        u8 len_b = skb_metadata_len(skb_b);

        if (len_a == 0 && len_b == 0)
                return false;

        if (len_a != len_b)
                return true;

        return __skb_metadata_differs(skb_a, skb_b, len_a);
}

/* Store @meta_len in the shared info of @skb. */
static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
{
        skb_shinfo(skb)->meta_len = meta_len;
}

/* Set the metadata length of @skb to zero. */
static inline void skb_metadata_clear(struct sk_buff *skb)
{
        skb_metadata_set(skb, 0);
}

/* Prototype only; the definition is not part of this section. */
struct sk_buff *skb_clone_sk(struct sk_buff *skb);

/*
 * PHY timestamping hooks.  With CONFIG_NETWORK_PHY_TIMESTAMPING they are
 * real functions defined elsewhere; without it they are empty stubs and
 * skb_defer_rx_timestamp() always returns false.
 */
#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING

void skb_clone_tx_timestamp(struct sk_buff *skb);
bool skb_defer_rx_timestamp(struct sk_buff *skb);

#else /* CONFIG_NETWORK_PHY_TIMESTAMPING */

static inline void skb_clone_tx_timestamp(struct sk_buff *skb)
{
}

static inline bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
        return false;
}

#endif /* !CONFIG_NETWORK_PHY_TIMESTAMPING */

/**
 * skb_complete_tx_timestamp() - deliver cloned skb with tx timestamps
 *
 * PHY drivers may accept clones of transmitted packets for
 * timestamping via their phy_driver.txtstamp method. These drivers
 * must call this function to return the skb back to the stack with a
 * timestamp.
 *
 * @skb: clone of the original outgoing packet
 * @hwtstamps: hardware time stamps
 *
 */
void skb_complete_tx_timestamp(struct sk_buff *skb,
                               struct skb_shared_hwtstamps *hwtstamps);

/* Prototype only; the definition is not part of this section. */
void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb,
                     struct skb_shared_hwtstamps *hwtstamps,
                     struct sock *sk, int tstype);

/**
 * skb_tstamp_tx - queue clone of skb with send time stamps
 * @orig_skb:        the original outgoing packet
 * @hwtstamps:        hardware time stamps, may be NULL if not available
 *
 * If the skb has a socket associated, then this function clones the
 * skb (thus sharing the actual data and optional structures), stores
 * the optional hardware time stamping information (if non-NULL) or
 * generates a software time stamp (otherwise), then queues the clone
 * to the error queue of the socket.  Errors are silently ignored.
 */
void skb_tstamp_tx(struct sk_buff *orig_skb,
                   struct skb_shared_hwtstamps *hwtstamps);

/**
 * skb_tx_timestamp() - Driver hook for transmit timestamping
 *
 * Ethernet MAC Drivers should call this function in their hard_xmit()
 * function immediately before giving the sk_buff to the MAC hardware.
 *
 * Specifically, one should make absolutely sure that this function is
 * called before TX completion of this packet can trigger.  Otherwise
 * the packet could potentially already be freed.
 *
 * @skb: A socket buffer.
 */
static inline void skb_tx_timestamp(struct sk_buff *skb)
{
        skb_clone_tx_timestamp(skb);
        /* Software stamp requested: pass NULL for the hardware stamps. */
        if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP)
                skb_tstamp_tx(skb, NULL);
}

/**
 * skb_complete_wifi_ack - deliver skb with wifi status
 *
 * @skb: the original outgoing packet
 * @acked: ack status
 *
 */
void skb_complete_wifi_ack(struct sk_buff *skb, bool acked);

/* Checksum completion routines; prototypes only, defined elsewhere. */
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len);
__sum16 __skb_checksum_complete(struct sk_buff *skb);

/*
 * Return non-zero when no checksum computation is needed for @skb:
 * ip_summed is CHECKSUM_UNNECESSARY, csum_valid is set, or ip_summed is
 * CHECKSUM_PARTIAL with a non-negative checksum start offset.
 */
static inline int skb_csum_unnecessary(const struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_UNNECESSARY || skb->csum_valid)
                return 1;

        return skb->ip_summed == CHECKSUM_PARTIAL &&
               skb_checksum_start_offset(skb) >= 0;
}

/**
 *        skb_checksum_complete - Calculate checksum of an entire packet
 *        @skb: packet to process
 *
 *        This function calculates the checksum over the entire packet plus
 *        the value of skb->csum.  The latter can be used to supply the
 *        checksum of a pseudo header as used by TCP/UDP.  It returns the
 *        checksum.
 *
 *        For protocols that contain complete checksums such as ICMP/TCP/UDP,
 *        this function can be used to verify that checksum on received
 *        packets.  In that case the function should return zero if the
 *        checksum is correct.  Zero is also returned, without computing
 *        anything, whenever skb_csum_unnecessary() is true, e.g. when
 *        skb->ip_summed is CHECKSUM_UNNECESSARY because the hardware has
 *        already verified the checksum.
 */
static inline __sum16 skb_checksum_complete(struct sk_buff *skb)
{
        if (skb_csum_unnecessary(skb))
                return 0;

        return __skb_checksum_complete(skb);
}

/*
 * Consume one level of CHECKSUM_UNNECESSARY: decrement csum_level, or
 * fall back to CHECKSUM_NONE when the level is already zero.  Other
 * ip_summed states are left alone.
 */
static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                return;

        if (skb->csum_level)
                skb->csum_level--;
        else
                skb->ip_summed = CHECKSUM_NONE;
}

/*
 * Add one level of CHECKSUM_UNNECESSARY: bump csum_level up to
 * SKB_MAX_CSUM_LEVEL, or turn CHECKSUM_NONE into CHECKSUM_UNNECESSARY
 * at level zero.  Other ip_summed states are left alone.
 */
static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb)
{
        switch (skb->ip_summed) {
        case CHECKSUM_UNNECESSARY:
                if (skb->csum_level < SKB_MAX_CSUM_LEVEL)
                        skb->csum_level++;
                break;
        case CHECKSUM_NONE:
                skb->ip_summed = CHECKSUM_UNNECESSARY;
                skb->csum_level = 0;
                break;
        default:
                break;
        }
}

/*
 * Drop CHECKSUM_UNNECESSARY entirely: ip_summed becomes CHECKSUM_NONE
 * and csum_level zero.  Other ip_summed states are left alone.
 */
static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                return;

        skb->ip_summed = CHECKSUM_NONE;
        skb->csum_level = 0;
}

/* Check if we need to perform checksum complete validation.
 *
 * Returns true if checksum complete is needed.  Returns false when the
 * checksum is unnecessary or a zero checksum is allowed and @check is
 * zero; in that case csum_valid is set and one level of
 * CHECKSUM_UNNECESSARY is consumed.
 */
static inline bool __skb_checksum_validate_needed(struct sk_buff *skb,
                                                  bool zero_okay,
                                                  __sum16 check)
{
        bool skip = skb_csum_unnecessary(skb) || (zero_okay && !check);

        if (!skip)
                return true;

        skb->csum_valid = 1;
        __skb_decr_checksum_unnecessary(skb);
        return false;
}

/* For small packets (skb->len <= CHECKSUM_BREAK), checksum complete is
 * performed directly in checksum_init.
 */
#define CHECKSUM_BREAK 76

/* Unset checksum-complete
 *
 * Unsetting checksum-complete can be done when the packet is being
 * modified (uncompressed for instance) and the checksum-complete value
 * is invalidated.  Only CHECKSUM_COMPLETE is changed, to CHECKSUM_NONE.
 */
static inline void skb_checksum_complete_unset(struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->ip_summed = CHECKSUM_NONE;
}

/* Validate (init) checksum based on checksum complete.
 *
 * Return values:
 *   0: the checksum is validated, or validation is deferred to
 *        skb_checksum_complete.  In the latter case ip_summed will not be
 *        CHECKSUM_UNNECESSARY and the pseudo checksum is stored in
 *        skb->csum for use in __skb_checksum_complete
 *   non-zero: value of invalid checksum
 *
 */
static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb,
                                                       bool complete,
                                                       __wsum psum)
{
        __sum16 csum;

        /* Device-supplied sum plus pseudo header folding to zero is valid. */
        if (skb->ip_summed == CHECKSUM_COMPLETE &&
            !csum_fold(csum_add(psum, skb->csum))) {
                skb->csum_valid = 1;
                return 0;
        }

        skb->csum = psum;

        /* Larger packets are left for later unless @complete is set. */
        if (!complete && skb->len > CHECKSUM_BREAK)
                return 0;

        csum = __skb_checksum_complete(skb);
        skb->csum_valid = !csum;
        return csum;
}

/*
 * Pseudo-header callback that contributes nothing: ignores both arguments
 * and returns 0.  Used by skb_checksum_simple_validate().
 */
static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto)
{
        return 0;
}

/* Perform checksum validate (init). Note that this is a macro since we only
 * want to calculate the pseudo header, which is an input function, if
 * necessary.  First we try to validate without any computation (checksum
 * unnecessary) and then calculate based on checksum complete, calling the
 * function to compute the pseudo header.
 *
 * csum_valid is cleared on entry.  @skb is evaluated more than once.
 *
 * Return values:
 *   0: the checksum is validated, or validation is deferred to
 *        skb_checksum_complete
 *   non-zero: value of invalid checksum
 */
#define __skb_checksum_validate(skb, proto, complete,                        \
                                zero_okay, check, compute_pseudo)        \
({                                                                        \
        __sum16 __ret = 0;                                                \
        skb->csum_valid = 0;                                                \
        if (__skb_checksum_validate_needed(skb, zero_okay, check))        \
                __ret = __skb_checksum_validate_complete(skb,                \
                                complete, compute_pseudo(skb, proto));        \
        __ret;                                                                \
})

/* complete = false, zero checksum not accepted. */
#define skb_checksum_init(skb, proto, compute_pseudo)                        \
        __skb_checksum_validate(skb, proto, false, false, 0, compute_pseudo)

/* complete = false, a zero @check is accepted. */
#define skb_checksum_init_zero_check(skb, proto, check, compute_pseudo)        \
        __skb_checksum_validate(skb, proto, false, true, check, compute_pseudo)

/* complete = true, zero checksum not accepted. */
#define skb_checksum_validate(skb, proto, compute_pseudo)                \
        __skb_checksum_validate(skb, proto, true, false, 0, compute_pseudo)

/* complete = true, a zero @check is accepted. */
#define skb_checksum_validate_zero_check(skb, proto, check,                \
                                         compute_pseudo)                \
        __skb_checksum_validate(skb, proto, true, true, check, compute_pseudo)

/* complete = true with no pseudo header (null_compute_pseudo). */
#define skb_checksum_simple_validate(skb)                                \
        __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo)

/* True when ip_summed is CHECKSUM_NONE and csum_valid is set. */
static inline bool __skb_checksum_convert_check(struct sk_buff *skb)
{
        return (skb->ip_summed == CHECKSUM_NONE && skb->csum_valid);
}

/*
 * Switch @skb to CHECKSUM_COMPLETE, storing the bitwise complement of
 * @pseudo in skb->csum.
 */
static inline void __skb_checksum_convert(struct sk_buff *skb, __wsum pseudo)
{
        skb->csum = ~pseudo;
        skb->ip_summed = CHECKSUM_COMPLETE;
}

/*
 * Convert @skb to CHECKSUM_COMPLETE when __skb_checksum_convert_check()
 * allows it.  compute_pseudo(skb, proto) is evaluated only in that case.
 */
#define skb_checksum_try_convert(skb, proto, compute_pseudo)        \
do {                                                                        \
        if (__skb_checksum_convert_check(skb))                                \
                __skb_checksum_convert(skb, compute_pseudo(skb, proto)); \
} while (0)

/*
 * Mark @skb as CHECKSUM_PARTIAL.  csum_start is set to the distance from
 * skb->head to @ptr + @start, and csum_offset to @offset - @start.
 */
static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr,
                                              u16 start, u16 offset)
{
        skb->ip_summed = CHECKSUM_PARTIAL;
        skb->csum_start = ((unsigned char *)ptr + start) - skb->head;
        skb->csum_offset = offset - start;
}

/* Update skbuf and packet to reflect the remote checksum offload operation.
 * When called, ptr indicates the starting point for skb->csum when
 * ip_summed is CHECKSUM_COMPLETE. If we need to create checksum complete
 * here, skb_postpull_rcsum is done so that skb->csum starts at ptr.
 *
 * When @nopartial is false the skb is only switched to CHECKSUM_PARTIAL
 * via skb_remcsum_adjust_partial() and nothing else is done.
 */
static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr,
                                       int start, int offset, bool nopartial)
{
        __wsum delta;

        if (!nopartial) {
                skb_remcsum_adjust_partial(skb, ptr, start, offset);
                return;
        }

        if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) {
                /* Return value is not used here. */
                __skb_checksum_complete(skb);
                /* Account for the bytes between skb->data and ptr. */
                skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data);
        }

        delta = remcsum_adjust(ptr, skb->csum, start, offset);

        /* Adjust skb->csum since we changed the packet */
        skb->csum = csum_add(skb->csum, delta);
}

/*
 * Return skb->_nfct masked with NFCT_PTRMASK as a pointer, or NULL when
 * CONFIG_NF_CONNTRACK is not enabled.
 */
static inline struct nf_conntrack *skb_nfct(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        return (void *)(skb->_nfct & NFCT_PTRMASK);
#else
        return NULL;
#endif
}

/*
 * Return the raw, unmasked skb->_nfct value, or 0 when CONFIG_NF_CONNTRACK
 * is not enabled.
 */
static inline unsigned long skb_get_nfct(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        return skb->_nfct;
#else
        return 0UL;
#endif
}

/*
 * Store @nfct in skb->_nfct.  slow_gro is set when @nfct is non-zero and
 * is never cleared here.  Does nothing without CONFIG_NF_CONNTRACK.
 */
static inline void skb_set_nfct(struct sk_buff *skb, unsigned long nfct)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        skb->slow_gro |= !!nfct;
        skb->_nfct = nfct;
#endif
}

#ifdef CONFIG_SKB_EXTENSIONS
/*
 * Identifiers of the available skb extensions.  Each identifier exists
 * only when its config option is enabled, so the numeric values and
 * SKB_EXT_NUM depend on the kernel configuration.
 */
enum skb_ext_id {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
        SKB_EXT_BRIDGE_NF,
#endif
#ifdef CONFIG_XFRM
        SKB_EXT_SEC_PATH,
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
        TC_SKB_EXT,
#endif
#if IS_ENABLED(CONFIG_MPTCP)
        SKB_EXT_MPTCP,
#endif
#if IS_ENABLED(CONFIG_MCTP_FLOWS)
        SKB_EXT_MCTP,
#endif
        SKB_EXT_NUM, /* must be last */
};

/**
 *        struct skb_ext - sk_buff extensions
 *        @refcnt: 1 on allocation, deallocated on 0
 *        @offset: offset to add to @data to obtain extension address
 *        @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units
 *        @data: start of extension data, variable sized
 *
 *        Note: offsets/lengths are stored in chunks of 8 bytes, which allows
 *        using 'u8' types while allowing up to 2kb worth of extension data.
 */
struct skb_ext {
        refcount_t refcnt;
        u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */
        u8 chunks;                /* same */
        char data[] __aligned(8);
};

struct skb_ext *__skb_ext_alloc(gfp_t flags);
void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
                    struct skb_ext *ext);
void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id);
void __skb_ext_put(struct skb_ext *ext);

/*
 * Pass the skb's extension area to __skb_ext_put() if the skb has any
 * active extension.  skb->active_extensions itself is not modified.
 */
static inline void skb_ext_put(struct sk_buff *skb)
{
        if (!skb->active_extensions)
                return;

        __skb_ext_put(skb->extensions);
}

/*
 * Make @dst share @src's extension area.  The active_extensions bitmap is
 * always copied.  When @src has active extensions, the area's refcount is
 * incremented and @dst is pointed at it.  Whatever @dst->extensions held
 * before is not released here.
 */
static inline void __skb_ext_copy(struct sk_buff *dst,
                                  const struct sk_buff *src)
{
        struct skb_ext *shared;

        dst->active_extensions = src->active_extensions;
        if (!src->active_extensions)
                return;

        shared = src->extensions;
        refcount_inc(&shared->refcnt);
        dst->extensions = shared;
}

/*
 * Replace @dst's extensions with @src's: first put whatever @dst currently
 * holds, then share @src's extension area via __skb_ext_copy().
 */
static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src)
{
        skb_ext_put(dst);
        __skb_ext_copy(dst, src);
}

/* Return true when extension @i has a non-zero offset recorded in @ext. */
static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i)
{
        return ext->offset[i] != 0;
}

/* Return true when bit @id is set in skb->active_extensions. */
static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id)
{
        const int mask = 1 << id;

        return (skb->active_extensions & mask) != 0;
}

/* Delete extension @id from @skb; a no-op when the skb does not carry it. */
static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
{
        if (!skb_ext_exist(skb, id))
                return;

        __skb_ext_del(skb, id);
}

/*
 * Look up extension @id on @skb.
 *
 * Returns NULL when the extension is not active.  Otherwise returns the
 * address of the extension data: the start of the skb_ext area plus the
 * stored offset, which is kept in 8-byte units (hence the shift by 3).
 */
static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id)
{
        struct skb_ext *ext;

        if (!skb_ext_exist(skb, id))
                return NULL;

        ext = skb->extensions;
        return (void *)ext + (ext->offset[id] << 3);
}

/*
 * Put the skb's extension area and clear its active_extensions bitmap.
 * Having active extensions is annotated as the unlikely case.
 */
static inline void skb_ext_reset(struct sk_buff *skb)
{
        if (!unlikely(skb->active_extensions))
                return;

        __skb_ext_put(skb->extensions);
        skb->active_extensions = 0;
}

/*
 * Return true when any bit is set in skb->active_extensions.  The
 * condition is annotated as unlikely.
 */
static inline bool skb_has_extensions(struct sk_buff *skb)
{
        return unlikely(skb->active_extensions);
}
#else
/*
 * No-op stand-ins used when CONFIG_SKB_EXTENSIONS is not set, so callers
 * need no #ifdef of their own.  skb_has_extensions() always reports false.
 */
static inline void skb_ext_put(struct sk_buff *skb) {}
static inline void skb_ext_reset(struct sk_buff *skb) {}
static inline void skb_ext_del(struct sk_buff *skb, int unused) {}
static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {}
static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {}
static inline bool skb_has_extensions(struct sk_buff *skb) { return false; }
#endif /* CONFIG_SKB_EXTENSIONS */

/*
 * Hand the skb's conntrack pointer to nf_conntrack_put() and then clear
 * skb->_nfct.  Compiled to nothing unless conntrack is built in or built
 * as a module.
 */
static inline void nf_reset_ct(struct sk_buff *skb)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        nf_conntrack_put(skb_nfct(skb));
        skb->_nfct = 0;
#endif
}

/*
 * Clear skb->nf_trace.  Only has an effect when the xtables TRACE target
 * or nf_tables is enabled.
 */
static inline void nf_reset_trace(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
        skb->nf_trace = 0;
#endif
}

/* Clear skb->ipvs_property; a no-op when CONFIG_IP_VS is not enabled. */
static inline void ipvs_reset(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_IP_VS)
        skb->ipvs_property = 0;
#endif
}

/*
 * Copy netfilter state from @src to @dst.
 *
 * With conntrack enabled, @dst->_nfct is overwritten with @src's value and
 * nf_conntrack_get() is called on @src's conntrack pointer.
 * Note: This doesn't put any conntrack info in dst; whatever @dst->_nfct
 * held before is simply overwritten (nf_copy() does the put first).
 *
 * nf_trace is copied only when @copy is true and tracing support is built.
 */
static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src,
                             bool copy)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        dst->_nfct = src->_nfct;
        nf_conntrack_get(skb_nfct(src));
#endif
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
        if (copy)
                dst->nf_trace = src->nf_trace;
#endif
}

/*
 * Replace @dst's netfilter state with @src's: put @dst's current conntrack
 * pointer (when conntrack is built), copy slow_gro, then let __nf_copy()
 * copy _nfct and nf_trace.
 */
static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        nf_conntrack_put(skb_nfct(dst));
#endif
        dst->slow_gro = src->slow_gro;
        __nf_copy(dst, src, true);
}

#ifdef CONFIG_NETWORK_SECMARK
/* Copy the secmark field from @from to @to. */
static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
{
        to->secmark = from->secmark;
}

/* Reset the skb's secmark field to 0. */
static inline void skb_init_secmark(struct sk_buff *skb)
{
        skb->secmark = 0;
}
#else
/* CONFIG_NETWORK_SECMARK is not set: both helpers are no-ops. */
static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
{ }

static inline void skb_init_secmark(struct sk_buff *skb)
{ }
#endif

/*
 * Return non-zero when @skb carries the SKB_EXT_SEC_PATH extension.
 * Always 0 when CONFIG_XFRM is not set.
 */
static inline int secpath_exists(const struct sk_buff *skb)
{
#ifdef CONFIG_XFRM
        return skb_ext_exist(skb, SKB_EXT_SEC_PATH);
#else
        return 0;
#endif
}

/*
 * Return true only when @skb has none of the following attached: a
 * destructor, a secpath extension, a conntrack pointer, a dst reference
 * (_skb_refdst) or a frag list.  The checks run in that order and stop at
 * the first one that is present.
 */
static inline bool skb_irq_freeable(const struct sk_buff *skb)
{
        if (skb->destructor)
                return false;
        if (secpath_exists(skb))
                return false;
        if (skb_nfct(skb))
                return false;
        if (skb->_skb_refdst)
                return false;

        return !skb_has_frag_list(skb);
}

/* Store @queue_mapping verbatim in skb->queue_mapping. */
static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping)
{
        skb->queue_mapping = queue_mapping;
}

/* Return skb->queue_mapping verbatim. */
static inline u16 skb_get_queue_mapping(const struct sk_buff *skb)
{
        return skb->queue_mapping;
}

/* Copy the queue_mapping field from @from to @to. */
static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_buff *from)
{
        to->queue_mapping = from->queue_mapping;
}

/*
 * Record the receive queue in skb->queue_mapping.  The value is stored
 * biased by one, so that 0 means "no rx queue recorded" (see
 * skb_rx_queue_recorded() and skb_get_rx_queue()).
 */
static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue)
{
        skb->queue_mapping = rx_queue + 1;
}

/*
 * Return the rx queue stored by skb_record_rx_queue(), i.e.
 * skb->queue_mapping minus the bias of one.  No check is made that a
 * queue was actually recorded.
 */
static inline u16 skb_get_rx_queue(const struct sk_buff *skb)
{
        return skb->queue_mapping - 1;
}

/* Return true when skb->queue_mapping is non-zero. */
static inline bool skb_rx_queue_recorded(const struct sk_buff *skb)
{
        return skb->queue_mapping != 0;
}

/* Store @val in skb->dst_pending_confirm. */
static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val)
{
        skb->dst_pending_confirm = val;
}

/* Return true when skb->dst_pending_confirm is non-zero. */
static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb)
{
        return skb->dst_pending_confirm != 0;
}

/*
 * Return the skb's SKB_EXT_SEC_PATH extension, or NULL when the skb does
 * not carry one.  Always NULL when CONFIG_XFRM is not set.
 */
static inline struct sec_path *skb_sec_path(const struct sk_buff *skb)
{
#ifdef CONFIG_XFRM
        return skb_ext_find(skb, SKB_EXT_SEC_PATH);
#else
        return NULL;
#endif
}

/* Return true when the skb's shared info has a non-zero gso_size. */
static inline bool skb_is_gso(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->gso_size;
}

/*
 * Return true when SKB_GSO_TCPV6 is set in the skb's gso_type.
 * Note: Should be called only if skb_is_gso(skb) is true.
 */
static inline bool skb_is_gso_v6(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6;
}

/*
 * Return true when SKB_GSO_SCTP is set in the skb's gso_type.
 * Note: Should be called only if skb_is_gso(skb) is true.
 */
static inline bool skb_is_gso_sctp(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP;
}

/*
 * Return true when either SKB_GSO_TCPV4 or SKB_GSO_TCPV6 is set in the
 * skb's gso_type.
 * Note: Should be called only if skb_is_gso(skb) is true.
 */
static inline bool skb_is_gso_tcp(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6);
}

static inline void skb_gso_reset(struct sk_buff *skb)
{
        skb_shinfo(skb)->gso_size = 0;
        skb_shinfo(skb)->gso_segs = 0;
        skb_shinfo(skb)->gso_type = 0;
}

/*
 * Add @increment to shinfo->gso_size.  If gso_size currently holds the
 * GSO_BY_FRAGS value, warn once and leave it untouched.
 */
static inline void skb_increase_gso_size(struct skb_shared_info *shinfo,
                                         u16 increment)
{
        if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
                return;
        shinfo->gso_size += increment;
}

/*
 * Subtract @decrement from shinfo->gso_size.  If gso_size currently holds
 * the GSO_BY_FRAGS value, warn once and leave it untouched.
 */
static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo,
                                         u16 decrement)
{
        if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
                return;
        shinfo->gso_size -= decrement;
}

void __skb_warn_lro_forwarding(const struct sk_buff *skb);

/*
 * Detect an LRO-built skb and warn about it.
 *
 * LRO sets gso_size but not gso_type, whereas if GSO is really wanted
 * then gso_type will be set.  A non-linear skb with a non-zero gso_size
 * and a zero gso_type is therefore reported through
 * __skb_warn_lro_forwarding().
 *
 * Returns true when the warning was issued, false otherwise.
 */
static inline bool skb_warn_if_lro(const struct sk_buff *skb)
{
        const struct skb_shared_info *shinfo = skb_shinfo(skb);

        if (!skb_is_nonlinear(skb) || shinfo->gso_size == 0)
                return false;

        if (!unlikely(shinfo->gso_type == 0))
                return false;

        __skb_warn_lro_forwarding(skb);
        return true;
}

/*
 * Downgrade CHECKSUM_COMPLETE to CHECKSUM_NONE; any other ip_summed value
 * is left alone.
 */
static inline void skb_forward_csum(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_COMPLETE)
                return;

        /* Unfortunately we don't support this one.  Any brave souls? */
        skb->ip_summed = CHECKSUM_NONE;
}

/**
 * skb_checksum_none_assert - make sure skb ip_summed is CHECKSUM_NONE
 * @skb: skb to check
 *
 * Fresh skbs have their ip_summed set to CHECKSUM_NONE.
 * Instead of forcing ip_summed to CHECKSUM_NONE, we can
 * use this helper to document places where we make this assertion.
 * The check itself is a DEBUG_NET_WARN_ON_ONCE() and does not modify @skb.
 */
static inline void skb_checksum_none_assert(const struct sk_buff *skb)
{
        DEBUG_NET_WARN_ON_ONCE(skb->ip_summed != CHECKSUM_NONE);
}

bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);

int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
                                     unsigned int transport_len,
                                     __sum16(*skb_chkf)(struct sk_buff *skb));

/**
 * skb_head_is_locked - Determine if the skb->head is locked down
 * @skb: skb to check
 *
 * The head on skbs built around a head frag can be removed if they are
 * not cloned.  This function returns true if the skb head is locked down
 * due to either being allocated via kmalloc, or by being a clone with
 * multiple references to the head.
 */
static inline bool skb_head_is_locked(const struct sk_buff *skb)
{
        /* not a head frag: locked regardless of clone state */
        if (!skb->head_frag)
                return true;

        return skb_cloned(skb);
}

/* Local Checksum Offload.
 * Compute outer checksum based on the assumption that the
 * inner checksum will be offloaded later.
 * See Documentation/networking/checksum-offloads.rst for
 * explanation of how this works.
 * Fill in outer checksum adjustment (e.g. with sum of outer
 * pseudo-header) before calling.
 * Also ensure that inner checksum is in linear data area.
 */
static inline __wsum lco_csum(struct sk_buff *skb)
{
        unsigned char *csum_start = skb_checksum_start(skb);
        unsigned char *l4_hdr = skb_transport_header(skb);
        __wsum partial;

        /* Start with complement of inner checksum adjustment */
        partial = ~csum_unfold(*(__force __sum16 *)(csum_start +
                                                    skb->csum_offset));

        /* Add in checksum of our headers (incl. outer checksum
         * adjustment filled in by caller) and return result.
         */
        return csum_partial(l4_hdr, csum_start - l4_hdr, partial);
}

/* Return true when the skb's redirected flag is set. */
static inline bool skb_is_redirected(const struct sk_buff *skb)
{
        return skb->redirected;
}

/*
 * Mark @skb as redirected.  With CONFIG_NET_REDIRECT, also record
 * @from_ingress and, when it is set, call skb_clear_tstamp() on the skb.
 * See skb_set_redirected_noclear() for the variant that skips that call.
 */
static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress)
{
        skb->redirected = 1;
#ifdef CONFIG_NET_REDIRECT
        skb->from_ingress = from_ingress;
        if (skb->from_ingress)
                skb_clear_tstamp(skb);
#endif
}

/* Clear the skb's redirected flag; from_ingress is not touched. */
static inline void skb_reset_redirect(struct sk_buff *skb)
{
        skb->redirected = 0;
}

/*
 * Mark @skb as redirected and, with CONFIG_NET_REDIRECT, record
 * @from_ingress.  Unlike skb_set_redirected(), skb_clear_tstamp() is
 * never called.
 */
static inline void skb_set_redirected_noclear(struct sk_buff *skb,
                                              bool from_ingress)
{
        skb->redirected = 1;
#ifdef CONFIG_NET_REDIRECT
        skb->from_ingress = from_ingress;
#endif
}

/*
 * Return the skb's csum_not_inet flag.  Always false when CONFIG_IP_SCTP
 * is not enabled.
 */
static inline bool skb_csum_is_sctp(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_IP_SCTP)
        return skb->csum_not_inet;
#else
        return 0;
#endif
}

/*
 * Set ip_summed to CHECKSUM_NONE and, when CONFIG_IP_SCTP is enabled,
 * clear the csum_not_inet flag.
 */
static inline void skb_reset_csum_not_inet(struct sk_buff *skb)
{
        skb->ip_summed = CHECKSUM_NONE;
#if IS_ENABLED(CONFIG_IP_SCTP)
        skb->csum_not_inet = 0;
#endif
}

/* Store @kcov_handle in the skb; a no-op when CONFIG_KCOV is not set. */
static inline void skb_set_kcov_handle(struct sk_buff *skb,
                                       const u64 kcov_handle)
{
#ifdef CONFIG_KCOV
        skb->kcov_handle = kcov_handle;
#endif
}

/* Return the skb's kcov_handle, or 0 when CONFIG_KCOV is not set. */
static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
{
#ifdef CONFIG_KCOV
        return skb->kcov_handle;
#else
        return 0;
#endif
}

/* Set the skb's pp_recycle flag; a no-op when CONFIG_PAGE_POOL is not set. */
static inline void skb_mark_for_recycle(struct sk_buff *skb)
{
#ifdef CONFIG_PAGE_POOL
        skb->pp_recycle = 1;
#endif
}

ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
                             ssize_t maxsize, gfp_t gfp);

#endif        /* __KERNEL__ */
#endif        /* _LINUX_SKBUFF_H */




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * kernel/workqueue_internal.h
 *
 * Workqueue internal header file.  Only to be included by workqueue and
 * core kernel subsystems.
 */
#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
#define _KERNEL_WORKQUEUE_INTERNAL_H

#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/preempt.h>

struct worker_pool;

/*
 * The poor guys doing the actual heavy lifting.  All on-duty workers are
 * either serving the manager role, on the idle list or on the busy hash.
 * For details on the locking annotations (L, I, X...) used in the member
 * comments below, refer to workqueue.c.
 *
 * Only to be used in workqueue and async.
 */
struct worker {
        /* on idle list while idle, on busy hash table while busy */
        union {
                struct list_head        entry;        /* L: while idle */
                struct hlist_node        hentry;        /* L: while busy */
        };

        struct work_struct        *current_work;        /* K: work being processed */
        work_func_t                current_func;        /* K: its function */
        struct pool_workqueue        *current_pwq;        /* K: its pwq */
        u64                        current_at;        /* K: runtime at start or last wakeup */
        unsigned int                current_color;        /* K: its color */

        int                        sleeping;        /* S: is worker sleeping? */

        /* used by the scheduler to determine a worker's last known identity */
        work_func_t                last_func;        /* K: last work's fn */

        struct list_head        scheduled;        /* L: scheduled works */

        struct task_struct        *task;                /* I: worker task */
        struct worker_pool        *pool;                /* A: the associated pool */
                                                /* L: for rescuers */
        struct list_head        node;                /* A: anchored at pool->workers */
                                                /* A: runs through worker->node */

        unsigned long                last_active;        /* K: last active timestamp */
        unsigned int                flags;                /* L: flags */
        int                        id;                /* I: worker id */

        /*
         * Opaque string set with work_set_desc().  Printed out with task
         * dump for debugging - WARN, BUG, panic or sysrq.
         */
        char                        desc[WORKER_DESC_LEN];

        /* used only by rescuers to point to the target workqueue */
        struct workqueue_struct        *rescue_wq;        /* I: the workqueue to rescue */
};

/**
 * current_wq_worker - return struct worker if %current is a workqueue worker
 *
 * Return: the kthread data of %current when running in task context with
 * PF_WQ_WORKER set in its flags, %NULL otherwise.
 */
static inline struct worker *current_wq_worker(void)
{
        if (!in_task())
                return NULL;
        if (!(current->flags & PF_WQ_WORKER))
                return NULL;

        return kthread_data(current);
}

/*
 * Scheduler hooks for concurrency managed workqueue.  Only to be used from
 * sched/ and workqueue.c.
 */
void wq_worker_running(struct task_struct *task);
void wq_worker_sleeping(struct task_struct *task);
void wq_worker_tick(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task);

#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */






























    2 













































    1 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BRIDGE_NETFILTER_H
#define __LINUX_BRIDGE_NETFILTER_H

#include <uapi/linux/netfilter_bridge.h>
#include <linux/skbuff.h>

/*
 * NOTE(review): presumably holds the link-layer header and VLAN tag of a
 * bridged packet while it is being fragmented -- confirm against the users
 * of this struct, which are not visible here.
 */
struct nf_bridge_frag_data {
        char    mac[ETH_HLEN];  /* ETH_HLEN bytes of header data */
        bool    vlan_present;   /* presumably: vlan_tci/vlan_proto are valid */
        u16     vlan_tci;
        __be16  vlan_proto;     /* big-endian, per its __be16 type */
};

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)

int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb);

/*
 * Drop the skb's dst entry, but only when one is attached and it has
 * DST_FAKE_RTABLE set in its flags.
 */
static inline void br_drop_fake_rtable(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        if (!dst)
                return;

        if (dst->flags & DST_FAKE_RTABLE)
                skb_dst_drop(skb);
}

/*
 * Return the skb's SKB_EXT_BRIDGE_NF extension, or NULL when the skb does
 * not carry one.
 */
static inline struct nf_bridge_info *
nf_bridge_info_get(const struct sk_buff *skb)
{
        return skb_ext_find(skb, SKB_EXT_BRIDGE_NF);
}

/* Return true when the skb carries the SKB_EXT_BRIDGE_NF extension. */
static inline bool nf_bridge_info_exists(const struct sk_buff *skb)
{
        return skb_ext_exist(skb, SKB_EXT_BRIDGE_NF);
}

/*
 * Return the physinif value stored in the skb's bridge netfilter info,
 * or 0 when the skb has no such info.
 */
static inline int nf_bridge_get_physinif(const struct sk_buff *skb)
{
        const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

        return nf_bridge ? nf_bridge->physinif : 0;
}

static inline int nf_bridge_get_physoutif(const struct sk_buff *skb)
{
        const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

        if (!nf_bridge)
                return 0;

        return nf_bridge->physoutdev ? nf_bridge->physoutdev->ifindex : 0;
}

/*
 * Look up, in @net, the device whose index is the physinif stored in the
 * skb's bridge netfilter info.  Returns NULL when the skb has no such
 * info.  The lookup uses dev_get_by_index_rcu().
 */
static inline struct net_device *
nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net)
{
        const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

        if (!nf_bridge)
                return NULL;

        return dev_get_by_index_rcu(net, nf_bridge->physinif);
}

/*
 * Return the physoutdev pointer stored in the skb's bridge netfilter info,
 * or NULL when the skb has no such info.
 */
static inline struct net_device *
nf_bridge_get_physoutdev(const struct sk_buff *skb)
{
        const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

        if (!nf_bridge)
                return NULL;

        return nf_bridge->physoutdev;
}

/*
 * Return true when the skb has bridge netfilter info and its
 * in_prerouting flag is set.
 */
static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb)
{
        const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);

        if (!nf_bridge)
                return false;

        return nf_bridge->in_prerouting;
}
#else
/*
 * CONFIG_BRIDGE_NETFILTER is not enabled: br_drop_fake_rtable() expands to
 * an empty statement and nf_bridge_in_prerouting() always reports false.
 */
#define br_drop_fake_rtable(skb)                do { } while (0)
static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb)
{
        return false;
}
#endif /* CONFIG_BRIDGE_NETFILTER */

#endif



























































































    1 




    1 


























    1 


    1 













    1 

    1 

    1 









































































































































    1 






    1 












    1 












    1 

    1 











    1 




















































































































    1 












    1 








    1 
    1 



















    1 






    1 



















    1 
    1 







    1 



    1 





    1 



    1 






    1 







    1 

























    1 























































    1 




    1 
    1 









    1 

    1 


    1 






    1 



































































    1 







    1 












    1 






















    1 
    1 





















    1 
    1 
    1 










































































































































































































































































    1 

    1 



























    1 

















    1 







    1 









    1 
































    1 



















































    1 














    1 







    1 

























    1 



















    1 




    1 



    1 














































































    1 










    1 











    1 








    1 



























































































































    1 























































































































































































































































































































    1 

    1 







    1 









    1 





    1 
























    1 




    1 











































    4 


    4 





































































































































































































































































































































































































    1 
    1 




    1 
    1 













    1 





    1 


    1 
    1 











    1 




    1 
    1 

    1 












    1 
















    1 


    1 


    1 
    1 

    1 








    1 






























































































































    1 






    1 











    1 























    1 


    1 




    1 
    1 















    1 
















    1 

    1 




    1 




    1 









    1 





    1 










    1 

    1 







































    1 



    1 

























    1 





















    1 













    1 























    1 
















    1 








    1 














    1 



















































































































    1 
    1 








    1 









    1 








































































































































































































































































































































































































































































































































    1 



























    1 
    1 

























































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/mmap.c
 *
 * Written by obz.
 *
 * Address space accounting code        <alan@lxorguk.ukuu.org.uk>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/profile.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/mmdebug.h>
#include <linux/perf_event.h>
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
#include <linux/oom.h>
#include <linux/sched/mm.h>
#include <linux/ksm.h>

#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>

#define CREATE_TRACE_POINTS
#include <trace/events/mmap.h>

#include "internal.h"

#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags)        (0)
#endif

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX;
int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
#endif

static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);

static void unmap_region(struct mm_struct *mm, struct ma_state *mas,
                struct vm_area_struct *vma, struct vm_area_struct *prev,
                struct vm_area_struct *next, unsigned long start,
                unsigned long end, unsigned long tree_end, bool mm_wr_locked);

/*
 * Apply the page protection that vm_get_page_prot() derives from @vm_flags
 * on top of @oldprot via pgprot_modify(), and return the result.
 */
static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
        pgprot_t flag_prot = vm_get_page_prot(vm_flags);

        return pgprot_modify(oldprot, flag_prot);
}

/*
 * Recompute vma->vm_page_prot from vma->vm_flags.
 *
 * When vma_wants_writenotify() holds for the freshly computed protection,
 * the protection is computed once more with VM_SHARED masked out of the
 * flags.  vma->vm_flags itself is never modified here.
 */
void vma_set_page_prot(struct vm_area_struct *vma)
{
        unsigned long flags = vma->vm_flags;
        pgprot_t prot = vm_pgprot_modify(vma->vm_page_prot, flags);

        if (vma_wants_writenotify(vma, prot))
                prot = vm_pgprot_modify(prot, flags & ~VM_SHARED);

        /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
        WRITE_ONCE(vma->vm_page_prot, prot);
}

/*
 * Remove @vma from @mapping's i_mmap interval tree.
 *
 * Requires inode->i_mapping->i_mmap_rwsem
 *
 * @vma: The vma to take out of the tree
 * @mapping: The address_space whose i_mmap tree currently holds @vma
 */
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
                                      struct address_space *mapping)
{
        /*
         * Counterpart of the mapping_allow_writable() call that
         * __vma_link_file() makes for the same kind of vma.
         */
        if (vma_is_shared_maywrite(vma))
                mapping_unmap_writable(mapping);

        /* The tree update itself is bracketed by the flush_dcache mmap lock. */
        flush_dcache_mmap_lock(mapping);
        vma_interval_tree_remove(vma, &mapping->i_mmap);
        flush_dcache_mmap_unlock(mapping);
}

/*
 * Detach a file-backed vma from its mapping's interval tree so that rmap and
 * vmtruncate no longer see it; done before its page tables are freed.
 *
 * The mapping's i_mmap lock is taken for write around the removal.  A vma
 * without a vm_file is left untouched.
 */
void unlink_file_vma(struct vm_area_struct *vma)
{
        struct file *backing = vma->vm_file;
        struct address_space *mapping;

        /* Nothing to unlink when the vma has no backing file. */
        if (!backing)
                return;

        mapping = backing->f_mapping;
        i_mmap_lock_write(mapping);
        __remove_shared_vm_struct(vma, mapping);
        i_mmap_unlock_write(mapping);
}

/*
 * Tear down a vm structure: run its ->close() hook if it has one, drop the
 * reference on its file if it has one, release its memory policy, then free
 * the structure.
 *
 * @vma: The vma to close and free
 * @unreachable: Selects the freeing routine: __vm_area_free() when true,
 *               vm_area_free() when false.
 *
 * May sleep.
 */
static void remove_vma(struct vm_area_struct *vma, bool unreachable)
{
        might_sleep();

        if (vma->vm_ops && vma->vm_ops->close) {
                vma->vm_ops->close(vma);
        }
        if (vma->vm_file) {
                fput(vma->vm_file);
        }
        mpol_put(vma_policy(vma));

        if (!unreachable) {
                vm_area_free(vma);
                return;
        }
        __vm_area_free(vma);
}

/*
 * Step @vmi's maple state backwards with mas_prev(), passing @min as the
 * limit, and return the entry it yields.
 */
static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
                                                    unsigned long min)
{
        struct vm_area_struct *prev = mas_prev(&vmi->mas, min);

        return prev;
}

/*
 * check_brk_limits() - Validate a brk expansion range and the mlock limit.
 * @addr: Start address of the range to check
 * @len: Size of the increase
 *
 * Asks get_unmapped_area() for a MAP_FIXED placement of [@addr, @addr + @len)
 * and then checks mlock_future_ok() against the current mm's def_flags.
 *
 * Return: 0 on success, the error value from get_unmapped_area() if the
 * range is rejected, or -EAGAIN if mlock_future_ok() refuses.
 */
static int check_brk_limits(unsigned long addr, unsigned long len)
{
        unsigned long area = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);

        /* An error value from get_unmapped_area() is handed back as-is. */
        if (IS_ERR_VALUE(area))
                return area;

        if (!mlock_future_ok(current->mm, current->mm->def_flags, len))
                return -EAGAIN;

        return 0;
}
static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma,
                unsigned long addr, unsigned long request, unsigned long flags);
/*
 * brk() - Move the program break (mm->brk) of the current mm to @brk.
 *
 * Shrinking unmaps [PAGE_ALIGN(brk), PAGE_ALIGN(mm->brk)) with
 * do_vma_munmap(); growing maps the additional pages with do_brk_flags().
 * When the old and new break round up to the same page, only mm->brk is
 * updated.
 *
 * Return: @brk on success, the original (unchanged) break on any failure,
 * or -EINTR if mmap_write_lock_killable() fails.
 */
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
        unsigned long newbrk, oldbrk, origbrk;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *brkvma, *next = NULL;
        unsigned long min_brk;
        bool populate = false;
        LIST_HEAD(uf);
        struct vma_iterator vmi;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        /* Remembered so that every failure path can restore and report it. */
        origbrk = mm->brk;

#ifdef CONFIG_COMPAT_BRK
        /*
         * CONFIG_COMPAT_BRK can still be overridden by setting
         * randomize_va_space to 2, which will still cause mm->start_brk
         * to be arbitrarily shifted
         */
        if (current->brk_randomized)
                min_brk = mm->start_brk;
        else
                min_brk = mm->end_data;
#else
        min_brk = mm->start_brk;
#endif
        /* The break may never be moved below its lower bound. */
        if (brk < min_brk)
                goto out;

        /*
         * Check against rlimit here. If this check is done later after the
         * test of oldbrk with newbrk then it can escape the test and let the
         * data segment grow beyond its set limit in the case where the limit
         * is not page aligned -Ram Gupta
         */
        if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
                              mm->end_data, mm->start_data))
                goto out;

        /* Same page before and after: no mapping change, just record @brk. */
        newbrk = PAGE_ALIGN(brk);
        oldbrk = PAGE_ALIGN(mm->brk);
        if (oldbrk == newbrk) {
                mm->brk = brk;
                goto success;
        }

        /* Always allow shrinking brk. */
        if (brk <= mm->brk) {
                /* Search one past newbrk */
                vma_iter_init(&vmi, mm, newbrk);
                brkvma = vma_find(&vmi, oldbrk);
                if (!brkvma || brkvma->vm_start >= oldbrk)
                        goto out; /* mapping intersects with an existing non-brk vma. */
                /*
                 * mm->brk must be protected by write mmap_lock.
                 * do_vma_munmap() will drop the lock on success,  so update it
                 * before calling do_vma_munmap().
                 */
                mm->brk = brk;
                if (do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true))
                        goto out;

                goto success_unlocked;
        }

        if (check_brk_limits(oldbrk, newbrk - oldbrk))
                goto out;

        /*
         * Only check if the next VMA is within the stack_guard_gap of the
         * expansion area
         */
        vma_iter_init(&vmi, mm, oldbrk);
        next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap);
        if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
                goto out;

        brkvma = vma_prev_limit(&vmi, mm->start_brk);
        /* Ok, looks good - let it rip. */
        if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
                goto out;

        mm->brk = brk;
        /* With VM_LOCKED in def_flags the new range is populated below. */
        if (mm->def_flags & VM_LOCKED)
                populate = true;

success:
        mmap_write_unlock(mm);
success_unlocked:
        /* The shrink path jumps here: do_vma_munmap() already dropped the lock. */
        userfaultfd_unmap_complete(mm, &uf);
        if (populate)
                mm_populate(oldbrk, newbrk - oldbrk);
        return brk;

out:
        /* Undo any early mm->brk update (shrink path) and report the old break. */
        mm->brk = origbrk;
        mmap_write_unlock(mm);
        return origbrk;
}

#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
/*
 * validate_mm() - Debug consistency check of @mm's VMA tree.
 * @mm: The mm_struct to check
 *
 * Only built with CONFIG_DEBUG_VM_MAPLE_TREE.  Runs mt_validate() on the
 * maple tree, then walks every VMA and warns (dumping the stack, the VMA and
 * the tree) when a VMA's vm_start/vm_end disagree with the range the iterator
 * reports for it.  With CONFIG_DEBUG_VM_RB the anon_vma chain entries of each
 * VMA are verified as well.  Finally the number of VMAs walked must equal
 * mm->map_count, otherwise VM_BUG_ON_MM() triggers.
 */
static void validate_mm(struct mm_struct *mm)
{
        int bug = 0;
        int i = 0;
        struct vm_area_struct *vma;
        VMA_ITERATOR(vmi, mm, 0);

        mt_validate(&mm->mm_mt);
        for_each_vma(vmi, vma) {
#ifdef CONFIG_DEBUG_VM_RB
                struct anon_vma *anon_vma = vma->anon_vma;
                struct anon_vma_chain *avc;
#endif
                unsigned long vmi_start, vmi_end;
                bool warn = 0;

                /* Range the tree holds for this entry vs. the VMA's own fields. */
                vmi_start = vma_iter_addr(&vmi);
                vmi_end = vma_iter_end(&vmi);
                if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
                        warn = 1;

                if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
                        warn = 1;

                if (warn) {
                        pr_emerg("issue in %s\n", current->comm);
                        dump_stack();
                        dump_vma(vma);
                        pr_emerg("tree range: %px start %lx end %lx\n", vma,
                                 vmi_start, vmi_end - 1);
                        vma_iter_dump_tree(&vmi);
                }

#ifdef CONFIG_DEBUG_VM_RB
                if (anon_vma) {
                        anon_vma_lock_read(anon_vma);
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                anon_vma_interval_tree_verify(avc);
                        anon_vma_unlock_read(anon_vma);
                }
#endif
                /* Count the VMAs seen for the map_count comparison below. */
                i++;
        }
        if (i != mm->map_count) {
                pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
                bug = 1;
        }
        VM_BUG_ON_MM(bug, mm);
}

#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */
#define validate_mm(mm) do { } while (0)
#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */

/*
 * vma has some anon_vma assigned, and is already inserted on that
 * anon_vma's interval trees.
 *
 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
 * vma must be removed from the anon_vma's interval trees using
 * anon_vma_interval_tree_pre_update_vma().
 *
 * After the update, the vma will be reinserted using
 * anon_vma_interval_tree_post_update_vma().
 *
 * The entire update must be protected by exclusive mmap_lock and by
 * the root anon_vma's mutex.
 */
static inline void
anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
{
        struct anon_vma_chain *avc;

        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
}

static inline void
anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
{
        struct anon_vma_chain *avc;

        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}

static unsigned long count_vma_pages_range(struct mm_struct *mm,
                unsigned long addr, unsigned long end)
{
        VMA_ITERATOR(vmi, mm, addr);
        struct vm_area_struct *vma;
        unsigned long nr_pages = 0;

        for_each_vma_range(vmi, vma, end) {
                unsigned long vm_start = max(addr, vma->vm_start);
                unsigned long vm_end = min(end, vma->vm_end);

                nr_pages += PHYS_PFN(vm_end - vm_start);
        }

        return nr_pages;
}

/*
 * Insert @vma into @mapping's i_mmap interval tree.
 *
 * @vma: The vma to link
 * @mapping: The address_space to link it into
 *
 * The callers in this file hold the mapping's i_mmap lock for write around
 * this call.
 */
static void __vma_link_file(struct vm_area_struct *vma,
                            struct address_space *mapping)
{
        /*
         * Counterpart of the mapping_unmap_writable() call that
         * __remove_shared_vm_struct() makes for the same kind of vma.
         */
        if (vma_is_shared_maywrite(vma))
                mapping_allow_writable(mapping);

        /* The tree update itself is bracketed by the flush_dcache mmap lock. */
        flush_dcache_mmap_lock(mapping);
        vma_interval_tree_insert(vma, &mapping->i_mmap);
        flush_dcache_mmap_unlock(mapping);
}

/*
 * Link a file-backed vma into its file's mapping, taking the mapping's
 * i_mmap lock for write around the insertion.  A vma without a vm_file is
 * left untouched.
 */
static void vma_link_file(struct vm_area_struct *vma)
{
        struct file *backing = vma->vm_file;
        struct address_space *mapping;

        if (!backing)
                return;

        mapping = backing->f_mapping;
        i_mmap_lock_write(mapping);
        __vma_link_file(vma, mapping);
        i_mmap_unlock_write(mapping);
}

/*
 * Insert @vma into @mm: store it in the VMA tree over
 * [vma->vm_start, vma->vm_end), link it into its file mapping (if any) and
 * bump mm->map_count.
 *
 * Return: 0 on success, -ENOMEM if the iterator preallocation fails (in
 * which case nothing has been modified).
 */
static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
{
        VMA_ITERATOR(vmi, mm, 0);

        /* Aim the iterator at the vma's range before preallocating. */
        vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
        if (vma_iter_prealloc(&vmi, vma) != 0)
                return -ENOMEM;

        vma_start_write(vma);
        vma_iter_store(&vmi, vma);
        vma_link_file(vma);
        mm->map_count += 1;

        validate_mm(mm);
        return 0;
}

/*
 * init_multi_vma_prep() - Initializer for struct vma_prepare
 * @vp: The vma_prepare struct
 * @vma: The vma that will be altered once locked
 * @next: The next vma if it is to be adjusted
 * @remove: The first vma to be removed
 * @remove2: The second vma to be removed
 *
 * @vp is zeroed first, so every field not set here (e.g. ->insert) is left
 * as zero.  ->anon_vma is taken from @vma, falling back to @next's when @vma
 * has none.  ->file and ->mapping are only set for a file-backed @vma.
 */
static inline void init_multi_vma_prep(struct vma_prepare *vp,
                struct vm_area_struct *vma, struct vm_area_struct *next,
                struct vm_area_struct *remove, struct vm_area_struct *remove2)
{
        memset(vp, 0, sizeof(*vp));

        vp->vma = vma;
        vp->adj_next = next;
        vp->remove = remove;
        vp->remove2 = remove2;

        /* Prefer the vma's own anon_vma; otherwise borrow next's, if any. */
        if (vma->anon_vma)
                vp->anon_vma = vma->anon_vma;
        else if (next)
                vp->anon_vma = next->anon_vma;

        if (vma->vm_file) {
                vp->file = vma->vm_file;
                vp->mapping = vp->file->f_mapping;
        }
}

/*
 * init_vma_prep() - Initializer wrapper for vma_prepare struct
 * @vp: The vma_prepare struct
 * @vma: The vma that will be altered once locked
 *
 * Single-vma form of init_multi_vma_prep(): no adjacent vma to adjust and
 * no vmas to remove.
 */
static inline void init_vma_prep(struct vma_prepare *vp,
                                 struct vm_area_struct *vma)
{
        init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
}


/*
 * vma_prepare() - Helper function for handling locking VMAs prior to altering
 * @vp: The initialized vma_prepare struct
 *
 * For a file-backed vma: calls uprobe_munmap() on the affected vmas and takes
 * the mapping's i_mmap lock for write.  With an anon_vma: takes the anon_vma
 * write lock and removes the vmas from the anon_vma interval trees.  Finally,
 * for a file-backed vma, takes the flush_dcache mmap lock and removes the
 * vmas from the mapping's interval tree.
 *
 * Everything taken or removed here is released or reinserted by
 * vma_complete().
 */
static inline void vma_prepare(struct vma_prepare *vp)
{
        if (vp->file) {
                uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);

                if (vp->adj_next)
                        uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
                                      vp->adj_next->vm_end);

                i_mmap_lock_write(vp->mapping);
                if (vp->insert && vp->insert->vm_file) {
                        /*
                         * Put into interval tree now, so instantiated pages
                         * are visible to arm/parisc __flush_dcache_page
                         * throughout; but we cannot insert into address
                         * space until vma start or end is updated.
                         */
                        __vma_link_file(vp->insert,
                                        vp->insert->vm_file->f_mapping);
                }
        }

        if (vp->anon_vma) {
                anon_vma_lock_write(vp->anon_vma);
                /* Out of the anon interval trees until vma_complete(). */
                anon_vma_interval_tree_pre_update_vma(vp->vma);
                if (vp->adj_next)
                        anon_vma_interval_tree_pre_update_vma(vp->adj_next);
        }

        if (vp->file) {
                /* Held until vma_complete() has reinserted the vmas. */
                flush_dcache_mmap_lock(vp->mapping);
                vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
                if (vp->adj_next)
                        vma_interval_tree_remove(vp->adj_next,
                                                 &vp->mapping->i_mmap);
        }

}

/*
 * vma_complete() - Helper function for handling the unlocking after altering
 * VMAs, or for inserting a VMA.
 *
 * @vp: The vma_prepare struct
 * @vmi: The vma iterator
 * @mm: The mm_struct
 *
 * Mirror image of vma_prepare(): reinserts the altered vmas into the file
 * and anon_vma interval trees and drops the locks vma_prepare() took.  In
 * addition it either unlinks the vmas scheduled for removal from the file
 * mapping, or stores vp->insert into the VMA tree, and finally frees the
 * removed vmas while adjusting mm->map_count.
 */
static inline void vma_complete(struct vma_prepare *vp,
                                struct vma_iterator *vmi, struct mm_struct *mm)
{
        if (vp->file) {
                /* Put back what vma_prepare() removed, then drop its lock. */
                if (vp->adj_next)
                        vma_interval_tree_insert(vp->adj_next,
                                                 &vp->mapping->i_mmap);
                vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
                flush_dcache_mmap_unlock(vp->mapping);
        }

        if (vp->remove && vp->file) {
                __remove_shared_vm_struct(vp->remove, vp->mapping);
                if (vp->remove2)
                        __remove_shared_vm_struct(vp->remove2, vp->mapping);
        } else if (vp->insert) {
                /*
                 * split_vma has split insert from vma, and needs
                 * us to insert it before dropping the locks
                 * (it may either follow vma or precede it).
                 */
                vma_iter_store(vmi, vp->insert);
                mm->map_count++;
        }

        if (vp->anon_vma) {
                anon_vma_interval_tree_post_update_vma(vp->vma);
                if (vp->adj_next)
                        anon_vma_interval_tree_post_update_vma(vp->adj_next);
                anon_vma_unlock_write(vp->anon_vma);
        }

        if (vp->file) {
                i_mmap_unlock_write(vp->mapping);
                uprobe_mmap(vp->vma);

                if (vp->adj_next)
                        uprobe_mmap(vp->adj_next);
        }

        if (vp->remove) {
                /* Runs once for vp->remove and, if set, again for vp->remove2. */
again:
                vma_mark_detached(vp->remove, true);
                if (vp->file) {
                        uprobe_munmap(vp->remove, vp->remove->vm_start,
                                      vp->remove->vm_end);
                        /* Presumably the reference the removed vma held. */
                        fput(vp->file);
                }
                if (vp->remove->anon_vma)
                        anon_vma_merge(vp->vma, vp->remove);
                mm->map_count--;
                mpol_put(vma_policy(vp->remove));
                /*
                 * With a single removal the surviving vma is expected to
                 * reach at least to the end of the removed one.
                 */
                if (!vp->remove2)
                        WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
                vm_area_free(vp->remove);

                /*
                 * In mprotect's case 6 (see comments on vma_merge),
                 * we are removing both mid and next vmas
                 */
                if (vp->remove2) {
                        vp->remove = vp->remove2;
                        vp->remove2 = NULL;
                        goto again;
                }
        }
        if (vp->insert && vp->file)
                uprobe_mmap(vp->insert);
        validate_mm(mm);
}

/*
 * dup_anon_vma() - Give @dst the anon_vma of @src if it has none of its own
 * @dst: The destination VMA
 * @src: The source VMA
 * @dup: Set to @dst when an anon_vma was actually duplicated; left untouched
 *       otherwise.
 *
 * Nothing happens unless @src has an anon_vma and @dst does not.
 *
 * Returns: 0 on success (including when nothing needed to be done), or the
 * error from anon_vma_clone().
 */
static inline int dup_anon_vma(struct vm_area_struct *dst,
                struct vm_area_struct *src, struct vm_area_struct **dup)
{
        int err;

        if (!src->anon_vma || dst->anon_vma)
                return 0;

        /*
         * Easily overlooked: when mprotect shifts the boundary, the expanding
         * vma must get an anon_vma if the shrinking vma had one, to cover any
         * anon pages imported.
         */
        vma_assert_write_locked(dst);
        dst->anon_vma = src->anon_vma;
        err = anon_vma_clone(dst, src);
        if (err)
                return err;

        *dup = dst;
        return 0;
}

/*
 * vma_expand - Expand an existing VMA
 *
 * @vmi: The vma iterator
 * @vma: The vma to expand
 * @start: The start of the vma
 * @end: The exclusive end of the vma
 * @pgoff: The page offset of vma
 * @next: The current or next vma (may be NULL or equal to @vma).
 *
 * Expand @vma to @start and @end.  Can expand off the start and end.  Will
 * expand over @next if it's different from @vma and @end == @next->vm_end.
 * Checking if the @vma can expand and merge with @next needs to be handled by
 * the caller.
 *
 * Returns: 0 on success, -ENOMEM if the iterator preallocation fails, or the
 * error from dup_anon_vma() when @next's anon_vma cannot be duplicated.
 */
int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
               unsigned long start, unsigned long end, pgoff_t pgoff,
               struct vm_area_struct *next)
{
        struct vm_area_struct *anon_dup = NULL;
        bool remove_next = false;
        struct vma_prepare vp;

        vma_start_write(vma);
        /* @next is swallowed whole only when the new end matches its end. */
        if (next && (vma != next) && (end == next->vm_end)) {
                int ret;

                remove_next = true;
                vma_start_write(next);
                ret = dup_anon_vma(vma, next, &anon_dup);
                if (ret)
                        return ret;
        }

        init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
        /* Not merging but overwriting any part of next is not handled. */
        VM_WARN_ON(next && !vp.remove &&
                  next != vma && end > next->vm_start);
        /* Only handles expanding */
        VM_WARN_ON(vma->vm_start < start || vma->vm_end > end);

        /* Note: vma iterator must be pointing to 'start' */
        vma_iter_config(vmi, start, end);
        if (vma_iter_prealloc(vmi, vma))
                goto nomem;

        vma_prepare(&vp);
        vma_adjust_trans_huge(vma, start, end, 0);
        vma_set_range(vma, start, end, pgoff);
        vma_iter_store(vmi, vma);

        vma_complete(&vp, vmi, vma->vm_mm);
        return 0;

nomem:
        /* Undo the anon_vma duplication done above, if there was one. */
        if (anon_dup)
                unlink_anon_vmas(anon_dup);
        return -ENOMEM;
}

/*
 * vma_shrink() - Reduce an existing VMA's memory area
 * @vmi: The vma iterator
 * @vma: The VMA to modify
 * @start: The new start
 * @end: The new end
 * @pgoff: The page offset passed to vma_set_range() for the shrunken VMA
 *
 * Only one edge is expected to move per call; a warning is issued when both
 * @start and @end differ from the VMA's current bounds.
 *
 * Returns: 0 on success, -ENOMEM otherwise
 */
int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
               unsigned long start, unsigned long end, pgoff_t pgoff)
{
        struct vma_prepare vp;

        WARN_ON((vma->vm_start != start) && (vma->vm_end != end));

        /*
         * Point the iterator at the part being cut off: the head
         * [vm_start, start) when the start moves up, otherwise the tail
         * [end, vm_end).
         */
        if (vma->vm_start < start)
                vma_iter_config(vmi, vma->vm_start, start);
        else
                vma_iter_config(vmi, end, vma->vm_end);

        if (vma_iter_prealloc(vmi, NULL))
                return -ENOMEM;

        vma_start_write(vma);

        init_vma_prep(&vp, vma);
        vma_prepare(&vp);
        vma_adjust_trans_huge(vma, start, end, 0);

        /* Clear the cut-off range in the tree, then record the new bounds. */
        vma_iter_clear(vmi);
        vma_set_range(vma, start, end, pgoff);
        vma_complete(&vp, vmi, vma->vm_mm);
        return 0;
}

/*
 * If the vma has a ->close operation then the driver probably needs to release
 * per-vma resources, so we don't attempt to merge those if the caller indicates
 * the current vma may be removed as part of the merge.
 */
static inline bool is_mergeable_vma(struct vm_area_struct *vma,
                struct file *file, unsigned long vm_flags,
                struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
                struct anon_vma_name *anon_name, bool may_remove_vma)
{
        /*
         * The flags are compared with VM_SOFTDIRTY masked out. A difference
         * in the dirty bit alone must not block a merge -- the caller is
         * expected to mark the merged VMA dirty. Including the bit in the
         * comparison would force new VMAs to be created where an existing
         * one could have been extended, adding pressure on the memory system.
         */
        const unsigned long flag_mismatch =
                (vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY;

        if (flag_mismatch || vma->vm_file != file)
                return false;

        /* A ->close hook only blocks the merge when this vma may go away. */
        if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
                return false;

        /* Finally the userfaultfd context and the anon name must both agree. */
        return is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx) &&
               anon_vma_name_eq(anon_vma_name(vma), anon_name);
}

/*
 * Decide whether two anon_vma assignments allow a merge. @vma may be NULL, in
 * which case the anon_vma_chain test is skipped.
 */
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
                 struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
        /* Both assigned: they have to be the very same anon_vma. */
        if (anon_vma1 && anon_vma2)
                return anon_vma1 == anon_vma2;

        /*
         * At least one side is unassigned. The list_is_singular() test avoids
         * merging a VMA cloned from a parent, which helps the scalability of
         * the anon_vma lock.
         */
        if (!vma || list_is_singular(&vma->anon_vma_chain))
                return true;

        /* The chain is not singular: fall back to plain pointer equality. */
        return anon_vma1 == anon_vma2;
}

/*
 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 * in front of (at a lower virtual address and file offset than) the vma.
 *
 * We cannot merge two vmas if they have differently assigned (non-NULL)
 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
 *
 * We don't check here for the merged mmap wrapping around the end of pagecache
 * indices (16TB on ia32) because do_mmap() does not permit mmap's which
 * wrap, nor mmaps which cover the final page at index -1UL.
 *
 * We assume the vma may be removed as part of the merge.
 */
static bool
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
                struct anon_vma *anon_vma, struct file *file,
                pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
                struct anon_vma_name *anon_name)
{
        /* 'true': the vma may be removed, so a ->close hook vetoes the merge. */
        if (!is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx,
                              anon_name, true))
                return false;

        if (!is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma))
                return false;

        /* The candidate page offset must equal the vma's own vm_pgoff. */
        return vma->vm_pgoff == vm_pgoff;
}

/*
 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 * beyond (at a higher virtual address and file offset than) the vma.
 *
 * We cannot merge two vmas if they have differently assigned (non-NULL)
 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
 *
 * We assume that vma is not removed as part of the merge.
 */
static bool
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
                struct anon_vma *anon_vma, struct file *file,
                pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
                struct anon_vma_name *anon_name)
{
        /* 'false': the vma stays, so its ->close hook is not considered. */
        if (!is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx,
                              anon_name, false))
                return false;

        if (!is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma))
                return false;

        /* The new range must begin at the page offset where the vma ends. */
        return vma->vm_pgoff + vma_pages(vma) == vm_pgoff;
}

/*
 * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
 * figure out whether that can be merged with its predecessor or its
 * successor.  Or both (it neatly fills a hole).
 *
 * In most cases - when called for mmap, brk or mremap - [addr,end) is
 * certain not to be mapped by the time vma_merge is called; but when
 * called for mprotect, it is certain to be already mapped (either at
 * an offset within prev, or at the start of next), and the flags of
 * this area are about to be changed to vm_flags - and the no-change
 * case has already been eliminated.
 *
 * The following mprotect cases have to be considered, where **** is
 * the area passed down from mprotect_fixup, never extending beyond one
 * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts
 * at the same address as **** and is of the same or larger span, and
 * NNNN the next vma after ****:
 *
 *     ****             ****                   ****
 *    PPPPPPNNNNNN    PPPPPPNNNNNN       PPPPPPCCCCCC
 *    cannot merge    might become       might become
 *                    PPNNNNNNNNNN       PPPPPPPPPPCC
 *    mmap, brk or    case 4 below       case 5 below
 *    mremap move:
 *                        ****               ****
 *                    PPPP    NNNN       PPPPCCCCNNNN
 *                    might become       might become
 *                    PPPPPPPPPPPP 1 or  PPPPPPPPPPPP 6 or
 *                    PPPPPPPPNNNN 2 or  PPPPPPPPNNNN 7 or
 *                    PPPPNNNNNNNN 3     PPPPNNNNNNNN 8
 *
 * It is important for case 8 that the vma CCCC overlapping the
 * region **** is never going to be extended over NNNN. Instead NNNN must
 * be extended in region **** and CCCC must be removed. This way in
 * all cases where vma_merge succeeds, the moment vma_merge drops the
 * rmap_locks, the properties of the merged vma will be already
 * correct for the whole merged range. Some of those properties like
 * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
 * be correct for the whole merged range immediately after the
 * rmap_locks are released. Otherwise if NNNN would be removed and
 * CCCC would be extended over the NNNN range, remove_migration_ptes
 * or other rmap walkers (if working on addresses beyond the "end"
 * parameter) may establish ptes with the wrong permissions of CCCC
 * instead of the right permissions of NNNN.
 *
 * In the code below:
 * PPPP is represented by *prev
 * CCCC is represented by *curr or not represented at all (NULL)
 * NNNN is represented by *next or not represented at all (NULL)
 * **** is not represented - it will be merged and the vma containing the
 *      area is returned, or the function will return NULL
 */
static struct vm_area_struct
*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
           struct vm_area_struct *src, unsigned long addr, unsigned long end,
           unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
           struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
           struct anon_vma_name *anon_name)
{
        /* The mm, anon_vma and file to match against all come from @src. */
        struct mm_struct *mm = src->vm_mm;
        struct anon_vma *anon_vma = src->anon_vma;
        struct file *file = src->vm_file;
        /* res is what gets returned: prev by default, next for cases 3/4/8. */
        struct vm_area_struct *curr, *next, *res;
        /*
         * vma is the VMA whose range is rewritten, adjust the neighbour whose
         * start moves by adj_start, remove/remove2 the VMAs to be dropped.
         */
        struct vm_area_struct *vma, *adjust, *remove, *remove2;
        struct vm_area_struct *anon_dup = NULL;
        struct vma_prepare vp;
        pgoff_t vma_pgoff;
        int err = 0;
        bool merge_prev = false;
        bool merge_next = false;
        bool vma_expanded = false;
        unsigned long vma_start = addr;
        unsigned long vma_end = end;
        pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
        /* Signed byte delta applied to adjust->vm_start (negative in case 4). */
        long adj_start = 0;

        /*
         * We later require that vma->vm_flags == vm_flags,
         * so this tests vma->vm_flags & VM_SPECIAL, too.
         */
        if (vm_flags & VM_SPECIAL)
                return NULL;

        /* Does the input range span an existing VMA? (cases 5 - 8) */
        curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end);

        if (!curr ||                        /* cases 1 - 4 */
            end == curr->vm_end)        /* cases 6 - 8, adjacent VMA */
                next = vma_lookup(mm, end);
        else
                next = NULL;                /* case 5 */

        if (prev) {
                /* Default result geometry: the merged VMA starts where prev does. */
                vma_start = prev->vm_start;
                vma_pgoff = prev->vm_pgoff;

                /* Can we merge the predecessor? */
                if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy)
                    && can_vma_merge_after(prev, vm_flags, anon_vma, file,
                                           pgoff, vm_userfaultfd_ctx, anon_name)) {
                        merge_prev = true;
                        /* Step the iterator back one entry when prev takes part. */
                        vma_prev(vmi);
                }
        }

        /* Can we merge the successor? */
        if (next && mpol_equal(policy, vma_policy(next)) &&
            can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen,
                                 vm_userfaultfd_ctx, anon_name)) {
                merge_next = true;
        }

        /* Verify some invariant that must be enforced by the caller. */
        VM_WARN_ON(prev && addr <= prev->vm_start);
        VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end));
        VM_WARN_ON(addr >= end);

        if (!merge_prev && !merge_next)
                return NULL; /* Not mergeable. */

        if (merge_prev)
                vma_start_write(prev);

        res = vma = prev;
        remove = remove2 = adjust = NULL;

        /* Can we merge both the predecessor and the successor? */
        if (merge_prev && merge_next &&
            is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
                vma_start_write(next);
                remove = next;                                /* case 1 */
                vma_end = next->vm_end;
                err = dup_anon_vma(prev, next, &anon_dup);
                if (curr) {                                /* case 6 */
                        vma_start_write(curr);
                        remove = curr;
                        remove2 = next;
                        /*
                         * Note that the dup_anon_vma below cannot overwrite err
                         * since the first caller would do nothing unless next
                         * has an anon_vma.
                         */
                        if (!next->anon_vma)
                                err = dup_anon_vma(prev, curr, &anon_dup);
                }
        } else if (merge_prev) {                        /* case 2 */
                if (curr) {
                        vma_start_write(curr);
                        if (end == curr->vm_end) {        /* case 7 */
                                /*
                                 * can_vma_merge_after() assumed we would not be
                                 * removing prev vma, so it skipped the check
                                 * for vm_ops->close, but we are removing curr
                                 */
                                if (curr->vm_ops && curr->vm_ops->close)
                                        err = -EINVAL;
                                remove = curr;
                        } else {                        /* case 5 */
                                /* curr survives; its start moves up to @end. */
                                adjust = curr;
                                adj_start = (end - curr->vm_start);
                        }
                        if (!err)
                                err = dup_anon_vma(prev, curr, &anon_dup);
                }
        } else { /* merge_next */
                vma_start_write(next);
                res = next;
                if (prev && addr < prev->vm_end) {        /* case 4 */
                        /* prev shrinks to end at @addr; next grows downwards. */
                        vma_start_write(prev);
                        vma_end = addr;
                        adjust = next;
                        adj_start = -(prev->vm_end - addr);
                        err = dup_anon_vma(next, prev, &anon_dup);
                } else {
                        /*
                         * Note that cases 3 and 8 are the ONLY ones where prev
                         * is permitted to be (but is not necessarily) NULL.
                         */
                        vma = next;                        /* case 3 */
                        vma_start = addr;
                        vma_end = next->vm_end;
                        vma_pgoff = next->vm_pgoff - pglen;
                        if (curr) {                        /* case 8 */
                                vma_pgoff = curr->vm_pgoff;
                                vma_start_write(curr);
                                remove = curr;
                                err = dup_anon_vma(next, curr, &anon_dup);
                        }
                }
        }

        /* Error in anon_vma clone. */
        if (err)
                goto anon_vma_fail;

        if (vma_start < vma->vm_start || vma_end > vma->vm_end)
                vma_expanded = true;

        /*
         * When vma grows, the iterator covers its whole new range. Otherwise
         * (case 4, where vma == prev shrinks) it covers the new extent of
         * 'adjust' instead.
         */
        if (vma_expanded) {
                vma_iter_config(vmi, vma_start, vma_end);
        } else {
                vma_iter_config(vmi, adjust->vm_start + adj_start,
                                adjust->vm_end);
        }

        if (vma_iter_prealloc(vmi, vma))
                goto prealloc_fail;

        init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
        VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
                   vp.anon_vma != adjust->anon_vma);

        vma_prepare(&vp);
        vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
        vma_set_range(vma, vma_start, vma_end, vma_pgoff);

        if (vma_expanded)
                vma_iter_store(vmi, vma);

        if (adj_start) {
                /* Shift the neighbour's start and keep its pgoff in step. */
                adjust->vm_start += adj_start;
                adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
                if (adj_start < 0) {
                        /* next grew downwards, so it is the one stored. */
                        WARN_ON(vma_expanded);
                        vma_iter_store(vmi, next);
                }
        }

        vma_complete(&vp, vmi, mm);
        khugepaged_enter_vma(res, vm_flags);
        return res;

prealloc_fail:
        /* Undo whatever dup_anon_vma() recorded before the failed prealloc. */
        if (anon_dup)
                unlink_anon_vmas(anon_dup);

anon_vma_fail:
        /* Reposition the iterator at @addr and reload before reporting failure. */
        vma_iter_set(vmi, addr);
        vma_iter_load(vmi);
        return NULL;
}

/*
 * Rough compatibility check to quickly see if it's even worth looking
 * at sharing an anon_vma.
 *
 * They need to have the same vm_file, and the flags can only differ
 * in things that mprotect may change.
 *
 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
 * we can merge the two vma's. For example, we refuse to merge a vma if
 * there is a vm_ops->close() function, because that indicates that the
 * driver is doing some kind of reference counting. But that doesn't
 * really matter for the anon_vma sharing case.
 */
static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
{
        unsigned long pages_between;

        /* 'b' has to start exactly where 'a' ends. */
        if (a->vm_end != b->vm_start)
                return 0;

        if (!mpol_equal(vma_policy(a), vma_policy(b)))
                return 0;

        if (a->vm_file != b->vm_file)
                return 0;

        /* Flags may differ only in the access bits and VM_SOFTDIRTY. */
        if ((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY))
                return 0;

        /* The page offsets must run on linearly from 'a' into 'b'. */
        pages_between = (b->vm_start - a->vm_start) >> PAGE_SHIFT;
        return b->vm_pgoff == a->vm_pgoff + pages_between;
}

/*
 * Do some basic sanity checking to see if we can re-use the anon_vma
 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
 * the same as 'old', the other will be the new one that is trying
 * to share the anon_vma.
 *
 * NOTE! This runs with mmap_lock held for reading, so it is possible that
 * the anon_vma of 'old' is concurrently in the process of being set up
 * by another page fault trying to merge _that_. But that's ok: if it
 * is being set up, that automatically means that it will be a singleton
 * acceptable for merging, so we can do all of this optimistically. But
 * we do that READ_ONCE() to make sure that we never re-load the pointer.
 *
 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
 * is to return an anon_vma that is "complex" due to having gone through
 * a fork).
 *
 * We also make sure that the two vma's are compatible (adjacent,
 * and with the same memory policies). That's all stable, even with just
 * a read lock on the mmap_lock.
 */
static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
{
        struct anon_vma *candidate;

        if (!anon_vma_compatible(a, b))
                return NULL;

        /* Read the pointer exactly once; it may be set up concurrently. */
        candidate = READ_ONCE(old->anon_vma);
        if (!candidate)
                return NULL;

        /* Only hand back an anon_vma whose chain on 'old' is a singleton. */
        if (!list_is_singular(&old->anon_vma_chain))
                return NULL;

        return candidate;
}

/*
 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
 * neighbouring vmas for a suitable anon_vma, before it goes off
 * to allocate a new anon_vma.  It checks because a repetitive
 * sequence of mprotects and faults may otherwise lead to distinct
 * anon_vmas being allocated, preventing vma merge in subsequent
 * mprotect.
 */
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
        struct vm_area_struct *neighbour;
        struct anon_vma *candidate;
        VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);

        /* Whatever VMA sits at vma->vm_end is tried first. */
        neighbour = vma_iter_load(&vmi);
        if (neighbour) {
                candidate = reusable_anon_vma(neighbour, vma, neighbour);
                if (candidate)
                        return candidate;
        }

        /* Step back onto vma itself, then once more to its predecessor. */
        neighbour = vma_prev(&vmi);
        VM_BUG_ON_VMA(neighbour != vma, vma);
        neighbour = vma_prev(&vmi);

        /*
         * NULL is returned when no reusable anon_vma is found.
         * There's no absolute need to look only at touching neighbours:
         * we could search further afield for "compatible" anon_vmas.
         * But it would probably just be a waste of time searching,
         * or lead to too many vmas hanging off the same anon_vma.
         * We're trying to allow mprotect remerging later on,
         * not trying to minimize memory used for anon_vmas.
         */
        if (!neighbour)
                return NULL;

        return reusable_anon_vma(neighbour, neighbour, vma);
}

/*
 * If a non-zero hint addr is less than mmap_min_addr, change the hint to be
 * as low as possible while still not below mmap_min_addr.
 */
static inline unsigned long round_hint_to_min(unsigned long hint)
{
        const unsigned long page_hint = hint & PAGE_MASK;

        /* A zero hint, or one already at/above the minimum, is kept as is. */
        if (!page_hint || page_hint >= mmap_min_addr)
                return page_hint;

        return PAGE_ALIGN(mmap_min_addr);
}

/*
 * Check whether accounting @bytes more as locked would keep @mm within
 * RLIMIT_MEMLOCK. Always true when VM_LOCKED is not set in @flags or the
 * caller has CAP_IPC_LOCK.
 */
bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
                        unsigned long bytes)
{
        unsigned long wanted_pages, allowed_pages;

        if (!(flags & VM_LOCKED))
                return true;
        if (capable(CAP_IPC_LOCK))
                return true;

        /* Pages already locked plus the pages this request would add. */
        wanted_pages = (bytes >> PAGE_SHIFT) + mm->locked_vm;
        allowed_pages = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        return wanted_pages <= allowed_pages;
}

/*
 * Largest byte size a mapping of @file may describe. Regular files, block
 * devices and sockets are capped at MAX_LFS_FILESIZE; 0 is returned for files
 * with FMODE_UNSIGNED_OFFSET; anything else gets ULONG_MAX.
 */
static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
{
        if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode) ||
            S_ISSOCK(inode->i_mode))
                return MAX_LFS_FILESIZE;

        /* Special "we do even unsigned file positions" case */
        if (file->f_mode & FMODE_UNSIGNED_OFFSET)
                return 0;

        /* Yes, random drivers might want more. But I'm tired of buggy drivers */
        return ULONG_MAX;
}

/*
 * Check that a mapping of @len bytes at page offset @pgoff stays within the
 * limit reported by file_mmap_size_max(). A limit of 0 skips the length test.
 */
static inline bool file_mmap_ok(struct file *file, struct inode *inode,
                                unsigned long pgoff, unsigned long len)
{
        const u64 limit = file_mmap_size_max(file, inode);
        u64 headroom;

        if (limit != 0 && len > limit)
                return false;

        /* With a zero limit this u64 subtraction wraps, as in the original. */
        headroom = limit - len;

        return pgoff <= (headroom >> PAGE_SHIFT);
}

/*
 * The caller must write-lock current->mm->mmap_lock.
 */
unsigned long do_mmap(struct file *file, unsigned long addr,
                        unsigned long len, unsigned long prot,
                        unsigned long flags, vm_flags_t vm_flags,
                        unsigned long pgoff, unsigned long *populate,
                        struct list_head *uf)
{
        struct mm_struct *mm = current->mm;
        int pkey = 0;

        /* Default: nothing to populate; only set to len on the final path. */
        *populate = 0;

        if (!len)
                return -EINVAL;

        /*
         * Does the application expect PROT_READ to imply PROT_EXEC?
         *
         * (the exception is when the underlying filesystem is noexec
         *  mounted, in which case we don't add PROT_EXEC.)
         */
        if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
                if (!(file && path_noexec(&file->f_path)))
                        prot |= PROT_EXEC;

        /* force arch specific MAP_FIXED handling in get_unmapped_area */
        if (flags & MAP_FIXED_NOREPLACE)
                flags |= MAP_FIXED;

        /* Without MAP_FIXED the address is only a hint and may be adjusted. */
        if (!(flags & MAP_FIXED))
                addr = round_hint_to_min(addr);

        /* Careful about overflows.. */
        len = PAGE_ALIGN(len);
        if (!len)
                return -ENOMEM;

        /* offset overflow? */
        if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
                return -EOVERFLOW;

        /* Too many mappings? */
        if (mm->map_count > sysctl_max_map_count)
                return -ENOMEM;

        /*
         * addr is returned from get_unmapped_area,
         * There are two cases:
         * 1> MAP_FIXED == false
         *        unallocated memory, no need to check sealing.
         * 2> MAP_FIXED == true
         *        sealing is checked inside mmap_region when
         *        do_vmi_munmap is called.
         */

        /* Only an exactly-PROT_EXEC request asks for an execute-only pkey. */
        if (prot == PROT_EXEC) {
                pkey = execute_only_pkey(mm);
                if (pkey < 0)
                        pkey = 0;
        }

        /* Do simple checking here so the lower-level routines won't have
         * to. we assume access permissions have been handled by the open
         * of the memory object, so we don't do any here.
         */
        vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
                        mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

        /* Obtain the address to map to. we verify (or select) it and ensure
         * that it represents a valid section of the address space.
         */
        addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags);
        if (IS_ERR_VALUE(addr))
                return addr;

        /* MAP_FIXED_NOREPLACE: refuse if anything already occupies the range. */
        if (flags & MAP_FIXED_NOREPLACE) {
                if (find_vma_intersection(mm, addr, addr + len))
                        return -EEXIST;
        }

        if (flags & MAP_LOCKED)
                if (!can_do_mlock())
                        return -EPERM;

        if (!mlock_future_ok(mm, vm_flags, len))
                return -EAGAIN;

        if (file) {
                struct inode *inode = file_inode(file);
                /* Flags accepted by MAP_SHARED_VALIDATE for this file. */
                unsigned long flags_mask;

                if (!file_mmap_ok(file, inode, pgoff, len))
                        return -EOVERFLOW;

                flags_mask = LEGACY_MAP_MASK;
                if (file->f_op->fop_flags & FOP_MMAP_SYNC)
                        flags_mask |= MAP_SYNC;

                /* Note the deliberate fallthroughs: SHARED -> VALIDATE -> PRIVATE. */
                switch (flags & MAP_TYPE) {
                case MAP_SHARED:
                        /*
                         * Force use of MAP_SHARED_VALIDATE with non-legacy
                         * flags. E.g. MAP_SYNC is dangerous to use with
                         * MAP_SHARED as you don't know which consistency model
                         * you will get. We silently ignore unsupported flags
                         * with MAP_SHARED to preserve backward compatibility.
                         */
                        flags &= LEGACY_MAP_MASK;
                        fallthrough;
                case MAP_SHARED_VALIDATE:
                        if (flags & ~flags_mask)
                                return -EOPNOTSUPP;
                        if (prot & PROT_WRITE) {
                                if (!(file->f_mode & FMODE_WRITE))
                                        return -EACCES;
                                if (IS_SWAPFILE(file->f_mapping->host))
                                        return -ETXTBSY;
                        }

                        /*
                         * Make sure we don't allow writing to an append-only
                         * file..
                         */
                        if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
                                return -EACCES;

                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        /* File not opened for writing: drop VM_MAYWRITE and VM_SHARED. */
                        if (!(file->f_mode & FMODE_WRITE))
                                vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
                        fallthrough;
                case MAP_PRIVATE:
                        if (!(file->f_mode & FMODE_READ))
                                return -EACCES;
                        /* noexec mount: reject VM_EXEC and forbid gaining it later. */
                        if (path_noexec(&file->f_path)) {
                                if (vm_flags & VM_EXEC)
                                        return -EPERM;
                                vm_flags &= ~VM_MAYEXEC;
                        }

                        if (!file->f_op->mmap)
                                return -ENODEV;
                        if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                                return -EINVAL;
                        break;

                default:
                        return -EINVAL;
                }
        } else {
                switch (flags & MAP_TYPE) {
                case MAP_SHARED:
                        if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                                return -EINVAL;
                        /*
                         * Ignore pgoff.
                         */
                        pgoff = 0;
                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        break;
                case MAP_PRIVATE:
                        /*
                         * Set pgoff according to addr for anon_vma.
                         */
                        pgoff = addr >> PAGE_SHIFT;
                        break;
                default:
                        return -EINVAL;
                }
        }

        /*
         * Set 'VM_NORESERVE' if we should not account for the
         * memory use of this mapping.
         */
        if (flags & MAP_NORESERVE) {
                /* We honor MAP_NORESERVE if allowed to overcommit */
                if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
                        vm_flags |= VM_NORESERVE;

                /* hugetlb applies strict overcommit unless MAP_NORESERVE */
                if (file && is_file_hugepages(file))
                        vm_flags |= VM_NORESERVE;
        }

        addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
        /*
         * On success, report len via *populate when the mapping is VM_LOCKED
         * or MAP_POPULATE was requested without MAP_NONBLOCK.
         */
        if (!IS_ERR_VALUE(addr) &&
            ((vm_flags & VM_LOCKED) ||
             (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
                *populate = len;
        return addr;
}

unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
                              unsigned long prot, unsigned long flags,
                              unsigned long fd, unsigned long pgoff)
{
        struct file *file = NULL;
        unsigned long retval;

        if (!(flags & MAP_ANONYMOUS)) {
                audit_mmap_fd(fd, flags);
                file = fget(fd);
                if (!file)
                        return -EBADF;
                if (is_file_hugepages(file)) {
                        len = ALIGN(len, huge_page_size(hstate_file(file)));
                } else if (unlikely(flags & MAP_HUGETLB)) {
                        retval = -EINVAL;
                        goto out_fput;
                }
        } else if (flags & MAP_HUGETLB) {
                struct hstate *hs;

                hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
                if (!hs)
                        return -EINVAL;

                len = ALIGN(len, huge_page_size(hs));
                /*
                 * VM_NORESERVE is used because the reservations will be
                 * taken when vm_ops->mmap() is called
                 */
                file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
                                VM_NORESERVE,
                                HUGETLB_ANONHUGE_INODE,
                                (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
                if (IS_ERR(file))
                        return PTR_ERR(file);
        }

        retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
out_fput:
        if (file)
                fput(file);
        return retval;
}

/*
 * mmap_pgoff syscall entry point: forwards all six arguments unchanged to
 * ksys_mmap_pgoff() and returns its result.
 */
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                unsigned long, prot, unsigned long, flags,
                unsigned long, fd, unsigned long, pgoff)
{
        return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
}

#ifdef __ARCH_WANT_SYS_OLD_MMAP
/* Argument block that old_mmap copies in from user space in one piece. */
struct mmap_arg_struct {
        unsigned long addr;
        unsigned long len;
        unsigned long prot;
        unsigned long flags;
        unsigned long fd;
        unsigned long offset;        /* byte offset; must be page aligned */
};

/*
 * old_mmap syscall: takes its arguments through a user-space struct and a
 * byte offset, which is converted to a page offset for ksys_mmap_pgoff().
 */
SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
{
        struct mmap_arg_struct args;

        if (copy_from_user(&args, arg, sizeof(args)) != 0)
                return -EFAULT;

        /* A byte offset that is not a multiple of the page size is refused. */
        if (offset_in_page(args.offset) != 0)
                return -EINVAL;

        return ksys_mmap_pgoff(args.addr, args.len, args.prot, args.flags,
                               args.fd, args.offset >> PAGE_SHIFT);
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */

static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
{
        return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
}

static bool vma_is_shared_writable(struct vm_area_struct *vma)
{
        return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
                (VM_WRITE | VM_SHARED);
}

static bool vma_fs_can_writeback(struct vm_area_struct *vma)
{
        /* No managed pages to writeback. */
        if (vma->vm_flags & VM_PFNMAP)
                return false;

        return vma->vm_file && vma->vm_file->f_mapping &&
                mapping_can_writeback(vma->vm_file->f_mapping);
}

/*
 * Does this VMA require the underlying folios to have their dirty state
 * tracked?
 */
bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
{
        /*
         * Only shared, writable VMAs require dirty tracking. For those, it is
         * needed either because the filesystem asks to be notified of writes,
         * or -- even without such a request -- because it can writeback.
         */
        return vma_is_shared_writable(vma) &&
               (vm_ops_needs_writenotify(vma->vm_ops) ||
                vma_fs_can_writeback(vma));
}

/*
 * Some shared mappings will want the pages marked read-only
 * to track write events. If so, we'll downgrade vm_page_prot
 * to the private version (using protection_map[] without the
 * VM_SHARED bit).
 *
 * Returns true when write faults must be taken on @vma, false otherwise.
 */
bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
{
        pgprot_t canonical;

        /* Private or non-writable mappings already have the write bit clear. */
        if (!vma_is_shared_writable(vma))
                return false;

        /* The backer asked to be told when pages are first written to. */
        if (vm_ops_needs_writenotify(vma->vm_ops))
                return true;

        /*
         * If the open routine set up protections that pgprot_modify would
         * not preserve, leave them alone.
         */
        canonical = vm_pgprot_modify(vm_page_prot, vma->vm_flags);
        if (pgprot_val(vm_page_prot) != pgprot_val(canonical))
                return false;

        /* Soft-dirty tracking needs write faults; hugetlb does not support it yet. */
        if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
                return true;

        /*
         * Otherwise write faults are wanted either for uffd-wp tracking or
         * because the mapping can track dirty pages.
         */
        return userfaultfd_wp(vma) || vma_fs_can_writeback(vma);
}

/*
 * Memory is accounted for a private writeable mapping that is not backed by
 * hugepages and for which VM_NORESERVE was not requested.
 */
static inline bool accountable_mapping(struct file *file, vm_flags_t vm_flags)
{
        const vm_flags_t relevant = VM_NORESERVE | VM_SHARED | VM_WRITE;

        /*
         * hugetlb does its own accounting, separate from the core VM.
         * VM_HUGETLB may not be set yet, so the file is inspected instead of
         * the flag.
         */
        if (file && is_file_hugepages(file))
                return false;

        /* Of the three relevant bits, exactly VM_WRITE must be set. */
        return (vm_flags & relevant) == VM_WRITE;
}

/**
 * unmapped_area() - Find an area between the low_limit and the high_limit with
 * the correct alignment and offset, all from @info. Note: current->mm is used
 * for the search.
 *
 * @info: The unmapped area information including the range [low_limit -
 * high_limit), the alignment offset and mask.
 *
 * This is the bottom-up variant: candidates come from vma_iter_area_lowest(),
 * and the search is restarted with a raised low_limit whenever a neighbouring
 * VMA's guard gap rules the candidate out.
 *
 * Return: A memory address or -ENOMEM.
 */
static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
        unsigned long length, gap;
        unsigned long low_limit, high_limit;
        struct vm_area_struct *tmp;
        VMA_ITERATOR(vmi, current->mm, 0);

        /* Adjust search length to account for worst case alignment overhead */
        length = info->length + info->align_mask + info->start_gap;
        /* The sum wrapped around, so the request cannot be satisfied. */
        if (length < info->length)
                return -ENOMEM;

        /* Never start the search below mmap_min_addr. */
        low_limit = info->low_limit;
        if (low_limit < mmap_min_addr)
                low_limit = mmap_min_addr;
        high_limit = info->high_limit;
retry:
        if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
                return -ENOMEM;

        /*
         * Adjust for the gap first so it doesn't interfere with the
         * later alignment. The first step is the minimum needed to
         * fulfill the start gap, the next step is the minimum to align
         * that. It is the minimum needed to fulfill both.
         */
        gap = vma_iter_addr(&vmi) + info->start_gap;
        gap += (info->align_offset - gap) & info->align_mask;
        tmp = vma_next(&vmi);
        if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
                /*
                 * The following VMA's start gap begins too low for this
                 * candidate: resume the search above that VMA.
                 */
                if (vm_start_gap(tmp) < gap + length - 1) {
                        low_limit = tmp->vm_end;
                        vma_iter_reset(&vmi);
                        goto retry;
                }
        } else {
                /*
                 * The preceding VMA's end gap reaches past the candidate
                 * address: resume the search from the end of that gap.
                 */
                tmp = vma_prev(&vmi);
                if (tmp && vm_end_gap(tmp) > gap) {
                        low_limit = vm_end_gap(tmp);
                        vma_iter_reset(&vmi);
                        goto retry;
                }
        }

        return gap;
}

/**
 * unmapped_area_topdown() - Find an area between the low_limit and the
 * high_limit with the correct alignment and offset at the highest available
 * address, all from @info. Note: current->mm is used for the search.
 *
 * @info: The unmapped area information including the range [low_limit -
 * high_limit), the alignment offset and mask.
 *
 * Candidates come from vma_iter_area_highest(); the search is restarted with
 * a lowered high_limit whenever a neighbouring VMA's guard gap rules the
 * candidate out.
 *
 * Return: A memory address or -ENOMEM.
 */
static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
{
        unsigned long length, gap, gap_end;
        unsigned long low_limit, high_limit;
        struct vm_area_struct *tmp;
        VMA_ITERATOR(vmi, current->mm, 0);

        /* Adjust search length to account for worst case alignment overhead */
        length = info->length + info->align_mask + info->start_gap;
        /* The sum wrapped around, so the request cannot be satisfied. */
        if (length < info->length)
                return -ENOMEM;

        /* Never search below mmap_min_addr. */
        low_limit = info->low_limit;
        if (low_limit < mmap_min_addr)
                low_limit = mmap_min_addr;
        high_limit = info->high_limit;
retry:
        if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
                return -ENOMEM;

        /*
         * Place the mapping so that it ends at the top of the found area,
         * then move it down just far enough to satisfy the alignment.
         */
        gap = vma_iter_end(&vmi) - info->length;
        gap -= (gap - info->align_offset) & info->align_mask;
        gap_end = vma_iter_end(&vmi);
        tmp = vma_next(&vmi);
        if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
                /*
                 * The following VMA's start gap begins below the end of the
                 * found area: retry underneath that gap.
                 */
                if (vm_start_gap(tmp) < gap_end) {
                        high_limit = vm_start_gap(tmp);
                        vma_iter_reset(&vmi);
                        goto retry;
                }
        } else {
                /*
                 * The preceding VMA's end gap reaches past the candidate
                 * address: retry below the start of that VMA.
                 */
                tmp = vma_prev(&vmi);
                if (tmp && vm_end_gap(tmp) > gap) {
                        high_limit = tmp->vm_start;
                        vma_iter_reset(&vmi);
                        goto retry;
                }
        }

        return gap;
}

/*
 * Search for an unmapped address range.
 *
 * We are looking for a range that:
 * - does not intersect with any VMA;
 * - is contained within the [low_limit, high_limit) interval;
 * - is at least the desired size.
 * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
 */
unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
{
        unsigned long addr;

        if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
                addr = unmapped_area_topdown(info);
        else
                addr = unmapped_area(info);

        trace_vm_unmapped_area(addr, info);
        return addr;
}

/* Get an address range which is currently unmapped.
 * For shmat() with addr=0.
 *
 * Ugly calling convention alert:
 * Return value with the low bits set means error value,
 * ie
 *        if (ret & ~PAGE_MASK)
 *                error = ret;
 *
 * This function "knows" that -ENOMEM has the bits set.
 */
unsigned long
generic_get_unmapped_area(struct file *filp, unsigned long addr,
                          unsigned long len, unsigned long pgoff,
                          unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
        struct vm_unmapped_area_info info = {};
        const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);

        if (len > mmap_end - mmap_min_addr)
                return -ENOMEM;

        if (flags & MAP_FIXED)
                return addr;

        if (addr) {
                addr = PAGE_ALIGN(addr);
                vma = find_vma_prev(mm, addr, &prev);
                if (mmap_end - len >= addr && addr >= mmap_min_addr &&
                    (!vma || addr + len <= vm_start_gap(vma)) &&
                    (!prev || addr >= vm_end_gap(prev)))
                        return addr;
        }

        info.length = len;
        info.low_limit = mm->mmap_base;
        info.high_limit = mmap_end;
        return vm_unmapped_area(&info);
}

#ifndef HAVE_ARCH_UNMAPPED_AREA
/*
 * Fallback arch_get_unmapped_area(), compiled only when
 * HAVE_ARCH_UNMAPPED_AREA is not defined: it forwards every argument
 * unchanged to generic_get_unmapped_area() and returns its result.
 */
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
                       unsigned long len, unsigned long pgoff,
                       unsigned long flags)
{
        return generic_get_unmapped_area(filp, addr, len, pgoff, flags);
}
#endif

/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 */
unsigned long
generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                                  unsigned long len, unsigned long pgoff,
                                  unsigned long flags)
{
        struct vm_area_struct *vma, *prev;
        struct mm_struct *mm = current->mm;
        struct vm_unmapped_area_info info = {};
        const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);

        /* requested length too big for entire address space */
        if (len > mmap_end - mmap_min_addr)
                return -ENOMEM;

        if (flags & MAP_FIXED)
                return addr;

        /* requesting a specific address */
        if (addr) {
                addr = PAGE_ALIGN(addr);
                vma = find_vma_prev(mm, addr, &prev);
                if (mmap_end - len >= addr && addr >= mmap_min_addr &&
                                (!vma || addr + len <= vm_start_gap(vma)) &&
                                (!prev || addr >= vm_end_gap(prev)))
                        return addr;
        }

        info.flags = VM_UNMAPPED_AREA_TOPDOWN;
        info.length = len;
        info.low_limit = PAGE_SIZE;
        info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
        addr = vm_unmapped_area(&info);

        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
        if (offset_in_page(addr)) {
                VM_BUG_ON(addr != -ENOMEM);
                info.flags = 0;
                info.low_limit = TASK_UNMAPPED_BASE;
                info.high_limit = mmap_end;
                addr = vm_unmapped_area(&info);
        }

        return addr;
}

#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
/*
 * Fallback arch_get_unmapped_area_topdown(), compiled only when
 * HAVE_ARCH_UNMAPPED_AREA_TOPDOWN is not defined: it forwards every argument
 * unchanged to generic_get_unmapped_area_topdown() and returns its result.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                               unsigned long len, unsigned long pgoff,
                               unsigned long flags)
{
        return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags);
}
#endif

#ifndef HAVE_ARCH_UNMAPPED_AREA_VMFLAGS
/*
 * Fallback used when HAVE_ARCH_UNMAPPED_AREA_VMFLAGS is not defined.
 * @vm_flags is accepted but not used: the call is forwarded to
 * arch_get_unmapped_area() with the remaining arguments.
 */
unsigned long
arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len,
                               unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
        return arch_get_unmapped_area(filp, addr, len, pgoff, flags);
}

/*
 * Top-down counterpart of the fallback above, also compiled only when
 * HAVE_ARCH_UNMAPPED_AREA_VMFLAGS is not defined. @vm_flags is accepted but
 * not used: the call is forwarded to arch_get_unmapped_area_topdown().
 */
unsigned long
arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr,
                                       unsigned long len, unsigned long pgoff,
                                       unsigned long flags, vm_flags_t vm_flags)
{
        return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags);
}
#endif

/*
 * Dispatch to the top-down or bottom-up vm_flags-aware arch allocator,
 * depending on whether MMF_TOPDOWN is set in @mm->flags.
 */
unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp,
                                           unsigned long addr, unsigned long len,
                                           unsigned long pgoff, unsigned long flags,
                                           vm_flags_t vm_flags)
{
        const bool topdown = test_bit(MMF_TOPDOWN, &mm->flags);

        if (!topdown)
                return arch_get_unmapped_area_vmflags(filp, addr, len, pgoff,
                                                      flags, vm_flags);

        return arch_get_unmapped_area_topdown_vmflags(filp, addr, len, pgoff,
                                                      flags, vm_flags);
}

/*
 * Pick an address for a new mapping of @len bytes.
 *
 * The search is delegated, in order of preference, to the file's own
 * ->get_unmapped_area(), to shmem's for anonymous MAP_SHARED mappings, to the
 * THP-aware helper when CONFIG_TRANSPARENT_HUGEPAGE is enabled, and finally
 * to the mm's default allocator. The chosen address is then checked against
 * TASK_SIZE, page alignment and security_mmap_addr().
 *
 * Returns the address on success or an error value.
 */
unsigned long
__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
        unsigned long (*get_area)(struct file *, unsigned long,
                                  unsigned long, unsigned long, unsigned long)
                                  = NULL;
        unsigned long error;

        error = arch_mmap_check(addr, len, flags);
        if (error)
                return error;

        /* Careful about overflows.. */
        if (len > TASK_SIZE)
                return -ENOMEM;

        if (file) {
                /* May legitimately be NULL, in which case the defaults apply. */
                get_area = file->f_op->get_unmapped_area;
        } else {
                /* Always treat pgoff as zero for anonymous memory. */
                pgoff = 0;

                /*
                 * mmap_region() will call shmem_zero_setup() to create a file,
                 * so use shmem's get_unmapped_area in case it can be huge.
                 */
                if (flags & MAP_SHARED)
                        get_area = shmem_get_unmapped_area;
        }

        if (get_area)
                addr = get_area(file, addr, len, pgoff, flags);
        else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                /* Ensures that larger anonymous mappings are THP aligned. */
                addr = thp_get_unmapped_area_vmflags(file, addr, len,
                                                     pgoff, flags, vm_flags);
        else
                addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len,
                                                    pgoff, flags, vm_flags);

        if (IS_ERR_VALUE(addr))
                return addr;

        /* The mapping must end within TASK_SIZE and start on a page boundary. */
        if (addr > TASK_SIZE - len)
                return -ENOMEM;
        if (offset_in_page(addr))
                return -EINVAL;

        error = security_mmap_addr(addr);
        if (error)
                return error;

        return addr;
}

unsigned long
mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
                     unsigned long addr, unsigned long len,
                     unsigned long pgoff, unsigned long flags)
{
        if (test_bit(MMF_TOPDOWN, &mm->flags))
                return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags);
        return arch_get_unmapped_area(file, addr, len, pgoff, flags);
}
EXPORT_SYMBOL(mm_get_unmapped_area);

/**
 * find_vma_intersection() - Look up the first VMA which intersects the interval
 * @mm: The process address space.
 * @start_addr: The inclusive start user address.
 * @end_addr: The exclusive end user address.
 *
 * The caller must hold the mmap lock (asserted via mmap_assert_locked()).
 *
 * Returns: The first VMA within the provided range, %NULL otherwise.  Assumes
 * start_addr < end_addr.
 */
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
                                             unsigned long start_addr,
                                             unsigned long end_addr)
{
        /* mt_find() receives the last address to consider, hence the - 1. */
        const unsigned long last = end_addr - 1;
        /* The start index is passed by pointer, so use a local copy. */
        unsigned long cursor = start_addr;

        mmap_assert_locked(mm);

        return mt_find(&mm->mm_mt, &cursor, last);
}
EXPORT_SYMBOL(find_vma_intersection);

/**
 * find_vma() - Find the VMA for a given address, or the next VMA.
 * @mm: The mm_struct to check
 * @addr: The address
 *
 * The caller must hold the mmap lock (asserted via mmap_assert_locked()).
 *
 * Returns: The VMA associated with addr, or the next VMA.
 * May return %NULL in the case of no VMA at addr or above.
 */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
        /* The start index is passed by pointer, so use a local copy. */
        unsigned long cursor = addr;

        mmap_assert_locked(mm);

        /* Search from @addr all the way to the top of the address range. */
        return mt_find(&mm->mm_mt, &cursor, ULONG_MAX);
}
EXPORT_SYMBOL(find_vma);

/**
 * find_vma_prev() - Find the VMA for a given address, or the next vma and
 * set %pprev to the previous VMA, if any.
 * @mm: The mm_struct to check
 * @addr: The address
 * @pprev: The pointer to set to the previous VMA
 *
 * Note that RCU lock is missing here since the external mmap_lock() is used
 * instead.
 *
 * Returns: The VMA associated with @addr, or the next vma.
 * May return %NULL in the case of no vma at addr or above.
 */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
                        struct vm_area_struct **pprev)
{
        struct vm_area_struct *vma;
        VMA_ITERATOR(vmi, mm, addr);

        vma = vma_iter_load(&vmi);
        *pprev = vma_prev(&vmi);
        if (!vma)
                vma = vma_next(&vmi);
        return vma;
}

/*
 * Verify that the stack growth is acceptable and update accounting.
 * Shared by the grow-up and grow-down cases.
 *
 * @size is the size of the VMA after growing, @grow the number of pages added.
 * Returns 0 if the growth is allowed, -ENOMEM if a limit or the overcommit
 * check refuses it, -EFAULT if the grown range would be hugepage-only.
 */
static int acct_stack_growth(struct vm_area_struct *vma,
                             unsigned long size, unsigned long grow)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long base;

        /* Address space limit, then stack rlimit, then mlock limit. */
        if (!may_expand_vm(mm, vma->vm_flags, grow) ||
            size > rlimit(RLIMIT_STACK) ||
            !mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
                return -ENOMEM;

        /* Check to ensure the stack will not grow into a hugetlb-only region */
        if (vma->vm_flags & VM_GROWSUP)
                base = vma->vm_start;
        else
                base = vma->vm_end - size;
        if (is_hugepage_only_range(vma->vm_mm, base, size))
                return -EFAULT;

        /*
         * Overcommit..  This must be the final test, as it will
         * update security statistics.
         */
        return security_vm_enough_memory_mm(mm, grow) ? -ENOMEM : 0;
}

#if defined(CONFIG_STACK_GROWSUP)
/*
 * PA-RISC uses this for its stack.
 * vma is the last one with address > vma->vm_end.  Have to extend vma.
 *
 * Grows the VM_GROWSUP @vma so that it ends one page above the page that
 * contains @address, provided the stack guard gap and the accounting checks
 * allow it.
 *
 * Returns 0 on success or when @vma already reaches far enough, -EFAULT if
 * @vma is not VM_GROWSUP, -ENOMEM when the address is out of range, the guard
 * gap is violated or an allocation fails, or otherwise the error reported by
 * acct_stack_growth().
 */
static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *next;
        unsigned long gap_addr;
        int error = 0;
        VMA_ITERATOR(vmi, mm, vma->vm_start);

        if (!(vma->vm_flags & VM_GROWSUP))
                return -EFAULT;

        /* Guard against exceeding limits of the address space. */
        address &= PAGE_MASK;
        if (address >= (TASK_SIZE & PAGE_MASK))
                return -ENOMEM;
        /* From here on, address is the new (exclusive) end of the VMA. */
        address += PAGE_SIZE;

        /* Enforce stack_guard_gap */
        gap_addr = address + stack_guard_gap;

        /* Guard against overflow */
        if (gap_addr < address || gap_addr > TASK_SIZE)
                gap_addr = TASK_SIZE;

        /*
         * Any accessible mapping between the current end and the end of the
         * guard gap blocks the growth unless it is itself VM_GROWSUP.
         */
        next = find_vma_intersection(mm, vma->vm_end, gap_addr);
        if (next && vma_is_accessible(next)) {
                if (!(next->vm_flags & VM_GROWSUP))
                        return -ENOMEM;
                /* Check that both stack segments have the same anon_vma? */
        }

        if (next)
                vma_iter_prev_range_limit(&vmi, address);

        /* Preallocate for storing the VMA over [vm_start, address). */
        vma_iter_config(&vmi, vma->vm_start, address);
        if (vma_iter_prealloc(&vmi, vma))
                return -ENOMEM;

        /* We must make sure the anon_vma is allocated. */
        if (unlikely(anon_vma_prepare(vma))) {
                vma_iter_free(&vmi);
                return -ENOMEM;
        }

        /* Lock the VMA before expanding to prevent concurrent page faults */
        vma_start_write(vma);
        /*
         * vma->vm_start/vm_end cannot change under us because the caller
         * is required to hold the mmap_lock in read mode.  We need the
         * anon_vma lock to serialize against concurrent expand_stacks.
         */
        anon_vma_lock_write(vma->anon_vma);

        /* Somebody else might have raced and expanded it already */
        if (address > vma->vm_end) {
                unsigned long size, grow;

                /* size: VMA size after growing; grow: pages being added. */
                size = address - vma->vm_start;
                grow = (address - vma->vm_end) >> PAGE_SHIFT;

                error = -ENOMEM;
                /* Refuse growth that would wrap the page offset. */
                if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
                        error = acct_stack_growth(vma, size, grow);
                        if (!error) {
                                /*
                                 * We only hold a shared mmap_lock lock here, so
                                 * we need to protect against concurrent vma
                                 * expansions.  anon_vma_lock_write() doesn't
                                 * help here, as we don't guarantee that all
                                 * growable vmas in a mm share the same root
                                 * anon vma.  So, we reuse mm->page_table_lock
                                 * to guard against concurrent vma expansions.
                                 */
                                spin_lock(&mm->page_table_lock);
                                if (vma->vm_flags & VM_LOCKED)
                                        mm->locked_vm += grow;
                                vm_stat_account(mm, vma->vm_flags, grow);
                                anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_end = address;
                                /* Overwrite old entry in mtree. */
                                vma_iter_store(&vmi, vma);
                                anon_vma_interval_tree_post_update_vma(vma);
                                spin_unlock(&mm->page_table_lock);

                                perf_event_mmap(vma);
                        }
                }
        }
        anon_vma_unlock_write(vma->anon_vma);
        /* Release whatever remains of the preallocation. */
        vma_iter_free(&vmi);
        validate_mm(mm);
        return error;
}
#endif /* CONFIG_STACK_GROWSUP */

/*
 * vma is the first one with address < vma->vm_start.  Have to extend vma.
 * mmap_lock held for writing.
 *
 * Grows the VM_GROWSDOWN @vma so that it starts at the page containing
 * @address, provided the stack guard gap and the accounting checks allow it.
 *
 * Returns 0 on success or when @vma already reaches far enough, -EFAULT if
 * @vma is not VM_GROWSDOWN, -EPERM if the address lies below mmap_min_addr or
 * FIRST_USER_ADDRESS, -ENOMEM when the guard gap is violated or an allocation
 * fails, or otherwise the error reported by acct_stack_growth().
 */
int expand_downwards(struct vm_area_struct *vma, unsigned long address)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *prev;
        int error = 0;
        VMA_ITERATOR(vmi, mm, vma->vm_start);

        if (!(vma->vm_flags & VM_GROWSDOWN))
                return -EFAULT;

        /* From here on, address is the new (page aligned) start of the VMA. */
        address &= PAGE_MASK;
        if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
                return -EPERM;

        /* Enforce stack_guard_gap */
        prev = vma_prev(&vmi);
        /* Check that both stack segments have the same anon_vma? */
        if (prev) {
                /*
                 * An accessible, non-growsdown mapping closer than
                 * stack_guard_gap below the new start blocks the growth.
                 */
                if (!(prev->vm_flags & VM_GROWSDOWN) &&
                    vma_is_accessible(prev) &&
                    (address - prev->vm_end < stack_guard_gap))
                        return -ENOMEM;
        }

        if (prev)
                vma_iter_next_range_limit(&vmi, vma->vm_start);

        /* Preallocate for storing the VMA over [address, vm_end). */
        vma_iter_config(&vmi, address, vma->vm_end);
        if (vma_iter_prealloc(&vmi, vma))
                return -ENOMEM;

        /* We must make sure the anon_vma is allocated. */
        if (unlikely(anon_vma_prepare(vma))) {
                vma_iter_free(&vmi);
                return -ENOMEM;
        }

        /* Lock the VMA before expanding to prevent concurrent page faults */
        vma_start_write(vma);
        /*
         * vma->vm_start/vm_end cannot change under us because the caller
         * is required to hold the mmap_lock in read mode.  We need the
         * anon_vma lock to serialize against concurrent expand_stacks.
         *
         * NOTE(review): the function header states that mmap_lock is held
         * for writing, while this comment speaks of read mode - confirm
         * which one reflects the current callers.
         */
        anon_vma_lock_write(vma->anon_vma);

        /* Somebody else might have raced and expanded it already */
        if (address < vma->vm_start) {
                unsigned long size, grow;

                /* size: VMA size after growing; grow: pages being added. */
                size = vma->vm_end - address;
                grow = (vma->vm_start - address) >> PAGE_SHIFT;

                error = -ENOMEM;
                /* vm_pgoff is lowered by grow below, so it must not underflow. */
                if (grow <= vma->vm_pgoff) {
                        error = acct_stack_growth(vma, size, grow);
                        if (!error) {
                                /*
                                 * We only hold a shared mmap_lock lock here, so
                                 * we need to protect against concurrent vma
                                 * expansions.  anon_vma_lock_write() doesn't
                                 * help here, as we don't guarantee that all
                                 * growable vmas in a mm share the same root
                                 * anon vma.  So, we reuse mm->page_table_lock
                                 * to guard against concurrent vma expansions.
                                 */
                                spin_lock(&mm->page_table_lock);
                                if (vma->vm_flags & VM_LOCKED)
                                        mm->locked_vm += grow;
                                vm_stat_account(mm, vma->vm_flags, grow);
                                anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_start = address;
                                vma->vm_pgoff -= grow;
                                /* Overwrite old entry in mtree. */
                                vma_iter_store(&vmi, vma);
                                anon_vma_interval_tree_post_update_vma(vma);
                                spin_unlock(&mm->page_table_lock);

                                perf_event_mmap(vma);
                        }
                }
        }
        anon_vma_unlock_write(vma->anon_vma);
        /* Release whatever remains of the preallocation. */
        vma_iter_free(&vmi);
        validate_mm(mm);
        return error;
}

/*
 * Enforced gap, in bytes, between the expanding stack and other mappings.
 * Defaults to 256 pages; the "stack_guard_gap=" boot parameter (given in
 * pages) overrides it.
 */
unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;

/*
 * Handler for the "stack_guard_gap=" boot parameter. The value is a decimal
 * number of pages; it is applied only when the whole string parses as a
 * number, otherwise the current setting is kept. Always returns 1.
 */
static int __init cmdline_parse_stack_guard_gap(char *p)
{
        char *end;
        unsigned long pages = simple_strtoul(p, &end, 10);

        /* Trailing characters mean the value is malformed: ignore it. */
        if (*end == '\0')
                stack_guard_gap = pages << PAGE_SHIFT;

        return 1;
}
__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);

#ifdef CONFIG_STACK_GROWSUP
/*
 * CONFIG_STACK_GROWSUP variant: stack expansion means growing @vma upwards,
 * so this simply returns the result of expand_upwards().
 */
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
{
        return expand_upwards(vma, address);
}

/*
 * CONFIG_STACK_GROWSUP variant: return the VMA containing the page of @addr,
 * expanding the VMA just below it if nothing maps that page yet. Returns
 * NULL when there is no VMA to expand or the expansion fails. A VM_LOCKED
 * VMA has the newly covered range populated.
 */
struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *next, *stack;

        addr &= PAGE_MASK;
        next = find_vma_prev(mm, addr, &stack);

        /* The address is already mapped: nothing to grow. */
        if (next && addr >= next->vm_start)
                return next;

        /* Only the VMA preceding the address can be grown to reach it. */
        if (!stack || expand_stack_locked(stack, addr))
                return NULL;

        if (stack->vm_flags & VM_LOCKED)
                populate_vma_page_range(stack, addr, stack->vm_end, NULL);

        return stack;
}
#else
/*
 * Variant used without CONFIG_STACK_GROWSUP: stack expansion means growing
 * @vma downwards, so this simply returns the result of expand_downwards().
 */
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
{
        return expand_downwards(vma, address);
}

/*
 * Variant used without CONFIG_STACK_GROWSUP: return the VMA containing the
 * page of @addr, expanding the next VMA above it if nothing maps that page
 * yet. Returns NULL when no VMA lies at or above the address or the
 * expansion fails. A VM_LOCKED VMA has the newly covered range populated.
 */
struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma;
        unsigned long old_start;

        addr &= PAGE_MASK;
        vma = find_vma(mm, addr);

        /* Either nothing was found (NULL) or the address is already mapped. */
        if (!vma || vma->vm_start <= addr)
                return vma;

        /* Remember the old start: the expansion moves vm_start down. */
        old_start = vma->vm_start;
        if (expand_stack_locked(vma, addr))
                return NULL;

        if (vma->vm_flags & VM_LOCKED)
                populate_vma_page_range(vma, addr, old_start, NULL);

        return vma;
}
#endif

/*
 * Direction-specific expansion helpers for expand_stack(). Only the direction
 * matching the stack configuration performs a real expansion; the other one
 * evaluates to -EFAULT without touching its arguments.
 */
#if defined(CONFIG_STACK_GROWSUP)

#define vma_expand_up(vma,addr) expand_upwards(vma, addr)
#define vma_expand_down(vma, addr) (-EFAULT)

#else

#define vma_expand_up(vma,addr) (-EFAULT)
#define vma_expand_down(vma, addr) expand_downwards(vma, addr)

#endif

/*
 * expand_stack(): legacy interface for page faulting. Don't use unless
 * you have to.
 *
 * This is called with the mm locked for reading, drops the lock, takes
 * the lock for writing, tries to look up a vma again, expands it if
 * necessary, and downgrades the lock to reading again.
 *
 * If no vma is found or it can't be expanded, it returns NULL and has
 * dropped the lock.
 */
struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma, *prev;
        struct vm_area_struct *found = NULL;

        mmap_read_unlock(mm);
        if (mmap_write_lock_killable(mm))
                return NULL;

        vma = find_vma_prev(mm, addr, &prev);

        /*
         * In order: the address is already covered, the previous VMA could be
         * grown upwards, or the following VMA could be grown downwards.
         */
        if (vma && vma->vm_start <= addr)
                found = vma;
        else if (prev && !vma_expand_up(prev, addr))
                found = prev;
        else if (vma && !vma_expand_down(vma, addr))
                found = vma;

        if (!found) {
                mmap_write_unlock(mm);
                return NULL;
        }

        mmap_write_downgrade(mm);
        return found;
}

/*
 * Ok - we have the memory areas we should free on a maple tree so release them,
 * and do the vma updates.
 *
 * Called with the mm semaphore held.
 */
static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
{
        struct vm_area_struct *vma;
        unsigned long accounted_pages = 0;

        /* Update high watermark before we lower total_vm */
        update_hiwater_vm(mm);

        mas_for_each(mas, vma, ULONG_MAX) {
                const long pages = vma_pages(vma);

                /* Only VM_ACCOUNT mappings are unaccounted at the end. */
                accounted_pages += (vma->vm_flags & VM_ACCOUNT) ? pages : 0;
                vm_stat_account(mm, vma->vm_flags, -pages);
                remove_vma(vma, false);
        }

        vm_unacct_memory(accounted_pages);
}

/*
 * Get rid of page table information in the indicated region.
 *
 * The VMAs are unmapped first, then the maple state is rewound to where it
 * started so that free_pgtables() walks the same VMAs again.
 *
 * Called with the mm semaphore held.
 */
static void unmap_region(struct mm_struct *mm, struct ma_state *mas,
                struct vm_area_struct *vma, struct vm_area_struct *prev,
                struct vm_area_struct *next, unsigned long start,
                unsigned long end, unsigned long tree_end, bool mm_wr_locked)
{
        const unsigned long saved_index = mas->index;
        unsigned long lower, upper;
        struct mmu_gather tlb;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm);
        update_hiwater_rss(mm);
        unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked);

        mas_set(mas, saved_index);

        /* Bounds for free_pgtables(): the neighbouring VMAs, or the defaults. */
        lower = prev ? prev->vm_end : FIRST_USER_ADDRESS;
        upper = next ? next->vm_start : USER_PGTABLES_CEILING;
        free_pgtables(&tlb, mas, vma, lower, upper, mm_wr_locked);

        tlb_finish_mmu(&tlb);
}

/*
 * __split_vma() bypasses sysctl_max_map_count checking.  We use this where it
 * has already been checked or doesn't make sense to fail.
 * VMA Iterator will point to the end VMA.
 *
 * Splits @vma at @addr, which must lie strictly inside it. A duplicate of
 * @vma is created: with @new_below set it takes the part below @addr and
 * @vma keeps the part above, otherwise the duplicate takes the part from
 * @addr upwards and @vma keeps the part below.
 *
 * Returns 0 on success, the error from ->may_split(), vma_dup_policy() or
 * anon_vma_clone(), or -ENOMEM if an allocation fails.
 */
static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
                       unsigned long addr, int new_below)
{
        struct vma_prepare vp;
        struct vm_area_struct *new;
        int err;

        /* The split point must fall strictly inside the VMA. */
        WARN_ON(vma->vm_start >= addr);
        WARN_ON(vma->vm_end <= addr);

        /* Give the VMA's owner a chance to veto the split. */
        if (vma->vm_ops && vma->vm_ops->may_split) {
                err = vma->vm_ops->may_split(vma, addr);
                if (err)
                        return err;
        }

        new = vm_area_dup(vma);
        if (!new)
                return -ENOMEM;

        /* Trim the duplicate to its half; the upper half needs its pgoff advanced. */
        if (new_below) {
                new->vm_end = addr;
        } else {
                new->vm_start = addr;
                new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
        }

        err = -ENOMEM;
        vma_iter_config(vmi, new->vm_start, new->vm_end);
        if (vma_iter_prealloc(vmi, new))
                goto out_free_vma;

        err = vma_dup_policy(vma, new);
        if (err)
                goto out_free_vmi;

        err = anon_vma_clone(new, vma);
        if (err)
                goto out_free_mpol;

        /* The new VMA holds its own reference on the backing file. */
        if (new->vm_file)
                get_file(new->vm_file);

        if (new->vm_ops && new->vm_ops->open)
                new->vm_ops->open(new);

        vma_start_write(vma);
        vma_start_write(new);

        init_vma_prep(&vp, vma);
        vp.insert = new;
        vma_prepare(&vp);
        vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);

        /* Shrink the original VMA to the half the duplicate does not cover. */
        if (new_below) {
                vma->vm_start = addr;
                vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
        } else {
                vma->vm_end = addr;
        }

        /* vma_complete stores the new vma */
        vma_complete(&vp, vmi, vma->vm_mm);

        /* Success. */
        if (new_below)
                vma_next(vmi);
        return 0;

        /* Error unwinding: undo the setup steps in reverse order. */
out_free_mpol:
        mpol_put(vma_policy(new));
out_free_vmi:
        vma_iter_free(vmi);
out_free_vma:
        vm_area_free(new);
        return err;
}

/*
 * Split a vma into two pieces at address 'addr', a new vma is allocated
 * either for the first part or the tail.
 *
 * Unlike __split_vma(), this refuses with -ENOMEM once the mm's map_count
 * has reached sysctl_max_map_count.
 */
static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
                     unsigned long addr, int new_below)
{
        if (vma->vm_mm->map_count < sysctl_max_map_count)
                return __split_vma(vmi, vma, addr, new_below);

        return -ENOMEM;
}

/*
 * vma_modify() - get [start, end) of @vma ready for an attribute change.
 *
 * The caller is about to change one or more of the flags, memory policy,
 * userfaultfd context and anonymous VMA name over the range [start, end).
 *
 * With the new attributes the range may have become mergeable with a
 * neighbour, so vma_merge() is tried first and a non-NULL result is returned
 * as is.
 *
 * When nothing could be merged, @vma is cut down to the range instead: it is
 * split at @start if it begins below it, and at @end if it extends beyond it.
 *
 * Return: the merged VMA, or @vma itself when no merge took place, or an
 * ERR_PTR() carrying the error reported by split_vma().
 */
struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
                                  struct vm_area_struct *prev,
                                  struct vm_area_struct *vma,
                                  unsigned long start, unsigned long end,
                                  unsigned long vm_flags,
                                  struct mempolicy *policy,
                                  struct vm_userfaultfd_ctx uffd_ctx,
                                  struct anon_vma_name *anon_name)
{
        struct vm_area_struct *merged;
        pgoff_t pgoff;
        int err = 0;

        /* Page offset that corresponds to @start inside @vma. */
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);

        merged = vma_merge(vmi, prev, vma, start, end, vm_flags,
                           pgoff, policy, uffd_ctx, anon_name);
        if (merged)
                return merged;

        /* Chop off whatever lies in front of the range ... */
        if (vma->vm_start < start)
                err = split_vma(vmi, vma, start, 1);

        /* ... and, provided that went fine, whatever lies behind it. */
        if (!err && vma->vm_end > end)
                err = split_vma(vmi, vma, end, 0);

        if (err)
                return ERR_PTR(err);

        return vma;
}

/*
 * vma_merge_new_vma() - try to merge a freshly mapped VMA with its neighbours.
 *
 * Thin wrapper around vma_merge() that takes the flags, memory policy,
 * userfaultfd context and anonymous name from @vma itself.
 *
 * The caller must ensure that [start, end) does not overlap any existing VMA.
 *
 * Return: whatever vma_merge() returns.
 */
static struct vm_area_struct
*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
                   struct vm_area_struct *vma, unsigned long start,
                   unsigned long end, pgoff_t pgoff)
{
        struct vm_area_struct *merged;

        merged = vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
                           vma_policy(vma), vma->vm_userfaultfd_ctx,
                           anon_vma_name(vma));

        return merged;
}

/*
 * vma_merge_extend() - grow @vma by @delta bytes at its end.
 *
 * Asks vma_merge() to cover [vma->vm_end, vma->vm_end + delta) using @vma's
 * own attributes, which may also merge it with an immediately adjacent VMA
 * that has identical properties.
 *
 * Return: whatever vma_merge() returns.
 */
struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
                                        struct vm_area_struct *vma,
                                        unsigned long delta)
{
        unsigned long old_end = vma->vm_end;
        /* File offset of the first page following the current end. */
        pgoff_t next_pgoff = vma->vm_pgoff + vma_pages(vma);

        /* @vma doubles as prev here, so merge case 1 or 2 will apply. */
        return vma_merge(vmi, vma, vma, old_end, old_end + delta,
                         vma->vm_flags, next_pgoff, vma_policy(vma),
                         vma->vm_userfaultfd_ctx, anon_vma_name(vma));
}

/*
 * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
 * @vmi: The vma iterator
 * @vma: The starting vm_area_struct
 * @mm: The mm_struct
 * @start: The aligned start address to munmap.
 * @end: The aligned end address to munmap.
 * @uf: The userfaultfd list_head
 * @unlock: Set to true to drop the mmap_lock.  unlocking only happens on
 * success.
 *
 * Works in three phases:
 *  1. split the first/last VMA where the range cuts through them;
 *  2. gather every VMA of the range in an on-stack side tree (mt_detach),
 *     marking each one detached;
 *  3. clear the range in the mm's tree, fix up the accounting, then unmap the
 *     pages and free the gathered VMAs.
 *
 * Any failure before phase 3 clears the detached marks again and returns the
 * error; VMAs that were already split stay split.
 *
 * Return: 0 on success and drops the lock if so directed, error and leaves the
 * lock held otherwise.
 */
static int
do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
                    struct mm_struct *mm, unsigned long start,
                    unsigned long end, struct list_head *uf, bool unlock)
{
        struct vm_area_struct *prev, *next = NULL;
        /* Side tree collecting the VMAs being removed, indexed 0..count-1. */
        struct maple_tree mt_detach;
        /* Number of VMAs stored in mt_detach so far. */
        int count = 0;
        int error = -ENOMEM;
        /* Pages of the gathered VMAs that carry VM_LOCKED. */
        unsigned long locked_vm = 0;
        MA_STATE(mas_detach, &mt_detach, 0, 0);
        mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
        mt_on_stack(mt_detach);

        /*
         * If we need to split any vma, do it now to save pain later.
         *
         * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
         * unmapped vm_area_struct will remain in use: so lower split_vma
         * places tmp vma above, and higher split_vma places tmp vma below.
         */

        /* Does it split the first one? */
        if (start > vma->vm_start) {

                /*
                 * Make sure that map_count on return from munmap() will
                 * not exceed its limit; but let map_count go just above
                 * its limit temporarily, to help free resources as expected.
                 */
                if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
                        goto map_count_exceeded;

                /*
                 * __split_vma() rather than split_vma(): the map_count limit
                 * was handled just above.
                 */
                error = __split_vma(vmi, vma, start, 1);
                if (error)
                        goto start_split_failed;
        }

        /*
         * Detach a range of VMAs from the mm. Using next as a temp variable as
         * it is always overwritten.
         */
        next = vma;
        do {
                /* Does it split the end? */
                if (next->vm_end > end) {
                        error = __split_vma(vmi, next, end, 0);
                        if (error)
                                goto end_split_failed;
                }
                vma_start_write(next);
                /* Append next to the side tree at index 'count'. */
                mas_set(&mas_detach, count);
                error = mas_store_gfp(&mas_detach, next, GFP_KERNEL);
                if (error)
                        goto munmap_gather_failed;
                vma_mark_detached(next, true);
                if (next->vm_flags & VM_LOCKED)
                        locked_vm += vma_pages(next);

                count++;
                if (unlikely(uf)) {
                        /*
                         * If userfaultfd_unmap_prep returns an error the vmas
                         * will remain split, but userland will get a
                         * highly unexpected error anyway. This is no
                         * different than the case where the first of the two
                         * __split_vma fails, but we don't undo the first
                         * split, despite we could. This is unlikely enough
                         * failure that it's not worth optimizing it for.
                         */
                        error = userfaultfd_unmap_prep(next, start, end, uf);

                        if (error)
                                goto userfaultfd_error;
                }
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
                BUG_ON(next->vm_start < start);
                BUG_ON(next->vm_start > end);
#endif
        } for_each_vma_range(*vmi, next, end);

#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
        /* Make sure no VMAs are about to be lost. */
        {
                MA_STATE(test, &mt_detach, 0, 0);
                struct vm_area_struct *vma_mas, *vma_test;
                int test_count = 0;

                /*
                 * Walk the mm's tree and the side tree in lockstep; they must
                 * yield the same VMAs in the same order.
                 */
                vma_iter_set(vmi, start);
                rcu_read_lock();
                vma_test = mas_find(&test, count - 1);
                for_each_vma_range(*vmi, vma_mas, end) {
                        BUG_ON(vma_mas != vma_test);
                        test_count++;
                        vma_test = mas_next(&test, count - 1);
                }
                rcu_read_unlock();
                BUG_ON(count != test_count);
        }
#endif

        /* Rewind the iterator until it no longer sits above @start. */
        while (vma_iter_addr(vmi) > start)
                vma_iter_prev_range(vmi);

        /* Remove [start, end) from the mm's tree in one go. */
        error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
        if (error)
                goto clear_tree_failed;

        /* Point of no return */
        mm->locked_vm -= locked_vm;
        mm->map_count -= count;
        if (unlock)
                mmap_write_downgrade(mm);

        /*
         * Fetch the neighbours on either side of the hole for unmap_region();
         * the iterator is stepped back again when a following VMA exists.
         */
        prev = vma_iter_prev_range(vmi);
        next = vma_next(vmi);
        if (next)
                vma_iter_prev_range(vmi);

        /*
         * We can free page tables without write-locking mmap_lock because VMAs
         * were isolated before we downgraded mmap_lock.
         */
        mas_set(&mas_detach, 1);
        unmap_region(mm, &mas_detach, vma, prev, next, start, end, count,
                     !unlock);
        /* Statistics and freeing VMAs */
        mas_set(&mas_detach, 0);
        remove_mt(mm, &mas_detach);
        validate_mm(mm);
        if (unlock)
                mmap_read_unlock(mm);

        __mt_destroy(&mt_detach);
        return 0;

        /*
         * All failures after gathering has started share one cleanup: clear
         * the detached mark on everything stored in the side tree, then
         * destroy the side tree.
         *
         * NOTE(review): the side tree is indexed by 'count' (see mas_set()
         * in the gather loop) whereas the walk below is bounded by the
         * address @end; presumably that is just "large enough to visit every
         * entry" - confirm.
         */
clear_tree_failed:
userfaultfd_error:
munmap_gather_failed:
end_split_failed:
        mas_set(&mas_detach, 0);
        mas_for_each(&mas_detach, next, end)
                vma_mark_detached(next, false);

        __mt_destroy(&mt_detach);
        /* Nothing has been stored in mt_detach when these two are reached. */
start_split_failed:
map_count_exceeded:
        validate_mm(mm);
        return error;
}

/*
 * do_vmi_munmap() - munmap a given range.
 * @vmi: The vma iterator
 * @mm: The mm_struct
 * @start: The start address to munmap
 * @len: The length of the range to munmap
 * @uf: The userfaultfd list_head
 * @unlock: set to true if the user wants to drop the mmap_lock on success
 *
 * Validates the request, page-aligns @len, refuses to touch sealed memory,
 * performs any arch_unmap() work and then hands the first VMA found below the
 * end of the range to do_vmi_align_munmap().  When no VMA is found there is
 * nothing to unmap and the call succeeds immediately.
 *
 * Return: 0 on success and drops the lock if so directed, error and leaves the
 * lock held otherwise.  -EINVAL is returned for an unaligned or out-of-range
 * @start, an oversized @len, or a range that is empty after alignment; -EPERM
 * when can_modify_mm() rejects the range.
 */
int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
                  unsigned long start, size_t len, struct list_head *uf,
                  bool unlock)
{
        struct vm_area_struct *vma;
        unsigned long end;

        /* @start has to sit on a page boundary ... */
        if (offset_in_page(start))
                return -EINVAL;

        /* ... and the whole range has to fit below TASK_SIZE. */
        if (start > TASK_SIZE || len > TASK_SIZE - start)
                return -EINVAL;

        end = start + PAGE_ALIGN(len);
        if (start == end)
                return -EINVAL;

        /*
         * Check if memory is sealed before arch_unmap.
         * Prevent unmapping a sealed VMA.
         * can_modify_mm assumes we have acquired the lock on MM.
         */
        if (unlikely(!can_modify_mm(mm, start, end)))
                return -EPERM;

        /* arch_unmap() might do unmaps itself. */
        arch_unmap(mm, start, end);

        /* Find the first overlapping VMA */
        vma = vma_find(vmi, end);
        if (vma)
                return do_vmi_align_munmap(vmi, vma, mm, start, end, uf,
                                           unlock);

        /* Nothing mapped in the range: trivially successful. */
        if (unlock)
                mmap_write_unlock(mm);

        return 0;
}

/*
 * do_munmap() - munmap for callers that do not carry a vma iterator.
 * @mm: The mm_struct
 * @start: The start address to munmap
 * @len: The length to be munmapped.
 * @uf: The userfaultfd list_head
 *
 * Sets up an iterator positioned at @start and forwards to do_vmi_munmap()
 * without asking for the mmap_lock to be dropped.
 *
 * Return: 0 on success, error otherwise.
 */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
              struct list_head *uf)
{
        VMA_ITERATOR(vmi, mm, start);
        int ret;

        ret = do_vmi_munmap(&vmi, mm, start, len, uf, false);

        return ret;
}

/*
 * mmap_region() - establish a mapping of @len bytes at @addr in current->mm.
 * @file: The file to map, or NULL when there is none
 * @addr: The start address of the mapping
 * @len: The length of the mapping
 * @vm_flags: The VM_* flags of the mapping
 * @pgoff: The page offset used for the mapping
 * @uf: The userfaultfd list_head, handed to do_vmi_munmap()
 *
 * Outline:
 *  - check the address space limit, discounting pages that are about to be
 *    unmapped from the range;
 *  - unmap whatever currently lives in [addr, addr + len);
 *  - charge the memory when the mapping is accountable;
 *  - try to satisfy the request by expanding the next and/or previous VMA;
 *  - failing that, allocate a new VMA, let the file's ->mmap() (or shmem for
 *    a shared mapping without a file) set it up, and insert it into the tree;
 *  - finish with statistics, VM_LOCKED handling, uprobes and soft-dirty.
 *
 * Return: @addr on success, a negative errno otherwise.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
                unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
                struct list_head *uf)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct vm_area_struct *next, *prev, *merge;
        pgoff_t pglen = len >> PAGE_SHIFT;
        /* Pages charged via security_vm_enough_memory_mm(), undone on error. */
        unsigned long charged = 0;
        unsigned long end = addr + len;
        /* Range the expanded VMA would cover; widened by the checks below. */
        unsigned long merge_start = addr, merge_end = end;
        /* True while we hold a mapping_map_writable() reference on the file. */
        bool writable_file_mapping = false;
        pgoff_t vm_pgoff;
        int error;
        VMA_ITERATOR(vmi, mm, addr);

        /* Check against address space limit. */
        if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
                unsigned long nr_pages;

                /*
                 * MAP_FIXED may remove pages of mappings that intersects with
                 * requested mapping. Account for the pages it would unmap.
                 */
                nr_pages = count_vma_pages_range(mm, addr, end);

                if (!may_expand_vm(mm, vm_flags,
                                        (len >> PAGE_SHIFT) - nr_pages))
                        return -ENOMEM;
        }

        /* Unmap any existing mapping in the area */
        error = do_vmi_munmap(&vmi, mm, addr, len, uf, false);
        /* -EPERM is passed on as is; every other failure becomes -ENOMEM. */
        if (error == -EPERM)
                return error;
        else if (error)
                return -ENOMEM;

        /*
         * Private writable mapping: check memory availability
         */
        if (accountable_mapping(file, vm_flags)) {
                charged = len >> PAGE_SHIFT;
                if (security_vm_enough_memory_mm(mm, charged))
                        return -ENOMEM;
                vm_flags |= VM_ACCOUNT;
        }

        /* Look up the neighbours on both sides of the (now empty) range. */
        next = vma_next(&vmi);
        prev = vma_prev(&vmi);
        /* VM_SPECIAL mappings skip the expansion attempt entirely. */
        if (vm_flags & VM_SPECIAL) {
                if (prev)
                        vma_iter_next_range(&vmi);
                goto cannot_expand;
        }

        /* Attempt to expand an old mapping */
        /* Check next */
        if (next && next->vm_start == end && !vma_policy(next) &&
            can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
                                 NULL_VM_UFFD_CTX, NULL)) {
                merge_end = next->vm_end;
                vma = next;
                vm_pgoff = next->vm_pgoff - pglen;
        }

        /* Check prev */
        /*
         * When next already qualified (vma != NULL), prev must additionally
         * be compatible with next's anon_vma and userfaultfd context.
         */
        if (prev && prev->vm_end == addr && !vma_policy(prev) &&
            (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
                                       pgoff, vma->vm_userfaultfd_ctx, NULL) :
                   can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
                                       NULL_VM_UFFD_CTX, NULL))) {
                merge_start = prev->vm_start;
                vma = prev;
                vm_pgoff = prev->vm_pgoff;
        } else if (prev) {
                vma_iter_next_range(&vmi);
        }

        /* Actually expand, if possible */
        if (vma &&
            !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) {
                khugepaged_enter_vma(vma, vm_flags);
                goto expanded;
        }

        /*
         * Expansion was not possible or failed.
         * NOTE(review): presumably the iterator still points at prev here and
         * is moved back to addr for the insertion below - confirm.
         */
        if (vma == prev)
                vma_iter_set(&vmi, addr);
cannot_expand:

        /*
         * Determine the object being mapped and call the appropriate
         * specific mapper. the address has already been validated, but
         * not unmapped, but the maps are removed from the list.
         */
        vma = vm_area_alloc(mm);
        if (!vma) {
                error = -ENOMEM;
                goto unacct_error;
        }

        vma_iter_config(&vmi, addr, end);
        vma_set_range(vma, addr, end, pgoff);
        vm_flags_init(vma, vm_flags);
        vma->vm_page_prot = vm_get_page_prot(vm_flags);

        if (file) {
                vma->vm_file = get_file(file);
                error = call_mmap(file, vma);
                if (error)
                        goto unmap_and_free_vma;

                if (vma_is_shared_maywrite(vma)) {
                        error = mapping_map_writable(file->f_mapping);
                        if (error)
                                goto close_and_free_vma;

                        writable_file_mapping = true;
                }

                /*
                 * Expansion is handled above, merging is handled below.
                 * Drivers should not alter the address of the VMA.
                 */
                error = -EINVAL;
                if (WARN_ON((addr != vma->vm_start)))
                        goto close_and_free_vma;

                vma_iter_config(&vmi, addr, end);
                /*
                 * If vm_flags changed after call_mmap(), we should try merge
                 * vma again as we may succeed this time.
                 */
                if (unlikely(vm_flags != vma->vm_flags && prev)) {
                        merge = vma_merge_new_vma(&vmi, prev, vma,
                                                  vma->vm_start, vma->vm_end,
                                                  vma->vm_pgoff);
                        if (merge) {
                                /*
                                 * ->mmap() can change vma->vm_file and fput
                                 * the original file. So fput the vma->vm_file
                                 * here or we would add an extra fput for file
                                 * and cause general protection fault
                                 * ultimately.
                                 */
                                fput(vma->vm_file);
                                vm_area_free(vma);
                                vma = merge;
                                /* Update vm_flags to pick up the change. */
                                vm_flags = vma->vm_flags;
                                goto unmap_writable;
                        }
                }

                /* Pick up any flag changes made by ->mmap(). */
                vm_flags = vma->vm_flags;
        } else if (vm_flags & VM_SHARED) {
                /* Shared mapping without a file: set up through shmem. */
                error = shmem_zero_setup(vma);
                if (error)
                        goto free_vma;
        } else {
                vma_set_anonymous(vma);
        }

        if (map_deny_write_exec(vma, vma->vm_flags)) {
                error = -EACCES;
                goto close_and_free_vma;
        }

        /* Allow architectures to sanity-check the vm_flags */
        error = -EINVAL;
        if (!arch_validate_flags(vma->vm_flags))
                goto close_and_free_vma;

        error = -ENOMEM;
        if (vma_iter_prealloc(&vmi, vma))
                goto close_and_free_vma;

        /* Lock the VMA since it is modified after insertion into VMA tree */
        vma_start_write(vma);
        vma_iter_store(&vmi, vma);
        mm->map_count++;
        vma_link_file(vma);

        /*
         * vma_merge() calls khugepaged_enter_vma() either, the below
         * call covers the non-merge case.
         */
        khugepaged_enter_vma(vma, vma->vm_flags);

        /* Once vma denies write, undo our temporary denial count */
unmap_writable:
        if (writable_file_mapping)
                mapping_unmap_writable(file->f_mapping);
        /* From here on 'file' is whatever the final VMA maps. */
        file = vma->vm_file;
        ksm_add_vma(vma);
expanded:
        perf_event_mmap(vma);

        vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
        if (vm_flags & VM_LOCKED) {
                /*
                 * These kinds of VMA get VM_LOCKED_MASK cleared instead of
                 * being counted in locked_vm.
                 */
                if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
                                        is_vm_hugetlb_page(vma) ||
                                        vma == get_gate_vma(current->mm))
                        vm_flags_clear(vma, VM_LOCKED_MASK);
                else
                        mm->locked_vm += (len >> PAGE_SHIFT);
        }

        if (file)
                uprobe_mmap(vma);

        /*
         * New (or expanded) vma always get soft dirty status.
         * Otherwise user-space soft-dirty page tracker won't
         * be able to distinguish situation when vma area unmapped,
         * then new mapped in-place (which must be aimed as
         * a completely new data area).
         */
        vm_flags_set(vma, VM_SOFTDIRTY);

        vma_set_page_prot(vma);

        validate_mm(mm);
        return addr;

        /*
         * Error unwinding.  The labels are ordered so that each entry point
         * undoes only what had been set up when the failure happened.
         */
close_and_free_vma:
        if (file && vma->vm_ops && vma->vm_ops->close)
                vma->vm_ops->close(vma);

        if (file || vma->vm_file) {
                /* Entered directly (skipping ->close) when call_mmap() fails. */
unmap_and_free_vma:
                fput(vma->vm_file);
                vma->vm_file = NULL;

                vma_iter_set(&vmi, vma->vm_end);
                /* Undo any partial mapping done by a device driver. */
                unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
                             vma->vm_end, vma->vm_end, true);
        }
        if (writable_file_mapping)
                mapping_unmap_writable(file->f_mapping);
free_vma:
        vm_area_free(vma);
unacct_error:
        if (charged)
                vm_unacct_memory(charged);
        validate_mm(mm);
        return error;
}

/*
 * __vm_munmap() - munmap [start, start + len) of the current task.
 * @start: The start address to munmap
 * @len: The length of the range to munmap
 * @unlock: Let do_vmi_munmap() drop the mmap_lock on success
 *
 * Takes the mmap write lock (killable), performs the unmap and completes any
 * userfaultfd notifications collected along the way.
 *
 * Return: -EINTR if taking the lock was interrupted, otherwise the result of
 * do_vmi_munmap().
 */
static int __vm_munmap(unsigned long start, size_t len, bool unlock)
{
        struct mm_struct *mm = current->mm;
        bool lock_dropped;
        int ret;
        LIST_HEAD(uf);
        VMA_ITERATOR(vmi, mm, start);

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock);

        /* The callee only lets go of the lock when asked to and successful. */
        lock_dropped = unlock && !ret;
        if (!lock_dropped)
                mmap_write_unlock(mm);

        userfaultfd_unmap_complete(mm, &uf);

        return ret;
}

/*
 * vm_munmap() - exported munmap of [start, start + len) in the current task.
 *
 * Same as __vm_munmap() without asking do_vmi_munmap() to drop the lock.
 */
int vm_munmap(unsigned long start, size_t len)
{
        const bool unlock = false;

        return __vm_munmap(start, len, unlock);
}
EXPORT_SYMBOL(vm_munmap);

/*
 * munmap() system call: strip any tag bits from the user supplied address and
 * unmap the range, letting do_vmi_munmap() drop the mmap_lock on success.
 */
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
        unsigned long untagged = untagged_addr(addr);

        return __vm_munmap(untagged, len, true);
}


/*
 * Emulation of deprecated remap_file_pages() syscall.
 *
 * The request is emulated by mapping the file backing the VMA at @start again
 * with do_mmap() (MAP_SHARED | MAP_FIXED | MAP_POPULATE) at the new @pgoff.
 * The range [start, start + size) has to be covered, without holes, by shared
 * VMAs that all have the same file and the same flags as the first one.
 *
 * @prot must be 0; the protection actually used is derived from the flags of
 * the existing VMA.  Of the caller's @flags only MAP_NONBLOCK is honoured.
 *
 * Returns 0 on success, -EINVAL when the arguments or the existing layout are
 * unsuitable, -EINTR when taking the lock was interrupted, or the error
 * reported by do_mmap().
 */
SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
{

        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        /* Filled in by do_mmap(); non-zero requests mm_populate() below. */
        unsigned long populate = 0;
        unsigned long ret = -EINVAL;
        struct file *file;

        pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
                     current->comm, current->pid);

        if (prot)
                return ret;
        /* Round both the start and the size down to a page boundary. */
        start = start & PAGE_MASK;
        size = size & PAGE_MASK;

        /* Rejects an empty range as well as address wrap-around. */
        if (start + size <= start)
                return ret;

        /* Does pgoff wrap? */
        if (pgoff + (size >> PAGE_SHIFT) < pgoff)
                return ret;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        vma = vma_lookup(mm, start);

        if (!vma || !(vma->vm_flags & VM_SHARED))
                goto out;

        /*
         * The range extends past the first VMA: every following VMA up to
         * start + size must be adjacent and match in file and flags.
         */
        if (start + size > vma->vm_end) {
                VMA_ITERATOR(vmi, mm, vma->vm_end);
                struct vm_area_struct *next, *prev = vma;

                for_each_vma_range(vmi, next, start + size) {
                        /* hole between vmas ? */
                        if (next->vm_start != prev->vm_end)
                                goto out;

                        if (next->vm_file != vma->vm_file)
                                goto out;

                        if (next->vm_flags != vma->vm_flags)
                                goto out;

                        /* This VMA reaches the end of the range: done. */
                        if (start + size <= next->vm_end)
                                break;

                        prev = next;
                }

                /* Ran out of VMAs before the end of the range was reached. */
                if (!next)
                        goto out;
        }

        /* Rebuild the PROT_* bits from the existing VMA (prot is 0 here). */
        prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
        prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
        prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;

        flags &= MAP_NONBLOCK;
        flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
        if (vma->vm_flags & VM_LOCKED)
                flags |= MAP_LOCKED;

        /*
         * Hold a reference on the file across do_mmap().
         *
         * NOTE(review): no LSM hook (e.g. security_mmap_file()) is invoked on
         * this path before do_mmap(); confirm whether the check is performed
         * elsewhere or is missing.
         */
        file = get_file(vma->vm_file);
        ret = do_mmap(vma->vm_file, start, size,
                        prot, flags, 0, pgoff, &populate, NULL);
        fput(file);
out:
        mmap_write_unlock(mm);
        if (populate)
                mm_populate(ret, populate);
        /* On success do_mmap()'s return value is replaced by 0. */
        if (!IS_ERR_VALUE(ret))
                ret = 0;
        return ret;
}

/*
 * do_vma_munmap() - Unmap a full or partial vma.
 * @vmi: The vma iterator pointing at the vma
 * @vma: The first vma to be munmapped
 * @start: The start of the address range to unmap
 * @end: The end of the address range to unmap
 * @uf: The userfaultfd list_head
 * @unlock: Drop the lock on success
 *
 * Unmaps a VMA mapping when the vma iterator is already in position.
 * Does not handle alignment.
 *
 * Return: 0 on success and drops the lock if so directed, error on failure
 * and will still hold the lock.  -EPERM is returned when can_modify_mm()
 * rejects the range.
 */
int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
                unsigned long start, unsigned long end, struct list_head *uf,
                bool unlock)
{
        struct mm_struct *mm = vma->vm_mm;

        /*
         * The sealing check has to come before arch_unmap(): a sealed VMA
         * must not be unmapped.  can_modify_mm assumes we have acquired the
         * lock on MM.
         */
        if (likely(can_modify_mm(mm, start, end))) {
                arch_unmap(mm, start, end);
                return do_vmi_align_munmap(vmi, vma, mm, start, end, uf,
                                           unlock);
        }

        return -EPERM;
}

/*
 * do_brk_flags() - Increase the brk vma if the flags match.
 * @vmi: The vma iterator
 * @addr: The start address
 * @len: The length of the increase
 * @vma: The vma,
 * @flags: The VMA Flags
 *
 * Extend the brk VMA from addr to addr + len.  If the VMA is NULL or the flags
 * do not match then create a new anonymous VMA.  Eventually we may be able to
 * do some brk-specific accounting here.
 *
 * Return: 0 on success, -ENOMEM on any failure.  Memory charged through
 * security_vm_enough_memory_mm() is uncharged again on the failure paths that
 * follow the charge.
 */
static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
                unsigned long addr, unsigned long len, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vma_prepare vp;

        /*
         * Check against address space limits by the changed size
         * Note: This happens *after* clearing old mappings in some code paths.
         */
        flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
        if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
                return -ENOMEM;

        /*
         * NOTE(review): this uses '>' whereas split_vma() compares with '>=';
         * confirm that allowing map_count == sysctl_max_map_count here is
         * intended.
         */
        if (mm->map_count > sysctl_max_map_count)
                return -ENOMEM;

        /* Charge the pages; undone at unacct_fail if anything fails later. */
        if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
                return -ENOMEM;

        /*
         * Expand the existing vma if possible; Note that singular lists do not
         * occur after forking, so the expand will only happen on new VMAs.
         */
        if (vma && vma->vm_end == addr && !vma_policy(vma) &&
            can_vma_merge_after(vma, flags, NULL, NULL,
                                addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
                /* Preallocate for storing vma over its new, larger range. */
                vma_iter_config(vmi, vma->vm_start, addr + len);
                if (vma_iter_prealloc(vmi, vma))
                        goto unacct_fail;

                vma_start_write(vma);

                init_vma_prep(&vp, vma);
                vma_prepare(&vp);
                vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
                vma->vm_end = addr + len;
                /* VM_SOFTDIRTY is set once more at 'out' below. */
                vm_flags_set(vma, VM_SOFTDIRTY);
                vma_iter_store(vmi, vma);

                vma_complete(&vp, vmi, mm);
                khugepaged_enter_vma(vma, flags);
                goto out;
        }

        /* No expansion: step the iterator past the existing vma, if any. */
        if (vma)
                vma_iter_next_range(vmi);
        /* create a vma struct for an anonymous mapping */
        vma = vm_area_alloc(mm);
        if (!vma)
                goto unacct_fail;

        vma_set_anonymous(vma);
        vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
        vm_flags_init(vma, flags);
        vma->vm_page_prot = vm_get_page_prot(flags);
        vma_start_write(vma);
        if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
                goto mas_store_fail;

        mm->map_count++;
        validate_mm(mm);
        ksm_add_vma(vma);
out:
        /* Common accounting for both the expanded and the new-vma case. */
        perf_event_mmap(vma);
        mm->total_vm += len >> PAGE_SHIFT;
        mm->data_vm += len >> PAGE_SHIFT;
        if (flags & VM_LOCKED)
                mm->locked_vm += (len >> PAGE_SHIFT);
        vm_flags_set(vma, VM_SOFTDIRTY);
        return 0;

mas_store_fail:
        vm_area_free(vma);
unacct_fail:
        vm_unacct_memory(len >> PAGE_SHIFT);
        return -ENOMEM;
}

int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        unsigned long len;
        int ret;
        bool populate;
        LIST_HEAD(uf);
        VMA_ITERATOR(vmi, mm, addr);

        len = PAGE_ALIGN(request);
        if (len < request)
                return -ENOMEM;
        if (!len)
                return 0;

        /* Until we need other flags, refuse anything except VM_EXEC. */
        if ((flags & (~VM_EXEC)) != 0)
                return -EINVAL;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        ret = check_brk_limits(addr, len);
        if (ret)
                goto limits_failed;

        ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0);
        if (ret)
                goto munmap_failed;

        vma = vma_prev(&vmi);
        ret = do_brk_flags(&vmi, vma, addr, len, flags);
        populate = ((mm->def_flags & VM_LOCKED) != 0);
        mmap_write_unlock(mm);
        userfaultfd_unmap_complete(mm, &uf);
        if (populate && !ret)
                mm_populate(addr, len);
        return ret;

munmap_failed:
limits_failed:
        mmap_write_unlock(mm);
        return ret;
}
EXPORT_SYMBOL(vm_brk_flags);

/*
 * Release all mmaps.
 *
 * Tears down the whole address space of @mm: the pages are unmapped under
 * the mmap read lock, then - under the write lock - the page tables are
 * freed, every VMA is removed and the VMA tree itself is destroyed.  Memory
 * accounted to VM_ACCOUNT VMAs is uncharged at the very end.
 */
void exit_mmap(struct mm_struct *mm)
{
        struct mmu_gather tlb;
        struct vm_area_struct *vma;
        /* Pages of all VM_ACCOUNT VMAs, uncharged before returning. */
        unsigned long nr_accounted = 0;
        VMA_ITERATOR(vmi, mm, 0);
        /* Number of VMAs removed; checked against mm->map_count. */
        int count = 0;

        /* mm's last user has gone, and its about to be pulled down */
        mmu_notifier_release(mm);

        mmap_read_lock(mm);
        arch_exit_mmap(mm);

        vma = vma_next(&vmi);
        if (!vma || unlikely(xa_is_zero(vma))) {
                /* Can happen if dup_mmap() received an OOM */
                mmap_read_unlock(mm);
                /* The destroy path below expects the write lock to be held. */
                mmap_write_lock(mm);
                goto destroy;
        }

        lru_add_drain();
        flush_cache_mm(mm);
        tlb_gather_mmu_fullmm(&tlb, mm);
        /* update_hiwater_rss(mm) here? but nobody should be looking */
        /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
        unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
        mmap_read_unlock(mm);

        /*
         * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
         * because the memory has been already freed.
         */
        set_bit(MMF_OOM_SKIP, &mm->flags);
        mmap_write_lock(mm);
        mt_clear_in_rcu(&mm->mm_mt);
        /* Reposition the iterator just past the first VMA for the next walk. */
        vma_iter_set(&vmi, vma->vm_end);
        free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
                      USER_PGTABLES_CEILING, true);
        tlb_finish_mmu(&tlb);

        /*
         * Walk the list again, actually closing and freeing it, with preemption
         * enabled, without holding any MM locks besides the unreachable
         * mmap_write_lock.
         */
        vma_iter_set(&vmi, vma->vm_end);
        do {
                if (vma->vm_flags & VM_ACCOUNT)
                        nr_accounted += vma_pages(vma);
                remove_vma(vma, true);
                count++;
                cond_resched();
                vma = vma_next(&vmi);
        } while (vma && likely(!xa_is_zero(vma)));

        /* Every VMA counted in map_count must have been visited above. */
        BUG_ON(count != mm->map_count);

        trace_exit_mmap(mm);
destroy:
        __mt_destroy(&mm->mm_mt);
        mmap_write_unlock(mm);
        vm_unacct_memory(nr_accounted);
}

/*
 * insert_vm_struct() - link an already set up @vma into @mm.
 *
 * Insert vm structure into process list sorted by address and into the
 * inode's i_mmap tree.  If vm_file is non-NULL then i_mmap_rwsem is taken
 * here.
 *
 * Return: 0 on success; -ENOMEM when the range of @vma intersects an existing
 * mapping, when a VM_ACCOUNT vma cannot be charged, or when vma_link() fails
 * (in which case the charge is dropped again).
 */
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
        const unsigned long nr_pages = vma_pages(vma);

        /* The target range must be completely free. */
        if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
                return -ENOMEM;

        /* Accounted mappings have to be charged up front. */
        if (vma->vm_flags & VM_ACCOUNT) {
                if (security_vm_enough_memory_mm(mm, nr_pages))
                        return -ENOMEM;
        }

        /*
         * The vm_pgoff of a purely anonymous vma should be irrelevant
         * until its first write fault, when page's anon_vma and index
         * are set.  But now set the vm_pgoff it will almost certainly
         * end up with (unless mremap moves it elsewhere before that
         * first wfault), so /proc/pid/maps tells a consistent story.
         *
         * By setting it to reflect the virtual start address of the
         * vma, merges and splits can happen in a seamless way, just
         * using the existing file pgoff checks and manipulations.
         * Similarly in do_mmap and in do_brk_flags.
         */
        if (vma_is_anonymous(vma)) {
                BUG_ON(vma->anon_vma);
                vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
        }

        if (!vma_link(mm, vma))
                return 0;

        /* Linking failed: give back what was charged above. */
        if (vma->vm_flags & VM_ACCOUNT)
                vm_unacct_memory(nr_pages);

        return -ENOMEM;
}

/*
 * Copy the vma structure to a new location in the same mm,
 * prior to moving page table entries, to effect an mremap move.
 *
 * @vmap:            in/out pointer to the source vma.  It is rewritten to
 *                   point at the merged vma when the merge result covers
 *                   the source's original start address.
 * @addr, @len:      destination range [addr, addr + len).
 * @pgoff:           page offset for the destination; replaced by
 *                   addr >> PAGE_SHIFT for an anonymous vma that has no
 *                   anon_vma yet.
 * @need_rmap_locks: out parameter, written only on the successful paths:
 *                   (new_vma->vm_pgoff <= vma->vm_pgoff) after a merge,
 *                   false after a fresh copy was linked.
 *
 * Returns the vma that now covers the destination range, or NULL when
 * find_vma_prev() reports a vma starting below addr + len, or when any
 * duplication/link step fails.
 */
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
        unsigned long addr, unsigned long len, pgoff_t pgoff,
        bool *need_rmap_locks)
{
        struct vm_area_struct *vma = *vmap;
        /* Remember the source start: vma may be merged away below. */
        unsigned long vma_start = vma->vm_start;
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma, *prev;
        bool faulted_in_anon_vma = true;
        VMA_ITERATOR(vmi, mm, addr);

        /*
         * If anonymous vma has not yet been faulted, update new pgoff
         * to match new location, to increase its chance of merging.
         */
        if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
                pgoff = addr >> PAGE_SHIFT;
                faulted_in_anon_vma = false;
        }

        new_vma = find_vma_prev(mm, addr, &prev);
        if (new_vma && new_vma->vm_start < addr + len)
                return NULL;        /* should never get here */

        /* First try to extend a neighbour instead of allocating a new vma. */
        new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff);
        if (new_vma) {
                /*
                 * Source vma may have been merged into new_vma
                 */
                if (unlikely(vma_start >= new_vma->vm_start &&
                             vma_start < new_vma->vm_end)) {
                        /*
                         * The only way we can get a vma_merge with
                         * self during an mremap is if the vma hasn't
                         * been faulted in yet and we were allowed to
                         * reset the dst vma->vm_pgoff to the
                         * destination address of the mremap to allow
                         * the merge to happen. mremap must change the
                         * vm_pgoff linearity between src and dst vmas
                         * (in turn preventing a vma_merge) to be
                         * safe. It is only safe to keep the vm_pgoff
                         * linear if there are no pages mapped yet.
                         */
                        VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
                        *vmap = vma = new_vma;
                }
                *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
        } else {
                /*
                 * No merge possible: duplicate the source and take, in
                 * order, a policy copy, the anon_vma links, a file
                 * reference and an ->open() call.  The labels below undo
                 * these in reverse order.
                 */
                new_vma = vm_area_dup(vma);
                if (!new_vma)
                        goto out;
                vma_set_range(new_vma, addr, addr + len, pgoff);
                if (vma_dup_policy(vma, new_vma))
                        goto out_free_vma;
                if (anon_vma_clone(new_vma, vma))
                        goto out_free_mempol;
                if (new_vma->vm_file)
                        get_file(new_vma->vm_file);
                if (new_vma->vm_ops && new_vma->vm_ops->open)
                        new_vma->vm_ops->open(new_vma);
                if (vma_link(mm, new_vma))
                        goto out_vma_link;
                *need_rmap_locks = false;
        }
        return new_vma;

out_vma_link:
        /* Balance the ->open() call made before the failed vma_link(). */
        if (new_vma->vm_ops && new_vma->vm_ops->close)
                new_vma->vm_ops->close(new_vma);

        /* Drop the file reference taken with get_file(). */
        if (new_vma->vm_file)
                fput(new_vma->vm_file);

        /* Undo anon_vma_clone(). */
        unlink_anon_vmas(new_vma);
out_free_mempol:
        /* Undo vma_dup_policy(). */
        mpol_put(vma_policy(new_vma));
out_free_vma:
        /* Undo vm_area_dup(). */
        vm_area_free(new_vma);
out:
        return NULL;
}

/*
 * Decide whether @mm may grow by @npages pages for a mapping with @flags.
 *
 * The address-space limit (RLIMIT_AS) always applies.  The data limit
 * (RLIMIT_DATA) applies only to data mappings, and exceeding it is
 * tolerated when ignore_rlimit_data is set.
 */
bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
{
        /* Hard stop: total address space limit. */
        if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
                return false;

        /* Everything below concerns data mappings only. */
        if (!is_data_mapping(flags))
                return true;

        if (mm->data_vm + npages <= rlimit(RLIMIT_DATA) >> PAGE_SHIFT)
                return true;

        /* Workaround for Valgrind */
        if (rlimit(RLIMIT_DATA) == 0 &&
            mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
                return true;

        pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n",
                     current->comm, current->pid,
                     (mm->data_vm + npages) << PAGE_SHIFT,
                     rlimit(RLIMIT_DATA),
                     ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data");

        /* Over the data limit: allowed only when the limit is being ignored. */
        if (ignore_rlimit_data)
                return true;

        return false;
}

/*
 * Add @npages to mm->total_vm and to at most one of the per-class
 * counters.  The class is picked from @flags in priority order:
 * exec, then stack, then data.  total_vm is updated through
 * READ_ONCE()/WRITE_ONCE(); the class counters use plain updates.
 */
void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
{
        WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);

        if (is_exec_mapping(flags)) {
                mm->exec_vm += npages;
                return;
        }

        if (is_stack_mapping(flags)) {
                mm->stack_vm += npages;
                return;
        }

        if (is_data_mapping(flags))
                mm->data_vm += npages;
}

static vm_fault_t special_mapping_fault(struct vm_fault *vmf);

/*
 * Deliberately empty ->close hook, installed in both special-mapping
 * vm_operations tables below.
 *
 * Having a close hook prevents vma merging regardless of flags.
 */
static void special_mapping_close(struct vm_area_struct *vma)
{
}

static const char *special_mapping_name(struct vm_area_struct *vma)
{
        return ((struct vm_special_mapping *)vma->vm_private_data)->name;
}

/*
 * ->mremap hook for special mappings.
 *
 * Rejects (with a one-time warning and -EFAULT) a remap whose vma does
 * not belong to the current task's mm.  Otherwise it defers to the
 * mapping's own mremap callback when one is set, and returns 0 if not.
 */
static int special_mapping_mremap(struct vm_area_struct *new_vma)
{
        struct vm_special_mapping *sm = new_vma->vm_private_data;

        if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
                return -EFAULT;

        return sm->mremap ? sm->mremap(sm, new_vma) : 0;
}

/*
 * ->may_split hook for special mappings: always refuses with -EINVAL,
 * whatever @vma and @addr are.
 */
static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr)
{
        /*
         * Forbid splitting special mappings - kernel has expectations over
         * the number of pages in mapping. Together with VM_DONTEXPAND
         * the size of vma should stay the same over the special mapping's
         * lifetime.
         */
        return -EINVAL;
}

/*
 * Operations for special mappings installed via _install_special_mapping().
 * vm_private_data of such a vma points at a struct vm_special_mapping.
 */
static const struct vm_operations_struct special_mapping_vmops = {
        .close = special_mapping_close,
        .fault = special_mapping_fault,
        .mremap = special_mapping_mremap,
        .name = special_mapping_name,
        /* vDSO code relies that VVAR can't be accessed remotely */
        .access = NULL,
        /* Splitting is always refused, see special_mapping_split(). */
        .may_split = special_mapping_split,
};

/*
 * Operations for mappings installed via install_special_mapping().
 * Here vm_private_data is a bare struct page * array rather than a
 * struct vm_special_mapping; special_mapping_fault() tells the two
 * apart by comparing vm_ops against this table.
 */
static const struct vm_operations_struct legacy_special_mapping_vmops = {
        .close = special_mapping_close,
        .fault = special_mapping_fault,
};

/*
 * ->fault hook shared by both special-mapping flavours.
 *
 * Legacy mappings keep a NULL-terminated page array directly in
 * vm_private_data.  Other mappings keep a struct vm_special_mapping there;
 * its ->fault callback, if set, handles the fault entirely, otherwise its
 * ->pages array is used.
 *
 * The array is walked up to vmf->pgoff entries, stopping early at a NULL
 * entry.  A page found there is returned in vmf->page with an extra
 * reference; ending on a NULL entry yields VM_FAULT_SIGBUS.
 */
static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page **slot;
        struct page *page;
        pgoff_t remaining;

        if (vma->vm_ops == &legacy_special_mapping_vmops) {
                slot = vma->vm_private_data;
        } else {
                struct vm_special_mapping *sm = vma->vm_private_data;

                if (sm->fault)
                        return sm->fault(sm, vmf->vma, vmf);

                slot = sm->pages;
        }

        /* Step to the requested entry, but never past the terminator. */
        remaining = vmf->pgoff;
        while (remaining && *slot) {
                remaining--;
                slot++;
        }

        page = *slot;
        if (!page)
                return VM_FAULT_SIGBUS;

        get_page(page);
        vmf->page = page;
        return 0;
}

/*
 * Allocate a vma for [addr, addr + len) in @mm, attach @ops and @priv to
 * it and insert it with insert_vm_struct().
 *
 * The vma's flags are @vm_flags combined with mm->def_flags,
 * VM_DONTEXPAND and VM_SOFTDIRTY, with the VM_LOCKED_MASK bits removed.
 *
 * Returns the new vma, or an ERR_PTR(): -ENOMEM when allocation fails,
 * otherwise the error reported by insert_vm_struct().
 */
static struct vm_area_struct *__install_special_mapping(
        struct mm_struct *mm,
        unsigned long addr, unsigned long len,
        unsigned long vm_flags, void *priv,
        const struct vm_operations_struct *ops)
{
        struct vm_area_struct *vma = vm_area_alloc(mm);
        int err;

        if (unlikely(!vma))
                return ERR_PTR(-ENOMEM);

        vma_set_range(vma, addr, addr + len, 0);
        vm_flags_init(vma, (vm_flags | mm->def_flags |
                      VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK);
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

        vma->vm_private_data = priv;
        vma->vm_ops = ops;

        err = insert_vm_struct(mm, vma);
        if (err) {
                /* Never linked, so the vma can simply be freed. */
                vm_area_free(vma);
                return ERR_PTR(err);
        }

        vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
        perf_event_mmap(vma);

        return vma;
}

bool vma_is_special_mapping(const struct vm_area_struct *vma,
        const struct vm_special_mapping *sm)
{
        return vma->vm_private_data == sm &&
                (vma->vm_ops == &special_mapping_vmops ||
                 vma->vm_ops == &legacy_special_mapping_vmops);
}

/*
 * Called with mm->mmap_lock held for writing.
 * Insert a new vma covering the given region, with the given flags.
 * Its pages are supplied by the given array of struct page *.
 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
 * The region past the last page supplied will always produce SIGBUS.
 * The array pointer and the pages it points to are assumed to stay alive
 * for as long as this mapping might exist.
 *
 * In this variant @spec is stored as the vma's private data and
 * special_mapping_vmops is installed.  special_mapping_fault() then takes
 * the page array from spec->pages, unless spec->fault is set, in which
 * case that callback handles the fault instead.
 *
 * Returns the new vma or an ERR_PTR() from __install_special_mapping().
 */
struct vm_area_struct *_install_special_mapping(
        struct mm_struct *mm,
        unsigned long addr, unsigned long len,
        unsigned long vm_flags, const struct vm_special_mapping *spec)
{
        /* The cast drops const: private data is stored as a plain void *. */
        return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
                                        &special_mapping_vmops);
}

/*
 * Legacy interface: install a special mapping backed directly by the
 * @pages array, using legacy_special_mapping_vmops.
 *
 * Returns 0 on success, or the negative errno carried by the ERR_PTR()
 * that __install_special_mapping() returned.
 */
int install_special_mapping(struct mm_struct *mm,
                            unsigned long addr, unsigned long len,
                            unsigned long vm_flags, struct page **pages)
{
        struct vm_area_struct *vma;

        vma = __install_special_mapping(mm, addr, len, vm_flags,
                                        (void *)pages,
                                        &legacy_special_mapping_vmops);

        return PTR_ERR_OR_ZERO(vma);
}

/*
 * Taken by mm_take_all_locks() and released by mm_drop_all_locks(); it is
 * what keeps the per-object "already locked" marker bits stable.
 */
static DEFINE_MUTEX(mm_all_locks_mutex);

/*
 * Write-lock the root rwsem of @anon_vma on behalf of mm_take_all_locks(),
 * unless that root has already been locked through another vma.
 *
 * Bit 0 of the root's rb_root.rb_root.rb_node pointer is used as the
 * "already locked" marker.
 */
static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
        if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
                /*
                 * The marker bit (LSB of the root's rb_node pointer) can't
                 * change from under us because we hold the
                 * mm_all_locks_mutex.
                 */
                down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
                /*
                 * We can safely modify the marker bit after taking the
                 * anon_vma->root->rwsem. If some other vma in this mm shares
                 * the same anon_vma we won't take it again.
                 *
                 * No need of atomic instructions here, the rb_node pointer
                 * can't change from under us thanks to the
                 * anon_vma->root->rwsem.
                 */
                if (__test_and_set_bit(0, (unsigned long *)
                                       &anon_vma->root->rb_root.rb_root.rb_node))
                        BUG();
        }
}

/*
 * Write-lock @mapping's i_mmap_rwsem on behalf of mm_take_all_locks(),
 * unless AS_MM_ALL_LOCKS shows it was already locked through another vma.
 * Note the order: the flag is set first, then the rwsem is taken.
 */
static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
{
        if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
                /*
                 * AS_MM_ALL_LOCKS can't change from under us because
                 * we hold the mm_all_locks_mutex.
                 *
                 * Operations on ->flags have to be atomic because
                 * even if AS_MM_ALL_LOCKS is stable thanks to the
                 * mm_all_locks_mutex, there may be other cpus
                 * changing other bitflags in parallel to us.
                 */
                if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
                        BUG();
                down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
        }
}

/*
 * This operation locks against the VM for all pte/vma/mm related
 * operations that could ever happen on a certain mm. This includes
 * vmtruncate, try_to_unmap, and all page faults.
 *
 * The caller must take the mmap_lock in write mode before calling
 * mm_take_all_locks(). The caller isn't allowed to release the
 * mmap_lock until mm_drop_all_locks() returns.
 *
 * mmap_lock in write mode is required in order to block all operations
 * that could modify pagetables and free pages without need of
 * altering the vma layout. It's also needed in write mode to avoid new
 * anon_vmas to be associated with existing vmas.
 *
 * A single task can't take more than one mm_take_all_locks() in a row
 * or it would deadlock.
 *
 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
 * mapping->flags avoid to take the same lock twice, if more than one
 * vma in this mm is backed by the same anon_vma or address_space.
 *
 * We take locks in following order, accordingly to comment at beginning
 * of mm/rmap.c:
 *   - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
 *     hugetlb mapping);
 *   - all vmas marked locked
 *   - all i_mmap_rwsem locks;
 *   - all anon_vma->rwsem locks.
 *
 * We can take all locks within these types randomly because the VM code
 * doesn't nest them and we protected from parallel mm_take_all_locks() by
 * mm_all_locks_mutex.
 *
 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
 * that may have to take thousand of locks.
 *
 * mm_take_all_locks() can fail if it's interrupted by signals.
 */
/*
 * Implementation notes: the mm's vmas are walked four times, each pass
 * taking one class of lock for every vma:
 *   1. vma_start_write() on every vma;
 *   2. i_mmap_rwsem of hugetlb file mappings;
 *   3. i_mmap_rwsem of all other file mappings;
 *   4. the root rwsem of every anon_vma on each vma's anon_vma_chain.
 *
 * Returns 0 with mm_all_locks_mutex still held (mm_drop_all_locks()
 * releases it).  Returns -EINTR if a signal is pending at any step; in
 * that case everything taken so far, including the mutex, has already
 * been released through mm_drop_all_locks().
 */
int mm_take_all_locks(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        struct anon_vma_chain *avc;
        VMA_ITERATOR(vmi, mm, 0);

        mmap_assert_write_locked(mm);

        mutex_lock(&mm_all_locks_mutex);

        /*
         * vma_start_write() does not have a complement in mm_drop_all_locks()
         * because vma_start_write() is always asymmetrical; it marks a VMA as
         * being written to until mmap_write_unlock() or mmap_write_downgrade()
         * is reached.
         */
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                vma_start_write(vma);
        }

        /* Pass 2: hugetlb file mappings only. */
        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->vm_file && vma->vm_file->f_mapping &&
                                is_vm_hugetlb_page(vma))
                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
        }

        /* Pass 3: every remaining (non-hugetlb) file mapping. */
        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->vm_file && vma->vm_file->f_mapping &&
                                !is_vm_hugetlb_page(vma))
                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
        }

        /* Pass 4: all anon_vmas reachable from each vma's chain. */
        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->anon_vma)
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                vm_lock_anon_vma(mm, avc->anon_vma);
        }

        return 0;

out_unlock:
        /*
         * The unlock helpers only act on objects whose marker bit is set,
         * so a partially completed locking pass is unwound correctly.
         */
        mm_drop_all_locks(mm);
        return -EINTR;
}

/*
 * Undo vm_lock_anon_vma(): if the root of @anon_vma carries the marker
 * bit, clear it and then call anon_vma_unlock_write().  Roots without the
 * marker are left untouched.
 */
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
        if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
                /*
                 * The marker bit (LSB of the root's rb_node pointer) can't
                 * change to 0 from under us because we hold the
                 * mm_all_locks_mutex.
                 *
                 * We must however clear the bitflag before unlocking
                 * the vma so the users using the anon_vma->rb_root will
                 * never see our bitflag.
                 *
                 * No need of atomic instructions here, the rb_node pointer
                 * can't change from under us until we release the
                 * anon_vma->root->rwsem.
                 */
                if (!__test_and_clear_bit(0, (unsigned long *)
                                          &anon_vma->root->rb_root.rb_root.rb_node))
                        BUG();
                anon_vma_unlock_write(anon_vma);
        }
}

/*
 * Undo vm_lock_mapping(): if @mapping is flagged AS_MM_ALL_LOCKS, release
 * its i_mmap write lock and then clear the flag.  Mappings without the
 * flag are left untouched.
 */
static void vm_unlock_mapping(struct address_space *mapping)
{
        if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
                /*
                 * AS_MM_ALL_LOCKS can't change to 0 from under us
                 * because we hold the mm_all_locks_mutex.
                 */
                i_mmap_unlock_write(mapping);
                /* Atomic clear: other bits in ->flags may change in parallel. */
                if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
                                        &mapping->flags))
                        BUG();
        }
}

/*
 * Release everything mm_take_all_locks() acquired, then drop
 * mm_all_locks_mutex.  Also used by mm_take_all_locks() itself to unwind
 * after it was interrupted by a signal.
 *
 * The mmap_lock cannot be released by the caller until
 * mm_drop_all_locks() returns.
 */
void mm_drop_all_locks(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        struct anon_vma_chain *avc;
        VMA_ITERATOR(vmi, mm, 0);

        mmap_assert_write_locked(mm);
        BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));

        /*
         * A single pass suffices: each helper checks the object's marker
         * bit and ignores objects that were never locked.
         */
        for_each_vma(vmi, vma) {
                if (vma->anon_vma)
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                vm_unlock_anon_vma(avc->anon_vma);
                if (vma->vm_file && vma->vm_file->f_mapping)
                        vm_unlock_mapping(vma->vm_file->f_mapping);
        }

        mutex_unlock(&mm_all_locks_mutex);
}

/*
 * Set up the vm_committed_as per-cpu counter, starting from zero.
 * A failure is caught by VM_BUG_ON().
 */
void __init mmap_init(void)
{
        int ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);

        VM_BUG_ON(ret);
}

/*
 * Initialise sysctl_user_reserve_kbytes.
 *
 * This is intended to prevent a user from starting a single memory hogging
 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
 * mode.
 *
 * The default value is min(3% of free memory, 128MB)
 * 128MB is enough to recover with sshd/login, bash, and top/kill.
 */
static int init_user_reserve(void)
{
        /* Currently free memory, converted from pages by K(). */
        unsigned long free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

        /* 1/32 of free memory, capped at SZ_128K kbytes. */
        sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K);

        return 0;
}
subsys_initcall(init_user_reserve);

/*
 * Initialise sysctl_admin_reserve_kbytes.
 *
 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
 * to log in and kill a memory hogging process.
 *
 * Systems with more than 256MB will reserve 8MB, enough to recover
 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
 * only reserve 3% of free pages by default.
 */
static int init_admin_reserve(void)
{
        /* Currently free memory, converted from pages by K(). */
        unsigned long free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

        /* 1/32 of free memory, capped at SZ_8K kbytes. */
        sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K);

        return 0;
}
subsys_initcall(init_admin_reserve);

/*
 * Reinitialise user and admin reserves if memory is added or removed.
 *
 * The default user reserve max is 128MB, and the default max for the
 * admin reserve is 8MB. These are usually, but not always, enough to
 * enable recovery from a memory hogging process using login/sshd, a shell,
 * and tools like top. It may make sense to increase or even disable the
 * reserve depending on the existence of swap or variations in the recovery
 * tools. So, the admin may have changed them.
 *
 * If memory is added and the reserves have been eliminated or increased above
 * the default max, then we'll trust the admin.
 *
 * If memory is removed and there isn't enough free memory, then we
 * need to reset the reserves.
 *
 * Otherwise keep the reserve set by the admin.
 */
static int reserve_mem_notifier(struct notifier_block *nb,
                             unsigned long action, void *data)
{
        unsigned long cur, free_kbytes;

        if (action == MEM_ONLINE) {
                /* Default max is 128MB. Leave alone if modified by operator. */
                cur = sysctl_user_reserve_kbytes;
                if (cur > 0 && cur < SZ_128K)
                        init_user_reserve();

                /* Default max is 8MB.  Leave alone if modified by operator. */
                cur = sysctl_admin_reserve_kbytes;
                if (cur > 0 && cur < SZ_8K)
                        init_admin_reserve();
        } else if (action == MEM_OFFLINE) {
                free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

                /* A reserve larger than what is left free gets recomputed. */
                if (sysctl_user_reserve_kbytes > free_kbytes) {
                        init_user_reserve();
                        pr_info("vm.user_reserve_kbytes reset to %lu\n",
                                sysctl_user_reserve_kbytes);
                }

                if (sysctl_admin_reserve_kbytes > free_kbytes) {
                        init_admin_reserve();
                        pr_info("vm.admin_reserve_kbytes reset to %lu\n",
                                sysctl_admin_reserve_kbytes);
                }
        }

        /* Every other action is ignored; the result is always NOTIFY_OK. */
        return NOTIFY_OK;
}

/*
 * Register reserve_mem_notifier() for memory hotplug events.  A failed
 * registration is only logged; the initcall still returns 0.
 */
static int __meminit init_reserve_notifier(void)
{
        int err = hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI);

        if (err)
                pr_err("Failed registering memory add/remove notifier for admin reserve\n");

        return 0;
}
subsys_initcall(init_reserve_notifier);
































































































































































































































































































































































































































































    9 

















    1 


































































































































































































    9 




















































































































    9 






























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Written by Mark Hemment, 1996 (markhe@nextd.demon.co.uk).
 *
 * (C) SGI 2006, Christoph Lameter
 *         Cleaned up and restructured to ease the addition of alternative
 *         implementations of SLAB allocators.
 * (C) Linux Foundation 2008-2013
 *      Unified interface for all slab allocators
 */

#ifndef _LINUX_SLAB_H
#define        _LINUX_SLAB_H

#include <linux/cache.h>
#include <linux/gfp.h>
#include <linux/overflow.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/percpu-refcount.h>
#include <linux/cleanup.h>
#include <linux/hash.h>

/*
 * Bit indices for the SLAB_* cache flags.  Each enumerator is turned into
 * a single-bit slab_flags_t value by __SLAB_FLAG_BIT().
 *
 * Entries guarded by a CONFIG_* option are compiled out when that option
 * is off, so the numeric position of later entries depends on the kernel
 * configuration.  Use the symbolic names, never hard-coded bit numbers.
 *
 * NOTE(review): __SLAB_FLAG_BIT() shifts a 32-bit 1U, so the enum
 * presumably must never exceed 32 entries - confirm against the width of
 * slab_flags_t.
 */
enum _slab_flag_bits {
        _SLAB_CONSISTENCY_CHECKS,
        _SLAB_RED_ZONE,
        _SLAB_POISON,
        _SLAB_KMALLOC,
        _SLAB_HWCACHE_ALIGN,
        _SLAB_CACHE_DMA,
        _SLAB_CACHE_DMA32,
        _SLAB_STORE_USER,
        _SLAB_PANIC,
        _SLAB_TYPESAFE_BY_RCU,
        _SLAB_TRACE,
#ifdef CONFIG_DEBUG_OBJECTS
        _SLAB_DEBUG_OBJECTS,
#endif
        _SLAB_NOLEAKTRACE,
        _SLAB_NO_MERGE,
#ifdef CONFIG_FAILSLAB
        _SLAB_FAILSLAB,
#endif
#ifdef CONFIG_MEMCG_KMEM
        _SLAB_ACCOUNT,
#endif
#ifdef CONFIG_KASAN_GENERIC
        _SLAB_KASAN,
#endif
        _SLAB_NO_USER_FLAGS,
#ifdef CONFIG_KFENCE
        _SLAB_SKIP_KFENCE,
#endif
#ifndef CONFIG_SLUB_TINY
        _SLAB_RECLAIM_ACCOUNT,
#endif
        _SLAB_OBJECT_POISON,
        _SLAB_CMPXCHG_DOUBLE,
#ifdef CONFIG_SLAB_OBJ_EXT
        _SLAB_NO_OBJ_EXT,
#endif
        /* Not a flag: number of flag bits in use for this configuration. */
        _SLAB_FLAGS_LAST_BIT
};

/* Build the single-bit slab_flags_t value for bit index @nr. */
#define __SLAB_FLAG_BIT(nr)        ((slab_flags_t __force)(1U << (nr)))
/* All-zero flag value, used where a flag's config option is disabled. */
#define __SLAB_FLAG_UNUSED        ((slab_flags_t __force)(0U))

/*
 * Flags to pass to kmem_cache_create().
 * The ones marked DEBUG need CONFIG_SLUB_DEBUG enabled, otherwise are no-op
 */
/* DEBUG: Perform (expensive) checks on alloc/free */
#define SLAB_CONSISTENCY_CHECKS        __SLAB_FLAG_BIT(_SLAB_CONSISTENCY_CHECKS)
/* DEBUG: Red zone objs in a cache */
#define SLAB_RED_ZONE                __SLAB_FLAG_BIT(_SLAB_RED_ZONE)
/* DEBUG: Poison objects */
#define SLAB_POISON                __SLAB_FLAG_BIT(_SLAB_POISON)
/* Indicate a kmalloc slab */
#define SLAB_KMALLOC                __SLAB_FLAG_BIT(_SLAB_KMALLOC)
/* Align objs on cache lines */
#define SLAB_HWCACHE_ALIGN        __SLAB_FLAG_BIT(_SLAB_HWCACHE_ALIGN)
/* Use GFP_DMA memory */
#define SLAB_CACHE_DMA                __SLAB_FLAG_BIT(_SLAB_CACHE_DMA)
/* Use GFP_DMA32 memory */
#define SLAB_CACHE_DMA32        __SLAB_FLAG_BIT(_SLAB_CACHE_DMA32)
/* DEBUG: Store the last owner for bug hunting */
#define SLAB_STORE_USER                __SLAB_FLAG_BIT(_SLAB_STORE_USER)
/* Panic if kmem_cache_create() fails (instead of returning an error) */
#define SLAB_PANIC                __SLAB_FLAG_BIT(_SLAB_PANIC)
/*
 * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
 *
 * This delays freeing the SLAB page by a grace period, it does _NOT_
 * delay object freeing. This means that if you do kmem_cache_free()
 * that memory location is free to be reused at any time. Thus it may
 * be possible to see another object there in the same RCU grace period.
 *
 * This feature only ensures the memory location backing the object
 * stays valid, the trick to using this is relying on an independent
 * object validation pass. Something like:
 *
 * begin:
 *  rcu_read_lock();
 *  obj = lockless_lookup(key);
 *  if (obj) {
 *    if (!try_get_ref(obj)) { // might fail for free objects
 *      rcu_read_unlock();
 *      goto begin;
 *    }
 *
 *    if (obj->key != key) { // not the object we expected
 *      put_ref(obj);
 *      rcu_read_unlock();
 *      goto begin;
 *    }
 *  }
 *  rcu_read_unlock();
 *
 * This is useful if we need to approach a kernel structure obliquely,
 * from its address obtained without the usual locking. We can lock
 * the structure to stabilize it and check it's still at the given address,
 * only if we can be sure that the memory has not been meanwhile reused
 * for some other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 *
 * Note that it is not possible to acquire a lock within a structure
 * allocated with SLAB_TYPESAFE_BY_RCU without first acquiring a reference
 * as described above.  The reason is that SLAB_TYPESAFE_BY_RCU pages
 * are not zeroed before being given to the slab, which means that any
 * locks must be initialized after each and every kmem_cache_alloc().
 * Alternatively, make the ctor passed to kmem_cache_create() initialize
 * the locks at page-allocation time, as is done in __i915_request_ctor(),
 * sighand_ctor(), and anon_vma_ctor().  Such a ctor permits readers
 * to safely acquire those ctor-initialized locks under rcu_read_lock()
 * protection.
 *
 * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU.
 */
/* Defer freeing slabs to RCU (slab pages, not individual objects) */
#define SLAB_TYPESAFE_BY_RCU        __SLAB_FLAG_BIT(_SLAB_TYPESAFE_BY_RCU)
/* Trace allocations and frees */
#define SLAB_TRACE                __SLAB_FLAG_BIT(_SLAB_TRACE)

/*
 * Flag to prevent checks on free.
 * Expands to the zero flag value when CONFIG_DEBUG_OBJECTS is disabled.
 */
#ifdef CONFIG_DEBUG_OBJECTS
# define SLAB_DEBUG_OBJECTS        __SLAB_FLAG_BIT(_SLAB_DEBUG_OBJECTS)
#else
# define SLAB_DEBUG_OBJECTS        __SLAB_FLAG_UNUSED
#endif

/* Avoid kmemleak tracing */
#define SLAB_NOLEAKTRACE        __SLAB_FLAG_BIT(_SLAB_NOLEAKTRACE)

/*
 * Prevent merging with compatible kmem caches. This flag should be used
 * cautiously. Valid use cases:
 *
 * - caches created for self-tests (e.g. kunit)
 * - general caches created and used by a subsystem, only when a
 *   (subsystem-specific) debug option is enabled
 * - performance critical caches, should be very rare and consulted with slab
 *   maintainers, and not used together with CONFIG_SLUB_TINY
 */
#define SLAB_NO_MERGE                __SLAB_FLAG_BIT(_SLAB_NO_MERGE)

/* Fault injection mark */
#ifdef CONFIG_FAILSLAB
# define SLAB_FAILSLAB                __SLAB_FLAG_BIT(_SLAB_FAILSLAB)
#else
# define SLAB_FAILSLAB                __SLAB_FLAG_UNUSED
#endif
/* Account to memcg */
#ifdef CONFIG_MEMCG_KMEM
# define SLAB_ACCOUNT                __SLAB_FLAG_BIT(_SLAB_ACCOUNT)
#else
# define SLAB_ACCOUNT                __SLAB_FLAG_UNUSED
#endif

#ifdef CONFIG_KASAN_GENERIC
#define SLAB_KASAN                __SLAB_FLAG_BIT(_SLAB_KASAN)
#else
#define SLAB_KASAN                __SLAB_FLAG_UNUSED
#endif

/*
 * Ignore user specified debugging flags.
 * Intended for caches created for self-tests so they have only flags
 * specified in the code and other flags are ignored.
 */
#define SLAB_NO_USER_FLAGS        __SLAB_FLAG_BIT(_SLAB_NO_USER_FLAGS)

#ifdef CONFIG_KFENCE
#define SLAB_SKIP_KFENCE        __SLAB_FLAG_BIT(_SLAB_SKIP_KFENCE)
#else
#define SLAB_SKIP_KFENCE        __SLAB_FLAG_UNUSED
#endif

/* The following flags affect the page allocator grouping pages by mobility */
/* Objects are reclaimable */
#ifndef CONFIG_SLUB_TINY
#define SLAB_RECLAIM_ACCOUNT        __SLAB_FLAG_BIT(_SLAB_RECLAIM_ACCOUNT)
#else
#define SLAB_RECLAIM_ACCOUNT        __SLAB_FLAG_UNUSED
#endif
#define SLAB_TEMPORARY                SLAB_RECLAIM_ACCOUNT        /* Objects are short-lived */

/* Slab created using create_boot_cache */
#ifdef CONFIG_SLAB_OBJ_EXT
#define SLAB_NO_OBJ_EXT                __SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT)
#else
#define SLAB_NO_OBJ_EXT                __SLAB_FLAG_UNUSED
#endif

/*
 * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
 *
 * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault.
 *
 * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can.
 * Both make kfree a no-op.
 */
#define ZERO_SIZE_PTR ((void *)16)

/*
 * True when x is NULL or ZERO_SIZE_PTR: matches any value that, cast to
 * unsigned long, is <= ZERO_SIZE_PTR.
 */
#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
                                (unsigned long)ZERO_SIZE_PTR)

#include <linux/kasan.h>

struct list_lru;
struct mem_cgroup;
/*
 * struct kmem_cache related prototypes
 */
bool slab_is_available(void);

struct kmem_cache *kmem_cache_create(const char *name, unsigned int size,
                        unsigned int align, slab_flags_t flags,
                        void (*ctor)(void *));
struct kmem_cache *kmem_cache_create_usercopy(const char *name,
                        unsigned int size, unsigned int align,
                        slab_flags_t flags,
                        unsigned int useroffset, unsigned int usersize,
                        void (*ctor)(void *));
void kmem_cache_destroy(struct kmem_cache *s);
int kmem_cache_shrink(struct kmem_cache *s);

/*
 * Please use this macro to create slab caches. Simply specify the
 * name of the structure and maybe some flags that are listed above.
 *
 * The alignment of the struct determines object alignment. If you
 * f.e. add ____cacheline_aligned_in_smp to the struct declaration
 * then the objects will be properly aligned in SMP configurations.
 */
#define KMEM_CACHE(__struct, __flags)                                        \
                kmem_cache_create(#__struct, sizeof(struct __struct),        \
                        __alignof__(struct __struct), (__flags), NULL)

/*
 * To whitelist a single field for copying to/from usercopy, use this
 * macro instead of KMEM_CACHE() above.
 */
#define KMEM_CACHE_USERCOPY(__struct, __flags, __field)                        \
                kmem_cache_create_usercopy(#__struct,                        \
                        sizeof(struct __struct),                        \
                        __alignof__(struct __struct), (__flags),        \
                        offsetof(struct __struct, __field),                \
                        sizeof_field(struct __struct, __field), NULL)

/*
 * Common kmalloc functions provided by all allocators
 */
void * __must_check krealloc_noprof(const void *objp, size_t new_size,
                                    gfp_t flags) __realloc_size(2);
#define krealloc(...)                                alloc_hooks(krealloc_noprof(__VA_ARGS__))

void kfree(const void *objp);
void kfree_sensitive(const void *objp);
size_t __ksize(const void *objp);

DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T))

/**
 * ksize - Report actual allocation size of associated object
 *
 * @objp: Pointer returned from a prior kmalloc()-family allocation.
 *
 * This should not be used for writing beyond the originally requested
 * allocation size. Either use krealloc() or round up the allocation size
 * with kmalloc_size_roundup() prior to allocation. If this is used to
 * access beyond the originally requested allocation size, UBSAN_BOUNDS
 * and/or FORTIFY_SOURCE may trip, since they only know about the
 * originally allocated size via the __alloc_size attribute.
 */
size_t ksize(const void *objp);

#ifdef CONFIG_PRINTK
bool kmem_dump_obj(void *object);
#else
/* Without CONFIG_PRINTK this is a stub that always returns false. */
static inline bool kmem_dump_obj(void *object) { return false; }
#endif

/*
 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 * alignment larger than the alignment of a 64-bit integer.
 * Setting ARCH_DMA_MINALIGN in arch headers allows that.
 */
#ifdef ARCH_HAS_DMA_MINALIGN
#if ARCH_DMA_MINALIGN > 8 && !defined(ARCH_KMALLOC_MINALIGN)
#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
#endif
#endif

#ifndef ARCH_KMALLOC_MINALIGN
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#elif ARCH_KMALLOC_MINALIGN > 8
#define KMALLOC_MIN_SIZE ARCH_KMALLOC_MINALIGN
#define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE)
#endif

/*
 * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
 * Intended for arches that get misalignment faults even for 64 bit integer
 * aligned buffers.
 */
#ifndef ARCH_SLAB_MINALIGN
#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
#endif

/*
 * Arches can define this function if they want to decide the minimum slab
 * alignment at runtime. The value returned by the function must be a power
 * of two and >= ARCH_SLAB_MINALIGN.
 */
#ifndef arch_slab_minalign
/*
 * Default used when the architecture does not define arch_slab_minalign:
 * report the compile-time ARCH_SLAB_MINALIGN.
 */
static inline unsigned int arch_slab_minalign(void)
{
        return ARCH_SLAB_MINALIGN;
}
#endif

/*
 * kmem_cache_alloc and friends return pointers aligned to ARCH_SLAB_MINALIGN.
 * kmalloc and friends return pointers aligned to both ARCH_KMALLOC_MINALIGN
 * and ARCH_SLAB_MINALIGN, but here we only assume the former alignment.
 */
#define __assume_kmalloc_alignment __assume_aligned(ARCH_KMALLOC_MINALIGN)
#define __assume_slab_alignment __assume_aligned(ARCH_SLAB_MINALIGN)
#define __assume_page_alignment __assume_aligned(PAGE_SIZE)

/*
 * Kmalloc array related definitions
 */

/*
 * SLUB directly allocates requests fitting in to an order-1 page
 * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
 */
#define KMALLOC_SHIFT_HIGH        (PAGE_SHIFT + 1)
#define KMALLOC_SHIFT_MAX        (MAX_PAGE_ORDER + PAGE_SHIFT)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW        3
#endif

/* Maximum allocatable size */
#define KMALLOC_MAX_SIZE        (1UL << KMALLOC_SHIFT_MAX)
/* Maximum size for which we actually use a slab cache */
#define KMALLOC_MAX_CACHE_SIZE        (1UL << KMALLOC_SHIFT_HIGH)
/* Maximum order allocatable via the slab allocator */
#define KMALLOC_MAX_ORDER        (KMALLOC_SHIFT_MAX - PAGE_SHIFT)

/*
 * Kmalloc subsystem.
 */
#ifndef KMALLOC_MIN_SIZE
#define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
#endif

/*
 * This restriction comes from byte sized index implementation.
 * Page size is normally 2^12 bytes and, in this case, if we want to use
 * byte sized index which can represent 2^8 entries, the size of the object
 * should be greater than or equal to 2^12 / 2^8 = 2^4 = 16.
 * If minimum size of kmalloc is less than 16, we use it as minimum object
 * size and give up on using the byte sized index.
 */
#define SLAB_OBJ_MIN_SIZE      (KMALLOC_MIN_SIZE < 16 ? \
                               (KMALLOC_MIN_SIZE) : 16)

#ifdef CONFIG_RANDOM_KMALLOC_CACHES
#define RANDOM_KMALLOC_CACHES_NR        15 // # of cache copies
#else
#define RANDOM_KMALLOC_CACHES_NR        0
#endif

/*
 * Whenever changing this, take care that kmalloc_type() and
 * create_kmalloc_caches() still work as intended.
 *
 * KMALLOC_NORMAL can contain only unaccounted objects whereas KMALLOC_CGROUP
 * is for accounted but unreclaimable and non-dma objects. All the other
 * kmem caches can have both accounted and unaccounted objects.
 *
 * A type whose config option is disabled (or KMALLOC_RECLAIM when
 * CONFIG_SLUB_TINY is enabled) is defined as an alias of KMALLOC_NORMAL
 * instead of getting an index of its own.
 */
enum kmalloc_cache_type {
        KMALLOC_NORMAL = 0,
#ifndef CONFIG_ZONE_DMA
        KMALLOC_DMA = KMALLOC_NORMAL,
#endif
#ifndef CONFIG_MEMCG_KMEM
        KMALLOC_CGROUP = KMALLOC_NORMAL,
#endif
        /*
         * The inclusive range RANDOM_START..RANDOM_END holds
         * RANDOM_KMALLOC_CACHES_NR + 1 entries, starting at KMALLOC_NORMAL.
         */
        KMALLOC_RANDOM_START = KMALLOC_NORMAL,
        KMALLOC_RANDOM_END = KMALLOC_RANDOM_START + RANDOM_KMALLOC_CACHES_NR,
#ifdef CONFIG_SLUB_TINY
        KMALLOC_RECLAIM = KMALLOC_NORMAL,
#else
        KMALLOC_RECLAIM,
#endif
#ifdef CONFIG_ZONE_DMA
        KMALLOC_DMA,
#endif
#ifdef CONFIG_MEMCG_KMEM
        KMALLOC_CGROUP,
#endif
        /* One past the last index; sizes the first dimension of kmalloc_caches[] */
        NR_KMALLOC_TYPES
};

extern struct kmem_cache *
kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];

/*
 * Define gfp bits that should not be set for KMALLOC_NORMAL.
 */
#define KMALLOC_NOT_NORMAL_BITS                                        \
        (__GFP_RECLAIMABLE |                                        \
        (IS_ENABLED(CONFIG_ZONE_DMA)   ? __GFP_DMA : 0) |        \
        (IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0))

extern unsigned long random_kmalloc_seed;

/*
 * Select the kmalloc cache type for an allocation from its gfp @flags.
 *
 * @caller is only used when CONFIG_RANDOM_KMALLOC_CACHES is enabled: it is
 * hashed together with random_kmalloc_seed to pick one of the cache copies
 * for allocations that would otherwise be KMALLOC_NORMAL.
 */
static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags, unsigned long caller)
{
        /*
         * The most common case is KMALLOC_NORMAL, so test for it
         * with a single branch for all the relevant flags.
         */
        if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0))
#ifdef CONFIG_RANDOM_KMALLOC_CACHES
                /* RANDOM_KMALLOC_CACHES_NR (=15) copies + the KMALLOC_NORMAL */
                return KMALLOC_RANDOM_START + hash_64(caller ^ random_kmalloc_seed,
                                                      ilog2(RANDOM_KMALLOC_CACHES_NR + 1));
#else
                return KMALLOC_NORMAL;
#endif

        /*
         * At least one of the flags has to be set. Their priorities in
         * decreasing order are:
         *  1) __GFP_DMA
         *  2) __GFP_RECLAIMABLE
         *  3) __GFP_ACCOUNT
         */
        if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA))
                return KMALLOC_DMA;
        /*
         * Without CONFIG_MEMCG_KMEM, __GFP_ACCOUNT is not part of
         * KMALLOC_NOT_NORMAL_BITS, so KMALLOC_RECLAIM is the only type left.
         */
        if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE))
                return KMALLOC_RECLAIM;
        else
                return KMALLOC_CGROUP;
}

/*
 * Figure out which kmalloc slab an allocation of a certain size
 * belongs to.
 * 0 = zero alloc
 * 1 =  65 .. 96 bytes
 * 2 = 129 .. 192 bytes
 * n = 2^(n-1)+1 .. 2^n
 *
 * Note: __kmalloc_index() is compile-time optimized, and not runtime optimized;
 * typical usage is via kmalloc_index() and therefore evaluated at compile-time.
 * Callers where !size_is_constant should only be test modules, where runtime
 * overheads of __kmalloc_index() can be tolerated.  Also see kmalloc_slab().
 */
static __always_inline unsigned int __kmalloc_index(size_t size,
                                                    bool size_is_constant)
{
        /* Zero-sized requests map to index 0 */
        if (!size)
                return 0;

        /* Everything up to the minimum size shares the smallest cache */
        if (size <= KMALLOC_MIN_SIZE)
                return KMALLOC_SHIFT_LOW;

        /*
         * The 96 and 192 byte buckets use indices 1 and 2; each is only
         * considered when KMALLOC_MIN_SIZE is small enough (<= 32 / <= 64).
         */
        if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96)
                return 1;
        if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192)
                return 2;
        /* Power-of-two buckets: index n covers sizes up to 2^n */
        if (size <=          8) return 3;
        if (size <=         16) return 4;
        if (size <=         32) return 5;
        if (size <=         64) return 6;
        if (size <=        128) return 7;
        if (size <=        256) return 8;
        if (size <=        512) return 9;
        if (size <=       1024) return 10;
        if (size <=   2 * 1024) return 11;
        if (size <=   4 * 1024) return 12;
        if (size <=   8 * 1024) return 13;
        if (size <=  16 * 1024) return 14;
        if (size <=  32 * 1024) return 15;
        if (size <=  64 * 1024) return 16;
        if (size <= 128 * 1024) return 17;
        if (size <= 256 * 1024) return 18;
        if (size <= 512 * 1024) return 19;
        if (size <= 1024 * 1024) return 20;
        if (size <=  2 * 1024 * 1024) return 21;

        /*
         * Size is above the largest bucket (2 MiB): fail the build for
         * constant sizes, otherwise (or with CONFIG_PROFILE_ALL_BRANCHES)
         * hit BUG() at run time.
         */
        if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant)
                BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()");
        else
                BUG();

        /* Will never be reached. Needed because the compiler may complain */
        return -1;
}
static_assert(PAGE_SHIFT <= 20);
#define kmalloc_index(s) __kmalloc_index(s, true)

#include <linux/alloc_tag.h>

void *__kmalloc_noprof(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1);
#define __kmalloc(...)                                alloc_hooks(__kmalloc_noprof(__VA_ARGS__))

/**
 * kmem_cache_alloc - Allocate an object
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc().
 *
 * Allocate an object from this cache.
 * See kmem_cache_zalloc() for a shortcut of adding __GFP_ZERO to flags.
 *
 * Return: pointer to the new object or %NULL in case of error
 */
void *kmem_cache_alloc_noprof(struct kmem_cache *cachep,
                              gfp_t flags) __assume_slab_alignment __malloc;
#define kmem_cache_alloc(...)                        alloc_hooks(kmem_cache_alloc_noprof(__VA_ARGS__))

void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
                            gfp_t gfpflags) __assume_slab_alignment __malloc;
#define kmem_cache_alloc_lru(...)        alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__))

void kmem_cache_free(struct kmem_cache *s, void *objp);

/*
 * Bulk allocation and freeing operations. These are accelerated in an
 * allocator specific way to avoid taking locks repeatedly or building
 * metadata structures unnecessarily.
 *
 * Note that interrupts must be enabled when calling these functions.
 */
void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);

int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p);
#define kmem_cache_alloc_bulk(...)        alloc_hooks(kmem_cache_alloc_bulk_noprof(__VA_ARGS__))

/*
 * Forward to kmem_cache_free_bulk() with a NULL cache, passing @size and @p
 * through unchanged.
 */
static __always_inline void kfree_bulk(size_t size, void **p)
{
        kmem_cache_free_bulk(NULL, size, p);
}

void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment
                                                         __alloc_size(1);
#define __kmalloc_node(...)                        alloc_hooks(__kmalloc_node_noprof(__VA_ARGS__))

void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags,
                                   int node) __assume_slab_alignment __malloc;
#define kmem_cache_alloc_node(...)        alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__))

void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t flags, size_t size)
                    __assume_kmalloc_alignment __alloc_size(3);

void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags,
                int node, size_t size) __assume_kmalloc_alignment
                                                __alloc_size(4);
#define kmalloc_trace(...)                        alloc_hooks(kmalloc_trace_noprof(__VA_ARGS__))

#define kmalloc_node_trace(...)                        alloc_hooks(kmalloc_node_trace_noprof(__VA_ARGS__))

void *kmalloc_large_noprof(size_t size, gfp_t flags) __assume_page_alignment
                                              __alloc_size(1);
#define kmalloc_large(...)                        alloc_hooks(kmalloc_large_noprof(__VA_ARGS__))

void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) __assume_page_alignment
                                                             __alloc_size(1);
#define kmalloc_large_node(...)                        alloc_hooks(kmalloc_large_node_noprof(__VA_ARGS__))

/**
 * kmalloc - allocate kernel memory
 * @size: how many bytes of memory are required.
 * @flags: describe the allocation context
 *
 * kmalloc is the normal method of allocating memory
 * for objects smaller than page size in the kernel.
 *
 * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN
 * bytes. For a @size that is a power of two, the alignment is also guaranteed
 * to be at least the size.
 *
 * The @flags argument may be one of the GFP flags defined at
 * include/linux/gfp_types.h and described at
 * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
 *
 * The recommended usage of the @flags is described at
 * :ref:`Documentation/core-api/memory-allocation.rst <memory_allocation>`
 *
 * Below is a brief outline of the most useful GFP flags
 *
 * %GFP_KERNEL
 *        Allocate normal kernel ram. May sleep.
 *
 * %GFP_NOWAIT
 *        Allocation will not sleep.
 *
 * %GFP_ATOMIC
 *        Allocation will not sleep.  May use emergency pools.
 *
 * Also it is possible to set different flags by OR'ing
 * in one or more of the following additional @flags:
 *
 * %__GFP_ZERO
 *        Zero the allocated memory before returning. Also see kzalloc().
 *
 * %__GFP_HIGH
 *        This allocation has high priority and may use emergency pools.
 *
 * %__GFP_NOFAIL
 *        Indicate that this allocation is in no way allowed to fail
 *        (think twice before using).
 *
 * %__GFP_NORETRY
 *        If memory is not immediately available,
 *        then give up at once.
 *
 * %__GFP_NOWARN
 *        If allocation fails, don't issue any warnings.
 *
 * %__GFP_RETRY_MAYFAIL
 *        Try really hard to succeed the allocation but fail
 *        eventually.
 */
static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t flags)
{
        /* Run-time (or zero) sizes go through the out-of-line __kmalloc path. */
        if (!__builtin_constant_p(size) || !size)
                return __kmalloc_noprof(size, flags);

        /* Constant sizes too big for any kmalloc cache bypass the caches. */
        if (size > KMALLOC_MAX_CACHE_SIZE)
                return kmalloc_large_noprof(size, flags);

        /* Otherwise pick the cache from the allocation type and size index. */
        return kmalloc_trace_noprof(
                        kmalloc_caches[kmalloc_type(flags, _RET_IP_)][kmalloc_index(size)],
                        flags, size);
}
#define kmalloc(...)                                alloc_hooks(kmalloc_noprof(__VA_ARGS__))

static __always_inline __alloc_size(1) void *kmalloc_node_noprof(size_t size, gfp_t flags, int node)
{
        /* Run-time (or zero) sizes go through the out-of-line node allocator. */
        if (!__builtin_constant_p(size) || !size)
                return __kmalloc_node_noprof(size, flags, node);

        /* Constant sizes too big for any kmalloc cache bypass the caches. */
        if (size > KMALLOC_MAX_CACHE_SIZE)
                return kmalloc_large_node_noprof(size, flags, node);

        /* Otherwise pick the cache from the allocation type and size index. */
        return kmalloc_node_trace_noprof(
                        kmalloc_caches[kmalloc_type(flags, _RET_IP_)][kmalloc_index(size)],
                        flags, node, size);
}
#define kmalloc_node(...)                        alloc_hooks(kmalloc_node_noprof(__VA_ARGS__))

/**
 * kmalloc_array - allocate memory for an array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Return: %NULL if @n * @size overflows size_t; otherwise the result of
 * kmalloc_noprof() for the total byte count.
 */
static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
{
        size_t bytes;

        if (unlikely(check_mul_overflow(n, size, &bytes)))
                return NULL;
        /*
         * kmalloc_noprof() already distinguishes constant from run-time
         * sizes itself, so no separate __builtin_constant_p() branch is
         * needed here.
         */
        return kmalloc_noprof(bytes, flags);
}
#define kmalloc_array(...)                        alloc_hooks(kmalloc_array_noprof(__VA_ARGS__))

/**
 * krealloc_array - reallocate memory for an array.
 * @p: pointer to the memory chunk to reallocate
 * @new_n: new number of elements to alloc
 * @new_size: new size of a single member of the array
 * @flags: the type of memory to allocate (see kmalloc)
 */
static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p,
                                                                       size_t new_n,
                                                                       size_t new_size,
                                                                       gfp_t flags)
{
        size_t total;
        bool overflow = check_mul_overflow(new_n, new_size, &total);

        /* Refuse the request outright if new_n * new_size does not fit. */
        if (unlikely(overflow))
                return NULL;

        return krealloc_noprof(p, total, flags);
}
#define krealloc_array(...)                        alloc_hooks(krealloc_array_noprof(__VA_ARGS__))

/**
 * kcalloc - allocate memory for an array. The memory is set to zero.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 */
#define kcalloc(n, size, flags)                kmalloc_array(n, size, (flags) | __GFP_ZERO)

void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, int node,
                                  unsigned long caller) __alloc_size(1);
#define kmalloc_node_track_caller(...)                \
        alloc_hooks(kmalloc_node_track_caller_noprof(__VA_ARGS__, _RET_IP_))

/*
 * kmalloc_track_caller is a special version of kmalloc that records the
 * calling function of the routine calling it for slab leak tracking instead
 * of just the calling function (confusing, eh?).
 * It's useful when the call to kmalloc comes from a widely-used standard
 * allocator where we care about the real place the memory allocation
 * request comes from.
 */
#define kmalloc_track_caller(...)                kmalloc_node_track_caller(__VA_ARGS__, NUMA_NO_NODE)

#define kmalloc_track_caller_noprof(...)        \
                kmalloc_node_track_caller_noprof(__VA_ARGS__, NUMA_NO_NODE, _RET_IP_)

static inline __alloc_size(1, 2) void *kmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags,
                                                          int node)
{
        size_t total;

        /* Fail without allocating if n * size does not fit in size_t. */
        if (unlikely(check_mul_overflow(n, size, &total)))
                return NULL;

        /* Only compile-time-constant dimensions use the inline size dispatch. */
        if (!(__builtin_constant_p(n) && __builtin_constant_p(size)))
                return __kmalloc_node_noprof(total, flags, node);

        return kmalloc_node_noprof(total, flags, node);
}
#define kmalloc_array_node(...)                        alloc_hooks(kmalloc_array_node_noprof(__VA_ARGS__))

#define kcalloc_node(_n, _size, _flags, _node)        \
        kmalloc_array_node(_n, _size, (_flags) | __GFP_ZERO, _node)

/*
 * Shortcuts
 */
#define kmem_cache_zalloc(_k, _flags)                kmem_cache_alloc(_k, (_flags)|__GFP_ZERO)

/**
 * kzalloc - allocate memory. The memory is set to zero.
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate (see kmalloc).
 */
static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags)
{
        /* Same request as kmalloc_noprof(), with __GFP_ZERO added to @flags. */
        gfp_t zeroing_flags = flags | __GFP_ZERO;

        return kmalloc_noprof(size, zeroing_flags);
}
#define kzalloc(...)                                alloc_hooks(kzalloc_noprof(__VA_ARGS__))
#define kzalloc_node(_size, _flags, _node)        kmalloc_node(_size, (_flags)|__GFP_ZERO, _node)

extern void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) __alloc_size(1);
#define kvmalloc_node(...)                        alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__))

#define kvmalloc(_size, _flags)                        kvmalloc_node(_size, _flags, NUMA_NO_NODE)
#define kvmalloc_noprof(_size, _flags)                kvmalloc_node_noprof(_size, _flags, NUMA_NO_NODE)
#define kvzalloc(_size, _flags)                        kvmalloc(_size, (_flags)|__GFP_ZERO)

#define kvzalloc_node(_size, _flags, _node)        kvmalloc_node(_size, (_flags)|__GFP_ZERO, _node)

static inline __alloc_size(1, 2) void *
kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node)
{
        size_t total;

        /* n * size must fit in size_t; otherwise fail without allocating. */
        return unlikely(check_mul_overflow(n, size, &total)) ?
                NULL : kvmalloc_node_noprof(total, flags, node);
}

#define kvmalloc_array_noprof(...)                kvmalloc_array_node_noprof(__VA_ARGS__, NUMA_NO_NODE)
#define kvcalloc_node_noprof(_n,_s,_f,_node)        kvmalloc_array_node_noprof(_n,_s,(_f)|__GFP_ZERO,_node)
#define kvcalloc_noprof(...)                        kvcalloc_node_noprof(__VA_ARGS__, NUMA_NO_NODE)

#define kvmalloc_array(...)                        alloc_hooks(kvmalloc_array_noprof(__VA_ARGS__))
#define kvcalloc_node(...)                        alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__))
#define kvcalloc(...)                                alloc_hooks(kvcalloc_noprof(__VA_ARGS__))

extern void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
                      __realloc_size(3);
#define kvrealloc(...)                                alloc_hooks(kvrealloc_noprof(__VA_ARGS__))

extern void kvfree(const void *addr);
DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T))

extern void kvfree_sensitive(const void *addr, size_t len);

unsigned int kmem_cache_size(struct kmem_cache *s);

/**
 * kmalloc_size_roundup - Report allocation bucket size for the given size
 *
 * @size: Number of bytes to round up from.
 *
 * This returns the number of bytes that would be available in a kmalloc()
 * allocation of @size bytes. For example, a 126 byte request would be
 * rounded up to the next sized kmalloc bucket, 128 bytes. (This is strictly
 * for the general-purpose kmalloc()-based allocations, and is not for the
 * pre-sized kmem_cache_alloc()-based allocations.)
 *
 * Use this to kmalloc() the full bucket size ahead of time instead of using
 * ksize() to query the size after an allocation.
 */
size_t kmalloc_size_roundup(size_t size);

void __init kmem_cache_init_late(void);

#endif        /* _LINUX_SLAB_H */





























































































































































































































































    1 













1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
/*
 * net/tipc/bearer.h: Include file for TIPC bearer code
 *
 * Copyright (c) 1996-2006, 2013-2016, Ericsson AB
 * Copyright (c) 2005, 2010-2011, Wind River Systems
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _TIPC_BEARER_H
#define _TIPC_BEARER_H

#include "netlink.h"
#include "core.h"
#include "msg.h"
#include <net/genetlink.h>

#define MAX_MEDIA        3

/* Identifiers associated with TIPC message header media address info
 * - address info field is 32 bytes long
 * - the field's actual content and length is defined per media
 * - remaining unused bytes in the field are set to zero
 */
#define TIPC_MEDIA_INFO_SIZE        32
#define TIPC_MEDIA_TYPE_OFFSET        3
#define TIPC_MEDIA_ADDR_OFFSET        4

/*
 * Identifiers of supported TIPC media types
 */
#define TIPC_MEDIA_TYPE_ETH        1
#define TIPC_MEDIA_TYPE_IB        2
#define TIPC_MEDIA_TYPE_UDP        3

/* Minimum bearer MTU */
#define TIPC_MIN_BEARER_MTU        (MAX_H_SIZE + INT_H_SIZE)

/* Identifiers for distinguishing between broadcast/multicast and replicast
 */
#define TIPC_BROADCAST_SUPPORT  1
#define TIPC_REPLICAST_SUPPORT  2

/**
 * struct tipc_media_addr - destination address used by TIPC bearers
 * @value: address info (format defined by media), TIPC_MEDIA_INFO_SIZE bytes
 * @media_id: TIPC media type identifier
 * @broadcast: non-zero if address is a broadcast address
 */
struct tipc_media_addr {
        u8 value[TIPC_MEDIA_INFO_SIZE];
        u8 media_id;
        u8 broadcast;
};

struct tipc_bearer;

/**
 * struct tipc_media - Media specific info exposed to generic bearer layer
 * @send_msg: routine which handles buffer transmission
 * @enable_media: routine which enables a media
 * @disable_media: routine which disables a media
 * @addr2str: convert media address format to string
 * @addr2msg: convert from media addr format to discovery msg addr format
 * @msg2addr: convert from discovery msg addr format to media addr format
 * @raw2addr: convert from raw addr format to media addr format
 * @priority: default link (and bearer) priority
 * @tolerance: default time (in ms) before declaring link failure
 * @min_win: minimum window (in packets) before declaring link congestion
 * @max_win: maximum window (in packets) before declaring link congestion
 * @mtu: max packet size bearer can support for media type not dependent on
 *       underlying device MTU
 * @type_id: TIPC media identifier
 * @hwaddr_len: TIPC media address len
 * @name: media name
 */
struct tipc_media {
        /* Media-specific operations */
        int (*send_msg)(struct net *net, struct sk_buff *buf,
                        struct tipc_bearer *b,
                        struct tipc_media_addr *dest);
        int (*enable_media)(struct net *net, struct tipc_bearer *b,
                            struct nlattr *attr[]);
        void (*disable_media)(struct tipc_bearer *b);
        /* Address format conversions */
        int (*addr2str)(struct tipc_media_addr *addr,
                        char *strbuf,
                        int bufsz);
        int (*addr2msg)(char *msg, struct tipc_media_addr *addr);
        int (*msg2addr)(struct tipc_bearer *b,
                        struct tipc_media_addr *addr,
                        char *msg);
        int (*raw2addr)(struct tipc_bearer *b,
                        struct tipc_media_addr *addr,
                        const char *raw);
        /* Per-media defaults and identification */
        u32 priority;
        u32 tolerance;
        u32 min_win;
        u32 max_win;
        u32 mtu;
        u32 type_id;
        u32 hwaddr_len;
        char name[TIPC_MAX_MEDIA_NAME];
};

/**
 * struct tipc_bearer - Generic TIPC bearer structure
 * @media_ptr: pointer to additional media-specific information about bearer
 *             (annotated __rcu)
 * @mtu: max packet size bearer can support
 * @addr: media-specific address associated with bearer
 * @name: bearer name (format = media:interface)
 * @media: ptr to media structure associated with bearer
 * @bcast_addr: media address used in broadcasting
 * @pt: packet type for bearer
 * @rcu: rcu struct for tipc_bearer
 * @priority: default link priority for bearer
 * @min_win: minimum window (in packets) before declaring link congestion
 * @max_win: maximum window (in packets) before declaring link congestion
 * @tolerance: default link tolerance for bearer
 * @domain: network domain to which links can be established
 * @identity: array index of this bearer within TIPC bearer array
 * @disc: ptr to link setup request
 * @net_plane: network plane ('A' through 'H') currently associated with bearer
 * @encap_hlen: encap headers length
 * @up: bearer up flag (bit 0)
 * @refcnt: tipc_bearer reference counter
 *
 * Note: media-specific code is responsible for initialization of the fields
 * indicated below when a bearer is enabled (@media_ptr, @mtu and @addr carry
 * the "initialized by media" marker); TIPC's generic bearer code takes
 * care of initializing all other fields.
 */
struct tipc_bearer {
        void __rcu *media_ptr;                        /* initialized by media */
        u32 mtu;                                /* initialized by media */
        struct tipc_media_addr addr;                /* initialized by media */
        char name[TIPC_MAX_BEARER_NAME];
        struct tipc_media *media;
        struct tipc_media_addr bcast_addr;
        struct packet_type pt;
        struct rcu_head rcu;
        u32 priority;
        u32 min_win;
        u32 max_win;
        u32 tolerance;
        u32 domain;
        u32 identity;
        struct tipc_discoverer *disc;
        char net_plane;
        u16 encap_hlen;
        unsigned long up;
        refcount_t refcnt;
};

/**
 * struct tipc_bearer_names - a media name paired with an interface name
 * @media_name: media name, at most TIPC_MAX_MEDIA_NAME bytes
 * @if_name: interface name, at most TIPC_MAX_IF_NAME bytes
 *
 * NOTE(review): presumably the two halves of a "media:interface" bearer name
 * after it has been split -- confirm against the users of this struct.
 */
struct tipc_bearer_names {
        char media_name[TIPC_MAX_MEDIA_NAME];
        char if_name[TIPC_MAX_IF_NAME];
};

/*
 * TIPC routines available to supported media types
 */

void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b);

/*
 * Routines made available to TIPC by supported media types
 */
extern struct tipc_media eth_media_info;

/* The InfiniBand and UDP media are only declared when configured in */
#ifdef CONFIG_TIPC_MEDIA_IB
extern struct tipc_media ib_media_info;
#endif
#ifdef CONFIG_TIPC_MEDIA_UDP
extern struct tipc_media udp_media_info;
#endif

/*
 * Netlink entry points for bearers.
 * NOTE(review): the "__"-prefixed variants presumably expect the caller to
 * already hold whatever lock the plain variants take -- confirm in bearer.c.
 */
int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info);
int __tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info);
int __tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb);
int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info);
int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info);

/* Netlink entry points for media */
int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb);
int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info);
int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info);

/*
 * Media address formatting, L2 media helpers and bearer reference handling.
 * The L2 helpers have the same signatures as the enable_media, disable_media
 * and send_msg callbacks of struct tipc_media.
 */
int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a);
int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
                         struct nlattr *attrs[]);
bool tipc_bearer_hold(struct tipc_bearer *b);
void tipc_bearer_put(struct tipc_bearer *b);
void tipc_disable_l2_media(struct tipc_bearer *b);
int tipc_l2_send_msg(struct net *net, struct sk_buff *buf,
                     struct tipc_bearer *b, struct tipc_media_addr *dest);

/* Bearer lookup, lifecycle and transmit routines, addressed by bearer_id */
void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest);
void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest);
struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name);
int tipc_bearer_get_name(struct net *net, char *name, u32 bearer_id);
struct tipc_media *tipc_media_find(const char *name);
int tipc_bearer_setup(void);
void tipc_bearer_cleanup(void);
void tipc_bearer_stop(struct net *net);
int tipc_bearer_mtu(struct net *net, u32 bearer_id);
int tipc_bearer_min_mtu(struct net *net, u32 bearer_id);
bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id);
void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id,
                          struct sk_buff *skb,
                          struct tipc_media_addr *dest);
void tipc_bearer_xmit(struct net *net, u32 bearer_id,
                      struct sk_buff_head *xmitq,
                      struct tipc_media_addr *dst,
                      struct tipc_node *__dnode);
void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id,
                         struct sk_buff_head *xmitq);
/* Loopback helpers; tipc_loopback_trace() calls tipc_clone_to_loopback() */
void tipc_clone_to_loopback(struct net *net, struct sk_buff_head *pkts);
int tipc_attach_loopback(struct net *net);
void tipc_detach_loopback(struct net *net);

/**
 * tipc_loopback_trace - hand packets to tipc_clone_to_loopback() on demand
 * @net: network namespace; its loopback_dev is what gets checked
 * @pkts: packet queue forwarded unchanged to tipc_clone_to_loopback()
 *
 * The clone is only requested when dev_nit_active() returns true for
 * net->loopback_dev; the branch hint marks that as the unexpected case.
 */
static inline void tipc_loopback_trace(struct net *net,
                                       struct sk_buff_head *pkts)
{
        /* Expected path: dev_nit_active() is false, so there is nothing to do */
        if (!unlikely(dev_nit_active(net->loopback_dev)))
                return;

        tipc_clone_to_loopback(net, pkts);
}

/**
 * tipc_mtu_bad - report whether a device MTU is below TIPC_MIN_BEARER_MTU
 * @dev: network device whose mtu field is inspected
 *
 * Return: true (after emitting a netdev_warn() on @dev) when dev->mtu is
 * smaller than TIPC_MIN_BEARER_MTU, false otherwise.
 */
static inline bool tipc_mtu_bad(struct net_device *dev)
{
        if (dev->mtu < TIPC_MIN_BEARER_MTU) {
                netdev_warn(dev, "MTU too low for tipc bearer\n");
                return true;
        }

        return false;
}

#endif        /* _TIPC_BEARER_H */
































































































































































































































































































































































































    4 































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kmem

#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KMEM_H

#include <linux/types.h>
#include <linux/tracepoint.h>
#include <trace/events/mmflags.h>

/*
 * kmem_cache_alloc - trace event for an object allocated from a kmem_cache.
 *
 * Recorded fields:
 *   call_site   - caller address, printed with %pS
 *   ptr         - the @ptr argument as given
 *   bytes_req   - taken from s->object_size
 *   bytes_alloc - taken from s->size
 *   gfp_flags   - @gfp_flags stored as unsigned long; decoded with
 *                 show_gfp_flags() when printed
 *   node        - the @node argument as given
 *   accounted   - always false unless CONFIG_MEMCG_KMEM is enabled; with it
 *                 enabled, true when @gfp_flags has __GFP_ACCOUNT set or
 *                 s->flags has SLAB_ACCOUNT set
 */
TRACE_EVENT(kmem_cache_alloc,

        TP_PROTO(unsigned long call_site,
                 const void *ptr,
                 struct kmem_cache *s,
                 gfp_t gfp_flags,
                 int node),

        TP_ARGS(call_site, ptr, s, gfp_flags, node),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
                __field(        size_t,                bytes_req        )
                __field(        size_t,                bytes_alloc        )
                __field(        unsigned long,        gfp_flags        )
                __field(        int,                node                )
                __field(        bool,                accounted        )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
                __entry->bytes_req        = s->object_size;
                __entry->bytes_alloc        = s->size;
                __entry->gfp_flags        = (__force unsigned long)gfp_flags;
                __entry->node                = node;
                __entry->accounted        = IS_ENABLED(CONFIG_MEMCG_KMEM) ?
                                          ((gfp_flags & __GFP_ACCOUNT) ||
                                          (s->flags & SLAB_ACCOUNT)) : false;
        ),

        TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
                (void *)__entry->call_site,
                __entry->ptr,
                __entry->bytes_req,
                __entry->bytes_alloc,
                show_gfp_flags(__entry->gfp_flags),
                __entry->node,
                __entry->accounted ? "true" : "false")
);

/*
 * kmalloc - trace event for an allocation described by explicit sizes.
 *
 * The caller supplies both the requested (@bytes_req) and the allocated
 * (@bytes_alloc) size; they are stored unchanged together with the call
 * site, pointer, gfp flags and node.
 *
 * No "accounted" field is stored in the record. The accounted= value in the
 * output is derived at print time from the saved gfp_flags (__GFP_ACCOUNT
 * bit) and is always "false" when CONFIG_MEMCG_KMEM is disabled.
 */
TRACE_EVENT(kmalloc,

        TP_PROTO(unsigned long call_site,
                 const void *ptr,
                 size_t bytes_req,
                 size_t bytes_alloc,
                 gfp_t gfp_flags,
                 int node),

        TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
                __field(        size_t,                bytes_req        )
                __field(        size_t,                bytes_alloc        )
                __field(        unsigned long,        gfp_flags        )
                __field(        int,                node                )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
                __entry->bytes_req        = bytes_req;
                __entry->bytes_alloc        = bytes_alloc;
                __entry->gfp_flags        = (__force unsigned long)gfp_flags;
                __entry->node                = node;
        ),

        TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
                (void *)__entry->call_site,
                __entry->ptr,
                __entry->bytes_req,
                __entry->bytes_alloc,
                show_gfp_flags(__entry->gfp_flags),
                __entry->node,
                (IS_ENABLED(CONFIG_MEMCG_KMEM) &&
                 (__entry->gfp_flags & (__force unsigned long)__GFP_ACCOUNT)) ? "true" : "false")
);

/*
 * kfree - trace event recording the call site and the pointer handed in.
 *
 * Both arguments are stored unchanged; call_site is printed symbolically
 * with %pS and ptr with %p.
 */
TRACE_EVENT(kfree,

        TP_PROTO(unsigned long call_site, const void *ptr),

        TP_ARGS(call_site, ptr),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
        ),

        TP_printk("call_site=%pS ptr=%p",
                  (void *)__entry->call_site, __entry->ptr)
);

/*
 * kmem_cache_free - trace event for an object returned to a kmem_cache.
 *
 * Besides the call site and the object pointer, the record carries a string
 * field "name" declared from s->name and filled by __assign_str(); it is
 * printed through __get_str(name).
 */
TRACE_EVENT(kmem_cache_free,

        TP_PROTO(unsigned long call_site, const void *ptr, const struct kmem_cache *s),

        TP_ARGS(call_site, ptr, s),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
                __string(        name,                s->name                )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
                __assign_str(name);
        ),

        TP_printk("call_site=%pS ptr=%p name=%s",
                  (void *)__entry->call_site, __entry->ptr, __get_str(name))
);

/*
 * mm_page_free - trace event recording the pfn and order of a freed page.
 *
 * Only the pfn (page_to_pfn(page)) is stored, not the page pointer; the
 * page= value in the output is recomputed from the pfn with pfn_to_page()
 * at print time. @page is converted unconditionally, so it is not checked
 * for NULL here.
 */
TRACE_EVENT(mm_page_free,

        TP_PROTO(struct page *page, unsigned int order),

        TP_ARGS(page, order),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
        ),

        TP_fast_assign(
                __entry->pfn                = page_to_pfn(page);
                __entry->order                = order;
        ),

        TP_printk("page=%p pfn=0x%lx order=%d",
                        pfn_to_page(__entry->pfn),
                        __entry->pfn,
                        __entry->order)
);

/*
 * mm_page_free_batched - trace event recording only the pfn of a page.
 *
 * No order is stored: the output always contains the literal "order=0".
 * As with mm_page_free, the page= value is rebuilt from the stored pfn via
 * pfn_to_page() when the event is printed.
 */
TRACE_EVENT(mm_page_free_batched,

        TP_PROTO(struct page *page),

        TP_ARGS(page),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
        ),

        TP_fast_assign(
                __entry->pfn                = page_to_pfn(page);
        ),

        TP_printk("page=%p pfn=0x%lx order=0",
                        pfn_to_page(__entry->pfn),
                        __entry->pfn)
);

/*
 * mm_page_alloc - trace event for a page allocation attempt.
 *
 * @page may be NULL: in that case the pfn field is set to the sentinel -1UL,
 * and the print side turns the sentinel into "page=NULL pfn=0x0" instead of
 * converting it with pfn_to_page(). gfp_flags is stored as unsigned long and
 * decoded with show_gfp_flags() when printed.
 */
TRACE_EVENT(mm_page_alloc,

        TP_PROTO(struct page *page, unsigned int order,
                        gfp_t gfp_flags, int migratetype),

        TP_ARGS(page, order, gfp_flags, migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        unsigned long,        gfp_flags        )
                __field(        int,                migratetype        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->gfp_flags        = (__force unsigned long)gfp_flags;
                __entry->migratetype        = migratetype;
        ),

        TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d gfp_flags=%s",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order,
                __entry->migratetype,
                show_gfp_flags(__entry->gfp_flags))
);

/*
 * mm_page - event class shared by page events that carry a pfn, an order, a
 * migratetype and a percpu_refill value.
 *
 * A NULL @page is stored as pfn == -1UL and printed as "page=NULL pfn=0x0";
 * any other pfn is converted back with pfn_to_page() at print time. The
 * remaining arguments are stored and printed unchanged.
 */
DECLARE_EVENT_CLASS(mm_page,

        TP_PROTO(struct page *page, unsigned int order, int migratetype,
                 int percpu_refill),

        TP_ARGS(page, order, migratetype, percpu_refill),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        int,                migratetype        )
                __field(        int,                percpu_refill        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->migratetype        = migratetype;
                __entry->percpu_refill        = percpu_refill;
        ),

        TP_printk("page=%p pfn=0x%lx order=%u migratetype=%d percpu_refill=%d",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order,
                __entry->migratetype,
                __entry->percpu_refill)
);

/*
 * mm_page_alloc_zone_locked - event instantiated from the mm_page class; it
 * takes the same arguments and uses the class's record layout and output.
 */
DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked,

        TP_PROTO(struct page *page, unsigned int order, int migratetype,
                 int percpu_refill),

        TP_ARGS(page, order, migratetype, percpu_refill)
);

/*
 * mm_page_pcpu_drain - trace event recording the pfn, order and migratetype
 * of a page.
 *
 * A NULL @page is stored as the sentinel pfn -1UL. The print side decodes the
 * sentinel the same way mm_page_alloc and the mm_page class do (page=NULL,
 * pfn=0x0) rather than handing -1UL to pfn_to_page() and printing the raw
 * sentinel. The format string itself is unchanged.
 */
TRACE_EVENT(mm_page_pcpu_drain,

        TP_PROTO(struct page *page, unsigned int order, int migratetype),

        TP_ARGS(page, order, migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        int,                migratetype        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->migratetype        = migratetype;
        ),

        TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order, __entry->migratetype)
);

/*
 * mm_page_alloc_extfrag - trace event describing an allocation served from a
 * fallback order/migratetype.
 *
 * Stored at trace time:
 *   pfn                  - page_to_pfn(page); @page is not NULL-checked
 *   alloc_order, fallback_order, alloc_migratetype, fallback_migratetype
 *                        - the arguments as given
 *   change_ownership     - 1 when @alloc_migratetype equals
 *                          get_pageblock_migratetype(page), else 0
 *
 * Derived at print time (not stored):
 *   pageblock_order      - current value of pageblock_order
 *   fragmenting          - 1 when fallback_order < pageblock_order
 */
TRACE_EVENT(mm_page_alloc_extfrag,

        TP_PROTO(struct page *page,
                int alloc_order, int fallback_order,
                int alloc_migratetype, int fallback_migratetype),

        TP_ARGS(page,
                alloc_order, fallback_order,
                alloc_migratetype, fallback_migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                        )
                __field(        int,                alloc_order                )
                __field(        int,                fallback_order                )
                __field(        int,                alloc_migratetype        )
                __field(        int,                fallback_migratetype        )
                __field(        int,                change_ownership        )
        ),

        TP_fast_assign(
                __entry->pfn                        = page_to_pfn(page);
                __entry->alloc_order                = alloc_order;
                __entry->fallback_order                = fallback_order;
                __entry->alloc_migratetype        = alloc_migratetype;
                __entry->fallback_migratetype        = fallback_migratetype;
                __entry->change_ownership        = (alloc_migratetype ==
                                        get_pageblock_migratetype(page));
        ),

        TP_printk("page=%p pfn=0x%lx alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d",
                pfn_to_page(__entry->pfn),
                __entry->pfn,
                __entry->alloc_order,
                __entry->fallback_order,
                pageblock_order,
                __entry->alloc_migratetype,
                __entry->fallback_migratetype,
                __entry->fallback_order < pageblock_order,
                __entry->change_ownership)
);

/*
 * mm_alloc_contig_migrate_range_info - trace event carrying a start/end pair,
 * three counters (migrated, reclaimed, mapped) and a migratetype.
 *
 * All six arguments are stored unchanged. Note that the output order differs
 * from the argument order: migratetype is printed right after start and end,
 * ahead of the three nr_* counters.
 *
 * NOTE(review): start/end are presumably pfns of the range being migrated --
 * confirm against the caller.
 */
TRACE_EVENT(mm_alloc_contig_migrate_range_info,

        TP_PROTO(unsigned long start,
                 unsigned long end,
                 unsigned long nr_migrated,
                 unsigned long nr_reclaimed,
                 unsigned long nr_mapped,
                 int migratetype),

        TP_ARGS(start, end, nr_migrated, nr_reclaimed, nr_mapped, migratetype),

        TP_STRUCT__entry(
                __field(unsigned long, start)
                __field(unsigned long, end)
                __field(unsigned long, nr_migrated)
                __field(unsigned long, nr_reclaimed)
                __field(unsigned long, nr_mapped)
                __field(int, migratetype)
        ),

        TP_fast_assign(
                __entry->start = start;
                __entry->end = end;
                __entry->nr_migrated = nr_migrated;
                __entry->nr_reclaimed = nr_reclaimed;
                __entry->nr_mapped = nr_mapped;
                __entry->migratetype = migratetype;
        ),

        TP_printk("start=0x%lx end=0x%lx migratetype=%d nr_migrated=%lu nr_reclaimed=%lu nr_mapped=%lu",
                  __entry->start,
                  __entry->end,
                  __entry->migratetype,
                  __entry->nr_migrated,
                  __entry->nr_reclaimed,
                  __entry->nr_mapped)
);

/*
 * Required for uniquely and securely identifying mm in rss_stat tracepoint.
 */
#ifndef __PTR_TO_HASHVAL
/*
 * mm_ptr_to_hash - turn a pointer into the hashed id used by rss_stat.
 * @ptr: pointer handed to ptr_to_hashval()
 *
 * Returns the value produced by ptr_to_hashval() narrowed to unsigned int,
 * or 0 when ptr_to_hashval() reports a non-zero status.
 */
static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr)
{
        unsigned long hash;

        /* Any failure status collapses to the id 0 */
        if (ptr_to_hashval(ptr, &hash))
                return 0;

        /* The hashed value is only 32-bit, so the narrowing keeps all of it */
        return (unsigned int)hash;
}
#define __PTR_TO_HASHVAL
#endif

/*
 * List of the rss counter identifiers, written in terms of EM()/EMe() so the
 * same list can be expanded twice below with different definitions of those
 * two macros. EMe() marks the last entry.
 */
#define TRACE_MM_PAGES                \
        EM(MM_FILEPAGES)        \
        EM(MM_ANONPAGES)        \
        EM(MM_SWAPENTS)                \
        EMe(MM_SHMEMPAGES)

#undef EM
#undef EMe

/* First expansion: emit one TRACE_DEFINE_ENUM() per list entry */
#define EM(a)        TRACE_DEFINE_ENUM(a);
#define EMe(a)        TRACE_DEFINE_ENUM(a);

TRACE_MM_PAGES

#undef EM
#undef EMe

/*
 * Second definition: each entry becomes a { value, "name" } pair (no trailing
 * comma after the last one), the form rss_stat passes to __print_symbolic().
 */
#define EM(a)        { a, #a },
#define EMe(a)        { a, #a }

/*
 * rss_stat - trace event reporting one rss counter of an mm.
 *
 * Recorded fields:
 *   mm_id  - mm_ptr_to_hash(mm), i.e. a hashed id rather than the raw pointer
 *   curr   - 1 when @mm is current->mm, otherwise 0
 *   member - index into mm->rss_stat[]; printed symbolically through
 *            TRACE_MM_PAGES
 *   size   - percpu_counter_sum_positive() of the selected counter shifted
 *            left by PAGE_SHIFT; printed with a "B" suffix
 */
TRACE_EVENT(rss_stat,

        TP_PROTO(struct mm_struct *mm,
                int member),

        TP_ARGS(mm, member),

        TP_STRUCT__entry(
                __field(unsigned int, mm_id)
                __field(unsigned int, curr)
                __field(int, member)
                __field(long, size)
        ),

        TP_fast_assign(
                __entry->mm_id = mm_ptr_to_hash(mm);
                __entry->curr = !!(current->mm == mm);
                __entry->member = member;
                __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member])
                                                            << PAGE_SHIFT);
        ),

        TP_printk("mm_id=%u curr=%d type=%s size=%ldB",
                __entry->mm_id,
                __entry->curr,
                __print_symbolic(__entry->member, TRACE_MM_PAGES),
                __entry->size)
        );
#endif /* _TRACE_KMEM_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 

































































































































































































































































































































































































































































































































































































































    1 




    1 




    1 




































































    8 












    1 
    1 














































    1 
    1 




    1 

























































































































































































































































































































































































































































































    1 

    1 







    4 



























































































































































































































































































































































































































































































































































































    4 



















    4 























































































    1 
























































    2 
















































































    3 


























































































































































































































































































































    1 











































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_H
#define _LINUX_FS_H

#include <linux/linkage.h>
#include <linux/wait_bit.h>
#include <linux/kdev_t.h>
#include <linux/dcache.h>
#include <linux/path.h>
#include <linux/stat.h>
#include <linux/cache.h>
#include <linux/list.h>
#include <linux/list_lru.h>
#include <linux/llist.h>
#include <linux/radix-tree.h>
#include <linux/xarray.h>
#include <linux/rbtree.h>
#include <linux/init.h>
#include <linux/pid.h>
#include <linux/bug.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/mm_types.h>
#include <linux/capability.h>
#include <linux/semaphore.h>
#include <linux/fcntl.h>
#include <linux/rculist_bl.h>
#include <linux/atomic.h>
#include <linux/shrinker.h>
#include <linux/migrate_mode.h>
#include <linux/uidgid.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/workqueue.h>
#include <linux/delayed_call.h>
#include <linux/uuid.h>
#include <linux/errseq.h>
#include <linux/ioprio.h>
#include <linux/fs_types.h>
#include <linux/build_bug.h>
#include <linux/stddef.h>
#include <linux/mount.h>
#include <linux/cred.h>
#include <linux/mnt_idmapping.h>
#include <linux/slab.h>
#include <linux/maple_tree.h>
#include <linux/rw_hint.h>

#include <asm/byteorder.h>
#include <uapi/linux/fs.h>

struct backing_dev_info;
struct bdi_writeback;
struct bio;
struct io_comp_batch;
struct export_operations;
struct fiemap_extent_info;
struct hd_geometry;
struct iovec;
struct kiocb;
struct kobject;
struct pipe_inode_info;
struct poll_table_struct;
struct kstatfs;
struct vm_area_struct;
struct vfsmount;
struct cred;
struct swap_info_struct;
struct seq_file;
struct workqueue_struct;
struct iov_iter;
struct fscrypt_inode_info;
struct fscrypt_operations;
struct fsverity_info;
struct fsverity_operations;
struct fsnotify_mark_connector;
struct fsnotify_sb_info;
struct fs_context;
struct fs_parameter_spec;
struct fileattr;
struct iomap_ops;

extern void __init inode_init(void);
extern void __init inode_init_early(void);
extern void __init files_init(void);
extern void __init files_maxfiles_init(void);

extern unsigned long get_max_files(void);
extern unsigned int sysctl_nr_open;

/* Kernel-internal name for the __kernel_rwf_t type. */
typedef __kernel_rwf_t rwf_t;

struct buffer_head;
/*
 * Function type for a block callback taking an inode, a block index
 * (iblock), a buffer_head (bh_result) and a create flag; returns int.
 * NOTE(review): the meaning of @create and of the return value is set by
 * the implementations and callers, which are not visible here.
 */
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create);
/*
 * Function type for a callback taking a kiocb, a file offset, a byte count
 * and an opaque private pointer; returns int.
 * NOTE(review): presumably invoked on direct-I/O completion - confirm.
 */
typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
                        ssize_t bytes, void *private);

/* MAY_* flag bits: one distinct bit per flag, occupying bits 0 to 7. */
#define MAY_EXEC                (1 << 0)
#define MAY_WRITE                (1 << 1)
#define MAY_READ                (1 << 2)
#define MAY_APPEND                (1 << 3)
#define MAY_ACCESS                (1 << 4)
#define MAY_OPEN                (1 << 5)
#define MAY_CHDIR                (1 << 6)
/* the caller is in RCU mode and must not block */
#define MAY_NOT_BLOCK                (1 << 7)

/*
 * flags in file.f_mode.  Note that FMODE_READ and FMODE_WRITE must correspond
 * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open()
 *
 * Every flag is a single bit cast to fmode_t; the bit number is the shift
 * count.  Bits 7, 8 and 24 have no definition in this list.
 */

/* file is open for reading */
#define FMODE_READ                ((__force fmode_t)(1 << 0))
/* file is open for writing */
#define FMODE_WRITE                ((__force fmode_t)(1 << 1))
/* file is seekable */
#define FMODE_LSEEK                ((__force fmode_t)(1 << 2))
/* file can be accessed using pread */
#define FMODE_PREAD                ((__force fmode_t)(1 << 3))
/* file can be accessed using pwrite */
#define FMODE_PWRITE                ((__force fmode_t)(1 << 4))
/* File is opened for execution with sys_execve / sys_uselib */
#define FMODE_EXEC                ((__force fmode_t)(1 << 5))
/* File writes are restricted (block device specific) */
#define FMODE_WRITE_RESTRICTED        ((__force fmode_t)(1 << 6))

/* FMODE_* bits 7 to 8 */

/* 32bit hashes as llseek() offset (for directories) */
#define FMODE_32BITHASH         ((__force fmode_t)(1 << 9))
/* 64bit hashes as llseek() offset (for directories) */
#define FMODE_64BITHASH         ((__force fmode_t)(1 << 10))

/*
 * Don't update ctime and mtime.
 *
 * Currently a special hack for the XFS open_by_handle ioctl, but we'll
 * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
 */
#define FMODE_NOCMTIME                ((__force fmode_t)(1 << 11))

/* Expect random access pattern */
#define FMODE_RANDOM                ((__force fmode_t)(1 << 12))

/* File is huge (eg. /dev/mem): treat loff_t as unsigned */
#define FMODE_UNSIGNED_OFFSET        ((__force fmode_t)(1 << 13))

/* File is opened with O_PATH; almost nothing can be done with it */
#define FMODE_PATH                ((__force fmode_t)(1 << 14))

/* File needs atomic accesses to f_pos */
#define FMODE_ATOMIC_POS        ((__force fmode_t)(1 << 15))
/* Write access to underlying fs */
#define FMODE_WRITER                ((__force fmode_t)(1 << 16))
/* Has read method(s) */
#define FMODE_CAN_READ          ((__force fmode_t)(1 << 17))
/* Has write method(s) */
#define FMODE_CAN_WRITE         ((__force fmode_t)(1 << 18))

/*
 * NOTE(review): FMODE_OPENED and FMODE_CREATED carry no description in this
 * header; the names suggest "file has been opened" / "file was newly
 * created" status bits - confirm at the use sites.
 */
#define FMODE_OPENED                ((__force fmode_t)(1 << 19))
#define FMODE_CREATED                ((__force fmode_t)(1 << 20))

/* File is stream-like */
#define FMODE_STREAM                ((__force fmode_t)(1 << 21))

/* File supports DIRECT IO */
#define        FMODE_CAN_ODIRECT        ((__force fmode_t)(1 << 22))

/* NOTE(review): undocumented here; meaning must be taken from the users. */
#define        FMODE_NOREUSE                ((__force fmode_t)(1 << 23))

/* FMODE_* bit 24 */

/* File is embedded in backing_file object */
#define FMODE_BACKING                ((__force fmode_t)(1 << 25))

/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY                ((__force fmode_t)(1 << 26))

/* File is capable of returning -EAGAIN if I/O will block */
#define FMODE_NOWAIT                ((__force fmode_t)(1 << 27))

/* File represents mount that needs unmounting */
#define FMODE_NEED_UNMOUNT        ((__force fmode_t)(1 << 28))

/* File does not contribute to nr_files count */
#define FMODE_NOACCOUNT                ((__force fmode_t)(1 << 29))

/*
 * Attribute flags.  These should be or-ed together to figure out what
 * has been changed!
 *
 * Each flag is a single bit; the bit number is the shift count.  Bit 10 has
 * no definition in this list.
 */
#define ATTR_MODE        (1 << 0)
#define ATTR_UID        (1 << 1)
#define ATTR_GID        (1 << 2)
#define ATTR_SIZE        (1 << 3)
#define ATTR_ATIME        (1 << 4)
#define ATTR_MTIME        (1 << 5)
#define ATTR_CTIME        (1 << 6)
#define ATTR_ATIME_SET        (1 << 7)
#define ATTR_MTIME_SET        (1 << 8)
#define ATTR_FORCE        (1 << 9) /* Not a change, but a change it */
#define ATTR_KILL_SUID        (1 << 11)
#define ATTR_KILL_SGID        (1 << 12)
#define ATTR_FILE        (1 << 13)
#define ATTR_KILL_PRIV        (1 << 14)
#define ATTR_OPEN        (1 << 15) /* Truncating from open(O_TRUNC) */
#define ATTR_TIMES_SET        (1 << 16)
#define ATTR_TOUCH        (1 << 17)

/*
 * Whiteout is represented by a char device.  The following constants define the
 * mode and device number to use.  Both constants are zero.
 */
#define WHITEOUT_MODE 0
#define WHITEOUT_DEV 0

/*
 * This is the Inode Attributes structure, used for notify_change().  It
 * uses the above definitions as flags, to know which values have changed.
 * Also, in this manner, a Filesystem can look at only the values it cares
 * about.  Basically, these are the attributes that the VFS layer can
 * request to change from the FS layer.
 *
 * Derek Atkins <warlord@MIT.EDU> 94-10-20
 */
struct iattr {
        unsigned int        ia_valid;        /* ATTR_* flags: which values have changed */
        umode_t                ia_mode;
        /*
         * The two anonymous unions wrap structures with the same member.
         *
         * Filesystems raising FS_ALLOW_IDMAP need to use ia_vfs{g,u}id which
         * are a dedicated type requiring the filesystem to use the dedicated
         * helpers. Other filesystems can continue to use ia_{g,u}id until they
         * have been ported.
         *
         * They always contain the same value. In other words FS_ALLOW_IDMAP
         * pass down the same value on idmapped mounts as they would on regular
         * mounts.
         */
        union {
                kuid_t                ia_uid;
                vfsuid_t        ia_vfsuid;
        };
        union {
                kgid_t                ia_gid;
                vfsgid_t        ia_vfsgid;
        };
        loff_t                ia_size;
        struct timespec64 ia_atime;
        struct timespec64 ia_mtime;
        struct timespec64 ia_ctime;

        /*
         * Not an attribute, but an auxiliary info for filesystems wanting to
         * implement an ftruncate() like method.  NOTE: filesystem should
         * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL).
         */
        struct file        *ia_file;
};

/*
 * Includes for diskquotas.
 */
#include <linux/quota.h>

/*
 * Maximum number of layers of fs stack.  Needs to be limited to
 * prevent kernel stack overflow.
 */
#define FILESYSTEM_MAX_STACK_DEPTH 2

/**
 * enum positive_aop_returns - aop return codes with specific semantics
 *
 * @AOP_WRITEPAGE_ACTIVATE: Returned by writepage().  Tells the caller that
 *        page writeback has completed, that the page is still locked, and
 *        that it should be considered active.  The VM takes this as a hint to
 *        put the page back on the active list, so it will not be a writeback
 *        candidate again in the near future.  Any other caller receiving this
 *        value must take care to unlock the page.
 *
 * @AOP_TRUNCATED_PAGE: Returned by read_folio().  The AOP method was handed
 *        a locked page, has unlocked it, and the page might have been
 *        truncated.  The caller should go back to acquiring a new page and
 *        try again, dropping any page reference it held before the retry.
 *        The aop takes reasonable precautions not to livelock.
 *
 * These constants are what address_space_operation functions return to
 * signal special semantics to the caller.  They are deliberately much larger
 * than the number of bytes in a page, so that functions returning a count of
 * bytes operated on within a page cannot collide with them.
 */

enum positive_aop_returns {
        AOP_WRITEPAGE_ACTIVATE        = 0x80000,
        AOP_TRUNCATED_PAGE        = AOP_WRITEPAGE_ACTIVATE + 1,
};

/*
 * oh the beauties of C type declarations.
 */
struct page;
struct address_space;
struct writeback_control;
struct readahead_control;

/*
 * Match RWF_* bits to IOCB bits: each of these IOCB_* flags is the
 * corresponding RWF_* flag cast to int.  Every expansion is fully
 * parenthesized so that the macro acts as a single operand wherever it is
 * used (an unparenthesized cast expression does not, e.g. under sizeof).
 */
#define IOCB_HIPRI                ((__force int) RWF_HIPRI)
#define IOCB_DSYNC                ((__force int) RWF_DSYNC)
#define IOCB_SYNC                ((__force int) RWF_SYNC)
#define IOCB_NOWAIT                ((__force int) RWF_NOWAIT)
#define IOCB_APPEND                ((__force int) RWF_APPEND)

/* non-RWF related bits - start at 16 */
#define IOCB_EVENTFD                (1 << 16)
#define IOCB_DIRECT                (1 << 17)
#define IOCB_WRITE                (1 << 18)
/* iocb->ki_waitq is valid */
#define IOCB_WAITQ                (1 << 19)
#define IOCB_NOIO                (1 << 20)
/* can use bio alloc cache */
#define IOCB_ALLOC_CACHE        (1 << 21)
/*
 * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
 * iocb completion can be passed back to the owner for execution from a safe
 * context rather than needing to be punted through a workqueue. If this
 * flag is set, the bio completion handling may set iocb->dio_complete to a
 * handler function and iocb->private to context information for that handler.
 * The issuer should call the handler with that context information from task
 * context to complete the processing of the iocb. Note that while this
 * provides a task context for the dio_complete() callback, it should only be
 * used on the completion side for non-IO generating completions. It's fine to
 * call blocking functions from this callback, but they should not wait for
 * unrelated IO (like cache flushing, new IO generation, etc).
 */
#define IOCB_DIO_CALLER_COMP        (1 << 22)
/* kiocb is a read or write operation submitted by fs/aio.c. */
#define IOCB_AIO_RW                (1 << 23)

/*
 * for use in trace events
 *
 * Comma-separated list of { flag, "name" } initializers, one per IOCB_*
 * flag, with no trailing comma.  Every IOCB_* flag defined in this header
 * has an entry, including IOCB_AIO_RW, so that no flag is left without a
 * symbolic name in the table.
 */
#define TRACE_IOCB_STRINGS \
        { IOCB_HIPRI,                "HIPRI" }, \
        { IOCB_DSYNC,                "DSYNC" }, \
        { IOCB_SYNC,                "SYNC" }, \
        { IOCB_NOWAIT,                "NOWAIT" }, \
        { IOCB_APPEND,                "APPEND" }, \
        { IOCB_EVENTFD,                "EVENTFD"}, \
        { IOCB_DIRECT,                "DIRECT" }, \
        { IOCB_WRITE,                "WRITE" }, \
        { IOCB_WAITQ,                "WAITQ" }, \
        { IOCB_NOIO,                "NOIO" }, \
        { IOCB_ALLOC_CACHE,        "ALLOC_CACHE" }, \
        { IOCB_DIO_CALLER_COMP,        "CALLER_COMP" }, \
        { IOCB_AIO_RW,                "AIO_RW" }

/*
 * struct kiocb - state of one I/O request: the file, the position, an
 * optional completion callback, and IOCB_* flag bits.
 */
struct kiocb {
        struct file                *ki_filp;
        loff_t                        ki_pos;
        /* NULL for a synchronous kiocb, see is_sync_kiocb() */
        void (*ki_complete)(struct kiocb *iocb, long ret);
        /* holds the dio_complete() context when that handler is assigned */
        void                        *private;
        int                        ki_flags;        /* presumably the IOCB_* bits - confirm */
        u16                        ki_ioprio; /* See linux/ioprio.h */
        union {
                /*
                 * Only used for async buffered reads, where it denotes the
                 * page waitqueue associated with completing the read. Valid
                 * IFF IOCB_WAITQ is set.
                 */
                struct wait_page_queue        *ki_waitq;
                /*
                 * Can be used for O_DIRECT IO, where the completion handling
                 * is punted back to the issuer of the IO. May only be set
                 * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer
                 * must then check for presence of this handler when ki_complete
                 * is invoked. The data passed in to this handler must be
                 * assigned to ->private when dio_complete is assigned.
                 */
                ssize_t (*dio_complete)(void *data);
        };
};

/* A kiocb counts as synchronous when it has no ki_complete callback set. */
static inline bool is_sync_kiocb(struct kiocb *kiocb)
{
        return !kiocb->ki_complete;
}

/*
 * Table of method pointers for an address_space; struct address_space
 * refers to one of these through its a_ops member.
 */
struct address_space_operations {
        /* may return AOP_WRITEPAGE_ACTIVATE (see enum positive_aop_returns) */
        int (*writepage)(struct page *page, struct writeback_control *wbc);
        /* may return AOP_TRUNCATED_PAGE (see enum positive_aop_returns) */
        int (*read_folio)(struct file *, struct folio *);

        /* Write back some dirty pages from this mapping. */
        int (*writepages)(struct address_space *, struct writeback_control *);

        /* Mark a folio dirty.  Return true if this dirtied it */
        bool (*dirty_folio)(struct address_space *, struct folio *);

        void (*readahead)(struct readahead_control *);

        /*
         * write_begin hands back a page and an opaque fsdata pointer through
         * its out-parameters; write_end takes a page and fsdata again.
         * NOTE(review): presumably the same values - confirm with callers.
         */
        int (*write_begin)(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len,
                                struct page **pagep, void **fsdata);
        int (*write_end)(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned copied,
                                struct page *page, void *fsdata);

        /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
        sector_t (*bmap)(struct address_space *, sector_t);
        void (*invalidate_folio) (struct folio *, size_t offset, size_t len);
        bool (*release_folio)(struct folio *, gfp_t);
        void (*free_folio)(struct folio *folio);
        ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
        /*
         * migrate the contents of a folio to the specified target. If
         * migrate_mode is MIGRATE_ASYNC, it must not block.
         */
        int (*migrate_folio)(struct address_space *, struct folio *dst,
                        struct folio *src, enum migrate_mode);
        int (*launder_folio)(struct folio *);
        bool (*is_partially_uptodate) (struct folio *, size_t from,
                        size_t count);
        /* reports its result through the @dirty and @wb out-parameters */
        void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb);
        int (*error_remove_folio)(struct address_space *, struct folio *);

        /* swapfile support */
        int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
                                sector_t *span);
        void (*swap_deactivate)(struct file *file);
        int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);
};

extern const struct address_space_operations empty_aops;

/**
 * struct address_space - Contents of a cacheable, mappable object.
 * @host: Owner, either the inode or the block_device.
 * @i_pages: Cached pages.
 * @invalidate_lock: Guards coherency between page cache contents and
 *   file offset->disk block mappings in the filesystem during invalidates.
 *   It is also used to block modification of page cache contents through
 *   memory mappings.
 * @gfp_mask: Memory allocation flags to use for allocating pages.
 * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.
 * @nr_thps: Number of THPs in the pagecache (non-shmem only).  Only present
 *   when CONFIG_READ_ONLY_THP_FOR_FS is enabled.
 * @i_mmap: Tree of private and shared mappings.
 * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
 * @nrpages: Number of page entries, protected by the i_pages lock.
 * @writeback_index: Writeback starts here.
 * @a_ops: Methods.
 * @flags: Error bits and flags (AS_*).
 * @wb_err: The most recent error which has occurred.
 * @i_private_lock: For use by the owner of the address_space.
 * @i_private_list: For use by the owner of the address_space.
 * @i_private_data: For use by the owner of the address_space.
 */
struct address_space {
        struct inode                *host;
        struct xarray                i_pages;
        struct rw_semaphore        invalidate_lock;
        gfp_t                        gfp_mask;
        atomic_t                i_mmap_writable;
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        /* number of thp, only for non-shmem files */
        atomic_t                nr_thps;
#endif
        struct rb_root_cached        i_mmap;
        unsigned long                nrpages;
        pgoff_t                        writeback_index;
        const struct address_space_operations *a_ops;
        unsigned long                flags;
        errseq_t                wb_err;
        spinlock_t                i_private_lock;
        struct list_head        i_private_list;
        struct rw_semaphore        i_mmap_rwsem;
        void *                        i_private_data;
} __attribute__((aligned(sizeof(long)))) __randomize_layout;
        /*
         * About the aligned(sizeof(long)) attribute above:
         * On most architectures that alignment is already the case; but
         * must be enforced here for CRIS, to let the least significant bit
         * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
         */

/*
 * XArray tags, for tagging dirty and writeback pages in the pagecache.
 * Each tag is an alias for one of the XArray marks XA_MARK_0..XA_MARK_2.
 */
#define PAGECACHE_TAG_DIRTY        XA_MARK_0
#define PAGECACHE_TAG_WRITEBACK        XA_MARK_1
#define PAGECACHE_TAG_TOWRITE        XA_MARK_2

/*
 * Returns true if any of the pages in the mapping are marked with the tag.
 *
 * Thin wrapper: forwards to xa_marked() on the mapping's i_pages xarray.
 */
static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag)
{
        return xa_marked(&mapping->i_pages, tag);
}

/* Take the mapping's i_mmap_rwsem for writing (down_write()). */
static inline void i_mmap_lock_write(struct address_space *mapping)
{
        down_write(&mapping->i_mmap_rwsem);
}

/*
 * Try to take the mapping's i_mmap_rwsem for writing.  Returns whatever
 * down_write_trylock() returns (presumably nonzero on success - confirm
 * against the rwsem API).
 */
static inline int i_mmap_trylock_write(struct address_space *mapping)
{
        return down_write_trylock(&mapping->i_mmap_rwsem);
}

/* Release the write hold on the mapping's i_mmap_rwsem (up_write()). */
static inline void i_mmap_unlock_write(struct address_space *mapping)
{
        up_write(&mapping->i_mmap_rwsem);
}

/*
 * Try to take the mapping's i_mmap_rwsem for reading.  Returns whatever
 * down_read_trylock() returns (presumably nonzero on success - confirm
 * against the rwsem API).
 */
static inline int i_mmap_trylock_read(struct address_space *mapping)
{
        return down_read_trylock(&mapping->i_mmap_rwsem);
}

/* Take the mapping's i_mmap_rwsem for reading (down_read()). */
static inline void i_mmap_lock_read(struct address_space *mapping)
{
        down_read(&mapping->i_mmap_rwsem);
}

/* Release the read hold on the mapping's i_mmap_rwsem (up_read()). */
static inline void i_mmap_unlock_read(struct address_space *mapping)
{
        up_read(&mapping->i_mmap_rwsem);
}

/*
 * Lockdep assertion that @mapping's i_mmap_rwsem is held (the check is
 * delegated entirely to lockdep_assert_held()).
 */
static inline void i_mmap_assert_locked(struct address_space *mapping)
{
        lockdep_assert_held(&mapping->i_mmap_rwsem);
}

/*
 * Lockdep assertion that @mapping's i_mmap_rwsem is held for writing (the
 * check is delegated entirely to lockdep_assert_held_write()).
 */
static inline void i_mmap_assert_write_locked(struct address_space *mapping)
{
        lockdep_assert_held_write(&mapping->i_mmap_rwsem);
}

/*
 * Might pages of this file be mapped into userspace?
 *
 * Returns 1 when the i_mmap tree of @mapping has at least one node,
 * 0 when it is empty.
 */
static inline int mapping_mapped(struct address_space *mapping)
{
        if (RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
                return 0;

        return 1;
}

/*
 * Might pages of this file have been modified in userspace?
 *
 * i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap marks a
 * vma as VM_SHARED if it is shared and the file was opened for writing,
 * i.e. the vma may be mprotected writable even if it is read-only now.
 *
 * A negative i_mmap_writable means no new writable mappings are allowed.
 * Writable mappings can only be denied while none exists.
 *
 * Returns 1 if the counter is strictly positive, 0 otherwise.
 */
static inline int mapping_writably_mapped(struct address_space *mapping)
{
        int writers = atomic_read(&mapping->i_mmap_writable);

        return writers > 0;
}

/*
 * Account one more writable mapping of @mapping.
 *
 * i_mmap_writable is incremented unless it is currently negative.
 * Returns 0 if the increment happened, -EPERM if it was refused.
 */
static inline int mapping_map_writable(struct address_space *mapping)
{
        if (atomic_inc_unless_negative(&mapping->i_mmap_writable))
                return 0;

        return -EPERM;
}

static inline void mapping_unmap_writable(struct address_space *mapping)
{
        atomic_dec(&mapping->i_mmap_writable);
}

/*
 * Forbid new writable mappings of @mapping.
 *
 * i_mmap_writable is decremented unless it is currently positive.
 * Returns 0 if the decrement happened, -EBUSY if it was refused.
 */
static inline int mapping_deny_writable(struct address_space *mapping)
{
        if (atomic_dec_unless_positive(&mapping->i_mmap_writable))
                return 0;

        return -EBUSY;
}

static inline void mapping_allow_writable(struct address_space *mapping)
{
        atomic_inc(&mapping->i_mmap_writable);
}

/*
 * Use sequence counter to get consistent i_size on 32-bit processors.
 *
 * On 32-bit SMP builds this defines __NEED_I_SIZE_ORDERED (which makes
 * struct inode carry i_size_seqcount) and i_size_ordered_init() initialises
 * that counter; in every other configuration i_size_ordered_init() expands
 * to an empty statement.
 *
 * NOTE(review): the macro argument is used unparenthesised, so callers
 * should pass a plain pointer identifier rather than a compound expression.
 */
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
#include <linux/seqlock.h>
#define __NEED_I_SIZE_ORDERED
#define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount)
#else
#define i_size_ordered_init(inode) do { } while (0)
#endif

struct posix_acl;
/*
 * Sentinel pointer value, not a real posix_acl address.  Its low bit is set,
 * so is_uncached_acl() reports true for it.
 */
#define ACL_NOT_CACHED ((void *)(-1))
/*
 * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to
 * cache the ACL.  This also means that ->get_inode_acl() can be called in RCU
 * mode with the LOOKUP_RCU flag.
 *
 * Like ACL_NOT_CACHED this is a sentinel with the low bit set (-3 is odd).
 */
#define ACL_DONT_CACHE ((void *)(-3))

/*
 * Build a per-task marker pointer: the address of @task advanced by one
 * byte.  The result is only a tag and must not be dereferenced.
 * NOTE(review): the low bit of the result is set only if @task itself is
 * at least 2-byte aligned -- presumably always true for a task_struct.
 */
static inline struct posix_acl *
uncached_acl_sentinel(struct task_struct *task)
{
        char *base = (char *)task;

        return (struct posix_acl *)(base + 1);
}

/*
 * True when @acl has its least significant bit set, i.e. it is one of the
 * odd-valued marker pointers rather than a pointer to be used as an ACL.
 */
static inline bool
is_uncached_acl(struct posix_acl *acl)
{
        long bits = (long)acl;

        return (bits & 1) != 0;
}

/*
 * IOP_* bit flags; each one is a distinct single bit (0x0001 .. 0x0010).
 * NOTE(review): presumably stored in inode->i_opflags -- confirm at the
 * users.
 */
#define IOP_FASTPERM        0x0001
#define IOP_LOOKUP        0x0002
#define IOP_NOFOLLOW        0x0004
#define IOP_XATTR        0x0008
#define IOP_DEFAULT_READLINK        0x0010

/*
 * Keep mostly read-only and often accessed (especially for
 * the RCU path lookup and 'stat' data) fields at the beginning
 * of the 'struct inode'
 */
struct inode {
        umode_t                        i_mode;
        unsigned short                i_opflags;
        /* raw numeric ids: use i_uid_read()/i_uid_write() and the gid twins */
        kuid_t                        i_uid;
        kgid_t                        i_gid;
        unsigned int                i_flags;

#ifdef CONFIG_FS_POSIX_ACL
        struct posix_acl        *i_acl;
        struct posix_acl        *i_default_acl;
#endif

        const struct inode_operations        *i_op;
        struct super_block        *i_sb;
        struct address_space        *i_mapping;

#ifdef CONFIG_SECURITY
        void                        *i_security;
#endif

        /* Stat data, not accessed from path walking */
        unsigned long                i_ino;
        /*
         * Filesystems may only read i_nlink directly.  They shall use the
         * following functions for modification:
         *
         *    (set|clear|inc|drop)_nlink
         *    inode_(inc|dec)_link_count
         */
        union {
                const unsigned int i_nlink;
                unsigned int __i_nlink;
        };
        dev_t                        i_rdev;        /* split by iminor()/imajor() */
        loff_t                        i_size;        /* use i_size_read()/i_size_write() */
        struct timespec64        __i_atime;
        struct timespec64        __i_mtime;
        struct timespec64        __i_ctime; /* use inode_*_ctime accessors! */
        spinlock_t                i_lock;        /* i_blocks, i_bytes, maybe i_size */
        unsigned short          i_bytes;
        u8                        i_blkbits;        /* log2 of block size, see i_blocksize() */
        enum rw_hint                i_write_hint;
        blkcnt_t                i_blocks;

#ifdef __NEED_I_SIZE_ORDERED
        /* only present on 32-bit SMP builds; guards i_size reads/writes */
        seqcount_t                i_size_seqcount;
#endif

        /* Misc */
        unsigned long                i_state;
        struct rw_semaphore        i_rwsem;        /* taken by the inode_lock*() helpers */

        unsigned long                dirtied_when;        /* jiffies of first dirtying */
        unsigned long                dirtied_time_when;

        struct hlist_node        i_hash;                /* see inode_unhashed()/inode_fake_hash() */
        struct list_head        i_io_list;        /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
        struct bdi_writeback        *i_wb;                /* the associated cgroup wb */

        /* foreign inode detection, see wbc_detach_inode() */
        int                        i_wb_frn_winner;
        u16                        i_wb_frn_avg_time;
        u16                        i_wb_frn_history;
#endif
        struct list_head        i_lru;                /* inode LRU list */
        struct list_head        i_sb_list;
        struct list_head        i_wb_list;        /* backing dev writeback list */
        union {
                struct hlist_head        i_dentry;
                struct rcu_head                i_rcu;
        };
        atomic64_t                i_version;
        atomic64_t                i_sequence; /* see futex */
        atomic_t                i_count;
        atomic_t                i_dio_count;
        atomic_t                i_writecount;
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
        atomic_t                i_readcount; /* struct files open RO */
#endif
        union {
                const struct file_operations        *i_fop;        /* former ->i_op->default_file_ops */
                void (*free_inode)(struct inode *);
        };
        struct file_lock_context        *i_flctx;
        struct address_space        i_data;
        struct list_head        i_devices;
        union {
                struct pipe_inode_info        *i_pipe;
                struct cdev                *i_cdev;
                char                        *i_link;
                unsigned                i_dir_seq;
        };

        __u32                        i_generation;

#ifdef CONFIG_FSNOTIFY
        __u32                        i_fsnotify_mask; /* all events this inode cares about */
        struct fsnotify_mark_connector __rcu        *i_fsnotify_marks;
#endif

#ifdef CONFIG_FS_ENCRYPTION
        struct fscrypt_inode_info        *i_crypt_info;
#endif

#ifdef CONFIG_FS_VERITY
        struct fsverity_info        *i_verity_info;
#endif

        void                        *i_private; /* fs or device private pointer */
} __randomize_layout;

struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode);

/*
 * Block size of @node in bytes, i.e. 2 to the power of i_blkbits.
 *
 * The shift is performed on an unsigned constant so that it matches the
 * unsigned return type: with a plain int 1, an i_blkbits of 31 would shift
 * into the sign bit, which is undefined behaviour in C.
 */
static inline unsigned int i_blocksize(const struct inode *node)
{
        return 1U << node->i_blkbits;
}

/* Result of hlist_unhashed() for @inode's i_hash node. */
static inline int inode_unhashed(struct inode *inode)
{
        const struct hlist_node *hash_node = &inode->i_hash;

        return hlist_unhashed(hash_node);
}

/*
 * __mark_inode_dirty expects inodes to be hashed.  Special inodes are not
 * wanted in the fileset inode space, so they are made to look hashed
 * without being put on any list.  hlist_del() will work fine on such an
 * inode and requires no locking.
 */
static inline void inode_fake_hash(struct inode *inode)
{
        struct hlist_node *hash_node = &inode->i_hash;

        hlist_add_fake(hash_node);
}

/*
 * inode->i_rwsem nesting subclasses (still named I_MUTEX_*) for the lock
 * validator; these are the values handed to inode_lock_nested() and
 * inode_lock_shared_nested():
 *
 * 0: the object of the current VFS operation
 * 1: parent
 * 2: child/target
 * 3: xattr
 * 4: second non-directory
 * 5: second parent (when locking independent directories in rename)
 *
 * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two
 * non-directories at once.
 *
 * The locking order between these classes is
 * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory
 */
enum inode_i_mutex_lock_class
{
        I_MUTEX_NORMAL,                /* 0: object of the current VFS operation */
        I_MUTEX_PARENT,                /* 1: parent */
        I_MUTEX_CHILD,                /* 2: child/target */
        I_MUTEX_XATTR,                /* 3: xattr */
        I_MUTEX_NONDIR2,        /* 4: second non-directory */
        I_MUTEX_PARENT2,        /* 5: second parent */
};

/* Take @inode's i_rwsem for writing. */
static inline void inode_lock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_write(sem);
}

/* Release a write hold on @inode's i_rwsem. */
static inline void inode_unlock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        up_write(sem);
}

/* Take @inode's i_rwsem for reading (shared). */
static inline void inode_lock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_read(sem);
}

/* Release a read (shared) hold on @inode's i_rwsem. */
static inline void inode_unlock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        up_read(sem);
}

/*
 * Try to take @inode's i_rwsem for writing.
 * Returns whatever down_write_trylock() reports.
 */
static inline int inode_trylock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;
        int locked = down_write_trylock(sem);

        return locked;
}

/*
 * Try to take @inode's i_rwsem for reading.
 * Returns whatever down_read_trylock() reports.
 */
static inline int inode_trylock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;
        int locked = down_read_trylock(sem);

        return locked;
}

/* Result of rwsem_is_locked() for @inode's i_rwsem. */
static inline int inode_is_locked(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return rwsem_is_locked(sem);
}

/*
 * Take @inode's i_rwsem for writing, passing @subclass on to
 * down_write_nested() as the lock-validator nesting subclass.
 */
static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_write_nested(sem, subclass);
}

/*
 * Take @inode's i_rwsem for reading, passing @subclass on to
 * down_read_nested() as the lock-validator nesting subclass.
 */
static inline void inode_lock_shared_nested(struct inode *inode, unsigned subclass)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_read_nested(sem, subclass);
}

/* Take @mapping's invalidate_lock for writing. */
static inline void filemap_invalidate_lock(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        down_write(sem);
}

/* Release a write hold on @mapping's invalidate_lock. */
static inline void filemap_invalidate_unlock(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        up_write(sem);
}

/* Take @mapping's invalidate_lock for reading (shared). */
static inline void filemap_invalidate_lock_shared(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        down_read(sem);
}

/*
 * Try to take @mapping's invalidate_lock for reading.
 * Returns whatever down_read_trylock() reports.
 */
static inline int
filemap_invalidate_trylock_shared(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;
        int locked = down_read_trylock(sem);

        return locked;
}

/* Release a read (shared) hold on @mapping's invalidate_lock. */
static inline void
filemap_invalidate_unlock_shared(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        up_read(sem);
}

void lock_two_nondirectories(struct inode *, struct inode*);
void unlock_two_nondirectories(struct inode *, struct inode*);

void filemap_invalidate_lock_two(struct address_space *mapping1,
                                 struct address_space *mapping2);
void filemap_invalidate_unlock_two(struct address_space *mapping1,
                                   struct address_space *mapping2);


/*
 * Read @inode's i_size in a way that cannot observe a half-updated value.
 *
 * NOTE: in a 32bit arch with a preemptable kernel and
 * an UP compile the i_size_read/write must be atomic
 * with respect to the local cpu (unlike with preempt disabled),
 * but they don't need to be atomic with respect to other cpus like in
 * true SMP (so they need to either locally disable irq around
 * the read or for example on x86 they can be still implemented as a
 * cmpxchg8b without the need of the lock prefix). For SMP compiles
 * and 64bit archs it makes no difference if preempt is enabled or not.
 */
static inline loff_t i_size_read(const struct inode *inode)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
        loff_t i_size;
        unsigned int seq;

        /* 32-bit SMP: re-read until i_size_seqcount did not change meanwhile */
        do {
                seq = read_seqcount_begin(&inode->i_size_seqcount);
                i_size = inode->i_size;
        } while (read_seqcount_retry(&inode->i_size_seqcount, seq));
        return i_size;
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
        loff_t i_size;

        /* 32-bit preemptible UP: read with preemption disabled */
        preempt_disable();
        i_size = inode->i_size;
        preempt_enable();
        return i_size;
#else
        /* Pairs with smp_store_release() in i_size_write() */
        return smp_load_acquire(&inode->i_size);
#endif
}

/*
 * Store @i_size into @inode->i_size; counterpart of i_size_read().
 *
 * NOTE: unlike i_size_read(), i_size_write() does need locking around it
 * (normally the inode's i_rwsem), otherwise on 32bit/SMP an update of
 * i_size_seqcount can be lost, resulting in subsequent i_size_read() calls
 * spinning forever.
 */
static inline void i_size_write(struct inode *inode, loff_t i_size)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
        /* 32-bit SMP: bracket the store with the seqcount write section */
        preempt_disable();
        write_seqcount_begin(&inode->i_size_seqcount);
        inode->i_size = i_size;
        write_seqcount_end(&inode->i_size_seqcount);
        preempt_enable();
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
        /* 32-bit preemptible UP: store with preemption disabled */
        preempt_disable();
        inode->i_size = i_size;
        preempt_enable();
#else
        /*
         * Pairs with smp_load_acquire() in i_size_read() to ensure
         * changes related to inode size (such as page contents) are
         * visible before we see the changed inode size.
         */
        smp_store_release(&inode->i_size, i_size);
#endif
}

static inline unsigned iminor(const struct inode *inode)
{
        return MINOR(inode->i_rdev);
}

static inline unsigned imajor(const struct inode *inode)
{
        return MAJOR(inode->i_rdev);
}

/*
 * Owner of a file for signal delivery: who should receive SIGIO (or the
 * signal selected by @signum) and with which credentials the owner was set.
 * Embedded in struct file as f_owner.
 */
struct fown_struct {
        rwlock_t lock;          /* protects pid, uid, euid fields */
        struct pid *pid;        /* pid or -pgrp where SIGIO should be sent */
        enum pid_type pid_type;        /* Kind of process group SIGIO should be sent to */
        kuid_t uid, euid;        /* uid/euid of process setting the owner */
        int signum;                /* posix.1b rt signal to be delivered on IO */
};

/**
 * struct file_ra_state - Track a file's readahead state.
 * @start: Where the most recent readahead started.
 * @size: Number of pages read in the most recent readahead.
 * @async_size: Number of pages that were/are not needed immediately
 *      and so were/are genuinely "ahead".  Start next readahead when
 *      the first of these pages is accessed.
 * @ra_pages: Maximum size of a readahead request, copied from the bdi.
 * @mmap_miss: How many mmap accesses missed in the page cache.
 * @prev_pos: The last byte in the most recent read request.
 *
 * When this structure is passed to ->readahead(), the "most recent"
 * readahead means the current readahead.
 */
struct file_ra_state {
        pgoff_t start;
        unsigned int size;
        unsigned int async_size;
        unsigned int ra_pages;
        unsigned int mmap_miss;
        loff_t prev_pos;
};

/*
 * Check if @index falls in the readahead window, i.e. in the half-open
 * range that starts at ra->start and is ra->size pages long.
 *
 * The upper bound is tested as an offset from ra->start instead of by
 * computing ra->start + ra->size: that sum can wrap around when the window
 * reaches the top of the pgoff_t range, which would make every index look
 * like it lies outside the window.  (Assumes pgoff_t is an unsigned type.)
 *
 * Returns 1 if @index is inside the window, 0 otherwise.
 */
static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
{
        if (index < ra->start)
                return 0;

        /* index >= ra->start here, so the subtraction cannot underflow. */
        return index - ra->start < ra->size;
}

/*
 * f_{lock,count,pos_lock} members can be highly contended and share
 * the same cacheline. f_{lock,mode} are very frequently used together
 * and so share the same cacheline as well. The read-mostly
 * f_{path,inode,op} are kept on a separate cacheline.
 */
struct file {
        union {
                /* fput() uses task work when closing and freeing file (default). */
                struct callback_head         f_task_work;
                /* fput() must use workqueue (most kernel threads). */
                struct llist_node        f_llist;
                unsigned int                 f_iocb_flags;
        };

        /*
         * Protects f_ep, f_flags.
         * Must not be taken from IRQ context.
         */
        spinlock_t                f_lock;
        fmode_t                        f_mode;
        /* reference count: bumped by get_file(), read by file_count() */
        atomic_long_t                f_count;
        struct mutex                f_pos_lock;
        loff_t                        f_pos;
        unsigned int                f_flags;
        struct fown_struct        f_owner;
        const struct cred        *f_cred;
        struct file_ra_state        f_ra;
        struct path                f_path;                /* f_path.dentry is returned by file_dentry() */
        struct inode                *f_inode;        /* cached value */
        const struct file_operations        *f_op;

        u64                        f_version;
#ifdef CONFIG_SECURITY
        void                        *f_security;
#endif
        /* needed for tty driver, and maybe others */
        void                        *private_data;

#ifdef CONFIG_EPOLL
        /* Used by fs/eventpoll.c to link all the hooks to this file */
        struct hlist_head        *f_ep;
#endif /* #ifdef CONFIG_EPOLL */
        struct address_space        *f_mapping;
        errseq_t                f_wb_err;
        errseq_t                f_sb_err; /* for syncfs */
} __randomize_layout
  __attribute__((aligned(4)));        /* lest something weird decides that 2 is OK */

/*
 * Variable-length file handle: a small header followed by @handle_bytes
 * bytes of identifier in the trailing flexible array (see __counted_by).
 */
struct file_handle {
        __u32 handle_bytes;        /* number of bytes in f_handle[] */
        int handle_type;
        /* file identifier */
        unsigned char f_handle[] __counted_by(handle_bytes);
};

/*
 * Take an additional reference on @f by incrementing f_count (relaxed
 * ordering) and return @f so the call can be chained.
 *
 * The caller is expected to already hold a reference: if the counter was
 * zero before the increment, a one-time warning is emitted because the file
 * may already be on its way to being freed.
 */
static inline struct file *get_file(struct file *f)
{
        /* fetch_inc returns the value the counter had before the increment */
        long prior = atomic_long_fetch_inc_relaxed(&f->f_count);
        WARN_ONCE(!prior, "struct file::f_count incremented from zero; use-after-free condition present!\n");
        return f;
}

struct file *get_file_rcu(struct file __rcu **f);
struct file *get_file_active(struct file **f);

/* Current value of @x's f_count reference counter. */
#define file_count(x)        atomic_long_read(&(x)->f_count)

/* 2^31 - 1 */
#define        MAX_NON_LFS        ((1UL<<31) - 1)

/*
 * Page cache limit. The filesystems should put that into their s_maxbytes
 * limits, otherwise bad things can happen in VM.
 *
 * Note that MAX_LFS_FILESIZE is left undefined if BITS_PER_LONG is neither
 * 32 nor 64.
 */
#if BITS_PER_LONG==32
#define MAX_LFS_FILESIZE        ((loff_t)ULONG_MAX << PAGE_SHIFT)
#elif BITS_PER_LONG==64
#define MAX_LFS_FILESIZE         ((loff_t)LLONG_MAX)
#endif

/* legacy typedef, should eventually be removed */
typedef void *fl_owner_t;

struct file_lock;
struct file_lease;

/*
 * The following constant reflects the upper bound of the file/locking space.
 *
 * Both macros are skipped together when OFFSET_MAX is already defined, so
 * OFFT_OFFSET_MAX is only provided here in the case where OFFSET_MAX was not
 * defined earlier.
 */
#ifndef OFFSET_MAX
#define OFFSET_MAX        type_max(loff_t)
#define OFFT_OFFSET_MAX        type_max(off_t)
#endif

extern void send_sigio(struct fown_struct *fown, int fd, int band);

/* Return the inode pointer cached in @f (its f_inode member). */
static inline struct inode *file_inode(const struct file *f)
{
        struct inode *cached = f->f_inode;

        return cached;
}

/*
 * file_dentry() is a relic from the days that overlayfs was using files with a
 * "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs.
 * In those days, file_dentry() was needed to get the underlying fs dentry that
 * matches f_inode.
 * Files with "fake" path should not exist nowadays, so use an assertion to make
 * sure that file_dentry() was not papering over filesystem bugs.
 *
 * Returns @file's f_path.dentry unchanged; the check only warns (once) and
 * does not alter the result.
 */
static inline struct dentry *file_dentry(const struct file *file)
{
        struct dentry *dentry = file->f_path.dentry;

        /* the dentry's inode and the file's cached inode must be the same */
        WARN_ON_ONCE(d_inode(dentry) != file_inode(file));
        return dentry;
}

/*
 * One entry of a singly linked list of files registered for asynchronous
 * notification (see the fasync_*() helpers and kill_fasync()).
 */
struct fasync_struct {
        rwlock_t                fa_lock;
        int                        magic;        /* NOTE(review): presumably FASYNC_MAGIC -- confirm */
        int                        fa_fd;
        struct fasync_struct        *fa_next; /* singly linked list */
        struct file                *fa_file;
        struct rcu_head                fa_rcu;
};

#define FASYNC_MAGIC 0x4601

/* SMP safe fasync helpers: */
extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *);
extern int fasync_remove_entry(struct file *, struct fasync_struct **);
extern struct fasync_struct *fasync_alloc(void);
extern void fasync_free(struct fasync_struct *);

/* can be called from interrupts */
extern void kill_fasync(struct fasync_struct **, int, int);

extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
extern int f_setown(struct file *filp, int who, int force);
extern void f_delown(struct file *filp);
extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct fown_struct *fown);

/*
 * sb->s_flags.  Note that these mirror the equivalent MS_* flags where
 * represented in both.
 *
 * Bit numbers are not contiguous: bits 5, 8, 9, 12-14 and 18-20 are not
 * assigned by any definition in this list.
 */
#define SB_RDONLY       BIT(0)        /* Mount read-only */
#define SB_NOSUID       BIT(1)        /* Ignore suid and sgid bits */
#define SB_NODEV        BIT(2)        /* Disallow access to device special files */
#define SB_NOEXEC       BIT(3)        /* Disallow program execution */
#define SB_SYNCHRONOUS  BIT(4)        /* Writes are synced at once */
#define SB_MANDLOCK     BIT(6)        /* Allow mandatory locks on an FS */
#define SB_DIRSYNC      BIT(7)        /* Directory modifications are synchronous */
#define SB_NOATIME      BIT(10)        /* Do not update access times. */
#define SB_NODIRATIME   BIT(11)        /* Do not update directory access times */
#define SB_SILENT       BIT(15)
#define SB_POSIXACL     BIT(16)        /* Supports POSIX ACLs */
#define SB_INLINECRYPT  BIT(17)        /* Use blk-crypto for encrypted files */
#define SB_KERNMOUNT    BIT(22)        /* this is a kern_mount call */
#define SB_I_VERSION    BIT(23)        /* Update inode I_version field */
#define SB_LAZYTIME     BIT(25)        /* Update the on-disk [acm]times lazily */

/* These sb flags are internal to the kernel */
#define SB_DEAD         BIT(21)
#define SB_DYING        BIT(24)
#define SB_SUBMOUNT     BIT(26)
#define SB_FORCE        BIT(27)
#define SB_NOSEC        BIT(28)
#define SB_BORN         BIT(29)
#define SB_ACTIVE       BIT(30)
#define SB_NOUSER       BIT(31)

/* These flags relate to encoding and casefolding */
#define SB_ENC_STRICT_MODE_FL        (1 << 0)

/*
 * Non-zero when SB_ENC_STRICT_MODE_FL is set in @sb->s_encoding_flags.
 * @sb is a pointer expression; it is parenthesised so that compound
 * arguments such as "p + 1" bind to the "->" as the caller intends.
 */
#define sb_has_strict_encoding(sb) \
        ((sb)->s_encoding_flags & SB_ENC_STRICT_MODE_FL)

/*
 *        Umount options
 */

#define MNT_FORCE        0x00000001        /* Attempt to forcibly umount */
#define MNT_DETACH        0x00000002        /* Just detach from the tree */
#define MNT_EXPIRE        0x00000004        /* Mark for expiry */
#define UMOUNT_NOFOLLOW        0x00000008        /* Don't follow symlink on umount */
#define UMOUNT_UNUSED        0x80000000        /* Flag guaranteed to be unused */

/*
 * sb->s_iflags
 *
 * Internal SB_I_* flags; each is a distinct single bit, assigned
 * consecutively from 0x00000001 up to 0x00001000.
 */
#define SB_I_CGROUPWB        0x00000001        /* cgroup-aware writeback enabled */
#define SB_I_NOEXEC        0x00000002        /* Ignore executables on this fs */
#define SB_I_NODEV        0x00000004        /* Ignore devices on this fs */
#define SB_I_STABLE_WRITES 0x00000008        /* don't modify blks until WB is done */

/* sb->s_iflags to limit user namespace mounts */
#define SB_I_USERNS_VISIBLE                0x00000010 /* fstype already mounted */
#define SB_I_IMA_UNVERIFIABLE_SIGNATURE        0x00000020
#define SB_I_UNTRUSTED_MOUNTER                0x00000040
#define SB_I_EVM_HMAC_UNSUPPORTED        0x00000080

#define SB_I_SKIP_SYNC        0x00000100        /* Skip superblock at global sync */
#define SB_I_PERSB_BDI        0x00000200        /* has a per-sb bdi */
#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
#define SB_I_RETIRED        0x00000800        /* superblock shouldn't be reused */
#define SB_I_NOUMASK        0x00001000        /* VFS does not apply umask */

/* Possible states of 'frozen' field */
enum {
        SB_UNFROZEN = 0,                /* FS is unfrozen */
        SB_FREEZE_WRITE        = 1,                /* Writes, dir ops, ioctls frozen */
        SB_FREEZE_PAGEFAULT = 2,        /* Page faults stopped as well */
        SB_FREEZE_FS = 3,                /* For internal FS use (e.g. to stop
                                         * internal threads if needed) */
        SB_FREEZE_COMPLETE = 4,                /* ->freeze_fs finished successfully */
};

/*
 * Evaluates to 3, i.e. one per state from SB_FREEZE_WRITE to SB_FREEZE_FS;
 * used as the length of sb_writers.rw_sem[].
 */
#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)

/*
 * Freeze bookkeeping of a superblock (embedded as super_block.s_writers):
 * the current freeze state, the number of outstanding freeze requests and
 * one per-cpu rw semaphore per freeze level.
 */
struct sb_writers {
        unsigned short                        frozen;                /* Is sb frozen? */
        int                                freeze_kcount;        /* How many kernel freeze requests? */
        int                                freeze_ucount;        /* How many userspace freeze requests? */
        struct percpu_rw_semaphore        rw_sem[SB_FREEZE_LEVELS];
};

struct super_block {
        struct list_head        s_list;                /* Keep this first */
        dev_t                        s_dev;                /* search index; _not_ kdev_t */
        unsigned char                s_blocksize_bits;
        unsigned long                s_blocksize;
        loff_t                        s_maxbytes;        /* Max file size */
        struct file_system_type        *s_type;
        const struct super_operations        *s_op;
        const struct dquot_operations        *dq_op;
        const struct quotactl_ops        *s_qcop;
        const struct export_operations *s_export_op;
        unsigned long                s_flags;
        unsigned long                s_iflags;        /* internal SB_I_* flags */
        unsigned long                s_magic;
        struct dentry                *s_root;
        struct rw_semaphore        s_umount;
        int                        s_count;
        atomic_t                s_active;
#ifdef CONFIG_SECURITY
        void                    *s_security;
#endif
        const struct xattr_handler * const *s_xattr;
#ifdef CONFIG_FS_ENCRYPTION
        const struct fscrypt_operations        *s_cop;
        struct fscrypt_keyring        *s_master_keys; /* master crypto keys in use */
#endif
#ifdef CONFIG_FS_VERITY
        const struct fsverity_operations *s_vop;
#endif
#if IS_ENABLED(CONFIG_UNICODE)
        struct unicode_map *s_encoding;
        /* tested by sb_has_strict_encoding() */
        __u16 s_encoding_flags;
#endif
        struct hlist_bl_head        s_roots;        /* alternate root dentries for NFS */
        struct list_head        s_mounts;        /* list of mounts; _not_ for fs use */
        struct block_device        *s_bdev;        /* can go away once we use an accessor for @s_bdev_file */
        struct file                *s_bdev_file;
        struct backing_dev_info *s_bdi;
        struct mtd_info                *s_mtd;
        struct hlist_node        s_instances;
        unsigned int                s_quota_types;        /* Bitmask of supported quota types */
        struct quota_info        s_dquot;        /* Diskquota specific options */

        struct sb_writers        s_writers;

        /*
         * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
         * s_fsnotify_info together for cache efficiency. They are frequently
         * accessed and rarely modified.
         */
        void                        *s_fs_info;        /* Filesystem private info */

        /* Granularity of c/m/atime in ns (cannot be worse than a second) */
        u32                        s_time_gran;
        /* Time limits for c/m/atime in seconds */
        time64_t                   s_time_min;
        time64_t                   s_time_max;
#ifdef CONFIG_FSNOTIFY
        __u32                        s_fsnotify_mask;
        struct fsnotify_sb_info        *s_fsnotify_info;
#endif

        /*
         * Q: why are s_id and s_sysfs_name not the same? Both are human
         * readable strings that identify the filesystem.
         * A: s_id is allowed to change at runtime; it's used in log messages,
         * and we want to handle the case where a filesystem starts out on a
         * single device (s_id is the dev name) but then a device is hot added
         * and we have to switch to identifying it by UUID.
         * s_sysfs_name, on the other hand, is a handle for programmatic
         * access, and can't change at runtime.
         */
        char                        s_id[32];        /* Informational name */
        uuid_t                        s_uuid;                /* UUID */
        u8                        s_uuid_len;        /* Default 16, possibly smaller for weird filesystems */

        /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */
        char                        s_sysfs_name[UUID_STRING_LEN + 1];

        unsigned int                s_max_links;

        /*
         * The next field is for VFS *only*. No filesystems have any business
         * even looking at it. You had been warned.
         */
        struct mutex s_vfs_rename_mutex;        /* Kludge */

        /*
         * Filesystem subtype.  If non-empty the filesystem type field
         * in /proc/mounts will be "type.subtype"
         */
        const char *s_subtype;

        const struct dentry_operations *s_d_op; /* default d_op for dentries */

        struct shrinker *s_shrink;        /* per-sb shrinker handle */

        /* Number of inodes with nlink == 0 but still referenced */
        atomic_long_t s_remove_count;

        /* Read-only state of the superblock is being changed */
        int s_readonly_remount;

        /* per-sb errseq_t for reporting writeback errors via syncfs */
        errseq_t s_wb_err;

        /* AIO completions deferred from interrupt context */
        struct workqueue_struct *s_dio_done_wq;
        struct hlist_head s_pins;

        /*
         * Owning user namespace and default context in which to
         * interpret filesystem uids, gids, quotas, device nodes,
         * xattrs and security labels.  Returned by i_user_ns() for every
         * inode of this superblock.
         */
        struct user_namespace *s_user_ns;

        /*
         * The list_lru structure is essentially just a pointer to a table
         * of per-node lru lists, each of which has its own spinlock.
         * There is no need to put them into separate cachelines.
         */
        struct list_lru                s_dentry_lru;
        struct list_lru                s_inode_lru;
        struct rcu_head                rcu;
        struct work_struct        destroy_work;

        struct mutex                s_sync_lock;        /* sync serialisation lock */

        /*
         * Indicates how deep in a filesystem stack this SB is
         */
        int s_stack_depth;

        /* s_inode_list_lock protects s_inodes */
        spinlock_t                s_inode_list_lock ____cacheline_aligned_in_smp;
        struct list_head        s_inodes;        /* all inodes */

        spinlock_t                s_inode_wblist_lock;
        struct list_head        s_inodes_wb;        /* writeback inodes */
} __randomize_layout;

static inline struct user_namespace *i_user_ns(const struct inode *inode)
{
        return inode->i_sb->s_user_ns;
}

/*
 * Helper functions so that in most cases filesystems will
 * not need to deal directly with kuid_t and kgid_t and can
 * instead deal with the raw numeric values that are stored
 * in the filesystem.
 *
 * i_uid_read() converts @inode's i_uid with from_kuid(), using the inode's
 * user namespace.
 */
static inline uid_t i_uid_read(const struct inode *inode)
{
        struct user_namespace *ns = i_user_ns(inode);

        return from_kuid(ns, inode->i_uid);
}

/*
 * Convert @inode's i_gid to a raw gid_t with from_kgid(), using the inode's
 * user namespace.
 */
static inline gid_t i_gid_read(const struct inode *inode)
{
        struct user_namespace *ns = i_user_ns(inode);

        return from_kgid(ns, inode->i_gid);
}

/*
 * Store raw @uid into @inode's i_uid, converting it with make_kuid() in the
 * inode's user namespace.
 */
static inline void i_uid_write(struct inode *inode, uid_t uid)
{
        struct user_namespace *ns = i_user_ns(inode);

        inode->i_uid = make_kuid(ns, uid);
}

/*
 * Store raw @gid into @inode's i_gid, converting it with make_kgid() in the
 * inode's user namespace.
 */
static inline void i_gid_write(struct inode *inode, gid_t gid)
{
        struct user_namespace *ns = i_user_ns(inode);

        inode->i_gid = make_kgid(ns, gid);
}

/**
 * i_uid_into_vfsuid - map an inode's i_uid down according to an idmapping
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode to map
 *
 * Return: the inode's i_uid mapped down according to @idmap.
 * If the inode's i_uid has no mapping INVALID_VFSUID is returned.
 */
static inline vfsuid_t i_uid_into_vfsuid(struct mnt_idmap *idmap,
                                         const struct inode *inode)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        return make_vfsuid(idmap, fs_userns, inode->i_uid);
}

/**
 * i_uid_needs_update - check whether inode's i_uid needs to be updated
 * @idmap: idmap of the mount the inode was found from
 * @attr: the new attributes of @inode
 * @inode: the inode to update
 *
 * Check whether the $inode's i_uid field needs to be updated taking idmapped
 * mounts into account if the filesystem supports it.
 *
 * Return: true if @inode's i_uid field needs to be updated, false if not.
 */
static inline bool i_uid_needs_update(struct mnt_idmap *idmap,
                                      const struct iattr *attr,
                                      const struct inode *inode)
{
        return ((attr->ia_valid & ATTR_UID) &&
                !vfsuid_eq(attr->ia_vfsuid,
                           i_uid_into_vfsuid(idmap, inode)));
}

/**
 * i_uid_update - update @inode's i_uid field
 * @idmap: idmap of the mount the inode was found from
 * @attr: the new attributes of @inode
 * @inode: the inode to update
 *
 * Safely update @inode's i_uid field translating the vfsuid of any idmapped
 * mount into the filesystem kuid.  @inode is left untouched when ATTR_UID is
 * not set in @attr->ia_valid.
 */
static inline void i_uid_update(struct mnt_idmap *idmap,
                                const struct iattr *attr,
                                struct inode *inode)
{
        if (!(attr->ia_valid & ATTR_UID))
                return;

        inode->i_uid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid);
}

/**
 * i_gid_into_vfsgid - translate an inode's i_gid through a mount idmapping
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode whose i_gid is translated
 *
 * Return: the inode's i_gid mapped down according to @idmap, as produced by
 * make_vfsgid(). If the inode's i_gid has no mapping INVALID_VFSGID is
 * returned.
 */
static inline vfsgid_t i_gid_into_vfsgid(struct mnt_idmap *idmap,
                                         const struct inode *inode)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        return make_vfsgid(idmap, fs_userns, inode->i_gid);
}

/**
 * i_gid_needs_update - check whether inode's i_gid needs to be updated
 * @idmap: idmap of the mount the inode was found from
 * @attr: the new attributes of @inode
 * @inode: the inode to update
 *
 * Check whether the $inode's i_gid field needs to be updated taking idmapped
 * mounts into account if the filesystem supports it.
 *
 * Return: true if @inode's i_gid field needs to be updated, false if not.
 */
static inline bool i_gid_needs_update(struct mnt_idmap *idmap,
                                      const struct iattr *attr,
                                      const struct inode *inode)
{
        return ((attr->ia_valid & ATTR_GID) &&
                !vfsgid_eq(attr->ia_vfsgid,
                           i_gid_into_vfsgid(idmap, inode)));
}

/**
 * i_gid_update - write a requested group into @inode's i_gid field
 * @idmap: idmap of the mount the inode was found from
 * @attr: the new attributes of @inode
 * @inode: the inode to update
 *
 * When ATTR_GID is set in @attr->ia_valid, translate @attr->ia_vfsgid with
 * from_vfsgid() into the filesystem kgid and store it in @inode->i_gid.
 * Otherwise @inode is left untouched.
 */
static inline void i_gid_update(struct mnt_idmap *idmap,
                                const struct iattr *attr,
                                struct inode *inode)
{
        if (!(attr->ia_valid & ATTR_GID))
                return;

        inode->i_gid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid);
}

/**
 * inode_fsuid_set - set inode's i_uid field from the caller's fsuid
 * @inode: inode to initialize
 * @idmap: idmap of the mount the inode was found from
 *
 * Store the result of mapped_fsuid() for @idmap and the inode's user
 * namespace in @inode->i_uid. If the inode was found/created via an idmapped
 * mount the caller's fsuid is mapped according to @idmap.
 */
static inline void inode_fsuid_set(struct inode *inode,
                                   struct mnt_idmap *idmap)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        inode->i_uid = mapped_fsuid(idmap, fs_userns);
}

/**
 * inode_fsgid_set - set inode's i_gid field from the caller's fsgid
 * @inode: inode to initialize
 * @idmap: idmap of the mount the inode was found from
 *
 * Store the result of mapped_fsgid() for @idmap and the inode's user
 * namespace in @inode->i_gid. If the inode was found/created via an idmapped
 * mount the caller's fsgid is mapped according to @idmap.
 */
static inline void inode_fsgid_set(struct inode *inode,
                                   struct mnt_idmap *idmap)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        inode->i_gid = mapped_fsgid(idmap, fs_userns);
}

/**
 * fsuidgid_has_mapping() - check whether caller's fsuid/fsgid is mapped
 * @sb: the superblock we want a mapping in
 * @idmap: idmap of the relevant mount
 *
 * Map the caller's fsuid and fsgid through @idmap into the s_user_ns of
 * @sb, then verify that both results are valid ids and that both have a
 * mapping in that namespace. The checks are made in this order: fsuid
 * validity, fsgid validity, fsuid mapping, fsgid mapping; the first failure
 * ends the evaluation.
 *
 * Return: true if fsuid and fsgid is mapped, false if not.
 */
static inline bool fsuidgid_has_mapping(struct super_block *sb,
                                        struct mnt_idmap *idmap)
{
        struct user_namespace *sb_userns = sb->s_user_ns;
        kuid_t fsuid = mapped_fsuid(idmap, sb_userns);
        kgid_t fsgid;

        if (!uid_valid(fsuid))
                return false;

        /* The gid is only mapped once the uid is known to be valid. */
        fsgid = mapped_fsgid(idmap, sb_userns);
        if (!gid_valid(fsgid))
                return false;

        if (!kuid_has_mapping(sb_userns, fsuid))
                return false;

        return kgid_has_mapping(sb_userns, fsgid);
}

/*
 * Timestamp helpers whose bodies are not part of this section; each takes
 * the inode and hands back a struct timespec64.
 */
struct timespec64 current_time(struct inode *inode);
struct timespec64 inode_set_ctime_current(struct inode *inode);

static inline time64_t inode_get_atime_sec(const struct inode *inode)
{
        return inode->__i_atime.tv_sec;
}

static inline long inode_get_atime_nsec(const struct inode *inode)
{
        return inode->__i_atime.tv_nsec;
}

static inline struct timespec64 inode_get_atime(const struct inode *inode)
{
        return inode->__i_atime;
}

static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode,
                                                      struct timespec64 ts)
{
        inode->__i_atime = ts;
        return ts;
}

/*
 * Set @inode's access time to { @sec, @nsec } via inode_set_atime_to_ts()
 * and return the timespec64 that was stored.
 */
static inline struct timespec64 inode_set_atime(struct inode *inode,
                                                time64_t sec, long nsec)
{
        return inode_set_atime_to_ts(inode,
                        (struct timespec64){ .tv_sec = sec, .tv_nsec = nsec });
}

static inline time64_t inode_get_mtime_sec(const struct inode *inode)
{
        return inode->__i_mtime.tv_sec;
}

static inline long inode_get_mtime_nsec(const struct inode *inode)
{
        return inode->__i_mtime.tv_nsec;
}

static inline struct timespec64 inode_get_mtime(const struct inode *inode)
{
        return inode->__i_mtime;
}

static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode,
                                                      struct timespec64 ts)
{
        inode->__i_mtime = ts;
        return ts;
}

/*
 * Set @inode's modification time to { @sec, @nsec } via
 * inode_set_mtime_to_ts() and return the timespec64 that was stored.
 */
static inline struct timespec64 inode_set_mtime(struct inode *inode,
                                                time64_t sec, long nsec)
{
        return inode_set_mtime_to_ts(inode,
                        (struct timespec64){ .tv_sec = sec, .tv_nsec = nsec });
}

static inline time64_t inode_get_ctime_sec(const struct inode *inode)
{
        return inode->__i_ctime.tv_sec;
}

static inline long inode_get_ctime_nsec(const struct inode *inode)
{
        return inode->__i_ctime.tv_nsec;
}

static inline struct timespec64 inode_get_ctime(const struct inode *inode)
{
        return inode->__i_ctime;
}

static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode,
                                                      struct timespec64 ts)
{
        inode->__i_ctime = ts;
        return ts;
}

/**
 * inode_set_ctime - set the ctime in the inode
 * @inode: inode in which to set the ctime
 * @sec: tv_sec value to set
 * @nsec: tv_nsec value to set
 *
 * Set the ctime in @inode to { @sec, @nsec } by way of
 * inode_set_ctime_to_ts().
 *
 * Return: the timespec64 that was stored.
 */
static inline struct timespec64 inode_set_ctime(struct inode *inode,
                                                time64_t sec, long nsec)
{
        return inode_set_ctime_to_ts(inode,
                        (struct timespec64){ .tv_sec = sec, .tv_nsec = nsec });
}

/* Prototype only; takes @inode and returns a struct timespec64. */
struct timespec64 simple_inode_init_ts(struct inode *inode);

/*
 * Snapshotting support.
 */

/*
 * These are internal functions, please use sb_start_{write,pagefault,intwrite}
 * instead.
 */
/*
 * Release the read side of the per-cpu rwsem for freeze level @level.
 * Levels are 1-based, so slot @level - 1 of s_writers.rw_sem is used.
 */
static inline void __sb_end_write(struct super_block *sb, int level)
{
        percpu_up_read(&sb->s_writers.rw_sem[level - 1]);
}

/*
 * Take the read side of the per-cpu rwsem for freeze level @level.
 * Levels are 1-based, so slot @level - 1 of s_writers.rw_sem is used.
 */
static inline void __sb_start_write(struct super_block *sb, int level)
{
        percpu_down_read(&sb->s_writers.rw_sem[level - 1]);
}

/*
 * Try to take the read side of the per-cpu rwsem for freeze level @level
 * (slot @level - 1 of s_writers.rw_sem) and report the result of
 * percpu_down_read_trylock() as a bool.
 */
static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
{
        return percpu_down_read_trylock(&sb->s_writers.rw_sem[level - 1]);
}

/*
 * Thin wrappers that pass the rwsem of freeze level @lev (slot @lev - 1 of
 * @sb->s_writers.rw_sem) to percpu_rwsem_acquire() / percpu_rwsem_release(),
 * together with the constant 1 and the caller's _THIS_IP_.
 */
#define __sb_writers_acquired(sb, lev)        \
        percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
#define __sb_writers_release(sb, lev)        \
        percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)

/**
 * __sb_write_started - check if sb freeze level is held
 * @sb: the super we write to
 * @level: the freeze level
 *
 * Forwards the rwsem of @level (slot @level - 1 of s_writers.rw_sem) to
 * lockdep_is_held_type() and returns its answer unchanged:
 *
 * * > 0 - sb freeze level is held
 * *   0 - sb freeze level is not held
 * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
 */
static inline int __sb_write_started(const struct super_block *sb, int level)
{
        return lockdep_is_held_type(&sb->s_writers.rw_sem[level - 1], 1);
}

/**
 * sb_write_started - check if SB_FREEZE_WRITE is held
 * @sb: the super we write to
 *
 * Any non-zero answer from __sb_write_started() counts as "held", so this
 * may be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
 */
static inline bool sb_write_started(const struct super_block *sb)
{
        return __sb_write_started(sb, SB_FREEZE_WRITE) != 0;
}

/**
 * sb_write_not_started - check if SB_FREEZE_WRITE is not held
 * @sb: the super we write to
 *
 * Both "not held" (0) and "unknown" (< 0) answers from __sb_write_started()
 * count as "not started", so this may be false positive with
 * !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
 */
static inline bool sb_write_not_started(const struct super_block *sb)
{
        int held = __sb_write_started(sb, SB_FREEZE_WRITE);

        return held <= 0;
}

/**
 * file_write_started - check if SB_FREEZE_WRITE is held
 * @file: the file we write to
 *
 * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
 * May be false positive with !S_ISREG, because file_start_write() has
 * no effect on !S_ISREG.
 */
static inline bool file_write_started(const struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return true;
        return sb_write_started(file_inode(file)->i_sb);
}

/**
 * file_write_not_started - check if SB_FREEZE_WRITE is not held
 * @file: the file we write to
 *
 * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
 * May be false positive with !S_ISREG, because file_start_write() has
 * no effect on !S_ISREG.
 */
static inline bool file_write_not_started(const struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return true;
        return sb_write_not_started(file_inode(file)->i_sb);
}

/**
 * sb_end_write - drop write access to a superblock
 * @sb: the super we wrote to
 *
 * Releases the SB_FREEZE_WRITE level: decrement number of writers to the
 * filesystem and wake up possible waiters wanting to freeze the filesystem.
 */
static inline void sb_end_write(struct super_block *sb)
{
        const int level = SB_FREEZE_WRITE;

        __sb_end_write(sb, level);
}

/**
 * sb_end_pagefault - drop write access to a superblock from a page fault
 * @sb: the super we wrote to
 *
 * Releases the SB_FREEZE_PAGEFAULT level: decrement number of processes
 * handling write page fault to the filesystem and wake up possible waiters
 * wanting to freeze the filesystem.
 */
static inline void sb_end_pagefault(struct super_block *sb)
{
        const int level = SB_FREEZE_PAGEFAULT;

        __sb_end_write(sb, level);
}

/**
 * sb_end_intwrite - drop write access to a superblock for internal fs purposes
 * @sb: the super we wrote to
 *
 * Releases the SB_FREEZE_FS level: decrement fs-internal number of writers
 * to the filesystem and wake up possible waiters wanting to freeze the
 * filesystem.
 */
static inline void sb_end_intwrite(struct super_block *sb)
{
        const int level = SB_FREEZE_FS;

        __sb_end_write(sb, level);
}

/**
 * sb_start_write - get write access to a superblock
 * @sb: the super we write to
 *
 * A process that wants to write data or metadata to a file system (i.e.
 * dirty a page or an inode) should bracket the operation with
 * sb_start_write() and sb_end_write() to get exclusion against file system
 * freezing. This function increments the number of writers, which prevents
 * freezing. If the file system is already frozen, the function waits until
 * the file system is thawed.
 *
 * Freeze protection behaves as a lock, so users have to preserve the
 * ordering of freeze protection and other filesystem locks. Generally,
 * freeze protection should be the outermost lock. In particular, we have:
 *
 * sb_start_write
 *   -> i_mutex                        (write path, truncate, directory ops, ...)
 *   -> s_umount                (freeze_super, thaw_super)
 */
static inline void sb_start_write(struct super_block *sb)
{
        const int level = SB_FREEZE_WRITE;

        __sb_start_write(sb, level);
}

/*
 * Non-blocking variant of sb_start_write(): returns the result of
 * __sb_start_write_trylock() for the SB_FREEZE_WRITE level.
 */
static inline bool sb_start_write_trylock(struct super_block *sb)
{
        bool acquired = __sb_start_write_trylock(sb, SB_FREEZE_WRITE);

        return acquired;
}

/**
 * sb_start_pagefault - get write access to a superblock from a page fault
 * @sb: the super we write to
 *
 * A process that starts handling a write page fault should bracket the
 * operation with sb_start_pagefault() and sb_end_pagefault() to get
 * exclusion against file system freezing. This is needed since the page
 * fault is going to dirty a page. This function increments the number of
 * running page faults, which prevents freezing. If the file system is
 * already frozen, the function waits until the file system is thawed.
 *
 * Page fault freeze protection behaves as a lock, so users have to preserve
 * the ordering of freeze protection and other filesystem locks. It is
 * advised to put sb_start_pagefault() close to mmap_lock in lock ordering.
 * Page fault handling code implies lock dependency:
 *
 * mmap_lock
 *   -> sb_start_pagefault
 */
static inline void sb_start_pagefault(struct super_block *sb)
{
        const int level = SB_FREEZE_PAGEFAULT;

        __sb_start_write(sb, level);
}

/**
 * sb_start_intwrite - get write access to a superblock for internal fs purposes
 * @sb: the super we write to
 *
 * This is the third level of protection against filesystem freezing
 * (SB_FREEZE_FS). It is free for use by a filesystem. The only requirement
 * is that it must rank below sb_start_pagefault.
 *
 * For example a filesystem can call sb_start_intwrite() when starting a
 * transaction, which somewhat eases handling of freezing for internal
 * sources of filesystem changes (internal fs threads, discarding
 * preallocation on file close, etc.).
 */
static inline void sb_start_intwrite(struct super_block *sb)
{
        const int level = SB_FREEZE_FS;

        __sb_start_write(sb, level);
}

/*
 * Non-blocking variant of sb_start_intwrite(): returns the result of
 * __sb_start_write_trylock() for the SB_FREEZE_FS level.
 */
static inline bool sb_start_intwrite_trylock(struct super_block *sb)
{
        bool acquired = __sb_start_write_trylock(sb, SB_FREEZE_FS);

        return acquired;
}

/*
 * Prototype only. Judging by the name this reports whether the caller owns
 * @inode or is suitably capable, evaluated under @idmap -- see the
 * definition for the exact rules.
 */
bool inode_owner_or_capable(struct mnt_idmap *idmap,
                            const struct inode *inode);

/*
 * VFS helper functions.
 *
 * Only the prototypes appear here. Each returns an int and, with the
 * exception of vfs_link(), takes the mount's idmap as its first argument
 * (vfs_link() takes a dentry first and the idmap second).
 */
int vfs_create(struct mnt_idmap *, struct inode *,
               struct dentry *, umode_t, bool);
int vfs_mkdir(struct mnt_idmap *, struct inode *,
              struct dentry *, umode_t);
int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
              umode_t, dev_t);
int vfs_symlink(struct mnt_idmap *, struct inode *,
                struct dentry *, const char *);
int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *,
             struct dentry *, struct inode **);
int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *);
int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *,
               struct inode **);

/**
 * struct renamedata - contains all information required for renaming
 * @old_mnt_idmap:     idmap of the old mount the inode was found from
 * @old_dir:           parent of source
 * @old_dentry:        source
 * @new_mnt_idmap:     idmap of the new mount the inode was found from
 * @new_dir:           parent of destination
 * @new_dentry:        destination
 * @delegated_inode:   returns an inode needing a delegation break
 * @flags:             rename flags
 *
 * The structure carries the __randomize_layout annotation, so code should
 * not rely on the declaration order of the members.
 */
struct renamedata {
        struct mnt_idmap *old_mnt_idmap;
        struct inode *old_dir;
        struct dentry *old_dentry;
        struct mnt_idmap *new_mnt_idmap;
        struct inode *new_dir;
        struct dentry *new_dentry;
        struct inode **delegated_inode;
        unsigned int flags;
} __randomize_layout;

/* Prototype only; all arguments are passed in one struct renamedata. */
int vfs_rename(struct renamedata *);

static inline int vfs_whiteout(struct mnt_idmap *idmap,
                               struct inode *dir, struct dentry *dentry)
{
        return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE,
                         WHITEOUT_DEV);
}

/*
 * In-kernel open helpers (prototypes only): both return a struct file
 * pointer and take the credentials to use as their last argument.
 */
struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
                                 const struct path *parentpath,
                                 umode_t mode, int open_flag,
                                 const struct cred *cred);
struct file *kernel_file_open(const struct path *path, int flags,
                              const struct cred *cred);

/*
 * @f is a caller-supplied callback taking (dentry, mode, opaque pointer);
 * presumably the trailing void * argument is what gets handed to it --
 * confirm against the definition.
 */
int vfs_mkobj(struct dentry *, umode_t,
                int (*f)(struct dentry *, umode_t, void *),
                void *);

/* File-based attribute helpers; prototypes only, each returns an int. */
int vfs_fchown(struct file *file, uid_t user, gid_t group);
int vfs_fchmod(struct file *file, umode_t mode);
int vfs_utimes(const struct path *path, struct timespec64 *times);

extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);

/*
 * compat_ptr_ioctl is a real function only when CONFIG_COMPAT is set.
 * Otherwise the name expands to NULL, so code that mentions it still
 * compiles without the function existing.
 */
#ifdef CONFIG_COMPAT
extern long compat_ptr_ioctl(struct file *file, unsigned int cmd,
                                        unsigned long arg);
#else
#define compat_ptr_ioctl NULL
#endif

/*
 * VFS file helper functions.
 *
 * Prototypes only; the definitions are not part of this section.
 */
void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode,
                      const struct inode *dir, umode_t mode);
extern bool may_open_dev(const struct path *path);
/* Takes a mode and returns a umode_t; see the definition for the rules. */
umode_t mode_strip_sgid(struct mnt_idmap *idmap,
                        const struct inode *dir, umode_t mode);

/*
 * This is the "filldir" function type, used by readdir() to let
 * the kernel specify what kind of dirent layout it wants to have.
 * This allows the kernel to read directories into kernel space or
 * to have different dirent layouts depending on the binary type.
 * Return 'true' to keep going and 'false' if there are no more entries.
 *
 * The callback receives the dir_context it belongs to as its first
 * argument. NOTE(review): the remaining arguments are unnamed here;
 * presumably name, name length, offset, inode number and entry type --
 * confirm against an implementation.
 */
struct dir_context;
typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64,
                         unsigned);

/* Pairs a filldir_t callback with a loff_t position. */
struct dir_context {
        filldir_t actor;        /* callback of the filldir_t type above */
        loff_t pos;             /* presumably the current directory position */
};

/*
 * These flags let !MMU mmap() govern direct device mapping vs immediate
 * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
 *
 * NOMMU_MAP_COPY:        Copy can be mapped (MAP_PRIVATE)
 * NOMMU_MAP_DIRECT:        Can be mapped directly (MAP_SHARED)
 * NOMMU_MAP_READ:        Can be mapped for reading
 * NOMMU_MAP_WRITE:        Can be mapped for writing
 * NOMMU_MAP_EXEC:        Can be mapped for execution
 *
 * The READ/WRITE/EXEC capabilities are aliases of the VM_MAY* flags, while
 * COPY and DIRECT have their own literal values.
 */
#define NOMMU_MAP_COPY                0x00000001
#define NOMMU_MAP_DIRECT        0x00000008
#define NOMMU_MAP_READ                VM_MAYREAD
#define NOMMU_MAP_WRITE                VM_MAYWRITE
#define NOMMU_MAP_EXEC                VM_MAYEXEC

/* The three capabilities that are expressed as VM_MAY* flags. */
#define NOMMU_VMFLAGS \
        (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC)

/*
 * These flags control the behavior of the remap_file_range function pointer.
 * If it is called with len == 0 that means "remap to end of source file".
 * See Documentation/filesystems/vfs.rst for more details about this call.
 *
 * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate)
 * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request
 *
 * Each flag is a separate bit, so they can be combined in one value.
 */
#define REMAP_FILE_DEDUP                (1 << 0)
#define REMAP_FILE_CAN_SHORTEN                (1 << 1)

/*
 * These flags signal that the caller is ok with altering various aspects of
 * the behavior of the remap operation.  The changes must be made by the
 * implementation; the vfs remap helper functions can take advantage of them.
 * Flags in this category exist to preserve the quirky behavior of the hoisted
 * btrfs clone/dedupe ioctls.
 *
 * At present the category contains only REMAP_FILE_CAN_SHORTEN.
 */
#define REMAP_FILE_ADVISORY                (REMAP_FILE_CAN_SHORTEN)

/*
 * These flags control the behavior of vfs_copy_file_range().
 * They are not available to the user via syscall.
 *
 * COPY_FILE_SPLICE: call splice direct instead of fs clone/copy ops
 *
 * Only bit 0 is defined so far.
 */
#define COPY_FILE_SPLICE                (1 << 0)

/*
 * Forward declarations: the operation tables below only use pointers to
 * these types, so their full definitions are not needed here.
 */
struct iov_iter;
struct io_uring_cmd;
struct offset_ctx;

/* __bitwise type of the fop_flags member of struct file_operations. */
typedef unsigned int __bitwise fop_flags_t;

/*
 * struct file_operations - table of per-file method pointers
 *
 * @owner and @fop_flags are plain data members; every other member is a
 * function pointer. @fop_flags has type fop_flags_t, the type the FOP_*
 * constants are cast to. call_mmap() invokes ->mmap through a file's f_op
 * pointer.
 *
 * The structure carries the __randomize_layout annotation, so code should
 * not rely on the declaration order of the members.
 */
struct file_operations {
        struct module *owner;
        fop_flags_t fop_flags;
        loff_t (*llseek) (struct file *, loff_t, int);
        ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
        ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
        ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
        int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *,
                        unsigned int flags);
        int (*iterate_shared) (struct file *, struct dir_context *);
        __poll_t (*poll) (struct file *, struct poll_table_struct *);
        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
        long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
        int (*mmap) (struct file *, struct vm_area_struct *);
        int (*open) (struct inode *, struct file *);
        int (*flush) (struct file *, fl_owner_t id);
        int (*release) (struct inode *, struct file *);
        int (*fsync) (struct file *, loff_t, loff_t, int datasync);
        int (*fasync) (int, struct file *, int);
        int (*lock) (struct file *, int, struct file_lock *);
        unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
        int (*check_flags)(int);
        int (*flock) (struct file *, int, struct file_lock *);
        ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
        ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
        void (*splice_eof)(struct file *file);
        int (*setlease)(struct file *, int, struct file_lease **, void **);
        long (*fallocate)(struct file *file, int mode, loff_t offset,
                          loff_t len);
        void (*show_fdinfo)(struct seq_file *m, struct file *f);
        /* This member exists only in builds without CONFIG_MMU. */
#ifndef CONFIG_MMU
        unsigned (*mmap_capabilities)(struct file *);
#endif
        ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
                        loff_t, size_t, unsigned int);
        loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   loff_t len, unsigned int remap_flags);
        int (*fadvise)(struct file *, loff_t, loff_t, int);
        int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
        int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
                                unsigned int poll_flags);
} __randomize_layout;

/*
 * FOP_* capability bits. Each is a single bit cast with __force to the
 * __bitwise fop_flags_t, the type of file_operations.fop_flags.
 */
/* Supports async buffered reads */
#define FOP_BUFFER_RASYNC        ((__force fop_flags_t)(1 << 0))
/* Supports async buffered writes */
#define FOP_BUFFER_WASYNC        ((__force fop_flags_t)(1 << 1))
/* Supports synchronous page faults for mappings */
#define FOP_MMAP_SYNC                ((__force fop_flags_t)(1 << 2))
/* Supports non-exclusive O_DIRECT writes from multiple threads */
#define FOP_DIO_PARALLEL_WRITE        ((__force fop_flags_t)(1 << 3))
/* Contains huge pages */
#define FOP_HUGE_PAGES                ((__force fop_flags_t)(1 << 4))

/* Wrap a directory iterator that needs exclusive inode access */
int wrap_directory_iterator(struct file *, struct dir_context *,
                            int (*) (struct file *, struct dir_context *));
/*
 * WRAP_DIR_ITER(x) defines a static function named shared_x, taking
 * (struct file *, struct dir_context *), that forwards its arguments and
 * the iterator @x to wrap_directory_iterator() and returns its result.
 */
#define WRAP_DIR_ITER(x) \
        static int shared_##x(struct file *file , struct dir_context *ctx) \
        { return wrap_directory_iterator(file, ctx, x); }

/*
 * struct inode_operations - table of per-inode method pointers
 *
 * Every member is a function pointer. Several methods take the mount's
 * struct mnt_idmap as their first argument, others (e.g. ->link, ->unlink,
 * ->rmdir) do not. The structure is declared ____cacheline_aligned.
 */
struct inode_operations {
        struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
        const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
        int (*permission) (struct mnt_idmap *, struct inode *, int);
        struct posix_acl * (*get_inode_acl)(struct inode *, int, bool);

        int (*readlink) (struct dentry *, char __user *,int);

        int (*create) (struct mnt_idmap *, struct inode *,struct dentry *,
                       umode_t, bool);
        int (*link) (struct dentry *,struct inode *,struct dentry *);
        int (*unlink) (struct inode *,struct dentry *);
        int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *,
                        const char *);
        int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *,
                      umode_t);
        int (*rmdir) (struct inode *,struct dentry *);
        int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *,
                      umode_t,dev_t);
        int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *,
                        struct inode *, struct dentry *, unsigned int);
        int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *);
        int (*getattr) (struct mnt_idmap *, const struct path *,
                        struct kstat *, u32, unsigned int);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
        int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
                      u64 len);
        int (*update_time)(struct inode *, int);
        int (*atomic_open)(struct inode *, struct dentry *,
                           struct file *, unsigned open_flag,
                           umode_t create_mode);
        int (*tmpfile) (struct mnt_idmap *, struct inode *,
                        struct file *, umode_t);
        struct posix_acl *(*get_acl)(struct mnt_idmap *, struct dentry *,
                                     int);
        int (*set_acl)(struct mnt_idmap *, struct dentry *,
                       struct posix_acl *, int);
        int (*fileattr_set)(struct mnt_idmap *idmap,
                            struct dentry *dentry, struct fileattr *fa);
        int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
        struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
} ____cacheline_aligned;

static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
{
        return file->f_op->mmap(file, vma);
}

/*
 * Read/write, copy and remap entry points (prototypes only).
 *
 * The remap, clone and dedupe helpers describe their work as a source
 * (file, position) pair, a destination (file, position) pair, a length and
 * a flags word named remap_flags (presumably REMAP_FILE_* values -- confirm
 * against the definitions). The *_prep() variants take the length by
 * pointer.
 */
extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
                                   loff_t, size_t, unsigned int);
int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write);
int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                                    struct file *file_out, loff_t pos_out,
                                    loff_t *len, unsigned int remap_flags,
                                    const struct iomap_ops *dax_read_ops);
int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                                  struct file *file_out, loff_t pos_out,
                                  loff_t *count, unsigned int remap_flags);
extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   loff_t len, unsigned int remap_flags);
extern int vfs_dedupe_file_range(struct file *file,
                                 struct file_dedupe_range *same);
extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
                                        struct file *dst_file, loff_t dst_pos,
                                        loff_t len, unsigned int remap_flags);

/**
 * enum freeze_holder - holder of the freeze
 * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem
 * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem
 * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed
 *
 * Indicate who the owner of the freeze or thaw request is and whether
 * the freeze needs to be exclusive or can nest.
 * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the
 * same holder aren't allowed. It is however allowed to hold a single
 * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at
 * the same time. This is relied upon by some filesystems during online
 * repair or similar.
 *
 * Each enumerator is a distinct bit (1U << n), so a holder value and
 * @FREEZE_MAY_NEST can be OR-ed together in one argument.
 */
enum freeze_holder {
        FREEZE_HOLDER_KERNEL        = (1U << 0),
        FREEZE_HOLDER_USERSPACE        = (1U << 1),
        FREEZE_MAY_NEST                = (1U << 2),
};

/*
 * struct super_operations - table of per-superblock method pointers
 *
 * Every member is a function pointer. ->freeze_super and ->thaw_super take
 * an enum freeze_holder identifying who issued the request. The quota
 * methods exist only when CONFIG_QUOTA is set.
 */
struct super_operations {
           struct inode *(*alloc_inode)(struct super_block *sb);
        void (*destroy_inode)(struct inode *);
        void (*free_inode)(struct inode *);

           void (*dirty_inode) (struct inode *, int flags);
        int (*write_inode) (struct inode *, struct writeback_control *wbc);
        int (*drop_inode) (struct inode *);
        void (*evict_inode) (struct inode *);
        void (*put_super) (struct super_block *);
        int (*sync_fs)(struct super_block *sb, int wait);
        int (*freeze_super) (struct super_block *, enum freeze_holder who);
        int (*freeze_fs) (struct super_block *);
        int (*thaw_super) (struct super_block *, enum freeze_holder who);
        int (*unfreeze_fs) (struct super_block *);
        int (*statfs) (struct dentry *, struct kstatfs *);
        int (*remount_fs) (struct super_block *, int *, char *);
        void (*umount_begin) (struct super_block *);

        /* seq_file based reporting hooks */
        int (*show_options)(struct seq_file *, struct dentry *);
        int (*show_devname)(struct seq_file *, struct dentry *);
        int (*show_path)(struct seq_file *, struct dentry *);
        int (*show_stats)(struct seq_file *, struct dentry *);
#ifdef CONFIG_QUOTA
        ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
        ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
        struct dquot __rcu **(*get_dquots)(struct inode *);
#endif
        long (*nr_cached_objects)(struct super_block *,
                                  struct shrink_control *);
        long (*free_cached_objects)(struct super_block *,
                                    struct shrink_control *);
        void (*shutdown)(struct super_block *sb);
};

/*
 * Inode flags - they have no relation to superblock flags now
 *
 * Each S_* value is a single bit that the IS_*() predicates test against
 * inode->i_flags. S_DAX is the exception: without CONFIG_FS_DAX it is 0, so
 * any test against it is constant false.
 */
#define S_SYNC                (1 << 0)  /* Writes are synced at once */
#define S_NOATIME        (1 << 1)  /* Do not update access times */
#define S_APPEND        (1 << 2)  /* Append-only file */
#define S_IMMUTABLE        (1 << 3)  /* Immutable file */
#define S_DEAD                (1 << 4)  /* removed, but still open directory */
#define S_NOQUOTA        (1 << 5)  /* Inode is not counted to quota */
#define S_DIRSYNC        (1 << 6)  /* Directory modifications are synchronous */
#define S_NOCMTIME        (1 << 7)  /* Do not update file c/mtime */
#define S_SWAPFILE        (1 << 8)  /* Do not truncate: swapon got its bmaps */
#define S_PRIVATE        (1 << 9)  /* Inode is fs-internal */
#define S_IMA                (1 << 10) /* Inode has an associated IMA struct */
#define S_AUTOMOUNT        (1 << 11) /* Automount/referral quasi-directory */
#define S_NOSEC                (1 << 12) /* no suid or xattr security attributes */
#ifdef CONFIG_FS_DAX
#define S_DAX                (1 << 13) /* Direct Access, avoiding the page cache */
#else
#define S_DAX                0          /* Make all the DAX code disappear */
#endif
#define S_ENCRYPTED        (1 << 14) /* Encrypted file (using fs/crypto/) */
#define S_CASEFOLD        (1 << 15) /* Casefolded file */
#define S_VERITY        (1 << 16) /* Verity file (using fs/verity/) */
#define S_KERNEL_FILE        (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */

/*
 * Note that nosuid etc flags are inode-specific: setting some file-system
 * flags just means all the inodes inherit those flags by default. It might be
 * possible to override it selectively if you really wanted to with some
 * ioctl() that is not currently implemented.
 *
 * Exception: SB_RDONLY is always applied to the entire file system.
 *
 * Unfortunately, it is possible to change a filesystems flags with it mounted
 * with files in use.  This means that all of the inodes will not have their
 * i_flags updated.  Hence, i_flags no longer inherit the superblock mount
 * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org
 */
/* Mask the s_flags of the superblock owning @inode with @flg (non-zero if any bit is set). */
#define __IS_FLG(inode, flg)        ((inode)->i_sb->s_flags & (flg))

/* Report whether SB_RDONLY is set in @sb->s_flags. */
static inline bool sb_rdonly(const struct super_block *sb)
{
        return (sb->s_flags & SB_RDONLY) != 0;
}
/* Read-only state is taken from the superblock, not from i_flags. */
#define IS_RDONLY(inode)        sb_rdonly((inode)->i_sb)
/* True if the superblock has SB_SYNCHRONOUS or the inode has S_SYNC. */
#define IS_SYNC(inode)                (__IS_FLG(inode, SB_SYNCHRONOUS) || \
                                        ((inode)->i_flags & S_SYNC))
/* As IS_SYNC(), but SB_DIRSYNC / S_DIRSYNC also satisfy the test. */
#define IS_DIRSYNC(inode)        (__IS_FLG(inode, SB_SYNCHRONOUS|SB_DIRSYNC) || \
                                        ((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
#define IS_MANDLOCK(inode)        __IS_FLG(inode, SB_MANDLOCK)
/* Superblock-only test: non-zero if SB_RDONLY or SB_NOATIME is set. */
#define IS_NOATIME(inode)        __IS_FLG(inode, SB_RDONLY|SB_NOATIME)
#define IS_I_VERSION(inode)        __IS_FLG(inode, SB_I_VERSION)

/*
 * Per-inode tests: each yields the masked i_flags bit(s), i.e. non-zero when
 * the flag is set (not normalised to 0/1).
 */
#define IS_NOQUOTA(inode)        ((inode)->i_flags & S_NOQUOTA)
#define IS_APPEND(inode)        ((inode)->i_flags & S_APPEND)
#define IS_IMMUTABLE(inode)        ((inode)->i_flags & S_IMMUTABLE)

/* POSIX ACL support is a superblock flag; constant 0 without CONFIG_FS_POSIX_ACL. */
#ifdef CONFIG_FS_POSIX_ACL
#define IS_POSIXACL(inode)        __IS_FLG(inode, SB_POSIXACL)
#else
#define IS_POSIXACL(inode)        0
#endif

#define IS_DEADDIR(inode)        ((inode)->i_flags & S_DEAD)
#define IS_NOCMTIME(inode)        ((inode)->i_flags & S_NOCMTIME)

/*
 * Without CONFIG_SWAP the macro still evaluates @inode (cast to void) and
 * then yields an unsigned zero.
 */
#ifdef CONFIG_SWAP
#define IS_SWAPFILE(inode)        ((inode)->i_flags & S_SWAPFILE)
#else
#define IS_SWAPFILE(inode)        ((void)(inode), 0U)
#endif

#define IS_PRIVATE(inode)        ((inode)->i_flags & S_PRIVATE)
#define IS_IMA(inode)                ((inode)->i_flags & S_IMA)
#define IS_AUTOMOUNT(inode)        ((inode)->i_flags & S_AUTOMOUNT)
#define IS_NOSEC(inode)                ((inode)->i_flags & S_NOSEC)
/* Constant zero when S_DAX is defined as 0 (CONFIG_FS_DAX disabled). */
#define IS_DAX(inode)                ((inode)->i_flags & S_DAX)
#define IS_ENCRYPTED(inode)        ((inode)->i_flags & S_ENCRYPTED)
#define IS_CASEFOLDED(inode)        ((inode)->i_flags & S_CASEFOLD)
#define IS_VERITY(inode)        ((inode)->i_flags & S_VERITY)

/*
 * True when @inode is a character device whose device number is WHITEOUT_DEV.
 * Every use of the argument is parenthesised so that expressions such as
 * "p + 1" or "cond ? a : b" expand correctly.  Note that @inode is evaluated
 * more than once, so it must not have side effects.
 */
#define IS_WHITEOUT(inode)        (S_ISCHR((inode)->i_mode) && \
                                 (inode)->i_rdev == WHITEOUT_DEV)

static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap,
                                   struct inode *inode)
{
        return !vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) ||
               !vfsgid_valid(i_gid_into_vfsgid(idmap, inode));
}

static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
{
        *kiocb = (struct kiocb) {
                .ki_filp = filp,
                .ki_flags = filp->f_iocb_flags,
                .ki_ioprio = get_current_ioprio(),
        };
}

static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
                               struct file *filp)
{
        *kiocb = (struct kiocb) {
                .ki_filp = filp,
                .ki_flags = kiocb_src->ki_flags,
                .ki_ioprio = kiocb_src->ki_ioprio,
                .ki_pos = kiocb_src->ki_pos,
        };
}

/*
 * Inode state bits.  Protected by inode->i_lock
 *
 * Four bits determine the dirty state of the inode: I_DIRTY_SYNC,
 * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME.
 *
 * Four bits define the lifetime of an inode.  Initially, inodes are I_NEW,
 * until that flag is cleared.  I_WILL_FREE, I_FREEING and I_CLEAR are set at
 * various stages of removing an inode.
 *
 * Two bits are used for locking and completion notification, I_NEW and I_SYNC.
 *
 * I_DIRTY_SYNC                Inode is dirty, but doesn't have to be written on
 *                        fdatasync() (unless I_DIRTY_DATASYNC is also set).
 *                        Timestamp updates are the usual cause.
 * I_DIRTY_DATASYNC        Data-related inode changes pending.  We keep track of
 *                        these changes separately from I_DIRTY_SYNC so that we
 *                        don't have to write inode on fdatasync() when only
 *                        e.g. the timestamps have changed.
 * I_DIRTY_PAGES        Inode has dirty pages.  Inode itself may be clean.
 * I_DIRTY_TIME                The inode itself has dirty timestamps, and the
 *                        lazytime mount option is enabled.  We keep track of this
 *                        separately from I_DIRTY_SYNC in order to implement
 *                        lazytime.  This gets cleared if I_DIRTY_INODE
 *                        (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But
 *                        I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already
 *                        in place because writeback might already be in progress
 *                        and we don't want to lose the time update
 * I_NEW                Serves as both a mutex and completion notification.
 *                        New inodes set I_NEW.  If two processes both create
 *                        the same inode, one of them will release its inode and
 *                        wait for I_NEW to be released before returning.
 *                        Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can
 *                        also cause waiting on I_NEW, without I_NEW actually
 *                        being set.  find_inode() uses this to prevent returning
 *                        nearly-dead inodes.
 * I_WILL_FREE                Must be set when calling write_inode_now() if i_count
 *                        is zero.  I_FREEING must be set when I_WILL_FREE is
 *                        cleared.
 * I_FREEING                Set when inode is about to be freed but still has dirty
 *                        pages or buffers attached or the inode itself is still
 *                        dirty.
 * I_CLEAR                Added by clear_inode().  In this state the inode is
 *                        clean and can be destroyed.  Inode keeps I_FREEING.
 *
 *                        Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
 *                        prohibited for many purposes.  iget() must wait for
 *                        the inode to be completely released, then create it
 *                        anew.  Other functions will just ignore such inodes,
 *                        if appropriate.  I_NEW is used for waiting.
 *
 * I_SYNC                Writeback of inode is running. The bit is set during
 *                        data writeback, and cleared with a wakeup on the bit
 *                        address once it is done. The bit is also used to pin
 *                        the inode in memory for flusher thread.
 *
 * I_REFERENCED                Marks the inode as recently referenced on the LRU list.
 *
 * I_DIO_WAKEUP                Never set.  Only used as a key for wait_on_bit().
 *
 * I_WB_SWITCH                Cgroup bdi_writeback switching in progress.  Used to
 *                        synchronize competing switching instances and to tell
 *                        wb stat updates to grab the i_pages lock.  See
 *                        inode_switch_wbs_work_fn() for details.
 *
 * I_OVL_INUSE                Used by overlayfs to get exclusive ownership on upper
 *                        and work dirs among overlayfs mounts.
 *
 * I_CREATING                New object's inode in the middle of setting up.
 *
 * I_DONTCACHE                Evict inode as soon as it is not used anymore.
 *
 * I_SYNC_QUEUED        Inode is queued in b_io or b_more_io writeback lists.
 *                        Used to detect that mark_inode_dirty() should not move
 *                        inode between dirty lists.
 *
 * I_PINNING_NETFS_WB        Inode is pinning an fscache object for writeback.
 *
 * Q: What is the difference between I_WILL_FREE and I_FREEING?
 */
/*
 * Names with a leading double underscore (__I_NEW, __I_SYNC, __I_DIO_WAKEUP)
 * are bit numbers; the matching I_* mask is derived from them.
 */
#define I_DIRTY_SYNC                (1 << 0)
#define I_DIRTY_DATASYNC        (1 << 1)
#define I_DIRTY_PAGES                (1 << 2)
#define __I_NEW                        3
#define I_NEW                        (1 << __I_NEW)
#define I_WILL_FREE                (1 << 4)
#define I_FREEING                (1 << 5)
#define I_CLEAR                        (1 << 6)
#define __I_SYNC                7
#define I_SYNC                        (1 << __I_SYNC)
#define I_REFERENCED                (1 << 8)
#define __I_DIO_WAKEUP                9
#define I_DIO_WAKEUP                (1 << __I_DIO_WAKEUP)
#define I_LINKABLE                (1 << 10)
#define I_DIRTY_TIME                (1 << 11)
/* Bit 12 is not assigned in this list. */
#define I_WB_SWITCH                (1 << 13)
#define I_OVL_INUSE                (1 << 14)
#define I_CREATING                (1 << 15)
#define I_DONTCACHE                (1 << 16)
#define I_SYNC_QUEUED                (1 << 17)
#define I_PINNING_NETFS_WB        (1 << 18)

/* Composite masks: inode metadata only, metadata + pages, and all of those + timestamps. */
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)

/* Defined elsewhere; the int argument takes I_DIRTY_* bits (see the callers below). */
extern void __mark_inode_dirty(struct inode *, int);

/* Mark @inode dirty with the full I_DIRTY mask (SYNC | DATASYNC | PAGES). */
static inline void mark_inode_dirty(struct inode *inode)
{
        __mark_inode_dirty(inode, I_DIRTY);
}

/* Mark @inode dirty with I_DIRTY_SYNC only. */
static inline void mark_inode_dirty_sync(struct inode *inode)
{
        __mark_inode_dirty(inode, I_DIRTY_SYNC);
}

/*
 * Returns true if the given inode itself only has dirty timestamps (its pages
 * may still be dirty) and it isn't currently being allocated or freed.
 *
 * A filesystem that is writing out an inode while lazytime is enabled can
 * call this on other inodes located very nearby on disk (e.g. in the same
 * inode block) to decide whether to opportunistically write their timestamps
 * as well.  Requires i_lock, or at least later re-checking under i_lock.
 */
static inline bool inode_is_dirtytime_only(struct inode *inode)
{
        /* I_DIRTY_TIME must be the only one of these bits that is set. */
        const int relevant = I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE;

        return (inode->i_state & relevant) == I_DIRTY_TIME;
}

/* Link-count primitives; only declared here. */
extern void inc_nlink(struct inode *inode);
extern void drop_nlink(struct inode *inode);
extern void clear_nlink(struct inode *inode);
extern void set_nlink(struct inode *inode, unsigned int nlink);

/* inc_nlink() followed by mark_inode_dirty() on the same inode. */
static inline void inode_inc_link_count(struct inode *inode)
{
        inc_nlink(inode);
        mark_inode_dirty(inode);
}

/* drop_nlink() followed by mark_inode_dirty() on the same inode. */
static inline void inode_dec_link_count(struct inode *inode)
{
        drop_nlink(inode);
        mark_inode_dirty(inode);
}

/* Bit flags selecting which inode timestamp/version fields are meant. */
enum file_time_flags {
        S_ATIME   = 1 << 0,
        S_MTIME   = 1 << 1,
        S_CTIME   = 1 << 2,
        S_VERSION = 1 << 3,
};

/* Access-time helpers; only declared here.  @flags presumably takes enum file_time_flags bits -- confirm. */
extern bool atime_needs_update(const struct path *, struct inode *);
extern void touch_atime(const struct path *);
int inode_update_time(struct inode *inode, int flags);

/* Call touch_atime() on the file's path unless it was opened with O_NOATIME. */
static inline void file_accessed(struct file *file)
{
        if (file->f_flags & O_NOATIME)
                return;

        touch_atime(&file->f_path);
}

/* Modification / metadata-sync entry points; only declared here. */
extern int file_modified(struct file *file);
int kiocb_modified(struct kiocb *iocb);

int sync_inode_metadata(struct inode *inode, int wait);

/*
 * Describes one filesystem type; this is the object passed to
 * register_filesystem() and unregister_filesystem().
 */
struct file_system_type {
        const char *name;
        int fs_flags;        /* presumably a bitmask of the FS_* values below -- confirm */
#define FS_REQUIRES_DEV                1
#define FS_BINARY_MOUNTDATA        2
#define FS_HAS_SUBTYPE                4
#define FS_USERNS_MOUNT                8        /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM        16        /* Disable fanotify permission events */
#define FS_ALLOW_IDMAP         32      /* FS has been updated to handle vfs idmappings. */
#define FS_RENAME_DOES_D_MOVE        32768        /* FS will handle d_move() during rename() internally. */
        int (*init_fs_context)(struct fs_context *);
        const struct fs_parameter_spec *parameters;
        struct dentry *(*mount) (struct file_system_type *, int,
                       const char *, void *);
        void (*kill_sb) (struct super_block *);
        struct module *owner;
        struct file_system_type * next;
        struct hlist_head fs_supers;

        /* Lock class keys for superblock-level locks (one per freeze level for s_writers). */
        struct lock_class_key s_lock_key;
        struct lock_class_key s_umount_key;
        struct lock_class_key s_vfs_rename_key;
        struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];

        /* Lock class keys for inode-level locks. */
        struct lock_class_key i_lock_key;
        struct lock_class_key i_mutex_key;
        struct lock_class_key invalidate_lock_key;
        struct lock_class_key i_mutex_dir_key;
};

#define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)

/*
 * Mount and superblock lifetime helpers.  Only prototypes appear here; the
 * implementations live elsewhere.
 */
extern struct dentry *mount_bdev(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data,
        int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_single(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_nodev(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path);
void retire_super(struct super_block *sb);
void generic_shutdown_super(struct super_block *sb);
void kill_block_super(struct super_block *sb);
void kill_anon_super(struct super_block *sb);
void kill_litter_super(struct super_block *sb);
void deactivate_super(struct super_block *sb);
void deactivate_locked_super(struct super_block *sb);
int set_anon_super(struct super_block *s, void *data);
int set_anon_super_fc(struct super_block *s, struct fs_context *fc);
int get_anon_bdev(dev_t *);
void free_anon_bdev(dev_t);
/* sget*() take caller-supplied test/set callbacks. */
struct super_block *sget_fc(struct fs_context *fc,
                            int (*test)(struct super_block *, struct fs_context *),
                            int (*set)(struct super_block *, struct fs_context *));
struct super_block *sget(struct file_system_type *type,
                        int (*test)(struct super_block *,void *),
                        int (*set)(struct super_block *,void *),
                        int flags, void *data);
struct super_block *sget_dev(struct fs_context *fc, dev_t dev);

/* Alas, no aliases. Too much hassle with bringing module.h everywhere */
/*
 * fops_get() yields @fops when it is non-NULL and try_module_get() on its
 * owner succeeds, otherwise NULL.  fops_put() calls module_put() on the owner
 * when @fops is non-NULL.  Both macros evaluate @fops more than once.
 */
#define fops_get(fops) \
        (((fops) && try_module_get((fops)->owner) ? (fops) : NULL))
#define fops_put(fops) \
        do { if (fops) module_put((fops)->owner); } while(0)
/*
 * This one is to be used *ONLY* from ->open() instances.
 * fops must be non-NULL, pinned down *and* module dependencies
 * should be sufficient to pin the caller down as well.
 *
 * @f is evaluated once (captured in __file); the old f_op is released with
 * fops_put() before the new one is stored, and a NULL @fops trips BUG_ON().
 */
#define replace_fops(f, fops) \
        do {        \
                struct file *__file = (f); \
                fops_put(__file->f_op); \
                BUG_ON(!(__file->f_op = (fops))); \
        } while(0)

/* Filesystem registration, statfs, freeze/thaw and bdi setup; only declared here. */
extern int register_filesystem(struct file_system_type *);
extern int unregister_filesystem(struct file_system_type *);
extern int vfs_statfs(const struct path *, struct kstatfs *);
extern int user_statfs(const char __user *, struct kstatfs *);
extern int fd_statfs(int, struct kstatfs *);
int freeze_super(struct super_block *super, enum freeze_holder who);
int thaw_super(struct super_block *super, enum freeze_holder who);
/* @fmt is a printf-style format (argument 2, varargs from 3). */
extern __printf(2, 3)
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
extern int super_setup_bdi(struct super_block *sb);

/*
 * Copy @len bytes of @uuid into sb->s_uuid and record the length in
 * sb->s_uuid_len.  A @len larger than the s_uuid field triggers a warning
 * and is clamped to the field size.
 */
static inline void super_set_uuid(struct super_block *sb, const u8 *uuid, unsigned len)
{
        const unsigned int capacity = sizeof(sb->s_uuid);

        if (WARN_ON(len > capacity))
                len = capacity;

        sb->s_uuid_len = len;
        memcpy(&sb->s_uuid, uuid, len);
}

/* set sb sysfs name based on sb->s_bdev */
static inline void super_set_sysfs_name_bdev(struct super_block *sb)
{
        /* "%pg" formats the block device pointer; output is bounded by the buffer size. */
        snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pg", sb->s_bdev);
}

/* set sb sysfs name based on sb->s_uuid */
static inline void super_set_sysfs_name_uuid(struct super_block *sb)
{
        /* Warn (but continue) if the stored UUID length is not the full field size. */
        WARN_ON(sb->s_uuid_len != sizeof(sb->s_uuid));
        snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pU", sb->s_uuid.b);
}

/* set sb sysfs name based on sb->s_id */
static inline void super_set_sysfs_name_id(struct super_block *sb)
{
        strscpy(sb->s_sysfs_name, sb->s_id, sizeof(sb->s_sysfs_name));
}

/* try to use something standard before you use this */
/* Formats @fmt and its varargs into sb->s_sysfs_name, bounded by the buffer size. */
__printf(2, 3)
static inline void super_set_sysfs_name_generic(struct super_block *sb, const char *fmt, ...)
{
        va_list args;

        va_start(args, fmt);
        vsnprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), fmt, args);
        va_end(args);
}

extern int current_umask(void);

/* Inode reference and timestamp helpers; only declared here. */
extern void ihold(struct inode * inode);
extern void iput(struct inode *);
int inode_update_timestamps(struct inode *inode, int flags);
int generic_update_time(struct inode *, int);

/* /sys/fs */
extern struct kobject *fs_kobj;

/* INT_MAX with the bits outside PAGE_MASK cleared (presumably the largest page-aligned int -- confirm). */
#define MAX_RW_COUNT (INT_MAX & PAGE_MASK)

/* fs/open.c */
struct audit_names;
/* A pathname object with a reference count and a trailing inline buffer. */
struct filename {
        const char                *name;        /* pointer to actual string */
        const __user char        *uptr;        /* original userland pointer */
        atomic_t                refcnt;
        struct audit_names        *aname;
        const char                iname[];        /* flexible array member */
};
/* iname[] must start at an offset that is a multiple of sizeof(long). */
static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);

/* Return the idmap of the mount that @file was opened through (file->f_path.mnt). */
static inline struct mnt_idmap *file_mnt_idmap(const struct file *file)
{
        return mnt_idmap(file->f_path.mnt);
}

/**
 * is_idmapped_mnt - check whether a mount is mapped
 * @mnt: the mount to check
 *
 * If @mnt has an non @nop_mnt_idmap attached to it then @mnt is mapped.
 *
 * Return: true if mount is mapped, false if not.
 */
static inline bool is_idmapped_mnt(const struct vfsmount *mnt)
{
        return mnt_idmap(mnt) != &nop_mnt_idmap;
}

/* Truncate, fallocate and open entry points; only declared here. */
extern long vfs_truncate(const struct path *, loff_t);
int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start,
                unsigned int time_attrs, struct file *filp);
extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
                        loff_t len);
extern long do_sys_open(int dfd, const char __user *filename, int flags,
                        umode_t mode);
extern struct file *file_open_name(struct filename *, int, umode_t);
extern struct file *filp_open(const char *, int, umode_t);
extern struct file *file_open_root(const struct path *,
                                   const char *, int, umode_t);
/*
 * Open @name relative to the root of @mnt: builds a path from @mnt and
 * mnt->mnt_root and forwards it to file_open_root().
 */
static inline struct file *file_open_root_mnt(struct vfsmount *mnt,
                                   const char *name, int flags, umode_t mode)
{
        const struct path root = {
                .mnt = mnt,
                .dentry = mnt->mnt_root,
        };

        return file_open_root(&root, name, flags, mode);
}
/* Dentry-based open/create and backing-file path lookup; only declared here. */
struct file *dentry_open(const struct path *path, int flags,
                         const struct cred *creds);
struct file *dentry_create(const struct path *path, int flags, umode_t mode,
                           const struct cred *cred);
struct path *backing_file_user_path(struct file *f);

/*
 * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file
 * stored in ->vm_file is a backing file whose f_inode is on the underlying
 * filesystem.  When the mapped file path and inode number are displayed to
 * user (e.g. via /proc/<pid>/maps), these helpers should be used to get the
 * path and inode number to display to the user, which is the path of the fd
 * that user has requested to map and the inode number that would be returned
 * by fstat() on that same fd.
 */
/*
 * Get the path to display in /proc/<pid>/maps: the backing file's user path
 * when FMODE_BACKING is set in f_mode, otherwise the file's own f_path.
 */
static inline const struct path *file_user_path(struct file *f)
{
        return unlikely(f->f_mode & FMODE_BACKING) ?
                backing_file_user_path(f) : &f->f_path;
}
/*
 * Get the inode whose inode number to display in /proc/<pid>/maps: for a
 * backing file (FMODE_BACKING) that is the inode behind its user path,
 * otherwise file_inode(f).
 */
static inline const struct inode *file_user_inode(struct file *f)
{
        return unlikely(f->f_mode & FMODE_BACKING) ?
                d_inode(backing_file_user_path(f)->dentry) : file_inode(f);
}

/* Open a new struct file on the same path, with the same f_flags and credentials as @file. */
static inline struct file *file_clone_open(struct file *file)
{
        return dentry_open(&file->f_path, file->f_flags, file->f_cred);
}
extern int filp_close(struct file *, fl_owner_t id);

/* struct filename acquisition and release; only declared here. */
extern struct filename *getname_flags(const char __user *, int, int *);
extern struct filename *getname_uflags(const char __user *, int);
extern struct filename *getname(const char __user *);
extern struct filename *getname_kernel(const char *);
extern void putname(struct filename *name);

/* finish_open() takes an optional ->open style callback (finish_open_simple() passes NULL). */
extern int finish_open(struct file *file, struct dentry *dentry,
                        int (*open)(struct inode *, struct file *));
extern int finish_no_open(struct file *file, struct dentry *dentry);

/* Helper for the simple case when original dentry is used */
static inline int finish_open_simple(struct file *file, int error)
{
        if (error)
                return error;

        return finish_open(file, file->f_path.dentry, NULL);
}

/* fs/dcache.c */
extern void __init vfs_caches_init_early(void);
extern void __init vfs_caches_init(void);

extern struct kmem_cache *names_cachep;

/* Allocate (GFP_KERNEL) / free one object from the names_cachep slab cache. */
#define __getname()                kmem_cache_alloc(names_cachep, GFP_KERNEL)
#define __putname(name)                kmem_cache_free(names_cachep, (void *)(name))

extern struct super_block *blockdev_superblock;
static inline bool sb_is_blkdev_sb(struct super_block *sb)
{
        return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock;
}

/* Only declared here; defined elsewhere. */
void emergency_thaw_all(void);
extern int sync_filesystem(struct super_block *);
/* Default file_operations tables for block and character devices (judging by the names). */
extern const struct file_operations def_blk_fops;
extern const struct file_operations def_chr_fops;

/* fs/char_dev.c */
#define CHRDEV_MAJOR_MAX 512
/* Marks the bottom of the first segment of free char majors */
#define CHRDEV_MAJOR_DYN_END 234
/* Marks the top and bottom of the second segment of free char majors */
/* Note: START (511) is numerically above END (384). */
#define CHRDEV_MAJOR_DYN_EXT_START 511
#define CHRDEV_MAJOR_DYN_EXT_END 384

/* Character device region management; only declared here. */
extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
extern int register_chrdev_region(dev_t, unsigned, const char *);
extern int __register_chrdev(unsigned int major, unsigned int baseminor,
                             unsigned int count, const char *name,
                             const struct file_operations *fops);
extern void __unregister_chrdev(unsigned int major, unsigned int baseminor,
                                unsigned int count, const char *name);
extern void unregister_chrdev_region(dev_t, unsigned);
extern void chrdev_show(struct seq_file *,off_t);

/*
 * Register @fops for @major under @name, covering 256 minors starting at
 * minor 0.  Returns whatever __register_chrdev() returns.
 */
static inline int register_chrdev(unsigned int major, const char *name,
                                  const struct file_operations *fops)
{
        return __register_chrdev(major, 0, 256, name, fops);
}

/* Counterpart of register_chrdev(): same minor range (0, count 256). */
static inline void unregister_chrdev(unsigned int major, const char *name)
{
        __unregister_chrdev(major, 0, 256, name);
}

extern void init_special_inode(struct inode *, umode_t, dev_t);

/* Invalid inode operations -- fs/bad_inode.c */
extern void make_bad_inode(struct inode *);
extern bool is_bad_inode(struct inode *);

/* The return values of these must not be ignored (__must_check). */
extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
                                                loff_t lend);
extern int __must_check file_check_and_advance_wb_err(struct file *file);
extern int __must_check file_write_and_wait_range(struct file *file,
                                                loff_t start, loff_t end);

/* file_write_and_wait_range() over the whole file: byte range 0 .. LLONG_MAX. */
static inline int file_write_and_wait(struct file *file)
{
        return file_write_and_wait_range(file, 0, LLONG_MAX);
}

/* fsync entry points; only declared here. */
extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
                           int datasync);
extern int vfs_fsync(struct file *file, int datasync);

extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
                                unsigned int flags);

static inline bool iocb_is_dsync(const struct kiocb *iocb)
{
        return (iocb->ki_flags & IOCB_DSYNC) ||
                IS_SYNC(iocb->ki_filp->f_mapping->host);
}

/*
 * Sync the bytes just written if this was a synchronous write.  ki_pos is
 * expected to have been advanced past the write already, so the range synced
 * is [ki_pos - count, ki_pos - 1].  Returns @count on success (or when no
 * sync is needed), otherwise the error from vfs_fsync_range().
 */
static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
{
        int datasync;
        int err;

        if (!iocb_is_dsync(iocb))
                return count;

        /* Without IOCB_SYNC only a data sync is requested. */
        datasync = !(iocb->ki_flags & IOCB_SYNC);
        err = vfs_fsync_range(iocb->ki_filp, iocb->ki_pos - count,
                              iocb->ki_pos - 1, datasync);

        return err ? err : count;
}

extern void emergency_sync(void);
extern void emergency_remount(void);

/* Without CONFIG_BLOCK, bmap() is a stub that always fails with -EINVAL. */
#ifdef CONFIG_BLOCK
extern int bmap(struct inode *inode, sector_t *block);
#else
static inline int bmap(struct inode *inode,  sector_t *block)
{
        return -EINVAL;
}
#endif

/* Attribute-change and permission checks; only declared here. */
int notify_change(struct mnt_idmap *, struct dentry *,
                  struct iattr *, struct inode **);
int inode_permission(struct mnt_idmap *, struct inode *, int);
int generic_permission(struct mnt_idmap *, struct inode *, int);
/* inode_permission() on the file's inode, using the idmap of the file's mount. */
static inline int file_permission(struct file *file, int mask)
{
        return inode_permission(file_mnt_idmap(file),
                                file_inode(file), mask);
}
/* inode_permission() on the inode behind path->dentry, using the idmap of path->mnt. */
static inline int path_permission(const struct path *path, int mask)
{
        return inode_permission(mnt_idmap(path->mnt),
                                d_inode(path->dentry), mask);
}
int __check_sticky(struct mnt_idmap *idmap, struct inode *dir,
                   struct inode *inode);

static inline bool execute_ok(struct inode *inode)
{
        return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode);
}

/* True if the file-type bits (S_IFMT) of the inode's i_mode differ from those of @mode. */
static inline bool inode_wrong_type(const struct inode *inode, umode_t mode)
{
        return (inode->i_mode & S_IFMT) != (mode & S_IFMT);
}

/**
 * file_start_write - get write access to a superblock for regular file io
 * @file: the file we want to write to
 *
 * This is a variant of sb_start_write() which is a noop on non-regualr file.
 * Should be matched with a call to file_end_write().
 */
static inline void file_start_write(struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return;
        sb_start_write(file_inode(file)->i_sb);
}

static inline bool file_start_write_trylock(struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return true;
        return sb_start_write_trylock(file_inode(file)->i_sb);
}

/**
 * file_end_write - drop write access to a superblock of a regular file
 * @file: the file we wrote to
 *
 * Should be matched with a call to file_start_write().
 */
static inline void file_end_write(struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return;
        sb_end_write(file_inode(file)->i_sb);
}

/**
 * kiocb_start_write - get write access to a superblock for async file io
 * @iocb: the io context we want to submit the write with
 *
 * This is a variant of sb_start_write() for async io submission.
 * Unlike file_start_write() it does not check for a regular file.
 * Should be matched with a call to kiocb_end_write().
 */
static inline void kiocb_start_write(struct kiocb *iocb)
{
        struct inode *inode = file_inode(iocb->ki_filp);

        sb_start_write(inode->i_sb);
        /*
         * Fool lockdep by telling it the lock got released so that it
         * doesn't complain about the held lock when we return to userspace.
         */
        __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
}

/**
 * kiocb_end_write - drop write access to a superblock after async file io
 * @iocb: the io context we submitted the write with
 *
 * Should be matched with a call to kiocb_start_write().
 */
static inline void kiocb_end_write(struct kiocb *iocb)
{
        struct inode *inode = file_inode(iocb->ki_filp);

        /*
         * Tell lockdep we inherited freeze protection from submission thread.
         */
        __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
        sb_end_write(inode->i_sb);
}

/*
 * This is used for regular files where some users -- especially the
 * currently executed binary in a process, previously handled via
 * VM_DENYWRITE -- cannot handle concurrent write (and maybe mmap
 * read-write shared) accesses.
 *
 * get_write_access() gets write permission for a file.
 * put_write_access() releases this write permission.
 * deny_write_access() denies write access to a file.
 * allow_write_access() re-enables write access to a file.
 *
 * The i_writecount field of an inode can have the following values:
 * 0: no write access, no denied write access
 * < 0: (-i_writecount) users that denied write access to the file.
 * > 0: (i_writecount) users that have write access to the file.
 *
 * Normally we operate on that counter with atomic_{inc,dec} and it's safe
 * except for the cases where we don't hold i_writecount yet. Then we need to
 * use {get,deny}_write_access() - these functions check the sign and refuse
 * to do the change if sign is wrong.
 */
/*
 * Take a write reference on @inode: increments i_writecount unless it is
 * negative.  Returns 0 on success, -ETXTBSY if the increment was refused.
 */
static inline int get_write_access(struct inode *inode)
{
        if (!atomic_inc_unless_negative(&inode->i_writecount))
                return -ETXTBSY;

        return 0;
}
/*
 * Deny write access to the inode behind @file: decrements i_writecount unless
 * it is positive.  Returns 0 on success, -ETXTBSY if the decrement was refused.
 */
static inline int deny_write_access(struct file *file)
{
        struct inode *inode = file_inode(file);

        if (!atomic_dec_unless_positive(&inode->i_writecount))
                return -ETXTBSY;

        return 0;
}
/* Release a write reference: unconditionally decrements i_writecount. */
static inline void put_write_access(struct inode * inode)
{
        atomic_dec(&inode->i_writecount);
}
/*
 * Undo a write denial: increments i_writecount of the inode behind @file.
 * A NULL @file is accepted and ignored.
 */
static inline void allow_write_access(struct file *file)
{
        if (!file)
                return;

        atomic_inc(&file_inode(file)->i_writecount);
}
/* True if i_writecount is currently positive. */
static inline bool inode_is_open_for_write(const struct inode *inode)
{
        const int writers = atomic_read(&inode->i_writecount);

        return writers > 0;
}

/*
 * i_readcount is only maintained when CONFIG_IMA or CONFIG_FILE_LOCKING is
 * enabled; otherwise both helpers are empty stubs.
 */
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
/* Decrement i_readcount; BUG if the counter drops below zero. */
static inline void i_readcount_dec(struct inode *inode)
{
        int remaining = atomic_dec_return(&inode->i_readcount);

        BUG_ON(remaining < 0);
}
/* Increment i_readcount. */
static inline void i_readcount_inc(struct inode *inode)
{
        atomic_inc(&inode->i_readcount);
}
#else
static inline void i_readcount_dec(struct inode *inode)
{
}
static inline void i_readcount_inc(struct inode *inode)
{
}
#endif
extern int do_pipe_flags(int *, int);

/* In-kernel read/write on a struct file; only declared here. */
extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *);
ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos);
extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *);
extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *);
extern struct file * open_exec(const char *);

/* fs/dcache.c -- generic fs support functions */
extern bool is_subdir(struct dentry *, struct dentry *);
extern bool path_is_under(const struct path *, const struct path *);

extern char *file_path(struct file *, char *, int);

/**
 * is_dot_dotdot - returns true only if @name is "." or ".."
 * @name: file name to check
 * @len: length of file name, in bytes
 *
 * @name does not need to be NUL-terminated; only the first @len bytes (at
 * most two) are inspected.
 */
static inline bool is_dot_dotdot(const char *name, size_t len)
{
        if (!len)
                return false;
        if (!unlikely(name[0] == '.'))
                return false;
        if (len == 1)
                return true;

        return len == 2 && name[1] == '.';
}

#include <linux/err.h>

/* needed for stackable file system support */
extern loff_t default_llseek(struct file *file, loff_t offset, int whence);

extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);

/* Inode initialisation, reference and numbering helpers; only declared here. */
extern int inode_init_always(struct super_block *, struct inode *);
extern void inode_init_once(struct inode *);
extern void address_space_init_once(struct address_space *mapping);
extern struct inode * igrab(struct inode *);
extern ino_t iunique(struct super_block *, ino_t);
extern int inode_needs_sync(struct inode *inode);
extern int generic_delete_inode(struct inode *inode);
/*
 * Returns 1 if the inode has no links left (i_nlink == 0) or is unhashed,
 * 0 otherwise.
 */
static inline int generic_drop_inode(struct inode *inode)
{
        if (!inode->i_nlink)
                return 1;

        return inode_unhashed(inode) != 0;
}
extern void d_mark_dontcache(struct inode *inode);

/*
 * Inode cache lookup/insert primitives; only declared here.  The "5"
 * variants take caller-supplied test/set callbacks plus an opaque @data
 * pointer alongside the hash value.
 */
extern struct inode *ilookup5_nowait(struct super_block *sb,
                unsigned long hashval, int (*test)(struct inode *, void *),
                void *data);
extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data);
extern struct inode *ilookup(struct super_block *sb, unsigned long ino);

extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
                int (*test)(struct inode *, void *),
                int (*set)(struct inode *, void *),
                void *data);
extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
extern struct inode * iget_locked(struct super_block *, unsigned long);
extern struct inode *find_inode_nowait(struct super_block *,
                                       unsigned long,
                                       int (*match)(struct inode *,
                                                    unsigned long, void *),
                                       void *data);
extern struct inode *find_inode_rcu(struct super_block *, unsigned long,
                                    int (*)(struct inode *, void *), void *);
extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long);
extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
extern int insert_inode_locked(struct inode *);
/*
 * lockdep_annotate_inode_mutex_key() is only implemented out of line when
 * CONFIG_DEBUG_LOCK_ALLOC is set; otherwise it is an empty inline stub so
 * callers need no #ifdef of their own.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern void lockdep_annotate_inode_mutex_key(struct inode *inode);
#else
/* No trailing ';' here: it would be an empty declaration at file scope. */
static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }
#endif
extern void unlock_new_inode(struct inode *);
extern void discard_new_inode(struct inode *);
extern unsigned int get_next_ino(void);
extern void evict_inodes(struct super_block *sb);
void dump_mapping(const struct address_space *);

/*
 * Userspace may depend on inode numbers being non-zero; glibc, for example,
 * simply ignores files with a zero i_ino in unlink() and other places.
 *
 * A further complication: userspace built with _FILE_OFFSET_BITS=32 on a
 * 64-bit kernel only reads the lower 32 bits, so those bits are what gets
 * tested here. With _FILE_OFFSET_BITS=64 this may produce some harmless
 * false-negatives, but better safe than sorry.
 */
static inline bool is_zero_ino(ino_t ino)
{
        const u32 low_bits = (u32)ino;

        return !low_bits;
}

extern void __iget(struct inode * inode);
extern void iget_failed(struct inode *);
extern void clear_inode(struct inode *);
extern void __destroy_inode(struct inode *);
extern struct inode *new_inode_pseudo(struct super_block *sb);
extern struct inode *new_inode(struct super_block *sb);
extern void free_inode_nonrcu(struct inode *inode);
extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *);
extern int file_remove_privs_flags(struct file *file, unsigned int flags);
extern int file_remove_privs(struct file *);
int setattr_should_drop_sgid(struct mnt_idmap *idmap,
                             const struct inode *inode);

/*
 * alloc_inode_sb - allocate a filesystem-specific inode object
 * @_sb:    super block whose s_inode_lru list is handed to the allocator
 * @_cache: cache to allocate from
 * @_gfp:   allocation flags
 *
 * This must be used for allocating filesystem-specific inodes to set
 * up the inode reclaim context correctly.
 *
 * @_sb is parenthesized so that an expression argument (for example
 * "sbs + 1") is evaluated as a whole before "->" is applied.
 */
#define alloc_inode_sb(_sb, _cache, _gfp) \
        kmem_cache_alloc_lru(_cache, &(_sb)->s_inode_lru, _gfp)

extern void __insert_inode_hash(struct inode *, unsigned long hashval);
/* Hash @inode using its own inode number as the hash value. */
static inline void insert_inode_hash(struct inode *inode)
{
        unsigned long hashval = inode->i_ino;

        __insert_inode_hash(inode, hashval);
}

extern void __remove_inode_hash(struct inode *);
/*
 * Unhash @inode, unless it is already unhashed or its i_hash node is a
 * "fake" hlist entry, in which case nothing is done.
 */
static inline void remove_inode_hash(struct inode *inode)
{
        if (inode_unhashed(inode) || hlist_fake(&inode->i_hash))
                return;

        __remove_inode_hash(inode);
}

extern void inode_sb_list_add(struct inode *inode);
extern void inode_add_lru(struct inode *inode);

extern int sb_set_blocksize(struct super_block *, int);
extern int sb_min_blocksize(struct super_block *, int);

extern int generic_file_mmap(struct file *, struct vm_area_struct *);
extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
int generic_write_checks_count(struct kiocb *iocb, loff_t *count);
extern int generic_write_check_limits(struct file *file, loff_t pos,
                loff_t *count);
extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to,
                ssize_t already_read);
extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *);
ssize_t generic_perform_write(struct kiocb *, struct iov_iter *);
ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
                ssize_t direct_written, ssize_t buffered_written);

ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
                rwf_t flags);
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
                rwf_t flags);
ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
                           struct iov_iter *iter);
ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
                            struct iov_iter *iter);

/* fs/splice.c */
ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
                            struct pipe_inode_info *pipe,
                            size_t len, unsigned int flags);
ssize_t copy_splice_read(struct file *in, loff_t *ppos,
                         struct pipe_inode_info *pipe,
                         size_t len, unsigned int flags);
extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
                struct file *, loff_t *, size_t, unsigned int);


extern void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
extern loff_t noop_llseek(struct file *file, loff_t offset, int whence);
/*
 * no_llseek is just another spelling of NULL.
 * NOTE(review): presumably kept so existing users that name it as their
 * llseek handler keep compiling - confirm before removing.
 */
#define no_llseek NULL
extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize);
extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
                int whence, loff_t maxsize, loff_t eof);
extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
                int whence, loff_t size);
extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t);
extern loff_t no_seek_end_llseek(struct file *, loff_t, int);
int rw_verify_area(int, struct file *, const loff_t *, size_t);
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int nonseekable_open(struct inode * inode, struct file * filp);
extern int stream_open(struct inode * inode, struct file * filp);

#ifdef CONFIG_BLOCK
/*
 * Function type taking a bio, the inode it belongs to and a file offset;
 * it returns nothing.
 * NOTE(review): presumably a hook for submitting direct I/O bios - confirm
 * against the users of this typedef.
 */
typedef void (dio_submit_t)(struct bio *bio, struct inode *inode,
                            loff_t file_offset);

/* Flag bits for the "flags" argument of __blockdev_direct_IO(). */
enum {
        /* locking is needed between buffered and direct access */
        DIO_LOCKING     = 1 << 0,

        /* the filesystem cannot fill holes */
        DIO_SKIP_HOLES  = 1 << 1,
};

ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
                             struct block_device *bdev, struct iov_iter *iter,
                             get_block_t get_block,
                             dio_iodone_t end_io,
                             int flags);

/*
 * Convenience wrapper around __blockdev_direct_IO(): uses the block device
 * of the inode's super block, no end_io callback, and both DIO_LOCKING and
 * DIO_SKIP_HOLES.
 */
static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
                                         struct inode *inode,
                                         struct iov_iter *iter,
                                         get_block_t get_block)
{
        struct block_device *bdev = inode->i_sb->s_bdev;
        const int dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;

        return __blockdev_direct_IO(iocb, inode, bdev, iter, get_block,
                                    NULL, dio_flags);
}
#endif

void inode_dio_wait(struct inode *inode);

/**
 * inode_dio_begin - signal start of a direct I/O request
 * @inode: inode the direct I/O happens on
 *
 * This is called when a direct I/O request is started. It increments
 * @inode->i_dio_count; inode_dio_end() decrements it again once the
 * request has been processed.
 */
static inline void inode_dio_begin(struct inode *inode)
{
        atomic_inc(&inode->i_dio_count);
}

/**
 * inode_dio_end - signal completion of a direct I/O request
 * @inode: inode the direct I/O happens on
 *
 * Call this once a direct I/O request has been fully processed. It drops
 * @inode->i_dio_count and, when the count reaches zero, wakes up anybody
 * waiting on the __I_DIO_WAKEUP bit of @inode->i_state.
 */
static inline void inode_dio_end(struct inode *inode)
{
        /* Only the request that drops the count to zero issues the wakeup. */
        if (!atomic_dec_and_test(&inode->i_dio_count))
                return;

        wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
}

extern void inode_set_flags(struct inode *inode, unsigned int flags,
                            unsigned int mask);

extern const struct file_operations generic_ro_fops;

/* True when mode @m is a character device, block device, FIFO or socket. */
#define special_file(m) \
        (S_ISCHR(m) || S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m))

extern int readlink_copy(char __user *, int, const char *);
extern int page_readlink(struct dentry *, char __user *, int);
extern const char *page_get_link(struct dentry *, struct inode *,
                                 struct delayed_call *);
extern void page_put_link(void *);
extern int page_symlink(struct inode *inode, const char *symname, int len);
extern const struct inode_operations page_symlink_inode_operations;
extern void kfree_link(void *);
void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
void __inode_add_bytes(struct inode *inode, loff_t bytes);
void inode_add_bytes(struct inode *inode, loff_t bytes);
void __inode_sub_bytes(struct inode *inode, loff_t bytes);
void inode_sub_bytes(struct inode *inode, loff_t bytes);
static inline loff_t __inode_get_bytes(struct inode *inode)
{
        return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes;
}
loff_t inode_get_bytes(struct inode *inode);
void inode_set_bytes(struct inode *inode, loff_t bytes);
const char *simple_get_link(struct dentry *, struct inode *,
                            struct delayed_call *);
extern const struct inode_operations simple_symlink_inode_operations;

extern int iterate_dir(struct file *, struct dir_context *);

int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
                int flags);
int vfs_fstat(int fd, struct kstat *stat);

/* vfs_fstatat() relative to AT_FDCWD with no flags set. */
static inline int vfs_stat(const char __user *filename, struct kstat *stat)
{
        const int no_flags = 0;

        return vfs_fstatat(AT_FDCWD, filename, stat, no_flags);
}
/* vfs_fstatat() relative to AT_FDCWD with AT_SYMLINK_NOFOLLOW set. */
static inline int vfs_lstat(const char __user *name, struct kstat *stat)
{
        const int nofollow = AT_SYMLINK_NOFOLLOW;

        return vfs_fstatat(AT_FDCWD, name, stat, nofollow);
}

extern const char *vfs_get_link(struct dentry *, struct delayed_call *);
extern int vfs_readlink(struct dentry *, char __user *, int);

extern struct file_system_type *get_filesystem(struct file_system_type *fs);
extern void put_filesystem(struct file_system_type *fs);
extern struct file_system_type *get_fs_type(const char *name);
extern void drop_super(struct super_block *sb);
extern void drop_super_exclusive(struct super_block *sb);
extern void iterate_supers(void (*)(struct super_block *, void *), void *);
extern void iterate_supers_type(struct file_system_type *,
                                void (*)(struct super_block *, void *), void *);

extern int dcache_dir_open(struct inode *, struct file *);
extern int dcache_dir_close(struct inode *, struct file *);
extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
extern int dcache_readdir(struct file *, struct dir_context *);
extern int simple_setattr(struct mnt_idmap *, struct dentry *,
                          struct iattr *);
extern int simple_getattr(struct mnt_idmap *, const struct path *,
                          struct kstat *, u32, unsigned int);
extern int simple_statfs(struct dentry *, struct kstatfs *);
extern int simple_open(struct inode *inode, struct file *file);
extern int simple_link(struct dentry *, struct inode *, struct dentry *);
extern int simple_unlink(struct inode *, struct dentry *);
extern int simple_rmdir(struct inode *, struct dentry *);
void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry);
extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
                                  struct inode *new_dir, struct dentry *new_dentry);
extern int simple_rename(struct mnt_idmap *, struct inode *,
                         struct dentry *, struct inode *, struct dentry *,
                         unsigned int);
extern void simple_recursive_removal(struct dentry *,
                              void (*callback)(struct dentry *));
extern int noop_fsync(struct file *, loff_t, loff_t, int);
extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
extern int simple_empty(struct dentry *);
extern int simple_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len,
                        struct page **pagep, void **fsdata);
extern const struct address_space_operations ram_aops;
extern int always_delete_dentry(const struct dentry *);
extern struct inode *alloc_anon_inode(struct super_block *);
extern int simple_nosetlease(struct file *, int, struct file_lease **, void **);
extern const struct dentry_operations simple_dentry_operations;

extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
extern const struct file_operations simple_dir_operations;
extern const struct inode_operations simple_dir_inode_operations;
extern void make_empty_dir_inode(struct inode *inode);
extern bool is_empty_dir_inode(struct inode *inode);
/* One entry of the table handed to simple_fill_super(). */
struct tree_descr {
        const char *name;                       /* entry name */
        const struct file_operations *ops;      /* file operations for it */
        int mode;                               /* mode value for it */
};
struct dentry *d_alloc_name(struct dentry *, const char *);
extern int simple_fill_super(struct super_block *, unsigned long,
                             const struct tree_descr *);
extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count);
extern void simple_release_fs(struct vfsmount **mount, int *count);

extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
                        loff_t *ppos, const void *from, size_t available);
extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
                const void __user *from, size_t count);

/*
 * Context object operated on by the simple_offset_*() helpers
 * (simple_offset_init(), simple_offset_add(), simple_offset_remove(),
 * simple_offset_destroy()).
 */
struct offset_ctx {
        struct maple_tree        mt;        /* maple tree owned by this context */
        unsigned long                next_offset;        /* NOTE(review): presumably the next offset to hand out - confirm */
};

void simple_offset_init(struct offset_ctx *octx);
int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry);
void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry);
int simple_offset_empty(struct dentry *dentry);
int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
                         struct inode *new_dir, struct dentry *new_dentry);
int simple_offset_rename_exchange(struct inode *old_dir,
                                  struct dentry *old_dentry,
                                  struct inode *new_dir,
                                  struct dentry *new_dentry);
void simple_offset_destroy(struct offset_ctx *octx);

extern const struct file_operations simple_offset_dir_operations;

extern int __generic_file_fsync(struct file *, loff_t, loff_t, int);
extern int generic_file_fsync(struct file *, loff_t, loff_t, int);

extern int generic_check_addressable(unsigned, u64);

extern void generic_set_sb_d_ops(struct super_block *sb);

static inline bool sb_has_encoding(const struct super_block *sb)
{
#if IS_ENABLED(CONFIG_UNICODE)
        return !!sb->s_encoding;
#else
        return false;
#endif
}

int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
                unsigned int ia_valid);
int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *);
extern int inode_newsize_ok(const struct inode *, loff_t offset);
void setattr_copy(struct mnt_idmap *, struct inode *inode,
                  const struct iattr *attr);

extern int file_update_time(struct file *file);

static inline bool vma_is_dax(const struct vm_area_struct *vma)
{
        return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
}

/*
 * True when @vma is a DAX mapping of something other than a character
 * device (a character-device inode means device-dax). Always false when
 * CONFIG_FS_DAX is disabled or the vma has no backing file.
 */
static inline bool vma_is_fsdax(struct vm_area_struct *vma)
{
        if (!IS_ENABLED(CONFIG_FS_DAX))
                return false;
        if (!vma->vm_file || !vma_is_dax(vma))
                return false;

        /* character device => device-dax, not filesystem DAX */
        return !S_ISCHR(file_inode(vma->vm_file)->i_mode);
}

/*
 * Translate the open flags of @file into IOCB_* flags:
 * O_APPEND -> IOCB_APPEND, O_DIRECT -> IOCB_DIRECT,
 * O_DSYNC -> IOCB_DSYNC, __O_SYNC -> IOCB_SYNC.
 */
static inline int iocb_flags(struct file *file)
{
        int iocb = 0;

        iocb |= (file->f_flags & O_APPEND) ? IOCB_APPEND : 0;
        iocb |= (file->f_flags & O_DIRECT) ? IOCB_DIRECT : 0;
        iocb |= (file->f_flags & O_DSYNC) ? IOCB_DSYNC : 0;
        iocb |= (file->f_flags & __O_SYNC) ? IOCB_SYNC : 0;

        return iocb;
}

/*
 * kiocb_set_rw_flags - fold per-call RWF_* flags into a kiocb
 * @ki:    kiocb to update
 * @flags: RWF_* flags supplied for this I/O
 *
 * Returns 0 on success (including when @flags is zero, in which case @ki is
 * left untouched), or:
 *   -EOPNOTSUPP  @flags has bits outside RWF_SUPPORTED, or RWF_NOWAIT was
 *                requested on a file without FMODE_NOWAIT;
 *   -EINVAL      both RWF_APPEND and RWF_NOAPPEND were given;
 *   -EPERM       RWF_NOAPPEND was given for an IOCB_APPEND kiocb whose inode
 *                is IS_APPEND().
 * Every error return happens before @ki->ki_flags is modified.
 */
static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
{
        int kiocb_flags = 0;

        /* make sure there's no overlap between RWF and private IOCB flags */
        BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD);

        if (!flags)
                return 0;
        if (unlikely(flags & ~RWF_SUPPORTED))
                return -EOPNOTSUPP;
        if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND)))
                return -EINVAL;

        if (flags & RWF_NOWAIT) {
                if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
                        return -EOPNOTSUPP;
                /* RWF_NOWAIT additionally sets IOCB_NOIO */
                kiocb_flags |= IOCB_NOIO;
        }
        /* the supported RWF_* bits are copied into the kiocb flags as-is */
        kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
        /* RWF_SYNC additionally sets IOCB_DSYNC */
        if (flags & RWF_SYNC)
                kiocb_flags |= IOCB_DSYNC;

        /* RWF_NOAPPEND clears IOCB_APPEND unless the inode is append-only */
        if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) {
                if (IS_APPEND(file_inode(ki->ki_filp)))
                        return -EPERM;
                ki->ki_flags &= ~IOCB_APPEND;
        }

        ki->ki_flags |= kiocb_flags;
        return 0;
}

/* Return the inode number of @dentry's parent, sampled under d_lock. */
static inline ino_t parent_ino(struct dentry *dentry)
{
        ino_t ino;

        /*
         * d_lock may not be strictly required: if the parent's inode
         * number could change under us, the caller would presumably
         * already have a deeper race.
         */
        spin_lock(&dentry->d_lock);
        ino = dentry->d_parent->d_inode->i_ino;
        spin_unlock(&dentry->d_lock);

        return ino;
}

/* Transaction based IO helpers */

/*
 * An argresp lives in an allocated page. It records the size of the
 * argument or response, followed directly by the content itself.
 */
struct simple_transaction_argresp {
        ssize_t size;   /* size of the argument or response */
        char data[];    /* the content, stored right after the size */
};

/* Bytes left in one page after the struct simple_transaction_argresp header. */
#define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp))

char *simple_transaction_get(struct file *file, const char __user *buf,
                                size_t size);
ssize_t simple_transaction_read(struct file *file, char __user *buf,
                                size_t size, loff_t *pos);
int simple_transaction_release(struct inode *inode, struct file *file);

void simple_transaction_set(struct file *file, size_t n);

/*
 * simple attribute files
 *
 * These attributes behave similarly to those in sysfs:
 *
 * Writing to an attribute immediately sets a value; an open file can be
 * written to multiple times.
 *
 * Reading from an attribute creates a buffer from the value that might get
 * read with multiple read calls. When the attribute has been read
 * completely, no further read calls are possible until the file is opened
 * again.
 *
 * All attributes contain a text representation of a numeric value
 * that is accessed with the get() and set() functions.
 */

/*
 * DEFINE_SIMPLE_ATTRIBUTE_XSIGNED - define file_operations for one attribute
 * @__fops:      name of the static const struct file_operations to define;
 *               a helper named <__fops>_open is defined alongside it
 * @__get:       getter handed to simple_attr_open()
 * @__set:       setter handed to simple_attr_open()
 * @__fmt:       format string handed to simple_attr_open(); it is also
 *               passed with a 0ull argument to __simple_attr_check_format()
 *               so the compiler can check it
 * @__is_signed: selects simple_attr_write_signed (true) or
 *               simple_attr_write (false) as the .write handler
 *
 * The expansion ends without a ';', so the user supplies it.
 */
#define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed)        \
static int __fops ## _open(struct inode *inode, struct file *file)        \
{                                                                        \
        __simple_attr_check_format(__fmt, 0ull);                        \
        return simple_attr_open(inode, file, __get, __set, __fmt);        \
}                                                                        \
static const struct file_operations __fops = {                                \
        .owner         = THIS_MODULE,                                                \
        .open         = __fops ## _open,                                        \
        .release = simple_attr_release,                                        \
        .read         = simple_attr_read,                                        \
        .write         = (__is_signed) ? simple_attr_write_signed : simple_attr_write,        \
        .llseek         = generic_file_llseek,                                        \
}

/* Same as DEFINE_SIMPLE_ATTRIBUTE_XSIGNED() with __is_signed = false. */
#define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt)                \
        DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false)

/* Same as DEFINE_SIMPLE_ATTRIBUTE_XSIGNED() with __is_signed = true. */
#define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt)        \
        DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true)

/*
 * Compile-time helper only: the __printf(1, 2) annotation makes the
 * compiler check @fmt against the arguments that follow it.
 */
static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...)
{
        /* deliberately empty - the argument check happens at compile time */
}

int simple_attr_open(struct inode *inode, struct file *file,
                     int (*get)(void *, u64 *), int (*set)(void *, u64),
                     const char *fmt);
int simple_attr_release(struct inode *inode, struct file *file);
ssize_t simple_attr_read(struct file *file, char __user *buf,
                         size_t len, loff_t *ppos);
ssize_t simple_attr_write(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos);
ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
                                 size_t len, loff_t *ppos);

struct ctl_table;
int __init list_bdev_fs_names(char *buf, size_t size);

/* FMODE_EXEC / FMODE_NONOTIFY as plain ints, for mixing with open flags. */
#define __FMODE_EXEC                ((__force int) FMODE_EXEC)
#define __FMODE_NONOTIFY        ((__force int) FMODE_NONOTIFY)

/*
 * ACC_MODE - look up a value by the access-mode bits of @x.
 * The two O_ACCMODE bits index the 4-entry table {4, 2, 6, 6}.
 */
#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])

/*
 * OPEN_FMODE - derive an fmode_t from open flags.
 * Takes the access-mode bits of (@flag + 1) and keeps __FMODE_NONOTIFY if
 * it is set in @flag.
 *
 * @flag is parenthesized at every use so that an expression argument such
 * as "a | b" is treated as a single value; '+' and '&' would otherwise bind
 * tighter than the '|' inside the argument.
 */
#define OPEN_FMODE(flag) ((__force fmode_t)((((flag) + 1) & O_ACCMODE) | \
                                            ((flag) & __FMODE_NONOTIFY)))

/* True when @mode has S_ISUID or S_ISGID set. */
static inline bool is_sxid(umode_t mode)
{
        const umode_t sxid_bits = S_ISUID | S_ISGID;

        return (mode & sxid_bits) != 0;
}

/*
 * Returns 0 straight away when @dir does not have S_ISVTX set; otherwise
 * returns whatever __check_sticky() decides.
 */
static inline int check_sticky(struct mnt_idmap *idmap,
                               struct inode *dir, struct inode *inode)
{
        if (dir->i_mode & S_ISVTX)
                return __check_sticky(idmap, dir, inode);

        return 0;
}

/*
 * Set S_NOSEC on @inode, but only when its mode has neither S_ISUID nor
 * S_ISGID and its super block has SB_NOSEC set.
 */
static inline void inode_has_no_xattr(struct inode *inode)
{
        if (is_sxid(inode->i_mode))
                return;
        if (!(inode->i_sb->s_flags & SB_NOSEC))
                return;

        inode->i_flags |= S_NOSEC;
}

/* True when @inode is the inode of its super block's root dentry. */
static inline bool is_root_inode(struct inode *inode)
{
        const struct inode *root = inode->i_sb->s_root->d_inode;

        return root == inode;
}

/*
 * Pass one directory entry to the context's actor at the current position
 * (ctx->pos) and return the actor's verdict.
 */
static inline bool dir_emit(struct dir_context *ctx,
                            const char *name, int namelen,
                            u64 ino, unsigned type)
{
        bool verdict;

        verdict = ctx->actor(ctx, name, namelen, ctx->pos, ino, type);
        return verdict;
}
static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx)
{
        return ctx->actor(ctx, ".", 1, ctx->pos,
                          file->f_path.dentry->d_inode->i_ino, DT_DIR);
}
/* Emit the ".." entry, using parent_ino() of @file's dentry. */
static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx)
{
        struct dentry *self = file->f_path.dentry;

        return ctx->actor(ctx, "..", 2, ctx->pos, parent_ino(self), DT_DIR);
}
/*
 * Emit "." when ctx->pos is 0 and ".." when ctx->pos is 1, advancing
 * ctx->pos after each successful emit. Returns false as soon as an emit
 * is refused (ctx->pos then stays where it was), true otherwise.
 */
static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
{
        if (ctx->pos == 0) {
                if (!dir_emit_dot(file, ctx))
                        return false;
                ctx->pos = 1;
        }

        /* anything other than position 1 means ".." is not due now */
        if (ctx->pos != 1)
                return true;

        if (!dir_emit_dotdot(file, ctx))
                return false;
        ctx->pos = 2;

        return true;
}
static inline bool dir_relax(struct inode *inode)
{
        inode_unlock(inode);
        inode_lock(inode);
        return !IS_DEADDIR(inode);
}

static inline bool dir_relax_shared(struct inode *inode)
{
        inode_unlock_shared(inode);
        inode_lock_shared(inode);
        return !IS_DEADDIR(inode);
}

extern bool path_noexec(const struct path *path);
extern void inode_nohighmem(struct inode *inode);

/* mm/fadvise.c */
extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
                       int advice);
extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
                           int advice);

#endif /* _LINUX_FS_H */


































































































































    1 





    1 




























































































































































































    1 



    1 























































































































    2 





    1 



    2 







    1 
    1 


    2 


















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
// SPDX-License-Identifier: GPL-2.0-only
/*
 * TCP CUBIC: Binary Increase Congestion control for TCP v2.3
 * Home page:
 *      http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
 * This is from the implementation of CUBIC TCP in
 * Sangtae Ha, Injong Rhee and Lisong Xu,
 *  "CUBIC: A New TCP-Friendly High-Speed TCP Variant"
 *  in ACM SIGOPS Operating System Review, July 2008.
 * Available from:
 *  http://netsrv.csc.ncsu.edu/export/cubic_a_new_tcp_2008.pdf
 *
 * CUBIC integrates a new slow start algorithm, called HyStart.
 * The details of HyStart are presented in
 *  Sangtae Ha and Injong Rhee,
 *  "Taming the Elephants: New TCP Slow Start", NCSU TechReport 2008.
 * Available from:
 *  http://netsrv.csc.ncsu.edu/export/hystart_techreport_2008.pdf
 *
 * All testing results are available from:
 * http://netsrv.csc.ncsu.edu/wiki/index.php/TCP_Testing
 *
 * Unless CUBIC is enabled and congestion window is large
 * this behaves the same as the original Reno.
 */

#include <linux/mm.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/module.h>
#include <linux/math64.h>
#include <net/tcp.h>

#define BICTCP_BETA_SCALE    1024        /* Scale factor for the beta calculation:
                                         * max_cwnd = snd_cwnd * beta
                                         */
#define        BICTCP_HZ                10        /* BIC HZ 2^10 = 1024 */

/* Two methods of hybrid slow start (bit flags, may be OR-ed together) */
#define HYSTART_ACK_TRAIN        0x1
#define HYSTART_DELAY                0x2

/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES        8
#define HYSTART_DELAY_MIN        (4000U)        /* 4 ms */
#define HYSTART_DELAY_MAX        (16000U)        /* 16 ms */
/* Clamp x into [HYSTART_DELAY_MIN, HYSTART_DELAY_MAX] */
#define HYSTART_DELAY_THRESH(x)        clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)

/* Tunables; each one is exported as a module parameter below. */
static int fast_convergence __read_mostly = 1;
static int beta __read_mostly = 717;        /* = 717/1024 (BICTCP_BETA_SCALE) */
static int initial_ssthresh __read_mostly;
static int bic_scale __read_mostly = 41;
static int tcp_friendliness __read_mostly = 1;

/* HyStart tunables; by default both detection methods are enabled. */
static int hystart __read_mostly = 1;
static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window __read_mostly = 16;
static int hystart_ack_delta_us __read_mostly = 2000;

/*
 * Not module parameters.
 * NOTE(review): presumably the precomputed scale factors mentioned in the
 * comment below, filled in elsewhere in this file - confirm.
 */
static u32 cube_rtt_scale __read_mostly;
static u32 beta_scale __read_mostly;
static u64 cube_factor __read_mostly;

/*
 * Note: parameters that are used for precomputing scale factors are
 * read-only (bic_scale is 0444; the others are 0644).
 * NOTE(review): beta is registered 0644 - confirm whether it also feeds a
 * precomputed factor. Its description string says "increase" although the
 * BICTCP_BETA_SCALE comment describes beta as a factor applied to snd_cwnd;
 * confirm the intended wording.
 */
module_param(fast_convergence, int, 0644);
MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
module_param(beta, int, 0644);
MODULE_PARM_DESC(beta, "beta for multiplicative increase");
module_param(initial_ssthresh, int, 0644);
MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
module_param(bic_scale, int, 0444);
MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
module_param(tcp_friendliness, int, 0644);
MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
module_param(hystart, int, 0644);
MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm");
module_param(hystart_detect, int, 0644);
MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
                 " 1: packet-train 2: delay 3: both packet-train and delay");
module_param(hystart_low_window, int, 0644);
MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
module_param(hystart_ack_delta_us, int, 0644);
MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)");

/*
 * BIC TCP Parameters
 *
 * Per-socket congestion control state. bictcp_reset() zeroes every field
 * that precedes 'unused' and then clears 'found'; the remaining fields
 * after 'unused' are (re)initialised by bictcp_hystart_reset().
 */
struct bictcp {
        u32        cnt;                /* increase cwnd by 1 after ACKs */
        u32        last_max_cwnd;        /* last maximum snd_cwnd */
        u32        last_cwnd;        /* the last snd_cwnd */
        u32        last_time;        /* time when updated last_cwnd */
        u32        bic_origin_point;/* origin point of bic function */
        u32        bic_K;                /* time to origin point
                                   from the beginning of the current epoch */
        u32        delay_min;        /* min delay (usec) */
        u32        epoch_start;        /* beginning of an epoch */
        u32        ack_cnt;        /* number of acks */
        u32        tcp_cwnd;        /* estimated tcp cwnd */
        u16        unused;                /* marks the end of the area zeroed by bictcp_reset() */
        u8        sample_cnt;        /* number of samples to decide curr_rtt */
        u8        found;                /* the exit point is found? */
        u32        round_start;        /* beginning of each round */
        u32        end_seq;        /* end_seq of the round */
        u32        last_ack;        /* last time when the ACK spacing is close */
        u32        curr_rtt;        /* the minimum rtt of current round */
};

/*
 * Zero every field of @ca that lies before 'unused' and clear 'found'.
 * The other fields located after 'unused' are left as they are.
 */
static inline void bictcp_reset(struct bictcp *ca)
{
        const size_t zeroed_len = offsetof(struct bictcp, unused);

        /* 'found' lies past the zeroed prefix, so it is cleared by hand */
        ca->found = 0;
        memset(ca, 0, zeroed_len);
}

static inline u32 bictcp_clock_us(const struct sock *sk)
{
        return tcp_sk(sk)->tcp_mstamp;
}

static inline void bictcp_hystart_reset(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct bictcp *ca = inet_csk_ca(sk);

        ca->round_start = ca->last_ack = bictcp_clock_us(sk);
        ca->end_seq = tp->snd_nxt;
        ca->curr_rtt = ~0U;
        ca->sample_cnt = 0;
}

/* Initialise the private CUBIC state of @sk.
 *
 * With hystart enabled the HyStart round state is initialised as well.
 * Otherwise, a non-zero initial_ssthresh is installed as snd_ssthresh.
 */
__bpf_kfunc static void cubictcp_init(struct sock *sk)
{
        bictcp_reset(inet_csk_ca(sk));

        if (hystart) {
                bictcp_hystart_reset(sk);
                return;
        }

        /* hystart disabled: optionally seed ssthresh from the module parameter */
        if (initial_ssthresh)
                tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}

/* Congestion-control event hook; only CA_EVENT_TX_START is acted upon. */
__bpf_kfunc static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
        struct bictcp *ca;
        s32 idle;
        u32 now;

        if (event != CA_EVENT_TX_START)
                return;

        ca = inet_csk_ca(sk);
        now = tcp_jiffies32;
        idle = now - tcp_sk(sk)->lsndtime;

        /* No epoch running, or no time has passed since the last send. */
        if (!ca->epoch_start || idle <= 0)
                return;

        /* We were application limited (idle) for a while.
         * Shift epoch_start by the idle time to keep cwnd growth on the
         * cubic curve, but never move it past the current time.
         */
        ca->epoch_start += idle;
        if (after(ca->epoch_start, now))
                ca->epoch_start = now;
}

/* Approximate the cubic root of @a: a table lookup provides the initial
 * estimate, which is then refined by a single Newton-Raphson step.
 * Avg err ~= 0.195%
 */
static u32 cubic_root(u64 a)
{
        /*
         * cbrt(x) MSB values for x MSB values in [0..63].
         * Precomputed then refined by hand - Willy Tarreau
         *
         * For x in [0..63],
         *   v = cbrt(x << 18) - 1
         *   cbrt(x) = (v[x] + 10) >> 6
         */
        static const u8 v[] = {
                /* 0x00 */    0,   54,   54,   54,  118,  118,  118,  118,
                /* 0x08 */  123,  129,  134,  138,  143,  147,  151,  156,
                /* 0x10 */  157,  161,  164,  168,  170,  173,  176,  179,
                /* 0x18 */  181,  185,  187,  190,  192,  194,  197,  199,
                /* 0x20 */  200,  202,  204,  206,  209,  211,  213,  215,
                /* 0x28 */  217,  219,  221,  222,  224,  225,  227,  229,
                /* 0x30 */  231,  232,  234,  236,  237,  239,  240,  242,
                /* 0x38 */  244,  245,  246,  248,  250,  251,  252,  254,
        };
        u32 msb, scale, idx, root;
        u64 denom;

        msb = fls64(a);
        if (msb < 7)        /* a in [0..63]: the table alone is enough */
                return ((u32)v[(u32)a] + 35) >> 6;

        /* Reduce a to a table index and remember the scaling exponent. */
        scale = ((msb * 84) >> 8) - 1;
        idx = (a >> (scale * 3));

        root = ((u32)(((u32)v[idx] + 10) << scale)) >> 6;

        /*
         * Newton-Raphson iteration
         *                         2
         * x    = ( 2 * x  +  a / x  ) / 3
         *  k+1          k         k
         *
         * The code divides by x * (x - 1) and replaces the division by 3
         * with a multiplication by 341 followed by a shift of 10.
         */
        denom = (u64)root * (u64)(root - 1);
        root = 2 * root + (u32)div64_u64(a, denom);

        return (root * 341) >> 10;
}

/*
 * Compute congestion window to use.
 *
 * Recomputes ca->cnt from the cubic function and, when tcp_friendliness is
 * set, caps it using the estimated window ca->tcp_cwnd.  The caller,
 * cubictcp_cong_avoid(), passes the resulting ca->cnt on to
 * tcp_cong_avoid_ai().
 *
 * @ca:    per-connection CUBIC state
 * @cwnd:  current congestion window supplied by the caller
 * @acked: amount added to ca->ack_cnt for this call
 */
static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
{
        u32 delta, bic_target, max_cnt;
        u64 offs, t;

        ca->ack_cnt += acked;        /* count the number of ACKed packets */

        /* cwnd is unchanged and at most HZ/32 jiffies have elapsed since the
         * last update: keep the previously computed ca->cnt.
         */
        if (ca->last_cwnd == cwnd &&
            (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)
                return;

        /* The CUBIC function can update ca->cnt at most once per jiffy.
         * On all cwnd reduction events, ca->epoch_start is set to 0,
         * which will force a recalculation of ca->cnt.
         */
        if (ca->epoch_start && tcp_jiffies32 == ca->last_time)
                goto tcp_friendliness;

        ca->last_cwnd = cwnd;
        ca->last_time = tcp_jiffies32;

        /* epoch_start == 0 means no epoch is running: start a new one */
        if (ca->epoch_start == 0) {
                ca->epoch_start = tcp_jiffies32;        /* record beginning */
                ca->ack_cnt = acked;                        /* start counting */
                ca->tcp_cwnd = cwnd;                        /* sync with cubic */

                if (ca->last_max_cwnd <= cwnd) {
                        /* already at or above the previous maximum: the
                         * origin is the current cwnd and K is zero
                         */
                        ca->bic_K = 0;
                        ca->bic_origin_point = cwnd;
                } else {
                        /* Compute new K based on
                         * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
                         */
                        ca->bic_K = cubic_root(cube_factor
                                               * (ca->last_max_cwnd - cwnd));
                        ca->bic_origin_point = ca->last_max_cwnd;
                }
        }

        /* cubic function - calc*/
        /* calculate c * time^3 / rtt,
         *  while considering overflow in calculation of time^3
         * (so time^3 is done by using 64 bit)
         * and without the support of division of 64bit numbers
         * (so all divisions are done by using 32 bit)
         *  also NOTE the unit of those variables
         *          time  = (t - K) / 2^bictcp_HZ
         *          c = bic_scale >> 10
         * rtt  = (srtt >> 3) / HZ
         * !!! The following code does not have overflow problems,
         * if the cwnd < 1 million packets !!!
         */

        /* jiffies elapsed in this epoch, plus delay_min converted to jiffies */
        t = (s32)(tcp_jiffies32 - ca->epoch_start);
        t += usecs_to_jiffies(ca->delay_min);
        /* change the unit from HZ to bictcp_HZ */
        t <<= BICTCP_HZ;
        do_div(t, HZ);

        /* offs = |t - K|; the sign is re-applied below when forming bic_target */
        if (t < ca->bic_K)                /* t - K */
                offs = ca->bic_K - t;
        else
                offs = t - ca->bic_K;

        /* c/rtt * (t-K)^3 */
        delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
        if (t < ca->bic_K)                            /* below origin*/
                bic_target = ca->bic_origin_point - delta;
        else                                          /* above origin*/
                bic_target = ca->bic_origin_point + delta;

        /* cubic function - calc bictcp_cnt*/
        if (bic_target > cwnd) {
                /* the larger the gap to the target, the smaller cnt becomes */
                ca->cnt = cwnd / (bic_target - cwnd);
        } else {
                ca->cnt = 100 * cwnd;              /* very small increment*/
        }

        /*
         * The initial growth of cubic function may be too conservative
         * when the available bandwidth is still unknown.
         */
        if (ca->last_max_cwnd == 0 && ca->cnt > 20)
                ca->cnt = 20;        /* increase cwnd 5% per RTT */

tcp_friendliness:
        /* TCP Friendly */
        if (tcp_friendliness) {
                u32 scale = beta_scale;

                /* beta_scale carries a factor of 8 (see cubictcp_register()),
                 * which the shift by 3 removes again
                 */
                delta = (cwnd * scale) >> 3;
                while (ca->ack_cnt > delta) {                /* update tcp cwnd */
                        ca->ack_cnt -= delta;
                        ca->tcp_cwnd++;
                }

                if (ca->tcp_cwnd > cwnd) {        /* if bic is slower than tcp */
                        delta = ca->tcp_cwnd - cwnd;
                        max_cnt = cwnd / delta;
                        if (ca->cnt > max_cnt)
                                ca->cnt = max_cnt;
                }
        }

        /* The maximum rate of cwnd increase CUBIC allows is 1 packet per
         * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
         */
        ca->cnt = max(ca->cnt, 2U);
}

/* Congestion-avoidance hook.
 *
 * Does nothing unless the flow is cwnd limited.  While in slow start the
 * ACKed count is first consumed by tcp_slow_start(); whatever is left over
 * drives the CUBIC update and the additive increase.
 */
__bpf_kfunc static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct bictcp *ca;

        if (!tcp_is_cwnd_limited(sk))
                return;

        if (tcp_in_slow_start(tp)) {
                acked = tcp_slow_start(tp, acked);
                if (acked == 0)        /* nothing left after slow start */
                        return;
        }

        ca = inet_csk_ca(sk);
        bictcp_update(ca, tcp_snd_cwnd(tp), acked);
        tcp_cong_avoid_ai(tp, ca->cnt, acked);
}

/* Compute the new slow-start threshold and close the current epoch.
 *
 * Also records last_max_cwnd: with fast_convergence enabled and cwnd below
 * the previous maximum, a reduced value is stored instead of cwnd itself.
 *
 * Returns cwnd * beta / BICTCP_BETA_SCALE, but never less than 2.
 */
__bpf_kfunc static u32 cubictcp_recalc_ssthresh(struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct bictcp *ca = inet_csk_ca(sk);
        const u32 cwnd = tcp_snd_cwnd(tp);

        ca->epoch_start = 0;        /* end of epoch */

        /* Wmax and fast convergence */
        if (fast_convergence && cwnd < ca->last_max_cwnd) {
                ca->last_max_cwnd = (cwnd * (BICTCP_BETA_SCALE + beta))
                        / (2 * BICTCP_BETA_SCALE);
        } else {
                ca->last_max_cwnd = cwnd;
        }

        return max((cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}

/* State-change hook: entering TCP_CA_Loss resets both the CUBIC state and
 * the HyStart round state; every other state is ignored.
 */
__bpf_kfunc static void cubictcp_state(struct sock *sk, u8 new_state)
{
        if (new_state != TCP_CA_Loss)
                return;

        bictcp_reset(inet_csk_ca(sk));
        bictcp_hystart_reset(sk);
}

/* Account for TSO/GRO delays.
 * Otherwise short RTT flows could get too small ssthresh, since during
 * slow start we begin with small TSO packets and ca->delay_min would
 * not account for long aggregation delay when TSO packets get bigger.
 * Ideally even with a very small RTT we would like to have at least one
 * TSO packet being sent and received by GRO, and another one in qdisc layer.
 * We apply another 100% factor because @rate is doubled at this point.
 * We cap the cushion to 1ms.
 */
static u32 hystart_ack_delay(const struct sock *sk)
{
        unsigned long rate;

        rate = READ_ONCE(sk->sk_pacing_rate);
        if (!rate)
                return 0;
        return min_t(u64, USEC_PER_MSEC,
                     div64_ul((u64)sk->sk_gso_max_size * 4 * USEC_PER_SEC, rate));
}

/* Run the enabled HyStart detectors for one delay sample.
 *
 * @sk:    socket being updated
 * @delay: delay sample supplied by cubictcp_acked()
 *
 * When a detector fires, ca->found is set, the matching MIB counters are
 * bumped and snd_ssthresh is set to the current cwnd.
 */
static void hystart_update(struct sock *sk, u32 delay)
{
        struct bictcp *ca = inet_csk_ca(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        u32 train_limit;

        /* snd_una moved past the recorded end of round: begin a new round */
        if (after(tp->snd_una, ca->end_seq))
                bictcp_hystart_reset(sk);

        if (hystart_detect & HYSTART_ACK_TRAIN) {
                u32 now_us = bictcp_clock_us(sk);
                s32 ack_gap = (s32)(now_us - ca->last_ack);

                /* first detection parameter - ack-train detection */
                if (ack_gap <= hystart_ack_delta_us) {
                        ca->last_ack = now_us;

                        train_limit = ca->delay_min + hystart_ack_delay(sk);

                        /* Hystart ack train triggers if we get ack past
                         * ca->delay_min/2.
                         * Pacing might have delayed packets up to RTT/2
                         * during slow start.
                         */
                        if (sk->sk_pacing_status == SK_PACING_NONE)
                                train_limit >>= 1;

                        if ((s32)(now_us - ca->round_start) > train_limit) {
                                ca->found = 1;
                                pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n",
                                         now_us - ca->round_start, train_limit,
                                         ca->delay_min, hystart_ack_delay(sk), tcp_snd_cwnd(tp));
                                NET_INC_STATS(sock_net(sk),
                                              LINUX_MIB_TCPHYSTARTTRAINDETECT);
                                NET_ADD_STATS(sock_net(sk),
                                              LINUX_MIB_TCPHYSTARTTRAINCWND,
                                              tcp_snd_cwnd(tp));
                                tp->snd_ssthresh = tcp_snd_cwnd(tp);
                        }
                }
        }

        if (hystart_detect & HYSTART_DELAY) {
                /* keep the smallest delay seen in this round */
                if (delay < ca->curr_rtt)
                        ca->curr_rtt = delay;

                if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
                        /* not enough samples yet to judge the round */
                        ca->sample_cnt++;
                } else if (ca->curr_rtt > ca->delay_min +
                           HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
                        ca->found = 1;
                        NET_INC_STATS(sock_net(sk),
                                      LINUX_MIB_TCPHYSTARTDELAYDETECT);
                        NET_ADD_STATS(sock_net(sk),
                                      LINUX_MIB_TCPHYSTARTDELAYCWND,
                                      tcp_snd_cwnd(tp));
                        tp->snd_ssthresh = tcp_snd_cwnd(tp);
                }
        }
}

/* Per-ACK hook: track the minimum delay and, while in slow start with
 * hystart enabled, feed the sample to the HyStart detectors.
 */
__bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
        const struct tcp_sock *tp = tcp_sk(sk);
        struct bictcp *ca = inet_csk_ca(sk);
        u32 delay;

        /* Some calls are for duplicates without timestamps */
        if (sample->rtt_us < 0)
                return;

        /* Discard delay samples right after fast recovery */
        if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
                return;

        /* a zero sample is bumped to 1 so delay_min == 0 keeps meaning "unset" */
        delay = sample->rtt_us ? sample->rtt_us : 1;

        /* first time call or link delay decreases */
        if (ca->delay_min == 0 || delay < ca->delay_min)
                ca->delay_min = delay;

        /* hystart runs only until the exit point is found, in slow start */
        if (ca->found || !hystart || !tcp_in_slow_start(tp))
                return;

        /* hystart triggers when cwnd is larger than some threshold */
        if (tcp_snd_cwnd(tp) < hystart_low_window)
                return;

        hystart_update(sk, delay);
}

/* CUBIC congestion-control operations table, named "cubic".  It is passed
 * to tcp_register_congestion_control() in cubictcp_register() and to
 * tcp_unregister_congestion_control() in cubictcp_unregister().
 * undo_cwnd is the only callback not defined by the cubictcp_* functions:
 * it uses tcp_reno_undo_cwnd.
 */
static struct tcp_congestion_ops cubictcp __read_mostly = {
        .init                = cubictcp_init,
        .ssthresh        = cubictcp_recalc_ssthresh,
        .cong_avoid        = cubictcp_cong_avoid,
        .set_state        = cubictcp_state,
        .undo_cwnd        = tcp_reno_undo_cwnd,
        .cwnd_event        = cubictcp_cwnd_event,
        .pkts_acked     = cubictcp_acked,
        .owner                = THIS_MODULE,
        .name                = "cubic",
};

/* BTF id list of the cubictcp_* callbacks marked __bpf_kfunc above. */
BTF_KFUNCS_START(tcp_cubic_check_kfunc_ids)
BTF_ID_FLAGS(func, cubictcp_init)
BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh)
BTF_ID_FLAGS(func, cubictcp_cong_avoid)
BTF_ID_FLAGS(func, cubictcp_state)
BTF_ID_FLAGS(func, cubictcp_cwnd_event)
BTF_ID_FLAGS(func, cubictcp_acked)
BTF_KFUNCS_END(tcp_cubic_check_kfunc_ids)

/* kfunc id set wrapping the list above; cubictcp_register() registers it
 * for BPF_PROG_TYPE_STRUCT_OPS.
 */
static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = {
        .owner = THIS_MODULE,
        .set   = &tcp_cubic_check_kfunc_ids,
};

/* Module init: precompute the scaling factors, then register the kfunc id
 * set and the congestion-control ops.
 *
 * Returns a negative value if the kfunc id set registration fails,
 * otherwise the result of tcp_register_congestion_control().
 */
static int __init cubictcp_register(void)
{
        int err;

        /* the private state must fit in the per-socket CA area */
        BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);

        /* Precompute a bunch of the scaling factors that are used per-packet
         * based on SRTT of 100ms
         */
        beta_scale = 8 * (BICTCP_BETA_SCALE + beta) / 3
                / (BICTCP_BETA_SCALE - beta);

        /* 1024*c/rtt */
        cube_rtt_scale = bic_scale * 10;

        /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
         *  so K = cubic_root( (wmax-cwnd)*rtt/c )
         * the unit of K is bictcp_HZ=2^10, not HZ
         *
         *  c = bic_scale >> 10
         *  rtt = 100ms
         *
         * the following code has been designed and tested for
         * cwnd < 1 million packets
         * RTT < 100 seconds
         * HZ < 1,000,00  (corresponding to 10 nano-second)
         */

        /* cube_factor = 2^(10 + 3*BICTCP_HZ) (2^40), then divided by
         * bic_scale and by the constant Srtt (100ms)
         */
        cube_factor = 1ull << (10 + 3 * BICTCP_HZ);
        do_div(cube_factor, bic_scale * 10);

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_cubic_kfunc_set);
        if (err < 0)
                return err;

        return tcp_register_congestion_control(&cubictcp);
}

/* Module exit: unregister the "cubic" congestion-control ops that
 * cubictcp_register() registered.
 */
static void __exit cubictcp_unregister(void)
{
        tcp_unregister_congestion_control(&cubictcp);
}

module_init(cubictcp_register);
module_exit(cubictcp_unregister);

MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("CUBIC TCP");
MODULE_VERSION("2.3");








































































































    2 





    4 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





    1 






































































































































































































































    3 



    4 
    4 





    4 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
// SPDX-License-Identifier: GPL-2.0
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                       Nauman Rafique <nauman@google.com>
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/part_stat.h>
#include "blk.h"
#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-throttle.h"

static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
EXPORT_SYMBOL_GPL(blkcg_root_css);

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);                /* protected by blkcg_pol_mutex */

bool blkcg_debug_stats = false;

static DEFINE_RAW_SPINLOCK(blkg_stat_lock);

#define BLKG_DESTROY_BATCH_SIZE  64

/*
 * Lockless lists for tracking IO stats update
 *
 * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
 * There are multiple blkg's (one for each block device) attached to each
 * blkcg. The rstat code keeps track of which cpu has IO stats updated,
 * but it doesn't know which blkg has the updated stats. If there are many
 * block devices in a system, the cost of iterating all the blkg's to flush
 * out the IO stats can be high. To reduce such overhead, a set of percpu
 * lockless lists (lhead) per blkcg are used to track the set of recently
 * updated iostat_cpu's since the last flush. An iostat_cpu will be put
 * onto the lockless list on the update side [blk_cgroup_bio_start()] if
 * not there yet and then removed when being flushed [blkcg_rstat_flush()].
 * References to blkg are gotten and then put back in the process to
 * protect against blkg removal.
 *
 * Return: 0 if successful or -ENOMEM if allocation fails.
 */
static int init_blkcg_llists(struct blkcg *blkcg)
{
        int cpu;

        blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
        if (!blkcg->lhead)
                return -ENOMEM;

        for_each_possible_cpu(cpu)
                init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
        return 0;
}

/**
 * blkcg_css - find the current css
 *
 * Look up the css associated with either the kthread or the current task;
 * the kthread one, when present, takes precedence.
 * The result may be a dying css, so it is up to the caller to use tryget
 * logic to confirm it is alive and well.
 */
static struct cgroup_subsys_state *blkcg_css(void)
{
        struct cgroup_subsys_state *kthread_css = kthread_blkcg();

        return kthread_css ? kthread_css : task_css(current, io_cgrp_id);
}

static bool blkcg_policy_enabled(struct request_queue *q,
                                 const struct blkcg_policy *pol)
{
        return pol && test_bit(pol->plid, q->blkcg_pols);
}

static void blkg_free_workfn(struct work_struct *work)
{
        struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
                                             free_work);
        struct request_queue *q = blkg->q;
        int i;

        /*
         * pd_free_fn() can also be called from blkcg_deactivate_policy(),
         * in order to make sure pd_free_fn() is called in order, the deletion
         * of the list blkg->q_node is delayed to here from blkg_destroy(), and
         * blkcg_mutex is used to synchronize blkg_free_workfn() and
         * blkcg_deactivate_policy().
         */
        mutex_lock(&q->blkcg_mutex);
        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (blkg->pd[i])
                        blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
        if (blkg->parent)
                blkg_put(blkg->parent);
        spin_lock_irq(&q->queue_lock);
        list_del_init(&blkg->q_node);
        spin_unlock_irq(&q->queue_lock);
        mutex_unlock(&q->blkcg_mutex);

        blk_put_queue(q);
        free_percpu(blkg->iostat_cpu);
        percpu_ref_exit(&blkg->refcnt);
        kfree(blkg);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free, may be %NULL
 *
 * Free @blkg, which may be only partially allocated.  The actual teardown is
 * done asynchronously by blkg_free_workfn().
 */
static void blkg_free(struct blkcg_gq *blkg)
{
        if (blkg) {
                /*
                 * ->pd_free_fn() and the request queue's release handler
                 * may both sleep, so hand the work off to a work item.
                 */
                INIT_WORK(&blkg->free_work, blkg_free_workfn);
                schedule_work(&blkg->free_work);
        }
}

static void __blkg_release(struct rcu_head *rcu)
{
        struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
        struct blkcg *blkcg = blkg->blkcg;
        int cpu;

#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
        WARN_ON(!bio_list_empty(&blkg->async_bios));
#endif
        /*
         * Flush all the non-empty percpu lockless lists before releasing
         * us, given these stat belongs to us.
         *
         * blkg_stat_lock is for serializing blkg stat update
         */
        for_each_possible_cpu(cpu)
                __blkcg_rstat_flush(blkcg, cpu);

        /* release the blkcg and parent blkg refs this blkg has been holding */
        css_put(&blkg->blkcg->css);
        blkg_free(blkg);
}

/*
 * A group is RCU protected, but having an rcu lock does not mean that one
 * can access all the fields of blkg and assume these are valid.  For
 * example, don't try to follow throtl_data and request queue links.
 *
 * Having a reference to blkg under an rcu allows accesses to only values
 * local to groups like group stats and group rate limits.
 */
static void blkg_release(struct percpu_ref *ref)
{
        struct blkcg_gq *blkg;

        blkg = container_of(ref, struct blkcg_gq, refcnt);

        /* hand the rest of the teardown to __blkg_release() via RCU */
        call_rcu(&blkg->rcu_head, __blkg_release);
}

#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
/*
 * Workqueue on which blkcg_punt_bio_submit() queues each blkg's
 * async_bio_work; allocated by blkcg_punt_bio_init().
 */
static struct workqueue_struct *blkcg_punt_bio_wq;

static void blkg_async_bio_workfn(struct work_struct *work)
{
        struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
                                             async_bio_work);
        struct bio_list bios = BIO_EMPTY_LIST;
        struct bio *bio;
        struct blk_plug plug;
        bool need_plug = false;

        /* as long as there are pending bios, @blkg can't go away */
        spin_lock(&blkg->async_bio_lock);
        bio_list_merge_init(&bios, &blkg->async_bios);
        spin_unlock(&blkg->async_bio_lock);

        /* start plug only when bio_list contains at least 2 bios */
        if (bios.head && bios.head->bi_next) {
                need_plug = true;
                blk_start_plug(&plug);
        }
        while ((bio = bio_list_pop(&bios)))
                submit_bio(bio);
        if (need_plug)
                blk_finish_plug(&plug);
}

/*
 * When a shared kthread issues a bio for a cgroup, doing so synchronously can
 * lead to priority inversions as the kthread can be trapped waiting for that
 * cgroup.  Use this helper instead of submit_bio to punt the actual issuing to
 * a dedicated per-blkcg work item to avoid such priority inversions.
 */
void blkcg_punt_bio_submit(struct bio *bio)
{
        struct blkcg_gq *blkg = bio->bi_blkg;

        if (!blkg->parent) {
                /* never bounce for the root cgroup */
                submit_bio(bio);
                return;
        }

        /* queue the bio on the blkg and let its work item issue it */
        spin_lock(&blkg->async_bio_lock);
        bio_list_add(&blkg->async_bios, bio);
        spin_unlock(&blkg->async_bio_lock);

        queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
}
EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit);

/* Allocate the punt workqueue; returns 0 on success or -ENOMEM. */
static int __init blkcg_punt_bio_init(void)
{
        blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
                                            WQ_MEM_RECLAIM | WQ_FREEZABLE |
                                            WQ_UNBOUND | WQ_SYSFS, 0);

        return blkcg_punt_bio_wq ? 0 : -ENOMEM;
}
subsys_initcall(blkcg_punt_bio_init);
#endif /* CONFIG_BLK_CGROUP_PUNT_BIO */

/**
 * bio_blkcg_css - return the blkcg CSS associated with a bio
 * @bio: target bio, may be %NULL
 *
 * Returns the CSS of the blkcg that @bio's blkg belongs to, or %NULL when
 * @bio is %NULL or has no blkg attached.  Callers must either cope with
 * %NULL or know that the association was done before calling this.
 */
struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio)
{
        if (bio && bio->bi_blkg)
                return &bio->bi_blkg->blkcg->css;

        return NULL;
}
EXPORT_SYMBOL_GPL(bio_blkcg_css);

/**
 * blkcg_parent - get the parent of a blkcg
 * @blkcg: blkcg of interest
 *
 * Converts the parent of @blkcg's css with css_to_blkcg() and returns the
 * result.  Can be called anytime.
 */
static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
{
        struct cgroup_subsys_state *parent_css = blkcg->css.parent;

        return css_to_blkcg(parent_css);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @disk: gendisk the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @disk.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
                                   gfp_t gfp_mask)
{
        struct request_queue *q = disk->queue;
        struct blkcg_gq *blkg;
        int plid, cpu;

        /* base object, refcount, per-cpu stats and a queue reference */
        blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
        if (!blkg)
                return NULL;

        if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
                goto out_free_blkg;

        blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
        if (!blkg->iostat_cpu)
                goto out_exit_refcnt;

        if (!blk_get_queue(q))
                goto out_free_iostat;

        blkg->q = q;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;
        blkg->iostat.blkg = blkg;
#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
        spin_lock_init(&blkg->async_bio_lock);
        bio_list_init(&blkg->async_bios);
        INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
#endif

        /* each stat set carries a back pointer to its owning blkg */
        u64_stats_init(&blkg->iostat.sync);
        for_each_possible_cpu(cpu) {
                struct blkg_iostat_set *bisc;

                bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
                u64_stats_init(&bisc->sync);
                bisc->blkg = blkg;
        }

        /* allocate policy data for every policy enabled on the queue */
        for (plid = 0; plid < BLKCG_MAX_POLS; plid++) {
                struct blkcg_policy *pol = blkcg_policy[plid];
                struct blkg_policy_data *pd;

                if (!blkcg_policy_enabled(q, pol))
                        continue;

                pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask);
                if (!pd)
                        goto out_free_pds;

                pd->blkg = blkg;
                pd->plid = plid;
                pd->online = false;
                blkg->pd[plid] = pd;
        }

        return blkg;

out_free_pds:
        /* free the policy data allocated before the failing slot */
        for (plid--; plid >= 0; plid--) {
                if (blkg->pd[plid])
                        blkcg_policy[plid]->pd_free_fn(blkg->pd[plid]);
        }
        blk_put_queue(q);
out_free_iostat:
        free_percpu(blkg->iostat_cpu);
out_exit_refcnt:
        percpu_ref_exit(&blkg->refcnt);
out_free_blkg:
        kfree(blkg);
        return NULL;
}

/*
 * Create a blkg for the @blkcg - @disk pair and link it into @blkcg and
 * @disk->queue.
 *
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
 *
 * Must be called with @disk->queue->queue_lock held.
 *
 * Returns the new blkg on success or an ERR_PTR() value: -ENODEV if the
 * queue is dying, @blkcg's css could not be acquired with
 * css_tryget_online() or the parent blkg is missing, -ENOMEM if the
 * allocation fails, or the error returned by radix_tree_insert().
 */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
                                    struct blkcg_gq *new_blkg)
{
        struct blkcg_gq *blkg;
        int i, ret;

        lockdep_assert_held(&disk->queue->queue_lock);

        /* request_queue is dying, do not create/recreate a blkg */
        if (blk_queue_dying(disk->queue)) {
                ret = -ENODEV;
                goto err_free_blkg;
        }

        /* blkg holds a reference to blkcg */
        if (!css_tryget_online(&blkcg->css)) {
                ret = -ENODEV;
                goto err_free_blkg;
        }

        /* allocate */
        if (!new_blkg) {
                new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
                        goto err_put_css;
                }
        }
        blkg = new_blkg;

        /* link parent */
        if (blkcg_parent(blkcg)) {
                blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue);
                if (WARN_ON_ONCE(!blkg->parent)) {
                        ret = -ENODEV;
                        goto err_put_css;
                }
                /* this reference is put again in blkg_free_workfn() */
                blkg_get(blkg->parent);
        }

        /* invoke per-policy init */
        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                if (blkg->pd[i] && pol->pd_init_fn)
                        pol->pd_init_fn(blkg->pd[i]);
        }

        /* insert */
        spin_lock(&blkcg->lock);
        ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg);
        if (likely(!ret)) {
                hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
                list_add(&blkg->q_node, &disk->queue->blkg_list);

                /* bring every attached policy data online */
                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkcg_policy *pol = blkcg_policy[i];

                        if (blkg->pd[i]) {
                                if (pol->pd_online_fn)
                                        pol->pd_online_fn(blkg->pd[i]);
                                blkg->pd[i]->online = true;
                        }
                }
        }
        /* NOTE(review): set even when the insert above failed */
        blkg->online = true;
        spin_unlock(&blkcg->lock);

        if (!ret)
                return blkg;

        /*
         * Insertion failed after @blkg was fully initialized, use the usual
         * release path.
         */
        blkg_put(blkg);
        return ERR_PTR(ret);

err_put_css:
        /* drop the reference taken by css_tryget_online() above */
        css_put(&blkcg->css);
err_free_blkg:
        if (new_blkg)
                blkg_free(new_blkg);
        return ERR_PTR(ret);
}

/**
 * blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @disk: gendisk of interest
 *
 * Lookup blkg for the @blkcg - @disk pair.  If it doesn't exist, try to
 * create one.  blkg creation is performed recursively from blkcg_root such
 * that all non-root blkg's have access to the parent blkg.  This function
 * should be called under RCU read lock and takes @disk->queue->queue_lock.
 *
 * Returns the blkg or the closest blkg if blkg_create() fails as it walks
 * down from root.
 */
static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        struct blkcg_gq *blkg;
        unsigned long flags;

        WARN_ON_ONCE(!rcu_read_lock_held());

        /* first lookup is done without taking queue_lock */
        blkg = blkg_lookup(blkcg, q);
        if (blkg)
                return blkg;

        /* look again, this time with queue_lock held */
        spin_lock_irqsave(&q->queue_lock, flags);
        blkg = blkg_lookup(blkcg, q);
        if (blkg) {
                /* refresh the lookup hint of non-root blkcgs */
                if (blkcg != &blkcg_root &&
                    blkg != rcu_dereference(blkcg->blkg_hint))
                        rcu_assign_pointer(blkcg->blkg_hint, blkg);
                goto found;
        }

        /*
         * Create blkgs walking down from blkcg_root to @blkcg, so that all
         * non-root blkgs have access to their parents.  Returns the closest
         * blkg to the intended blkg should blkg_create() fail.
         */
        while (true) {
                struct blkcg *pos = blkcg;
                struct blkcg *parent = blkcg_parent(blkcg);
                struct blkcg_gq *ret_blkg = q->root_blkg;

                /*
                 * Walk up from @blkcg until an ancestor that already has a
                 * blkg is found; @pos ends up as the topmost blkcg on that
                 * path which still lacks one.
                 */
                while (parent) {
                        blkg = blkg_lookup(parent, q);
                        if (blkg) {
                                /* remember closest blkg */
                                ret_blkg = blkg;
                                break;
                        }
                        pos = parent;
                        parent = blkcg_parent(parent);
                }

                /* %NULL makes blkg_create() allocate the blkg itself */
                blkg = blkg_create(pos, disk, NULL);
                if (IS_ERR(blkg)) {
                        blkg = ret_blkg;
                        break;
                }
                /* done once the blkg for @blkcg itself has been created */
                if (pos == blkcg)
                        break;
        }

found:
        spin_unlock_irqrestore(&q->queue_lock, flags);
        return blkg;
}

static void blkg_destroy(struct blkcg_gq *blkg)
{
        struct blkcg *blkcg = blkg->blkcg;
        int i;

        lockdep_assert_held(&blkg->q->queue_lock);
        lockdep_assert_held(&blkcg->lock);

        /*
         * blkg stays on the queue list until blkg_free_workfn(), see details in
         * blkg_free_workfn(), hence this function can be called from
         * blkcg_destroy_blkgs() first and again from blkg_destroy_all() before
         * blkg_free_workfn().
         */
        if (hlist_unhashed(&blkg->blkcg_node))
                return;

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                if (blkg->pd[i] && blkg->pd[i]->online) {
                        blkg->pd[i]->online = false;
                        if (pol->pd_offline_fn)
                                pol->pd_offline_fn(blkg->pd[i]);
                }
        }

        blkg->online = false;

        radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
        hlist_del_init_rcu(&blkg->blkcg_node);

        /*
         * Both setting lookup hint to and clearing it from @blkg are done
         * under queue_lock.  If it's not pointing to @blkg now, it never
         * will.  Hint assignment itself can race safely.
         */
        if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
                rcu_assign_pointer(blkcg->blkg_hint, NULL);

        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
         */
        percpu_ref_kill(&blkg->refcnt);
}

/*
 * Destroy every blkg on @disk's queue in batches, then clear the queue's
 * policy bits and its root_blkg pointer.
 */
static void blkg_destroy_all(struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        int budget = BLKG_DESTROY_BATCH_SIZE;
        struct blkcg_gq *blkg;
        int plid;

restart:
        spin_lock_irq(&q->queue_lock);
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                struct blkcg *blkcg = blkg->blkcg;

                /* already unlinked from its blkcg, nothing left to destroy */
                if (hlist_unhashed(&blkg->blkcg_node))
                        continue;

                spin_lock(&blkcg->lock);
                blkg_destroy(blkg);
                spin_unlock(&blkcg->lock);

                if (--budget != 0)
                        continue;

                /*
                 * A full batch has been destroyed: drop the spin lock so it
                 * is not held for too long, then rescan from the start.
                 */
                budget = BLKG_DESTROY_BATCH_SIZE;
                spin_unlock_irq(&q->queue_lock);
                cond_resched();
                goto restart;
        }

        /*
         * Policy offline is done and the frees are scheduled, so mark the
         * policies deactivated; later blkcg_deactivate_policy() calls can
         * then be bypassed.
         */
        for (plid = 0; plid < BLKCG_MAX_POLS; plid++) {
                struct blkcg_policy *pol = blkcg_policy[plid];

                if (!pol)
                        continue;
                __clear_bit(pol->plid, q->blkcg_pols);
        }

        q->root_blkg = NULL;
        spin_unlock_irq(&q->queue_lock);
}

/* Copy every bytes/ios counter of @src into @dst. */
static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
{
        int idx = 0;

        while (idx < BLKG_IOSTAT_NR) {
                dst->bytes[idx] = src->bytes[idx];
                dst->ios[idx] = src->ios[idx];
                idx++;
        }
}

static void __blkg_clear_stat(struct blkg_iostat_set *bis)
{
        struct blkg_iostat cur = {0};
        unsigned long flags;

        flags = u64_stats_update_begin_irqsave(&bis->sync);
        blkg_iostat_set(&bis->cur, &cur);
        blkg_iostat_set(&bis->last, &cur);
        u64_stats_update_end_irqrestore(&bis->sync, flags);
}

static void blkg_clear_stat(struct blkcg_gq *blkg)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct blkg_iostat_set *s = per_cpu_ptr(blkg->iostat_cpu, cpu);

                __blkg_clear_stat(s);
        }
        __blkg_clear_stat(&blkg->iostat);
}

/*
 * Reset the stats of every blkg of the blkcg behind @css, including the
 * stats of each attached policy that provides pd_reset_stats_fn().
 * @cftype and @val are unused.  Always returns 0.
 */
static int blkcg_reset_stats(struct cgroup_subsys_state *css,
                             struct cftype *cftype, u64 val)
{
        struct blkcg *blkcg = css_to_blkcg(css);
        struct blkcg_gq *blkg;
        int plid;

        mutex_lock(&blkcg_pol_mutex);
        spin_lock_irq(&blkcg->lock);

        /*
         * Resetting is racy: it is not synchronized against stat updates.
         * This is a debug feature which shouldn't exist anyway.  If you get
         * hit by a race, retry.
         */
        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
                blkg_clear_stat(blkg);

                for (plid = 0; plid < BLKCG_MAX_POLS; plid++) {
                        struct blkcg_policy *pol = blkcg_policy[plid];
                        struct blkg_policy_data *pd = blkg->pd[plid];

                        if (!pd || !pol->pd_reset_stats_fn)
                                continue;
                        pol->pd_reset_stats_fn(pd);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        mutex_unlock(&blkcg_pol_mutex);

        return 0;
}

/*
 * Return the bdi device name of the disk behind @blkg's queue, or %NULL
 * when the queue has no disk.
 */
const char *blkg_dev_name(struct blkcg_gq *blkg)
{
        if (blkg->q->disk)
                return bdi_dev_name(blkg->q->disk->bdi);

        return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * This function invokes @prfill on each blkg of @blkcg if pd for the
 * policy specified by @pol exists.  @prfill is invoked with @sf, the
 * policy data and @data and the matching queue lock held.  If @show_total
 * is %true, the sum of the return values from @prfill is printed with
 * "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                       u64 (*prfill)(struct seq_file *,
                                     struct blkg_policy_data *, int),
                       const struct blkcg_policy *pol, int data,
                       bool show_total)
{
        struct blkcg_gq *blkg;
        u64 sum = 0;

        rcu_read_lock();
        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
                struct request_queue *q = blkg->q;

                /* @prfill runs with the matching queue lock held */
                spin_lock_irq(&q->queue_lock);
                if (blkcg_policy_enabled(q, pol))
                        sum += prfill(sf, blkg->pd[pol->plid], data);
                spin_unlock_irq(&q->queue_lock);
        }
        rcu_read_unlock();

        if (!show_total)
                return;

        seq_printf(sf, "Total %llu\n", (unsigned long long)sum);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
        const char *dev_name = blkg_dev_name(pd->blkg);

        /* without a device name nothing is printed and 0 is returned */
        if (dev_name) {
                seq_printf(sf, "%s %llu\n", dev_name, (unsigned long long)v);
                return v;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * blkg_conf_init - initialize a blkg_conf_ctx
 * @ctx: blkg_conf_ctx to initialize
 * @input: input string
 *
 * Initialize @ctx which can be used to parse blkg config input string @input.
 * Once initialized, @ctx can be used with blkg_conf_open_bdev() and
 * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit().
 */
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input)
{
        *ctx = (struct blkg_conf_ctx){ .input = input };
}
EXPORT_SYMBOL_GPL(blkg_conf_init);

/**
 * blkg_conf_open_bdev - parse and open bdev for per-blkg config update
 * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
 *
 * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from
 * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is
 * set to point past the device node prefix.
 *
 * This function may be called multiple times on @ctx and the extra calls become
 * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function
 * explicitly if bdev access is needed without resolving the blkcg / policy part
 * of @ctx->input.
 *
 * On success, a reference to the bdev is held and its queue's rq_qos_mutex is
 * locked; both are released by blkg_conf_exit().
 *
 * Return: 0 on success, -EINVAL if the MAJ:MIN prefix cannot be parsed or is
 * not followed by whitespace, -ENODEV if the device does not exist, is a
 * partition or its disk is not live.
 */
int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
{
        char *input = ctx->input;
        unsigned int major, minor;
        struct block_device *bdev;
        int key_len;

        /* already opened by an earlier call */
        if (ctx->bdev)
                return 0;

        if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
                return -EINVAL;

        /* the device number must be followed by at least one space */
        input += key_len;
        if (!isspace(*input))
                return -EINVAL;
        input = skip_spaces(input);

        bdev = blkdev_get_no_open(MKDEV(major, minor));
        if (!bdev)
                return -ENODEV;
        if (bdev_is_partition(bdev)) {
                blkdev_put_no_open(bdev);
                return -ENODEV;
        }

        mutex_lock(&bdev->bd_queue->rq_qos_mutex);
        if (!disk_live(bdev->bd_disk)) {
                /*
                 * Unlock before dropping our reference: the mutex is reached
                 * through @bdev, which must not be touched after the put.
                 * blkg_conf_exit() releases them in the same order.
                 */
                mutex_unlock(&bdev->bd_queue->rq_qos_mutex);
                blkdev_put_no_open(bdev);
                return -ENODEV;
        }

        ctx->body = input;
        ctx->bdev = bdev;
        return 0;
}

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
 *
 * Parse per-blkg config update from @ctx->input and initialize @ctx
 * accordingly. On success, @ctx->body points to the part of @ctx->input
 * following MAJ:MIN, @ctx->bdev points to the target block device and
 * @ctx->blkg to the blkg being configured.
 *
 * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this
 * function returns with queue lock held and must be followed by
 * blkg_conf_exit().
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                   struct blkg_conf_ctx *ctx)
        __acquires(&bdev->bd_queue->queue_lock)
{
        struct gendisk *disk;
        struct request_queue *q;
        struct blkcg_gq *blkg;
        int ret;

        /* a NOOP when the caller has already opened the bdev on @ctx */
        ret = blkg_conf_open_bdev(ctx);
        if (ret)
                return ret;

        disk = ctx->bdev->bd_disk;
        q = disk->queue;

        /*
         * blkcg_deactivate_policy() requires queue to be frozen, we can grab
         * q_usage_counter to prevent concurrent with blkcg_deactivate_policy().
         */
        ret = blk_queue_enter(q, 0);
        if (ret)
                goto fail;

        spin_lock_irq(&q->queue_lock);

        if (!blkcg_policy_enabled(q, pol)) {
                ret = -EOPNOTSUPP;
                goto fail_unlock;
        }

        /* nothing to create if the blkg already exists */
        blkg = blkg_lookup(blkcg, q);
        if (blkg)
                goto success;

        /*
         * Create blkgs walking down from blkcg_root to @blkcg, so that all
         * non-root blkgs have access to their parents.
         */
        while (true) {
                struct blkcg *pos = blkcg;
                struct blkcg *parent;
                struct blkcg_gq *new_blkg;

                /*
                 * Walk up until an ancestor with an existing blkg is found;
                 * @pos is the topmost blkcg that still lacks one.
                 */
                parent = blkcg_parent(blkcg);
                while (parent && !blkg_lookup(parent, q)) {
                        pos = parent;
                        parent = blkcg_parent(parent);
                }

                /* Drop locks to do new blkg allocation with GFP_KERNEL. */
                spin_unlock_irq(&q->queue_lock);

                new_blkg = blkg_alloc(pos, disk, GFP_KERNEL);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
                        goto fail_exit_queue;
                }

                if (radix_tree_preload(GFP_KERNEL)) {
                        blkg_free(new_blkg);
                        ret = -ENOMEM;
                        goto fail_exit_queue;
                }

                spin_lock_irq(&q->queue_lock);

                /* the lock was dropped above, so check the policy again */
                if (!blkcg_policy_enabled(q, pol)) {
                        blkg_free(new_blkg);
                        ret = -EOPNOTSUPP;
                        goto fail_preloaded;
                }

                /* a blkg for @pos may exist by now; use it if so */
                blkg = blkg_lookup(pos, q);
                if (blkg) {
                        blkg_free(new_blkg);
                } else {
                        /* blkg_create() consumes @new_blkg, also on failure */
                        blkg = blkg_create(pos, disk, new_blkg);
                        if (IS_ERR(blkg)) {
                                ret = PTR_ERR(blkg);
                                goto fail_preloaded;
                        }
                }

                radix_tree_preload_end();

                if (pos == blkcg)
                        goto success;
        }
success:
        /*
         * queue_lock stays held on success; blkg_conf_exit() releases it
         * when it finds @ctx->blkg set.
         */
        blk_queue_exit(q);
        ctx->blkg = blkg;
        return 0;

fail_preloaded:
        radix_tree_preload_end();
fail_unlock:
        spin_unlock_irq(&q->queue_lock);
fail_exit_queue:
        blk_queue_exit(q);
fail:
        /*
         * If queue was bypassing, we should retry.  Do so after a
         * short msleep().  It isn't strictly necessary but queue
         * can be bypassing for some time and it's always nice to
         * avoid busy looping.
         *
         * Note that @ctx->bdev is left set on failure; it is released by
         * blkg_conf_exit().
         */
        if (ret == -EBUSY) {
                msleep(10);
                ret = restart_syscall();
        }
        return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_exit - clean up per-blkg config update
 * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
 *
 * Clean up after per-blkg config update. This function must be called on all
 * blkg_conf_ctx's initialized with blkg_conf_init().
 */
void blkg_conf_exit(struct blkg_conf_ctx *ctx)
        __releases(&ctx->bdev->bd_queue->queue_lock)
        __releases(&ctx->bdev->bd_queue->rq_qos_mutex)
{
        /* a set ->blkg means queue_lock is still held for this context */
        if (ctx->blkg) {
                struct request_queue *q = bdev_get_queue(ctx->bdev);

                spin_unlock_irq(&q->queue_lock);
                ctx->blkg = NULL;
        }

        if (!ctx->bdev)
                return;

        /* undo blkg_conf_open_bdev(): unlock first, then drop the bdev */
        mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
        blkdev_put_no_open(ctx->bdev);
        ctx->body = NULL;
        ctx->bdev = NULL;
}
EXPORT_SYMBOL_GPL(blkg_conf_exit);

/* Add every bytes/ios counter of @src to the matching counter of @dst. */
static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
{
        int idx = 0;

        while (idx < BLKG_IOSTAT_NR) {
                dst->bytes[idx] += src->bytes[idx];
                dst->ios[idx] += src->ios[idx];
                idx++;
        }
}

/* Subtract every bytes/ios counter of @src from the matching one of @dst. */
static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
{
        int idx = 0;

        while (idx < BLKG_IOSTAT_NR) {
                dst->bytes[idx] -= src->bytes[idx];
                dst->ios[idx] -= src->ios[idx];
                idx++;
        }
}

static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
                                struct blkg_iostat *last)
{
        struct blkg_iostat delta;
        unsigned long flags;

        /* propagate percpu delta to global */
        flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
        blkg_iostat_set(&delta, cur);
        blkg_iostat_sub(&delta, last);
        blkg_iostat_add(&blkg->iostat.cur, &delta);
        blkg_iostat_add(last, &delta);
        u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}

/*
 * Flush the stat sets queued on @blkcg's lockless list for @cpu: each
 * per-cpu delta is folded into its blkg's iostat, and the blkg's delta is
 * then propagated to the parent blkg unless that parent is the root.
 */
static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
{
        struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
        struct llist_node *lnode;
        struct blkg_iostat_set *bisc, *next_bisc;
        unsigned long flags;

        rcu_read_lock();

        /* take the whole list in one go; nothing to do if it was empty */
        lnode = llist_del_all(lhead);
        if (!lnode)
                goto out;

        /*
         * For covering concurrent parent blkg update from blkg_release().
         *
         * When flushing from cgroup, cgroup_rstat_lock is always held, so
         * this lock won't cause contention most of time.
         */
        raw_spin_lock_irqsave(&blkg_stat_lock, flags);

        /*
         * Iterate only the iostat_cpu's queued in the lockless list.
         */
        llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
                struct blkcg_gq *blkg = bisc->blkg;
                struct blkcg_gq *parent = blkg->parent;
                struct blkg_iostat cur;
                unsigned int seq;

                /*
                 * Order assignment of `next_bisc` from `bisc->lnode.next` in
                 * llist_for_each_entry_safe and clearing `bisc->lqueued` for
                 * avoiding to assign `next_bisc` with new next pointer added
                 * in blk_cgroup_bio_start() in case of re-ordering.
                 *
                 * The pair barrier is implied in llist_add() in blk_cgroup_bio_start().
                 */
                smp_mb();

                WRITE_ONCE(bisc->lqueued, false);
                /*
                 * The entry is the blkg's own aggregated iostat rather than
                 * a per-cpu set (queued by the parent-propagation code at
                 * the bottom of this loop), so there is no per-cpu delta to
                 * fold in.
                 */
                if (bisc == &blkg->iostat)
                        goto propagate_up; /* propagate up to parent only */

                /* fetch the current per-cpu values */
                do {
                        seq = u64_stats_fetch_begin(&bisc->sync);
                        blkg_iostat_set(&cur, &bisc->cur);
                } while (u64_stats_fetch_retry(&bisc->sync, seq));

                blkcg_iostat_update(blkg, &cur, &bisc->last);

propagate_up:
                /* propagate global delta to parent (unless that's root) */
                if (parent && parent->parent) {
                        blkcg_iostat_update(parent, &blkg->iostat.cur,
                                            &blkg->iostat.last);
                        /*
                         * Queue parent->iostat to its blkcg's lockless
                         * list to propagate up to the grandparent if the
                         * iostat hasn't been queued yet.
                         */
                        if (!parent->iostat.lqueued) {
                                struct llist_head *plhead;

                                plhead = per_cpu_ptr(parent->blkcg->lhead, cpu);
                                llist_add(&parent->iostat.lnode, plhead);
                                parent->iostat.lqueued = true;
                        }
                }
        }
        raw_spin_unlock_irqrestore(&blkg_stat_lock, flags);
out:
        rcu_read_unlock();
}

/* Flush @cpu's queued IO stats for the blkcg behind @css. */
static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
        /* Root-level stats are sourced from system-wide IO stats */
        if (!cgroup_parent(css->cgroup))
                return;

        __blkcg_rstat_flush(css_to_blkcg(css), cpu);
}

/*
 * We source root cgroup stats from the system-wide stats to avoid
 * tracking the same information twice and incurring overhead when no
 * cgroups are defined. For that reason, cgroup_rstat_flush in
 * blkcg_print_stat does not actually fill out the iostat in the root
 * cgroup's blkcg_gq.
 *
 * However, we would like to re-use the printing code between the root and
 * non-root cgroups to the extent possible. For that reason, we simulate
 * flushing the root cgroup's stats by explicitly filling in the iostat
 * with disk level statistics.
 *
 * For every disk in block_class, the per-cpu disk_stats are summed up and
 * stored in the iostat of the disk's root blkg.
 */
static void blkcg_fill_root_iostats(void)
{
        struct class_dev_iter iter;
        struct device *dev;

        class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
        while ((dev = class_dev_iter_next(&iter))) {
                struct block_device *bdev = dev_to_bdev(dev);
                struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg;
                struct blkg_iostat tmp;
                int cpu;
                unsigned long flags;

                /* accumulate the disk-level counters of all CPUs */
                memset(&tmp, 0, sizeof(tmp));
                for_each_possible_cpu(cpu) {
                        struct disk_stats *cpu_dkstats;

                        cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
                        tmp.ios[BLKG_IOSTAT_READ] +=
                                cpu_dkstats->ios[STAT_READ];
                        tmp.ios[BLKG_IOSTAT_WRITE] +=
                                cpu_dkstats->ios[STAT_WRITE];
                        tmp.ios[BLKG_IOSTAT_DISCARD] +=
                                cpu_dkstats->ios[STAT_DISCARD];
                        /* convert sectors to bytes */
                        tmp.bytes[BLKG_IOSTAT_READ] +=
                                cpu_dkstats->sectors[STAT_READ] << 9;
                        tmp.bytes[BLKG_IOSTAT_WRITE] +=
                                cpu_dkstats->sectors[STAT_WRITE] << 9;
                        tmp.bytes[BLKG_IOSTAT_DISCARD] +=
                                cpu_dkstats->sectors[STAT_DISCARD] << 9;
                }

                flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
                blkg_iostat_set(&blkg->iostat.cur, &tmp);
                u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
        }
        /*
         * class_dev_iter_init() must be paired with class_dev_iter_exit(),
         * otherwise what the iterator holds is never released.
         */
        class_dev_iter_exit(&iter);
}

static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
{
        struct blkg_iostat_set *bis = &blkg->iostat;
        u64 rbytes, wbytes, rios, wios, dbytes, dios;
        const char *dname;
        unsigned seq;
        int i;

        if (!blkg->online)
                return;

        dname = blkg_dev_name(blkg);
        if (!dname)
                return;

        seq_printf(s, "%s ", dname);

        do {
                seq = u64_stats_fetch_begin(&bis->sync);

                rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
                wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
                dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
                rios = bis->cur.ios[BLKG_IOSTAT_READ];
                wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
                dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
        } while (u64_stats_fetch_retry(&bis->sync, seq));

        if (rbytes || wbytes || rios || wios) {
                seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
                        rbytes, wbytes, rios, wios,
                        dbytes, dios);
        }

        if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
                seq_printf(s, " use_delay=%d delay_nsec=%llu",
                        atomic_read(&blkg->use_delay),
                        atomic64_read(&blkg->delay_nsec));
        }

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                if (!blkg->pd[i] || !pol->pd_stat_fn)
                        continue;

                pol->pd_stat_fn(blkg->pd[i], s);
        }

        seq_puts(s, "\n");
}

static int blkcg_print_stat(struct seq_file *sf, void *v)
{
        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
        struct blkcg_gq *blkg;

        if (!seq_css(sf)->parent)
                blkcg_fill_root_iostats();
        else
                cgroup_rstat_flush(blkcg->css.cgroup);

        rcu_read_lock();
        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
                spin_lock_irq(&blkg->q->queue_lock);
                blkcg_print_one_stat(blkg, sf);
                spin_unlock_irq(&blkg->q->queue_lock);
        }
        rcu_read_unlock();
        return 0;
}

/*
 * Interface files registered as io_cgrp_subsys.dfl_cftypes: a single "stat"
 * file whose contents are produced by blkcg_print_stat().
 */
static struct cftype blkcg_files[] = {
        {
                .name = "stat",
                .seq_show = blkcg_print_stat,
        },
        { }        /* terminate */
};

/*
 * Interface files registered as io_cgrp_subsys.legacy_cftypes: a single
 * "reset_stats" file whose u64 writes are handled by blkcg_reset_stats().
 */
static struct cftype blkcg_legacy_files[] = {
        {
                .name = "reset_stats",
                .write_u64 = blkcg_reset_stats,
        },
        { }        /* terminate */
};

#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css)
{
        return &css_to_blkcg(css)->cgwb_list;
}
#endif

/*
 * blkcg destruction is a three-stage process.
 *
 * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
 *    which offlines writeback.  Here we tie the next stage of blkg destruction
 *    to the completion of writeback associated with the blkcg.  This lets us
 *    avoid punting potentially large amounts of outstanding writeback to root
 *    while maintaining any ongoing policies.  The next stage is triggered when
 *    the nr_cgwbs count goes to zero.
 *
 * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called
 *    and handles the destruction of blkgs.  Here the css reference held by
 *    the blkg is put back eventually allowing blkcg_css_free() to be called.
 *    This work may occur in cgwb_release_workfn() on the cgwb_release
 *    workqueue.  Any submitted ios that fail to get the blkg ref will be
 *    punted to the root_blkg.
 *
 * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
 *    This finally frees the blkcg.
 */

/**
 * blkcg_destroy_blkgs - responsible for shooting down blkgs
 * @blkcg: blkcg of interest
 *
 * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
 * is nested inside q lock, this function performs reverse double lock dancing.
 * Destroying the blkgs releases the reference held on the blkcg's css allowing
 * blkcg_css_free to eventually be called.
 *
 * The blkcg lock is taken first and the queue lock is only ever trylock'ed;
 * when the trylock fails, or a reschedule is due, the blkcg lock is dropped
 * and the same list head is retried.  May sleep.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
        might_sleep();

        spin_lock_irq(&blkcg->lock);

        /* always work on the current first entry until the list is empty */
        while (!hlist_empty(&blkcg->blkg_list)) {
                struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
                                                struct blkcg_gq, blkcg_node);
                struct request_queue *q = blkg->q;

                if (need_resched() || !spin_trylock(&q->queue_lock)) {
                        /*
                         * Given that the system can accumulate a huge number
                         * of blkgs in pathological cases, check to see if we
                         * need to reschedule to avoid a softlockup.
                         */
                        spin_unlock_irq(&blkcg->lock);
                        cond_resched();
                        spin_lock_irq(&blkcg->lock);
                        continue;
                }

                /* both locks held: blkcg->lock (outer here) and q->queue_lock */
                blkg_destroy(blkg);
                spin_unlock(&q->queue_lock);
        }

        spin_unlock_irq(&blkcg->lock);
}

/**
 * blkcg_pin_online - pin online state
 * @blkcg_css: blkcg of interest
 *
 * While pinned, a blkcg is kept online.  This is primarily used to
 * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline
 * while an associated cgwb is still active.
 */
void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css)
{
        refcount_inc(&css_to_blkcg(blkcg_css)->online_pin);
}

/**
 * blkcg_unpin_online - unpin online state
 * @blkcg_css: blkcg of interest
 *
 * This is primarily used to impedance-match blkg and cgwb lifetimes so
 * that blkg doesn't go offline while an associated cgwb is still active.
 * When this count goes to zero, all active cgwbs have finished so the
 * blkcg can continue destruction by calling blkcg_destroy_blkgs().
 *
 * Each blkcg pins its parent when it comes online (see blkcg_css_online()),
 * so after a blkcg's last pin is dropped the walk continues with the parent
 * and stops at the first ancestor that still has pins left.
 */
void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css)
{
        struct blkcg *blkcg = css_to_blkcg(blkcg_css);

        do {
                struct blkcg *parent;

                if (!refcount_dec_and_test(&blkcg->online_pin))
                        break;

                /*
                 * Look up the parent before destroying the blkgs: that
                 * releases the css references the blkgs hold, after which
                 * @blkcg may be freed and must not be dereferenced.
                 */
                parent = blkcg_parent(blkcg);
                blkcg_destroy_blkgs(blkcg);
                blkcg = parent;
        } while (blkcg);
}

/**
 * blkcg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away.  Here the cgwbs are
 * offlined first and only once writeback associated with the blkcg has
 * finished do we start step 2 (see above), i.e. blkcg_destroy_blkgs() run
 * from blkcg_unpin_online() when the last online pin is dropped.
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
        /* this prevents anyone from attaching or migrating to this blkcg */
        wb_blkcg_offline(css);

        /* put the base online pin allowing step 2 to be triggered */
        blkcg_unpin_online(css);
}

static void blkcg_css_free(struct cgroup_subsys_state *css)
{
        struct blkcg *blkcg = css_to_blkcg(css);
        int i;

        mutex_lock(&blkcg_pol_mutex);

        list_del(&blkcg->all_blkcgs_node);

        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (blkcg->cpd[i])
                        blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);

        mutex_unlock(&blkcg_pol_mutex);

        free_percpu(blkcg->lhead);
        kfree(blkcg);
}

/*
 * css_alloc callback: set up a blkcg and return its css.
 *
 * Without a @parent_css the statically allocated blkcg_root is used;
 * otherwise a zeroed blkcg is allocated.  Per-policy data is allocated for
 * every registered policy that provides cpd_alloc_fn, and the blkcg is added
 * to all_blkcgs.  Everything runs under blkcg_pol_mutex.
 *
 * Returns the new css, or ERR_PTR(-ENOMEM) if any allocation fails.
 */
static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct blkcg *blkcg;
        int i;

        mutex_lock(&blkcg_pol_mutex);

        if (!parent_css) {
                blkcg = &blkcg_root;
        } else {
                blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
                if (!blkcg)
                        goto unlock;
        }

        /* presumably allocates blkcg->lhead (freed on the error path) — confirm */
        if (init_blkcg_llists(blkcg))
                goto free_blkcg;

        for (i = 0; i < BLKCG_MAX_POLS ; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkcg_policy_data *cpd;

                /*
                 * If the policy hasn't been attached yet, wait for it
                 * to be attached before doing anything else. Otherwise,
                 * check if the policy requires any specific per-cgroup
                 * data: if it does, allocate and initialize it.
                 */
                if (!pol || !pol->cpd_alloc_fn)
                        continue;

                cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                if (!cpd)
                        goto free_pd_blkcg;

                blkcg->cpd[i] = cpd;
                cpd->blkcg = blkcg;
                cpd->plid = i;
        }

        spin_lock_init(&blkcg->lock);
        /* the base pin; dropped by blkcg_css_offline() */
        refcount_set(&blkcg->online_pin, 1);
        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
        INIT_LIST_HEAD(&blkcg->cgwb_list);
#endif
        list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

        mutex_unlock(&blkcg_pol_mutex);
        return &blkcg->css;

free_pd_blkcg:
        /* slot i failed; release the cpd's installed in the slots below it */
        for (i--; i >= 0; i--)
                if (blkcg->cpd[i])
                        blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
        free_percpu(blkcg->lhead);
free_blkcg:
        /* blkcg_root is static and must never be kfree'd */
        if (blkcg != &blkcg_root)
                kfree(blkcg);
unlock:
        mutex_unlock(&blkcg_pol_mutex);
        return ERR_PTR(-ENOMEM);
}

/*
 * css_online callback.  Always returns 0.
 *
 * blkcg_pin_online() exists to delay blkcg offline so that blkgs don't go
 * offline while cgwbs are still active on them.  A blkcg with a parent pins
 * that parent here, which makes offlining always progress towards the root.
 */
static int blkcg_css_online(struct cgroup_subsys_state *css)
{
        struct blkcg *blkcg = css_to_blkcg(css);
        struct blkcg *parent = blkcg_parent(blkcg);

        /* nothing to pin when there is no parent */
        if (!parent)
                return 0;

        blkcg_pin_online(&parent->css);
        return 0;
}

/*
 * Initialize the blkcg-related members of @q: the (empty) list of blkgs and
 * the blkcg mutex.
 */
void blkg_init_queue(struct request_queue *q)
{
        INIT_LIST_HEAD(&q->blkg_list);
        mutex_init(&q->blkcg_mutex);
}

/*
 * Set up blkcg state for @disk: create the blkg that ties blkcg_root to the
 * disk, publish it as q->root_blkg, then call blk_ioprio_init().
 *
 * Returns 0 on success, -ENOMEM if the blkg cannot be allocated, or the
 * error reported by blkg_create() / blk_ioprio_init().
 */
int blkcg_init_disk(struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        struct blkcg_gq *new_blkg, *blkg;
        bool preloaded;
        int ret;

        new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
        if (!new_blkg)
                return -ENOMEM;

        /* a failed preload is tolerated; it only decides whether to end it */
        preloaded = !radix_tree_preload(GFP_KERNEL);

        /* Make sure the root blkg exists. */
        /* spin_lock_irq can serve as RCU read-side critical section. */
        spin_lock_irq(&q->queue_lock);
        blkg = blkg_create(&blkcg_root, disk, new_blkg);
        if (IS_ERR(blkg))
                goto err_unlock;
        q->root_blkg = blkg;
        spin_unlock_irq(&q->queue_lock);

        if (preloaded)
                radix_tree_preload_end();

        ret = blk_ioprio_init(disk);
        if (ret)
                goto err_destroy_all;

        return 0;

err_destroy_all:
        blkg_destroy_all(disk);
        return ret;
err_unlock:
        /*
         * NOTE(review): new_blkg is not freed here; presumably blkg_create()
         * disposes of it on failure — confirm.
         */
        spin_unlock_irq(&q->queue_lock);
        if (preloaded)
                radix_tree_preload_end();
        return PTR_ERR(blkg);
}

/* Tear down blkcg state for @disk: destroy all its blkgs, then exit throttling. */
void blkcg_exit_disk(struct gendisk *disk)
{
        blkg_destroy_all(disk);
        blk_throtl_exit(disk);
}

/*
 * exit callback of the io controller: drop the reference @tsk may still hold
 * on a disk it was scheduled to be throttled on, and clear the pointer.
 */
static void blkcg_exit(struct task_struct *tsk)
{
        struct gendisk *disk = tsk->throttle_disk;

        if (disk)
                put_disk(disk);
        tsk->throttle_disk = NULL;
}

/*
 * The io cgroup controller ("blkio" on the legacy hierarchy): wires the
 * blkcg css lifecycle callbacks, the rstat flush hook, the interface files
 * and the per-task exit hook into the cgroup core.
 */
struct cgroup_subsys io_cgrp_subsys = {
        .css_alloc = blkcg_css_alloc,
        .css_online = blkcg_css_online,
        .css_offline = blkcg_css_offline,
        .css_free = blkcg_css_free,
        .css_rstat_flush = blkcg_rstat_flush,
        .dfl_cftypes = blkcg_files,
        .legacy_cftypes = blkcg_legacy_files,
        .legacy_name = "blkio",
        .exit = blkcg_exit,
#ifdef CONFIG_MEMCG
        /*
         * This ensures that, if available, memcg is automatically enabled
         * together on the default hierarchy so that the owner cgroup can
         * be retrieved from writeback pages.
         */
        .depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a gendisk
 * @disk: gendisk of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @disk.  Requires %GFP_KERNEL context.  @disk goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @disk bypassed, so nobody would be accessing blkgs
 * from IO path.  Update of each blkg is protected by both queue and blkcg
 * locks so that holding either lock and testing blkcg_policy_enabled() is
 * always enough for dereferencing policy data.
 *
 * NOTE(review): the code below freezes mq queues for the duration rather than
 * entering a "bypass mode"; the wording above may be stale.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registerations.  Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
{
        struct request_queue *q = disk->queue;
        struct blkg_policy_data *pd_prealloc = NULL;
        struct blkcg_gq *blkg, *pinned_blkg = NULL;
        int ret;

        /* already active on this queue: nothing to do */
        if (blkcg_policy_enabled(q, pol))
                return 0;

        if (queue_is_mq(q))
                blk_mq_freeze_queue(q);
retry:
        spin_lock_irq(&q->queue_lock);

        /* blkg_list is pushed at the head, reverse walk to initialize parents first */
        list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
                struct blkg_policy_data *pd;

                /* populated on an earlier pass before a retry */
                if (blkg->pd[pol->plid])
                        continue;

                /* If prealloc matches, use it; otherwise try GFP_NOWAIT */
                if (blkg == pinned_blkg) {
                        pd = pd_prealloc;
                        pd_prealloc = NULL;
                } else {
                        pd = pol->pd_alloc_fn(disk, blkg->blkcg,
                                              GFP_NOWAIT | __GFP_NOWARN);
                }

                if (!pd) {
                        /*
                         * GFP_NOWAIT failed.  Free the existing one and
                         * prealloc for @blkg w/ GFP_KERNEL.
                         */
                        if (pinned_blkg)
                                blkg_put(pinned_blkg);
                        /* hold @blkg across the unlocked GFP_KERNEL allocation */
                        blkg_get(blkg);
                        pinned_blkg = blkg;

                        spin_unlock_irq(&q->queue_lock);

                        if (pd_prealloc)
                                pol->pd_free_fn(pd_prealloc);
                        pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg,
                                                       GFP_KERNEL);
                        if (pd_prealloc)
                                goto retry;
                        else
                                goto enomem;
                }

                spin_lock(&blkg->blkcg->lock);

                pd->blkg = blkg;
                pd->plid = pol->plid;
                blkg->pd[pol->plid] = pd;

                if (pol->pd_init_fn)
                        pol->pd_init_fn(pd);

                if (pol->pd_online_fn)
                        pol->pd_online_fn(pd);
                pd->online = true;

                spin_unlock(&blkg->blkcg->lock);
        }

        /* every blkg has policy data now; mark the policy enabled on @q */
        __set_bit(pol->plid, q->blkcg_pols);
        ret = 0;

        spin_unlock_irq(&q->queue_lock);
out:
        /* common exit: unfreeze and release the leftover pin/preallocation */
        if (queue_is_mq(q))
                blk_mq_unfreeze_queue(q);
        if (pinned_blkg)
                blkg_put(pinned_blkg);
        if (pd_prealloc)
                pol->pd_free_fn(pd_prealloc);
        return ret;

enomem:
        /* alloc failed, take down everything */
        spin_lock_irq(&q->queue_lock);
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                struct blkcg *blkcg = blkg->blkcg;
                struct blkg_policy_data *pd;

                spin_lock(&blkcg->lock);
                pd = blkg->pd[pol->plid];
                if (pd) {
                        if (pd->online && pol->pd_offline_fn)
                                pol->pd_offline_fn(pd);
                        pd->online = false;
                        pol->pd_free_fn(pd);
                        blkg->pd[pol->plid] = NULL;
                }
                spin_unlock(&blkcg->lock);
        }
        spin_unlock_irq(&q->queue_lock);
        ret = -ENOMEM;
        goto out;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk
 * @disk: gendisk of interest
 * @pol: blkcg policy to deactivate
 *
 * Turn @pol off on @disk: clear its bit in the queue's policy mask and, for
 * every blkg of the queue, offline and free the policy data belonging to
 * @pol.  The synchronization rules are those of blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct gendisk *disk,
                             const struct blkcg_policy *pol)
{
        struct request_queue *q = disk->queue;
        struct blkcg_gq *blkg;

        /* not active on this queue: nothing to undo */
        if (!blkcg_policy_enabled(q, pol))
                return;

        if (queue_is_mq(q))
                blk_mq_freeze_queue(q);

        mutex_lock(&q->blkcg_mutex);
        spin_lock_irq(&q->queue_lock);

        __clear_bit(pol->plid, q->blkcg_pols);

        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                struct blkcg *blkcg = blkg->blkcg;
                struct blkg_policy_data *pd;

                spin_lock(&blkcg->lock);
                pd = blkg->pd[pol->plid];
                if (pd) {
                        /* offline hook only runs for data that went online */
                        if (pd->online && pol->pd_offline_fn)
                                pol->pd_offline_fn(pd);
                        pol->pd_free_fn(pd);
                        blkg->pd[pol->plid] = NULL;
                }
                spin_unlock(&blkcg->lock);
        }

        spin_unlock_irq(&q->queue_lock);
        mutex_unlock(&q->blkcg_mutex);

        if (queue_is_mq(q))
                blk_mq_unfreeze_queue(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);

/*
 * Release the per-blkcg policy data of @pol on every blkcg in all_blkcgs and
 * clear the corresponding slots.  Uses @pol->cpd_free_fn unconditionally, so
 * callers check that it is set.
 */
static void blkcg_free_all_cpd(struct blkcg_policy *pol)
{
        struct blkcg *blkcg;

        list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
                struct blkcg_policy_data *cpd = blkcg->cpd[pol->plid];

                if (!cpd)
                        continue;

                pol->cpd_free_fn(cpd);
                blkcg->cpd[pol->plid] = NULL;
        }
}

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
        struct blkcg *blkcg;
        int i, ret;

        mutex_lock(&blkcg_pol_register_mutex);
        mutex_lock(&blkcg_pol_mutex);

        /* find an empty slot */
        ret = -ENOSPC;
        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (!blkcg_policy[i])
                        break;
        if (i >= BLKCG_MAX_POLS) {
                pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
                goto err_unlock;
        }

        /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
        if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
                (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
                goto err_unlock;

        /* register @pol */
        pol->plid = i;
        blkcg_policy[pol->plid] = pol;

        /* allocate and install cpd's */
        if (pol->cpd_alloc_fn) {
                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
                        struct blkcg_policy_data *cpd;

                        cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                        if (!cpd)
                                goto err_free_cpds;

                        blkcg->cpd[pol->plid] = cpd;
                        cpd->blkcg = blkcg;
                        cpd->plid = pol->plid;
                }
        }

        mutex_unlock(&blkcg_pol_mutex);

        /* everything is in place, add intf files for the new policy */
        if (pol->dfl_cftypes)
                WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
                                               pol->dfl_cftypes));
        if (pol->legacy_cftypes)
                WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
                                                  pol->legacy_cftypes));
        mutex_unlock(&blkcg_pol_register_mutex);
        return 0;

err_free_cpds:
        if (pol->cpd_free_fn)
                blkcg_free_all_cpd(pol);

        blkcg_policy[pol->plid] = NULL;
err_unlock:
        mutex_unlock(&blkcg_pol_mutex);
        mutex_unlock(&blkcg_pol_register_mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);

/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.
 *
 * Removes the policy's interface files, frees its per-blkcg policy data and
 * clears its slot in blkcg_policy[].  Warns and does nothing if @pol is not
 * the policy currently registered in its slot.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
        mutex_lock(&blkcg_pol_register_mutex);

        /* @pol must own the slot its plid points at */
        if (WARN_ON(blkcg_policy[pol->plid] != pol))
                goto out_unlock;

        /* kill the intf files first */
        if (pol->dfl_cftypes)
                cgroup_rm_cftypes(pol->dfl_cftypes);
        if (pol->legacy_cftypes)
                cgroup_rm_cftypes(pol->legacy_cftypes);

        /* remove cpds and unregister */
        mutex_lock(&blkcg_pol_mutex);

        if (pol->cpd_free_fn)
                blkcg_free_all_cpd(pol);

        blkcg_policy[pol->plid] = NULL;

        mutex_unlock(&blkcg_pol_mutex);
out_unlock:
        mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

/*
 * Scale the accumulated delay based on how long it has been since we updated
 * the delay.  We only call this when we are adding delay, in case it's been a
 * while since we added delay, and when we are checking to see if we need to
 * delay a task, to account for any delays that may have occurred.
 *
 * @blkg: blkg whose delay_nsec is scaled down
 * @now:  current time in nanoseconds, compared against blkg->delay_start
 */
static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
{
        u64 old = atomic64_read(&blkg->delay_start);

        /* negative use_delay means no scaling, see blkcg_set_delay() */
        if (atomic_read(&blkg->use_delay) < 0)
                return;

        /*
         * We only want to scale down every second.  The idea here is that we
         * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
         * time window.  We only want to throttle tasks for recent delay that
         * has occurred, in 1 second time windows since that's the maximum
         * things can be throttled.  We save the current delay window in
         * blkg->last_delay so we know what amount is still left to be charged
         * to the blkg from this point onward.  blkg->last_use keeps track of
         * the use_delay counter.  The idea is if we're unthrottling the blkg we
         * are ok with whatever is happening now, and we can take away more of
         * the accumulated delay as we've already throttled enough that
         * everybody is happy with their IO latencies.
         *
         * The cmpxchg on delay_start lets only the caller that advances the
         * window from @old to @now perform the scaling for that window.
         */
        if (time_before64(old + NSEC_PER_SEC, now) &&
            atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) {
                u64 cur = atomic64_read(&blkg->delay_nsec);
                u64 sub = min_t(u64, blkg->last_delay, now - old);
                int cur_use = atomic_read(&blkg->use_delay);

                /*
                 * We've been unthrottled, subtract a larger chunk of our
                 * accumulated delay.
                 */
                if (cur_use < blkg->last_use)
                        sub = max_t(u64, sub, blkg->last_delay >> 1);

                /*
                 * This shouldn't happen, but handle it anyway.  Our delay_nsec
                 * should only ever be growing except here where we subtract out
                 * min(last_delay, 1 second), but lord knows bugs happen and I'd
                 * rather not end up with negative numbers.
                 */
                if (unlikely(cur < sub)) {
                        atomic64_set(&blkg->delay_nsec, 0);
                        blkg->last_delay = 0;
                } else {
                        atomic64_sub(sub, &blkg->delay_nsec);
                        blkg->last_delay = cur - sub;
                }
                blkg->last_use = cur_use;
        }
}

/*
 * This is called when we want to actually walk up the hierarchy and check to
 * see if we need to throttle, and then actually throttle if there is some
 * accumulated delay.  This should only be called upon return to user space so
 * we're not holding some lock that would induce a priority inversion.
 *
 * @blkg:         blkg to start the upward walk from
 * @use_memdelay: when true, the sleep is bracketed by psi_memstall_enter()
 *                and psi_memstall_leave()
 */
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
        unsigned long pflags;
        bool clamp;
        u64 now = blk_time_get_ns();
        u64 exp;
        u64 delay_nsec = 0;
        int tok;

        /*
         * Find the largest delay among @blkg and its ancestors; the walk
         * stops at the blkg without a parent, which is not examined.
         */
        while (blkg->parent) {
                int use_delay = atomic_read(&blkg->use_delay);

                if (use_delay) {
                        u64 this_delay;

                        blkcg_scale_delay(blkg, now);
                        this_delay = atomic64_read(&blkg->delay_nsec);
                        if (this_delay > delay_nsec) {
                                delay_nsec = this_delay;
                                /* set together with every non-zero delay_nsec */
                                clamp = use_delay > 0;
                        }
                }
                blkg = blkg->parent;
        }

        /* also guarantees that clamp was assigned before it is read below */
        if (!delay_nsec)
                return;

        /*
         * Let's not sleep for all eternity if we've amassed a huge delay.
         * Swapping or metadata IO can accumulate 10's of seconds worth of
         * delay, and we want userspace to be able to do _something_ so cap the
         * delays at 0.25s. If there's 10's of seconds worth of delay then the
         * tasks will be delayed for 0.25 second for every syscall. If
         * blkcg_set_delay() was used as indicated by negative use_delay, the
         * caller is responsible for regulating the range.
         */
        if (clamp)
                delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);

        if (use_memdelay)
                psi_memstall_enter(&pflags);

        /* sleep until the absolute deadline; only a fatal signal cuts it short */
        exp = ktime_add_ns(now, delay_nsec);
        tok = io_schedule_prepare();
        do {
                __set_current_state(TASK_KILLABLE);
                if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
                        break;
        } while (!fatal_signal_pending(current));
        io_schedule_finish(tok);

        if (use_memdelay)
                psi_memstall_leave(&pflags);
}

/**
 * blkcg_maybe_throttle_current - throttle the current task if it has been marked
 *
 * This is only called if we've been marked with set_notify_resume().  Obviously
 * we can be set_notify_resume() for reasons other than blkcg throttling, so we
 * check to see if current->throttle_disk is set and if not this doesn't do
 * anything.  This should only ever be called by the resume code, it's not meant
 * to be called by people willy-nilly as it will actually do the work to
 * throttle the task if it is setup for throttling.
 *
 * The disk reference taken by blkcg_schedule_throttle() is consumed here: it
 * is dropped on every path once current->throttle_disk has been cleared,
 * whether or not a blkg was found to throttle on.
 */
void blkcg_maybe_throttle_current(void)
{
        struct gendisk *disk = current->throttle_disk;
        struct blkcg *blkcg;
        struct blkcg_gq *blkg;
        bool use_memdelay = current->use_memdelay;

        if (!disk)
                return;

        /* we own the disk reference from here on */
        current->throttle_disk = NULL;
        current->use_memdelay = false;

        rcu_read_lock();
        blkcg = css_to_blkcg(blkcg_css());
        if (!blkcg)
                goto out;
        blkg = blkg_lookup(blkcg, disk->queue);
        if (!blkg)
                goto out;
        if (!blkg_tryget(blkg))
                goto out;
        rcu_read_unlock();

        blkcg_maybe_throttle_blkg(blkg, use_memdelay);
        blkg_put(blkg);
        put_disk(disk);
        return;
out:
        rcu_read_unlock();
        /* nothing to throttle on, but the disk reference must still go */
        put_disk(disk);
}

/**
 * blkcg_schedule_throttle - this task needs to check for throttling
 * @disk: disk to throttle
 * @use_memdelay: do we charge this to memory delay for PSI
 *
 * Called by the IO controller when it knows that delay has accumulated for
 * this task's blkg.  The blkg itself is not passed because some callers do
 * not have it; the swapping code, for instance, only has a block_device at
 * that point.  This sets notify_resume for the task so that it checks whether
 * it needs throttling before returning to user space.
 *
 * Scheduling happens at most once per syscall.  The function may be called
 * repeatedly; the check is still done only once on return to user space and
 * the task is throttled only once.  To throttle again, the task has to be
 * marked again the next time it is seen.
 *
 * Kernel threads are never marked, and a disk flagged GD_DEAD is not newly
 * recorded.
 */
void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay)
{
        struct gendisk *prev;

        if (unlikely(current->flags & PF_KTHREAD))
                return;

        prev = current->throttle_disk;
        if (prev != disk) {
                if (test_bit(GD_DEAD, &disk->state))
                        return;

                /* pin the new disk before letting go of the previous one */
                get_device(disk_to_dev(disk));
                if (prev)
                        put_disk(prev);
                current->throttle_disk = disk;
        }

        /* only ever raised here; cleared when the throttle check runs */
        if (use_memdelay)
                current->use_memdelay = use_memdelay;
        set_notify_resume(current);
}

/**
 * blkcg_add_delay - add delay to this blkg
 * @blkg: blkg of interest
 * @now: the current time in nanoseconds
 * @delta: how many nanoseconds of delay to add
 *
 * Charge @delta to the blkg's current delay accumulation.  This is used to
 * throttle tasks if an IO controller thinks we need more throttling.
 *
 * Warns once and charges nothing if @blkg's use_delay is negative.
 */
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
        /* a negative use_delay marks a blkg managed via blkcg_set_delay() */
        if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
                return;
        /* age the existing delay first, then add the new charge */
        blkcg_scale_delay(blkg, now);
        atomic64_add(delta, &blkg->delay_nsec);
}

/**
 * blkg_tryget_closest - try and get a blkg ref on the closest blkg
 * @bio: target bio
 * @css: target css
 *
 * Looks up (creating if needed) the blkg for @css on @bio's disk and tries to
 * take a reference on it.  When that fails the walk moves up through
 * blkg->parent, so this ensures the parent pointers are always valid.
 * Returns the blkg a reference was taken on, or %NULL if none was taken.
 */
static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
                struct cgroup_subsys_state *css)
{
        struct blkcg_gq *blkg;

        rcu_read_lock();
        for (blkg = blkg_lookup_create(css_to_blkcg(css),
                                       bio->bi_bdev->bd_disk);
             blkg; blkg = blkg->parent) {
                /* first blkg that still hands out references wins */
                if (blkg_tryget(blkg))
                        break;
        }
        rcu_read_unlock();

        /* NULL here means the walk ran off the top without a reference */
        return blkg;
}

/**
 * bio_associate_blkg_from_css - associate a bio with a specified css
 * @bio: target bio
 * @css: target css
 *
 * Associate @bio with the blkg found by combining the css's blkg and the
 * request_queue of the @bio.  An association failure is handled by walking up
 * the blkg tree, so the blkg that ends up associated can be anything between
 * the css's own blkg and q->root_blkg.  When @css is %NULL or has no parent,
 * the queue's root_blkg is used directly.
 *
 * Any blkg previously attached to @bio is released first.  A reference is
 * taken on the new blkg and will be released when @bio is freed.
 */
void bio_associate_blkg_from_css(struct bio *bio,
                                 struct cgroup_subsys_state *css)
{
        struct blkcg_gq *root;

        /* Drop the association the bio may already carry. */
        if (bio->bi_blkg)
                blkg_put(bio->bi_blkg);

        if (css && css->parent) {
                bio->bi_blkg = blkg_tryget_closest(bio, css);
                return;
        }

        /* No css, or a css without parent: fall back to the root blkg. */
        root = bdev_get_queue(bio->bi_bdev)->root_blkg;
        blkg_get(root);
        bio->bi_blkg = root;
}
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);

/**
 * bio_associate_blkg - associate a bio with a blkg
 * @bio: target bio
 *
 * Associate @bio with the blkg found from the bio's css and request_queue.
 * If a blkg is already associated, its css is reused and the association is
 * redone as the request_queue may have changed; otherwise the css returned by
 * blkcg_css() is used.  Passthrough operations are left untouched.
 */
void bio_associate_blkg(struct bio *bio)
{
        if (blk_op_is_passthrough(bio->bi_opf))
                return;

        rcu_read_lock();
        bio_associate_blkg_from_css(bio, bio->bi_blkg ? bio_blkcg_css(bio)
                                                      : blkcg_css());
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(bio_associate_blkg);

/**
 * bio_clone_blkg_association - clone blkg association from src to dst bio
 * @dst: destination bio
 * @src: source bio
 *
 * Does nothing when @src carries no blkg; otherwise @dst is associated using
 * the css of @src.
 */
void bio_clone_blkg_association(struct bio *dst, struct bio *src)
{
        if (!src->bi_blkg)
                return;

        bio_associate_blkg_from_css(dst, bio_blkcg_css(src));
}
EXPORT_SYMBOL_GPL(bio_clone_blkg_association);

/*
 * Map the operation of @bio to a BLKG_IOSTAT_* index.  Discard is tested
 * before write; anything that is neither is counted as a read.
 */
static int blk_cgroup_io_type(struct bio *bio)
{
        int type = BLKG_IOSTAT_READ;

        if (op_is_discard(bio->bi_opf))
                type = BLKG_IOSTAT_DISCARD;
        else if (op_is_write(bio->bi_opf))
                type = BLKG_IOSTAT_WRITE;

        return type;
}

/*
 * Account @bio in the per-cpu iostat set of its blkg and notify cgroup rstat
 * that the owning cgroup has pending updates on this cpu.
 *
 * Nothing is recorded unless the io controller is on the default hierarchy,
 * and nothing is recorded for a blkcg whose cgroup has no parent.
 */
void blk_cgroup_bio_start(struct bio *bio)
{
        struct blkcg *blkcg = bio->bi_blkg->blkcg;
        int rwd = blk_cgroup_io_type(bio), cpu;
        struct blkg_iostat_set *bis;
        unsigned long flags;

        if (!cgroup_subsys_on_dfl(io_cgrp_subsys))
                return;

        /* Root-level stats are sourced from system-wide IO stats */
        if (!cgroup_parent(blkcg->css.cgroup))
                return;

        /*
         * The cpu id obtained here selects the per-cpu stats and is also
         * passed to cgroup_rstat_updated() below; it is released by put_cpu().
         */
        cpu = get_cpu();
        bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
        flags = u64_stats_update_begin_irqsave(&bis->sync);

        /*
         * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
         * bio and we would have already accounted for the size of the bio.
         */
        if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
                bio_set_flag(bio, BIO_CGROUP_ACCT);
                bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
        }
        /* The io count, unlike the byte count, is bumped on every call. */
        bis->cur.ios[rwd]++;

        /*
         * If the iostat_cpu isn't in a lockless list, put it into the
         * list to indicate that a stat update is pending.
         */
        if (!READ_ONCE(bis->lqueued)) {
                struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);

                llist_add(&bis->lnode, lhead);
                WRITE_ONCE(bis->lqueued, true);
        }

        u64_stats_update_end_irqrestore(&bis->sync, flags);
        cgroup_rstat_updated(blkcg->css.cgroup, cpu);
        put_cpu();
}

/*
 * Report whether the css returned by blkcg_css(), or any of its ancestors,
 * belongs to a cgroup with a non-zero congestion_count.  The walk is done
 * under rcu_read_lock().
 */
bool blk_cgroup_congested(void)
{
        struct cgroup_subsys_state *pos;
        bool congested = false;

        rcu_read_lock();
        pos = blkcg_css();
        while (pos && !congested) {
                if (atomic_read(&pos->cgroup->congestion_count))
                        congested = true;
                else
                        pos = pos->parent;
        }
        rcu_read_unlock();
        return congested;
}

/* Expose blkcg_debug_stats as a bool module parameter with mode 0644. */
module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
































































    1 














    1 
    1 

    1 







    1 













    3 














    4 
    4 

    4 




    3 


    3 











   13 



   12 






   11 




    9 

   14 



    2 
    4 



    1 






































    4 





    5 


    1 


    4 

    3 
    3 
    1 
    4 




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright (c) 2021, Google LLC.
 * Pasha Tatashin <pasha.tatashin@soleen.com>
 */
#include <linux/kstrtox.h>
#include <linux/mm.h>
#include <linux/page_table_check.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#undef pr_fmt
#define pr_fmt(fmt)        "page_table_check: " fmt

/*
 * Per-page counters kept in the page_ext area (sized through
 * page_table_check_ops.size).  For any given page only one of the two is
 * expected to be non-zero; the set/clear helpers BUG() otherwise.
 */
struct page_table_check {
        /* adjusted by the set/clear helpers when PageAnon() is true */
        atomic_t anon_map_count;
        /* adjusted by the set/clear helpers when PageAnon() is false */
        atomic_t file_map_count;
};

/*
 * Boot-time switch.  Defaults to CONFIG_PAGE_TABLE_CHECK_ENFORCED and can be
 * overridden through the "page_table_check" early parameter.
 */
static bool __page_table_check_enabled __initdata =
                                IS_ENABLED(CONFIG_PAGE_TABLE_CHECK_ENFORCED);

/*
 * Static key that starts out true ("disabled") and is turned off by
 * init_page_table_check() when checking was requested.
 */
DEFINE_STATIC_KEY_TRUE(page_table_check_disabled);
EXPORT_SYMBOL(page_table_check_disabled);

/*
 * Handler for the "page_table_check" early parameter: parse @buf as a boolean
 * into __page_table_check_enabled and return kstrtobool()'s result.
 */
static int __init early_page_table_check_param(char *buf)
{
        return kstrtobool(buf, &__page_table_check_enabled);
}

early_param("page_table_check", early_page_table_check_param);

/*
 * page_ext ".need" callback (see page_table_check_ops): reports whether
 * checking was enabled at boot.
 */
static bool __init need_page_table_check(void)
{
        return __page_table_check_enabled;
}

/*
 * page_ext ".init" callback (see page_table_check_ops): when checking was
 * requested, turn off the page_table_check_disabled static key.
 */
static void __init init_page_table_check(void)
{
        if (__page_table_check_enabled)
                static_branch_disable(&page_table_check_disabled);
}

/*
 * page_ext registration for this feature: reserves room for one
 * struct page_table_check per page and wires up the need/init callbacks
 * defined above.
 */
struct page_ext_operations page_table_check_ops = {
        .size = sizeof(struct page_table_check),
        .need = need_page_table_check,
        .init = init_page_table_check,
        .need_shared_flags = false,
};

/*
 * Return the struct page_table_check stored inside @page_ext.  A NULL
 * @page_ext is a BUG().
 */
static struct page_table_check *get_page_table_check(struct page_ext *page_ext)
{
        struct page_table_check *ptc;

        BUG_ON(!page_ext);
        ptc = page_ext_data(page_ext, &page_table_check_ops);

        return ptc;
}

/*
 * An entry is removed from the page table, decrement the counters for that page
 * verify that it is of correct type and counters do not become negative.
 */
static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt)
{
        struct page_ext *page_ext;
        struct page *page;
        unsigned long i;
        bool anon;

        if (!pfn_valid(pfn))
                return;

        page = pfn_to_page(pfn);
        page_ext = page_ext_get(page);

        if (!page_ext)
                return;

        BUG_ON(PageSlab(page));
        anon = PageAnon(page);

        for (i = 0; i < pgcnt; i++) {
                struct page_table_check *ptc = get_page_table_check(page_ext);

                if (anon) {
                        BUG_ON(atomic_read(&ptc->file_map_count));
                        BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0);
                } else {
                        BUG_ON(atomic_read(&ptc->anon_map_count));
                        BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
                }
                page_ext = page_ext_next(page_ext);
        }
        page_ext_put(page_ext);
}

/*
 * Entries covering @pgcnt pages starting at @pfn are added to a page table:
 * increment the counter matching the page type for each page.  BUG() if the
 * counter of the other type is set, if an anonymous page ends up mapped more
 * than once while @rw is true, or if the file counter goes negative.
 * Invalid pfns and pages without page_ext are ignored.
 */
static void page_table_check_set(unsigned long pfn, unsigned long pgcnt,
                                 bool rw)
{
        struct page_ext *ext;
        struct page *first;
        unsigned long idx;
        bool is_anon;

        if (!pfn_valid(pfn))
                return;

        first = pfn_to_page(pfn);
        ext = page_ext_get(first);
        if (!ext)
                return;

        BUG_ON(PageSlab(first));
        is_anon = PageAnon(first);

        for (idx = 0; idx < pgcnt; idx++) {
                struct page_table_check *ptc = get_page_table_check(ext);

                if (!is_anon) {
                        BUG_ON(atomic_read(&ptc->anon_map_count));
                        BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0);
                } else {
                        int mapped;

                        BUG_ON(atomic_read(&ptc->file_map_count));
                        mapped = atomic_inc_return(&ptc->anon_map_count);
                        /* A writable anon mapping must be the only one. */
                        BUG_ON(mapped > 1 && rw);
                }
                ext = page_ext_next(ext);
        }
        page_ext_put(ext);
}

/*
 * The page is on the free list or is being allocated: verify that both
 * counters are zero for each of the 1 << @order pages and crash if they are
 * not.  Pages without page_ext are skipped.
 */
void __page_table_check_zero(struct page *page, unsigned int order)
{
        const unsigned long nr_pages = 1ul << order;
        struct page_ext *ext;
        unsigned long idx;

        BUG_ON(PageSlab(page));

        ext = page_ext_get(page);
        if (!ext)
                return;

        for (idx = 0; idx < nr_pages; idx++) {
                struct page_table_check *ptc = get_page_table_check(ext);

                BUG_ON(atomic_read(&ptc->anon_map_count));
                BUG_ON(atomic_read(&ptc->file_map_count));
                ext = page_ext_next(ext);
        }
        page_ext_put(ext);
}

/*
 * A PTE is being cleared in @mm: if it maps a user accessible page, drop one
 * page worth of tracking for it.  init_mm is never tracked.
 */
void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
{
        if (&init_mm == mm || !pte_user_accessible_page(pte))
                return;

        page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__page_table_check_pte_clear);

/*
 * A PMD is being cleared in @mm: if it maps user accessible memory, drop the
 * tracking for all PMD_SIZE >> PAGE_SHIFT pages it covers.  init_mm is never
 * tracked.
 */
void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
{
        if (&init_mm == mm || !pmd_user_accessible_page(pmd))
                return;

        page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__page_table_check_pmd_clear);

/*
 * A PUD is being cleared in @mm: if it maps user accessible memory, drop the
 * tracking for all PUD_SIZE >> PAGE_SHIFT pages it covers.  init_mm is never
 * tracked.
 */
void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
{
        if (&init_mm == mm || !pud_user_accessible_page(pud))
                return;

        page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__page_table_check_pud_clear);

/*
 * Whether the swap entry cached writable information, i.e. it is a writable
 * device-exclusive, device-private or migration entry.
 */
static inline bool swap_cached_writable(swp_entry_t entry)
{
        if (is_writable_device_exclusive_entry(entry))
                return true;
        if (is_writable_device_private_entry(entry))
                return true;
        return is_writable_migration_entry(entry);
}

/*
 * Warn once if a uffd-wp marked PTE is also writable: either a present PTE
 * with the write bit set, or a swap PTE whose entry cached writable
 * information.
 */
static inline void page_table_check_pte_flags(pte_t pte)
{
        if (pte_present(pte) && pte_uffd_wp(pte)) {
                WARN_ON_ONCE(pte_write(pte));
                return;
        }

        if (is_swap_pte(pte) && pte_swp_uffd_wp(pte))
                WARN_ON_ONCE(swap_cached_writable(pte_to_swp_entry(pte)));
}

/*
 * @nr consecutive PTEs starting at @ptep are about to be set from @pte in
 * @mm: sanity check the flags of @pte, account the removal of whatever the
 * slots currently hold, then account the new mapping if it is user
 * accessible.  init_mm is never tracked.
 */
void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte,
                unsigned int nr)
{
        pte_t *slot, *end;

        if (&init_mm == mm)
                return;

        page_table_check_pte_flags(pte);

        /* Retire the entries that are about to be overwritten. */
        for (slot = ptep, end = ptep + nr; slot != end; slot++)
                __page_table_check_pte_clear(mm, ptep_get(slot));

        if (pte_user_accessible_page(pte))
                page_table_check_set(pte_pfn(pte), nr, pte_write(pte));
}
EXPORT_SYMBOL(__page_table_check_ptes_set);

/*
 * Warn once if a uffd-wp marked PMD is also writable: either a present PMD
 * with the write bit set, or a swap PMD whose entry cached writable
 * information.
 */
static inline void page_table_check_pmd_flags(pmd_t pmd)
{
        if (pmd_present(pmd) && pmd_uffd_wp(pmd)) {
                WARN_ON_ONCE(pmd_write(pmd));
                return;
        }

        if (is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd))
                WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd)));
}

/*
 * *@pmdp is about to be set to @pmd in @mm: sanity check the flags of @pmd,
 * account the removal of the current entry, then account the new mapping if
 * it is user accessible.  init_mm is never tracked.
 */
void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd)
{
        if (&init_mm == mm)
                return;

        page_table_check_pmd_flags(pmd);

        __page_table_check_pmd_clear(mm, *pmdp);
        if (!pmd_user_accessible_page(pmd))
                return;

        page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT,
                             pmd_write(pmd));
}
EXPORT_SYMBOL(__page_table_check_pmd_set);

/*
 * *@pudp is about to be set to @pud in @mm: account the removal of the
 * current entry, then account the new mapping if it is user accessible.
 * init_mm is never tracked.
 */
void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud)
{
        if (&init_mm == mm)
                return;

        __page_table_check_pud_clear(mm, *pudp);
        if (!pud_user_accessible_page(pud))
                return;

        page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT,
                             pud_write(pud));
}
EXPORT_SYMBOL(__page_table_check_pud_set);

/*
 * Account the removal of all PTRS_PER_PTE entries of the PTE table that @pmd
 * points to, mapped starting from @addr.  Nothing is done for init_mm, for a
 * bad PMD or for a leaf PMD; a failure to map the table triggers a WARN.
 */
void __page_table_check_pte_clear_range(struct mm_struct *mm,
                                        unsigned long addr,
                                        pmd_t pmd)
{
        pte_t *base;
        unsigned long idx;

        if (&init_mm == mm)
                return;

        if (pmd_bad(pmd) || pmd_leaf(pmd))
                return;

        base = pte_offset_map(&pmd, addr);
        if (WARN_ON(!base))
                return;

        for (idx = 0; idx < PTRS_PER_PTE; idx++)
                __page_table_check_pte_clear(mm, ptep_get(base + idx));

        pte_unmap(base);
}





























































































































































































































































































































































































































    1 









    1 





















































    1 




























































































    1 






































    1 










    1 

    1 












    1 







    1 


    1 




















































































    1 











































































































































































































































































































































































    1 
    1 


    1 






































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
// SPDX-License-Identifier: GPL-2.0
/*
 * NETLINK      Netlink attributes
 *
 *                 Authors:        Thomas Graf <tgraf@suug.ch>
 *                                 Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/nospec.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>
#include <net/netlink.h>

/* For these data types, attribute length should be exactly the given
 * size. However, to maintain compatibility with broken commands, if the
 * attribute length does not match the expected size a warning is emitted
 * to the user that the command is sending invalid data and needs to be fixed.
 *
 * The table is indexed by NLA_* policy type; types that are not listed have
 * a zero entry, for which validate_nla() skips the exact-length comparison.
 */
static const u8 nla_attr_len[NLA_TYPE_MAX+1] = {
        [NLA_U8]        = sizeof(u8),
        [NLA_U16]        = sizeof(u16),
        [NLA_U32]        = sizeof(u32),
        [NLA_U64]        = sizeof(u64),
        [NLA_S8]        = sizeof(s8),
        [NLA_S16]        = sizeof(s16),
        [NLA_S32]        = sizeof(s32),
        [NLA_S64]        = sizeof(s64),
        [NLA_BE16]        = sizeof(__be16),
        [NLA_BE32]        = sizeof(__be32),
};

/*
 * Per-type lengths indexed by NLA_* policy type; unlisted types are zero.
 * NOTE(review): presumably the minimum payload length accepted for each type
 * (the users of this table are outside this excerpt) -- confirm against
 * validate_nla().
 */
static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = {
        [NLA_U8]        = sizeof(u8),
        [NLA_U16]        = sizeof(u16),
        [NLA_U32]        = sizeof(u32),
        [NLA_U64]        = sizeof(u64),
        [NLA_MSECS]        = sizeof(u64),
        [NLA_NESTED]        = NLA_HDRLEN,
        [NLA_S8]        = sizeof(s8),
        [NLA_S16]        = sizeof(s16),
        [NLA_S32]        = sizeof(s32),
        [NLA_S64]        = sizeof(s64),
        [NLA_BE16]        = sizeof(__be16),
        [NLA_BE32]        = sizeof(__be32),
};

/*
 * Nested policies might refer back to the original
 * policy in some cases, and userspace could try to
 * abuse that and recurse by nesting in the right
 * ways. Limit recursion to avoid this problem.
 *
 * The current depth is threaded through the validation helpers as their
 * "depth" argument (nla_validate_array() passes depth + 1 when descending).
 */
#define MAX_POLICY_RECURSION_DEPTH        10

/*
 * Forward declaration: nla_validate_array() below calls this function before
 * its definition appears in the file.
 */
static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype,
                                const struct nla_policy *policy,
                                unsigned int validate,
                                struct netlink_ext_ack *extack,
                                struct nlattr **tb, unsigned int depth);

/*
 * Check the nla_bitfield32 payload of @nla against @valid_flags_mask.
 *
 * Returns 0 when the payload is acceptable and -EINVAL when the mask is
 * empty, when the selector or the value carries a bit outside the mask, or
 * when the value has a bit set that the selector does not select.
 */
static int validate_nla_bitfield32(const struct nlattr *nla,
                                   const u32 valid_flags_mask)
{
        const struct nla_bitfield32 *bf = nla_data(nla);

        if (!valid_flags_mask)
                return -EINVAL;

        /* neither the selector nor the value may use bits outside the mask */
        if ((bf->selector | bf->value) & ~valid_flags_mask)
                return -EINVAL;

        /* every bit set in the value must also be selected */
        if (bf->value & ~bf->selector)
                return -EINVAL;

        return 0;
}

/*
 * Validate every element of a nested array: each non-empty entry in the
 * stream at @head (@len bytes) is itself validated as a nested attribute
 * stream against @policy, one recursion level deeper.
 *
 * Empty entries are skipped.  Returns 0 on success, -ERANGE (with an extack
 * message) for an entry shorter than an attribute header, or the first
 * negative error returned by __nla_validate_parse().
 */
static int nla_validate_array(const struct nlattr *head, int len, int maxtype,
                              const struct nla_policy *policy,
                              struct netlink_ext_ack *extack,
                              unsigned int validate, unsigned int depth)
{
        const struct nlattr *elem;
        int remaining;

        nla_for_each_attr(elem, head, len, remaining) {
                const int elem_len = nla_len(elem);
                int err;

                if (!elem_len)
                        continue;

                if (elem_len < NLA_HDRLEN) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, elem, policy,
                                                "Array element too short");
                        return -ERANGE;
                }

                err = __nla_validate_parse(nla_data(elem), elem_len,
                                           maxtype, policy, validate, extack,
                                           NULL, depth + 1);
                if (err < 0)
                        return err;
        }

        return 0;
}

void nla_get_range_unsigned(const struct nla_policy *pt,
                            struct netlink_range_validation *range)
{
        WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR &&
                     (pt->min < 0 || pt->max < 0));

        range->min = 0;

        switch (pt->type) {
        case NLA_U8:
                range->max = U8_MAX;
                break;
        case NLA_U16:
        case NLA_BE16:
        case NLA_BINARY:
                range->max = U16_MAX;
                break;
        case NLA_U32:
        case NLA_BE32:
                range->max = U32_MAX;
                break;
        case NLA_U64:
        case NLA_UINT:
        case NLA_MSECS:
                range->max = U64_MAX;
                break;
        default:
                WARN_ON_ONCE(1);
                return;
        }

        switch (pt->validation_type) {
        case NLA_VALIDATE_RANGE:
        case NLA_VALIDATE_RANGE_WARN_TOO_LONG:
                range->min = pt->min;
                range->max = pt->max;
                break;
        case NLA_VALIDATE_RANGE_PTR:
                *range = *pt->range;
                break;
        case NLA_VALIDATE_MIN:
                range->min = pt->min;
                break;
        case NLA_VALIDATE_MAX:
                range->max = pt->max;
                break;
        default:
                break;
        }
}

/*
 * Check the unsigned value carried by @nla (or, for NLA_BINARY, its payload
 * length) against the range that @pt allows.
 *
 * Returns 0 when the value is in range, -ERANGE (with an extack message)
 * when it is not, and -EINVAL for an unsupported pt->type.  A binary
 * attribute under NLA_VALIDATE_RANGE_WARN_TOO_LONG that exceeds the maximum
 * only triggers a rate-limited warning, unless NL_VALIDATE_STRICT_ATTRS is
 * set in @validate, in which case -EINVAL is returned.
 */
static int nla_validate_range_unsigned(const struct nla_policy *pt,
                                       const struct nlattr *nla,
                                       struct netlink_ext_ack *extack,
                                       unsigned int validate)
{
        const bool is_binary = pt->type == NLA_BINARY;
        struct netlink_range_validation range;
        u64 value;

        switch (pt->type) {
        case NLA_U8:
                value = nla_get_u8(nla);
                break;
        case NLA_U16:
                value = nla_get_u16(nla);
                break;
        case NLA_U32:
                value = nla_get_u32(nla);
                break;
        case NLA_U64:
        case NLA_MSECS:
                value = nla_get_u64(nla);
                break;
        case NLA_UINT:
                value = nla_get_uint(nla);
                break;
        case NLA_BE16:
                value = ntohs(nla_get_be16(nla));
                break;
        case NLA_BE32:
                value = ntohl(nla_get_be32(nla));
                break;
        case NLA_BINARY:
                /* binary attributes are range-checked on their length */
                value = nla_len(nla);
                break;
        default:
                return -EINVAL;
        }

        nla_get_range_unsigned(pt, &range);

        if (is_binary &&
            pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG &&
            value > range.max) {
                pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n",
                                    current->comm, pt->type);

                /* this assumes min <= max (don't validate against min) */
                if (!(validate & NL_VALIDATE_STRICT_ATTRS))
                        return 0;

                NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                        "invalid attribute length");
                return -EINVAL;
        }

        if (value >= range.min && value <= range.max)
                return 0;

        if (is_binary)
                NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                        "binary attribute size out of range");
        else
                NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                        "integer out of range");

        return -ERANGE;
}

void nla_get_range_signed(const struct nla_policy *pt,
                          struct netlink_range_validation_signed *range)
{
        switch (pt->type) {
        case NLA_S8:
                range->min = S8_MIN;
                range->max = S8_MAX;
                break;
        case NLA_S16:
                range->min = S16_MIN;
                range->max = S16_MAX;
                break;
        case NLA_S32:
                range->min = S32_MIN;
                range->max = S32_MAX;
                break;
        case NLA_S64:
        case NLA_SINT:
                range->min = S64_MIN;
                range->max = S64_MAX;
                break;
        default:
                WARN_ON_ONCE(1);
                return;
        }

        switch (pt->validation_type) {
        case NLA_VALIDATE_RANGE:
                range->min = pt->min;
                range->max = pt->max;
                break;
        case NLA_VALIDATE_RANGE_PTR:
                *range = *pt->range_signed;
                break;
        case NLA_VALIDATE_MIN:
                range->min = pt->min;
                break;
        case NLA_VALIDATE_MAX:
                range->max = pt->max;
                break;
        default:
                break;
        }
}

static int nla_validate_int_range_signed(const struct nla_policy *pt,
                                         const struct nlattr *nla,
                                         struct netlink_ext_ack *extack)
{
        struct netlink_range_validation_signed range;
        s64 value;

        switch (pt->type) {
        case NLA_S8:
                value = nla_get_s8(nla);
                break;
        case NLA_S16:
                value = nla_get_s16(nla);
                break;
        case NLA_S32:
                value = nla_get_s32(nla);
                break;
        case NLA_S64:
                value = nla_get_s64(nla);
                break;
        case NLA_SINT:
                value = nla_get_sint(nla);
                break;
        default:
                return -EINVAL;
        }

        nla_get_range_signed(pt, &range);

        if (value < range.min || value > range.max) {
                NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                        "integer out of range");
                return -ERANGE;
        }

        return 0;
}

/*
 * Range-check @nla according to @pt by dispatching on pt->type to the signed
 * or the unsigned validator.  Any other type triggers a WARN and yields
 * -EINVAL.
 */
static int nla_validate_int_range(const struct nla_policy *pt,
                                  const struct nlattr *nla,
                                  struct netlink_ext_ack *extack,
                                  unsigned int validate)
{
        switch (pt->type) {
        case NLA_S8:
        case NLA_S16:
        case NLA_S32:
        case NLA_S64:
        case NLA_SINT:
                return nla_validate_int_range_signed(pt, nla, extack);
        case NLA_U8:
        case NLA_U16:
        case NLA_U32:
        case NLA_U64:
        case NLA_UINT:
        case NLA_MSECS:
        case NLA_BINARY:
        case NLA_BE16:
        case NLA_BE32:
                return nla_validate_range_unsigned(pt, nla, extack, validate);
        default:
                break;
        }

        WARN_ON(1);
        return -EINVAL;
}

/*
 * Reject an unsigned integer attribute that has bits set outside pt->mask.
 *
 * The payload is read according to pt->type; big-endian types are
 * converted to host byte order first.  Policy types other than the
 * unsigned/big-endian integers handled here yield -EINVAL.  Returns 0
 * when only bits permitted by the mask are set.
 */
static int nla_validate_mask(const struct nla_policy *pt,
                             const struct nlattr *nla,
                             struct netlink_ext_ack *extack)
{
        u64 bits;

        if (pt->type == NLA_U8)
                bits = nla_get_u8(nla);
        else if (pt->type == NLA_U16)
                bits = nla_get_u16(nla);
        else if (pt->type == NLA_U32)
                bits = nla_get_u32(nla);
        else if (pt->type == NLA_U64)
                bits = nla_get_u64(nla);
        else if (pt->type == NLA_UINT)
                bits = nla_get_uint(nla);
        else if (pt->type == NLA_BE16)
                bits = ntohs(nla_get_be16(nla));
        else if (pt->type == NLA_BE32)
                bits = ntohl(nla_get_be32(nla));
        else
                return -EINVAL;

        /* Everything outside the mask is reserved and must be clear. */
        if (!(bits & ~(u64)pt->mask))
                return 0;

        NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set");
        return -EINVAL;
}

/*
 * validate_nla - validate a single attribute against its policy entry
 * @nla: attribute to validate
 * @maxtype: highest attribute type covered by @policy
 * @policy: policy array indexed by attribute type (must not be NULL)
 * @validate: NL_VALIDATE_* strictness flags
 * @extack: extended ACK used for error reporting
 * @depth: current nesting depth, forwarded to nested validation
 *
 * Attributes whose type lies outside 1..@maxtype are accepted without
 * any checks.  For all others the payload length is checked according
 * to the policy type, nested attributes are validated recursively, and
 * finally the optional range/mask/function validation selected by
 * pt->validation_type is applied.
 *
 * Returns 0 if the attribute is acceptable or a negative error code.
 */
static int validate_nla(const struct nlattr *nla, int maxtype,
                        const struct nla_policy *policy, unsigned int validate,
                        struct netlink_ext_ack *extack, unsigned int depth)
{
        u16 strict_start_type = policy[0].strict_start_type;
        const struct nla_policy *pt;
        int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla);
        /* Length failures that jump to out_err report -ERANGE by default. */
        int err = -ERANGE;

        /* Types at or above strict_start_type always get strict checks. */
        if (strict_start_type && type >= strict_start_type)
                validate |= NL_VALIDATE_STRICT;

        if (type <= 0 || type > maxtype)
                return 0;

        type = array_index_nospec(type, maxtype + 1);
        pt = &policy[type];

        BUG_ON(pt->type > NLA_TYPE_MAX);

        /*
         * Types with an exact expected length: a mismatch is always
         * logged, but only rejected under NL_VALIDATE_STRICT_ATTRS.
         */
        if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) {
                pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n",
                                    current->comm, type);
                if (validate & NL_VALIDATE_STRICT_ATTRS) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "invalid attribute length");
                        return -EINVAL;
                }
        }

        /* The NLA_F_NESTED flag must agree with the policy type. */
        if (validate & NL_VALIDATE_NESTED) {
                if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) &&
                    !(nla->nla_type & NLA_F_NESTED)) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "NLA_F_NESTED is missing");
                        return -EINVAL;
                }
                if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY &&
                    pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "NLA_F_NESTED not expected");
                        return -EINVAL;
                }
        }

        switch (pt->type) {
        case NLA_REJECT:
                /* Prefer the policy's own message when we can report it. */
                if (extack && pt->reject_message) {
                        NL_SET_BAD_ATTR(extack, nla);
                        extack->_msg = pt->reject_message;
                        return -EINVAL;
                }
                err = -EINVAL;
                goto out_err;

        case NLA_FLAG:
                /* a flag must not carry any payload */
                if (attrlen > 0)
                        goto out_err;
                break;

        case NLA_SINT:
        case NLA_UINT:
                /* variable-width integers are either 32 or 64 bits wide */
                if (attrlen != sizeof(u32) && attrlen != sizeof(u64)) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "invalid attribute length");
                        return -EINVAL;
                }
                break;

        case NLA_BITFIELD32:
                if (attrlen != sizeof(struct nla_bitfield32))
                        goto out_err;

                err = validate_nla_bitfield32(nla, pt->bitfield32_valid);
                if (err)
                        goto out_err;
                break;

        case NLA_NUL_STRING:
                /*
                 * A NUL byte must be present, within the first
                 * pt->len + 1 bytes if the policy limits the length.
                 */
                if (pt->len)
                        minlen = min_t(int, attrlen, pt->len + 1);
                else
                        minlen = attrlen;

                if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) {
                        err = -EINVAL;
                        goto out_err;
                }
                fallthrough;

        case NLA_STRING:
                if (attrlen < 1)
                        goto out_err;

                if (pt->len) {
                        char *buf = nla_data(nla);

                        /* one trailing NUL does not count against pt->len */
                        if (buf[attrlen - 1] == '\0')
                                attrlen--;

                        if (attrlen > pt->len)
                                goto out_err;
                }
                break;

        case NLA_BINARY:
                if (pt->len && attrlen > pt->len)
                        goto out_err;
                break;

        case NLA_NESTED:
                /* a nested attribute is allowed to be empty; if it's not,
                 * it must have a size of at least NLA_HDRLEN.
                 */
                if (attrlen == 0)
                        break;
                if (attrlen < NLA_HDRLEN)
                        goto out_err;
                if (pt->nested_policy) {
                        err = __nla_validate_parse(nla_data(nla), nla_len(nla),
                                                   pt->len, pt->nested_policy,
                                                   validate, extack, NULL,
                                                   depth + 1);
                        if (err < 0) {
                                /*
                                 * return directly to preserve the inner
                                 * error message/attribute pointer
                                 */
                                return err;
                        }
                }
                break;
        case NLA_NESTED_ARRAY:
                /* a nested array attribute is allowed to be empty; if it's
                 * not, it must have a size of at least NLA_HDRLEN.
                 */
                if (attrlen == 0)
                        break;
                if (attrlen < NLA_HDRLEN)
                        goto out_err;
                if (pt->nested_policy) {
                        err = nla_validate_array(nla_data(nla), nla_len(nla),
                                                 pt->len, pt->nested_policy,
                                                 extack, validate, depth);
                        if (err < 0) {
                                /*
                                 * return directly to preserve the inner
                                 * error message/attribute pointer
                                 */
                                return err;
                        }
                }
                break;

        case NLA_UNSPEC:
                if (validate & NL_VALIDATE_UNSPEC) {
                        NL_SET_ERR_MSG_ATTR(extack, nla,
                                            "Unsupported attribute");
                        return -EINVAL;
                }
                if (attrlen < pt->len)
                        goto out_err;
                break;

        default:
                /* pt->len, if set, overrides the per-type minimum length */
                if (pt->len)
                        minlen = pt->len;
                else
                        minlen = nla_attr_minlen[pt->type];

                if (attrlen < minlen)
                        goto out_err;
        }

        /* further validation */
        switch (pt->validation_type) {
        case NLA_VALIDATE_NONE:
                /* nothing to do */
                break;
        case NLA_VALIDATE_RANGE_PTR:
        case NLA_VALIDATE_RANGE:
        case NLA_VALIDATE_RANGE_WARN_TOO_LONG:
        case NLA_VALIDATE_MIN:
        case NLA_VALIDATE_MAX:
                err = nla_validate_int_range(pt, nla, extack, validate);
                if (err)
                        return err;
                break;
        case NLA_VALIDATE_MASK:
                err = nla_validate_mask(pt, nla, extack);
                if (err)
                        return err;
                break;
        case NLA_VALIDATE_FUNCTION:
                if (pt->validate) {
                        err = pt->validate(nla, extack);
                        if (err)
                                return err;
                }
                break;
        }

        return 0;
out_err:
        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                "Attribute failed policy validation");
        return err;
}

/*
 * Walk an attribute stream, validating every attribute against @policy
 * (when one is given) and recording it in @tb (when one is given).
 *
 * @tb, if not NULL, is cleared for maxtype + 1 entries before the walk;
 * a later attribute of the same type overwrites an earlier one.
 * Attributes of type 0 or above @maxtype are skipped, or rejected when
 * NL_VALIDATE_MAXTYPE is set.  Leftover bytes after the last attribute
 * are always logged and only rejected when NL_VALIDATE_TRAILING is set.
 * @depth is bounded by MAX_POLICY_RECURSION_DEPTH.
 *
 * Returns 0 on success or a negative error code.
 */
static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype,
                                const struct nla_policy *policy,
                                unsigned int validate,
                                struct netlink_ext_ack *extack,
                                struct nlattr **tb, unsigned int depth)
{
        const struct nlattr *attr;
        int remaining;

        if (depth >= MAX_POLICY_RECURSION_DEPTH) {
                NL_SET_ERR_MSG(extack,
                               "allowed policy recursion depth exceeded");
                return -EINVAL;
        }

        if (tb)
                memset(tb, 0, (maxtype + 1) * sizeof(*tb));

        nla_for_each_attr(attr, head, len, remaining) {
                u16 type = nla_type(attr);
                int err;

                if (type == 0 || type > maxtype) {
                        /* Unknown types are only fatal in strict mode. */
                        if (!(validate & NL_VALIDATE_MAXTYPE))
                                continue;

                        NL_SET_ERR_MSG_ATTR(extack, attr,
                                            "Unknown attribute type");
                        return -EINVAL;
                }

                type = array_index_nospec(type, maxtype + 1);

                if (policy) {
                        err = validate_nla(attr, maxtype, policy,
                                           validate, extack, depth);
                        if (err < 0)
                                return err;
                }

                if (tb)
                        tb[type] = (struct nlattr *)attr;
        }

        if (unlikely(remaining > 0)) {
                pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n",
                                    remaining, current->comm);
                NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes");

                return (validate & NL_VALIDATE_TRAILING) ? -EINVAL : 0;
        }

        return 0;
}

/**
 * __nla_validate - Validate a stream of attributes
 * @head: first attribute of the stream
 * @len: length of the stream in bytes
 * @maxtype: highest attribute type the policy covers
 * @policy: validation policy
 * @validate: validation strictness flags
 * @extack: extended ACK report struct
 *
 * Checks every attribute in the stream against @policy without building
 * an attribute table.  How strict the checks are depends on @validate,
 * see &enum netlink_validation.  The policy semantics are documented
 * with struct nla_policy.
 *
 * Returns 0 on success or a negative error code.
 */
int __nla_validate(const struct nlattr *head, int len, int maxtype,
                   const struct nla_policy *policy, unsigned int validate,
                   struct netlink_ext_ack *extack)
{
        /* Validation only: no table to fill, starting at nesting depth 0. */
        struct nlattr **no_table = NULL;

        return __nla_validate_parse(head, len, maxtype, policy, validate,
                                    extack, no_table, 0);
}
EXPORT_SYMBOL(__nla_validate);

/**
 * nla_policy_len - Determine the max. length of a policy
 * @p: policy to use
 * @n: number of policies
 *
 * Determines the max. length of the policy.  It is currently used
 * to allocated Netlink buffers roughly the size of the actual
 * message.
 *
 * Returns 0 on success or a negative error code.
 */
int
nla_policy_len(const struct nla_policy *p, int n)
{
        int i, len = 0;

        for (i = 0; i < n; i++, p++) {
                if (p->len)
                        len += nla_total_size(p->len);
                else if (nla_attr_len[p->type])
                        len += nla_total_size(nla_attr_len[p->type]);
                else if (nla_attr_minlen[p->type])
                        len += nla_total_size(nla_attr_minlen[p->type]);
        }

        return len;
}
EXPORT_SYMBOL(nla_policy_len);

/**
 * __nla_parse - Parse a stream of attributes into a tb buffer
 * @tb: destination array with maxtype+1 elements
 * @maxtype: highest attribute type to be stored
 * @head: first attribute of the stream
 * @len: length of the stream in bytes
 * @policy: validation policy
 * @validate: validation strictness flags
 * @extack: extended ACK pointer
 *
 * Walks the attribute stream and stores a pointer to each attribute in
 * @tb, indexed by attribute type.  How strictly the attributes are
 * validated is controlled by @validate.
 *
 * Returns 0 on success or a negative error code.
 */
int __nla_parse(struct nlattr **tb, int maxtype,
                const struct nlattr *head, int len,
                const struct nla_policy *policy, unsigned int validate,
                struct netlink_ext_ack *extack)
{
        int err;

        /* Top-level parse: nesting depth starts at 0. */
        err = __nla_validate_parse(head, len, maxtype, policy, validate,
                                   extack, tb, 0);

        return err;
}
EXPORT_SYMBOL(__nla_parse);

/**
 * nla_find - Find a specific attribute in a stream of attributes
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @attrtype: type of attribute to look for
 *
 * Returns the first attribute in the stream matching the specified type.
 */
struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype)
{
        const struct nlattr *nla;
        int rem;

        nla_for_each_attr(nla, head, len, rem)
                if (nla_type(nla) == attrtype)
                        return (struct nlattr *)nla;

        return NULL;
}
EXPORT_SYMBOL(nla_find);

/**
 * nla_strscpy - Copy string attribute payload into a sized buffer
 * @dst: Where to copy the string to.
 * @nla: Attribute to copy the string from.
 * @dstsize: Size of destination buffer.
 *
 * Copies at most dstsize - 1 bytes into the destination buffer and fills
 * the rest of the buffer with zeroes, so unlike strscpy() the destination
 * is always padded out.  A single trailing %NUL in the attribute payload
 * is not counted as part of the string.
 *
 * Return:
 * * srclen - Length of the @nla string (not including the trailing %NUL).
 * * -E2BIG - If @dstsize is 0 or greater than U16_MAX, or if the @nla
 *            string does not fit into @dstsize together with its
 *            terminating %NUL (the copy is truncated in that case).
 */
ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize)
{
        size_t payload_len = nla_len(nla);
        char *payload = nla_data(nla);
        size_t ncopy;
        ssize_t ret;

        if (dstsize == 0 || WARN_ON_ONCE(dstsize > U16_MAX))
                return -E2BIG;

        /* Drop one trailing NUL so it is not counted as string data. */
        if (payload_len > 0 && payload[payload_len - 1] == '\0')
                payload_len--;

        if (payload_len < dstsize) {
                ncopy = payload_len;
                ret = payload_len;
        } else {
                /* Truncate, leaving room for the terminator. */
                ncopy = dstsize - 1;
                ret = -E2BIG;
        }

        memcpy(dst, payload, ncopy);
        /* Terminate and zero pad the remainder of dst. */
        memset(dst + ncopy, 0, dstsize - ncopy);

        return ret;
}
EXPORT_SYMBOL(nla_strscpy);

/**
 * nla_strdup - Copy string attribute payload into a newly allocated buffer
 * @nla: attribute to copy the string from
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * A single trailing %NUL in the payload is dropped before copying; the
 * returned buffer is always %NUL terminated.
 *
 * Returns a pointer to the allocated buffer or NULL on error.
 */
char *nla_strdup(const struct nlattr *nla, gfp_t flags)
{
        char *payload = nla_data(nla);
        size_t n = nla_len(nla);
        char *copy;

        if (n > 0 && payload[n - 1] == '\0')
                n--;

        copy = kmalloc(n + 1, flags);
        if (copy == NULL)
                return NULL;

        memcpy(copy, payload, n);
        copy[n] = '\0';

        return copy;
}
EXPORT_SYMBOL(nla_strdup);

/**
 * nla_memcpy - Copy a netlink attribute into another memory area
 * @dest: where to copy to
 * @src: netlink attribute to copy from
 * @count: size of the destination area
 *
 * Note: The number of bytes copied is limited by the length of the
 *       attribute's payload.  If the payload is shorter than @count,
 *       the rest of the destination area is zeroed.
 *
 * Returns the number of bytes copied.
 */
int nla_memcpy(void *dest, const struct nlattr *src, int count)
{
        int copied = min_t(int, count, nla_len(src));

        memcpy(dest, nla_data(src), copied);

        /* Zero whatever part of the destination the payload did not fill. */
        if (copied < count)
                memset(dest + copied, 0, count - copied);

        return copied;
}
EXPORT_SYMBOL(nla_memcpy);

/**
 * nla_memcmp - Compare an attribute with sized memory area
 * @nla: netlink attribute
 * @data: memory area
 * @size: size of memory area
 *
 * Returns the difference between the attribute payload length and @size
 * if that difference is non-zero, otherwise the result of memcmp() over
 * @size bytes.
 */
int nla_memcmp(const struct nlattr *nla, const void *data,
                             size_t size)
{
        int diff = nla_len(nla) - size;

        /* Differing lengths decide the comparison on their own. */
        if (diff != 0)
                return diff;

        return memcmp(nla_data(nla), data, size);
}
EXPORT_SYMBOL(nla_memcmp);

/**
 * nla_strcmp - Compare a string attribute against a string
 * @nla: netlink string attribute
 * @str: another string
 *
 * Trailing %NUL bytes in the attribute payload are ignored.  Returns the
 * difference between the remaining payload length and strlen(@str) if
 * that difference is non-zero, otherwise the result of memcmp().
 */
int nla_strcmp(const struct nlattr *nla, const char *str)
{
        char *payload = nla_data(nla);
        int payload_len = nla_len(nla);
        int str_len = strlen(str);
        int diff;

        /* Strip all trailing NUL bytes from the payload length. */
        for (; payload_len > 0; payload_len--) {
                if (payload[payload_len - 1] != '\0')
                        break;
        }

        diff = payload_len - str_len;
        if (diff != 0)
                return diff;

        return memcmp(payload, str, str_len);
}
EXPORT_SYMBOL(nla_strcmp);

#ifdef CONFIG_NET
/**
 * __nla_reserve - reserve room for attribute on the skb
 * @skb: socket buffer to reserve room on
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 *
 * Appends a netlink attribute header to the socket buffer and reserves
 * space for the payload without copying anything into it.  The
 * nla_padlen() bytes following the attribute are zeroed.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute header and payload.
 */
struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
{
        struct nlattr *attr = skb_put(skb, nla_total_size(attrlen));
        unsigned char *pad;

        attr->nla_type = attrtype;
        attr->nla_len = nla_attr_size(attrlen);

        /* Padding starts right after header + payload. */
        pad = (unsigned char *)attr + attr->nla_len;
        memset(pad, 0, nla_padlen(attrlen));

        return attr;
}
EXPORT_SYMBOL(__nla_reserve);

/**
 * __nla_reserve_64bit - reserve room for attribute on the skb and align it
 * @skb: socket buffer to reserve room on
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @padattr: attribute type for the padding
 *
 * Appends a netlink attribute header to the socket buffer and reserves
 * space for the payload without copying anything into it.  It also
 * ensures that this attribute will have a 64-bit aligned nla_data() area.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute header and payload.
 */
struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype,
                                   int attrlen, int padattr)
{
        struct nlattr *attr;

        /* Alignment (using @padattr) has to happen before reserving. */
        nla_align_64bit(skb, padattr);
        attr = __nla_reserve(skb, attrtype, attrlen);

        return attr;
}
EXPORT_SYMBOL(__nla_reserve_64bit);

/**
 * __nla_reserve_nohdr - reserve room for attribute without header
 * @skb: socket buffer to reserve room on
 * @attrlen: length of attribute payload
 *
 * Reserves NLA_ALIGN(@attrlen) zeroed bytes for an attribute payload
 * that carries no header.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the payload.
 */
void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen)
{
        void *area = skb_put_zero(skb, NLA_ALIGN(attrlen));

        return area;
}
EXPORT_SYMBOL(__nla_reserve_nohdr);

/**
 * nla_reserve - reserve room for attribute on the skb
 * @skb: socket buffer to reserve room on
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 *
 * Appends a netlink attribute header to the socket buffer and reserves
 * space for the payload without copying anything into it.
 *
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the attribute header and payload.
 */
struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
{
        int room = skb_tailroom(skb);

        if (unlikely(room < nla_total_size(attrlen)))
                return NULL;

        return __nla_reserve(skb, attrtype, attrlen);
}
EXPORT_SYMBOL(nla_reserve);

/**
 * nla_reserve_64bit - reserve room for attribute on the skb and align it
 * @skb: socket buffer to reserve room on
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @padattr: attribute type for the padding
 *
 * Appends a netlink attribute header to the socket buffer and reserves
 * space for the payload without copying anything into it.  It also
 * ensures that this attribute will have a 64-bit aligned nla_data() area.
 *
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the attribute header and payload.
 */
struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                                 int padattr)
{
        /* The space needed depends on whether padding must be inserted. */
        size_t needed = nla_need_padding_for_64bit(skb) ?
                        nla_total_size_64bit(attrlen) :
                        nla_total_size(attrlen);
        int room = skb_tailroom(skb);

        if (unlikely(room < needed))
                return NULL;

        return __nla_reserve_64bit(skb, attrtype, attrlen, padattr);
}
EXPORT_SYMBOL(nla_reserve_64bit);

/**
 * nla_reserve_nohdr - reserve room for attribute without header
 * @skb: socket buffer to reserve room on
 * @attrlen: length of attribute payload
 *
 * Reserves space for an attribute payload that carries no header.
 *
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the attribute payload.
 */
void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen)
{
        int room = skb_tailroom(skb);

        if (unlikely(room < NLA_ALIGN(attrlen)))
                return NULL;

        return __nla_reserve_nohdr(skb, attrlen);
}
EXPORT_SYMBOL(nla_reserve_nohdr);

/**
 * __nla_put - Add a netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Reserves the attribute with __nla_reserve() and copies @attrlen bytes
 * from @data into its payload.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute header and payload.
 */
void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
                             const void *data)
{
        struct nlattr *attr = __nla_reserve(skb, attrtype, attrlen);
        void *payload = nla_data(attr);

        memcpy(payload, data, attrlen);
}
EXPORT_SYMBOL(__nla_put);

/**
 * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 * @padattr: attribute type for the padding
 *
 * Reserves the attribute with __nla_reserve_64bit() and copies @attrlen
 * bytes from @data into its payload.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute header and payload.
 */
void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                     const void *data, int padattr)
{
        struct nlattr *attr = __nla_reserve_64bit(skb, attrtype, attrlen,
                                                  padattr);
        void *payload = nla_data(attr);

        memcpy(payload, data, attrlen);
}
EXPORT_SYMBOL(__nla_put_64bit);

/**
 * __nla_put_nohdr - Add a netlink attribute without header
 * @skb: socket buffer to add attribute to
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Reserves the payload area with __nla_reserve_nohdr() and copies
 * @attrlen bytes from @data into it.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute payload.
 */
void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
{
        void *area = __nla_reserve_nohdr(skb, attrlen);

        memcpy(area, data, attrlen);
}
EXPORT_SYMBOL(__nla_put_nohdr);

/**
 * nla_put - Add a netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Returns 0 on success, or -EMSGSIZE if the tailroom of the skb is
 * insufficient to store the attribute header and payload.
 */
int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
{
        int room = skb_tailroom(skb);

        if (unlikely(room < nla_total_size(attrlen)))
                return -EMSGSIZE;

        __nla_put(skb, attrtype, attrlen, data);

        return 0;
}
EXPORT_SYMBOL(nla_put);

/**
 * nla_put_64bit - Add a netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 * @padattr: attribute type for the padding
 *
 * Returns 0 on success, or -EMSGSIZE if the tailroom of the skb is
 * insufficient to store the attribute header and payload.
 */
int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                  const void *data, int padattr)
{
        /* The space needed depends on whether padding must be inserted. */
        size_t needed = nla_need_padding_for_64bit(skb) ?
                        nla_total_size_64bit(attrlen) :
                        nla_total_size(attrlen);
        int room = skb_tailroom(skb);

        if (unlikely(room < needed))
                return -EMSGSIZE;

        __nla_put_64bit(skb, attrtype, attrlen, data, padattr);

        return 0;
}
EXPORT_SYMBOL(nla_put_64bit);

/**
 * nla_put_nohdr - Add a netlink attribute without header
 * @skb: socket buffer to add attribute to
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Returns 0 on success, or -EMSGSIZE if the tailroom of the skb is
 * insufficient to store the attribute payload.
 */
int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
{
        int room = skb_tailroom(skb);

        if (unlikely(room < NLA_ALIGN(attrlen)))
                return -EMSGSIZE;

        __nla_put_nohdr(skb, attrlen, data);

        return 0;
}
EXPORT_SYMBOL(nla_put_nohdr);

/**
 * nla_append - Add a netlink attribute without header or padding
 * @skb: socket buffer to add attribute to
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Exactly @attrlen bytes are appended, although the tailroom check is
 * made against NLA_ALIGN(@attrlen).
 *
 * Returns 0 on success, or -EMSGSIZE if the tailroom of the skb is
 * insufficient to store the attribute payload.
 */
int nla_append(struct sk_buff *skb, int attrlen, const void *data)
{
        int room = skb_tailroom(skb);

        if (unlikely(room < NLA_ALIGN(attrlen)))
                return -EMSGSIZE;

        skb_put_data(skb, data, attrlen);

        return 0;
}
EXPORT_SYMBOL(nla_append);
#endif







































































    3 


    2 


    4 

    4 




    4 





    3 




























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Pseudo-driver for the loopback interface.
 *
 * Version:        @(#)loopback.c        1.0.4b        08/16/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Donald Becker, <becker@scyld.com>
 *
 *                Alan Cox        :        Fixed oddments for NET3.014
 *                Alan Cox        :        Rejig for NET3.029 snap #3
 *                Alan Cox        :        Fixed NET3.029 bugs and sped up
 *                Larry McVoy        :        Tiny tweak to double performance
 *                Alan Cox        :        Backed out LMV's tweak - the linux mm
 *                                        can't take it...
 *              Michael Griffith:       Don't bother computing the checksums
 *                                      on packets received on the loopback
 *                                      interface.
 *                Alexey Kuznetsov:        Potential hang under some extreme
 *                                        cases removed.
 */
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/in.h>

#include <linux/uaccess.h>
#include <linux/io.h>

#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/ethtool.h>
#include <net/sch_generic.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <linux/if_ether.h>        /* For the statistics structure. */
#include <linux/if_arp.h>        /* For ARPHRD_ETHER */
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/percpu.h>
#include <linux/net_tstamp.h>
#include <net/net_namespace.h>
#include <linux/u64_stats_sync.h>

/* blackhole_netdev - a device used for dsts that are marked expired!
 * This is a global device (instead of per-net-ns) since it does not need
 * to be per-ns and gets initialized at boot time.
 */
struct net_device *blackhole_netdev;
EXPORT_SYMBOL(blackhole_netdev);

/* Transmit handler of the loopback device: the skb is handed straight
 * back to the receive path through __netif_rx() and, if that succeeds,
 * accounted in the device's lstats.  Always returns NETDEV_TX_OK.
 *
 * The higher levels take care of making this non-reentrant (it's
 * called with bh's disabled).
 */
static netdev_tx_t loopback_xmit(struct sk_buff *skb,
                                 struct net_device *dev)
{
        int len;

        skb_tx_timestamp(skb);

        /* do not fool net_timestamp_check() with various clock bases */
        skb_clear_tstamp(skb);

        skb_orphan(skb);

        /* Before queueing this packet to __netif_rx(),
         * make sure dst is refcounted.
         */
        skb_dst_force(skb);

        skb->protocol = eth_type_trans(skb, dev);

        /* Sample the length first; NOTE(review): presumably the skb must
         * not be touched once it has been handed to __netif_rx() - confirm.
         */
        len = skb->len;
        if (likely(__netif_rx(skb) == NET_RX_SUCCESS))
                dev_lstats_add(dev, len);

        return NETDEV_TX_OK;
}

/* Sum the per-CPU lstats counters of @dev into @packets and @bytes.
 *
 * Both outputs are reset to zero first.  Each CPU's pair of counters is
 * read inside a u64_stats fetch_begin/fetch_retry loop, so the read is
 * repeated whenever u64_stats_fetch_retry() asks for it.
 */
void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes)
{
        int cpu;

        *bytes = 0;
        *packets = 0;

        for_each_possible_cpu(cpu) {
                const struct pcpu_lstats *cpu_stats =
                        per_cpu_ptr(dev->lstats, cpu);
                u64 cpu_packets, cpu_bytes;
                unsigned int seq;

                do {
                        seq = u64_stats_fetch_begin(&cpu_stats->syncp);
                        cpu_packets = u64_stats_read(&cpu_stats->packets);
                        cpu_bytes = u64_stats_read(&cpu_stats->bytes);
                } while (u64_stats_fetch_retry(&cpu_stats->syncp, seq));

                *packets += cpu_packets;
                *bytes += cpu_bytes;
        }
}
EXPORT_SYMBOL(dev_lstats_read);

/* Fill @stats from the device's lstats.  The same packet and byte totals
 * are reported for both the rx and the tx direction.
 */
static void loopback_get_stats64(struct net_device *dev,
                                 struct rtnl_link_stats64 *stats)
{
        u64 npackets, nbytes;

        dev_lstats_read(dev, &npackets, &nbytes);

        stats->tx_packets = npackets;
        stats->tx_bytes   = nbytes;
        stats->rx_packets = npackets;
        stats->rx_bytes   = nbytes;
}

/* ethtool .get_link callback: unconditionally reports 1, regardless of
 * @dev.
 */
static u32 always_on(struct net_device *dev)
{
        return 1;
}

/* ethtool operations for the loopback device: link state comes from
 * always_on(), timestamping info from the generic ethtool helper.
 */
static const struct ethtool_ops loopback_ethtool_ops = {
        .get_link                = always_on,
        .get_ts_info                = ethtool_op_get_ts_info,
};

/* ndo_init handler: sets the lockdep classes for @dev and always
 * succeeds (returns 0).
 */
static int loopback_dev_init(struct net_device *dev)
{
        netdev_lockdep_set_classes(dev);
        return 0;
}

static void loopback_dev_free(struct net_device *dev)
{
        dev_net(dev)->loopback_dev = NULL;
}

/* net_device operations for the loopback device.  MAC address changes
 * are delegated to the generic eth_mac_addr() helper.
 */
static const struct net_device_ops loopback_ops = {
        .ndo_init        = loopback_dev_init,
        .ndo_start_xmit  = loopback_xmit,
        .ndo_get_stats64 = loopback_get_stats64,
        .ndo_set_mac_address = eth_mac_addr,
};

/* Common setup shared by loopback_setup() and blackhole_netdev_setup().
 *
 * @dev:            device being initialised
 * @mtu:            value stored in dev->mtu
 * @eth_ops:        ethtool operations (callers may pass NULL)
 * @hdr_ops:        link-layer header operations (callers may pass NULL)
 * @dev_ops:        net_device operations
 * @dev_destructor: stored as dev->priv_destructor (callers may pass NULL)
 */
static void gen_lo_setup(struct net_device *dev,
                         unsigned int mtu,
                         const struct ethtool_ops *eth_ops,
                         const struct header_ops *hdr_ops,
                         const struct net_device_ops *dev_ops,
                         void (*dev_destructor)(struct net_device *dev))
{
        /* Caller-supplied operation tables and teardown hook. */
        dev->netdev_ops = dev_ops;
        dev->ethtool_ops = eth_ops;
        dev->header_ops = hdr_ops;
        dev->priv_destructor = dev_destructor;
        dev->needs_free_netdev = true;

        /* Link-layer geometry: Ethernet-sized header and address. */
        dev->type = ARPHRD_LOOPBACK;
        dev->mtu = mtu;
        dev->hard_header_len = ETH_HLEN;
        dev->min_header_len = ETH_HLEN;
        dev->addr_len = ETH_ALEN;

        /* Interface and private flags. */
        dev->flags = IFF_LOOPBACK;
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
        netif_keep_dst(dev);

        /* Offload feature sets. */
        dev->hw_features = NETIF_F_GSO_SOFTWARE;
        dev->features = NETIF_F_SG |
                        NETIF_F_FRAGLIST |
                        NETIF_F_GSO_SOFTWARE |
                        NETIF_F_HW_CSUM |
                        NETIF_F_RXCSUM |
                        NETIF_F_SCTP_CRC |
                        NETIF_F_HIGHDMA |
                        NETIF_F_LLTX |
                        NETIF_F_NETNS_LOCAL |
                        NETIF_F_VLAN_CHALLENGED |
                        NETIF_F_LOOPBACK;

        /* Statistics are kept in the per-CPU lstats counters. */
        dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS;

        netif_set_tso_max_size(dev, GSO_MAX_SIZE);
}

/* Setup callback for the loopback device.  The loopback device is
 * special: there is only one instance per network namespace.
 */
static void loopback_setup(struct net_device *dev)
{
        const unsigned int lo_mtu = 64 * 1024;

        gen_lo_setup(dev, lo_mtu, &loopback_ethtool_ops, &eth_header_ops,
                     &loopback_ops, loopback_dev_free);
}

/* Per-namespace init: allocate and register the "lo" device for @net.
 *
 * Returns 0 on success.  On failure the negative errno is returned,
 * except for the initial namespace (init_net), where the failure is
 * fatal and panics.
 */
static __net_init int loopback_net_init(struct net *net)
{
        struct net_device *lo;
        int ret;

        lo = alloc_netdev(0, "lo", NET_NAME_PREDICTABLE, loopback_setup);
        if (!lo) {
                ret = -ENOMEM;
                goto fail;
        }

        dev_net_set(lo, net);

        ret = register_netdev(lo);
        if (ret) {
                free_netdev(lo);
                goto fail;
        }

        /* The loopback device must end up with the well-known ifindex. */
        BUG_ON(lo->ifindex != LOOPBACK_IFINDEX);
        net->loopback_dev = lo;
        return 0;

fail:
        if (net_eq(net, &init_net))
                panic("loopback: Failed to register netdevice: %d\n", ret);
        return ret;
}

/* Per-network-namespace operations for loopback; only an init hook is
 * provided.  Registered in net/core/dev.c
 */
struct pernet_operations __net_initdata loopback_net_ops = {
        .init = loopback_net_init,
};

/* blackhole netdevice */
/* Transmit handler for the blackhole device: every skb is freed and a
 * rate-limited warning is logged.  Always returns NETDEV_TX_OK.
 */
static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb,
                                         struct net_device *dev)
{
        kfree_skb(skb);
        net_warn_ratelimited("%s(): Dropping skb.\n", __func__);
        return NETDEV_TX_OK;
}

/* net_device operations for the blackhole device: transmit only. */
static const struct net_device_ops blackhole_netdev_ops = {
        .ndo_start_xmit = blackhole_netdev_xmit,
};

/* This is a dst-dummy device used specifically for invalidated
 * DSTs and unlike loopback, this is not per-ns.
 *
 * It reuses gen_lo_setup() with ETH_MIN_MTU as the MTU and with no
 * ethtool ops, no header ops and no destructor.
 */
static void blackhole_netdev_setup(struct net_device *dev)
{
        gen_lo_setup(dev, ETH_MIN_MTU, NULL, NULL, &blackhole_netdev_ops, NULL);
}

/* Allocate and bring up the global blackhole_netdev at device_initcall
 * time.
 *
 * Note that no register_netdev() call is made here: the device is
 * allocated, dev_init_scheduler() and dev_activate() are run on it under
 * RTNL, it is flagged IFF_UP | IFF_RUNNING and attached to init_net.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int __init blackhole_netdev_init(void)
{
        struct net_device *dev;

        dev = alloc_netdev(0, "blackhole_dev", NET_NAME_UNKNOWN,
                           blackhole_netdev_setup);
        blackhole_netdev = dev;
        if (!dev)
                return -ENOMEM;

        rtnl_lock();
        dev_init_scheduler(dev);
        dev_activate(dev);
        rtnl_unlock();

        dev->flags |= IFF_UP | IFF_RUNNING;
        dev_net_set(dev, &init_net);

        return 0;
}

device_initcall(blackhole_netdev_init);





























































































































































































































































































    4 


































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Sleepable Read-Copy Update mechanism for mutual exclusion
 *
 * Copyright (C) IBM Corporation, 2006
 * Copyright (C) Fujitsu, 2012
 *
 * Author: Paul McKenney <paulmck@linux.ibm.com>
 *           Lai Jiangshan <laijs@cn.fujitsu.com>
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *                Documentation/RCU/ *.txt
 *
 */

#ifndef _LINUX_SRCU_H
#define _LINUX_SRCU_H

#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/rcu_segcblist.h>

struct srcu_struct;

/*
 * init_srcu_struct() comes in two forms.
 *
 * With CONFIG_DEBUG_LOCK_ALLOC it is a macro: each expansion declares its
 * own static lock_class_key and passes it, together with the stringified
 * argument as the name, to __init_srcu_struct().  Without it,
 * init_srcu_struct() is an ordinary function.
 *
 * __SRCU_DEP_MAP_INIT() expands to a .dep_map initializer in the debug
 * case and to nothing otherwise.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC

int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
                       struct lock_class_key *key);

#define init_srcu_struct(ssp) \
({ \
        static struct lock_class_key __srcu_key; \
        \
        __init_srcu_struct((ssp), #ssp, &__srcu_key); \
})

#define __SRCU_DEP_MAP_INIT(srcu_name)        .dep_map = { .name = #srcu_name },
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

int init_srcu_struct(struct srcu_struct *ssp);

#define __SRCU_DEP_MAP_INIT(srcu_name)
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

#ifdef CONFIG_TINY_SRCU
#include <linux/srcutiny.h>
#elif defined(CONFIG_TREE_SRCU)
#include <linux/srcutree.h>
#else
#error "Unknown SRCU implementation specified to kernel configuration"
#endif

/*
 * SRCU entry points that are only declared in this header; their
 * definitions live elsewhere.  The double-underscore __srcu_read_lock()
 * and __srcu_read_unlock() are the raw primitives that the inline
 * srcu_read_lock()/srcu_read_unlock() family below wraps.
 */
void call_srcu(struct srcu_struct *ssp, struct rcu_head *head,
                void (*func)(struct rcu_head *head));
void cleanup_srcu_struct(struct srcu_struct *ssp);
int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
void synchronize_srcu(struct srcu_struct *ssp);
unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp);
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie);

/*
 * NMI-safe raw read-side primitives.  With CONFIG_NEED_SRCU_NMI_SAFE they
 * are separate out-of-line functions; without it they simply forward to
 * __srcu_read_lock() and __srcu_read_unlock().
 */
#ifdef CONFIG_NEED_SRCU_NMI_SAFE
int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp);
void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp);
#else
static inline int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
{
        return __srcu_read_lock(ssp);
}
static inline void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
{
        __srcu_read_unlock(ssp, idx);
}
#endif /* CONFIG_NEED_SRCU_NMI_SAFE */

void srcu_init(void);

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/**
 * srcu_read_lock_held - might we be in SRCU read-side critical section?
 * @ssp: The srcu_struct structure to check
 *
 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU
 * read-side critical section.  In absence of CONFIG_DEBUG_LOCK_ALLOC,
 * this assumes we are in an SRCU read-side critical section unless it can
 * prove otherwise.
 *
 * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
 * and while lockdep is disabled.
 *
 * Note that SRCU is based on its own state machine and does not
 * rely on normal RCU, so it can be called from a CPU that is in
 * the idle loop from an RCU point of view, or that is offline.
 */
static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
{
        /* While lockdep-RCU checking is disabled, claim the lock is held
         * so that callers do not report false positives.
         */
        return debug_lockdep_rcu_enabled() ? lock_is_held(&ssp->dep_map) : 1;
}

/*
 * Annotations provide deadlock detection for SRCU.
 *
 * Similar to other lockdep annotations, except there is an additional
 * srcu_lock_sync(), which is basically an empty *write*-side critical section,
 * see lock_sync() for more information.
 */

/* Annotates a srcu_read_lock(): records a read acquisition of @map. */
static inline void srcu_lock_acquire(struct lockdep_map *map)
{
        lock_map_acquire_read(map);
}

/* Annotates a srcu_read_unlock(): records the release of @map. */
static inline void srcu_lock_release(struct lockdep_map *map)
{
        lock_map_release(map);
}

/* Annotates a synchronize_srcu(): an empty write-side critical section
 * on @map, see lock_sync().
 */
static inline void srcu_lock_sync(struct lockdep_map *map)
{
        lock_map_sync(map);
}

#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/*
 * Without CONFIG_DEBUG_LOCK_ALLOC there is no lockdep state to consult:
 * srcu_read_lock_held() always reports 1 and the annotation helpers
 * expand to empty statements.
 */
static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
{
        return 1;
}

#define srcu_lock_acquire(m) do { } while (0)
#define srcu_lock_release(m) do { } while (0)
#define srcu_lock_sync(m) do { } while (0)

#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/*
 * NMI-safety markers.  NOTE(review): presumably these record which
 * flavour of reader (plain vs. _nmisafe) an srcu_struct has been used
 * with; their consumers are not visible in this header -- confirm.
 */
#define SRCU_NMI_UNKNOWN        0x0
#define SRCU_NMI_UNSAFE                0x1
#define SRCU_NMI_SAFE                0x2

/*
 * srcu_check_nmi_safety() is a real function only when both
 * CONFIG_PROVE_RCU and CONFIG_TREE_SRCU are enabled; in every other
 * configuration it is an empty inline.
 */
#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU)
void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe);
#else
static inline void srcu_check_nmi_safety(struct srcu_struct *ssp,
                                         bool nmi_safe) { }
#endif


/**
 * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct, which is used to check that we
 *        really are in an SRCU read-side critical section.
 * @c: condition to check for update-side use
 *
 * If PROVE_RCU is enabled, invoking this outside of an SRCU read-side
 * critical section will result in an RCU-lockdep splat, unless @c evaluates
 * to 1.  The @c argument will normally be a logical expression containing
 * lockdep_is_held() calls.
 *
 * The check handed to __rcu_dereference_check() is
 * "(c) || srcu_read_lock_held(ssp)", so srcu_read_lock_held() is only
 * evaluated when @c is false.
 */
#define srcu_dereference_check(p, ssp, c) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), \
                                (c) || srcu_read_lock_held(ssp), __rcu)

/**
 * srcu_dereference - fetch SRCU-protected pointer for later dereferencing
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct, which is used to check that we
 *        really are in an SRCU read-side critical section.
 *
 * Makes srcu_dereference_check() do the dirty work, passing 0 as the
 * update-side condition.  If PROVE_RCU is enabled, invoking this outside
 * of an SRCU read-side critical section will result in an RCU-lockdep
 * splat.
 */
#define srcu_dereference(p, ssp) srcu_dereference_check((p), (ssp), 0)

/**
 * srcu_dereference_notrace - no tracing and no lockdep calls from here
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct, which is used to check that we
 *        really are in an SRCU read-side critical section.
 *
 * Passes 1 as the condition to srcu_dereference_check(), which
 * short-circuits the "||" there so that srcu_read_lock_held() is never
 * evaluated.
 */
#define srcu_dereference_notrace(p, ssp) srcu_dereference_check((p), (ssp), 1)

/**
 * srcu_read_lock - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter an SRCU read-side critical section.  Note that SRCU read-side
 * critical sections may be nested.  However, it is illegal to
 * call anything that waits on an SRCU grace period for the same
 * srcu_struct, whether directly or indirectly.  Please note that
 * one way to indirectly wait on an SRCU grace period is to acquire
 * a mutex that is held elsewhere while calling synchronize_srcu() or
 * synchronize_srcu_expedited().
 *
 * Note that srcu_read_lock() and the matching srcu_read_unlock() must
 * occur in the same context, for example, it is illegal to invoke
 * srcu_read_unlock() in an irq handler if the matching srcu_read_lock()
 * was invoked in process context.
 */
static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
{
        int idx;

        /* This is the non-NMI-safe flavour of reader. */
        srcu_check_nmi_safety(ssp, false);

        idx = __srcu_read_lock(ssp);

        /* Lockdep annotation is made after the reader is registered. */
        srcu_lock_acquire(&ssp->dep_map);

        /* The caller must hand this value back to srcu_read_unlock(). */
        return idx;
}

/**
 * srcu_read_lock_nmisafe - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter an SRCU read-side critical section, but in an NMI-safe manner.
 * See srcu_read_lock() for more information.
 */
static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp)
{
        int idx;

        /* This is the NMI-safe flavour of reader. */
        srcu_check_nmi_safety(ssp, true);

        idx = __srcu_read_lock_nmisafe(ssp);

        /* Unlike srcu_read_lock(), the lockdep annotation here goes
         * through rcu_try_lock_acquire().
         */
        rcu_try_lock_acquire(&ssp->dep_map);

        return idx;
}

/*
 * Reader entry for use by tracing code: it cannot itself be traced and
 * cannot invoke lockdep, so no lockdep annotation is made here.
 */
static inline notrace int
srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp)
{
        srcu_check_nmi_safety(ssp, false);
        return __srcu_read_lock(ssp);
}

/**
 * srcu_down_read - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter a semaphore-like SRCU read-side critical section.  Note that
 * SRCU read-side critical sections may be nested.  However, it is
 * illegal to call anything that waits on an SRCU grace period for the
 * same srcu_struct, whether directly or indirectly.  Please note that
 * one way to indirectly wait on an SRCU grace period is to acquire
 * a mutex that is held elsewhere while calling synchronize_srcu() or
 * synchronize_srcu_expedited().  But if you want lockdep to help you
 * keep this stuff straight, you should instead use srcu_read_lock().
 *
 * The semaphore-like nature of srcu_down_read() means that the matching
 * srcu_up_read() can be invoked from some other context, for example,
 * from some other task or from an irq handler.  However, neither
 * srcu_down_read() nor srcu_up_read() may be invoked from an NMI handler.
 *
 * Calls to srcu_down_read() may be nested, similar to the manner in
 * which calls to down_read() may be nested.
 */
static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp)
{
        int idx;

        /* Not permitted from NMI handlers; warn once if that happens. */
        WARN_ON_ONCE(in_nmi());
        srcu_check_nmi_safety(ssp, false);

        /* No lockdep acquire annotation is made on this path. */
        idx = __srcu_read_lock(ssp);
        return idx;
}

/**
 * srcu_read_unlock - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_read_lock().
 *
 * Exit an SRCU read-side critical section.
 */
static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx)
        __releases(ssp)
{
        /* Warn once if @idx has any bit other than bit 0 set. */
        WARN_ON_ONCE(idx & ~0x1);
        srcu_check_nmi_safety(ssp, false);
        /* Drop the lockdep annotation before the reader is unregistered. */
        srcu_lock_release(&ssp->dep_map);
        __srcu_read_unlock(ssp, idx);
}

/**
 * srcu_read_unlock_nmisafe - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_read_lock_nmisafe().
 *
 * Exit an SRCU read-side critical section, but in an NMI-safe manner.
 */
static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
        __releases(ssp)
{
        /* Warn once if @idx has any bit other than bit 0 set. */
        WARN_ON_ONCE(idx & ~0x1);
        srcu_check_nmi_safety(ssp, true);
        /* Pairs with the rcu_try_lock_acquire() in srcu_read_lock_nmisafe(). */
        rcu_lock_release(&ssp->dep_map);
        __srcu_read_unlock_nmisafe(ssp, idx);
}

/*
 * Used by tracing, cannot be traced and cannot call lockdep.
 * Counterpart of srcu_read_lock_notrace(); @idx is not sanity-checked
 * here, unlike in srcu_read_unlock().
 */
static inline notrace void
srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp)
{
        srcu_check_nmi_safety(ssp, false);
        __srcu_read_unlock(ssp, idx);
}

/**
 * srcu_up_read - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_down_read().
 *
 * Exit an SRCU read-side critical section, but not necessarily from
 * the same context as the matching srcu_down_read().
 */
static inline void srcu_up_read(struct srcu_struct *ssp, int idx)
        __releases(ssp)
{
        /* Warn once if @idx has any bit other than bit 0 set. */
        WARN_ON_ONCE(idx & ~0x1);
        /* Like srcu_down_read(), this must not run in NMI context. */
        WARN_ON_ONCE(in_nmi());
        srcu_check_nmi_safety(ssp, false);
        __srcu_read_unlock(ssp, idx);
}

/**
 * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock
 *
 * Converts the preceding srcu_read_unlock into a two-way memory barrier.
 *
 * Call this after srcu_read_unlock, to guarantee that all memory operations
 * that occur after smp_mb__after_srcu_read_unlock will appear to happen after
 * the preceding srcu_read_unlock.
 *
 * The function body is intentionally empty; no additional barrier is
 * issued here.
 */
static inline void smp_mb__after_srcu_read_unlock(void)
{
        /* __srcu_read_unlock has smp_mb() internally so nothing to do here. */
}

/*
 * Lock guard named "srcu" over struct srcu_struct.  The guard carries an
 * extra "int idx" member: the first expression stores the
 * srcu_read_lock() return value in it and the second passes it back to
 * srcu_read_unlock().
 * NOTE(review): presumably these are the guard's acquire and release
 * expressions as defined by DEFINE_LOCK_GUARD_1 -- confirm against its
 * definition, which is not visible here.
 */
DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct,
                    _T->idx = srcu_read_lock(_T->lock),
                    srcu_read_unlock(_T->lock, _T->idx),
                    int idx)

#endif






















































































































































































































































































    1 
    1 
    3 
























































    1 









    1 





    1 







    1 











    1 


    1 
    1 


























    3 



    3 






    1 







    1 

























    1 

























    1 














    2 









    2 































    1 
    3 











    1 


    1 



























    3 
    1 






















    1 


    3 







    3 














    2 































    1 

    1 









    1 
    1 



    1 
    1 
    1 



    1 

    1 

    1 





    1 



    1 



    1 


    1 

    1 






















    2 





    2 
























































































    2 



















    1 





    1 








    1 














































































































    3 











    2 


    2 


    3 





    3 
    2 

    1 

    1 









































































































































































    1 
    3 




















































































    3 






    2 












    1 


    1 



    1 





    1 


    1 






    1 
    1 























    1 




    1 































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
 * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
 */

#include <linux/types.h>
#include <linux/timer.h>
#include <linux/module.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/spinlock.h>
#include <linux/skbuff.h>
#include <linux/ipv6.h>
#include <net/ip6_checksum.h>
#include <asm/unaligned.h>

#include <net/tcp.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_log.h>
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>

  /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
     closely.  They're more complex. --RR */

/* Printable names of the conntrack TCP states.  Entries are positional.
 * NOTE(review): the order presumably has to match the TCP_CONNTRACK_*
 * state values used to index it -- verify against the enum, which is
 * defined outside this file.
 */
static const char *const tcp_conntrack_names[] = {
        "NONE",
        "SYN_SENT",
        "SYN_RECV",
        "ESTABLISHED",
        "FIN_WAIT",
        "CLOSE_WAIT",
        "LAST_ACK",
        "TIME_WAIT",
        "CLOSE",
        "SYN_SENT2",
};

/* Three-way verdict: ignore, invalid or accept.
 * NOTE(review): the users of this enum are outside the visible part of
 * the file; presumably it is the result of per-packet TCP validation --
 * confirm.
 */
enum nf_ct_tcp_action {
        NFCT_TCP_IGNORE,
        NFCT_TCP_INVALID,
        NFCT_TCP_ACCEPT,
};

/* Postfix time-unit macros: each expands to a trailing multiplication, so
 * that "2 MINS" becomes "2 * 60 * HZ" and "5 DAYS" becomes
 * "5 * 24 * 60 * 60 * HZ".  They are only meaningful directly after a
 * number.
 */
#define SECS * HZ
#define MINS * 60 SECS
#define HOURS * 60 MINS
#define DAYS * 24 HOURS

/* Timeout table indexed by TCP_CONNTRACK_* value.  The values are
 * multiples of HZ, written with the SECS/MINS/DAYS suffix macros.
 */
static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = {
        [TCP_CONNTRACK_SYN_SENT]        = 2 MINS,
        [TCP_CONNTRACK_SYN_RECV]        = 60 SECS,
        [TCP_CONNTRACK_ESTABLISHED]        = 5 DAYS,
        [TCP_CONNTRACK_FIN_WAIT]        = 2 MINS,
        [TCP_CONNTRACK_CLOSE_WAIT]        = 60 SECS,
        [TCP_CONNTRACK_LAST_ACK]        = 30 SECS,
        [TCP_CONNTRACK_TIME_WAIT]        = 2 MINS,
        [TCP_CONNTRACK_CLOSE]                = 10 SECS,
        [TCP_CONNTRACK_SYN_SENT2]        = 2 MINS,
/* RFC1122 says the R2 limit should be at least 100 seconds.
   Linux uses 15 packets as limit, which corresponds
   to ~13-30min depending on RTO. */
        [TCP_CONNTRACK_RETRANS]                = 5 MINS,
        [TCP_CONNTRACK_UNACK]                = 5 MINS,
};

/* Three-letter aliases for the TCP_CONNTRACK_* states, used to keep the
 * state transition table below compact.  sIV (invalid) maps to
 * TCP_CONNTRACK_MAX and sIG (ignore) to TCP_CONNTRACK_IGNORE.
 */
#define sNO TCP_CONNTRACK_NONE
#define sSS TCP_CONNTRACK_SYN_SENT
#define sSR TCP_CONNTRACK_SYN_RECV
#define sES TCP_CONNTRACK_ESTABLISHED
#define sFW TCP_CONNTRACK_FIN_WAIT
#define sCW TCP_CONNTRACK_CLOSE_WAIT
#define sLA TCP_CONNTRACK_LAST_ACK
#define sTW TCP_CONNTRACK_TIME_WAIT
#define sCL TCP_CONNTRACK_CLOSE
#define sS2 TCP_CONNTRACK_SYN_SENT2
#define sIV TCP_CONNTRACK_MAX
#define sIG TCP_CONNTRACK_IGNORE

/* What TCP flags are set from RST/SYN/FIN/ACK.
 *
 * The six values match the size of the middle dimension of
 * tcp_conntracks[2][6][...], whose rows start in the same order
 * (syn, synack, fin, ...).
 */
enum tcp_bit_set {
        TCP_SYN_SET,
        TCP_SYNACK_SET,
        TCP_FIN_SET,
        TCP_ACK_SET,
        TCP_RST_SET,
        TCP_NONE_SET,
};

/*
 * The TCP state transition table needs a few words...
 *
 * We are the man in the middle. All the packets go through us
 * but might get lost in transit to the destination.
 * It is assumed that the destinations can't receive segments
 * we haven't seen.
 *
 * The checked segment is in window, but our windows are *not*
 * equivalent with the ones of the sender/receiver. We always
 * try to guess the state of the current sender.
 *
 * The meanings of the states are:
 *
 * NONE:        initial state
 * SYN_SENT:        SYN-only packet seen
 * SYN_SENT2:        SYN-only packet seen from reply dir, simultaneous open
 * SYN_RECV:        SYN-ACK packet seen
 * ESTABLISHED:        ACK packet seen
 * FIN_WAIT:        FIN packet seen
 * CLOSE_WAIT:        ACK seen (after FIN)
 * LAST_ACK:        FIN seen (after FIN)
 * TIME_WAIT:        last ACK seen
 * CLOSE:        closed connection (RST)
 *
 * Packets marked as IGNORED (sIG):
 *        if they may be either invalid or valid
 *        and the receiver may send back a connection
 *        closing RST or a SYN/ACK.
 *
 * Packets marked as INVALID (sIV):
 *        if we regard them as truly invalid packets
 *
 * The table is indexed as [direction][enum tcp_bit_set][current state]
 * and yields the new state (or sIG / sIV).
 */
static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
        {
/* ORIGINAL */
/*              sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2        */
/*syn*/           { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 },
/*
 *        sNO -> sSS        Initialize a new connection
 *        sSS -> sSS        Retransmitted SYN
 *        sS2 -> sS2        Late retransmitted SYN
 *        sSR -> sIG
 *        sES -> sIG        Error: SYNs in window outside the SYN_SENT state
 *                        are errors. Receiver will reply with RST
 *                        and close the connection.
 *                        Or we are not in sync and hold a dead connection.
 *        sFW -> sIG
 *        sCW -> sIG
 *        sLA -> sIG
 *        sTW -> sSS        Reopened connection (RFC 1122).
 *        sCL -> sSS
 */
/*              sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2        */
/*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR },
/*
 *        sNO -> sIV        Too late and no reason to do anything
 *        sSS -> sIV        Client can't send SYN and then SYN/ACK
 *        sS2 -> sSR        SYN/ACK sent to SYN2 in simultaneous open
 *        sSR -> sSR        Late retransmitted SYN/ACK in simultaneous open
 *        sES -> sIV        Invalid SYN/ACK packets sent by the client
 *        sFW -> sIV
 *        sCW -> sIV
 *        sLA -> sIV
 *        sTW -> sIV
 *        sCL -> sIV
 */
/*              sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2        */
/*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
/*
 *        sNO -> sIV        Too late and no reason to do anything...
 *        sSS -> sIV        Client might not send FIN in this state:
 *                        we enforce waiting for a SYN/ACK reply first.
 *        sS2 -> sIV
 *        sSR -> sFW        Close started.
 *        sES -> sFW
 *        sFW -> sLA        FIN seen in both directions, waiting for
 *                        the last ACK.
 *                        Might be a retransmitted FIN as well...
 *        sCW -> sLA
 *        sLA -> sLA        Retransmitted FIN. Remain in the same state.
 *        sTW -> sTW
 *        sCL -> sCL
 */
/*              sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2        */
/*ack*/           { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
/*
 *        sNO -> sES        Assumed.
 *        sSS -> sIV        ACK is invalid: we haven't seen a SYN/ACK yet.
 *        sS2 -> sIV
 *        sSR -> sES        Established state is reached.
 *        sES -> sES        :-)
 *        sFW -> sCW        Normal close request answered by ACK.
 *        sCW -> sCW
 *        sLA -> sTW        Last ACK detected (RFC5961 challenged)
 *        sTW -> sTW        Retransmitted last ACK. Remain in the same state.
 *        sCL -> sCL
 */
/*              sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2        */
/*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
/*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
        },
        {
/* REPLY */
/*              sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2        */
/*syn*/           { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sSS, sIV, sS2 },
/*
 *        sNO -> sIV        Never reached.
 *        sSS -> sS2        Simultaneous open
 *        sS2 -> sS2        Retransmitted simultaneous SYN
 *        sSR -> sIV        Invalid SYN packets sent by the server
 *        sES -> sIV
 *        sFW -> sIV
 *        sCW -> sIV
 *        sLA -> sIV
 *        sTW -> sSS        Reopened connection, but server may have switched role
 *        sCL -> sIV
 */
/*              sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2        */
/*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
/*
 *        sSS -> sSR        Standard open.
 *        sS2 -> sSR        Simultaneous open
 *        sSR -> sIG        Retransmitted SYN/ACK, ignore it.
 *        sES -> sIG        Late retransmitted SYN/ACK?
 *        sFW -> sIG        Might be SYN/ACK answering ignored SYN
 *        sCW -> sIG
 *        sLA -> sIG
 *        sTW -> sIG
 *        sCL -> sIG
 */
/*              sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2        */
/*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
/*
 *        sSS -> sIV        Server might not send FIN in this state.
 *        sS2 -> sIV
 *        sSR -> sFW        Close started.
 *        sES -> sFW
 *        sFW -> sLA        FIN seen in both directions.
 *        sCW -> sLA
 *        sLA -> sLA        Retransmitted FIN.
 *        sTW -> sTW
 *        sCL -> sCL
 */
/*              sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2        */
/*ack*/           { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG },
/*
 *        sSS -> sIG        Might be a half-open connection.
 *        sS2 -> sIG
 *        sSR -> sSR        Might answer late resent SYN.
 *        sES -> sES        :-)
 *        sFW -> sCW        Normal close request answered by ACK.
 *        sCW -> sCW
 *        sLA -> sTW        Last ACK detected (RFC5961 challenged)
 *        sTW -> sTW        Retransmitted last ACK.
 *        sCL -> sCL
 */
/*              sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2        */
/*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
/*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
        }
};

#ifdef CONFIG_NF_CONNTRACK_PROCFS
/*
 * Write the TCP-private part of a conntrack entry, i.e. the name of its
 * current TCP state followed by a space, to the seq_file.
 * Nothing is printed for entries that have the offload bit set.
 */
static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
{
        const bool offloaded = test_bit(IPS_OFFLOAD_BIT, &ct->status);

        if (!offloaded)
                seq_printf(s, "%s ",
                           tcp_conntrack_names[ct->proto.tcp.state]);
}
#endif

/*
 * Classify a TCP header by its RST/SYN/FIN/ACK bits.
 *
 * Returns the matching enum tcp_bit_set value. Precedence is
 * RST, then SYN (split into SYN and SYN/ACK), then FIN, then ACK.
 */
static unsigned int get_conntrack_index(const struct tcphdr *tcph)
{
        if (tcph->rst)
                return TCP_RST_SET;

        if (tcph->syn) {
                if (tcph->ack)
                        return TCP_SYNACK_SET;
                return TCP_SYN_SET;
        }

        if (tcph->fin)
                return TCP_FIN_SET;

        if (tcph->ack)
                return TCP_ACK_SET;

        return TCP_NONE_SET;
}

/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
   in IP Filter' by Guido van Rooij.

   http://www.sane.nl/events/sane2000/papers.html
   http://www.darkart.com/mirrors/www.obfuscation.org/ipf/

   The boundaries and the conditions are changed according to RFC793:
   the packet must intersect the window (i.e. segments may be
   after the right or before the left edge) and thus receivers may ACK
   segments after the right edge of the window.

        td_maxend = max(sack + max(win,1)) seen in reply packets
        td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
        td_maxwin += seq + len - sender.td_maxend
                        if seq + len > sender.td_maxend
        td_end    = max(seq + len) seen in sent packets

   I.   Upper bound for valid data:        seq <= sender.td_maxend
   II.  Lower bound for valid data:        seq + len >= sender.td_end - receiver.td_maxwin
   III.        Upper bound for valid (s)ack:   sack <= receiver.td_end
   IV.        Lower bound for valid (s)ack:        sack >= receiver.td_end - MAXACKWINDOW

   where sack is the highest right edge of sack block found in the packet
   or ack in the case of packet without SACK option.

   The upper bound limit for a valid (s)ack is not ignored -
   we don't have to deal with fragments.
*/

/*
 * Compute the sequence number just past the end of a segment.
 *
 * @seq:     sequence number of the segment (host byte order)
 * @len:     total length of the packet
 * @dataoff: offset of the TCP header inside the packet
 * @tcph:    TCP header; supplies the header length and the SYN/FIN bits
 *
 * The payload length is @len minus everything up to and including the TCP
 * header. SYN and FIN each occupy one extra sequence number.
 */
static inline __u32 segment_seq_plus_len(__u32 seq,
                                         size_t len,
                                         unsigned int dataoff,
                                         const struct tcphdr *tcph)
{
        /* XXX Should I use payload length field in IP/IPv6 header ?
         * - YK */
        const size_t hdr_bytes = (size_t)dataoff + tcph->doff * 4;
        __u32 end = seq + (__u32)(len - hdr_bytes);

        if (tcph->syn)
                end++;
        if (tcph->fin)
                end++;

        return end;
}

/*
 * Width of the window below receiver->td_end in which an (s)ack is still
 * accepted (rule IV of the window tracking description).
 *
 * MAXACKWINDOW(sender) evaluates to the larger of sender->td_maxwin and
 * MAXACKWINCONST. Note that the macro evaluates its argument more than once.
 *
 * Fixme: what about big packets?
 */
#define MAXACKWINCONST                        66000
#define MAXACKWINDOW(sender)                                                \
        ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin        \
                                              : MAXACKWINCONST)

/*
 * Simplified tcp_parse_options routine from tcp_input.c
 *
 * Walks the TCP options of a segment and records in @state whether
 * SACK-permitted and window scaling were offered, together with the
 * (clamped) window scale factor.
 *
 * @skb:     packet holding the options
 * @dataoff: offset of the TCP header inside @skb
 * @tcph:    TCP header; its doff field gives the option length
 * @state:   per-direction tracking state to update
 *
 * If there are no options, or they cannot be read from @skb, @state is left
 * untouched. Otherwise td_scale is cleared and every flag except
 * IP_CT_TCP_FLAG_BE_LIBERAL is dropped before parsing starts.
 */
static void tcp_options(const struct sk_buff *skb,
                        unsigned int dataoff,
                        const struct tcphdr *tcph,
                        struct ip_ct_tcp_state *state)
{
        unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
        const unsigned char *ptr;
        int length = (tcph->doff*4) - sizeof(struct tcphdr);

        if (!length)
                return;

        ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
                                 length, buff);
        if (!ptr)
                return;

        state->td_scale = 0;
        state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL;

        while (length > 0) {
                const int opcode = *ptr++;
                int opsize;

                if (opcode == TCPOPT_EOL)
                        return;

                if (opcode == TCPOPT_NOP) {
                        /* Single-byte padding option, RFC 793 section 3.1 */
                        length--;
                        continue;
                }

                /* Every other option carries a length byte. */
                if (length < 2)
                        return;
                opsize = *ptr++;

                /* Stop on "silly" (too short) or truncated options. */
                if (opsize < 2 || opsize > length)
                        return;

                if (opcode == TCPOPT_SACK_PERM &&
                    opsize == TCPOLEN_SACK_PERM) {
                        state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
                } else if (opcode == TCPOPT_WINDOW &&
                           opsize == TCPOLEN_WINDOW) {
                        state->td_scale = *ptr;
                        if (state->td_scale > TCP_MAX_WSCALE)
                                state->td_scale = TCP_MAX_WSCALE;

                        state->flags |= IP_CT_TCP_FLAG_WINDOW_SCALE;
                }

                /* Kind and length bytes were already consumed above. */
                ptr += opsize - 2;
                length -= opsize;
        }
}

/*
 * Look for a SACK option in the segment and raise *@sack to the highest
 * right edge found in its blocks.
 *
 * @skb:     packet holding the options
 * @dataoff: offset of the TCP header inside @skb
 * @tcph:    TCP header; its doff field gives the option length
 * @sack:    in/out: only updated when a block's right edge is after it
 *
 * Parsing stops at the first well-formed SACK option, at end-of-list, or at
 * the first malformed option.
 */
static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
                     const struct tcphdr *tcph, __u32 *sack)
{
        unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
        const unsigned char *ptr;
        int length = (tcph->doff*4) - sizeof(struct tcphdr);
        __u32 tmp;

        if (!length)
                return;

        ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
                                 length, buff);
        if (!ptr)
                return;

        /* Fast path for timestamp-only option */
        if (length == TCPOLEN_TSTAMP_ALIGNED
            && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)
                                       | (TCPOPT_NOP << 16)
                                       | (TCPOPT_TIMESTAMP << 8)
                                       | TCPOLEN_TIMESTAMP))
                return;

        while (length > 0) {
                const int opcode = *ptr++;
                int opsize, off;

                if (opcode == TCPOPT_EOL)
                        return;

                if (opcode == TCPOPT_NOP) {
                        /* Single-byte padding option, RFC 793 section 3.1 */
                        length--;
                        continue;
                }

                /* Every other option carries a length byte. */
                if (length < 2)
                        return;
                opsize = *ptr++;

                /* Stop on "silly" (too short) or truncated options. */
                if (opsize < 2 || opsize > length)
                        return;

                /* Not a SACK option made of whole blocks: skip over it. */
                if (opcode != TCPOPT_SACK ||
                    opsize < (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK) ||
                    ((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK)) {
                        ptr += opsize - 2;
                        length -= opsize;
                        continue;
                }

                for (off = 0;
                     off < (opsize - TCPOLEN_SACK_BASE);
                     off += TCPOLEN_SACK_PERBLOCK) {
                        /* Second 32-bit word of the block: its right edge. */
                        tmp = get_unaligned_be32((__be32 *)(ptr + off) + 1);

                        if (after(tmp, *sack))
                                *sack = tmp;
                }
                return;
        }
}

/*
 * (Re)initialise the tracking state of the sending direction from a SYN:
 * either a SYN-ACK in reply to a SYN, or a SYN from the reply direction
 * in a simultaneous open.
 *
 * @sender:   state of the direction that sent this packet; rewritten
 * @receiver: state of the opposite direction; only td_scale may be cleared
 * @skb, @dataoff, @tcph: the packet, used to parse its TCP options
 * @end:      sequence number just past this segment
 * @win:      advertised window of this segment (a zero window is stored as 1)
 * @dir:      direction of this packet
 */
static void tcp_init_sender(struct ip_ct_tcp_state *sender,
                            struct ip_ct_tcp_state *receiver,
                            const struct sk_buff *skb,
                            unsigned int dataoff,
                            const struct tcphdr *tcph,
                            u32 end, u32 win,
                            enum ip_conntrack_dir dir)
{
        sender->td_maxend = end;
        sender->td_end = end;
        sender->td_maxwin = win ? win : 1;

        tcp_options(skb, dataoff, tcph, sender);

        if (dir != IP_CT_DIR_REPLY)
                return;

        /* RFC 1323:
         * Both sides must send the Window Scale option
         * to enable window scaling in either direction.
         */
        if ((sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE) &&
            (receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
                return;

        sender->td_scale = 0;
        receiver->td_scale = 0;
}

/*
 * Log a packet that failed window tracking and hand back the verdict.
 *
 * @skb, @ct, @state: passed through to nf_ct_l4proto_log_invalid()
 * @sender: tracking state of the direction that sent the packet
 * @ret:    verdict to return when liberal mode is off
 * @fmt:    printf-style message, followed by its arguments
 *
 * In liberal mode (per-direction flag or per-netns setting) nothing is
 * logged and NFCT_TCP_ACCEPT is returned instead of @ret.
 */
__printf(6, 7)
static enum nf_ct_tcp_action nf_tcp_log_invalid(const struct sk_buff *skb,
                                                const struct nf_conn *ct,
                                                const struct nf_hook_state *state,
                                                const struct ip_ct_tcp_state *sender,
                                                enum nf_ct_tcp_action ret,
                                                const char *fmt, ...)
{
        const struct nf_tcp_net *tn = nf_tcp_pernet(nf_ct_net(ct));
        struct va_format vaf;
        va_list args;

        if ((sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL) || tn->tcp_be_liberal)
                return NFCT_TCP_ACCEPT;

        va_start(args, fmt);
        vaf.va = &args;
        vaf.fmt = fmt;
        nf_ct_l4proto_log_invalid(skb, ct, state, "%pV", &vaf);
        va_end(args);

        return ret;
}

/*
 * Check one TCP segment against the tracked windows of the connection and
 * update the per-direction tracking state.
 *
 * @ct:         conntrack entry; the window state lives in ct->proto.tcp
 * @dir:        direction of this packet; seen[dir] is the sender and
 *              seen[!dir] the receiver
 * @index:      flag class of the packet (enum tcp_bit_set); only
 *              TCP_ACK_SET packets take part in retransmission counting
 * @skb:        the packet
 * @dataoff:    offset of the TCP header inside @skb
 * @tcph:       TCP header of the packet
 * @hook_state: only passed on to the invalid-packet logger
 *
 * Returns NFCT_TCP_ACCEPT when the segment passes all bounds checks, or when
 * a SYN without ACK (re)initialises the sender state. Otherwise returns the
 * result of nf_tcp_log_invalid(), called with NFCT_TCP_IGNORE or
 * NFCT_TCP_INVALID.
 *
 * The bounds checked are rules I-IV from the window tracking description
 * earlier in this file.
 */
static enum nf_ct_tcp_action
tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir,
              unsigned int index, const struct sk_buff *skb,
              unsigned int dataoff, const struct tcphdr *tcph,
              const struct nf_hook_state *hook_state)
{
        struct ip_ct_tcp *state = &ct->proto.tcp;
        struct ip_ct_tcp_state *sender = &state->seen[dir];
        struct ip_ct_tcp_state *receiver = &state->seen[!dir];
        __u32 seq, ack, sack, end, win, swin;
        bool in_recv_win, seq_ok;
        s32 receiver_offset;
        u16 win_raw;

        /*
         * Get the required data from the packet.
         */
        seq = ntohl(tcph->seq);
        ack = sack = ntohl(tcph->ack_seq);
        win_raw = ntohs(tcph->window);
        win = win_raw;
        end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);

        /* sack stays equal to ack unless a SACK block raises it. */
        if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
                tcp_sack(skb, dataoff, tcph, &sack);

        /* Take into account NAT sequence number mangling */
        receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1);
        ack -= receiver_offset;
        sack -= receiver_offset;

        /* td_maxwin == 0 means nothing was recorded for this direction yet. */
        if (sender->td_maxwin == 0) {
                /*
                 * Initialize sender data.
                 */
                if (tcph->syn) {
                        tcp_init_sender(sender, receiver,
                                        skb, dataoff, tcph,
                                        end, win, dir);
                        if (!tcph->ack)
                                /* Simultaneous open */
                                return NFCT_TCP_ACCEPT;
                } else {
                        /*
                         * We are in the middle of a connection,
                         * its history is lost for us.
                         * Let's try to use the data from the packet.
                         */
                        sender->td_end = end;
                        swin = win << sender->td_scale;
                        sender->td_maxwin = (swin == 0 ? 1 : swin);
                        sender->td_maxend = end + sender->td_maxwin;
                        if (receiver->td_maxwin == 0) {
                                /* We haven't seen traffic in the other
                                 * direction yet but we have to tweak window
                                 * tracking to pass III and IV until that
                                 * happens.
                                 */
                                receiver->td_end = receiver->td_maxend = sack;
                        } else if (sack == receiver->td_end + 1) {
                                /* Likely a reply to a keepalive.
                                 * Needed for III.
                                 */
                                receiver->td_end++;
                        }

                }
        } else if (tcph->syn &&
                   after(end, sender->td_end) &&
                   (state->state == TCP_CONNTRACK_SYN_SENT ||
                    state->state == TCP_CONNTRACK_SYN_RECV)) {
                /*
                 * RFC 793: "if a TCP is reinitialized ... then it need
                 * not wait at all; it must only be sure to use sequence
                 * numbers larger than those recently used."
                 *
                 * Re-init state for this direction, just like for the first
                 * syn(-ack) reply, it might differ in seq, ack or tcp options.
                 */
                tcp_init_sender(sender, receiver,
                                skb, dataoff, tcph,
                                end, win, dir);

                if (dir == IP_CT_DIR_REPLY && !tcph->ack)
                        return NFCT_TCP_ACCEPT;
        }

        if (!(tcph->ack)) {
                /*
                 * If there is no ACK, just pretend it was set and OK.
                 */
                ack = sack = receiver->td_end;
        } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
                    (TCP_FLAG_ACK|TCP_FLAG_RST))
                   && (ack == 0)) {
                /*
                 * Broken TCP stacks, that set ACK in RST packets as well
                 * with zero ack value.
                 */
                ack = sack = receiver->td_end;
        }

        if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)
                /*
                 * RST sent answering SYN.
                 */
                seq = end = sender->td_end;

        /* Rule I: seq <= sender.td_maxend */
        seq_ok = before(seq, sender->td_maxend + 1);
        if (!seq_ok) {
                u32 overshot = end - sender->td_maxend + 1;
                bool ack_ok;

                /* Rules IV and II, evaluated to see if only rule I failed. */
                ack_ok = after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1);
                in_recv_win = receiver->td_maxwin &&
                              after(end, sender->td_end - receiver->td_maxwin - 1);

                if (in_recv_win &&
                    ack_ok &&
                    overshot <= receiver->td_maxwin &&
                    before(sack, receiver->td_end + 1)) {
                        /* Work around TCPs that send more bytes than allowed by
                         * the receive window.
                         *
                         * If the (marked as invalid) packet is allowed to pass by
                         * the ruleset and the peer acks this data, then its possible
                         * all future packets will trigger 'ACK is over upper bound' check.
                         *
                         * Thus if only the sequence check fails then do update td_end so
                         * possible ACK for this data can update internal state.
                         */
                        sender->td_end = end;
                        sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;

                        return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE,
                                                  "%u bytes more than expected", overshot);
                }

                return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID,
                                          "SEQ is over upper bound %u (over the window of the receiver)",
                                          sender->td_maxend + 1);
        }

        /* Rule III: sack <= receiver.td_end */
        if (!before(sack, receiver->td_end + 1))
                return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID,
                                          "ACK is over upper bound %u (ACKed data not seen yet)",
                                          receiver->td_end + 1);

        /* Is the ending sequence in the receive window (if available)? */
        /* Rule II: seq + len >= sender.td_end - receiver.td_maxwin */
        in_recv_win = !receiver->td_maxwin ||
                      after(end, sender->td_end - receiver->td_maxwin - 1);
        if (!in_recv_win)
                return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE,
                                          "SEQ is under lower bound %u (already ACKed data retransmitted)",
                                          sender->td_end - receiver->td_maxwin - 1);
        /* Rule IV: sack >= receiver.td_end - MAXACKWINDOW */
        if (!after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1))
                return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE,
                                          "ignored ACK under lower bound %u (possible overly delayed)",
                                          receiver->td_end - MAXACKWINDOW(sender) - 1);

        /* Take into account window scaling (RFC 1323). */
        if (!tcph->syn)
                win <<= sender->td_scale;

        /* Update sender data. */
        swin = win + (sack - ack);
        if (sender->td_maxwin < swin)
                sender->td_maxwin = swin;
        if (after(end, sender->td_end)) {
                sender->td_end = end;
                sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
        }
        if (tcph->ack) {
                /* Track the highest ack sent so far in this direction. */
                if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
                        sender->td_maxack = ack;
                        sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
                } else if (after(ack, sender->td_maxack)) {
                        sender->td_maxack = ack;
                }
        }

        /* Update receiver data. */
        if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
                receiver->td_maxwin += end - sender->td_maxend;
        if (after(sack + win, receiver->td_maxend - 1)) {
                receiver->td_maxend = sack + win;
                if (win == 0)
                        receiver->td_maxend++;
        }
        if (ack == receiver->td_end)
                receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;

        /* Check retransmissions. */
        if (index == TCP_ACK_SET) {
                /* An identical pure ACK from the same direction bumps the
                 * counter; anything else becomes the new reference packet.
                 */
                if (state->last_dir == dir &&
                    state->last_seq == seq &&
                    state->last_ack == ack &&
                    state->last_end == end &&
                    state->last_win == win_raw) {
                        state->retrans++;
                } else {
                        state->last_dir = dir;
                        state->last_seq = seq;
                        state->last_ack = ack;
                        state->last_end = end;
                        state->last_win = win_raw;
                        state->retrans = 0;
                }
        }

        return NFCT_TCP_ACCEPT;
}

/*
 * React to a FIN or RST that conntrack did not accept.
 *
 * A connection should not linger in ESTABLISHED state for a long time merely
 * because conntrack judged a FIN/RST to be out of window. The first such
 * packet is only remembered (index and direction). When a later FIN/RST
 * arrives from the opposite direction, the timeout is lowered to the
 * TCP_CONNTRACK_UNACK value, as is done for unacknowledged data, so that
 * entries whose state is out of sync with the real connection go away sooner.
 *
 * Only assured entries without a fixed timeout are touched, and only when at
 * least 120 seconds remain and the remaining time exceeds the new timeout.
 */
static void __cold nf_tcp_handle_invalid(struct nf_conn *ct,
                                         enum ip_conntrack_dir dir,
                                         int index,
                                         const struct sk_buff *skb,
                                         const struct nf_hook_state *hook_state)
{
        const struct nf_tcp_net *tn;
        const unsigned int *timeouts;
        unsigned int timeout;
        bool answers_close;
        u32 expires;

        if (!test_bit(IPS_ASSURED_BIT, &ct->status) ||
            test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
                return;

        /* Only connection-closing packets are of interest here. */
        if (index != TCP_RST_SET && index != TCP_FIN_SET)
                return;

        answers_close = ct->proto.tcp.last_dir != dir &&
                        (ct->proto.tcp.last_index == TCP_FIN_SET ||
                         ct->proto.tcp.last_index == TCP_RST_SET);
        if (!answers_close) {
                /* Remember this packet for a later one from the other side. */
                ct->proto.tcp.last_index = index;
                ct->proto.tcp.last_dir = dir;
                return;
        }

        expires = nf_ct_expires(ct);
        if (expires < 120 * HZ)
                return;

        tn = nf_tcp_pernet(nf_ct_net(ct));
        timeouts = nf_ct_timeout_lookup(ct);
        if (!timeouts)
                timeouts = tn->timeouts;

        timeout = READ_ONCE(timeouts[TCP_CONNTRACK_UNACK]);
        if (expires <= timeout)
                return;

        nf_ct_l4proto_log_invalid(skb, ct, hook_state,
                                  "packet (index %d, dir %d) response for index %d lower timeout to %u",
                                  index, dir, ct->proto.tcp.last_index, timeout);

        WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp);
}

/*
 * table of valid flag combinations - PUSH, ECE and CWR are always valid
 *
 * Indexed by the TCP flag byte with PSH/ECE/CWR masked out (see tcp_error());
 * a non-zero entry marks the combination as acceptable. Combinations not
 * listed here are zero-initialised and therefore rejected.
 */
static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
                                 TCPHDR_URG) + 1] =
{
        [TCPHDR_SYN]                                = 1,
        [TCPHDR_SYN|TCPHDR_URG]                        = 1,
        [TCPHDR_SYN|TCPHDR_ACK]                        = 1,
        [TCPHDR_RST]                                = 1,
        [TCPHDR_RST|TCPHDR_ACK]                        = 1,
        [TCPHDR_FIN|TCPHDR_ACK]                        = 1,
        [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG]        = 1,
        [TCPHDR_ACK]                                = 1,
        [TCPHDR_ACK|TCPHDR_URG]                        = 1,
};

/*
 * Report a malformed TCP packet through nf_l4proto_log_invalid().
 *
 * @msg is passed as a "%s" argument rather than as the format string, so it
 * may safely contain '%' characters.
 */
static void tcp_error_log(const struct sk_buff *skb,
                          const struct nf_hook_state *state,
                          const char *msg)
{
        nf_l4proto_log_invalid(skb, state, IPPROTO_TCP, "%s", msg);
}

/*
 * Protect conntrack against broken packets. Code taken from ipt_unclean.c.
 *
 * @th:      TCP header of the packet
 * @skb:     the packet
 * @dataoff: offset of the TCP header inside @skb
 * @state:   hook state (netns, hook number, protocol family)
 *
 * Returns true, after logging the reason, when the packet is truncated,
 * fails checksum validation or carries an invalid flag combination;
 * false when it looks sane.
 */
static bool tcp_error(const struct tcphdr *th,
                      struct sk_buff *skb,
                      unsigned int dataoff,
                      const struct nf_hook_state *state)
{
        const unsigned int hdr_len = th->doff * 4;
        const unsigned int tcplen = skb->len - dataoff;
        u8 tcpflags;

        /* Not whole TCP header or malformed packet */
        if (hdr_len < sizeof(struct tcphdr) || tcplen < hdr_len) {
                tcp_error_log(skb, state, "truncated packet");
                return true;
        }

        /* Checksums are verified only when the sysctl enables it and only
         * at PRE_ROUTING: on the outgoing path the checksum is assumed
         * to be correct.
         */
        /* FIXME: Source route IP option packets --RR */
        if (state->net->ct.sysctl_checksum &&
            state->hook == NF_INET_PRE_ROUTING &&
            nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) {
                tcp_error_log(skb, state, "bad checksum");
                return true;
        }

        /* ECE, CWR and PSH never make a combination invalid: mask them. */
        tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
        if (tcp_valid_flags[tcpflags])
                return false;

        tcp_error_log(skb, state, "invalid tcp flag combination");
        return true;
}

/*
 * Set up the TCP tracking state for the first packet of a new conntrack.
 *
 * @ct:      conntrack entry being created (not yet visible to others)
 * @skb:     the packet
 * @dataoff: offset of the TCP header inside @skb
 * @th:      TCP header of the packet
 * @state:   hook state, used for logging only
 *
 * Returns false when the packet cannot start a connection: either the
 * transition table marks it invalid, or it is not a SYN and loose pickup
 * (tcp_loose) is disabled. Returns true once ct->proto.tcp is initialised.
 */
static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
                             unsigned int dataoff,
                             const struct tcphdr *th,
                             const struct nf_hook_state *state)
{
        struct ip_ct_tcp_state *origin = &ct->proto.tcp.seen[0];
        struct net *net = nf_ct_net(ct);
        const struct nf_tcp_net *tn = nf_tcp_pernet(net);
        enum tcp_conntrack new_state;

        /* Don't need lock here: this conntrack not in circulation yet */
        new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];

        /* Invalid: delete conntrack */
        if (new_state >= TCP_CONNTRACK_MAX) {
                tcp_error_log(skb, state, "invalid new");
                return false;
        }

        /* Don't try to pick up connections. */
        if (new_state != TCP_CONNTRACK_SYN_SENT && tn->tcp_loose == 0)
                return false;

        /* Common to both the SYN and the mid-stream pickup case. */
        memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
        origin->td_end = segment_seq_plus_len(ntohl(th->seq), skb->len,
                                              dataoff, th);
        origin->td_maxwin = ntohs(th->window);
        if (origin->td_maxwin == 0)
                origin->td_maxwin = 1;

        if (new_state == TCP_CONNTRACK_SYN_SENT) {
                /* SYN packet */
                origin->td_maxend = origin->td_end;

                tcp_options(skb, dataoff, th, origin);
        } else {
                /*
                 * We are in the middle of a connection,
                 * its history is lost for us.
                 * Let's try to use the data from the packet.
                 */
                origin->td_maxend = origin->td_end + origin->td_maxwin;

                /* We assume SACK and liberal window checking to handle
                 * window scaling */
                ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
                                              IP_CT_TCP_FLAG_BE_LIBERAL;
                origin->flags = ct->proto.tcp.seen[1].flags;
        }

        /* tcp_packet will set them */
        ct->proto.tcp.last_index = TCP_NONE_SET;
        return true;
}

/*
 * Tell whether a conntrack entry may be dropped early.
 *
 * Returns true for entries whose TCP state is one of the closing states
 * (FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE), false otherwise.
 */
static bool tcp_can_early_drop(const struct nf_conn *ct)
{
        switch (ct->proto.tcp.state) {
        case TCP_CONNTRACK_FIN_WAIT:
        case TCP_CONNTRACK_CLOSE_WAIT:
        case TCP_CONNTRACK_LAST_ACK:
        case TCP_CONNTRACK_TIME_WAIT:
        case TCP_CONNTRACK_CLOSE:
                return true;
        default:
                return false;
        }
}

void nf_conntrack_tcp_set_closing(struct nf_conn *ct)
{
        enum tcp_conntrack old_state;
        const unsigned int *timeouts;
        u32 timeout;

        if (!nf_ct_is_confirmed(ct))
                return;

        spin_lock_bh(&ct->lock);
        old_state = ct->proto.tcp.state;
        ct->proto.tcp.state = TCP_CONNTRACK_CLOSE;

        if (old_state == TCP_CONNTRACK_CLOSE ||
            test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
                spin_unlock_bh(&ct->lock);
                return;
        }

        timeouts = nf_ct_timeout_lookup(ct);
        if (!timeouts) {
                const struct nf_tcp_net *tn;

                tn = nf_tcp_pernet(nf_ct_net(ct));
                timeouts = tn->timeouts;
        }

        timeout = timeouts[TCP_CONNTRACK_CLOSE];
        WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp);

        spin_unlock_bh(&ct->lock);

        nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
}

/* Reset one direction's window-tracking state: every td_* field is
 * zeroed and all flags are dropped except IP_CT_TCP_FLAG_BE_LIBERAL,
 * which is preserved if it was set.
 */
static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state)
{
        state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL;

        state->td_scale = 0;
        state->td_maxack = 0;
        state->td_maxwin = 0;
        state->td_maxend = 0;
        state->td_end = 0;
}

/* Track one TCP packet belonging to conntrack @ct.
 *
 * @ct:      conntrack entry the packet was matched to
 * @skb:     the packet
 * @dataoff: offset of the TCP header inside @skb
 * @ctinfo:  conntrack info for this packet; its direction is derived
 *           with CTINFO2DIR()
 * @state:   hook state, passed on to the error/logging helpers
 *
 * Outline:
 *  1. Fetch and sanity-check the TCP header (tcp_error()); for an
 *     unconfirmed conntrack let tcp_new() set up the initial state.
 *  2. Under ct->lock, look up the candidate next state in
 *     tcp_conntracks[dir][index][old_state] and handle the special
 *     cases in the switch below (reopen, ignored packets, synproxy
 *     keep-alive, challenge ACKs, simultaneous open, RST validation).
 *  3. Unless a special case jumped to in_window, validate the segment
 *     with tcp_in_window().
 *  4. Commit the new state, select a timeout and refresh the entry.
 *
 * Returns verdict for packet:
 *   NF_ACCEPT   packet accepted (state may or may not have changed),
 *   -NF_ACCEPT  packet is invalid,
 *   -NF_REPEAT  the old entry was killed, caller should look up again,
 *   NF_DROP     the old entry could not be killed (see nf_ct_kill() below).
 */
int nf_conntrack_tcp_packet(struct nf_conn *ct,
                            struct sk_buff *skb,
                            unsigned int dataoff,
                            enum ip_conntrack_info ctinfo,
                            const struct nf_hook_state *state)
{
        struct net *net = nf_ct_net(ct);
        struct nf_tcp_net *tn = nf_tcp_pernet(net);
        enum tcp_conntrack new_state, old_state;
        unsigned int index, *timeouts;
        enum nf_ct_tcp_action res;
        enum ip_conntrack_dir dir;
        const struct tcphdr *th;
        struct tcphdr _tcph;
        unsigned long timeout;

        /* _tcph is the on-stack buffer handed to skb_header_pointer() */
        th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
        if (th == NULL)
                return -NF_ACCEPT;

        if (tcp_error(th, skb, dataoff, state))
                return -NF_ACCEPT;

        /* First packet of an unconfirmed entry: tcp_new() decides whether
         * we start tracking it at all.
         */
        if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th, state))
                return -NF_ACCEPT;

        spin_lock_bh(&ct->lock);
        old_state = ct->proto.tcp.state;
        dir = CTINFO2DIR(ctinfo);
        index = get_conntrack_index(th);
        /* Candidate next state from the transition table */
        new_state = tcp_conntracks[dir][index][old_state];

        switch (new_state) {
        case TCP_CONNTRACK_SYN_SENT:
                if (old_state < TCP_CONNTRACK_TIME_WAIT)
                        break;
                /* RFC 1122: "When a connection is closed actively,
                 * it MUST linger in TIME-WAIT state for a time 2xMSL
                 * (Maximum Segment Lifetime). However, it MAY accept
                 * a new SYN from the remote TCP to reopen the connection
                 * directly from TIME-WAIT state, if..."
                 * We ignore the conditions because we are in the
                 * TIME-WAIT state anyway.
                 *
                 * Handle aborted connections: we and the server
                 * think there is an existing connection but the client
                 * aborts it and starts a new one.
                 */
                if (((ct->proto.tcp.seen[dir].flags
                      | ct->proto.tcp.seen[!dir].flags)
                     & IP_CT_TCP_FLAG_CLOSE_INIT)
                    || (ct->proto.tcp.last_dir == dir
                        && ct->proto.tcp.last_index == TCP_RST_SET)) {
                        /* Attempt to reopen a closed/aborted connection.
                         * Delete this connection and look up again. */
                        spin_unlock_bh(&ct->lock);

                        /* Only repeat if we can actually remove the timer.
                         * Destruction may already be in progress in process
                         * context and we must give it a chance to terminate.
                         */
                        if (nf_ct_kill(ct))
                                return -NF_REPEAT;
                        return NF_DROP;
                }
                fallthrough;
        case TCP_CONNTRACK_IGNORE:
                /* Ignored packets:
                 *
                 * Our connection entry may be out of sync, so ignore
                 * packets which may signal the real connection between
                 * the client and the server.
                 *
                 * a) SYN in ORIGINAL
                 * b) SYN/ACK in REPLY
                 * c) ACK in reply direction after initial SYN in original.
                 *
                 * If the ignored packet is invalid, the receiver will send
                 * a RST we'll catch below.
                 */
                if (index == TCP_SYNACK_SET
                    && ct->proto.tcp.last_index == TCP_SYN_SET
                    && ct->proto.tcp.last_dir != dir
                    && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
                        /* b) This SYN/ACK acknowledges a SYN that we earlier
                         * ignored as invalid. This means that the client and
                         * the server are both in sync, while the firewall is
                         * not. We get in sync from the previously annotated
                         * values.
                         */
                        old_state = TCP_CONNTRACK_SYN_SENT;
                        new_state = TCP_CONNTRACK_SYN_RECV;
                        ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
                                ct->proto.tcp.last_end;
                        ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
                                ct->proto.tcp.last_end;
                        ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
                                ct->proto.tcp.last_win == 0 ?
                                        1 : ct->proto.tcp.last_win;
                        ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
                                ct->proto.tcp.last_wscale;
                        ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
                        ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
                                ct->proto.tcp.last_flags;
                        nf_ct_tcp_state_reset(&ct->proto.tcp.seen[dir]);
                        break;
                }
                /* Remember what we ignored so a later packet can be matched
                 * against it (see case b) above and the RST handling below).
                 */
                ct->proto.tcp.last_index = index;
                ct->proto.tcp.last_dir = dir;
                ct->proto.tcp.last_seq = ntohl(th->seq);
                ct->proto.tcp.last_end =
                    segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
                ct->proto.tcp.last_win = ntohs(th->window);

                /* a) This is a SYN in ORIGINAL. The client and the server
                 * may be in sync but we are not. In that case, we annotate
                 * the TCP options and let the packet go through. If it is a
                 * valid SYN packet, the server will reply with a SYN/ACK, and
                 * then we'll get in sync. Otherwise, the server potentially
                 * responds with a challenge ACK if implementing RFC5961.
                 */
                if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
                        struct ip_ct_tcp_state seen = {};

                        ct->proto.tcp.last_flags =
                        ct->proto.tcp.last_wscale = 0;
                        tcp_options(skb, dataoff, th, &seen);
                        if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
                                ct->proto.tcp.last_flags |=
                                        IP_CT_TCP_FLAG_WINDOW_SCALE;
                                ct->proto.tcp.last_wscale = seen.td_scale;
                        }
                        if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
                                ct->proto.tcp.last_flags |=
                                        IP_CT_TCP_FLAG_SACK_PERM;
                        }
                        /* Mark the potential for an RFC5961 challenge ACK;
                         * this poses a special problem for LAST_ACK state
                         * as the ACK is interpreted as ACKing the last FIN.
                         */
                        if (old_state == TCP_CONNTRACK_LAST_ACK)
                                ct->proto.tcp.last_flags |=
                                        IP_CT_EXP_CHALLENGE_ACK;
                }

                /* possible challenge ack reply to syn */
                if (old_state == TCP_CONNTRACK_SYN_SENT &&
                    index == TCP_ACK_SET &&
                    dir == IP_CT_DIR_REPLY)
                        ct->proto.tcp.last_ack = ntohl(th->ack_seq);

                spin_unlock_bh(&ct->lock);
                nf_ct_l4proto_log_invalid(skb, ct, state,
                                          "packet (index %d) in dir %d ignored, state %s",
                                          index, dir,
                                          tcp_conntrack_names[old_state]);
                return NF_ACCEPT;
        case TCP_CONNTRACK_MAX:
                /* Special case for SYN proxy: when the SYN to the server or
                 * the SYN/ACK from the server is lost, the client may transmit
                 * a keep-alive packet while in SYN_SENT state. This needs to
                 * be associated with the original conntrack entry in order to
                 * generate a new SYN with the correct sequence number.
                 */
                if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT &&
                    index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL &&
                    ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL &&
                    ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) {
                        pr_debug("nf_ct_tcp: SYN proxy client keep alive\n");
                        spin_unlock_bh(&ct->lock);
                        return NF_ACCEPT;
                }

                /* Invalid packet */
                spin_unlock_bh(&ct->lock);
                nf_ct_l4proto_log_invalid(skb, ct, state,
                                          "packet (index %d) in dir %d invalid, state %s",
                                          index, dir,
                                          tcp_conntrack_names[old_state]);
                return -NF_ACCEPT;
        case TCP_CONNTRACK_TIME_WAIT:
                /* RFC5961 compliance causes the stack to send a
                 * "challenge-ACK" e.g. in response to spurious SYNs.
                 * Conntrack MUST NOT believe this ACK is acking last FIN.
                 */
                if (old_state == TCP_CONNTRACK_LAST_ACK &&
                    index == TCP_ACK_SET &&
                    ct->proto.tcp.last_dir != dir &&
                    ct->proto.tcp.last_index == TCP_SYN_SET &&
                    (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) {
                        /* Detected RFC5961 challenge ACK */
                        ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
                        spin_unlock_bh(&ct->lock);
                        nf_ct_l4proto_log_invalid(skb, ct, state, "challenge-ack ignored");
                        return NF_ACCEPT; /* Don't change state */
                }
                break;
        case TCP_CONNTRACK_SYN_SENT2:
                /* tcp_conntracks table is not smart enough to handle
                 * simultaneous open.
                 */
                ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN;
                break;
        case TCP_CONNTRACK_SYN_RECV:
                /* ACK in reply direction after a simultaneous open was
                 * flagged above: go straight to ESTABLISHED.
                 */
                if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET &&
                    ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN)
                        new_state = TCP_CONNTRACK_ESTABLISHED;
                break;
        case TCP_CONNTRACK_CLOSE:
                /* Only RST segments get the extra validation below */
                if (index != TCP_RST_SET)
                        break;

                /* If we are closing, tuple might have been re-used already.
                 * last_index, last_ack, and all other ct fields used for
                 * sequence/window validation are outdated in that case.
                 *
                 * As the conntrack can already be expired by GC under pressure,
                 * just skip validation checks.
                 */
                if (tcp_can_early_drop(ct))
                        goto in_window;

                /* td_maxack might be outdated if we let a SYN through earlier */
                if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) &&
                    ct->proto.tcp.last_index != TCP_SYN_SET) {
                        u32 seq = ntohl(th->seq);

                        /* If we are not in established state and SEQ=0 this is most
                         * likely an answer to a SYN we let go through above (last_index
                         * can be updated due to out-of-order ACKs).
                         */
                        if (seq == 0 && !nf_conntrack_tcp_established(ct))
                                break;

                        if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) &&
                            !tn->tcp_ignore_invalid_rst) {
                                /* Invalid RST  */
                                spin_unlock_bh(&ct->lock);
                                nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst");
                                return -NF_ACCEPT;
                        }

                        if (!nf_conntrack_tcp_established(ct) ||
                            seq == ct->proto.tcp.seen[!dir].td_maxack)
                                break;

                        /* Check if rst is part of train, such as
                         *   foo:80 > bar:4379: P, 235946583:235946602(19) ack 42
                         *   foo:80 > bar:4379: R, 235946602:235946602(0)  ack 42
                         */
                        if (ct->proto.tcp.last_index == TCP_ACK_SET &&
                            ct->proto.tcp.last_dir == dir &&
                            seq == ct->proto.tcp.last_end)
                                break;

                        /* ... RST sequence number doesn't match exactly, keep
                         * established state to allow a possible challenge ACK.
                         */
                        new_state = old_state;
                }
                if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
                         && ct->proto.tcp.last_index == TCP_SYN_SET)
                        || (!test_bit(IPS_ASSURED_BIT, &ct->status)
                            && ct->proto.tcp.last_index == TCP_ACK_SET))
                    && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
                        /* RST sent to invalid SYN or ACK we had let through
                         * at a) and c) above:
                         *
                         * a) SYN was in window then
                         * c) we hold a half-open connection.
                         *
                         * Delete our connection entry.
                         * We skip window checking, because packet might ACK
                         * segments we ignored. */
                        goto in_window;
                }

                /* Reset in response to a challenge-ack we let through earlier */
                if (old_state == TCP_CONNTRACK_SYN_SENT &&
                    ct->proto.tcp.last_index == TCP_ACK_SET &&
                    ct->proto.tcp.last_dir == IP_CT_DIR_REPLY &&
                    ntohl(th->seq) == ct->proto.tcp.last_ack)
                        goto in_window;

                break;
        default:
                /* Keep compilers happy. */
                break;
        }

        /* Sequence/window validation; ct->lock is still held here */
        res = tcp_in_window(ct, dir, index,
                            skb, dataoff, th, state);
        switch (res) {
        case NFCT_TCP_IGNORE:
                spin_unlock_bh(&ct->lock);
                return NF_ACCEPT;
        case NFCT_TCP_INVALID:
                nf_tcp_handle_invalid(ct, dir, index, skb, state);
                spin_unlock_bh(&ct->lock);
                return -NF_ACCEPT;
        case NFCT_TCP_ACCEPT:
                break;
        }
     in_window:
        /* From now on we have got in-window packets */
        ct->proto.tcp.last_index = index;
        ct->proto.tcp.last_dir = dir;

        ct->proto.tcp.state = new_state;
        /* Record which direction initiated the close */
        if (old_state != new_state
            && new_state == TCP_CONNTRACK_FIN_WAIT)
                ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;

        /* Per-conntrack timeout policy if one is attached, else the
         * per-netns defaults.
         */
        timeouts = nf_ct_timeout_lookup(ct);
        if (!timeouts)
                timeouts = tn->timeouts;

        /* Timeout selection, first match wins:
         *  - too many retransmissions: cap at the RETRANS timeout
         *  - RST: CLOSE timeout
         *  - unacknowledged data in either direction: cap at UNACK
         *  - last seen window was zero: cap at RETRANS
         *  - otherwise the regular timeout of the new state
         */
        if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
            timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
                timeout = timeouts[TCP_CONNTRACK_RETRANS];
        else if (unlikely(index == TCP_RST_SET))
                timeout = timeouts[TCP_CONNTRACK_CLOSE];
        else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
                 IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
                 timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK])
                timeout = timeouts[TCP_CONNTRACK_UNACK];
        else if (ct->proto.tcp.last_win == 0 &&
                 timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
                timeout = timeouts[TCP_CONNTRACK_RETRANS];
        else
                timeout = timeouts[new_state];
        spin_unlock_bh(&ct->lock);

        if (new_state != old_state)
                nf_conntrack_event_cache(IPCT_PROTOINFO, ct);

        if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
                /* If only reply is a RST, we can consider ourselves not to
                   have an established connection: this is a fairly common
                   problem case, so we can delete the conntrack
                   immediately.  --RR */
                if (th->rst) {
                        nf_ct_kill_acct(ct, ctinfo, skb);
                        return NF_ACCEPT;
                }

                if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) {
                        /* do not renew timeout on SYN retransmit.
                         *
                         * Else port reuse by client or NAT middlebox can keep
                         * entry alive indefinitely (including nat info).
                         */
                        return NF_ACCEPT;
                }

                /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
                 * pickup with loose=1. Avoid large ESTABLISHED timeout.
                 */
                if (new_state == TCP_CONNTRACK_ESTABLISHED &&
                    timeout > timeouts[TCP_CONNTRACK_UNACK])
                        timeout = timeouts[TCP_CONNTRACK_UNACK];
        } else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
                   && (old_state == TCP_CONNTRACK_SYN_RECV
                       || old_state == TCP_CONNTRACK_ESTABLISHED)
                   && new_state == TCP_CONNTRACK_ESTABLISHED) {
                /* Set ASSURED if we see valid ack in ESTABLISHED
                   after SYN_RECV or a valid answer for a picked up
                   connection. */
                set_bit(IPS_ASSURED_BIT, &ct->status);
                nf_conntrack_event_cache(IPCT_ASSURED, ct);
        }
        nf_ct_refresh_acct(ct, ctinfo, skb, timeout);

        return NF_ACCEPT;
}

#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

/* Dump the TCP private conntrack state into @skb as a nested
 * CTA_PROTOINFO_TCP attribute.
 *
 * The state is always emitted.  Window scale factors and per-direction
 * flags are emitted only when @destroy is false.  The conntrack lock is
 * held while the fields are read.
 *
 * Returns 0 on success, -1 if any attribute could not be added.
 */
static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
                         struct nf_conn *ct, bool destroy)
{
        struct nf_ct_tcp_flags flags_attr = {};
        struct nlattr *nest;
        int ret = -1;

        spin_lock_bh(&ct->lock);

        nest = nla_nest_start(skb, CTA_PROTOINFO_TCP);
        if (!nest)
                goto unlock;

        if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state))
                goto unlock;

        if (!destroy) {
                if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
                               ct->proto.tcp.seen[0].td_scale))
                        goto unlock;
                if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
                               ct->proto.tcp.seen[1].td_scale))
                        goto unlock;

                /* Only .flags is filled in; .mask stays zero. */
                flags_attr.flags = ct->proto.tcp.seen[0].flags;
                if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
                            sizeof(flags_attr), &flags_attr))
                        goto unlock;

                flags_attr.flags = ct->proto.tcp.seen[1].flags;
                if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
                            sizeof(flags_attr), &flags_attr))
                        goto unlock;
        }

        ret = 0;
unlock:
        spin_unlock_bh(&ct->lock);

        /* The nest is only closed on success, after dropping the lock. */
        if (ret == 0)
                nla_nest_end(skb, nest);

        return ret;
}

/* Validation policy for the attributes nested in CTA_PROTOINFO_TCP:
 * state and both window scale factors are u8, the per-direction flags
 * carry a struct nf_ct_tcp_flags.
 */
static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
        [CTA_PROTOINFO_TCP_STATE] = {
                .type = NLA_U8,
        },
        [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = {
                .type = NLA_U8,
        },
        [CTA_PROTOINFO_TCP_WSCALE_REPLY] = {
                .type = NLA_U8,
        },
        [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = {
                .len = sizeof(struct nf_ct_tcp_flags),
        },
        [CTA_PROTOINFO_TCP_FLAGS_REPLY] = {
                .len = sizeof(struct nf_ct_tcp_flags),
        },
};

/* Upper bound on the netlink payload produced by tcp_to_nlattr().
 *
 * Must account for everything that function can emit:
 *   - the CTA_PROTOINFO_TCP nest header,
 *   - three u8 attributes (STATE, WSCALE_ORIGINAL, WSCALE_REPLY),
 *   - two struct nf_ct_tcp_flags attributes (FLAGS_ORIGINAL, FLAGS_REPLY).
 *
 * Over-estimating is harmless; under-estimating risks an undersized
 * message buffer.
 */
#define TCP_NLATTR_SIZE        ( \
        NLA_ALIGN(NLA_HDRLEN) + \
        NLA_ALIGN(NLA_HDRLEN + 1) + \
        NLA_ALIGN(NLA_HDRLEN + 1) + \
        NLA_ALIGN(NLA_HDRLEN + 1) + \
        NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)) + \
        NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)))

static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
{
        struct nlattr *pattr = cda[CTA_PROTOINFO_TCP];
        struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1];
        int err;

        /* updates could not contain anything about the private
         * protocol info, in that case skip the parsing */
        if (!pattr)
                return 0;

        err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_TCP_MAX, pattr,
                                          tcp_nla_policy, NULL);
        if (err < 0)
                return err;

        if (tb[CTA_PROTOINFO_TCP_STATE] &&
            nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
                return -EINVAL;

        spin_lock_bh(&ct->lock);
        if (tb[CTA_PROTOINFO_TCP_STATE])
                ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);

        if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) {
                struct nf_ct_tcp_flags *attr =
                        nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]);
                ct->proto.tcp.seen[0].flags &= ~attr->mask;
                ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask;
        }

        if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) {
                struct nf_ct_tcp_flags *attr =
                        nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]);
                ct->proto.tcp.seen[1].flags &= ~attr->mask;
                ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask;
        }

        if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] &&
            tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] &&
            ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
            ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
                ct->proto.tcp.seen[0].td_scale =
                        nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
                ct->proto.tcp.seen[1].td_scale =
                        nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
        }
        spin_unlock_bh(&ct->lock);

        return 0;
}

/* Size of the port tuple attributes, computed from nf_ct_port_nla_policy
 * on first use and cached in a function-local static afterwards.
 */
static unsigned int tcp_nlattr_tuple_size(void)
{
        static unsigned int size __read_mostly;

        if (size)
                return size;

        size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
        return size;
}
#endif

#ifdef CONFIG_NF_CONNTRACK_TIMEOUT

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>

/* Fill a TCP timeout table from CTA_TIMEOUT_TCP_* netlink attributes.
 *
 * @tb:   parsed attribute table, indexed by CTA_TIMEOUT_TCP_*
 * @net:  network namespace providing the default timeouts
 * @data: destination array of TCP_CONNTRACK_TIMEOUT_MAX unsigned ints,
 *        or NULL to operate on the per-netns defaults themselves
 *
 * The destination is first initialised from the per-netns defaults, then
 * every attribute that is present overrides its slot.  Attribute values
 * are big-endian and multiplied by HZ before being stored (i.e. supplied
 * in seconds, stored in jiffies).
 *
 * Always returns 0.
 */
static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
                                     struct net *net, void *data)
{
        struct nf_tcp_net *tn = nf_tcp_pernet(net);
        unsigned int *timeouts = data;
        int i;

        if (!timeouts)
                timeouts = tn->timeouts;
        /* set default TCP timeouts. */
        for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
                timeouts[i] = tn->timeouts[i];

        if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) {
                timeouts[TCP_CONNTRACK_SYN_SENT] =
                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ;
        }

        if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) {
                timeouts[TCP_CONNTRACK_SYN_RECV] =
                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ;
        }
        if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) {
                timeouts[TCP_CONNTRACK_ESTABLISHED] =
                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ;
        }
        if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) {
                timeouts[TCP_CONNTRACK_FIN_WAIT] =
                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ;
        }
        if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) {
                timeouts[TCP_CONNTRACK_CLOSE_WAIT] =
                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ;
        }
        if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) {
                timeouts[TCP_CONNTRACK_LAST_ACK] =
                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ;
        }
        if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) {
                timeouts[TCP_CONNTRACK_TIME_WAIT] =
                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ;
        }
        if (tb[CTA_TIMEOUT_TCP_CLOSE]) {
                timeouts[TCP_CONNTRACK_CLOSE] =
                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ;
        }
        if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) {
                timeouts[TCP_CONNTRACK_SYN_SENT2] =
                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ;
        }
        if (tb[CTA_TIMEOUT_TCP_RETRANS]) {
                timeouts[TCP_CONNTRACK_RETRANS] =
                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ;
        }
        if (tb[CTA_TIMEOUT_TCP_UNACK]) {
                timeouts[TCP_CONNTRACK_UNACK] =
                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ;
        }

        /* Slot 0 carries the 'new connection' timeout and mirrors SYN_SENT,
         * as in nf_conntrack_tcp_init_net().  Index the array with state
         * indices, not with the CTA_TIMEOUT_TCP_* attribute numbers.
         */
        timeouts[0] = timeouts[TCP_CONNTRACK_SYN_SENT];
        return 0;
}

/* Dump a TCP timeout table as CTA_TIMEOUT_TCP_* attributes.
 *
 * @skb:  message being built
 * @data: array of unsigned int timeouts indexed by TCP_CONNTRACK_*
 *
 * Each stored value is divided by HZ and emitted big-endian.  Emission
 * stops at the first attribute that does not fit.
 *
 * Returns 0 on success, -ENOSPC if an attribute could not be added.
 */
static int
tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
{
        /* attribute <-> state slot, in emission order */
        static const struct {
                int attr;
                unsigned int state;
        } dump_order[] = {
                { CTA_TIMEOUT_TCP_SYN_SENT,    TCP_CONNTRACK_SYN_SENT },
                { CTA_TIMEOUT_TCP_SYN_RECV,    TCP_CONNTRACK_SYN_RECV },
                { CTA_TIMEOUT_TCP_ESTABLISHED, TCP_CONNTRACK_ESTABLISHED },
                { CTA_TIMEOUT_TCP_FIN_WAIT,    TCP_CONNTRACK_FIN_WAIT },
                { CTA_TIMEOUT_TCP_CLOSE_WAIT,  TCP_CONNTRACK_CLOSE_WAIT },
                { CTA_TIMEOUT_TCP_LAST_ACK,    TCP_CONNTRACK_LAST_ACK },
                { CTA_TIMEOUT_TCP_TIME_WAIT,   TCP_CONNTRACK_TIME_WAIT },
                { CTA_TIMEOUT_TCP_CLOSE,       TCP_CONNTRACK_CLOSE },
                { CTA_TIMEOUT_TCP_SYN_SENT2,   TCP_CONNTRACK_SYN_SENT2 },
                { CTA_TIMEOUT_TCP_RETRANS,     TCP_CONNTRACK_RETRANS },
                { CTA_TIMEOUT_TCP_UNACK,       TCP_CONNTRACK_UNACK },
        };
        const unsigned int *timeouts = data;
        unsigned int n;

        for (n = 0; n < sizeof(dump_order) / sizeof(dump_order[0]); n++) {
                if (nla_put_be32(skb, dump_order[n].attr,
                                 htonl(timeouts[dump_order[n].state] / HZ)))
                        return -ENOSPC;
        }

        return 0;
}

/* Validation policy for CTA_TIMEOUT_TCP_*: every timeout is a u32. */
static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
        [CTA_TIMEOUT_TCP_SYN_SENT]    = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_SYN_RECV]    = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_ESTABLISHED] = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_FIN_WAIT]    = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_CLOSE_WAIT]  = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_LAST_ACK]    = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_TIME_WAIT]   = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_CLOSE]       = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_SYN_SENT2]   = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_RETRANS]     = { .type = NLA_U32 },
        [CTA_TIMEOUT_TCP_UNACK]       = { .type = NLA_U32 },
};
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */

/* Initialise the per-netns TCP conntrack settings of @net: copy the
 * built-in timeout table and set the default values of the tunables.
 */
void nf_conntrack_tcp_init_net(struct net *net)
{
        struct nf_tcp_net *tn = nf_tcp_pernet(net);
        unsigned int slot;

        for (slot = 0; slot < TCP_CONNTRACK_TIMEOUT_MAX; slot++)
                tn->timeouts[slot] = tcp_timeouts[slot];

        /* Slot 0 is otherwise unused; mirror SYN_SENT there so that
         * ->timeouts[0] holds the 'new' timeout, like udp or icmp.
         */
        tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT];

        /* Non-zero: pick up already established connections.
         * Zero disables that.
         */
        tn->tcp_loose = 1;

        /* "Be conservative in what you do,
         *  be liberal in what you accept from others."
         * Non-zero: only out of window RST segments are marked INVALID.
         */
        tn->tcp_be_liberal = 0;

        /* Non-zero: the RST sequence number check is turned off. */
        tn->tcp_ignore_invalid_rst = 0;

        /* Number of retransmitted packets tolerated without an
         * (acceptable) ACK from the destination; once reached, a shorter
         * timer is started.
         */
        tn->tcp_max_retrans = 3;

#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
        tn->offload_timeout = 30 * HZ;
#endif
}

/* TCP protocol tracker descriptor.  Optional callbacks are only wired up
 * when the matching kernel config option (procfs, ctnetlink, timeout
 * policies) is enabled.
 */
const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp = {
        .l4proto = IPPROTO_TCP,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
        .print_conntrack = tcp_print_conntrack,
#endif
        .can_early_drop = tcp_can_early_drop,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
        /* private protocol info */
        .to_nlattr = tcp_to_nlattr,
        .from_nlattr = nlattr_to_tcp,
        .nlattr_size = TCP_NLATTR_SIZE,
        /* port tuple handling uses the nf_ct_port_* helpers */
        .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
        .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
        .nlattr_tuple_size = tcp_nlattr_tuple_size,
        .nla_policy = nf_ct_port_nla_policy,
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
        .ctnl_timeout = {
                .nlattr_to_obj = tcp_timeout_nlattr_to_obj,
                .obj_to_nlattr = tcp_timeout_obj_to_nlattr,
                .nlattr_max = CTA_TIMEOUT_TCP_MAX,
                .obj_size = sizeof(unsigned int) * TCP_CONNTRACK_TIMEOUT_MAX,
                .nla_policy = tcp_timeout_nla_policy,
        },
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
};

















    1 




































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
// SPDX-License-Identifier: GPL-2.0-only
/* tnum: tracked (or tristate) numbers
 *
 * A tnum tracks knowledge about the bits of a value.  Each bit can be either
 * known (0 or 1), or unknown (x).  Arithmetic operations on tnums will
 * propagate the unknown bits such that the tnum result represents all the
 * possible results for possible values of the operands.
 */
#include <linux/kernel.h>
#include <linux/tnum.h>

/* Shorthand for a struct tnum compound literal with the given value and mask. */
#define TNUM(_v, _m)        (struct tnum){.value = _v, .mask = _m}
/* A completely unknown value: no bit is known, every mask bit is set (-1). */
const struct tnum tnum_unknown = { .value = 0, .mask = -1 };

/* Build a fully known tnum: the mask is empty and the value is @value. */
struct tnum tnum_const(u64 value)
{
        struct tnum known = { .value = value, .mask = 0 };

        return known;
}

/* Build a tnum from the bounds @min and @max.
 *
 * Every bit at or below the highest bit in which @min and @max differ is
 * marked unknown; the bits above it are taken from @min.
 */
struct tnum tnum_range(u64 min, u64 max)
{
        u64 differing = min ^ max;
        u8 nbits = fls64(differing);
        u64 low_mask;

        /* 1ULL << 64 is undefined, so a difference in bit 63 means "unknown" */
        if (nbits > 63)
                return tnum_unknown;

        /* e.g. differing = 4 gives nbits = 3 and low_mask = (1<<3) - 1 = 7;
         * differing = 0 gives nbits = 0 and low_mask = 0, i.e. the constant
         * min (since min == max).
         */
        low_mask = (1ULL << nbits) - 1;
        return TNUM(min & ~low_mask, low_mask);
}

/* Shift both the value and the mask of @a left by @shift bits. */
struct tnum tnum_lshift(struct tnum a, u8 shift)
{
        a.value <<= shift;
        a.mask <<= shift;
        return a;
}

/* Logically shift both the value and the mask of @a right by @shift bits. */
struct tnum tnum_rshift(struct tnum a, u8 shift)
{
        a.value >>= shift;
        a.mask >>= shift;
        return a;
}

/* Arithmetic right shift of @a by @min_shift, for a 32- or 64-bit operation.
 *
 * If a.value is negative, arithmetic shifting by the minimum shift gives a
 * larger negative offset than shifting further. If a.value is nonnegative,
 * shifting by the minimum shift gives a larger positive offset than shifting
 * further.
 */
struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness)
{
        if (insn_bitness != 32) {
                /* 64-bit: sign-extend from bit 63 */
                a.value = (s64)a.value >> min_shift;
                a.mask = (s64)a.mask >> min_shift;
                return a;
        }

        /* 32-bit: sign-extend from bit 31, then zero the upper half */
        a.value = (u32)((s32)a.value >> min_shift);
        a.mask = (u32)((s32)a.mask >> min_shift);
        return a;
}

/* Add two tnums.
 *
 * A result bit is unknown if it is unknown in either operand, or if adding
 * the two masks on top of the value sum changes that bit.
 */
struct tnum tnum_add(struct tnum a, struct tnum b)
{
        u64 mask_sum = a.mask + b.mask;
        u64 value_sum = a.value + b.value;
        u64 changed = (mask_sum + value_sum) ^ value_sum;
        u64 unknown = changed | a.mask | b.mask;

        return TNUM(value_sum & ~unknown, unknown);
}

/* Subtract tnum @b from tnum @a.
 *
 * A result bit is unknown if it is unknown in either operand, or if it
 * differs between (difference + a.mask) and (difference - b.mask).
 */
struct tnum tnum_sub(struct tnum a, struct tnum b)
{
        u64 diff = a.value - b.value;
        u64 upper = diff + a.mask;
        u64 lower = diff - b.mask;
        u64 unknown = (upper ^ lower) | a.mask | b.mask;

        return TNUM(diff & ~unknown, unknown);
}

/* Bitwise AND of two tnums.
 *
 * A bit is a known 1 only when it is a known 1 in both operands; it is
 * unknown when it may be 1 in both operands but is not a known 1.
 */
struct tnum tnum_and(struct tnum a, struct tnum b)
{
        u64 a_maybe_set = a.value | a.mask;
        u64 b_maybe_set = b.value | b.mask;
        u64 known_ones = a.value & b.value;

        return TNUM(known_ones, a_maybe_set & b_maybe_set & ~known_ones);
}

/* Bitwise OR of two tnums.
 *
 * A known 1 in either operand gives a known 1; the remaining bits are unknown
 * if they are unknown in either operand.
 */
struct tnum tnum_or(struct tnum a, struct tnum b)
{
        u64 known_ones = a.value | b.value;
        u64 either_unknown = a.mask | b.mask;

        return TNUM(known_ones, either_unknown & ~known_ones);
}

/* Bitwise XOR of two tnums: a bit is unknown if it is unknown in either
 * operand, otherwise it is the XOR of the two known bits.
 */
struct tnum tnum_xor(struct tnum a, struct tnum b)
{
        u64 unknown = a.mask | b.mask;
        u64 flipped = a.value ^ b.value;

        return TNUM(flipped & ~unknown, unknown);
}

/* Multiply two tnums.
 *
 * Walk the multiplier (tnum a) from its least significant bit upwards; the
 * partial product for each bit is the multiplicand (tnum b) shifted into
 * place. Instead of performing tnum addition directly on the partial
 * products, each one is equivalently decomposed into a value part and a mask
 * part: the value-sum (acc_v) is a.value * b.value, and the mask-sum (acc_m)
 * is accumulated with tnum addition. The two are tnum-added at the end. The
 * following paper explains the algorithm in more detail:
 * https://arxiv.org/abs/2105.05398.
 */
struct tnum tnum_mul(struct tnum a, struct tnum b)
{
        struct tnum acc_m = TNUM(0, 0);
        u64 acc_v = a.value * b.value;

        for (; a.value || a.mask; a = tnum_rshift(a, 1), b = tnum_lshift(b, 1)) {
                u64 uncertain;

                if (a.value & 1)
                        /* LSB of tnum a is a certain 1 */
                        uncertain = b.mask;
                else if (a.mask & 1)
                        /* LSB of tnum a is uncertain */
                        uncertain = b.value | b.mask;
                else
                        /* LSB of tnum a is a certain 0: nothing to add */
                        continue;

                acc_m = tnum_add(acc_m, TNUM(0, uncertain));
        }
        return tnum_add(TNUM(acc_v, 0), acc_m);
}

/* Combine what @a and @b each know: a bit stays unknown only if it is
 * unknown in both. Note that if a and b disagree - i.e. one has a 'known 1'
 * where the other has a 'known 0' - this will return a 'known 1' for that
 * bit.
 */
struct tnum tnum_intersect(struct tnum a, struct tnum b)
{
        u64 unknown = a.mask & b.mask;
        u64 ones = a.value | b.value;

        return TNUM(ones & ~unknown, unknown);
}

/* Truncate @a to its low @size bytes, clearing all higher value and mask
 * bits.
 *
 * @size of 8 or more keeps every bit of the 64-bit tnum, so @a is returned
 * unchanged; this also avoids evaluating 1ULL << 64, which is undefined.
 */
struct tnum tnum_cast(struct tnum a, u8 size)
{
        u64 keep;

        if (size >= sizeof(u64))
                return a;

        keep = (1ULL << (size * 8)) - 1;
        a.value &= keep;
        a.mask &= keep;
        return a;
}

/* Return true when no bit below @size (tested as a mask of size - 1) is
 * either a known 1 or unknown in @a. A @size of 0 is always treated as
 * aligned.
 */
bool tnum_is_aligned(struct tnum a, u64 size)
{
        u64 maybe_set = a.value | a.mask;

        return !size || !(maybe_set & (size - 1));
}

bool tnum_in(struct tnum a, struct tnum b)
{
        if (b.mask & ~a.mask)
                return false;
        b.value &= ~a.mask;
        return a.value == b.value;
}

/* Render @a as a binary string into @str, most significant bit first, using
 * '0', '1' and 'x' (unknown) for each bit.
 *
 * @str:  destination buffer
 * @size: size of @str in bytes, including room for the terminating NUL
 * @a:    tnum to render
 *
 * At most @size - 1 characters are written, followed by a NUL. When @size is
 * smaller than 65 the least significant bits are the ones left out. A @size
 * of 0 writes nothing at all.
 *
 * Always returns 64.
 */
int tnum_sbin(char *str, size_t size, struct tnum a)
{
        size_t n;

        /* With no room at all, size - 1 would wrap and the terminator below
         * would land at str[64], outside the buffer.
         */
        if (!size)
                return 64;

        /* n counts down from bit position 64 (LSB) to 1 (MSB) */
        for (n = 64; n; n--) {
                if (n < size) {
                        if (a.mask & 1)
                                str[n - 1] = 'x';
                        else if (a.value & 1)
                                str[n - 1] = '1';
                        else
                                str[n - 1] = '0';
                }
                a.mask >>= 1;
                a.value >>= 1;
        }
        str[min(size - 1, (size_t)64)] = 0;
        return 64;
}

struct tnum tnum_subreg(struct tnum a)
{
        return tnum_cast(a, 4);
}

struct tnum tnum_clear_subreg(struct tnum a)
{
        return tnum_lshift(tnum_rshift(a, 32), 32);
}

struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg)
{
        return tnum_or(tnum_clear_subreg(reg), tnum_subreg(subreg));
}

/* Replace the low 32 bits of @a with the known constant @value. */
struct tnum tnum_const_subreg(struct tnum a, u32 value)
{
        struct tnum low = tnum_const(value);

        return tnum_with_subreg(a, low);
}























































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Module internals
 *
 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
 */

#include <linux/elf.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/mm.h>

#ifndef ARCH_SHF_SMALL
#define ARCH_SHF_SMALL 0
#endif

/*
 * Use highest 4 bits of sh_entsize to store the mod_mem_type of this
 * section. This leaves 28 bits for offset on 32-bit systems, which is
 * about 256 MiB (WARN_ON_ONCE if we exceed that).
 */

/* Number of bits reserved for the type */
#define SH_ENTSIZE_TYPE_BITS        4
/* Bit position where the type field starts (top of an unsigned long) */
#define SH_ENTSIZE_TYPE_SHIFT        (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)
/* Mask for the type once it has been shifted down to bit 0 */
#define SH_ENTSIZE_TYPE_MASK        ((1UL << SH_ENTSIZE_TYPE_BITS) - 1)
/* Mask selecting the offset, i.e. every bit below the type field */
#define SH_ENTSIZE_OFFSET_MASK        ((1UL << (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)) - 1)

/* Maximum number of characters written by module_flags() */
#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)

/*
 * One entry of an exported-symbol table (see the __ksymtab arrays below).
 * With CONFIG_HAVE_ARCH_PREL32_RELOCATIONS the three members are stored as
 * int offsets (kernel_symbol_value() resolves value_offset with
 * offset_to_ptr()); otherwise they are a plain address and two string
 * pointers.
 */
struct kernel_symbol {
#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
        int value_offset;
        int name_offset;
        int namespace_offset;
#else
        unsigned long value;
        const char *name;
        const char *namespace;
#endif
};

extern struct mutex module_mutex;
extern struct list_head modules;

extern struct module_attribute *modinfo_attrs[];
extern size_t modinfo_attrs_count;

/* Provided by the linker */
extern const struct kernel_symbol __start___ksymtab[];
extern const struct kernel_symbol __stop___ksymtab[];
extern const struct kernel_symbol __start___ksymtab_gpl[];
extern const struct kernel_symbol __stop___ksymtab_gpl[];
extern const s32 __start___kcrctab[];
extern const s32 __start___kcrctab_gpl[];

/*
 * State describing a module image while it is being loaded.
 * NOTE(review): field meanings below are inferred from names and types only;
 * confirm against the loader code before relying on them.
 */
struct load_info {
        const char *name;
        /* pointer to module in temporary copy, freed at end of load_module() */
        struct module *mod;
        Elf_Ehdr *hdr;                /* ELF header of the image */
        unsigned long len;        /* presumably the length of the image at hdr */
        Elf_Shdr *sechdrs;        /* section header table */
        char *secstrings, *strtab;
        unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
        bool sig_ok;
#ifdef CONFIG_KALLSYMS
        unsigned long mod_kallsyms_init_off;
#endif
#ifdef CONFIG_MODULE_DECOMPRESS
#ifdef CONFIG_MODULE_STATS
        unsigned long compressed_len;
#endif
        /* page array used when the module is decompressed */
        struct page **pages;
        unsigned int max_pages;
        unsigned int used_pages;
#endif
        /* section indices, presumably into sechdrs - TODO confirm */
        struct {
                unsigned int sym, str, mod, vers, info, pcpu;
        } index;
};

/* License class of a symbol; reported through struct find_symbol_arg.license. */
enum mod_license {
        NOT_GPL_ONLY,
        GPL_ONLY,
};

/*
 * Argument block for find_symbol(): the caller fills in the "Input" members
 * and reads the "Output" members.
 */
struct find_symbol_arg {
        /* Input */
        const char *name;        /* symbol name to look up */
        bool gplok;
        bool warn;

        /* Output */
        struct module *owner;
        const s32 *crc;
        const struct kernel_symbol *sym;
        enum mod_license license;
};

int mod_verify_sig(const void *mod, struct load_info *info);
int try_to_force_load(struct module *mod, const char *reason);
bool find_symbol(struct find_symbol_arg *fsa);
struct module *find_module_all(const char *name, size_t len, bool even_unformed);
int cmp_name(const void *name, const void *sym);
long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
                                Elf_Shdr *sechdr, unsigned int section);
char *module_flags(struct module *mod, char *buf, bool show_state);
size_t module_flags_taint(unsigned long taints, char *buf);

char *module_next_tag_pair(char *string, unsigned long *secsize);

/*
 * Iterate @entry over the modinfo entries matching @name in @info: starts at
 * get_modinfo() and advances with get_next_modinfo() until it yields NULL.
 */
#define for_each_modinfo_entry(entry, info, name) \
        for (entry = get_modinfo(info, name); entry; entry = get_next_modinfo(info, name, entry))

/*
 * Lockdep-only sanity check: warn once unless the caller is inside an
 * RCU-sched read-side section or holds module_mutex.  Does nothing when
 * CONFIG_LOCKDEP is off or lock debugging has been disabled.
 */
static inline void module_assert_mutex_or_preempt(void)
{
#ifdef CONFIG_LOCKDEP
        if (likely(debug_locks))
                WARN_ON_ONCE(!(rcu_read_lock_sched_held() ||
                               lockdep_is_held(&module_mutex)));
#endif
}

/*
 * Return the address stored in @sym: resolved from the relative
 * value_offset when PREL32 relocations are in use, read directly otherwise.
 */
static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
{
        unsigned long addr;

#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
        addr = (unsigned long)offset_to_ptr(&sym->value_offset);
#else
        addr = sym->value;
#endif
        return addr;
}

/*
 * Livepatch ELF helpers.  Without CONFIG_LIVEPATCH they collapse to no-op
 * stubs: copy_module_elf() reports success (0) and free_module_elf() does
 * nothing.
 */
#ifdef CONFIG_LIVEPATCH
int copy_module_elf(struct module *mod, struct load_info *info);
void free_module_elf(struct module *mod);
#else /* !CONFIG_LIVEPATCH */
static inline int copy_module_elf(struct module *mod, struct load_info *info)
{
        return 0;
}

static inline void free_module_elf(struct module *mod) { }
#endif /* CONFIG_LIVEPATCH */

/*
 * Flag @mod as a livepatch module.  Returns true when the flag was set, or
 * false (leaving @mod untouched) when CONFIG_LIVEPATCH is not enabled.
 */
static inline bool set_livepatch_module(struct module *mod)
{
#ifndef CONFIG_LIVEPATCH
        return false;
#else
        mod->klp = true;
        return true;
#endif
}

/**
 * enum fail_dup_mod_reason - state at which a duplicate module was detected
 *
 * @FAIL_DUP_MOD_BECOMING: the module is read properly, passes all checks but
 *         we've determined that another module with the same name is already loaded
 *         or being processed on our &modules list. This happens on early_mod_check()
 *         right before layout_and_allocate(). The kernel would have already
 *         vmalloc()'d space for the entire module through finit_module(). If
 *         decompression was used two vmap() spaces were used. These failures can
 *         happen when userspace has not seen the module present on the kernel and
 *         tries to load the module multiple times at the same time.
 * @FAIL_DUP_MOD_LOAD: the module has been read properly, passes all validation
 *        checks and the kernel determines that the module was unique and because
 *        of this allocated yet another private kernel copy of the module space in
 *        layout_and_allocate() but after this determined in add_unformed_module()
 *        that another module with the same name is already loaded or being processed.
 *        These failures should be mitigated as much as possible and are indicative
 *        of really fast races in loading modules. Without module decompression
 *        they waste twice as much vmap space. With module decompression three
 *        times the module's size vmap space is wasted.
 */
enum fail_dup_mod_reason {
        FAIL_DUP_MOD_BECOMING = 0,
        FAIL_DUP_MOD_LOAD,
};

#ifdef CONFIG_MODULE_DEBUGFS
extern struct dentry *mod_debugfs_root;
#endif

/*
 * Module loading statistics.  With CONFIG_MODULE_STATS the helpers below
 * update atomic counters; without it the macros expand to nothing and the
 * functions become stubs, so callers need no #ifdefs.
 */
#ifdef CONFIG_MODULE_STATS

/* Add @count to the atomic_long_t pointed to by @var */
#define mod_stat_add_long(count, var) atomic_long_add(count, var)
/* Increment the atomic_t pointed to by @name */
#define mod_stat_inc(name) atomic_inc(name)

extern atomic_long_t total_mod_size;
extern atomic_long_t total_text_size;
extern atomic_long_t invalid_kread_bytes;
extern atomic_long_t invalid_decompress_bytes;

extern atomic_t modcount;
extern atomic_t failed_kreads;
extern atomic_t failed_decompress;
/* Per-module-name record of failed loads, linked through @list */
struct mod_fail_load {
        struct list_head list;
        char name[MODULE_NAME_LEN];
        atomic_long_t count;
        /* presumably one bit per enum fail_dup_mod_reason - TODO confirm */
        unsigned long dup_fail_mask;
};

int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason);
void mod_stat_bump_invalid(struct load_info *info, int flags);
void mod_stat_bump_becoming(struct load_info *info, int flags);

#else

/* Statistics disabled: the macros discard their arguments */
#define mod_stat_add_long(name, var)
#define mod_stat_inc(name)

/* Stub: nothing is recorded, always reports success (0) */
static inline int try_add_failed_module(const char *name,
                                        enum fail_dup_mod_reason reason)
{
        return 0;
}

static inline void mod_stat_bump_invalid(struct load_info *info, int flags)
{
}

static inline void mod_stat_bump_becoming(struct load_info *info, int flags)
{
}

#endif /* CONFIG_MODULE_STATS */

/*
 * Duplicate auto-load request debugging.  Without
 * CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS the stubs report that no duplicate
 * request exists (false) and the announce hook does nothing.
 */
#ifdef CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS
bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret);
void kmod_dup_request_announce(char *module_name, int ret);
#else
static inline bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret)
{
        return false;
}

static inline void kmod_dup_request_announce(char *module_name, int ret)
{
}
#endif

/*
 * Tracking of taint flags of unloaded modules.  Without
 * CONFIG_MODULE_UNLOAD_TAINT_TRACKING try_add_tainted_module() is a stub
 * returning 0 and print_unloaded_tainted_modules() prints nothing.
 */
#ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING
/* One record per module name, linked through @list */
struct mod_unload_taint {
        struct list_head list;
        char name[MODULE_NAME_LEN];
        unsigned long taints;        /* taint flag bits */
        u64 count;
};

int try_add_tainted_module(struct module *mod);
void print_unloaded_tainted_modules(void);
#else /* !CONFIG_MODULE_UNLOAD_TAINT_TRACKING */
static inline int try_add_tainted_module(struct module *mod)
{
        return 0;
}

static inline void print_unloaded_tainted_modules(void)
{
}
#endif /* CONFIG_MODULE_UNLOAD_TAINT_TRACKING */

/*
 * Module decompression hooks.  Without CONFIG_MODULE_DECOMPRESS
 * module_decompress() fails with -EOPNOTSUPP and the cleanup hook is a
 * no-op.
 */
#ifdef CONFIG_MODULE_DECOMPRESS
int module_decompress(struct load_info *info, const void *buf, size_t size);
void module_decompress_cleanup(struct load_info *info);
#else
static inline int module_decompress(struct load_info *info,
                                    const void *buf, size_t size)
{
        return -EOPNOTSUPP;
}

static inline void module_decompress_cleanup(struct load_info *info)
{
}
#endif

/*
 * Address bookkeeping for loaded modules: a pair of bounds (addr_min,
 * addr_max) plus, when CONFIG_MODULES_TREE_LOOKUP is set, a latch tree root.
 * A second pair of bounds exists for data when
 * CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC is set.
 * NOTE(review): presumably the bounds cover all module mappings - confirm
 * against the code that updates them.
 */
struct mod_tree_root {
#ifdef CONFIG_MODULES_TREE_LOOKUP
        struct latch_tree_root root;
#endif
        unsigned long addr_min;
        unsigned long addr_max;
#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
        unsigned long data_addr_min;
        unsigned long data_addr_max;
#endif
};

extern struct mod_tree_root mod_tree;

/*
 * Module address lookup.  With CONFIG_MODULES_TREE_LOOKUP these are real
 * functions defined elsewhere; without it the insert/remove hooks are empty
 * and mod_find() falls back to walking the modules list.
 */
#ifdef CONFIG_MODULES_TREE_LOOKUP
void mod_tree_insert(struct module *mod);
void mod_tree_remove_init(struct module *mod);
void mod_tree_remove(struct module *mod);
struct module *mod_find(unsigned long addr, struct mod_tree_root *tree);
#else /* !CONFIG_MODULES_TREE_LOOKUP */

static inline void mod_tree_insert(struct module *mod) { }
static inline void mod_tree_remove_init(struct module *mod) { }
static inline void mod_tree_remove(struct module *mod) { }
static inline struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
{
        struct module *mod;

        list_for_each_entry_rcu(mod, &modules, list,
                                lockdep_is_held(&module_mutex)) {
                if (within_module(addr, mod))
                        return mod;
        }

        return NULL;
}
#endif /* CONFIG_MODULES_TREE_LOOKUP */

int module_enable_rodata_ro(const struct module *mod, bool after_init);
int module_enable_data_nx(const struct module *mod);
int module_enable_text_rox(const struct module *mod);
int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
                                char *secstrings, struct module *mod);

/*
 * Module signature check.  Without CONFIG_MODULE_SIG the stub accepts every
 * module by returning 0.
 */
#ifdef CONFIG_MODULE_SIG
int module_sig_check(struct load_info *info, int flags);
#else /* !CONFIG_MODULE_SIG */
static inline int module_sig_check(struct load_info *info, int flags)
{
        return 0;
}
#endif /* !CONFIG_MODULE_SIG */

/* kmemleak hook for module loading; an empty stub without CONFIG_DEBUG_KMEMLEAK. */
#ifdef CONFIG_DEBUG_KMEMLEAK
void kmemleak_load_module(const struct module *mod, const struct load_info *info);
#else /* !CONFIG_DEBUG_KMEMLEAK */
static inline void kmemleak_load_module(const struct module *mod,
                                        const struct load_info *info) { }
#endif /* CONFIG_DEBUG_KMEMLEAK */

#ifdef CONFIG_KALLSYMS
void init_build_id(struct module *mod, const struct load_info *info);
void layout_symtab(struct module *mod, struct load_info *info);
void add_kallsyms(struct module *mod, const struct load_info *info);

static inline bool sect_empty(const Elf_Shdr *sect)
{
        return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
}
#else /* !CONFIG_KALLSYMS */
/* Without kallsyms support these hooks compile to nothing. */
static inline void init_build_id(struct module *mod, const struct load_info *info) { }
static inline void layout_symtab(struct module *mod, struct load_info *info) { }
static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
#endif /* CONFIG_KALLSYMS */

/*
 * Module sysfs hooks.  Without CONFIG_SYSFS mod_sysfs_setup() is a stub that
 * reports success (0) and the teardown and param-lock hooks do nothing.
 */
#ifdef CONFIG_SYSFS
int mod_sysfs_setup(struct module *mod, const struct load_info *info,
                    struct kernel_param *kparam, unsigned int num_params);
void mod_sysfs_teardown(struct module *mod);
void init_param_lock(struct module *mod);
#else /* !CONFIG_SYSFS */
static inline int mod_sysfs_setup(struct module *mod,
                                     const struct load_info *info,
                                     struct kernel_param *kparam,
                                     unsigned int num_params)
{
        return 0;
}

static inline void mod_sysfs_teardown(struct module *mod) { }
static inline void init_param_lock(struct module *mod) { }
#endif /* CONFIG_SYSFS */

/*
 * Symbol version (CRC) checking.  Without CONFIG_MODVERSIONS the check
 * functions are stubs that always return 1.
 */
#ifdef CONFIG_MODVERSIONS
int check_version(const struct load_info *info,
                  const char *symname, struct module *mod, const s32 *crc);
void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp,
                   struct kernel_symbol *ks, struct tracepoint * const *tp);
int check_modstruct_version(const struct load_info *info, struct module *mod);
int same_magic(const char *amagic, const char *bmagic, bool has_crcs);
#else /* !CONFIG_MODVERSIONS */
/* Stub: no version information to compare, always returns 1 */
static inline int check_version(const struct load_info *info,
                                const char *symname,
                                struct module *mod,
                                const s32 *crc)
{
        return 1;
}

/* Stub: always returns 1 */
static inline int check_modstruct_version(const struct load_info *info,
                                          struct module *mod)
{
        return 1;
}
/*
 * Compare two version-magic strings.  Returns 1 when they are identical and
 * 0 otherwise; @has_crcs is ignored in this (!CONFIG_MODVERSIONS) variant.
 */
static inline int same_magic(const char *amagic, const char *bmagic, bool has_crcs)
{
        if (strcmp(amagic, bmagic) != 0)
                return 0;

        return 1;
}
#endif /* CONFIG_MODVERSIONS */




































































































































































































































































    1 






    1 













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HUGETLB_H
#define _LINUX_HUGETLB_H

#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/fs.h>
#include <linux/hugetlb_inline.h>
#include <linux/cgroup.h>
#include <linux/page_ref.h>
#include <linux/list.h>
#include <linux/kref.h>
#include <linux/pgtable.h>
#include <linux/gfp.h>
#include <linux/userfaultfd_k.h>

struct ctl_table;
struct user_struct;
struct mmu_gather;
struct node;

/*
 * Fallback for architectures without CONFIG_ARCH_HAS_HUGEPD: hugepd_t is a
 * plain wrapper around an unsigned long, is_hugepd() is always false, and
 * __hugepd() builds a hugepd_t from a raw value.
 */
#ifndef CONFIG_ARCH_HAS_HUGEPD
typedef struct { unsigned long pd; } hugepd_t;
#define is_hugepd(hugepd) (0)
#define __hugepd(x) ((hugepd_t) { (x) })
#endif

void free_huge_folio(struct folio *folio);

#ifdef CONFIG_HUGETLB_PAGE

#include <linux/pagemap.h>
#include <linux/shm.h>
#include <asm/tlbflush.h>

/*
 * A HugeTLB page has more metadata to save than fits in its head struct
 * page, so we have to abuse some of its tail struct pages to store the rest
 * of the metadata.
 */
#define __NR_USED_SUBPAGE 3

/*
 * Huge page accounting for one subpool, tied to a single hstate.
 * NOTE(review): @lock presumably guards the counters below - confirm against
 * the users in mm/hugetlb.c.
 */
struct hugepage_subpool {
        spinlock_t lock;
        long count;
        long max_hpages;        /* Maximum huge pages or -1 if no maximum. */
        long used_hpages;        /* Used count against maximum, includes */
                                /* both allocated and reserved pages. */
        struct hstate *hstate;
        long min_hpages;        /* Minimum huge pages or -1 if no minimum. */
        long rsv_hpages;        /* Pages reserved against global pool to */
                                /* satisfy minimum size. */
};

/*
 * Reservation map: reference-counted through @refs (see resv_map_release())
 * and holding a list of regions plus a cache of spare region entries.
 * NOTE(review): @regions presumably links struct file_region entries via
 * their link member - confirm.
 */
struct resv_map {
        struct kref refs;
        spinlock_t lock;
        struct list_head regions;
        long adds_in_progress;
        struct list_head region_cache;
        long region_cache_count;        /* number of entries on region_cache */
        struct rw_semaphore rw_sema;
#ifdef CONFIG_CGROUP_HUGETLB
        /*
         * On private mappings, the counter to uncharge reservations is stored
         * here. If these fields are 0, then either the mapping is shared, or
         * cgroup accounting is disabled for this resv_map.
         */
        struct page_counter *reservation_counter;
        unsigned long pages_per_hpage;
        struct cgroup_subsys_state *css;
#endif
};

/*
 * Region tracking -- allows tracking of reservations and instantiated pages
 *                    across the pages in a mapping.
 *
 * The region data structures are embedded into a resv_map and protected
 * by a resv_map's lock.  The set of regions within the resv_map represent
 * reservations for huge pages, or huge pages that have already been
 * instantiated within the map.  The from and to elements are huge page
 * indices into the associated mapping.  from indicates the starting index
 * of the region.  to represents the first index past the end of  the region.
 *
 * For example, a file region structure with from == 0 and to == 4 represents
 * four huge pages in a mapping.  It is important to note that the to element
 * represents the first element past the end of the region. This is used in
 * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
 *
 * Interval notation of the form [from, to) will be used to indicate that
 * the endpoint from is inclusive and to is exclusive.
 */
/* One [from, to) range of huge page indices tracked in a resv_map. */
struct file_region {
        struct list_head link;  /* list linkage */
        long from;              /* first huge page index in the region (inclusive) */
        long to;                /* first huge page index past the region (exclusive) */
#ifdef CONFIG_CGROUP_HUGETLB
        /*
         * On shared mappings, each reserved region appears as a struct
         * file_region in resv_map. These fields hold the info needed to
         * uncharge each reservation.
         */
        struct page_counter *reservation_counter;
        struct cgroup_subsys_state *css;
#endif
};

/*
 * Lock object tied to a VMA: a kref, a rw_semaphore and a pointer to the
 * VMA.  NOTE(review): presumably this is what the hugetlb_vma_lock_*()
 * helpers declared in this header operate on - confirm against mm/hugetlb.c.
 */
struct hugetlb_vma_lock {
        struct kref refs;               /* reference count */
        struct rw_semaphore rw_sema;    /* the lock itself */
        struct vm_area_struct *vma;     /* associated VMA */
};

/*
 * resv_map allocation and kref release callback (takes the struct kref,
 * presumably the one embedded as resv_map.refs).
 */
extern struct resv_map *resv_map_alloc(void);
void resv_map_release(struct kref *ref);

extern spinlock_t hugetlb_lock;
extern int hugetlb_max_hstate __read_mostly;
/*
 * Iterate @h over hstates[0] .. hstates[hugetlb_max_hstate - 1].
 * @h is evaluated several times, so it must be a plain lvalue without side
 * effects.
 */
#define for_each_hstate(h) \
        for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++)

/*
 * Subpool creation and release.  max_hpages/min_hpages presumably map to the
 * limits kept in struct hugepage_subpool (min_hpages documents -1 as
 * "no minimum").
 */
struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
                                                long min_hpages);
void hugepage_put_subpool(struct hugepage_subpool *spool);

/*
 * Core hugetlb entry points available with CONFIG_HUGETLB_PAGE.  Only the
 * prototypes are visible here; the !CONFIG_HUGETLB_PAGE branch further down
 * supplies inline stubs for several of them.
 */
void hugetlb_dup_vma_private(struct vm_area_struct *vma);
void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
int move_hugetlb_page_tables(struct vm_area_struct *vma,
                             struct vm_area_struct *new_vma,
                             unsigned long old_addr, unsigned long new_addr,
                             unsigned long len);
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
                            struct vm_area_struct *, struct vm_area_struct *);
struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
                                      unsigned long address, unsigned int flags,
                                      unsigned int *page_mask);
void unmap_hugepage_range(struct vm_area_struct *,
                          unsigned long, unsigned long, struct page *,
                          zap_flags_t);
void __unmap_hugepage_range(struct mmu_gather *tlb,
                          struct vm_area_struct *vma,
                          unsigned long start, unsigned long end,
                          struct page *ref_page, zap_flags_t zap_flags);
/* Reporting helpers. */
void hugetlb_report_meminfo(struct seq_file *);
int hugetlb_report_node_meminfo(char *buf, int len, int nid);
void hugetlb_show_meminfo_node(int nid);
unsigned long hugetlb_total_pages(void);
/* Page fault entry point for hugetlb VMAs. */
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags);
#ifdef CONFIG_USERFAULTFD
/* Only declared when userfaultfd support is configured. */
int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
                             struct vm_area_struct *dst_vma,
                             unsigned long dst_addr,
                             unsigned long src_addr,
                             uffd_flags_t flags,
                             struct folio **foliop);
#endif /* CONFIG_USERFAULTFD */
/* Reservation accounting over a [from, to) / [start, end) index range. */
bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
                                                struct vm_area_struct *vma,
                                                vm_flags_t vm_flags);
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
                                                long freed);
/* Isolation, hwpoison and migration helpers. */
bool isolate_hugetlb(struct folio *folio, struct list_head *list);
int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                bool *migratable_cleared);
void folio_putback_active_hugetlb(struct folio *folio);
void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
void hugetlb_fix_reserve_counts(struct inode *inode);
/* Fault mutex table; hugetlb_fault_mutex_hash() presumably yields an index into it. */
extern struct mutex *hugetlb_fault_mutex_table;
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);

/* PMD sharing and page cache lookup helpers (definitions not in this header). */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
                      unsigned long addr, pud_t *pud);
bool hugetlbfs_pagecache_present(struct hstate *h,
                                 struct vm_area_struct *vma,
                                 unsigned long address);

struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio);

extern int sysctl_hugetlb_shm_group;
/* One list head per possible NUMA node. */
extern struct list_head huge_boot_pages[MAX_NUMNODES];

/* arch callbacks */

#ifndef CONFIG_HIGHPTE
/*
 * pte_offset_huge() and pte_alloc_huge() are helpers for those architectures
 * which may go down to the lowest PTE level in their huge_pte_offset() and
 * huge_pte_alloc(): to avoid reliance on pte_offset_map() without pte_unmap().
 */
/* Look up the PTE for @address under @pmd via pte_offset_kernel(). */
static inline pte_t *pte_offset_huge(pmd_t *pmd, unsigned long address)
{
        return pte_offset_kernel(pmd, address);
}
/*
 * Allocate the PTE page for @pmd if needed and return the PTE slot for
 * @address.  Returns NULL when pte_alloc() reports a non-zero result.
 */
static inline pte_t *pte_alloc_huge(struct mm_struct *mm, pmd_t *pmd,
                                    unsigned long address)
{
        if (pte_alloc(mm, pmd))
                return NULL;

        return pte_offset_huge(pmd, address);
}
#endif

/* Listed under "arch callbacks" above; @sz is presumably the huge page size in bytes. */
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz);
/*
 * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE.
 * Returns the pte_t* if found, or NULL if the address is not mapped.
 *
 * IMPORTANT: we should normally not directly call this function, instead
 * this is only a common interface to implement arch-specific
 * walker. Please use hugetlb_walk() instead, because that will attempt to
 * verify the locking for you.
 *
 * Since this function will walk all the pgtable pages (including not only
 * high-level pgtable page, but also PUD entry that can be unshared
 * concurrently for VM_SHARED), the caller of this function should be
 * responsible for its thread safety.  One can follow this rule:
 *
 *  (1) For private mappings: pmd unsharing is not possible, so holding the
 *      mmap_lock for either read or write is sufficient. Most callers
 *      already hold the mmap_lock, so normally, no special action is
 *      required.
 *
 *  (2) For shared mappings: pmd unsharing is possible (so the PUD-ranged
 *      pgtable page can go away from under us!  It can be done by a pmd
 *      unshare with a follow up munmap() on the other process), then we
 *      need either:
 *
 *     (2.1) hugetlb vma lock read or write held, to make sure pmd unshare
 *           won't happen upon the range (it also makes sure the pte_t we
 *           read is the right and stable one), or,
 *
 *     (2.2) hugetlb mapping i_mmap_rwsem lock held read or write, to make
 *           sure even if unshare happened the racy unmap() will wait until
 *           i_mmap_rwsem is released.
 *
 * Option (2.1) is the safest, which guarantees pte stability from pmd
 * sharing pov, until the vma lock released.  Option (2.2) doesn't protect
 * a concurrent pmd unshare, but it makes sure the pgtable page is safe to
 * access.
 */
pte_t *huge_pte_offset(struct mm_struct *mm,
                       unsigned long addr, unsigned long sz);
unsigned long hugetlb_mask_last_page(struct hstate *h);
/* PMD unsharing helpers; inline stubs exist for !CONFIG_HUGETLB_PAGE. */
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
                                unsigned long addr, pte_t *ptep);
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end);

/*
 * Out-of-line halves of hugetlb_zap_begin()/hugetlb_zap_end(); the inline
 * wrappers below call them only for hugetlb VMAs.
 */
extern void __hugetlb_zap_begin(struct vm_area_struct *vma,
                                unsigned long *begin, unsigned long *end);
extern void __hugetlb_zap_end(struct vm_area_struct *vma,
                              struct zap_details *details);

/*
 * Forward to __hugetlb_zap_begin() when @vma is a hugetlb VMA; do nothing
 * for any other VMA.
 */
static inline void hugetlb_zap_begin(struct vm_area_struct *vma,
                                     unsigned long *start, unsigned long *end)
{
        if (!is_vm_hugetlb_page(vma))
                return;

        __hugetlb_zap_begin(vma, start, end);
}

/*
 * Forward to __hugetlb_zap_end() when @vma is a hugetlb VMA; do nothing
 * for any other VMA.
 */
static inline void hugetlb_zap_end(struct vm_area_struct *vma,
                                   struct zap_details *details)
{
        if (!is_vm_hugetlb_page(vma))
                return;

        __hugetlb_zap_end(vma, details);
}

/* hugetlb VMA lock API (read/write lock, unlock, trylock, assertion). */
void hugetlb_vma_lock_read(struct vm_area_struct *vma);
void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
void hugetlb_vma_lock_write(struct vm_area_struct *vma);
void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
/* kref release callback; presumably for hugetlb_vma_lock.refs. */
void hugetlb_vma_lock_release(struct kref *kref);
long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot,
                unsigned long cp_flags);
/* Predicates on a huge PTE value. */
bool is_hugetlb_entry_migration(pte_t pte);
bool is_hugetlb_entry_hwpoisoned(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);

#else /* !CONFIG_HUGETLB_PAGE */

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: always reports 0. */
static inline unsigned long hugetlb_total_pages(void)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns NULL. */
static inline struct address_space *hugetlb_folio_mapping_lock_write(
                                                        struct folio *folio)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns 0. */
static inline int huge_pmd_unshare(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long addr, pte_t *ptep)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: leaves *start and *end untouched. */
static inline void adjust_range_if_pmd_sharing_possible(
                                struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no-op, *start and *end are not modified. */
static inline void hugetlb_zap_begin(
                                struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void hugetlb_zap_end(
                                struct vm_area_struct *vma,
                                struct zap_details *details)
{
}

/*
 * !CONFIG_HUGETLB_PAGE stub: hits BUG() if ever called; the return
 * statement follows the BUG().
 */
static inline int copy_hugetlb_page_range(struct mm_struct *dst,
                                          struct mm_struct *src,
                                          struct vm_area_struct *dst_vma,
                                          struct vm_area_struct *src_vma)
{
        BUG();
        return 0;
}

/*
 * !CONFIG_HUGETLB_PAGE stub: hits BUG() if ever called; the return
 * statement follows the BUG().
 */
static inline int move_hugetlb_page_tables(struct vm_area_struct *vma,
                                           struct vm_area_struct *new_vma,
                                           unsigned long old_addr,
                                           unsigned long new_addr,
                                           unsigned long len)
{
        BUG();
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: writes nothing to @m. */
static inline void hugetlb_report_meminfo(struct seq_file *m)
{
}

/* !CONFIG_HUGETLB_PAGE stub: leaves @buf untouched and returns 0. */
static inline int hugetlb_report_node_meminfo(char *buf, int len, int nid)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void hugetlb_show_meminfo_node(int nid)
{
}

/* !CONFIG_HUGETLB_PAGE stub: always fails with -EINVAL. */
static inline int prepare_hugepage_range(struct file *file,
                                unsigned long addr, unsigned long len)
{
        return -EINVAL;
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
{
}

/*
 * !CONFIG_HUGETLB_PAGE stub: always returns 1 (presumably "lock acquired",
 * as there is nothing to lock in this configuration).
 */
static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
{
        return 1;
}

/* !CONFIG_HUGETLB_PAGE stub: asserts nothing. */
static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: always returns 0. */
static inline int is_hugepage_only_range(struct mm_struct *mm,
                                        unsigned long addr, unsigned long len)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: hits BUG() if ever called. */
static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        BUG();
}

#ifdef CONFIG_USERFAULTFD
/*
 * !CONFIG_HUGETLB_PAGE stub (only with userfaultfd configured): hits BUG()
 * if ever called; the return statement follows the BUG().
 */
static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
                                           struct vm_area_struct *dst_vma,
                                           unsigned long dst_addr,
                                           unsigned long src_addr,
                                           uffd_flags_t flags,
                                           struct folio **foliop)
{
        BUG();
        return 0;
}
#endif /* CONFIG_USERFAULTFD */

/* !CONFIG_HUGETLB_PAGE stub: always returns NULL. */
static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
                                        unsigned long sz)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns false; @list is not modified. */
static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list)
{
        return false;
}

/* !CONFIG_HUGETLB_PAGE stub: returns 0 and does not write *hugetlb. */
static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: returns 0 and does not write *migratable_cleared. */
static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                        bool *migratable_cleared)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void folio_putback_active_hugetlb(struct folio *folio)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void move_hugetlb_state(struct folio *old_folio,
                                        struct folio *new_folio, int reason)
{
}

/* !CONFIG_HUGETLB_PAGE stub: changes nothing and returns 0. */
static inline long hugetlb_change_protection(
                        struct vm_area_struct *vma, unsigned long address,
                        unsigned long end, pgprot_t newprot,
                        unsigned long cp_flags)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: hits BUG() if ever called. */
static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
                        struct vm_area_struct *vma, unsigned long start,
                        unsigned long end, struct page *ref_page,
                        zap_flags_t zap_flags)
{
        BUG();
}

/*
 * !CONFIG_HUGETLB_PAGE stub: hits BUG() if ever called; the return
 * statement follows the BUG().
 */
static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
                        struct vm_area_struct *vma, unsigned long address,
                        unsigned int flags)
{
        BUG();
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }

#endif /* !CONFIG_HUGETLB_PAGE */

#ifndef pgd_write
/*
 * Fallback used when no pgd_write macro is defined: hits BUG() if ever
 * called; the return statement follows the BUG().
 */
static inline int pgd_write(pgd_t pgd)
{
        BUG();
        return 0;
}
#endif

/* NOTE(review): presumably the file name used for anonymous hugetlb mappings - confirm. */
#define HUGETLB_ANON_FILE "anon_hugepage"

/*
 * Inode kinds.  NOTE(review): presumably the values accepted as creat_flags
 * by hugetlb_file_setup() - confirm against the callers.
 */
enum {
        /*
         * The file will be used as an shm file so shmfs accounting rules
         * apply
         */
        HUGETLB_SHMFS_INODE     = 1,
        /*
         * The file is being created on the internal vfs mount and shmfs
         * accounting rules do not apply
         */
        HUGETLB_ANONHUGE_INODE  = 2,
};

#ifdef CONFIG_HUGETLBFS
/* Per-superblock hugetlbfs state, reached through sb->s_fs_info (see HUGETLBFS_SB()). */
struct hugetlbfs_sb_info {
        long        max_inodes;   /* inodes allowed */
        long        free_inodes;  /* inodes free */
        spinlock_t        stat_lock;    /* NOTE(review): presumably guards the inode counts - confirm */
        struct hstate *hstate;  /* hstate of this mount; read by hstate_inode() */
        struct hugepage_subpool *spool;
        kuid_t        uid;
        kgid_t        gid;
        umode_t mode;
};

/* Return the hugetlbfs private data stored in @sb->s_fs_info. */
static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
{
        return sb->s_fs_info;
}

/* hugetlbfs inode: embeds the VFS inode so HUGETLBFS_I() can use container_of(). */
struct hugetlbfs_inode_info {
        struct inode vfs_inode;
        unsigned int seals;     /* NOTE(review): presumably file-sealing flags - confirm */
};

/*
 * Map a VFS inode back to its enclosing hugetlbfs_inode_info.  Only valid
 * for inodes embedded as hugetlbfs_inode_info.vfs_inode.
 */
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
{
        return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
}

extern const struct vm_operations_struct hugetlb_vm_ops;
/*
 * Returns a struct file pointer.  The !CONFIG_HUGETLBFS stub returns
 * ERR_PTR(-ENOSYS), so callers presumably must check with IS_ERR().
 */
struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
                                int creat_flags, int page_size_log);

/* True when @file's file_operations carry the FOP_HUGE_PAGES flag. */
static inline bool is_file_hugepages(const struct file *file)
{
        return (file->f_op->fop_flags & FOP_HUGE_PAGES) != 0;
}

/* Return the hstate recorded in the hugetlbfs superblock info of @i's superblock. */
static inline struct hstate *hstate_inode(struct inode *i)
{
        return HUGETLBFS_SB(i->i_sb)->hstate;
}
#else /* !CONFIG_HUGETLBFS */

/* !CONFIG_HUGETLBFS: constant false; the argument is not evaluated. */
#define is_file_hugepages(file)                        false
/* !CONFIG_HUGETLBFS stub: always fails with ERR_PTR(-ENOSYS). */
static inline struct file *
hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
                int creat_flags, int page_size_log)
{
        return ERR_PTR(-ENOSYS);
}

/* !CONFIG_HUGETLBFS stub: always returns NULL. */
static inline struct hstate *hstate_inode(struct inode *i)
{
        return NULL;
}
#endif /* !CONFIG_HUGETLBFS */

#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
/* Declared only when HAVE_ARCH_HUGETLB_UNMAPPED_AREA is defined. */
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags);
#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */

/* Same parameter list as hugetlb_get_unmapped_area(); declared unconditionally. */
unsigned long
generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                  unsigned long len, unsigned long pgoff,
                                  unsigned long flags);

/*
 * hugetlb page specific state flags.  These flags are located in page.private
 * of the hugetlb head page.  Functions created via the below macros should be
 * used to manipulate these flags.
 *
 * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at
 *        allocation time.  Cleared when page is fully instantiated.  Free
 *        routine checks flag to restore a reservation on error paths.
 *        Synchronization:  Examined or modified by code that knows it has
 *        the only reference to page.  i.e. After allocation but before use
 *        or when the page is being freed.
 * HPG_migratable  - Set after a newly allocated page is added to the page
 *        cache and/or page tables.  Indicates the page is a candidate for
 *        migration.
 *        Synchronization:  Initially set after new page allocation with no
 *        locking.  When examined and modified during migration processing
 *        (isolate, migrate, putback) the hugetlb_lock is held.
 * HPG_temporary - Set on a page that is temporarily allocated from the buddy
 *        allocator.  Typically used for migration target pages when no pages
 *        are available in the pool.  The hugetlb free page path will
 *        immediately free pages with this flag set to the buddy allocator.
 *        Synchronization: Can be set after huge page allocation from buddy when
 *        code knows it has only reference.  All other examinations and
 *        modifications require hugetlb_lock.
 * HPG_freed - Set when page is on the free lists.
 *        Synchronization: hugetlb_lock held for examination and modification.
 * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed.
 * HPG_raw_hwp_unreliable - Set when the hugetlb page has a hwpoison sub-page
 *     that is not tracked by raw_hwp_page list.
 */
/*
 * Each enumerator is a bit number: the accessors generated by HPAGEFLAG()
 * pass it to test_bit()/set_bit()/clear_bit() on the private word.
 */
enum hugetlb_page_flags {
        HPG_restore_reserve = 0,
        HPG_migratable,
        HPG_temporary,
        HPG_freed,
        HPG_vmemmap_optimized,
        HPG_raw_hwp_unreliable,
        __NR_HPAGEFLAGS,        /* number of flags defined above; keep last */
};

/*
 * Macros to create test, set and clear function definitions for
 * hugetlb specific page flags.
 */
#ifdef CONFIG_HUGETLB_PAGE
/*
 * TESTHPAGEFLAG(uname, flname) emits folio_test_hugetlb_<flname>() and
 * HPage<uname>(); both test_bit() HPG_<flname> in the private word.
 */
#define TESTHPAGEFLAG(uname, flname)                                \
static __always_inline                                                \
bool folio_test_hugetlb_##flname(struct folio *folio)                \
        {        void *private = &folio->private;                \
                return test_bit(HPG_##flname, private);                \
        }                                                        \
static inline int HPage##uname(struct page *page)                \
        { return test_bit(HPG_##flname, &(page->private)); }

/*
 * SETHPAGEFLAG(uname, flname) emits folio_set_hugetlb_<flname>() and
 * SetHPage<uname>(); both set_bit() HPG_<flname> in the private word.
 */
#define SETHPAGEFLAG(uname, flname)                                \
static __always_inline                                                \
void folio_set_hugetlb_##flname(struct folio *folio)                \
        {        void *private = &folio->private;                \
                set_bit(HPG_##flname, private);                        \
        }                                                        \
static inline void SetHPage##uname(struct page *page)                \
        { set_bit(HPG_##flname, &(page->private)); }

/*
 * CLEARHPAGEFLAG(uname, flname) emits folio_clear_hugetlb_<flname>() and
 * ClearHPage<uname>(); both clear_bit() HPG_<flname> in the private word.
 */
#define CLEARHPAGEFLAG(uname, flname)                                \
static __always_inline                                                \
void folio_clear_hugetlb_##flname(struct folio *folio)                \
        {        void *private = &folio->private;                \
                clear_bit(HPG_##flname, private);                \
        }                                                        \
static inline void ClearHPage##uname(struct page *page)                \
        { clear_bit(HPG_##flname, &(page->private)); }
#else
/*
 * Without CONFIG_HUGETLB_PAGE the same function names are emitted, but the
 * test helpers return 0 and the set/clear helpers are empty.
 */
#define TESTHPAGEFLAG(uname, flname)                                \
static inline bool                                                \
folio_test_hugetlb_##flname(struct folio *folio)                \
        { return 0; }                                                \
static inline int HPage##uname(struct page *page)                \
        { return 0; }

#define SETHPAGEFLAG(uname, flname)                                \
static inline void                                                \
folio_set_hugetlb_##flname(struct folio *folio)                 \
        { }                                                        \
static inline void SetHPage##uname(struct page *page)                \
        { }

#define CLEARHPAGEFLAG(uname, flname)                                \
static inline void                                                \
folio_clear_hugetlb_##flname(struct folio *folio)                \
        { }                                                        \
static inline void ClearHPage##uname(struct page *page)                \
        { }
#endif

/*
 * HPAGEFLAG(uname, flname) expands to the test, set and clear accessors for
 * one flag.  The last line ends in a continuation backslash, so the line
 * that follows the macro must stay blank.
 */
#define HPAGEFLAG(uname, flname)                                \
        TESTHPAGEFLAG(uname, flname)                                \
        SETHPAGEFLAG(uname, flname)                                \
        CLEARHPAGEFLAG(uname, flname)                                \

/*
 * Create functions associated with hugetlb page flags
 */
/*
 * Each line generates folio_{test,set,clear}_hugetlb_<flname>() plus the
 * page-based HPage<uname>(), SetHPage<uname>() and ClearHPage<uname>().
 */
HPAGEFLAG(RestoreReserve, restore_reserve)
HPAGEFLAG(Migratable, migratable)
HPAGEFLAG(Temporary, temporary)
HPAGEFLAG(Freed, freed)
HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable)

#ifdef CONFIG_HUGETLB_PAGE

/* Size in bytes of hstate.name. */
#define HSTATE_NAME_LEN 32
/* Defines one hugetlb page size */
struct hstate {
        struct mutex resize_lock;
        int next_nid_to_alloc;
        int next_nid_to_free;
        unsigned int order;             /* huge page size is PAGE_SIZE << order (huge_page_size()) */
        unsigned int demote_order;
        unsigned long mask;             /* returned by huge_page_mask() */
        unsigned long max_huge_pages;
        unsigned long nr_huge_pages;
        unsigned long free_huge_pages;
        unsigned long resv_huge_pages;
        unsigned long surplus_huge_pages;
        unsigned long nr_overcommit_huge_pages;
        struct list_head hugepage_activelist;
        /* The arrays below have one slot per possible NUMA node. */
        struct list_head hugepage_freelists[MAX_NUMNODES];
        unsigned int max_huge_pages_node[MAX_NUMNODES];
        unsigned int nr_huge_pages_node[MAX_NUMNODES];
        unsigned int free_huge_pages_node[MAX_NUMNODES];
        unsigned int surplus_huge_pages_node[MAX_NUMNODES];
#ifdef CONFIG_CGROUP_HUGETLB
        /* cgroup control files */
        struct cftype cgroup_files_dfl[8];
        struct cftype cgroup_files_legacy[10];
#endif
        char name[HSTATE_NAME_LEN];
};

/*
 * List node pairing a boot-time huge page with its hstate.
 * NOTE(review): presumably linked on huge_boot_pages[] - confirm.
 */
struct huge_bootmem_page {
        struct list_head list;
        struct hstate *hstate;
};

/*
 * Allocation-side API.  The !CONFIG_HUGETLB_PAGE branch below provides
 * stubs for isolate_or_dissolve_huge_page(), alloc_hugetlb_folio() and
 * alloc_hugetlb_folio_nodemask().
 */
int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
                                unsigned long addr, int avoid_reserve);
struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
                                nodemask_t *nmask, gfp_t gfp_mask,
                                bool allow_alloc_fallback);
int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
                        pgoff_t idx);
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
                                unsigned long address, struct folio *folio);

/* arch callback */
int __init __alloc_bootmem_huge_page(struct hstate *h, int nid);
int __init alloc_bootmem_huge_page(struct hstate *h, int nid);
bool __init hugetlb_node_alloc_supported(void);

/* __init-only hstate registration and size validation. */
void __init hugetlb_add_hstate(unsigned order);
bool __init arch_hugetlb_valid_size(unsigned long size);
/* Look up the hstate for a size; used by hstate_sizelog() and folio_hstate(). */
struct hstate *size_to_hstate(unsigned long size);

/* Defaults to a single hstate unless HUGE_MAX_HSTATE is already defined. */
#ifndef HUGE_MAX_HSTATE
#define HUGE_MAX_HSTATE 1
#endif

extern struct hstate hstates[HUGE_MAX_HSTATE];
extern unsigned int default_hstate_idx;

/* The hstates[] element selected by default_hstate_idx (an lvalue). */
#define default_hstate (hstates[default_hstate_idx])

/* Read the subpool pointer stored in @folio->_hugetlb_subpool. */
static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio)
{
        return folio->_hugetlb_subpool;
}

/* Store @subpool in @folio->_hugetlb_subpool. */
static inline void hugetlb_set_folio_subpool(struct folio *folio,
                                        struct hugepage_subpool *subpool)
{
        folio->_hugetlb_subpool = subpool;
}

/* hstate of the inode backing @f (via file_inode() and hstate_inode()). */
static inline struct hstate *hstate_file(struct file *f)
{
        return hstate_inode(file_inode(f));
}

/*
 * hstate_sizelog() - map a log2 page size to its hstate.
 * @page_size_log: log2 of the requested huge page size, or 0 to select the
 *                 default hstate.
 *
 * Returns &default_hstate for 0, whatever size_to_hstate() returns for a
 * value in (0, BITS_PER_LONG), and NULL for anything else.  Negative values
 * are rejected explicitly because shifting by a negative count is undefined
 * behaviour in C.
 */
static inline struct hstate *hstate_sizelog(int page_size_log)
{
        if (!page_size_log)
                return &default_hstate;

        /* Keep the shift count inside [1, BITS_PER_LONG - 1]. */
        if (page_size_log < 0 || page_size_log >= BITS_PER_LONG)
                return NULL;

        return size_to_hstate(1UL << page_size_log);
}

/* hstate of the file backing @vma; dereferences vma->vm_file via hstate_file(). */
static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
{
        return hstate_file(vma->vm_file);
}

/* Huge page size in bytes: PAGE_SIZE << order, computed as unsigned long. */
static inline unsigned long huge_page_size(const struct hstate *h)
{
        return (unsigned long)PAGE_SIZE << h->order;
}

/* Page-size queries for a VMA; both are defined outside this header. */
extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma);

extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma);

/* Accessor for h->mask. */
static inline unsigned long huge_page_mask(struct hstate *h)
{
        return h->mask;
}

/* Accessor for h->order. */
static inline unsigned int huge_page_order(struct hstate *h)
{
        return h->order;
}

/* log2 of the huge page size in bytes: order + PAGE_SHIFT. */
static inline unsigned huge_page_shift(struct hstate *h)
{
        return h->order + PAGE_SHIFT;
}

/* A huge page is "gigantic" when its order exceeds MAX_PAGE_ORDER. */
static inline bool hstate_is_gigantic(struct hstate *h)
{
        return huge_page_order(h) > MAX_PAGE_ORDER;
}

/*
 * Number of base pages in one huge page of @h: 2^order.
 *
 * The shift is done on an unsigned constant so that the expression has the
 * function's return type and never overflows a signed int.
 */
static inline unsigned int pages_per_huge_page(const struct hstate *h)
{
        return 1U << h->order;
}

/* Huge page size expressed in 512-byte units. */
static inline unsigned int blocks_per_huge_page(struct hstate *h)
{
        return huge_page_size(h) / 512;
}

/*
 * filemap_lock_folio() wrapper taking a huge page index: @idx is shifted
 * left by the hstate order before the lookup.
 */
static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
                                struct address_space *mapping, pgoff_t idx)
{
        return filemap_lock_folio(mapping, idx << huge_page_order(h));
}

#include <asm/hugetlb.h>

/*
 * Fallback used when <asm/hugetlb.h> did not define is_hugepage_only_range:
 * always returns 0.
 */
#ifndef is_hugepage_only_range
static inline int is_hugepage_only_range(struct mm_struct *mm,
                                        unsigned long addr, unsigned long len)
{
        return 0;
}
#define is_hugepage_only_range is_hugepage_only_range
#endif

/* Fallback used when no arch_clear_hugetlb_flags macro is defined: no-op. */
#ifndef arch_clear_hugetlb_flags
static inline void arch_clear_hugetlb_flags(struct folio *folio) { }
#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
#endif

/*
 * Fallback used when no arch_make_huge_pte macro is defined: applies
 * pte_mkhuge() and ignores @shift and @flags.
 */
#ifndef arch_make_huge_pte
static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift,
                                       vm_flags_t flags)
{
        return pte_mkhuge(entry);
}
#endif

/*
 * hstate matching the size of @folio.  VM_BUG_ON_FOLIO() checks that the
 * folio is a hugetlb folio.
 */
static inline struct hstate *folio_hstate(struct folio *folio)
{
        VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        return size_to_hstate(folio_size(folio));
}

/* Page shift of hstates[@index]; @index is not range-checked here. */
static inline unsigned hstate_index_to_shift(unsigned index)
{
        return hstates[index].order + PAGE_SHIFT;
}

/* Index of @h within hstates[]; @h must point into that array. */
static inline int hstate_index(struct hstate *h)
{
        return h - hstates;
}

/* Dissolve one free hugetlb folio, or those within a [start_pfn, end_pfn) range (presumably half-open - confirm). */
int dissolve_free_hugetlb_folio(struct folio *folio);
int dissolve_free_hugetlb_folios(unsigned long start_pfn,
                                    unsigned long end_pfn);

/* Real implementation with CONFIG_MEMORY_FAILURE, empty stub otherwise. */
#ifdef CONFIG_MEMORY_FAILURE
extern void folio_clear_hugetlb_hwpoison(struct folio *folio);
#else
static inline void folio_clear_hugetlb_hwpoison(struct folio *folio)
{
}
#endif

#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
#ifndef arch_hugetlb_migration_supported
/*
 * Default policy: migration is supported only for huge pages whose shift
 * equals PMD_SHIFT, PUD_SHIFT or PGDIR_SHIFT.
 */
static inline bool arch_hugetlb_migration_supported(struct hstate *h)
{
        const unsigned int shift = huge_page_shift(h);

        return shift == PMD_SHIFT || shift == PUD_SHIFT ||
               shift == PGDIR_SHIFT;
}
#endif
#else
/* !CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION: migration is never supported. */
static inline bool arch_hugetlb_migration_supported(struct hstate *h)
{
        return false;
}
#endif

/* Defers entirely to arch_hugetlb_migration_supported(). */
static inline bool hugepage_migration_supported(struct hstate *h)
{
        return arch_hugetlb_migration_supported(h);
}

/*
 * Movability check is different as compared to migration check.
 * It determines whether or not a huge page should be placed on
 * movable zone or not. Movability of any huge page should be
 * required only if huge page size is supported for migration.
 * There won't be any reason for the huge page to be movable if
 * it is not migratable to start with. Also the size of the huge
 * page should be large enough to be placed under a movable zone
 * and still feasible enough to be migratable. Just the presence
 * in movable zone does not make the migration feasible.
 *
 * So even though large huge page sizes like the gigantic ones
 * are migratable they should not be movable because it's not
 * feasible to migrate them from movable zone.
 */
/* Movable means: migration is supported and the page is not gigantic. */
static inline bool hugepage_movable_supported(struct hstate *h)
{
        return hugepage_migration_supported(h) && !hstate_is_gigantic(h);
}

/* Movability of hugepages depends on migration support. */
/*
 * Base GFP mask for allocating a huge page of @h: GFP_HIGHUSER_MOVABLE when
 * hugepage_movable_supported() says so, GFP_HIGHUSER otherwise.
 */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
        return hugepage_movable_supported(h) ? GFP_HIGHUSER_MOVABLE
                                             : GFP_HIGHUSER;
}

/*
 * Build the allocation mask for @h, carrying over only __GFP_THISNODE (some
 * callers want to enforce the node) and __GFP_NOWARN from @gfp_mask.
 */
static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
{
        const gfp_t kept = gfp_mask & (__GFP_THISNODE | __GFP_NOWARN);

        return htlb_alloc_mask(h) | kept;
}

/*
 * Tell whether an allocation made for migration @reason may fall back to
 * other nodes.
 */
static inline bool htlb_allow_alloc_fallback(int reason)
{
        /*
         * Note: the memory offline, memory failure and migration syscalls will
         * be allowed to fallback to other nodes due to lack of a better choice,
         * that might break the per-node hugetlb pool. While other cases will
         * set the __GFP_THISNODE to avoid breaking the per-node hugetlb pool.
         */
        switch (reason) {
        case MR_MEMORY_HOTPLUG:
        case MR_MEMORY_FAILURE:
        case MR_SYSCALL:
        case MR_MEMPOLICY_MBIND:
                return true;
        default:
                return false;
        }
}

/*
 * Pick the spinlock protecting huge PTE @pte: PMD-sized huge pages use
 * pmd_lockptr(), every other size uses mm->page_table_lock.  A huge page
 * size equal to PAGE_SIZE trips VM_BUG_ON().
 */
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
                                           struct mm_struct *mm, pte_t *pte)
{
        const unsigned long sz = huge_page_size(h);

        if (sz != PMD_SIZE) {
                VM_BUG_ON(sz == PAGE_SIZE);
                return &mm->page_table_lock;
        }

        return pmd_lockptr(mm, (pmd_t *) pte);
}

#ifndef hugepages_supported
/*
 * Some platforms decide at boot time whether they support huge pages.
 * Some of them, such as powerpc, set HPAGE_SHIFT to 0 when there is no
 * such support.
 */
#define hugepages_supported() (HPAGE_SHIFT != 0)
#endif

/* Report hugetlb usage of @mm into @m; presumably based on mm->hugetlb_usage. */
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm);

/* Reset the atomic mm->hugetlb_usage counter to 0. */
static inline void hugetlb_count_init(struct mm_struct *mm)
{
        atomic_long_set(&mm->hugetlb_usage, 0);
}

/* Atomically add @l to mm->hugetlb_usage. */
static inline void hugetlb_count_add(long l, struct mm_struct *mm)
{
        atomic_long_add(l, &mm->hugetlb_usage);
}

/* Atomically subtract @l from mm->hugetlb_usage. */
static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
{
        atomic_long_sub(l, &mm->hugetlb_usage);
}

#ifndef huge_ptep_modify_prot_start
#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start
/*
 * Generic fallback: fetch and clear the huge PTE with
 * huge_ptep_get_and_clear() and return the value obtained.
 */
static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
                                                unsigned long addr, pte_t *ptep)
{
        return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
}
#endif

#ifndef huge_ptep_modify_prot_commit
#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit
/*
 * Fallback used when no other definition of huge_ptep_modify_prot_commit
 * exists: install @pte at @addr in @vma's mm, passing the huge page size of
 * @vma's hstate to set_huge_pte_at().  @old_pte is not used here.
 */
static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
                                                unsigned long addr, pte_t *ptep,
                                                pte_t old_pte, pte_t pte)
{
        struct hstate *h = hstate_vma(vma);

        set_huge_pte_at(vma->vm_mm, addr, ptep, pte, huge_page_size(h));
}
#endif

#ifdef CONFIG_NUMA
void hugetlb_register_node(struct node *node);
void hugetlb_unregister_node(struct node *node);
#endif

/*
 * Check if a given raw @page in a hugepage is HWPOISON.
 */
bool is_raw_hwpoison_page_in_hugepage(struct page *page);

#else        /* CONFIG_HUGETLB_PAGE */
/* Empty placeholder definition for builds without CONFIG_HUGETLB_PAGE. */
struct hstate {};

/* !CONFIG_HUGETLB_PAGE stub: always returns NULL. */
static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: no folio is looked up, always returns NULL. */
static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
                                struct address_space *mapping, pgoff_t idx)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: does nothing and always returns -ENOMEM. */
static inline int isolate_or_dissolve_huge_page(struct page *page,
                                                struct list_head *list)
{
        return -ENOMEM;
}

/* !CONFIG_HUGETLB_PAGE stub: allocates nothing, always returns NULL. */
static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           int avoid_reserve)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: allocates nothing, always returns NULL. */
static inline struct folio *
alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
                        nodemask_t *nmask, gfp_t gfp_mask,
                        bool allow_alloc_fallback)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: allocates nothing, always returns 0. */
static inline int __alloc_bootmem_huge_page(struct hstate *h)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: no hstate is associated with @f, returns NULL. */
static inline struct hstate *hstate_file(struct file *f)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: @page_size_log is ignored, returns NULL. */
static inline struct hstate *hstate_sizelog(int page_size_log)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: @vma is ignored, returns NULL. */
static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: @folio is ignored, returns NULL. */
static inline struct hstate *folio_hstate(struct folio *folio)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: no size maps to an hstate, returns NULL. */
static inline struct hstate *size_to_hstate(unsigned long size)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: reports PAGE_SIZE regardless of @h. */
static inline unsigned long huge_page_size(struct hstate *h)
{
        return PAGE_SIZE;
}

/* !CONFIG_HUGETLB_PAGE stub: reports PAGE_MASK regardless of @h. */
static inline unsigned long huge_page_mask(struct hstate *h)
{
        return PAGE_MASK;
}

/* !CONFIG_HUGETLB_PAGE stub: reports PAGE_SIZE regardless of @vma. */
static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
        return PAGE_SIZE;
}

/* !CONFIG_HUGETLB_PAGE stub: reports PAGE_SIZE regardless of @vma. */
static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
        return PAGE_SIZE;
}

/* !CONFIG_HUGETLB_PAGE stub: reports order 0 regardless of @h. */
static inline unsigned int huge_page_order(struct hstate *h)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: reports PAGE_SHIFT regardless of @h. */
static inline unsigned int huge_page_shift(struct hstate *h)
{
        return PAGE_SHIFT;
}

/* !CONFIG_HUGETLB_PAGE stub: always false. */
static inline bool hstate_is_gigantic(struct hstate *h)
{
        return false;
}

/* !CONFIG_HUGETLB_PAGE stub: reports 1 page regardless of @h. */
static inline unsigned int pages_per_huge_page(struct hstate *h)
{
        return 1;
}

/* !CONFIG_HUGETLB_PAGE stub: @index is ignored, always returns 0. */
static inline unsigned hstate_index_to_shift(unsigned index)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: @h is ignored, always returns 0. */
static inline int hstate_index(struct hstate *h)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: nothing to dissolve, always returns 0. */
static inline int dissolve_free_hugetlb_folio(struct folio *folio)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: the pfn range is ignored, always returns 0. */
static inline int dissolve_free_hugetlb_folios(unsigned long start_pfn,
                                           unsigned long end_pfn)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: always false. */
static inline bool hugepage_migration_supported(struct hstate *h)
{
        return false;
}

/* !CONFIG_HUGETLB_PAGE stub: always false. */
static inline bool hugepage_movable_supported(struct hstate *h)
{
        return false;
}

/* !CONFIG_HUGETLB_PAGE stub: returns an empty (0) gfp mask. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: @gfp_mask is discarded, returns 0. */
static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: fallback is never allowed, whatever @reason. */
static inline bool htlb_allow_alloc_fallback(int reason)
{
        return false;
}

/*
 * !CONFIG_HUGETLB_PAGE stub: @h and @pte are ignored; the lock is always
 * @mm's page_table_lock.
 */
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
                                           struct mm_struct *mm, pte_t *pte)
{
        return &mm->page_table_lock;
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void hugetlb_count_init(struct mm_struct *mm)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no-op, nothing is written to @f. */
static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
{
}

/*
 * !CONFIG_HUGETLB_PAGE stub: neither clears nor flushes anything, it only
 * returns the current value at @ptep (read through ptep_get() when
 * CONFIG_MMU is set, by plain dereference otherwise).
 */
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
                                          unsigned long addr, pte_t *ptep)
{
#ifdef CONFIG_MMU
        return ptep_get(ptep);
#else
        return *ptep;
#endif
}

/* !CONFIG_HUGETLB_PAGE stub: no-op, @ptep is left untouched. */
static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                                   pte_t *ptep, pte_t pte, unsigned long sz)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void hugetlb_register_node(struct node *node)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void hugetlb_unregister_node(struct node *node)
{
}

/* !CONFIG_HUGETLB_PAGE stub: always false. */
static inline bool hugetlbfs_pagecache_present(
    struct hstate *h, struct vm_area_struct *vma, unsigned long address)
{
        return false;
}
#endif        /* CONFIG_HUGETLB_PAGE */

/*
 * Look up the lock for the huge PTE @pte with huge_pte_lockptr(), take it
 * with spin_lock() and hand it back so the caller can release it later.
 */
static inline spinlock_t *huge_pte_lock(struct hstate *h,
                                        struct mm_struct *mm, pte_t *pte)
{
        spinlock_t *lock = huge_pte_lockptr(h, mm, pte);

        spin_lock(lock);

        return lock;
}

#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
extern void __init hugetlb_cma_reserve(int order);
#else
/*
 * Stub used unless both CONFIG_HUGETLB_PAGE and CONFIG_CMA are enabled:
 * no-op, @order is ignored.
 */
static inline __init void hugetlb_cma_reserve(int order)
{
}
#endif

#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
/*
 * CONFIG_ARCH_WANT_HUGE_PMD_SHARE variant: returns true when the page that
 * contains @pte has a page_count() greater than one.
 */
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
        return page_count(virt_to_page(pte)) > 1;
}
#else
/* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE stub: always false. */
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
        return false;
}
#endif

bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);

#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
/*
 * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
 * implement this.
 */
#define flush_hugetlb_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
#endif

static inline bool __vma_shareable_lock(struct vm_area_struct *vma)
{
        return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data;
}

bool __vma_private_lock(struct vm_area_struct *vma);

/*
 * Safe version of huge_pte_offset() to check the locks.  See comments
 * above huge_pte_offset().
 *
 * Returns whatever huge_pte_offset() returns for @addr / @sz in @vma's mm.
 * The lock check is only compiled in when CONFIG_HUGETLB_PAGE,
 * CONFIG_ARCH_WANT_HUGE_PMD_SHARE and CONFIG_LOCKDEP are all enabled;
 * otherwise this is a plain wrapper.
 */
static inline pte_t *
hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
{
#if defined(CONFIG_HUGETLB_PAGE) && \
        defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP)
        struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;

        /*
         * If pmd sharing possible, locking needed to safely walk the
         * hugetlb pgtables.  More information can be found at the comment
         * above huge_pte_offset() in the same file.
         *
         * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP.
         */
        /* Warn once if neither the vma lock nor i_mmap_rwsem is held. */
        if (__vma_shareable_lock(vma))
                WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) &&
                             !lockdep_is_held(
                                 &vma->vm_file->f_mapping->i_mmap_rwsem));
#endif
        return huge_pte_offset(vma->vm_mm, addr, sz);
}

#endif /* _LINUX_HUGETLB_H */






























































    4 




















    4 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef IOPRIO_H
#define IOPRIO_H

#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/iocontext.h>

#include <uapi/linux/ioprio.h>

/*
 * Default IO priority.
 */
#define IOPRIO_DEFAULT        IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0)

/*
 * Check that a priority value has a valid class.
 */
static inline bool ioprio_valid(unsigned short ioprio)
{
        unsigned short class = IOPRIO_PRIO_CLASS(ioprio);

        return class > IOPRIO_CLASS_NONE && class <= IOPRIO_CLASS_IDLE;
}

/*
 * Convert the CPU scheduler nice value of @task to an I/O priority level,
 * computed as (nice + 20) / 5.  Whether the task has set an I/O priority
 * explicitly is not checked here; that is up to the caller.
 */
static inline int task_nice_ioprio(struct task_struct *task)
{
        return (task_nice(task) + 20) / 5;
}

/*
 * Derive an I/O class for a task that has not asked for a specific one:
 * SCHED_IDLE tasks get IOPRIO_CLASS_IDLE, realtime tasks get
 * IOPRIO_CLASS_RT, and everything else gets IOPRIO_CLASS_BE.
 */
static inline int task_nice_ioclass(struct task_struct *task)
{
        if (task->policy == SCHED_IDLE)
                return IOPRIO_CLASS_IDLE;

        if (task_is_realtime(task))
                return IOPRIO_CLASS_RT;

        return IOPRIO_CLASS_BE;
}

#ifdef CONFIG_BLOCK
/*
 * Return the I/O priority of @p.
 *
 * A task without an io_context gets IOPRIO_DEFAULT.  Otherwise the value
 * stored in the io_context is used, unless its class is IOPRIO_CLASS_NONE,
 * in which case a priority is built from the task's scheduling policy and
 * nice value.
 *
 * Expected to be called for current task or with task_lock() held to keep
 * io_context stable.
 */
static inline int __get_task_ioprio(struct task_struct *p)
{
        struct io_context *ctx = p->io_context;
        int ioprio;

        if (!ctx)
                return IOPRIO_DEFAULT;

        if (p != current)
                lockdep_assert_held(&p->alloc_lock);

        /* Read the stored priority exactly once. */
        ioprio = ctx->ioprio;
        if (IOPRIO_PRIO_CLASS(ioprio) != IOPRIO_CLASS_NONE)
                return ioprio;

        return IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
                                 task_nice_ioprio(p));
}
#else
/* !CONFIG_BLOCK stub: every task reports IOPRIO_DEFAULT. */
static inline int __get_task_ioprio(struct task_struct *p)
{
        return IOPRIO_DEFAULT;
}
#endif /* CONFIG_BLOCK */

/* Return the I/O priority of the current task via __get_task_ioprio(). */
static inline int get_current_ioprio(void)
{
        return __get_task_ioprio(current);
}

extern int set_task_ioprio(struct task_struct *task, int ioprio);

#ifdef CONFIG_BLOCK
extern int ioprio_check_cap(int ioprio);
#else
/* !CONFIG_BLOCK stub: @ioprio is ignored, always returns -ENOTBLK. */
static inline int ioprio_check_cap(int ioprio)
{
        return -ENOTBLK;
}
#endif /* CONFIG_BLOCK */

#endif


















   12 



   12 




   11 








   11 













    5 




    5 















   11 


   11 








   10 
    5 















    2 


    2 
































    2 
    1 
















    4 


    4 











































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
// SPDX-License-Identifier: GPL-2.0
/*
 * Lockless hierarchical page accounting & limiting
 *
 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
 */

#include <linux/page_counter.h>
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/bug.h>
#include <asm/page.h>

/*
 * Recompute how much of @c's @usage is covered by its min and low settings
 * and apply any change to the parent's children_min_usage and
 * children_low_usage sums.  Does nothing for a counter without a parent.
 */
static void propagate_protected_usage(struct page_counter *c,
                                      unsigned long usage)
{
        unsigned long protected, old_protected;
        long delta;

        /* Without a parent there is nowhere to propagate to. */
        if (!c->parent)
                return;

        /* The min-protected share is the smaller of usage and c->min. */
        protected = min(usage, READ_ONCE(c->min));
        old_protected = atomic_long_read(&c->min_usage);
        if (protected != old_protected) {
                /*
                 * Install the new value; the delta is taken against the
                 * value the xchg actually replaced, not the earlier read.
                 */
                old_protected = atomic_long_xchg(&c->min_usage, protected);
                delta = protected - old_protected;
                if (delta)
                        atomic_long_add(delta, &c->parent->children_min_usage);
        }

        /* Same procedure for the low setting. */
        protected = min(usage, READ_ONCE(c->low));
        old_protected = atomic_long_read(&c->low_usage);
        if (protected != old_protected) {
                old_protected = atomic_long_xchg(&c->low_usage, protected);
                delta = protected - old_protected;
                if (delta)
                        atomic_long_add(delta, &c->parent->children_low_usage);
        }
}

/**
 * page_counter_cancel - take pages out of the local counter
 * @counter: counter
 * @nr_pages: number of pages to cancel
 *
 * Only @counter itself is adjusted; ancestors are not walked here.  If the
 * subtraction drives the usage below zero, a one-time warning is printed
 * and the usage is reset to 0.
 */
void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
{
        long new;

        new = atomic_long_sub_return(nr_pages, &counter->usage);
        /* More uncharges than charges? */
        if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n",
                      new, nr_pages)) {
                /* Clamp to zero rather than keeping a negative usage. */
                new = 0;
                atomic_long_set(&counter->usage, new);
        }
        /* Keep the protection accounting in step with the new usage. */
        propagate_protected_usage(counter, new);
}

/**
 * page_counter_charge - hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 *
 * Adds @nr_pages to @counter and to every ancestor reached through
 * ->parent, refreshing the protection accounting and the usage watermark
 * at each level.
 *
 * NOTE: This does not consider any configured counter limits.
 */
void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *level = counter;

        while (level) {
                long usage = atomic_long_add_return(nr_pages, &level->usage);

                propagate_protected_usage(level, usage);

                /*
                 * The watermark update is racy on purpose; a slightly
                 * inaccurate watermark is acceptable.
                 */
                if (usage > READ_ONCE(level->watermark))
                        WRITE_ONCE(level->watermark, usage);

                level = level->parent;
        }
}

/**
 * page_counter_try_charge - try to hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 * @fail: points to the first counter to hit its limit, if any
 *
 * Returns %true on success, or %false and @fail if the counter or one
 * of its ancestors has hit its configured limit.  On failure, every level
 * that was already charged is rolled back before returning.
 */
bool page_counter_try_charge(struct page_counter *counter,
                             unsigned long nr_pages,
                             struct page_counter **fail)
{
        struct page_counter *c;

        for (c = counter; c; c = c->parent) {
                long new;
                /*
                 * Charge speculatively to avoid an expensive CAS.  If
                 * a bigger charge fails, it might falsely lock out a
                 * racing smaller charge and send it into reclaim
                 * early, but the error is limited to the difference
                 * between the two sizes, which is less than 2M/4M in
                 * case of a THP locking out a regular page charge.
                 *
                 * The atomic_long_add_return() implies a full memory
                 * barrier between incrementing the count and reading
                 * the limit.  When racing with page_counter_set_max(),
                 * we either see the new limit or the setter sees the
                 * counter has changed and retries.
                 */
                new = atomic_long_add_return(nr_pages, &c->usage);
                if (new > c->max) {
                        /* Undo the speculative charge on this level. */
                        atomic_long_sub(nr_pages, &c->usage);
                        /*
                         * This is racy, but we can live with some
                         * inaccuracy in the failcnt which is only used
                         * to report stats.
                         */
                        data_race(c->failcnt++);
                        *fail = c;
                        goto failed;
                }
                propagate_protected_usage(c, new);
                /*
                 * Just like with failcnt, we can live with some
                 * inaccuracy in the watermark.
                 */
                if (new > READ_ONCE(c->watermark))
                        WRITE_ONCE(c->watermark, new);
        }
        return true;

failed:
        /* Roll back the levels below *fail that were charged successfully. */
        for (c = counter; c != *fail; c = c->parent)
                page_counter_cancel(c, nr_pages);

        return false;
}

/**
 * page_counter_uncharge - hierarchically uncharge pages
 * @counter: counter
 * @nr_pages: number of pages to uncharge
 *
 * Cancels @nr_pages on @counter and on each ancestor reached via ->parent.
 */
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *level = counter;

        while (level) {
                page_counter_cancel(level, nr_pages);
                level = level->parent;
        }
}

/**
 * page_counter_set_max - set the maximum number of pages allowed
 * @counter: counter
 * @nr_pages: limit to set
 *
 * Returns 0 on success, -EBUSY if the current number of pages on the
 * counter already exceeds the specified limit.
 *
 * The caller must serialize invocations on the same counter.
 */
int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
{
        for (;;) {
                unsigned long old;
                long usage;

                /*
                 * Update the limit while making sure that it's not
                 * below the concurrently-changing counter value.
                 *
                 * The xchg implies two full memory barriers before
                 * and after, so the read-swap-read is ordered and
                 * ensures coherency with page_counter_try_charge():
                 * that function modifies the count before checking
                 * the limit, so if it sees the old limit, we see the
                 * modified counter and retry.
                 */
                usage = page_counter_read(counter);

                /* Refuse a limit below what is already charged. */
                if (usage > nr_pages)
                        return -EBUSY;

                old = xchg(&counter->max, nr_pages);

                /*
                 * Done if the usage did not grow across the swap, or if
                 * the limit was not lowered.
                 */
                if (page_counter_read(counter) <= usage || nr_pages >= old)
                        return 0;

                /* Usage grew meanwhile: restore the old limit and retry. */
                counter->max = old;
                cond_resched();
        }
}

/**
 * page_counter_set_min - set the min protection value
 * @counter: counter
 * @nr_pages: value to set
 *
 * Stores @nr_pages in @counter->min, then re-propagates the protected
 * usage for @counter and each of its ancestors.
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *level = counter;

        WRITE_ONCE(counter->min, nr_pages);

        while (level) {
                propagate_protected_usage(level,
                                          atomic_long_read(&level->usage));
                level = level->parent;
        }
}

/**
 * page_counter_set_low - set the low protection value
 * @counter: counter
 * @nr_pages: value to set
 *
 * Stores @nr_pages in @counter->low, then re-propagates the protected
 * usage for @counter and each of its ancestors.
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *level = counter;

        WRITE_ONCE(counter->low, nr_pages);

        while (level) {
                propagate_protected_usage(level,
                                          atomic_long_read(&level->usage));
                level = level->parent;
        }
}

/**
 * page_counter_memparse - memparse() for page counter limits
 * @buf: string to parse
 * @max: string meaning maximum possible value
 * @nr_pages: returns the result in number of pages
 *
 * A @buf equal to @max yields %PAGE_COUNTER_MAX.  Anything else is parsed
 * with memparse() as a byte count and converted to pages; trailing
 * characters after the parsed value are rejected.
 *
 * Returns -EINVAL, or 0 and @nr_pages on success.  @nr_pages will be
 * limited to %PAGE_COUNTER_MAX.
 */
int page_counter_memparse(const char *buf, const char *max,
                          unsigned long *nr_pages)
{
        char *tail;
        u64 nbytes;

        if (strcmp(buf, max) == 0) {
                *nr_pages = PAGE_COUNTER_MAX;
                return 0;
        }

        nbytes = memparse(buf, &tail);
        if (*tail)
                return -EINVAL;

        *nr_pages = min(nbytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
        return 0;
}












































    3 


























    1 




    3 



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NF_CONNTRACK_TIMEOUT_H
#define _NF_CONNTRACK_TIMEOUT_H

#include <net/net_namespace.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_conntrack_tuple_common.h>
#include <linux/refcount.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_extend.h>

#define CTNL_TIMEOUT_NAME_MAX        32

/*
 * A timeout object: the l3 protocol number, a pointer to the l4 protocol
 * descriptor, and a trailing variable-length payload.  nf_ct_timeout_data()
 * hands the payload out as an array of unsigned int.
 */
struct nf_ct_timeout {
        __u16                        l3num;
        const struct nf_conntrack_l4proto *l4proto;
        char                        data[];
};

/* Timeout extension: an RCU-protected pointer to a struct nf_ct_timeout. */
struct nf_conn_timeout {
        struct nf_ct_timeout __rcu *timeout;
};

/*
 * Return the payload of the timeout object referenced by @t as an
 * unsigned int array, or NULL when no timeout object is attached.
 * Without CONFIG_NF_CONNTRACK_TIMEOUT this always returns NULL.
 */
static inline unsigned int *
nf_ct_timeout_data(const struct nf_conn_timeout *t)
{
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
        struct nf_ct_timeout *policy = rcu_dereference(t->timeout);

        return policy ? (unsigned int *)policy->data : NULL;
#else
        return NULL;
#endif
}

/*
 * Look up the NF_CT_EXT_TIMEOUT extension of @ct via nf_ct_ext_find().
 * Without CONFIG_NF_CONNTRACK_TIMEOUT this always returns NULL.
 */
static inline
struct nf_conn_timeout *nf_ct_timeout_find(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
        return nf_ct_ext_find(ct, NF_CT_EXT_TIMEOUT);
#else
        return NULL;
#endif
}

/*
 * Add an NF_CT_EXT_TIMEOUT extension to @ct and point it at @timeout.
 *
 * @gfp is passed through to nf_ct_ext_add().  Returns the new extension,
 * or NULL when nf_ct_ext_add() fails.  Without
 * CONFIG_NF_CONNTRACK_TIMEOUT this always returns NULL.
 */
static inline
struct nf_conn_timeout *nf_ct_timeout_ext_add(struct nf_conn *ct,
                                              struct nf_ct_timeout *timeout,
                                              gfp_t gfp)
{
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
        struct nf_conn_timeout *timeout_ext;

        timeout_ext = nf_ct_ext_add(ct, NF_CT_EXT_TIMEOUT, gfp);
        if (timeout_ext == NULL)
                return NULL;

        /* Publish the pointer so RCU readers see an initialized object. */
        rcu_assign_pointer(timeout_ext->timeout, timeout);

        return timeout_ext;
#else
        return NULL;
#endif
}

/*
 * Return the timeout array attached to @ct, or NULL when @ct carries no
 * timeout extension or CONFIG_NF_CONNTRACK_TIMEOUT is disabled.
 */
static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
        struct nf_conn_timeout *ext = nf_ct_timeout_find(ct);

        if (ext)
                return nf_ct_timeout_data(ext);
#endif
        return NULL;
}

#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout);
int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num,
                      const char *timeout_name);
void nf_ct_destroy_timeout(struct nf_conn *ct);
#else
/* !CONFIG_NF_CONNTRACK_TIMEOUT stub: always returns -EOPNOTSUPP. */
static inline int nf_ct_set_timeout(struct net *net, struct nf_conn *ct,
                                    u8 l3num, u8 l4num,
                                    const char *timeout_name)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_NF_CONNTRACK_TIMEOUT stub: no-op. */
static inline void nf_ct_destroy_timeout(struct nf_conn *ct)
{
}
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */

#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
/*
 * Callback table for timeout objects:
 * @timeout_find_get: takes a net namespace and a name, returns a
 *                    struct nf_ct_timeout pointer
 * @timeout_put:      takes a struct nf_ct_timeout pointer, returns nothing
 *
 * NOTE(review): the names suggest a lookup that takes a reference and the
 * matching release; confirm against the implementation.
 */
struct nf_ct_timeout_hooks {
        struct nf_ct_timeout *(*timeout_find_get)(struct net *net, const char *name);
        void (*timeout_put)(struct nf_ct_timeout *timeout);
};

extern const struct nf_ct_timeout_hooks __rcu *nf_ct_timeout_hook;
#endif

#endif /* _NF_CONNTRACK_TIMEOUT_H */




















































































































































    1 








    1 



































































































































































    1 


    1 









































    1 




    1 
    1 
    1 























    1 












    1 
    1 


























    1 












    1 
















    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/swap.h>
#include <linux/rmap.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>

#ifndef CONFIG_MMU_GATHER_NO_GATHER

/*
 * Advance @tlb->active to the next batch, allocating a fresh one if the
 * chain has no successor yet.
 *
 * Returns true when a batch is available as the new active batch, false
 * when batching is limited by pending delayed rmaps, the batch count has
 * reached MAX_GATHER_BATCH_COUNT, or the page allocation failed.
 */
static bool tlb_next_batch(struct mmu_gather *tlb)
{
        struct mmu_gather_batch *cur, *fresh;

        /* Limit batching if we have delayed rmaps pending */
        if (tlb->delayed_rmap && tlb->active != &tlb->local)
                return false;

        /* Reuse an already chained batch when there is one. */
        cur = tlb->active;
        if (cur->next) {
                tlb->active = cur->next;
                return true;
        }

        if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
                return false;

        fresh = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
        if (!fresh)
                return false;

        tlb->batch_count++;
        fresh->next = NULL;
        fresh->nr = 0;
        fresh->max = MAX_GATHER_BATCH;

        /* Append to the chain and make it the active batch. */
        cur->next = fresh;
        tlb->active = fresh;

        return true;
}

#ifdef CONFIG_SMP
/*
 * Walk the encoded pages of @batch and call folio_remove_rmap_ptes() for
 * every entry flagged ENCODED_PAGE_BIT_DELAY_RMAP.
 */
static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
{
        struct encoded_page **pages = batch->encoded_pages;

        for (int i = 0; i < batch->nr; i++) {
                struct encoded_page *enc = pages[i];

                if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) {
                        struct page *page = encoded_page_ptr(enc);
                        unsigned int nr_pages = 1;

                        /*
                         * The page count, if present, is stored in the
                         * following slot; reading it also steps past it.
                         */
                        if (unlikely(encoded_page_flags(enc) &
                                     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
                                nr_pages = encoded_nr_pages(pages[++i]);

                        folio_remove_rmap_ptes(page_folio(page), page, nr_pages,
                                               vma);
                }
        }
}

/**
 * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
 * @tlb: the current mmu_gather
 * @vma: The memory area from which the pages are being removed.
 *
 * tlb_next_batch() refuses to start further batches while delayed rmaps
 * are pending, so only the original local batch and the current active
 * batch can hold such entries; those are the two walked here.
 */
void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        struct mmu_gather_batch *active;

        if (!tlb->delayed_rmap)
                return;

        tlb_flush_rmap_batch(&tlb->local, vma);

        active = tlb->active;
        if (active != &tlb->local)
                tlb_flush_rmap_batch(active, vma);

        tlb->delayed_rmap = 0;
}
#endif

/*
 * We might end up freeing a lot of pages. Reschedule on a regular
 * basis to avoid soft lockups in configurations without full
 * preemption enabled. The magic number of 512 folios seems to work.
 */
#define MAX_NR_FOLIOS_PER_FREE                512

/*
 * Free all encoded pages held in @batch, in chunks, calling cond_resched()
 * after each chunk.  On return batch->nr is 0.
 */
static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
{
        struct encoded_page **pages = batch->encoded_pages;
        unsigned int nr, nr_pages;

        while (batch->nr) {
                if (!page_poisoning_enabled_static() && !want_init_on_free()) {
                        /* Chunk by number of entries. */
                        nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr);

                        /*
                         * Make sure we cover page + nr_pages, and don't leave
                         * nr_pages behind when capping the number of entries.
                         */
                        if (unlikely(encoded_page_flags(pages[nr - 1]) &
                                     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
                                nr++;
                } else {
                        /*
                         * With page poisoning and init_on_free, the time it
                         * takes to free memory grows proportionally with the
                         * actual memory size. Therefore, limit based on the
                         * actual memory size and not the number of involved
                         * folios.
                         */
                        for (nr = 0, nr_pages = 0;
                             nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE;
                             nr++) {
                                /* A count entry occupies the following slot. */
                                if (unlikely(encoded_page_flags(pages[nr]) &
                                             ENCODED_PAGE_BIT_NR_PAGES_NEXT))
                                        nr_pages += encoded_nr_pages(pages[++nr]);
                                else
                                        nr_pages++;
                        }
                }

                /* Release this chunk and advance past it. */
                free_pages_and_swap_cache(pages, nr);
                pages += nr;
                batch->nr -= nr;

                cond_resched();
        }
}

/*
 * Free the pages of every batch in @tlb's chain, starting at the local
 * batch and stopping at the first empty one, then make the local batch
 * the active batch again.
 */
static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
        struct mmu_gather_batch *batch = &tlb->local;

        while (batch && batch->nr) {
                __tlb_batch_free_encoded_pages(batch);
                batch = batch->next;
        }

        tlb->active = &tlb->local;
}

/*
 * Release every batch chained after @tlb's embedded local batch and
 * terminate the chain at the local batch.
 */
static void tlb_batch_list_free(struct mmu_gather *tlb)
{
        struct mmu_gather_batch *batch = tlb->local.next;

        while (batch) {
                /* Fetch the successor before the batch is freed. */
                struct mmu_gather_batch *next = batch->next;

                free_pages((unsigned long)batch, 0);
                batch = next;
        }

        tlb->local.next = NULL;
}

/*
 * Queue @page (covering @nr_pages pages) in the active batch of @tlb.
 * A single page takes one slot; a multi-page entry takes two, the page
 * followed by its encoded page count.
 *
 * Returns true when the active batch has become full and no further batch
 * could be obtained, false otherwise.
 */
static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb,
                struct page *page, unsigned int nr_pages, bool delay_rmap,
                int page_size)
{
        int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0;
        struct mmu_gather_batch *batch;

        VM_BUG_ON(!tlb->end);

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        VM_WARN_ON(tlb->page_size != page_size);
        VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE);
        VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
#endif

        batch = tlb->active;
        /*
         * Add the page and check if we are full. If so
         * force a flush.
         */
        if (likely(nr_pages == 1)) {
                batch->encoded_pages[batch->nr++] = encode_page(page, flags);
        } else {
                flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT;
                batch->encoded_pages[batch->nr++] = encode_page(page, flags);
                batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages);
        }
        /*
         * Make sure that we can always add another "page" + "nr_pages",
         * requiring two entries instead of only a single one.
         */
        if (batch->nr >= batch->max - 1) {
                if (!tlb_next_batch(tlb))
                        return true;
                batch = tlb->active;
        }
        VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page);

        return false;
}

/*
 * Queue @nr_pages pages starting at @page, using PAGE_SIZE as the page
 * size. Returns whatever __tlb_remove_folio_pages_size() returns.
 */
bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
                unsigned int nr_pages, bool delay_rmap)
{
        const int base_page_size = PAGE_SIZE;

        return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap,
                                             base_page_size);
}

/*
 * Queue exactly one page of size @page_size. Returns whatever
 * __tlb_remove_folio_pages_size() returns.
 */
bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
                bool delay_rmap, int page_size)
{
        const unsigned int single_page = 1;

        return __tlb_remove_folio_pages_size(tlb, page, single_page,
                                             delay_rmap, page_size);
}

#endif /* MMU_GATHER_NO_GATHER */

#ifdef CONFIG_MMU_GATHER_TABLE_FREE

/*
 * Hand every table recorded in @batch to __tlb_remove_table(), then free
 * the page that holds the batch itself.
 */
static void __tlb_remove_table_free(struct mmu_table_batch *batch)
{
        int idx = 0;

        while (idx < batch->nr) {
                __tlb_remove_table(batch->tables[idx]);
                idx++;
        }

        free_page((unsigned long)batch);
}

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE

/*
 * Semi RCU freeing of the page directories.
 *
 * This is needed by some architectures to implement software pagetable walkers.
 *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore need some synchronization with the freeing of the page
 * directories. The chosen means to accomplish that is by disabling IRQs over
 * the walk.
 *
 * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 * since we unlink the page, flush TLBs, free the page. Since the disabling of
 * IRQs delays the completion of the TLB flush we can never observe an already
 * freed page.
 *
 * Architectures that do not have this (PPC) need to delay the freeing by some
 * other means, this is that means.
 *
 * What we do is batch the freed directory pages (tables) and RCU free them.
 * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 * holds off grace periods.
 *
 * However, in order to batch these pages we need to allocate storage, this
 * allocation is deep inside the MM code and can thus easily fail on memory
 * pressure. To guarantee progress we fall back to single table freeing, see
 * the implementation of tlb_remove_table_one().
 *
 */

/*
 * Callback passed to smp_call_function() by tlb_remove_table_sync_one().
 * @arg is unused; the body is intentionally empty.
 */
static void tlb_remove_table_smp_sync(void *arg)
{
        /* Simply deliver the interrupt */
}

/*
 * Run the empty tlb_remove_table_smp_sync() callback through
 * smp_call_function() with its last ("wait") argument set to 1.
 */
void tlb_remove_table_sync_one(void)
{
        /*
         * This isn't an RCU grace period and hence the page-tables cannot be
         * assumed to be actually RCU-freed.
         *
         * It is however sufficient for software page-table walkers that rely on
         * IRQ disabling.
         */
        smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}

/*
 * RCU callback: recover the mmu_table_batch that embeds @head and free
 * its tables together with the batch page.
 */
static void tlb_remove_table_rcu(struct rcu_head *head)
{
        struct mmu_table_batch *batch;

        batch = container_of(head, struct mmu_table_batch, rcu);
        __tlb_remove_table_free(batch);
}

/*
 * CONFIG_MMU_GATHER_RCU_TABLE_FREE variant: defer freeing of @batch by
 * queueing tlb_remove_table_rcu() through call_rcu().
 */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
        call_rcu(&batch->rcu, tlb_remove_table_rcu);
}

#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/* Variant without RCU table freeing: free @batch immediately. */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
        __tlb_remove_table_free(batch);
}

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/*
 * If we want tlb_remove_table() to imply TLB invalidates.
 */
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
        /* Nothing to do unless tlb_needs_table_invalidate() asks for it. */
        if (!tlb_needs_table_invalidate())
                return;

        /*
         * Invalidate page-table caches used by hardware walkers. Then
         * we still need to RCU-sched wait while freeing the pages
         * because software walkers can still be in-flight.
         */
        tlb_flush_mmu_tlbonly(tlb);
}

/*
 * Free a single @table without batching: synchronize via
 * tlb_remove_table_sync_one() first, then release the table.
 * tlb_remove_table() uses this when it cannot allocate a batch page.
 */
static void tlb_remove_table_one(void *table)
{
        tlb_remove_table_sync_one();
        __tlb_remove_table(table);
}

/*
 * If @tlb has a pending table batch, invalidate as required, hand the
 * batch over for freeing and clear the pointer. No-op without a batch.
 */
static void tlb_table_flush(struct mmu_gather *tlb)
{
        if (!tlb->batch)
                return;

        tlb_table_invalidate(tlb);
        tlb_remove_table_free(tlb->batch);
        tlb->batch = NULL;
}

/*
 * Queue @table for freeing in @tlb's table batch.
 *
 * A batch page is allocated on demand with GFP_NOWAIT | __GFP_NOWARN. If
 * that allocation fails, the table is freed on its own through
 * tlb_remove_table_one(). A batch that reaches MAX_TABLE_BATCH entries is
 * flushed right away.
 */
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        struct mmu_table_batch *cur = tlb->batch;

        if (!cur) {
                cur = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                tlb->batch = cur;
                if (!cur) {
                        /* No memory for batching: free this one table directly. */
                        tlb_table_invalidate(tlb);
                        tlb_remove_table_one(table);
                        return;
                }
                cur->nr = 0;
        }

        cur->tables[cur->nr] = table;
        cur->nr++;
        if (cur->nr == MAX_TABLE_BATCH)
                tlb_table_flush(tlb);
}

/* Start @tlb with no table batch allocated. */
static inline void tlb_table_init(struct mmu_gather *tlb)
{
        tlb->batch = NULL;
}

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

/* No-op stand-ins used when CONFIG_MMU_GATHER_TABLE_FREE is not set. */
static inline void tlb_table_flush(struct mmu_gather *tlb) { }
static inline void tlb_table_init(struct mmu_gather *tlb) { }

#endif /* CONFIG_MMU_GATHER_TABLE_FREE */

/*
 * Release what @tlb has gathered: first the pending table batch, then,
 * unless CONFIG_MMU_GATHER_NO_GATHER is set, the batched pages.
 */
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
        tlb_table_flush(tlb);
#ifndef CONFIG_MMU_GATHER_NO_GATHER
        tlb_batch_pages_flush(tlb);
#endif
}

/* Flush the TLB for @tlb first, then free the gathered tables and pages. */
void tlb_flush_mmu(struct mmu_gather *tlb)
{
        tlb_flush_mmu_tlbonly(tlb);
        tlb_flush_mmu_free(tlb);
}

/*
 * Common initializer behind tlb_gather_mmu() and tlb_gather_mmu_fullmm().
 *
 * Records @mm and @fullmm, resets all gather state in @tlb, resets the
 * flush range, and finally bumps the mm's pending-TLB-flush count.
 */
static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
                             bool fullmm)
{
        tlb->mm = mm;
        tlb->fullmm = fullmm;
        tlb->delayed_rmap = 0;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        /* Only the embedded batch exists at first: empty and active. */
        tlb->local.nr   = 0;
        tlb->local.max  = ARRAY_SIZE(tlb->__pages);
        tlb->local.next = NULL;
        tlb->active     = &tlb->local;
        tlb->batch_count = 0;
        tlb->need_flush_all = 0;
#endif
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        tlb->page_size = 0;
#endif
        tlb_table_init(tlb);

        __tlb_reset_range(tlb);
        inc_tlb_flush_pending(mm);
}

/**
 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
        /* fullmm = false */
        __tlb_gather_mmu(tlb, mm, false);
}

/**
 * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * In this case, @mm is without users and we're going to destroy the
 * full address space (exit/execve).
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 */
void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
{
        /* fullmm = true */
        __tlb_gather_mmu(tlb, mm, true);
}

/**
 * tlb_finish_mmu - finish an mmu_gather structure
 * @tlb: the mmu_gather structure to finish
 *
 * Called at the end of the shootdown operation to free up any resources that
 * were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb)
{
        /*
         * If parallel threads are doing PTE changes on the same range
         * under a non-exclusive lock (e.g., mmap_lock read-side) but defer
         * the TLB flush by batching, one thread may end up seeing
         * inconsistent PTEs, resulting in stale TLB entries.  So flush the
         * TLB forcefully if we detect parallel PTE batching threads.
         *
         * However, some syscalls, e.g. munmap(), may free page tables; this
         * requires force-flushing everything in the given range. Otherwise
         * stale TLB entries may remain on architectures, e.g. aarch64, that
         * can specify which TLB level to flush.
         */
        if (mm_tlb_flush_nested(tlb->mm)) {
                /*
                 * The aarch64 yields better performance with fullmm by
                 * avoiding multiple CPUs spamming TLBI messages at the
                 * same time.
                 *
                 * On x86 non-fullmm doesn't yield significant difference
                 * against fullmm.
                 */
                tlb->fullmm = 1;
                __tlb_reset_range(tlb);
                tlb->freed_tables = 1;
        }

        /* Final TLB flush plus freeing of everything still gathered. */
        tlb_flush_mmu(tlb);

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        /* Release the extra batch pages chained behind the local batch. */
        tlb_batch_list_free(tlb);
#endif
        /* Pairs with inc_tlb_flush_pending() in __tlb_gather_mmu(). */
        dec_tlb_flush_pending(tlb->mm);
}
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 























































































































































































































































































































































































































    3 












    1 

    1 




    1 



























    1 
    1 








    1 









    1 





    1 










    1 




















    1 




    1 






    1 




    1 
    1 



    1 
    1 





    1 
















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
// SPDX-License-Identifier: GPL-2.0-only
/*
 *        fs/libfs.c
 *        Library for filesystem writers.
 */

#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/mount.h>
#include <linux/vfs.h>
#include <linux/quotaops.h>
#include <linux/mutex.h>
#include <linux/namei.h>
#include <linux/exportfs.h>
#include <linux/iversion.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* sync_mapping_buffers */
#include <linux/fs_context.h>
#include <linux/pseudo_fs.h>
#include <linux/fsnotify.h>
#include <linux/unicode.h>
#include <linux/fscrypt.h>
#include <linux/pidfs.h>

#include <linux/uaccess.h>

#include "internal.h"

/*
 * Fill @stat from the inode behind @path using generic_fillattr(), then
 * report the block count derived from the number of pages in the inode's
 * mapping, expressed in 512-byte units. Always returns 0.
 *
 * @idmap and @query_flags are not consulted; nop_mnt_idmap is always used.
 */
int simple_getattr(struct mnt_idmap *idmap, const struct path *path,
                   struct kstat *stat, u32 request_mask,
                   unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        unsigned long cached_pages;

        generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);

        cached_pages = inode->i_mapping->nrpages;
        stat->blocks = cached_pages << (PAGE_SHIFT - 9);
        return 0;
}
EXPORT_SYMBOL(simple_getattr);

int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        u64 id = huge_encode_dev(dentry->d_sb->s_dev);

        buf->f_fsid = u64_to_fsid(id);
        buf->f_type = dentry->d_sb->s_magic;
        buf->f_bsize = PAGE_SIZE;
        buf->f_namelen = NAME_MAX;
        return 0;
}
EXPORT_SYMBOL(simple_statfs);

/*
 * Retaining negative dentries for an in-memory filesystem just wastes
 * memory and lookup time: arrange for them to be deleted immediately.
 */
int always_delete_dentry(const struct dentry *dentry)
{
        /* Unconditional: @dentry is never inspected. */
        return 1;
}
EXPORT_SYMBOL(always_delete_dentry);

/* Dentry operations whose only hook is ->d_delete = always_delete_dentry. */
const struct dentry_operations simple_dentry_operations = {
        .d_delete = always_delete_dentry,
};
EXPORT_SYMBOL(simple_dentry_operations);

/*
 * Lookup the data. This is trivial - if the dentry didn't already
 * exist, we know it is negative.  Set d_op to delete negative dentries.
 */
struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        /* Reject over-long names before touching the dcache. */
        if (dentry->d_name.len > NAME_MAX)
                return ERR_PTR(-ENAMETOOLONG);
        /* Only install our d_op when the superblock has no default one. */
        if (!dentry->d_sb->s_d_op)
                d_set_d_op(dentry, &simple_dentry_operations);
        /* Add the dentry with no inode, i.e. as a negative entry. */
        d_add(dentry, NULL);
        return NULL;
}
EXPORT_SYMBOL(simple_lookup);

/*
 * Allocate a cursor dentry for the opened directory and keep it in
 * file->private_data. Returns 0 on success, -ENOMEM if the cursor could
 * not be allocated.
 */
int dcache_dir_open(struct inode *inode, struct file *file)
{
        struct dentry *cursor = d_alloc_cursor(file->f_path.dentry);

        file->private_data = cursor;
        if (!cursor)
                return -ENOMEM;

        return 0;
}
EXPORT_SYMBOL(dcache_dir_open);

/* Drop the cursor dentry stored by dcache_dir_open(). Always returns 0. */
int dcache_dir_close(struct inode *inode, struct file *file)
{
        dput(file->private_data);
        return 0;
}
EXPORT_SYMBOL(dcache_dir_close);

/* parent is locked at least shared */
/*
 * Returns an element of siblings' list.
 * We are looking for <count>th positive after <p>; if
 * found, dentry is grabbed and returned to caller.
 * If no such element exists, NULL is returned.
 */
static struct dentry *scan_positives(struct dentry *cursor,
                                        struct hlist_node **p,
                                        loff_t count,
                                        struct dentry *last)
{
        /* "dentry" is the parent directory; its d_lock guards the sibling list. */
        struct dentry *dentry = cursor->d_parent, *found = NULL;

        spin_lock(&dentry->d_lock);
        while (*p) {
                struct dentry *d = hlist_entry(*p, struct dentry, d_sib);
                p = &d->d_sib.next;
                // we must at least skip cursors, to avoid livelocks
                if (d->d_flags & DCACHE_DENTRY_CURSOR)
                        continue;
                if (simple_positive(d) && !--count) {
                        /* Re-check under d's own lock before taking a reference. */
                        spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
                        if (simple_positive(d))
                                found = dget_dlock(d);
                        spin_unlock(&d->d_lock);
                        if (likely(found))
                                break;
                        /* The re-check failed: settle for the next positive entry. */
                        count = 1;
                }
                if (need_resched()) {
                        /*
                         * Park the cursor right behind d so the position
                         * survives dropping the parent lock, then resume
                         * from the cursor.
                         */
                        if (!hlist_unhashed(&cursor->d_sib))
                                __hlist_del(&cursor->d_sib);
                        hlist_add_behind(&cursor->d_sib, &d->d_sib);
                        p = &cursor->d_sib.next;
                        spin_unlock(&dentry->d_lock);
                        cond_resched();
                        spin_lock(&dentry->d_lock);
                }
        }
        spin_unlock(&dentry->d_lock);
        /* Drop the caller-supplied reference on the previous result. */
        dput(last);
        return found;
}

/*
 * llseek for dcache-backed directories.
 *
 * Accepts whence 0 (absolute) and 1 (relative to file->f_pos); any other
 * whence, or a negative resulting offset, yields -EINVAL. When the
 * position changes, the per-file cursor dentry is moved to match, under
 * the shared inode lock. Returns the new offset.
 */
loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
{
        struct dentry *dentry = file->f_path.dentry;
        switch (whence) {
                case 1:
                        offset += file->f_pos;
                        fallthrough;
                case 0:
                        if (offset >= 0)
                                break;
                        fallthrough;
                default:
                        return -EINVAL;
        }
        if (offset != file->f_pos) {
                struct dentry *cursor = file->private_data;
                struct dentry *to = NULL;

                inode_lock_shared(dentry->d_inode);

                /* Offsets 0 and 1 are "." and ".."; children start at 2. */
                if (offset > 2)
                        to = scan_positives(cursor, &dentry->d_children.first,
                                            offset - 2, NULL);
                spin_lock(&dentry->d_lock);
                /* Unlink the cursor; re-insert it after the target, if any. */
                hlist_del_init(&cursor->d_sib);
                if (to)
                        hlist_add_behind(&cursor->d_sib, &to->d_sib);
                spin_unlock(&dentry->d_lock);
                /* Release the reference taken by scan_positives(). */
                dput(to);

                file->f_pos = offset;

                inode_unlock_shared(dentry->d_inode);
        }
        return offset;
}
EXPORT_SYMBOL(dcache_dir_lseek);

/*
 * Directory is locked and all positive dentries in it are safe, since
 * for ramfs-type trees they can't go away without unlink() or rmdir(),
 * both impossible due to the lock on directory.
 */

int dcache_readdir(struct file *file, struct dir_context *ctx)
{
        struct dentry *dentry = file->f_path.dentry;
        struct dentry *cursor = file->private_data;
        struct dentry *next = NULL;
        struct hlist_node **p;

        /* Emit "." and ".." first; stop if that did not succeed. */
        if (!dir_emit_dots(file, ctx))
                return 0;

        /* Position 2 starts at the first child; otherwise resume after the cursor. */
        if (ctx->pos == 2)
                p = &dentry->d_children.first;
        else
                p = &cursor->d_sib.next;

        /* Each scan_positives() call also drops the reference on the previous entry. */
        while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
                if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
                              d_inode(next)->i_ino,
                              fs_umode_to_dtype(d_inode(next)->i_mode)))
                        break;
                ctx->pos++;
                p = &next->d_sib.next;
        }
        spin_lock(&dentry->d_lock);
        /*
         * Unlink the cursor. If the loop stopped on an entry that was not
         * emitted, park the cursor just before it; after a full scan
         * (next == NULL) the cursor stays off the list.
         */
        hlist_del_init(&cursor->d_sib);
        if (next)
                hlist_add_before(&cursor->d_sib, &next->d_sib);
        spin_unlock(&dentry->d_lock);
        dput(next);

        return 0;
}
EXPORT_SYMBOL(dcache_readdir);

/* ->read handler for directories: always fails with -EISDIR. */
ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
{
        return -EISDIR;
}
EXPORT_SYMBOL(generic_read_dir);

/*
 * File operations for dcache-backed directories: cursor-based open,
 * release, llseek and readdir, a read that returns -EISDIR, and a no-op
 * fsync.
 */
const struct file_operations simple_dir_operations = {
        .open                = dcache_dir_open,
        .release        = dcache_dir_close,
        .llseek                = dcache_dir_lseek,
        .read                = generic_read_dir,
        .iterate_shared        = dcache_readdir,
        .fsync                = noop_fsync,
};
EXPORT_SYMBOL(simple_dir_operations);

/* Directory inode operations that only provide ->lookup (simple_lookup). */
const struct inode_operations simple_dir_inode_operations = {
        .lookup                = simple_lookup,
};
EXPORT_SYMBOL(simple_dir_inode_operations);

/*
 * Offset 0 is '.' and offset 1 is '..', so directory entry offsets always
 * start at 2 or more.
 */
enum {
        DIR_OFFSET_MIN        = 2,
};

/* Store @offset in @dentry->d_fsdata, cast to a pointer. */
static void offset_set(struct dentry *dentry, long offset)
{
        dentry->d_fsdata = (void *)offset;
}

/* Read back the offset stored by offset_set(); 0 means "no offset assigned". */
static long dentry2offset(struct dentry *dentry)
{
        return (long)dentry->d_fsdata;
}

static struct lock_class_key simple_offset_lock_class;

/**
 * simple_offset_init - initialize an offset_ctx
 * @octx: directory offset map to be initialized
 *
 * Initializes the maple tree with MT_FLAGS_ALLOC_RANGE, assigns its lock
 * a dedicated lockdep class, and sets the next offset to DIR_OFFSET_MIN.
 */
void simple_offset_init(struct offset_ctx *octx)
{
        mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE);
        lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class);
        octx->next_offset = DIR_OFFSET_MIN;
}

/**
 * simple_offset_add - Add an entry to a directory's offset map
 * @octx: directory offset ctx to be updated
 * @dentry: new dentry being added
 *
 * Allocates an offset in [DIR_OFFSET_MIN, LONG_MAX] cyclically and records
 * it in @dentry.
 *
 * Returns zero on success. @octx and the dentry's offset are updated.
 * Returns -EBUSY if @dentry already has an offset; otherwise a negative
 * errno value from the allocation is returned.
 */
int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
{
        unsigned long slot;
        int err;

        if (dentry2offset(dentry))
                return -EBUSY;

        err = mtree_alloc_cyclic(&octx->mt, &slot, dentry, DIR_OFFSET_MIN,
                                 LONG_MAX, &octx->next_offset, GFP_KERNEL);
        if (err >= 0) {
                /* Any non-negative result counts as success. */
                offset_set(dentry, slot);
                err = 0;
        }
        return err;
}

/*
 * Store @dentry at @offset in @octx's tree and, on success, record
 * @offset in the dentry. Returns 0 or the error from mtree_store().
 */
static int simple_offset_replace(struct offset_ctx *octx, struct dentry *dentry,
                                 long offset)
{
        int err = mtree_store(&octx->mt, offset, dentry, GFP_KERNEL);

        if (!err)
                offset_set(dentry, offset);
        return err;
}

/**
 * simple_offset_remove - Remove an entry from a directory's offset map
 * @octx: directory offset ctx to be updated
 * @dentry: dentry being removed
 *
 * Does nothing if @dentry has no offset assigned (offset 0).
 */
void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
{
        long offset = dentry2offset(dentry);

        if (offset != 0) {
                mtree_erase(&octx->mt, offset);
                offset_set(dentry, 0);
        }
}

/**
 * simple_offset_empty - Check if a dentry can be unlinked
 * @dentry: dentry to be tested
 *
 * A dentry without an inode, or whose inode is not a directory, counts as
 * empty.
 *
 * Returns 0 if @dentry is a non-empty directory; otherwise returns 1.
 */
int simple_offset_empty(struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        struct offset_ctx *octx;
        struct dentry *child;
        unsigned long index = DIR_OFFSET_MIN;
        int empty = 1;

        if (!inode || !S_ISDIR(inode->i_mode))
                return empty;

        octx = inode->i_op->get_offset_ctx(inode);
        mt_for_each(&octx->mt, child, index, LONG_MAX) {
                bool positive;

                /* Sample the child's state under its own lock. */
                spin_lock(&child->d_lock);
                positive = simple_positive(child);
                spin_unlock(&child->d_lock);

                if (positive) {
                        empty = 0;
                        break;
                }
        }

        return empty;
}

/**
 * simple_offset_rename - handle directory offsets for rename
 * @old_dir: parent directory of source entry
 * @old_dentry: dentry of source entry
 * @new_dir: parent directory of destination entry
 * @new_dentry: dentry of destination
 *
 * Caller provides appropriate serialization.
 *
 * User space expects the directory offset value of the replaced
 * (new) directory entry to be unchanged after a rename.
 *
 * Returns zero on success, a negative errno value on failure.
 */
int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
                         struct inode *new_dir, struct dentry *new_dentry)
{
        struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
        struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
        long new_offset = dentry2offset(new_dentry);

        simple_offset_remove(old_ctx, old_dentry);

        /* No entry is being replaced: allocate a fresh offset. */
        if (!new_offset)
                return simple_offset_add(new_ctx, old_dentry);

        /* Take over the replaced entry's offset so it stays unchanged. */
        offset_set(new_dentry, 0);
        return simple_offset_replace(new_ctx, old_dentry, new_offset);
}

/**
 * simple_offset_rename_exchange - exchange rename with directory offsets
 * @old_dir: parent of dentry being moved
 * @old_dentry: dentry being moved
 * @new_dir: destination parent
 * @new_dentry: destination dentry
 *
 * This API preserves the directory offset values. Caller provides
 * appropriate serialization.
 *
 * Both dentries are first removed from their offset maps, then each one
 * is stored at the offset the other one had, and finally
 * simple_rename_exchange() is called. If any of these steps fails, the
 * entries stored so far are removed again and both dentries are put back
 * at their original offsets.
 *
 * Returns zero on success. Otherwise a negative errno is returned and the
 * rename is rolled back.
 */
int simple_offset_rename_exchange(struct inode *old_dir,
                                  struct dentry *old_dentry,
                                  struct inode *new_dir,
                                  struct dentry *new_dentry)
{
        struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
        struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
        /* Remember the original offsets; they are needed for swap and rollback. */
        long old_index = dentry2offset(old_dentry);
        long new_index = dentry2offset(new_dentry);
        int ret;

        simple_offset_remove(old_ctx, old_dentry);
        simple_offset_remove(new_ctx, new_dentry);

        /* Swap: old_dentry takes new_dentry's slot in the new directory ... */
        ret = simple_offset_replace(new_ctx, old_dentry, new_index);
        if (ret)
                goto out_restore;

        /* ... and new_dentry takes old_dentry's slot in the old directory. */
        ret = simple_offset_replace(old_ctx, new_dentry, old_index);
        if (ret) {
                /* undo the first store before restoring the originals */
                simple_offset_remove(new_ctx, old_dentry);
                goto out_restore;
        }

        ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
        if (ret) {
                /* undo both stores before restoring the originals */
                simple_offset_remove(new_ctx, old_dentry);
                simple_offset_remove(old_ctx, new_dentry);
                goto out_restore;
        }
        return 0;

out_restore:
        /* Best effort: failures while restoring are deliberately ignored. */
        (void)simple_offset_replace(old_ctx, old_dentry, old_index);
        (void)simple_offset_replace(new_ctx, new_dentry, new_index);
        return ret;
}

/**
 * simple_offset_destroy - Release offset map
 * @octx: directory offset ctx that is about to be destroyed
 *
 * During fs teardown (eg. umount), a directory's offset map might still
 * contain entries. mtree_destroy() cleans out anything that remains.
 */
void simple_offset_destroy(struct offset_ctx *octx)
{
        mtree_destroy(&octx->mt);
}

/**
 * offset_dir_llseek - Advance the read position of a directory descriptor
 * @file: an open directory whose position is to be updated
 * @offset: a byte offset
 * @whence: enumerator describing the starting position for this update
 *
 * Only SEEK_SET and SEEK_CUR are accepted; SEEK_END, SEEK_DATA, and
 * SEEK_HOLE are not supported for directories. A resulting negative
 * position is rejected as well.
 *
 * Returns the updated read position if successful; otherwise a
 * negative errno is returned and the read position remains unchanged.
 */
static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
{
        if (whence != SEEK_SET && whence != SEEK_CUR)
                return -EINVAL;

        /* SEEK_CUR is relative to the current position */
        if (whence == SEEK_CUR)
                offset += file->f_pos;

        if (offset < 0)
                return -EINVAL;

        /* In this case, ->private_data is protected by f_pos_lock */
        file->private_data = NULL;
        return vfs_setpos(file, offset, LONG_MAX);
}

/*
 * Look up the first entry in the offset map at or after @offset. If that
 * entry is a positive dentry, return it with a reference taken; in all
 * other cases (no entry, or the entry is not positive) return NULL.
 */
static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset)
{
        MA_STATE(mas, &octx->mt, offset, offset);
        struct dentry *candidate, *pinned = NULL;

        rcu_read_lock();
        candidate = mas_find(&mas, LONG_MAX);
        if (candidate) {
                /* check positivity and grab the reference under d_lock */
                spin_lock(&candidate->d_lock);
                if (simple_positive(candidate))
                        pinned = dget_dlock(candidate);
                spin_unlock(&candidate->d_lock);
        }
        rcu_read_unlock();

        return pinned;
}

/*
 * Hand one directory entry to the readdir actor: name, name length, the
 * dentry's directory offset, inode number and d_type derived from i_mode.
 * Returns whatever the actor returns.
 */
static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
{
        struct inode *target = d_inode(dentry);

        return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len,
                          dentry2offset(dentry), target->i_ino,
                          fs_umode_to_dtype(target->i_mode));
}

/*
 * Emit directory entries starting at ctx->pos until either the actor
 * refuses an entry or offset_find_next() finds nothing more.
 *
 * Returns NULL when the actor stopped the iteration, or ERR_PTR(-ENOENT)
 * when no further entry was found. After each emitted entry, ctx->pos is
 * advanced to one past that entry's offset.
 */
static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
{
        struct offset_ctx *map = inode->i_op->get_offset_ctx(inode);
        struct dentry *entry;

        while ((entry = offset_find_next(map, ctx->pos)) != NULL) {
                if (!offset_dir_emit(ctx, entry)) {
                        dput(entry);
                        return NULL;
                }

                ctx->pos = dentry2offset(entry) + 1;
                dput(entry);
        }

        return ERR_PTR(-ENOENT);
}

/**
 * offset_readdir - Emit entries starting at offset @ctx->pos
 * @file: an open directory to iterate over
 * @ctx: directory iteration context
 *
 * Caller must hold @file's i_rwsem to prevent insertion or removal of
 * entries during this call.
 *
 * On entry, @ctx->pos contains an offset that represents the first entry
 * to be read from the directory.
 *
 * The operation continues until there are no more entries to read, or
 * until the ctx->actor indicates there is no more space in the caller's
 * output buffer.
 *
 * On return, @ctx->pos contains an offset that will read the next entry
 * in this directory when offset_readdir() is called again with @ctx.
 *
 * Return values:
 *   %0 - Complete
 */
static int offset_readdir(struct file *file, struct dir_context *ctx)
{
        struct dentry *dir = file->f_path.dentry;

        lockdep_assert_held(&d_inode(dir)->i_rwsem);

        if (!dir_emit_dots(file, ctx))
                return 0;

        /* In this case, ->private_data is protected by f_pos_lock */
        if (ctx->pos == DIR_OFFSET_MIN)
                file->private_data = NULL;
        else if (file->private_data == ERR_PTR(-ENOENT))
                return 0;
        file->private_data = offset_iterate_dir(d_inode(dir), ctx);
        return 0;
}

/*
 * File operations for directories whose entries are tracked in an offset
 * map: seeking and iteration are served by offset_dir_llseek() and
 * offset_readdir().
 */
const struct file_operations simple_offset_dir_operations = {
        .llseek                = offset_dir_llseek,
        .iterate_shared        = offset_readdir,
        .read                = generic_read_dir,
        .fsync                = noop_fsync,
};

/*
 * Find the next positive child of @parent, starting after @prev, or from
 * the first child when @prev is NULL. The walk is done under
 * parent->d_lock.
 *
 * The child that is returned has a reference taken with dget_dlock();
 * NULL is returned when no positive child remains. The reference on @prev
 * is always dropped with dput() before returning.
 */
static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
        struct dentry *child = NULL, *d;

        spin_lock(&parent->d_lock);
        d = prev ? d_next_sibling(prev) : d_first_child(parent);
        hlist_for_each_entry_from(d, d_sib) {
                /* cheap unlocked check first, then recheck under d->d_lock */
                if (simple_positive(d)) {
                        spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
                        if (simple_positive(d))
                                child = dget_dlock(d);
                        spin_unlock(&d->d_lock);
                        if (likely(child))
                                break;
                }
        }
        spin_unlock(&parent->d_lock);
        dput(prev);
        return child;
}

/*
 * simple_recursive_removal - tear down @dentry and the positive dentries
 * below it.
 * @dentry: top of the subtree to tear down
 * @callback: optional hook, called for each dentry that is still positive
 *            when it is killed; may be NULL
 *
 * The subtree is walked depth first without recursion: descend into the
 * next positive child while one exists; once a node has no further
 * positive children, kill it and ascend to its parent. The walk ends after
 * @dentry itself has been handled.
 */
void simple_recursive_removal(struct dentry *dentry,
                              void (*callback)(struct dentry *))
{
        /* pin the starting point for the duration of the walk */
        struct dentry *this = dget(dentry);
        while (true) {
                struct dentry *victim = NULL, *child;
                struct inode *inode = this->d_inode;

                inode_lock(inode);
                if (d_is_dir(this))
                        inode->i_flags |= S_DEAD;
                /*
                 * find_next_child() continues after @victim (the node killed
                 * in the previous pass) and drops the reference held on it.
                 */
                while ((child = find_next_child(this, victim)) == NULL) {
                        // kill and ascend
                        // update metadata while it's still locked
                        inode_set_ctime_current(inode);
                        clear_nlink(inode);
                        inode_unlock(inode);
                        /* step up: the node just finished becomes the victim */
                        victim = this;
                        this = this->d_parent;
                        inode = this->d_inode;
                        inode_lock(inode);
                        if (simple_positive(victim)) {
                                d_invalidate(victim);        // avoid lost mounts
                                if (d_is_dir(victim))
                                        fsnotify_rmdir(inode, victim);
                                else
                                        fsnotify_unlink(inode, victim);
                                if (callback)
                                        callback(victim);
                                dput(victim);                // unpin it
                        }
                        if (victim == dentry) {
                                /* the top of the subtree is done: fix up its parent and stop */
                                inode_set_mtime_to_ts(inode,
                                                      inode_set_ctime_current(inode));
                                if (d_is_dir(dentry))
                                        drop_nlink(inode);
                                inode_unlock(inode);
                                dput(dentry);
                                return;
                        }
                }
                /* a positive child exists: descend into it */
                inode_unlock(inode);
                this = child;
        }
}
EXPORT_SYMBOL(simple_recursive_removal);

/*
 * Default super_operations used by the helpers in this file when the
 * filesystem supplies none of its own: only ->statfs is provided.
 */
static const struct super_operations simple_super_operations = {
        .statfs                = simple_statfs,
};

/*
 * Fill in the superblock of a pseudo filesystem from the
 * pseudo_fs_context stored in fc->fs_private and create its root
 * directory.
 *
 * Returns 0 on success, or -ENOMEM if the root inode or root dentry
 * cannot be allocated.
 */
static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
{
        struct pseudo_fs_context *pfc = fc->fs_private;
        struct inode *root_inode;

        s->s_maxbytes = MAX_LFS_FILESIZE;
        s->s_blocksize = PAGE_SIZE;
        s->s_blocksize_bits = PAGE_SHIFT;
        s->s_magic = pfc->magic;
        /* fall back to the default operations when none were supplied */
        if (pfc->ops)
                s->s_op = pfc->ops;
        else
                s->s_op = &simple_super_operations;
        s->s_xattr = pfc->xattr;
        s->s_time_gran = 1;

        root_inode = new_inode(s);
        if (!root_inode)
                return -ENOMEM;

        /*
         * since this is the first inode, make it number 1. New inodes created
         * after this must take care not to collide with it (by passing
         * max_reserved of 1 to iunique).
         */
        root_inode->i_ino = 1;
        root_inode->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
        simple_inode_init_ts(root_inode);

        s->s_root = d_make_root(root_inode);
        if (!s->s_root)
                return -ENOMEM;

        s->s_d_op = pfc->dops;
        return 0;
}

/*
 * ->get_tree for pseudo filesystems: delegate to get_tree_nodev() with
 * pseudo_fs_fill_super() as the superblock fill callback.
 */
static int pseudo_fs_get_tree(struct fs_context *fc)
{
        return get_tree_nodev(fc, pseudo_fs_fill_super);
}

/*
 * ->free for pseudo filesystems: release the context that init_pseudo()
 * stored in fc->fs_private.
 */
static void pseudo_fs_free(struct fs_context *fc)
{
        kfree(fc->fs_private);
}

/* fs_context operations installed by init_pseudo(). */
static const struct fs_context_operations pseudo_fs_context_ops = {
        .free                = pseudo_fs_free,
        .get_tree        = pseudo_fs_get_tree,
};

/*
 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
 * will never be mountable)
 *
 * Allocates a zeroed pseudo_fs_context, records @magic in it and attaches
 * it to @fc together with the pseudo-fs context operations. The context
 * is marked SB_NOUSER and global.
 *
 * Returns the new context, or NULL if the allocation failed (in which
 * case @fc is left unmodified).
 */
struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
                                        unsigned long magic)
{
        struct pseudo_fs_context *ctx;

        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return NULL;

        ctx->magic = magic;
        fc->fs_private = ctx;
        fc->ops = &pseudo_fs_context_ops;
        fc->sb_flags |= SB_NOUSER;
        fc->global = true;
        return ctx;
}
EXPORT_SYMBOL(init_pseudo);

/*
 * Generic ->open: when the inode carries private data, make it available
 * through file->private_data; otherwise leave file->private_data alone.
 * Always returns 0.
 */
int simple_open(struct inode *inode, struct file *file)
{
        void *payload = inode->i_private;

        if (payload)
                file->private_data = payload;
        return 0;
}
EXPORT_SYMBOL(simple_open);

/*
 * Generic ->link: make @dentry in @dir a new name for the inode behind
 * @old_dentry.
 *
 * The inode's ctime is set to the current time and the same timestamp is
 * used for @dir's ctime and mtime. The link count and the inode reference
 * count are both raised, an extra reference is taken on @dentry, and
 * @dentry is instantiated with the inode. Always returns 0.
 */
int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
        struct inode *target = d_inode(old_dentry);

        inode_set_mtime_to_ts(dir,
                              inode_set_ctime_to_ts(dir, inode_set_ctime_current(target)));
        inc_nlink(target);
        ihold(target);
        dget(dentry);
        d_instantiate(dentry, target);
        return 0;
}
EXPORT_SYMBOL(simple_link);

/*
 * Check whether @dentry has any positive children.
 *
 * The child list is walked under dentry->d_lock, and each child is
 * examined under its own d_lock. Returns 1 when no positive child exists,
 * 0 as soon as one is found.
 */
int simple_empty(struct dentry *dentry)
{
        struct dentry *kid;
        int empty = 1;

        spin_lock(&dentry->d_lock);
        hlist_for_each_entry(kid, &dentry->d_children, d_sib) {
                spin_lock_nested(&kid->d_lock, DENTRY_D_LOCK_NESTED);
                if (simple_positive(kid))
                        empty = 0;
                spin_unlock(&kid->d_lock);
                if (!empty)
                        break;
        }
        spin_unlock(&dentry->d_lock);

        return empty;
}
EXPORT_SYMBOL(simple_empty);

/*
 * Generic ->unlink: the inode's ctime is set to the current time and the
 * same timestamp is applied to @dir's ctime and mtime; then the inode's
 * link count is dropped and a reference on @dentry is released.
 * Always returns 0.
 */
int simple_unlink(struct inode *dir, struct dentry *dentry)
{
        struct inode *victim = d_inode(dentry);

        inode_set_mtime_to_ts(dir,
                              inode_set_ctime_to_ts(dir, inode_set_ctime_current(victim)));
        drop_nlink(victim);
        dput(dentry);
        return 0;
}
EXPORT_SYMBOL(simple_unlink);

/*
 * Generic ->rmdir: refuse with -ENOTEMPTY when @dentry still has positive
 * children. Otherwise the directory's link count is dropped twice (once
 * here, once in simple_unlink()) and the parent's link count once.
 */
int simple_rmdir(struct inode *dir, struct dentry *dentry)
{
        if (simple_empty(dentry)) {
                drop_nlink(d_inode(dentry));
                simple_unlink(dir, dentry);
                drop_nlink(dir);
                return 0;
        }

        return -ENOTEMPTY;
}
EXPORT_SYMBOL(simple_rmdir);

/**
 * simple_rename_timestamp - update the various inode timestamps for rename
 * @old_dir: old parent directory
 * @old_dentry: dentry that is being renamed
 * @new_dir: new parent directory
 * @new_dentry: target for rename
 *
 * POSIX mandates that the old and new parent directories have their ctime and
 * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have
 * their ctime updated.
 *
 * When both parents are the same inode, it is only updated once.
 */
void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry)
{
        struct inode *replaced = d_inode(new_dentry);
        struct inode *moved = d_inode(old_dentry);

        inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
        if (old_dir != new_dir)
                inode_set_mtime_to_ts(new_dir,
                                      inode_set_ctime_current(new_dir));

        inode_set_ctime_current(moved);
        /* the target may be a negative dentry */
        if (replaced)
                inode_set_ctime_current(replaced);
}
EXPORT_SYMBOL_GPL(simple_rename_timestamp);

/*
 * Bookkeeping for RENAME_EXCHANGE on simple filesystems.
 *
 * Parent link counts only change when the two entries live in different
 * directories and exactly one of them is a directory: the parent that
 * gives up the directory drops a link and the parent that receives it
 * gains one. Timestamps are updated in every case. Always returns 0.
 */
int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
                           struct inode *new_dir, struct dentry *new_dentry)
{
        bool old_is_dir = d_is_dir(old_dentry);
        bool new_is_dir = d_is_dir(new_dentry);

        if (old_dir != new_dir && old_is_dir != new_is_dir) {
                struct inode *loser = old_is_dir ? old_dir : new_dir;
                struct inode *gainer = old_is_dir ? new_dir : old_dir;

                drop_nlink(loser);
                inc_nlink(gainer);
        }

        simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
        return 0;
}
EXPORT_SYMBOL_GPL(simple_rename_exchange);

/*
 * Generic ->rename for simple filesystems.
 *
 * Only RENAME_NOREPLACE and RENAME_EXCHANGE are accepted in @flags; any
 * other flag yields -EINVAL. RENAME_EXCHANGE is handed to
 * simple_rename_exchange(). Otherwise the target must not have positive
 * children (-ENOTEMPTY), an existing target is unlinked, and the link
 * counts of the parents are adjusted when a directory is being moved.
 *
 * Returns 0 on success or a negative errno.
 */
int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir,
                  struct dentry *old_dentry, struct inode *new_dir,
                  struct dentry *new_dentry, unsigned int flags)
{
        int moving_dir = d_is_dir(old_dentry);
        int target_exists;

        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
                return -EINVAL;

        if (flags & RENAME_EXCHANGE)
                return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);

        if (!simple_empty(new_dentry))
                return -ENOTEMPTY;

        target_exists = d_really_is_positive(new_dentry);
        if (target_exists)
                simple_unlink(new_dir, new_dentry);

        if (moving_dir) {
                if (target_exists) {
                        /* a directory replaces an existing directory */
                        drop_nlink(d_inode(new_dentry));
                        drop_nlink(old_dir);
                } else {
                        /* the directory moves from one parent to the other */
                        drop_nlink(old_dir);
                        inc_nlink(new_dir);
                }
        }

        simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
        return 0;
}
EXPORT_SYMBOL(simple_rename);

/**
 * simple_setattr - setattr for simple filesystem
 * @idmap: idmap of the target mount
 * @dentry: dentry
 * @iattr: iattr structure
 *
 * Returns 0 on success, -error on failure.
 *
 * simple_setattr is a simple ->setattr implementation without a proper
 * implementation of size changes.
 *
 * It can either be used for in-memory filesystems or special files
 * on simple regular filesystems.  Anything that needs to change on-disk
 * or wire state on size changes needs its own setattr method.
 */
int simple_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                   struct iattr *iattr)
{
        struct inode *target = d_inode(dentry);
        int err = setattr_prepare(idmap, dentry, iattr);

        if (err)
                return err;

        /* size changes are applied directly via truncate_setsize() */
        if (iattr->ia_valid & ATTR_SIZE)
                truncate_setsize(target, iattr->ia_size);

        setattr_copy(idmap, target, iattr);
        mark_inode_dirty(target);
        return 0;
}
EXPORT_SYMBOL(simple_setattr);

/*
 * ->read_folio for filesystems with no backing store: the whole folio is
 * zero-filled, marked uptodate and unlocked. Always returns 0.
 */
static int simple_read_folio(struct file *file, struct folio *folio)
{
        size_t bytes = folio_size(folio);

        folio_zero_range(folio, 0, bytes);
        flush_dcache_folio(folio);
        folio_mark_uptodate(folio);
        folio_unlock(folio);
        return 0;
}

/*
 * ->write_begin helper: look up (or create) the folio covering @pos and
 * hand its page back through @pagep.
 *
 * If the folio is not uptodate and the write does not cover it entirely,
 * the parts of the folio outside [pos, pos + len) are zeroed.
 *
 * Returns 0 on success, or the error from __filemap_get_folio().
 */
int simple_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len,
                        struct page **pagep, void **fsdata)
{
        struct folio *folio;
        size_t start;

        folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN,
                        mapping_gfp_mask(mapping));
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        *pagep = &folio->page;

        /* nothing to zero when the data is valid or fully overwritten */
        if (folio_test_uptodate(folio) || len == folio_size(folio))
                return 0;

        start = offset_in_folio(folio, pos);
        folio_zero_segments(folio, 0, start,
                        start + len, folio_size(folio));
        return 0;
}
EXPORT_SYMBOL(simple_write_begin);

/**
 * simple_write_end - .write_end helper for non-block-device FSes
 * @file: See .write_end of address_space_operations
 * @mapping:                 "
 * @pos:                 "
 * @len:                 "
 * @copied:                 "
 * @page:                 "
 * @fsdata:                 "
 *
 * simple_write_end does the minimum needed for updating a page after writing is
 * done. It has the same API signature as the .write_end of
 * address_space_operations vector. So it can just be set onto .write_end for
 * FSes that don't need any other processing. i_mutex is assumed to be held.
 * Block based filesystems should use generic_write_end().
 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
 * is not called, so a filesystem that actually does store data in .write_inode
 * should extend on what's done here with a call to mark_inode_dirty() in the
 * case that i_size has changed.
 *
 * Use *ONLY* with simple_read_folio()
 *
 * Returns @copied.
 */
static int simple_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        struct folio *folio = page_folio(page);
        struct inode *host = folio->mapping->host;
        loff_t end_pos = pos + copied;

        if (!folio_test_uptodate(folio)) {
                /* zero the stale part of the folio if we did a short copy */
                if (copied < len) {
                        size_t start = offset_in_folio(folio, pos);

                        folio_zero_range(folio, start + copied, len - copied);
                }
                folio_mark_uptodate(folio);
        }

        /*
         * No need to use i_size_read() here, the i_size
         * cannot change under us because we hold the i_mutex.
         */
        if (end_pos > host->i_size)
                i_size_write(host, end_pos);

        folio_mark_dirty(folio);
        folio_unlock(folio);
        folio_put(folio);

        return copied;
}

/*
 * Provides ramfs-style behavior: data in the pagecache, but no writeback.
 *
 * Reads zero-fill via simple_read_folio(), writes go through
 * simple_write_begin()/simple_write_end(), and dirtying a folio is handled
 * by noop_dirty_folio().
 */
const struct address_space_operations ram_aops = {
        .read_folio        = simple_read_folio,
        .write_begin        = simple_write_begin,
        .write_end        = simple_write_end,
        .dirty_folio        = noop_dirty_folio,
};
EXPORT_SYMBOL(ram_aops);

/*
 * simple_fill_super - set up a superblock with a flat set of regular files
 * @s: superblock to fill in
 * @magic: value stored in s->s_magic
 * @files: array of file descriptions; the position of an entry in the
 *         array becomes its inode number. Entries with a NULL name are
 *         skipped, and an entry whose name is the empty string ends the
 *         array.
 *
 * The root directory gets inode number 1, so @files must not contain a
 * named entry at index 1; a warning is printed if it does.
 *
 * the inodes created here are not hashed. If you use iunique to generate
 * unique inode values later for this filesystem, then you must take care
 * to pass it an appropriate max_reserved value to avoid collisions.
 *
 * Returns 0 on success or -ENOMEM if an inode or dentry cannot be
 * allocated.
 */
int simple_fill_super(struct super_block *s, unsigned long magic,
                      const struct tree_descr *files)
{
        struct inode *inode;
        struct dentry *dentry;
        int i;

        s->s_blocksize = PAGE_SIZE;
        s->s_blocksize_bits = PAGE_SHIFT;
        s->s_magic = magic;
        s->s_op = &simple_super_operations;
        s->s_time_gran = 1;

        inode = new_inode(s);
        if (!inode)
                return -ENOMEM;
        /*
         * because the root inode is 1, the files array must not contain an
         * entry at index 1
         */
        inode->i_ino = 1;
        inode->i_mode = S_IFDIR | 0755;
        simple_inode_init_ts(inode);
        inode->i_op = &simple_dir_inode_operations;
        inode->i_fop = &simple_dir_operations;
        set_nlink(inode, 2);
        s->s_root = d_make_root(inode);
        if (!s->s_root)
                return -ENOMEM;
        for (i = 0; !files->name || files->name[0]; i++, files++) {
                /* holes in the array are allowed and simply skipped */
                if (!files->name)
                        continue;

                /*
                 * warn if it tries to conflict with the root inode; the
                 * message is kept as one unbroken string so the words at
                 * the former line break do not run together
                 */
                if (unlikely(i == 1))
                        printk(KERN_WARNING
                               "%s: %s passed in a files array with an index of 1!\n",
                               __func__, s->s_type->name);

                dentry = d_alloc_name(s->s_root, files->name);
                if (!dentry)
                        return -ENOMEM;
                inode = new_inode(s);
                if (!inode) {
                        dput(dentry);
                        return -ENOMEM;
                }
                inode->i_mode = S_IFREG | files->mode;
                simple_inode_init_ts(inode);
                inode->i_fop = files->ops;
                inode->i_ino = i;
                d_add(dentry, inode);
        }
        return 0;
}
EXPORT_SYMBOL(simple_fill_super);

/* Serializes updates of the *mount / *count pairs handled below. */
static DEFINE_SPINLOCK(pin_fs_lock);

/*
 * simple_pin_fs - take a counted reference on a kernel-internal mount
 * @type: filesystem type to mount if no mount exists yet
 * @mount: location of the shared mount pointer
 * @count: location of the shared pin count
 *
 * If *@mount is not set yet, a mount is created with vfs_kern_mount().
 * The lock is dropped around that call, so *@mount is checked again
 * afterwards and only set if nobody else has set it in the meantime.
 *
 * Returns 0 on success or the error from vfs_kern_mount().
 */
int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
{
        struct vfsmount *mnt = NULL;
        spin_lock(&pin_fs_lock);
        if (unlikely(!*mount)) {
                /* vfs_kern_mount() is called without the spinlock held */
                spin_unlock(&pin_fs_lock);
                mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
                if (IS_ERR(mnt))
                        return PTR_ERR(mnt);
                spin_lock(&pin_fs_lock);
                /* recheck: another caller may have installed a mount meanwhile */
                if (!*mount)
                        *mount = mnt;
        }
        mntget(*mount);
        ++*count;
        spin_unlock(&pin_fs_lock);
        /*
         * Drop our local reference on the mount created above, whether or
         * not it was installed. On the fast path mnt is still NULL here.
         */
        mntput(mnt);
        return 0;
}
EXPORT_SYMBOL(simple_pin_fs);

/*
 * simple_release_fs - drop one pin taken with simple_pin_fs()
 * @mount: location of the shared mount pointer
 * @count: location of the shared pin count
 *
 * The count is decremented under pin_fs_lock; when it reaches zero,
 * *@mount is cleared. The mount reference itself is dropped after the
 * lock has been released.
 */
void simple_release_fs(struct vfsmount **mount, int *count)
{
        struct vfsmount *held;

        spin_lock(&pin_fs_lock);
        held = *mount;
        *count -= 1;
        if (*count == 0)
                *mount = NULL;
        spin_unlock(&pin_fs_lock);

        mntput(held);
}
EXPORT_SYMBOL(simple_release_fs);

/**
 * simple_read_from_buffer - copy data from the buffer to user space
 * @to: the user space buffer to read to
 * @count: the maximum number of bytes to read
 * @ppos: the current position in the buffer
 * @from: the buffer to read from
 * @available: the size of the buffer
 *
 * The simple_read_from_buffer() function reads up to @count bytes from the
 * buffer @from at offset @ppos into the user space address starting at @to.
 *
 * A negative position yields -EINVAL. A position at or beyond @available,
 * or a zero @count, yields 0. If nothing at all could be copied to user
 * space, -EFAULT is returned; a partial copy is reported as a short read.
 *
 * On success, the number of bytes read is returned and the offset @ppos is
 * advanced by this number, or negative value is returned on error.
 **/
ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
                                const void *from, size_t available)
{
        loff_t start = *ppos;
        size_t not_copied, done;

        if (start < 0)
                return -EINVAL;
        if (!count || start >= available)
                return 0;

        /* clamp the request to what is left in the buffer */
        if (count > available - start)
                count = available - start;

        not_copied = copy_to_user(to, from + start, count);
        if (not_copied == count)
                return -EFAULT;

        done = count - not_copied;
        *ppos = start + done;
        return done;
}
EXPORT_SYMBOL(simple_read_from_buffer);

/**
 * simple_write_to_buffer - copy data from user space to the buffer
 * @to: the buffer to write to
 * @available: the size of the buffer
 * @ppos: the current position in the buffer
 * @from: the user space buffer to read from
 * @count: the maximum number of bytes to read
 *
 * The simple_write_to_buffer() function reads up to @count bytes from the user
 * space address starting at @from into the buffer @to at offset @ppos.
 *
 * A negative position yields -EINVAL. A position at or beyond @available,
 * or a zero @count, yields 0. If nothing at all could be copied from user
 * space, -EFAULT is returned; a partial copy is reported as a short write.
 *
 * On success, the number of bytes written is returned and the offset @ppos is
 * advanced by this number, or negative value is returned on error.
 **/
ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
                const void __user *from, size_t count)
{
        loff_t start = *ppos;
        size_t not_copied, done;

        if (start < 0)
                return -EINVAL;
        if (!count || start >= available)
                return 0;

        /* clamp the request to the room left in the buffer */
        if (count > available - start)
                count = available - start;

        not_copied = copy_from_user(to + start, from, count);
        if (not_copied == count)
                return -EFAULT;

        done = count - not_copied;
        *ppos = start + done;
        return done;
}
EXPORT_SYMBOL(simple_write_to_buffer);

/**
 * memory_read_from_buffer - copy data from the buffer
 * @to: the kernel space buffer to read to
 * @count: the maximum number of bytes to read
 * @ppos: the current position in the buffer
 * @from: the buffer to read from
 * @available: the size of the buffer
 *
 * The memory_read_from_buffer() function reads up to @count bytes from the
 * buffer @from at offset @ppos into the kernel space address starting at @to.
 *
 * A negative position yields -EINVAL; a position at or beyond @available
 * yields 0.
 *
 * On success, the number of bytes read is returned and the offset @ppos is
 * advanced by this number, or negative value is returned on error.
 **/
ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
                                const void *from, size_t available)
{
        loff_t start = *ppos;
        size_t room;

        if (start < 0)
                return -EINVAL;
        if (start >= available)
                return 0;

        /* clamp the request to what is left in the buffer */
        room = available - start;
        if (count > room)
                count = room;

        memcpy(to, from + start, count);
        *ppos = start + count;

        return count;
}
EXPORT_SYMBOL(memory_read_from_buffer);

/*
 * Transaction based IO.
 * The file expects a single write which triggers the transaction, and then
 * possibly a read which collects the result - which is stored in a
 * file-local buffer.
 */

/*
 * Publish the size of the response stored in the file's transaction
 * buffer. @n must not exceed SIMPLE_TRANSACTION_LIMIT.
 */
void simple_transaction_set(struct file *file, size_t n)
{
        struct simple_transaction_argresp *resp;

        BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);

        resp = file->private_data;

        /*
         * The barrier ensures that ar->size will really remain zero until
         * ar->data is ready for reading.
         */
        smp_mb();
        resp->size = n;
}
EXPORT_SYMBOL(simple_transaction_set);

/*
 * Start a transaction on @file: allocate a zeroed page as the
 * argument/response buffer, attach it to file->private_data and copy
 * @size bytes of the request from user space into it.
 *
 * Returns a pointer to the copied request data, or an ERR_PTR():
 * -EFBIG if @size exceeds SIMPLE_TRANSACTION_LIMIT - 1, -ENOMEM if no page
 * is available, -EBUSY if the file already has a buffer attached, and
 * -EFAULT if the copy from user space fails.
 */
char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
{
        static DEFINE_SPINLOCK(simple_transaction_lock);
        struct simple_transaction_argresp *page;
        int already_used;

        if (size > SIMPLE_TRANSACTION_LIMIT - 1)
                return ERR_PTR(-EFBIG);

        page = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL);
        if (!page)
                return ERR_PTR(-ENOMEM);

        /* only one write allowed per open */
        spin_lock(&simple_transaction_lock);
        already_used = !!file->private_data;
        if (!already_used)
                file->private_data = page;
        spin_unlock(&simple_transaction_lock);

        if (already_used) {
                free_page((unsigned long)page);
                return ERR_PTR(-EBUSY);
        }

        /* on failure the page stays attached to the file */
        if (copy_from_user(page->data, buf, size))
                return ERR_PTR(-EFAULT);

        return page->data;
}
EXPORT_SYMBOL(simple_transaction_get);

/*
 * Read the response of a transaction. If no transaction buffer is
 * attached to @file, 0 is returned; otherwise the stored response is
 * copied out with simple_read_from_buffer().
 */
ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
{
        struct simple_transaction_argresp *resp = file->private_data;

        if (resp)
                return simple_read_from_buffer(buf, size, pos,
                                               resp->data, resp->size);
        return 0;
}
EXPORT_SYMBOL(simple_transaction_read);

/*
 * ->release for transaction files: free the page that
 * simple_transaction_get() attached to file->private_data.
 * NOTE(review): private_data is NULL if no write ever happened; this
 * relies on free_page() accepting 0 - confirm.
 * Always returns 0.
 */
int simple_transaction_release(struct inode *inode, struct file *file)
{
        free_page((unsigned long)file->private_data);
        return 0;
}
EXPORT_SYMBOL(simple_transaction_release);

/* Simple attribute files */

/*
 * Per-open state of a simple attribute file, allocated in
 * simple_attr_open() and stored in file->private_data.
 */
struct simple_attr {
        int (*get)(void *, u64 *);        /* read callback; NULL means reads fail with -EACCES */
        int (*set)(void *, u64);        /* write callback; NULL means writes fail with -EACCES */
        char get_buf[24];        /* enough to store a u64 and "\n\0" */
        char set_buf[24];        /* holds the NUL-terminated text of a write */
        void *data;                /* taken from inode->i_private, passed to get/set */
        const char *fmt;        /* format for read operation */
        struct mutex mutex;        /* protects access to these buffers */
};

/*
 * simple_attr_open is called by an actual attribute open file operation
 * to set the attribute specific access operations.
 *
 * A simple_attr is allocated, filled with @get, @set, @fmt and the
 * inode's private data, and attached to file->private_data.
 *
 * Returns -ENOMEM if the allocation fails, otherwise the result of
 * nonseekable_open().
 */
int simple_attr_open(struct inode *inode, struct file *file,
                     int (*get)(void *, u64 *), int (*set)(void *, u64),
                     const char *fmt)
{
        struct simple_attr *sa = kzalloc(sizeof(*sa), GFP_KERNEL);

        if (!sa)
                return -ENOMEM;

        mutex_init(&sa->mutex);
        sa->fmt = fmt;
        sa->data = inode->i_private;
        sa->set = set;
        sa->get = get;

        file->private_data = sa;

        return nonseekable_open(inode, file);
}
EXPORT_SYMBOL_GPL(simple_attr_open);

/*
 * ->release for simple attribute files: free the simple_attr that
 * simple_attr_open() stored in file->private_data. Always returns 0.
 */
int simple_attr_release(struct inode *inode, struct file *file)
{
        kfree(file->private_data);
        return 0;
}
EXPORT_SYMBOL_GPL(simple_attr_release);        /* GPL-only?  This?  Really? */

/*
 * read from the buffer that is filled with the get function
 *
 * On the first read (position zero, or nothing formatted yet) the get
 * callback is invoked and its value is formatted into get_buf; later
 * reads at a non-zero position reuse the text that is already there.
 *
 * Returns -EACCES if the attribute has no get callback, the error from
 * an interrupted mutex wait or from the get callback, or otherwise the
 * result of simple_read_from_buffer().
 */
ssize_t simple_attr_read(struct file *file, char __user *buf,
                         size_t len, loff_t *ppos)
{
        struct simple_attr *attr = file->private_data;
        size_t text_len;
        ssize_t ret;

        if (!attr->get)
                return -EACCES;

        ret = mutex_lock_interruptible(&attr->mutex);
        if (ret)
                return ret;

        if (!*ppos || !attr->get_buf[0]) {
                /* first read */
                u64 val;

                ret = attr->get(attr->data, &val);
                if (ret)
                        goto unlock;

                text_len = scnprintf(attr->get_buf, sizeof(attr->get_buf),
                                     attr->fmt, (unsigned long long)val);
        } else {
                /* continued read */
                text_len = strlen(attr->get_buf);
        }

        ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, text_len);
unlock:
        mutex_unlock(&attr->mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(simple_attr_read);

/*
 * interpret the buffer as a number to call the set function with
 *
 * At most sizeof(set_buf) - 1 bytes of the user buffer are considered.
 * The text is parsed with kstrtoll() when @is_signed is true and with
 * kstrtoull() otherwise, using base 0.
 *
 * Returns -EACCES if the attribute has no set callback, -EFAULT if the
 * user buffer cannot be copied, the error from parsing or from the set
 * callback, or @len on success.
 */
static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos, bool is_signed)
{
        struct simple_attr *attr = file->private_data;
        unsigned long long val;
        size_t nbytes;
        ssize_t ret;

        if (!attr->set)
                return -EACCES;

        ret = mutex_lock_interruptible(&attr->mutex);
        if (ret)
                return ret;

        nbytes = min(sizeof(attr->set_buf) - 1, len);
        if (copy_from_user(attr->set_buf, buf, nbytes)) {
                ret = -EFAULT;
                goto unlock;
        }
        attr->set_buf[nbytes] = '\0';

        ret = is_signed ? kstrtoll(attr->set_buf, 0, &val)
                        : kstrtoull(attr->set_buf, 0, &val);
        if (ret)
                goto unlock;

        ret = attr->set(attr->data, val);
        if (!ret)
                ret = len; /* on success, claim we got the whole input */
unlock:
        mutex_unlock(&attr->mutex);
        return ret;
}

/*
 * simple_attr_write - write handler for attributes parsed as unsigned values
 *
 * Thin wrapper: forwards to simple_attr_write_xsigned() with
 * is_signed == false and returns its result.
 */
ssize_t simple_attr_write(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos)
{
        return simple_attr_write_xsigned(file, buf, len, ppos, false);
}
EXPORT_SYMBOL_GPL(simple_attr_write);

/*
 * simple_attr_write_signed - write handler for attributes parsed as signed values
 *
 * Thin wrapper: forwards to simple_attr_write_xsigned() with
 * is_signed == true and returns its result.
 */
ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos)
{
        return simple_attr_write_xsigned(file, buf, len, ppos, true);
}
EXPORT_SYMBOL_GPL(simple_attr_write_signed);

/**
 * generic_encode_ino32_fh - generic export_operations->encode_fh function
 * @inode:   the object to encode
 * @fh:      where to store the file handle fragment
 * @max_len: in: room available at @fh, out: length used or required
 *           (both in 4 byte units)
 * @parent:  parent directory inode, if wanted
 *
 * Stores the inode number and generation of @inode in @fh, followed by
 * those of @parent when one is supplied.  This assumes the 32-bit inode
 * number is enough to locate the inode and that the generation number can
 * be used to check that it is still valid.
 *
 * Return: FILEID_INO32_GEN or FILEID_INO32_GEN_PARENT on success.  If
 * @max_len is too small (fewer than 2 units, or fewer than 4 with @parent),
 * FILEID_INVALID is returned and *@max_len is set to the required length.
 */
int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len,
                            struct inode *parent)
{
        struct fid *fid = (void *)fh;
        const int needed = parent ? 4 : 2;

        if (*max_len < needed) {
                *max_len = needed;
                return FILEID_INVALID;
        }

        fid->i32.ino = inode->i_ino;
        fid->i32.gen = inode->i_generation;
        if (!parent) {
                *max_len = 2;
                return FILEID_INO32_GEN;
        }

        fid->i32.parent_ino = parent->i_ino;
        fid->i32.parent_gen = parent->i_generation;
        *max_len = 4;
        return FILEID_INO32_GEN_PARENT;
}
EXPORT_SYMBOL_GPL(generic_encode_ino32_fh);

/**
 * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
 * @sb:         filesystem to do the file handle conversion on
 * @fid:        file handle to convert
 * @fh_len:     length of the file handle (historically documented as bytes;
 *              NOTE(review): the "< 2" check below and the 4 byte units used
 *              by generic_encode_ino32_fh() suggest 32-bit words - confirm)
 * @fh_type:    type of file handle
 * @get_inode:  filesystem callback to retrieve inode
 *
 * For the FILEID_INO32_GEN and FILEID_INO32_GEN_PARENT handle types, calls
 * @get_inode with the inode number and generation stored in @fid.  For any
 * other type no inode is looked up.
 *
 * Return: NULL if @fh_len is below 2, otherwise the result of
 * d_obtain_alias() on the inode (which is NULL for unknown handle types).
 */
struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
                int fh_len, int fh_type, struct inode *(*get_inode)
                        (struct super_block *sb, u64 ino, u32 gen))
{
        struct inode *target = NULL;

        if (fh_len < 2)
                return NULL;

        if (fh_type == FILEID_INO32_GEN ||
            fh_type == FILEID_INO32_GEN_PARENT)
                target = get_inode(sb, fid->i32.ino, fid->i32.gen);

        return d_obtain_alias(target);
}
EXPORT_SYMBOL_GPL(generic_fh_to_dentry);

/**
 * generic_fh_to_parent - generic helper for the fh_to_parent export operation
 * @sb:         filesystem to do the file handle conversion on
 * @fid:        file handle to convert
 * @fh_len:     length of the file handle (historically documented as bytes;
 *              NOTE(review): the comparisons against 2 and 3 below suggest
 *              32-bit words - confirm)
 * @fh_type:    type of file handle
 * @get_inode:  filesystem callback to retrieve inode
 *
 * For FILEID_INO32_GEN_PARENT handles, calls @get_inode with the parent
 * inode number stored in @fid.  The parent generation is taken from @fid
 * only when @fh_len is greater than 3; otherwise 0 is passed.  Other handle
 * types carry no parent and no inode is looked up.
 *
 * Return: NULL if @fh_len is 2 or less, otherwise the result of
 * d_obtain_alias() on the parent inode (which is NULL for other types).
 */
struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
                int fh_len, int fh_type, struct inode *(*get_inode)
                        (struct super_block *sb, u64 ino, u32 gen))
{
        struct inode *parent = NULL;
        u32 gen;

        if (fh_len <= 2)
                return NULL;

        if (fh_type == FILEID_INO32_GEN_PARENT) {
                gen = (fh_len > 3) ? fid->i32.parent_gen : 0;
                parent = get_inode(sb, fid->i32.parent_ino, gen);
        }

        return d_obtain_alias(parent);
}
EXPORT_SYMBOL_GPL(generic_fh_to_parent);

/**
 * __generic_file_fsync - generic fsync implementation for simple filesystems
 *
 * @file:        file to synchronize
 * @start:       start offset in bytes
 * @end:         end offset in bytes (inclusive)
 * @datasync:    only synchronize essential metadata if true
 *
 * This is a generic implementation of the fsync method for simple
 * filesystems which track all non-inode metadata in the buffers list
 * hanging off the address_space structure.
 *
 * The data range is written and waited on first.  Then, under the inode
 * lock, the mapping's buffers are synced and the inode metadata is written
 * if the inode is dirty (for @datasync, only if I_DIRTY_DATASYNC is set).
 *
 * Return: the first error encountered, or 0.
 */
int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
                                 int datasync)
{
        struct inode *inode = file->f_mapping->host;
        int rc;
        int err;

        err = file_write_and_wait_range(file, start, end);
        if (err != 0)
                return err;

        inode_lock(inode);
        rc = sync_mapping_buffers(inode->i_mapping);
        if ((inode->i_state & I_DIRTY_ALL) &&
            (!datasync || (inode->i_state & I_DIRTY_DATASYNC))) {
                err = sync_inode_metadata(inode, 1);
                if (rc == 0)
                        rc = err;
        }
        inode_unlock(inode);

        /* check and advance again to catch errors after syncing out buffers */
        err = file_check_and_advance_wb_err(file);
        if (rc == 0)
                rc = err;
        return rc;
}
EXPORT_SYMBOL(__generic_file_fsync);

/**
 * generic_file_fsync - generic fsync implementation for simple filesystems
 *                        with flush
 * @file:        file to synchronize
 * @start:       start offset in bytes
 * @end:         end offset in bytes (inclusive)
 * @datasync:    only synchronize essential metadata if true
 *
 * Runs __generic_file_fsync() and, if that succeeds, issues a flush to the
 * block device backing the file's superblock.
 *
 * Return: the error from __generic_file_fsync() if it fails, otherwise the
 * result of blkdev_issue_flush().
 */

int generic_file_fsync(struct file *file, loff_t start, loff_t end,
                       int datasync)
{
        struct inode *inode = file->f_mapping->host;
        int rc = __generic_file_fsync(file, start, end, datasync);

        if (rc != 0)
                return rc;

        return blkdev_issue_flush(inode->i_sb->s_bdev);
}
EXPORT_SYMBOL(generic_file_fsync);

/**
 * generic_check_addressable - Check addressability of file system
 * @blocksize_bits:        log of file system block size
 * @num_blocks:                number of blocks in file system
 *
 * Determine whether a file system with @num_blocks blocks (and a
 * block size of 2**@blocksize_bits) is addressable by the sector_t
 * and page cache of the system.
 *
 * Return: 0 if the file system is addressable (an empty file system with
 * @num_blocks == 0 always is), -EINVAL if @blocksize_bits is smaller than 9
 * or larger than PAGE_SHIFT, and -EFBIG if the last block or page index
 * does not fit.
 */
int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
{
        u64 last_fs_block;
        u64 last_fs_page;

        if (unlikely(num_blocks == 0))
                return 0;

        if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT))
                return -EINVAL;

        /*
         * Only compute the shifts once blocksize_bits is known to lie in
         * [9, PAGE_SHIFT]; with an unvalidated value the shift counts
         * below could be out of range, which is undefined behaviour.
         */
        last_fs_block = num_blocks - 1;
        last_fs_page = last_fs_block >> (PAGE_SHIFT - blocksize_bits);

        if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
            (last_fs_page > (pgoff_t)(~0ULL))) {
                return -EFBIG;
        }
        return 0;
}
EXPORT_SYMBOL(generic_check_addressable);

/*
 * No-op implementation of ->fsync for in-memory filesystems.
 *
 * All arguments are ignored and success (0) is always reported.
 */
int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
        return 0;
}
EXPORT_SYMBOL(noop_fsync);

/*
 * noop_direct_IO - placeholder direct I/O callback
 *
 * Ignores both arguments and always returns -EINVAL; see the comment in the
 * body for why the callback needs to exist at all.
 */
ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
        /*
         * iomap based filesystems support direct I/O without need for
         * this callback. However, it still needs to be set in
         * inode->a_ops so that open/fcntl know that direct I/O is
         * generally supported.
         */
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(noop_direct_IO);

/*
 * kfree_link - kfree() wrapped in a function of type void (*)(void *)
 *
 * Because kfree isn't assignment-compatible with void(void*) ;-/
 * Simply frees @p.
 */
void kfree_link(void *p)
{
        kfree(p);
}
EXPORT_SYMBOL(kfree_link);

struct inode *alloc_anon_inode(struct super_block *s)
{
        static const struct address_space_operations anon_aops = {
                .dirty_folio        = noop_dirty_folio,
        };
        struct inode *inode = new_inode_pseudo(s);

        if (!inode)
                return ERR_PTR(-ENOMEM);

        inode->i_ino = get_next_ino();
        inode->i_mapping->a_ops = &anon_aops;

        /*
         * Mark the inode dirty from the very beginning,
         * that way it will never be moved to the dirty
         * list because mark_inode_dirty() will think
         * that it already _is_ on the dirty list.
         */
        inode->i_state = I_DIRTY;
        inode->i_mode = S_IRUSR | S_IWUSR;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
        inode->i_flags |= S_PRIVATE;
        simple_inode_init_ts(inode);
        return inode;
}
EXPORT_SYMBOL(alloc_anon_inode);

/**
 * simple_nosetlease - generic helper for prohibiting leases
 * @filp: file pointer
 * @arg: type of lease to obtain
 * @flp: new lease supplied for insertion
 * @priv: private data for lm_setup operation
 *
 * Generic helper for filesystems that do not wish to allow leases to be set.
 * All arguments are ignored and it just returns -EINVAL.
 *
 * Return: always -EINVAL
 */
int
simple_nosetlease(struct file *filp, int arg, struct file_lease **flp,
                  void **priv)
{
        return -EINVAL;
}
EXPORT_SYMBOL(simple_nosetlease);

/**
 * simple_get_link - generic helper to get the target of "fast" symlinks
 * @dentry: not used here
 * @inode: the symlink inode
 * @done: not used here
 *
 * Generic helper for filesystems to use for symlink inodes where a pointer to
 * the symlink target is stored in ->i_link.  NOTE: this isn't normally called,
 * since as an optimization the path lookup code uses any non-NULL ->i_link
 * directly, without calling ->get_link().  But ->get_link() still must be set,
 * to mark the inode_operations as being for a symlink.
 *
 * Return: the symlink target
 */
const char *simple_get_link(struct dentry *dentry, struct inode *inode,
                            struct delayed_call *done)
{
        /* The target is returned as-is; nothing is copied or allocated here. */
        return inode->i_link;
}
EXPORT_SYMBOL(simple_get_link);

/*
 * Inode operations for symlinks served by simple_get_link(); ->get_link is
 * the only method provided.
 */
const struct inode_operations simple_symlink_inode_operations = {
        .get_link = simple_get_link,
};
EXPORT_SYMBOL(simple_symlink_inode_operations);

/*
 * Operations for a permanently empty directory.
 */
/* ->lookup for the empty directory: every name lookup fails with -ENOENT. */
static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        return ERR_PTR(-ENOENT);
}

/*
 * ->getattr for the empty directory: fills @stat from the inode behind
 * @path via generic_fillattr().  Note that @idmap and @query_flags are not
 * used; nop_mnt_idmap is always passed instead.  Always returns 0.
 */
static int empty_dir_getattr(struct mnt_idmap *idmap,
                             const struct path *path, struct kstat *stat,
                             u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
        return 0;
}

/* ->setattr for the empty directory: attribute changes are refused with -EPERM. */
static int empty_dir_setattr(struct mnt_idmap *idmap,
                             struct dentry *dentry, struct iattr *attr)
{
        return -EPERM;
}

/* ->listxattr for the empty directory: always fails with -EOPNOTSUPP. */
static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size)
{
        return -EOPNOTSUPP;
}

/*
 * Inode operations installed by make_empty_dir_inode().  The address of
 * this table is also what is_empty_dir_inode() compares ->i_op against.
 */
static const struct inode_operations empty_dir_inode_operations = {
        .lookup                = empty_dir_lookup,
        .permission        = generic_permission,
        .setattr        = empty_dir_setattr,
        .getattr        = empty_dir_getattr,
        .listxattr        = empty_dir_listxattr,
};

/*
 * ->llseek for the empty directory: delegates to generic_file_llseek_size()
 * with the value 2 passed for both of its trailing size arguments.
 */
static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
{
        /* An empty directory has two entries . and .. at offsets 0 and 1 */
        return generic_file_llseek_size(file, offset, whence, 2, 2);
}

/*
 * ->iterate_shared for the empty directory: emits only the dot entries via
 * dir_emit_dots().  The return value of dir_emit_dots() is deliberately not
 * checked; 0 is always returned.
 */
static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
{
        dir_emit_dots(file, ctx);
        return 0;
}

/*
 * File operations installed by make_empty_dir_inode().  The address of this
 * table is also what is_empty_dir_inode() compares ->i_fop against.
 */
static const struct file_operations empty_dir_operations = {
        .llseek                = empty_dir_llseek,
        .read                = generic_read_dir,
        .iterate_shared        = empty_dir_readdir,
        .fsync                = noop_fsync,
};


/*
 * make_empty_dir_inode - turn @inode into a permanently empty directory
 *
 * Resets the inode to a root-owned directory with link count 2, mode
 * read/search for everyone, no device number, zero size and zero blocks,
 * and installs the empty_dir_* inode and file operations.
 */
void make_empty_dir_inode(struct inode *inode)
{
        set_nlink(inode, 2);
        inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
        inode->i_uid = GLOBAL_ROOT_UID;
        inode->i_gid = GLOBAL_ROOT_GID;
        inode->i_rdev = 0;
        inode->i_size = 0;
        inode->i_blkbits = PAGE_SHIFT;
        inode->i_blocks = 0;

        inode->i_op = &empty_dir_inode_operations;
        /* Clear IOP_XATTR to match the -EOPNOTSUPP ->listxattr above. */
        inode->i_opflags &= ~IOP_XATTR;
        inode->i_fop = &empty_dir_operations;
}

bool is_empty_dir_inode(struct inode *inode)
{
        return (inode->i_fop == &empty_dir_operations) &&
                (inode->i_op == &empty_dir_inode_operations);
}

#if IS_ENABLED(CONFIG_UNICODE)
/**
 * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems
 * @dentry:        dentry whose name we are checking against
 * @len:        len of name of dentry
 * @str:        str pointer to name of dentry
 * @name:        Name to compare against
 *
 * An exact byte-wise match is tried first.  If that fails and the parent
 * directory is not casefolded (or has no inode), the names are reported as
 * different; otherwise the decision is left to utf8_strncasecmp() using the
 * superblock's encoding.
 *
 * Return: 0 if names match, 1 if mismatch, or -ERRNO
 */
static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
                                const char *str, const struct qstr *name)
{
        const struct dentry *parent;
        const struct inode *dir;
        char strbuf[DNAME_INLINE_LEN];
        struct qstr qstr;

        /*
         * Attempt a case-sensitive match first. It is cheaper and
         * should cover most lookups, including all the sane
         * applications that expect a case-sensitive filesystem.
         *
         * This comparison is safe under RCU because the caller
         * guarantees the consistency between str and len. See
         * __d_lookup_rcu_op_compare() for details.
         */
        if (len == name->len && !memcmp(str, name->name, len))
                return 0;

        /* Sample parent and its inode once each; both are read locklessly. */
        parent = READ_ONCE(dentry->d_parent);
        dir = READ_ONCE(parent->d_inode);
        if (!dir || !IS_CASEFOLDED(dir))
                return 1;

        /*
         * If the dentry name is stored in-line, then it may be concurrently
         * modified by a rename.  If this happens, the VFS will eventually retry
         * the lookup, so it doesn't matter what ->d_compare() returns.
         * However, it's unsafe to call utf8_strncasecmp() with an unstable
         * string.  Therefore, we have to copy the name into a temporary buffer.
         */
        if (len <= DNAME_INLINE_LEN - 1) {
                memcpy(strbuf, str, len);
                strbuf[len] = 0;
                str = strbuf;
                /* prevent compiler from optimizing out the temporary buffer */
                barrier();
        }
        qstr.len = len;
        qstr.name = str;

        return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr);
}

/**
 * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
 * @dentry:        dentry of the parent directory
 * @str:        qstr of name whose hash we should fill in
 *
 * Return: 0 if hash was successful or unchanged, and -EINVAL on error
 */
static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
{
        const struct inode *dir = READ_ONCE(dentry->d_inode);
        struct super_block *sb = dentry->d_sb;
        const struct unicode_map *um = sb->s_encoding;
        int ret;

        if (!dir || !IS_CASEFOLDED(dir))
                return 0;

        ret = utf8_casefold_hash(um, dentry, str);
        if (ret < 0 && sb_has_strict_encoding(sb))
                return -EINVAL;
        return 0;
}

/*
 * Dentry operations chosen by generic_set_sb_d_ops() when the superblock
 * has an encoding.  ->d_revalidate is only wired up when fscrypt support is
 * built in.
 */
static const struct dentry_operations generic_ci_dentry_ops = {
        .d_hash = generic_ci_d_hash,
        .d_compare = generic_ci_d_compare,
#ifdef CONFIG_FS_ENCRYPTION
        .d_revalidate = fscrypt_d_revalidate,
#endif
};
#endif

#ifdef CONFIG_FS_ENCRYPTION
/*
 * Dentry operations chosen by generic_set_sb_d_ops() when the superblock
 * has no encoding but does have s_cop set.
 */
static const struct dentry_operations generic_encrypted_dentry_ops = {
        .d_revalidate = fscrypt_d_revalidate,
};
#endif

/**
 * generic_set_sb_d_ops - helper for choosing the set of
 * filesystem-wide dentry operations for the enabled features
 * @sb: superblock to be configured
 *
 * Filesystems supporting casefolding and/or fscrypt can call this
 * helper at mount-time to configure sb->s_d_op to best set of dentry
 * operations required for the enabled features. The helper must be
 * called after these have been configured, but before the root dentry
 * is created.
 *
 * A configured encoding takes precedence over encryption.  If neither
 * applies (or support is compiled out), sb->s_d_op is left untouched.
 */
void generic_set_sb_d_ops(struct super_block *sb)
{
#if IS_ENABLED(CONFIG_UNICODE)
        /* Casefolding ops; these include ->d_revalidate when fscrypt is built in. */
        if (sb->s_encoding) {
                sb->s_d_op = &generic_ci_dentry_ops;
                return;
        }
#endif
#ifdef CONFIG_FS_ENCRYPTION
        if (sb->s_cop) {
                sb->s_d_op = &generic_encrypted_dentry_ops;
                return;
        }
#endif
}
EXPORT_SYMBOL(generic_set_sb_d_ops);

/**
 * inode_maybe_inc_iversion - increments i_version
 * @inode: inode with the i_version that should be updated
 * @force: increment the counter even if it's not necessary?
 *
 * Every time the inode is modified, the i_version field must be seen to have
 * changed by any observer.
 *
 * If "force" is set or the QUERIED flag is set, then ensure that we increment
 * the value, and clear the queried flag.
 *
 * In the common case where neither is set, then we can return "false" without
 * updating i_version.
 *
 * If this function returns false, and no other metadata has changed, then we
 * can avoid logging the metadata.
 *
 * Return: true if i_version was incremented, false if it was left alone.
 */
bool inode_maybe_inc_iversion(struct inode *inode, bool force)
{
        u64 cur, new;

        /*
         * The i_version field is not strictly ordered with any other inode
         * information, but the legacy inode_inc_iversion code used a spinlock
         * to serialize increments.
         *
         * Here, we add full memory barriers to ensure that any de-facto
         * ordering with other info is preserved.
         *
         * This barrier pairs with the barrier in inode_query_iversion()
         */
        smp_mb();
        cur = inode_peek_iversion_raw(inode);
        do {
                /* If flag is clear then we needn't do anything */
                if (!force && !(cur & I_VERSION_QUERIED))
                        return false;

                /* Since lowest bit is flag, add 2 to avoid it */
                new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
                /* Retry with the freshly observed value if another update raced with us. */
        } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
        return true;
}
EXPORT_SYMBOL(inode_maybe_inc_iversion);

/**
 * inode_query_iversion - read i_version for later use
 * @inode: inode from which i_version should be read
 *
 * Read the inode i_version counter. This should be used by callers that wish
 * to store the returned i_version for later comparison. This will guarantee
 * that a later query of the i_version will result in a different value if
 * anything has changed.
 *
 * In this implementation, we fetch the current value, set the QUERIED flag and
 * then try to swap it into place with a cmpxchg, if it wasn't already set. If
 * that fails, we try again with the newly fetched value from the cmpxchg.
 *
 * Return: the raw i_version value shifted right by I_VERSION_QUERIED_SHIFT.
 */
u64 inode_query_iversion(struct inode *inode)
{
        u64 cur, new;

        cur = inode_peek_iversion_raw(inode);
        do {
                /* If flag is already set, then no need to swap */
                if (cur & I_VERSION_QUERIED) {
                        /*
                         * This barrier (and the implicit barrier in the
                         * cmpxchg below) pairs with the barrier in
                         * inode_maybe_inc_iversion().
                         */
                        smp_mb();
                        break;
                }

                new = cur | I_VERSION_QUERIED;
        } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
        /* cur holds the value observed before the flag was (or already had been) set. */
        return cur >> I_VERSION_QUERIED_SHIFT;
}
EXPORT_SYMBOL(inode_query_iversion);

/*
 * direct_write_fallback - finish a direct write that fell back to buffered I/O
 * @iocb:             kiocb of the write; ki_pos is expected to sit just past
 *                    the buffered portion
 * @iter:             unused here
 * @direct_written:   bytes already written by direct I/O
 * @buffered_written: bytes written by the buffered fallback, or a negative
 *                    error code
 *
 * Writes back and then invalidates the page cache range covered by the
 * buffered portion so that O_DIRECT semantics are preserved.
 *
 * Returns the total number of bytes written.  If the buffered write or the
 * writeback failed, returns @direct_written when it is non-zero and the
 * error code otherwise.
 */
ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
                ssize_t direct_written, ssize_t buffered_written)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        const loff_t first = iocb->ki_pos - buffered_written;
        const loff_t last = iocb->ki_pos - 1;
        int rc;

        /*
         * A failed buffered fallback reports the direct-written byte count
         * if there is one and the error code only when nothing was written.
         * This deliberately differs from plain direct I/O, which returns
         * -EFOO even after a partial write.
         */
        if (unlikely(buffered_written < 0))
                return direct_written ? direct_written : buffered_written;

        /*
         * Push the freshly dirtied page cache to disk before dropping it,
         * which is what callers of O_DIRECT expect.
         */
        rc = filemap_write_and_wait_range(mapping, first, last);
        if (rc < 0) {
                /*
                 * How much of the buffered part reached disk is unknown, so
                 * rewind the position and account for the direct part only.
                 */
                iocb->ki_pos -= buffered_written;
                return direct_written ? direct_written : rc;
        }

        invalidate_mapping_pages(mapping, first >> PAGE_SHIFT,
                                 last >> PAGE_SHIFT);
        return direct_written + buffered_written;
}
EXPORT_SYMBOL_GPL(direct_write_fallback);

/**
 * simple_inode_init_ts - initialize the timestamps for a new inode
 * @inode: inode to be initialized
 *
 * When a new inode is created, most filesystems set the timestamps to the
 * current time. Add a helper to do this.
 */
struct timespec64 simple_inode_init_ts(struct inode *inode)
{
        struct timespec64 ts = inode_set_ctime_current(inode);

        inode_set_atime_to_ts(inode, ts);
        inode_set_mtime_to_ts(inode, ts);
        return ts;
}
EXPORT_SYMBOL(simple_inode_init_ts);

/*
 * get_stashed_dentry - try to take a reference on a previously stashed dentry
 * @stashed: the stashed dentry pointer value (may be NULL)
 *
 * Returns @stashed if lockref_get_not_dead() succeeded on it, and NULL if
 * nothing was stashed or that call failed.  The whole function runs inside
 * a guard(rcu)() scope.
 */
static inline struct dentry *get_stashed_dentry(struct dentry *stashed)
{
        struct dentry *dentry;

        guard(rcu)();
        dentry = READ_ONCE(stashed);
        if (!dentry)
                return NULL;
        if (!lockref_get_not_dead(&dentry->d_lockref))
                return NULL;
        return dentry;
}

/*
 * prepare_anon_dentry - allocate a fresh inode + anonymous dentry pair
 * @stashed: location where the dentry is meant to be stashed; recorded in
 *           dentry->d_fsdata
 * @sb:      superblock to allocate from; sb->s_fs_info supplies the
 *           stashed_operations
 * @data:    filesystem data handed to sops->init_inode()
 *
 * Returns the instantiated dentry or an ERR_PTR().  If the inode cannot be
 * allocated, @data is released through sops->put_data().  On the later
 * failure paths only iput() is called.
 * NOTE(review): presumably @data is then released as part of inode teardown
 * once init_inode() has attached it - confirm against the
 * stashed_operations implementations.
 */
static struct dentry *prepare_anon_dentry(struct dentry **stashed,
                                          struct super_block *sb,
                                          void *data)
{
        struct dentry *dentry;
        struct inode *inode;
        const struct stashed_operations *sops = sb->s_fs_info;
        int ret;

        inode = new_inode_pseudo(sb);
        if (!inode) {
                sops->put_data(data);
                return ERR_PTR(-ENOMEM);
        }

        /* Defaults; ->init_inode() is checked below for keeping them intact. */
        inode->i_flags |= S_IMMUTABLE;
        inode->i_mode = S_IFREG;
        simple_inode_init_ts(inode);

        ret = sops->init_inode(inode, data);
        if (ret < 0) {
                iput(inode);
                return ERR_PTR(ret);
        }

        /* Notice when this is changed. */
        WARN_ON_ONCE(!S_ISREG(inode->i_mode));
        WARN_ON_ONCE(!IS_IMMUTABLE(inode));

        dentry = d_alloc_anon(sb);
        if (!dentry) {
                iput(inode);
                return ERR_PTR(-ENOMEM);
        }

        /* Store address of location where dentry's supposed to be stashed. */
        dentry->d_fsdata = stashed;

        /* @data is now owned by the fs */
        d_instantiate(dentry, inode);
        return dentry;
}

/*
 * stash_dentry - publish @dentry in *@stashed unless a live one is already there
 * @stashed: location holding the stashed dentry pointer
 * @dentry:  newly prepared dentry to install
 *
 * Returns @dentry if it was installed, either into an empty slot or in
 * place of an old dentry on which lockref_get_not_dead() failed.  Returns
 * the existing dentry, with a reference taken on it, if that call
 * succeeded.  The loop retries whenever the slot changes underneath it and
 * runs inside a guard(rcu)() scope.
 */
static struct dentry *stash_dentry(struct dentry **stashed,
                                   struct dentry *dentry)
{
        guard(rcu)();
        for (;;) {
                struct dentry *old;

                /* Assume any old dentry was cleared out. */
                old = cmpxchg(stashed, NULL, dentry);
                if (likely(!old))
                        return dentry;

                /* Check if somebody else installed a reusable dentry. */
                if (lockref_get_not_dead(&old->d_lockref))
                        return old;

                /* There's an old dead dentry there, try to take it over. */
                if (likely(try_cmpxchg(stashed, &old, dentry)))
                        return dentry;
        }
}

/**
 * path_from_stashed - create path from stashed or new dentry
 * @stashed:    where to retrieve or stash dentry
 * @mnt:        mnt of the filesystems to use
 * @data:       data to store in inode->i_private
 * @path:       path to create
 *
 * The function tries to retrieve a stashed dentry from @stashed. If the dentry
 * is still valid then it will be reused. If the dentry can't be reused the
 * function will allocate a new dentry and inode. It will then check again
 * whether it can reuse an existing dentry in case one has been added in the
 * meantime or update @stashed with the newly added dentry.
 *
 * Special-purpose helper for nsfs and pidfs.
 *
 * Return: On success zero and on failure a negative error is returned.
 */
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
                      struct path *path)
{
        struct dentry *dentry;
        const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;

        /* See if dentry can be reused. */
        path->dentry = get_stashed_dentry(*stashed);
        if (path->dentry) {
                /* The existing dentry is reused, so hand @data back. */
                sops->put_data(data);
                goto out_path;
        }

        /* Allocate a new dentry. */
        dentry = prepare_anon_dentry(stashed, mnt->mnt_sb, data);
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);

        /* Added a new dentry. @data is now owned by the filesystem. */
        path->dentry = stash_dentry(stashed, dentry);
        /* Someone else's dentry won the race: drop the one just prepared. */
        if (path->dentry != dentry)
                dput(dentry);

out_path:
        WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
        WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
        path->mnt = mntget(mnt);
        return 0;
}

/*
 * stashed_dentry_prune - clear the stash slot that points at @dentry
 *
 * Reads the stash location from dentry->d_fsdata (warning once and bailing
 * out if it is NULL), does nothing for a dentry without an inode, and
 * otherwise resets the slot to NULL only if it still holds @dentry.
 */
void stashed_dentry_prune(struct dentry *dentry)
{
        struct dentry **stashed = dentry->d_fsdata;
        struct inode *inode = d_inode(dentry);

        if (WARN_ON_ONCE(!stashed))
                return;

        if (!inode)
                return;

        /*
         * Only replace our own @dentry as someone else might've
         * already cleared out @dentry and stashed their own
         * dentry in there.
         */
        cmpxchg(stashed, dentry, NULL);
}

























































































































































































    7 
















    7 

    6 




    5 











    7 
    7 


















    7 

    6 






    7 















    7 































































    1 

    1 




    1 












    1 
    1 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Percpu refcounts:
 * (C) 2012 Google, Inc.
 * Author: Kent Overstreet <koverstreet@google.com>
 *
 * This implements a refcount with similar semantics to atomic_t - atomic_inc(),
 * atomic_dec_and_test() - but percpu.
 *
 * There's one important difference between percpu refs and normal atomic_t
 * refcounts; you have to keep track of your initial refcount, and then when you
 * start shutting down you call percpu_ref_kill() _before_ dropping the initial
 * refcount.
 *
 * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less
 * than an atomic_t - this is because of the way shutdown works, see
 * percpu_ref_kill()/PERCPU_COUNT_BIAS.
 *
 * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
 * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
 * puts the ref back in single atomic_t mode, collecting the per cpu refs and
 * issuing the appropriate barriers, and then marks the ref as shutting down so
 * that percpu_ref_put() will check for the ref hitting 0.  After it returns,
 * it's safe to drop the initial ref.
 *
 * USAGE:
 *
 * See fs/aio.c for some example usage; it's used there for struct kioctx, which
 * is created when userspace calls io_setup(), and destroyed when userspace
 * calls io_destroy() or the process exits.
 *
 * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
 * removes the kioctx from the process's table of kioctxs and kills percpu_ref.
 * After that, there can't be any new users of the kioctx (from lookup_ioctx())
 * and it's then safe to drop the initial ref with percpu_ref_put().
 *
 * Note that the free path, free_ioctx(), needs to go through explicit call_rcu()
 * to synchronize with RCU protected lookup_ioctx().  percpu_ref operations don't
 * imply RCU grace periods of any kind and if a user wants to combine percpu_ref
 * with RCU protection, it must be done explicitly.
 *
 * Code that does a two stage shutdown like this often needs some kind of
 * explicit synchronization to ensure the initial refcount can only be dropped
 * once - percpu_ref_kill() does this for you, it returns true once and false if
 * someone else already called it. The aio code uses it this way, but it's not
 * necessary if the code has some other mechanism to synchronize teardown.
 */

#ifndef _LINUX_PERCPU_REFCOUNT_H
#define _LINUX_PERCPU_REFCOUNT_H

#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/types.h>
#include <linux/gfp.h>

/*
 * Callback type used throughout this header: a function that receives the
 * percpu_ref it was registered for and returns nothing.  It is the type of
 * the release and confirm_switch members of struct percpu_ref_data and of
 * the release/confirm_switch/confirm_kill parameters of the functions
 * declared below.  struct percpu_ref is forward-declared so the typedef can
 * name it before the structure itself is defined.
 */
struct percpu_ref;
typedef void (percpu_ref_func_t)(struct percpu_ref *);

/*
 * Flags set in the lower bits of percpu_ref->percpu_count_ptr.  The same
 * word also carries the percpu counter pointer, so these bits have to be
 * checked (see __ref_is_percpu()) before the value is used as a pointer.
 */
enum {
        __PERCPU_REF_ATOMIC        = 1LU << 0,        /* operating in atomic mode */
        __PERCPU_REF_DEAD        = 1LU << 1,        /* (being) killed */
        /* mask covering both flags; tested as a unit by __ref_is_percpu() */
        __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD,

        /* number of low-order bits occupied by the flags above */
        __PERCPU_REF_FLAG_BITS        = 2,
};

/*
 * Values for the @flags argument of percpu_ref_init().  Each constant is a
 * distinct bit, so they can be OR-ed together.
 */
enum {
        /*
         * Start w/ ref == 1 in atomic mode.  Can be switched to percpu
         * operation using percpu_ref_switch_to_percpu().  If initialized
         * with this flag, the ref will stay in atomic mode until
         * percpu_ref_switch_to_percpu() is invoked on it.
         * Implies ALLOW_REINIT.
         */
        PERCPU_REF_INIT_ATOMIC        = 1 << 0,

        /*
         * Start dead w/ ref == 0 in atomic mode.  Must be revived with
         * percpu_ref_reinit() before used.  Implies INIT_ATOMIC and
         * ALLOW_REINIT.
         */
        PERCPU_REF_INIT_DEAD        = 1 << 1,

        /*
         * Allow switching from atomic mode to percpu mode.
         * NOTE(review): the name suggests this is what permits a later
         * percpu_ref_reinit() -- confirm against the implementation.
         */
        PERCPU_REF_ALLOW_REINIT        = 1 << 2,
};

/*
 * Slow-path state of a percpu_ref, reached through percpu_ref->data.  It is
 * kept apart from struct percpu_ref so that the structure embedded in user
 * objects carries only what the fast path needs.
 */
struct percpu_ref_data {
        /* reference count used by get/put whenever the ref is not in percpu mode */
        atomic_long_t                count;
        /* invoked by percpu_ref_put_many() when the atomic count drops to zero */
        percpu_ref_func_t        *release;
        /*
         * Not used in this header.  Presumably the callback pending for an
         * in-flight mode switch or kill (cf. the confirm_switch/confirm_kill
         * parameters) -- confirm against the implementation.
         */
        percpu_ref_func_t        *confirm_switch;
        /*
         * Not read in this header.  Presumably these record the
         * PERCPU_REF_INIT_ATOMIC / PERCPU_REF_ALLOW_REINIT init flags --
         * confirm against the implementation.
         */
        bool                        force_atomic:1;
        bool                        allow_reinit:1;
        /* RCU callback head; queued outside this header */
        struct rcu_head                rcu;
        /* presumably the percpu_ref whose ->data points here -- TODO confirm */
        struct percpu_ref        *ref;
};

/*
 * The reference count object that users embed in their own structures.
 */
struct percpu_ref {
        /*
         * Percpu counter pointer with the __PERCPU_REF_* flags folded into
         * its low bits.  While neither __PERCPU_REF_ATOMIC nor
         * __PERCPU_REF_DEAD is set the ref is in percpu mode and get/put
         * update the percpu counter; otherwise they operate on the
         * atomic_long_t in data->count.
         */
        unsigned long                percpu_count_ptr;

        /*
         * 'percpu_ref' is often embedded into user structure, and only
         * 'percpu_count_ptr' is required in fast path, move other fields
         * into 'percpu_ref_data', so we can reduce memory footprint in
         * fast path.
         */
        struct percpu_ref_data  *data;
};

/*
 * Out-of-line part of the API; these functions are only declared here and
 * are defined elsewhere.  @flags of percpu_ref_init() takes the
 * PERCPU_REF_INIT_* / PERCPU_REF_ALLOW_REINIT values above, and the inline
 * helpers below are documented as safe to call between init and exit.
 */
int __must_check percpu_ref_init(struct percpu_ref *ref,
                                 percpu_ref_func_t *release, unsigned int flags,
                                 gfp_t gfp);
void percpu_ref_exit(struct percpu_ref *ref);
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_switch);
void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref);
void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_kill);
void percpu_ref_resurrect(struct percpu_ref *ref);
void percpu_ref_reinit(struct percpu_ref *ref);
bool percpu_ref_is_zero(struct percpu_ref *ref);

/**
 * percpu_ref_kill - drop the initial ref
 * @ref: percpu_ref to kill
 *
 * Must be used to drop the initial ref on a percpu refcount; must be called
 * precisely once before shutdown.
 *
 * Switches @ref into atomic mode before gathering up the percpu counters
 * and dropping the initial ref.
 *
 * There are no implied RCU grace periods between kill and release.
 *
 * This is percpu_ref_kill_and_confirm() with no confirm_kill callback.
 */
static inline void percpu_ref_kill(struct percpu_ref *ref)
{
        /* NULL: the caller does not ask to be notified of the kill */
        percpu_ref_kill_and_confirm(ref, NULL);
}

/*
 * Internal helper; not for use outside percpu-refcount proper.
 *
 * Returns true when @ref is in percpu mode and, only in that case, stores
 * the percpu counter pointer in *@percpu_countp.  The pointer is passed
 * back through an out-parameter instead of being returned and NULL-tested
 * by the caller: the compiler cannot assume the pointer is non-NULL, so
 * that form would force it to emit two conditional branches.
 */
static inline bool __ref_is_percpu(struct percpu_ref *ref,
                                          unsigned long __percpu **percpu_countp)
{
        /*
         * Sample @ref->percpu_count_ptr exactly once.  The word is first
         * checked for the mode flags and then reused as the pointer;
         * __PERCPU_REF_ATOMIC may be set asynchronously, so a second fetch
         * emitted by the compiler could pick up the flag bit and corrupt
         * the pointer value.  READ_ONCE() rules that out.
         *
         * The dependency ordering from this READ_ONCE() pairs with the
         * smp_store_release() in __percpu_ref_switch_to_percpu().
         */
        unsigned long snapshot = READ_ONCE(ref->percpu_count_ptr);

        /*
         * Checking ATOMIC alone would work in theory, but DEAD can become
         * visible without ATOMIC when racing with percpu_ref_kill() and
         * would then have to be masked off separately.  DEAD implies
         * ATOMIC anyway, so both bits are tested in one go.
         */
        if (unlikely(snapshot & __PERCPU_REF_ATOMIC_DEAD))
                return false;

        *percpu_countp = (unsigned long __percpu *)snapshot;
        return true;
}

/**
 * percpu_ref_get_many - increment a percpu refcount
 * @ref: percpu_ref to get
 * @nr: number of references to get
 *
 * Analogous to atomic_long_add().
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
{
        unsigned long __percpu *percpu_count;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count))
                this_cpu_add(*percpu_count, nr);
        else
                atomic_long_add(nr, &ref->data->count);

        rcu_read_unlock();
}

/**
 * percpu_ref_get - increment a percpu refcount
 * @ref: percpu_ref to get
 *
 * Analogous to atomic_long_inc().  Shorthand for
 * percpu_ref_get_many(@ref, 1).
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_get(struct percpu_ref *ref)
{
        percpu_ref_get_many(ref, 1);
}

/**
 * percpu_ref_tryget_many - try to increment a percpu refcount
 * @ref: percpu_ref to try-get
 * @nr: number of references to get
 *
 * Increment a percpu refcount  by @nr unless its count already reached zero.
 * Returns %true on success; %false on failure.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget_many(struct percpu_ref *ref,
                                          unsigned long nr)
{
        unsigned long __percpu *percpu_count;
        bool ret;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count)) {
                this_cpu_add(*percpu_count, nr);
                ret = true;
        } else {
                ret = atomic_long_add_unless(&ref->data->count, nr, 0);
        }

        rcu_read_unlock();

        return ret;
}

/**
 * percpu_ref_tryget - try to increment a percpu refcount
 * @ref: percpu_ref to try-get
 *
 * Increment a percpu refcount unless its count already reached zero.
 * Returns %true on success; %false on failure.  Shorthand for
 * percpu_ref_tryget_many(@ref, 1).
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget(struct percpu_ref *ref)
{
        return percpu_ref_tryget_many(ref, 1);
}

/**
 * percpu_ref_tryget_live_rcu - same as percpu_ref_tryget_live() but the
 * caller is responsible for taking RCU.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget_live_rcu(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count;
        bool ret = false;

        WARN_ON_ONCE(!rcu_read_lock_held());

        if (likely(__ref_is_percpu(ref, &percpu_count))) {
                this_cpu_inc(*percpu_count);
                ret = true;
        } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
                ret = atomic_long_inc_not_zero(&ref->data->count);
        }
        return ret;
}

/**
 * percpu_ref_tryget_live - try to increment a live percpu refcount
 * @ref: percpu_ref to try-get
 *
 * Increment a percpu refcount unless it has already been killed.  Returns
 * %true on success; %false on failure.
 *
 * Completion of percpu_ref_kill() in itself doesn't guarantee that this
 * function will fail.  For such guarantee, percpu_ref_kill_and_confirm()
 * should be used.  After the confirm_kill callback is invoked, it's
 * guaranteed that no new reference will be given out by
 * percpu_ref_tryget_live().
 *
 * Wraps percpu_ref_tryget_live_rcu() in an RCU read-side critical section.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
{
        bool live;

        rcu_read_lock();
        live = percpu_ref_tryget_live_rcu(ref);
        rcu_read_unlock();

        return live;
}

/**
 * percpu_ref_put_many - decrement a percpu refcount
 * @ref: percpu_ref to put
 * @nr: number of references to put
 *
 * Decrement the refcount, and if 0, call the release function (which was passed
 * to percpu_ref_init())
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
{
        unsigned long __percpu *percpu_count;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count))
                this_cpu_sub(*percpu_count, nr);
        else if (unlikely(atomic_long_sub_and_test(nr, &ref->data->count)))
                ref->data->release(ref);

        rcu_read_unlock();
}

/**
 * percpu_ref_put - decrement a percpu refcount
 * @ref: percpu_ref to put
 *
 * Decrement the refcount, and if 0, call the release function (which was passed
 * to percpu_ref_init()).  Shorthand for percpu_ref_put_many(@ref, 1).
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_put(struct percpu_ref *ref)
{
        percpu_ref_put_many(ref, 1);
}

/**
 * percpu_ref_is_dying - test whether a percpu refcount is dying or dead
 * @ref: percpu_ref to test
 *
 * Returns %true if @ref is dying or dead, i.e. if __PERCPU_REF_DEAD is set
 * in @ref->percpu_count_ptr.
 *
 * This function is safe to call as long as @ref is between init and exit
 * and the caller is responsible for synchronizing against state changes.
 */
static inline bool percpu_ref_is_dying(struct percpu_ref *ref)
{
        return (ref->percpu_count_ptr & __PERCPU_REF_DEAD) != 0;
}

#endif






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    6 



























































































    2 














































































































































































    1 
    1 





















    5 




























































































































































































































































































































































































































































































































































    1 





























































































































































































































































































































































































































































































































































































































































































































    4 










    3 




































    1 
    3 









    4 












    4 











    4 










































































































































































































































































































    5 





















































































































































































































































































































































































































































    3 












































































    5 























































































    1 






    4 




























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the Interfaces handler.
 *
 * Version:        @(#)dev.h        1.0.10        08/12/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Corey Minyard <wf-rch!minyard@relay.EU.net>
 *                Donald J. Becker, <becker@cesdis.gsfc.nasa.gov>
 *                Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *                Bjorn Ekwall. <bj0rn@blox.se>
 *              Pekka Riikonen <priikone@poseidon.pspt.fi>
 *
 *                Moved to /usr/include/linux for NET3
 */
#ifndef _LINUX_NETDEVICE_H
#define _LINUX_NETDEVICE_H

#include <linux/timer.h>
#include <linux/bug.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/prefetch.h>
#include <asm/cache.h>
#include <asm/byteorder.h>
#include <asm/local.h>

#include <linux/percpu.h>
#include <linux/rculist.h>
#include <linux/workqueue.h>
#include <linux/dynamic_queue_limits.h>

#include <net/net_namespace.h>
#ifdef CONFIG_DCB
#include <net/dcbnl.h>
#endif
#include <net/netprio_cgroup.h>

#include <linux/netdev_features.h>
#include <linux/neighbour.h>
#include <linux/netdevice_xmit.h>
#include <uapi/linux/netdevice.h>
#include <uapi/linux/if_bonding.h>
#include <uapi/linux/pkt_cls.h>
#include <uapi/linux/netdev.h>
#include <linux/hashtable.h>
#include <linux/rbtree.h>
#include <net/net_trackers.h>
#include <net/net_debug.h>
#include <net/dropreason-core.h>

struct netpoll_info;
struct device;
struct ethtool_ops;
struct kernel_hwtstamp_config;
struct phy_device;
struct dsa_port;
struct ip_tunnel_parm_kern;
struct macsec_context;
struct macsec_ops;
struct netdev_name_node;
struct sd_flow_limit;
struct sfp_bus;
/* 802.11 specific */
struct wireless_dev;
/* 802.15.4 specific */
struct wpan_dev;
struct mpls_dev;
/* UDP Tunnel offloads */
struct udp_tunnel_info;
struct udp_tunnel_nic_info;
struct udp_tunnel_nic;
struct bpf_prog;
struct xdp_buff;
struct xdp_frame;
struct xdp_metadata_ops;
struct xdp_md;

typedef u32 xdp_features_t;

void synchronize_net(void);
void netdev_set_default_ethtool_ops(struct net_device *dev,
                                    const struct ethtool_ops *ops);
void netdev_sw_irq_coalesce_default_on(struct net_device *dev);

/* Backlog congestion levels */
#define NET_RX_SUCCESS                0        /* keep 'em coming, baby */
#define NET_RX_DROP                1        /* packet dropped */

#define MAX_NEST_DEV 8

/*
 * Transmit return codes: transmit return codes originate from three different
 * namespaces:
 *
 * - qdisc return codes
 * - driver transmit return codes
 * - errno values
 *
 * Drivers are allowed to return any one of those in their hard_start_xmit()
 * function. Real network devices commonly used with qdiscs should only return
 * the driver transmit return codes though - when qdiscs are used, the actual
 * transmission happens asynchronously, so the value is not propagated to
 * higher layers. Virtual network devices transmit synchronously; in this case
 * the driver transmit return codes are consumed by dev_queue_xmit(), and all
 * others are propagated to higher layers.
 */

/* qdisc ->enqueue() return codes. */
#define NET_XMIT_SUCCESS        0x00
#define NET_XMIT_DROP                0x01        /* skb dropped                        */
#define NET_XMIT_CN                0x02        /* congestion notification        */
#define NET_XMIT_MASK                0x0f        /* qdisc flags in net/sch_generic.h */

/* NET_XMIT_CN is special. It does not guarantee that this packet is lost. It
 * indicates that the device will soon be dropping packets, or already drops
 * some packets of the same priority; prompting us to send less aggressively. */
#define net_xmit_eval(e)        ((e) == NET_XMIT_CN ? 0 : (e))
#define net_xmit_errno(e)        ((e) != NET_XMIT_CN ? -ENOBUFS : 0)

/* Driver transmit return codes */
#define NETDEV_TX_MASK                0xf0

/*
 * Return codes for a driver's transmit routine. The only non-zero code,
 * NETDEV_TX_BUSY, lies inside NETDEV_TX_MASK (0xf0) and therefore above the
 * qdisc codes covered by NET_XMIT_MASK (0x0f).
 */
enum netdev_tx {
        __NETDEV_TX_MIN         = INT_MIN,        /* make sure enum is signed */
        NETDEV_TX_OK         = 0x00,        /* driver took care of packet */
        NETDEV_TX_BUSY         = 0x10,        /* driver tx path was busy*/
};
/* Convenience alias used as the return type of transmit routines. */
typedef enum netdev_tx netdev_tx_t;

/*
 * Current order: NETDEV_TX_MASK > NET_XMIT_MASK >= 0 is significant;
 * hard_start_xmit() return < NET_XMIT_MASK means skb was consumed.
 */
static inline bool dev_xmit_complete(int rc)
{
        /*
         * Every value below NET_XMIT_MASK means the driver consumed the skb:
         * - rc == NETDEV_TX_OK: the packet was transmitted
         * - rc < 0: an error occurred while transmitting
         * - rc & NET_XMIT_MASK: an error occurred while queueing to a
         *   different device
         * This is expected to be the common outcome, hence likely().
         */
        return likely(rc < NET_XMIT_MASK);
}

/*
 *        Compute the worst-case header length according to the protocols
 *        used.
 */

#if defined(CONFIG_HYPERV_NET)
# define LL_MAX_HEADER 128
#elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25)
# if defined(CONFIG_MAC80211_MESH)
#  define LL_MAX_HEADER 128
# else
#  define LL_MAX_HEADER 96
# endif
#else
# define LL_MAX_HEADER 32
#endif

#if !IS_ENABLED(CONFIG_NET_IPIP) && !IS_ENABLED(CONFIG_NET_IPGRE) && \
    !IS_ENABLED(CONFIG_IPV6_SIT) && !IS_ENABLED(CONFIG_IPV6_TUNNEL)
#define MAX_HEADER LL_MAX_HEADER
#else
#define MAX_HEADER (LL_MAX_HEADER + 48)
#endif

/*
 *        Old network device statistics. Fields are native words
 *        (unsigned long) so they can be read and written atomically.
 */

/*
 * Declare one counter as an anonymous union: a plain unsigned long named
 * FIELD overlaid with an atomic_long_t named __FIELD, so the same storage
 * can be reached either as a plain word or through the atomic type.
 */
#define NET_DEV_STAT(FIELD)                        \
        union {                                        \
                unsigned long FIELD;                \
                atomic_long_t __##FIELD;        \
        }

/* Legacy per-device counters; every member is built with NET_DEV_STAT(). */
struct net_device_stats {
        NET_DEV_STAT(rx_packets);
        NET_DEV_STAT(tx_packets);
        NET_DEV_STAT(rx_bytes);
        NET_DEV_STAT(tx_bytes);
        NET_DEV_STAT(rx_errors);
        NET_DEV_STAT(tx_errors);
        NET_DEV_STAT(rx_dropped);
        NET_DEV_STAT(tx_dropped);
        NET_DEV_STAT(multicast);
        NET_DEV_STAT(collisions);
        NET_DEV_STAT(rx_length_errors);
        NET_DEV_STAT(rx_over_errors);
        NET_DEV_STAT(rx_crc_errors);
        NET_DEV_STAT(rx_frame_errors);
        NET_DEV_STAT(rx_fifo_errors);
        NET_DEV_STAT(rx_missed_errors);
        NET_DEV_STAT(tx_aborted_errors);
        NET_DEV_STAT(tx_carrier_errors);
        NET_DEV_STAT(tx_fifo_errors);
        NET_DEV_STAT(tx_heartbeat_errors);
        NET_DEV_STAT(tx_window_errors);
        NET_DEV_STAT(rx_compressed);
        NET_DEV_STAT(tx_compressed);
};
/* The helper is only needed to lay out the struct above. */
#undef NET_DEV_STAT

/* per-cpu stats, allocated on demand.
 * Try to fit them in a single cache line, for dev_get_stats() sake.
 *
 * The struct holds four unsigned long counters and is aligned to
 * 4 * sizeof(unsigned long), i.e. to the combined size of its members.
 */
struct net_device_core_stats {
        unsigned long        rx_dropped;
        unsigned long        tx_dropped;
        unsigned long        rx_nohandler;
        unsigned long        rx_otherhost_dropped;
} __aligned(4 * sizeof(unsigned long));

#include <linux/cache.h>
#include <linux/skbuff.h>

struct neighbour;
struct neigh_parms;
struct sk_buff;

/*
 * One hardware address entry. It carries both a list_head and an rb_node,
 * matching the list and the auxiliary tree of struct netdev_hw_addr_list.
 * NOTE(review): the exact meaning of sync_cnt, refcount and synced is not
 * visible in this header - confirm against the address-list code.
 */
struct netdev_hw_addr {
        struct list_head        list;
        struct rb_node                node;
        unsigned char                addr[MAX_ADDR_LEN];
        /* presumably one of the NETDEV_HW_ADDR_T_* values below - confirm */
        unsigned char                type;
#define NETDEV_HW_ADDR_T_LAN                1
#define NETDEV_HW_ADDR_T_SAN                2
#define NETDEV_HW_ADDR_T_UNICAST        3
#define NETDEV_HW_ADDR_T_MULTICAST        4
        bool                        global_use;
        /* tested by the netdev_for_each_synced_*_addr() iterators */
        int                        sync_cnt;
        int                        refcount;
        int                        synced;
        struct rcu_head                rcu_head;
};

/*
 * A collection of hardware addresses: a list for iteration, an element
 * count, and an rb-tree kept alongside for lookups.
 */
struct netdev_hw_addr_list {
        struct list_head        list;
        /* value reported by netdev_hw_addr_list_count() */
        int                        count;

        /* Auxiliary tree for faster lookup on addition and deletion */
        struct rb_root                tree;
};

/* Generic accessors for a struct netdev_hw_addr_list pointer @l. */
#define netdev_hw_addr_list_count(l) ((l)->count)
#define netdev_hw_addr_list_empty(l) (netdev_hw_addr_list_count(l) == 0)
#define netdev_hw_addr_list_for_each(ha, l) \
        list_for_each_entry(ha, &(l)->list, list)

/* Same helpers applied to the device's ->uc list. */
#define netdev_uc_count(dev) netdev_hw_addr_list_count(&(dev)->uc)
#define netdev_uc_empty(dev) netdev_hw_addr_list_empty(&(dev)->uc)
#define netdev_for_each_uc_addr(ha, dev) \
        netdev_hw_addr_list_for_each(ha, &(dev)->uc)
/*
 * Visit only entries whose sync_cnt is non-zero. The expansion ends in a
 * bare "if", so an "else" written right after the loop body would bind to
 * that hidden "if".
 */
#define netdev_for_each_synced_uc_addr(_ha, _dev) \
        netdev_for_each_uc_addr((_ha), (_dev)) \
                if ((_ha)->sync_cnt)

/* Same helpers applied to the device's ->mc list. */
#define netdev_mc_count(dev) netdev_hw_addr_list_count(&(dev)->mc)
#define netdev_mc_empty(dev) netdev_hw_addr_list_empty(&(dev)->mc)
#define netdev_for_each_mc_addr(ha, dev) \
        netdev_hw_addr_list_for_each(ha, &(dev)->mc)
/*
 * Visit only entries whose sync_cnt is non-zero. The expansion ends in a
 * bare "if", so an "else" written right after the loop body would bind to
 * that hidden "if".
 */
#define netdev_for_each_synced_mc_addr(_ha, _dev) \
        netdev_for_each_mc_addr((_ha), (_dev)) \
                if ((_ha)->sync_cnt)

/*
 * Cached hardware header. The data area is sized in units of HH_DATA_MOD
 * bytes so that it can hold a header of up to LL_MAX_HEADER bytes.
 */
struct hh_cache {
        unsigned int        hh_len;
        seqlock_t        hh_lock;

        /* cached hardware header; allow for machine alignment needs.        */
#define HH_DATA_MOD        16
/*
 * Number of padding bytes that bring __len up to the next multiple of
 * HH_DATA_MOD (0 when __len is already a multiple). The argument is
 * parenthesized so that expressions such as "a & b" or "c ? x : y" expand
 * correctly.
 */
#define HH_DATA_OFF(__len) \
        (HH_DATA_MOD - ((((__len) - 1) & (HH_DATA_MOD - 1)) + 1))
/* __len rounded up to a multiple of HH_DATA_MOD. */
#define HH_DATA_ALIGN(__len) \
        (((__len)+(HH_DATA_MOD-1))&~(HH_DATA_MOD - 1))
        /* HH_DATA_ALIGN(LL_MAX_HEADER) bytes, expressed as an array of longs */
        unsigned long        hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)];
};

/* Reserve HH_DATA_MOD byte-aligned hard_header_len, but at least that much.
 * Alternative is:
 *   dev->hard_header_len ? (dev->hard_header_len +
 *                           (HH_DATA_MOD - 1)) & ~(HH_DATA_MOD - 1) : 0
 *
 * We could use other alignment values, but we must maintain the
 * relationship HH alignment <= LL alignment.
 */
/*
 * Both macros add hard_header_len and needed_headroom (read via READ_ONCE),
 * plus @extra in the second form, round the sum down to a multiple of
 * HH_DATA_MOD and then add HH_DATA_MOD. The result is therefore a multiple
 * of HH_DATA_MOD that is strictly greater than the raw sum.
 */
#define LL_RESERVED_SPACE(dev) \
        ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom)) \
          & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD)
#define LL_RESERVED_SPACE_EXTRA(dev,extra) \
        ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom) + (extra)) \
          & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD)

/*
 * Table of callbacks that operate on a hardware (link-layer) header.
 * NOTE(review): the per-member notes below are inferred from names and
 * signatures only - confirm against the implementations.
 */
struct header_ops {
        /* presumably builds the header for @skb from @daddr/@saddr/@type */
        int        (*create) (struct sk_buff *skb, struct net_device *dev,
                           unsigned short type, const void *daddr,
                           const void *saddr, unsigned int len);
        /* presumably extracts a hardware address from @skb into @haddr */
        int        (*parse)(const struct sk_buff *skb, unsigned char *haddr);
        /* takes a neighbour and the hh_cache to work on */
        int        (*cache)(const struct neighbour *neigh, struct hh_cache *hh, __be16 type);
        void        (*cache_update)(struct hh_cache *hh,
                                const struct net_device *dev,
                                const unsigned char *haddr);
        /* takes a raw header buffer and its length; returns bool */
        bool        (*validate)(const char *ll_header, unsigned int len);
        /* returns a big-endian 16-bit protocol value for @skb */
        __be16        (*parse_protocol)(const struct sk_buff *skb);
};

/* These flag bits are private to the generic network queueing
 * layer; they may not be explicitly referenced by any other
 * code.
 */

/*
 * Link-state flag numbers, consecutive from 0.
 * NOTE(review): presumably used as bit indices into a device state word -
 * the consumer is not visible in this part of the file.
 */
enum netdev_state_t {
        __LINK_STATE_START,
        __LINK_STATE_PRESENT,
        __LINK_STATE_NOCARRIER,
        __LINK_STATE_LINKWATCH_PENDING,
        __LINK_STATE_DORMANT,
        __LINK_STATE_TESTING,
};

/*
 * A list head paired with an integer count. struct napi_struct keeps an
 * array of GRO_HASH_BUCKETS of these in its gro_hash[] member.
 */
struct gro_list {
        struct list_head        list;
        int                        count;
};

/*
 * Number of GRO hash buckets; must be less than the number of bits in
 * napi_struct::gro_bitmask.
 */
#define GRO_HASH_BUCKETS        8

/*
 * Structure for NAPI scheduling similar to tasklet but with weighting
 */
struct napi_struct {
        /* The poll_list must only be managed by the entity which
         * changes the state of the NAPI_STATE_SCHED bit.  This means
         * whoever atomically sets that bit can add this napi_struct
         * to the per-CPU poll_list, and whoever clears that bit
         * can remove from the list right before clearing the bit.
         */
        struct list_head        poll_list;

        /* bit field indexed by the NAPI_STATE_* bit numbers */
        unsigned long                state;
        int                        weight;
        int                        defer_hard_irqs_count;
        /* presumably one bit per gro_hash[] bucket - confirm */
        unsigned long                gro_bitmask;
        /* poll callback: receives this NAPI context and an int, returns int */
        int                        (*poll)(struct napi_struct *, int);
#ifdef CONFIG_NETPOLL
        /* CPU actively polling if netpoll is configured */
        int                        poll_owner;
#endif
        /* CPU on which NAPI has been scheduled for processing */
        int                        list_owner;
        struct net_device        *dev;
        /* GRO_HASH_BUCKETS lists, each with its own element count */
        struct gro_list                gro_hash[GRO_HASH_BUCKETS];
        struct sk_buff                *skb;
        struct list_head        rx_list; /* Pending GRO_NORMAL skbs */
        int                        rx_count; /* length of rx_list */
        unsigned int                napi_id;
        struct hrtimer                timer;
        struct task_struct        *thread;
        /* control-path-only fields follow */
        struct list_head        dev_list;
        struct hlist_node        napi_hash_node;
        int                        irq;
};

/*
 * Bit numbers within napi_struct::state; used with test_bit() by the
 * helpers further down in this header.
 */
enum {
        NAPI_STATE_SCHED,                /* Poll is scheduled */
        NAPI_STATE_MISSED,                /* reschedule a napi */
        NAPI_STATE_DISABLE,                /* Disable pending */
        NAPI_STATE_NPSVC,                /* Netpoll - don't dequeue from poll_list */
        NAPI_STATE_LISTED,                /* NAPI added to system lists */
        NAPI_STATE_NO_BUSY_POLL,        /* Do not add in napi_hash, no busy polling */
        NAPI_STATE_IN_BUSY_POLL,        /* sk_busy_loop() owns this NAPI */
        NAPI_STATE_PREFER_BUSY_POLL,        /* prefer busy-polling over softirq processing*/
        NAPI_STATE_THREADED,                /* The poll is performed inside its own thread*/
        NAPI_STATE_SCHED_THREADED,        /* Napi is currently scheduled in threaded mode */
};

/*
 * Mask form (BIT(n)) of each NAPI_STATE_* bit number above, for code that
 * works on a whole copy of the state word rather than on single bits.
 */
enum {
        NAPIF_STATE_SCHED                = BIT(NAPI_STATE_SCHED),
        NAPIF_STATE_MISSED                = BIT(NAPI_STATE_MISSED),
        NAPIF_STATE_DISABLE                = BIT(NAPI_STATE_DISABLE),
        NAPIF_STATE_NPSVC                = BIT(NAPI_STATE_NPSVC),
        NAPIF_STATE_LISTED                = BIT(NAPI_STATE_LISTED),
        NAPIF_STATE_NO_BUSY_POLL        = BIT(NAPI_STATE_NO_BUSY_POLL),
        NAPIF_STATE_IN_BUSY_POLL        = BIT(NAPI_STATE_IN_BUSY_POLL),
        NAPIF_STATE_PREFER_BUSY_POLL        = BIT(NAPI_STATE_PREFER_BUSY_POLL),
        NAPIF_STATE_THREADED                = BIT(NAPI_STATE_THREADED),
        NAPIF_STATE_SCHED_THREADED        = BIT(NAPI_STATE_SCHED_THREADED),
};

/*
 * GRO result codes, also available through the gro_result_t typedef.
 * NOTE(review): the meaning of each code is defined by the GRO code, which
 * is not part of this header excerpt - confirm there.
 */
enum gro_result {
        GRO_MERGED,
        GRO_MERGED_FREE,
        GRO_HELD,
        GRO_NORMAL,
        GRO_CONSUMED,
};
typedef enum gro_result gro_result_t;

/*
 * enum rx_handler_result - Possible return values for rx_handlers.
 * @RX_HANDLER_CONSUMED: skb was consumed by rx_handler, do not process it
 * further.
 * @RX_HANDLER_ANOTHER: Do another round in receive path. This is indicated in
 * case skb->dev was changed by rx_handler.
 * @RX_HANDLER_EXACT: Force exact delivery, no wildcard.
 * @RX_HANDLER_PASS: Do nothing, pass the skb as if no rx_handler was called.
 *
 * rx_handlers are functions called from inside __netif_receive_skb(), to do
 * special processing of the skb, prior to delivery to protocol handlers.
 *
 * Currently, a net_device can only have a single rx_handler registered. Trying
 * to register a second rx_handler will return -EBUSY.
 *
 * To register a rx_handler on a net_device, use netdev_rx_handler_register().
 * To unregister a rx_handler on a net_device, use
 * netdev_rx_handler_unregister().
 *
 * Upon return, rx_handler is expected to tell __netif_receive_skb() what to
 * do with the skb.
 *
 * If the rx_handler consumed the skb in some way, it should return
 * RX_HANDLER_CONSUMED. This is appropriate when the rx_handler arranged for
 * the skb to be delivered in some other way.
 *
 * If the rx_handler changed skb->dev, to divert the skb to another
 * net_device, it should return RX_HANDLER_ANOTHER. The rx_handler for the
 * new device will be called if it exists.
 *
 * If the rx_handler decides the skb should be ignored, it should return
 * RX_HANDLER_EXACT. The skb will only be delivered to protocol handlers that
 * are registered on exact device (ptype->dev == skb->dev).
 *
 * If the rx_handler didn't change skb->dev, but wants the skb to be normally
 * delivered, it should return RX_HANDLER_PASS.
 *
 * A device without a registered rx_handler will behave as if rx_handler
 * returned RX_HANDLER_PASS.
 */

/* Verdicts an rx_handler can return. */
enum rx_handler_result {
        RX_HANDLER_CONSUMED,
        RX_HANDLER_ANOTHER,
        RX_HANDLER_EXACT,
        RX_HANDLER_PASS,
};
typedef enum rx_handler_result rx_handler_result_t;
/*
 * Signature of an rx_handler: it is given a pointer to the skb pointer and
 * returns one of the verdicts above.
 */
typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);

void __napi_schedule(struct napi_struct *n);
void __napi_schedule_irqoff(struct napi_struct *n);

/**
 * napi_disable_pending - test the NAPI_STATE_DISABLE bit
 * @n: NAPI context
 *
 * Return: true if NAPI_STATE_DISABLE is set in @n->state.
 */
static inline bool napi_disable_pending(struct napi_struct *n)
{
        const unsigned long *state_word = &n->state;

        return test_bit(NAPI_STATE_DISABLE, state_word);
}

/**
 * napi_prefer_busy_poll - test the NAPI_STATE_PREFER_BUSY_POLL bit
 * @n: NAPI context
 *
 * Return: true if NAPI_STATE_PREFER_BUSY_POLL is set in @n->state.
 */
static inline bool napi_prefer_busy_poll(struct napi_struct *n)
{
        const unsigned long *state_word = &n->state;

        return test_bit(NAPI_STATE_PREFER_BUSY_POLL, state_word);
}

/**
 * napi_is_scheduled - test if NAPI is scheduled
 * @n: NAPI context
 *
 * Best-effort check only: no lock is taken, so the NAPI may be scheduled
 * or may finish immediately after the bit is sampled, and the answer can
 * already be stale when the caller acts on it.
 *
 * NAPI_STATE_SCHED is internal state. In normal use call napi_schedule()
 * rather than testing the bit through this helper.
 *
 * Use it only when a driver really needs to know whether a NAPI is
 * scheduled, for example in a delayed timer whose work can be skipped
 * when the NAPI is already scheduled.
 *
 * Return: true if NAPI_STATE_SCHED is set in @n->state, false otherwise.
 */
static inline bool napi_is_scheduled(struct napi_struct *n)
{
        const unsigned long *state_word = &n->state;

        return test_bit(NAPI_STATE_SCHED, state_word);
}

bool napi_schedule_prep(struct napi_struct *n);

/**
 *        napi_schedule - schedule NAPI poll
 *        @n: NAPI context
 *
 * Ask for the NAPI poll routine to be run unless it is already running.
 * __napi_schedule() is called only when napi_schedule_prep() agrees;
 * see napi_schedule_prep() for the reasons it may refuse.
 *
 * Return: true if the NAPI was scheduled by this call, false otherwise.
 */
static inline bool napi_schedule(struct napi_struct *n)
{
        if (!napi_schedule_prep(n))
                return false;

        __napi_schedule(n);
        return true;
}

/**
 *        napi_schedule_irqoff - schedule NAPI poll
 *        @n: NAPI context
 *
 * Same as napi_schedule() but for callers that already have hard irqs
 * masked; no result is reported back.
 */
static inline void napi_schedule_irqoff(struct napi_struct *n)
{
        if (!napi_schedule_prep(n))
                return;

        __napi_schedule_irqoff(n);
}

/**
 * napi_complete_done - NAPI processing complete
 * @n: NAPI context
 * @work_done: number of packets processed
 *
 * Mark NAPI processing as complete. Should only be called if poll budget
 * has not been completely consumed.
 * Prefer over napi_complete().
 * Return false if device should avoid rearming interrupts.
 */
bool napi_complete_done(struct napi_struct *n, int work_done);

/**
 * napi_complete - NAPI processing complete, reporting no work done
 * @n: NAPI context
 *
 * Shorthand for napi_complete_done(@n, 0).
 *
 * Return: whatever napi_complete_done() returns.
 */
static inline bool napi_complete(struct napi_struct *n)
{
        return napi_complete_done(n, 0);
}

int dev_set_threaded(struct net_device *dev, bool threaded);

/**
 *        napi_disable - prevent NAPI from scheduling
 *        @n: NAPI context
 *
 * Stop NAPI from being scheduled on this context.
 * Waits till any outstanding processing completes.
 */
void napi_disable(struct napi_struct *n);

void napi_enable(struct napi_struct *n);

/**
 *        napi_synchronize - wait until NAPI is not running
 *        @n: NAPI context
 *
 * With CONFIG_SMP enabled, sleep in 1 ms steps for as long as
 * NAPI_STATE_SCHED is set in @n->state, i.e. until outstanding
 * processing has finished. Future activations are not prevented.
 * Without CONFIG_SMP only barrier() is issued.
 */
static inline void napi_synchronize(const struct napi_struct *n)
{
        if (!IS_ENABLED(CONFIG_SMP)) {
                barrier();
                return;
        }

        while (test_bit(NAPI_STATE_SCHED, &n->state))
                msleep(1);
}

/**
 *        napi_if_scheduled_mark_missed - if napi is running, set the
 *        NAPIF_STATE_MISSED
 *        @n: NAPI context
 *
 * If NAPIF_STATE_SCHED is set, also set NAPIF_STATE_MISSED and return
 * true. If NAPIF_STATE_DISABLE is set, return true without touching the
 * state. Return false only when the NAPI is neither being disabled nor
 * scheduled.
 **/
static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n)
{
        unsigned long val, new;

        val = READ_ONCE(n->state);
        do {
                /* disable pending: report true, leave the state as it is */
                if (val & NAPIF_STATE_DISABLE)
                        return true;

                /* not scheduled: nothing to mark */
                if (!(val & NAPIF_STATE_SCHED))
                        return false;

                new = val | NAPIF_STATE_MISSED;
                /*
                 * The loop never re-reads n->state itself; it relies on
                 * try_cmpxchg() refreshing val when the exchange fails.
                 */
        } while (!try_cmpxchg(&n->state, &val, new));

        return true;
}

/* Bit numbers of the transmit-queue state flags. */
enum netdev_queue_state_t {
        __QUEUE_STATE_DRV_XOFF,
        __QUEUE_STATE_STACK_XOFF,
        __QUEUE_STATE_FROZEN,
};

/* Single-bit masks built from the bit numbers above. */
#define QUEUE_STATE_DRV_XOFF        (1 << __QUEUE_STATE_DRV_XOFF)
#define QUEUE_STATE_STACK_XOFF        (1 << __QUEUE_STATE_STACK_XOFF)
#define QUEUE_STATE_FROZEN        (1 << __QUEUE_STATE_FROZEN)

/* Combined masks: either XOFF bit, optionally together with FROZEN. */
#define QUEUE_STATE_ANY_XOFF        (QUEUE_STATE_DRV_XOFF | QUEUE_STATE_STACK_XOFF)
#define QUEUE_STATE_ANY_XOFF_OR_FROZEN (QUEUE_STATE_ANY_XOFF | \
                                        QUEUE_STATE_FROZEN)
#define QUEUE_STATE_DRV_XOFF_OR_FROZEN (QUEUE_STATE_DRV_XOFF | \
                                        QUEUE_STATE_FROZEN)

/*
 * __QUEUE_STATE_DRV_XOFF is used by drivers to stop the transmit queue.  The
 * netif_tx_* functions below are used to manipulate this flag.  The
 * __QUEUE_STATE_STACK_XOFF flag is used by the stack to stop the transmit
 * queue independently.  The netif_xmit_*stopped functions below are called
 * to check if the queue has been stopped by the driver or stack (either
 * of the XOFF bits are set in the state).  Drivers should not need to call
 * netif_xmit*stopped functions, they should only be using netif_tx_*.
 */

/*
 * One transmit queue of a net_device. Members are grouped into a
 * read-mostly part and a write-mostly part; the write-mostly part starts
 * on its own cache line on SMP (see _xmit_lock).
 */
struct netdev_queue {
/*
 * read-mostly part
 */
        struct net_device        *dev;
        netdevice_tracker        dev_tracker;

        struct Qdisc __rcu        *qdisc;
        struct Qdisc __rcu        *qdisc_sleeping;
#ifdef CONFIG_SYSFS
        struct kobject                kobj;
#endif
#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
        /* accessed through netdev_queue_numa_node_read()/_write() */
        int                        numa_node;
#endif
        unsigned long                tx_maxrate;
        /*
         * Number of TX timeouts for this queue
         * (/sys/class/net/DEV/Q/trans_timeout)
         */
        atomic_long_t                trans_timeout;

        /* Subordinate device that the queue has been assigned to */
        struct net_device        *sb_dev;
#ifdef CONFIG_XDP_SOCKETS
        struct xsk_buff_pool    *pool;
#endif
        /* NAPI instance for the queue
         * Readers and writers must hold RTNL
         */
        struct napi_struct      *napi;
/*
 * write-mostly part
 */
        spinlock_t                _xmit_lock ____cacheline_aligned_in_smp;
        int                        xmit_lock_owner;
        /*
         * Time (in jiffies) of last Tx
         */
        unsigned long                trans_start;

        /* holds the __QUEUE_STATE_* bits described above this struct */
        unsigned long                state;

#ifdef CONFIG_BQL
        struct dql                dql;
#endif
} ____cacheline_aligned_in_smp;

/*
 * Integer settings defined elsewhere; read with READ_ONCE() by
 * net_has_fallback_tunnels() and net_inherit_devconf() below.
 */
extern int sysctl_fb_tunnels_only_for_init_net;
extern int sysctl_devconf_inherit_init_net;

/*
 * sysctl_fb_tunnels_only_for_init_net == 0 : For all netns
 *                                     == 1 : For initns only
 *                                     == 2 : For none.
 */
static inline bool net_has_fallback_tunnels(const struct net *net)
{
#if IS_ENABLED(CONFIG_SYSCTL)
        int fb_tunnels_only_for_init_net = READ_ONCE(sysctl_fb_tunnels_only_for_init_net);

        return !fb_tunnels_only_for_init_net ||
                (net_eq(net, &init_net) && fb_tunnels_only_for_init_net == 1);
#else
        return true;
#endif
}

/*
 * Return the current value of sysctl_devconf_inherit_init_net, or 0 when
 * the kernel is built without CONFIG_SYSCTL.
 */
static inline int net_inherit_devconf(void)
{
        int inherit = 0;

#if IS_ENABLED(CONFIG_SYSCTL)
        inherit = READ_ONCE(sysctl_devconf_inherit_init_net);
#endif
        return inherit;
}

/*
 * Return the NUMA node stored in @q.  When CONFIG_XPS and CONFIG_NUMA are
 * not both enabled the queue has no numa_node field and NUMA_NO_NODE is
 * returned instead.
 */
static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
{
        int node = NUMA_NO_NODE;

#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
        node = q->numa_node;
#endif
        return node;
}

/*
 * Store @node as the NUMA node of @q.  This is a no-op unless both
 * CONFIG_XPS and CONFIG_NUMA are enabled.
 */
static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node)
{
#if !defined(CONFIG_XPS) || !defined(CONFIG_NUMA)
        /* struct netdev_queue has no numa_node field in this configuration. */
#else
        q->numa_node = node;
#endif
}

#ifdef CONFIG_RFS_ACCEL
/*
 * Declared only with CONFIG_RFS_ACCEL; the definition lives elsewhere.
 * Per the ndo_rx_flow_steer() documentation in this file, @flow_id is the
 * flow ID that was handed to ndo_rx_flow_steer() and the filter ID is what
 * that hook returned on success.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
                         u16 filter_id);
#endif

/*
 * XPS map type and offset of the xps map within net_device->xps_maps[].
 *
 * XPS_CPUS is pinned to 0; the remaining enumerators take consecutive
 * values, so XPS_MAPS_MAX equals the number of map types.
 */
enum xps_map_type {
        XPS_CPUS = 0,
        XPS_RXQS,
        XPS_MAPS_MAX,
};

#ifdef CONFIG_XPS
/*
 * This structure holds an XPS map which can be of variable length.  The
 * map is an array of queues.
 *
 * The queue indices live in the trailing flexible array member queues[].
 * NOTE(review): @len and @alloc_len presumably hold the number of used and
 * allocated entries of queues[] respectively - confirm against the users.
 */
struct xps_map {
        unsigned int len;
        unsigned int alloc_len;
        struct rcu_head rcu;
        u16 queues[];
};
/* Size in bytes of a struct xps_map followed by @_num u16 queue entries. */
#define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + ((_num) * sizeof(u16)))
/*
 * Number of u16 entries that fit between sizeof(struct xps_map) and the
 * L1_CACHE_ALIGN()-rounded offset of the end of the first queue entry.
 */
#define XPS_MIN_MAP_ALLOC ((L1_CACHE_ALIGN(offsetof(struct xps_map, queues[1])) \
       - sizeof(struct xps_map)) / sizeof(u16))

/*
 * This structure holds all XPS maps for a device.  Maps are indexed by CPU
 * or by RX queue, depending on the map type.
 *
 * We keep track of the number of cpus/rxqs used when the struct is allocated,
 * in nr_ids. This helps to avoid out-of-bounds memory accesses.
 *
 * We keep track of the number of traffic classes used when the struct is
 * allocated, in num_tc. This will be used to navigate the maps, to ensure we're
 * not crossing its upper bound, as the original dev->num_tc can be updated in
 * the meantime.
 */
struct xps_dev_maps {
        struct rcu_head rcu;
        /* Number of cpus/rxqs in use when this structure was allocated */
        unsigned int nr_ids;
        /* Number of traffic classes in use when this structure was allocated */
        s16 num_tc;
        /* Flexible array of __rcu map pointers; see the size macros below */
        struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */
};

/*
 * Size in bytes of a struct xps_dev_maps followed by
 * nr_cpu_ids * @_tcs struct xps_map pointers.
 */
#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +        \
        (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))

/*
 * Size in bytes of a struct xps_dev_maps followed by
 * @_rxqs * @_tcs struct xps_map pointers.
 *
 * Both arguments are parenthesized in the expansion so that callers may
 * pass arbitrary expressions (e.g. "a + b") without operator-precedence
 * surprises.
 */
#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
        ((_rxqs) * (_tcs) * sizeof(struct xps_map *)))

#endif /* CONFIG_XPS */

/*
 * TC_BITMASK is TC_MAX_QUEUE - 1.
 * NOTE(review): presumably used to mask a value into the range
 * [0, TC_MAX_QUEUE) - confirm against the users.
 */
#define TC_MAX_QUEUE        16
#define TC_BITMASK        15
/*
 * HW offloaded queuing disciplines txq count and offset maps.
 * NOTE(review): presumably one entry describes the TX queue range
 * [offset, offset + count) of a traffic class - confirm.
 */
struct netdev_tc_txq {
        u16 count;
        u16 offset;
};

#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
/*
 * This structure is to hold information about the device
 * configured to run FCoE protocol stack.
 *
 * It is filled in through ndo_fcoe_get_hbainfo().  Every member is a
 * fixed-size char buffer (64 or 256 bytes).
 * NOTE(review): whether the strings must be NUL-terminated is not visible
 * here - confirm with the consumers.
 */
struct netdev_fcoe_hbainfo {
        char        manufacturer[64];
        char        serial_number[64];
        char        hardware_version[64];
        char        driver_version[64];
        char        optionrom_version[64];
        char        firmware_version[64];
        char        model[256];
        char        model_description[256];
};
#endif

/* Capacity in bytes of netdev_phys_item_id::id. */
#define MAX_PHYS_ITEM_ID_LEN 32

/* This structure holds a unique identifier to identify some
 * physical item (port for example) used by a netdevice.
 *
 * @id:     identifier bytes
 * @id_len: number of leading bytes of @id that are compared by
 *          netdev_phys_item_id_same()
 */
struct netdev_phys_item_id {
        unsigned char id[MAX_PHYS_ITEM_ID_LEN];
        unsigned char id_len;
};

/*
 * Return true when @a and @b have the same id_len and their first id_len
 * bytes of id are identical, false otherwise.
 *
 * Neither argument is modified, so both are taken as pointers to const;
 * callers holding non-const pointers are unaffected.
 *
 * NOTE(review): id_len is not checked against MAX_PHYS_ITEM_ID_LEN here;
 * callers are assumed to keep it within the buffer.
 */
static inline bool netdev_phys_item_id_same(const struct netdev_phys_item_id *a,
                                            const struct netdev_phys_item_id *b)
{
        return a->id_len == b->id_len &&
               memcmp(a->id, b->id, a->id_len) == 0;
}

/*
 * Function pointer type taking the same (dev, skb, sb_dev) arguments as
 * ndo_select_queue() and returning a u16.
 */
typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
                                       struct sk_buff *skb,
                                       struct net_device *sb_dev);

/*
 * Discriminator stored in net_device_path::type.  DEV_PATH_ETHERNET is
 * pinned to 0 and the other enumerators follow consecutively.
 */
enum net_device_path_type {
        DEV_PATH_ETHERNET = 0,
        DEV_PATH_VLAN,
        DEV_PATH_BRIDGE,
        DEV_PATH_PPPOE,
        DEV_PATH_DSA,
        DEV_PATH_MTK_WDMA,
};

/*
 * One element of a forwarding path, as filled in by
 * ndo_fill_forward_path().
 *
 * @type says what kind of hop this is; the anonymous union carries the
 * per-type details.
 * NOTE(review): which union member is valid for which @type is inferred
 * from the member names (encap/bridge/dsa/mtk_wdma) - confirm.
 */
struct net_device_path {
        enum net_device_path_type        type;
        const struct net_device                *dev;
        union {
                /* Encapsulation details: id, protocol, destination address */
                struct {
                        u16                id;
                        __be16                proto;
                        u8                h_dest[ETH_ALEN];
                } encap;
                /* Bridge hop: VLAN handling mode plus VLAN id/protocol */
                struct {
                        enum {
                                DEV_PATH_BR_VLAN_KEEP,
                                DEV_PATH_BR_VLAN_TAG,
                                DEV_PATH_BR_VLAN_UNTAG,
                                DEV_PATH_BR_VLAN_UNTAG_HW,
                        }                vlan_mode;
                        u16                vlan_id;
                        __be16                vlan_proto;
                } bridge;
                /* DSA hop: port number and protocol */
                struct {
                        int port;
                        u16 proto;
                } dsa;
                /* MTK WDMA hop (field semantics not visible here) */
                struct {
                        u8 wdma_idx;
                        u8 queue;
                        u16 wcid;
                        u8 bss;
                        u8 amsdu;
                } mtk_wdma;
        };
};

/* Capacity of net_device_path_stack::path[] */
#define NET_DEVICE_PATH_STACK_MAX        5
/* Capacity of net_device_path_ctx::vlan[] */
#define NET_DEVICE_PATH_VLAN_MAX        2

/*
 * Fixed-capacity array of path elements.
 * NOTE(review): @num_paths presumably counts the used entries of @path -
 * confirm.
 */
struct net_device_path_stack {
        int                        num_paths;
        struct net_device_path        path[NET_DEVICE_PATH_STACK_MAX];
};

/*
 * Context handed to ndo_fill_forward_path(): a device, an ETH_ALEN-byte
 * address and up to NET_DEVICE_PATH_VLAN_MAX VLAN (id, proto) pairs.
 * NOTE(review): @num_vlans presumably counts the used entries of @vlan -
 * confirm.
 */
struct net_device_path_ctx {
        const struct net_device *dev;
        u8                        daddr[ETH_ALEN];

        int                        num_vlans;
        struct {
                u16                id;
                __be16                proto;
        } vlan[NET_DEVICE_PATH_VLAN_MAX];
};

/*
 * Type argument of ndo_setup_tc(); it accompanies the opaque type_data
 * pointer passed to that hook.
 *
 * No enumerator has an explicit value, so the numeric values follow the
 * declaration order starting at 0.
 */
enum tc_setup_type {
        TC_QUERY_CAPS,
        TC_SETUP_QDISC_MQPRIO,
        TC_SETUP_CLSU32,
        TC_SETUP_CLSFLOWER,
        TC_SETUP_CLSMATCHALL,
        TC_SETUP_CLSBPF,
        TC_SETUP_BLOCK,
        TC_SETUP_QDISC_CBS,
        TC_SETUP_QDISC_RED,
        TC_SETUP_QDISC_PRIO,
        TC_SETUP_QDISC_MQ,
        TC_SETUP_QDISC_ETF,
        TC_SETUP_ROOT_QDISC,
        TC_SETUP_QDISC_GRED,
        TC_SETUP_QDISC_TAPRIO,
        TC_SETUP_FT,
        TC_SETUP_QDISC_ETS,
        TC_SETUP_QDISC_TBF,
        TC_SETUP_QDISC_FIFO,
        TC_SETUP_QDISC_HTB,
        TC_SETUP_ACT,
};

/* These structures hold the attributes of bpf state that are being passed
 * to the netdevice through the bpf op.
 *
 * The command is stored in netdev_bpf::command.
 */
enum bpf_netdev_command {
        /* Set or clear a bpf program used in the earliest stages of packet
         * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee
         * is responsible for calling bpf_prog_put on any old progs that are
         * stored. In case of error, the callee need not release the new prog
         * reference, but on success it takes ownership and must bpf_prog_put
         * when it is no longer used.
         */
        XDP_SETUP_PROG,
        XDP_SETUP_PROG_HW,
        /* BPF program for offload callbacks, invoked at program load time.
         * NOTE(review): this sentence sits above the map alloc/free
         * commands and may be stale - confirm.
         */
        BPF_OFFLOAD_MAP_ALLOC,
        BPF_OFFLOAD_MAP_FREE,
        XDP_SETUP_XSK_POOL,
};

/* Forward declarations of types that are defined elsewhere. */
struct bpf_prog_offload_ops;
struct netlink_ext_ack;
struct xdp_umem;
struct xdp_dev_bulk_queue;
struct bpf_xdp_link;

/*
 * XDP attachment modes, explicitly numbered 0..2.  __MAX_XDP_MODE follows
 * the last mode and therefore equals the number of modes (3).
 */
enum bpf_xdp_mode {
        XDP_MODE_SKB = 0,
        XDP_MODE_DRV = 1,
        XDP_MODE_HW = 2,
        __MAX_XDP_MODE
};

/*
 * A bpf program pointer together with a bpf_xdp_link pointer.
 * NOTE(review): presumably one such entity is kept per bpf_xdp_mode -
 * confirm.
 */
struct bpf_xdp_entity {
        struct bpf_prog *prog;
        struct bpf_xdp_link *link;
};

/*
 * Argument block passed to ndo_bpf().  @command selects which member of
 * the union is meaningful, as noted on each member.
 * NOTE(review): XDP_SETUP_PROG_HW is not named on any member; it
 * presumably shares the XDP_SETUP_PROG fields - confirm.
 */
struct netdev_bpf {
        enum bpf_netdev_command command;
        union {
                /* XDP_SETUP_PROG */
                struct {
                        u32 flags;
                        struct bpf_prog *prog;
                        struct netlink_ext_ack *extack;
                };
                /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */
                struct {
                        struct bpf_offloaded_map *offmap;
                };
                /* XDP_SETUP_XSK_POOL */
                struct {
                        struct xsk_buff_pool *pool;
                        u16 queue_id;
                } xsk;
        };
};

/* Flags for ndo_xsk_wakeup: request wakeup of RX, TX, or both (OR-ed). */
#define XDP_WAKEUP_RX (1 << 0)
#define XDP_WAKEUP_TX (1 << 1)

#ifdef CONFIG_XFRM_OFFLOAD
/*
 * Table of xfrm offload callbacks, built only with CONFIG_XFRM_OFFLOAD.
 * The hooks fall into two groups: xdo_dev_state_* / xdo_dev_offload_ok
 * operate on a struct xfrm_state, xdo_dev_policy_* on a struct xfrm_policy.
 * The add hooks return int and take a netlink_ext_ack pointer; the
 * remaining hooks return void, except xdo_dev_offload_ok which returns
 * bool.
 */
struct xfrmdev_ops {
        int        (*xdo_dev_state_add) (struct xfrm_state *x, struct netlink_ext_ack *extack);
        void        (*xdo_dev_state_delete) (struct xfrm_state *x);
        void        (*xdo_dev_state_free) (struct xfrm_state *x);
        bool        (*xdo_dev_offload_ok) (struct sk_buff *skb,
                                       struct xfrm_state *x);
        void        (*xdo_dev_state_advance_esn) (struct xfrm_state *x);
        void        (*xdo_dev_state_update_stats) (struct xfrm_state *x);
        int        (*xdo_dev_policy_add) (struct xfrm_policy *x, struct netlink_ext_ack *extack);
        void        (*xdo_dev_policy_delete) (struct xfrm_policy *x);
        void        (*xdo_dev_policy_free) (struct xfrm_policy *x);
};
#endif

/*
 * Variable-length interface alias: an rcu_head followed by the alias
 * bytes in a flexible array member.
 * NOTE(review): @rcuhead is presumably used for RCU-deferred freeing -
 * confirm.
 */
struct dev_ifalias {
        struct rcu_head rcuhead;
        char ifalias[];
};

/* Forward declarations of types that are defined elsewhere. */
struct devlink;
struct tlsdev_ops;

/* List node that refers to a notifier_block. */
struct netdev_net_notifier {
        struct list_head list;
        struct notifier_block *nb;
};

/*
 * This structure defines the management hooks for network devices.
 * The following hooks can be defined; unless noted otherwise, they are
 * optional and can be filled with a null pointer.
 *
 * int (*ndo_init)(struct net_device *dev);
 *     This function is called once when a network device is registered.
 *     The network device can use this for any late stage initialization
 *     or semantic validation. It can fail with an error code which will
 *     be propagated back to register_netdev.
 *
 * void (*ndo_uninit)(struct net_device *dev);
 *     This function is called when device is unregistered or when registration
 *     fails. It is not called if init fails.
 *
 * int (*ndo_open)(struct net_device *dev);
 *     This function is called when a network device transitions to the up
 *     state.
 *
 * int (*ndo_stop)(struct net_device *dev);
 *     This function is called when a network device transitions to the down
 *     state.
 *
 * netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb,
 *                               struct net_device *dev);
 *        Called when a packet needs to be transmitted.
 *        Returns NETDEV_TX_OK.  Can return NETDEV_TX_BUSY, but you should stop
 *        the queue before that can happen; it's for obsolete devices and weird
 *        corner cases, but the stack really does a non-trivial amount
 *        of useless work if you return NETDEV_TX_BUSY.
 *        Required; cannot be NULL.
 *
 * netdev_features_t (*ndo_features_check)(struct sk_buff *skb,
 *                                           struct net_device *dev,
 *                                           netdev_features_t features);
 *        Called by core transmit path to determine if device is capable of
 *        performing offload operations on a given packet. This is to give
 *        the device an opportunity to implement any restrictions that cannot
 *        be otherwise expressed by feature flags. The check is called with
 *        the set of features that the stack has calculated and it returns
 *        those the driver believes to be appropriate.
 *
 * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
 *                         struct net_device *sb_dev);
 *        Called to decide which queue to use when device supports multiple
 *        transmit queues.
 *
 * void (*ndo_change_rx_flags)(struct net_device *dev, int flags);
 *        This function is called to allow device receiver to make
 *        changes to configuration when multicast or promiscuous is enabled.
 *
 * void (*ndo_set_rx_mode)(struct net_device *dev);
 *        This function is called when the device changes address list filtering.
 *        If driver handles unicast address filtering, it should set
 *        IFF_UNICAST_FLT in its priv_flags.
 *
 * int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
 *        This function is called when the Media Access Control address
 *        needs to be changed. If this interface is not defined, the
 *        MAC address can not be changed.
 *
 * int (*ndo_validate_addr)(struct net_device *dev);
 *        Test if Media Access Control address is valid for the device.
 *
 * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
 *        Old-style ioctl entry point. This is used internally by the
 *        appletalk and ieee802154 subsystems but is no longer called by
 *        the device ioctl handler.
 *
 * int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd);
 *        Used by the bonding driver for its device specific ioctls:
 *        SIOCBONDENSLAVE, SIOCBONDRELEASE, SIOCBONDSETHWADDR, SIOCBONDCHANGEACTIVE,
 *        SIOCBONDSLAVEINFOQUERY, and SIOCBONDINFOQUERY
 *
 * int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
 *        Called for ethernet specific ioctls: SIOCGMIIPHY, SIOCGMIIREG,
 *        SIOCSMIIREG, SIOCSHWTSTAMP and SIOCGHWTSTAMP.
 *
 * int (*ndo_set_config)(struct net_device *dev, struct ifmap *map);
 *        Used to set network devices bus interface parameters. This interface
 *        is retained for legacy reasons; new devices should use the bus
 *        interface (PCI) for low level management.
 *
 * int (*ndo_change_mtu)(struct net_device *dev, int new_mtu);
 *        Called when a user wants to change the Maximum Transfer Unit
 *        of a device.
 *
 * void (*ndo_tx_timeout)(struct net_device *dev, unsigned int txqueue);
 *        Callback used when the transmitter has not made any progress
 *        for dev->watchdog ticks.
 *
 * void (*ndo_get_stats64)(struct net_device *dev,
 *                         struct rtnl_link_stats64 *storage);
 * struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
 *        Called when a user wants to get the network device usage
 *        statistics. Drivers must do one of the following:
 *        1. Define @ndo_get_stats64 to fill in a zero-initialised
 *           rtnl_link_stats64 structure passed by the caller.
 *        2. Define @ndo_get_stats to update a net_device_stats structure
 *           (which should normally be dev->stats) and return a pointer to
 *           it. The structure may be changed asynchronously only if each
 *           field is written atomically.
 *        3. Update dev->stats asynchronously and atomically, and define
 *           neither operation.
 *
 * bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id)
 *        Return true if this device supports offload stats of this attr_id.
 *
 * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev,
 *        void *attr_data)
 *        Get statistics for offload operations by attr_id. Write it into the
 *        attr_data pointer.
 *
 * int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid);
 *        If device supports VLAN filtering this function is called when a
 *        VLAN id is registered.
 *
 * int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid);
 *        If device supports VLAN filtering this function is called when a
 *        VLAN id is unregistered.
 *
 * void (*ndo_poll_controller)(struct net_device *dev);
 *
 *        SR-IOV management functions.
 * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
 * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan,
 *                          u8 qos, __be16 proto);
 * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate,
 *                          int max_tx_rate);
 * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_get_vf_config)(struct net_device *dev,
 *                            int vf, struct ifla_vf_info *ivf);
 * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state);
 * int (*ndo_set_vf_port)(struct net_device *dev, int vf,
 *                          struct nlattr *port[]);
 *
 *      Enable or disable the VF ability to query its RSS Redirection Table and
 *      Hash Key. This is needed since on some devices the VF shares this
 *      information with the PF and querying it may introduce a theoretical
 *      security risk.
 * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
 * int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type,
 *                       void *type_data);
 *        Called to setup any 'tc' scheduler, classifier or action on @dev.
 *        This is always called from the stack with the rtnl lock held and netif
 *        tx queues stopped. This allows the netdevice to perform queue
 *        management safely.
 *
 *        Fiber Channel over Ethernet (FCoE) offload functions.
 * int (*ndo_fcoe_enable)(struct net_device *dev);
 *        Called when the FCoE protocol stack wants to start using LLD for FCoE
 *        so the underlying device can perform whatever needed configuration or
 *        initialization to support acceleration of FCoE traffic.
 *
 * int (*ndo_fcoe_disable)(struct net_device *dev);
 *        Called when the FCoE protocol stack wants to stop using LLD for FCoE
 *        so the underlying device can perform whatever needed clean-ups to
 *        stop supporting acceleration of FCoE traffic.
 *
 * int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid,
 *                             struct scatterlist *sgl, unsigned int sgc);
 *        Called when the FCoE Initiator wants to initialize an I/O that
 *        is a possible candidate for Direct Data Placement (DDP). The LLD can
 *        perform necessary setup and returns 1 to indicate the device is set up
 *        successfully to perform DDP on this I/O, otherwise this returns 0.
 *
 * int (*ndo_fcoe_ddp_done)(struct net_device *dev,  u16 xid);
 *        Called when the FCoE Initiator/Target is done with the DDPed I/O as
 *        indicated by the FC exchange id 'xid', so the underlying device can
 *        clean up and reuse resources for later DDP requests.
 *
 * int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid,
 *                              struct scatterlist *sgl, unsigned int sgc);
 *        Called when the FCoE Target wants to initialize an I/O that
 *        is a possible candidate for Direct Data Placement (DDP). The LLD can
 *        perform necessary setup and returns 1 to indicate the device is set up
 *        successfully to perform DDP on this I/O, otherwise this returns 0.
 *
 * int (*ndo_fcoe_get_hbainfo)(struct net_device *dev,
 *                               struct netdev_fcoe_hbainfo *hbainfo);
 *        Called when the FCoE Protocol stack wants information on the underlying
 *        device. This information is utilized by the FCoE protocol stack to
 *        register attributes with Fiber Channel management service as per the
 *        FC-GS Fabric Device Management Information(FDMI) specification.
 *
 * int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type);
 *        Called when the underlying device wants to override default World Wide
 *        Name (WWN) generation mechanism in FCoE protocol stack to pass its own
 *        World Wide Port Name (WWPN) or World Wide Node Name (WWNN) to the FCoE
 *        protocol stack to use.
 *
 *        RFS acceleration.
 * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb,
 *                            u16 rxq_index, u32 flow_id);
 *        Set hardware filter for RFS.  rxq_index is the target queue index;
 *        flow_id is a flow ID to be passed to rps_may_expire_flow() later.
 *        Return the filter ID on success, or a negative error code.
 *
 *        Slave management functions (for bridge, bonding, etc).
 * int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev);
 *        Called to make another netdev an underling.
 *
 * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev);
 *        Called to release previously enslaved netdev.
 *
 * struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev,
 *                                            struct sk_buff *skb,
 *                                            bool all_slaves);
 *        Get the xmit slave of the master device. If all_slaves is true, the
 *        function assumes that all the slaves can transmit.
 *
 *      Feature/offload setting functions.
 * netdev_features_t (*ndo_fix_features)(struct net_device *dev,
 *                netdev_features_t features);
 *        Adjusts the requested feature flags according to device-specific
 *        constraints, and returns the resulting flags. Must not modify
 *        the device state.
 *
 * int (*ndo_set_features)(struct net_device *dev, netdev_features_t features);
 *        Called to update device configuration to new features. Passed
 *        feature set might be less than what was returned by ndo_fix_features().
 *        Must return >0 or -errno if it changed dev->features itself.
 *
 * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[],
 *                      struct net_device *dev,
 *                      const unsigned char *addr, u16 vid, u16 flags,
 *                      struct netlink_ext_ack *extack);
 *        Adds an FDB entry to dev for addr.
 * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[],
 *                      struct net_device *dev,
 *                      const unsigned char *addr, u16 vid)
 *        Deletes the FDB entry from dev corresponding to addr.
 * int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev,
 *                           struct netlink_ext_ack *extack);
 * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb,
 *                       struct net_device *dev, struct net_device *filter_dev,
 *                       int *idx)
 *        Used to add FDB entries to dump requests. Implementers should add
 *        entries to skb and update idx with the number of entries.
 *
 * int (*ndo_mdb_add)(struct net_device *dev, struct nlattr *tb[],
 *                      u16 nlmsg_flags, struct netlink_ext_ack *extack);
 *        Adds an MDB entry to dev.
 * int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[],
 *                      struct netlink_ext_ack *extack);
 *        Deletes the MDB entry from dev.
 * int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[],
 *                           struct netlink_ext_ack *extack);
 *        Bulk deletes MDB entries from dev.
 * int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb,
 *                       struct netlink_callback *cb);
 *        Dumps MDB entries from dev. The first argument (marker) in the netlink
 *        callback is used by core rtnetlink code.
 *
 * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh,
 *                             u16 flags, struct netlink_ext_ack *extack)
 * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq,
 *                             struct net_device *dev, u32 filter_mask,
 *                             int nlflags)
 * int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh,
 *                             u16 flags);
 *
 * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
 *        Called to change device carrier. Soft-devices (like dummy, team, etc)
 *        which do not represent real hardware may define this to allow their
 *        userspace components to manage their virtual carrier state. Devices
 *        that determine carrier state from physical hardware properties (eg
 *        network cables) or protocol-dependent mechanisms (eg
 *        USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
 *
 * int (*ndo_get_phys_port_id)(struct net_device *dev,
 *                               struct netdev_phys_item_id *ppid);
 *        Called to get ID of physical port of this device. If driver does
 *        not implement this, it is assumed that the hw is not able to have
 *        multiple net devices on single physical port.
 *
 * int (*ndo_get_port_parent_id)(struct net_device *dev,
 *                                 struct netdev_phys_item_id *ppid)
 *        Called to get the parent ID of the physical port of this device.
 *
 * void* (*ndo_dfwd_add_station)(struct net_device *pdev,
 *                                 struct net_device *dev)
 *        Called by upper layer devices to accelerate switching or other
 *        station functionality into hardware. 'pdev' is the lowerdev
 *        to use for the offload and 'dev' is the net device that will
 *        back the offload. Returns a pointer to the private structure
 *        the upper layer will maintain.
 * void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv)
 *        Called by upper layer device to delete the station created
 *        by 'ndo_dfwd_add_station'. 'pdev' is the net device backing
 *        the station and priv is the structure returned by the add
 *        operation.
 * int (*ndo_set_tx_maxrate)(struct net_device *dev,
 *                             int queue_index, u32 maxrate);
 *        Called when a user wants to set a max-rate limitation of specific
 *        TX queue.
 * int (*ndo_get_iflink)(const struct net_device *dev);
 *        Called to get the iflink value of this device.
 * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb);
 *        This function is used to get egress tunnel information for given skb.
 *        This is useful for retrieving outer tunnel header parameters while
 *        sampling packet.
 * void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom);
 *        This function is used to specify the headroom that the skb must
 *        consider when allocating an skb during packet reception. Setting
 *        appropriate rx headroom value allows avoiding skb head copy on
 *        forward. Setting a negative value resets the rx headroom to the
 *        default value.
 * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf);
 *        This function is used to set or query state related to XDP on the
 *        netdevice and manage BPF offload. See definition of
 *        enum bpf_netdev_command for details.
 * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp,
 *                        u32 flags);
 *        This function is used to submit @n XDP packets for transmit on a
 *        netdevice. Returns the number of frames successfully transmitted;
 *        frames that got dropped are freed/returned via xdp_return_frame().
 *        A negative return value means a general error invoking the ndo:
 *        no frames were xmit'ed and the core caller will free all frames.
 * struct net_device *(*ndo_xdp_get_xmit_slave)(struct net_device *dev,
 *                                                struct xdp_buff *xdp);
 *      Get the xmit slave of master device based on the xdp_buff.
 * int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags);
 *      This function is used to wake up the softirq, ksoftirqd or kthread
 *        responsible for sending and/or receiving packets on a specific
 *        queue id bound to an AF_XDP socket. The flags field specifies if
 *        only RX, only Tx, or both should be woken up using the flags
 *        XDP_WAKEUP_RX and XDP_WAKEUP_TX.
 * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm_kern *p,
 *                         int cmd);
 *        Add, change, delete or get information on an IPv4 tunnel.
 * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
 *        If a device is paired with a peer device, return the peer instance.
 *        The caller must be under RCU read context.
 * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path);
 *     Get the forwarding path to reach the real device from the HW destination address
 * ktime_t (*ndo_get_tstamp)(struct net_device *dev,
 *                             const struct skb_shared_hwtstamps *hwtstamps,
 *                             bool cycles);
 *        Get hardware timestamp based on normal/adjustable time or free running
 *        cycle counter. This function is required if physical clock supports a
 *        free running cycle counter.
 *
 * int (*ndo_hwtstamp_get)(struct net_device *dev,
 *                           struct kernel_hwtstamp_config *kernel_config);
 *        Get the currently configured hardware timestamping parameters for the
 *        NIC device.
 *
 * int (*ndo_hwtstamp_set)(struct net_device *dev,
 *                           struct kernel_hwtstamp_config *kernel_config,
 *                           struct netlink_ext_ack *extack);
 *        Change the hardware timestamping parameters for NIC device.
 */
struct net_device_ops {
        int                        (*ndo_init)(struct net_device *dev);
        void                        (*ndo_uninit)(struct net_device *dev);
        int                        (*ndo_open)(struct net_device *dev);
        int                        (*ndo_stop)(struct net_device *dev);
        netdev_tx_t                (*ndo_start_xmit)(struct sk_buff *skb,
                                                  struct net_device *dev);
        netdev_features_t        (*ndo_features_check)(struct sk_buff *skb,
                                                      struct net_device *dev,
                                                      netdev_features_t features);
        u16                        (*ndo_select_queue)(struct net_device *dev,
                                                    struct sk_buff *skb,
                                                    struct net_device *sb_dev);
        void                        (*ndo_change_rx_flags)(struct net_device *dev,
                                                       int flags);
        void                        (*ndo_set_rx_mode)(struct net_device *dev);
        int                        (*ndo_set_mac_address)(struct net_device *dev,
                                                       void *addr);
        int                        (*ndo_validate_addr)(struct net_device *dev);
        int                        (*ndo_do_ioctl)(struct net_device *dev,
                                                struct ifreq *ifr, int cmd);
        int                        (*ndo_eth_ioctl)(struct net_device *dev,
                                                 struct ifreq *ifr, int cmd);
        int                        (*ndo_siocbond)(struct net_device *dev,
                                                struct ifreq *ifr, int cmd);
        int                        (*ndo_siocwandev)(struct net_device *dev,
                                                  struct if_settings *ifs);
        int                        (*ndo_siocdevprivate)(struct net_device *dev,
                                                      struct ifreq *ifr,
                                                      void __user *data, int cmd);
        int                        (*ndo_set_config)(struct net_device *dev,
                                                  struct ifmap *map);
        int                        (*ndo_change_mtu)(struct net_device *dev,
                                                  int new_mtu);
        int                        (*ndo_neigh_setup)(struct net_device *dev,
                                                   struct neigh_parms *);
        void                        (*ndo_tx_timeout) (struct net_device *dev,
                                                   unsigned int txqueue);

        void                        (*ndo_get_stats64)(struct net_device *dev,
                                                   struct rtnl_link_stats64 *storage);
        bool                        (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id);
        int                        (*ndo_get_offload_stats)(int attr_id,
                                                         const struct net_device *dev,
                                                         void *attr_data);
        struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);

        int                        (*ndo_vlan_rx_add_vid)(struct net_device *dev,
                                                       __be16 proto, u16 vid);
        int                        (*ndo_vlan_rx_kill_vid)(struct net_device *dev,
                                                        __be16 proto, u16 vid);
#ifdef CONFIG_NET_POLL_CONTROLLER
        void                    (*ndo_poll_controller)(struct net_device *dev);
        int                        (*ndo_netpoll_setup)(struct net_device *dev,
                                                     struct netpoll_info *info);
        void                        (*ndo_netpoll_cleanup)(struct net_device *dev);
#endif
        int                        (*ndo_set_vf_mac)(struct net_device *dev,
                                                  int queue, u8 *mac);
        int                        (*ndo_set_vf_vlan)(struct net_device *dev,
                                                   int queue, u16 vlan,
                                                   u8 qos, __be16 proto);
        int                        (*ndo_set_vf_rate)(struct net_device *dev,
                                                   int vf, int min_tx_rate,
                                                   int max_tx_rate);
        int                        (*ndo_set_vf_spoofchk)(struct net_device *dev,
                                                       int vf, bool setting);
        int                        (*ndo_set_vf_trust)(struct net_device *dev,
                                                    int vf, bool setting);
        int                        (*ndo_get_vf_config)(struct net_device *dev,
                                                     int vf,
                                                     struct ifla_vf_info *ivf);
        int                        (*ndo_set_vf_link_state)(struct net_device *dev,
                                                         int vf, int link_state);
        int                        (*ndo_get_vf_stats)(struct net_device *dev,
                                                    int vf,
                                                    struct ifla_vf_stats
                                                    *vf_stats);
        int                        (*ndo_set_vf_port)(struct net_device *dev,
                                                   int vf,
                                                   struct nlattr *port[]);
        int                        (*ndo_get_vf_port)(struct net_device *dev,
                                                   int vf, struct sk_buff *skb);
        int                        (*ndo_get_vf_guid)(struct net_device *dev,
                                                   int vf,
                                                   struct ifla_vf_guid *node_guid,
                                                   struct ifla_vf_guid *port_guid);
        int                        (*ndo_set_vf_guid)(struct net_device *dev,
                                                   int vf, u64 guid,
                                                   int guid_type);
        int                        (*ndo_set_vf_rss_query_en)(
                                                   struct net_device *dev,
                                                   int vf, bool setting);
        int                        (*ndo_setup_tc)(struct net_device *dev,
                                                enum tc_setup_type type,
                                                void *type_data);
#if IS_ENABLED(CONFIG_FCOE)
        int                        (*ndo_fcoe_enable)(struct net_device *dev);
        int                        (*ndo_fcoe_disable)(struct net_device *dev);
        int                        (*ndo_fcoe_ddp_setup)(struct net_device *dev,
                                                      u16 xid,
                                                      struct scatterlist *sgl,
                                                      unsigned int sgc);
        int                        (*ndo_fcoe_ddp_done)(struct net_device *dev,
                                                     u16 xid);
        int                        (*ndo_fcoe_ddp_target)(struct net_device *dev,
                                                       u16 xid,
                                                       struct scatterlist *sgl,
                                                       unsigned int sgc);
        int                        (*ndo_fcoe_get_hbainfo)(struct net_device *dev,
                                                        struct netdev_fcoe_hbainfo *hbainfo);
#endif

#if IS_ENABLED(CONFIG_LIBFCOE)
#define NETDEV_FCOE_WWNN 0
#define NETDEV_FCOE_WWPN 1
        int                        (*ndo_fcoe_get_wwn)(struct net_device *dev,
                                                    u64 *wwn, int type);
#endif

#ifdef CONFIG_RFS_ACCEL
        int                        (*ndo_rx_flow_steer)(struct net_device *dev,
                                                     const struct sk_buff *skb,
                                                     u16 rxq_index,
                                                     u32 flow_id);
#endif
        int                        (*ndo_add_slave)(struct net_device *dev,
                                                 struct net_device *slave_dev,
                                                 struct netlink_ext_ack *extack);
        int                        (*ndo_del_slave)(struct net_device *dev,
                                                 struct net_device *slave_dev);
        struct net_device*        (*ndo_get_xmit_slave)(struct net_device *dev,
                                                      struct sk_buff *skb,
                                                      bool all_slaves);
        struct net_device*        (*ndo_sk_get_lower_dev)(struct net_device *dev,
                                                        struct sock *sk);
        netdev_features_t        (*ndo_fix_features)(struct net_device *dev,
                                                    netdev_features_t features);
        int                        (*ndo_set_features)(struct net_device *dev,
                                                    netdev_features_t features);
        int                        (*ndo_neigh_construct)(struct net_device *dev,
                                                       struct neighbour *n);
        void                        (*ndo_neigh_destroy)(struct net_device *dev,
                                                     struct neighbour *n);

        int                        (*ndo_fdb_add)(struct ndmsg *ndm,
                                               struct nlattr *tb[],
                                               struct net_device *dev,
                                               const unsigned char *addr,
                                               u16 vid,
                                               u16 flags,
                                               struct netlink_ext_ack *extack);
        int                        (*ndo_fdb_del)(struct ndmsg *ndm,
                                               struct nlattr *tb[],
                                               struct net_device *dev,
                                               const unsigned char *addr,
                                               u16 vid, struct netlink_ext_ack *extack);
        int                        (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh,
                                                    struct net_device *dev,
                                                    struct netlink_ext_ack *extack);
        int                        (*ndo_fdb_dump)(struct sk_buff *skb,
                                                struct netlink_callback *cb,
                                                struct net_device *dev,
                                                struct net_device *filter_dev,
                                                int *idx);
        int                        (*ndo_fdb_get)(struct sk_buff *skb,
                                               struct nlattr *tb[],
                                               struct net_device *dev,
                                               const unsigned char *addr,
                                               u16 vid, u32 portid, u32 seq,
                                               struct netlink_ext_ack *extack);
        int                        (*ndo_mdb_add)(struct net_device *dev,
                                               struct nlattr *tb[],
                                               u16 nlmsg_flags,
                                               struct netlink_ext_ack *extack);
        int                        (*ndo_mdb_del)(struct net_device *dev,
                                               struct nlattr *tb[],
                                               struct netlink_ext_ack *extack);
        int                        (*ndo_mdb_del_bulk)(struct net_device *dev,
                                                    struct nlattr *tb[],
                                                    struct netlink_ext_ack *extack);
        int                        (*ndo_mdb_dump)(struct net_device *dev,
                                                struct sk_buff *skb,
                                                struct netlink_callback *cb);
        int                        (*ndo_mdb_get)(struct net_device *dev,
                                               struct nlattr *tb[], u32 portid,
                                               u32 seq,
                                               struct netlink_ext_ack *extack);
        int                        (*ndo_bridge_setlink)(struct net_device *dev,
                                                      struct nlmsghdr *nlh,
                                                      u16 flags,
                                                      struct netlink_ext_ack *extack);
        int                        (*ndo_bridge_getlink)(struct sk_buff *skb,
                                                      u32 pid, u32 seq,
                                                      struct net_device *dev,
                                                      u32 filter_mask,
                                                      int nlflags);
        int                        (*ndo_bridge_dellink)(struct net_device *dev,
                                                      struct nlmsghdr *nlh,
                                                      u16 flags);
        int                        (*ndo_change_carrier)(struct net_device *dev,
                                                      bool new_carrier);
        int                        (*ndo_get_phys_port_id)(struct net_device *dev,
                                                        struct netdev_phys_item_id *ppid);
        int                        (*ndo_get_port_parent_id)(struct net_device *dev,
                                                          struct netdev_phys_item_id *ppid);
        int                        (*ndo_get_phys_port_name)(struct net_device *dev,
                                                          char *name, size_t len);
        void*                        (*ndo_dfwd_add_station)(struct net_device *pdev,
                                                        struct net_device *dev);
        void                        (*ndo_dfwd_del_station)(struct net_device *pdev,
                                                        void *priv);

        int                        (*ndo_set_tx_maxrate)(struct net_device *dev,
                                                      int queue_index,
                                                      u32 maxrate);
        int                        (*ndo_get_iflink)(const struct net_device *dev);
        int                        (*ndo_fill_metadata_dst)(struct net_device *dev,
                                                       struct sk_buff *skb);
        void                        (*ndo_set_rx_headroom)(struct net_device *dev,
                                                       int needed_headroom);
        int                        (*ndo_bpf)(struct net_device *dev,
                                           struct netdev_bpf *bpf);
        int                        (*ndo_xdp_xmit)(struct net_device *dev, int n,
                                                struct xdp_frame **xdp,
                                                u32 flags);
        struct net_device *        (*ndo_xdp_get_xmit_slave)(struct net_device *dev,
                                                          struct xdp_buff *xdp);
        int                        (*ndo_xsk_wakeup)(struct net_device *dev,
                                                  u32 queue_id, u32 flags);
        int                        (*ndo_tunnel_ctl)(struct net_device *dev,
                                                  struct ip_tunnel_parm_kern *p,
                                                  int cmd);
        struct net_device *        (*ndo_get_peer_dev)(struct net_device *dev);
        int                     (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx,
                                                         struct net_device_path *path);
        ktime_t                        (*ndo_get_tstamp)(struct net_device *dev,
                                                  const struct skb_shared_hwtstamps *hwtstamps,
                                                  bool cycles);
        int                        (*ndo_hwtstamp_get)(struct net_device *dev,
                                                    struct kernel_hwtstamp_config *kernel_config);
        int                        (*ndo_hwtstamp_set)(struct net_device *dev,
                                                    struct kernel_hwtstamp_config *kernel_config,
                                                    struct netlink_ext_ack *extack);
};

/**
 * enum netdev_priv_flags - &struct net_device priv_flags
 *
 * Private flags of &struct net_device. They are only set internally
 * by drivers and used in the kernel. Since they are invisible to
 * userspace, their order may change in any kernel release.
 *
 * You should have a pretty good reason to be extending these flags.
 *
 * @IFF_802_1Q_VLAN: 802.1Q VLAN device
 * @IFF_EBRIDGE: Ethernet bridging device
 * @IFF_BONDING: bonding master or slave
 * @IFF_ISATAP: ISATAP interface (RFC4214)
 * @IFF_WAN_HDLC: WAN HDLC device
 * @IFF_XMIT_DST_RELEASE: dev_hard_start_xmit() is allowed to
 *        release skb->dst
 * @IFF_DONT_BRIDGE: disallow bridging this ether dev
 * @IFF_DISABLE_NETPOLL: disable netpoll at run-time
 * @IFF_MACVLAN_PORT: device used as macvlan port
 * @IFF_BRIDGE_PORT: device used as bridge port
 * @IFF_OVS_DATAPATH: device used as Open vSwitch datapath port
 * @IFF_TX_SKB_SHARING: The interface supports sharing skbs on transmit
 * @IFF_UNICAST_FLT: Supports unicast filtering
 * @IFF_TEAM_PORT: device used as team port
 * @IFF_SUPP_NOFCS: device supports sending custom FCS
 * @IFF_LIVE_ADDR_CHANGE: device supports hardware address
 *        change when it's running
 * @IFF_MACVLAN: Macvlan device
 * @IFF_XMIT_DST_RELEASE_PERM: IFF_XMIT_DST_RELEASE not taking into account
 *        underlying stacked devices
 * @IFF_L3MDEV_MASTER: device is an L3 master device
 * @IFF_NO_QUEUE: device can run without qdisc attached
 * @IFF_OPENVSWITCH: device is an Open vSwitch master
 * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device
 * @IFF_TEAM: device is a team device
 * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured
 * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external
 *        entity (i.e. the master device for bridged veth)
 * @IFF_MACSEC: device is a MACsec device
 * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
 * @IFF_FAILOVER: device is a failover master device
 * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
 * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device
 * @IFF_NO_ADDRCONF: prevent ipv6 addrconf
 * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with
 *        skb_headlen(skb) == 0 (data starts from frag0)
 * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN
 * @IFF_SEE_ALL_HWTSTAMP_REQUESTS: device wants to see calls to
 *        ndo_hwtstamp_set() for all timestamp requests regardless of source,
 *        even if those aren't HWTSTAMP_SOURCE_NETDEV.
 */
enum netdev_priv_flags {
        /* Bits 0-29 are plain int shifts. */
        IFF_802_1Q_VLAN               = 1 << 0,
        IFF_EBRIDGE                   = 1 << 1,
        IFF_BONDING                   = 1 << 2,
        IFF_ISATAP                    = 1 << 3,
        IFF_WAN_HDLC                  = 1 << 4,
        IFF_XMIT_DST_RELEASE          = 1 << 5,
        IFF_DONT_BRIDGE               = 1 << 6,
        IFF_DISABLE_NETPOLL           = 1 << 7,
        IFF_MACVLAN_PORT              = 1 << 8,
        IFF_BRIDGE_PORT               = 1 << 9,
        IFF_OVS_DATAPATH              = 1 << 10,
        IFF_TX_SKB_SHARING            = 1 << 11,
        IFF_UNICAST_FLT               = 1 << 12,
        IFF_TEAM_PORT                 = 1 << 13,
        IFF_SUPP_NOFCS                = 1 << 14,
        IFF_LIVE_ADDR_CHANGE          = 1 << 15,
        IFF_MACVLAN                   = 1 << 16,
        IFF_XMIT_DST_RELEASE_PERM     = 1 << 17,
        IFF_L3MDEV_MASTER             = 1 << 18,
        IFF_NO_QUEUE                  = 1 << 19,
        IFF_OPENVSWITCH               = 1 << 20,
        IFF_L3MDEV_SLAVE              = 1 << 21,
        IFF_TEAM                      = 1 << 22,
        IFF_RXFH_CONFIGURED           = 1 << 23,
        IFF_PHONY_HEADROOM            = 1 << 24,
        IFF_MACSEC                    = 1 << 25,
        IFF_NO_RX_HANDLER             = 1 << 26,
        IFF_FAILOVER                  = 1 << 27,
        IFF_FAILOVER_SLAVE            = 1 << 28,
        IFF_L3MDEV_RX_HANDLER         = 1 << 29,
        /* Bits 30 and up use BIT_ULL(); bits 32+ do not fit in an int. */
        IFF_NO_ADDRCONF               = BIT_ULL(30),
        IFF_TX_SKB_NO_LINEAR          = BIT_ULL(31),
        IFF_CHANGE_PROTO_DOWN         = BIT_ULL(32),
        IFF_SEE_ALL_HWTSTAMP_REQUESTS = BIT_ULL(33),
};

/*
 * Every enum netdev_priv_flags enumerator is also defined as a macro that
 * expands to itself, which makes each flag detectable with #ifdef/defined().
 * The list follows the enum order and covers all enumerators, so detection
 * works the same way for every flag.
 */
#define IFF_802_1Q_VLAN                        IFF_802_1Q_VLAN
#define IFF_EBRIDGE                        IFF_EBRIDGE
#define IFF_BONDING                        IFF_BONDING
#define IFF_ISATAP                        IFF_ISATAP
#define IFF_WAN_HDLC                        IFF_WAN_HDLC
#define IFF_XMIT_DST_RELEASE                IFF_XMIT_DST_RELEASE
#define IFF_DONT_BRIDGE                        IFF_DONT_BRIDGE
#define IFF_DISABLE_NETPOLL                IFF_DISABLE_NETPOLL
#define IFF_MACVLAN_PORT                IFF_MACVLAN_PORT
#define IFF_BRIDGE_PORT                        IFF_BRIDGE_PORT
#define IFF_OVS_DATAPATH                IFF_OVS_DATAPATH
#define IFF_TX_SKB_SHARING                IFF_TX_SKB_SHARING
#define IFF_UNICAST_FLT                        IFF_UNICAST_FLT
#define IFF_TEAM_PORT                        IFF_TEAM_PORT
#define IFF_SUPP_NOFCS                        IFF_SUPP_NOFCS
#define IFF_LIVE_ADDR_CHANGE                IFF_LIVE_ADDR_CHANGE
#define IFF_MACVLAN                        IFF_MACVLAN
#define IFF_XMIT_DST_RELEASE_PERM        IFF_XMIT_DST_RELEASE_PERM
#define IFF_L3MDEV_MASTER                IFF_L3MDEV_MASTER
#define IFF_NO_QUEUE                        IFF_NO_QUEUE
#define IFF_OPENVSWITCH                        IFF_OPENVSWITCH
#define IFF_L3MDEV_SLAVE                IFF_L3MDEV_SLAVE
#define IFF_TEAM                        IFF_TEAM
#define IFF_RXFH_CONFIGURED                IFF_RXFH_CONFIGURED
#define IFF_PHONY_HEADROOM                IFF_PHONY_HEADROOM
#define IFF_MACSEC                        IFF_MACSEC
#define IFF_NO_RX_HANDLER                IFF_NO_RX_HANDLER
#define IFF_FAILOVER                        IFF_FAILOVER
#define IFF_FAILOVER_SLAVE                IFF_FAILOVER_SLAVE
#define IFF_L3MDEV_RX_HANDLER                IFF_L3MDEV_RX_HANDLER
#define IFF_NO_ADDRCONF                        IFF_NO_ADDRCONF
#define IFF_TX_SKB_NO_LINEAR                IFF_TX_SKB_NO_LINEAR
#define IFF_CHANGE_PROTO_DOWN                IFF_CHANGE_PROTO_DOWN
#define IFF_SEE_ALL_HWTSTAMP_REQUESTS        IFF_SEE_ALL_HWTSTAMP_REQUESTS

/*
 * Type tag for the struct net_device::ml_priv pointer: it specifies what
 * type of data that pointer refers to.
 */
enum netdev_ml_priv_type {
        ML_PRIV_NONE = 0,
        ML_PRIV_CAN  = 1,
};

/*
 * Kinds of net_device statistics, with the structure each value refers to:
 *
 *   NETDEV_PCPU_STAT_NONE   - no structure named
 *   NETDEV_PCPU_STAT_LSTATS - struct pcpu_lstats
 *   NETDEV_PCPU_STAT_TSTATS - struct pcpu_sw_netstats
 *   NETDEV_PCPU_STAT_DSTATS - struct pcpu_dstats
 */
enum netdev_stat_type {
        NETDEV_PCPU_STAT_NONE   = 0,
        NETDEV_PCPU_STAT_LSTATS = 1,
        NETDEV_PCPU_STAT_TSTATS = 2,
        NETDEV_PCPU_STAT_DSTATS = 3,
};

/*
 * Registration states of a net_device:
 *
 *   NETREG_UNINITIALIZED - initial value (0)
 *   NETREG_REGISTERED    - completed register_netdevice
 *   NETREG_UNREGISTERING - called unregister_netdevice
 *   NETREG_UNREGISTERED  - completed unregister todo
 *   NETREG_RELEASED      - called free_netdev
 *   NETREG_DUMMY         - dummy device for NAPI poll
 */
enum netdev_reg_state {
        NETREG_UNINITIALIZED = 0,
        NETREG_REGISTERED    = 1,
        NETREG_UNREGISTERING = 2,
        NETREG_UNREGISTERED  = 3,
        NETREG_RELEASED      = 4,
        NETREG_DUMMY         = 5,
};

/**
 *        struct net_device - The DEVICE structure.
 *
 *        Actually, this whole structure is a big mistake.  It mixes I/O
 *        data with strictly "high-level" data, and it has to know about
 *        almost every data structure used in the INET module.
 *
 *        @name:        This is the first field of the "visible" part of this structure
 *                (i.e. as seen by users in the "Space.c" file).  It is the name
 *                of the interface.
 *
 *        @name_node:        Name hashlist node
 *        @ifalias:        SNMP alias
 *        @mem_end:        Shared memory end
 *        @mem_start:        Shared memory start
 *        @base_addr:        Device I/O address
 *        @irq:                Device IRQ number
 *
 *        @state:                Generic network queuing layer state, see netdev_state_t
 *        @dev_list:        The global list of network devices
 *        @napi_list:        List entry used for polling NAPI devices
 *        @unreg_list:        List entry  when we are unregistering the
 *                        device; see the function unregister_netdev
 *        @close_list:        List entry used when we are closing the device
 *        @ptype_all:     Device-specific packet handlers for all protocols
 *        @ptype_specific: Device-specific, protocol-specific packet handlers
 *
 *        @adj_list:        Directly linked devices, like slaves for bonding
 *        @features:        Currently active device features
 *        @hw_features:        User-changeable features
 *
 *        @wanted_features:        User-requested features
 *        @vlan_features:                Mask of features inheritable by VLAN devices
 *
 *        @hw_enc_features:        Mask of features inherited by encapsulating devices
 *                                This field indicates what encapsulation
 *                                offloads the hardware is capable of doing,
 *                                and drivers will need to set them appropriately.
 *
 *        @mpls_features:        Mask of features inheritable by MPLS
 *        @gso_partial_features: value(s) from NETIF_F_GSO\*
 *
 *        @ifindex:        interface index
 *        @group:                The group the device belongs to
 *
 *        @stats:                Statistics struct, which was left as a legacy, use
 *                        rtnl_link_stats64 instead
 *
 *        @core_stats:        core networking counters,
 *                        do not use this in drivers
 *        @carrier_up_count:        Number of times the carrier has been up
 *        @carrier_down_count:        Number of times the carrier has been down
 *
 *        @wireless_handlers:        List of functions to handle Wireless Extensions,
 *                                instead of ioctl,
 *                                see <net/iw_handler.h> for details.
 *        @wireless_data:        Instance data managed by the core of wireless extensions
 *
 *        @netdev_ops:        Includes several pointers to callbacks,
 *                        if one wants to override the ndo_*() functions
 *        @xdp_metadata_ops:        Includes pointers to XDP metadata callbacks.
 *        @xsk_tx_metadata_ops:        Includes pointers to AF_XDP TX metadata callbacks.
 *        @ethtool_ops:        Management operations
 *        @l3mdev_ops:        Layer 3 master device operations
 *        @ndisc_ops:        Includes callbacks for different IPv6 neighbour
 *                        discovery handling. Necessary for e.g. 6LoWPAN.
 *        @xfrmdev_ops:        Transformation offload operations
 *        @tlsdev_ops:        Transport Layer Security offload operations
 *        @header_ops:        Includes callbacks for creating, parsing, caching, etc.
 *                        of Layer 2 headers.
 *
 *        @flags:                Interface flags (a la BSD)
 *        @xdp_features:        XDP capability supported by the device
 *        @priv_flags:        Like 'flags' but invisible to userspace,
 *                        see enum netdev_priv_flags for the definitions
 *        @gflags:        Global flags ( kept as legacy )
 *        @padded:        How much padding added by alloc_netdev()
 *        @operstate:        RFC2863 operstate
 *        @link_mode:        Mapping policy to operstate
 *        @if_port:        Selectable AUI, TP, ...
 *        @dma:                DMA channel
 *        @mtu:                Interface MTU value
 *        @min_mtu:        Interface Minimum MTU value
 *        @max_mtu:        Interface Maximum MTU value
 *        @type:                Interface hardware type
 *        @hard_header_len: Maximum hardware header length.
 *        @min_header_len:  Minimum hardware header length
 *
 *        @needed_headroom: Extra headroom the hardware may need, but not in all
 *                          cases can this be guaranteed
 *        @needed_tailroom: Extra tailroom the hardware may need, but not in all
 *                          cases can this be guaranteed. Some cases also use
 *                          LL_MAX_HEADER instead to allocate the skb
 *
 *        interface address info:
 *
 *         @perm_addr:                Permanent hw address
 *         @addr_assign_type:        Hw address assignment type
 *         @addr_len:                Hardware address length
 *        @upper_level:                Maximum depth level of upper devices.
 *        @lower_level:                Maximum depth level of lower devices.
 *        @neigh_priv_len:        Used in neigh_alloc()
 *         @dev_id:                Used to differentiate devices that share
 *                                 the same link layer address
 *         @dev_port:                Used to differentiate devices that share
 *                                 the same function
 *        @addr_list_lock:        XXX: need comments on this one
 *        @name_assign_type:        network interface name assignment type
 *        @uc_promisc:                Counter that indicates promiscuous mode
 *                                has been enabled due to the need to listen to
 *                                additional unicast addresses in a device that
 *                                does not implement ndo_set_rx_mode()
 *        @uc:                        unicast mac addresses
 *        @mc:                        multicast mac addresses
 *        @dev_addrs:                list of device hw addresses
 *        @queues_kset:                Group of all Kobjects in the Tx and RX queues
 *        @promiscuity:                Number of times the NIC is told to work in
 *                                promiscuous mode; if it becomes 0 the NIC will
 *                                exit promiscuous mode
 *        @allmulti:                Counter, enables or disables allmulticast mode
 *
 *        @vlan_info:        VLAN info
 *        @dsa_ptr:        dsa specific data
 *        @tipc_ptr:        TIPC specific data
 *        @atalk_ptr:        AppleTalk link
 *        @ip_ptr:        IPv4 specific data
 *        @ip6_ptr:        IPv6 specific data
 *        @ax25_ptr:        AX.25 specific data
 *        @ieee80211_ptr:        IEEE 802.11 specific data, assign before registering
 *        @ieee802154_ptr: IEEE 802.15.4 low-rate Wireless Personal Area Network
 *                         device struct
 *        @mpls_ptr:        mpls_dev struct pointer
 *        @mctp_ptr:        MCTP specific data
 *
 *        @dev_addr:        Hw address (before bcast,
 *                        because most packets are unicast)
 *
 *        @_rx:                        Array of RX queues
 *        @num_rx_queues:                Number of RX queues
 *                                allocated at register_netdev() time
 *        @real_num_rx_queues:         Number of RX queues currently active in device
 *        @xdp_prog:                XDP sockets filter program pointer
 *        @gro_flush_timeout:        timeout for GRO layer in NAPI
 *        @napi_defer_hard_irqs:        If not zero, provides a counter that would
 *                                allow to avoid NIC hard IRQ, on busy queues.
 *
 *        @rx_handler:                handler for received packets
 *        @rx_handler_data:         XXX: need comments on this one
 *        @tcx_ingress:                BPF & clsact qdisc specific data for ingress processing
 *        @ingress_queue:                XXX: need comments on this one
 *        @nf_hooks_ingress:        netfilter hooks executed for ingress packets
 *        @broadcast:                hw bcast address
 *
 *        @rx_cpu_rmap:        CPU reverse-mapping for RX completion interrupts,
 *                        indexed by RX queue number. Assigned by driver.
 *                        This must only be set if the ndo_rx_flow_steer
 *                        operation is defined
 *        @index_hlist:                Device index hash chain
 *
 *        @_tx:                        Array of TX queues
 *        @num_tx_queues:                Number of TX queues allocated at alloc_netdev_mq() time
 *        @real_num_tx_queues:         Number of TX queues currently active in device
 *        @qdisc:                        Root qdisc from userspace point of view
 *        @tx_queue_len:                Max frames per queue allowed
 *        @tx_global_lock:         XXX: need comments on this one
 *        @xdp_bulkq:                XDP device bulk queue
 *        @xps_maps:                all CPUs/RXQs maps for XPS device
 *
 *        @tcx_egress:                BPF & clsact qdisc specific data for egress processing
 *        @nf_hooks_egress:        netfilter hooks executed for egress packets
 *        @qdisc_hash:                qdisc hash table
 *        @watchdog_timeo:        Represents the timeout that is used by
 *                                the watchdog (see dev_watchdog())
 *        @watchdog_timer:        List of timers
 *
 *        @proto_down_reason:        reason a netdev interface is held down
 *        @pcpu_refcnt:                Number of references to this device
 *        @dev_refcnt:                Number of references to this device
 *        @refcnt_tracker:        Tracker directory for tracked references to this device
 *        @todo_list:                Delayed register/unregister
 *        @link_watch_list:        XXX: need comments on this one
 *
 *        @reg_state:                Register/unregister state machine
 *        @dismantle:                Device is going to be freed
 *        @rtnl_link_state:        This enum represents the phases of creating
 *                                a new link
 *
 *        @needs_free_netdev:        Should unregister perform free_netdev?
 *        @priv_destructor:        Called from unregister
 *        @npinfo:                XXX: need comments on this one
 *         @nd_net:                Network namespace this network device is inside
 *
 *         @ml_priv:        Mid-layer private
 *        @ml_priv_type:  Mid-layer private type
 *
 *        @pcpu_stat_type:        Type of device statistics which the core should
 *                                allocate/free: none, lstats, tstats, dstats. none
 *                                means the driver is handling statistics allocation/
 *                                freeing internally.
 *        @lstats:                Loopback statistics: packets, bytes
 *        @tstats:                Tunnel statistics: RX/TX packets, RX/TX bytes
 *        @dstats:                Dummy statistics: RX/TX/drop packets, RX/TX bytes
 *
 *        @garp_port:        GARP
 *        @mrp_port:        MRP
 *
 *        @dm_private:        Drop monitor private
 *
 *        @dev:                Class/net/name entry
 *        @sysfs_groups:        Space for optional device, statistics and wireless
 *                        sysfs groups
 *
 *        @sysfs_rx_queue_group:        Space for optional per-rx queue attributes
 *        @rtnl_link_ops:        Rtnl_link_ops
 *        @stat_ops:        Optional ops for queue-aware statistics
 *        @queue_mgmt_ops:        Optional ops for queue management
 *
 *        @gso_max_size:        Maximum size of generic segmentation offload
 *        @tso_max_size:        Device (as in HW) limit on the max TSO request size
 *        @gso_max_segs:        Maximum number of segments that can be passed to the
 *                        NIC for GSO
 *        @tso_max_segs:        Device (as in HW) limit on the max TSO segment count
 *         @gso_ipv4_max_size:        Maximum size of generic segmentation offload,
 *                                 for IPv4.
 *
 *        @dcbnl_ops:        Data Center Bridging netlink ops
 *        @num_tc:        Number of traffic classes in the net device
 *        @tc_to_txq:        XXX: need comments on this one
 *        @prio_tc_map:        Maps a priority (masked with TC_BITMASK) to a traffic
 *                        class; see netdev_get_prio_tc_map()
 *
 *        @fcoe_ddp_xid:        Max exchange id for FCoE LRO by ddp
 *
 *        @priomap:        XXX: need comments on this one
 *        @phydev:        Physical device may attach itself
 *                        for hardware timestamping
 *        @sfp_bus:        attached &struct sfp_bus structure.
 *
 *        @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock
 *
 *        @proto_down:        protocol port state information can be sent to the
 *                        switch driver and used to set the phys state of the
 *                        switch port.
 *
 *        @wol_enabled:        Wake-on-LAN is enabled
 *
 *        @threaded:        napi threaded mode is enabled
 *
 *        @net_notifier_list:        List of per-net netdev notifier block
 *                                that follow this device when it is moved
 *                                to another network namespace.
 *
 *        @macsec_ops:    MACsec offloading ops
 *
 *        @udp_tunnel_nic_info:        static structure describing the UDP tunnel
 *                                offload capabilities of the device
 *        @udp_tunnel_nic:        UDP tunnel offload state
 *        @xdp_state:                stores info on attached XDP BPF programs
 *
 *        @nested_level:        Used as a parameter of spin_lock_nested() of
 *                        dev->addr_list_lock.
 *        @unlink_list:        As netif_addr_lock() can be called recursively,
 *                        keep a list of interfaces to be deleted.
 *        @gro_max_size:        Maximum size of aggregated packet in generic
 *                        receive offload (GRO)
 *         @gro_ipv4_max_size:        Maximum size of aggregated packet in generic
 *                                 receive offload (GRO), for IPv4.
 *        @xdp_zc_max_segs:        Maximum number of segments supported by AF_XDP
 *                                zero copy driver
 *
 *        @dev_addr_shadow:        Copy of @dev_addr to catch direct writes.
 *        @linkwatch_dev_tracker:        refcount tracker used by linkwatch.
 *        @watchdog_dev_tracker:        refcount tracker used by watchdog.
 *        @dev_registered_tracker:        tracker for reference held while
 *                                        registered
 *        @offload_xstats_l3:        L3 HW stats for this netdevice.
 *
 *        @devlink_port:        Pointer to related devlink port structure.
 *                        Assigned by a driver before netdev registration using
 *                        SET_NETDEV_DEVLINK_PORT macro. This pointer is static
 *                        during the time netdevice is registered.
 *
 *        @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
 *                   where the clock is recovered.
 *
 *        FIXME: cleanup struct net_device such that network protocol info
 *        moves out.
 */

struct net_device {
        /* Cacheline organization can be found documented in
         * Documentation/networking/net_cachelines/net_device.rst.
         * Please update the document when adding new fields.
         */

        /* TX read-mostly hotpath */
        __cacheline_group_begin(net_device_read_tx);
        unsigned long long        priv_flags;
        const struct net_device_ops *netdev_ops;
        const struct header_ops *header_ops;
        /* TX queue array; netdev_get_tx_queue() indexes into it */
        struct netdev_queue        *_tx;
        netdev_features_t        gso_partial_features;
        unsigned int                real_num_tx_queues;
        unsigned int                gso_max_size;
        unsigned int                gso_ipv4_max_size;
        u16                        gso_max_segs;
        /* Signed: netdev_get_sb_channel() returns -num_tc when this is
         * negative, and 0 otherwise.
         */
        s16                        num_tc;
        /* Note : dev->mtu is often read without holding a lock.
         * Writers usually hold RTNL.
         * It is recommended to use READ_ONCE() to annotate the reads,
         * and to use WRITE_ONCE() to annotate the writes.
         */
        unsigned int                mtu;
        unsigned short                needed_headroom;
        struct netdev_tc_txq        tc_to_txq[TC_MAX_QUEUE];
#ifdef CONFIG_XPS
        struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
#endif
#ifdef CONFIG_NETFILTER_EGRESS
        struct nf_hook_entries __rcu *nf_hooks_egress;
#endif
#ifdef CONFIG_NET_XGRESS
        struct bpf_mprog_entry __rcu *tcx_egress;
#endif
        __cacheline_group_end(net_device_read_tx);

        /* TXRX read-mostly hotpath */
        __cacheline_group_begin(net_device_read_txrx);
        /* Per-CPU statistics pointers sharing one slot; the kernel-doc for
         * @pcpu_stat_type describes which flavour (lstats, tstats, dstats)
         * the core allocates.
         */
        union {
                struct pcpu_lstats __percpu                *lstats;
                struct pcpu_sw_netstats __percpu        *tstats;
                struct pcpu_dstats __percpu                *dstats;
        };
        unsigned long                state;
        unsigned int                flags;
        unsigned short                hard_header_len;
        netdev_features_t        features;
        struct inet6_dev __rcu        *ip6_ptr;
        __cacheline_group_end(net_device_read_txrx);

        /* RX read-mostly hotpath */
        __cacheline_group_begin(net_device_read_rx);
        /* A non-NULL value makes netif_elide_gro() return true */
        struct bpf_prog __rcu        *xdp_prog;
        struct list_head        ptype_specific;
        int                        ifindex;
        unsigned int                real_num_rx_queues;
        struct netdev_rx_queue        *_rx;
        unsigned long                gro_flush_timeout;
        int                        napi_defer_hard_irqs;
        unsigned int                gro_max_size;
        unsigned int                gro_ipv4_max_size;
        rx_handler_func_t __rcu        *rx_handler;
        void __rcu                *rx_handler_data;
        /* Accessed through dev_net() / dev_net_set() */
        possible_net_t                        nd_net;
#ifdef CONFIG_NETPOLL
        struct netpoll_info __rcu        *npinfo;
#endif
#ifdef CONFIG_NET_XGRESS
        struct bpf_mprog_entry __rcu *tcx_ingress;
#endif
        __cacheline_group_end(net_device_read_rx);

        char                        name[IFNAMSIZ];
        struct netdev_name_node        *name_node;
        struct dev_ifalias        __rcu *ifalias;
        /*
         *        I/O specific fields
         *        FIXME: Merge these and struct ifmap into one
         */
        unsigned long                mem_end;
        unsigned long                mem_start;
        unsigned long                base_addr;

        /*
         *        Some hardware also needs these fields (state,dev_list,
         *        napi_list,unreg_list,close_list) but they are not
         *        part of the usual set specified in Space.c.
         */


        struct list_head        dev_list;
        struct list_head        napi_list;
        struct list_head        unreg_list;
        struct list_head        close_list;
        struct list_head        ptype_all;

        struct {
                struct list_head upper;
                struct list_head lower;
        } adj_list;

        /* Read-mostly cache-line for fast-path access */
        xdp_features_t                xdp_features;
        const struct xdp_metadata_ops *xdp_metadata_ops;
        const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops;
        unsigned short                gflags;

        unsigned short                needed_tailroom;

        netdev_features_t        hw_features;
        netdev_features_t        wanted_features;
        netdev_features_t        vlan_features;
        netdev_features_t        hw_enc_features;
        netdev_features_t        mpls_features;

        unsigned int                min_mtu;
        unsigned int                max_mtu;
        unsigned short                type;
        unsigned char                min_header_len;
        unsigned char                name_assign_type;

        int                        group;

        struct net_device_stats        stats; /* not used by modern drivers */

        struct net_device_core_stats __percpu *core_stats;

        /* Stats to monitor link on/off, flapping */
        atomic_t                carrier_up_count;
        atomic_t                carrier_down_count;

#ifdef CONFIG_WIRELESS_EXT
        const struct iw_handler_def *wireless_handlers;
        struct iw_public_data        *wireless_data;
#endif
        const struct ethtool_ops *ethtool_ops;
#ifdef CONFIG_NET_L3_MASTER_DEV
        const struct l3mdev_ops        *l3mdev_ops;
#endif
#if IS_ENABLED(CONFIG_IPV6)
        const struct ndisc_ops *ndisc_ops;
#endif

#ifdef CONFIG_XFRM_OFFLOAD
        const struct xfrmdev_ops *xfrmdev_ops;
#endif

#if IS_ENABLED(CONFIG_TLS_DEVICE)
        const struct tlsdev_ops *tlsdev_ops;
#endif

        unsigned int                operstate;
        unsigned char                link_mode;

        unsigned char                if_port;
        unsigned char                dma;

        /* Interface address info. */
        unsigned char                perm_addr[MAX_ADDR_LEN];
        unsigned char                addr_assign_type;
        unsigned char                addr_len;
        unsigned char                upper_level;
        unsigned char                lower_level;

        unsigned short                neigh_priv_len;
        unsigned short          dev_id;
        unsigned short          dev_port;
        unsigned short                padded;

        /* Given its own lockdep class by netdev_lockdep_set_classes() */
        spinlock_t                addr_list_lock;
        int                        irq;

        struct netdev_hw_addr_list        uc;
        struct netdev_hw_addr_list        mc;
        struct netdev_hw_addr_list        dev_addrs;

#ifdef CONFIG_SYSFS
        struct kset                *queues_kset;
#endif
#ifdef CONFIG_LOCKDEP
        struct list_head        unlink_list;
#endif
        unsigned int                promiscuity;
        unsigned int                allmulti;
        bool                        uc_promisc;
#ifdef CONFIG_LOCKDEP
        unsigned char                nested_level;
#endif


        /* Protocol-specific pointers */
        struct in_device __rcu        *ip_ptr;
#if IS_ENABLED(CONFIG_VLAN_8021Q)
        struct vlan_info __rcu        *vlan_info;
#endif
#if IS_ENABLED(CONFIG_NET_DSA)
        struct dsa_port                *dsa_ptr;
#endif
#if IS_ENABLED(CONFIG_TIPC)
        struct tipc_bearer __rcu *tipc_ptr;
#endif
#if IS_ENABLED(CONFIG_ATALK)
        void                         *atalk_ptr;
#endif
#if IS_ENABLED(CONFIG_AX25)
        void                        *ax25_ptr;
#endif
#if IS_ENABLED(CONFIG_CFG80211)
        struct wireless_dev        *ieee80211_ptr;
#endif
#if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN)
        struct wpan_dev                *ieee802154_ptr;
#endif
#if IS_ENABLED(CONFIG_MPLS_ROUTING)
        struct mpls_dev __rcu        *mpls_ptr;
#endif
#if IS_ENABLED(CONFIG_MCTP)
        struct mctp_dev __rcu        *mctp_ptr;
#endif

/*
 * Cache lines mostly used on receive path (including eth_type_trans())
 */
        /* Interface address info used in eth_type_trans() */
        const unsigned char        *dev_addr;

        unsigned int                num_rx_queues;
#define GRO_LEGACY_MAX_SIZE        65536u
/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE),
 * and shinfo->gso_segs is a 16bit field.
 */
#define GRO_MAX_SIZE                (8 * 65535u)
        unsigned int                xdp_zc_max_segs;
        struct netdev_queue __rcu *ingress_queue;
#ifdef CONFIG_NETFILTER_INGRESS
        struct nf_hook_entries __rcu *nf_hooks_ingress;
#endif

        unsigned char                broadcast[MAX_ADDR_LEN];
#ifdef CONFIG_RFS_ACCEL
        struct cpu_rmap                *rx_cpu_rmap;
#endif
        struct hlist_node        index_hlist;

/*
 * Cache lines mostly used on transmit path
 */
        /* Upper bound used by netdev_get_tx_queue() (debug check) and by
         * netdev_for_each_tx_queue() when walking _tx[].
         */
        unsigned int                num_tx_queues;
        struct Qdisc __rcu        *qdisc;
        unsigned int                tx_queue_len;
        spinlock_t                tx_global_lock;

        struct xdp_dev_bulk_queue __percpu *xdp_bulkq;

#ifdef CONFIG_NET_SCHED
        DECLARE_HASHTABLE        (qdisc_hash, 4);
#endif
        /* These may be needed for future network-power-down code. */
        struct timer_list        watchdog_timer;
        int                        watchdog_timeo;

        u32                     proto_down_reason;

        struct list_head        todo_list;

#ifdef CONFIG_PCPU_DEV_REFCNT
        int __percpu                *pcpu_refcnt;
#else
        refcount_t                dev_refcnt;
#endif
        struct ref_tracker_dir        refcnt_tracker;

        struct list_head        link_watch_list;

        /* Holds NETREG_* values; SET_NETDEV_DEVLINK_PORT() checks it against
         * NETREG_UNINITIALIZED.
         */
        u8 reg_state;

        bool dismantle;

        enum {
                RTNL_LINK_INITIALIZED,
                RTNL_LINK_INITIALIZING,
        } rtnl_link_state:16;

        bool needs_free_netdev;
        void (*priv_destructor)(struct net_device *dev);

        /* mid-layer private; netdev_set_ml_priv() stores the pointer and its
         * type tag together, netdev_get_ml_priv() returns the pointer only
         * when the requested type matches ml_priv_type.
         */
        void                                *ml_priv;
        enum netdev_ml_priv_type        ml_priv_type;

        enum netdev_stat_type                pcpu_stat_type:8;

#if IS_ENABLED(CONFIG_GARP)
        struct garp_port __rcu        *garp_port;
#endif
#if IS_ENABLED(CONFIG_MRP)
        struct mrp_port __rcu        *mrp_port;
#endif
#if IS_ENABLED(CONFIG_NET_DROP_MONITOR)
        struct dm_hw_stat_delta __rcu *dm_private;
#endif
        /* Embedded device; to_net_dev() maps it back to the net_device */
        struct device                dev;
        const struct attribute_group *sysfs_groups[4];
        const struct attribute_group *sysfs_rx_queue_group;

        const struct rtnl_link_ops *rtnl_link_ops;

        const struct netdev_stat_ops *stat_ops;

        const struct netdev_queue_mgmt_ops *queue_mgmt_ops;

        /* for setting kernel sock attribute on TCP connection setup */
#define GSO_MAX_SEGS                65535u
#define GSO_LEGACY_MAX_SIZE        65536u
/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE),
 * and shinfo->gso_segs is a 16bit field.
 */
#define GSO_MAX_SIZE                (8 * GSO_MAX_SEGS)

#define TSO_LEGACY_MAX_SIZE        65536
#define TSO_MAX_SIZE                UINT_MAX
        unsigned int                tso_max_size;
#define TSO_MAX_SEGS                U16_MAX
        u16                        tso_max_segs;

#ifdef CONFIG_DCB
        const struct dcbnl_rtnl_ops *dcbnl_ops;
#endif
        /* Indexed by (prio & TC_BITMASK); see netdev_get_prio_tc_map() and
         * netdev_set_prio_tc_map().
         */
        u8                        prio_tc_map[TC_BITMASK + 1];

#if IS_ENABLED(CONFIG_FCOE)
        unsigned int                fcoe_ddp_xid;
#endif
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
        struct netprio_map __rcu *priomap;
#endif
        struct phy_device        *phydev;
        struct sfp_bus                *sfp_bus;
        /* Assigned by netdev_lockdep_set_classes() */
        struct lock_class_key        *qdisc_tx_busylock;
        bool                        proto_down;
        bool                        threaded;
        unsigned                wol_enabled:1;

        struct list_head        net_notifier_list;

#if IS_ENABLED(CONFIG_MACSEC)
        /* MACsec management functions */
        const struct macsec_ops *macsec_ops;
#endif
        const struct udp_tunnel_nic_info        *udp_tunnel_nic_info;
        struct udp_tunnel_nic        *udp_tunnel_nic;

        /* protected by rtnl_lock */
        struct bpf_xdp_entity        xdp_state[__MAX_XDP_MODE];

        u8 dev_addr_shadow[MAX_ADDR_LEN];
        netdevice_tracker        linkwatch_dev_tracker;
        netdevice_tracker        watchdog_dev_tracker;
        netdevice_tracker        dev_registered_tracker;
        struct rtnl_hw_stats64        *offload_xstats_l3;

        /* Set through SET_NETDEV_DEVLINK_PORT() */
        struct devlink_port        *devlink_port;

#if IS_ENABLED(CONFIG_DPLL)
        struct dpll_pin        __rcu        *dpll_pin;
#endif
#if IS_ENABLED(CONFIG_PAGE_POOL)
        /** @page_pools: page pools created for this netdevice */
        struct hlist_head        page_pools;
#endif
};
/* Map a pointer to the embedded 'dev' member back to its struct net_device. */
#define to_net_dev(d) container_of(d, struct net_device, dev)

/*
 * Drivers should use this to assign a devlink port instance to a netdevice
 * before registering the netdevice. The devlink_port pointer is therefore
 * static for the lifetime of the registered netdev.
 *
 * The macro emits a WARN_ON() if (dev)->reg_state is not
 * NETREG_UNINITIALIZED, but performs the assignment regardless; the statement
 * expression evaluates to the assigned pointer.
 */
#define SET_NETDEV_DEVLINK_PORT(dev, port)                        \
({                                                                \
        WARN_ON((dev)->reg_state != NETREG_UNINITIALIZED);        \
        ((dev)->devlink_port = (port));                                \
})

/*
 * Tell whether GRO should be skipped for @dev: true when an XDP program is
 * attached (dev->xdp_prog is non-NULL) or when NETIF_F_GRO is not set in
 * dev->features.
 */
static inline bool netif_elide_gro(const struct net_device *dev)
{
        if (dev->xdp_prog)
                return true;

        return !(dev->features & NETIF_F_GRO);
}

/* Alignment (in bytes) netdev_priv() applies to sizeof(struct net_device). */
#define        NETDEV_ALIGN                32

/*
 * Look up the traffic class stored for @prio. Only the bits of @prio covered
 * by TC_BITMASK select the prio_tc_map[] slot.
 */
static inline
int netdev_get_prio_tc_map(const struct net_device *dev, u32 prio)
{
        const u32 slot = prio & TC_BITMASK;

        return dev->prio_tc_map[slot];
}

/*
 * Store @tc (masked with TC_BITMASK) as the traffic class for @prio (also
 * masked with TC_BITMASK). Returns 0 on success, or -EINVAL without touching
 * the map when @tc is not below dev->num_tc.
 */
static inline
int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc)
{
        if (tc < dev->num_tc) {
                dev->prio_tc_map[prio & TC_BITMASK] = tc & TC_BITMASK;
                return 0;
        }

        return -EINVAL;
}

/* Traffic-class (TC) helpers; only declared in this part of the file. */
int netdev_txq_to_tc(struct net_device *dev, unsigned int txq);
void netdev_reset_tc(struct net_device *dev);
int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset);
int netdev_set_num_tc(struct net_device *dev, u8 num_tc);

/* Return dev->num_tc as stored (the field is signed, so it may be negative). */
static inline
int netdev_get_num_tc(struct net_device *dev)
{
        const int tc_count = dev->num_tc;

        return tc_count;
}

static inline void net_prefetch(void *p)
{
        prefetch(p);
#if L1_CACHE_BYTES < 128
        prefetch((u8 *)p + L1_CACHE_BYTES);
#endif
}

static inline void net_prefetchw(void *p)
{
        prefetchw(p);
#if L1_CACHE_BYTES < 128
        prefetchw((u8 *)p + L1_CACHE_BYTES);
#endif
}

/*
 * Subordinate-channel ("sb_dev") helpers; only declared in this part of the
 * file.
 */
void netdev_unbind_sb_channel(struct net_device *dev,
                              struct net_device *sb_dev);
int netdev_bind_sb_channel_queue(struct net_device *dev,
                                 struct net_device *sb_dev,
                                 u8 tc, u16 count, u16 offset);
int netdev_set_sb_channel(struct net_device *dev, u16 channel);
static inline int netdev_get_sb_channel(struct net_device *dev)
{
        return max_t(int, -dev->num_tc, 0);
}

/*
 * Return a pointer to element @index of dev->_tx[]. An @index at or beyond
 * dev->num_tx_queues only triggers DEBUG_NET_WARN_ON_ONCE(); the pointer is
 * still returned.
 */
static inline
struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
                                         unsigned int index)
{
        struct netdev_queue *txq = dev->_tx + index;

        DEBUG_NET_WARN_ON_ONCE(index >= dev->num_tx_queues);
        return txq;
}

/* Return the TX queue of @dev selected by skb_get_queue_mapping(@skb). */
static inline struct netdev_queue *skb_get_tx_queue(const struct net_device *dev,
                                                    const struct sk_buff *skb)
{
        const unsigned int queue_index = skb_get_queue_mapping(skb);

        return netdev_get_tx_queue(dev, queue_index);
}

/*
 * Call @f(dev, queue, @arg) once for every entry of dev->_tx[], in ascending
 * index order. dev->num_tx_queues is re-read before each iteration.
 */
static inline void netdev_for_each_tx_queue(struct net_device *dev,
                                            void (*f)(struct net_device *,
                                                      struct netdev_queue *,
                                                      void *),
                                            void *arg)
{
        unsigned int qidx = 0;

        while (qidx < dev->num_tx_queues) {
                f(dev, dev->_tx + qidx, arg);
                qidx++;
        }
}

/*
 * Assign lockdep classes to the locks of @dev.
 *
 * Expands to a bare { } block (not do { } while (0)) that declares three
 * static lock_class_key objects, so every expansion site has its own keys:
 *  - qdisc_tx_busylock_key: its address is stored in (dev)->qdisc_tx_busylock;
 *  - dev_addr_list_lock_key: applied to (dev)->addr_list_lock;
 *  - qdisc_xmit_lock_key: applied to _xmit_lock of each of the
 *    (dev)->num_tx_queues entries of (dev)->_tx[].
 *
 * @dev is evaluated several times and the block declares a local named 'i'.
 */
#define netdev_lockdep_set_classes(dev)                                \
{                                                                \
        static struct lock_class_key qdisc_tx_busylock_key;        \
        static struct lock_class_key qdisc_xmit_lock_key;        \
        static struct lock_class_key dev_addr_list_lock_key;        \
        unsigned int i;                                                \
                                                                \
        (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key;        \
        lockdep_set_class(&(dev)->addr_list_lock,                \
                          &dev_addr_list_lock_key);                \
        for (i = 0; i < (dev)->num_tx_queues; i++)                \
                lockdep_set_class(&(dev)->_tx[i]._xmit_lock,        \
                                  &qdisc_xmit_lock_key);        \
}

/*
 * TX queue selection helpers; only declared in this part of the file.
 * NOTE(review): presumably netdev_pick_tx() returns a queue index and
 * netdev_core_pick_tx() the matching queue pointer - confirm against the
 * definitions.
 */
u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev);
struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
                                         struct sk_buff *skb,
                                         struct net_device *sb_dev);

/* Headroom the master device has to take into account when forwarding to
 * @dev: 0 if IFF_PHONY_HEADROOM is set in dev->priv_flags, otherwise
 * dev->needed_headroom.
 */
static inline unsigned netdev_get_fwd_headroom(struct net_device *dev)
{
        if (dev->priv_flags & IFF_PHONY_HEADROOM)
                return 0;

        return dev->needed_headroom;
}

static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr)
{
        if (dev->netdev_ops->ndo_set_rx_headroom)
                dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr);
}

/* Reset the device rx headroom to the device's default by requesting -1. */
static inline void netdev_reset_rx_headroom(struct net_device *dev)
{
        const int use_default = -1;

        netdev_set_rx_headroom(dev, use_default);
}

/*
 * Return dev->ml_priv if the stored ml_priv_type equals @type, NULL
 * otherwise.
 */
static inline void *netdev_get_ml_priv(struct net_device *dev,
                                       enum netdev_ml_priv_type type)
{
        return dev->ml_priv_type == type ? dev->ml_priv : NULL;
}

static inline void netdev_set_ml_priv(struct net_device *dev,
                                      void *ml_priv,
                                      enum netdev_ml_priv_type type)
{
        WARN(dev->ml_priv_type && dev->ml_priv_type != type,
             "Overwriting already set ml_priv_type (%u) with different ml_priv_type (%u)!\n",
             dev->ml_priv_type, type);
        WARN(!dev->ml_priv_type && dev->ml_priv,
             "Overwriting already set ml_priv and ml_priv_type is ML_PRIV_NONE!\n");

        dev->ml_priv = ml_priv;
        dev->ml_priv_type = type;
}

/*
 * Net namespace inlines
 */
static inline
struct net *dev_net(const struct net_device *dev)
{
        return read_pnet(&dev->nd_net);
}

static inline
void dev_net_set(struct net_device *dev, struct net *net)
{
        write_pnet(&dev->nd_net, net);
}

/**
 *        netdev_priv - access network device private data
 *        @dev: network device
 *
 * Get network device private data
 */
static inline void *netdev_priv(const struct net_device *dev)
{
        return (char *)dev + ALIGN(sizeof(struct net_device), NETDEV_ALIGN);
}

/* Set the sysfs physical device reference for the network logical device.
 * If set prior to registration, it will cause a symlink to be created during
 * initialization. The macro only assigns (net)->dev.parent.
 */
#define SET_NETDEV_DEV(net, pdev)        ((net)->dev.parent = (pdev))

/* Set the sysfs device type for the network logical device to allow
 * fine-grained identification of different network device types, for
 * example Ethernet, Wireless LAN, Bluetooth, WiMAX etc.
 * The macro only assigns (net)->dev.type.
 */
#define SET_NETDEV_DEVTYPE(net, devtype)        ((net)->dev.type = (devtype))

/*
 * Only declared in this part of the file.
 * NOTE(review): presumably associates @napi with the queue @queue_index of
 * the given @type on @dev - confirm against the definition.
 */
void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
                          enum netdev_queue_type type,
                          struct napi_struct *napi);

/* Record @irq in @napi->irq; no other state is touched. */
static inline void netif_napi_set_irq(struct napi_struct *napi, int irq)
{
        napi->irq = irq;
}

/* Default NAPI poll() weight.
 * Device drivers are strongly advised not to use a bigger value.
 */
#define NAPI_POLL_WEIGHT 64

/*
 * Only declared in this part of the file; netif_napi_add() and
 * netif_napi_add_tx_weight() are wrappers around it.
 */
void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
                           int (*poll)(struct napi_struct *, int), int weight);

/**
 * netif_napi_add() - initialize a NAPI context
 * @dev:  network device
 * @napi: NAPI context
 * @poll: polling function
 *
 * netif_napi_add() must be used to initialize a NAPI context prior to calling
 * *any* of the other NAPI-related functions. It is netif_napi_add_weight()
 * called with the default weight, NAPI_POLL_WEIGHT.
 */
static inline void
netif_napi_add(struct net_device *dev, struct napi_struct *napi,
               int (*poll)(struct napi_struct *, int))
{
        const int default_weight = NAPI_POLL_WEIGHT;

        netif_napi_add_weight(dev, napi, poll, default_weight);
}

/*
 * Tx-only variant of netif_napi_add_weight(): sets NAPI_STATE_NO_BUSY_POLL in
 * @napi->state and then calls netif_napi_add_weight() with the caller's
 * @weight.
 */
static inline void
netif_napi_add_tx_weight(struct net_device *dev,
                         struct napi_struct *napi,
                         int (*poll)(struct napi_struct *, int),
                         int weight)
{
        /* The flag is set before the context is handed to the add helper. */
        set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state);
        netif_napi_add_weight(dev, napi, poll, weight);
}

/**
 * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only
 * @dev:  network device
 * @napi: NAPI context
 * @poll: polling function
 *
 * This variant of netif_napi_add() should be used by drivers that use NAPI
 * exclusively to poll a TX queue.
 * It avoids adding the context to napi_hash[], which would otherwise pollute
 * that hash table.
 */
static inline void netif_napi_add_tx(struct net_device *dev,
                                     struct napi_struct *napi,
                                     int (*poll)(struct napi_struct *, int))
{
        const int default_weight = NAPI_POLL_WEIGHT;

        netif_napi_add_tx_weight(dev, napi, poll, default_weight);
}

/**
 *  __netif_napi_del - remove a NAPI context
 *  @napi: NAPI context
 *
 * Warning: caller must observe RCU grace period before freeing memory
 * containing @napi. Drivers might want to call this helper to combine
 * all the needed RCU grace periods into a single one.
 *
 * netif_napi_del() is this call followed by synchronize_net().
 */
void __netif_napi_del(struct napi_struct *napi);

/**
 *  netif_napi_del - remove a NAPI context
 *  @napi: NAPI context
 *
 *  netif_napi_del() removes a NAPI context from the network device NAPI list
 *  by calling __netif_napi_del(), and then calls synchronize_net() before
 *  returning.
 */
static inline void netif_napi_del(struct napi_struct *napi)
{
        __netif_napi_del(napi);
        synchronize_net();
}

/*
 * Descriptor of a packet handler: a protocol type and optional device to
 * match on, plus the callbacks to invoke.
 * NOTE(review): the callback semantics are not visible here; presumably
 * @func handles one skb and @list_func a list of them - confirm.
 */
struct packet_type {
        __be16                        type;        /* This is really htons(ether_type). */
        bool                        ignore_outgoing;
        struct net_device        *dev;        /* NULL is wildcarded here             */
        netdevice_tracker        dev_tracker;
        int                        (*func) (struct sk_buff *,
                                         struct net_device *,
                                         struct packet_type *,
                                         struct net_device *);
        void                        (*list_func) (struct list_head *,
                                              struct packet_type *,
                                              struct net_device *);
        bool                        (*id_match)(struct packet_type *ptype,
                                            struct sock *sk);
        struct net                *af_packet_net;
        void                        *af_packet_priv;
        struct list_head        list;
};

/*
 * Callback set embedded in struct packet_offload: one GSO hook
 * (gso_segment) and two GRO hooks (gro_receive, gro_complete).
 */
struct offload_callbacks {
        struct sk_buff                *(*gso_segment)(struct sk_buff *skb,
                                                netdev_features_t features);
        struct sk_buff                *(*gro_receive)(struct list_head *head,
                                                struct sk_buff *skb);
        int                        (*gro_complete)(struct sk_buff *skb, int nhoff);
};

/*
 * Offload handler for one protocol type: the type to match, a priority, the
 * callbacks and a list linkage.
 * NOTE(review): how @priority orders entries is not visible here - confirm.
 */
struct packet_offload {
        __be16                         type;        /* This is really htons(ether_type). */
        u16                         priority;
        struct offload_callbacks callbacks;
        struct list_head         list;
};

/* Often-modified stats are per-CPU, others are shared (netdev->stats).
 *
 * RX/TX packet and byte counters; dev_sw_netstats_rx_add() and
 * dev_sw_netstats_tx_add() update them inside a
 * u64_stats_update_begin()/u64_stats_update_end() section on @syncp.
 * The struct is aligned to 4 * sizeof(u64) bytes.
 */
struct pcpu_sw_netstats {
        u64_stats_t                rx_packets;
        u64_stats_t                rx_bytes;
        u64_stats_t                tx_packets;
        u64_stats_t                tx_bytes;
        struct u64_stats_sync   syncp;
} __aligned(4 * sizeof(u64));

/*
 * RX/TX packet, byte and drop counters with their u64_stats_sync.
 * The struct is aligned to 8 * sizeof(u64) bytes.
 */
struct pcpu_dstats {
        u64_stats_t                rx_packets;
        u64_stats_t                rx_bytes;
        u64_stats_t                rx_drops;
        u64_stats_t                tx_packets;
        u64_stats_t                tx_bytes;
        u64_stats_t                tx_drops;
        struct u64_stats_sync        syncp;
} __aligned(8 * sizeof(u64));

/*
 * Packet and byte counters updated by dev_lstats_add() under @syncp.
 * The struct is aligned to 2 * sizeof(u64) bytes.
 */
struct pcpu_lstats {
        u64_stats_t packets;
        u64_stats_t bytes;
        struct u64_stats_sync syncp;
} __aligned(2 * sizeof(u64));

/*
 * Only declared in this part of the file.
 * NOTE(review): presumably sums the per-CPU lstats of @dev into @packets and
 * @bytes - confirm against the definition.
 */
void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes);

/*
 * Account one received packet of @len bytes in this CPU's dev->tstats:
 * rx_bytes grows by @len and rx_packets by one, inside a u64_stats update
 * section.
 */
static inline void dev_sw_netstats_rx_add(struct net_device *dev, unsigned int len)
{
        struct pcpu_sw_netstats *cpu_stats = this_cpu_ptr(dev->tstats);
        struct u64_stats_sync *sync = &cpu_stats->syncp;

        u64_stats_update_begin(sync);
        u64_stats_add(&cpu_stats->rx_bytes, len);
        u64_stats_inc(&cpu_stats->rx_packets);
        u64_stats_update_end(sync);
}

/*
 * Account @packets transmitted packets totalling @len bytes in this CPU's
 * dev->tstats, inside a u64_stats update section.
 */
static inline void dev_sw_netstats_tx_add(struct net_device *dev,
                                          unsigned int packets,
                                          unsigned int len)
{
        struct pcpu_sw_netstats *cpu_stats = this_cpu_ptr(dev->tstats);
        struct u64_stats_sync *sync = &cpu_stats->syncp;

        u64_stats_update_begin(sync);
        u64_stats_add(&cpu_stats->tx_bytes, len);
        u64_stats_add(&cpu_stats->tx_packets, packets);
        u64_stats_update_end(sync);
}

/*
 * Account one packet of @len bytes in this CPU's dev->lstats: bytes grows by
 * @len and packets by one, inside a u64_stats update section.
 */
static inline void dev_lstats_add(struct net_device *dev, unsigned int len)
{
        struct pcpu_lstats *cpu_stats = this_cpu_ptr(dev->lstats);
        struct u64_stats_sync *sync = &cpu_stats->syncp;

        u64_stats_update_begin(sync);
        u64_stats_add(&cpu_stats->bytes, len);
        u64_stats_inc(&cpu_stats->packets);
        u64_stats_update_end(sync);
}

/*
 * Allocate a per-CPU instance of @type with alloc_percpu_gfp(type, gfp) and,
 * if that succeeded, run u64_stats_init() on the 'syncp' member of every
 * possible CPU's copy. @type must therefore have a 'syncp' member.
 *
 * The statement expression evaluates to the value alloc_percpu_gfp()
 * returned; the init loop is skipped when that value is NULL.
 */
#define __netdev_alloc_pcpu_stats(type, gfp)                                \
({                                                                        \
        typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\
        if (pcpu_stats)        {                                                \
                int __cpu;                                                \
                for_each_possible_cpu(__cpu) {                                \
                        typeof(type) *stat;                                \
                        stat = per_cpu_ptr(pcpu_stats, __cpu);                \
                        u64_stats_init(&stat->syncp);                        \
                }                                                        \
        }                                                                \
        pcpu_stats;                                                        \
})

/* __netdev_alloc_pcpu_stats() with GFP_KERNEL as the allocation flags. */
#define netdev_alloc_pcpu_stats(type)                                        \
        __netdev_alloc_pcpu_stats(type, GFP_KERNEL)

/* Same initialisation as __netdev_alloc_pcpu_stats(), but the per-CPU area
 * is obtained with devm_alloc_percpu(dev, type). On success the ->syncp
 * member of every possible CPU's copy is set up with u64_stats_init().
 *
 * Evaluates to the __percpu pointer returned by devm_alloc_percpu().
 */
#define devm_netdev_alloc_pcpu_stats(dev, type)                                \
({                                                                        \
        typeof(type) __percpu *pcpu_stats = devm_alloc_percpu(dev, type);\
        if (pcpu_stats) {                                                \
                int __cpu;                                                \
                for_each_possible_cpu(__cpu) {                                \
                        typeof(type) *stat;                                \
                        stat = per_cpu_ptr(pcpu_stats, __cpu);                \
                        u64_stats_init(&stat->syncp);                        \
                }                                                        \
        }                                                                \
        pcpu_stats;                                                        \
})

/* Transmit type of a LAG device; carried in
 * struct netdev_lag_upper_info.tx_type.
 * NOTE(review): the exact meaning of each value is defined by its users,
 * not by this header.
 */
enum netdev_lag_tx_type {
        NETDEV_LAG_TX_TYPE_UNKNOWN,
        NETDEV_LAG_TX_TYPE_RANDOM,
        NETDEV_LAG_TX_TYPE_BROADCAST,
        NETDEV_LAG_TX_TYPE_ROUNDROBIN,
        NETDEV_LAG_TX_TYPE_ACTIVEBACKUP,
        NETDEV_LAG_TX_TYPE_HASH,
};

/* Hash type of a LAG device; carried in
 * struct netdev_lag_upper_info.hash_type.
 * NOTE(review): the names suggest which header layers feed the hash
 * (e.g. L2, L3+L4); confirm the precise semantics with the users.
 */
enum netdev_lag_hash {
        NETDEV_LAG_HASH_NONE,
        NETDEV_LAG_HASH_L2,
        NETDEV_LAG_HASH_L34,
        NETDEV_LAG_HASH_L23,
        NETDEV_LAG_HASH_E23,
        NETDEV_LAG_HASH_E34,
        NETDEV_LAG_HASH_VLAN_SRCMAC,
        NETDEV_LAG_HASH_UNKNOWN,
};

/* Description of a LAG upper device: its transmit type and hash type.
 * NOTE(review): presumably passed through the upper_info pointer of
 * struct netdev_notifier_changeupper_info - confirm against callers.
 */
struct netdev_lag_upper_info {
        enum netdev_lag_tx_type tx_type;
        enum netdev_lag_hash hash_type;
};

/* State of a LAG lower device, packed into two one-bit flags.
 * NOTE(review): presumably passed through the lower_state_info pointer of
 * struct netdev_notifier_changelowerstate_info - confirm against callers.
 */
struct netdev_lag_lower_state_info {
        u8 link_up : 1,         /* one-bit "link is up" flag */
           tx_enabled : 1;      /* one-bit "transmit enabled" flag */
};

#include <linux/notifier.h>

/* Event types delivered on the netdevice notifier chain.
 *
 * Please remember to update netdev_cmd_to_name() and the rtnetlink
 * notification exclusion list in rtnetlink_event() when adding new types.
 *
 * Numbering starts at 1 (NETDEV_UP); every later value is the implicit
 * successor of the previous one.
 */
enum netdev_cmd {
        NETDEV_UP        = 1,        /* For now you can't veto a device up/down */
        NETDEV_DOWN,
        NETDEV_REBOOT,                /* Tell a protocol stack a network interface
                                   detected a hardware crash and restarted
                                   - we can use this eg to kick tcp sessions
                                   once done */
        NETDEV_CHANGE,                /* Notify device state change */
        NETDEV_REGISTER,
        NETDEV_UNREGISTER,
        NETDEV_CHANGEMTU,        /* notify after mtu change happened */
        NETDEV_CHANGEADDR,        /* notify after the address change */
        NETDEV_PRE_CHANGEADDR,        /* notify before the address change */
        NETDEV_GOING_DOWN,
        NETDEV_CHANGENAME,
        NETDEV_FEAT_CHANGE,
        NETDEV_BONDING_FAILOVER,
        NETDEV_PRE_UP,
        NETDEV_PRE_TYPE_CHANGE,
        NETDEV_POST_TYPE_CHANGE,
        NETDEV_POST_INIT,
        NETDEV_PRE_UNINIT,
        NETDEV_RELEASE,
        NETDEV_NOTIFY_PEERS,
        NETDEV_JOIN,
        NETDEV_CHANGEUPPER,
        NETDEV_RESEND_IGMP,
        NETDEV_PRECHANGEMTU,        /* notify before mtu change happened */
        NETDEV_CHANGEINFODATA,
        NETDEV_BONDING_INFO,
        NETDEV_PRECHANGEUPPER,
        NETDEV_CHANGELOWERSTATE,
        NETDEV_UDP_TUNNEL_PUSH_INFO,
        NETDEV_UDP_TUNNEL_DROP_INFO,
        NETDEV_CHANGE_TX_QUEUE_LEN,
        NETDEV_CVLAN_FILTER_PUSH_INFO,
        NETDEV_CVLAN_FILTER_DROP_INFO,
        NETDEV_SVLAN_FILTER_PUSH_INFO,
        NETDEV_SVLAN_FILTER_DROP_INFO,
        NETDEV_OFFLOAD_XSTATS_ENABLE,
        NETDEV_OFFLOAD_XSTATS_DISABLE,
        NETDEV_OFFLOAD_XSTATS_REPORT_USED,
        NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
        NETDEV_XDP_FEAT_CHANGE,
};
/* Maps a notifier event to a string; defined outside this header. */
const char *netdev_cmd_to_name(enum netdev_cmd cmd);

/* Netdevice notifier registration API (defined outside this header).
 * Three flavours are declared, each with a matching unregister call:
 *  - plain:    takes only the notifier block;
 *  - _net:     additionally takes a struct net;
 *  - _dev_net: takes a net_device plus a struct netdev_net_notifier.
 * All return int. NOTE(review): presumably 0 or a negative errno - confirm
 * against the definitions.
 */
int register_netdevice_notifier(struct notifier_block *nb);
int unregister_netdevice_notifier(struct notifier_block *nb);
int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb);
int unregister_netdevice_notifier_net(struct net *net,
                                      struct notifier_block *nb);
int register_netdevice_notifier_dev_net(struct net_device *dev,
                                        struct notifier_block *nb,
                                        struct netdev_net_notifier *nn);
int unregister_netdevice_notifier_dev_net(struct net_device *dev,
                                          struct notifier_block *nb,
                                          struct netdev_net_notifier *nn);

/* Base payload of a netdevice notification. The extended info structures
 * declared after it embed it as their first member. Use
 * netdev_notifier_info_to_dev() / netdev_notifier_info_to_extack() to read
 * the fields.
 */
struct netdev_notifier_info {
        struct net_device        *dev;        /* device the event is about */
        struct netlink_ext_ack        *extack; /* may be NULL, see netdev_notifier_info_init() */
};

/* Notification payload carrying one extra value in a union; the only
 * member currently defined is an MTU.
 */
struct netdev_notifier_info_ext {
        struct netdev_notifier_info info; /* must be first */
        union {
                u32 mtu;
        } ext;
};

/* Notification payload that adds a mask of changed flags.
 * NOTE(review): presumably used with NETDEV_CHANGE - confirm with callers.
 */
struct netdev_notifier_change_info {
        struct netdev_notifier_info info; /* must be first */
        unsigned int flags_changed;
};

/* Notification payload describing a change of upper device.
 * NOTE(review): presumably used with NETDEV_PRECHANGEUPPER and
 * NETDEV_CHANGEUPPER - confirm with callers.
 */
struct netdev_notifier_changeupper_info {
        struct netdev_notifier_info info; /* must be first */
        struct net_device *upper_dev; /* new upper dev */
        bool master; /* is upper dev master */
        bool linking; /* is the notification for link or unlink */
        void *upper_info; /* upper dev info */
};

/* Notification payload carrying an opaque lower-device state pointer.
 * NOTE(review): presumably used with NETDEV_CHANGELOWERSTATE - confirm
 * with callers.
 */
struct netdev_notifier_changelowerstate_info {
        struct netdev_notifier_info info; /* must be first */
        void *lower_state_info; /* is lower dev state */
};

/* Notification payload carrying a read-only device address pointer.
 * NOTE(review): presumably the address about to be set, delivered with
 * NETDEV_PRE_CHANGEADDR - confirm with callers.
 */
struct netdev_notifier_pre_changeaddr_info {
        struct netdev_notifier_info info; /* must be first */
        const unsigned char *dev_addr;
};

/* Kinds of offload extended statistics. Only one kind is defined here and
 * numbering starts at 1, so 0 is not a valid type.
 */
enum netdev_offload_xstats_type {
        NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1,
};

/* Notification payload for the NETDEV_OFFLOAD_XSTATS_* events. Which
 * member of the anonymous union is meaningful depends on the event, as
 * noted on each member.
 */
struct netdev_notifier_offload_xstats_info {
        struct netdev_notifier_info info; /* must be first */
        enum netdev_offload_xstats_type type;

        union {
                /* NETDEV_OFFLOAD_XSTATS_REPORT_DELTA */
                struct netdev_notifier_offload_xstats_rd *report_delta;
                /* NETDEV_OFFLOAD_XSTATS_REPORT_USED */
                struct netdev_notifier_offload_xstats_ru *report_used;
        };
};

/* Offload extended statistics API (defined outside this header).
 * enable/disable/enabled/get operate on a device and a statistics type;
 * report_delta/report_used take the pointers carried in
 * struct netdev_notifier_offload_xstats_info; push_delta takes a device,
 * a type and a statistics block.
 */
int netdev_offload_xstats_enable(struct net_device *dev,
                                 enum netdev_offload_xstats_type type,
                                 struct netlink_ext_ack *extack);
int netdev_offload_xstats_disable(struct net_device *dev,
                                  enum netdev_offload_xstats_type type);
bool netdev_offload_xstats_enabled(const struct net_device *dev,
                                   enum netdev_offload_xstats_type type);
int netdev_offload_xstats_get(struct net_device *dev,
                              enum netdev_offload_xstats_type type,
                              struct rtnl_hw_stats64 *stats, bool *used,
                              struct netlink_ext_ack *extack);
void
netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *rd,
                                   const struct rtnl_hw_stats64 *stats);
void
netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *ru);
void netdev_offload_xstats_push_delta(struct net_device *dev,
                                      enum netdev_offload_xstats_type type,
                                      const struct rtnl_hw_stats64 *stats);

/* Prepare @info for the event about @dev: the extack pointer is cleared
 * and the device pointer is recorded.
 */
static inline void netdev_notifier_info_init(struct netdev_notifier_info *info,
                                             struct net_device *dev)
{
        info->extack = NULL;
        info->dev = dev;
}

/* Accessor: the device a notification refers to. */
static inline struct net_device *
netdev_notifier_info_to_dev(const struct netdev_notifier_info *info)
{
        struct net_device *event_dev = info->dev;

        return event_dev;
}

/* Accessor: the extack pointer stored in a notification (may be NULL). */
static inline struct netlink_ext_ack *
netdev_notifier_info_to_extack(const struct netdev_notifier_info *info)
{
        struct netlink_ext_ack *event_extack = info->extack;

        return event_extack;
}

/* Notifier invocation entry points (defined outside this header). @val is
 * the event value; the _info variant takes a caller-prepared
 * struct netdev_notifier_info instead of a bare device.
 */
int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
int call_netdevice_notifiers_info(unsigned long val,
                                  struct netdev_notifier_info *info);

/* Iterators over the devices linked on (net)->dev_base_head through their
 * dev_list member. Each one wraps the list_for_each_entry*() helper of the
 * same flavour (plain, reverse, rcu, safe, continue, ...).
 *
 * for_each_netdev_in_bond_rcu() walks the devices of init_net and runs the
 * loop body only for those whose netdev_master_upper_dev_get_rcu() result
 * equals @bond. It expands to a loop followed by a bare "if", so beware of
 * a dangling "else" after it.
 *
 * net_device_entry() converts a dev_list list_head back to its net_device.
 */
#define for_each_netdev(net, d)                \
                list_for_each_entry(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_reverse(net, d)        \
                list_for_each_entry_reverse(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_rcu(net, d)                \
                list_for_each_entry_rcu(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_safe(net, d, n)        \
                list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list)
#define for_each_netdev_continue(net, d)                \
                list_for_each_entry_continue(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_continue_reverse(net, d)                \
                list_for_each_entry_continue_reverse(d, &(net)->dev_base_head, \
                                                     dev_list)
#define for_each_netdev_continue_rcu(net, d)                \
        list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_in_bond_rcu(bond, slave)        \
                for_each_netdev_rcu(&init_net, slave)        \
                        if (netdev_master_upper_dev_get_rcu(slave) == (bond))
#define net_device_entry(lh)        list_entry(lh, struct net_device, dev_list)

/* Walk (net)->dev_by_index with xa_find(), starting at the caller's current
 * @ifindex value. There is no init clause, so the caller must initialise
 * @ifindex; it is incremented after every iteration and must be an lvalue.
 */
#define for_each_netdev_dump(net, d, ifindex)                                \
        for (; (d = xa_find(&(net)->dev_by_index, &ifindex,                \
                            ULONG_MAX, XA_PRESENT)); ifindex++)

/* Return the device that follows @dev on its namespace's device list, or
 * NULL when @dev is the last entry (its successor is the list head).
 */
static inline struct net_device *next_net_device(struct net_device *dev)
{
        struct net *ns = dev_net(dev);
        struct list_head *succ = dev->dev_list.next;

        if (succ == &ns->dev_base_head)
                return NULL;

        return net_device_entry(succ);
}

/* Variant of next_net_device() that fetches the successor pointer with
 * rcu_dereference(). Returns NULL when the successor is the list head.
 */
static inline struct net_device *next_net_device_rcu(struct net_device *dev)
{
        struct net *ns = dev_net(dev);
        struct list_head *succ = rcu_dereference(list_next_rcu(&dev->dev_list));

        if (succ == &ns->dev_base_head)
                return NULL;

        return net_device_entry(succ);
}

/* Return the first device on @net's device list, or NULL if the list is
 * empty.
 */
static inline struct net_device *first_net_device(struct net *net)
{
        if (list_empty(&net->dev_base_head))
                return NULL;

        return net_device_entry(net->dev_base_head.next);
}

/* Variant of first_net_device() that fetches the first entry with
 * rcu_dereference(). Returns NULL when the list head points back to itself.
 */
static inline struct net_device *first_net_device_rcu(struct net *net)
{
        struct list_head *first;

        first = rcu_dereference(list_next_rcu(&net->dev_base_head));
        if (first == &net->dev_base_head)
                return NULL;

        return net_device_entry(first);
}

/* Core device API declarations; all are defined outside this header. */

/* Boot setup, lookup by hardware address/type, and packet type / offload
 * handler (un)registration.
 */
int netdev_boot_setup_check(struct net_device *dev);
struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
                                       const char *hwaddr);
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type);
void dev_add_pack(struct packet_type *pt);
void dev_remove_pack(struct packet_type *pt);
void __dev_remove_pack(struct packet_type *pt);
void dev_add_offload(struct packet_offload *po);
void dev_remove_offload(struct packet_offload *po);

/* Device queries, lookups by flags/name, naming, open/close and transmit
 * queue selection helpers.
 */
int dev_get_iflink(const struct net_device *dev);
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
                          struct net_device_path_stack *stack);
struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags,
                                      unsigned short mask);
struct net_device *dev_get_by_name(struct net *net, const char *name);
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name);
struct net_device *__dev_get_by_name(struct net *net, const char *name);
bool netdev_name_in_use(struct net *net, const char *name);
int dev_alloc_name(struct net_device *dev, const char *name);
int dev_open(struct net_device *dev, struct netlink_ext_ack *extack);
void dev_close(struct net_device *dev);
void dev_close_many(struct list_head *head, bool unlink);
void dev_disable_lro(struct net_device *dev);
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev);
u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
                       struct net_device *sb_dev);

/* Transmit entry points wrapped by the inline helpers that follow. */
int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev);
int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id);

/* Queue @skb for transmission without a subordinate device: forwards to
 * __dev_queue_xmit() with a NULL sb_dev and returns its result.
 */
static inline int dev_queue_xmit(struct sk_buff *skb)
{
        struct net_device *no_sb_dev = NULL;

        return __dev_queue_xmit(skb, no_sb_dev);
}

/* Queue @skb for transmission on behalf of @sb_dev: forwards both
 * arguments to __dev_queue_xmit() and returns its result.
 */
static inline int dev_queue_xmit_accel(struct sk_buff *skb,
                                       struct net_device *sb_dev)
{
        int rc = __dev_queue_xmit(skb, sb_dev);

        return rc;
}

/* Transmit @skb on queue @queue_id through __dev_direct_xmit(). If
 * dev_xmit_complete() says the returned code is not a completed transmit,
 * the skb is released here with kfree_skb(). The code from
 * __dev_direct_xmit() is returned unchanged in both cases.
 */
static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
        int rc = __dev_direct_xmit(skb, queue_id);

        if (dev_xmit_complete(rc))
                return rc;

        kfree_skb(skb);
        return rc;
}

/* Device registration / unregistration API (defined outside this header).
 * unregister_netdevice_queue() takes an optional list head; see
 * unregister_netdevice() below for the NULL-list usage.
 */
int register_netdevice(struct net_device *dev);
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
void unregister_netdevice_many(struct list_head *head);
/* Unregister @dev by calling unregister_netdevice_queue() with a NULL
 * list head.
 */
static inline void unregister_netdevice(struct net_device *dev)
{
        struct list_head *no_list = NULL;

        unregister_netdevice_queue(dev, no_list);
}

/* Device lifetime helpers (defined outside this header). */
int netdev_refcnt_read(const struct net_device *dev);
void free_netdev(struct net_device *dev);
void netdev_freemem(struct net_device *dev);
void init_dummy_netdev(struct net_device *dev);

/* Device lookup helpers (defined outside this header). The netdev_get_by_*()
 * variants additionally take a netdevice_tracker and gfp flags.
 */
struct net_device *netdev_get_xmit_slave(struct net_device *dev,
                                         struct sk_buff *skb,
                                         bool all_slaves);
struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
                                            struct sock *sk);
struct net_device *dev_get_by_index(struct net *net, int ifindex);
struct net_device *__dev_get_by_index(struct net *net, int ifindex);
struct net_device *netdev_get_by_index(struct net *net, int ifindex,
                                       netdevice_tracker *tracker, gfp_t gfp);
struct net_device *netdev_get_by_name(struct net *net, const char *name,
                                      netdevice_tracker *tracker, gfp_t gfp);
struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
struct net_device *dev_get_by_napi_id(unsigned int napi_id);
void netdev_copy_name(struct net_device *dev, char *name);

/* Build the link-layer header for @skb through the device's
 * header_ops->create() callback. Returns 0 without touching @skb when the
 * device has no header_ops or no create() callback; otherwise returns
 * whatever create() returns.
 */
static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
                                  unsigned short type,
                                  const void *daddr, const void *saddr,
                                  unsigned int len)
{
        if (dev->header_ops && dev->header_ops->create)
                return dev->header_ops->create(skb, dev, type, daddr,
                                               saddr, len);

        return 0;
}

/* Run the header_ops->parse() callback of @skb's device on @skb, passing
 * @haddr through. Returns 0 when the device has no header_ops or no
 * parse() callback; otherwise returns the callback's result.
 */
static inline int dev_parse_header(const struct sk_buff *skb,
                                   unsigned char *haddr)
{
        const struct net_device *ndev = skb->dev;

        if (ndev->header_ops && ndev->header_ops->parse)
                return ndev->header_ops->parse(skb, haddr);

        return 0;
}

/* Run the header_ops->parse_protocol() callback of @skb's device on @skb.
 * Returns 0 when the device has no header_ops or no parse_protocol()
 * callback; otherwise returns the callback's result.
 */
static inline __be16 dev_parse_header_protocol(const struct sk_buff *skb)
{
        const struct net_device *ndev = skb->dev;

        if (ndev->header_ops && ndev->header_ops->parse_protocol)
                return ndev->header_ops->parse_protocol(skb);

        return 0;
}

/* Decide whether a caller-supplied link-layer header of @len bytes is
 * acceptable for @dev.
 *
 * ll_header must have at least hard_header_len allocated.
 *
 *  - @len >= hard_header_len: accepted.
 *  - @len <  min_header_len:  rejected.
 *  - in between: a caller with CAP_SYS_RAWIO gets the missing tail
 *    zero-filled and is accepted; anyone else is accepted only if the
 *    device's header_ops->validate() callback exists and says so.
 */
static inline bool dev_validate_header(const struct net_device *dev,
                                       char *ll_header, int len)
{
        if (likely(len >= dev->hard_header_len))
                return true;
        if (len < dev->min_header_len)
                return false;

        if (!capable(CAP_SYS_RAWIO)) {
                if (!dev->header_ops || !dev->header_ops->validate)
                        return false;

                return dev->header_ops->validate(ll_header, len);
        }

        /* Privileged short header: pad it up to the full header length. */
        memset(ll_header + len, 0, dev->hard_header_len - len);
        return true;
}

static inline bool dev_has_header(const struct net_device *dev)
{
        return dev->header_ops && dev->header_ops->create;
}

/*
 * Incoming packets are placed on per-CPU queues
 *
 * One instance exists per CPU (see the DECLARE_PER_CPU_ALIGNED() that
 * follows). Several members are explicitly cacheline aligned to separate
 * data written by the owning CPU from data touched by other CPUs, so
 * member order matters.
 */
struct softnet_data {
        struct list_head        poll_list;
        struct sk_buff_head        process_queue;
        local_lock_t                process_queue_bh_lock;

        /* stats */
        unsigned int                processed;
        unsigned int                time_squeeze;
#ifdef CONFIG_RPS
        struct softnet_data        *rps_ipi_list;
#endif

        unsigned int                received_rps;
        bool                        in_net_rx_action;
        bool                        in_napi_threaded_poll;

#ifdef CONFIG_NET_FLOW_LIMIT
        struct sd_flow_limit __rcu *flow_limit;
#endif
        struct Qdisc                *output_queue;
        struct Qdisc                **output_queue_tailp;
        struct sk_buff                *completion_queue;
#ifdef CONFIG_XFRM_OFFLOAD
        struct sk_buff_head        xfrm_backlog;
#endif
        /* written and read only by owning cpu: */
        struct netdev_xmit xmit;
#ifdef CONFIG_RPS
        /* input_queue_head should be written by cpu owning this struct,
         * and only read by other cpus. Worth using a cache line.
         */
        unsigned int                input_queue_head ____cacheline_aligned_in_smp;

        /* Elements below can be accessed between CPUs for RPS/RFS */
        call_single_data_t        csd ____cacheline_aligned_in_smp;
        struct softnet_data        *rps_ipi_next;
        unsigned int                cpu;
        unsigned int                input_queue_tail;
#endif
        struct sk_buff_head        input_pkt_queue;
        struct napi_struct        backlog;

        /* Kept on a cache line of its own. */
        atomic_t                dropped ____cacheline_aligned_in_smp;

        /* Another possibly contended cache line */
        spinlock_t                defer_lock ____cacheline_aligned_in_smp;
        int                        defer_count;
        int                        defer_ipi_scheduled;
        struct sk_buff                *defer_list;
        call_single_data_t        defer_csd;
};

/* Per-CPU softnet_data instance; declared here, defined elsewhere. */
DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);

/* Current transmit recursion depth. With CONFIG_PREEMPT_RT the counter is
 * read from the current task (current->net_xmit); otherwise it is read
 * from this CPU's softnet_data.xmit.
 */
static inline int dev_recursion_level(void)
{
#ifdef CONFIG_PREEMPT_RT
        return current->net_xmit.recursion;
#else
        return this_cpu_read(softnet_data.xmit.recursion);
#endif
}


/* Queue scheduling entry points (defined outside this header): one takes
 * a Qdisc, the other a TX queue.
 */
void __netif_schedule(struct Qdisc *q);
void netif_schedule_queue(struct netdev_queue *txq);

/* Call netif_schedule_queue() on each of @dev's num_tx_queues TX queues,
 * in index order.
 */
static inline void netif_tx_schedule_all(struct net_device *dev)
{
        unsigned int qidx = 0;

        while (qidx < dev->num_tx_queues) {
                netif_schedule_queue(netdev_get_tx_queue(dev, qidx));
                qidx++;
        }
}

/* Mark @dev_queue as started by clearing its driver XOFF state bit
 * (__QUEUE_STATE_DRV_XOFF) with clear_bit(). Nothing else is done here;
 * compare netif_tx_wake_queue(), which is an out-of-line function.
 */
static __always_inline void netif_tx_start_queue(struct netdev_queue *dev_queue)
{
        clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}

/**
 *        netif_start_queue - allow transmit
 *        @dev: network device
 *
 *        Allow upper layers to call the device hard_start_xmit routine.
 *        Acts on TX queue 0 of @dev only.
 */
static inline void netif_start_queue(struct net_device *dev)
{
        struct netdev_queue *txq0 = netdev_get_tx_queue(dev, 0);

        netif_tx_start_queue(txq0);
}

/* Apply netif_tx_start_queue() to each of @dev's num_tx_queues TX queues,
 * in index order.
 */
static inline void netif_tx_start_all_queues(struct net_device *dev)
{
        unsigned int qidx;

        for (qidx = 0; qidx < dev->num_tx_queues; qidx++)
                netif_tx_start_queue(netdev_get_tx_queue(dev, qidx));
}

/* Wake a single TX queue; defined outside this header. */
void netif_tx_wake_queue(struct netdev_queue *dev_queue);

/**
 *        netif_wake_queue - restart transmit
 *        @dev: network device
 *
 *        Allow upper layers to call the device hard_start_xmit routine.
 *        Used for flow control when transmit resources are available.
 *        Acts on TX queue 0 of @dev only.
 */
static inline void netif_wake_queue(struct net_device *dev)
{
        struct netdev_queue *txq0 = netdev_get_tx_queue(dev, 0);

        netif_tx_wake_queue(txq0);
}

/* Apply netif_tx_wake_queue() to each of @dev's num_tx_queues TX queues,
 * in index order.
 */
static inline void netif_tx_wake_all_queues(struct net_device *dev)
{
        unsigned int qidx;

        for (qidx = 0; qidx < dev->num_tx_queues; qidx++)
                netif_tx_wake_queue(netdev_get_tx_queue(dev, qidx));
}

/* Mark @dev_queue as stopped by the driver: sets the
 * __QUEUE_STATE_DRV_XOFF bit in the queue state.
 */
static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
{
        /* Must be an atomic op see netif_txq_try_stop() */
        set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}

/**
 *        netif_stop_queue - stop transmitted packets
 *        @dev: network device
 *
 *        Stop upper layers calling the device hard_start_xmit routine.
 *        Used for flow control when transmit resources are unavailable.
 *        Acts on TX queue 0 of @dev only.
 */
static inline void netif_stop_queue(struct net_device *dev)
{
        struct netdev_queue *txq0 = netdev_get_tx_queue(dev, 0);

        netif_tx_stop_queue(txq0);
}

/* Stop all TX queues of a device; defined outside this header. */
void netif_tx_stop_all_queues(struct net_device *dev);

/* True when the driver XOFF bit (__QUEUE_STATE_DRV_XOFF) is set in
 * @dev_queue's state.
 */
static inline bool netif_tx_queue_stopped(const struct netdev_queue *dev_queue)
{
        bool drv_xoff = test_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);

        return drv_xoff;
}

/**
 *        netif_queue_stopped - test if transmit queue is flowblocked
 *        @dev: network device
 *
 *        Test if transmit queue on device is currently unable to send.
 *        Only TX queue 0 of @dev is examined.
 */
static inline bool netif_queue_stopped(const struct net_device *dev)
{
        struct netdev_queue *txq0 = netdev_get_tx_queue(dev, 0);

        return netif_tx_queue_stopped(txq0);
}

/* True when any bit of QUEUE_STATE_ANY_XOFF is set in @dev_queue's state. */
static inline bool netif_xmit_stopped(const struct netdev_queue *dev_queue)
{
        return (dev_queue->state & QUEUE_STATE_ANY_XOFF) != 0;
}

/* True when any bit of QUEUE_STATE_ANY_XOFF_OR_FROZEN is set in
 * @dev_queue's state.
 */
static inline bool
netif_xmit_frozen_or_stopped(const struct netdev_queue *dev_queue)
{
        return (dev_queue->state & QUEUE_STATE_ANY_XOFF_OR_FROZEN) != 0;
}

/* True when any bit of QUEUE_STATE_DRV_XOFF_OR_FROZEN is set in
 * @dev_queue's state.
 */
static inline bool
netif_xmit_frozen_or_drv_stopped(const struct netdev_queue *dev_queue)
{
        return (dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN) != 0;
}

/**
 *        netdev_queue_set_dql_min_limit - set dql minimum limit
 *        @dev_queue: pointer to transmit queue
 *        @min_limit: dql minimum limit
 *
 * Forces xmit_more() to return true until the minimum threshold
 * defined by @min_limit is reached (or until the tx queue is
 * empty). Warning: to be used with care, misuse will impact the
 * latency.
 *
 * This is a no-op when CONFIG_BQL is not enabled.
 */
static inline void netdev_queue_set_dql_min_limit(struct netdev_queue *dev_queue,
                                                  unsigned int min_limit)
{
#ifdef CONFIG_BQL
        dev_queue->dql.min_limit = min_limit;
#endif
}

/* Return dql_avail() for @txq's dql when CONFIG_BQL is enabled, and 0
 * otherwise.
 */
static inline int netdev_queue_dql_avail(const struct netdev_queue *txq)
{
        int avail = 0;

#ifdef CONFIG_BQL
        /* Non-BQL migrated drivers will return 0, too. */
        avail = dql_avail(&txq->dql);
#endif
        return avail;
}

/**
 *        netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write
 *        @dev_queue: pointer to transmit queue
 *
 * BQL enabled drivers might use this helper in their ndo_start_xmit(),
 * to give appropriate hint to the CPU.
 *
 * Prefetches dql.num_queued for write; a no-op without CONFIG_BQL.
 */
static inline void netdev_txq_bql_enqueue_prefetchw(struct netdev_queue *dev_queue)
{
#ifdef CONFIG_BQL
        prefetchw(&dev_queue->dql.num_queued);
#endif
}

/**
 *        netdev_txq_bql_complete_prefetchw - prefetch bql data for write
 *        @dev_queue: pointer to transmit queue
 *
 * BQL enabled drivers might use this helper in their TX completion path,
 * to give appropriate hint to the CPU.
 *
 * Prefetches dql.limit for write; a no-op without CONFIG_BQL.
 */
static inline void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_queue)
{
#ifdef CONFIG_BQL
        prefetchw(&dev_queue->dql.limit);
#endif
}

/**
 *        netdev_tx_sent_queue - report the number of bytes queued to a given tx queue
 *        @dev_queue: network device queue
 *        @bytes: number of bytes queued to the device queue
 *
 *        Report the number of bytes queued for sending/completion to the network
 *        device hardware queue. @bytes should be a good approximation and should
 *        exactly match netdev_completed_queue() @bytes.
 *        This is typically called once per packet, from ndo_start_xmit().
 *
 *        When the dql reports no room left, the stack XOFF bit is set on the
 *        queue; it is cleared again right away if a re-check after the memory
 *        barrier finds room. This is a no-op when CONFIG_BQL is not enabled.
 */
static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue,
                                        unsigned int bytes)
{
#ifdef CONFIG_BQL
        dql_queued(&dev_queue->dql, bytes);

        /* Fast path: still room in the limit, nothing to stop. */
        if (likely(dql_avail(&dev_queue->dql) >= 0))
                return;

        set_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state);

        /*
         * The XOFF flag must be set before checking the dql_avail below,
         * because in netdev_tx_completed_queue we update the dql_completed
         * before checking the XOFF flag.
         */
        smp_mb();

        /* check again in case another CPU has just made room avail */
        if (unlikely(dql_avail(&dev_queue->dql) >= 0))
                clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state);
#endif
}

/* Batch-aware flavour of netdev_tx_sent_queue(), for drivers that are aware
 * that they should not test BQL status themselves.
 *
 * __QUEUE_STATE_STACK_XOFF is only evaluated for the last skb of a batch
 * (@xmit_more false). For earlier skbs the bytes are merely accounted
 * (when CONFIG_BQL is enabled).
 *
 * Returns true if the doorbell must be used to kick the NIC: always for
 * the last skb of a batch, otherwise only when the driver has stopped the
 * queue.
 */
static inline bool __netdev_tx_sent_queue(struct netdev_queue *dev_queue,
                                          unsigned int bytes,
                                          bool xmit_more)
{
        if (!xmit_more) {
                netdev_tx_sent_queue(dev_queue, bytes);
                return true;
        }

#ifdef CONFIG_BQL
        dql_queued(&dev_queue->dql, bytes);
#endif
        return netif_tx_queue_stopped(dev_queue);
}

/**
 *        netdev_sent_queue - report the number of bytes queued to hardware
 *        @dev: network device
 *        @bytes: number of bytes queued to the hardware device queue
 *
 *        Single-queue convenience wrapper: reports @bytes against hardware
 *        queue#0 of @dev via netdev_tx_sent_queue(). @bytes should be a good
 *        approximation and should exactly match netdev_completed_queue()
 *        @bytes.
 *        This is typically called once per packet, from ndo_start_xmit().
 */
static inline void netdev_sent_queue(struct net_device *dev, unsigned int bytes)
{
        struct netdev_queue *txq0 = netdev_get_tx_queue(dev, 0);

        netdev_tx_sent_queue(txq0, bytes);
}

/* Single-queue convenience wrapper around __netdev_tx_sent_queue(): acts
 * on TX queue 0 of @dev and returns that function's result.
 */
static inline bool __netdev_sent_queue(struct net_device *dev,
                                       unsigned int bytes,
                                       bool xmit_more)
{
        struct netdev_queue *txq0 = netdev_get_tx_queue(dev, 0);

        return __netdev_tx_sent_queue(txq0, bytes, xmit_more);
}

/**
 *        netdev_tx_completed_queue - report number of packets/bytes at TX completion.
 *        @dev_queue: network device queue
 *        @pkts: number of packets (currently ignored)
 *        @bytes: number of bytes dequeued from the device queue
 *
 *        Must be called at most once per TX completion round (and not per
 *        individual packet), so that BQL can adjust its limits appropriately.
 *
 *        Does nothing when @bytes is zero or when CONFIG_BQL is not enabled.
 */
static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue,
                                             unsigned int pkts, unsigned int bytes)
{
#ifdef CONFIG_BQL
        if (unlikely(!bytes))
                return;

        dql_completed(&dev_queue->dql, bytes);

        /*
         * Without the memory barrier there is a small possibility that
         * netdev_tx_sent_queue will miss the update and cause the queue to
         * be stopped forever
         */
        smp_mb(); /* NOTE: netdev_txq_completed_mb() assumes this exists */

        /* Still over the limit: leave the stack XOFF bit as it is. */
        if (unlikely(dql_avail(&dev_queue->dql) < 0))
                return;

        /* Room is available: reschedule the queue if it had been stopped. */
        if (test_and_clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state))
                netif_schedule_queue(dev_queue);
#endif
}

/**
 *         netdev_completed_queue - report bytes and packets completed by device
 *         @dev: network device
 *         @pkts: actual number of packets sent over the medium
 *         @bytes: actual number of bytes sent over the medium
 *
 *         Single-queue convenience wrapper: reports the completion against
 *         hardware queue 0 of @dev via netdev_tx_completed_queue(). @bytes
 *         must exactly match the @bytes amount passed to netdev_sent_queue()
 */
static inline void netdev_completed_queue(struct net_device *dev,
                                          unsigned int pkts, unsigned int bytes)
{
        struct netdev_queue *txq0 = netdev_get_tx_queue(dev, 0);

        netdev_tx_completed_queue(txq0, pkts, bytes);
}

/* Reset the BQL state of TX queue @q: clear its stack XOFF bit, then reset
 * its dql with dql_reset(). This is a no-op when CONFIG_BQL is not enabled.
 */
static inline void netdev_tx_reset_queue(struct netdev_queue *q)
{
#ifdef CONFIG_BQL
        clear_bit(__QUEUE_STATE_STACK_XOFF, &q->state);
        dql_reset(&q->dql);
#endif
}

/**
 *         netdev_reset_queue - reset the packets and bytes count of a network device
 *         @dev_queue: network device
 *
 *         Reset the bytes and packet count of a network device and clear the
 *         software flow control OFF bit for this network device. Only TX
 *         queue 0 is reset, through netdev_tx_reset_queue().
 */
static inline void netdev_reset_queue(struct net_device *dev_queue)
{
        struct netdev_queue *txq0 = netdev_get_tx_queue(dev_queue, 0);

        netdev_tx_reset_queue(txq0);
}

/**
 *         netdev_cap_txqueue - check if selected tx queue exceeds device queues
 *         @dev: network device
 *         @queue_index: given tx queue index
 *
 *         Returns @queue_index unchanged when it is below the device's
 *         real_num_tx_queues. Otherwise emits a rate-limited warning and
 *         returns 0.
 */
static inline u16 netdev_cap_txqueue(struct net_device *dev, u16 queue_index)
{
        if (likely(queue_index < dev->real_num_tx_queues))
                return queue_index;

        net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
                             dev->name, queue_index,
                             dev->real_num_tx_queues);
        return 0;
}

/**
 *        netif_running - test if up
 *        @dev: network device
 *
 *        Test if the device has been brought up, i.e. whether the
 *        __LINK_STATE_START bit is set in the device state.
 */
static inline bool netif_running(const struct net_device *dev)
{
        bool started = test_bit(__LINK_STATE_START, &dev->state);

        return started;
}

/*
 * Routines to manage the subqueues on a device.  We only need start,
 * stop, and a check if it's stopped.  All other device management is
 * done at the overall netdevice level.
 * Also test the device if we're multiqueue.
 */

/**
 *        netif_start_subqueue - allow sending packets on subqueue
 *        @dev: network device
 *        @queue_index: sub queue index
 *
 * Start the transmit queue numbered @queue_index on a device with multiple
 * transmit queues.
 */
static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index)
{
        netif_tx_start_queue(netdev_get_tx_queue(dev, queue_index));
}

/**
 *        netif_stop_subqueue - stop sending packets on subqueue
 *        @dev: network device
 *        @queue_index: sub queue index
 *
 * Stop the transmit queue numbered @queue_index on a device with multiple
 * transmit queues.
 */
static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index)
{
        netif_tx_stop_queue(netdev_get_tx_queue(dev, queue_index));
}

/**
 *        __netif_subqueue_stopped - test status of subqueue
 *        @dev: network device
 *        @queue_index: sub queue index
 *
 * Report whether the transmit queue numbered @queue_index has been stopped
 * by the driver, on a device with multiple transmit queues.
 */
static inline bool __netif_subqueue_stopped(const struct net_device *dev,
                                            u16 queue_index)
{
        return netif_tx_queue_stopped(netdev_get_tx_queue(dev, queue_index));
}

/**
 *        netif_subqueue_stopped - test status of subqueue
 *        @dev: network device
 *        @skb: sub queue buffer pointer
 *
 * Check individual transmit queue of a device with multiple transmit queues.
 */
static inline bool netif_subqueue_stopped(const struct net_device *dev,
                                          struct sk_buff *skb)
{
        return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb));
}

/**
 *        netif_wake_subqueue - allow sending packets on subqueue
 *        @dev: network device
 *        @queue_index: sub queue index
 *
 * Resume the transmit queue numbered @queue_index on a device with multiple
 * transmit queues.
 */
static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
{
        netif_tx_wake_queue(netdev_get_tx_queue(dev, queue_index));
}

#ifdef CONFIG_XPS
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
                        u16 index);
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
                          u16 index, enum xps_map_type type);

/**
 *        netif_attr_test_mask - test one CPU/Rx queue bit in a mask
 *        @j: CPU/Rx queue index
 *        @mask: bitmask of all cpus/rx queues
 *        @nr_bits: number of bits in the bitmask
 *
 * Passes @j and @nr_bits to cpu_max_bits_warn(), then returns whether
 * bit @j is set in @mask.
 */
static inline bool netif_attr_test_mask(unsigned long j,
                                        const unsigned long *mask,
                                        unsigned int nr_bits)
{
        bool is_set;

        cpu_max_bits_warn(j, nr_bits);
        is_set = test_bit(j, mask);

        return is_set;
}

/**
 *        netif_attr_test_online - test whether a CPU/Rx queue is online
 *        @j: CPU/Rx queue index
 *        @online_mask: bitmask of online CPUs/Rx queues, may be NULL
 *        @nr_bits: number of bits in the bitmask
 *
 * With a mask, returns whether bit @j is set in it.  Without one,
 * returns whether @j is below @nr_bits.
 */
static inline bool netif_attr_test_online(unsigned long j,
                                          const unsigned long *online_mask,
                                          unsigned int nr_bits)
{
        cpu_max_bits_warn(j, nr_bits);

        /* no mask supplied: fall back to a plain range check */
        if (!online_mask)
                return (j < nr_bits);

        return test_bit(j, online_mask);
}

/**
 *        netif_attrmask_next - find the next CPU/Rx queue set in a mask
 *        @n: CPU/Rx queue index to search after (-1 starts from bit 0)
 *        @srcp: the cpumask/Rx queue mask pointer, may be NULL
 *        @nr_bits: number of bits in the bitmask
 *
 * Returns the next set bit after @n, or simply @n + 1 when @srcp is NULL.
 * Returns >= nr_bits if no further CPUs/Rx queues set.
 */
static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp,
                                               unsigned int nr_bits)
{
        const int start = n + 1;

        /* -1 is a legal arg here, so it is exempt from the range warning. */
        if (n != -1)
                cpu_max_bits_warn(n, nr_bits);

        if (!srcp)
                return start;

        return find_next_bit(srcp, nr_bits, start);
}

/**
 *        netif_attrmask_next_and - find the next bit set in \*src1p & \*src2p
 *        @n: CPU/Rx queue index to search after (-1 starts from bit 0)
 *        @src1p: the first CPUs/Rx queues mask pointer, may be NULL
 *        @src2p: the second CPUs/Rx queues mask pointer, may be NULL
 *        @nr_bits: number of bits in the bitmask
 *
 * When only one mask is given the search runs on that mask alone; when
 * neither is given @n + 1 is returned.
 * Returns >= nr_bits if no further CPUs/Rx queues set in both.
 */
static inline int netif_attrmask_next_and(int n, const unsigned long *src1p,
                                          const unsigned long *src2p,
                                          unsigned int nr_bits)
{
        const int start = n + 1;

        /* -1 is a legal arg here, so it is exempt from the range warning. */
        if (n != -1)
                cpu_max_bits_warn(n, nr_bits);

        if (!src1p && !src2p)
                return start;

        if (!src2p)
                return find_next_bit(src1p, nr_bits, start);

        if (!src1p)
                return find_next_bit(src2p, nr_bits, start);

        return find_next_and_bit(src1p, src2p, nr_bits, start);
}
#else
/* Stub used when CONFIG_XPS is not set: does nothing and returns 0. */
static inline int netif_set_xps_queue(struct net_device *dev,
                                      const struct cpumask *mask,
                                      u16 index)
{
        return 0;
}

/* Stub used when CONFIG_XPS is not set: does nothing and returns 0. */
static inline int __netif_set_xps_queue(struct net_device *dev,
                                        const unsigned long *mask,
                                        u16 index, enum xps_map_type type)
{
        return 0;
}
#endif

/**
 *        netif_is_multiqueue - test if device has multiple transmit queues
 *        @dev: network device
 *
 * Returns true when @dev->num_tx_queues is greater than one.
 */
static inline bool netif_is_multiqueue(const struct net_device *dev)
{
        const bool multiqueue = dev->num_tx_queues > 1;

        return multiqueue;
}

int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq);

#ifdef CONFIG_SYSFS
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq);
#else
/*
 * Variant used when CONFIG_SYSFS is not set: records the requested count
 * in dev->real_num_rx_queues and returns 0.
 */
static inline int netif_set_real_num_rx_queues(struct net_device *dev,
                                                unsigned int rxqs)
{
        dev->real_num_rx_queues = rxqs;

        return 0;
}
#endif
int netif_set_real_num_queues(struct net_device *dev,
                              unsigned int txq, unsigned int rxq);

int netif_get_num_default_rss_queues(void);

void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason);
void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason);

/*
 * It is not allowed to call kfree_skb() or consume_skb() from hardware
 * interrupt context or with hardware interrupts being disabled.
 * (in_hardirq() || irqs_disabled())
 *
 * We provide four helpers that can be used in the following contexts:
 *
 * dev_kfree_skb_irq(skb) when caller drops a packet from irq context,
 *  replacing kfree_skb(skb)
 *
 * dev_consume_skb_irq(skb) when caller consumes a packet from irq context.
 *  Typically used in place of consume_skb(skb) in TX completion path
 *
 * dev_kfree_skb_any(skb) when caller doesn't know its current irq context,
 *  replacing kfree_skb(skb)
 *
 * dev_consume_skb_any(skb) when caller doesn't know its current irq context,
 *  and consumed a packet. Used in place of consume_skb(skb)
 */
/* Drop @skb from irq context, with no specific drop reason recorded. */
static inline void dev_kfree_skb_irq(struct sk_buff *skb)
{
        dev_kfree_skb_irq_reason(skb,
                                 SKB_DROP_REASON_NOT_SPECIFIED);
}

/* Release @skb from irq context, marking it as consumed rather than dropped. */
static inline void dev_consume_skb_irq(struct sk_buff *skb)
{
        dev_kfree_skb_irq_reason(skb,
                                 SKB_CONSUMED);
}

/* Drop @skb from an unknown irq context, with no specific drop reason. */
static inline void dev_kfree_skb_any(struct sk_buff *skb)
{
        dev_kfree_skb_any_reason(skb,
                                 SKB_DROP_REASON_NOT_SPECIFIED);
}

/* Release @skb from an unknown irq context, marking it as consumed. */
static inline void dev_consume_skb_any(struct sk_buff *skb)
{
        dev_kfree_skb_any_reason(skb,
                                 SKB_CONSUMED);
}

u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
                             struct bpf_prog *xdp_prog);
void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb);
int netif_rx(struct sk_buff *skb);
int __netif_rx(struct sk_buff *skb);

int netif_receive_skb(struct sk_buff *skb);
int netif_receive_skb_core(struct sk_buff *skb);
void netif_receive_skb_list_internal(struct list_head *head);
void netif_receive_skb_list(struct list_head *head);
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
void napi_gro_flush(struct napi_struct *napi, bool flush_old);
struct sk_buff *napi_get_frags(struct napi_struct *napi);
void napi_get_frags_check(struct napi_struct *napi);
gro_result_t napi_gro_frags(struct napi_struct *napi);

/* Free the skb currently attached to @napi and clear the pointer. */
static inline void napi_free_frags(struct napi_struct *napi)
{
        struct sk_buff *frags_skb = napi->skb;

        kfree_skb(frags_skb);
        napi->skb = NULL;
}

bool netdev_is_rx_handler_busy(struct net_device *dev);
int netdev_rx_handler_register(struct net_device *dev,
                               rx_handler_func_t *rx_handler,
                               void *rx_handler_data);
void netdev_rx_handler_unregister(struct net_device *dev);

bool dev_valid_name(const char *name);
/* True when the ioctl type field of @cmd equals SOCK_IOC_TYPE. */
static inline bool is_socket_ioctl_cmd(unsigned int cmd)
{
        return SOCK_IOC_TYPE == _IOC_TYPE(cmd);
}
int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg);
int put_user_ifreq(struct ifreq *ifr, void __user *arg);
int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
                void __user *data, bool *need_copyout);
int dev_ifconf(struct net *net, struct ifconf __user *ifc);
int generic_hwtstamp_get_lower(struct net_device *dev,
                               struct kernel_hwtstamp_config *kernel_cfg);
int generic_hwtstamp_set_lower(struct net_device *dev,
                               struct kernel_hwtstamp_config *kernel_cfg,
                               struct netlink_ext_ack *extack);
int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata);
unsigned int dev_get_flags(const struct net_device *);
int __dev_change_flags(struct net_device *dev, unsigned int flags,
                       struct netlink_ext_ack *extack);
int dev_change_flags(struct net_device *dev, unsigned int flags,
                     struct netlink_ext_ack *extack);
int dev_set_alias(struct net_device *, const char *, size_t);
int dev_get_alias(const struct net_device *, char *, size_t);
int __dev_change_net_namespace(struct net_device *dev, struct net *net,
                               const char *pat, int new_ifindex);
/*
 * Convenience wrapper around __dev_change_net_namespace() that passes 0
 * as new_ifindex.
 */
static inline
int dev_change_net_namespace(struct net_device *dev, struct net *net,
                             const char *pat)
{
        const int new_ifindex = 0;

        return __dev_change_net_namespace(dev, net, pat, new_ifindex);
}
int __dev_set_mtu(struct net_device *, int);
int dev_set_mtu(struct net_device *, int);
int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
                              struct netlink_ext_ack *extack);
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
                        struct netlink_ext_ack *extack);
int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
                             struct netlink_ext_ack *extack);
int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
int dev_get_port_parent_id(struct net_device *dev,
                           struct netdev_phys_item_id *ppid, bool recurse);
bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b);

struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again);
struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                                    struct netdev_queue *txq, int *ret);

int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
u8 dev_xdp_prog_count(struct net_device *dev);
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb);
bool is_skb_forwardable(const struct net_device *dev,
                        const struct sk_buff *skb);

/*
 * Decide whether @skb may be forwarded to @dev.
 *
 * A device without IFF_UP never accepts the packet.  With @check_mtu
 * false no length check is done.  Otherwise the packet must fit in
 * mtu + hard_header_len + one VLAN header, unless it is a GSO packet.
 */
static __always_inline bool __is_skb_forwardable(const struct net_device *dev,
                                                 const struct sk_buff *skb,
                                                 const bool check_mtu)
{
        const u32 vlan_hdr_len = 4; /* VLAN_HLEN */
        unsigned int max_len;

        if (!(dev->flags & IFF_UP))
                return false;

        if (!check_mtu)
                return true;

        max_len = dev->mtu + dev->hard_header_len + vlan_hdr_len;

        /* if TSO is enabled, we don't care about the length as the packet
         * could be forwarded without being segmented before
         */
        return skb->len <= max_len || skb_is_gso(skb);
}

void netdev_core_stats_inc(struct net_device *dev, u32 offset);

/*
 * DEV_CORE_STATS_INC(FIELD) generates an inline helper named
 * dev_core_stats_<FIELD>_inc() that calls netdev_core_stats_inc() with the
 * offset of FIELD inside struct net_device_core_stats.  It is instantiated
 * for the four counters listed below and then undefined again.
 */
#define DEV_CORE_STATS_INC(FIELD)                                                \
static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev)                \
{                                                                                \
        netdev_core_stats_inc(dev,                                                \
                        offsetof(struct net_device_core_stats, FIELD));                \
}
DEV_CORE_STATS_INC(rx_dropped)
DEV_CORE_STATS_INC(tx_dropped)
DEV_CORE_STATS_INC(rx_nohandler)
DEV_CORE_STATS_INC(rx_otherhost_dropped)
#undef DEV_CORE_STATS_INC

/*
 * Prepare @skb for forwarding to @dev.
 *
 * If orphaning the frags fails or the packet is not forwardable, the
 * rx_dropped core counter of @dev is bumped, the skb is freed and
 * NET_RX_DROP is returned.  Otherwise the packet is scrubbed, its
 * priority is reset to 0 and 0 is returned.
 */
static __always_inline int ____dev_forward_skb(struct net_device *dev,
                                               struct sk_buff *skb,
                                               const bool check_mtu)
{
        if (skb_orphan_frags(skb, GFP_ATOMIC))
                goto drop;

        if (unlikely(!__is_skb_forwardable(dev, skb, check_mtu)))
                goto drop;

        skb_scrub_packet(skb, !net_eq(dev_net(dev), dev_net(skb->dev)));
        skb->priority = 0;
        return 0;

drop:
        dev_core_stats_rx_dropped_inc(dev);
        kfree_skb(skb);
        return NET_RX_DROP;
}

bool dev_nit_active(struct net_device *dev);
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);

/*
 * Drop one reference on @dev without touching the tracker; NULL is ignored.
 * Uses the per-cpu counter or the refcount_t depending on
 * CONFIG_PCPU_DEV_REFCNT.
 */
static inline void __dev_put(struct net_device *dev)
{
        if (!dev)
                return;

#ifdef CONFIG_PCPU_DEV_REFCNT
        this_cpu_dec(*dev->pcpu_refcnt);
#else
        refcount_dec(&dev->dev_refcnt);
#endif
}

/*
 * Take one reference on @dev without touching the tracker; NULL is ignored.
 * Uses the per-cpu counter or the refcount_t depending on
 * CONFIG_PCPU_DEV_REFCNT.
 */
static inline void __dev_hold(struct net_device *dev)
{
        if (!dev)
                return;

#ifdef CONFIG_PCPU_DEV_REFCNT
        this_cpu_inc(*dev->pcpu_refcnt);
#else
        refcount_inc(&dev->dev_refcnt);
#endif
}

/*
 * Record @tracker in the reference tracker of @dev.  Compiles to an
 * empty function unless CONFIG_NET_DEV_REFCNT_TRACKER is set.
 */
static inline void __netdev_tracker_alloc(struct net_device *dev,
                                          netdevice_tracker *tracker,
                                          gfp_t gfp)
{
#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
        ref_tracker_alloc(&dev->refcnt_tracker,
                          tracker, gfp);
#endif
}

/* netdev_tracker_alloc() turns a reference that was taken untracked, e.g.
 * by dev_get_by_name()/dev_get_by_index(), into a tracked one: the
 * no_tracker count is decremented before the tracker entry is recorded.
 * Compiles to an empty function unless CONFIG_NET_DEV_REFCNT_TRACKER is set.
 */
static inline void netdev_tracker_alloc(struct net_device *dev,
                                        netdevice_tracker *tracker, gfp_t gfp)
{
#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
        refcount_dec(&dev->refcnt_tracker.no_tracker);

        __netdev_tracker_alloc(dev, tracker, gfp);
#endif
}

/*
 * Remove @tracker from the reference tracker of @dev.  Compiles to an
 * empty function unless CONFIG_NET_DEV_REFCNT_TRACKER is set.
 */
static inline void netdev_tracker_free(struct net_device *dev,
                                       netdevice_tracker *tracker)
{
#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
        ref_tracker_free(&dev->refcnt_tracker,
                         tracker);
#endif
}

/*
 * Take a reference on @dev and record it in @tracker.
 * A NULL @dev is ignored.
 */
static inline void netdev_hold(struct net_device *dev,
                               netdevice_tracker *tracker, gfp_t gfp)
{
        if (!dev)
                return;

        __dev_hold(dev);
        __netdev_tracker_alloc(dev, tracker, gfp);
}

/*
 * Release the tracker entry @tracker and then drop the reference on @dev.
 * A NULL @dev is ignored.
 */
static inline void netdev_put(struct net_device *dev,
                              netdevice_tracker *tracker)
{
        if (!dev)
                return;

        netdev_tracker_free(dev, tracker);
        __dev_put(dev);
}

/**
 *        dev_hold - get reference to device
 *        @dev: network device
 *
 * Takes a reference on @dev to keep it from being freed.  This is
 * netdev_hold() with a NULL tracker and GFP_ATOMIC; prefer calling
 * netdev_hold() directly.
 */
static inline void dev_hold(struct net_device *dev)
{
        netdev_hold(dev,
                    NULL, GFP_ATOMIC);
}

/**
 *        dev_put - release reference to device
 *        @dev: network device
 *
 * Drops a reference on @dev so that it may be freed.  This is
 * netdev_put() with a NULL tracker; prefer calling netdev_put() directly.
 */
static inline void dev_put(struct net_device *dev)
{
        netdev_put(dev,
                   NULL);
}

/* Cleanup helper named "dev_put" for struct net_device pointers: the
 * expression calls dev_put() only when the pointer is non-NULL.
 * NOTE(review): presumably consumed through __free(dev_put) scope-based
 * cleanup - confirm against the DEFINE_FREE() definition.
 */
DEFINE_FREE(dev_put, struct net_device *, if (_T) dev_put(_T))

/*
 * Move the reference (and the @tracker entry) held on @odev over to @ndev.
 * Either device may be NULL: the tracker calls are guarded here, and
 * __dev_hold()/__dev_put() ignore NULL themselves.
 */
static inline void netdev_ref_replace(struct net_device *odev,
                                      struct net_device *ndev,
                                      netdevice_tracker *tracker,
                                      gfp_t gfp)
{
        /* release the tracker entry recorded for the old device */
        if (odev)
                netdev_tracker_free(odev, tracker);

        /* NOTE(review): the hold on ndev comes before the put on odev,
         * presumably so the count never dips when both are the same
         * device - confirm before reordering.
         */
        __dev_hold(ndev);
        __dev_put(odev);

        /* reuse the same tracker slot for the new device */
        if (ndev)
                __netdev_tracker_alloc(ndev, tracker, gfp);
}

/* Carrier loss detection, dial on demand. The functions netif_carrier_on
 * and _off may be called from IRQ context, but it is the caller
 * who is responsible for serialization of these calls.
 *
 * The name carrier is inappropriate, these functions should really be
 * called netif_lowerlayer_*() because they represent the state of any
 * kind of lower layer not just hardware media.
 */
void linkwatch_fire_event(struct net_device *dev);

/**
 * linkwatch_sync_dev - sync linkwatch for the given device
 * @dev: network device to sync linkwatch for
 *
 * Sync linkwatch for the given device, removing it from the
 * pending work list (if queued).
 */
void linkwatch_sync_dev(struct net_device *dev);

/**
 *        netif_carrier_ok - test if carrier present
 *        @dev: network device
 *
 * Returns true when __LINK_STATE_NOCARRIER is not set in @dev->state.
 */
static inline bool netif_carrier_ok(const struct net_device *dev)
{
        const bool no_carrier = test_bit(__LINK_STATE_NOCARRIER, &dev->state);

        return !no_carrier;
}

unsigned long dev_trans_start(struct net_device *dev);

void __netdev_watchdog_up(struct net_device *dev);

void netif_carrier_on(struct net_device *dev);
void netif_carrier_off(struct net_device *dev);
void netif_carrier_event(struct net_device *dev);

/**
 *        netif_dormant_on - mark device as dormant.
 *        @dev: network device
 *
 * Mark device as dormant (as per RFC2863).
 *
 * A dormant interface is not actually in a condition to pass packets
 * (i.e., it is not 'up') but is in a "pending" state, waiting for some
 * external event.  For "on-demand" interfaces, this state identifies the
 * situation where the interface is waiting for events to place it in the
 * up state.
 *
 * A linkwatch event is fired only when the bit was not already set.
 */
static inline void netif_dormant_on(struct net_device *dev)
{
        /* already dormant: nothing changed, no event */
        if (test_and_set_bit(__LINK_STATE_DORMANT, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_dormant_off - set device as not dormant.
 *        @dev: network device
 *
 * Clears the dormant state.  A linkwatch event is fired only when the
 * bit was actually set before.
 */
static inline void netif_dormant_off(struct net_device *dev)
{
        /* was not dormant: nothing changed, no event */
        if (!test_and_clear_bit(__LINK_STATE_DORMANT, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_dormant - test if device is dormant
 *        @dev: network device
 *
 * Returns true when __LINK_STATE_DORMANT is set in @dev->state.
 */
static inline bool netif_dormant(const struct net_device *dev)
{
        const bool dormant = test_bit(__LINK_STATE_DORMANT, &dev->state);

        return dormant;
}


/**
 *        netif_testing_on - mark device as under test.
 *        @dev: network device
 *
 * Mark device as under test (as per RFC2863).
 *
 * The testing state indicates that some test(s) must be performed on
 * the interface.  After completion of the test(s), the interface state
 * will change to up, dormant, or down, as appropriate.
 *
 * A linkwatch event is fired only when the bit was not already set.
 */
static inline void netif_testing_on(struct net_device *dev)
{
        /* already under test: nothing changed, no event */
        if (test_and_set_bit(__LINK_STATE_TESTING, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_testing_off - set device as not under test.
 *        @dev: network device
 *
 * Clears the testing state.  A linkwatch event is fired only when the
 * bit was actually set before.
 */
static inline void netif_testing_off(struct net_device *dev)
{
        /* was not under test: nothing changed, no event */
        if (!test_and_clear_bit(__LINK_STATE_TESTING, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_testing - test if device is under test
 *        @dev: network device
 *
 * Returns true when __LINK_STATE_TESTING is set in @dev->state.
 */
static inline bool netif_testing(const struct net_device *dev)
{
        const bool testing = test_bit(__LINK_STATE_TESTING, &dev->state);

        return testing;
}


/**
 *        netif_oper_up - test if device is operational
 *        @dev: network device
 *
 * Reads @dev->operstate once and returns true for IF_OPER_UP, and also
 * for IF_OPER_UNKNOWN (kept for backward compatibility).
 */
static inline bool netif_oper_up(const struct net_device *dev)
{
        unsigned int operstate = READ_ONCE(dev->operstate);

        switch (operstate) {
        case IF_OPER_UP:
        case IF_OPER_UNKNOWN:        /* backward compat */
                return true;
        default:
                return false;
        }
}

/**
 *        netif_device_present - is device available or removed
 *        @dev: network device
 *
 * Returns true when __LINK_STATE_PRESENT is set in @dev->state, i.e. the
 * device has not been removed from the system.
 */
static inline bool netif_device_present(const struct net_device *dev)
{
        const bool present = test_bit(__LINK_STATE_PRESENT, &dev->state);

        return present;
}

void netif_device_detach(struct net_device *dev);

void netif_device_attach(struct net_device *dev);

/*
 * Network interface message level settings
 */

/*
 * Bit positions of the message classes.  Each NETIF_MSG_<name>_BIT is
 * turned into a mask by __NETIF_MSG() below, so the order of the
 * enumerators fixes the bit layout.  NETIF_MSG_CLASS_COUNT comes last and
 * therefore equals the number of classes.
 */
enum {
        NETIF_MSG_DRV_BIT,
        NETIF_MSG_PROBE_BIT,
        NETIF_MSG_LINK_BIT,
        NETIF_MSG_TIMER_BIT,
        NETIF_MSG_IFDOWN_BIT,
        NETIF_MSG_IFUP_BIT,
        NETIF_MSG_RX_ERR_BIT,
        NETIF_MSG_TX_ERR_BIT,
        NETIF_MSG_TX_QUEUED_BIT,
        NETIF_MSG_INTR_BIT,
        NETIF_MSG_TX_DONE_BIT,
        NETIF_MSG_RX_STATUS_BIT,
        NETIF_MSG_PKTDATA_BIT,
        NETIF_MSG_HW_BIT,
        NETIF_MSG_WOL_BIT,

        /* When you add a new bit above, update netif_msg_class_names array
         * in net/ethtool/common.c
         */
        NETIF_MSG_CLASS_COUNT,
};
/* Both ethtool_ops interface and internal driver implementation use u32 */
static_assert(NETIF_MSG_CLASS_COUNT <= 32);

/*
 * __NETIF_MSG_BIT(bit) builds a u32 mask with only @bit set;
 * __NETIF_MSG(name) does so for the enumerator NETIF_MSG_<name>_BIT.
 * The NETIF_MSG_<name> masks below are derived from the enum this way.
 */
#define __NETIF_MSG_BIT(bit)        ((u32)1 << (bit))
#define __NETIF_MSG(name)        __NETIF_MSG_BIT(NETIF_MSG_ ## name ## _BIT)

#define NETIF_MSG_DRV                __NETIF_MSG(DRV)
#define NETIF_MSG_PROBE                __NETIF_MSG(PROBE)
#define NETIF_MSG_LINK                __NETIF_MSG(LINK)
#define NETIF_MSG_TIMER                __NETIF_MSG(TIMER)
#define NETIF_MSG_IFDOWN        __NETIF_MSG(IFDOWN)
#define NETIF_MSG_IFUP                __NETIF_MSG(IFUP)
#define NETIF_MSG_RX_ERR        __NETIF_MSG(RX_ERR)
#define NETIF_MSG_TX_ERR        __NETIF_MSG(TX_ERR)
#define NETIF_MSG_TX_QUEUED        __NETIF_MSG(TX_QUEUED)
#define NETIF_MSG_INTR                __NETIF_MSG(INTR)
#define NETIF_MSG_TX_DONE        __NETIF_MSG(TX_DONE)
#define NETIF_MSG_RX_STATUS        __NETIF_MSG(RX_STATUS)
#define NETIF_MSG_PKTDATA        __NETIF_MSG(PKTDATA)
#define NETIF_MSG_HW                __NETIF_MSG(HW)
#define NETIF_MSG_WOL                __NETIF_MSG(WOL)

/*
 * netif_msg_<class>(p) is non-zero when the matching NETIF_MSG_<class>
 * bit is set in (p)->msg_enable.  @p may be any pointer to a structure
 * that has a msg_enable member.
 */
#define netif_msg_drv(p)        ((p)->msg_enable & NETIF_MSG_DRV)
#define netif_msg_probe(p)        ((p)->msg_enable & NETIF_MSG_PROBE)
#define netif_msg_link(p)        ((p)->msg_enable & NETIF_MSG_LINK)
#define netif_msg_timer(p)        ((p)->msg_enable & NETIF_MSG_TIMER)
#define netif_msg_ifdown(p)        ((p)->msg_enable & NETIF_MSG_IFDOWN)
#define netif_msg_ifup(p)        ((p)->msg_enable & NETIF_MSG_IFUP)
#define netif_msg_rx_err(p)        ((p)->msg_enable & NETIF_MSG_RX_ERR)
#define netif_msg_tx_err(p)        ((p)->msg_enable & NETIF_MSG_TX_ERR)
#define netif_msg_tx_queued(p)        ((p)->msg_enable & NETIF_MSG_TX_QUEUED)
#define netif_msg_intr(p)        ((p)->msg_enable & NETIF_MSG_INTR)
#define netif_msg_tx_done(p)        ((p)->msg_enable & NETIF_MSG_TX_DONE)
#define netif_msg_rx_status(p)        ((p)->msg_enable & NETIF_MSG_RX_STATUS)
#define netif_msg_pktdata(p)        ((p)->msg_enable & NETIF_MSG_PKTDATA)
#define netif_msg_hw(p)                ((p)->msg_enable & NETIF_MSG_HW)
#define netif_msg_wol(p)        ((p)->msg_enable & NETIF_MSG_WOL)

/*
 * Translate a debug level into a msg_enable bitmap.
 *
 * A negative @debug_value, or one that is not smaller than the number of
 * bits in a u32, selects @default_msg_enable_bits.  0 disables all
 * output.  Any other value N yields a mask with the low N bits set.
 */
static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
{
        /* out of range: use default */
        if (debug_value < 0 || debug_value >= (sizeof(u32) * 8))
                return default_msg_enable_bits;

        /* 0 means no output, otherwise set low N bits */
        return debug_value ? (1U << debug_value) - 1 : 0;
}

/* Take the xmit lock of @txq and record @cpu as its owner. */
static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
{
        spin_lock(&txq->_xmit_lock);

        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, cpu);
}

/*
 * Used by the HARD_TX_* macros in their NETIF_F_LLTX branch: only invokes
 * __acquire() on the xmit lock and always reports success.
 */
static inline bool __netif_tx_acquire(struct netdev_queue *txq)
{
        __acquire(&txq->_xmit_lock);

        return true;
}

/* Counterpart of __netif_tx_acquire(): only invokes __release() on the lock. */
static inline void __netif_tx_release(struct netdev_queue *txq)
{
        __release(&txq->_xmit_lock);
}

/*
 * Take the xmit lock of @txq with spin_lock_bh() and record the current
 * CPU as its owner.
 */
static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
{
        spin_lock_bh(&txq->_xmit_lock);

        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
}

/*
 * Try to take the xmit lock of @txq without spinning.  On success the
 * current CPU is recorded as owner and true is returned; otherwise false.
 */
static inline bool __netif_tx_trylock(struct netdev_queue *txq)
{
        if (unlikely(!spin_trylock(&txq->_xmit_lock)))
                return false;

        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
        return true;
}

/* Reset the owner of the xmit lock of @txq to -1, then release the lock. */
static inline void __netif_tx_unlock(struct netdev_queue *txq)
{
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, -1);

        spin_unlock(&txq->_xmit_lock);
}

/*
 * Reset the owner of the xmit lock of @txq to -1, then release the lock
 * with spin_unlock_bh().
 */
static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
{
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, -1);

        spin_unlock_bh(&txq->_xmit_lock);
}

/*
 * Stamp txq->trans_start with the current jiffies, but only while the
 * xmit lock has a recorded owner (xmit_lock_owner != -1).
 * txq->trans_start can be read locklessly from dev_watchdog(), hence the
 * WRITE_ONCE().
 */
static inline void txq_trans_update(struct netdev_queue *txq)
{
        if (txq->xmit_lock_owner == -1)
                return;

        WRITE_ONCE(txq->trans_start, jiffies);
}

/*
 * Stamp txq->trans_start with the current jiffies, skipping the store
 * when the field already holds that value.
 */
static inline void txq_trans_cond_update(struct netdev_queue *txq)
{
        unsigned long stamp = jiffies;

        if (READ_ONCE(txq->trans_start) == stamp)
                return;

        WRITE_ONCE(txq->trans_start, stamp);
}

/* legacy drivers only, netdev_start_xmit() sets txq->trans_start.
 * Refreshes the timestamp of transmit queue 0 of @dev.
 */
static inline void netif_trans_update(struct net_device *dev)
{
        txq_trans_cond_update(netdev_get_tx_queue(dev, 0));
}

/**
 *        netif_tx_lock - grab network device transmit lock
 *        @dev: network device
 *
 * Get network device transmit lock
 */
void netif_tx_lock(struct net_device *dev);

/* Disable bottom halves, then take the device transmit lock of @dev. */
static inline void netif_tx_lock_bh(struct net_device *dev)
{
        local_bh_disable();

        netif_tx_lock(dev);
}

void netif_tx_unlock(struct net_device *dev);

/* Release the device transmit lock of @dev, then re-enable bottom halves. */
static inline void netif_tx_unlock_bh(struct net_device *dev)
{
        netif_tx_unlock(dev);

        local_bh_enable();
}

/*
 * HARD_TX_LOCK - lock @txq for transmission on behalf of @cpu.
 *
 * Devices without NETIF_F_LLTX get the real per-queue xmit lock; for
 * NETIF_F_LLTX devices only __netif_tx_acquire() is invoked.
 * @dev is parenthesized so that any pointer expression may be passed.
 */
#define HARD_TX_LOCK(dev, txq, cpu) {                        \
        if (((dev)->features & NETIF_F_LLTX) == 0) {        \
                __netif_tx_lock(txq, cpu);                \
        } else {                                        \
                __netif_tx_acquire(txq);                \
        }                                                \
}

/*
 * HARD_TX_TRYLOCK - try to lock @txq for transmission.
 *
 * Evaluates to the result of __netif_tx_trylock() for devices without
 * NETIF_F_LLTX, and to the result of __netif_tx_acquire() otherwise.
 * @dev is parenthesized so that any pointer expression may be passed.
 */
#define HARD_TX_TRYLOCK(dev, txq)                        \
        ((((dev)->features & NETIF_F_LLTX) == 0) ?        \
                __netif_tx_trylock(txq) :                \
                __netif_tx_acquire(txq))

/*
 * HARD_TX_UNLOCK - undo HARD_TX_LOCK()/HARD_TX_TRYLOCK() on @txq.
 *
 * Devices without NETIF_F_LLTX release the real per-queue xmit lock; for
 * NETIF_F_LLTX devices only __netif_tx_release() is invoked.
 * @dev is parenthesized so that any pointer expression may be passed.
 */
#define HARD_TX_UNLOCK(dev, txq) {                        \
        if (((dev)->features & NETIF_F_LLTX) == 0) {        \
                __netif_tx_unlock(txq);                        \
        } else {                                        \
                __netif_tx_release(txq);                \
        }                                                \
}

/*
 * Stop every transmit queue of @dev.
 *
 * Runs with bottom halves disabled and dev->tx_global_lock held; each
 * queue is stopped while its own xmit lock is held.
 */
static inline void netif_tx_disable(struct net_device *dev)
{
        struct netdev_queue *txq;
        unsigned int qidx;
        int cpu;

        local_bh_disable();
        cpu = smp_processor_id();
        spin_lock(&dev->tx_global_lock);

        qidx = 0;
        while (qidx < dev->num_tx_queues) {
                txq = netdev_get_tx_queue(dev, qidx);

                __netif_tx_lock(txq, cpu);
                netif_tx_stop_queue(txq);
                __netif_tx_unlock(txq);

                qidx++;
        }

        spin_unlock(&dev->tx_global_lock);
        local_bh_enable();
}

/*
 * Take dev->addr_list_lock.  The nesting level handed to
 * spin_lock_nested() is dev->nested_level under CONFIG_LOCKDEP and 0
 * otherwise.
 */
static inline void netif_addr_lock(struct net_device *dev)
{
#ifdef CONFIG_LOCKDEP
        unsigned char nest_level = dev->nested_level;
#else
        unsigned char nest_level = 0;
#endif

        spin_lock_nested(&dev->addr_list_lock, nest_level);
}

/*
 * Disable bottom halves and take dev->addr_list_lock.  The nesting level
 * handed to spin_lock_nested() is dev->nested_level under CONFIG_LOCKDEP
 * and 0 otherwise.
 */
static inline void netif_addr_lock_bh(struct net_device *dev)
{
#ifdef CONFIG_LOCKDEP
        unsigned char nest_level = dev->nested_level;
#else
        unsigned char nest_level = 0;
#endif

        local_bh_disable();
        spin_lock_nested(&dev->addr_list_lock, nest_level);
}

/* Release dev->addr_list_lock. */
static inline void netif_addr_unlock(struct net_device *dev)
{
        spin_unlock(&dev->addr_list_lock);
}

/* Release dev->addr_list_lock with spin_unlock_bh(). */
static inline void netif_addr_unlock_bh(struct net_device *dev)
{
        spin_unlock_bh(&dev->addr_list_lock);
}

/*
 * dev_addrs walker. Should be used only for read access. Call with
 * rcu_read_lock held.
 *
 * @dev: device whose dev_addrs list is walked (any pointer expression)
 * @ha:  loop cursor handed to list_for_each_entry_rcu()
 */
#define for_each_dev_addr(dev, ha) \
                list_for_each_entry_rcu(ha, &(dev)->dev_addrs.list, list)

/* These functions live elsewhere (drivers/net/net_init.c, but related) */

void ether_setup(struct net_device *dev);

/* Allocate dummy net_device */
struct net_device *alloc_netdev_dummy(int sizeof_priv);

/* Support for loadable net-drivers */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
                                    unsigned char name_assign_type,
                                    void (*setup)(struct net_device *),
                                    unsigned int txqs, unsigned int rxqs);
/* alloc_netdev(): alloc_netdev_mqs() with one TX queue and one RX queue */
#define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \
        alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1)

/* alloc_netdev_mq(): alloc_netdev_mqs() with @count TX and @count RX queues */
#define alloc_netdev_mq(sizeof_priv, name, name_assign_type, setup, count) \
        alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, count, \
                         count)

int register_netdev(struct net_device *dev);
void unregister_netdev(struct net_device *dev);

int devm_register_netdev(struct device *dev, struct net_device *ndev);

/* General hardware address lists handling functions */
int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
                   struct netdev_hw_addr_list *from_list, int addr_len);
void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
                      struct netdev_hw_addr_list *from_list, int addr_len);
int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
                       struct net_device *dev,
                       int (*sync)(struct net_device *, const unsigned char *),
                       int (*unsync)(struct net_device *,
                                     const unsigned char *));
int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list,
                           struct net_device *dev,
                           int (*sync)(struct net_device *,
                                       const unsigned char *, int),
                           int (*unsync)(struct net_device *,
                                         const unsigned char *, int));
void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list,
                              struct net_device *dev,
                              int (*unsync)(struct net_device *,
                                            const unsigned char *, int));
void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
                          struct net_device *dev,
                          int (*unsync)(struct net_device *,
                                        const unsigned char *));
void __hw_addr_init(struct netdev_hw_addr_list *list);

/* Functions used for device addresses handling */
void dev_addr_mod(struct net_device *dev, unsigned int offset,
                  const void *addr, size_t len);

/* Write @len bytes of @addr into the device address, starting at offset 0. */
static inline void
__dev_addr_set(struct net_device *dev, const void *addr, size_t len)
{
        const unsigned int offset = 0;

        dev_addr_mod(dev, offset, addr, len);
}

/* Set the device address from @addr, copying dev->addr_len bytes. */
static inline void dev_addr_set(struct net_device *dev, const u8 *addr)
{
        __dev_addr_set(dev, addr,
                       dev->addr_len);
}

int dev_addr_add(struct net_device *dev, const unsigned char *addr,
                 unsigned char addr_type);
int dev_addr_del(struct net_device *dev, const unsigned char *addr,
                 unsigned char addr_type);

/* Functions used for unicast addresses handling */
int dev_uc_add(struct net_device *dev, const unsigned char *addr);
int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr);
int dev_uc_del(struct net_device *dev, const unsigned char *addr);
int dev_uc_sync(struct net_device *to, struct net_device *from);
int dev_uc_sync_multiple(struct net_device *to, struct net_device *from);
void dev_uc_unsync(struct net_device *to, struct net_device *from);
void dev_uc_flush(struct net_device *dev);
void dev_uc_init(struct net_device *dev);

/**
 *  __dev_uc_sync - Synchronize device's unicast list
 *  @dev:  device to sync
 *  @sync: function to call if address should be added
 *  @unsync: function to call if address should be removed
 *
 *  Add newly added addresses to the interface, and release
 *  addresses that have been deleted.  Thin wrapper that runs
 *  __hw_addr_sync_dev() on the unicast list of @dev and returns its result.
 */
static inline int __dev_uc_sync(struct net_device *dev,
                                int (*sync)(struct net_device *,
                                            const unsigned char *),
                                int (*unsync)(struct net_device *,
                                              const unsigned char *))
{
        int err = __hw_addr_sync_dev(&dev->uc, dev, sync, unsync);

        return err;
}

/**
 *  __dev_uc_unsync - Remove synchronized addresses from device
 *  @dev:  device whose unicast list is unsynced
 *  @unsync: function to call if address should be removed
 *
 *  Remove all addresses that were added to the device by dev_uc_sync().
 */
static inline void __dev_uc_unsync(struct net_device *dev,
                                   int (*unsync)(struct net_device *,
                                                 const unsigned char *))
{
        struct netdev_hw_addr_list *uc_list = &dev->uc;

        __hw_addr_unsync_dev(uc_list, dev, unsync);
}

/* Functions used for multicast addresses handling */
int dev_mc_add(struct net_device *dev, const unsigned char *addr);
int dev_mc_add_global(struct net_device *dev, const unsigned char *addr);
int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr);
int dev_mc_del(struct net_device *dev, const unsigned char *addr);
int dev_mc_del_global(struct net_device *dev, const unsigned char *addr);
int dev_mc_sync(struct net_device *to, struct net_device *from);
int dev_mc_sync_multiple(struct net_device *to, struct net_device *from);
void dev_mc_unsync(struct net_device *to, struct net_device *from);
void dev_mc_flush(struct net_device *dev);
void dev_mc_init(struct net_device *dev);

/**
 *  __dev_mc_sync - Synchronize device's multicast list
 *  @dev:  device to sync
 *  @sync: function to call if address should be added
 *  @unsync: function to call if address should be removed
 *
 *  Add newly added addresses to the interface, and release
 *  addresses that have been deleted.
 *
 *  Return: the result of __hw_addr_sync_dev() on @dev's multicast list.
 */
static inline int __dev_mc_sync(struct net_device *dev,
                                int (*sync)(struct net_device *,
                                            const unsigned char *),
                                int (*unsync)(struct net_device *,
                                              const unsigned char *))
{
        int rc;

        rc = __hw_addr_sync_dev(&dev->mc, dev, sync, unsync);

        return rc;
}

/**
 *  __dev_mc_unsync - Remove synchronized addresses from device
 *  @dev:  device whose multicast list is unsynced
 *  @unsync: function to call if address should be removed
 *
 *  Remove all addresses that were added to the device by dev_mc_sync().
 */
static inline void __dev_mc_unsync(struct net_device *dev,
                                   int (*unsync)(struct net_device *,
                                                 const unsigned char *))
{
        struct netdev_hw_addr_list *mc_list = &dev->mc;

        __hw_addr_unsync_dev(mc_list, dev, unsync);
}

/* Functions used for secondary unicast and multicast support */
void dev_set_rx_mode(struct net_device *dev);
int dev_set_promiscuity(struct net_device *dev, int inc);
int dev_set_allmulti(struct net_device *dev, int inc);
void netdev_state_change(struct net_device *dev);
void __netdev_notify_peers(struct net_device *dev);
void netdev_notify_peers(struct net_device *dev);
void netdev_features_change(struct net_device *dev);
/* Load a device via the kmod */
void dev_load(struct net *net, const char *name);
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
                                        struct rtnl_link_stats64 *storage);
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
                             const struct net_device_stats *netdev_stats);
void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
                           const struct pcpu_sw_netstats __percpu *netstats);
void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s);

/* Bit positions for the nested-sync flags; turned into masks below. */
enum {
        NESTED_SYNC_IMM_BIT,
        NESTED_SYNC_TODO_BIT,
};

/* Build a u32 mask with only bit number @bit set. */
#define __NESTED_SYNC_BIT(bit)        ((u32)1 << (bit))
/* Map a short flag name (e.g. IMM) to the mask of NESTED_SYNC_<name>_BIT. */
#define __NESTED_SYNC(name)        __NESTED_SYNC_BIT(NESTED_SYNC_ ## name ## _BIT)

/* Ready-to-use masks for the two bit positions defined above. */
#define NESTED_SYNC_IMM                __NESTED_SYNC(IMM)
#define NESTED_SYNC_TODO        __NESTED_SYNC(TODO)

/*
 * Private context passed to the callbacks of the netdev_walk_all_*_dev*()
 * helpers declared in this header.
 */
struct netdev_nested_priv {
        /* NOTE(review): presumably carries NESTED_SYNC_* bits — confirm against users */
        unsigned char flags;
        /* Opaque pointer; its meaning is defined by the caller, not visible here */
        void *data;
};

bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
                                                     struct list_head **iter);

/*
 * Iterate through the upper list of @dev; must be called under RCU read lock.
 *
 * @iter is a struct list_head * cursor that is initialised to
 * &dev->adj_list.upper and advanced by netdev_upper_get_next_dev_rcu();
 * @updev receives each returned device and the loop ends when that
 * helper returns NULL.
 */
#define netdev_for_each_upper_dev_rcu(dev, updev, iter) \
        for (iter = &(dev)->adj_list.upper, \
             updev = netdev_upper_get_next_dev_rcu(dev, &(iter)); \
             updev; \
             updev = netdev_upper_get_next_dev_rcu(dev, &(iter)))

int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *upper_dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv);

bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
                                  struct net_device *upper_dev);

bool netdev_has_any_upper_dev(struct net_device *dev);

void *netdev_lower_get_next_private(struct net_device *dev,
                                    struct list_head **iter);
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
                                        struct list_head **iter);

/*
 * Walk the values returned by netdev_lower_get_next_private() for @dev.
 * @iter is a struct list_head * cursor starting at
 * dev->adj_list.lower.next; @priv receives each returned pointer and the
 * loop stops at the first NULL.
 */
#define netdev_for_each_lower_private(dev, priv, iter) \
        for (iter = (dev)->adj_list.lower.next, \
             priv = netdev_lower_get_next_private(dev, &(iter)); \
             priv; \
             priv = netdev_lower_get_next_private(dev, &(iter)))

/*
 * Variant of the lower-private walk built on
 * netdev_lower_get_next_private_rcu(). Unlike the non-RCU macro, @iter
 * starts at the list head itself (&dev->adj_list.lower). The loop stops
 * when the helper returns NULL.
 * NOTE(review): presumably requires the RCU read lock — confirm.
 */
#define netdev_for_each_lower_private_rcu(dev, priv, iter) \
        for (iter = &(dev)->adj_list.lower, \
             priv = netdev_lower_get_next_private_rcu(dev, &(iter)); \
             priv; \
             priv = netdev_lower_get_next_private_rcu(dev, &(iter)))

void *netdev_lower_get_next(struct net_device *dev,
                                struct list_head **iter);

/*
 * Walk the values returned by netdev_lower_get_next() for @dev.
 * @iter is a struct list_head * cursor starting at
 * dev->adj_list.lower.next; @ldev receives each returned pointer and the
 * loop stops at the first NULL.
 */
#define netdev_for_each_lower_dev(dev, ldev, iter) \
        for (iter = (dev)->adj_list.lower.next, \
             ldev = netdev_lower_get_next(dev, &(iter)); \
             ldev; \
             ldev = netdev_lower_get_next(dev, &(iter)))

struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
                                             struct list_head **iter);
int netdev_walk_all_lower_dev(struct net_device *dev,
                              int (*fn)(struct net_device *lower_dev,
                                        struct netdev_nested_priv *priv),
                              struct netdev_nested_priv *priv);
int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *lower_dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv);

void *netdev_adjacent_get_private(struct list_head *adj_list);
void *netdev_lower_get_first_private_rcu(struct net_device *dev);
struct net_device *netdev_master_upper_dev_get(struct net_device *dev);
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev);
int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev,
                          struct netlink_ext_ack *extack);
int netdev_master_upper_dev_link(struct net_device *dev,
                                 struct net_device *upper_dev,
                                 void *upper_priv, void *upper_info,
                                 struct netlink_ext_ack *extack);
void netdev_upper_dev_unlink(struct net_device *dev,
                             struct net_device *upper_dev);
int netdev_adjacent_change_prepare(struct net_device *old_dev,
                                   struct net_device *new_dev,
                                   struct net_device *dev,
                                   struct netlink_ext_ack *extack);
void netdev_adjacent_change_commit(struct net_device *old_dev,
                                   struct net_device *new_dev,
                                   struct net_device *dev);
void netdev_adjacent_change_abort(struct net_device *old_dev,
                                  struct net_device *new_dev,
                                  struct net_device *dev);
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname);
void *netdev_lower_dev_get_private(struct net_device *dev,
                                   struct net_device *lower_dev);
void netdev_lower_state_changed(struct net_device *lower_dev,
                                void *lower_state_info);

/*
 * RSS keys are 40 or 52 bytes long; the shared key buffer is sized for
 * the larger of the two.
 */
#define NETDEV_RSS_KEY_LEN 52
extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
/* NOTE(review): presumably fills @buffer with up to @len key bytes — confirm against the definition */
void netdev_rss_key_fill(void *buffer, size_t len);

int skb_checksum_help(struct sk_buff *skb);
int skb_crc32c_csum_help(struct sk_buff *skb);
int skb_csum_hwoffload_help(struct sk_buff *skb,
                            const netdev_features_t features);

/*
 * Pairs an ifslave record with an ifbond record; passed to
 * netdev_bonding_info_change() and embedded in
 * struct netdev_notifier_bonding_info.
 */
struct netdev_bonding_info {
        ifslave        slave;
        ifbond        master;
};

/*
 * Notifier payload that wraps a struct netdev_bonding_info. The generic
 * notifier info is kept as the first member, as the inline comment requires.
 */
struct netdev_notifier_bonding_info {
        struct netdev_notifier_info info; /* must be first */
        struct netdev_bonding_info  bonding_info;
};

void netdev_bonding_info_change(struct net_device *dev,
                                struct netdev_bonding_info *bonding_info);

#if IS_ENABLED(CONFIG_ETHTOOL_NETLINK)
void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data);
#else
/* Empty stand-in used when CONFIG_ETHTOOL_NETLINK is not enabled. */
static inline void
ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data)
{
}
#endif

__be16 skb_network_protocol(struct sk_buff *skb, int *depth);

/*
 * Decide whether a device advertising @features can checksum packets of
 * the given network @protocol.
 *
 * FCoE is answered solely by NETIF_F_FCOE_CRC. Everything else is treated
 * as an IP checksum (not SCTP CRC): NETIF_F_HW_CSUM covers every protocol,
 * otherwise IPv4 needs NETIF_F_IP_CSUM, IPv6 needs NETIF_F_IPV6_CSUM and
 * any other protocol is unsupported.
 */
static inline bool can_checksum_protocol(netdev_features_t features,
                                         __be16 protocol)
{
        netdev_features_t required;

        if (protocol == htons(ETH_P_FCOE)) {
                required = NETIF_F_FCOE_CRC;
        } else if (features & NETIF_F_HW_CSUM) {
                /* Can checksum everything */
                return true;
        } else if (protocol == htons(ETH_P_IP)) {
                required = NETIF_F_IP_CSUM;
        } else if (protocol == htons(ETH_P_IPV6)) {
                required = NETIF_F_IPV6_CSUM;
        } else {
                return false;
        }

        return (features & required) != 0;
}

#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb);
#else
/* Empty stand-in used when CONFIG_BUG is not defined. */
static inline void
netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
}
#endif
/* rx skb timestamps */
void net_enable_timestamp(void);
void net_disable_timestamp(void);

/*
 * Fetch a hardware timestamp for @dev.
 *
 * When the driver provides ndo_get_tstamp(), its result for
 * (@dev, @hwtstamps, @cycles) is returned; otherwise the raw
 * @hwtstamps->hwtstamp value is used.
 */
static inline ktime_t netdev_get_tstamp(struct net_device *dev,
                                        const struct skb_shared_hwtstamps *hwtstamps,
                                        bool cycles)
{
        const struct net_device_ops *devops = dev->netdev_ops;

        if (!devops->ndo_get_tstamp)
                return hwtstamps->hwtstamp;

        return devops->ndo_get_tstamp(dev, hwtstamps, cycles);
}

#ifndef CONFIG_PREEMPT_RT
/* Without PREEMPT_RT the "more" hint is kept in this CPU's softnet_data. */
static inline void netdev_xmit_set_more(bool more)
{
        __this_cpu_write(softnet_data.xmit.more, more);
}

/* Read back the hint stored by netdev_xmit_set_more() on this CPU. */
static inline bool netdev_xmit_more(void)
{
        bool pending = __this_cpu_read(softnet_data.xmit.more);

        return pending;
}
#else
/* With PREEMPT_RT the "more" hint is kept in the current task instead. */
static inline void netdev_xmit_set_more(bool more)
{
        current->net_xmit.more = more;
}

/* Read back the hint stored by netdev_xmit_set_more() in the current task. */
static inline bool netdev_xmit_more(void)
{
        bool pending = current->net_xmit.more;

        return pending;
}
#endif

/*
 * Record the @more hint via netdev_xmit_set_more(), then hand @skb to the
 * driver's ndo_start_xmit() for @dev and return the driver's status.
 */
static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
                                              struct sk_buff *skb, struct net_device *dev,
                                              bool more)
{
        netdev_tx_t status;

        netdev_xmit_set_more(more);
        status = ops->ndo_start_xmit(skb, dev);

        return status;
}

/*
 * Transmit @skb on @dev through the device's netdev_ops.
 *
 * txq_trans_update() is called on @txq only when the driver reports
 * NETDEV_TX_OK; any other status is returned to the caller untouched.
 */
static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
                                            struct netdev_queue *txq, bool more)
{
        const struct net_device_ops *devops = dev->netdev_ops;
        netdev_tx_t status = __netdev_start_xmit(devops, skb, dev, more);

        if (status != NETDEV_TX_OK)
                return status;

        txq_trans_update(txq);
        return status;
}

int netdev_class_create_file_ns(const struct class_attribute *class_attr,
                                const void *ns);
void netdev_class_remove_file_ns(const struct class_attribute *class_attr,
                                 const void *ns);

extern const struct kobj_ns_type_operations net_ns_type_operations;

const char *netdev_drivername(const struct net_device *dev);

/*
 * Intersect two feature sets.
 *
 * If exactly one side has NETIF_F_HW_CSUM, that side is first widened with
 * NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM so the protocol-specific checksum
 * bits of the other side survive the final AND.
 */
static inline netdev_features_t netdev_intersect_features(netdev_features_t f1,
                                                          netdev_features_t f2)
{
        if (!((f1 ^ f2) & NETIF_F_HW_CSUM))
                return f1 & f2;

        if (f1 & NETIF_F_HW_CSUM)
                f1 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
        else
                f2 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);

        return f1 & f2;
}

/*
 * Return the bits of @dev->features that are absent from
 * @dev->hw_features, OR'ed with @dev->wanted_features.
 */
static inline netdev_features_t netdev_get_wanted_features(
        struct net_device *dev)
{
        netdev_features_t outside_hw = dev->features & ~dev->hw_features;

        return outside_hw | dev->wanted_features;
}
netdev_features_t netdev_increment_features(netdev_features_t all,
        netdev_features_t one, netdev_features_t mask);

/*
 * Allow TSO to be used on a stacked device: performing the GSO
 * segmentation before the last device is a performance improvement.
 *
 * Returns netdev_increment_features() applied to @features with
 * NETIF_F_ALL_TSO as the set to merge and @mask as the mask.
 */
static inline netdev_features_t netdev_add_tso_features(netdev_features_t features,
                                                        netdev_features_t mask)
{
        netdev_features_t merged;

        merged = netdev_increment_features(features, NETIF_F_ALL_TSO, mask);

        return merged;
}

int __netdev_update_features(struct net_device *dev);
void netdev_update_features(struct net_device *dev);
void netdev_change_features(struct net_device *dev);

void netif_stacked_transfer_operstate(const struct net_device *rootdev,
                                        struct net_device *dev);

netdev_features_t passthru_features_check(struct sk_buff *skb,
                                          struct net_device *dev,
                                          netdev_features_t features);
netdev_features_t netif_skb_features(struct sk_buff *skb);
void skb_warn_bad_offload(const struct sk_buff *skb);

/*
 * Report whether @features contains every feature bit implied by
 * @gso_type.
 *
 * @gso_type is widened to netdev_features_t and shifted left by
 * NETIF_F_GSO_SHIFT to obtain the corresponding NETIF_F_* bits. The
 * BUILD_BUG_ON() list fails the build if any SKB_GSO_* value stops
 * matching its NETIF_F_* counterpart shifted down by the same amount.
 */
static inline bool net_gso_ok(netdev_features_t features, int gso_type)
{
        netdev_features_t feature = (netdev_features_t)gso_type << NETIF_F_GSO_SHIFT;

        /* check flags correspondence */
        BUILD_BUG_ON(SKB_GSO_TCPV4   != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_DODGY   != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TCPV6   != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_FCOE    != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_GRE     != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_GRE_CSUM != (NETIF_F_GSO_GRE_CSUM >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_IPXIP4  != (NETIF_F_GSO_IPXIP4 >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_IPXIP6  != (NETIF_F_GSO_IPXIP6 >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_SCTP    != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_UDP_L4 != (NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_FRAGLIST != (NETIF_F_GSO_FRAGLIST >> NETIF_F_GSO_SHIFT));

        /* true only when every bit of the shifted gso_type is offered */
        return (features & feature) == feature;
}

/*
 * Check that @features can take @skb's GSO type and, if @skb carries a
 * frag list, that NETIF_F_FRAGLIST is offered as well.
 */
static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features)
{
        if (!net_gso_ok(features, skb_shinfo(skb)->gso_type))
                return false;

        if (!skb_has_frag_list(skb))
                return true;

        return (features & NETIF_F_FRAGLIST) != 0;
}

/*
 * Tell whether @skb needs GSO handling for the given @features.
 *
 * Non-GSO packets never do. A GSO packet does when skb_gso_ok() rejects
 * it, or (the unlikely case) when its ip_summed is neither
 * CHECKSUM_PARTIAL nor CHECKSUM_UNNECESSARY.
 */
static inline bool netif_needs_gso(struct sk_buff *skb,
                                   netdev_features_t features)
{
        if (!skb_is_gso(skb))
                return false;

        if (!skb_gso_ok(skb, features))
                return true;

        return unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
                        (skb->ip_summed != CHECKSUM_UNNECESSARY));
}

void netif_set_tso_max_size(struct net_device *dev, unsigned int size);
void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs);
void netif_inherit_tso_max(struct net_device *to,
                           const struct net_device *from);

/* True when IFF_MACSEC is set in @dev->priv_flags. */
static inline bool netif_is_macsec(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_MACSEC);
}

/* True when IFF_MACVLAN is set in @dev->priv_flags. */
static inline bool netif_is_macvlan(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_MACVLAN);
}

/* True when IFF_MACVLAN_PORT is set in @dev->priv_flags. */
static inline bool netif_is_macvlan_port(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_MACVLAN_PORT);
}

/*
 * True when @dev has both IFF_MASTER in flags and IFF_BONDING in
 * priv_flags.
 */
static inline bool netif_is_bond_master(const struct net_device *dev)
{
        if (!(dev->flags & IFF_MASTER))
                return false;

        return !!(dev->priv_flags & IFF_BONDING);
}

/*
 * True when @dev has both IFF_SLAVE in flags and IFF_BONDING in
 * priv_flags.
 */
static inline bool netif_is_bond_slave(const struct net_device *dev)
{
        if (!(dev->flags & IFF_SLAVE))
                return false;

        return !!(dev->priv_flags & IFF_BONDING);
}

/* True when IFF_SUPP_NOFCS is set in @dev->priv_flags. */
static inline bool netif_supports_nofcs(struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_SUPP_NOFCS);
}

/* True when IFF_L3MDEV_RX_HANDLER is set in @dev->priv_flags. */
static inline bool netif_has_l3_rx_handler(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_L3MDEV_RX_HANDLER);
}

/* True when IFF_L3MDEV_MASTER is set in @dev->priv_flags. */
static inline bool netif_is_l3_master(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_L3MDEV_MASTER);
}

/* True when IFF_L3MDEV_SLAVE is set in @dev->priv_flags. */
static inline bool netif_is_l3_slave(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_L3MDEV_SLAVE);
}

/*
 * Return @dev->ifindex when CONFIG_NET_L3_MASTER_DEV is enabled and @dev
 * is an L3 slave; return 0 in every other case.
 */
static inline int dev_sdif(const struct net_device *dev)
{
        int sdif = 0;

#ifdef CONFIG_NET_L3_MASTER_DEV
        if (netif_is_l3_slave(dev))
                sdif = dev->ifindex;
#endif
        return sdif;
}

/* True when IFF_EBRIDGE is set in @dev->priv_flags. */
static inline bool netif_is_bridge_master(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_EBRIDGE);
}

/* True when IFF_BRIDGE_PORT is set in @dev->priv_flags. */
static inline bool netif_is_bridge_port(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_BRIDGE_PORT);
}

/* True when IFF_OPENVSWITCH is set in @dev->priv_flags. */
static inline bool netif_is_ovs_master(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_OPENVSWITCH);
}

/* True when IFF_OVS_DATAPATH is set in @dev->priv_flags. */
static inline bool netif_is_ovs_port(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_OVS_DATAPATH);
}

/* True when @dev is a bridge master or an Open vSwitch master. */
static inline bool netif_is_any_bridge_master(const struct net_device *dev)
{
        if (netif_is_bridge_master(dev))
                return true;

        return netif_is_ovs_master(dev);
}

/* True when @dev is a bridge port or an Open vSwitch port. */
static inline bool netif_is_any_bridge_port(const struct net_device *dev)
{
        if (netif_is_bridge_port(dev))
                return true;

        return netif_is_ovs_port(dev);
}

/* True when IFF_TEAM is set in @dev->priv_flags. */
static inline bool netif_is_team_master(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_TEAM);
}

/* True when IFF_TEAM_PORT is set in @dev->priv_flags. */
static inline bool netif_is_team_port(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_TEAM_PORT);
}

/* True when @dev is a bonding master or a team master. */
static inline bool netif_is_lag_master(const struct net_device *dev)
{
        if (netif_is_bond_master(dev))
                return true;

        return netif_is_team_master(dev);
}

/* True when @dev is a bonding slave or a team port. */
static inline bool netif_is_lag_port(const struct net_device *dev)
{
        if (netif_is_bond_slave(dev))
                return true;

        return netif_is_team_port(dev);
}

/* True when IFF_RXFH_CONFIGURED is set in @dev->priv_flags. */
static inline bool netif_is_rxfh_configured(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_RXFH_CONFIGURED);
}

/* True when IFF_FAILOVER is set in @dev->priv_flags. */
static inline bool netif_is_failover(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_FAILOVER);
}

/* True when IFF_FAILOVER_SLAVE is set in @dev->priv_flags. */
static inline bool netif_is_failover_slave(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_FAILOVER_SLAVE);
}

/*
 * This device needs to keep the skb dst for qdisc enqueue or
 * ndo_start_xmit(): clear both IFF_XMIT_DST_RELEASE and
 * IFF_XMIT_DST_RELEASE_PERM in @dev->priv_flags.
 */
static inline void netif_keep_dst(struct net_device *dev)
{
        dev->priv_flags = dev->priv_flags &
                          ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM);
}

/*
 * Return true if @dev can't cope with MTU-sized frames that need VLAN tag
 * insertion. Currently that is exactly the MACsec devices.
 */
static inline bool netif_reduces_vlan_mtu(struct net_device *dev)
{
        /* TODO: reserve and use an additional IFF bit, if we get more users */
        bool is_macsec = netif_is_macsec(dev);

        return is_macsec;
}

extern struct pernet_operations __net_initdata loopback_net_ops;

/* Logging, debugging and troubleshooting/diagnostic helpers. */

/* netdev_printk helpers, similar to dev_printk */

/*
 * Return a printable name for @dev.
 *
 * An empty name, or one that still contains a '%' character, is
 * replaced by a fixed placeholder string.
 */
static inline const char *netdev_name(const struct net_device *dev)
{
        const char *name = dev->name;

        if (name[0] == '\0')
                return "(unnamed net_device)";

        if (strchr(name, '%'))
                return "(unnamed net_device)";

        return name;
}

/*
 * Map @dev's registration state to a log-message suffix.
 *
 * The state is sampled once with READ_ONCE(). A registered device yields
 * an empty string; an unrecognised value triggers a one-time warning and
 * yields " (unknown)".
 */
static inline const char *netdev_reg_state(const struct net_device *dev)
{
        const u8 state = READ_ONCE(dev->reg_state);
        const char *suffix;

        switch (state) {
        case NETREG_UNINITIALIZED:
                suffix = " (uninitialized)";
                break;
        case NETREG_REGISTERED:
                suffix = "";
                break;
        case NETREG_UNREGISTERING:
                suffix = " (unregistering)";
                break;
        case NETREG_UNREGISTERED:
                suffix = " (unregistered)";
                break;
        case NETREG_RELEASED:
                suffix = " (released)";
                break;
        case NETREG_DUMMY:
                suffix = " (dummy)";
                break;
        default:
                WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, state);
                suffix = " (unknown)";
                break;
        }

        return suffix;
}

/*
 * Declare a module alias of the form "netdev-<device>". @device must be a
 * string literal, since it is joined to the prefix by literal
 * concatenation.
 */
#define MODULE_ALIAS_NETDEV(device) \
        MODULE_ALIAS("netdev-" device)

/*
 * netdev_WARN() acts like dev_printk(), but with the key difference
 * of using a WARN/WARN_ON to get the message out, including the
 * file/line information and a backtrace.
 *
 * The message is prefixed with "netdevice: ", the device name from
 * netdev_name() and the registration-state suffix from
 * netdev_reg_state(); @format must be a string literal.
 */
#define netdev_WARN(dev, format, args...)                        \
        WARN(1, "netdevice: %s%s: " format, netdev_name(dev),        \
             netdev_reg_state(dev), ##args)

/*
 * Same message layout as netdev_WARN(), but emitted through WARN_ONCE()
 * instead of WARN().
 */
#define netdev_WARN_ONCE(dev, format, args...)                                \
        WARN_ONCE(1, "netdevice: %s%s: " format, netdev_name(dev),        \
                  netdev_reg_state(dev), ##args)

/*
 *        The list of packet types we will receive (as opposed to discard)
 *        and the routines to invoke.
 *
 *        Why 16? Because with 16 the only overlap we get on a hash of the
 *        low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *                0800        IP
 *                0001        802.3
 *                0002        AX.25
 *                0004        802.2
 *                8035        RARP
 *                0005        SNAP
 *                0805        X.25
 *                0806        ARP
 *                8137        IPX
 *                0009        Localtalk
 *                86DD        IPv6
 *
 *        PTYPE_HASH_MASK is the size minus one, i.e. the low-nibble mask.
 */
#define PTYPE_HASH_SIZE        (16)
#define PTYPE_HASH_MASK        (PTYPE_HASH_SIZE - 1)

extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;

extern struct net_device *blackhole_netdev;

/*
 * Note: Avoid these macros in fast path, prefer per-cpu or per-queue counters.
 *
 * Each macro applies an atomic_long_*() operation to the
 * (DEV)->stats.__<FIELD> member: increment by one, add VAL, or read.
 */
#define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD)
#define DEV_STATS_ADD(DEV, FIELD, VAL)         \
                atomic_long_add((VAL), &(DEV)->stats.__##FIELD)
#define DEV_STATS_READ(DEV, FIELD) atomic_long_read(&(DEV)->stats.__##FIELD)

#endif        /* _LINUX_NETDEVICE_H */


























































































































































































































































































































































    1 











































    2 
























    2 



    3 



















































































































































































































    2 


















































































    3 


    3 







    2 










    2 







    3 








    2 




    2 



    2 





























    3 



    2 
    2 



    1 




    2 





























    2 






















    2 









    1 










    3 
















    2 
    2 
















    2 
















































































































    1 



    1 










































































    1 







    1 












    1 




    1 




    1 


    1 

    1 



    1 




































































































































































































































    1 


    1 
    1 
    1 
    1 







    1 

















    3 





















    1 



    1 



    1 




























































































































    2 



    2 

    2 


























































    3 












































































































































































    2 





















    3 


    3 




    2 




































































































































































































































































































































































    1 






    3 
    3 



















    3 




    3 







    2 











    3 






















    3 


    1 




    2 

    2 

    2 










    1 









    1 

















































































































































































































































































    1 














    1 






    1 

    1 
    1 








    1 










    2 













    2 




    2 

    1 








    2 



    2 
    2 


    1 



    2 














































    2 


    2 

























    2 

    2 







    1 









    1 






    1 

    2 

    2 



    1 










































































































































































































































































































































































































































    1 

    1 



















    2 
    2 






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        Linux INET6 implementation
 *        FIB front-end.
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 */

/*        Changes:
 *
 *        YOSHIFUJI Hideaki @USAGI
 *                reworked default router selection.
 *                - respect outgoing interface
 *                - select from (probably) reachable routers (i.e.
 *                routers in REACHABLE, STALE, DELAY or PROBE states).
 *                - always select the same router if it is (probably)
 *                reachable.  otherwise, round-robin the list.
 *        Ville Nuorvala
 *                Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <linux/siphash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>
#include <linux/btf_ids.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

/*
 * Result codes of the neighbour reachability check used while scoring a
 * nexthop.  All failure codes are negative so that rt6_score_route() can
 * propagate them with a simple "n < 0" test; the success code is positive.
 */
enum rt6_nud_state {
        /* find_match() rejects the nexthop outright. */
        RT6_NUD_FAIL_HARD = -3,
        /* Returned by rt6_check_neigh() for a NUD_FAILED neighbour when
         * CONFIG_IPV6_ROUTER_PREF is enabled.
         */
        RT6_NUD_FAIL_PROBE = -2,
        /* find_match() scores the nexthop 0 and requests round-robin. */
        RT6_NUD_FAIL_DO_RR = -1,
        RT6_NUD_SUCCEED = 1
};

/*
 * Forward declarations of the dst_ops callbacks, packet discard/prohibit
 * handlers and routing helpers that the ops tables, template entries and
 * lookup helpers below reference before their definitions appear.
 */
INDIRECT_CALLABLE_SCOPE
struct dst_entry        *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int         ip6_default_advmss(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int                ip6_mtu(const struct dst_entry *dst);
static void                ip6_negative_advice(struct sock *sk,
                                            struct dst_entry *dst);
static void                ip6_dst_destroy(struct dst_entry *);
static void                ip6_dst_ifdown(struct dst_entry *,
                                       struct net_device *dev);
static void                 ip6_dst_gc(struct dst_ops *ops);

static int                ip6_pkt_discard(struct sk_buff *skb);
static int                ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int                ip6_pkt_prohibit(struct sk_buff *skb);
static int                ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void                ip6_link_failure(struct sk_buff *skb);
static void                ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu,
                                           bool confirm_neigh);
static void                rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
                           int strict);
static size_t rt6_nlmsg_size(struct fib6_info *f6i);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
                         struct fib6_info *rt, struct dst_entry *dst,
                         struct in6_addr *dest, struct in6_addr *src,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
                                           const struct in6_addr *daddr,
                                           const struct in6_addr *saddr);

/*
 * Helpers used by rt6_route_rcv() to look up and install routes learned
 * from Route Information options; only built with CONFIG_IPV6_ROUTE_INFO.
 */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev,
                                           unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev);
#endif

/*
 * List of rt6_info entries that are linked through dst.rt_uncached.
 * @lock protects @head; it is taken with BHs disabled by every user in
 * this file.
 */
struct uncached_list {
        spinlock_t                lock;
        struct list_head        head;
};

/* One uncached list per CPU; entries remember their list in
 * dst.rt_uncached_list so they can be removed from any CPU.
 */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

/*
 * Link @rt onto the uncached list of the CPU we are currently running on
 * and remember that list in the dst so rt6_uncached_list_del() can find
 * it again later.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
        struct uncached_list *this_cpu_list;

        this_cpu_list = raw_cpu_ptr(&rt6_uncached_list);
        rt->dst.rt_uncached_list = this_cpu_list;

        spin_lock_bh(&this_cpu_list->lock);
        list_add_tail(&rt->dst.rt_uncached, &this_cpu_list->head);
        spin_unlock_bh(&this_cpu_list->lock);
}

/*
 * Unlink @rt from the uncached list it was added to, if it is on one.
 * The list node is re-initialised so a second call is a no-op.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
        struct uncached_list *owner;

        /* Not linked on any list: nothing to undo. */
        if (list_empty(&rt->dst.rt_uncached))
                return;

        owner = rt->dst.rt_uncached_list;

        spin_lock_bh(&owner->lock);
        list_del_init(&rt->dst.rt_uncached);
        spin_unlock_bh(&owner->lock);
}

/*
 * rt6_uncached_list_flush_dev - detach uncached routes from a device.
 * @dev: the device whose references must be dropped
 *
 * Walks the uncached list of every possible CPU.  For each entry that
 * still references @dev, either through its inet6_dev or through dst.dev,
 * the reference is moved over to blackhole_netdev and the entry is removed
 * from the list.
 */
static void rt6_uncached_list_flush_dev(struct net_device *dev)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
                struct rt6_info *rt, *safe;

                /* Lockless peek: skip CPUs with nothing queued. */
                if (list_empty(&ul->head))
                        continue;

                spin_lock_bh(&ul->lock);
                list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
                        struct inet6_dev *rt_idev = rt->rt6i_idev;
                        struct net_device *rt_dev = rt->dst.dev;
                        bool handled = false;

                        /* rt6i_idev is not guaranteed to be set for every
                         * entry on the list; ip6_dst_destroy() and
                         * ip6_dst_ifdown() treat it as nullable as well, so
                         * it must be checked before being dereferenced.
                         */
                        if (rt_idev && rt_idev->dev == dev) {
                                rt->rt6i_idev = in6_dev_get(blackhole_netdev);
                                in6_dev_put(rt_idev);
                                handled = true;
                        }

                        if (rt_dev == dev) {
                                rt->dst.dev = blackhole_netdev;
                                /* GFP_ATOMIC: ul->lock is held. */
                                netdev_ref_replace(rt_dev, blackhole_netdev,
                                                   &rt->dst.dev_tracker,
                                                   GFP_ATOMIC);
                                handled = true;
                        }
                        /* Once re-pointed the entry no longer needs to be
                         * visited on a later flush.
                         */
                        if (handled)
                                list_del_init(&rt->dst.rt_uncached);
                }
                spin_unlock_bh(&ul->lock);
        }
}

/*
 * Pick the address to use as neighbour lookup key, in order of preference:
 * @p when it is not the unspecified address, else the destination address
 * of @skb's IPv6 header when an skb is supplied, else the caller's @daddr
 * (which may be NULL).
 */
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
                                             struct sk_buff *skb,
                                             const void *daddr)
{
        if (!ipv6_addr_any(p))
                return p;

        if (skb)
                return &ipv6_hdr(skb)->daddr;

        return daddr;
}

/*
 * ip6_neigh_lookup - find or create the neighbour entry for a nexthop.
 * @gw:    gateway address; the unspecified address means "on-link"
 * @dev:   device the neighbour lives on
 * @skb:   optional packet whose destination is used when @gw is unspecified
 * @daddr: fallback destination when @gw is unspecified and @skb is NULL
 *
 * Returns the neighbour found by __ipv6_neigh_lookup() or, failing that,
 * a freshly created one.  Creation errors are reported as NULL.
 */
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
                                   struct net_device *dev,
                                   struct sk_buff *skb,
                                   const void *daddr)
{
        struct neighbour *neigh;

        daddr = choose_neigh_daddr(gw, skb, daddr);

        neigh = __ipv6_neigh_lookup(dev, daddr);
        if (!neigh) {
                neigh = neigh_create(&nd_tbl, daddr, dev);
                if (IS_ERR(neigh))
                        neigh = NULL;
        }

        return neigh;
}

/*
 * dst_ops->neigh_lookup hook: resolve the neighbour for @dst using the
 * route's nexthop (rt6_nexthop() is asked with the unspecified address as
 * its destination argument) on the dst's device.
 */
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
                                              struct sk_buff *skb,
                                              const void *daddr)
{
        const struct rt6_info *route = dst_rt6_info(dst);
        const struct in6_addr *gw;

        gw = rt6_nexthop(route, &in6addr_any);

        return ip6_neigh_lookup(gw, dst->dev, skb, daddr);
}

/*
 * dst_ops->confirm_neigh hook: confirm the neighbour entry for the route's
 * nexthop, or for @daddr when the nexthop is the unspecified address.
 *
 * Nothing is confirmed when no address can be determined, when the device
 * is IFF_NOARP or IFF_LOOPBACK, or when the chosen address is multicast.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rt6_info *route = dst_rt6_info(dst);
        struct net_device *dev = dst->dev;
        const struct in6_addr *target;

        target = choose_neigh_daddr(rt6_nexthop(route, &in6addr_any),
                                    NULL, daddr);

        if (!target ||
            (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) ||
            ipv6_addr_is_multicast(target))
                return;

        __ipv6_confirm_neigh(dev, target);
}

/*
 * dst_ops callbacks for regular IPv6 routes.  ip6_dst_alloc() allocates
 * from net->ipv6.ip6_dst_ops; presumably each netns instance is copied
 * from this template (the copy is outside this excerpt - confirm).
 */
static struct dst_ops ip6_dst_ops_template = {
        .family                        =        AF_INET6,
        .gc                        =        ip6_dst_gc,
        .gc_thresh                =        1024,
        .check                        =        ip6_dst_check,
        .default_advmss                =        ip6_default_advmss,
        .mtu                        =        ip6_mtu,
        .cow_metrics                =        dst_cow_metrics_generic,
        .destroy                =        ip6_dst_destroy,
        .ifdown                        =        ip6_dst_ifdown,
        .negative_advice        =        ip6_negative_advice,
        .link_failure                =        ip6_link_failure,
        .update_pmtu                =        ip6_rt_update_pmtu,
        .redirect                =        rt6_do_redirect,
        .local_out                =        __ip6_local_out,
        .neigh_lookup                =        ip6_dst_neigh_lookup,
        .confirm_neigh                =        ip6_confirm_neigh,
};

/*
 * dst_ops for IPv6 blackhole dsts.  PMTU updates, redirects and MTU
 * queries are routed to the generic dst_blackhole_* helpers; unlike
 * ip6_dst_ops_template no gc, ifdown, negative_advice, link_failure,
 * local_out or confirm_neigh hooks are installed.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
        .family                        = AF_INET6,
        .default_advmss                = ip6_default_advmss,
        .neigh_lookup                = ip6_dst_neigh_lookup,
        .check                        = ip6_dst_check,
        .destroy                = ip6_dst_destroy,
        .cow_metrics                = dst_cow_metrics_generic,
        .update_pmtu                = dst_blackhole_update_pmtu,
        .redirect                = dst_blackhole_redirect,
        .mtu                        = dst_blackhole_mtu,
};

/*
 * Metrics template: the hop-limit slot (index RTAX_HOPLIMIT - 1) is set to
 * 0 explicitly; all remaining slots are implicitly zero as well.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 0,
};

/*
 * Template for the FIB "null" entry: an unreachable reject route without a
 * nexthop, owned by the kernel, carrying the largest possible metric and
 * the shared default metrics, with an initial reference count of one.
 */
static const struct fib6_info fib6_null_entry_template = {
        .fib6_flags        = (RTF_REJECT | RTF_NONEXTHOP),
        .fib6_protocol  = RTPROT_KERNEL,
        .fib6_metric        = ~(u32)0,
        .fib6_ref        = REFCOUNT_INIT(1),
        .fib6_type        = RTN_UNREACHABLE,
        .fib6_metrics        = (struct dst_metrics *)&dst_default_metrics,
};

/*
 * Template dst for the null (unreachable) route: lookups report
 * -ENETUNREACH and packets in either direction are handed to
 * ip6_pkt_discard() / ip6_pkt_discard_out().
 */
static const struct rt6_info ip6_null_entry_template = {
        .dst = {
                .__rcuref        = RCUREF_INIT(1),
                .__use                = 1,
                .obsolete        = DST_OBSOLETE_FORCE_CHK,
                .error                = -ENETUNREACH,
                .input                = ip6_pkt_discard,
                .output                = ip6_pkt_discard_out,
        },
        .rt6i_flags        = (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/*
 * Template dst for prohibit routes (CONFIG_IPV6_MULTIPLE_TABLES only):
 * lookups report -EACCES and packets go to ip6_pkt_prohibit() /
 * ip6_pkt_prohibit_out().
 */
static const struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__rcuref        = RCUREF_INIT(1),
                .__use                = 1,
                .obsolete        = DST_OBSOLETE_FORCE_CHK,
                .error                = -EACCES,
                .input                = ip6_pkt_prohibit,
                .output                = ip6_pkt_prohibit_out,
        },
        .rt6i_flags        = (RTF_REJECT | RTF_NONEXTHOP),
};

/*
 * Template dst for blackhole routes (CONFIG_IPV6_MULTIPLE_TABLES only):
 * lookups report -EINVAL and packets are handed to the generic
 * dst_discard() / dst_discard_out() handlers.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__rcuref        = RCUREF_INIT(1),
                .__use                = 1,
                .obsolete        = DST_OBSOLETE_FORCE_CHK,
                .error                = -EINVAL,
                .input                = dst_discard,
                .output                = dst_discard_out,
        },
        .rt6i_flags        = (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

/*
 * Zero the rt6_info members that follow the embedded 'dst' member; the
 * dst_entry itself, already set up by the allocator, is left untouched.
 */
static void rt6_info_init(struct rt6_info *rt)
{
        memset_after(rt, 0, dst);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
                               int flags)
{
        struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
                                        DST_OBSOLETE_FORCE_CHK, flags);

        if (rt) {
                rt6_info_init(rt);
                atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
        }

        return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

/*
 * dst_ops->destroy hook: release everything an rt6_info holds - its
 * metrics, its slot on the uncached list, its inet6_dev reference and the
 * fib6_info it was derived from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
        struct rt6_info *rt = dst_rt6_info(dst);
        struct inet6_dev *held_idev;
        struct fib6_info *origin;

        ip_dst_metrics_put(dst);
        rt6_uncached_list_del(rt);

        held_idev = rt->rt6i_idev;
        if (held_idev) {
                /* Clear the pointer before dropping the reference. */
                rt->rt6i_idev = NULL;
                in6_dev_put(held_idev);
        }

        /* Atomically take ownership of rt->from, then release it. */
        origin = unrcu_pointer(xchg(&rt->from, NULL));
        fib6_info_release(origin);
}

/*
 * dst_ops->ifdown hook: move the route's inet6_dev reference over to the
 * blackhole device's inet6_dev.  Nothing changes when the route has no
 * inet6_dev, already points at the blackhole device, or when the blackhole
 * device has no inet6_dev to take a reference on.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
{
        struct rt6_info *rt = dst_rt6_info(dst);
        struct inet6_dev *old_idev = rt->rt6i_idev;
        struct inet6_dev *bh_idev;

        if (!old_idev || old_idev->dev == blackhole_netdev)
                return;

        bh_idev = in6_dev_get(blackhole_netdev);
        if (!bh_idev)
                return;

        rt->rt6i_idev = bh_idev;
        in6_dev_put(old_idev);
}

/*
 * True when @rt carries RTF_EXPIRES and its dst expiry time has passed.
 * Routes without RTF_EXPIRES never expire here.
 */
static bool __rt6_check_expired(const struct rt6_info *rt)
{
        if (!(rt->rt6i_flags & RTF_EXPIRES))
                return false;

        return time_after(jiffies, rt->dst.expires);
}

/*
 * Like __rt6_check_expired(), but a route without RTF_EXPIRES that was
 * derived from a fib6_info is also considered expired when its dst is no
 * longer marked DST_OBSOLETE_FORCE_CHK or when that fib6_info has expired.
 *
 * rt->from is read with rcu_dereference(), so the caller is expected to be
 * inside an RCU read-side critical section.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
        struct fib6_info *from = rcu_dereference(rt->from);

        if (rt->rt6i_flags & RTF_EXPIRES)
                return time_after(jiffies, rt->dst.expires);

        if (!from)
                return false;

        if (rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK)
                return true;

        return fib6_check_expired(from);
}

/*
 * fib6_select_path - choose one path of a multipath route for a flow.
 * @net:            network namespace, passed to the hash function
 * @res:            in: res->f6i is the matched route (res->nh may already
 *                  be set); out: res->f6i / res->nh describe the chosen path
 * @fl6:            flow; fl6->mp_hash is computed here when still zero
 * @oif:            output interface, forwarded to rt6_score_route()
 * @have_oif_match: the caller already matched the route against an oif
 * @skb:            optional packet; flagged IP6SKB_MULTIPATH when a
 *                  multipath decision is taken
 * @strict:         RT6_LOOKUP_F_* flags forwarded to rt6_score_route()
 *
 * Routes using a nexthop object are resolved by nexthop_path_fib6_result().
 * For sibling-based multipath the first route (or sibling) whose upper
 * bound is not below the flow hash is selected; if that sibling scores
 * negative the original match is kept.
 */
void fib6_select_path(const struct net *net, struct fib6_result *res,
                      struct flowi6 *fl6, int oif, bool have_oif_match,
                      const struct sk_buff *skb, int strict)
{
        struct fib6_info *sibling, *next_sibling;
        struct fib6_info *match = res->f6i;

        /* Not multipath, or the oif already pinned this very route. */
        if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
                goto out;

        /* Nexthop object already resolved to a concrete nexthop. */
        if (match->nh && have_oif_match && res->nh)
                return;

        if (skb)
                IP6CB(skb)->flags |= IP6SKB_MULTIPATH;

        /* We might have already computed the hash for ICMPv6 errors. In such
         * case it will always be non-zero. Otherwise now is the time to do it.
         */
        if (!fl6->mp_hash &&
            (!match->nh || nexthop_is_multipath(match->nh)))
                fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

        if (unlikely(match->nh)) {
                nexthop_path_fib6_result(res, fl6->mp_hash);
                return;
        }

        /* The hash falls into the first route's range: keep it. */
        if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
                goto out;

        list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
                                 fib6_siblings) {
                const struct fib6_nh *nh = sibling->fib6_nh;
                int nh_upper_bound;

                nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
                if (fl6->mp_hash > nh_upper_bound)
                        continue;
                /* Hash-selected sibling is unusable: keep the first route. */
                if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
                        break;
                match = sibling;
                break;
        }

out:
        res->f6i = match;
        res->nh = match->fib6_nh;
}

/*
 *        Route lookup. rcu_read_lock() should be held.
 */

/*
 * Does nexthop @nh satisfy the lookup's device constraint?
 *
 * A dead nexthop never matches.  With an @oif the nexthop device's ifindex
 * must equal it; without one, @saddr must pass ipv6_chk_addr() on the
 * nexthop device (called with the RT6_LOOKUP_F_IFACE bit of @flags).
 */
static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
                               const struct in6_addr *saddr, int oif, int flags)
{
        const struct net_device *nh_dev;

        if (nh->fib_nh_flags & RTNH_F_DEAD)
                return false;

        nh_dev = nh->fib_nh_dev;

        if (oif)
                return nh_dev->ifindex == oif;

        return ipv6_chk_addr(net, saddr, nh_dev,
                             flags & RT6_LOOKUP_F_IFACE) ? true : false;
}

/*
 * Argument bundle for __rt6_nh_dev_match() when iterating the fib6_nh
 * entries of a nexthop object.  @net, @saddr, @oif and @flags are inputs;
 * @nh is written by the callback with the fib6_nh it examined last.
 */
struct fib6_nh_dm_arg {
        struct net                *net;
        const struct in6_addr        *saddr;
        int                        oif;
        int                        flags;
        struct fib6_nh                *nh;
};

static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
{
        struct fib6_nh_dm_arg *arg = _arg;

        arg->nh = nh;
        return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
                                  arg->flags);
}

/* returns fib6_nh from nexthop or NULL */
static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
                                        struct fib6_result *res,
                                        const struct in6_addr *saddr,
                                        int oif, int flags)
{
        struct fib6_nh_dm_arg arg = {
                .net   = net,
                .saddr = saddr,
                .oif   = oif,
                .flags = flags,
        };

        if (nexthop_is_blackhole(nh))
                return NULL;

        if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
                return arg.nh;

        return NULL;
}

/*
 * rt6_device_match - narrow a lookup result to a route and nexthop that
 * fit the requested output interface or source address.
 * @net:   network namespace (supplies fib6_null_entry)
 * @res:   in: res->f6i is the first candidate route; out: f6i, nh,
 *         fib6_type and fib6_flags describe the selection
 * @saddr: source address of the flow
 * @oif:   requested output interface index, 0 for none
 * @flags: RT6_LOOKUP_F_* flags
 *
 * Resolution order:
 *  1. No @oif and an unspecified @saddr: accept the first route's nexthop
 *     unless it is dead (a blackhole nexthop object is reported at once).
 *  2. Walk the routes chained through fib6_next and take the first whose
 *     nexthop passes the device match.
 *  3. If @oif was given together with RT6_LOOKUP_F_IFACE, fail with the
 *     null entry.
 *  4. Otherwise fall back to the first route, substituting the null entry
 *     when its nexthop is dead.
 *
 * fib6_next is read with rcu_dereference(), so rcu_read_lock() must be held.
 */
static void rt6_device_match(struct net *net, struct fib6_result *res,
                             const struct in6_addr *saddr, int oif, int flags)
{
        struct fib6_info *f6i = res->f6i;
        struct fib6_info *spf6i;
        struct fib6_nh *nh;

        if (!oif && ipv6_addr_any(saddr)) {
                if (unlikely(f6i->nh)) {
                        nh = nexthop_fib6_nh(f6i->nh);
                        if (nexthop_is_blackhole(f6i->nh))
                                goto out_blackhole;
                } else {
                        nh = f6i->fib6_nh;
                }
                if (!(nh->fib_nh_flags & RTNH_F_DEAD))
                        goto out;
        }

        for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
                bool matched = false;

                if (unlikely(spf6i->nh)) {
                        nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
                                              oif, flags);
                        if (nh)
                                matched = true;
                } else {
                        nh = spf6i->fib6_nh;
                        if (__rt6_device_match(net, nh, saddr, oif, flags))
                                matched = true;
                }
                if (matched) {
                        res->f6i = spf6i;
                        goto out;
                }
        }

        /* Strict interface match requested but nothing fits. */
        if (oif && flags & RT6_LOOKUP_F_IFACE) {
                res->f6i = net->ipv6.fib6_null_entry;
                nh = res->f6i->fib6_nh;
                goto out;
        }

        if (unlikely(f6i->nh)) {
                nh = nexthop_fib6_nh(f6i->nh);
                if (nexthop_is_blackhole(f6i->nh))
                        goto out_blackhole;
        } else {
                nh = f6i->fib6_nh;
        }

        if (nh->fib_nh_flags & RTNH_F_DEAD) {
                res->f6i = net->ipv6.fib6_null_entry;
                nh = res->f6i->fib6_nh;
        }
out:
        res->nh = nh;
        res->fib6_type = res->f6i->fib6_type;
        res->fib6_flags = res->f6i->fib6_flags;
        return;

out_blackhole:
        /* res->f6i is left as is; only type, flags and nh are overridden. */
        res->fib6_flags |= RTF_REJECT;
        res->fib6_type = RTN_BLACKHOLE;
        res->nh = nh;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Deferred router reachability probe, queued by rt6_probe() and consumed
 * (and freed) by rt6_probe_deferred().
 */
struct __rt6_probe_work {
        struct work_struct work;
        /* Gateway address to solicit. */
        struct in6_addr target;
        /* Device to send on; held via dev_tracker until the work has run. */
        struct net_device *dev;
        netdevice_tracker dev_tracker;
};

static void rt6_probe_deferred(struct work_struct *w)
{
        struct in6_addr mcaddr;
        struct __rt6_probe_work *work =
                container_of(w, struct __rt6_probe_work, work);

        addrconf_addr_solict_mult(&work->target, &mcaddr);
        ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
        netdev_put(work->dev, &work->dev_tracker);
        kfree(work);
}

/*
 * rt6_probe - schedule a rate-limited reachability probe of a gateway.
 * @fib6_nh: nexthop whose gateway should be probed
 *
 * Nothing happens for nexthops without a gateway or when the nexthop
 * device has no inet6_dev.  A probe work item is allocated when either
 *  - a neighbour entry exists, is not NUD_VALID and was last updated more
 *    than rtr_probe_interval ago, or
 *  - no neighbour entry exists and the last probe recorded in @fib6_nh is
 *    older than rtr_probe_interval.
 * The solicitation itself is sent later from rt6_probe_deferred().
 */
static void rt6_probe(struct fib6_nh *fib6_nh)
{
        struct __rt6_probe_work *work = NULL;
        const struct in6_addr *nh_gw;
        unsigned long last_probe;
        struct neighbour *neigh;
        struct net_device *dev;
        struct inet6_dev *idev;

        /*
         * Okay, this does not seem to be appropriate
         * for now, however, we need to check if it
         * is really so; aka Router Reachability Probing.
         *
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
        if (!fib6_nh->fib_nh_gw_family)
                return;

        nh_gw = &fib6_nh->fib_nh_gw6;
        dev = fib6_nh->fib_nh_dev;
        rcu_read_lock();
        /* Snapshot used both for the interval test and the cmpxchg below. */
        last_probe = READ_ONCE(fib6_nh->last_probe);
        idev = __in6_dev_get(dev);
        if (!idev)
                goto out;
        neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
        if (neigh) {
                /* Lockless fast path: a valid neighbour needs no probe. */
                if (READ_ONCE(neigh->nud_state) & NUD_VALID)
                        goto out;

                write_lock_bh(&neigh->lock);
                /* Re-check the state now that the lock is held. */
                if (!(neigh->nud_state & NUD_VALID) &&
                    time_after(jiffies,
                               neigh->updated +
                               READ_ONCE(idev->cnf.rtr_probe_interval))) {
                        work = kmalloc(sizeof(*work), GFP_ATOMIC);
                        if (work)
                                __neigh_set_probe_once(neigh);
                }
                write_unlock_bh(&neigh->lock);
        } else if (time_after(jiffies, last_probe +
                                       READ_ONCE(idev->cnf.rtr_probe_interval))) {
                work = kmalloc(sizeof(*work), GFP_ATOMIC);
        }

        /* Only the caller that wins the cmpxchg on last_probe queues the
         * work; everyone else frees its allocation (kfree(NULL) is fine).
         */
        if (!work || cmpxchg(&fib6_nh->last_probe,
                             last_probe, jiffies) != last_probe) {
                kfree(work);
        } else {
                INIT_WORK(&work->work, rt6_probe_deferred);
                work->target = *nh_gw;
                /* Reference is dropped by rt6_probe_deferred(). */
                netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC);
                work->dev = dev;
                schedule_work(&work->work);
        }

out:
        rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no reachability probes are sent. */
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
/*
 * rt6_check_neigh - classify the reachability of a nexthop's gateway.
 * @fib6_nh: nexthop whose gateway neighbour entry is inspected
 *
 * Neighbour entry found:
 *   NUD_VALID                     -> RT6_NUD_SUCCEED
 *   with CONFIG_IPV6_ROUTER_PREF:
 *     not NUD_FAILED              -> RT6_NUD_SUCCEED
 *     NUD_FAILED                  -> RT6_NUD_FAIL_PROBE
 *   without CONFIG_IPV6_ROUTER_PREF, not valid
 *                                 -> RT6_NUD_FAIL_HARD (initial value)
 * No neighbour entry:
 *   RT6_NUD_SUCCEED with CONFIG_IPV6_ROUTER_PREF, else RT6_NUD_FAIL_DO_RR.
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
        enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
        struct neighbour *neigh;

        rcu_read_lock();
        /* _noref lookup: the entry is only used inside this RCU section. */
        neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
                                          &fib6_nh->fib_nh_gw6);
        if (neigh) {
                u8 nud_state = READ_ONCE(neigh->nud_state);

                if (nud_state & NUD_VALID)
                        ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
                else if (!(nud_state & NUD_FAILED))
                        ret = RT6_NUD_SUCCEED;
                else
                        ret = RT6_NUD_FAIL_PROBE;
#endif
        } else {
                ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
                      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
        }
        rcu_read_unlock();

        return ret;
}

/*
 * rt6_score_route - score a nexthop for a lookup.
 * @nh:         nexthop being scored
 * @fib6_flags: flags of the route the nexthop belongs to
 * @oif:        requested output interface index, 0 for none
 * @strict:     RT6_LOOKUP_F_* flags
 *
 * Returns a non-negative score (2 when the interface constraint is met,
 * with the decoded router preference OR-ed in above bit 1 when
 * CONFIG_IPV6_ROUTER_PREF is set), or a negative rt6_nud_state code:
 * RT6_NUD_FAIL_HARD when RT6_LOOKUP_F_IFACE is requested and the interface
 * does not match, or whatever negative value rt6_check_neigh() reports
 * when RT6_LOOKUP_F_REACHABLE applies to a gateway route.
 */
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
                           int strict)
{
        int score;
        int neigh_rc;

        if (!oif || nh->fib_nh_dev->ifindex == oif)
                score = 2;
        else if (strict & RT6_LOOKUP_F_IFACE)
                return RT6_NUD_FAIL_HARD;
        else
                score = 0;

#ifdef CONFIG_IPV6_ROUTER_PREF
        score |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif

        /* Reachability only matters for gateway routes when asked for. */
        if (!(strict & RT6_LOOKUP_F_REACHABLE))
                return score;
        if (fib6_flags & RTF_NONEXTHOP)
                return score;
        if (!nh->fib_nh_gw_family)
                return score;

        neigh_rc = rt6_check_neigh(nh);
        if (neigh_rc < 0)
                return neigh_rc;

        return score;
}

/*
 * find_match - test whether @nh beats the best score seen so far.
 * @nh:         candidate nexthop
 * @fib6_flags: flags of the owning route
 * @oif:        requested output interface index, 0 for none
 * @strict:     RT6_LOOKUP_F_* flags
 * @mpri:       in/out: best score so far
 * @do_rr:      out: set when the new best candidate asked for round-robin
 *
 * Dead nexthops are skipped, as are link-down ones when the device ignores
 * link-down routes and RT6_LOOKUP_F_IGNORE_LINKSTATE is not set.  Returns
 * true, updating *mpri and *do_rr, only when the candidate scores strictly
 * higher than *mpri.
 */
static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
                       int oif, int strict, int *mpri, bool *do_rr)
{
        bool wants_rr = false;
        int score;

        if (nh->fib_nh_flags & RTNH_F_DEAD)
                return false;

        if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
            (nh->fib_nh_flags & RTNH_F_LINKDOWN) &&
            !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
                return false;

        score = rt6_score_route(nh, fib6_flags, oif, strict);
        if (score == RT6_NUD_FAIL_HARD)
                return false;

        if (score == RT6_NUD_FAIL_DO_RR) {
                wants_rr = true;
                score = 0; /* lowest valid score */
        }

        if (strict & RT6_LOOKUP_F_REACHABLE)
                rt6_probe(nh);

        /* score may still be RT6_NUD_FAIL_PROBE here */
        if (score <= *mpri)
                return false;

        *do_rr = wants_rr;
        *mpri = score;
        return true;
}

/*
 * Argument bundle for rt6_nh_find_match() when iterating the fib6_nh
 * entries of a nexthop object.  @flags, @oif and @strict are inputs,
 * @mpri and @do_rr point at the caller's running best score and
 * round-robin flag, and @nh is written with the fib6_nh examined last.
 */
struct fib6_nh_frl_arg {
        u32                flags;
        int                oif;
        int                strict;
        int                *mpri;
        bool                *do_rr;
        struct fib6_nh        *nh;
};

static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
        struct fib6_nh_frl_arg *arg = _arg;

        arg->nh = nh;
        return find_match(nh, arg->flags, arg->oif, arg->strict,
                          arg->mpri, arg->do_rr);
}

/*
 * __find_rr_leaf - scan a run of routes for the best-scoring nexthop.
 * @f6i_start: first route to examine
 * @nomatch:   scanning stops (exclusive) at this route; may be NULL
 * @metric:    metric shared by the routes of interest
 * @res:       updated every time a better candidate is found
 * @cont:      when non-NULL, receives the first route whose metric differs
 *             from @metric, at which point the scan stops
 * @oif:       requested output interface index, forwarded to find_match()
 * @strict:    RT6_LOOKUP_F_* flags, forwarded to find_match()
 * @do_rr:     running round-robin flag, updated by find_match()
 * @mpri:      running best score, updated by find_match()
 *
 * Expired routes are skipped.  A route whose nexthop object is a blackhole
 * ends the scan immediately with a reject/blackhole result.  The route
 * chain is read with rcu_dereference().
 */
static void __find_rr_leaf(struct fib6_info *f6i_start,
                           struct fib6_info *nomatch, u32 metric,
                           struct fib6_result *res, struct fib6_info **cont,
                           int oif, int strict, bool *do_rr, int *mpri)
{
        struct fib6_info *f6i;

        for (f6i = f6i_start;
             f6i && f6i != nomatch;
             f6i = rcu_dereference(f6i->fib6_next)) {
                bool matched = false;
                struct fib6_nh *nh;

                /* Reached the next metric group: let the caller resume here. */
                if (cont && f6i->fib6_metric != metric) {
                        *cont = f6i;
                        return;
                }

                if (fib6_check_expired(f6i))
                        continue;

                if (unlikely(f6i->nh)) {
                        struct fib6_nh_frl_arg arg = {
                                .flags  = f6i->fib6_flags,
                                .oif    = oif,
                                .strict = strict,
                                .mpri   = mpri,
                                .do_rr  = do_rr
                        };

                        if (nexthop_is_blackhole(f6i->nh)) {
                                res->fib6_flags = RTF_REJECT;
                                res->fib6_type = RTN_BLACKHOLE;
                                res->f6i = f6i;
                                res->nh = nexthop_fib6_nh(f6i->nh);
                                return;
                        }
                        if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
                                                     &arg)) {
                                matched = true;
                                nh = arg.nh;
                        }
                } else {
                        nh = f6i->fib6_nh;
                        if (find_match(nh, f6i->fib6_flags, oif, strict,
                                       mpri, do_rr))
                                matched = true;
                }
                /* nh is only read when matched, i.e. after being assigned. */
                if (matched) {
                        res->f6i = f6i;
                        res->nh = nh;
                        res->fib6_flags = f6i->fib6_flags;
                        res->fib6_type = f6i->fib6_type;
                }
        }
}

/*
 * find_rr_leaf - find the best route of a node, starting at the
 * round-robin head.
 * @fn:      node being searched (not used in this function)
 * @leaf:    first route of the node
 * @rr_head: route at which the round-robin scan starts
 * @oif:     requested output interface index, 0 for none
 * @strict:  RT6_LOOKUP_F_* flags
 * @do_rr:   out: set when the best candidate requested round-robin
 * @res:     out: best match, left untouched when nothing matched
 */
static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
                         struct fib6_info *rr_head, int oif, int strict,
                         bool *do_rr, struct fib6_result *res)
{
        const u32 rr_metric = rr_head->fib6_metric;
        struct fib6_info *next_group = NULL;
        int best = -1;

        /* Pass 1: from the round-robin head onwards, same metric only. */
        __find_rr_leaf(rr_head, NULL, rr_metric, res, &next_group,
                       oif, strict, do_rr, &best);

        /* Pass 2: wrap around, from the leaf up to the round-robin head. */
        __find_rr_leaf(leaf, rr_head, rr_metric, res, &next_group,
                       oif, strict, do_rr, &best);

        /* Pass 3: nothing matched yet, so try the routes with other
         * metrics, this time without any metric restriction.
         */
        if (!res->f6i && next_group)
                __find_rr_leaf(next_group, NULL, rr_metric, res, NULL,
                               oif, strict, do_rr, &best);
}

/*
 * rt6_select - select the route and nexthop to use from a FIB node.
 * @net:    network namespace (supplies fib6_null_entry)
 * @fn:     node whose routes are considered
 * @oif:    requested output interface index, 0 for none
 * @res:    out: always filled in; falls back to the null entry when the
 *          node has no usable route
 * @strict: RT6_LOOKUP_F_* flags
 *
 * The search starts at the node's round-robin pointer (or its leaf).  When
 * the chosen candidate asked for round-robin, the pointer is advanced to
 * the next route of the same metric under the table lock.  fn->leaf,
 * fn->rr_ptr and fib6_next are read with rcu_dereference().
 */
static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
                       struct fib6_result *res, int strict)
{
        struct fib6_info *leaf = rcu_dereference(fn->leaf);
        struct fib6_info *rt0;
        bool do_rr = false;
        int key_plen;

        /* make sure this function or its helpers sets f6i */
        res->f6i = NULL;

        if (!leaf || leaf == net->ipv6.fib6_null_entry)
                goto out;

        rt0 = rcu_dereference(fn->rr_ptr);
        if (!rt0)
                rt0 = leaf;

        /* Double check to make sure fn is not an intermediate node
         * and fn->leaf does not point to its child's leaf
         * (This might happen if all routes under fn are deleted from
         * the tree and fib6_repair_tree() is called on the node.)
         */
        key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
        if (rt0->fib6_src.plen)
                key_plen = rt0->fib6_src.plen;
#endif
        if (fn->fn_bit != key_plen)
                goto out;

        find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
        if (do_rr) {
                struct fib6_info *next = rcu_dereference(rt0->fib6_next);

                /* no entries matched; do round-robin */
                if (!next || next->fib6_metric != rt0->fib6_metric)
                        next = leaf;

                if (next != rt0) {
                        spin_lock_bh(&leaf->fib6_table->tb6_lock);
                        /* make sure next is not being deleted from the tree */
                        if (next->fib6_node)
                                rcu_assign_pointer(fn->rr_ptr, next);
                        spin_unlock_bh(&leaf->fib6_table->tb6_lock);
                }
        }

out:
        /* Nothing selected: report the namespace's null entry. */
        if (!res->f6i) {
                res->f6i = net->ipv6.fib6_null_entry;
                res->nh = res->f6i->fib6_nh;
                res->fib6_flags = res->f6i->fib6_flags;
                res->fib6_type = res->f6i->fib6_type;
        }
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
        return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
               res->nh->fib_nh_gw_family;
}

#ifdef CONFIG_IPV6_ROUTE_INFO
/* Handle a received route_info option for @dev.
 *
 * @dev:    device the option was received on
 * @opt:    start of the option, interpreted as a struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address used to look up / create the route
 *
 * Validates the option, looks up the matching route and then deletes it
 * (zero lifetime), creates it (no existing route, non-zero lifetime) or
 * refreshes its preference flags, and finally updates its expiry.
 *
 * Returns 0 when the option was accepted (even if no route ended up being
 * created), or -EINVAL when the option fails validation.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        struct fib6_table *table;
        unsigned int pref;
        unsigned long lifetime;
        struct fib6_info *rt;

        /* the fixed part of the option must be present */
        if (len < sizeof(struct route_info)) {
                return -EINVAL;
        }

        /* Sanity check for prefix_len and length: a longer prefix requires
         * a larger length field, and neither may exceed its maximum.
         */
        if (rinfo->length > 3) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 128) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 64) {
                if (rinfo->length < 2) {
                        return -EINVAL;
                }
        } else if (rinfo->prefix_len > 0) {
                if (rinfo->length < 1) {
                        return -EINVAL;
                }
        }

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        /* length 3: use the prefix bytes inside the option directly;
         * otherwise build the prefix in a local buffer from prefix_len bits
         */
        if (rinfo->length == 3)
                prefix = (struct in6_addr *)rinfo->prefix;
        else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        /* a zero-length prefix is looked up as a default router entry */
        if (rinfo->prefix_len == 0)
                rt = rt6_get_dflt_router(net, gwaddr, dev);
        else
                rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
                                        gwaddr, dev);

        /* zero lifetime: remove the existing route */
        if (rt && !lifetime) {
                ip6_del_rt(net, rt, false);
                rt = NULL;
        }

        /* create a missing route, or refresh the flags of an existing one */
        if (!rt && lifetime)
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                        dev, pref);
        else if (rt)
                rt->fib6_flags = RTF_ROUTEINFO |
                                 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (rt) {
                /* expiry and gc-list membership are updated under tb6_lock */
                table = rt->fib6_table;
                spin_lock_bh(&table->tb6_lock);

                if (!addrconf_finite_timeout(lifetime)) {
                        fib6_clean_expires(rt);
                        fib6_remove_gc_list(rt);
                } else {
                        fib6_set_expires(rt, jiffies + HZ * lifetime);
                        fib6_add_gc_list(rt);
                }

                spin_unlock_bh(&table->tb6_lock);

                /* NOTE(review): presumably drops the reference returned by
                 * the lookup/add helpers above - confirm against them
                 */
                fib6_info_release(rt);
        }
        return 0;
}
#endif

/*
 *        Misc support functions
 */

/* called with rcu_lock held
 *
 * Choose the device for a dst built from @res.  Routes without RTF_LOCAL or
 * RTF_ANYCAST keep the nexthop device.  For copies of local routes the
 * result is the L3 master of an enslaved device (unless the destination
 * needs strict matching), the device itself if it is an L3 master, and the
 * loopback device otherwise.
 */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
        struct net_device *dev = res->nh->fib_nh_dev;

        if (!(res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)))
                return dev;

        if (netif_is_l3_slave(dev) &&
            !rt6_need_strict(&res->f6i->fib6_dst.addr))
                return l3mdev_master_dev_rcu(dev);

        if (!netif_is_l3_master(dev))
                return dev_net(dev)->loopback_dev;

        /* dev is an L3 master: return it unchanged */
        return dev;
}

/* dst error code per route type (RTN_*), indexed by the route type */
static const int fib6_prop[RTN_MAX + 1] = {
        /* route types that carry no error */
        [RTN_UNSPEC]            = 0,
        [RTN_UNICAST]           = 0,
        [RTN_LOCAL]             = 0,
        [RTN_BROADCAST]         = 0,
        [RTN_ANYCAST]           = 0,
        [RTN_MULTICAST]         = 0,

        /* route types that map to an error */
        [RTN_UNREACHABLE]       = -EHOSTUNREACH,
        [RTN_PROHIBIT]          = -EACCES,
        [RTN_THROW]             = -EAGAIN,
        [RTN_BLACKHOLE]         = -EINVAL,
        [RTN_NAT]               = -EINVAL,
        [RTN_XRESOLVE]          = -EINVAL,
};

/* Map a route type to its dst error code via fib6_prop[].
 * The index is not range-checked here.
 */
static int ip6_rt_type_to_error(u8 fib6_type)
{
        int err = fib6_prop[fib6_type];

        return err;
}

/* Translate the dst_nocount / dst_nopolicy bits of @rt into DST_* flags. */
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
        unsigned short flags;

        flags = rt->dst_nocount ? DST_NOCOUNT : 0;
        flags |= rt->dst_nopolicy ? DST_NOPOLICY : 0;

        return flags;
}

/* Set up @rt as a reject dst: store the error for @fib6_type and install the
 * matching input/output handlers.
 */
static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
        rt->dst.error = ip6_rt_type_to_error(fib6_type);

        if (fib6_type == RTN_BLACKHOLE) {
                rt->dst.output = dst_discard_out;
                rt->dst.input = dst_discard;
        } else if (fib6_type == RTN_PROHIBIT) {
                rt->dst.output = ip6_pkt_prohibit_out;
                rt->dst.input = ip6_pkt_prohibit;
        } else {
                /* RTN_THROW, RTN_UNREACHABLE and every other type */
                rt->dst.output = ip6_pkt_discard_out;
                rt->dst.input = ip6_pkt_discard;
        }
}

/* Initialize the dst part of @rt from the lookup result @res.
 *
 * RTF_REJECT results are handed to ip6_rt_init_dst_reject() and nothing else
 * is touched.  Otherwise the error is cleared, the output handler is
 * ip6_output, the input handler depends on the route type / destination
 * address type, any lwtunnel state of the nexthop is attached, and lastuse
 * is stamped with the current jiffies.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
        struct fib6_info *fi = res->f6i;

        if (res->fib6_flags & RTF_REJECT) {
                ip6_rt_init_dst_reject(rt, res->fib6_type);
                return;
        }

        rt->dst.error = 0;
        rt->dst.output = ip6_output;

        switch (res->fib6_type) {
        case RTN_LOCAL:
        case RTN_ANYCAST:
                rt->dst.input = ip6_input;
                break;
        default:
                if (ipv6_addr_type(&fi->fib6_dst.addr) & IPV6_ADDR_MULTICAST)
                        rt->dst.input = ip6_mc_input;
                else
                        rt->dst.input = ip6_forward;
                break;
        }

        if (res->nh->fib_nh_lws) {
                rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
                lwtunnel_set_redirect(&rt->dst);
        }

        rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from
 *
 * Attach @from to @rt: clear RTF_EXPIRES on @rt, publish @from through the
 * RCU-protected rt->from pointer, and initialize the dst metrics from
 * @from's metrics.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
        rt->rt6i_flags &= ~RTF_EXPIRES;
        rcu_assign_pointer(rt->from, from);
        ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to f6i in result
 *
 * Fill a freshly allocated @rt from @res: dst handlers, destination prefix,
 * inet6 device of the nexthop device (if any), route flags, gateway (when
 * the nexthop has one), the 'from' back-pointer and, with
 * CONFIG_IPV6_SUBTREES, the source prefix.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
        struct fib6_info *fi = res->f6i;
        const struct fib6_nh *nexthop = res->nh;
        const struct net_device *ndev = nexthop->fib_nh_dev;

        ip6_rt_init_dst(rt, res);

        rt->rt6i_dst = fi->fib6_dst;

        if (ndev)
                rt->rt6i_idev = in6_dev_get(ndev);
        else
                rt->rt6i_idev = NULL;

        rt->rt6i_flags = res->fib6_flags;
        if (nexthop->fib_nh_gw_family) {
                rt->rt6i_flags |= RTF_GATEWAY;
                rt->rt6i_gateway = nexthop->fib_nh_gw6;
        }

        /* must follow the rt6i_flags assignment: it clears RTF_EXPIRES */
        rt6_set_from(rt, fi);
#ifdef CONFIG_IPV6_SUBTREES
        rt->rt6i_src = fi->fib6_src;
#endif
}

/* Walk from @fn towards the root looking for the next node that carries
 * route info (RTN_RTINFO).  At each step, if the parent has a subtree other
 * than the node we came from, that subtree is searched with @saddr;
 * otherwise the walk moves to the parent.
 *
 * Returns the node found, or NULL once a top-level root (RTN_TL_ROOT) is
 * reached.
 */
static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
                                        struct in6_addr *saddr)
{
        struct fib6_node *parent, *subtree;

        for (;;) {
                if (fn->fn_flags & RTN_TL_ROOT)
                        return NULL;

                parent = rcu_dereference(fn->parent);
                subtree = FIB6_SUBTREE(parent);

                if (!subtree || subtree == fn)
                        fn = parent;
                else
                        fn = fib6_node_lookup(subtree, NULL, saddr);

                if (fn->fn_flags & RTN_RTINFO)
                        return fn;
        }
}

/* Try to take a reference on *@prt.
 *
 * Returns true when dst_hold_safe() succeeded and *@prt is left untouched.
 * Otherwise returns false and replaces *@prt with the held ip6_null_entry
 * of @net, or with NULL when @net is NULL.
 */
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
        struct rt6_info *rt = *prt;

        if (dst_hold_safe(&rt->dst))
                return true;

        if (!net) {
                *prt = NULL;
                return false;
        }

        rt = net->ipv6.ip6_null_entry;
        dst_hold(&rt->dst);
        *prt = rt;

        return false;
}

/* called with rcu_lock held
 *
 * Allocate a dst for the route in @res and initialize it via
 * ip6_rt_copy_init().  If the reference on res->f6i cannot be taken or the
 * allocation fails, the held ip6_null_entry of the nexthop device's netns
 * is returned instead.
 */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
        struct net_device *dev = res->nh->fib_nh_dev;
        struct fib6_info *f6i = res->f6i;
        struct rt6_info *nrt;

        if (fib6_info_hold_safe(f6i)) {
                unsigned short flags = fib6_info_dst_flags(f6i);

                nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
                if (nrt) {
                        ip6_rt_copy_init(nrt, res);
                        return nrt;
                }

                /* allocation failed: drop the reference taken above */
                fib6_info_release(f6i);
        }

        nrt = dev_net(dev)->ipv6.ip6_null_entry;
        dst_hold(&nrt->dst);

        return nrt;
}

/* Lookup callback handed to fib6_rule_lookup(): resolve @fl6 in @table.
 *
 * Runs entirely under rcu_read_lock().  Starting from the node matching the
 * flow's addresses, the leaf is matched against the flow's oif; while the
 * result is the null entry the lookup backtracks through the tree.  A
 * matching non-reject route goes through path selection and the exception
 * (cache) table; if no cached entry exists a new rt6_info is created.
 *
 * Every return path yields a non-NULL rt6_info: a cached entry, a newly
 * created one, or the ip6_null_entry of @net (each path either takes a
 * reference or allocates).
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags)
{
        struct fib6_result res = {};
        struct fib6_node *fn;
        struct rt6_info *rt;

        rcu_read_lock();
        fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        res.f6i = rcu_dereference(fn->leaf);
        if (!res.f6i)
                res.f6i = net->ipv6.fib6_null_entry;
        else
                rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
                                 flags);

        if (res.f6i == net->ipv6.fib6_null_entry) {
                /* nothing usable at this node: try further up the tree */
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto restart;

                /* backtracking exhausted: answer with the null entry */
                rt = net->ipv6.ip6_null_entry;
                dst_hold(&rt->dst);
                goto out;
        } else if (res.fib6_flags & RTF_REJECT) {
                /* reject routes skip path selection and the cache lookup */
                goto do_create;
        }

        fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
                         fl6->flowi6_oif != 0, skb, flags);

        /* Search through exception table */
        rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
        if (rt) {
                /* on a failed hold, rt has been replaced by the null entry */
                if (ip6_hold_safe(net, &rt))
                        dst_use_noref(&rt->dst, jiffies);
        } else {
do_create:
                rt = ip6_create_rt_rcu(&res);
        }

out:
        trace_fib6_table_lookup(net, &res, table, fl6);

        rcu_read_unlock();

        return rt;
}

/* Resolve @fl6 through fib6_rule_lookup() using ip6_pol_route_lookup() as
 * the per-table lookup callback, and return the resulting dst.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
                                   const struct sk_buff *skb, int flags)
{
        struct dst_entry *dst;

        dst = fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
                            const struct in6_addr *saddr, int oif,
                            const struct sk_buff *skb, int strict)
{
        struct flowi6 fl6 = {
                .flowi6_oif = oif,
                .daddr = *daddr,
        };
        struct dst_entry *dst;
        int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

        if (saddr) {
                memcpy(&fl6.saddr, saddr, sizeof(*saddr));
                flags |= RT6_LOOKUP_F_HAS_SADDR;
        }

        dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
        if (dst->error == 0)
                return dst_rt6_info(dst);

        dst_release(dst);

        return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with table->tb6_lock NOT held; the lock is taken
 * internally.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

/* Insert @rt into its own table (rt->fib6_table) with fib6_add(), holding
 * the table's tb6_lock with BHs disabled for the duration of the call.
 * Returns the result of fib6_add().
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
                        struct netlink_ext_ack *extack)
{
        struct fib6_table *tb = rt->fib6_table;
        int rc;

        spin_lock_bh(&tb->tb6_lock);
        rc = fib6_add(&tb->tb6_root, rt, info, extack);
        spin_unlock_bh(&tb->tb6_lock);

        return rc;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
        struct nl_info info = {        .nl_net = net, };

        return __ip6_ins_rt(rt, &info, NULL);
}

/* Allocate an RTF_CACHE clone of the route in @res for destination @daddr.
 *
 * A reference is taken on res->f6i and dropped again if the dst allocation
 * fails.  The clone's destination becomes @daddr/128.  Unless the route is
 * a gateway or RTF_NONEXTHOP route, the clone is additionally marked
 * RTF_ANYCAST when @daddr equals the address of a non-/128 route prefix
 * and, with CONFIG_IPV6_SUBTREES, a non-empty source key is narrowed to
 * @saddr/128.
 *
 * Returns the clone, or NULL when the reference or the allocation failed.
 */
static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
                                           const struct in6_addr *daddr,
                                           const struct in6_addr *saddr)
{
        struct fib6_info *f6i = res->f6i;
        struct net_device *dev;
        struct rt6_info *clone;

        if (!fib6_info_hold_safe(f6i))
                return NULL;

        dev = ip6_rt_get_dev_rcu(res);
        clone = ip6_dst_alloc(dev_net(dev), dev, 0);
        if (!clone) {
                fib6_info_release(f6i);
                return NULL;
        }

        ip6_rt_copy_init(clone, res);
        clone->rt6i_flags |= RTF_CACHE;
        clone->rt6i_dst.plen = 128;
        clone->rt6i_dst.addr = *daddr;

        if (rt6_is_gw_or_nonexthop(res))
                return clone;

        if (f6i->fib6_dst.plen != 128 &&
            ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
                clone->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
        if (clone->rt6i_src.plen && saddr) {
                clone->rt6i_src.addr = *saddr;
                clone->rt6i_src.plen = 128;
        }
#endif

        return clone;
}

/* Allocate a per-cpu (RTF_PCPU) copy of the route in @res.
 *
 * Takes a reference on res->f6i, released again if the dst allocation
 * fails.  The dst is always allocated with DST_NOCOUNT in addition to the
 * flags derived from the fib6_info.  When the route uses a nexthop object
 * (f6i->nh), the copy records the current IPv6 route generation id in
 * sernum.
 *
 * Returns the new entry or NULL.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
        struct fib6_info *f6i = res->f6i;
        unsigned short dst_flags = fib6_info_dst_flags(f6i);
        struct rt6_info *copy;
        struct net_device *dev;

        if (!fib6_info_hold_safe(f6i))
                return NULL;

        dst_flags |= DST_NOCOUNT;

        rcu_read_lock();
        dev = ip6_rt_get_dev_rcu(res);
        copy = ip6_dst_alloc(dev_net(dev), dev, dst_flags);
        rcu_read_unlock();

        if (!copy) {
                fib6_info_release(f6i);
                return NULL;
        }

        ip6_rt_copy_init(copy, res);
        copy->rt6i_flags |= RTF_PCPU;

        if (f6i->nh)
                copy->sernum = rt_genid_ipv6(dev_net(dev));

        return copy;
}

/* True when @rt6's recorded sernum still equals the current IPv6 route
 * generation id of its device's netns.
 */
static bool rt6_is_valid(const struct rt6_info *rt6)
{
        struct net *net = dev_net(rt6->dst.dev);

        return rt6->sernum == rt_genid_ipv6(net);
}

/* It should be called with rcu_read_lock() acquired
 *
 * Return this CPU's cached rt6_info for the nexthop in @res, or NULL when
 * there is none.  An entry that carries a sernum which is no longer valid
 * is detached from the per-cpu slot, released, and NULL is returned.
 */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
        struct rt6_info *pcpu_rt;

        pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);

        /* sernum == 0 entries are never treated as stale here */
        if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) {
                struct rt6_info *prev, **p;

                p = this_cpu_ptr(res->nh->rt6i_pcpu);
                /* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */
                prev = xchg(p, NULL);
                /* only release what this xchg actually removed from the slot */
                if (prev) {
                        dst_dev_put(&prev->dst);
                        dst_release(&prev->dst);
                }

                pcpu_rt = NULL;
        }

        return pcpu_rt;
}

/* Allocate a per-cpu rt6_info for @res and install it in this CPU's slot of
 * the nexthop.
 *
 * The slot is expected to be empty: finding a previous entry is a BUG.
 * If the fib6_info is already marked fib6_destroying, the new entry's
 * 'from' pointer is cleared and that reference dropped right away.
 *
 * Returns the new entry, or NULL if the allocation failed.  @net is not
 * used by this function.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
                                            const struct fib6_result *res)
{
        struct rt6_info *pcpu_rt, *prev, **p;

        pcpu_rt = ip6_rt_pcpu_alloc(res);
        if (!pcpu_rt)
                return NULL;

        p = this_cpu_ptr(res->nh->rt6i_pcpu);
        /* install only if the slot is still NULL */
        prev = cmpxchg(p, NULL, pcpu_rt);
        BUG_ON(prev);

        /* checked after publishing the entry in the per-cpu slot */
        if (res->f6i->fib6_destroying) {
                struct fib6_info *from;

                from = unrcu_pointer(xchg(&pcpu_rt->from, NULL));
                fib6_info_release(from);
        }

        return pcpu_rt;
}

/* exception hash table implementation
 *
 * rt6_exception_lock is taken with spin_lock_bh() by the code in this file
 * that inserts into, removes from, or flushes a nexthop's exception buckets.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 *
 * @bucket: bucket that contains @rt6_ex; its depth is decremented
 * @rt6_ex: exception to remove
 *
 * Does nothing when either argument is NULL.  The exception entry itself is
 * freed via kfree_rcu(); the cached route only has its reference dropped.
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
                                 struct rt6_exception *rt6_ex)
{
        struct fib6_info *from;
        struct net *net;

        if (!bucket || !rt6_ex)
                return;

        net = dev_net(rt6_ex->rt6i->dst.dev);
        net->ipv6.rt6_stats->fib_rt_cache--;

        /* completely purge the exception to allow releasing the held
         * resources: some [sk] cache may keep the dst around for unlimited
         * time
         */
        from = unrcu_pointer(xchg(&rt6_ex->rt6i->from, NULL));
        fib6_info_release(from);
        dst_dev_put(&rt6_ex->rt6i->dst);

        /* unlink for RCU readers, then drop the reference on the route */
        hlist_del_rcu(&rt6_ex->hlist);
        dst_release(&rt6_ex->rt6i->dst);
        kfree_rcu(rt6_ex, rcu);
        /* depth is expected to be non-zero when an entry is removed */
        WARN_ON_ONCE(!bucket->depth);
        bucket->depth--;
}

/* Remove the rt6_ex with the earliest stamp from @bucket and free it.
 * Caller must hold rt6_exception_lock
 *
 * A NULL @bucket is ignored; an empty chain results in a no-op call to
 * rt6_remove_exception() with a NULL entry.
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
        struct rt6_exception *cur, *victim = NULL;

        if (!bucket)
                return;

        hlist_for_each_entry(cur, &bucket->chain, hlist) {
                if (victim && !time_before(cur->stamp, victim->stamp))
                        continue;
                victim = cur;
        }

        rt6_remove_exception(bucket, victim);
}

/* Compute the exception-table bucket index for a (@dst, @src) pair.
 *
 * The hash input is the destination address followed by the source address.
 * The source part stays zero unless CONFIG_IPV6_SUBTREES is enabled and
 * @src is non-NULL.  The input is run through siphash with a random key
 * that is initialized once, and the result is reduced with hash_64() to
 * FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
                              const struct in6_addr *src)
{
        static siphash_aligned_key_t rt6_exception_key;
        /* designated initializer: .src is zero-initialized */
        struct {
                struct in6_addr dst;
                struct in6_addr src;
        } __aligned(SIPHASH_ALIGNMENT) combined = {
                .dst = *dst,
        };
        u64 val;

        net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key));

#ifdef CONFIG_IPV6_SUBTREES
        if (src)
                combined.src = *src;
#endif
        val = siphash(&combined, sizeof(combined), &rt6_exception_key);

        return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Find the exception for (@daddr, @saddr) in the hash table.
 *
 * On entry *@bucket points at the first bucket of the table; when both
 * *@bucket and @daddr are non-NULL it is advanced to the bucket selected by
 * the hash of the address pair, whether or not a match is found.  With
 * CONFIG_IPV6_SUBTREES a non-NULL @saddr must also equal the entry's source
 * address.
 *
 * Caller must hold rt6_exception_lock
 *
 * Returns the matching exception or NULL.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
                              const struct in6_addr *daddr,
                              const struct in6_addr *saddr)
{
        struct rt6_exception *rt6_ex;

        if (!(*bucket) || !daddr)
                return NULL;

        *bucket += rt6_exception_hash(daddr, saddr);

        hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
                struct rt6_info *rt6 = rt6_ex->rt6i;

                if (!ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr))
                        continue;
#ifdef CONFIG_IPV6_SUBTREES
                if (saddr && !ipv6_addr_equal(saddr, &rt6->rt6i_src.addr))
                        continue;
#endif
                return rt6_ex;
        }

        return NULL;
}

/* RCU-reader variant: find the exception for (@daddr, @saddr) in the hash
 * table.
 *
 * On entry *@bucket points at the first bucket of the table; when both
 * *@bucket and @daddr are non-NULL it is advanced to the bucket selected by
 * the hash of the address pair, whether or not a match is found.  With
 * CONFIG_IPV6_SUBTREES a non-NULL @saddr must also equal the entry's source
 * address.
 *
 * Caller must hold rcu_read_lock(); a warning is emitted once otherwise.
 *
 * Returns the matching exception or NULL.
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
                         const struct in6_addr *daddr,
                         const struct in6_addr *saddr)
{
        struct rt6_exception *rt6_ex;

        WARN_ON_ONCE(!rcu_read_lock_held());

        if (!(*bucket) || !daddr)
                return NULL;

        *bucket += rt6_exception_hash(daddr, saddr);

        hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
                struct rt6_info *rt6 = rt6_ex->rt6i;

                if (!ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr))
                        continue;
#ifdef CONFIG_IPV6_SUBTREES
                if (saddr && !ipv6_addr_equal(saddr, &rt6->rt6i_src.addr))
                        continue;
#endif
                return rt6_ex;
        }

        return NULL;
}

/* Compute the MTU for the route/nexthop pair in @res.
 *
 * Uses the route's fib6_pmtu when it is non-zero, otherwise the mtu6 setting
 * of the nexthop device's inet6_dev.  The value is capped at IP6_MAX_MTU and
 * then reduced by the lwtunnel headroom of the nexthop.
 */
static unsigned int fib6_mtu(const struct fib6_result *res)
{
        const struct fib6_nh *nh = res->nh;
        unsigned int mtu;

        if (res->f6i->fib6_pmtu) {
                mtu = res->f6i->fib6_pmtu;
        } else {
                struct net_device *dev = nh->fib_nh_dev;
                struct inet6_dev *idev;

                rcu_read_lock();
                /* NOTE(review): idev is dereferenced without a NULL check;
                 * confirm __in6_dev_get() cannot return NULL for a nexthop
                 * device on this path
                 */
                idev = __in6_dev_get(dev);
                mtu = READ_ONCE(idev->cnf.mtu6);
                rcu_read_unlock();
        }

        mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

        return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}

/* Flag kept in the low bit of nh->rt6i_exception_bucket */
#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL

/* Return the nexthop's exception bucket array with the flushed bit masked
 * off.  Used when the flushed bit is not relevant and only access to the
 * bucket is needed (ie., all bucket users except rt6_insert_exception).
 *
 * @lock: when non-NULL, the pointer is read with rcu_dereference_protected()
 *        against that lock; when NULL, with rcu_dereference()
 *
 * called under rcu lock; sometimes called with rt6_exception_lock held
 */
static
struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
                                                       spinlock_t *lock)
{
        struct rt6_exception_bucket *bucket;
        unsigned long raw;

        if (!lock)
                bucket = rcu_dereference(nh->rt6i_exception_bucket);
        else
                bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
                                                   lockdep_is_held(lock));

        if (!bucket)
                return NULL;

        /* strip the flushed bit, if set */
        raw = (unsigned long)bucket & ~FIB6_EXCEPTION_BUCKET_FLUSHED;

        return (struct rt6_exception_bucket *)raw;
}

/* True when the FIB6_EXCEPTION_BUCKET_FLUSHED bit is set in the (raw,
 * unmasked) bucket pointer value.
 */
static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
{
        return ((unsigned long)bucket & FIB6_EXCEPTION_BUCKET_FLUSHED) != 0;
}

/* called with rt6_exception_lock held
 *
 * Set the FIB6_EXCEPTION_BUCKET_FLUSHED bit in nh->rt6i_exception_bucket
 * and publish the tagged pointer.
 */
static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
                                              spinlock_t *lock)
{
        struct rt6_exception_bucket *cur, *tagged;

        cur = rcu_dereference_protected(nh->rt6i_exception_bucket,
                                        lockdep_is_held(lock));

        tagged = (struct rt6_exception_bucket *)
                 ((unsigned long)cur | FIB6_EXCEPTION_BUCKET_FLUSHED);

        rcu_assign_pointer(nh->rt6i_exception_bucket, tagged);
}

/* Insert the cached route @nrt into the exception table of the nexthop in
 * @res.
 *
 * The bucket array is allocated on first use.  An existing exception for the
 * same (destination[, source]) key is replaced.  After the insert, the
 * oldest entries of the bucket are removed while its depth exceeds a
 * randomized limit.  On success the sernum of the parent route is updated
 * under the table lock and fib6 gc is started.
 *
 * Returns 0 on success, -ENOMEM if an allocation failed, or -EINVAL when the
 * bucket has been flushed or @nrt's raw MTU metric is not smaller than the
 * MTU computed for @res.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
                                const struct fib6_result *res)
{
        struct net *net = dev_net(nrt->dst.dev);
        struct rt6_exception_bucket *bucket;
        struct fib6_info *f6i = res->f6i;
        struct in6_addr *src_key = NULL;
        struct rt6_exception *rt6_ex;
        struct fib6_nh *nh = res->nh;
        int max_depth;
        int err = 0;

        spin_lock_bh(&rt6_exception_lock);

        /* raw pointer, flushed bit included: it is tested below */
        bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
                                          lockdep_is_held(&rt6_exception_lock));
        if (!bucket) {
                bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
                                 GFP_ATOMIC);
                if (!bucket) {
                        err = -ENOMEM;
                        goto out;
                }
                rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
        } else if (fib6_nh_excptn_bucket_flushed(bucket)) {
                /* a flushed table must not be repopulated */
                err = -EINVAL;
                goto out;
        }

#ifdef CONFIG_IPV6_SUBTREES
        /* fib6_src.plen != 0 indicates f6i is in subtree
         * and exception table is indexed by a hash of
         * both fib6_dst and fib6_src.
         * Otherwise, the exception table is indexed by
         * a hash of only fib6_dst.
         */
        if (f6i->fib6_src.plen)
                src_key = &nrt->rt6i_src.addr;
#endif
        /* rt6_mtu_change() might lower mtu on f6i.
         * Only insert this exception route if its mtu
         * is less than f6i's mtu value.
         */
        if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
                err = -EINVAL;
                goto out;
        }

        /* this also advances bucket to the hash bucket for the key */
        rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
                                               src_key);
        /* drop any existing exception for the same key */
        if (rt6_ex)
                rt6_remove_exception(bucket, rt6_ex);

        rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
        if (!rt6_ex) {
                err = -ENOMEM;
                goto out;
        }
        rt6_ex->rt6i = nrt;
        rt6_ex->stamp = jiffies;
        hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
        bucket->depth++;
        net->ipv6.rt6_stats->fib_rt_cache++;

        /* Randomize max depth to avoid some side-channel attacks. */
        max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH);
        while (bucket->depth > max_depth)
                rt6_exception_remove_oldest(bucket);

out:
        spin_unlock_bh(&rt6_exception_lock);

        /* Update fn->fn_sernum to invalidate all cached dst */
        if (!err) {
                spin_lock_bh(&f6i->fib6_table->tb6_lock);
                fib6_update_sernum(net, f6i);
                spin_unlock_bh(&f6i->fib6_table->tb6_lock);
                fib6_force_start_gc(net);
        }

        return err;
}

/* Remove exceptions from the exception table of @nh.
 *
 * With @from == NULL every exception is removed and the table is marked as
 * flushed; each bucket is then expected to end up with depth 0.  With a
 * non-NULL @from only exceptions whose route points back to @from are
 * removed.  Takes rt6_exception_lock for the whole walk.
 */
static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
        struct rt6_exception_bucket *table;
        struct rt6_exception *entry;
        struct hlist_node *next;
        int idx;

        spin_lock_bh(&rt6_exception_lock);

        table = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
        if (table) {
                /* Prevent rt6_insert_exception() from recreating the bucket list */
                if (!from)
                        fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);

                for (idx = 0; idx < FIB6_EXCEPTION_BUCKET_SIZE; idx++) {
                        struct rt6_exception_bucket *bucket = &table[idx];

                        hlist_for_each_entry_safe(entry, next, &bucket->chain, hlist) {
                                if (from &&
                                    rcu_access_pointer(entry->rt6i->from) != from)
                                        continue;
                                rt6_remove_exception(bucket, entry);
                        }
                        WARN_ON_ONCE(!from && bucket->depth);
                }
        }

        spin_unlock_bh(&rt6_exception_lock);
}

/* nexthop_for_each_fib6_nh() callback: flush the exceptions of @nh that
 * belong to the fib6_info passed in @arg.  Always returns 0.
 */
static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
{
        fib6_nh_flush_exceptions(nh, (struct fib6_info *)arg);

        return 0;
}

/* Flush the exceptions that were created from @f6i: on its own fib6_nh, or
 * on every fib6_nh of its nexthop object when it uses one.
 */
void rt6_flush_exceptions(struct fib6_info *f6i)
{
        if (!f6i->nh) {
                fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
                return;
        }

        nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i);
}

/* Find the cached rt for (@daddr, @saddr) in the exception table of the
 * nexthop in @res.
 * Caller has to hold rcu_read_lock()
 *
 * Returns the cached route if one is found and it has not expired,
 * otherwise NULL.  No reference is taken here.
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
                                           const struct in6_addr *daddr,
                                           const struct in6_addr *saddr)
{
        const struct in6_addr *src_key = NULL;
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
        /* fib6_src.plen != 0 indicates f6i is in subtree
         * and exception table is indexed by a hash of
         * both fib6_dst and fib6_src.
         * However, the src addr used to create the hash
         * might not be exactly the passed in saddr which
         * is a /128 addr from the flow.
         * So we need to use f6i->fib6_src to redo lookup
         * if the passed in saddr does not find anything.
         * (See the logic in ip6_rt_cache_alloc() on how
         * rt->rt6i_src is updated.)
         */
        if (res->f6i->fib6_src.plen)
                src_key = saddr;
find_ex:
#endif
        /* the bucket pointer is re-read for each lookup attempt because
         * the find helper advances it to the hashed bucket
         */
        bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
        rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

        if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
                ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
        /* Use fib6_src as src_key and redo lookup (at most once: the
         * pointer comparison fails on the second pass)
         */
        if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
                src_key = &res->f6i->fib6_src.addr;
                goto find_ex;
        }
#endif

        return ret;
}

/* Remove the cached route @rt from the exception table of @nh.
 *
 * @plen: source prefix length of the route @rt was created from; with
 *        CONFIG_IPV6_SUBTREES a non-zero value means the table is keyed by
 *        both rt6i_dst and rt6i_src, otherwise only by rt6i_dst.
 *
 * Returns 0 when the exception was found and removed, -ENOENT when @nh has
 * no exception table or no matching entry.
 */
static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
                                    const struct rt6_info *rt)
{
        const struct in6_addr *src_key = NULL;
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *found;
        int err = -ENOENT;

        /* lockless early exit when no table was ever allocated */
        if (!rcu_access_pointer(nh->rt6i_exception_bucket))
                return -ENOENT;

#ifdef CONFIG_IPV6_SUBTREES
        if (plen)
                src_key = &rt->rt6i_src.addr;
#endif

        spin_lock_bh(&rt6_exception_lock);

        bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
        found = __rt6_find_exception_spinlock(&bucket, &rt->rt6i_dst.addr,
                                              src_key);
        if (found) {
                rt6_remove_exception(bucket, found);
                err = 0;
        }

        spin_unlock_bh(&rt6_exception_lock);

        return err;
}

/* Argument bundle passed through nexthop_for_each_fib6_nh() to
 * rt6_nh_remove_exception_rt().
 */
struct fib6_nh_excptn_arg {
        struct rt6_info        *rt;	/* cached route to remove */
        int                plen;	/* fib6_src.plen of the route it came from */
};

/* nexthop_for_each_fib6_nh() callback: try to remove the exception described
 * by @_arg from @nh.  Returns 1 when an entry was removed and 0 otherwise.
 */
static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
{
        struct fib6_nh_excptn_arg *arg = _arg;

        return fib6_nh_remove_exception(nh, arg->plen, arg->rt) == 0 ? 1 : 0;
}

/* Remove the cached route @rt from the exception table it lives in.
 *
 * rt->from is read with rcu_dereference().  Returns -EINVAL when @rt has no
 * 'from' route or is not an RTF_CACHE entry, 0 when the exception was found
 * and removed, and -ENOENT when no matching exception exists.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
        struct fib6_nh_excptn_arg arg;
        struct fib6_info *from;

        from = rcu_dereference(rt->from);
        if (!from || !(rt->rt6i_flags & RTF_CACHE))
                return -EINVAL;

        if (!from->nh)
                return fib6_nh_remove_exception(from->fib6_nh,
                                                from->fib6_src.plen, rt);

        arg.rt = rt;
        arg.plen = from->fib6_src.plen;

        /* a non-zero result means an entry was found */
        if (nexthop_for_each_fib6_nh(from->nh, rt6_nh_remove_exception_rt,
                                     &arg))
                return 0;

        return -ENOENT;
}

/* Find the rt6_ex in @nh's exception table that holds the cached route @rt
 * and refresh its stamp to the current jiffies.  Nothing happens when no
 * such entry exists.
 *
 * @plen: source prefix length of the route @rt came from; with
 *        CONFIG_IPV6_SUBTREES a non-zero value means the table is keyed by
 *        both rt6i_dst and rt6i_src, otherwise only by rt6i_dst.
 */
static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
                                     const struct rt6_info *rt)
{
        const struct in6_addr *src_key = NULL;
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *hit;

#ifdef CONFIG_IPV6_SUBTREES
        if (plen)
                src_key = &rt->rt6i_src.addr;
#endif
        bucket = fib6_nh_get_excptn_bucket(nh, NULL);
        hit = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
        if (!hit)
                return;

        hit->stamp = jiffies;
}

/* Search key and result slot for fib6_nh_find_match() */
struct fib6_nh_match_arg {
        const struct net_device *dev;	/* device the nexthop must use */
        const struct in6_addr        *gw;	/* gateway to match, or NULL for none */
        struct fib6_nh                *match;	/* set to the matching nexthop */
};

/* Determine if @nh has the device and gateway given in @_arg.
 *
 * The device must be identical.  If a gateway is requested, @nh must have a
 * gateway and it must equal the requested address; if none is requested,
 * @nh must not have one.
 *
 * Returns 1 and stores @nh in arg->match on a match (which stops the
 * nexthop iteration), 0 otherwise.
 */
static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
        struct fib6_nh_match_arg *arg = _arg;

        if (arg->dev != nh->fib_nh_dev)
                return 0;

        if (arg->gw) {
                if (!nh->fib_nh_gw_family ||
                    !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6))
                        return 0;
        } else if (nh->fib_nh_gw_family) {
                return 0;
        }

        arg->match = nh;

        return 1;
}

static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
        struct fib6_info *from;
        struct fib6_nh *fib6_nh;

        rcu_read_lock();

        from = rcu_dereference(rt->from);
        if (!from || !(rt->rt6i_flags & RTF_CACHE))
                goto unlock;

        if (from->nh) {
                struct fib6_nh_match_arg arg = {
                        .dev = rt->dst.dev,
                        .gw = &rt->rt6i_gateway,
                };

                nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);

                if (!arg.match)
                        goto unlock;
                fib6_nh = arg.match;
        } else {
                fib6_nh = from->fib6_nh;
        }
        fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
unlock:
        rcu_read_unlock();
}

/* Decide whether the PMTU of @rt may be replaced by the new MTU @mtu.
 *
 * A new MTU that does not exceed the current route PMTU becomes the lowest
 * MTU in the path, so lowering is always allowed.
 *
 * Raising is allowed only when the route PMTU equals the local MTU
 * (idev->cnf.mtu6): the old local MTU was then the lowest in the path, and
 * if other nodes now have lower MTUs, PMTU discovery will handle it.
 */
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
                                         struct rt6_info *rt, int mtu)
{
        return dst_mtu(&rt->dst) >= mtu ||
               dst_mtu(&rt->dst) == idev->cnf.mtu6;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
                                       const struct fib6_nh *nh, int mtu)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        int i;

        bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
        if (!bucket)
                return;

        for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
                        struct rt6_info *entry = rt6_ex->rt6i;

                        /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
                         * route), the metrics of its rt->from have already
                         * been updated.
                         */
                        if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
                            rt6_mtu_change_route_allowed(idev, entry, mtu))
                                dst_metric_set(&entry->dst, RTAX_MTU, mtu);
                }
                bucket++;
        }
}

/* Flag combination identifying a cached (RTF_CACHE) route that goes through
 * a gateway (RTF_GATEWAY); both bits must be set.
 */
#define RTF_CACHE_GATEWAY        (RTF_GATEWAY | RTF_CACHE)

static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
                                            const struct in6_addr *gateway)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct hlist_node *tmp;
        int i;

        if (!rcu_access_pointer(nh->rt6i_exception_bucket))
                return;

        spin_lock_bh(&rt6_exception_lock);
        bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
        if (bucket) {
                for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                        hlist_for_each_entry_safe(rt6_ex, tmp,
                                                  &bucket->chain, hlist) {
                                struct rt6_info *entry = rt6_ex->rt6i;

                                if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
                                    RTF_CACHE_GATEWAY &&
                                    ipv6_addr_equal(gateway,
                                                    &entry->rt6i_gateway)) {
                                        rt6_remove_exception(bucket, rt6_ex);
                                }
                        }
                        bucket++;
                }
        }

        spin_unlock_bh(&rt6_exception_lock);
}

/* Garbage-collection check for a single exception entry.
 *
 * Aged-out and non-gateway exceptions are pruned and obsoleted even if
 * others still hold references to them, so that on the next dst_check()
 * such references can be dropped.  RTF_EXPIRES exceptions - e.g.
 * pmtu-generated ones - are pruned when expired, independently from their
 * aging, as per RFC 8201 section 4.
 *
 * An entry that survives all checks is counted in gc_args->more.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
                                      struct rt6_exception *rt6_ex,
                                      struct fib6_gc_args *gc_args,
                                      unsigned long now)
{
        struct rt6_info *rt = rt6_ex->rt6i;

        if (rt->rt6i_flags & RTF_EXPIRES) {
                if (time_after(jiffies, rt->dst.expires)) {
                        pr_debug("purging expired route %p\n", rt);
                        goto remove;
                }
        } else if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
                pr_debug("aging clone %p\n", rt);
                goto remove;
        }

        if (rt->rt6i_flags & RTF_GATEWAY) {
                struct neighbour *neigh;

                neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);

                /* the gateway must be a known neighbour flagged as router */
                if (!neigh || !(neigh->flags & NTF_ROUTER)) {
                        pr_debug("purging route %p via non-router but gateway\n",
                                 rt);
                        goto remove;
                }
        }

        gc_args->more++;
        return;

remove:
        rt6_remove_exception(bucket, rt6_ex);
}

static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
                                   struct fib6_gc_args *gc_args,
                                   unsigned long now)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct hlist_node *tmp;
        int i;

        if (!rcu_access_pointer(nh->rt6i_exception_bucket))
                return;

        rcu_read_lock_bh();
        spin_lock(&rt6_exception_lock);
        bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
        if (bucket) {
                for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                        hlist_for_each_entry_safe(rt6_ex, tmp,
                                                  &bucket->chain, hlist) {
                                rt6_age_examine_exception(bucket, rt6_ex,
                                                          gc_args, now);
                        }
                        bucket++;
                }
        }
        spin_unlock(&rt6_exception_lock);
        rcu_read_unlock_bh();
}

struct fib6_nh_age_excptn_arg {
        struct fib6_gc_args        *gc_args;
        unsigned long                now;
};

static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
{
        struct fib6_nh_age_excptn_arg *arg = _arg;

        fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
        return 0;
}

void rt6_age_exceptions(struct fib6_info *f6i,
                        struct fib6_gc_args *gc_args,
                        unsigned long now)
{
        if (f6i->nh) {
                struct fib6_nh_age_excptn_arg arg = {
                        .gc_args = gc_args,
                        .now = now
                };

                nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
                                         &arg);
        } else {
                fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
        }
}

/* Look up the flow @fl6 in @table and store the selected route in @res.
 *
 * Starting at the node found for the flow's addresses, a route is selected;
 * while the selection yields the null entry the lookup backtracks through
 * the tree.  Once backtracking is exhausted and RT6_LOOKUP_F_REACHABLE was
 * requested, the walk restarts from the first node with that flag cleared.
 *
 * Must be called with rcu lock held.  Always returns 0.
 */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
                      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
        struct fib6_node *start, *fn;

        start = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
        fn = start;

        for (;;) {
                rt6_select(net, fn, oif, res, strict);
                if (res->f6i != net->ipv6.fib6_null_entry)
                        break;

                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        continue;

                if (!(strict & RT6_LOOKUP_F_REACHABLE))
                        break;

                /* also consider unreachable route */
                strict &= ~RT6_LOOKUP_F_REACHABLE;
                fn = start;
        }

        trace_fib6_table_lookup(net, res, table, fl6);

        return 0;
}

/* Policy-routing lookup of the flow @fl6 in a single @table.
 *
 * The fib6 entry is looked up and a path selected, then the result is
 * resolved to a dst in this order: a cached exception route, an uncached
 * RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH on a nexthop without gateway), or a
 * per-cpu copy.  If none is available net->ipv6.ip6_null_entry is returned,
 * so the result is never NULL.
 *
 * Reference handling: unless RT6_LOOKUP_F_DST_NOREF is set in @flags,
 * ip6_hold_safe() is applied to the returned route.  The uncached clone
 * path always returns with the reference taken at allocation.
 * RT6_LOOKUP_F_DST_NOREF callers must already hold the RCU read lock
 * (checked with WARN_ON_ONCE).
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                               int oif, struct flowi6 *fl6,
                               const struct sk_buff *skb, int flags)
{
        struct fib6_result res = {};
        struct rt6_info *rt = NULL;
        int strict = 0;

        WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
                     !rcu_read_lock_held());

        /* carry over only the lookup flags that influence route selection */
        strict |= flags & RT6_LOOKUP_F_IFACE;
        strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
        /* with forwarding disabled, ask for reachable routes */
        if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
                strict |= RT6_LOOKUP_F_REACHABLE;

        rcu_read_lock();

        fib6_table_lookup(net, table, oif, fl6, &res, strict);
        if (res.f6i == net->ipv6.fib6_null_entry)
                goto out;

        fib6_select_path(net, &res, fl6, oif, false, skb, strict);

        /* Search through exception table */
        rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
        if (rt) {
                goto out;
        } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
                            !res.nh->fib_nh_gw_family)) {
                /* Create a RTF_CACHE clone which will not be
                 * owned by the fib6 tree.  It is for the special case where
                 * the daddr in the skb during the neighbor look-up is different
                 * from the fl6->daddr used to look-up route here.
                 */
                rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

                if (rt) {
                        /* 1 refcnt is taken during ip6_rt_cache_alloc().
                         * As rt6_uncached_list_add() does not consume refcnt,
                         * this refcnt is always returned to the caller even
                         * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
                         */
                        rt6_uncached_list_add(rt);
                        rcu_read_unlock();

                        return rt;
                }
                /* allocation failed: fall through to 'out' with rt == NULL */
        } else {
                /* Get a percpu copy */
                local_bh_disable();
                rt = rt6_get_pcpu_route(&res);

                if (!rt)
                        rt = rt6_make_pcpu_route(net, &res);

                local_bh_enable();
        }
out:
        /* never hand back NULL: substitute the per-netns null entry */
        if (!rt)
                rt = net->ipv6.ip6_null_entry;
        if (!(flags & RT6_LOOKUP_F_DST_NOREF))
                ip6_hold_safe(net, &rt);
        rcu_read_unlock();

        return rt;
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

/* Input-path lookup callback: delegates to ip6_pol_route() using the flow's
 * input interface (fl6->flowi6_iif) as the interface argument.  It is
 * passed to fib6_rule_lookup() by ip6_route_input_lookup().
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
                                            struct fib6_table *table,
                                            struct flowi6 *fl6,
                                            const struct sk_buff *skb,
                                            int flags)
{
        return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
                                         struct net_device *dev,
                                         struct flowi6 *fl6,
                                         const struct sk_buff *skb,
                                         int flags)
{
        if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
                flags |= RT6_LOOKUP_F_IFACE;

        return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

/* Fill the L3 multipath hash inputs of @keys: source and destination
 * address, flow label and protocol.
 *
 * For an ICMPv6 error message whose embedded IPv6 header can be read, the
 * values are taken from that embedded header and @flkeys is ignored.
 * Otherwise the pre-dissected @flkeys are copied when supplied, falling
 * back to the outer IPv6 header of @skb when @flkeys is NULL.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
                                  struct flow_keys *keys,
                                  struct flow_keys *flkeys)
{
        const struct ipv6hdr *key_iph = ipv6_hdr(skb);
        const struct ipv6hdr *inner_iph;
        const struct icmp6hdr *icmph;
        struct ipv6hdr _inner_iph;
        struct icmp6hdr _icmph;

        if (unlikely(key_iph->nexthdr == IPPROTO_ICMPV6)) {
                icmph = skb_header_pointer(skb, skb_transport_offset(skb),
                                           sizeof(_icmph), &_icmph);

                if (icmph && icmpv6_is_err(icmph->icmp6_type)) {
                        /* the offending packet's header follows the ICMPv6
                         * header
                         */
                        inner_iph = skb_header_pointer(skb,
                                        skb_transport_offset(skb) + sizeof(*icmph),
                                        sizeof(_inner_iph), &_inner_iph);
                        if (inner_iph) {
                                key_iph = inner_iph;
                                flkeys = NULL;
                        }
                }
        }

        if (!flkeys) {
                keys->addrs.v6addrs.src = key_iph->saddr;
                keys->addrs.v6addrs.dst = key_iph->daddr;
                keys->tags.flow_label = ip6_flowlabel(key_iph);
                keys->basic.ip_proto = key_iph->nexthdr;
                return;
        }

        keys->addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
        keys->addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
        keys->tags.flow_label = flkeys->tags.flow_label;
        keys->basic.ip_proto = flkeys->basic.ip_proto;
}

/* Hash the outer-header fields of @skb selected by the netns multipath hash
 * field mask.
 *
 * Returns 0 without touching *@p_has_inner when no outer field is selected.
 * Otherwise *@p_has_inner reports whether the dissector, stopped at the
 * first encapsulation, saw FLOW_DIS_ENCAPSULATION.
 */
static u32 rt6_multipath_custom_hash_outer(const struct net *net,
                                           const struct sk_buff *skb,
                                           bool *p_has_inner)
{
        const u32 fields = ip6_multipath_hash_fields(net);
        struct flow_keys dissected, picked;

        if ((fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK) == 0)
                return 0;

        skb_flow_dissect_flow_keys(skb, &dissected,
                                   FLOW_DISSECTOR_F_STOP_AT_ENCAP);
        *p_has_inner = !!(dissected.control.flags & FLOW_DIS_ENCAPSULATION);

        /* only the selected fields are copied; the rest stays zero */
        memset(&picked, 0, sizeof(picked));
        picked.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;

        if (fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
                picked.addrs.v6addrs.src = dissected.addrs.v6addrs.src;
        if (fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
                picked.addrs.v6addrs.dst = dissected.addrs.v6addrs.dst;
        if (fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
                picked.basic.ip_proto = dissected.basic.ip_proto;
        if (fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
                picked.tags.flow_label = dissected.tags.flow_label;
        if (fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
                picked.ports.src = dissected.ports.src;
        if (fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
                picked.ports.dst = dissected.ports.dst;

        return fib_multipath_hash_from_keys(net, &picked);
}

/* Hash the inner-header fields of @skb selected by the netns multipath hash
 * field mask.
 *
 * Returns 0 when @has_inner is false, when no inner field is selected, or
 * when the full dissection does not report an encapsulation.
 */
static u32 rt6_multipath_custom_hash_inner(const struct net *net,
                                           const struct sk_buff *skb,
                                           bool has_inner)
{
        const u32 fields = ip6_multipath_hash_fields(net);
        struct flow_keys dissected, picked;

        /* We assume the packet carries an encapsulation, but if none was
         * encountered during dissection of the outer flow, then there is no
         * point in calling the flow dissector again.
         */
        if (!has_inner)
                return 0;

        if ((fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK) == 0)
                return 0;

        skb_flow_dissect_flow_keys(skb, &dissected, 0);
        if (!(dissected.control.flags & FLOW_DIS_ENCAPSULATION))
                return 0;

        /* only the selected fields are copied; the rest stays zero */
        memset(&picked, 0, sizeof(picked));

        switch (dissected.control.addr_type) {
        case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
                picked.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
                        picked.addrs.v4addrs.src = dissected.addrs.v4addrs.src;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
                        picked.addrs.v4addrs.dst = dissected.addrs.v4addrs.dst;
                break;
        case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
                picked.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
                        picked.addrs.v6addrs.src = dissected.addrs.v6addrs.src;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
                        picked.addrs.v6addrs.dst = dissected.addrs.v6addrs.dst;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
                        picked.tags.flow_label = dissected.tags.flow_label;
                break;
        default:
                /* other address types contribute no address fields */
                break;
        }

        if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
                picked.basic.ip_proto = dissected.basic.ip_proto;
        if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
                picked.ports.src = dissected.ports.src;
        if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
                picked.ports.dst = dissected.ports.dst;

        return fib_multipath_hash_from_keys(net, &picked);
}

/* Custom-policy multipath hash for a packet: combine the hash of the
 * selected outer fields with the hash of the selected inner fields.
 */
static u32 rt6_multipath_custom_hash_skb(const struct net *net,
                                         const struct sk_buff *skb)
{
        /* stays true when the outer pass returns early without dissecting */
        bool encap_seen = true;
        u32 outer_hash;
        u32 inner_hash;

        outer_hash = rt6_multipath_custom_hash_outer(net, skb, &encap_seen);
        inner_hash = rt6_multipath_custom_hash_inner(net, skb, encap_seen);

        return jhash_2words(outer_hash, inner_hash, 0);
}

static u32 rt6_multipath_custom_hash_fl6(const struct net *net,
                                         const struct flowi6 *fl6)
{
        u32 hash_fields = ip6_multipath_hash_fields(net);
        struct flow_keys hash_keys;

        if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
                return 0;

        memset(&hash_keys, 0, sizeof(hash_keys));
        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
                hash_keys.addrs.v6addrs.src = fl6->saddr;
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
                hash_keys.addrs.v6addrs.dst = fl6->daddr;
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
                hash_keys.basic.ip_proto = fl6->flowi6_proto;
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
                hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
                hash_keys.ports.src = fl6->fl6_sport;
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
                hash_keys.ports.dst = fl6->fl6_dport;

        return fib_multipath_hash_from_keys(net, &hash_keys);
}

/* Compute the multipath hash for a packet or flow according to the netns
 * hash policy returned by ip6_multipath_hash_policy():
 *
 *   0 - L3: source/destination address, flow label and protocol
 *   1 - L4: source/destination address, ports and protocol; a packet that
 *       already carries an L4 hash reuses it
 *   2 - L3 of the inner packet (v4 or v6) when the dissector reports one,
 *       otherwise the same as policy 0
 *   3 - custom: fields selected by the netns hash field mask
 *
 * if skb is set it will be used and fl6 can be NULL.  @flkeys, when not
 * NULL, supplies already dissected keys for @skb.
 *
 * The value returned is the computed hash shifted right by one bit; any
 * policy value not listed above yields 0.
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
                       const struct sk_buff *skb, struct flow_keys *flkeys)
{
        struct flow_keys hash_keys;
        u32 mhash = 0;

        switch (ip6_multipath_hash_policy(net)) {
        case 0:
                memset(&hash_keys, 0, sizeof(hash_keys));
                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                if (skb) {
                        ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
                } else {
                        hash_keys.addrs.v6addrs.src = fl6->saddr;
                        hash_keys.addrs.v6addrs.dst = fl6->daddr;
                        hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
                        hash_keys.basic.ip_proto = fl6->flowi6_proto;
                }
                mhash = fib_multipath_hash_from_keys(net, &hash_keys);
                break;
        case 1:
                if (skb) {
                        unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
                        struct flow_keys keys;

                        /* short-circuit if we already have L4 hash present */
                        if (skb->l4_hash)
                                return skb_get_hash_raw(skb) >> 1;

                        memset(&hash_keys, 0, sizeof(hash_keys));

                        /* dissect only when the caller did not do it already */
                        if (!flkeys) {
                                skb_flow_dissect_flow_keys(skb, &keys, flag);
                                flkeys = &keys;
                        }
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                        hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
                        hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
                        hash_keys.ports.src = flkeys->ports.src;
                        hash_keys.ports.dst = flkeys->ports.dst;
                        hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
                } else {
                        memset(&hash_keys, 0, sizeof(hash_keys));
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                        hash_keys.addrs.v6addrs.src = fl6->saddr;
                        hash_keys.addrs.v6addrs.dst = fl6->daddr;
                        hash_keys.ports.src = fl6->fl6_sport;
                        hash_keys.ports.dst = fl6->fl6_dport;
                        hash_keys.basic.ip_proto = fl6->flowi6_proto;
                }
                mhash = fib_multipath_hash_from_keys(net, &hash_keys);
                break;
        case 2:
                memset(&hash_keys, 0, sizeof(hash_keys));
                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                if (skb) {
                        struct flow_keys keys;

                        /* full dissection (flags == 0), unlike policy 1 which
                         * stops at the first encapsulation
                         */
                        if (!flkeys) {
                                skb_flow_dissect_flow_keys(skb, &keys, 0);
                                flkeys = &keys;
                        }

                        /* Inner can be v4 or v6 */
                        if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                                hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
                                hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
                        } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                                hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
                                hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
                                hash_keys.tags.flow_label = flkeys->tags.flow_label;
                                hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
                        } else {
                                /* Same as case 0 */
                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                                ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
                        }
                } else {
                        /* Same as case 0 */
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                        hash_keys.addrs.v6addrs.src = fl6->saddr;
                        hash_keys.addrs.v6addrs.dst = fl6->daddr;
                        hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
                        hash_keys.basic.ip_proto = fl6->flowi6_proto;
                }
                mhash = fib_multipath_hash_from_keys(net, &hash_keys);
                break;
        case 3:
                if (skb)
                        mhash = rt6_multipath_custom_hash_skb(net, skb);
                else
                        mhash = rt6_multipath_custom_hash_fl6(net, fl6);
                break;
        }

        /* no default case: an unlisted policy leaves mhash at 0 */
        return mhash >> 1;
}

/* Called with rcu held */
void ip6_route_input(struct sk_buff *skb)
{
        const struct ipv6hdr *iph = ipv6_hdr(skb);
        struct net *net = dev_net(skb->dev);
        int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
        struct ip_tunnel_info *tun_info;
        struct flowi6 fl6 = {
                .flowi6_iif = skb->dev->ifindex,
                .daddr = iph->daddr,
                .saddr = iph->saddr,
                .flowlabel = ip6_flowinfo(iph),
                .flowi6_mark = skb->mark,
                .flowi6_proto = iph->nexthdr,
        };
        struct flow_keys *flkeys = NULL, _flkeys;

        tun_info = skb_tunnel_info(skb);
        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
                fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

        if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
                flkeys = &_flkeys;

        if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
                fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
        skb_dst_drop(skb);
        skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
                                                      &fl6, skb, flags));
}

/* Output-path lookup callback: delegates to ip6_pol_route() using the flow's
 * output interface (fl6->flowi6_oif) as the interface argument.  It is
 * passed to fib6_rule_lookup() by ip6_route_output_flags_noref().
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags)
{
        return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

/* Output route lookup that always sets RT6_LOOKUP_F_DST_NOREF on the lookup
 * flags.
 *
 * Multicast and link-local destinations are first offered to
 * l3mdev_link_scope_lookup().  Otherwise the flow's input interface is set
 * to the loopback ifindex, the lookup flags are derived from @sk and @fl6,
 * and fib6_rule_lookup() is run with ip6_pol_route_output().
 */
static struct dst_entry *ip6_route_output_flags_noref(struct net *net,
                                                      const struct sock *sk,
                                                      struct flowi6 *fl6,
                                                      int flags)
{
        const int scoped = IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL;
        struct dst_entry *dst;
        bool any_src;

        if (ipv6_addr_type(&fl6->daddr) & scoped) {
                /* This function does not take refcnt on the dst */
                dst = l3mdev_link_scope_lookup(net, fl6);
                if (dst)
                        return dst;
        }

        fl6->flowi6_iif = LOOPBACK_IFINDEX;
        any_src = ipv6_addr_any(&fl6->saddr);

        flags |= RT6_LOOKUP_F_DST_NOREF;

        /* bind the lookup to an interface for bound sockets, destinations
         * needing strict matching, or an explicit oif without source address
         */
        if ((sk && sk->sk_bound_dev_if) ||
            rt6_need_strict(&fl6->daddr) ||
            (fl6->flowi6_oif && any_src))
                flags |= RT6_LOOKUP_F_IFACE;

        if (any_src) {
                /* no source chosen yet: apply the socket's preferences */
                if (sk)
                        flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs));
        } else {
                flags |= RT6_LOOKUP_F_HAS_SADDR;
        }

        return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}

/* Output route lookup returning a dst the caller owns a reference on.
 *
 * The lookup itself runs without taking a reference inside an RCU read-side
 * section; the reference is acquired afterwards.  If that fails, the
 * per-netns null entry is returned (with a reference) instead.
 */
struct dst_entry *ip6_route_output_flags(struct net *net,
                                         const struct sock *sk,
                                         struct flowi6 *fl6,
                                         int flags)
{
        struct rt6_info *found;
        struct dst_entry *dst;

        rcu_read_lock();

        dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
        found = dst_rt6_info(dst);

        /* For dst cached in uncached_list, refcnt is already taken. */
        if (list_empty(&found->dst.rt_uncached)) {
                if (!dst_hold_safe(dst)) {
                        dst = &net->ipv6.ip6_null_entry->dst;
                        dst_hold(dst);
                }
        }

        rcu_read_unlock();

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

/* Build a blackhole copy of @dst_orig: a new dst on the loopback device
 * whose input and output handlers discard packets, carrying over the
 * metrics, gateway, flags (minus RTF_PCPU) and keys of the original route.
 *
 * The reference on @dst_orig is released in every case.  Returns the new
 * dst, or ERR_PTR(-ENOMEM) when the allocation fails.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
        struct rt6_info *src = dst_rt6_info(dst_orig);
        struct net_device *lo = net->loopback_dev;
        struct dst_entry *blackhole = NULL;
        struct rt6_info *rt;

        rt = dst_alloc(&ip6_dst_blackhole_ops, lo, DST_OBSOLETE_DEAD, 0);
        if (!rt)
                goto release;

        rt6_info_init(rt);
        atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

        blackhole = &rt->dst;
        blackhole->__use = 1;
        blackhole->input = dst_discard;
        blackhole->output = dst_discard_out;

        dst_copy_metrics(blackhole, &src->dst);

        rt->rt6i_idev = in6_dev_get(lo);
        rt->rt6i_gateway = src->rt6i_gateway;
        rt->rt6i_flags = src->rt6i_flags & ~RTF_PCPU;

        memcpy(&rt->rt6i_dst, &src->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
        memcpy(&rt->rt6i_src, &src->rt6i_src, sizeof(struct rt6key));
#endif

release:
        dst_release(dst_orig);

        if (!blackhole)
                return ERR_PTR(-ENOMEM);

        return blackhole;
}

/*
 *        Destination cache support functions
 */

/* Validate the fib6 entry @f6i against @cookie.
 *
 * Returns true only when the entry's cookie can be read, equals @cookie,
 * and the entry has not expired.
 */
static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
        u32 current_cookie = 0;

        if (!fib6_get_cookie_safe(f6i, &current_cookie))
                return false;

        if (current_cookie != cookie)
                return false;

        return !fib6_check_expired(f6i);
}

/* Validate route @rt against @cookie using its parent entry @from.
 *
 * Returns &rt->dst when @from is set, its cookie can be read and equals
 * @cookie, and @rt has not expired; NULL otherwise.
 */
static struct dst_entry *rt6_check(struct rt6_info *rt,
                                   struct fib6_info *from,
                                   u32 cookie)
{
        u32 current_cookie = 0;

        if (!from)
                return NULL;

        if (!fib6_get_cookie_safe(from, &current_cookie) ||
            current_cookie != cookie)
                return NULL;

        return rt6_check_expired(rt) ? NULL : &rt->dst;
}

/* Validate route @rt through its parent entry @from.
 *
 * Returns &rt->dst when @rt itself has not expired, is still marked
 * DST_OBSOLETE_FORCE_CHK, and @from passes fib6_check() for @cookie;
 * NULL otherwise.
 */
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
                                            struct fib6_info *from,
                                            u32 cookie)
{
        if (__rt6_check_expired(rt))
                return NULL;

        if (rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK)
                return NULL;

        if (!fib6_check(from, cookie))
                return NULL;

        return &rt->dst;
}

/* Check whether the cached @dst is still valid for @cookie.
 *
 * Routes with a non-zero sernum are validated by rt6_is_valid() alone.
 * Otherwise the parent fib6 entry is consulted under the RCU read lock:
 * per-cpu and uncached routes that have a parent go through
 * rt6_dst_from_check(), everything else through rt6_check().
 *
 * Returns the dst when it is still valid, NULL otherwise.
 */
INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
                                                        u32 cookie)
{
        struct rt6_info *rt = dst_rt6_info(dst);
        struct dst_entry *valid;
        struct fib6_info *from;
        bool check_via_from;

        if (rt->sernum) {
                if (rt6_is_valid(rt))
                        return dst;
                return NULL;
        }

        rcu_read_lock();

        /* All IPV6 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         */

        from = rcu_dereference(rt->from);

        check_via_from = from &&
                         ((rt->rt6i_flags & RTF_PCPU) ||
                          unlikely(!list_empty(&rt->dst.rt_uncached)));

        if (check_via_from)
                valid = rt6_dst_from_check(rt, from, cookie);
        else
                valid = rt6_check(rt, from, cookie);

        rcu_read_unlock();

        return valid;
}
EXPORT_INDIRECT_CALLABLE(ip6_dst_check);

/* Negative-advice handler: drop the socket's cached dst.
 *
 * For RTF_CACHE routes the socket dst is only reset when the route has
 * expired, in which case the exception entry is removed as well.
 * For every other route the socket dst is reset unconditionally.
 */
static void ip6_negative_advice(struct sock *sk,
                                struct dst_entry *dst)
{
        struct rt6_info *rt = dst_rt6_info(dst);

        if (!(rt->rt6i_flags & RTF_CACHE)) {
                sk_dst_reset(sk);
                return;
        }

        rcu_read_lock();
        if (rt6_check_expired(rt)) {
                /* counteract the dst_release() in sk_dst_reset() */
                dst_hold(dst);
                sk_dst_reset(sk);

                rt6_remove_exception_rt(rt);
        }
        rcu_read_unlock();
}

/* Link-failure handler: report "address unreachable" to the sender and
 * invalidate the route attached to @skb.
 *
 * A cached (RTF_CACHE) route has its exception entry removed. For any
 * other route flagged RTF_DEFAULT, the sernum of the fib6 node holding
 * its parent entry is overwritten with -1.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
        struct fib6_info *from;
        struct fib6_node *fn;
        struct rt6_info *rt;

        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

        rt = dst_rt6_info(skb_dst(skb));
        if (!rt)
                return;

        rcu_read_lock();

        if (rt->rt6i_flags & RTF_CACHE) {
                rt6_remove_exception_rt(rt);
                goto unlock;
        }

        from = rcu_dereference(rt->from);
        if (!from)
                goto unlock;

        fn = rcu_dereference(from->fib6_node);
        if (fn && (rt->rt6i_flags & RTF_DEFAULT))
                WRITE_ONCE(fn->fn_sernum, -1);

unlock:
        rcu_read_unlock();
}

static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
        if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
                struct fib6_info *from;

                rcu_read_lock();
                from = rcu_dereference(rt0->from);
                if (from)
                        rt0->dst.expires = from->expires;
                rcu_read_unlock();
        }

        dst_set_expires(&rt0->dst, timeout);
        rt0->rt6i_flags |= RTF_EXPIRES;
}

/* Store @mtu in the RTAX_MTU metric of @rt, flag the route RTF_MODIFIED and
 * (re)arm its expiry using the namespace's ip6_rt_mtu_expires sysctl.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct net *net = dev_net(dst->dev);

        dst_metric_set(dst, RTAX_MTU, mtu);
        rt->rt6i_flags |= RTF_MODIFIED;
        rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
        return !(rt->rt6i_flags & RTF_CACHE) &&
                (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
}

/* Apply a newly learned path MTU to @dst.
 *
 * @dst:           route the MTU applies to
 * @sk:            optional socket; supplies the addresses when @iph is NULL
 * @iph:           optional IPv6 header; preferred source of the addresses
 * @mtu:           new path MTU in host byte order
 * @confirm_neigh: when true, dst_confirm_neigh() is called for @dst
 *
 * An @mtu below IPV6_MIN_MTU, or not smaller than the current dst_mtu(),
 * is ignored. When rt6_cache_allowed_for_pmtu() is false the MTU is written
 * on the route itself; otherwise, provided a destination address is known,
 * a new cache route is allocated from the parent fib entry and inserted as
 * an exception.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
                                 const struct ipv6hdr *iph, u32 mtu,
                                 bool confirm_neigh)
{
        const struct in6_addr *daddr, *saddr;
        struct rt6_info *rt6 = dst_rt6_info(dst);

        /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
         * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
         * [see also comment in rt6_mtu_change_route()]
         */

        /* Pick the address pair: packet header first, then socket, else none. */
        if (iph) {
                daddr = &iph->daddr;
                saddr = &iph->saddr;
        } else if (sk) {
                daddr = &sk->sk_v6_daddr;
                saddr = &inet6_sk(sk)->saddr;
        } else {
                daddr = NULL;
                saddr = NULL;
        }

        if (confirm_neigh)
                dst_confirm_neigh(dst, daddr);

        /* Only accept values in [IPV6_MIN_MTU, current MTU). */
        if (mtu < IPV6_MIN_MTU)
                return;
        if (mtu >= dst_mtu(dst))
                return;

        if (!rt6_cache_allowed_for_pmtu(rt6)) {
                /* Update the route in place. */
                rt6_do_update_pmtu(rt6, mtu);
                /* update rt6_ex->stamp for cache */
                if (rt6->rt6i_flags & RTF_CACHE)
                        rt6_update_exception_stamp_rt(rt6);
        } else if (daddr) {
                struct fib6_result res = {};
                struct rt6_info *nrt6;

                rcu_read_lock();
                res.f6i = rcu_dereference(rt6->from);
                if (!res.f6i)
                        goto out_unlock;

                res.fib6_flags = res.f6i->fib6_flags;
                res.fib6_type = res.f6i->fib6_type;

                if (res.f6i->nh) {
                        /* Find the fib6_nh matching this dst's device and gateway. */
                        struct fib6_nh_match_arg arg = {
                                .dev = dst->dev,
                                .gw = &rt6->rt6i_gateway,
                        };

                        nexthop_for_each_fib6_nh(res.f6i->nh,
                                                 fib6_nh_find_match, &arg);

                        /* fib6_info uses a nexthop that does not have fib6_nh
                         * using the dst->dev + gw. Should be impossible.
                         */
                        if (!arg.match)
                                goto out_unlock;

                        res.nh = arg.match;
                } else {
                        res.nh = res.f6i->fib6_nh;
                }

                nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
                if (nrt6) {
                        rt6_do_update_pmtu(nrt6, mtu);
                        /* Non-zero return from rt6_insert_exception():
                         * release the new route right away.
                         */
                        if (rt6_insert_exception(nrt6, &res))
                                dst_release_immediate(&nrt6->dst);
                }
out_unlock:
                rcu_read_unlock();
        }
}

/* Wrapper around __ip6_rt_update_pmtu() that takes the IPv6 header from
 * @skb when one is supplied and passes NULL otherwise.
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                               struct sk_buff *skb, u32 mtu,
                               bool confirm_neigh)
{
        const struct ipv6hdr *iph = NULL;

        if (skb)
                iph = ipv6_hdr(skb);

        __ip6_rt_update_pmtu(dst, sk, iph, mtu, confirm_neigh);
}

/* Update the path MTU for the flow described by the IPv6 header found at
 * skb->data.
 *
 * @mtu is in network byte order. The flow is routed with
 * ip6_route_output(); the MTU is applied only when the resulting dst
 * carries no error. When @mark is zero, IP6_REPLY_MARK() is used instead.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
                     int oif, u32 mark, kuid_t uid)
{
        const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
        struct flowi6 fl6 = {};
        struct dst_entry *dst;

        fl6.flowi6_oif = oif;
        fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
        fl6.daddr = iph->daddr;
        fl6.saddr = iph->saddr;
        fl6.flowlabel = ip6_flowinfo(iph);
        fl6.flowi6_uid = uid;

        dst = ip6_route_output(net, NULL, &fl6);
        if (dst->error == 0)
                __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
        dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

/* Socket flavour of ip6_update_pmtu().
 *
 * The output interface is the socket's bound device or, failing that, the
 * l3mdev master of skb->dev. After the PMTU update, if the socket's cached
 * dst is obsolete and fails its ->check(), the datagram dst is refreshed,
 * unless the socket is owned by user context or its peer address is
 * IPv4-mapped.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
        struct dst_entry *cached;
        int oif = sk->sk_bound_dev_if;

        if (!oif && skb->dev)
                oif = l3mdev_master_ifindex(skb->dev);

        ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark),
                        sk->sk_uid);

        cached = __sk_dst_get(sk);
        if (!cached)
                return;
        if (!cached->obsolete)
                return;
        /* Still valid: nothing to refresh. */
        if (cached->ops->check(cached, inet6_sk(sk)->dst_cookie))
                return;

        bh_lock_sock(sk);
        if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
                ip6_datagram_dst_update(sk, false);
        bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

/* Store @dst on @sk via ip6_dst_store().
 *
 * The socket's destination address is passed along only when it equals
 * the flow's destination; likewise for the source address, which is only
 * considered when CONFIG_IPV6_SUBTREES is enabled. NULL is passed otherwise.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
                           const struct flowi6 *fl6)
{
        const struct in6_addr *daddr = NULL;
        const struct in6_addr *saddr = NULL;
#ifdef CONFIG_IPV6_SUBTREES
        struct ipv6_pinfo *np = inet6_sk(sk);
#endif

        if (ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr))
                daddr = &sk->sk_v6_daddr;

#ifdef CONFIG_IPV6_SUBTREES
        if (ipv6_addr_equal(&fl6->saddr, &np->saddr))
                saddr = &np->saddr;
#endif

        ip6_dst_store(sk, dst, daddr, saddr);
}

/* Decide whether the nexthop in @res->nh is the router identified by @gw.
 *
 * The nexthop must not be dead, must have a gateway and must use the
 * interface fl6->flowi6_oif. It matches either directly through its own
 * gateway address or through a cached exception route whose gateway
 * equals @gw; in the latter case that route is returned through @ret.
 */
static bool ip6_redirect_nh_match(const struct fib6_result *res,
                                  struct flowi6 *fl6,
                                  const struct in6_addr *gw,
                                  struct rt6_info **ret)
{
        const struct fib6_nh *nh = res->nh;
        struct rt6_info *cached;

        if (nh->fib_nh_flags & RTNH_F_DEAD)
                return false;
        if (!nh->fib_nh_gw_family)
                return false;
        if (fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
                return false;

        if (ipv6_addr_equal(gw, &nh->fib_nh_gw6))
                return true;

        /* rt_cache's gateway might be different from its 'parent'
         * in the case of an ip redirect.
         * So we keep searching in the exception table if the gateway
         * is different.
         */
        cached = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
        if (!cached || !ipv6_addr_equal(gw, &cached->rt6i_gateway))
                return false;

        *ret = cached;
        return true;
}

/* Argument bundle passed to fib6_nh_redirect_match() through its void *
 * parameter; the fields are forwarded to ip6_redirect_nh_match().
 */
struct fib6_nh_rd_arg {
        /* lookup result; ->nh is overwritten for each nexthop tried */
        struct fib6_result        *res;
        /* flow being looked up */
        struct flowi6                *fl6;
        /* gateway address the nexthop is compared against */
        const struct in6_addr        *gw;
        /* receives a matching cached route, if one is found */
        struct rt6_info                **ret;
};

static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg)
{
        struct fib6_nh_rd_arg *arg = _arg;

        arg->res->nh = nh;
        return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret);
}

/* Handle redirects */

/* Flow key used for redirect lookups: a flowi6 followed by a gateway
 * address. The flowi6 must stay the first member because
 * __ip6_route_redirect() casts the struct flowi6 pointer it receives back
 * to a struct ip6rd_flowi pointer.
 */
struct ip6rd_flowi {
        struct flowi6 fl6;
        struct in6_addr gateway;
};

/* Table lookup callback used when processing a redirect.
 *
 * @fl6 is expected to be the first member of a struct ip6rd_flowi, whose
 * ->gateway field holds the address the nexthop is matched against.
 * @skb and @flags are not used by this function.
 *
 * Walks the routes of the fib6 node matching the flow, backtracking when
 * only the null entry is left, and looks for a nexthop accepted by
 * ip6_redirect_nh_match().
 *
 * Returns a cached exception route reported by ip6_redirect_nh_match(),
 * net->ipv6.ip6_null_entry when a reject route ends the walk, or a route
 * built by ip6_create_rt_rcu() from the final lookup result.
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags)
{
        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
        struct rt6_info *ret = NULL;
        struct fib6_result res = {};
        struct fib6_nh_rd_arg arg = {
                .res = &res,
                .fl6 = fl6,
                .gw  = &rdfl->gateway,
                .ret = &ret
        };
        struct fib6_info *rt;
        struct fib6_node *fn;

        /* Get the "current" route for this destination and
         * check if the redirect has come from appropriate router.
         *
         * RFC 4861 specifies that redirects should only be
         * accepted if they come from the nexthop to the target.
         * Due to the way the routes are chosen, this notion
         * is a bit fuzzy and one might need to check all possible
         * routes.
         */

        rcu_read_lock();
        fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        for_each_fib6_node_rt_rcu(fn) {
                res.f6i = rt;
                if (fib6_check_expired(rt))
                        continue;
                /* A reject route ends the walk; handled after the loop. */
                if (rt->fib6_flags & RTF_REJECT)
                        break;
                if (unlikely(rt->nh)) {
                        if (nexthop_is_blackhole(rt->nh))
                                continue;
                        /* on match, res->nh is filled in and potentially ret */
                        if (nexthop_for_each_fib6_nh(rt->nh,
                                                     fib6_nh_redirect_match,
                                                     &arg))
                                goto out;
                } else {
                        res.nh = rt->fib6_nh;
                        if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
                                                  &ret))
                                goto out;
                }
        }

        if (!rt)
                rt = net->ipv6.fib6_null_entry;
        else if (rt->fib6_flags & RTF_REJECT) {
                ret = net->ipv6.ip6_null_entry;
                goto out;
        }

        /* Nothing usable at this node: retry from the node fib6_backtrack() yields. */
        if (rt == net->ipv6.fib6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto restart;
        }

        res.f6i = rt;
        res.nh = rt->fib6_nh;
out:
        if (ret) {
                ip6_hold_safe(net, &ret);
        } else {
                res.fib6_flags = res.f6i->fib6_flags;
                res.fib6_type = res.f6i->fib6_type;
                ret = ip6_create_rt_rcu(&res);
        }

        rcu_read_unlock();

        trace_fib6_table_lookup(net, &res, table, fl6);
        return ret;
}

static struct dst_entry *ip6_route_redirect(struct net *net,
                                            const struct flowi6 *fl6,
                                            const struct sk_buff *skb,
                                            const struct in6_addr *gateway)
{
        int flags = RT6_LOOKUP_F_HAS_SADDR;
        struct ip6rd_flowi rdfl;

        rdfl.fl6 = *fl6;
        rdfl.gateway = *gateway;

        return fib6_rule_lookup(net, &rdfl.fl6, skb,
                                flags, __ip6_route_redirect);
}

/* Process a redirect for the flow described by the IPv6 header found at
 * skb->data. The gateway passed to the route lookup is the source address
 * of ipv6_hdr(skb).
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
                  kuid_t uid)
{
        const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
        struct flowi6 fl6 = {};
        struct dst_entry *dst;

        fl6.flowi6_iif = LOOPBACK_IFINDEX;
        fl6.flowi6_oif = oif;
        fl6.flowi6_mark = mark;
        fl6.daddr = iph->daddr;
        fl6.saddr = iph->saddr;
        fl6.flowlabel = ip6_flowinfo(iph);
        fl6.flowi6_uid = uid;

        dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
        rt6_do_redirect(dst, NULL, skb);
        dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

/* Process a redirect using only the redirect message itself: the flow's
 * destination is the message's dest field and its source is the
 * destination address of the packet that carried the redirect.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
        const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
        const struct ipv6hdr *iph = ipv6_hdr(skb);
        struct flowi6 fl6 = {};
        struct dst_entry *dst;

        fl6.flowi6_iif = LOOPBACK_IFINDEX;
        fl6.flowi6_oif = oif;
        fl6.daddr = msg->dest;
        fl6.saddr = iph->daddr;
        fl6.flowi6_uid = sock_net_uid(net, NULL);

        dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
        rt6_do_redirect(dst, NULL, skb);
        dst_release(dst);
}

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
                     READ_ONCE(sk->sk_mark), sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

/* Default advertised MSS for @dst: the route MTU minus the IPv6 and TCP
 * header sizes, raised to the ip6_rt_min_advmss sysctl when smaller, and
 * replaced by IPV6_MAXPLEN when it exceeds the largest non-jumbo MSS.
 */
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
        struct net_device *dev = dst->dev;
        unsigned int mss = dst_mtu(dst);
        struct net *net = dev_net(dev);

        mss -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

        if (mss < net->ipv6.sysctl.ip6_rt_min_advmss)
                mss = net->ipv6.sysctl.ip6_rt_min_advmss;

        /*
         * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
         * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
         * IPV6_MAXPLEN is also valid and means: "any MSS,
         * rely only on pmtu discovery"
         */
        return mss > IPV6_MAXPLEN - sizeof(struct tcphdr) ? IPV6_MAXPLEN : mss;
}

INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst)
{
        return ip6_dst_mtu_maybe_forward(dst, false);
}
EXPORT_INDIRECT_CALLABLE(ip6_mtu);

/* MTU selection, in order of preference:
 * 1. the route's own MTU, when that metric is locked and non-zero
 * 2. the MTU recorded on a matching nexthop exception
 * 3. the egress device's IPv6 MTU, never below IPV6_MIN_MTU
 *
 * Cases 2 and 3 are capped at IP6_MAX_MTU. The lwtunnel headroom of the
 * nexthop is subtracted from the result in every case.
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(const struct fib6_result *res,
                      const struct in6_addr *daddr,
                      const struct in6_addr *saddr)
{
        const struct fib6_nh *nh = res->nh;
        struct fib6_info *f6i = res->f6i;
        u32 mtu = 0;

        if (unlikely(fib6_metric_locked(f6i, RTAX_MTU)))
                mtu = f6i->fib6_pmtu;

        /* No locked, non-zero route MTU: derive one. */
        if (!mtu) {
                struct rt6_info *cached;

                cached = rt6_find_cached_rt(res, daddr, saddr);
                if (unlikely(cached)) {
                        mtu = dst_metric_raw(&cached->dst, RTAX_MTU);
                } else {
                        struct inet6_dev *idev = __in6_dev_get(nh->fib_nh_dev);

                        mtu = IPV6_MIN_MTU;
                        if (idev)
                                mtu = max_t(u32, mtu,
                                            READ_ONCE(idev->cnf.mtu6));
                }

                mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
        }

        return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}

/* Allocate a host route (/128) towards fl6->daddr on @dev and pass it
 * through xfrm_lookup().
 *
 * Returns the dst produced by xfrm_lookup(), ERR_PTR(-ENODEV) when @dev
 * has no inet6_dev, or ERR_PTR(-ENOMEM) when the route cannot be allocated.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
                                  struct flowi6 *fl6)
{
        struct inet6_dev *idev = in6_dev_get(dev);
        struct net *net = dev_net(dev);
        struct rt6_info *rt;

        if (unlikely(!idev))
                return ERR_PTR(-ENODEV);

        rt = ip6_dst_alloc(net, dev, 0);
        if (unlikely(!rt)) {
                /* Give back the reference taken by in6_dev_get(). */
                in6_dev_put(idev);
                return ERR_PTR(-ENOMEM);
        }

        rt->dst.input = ip6_input;
        rt->dst.output = ip6_output;
        rt->rt6i_gateway = fl6->daddr;
        rt->rt6i_dst.addr = fl6->daddr;
        rt->rt6i_dst.plen = 128;
        rt->rt6i_idev = idev;
        dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

        /* Add this dst into uncached_list so that rt6_disable_ip() can
         * do proper release of the net_device
         */
        rt6_uncached_list_add(rt);

        return xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
}

/* dst garbage-collection hook for the IPv6 dst_ops.
 *
 * A GC pass is run unless the last one happened less than
 * ip6_rt_gc_min_interval ago. Afterwards, if the entry count is below
 * gc_thresh, ip6_rt_gc_expire is reset to half of ip6_rt_gc_timeout.
 * In every case ip6_rt_gc_expire is then decayed by
 * (value >> ip6_rt_gc_elasticity).
 */
static void ip6_dst_gc(struct dst_ops *ops)
{
        struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
        int min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
        int elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
        int gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
        unsigned long last_gc = net->ipv6.ip6_rt_last_gc;
        unsigned int expire;
        int entries;

        if (!time_after(last_gc + min_interval, jiffies)) {
                fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire),
                            net, true);
                entries = dst_entries_get_slow(ops);
                if (entries < ops->gc_thresh)
                        atomic_set(&net->ipv6.ip6_rt_gc_expire,
                                   gc_timeout >> 1);
        }

        expire = atomic_read(&net->ipv6.ip6_rt_gc_expire);
        atomic_set(&net->ipv6.ip6_rt_gc_expire,
                   expire - (expire >> elasticity));
}

/* Look up @gw_addr in FIB table @tbid and store the outcome in @res.
 *
 * Returns -EINVAL when the table does not exist, otherwise the result of
 * fib6_table_lookup(). On success, a path is selected unless the lookup
 * resolved to the null entry.
 */
static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
                               const struct in6_addr *gw_addr, u32 tbid,
                               int flags, struct fib6_result *res)
{
        struct flowi6 fl6 = {
                .flowi6_oif = cfg->fc_ifindex,
                .daddr = *gw_addr,
                .saddr = cfg->fc_prefsrc,
        };
        struct fib6_table *table = fib6_get_table(net, tbid);
        int err;

        if (!table)
                return -EINVAL;

        flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
        if (!ipv6_addr_any(&cfg->fc_prefsrc))
                flags |= RT6_LOOKUP_F_HAS_SADDR;

        err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags);
        if (err)
                return err;

        if (res->f6i != net->ipv6.fib6_null_entry)
                fib6_select_path(net, res, &fl6, cfg->fc_ifindex,
                                 cfg->fc_ifindex != 0, NULL, flags);

        return 0;
}

/* Validate an ONLINK gateway against the table of @dev's l3mdev
 * (RT_TABLE_MAIN when there is none).
 *
 * A lookup error is returned unchanged. A successful lookup is rejected
 * with -EINVAL only when it hits a non-reject, non-default route that is
 * either not unicast or points at a different device than @dev.
 */
static int ip6_route_check_nh_onlink(struct net *net,
                                     struct fib6_config *cfg,
                                     const struct net_device *dev,
                                     struct netlink_ext_ack *extack)
{
        u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
        const struct in6_addr *gw_addr = &cfg->fc_gateway;
        struct fib6_result res = {};
        int err;

        err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
        if (err)
                return err;

        if (res.fib6_flags & RTF_REJECT)
                return 0;

        /* ignore match if it is the default route */
        if (ipv6_addr_any(&res.f6i->fib6_dst.addr))
                return 0;

        if (res.fib6_type == RTN_UNICAST && dev == res.nh->fib_nh_dev)
                return 0;

        NL_SET_ERR_MSG(extack,
                       "Nexthop has invalid gateway or device mismatch");
        return -EINVAL;
}

/* Check that the gateway cfg->fc_gateway is reachable without going through
 * another gateway.
 *
 * The lookup is tried in cfg->fc_table first (when set) and falls back to
 * fib6_lookup() if that does not yield an acceptable result.
 *
 * @_dev:        in/out egress device; when no device is given on entry it
 *               is set to the device of the resolved nexthop
 * @dev_tracker: tracker used for the reference taken on that device
 * @idev:        set from in6_dev_get() of the resolved device in that case
 *
 * Returns 0 on success or -EHOSTUNREACH.
 */
static int ip6_route_check_nh(struct net *net,
                              struct fib6_config *cfg,
                              struct net_device **_dev,
                              netdevice_tracker *dev_tracker,
                              struct inet6_dev **idev)
{
        const struct in6_addr *gw_addr = &cfg->fc_gateway;
        struct net_device *dev = _dev ? *_dev : NULL;
        int flags = RT6_LOOKUP_F_IFACE;
        struct fib6_result res = {};
        /* Pre-set so the fallback lookup runs when cfg->fc_table is 0. */
        int err = -EHOSTUNREACH;

        if (cfg->fc_table) {
                err = ip6_nh_lookup_table(net, cfg, gw_addr,
                                          cfg->fc_table, flags, &res);
                /* gw_addr can not require a gateway or resolve to a reject
                 * route. If a device is given, it must match the result.
                 */
                if (err || res.fib6_flags & RTF_REJECT ||
                    res.nh->fib_nh_gw_family ||
                    (dev && dev != res.nh->fib_nh_dev))
                        err = -EHOSTUNREACH;
        }

        /* Table lookup skipped or unsuitable: fall back to fib6_lookup(). */
        if (err < 0) {
                struct flowi6 fl6 = {
                        .flowi6_oif = cfg->fc_ifindex,
                        .daddr = *gw_addr,
                };

                err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags);
                if (err || res.fib6_flags & RTF_REJECT ||
                    res.nh->fib_nh_gw_family)
                        err = -EHOSTUNREACH;

                if (err)
                        return err;

                fib6_select_path(net, &res, &fl6, cfg->fc_ifindex,
                                 cfg->fc_ifindex != 0, NULL, flags);
        }

        err = 0;
        if (dev) {
                /* A caller-supplied device must match the resolved nexthop. */
                if (dev != res.nh->fib_nh_dev)
                        err = -EHOSTUNREACH;
        } else {
                /* Hand the resolved device back, holding dev and idev references. */
                *_dev = dev = res.nh->fib_nh_dev;
                netdev_hold(dev, dev_tracker, GFP_ATOMIC);
                *idev = in6_dev_get(dev);
        }

        return err;
}

/* Validate the gateway address cfg->fc_gateway of a route being added.
 *
 * The gateway must not be a local address. Unless it is a link-local
 * unicast address, it must be unicast or IPv4-mapped and pass either the
 * onlink check or the nexthop lookup, the latter of which may fill in
 * *_dev and *idev. The resulting egress device must exist and must not be
 * a loopback device.
 *
 * Returns 0 on success; otherwise -EINVAL or the error returned by the
 * nexthop checks, with a message set in @extack where one is available.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
                           struct net_device **_dev,
                           netdevice_tracker *dev_tracker,
                           struct inet6_dev **idev,
                           struct netlink_ext_ack *extack)
{
        const struct in6_addr *gw_addr = &cfg->fc_gateway;
        int gwa_type = ipv6_addr_type(gw_addr);
        /* true unless the gateway is link-local; passed on to
         * ipv6_chk_addr_and_flags() (exact meaning there not visible
         * here - TODO confirm)
         */
        bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
        const struct net_device *dev = *_dev;
        /* No device yet: defer the local-address check until one is resolved. */
        bool need_addr_check = !dev;
        int err = -EINVAL;

        /* if gw_addr is local we will fail to detect this in case
         * address is still TENTATIVE (DAD in progress). rt6_lookup()
         * will return already-added prefix route via interface that
         * prefix route was assigned to, which might be non-loopback.
         */
        if (dev &&
            ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
                NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
                goto out;
        }

        if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
                /* IPv6 strictly inhibits using not link-local
                 * addresses as nexthop address.
                 * Otherwise, router will not able to send redirects.
                 * It is very good, but in some (rare!) circumstances
                 * (SIT, PtP, NBMA NOARP links) it is handy to allow
                 * some exceptions. --ANK
                 * We allow IPv4-mapped nexthops to support RFC4798-type
                 * addressing
                 */
                if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
                        NL_SET_ERR_MSG(extack, "Invalid gateway address");
                        goto out;
                }

                /* Both nexthop checks run under the RCU read lock. */
                rcu_read_lock();

                if (cfg->fc_flags & RTNH_F_ONLINK)
                        err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
                else
                        err = ip6_route_check_nh(net, cfg, _dev, dev_tracker,
                                                 idev);

                rcu_read_unlock();

                if (err)
                        goto out;
        }

        /* reload in case device was changed */
        dev = *_dev;

        err = -EINVAL;
        if (!dev) {
                NL_SET_ERR_MSG(extack, "Egress device not specified");
                goto out;
        } else if (dev->flags & IFF_LOOPBACK) {
                NL_SET_ERR_MSG(extack,
                               "Egress device can not be loopback device for this route");
                goto out;
        }

        /* if we did not check gw_addr above, do so now that the
         * egress device has been resolved.
         */
        if (need_addr_check &&
            ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
                NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
                goto out;
        }

        err = 0;
out:
        return err;
}

/* Decide whether a route must be treated as a reject route: either
 * RTF_REJECT is set, or the route goes via a loopback device while its
 * destination is not a loopback address and it is neither anycast nor local.
 */
static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
{
        if (flags & RTF_REJECT)
                return true;

        if (!dev || !(dev->flags & IFF_LOOPBACK))
                return false;

        if (addr_type & IPV6_ADDR_LOOPBACK)
                return false;

        return !(flags & (RTF_ANYCAST | RTF_LOCAL));
}

/* Initialise @fib6_nh from the route configuration @cfg.
 *
 * For fdb nexthops only the gateway is recorded. Otherwise the egress
 * device is resolved (from cfg->fc_ifindex and/or gateway validation),
 * routes that fib6_is_reject() classifies as reject are bound to the
 * namespace's loopback device, and the per-cpu route pointer array is
 * allocated.
 *
 * Returns 0 on success. Errors set in this function are -ENODEV,
 * -ENETDOWN, -EACCES and -ENOMEM; errors from ip6_validate_gw() and
 * fib_nh_common_init() are passed through. On error the lwtunnel state
 * and the device reference are released.
 */
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
                 struct fib6_config *cfg, gfp_t gfp_flags,
                 struct netlink_ext_ack *extack)
{
        netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker;
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        int addr_type;
        int err;

        fib6_nh->fib_nh_family = AF_INET6;
#ifdef CONFIG_IPV6_ROUTER_PREF
        fib6_nh->last_probe = jiffies;
#endif
        /* fdb nexthop: record the gateway only; no device is resolved. */
        if (cfg->fc_is_fdb) {
                fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
                fib6_nh->fib_nh_gw_family = AF_INET6;
                return 0;
        }

        err = -ENODEV;
        if (cfg->fc_ifindex) {
                /* Takes a tracked device reference and an idev reference. */
                dev = netdev_get_by_index(net, cfg->fc_ifindex,
                                          dev_tracker, gfp_flags);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_flags & RTNH_F_ONLINK) {
                if (!dev) {
                        NL_SET_ERR_MSG(extack,
                                       "Nexthop device required for onlink");
                        goto out;
                }

                if (!(dev->flags & IFF_UP)) {
                        NL_SET_ERR_MSG(extack, "Nexthop device is not up");
                        err = -ENETDOWN;
                        goto out;
                }

                fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
        }

        fib6_nh->fib_nh_weight = 1;

        /* We cannot add true routes via loopback here,
         * they would result in kernel looping; promote them to reject routes
         */
        addr_type = ipv6_addr_type(&cfg->fc_dst);
        if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        if (dev) {
                                netdev_put(dev, dev_tracker);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        netdev_hold(dev, dev_tracker, gfp_flags);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                /* Reject routes skip gateway validation and the device checks. */
                goto pcpu_alloc;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                /* May fill in dev and idev when none was given. */
                err = ip6_validate_gw(net, cfg, &dev, dev_tracker,
                                      &idev, extack);
                if (err)
                        goto out;

                fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
                fib6_nh->fib_nh_gw_family = AF_INET6;
        }

        err = -ENODEV;
        if (!dev)
                goto out;

        if (!idev || idev->cnf.disable_ipv6) {
                NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
                err = -EACCES;
                goto out;
        }

        if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
                NL_SET_ERR_MSG(extack, "Nexthop device is not up");
                err = -ENETDOWN;
                goto out;
        }

        /* Non-local, non-anycast nexthops without carrier are flagged LINKDOWN. */
        if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
            !netif_carrier_ok(dev))
                fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;

        err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap,
                                 cfg->fc_encap_type, cfg, gfp_flags, extack);
        if (err)
                goto out;

pcpu_alloc:
        fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
        if (!fib6_nh->rt6i_pcpu) {
                err = -ENOMEM;
                goto out;
        }

        fib6_nh->fib_nh_dev = dev;
        fib6_nh->fib_nh_oif = dev->ifindex;
        err = 0;
out:
        /* The idev reference is dropped on both success and failure. */
        if (idev)
                in6_dev_put(idev);

        if (err) {
                lwtstate_put(fib6_nh->fib_nh_lws);
                fib6_nh->fib_nh_lws = NULL;
                netdev_put(dev, dev_tracker);
        }

        return err;
}

/* Tear down @fib6_nh: flush its exceptions and free the exception bucket,
 * release the per-cpu routes and their pointer array, then release the
 * common nexthop state.
 */
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
        struct rt6_exception_bucket *excptn_bucket;

        rcu_read_lock();

        fib6_nh_flush_exceptions(fib6_nh, NULL);

        excptn_bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
        if (excptn_bucket) {
                /* Clear the published pointer before freeing the bucket. */
                rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
                kfree(excptn_bucket);
        }

        rcu_read_unlock();

        fib6_nh_release_dsts(fib6_nh);
        free_percpu(fib6_nh->rt6i_pcpu);

        fib_nh_common_release(&fib6_nh->nh_common);
}

/* Release every per-cpu route cached on @fib6_nh. Each slot is atomically
 * swapped to NULL before the route found there is released. Does nothing
 * when the per-cpu array is not set.
 */
void fib6_nh_release_dsts(struct fib6_nh *fib6_nh)
{
        int cpu;

        if (!fib6_nh->rt6i_pcpu)
                return;

        for_each_possible_cpu(cpu) {
                struct rt6_info **slot = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
                struct rt6_info *old = xchg(slot, NULL);

                if (!old)
                        continue;

                dst_dev_put(&old->dst);
                dst_release(&old->dst);
        }
}

/* Validate @cfg and build (but do not insert) a new fib6_info from it.
 *
 * @cfg:       route description; fc_protocol is rewritten to RTPROT_BOOT
 *             when it was RTPROT_UNSPEC
 * @gfp_flags: allocation flags passed to fib6_info_alloc() / fib6_nh_init()
 * @extack:    netlink extended ack that receives a message on rejection
 *
 * Returns the new entry on success, or an ERR_PTR() on failure:
 * -EINVAL for a rejected configuration, -ENOBUFS when no table could be
 * obtained, -ENOMEM when fib6_info_alloc() fails, -ENOENT when a reference
 * on the requested nexthop object could not be taken, or whatever error
 * fib6_check_nexthop(), ip_fib_metrics_init() or fib6_nh_init() reported.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
                                              gfp_t gfp_flags,
                                              struct netlink_ext_ack *extack)
{
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct fib6_info *rt = NULL;
        struct nexthop *nh = NULL;
        struct fib6_table *table;
        struct fib6_nh *fib6_nh;
        int err = -EINVAL;
        int addr_type;

        /* RTF_PCPU is an internal flag; can not be set by userspace */
        if (cfg->fc_flags & RTF_PCPU) {
                NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
                goto out;
        }

        /* RTF_CACHE is an internal flag; can not be set by userspace */
        if (cfg->fc_flags & RTF_CACHE) {
                NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
                goto out;
        }

        if (cfg->fc_type > RTN_MAX) {
                NL_SET_ERR_MSG(extack, "Invalid route type");
                goto out;
        }

        if (cfg->fc_dst_len > 128) {
                NL_SET_ERR_MSG(extack, "Invalid prefix length");
                goto out;
        }
        if (cfg->fc_src_len > 128) {
                NL_SET_ERR_MSG(extack, "Invalid source address length");
                goto out;
        }
#ifndef CONFIG_IPV6_SUBTREES
        if (cfg->fc_src_len) {
                NL_SET_ERR_MSG(extack,
                               "Specifying source address requires IPV6_SUBTREES to be enabled");
                goto out;
        }
#endif
        /* Resolve and validate an explicitly requested nexthop object. */
        if (cfg->fc_nh_id) {
                nh = nexthop_find_by_id(net, cfg->fc_nh_id);
                if (!nh) {
                        NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
                        goto out;
                }
                err = fib6_check_nexthop(nh, cfg, extack);
                if (err)
                        goto out;
        }

        /* A netlink request lacking NLM_F_CREATE first tries the existing
         * table; if there is none it is still created, with a warning.
         */
        err = -ENOBUFS;
        if (cfg->fc_nlinfo.nlh &&
            !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
                table = fib6_get_table(net, cfg->fc_table);
                if (!table) {
                        pr_warn("NLM_F_CREATE should be specified when creating new route\n");
                        table = fib6_new_table(net, cfg->fc_table);
                }
        } else {
                table = fib6_new_table(net, cfg->fc_table);
        }

        if (!table)
                goto out;

        err = -ENOMEM;
        rt = fib6_info_alloc(gfp_flags, !nh);
        if (!rt)
                goto out;

        rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len,
                                               extack);
        if (IS_ERR(rt->fib6_metrics)) {
                err = PTR_ERR(rt->fib6_metrics);
                /* Do not leave garbage there. */
                rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
                goto out_free;
        }

        if (cfg->fc_flags & RTF_ADDRCONF)
                rt->dst_nocount = true;

        if (cfg->fc_flags & RTF_EXPIRES)
                fib6_set_expires(rt, jiffies +
                                clock_t_to_jiffies(cfg->fc_expires));

        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->fib6_protocol = cfg->fc_protocol;

        rt->fib6_table = table;
        rt->fib6_metric = cfg->fc_metric;
        rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
        rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;

        ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->fib6_dst.plen = cfg->fc_dst_len;

#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->fib6_src.plen = cfg->fc_src_len;
#endif
        if (nh) {
                /* err still holds -ENOMEM from the allocation step above;
                 * set a code that matches each rejection so callers are not
                 * told the request ran out of memory.
                 */
                if (rt->fib6_src.plen) {
                        NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
                        err = -EINVAL;
                        goto out_free;
                }
                if (!nexthop_get(nh)) {
                        NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
                        err = -ENOENT;
                        goto out_free;
                }
                rt->nh = nh;
                fib6_nh = nexthop_fib6_nh(rt->nh);
        } else {
                err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
                if (err)
                        goto out;

                fib6_nh = rt->fib6_nh;

                /* We cannot add true routes via loopback here, they would
                 * result in kernel looping; promote them to reject routes
                 */
                addr_type = ipv6_addr_type(&cfg->fc_dst);
                if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
                                   addr_type))
                        rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
        }

        /* A preferred source address is accepted only if ipv6_chk_addr()
         * finds it for the nexthop device.
         */
        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
                struct net_device *dev = fib6_nh->fib_nh_dev;

                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
                        NL_SET_ERR_MSG(extack, "Invalid source address");
                        err = -EINVAL;
                        goto out;
                }
                rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
                rt->fib6_prefsrc.plen = 128;
        } else
                rt->fib6_prefsrc.plen = 0;

        return rt;
out:
        /* rt may still be NULL here; it is handed to fib6_info_release()
         * unconditionally.
         */
        fib6_info_release(rt);
        return ERR_PTR(err);
out_free:
        /* Entry has no nexthop attached yet: drop metrics and free directly. */
        ip_fib_metrics_put(rt->fib6_metrics);
        kfree(rt);
        return ERR_PTR(err);
}

/* Create a fib6_info from @cfg and insert it via __ip6_ins_rt().
 *
 * Returns PTR_ERR() of a failed ip6_route_info_create(), otherwise the
 * result of __ip6_ins_rt(). The reference obtained from creation is
 * released before returning, whatever the insertion result was.
 */
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
                  struct netlink_ext_ack *extack)
{
        struct fib6_info *f6i;
        int ret;

        f6i = ip6_route_info_create(cfg, gfp_flags, extack);
        if (IS_ERR(f6i))
                return PTR_ERR(f6i);

        ret = __ip6_ins_rt(f6i, &cfg->fc_nlinfo, extack);

        fib6_info_release(f6i);
        return ret;
}

/* Delete @rt from its table under the table lock and release @rt.
 *
 * Returns -ENOENT when @rt is the namespace's fib6_null_entry, otherwise
 * the result of fib6_del(). fib6_info_release() is called on @rt in
 * both cases.
 */
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
        struct net *net = info->nl_net;
        int err = -ENOENT;

        if (rt != net->ipv6.fib6_null_entry) {
                struct fib6_table *table = rt->fib6_table;

                spin_lock_bh(&table->tb6_lock);
                err = fib6_del(rt, info);
                spin_unlock_bh(&table->tb6_lock);
        }

        fib6_info_release(rt);
        return err;
}

/* Delete @rt using a minimal nl_info that carries only @net and
 * @skip_notify; the result of __ip6_del_rt() is returned unchanged.
 */
int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify)
{
        struct nl_info nlinfo = {
                .skip_notify = skip_notify,
                .nl_net = net,
        };

        return __ip6_del_rt(rt, &nlinfo);
}

/* Delete @rt and, when @cfg->fc_delete_all_nh is set and @rt has siblings,
 * every sibling route as well.
 *
 * Returns -ENOENT when @rt is the namespace's fib6_null_entry, otherwise the
 * first non-zero fib6_del() result, or the result of deleting @rt itself.
 * fib6_info_release() is always called on @rt. If a combined RTM_DELROUTE
 * message was built, it is sent with rtnl_notify() after the table lock has
 * been dropped, including when a deletion failed part-way.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
        struct nl_info *info = &cfg->fc_nlinfo;
        struct net *net = info->nl_net;
        struct sk_buff *skb = NULL;
        struct fib6_table *table;
        int err = -ENOENT;

        if (rt == net->ipv6.fib6_null_entry)
                goto out_put;
        table = rt->fib6_table;
        spin_lock_bh(&table->tb6_lock);

        if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
                struct fib6_info *sibling, *next_sibling;
                struct fib6_node *fn;

                /* prefer to send a single notification with all hops */
                skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
                if (skb) {
                        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

                        /* On fill failure drop the skb and leave skip_notify
                         * untouched; on success set skip_notify so the
                         * fib6_del() calls below see it.
                         */
                        if (rt6_fill_node(net, skb, rt, NULL,
                                          NULL, NULL, 0, RTM_DELROUTE,
                                          info->portid, seq, 0) < 0) {
                                kfree_skb(skb);
                                skb = NULL;
                        } else
                                info->skip_notify = 1;
                }

                /* 'rt' points to the first sibling route. If it is not the
                 * leaf, then we do not need to send a notification. Otherwise,
                 * we need to check if the last sibling has a next route or not
                 * and emit a replace or delete notification, respectively.
                 */
                info->skip_notify_kernel = 1;
                fn = rcu_dereference_protected(rt->fib6_node,
                                            lockdep_is_held(&table->tb6_lock));
                if (rcu_access_pointer(fn->leaf) == rt) {
                        struct fib6_info *last_sibling, *replace_rt;

                        last_sibling = list_last_entry(&rt->fib6_siblings,
                                                       struct fib6_info,
                                                       fib6_siblings);
                        replace_rt = rcu_dereference_protected(
                                            last_sibling->fib6_next,
                                            lockdep_is_held(&table->tb6_lock));
                        if (replace_rt)
                                call_fib6_entry_notifiers_replace(net,
                                                                  replace_rt);
                        else
                                call_fib6_multipath_entry_notifiers(net,
                                                       FIB_EVENT_ENTRY_DEL,
                                                       rt, rt->fib6_nsiblings,
                                                       NULL);
                }
                /* _safe iteration: each fib6_del() may unlink the sibling. */
                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->fib6_siblings,
                                         fib6_siblings) {
                        err = fib6_del(sibling, info);
                        if (err)
                                goto out_unlock;
                }
        }

        err = fib6_del(rt, info);
out_unlock:
        spin_unlock_bh(&table->tb6_lock);
out_put:
        fib6_info_release(rt);

        /* Send the combined notification outside the table lock. */
        if (skb) {
                rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
                            info->nlh, gfp_any());
        }
        return err;
}

/* Remove the cached route @rt if it matches the selectors in @cfg.
 *
 * Returns -ESRCH when @cfg names an interface index or (with RTF_GATEWAY)
 * a gateway that differs from @rt's; otherwise returns the result of
 * rt6_remove_exception_rt().
 */
static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
        if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
                return -ESRCH;

        if ((cfg->fc_flags & RTF_GATEWAY) &&
            !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
                return -ESRCH;

        return rt6_remove_exception_rt(rt);
}

static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
                             struct fib6_nh *nh)
{
        struct fib6_result res = {
                .f6i = rt,
                .nh = nh,
        };
        struct rt6_info *rt_cache;

        rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
        if (rt_cache)
                return __ip6_del_cached_rt(rt_cache, cfg);

        return 0;
}

/* Context handed to fib6_nh_del_cached_rt() through the opaque callback
 * argument of nexthop_for_each_fib6_nh().
 */
struct fib6_nh_del_cached_rt_arg {
        struct fib6_config *cfg;        /* deletion request being processed */
        struct fib6_info *f6i;          /* fib entry whose cached routes are searched */
};

/* Per-nexthop callback: try to delete the cached route for @nh described
 * by the fib6_nh_del_cached_rt_arg in @_arg.
 *
 * -ESRCH from ip6_del_cached_rt() is mapped to 0; every other value is
 * returned unchanged.
 */
static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg)
{
        struct fib6_nh_del_cached_rt_arg *ctx = _arg;
        int ret = ip6_del_cached_rt(ctx->cfg, ctx->f6i, nh);

        if (ret == -ESRCH)
                return 0;

        return ret;
}

static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i)
{
        struct fib6_nh_del_cached_rt_arg arg = {
                .cfg = cfg,
                .f6i = f6i
        };

        return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg);
}

/* Delete the route (or cached exception route) described by @cfg.
 *
 * Returns -ESRCH when the table does not exist or nothing matched;
 * otherwise the result of the deletion helper that handled the first
 * matching entry. The node lookup and the route walk run under
 * rcu_read_lock(); the lock is dropped before any deletion helper that
 * takes a fib6_info is called, after a reference has been taken with
 * fib6_info_hold_safe().
 */
static int ip6_route_del(struct fib6_config *cfg,
                         struct netlink_ext_ack *extack)
{
        struct fib6_table *table;
        struct fib6_info *rt;
        struct fib6_node *fn;
        int err = -ESRCH;

        table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
        if (!table) {
                NL_SET_ERR_MSG(extack, "FIB table does not exist");
                return err;
        }

        rcu_read_lock();

        /* The last argument is true unless a cached route (RTF_CACHE) is
         * being deleted.
         * NOTE(review): presumably selects exact-prefix matching - confirm
         * against fib6_locate().
         */
        fn = fib6_locate(&table->tb6_root,
                         &cfg->fc_dst, cfg->fc_dst_len,
                         &cfg->fc_src, cfg->fc_src_len,
                         !(cfg->fc_flags & RTF_CACHE));

        if (fn) {
                /* rt is assigned by the iteration macro */
                for_each_fib6_node_rt_rcu(fn) {
                        struct fib6_nh *nh;

                        /* Nexthop-object route with a different id than requested */
                        if (rt->nh && cfg->fc_nh_id &&
                            rt->nh->id != cfg->fc_nh_id)
                                continue;

                        if (cfg->fc_flags & RTF_CACHE) {
                                int rc = 0;

                                if (rt->nh) {
                                        rc = ip6_del_cached_rt_nh(cfg, rt);
                                } else if (cfg->fc_nh_id) {
                                        continue;
                                } else {
                                        nh = rt->fib6_nh;
                                        rc = ip6_del_cached_rt(cfg, rt, nh);
                                }
                                /* Anything but -ESRCH ends the search */
                                if (rc != -ESRCH) {
                                        rcu_read_unlock();
                                        return rc;
                                }
                                continue;
                        }

                        /* A zero metric/protocol in cfg acts as a wildcard */
                        if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
                                continue;
                        if (cfg->fc_protocol &&
                            cfg->fc_protocol != rt->fib6_protocol)
                                continue;

                        if (rt->nh) {
                                if (!fib6_info_hold_safe(rt))
                                        continue;
                                rcu_read_unlock();

                                return __ip6_del_rt(rt, &cfg->fc_nlinfo);
                        }
                        /* Request names a nexthop id but this route has none */
                        if (cfg->fc_nh_id)
                                continue;

                        nh = rt->fib6_nh;
                        if (cfg->fc_ifindex &&
                            (!nh->fib_nh_dev ||
                             nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
                                continue;
                        if (cfg->fc_flags & RTF_GATEWAY &&
                            !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
                                continue;
                        if (!fib6_info_hold_safe(rt))
                                continue;
                        rcu_read_unlock();

                        /* if gateway was specified only delete the one hop */
                        if (cfg->fc_flags & RTF_GATEWAY)
                                return __ip6_del_rt(rt, &cfg->fc_nlinfo);

                        return __ip6_del_rt_siblings(rt, cfg);
                }
        }
        rcu_read_unlock();

        return err;
}

/* Process a received ICMPv6 Redirect carried in @skb for the route @dst.
 *
 * The message is validated (length, non-multicast destination, link-local
 * target unless target == destination, device not forwarding and accepting
 * redirects, well-formed ND options, route not a reject route). If accepted,
 * the neighbour entry for the target is updated and a new RTF_CACHE route
 * for the destination is inserted as an exception of the fib entry that
 * @dst was derived from; NETEVENT_REDIRECT is then raised.
 *
 * @sk is not used in this function. Nothing is returned; every failure is
 * a silent (or debug-logged) return.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct netevent_redirect netevent;
        struct rt6_info *rt, *nrt = NULL;
        struct fib6_result res = {};
        struct ndisc_options ndopts;
        struct inet6_dev *in6_dev;
        struct neighbour *neigh;
        struct rd_msg *msg;
        int optlen, on_link;
        u8 *lladdr;

        /* Bytes left for ND options once the fixed redirect header is removed */
        optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
        optlen -= sizeof(*msg);

        if (optlen < 0) {
                net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
                return;
        }

        msg = (struct rd_msg *)icmp6_hdr(skb);

        if (ipv6_addr_is_multicast(&msg->dest)) {
                net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
                return;
        }

        /* target == destination means the destination itself is on-link;
         * otherwise the target must be a link-local unicast address.
         */
        on_link = 0;
        if (ipv6_addr_equal(&msg->dest, &msg->target)) {
                on_link = 1;
        } else if (ipv6_addr_type(&msg->target) !=
                   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
                net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
                return;
        }

        in6_dev = __in6_dev_get(skb->dev);
        if (!in6_dev)
                return;
        /* Ignore redirects when forwarding or when they are not accepted */
        if (READ_ONCE(in6_dev->cnf.forwarding) ||
            !READ_ONCE(in6_dev->cnf.accept_redirects))
                return;

        /* RFC2461 8.1 (RFC 2461 has since been obsoleted by RFC 4861):
         *        The IP source address of the Redirect MUST be the same as the current
         *        first-hop router for the specified ICMP Destination Address.
         */

        if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
                net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
                return;
        }

        /* Optional target link-layer address option */
        lladdr = NULL;
        if (ndopts.nd_opts_tgt_lladdr) {
                lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
                                             skb->dev);
                if (!lladdr) {
                        net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
                        return;
                }
        }

        rt = dst_rt6_info(dst);
        if (rt->rt6i_flags & RTF_REJECT) {
                net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
                return;
        }

        /* Redirect received -> path was valid.
         * Look, redirects are sent only in response to data packets,
         * so that this nexthop apparently is reachable. --ANK
         */
        dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

        /* The reference obtained here is dropped at 'out' */
        neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
        if (!neigh)
                return;

        /*
         *        We have finally decided to accept it.
         */

        ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
                     NEIGH_UPDATE_F_WEAK_OVERRIDE|
                     NEIGH_UPDATE_F_OVERRIDE|
                     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
                                     NEIGH_UPDATE_F_ISROUTER)),
                     NDISC_REDIRECT, &ndopts);

        rcu_read_lock();
        /* fib entry this dst was created from; may already be gone */
        res.f6i = rcu_dereference(rt->from);
        if (!res.f6i)
                goto out;

        if (res.f6i->nh) {
                struct fib6_nh_match_arg arg = {
                        .dev = dst->dev,
                        .gw = &rt->rt6i_gateway,
                };

                nexthop_for_each_fib6_nh(res.f6i->nh,
                                         fib6_nh_find_match, &arg);

                /* fib6_info uses a nexthop that does not have fib6_nh
                 * using the dst->dev. Should be impossible
                 */
                if (!arg.match)
                        goto out;
                res.nh = arg.match;
        } else {
                res.nh = res.f6i->fib6_nh;
        }

        res.fib6_flags = res.f6i->fib6_flags;
        res.fib6_type = res.f6i->fib6_type;
        nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
        if (!nrt)
                goto out;

        /* An on-link destination gets no RTF_GATEWAY */
        nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
        if (on_link)
                nrt->rt6i_flags &= ~RTF_GATEWAY;

        nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

        /* rt6_insert_exception() will take care of duplicated exceptions */
        if (rt6_insert_exception(nrt, &res)) {
                dst_release_immediate(&nrt->dst);
                goto out;
        }

        netevent.old = &rt->dst;
        netevent.new = &nrt->dst;
        netevent.daddr = &msg->dest;
        netevent.neigh = neigh;
        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
        rcu_read_unlock();
        neigh_release(neigh);
}

#ifdef CONFIG_IPV6_ROUTE_INFO
/* Find the RTF_ROUTEINFO route for @prefix/@prefixlen via gateway @gwaddr
 * on @dev.
 *
 * The table is the device's l3mdev table if it has one, otherwise
 * RT6_TABLE_INFO. Routes using nexthop objects are ignored. Returns the
 * matching entry with a reference taken through fib6_info_hold_safe(),
 * or NULL when the table, the node or a matching route is missing.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev)
{
        u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
        int ifindex = dev->ifindex;
        struct fib6_info *rt = NULL;
        struct fib6_table *table;
        struct fib6_node *fn;

        table = fib6_get_table(net, tb_id);
        if (!table)
                return NULL;

        rcu_read_lock();
        fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
        if (fn) {
                for_each_fib6_node_rt_rcu(fn) {
                        /* these routes do not use nexthops */
                        if (rt->nh ||
                            rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
                                continue;
                        if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
                            !rt->fib6_nh->fib_nh_gw_family ||
                            !ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
                                continue;
                        /* First candidate whose reference can be taken wins */
                        if (fib6_info_hold_safe(rt))
                                break;
                }
        }
        rcu_read_unlock();

        return rt;
}

static struct fib6_info *rt6_add_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev,
                                           unsigned int pref)
{
        struct fib6_config cfg = {
                .fc_metric        = IP6_RT_PRIO_USER,
                .fc_ifindex        = dev->ifindex,
                .fc_dst_len        = prefixlen,
                .fc_flags        = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
                                  RTF_UP | RTF_PREF(pref),
                .fc_protocol = RTPROT_RA,
                .fc_type = RTN_UNICAST,
                .fc_nlinfo.portid = 0,
                .fc_nlinfo.nlh = NULL,
                .fc_nlinfo.nl_net = net,
        };

        cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
        cfg.fc_dst = *prefix;
        cfg.fc_gateway = *gwaddr;

        /* We should treat it as a default route if prefix length is 0. */
        if (!prefixlen)
                cfg.fc_flags |= RTF_DEFAULT;

        ip6_route_add(&cfg, GFP_ATOMIC, NULL);

        return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
}
#endif

/* Find the RA-installed default route (RTF_ADDRCONF and RTF_DEFAULT both
 * set) via gateway @addr on @dev.
 *
 * Searches the routes at the root of the device's l3mdev table, or of
 * RT6_TABLE_DFLT when there is none. Returns the entry with a reference
 * taken, or NULL when nothing matched or the reference could not be taken.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
                                     const struct in6_addr *addr,
                                     struct net_device *dev)
{
        u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
        struct fib6_table *table;
        struct fib6_info *rt;

        table = fib6_get_table(net, tb_id);
        if (!table)
                return NULL;

        rcu_read_lock();
        for_each_fib6_node_rt_rcu(&table->tb6_root) {
                struct fib6_nh *nh;

                /* RA routes do not use nexthops */
                if (rt->nh)
                        continue;

                nh = rt->fib6_nh;
                if (dev != nh->fib_nh_dev)
                        continue;
                if ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) !=
                    (RTF_ADDRCONF | RTF_DEFAULT))
                        continue;
                if (ipv6_addr_equal(&nh->fib_nh_gw6, addr))
                        break;
        }
        /* rt is NULL here when the walk finished without a match */
        if (rt && !fib6_info_hold_safe(rt))
                rt = NULL;
        rcu_read_unlock();

        return rt;
}

/* Install an expiring RA default route via @gwaddr on @dev and return the
 * entry found by rt6_get_dflt_router() afterwards.
 *
 * @pref:              router preference encoded into the flags
 * @defrtr_usr_metric: metric for the new route
 * @lifetime:          lifetime in seconds, converted to clock_t for fc_expires
 *
 * When the add succeeds, the table is flagged RT6_TABLE_HAS_DFLT_ROUTER.
 */
struct fib6_info *rt6_add_dflt_router(struct net *net,
                                     const struct in6_addr *gwaddr,
                                     struct net_device *dev,
                                     unsigned int pref,
                                     u32 defrtr_usr_metric,
                                     int lifetime)
{
        struct fib6_config cfg = {
                .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
                .fc_metric = defrtr_usr_metric,
                .fc_ifindex = dev->ifindex,
                .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
                            RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
                .fc_protocol = RTPROT_RA,
                .fc_type = RTN_UNICAST,
                .fc_nlinfo.nl_net = net,
                .fc_expires = jiffies_to_clock_t(lifetime * HZ),
                .fc_gateway = *gwaddr,
        };

        if (ip6_route_add(&cfg, GFP_ATOMIC, NULL) == 0) {
                struct fib6_table *table = fib6_get_table(dev_net(dev),
                                                          cfg.fc_table);

                if (table)
                        table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
        }

        return rt6_get_dflt_router(net, gwaddr, dev);
}

/* Delete from @table's root every route that has RTF_DEFAULT or
 * RTF_ADDRCONF set, unless its device's accept_ra setting is 2.
 *
 * ip6_del_rt() is called outside the RCU read section, so after each
 * deletion the walk starts over from the beginning. Once a full pass
 * deletes nothing, RT6_TABLE_HAS_DFLT_ROUTER is cleared on the table.
 */
static void __rt6_purge_dflt_routers(struct net *net,
                                     struct fib6_table *table)
{
        struct fib6_info *rt;

restart:
        rcu_read_lock();
        for_each_fib6_node_rt_rcu(&table->tb6_root) {
                struct net_device *dev = fib6_info_nh_dev(rt);
                struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

                if (!(rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF)))
                        continue;
                if (idev && idev->cnf.accept_ra == 2)
                        continue;
                if (!fib6_info_hold_safe(rt))
                        continue;

                rcu_read_unlock();
                ip6_del_rt(net, rt, false);
                goto restart;
        }
        rcu_read_unlock();

        table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}

/* Walk every FIB table of @net and purge those flagged
 * RT6_TABLE_HAS_DFLT_ROUTER via __rt6_purge_dflt_routers().
 */
void rt6_purge_dflt_routers(struct net *net)
{
        unsigned int bucket;

        rcu_read_lock();

        for (bucket = 0; bucket < FIB6_TABLE_HASHSZ; bucket++) {
                struct hlist_head *head = &net->ipv6.fib_table_hash[bucket];
                struct fib6_table *table;

                hlist_for_each_entry_rcu(table, head, tb6_hlist) {
                        if (!(table->flags & RT6_TABLE_HAS_DFLT_ROUTER))
                                continue;
                        __rt6_purge_dflt_routers(net, table);
                }
        }

        rcu_read_unlock();
}

static void rtmsg_to_fib6_config(struct net *net,
                                 struct in6_rtmsg *rtmsg,
                                 struct fib6_config *cfg)
{
        *cfg = (struct fib6_config){
                .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
                         : RT6_TABLE_MAIN,
                .fc_ifindex = rtmsg->rtmsg_ifindex,
                .fc_metric = rtmsg->rtmsg_metric,
                .fc_expires = rtmsg->rtmsg_info,
                .fc_dst_len = rtmsg->rtmsg_dst_len,
                .fc_src_len = rtmsg->rtmsg_src_len,
                .fc_flags = rtmsg->rtmsg_flags,
                .fc_type = rtmsg->rtmsg_type,

                .fc_nlinfo.nl_net = net,

                .fc_dst = rtmsg->rtmsg_dst,
                .fc_src = rtmsg->rtmsg_src,
                .fc_gateway = rtmsg->rtmsg_gateway,
        };
}

/* Handle the SIOCADDRT / SIOCDELRT route ioctls.
 *
 * Returns -EINVAL for any other @cmd, -EPERM when the caller lacks
 * CAP_NET_ADMIN in the namespace's user namespace, otherwise the result
 * of ip6_route_add() or ip6_route_del(), both run under rtnl_lock().
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
{
        struct fib6_config cfg;
        int err;

        if (cmd != SIOCADDRT && cmd != SIOCDELRT)
                return -EINVAL;
        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        rtmsg_to_fib6_config(net, rtmsg, &cfg);

        rtnl_lock();
        if (cmd == SIOCADDRT) {
                /* Only do the default setting of fc_metric in route adding */
                if (!cfg.fc_metric)
                        cfg.fc_metric = IP6_RT_PRIO_USER;
                err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
        } else {
                /* cmd can only be SIOCDELRT here, see the check above */
                err = ip6_route_del(&cfg, NULL);
        }
        rtnl_unlock();

        return err;
}

/*
 *        Drop the packet on the floor
 *
 *        Bumps the appropriate MIB counter, sends an ICMPv6 Destination
 *        Unreachable with the given @code, and frees @skb with a matching
 *        drop reason. @ipstats_mib_noroutes is IPSTATS_MIB_INNOROUTES or
 *        IPSTATS_MIB_OUTNOROUTES; any other value skips the counter update.
 *        Always returns 0.
 */

static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net *net = dev_net(dst->dev);
        struct inet6_dev *idev;
        SKB_DR(reason);
        int type;

        /* For an l3 master device or a loopback dst, take the inet6_dev
         * from the interface index recorded in the skb control block;
         * otherwise from the dst.
         */
        if (netif_is_l3_master(skb->dev) ||
            dst->dev == net->loopback_dev)
                idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
        else
                idev = ip6_dst_idev(dst);

        switch (ipstats_mib_noroutes) {
        case IPSTATS_MIB_INNOROUTES:
                /* An unspecified destination counts as an address error */
                type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
                if (type == IPV6_ADDR_ANY) {
                        SKB_DR_SET(reason, IP_INADDRERRORS);
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
                        break;
                }
                SKB_DR_SET(reason, IP_INNOROUTES);
                fallthrough;
        case IPSTATS_MIB_OUTNOROUTES:
                /* NOTE(review): SKB_DR_OR presumably only sets the reason
                 * when none was set above - confirm against its definition.
                 */
                SKB_DR_OR(reason, IP_OUTNOROUTES);
                IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
                break;
        }

        /* Start over by dropping the dst for l3mdev case */
        if (netif_is_l3_master(skb->dev))
                skb_dst_drop(skb);

        icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
        kfree_skb_reason(skb, reason);
        return 0;
}

/* Drop @skb with ICMPv6 code ICMPV6_NOROUTE, accounted under
 * IPSTATS_MIB_INNOROUTES. Returns what ip6_pkt_drop() returns.
 */
static int ip6_pkt_discard(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}

/* Drop @skb with ICMPv6 code ICMPV6_NOROUTE, accounted under
 * IPSTATS_MIB_OUTNOROUTES. skb->dev is first pointed at the dst's device.
 * @net and @sk are not used here.
 */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

/* Drop @skb with ICMPv6 code ICMPV6_ADM_PROHIBITED, accounted under
 * IPSTATS_MIB_INNOROUTES. Returns what ip6_pkt_drop() returns.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

/* Drop @skb with ICMPv6 code ICMPV6_ADM_PROHIBITED, accounted under
 * IPSTATS_MIB_OUTNOROUTES. skb->dev is first pointed at the dst's device.
 * @net and @sk are not used here.
 */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}

/*
 *        Allocate a fib6_info for a local (unicast / anycast) address.
 *
 *        Builds a /128 RTPROT_KERNEL entry for @addr on @idev's device, typed
 *        RTN_ANYCAST/RTF_ANYCAST when @anycast is set and RTN_LOCAL/RTF_LOCAL
 *        otherwise. On success dst_nocount is set, and for non-anycast
 *        addresses dst_nopolicy is set when disable_policy is enabled either
 *        in devconf_all or on the device. Returns the entry or the ERR_PTR()
 *        from ip6_route_info_create().
 */

struct fib6_info *addrconf_f6i_alloc(struct net *net,
                                     struct inet6_dev *idev,
                                     const struct in6_addr *addr,
                                     bool anycast, gfp_t gfp_flags,
                                     struct netlink_ext_ack *extack)
{
        struct fib6_config cfg = {
                .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
                .fc_ifindex = idev->dev->ifindex,
                .fc_type = anycast ? RTN_ANYCAST : RTN_LOCAL,
                .fc_flags = RTF_UP | RTF_NONEXTHOP |
                            (anycast ? RTF_ANYCAST : RTF_LOCAL),
                .fc_dst = *addr,
                .fc_dst_len = 128,
                .fc_protocol = RTPROT_KERNEL,
                .fc_nlinfo.nl_net = net,
                .fc_ignore_dev_down = true,
        };
        struct fib6_info *f6i;

        f6i = ip6_route_info_create(&cfg, gfp_flags, extack);
        if (IS_ERR(f6i))
                return f6i;

        f6i->dst_nocount = true;

        if (anycast)
                return f6i;

        if (READ_ONCE(net->ipv6.devconf_all->disable_policy) ||
            READ_ONCE(idev->cnf.disable_policy))
                f6i->dst_nopolicy = true;

        return f6i;
}

/* remove deleted ip from prefsrc entries
 *
 * Argument bundle passed through fib6_clean_all() to fib6_remove_prefsrc().
 */
struct arg_dev_net_ip {
        struct net *net;                /* namespace being cleaned */
        struct in6_addr *addr;          /* address that was removed */
};

/* fib6_clean_all() callback: clear @rt's preferred source if it equals the
 * address in @arg (a struct arg_dev_net_ip) and ipv6_chk_addr() no longer
 * finds that address for the route's nexthop device.
 *
 * Routes using a nexthop object and the namespace's fib6_null_entry are
 * left alone. Always returns 0.
 */
static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
        struct arg_dev_net_ip *adni = arg;
        struct in6_addr *addr = adni->addr;
        struct net *net = adni->net;

        if (rt->nh || rt == net->ipv6.fib6_null_entry)
                return 0;
        if (!ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr))
                return 0;
        if (ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0))
                return 0;

        /* remove prefsrc entry */
        spin_lock_bh(&rt6_exception_lock);
        rt->fib6_prefsrc.plen = 0;
        spin_unlock_bh(&rt6_exception_lock);

        return 0;
}

/* Run fib6_remove_prefsrc() over all routes in the namespace of @ifp's
 * device, using @ifp's address as the one to match.
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
        struct arg_dev_net_ip adni = {
                .addr = &ifp->addr,
                .net = dev_net(ifp->idev->dev),
        };

        fib6_clean_all(adni.net, fib6_remove_prefsrc, &adni);
}

#define RTF_RA_ROUTER                (RTF_ADDRCONF | RTF_DEFAULT)

/* Remove routers and update dst entries when gateway turn into host.
 *
 * fib6_clean_all() callback; @arg is the gateway's struct in6_addr.
 * Returns -1 for an RA default-router route (both RTF_RA_ROUTER bits set)
 * whose gateway equals @arg, and 0 otherwise. Routes using nexthop objects
 * are skipped.
 */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
        struct in6_addr *gateway = arg;
        struct fib6_nh *nh;

        /* RA routes do not use nexthops */
        if (rt->nh)
                return 0;

        nh = rt->fib6_nh;
        if ((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER &&
            nh->fib_nh_gw_family &&
            ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
                return -1;

        /* Further clean up cached routes in exception table.
         * This is needed because cached route may have a different
         * gateway than its 'parent' in the case of an ip redirect.
         */
        fib6_nh_exceptions_clean_tohost(nh, gateway);

        return 0;
}

/* Run fib6_clean_tohost() over every route in @net, passing @gateway as
 * the address to match.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
        fib6_clean_all(net, fib6_clean_tohost, gateway);
}

/* Argument handed to the fib6_ifup() and fib6_ifdown() walkers.  The
 * anonymous union is interpreted per walker: fib6_ifup() reads nh_flags,
 * fib6_ifdown() reads event.
 */
struct arg_netdev_event {
        const struct net_device *dev;   /* device the walk is about */
        union {
                unsigned char nh_flags; /* flags cleared from matching nexthops (up path) */
                unsigned long event;    /* NETDEV_* event being handled (down path) */
        };
};

/* Return the first route on @rt's fib6 node that has the same metric as
 * @rt and qualifies for ECMP, or NULL if the node's leaf list holds no
 * such entry.  The lockdep annotations expect the tb6_lock of @rt's table
 * to be held.
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
        struct fib6_node *node;
        struct fib6_info *cur;

        node = rcu_dereference_protected(rt->fib6_node,
                        lockdep_is_held(&rt->fib6_table->tb6_lock));

        for (cur = rcu_dereference_protected(node->leaf,
                        lockdep_is_held(&rt->fib6_table->tb6_lock));
             cur;
             cur = rcu_dereference_protected(cur->fib6_next,
                        lockdep_is_held(&rt->fib6_table->tb6_lock))) {
                if (cur->fib6_metric != rt->fib6_metric)
                        continue;
                if (rt6_qualify_for_ecmp(cur))
                        return cur;
        }

        return NULL;
}

/* only called for fib entries with builtin fib6_nh */
static bool rt6_is_dead(const struct fib6_info *rt)
{
        if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
            (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
             ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
                return true;

        return false;
}

/* Sum the nexthop weights of @rt and all of its siblings, leaving out
 * every entry that rt6_is_dead() reports as dead.
 */
static int rt6_multipath_total_weight(const struct fib6_info *rt)
{
        struct fib6_info *sibling;
        int sum = 0;

        if (!rt6_is_dead(rt))
                sum = rt->fib6_nh->fib_nh_weight;

        list_for_each_entry(sibling, &rt->fib6_siblings, fib6_siblings) {
                if (rt6_is_dead(sibling))
                        continue;
                sum += sibling->fib6_nh->fib_nh_weight;
        }

        return sum;
}

static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
        int upper_bound = -1;

        if (!rt6_is_dead(rt)) {
                *weight += rt->fib6_nh->fib_nh_weight;
                upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
                                                    total) - 1;
        }
        atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
}

static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
        struct fib6_info *iter;
        int weight = 0;

        rt6_upper_bound_set(rt, &weight, total);

        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
                rt6_upper_bound_set(iter, &weight, total);
}

/* Recompute the upper bounds of the multipath route that @rt belongs to.
 * Does nothing for a route without siblings or one marked should_flush.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
        struct fib6_info *first;

        if (!rt->fib6_nsiblings)
                return;

        /* When the whole multipath route is marked for flushing there is
         * no point in rebalancing as each sibling is removed.
         */
        if (rt->should_flush)
                return;

        /* Lookup evaluates routes in order, so bounds have to be assigned
         * starting from the first sibling.
         */
        first = rt6_multipath_first_sibling(rt);
        if (WARN_ON_ONCE(!first))
                return;

        rt6_multipath_upper_bound_set(first,
                                      rt6_multipath_total_weight(first));
}

/* Walker callback for a device coming up; @p_arg is a
 * struct arg_netdev_event whose nh_flags member is valid.
 *
 * For routes with a built-in nexthop on that device, clears the given
 * flags, updates serial numbers up to the root and rebalances the
 * multipath weights.  Always returns 0.
 */
static int fib6_ifup(struct fib6_info *rt, void *p_arg)
{
        const struct arg_netdev_event *arg = p_arg;
        struct net *net = dev_net(arg->dev);

        if (rt == net->ipv6.fib6_null_entry || rt->nh ||
            rt->fib6_nh->fib_nh_dev != arg->dev)
                return 0;

        rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
        fib6_update_sernum_upto_root(net, rt);
        rt6_multipath_rebalance(rt);

        return 0;
}

/* Clear @nh_flags from every route nexthop that uses @dev by walking the
 * device's namespace with fib6_ifup().  When RTNH_F_DEAD is being cleared
 * and the carrier is up, RTNH_F_LINKDOWN is cleared as well.
 */
void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
{
        struct arg_netdev_event arg = { .dev = dev };

        arg.nh_flags = nh_flags;
        if ((nh_flags & RTNH_F_DEAD) && netif_carrier_ok(dev))
                arg.nh_flags |= RTNH_F_LINKDOWN;

        fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}

/* only called for fib entries with inline fib6_nh */
static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
                                   const struct net_device *dev)
{
        struct fib6_info *iter;

        if (rt->fib6_nh->fib_nh_dev == dev)
                return true;
        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
                if (iter->fib6_nh->fib_nh_dev == dev)
                        return true;

        return false;
}

static void rt6_multipath_flush(struct fib6_info *rt)
{
        struct fib6_info *iter;

        rt->should_flush = 1;
        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
                iter->should_flush = 1;
}

static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
                                             const struct net_device *down_dev)
{
        struct fib6_info *iter;
        unsigned int dead = 0;

        if (rt->fib6_nh->fib_nh_dev == down_dev ||
            rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
                dead++;
        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
                if (iter->fib6_nh->fib_nh_dev == down_dev ||
                    iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
                        dead++;

        return dead;
}

static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
                                       const struct net_device *dev,
                                       unsigned char nh_flags)
{
        struct fib6_info *iter;

        if (rt->fib6_nh->fib_nh_dev == dev)
                rt->fib6_nh->fib_nh_flags |= nh_flags;
        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
                if (iter->fib6_nh->fib_nh_dev == dev)
                        iter->fib6_nh->fib_nh_flags |= nh_flags;
}

/* Walker callback for NETDEV_UNREGISTER, NETDEV_DOWN and NETDEV_CHANGE;
 * @p_arg is a struct arg_netdev_event whose event member is valid.
 *
 * Called with write lock held for table with rt.
 *
 * The return value is consumed by the fib6 walker, which lives outside
 * this block: 0 leaves the route alone, -1 is returned for routes that
 * should go away, and -2 is returned for multipath routes that stay.
 * NOTE(review): confirm the exact handling of -1/-2 against the walker.
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
        const struct arg_netdev_event *arg = p_arg;
        const struct net_device *dev = arg->dev;
        struct net *net = dev_net(dev);

        /* The null entry and routes using nexthop objects are not handled here. */
        if (rt == net->ipv6.fib6_null_entry || rt->nh)
                return 0;

        switch (arg->event) {
        case NETDEV_UNREGISTER:
                return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
        case NETDEV_DOWN:
                /* Already marked by rt6_multipath_flush() via a sibling. */
                if (rt->should_flush)
                        return -1;
                /* Single-path route: decided by its own device only. */
                if (!rt->fib6_nsiblings)
                        return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
                if (rt6_multipath_uses_dev(rt, dev)) {
                        unsigned int count;

                        /* If every nexthop (this one plus all siblings) is
                         * dead or on the device going down, flush the whole
                         * multipath route.
                         */
                        count = rt6_multipath_dead_count(rt, dev);
                        if (rt->fib6_nsiblings + 1 == count) {
                                rt6_multipath_flush(rt);
                                return -1;
                        }
                        /* Otherwise only mark the affected nexthops and
                         * redistribute the weights over the remaining ones.
                         */
                        rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
                                                   RTNH_F_LINKDOWN);
                        fib6_update_sernum(net, rt);
                        rt6_multipath_rebalance(rt);
                }
                return -2;
        case NETDEV_CHANGE:
                /* Routes on other devices and local/anycast routes are
                 * not touched on a change event.
                 */
                if (rt->fib6_nh->fib_nh_dev != dev ||
                    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
                        break;
                rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
                rt6_multipath_rebalance(rt);
                break;
        }

        return 0;
}

/* Run fib6_ifdown() for @event over every route in @dev's namespace.
 * The skip_notify_on_dev_down sysctl selects the walker variant that
 * skips notifications.
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
        struct arg_netdev_event arg = { .dev = dev };
        struct net *net = dev_net(dev);

        arg.event = event;

        if (!net->ipv6.sysctl.skip_notify_on_dev_down) {
                fib6_clean_all(net, fib6_ifdown, &arg);
                return;
        }

        fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
}

/* Handle @event for @dev by calling, in this order: rt6_sync_down_dev()
 * for the FIB walk, rt6_uncached_list_flush_dev() for the device, and
 * neigh_ifdown() on the IPv6 neighbour table (nd_tbl).
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
        rt6_sync_down_dev(dev, event);
        rt6_uncached_list_flush_dev(dev);
        neigh_ifdown(&nd_tbl, dev);
}

/* Argument for the rt6_mtu_change_route()/fib6_nh_mtu_change() walk. */
struct rt6_mtu_change_arg {
        struct net_device *dev; /* device whose MTU changed */
        unsigned int mtu;       /* new MTU value */
        struct fib6_info *f6i;  /* route currently visited; set by rt6_mtu_change_route() */
};

/* Apply an MTU change to one nexthop; @_arg is a struct rt6_mtu_change_arg
 * with f6i already pointing at the owning route.  Nexthops on other
 * devices are ignored.  Always returns 0.
 */
static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
{
        struct rt6_mtu_change_arg *arg = _arg;
        struct fib6_info *f6i = arg->f6i;
        struct inet6_dev *idev;
        u32 cur_mtu;

        if (nh->fib_nh_dev != arg->dev)
                return 0;

        idev = __in6_dev_get(arg->dev);
        cur_mtu = f6i->fib6_pmtu;

        /* An administrative MTU increase cannot be discovered through
         * IPv6 PMTU discovery, so the increase has to be applied here.
         * RFC 1981 does not cover administrative MTU increases, which
         * makes updating the PMTU a MUST (i.e. jumbo frame).
         *
         * The metric is therefore rewritten when the new MTU is not larger
         * than the current one, or when the current one still equals the
         * device's configured mtu6.
         */
        if (cur_mtu >= arg->mtu || cur_mtu == idev->cnf.mtu6)
                fib6_metric_set(f6i, RTAX_MTU, arg->mtu);

        spin_lock_bh(&rt6_exception_lock);
        rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
        spin_unlock_bh(&rt6_exception_lock);

        return 0;
}

/* Walker callback for an MTU change; @p_arg is a struct rt6_mtu_change_arg.
 *
 * Skips the route when the device has no inet6_dev or when the route's
 * RTAX_MTU metric is locked; otherwise records the route in the argument
 * and applies fib6_nh_mtu_change() to each of its nexthops.
 */
static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
{
        struct rt6_mtu_change_arg *arg = p_arg;

        /* PMTU discovery is not optional in IPv6, so an RTAX_MTU lock
         * cannot disable it.  The lock is still honoured here to block
         * changes caused by addrconf/ndisc.
         */
        if (!__in6_dev_get(arg->dev))
                return 0;

        if (fib6_metric_locked(f6i, RTAX_MTU))
                return 0;

        arg->f6i = f6i;

        if (!f6i->nh)
                return fib6_nh_mtu_change(f6i->fib6_nh, arg);

        /* fib6_nh_mtu_change only returns 0, so this is safe */
        return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, arg);
}

/* Propagate a new @mtu for @dev by running rt6_mtu_change_route() over
 * every route in the device's namespace.
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
        /* f6i is left NULL here; the walker callback fills it per route. */
        struct rt6_mtu_change_arg arg = {
                .dev = dev,
                .mtu = mtu,
        };

        fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}

/* Netlink attribute policy passed to nlmsg_parse_deprecated() by
 * rtm_to_fib6_config().  Address attributes are described by length,
 * scalar attributes by type.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
        /* strict validation starts with the attribute after RTA_DPORT */
        [RTA_UNSPEC]                = { .strict_start_type = RTA_DPORT + 1 },
        [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
        [RTA_PREFSRC]                = { .len = sizeof(struct in6_addr) },
        [RTA_OIF]               = { .type = NLA_U32 },
        [RTA_IIF]                = { .type = NLA_U32 },
        [RTA_PRIORITY]          = { .type = NLA_U32 },
        [RTA_METRICS]           = { .type = NLA_NESTED },
        [RTA_MULTIPATH]                = { .len = sizeof(struct rtnexthop) },
        [RTA_PREF]              = { .type = NLA_U8 },
        [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
        [RTA_ENCAP]                = { .type = NLA_NESTED },
        [RTA_EXPIRES]                = { .type = NLA_U32 },
        [RTA_UID]                = { .type = NLA_U32 },
        [RTA_MARK]                = { .type = NLA_U32 },
        [RTA_TABLE]                = { .type = NLA_U32 },
        [RTA_IP_PROTO]                = { .type = NLA_U8 },
        [RTA_SPORT]                = { .type = NLA_U16 },
        [RTA_DPORT]                = { .type = NLA_U16 },
        [RTA_NH_ID]                = { .type = NLA_U32 },
};

/* Translate the route netlink message @nlh into @cfg.
 *
 * @skb:    buffer the message arrived in; supplies the sender portid and
 *          the socket's namespace
 * @nlh:    netlink header followed by a struct rtmsg and RTA_* attributes
 * @cfg:    output; fully overwritten once the header checks have passed
 * @extack: extended ack used for error messages
 *
 * Returns 0 on success or a negative errno: the parser's error, -EINVAL
 * for a non-zero tos, for a nexthop id combined with an explicit nexthop
 * specification, for RTA_VIA, or for RTA_DST/RTA_SRC shorter than the
 * prefix length requires, or the error reported by the lwtunnel encap
 * validators.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct fib6_config *cfg,
                              struct netlink_ext_ack *extack)
{
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        unsigned int pref;
        int err;

        err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
                                     rtm_ipv6_policy, extack);
        if (err < 0)
                goto errout;

        /* Default for every "goto errout" below that does not set err. */
        err = -EINVAL;
        rtm = nlmsg_data(nlh);

        if (rtm->rtm_tos) {
                NL_SET_ERR_MSG(extack,
                               "Invalid dsfield (tos): option not available for IPv6");
                goto errout;
        }

        /* Fields not named in the compound literal start out as zero. */
        *cfg = (struct fib6_config){
                .fc_table = rtm->rtm_table,
                .fc_dst_len = rtm->rtm_dst_len,
                .fc_src_len = rtm->rtm_src_len,
                .fc_flags = RTF_UP,
                .fc_protocol = rtm->rtm_protocol,
                .fc_type = rtm->rtm_type,

                .fc_nlinfo.portid = NETLINK_CB(skb).portid,
                .fc_nlinfo.nlh = nlh,
                .fc_nlinfo.nl_net = sock_net(skb->sk),
        };

        /* Map the route type and rtm flags onto RTF_* flags. */
        if (rtm->rtm_type == RTN_UNREACHABLE ||
            rtm->rtm_type == RTN_BLACKHOLE ||
            rtm->rtm_type == RTN_PROHIBIT ||
            rtm->rtm_type == RTN_THROW)
                cfg->fc_flags |= RTF_REJECT;

        if (rtm->rtm_type == RTN_LOCAL)
                cfg->fc_flags |= RTF_LOCAL;

        if (rtm->rtm_flags & RTM_F_CLONED)
                cfg->fc_flags |= RTF_CACHE;

        cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

        /* A nexthop id replaces any inline nexthop description. */
        if (tb[RTA_NH_ID]) {
                if (tb[RTA_GATEWAY]   || tb[RTA_OIF] ||
                    tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
                        NL_SET_ERR_MSG(extack,
                                       "Nexthop specification and nexthop id are mutually exclusive");
                        goto errout;
                }
                cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
        }

        if (tb[RTA_GATEWAY]) {
                cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
                cfg->fc_flags |= RTF_GATEWAY;
        }
        if (tb[RTA_VIA]) {
                NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
                goto errout;
        }

        /* Only the bytes covered by the prefix length are required and
         * copied; the rest of fc_dst/fc_src stays zero.
         */
        if (tb[RTA_DST]) {
                int plen = (rtm->rtm_dst_len + 7) >> 3;

                if (nla_len(tb[RTA_DST]) < plen)
                        goto errout;

                nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
        }

        if (tb[RTA_SRC]) {
                int plen = (rtm->rtm_src_len + 7) >> 3;

                if (nla_len(tb[RTA_SRC]) < plen)
                        goto errout;

                nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
        }

        if (tb[RTA_PREFSRC])
                cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

        if (tb[RTA_OIF])
                cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

        if (tb[RTA_PRIORITY])
                cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

        /* fc_mx and fc_mp point into the message; nothing is copied. */
        if (tb[RTA_METRICS]) {
                cfg->fc_mx = nla_data(tb[RTA_METRICS]);
                cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
        }

        /* RTA_TABLE overrides the table id taken from the rtmsg header. */
        if (tb[RTA_TABLE])
                cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

        if (tb[RTA_MULTIPATH]) {
                cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
                cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

                err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
                                                     cfg->fc_mp_len, extack);
                if (err < 0)
                        goto errout;
        }

        /* Anything other than LOW or HIGH is treated as MEDIUM. */
        if (tb[RTA_PREF]) {
                pref = nla_get_u8(tb[RTA_PREF]);
                if (pref != ICMPV6_ROUTER_PREF_LOW &&
                    pref != ICMPV6_ROUTER_PREF_HIGH)
                        pref = ICMPV6_ROUTER_PREF_MEDIUM;
                cfg->fc_flags |= RTF_PREF(pref);
        }

        if (tb[RTA_ENCAP])
                cfg->fc_encap = tb[RTA_ENCAP];

        if (tb[RTA_ENCAP_TYPE]) {
                cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

                err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
                if (err < 0)
                        goto errout;
        }

        /* RTF_EXPIRES is only set when the fixed-up timeout is finite. */
        if (tb[RTA_EXPIRES]) {
                unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

                if (addrconf_finite_timeout(timeout)) {
                        cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
                        cfg->fc_flags |= RTF_EXPIRES;
                }
        }

        err = 0;
errout:
        return err;
}

/* One entry of the temporary nexthop list built by
 * ip6_route_multipath_add().
 */
struct rt6_nh {
        struct fib6_info *fib6_info;    /* route created for this nexthop */
        struct fib6_config r_cfg;       /* config copy, reused to delete the route on rollback */
        struct list_head next;          /* link in the rt6_nh_list */
};

/* Append @rt, together with a copy of @r_cfg, to the tail of @rt6_nh_list.
 *
 * Returns -EEXIST when the list already holds a route for which
 * rt6_duplicate_nexthop() matches @rt, -ENOMEM when the list entry cannot
 * be allocated, and 0 on success.  @net is not used.
 */
static int ip6_route_info_append(struct net *net,
                                 struct list_head *rt6_nh_list,
                                 struct fib6_info *rt,
                                 struct fib6_config *r_cfg)
{
        struct rt6_nh *entry;

        /* check if fib6_info already exists */
        list_for_each_entry(entry, rt6_nh_list, next) {
                if (rt6_duplicate_nexthop(entry->fib6_info, rt))
                        return -EEXIST;
        }

        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        memcpy(&entry->r_cfg, r_cfg, sizeof(*r_cfg));
        entry->fib6_info = rt;
        list_add_tail(&entry->next, rt6_nh_list);

        return 0;
}

/* Send a single RTM_NEWROUTE notification for a multipath change.
 *
 * @rt is the first route inserted and @rt_last the last one.  Nothing is
 * sent when the route finally selected is NULL.
 */
static void ip6_route_mpath_notify(struct fib6_info *rt,
                                   struct fib6_info *rt_last,
                                   struct nl_info *info,
                                   __u16 nlflags)
{
        bool appended = (nlflags & NLM_F_APPEND) && rt_last &&
                        rt_last->fib6_nsiblings;

        /* For an APPEND, userspace wants a consistent dump that starts at
         * the first nexthop.  Sibling routes are always added at the end
         * of the list, so start from the first sibling of the last route
         * appended.
         */
        if (appended)
                rt = list_first_entry(&rt_last->fib6_siblings,
                                      struct fib6_info,
                                      fib6_siblings);

        if (!rt)
                return;

        inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}

static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
{
        bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
        bool should_notify = false;
        struct fib6_info *leaf;
        struct fib6_node *fn;

        rcu_read_lock();
        fn = rcu_dereference(rt->fib6_node);
        if (!fn)
                goto out;

        leaf = rcu_dereference(fn->leaf);
        if (!leaf)
                goto out;

        if (rt == leaf ||
            (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric &&
             rt6_qualify_for_ecmp(leaf)))
                should_notify = true;
out:
        rcu_read_unlock();

        return should_notify;
}

/* Copy the IPv6 gateway address carried by @nla into @gw.
 *
 * Returns 0 on success, or -EINVAL with an extack message when the
 * attribute payload is shorter than a struct in6_addr.
 */
static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla,
                             struct netlink_ext_ack *extack)
{
        if (nla_len(nla) >= sizeof(*gw)) {
                *gw = nla_get_in6_addr(nla);
                return 0;
        }

        NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY");
        return -EINVAL;
}

/* Install the multipath route described by cfg->fc_mp.
 *
 * Every rtnexthop in the attribute is turned into its own fib6_info and
 * collected on a temporary list.  The routes are then inserted one by one
 * with per-route notifications suppressed, and a single notification for
 * the whole route is sent afterwards.  If an insertion or the in-kernel
 * notifier fails, the routes inserted so far are deleted again.
 *
 * Returns 0 on success or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
                                   struct netlink_ext_ack *extack)
{
        struct fib6_info *rt_notif = NULL, *rt_last = NULL;
        struct nl_info *info = &cfg->fc_nlinfo;
        struct fib6_config r_cfg;
        struct rtnexthop *rtnh;
        struct fib6_info *rt;
        struct rt6_nh *err_nh;
        struct rt6_nh *nh, *nh_safe;
        __u16 nlflags;
        int remaining;
        int attrlen;
        int err = 1;
        int nhn = 0;
        int replace = (cfg->fc_nlinfo.nlh &&
                       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
        LIST_HEAD(rt6_nh_list);

        /* Flags reported in the final userspace notification. */
        nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
        if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
                nlflags |= NLM_F_APPEND;

        remaining = cfg->fc_mp_len;
        rtnh = (struct rtnexthop *)cfg->fc_mp;

        /* Parse a Multipath Entry and build a list (rt6_nh_list) of
         * fib6_info structs per nexthop
         */
        while (rtnh_ok(rtnh, remaining)) {
                /* Start from the top-level config; per-nexthop values
                 * found below override it.
                 */
                memcpy(&r_cfg, cfg, sizeof(*cfg));
                if (rtnh->rtnh_ifindex)
                        r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

                attrlen = rtnh_attrlen(rtnh);
                if (attrlen > 0) {
                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
                        if (nla) {
                                err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
                                                        extack);
                                if (err)
                                        goto cleanup;

                                r_cfg.fc_flags |= RTF_GATEWAY;
                        }
                        r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);

                        /* RTA_ENCAP_TYPE length checked in
                         * lwtunnel_valid_encap_type_attr
                         */
                        nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
                        if (nla)
                                r_cfg.fc_encap_type = nla_get_u16(nla);
                }

                r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
                rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
                if (IS_ERR(rt)) {
                        err = PTR_ERR(rt);
                        rt = NULL;
                        goto cleanup;
                }
                if (!rt6_qualify_for_ecmp(rt)) {
                        err = -EINVAL;
                        NL_SET_ERR_MSG(extack,
                                       "Device only routes can not be added for IPv6 using the multipath API.");
                        fib6_info_release(rt);
                        goto cleanup;
                }

                /* The nexthop weight is the requested hop count plus one. */
                rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;

                /* On success the list entry holds rt and the cleanup loop
                 * releases it; on failure it has to be released here.
                 */
                err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
                                            rt, &r_cfg);
                if (err) {
                        fib6_info_release(rt);
                        goto cleanup;
                }

                rtnh = rtnh_next(rtnh, &remaining);
        }

        /* Nothing was queued, so there is nothing to clean up either. */
        if (list_empty(&rt6_nh_list)) {
                NL_SET_ERR_MSG(extack,
                               "Invalid nexthop configuration - no valid nexthops");
                return -EINVAL;
        }

        /* for add and replace send one notification with all nexthops.
         * Skip the notification in fib6_add_rt2node and send one with
         * the full route when done
         */
        info->skip_notify = 1;

        /* For add and replace, send one notification with all nexthops. For
         * append, send one notification with all appended nexthops.
         */
        info->skip_notify_kernel = 1;

        err_nh = NULL;
        list_for_each_entry(nh, &rt6_nh_list, next) {
                err = __ip6_ins_rt(nh->fib6_info, info, extack);

                if (err) {
                        if (replace && nhn)
                                NL_SET_ERR_MSG_MOD(extack,
                                                   "multipath route replace failed (check consistency of installed routes)");
                        /* Rollback stops at the entry that failed. */
                        err_nh = nh;
                        goto add_errout;
                }
                /* save reference to last route successfully inserted */
                rt_last = nh->fib6_info;

                /* save reference to first route for notification */
                if (!rt_notif)
                        rt_notif = nh->fib6_info;

                /* Because each route is added like a single route we remove
                 * these flags after the first nexthop: if there is a collision,
                 * we have already failed to add the first nexthop:
                 * fib6_add_rt2node() has rejected it; when replacing, old
                 * nexthops have been replaced by first new, the rest should
                 * be added to it.
                 */
                if (cfg->fc_nlinfo.nlh) {
                        cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
                                                             NLM_F_REPLACE);
                        cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
                }
                nhn++;
        }

        /* An in-kernel notification should only be sent in case the new
         * multipath route is added as the first route in the node, or if
         * it was appended to it. We pass 'rt_notif' since it is the first
         * sibling and might allow us to skip some checks in the replace case.
         */
        if (ip6_route_mpath_should_notify(rt_notif)) {
                enum fib_event_type fib_event;

                /* A sibling count different from what was just inserted
                 * means the nexthops were added to an existing route.
                 */
                if (rt_notif->fib6_nsiblings != nhn - 1)
                        fib_event = FIB_EVENT_ENTRY_APPEND;
                else
                        fib_event = FIB_EVENT_ENTRY_REPLACE;

                err = call_fib6_multipath_entry_notifiers(info->nl_net,
                                                          fib_event, rt_notif,
                                                          nhn - 1, extack);
                if (err) {
                        /* Delete all the siblings that were just added */
                        err_nh = NULL;
                        goto add_errout;
                }
        }

        /* success ... tell user about new route */
        ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
        goto cleanup;

add_errout:
        /* send notification for routes that were added so that
         * the delete notifications sent by ip6_route_del are
         * coherent
         */
        if (rt_notif)
                ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

        /* Delete routes that were already added */
        list_for_each_entry(nh, &rt6_nh_list, next) {
                if (err_nh == nh)
                        break;
                ip6_route_del(&nh->r_cfg, extack);
        }

cleanup:
        /* Shared by success and error paths: drop the reference held by
         * each list entry and free the entry itself.
         */
        list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
                fib6_info_release(nh->fib6_info);
                list_del(&nh->next);
                kfree(nh);
        }

        return err;
}

/* Delete the routes named by each nexthop in cfg->fc_mp.
 *
 * Every rtnexthop is handled independently: a failure for one nexthop
 * does not stop the others from being processed.  Returns the error of
 * the last nexthop that failed, or 0 when none did.
 */
static int ip6_route_multipath_del(struct fib6_config *cfg,
                                   struct netlink_ext_ack *extack)
{
        struct rtnexthop *rtnh = (struct rtnexthop *)cfg->fc_mp;
        int remaining = cfg->fc_mp_len;
        struct fib6_config r_cfg;
        int last_err = 0;
        int attrlen;
        int err;

        /* Parse a Multipath Entry */
        for (; rtnh_ok(rtnh, remaining); rtnh = rtnh_next(rtnh, &remaining)) {
                /* Per-nexthop config starts as a copy of the request. */
                memcpy(&r_cfg, cfg, sizeof(*cfg));
                if (rtnh->rtnh_ifindex)
                        r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

                attrlen = rtnh_attrlen(rtnh);
                if (attrlen > 0) {
                        struct nlattr *attrs = rtnh_attrs(rtnh);
                        struct nlattr *gw_attr;

                        gw_attr = nla_find(attrs, attrlen, RTA_GATEWAY);
                        if (gw_attr) {
                                err = fib6_gw_from_attr(&r_cfg.fc_gateway,
                                                        gw_attr, extack);
                                if (err) {
                                        /* Bad gateway: skip this nexthop. */
                                        last_err = err;
                                        continue;
                                }

                                r_cfg.fc_flags |= RTF_GATEWAY;
                        }
                }

                err = ip6_route_del(&r_cfg, extack);
                if (err)
                        last_err = err;
        }

        return last_err;
}

/* Netlink handler for deleting a route.
 *
 * Parses the message into a fib6_config, rejects a nexthop id that does
 * not exist in the socket's namespace with -EINVAL, and then deletes
 * either the listed multipath nexthops or, for a plain request, the route
 * with fc_delete_all_nh set.
 */
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack)
{
        struct fib6_config cfg;
        int err = rtm_to_fib6_config(skb, nlh, &cfg, extack);

        if (err < 0)
                return err;

        if (cfg.fc_nh_id &&
            !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
                NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
                return -EINVAL;
        }

        if (!cfg.fc_mp) {
                cfg.fc_delete_all_nh = 1;
                return ip6_route_del(&cfg, extack);
        }

        return ip6_route_multipath_del(&cfg, extack);
}

/* Netlink handler for adding a route.
 *
 * Parses the message into a fib6_config, substitutes IP6_RT_PRIO_USER
 * for a zero metric, and hands the config to the multipath or the
 * single-route add path depending on whether RTA_MULTIPATH was supplied.
 */
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack)
{
        struct fib6_config cfg;
        int err = rtm_to_fib6_config(skb, nlh, &cfg, extack);

        if (err < 0)
                return err;

        if (!cfg.fc_metric)
                cfg.fc_metric = IP6_RT_PRIO_USER;

        return cfg.fc_mp ? ip6_route_multipath_add(&cfg, extack) :
                           ip6_route_add(&cfg, GFP_KERNEL, extack);
}

/* Add the netlink space needed to describe @nh as one multipath nexthop
 * to the int counter that @arg points at.  Always returns 0, so it can
 * serve as a nexthop_for_each_fib6_nh() callback.
 */
static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg)
{
        int *nexthop_len = arg;
        int extra;

        extra = nla_total_size(0)       /* RTA_MULTIPATH */
              + NLA_ALIGN(sizeof(struct rtnexthop))
              + nla_total_size(16);     /* RTA_GATEWAY */

        if (nh->fib_nh_lws) {
                /* RTA_ENCAP */
                extra += lwtunnel_get_encap_size(nh->fib_nh_lws);
                /* RTA_ENCAP_TYPE (u16) */
                extra += nla_total_size(2);
        }

        *nexthop_len += extra;

        return 0;
}

/* Compute an upper bound for the size of the netlink message describing
 * @f6i: a fixed part for the rtmsg header and the single-valued
 * attributes, plus a nexthop part that depends on whether the route uses
 * a nexthop object, has siblings, or is a plain single-path route.
 */
static size_t rt6_nlmsg_size(struct fib6_info *f6i)
{
        int nexthop_len = 0;
        size_t size;

        if (!f6i->nh) {
                struct fib6_nh *nh = f6i->fib6_nh;

                if (f6i->fib6_nsiblings) {
                        struct fib6_info *sibling, *next_sibling;

                        /* One multipath slot for this route and one for
                         * each sibling.
                         */
                        rt6_nh_nlmsg_size(nh, &nexthop_len);

                        list_for_each_entry_safe(sibling, next_sibling,
                                                 &f6i->fib6_siblings, fib6_siblings) {
                                rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len);
                        }
                }
                nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
        } else {
                nexthop_len = nla_total_size(4); /* RTA_NH_ID */
                nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
                                         &nexthop_len);
        }

        size = NLMSG_ALIGN(sizeof(struct rtmsg));
        size += nla_total_size(16);             /* RTA_SRC */
        size += nla_total_size(16);             /* RTA_DST */
        size += nla_total_size(16);             /* RTA_GATEWAY */
        size += nla_total_size(16);             /* RTA_PREFSRC */
        size += nla_total_size(4);              /* RTA_TABLE */
        size += nla_total_size(4);              /* RTA_IIF */
        size += nla_total_size(4);              /* RTA_OIF */
        size += nla_total_size(4);              /* RTA_PRIORITY */
        size += RTAX_MAX * nla_total_size(4);   /* RTA_METRICS */
        size += nla_total_size(sizeof(struct rta_cacheinfo));
        size += nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
        size += nla_total_size(1);              /* RTA_PREF */
        size += nexthop_len;

        return size;
}

/*
 * Append the nexthop information of nexthop object @nh to @skb.
 *
 * A multipath nexthop is emitted as a nested RTA_MULTIPATH attribute; a
 * single nexthop is emitted through fib_nexthop_info(), which may also
 * update *@flags.
 *
 * Returns 0 on success or -EMSGSIZE if any attribute could not be added.
 */
static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
                                 unsigned char *flags)
{
        struct nlattr *nest;

        if (!nexthop_is_multipath(nh)) {
                struct fib6_nh *single = nexthop_fib6_nh(nh);

                if (fib_nexthop_info(skb, &single->nh_common, AF_INET6,
                                     flags, false) < 0)
                        return -EMSGSIZE;

                return 0;
        }

        nest = nla_nest_start_noflag(skb, RTA_MULTIPATH);
        if (!nest)
                return -EMSGSIZE;

        if (nexthop_mpath_fill_node(skb, nh, AF_INET6))
                return -EMSGSIZE;

        nla_nest_end(skb, nest);

        return 0;
}

/*
 * Build one route netlink message describing @rt (and optionally @dst)
 * in @skb.
 *
 * @net:    network namespace, used for source address selection, the
 *          multicast route lookup and the nexthop compat sysctl.
 * @skb:    message buffer being filled.
 * @rt:     FIB entry supplying table, type, protocol, metric and nexthops.
 * @dst:    optional dst entry; when non-NULL its rt6_info supplies the
 *          destination/source keys, flags, gateway, device, metrics,
 *          expiry and error instead of @rt.
 * @dest:   optional address; when set it is emitted as RTA_DST and
 *          rtm_dst_len is forced to 128.
 * @src:    optional address; with CONFIG_IPV6_SUBTREES it is emitted as
 *          RTA_SRC and rtm_src_len is forced to 128.
 * @iif:    input interface index; non-zero emits RTA_IIF, or hands a
 *          multicast destination to ip6mr_get_route() when
 *          CONFIG_IPV6_MROUTE is enabled.
 * @type, @portid, @seq, @flags: passed to nlmsg_put() for the header.
 *
 * Returns 0 on success. On any failure after the header was added the
 * partially built message is cancelled and -EMSGSIZE is returned.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
                         struct fib6_info *rt, struct dst_entry *dst,
                         struct in6_addr *dest, struct in6_addr *src,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags)
{
        struct rt6_info *rt6 = dst_rt6_info(dst);
        struct rt6key *rt6_dst, *rt6_src;
        u32 *pmetrics, table, rt6_flags;
        unsigned char nh_flags = 0;
        struct nlmsghdr *nlh;
        struct rtmsg *rtm;
        long expires = 0;

        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
        if (!nlh)
                return -EMSGSIZE;

        /* Keys and flags come from the dst's rt6_info when one was given,
         * otherwise from the FIB entry itself.
         */
        if (rt6) {
                rt6_dst = &rt6->rt6i_dst;
                rt6_src = &rt6->rt6i_src;
                rt6_flags = rt6->rt6i_flags;
        } else {
                rt6_dst = &rt->fib6_dst;
                rt6_src = &rt->fib6_src;
                rt6_flags = rt->fib6_flags;
        }

        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = rt6_dst->plen;
        rtm->rtm_src_len = rt6_src->plen;
        rtm->rtm_tos = 0;
        if (rt->fib6_table)
                table = rt->fib6_table->tb6_id;
        else
                table = RT6_TABLE_UNSPEC;
        /* rtm_table only carries ids below 256; the full id always goes
         * out in RTA_TABLE.
         */
        rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
        if (nla_put_u32(skb, RTA_TABLE, table))
                goto nla_put_failure;

        rtm->rtm_type = rt->fib6_type;
        rtm->rtm_flags = 0;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        rtm->rtm_protocol = rt->fib6_protocol;

        if (rt6_flags & RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;

        /* An explicit @dest overrides the route's own prefix. */
        if (dest) {
                if (nla_put_in6_addr(skb, RTA_DST, dest))
                        goto nla_put_failure;
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
                if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
                        goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
        if (src) {
                if (nla_put_in6_addr(skb, RTA_SRC, src))
                        goto nla_put_failure;
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len &&
                   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
                goto nla_put_failure;
#endif
        if (iif) {
#ifdef CONFIG_IPV6_MROUTE
                if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
                        int err = ip6mr_get_route(net, skb, rtm, portid);

                        /* On 0 the function returns here without reaching
                         * nlmsg_end() below; a positive value falls through
                         * and the message is completed as usual.
                         */
                        if (err == 0)
                                return 0;
                        if (err < 0)
                                goto nla_put_failure;
                } else
#endif
                        if (nla_put_u32(skb, RTA_IIF, iif))
                                goto nla_put_failure;
        } else if (dest) {
                struct in6_addr saddr_buf;
                if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
                    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
                        goto nla_put_failure;
        }

        /* NOTE(review): this is independent of the branch above, so a
         * message can carry RTA_PREFSRC from both places.
         */
        if (rt->fib6_prefsrc.plen) {
                struct in6_addr saddr_buf;
                saddr_buf = rt->fib6_prefsrc.addr;
                if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
                        goto nla_put_failure;
        }

        pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
        if (rtnetlink_put_metrics(skb, pmetrics) < 0)
                goto nla_put_failure;

        if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
                goto nla_put_failure;

        /* Nexthop information, in order of precedence:
         *  - a dst was given: gateway, device and encap come from it;
         *  - multipath route: this route's nexthop and every sibling are
         *    added inside RTA_MULTIPATH;
         *  - nexthop object: RTA_NH_ID, plus the expanded nexthop when the
         *    nexthop compat sysctl is enabled;
         *  - otherwise: the route's single embedded nexthop.
         */
        if (rt6) {
                if (rt6_flags & RTF_GATEWAY &&
                    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
                        goto nla_put_failure;

                if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
                        goto nla_put_failure;

                if (dst->lwtstate &&
                    lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
                        goto nla_put_failure;
        } else if (rt->fib6_nsiblings) {
                struct fib6_info *sibling, *next_sibling;
                struct nlattr *mp;

                mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
                if (!mp)
                        goto nla_put_failure;

                if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
                                    rt->fib6_nh->fib_nh_weight, AF_INET6,
                                    0) < 0)
                        goto nla_put_failure;

                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->fib6_siblings, fib6_siblings) {
                        if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
                                            sibling->fib6_nh->fib_nh_weight,
                                            AF_INET6, 0) < 0)
                                goto nla_put_failure;
                }

                nla_nest_end(skb, mp);
        } else if (rt->nh) {
                if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
                        goto nla_put_failure;

                if (nexthop_is_blackhole(rt->nh))
                        rtm->rtm_type = RTN_BLACKHOLE;

                if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) &&
                    rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
                        goto nla_put_failure;

                rtm->rtm_flags |= nh_flags;
        } else {
                if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6,
                                     &nh_flags, false) < 0)
                        goto nla_put_failure;

                rtm->rtm_flags |= nh_flags;
        }

        /* Expiry is reported relative to the current jiffies value. */
        if (rt6_flags & RTF_EXPIRES) {
                expires = dst ? dst->expires : rt->expires;
                expires -= jiffies;
        }

        /* Hardware offload state is only reported for FIB entries. */
        if (!dst) {
                if (READ_ONCE(rt->offload))
                        rtm->rtm_flags |= RTM_F_OFFLOAD;
                if (READ_ONCE(rt->trap))
                        rtm->rtm_flags |= RTM_F_TRAP;
                if (READ_ONCE(rt->offload_failed))
                        rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
        }

        if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
                goto nla_put_failure;

        if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
                goto nla_put_failure;


        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        /* Drop everything added since nlmsg_put(). */
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Per-nexthop callback: report 1 when @nh's device is the net_device
 * passed in @arg, 0 otherwise.
 */
static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
{
        const struct net_device *wanted = arg;

        return nh->fib_nh_dev == wanted ? 1 : 0;
}

/*
 * Tell whether any nexthop of @f6i egresses through @dev.
 *
 * Routes backed by a nexthop object are checked through the object's
 * fib6_nh entries; otherwise the route's own nexthop and then each
 * sibling (for multipath routes) are compared against @dev.
 */
static bool fib6_info_uses_dev(const struct fib6_info *f6i,
                               const struct net_device *dev)
{
        struct fib6_info *pos, *tmp;

        if (f6i->nh) {
                /* The iterator takes a non-const cookie. */
                void *cookie = (struct net_device *)dev;

                return nexthop_for_each_fib6_nh(f6i->nh,
                                                fib6_info_nh_uses_dev,
                                                cookie) != 0;
        }

        if (f6i->fib6_nh->fib_nh_dev == dev)
                return true;

        if (!f6i->fib6_nsiblings)
                return false;

        list_for_each_entry_safe(pos, tmp, &f6i->fib6_siblings,
                                 fib6_siblings) {
                if (pos->fib6_nh->fib_nh_dev == dev)
                        return true;
        }

        return false;
}

/* State shared across the per-nexthop exception walk of one route dump. */
struct fib6_nh_exception_dump_walker {
        /* Dump context supplying the netns, output skb and callback. */
        struct rt6_rtnl_dump_arg *dump;
        /* FIB entry whose exceptions are dumped; passed to rt6_fill_node(). */
        struct fib6_info *rt;
        /* Netlink message flags used for every message that is emitted. */
        unsigned int flags;
        /* Remaining number of exception entries to skip before emitting. */
        unsigned int skip;
        /* Entries handled so far: emitted ones plus expired ones passed over. */
        unsigned int count;
};

static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
{
        struct fib6_nh_exception_dump_walker *w = arg;
        struct rt6_rtnl_dump_arg *dump = w->dump;
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        int i, err;

        bucket = fib6_nh_get_excptn_bucket(nh, NULL);
        if (!bucket)
                return 0;

        for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
                        if (w->skip) {
                                w->skip--;
                                continue;
                        }

                        /* Expiration of entries doesn't bump sernum, insertion
                         * does. Removal is triggered by insertion, so we can
                         * rely on the fact that if entries change between two
                         * partial dumps, this node is scanned again completely,
                         * see rt6_insert_exception() and fib6_dump_table().
                         *
                         * Count expired entries we go through as handled
                         * entries that we'll skip next time, in case of partial
                         * node dump. Otherwise, if entries expire meanwhile,
                         * we'll skip the wrong amount.
                         */
                        if (rt6_check_expired(rt6_ex->rt6i)) {
                                w->count++;
                                continue;
                        }

                        err = rt6_fill_node(dump->net, dump->skb, w->rt,
                                            &rt6_ex->rt6i->dst, NULL, NULL, 0,
                                            RTM_NEWROUTE,
                                            NETLINK_CB(dump->cb->skb).portid,
                                            dump->cb->nlh->nlmsg_seq, w->flags);
                        if (err)
                                return err;

                        w->count++;
                }
                bucket++;
        }

        return 0;
}

/* Return -1 if done with node, number of handled routes on partial dump */
int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
{
        struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
        struct fib_dump_filter *filter = &arg->filter;
        unsigned int flags = NLM_F_MULTI;
        struct net *net = arg->net;
        int count = 0;

        if (rt == net->ipv6.fib6_null_entry)
                return -1;

        if ((filter->flags & RTM_F_PREFIX) &&
            !(rt->fib6_flags & RTF_PREFIX_RT)) {
                /* success since this is not a prefix route */
                return -1;
        }
        if (filter->filter_set &&
            ((filter->rt_type  && rt->fib6_type != filter->rt_type) ||
             (filter->dev      && !fib6_info_uses_dev(rt, filter->dev)) ||
             (filter->protocol && rt->fib6_protocol != filter->protocol))) {
                return -1;
        }

        if (filter->filter_set ||
            !filter->dump_routes || !filter->dump_exceptions) {
                flags |= NLM_F_DUMP_FILTERED;
        }

        if (filter->dump_routes) {
                if (skip) {
                        skip--;
                } else {
                        if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
                                          0, RTM_NEWROUTE,
                                          NETLINK_CB(arg->cb->skb).portid,
                                          arg->cb->nlh->nlmsg_seq, flags)) {
                                return 0;
                        }
                        count++;
                }
        }

        if (filter->dump_exceptions) {
                struct fib6_nh_exception_dump_walker w = { .dump = arg,
                                                           .rt = rt,
                                                           .flags = flags,
                                                           .skip = skip,
                                                           .count = 0 };
                int err;

                rcu_read_lock();
                if (rt->nh) {
                        err = nexthop_for_each_fib6_nh(rt->nh,
                                                       rt6_nh_dump_exceptions,
                                                       &w);
                } else {
                        err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
                }
                rcu_read_unlock();

                if (err)
                        return count + w.count;
        }

        return -1;
}

static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
                                        const struct nlmsghdr *nlh,
                                        struct nlattr **tb,
                                        struct netlink_ext_ack *extack)
{
        struct rtmsg *rtm;
        int i, err;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Invalid header for get route request");
                return -EINVAL;
        }

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
                                              rtm_ipv6_policy, extack);

        rtm = nlmsg_data(nlh);
        if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
            (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
            rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
            rtm->rtm_type) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
                return -EINVAL;
        }
        if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Invalid flags for get route request");
                return -EINVAL;
        }

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
                                            rtm_ipv6_policy, extack);
        if (err)
                return err;

        if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
            (tb[RTA_DST] && !rtm->rtm_dst_len)) {
                NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
                return -EINVAL;
        }

        for (i = 0; i <= RTA_MAX; i++) {
                if (!tb[i])
                        continue;

                switch (i) {
                case RTA_SRC:
                case RTA_DST:
                case RTA_IIF:
                case RTA_OIF:
                case RTA_MARK:
                case RTA_UID:
                case RTA_SPORT:
                case RTA_DPORT:
                case RTA_IP_PROTO:
                        break;
                default:
                        NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
                        return -EINVAL;
                }
        }

        return 0;
}

static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[RTA_MAX+1];
        int err, iif = 0, oif = 0;
        struct fib6_info *from;
        struct dst_entry *dst;
        struct rt6_info *rt;
        struct sk_buff *skb;
        struct rtmsg *rtm;
        struct flowi6 fl6 = {};
        bool fibmatch;

        err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
        if (err < 0)
                goto errout;

        err = -EINVAL;
        rtm = nlmsg_data(nlh);
        fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
        fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

        if (tb[RTA_SRC]) {
                if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
                        goto errout;

                fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
        }

        if (tb[RTA_DST]) {
                if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
                        goto errout;

                fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
        }

        if (tb[RTA_IIF])
                iif = nla_get_u32(tb[RTA_IIF]);

        if (tb[RTA_OIF])
                oif = nla_get_u32(tb[RTA_OIF]);

        if (tb[RTA_MARK])
                fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

        if (tb[RTA_UID])
                fl6.flowi6_uid = make_kuid(current_user_ns(),
                                           nla_get_u32(tb[RTA_UID]));
        else
                fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

        if (tb[RTA_SPORT])
                fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

        if (tb[RTA_DPORT])
                fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

        if (tb[RTA_IP_PROTO]) {
                err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
                                                  &fl6.flowi6_proto, AF_INET6,
                                                  extack);
                if (err)
                        goto errout;
        }

        if (iif) {
                struct net_device *dev;
                int flags = 0;

                rcu_read_lock();

                dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
                        rcu_read_unlock();
                        err = -ENODEV;
                        goto errout;
                }

                fl6.flowi6_iif = iif;

                if (!ipv6_addr_any(&fl6.saddr))
                        flags |= RT6_LOOKUP_F_HAS_SADDR;

                dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

                rcu_read_unlock();
        } else {
                fl6.flowi6_oif = oif;

                dst = ip6_route_output(net, NULL, &fl6);
        }


        rt = dst_rt6_info(dst);
        if (rt->dst.error) {
                err = rt->dst.error;
                ip6_rt_put(rt);
                goto errout;
        }

        if (rt == net->ipv6.ip6_null_entry) {
                err = rt->dst.error;
                ip6_rt_put(rt);
                goto errout;
        }

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb) {
                ip6_rt_put(rt);
                err = -ENOBUFS;
                goto errout;
        }

        skb_dst_set(skb, &rt->dst);

        rcu_read_lock();
        from = rcu_dereference(rt->from);
        if (from) {
                if (fibmatch)
                        err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
                                            iif, RTM_NEWROUTE,
                                            NETLINK_CB(in_skb).portid,
                                            nlh->nlmsg_seq, 0);
                else
                        err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
                                            &fl6.saddr, iif, RTM_NEWROUTE,
                                            NETLINK_CB(in_skb).portid,
                                            nlh->nlmsg_seq, 0);
        } else {
                err = -ENETUNREACH;
        }
        rcu_read_unlock();

        if (err < 0) {
                kfree_skb(skb);
                goto errout;
        }

        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
        return err;
}

/*
 * Send a route notification of type @event for @rt to the
 * RTNLGRP_IPV6_ROUTE group, using the portid, request header and netns
 * found in @info and @nlm_flags as netlink message flags.
 *
 * On allocation or fill failure the error is recorded on the group's
 * listeners through rtnl_set_sk_err().
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
                     unsigned int nlm_flags)
{
        struct net *net = info->nl_net;
        int err = -ENOBUFS;
        struct sk_buff *skb;
        u32 seq = 0;

        if (info->nlh)
                seq = info->nlh->nlmsg_seq;

        skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
        if (skb) {
                err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
                                    event, info->portid, seq, nlm_flags);
                if (err >= 0) {
                        rtnl_notify(skb, net, info->portid,
                                    RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any());
                        return;
                }

                /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
        }

        if (err < 0)
                rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

/*
 * Notify RTNLGRP_IPV6_ROUTE listeners that @rt replaced an existing
 * route: an RTM_NEWROUTE message carrying NLM_F_REPLACE is sent.
 *
 * On allocation or fill failure the error is recorded on the group's
 * listeners through rtnl_set_sk_err().
 */
void fib6_rt_update(struct net *net, struct fib6_info *rt,
                    struct nl_info *info)
{
        struct sk_buff *skb;
        int err = -ENOBUFS;
        u32 seq = 0;

        if (info->nlh)
                seq = info->nlh->nlmsg_seq;

        skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
        if (skb) {
                err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
                                    RTM_NEWROUTE, info->portid, seq,
                                    NLM_F_REPLACE);
                if (err >= 0) {
                        rtnl_notify(skb, net, info->portid,
                                    RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any());
                        return;
                }

                /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
        }

        if (err < 0)
                rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

/*
 * Update the hardware offload state flags of @f6i and, depending on the
 * fib_notify_on_flag_change sysctl, send an RTM_NEWROUTE notification
 * reflecting the new flags.
 *
 * @offload, @trap, @offload_failed: new values for the respective flags.
 */
void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
                            bool offload, bool trap, bool offload_failed)
{
        struct sk_buff *skb;
        int err;

        /* All three flags already hold the requested values. */
        if (READ_ONCE(f6i->offload) == offload &&
            READ_ONCE(f6i->trap) == trap &&
            READ_ONCE(f6i->offload_failed) == offload_failed)
                return;

        WRITE_ONCE(f6i->offload, offload);
        WRITE_ONCE(f6i->trap, trap);

        /* 2 means send notifications only if offload_failed was changed. */
        if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 &&
            READ_ONCE(f6i->offload_failed) == offload_failed)
                return;

        WRITE_ONCE(f6i->offload_failed, offload_failed);

        /* No notification when the route was already removed from the
         * tree, or when notifications on flag change are disabled.
         */
        if (!rcu_access_pointer(f6i->fib6_node) ||
            !net->ipv6.sysctl.fib_notify_on_flag_change)
                return;

        skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL);
        if (skb) {
                err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0,
                                    RTM_NEWROUTE, 0, 0, 0);
                if (err >= 0) {
                        rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL,
                                    GFP_KERNEL);
                        return;
                }

                /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
        } else {
                err = -ENOBUFS;
        }

        rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
EXPORT_SYMBOL(fib6_info_hw_flags_set);

/*
 * Netdevice notifier: bind the per-netns special route entries to the
 * loopback device when it registers, and drop their inet6_dev references
 * when it unregisters. Events for non-loopback devices are ignored.
 *
 * Always returns NOTIFY_OK.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);

        if (!(dev->flags & IFF_LOOPBACK))
                return NOTIFY_OK;

        switch (event) {
        case NETDEV_REGISTER:
                net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
                net->ipv6.ip6_null_entry->dst.dev = dev;
                net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
                net->ipv6.ip6_prohibit_entry->dst.dev = dev;
                net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
                net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
                net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
                break;
        case NETDEV_UNREGISTER:
                /* NETDEV_UNREGISTER could be fired for multiple times by
                 * netdev_wait_allrefs(). Make sure we only call this once.
                 */
                if (dev->reg_state == NETREG_UNREGISTERED)
                        break;

                in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
                in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
                in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
                break;
        default:
                break;
        }

        return NOTIFY_OK;
}

/*
 *        /proc
 */

#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
        struct net *net = (struct net *)seq->private;
        seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
                   net->ipv6.rt6_stats->fib_nodes,
                   net->ipv6.rt6_stats->fib_route_nodes,
                   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
                   net->ipv6.rt6_stats->fib_rt_entries,
                   net->ipv6.rt6_stats->fib_rt_cache,
                   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
                   net->ipv6.rt6_stats->fib_discarded_routes);

        return 0;
}
#endif        /* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

/*
 * Handler for the write-only "flush" sysctl: store the written delay and
 * then run FIB garbage collection for the owning netns (kept in
 * ctl->extra1).
 *
 * A positive delay is passed on as the GC expiry with the force flag set;
 * zero or a negative value runs GC with an expiry of 0 and no force.
 *
 * Returns -EINVAL on reads, the proc_dointvec() error if parsing failed,
 * and 0 otherwise.
 */
static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
                              void *buffer, size_t *lenp, loff_t *ppos)
{
        struct net *net;
        int delay;
        int rc;

        if (!write)
                return -EINVAL;

        rc = proc_dointvec(ctl, write, buffer, lenp, ppos);
        if (rc)
                return rc;

        /* Read the delay only after it has been updated above. */
        net = (struct net *)ctl->extra1;
        delay = net->ipv6.sysctl.flush_delay;

        if (delay > 0)
                fib6_run_gc((unsigned long)delay, net, true);
        else
                fib6_run_gc(0, net, false);

        return 0;
}

/*
 * Template for the IPv6 route sysctl entries.
 *
 * The .data pointers here refer to init_net (and the dst ops template).
 * ipv6_route_sysctl_init() copies this table and repoints .data by array
 * index, so the order of the entries below must stay in sync with the
 * indices used there.
 *
 * "gc_min_interval" and "gc_min_interval_ms" are backed by the same
 * variable and differ only in the handler used to convert the value.
 */
static struct ctl_table ipv6_route_table_template[] = {
        {
                .procname        =        "max_size",
                .data                =        &init_net.ipv6.sysctl.ip6_rt_max_size,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec,
        },
        {
                .procname        =        "gc_thresh",
                .data                =        &ip6_dst_ops_template.gc_thresh,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec,
        },
        {
                /* Write-only: a write triggers FIB garbage collection. */
                .procname        =        "flush",
                .data                =        &init_net.ipv6.sysctl.flush_delay,
                .maxlen                =        sizeof(int),
                .mode                =        0200,
                .proc_handler        =        ipv6_sysctl_rtcache_flush
        },
        {
                .procname        =        "gc_min_interval",
                .data                =        &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec_jiffies,
        },
        {
                .procname        =        "gc_timeout",
                .data                =        &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec_jiffies,
        },
        {
                .procname        =        "gc_interval",
                .data                =        &init_net.ipv6.sysctl.ip6_rt_gc_interval,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec_jiffies,
        },
        {
                .procname        =        "gc_elasticity",
                .data                =        &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec,
        },
        {
                .procname        =        "mtu_expires",
                .data                =        &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec_jiffies,
        },
        {
                .procname        =        "min_adv_mss",
                .data                =        &init_net.ipv6.sysctl.ip6_rt_min_advmss,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec,
        },
        {
                .procname        =        "gc_min_interval_ms",
                .data                =        &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec_ms_jiffies,
        },
        {
                /* Boolean: only 0 and 1 are accepted. */
                .procname        =        "skip_notify_on_dev_down",
                .data                =        &init_net.ipv6.sysctl.skip_notify_on_dev_down,
                .maxlen                =        sizeof(u8),
                .mode                =        0644,
                .proc_handler        =        proc_dou8vec_minmax,
                .extra1                =        SYSCTL_ZERO,
                .extra2                =        SYSCTL_ONE,
        },
};

/*
 * Allocate the IPv6 route sysctl table for @net: duplicate the template
 * and repoint every entry's .data at the corresponding field of @net.
 * The "flush" entry additionally records @net in .extra1 for its handler.
 *
 * The indices below follow the entry order of ipv6_route_table_template.
 *
 * Returns the new table, or NULL if the allocation failed.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
        struct ctl_table *table;

        table = kmemdup(ipv6_route_table_template,
                        sizeof(ipv6_route_table_template),
                        GFP_KERNEL);
        if (!table)
                return NULL;

        table[0].data = &net->ipv6.sysctl.ip6_rt_max_size;
        table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;

        table[2].data = &net->ipv6.sysctl.flush_delay;
        table[2].extra1 = net;

        table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
        table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
        table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
        table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
        table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
        table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
        table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
        table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

        return table;
}

/*
 * Number of entries of the IPv6 route sysctl table to use for @net.
 *
 * Namespaces not owned by the initial user namespace get a size of 1
 * instead of the full template length.
 */
size_t ipv6_route_sysctl_table_size(struct net *net)
{
        /* Don't export sysctls to unprivileged users */
        return net->user_ns != &init_user_ns ?
                1 : ARRAY_SIZE(ipv6_route_table_template);
}
#endif

/*
 * Per-namespace initialisation of the IPv6 routing state held in @net.
 *
 * Copies ip6_dst_ops_template into the namespace and calls
 * dst_entries_init() on the copy, allocates the namespace's special route
 * entries from their templates, and stores the default values of the
 * routing sysctl fields.
 *
 * Returns 0 on success.  Every failure path returns -ENOMEM (the initial
 * value of ret) after releasing what had been set up before the failure.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
        int ret = -ENOMEM;

        /* Start from the template dst ops; the copy belongs to this netns. */
        memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
               sizeof(net->ipv6.ip6_dst_ops));

        if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
                goto out_ip6_dst_ops;

        /* fib6_null_entry: allocate, then overwrite with its template. */
        net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
        if (!net->ipv6.fib6_null_entry)
                goto out_ip6_dst_entries;
        memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
               sizeof(*net->ipv6.fib6_null_entry));

        /*
         * ip6_null_entry: duplicate the template, point it at this
         * namespace's dst ops, initialise its metrics from
         * ip6_template_metrics and give it an empty rt_uncached list head.
         */
        net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
                                           sizeof(*net->ipv6.ip6_null_entry),
                                           GFP_KERNEL);
        if (!net->ipv6.ip6_null_entry)
                goto out_fib6_null_entry;
        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
                         ip6_template_metrics, true);
        INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        /* Prohibit and blackhole entries get the same treatment. */
        net->ipv6.fib6_has_custom_rules = false;
        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
                                               sizeof(*net->ipv6.ip6_prohibit_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_prohibit_entry)
                goto out_ip6_null_entry;
        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
                         ip6_template_metrics, true);
        INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached);

        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_blk_hole_entry)
                goto out_ip6_prohibit_entry;
        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
                         ip6_template_metrics, true);
        INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached);
#ifdef CONFIG_IPV6_SUBTREES
        net->ipv6.fib6_routes_require_src = 0;
#endif
#endif

        /*
         * Initial values of the per-namespace fields that the route sysctl
         * table built by ipv6_route_sysctl_init() points at.
         */
        net->ipv6.sysctl.flush_delay = 0;
        net->ipv6.sysctl.ip6_rt_max_size = INT_MAX;
        net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
        net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
        net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
        net->ipv6.sysctl.skip_notify_on_dev_down = 0;

        atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ);

        ret = 0;
out:
        return ret;

        /*
         * Error unwinding: each label releases what was set up before the
         * jump to it, then falls through to the labels below.  ret still
         * holds -ENOMEM on all of these paths.
         */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
        kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
        kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
        kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
        /* Nothing to undo when dst_entries_init() itself failed. */
        goto out;
}

/*
 * Per-namespace teardown matching ip6_route_net_init(): frees the special
 * route entries stored in @net and calls dst_entries_destroy() on the
 * namespace's dst ops.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
        kfree(net->ipv6.fib6_null_entry);
        kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        /* These two only exist with multiple routing tables. */
        kfree(net->ipv6.ip6_prohibit_entry);
        kfree(net->ipv6.ip6_blk_hole_entry);
#endif
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

/*
 * Late per-namespace setup: with CONFIG_PROC_FS, create the "ipv6_route"
 * and "rt6_stats" entries under @net->proc_net.
 *
 * Returns 0 on success (always, without CONFIG_PROC_FS) or -ENOMEM when
 * either entry could not be created.  If "rt6_stats" fails, the already
 * created "ipv6_route" entry is removed again.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
        int err = 0;

#ifdef CONFIG_PROC_FS
        if (!proc_create_net("ipv6_route", 0, net->proc_net,
                             &ipv6_route_seq_ops,
                             sizeof(struct ipv6_route_iter))) {
                err = -ENOMEM;
        } else if (!proc_create_net_single("rt6_stats", 0444, net->proc_net,
                                           rt6_stats_seq_show, NULL)) {
                /* Roll back the first entry so nothing is left behind. */
                remove_proc_entry("ipv6_route", net->proc_net);
                err = -ENOMEM;
        }
#endif
        return err;
}

/*
 * Late per-namespace teardown: with CONFIG_PROC_FS, remove the
 * "ipv6_route" and "rt6_stats" entries created by
 * ip6_route_net_init_late().  Does nothing otherwise.
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        remove_proc_entry("ipv6_route", net->proc_net);
        remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

/*
 * Pernet hooks pairing ip6_route_net_init() with ip6_route_net_exit();
 * registered from ip6_route_init().
 */
static struct pernet_operations ip6_route_net_ops = {
        .init = ip6_route_net_init,
        .exit = ip6_route_net_exit,
};

/*
 * Allocate and initialise the inet_peer_base of @net and store it in
 * net->ipv6.peers.
 *
 * Returns 0 on success, -ENOMEM if the allocation failed (in which case
 * net->ipv6.peers is not written).
 */
static int __net_init ipv6_inetpeer_init(struct net *net)
{
        struct inet_peer_base *peers;

        peers = kmalloc(sizeof(*peers), GFP_KERNEL);
        if (peers) {
                inet_peer_base_init(peers);
                net->ipv6.peers = peers;
                return 0;
        }

        return -ENOMEM;
}

/*
 * Release the inet_peer_base of @net: the pointer in net->ipv6.peers is
 * cleared first, then the tree is invalidated and the base is freed.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
        struct inet_peer_base *peers;

        /* Detach the base from the namespace before tearing it down. */
        peers = net->ipv6.peers;
        net->ipv6.peers = NULL;

        inetpeer_invalidate_tree(peers);
        kfree(peers);
}

/*
 * Pernet hooks that create and destroy net->ipv6.peers; registered from
 * ip6_route_init() before ip6_route_net_ops.
 */
static struct pernet_operations ipv6_inetpeer_ops = {
        .init        =        ipv6_inetpeer_init,
        .exit        =        ipv6_inetpeer_exit,
};

/*
 * Pernet hooks for the late init/exit steps (the /proc entries);
 * registered from ip6_route_init() after fib6_rules_init().
 */
static struct pernet_operations ip6_route_net_late_ops = {
        .init = ip6_route_net_init_late,
        .exit = ip6_route_net_exit_late,
};

/*
 * Netdevice notifier calling ip6_route_dev_notify().  Its priority is 10
 * below ADDRCONF_NOTIFY_PRIORITY.
 * NOTE(review): presumably so that it runs after addrconf's own notifier;
 * confirm against the notifier chain ordering rules.
 */
static struct notifier_block ip6_route_dev_notifier = {
        .notifier_call = ip6_route_dev_notify,
        .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

/*
 * Attach init_net's special route entries to init_net's loopback device:
 * set their device pointers and take an inet6_dev reference through
 * in6_dev_get() for each rt6_info entry.
 */
void __init ip6_route_init_special_entries(void)
{
        /* The loopback device is registered before this code runs, so the
         * loopback reference in the rt6_info entries has not been taken;
         * take it manually here for init_net.
         */
        init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
        init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
        /* Same for the entries that only exist with multiple tables. */
        init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
        init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}

#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/*
 * BPF iterator entry point for the "ipv6_route" target; its arguments are
 * a struct bpf_iter_meta * and the struct fib6_info * being visited.
 */
DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)

/*
 * BTF ID list with a single entry for struct fib6_info; the first element
 * is read by bpf_iter_register().
 */
BTF_ID_LIST(btf_fib6_info_id)
BTF_ID(struct, fib6_info)

/*
 * seq_file description used by the "ipv6_route" BPF iterator.  It uses
 * the same seq_ops and private-state size (struct ipv6_route_iter) as
 * the /proc "ipv6_route" entry created in ip6_route_net_init_late().
 */
static const struct bpf_iter_seq_info ipv6_route_seq_info = {
        .seq_ops                = &ipv6_route_seq_ops,
        .init_seq_private        = bpf_iter_init_seq_net,
        .fini_seq_private        = bpf_iter_fini_seq_net,
        .seq_priv_size                = sizeof(struct ipv6_route_iter),
};

/*
 * Registration record for the "ipv6_route" BPF iterator target.  It
 * describes one context argument: the rt member of
 * struct bpf_iter__ipv6_route, typed PTR_TO_BTF_ID_OR_NULL.  The btf_id
 * of that argument is not set here; bpf_iter_register() fills it in,
 * which is why this object is not const.
 */
static struct bpf_iter_reg ipv6_route_reg_info = {
        .target                        = "ipv6_route",
        .ctx_arg_info_size        = 1,
        .ctx_arg_info                = {
                { offsetof(struct bpf_iter__ipv6_route, rt),
                  PTR_TO_BTF_ID_OR_NULL },
        },
        .seq_info                = &ipv6_route_seq_info,
};

/*
 * Fill in the BTF ID of struct fib6_info for the iterator's context
 * argument, then register the "ipv6_route" iterator target.
 *
 * Returns the result of bpf_iter_reg_target().
 */
static int __init bpf_iter_register(void)
{
        struct bpf_iter_reg *reg = &ipv6_route_reg_info;

        reg->ctx_arg_info[0].btf_id = btf_fib6_info_id[0];

        return bpf_iter_reg_target(reg);
}

/* Unregister the "ipv6_route" BPF iterator target. */
static void bpf_iter_unregister(void)
{
        bpf_iter_unreg_target(&ipv6_route_reg_info);
}
#endif
#endif

/*
 * ip6_route_init - boot-time setup of the IPv6 routing code
 *
 * Steps, in order: create the "ip6_dst_cache" slab cache, initialise the
 * blackhole dst ops, register the inetpeer and route pernet operations,
 * run fib6_init(), xfrm6_init() and fib6_rules_init(), register the late
 * pernet operations, register the RTM_NEWROUTE / RTM_DELROUTE /
 * RTM_GETROUTE rtnetlink handlers, register the netdevice notifier and,
 * when built in with BPF syscall and procfs support, the "ipv6_route" BPF
 * iterator.  Finally the per-cpu rt6_uncached_list heads and locks are
 * initialised.
 *
 * Returns 0 on success or a negative error code.  On failure everything
 * that had been set up before the failing step is undone again, including
 * the netdevice notifier when the BPF iterator registration fails.
 */
int __init ip6_route_init(void)
{
        int ret;
        int cpu;

        ret = -ENOMEM;
        ip6_dst_ops_template.kmem_cachep =
                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
                                  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
        if (!ip6_dst_ops_template.kmem_cachep)
                goto out;

        ret = dst_entries_init(&ip6_dst_blackhole_ops);
        if (ret)
                goto out_kmem_cache;

        ret = register_pernet_subsys(&ipv6_inetpeer_ops);
        if (ret)
                goto out_dst_entries;

        ret = register_pernet_subsys(&ip6_route_net_ops);
        if (ret)
                goto out_register_inetpeer;

        /* The blackhole ops share the slab cache created above. */
        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

        ret = fib6_init();
        if (ret)
                goto out_register_subsys;

        ret = xfrm6_init();
        if (ret)
                goto out_fib6_init;

        ret = fib6_rules_init();
        if (ret)
                goto xfrm6_init;

        ret = register_pernet_subsys(&ip6_route_net_late_ops);
        if (ret)
                goto fib6_rules_init;

        /*
         * A failure of any rtnetlink registration is unwound by
         * rtnl_unregister_all(PF_INET6) under out_register_late_subsys.
         */
        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
                                   inet6_rtm_newroute, NULL, 0);
        if (ret < 0)
                goto out_register_late_subsys;

        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
                                   inet6_rtm_delroute, NULL, 0);
        if (ret < 0)
                goto out_register_late_subsys;

        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
                                   inet6_rtm_getroute, NULL,
                                   RTNL_FLAG_DOIT_UNLOCKED);
        if (ret < 0)
                goto out_register_late_subsys;

        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
        if (ret)
                goto out_register_late_subsys;

#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
        ret = bpf_iter_register();
        if (ret)
                /* The notifier is registered by now and must be dropped. */
                goto out_unregister_notifier;
#endif
#endif

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }

out:
        return ret;

        /*
         * Error unwinding: each label undoes the steps that had completed
         * before the jump to it and falls through to the labels below.
         * The first label is only referenced from the BPF iterator block,
         * so it sits under the same preprocessor guards.
         */
#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
out_unregister_notifier:
        unregister_netdevice_notifier(&ip6_route_dev_notifier);
#endif
#endif
out_register_late_subsys:
        rtnl_unregister_all(PF_INET6);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
        fib6_rules_cleanup();
xfrm6_init:
        xfrm6_fini();
out_fib6_init:
        fib6_gc_cleanup();
out_register_subsys:
        unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
        dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
        goto out;
}

/*
 * Undo ip6_route_init(): unregister the BPF iterator (when it was built),
 * the netdevice notifier and the pernet operations, run the rules, xfrm6
 * and fib6 cleanup routines, and finally destroy the blackhole dst
 * entries and the dst slab cache.
 *
 * The rtnetlink handlers registered by ip6_route_init() are not touched
 * here.
 */
void ip6_route_cleanup(void)
{
#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
        bpf_iter_unregister();
#endif
#endif
        unregister_netdevice_notifier(&ip6_route_dev_notifier);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
        fib6_rules_cleanup();
        xfrm6_fini();
        fib6_gc_cleanup();
        /* Note: inetpeer ops go before the route ops here. */
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
        unregister_pernet_subsys(&ip6_route_net_ops);
        dst_entries_destroy(&ip6_dst_blackhole_ops);
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}








































































    1 


    1 

































    1 



    1 































































































    1 

















































    1 





    1 







    1 







    1 





































    1 





















    1 




    1 







    1 

    1 







    1 

















    1 




























    1 

















    1 





    1 







    1 




    1 























    1 















    1 






    1 
    1 






    1 

    1 






    1 
























































    1 








    1 
















    1 

























    1 


    1 













    1 










































































































































































































































































































































































































































































































































































































































































































































































    1 





























    1 















    1 





















    1 
    1 









    1 



























    1 








    1 










    1 

    1 













    1 

    1 




































































































    1 









    1 











































































    1 



























    1 



































    1 




















































    1 

    1 











































    1 











































































































































































































































































































    1 



































    1 













































































    1 














    1 






















    1 
    1 
























    1 

























    1 
    1 










    1 










    1 










    1 








    1 

    1 























    1 







    1 













    1 
























    1 





































































































    1 









    1 
    1 





    1 











    1 
















    1 








































    1 































































    1 










    1 



    1 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001-2002 Intel Corp.
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions work with the state functions in sctp_sm_statefuns.c
 * to implement the state operations.  These functions implement the
 * steps which require modifying existing data structures.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson          <karl@athena.chicago.il.us>
 *    C. Robin              <chris@hundredacre.ac.uk>
 *    Jon Grimm             <jgrimm@us.ibm.com>
 *    Xingang Guo           <xingang.guo@intel.com>
 *    Dajiang Zhang            <dajiang.zhang@nokia.com>
 *    Sridhar Samudrala            <sri@us.ibm.com>
 *    Daisy Chang            <daisyc@us.ibm.com>
 *    Ardelle Fan            <ardelle.fan@intel.com>
 *    Kevin Gao             <kevin.gao@intel.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <crypto/hash.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <net/sock.h>

#include <linux/skbuff.h>
#include <linux/random.h>        /* for get_random_bytes */
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

/* Forward declarations of file-local (static) helpers that are referenced
 * by the chunk constructors below before their definitions appear.
 */

/* Chunk allocators: control chunk, DATA chunk, and the common worker. */
static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc,
                                            __u8 type, __u8 flags, int paylen,
                                            gfp_t gfp);
static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc,
                                         __u8 flags, int paylen, gfp_t gfp);
static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
                                           __u8 type, __u8 flags, int paylen,
                                           gfp_t gfp);
/* Builds the State Cookie parameter; the length is returned via @cookie_len. */
static struct sctp_cookie_param *sctp_pack_cookie(
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const struct sctp_chunk *init_chunk,
                                        int *cookie_len,
                                        const __u8 *raw_addrs, int addrs_len);
/* Parameter processing / appending helpers. */
static int sctp_process_param(struct sctp_association *asoc,
                              union sctp_params param,
                              const union sctp_addr *peer_addr,
                              gfp_t gfp);
static void *sctp_addto_param(struct sctp_chunk *chunk, int len,
                              const void *data);

/* Control chunk skb destructor (installed by sctp_control_set_owner_w()).
 *
 * If the chunk holds a shared-key reference, drop it.  Just before the
 * drop, a deactivated key that is still linked on a key list and whose
 * refcount reads exactly 2 gets an SCTP_AUTH_FREE_KEY event queued to the
 * association's ULP queue.
 */
static void sctp_control_release_owner(struct sk_buff *skb)
{
        struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg;
        struct sctp_shared_key *key = chunk->shkey;
        struct sctp_association *asoc;
        struct sctp_ulpevent *event;

        /* Chunks that never took a key reference have nothing to undo. */
        if (!key)
                return;

        /* refcnt == 2 and !list_empty mean that once this reference is
         * released the key is no longer in use anywhere, so it is time to
         * notify userland that a deactivated key can be freed.
         */
        if (key->deactivated && !list_empty(&key->key_list) &&
            refcount_read(&key->refcnt) == 2) {
                asoc = chunk->asoc;

                /* NOTE(review): GFP_KERNEL may sleep -- confirm that every
                 * path freeing this skb runs in a sleepable context.
                 */
                event = sctp_ulpevent_make_authkey(asoc, key->key_id,
                                                   SCTP_AUTH_FREE_KEY,
                                                   GFP_KERNEL);
                /* Event allocation failure is tolerated silently. */
                if (event)
                        asoc->stream.si->enqueue_event(&asoc->ulpq, event);
        }

        sctp_auth_shkey_release(chunk->shkey);
}

/* Record ownership information on a control chunk's skb.
 *
 * For chunks flagged for authentication, the association's current shared
 * key is pinned on the chunk (a reference is taken).  The skb is then tied
 * to the association's socket, or to no socket when the chunk has no
 * association, and sctp_control_release_owner() is installed as its
 * destructor with the chunk as argument.
 */
static void sctp_control_set_owner_w(struct sctp_chunk *chunk)
{
        struct sk_buff *skb = chunk->skb;
        struct sctp_association *asoc = chunk->asoc;

        /* TODO: properly account for control chunks.
         * To do it right we'll need:
         *  1) endpoint if association isn't known.
         *  2) proper memory accounting.
         *
         *  For now, no accounting is done.
         */
        if (chunk->auth) {
                chunk->shkey = asoc->shkey;
                sctp_auth_shkey_hold(chunk->shkey);
        }

        if (asoc)
                skb->sk = asoc->base.sk;
        else
                skb->sk = NULL;

        skb_shinfo(skb)->destructor_arg = chunk;
        skb->destructor = sctp_control_release_owner;
}

/* Return the inbound interface for @chunk, as reported by the skb_iif()
 * operation of the address family recorded in the skb's SCTP input
 * control block.
 */
int sctp_chunk_iif(const struct sctp_chunk *chunk)
{
        return SCTP_INPUT_CB(chunk->skb)->af->skb_iif(chunk->skb);
}

/* RFC 2960 3.3.2 Initiation (INIT) (1)
 *
 * Note 2: The ECN capable field is reserved for future use of
 * Explicit Congestion Notification.
 */
/* Pre-built, header-only ECN Capable parameter: the length field equals the
 * size of the parameter header itself, so no value bytes follow.
 * NOTE(review): the positional initializers rely on the member order of
 * struct sctp_paramhdr (presumably type, then length) -- confirm against
 * the header.
 */
static const struct sctp_paramhdr ecap_param = {
        SCTP_PARAM_ECN_CAPABLE,
        cpu_to_be16(sizeof(struct sctp_paramhdr)),
};
/* Pre-built, header-only Forward-TSN-supported (PR-SCTP) parameter; same
 * layout as ecap_param.
 */
static const struct sctp_paramhdr prsctp_param = {
        SCTP_PARAM_FWD_TSN_SUPPORT,
        cpu_to_be16(sizeof(struct sctp_paramhdr)),
};

/* A helper to initialize an op error inside a provided chunk, as most
 * cause codes will be embedded inside an abort chunk.
 */
int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
                    size_t paylen)
{
        struct sctp_errhdr err;
        __u16 len;

        /* Cause code constants are now defined in network order.  */
        err.cause = cause_code;
        len = sizeof(err) + paylen;
        err.length = htons(len);

        if (skb_tailroom(chunk->skb) < len)
                return -ENOSPC;

        chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err);

        return 0;
}

/* 3.3.2 Initiation (INIT) (1)
 *
 * This chunk is used to initiate a SCTP association between two
 * endpoints. The format of the INIT chunk is shown below:
 *
 *     0                   1                   2                   3
 *     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    |   Type = 1    |  Chunk Flags  |      Chunk Length             |
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    |                         Initiate Tag                          |
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    |           Advertised Receiver Window Credit (a_rwnd)          |
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    |  Number of Outbound Streams   |  Number of Inbound Streams    |
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    |                          Initial TSN                          |
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    \                                                               \
 *    /              Optional/Variable-Length Parameters              /
 *    \                                                               \
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *
 * The INIT chunk contains the following parameters. Unless otherwise
 * noted, each parameter MUST only be included once in the INIT chunk.
 *
 * Fixed Parameters                     Status
 * ----------------------------------------------
 * Initiate Tag                        Mandatory
 * Advertised Receiver Window Credit   Mandatory
 * Number of Outbound Streams          Mandatory
 * Number of Inbound Streams           Mandatory
 * Initial TSN                         Mandatory
 *
 * Variable Parameters                  Status     Type Value
 * -------------------------------------------------------------
 * IPv4 Address (Note 1)               Optional    5
 * IPv6 Address (Note 1)               Optional    6
 * Cookie Preservative                 Optional    9
 * Reserved for ECN Capable (Note 2)   Optional    32768 (0x8000)
 * Host Name Address (Note 3)          Optional    11
 * Supported Address Types (Note 4)    Optional    12
 */
/* Build an INIT chunk for @asoc.
 *
 * @asoc:       association the chunk is built for; its endpoint and socket
 *              settings decide which optional parameters are emitted.
 * @bp:         bind address list that is converted to raw address
 *              parameters and embedded in the chunk.
 * @gfp:        allocation flags used for the address conversion and for
 *              the chunk itself.
 * @vparam_len: extra bytes added to the requested chunk size; nothing is
 *              written for them here (presumably the caller appends more
 *              parameters afterwards -- confirm against callers).
 *
 * The function first sums up the space every parameter needs, allocates
 * the chunk, and then appends the parameters.  The size accounting in the
 * first half must stay in step with what is appended in the second half.
 *
 * Returns the new chunk, or NULL if sctp_make_control() fails.  The
 * temporary raw address buffer is freed on every path.
 */
struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
                                  const struct sctp_bind_addr *bp,
                                  gfp_t gfp, int vparam_len)
{
        struct sctp_supported_ext_param ext_param;
        struct sctp_adaptation_ind_param aiparam;
        struct sctp_paramhdr *auth_chunks = NULL;
        struct sctp_paramhdr *auth_hmacs = NULL;
        struct sctp_supported_addrs_param sat;
        struct sctp_endpoint *ep = asoc->ep;
        struct sctp_chunk *retval = NULL;
        int num_types, addrs_len = 0;
        struct sctp_inithdr init;
        union sctp_params addrs;
        struct sctp_sock *sp;
        /* Room for ASCONF, ASCONF-ACK, RECONF, I-DATA and AUTH. */
        __u8 extensions[5];
        size_t chunksize;
        __be16 types[2];
        int num_ext = 0;

        /* RFC 2960 3.3.2 Initiation (INIT) (1)
         *
         * Note 1: The INIT chunks can contain multiple addresses that
         * can be IPv4 and/or IPv6 in any combination.
         */

        /* Convert the provided bind address list to raw format.
         * NOTE(review): the result is not checked here; presumably a failed
         * conversion leaves addrs.v NULL and addrs_len 0 -- confirm.
         */
        addrs = sctp_bind_addrs_to_raw(bp, &addrs_len, gfp);

        /* Fixed INIT header fields, converted to network byte order. */
        init.init_tag                   = htonl(asoc->c.my_vtag);
        init.a_rwnd                   = htonl(asoc->rwnd);
        init.num_outbound_streams  = htons(asoc->c.sinit_num_ostreams);
        init.num_inbound_streams   = htons(asoc->c.sinit_max_instreams);
        init.initial_tsn           = htonl(asoc->c.initial_tsn);

        /* How many address types are needed? */
        sp = sctp_sk(asoc->base.sk);
        num_types = sp->pf->supported_addrs(sp, types);

        /* Size accounting starts here: header, addresses and the padded
         * Supported Address Types parameter are always present.
         */
        chunksize = sizeof(init) + addrs_len;
        chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types));

        if (asoc->ep->ecn_enable)
                chunksize += sizeof(ecap_param);

        if (asoc->ep->prsctp_enable)
                chunksize += sizeof(prsctp_param);

        /* ADDIP: Section 4.2.7:
         *  An implementation supporting this extension [ADDIP] MUST list
         *  the ASCONF,the ASCONF-ACK, and the AUTH  chunks in its INIT and
         *  INIT-ACK parameters.
         */
        if (asoc->ep->asconf_enable) {
                extensions[num_ext] = SCTP_CID_ASCONF;
                extensions[num_ext+1] = SCTP_CID_ASCONF_ACK;
                num_ext += 2;
        }

        if (asoc->ep->reconf_enable) {
                extensions[num_ext] = SCTP_CID_RECONF;
                num_ext += 1;
        }

        if (sp->adaptation_ind)
                chunksize += sizeof(aiparam);

        if (asoc->ep->intl_enable) {
                extensions[num_ext] = SCTP_CID_I_DATA;
                num_ext += 1;
        }

        /* Space reserved on behalf of the caller; not filled in here. */
        chunksize += vparam_len;

        /* Account for AUTH related parameters */
        if (ep->auth_enable) {
                /* Add random parameter length */
                chunksize += sizeof(asoc->c.auth_random);

                /* Add HMACS parameter length if any were defined; a zero
                 * length clears the pointer so the parameter is skipped
                 * when the chunk is filled in.
                 */
                auth_hmacs = (struct sctp_paramhdr *)asoc->c.auth_hmacs;
                if (auth_hmacs->length)
                        chunksize += SCTP_PAD4(ntohs(auth_hmacs->length));
                else
                        auth_hmacs = NULL;

                /* Add CHUNKS parameter length, with the same zero-length
                 * handling as for HMACS.
                 */
                auth_chunks = (struct sctp_paramhdr *)asoc->c.auth_chunks;
                if (auth_chunks->length)
                        chunksize += SCTP_PAD4(ntohs(auth_chunks->length));
                else
                        auth_chunks = NULL;

                extensions[num_ext] = SCTP_CID_AUTH;
                num_ext += 1;
        }

        /* If we have any extensions to report, account for that */
        if (num_ext)
                chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext);

        /* RFC 2960 3.3.2 Initiation (INIT) (1)
         *
         * Note 3: An INIT chunk MUST NOT contain more than one Host
         * Name address parameter. Moreover, the sender of the INIT
         * MUST NOT combine any other address types with the Host Name
         * address in the INIT. The receiver of INIT MUST ignore any
         * other address types if the Host Name address parameter is
         * present in the received INIT chunk.
         *
         * PLEASE DO NOT FIXME [This version does not support Host Name.]
         */

        retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize, gfp);
        if (!retval)
                goto nodata;

        /* Fill-in phase: fixed header first, then the raw addresses. */
        retval->subh.init_hdr =
                sctp_addto_chunk(retval, sizeof(init), &init);
        retval->param_hdr.v =
                sctp_addto_chunk(retval, addrs_len, addrs.v);

        /* RFC 2960 3.3.2 Initiation (INIT) (1)
         *
         * Note 4: This parameter, when present, specifies all the
         * address types the sending endpoint can support. The absence
         * of this parameter indicates that the sending endpoint can
         * support any address type.
         */
        sat.param_hdr.type = SCTP_PARAM_SUPPORTED_ADDRESS_TYPES;
        sat.param_hdr.length = htons(SCTP_SAT_LEN(num_types));
        sctp_addto_chunk(retval, sizeof(sat), &sat);
        sctp_addto_chunk(retval, num_types * sizeof(__u16), &types);

        if (asoc->ep->ecn_enable)
                sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param);

        /* Add the supported extensions parameter.  Be nice and add this
         * first, before adding the parameters for the extensions themselves.
         */
        if (num_ext) {
                ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT;
                ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext);
                sctp_addto_chunk(retval, sizeof(ext_param), &ext_param);
                sctp_addto_param(retval, num_ext, extensions);
        }

        if (asoc->ep->prsctp_enable)
                sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);

        if (sp->adaptation_ind) {
                aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
                aiparam.param_hdr.length = htons(sizeof(aiparam));
                aiparam.adaptation_ind = htonl(sp->adaptation_ind);
                sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
        }

        /* Add SCTP-AUTH chunks to the parameter list */
        if (ep->auth_enable) {
                sctp_addto_chunk(retval, sizeof(asoc->c.auth_random),
                                 asoc->c.auth_random);
                if (auth_hmacs)
                        sctp_addto_chunk(retval, ntohs(auth_hmacs->length),
                                        auth_hmacs);
                if (auth_chunks)
                        sctp_addto_chunk(retval, ntohs(auth_chunks->length),
                                        auth_chunks);
        }
nodata:
        /* The raw address buffer was only needed while building the chunk. */
        kfree(addrs.v);
        return retval;
}

/* Build an INIT ACK chunk for @asoc in reply to the INIT in @chunk.
 *
 * @asoc:         association the reply is built for; the peer capability
 *                flags recorded on it decide which optional parameters are
 *                emitted.
 * @chunk:        the received INIT chunk; it is handed to the cookie packer
 *                and its transport selects where the reply is sent.
 * @gfp:          allocation flags for the address conversion and the chunk.
 * @unkparam_len: extra bytes added to the requested chunk size for
 *                reporting unknown parameters; nothing is written for them
 *                here.
 *
 * Returns the new chunk, or NULL if packing the cookie or allocating the
 * chunk fails.  The temporary cookie and raw address buffers are freed
 * before returning on every path.
 */
struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
                                      const struct sctp_chunk *chunk,
                                      gfp_t gfp, int unkparam_len)
{
        struct sctp_supported_ext_param ext_param;
        struct sctp_adaptation_ind_param aiparam;
        struct sctp_paramhdr *auth_chunks = NULL;
        struct sctp_paramhdr *auth_random = NULL;
        struct sctp_paramhdr *auth_hmacs = NULL;
        struct sctp_chunk *retval = NULL;
        struct sctp_cookie_param *cookie;
        struct sctp_inithdr initack;
        union sctp_params addrs;
        struct sctp_sock *sp;
        /* Room for ASCONF, ASCONF-ACK, RECONF, I-DATA and AUTH. */
        __u8 extensions[5];
        size_t chunksize;
        int num_ext = 0;
        int cookie_len;
        int addrs_len;

        /* Note: there may be no addresses to embed. */
        addrs = sctp_bind_addrs_to_raw(&asoc->base.bind_addr, &addrs_len, gfp);

        /* Fixed header fields, converted to network byte order. */
        initack.init_tag                = htonl(asoc->c.my_vtag);
        initack.a_rwnd                        = htonl(asoc->rwnd);
        initack.num_outbound_streams        = htons(asoc->c.sinit_num_ostreams);
        initack.num_inbound_streams        = htons(asoc->c.sinit_max_instreams);
        initack.initial_tsn                = htonl(asoc->c.initial_tsn);

        /* FIXME:  We really ought to build the cookie right
         * into the packet instead of allocating more fresh memory.
         */
        cookie = sctp_pack_cookie(asoc->ep, asoc, chunk, &cookie_len,
                                  addrs.v, addrs_len);
        if (!cookie)
                goto nomem_cookie;

        /* Calculate the total size of allocation, include the reserved
         * space for reporting unknown parameters if it is specified.
         */
        sp = sctp_sk(asoc->base.sk);
        chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len;

        /* Tell peer that we'll do ECN only if peer advertised such cap.  */
        if (asoc->peer.ecn_capable)
                chunksize += sizeof(ecap_param);

        if (asoc->peer.prsctp_capable)
                chunksize += sizeof(prsctp_param);

        /* Collect the chunk types for the Supported Extensions parameter,
         * based on what the peer is recorded as being capable of.
         */
        if (asoc->peer.asconf_capable) {
                extensions[num_ext] = SCTP_CID_ASCONF;
                extensions[num_ext+1] = SCTP_CID_ASCONF_ACK;
                num_ext += 2;
        }

        if (asoc->peer.reconf_capable) {
                extensions[num_ext] = SCTP_CID_RECONF;
                num_ext += 1;
        }

        if (sp->adaptation_ind)
                chunksize += sizeof(aiparam);

        if (asoc->peer.intl_capable) {
                extensions[num_ext] = SCTP_CID_I_DATA;
                num_ext += 1;
        }

        if (asoc->peer.auth_capable) {
                /* NOTE(review): unlike HMACS and CHUNKS below, the random
                 * parameter length is added without SCTP_PAD4(); presumably
                 * it is always a multiple of 4 -- confirm.
                 */
                auth_random = (struct sctp_paramhdr *)asoc->c.auth_random;
                chunksize += ntohs(auth_random->length);

                /* A zero length clears the pointer so the parameter is
                 * skipped when the chunk is filled in.
                 */
                auth_hmacs = (struct sctp_paramhdr *)asoc->c.auth_hmacs;
                if (auth_hmacs->length)
                        chunksize += SCTP_PAD4(ntohs(auth_hmacs->length));
                else
                        auth_hmacs = NULL;

                auth_chunks = (struct sctp_paramhdr *)asoc->c.auth_chunks;
                if (auth_chunks->length)
                        chunksize += SCTP_PAD4(ntohs(auth_chunks->length));
                else
                        auth_chunks = NULL;

                extensions[num_ext] = SCTP_CID_AUTH;
                num_ext += 1;
        }

        if (num_ext)
                chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext);

        /* Now allocate and fill out the chunk.  */
        retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp);
        if (!retval)
                goto nomem_chunk;

        /* RFC 2960 6.4 Multi-homed SCTP Endpoints
         *
         * An endpoint SHOULD transmit reply chunks (e.g., SACK,
         * HEARTBEAT ACK, etc.) to the same destination transport
         * address from which it received the DATA or control chunk
         * to which it is replying.
         *
         * [INIT ACK back to where the INIT came from.]
         */
        if (chunk->transport)
                retval->transport =
                        sctp_assoc_lookup_paddr(asoc,
                                                &chunk->transport->ipaddr);

        /* Append order: header, addresses, cookie, then the optional
         * parameters that were accounted for above.
         */
        retval->subh.init_hdr =
                sctp_addto_chunk(retval, sizeof(initack), &initack);
        retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v);
        sctp_addto_chunk(retval, cookie_len, cookie);
        if (asoc->peer.ecn_capable)
                sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param);
        if (num_ext) {
                ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT;
                ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext);
                sctp_addto_chunk(retval, sizeof(ext_param), &ext_param);
                sctp_addto_param(retval, num_ext, extensions);
        }
        if (asoc->peer.prsctp_capable)
                sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);

        if (sp->adaptation_ind) {
                aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
                aiparam.param_hdr.length = htons(sizeof(aiparam));
                aiparam.adaptation_ind = htonl(sp->adaptation_ind);
                sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
        }

        if (asoc->peer.auth_capable) {
                sctp_addto_chunk(retval, ntohs(auth_random->length),
                                 auth_random);
                if (auth_hmacs)
                        sctp_addto_chunk(retval, ntohs(auth_hmacs->length),
                                        auth_hmacs);
                if (auth_chunks)
                        sctp_addto_chunk(retval, ntohs(auth_chunks->length),
                                        auth_chunks);
        }

        /* We need to remove the const qualifier at this point.  */
        retval->asoc = (struct sctp_association *) asoc;

        /* The labels below are also reached on success: the cookie and the
         * raw addresses have been copied into the chunk by now.
         */
nomem_chunk:
        kfree(cookie);
nomem_cookie:
        kfree(addrs.v);
        return retval;
}

/* 3.3.11 Cookie Echo (COOKIE ECHO) (10):
 *
 * This chunk is used only during the initialization of an association.
 * It is sent by the initiator of an association to its peer to complete
 * the initialization process. This chunk MUST precede any DATA chunk
 * sent within the association, but MAY be bundled with one or more DATA
 * chunks in the same packet.
 *
 *      0                   1                   2                   3
 *      0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |   Type = 10   |Chunk  Flags   |         Length                |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     /                     Cookie                                    /
 *     \                                                               \
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * Chunk Flags: 8 bit
 *
 *   Set to zero on transmit and ignored on receipt.
 *
 * Length: 16 bits (unsigned integer)
 *
 *   Set to the size of the chunk in bytes, including the 4 bytes of
 *   the chunk header and the size of the Cookie.
 *
 * Cookie: variable size
 *
 *   This field must contain the exact cookie received in the
 *   State Cookie parameter from the previous INIT ACK.
 *
 *   An implementation SHOULD make the cookie as small as possible
 *   to insure interoperability.
 */
struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc,
                                         const struct sctp_chunk *chunk)
{
        struct sctp_chunk *retval;
        int cookie_len;
        void *cookie;

        cookie = asoc->peer.cookie;
        cookie_len = asoc->peer.cookie_len;

        /* Build a cookie echo chunk.  */
        retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0,
                                   cookie_len, GFP_ATOMIC);
        if (!retval)
                goto nodata;
        retval->subh.cookie_hdr =
                sctp_addto_chunk(retval, cookie_len, cookie);

        /* RFC 2960 6.4 Multi-homed SCTP Endpoints
         *
         * An endpoint SHOULD transmit reply chunks (e.g., SACK,
         * HEARTBEAT ACK, * etc.) to the same destination transport
         * address from which it * received the DATA or control chunk
         * to which it is replying.
         *
         * [COOKIE ECHO back to where the INIT ACK came from.]
         */
        if (chunk)
                retval->transport = chunk->transport;

nodata:
        return retval;
}

/* 3.3.12 Cookie Acknowledgement (COOKIE ACK) (11):
 *
 * This chunk is used only during the initialization of an
 * association.  It is used to acknowledge the receipt of a COOKIE
 * ECHO chunk.  This chunk MUST precede any DATA or SACK chunk sent
 * within the association, but MAY be bundled with one or more DATA
 * chunks or SACK chunk in the same SCTP packet.
 *
 *      0                   1                   2                   3
 *      0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |   Type = 11   |Chunk  Flags   |     Length = 4                |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * Chunk Flags: 8 bits
 *
 *   Set to zero on transmit and ignored on receipt.
 */
struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc,
                                        const struct sctp_chunk *chunk)
{
        struct sctp_chunk *retval;

        retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0, GFP_ATOMIC);

        /* RFC 2960 6.4 Multi-homed SCTP Endpoints
         *
         * An endpoint SHOULD transmit reply chunks (e.g., SACK,
         * HEARTBEAT ACK, * etc.) to the same destination transport
         * address from which it * received the DATA or control chunk
         * to which it is replying.
         *
         * [COOKIE ACK back to where the COOKIE ECHO came from.]
         */
        if (retval && chunk && chunk->transport)
                retval->transport =
                        sctp_assoc_lookup_paddr(asoc,
                                                &chunk->transport->ipaddr);

        return retval;
}

/*
 *  Appendix A: Explicit Congestion Notification:
 *  CWR:
 *
 *  RFC 2481 details a specific bit for a sender to send in the header of
 *  its next outbound TCP segment to indicate to its peer that it has
 *  reduced its congestion window.  This is termed the CWR bit.  For
 *  SCTP the same indication is made by including the CWR chunk.
 *  This chunk contains one data element, i.e. the TSN number that
 *  was sent in the ECNE chunk.  This element represents the lowest
 *  TSN number in the datagram that was originally marked with the
 *  CE bit.
 *
 *     0                   1                   2                   3
 *     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    | Chunk Type=13 | Flags=00000000|    Chunk Length = 8           |
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    |                      Lowest TSN Number                        |
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 *     Note: The CWR is considered a Control chunk.
 */
struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc,
                                 const __u32 lowest_tsn,
                                 const struct sctp_chunk *chunk)
{
        struct sctp_chunk *retval;
        struct sctp_cwrhdr cwr;

        cwr.lowest_tsn = htonl(lowest_tsn);
        retval = sctp_make_control(asoc, SCTP_CID_ECN_CWR, 0,
                                   sizeof(cwr), GFP_ATOMIC);

        if (!retval)
                goto nodata;

        retval->subh.ecn_cwr_hdr =
                sctp_addto_chunk(retval, sizeof(cwr), &cwr);

        /* RFC 2960 6.4 Multi-homed SCTP Endpoints
         *
         * An endpoint SHOULD transmit reply chunks (e.g., SACK,
         * HEARTBEAT ACK, * etc.) to the same destination transport
         * address from which it * received the DATA or control chunk
         * to which it is replying.
         *
         * [Report a reduced congestion window back to where the ECNE
         * came from.]
         */
        if (chunk)
                retval->transport = chunk->transport;

nodata:
        return retval;
}

/* Make an ECNE chunk.  This is a congestion experienced report.  */
struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc,
                                  const __u32 lowest_tsn)
{
        struct sctp_chunk *retval;
        struct sctp_ecnehdr ecne;

        ecne.lowest_tsn = htonl(lowest_tsn);
        retval = sctp_make_control(asoc, SCTP_CID_ECN_ECNE, 0,
                                   sizeof(ecne), GFP_ATOMIC);
        if (!retval)
                goto nodata;
        retval->subh.ecne_hdr =
                sctp_addto_chunk(retval, sizeof(ecne), &ecne);

nodata:
        return retval;
}

/* Build a DATA chunk for @asoc that has its header filled in but no
 * payload yet.
 *
 * @asoc:  association the chunk belongs to.
 * @sinfo: send information; supplies the ppid, the stream number and the
 *         unordered flag, and is copied into the chunk.
 * @len:   payload size to reserve after the DATA header.
 * @flags: chunk flags; SCTP_DATA_UNORDERED is added when @sinfo requests
 *         an unordered send.
 * @gfp:   allocation flags.
 *
 * Returns the new chunk, or NULL if it could not be allocated.
 */
struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc,
                                            const struct sctp_sndrcvinfo *sinfo,
                                            int len, __u8 flags, gfp_t gfp)
{
        struct sctp_datahdr hdr;
        struct sctp_chunk *data;

        /* The TSN is assigned as LATE as possible, not at creation time,
         * so every field other than ppid and stream stays zero here.
         */
        memset(&hdr, 0, sizeof(hdr));
        hdr.ppid = sinfo->sinfo_ppid;
        hdr.stream = htons(sinfo->sinfo_stream);

        /* Propagate a request for an unordered send into the flags. */
        if (sinfo->sinfo_flags & SCTP_UNORDERED)
                flags |= SCTP_DATA_UNORDERED;

        data = sctp_make_data(asoc, flags, sizeof(hdr) + len, gfp);
        if (data) {
                data->subh.data_hdr =
                        sctp_addto_chunk(data, sizeof(hdr), &hdr);
                memcpy(&data->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo));
        }

        return data;
}

/* Create a selective ackowledgement (SACK) for the given
 * association.  This reports on which TSN's we've seen to date,
 * including duplicates and gaps.
 */
struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc)
{
        struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map;
        struct sctp_gap_ack_block gabs[SCTP_MAX_GABS];
        __u16 num_gabs, num_dup_tsns;
        struct sctp_transport *trans;
        struct sctp_chunk *retval;
        struct sctp_sackhdr sack;
        __u32 ctsn;
        int len;

        memset(gabs, 0, sizeof(gabs));
        ctsn = sctp_tsnmap_get_ctsn(map);

        pr_debug("%s: sackCTSNAck sent:0x%x\n", __func__, ctsn);

        /* How much room is needed in the chunk? */
        num_gabs = sctp_tsnmap_num_gabs(map, gabs);
        num_dup_tsns = sctp_tsnmap_num_dups(map);

        /* Initialize the SACK header.  */
        sack.cum_tsn_ack            = htonl(ctsn);
        sack.a_rwnd                     = htonl(asoc->a_rwnd);
        sack.num_gap_ack_blocks     = htons(num_gabs);
        sack.num_dup_tsns           = htons(num_dup_tsns);

        len = sizeof(sack)
                + sizeof(struct sctp_gap_ack_block) * num_gabs
                + sizeof(__u32) * num_dup_tsns;

        /* Create the chunk.  */
        retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len, GFP_ATOMIC);
        if (!retval)
                goto nodata;

        /* RFC 2960 6.4 Multi-homed SCTP Endpoints
         *
         * An endpoint SHOULD transmit reply chunks (e.g., SACK,
         * HEARTBEAT ACK, etc.) to the same destination transport
         * address from which it received the DATA or control chunk to
         * which it is replying.  This rule should also be followed if
         * the endpoint is bundling DATA chunks together with the
         * reply chunk.
         *
         * However, when acknowledging multiple DATA chunks received
         * in packets from different source addresses in a single
         * SACK, the SACK chunk may be transmitted to one of the
         * destination transport addresses from which the DATA or
         * control chunks being acknowledged were received.
         *
         * [BUG:  We do not implement the following paragraph.
         * Perhaps we should remember the last transport we used for a
         * SACK and avoid that (if possible) if we have seen any
         * duplicates. --piggy]
         *
         * When a receiver of a duplicate DATA chunk sends a SACK to a
         * multi- homed endpoint it MAY be beneficial to vary the
         * destination address and not use the source address of the
         * DATA chunk.  The reason being that receiving a duplicate
         * from a multi-homed endpoint might indicate that the return
         * path (as specified in the source address of the DATA chunk)
         * for the SACK is broken.
         *
         * [Send to the address from which we last received a DATA chunk.]
         */
        retval->transport = asoc->peer.last_data_from;

        retval->subh.sack_hdr =
                sctp_addto_chunk(retval, sizeof(sack), &sack);

        /* Add the gap ack block information.   */
        if (num_gabs)
                sctp_addto_chunk(retval, sizeof(__u32) * num_gabs,
                                 gabs);

        /* Add the duplicate TSN information.  */
        if (num_dup_tsns) {
                asoc->stats.idupchunks += num_dup_tsns;
                sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns,
                                 sctp_tsnmap_get_dups(map));
        }
        /* Once we have a sack generated, check to see what our sack
         * generation is, if its 0, reset the transports to 0, and reset
         * the association generation to 1
         *
         * The idea is that zero is never used as a valid generation for the
         * association so no transport will match after a wrap event like this,
         * Until the next sack
         */
        if (++asoc->peer.sack_generation == 0) {
                list_for_each_entry(trans, &asoc->peer.transport_addr_list,
                                    transports)
                        trans->sack_generation = 0;
                asoc->peer.sack_generation = 1;
        }
nodata:
        return retval;
}

/* Make a SHUTDOWN chunk. */
struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc,
                                      const struct sctp_chunk *chunk)
{
        struct sctp_shutdownhdr shut;
        struct sctp_chunk *retval;
        __u32 ctsn;

        ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
        shut.cum_tsn_ack = htonl(ctsn);

        retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0,
                                   sizeof(shut), GFP_ATOMIC);
        if (!retval)
                goto nodata;

        retval->subh.shutdown_hdr =
                sctp_addto_chunk(retval, sizeof(shut), &shut);

        if (chunk)
                retval->transport = chunk->transport;
nodata:
        return retval;
}

/* Build a SHUTDOWN ACK chunk; no payload is reserved beyond the chunk
 * header.  Returns the new chunk, or NULL if sctp_make_control() fails.
 */
struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc,
                                          const struct sctp_chunk *chunk)
{
        struct sctp_chunk *ack;

        ack = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0, GFP_ATOMIC);
        if (!ack)
                return NULL;

        /* RFC 2960 6.4 Multi-homed SCTP Endpoints
         *
         * An endpoint SHOULD transmit reply chunks (e.g., SACK,
         * HEARTBEAT ACK, etc.) to the same destination transport
         * address from which it received the DATA or control chunk
         * to which it is replying.
         *
         * [ACK back to where the SHUTDOWN came from.]
         */
        if (chunk)
                ack->transport = chunk->transport;

        return ack;
}

/* Build a SHUTDOWN COMPLETE chunk; no payload is reserved beyond the
 * chunk header.  Returns the new chunk, or NULL if sctp_make_control()
 * fails.
 */
struct sctp_chunk *sctp_make_shutdown_complete(
                                        const struct sctp_association *asoc,
                                        const struct sctp_chunk *chunk)
{
        /* Without an association the vtag will be reflected; the T-bit
         * says so.
         */
        __u8 flags = asoc ? 0 : SCTP_CHUNK_FLAG_T;
        struct sctp_chunk *complete;

        complete = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags,
                                     0, GFP_ATOMIC);
        if (!complete)
                return NULL;

        /* RFC 2960 6.4 Multi-homed SCTP Endpoints
         *
         * An endpoint SHOULD transmit reply chunks (e.g., SACK,
         * HEARTBEAT ACK, etc.) to the same destination transport
         * address from which it received the DATA or control chunk
         * to which it is replying.
         *
         * [Report SHUTDOWN COMPLETE back to where the SHUTDOWN ACK
         * came from.]
         */
        if (chunk)
                complete->transport = chunk->transport;

        return complete;
}

/* Build an ABORT chunk with room reserved for 'hint' bytes of cause data.
 *
 * The T bit is set when there is no association, except when the ABORT
 * answers an INIT (sctpimpguide 2.41).  When 'chunk' is non-NULL the ABORT
 * is bound to that chunk's transport.
 *
 * Returns the new chunk, or NULL if sctp_make_control() fails.
 */
struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc,
                                   const struct sctp_chunk *chunk,
                                   const size_t hint)
{
        struct sctp_chunk *reply;
        __u8 flags = 0;

        /* No association and 'chunk' is not an INIT: the vtag will be
         * reflected, so set the T-bit.
         */
        if (!asoc &&
            !(chunk && chunk->chunk_hdr &&
              chunk->chunk_hdr->type == SCTP_CID_INIT))
                flags = SCTP_CHUNK_FLAG_T;

        reply = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint,
                                  GFP_ATOMIC);
        if (!reply)
                return NULL;

        /* RFC 2960 6.4 Multi-homed SCTP Endpoints
         *
         * An endpoint SHOULD transmit reply chunks (e.g., SACK,
         * HEARTBEAT ACK, etc.) to the same destination transport
         * address from which it received the DATA or control chunk
         * to which it is replying.
         *
         * [ABORT back to where the offender came from.]
         */
        if (chunk)
                reply->transport = chunk->transport;

        return reply;
}

/* Build an ABORT chunk whose single cause is SCTP_ERROR_NO_DATA.
 *
 * 'tsn' is given in host byte order and is written into the cause payload
 * in network byte order.  Returns the new chunk, or NULL if
 * sctp_make_abort() fails.
 */
struct sctp_chunk *sctp_make_abort_no_data(
                                        const struct sctp_association *asoc,
                                        const struct sctp_chunk *chunk,
                                        __u32 tsn)
{
        struct sctp_chunk *reply;
        __be32 wire_tsn;

        reply = sctp_make_abort(asoc, chunk,
                                sizeof(struct sctp_errhdr) + sizeof(tsn));
        if (!reply)
                return NULL;

        /* The cause body is the offending TSN, back in network order. */
        wire_tsn = htonl(tsn);
        sctp_init_cause(reply, SCTP_ERROR_NO_DATA, sizeof(wire_tsn));
        sctp_addto_chunk(reply, sizeof(wire_tsn), &wire_tsn);

        /* RFC 2960 6.4 Multi-homed SCTP Endpoints
         *
         * An endpoint SHOULD transmit reply chunks (e.g., SACK,
         * HEARTBEAT ACK, etc.) to the same destination transport
         * address from which it received the DATA or control chunk
         * to which it is replying.
         *
         * [ABORT back to where the offender came from.]
         */
        if (chunk)
                reply->transport = chunk->transport;

        return reply;
}

/* Build an ABORT chunk whose single cause is SCTP_ERROR_USER_ABORT.
 *
 * 'paylen' bytes are gathered from 'msg' into a temporary kernel buffer
 * (allocated with GFP_KERNEL) and appended as the cause body.  With
 * paylen == 0 no buffer is allocated and the cause has an empty body.
 *
 * Returns the new chunk, or NULL if the chunk or the temporary buffer
 * cannot be allocated, or if copying from 'msg' fails.
 */
struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc,
                                        struct msghdr *msg,
                                        size_t paylen)
{
        struct sctp_chunk *retval;
        void *payload = NULL;

        retval = sctp_make_abort(asoc, NULL,
                                 sizeof(struct sctp_errhdr) + paylen);
        if (!retval)
                return NULL;

        if (paylen) {
                /* Put the msg_iov together into payload.  */
                payload = kmalloc(paylen, GFP_KERNEL);
                if (!payload)
                        goto err_free_chunk;

                if (memcpy_from_msg(payload, msg, paylen) < 0)
                        goto err_free_payload;
        }

        sctp_init_cause(retval, SCTP_ERROR_USER_ABORT, paylen);
        sctp_addto_chunk(retval, paylen, payload);

        /* kfree(NULL) is a no-op, so the paylen == 0 case needs no guard. */
        kfree(payload);

        return retval;

err_free_payload:
        kfree(payload);
err_free_chunk:
        sctp_chunk_free(retval);
        return NULL;
}

/* Append 'len' bytes to the end of a parameter inside 'chunk'.
 *
 * The bytes are copied from 'data', or zero-filled when 'data' is NULL.
 * No padding is inserted ahead of the new bytes.  The chunk header length
 * and chunk_end are updated to cover them.  Will panic if chunk is not
 * big enough.
 *
 * Returns a pointer to the start of the appended area.
 */
static void *sctp_addto_param(struct sctp_chunk *chunk, int len,
                              const void *data)
{
        int old_len = ntohs(chunk->chunk_hdr->length);
        void *dst = skb_put(chunk->skb, len);

        if (!data)
                memset(dst, 0, len);
        else
                memcpy(dst, data, len);

        /* Grow the chunk to take in the new bytes.  */
        chunk->chunk_hdr->length = htons(old_len + len);
        chunk->chunk_end = skb_tail_pointer(chunk->skb);

        return dst;
}

/* Build an ABORT chunk with a PROTOCOL VIOLATION cause.
 *
 * The cause body is 'payload' ('paylen' bytes) followed by a parameter
 * header describing the offending chunk: its type widened to 16 bits and
 * its length field copied as stored in the chunk header.  'chunk' is
 * dereferenced unconditionally and must not be NULL.
 *
 * Returns the new chunk, or NULL if sctp_make_abort() fails.
 */
struct sctp_chunk *sctp_make_abort_violation(
                                        const struct sctp_association *asoc,
                                        const struct sctp_chunk *chunk,
                                        const __u8 *payload,
                                        const size_t paylen)
{
        struct sctp_paramhdr phdr;
        struct sctp_chunk *reply;

        reply = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) +
                                             paylen + sizeof(phdr));
        if (!reply)
                return NULL;

        sctp_init_cause(reply, SCTP_ERROR_PROTO_VIOLATION,
                        paylen + sizeof(phdr));
        sctp_addto_chunk(reply, paylen, payload);

        /* Trail the payload with a description of the offending chunk. */
        phdr.type = htons(chunk->chunk_hdr->type);
        phdr.length = chunk->chunk_hdr->length;
        sctp_addto_param(reply, sizeof(phdr), &phdr);

        return reply;
}

struct sctp_chunk *sctp_make_violation_paramlen(
                                        const struct sctp_association *asoc,
                                        const struct sctp_chunk *chunk,
                                        struct sctp_paramhdr *param)
{
        static const char error[] = "The following parameter had invalid length:";
        size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr) +
                             sizeof(*param);
        struct sctp_chunk *retval;

        retval = sctp_make_abort(asoc, chunk, payload_len);
        if (!retval)
                goto nodata;

        sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION,
                        sizeof(error) + sizeof(*param));
        sctp_addto_chunk(retval, sizeof(error), error);
        sctp_addto_param(retval, sizeof(*param), param);

nodata:
        return retval;
}

struct sctp_chunk *sctp_make_violation_max_retrans(
                                        const struct sctp_association *asoc,
                                        const struct sctp_chunk *chunk)
{
        static const char error[] = "Association exceeded its max_retrans count";
        size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr);
        struct sctp_chunk *retval;

        retval = sctp_make_abort(asoc, chunk, payload_len);
        if (!retval)
                goto nodata;

        sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, sizeof(error));
        sctp_addto_chunk(retval, sizeof(error), error);

nodata:
        return retval;
}

struct sctp_chunk *sctp_make_new_encap_port(const struct sctp_association *asoc,
                                            const struct sctp_chunk *chunk)
{
        struct sctp_new_encap_port_hdr nep;
        struct sctp_chunk *retval;

        retval = sctp_make_abort(asoc, chunk,
                                 sizeof(struct sctp_errhdr) + sizeof(nep));
        if (!retval)
                goto nodata;

        sctp_init_cause(retval, SCTP_ERROR_NEW_ENCAP_PORT, sizeof(nep));
        nep.cur_port = SCTP_INPUT_CB(chunk->skb)->encap_port;
        nep.new_port = chunk->transport->encap_port;
        sctp_addto_chunk(retval, sizeof(nep), &nep);

nodata:
        return retval;
}

/* Build a HEARTBEAT chunk addressed to 'transport'.
 *
 * The sender heartbeat info records the transport's address and nonce,
 * the current jiffies value and 'probe_size'.  The chunk is flagged as a
 * PMTU probe whenever 'probe_size' is non-zero.
 *
 * Returns the new chunk, or NULL if sctp_make_control() fails.
 */
struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc,
                                       const struct sctp_transport *transport,
                                       __u32 probe_size)
{
        struct sctp_sender_hb_info info = {};
        struct sctp_chunk *hb;

        hb = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(info),
                               GFP_ATOMIC);
        if (!hb)
                return NULL;

        info.param_hdr.type = SCTP_PARAM_HEARTBEAT_INFO;
        info.param_hdr.length = htons(sizeof(info));
        info.daddr = transport->ipaddr;
        info.sent_at = jiffies;
        info.hb_nonce = transport->hb_nonce;
        info.probe_size = probe_size;

        hb->subh.hbs_hdr = sctp_addto_chunk(hb, sizeof(info), &info);
        hb->pmtu_probe = !!probe_size;

        /* Cast away the 'const', as this is just telling the chunk
         * what transport it belongs to.
         */
        hb->transport = (struct sctp_transport *) transport;

        return hb;
}

/* Build a HEARTBEAT ACK chunk whose body is the 'paylen' bytes at
 * 'payload'.  When 'chunk' is non-NULL the ACK is bound to that chunk's
 * transport.
 *
 * Returns the new chunk, or NULL if sctp_make_control() fails.
 */
struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc,
                                           const struct sctp_chunk *chunk,
                                           const void *payload,
                                           const size_t paylen)
{
        struct sctp_chunk *hb_ack;

        hb_ack = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen,
                                   GFP_ATOMIC);
        if (!hb_ack)
                return NULL;

        hb_ack->subh.hbs_hdr = sctp_addto_chunk(hb_ack, paylen, payload);

        /* RFC 2960 6.4 Multi-homed SCTP Endpoints
         *
         * An endpoint SHOULD transmit reply chunks (e.g., SACK,
         * HEARTBEAT ACK, etc.) to the same destination transport
         * address from which it received the DATA or control chunk
         * to which it is replying.
         *
         * [HBACK back to where the HEARTBEAT came from.]
         */
        if (chunk)
                hb_ack->transport = chunk->transport;

        return hb_ack;
}

/* RFC4820 3. Padding Chunk (PAD)
 *  0                   1                   2                   3
 *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * | Type = 0x84   |   Flags=0     |             Length            |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                                                               |
 * \                         Padding Data                          /
 * /                                                               \
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * Build a PAD chunk carrying 'len' zero bytes of padding data.
 * Returns the new chunk, or NULL if sctp_make_control() fails.
 */
struct sctp_chunk *sctp_make_pad(const struct sctp_association *asoc, int len)
{
        struct sctp_chunk *pad;
        int hdr_len;

        pad = sctp_make_control(asoc, SCTP_CID_PAD, 0, len, GFP_ATOMIC);
        if (!pad)
                return NULL;

        /* Zeroed padding goes straight after the chunk header. */
        skb_put_zero(pad->skb, len);

        hdr_len = ntohs(pad->chunk_hdr->length);
        pad->chunk_hdr->length = htons(hdr_len + len);
        pad->chunk_end = skb_tail_pointer(pad->skb);

        return pad;
}

/* Create an Operation Error chunk with room reserved for one error
 * header plus 'size' further bytes.  No cause is written here, so the
 * routine can be used for chunks that will hold multiple causes.
 *
 * Returns the new chunk, or NULL if sctp_make_control() fails.
 */
static struct sctp_chunk *sctp_make_op_error_space(
                                        const struct sctp_association *asoc,
                                        const struct sctp_chunk *chunk,
                                        size_t size)
{
        struct sctp_chunk *err_chunk;

        err_chunk = sctp_make_control(asoc, SCTP_CID_ERROR, 0,
                                      sizeof(struct sctp_errhdr) + size,
                                      GFP_ATOMIC);
        if (!err_chunk)
                return NULL;

        /* RFC 2960 6.4 Multi-homed SCTP Endpoints
         *
         * An endpoint SHOULD transmit reply chunks (e.g., SACK,
         * HEARTBEAT ACK, etc.) to the same destination transport
         * address from which it received the DATA or control chunk
         * to which it is replying.
         */
        if (chunk)
                err_chunk->transport = chunk->transport;

        return err_chunk;
}

/* Create an Operation Error chunk of a fixed size, specifically,
 * min(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads, where the
 * overheads are whatever sctp_mtu_payload() deducts.  Without an
 * association, SCTP_DEFAULT_MAXSEGMENT and a NULL socket are used.
 *
 * This helper allocates an error chunk for invalid parameter codes where
 * we may not want to report every error if the incoming chunk is large.
 * If it can't fit in a single packet, we ignore it.
 */
static inline struct sctp_chunk *sctp_make_op_error_limited(
                                        const struct sctp_association *asoc,
                                        const struct sctp_chunk *chunk)
{
        struct sctp_sock *sp = NULL;
        size_t limit = SCTP_DEFAULT_MAXSEGMENT;

        if (asoc) {
                sp = sctp_sk(asoc->base.sk);
                limit = min_t(size_t, limit, asoc->pathmtu);
        }

        limit = sctp_mtu_payload(sp, limit, sizeof(struct sctp_errhdr));

        return sctp_make_op_error_space(asoc, chunk, limit);
}

/* Create an Operation Error chunk holding a single cause.
 *
 * The cause is 'cause_code' with 'paylen' bytes copied from 'payload',
 * followed by 'reserve_tail' zero-filled bytes.  The cause length covers
 * both parts.
 *
 * Returns the new chunk, or NULL if sctp_make_op_error_space() fails.
 */
struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc,
                                      const struct sctp_chunk *chunk,
                                      __be16 cause_code, const void *payload,
                                      size_t paylen, size_t reserve_tail)
{
        struct sctp_chunk *err_chunk;

        err_chunk = sctp_make_op_error_space(asoc, chunk,
                                             paylen + reserve_tail);
        if (!err_chunk)
                return NULL;

        sctp_init_cause(err_chunk, cause_code, paylen + reserve_tail);
        sctp_addto_chunk(err_chunk, paylen, payload);

        /* A NULL source makes sctp_addto_param() zero-fill the tail. */
        if (reserve_tail)
                sctp_addto_param(err_chunk, reserve_tail, NULL);

        return err_chunk;
}

/* Build an AUTH chunk for 'asoc' using shared key 'key_id'.
 *
 * The HMAC is the one returned by sctp_auth_asoc_get_hmac().  The chunk
 * holds the auth header followed by hmac_len zero bytes as a placeholder
 * for the MAC.
 *
 * Returns the new chunk, or NULL if no HMAC is available or
 * sctp_make_control() fails.
 */
struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc,
                                  __u16 key_id)
{
        struct sctp_hmac *hmac;
        struct sctp_authhdr hdr;
        struct sctp_chunk *auth;

        /* Get the first hmac that the peer told us to use */
        hmac = sctp_auth_asoc_get_hmac(asoc);
        if (unlikely(!hmac))
                return NULL;

        auth = sctp_make_control(asoc, SCTP_CID_AUTH, 0,
                                 hmac->hmac_len + sizeof(hdr),
                                 GFP_ATOMIC);
        if (!auth)
                return NULL;

        hdr.hmac_id = htons(hmac->hmac_id);
        hdr.shkey_id = htons(key_id);
        auth->subh.auth_hdr = sctp_addto_chunk(auth, sizeof(hdr), &hdr);

        /* Reserve the MAC as zeroes and make the chunk header cover it. */
        skb_put_zero(auth->skb, hmac->hmac_len);
        auth->chunk_hdr->length =
                htons(ntohs(auth->chunk_hdr->length) + hmac->hmac_len);
        auth->chunk_end = skb_tail_pointer(auth->skb);

        return auth;
}


/********************************************************************
 * 2nd Level Abstractions
 ********************************************************************/

/* Turn an skb into a chunk.
 * FIXME: Eventually move the structure directly inside the skb->cb[].
 *
 * A zeroed chunk is taken from sctp_chunk_cachep and set up with empty
 * list heads, the given skb and association, singleton set, fast
 * retransmit allowed and a reference count of one.  'sk' is only used to
 * emit a debug message when it is NULL.
 *
 * sctpimpguide-05.txt Section 2.8.2
 * M1) Each time a new DATA chunk is transmitted
 * set the 'TSN.Missing.Report' count for that TSN to 0. The
 * 'TSN.Missing.Report' count will be used to determine missing chunks
 * and when to fast retransmit.
 *
 * Returns the new chunk, or NULL if the allocation fails.
 */
struct sctp_chunk *sctp_chunkify(struct sk_buff *skb,
                                 const struct sctp_association *asoc,
                                 struct sock *sk, gfp_t gfp)
{
        struct sctp_chunk *new_chunk;

        new_chunk = kmem_cache_zalloc(sctp_chunk_cachep, gfp);
        if (!new_chunk)
                return NULL;

        if (!sk)
                pr_debug("%s: chunkifying skb:%p w/o an sk\n", __func__, skb);

        /* All three list heads start out empty. */
        INIT_LIST_HEAD(&new_chunk->list);
        INIT_LIST_HEAD(&new_chunk->transmitted_list);
        INIT_LIST_HEAD(&new_chunk->frag_list);

        new_chunk->skb = skb;
        new_chunk->asoc = (struct sctp_association *)asoc;
        new_chunk->singleton = 1;
        new_chunk->fast_retransmit = SCTP_CAN_FRTX;

        SCTP_DBG_OBJCNT_INC(chunk);
        refcount_set(&new_chunk->refcnt, 1);

        return new_chunk;
}

/* Record the source and destination addresses of a chunk by copying
 * '*src' and '*dest' into chunk->source and chunk->dest.
 */
void sctp_init_addrs(struct sctp_chunk *chunk, union sctp_addr *src,
                     union sctp_addr *dest)
{
        const size_t addr_size = sizeof(union sctp_addr);

        memcpy(&chunk->source, src, addr_size);
        memcpy(&chunk->dest, dest, addr_size);
}

/* Return the source address of a chunk: the address of its transport
 * when one is known, otherwise the address stored in chunk->source.
 */
const union sctp_addr *sctp_source(const struct sctp_chunk *chunk)
{
        return chunk->transport ? &chunk->transport->ipaddr : &chunk->source;
}

/* Create a new chunk, setting the type and flags headers from the
 * arguments, reserving enough space for a 'paylen' byte payload.
 *
 * Only the chunk header is placed in the skb here, and the length field
 * initially covers just that header.  The request is refused when the
 * padded chunk length would exceed SCTP_MAX_CHUNK_LEN.  The chunk is
 * marked for authentication when sctp_auth_send_cid() says so.
 *
 * Returns the new chunk, or NULL on a size violation or allocation
 * failure.
 */
static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
                                           __u8 type, __u8 flags, int paylen,
                                           gfp_t gfp)
{
        struct sctp_chunkhdr *hdr;
        struct sctp_chunk *chunk;
        struct sk_buff *skb;
        int total;

        total = SCTP_PAD4(sizeof(*hdr) + paylen);
        if (total > SCTP_MAX_CHUNK_LEN)
                return NULL;

        /* No need to allocate LL here, as this is only a chunk. */
        skb = alloc_skb(total, gfp);
        if (!skb)
                return NULL;

        /* The chunk header comes first in the skb.  */
        hdr = (struct sctp_chunkhdr *)skb_put(skb, sizeof(*hdr));
        hdr->type = type;
        hdr->flags = flags;
        hdr->length = htons(sizeof(*hdr));

        chunk = sctp_chunkify(skb, asoc, asoc ? asoc->base.sk : NULL, gfp);
        if (!chunk) {
                kfree_skb(skb);
                return NULL;
        }

        chunk->chunk_hdr = hdr;
        chunk->chunk_end = ((__u8 *)hdr) + sizeof(*hdr);

        /* Determine if the chunk needs to be authenticated */
        if (sctp_auth_send_cid(type, asoc))
                chunk->auth = 1;

        return chunk;
}

/* Create a DATA chunk with room for a 'paylen' byte payload.  Thin
 * wrapper around _sctp_make_chunk(); returns whatever it returns.
 */
static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc,
                                         __u8 flags, int paylen, gfp_t gfp)
{
        return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp);
}

/* Create an I-DATA chunk with room for a 'paylen' byte payload.  Thin
 * wrapper around _sctp_make_chunk(); returns whatever it returns.
 */
struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc,
                                   __u8 flags, int paylen, gfp_t gfp)
{
        return _sctp_make_chunk(asoc, SCTP_CID_I_DATA, flags, paylen, gfp);
}

/* Create a control chunk of the given type via _sctp_make_chunk() and
 * pass it to sctp_control_set_owner_w().
 *
 * Returns the new chunk, or NULL if _sctp_make_chunk() fails.
 */
static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc,
                                            __u8 type, __u8 flags, int paylen,
                                            gfp_t gfp)
{
        struct sctp_chunk *ctrl = _sctp_make_chunk(asoc, type, flags, paylen,
                                                   gfp);

        if (!ctrl)
                return NULL;

        sctp_control_set_owner_w(ctrl);

        return ctrl;
}

/* Release the memory occupied by a chunk.
 *
 * The chunk must already be off its 'list' (enforced with BUG_ON); it is
 * unlinked from the transmitted list here.  The chunk's skb and its
 * auth_chunk skb are both released before the chunk itself is returned
 * to sctp_chunk_cachep.
 */
static void sctp_chunk_destroy(struct sctp_chunk *chunk)
{
        BUG_ON(!list_empty(&chunk->list));
        list_del_init(&chunk->transmitted_list);

        consume_skb(chunk->skb);
        consume_skb(chunk->auth_chunk);

        SCTP_DBG_OBJCNT_DEC(chunk);
        kmem_cache_free(sctp_chunk_cachep, chunk);
}

/* Possibly, free the chunk: drop the chunk's reference on its message
 * tracker, if it has one, then drop one reference on the chunk itself.
 * The chunk is only destroyed when that was the last reference.
 */
void sctp_chunk_free(struct sctp_chunk *chunk)
{
        struct sctp_datamsg *msg = chunk->msg;

        if (msg)
                sctp_datamsg_put(msg);

        sctp_chunk_put(chunk);
}

/* Grab a reference to the chunk by incrementing its refcount.  Balanced
 * by sctp_chunk_put().
 */
void sctp_chunk_hold(struct sctp_chunk *ch)
{
        refcount_inc(&ch->refcnt);
}

/* Release a reference to the chunk.  The chunk is destroyed once the
 * last reference is gone.
 */
void sctp_chunk_put(struct sctp_chunk *ch)
{
        if (!refcount_dec_and_test(&ch->refcnt))
                return;

        sctp_chunk_destroy(ch);
}

/* Append 'len' bytes from 'data' to the end of a chunk.
 *
 * The current chunk length is first rounded up to a multiple of four
 * with zero bytes, so the new data starts on a padded boundary.  The
 * chunk header length and chunk_end are updated to cover the padding and
 * the data.  Will panic if chunk is not big enough.
 *
 * Returns a pointer to the start of the copied data inside the skb.
 */
void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data)
{
        int cur_len = ntohs(chunk->chunk_hdr->length);
        int padded_len = SCTP_PAD4(cur_len);
        void *dst;

        skb_put_zero(chunk->skb, padded_len - cur_len);
        dst = skb_put_data(chunk->skb, data, len);

        /* The length field now spans padding plus the new data.  */
        chunk->chunk_hdr->length = htons(padded_len + len);
        chunk->chunk_end = skb_tail_pointer(chunk->skb);

        return dst;
}

/* Append 'len' bytes taken from the iterator 'from' to the end of a
 * chunk.  Will panic if chunk is not big enough.
 *
 * Room is made in the skb before the copy is attempted.  If the copy
 * fails, the chunk header length and chunk_end are left as they were.
 *
 * Returns 0 on success, or -EFAULT if the full copy could not be done.
 */
int sctp_user_addto_chunk(struct sctp_chunk *chunk, int len,
                          struct iov_iter *from)
{
        /* Make room in chunk for data.  */
        void *dst = skb_put(chunk->skb, len);
        int old_len;

        /* Copy data (whole iovec) into chunk */
        if (!copy_from_iter_full(dst, len, from))
                return -EFAULT;

        /* Only a successful copy grows the advertised chunk length.  */
        old_len = ntohs(chunk->chunk_hdr->length);
        chunk->chunk_hdr->length = htons(old_len + len);
        chunk->chunk_end = skb_tail_pointer(chunk->skb);

        return 0;
}

/* Helper function to assign an SSN if needed.  This assumes that both
 * the data_hdr and association have already been assigned.
 *
 * Does nothing when the chunk already has an SSN.  Otherwise every
 * fragment on the chunk's message gets one: 0 for unordered fragments,
 * and for ordered ones the stream's current outbound SSN, which is
 * advanced only by the fragment flagged as the last one.
 */
void sctp_chunk_assign_ssn(struct sctp_chunk *chunk)
{
        struct sctp_stream *stream;
        struct sctp_chunk *frag;
        __u16 ssn, sid;

        if (chunk->has_ssn)
                return;

        /* All fragments will be on the same stream */
        sid = ntohs(chunk->subh.data_hdr->stream);
        stream = &chunk->asoc->stream;

        /* Now assign the sequence number to the entire message.
         * All fragments must have the same stream sequence number.
         */
        list_for_each_entry(frag, &chunk->msg->chunks, frag_list) {
                if (frag->chunk_hdr->flags & SCTP_DATA_UNORDERED)
                        ssn = 0;
                else if (frag->chunk_hdr->flags & SCTP_DATA_LAST_FRAG)
                        ssn = sctp_ssn_next(stream, out, sid);
                else
                        ssn = sctp_ssn_peek(stream, out, sid);

                frag->subh.data_hdr->ssn = htons(ssn);
                frag->has_ssn = 1;
        }
}

/* Helper function to assign a TSN if needed.  This assumes that both
 * the data_hdr and association have already been assigned.
 */
void sctp_chunk_assign_tsn(struct sctp_chunk *chunk)
{
        if (!chunk->has_tsn) {
                /* This is the last possible instant to
                 * assign a TSN.
                 */
                chunk->subh.data_hdr->tsn =
                        htonl(sctp_association_get_next_tsn(chunk->asoc));
                chunk->has_tsn = 1;
        }
}

/* Create a CLOSED association to use with an incoming packet.
 *
 * The association is created on the endpoint's socket with a scope
 * derived from the chunk's source address, and is marked as temporary.
 * Its peer address is then filled in from the chunk's skb via the address
 * family's from_skb() hook.
 *
 * Returns the new association, or NULL if sctp_association_new() fails.
 */
struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep,
                                             struct sctp_chunk *chunk,
                                             gfp_t gfp)
{
        enum sctp_scope scope = sctp_scope(sctp_source(chunk));
        struct sk_buff *skb = chunk->skb;
        struct sctp_association *asoc;

        /* Create the bare association.  */
        asoc = sctp_association_new(ep, ep->base.sk, scope, gfp);
        if (!asoc)
                return NULL;

        asoc->temp = 1;

        /* Create an entry for the source address of the packet.  */
        SCTP_INPUT_CB(skb)->af->from_skb(&asoc->c.peer_addr, skb, 1);

        return asoc;
}

/* Build a cookie representing asoc.
 * This INCLUDES the param header needed to put the cookie in the INIT ACK.
 *
 * Layout of the returned buffer: parameter header, signature area, the
 * association's cookie data, a copy of the peer's INIT chunk, then the
 * raw local address list, zero-padded so the signed body is a multiple
 * of SCTP_COOKIE_MULTIPLE.  When the endpoint's socket has an HMAC
 * transform, the body is signed with the endpoint's secret key.
 *
 * On success the zero-initialised, kzalloc()'ed parameter is returned and
 * *cookie_len holds its total size.  On allocation or signing failure
 * NULL is returned and *cookie_len is set to 0.
 */
static struct sctp_cookie_param *sctp_pack_cookie(
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        const struct sctp_chunk *init_chunk,
                                        int *cookie_len, const __u8 *raw_addrs,
                                        int addrs_len)
{
        int init_len = ntohs(init_chunk->chunk_hdr->length);
        struct sctp_signed_cookie *signed_cookie;
        struct sctp_cookie_param *param;
        int hdr_len, body_len, rem;
        __u8 *tail;

        /* Header size is static data prior to the actual cookie, including
         * any padding.
         */
        hdr_len = sizeof(struct sctp_paramhdr) +
                  (sizeof(struct sctp_signed_cookie) -
                   sizeof(struct sctp_cookie));
        body_len = sizeof(struct sctp_cookie) + init_len + addrs_len;

        /* Pad out the cookie to a multiple to make the signature
         * functions simpler to write.
         */
        rem = body_len % SCTP_COOKIE_MULTIPLE;
        if (rem)
                body_len += SCTP_COOKIE_MULTIPLE - rem;
        *cookie_len = hdr_len + body_len;

        /* Clear this memory since we are sending this data structure
         * out on the network.
         */
        param = kzalloc(*cookie_len, GFP_ATOMIC);
        if (!param)
                goto nodata;

        signed_cookie = (struct sctp_signed_cookie *) param->body;

        /* Set up the parameter header.  */
        param->p.type = SCTP_PARAM_STATE_COOKIE;
        param->p.length = htons(*cookie_len);

        /* Start from the association's own cookie, then override the
         * fields that are tracked elsewhere: raw address list length,
         * PR-SCTP capability, adaptation indication and expiration.
         */
        signed_cookie->c = asoc->c;
        signed_cookie->c.raw_addr_list_len = addrs_len;
        signed_cookie->c.prsctp_capable = asoc->peer.prsctp_capable;
        signed_cookie->c.adaptation_ind = asoc->peer.adaptation_ind;
        signed_cookie->c.expiration = ktime_add(asoc->cookie_life,
                                                ktime_get_real());

        /* The peer's INIT and then the raw local address list follow
         * directly after the signed cookie structure.
         */
        tail = (__u8 *)(signed_cookie + 1);
        memcpy(tail, init_chunk->chunk_hdr, init_len);
        memcpy(tail + init_len, raw_addrs, addrs_len);

        if (sctp_sk(ep->base.sk)->hmac) {
                struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac;
                int err;

                /* Sign the message.  */
                err = crypto_shash_setkey(tfm, ep->secret_key,
                                          sizeof(ep->secret_key));
                if (!err)
                        err = crypto_shash_tfm_digest(tfm,
                                                (u8 *)&signed_cookie->c,
                                                body_len,
                                                signed_cookie->signature);
                if (err)
                        goto free_cookie;
        }

        return param;

free_cookie:
        kfree(param);
nodata:
        *cookie_len = 0;
        return NULL;
}

/* Unpack the cookie from COOKIE ECHO chunk, recreating the association.
 *
 * The cookie is validated in this order: overall length and padding, the
 * HMAC signature (only when the socket has an hmac transform), the
 * verification tag and ports against the packet's SCTP common header, and
 * finally the expiration time (skipped when @asoc is non-NULL).
 *
 * On success a freshly allocated association populated from the cookie is
 * returned.  On failure NULL is returned and *@error holds a negated
 * SCTP_IERROR_* code.  For a stale cookie *@errp additionally receives the
 * Stale Cookie operation-error chunk; if that chunk could not be built,
 * *@error is -SCTP_IERROR_NOMEM instead of -SCTP_IERROR_STALE_COOKIE.
 */
struct sctp_association *sctp_unpack_cookie(
                                        const struct sctp_endpoint *ep,
                                        const struct sctp_association *asoc,
                                        struct sctp_chunk *chunk, gfp_t gfp,
                                        int *error, struct sctp_chunk **errp)
{
        struct sctp_association *retval = NULL;
        int headersize, bodysize, fixed_size;
        struct sctp_signed_cookie *cookie;
        struct sk_buff *skb = chunk->skb;
        struct sctp_cookie *bear_cookie;
        __u8 *digest = ep->digest;
        enum sctp_scope scope;
        unsigned int len;
        ktime_t kt;

        /* Header size is static data prior to the actual cookie, including
         * any padding.
         */
        headersize = sizeof(struct sctp_chunkhdr) +
                     (sizeof(struct sctp_signed_cookie) -
                      sizeof(struct sctp_cookie));
        bodysize = ntohs(chunk->chunk_hdr->length) - headersize;
        fixed_size = headersize + sizeof(struct sctp_cookie);

        /* Verify that the chunk looks like it even has a cookie.
         * There must be enough room for our cookie and our peer's
         * INIT chunk.
         */
        len = ntohs(chunk->chunk_hdr->length);
        if (len < fixed_size + sizeof(struct sctp_chunkhdr))
                goto malformed;

        /* Verify that the cookie has been padded out. */
        if (bodysize % SCTP_COOKIE_MULTIPLE)
                goto malformed;

        /* Process the cookie.  */
        cookie = chunk->subh.cookie_hdr;
        bear_cookie = &cookie->c;

        /* Without an hmac transform the cookie is unsigned; skip the check. */
        if (!sctp_sk(ep->base.sk)->hmac)
                goto no_hmac;

        /* Check the signature.  */
        {
                struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac;
                int err;

                /* Recompute the digest over the cookie body with our key. */
                err = crypto_shash_setkey(tfm, ep->secret_key,
                                          sizeof(ep->secret_key)) ?:
                      crypto_shash_tfm_digest(tfm, (u8 *)bear_cookie, bodysize,
                                              digest);
                if (err) {
                        *error = -SCTP_IERROR_NOMEM;
                        goto fail;
                }
        }

        if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) {
                *error = -SCTP_IERROR_BAD_SIG;
                goto fail;
        }

no_hmac:
        /* IG Section 2.35.2:
         *  3) Compare the port numbers and the verification tag contained
         *     within the COOKIE ECHO chunk to the actual port numbers and the
         *     verification tag within the SCTP common header of the received
         *     packet. If these values do not match the packet MUST be silently
         *     discarded,
         */
        if (ntohl(chunk->sctp_hdr->vtag) != bear_cookie->my_vtag) {
                *error = -SCTP_IERROR_BAD_TAG;
                goto fail;
        }

        if (chunk->sctp_hdr->source != bear_cookie->peer_addr.v4.sin_port ||
            ntohs(chunk->sctp_hdr->dest) != bear_cookie->my_port) {
                *error = -SCTP_IERROR_BAD_PORTS;
                goto fail;
        }

        /* Check to see if the cookie is stale.  If there is already
         * an association, there is no need to check cookie's expiration
         * for init collision case of lost COOKIE ACK.
         * If skb has been timestamped, then use the stamp, otherwise
         * use current time.  This introduces a small possibility that
         * a cookie may be considered expired, but this would only slow
         * down the new association establishment instead of every packet.
         */
        if (sock_flag(ep->base.sk, SOCK_TIMESTAMP))
                kt = skb_get_ktime(skb);
        else
                kt = ktime_get_real();

        if (!asoc && ktime_before(bear_cookie->expiration, kt)) {
                /* Staleness in microseconds, reported to the peer. */
                suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration));
                __be32 n = htonl(usecs);

                /*
                 * Section 3.3.10.3 Stale Cookie Error (3)
                 *
                 * Cause of error
                 * ---------------
                 * Stale Cookie Error:  Indicates the receipt of a valid State
                 * Cookie that has expired.
                 */
                *errp = sctp_make_op_error(asoc, chunk,
                                           SCTP_ERROR_STALE_COOKIE, &n,
                                           sizeof(n), 0);
                if (*errp)
                        *error = -SCTP_IERROR_STALE_COOKIE;
                else
                        *error = -SCTP_IERROR_NOMEM;

                goto fail;
        }

        /* Make a new base association.  */
        scope = sctp_scope(sctp_source(chunk));
        retval = sctp_association_new(ep, ep->base.sk, scope, gfp);
        if (!retval) {
                *error = -SCTP_IERROR_NOMEM;
                goto fail;
        }

        /* Set up our peer's port number.  */
        retval->peer.port = ntohs(chunk->sctp_hdr->source);

        /* Populate the association from the cookie.  */
        memcpy(&retval->c, bear_cookie, sizeof(*bear_cookie));

        if (sctp_assoc_set_bind_addr_from_cookie(retval, bear_cookie,
                                                 GFP_ATOMIC) < 0) {
                *error = -SCTP_IERROR_NOMEM;
                goto fail;
        }

        /* Also, add the destination address.  If the cookie carried no
         * addresses this is the association's only bind address, so a
         * failure to add it (non-zero return) must fail the unpack rather
         * than hand back an association with an empty address list.
         */
        if (list_empty(&retval->base.bind_addr.address_list) &&
            sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest,
                               sizeof(chunk->dest), SCTP_ADDR_SRC,
                               GFP_ATOMIC)) {
                *error = -SCTP_IERROR_NOMEM;
                goto fail;
        }

        /* Seed the sequence-number state from the cookie's initial TSN. */
        retval->next_tsn = retval->c.initial_tsn;
        retval->ctsn_ack_point = retval->next_tsn - 1;
        retval->addip_serial = retval->c.initial_tsn;
        retval->strreset_outseq = retval->c.initial_tsn;
        retval->adv_peer_ack_point = retval->ctsn_ack_point;
        retval->peer.prsctp_capable = retval->c.prsctp_capable;
        retval->peer.adaptation_ind = retval->c.adaptation_ind;

        /* The INIT stuff will be done by the side effects.  */
        return retval;

fail:
        /* Every path reaching here has already stored a code in *error. */
        if (retval)
                sctp_association_free(retval);

        return NULL;

malformed:
        /* Yikes!  The packet is either corrupt or deliberately
         * malformed.
         */
        *error = -SCTP_IERROR_MALFORMED;
        goto fail;
}

/********************************************************************
 * 3rd Level Abstractions
 ********************************************************************/

/* Payload appended after the cause header by sctp_process_missing_param():
 * a count of missing parameters followed by a single parameter type.
 * Declared __packed; the function uses sizeof() of this struct as the
 * payload length it writes into the error chunk.
 */
struct __sctp_missing {
        __be32 num_missing;     /* number of missing parameters (big endian) */
        __be16 type;            /* type of the missing parameter */
}  __packed;

/*
 * Report a missing mandatory parameter.
 *
 * Appends a "Missing Mandatory Parameter" cause naming @paramtype to the
 * error chunk in *@errp, allocating that chunk first when *@errp is NULL.
 * If the chunk cannot be allocated nothing is reported.
 *
 * Always returns 0, which tells the caller to stop processing the chunk.
 */
static int sctp_process_missing_param(const struct sctp_association *asoc,
                                      enum sctp_param paramtype,
                                      struct sctp_chunk *chunk,
                                      struct sctp_chunk **errp)
{
        struct __sctp_missing report;
        __u16 len;

        /* Space requested for the report, rounded up to a 4-byte multiple. */
        len = SCTP_PAD4(sizeof(report));

        /* Make an ERROR chunk with room for the padded report unless the
         * caller already handed us one.
         */
        if (!*errp)
                *errp = sctp_make_op_error_space(asoc, chunk, len);

        if (*errp) {
                report.num_missing = htonl(1);
                report.type = paramtype;

                /* Only append the payload when the cause header was
                 * actually written; a non-zero return means it was not,
                 * and a payload without its header would corrupt the chunk.
                 */
                if (!sctp_init_cause(*errp, SCTP_ERROR_MISS_PARAM,
                                     sizeof(report)))
                        sctp_addto_chunk(*errp, sizeof(report), &report);
        }

        /* Stop processing this chunk. */
        return 0;
}

/* Report an Invalid Mandatory Parameter.
 *
 * Adds an SCTP_ERROR_INV_PARAM cause (which carries no payload) to the
 * error chunk in *@errp, creating the chunk when none exists yet.
 * Always returns 0 so the caller stops processing the chunk.
 */
static int sctp_process_inv_mandatory(const struct sctp_association *asoc,
                                      struct sctp_chunk *chunk,
                                      struct sctp_chunk **errp)
{
        struct sctp_chunk *err_chunk = *errp;

        /* No error chunk accumulated so far: allocate an empty one. */
        if (!err_chunk) {
                err_chunk = sctp_make_op_error_space(asoc, chunk, 0);
                *errp = err_chunk;
        }

        /* Allocation may have failed; in that case nothing is reported. */
        if (err_chunk)
                sctp_init_cause(err_chunk, SCTP_ERROR_INV_PARAM, 0);

        return 0;
}

/* Report a parameter whose length field is invalid.
 *
 * This is treated as fatal: whatever error chunk was accumulated in *@errp
 * is discarded and replaced by the chunk built by
 * sctp_make_violation_paramlen() for @param.  Always returns 0.
 */
static int sctp_process_inv_paramlength(const struct sctp_association *asoc,
                                        struct sctp_paramhdr *param,
                                        const struct sctp_chunk *chunk,
                                        struct sctp_chunk **errp)
{
        struct sctp_chunk *pending = *errp;

        /* Previously collected non-fatal errors are dropped, not reported. */
        if (pending)
                sctp_chunk_free(pending);

        /* Replace them with the violation chunk (may be NULL on failure). */
        *errp = sctp_make_violation_paramlen(asoc, chunk, param);

        return 0;
}


/* The HOST_NAME parameter is never processed; instead the peer is told so.
 *
 * Any error chunk already accumulated in *@errp is freed and replaced by
 * an operation error with cause SCTP_ERROR_DNS_FAILED that echoes the
 * offending parameter.  The caller turns this into an ABORT, which is why
 * earlier non-fatal errors are not carried along.  Always returns 0.
 */
static int sctp_process_hn_param(const struct sctp_association *asoc,
                                 union sctp_params param,
                                 struct sctp_chunk *chunk,
                                 struct sctp_chunk **errp)
{
        struct sctp_chunk *stale = *errp;
        __u16 param_len;

        /* Length of the whole parameter, header included, in host order. */
        param_len = ntohs(param.p->length);

        if (stale)
                sctp_chunk_free(stale);

        *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_DNS_FAILED,
                                   param.v, param_len, 0);

        return 0;
}

static int sctp_verify_ext_param(struct net *net,
                                 const struct sctp_endpoint *ep,
                                 union sctp_params param)
{
        __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr);
        int have_asconf = 0;
        int have_auth = 0;
        int i;

        for (i = 0; i < num_ext; i++) {
                switch (param.ext->chunks[i]) {
                case SCTP_CID_AUTH:
                        have_auth = 1;
                        break;
                case SCTP_CID_ASCONF:
                case SCTP_CID_ASCONF_ACK:
                        have_asconf = 1;
                        break;
                }
        }

        /* ADD-IP Security: The draft requires us to ABORT or ignore the
         * INIT/INIT-ACK if ADD-IP is listed, but AUTH is not.  Do this
         * only if ADD-IP is turned on and we are not backward-compatible
         * mode.
         */
        if (net->sctp.addip_noauth)
                return 1;

        if (ep->asconf_enable && !have_auth && have_asconf)
                return 0;

        return 1;
}

static void sctp_process_ext_param(struct sctp_association *asoc,
                                   union sctp_params param)
{
        __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr);
        int i;

        for (i = 0; i < num_ext; i++) {
                switch (param.ext->chunks[i]) {
                case SCTP_CID_RECONF:
                        if (asoc->ep->reconf_enable)
                                asoc->peer.reconf_capable = 1;
                        break;
                case SCTP_CID_FWD_TSN:
                        if (asoc->ep->prsctp_enable)
                                asoc->peer.prsctp_capable = 1;
                        break;
                case SCTP_CID_AUTH:
                        /* if the peer reports AUTH, assume that he
                         * supports AUTH.
                         */
                        if (asoc->ep->auth_enable)
                                asoc->peer.auth_capable = 1;
                        break;
                case SCTP_CID_ASCONF:
                case SCTP_CID_ASCONF_ACK:
                        if (asoc->ep->asconf_enable)
                                asoc->peer.asconf_capable = 1;
                        break;
                case SCTP_CID_I_DATA:
                        if (asoc->ep->intl_enable)
                                asoc->peer.intl_capable = 1;
                        break;
                default:
                        break;
                }
        }
}

/* RFC 3.2.1 & the Implementers Guide 2.2.
 *
 * The Parameter Types are encoded such that the
 * highest-order two bits specify the action that must be
 * taken if the processing endpoint does not recognize the
 * Parameter Type.
 *
 * 00 - Stop processing this parameter; do not process any further
 *         parameters within this chunk
 *
 * 01 - Stop processing this parameter, do not process any further
 *        parameters within this chunk, and report the unrecognized
 *        parameter in an 'Unrecognized Parameter' ERROR chunk.
 *
 * 10 - Skip this parameter and continue processing.
 *
 * 11 - Skip this parameter and continue processing but
 *        report the unrecognized parameter in an
 *        'Unrecognized Parameter' ERROR chunk.
 *
 * Return value:
 *         SCTP_IERROR_NO_ERROR - continue with the chunk
 *         SCTP_IERROR_ERROR    - stop and report an error.
 *         SCTP_IERROR_NOMEME   - out of memory.
 */
static enum sctp_ierror sctp_process_unk_param(
                                        const struct sctp_association *asoc,
                                        union sctp_params param,
                                        struct sctp_chunk *chunk,
                                        struct sctp_chunk **errp)
{
        int retval = SCTP_IERROR_NO_ERROR;

        switch (param.p->type & SCTP_PARAM_ACTION_MASK) {
        case SCTP_PARAM_ACTION_DISCARD:
                retval =  SCTP_IERROR_ERROR;
                break;
        case SCTP_PARAM_ACTION_SKIP:
                break;
        case SCTP_PARAM_ACTION_DISCARD_ERR:
                retval =  SCTP_IERROR_ERROR;
                fallthrough;
        case SCTP_PARAM_ACTION_SKIP_ERR:
                /* Make an ERROR chunk, preparing enough room for
                 * returning multiple unknown parameters.
                 */
                if (!*errp) {
                        *errp = sctp_make_op_error_limited(asoc, chunk);
                        if (!*errp) {
                                /* If there is no memory for generating the
                                 * ERROR report as specified, an ABORT will be
                                 * triggered to the peer and the association
                                 * won't be established.
                                 */
                                retval = SCTP_IERROR_NOMEM;
                                break;
                        }
                }

                if (!sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM,
                                     ntohs(param.p->length)))
                        sctp_addto_chunk(*errp, ntohs(param.p->length),
                                         param.v);
                break;
        default:
                break;
        }

        return retval;
}

/* Verify a single variable-length parameter of an INIT / INIT-ACK.
 *
 * Parameters belonging to a feature that is disabled on @ep are handled
 * exactly like unknown parameters (see sctp_process_unk_param()).
 *
 * Return values:
 *      SCTP_IERROR_ABORT    - trigger an ABORT
 *      SCTP_IERROR_NOMEM    - out of memory (abort)
 *      SCTP_IERROR_ERROR    - stop processing, trigger an ERROR
 *      SCTP_IERROR_NO_ERROR - continue with the chunk
 */
static enum sctp_ierror sctp_verify_param(struct net *net,
                                          const struct sctp_endpoint *ep,
                                          const struct sctp_association *asoc,
                                          union sctp_params param,
                                          enum sctp_cid cid,
                                          struct sctp_chunk *chunk,
                                          struct sctp_chunk **err_chunk)
{
        /* Total parameter length (header included) in host byte order. */
        __u16 plen = ntohs(param.p->length);
        struct sctp_hmac_algo_param *algos;
        int verdict = SCTP_IERROR_NO_ERROR;
        __u16 count, hmac_id = 0;
        int idx;

        /* FIXME - This routine is not looking at each parameter per the
         * chunk type, i.e., unrecognized parameters should be further
         * identified based on the chunk id.
         */

        switch (param.p->type) {
        /* Recognized parameters that need no further checking here. */
        case SCTP_PARAM_IPV4_ADDRESS:
        case SCTP_PARAM_IPV6_ADDRESS:
        case SCTP_PARAM_COOKIE_PRESERVATIVE:
        case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES:
        case SCTP_PARAM_STATE_COOKIE:
        case SCTP_PARAM_HEARTBEAT_INFO:
        case SCTP_PARAM_UNRECOGNIZED_PARAMETERS:
        case SCTP_PARAM_ECN_CAPABLE:
        case SCTP_PARAM_ADAPTATION_LAYER_IND:
                break;

        case SCTP_PARAM_SUPPORTED_EXT:
                if (!sctp_verify_ext_param(net, ep, param))
                        return SCTP_IERROR_ABORT;
                break;

        case SCTP_PARAM_SET_PRIMARY:
                if (!ep->asconf_enable)
                        goto unhandled;

                /* Must at least hold the ADD-IP header plus one more
                 * parameter header.
                 */
                if (plen < sizeof(struct sctp_addip_param) +
                           sizeof(struct sctp_paramhdr)) {
                        sctp_process_inv_paramlength(asoc, param.p,
                                                     chunk, err_chunk);
                        verdict = SCTP_IERROR_ABORT;
                }
                break;

        case SCTP_PARAM_HOST_NAME_ADDRESS:
                /* This param has been Deprecated, send ABORT.  */
                sctp_process_hn_param(asoc, param, chunk, err_chunk);
                verdict = SCTP_IERROR_ABORT;
                break;

        case SCTP_PARAM_FWD_TSN_SUPPORT:
                if (!ep->prsctp_enable)
                        goto unhandled;
                break;

        case SCTP_PARAM_RANDOM:
                if (!ep->auth_enable)
                        goto unhandled;

                /* SCTP-AUTH: Section 6.1
                 * If the random number is not 32 byte long the association
                 * MUST be aborted.  The ABORT chunk SHOULD contain the error
                 * cause 'Protocol Violation'.
                 */
                if (plen - sizeof(struct sctp_paramhdr) !=
                    SCTP_AUTH_RANDOM_LENGTH) {
                        sctp_process_inv_paramlength(asoc, param.p,
                                                     chunk, err_chunk);
                        verdict = SCTP_IERROR_ABORT;
                }
                break;

        case SCTP_PARAM_CHUNKS:
                if (!ep->auth_enable)
                        goto unhandled;

                /* SCTP-AUTH: Section 3.2
                 * The CHUNKS parameter MUST be included once in the INIT or
                 *  INIT-ACK chunk if the sender wants to receive authenticated
                 *  chunks.  Its maximum length is 260 bytes.
                 */
                if (plen > 260) {
                        sctp_process_inv_paramlength(asoc, param.p,
                                                     chunk, err_chunk);
                        verdict = SCTP_IERROR_ABORT;
                }
                break;

        case SCTP_PARAM_HMAC_ALGO:
                if (!ep->auth_enable)
                        goto unhandled;

                algos = (struct sctp_hmac_algo_param *)param.p;
                /* Each HMAC identifier occupies two bytes. */
                count = (plen - sizeof(struct sctp_paramhdr)) >> 1;

                /* SCTP-AUTH: Section 6.1
                 * The HMAC algorithm based on SHA-1 MUST be supported and
                 * included in the HMAC-ALGO parameter.
                 */
                for (idx = 0; idx < count; idx++) {
                        hmac_id = ntohs(algos->hmac_ids[idx]);

                        if (hmac_id == SCTP_AUTH_HMAC_ID_SHA1)
                                break;
                }

                /* hmac_id holds the last identifier looked at (or its
                 * initial 0 when the list was empty).
                 */
                if (hmac_id != SCTP_AUTH_HMAC_ID_SHA1) {
                        sctp_process_inv_paramlength(asoc, param.p, chunk,
                                                     err_chunk);
                        verdict = SCTP_IERROR_ABORT;
                }
                break;
unhandled:
        default:
                pr_debug("%s: unrecognized param:%d for chunk:%d\n",
                         __func__, ntohs(param.p->type), cid);

                verdict = sctp_process_unk_param(asoc, param, chunk, err_chunk);
                break;
        }

        return verdict;
}

/* Verify the INIT packet before we process it.
 *
 * Returns 1 when the INIT / INIT-ACK may be processed further and 0 when
 * it must not be.  Where a problem is reported to the peer, the error
 * chunk is left in *@errp for the caller's normal error handling.
 */
int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep,
                     const struct sctp_association *asoc, enum sctp_cid cid,
                     struct sctp_init_chunk *peer_init,
                     struct sctp_chunk *chunk, struct sctp_chunk **errp)
{
        bool cookie_seen = false;
        union sctp_params param;
        int verdict;

        /* Check for missing mandatory parameters. Note: Initial TSN is
         * also mandatory, but is not checked here since the valid range
         * is 0..2**32-1. RFC4960, section 3.3.3.
         */
        if (peer_init->init_hdr.num_outbound_streams == 0 ||
            peer_init->init_hdr.num_inbound_streams == 0 ||
            peer_init->init_hdr.init_tag == 0 ||
            ntohl(peer_init->init_hdr.a_rwnd) < SCTP_DEFAULT_MINWINDOW)
                return sctp_process_inv_mandatory(asoc, chunk, errp);

        /* First pass: only note whether a State Cookie is present. */
        sctp_walk_params(param, peer_init) {
                if (param.p->type == SCTP_PARAM_STATE_COOKIE)
                        cookie_seen = true;
        }

        /* There is a possibility that a parameter length was bad and
         * in that case we would have stopped walking the parameters.
         * The current param.p would point at the bad one.
         * Current consensus on the mailing list is to generate a PROTOCOL
         * VIOLATION error.  We build the ERROR chunk here and let the normal
         * error handling code build and send the packet.
         */
        if (param.v != (void *)chunk->chunk_end)
                return sctp_process_inv_paramlength(asoc, param.p, chunk, errp);

        /* The only missing mandatory param possible today is
         * the state cookie for an INIT-ACK chunk.
         */
        if (cid == SCTP_CID_INIT_ACK && !cookie_seen)
                return sctp_process_missing_param(asoc, SCTP_PARAM_STATE_COOKIE,
                                                  chunk, errp);

        /* Second pass: verify all the variable length parameters. */
        sctp_walk_params(param, peer_init) {
                verdict = sctp_verify_param(net, ep, asoc, param, cid,
                                            chunk, errp);

                /* ABORT and out-of-memory reject the chunk outright. */
                if (verdict == SCTP_IERROR_ABORT ||
                    verdict == SCTP_IERROR_NOMEM)
                        return 0;

                /* ERROR stops the walk but still accepts the chunk. */
                if (verdict == SCTP_IERROR_ERROR)
                        return 1;

                /* Anything else: carry on with the next parameter. */
        }

        return 1;
}

/* Unpack the parameters in an INIT packet into an association.
 *
 * @asoc:      association being populated.
 * @chunk:     chunk the INIT arrived in (or the COOKIE ECHO carrying it).
 * @peer_addr: address to install as the first peer transport.
 * @peer_init: the peer's INIT / INIT-ACK chunk to read parameters from.
 * @gfp:       allocation flags for everything allocated on the way.
 *
 * Returns 0 on failure, else success.  On failure after the first
 * transport was added, all transports not in SCTP_ACTIVE state are removed
 * again.
 * FIXME:  This is an association method.
 */
int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
                      const union sctp_addr *peer_addr,
                      struct sctp_init_chunk *peer_init, gfp_t gfp)
{
        struct sctp_transport *transport;
        struct list_head *pos, *temp;
        union sctp_params param;
        union sctp_addr addr;
        struct sctp_af *af;
        int src_match = 0;

        /* We must include the address that the INIT packet came from.
         * This is the only address that matters for an INIT packet.
         * When processing a COOKIE ECHO, we retrieve the from address
         * of the INIT from the cookie.
         */

        /* This implementation defaults to making the first transport
         * added as the primary transport.  The source address seems to
         * be a better choice than any of the embedded addresses.
         */
        asoc->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port;
        if (!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE))
                goto nomem;

        if (sctp_cmp_addr_exact(sctp_source(chunk), peer_addr))
                src_match = 1;

        /* Process the initialization parameters.  */
        sctp_walk_params(param, peer_init) {
                /* Until the packet's source address has been matched, see
                 * whether this address parameter is the one.
                 */
                if (!src_match &&
                    (param.p->type == SCTP_PARAM_IPV4_ADDRESS ||
                     param.p->type == SCTP_PARAM_IPV6_ADDRESS)) {
                        af = sctp_get_af_specific(param_type2af(param.p->type));
                        /* Skip the parameter when there is no handler for
                         * its address family (af == NULL) or the address
                         * cannot be parsed, instead of dereferencing a
                         * NULL af.
                         */
                        if (!af || !af->from_addr_param(&addr, param.addr,
                                                        chunk->sctp_hdr->source, 0))
                                continue;
                        if (sctp_cmp_addr_exact(sctp_source(chunk), &addr))
                                src_match = 1;
                }

                if (!sctp_process_param(asoc, param, peer_addr, gfp))
                        goto clean_up;
        }

        /* source address of chunk may not match any valid address */
        if (!src_match)
                goto clean_up;

        /* AUTH: After processing the parameters, make sure that we
         * have all the required info to potentially do authentications.
         */
        if (asoc->peer.auth_capable && (!asoc->peer.peer_random ||
                                        !asoc->peer.peer_hmacs))
                asoc->peer.auth_capable = 0;

        /* In a non-backward compatible mode, if the peer claims
         * support for ADD-IP but not AUTH,  the ADD-IP spec states
         * that we MUST ABORT the association. Section 6.  The section
         * also give us an option to silently ignore the packet, which
         * is what we'll do here.
         */
        if (!asoc->base.net->sctp.addip_noauth &&
            (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) {
                asoc->peer.addip_disabled_mask |= (SCTP_PARAM_ADD_IP |
                                                  SCTP_PARAM_DEL_IP |
                                                  SCTP_PARAM_SET_PRIMARY);
                asoc->peer.asconf_capable = 0;
                goto clean_up;
        }

        /* Walk list of transports, removing transports in the UNKNOWN state. */
        list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
                transport = list_entry(pos, struct sctp_transport, transports);
                if (transport->state == SCTP_UNKNOWN) {
                        sctp_assoc_rm_peer(asoc, transport);
                }
        }

        /* The fixed INIT headers are always in network byte
         * order.
         */
        asoc->peer.i.init_tag =
                ntohl(peer_init->init_hdr.init_tag);
        asoc->peer.i.a_rwnd =
                ntohl(peer_init->init_hdr.a_rwnd);
        asoc->peer.i.num_outbound_streams =
                ntohs(peer_init->init_hdr.num_outbound_streams);
        asoc->peer.i.num_inbound_streams =
                ntohs(peer_init->init_hdr.num_inbound_streams);
        asoc->peer.i.initial_tsn =
                ntohl(peer_init->init_hdr.initial_tsn);

        asoc->strreset_inseq = asoc->peer.i.initial_tsn;

        /* Apply the upper bounds for output streams based on peer's
         * number of inbound streams.
         */
        if (asoc->c.sinit_num_ostreams  >
            ntohs(peer_init->init_hdr.num_inbound_streams)) {
                asoc->c.sinit_num_ostreams =
                        ntohs(peer_init->init_hdr.num_inbound_streams);
        }

        /* Likewise cap our inbound streams at the peer's outbound count. */
        if (asoc->c.sinit_max_instreams >
            ntohs(peer_init->init_hdr.num_outbound_streams)) {
                asoc->c.sinit_max_instreams =
                        ntohs(peer_init->init_hdr.num_outbound_streams);
        }

        /* Copy Initiation tag from INIT to VT_peer in cookie.   */
        asoc->c.peer_vtag = asoc->peer.i.init_tag;

        /* Peer Rwnd   : Current calculated value of the peer's rwnd.  */
        asoc->peer.rwnd = asoc->peer.i.a_rwnd;

        /* RFC 2960 7.2.1 The initial value of ssthresh MAY be arbitrarily
         * high (for example, implementations MAY use the size of the receiver
         * advertised window).
         */
        list_for_each_entry(transport, &asoc->peer.transport_addr_list,
                        transports) {
                transport->ssthresh = asoc->peer.i.a_rwnd;
        }

        /* Set up the TSN tracking pieces.  */
        if (!sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL,
                                asoc->peer.i.initial_tsn, gfp))
                goto clean_up;

        /* RFC 2960 6.5 Stream Identifier and Stream Sequence Number
         *
         * The stream sequence number in all the streams shall start
         * from 0 when the association is established.  Also, when the
         * stream sequence number reaches the value 65535 the next
         * stream sequence number shall be set to 0.
         */

        if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams,
                             asoc->c.sinit_max_instreams, gfp))
                goto clean_up;

        /* Update frag_point when stream_interleave may get changed. */
        sctp_assoc_update_frag_point(asoc);

        /* Non-temporary associations get an id assigned here. */
        if (!asoc->temp && sctp_assoc_set_id(asoc, gfp))
                goto clean_up;

        /* ADDIP Section 4.1 ASCONF Chunk Procedures
         *
         * When an endpoint has an ASCONF signaled change to be sent to the
         * remote endpoint it should do the following:
         * ...
         * A2) A serial number should be assigned to the Chunk. The serial
         * number should be a monotonically increasing number. All serial
         * numbers are defined to be initialized at the start of the
         * association to the same value as the Initial TSN.
         */
        asoc->peer.addip_serial = asoc->peer.i.initial_tsn - 1;
        return 1;

clean_up:
        /* Release the transport structures. */
        list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
                transport = list_entry(pos, struct sctp_transport, transports);
                if (transport->state != SCTP_ACTIVE)
                        sctp_assoc_rm_peer(asoc, transport);
        }

nomem:
        return 0;
}


/* Update asoc with the option described in param.
 *
 * RFC2960 3.3.2.1 Optional/Variable Length Parameters in INIT
 *
 * asoc is the association to update.
 * param is the variable length parameter to use for update.
 * peer_addr is the address of the peer; its scope filters which address
 * parameters are added, and its family is assumed to be supported by
 * the peer.
 * gfp is the allocation mode used when adding a peer transport or when
 * duplicating a parameter.
 *
 * Returns 1 when the parameter was applied or ignored, and 0 when adding
 * a peer address or duplicating a parameter failed.
 */
static int sctp_process_param(struct sctp_association *asoc,
                              union sctp_params param,
                              const union sctp_addr *peer_addr,
                              gfp_t gfp)
{
        struct sctp_endpoint *ep = asoc->ep;
        union sctp_addr_param *addr_param;
        struct net *net = asoc->base.net;
        struct sctp_transport *t;
        enum sctp_scope scope;
        union sctp_addr addr;
        struct sctp_af *af;
        int retval = 1, i;
        u32 stale;
        __u16 sat;

        /* We maintain all INIT parameters in network byte order all the
         * time.  This allows us to not worry about whether the parameters
         * came from a fresh INIT, and INIT ACK, or were stored in a cookie.
         */
        switch (param.p->type) {
        case SCTP_PARAM_IPV6_ADDRESS:
                /* v6 addresses are only used on a PF_INET6 socket. */
                if (PF_INET6 != asoc->base.sk->sk_family)
                        break;
                goto do_addr_param;

        case SCTP_PARAM_IPV4_ADDRESS:
                /* v4 addresses are not allowed on v6-only socket */
                if (ipv6_only_sock(asoc->base.sk))
                        break;
                /* Shared tail for both address types: convert the parameter
                 * to a socket address and, if it is in scope relative to
                 * peer_addr, add it as an unconfirmed peer transport.
                 */
do_addr_param:
                af = sctp_get_af_specific(param_type2af(param.p->type));
                if (!af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0))
                        break;
                scope = sctp_scope(peer_addr);
                if (sctp_in_scope(net, &addr, scope))
                        if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_UNCONFIRMED))
                                return 0;
                break;

        case SCTP_PARAM_COOKIE_PRESERVATIVE:
                /* Ignored unless cookie preservation is enabled for this
                 * network namespace.
                 */
                if (!net->sctp.cookie_preserve_enable)
                        break;

                stale = ntohl(param.life->lifespan_increment);

                /* Suggested Cookie Life span increment's unit is msec,
                 * (1/1000sec).
                 */
                asoc->cookie_life = ktime_add_ms(asoc->cookie_life, stale);
                break;

        case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES:
                /* Turn off the default values first so we'll know which
                 * ones are really set by the peer.
                 */
                asoc->peer.ipv4_address = 0;
                asoc->peer.ipv6_address = 0;

                /* Assume that peer supports the address family
                 * by which it sends a packet.
                 */
                if (peer_addr->sa.sa_family == AF_INET6)
                        asoc->peer.ipv6_address = 1;
                else if (peer_addr->sa.sa_family == AF_INET)
                        asoc->peer.ipv4_address = 1;

                /* Cycle through address types.  sat starts as the number of
                 * payload bytes after the parameter header and is then
                 * converted to a count of 16-bit type entries.
                 */
                sat = ntohs(param.p->length) - sizeof(struct sctp_paramhdr);
                if (sat)
                        sat /= sizeof(__u16);

                for (i = 0; i < sat; ++i) {
                        switch (param.sat->types[i]) {
                        case SCTP_PARAM_IPV4_ADDRESS:
                                asoc->peer.ipv4_address = 1;
                                break;

                        case SCTP_PARAM_IPV6_ADDRESS:
                                /* Only honoured on a PF_INET6 socket. */
                                if (PF_INET6 == asoc->base.sk->sk_family)
                                        asoc->peer.ipv6_address = 1;
                                break;

                        default: /* Just ignore anything else.  */
                                break;
                        }
                }
                break;

        case SCTP_PARAM_STATE_COOKIE:
                /* Replace any previously stored cookie with a copy of this
                 * parameter's body.  If the copy fails, cookie is left NULL
                 * while cookie_len keeps the new length, and 0 is returned.
                 */
                asoc->peer.cookie_len =
                        ntohs(param.p->length) - sizeof(struct sctp_paramhdr);
                kfree(asoc->peer.cookie);
                asoc->peer.cookie = kmemdup(param.cookie->body, asoc->peer.cookie_len, gfp);
                if (!asoc->peer.cookie)
                        retval = 0;
                break;

        case SCTP_PARAM_HEARTBEAT_INFO:
                /* Would be odd to receive, but it causes no problems. */
                break;

        case SCTP_PARAM_UNRECOGNIZED_PARAMETERS:
                /* Rejected during verify stage. */
                break;

        case SCTP_PARAM_ECN_CAPABLE:
                if (asoc->ep->ecn_enable) {
                        asoc->peer.ecn_capable = 1;
                        break;
                }
                /* ECN is disabled on this endpoint: log and ignore. */
                goto fall_through;


        case SCTP_PARAM_ADAPTATION_LAYER_IND:
                asoc->peer.adaptation_ind = ntohl(param.aind->adaptation_ind);
                break;

        case SCTP_PARAM_SET_PRIMARY:
                /* Only honoured when ASCONF is enabled on the endpoint. */
                if (!ep->asconf_enable)
                        goto fall_through;

                /* The address parameter follows the fixed ADDIP header. */
                addr_param = param.v + sizeof(struct sctp_addip_param);

                af = sctp_get_af_specific(param_type2af(addr_param->p.type));
                if (!af)
                        break;

                if (!af->from_addr_param(&addr, addr_param,
                                         htons(asoc->peer.port), 0))
                        break;

                if (!af->addr_valid(&addr, NULL, NULL))
                        break;

                /* The address must already be a known peer transport. */
                t = sctp_assoc_lookup_paddr(asoc, &addr);
                if (!t)
                        break;

                sctp_assoc_set_primary(asoc, t);
                break;

        case SCTP_PARAM_SUPPORTED_EXT:
                sctp_process_ext_param(asoc, param);
                break;

        case SCTP_PARAM_FWD_TSN_SUPPORT:
                if (asoc->ep->prsctp_enable) {
                        asoc->peer.prsctp_capable = 1;
                        break;
                }
                /* PR-SCTP is disabled on this endpoint: log and ignore. */
                goto fall_through;

        case SCTP_PARAM_RANDOM:
                /* The three AUTH parameters below are only stored when
                 * authentication is enabled on the endpoint.
                 */
                if (!ep->auth_enable)
                        goto fall_through;

                /* Save peer's random parameter */
                kfree(asoc->peer.peer_random);
                asoc->peer.peer_random = kmemdup(param.p,
                                            ntohs(param.p->length), gfp);
                if (!asoc->peer.peer_random) {
                        retval = 0;
                        break;
                }
                break;

        case SCTP_PARAM_HMAC_ALGO:
                if (!ep->auth_enable)
                        goto fall_through;

                /* Save peer's HMAC list */
                kfree(asoc->peer.peer_hmacs);
                asoc->peer.peer_hmacs = kmemdup(param.p,
                                            ntohs(param.p->length), gfp);
                if (!asoc->peer.peer_hmacs) {
                        retval = 0;
                        break;
                }

                /* Set the default HMAC the peer requested*/
                sctp_auth_asoc_set_default_hmac(asoc, param.hmac_algo);
                break;

        case SCTP_PARAM_CHUNKS:
                if (!ep->auth_enable)
                        goto fall_through;

                /* Save the peer's chunk list parameter. */
                kfree(asoc->peer.peer_chunks);
                asoc->peer.peer_chunks = kmemdup(param.p,
                                            ntohs(param.p->length), gfp);
                if (!asoc->peer.peer_chunks)
                        retval = 0;
                break;
        /* Also reached by goto from cases whose feature is disabled. */
fall_through:
        default:
                /* Any unrecognized parameters should have been caught
                 * and handled by sctp_verify_param() which should be
                 * called prior to this routine.  Simply log the error
                 * here.
                 */
                pr_debug("%s: ignoring param:%d for association:%p.\n",
                         __func__, ntohs(param.p->type), asoc);
                break;
        }

        return retval;
}

/* Pick a fresh, non-zero verification tag.
 *
 * A tag of 0 is reserved for special cases (e.g. INIT), so random
 * values are drawn until a non-zero one comes up.  @ep is not consulted.
 */
__u32 sctp_generate_tag(const struct sctp_endpoint *ep)
{
        __u32 tag = 0;

        while (!tag)
                get_random_bytes(&tag, sizeof(tag));

        return tag;
}

/* Pick the initial TSN to send during startup.
 *
 * The random value is returned exactly as drawn, with no filtering.
 * @ep is not consulted.
 */
__u32 sctp_generate_tsn(const struct sctp_endpoint *ep)
{
        __u32 tsn;

        get_random_bytes(&tsn, sizeof(tsn));

        return tsn;
}

/*
 * ADDIP 3.1.1 Address Configuration Change Chunk (ASCONF)
 *      0                   1                   2                   3
 *      0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     | Type = 0xC1   |  Chunk Flags  |      Chunk Length             |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                       Serial Number                           |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                    Address Parameter                          |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                     ASCONF Parameter #1                       |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     \                                                               \
 *     /                             ....                              /
 *     \                                                               \
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                     ASCONF Parameter #N                       |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * Only the serial number and the Address Parameter are added by this
 * function; the ASCONF parameters are appended by the caller.
 */
static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc,
                                           union sctp_addr *addr,
                                           int vparam_len)
{
        struct sctp_addiphdr asconf;
        struct sctp_chunk *retval;
        int length = sizeof(asconf) + vparam_len;
        union sctp_addr_param addrparam;
        int addrlen;
        struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family);

        addrlen = af->to_addr_param(addr, &addrparam);
        if (!addrlen)
                return NULL;
        length += addrlen;

        /* Create the chunk.  */
        retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length,
                                   GFP_ATOMIC);
        if (!retval)
                return NULL;

        asconf.serial = htonl(asoc->addip_serial++);

        retval->subh.addip_hdr =
                sctp_addto_chunk(retval, sizeof(asconf), &asconf);
        retval->param_hdr.v =
                sctp_addto_chunk(retval, addrlen, &addrparam);

        return retval;
}

/* ADDIP
 * 3.2.1 Add IP Address
 *         0                   1                   2                   3
 *         0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |        Type = 0xC001          |    Length = Variable          |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |               ASCONF-Request Correlation ID                   |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                       Address Parameter                       |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * 3.2.2 Delete IP Address
 *         0                   1                   2                   3
 *         0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |        Type = 0xC002          |    Length = Variable          |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |               ASCONF-Request Correlation ID                   |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                       Address Parameter                       |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 */
struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc,
                                              union sctp_addr *laddr,
                                              struct sockaddr *addrs,
                                              int addrcnt, __be16 flags)
{
        const int hdr_len = sizeof(struct sctp_addip_param);
        union sctp_addr_param wire_addr;
        struct sctp_addip_param req;
        struct sctp_chunk *chunk;
        bool del_reserved = false;
        union sctp_addr *cur;
        struct sctp_af *af;
        int wire_len = 0;
        int payload = 0;
        void *cursor;
        int i;

        /* Pass 1: size the payload.  @addrs is a packed array of socket
         * addresses, each advanced over by its own family's sockaddr_len.
         */
        cursor = addrs;
        for (i = 0; i < addrcnt; i++) {
                cur = cursor;
                af = sctp_get_af_specific(cur->v4.sin_family);
                wire_len = af->to_addr_param(cur, &wire_addr);
                cursor += af->sockaddr_len;

                payload += hdr_len + wire_len;

                if (!asoc->asconf_addr_del_pending || del_reserved)
                        continue;

                /* A delete is pending: reserve one extra request, sized
                 * like the address just measured.  Done at most once.
                 */
                payload += hdr_len + wire_len;
                del_reserved = true;

                pr_debug("%s: picked same-scope del_pending addr, "
                         "totallen for all addresses is %d\n",
                         __func__, payload);
        }

        /* Build the ASCONF chunk with room for all of the requests. */
        chunk = sctp_make_asconf(asoc, laddr, payload);
        if (!chunk)
                return NULL;

        /* Pass 2: emit one request per address, using the array index as
         * the correlation id and @flags as the request type.
         */
        cursor = addrs;
        for (i = 0; i < addrcnt; i++) {
                cur = cursor;
                af = sctp_get_af_specific(cur->v4.sin_family);
                wire_len = af->to_addr_param(cur, &wire_addr);

                req.param_hdr.type = flags;
                req.param_hdr.length = htons(hdr_len + wire_len);
                req.crr_id = htonl(i);

                sctp_addto_chunk(chunk, hdr_len, &req);
                sctp_addto_chunk(chunk, wire_len, &wire_addr);

                cursor += af->sockaddr_len;
        }

        /* The pending delete only rides along with an ADD IP request. */
        if (flags != SCTP_PARAM_ADD_IP || !del_reserved)
                return chunk;

        cur = asoc->asconf_addr_del_pending;
        af = sctp_get_af_specific(cur->v4.sin_family);
        wire_len = af->to_addr_param(cur, &wire_addr);

        req.param_hdr.type = SCTP_PARAM_DEL_IP;
        req.param_hdr.length = htons(hdr_len + wire_len);
        req.crr_id = htonl(i);

        sctp_addto_chunk(chunk, hdr_len, &req);
        sctp_addto_chunk(chunk, wire_len, &wire_addr);

        return chunk;
}

/* ADDIP
 * 3.2.4 Set Primary IP Address
 *        0                   1                   2                   3
 *        0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |        Type =0xC004           |    Length = Variable          |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |               ASCONF-Request Correlation ID                   |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                       Address Parameter                       |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * Create an ASCONF chunk with Set Primary IP address parameter.
 */
struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc,
                                             union sctp_addr *addr)
{
        struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family);
        union sctp_addr_param addrparam;
        struct sctp_addip_param        param;
        struct sctp_chunk *retval;
        int len = sizeof(param);
        int addrlen;

        addrlen = af->to_addr_param(addr, &addrparam);
        if (!addrlen)
                return NULL;
        len += addrlen;

        /* Create the chunk and make asconf header. */
        retval = sctp_make_asconf(asoc, addr, len);
        if (!retval)
                return NULL;

        param.param_hdr.type = SCTP_PARAM_SET_PRIMARY;
        param.param_hdr.length = htons(len);
        param.crr_id = 0;

        sctp_addto_chunk(retval, sizeof(param), &param);
        sctp_addto_chunk(retval, addrlen, &addrparam);

        return retval;
}

/* ADDIP 3.1.2 Address Configuration Acknowledgement Chunk (ASCONF-ACK)
 *      0                   1                   2                   3
 *      0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     | Type = 0x80   |  Chunk Flags  |      Chunk Length             |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                       Serial Number                           |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                 ASCONF Parameter Response#1                   |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     \                                                               \
 *     /                             ....                              /
 *     \                                                               \
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *     |                 ASCONF Parameter Response#N                   |
 *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * Create an ASCONF_ACK chunk with enough space for the parameter responses.
 */
static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *asoc,
                                               __u32 serial, int vparam_len)
{
        struct sctp_addiphdr hdr;
        struct sctp_chunk *ack;
        int paylen = sizeof(hdr) + vparam_len;

        /* Room for the serial number header plus @vparam_len bytes of
         * responses that the caller appends afterwards.
         */
        ack = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, paylen,
                                GFP_ATOMIC);
        if (ack) {
                /* @serial is in host order; it goes out in network order. */
                hdr.serial = htonl(serial);
                ack->subh.addip_hdr = sctp_addto_chunk(ack, sizeof(hdr),
                                                       &hdr);
        }

        return ack;
}

/* Append the response for one ASCONF request to an ASCONF_ACK chunk.
 *
 * With SCTP_ERROR_NO_ERROR only a success report carrying @crr_id is
 * added.  Otherwise an error cause indication is added, followed by the
 * error cause header and, when @asconf_param is non-NULL, a copy of the
 * request that failed.
 */
static void sctp_add_asconf_response(struct sctp_chunk *chunk, __be32 crr_id,
                                     __be16 err_code,
                                     struct sctp_addip_param *asconf_param)
{
        const bool failed = err_code != SCTP_ERROR_NO_ERROR;
        struct sctp_addip_param rsp;
        struct sctp_errhdr cause;
        int cause_len = 0;
        int tlv_len = 0;

        /* Extra payload only exists for a failure report. */
        if (failed) {
                cause_len = sizeof(cause);
                if (asconf_param)
                        tlv_len = ntohs(asconf_param->param_hdr.length);
        }

        /* Success Indication or Error Cause Indication parameter. */
        rsp.param_hdr.type = failed ? SCTP_PARAM_ERR_CAUSE :
                                      SCTP_PARAM_SUCCESS_REPORT;
        rsp.param_hdr.length = htons(sizeof(rsp) + cause_len + tlv_len);
        rsp.crr_id = crr_id;
        sctp_addto_chunk(chunk, sizeof(rsp), &rsp);

        if (!failed)
                return;

        /* Error cause header; its length also covers the echoed request. */
        cause.cause = err_code;
        cause.length = htons(cause_len + tlv_len);
        sctp_addto_chunk(chunk, cause_len, &cause);

        /* Echo the failed TLV taken from the ASCONF chunk. */
        if (asconf_param)
                sctp_addto_chunk(chunk, tlv_len, asconf_param);
}

/* Process a asconf parameter. */
static __be16 sctp_process_asconf_param(struct sctp_association *asoc,
                                        struct sctp_chunk *asconf,
                                        struct sctp_addip_param *asconf_param)
{
        union sctp_addr_param *addr_param;
        struct sctp_transport *peer;
        union sctp_addr        addr;
        struct sctp_af *af;

        addr_param = (void *)asconf_param + sizeof(*asconf_param);

        if (asconf_param->param_hdr.type != SCTP_PARAM_ADD_IP &&
            asconf_param->param_hdr.type != SCTP_PARAM_DEL_IP &&
            asconf_param->param_hdr.type != SCTP_PARAM_SET_PRIMARY)
                return SCTP_ERROR_UNKNOWN_PARAM;

        switch (addr_param->p.type) {
        case SCTP_PARAM_IPV6_ADDRESS:
                if (!asoc->peer.ipv6_address)
                        return SCTP_ERROR_DNS_FAILED;
                break;
        case SCTP_PARAM_IPV4_ADDRESS:
                if (!asoc->peer.ipv4_address)
                        return SCTP_ERROR_DNS_FAILED;
                break;
        default:
                return SCTP_ERROR_DNS_FAILED;
        }

        af = sctp_get_af_specific(param_type2af(addr_param->p.type));
        if (unlikely(!af))
                return SCTP_ERROR_DNS_FAILED;

        if (!af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0))
                return SCTP_ERROR_DNS_FAILED;

        /* ADDIP 4.2.1  This parameter MUST NOT contain a broadcast
         * or multicast address.
         * (note: wildcard is permitted and requires special handling so
         *  make sure we check for that)
         */
        if (!af->is_any(&addr) && !af->addr_valid(&addr, NULL, asconf->skb))
                return SCTP_ERROR_DNS_FAILED;

        switch (asconf_param->param_hdr.type) {
        case SCTP_PARAM_ADD_IP:
                /* Section 4.2.1:
                 * If the address 0.0.0.0 or ::0 is provided, the source
                 * address of the packet MUST be added.
                 */
                if (af->is_any(&addr))
                        memcpy(&addr, &asconf->source, sizeof(addr));

                if (security_sctp_bind_connect(asoc->ep->base.sk,
                                               SCTP_PARAM_ADD_IP,
                                               (struct sockaddr *)&addr,
                                               af->sockaddr_len))
                        return SCTP_ERROR_REQ_REFUSED;

                /* ADDIP 4.3 D9) If an endpoint receives an ADD IP address
                 * request and does not have the local resources to add this
                 * new address to the association, it MUST return an Error
                 * Cause TLV set to the new error code 'Operation Refused
                 * Due to Resource Shortage'.
                 */

                peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC, SCTP_UNCONFIRMED);
                if (!peer)
                        return SCTP_ERROR_RSRC_LOW;

                /* Start the heartbeat timer. */
                sctp_transport_reset_hb_timer(peer);
                asoc->new_transport = peer;
                break;
        case SCTP_PARAM_DEL_IP:
                /* ADDIP 4.3 D7) If a request is received to delete the
                 * last remaining IP address of a peer endpoint, the receiver
                 * MUST send an Error Cause TLV with the error cause set to the
                 * new error code 'Request to Delete Last Remaining IP Address'.
                 */
                if (asoc->peer.transport_count == 1)
                        return SCTP_ERROR_DEL_LAST_IP;

                /* ADDIP 4.3 D8) If a request is received to delete an IP
                 * address which is also the source address of the IP packet
                 * which contained the ASCONF chunk, the receiver MUST reject
                 * this request. To reject the request the receiver MUST send
                 * an Error Cause TLV set to the new error code 'Request to
                 * Delete Source IP Address'
                 */
                if (sctp_cmp_addr_exact(&asconf->source, &addr))
                        return SCTP_ERROR_DEL_SRC_IP;

                /* Section 4.2.2
                 * If the address 0.0.0.0 or ::0 is provided, all
                 * addresses of the peer except        the source address of the
                 * packet MUST be deleted.
                 */
                if (af->is_any(&addr)) {
                        sctp_assoc_set_primary(asoc, asconf->transport);
                        sctp_assoc_del_nonprimary_peers(asoc,
                                                        asconf->transport);
                        return SCTP_ERROR_NO_ERROR;
                }

                /* If the address is not part of the association, the
                 * ASCONF-ACK with Error Cause Indication Parameter
                 * which including cause of Unresolvable Address should
                 * be sent.
                 */
                peer = sctp_assoc_lookup_paddr(asoc, &addr);
                if (!peer)
                        return SCTP_ERROR_DNS_FAILED;

                sctp_assoc_rm_peer(asoc, peer);
                break;
        case SCTP_PARAM_SET_PRIMARY:
                /* ADDIP Section 4.2.4
                 * If the address 0.0.0.0 or ::0 is provided, the receiver
                 * MAY mark the source address of the packet as its
                 * primary.
                 */
                if (af->is_any(&addr))
                        memcpy(&addr, sctp_source(asconf), sizeof(addr));

                if (security_sctp_bind_connect(asoc->ep->base.sk,
                                               SCTP_PARAM_SET_PRIMARY,
                                               (struct sockaddr *)&addr,
                                               af->sockaddr_len))
                        return SCTP_ERROR_REQ_REFUSED;

                peer = sctp_assoc_lookup_paddr(asoc, &addr);
                if (!peer)
                        return SCTP_ERROR_DNS_FAILED;

                sctp_assoc_set_primary(asoc, peer);
                break;
        }

        return SCTP_ERROR_NO_ERROR;
}

/* Validate the parameters of an ADDIP chunk before it is processed.
 *
 * @addr_param_needed says whether the chunk must begin with an address
 * parameter; when it is false, no address parameter may be present at all.
 * @errp is updated to point at each parameter as it is examined, so after
 * a failure it refers to the most recently examined parameter.
 *
 * Returns true only if every parameter passes its checks and the walk
 * ends exactly at the end of the chunk.
 */
bool sctp_verify_asconf(const struct sctp_association *asoc,
                        struct sctp_chunk *chunk, bool addr_param_needed,
                        struct sctp_paramhdr **errp)
{
        struct sctp_addip_chunk *addip;
        bool addr_param_seen = false;
        union sctp_params param;

        addip = (struct sctp_addip_chunk *)chunk->chunk_hdr;
        sctp_walk_params(param, addip) {
                size_t plen = ntohs(param.p->length);
                size_t want;

                *errp = param.p;
                switch (param.p->type) {
                case SCTP_PARAM_ERR_CAUSE:
                        break;
                case SCTP_PARAM_IPV4_ADDRESS:
                case SCTP_PARAM_IPV6_ADDRESS:
                        want = param.p->type == SCTP_PARAM_IPV4_ADDRESS ?
                               sizeof(struct sctp_ipv4addr_param) :
                               sizeof(struct sctp_ipv6addr_param);
                        if (plen != want)
                                return false;
                        /* Only one address parameter is accepted, and it
                         * must be the very first parameter of the chunk.
                         */
                        if (param.v != (addip + 1))
                                return false;
                        addr_param_seen = true;
                        break;
                case SCTP_PARAM_ADD_IP:
                case SCTP_PARAM_DEL_IP:
                case SCTP_PARAM_SET_PRIMARY:
                        /* In ASCONF chunks the address parameter has to
                         * come before any of these requests.
                         */
                        if (addr_param_needed && !addr_param_seen)
                                return false;
                        /* Must at least hold the request header plus the
                         * header of the parameter nested inside it.
                         */
                        plen = ntohs(param.addip->param_hdr.length);
                        if (plen < sizeof(struct sctp_addip_param) +
                                   sizeof(struct sctp_paramhdr))
                                return false;
                        break;
                case SCTP_PARAM_SUCCESS_REPORT:
                case SCTP_PARAM_ADAPTATION_LAYER_IND:
                        if (plen != sizeof(struct sctp_addip_param))
                                return false;
                        break;
                default:
                        /* Anything else is unknown here: reject. */
                        return false;
                }
        }

        /* The address parameter must be present exactly when needed. */
        if (addr_param_needed != addr_param_seen)
                return false;

        /* The walk has to stop precisely at the end of the chunk. */
        return param.v == chunk->chunk_end;
}

/* Process an incoming ASCONF chunk with the next expected serial no. and
 * return an ASCONF_ACK chunk to be sent in response.
 */
struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
                                       struct sctp_chunk *asconf)
{
        union sctp_addr_param *addr_param;
        struct sctp_addip_chunk *addip;
        struct sctp_chunk *asconf_ack;
        bool all_param_pass = true;
        struct sctp_addiphdr *hdr;
        int length = 0, chunk_len;
        union sctp_params param;
        __be16 err_code;
        __u32 serial;

        addip = (struct sctp_addip_chunk *)asconf->chunk_hdr;
        chunk_len = ntohs(asconf->chunk_hdr->length) -
                    sizeof(struct sctp_chunkhdr);
        hdr = (struct sctp_addiphdr *)asconf->skb->data;
        serial = ntohl(hdr->serial);

        /* Skip the addiphdr and store a pointer to address parameter.  */
        length = sizeof(*hdr);
        addr_param = (union sctp_addr_param *)(asconf->skb->data + length);
        chunk_len -= length;

        /* Skip the address parameter and store a pointer to the first
         * asconf parameter.
         */
        length = ntohs(addr_param->p.length);
        chunk_len -= length;

        /* create an ASCONF_ACK chunk.
         * Based on the definitions of parameters, we know that the size of
         * ASCONF_ACK parameters are less than or equal to the fourfold of ASCONF
         * parameters.
         */
        asconf_ack = sctp_make_asconf_ack(asoc, serial, chunk_len * 4);
        if (!asconf_ack)
                goto done;

        /* Process the TLVs contained within the ASCONF chunk. */
        sctp_walk_params(param, addip) {
                /* Skip preceding address parameters. */
                if (param.p->type == SCTP_PARAM_IPV4_ADDRESS ||
                    param.p->type == SCTP_PARAM_IPV6_ADDRESS)
                        continue;

                err_code = sctp_process_asconf_param(asoc, asconf,
                                                     param.addip);
                /* ADDIP 4.1 A7)
                 * If an error response is received for a TLV parameter,
                 * all TLVs with no response before the failed TLV are
                 * considered successful if not reported.  All TLVs after
                 * the failed response are considered unsuccessful unless
                 * a specific success indication is present for the parameter.
                 */
                if (err_code != SCTP_ERROR_NO_ERROR)
                        all_param_pass = false;
                if (!all_param_pass)
                        sctp_add_asconf_response(asconf_ack, param.addip->crr_id,
                                                 err_code, param.addip);

                /* ADDIP 4.3 D11) When an endpoint receiving an ASCONF to add
                 * an IP address sends an 'Out of Resource' in its response, it
                 * MUST also fail any subsequent add or delete requests bundled
                 * in the ASCONF.
                 */
                if (err_code == SCTP_ERROR_RSRC_LOW)
                        goto done;
        }
done:
        asoc->peer.addip_serial++;

        /* If we are sending a new ASCONF_ACK hold a reference to it in assoc
         * after freeing the reference to old asconf ack if any.
         */
        if (asconf_ack) {
                sctp_chunk_hold(asconf_ack);
                list_add_tail(&asconf_ack->transmitted_list,
                              &asoc->asconf_ack_list);
        }

        return asconf_ack;
}

/* Process an ASCONF parameter that is successfully acked. */
static void sctp_asconf_param_success(struct sctp_association *asoc,
                                      struct sctp_addip_param *asconf_param)
{
        struct sctp_bind_addr *bp = &asoc->base.bind_addr;
        union sctp_addr_param *addr_param;
        struct sctp_sockaddr_entry *saddr;
        struct sctp_transport *transport;
        union sctp_addr        addr;
        struct sctp_af *af;

        addr_param = (void *)asconf_param + sizeof(*asconf_param);

        /* We have checked the packet before, so we do not check again.        */
        af = sctp_get_af_specific(param_type2af(addr_param->p.type));
        if (!af->from_addr_param(&addr, addr_param, htons(bp->port), 0))
                return;

        switch (asconf_param->param_hdr.type) {
        case SCTP_PARAM_ADD_IP:
                /* This is always done in BH context with a socket lock
                 * held, so the list can not change.
                 */
                local_bh_disable();
                list_for_each_entry(saddr, &bp->address_list, list) {
                        if (sctp_cmp_addr_exact(&saddr->a, &addr))
                                saddr->state = SCTP_ADDR_SRC;
                }
                local_bh_enable();
                list_for_each_entry(transport, &asoc->peer.transport_addr_list,
                                transports) {
                        sctp_transport_dst_release(transport);
                }
                break;
        case SCTP_PARAM_DEL_IP:
                local_bh_disable();
                sctp_del_bind_addr(bp, &addr);
                if (asoc->asconf_addr_del_pending != NULL &&
                    sctp_cmp_addr_exact(asoc->asconf_addr_del_pending, &addr)) {
                        kfree(asoc->asconf_addr_del_pending);
                        asoc->asconf_addr_del_pending = NULL;
                }
                local_bh_enable();
                list_for_each_entry(transport, &asoc->peer.transport_addr_list,
                                transports) {
                        sctp_transport_dst_release(transport);
                }
                break;
        default:
                break;
        }
}

/* Look up the response carried in an ASCONF_ACK for one ASCONF parameter.
 *
 * @asconf_ack:   the received ASCONF_ACK chunk
 * @asconf_param: request parameter whose crr_id is searched for
 * @no_err:       non-zero selects SCTP_ERROR_NO_ERROR as the result when the
 *                ASCONF_ACK holds no response for this parameter, zero
 *                selects SCTP_ERROR_REQ_REFUSED
 *
 * For the first ASCONF_ACK parameter with a matching crr_id:
 *   - a success report returns SCTP_ERROR_NO_ERROR,
 *   - an error cause returns the cause code found behind the parameter
 *     header, or SCTP_ERROR_INV_PARAM if nothing follows the header,
 *   - any other parameter type returns SCTP_ERROR_INV_PARAM.
 *
 * ADDIP 4.1
 * A7) If an error response is received for a TLV parameter, all TLVs with no
 * response before the failed TLV are considered successful if not reported.
 * All TLVs after the failed response are considered unsuccessful unless a
 * specific success indication is present for the parameter.
 */
static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack,
                                       struct sctp_addip_param *asconf_param,
                                       int no_err)
{
        struct sctp_addip_param *ack_param;
        struct sctp_errhdr *cause_hdr;
        int remaining;
        int step;

        /* Payload bytes of the ASCONF_ACK behind the chunk header. */
        remaining = ntohs(asconf_ack->chunk_hdr->length) -
                    sizeof(struct sctp_chunkhdr);

        /* Step over the addiphdr to reach the first response parameter. */
        step = sizeof(struct sctp_addiphdr);
        ack_param = (struct sctp_addip_param *)(asconf_ack->skb->data + step);
        remaining -= step;

        while (remaining > 0) {
                if (ack_param->crr_id != asconf_param->crr_id) {
                        /* Not ours: advance by the parameter's own length. */
                        step = ntohs(ack_param->param_hdr.length);
                        ack_param = (void *)ack_param + step;
                        remaining -= step;
                        continue;
                }

                /* Matching correlation id: every branch below returns. */
                switch (ack_param->param_hdr.type) {
                case SCTP_PARAM_SUCCESS_REPORT:
                        return SCTP_ERROR_NO_ERROR;
                case SCTP_PARAM_ERR_CAUSE:
                        step = sizeof(*ack_param);
                        if (remaining - step <= 0)
                                return SCTP_ERROR_INV_PARAM;
                        cause_hdr = (void *)ack_param + step;
                        return cause_hdr->cause;
                default:
                        return SCTP_ERROR_INV_PARAM;
                }
        }

        /* No response present for this parameter. */
        return no_err ? SCTP_ERROR_NO_ERROR : SCTP_ERROR_REQ_REFUSED;
}

/* Process an incoming ASCONF_ACK chunk against the cached last ASCONF chunk.
 *
 * @asoc:       association whose addip_last_asconf is being acknowledged
 * @asconf_ack: the received ASCONF_ACK chunk
 *
 * Each parameter of the cached ASCONF is resolved to a result code and acted
 * on: success is applied through sctp_asconf_param_success(), an unknown
 * parameter type is ORed into asoc->peer.addip_disabled_mask, and an
 * out-of-resource answer is remembered for the return value.  The cached
 * ASCONF is unlinked and freed before returning.
 *
 * Returns 1 if any parameter was answered with SCTP_ERROR_RSRC_LOW,
 * 0 otherwise.
 */
int sctp_process_asconf_ack(struct sctp_association *asoc,
                            struct sctp_chunk *asconf_ack)
{
        struct sctp_chunk *sent = asoc->addip_last_asconf;
        union sctp_addr_param *lookup_addr;
        struct sctp_addip_param *req;
        int remaining = sent->skb->len;
        int rsrc_low = 0;
        int implicit_ok;
        int no_err = 1;
        __be16 result;
        int step;

        /* Skip the chunkhdr and addiphdr of the last ASCONF sent; the
         * address parameter starts right behind them.
         */
        step = sizeof(struct sctp_addip_chunk);
        lookup_addr = (union sctp_addr_param *)(sent->skb->data + step);
        remaining -= step;

        /* Skip that address parameter to reach the first ASCONF parameter. */
        step = ntohs(lookup_addr->p.length);
        req = (void *)lookup_addr + step;
        remaining -= step;

        /* ADDIP 4.1
         * A8) If there is no response(s) to specific TLV parameter(s), and no
         * failures are indicated, then all request(s) are considered
         * successful.
         */
        implicit_ok = asconf_ack->skb->len == sizeof(struct sctp_addiphdr);

        /* Process the TLVs contained in the last sent ASCONF chunk. */
        while (remaining > 0) {
                if (implicit_ok) {
                        result = SCTP_ERROR_NO_ERROR;
                } else {
                        result = sctp_get_asconf_response(asconf_ack, req,
                                                          no_err);
                        /* From the first failure on, unanswered parameters
                         * are no longer treated as successful.
                         */
                        if (result != SCTP_ERROR_NO_ERROR)
                                no_err = 0;
                }

                if (result == SCTP_ERROR_NO_ERROR) {
                        sctp_asconf_param_success(asoc, req);
                } else if (result == SCTP_ERROR_RSRC_LOW) {
                        rsrc_low = 1;
                } else if (result == SCTP_ERROR_UNKNOWN_PARAM) {
                        /* Disable sending this type of asconf parameter in
                         * future.
                         */
                        asoc->peer.addip_disabled_mask |= req->param_hdr.type;
                }
                /* Every other code (including SCTP_ERROR_REQ_REFUSED,
                 * SCTP_ERROR_DEL_LAST_IP and SCTP_ERROR_DEL_SRC_IP) needs
                 * no local action.
                 */

                /* Move on to the next ASCONF parameter. */
                step = ntohs(req->param_hdr.length);
                req = (void *)req + step;
                remaining -= step;
        }

        if (no_err && asoc->src_out_of_asoc_ok) {
                asoc->src_out_of_asoc_ok = 0;
                sctp_transport_immediate_rtx(asoc->peer.primary_path);
        }

        /* Free the cached last sent asconf chunk. */
        list_del_init(&sent->transmitted_list);
        sctp_chunk_free(sent);
        asoc->addip_last_asconf = NULL;

        return rsrc_low;
}

/* Make a FWD TSN chunk.
 *
 * @asoc:        association the chunk is built for
 * @new_cum_tsn: new cumulative TSN in host byte order; stored via htonl()
 * @nstreams:    number of entries in @skiplist
 * @skiplist:    stream/ssn pairs appended behind the header; the two fields
 *               are copied unchanged
 *
 * Returns the new chunk, or NULL if sctp_make_control() fails.
 */
struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
                                    __u32 new_cum_tsn, size_t nstreams,
                                    struct sctp_fwdtsn_skip *skiplist)
{
        struct sctp_chunk *retval = NULL;
        struct sctp_fwdtsn_hdr ftsn_hdr;
        struct sctp_fwdtsn_skip skip;
        size_t hint;
        size_t i;        /* same type as nstreams: no mixed-sign comparison */

        /* Ask for room for nstreams + 1 32-bit words. */
        hint = (nstreams + 1) * sizeof(__u32);

        retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint, GFP_ATOMIC);

        if (!retval)
                return NULL;

        ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn);
        retval->subh.fwdtsn_hdr =
                sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr);

        /* Append the skip entries one at a time. */
        for (i = 0; i < nstreams; i++) {
                skip.stream = skiplist[i].stream;
                skip.ssn = skiplist[i].ssn;
                sctp_addto_chunk(retval, sizeof(skip), &skip);
        }

        return retval;
}

/* Make an I-FWD-TSN chunk.
 *
 * @asoc:        association the chunk is built for
 * @new_cum_tsn: new cumulative TSN in host byte order; stored via htonl()
 * @nstreams:    number of entries in @skiplist
 * @skiplist:    skip entries appended behind the header in one copy
 *
 * Returns the new chunk, or NULL if sctp_make_control() fails.
 */
struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc,
                                     __u32 new_cum_tsn, size_t nstreams,
                                     struct sctp_ifwdtsn_skip *skiplist)
{
        struct sctp_chunk *retval = NULL;
        struct sctp_ifwdtsn_hdr ftsn_hdr;
        size_t hint;

        /* Size the request from exactly what is appended below: the header
         * plus nstreams skip entries.  The previous hint reserved only
         * sizeof(__u32) per entry, which did not match the
         * nstreams * sizeof(skiplist[0]) bytes actually copied.
         * NOTE(review): assumes the hint only sizes the allocation made by
         * sctp_make_control() - confirm.
         */
        hint = sizeof(ftsn_hdr) + nstreams * sizeof(skiplist[0]);

        retval = sctp_make_control(asoc, SCTP_CID_I_FWD_TSN, 0, hint,
                                   GFP_ATOMIC);
        if (!retval)
                return NULL;

        ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn);
        retval->subh.ifwdtsn_hdr =
                sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr);

        sctp_addto_chunk(retval, nstreams * sizeof(skiplist[0]), skiplist);

        return retval;
}

/* RE-CONFIG 3.1 (RE-CONFIG chunk)
 *   0                   1                   2                   3
 *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  | Type = 130    |  Chunk Flags  |      Chunk Length             |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  \                                                               \
 *  /                  Re-configuration Parameter                   /
 *  \                                                               \
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  \                                                               \
 *  /             Re-configuration Parameter (optional)             /
 *  \                                                               \
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 */
/* Create an empty RE-CONFIG chunk.
 *
 * @asoc:   association the chunk is built for
 * @length: payload size passed on to sctp_make_control()
 *
 * On success param_hdr.v of the new chunk points just past the
 * sctp_reconf_chunk header.  Returns NULL if sctp_make_control() fails.
 */
static struct sctp_chunk *sctp_make_reconf(const struct sctp_association *asoc,
                                           int length)
{
        struct sctp_chunk *chunk;

        chunk = sctp_make_control(asoc, SCTP_CID_RECONF, 0, length,
                                  GFP_ATOMIC);
        if (chunk) {
                struct sctp_reconf_chunk *hdr;

                hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
                chunk->param_hdr.v = (u8 *)&hdr[1];
        }

        return chunk;
}

/* RE-CONFIG 4.1 (STREAM OUT RESET)
 *   0                   1                   2                   3
 *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |     Parameter Type = 13       | Parameter Length = 16 + 2 * N |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |           Re-configuration Request Sequence Number            |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |           Re-configuration Response Sequence Number           |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |                Sender's Last Assigned TSN                     |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |  Stream Number 1 (optional)   |    Stream Number 2 (optional) |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  /                            ......                             /
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |  Stream Number N-1 (optional) |    Stream Number N (optional) |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * RE-CONFIG 4.2 (STREAM IN RESET)
 *   0                   1                   2                   3
 *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |     Parameter Type = 14       |  Parameter Length = 8 + 2 * N |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |          Re-configuration Request Sequence Number             |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |  Stream Number 1 (optional)   |    Stream Number 2 (optional) |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  /                            ......                             /
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |  Stream Number N-1 (optional) |    Stream Number N (optional) |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 */
/* Build a RE-CONFIG chunk holding a stream reset request.
 *
 * @asoc:        association the chunk is built for
 * @stream_num:  number of entries in @stream_list
 * @stream_list: stream numbers appended to each request that is emitted
 * @out:         emit an outgoing reset request parameter
 * @in:          emit an incoming reset request parameter
 *
 * When both are requested the out request comes first and the in request
 * uses the next request sequence number (strreset_outseq + out).
 * Returns NULL if the chunk cannot be created.
 */
struct sctp_chunk *sctp_make_strreset_req(
                                        const struct sctp_association *asoc,
                                        __u16 stream_num, __be16 *stream_list,
                                        bool out, bool in)
{
        __u16 list_bytes = stream_num * sizeof(__u16);
        struct sctp_chunk *chunk;
        __u16 outlen, inlen;

        /* A request that is not asked for contributes zero bytes. */
        outlen = out ? sizeof(struct sctp_strreset_outreq) + list_bytes : 0;
        inlen = in ? sizeof(struct sctp_strreset_inreq) + list_bytes : 0;

        chunk = sctp_make_reconf(asoc, SCTP_PAD4(outlen) + SCTP_PAD4(inlen));
        if (!chunk)
                return NULL;

        if (outlen) {
                struct sctp_strreset_outreq outreq;

                outreq.param_hdr.type = SCTP_PARAM_RESET_OUT_REQUEST;
                outreq.param_hdr.length = htons(outlen);
                outreq.request_seq = htonl(asoc->strreset_outseq);
                outreq.response_seq = htonl(asoc->strreset_inseq - 1);
                outreq.send_reset_at_tsn = htonl(asoc->next_tsn - 1);

                sctp_addto_chunk(chunk, sizeof(outreq), &outreq);

                if (list_bytes)
                        sctp_addto_chunk(chunk, list_bytes, stream_list);
        }

        if (inlen) {
                struct sctp_strreset_inreq inreq;

                inreq.param_hdr.type = SCTP_PARAM_RESET_IN_REQUEST;
                inreq.param_hdr.length = htons(inlen);
                inreq.request_seq = htonl(asoc->strreset_outseq + out);

                sctp_addto_chunk(chunk, sizeof(inreq), &inreq);

                if (list_bytes)
                        sctp_addto_chunk(chunk, list_bytes, stream_list);
        }

        return chunk;
}

/* RE-CONFIG 4.3 (SSN/TSN RESET ALL)
 *   0                   1                   2                   3
 *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |     Parameter Type = 15       |      Parameter Length = 8     |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |         Re-configuration Request Sequence Number              |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 */
struct sctp_chunk *sctp_make_strreset_tsnreq(
                                        const struct sctp_association *asoc)
{
        struct sctp_strreset_tsnreq tsnreq;
        __u16 length = sizeof(tsnreq);
        struct sctp_chunk *retval;

        retval = sctp_make_reconf(asoc, length);
        if (!retval)
                return NULL;

        tsnreq.param_hdr.type = SCTP_PARAM_RESET_TSN_REQUEST;
        tsnreq.param_hdr.length = htons(length);
        tsnreq.request_seq = htonl(asoc->strreset_outseq);

        sctp_addto_chunk(retval, sizeof(tsnreq), &tsnreq);

        return retval;
}

/* RE-CONFIG 4.5/4.6 (ADD STREAM)
 *   0                   1                   2                   3
 *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |     Parameter Type = 17       |      Parameter Length = 12    |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |          Re-configuration Request Sequence Number             |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |      Number of new streams    |         Reserved              |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 */
/* Build a RE-CONFIG chunk asking to add streams.
 *
 * @asoc: association the chunk is built for
 * @out:  number of outgoing streams to add; 0 omits the parameter
 * @in:   number of incoming streams to add; 0 omits the parameter
 *
 * When both parameters are emitted the add-out parameter comes first and the
 * add-in parameter uses the following request sequence number.
 * Returns NULL if the chunk cannot be created.
 */
struct sctp_chunk *sctp_make_strreset_addstrm(
                                        const struct sctp_association *asoc,
                                        __u16 out, __u16 in)
{
        struct sctp_strreset_addstrm param;
        __u16 size = sizeof(param);
        int nparams = (out ? 1 : 0) + (in ? 1 : 0);
        struct sctp_chunk *chunk;

        chunk = sctp_make_reconf(asoc, nparams * size);
        if (!chunk)
                return NULL;

        /* Fields that are the same for both parameter flavours. */
        param.param_hdr.length = htons(size);
        param.reserved = 0;

        if (out) {
                param.param_hdr.type = SCTP_PARAM_RESET_ADD_OUT_STREAMS;
                param.number_of_streams = htons(out);
                param.request_seq = htonl(asoc->strreset_outseq);

                sctp_addto_chunk(chunk, size, &param);
        }

        if (in) {
                param.param_hdr.type = SCTP_PARAM_RESET_ADD_IN_STREAMS;
                param.number_of_streams = htons(in);
                param.request_seq = htonl(asoc->strreset_outseq + !!out);

                sctp_addto_chunk(chunk, size, &param);
        }

        return chunk;
}

/* RE-CONFIG 4.4 (RESP)
 *   0                   1                   2                   3
 *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |     Parameter Type = 16       |      Parameter Length         |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |         Re-configuration Response Sequence Number             |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |                            Result                             |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 */
/* Build a RE-CONFIG chunk holding one reset response parameter.
 *
 * @asoc:   association the chunk is built for
 * @result: result code, host byte order
 * @sn:     response sequence number, host byte order
 *
 * Returns NULL if the chunk cannot be created.
 */
struct sctp_chunk *sctp_make_strreset_resp(const struct sctp_association *asoc,
                                           __u32 result, __u32 sn)
{
        struct sctp_strreset_resp reply;
        const __u16 reply_len = sizeof(reply);
        struct sctp_chunk *chunk;

        chunk = sctp_make_reconf(asoc, reply_len);
        if (chunk) {
                reply.param_hdr.type = SCTP_PARAM_RESET_RESPONSE;
                reply.param_hdr.length = htons(reply_len);
                reply.response_seq = htonl(sn);
                reply.result = htonl(result);

                sctp_addto_chunk(chunk, sizeof(reply), &reply);
        }

        return chunk;
}

/* RE-CONFIG 4.4 OPTIONAL (TSNRESP)
 *   0                   1                   2                   3
 *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |     Parameter Type = 16       |      Parameter Length         |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |         Re-configuration Response Sequence Number             |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |                            Result                             |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |                   Sender's Next TSN (optional)                |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *  |                  Receiver's Next TSN (optional)               |
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 */
/* Build a RE-CONFIG chunk holding one reset response parameter that also
 * carries the two optional next-TSN fields.
 *
 * @asoc:         association the chunk is built for
 * @result:       result code, host byte order
 * @sn:           response sequence number, host byte order
 * @sender_tsn:   value stored as senders_next_tsn, host byte order
 * @receiver_tsn: value stored as receivers_next_tsn, host byte order
 *
 * Returns NULL if the chunk cannot be created.
 */
struct sctp_chunk *sctp_make_strreset_tsnresp(struct sctp_association *asoc,
                                              __u32 result, __u32 sn,
                                              __u32 sender_tsn,
                                              __u32 receiver_tsn)
{
        struct sctp_strreset_resptsn reply;
        const __u16 reply_len = sizeof(reply);
        struct sctp_chunk *chunk;

        chunk = sctp_make_reconf(asoc, reply_len);
        if (chunk) {
                reply.param_hdr.type = SCTP_PARAM_RESET_RESPONSE;
                reply.param_hdr.length = htons(reply_len);

                reply.response_seq = htonl(sn);
                reply.result = htonl(result);
                reply.senders_next_tsn = htonl(sender_tsn);
                reply.receivers_next_tsn = htonl(receiver_tsn);

                sctp_addto_chunk(chunk, sizeof(reply), &reply);
        }

        return chunk;
}

/* Validate the parameters of a received RE-CONFIG chunk.
 *
 * @asoc:  not used by this function
 * @chunk: the RE-CONFIG chunk to check
 * @errp:  set to each parameter as it is examined, so on failure it points
 *         at the parameter that was rejected
 *
 * Each parameter is checked for its length and for which parameter type, if
 * any, is allowed to precede it.  The chunk is also rejected once a fourth
 * parameter is reached, or when an unknown parameter type is seen.
 *
 * Returns true if every parameter passed, false otherwise.
 */
bool sctp_verify_reconf(const struct sctp_association *asoc,
                        struct sctp_chunk *chunk,
                        struct sctp_paramhdr **errp)
{
        struct sctp_reconf_chunk *reconf;
        union sctp_params param;
        __be16 prev = 0;        /* type of the previous parameter, 0 = none */
        __u16 seen = 0;

        reconf = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
        sctp_walk_params(param, reconf) {
                __u16 length = ntohs(param.p->length);

                *errp = param.p;
                if (seen++ > 2)
                        return false;

                switch (param.p->type) {
                case SCTP_PARAM_RESET_OUT_REQUEST:
                        if (length < sizeof(struct sctp_strreset_outreq))
                                return false;
                        if (prev && prev != SCTP_PARAM_RESET_RESPONSE &&
                            prev != SCTP_PARAM_RESET_IN_REQUEST)
                                return false;
                        break;
                case SCTP_PARAM_RESET_IN_REQUEST:
                        if (length < sizeof(struct sctp_strreset_inreq))
                                return false;
                        if (prev && prev != SCTP_PARAM_RESET_OUT_REQUEST)
                                return false;
                        break;
                case SCTP_PARAM_RESET_RESPONSE:
                        /* Only the plain and the TSN-carrying layouts are
                         * accepted.
                         */
                        if (length != sizeof(struct sctp_strreset_resp) &&
                            length != sizeof(struct sctp_strreset_resptsn))
                                return false;
                        if (prev && prev != SCTP_PARAM_RESET_RESPONSE &&
                            prev != SCTP_PARAM_RESET_OUT_REQUEST)
                                return false;
                        break;
                case SCTP_PARAM_RESET_TSN_REQUEST:
                        if (length != sizeof(struct sctp_strreset_tsnreq))
                                return false;
                        /* Must be the first parameter. */
                        if (prev)
                                return false;
                        break;
                case SCTP_PARAM_RESET_ADD_IN_STREAMS:
                        if (length != sizeof(struct sctp_strreset_addstrm))
                                return false;
                        if (prev && prev != SCTP_PARAM_RESET_ADD_OUT_STREAMS)
                                return false;
                        break;
                case SCTP_PARAM_RESET_ADD_OUT_STREAMS:
                        if (length != sizeof(struct sctp_strreset_addstrm))
                                return false;
                        if (prev && prev != SCTP_PARAM_RESET_ADD_IN_STREAMS)
                                return false;
                        break;
                default:
                        return false;
                }

                prev = param.p->type;
        }

        return true;
}



































































































































    1 





    1 


    1 

    1 






    1 

    1 





















































    1 































































































    2 










    2 




    1 











    1 





























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
// SPDX-License-Identifier: GPL-2.0-only
/* Event cache for netfilter. */

/*
 * (C) 2005 Harald Welte <laforge@gnumonks.org>
 * (C) 2005 Patrick McHardy <kaber@trash.net>
 * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/export.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_extend.h>

/* Held by nf_conntrack_register_notifier() around its access to
 * net->ct.nf_conntrack_event_cb.
 */
static DEFINE_MUTEX(nf_ct_ecache_mutex);

/* Nulls value used to initialise the private eviction list in
 * ecache_work_evict_list().
 */
#define DYING_NULLS_VAL                        ((1 << 30) + 1)
/* Added to jiffies to form the deadline of one ecache_work_evict_list() run. */
#define ECACHE_MAX_JIFFIES                msecs_to_jiffies(10)
/* Delay ecache_work() uses when rescheduling after STATE_CONGESTED. */
#define ECACHE_RETRY_JIFFIES                msecs_to_jiffies(10)

/* Result of one ecache_work_evict_list() run; ecache_work() turns it into a
 * reschedule decision.
 */
enum retry_state {
        /* nf_conntrack_event() failed; retried after ECACHE_RETRY_JIFFIES. */
        STATE_CONGESTED,
        /* Time check ended the run early; rescheduled with no delay. */
        STATE_RESTART,
        /* Dying list was walked to its end; nothing is rescheduled. */
        STATE_DONE,
};

struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net)
{
        struct nf_conntrack_net *cnet = nf_ct_pernet(net);

        return &cnet->ecache;
}
#if IS_MODULE(CONFIG_NF_CT_NETLINK)
EXPORT_SYMBOL_GPL(nf_conn_pernet_ecache);
#endif

/* Redeliver IPCT_DESTROY for the conntracks on the per-netns dying list.
 *
 * @cnet: per-netns conntrack data owning ecache.dying_list
 *
 * Each entry whose event is delivered is unlinked from the dying list and
 * parked on a private list; once the lock is dropped, every parked entry is
 * released with nf_ct_put().
 *
 * Returns STATE_CONGESTED if nf_conntrack_event() failed for an entry,
 * STATE_RESTART if the ECACHE_MAX_JIFFIES deadline passed before the list
 * was finished, and STATE_DONE if the list was walked to its end.
 */
static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet)
{
        unsigned long stop = jiffies + ECACHE_MAX_JIFFIES;
        struct hlist_nulls_head evicted_list;
        enum retry_state ret = STATE_DONE;
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_node *n;
        unsigned int sent;

        INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL);

next:
        sent = 0;
        spin_lock_bh(&cnet->ecache.dying_lock);

        hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) {
                struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

                /* The worker owns all entries, ct remains valid until nf_ct_put
                 * in the loop below.
                 */
                if (nf_conntrack_event(IPCT_DESTROY, ct)) {
                        ret = STATE_CONGESTED;
                        break;
                }

                /* Unlink via the ORIGINAL node, park via the REPLY node. */
                hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
                hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list);

                /* Stop once the deadline has passed.  The arguments used to
                 * be swapped (time_after(stop, jiffies)), which is true
                 * before the deadline and so ended every run after the
                 * first delivered entry.
                 */
                if (time_after(jiffies, stop)) {
                        ret = STATE_RESTART;
                        break;
                }

                /* Drop the lock now and then so others can make progress,
                 * then start over from the head of the dying list.
                 */
                if (sent++ > 16) {
                        spin_unlock_bh(&cnet->ecache.dying_lock);
                        cond_resched();
                        goto next;
                }
        }

        spin_unlock_bh(&cnet->ecache.dying_lock);

        /* Release everything that was parked above. */
        hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) {
                struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

                hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
                nf_ct_put(ct);

                cond_resched();
        }

        return ret;
}

/* Delayed-work handler: run one eviction pass over the dying list and
 * reschedule itself according to the outcome.
 */
static void ecache_work(struct work_struct *work)
{
        struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work);

        switch (ecache_work_evict_list(cnet)) {
        case STATE_CONGESTED:
                /* Delivery failed: try again after the retry interval. */
                schedule_delayed_work(&cnet->ecache.dwork,
                                      ECACHE_RETRY_JIFFIES);
                break;
        case STATE_RESTART:
                /* More entries are pending: run again without delay. */
                schedule_delayed_work(&cnet->ecache.dwork, 0);
                break;
        case STATE_DONE:
                /* Nothing left to do, so nothing is rescheduled. */
                break;
        }
}

/* Hand an event mask to the registered conntrack event notifier.
 *
 * @e:      event cache extension of the conntrack
 * @events: events to report now
 * @missed: previously missed events to report together with @events
 * @item:   event descriptor passed to the notifier
 *
 * Nothing is delivered when none of the bits are selected by e->ctmask or
 * when no notifier is registered.  After delivery e->missed is updated: on
 * failure @events are added to it, on success the @missed bits are cleared.
 *
 * Returns 0 when nothing had to be recorded, otherwise the notifier's
 * return value.
 */
static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e,
                                           const u32 events,
                                           const u32 missed,
                                           const struct nf_ct_event *item)
{
        const u32 pending = events | missed;
        struct net *net = nf_ct_net(item->ct);
        struct nf_ct_event_notifier *cb;
        u32 seen, next;
        int err;

        if ((pending & e->ctmask) == 0)
                return 0;

        rcu_read_lock();

        cb = rcu_dereference(net->ct.nf_conntrack_event_cb);
        if (!cb) {
                rcu_read_unlock();
                return 0;
        }

        err = cb->ct_event(pending, item);
        rcu_read_unlock();

        /* Common case: delivered and there was no backlog to clear. */
        if (likely(err >= 0 && missed == 0))
                return 0;

        /* Update e->missed with cmpxchg(), retrying until the value read
         * is still the one in place when the new value is stored.
         */
        for (;;) {
                seen = READ_ONCE(e->missed);
                next = err < 0 ? (seen | events) : (seen & ~missed);
                if (cmpxchg(&e->missed, seen, next) == seen)
                        break;
        }

        return err;
}

/* Report @events for a confirmed conntrack to the event notifier.
 *
 * @events: bit mask of events to report
 * @ct:     conntrack the events belong to
 * @portid: port id of the requester; used unless one is already stored in
 *          the event cache
 * @report: passed through unchanged in the event item
 *
 * Returns 0 if @ct is unconfirmed or has no event cache, otherwise the
 * result of __nf_conntrack_eventmask_report().
 */
int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
                                  u32 portid, int report)
{
        struct nf_conntrack_ecache *e;
        struct nf_ct_event item;
        unsigned int missed = 0;
        int ret;

        if (!nf_ct_is_confirmed(ct))
                return 0;

        e = nf_ct_ecache_find(ct);
        if (!e)
                return 0;

        memset(&item, 0, sizeof(item));

        item.ct = ct;
        item.report = report;
        if (e->portid) {
                /* A stored port id marks the resend of a destroy event:
                 * reuse it and skip the missed events.
                 */
                item.portid = e->portid;
        } else {
                item.portid = portid;
                missed = e->missed;
        }

        ret = __nf_conntrack_eventmask_report(e, events, missed, &item);
        if (likely(ret >= 0 || !(events & (1 << IPCT_DESTROY))))
                return ret;

        /* This is a destroy event that has been triggered by a process,
         * we store the PORTID to include it in the retransmission.
         */
        if (e->portid == 0 && portid != 0)
                e->portid = portid;

        return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report);

/* Flush the events cached for @ct to the notifier and reset the cache.
 * Nothing is done for unconfirmed or dying entries, or when @ct has no
 * event cache extension. Must be called with locally disabled softirqs.
 */
void nf_ct_deliver_cached_events(struct nf_conn *ct)
{
        struct nf_conntrack_ecache *ecache;
        struct nf_ct_event item;
        unsigned int pending;

        if (!nf_ct_is_confirmed(ct))
                return;
        if (nf_ct_is_dying(ct))
                return;

        ecache = nf_ct_ecache_find(ct);
        if (!ecache)
                return;

        /* Atomically fetch the cached event bits and clear them. */
        pending = xchg(&ecache->cache, 0);

        item.report = 0;
        item.portid = 0;
        item.ct = ct;

        /* The missed event mask is sampled without any lock, so missed
         * events may occasionally be sent twice. This is harmless and
         * happens very rarely.
         */
        __nf_conntrack_eventmask_report(ecache, pending, ecache->missed, &item);
}
EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);

/*
 * Deliver a single expectation @event for @exp to the registered notifier.
 *
 * The event is only sent when a notifier is registered for the netns of
 * @exp, the master conntrack has an event cache extension, and that
 * extension's expmask has the bit for @event set.
 */
void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
                               struct nf_conntrack_expect *exp,
                               u32 portid, int report)
{
        struct nf_ct_event_notifier *cb;
        struct nf_conntrack_ecache *ecache;
        struct net *net;

        net = nf_ct_exp_net(exp);

        rcu_read_lock();
        cb = rcu_dereference(net->ct.nf_conntrack_event_cb);
        if (cb) {
                ecache = nf_ct_ecache_find(exp->master);
                if (ecache && (ecache->expmask & (1 << event))) {
                        struct nf_exp_event item = {
                                .exp = exp,
                                .portid = portid,
                                .report = report,
                        };

                        cb->exp_event(1 << event, &item);
                }
        }
        rcu_read_unlock();
}

/*
 * Install @new as the conntrack event notifier of @net.
 *
 * The update is serialized by nf_ct_ecache_mutex. A one-time warning is
 * emitted if a notifier was already installed; @new is installed regardless.
 */
void nf_conntrack_register_notifier(struct net *net,
                                    const struct nf_ct_event_notifier *new)
{
        mutex_lock(&nf_ct_ecache_mutex);

        WARN_ON_ONCE(rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
                                               lockdep_is_held(&nf_ct_ecache_mutex)));
        rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new);

        mutex_unlock(&nf_ct_ecache_mutex);
}
EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);

/*
 * Remove the conntrack event notifier of @net by resetting the per-netns
 * callback pointer to NULL under nf_ct_ecache_mutex. No RCU grace period
 * is waited for here (see the comment at the end of the function).
 */
void nf_conntrack_unregister_notifier(struct net *net)
{
        mutex_lock(&nf_ct_ecache_mutex);
        RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
        mutex_unlock(&nf_ct_ecache_mutex);
        /* synchronize_rcu() is called after netns pre_exit */
}
EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);

/*
 * Drive the per-netns delayed work according to the outcome of a destroy
 * event delivery:
 *  - NFCT_ECACHE_DESTROY_FAIL: schedule the work in HZ jiffies unless it is
 *    already pending, and flag it as pending in the netns.
 *  - NFCT_ECACHE_DESTROY_SENT: rerun the work immediately while the dying
 *    list still has entries, otherwise clear the pending flag.
 */
void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
{
        struct nf_conntrack_net *cnet = nf_ct_pernet(net);

        switch (state) {
        case NFCT_ECACHE_DESTROY_FAIL:
                if (delayed_work_pending(&cnet->ecache.dwork))
                        break;
                schedule_delayed_work(&cnet->ecache.dwork, HZ);
                net->ct.ecache_dwork_pending = true;
                break;
        case NFCT_ECACHE_DESTROY_SENT:
                if (hlist_nulls_empty(&cnet->ecache.dying_list))
                        net->ct.ecache_dwork_pending = false;
                else
                        mod_delayed_work(system_wq, &cnet->ecache.dwork, 0);
                break;
        default:
                break;
        }
}

/*
 * Attach an event cache extension to @ct if the netns event mode asks for it.
 *
 * Event mode (net->ct.sysctl_events):
 *   0 - only allocate when @ctmask or @expmask was supplied explicitly
 *   1 - always allocate
 *   2 - allocate only while nf_ctnetlink_has_listener is set
 * In modes 1 and 2, empty masks are widened to "all events".
 *
 * Returns false only when the extension had to be allocated and the
 * allocation failed; every "no extension needed" path returns true.
 */
bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
{
        struct nf_conntrack_ecache *ecache;
        struct net *net = nf_ct_net(ct);
        bool have_masks = ctmask || expmask;

        switch (net->ct.sysctl_events) {
        case 0:
                /* assignment via template / ruleset? ignore sysctl. */
                if (!have_masks)
                        return true;
                break;
        case 2:
                /* autodetect: no event listener, don't allocate extension. */
                if (!READ_ONCE(nf_ctnetlink_has_listener))
                        return true;
                fallthrough;
        case 1:
                /* always allocate an extension. */
                if (!have_masks) {
                        ctmask = ~0;
                        expmask = ~0;
                }
                break;
        default:
                WARN_ON_ONCE(1);
                return true;
        }

        ecache = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
        if (!ecache)
                return false;

        ecache->ctmask = ctmask;
        ecache->expmask = expmask;
        return true;
}
EXPORT_SYMBOL_GPL(nf_ct_ecache_ext_add);

#define NF_CT_EVENTS_DEFAULT 2
static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;

/*
 * Initialize the event cache state of a network namespace: seed the event
 * mode from nf_ct_events and set up the delayed work, the dying list and
 * the lock that goes with it.
 */
void nf_conntrack_ecache_pernet_init(struct net *net)
{
        struct nf_conntrack_net *cnet;

        /* e->ctmask is u16 */
        BUILD_BUG_ON(__IPCT_MAX >= 16);

        cnet = nf_ct_pernet(net);
        net->ct.sysctl_events = nf_ct_events;

        spin_lock_init(&cnet->ecache.dying_lock);
        INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL);
        INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work);
}

/*
 * Tear down the event cache state of a network namespace: cancel the
 * per-netns delayed work and wait for it to finish if it is running.
 */
void nf_conntrack_ecache_pernet_fini(struct net *net)
{
        struct nf_conntrack_net *cnet = nf_ct_pernet(net);

        cancel_delayed_work_sync(&cnet->ecache.dwork);
}



















































































































































    8 





    4 




   11 




















    9 





























   10 




    9 

   12 










    4 














   13 

























    3 



    8 










    5 














    8 











    7 




   10 









   13 


   13 
   15 





































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/memblock.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
#include <linux/memcontrol.h>
#include <linux/sched/clock.h>

#include "internal.h"

/*
 * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack)
 * to use off-stack temporary storage
 */
#define PAGE_OWNER_STACK_DEPTH (16)

/*
 * Per-page tracking record kept in the page extension area
 * (see page_owner_ops and get_page_owner()).
 */
struct page_owner {
        /* Allocation order; rewritten by __split_page_owner(). */
        unsigned short order;
        /* Index into migrate_reason_names[], or -1 if never migrated. */
        short last_migrate_reason;
        /* GFP mask the page was allocated with. */
        gfp_t gfp_mask;
        /* Stack depot handle of the allocation stack trace. */
        depot_stack_handle_t handle;
        /* Stack depot handle of the stack trace of the last free. */
        depot_stack_handle_t free_handle;
        /* local_clock() timestamp taken at allocation. */
        u64 ts_nsec;
        /* local_clock() timestamp taken at free. */
        u64 free_ts_nsec;
        /* comm of the allocating task. */
        char comm[TASK_COMM_LEN];
        /* pid and tgid of the allocating task. */
        pid_t pid;
        pid_t tgid;
        /* pid and tgid recorded when the page was freed. */
        pid_t free_pid;
        pid_t free_tgid;
};

/* Node of the singly linked list of stack records known to page_owner. */
struct stack {
        struct stack_record *stack_record;
        struct stack *next;
};
/* Statically allocated nodes; init_page_owner() links them into stack_list. */
static struct stack dummy_stack;
static struct stack failure_stack;
/* List head; new nodes are pushed at the front by add_stack_record_to_list(). */
static struct stack *stack_list;
/* Serializes insertions into stack_list. */
static DEFINE_SPINLOCK(stack_list_lock);

/* Set from the "page_owner" early boot parameter. */
static bool page_owner_enabled __initdata;
/* Enabled at the end of init_page_owner(), once tracking is set up. */
DEFINE_STATIC_KEY_FALSE(page_owner_inited);

/* Returned by save_stack() when called recursively from page_owner code. */
static depot_stack_handle_t dummy_handle;
/* Used by save_stack() when the stack depot could not store the trace. */
static depot_stack_handle_t failure_handle;
/* Handle for early allocated pages; their refcount is never touched. */
static depot_stack_handle_t early_handle;

static void init_early_allocated_pages(void);

/*
 * Flag the current task as executing page_owner code. save_stack() checks
 * this flag and returns dummy_handle instead of recursing.
 */
static inline void set_current_in_page_owner(void)
{
        /*
         * Avoid recursion.
         *
         * We might need to allocate more memory from page_owner code, so make
         * sure to signal it in order to avoid recursion.
         */
        current->in_page_owner = 1;
}

/* Clear the recursion guard set by set_current_in_page_owner(). */
static inline void unset_current_in_page_owner(void)
{
        current->in_page_owner = 0;
}

/*
 * Parse the "page_owner" early parameter as a boolean into
 * page_owner_enabled. When tracking ends up enabled, also request early
 * initialization of the stack depot. Returns the kstrtobool() result.
 */
static int __init early_page_owner_param(char *buf)
{
        int err;

        err = kstrtobool(buf, &page_owner_enabled);
        if (!page_owner_enabled)
                return err;

        stack_depot_request_early_init();
        return err;
}
early_param("page_owner", early_page_owner_param);

/* .need callback of page_owner_ops: reports whether page_owner was enabled. */
static __init bool need_page_owner(void)
{
        return page_owner_enabled;
}

/*
 * Capture up to four frames of the current call stack (nothing skipped) and
 * store them in the stack depot. Returns the resulting depot handle.
 */
static __always_inline depot_stack_handle_t create_dummy_stack(void)
{
        unsigned long trace[4];
        unsigned int depth = stack_trace_save(trace, ARRAY_SIZE(trace), 0);

        return stack_depot_save(trace, depth, GFP_KERNEL);
}

/*
 * Create the stack depot entry behind dummy_handle.
 * NOTE(review): presumably noinline so that the trace captured by the
 * inlined create_dummy_stack() contains this function's own frame and
 * therefore differs from the other register_*_stack() helpers -- confirm.
 */
static noinline void register_dummy_stack(void)
{
        dummy_handle = create_dummy_stack();
}

/* Create the stack depot entry behind failure_handle. */
static noinline void register_failure_stack(void)
{
        failure_handle = create_dummy_stack();
}

/* Create the stack depot entry behind early_handle. */
static noinline void register_early_stack(void)
{
        early_handle = create_dummy_stack();
}

static __init void init_page_owner(void)
{
        if (!page_owner_enabled)
                return;

        register_dummy_stack();
        register_failure_stack();
        register_early_stack();
        init_early_allocated_pages();
        /* Initialize dummy and failure stacks and link them to stack_list */
        dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle);
        failure_stack.stack_record = __stack_depot_get_stack_record(failure_handle);
        if (dummy_stack.stack_record)
                refcount_set(&dummy_stack.stack_record->count, 1);
        if (failure_stack.stack_record)
                refcount_set(&failure_stack.stack_record->count, 1);
        dummy_stack.next = &failure_stack;
        stack_list = &dummy_stack;
        static_branch_enable(&page_owner_inited);
}

/*
 * page_ext client descriptor for page_owner: declares the size of the
 * per-page record and the need/init callbacks defined above.
 * get_page_owner() passes this descriptor to page_ext_data() to locate
 * the record.
 */
struct page_ext_operations page_owner_ops = {
        .size = sizeof(struct page_owner),
        .need = need_page_owner,
        .init = init_page_owner,
        .need_shared_flags = true,
};

/* Return the struct page_owner record that belongs to @page_ext. */
static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
{
        return page_ext_data(page_ext, &page_owner_ops);
}

/*
 * Capture the caller's stack trace (skipping two frames) and store it in the
 * stack depot using @flags for any allocation.
 *
 * Returns dummy_handle when invoked recursively from page_owner code, and
 * failure_handle when the depot could not provide a handle.
 */
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
        unsigned long trace[PAGE_OWNER_STACK_DEPTH];
        depot_stack_handle_t stack;
        unsigned int depth;

        /* Already inside page_owner code: do not recurse. */
        if (current->in_page_owner)
                return dummy_handle;

        set_current_in_page_owner();
        depth = stack_trace_save(trace, ARRAY_SIZE(trace), 2);
        stack = stack_depot_save(trace, depth, flags);
        unset_current_in_page_owner();

        return stack ? stack : failure_handle;
}

/*
 * Allocate a list node for @stack_record and push it onto the front of
 * stack_list. The node is allocated with the recursion guard set; if the
 * allocation fails the record is silently left out of the list.
 */
static void add_stack_record_to_list(struct stack_record *stack_record,
                                     gfp_t gfp_mask)
{
        struct stack *node;
        unsigned long irqflags;

        set_current_in_page_owner();
        node = kmalloc(sizeof(*node), gfp_nested_mask(gfp_mask));
        unset_current_in_page_owner();
        if (!node)
                return;

        node->stack_record = stack_record;

        spin_lock_irqsave(&stack_list_lock, irqflags);
        node->next = stack_list;
        /*
         * This pairs with smp_load_acquire() from function
         * stack_start(). This guarantees that stack_start()
         * will see an updated stack_list before starting to
         * traverse the list.
         */
        smp_store_release(&stack_list, node);
        spin_unlock_irqrestore(&stack_list_lock, irqflags);
}

/*
 * Account @nr_base_pages more pages to the stack record behind @handle.
 *
 * The first time a record is seen (its refcount still holds the
 * REFCOUNT_SATURATED marker) the count is reset to 1 and the record is
 * added to stack_list; @gfp_mask is used for that list node allocation.
 * Nothing happens if @handle does not resolve to a stack record.
 */
static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask,
                                   int nr_base_pages)
{
        struct stack_record *stack_record = __stack_depot_get_stack_record(handle);

        if (!stack_record)
                return;

        /*
         * New stack_record's that do not use STACK_DEPOT_FLAG_GET start
         * with REFCOUNT_SATURATED to catch spurious increments of their
         * refcount.
         * Since we do not use STACK_DEPOT_FLAG_GET API, let us
         * set a refcount of 1 ourselves.
         */
        if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) {
                int old = REFCOUNT_SATURATED;

                /* Only the caller that wins the cmpxchg links the record. */
                if (atomic_try_cmpxchg_relaxed(&stack_record->count.refs, &old, 1))
                        /* Add the new stack_record to our list */
                        add_stack_record_to_list(stack_record, gfp_mask);
        }
        refcount_add(nr_base_pages, &stack_record->count);
}

/*
 * Remove @nr_base_pages pages from the stack record behind @handle and warn
 * if that drops the refcount to zero. Nothing happens if @handle does not
 * resolve to a stack record.
 */
static void dec_stack_record_count(depot_stack_handle_t handle,
                                   int nr_base_pages)
{
        struct stack_record *rec;

        rec = __stack_depot_get_stack_record(handle);
        if (!rec)
                return;

        if (!refcount_sub_and_test(nr_base_pages, &rec->count))
                return;

        pr_warn("%s: refcount went to 0 for %u handle\n", __func__,
                handle);
}

/*
 * Write the allocation-side owner information into the records of all
 * 2^@order consecutive page extensions starting at @page_ext, and mark each
 * of them as owned and currently allocated.
 */
static inline void __update_page_owner_handle(struct page_ext *page_ext,
                                              depot_stack_handle_t handle,
                                              unsigned short order,
                                              gfp_t gfp_mask,
                                              short last_migrate_reason, u64 ts_nsec,
                                              pid_t pid, pid_t tgid, char *comm)
{
        struct page_owner *owner;
        int remaining;

        for (remaining = 1 << order; remaining > 0; remaining--) {
                owner = get_page_owner(page_ext);

                owner->handle = handle;
                owner->order = order;
                owner->gfp_mask = gfp_mask;
                owner->last_migrate_reason = last_migrate_reason;
                owner->pid = pid;
                owner->tgid = tgid;
                owner->ts_nsec = ts_nsec;
                strscpy(owner->comm, comm, sizeof(owner->comm));

                __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
                __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);

                page_ext = page_ext_next(page_ext);
        }
}

/*
 * Write the free-side owner information into the records of all 2^@order
 * consecutive page extensions starting at @page_ext.
 *
 * @handle:       stack depot handle of the free stack trace. When it is 0
 *                the stored free_handle and the PAGE_EXT_OWNER_ALLOCATED
 *                bit are left untouched (used when copying owner info).
 * @pid, @tgid:   task ids to record as the ones that freed the page.
 * @free_ts_nsec: timestamp to record for the free.
 *
 * The ids are taken from the arguments rather than from current, so that
 * __folio_copy_owner() can carry over the values stored in the old folio
 * instead of overwriting them with the ids of the migrating task.
 */
static inline void __update_page_owner_free_handle(struct page_ext *page_ext,
                                                   depot_stack_handle_t handle,
                                                   unsigned short order,
                                                   pid_t pid, pid_t tgid,
                                                   u64 free_ts_nsec)
{
        int i;
        struct page_owner *page_owner;

        for (i = 0; i < (1 << order); i++) {
                page_owner = get_page_owner(page_ext);
                /* Only __reset_page_owner() wants to clear the bit */
                if (handle) {
                        __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
                        page_owner->free_handle = handle;
                }
                page_owner->free_ts_nsec = free_ts_nsec;
                page_owner->free_pid = pid;
                page_owner->free_tgid = tgid;
                page_ext = page_ext_next(page_ext);
        }
}

void __reset_page_owner(struct page *page, unsigned short order)
{
        struct page_ext *page_ext;
        depot_stack_handle_t handle;
        depot_stack_handle_t alloc_handle;
        struct page_owner *page_owner;
        u64 free_ts_nsec = local_clock();

        page_ext = page_ext_get(page);
        if (unlikely(!page_ext))
                return;

        page_owner = get_page_owner(page_ext);
        alloc_handle = page_owner->handle;

        handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
        __update_page_owner_free_handle(page_ext, handle, order, current->pid,
                                        current->tgid, free_ts_nsec);
        page_ext_put(page_ext);

        if (alloc_handle != early_handle)
                /*
                 * early_handle is being set as a handle for all those
                 * early allocated pages. See init_pages_in_zone().
                 * Since their refcount is not being incremented because
                 * the machinery is not ready yet, we cannot decrement
                 * their refcount either.
                 */
                dec_stack_record_count(alloc_handle, 1 << order);
}

/*
 * Record the current task and call stack as the owner of the 2^@order block
 * starting at @page, and account the pages to that stack's refcount.
 * Nothing is recorded if the page has no page extension.
 */
noinline void __set_page_owner(struct page *page, unsigned short order,
                                        gfp_t gfp_mask)
{
        u64 now = local_clock();
        depot_stack_handle_t stack;
        struct page_ext *ext;

        stack = save_stack(gfp_mask);

        ext = page_ext_get(page);
        if (likely(ext)) {
                __update_page_owner_handle(ext, stack, order, gfp_mask, -1,
                                           now, current->pid, current->tgid,
                                           current->comm);
                page_ext_put(ext);
                inc_stack_record_count(stack, gfp_mask, 1 << order);
        }
}

void __set_page_owner_migrate_reason(struct page *page, int reason)
{
        struct page_ext *page_ext = page_ext_get(page);
        struct page_owner *page_owner;

        if (unlikely(!page_ext))
                return;

        page_owner = get_page_owner(page_ext);
        page_owner->last_migrate_reason = reason;
        page_ext_put(page_ext);
}

void __split_page_owner(struct page *page, int old_order, int new_order)
{
        int i;
        struct page_ext *page_ext = page_ext_get(page);
        struct page_owner *page_owner;

        if (unlikely(!page_ext))
                return;

        for (i = 0; i < (1 << old_order); i++) {
                page_owner = get_page_owner(page_ext);
                page_owner->order = new_order;
                page_ext = page_ext_next(page_ext);
        }
        page_ext_put(page_ext);
}

/*
 * Transfer page_owner information from folio @old to @newfolio.
 *
 * The allocation-side fields and the free-side pid/tgid/timestamp of @old
 * are copied into every base page of @newfolio. The allocation stack handle
 * that @newfolio had before the copy is then written into the records of
 * @old, i.e. the two folios swap allocation stacks. Returns early if either
 * folio has no page extension.
 */
void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
        int i;
        struct page_ext *old_ext;
        struct page_ext *new_ext;
        struct page_owner *old_page_owner;
        struct page_owner *new_page_owner;
        depot_stack_handle_t migrate_handle;

        old_ext = page_ext_get(&old->page);
        if (unlikely(!old_ext))
                return;

        new_ext = page_ext_get(&newfolio->page);
        if (unlikely(!new_ext)) {
                page_ext_put(old_ext);
                return;
        }

        old_page_owner = get_page_owner(old_ext);
        new_page_owner = get_page_owner(new_ext);
        /* Save the new folio's own stack before it gets overwritten below. */
        migrate_handle = new_page_owner->handle;
        __update_page_owner_handle(new_ext, old_page_owner->handle,
                                   old_page_owner->order, old_page_owner->gfp_mask,
                                   old_page_owner->last_migrate_reason,
                                   old_page_owner->ts_nsec, old_page_owner->pid,
                                   old_page_owner->tgid, old_page_owner->comm);
        /*
         * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio
         * will be freed after migration. Keep them until then as they may be
         * useful.
         */
        __update_page_owner_free_handle(new_ext, 0, old_page_owner->order,
                                        old_page_owner->free_pid,
                                        old_page_owner->free_tgid,
                                        old_page_owner->free_ts_nsec);
        /*
         * We linked the original stack to the new folio, we need to do the same
         * for the new one and the old folio otherwise there will be an imbalance
         * when subtracting those pages from the stack.
         */
        /* new_page_owner->order now holds the order copied from the old folio. */
        for (i = 0; i < (1 << new_page_owner->order); i++) {
                old_page_owner->handle = migrate_handle;
                old_ext = page_ext_next(old_ext);
                old_page_owner = get_page_owner(old_ext);
        }

        page_ext_put(new_ext);
        page_ext_put(old_ext);
}

/*
 * Count, per migratetype, the pageblocks of @zone that contain at least one
 * allocated page whose allocation migratetype (derived from the recorded
 * gfp_mask) differs from the pageblock's migratetype, and print one line
 * with the counts to @m. Mismatches found in CMA pageblocks are accounted
 * under MIGRATE_MOVABLE.
 */
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
                                       pg_data_t *pgdat, struct zone *zone)
{
        struct page *page;
        struct page_ext *page_ext;
        struct page_owner *page_owner;
        unsigned long pfn, block_end_pfn;
        unsigned long end_pfn = zone_end_pfn(zone);
        unsigned long count[MIGRATE_TYPES] = { 0, };
        int pageblock_mt, page_mt;
        int i;

        /* Scan block by block. First and last block may be incomplete */
        pfn = zone->zone_start_pfn;

        /*
         * Walk the zone in pageblock_nr_pages steps. If a page block spans
         * a zone boundary, it will be double counted between zones. This does
         * not matter as the mixed block count will still be correct
         */
        for (; pfn < end_pfn; ) {
                page = pfn_to_online_page(pfn);
                if (!page) {
                        /* Not online: jump to the next MAX_ORDER_NR_PAGES boundary. */
                        pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
                        continue;
                }

                block_end_pfn = pageblock_end_pfn(pfn);
                block_end_pfn = min(block_end_pfn, end_pfn);

                pageblock_mt = get_pageblock_migratetype(page);

                for (; pfn < block_end_pfn; pfn++) {
                        /* The pageblock is online, no need to recheck. */
                        page = pfn_to_page(pfn);

                        if (page_zone(page) != zone)
                                continue;

                        if (PageBuddy(page)) {
                                unsigned long freepage_order;

                                /* Free page: skip the whole buddy block if the order is sane. */
                                freepage_order = buddy_order_unsafe(page);
                                if (freepage_order <= MAX_PAGE_ORDER)
                                        pfn += (1UL << freepage_order) - 1;
                                continue;
                        }

                        if (PageReserved(page))
                                continue;

                        page_ext = page_ext_get(page);
                        if (unlikely(!page_ext))
                                continue;

                        if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
                                goto ext_put_continue;

                        page_owner = get_page_owner(page_ext);
                        page_mt = gfp_migratetype(page_owner->gfp_mask);
                        if (pageblock_mt != page_mt) {
                                if (is_migrate_cma(pageblock_mt))
                                        count[MIGRATE_MOVABLE]++;
                                else
                                        count[pageblock_mt]++;

                                /* One mismatch is enough: move on to the next block. */
                                pfn = block_end_pfn;
                                page_ext_put(page_ext);
                                break;
                        }
                        /* Skip the remaining pages of this allocation. */
                        pfn += (1UL << page_owner->order) - 1;
ext_put_continue:
                        page_ext_put(page_ext);
                }
        }

        /* Print counts */
        seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
        for (i = 0; i < MIGRATE_TYPES; i++)
                seq_printf(m, "%12lu ", count[i]);
        seq_putc(m, '\n');
}

/*
 * Append memory cgroup information about @page to @kbuf.
 *
 * @ret is the number of bytes already used in @kbuf (of size @count); the
 * updated length is returned. Without CONFIG_MEMCG, or when the page has no
 * memcg data, @ret is returned unchanged.
 */
static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
                                         struct page *page)
{
#ifdef CONFIG_MEMCG
        struct mem_cgroup *memcg;
        unsigned long data;
        char name[80];
        bool online;

        rcu_read_lock();
        data = READ_ONCE(page->memcg_data);
        if (data) {
                if (data & MEMCG_DATA_OBJEXTS)
                        ret += scnprintf(kbuf + ret, count - ret,
                                        "Slab cache page\n");

                memcg = page_memcg_check(page);
                if (memcg) {
                        online = (memcg->css.flags & CSS_ONLINE);
                        cgroup_name(memcg->css.cgroup, name, sizeof(name));
                        ret += scnprintf(kbuf + ret, count - ret,
                                        "Charged %sto %smemcg %s\n",
                                        PageMemcgKmem(page) ? "(via objcg) " : "",
                                        online ? "" : "offline ",
                                        name);
                }
        }
        rcu_read_unlock();
#endif /* CONFIG_MEMCG */

        return ret;
}

/*
 * Format the owner record of one page into a temporary kernel buffer of at
 * most min(@count, PAGE_SIZE) bytes and copy it to the user buffer @buf.
 *
 * Returns the number of bytes copied, -ENOMEM if the buffer could not be
 * allocated or the text did not fit, or -EFAULT if the copy to user space
 * failed.
 */
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
                struct page *page, struct page_owner *page_owner,
                depot_stack_handle_t handle)
{
        ssize_t result = -ENOMEM;
        int pageblock_mt, page_mt;
        char *kbuf;
        int ret;

        count = min_t(size_t, count, PAGE_SIZE);
        kbuf = kmalloc(count, GFP_KERNEL);
        if (!kbuf)
                return -ENOMEM;

        ret = scnprintf(kbuf, count,
                        "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns\n",
                        page_owner->order, page_owner->gfp_mask,
                        &page_owner->gfp_mask, page_owner->pid,
                        page_owner->tgid, page_owner->comm,
                        page_owner->ts_nsec);

        /* Print information relevant to grouping pages by mobility */
        pageblock_mt = get_pageblock_migratetype(page);
        page_mt = gfp_migratetype(page_owner->gfp_mask);
        ret += scnprintf(kbuf + ret, count - ret,
                        "PFN 0x%lx type %s Block %lu type %s Flags %pGp\n",
                        pfn,
                        migratetype_names[page_mt],
                        pfn >> pageblock_order,
                        migratetype_names[pageblock_mt],
                        &page->flags);

        ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
        if (ret >= count)
                goto out;

        if (page_owner->last_migrate_reason != -1)
                ret += scnprintf(kbuf + ret, count - ret,
                        "Page has been migrated, last migrate reason: %s\n",
                        migrate_reason_names[page_owner->last_migrate_reason]);

        ret = print_page_owner_memcg(kbuf, count, ret, page);

        /* Terminating blank line; fail if it no longer fits. */
        ret += snprintf(kbuf + ret, count - ret, "\n");
        if (ret >= count)
                goto out;

        if (copy_to_user(buf, kbuf, ret))
                result = -EFAULT;
        else
                result = ret;

out:
        kfree(kbuf);
        return result;
}

/*
 * Print the page_owner record of @page to the kernel log at alert level:
 * whether the page is tracked as allocated or freed, the allocation details,
 * the allocation and free stack traces, and the last migrate reason if any.
 */
void __dump_page_owner(const struct page *page)
{
        struct page_ext *page_ext = page_ext_get((void *)page);
        struct page_owner *owner;
        depot_stack_handle_t stack;
        gfp_t gfp_mask;
        int mt;

        if (unlikely(!page_ext)) {
                pr_alert("There is not page extension available.\n");
                return;
        }

        owner = get_page_owner(page_ext);
        gfp_mask = owner->gfp_mask;
        mt = gfp_migratetype(gfp_mask);

        if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
                pr_alert("page_owner info is not present (never set?)\n");
                goto out;
        }

        if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
                pr_alert("page_owner tracks the page as freed\n");
        else
                pr_alert("page_owner tracks the page as allocated\n");

        pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n",
                 owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
                 owner->pid, owner->tgid, owner->comm,
                 owner->ts_nsec, owner->free_ts_nsec);

        /* Allocation stack */
        stack = READ_ONCE(owner->handle);
        if (stack)
                stack_depot_print(stack);
        else
                pr_alert("page_owner allocation stack trace missing\n");

        /* Free stack */
        stack = READ_ONCE(owner->free_handle);
        if (stack) {
                pr_alert("page last free pid %d tgid %d stack trace:\n",
                          owner->free_pid, owner->free_tgid);
                stack_depot_print(stack);
        } else {
                pr_alert("page_owner free stack trace missing\n");
        }

        if (owner->last_migrate_reason != -1)
                pr_alert("page has been migrated, last migrate reason: %s\n",
                        migrate_reason_names[owner->last_migrate_reason]);
out:
        page_ext_put(page_ext);
}

/*
 * read_page_owner - ->read handler that emits one page_owner record per call.
 * @file:  opened file (not used by this function).
 * @buf:   user buffer handed on to print_page_owner().
 * @count: size of @buf, handed on to print_page_owner().
 * @ppos:  file offset, interpreted as the PFN to resume scanning from;
 *         0 means "start at min_low_pfn".
 *
 * Scans forward from *ppos for the next page whose page_ext has both
 * PAGE_EXT_OWNER and PAGE_EXT_OWNER_ALLOCATED set, that is aligned to its
 * recorded order, and that has a non-zero stack handle.  For that page, *ppos
 * is advanced to the following PFN and the result of print_page_owner() is
 * returned.
 *
 * Returns -EINVAL if page_owner is not initialised, and 0 when the scan
 * reaches max_pfn without finding another record.
 */
static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        unsigned long pfn;
        struct page *page;
        struct page_ext *page_ext;
        struct page_owner *page_owner;
        depot_stack_handle_t handle;

        if (!static_branch_unlikely(&page_owner_inited))
                return -EINVAL;

        page = NULL;
        if (*ppos == 0)
                pfn = min_low_pfn;
        else
                pfn = *ppos;
        /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
        while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
                pfn++;

        /* Find an allocated page */
        for (; pfn < max_pfn; pfn++) {
                /*
                 * This temporary page_owner is required so
                 * that we can avoid the context switches while holding
                 * the rcu lock and copying the page owner information to
                 * user through copy_to_user() or GFP_KERNEL allocations.
                 */
                struct page_owner page_owner_tmp;

                /*
                 * If the new page is in a new MAX_ORDER_NR_PAGES area,
                 * validate the area as existing, skip it if not
                 */
                if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
                        /* The loop's pfn++ supplies the final step to the next area. */
                        pfn += MAX_ORDER_NR_PAGES - 1;
                        continue;
                }

                page = pfn_to_page(pfn);
                if (PageBuddy(page)) {
                        unsigned long freepage_order = buddy_order_unsafe(page);

                        /*
                         * Free pages are never reported.  Skip the whole free
                         * block only when the order value is plausible;
                         * otherwise just step to the next PFN.
                         */
                        if (freepage_order <= MAX_PAGE_ORDER)
                                pfn += (1UL << freepage_order) - 1;
                        continue;
                }

                page_ext = page_ext_get(page);
                if (unlikely(!page_ext))
                        continue;

                /*
                 * Some pages could be missed by concurrent allocation or free,
                 * because we don't hold the zone lock.
                 */
                if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
                        goto ext_put_continue;

                /*
                 * Although we do have the info about past allocation of free
                 * pages, it's not relevant for current memory usage.
                 */
                if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
                        goto ext_put_continue;

                page_owner = get_page_owner(page_ext);

                /*
                 * Don't print "tail" pages of high-order allocations as that
                 * would inflate the stats.
                 */
                if (!IS_ALIGNED(pfn, 1 << page_owner->order))
                        goto ext_put_continue;

                /*
                 * Access to page_ext->handle isn't synchronous so we should
                 * be careful to access it.
                 */
                handle = READ_ONCE(page_owner->handle);
                if (!handle)
                        goto ext_put_continue;

                /* Record the next PFN to read in the file offset */
                *ppos = pfn + 1;

                /*
                 * Snapshot the record and drop the page_ext reference before
                 * printing, so print_page_owner() works on the private copy.
                 */
                page_owner_tmp = *page_owner;
                page_ext_put(page_ext);
                return print_page_owner(buf, count, pfn, page,
                                &page_owner_tmp, handle);
ext_put_continue:
                /* Every skip taken after page_ext_get() succeeded lands here. */
                page_ext_put(page_ext);
        }

        return 0;
}

/*
 * lseek_page_owner - ->llseek handler for the page_owner file.
 *
 * Only SEEK_SET (absolute) and SEEK_CUR (relative) are supported; any other
 * @orig yields -EINVAL and leaves the position untouched.  On success the
 * new file position is returned.
 */
static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig)
{
        if (orig == SEEK_SET)
                file->f_pos = offset;
        else if (orig == SEEK_CUR)
                file->f_pos += offset;
        else
                return -EINVAL;

        return file->f_pos;
}

/*
 * init_pages_in_zone - stamp pages of @zone that page_owner does not yet track.
 * @pgdat: node that owns @zone; only used for the summary message.
 * @zone:  zone whose PFN range [zone_start_pfn, zone_end_pfn) is walked.
 *
 * Every valid page that belongs to @zone, is not a free buddy page, is not
 * reserved and does not already have PAGE_EXT_OWNER set gets an owner record
 * written with early_handle via __update_page_owner_handle().  The number of
 * pages stamped is logged with pr_info() at the end.
 */
static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
{
        unsigned long pfn = zone->zone_start_pfn;
        unsigned long end_pfn = zone_end_pfn(zone);
        unsigned long count = 0;

        /*
         * Walk the zone in pageblock_nr_pages steps. If a page block spans
         * a zone boundary, it will be double counted between zones. This does
         * not matter as the mixed block count will still be correct
         */
        for (; pfn < end_pfn; ) {
                unsigned long block_end_pfn;

                if (!pfn_valid(pfn)) {
                        /* Jump to the start of the next MAX_ORDER_NR_PAGES area. */
                        pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
                        continue;
                }

                /* Never walk past the end of the zone within this block. */
                block_end_pfn = pageblock_end_pfn(pfn);
                block_end_pfn = min(block_end_pfn, end_pfn);

                for (; pfn < block_end_pfn; pfn++) {
                        struct page *page = pfn_to_page(pfn);
                        struct page_ext *page_ext;

                        if (page_zone(page) != zone)
                                continue;

                        /*
                         * To avoid having to grab zone->lock, be a little
                         * careful when reading buddy page order. The only
                         * danger is that we skip too much and potentially miss
                         * some early allocated pages, which is better than
                         * heavy lock contention.
                         */
                        if (PageBuddy(page)) {
                                unsigned long order = buddy_order_unsafe(page);

                                if (order > 0 && order <= MAX_PAGE_ORDER)
                                        pfn += (1UL << order) - 1;
                                continue;
                        }

                        if (PageReserved(page))
                                continue;

                        page_ext = page_ext_get(page);
                        if (unlikely(!page_ext))
                                continue;

                        /* Maybe overlapping zone */
                        if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
                                goto ext_put_continue;

                        /* Found early allocated page */
                        __update_page_owner_handle(page_ext, early_handle, 0, 0,
                                                   -1, local_clock(), current->pid,
                                                   current->tgid, current->comm);
                        count++;
ext_put_continue:
                        /* Pairs with the successful page_ext_get() above. */
                        page_ext_put(page_ext);
                }
                /* Offer a reschedule point after each page block. */
                cond_resched();
        }

        pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
                pgdat->node_id, zone->name, count);
}

/*
 * init_zones_in_node - run init_pages_in_zone() on each populated zone of
 * @pgdat, visiting the node's zone array in index order.
 */
static void init_zones_in_node(pg_data_t *pgdat)
{
        int idx;

        for (idx = 0; idx < MAX_NR_ZONES; idx++) {
                struct zone *zone = &pgdat->node_zones[idx];

                /* Zones without any pages have nothing to stamp. */
                if (populated_zone(zone))
                        init_pages_in_zone(pgdat, zone);
        }
}

/*
 * init_early_allocated_pages - call init_zones_in_node() for every node
 * yielded by for_each_online_pgdat().
 */
static void init_early_allocated_pages(void)
{
        pg_data_t *pgdat;

        for_each_online_pgdat(pgdat)
                init_zones_in_node(pgdat);
}

/*
 * File operations for the "page_owner" debugfs file: reads are served by
 * read_page_owner() and seeks by lseek_page_owner().
 */
static const struct file_operations proc_page_owner_operations = {
        .read                = read_page_owner,
        .llseek                = lseek_page_owner,
};

/*
 * stack_start - seq_file ->start callback for the stack list.
 *
 * Position -1UL is the end marker written by stack_next(), so it ends the
 * iteration.  Position 0 (re)starts from the list head and caches it in
 * m->private; any other position resumes from the cached cursor.
 */
static void *stack_start(struct seq_file *m, loff_t *ppos)
{
        struct stack *head;

        if (*ppos == -1UL)
                return NULL;

        /* Resuming: continue from the cursor saved in m->private. */
        if (*ppos)
                return m->private;

        /*
         * This pairs with smp_store_release() from function
         * add_stack_record_to_list(), so we get a consistent
         * value of stack_list.
         */
        head = smp_load_acquire(&stack_list);
        m->private = head;

        return head;
}

/*
 * stack_next - seq_file ->next callback for the stack list.
 *
 * Steps from @v to its ->next entry, remembers it in m->private and bumps
 * *ppos.  When the list is exhausted *ppos becomes -1UL, which
 * stack_start() treats as end of iteration.
 */
static void *stack_next(struct seq_file *m, void *v, loff_t *ppos)
{
        struct stack *following = ((struct stack *)v)->next;

        if (following)
                *ppos = *ppos + 1;
        else
                *ppos = -1UL;

        m->private = following;

        return following;
}

/*
 * Minimum nr_base_pages a stack must have for stack_print() to show it.
 * Read and written through page_owner_threshold_get()/_set().
 */
static unsigned long page_owner_pages_threshold;

/*
 * stack_print - seq_file ->show callback for one entry of the stack list.
 * @m: seq_file being filled.
 * @v: current iterator element, a struct stack *.
 *
 * Prints each saved address of the entry's stack record with %pS, followed
 * by an "nr_base_pages: N" line.  Nothing is printed when the entry has no
 * stack record, when N is below 1, or when N is below
 * page_owner_pages_threshold.
 *
 * Always returns 0.
 */
static int stack_print(struct seq_file *m, void *v)
{
        struct stack *stack = v;
        struct stack_record *stack_record = stack->stack_record;
        unsigned long threshold;
        unsigned long *entries;
        unsigned long nr_entries, i;
        int nr_base_pages;

        if (!stack_record)
                return 0;

        nr_entries = stack_record->size;
        entries = stack_record->entries;
        /*
         * NOTE(review): the "- 1" presumably discounts a baseline reference
         * held by the record itself -- confirm against where ->count is
         * initialised.
         */
        nr_base_pages = refcount_read(&stack_record->count) - 1;

        /*
         * The threshold can be rewritten at any time through its attribute
         * file, whose accessors use READ_ONCE()/WRITE_ONCE(); use a matching
         * marked load here.
         */
        threshold = READ_ONCE(page_owner_pages_threshold);
        if (nr_base_pages < 1 || nr_base_pages < threshold)
                return 0;

        for (i = 0; i < nr_entries; i++)
                seq_printf(m, " %pS\n", (void *)entries[i]);
        seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages);

        return 0;
}

/* stack_stop - seq_file ->stop callback; intentionally does nothing. */
static void stack_stop(struct seq_file *m, void *v)
{
}

/* seq_file iterator callbacks used to walk and print the stack list. */
static const struct seq_operations page_owner_stack_op = {
        .start        = stack_start,
        .next        = stack_next,
        .stop        = stack_stop,
        .show        = stack_print
};

/*
 * page_owner_stack_open - ->open handler for the stack listing file.
 *
 * Sets up a seq_file driven by page_owner_stack_op.  A private size of 0 is
 * requested; stack_start()/stack_next() store their cursor in m->private.
 * Returns whatever seq_open_private() returns.
 */
static int page_owner_stack_open(struct inode *inode, struct file *file)
{
        return seq_open_private(file, &page_owner_stack_op, 0);
}

/*
 * File operations for the "show_stacks" debugfs file: a seq_file opened by
 * page_owner_stack_open() and served by the generic seq_file helpers.
 */
static const struct file_operations page_owner_stack_operations = {
        .open                = page_owner_stack_open,
        .read                = seq_read,
        .llseek                = seq_lseek,
        .release        = seq_release,
};

/*
 * page_owner_threshold_get - attribute "get" callback.
 *
 * Stores the current page_owner_pages_threshold in *@val; @data is unused.
 * Always returns 0.
 */
static int page_owner_threshold_get(void *data, u64 *val)
{
        unsigned long threshold = READ_ONCE(page_owner_pages_threshold);

        *val = threshold;

        return 0;
}

/*
 * page_owner_threshold_set - attribute "set" callback.
 *
 * Replaces page_owner_pages_threshold with @val (converted to unsigned
 * long); @data is unused.  Always returns 0.
 */
static int page_owner_threshold_set(void *data, u64 val)
{
        unsigned long threshold = val;

        WRITE_ONCE(page_owner_pages_threshold, threshold);

        return 0;
}

/*
 * Defines proc_page_owner_threshold, the operations object passed to
 * debugfs_create_file() for "count_threshold": values go through
 * page_owner_threshold_get()/_set() and are formatted with "%llu".
 */
DEFINE_SIMPLE_ATTRIBUTE(proc_page_owner_threshold, &page_owner_threshold_get,
                        &page_owner_threshold_set, "%llu");


/*
 * pageowner_init - create the page_owner debugfs entries.
 *
 * When page_owner is enabled this creates, at the debugfs root, the
 * "page_owner" file and a "page_owner_stacks" directory holding
 * "show_stacks" and "count_threshold".  When it is disabled only a message
 * is logged.  Always returns 0.
 */
static int __init pageowner_init(void)
{
        struct dentry *stacks_dir;

        if (static_branch_unlikely(&page_owner_inited)) {
                debugfs_create_file("page_owner", 0400, NULL, NULL,
                                    &proc_page_owner_operations);

                stacks_dir = debugfs_create_dir("page_owner_stacks", NULL);
                debugfs_create_file("show_stacks", 0400, stacks_dir, NULL,
                                    &page_owner_stack_operations);
                debugfs_create_file("count_threshold", 0600, stacks_dir, NULL,
                                    &proc_page_owner_threshold);
        } else {
                pr_info("page_owner is disabled\n");
        }

        return 0;
}
late_initcall(pageowner_init)


























































    1 


    1 


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Checksumming functions for IPv6
 *
 * Authors:        Jorge Cwik, <jorge@laser.satlink.net>
 *                Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *                Borrows very liberally from tcp.c and ip.c, see those
 *                files for more names.
 */

/*
 *        Fixes:
 *
 *        Ralf Baechle                        :        generic ipv6 checksum
 *        <ralf@waldorf-gmbh.de>
 */

#ifndef _CHECKSUM_IPV6_H
#define _CHECKSUM_IPV6_H

#include <asm/types.h>
#include <asm/byteorder.h>
#include <net/ip.h>
#include <asm/checksum.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/ipv6.h>

#ifndef _HAVE_ARCH_IPV6_CSUM
__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
                        const struct in6_addr *daddr,
                        __u32 len, __u8 proto, __wsum csum);
#endif

static inline __wsum ip6_compute_pseudo(struct sk_buff *skb, int proto)
{
        return ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
                                            &ipv6_hdr(skb)->daddr,
                                            skb->len, proto, 0));
}

/*
 * tcp_v6_check - csum_ipv6_magic() with the protocol fixed to IPPROTO_TCP.
 * @len:   length value fed into the pseudo-header.
 * @saddr: IPv6 source address.
 * @daddr: IPv6 destination address.
 * @base:  initial checksum to fold the pseudo-header into.
 */
static __inline__ __sum16 tcp_v6_check(int len,
                                   const struct in6_addr *saddr,
                                   const struct in6_addr *daddr,
                                   __wsum base)
{
        return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base);
}

static inline void __tcp_v6_send_check(struct sk_buff *skb,
                                       const struct in6_addr *saddr,
                                       const struct in6_addr *daddr)
{
        struct tcphdr *th = tcp_hdr(skb);

        th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0);
        skb->csum_start = skb_transport_header(skb) - skb->head;
        skb->csum_offset = offsetof(struct tcphdr, check);
}

static inline void tcp_v6_gso_csum_prep(struct sk_buff *skb)
{
        struct ipv6hdr *ipv6h = ipv6_hdr(skb);
        struct tcphdr *th = tcp_hdr(skb);

        ipv6h->payload_len = 0;
        th->check = ~tcp_v6_check(0, &ipv6h->saddr, &ipv6h->daddr, 0);
}

/*
 * udp_v6_check - csum_ipv6_magic() with the protocol fixed to IPPROTO_UDP.
 * @len:   length value fed into the pseudo-header.
 * @saddr: IPv6 source address.
 * @daddr: IPv6 destination address.
 * @base:  initial checksum to fold the pseudo-header into.
 */
static inline __sum16 udp_v6_check(int len,
                                   const struct in6_addr *saddr,
                                   const struct in6_addr *daddr,
                                   __wsum base)
{
        return csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, base);
}

void udp6_set_csum(bool nocheck, struct sk_buff *skb,
                   const struct in6_addr *saddr,
                   const struct in6_addr *daddr, int len);

int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto);
#endif




































    1 






































































    3 
    2 

















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_NULLS_H
#define _LINUX_RCULIST_NULLS_H

#ifdef __KERNEL__

/*
 * RCU-protected list version
 */
#include <linux/list_nulls.h>
#include <linux/rcupdate.h>

/**
 * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_nulls_unhashed() on the node returns true after this. It is
 * useful for RCU based read lockfree traversal if the writer side
 * must know if the list entry is still hashed or already unhashed.
 *
 * In particular, it means that we can not poison the forward pointers
 * that may still be used for walking the hash list and we can only
 * zero the pprev pointer so hlist_nulls_unhashed() will return true after
 * this.
 *
 * The caller must take whatever precautions are necessary (such as
 * holding appropriate locks) to avoid racing with another
 * list-mutation primitive, such as hlist_nulls_add_head_rcu() or
 * hlist_nulls_del_rcu(), running on this same list.  However, it is
 * perfectly legal to run concurrently with the _rcu list-traversal
 * primitives, such as hlist_nulls_for_each_entry_rcu().
 */
static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
{
        /* Already off any list: nothing to unlink. */
        if (hlist_nulls_unhashed(n))
                return;

        __hlist_nulls_del(n);
        /* Clear only ->pprev; ->next is left for walkers still on this node. */
        WRITE_ONCE(n->pprev, NULL);
}

/**
 * hlist_nulls_first_rcu - returns the first element of the hash list.
 * @head: the head of the list.
 *
 * Expands to an lvalue: @head->first accessed through an __rcu-annotated
 * pointer type, so it can be passed to the RCU accessor macros.
 */
#define hlist_nulls_first_rcu(head) \
        (*((struct hlist_nulls_node __rcu __force **)&(head)->first))

/**
 * hlist_nulls_next_rcu - returns the element of the list after @node.
 * @node: element of the list.
 *
 * Expands to an lvalue: @node->next accessed through an __rcu-annotated
 * pointer type, so it can be passed to the RCU accessor macros.
 */
#define hlist_nulls_next_rcu(node) \
        (*((struct hlist_nulls_node __rcu __force **)&(node)->next))

/**
 * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_nulls_unhashed() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
 * or hlist_nulls_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_nulls_for_each_entry_rcu().
 */
static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n)
{
        /* Unlink @n; its ->next is not modified here. */
        __hlist_nulls_del(n);
        /* Poison only the back pointer. */
        WRITE_ONCE(n->pprev, LIST_POISON2);
}

/**
 * hlist_nulls_add_head_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist_nulls,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
 * or hlist_nulls_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
                                        struct hlist_nulls_head *h)
{
        struct hlist_nulls_node *first = h->first;

        /* Fully initialise @n before it becomes reachable from @h. */
        WRITE_ONCE(n->next, first);
        WRITE_ONCE(n->pprev, &h->first);
        /* Publish @n as the new first element. */
        rcu_assign_pointer(hlist_nulls_first_rcu(h), n);
        /* Re-point the old first node (if the list was not empty) back at @n. */
        if (!is_a_nulls(first))
                WRITE_ONCE(first->pprev, &n->next);
}

/**
 * hlist_nulls_add_tail_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist_nulls,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
 * or hlist_nulls_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
                                            struct hlist_nulls_head *h)
{
        struct hlist_nulls_node *i, *last = NULL;

        /* Note: write side code, so rcu accessors are not needed. */
        for (i = h->first; !is_a_nulls(i); i = i->next)
                last = i;

        /* Empty list: adding at the tail is the same as adding at the head. */
        if (!last) {
                hlist_nulls_add_head_rcu(n, h);
                return;
        }

        /*
         * Initialise @n completely before publishing it after @last.  @n
         * inherits @last's nulls marker as its ->next.  ->pprev is written
         * with WRITE_ONCE() like in hlist_nulls_add_head_rcu() and the
         * delete helpers, so every store to ->pprev in this file is marked.
         */
        WRITE_ONCE(n->next, last->next);
        WRITE_ONCE(n->pprev, &last->next);
        rcu_assign_pointer(hlist_nulls_next_rcu(last), n);
}

/*
 * Make @n look like the only member of a list: ->pprev points at its own
 * ->next and ->next holds a nulls marker.  After that hlist_nulls_del will
 * work.
 */
static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
{
        struct hlist_nulls_node *terminator =
                (struct hlist_nulls_node *)NULLS_MARKER(NULL);

        n->pprev = &n->next;
        n->next = terminator;
}

/**
 * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_nulls_node to use as a loop cursor.
 * @head:        the head of the list.
 * @member:        the name of the hlist_nulls_node within the struct.
 *
 * The loop ends when @pos is a nulls marker; @tpos is set to the entry
 * containing @pos before each pass of the loop body.
 *
 * The barrier() is needed to make sure compiler doesn't cache first element [1],
 * as this loop can be restarted [2]
 * [1] Documentation/memory-barriers.txt around line 1533
 * [2] Documentation/RCU/rculist_nulls.rst around line 146
 */
#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member)                        \
        for (({barrier();}),                                                        \
             pos = rcu_dereference_raw(hlist_nulls_first_rcu(head));                \
                (!is_a_nulls(pos)) &&                                                \
                ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
                pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)))

/**
 * hlist_nulls_for_each_entry_safe -
 *   iterate over list of given type safe against removal of list entry
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_nulls_node to use as a loop cursor.
 * @head:        the head of the list.
 * @member:        the name of the hlist_nulls_node within the struct.
 *
 * @pos is advanced to the following node inside the loop condition, before
 * the body runs, so the body never needs @tpos's own ->next afterwards.
 */
#define hlist_nulls_for_each_entry_safe(tpos, pos, head, member)                \
        for (({barrier();}),                                                        \
             pos = rcu_dereference_raw(hlist_nulls_first_rcu(head));                \
                (!is_a_nulls(pos)) &&                                                \
                ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member);        \
                   pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });)
#endif
#endif

























   13 
















   12 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
// SPDX-License-Identifier: GPL-2.0
#include <linux/fault-inject.h>
#include <linux/mm.h>

/*
 * Configuration for page-allocation fault injection, consulted by
 * __should_fail_alloc_page().
 */
static struct {
        /* Generic fault-injection attributes, passed to should_fail_ex(). */
        struct fault_attr attr;

        /* When true, requests with __GFP_HIGHMEM are never failed. */
        bool ignore_gfp_highmem;
        /* When true, requests with __GFP_DIRECT_RECLAIM are never failed. */
        bool ignore_gfp_reclaim;
        /* Requests with an order below this are never failed. */
        u32 min_order;
} fail_page_alloc = {
        .attr = FAULT_ATTR_INITIALIZER,
        .ignore_gfp_reclaim = true,
        .ignore_gfp_highmem = true,
        .min_order = 1,
};

/*
 * Handler for the "fail_page_alloc=" boot parameter: hands the option
 * string to setup_fault_attr() for fail_page_alloc.attr and returns its
 * result.
 */
static int __init setup_fail_page_alloc(char *str)
{
        return setup_fault_attr(&fail_page_alloc.attr, str);
}
__setup("fail_page_alloc=", setup_fail_page_alloc);

/*
 * __should_fail_alloc_page - decide whether to inject a page allocation
 * failure.
 * @gfp_mask: allocation flags of the request.
 * @order:    allocation order of the request.
 *
 * A request is exempt (returns false) when its order is below
 * fail_page_alloc.min_order, when it carries __GFP_NOFAIL, or when it
 * matches an enabled ignore_gfp_highmem / ignore_gfp_reclaim filter.
 * Otherwise the verdict comes from should_fail_ex(), called with
 * 1 << @order as the size and FAULT_NOWARN set for __GFP_NOWARN requests.
 */
bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
        /* See comment in __should_failslab() */
        int fault_flags = (gfp_mask & __GFP_NOWARN) ? FAULT_NOWARN : 0;

        if (order < fail_page_alloc.min_order || (gfp_mask & __GFP_NOFAIL))
                return false;

        if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
                return false;

        if (fail_page_alloc.ignore_gfp_reclaim &&
            (gfp_mask & __GFP_DIRECT_RECLAIM))
                return false;

        return should_fail_ex(&fail_page_alloc.attr, 1 << order, fault_flags);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

/*
 * fail_page_alloc_debugfs - expose the fail_page_alloc knobs in debugfs.
 *
 * Creates the "fail_page_alloc" fault-attribute directory and adds
 * "ignore-gfp-wait", "ignore-gfp-highmem" and "min-order" entries backed by
 * the matching fail_page_alloc fields.  Always returns 0.
 */
static int __init fail_page_alloc_debugfs(void)
{
        const umode_t perm = S_IFREG | 0600;
        struct dentry *parent;

        parent = fault_create_debugfs_attr("fail_page_alloc", NULL,
                                           &fail_page_alloc.attr);

        /* "ignore-gfp-wait" is backed by the ignore_gfp_reclaim field. */
        debugfs_create_bool("ignore-gfp-wait", perm, parent,
                            &fail_page_alloc.ignore_gfp_reclaim);
        debugfs_create_bool("ignore-gfp-highmem", perm, parent,
                            &fail_page_alloc.ignore_gfp_highmem);
        debugfs_create_u32("min-order", perm, parent,
                           &fail_page_alloc.min_order);

        return 0;
}

late_initcall(fail_page_alloc_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */





































































































































    1 








    1 



    1 







    1 
    1 
    1 








    1 






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
11112
11113
11114
11115
11116
11117
11118
11119
11120
11121
11122
11123
11124
11125
11126
11127
11128
11129
11130
11131
11132
11133
11134
11135
11136
11137
11138
11139
11140
11141
11142
11143
11144
11145
11146
11147
11148
11149
11150
11151
11152
11153
11154
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165
11166
11167
11168
11169
11170
11171
11172
11173
11174
11175
11176
11177
11178
11179
11180
11181
11182
11183
11184
11185
11186
11187
11188
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
11202
11203
11204
11205
11206
11207
11208
11209
11210
11211
11212
11213
11214
11215
11216
11217
11218
11219
11220
11221
11222
11223
11224
11225
11226
11227
11228
11229
11230
11231
11232
11233
11234
11235
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245
11246
11247
11248
11249
11250
11251
11252
11253
11254
11255
11256
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343
11344
11345
11346
11347
11348
11349
11350
11351
11352
11353
11354
11355
11356
11357
11358
11359
11360
11361
11362
11363
11364
11365
11366
11367
11368
11369
11370
11371
11372
11373
11374
11375
11376
11377
11378
11379
11380
11381
11382
11383
11384
11385
11386
11387
11388
11389
11390
11391
11392
11393
11394
11395
11396
11397
11398
11399
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409
11410
11411
11412
11413
11414
11415
11416
11417
11418
11419
11420
11421
11422
11423
11424
11425
11426
11427
11428
11429
11430
11431
11432
11433
11434
11435
11436
11437
11438
11439
11440
11441
11442
11443
11444
11445
11446
11447
11448
11449
11450
11451
11452
11453
11454
11455
11456
11457
11458
11459
11460
11461
11462
11463
11464
11465
11466
11467
11468
11469
11470
11471
11472
11473
11474
11475
11476
11477
11478
11479
11480
11481
11482
11483
11484
11485
11486
11487
11488
11489
11490
11491
11492
11493
11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510
11511
11512
11513
11514
11515
11516
11517
11518
11519
11520
11521
11522
11523
11524
11525
11526
11527
11528
11529
11530
11531
11532
11533
11534
11535
11536
11537
11538
11539
11540
11541
11542
11543
11544
11545
11546
11547
11548
11549
11550
11551
11552
11553
11554
11555
11556
11557
11558
11559
11560
11561
11562
11563
11564
11565
11566
11567
11568
11569
11570
11571
11572
11573
11574
11575
11576
11577
11578
11579
11580
11581
11582
11583
11584
11585
11586
11587
11588
11589
11590
11591
11592
11593
11594
11595
11596
11597
11598
11599
11600
11601
11602
11603
11604
11605
11606
11607
11608
11609
11610
11611
11612
11613
11614
11615
11616
11617
11618
11619
11620
11621
11622
11623
11624
11625
11626
11627
11628
11629
11630
11631
11632
11633
11634
11635
11636
11637
11638
11639
11640
11641
11642
11643
11644
11645
11646
11647
11648
11649
11650
11651
11652
11653
11654
11655
11656
11657
11658
11659
11660
11661
11662
11663
11664
11665
11666
11667
11668
11669
11670
11671
11672
11673
11674
11675
11676
11677
11678
11679
11680
11681
11682
11683
11684
11685
11686
11687
11688
11689
11690
11691
11692
11693
11694
11695
11696
11697
11698
11699
11700
11701
11702
11703
11704
11705
11706
11707
11708
11709
11710
11711
11712
11713
11714
11715
11716
11717
11718
11719
11720
11721
11722
11723
11724
11725
11726
11727
11728
11729
11730
11731
11732
11733
11734
11735
11736
11737
11738
11739
11740
11741
11742
11743
11744
11745
11746
11747
11748
11749
11750
11751
11752
11753
11754
11755
11756
11757
11758
11759
11760
11761
11762
11763
11764
11765
11766
11767
11768
11769
11770
11771
11772
11773
11774
11775
11776
11777
11778
11779
11780
11781
11782
11783
11784
11785
11786
11787
11788
11789
11790
11791
11792
11793
11794
11795
11796
11797
11798
11799
11800
11801
11802
11803
11804
11805
11806
11807
11808
11809
11810
11811
11812
11813
11814
11815
11816
11817
11818
11819
11820
11821
11822
11823
11824
11825
11826
11827
11828
11829
11830
11831
11832
11833
11834
11835
11836
11837
11838
11839
11840
11841
11842
11843
11844
11845
11846
11847
11848
11849
11850
11851
11852
11853
11854
11855
11856
11857
11858
11859
11860
11861
11862
11863
11864
11865
11866
11867
11868
11869
11870
11871
11872
11873
11874
11875
11876
11877
11878
11879
11880
11881
11882
11883
11884
11885
11886
11887
11888
11889
11890
11891
11892
11893
11894
11895
11896
11897
11898
11899
11900
11901
11902
11903
11904
11905
11906
11907
11908
11909
11910
11911
11912
11913
11914
11915
11916
11917
11918
11919
11920
11921
11922
11923
11924
11925
11926
11927
11928
11929
11930
11931
11932
11933
11934
11935
11936
11937
11938
11939
11940
11941
11942
11943
11944
11945
11946
11947
11948
11949
11950
11951
11952
11953
11954
11955
11956
11957
11958
11959
11960
11961
11962
11963
11964
11965
11966
11967
11968
11969
11970
11971
11972
11973
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985
11986
11987
11988
11989
11990
11991
11992
11993
11994
11995
11996
11997
11998
11999
12000
12001
12002
12003
12004
12005
12006
12007
12008
12009
12010
12011
12012
12013
12014
12015
12016
12017
12018
12019
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031
12032
12033
12034
12035
12036
12037
12038
12039
12040
12041
12042
12043
12044
12045
12046
12047
12048
12049
12050
12051
12052
12053
12054
12055
12056
12057
12058
12059
12060
12061
12062
12063
12064
12065
12066
12067
12068
12069
12070
12071
12072
12073
12074
12075
12076
12077
12078
12079
12080
12081
12082
12083
12084
12085
12086
12087
12088
12089
12090
12091
12092
12093
12094
12095
12096
12097
12098
12099
12100
12101
12102
12103
12104
12105
12106
12107
12108
12109
12110
12111
12112
12113
12114
12115
12116
12117
12118
12119
12120
12121
12122
12123
12124
12125
12126
12127
12128
12129
12130
12131
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *        Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *        Jay Schulist <jschlst@samba.org>
 *        Alexei Starovoitov <ast@plumgrid.com>
 *        Daniel Borkmann <dborkman@redhat.com>
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <linux/atomic.h>
#include <linux/bpf_verifier.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/sock_diag.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/gfp.h>
#include <net/inet_common.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <linux/skmsg.h>
#include <net/sock.h>
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/unaligned.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <net/sch_generic.h>
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
#include <net/dst.h>
#include <net/sock_reuseport.h>
#include <net/busy_poll.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/udp.h>
#include <linux/bpf_trace.h>
#include <net/xdp_sock.h>
#include <linux/inetdevice.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/flow.h>
#include <net/arp.h>
#include <net/ipv6.h>
#include <net/net_namespace.h>
#include <linux/seg6_local.h>
#include <net/seg6.h>
#include <net/seg6_local.h>
#include <net/lwtunnel.h>
#include <net/ipv6_stubs.h>
#include <net/bpf_sk_storage.h>
#include <net/transp_v6.h>
#include <linux/btf_ids.h>
#include <net/tls.h>
#include <net/xdp.h>
#include <net/mptcp.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netkit.h>
#include <linux/un.h>
#include <net/xdp_sock_drv.h>

#include "dev.h"

/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline.
 * The build fails if its size is anything other than exactly 64 bytes.
 */
static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check");

/* Forward declaration so that code placed ahead of the definition can
 * reference this helper-lookup function.
 */
static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);

/*
 * Fetch a classic BPF program descriptor from a sockptr_t buffer.
 *
 * @dst: descriptor to fill in
 * @src: buffer supplied by the caller of the socket option
 * @len: size of @src in bytes; must match the expected layout exactly
 *
 * In a compat syscall the buffer is read as a struct compat_sock_fprog
 * and widened into @dst (which is zeroed first); otherwise it is copied
 * verbatim as a struct sock_fprog.
 *
 * Returns 0 on success, -EINVAL on a length mismatch, -EFAULT if the
 * copy from @src fails.
 */
int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
{
        struct compat_sock_fprog compat_prog;

        if (!in_compat_syscall()) {
                /* Native layout: @src already is a struct sock_fprog. */
                if (len != sizeof(*dst))
                        return -EINVAL;
                if (copy_from_sockptr(dst, src, sizeof(*dst)))
                        return -EFAULT;
                return 0;
        }

        if (len != sizeof(compat_prog))
                return -EINVAL;
        if (copy_from_sockptr(&compat_prog, src, sizeof(compat_prog)))
                return -EFAULT;

        /* Widen the compat descriptor into the native one. */
        memset(dst, 0, sizeof(*dst));
        dst->len = compat_prog.len;
        dst->filter = compat_ptr(compat_prog.filter);

        return 0;
}
EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);

/**
 *        sk_filter_trim_cap - run a packet through a socket filter
 *        @sk: sock associated with &sk_buff
 *        @skb: buffer to filter
 *        @cap: limit on how short the eBPF program may trim the packet
 *
 * Socket level wrapper to bpf_prog_run. Before the attached filter is
 * consulted the skb has to pass, in this order, the pfmemalloc check
 * (-ENOMEM on failure), the cgroup inet ingress BPF hook and the LSM
 * receive hook (a non-zero result of either hook is returned as is).
 *
 * If a filter is attached, its return value is the packet length to
 * keep: 0 tosses the packet (-EPERM), anything else trims the skb to
 * max(@cap, pkt_len) through pskb_trim() and returns that call's
 * result. If skb->len is smaller than pkt_len we keep whole skb->data.
 *
 * Return: 0 if the packet should be accepted, a negative errno if it
 * should be tossed.
 */
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
{
        struct sk_filter *filter;
        int err;

        /*
         * An skb taken from the pfmemalloc reserves may only be consumed
         * by a SOCK_MEMALLOC socket, since such a socket is helping to
         * free memory.
         */
        if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
                return -ENOMEM;
        }

        err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
        if (!err)
                err = security_sock_rcv_skb(sk, skb);
        if (err)
                return err;

        rcu_read_lock();
        filter = rcu_dereference(sk->sk_filter);
        if (filter) {
                struct sock *orig_sk = skb->sk;
                unsigned int keep_len;

                /* Point skb->sk at @sk only for the duration of the run. */
                skb->sk = sk;
                keep_len = bpf_prog_run_save_cb(filter->prog, skb);
                skb->sk = orig_sk;

                if (keep_len)
                        err = pskb_trim(skb, max(cap, keep_len));
                else
                        err = -EPERM;
        }
        rcu_read_unlock();

        return err;
}
EXPORT_SYMBOL(sk_filter_trim_cap);

/* Helper emitted for the cBPF SKF_AD_PAY_OFFSET extension: hands back
 * whatever skb_get_poff() reports for @skb.
 */
BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
{
        return skb_get_poff(skb);
}

/* Helper emitted for the cBPF SKF_AD_NLATTR extension.
 *
 * Searches the netlink attribute stream starting at byte offset @a of
 * the skb data for an attribute of type @x.
 *
 * Returns the offset of the matching attribute relative to skb->data,
 * or 0 if the skb is non-linear, @a leaves no room for an attribute
 * header, or no such attribute is found.
 */
BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
{
        struct nlattr *found;

        /* The length test must precede the subtraction so it cannot wrap. */
        if (skb_is_nonlinear(skb) ||
            skb->len < sizeof(struct nlattr) ||
            a > skb->len - sizeof(struct nlattr))
                return 0;

        found = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
        if (!found)
                return 0;

        return (void *) found - (void *) skb->data;
}

/* Helper emitted for the cBPF SKF_AD_NLATTR_NEST extension.
 *
 * Treats the attribute at byte offset @a of the skb data as a container
 * and searches the attributes nested inside it for one of type @x.
 *
 * Returns the offset of the nested attribute relative to skb->data, or
 * 0 if the skb is non-linear, @a leaves no room for an attribute
 * header, the container fails nla_ok(), or nothing matches.
 */
BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
{
        struct nlattr *outer, *inner;

        /* The length test must precede the subtraction so it cannot wrap. */
        if (skb_is_nonlinear(skb) ||
            skb->len < sizeof(struct nlattr) ||
            a > skb->len - sizeof(struct nlattr))
                return 0;

        outer = (struct nlattr *) &skb->data[a];
        if (!nla_ok(outer, skb->len - a))
                return 0;

        inner = nla_find_nested(outer, x);
        if (!inner)
                return 0;

        return (void *) inner - (void *) skb->data;
}

/* Load one byte of packet data for a converted cBPF program.
 *
 * @skb: packet being filtered
 * @data: start of the linear packet data
 * @headlen: number of bytes available at @data
 * @offset: byte offset to read; negative values are resolved through
 *          bpf_internal_load_pointer_neg_helper()
 *
 * Returns the byte, or -EFAULT if it cannot be read.
 */
BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
           data, int, headlen, int, offset)
{
        u8 byte, *neg_ptr;
        const int len = sizeof(byte);

        if (offset < 0) {
                neg_ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
                if (likely(neg_ptr))
                        return *(u8 *)neg_ptr;
                return -EFAULT;
        }

        /* Fully inside the linear area: read it in place. */
        if (headlen - offset >= len)
                return *(u8 *)(data + offset);

        /* Otherwise let skb_copy_bits() gather it into a local copy. */
        if (!skb_copy_bits(skb, offset, &byte, sizeof(byte)))
                return byte;

        return -EFAULT;
}

/* Two-argument variant of bpf_skb_load_helper_8: instead of taking the
 * data pointer and head length from the caller, it derives them from
 * @skb itself (skb->data and skb->len - skb->data_len).
 */
BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
           int, offset)
{
        return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len,
                                         offset);
}

/* Load a big-endian 16-bit value of packet data for a converted cBPF
 * program and return it in CPU byte order.
 *
 * @skb: packet being filtered
 * @data: start of the linear packet data
 * @headlen: number of bytes available at @data
 * @offset: byte offset to read; negative values are resolved through
 *          bpf_internal_load_pointer_neg_helper()
 *
 * Returns the value, or -EFAULT if it cannot be read.
 */
BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
           data, int, headlen, int, offset)
{
        __be16 raw, *neg_ptr;
        const int len = sizeof(raw);

        if (offset < 0) {
                neg_ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
                if (likely(neg_ptr))
                        return get_unaligned_be16(neg_ptr);
                return -EFAULT;
        }

        /* Fully inside the linear area: read it in place. */
        if (headlen - offset >= len)
                return get_unaligned_be16(data + offset);

        /* Otherwise let skb_copy_bits() gather it into a local copy. */
        if (!skb_copy_bits(skb, offset, &raw, sizeof(raw)))
                return be16_to_cpu(raw);

        return -EFAULT;
}

/* Two-argument variant of bpf_skb_load_helper_16: instead of taking the
 * data pointer and head length from the caller, it derives them from
 * @skb itself (skb->data and skb->len - skb->data_len).
 */
BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
           int, offset)
{
        return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len,
                                          offset);
}

/* Load a big-endian 32-bit value of packet data for a converted cBPF
 * program and return it in CPU byte order.
 *
 * @skb: packet being filtered
 * @data: start of the linear packet data
 * @headlen: number of bytes available at @data
 * @offset: byte offset to read; negative values are resolved through
 *          bpf_internal_load_pointer_neg_helper()
 *
 * Returns the value, or -EFAULT if it cannot be read.
 */
BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
           data, int, headlen, int, offset)
{
        __be32 raw, *neg_ptr;
        const int len = sizeof(raw);

        if (unlikely(offset < 0)) {
                neg_ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
                if (likely(neg_ptr))
                        return get_unaligned_be32(neg_ptr);
                return -EFAULT;
        }

        /* Fully inside the linear area: read it in place. */
        if (headlen - offset >= len)
                return get_unaligned_be32(data + offset);

        /* Otherwise let skb_copy_bits() gather it into a local copy. */
        if (!skb_copy_bits(skb, offset, &raw, sizeof(raw)))
                return be32_to_cpu(raw);

        return -EFAULT;
}

/* Two-argument variant of bpf_skb_load_helper_32: instead of taking the
 * data pointer and head length from the caller, it derives them from
 * @skb itself (skb->data and skb->len - skb->data_len).
 */
BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
           int, offset)
{
        return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len,
                                          offset);
}

/* Emit the eBPF instructions that read one cBPF ancillary sk_buff field.
 *
 * @skb_field: SKF_AD_* selector; a value not handled below emits nothing
 * @dst_reg: register that receives the field value
 * @src_reg: register used as base address for the struct sk_buff loads
 * @insn_buf: output buffer; the caller must provide room for the whole
 *            sequence (up to three instructions are written below)
 *
 * Returns the number of instructions written to @insn_buf.
 */
static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
                              struct bpf_insn *insn_buf)
{
        struct bpf_insn *insn = insn_buf;

        switch (skb_field) {
        case SKF_AD_MARK:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4);

                *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
                                      offsetof(struct sk_buff, mark));
                break;

        case SKF_AD_PKTTYPE:
                /* Load the byte at PKT_TYPE_OFFSET and mask it with
                 * PKT_TYPE_MAX; with big-endian bitfields the result is
                 * additionally shifted right by 5.
                 */
                *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET);
                *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
                *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
#endif
                break;

        case SKF_AD_QUEUE:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2);

                *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
                                      offsetof(struct sk_buff, queue_mapping));
                break;

        case SKF_AD_VLAN_TAG:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2);

                /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
                *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
                                      offsetof(struct sk_buff, vlan_tci));
                break;
        case SKF_AD_VLAN_TAG_PRESENT:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4);
                /* dst_reg = vlan_all; if it is 0 skip the next insn and
                 * keep the 0, otherwise normalise the result to 1.
                 */
                *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
                                      offsetof(struct sk_buff, vlan_all));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
                *insn++ = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1);
                break;
        }

        return insn - insn_buf;
}

/* Translate a cBPF load whose K selects an SKF_AD_OFF ancillary
 * extension into eBPF instructions.
 *
 * @fp: classic instruction; only fp->k is examined here
 * @insnp: in: first free slot of the output buffer; out (on success):
 *         the LAST instruction written, not one past it, so the caller
 *         is expected to advance once more itself
 *
 * Returns true if fp->k named a known extension and code was emitted,
 * false otherwise; in the latter case *insnp is left untouched.
 */
static bool convert_bpf_extensions(struct sock_filter *fp,
                                   struct bpf_insn **insnp)
{
        struct bpf_insn *insn = *insnp;
        u32 cnt;

        switch (fp->k) {
        case SKF_AD_OFF + SKF_AD_PROTOCOL:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2);

                /* A = *(u16 *) (CTX + offsetof(protocol)) */
                *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
                                      offsetof(struct sk_buff, protocol));
                /* A = ntohs(A) [emitting a nop or swap16] */
                *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
                break;

        case SKF_AD_OFF + SKF_AD_PKTTYPE:
                cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
                /* Leave insn on the last instruction that was emitted. */
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_IFINDEX:
        case SKF_AD_OFF + SKF_AD_HATYPE:
                BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4);
                BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2);

                /* tmp = skb->dev */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
                                      BPF_REG_TMP, BPF_REG_CTX,
                                      offsetof(struct sk_buff, dev));
                /* if (tmp != 0) goto pc + 1 */
                *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
                /* skb->dev is NULL: leave the program right here */
                *insn++ = BPF_EXIT_INSN();
                if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
                        *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
                                            offsetof(struct net_device, ifindex));
                else
                        *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
                                            offsetof(struct net_device, type));
                break;

        case SKF_AD_OFF + SKF_AD_MARK:
                cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_RXHASH:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4);

                /* A = *(u32 *) (CTX + offsetof(hash)) */
                *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
                                    offsetof(struct sk_buff, hash));
                break;

        case SKF_AD_OFF + SKF_AD_QUEUE:
                cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_VLAN_TAG:
                cnt = convert_skb_access(SKF_AD_VLAN_TAG,
                                         BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
                cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
                                         BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_VLAN_TPID:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2);

                /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
                *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
                                      offsetof(struct sk_buff, vlan_proto));
                /* A = ntohs(A) [emitting a nop or swap16] */
                *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
                break;

        /* Extensions implemented as helper calls: all of them share the
         * same three argument moves, only the call target differs.
         */
        case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
        case SKF_AD_OFF + SKF_AD_NLATTR:
        case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
        case SKF_AD_OFF + SKF_AD_CPU:
        case SKF_AD_OFF + SKF_AD_RANDOM:
                /* arg1 = CTX */
                *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
                /* arg2 = A */
                *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
                /* arg3 = X */
                *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
                /* Emit call(arg1=CTX, arg2=A, arg3=X) */
                switch (fp->k) {
                case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
                        *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
                        break;
                case SKF_AD_OFF + SKF_AD_NLATTR:
                        *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
                        break;
                case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
                        *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
                        break;
                case SKF_AD_OFF + SKF_AD_CPU:
                        *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
                        break;
                case SKF_AD_OFF + SKF_AD_RANDOM:
                        *insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
                        /* Runs at conversion time, not when the filter runs. */
                        bpf_user_rnd_init_once();
                        break;
                }
                break;

        case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
                /* A ^= X */
                *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
                break;

        default:
                /* This is just a dummy call to avoid letting the compiler
                 * evict __bpf_call_base() as an optimization. Placed here
                 * where no-one bothers.
                 */
                BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
                return false;
        }

        *insnp = insn;
        return true;
}

/* Translate a cBPF packet load (LD|ABS or LD|IND, size B/H/W) into eBPF.
 *
 * @fp: classic instruction; fp->code selects size and mode, fp->k is
 *      the offset
 * @insnp: in: first free slot of the output buffer; out (on success):
 *         the LAST instruction written, not one past it
 *
 * The emitted code relies on BPF_REG_D and BPF_REG_H as base pointer
 * and available head length for direct loads. Two code paths can be
 * generated:
 *
 *  - an optional inline fast path (non-indirect loads only) that reads
 *    straight from BPF_REG_D + offset when at least @size bytes remain
 *    in the head, and
 *  - the always present slow path, which calls the matching
 *    bpf_skb_load_helper_{8,16,32}() and makes the program return 0 if
 *    the helper result is negative.
 *
 * Returns false, without updating *insnp, if the load size is not
 * B, H or W; true otherwise.
 */
static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
{
        const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
        int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
        bool endian = BPF_SIZE(fp->code) == BPF_H ||
                      BPF_SIZE(fp->code) == BPF_W;
        bool indirect = BPF_MODE(fp->code) == BPF_IND;
        const int ip_align = NET_IP_ALIGN;
        struct bpf_insn *insn = *insnp;
        int offset = fp->k;

        /* NOTE(review): '%' binds tighter than '+', so the last test below
         * is evaluated as offset + (ip_align % size) == 0. If the intent
         * was (offset + ip_align) % size == 0, the fast path is taken far
         * less often than intended when unaligned access is not efficient.
         * The current form is the conservative one (it falls back to the
         * helper call); confirm the alignment guarantees of the data
         * pointer before changing it.
         */
        if (!indirect &&
            ((unaligned_ok && offset >= 0) ||
             (!unaligned_ok && offset >= 0 &&
              offset + ip_align >= 0 &&
              offset + ip_align % size == 0))) {
                /* Can the offset be encoded directly in the load insn? */
                bool ldx_off_ok = offset <= S16_MAX;

                /* tmp = headlen - offset */
                *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
                if (offset)
                        *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
                /* if (tmp < size) skip the rest of the fast path; the jump
                 * distance is the load (1 or 3 insns), the optional endian
                 * conversion and the trailing BPF_JMP_A.
                 */
                *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
                                      size, 2 + endian + (!ldx_off_ok * 2));
                if (ldx_off_ok) {
                        *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
                                              BPF_REG_D, offset);
                } else {
                        *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
                        *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
                                              BPF_REG_TMP, 0);
                }
                if (endian)
                        *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
                /* Jump over the 8 slow-path insns emitted below for the
                 * non-indirect case (4 argument moves, call, result check,
                 * A = 0, exit).
                 */
                *insn++ = BPF_JMP_A(8);
        }

        /* Slow path: helper(CTX, D, H, offset) */
        *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
        *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
        *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
        if (!indirect) {
                *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
        } else {
                /* Indirect load: the effective offset is X + K. */
                *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
                if (fp->k)
                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
        }

        switch (BPF_SIZE(fp->code)) {
        case BPF_B:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
                break;
        case BPF_H:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
                break;
        case BPF_W:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
                break;
        default:
                return false;
        }

        /* if (A >= 0) skip the next two insns; else A = 0 and exit. */
        *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
        *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
        *insn   = BPF_EXIT_INSN();

        *insnp = insn;
        return true;
}

/**
 *        bpf_convert_filter - convert filter program
 *        @prog: the user passed filter program
 *        @len: the length of the user passed filter program
 *        @new_prog: allocated 'struct bpf_prog' or NULL
 *        @new_len: pointer to store length of converted program
 *        @seen_ld_abs: bool whether we've seen ld_abs/ind
 *
 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
 * style extended BPF (eBPF).
 * Conversion workflow:
 *
 * 1) First pass for calculating the new program length:
 *   bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
 *
 * 2) 2nd pass to remap in two passes: 1st pass finds new
 *    jump offsets, 2nd pass remapping:
 *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
 *
 * With @new_prog == NULL nothing is written except *@new_len and
 * *@seen_ld_abs. With @new_prog set, *@new_len is only read: it must
 * already hold the length computed by the first call, and *@seen_ld_abs
 * decides whether the packet-access prologue is emitted.
 *
 * Return: 0 on success, -EINVAL if @len is out of range, an instruction
 * is unknown, a jump target or offset is out of range or the remapping
 * does not settle, -ENOMEM if the address table cannot be allocated.
 */
static int bpf_convert_filter(struct sock_filter *prog, int len,
                              struct bpf_prog *new_prog, int *new_len,
                              bool *seen_ld_abs)
{
        int new_flen = 0, pass = 0, target, i, stack_off;
        struct bpf_insn *new_insn, *first_insn = NULL;
        struct sock_filter *fp;
        int *addrs = NULL;
        u8 bpf_src;

        BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
        BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);

        if (len <= 0 || len > BPF_MAXINSNS)
                return -EINVAL;

        if (new_prog) {
                first_insn = new_prog->insnsi;
                /* addrs[i] = index of the first eBPF insn generated for
                 * cBPF insn i; needed to recompute jump offsets.
                 */
                addrs = kcalloc(len, sizeof(*addrs),
                                GFP_KERNEL | __GFP_NOWARN);
                if (!addrs)
                        return -ENOMEM;
        }

do_pass:
        new_insn = first_insn;
        fp = prog;

        /* Classic BPF related prologue emission. */
        if (new_prog) {
                /* Classic BPF expects A and X to be reset first. These need
                 * to be guaranteed to be the first two instructions.
                 */
                *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
                *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);

                /* All programs must keep CTX in callee saved BPF_REG_CTX.
                 * In eBPF case it's done by the compiler, here we need to
                 * do this ourself. Initial CTX is present in BPF_REG_ARG1.
                 */
                *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
                if (*seen_ld_abs) {
                        /* For packet access in classic BPF, cache skb->data
                         * in callee-saved BPF R8 and skb->len - skb->data_len
                         * (headlen) in BPF R9. Since classic BPF is read-only
                         * on CTX, we only need to cache it once.
                         */
                        *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
                                                  BPF_REG_D, BPF_REG_CTX,
                                                  offsetof(struct sk_buff, data));
                        *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
                                                  offsetof(struct sk_buff, len));
                        *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
                                                  offsetof(struct sk_buff, data_len));
                        *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
                }
        } else {
                /* Length-only pass: account for the three unconditional
                 * prologue insns. The four packet-access prologue insns
                 * are added at the end, once *seen_ld_abs is known.
                 */
                new_insn += 3;
        }

        for (i = 0; i < len; fp++, i++) {
                /* Each cBPF insn is expanded into this scratch buffer
                 * first. Every case below leaves 'insn' pointing at the
                 * LAST insn it produced; the common tail after the switch
                 * steps past it.
                 */
                struct bpf_insn tmp_insns[32] = { };
                struct bpf_insn *insn = tmp_insns;

                if (addrs)
                        addrs[i] = new_insn - first_insn;

                switch (fp->code) {
                /* All arithmetic insns and skb loads map as-is. */
                case BPF_ALU | BPF_ADD | BPF_X:
                case BPF_ALU | BPF_ADD | BPF_K:
                case BPF_ALU | BPF_SUB | BPF_X:
                case BPF_ALU | BPF_SUB | BPF_K:
                case BPF_ALU | BPF_AND | BPF_X:
                case BPF_ALU | BPF_AND | BPF_K:
                case BPF_ALU | BPF_OR | BPF_X:
                case BPF_ALU | BPF_OR | BPF_K:
                case BPF_ALU | BPF_LSH | BPF_X:
                case BPF_ALU | BPF_LSH | BPF_K:
                case BPF_ALU | BPF_RSH | BPF_X:
                case BPF_ALU | BPF_RSH | BPF_K:
                case BPF_ALU | BPF_XOR | BPF_X:
                case BPF_ALU | BPF_XOR | BPF_K:
                case BPF_ALU | BPF_MUL | BPF_X:
                case BPF_ALU | BPF_MUL | BPF_K:
                case BPF_ALU | BPF_DIV | BPF_X:
                case BPF_ALU | BPF_DIV | BPF_K:
                case BPF_ALU | BPF_MOD | BPF_X:
                case BPF_ALU | BPF_MOD | BPF_K:
                case BPF_ALU | BPF_NEG:
                case BPF_LD | BPF_ABS | BPF_W:
                case BPF_LD | BPF_ABS | BPF_H:
                case BPF_LD | BPF_ABS | BPF_B:
                case BPF_LD | BPF_IND | BPF_W:
                case BPF_LD | BPF_IND | BPF_H:
                case BPF_LD | BPF_IND | BPF_B:
                        /* Check for overloaded BPF extension and
                         * directly convert it if found, otherwise
                         * just move on with mapping.
                         */
                        if (BPF_CLASS(fp->code) == BPF_LD &&
                            BPF_MODE(fp->code) == BPF_ABS &&
                            convert_bpf_extensions(fp, &insn))
                                break;
                        if (BPF_CLASS(fp->code) == BPF_LD &&
                            convert_bpf_ld_abs(fp, &insn)) {
                                *seen_ld_abs = true;
                                break;
                        }

                        if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
                            fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
                                *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);
                                /* Error with exception code on div/mod by 0.
                                 * For cBPF programs, this was always return 0.
                                 */
                                *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2);
                                *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
                                *insn++ = BPF_EXIT_INSN();
                        }

                        *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
                        break;

                /* Jump transformation cannot use BPF block macros
                 * everywhere as offset calculation and target updates
                 * require a bit more work than the rest, i.e. jump
                 * opcodes map as-is, but offsets need adjustment.
                 */

                /* BPF_EMIT_JMP fills in insn->off for a jump to cBPF insn
                 * 'target'. It reads target, len, addrs, i, insn and
                 * tmp_insns from the enclosing scope and bails out via
                 * 'goto err' on a bad target or an offset that does not
                 * fit. Without addrs (length-only pass) the offset is just
                 * a placeholder.
                 */
#define BPF_EMIT_JMP                                                        \
        do {                                                                \
                const s32 off_min = S16_MIN, off_max = S16_MAX;                \
                s32 off;                                                \
                                                                        \
                if (target >= len || target < 0)                        \
                        goto err;                                        \
                off = addrs ? addrs[target] - addrs[i] - 1 : 0;                \
                /* Adjust pc relative offset for 2nd or 3rd insn. */        \
                off -= insn - tmp_insns;                                \
                /* Reject anything not fitting into insn->off. */        \
                if (off < off_min || off > off_max)                        \
                        goto err;                                        \
                insn->off = off;                                        \
        } while (0)

                case BPF_JMP | BPF_JA:
                        target = i + fp->k + 1;
                        insn->code = fp->code;
                        BPF_EMIT_JMP;
                        break;

                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                        if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
                                /* BPF immediates are signed, zero extend
                                 * immediate into tmp register and use it
                                 * in compare insn.
                                 */
                                *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);

                                insn->dst_reg = BPF_REG_A;
                                insn->src_reg = BPF_REG_TMP;
                                bpf_src = BPF_X;
                        } else {
                                insn->dst_reg = BPF_REG_A;
                                insn->imm = fp->k;
                                bpf_src = BPF_SRC(fp->code);
                                insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
                        }

                        /* Common case where 'jump_false' is next insn. */
                        if (fp->jf == 0) {
                                insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
                                target = i + fp->jt + 1;
                                BPF_EMIT_JMP;
                                break;
                        }

                        /* Convert some jumps when 'jump_true' is next insn. */
                        if (fp->jt == 0) {
                                /* Invert the condition so that a single
                                 * jump to the 'false' target suffices;
                                 * JSET has no inverse here and takes the
                                 * two-insn route below.
                                 */
                                switch (BPF_OP(fp->code)) {
                                case BPF_JEQ:
                                        insn->code = BPF_JMP | BPF_JNE | bpf_src;
                                        break;
                                case BPF_JGT:
                                        insn->code = BPF_JMP | BPF_JLE | bpf_src;
                                        break;
                                case BPF_JGE:
                                        insn->code = BPF_JMP | BPF_JLT | bpf_src;
                                        break;
                                default:
                                        goto jmp_rest;
                                }

                                target = i + fp->jf + 1;
                                BPF_EMIT_JMP;
                                break;
                        }
jmp_rest:
                        /* Other jumps are mapped into two insns: Jxx and JA. */
                        target = i + fp->jt + 1;
                        insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
                        BPF_EMIT_JMP;
                        insn++;

                        insn->code = BPF_JMP | BPF_JA;
                        target = i + fp->jf + 1;
                        BPF_EMIT_JMP;
                        break;

                /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
                case BPF_LDX | BPF_MSH | BPF_B: {
                        struct sock_filter tmp = {
                                .code        = BPF_LD | BPF_ABS | BPF_B,
                                .k        = fp->k,
                        };

                        *seen_ld_abs = true;

                        /* X = A */
                        *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
                        /* A = BPF_R0 = *(u8 *) (skb->data + K) */
                        convert_bpf_ld_abs(&tmp, &insn);
                        /* convert_bpf_ld_abs() left insn on its last insn. */
                        insn++;
                        /* A &= 0xf */
                        *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
                        /* A <<= 2 */
                        *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
                        /* tmp = X */
                        *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);
                        /* X = A */
                        *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
                        /* A = tmp */
                        *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
                        break;
                }
                /* RET_K is remapped into 2 insns. RET_A case doesn't need an
                 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
                 */
                case BPF_RET | BPF_A:
                case BPF_RET | BPF_K:
                        if (BPF_RVAL(fp->code) == BPF_K)
                                *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
                                                        0, fp->k);
                        *insn = BPF_EXIT_INSN();
                        break;

                /* Store to stack. */
                case BPF_ST:
                case BPF_STX:
                        /* Scratch cell K lives at FP - (K * 4 + 4). */
                        stack_off = fp->k * 4  + 4;
                        *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
                                            BPF_ST ? BPF_REG_A : BPF_REG_X,
                                            -stack_off);
                        /* check_load_and_stores() verifies that classic BPF can
                         * load from stack only after write, so tracking
                         * stack_depth for ST|STX insns is enough
                         */
                        if (new_prog && new_prog->aux->stack_depth < stack_off)
                                new_prog->aux->stack_depth = stack_off;
                        break;

                /* Load from stack. */
                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                        stack_off = fp->k * 4  + 4;
                        *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD  ?
                                            BPF_REG_A : BPF_REG_X, BPF_REG_FP,
                                            -stack_off);
                        break;

                /* A = K or X = K */
                case BPF_LD | BPF_IMM:
                case BPF_LDX | BPF_IMM:
                        *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
                                              BPF_REG_A : BPF_REG_X, fp->k);
                        break;

                /* X = A */
                case BPF_MISC | BPF_TAX:
                        *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
                        break;

                /* A = X */
                case BPF_MISC | BPF_TXA:
                        *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
                        break;

                /* A = skb->len or X = skb->len */
                case BPF_LD | BPF_W | BPF_LEN:
                case BPF_LDX | BPF_W | BPF_LEN:
                        *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
                                            BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
                                            offsetof(struct sk_buff, len));
                        break;

                /* Access seccomp_data fields. */
                case BPF_LDX | BPF_ABS | BPF_W:
                        /* A = *(u32 *) (ctx + K) */
                        *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
                        break;

                /* Unknown instruction. */
                default:
                        goto err;
                }

                /* Step past the last emitted insn, then flush the scratch
                 * buffer into the real program (remapping passes only).
                 */
                insn++;
                if (new_prog)
                        memcpy(new_insn, tmp_insns,
                               sizeof(*insn) * (insn - tmp_insns));
                new_insn += insn - tmp_insns;
        }

        if (!new_prog) {
                /* Only calculating new length. */
                *new_len = new_insn - first_insn;
                if (*seen_ld_abs)
                        *new_len += 4; /* Prologue bits. */
                return 0;
        }

        /* Jump offsets are computed from addrs[], which is only complete
         * after a full walk. Repeat until the emitted length matches the
         * previous pass; give up after the third pass.
         */
        pass++;
        if (new_flen != new_insn - first_insn) {
                new_flen = new_insn - first_insn;
                if (pass > 2)
                        goto err;
                goto do_pass;
        }

        kfree(addrs);
        /* The length-only call and the remapping must agree. */
        BUG_ON(*new_len != new_flen);
        return 0;
err:
        kfree(addrs);
        return -EINVAL;
}

/* Security:
 *
 * As we dont want to clear mem[] array for each packet going through
 * __bpf_prog_run(), we check that filter loaded by user never try to read
 * a cell if not previously written, and we check all branches to be sure
 * a malicious user doesn't try to abuse us.
 */
/* Walk the classic filter once, in program order, and make sure that every
 * load from a scratch memory cell (BPF_LD | BPF_MEM, BPF_LDX | BPF_MEM)
 * hits a cell whose bit is set in the current valid-cell bitmap.
 *
 * Returns 0 if all loads are covered, -EINVAL on the first load of a cell
 * whose bit is not set, and -ENOMEM if the per-instruction mask array
 * cannot be allocated.
 */
static int check_load_and_stores(const struct sock_filter *filter, int flen)
{
        u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
        int pc, ret = 0;

        /* The u16 bitmaps used below only have room for 16 cells. */
        BUILD_BUG_ON(BPF_MEMWORDS > 16);

        masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
        if (!masks)
                return -ENOMEM;

        /* Start with every bit set, so the first "&=" from a jump source
         * simply copies that source's valid set into the target slot.
         */
        memset(masks, 0xff, flen * sizeof(*masks));

        for (pc = 0; pc < flen; pc++) {
                /* Keep only the cells that are also valid for every jump
                 * recorded so far as landing on this instruction.
                 */
                memvalid &= masks[pc];

                switch (filter[pc].code) {
                case BPF_ST:
                case BPF_STX:
                        /* A store makes cell k valid from here on. */
                        memvalid |= (1 << filter[pc].k);
                        break;
                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                        /* A load is only legal from a cell marked valid. */
                        if (!(memvalid & (1 << filter[pc].k))) {
                                ret = -EINVAL;
                                goto error;
                        }
                        break;
                case BPF_JMP | BPF_JA:
                        /* A jump must set masks on target */
                        masks[pc + 1 + filter[pc].k] &= memvalid;
                        /* Reset to all-ones; the next iteration narrows it
                         * again through masks[pc + 1].
                         */
                        memvalid = ~0;
                        break;
                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                        /* A jump must set masks on targets */
                        masks[pc + 1 + filter[pc].jt] &= memvalid;
                        masks[pc + 1 + filter[pc].jf] &= memvalid;
                        /* Same reset as for BPF_JA above. */
                        memvalid = ~0;
                        break;
                }
        }
error:
        /* Shared exit: reached both on success (ret == 0) and on failure. */
        kfree(masks);
        return ret;
}

/* Report whether a classic BPF opcode is one of the instructions listed in
 * the table below. Opcodes past the end of the table, and table slots that
 * were never initialised, are reported as not allowed.
 */
static bool chk_code_allowed(u16 code_to_probe)
{
        static const bool allowed[] = {
                /* Return instructions */
                [BPF_RET | BPF_K] = true,
                [BPF_RET | BPF_A] = true,
                /* Jump instructions */
                [BPF_JMP | BPF_JA] = true,
                [BPF_JMP | BPF_JEQ | BPF_K] = true,
                [BPF_JMP | BPF_JEQ | BPF_X] = true,
                [BPF_JMP | BPF_JGE | BPF_K] = true,
                [BPF_JMP | BPF_JGE | BPF_X] = true,
                [BPF_JMP | BPF_JGT | BPF_K] = true,
                [BPF_JMP | BPF_JGT | BPF_X] = true,
                [BPF_JMP | BPF_JSET | BPF_K] = true,
                [BPF_JMP | BPF_JSET | BPF_X] = true,
                /* Misc instructions */
                [BPF_MISC | BPF_TAX] = true,
                [BPF_MISC | BPF_TXA] = true,
                /* Store instructions */
                [BPF_ST] = true,
                [BPF_STX] = true,
                /* Load instructions */
                [BPF_LD | BPF_W | BPF_ABS] = true,
                [BPF_LD | BPF_H | BPF_ABS] = true,
                [BPF_LD | BPF_B | BPF_ABS] = true,
                [BPF_LD | BPF_W | BPF_LEN] = true,
                [BPF_LD | BPF_W | BPF_IND] = true,
                [BPF_LD | BPF_H | BPF_IND] = true,
                [BPF_LD | BPF_B | BPF_IND] = true,
                [BPF_LD | BPF_IMM] = true,
                [BPF_LD | BPF_MEM] = true,
                [BPF_LDX | BPF_W | BPF_LEN] = true,
                [BPF_LDX | BPF_B | BPF_MSH] = true,
                [BPF_LDX | BPF_IMM] = true,
                [BPF_LDX | BPF_MEM] = true,
                /* 32 bit ALU operations */
                [BPF_ALU | BPF_ADD | BPF_K] = true,
                [BPF_ALU | BPF_ADD | BPF_X] = true,
                [BPF_ALU | BPF_SUB | BPF_K] = true,
                [BPF_ALU | BPF_SUB | BPF_X] = true,
                [BPF_ALU | BPF_MUL | BPF_K] = true,
                [BPF_ALU | BPF_MUL | BPF_X] = true,
                [BPF_ALU | BPF_DIV | BPF_K] = true,
                [BPF_ALU | BPF_DIV | BPF_X] = true,
                [BPF_ALU | BPF_MOD | BPF_K] = true,
                [BPF_ALU | BPF_MOD | BPF_X] = true,
                [BPF_ALU | BPF_AND | BPF_K] = true,
                [BPF_ALU | BPF_AND | BPF_X] = true,
                [BPF_ALU | BPF_OR | BPF_K] = true,
                [BPF_ALU | BPF_OR | BPF_X] = true,
                [BPF_ALU | BPF_XOR | BPF_K] = true,
                [BPF_ALU | BPF_XOR | BPF_X] = true,
                [BPF_ALU | BPF_LSH | BPF_K] = true,
                [BPF_ALU | BPF_LSH | BPF_X] = true,
                [BPF_ALU | BPF_RSH | BPF_K] = true,
                [BPF_ALU | BPF_RSH | BPF_X] = true,
                [BPF_ALU | BPF_NEG] = true,
        };

        /* The bounds test must come first so the lookup is never out of
         * range.
         */
        return code_to_probe < ARRAY_SIZE(allowed) &&
               allowed[code_to_probe];
}

/* Cheap pre-flight test on a classic filter: the instruction pointer must
 * be non-NULL and the instruction count must lie in 1..BPF_MAXINSNS.
 */
static bool bpf_check_basics_ok(const struct sock_filter *filter,
                                unsigned int flen)
{
        return filter != NULL && flen != 0 && flen <= BPF_MAXINSNS;
}

/**
 *        bpf_check_classic - verify socket filter code
 *        @filter: filter to verify
 *        @flen: length of filter
 *
 * Check the user's filter code. If we let some ugly
 * filter code slip through kaboom! The filter must contain
 * no references or jumps that are out of range, no illegal
 * instructions, and must end with a RET instruction.
 *
 * All jumps are forward as they are not signed.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not. When the
 * per-instruction checks pass, the result of check_load_and_stores()
 * is returned, so -ENOMEM is possible as well.
 */
static int bpf_check_classic(const struct sock_filter *filter,
                             unsigned int flen)
{
        bool anc_found;
        int pc;

        /* Check the filter code now */
        for (pc = 0; pc < flen; pc++) {
                const struct sock_filter *ftest = &filter[pc];

                /* May we actually operate on this code? */
                if (!chk_code_allowed(ftest->code))
                        return -EINVAL;

                /* Some instructions need special checks */
                switch (ftest->code) {
                case BPF_ALU | BPF_DIV | BPF_K:
                case BPF_ALU | BPF_MOD | BPF_K:
                        /* Check for division by zero */
                        if (ftest->k == 0)
                                return -EINVAL;
                        break;
                case BPF_ALU | BPF_LSH | BPF_K:
                case BPF_ALU | BPF_RSH | BPF_K:
                        /* Constant shift counts of 32 or more are refused */
                        if (ftest->k >= 32)
                                return -EINVAL;
                        break;
                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                case BPF_ST:
                case BPF_STX:
                        /* Check for invalid memory addresses */
                        if (ftest->k >= BPF_MEMWORDS)
                                return -EINVAL;
                        break;
                case BPF_JMP | BPF_JA:
                        /* Note, the large ftest->k might cause loops.
                         * Compare this with conditional jumps below,
                         * where offsets are limited. --ANK (981016)
                         */
                        /* Target pc + 1 + k must stay below flen */
                        if (ftest->k >= (unsigned int)(flen - pc - 1))
                                return -EINVAL;
                        break;
                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                        /* Both conditionals must be safe */
                        if (pc + ftest->jt + 1 >= flen ||
                            pc + ftest->jf + 1 >= flen)
                                return -EINVAL;
                        break;
                case BPF_LD | BPF_W | BPF_ABS:
                case BPF_LD | BPF_H | BPF_ABS:
                case BPF_LD | BPF_B | BPF_ABS:
                        /* Offsets at or above SKF_AD_OFF are only accepted
                         * when bpf_anc_helper() reports BPF_ANC for them.
                         */
                        anc_found = false;
                        if (bpf_anc_helper(ftest) & BPF_ANC)
                                anc_found = true;
                        /* Ancillary operation unknown or unsupported */
                        if (anc_found == false && ftest->k >= SKF_AD_OFF)
                                return -EINVAL;
                }
        }

        /* Last instruction must be a RET code */
        switch (filter[flen - 1].code) {
        case BPF_RET | BPF_K:
        case BPF_RET | BPF_A:
                /* Final stage: scratch memory load/store validation */
                return check_load_and_stores(filter, flen);
        }

        return -EINVAL;
}

/* Keep a private copy of the classic program currently held in fp->insns.
 *
 * A sock_fprog_kern is allocated and hooked up as fp->orig_prog, with its
 * length taken from @fprog and its instruction array duplicated from
 * fp->insns. Returns 0 on success or -ENOMEM if either allocation fails.
 *
 * NOTE: when the instruction copy fails, the container is freed but
 * fp->orig_prog is not reset.
 */
static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
                                      const struct sock_fprog *fprog)
{
        unsigned int nbytes = bpf_classic_proglen(fprog);
        struct sock_fprog_kern *saved;

        saved = kmalloc(sizeof(*saved), GFP_KERNEL);
        fp->orig_prog = saved;
        if (!saved)
                return -ENOMEM;

        saved->len = fprog->len;
        saved->filter = kmemdup(fp->insns, nbytes,
                                GFP_KERNEL | __GFP_NOWARN);
        if (saved->filter)
                return 0;

        kfree(saved);
        return -ENOMEM;
}

/* Free the saved classic program attached to @fp, if there is one: first
 * its instruction array, then the container itself.
 */
static void bpf_release_orig_filter(struct bpf_prog *fp)
{
        struct sock_fprog_kern *saved = fp->orig_prog;

        if (!saved)
                return;

        kfree(saved->filter);
        kfree(saved);
}

/* Drop @prog. Programs of type BPF_PROG_TYPE_SOCKET_FILTER are released
 * with bpf_prog_put(); every other type has its saved original filter
 * released and is then freed with bpf_prog_free().
 */
static void __bpf_prog_release(struct bpf_prog *prog)
{
        if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
                bpf_release_orig_filter(prog);
                bpf_prog_free(prog);
                return;
        }

        bpf_prog_put(prog);
}

/* Release the program owned by @fp, then free the sk_filter itself. */
static void __sk_filter_release(struct sk_filter *fp)
{
        struct bpf_prog *prog = fp->prog;

        __bpf_prog_release(prog);
        kfree(fp);
}

/**
 *        sk_filter_release_rcu - RCU callback that frees a socket filter
 *        @rcu: rcu_head embedded in the sk_filter to free
 *
 *        Recovers the enclosing sk_filter from @rcu and releases it.
 */
static void sk_filter_release_rcu(struct rcu_head *rcu)
{
        __sk_filter_release(container_of(rcu, struct sk_filter, rcu));
}

/**
 *        sk_filter_release - drop one reference on a socket filter
 *        @fp: filter whose reference is dropped
 *
 *        When the last reference goes away, the filter is handed to
 *        call_rcu() so that sk_filter_release_rcu() frees it later.
 */
static void sk_filter_release(struct sk_filter *fp)
{
        if (!refcount_dec_and_test(&fp->refcnt))
                return;

        call_rcu(&fp->rcu, sk_filter_release_rcu);
}

/* Give back the memory accounted to @sk for @fp's program (subtracting
 * bpf_prog_size() of its length from sk_omem_alloc) and drop one reference
 * on the filter.
 */
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
{
        u32 charged = bpf_prog_size(fp->prog->len);

        atomic_sub(charged, &sk->sk_omem_alloc);

        sk_filter_release(fp);
}

/* Try to account the size of @fp's program against @sk's option memory.
 *
 * The charge is refused when the program alone exceeds the per-netns
 * sysctl_optmem_max, or when adding it to the current sk_omem_alloc would
 * reach that limit. Returns true if the charge was applied.
 */
static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
        int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
        u32 filter_size = bpf_prog_size(fp->prog->len);

        /* same check as in sock_kmalloc() */
        if (filter_size > optmem_max)
                return false;

        if (atomic_read(&sk->sk_omem_alloc) + filter_size >= optmem_max)
                return false;

        atomic_add(filter_size, &sk->sk_omem_alloc);
        return true;
}

/* Take a reference on @fp and charge its program to @sk.
 *
 * Returns false if the filter's refcount was already zero, or if the charge
 * was refused (in which case the reference just taken is dropped again).
 */
bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
        bool charged;

        if (!refcount_inc_not_zero(&fp->refcnt))
                return false;

        charged = __sk_filter_charge(sk, fp);
        if (!charged)
                sk_filter_release(fp);

        return charged;
}

/* Convert the classic program stored in fp->insns to the bpf_insn
 * representation and select a runtime for it.
 *
 * The conversion is done with two bpf_convert_filter() calls on a
 * temporary copy of the classic instructions: the first only computes the
 * new length, the second writes into the (reallocated) program.
 *
 * Returns the resulting program on success. On failure the program is
 * released with __bpf_prog_release() and an ERR_PTR() is returned.
 */
static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
{
        struct sock_filter *old_prog;
        struct bpf_prog *old_fp;
        int err, new_len, old_len = fp->len;
        bool seen_ld_abs = false;

        /* We are free to overwrite insns et al right here as it won't be used at
         * this point in time anymore internally after the migration to the eBPF
         * instruction representation.
         */
        BUILD_BUG_ON(sizeof(struct sock_filter) !=
                     sizeof(struct bpf_insn));

        /* Conversion cannot happen on overlapping memory areas,
         * so we need to keep the user BPF around until the 2nd
         * pass. At this time, the user BPF is stored in fp->insns.
         */
        old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
                           GFP_KERNEL | __GFP_NOWARN);
        if (!old_prog) {
                err = -ENOMEM;
                /* Nothing to free yet besides fp itself. */
                goto out_err;
        }

        /* 1st pass: calculate the new program length. */
        err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
                                 &seen_ld_abs);
        if (err)
                goto out_err_free;

        /* Expand fp for appending the new filter representation. */
        old_fp = fp;
        fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
        if (!fp) {
                /* The old_fp is still around in case we couldn't
                 * allocate new memory, so uncharge on that one.
                 */
                fp = old_fp;
                err = -ENOMEM;
                goto out_err_free;
        }

        fp->len = new_len;

        /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
        err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
                                 &seen_ld_abs);
        if (err)
                /* 2nd bpf_convert_filter() can fail only if it fails
                 * to allocate memory, remapping must succeed. Note,
                 * that at this time old_fp has already been released
                 * by krealloc().
                 */
                goto out_err_free;

        /* fp is reassigned from the return value; err reports failure. */
        fp = bpf_prog_select_runtime(fp, &err);
        if (err)
                goto out_err_free;

        /* Success: the temporary classic copy is no longer needed. */
        kfree(old_prog);
        return fp;

out_err_free:
        kfree(old_prog);
out_err:
        __bpf_prog_release(fp);
        return ERR_PTR(err);
}

/* Validate a freshly loaded classic program and make it runnable.
 *
 * The program is checked with bpf_check_classic(), optionally passed
 * through the caller supplied @trans hook, offered to the JIT and, if the
 * JIT did not take it, migrated with bpf_migrate_filter().
 *
 * On a check or @trans failure the program is released here and an
 * ERR_PTR() is returned, so callers must not free @fp themselves.
 */
static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
                                           bpf_aux_classic_check_t trans)
{
        int err;

        fp->bpf_func = NULL;
        fp->jited = 0;

        err = bpf_check_classic(fp->insns, fp->len);
        if (err)
                goto release;

        /* There might be additional checks and transformations
         * needed on classic filters, f.e. in case of seccomp.
         */
        if (trans) {
                err = trans(fp->insns, fp->len);
                if (err)
                        goto release;
        }

        /* Probe if we can JIT compile the filter and if so, do
         * the compilation of the filter.
         */
        bpf_jit_compile(fp);
        if (fp->jited)
                return fp;

        /* JIT compiler couldn't process this filter, so do the eBPF
         * translation for the optimized interpreter.
         */
        return bpf_migrate_filter(fp);

release:
        __bpf_prog_release(fp);
        return ERR_PTR(err);
}

/**
 *        bpf_prog_create - create an unattached filter
 *        @pfp: where the newly created filter is stored on success
 *        @fprog: the classic filter program (kernel memory)
 *
 * Build a filter that is not bound to any socket. The program is sanity
 * checked before use. Returns zero on success; -EINVAL for a missing or
 * badly sized program, -ENOMEM if allocation fails, or the error reported
 * by bpf_prepare_filter().
 */
int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
{
        unsigned int insn_bytes = bpf_classic_proglen(fprog);
        struct bpf_prog *prog;

        /* Make sure new filter is there and in the right amounts. */
        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
                return -EINVAL;

        prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
        if (!prog)
                return -ENOMEM;

        memcpy(prog->insns, fprog->filter, insn_bytes);
        prog->len = fprog->len;

        /* Unattached filters are never copied back to user space
         * through sk_get_filter(), so no original copy is kept.
         */
        prog->orig_prog = NULL;

        /* On failure bpf_prepare_filter() has already released the
         * program; only the error code is left to report.
         */
        prog = bpf_prepare_filter(prog, NULL);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        *pfp = prog;
        return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create);

/**
 *        bpf_prog_create_from_user - create an unattached filter from user buffer
 *        @pfp: where the newly created filter is stored on success
 *        @fprog: the filter program; its instruction array is in user memory
 *        @trans: post-classic verifier transformation handler
 *        @save_orig: save classic BPF program
 *
 * Same job as bpf_prog_create(), except that the instructions are fetched
 * with copy_from_user() and a bpf_aux_classic_check_t handler may be
 * supplied. Returns zero on success or a negative errno.
 */
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
                              bpf_aux_classic_check_t trans, bool save_orig)
{
        unsigned int insn_bytes = bpf_classic_proglen(fprog);
        struct bpf_prog *prog;

        /* Make sure new filter is there and in the right amounts. */
        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
                return -EINVAL;

        prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
        if (!prog)
                return -ENOMEM;

        if (copy_from_user(prog->insns, fprog->filter, insn_bytes)) {
                __bpf_prog_free(prog);
                return -EFAULT;
        }

        prog->len = fprog->len;
        prog->orig_prog = NULL;

        /* Any failure to save the original program is reported as -ENOMEM. */
        if (save_orig && bpf_prog_store_orig_filter(prog, fprog)) {
                __bpf_prog_free(prog);
                return -ENOMEM;
        }

        /* On failure bpf_prepare_filter() has already released the
         * program; only the error code is left to report.
         */
        prog = bpf_prepare_filter(prog, trans);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        *pfp = prog;
        return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);

/* Exported counterpart to bpf_prog_create(): releases @fp through
 * __bpf_prog_release().
 */
void bpf_prog_destroy(struct bpf_prog *fp)
{
        __bpf_prog_release(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_destroy);

/* Wrap @prog in a new sk_filter and install it as @sk's filter.
 *
 * Returns 0 on success, or -ENOMEM if the sk_filter cannot be allocated or
 * the program cannot be charged to the socket. On failure @prog itself is
 * not released here; the callers in this file do that.
 */
static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
        struct sk_filter *fp, *old_fp;

        fp = kmalloc(sizeof(*fp), GFP_KERNEL);
        if (!fp)
                return -ENOMEM;

        fp->prog = prog;

        /* Account the program against the socket before publishing it. */
        if (!__sk_filter_charge(sk, fp)) {
                kfree(fp);
                return -ENOMEM;
        }
        refcount_set(&fp->refcnt, 1);

        /* Swap the filter pointer; the lockdep expression documents that
         * the socket lock is expected to be held here.
         */
        old_fp = rcu_dereference_protected(sk->sk_filter,
                                           lockdep_sock_is_held(sk));
        rcu_assign_pointer(sk->sk_filter, fp);

        /* Uncharge and drop the reference on the filter that was replaced. */
        if (old_fp)
                sk_filter_uncharge(sk, old_fp);

        return 0;
}

/* Build a bpf_prog from a user supplied classic filter on behalf of @sk.
 *
 * Fails with -EPERM if the socket has SOCK_FILTER_LOCKED set, -EINVAL for a
 * missing or badly sized program, -ENOMEM on allocation failure and -EFAULT
 * if the instructions cannot be copied from user space. Otherwise the
 * result of bpf_prepare_filter() is returned (a program or an ERR_PTR()).
 */
static
struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
        unsigned int insn_bytes = bpf_classic_proglen(fprog);
        struct bpf_prog *prog;

        if (sock_flag(sk, SOCK_FILTER_LOCKED))
                return ERR_PTR(-EPERM);

        /* Make sure new filter is there and in the right amounts. */
        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
                return ERR_PTR(-EINVAL);

        prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
        if (!prog)
                return ERR_PTR(-ENOMEM);

        if (copy_from_user(prog->insns, fprog->filter, insn_bytes)) {
                __bpf_prog_free(prog);
                return ERR_PTR(-EFAULT);
        }

        prog->len = fprog->len;

        /* Any failure to save the original program is reported as -ENOMEM. */
        if (bpf_prog_store_orig_filter(prog, fprog)) {
                __bpf_prog_free(prog);
                return ERR_PTR(-ENOMEM);
        }

        /* bpf_prepare_filter() releases the program itself on failure. */
        return bpf_prepare_filter(prog, NULL);
}

/**
 *        sk_attach_filter - attach a socket filter
 *        @fprog: the filter program
 *        @sk: the socket to use
 *
 * Build a program from the user's classic filter, which includes the
 * sanity checks, and install it on @sk. Returns zero on success. If the
 * filter is rejected or memory is short, a negative errno is returned and
 * nothing is attached.
 */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
        struct bpf_prog *prog;
        int ret;

        prog = __get_filter(fprog, sk);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        ret = __sk_attach_prog(prog, sk);
        if (ret >= 0)
                return 0;

        /* Attaching failed, so the program is still ours to release. */
        __bpf_prog_release(prog);
        return ret;
}
EXPORT_SYMBOL_GPL(sk_attach_filter);

/* Build a program from a user supplied classic filter and attach it to @sk
 * through reuseport_attach_prog().
 *
 * Programs larger than the per-netns sysctl_optmem_max are refused with
 * -ENOMEM. On any error after the program was built, it is released here.
 */
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
        struct bpf_prog *prog = __get_filter(fprog, sk);
        int optmem_max, err;

        if (IS_ERR(prog))
                return PTR_ERR(prog);

        optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);

        err = bpf_prog_size(prog->len) > optmem_max ?
                        -ENOMEM : reuseport_attach_prog(sk, prog);
        if (err)
                __bpf_prog_release(prog);

        return err;
}

/* Look up the BPF_PROG_TYPE_SOCKET_FILTER program behind descriptor @ufd.
 * Returns ERR_PTR(-EPERM) without looking anything up if @sk has
 * SOCK_FILTER_LOCKED set.
 */
static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
{
        if (!sock_flag(sk, SOCK_FILTER_LOCKED))
                return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);

        return ERR_PTR(-EPERM);
}

/* Attach the socket-filter program referenced by descriptor @ufd to @sk.
 * Returns zero on success or a negative errno; if attaching fails, the
 * reference obtained on the program is dropped again.
 */
int sk_attach_bpf(u32 ufd, struct sock *sk)
{
        struct bpf_prog *prog;
        int ret;

        prog = __get_bpf(ufd, sk);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        ret = __sk_attach_prog(prog, sk);
        if (ret >= 0)
                return 0;

        bpf_prog_put(prog);
        return ret;
}

/* Attach the program referenced by descriptor @ufd to @sk through
 * reuseport_attach_prog().
 *
 * The descriptor is first looked up as BPF_PROG_TYPE_SOCKET_FILTER and, if
 * that lookup yields -EINVAL, retried as BPF_PROG_TYPE_SK_REUSEPORT.
 *
 * Returns -EPERM if the socket has SOCK_FILTER_LOCKED set, the lookup
 * error if no program is found, -ENOTSUPP if an SK_REUSEPORT program is
 * used on an unsupported socket, -ENOMEM if a socket-filter program
 * exceeds sysctl_optmem_max, or the result of reuseport_attach_prog().
 */
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
        struct bpf_prog *prog;
        int err, optmem_max;

        if (sock_flag(sk, SOCK_FILTER_LOCKED))
                return -EPERM;

        prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
        if (PTR_ERR(prog) == -EINVAL)
                prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) {
                /* Like other non BPF_PROG_TYPE_SOCKET_FILTER
                 * bpf prog (e.g. sockmap).  It depends on the
                 * limitation imposed by bpf_prog_load().
                 * Hence, sysctl_optmem_max is not checked.
                 */
                /* Only stream/datagram, TCP/UDP, IPv4/IPv6 sockets pass. */
                if ((sk->sk_type != SOCK_STREAM &&
                     sk->sk_type != SOCK_DGRAM) ||
                    (sk->sk_protocol != IPPROTO_UDP &&
                     sk->sk_protocol != IPPROTO_TCP) ||
                    (sk->sk_family != AF_INET &&
                     sk->sk_family != AF_INET6)) {
                        err = -ENOTSUPP;
                        goto err_prog_put;
                }
        } else {
                /* BPF_PROG_TYPE_SOCKET_FILTER */
                optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
                if (bpf_prog_size(prog->len) > optmem_max) {
                        err = -ENOMEM;
                        goto err_prog_put;
                }
        }

        err = reuseport_attach_prog(sk, prog);
        /* The label is also reached by fall-through on success, hence the
         * err test before dropping the program reference.
         */
err_prog_put:
        if (err)
                bpf_prog_put(prog);

        return err;
}

/* Release a reuseport program; a NULL @prog is ignored.
 *
 * BPF_PROG_TYPE_SK_REUSEPORT programs are dropped with bpf_prog_put(), all
 * others go through bpf_prog_destroy().
 */
void sk_reuseport_prog_free(struct bpf_prog *prog)
{
        if (!prog)
                return;

        if (prog->type != BPF_PROG_TYPE_SK_REUSEPORT) {
                bpf_prog_destroy(prog);
                return;
        }

        bpf_prog_put(prog);
}

/* Scratch storage of MAX_BPF_STACK bytes, viewable either as an array of
 * __be32 words (diff) or as raw bytes (buff), together with a local lock.
 * NOTE(review): the users of this buffer are outside this section; confirm
 * what bh_lock protects against before relying on it.
 */
struct bpf_scratchpad {
        union {
                __be32 diff[MAX_BPF_STACK / sizeof(__be32)];
                u8     buff[MAX_BPF_STACK];
        };
        local_lock_t        bh_lock;
};

/* One bpf_scratchpad per CPU, with its local lock statically initialised. */
static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp) = {
        .bh_lock        = INIT_LOCAL_LOCK(bh_lock),
};

/* Ask skb_ensure_writable() to make the first @write_len bytes of @skb
 * writable and return its result. With CONFIG_DEBUG_NET, lengths above
 * INT_MAX are rejected up front with -EINVAL.
 */
static inline int __bpf_try_make_writable(struct sk_buff *skb,
                                          unsigned int write_len)
{
#ifdef CONFIG_DEBUG_NET
        /* Avoid a splat in pskb_may_pull_reason() */
        if (write_len > INT_MAX)
                return -EINVAL;
#endif
        return skb_ensure_writable(skb, write_len);
}

/* Like __bpf_try_make_writable(), but additionally calls
 * bpf_compute_data_pointers() on @skb afterwards, whether or not the
 * attempt succeeded. Returns the result of the writable attempt.
 */
static inline int bpf_try_make_writable(struct sk_buff *skb,
                                        unsigned int write_len)
{
        int err = __bpf_try_make_writable(skb, write_len);

        bpf_compute_data_pointers(skb);
        return err;
}

/* Make skb_headlen(skb) bytes of @skb writable via bpf_try_make_writable(). */
static int bpf_try_make_head_writable(struct sk_buff *skb)
{
        return bpf_try_make_writable(skb, skb_headlen(skb));
}

/* For skbs at tc ingress only: run skb_postpush_rcsum() over the
 * skb->mac_len bytes starting at the MAC header. No-op otherwise.
 */
static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
{
        if (!skb_at_tc_ingress(skb))
                return;

        skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}

/* For skbs at tc ingress only: run skb_postpull_rcsum() over the
 * skb->mac_len bytes starting at the MAC header. No-op otherwise.
 */
static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
{
        if (!skb_at_tc_ingress(skb))
                return;

        skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}

/* BPF helper: copy @len bytes from @from into the packet at @offset.
 *
 * @flags may contain BPF_F_RECOMPUTE_CSUM and BPF_F_INVALIDATE_HASH; any
 * other bit yields -EINVAL. Returns -EFAULT if @offset exceeds INT_MAX or
 * if the first @offset + @len bytes cannot be made writable, 0 otherwise.
 */
BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
           const void *, from, u32, len, u64, flags)
{
        void *ptr;

        if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
                return -EINVAL;
        if (unlikely(offset > INT_MAX))
                return -EFAULT;
        if (unlikely(bpf_try_make_writable(skb, offset + len)))
                return -EFAULT;

        ptr = skb->data + offset;
        /* Take the old bytes out of the checksum before overwriting them. */
        if (flags & BPF_F_RECOMPUTE_CSUM)
                __skb_postpull_rcsum(skb, ptr, len, offset);

        memcpy(ptr, from, len);

        /* Fold the freshly written bytes back into the checksum. */
        if (flags & BPF_F_RECOMPUTE_CSUM)
                __skb_postpush_rcsum(skb, ptr, len, offset);
        if (flags & BPF_F_INVALIDATE_HASH)
                skb_clear_hash(skb);

        return 0;
}

/* Descriptor for the bpf_skb_store_bytes helper: not marked GPL-only,
 * integer return, and five arguments (context pointer, offset, read-only
 * memory pointer, constant size, flags).
 */
static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
        .func                = bpf_skb_store_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type        = ARG_CONST_SIZE,
        .arg5_type        = ARG_ANYTHING,
};

/* Plain C entry point that forwards to the body of the bpf_skb_store_bytes
 * helper with the same arguments and returns its result.
 */
int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
                          u32 len, u64 flags)
{
        return ____bpf_skb_store_bytes(skb, offset, from, len, flags);
}

/* BPF helper: copy @len bytes of packet data starting at @offset into @to.
 *
 * Returns 0 on success. If @offset exceeds INT_MAX or skb_header_pointer()
 * cannot provide the bytes, @to is zero-filled and -EFAULT is returned.
 */
BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
           void *, to, u32, len)
{
        void *src = NULL;

        if (likely(offset <= INT_MAX))
                src = skb_header_pointer(skb, offset, len, to);

        if (unlikely(!src)) {
                /* Never leave the destination buffer uninitialised. */
                memset(to, 0, len);
                return -EFAULT;
        }

        /* skb_header_pointer() may already have filled @to itself. */
        if (src != to)
                memcpy(to, src, len);

        return 0;
}

/* Descriptor for the bpf_skb_load_bytes helper: not marked GPL-only,
 * integer return, and four arguments (context pointer, offset,
 * uninitialised destination memory, constant size).
 */
static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
        .func                = bpf_skb_load_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/* Plain C entry point that forwards to the body of the bpf_skb_load_bytes
 * helper with the same arguments and returns its result.
 */
int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
{
        return ____bpf_skb_load_bytes(skb, offset, to, len);
}

/* BPF helper for flow dissector programs: copy @len bytes of ctx->skb
 * starting at @offset into @to.
 *
 * Returns 0 on success. If @offset exceeds 0xffff, the context has no skb,
 * or skb_header_pointer() cannot provide the bytes, @to is zero-filled and
 * -EFAULT is returned.
 */
BPF_CALL_4(bpf_flow_dissector_load_bytes,
           const struct bpf_flow_dissector *, ctx, u32, offset,
           void *, to, u32, len)
{
        void *src = NULL;

        if (likely(offset <= 0xffff && ctx->skb))
                src = skb_header_pointer(ctx->skb, offset, len, to);

        if (unlikely(!src)) {
                /* Never leave the destination buffer uninitialised. */
                memset(to, 0, len);
                return -EFAULT;
        }

        /* skb_header_pointer() may already have filled @to itself. */
        if (src != to)
                memcpy(to, src, len);

        return 0;
}

/* Descriptor for the bpf_flow_dissector_load_bytes helper: not marked
 * GPL-only, integer return, and four arguments (context pointer, offset,
 * uninitialised destination memory, constant size).
 */
static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = {
        .func                = bpf_flow_dissector_load_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/* BPF helper: copy @len bytes into @to, reading at @offset relative to the
 * header chosen by @start_header (BPF_HDR_START_MAC or BPF_HDR_START_NET).
 *
 * Returns 0 on success. @to is zero-filled and -EFAULT returned when
 * @offset exceeds 0xffff, @start_header is unknown, the MAC header is
 * requested but was never set, or the range runs past skb_tail_pointer().
 */
BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
           u32, offset, void *, to, u32, len, u32, start_header)
{
        u8 *end = skb_tail_pointer(skb);
        u8 *start, *ptr;

        if (unlikely(offset > 0xffff))
                goto err_clear;

        /* Pick the base pointer the offset is relative to. */
        switch (start_header) {
        case BPF_HDR_START_MAC:
                if (unlikely(!skb_mac_header_was_set(skb)))
                        goto err_clear;
                start = skb_mac_header(skb);
                break;
        case BPF_HDR_START_NET:
                start = skb_network_header(skb);
                break;
        default:
                goto err_clear;
        }

        ptr = start + offset;

        /* The copy is only done when the whole range ends at or before
         * the tail pointer.
         */
        if (likely(ptr + len <= end)) {
                memcpy(to, ptr, len);
                return 0;
        }

err_clear:
        /* Never leave the destination buffer uninitialised. */
        memset(to, 0, len);
        return -EFAULT;
}

/* Descriptor for the bpf_skb_load_bytes_relative helper: not marked
 * GPL-only, integer return, and five arguments (context pointer, offset,
 * uninitialised destination memory, constant size, start-header selector).
 */
static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
        .func                = bpf_skb_load_bytes_relative,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper: make the first @len bytes of @skb writable through
 * bpf_try_make_writable(). A @len of zero means skb_headlen(skb). Returns
 * the result of that call.
 */
BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
{
        /* Idea is the following: should the needed direct read/write
         * test fail during runtime, we can pull in more data and redo
         * again, since implicitly, we invalidate previous checks here.
         *
         * Or, since we know how much we need to make read/writeable,
         * this can be done once at the program beginning for direct
         * access case. By this we overcome limitations of only current
         * headroom being accessible.
         */
        return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
}

/* Descriptor for the bpf_skb_pull_data helper: not marked GPL-only,
 * integer return, and two arguments (context pointer, length).
 */
static const struct bpf_func_proto bpf_skb_pull_data_proto = {
        .func                = bpf_skb_pull_data,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* BPF helper: hand back @sk itself when sk_fullsock() accepts it, and a
 * NULL pointer value otherwise.
 */
BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
{
        if (!sk_fullsock(sk))
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Descriptor for the bpf_sk_fullsock helper: not marked GPL-only, takes a
 * sock_common pointer and returns a socket pointer that may be NULL.
 */
static const struct bpf_func_proto bpf_sk_fullsock_proto = {
        .func                = bpf_sk_fullsock,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_SOCK_COMMON,
};

/* sk_skb flavour of the writable helper: forwards straight to
 * __bpf_try_make_writable() and, unlike bpf_try_make_writable(), does not
 * call bpf_compute_data_pointers() afterwards.
 */
static inline int sk_skb_try_make_writable(struct sk_buff *skb,
                                           unsigned int write_len)
{
        return __bpf_try_make_writable(skb, write_len);
}

/* BPF helper: make the first @len bytes of @skb writable through
 * sk_skb_try_make_writable(). A @len of zero means skb_headlen(skb).
 * Returns the result of that call.
 */
BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
{
        /* Idea is the following: should the needed direct read/write
         * test fail during runtime, we can pull in more data and redo
         * again, since implicitly, we invalidate previous checks here.
         *
         * Or, since we know how much we need to make read/writeable,
         * this can be done once at the program beginning for direct
         * access case. By this we overcome limitations of only current
         * headroom being accessible.
         */
        return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb));
}

/* Helper prototype for sk_skb_pull_data(): arg1 is the program context,
 * arg2 is typed ARG_ANYTHING, the return is an integer and gpl_only is
 * false.
 */
static const struct bpf_func_proto sk_skb_pull_data_proto = {
        .func                = sk_skb_pull_data,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Patch the 16-bit checksum stored at @offset in the packet.
 *
 * The low bits of @flags (BPF_F_HDR_FIELD_MASK) select the operation:
 *   0 - @to is applied via csum_replace_by_diff(); @from must be 0
 *   2 - csum_replace2() with @from/@to
 *   4 - csum_replace4() with @from/@to
 *
 * Returns 0 on success, -EINVAL for unknown flag bits, an unsupported
 * field size or a non-zero @from with size 0, and -EFAULT when @offset
 * is odd, above 0xffff, or the area cannot be made writable.
 */
BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
           u64, from, u64, to, u64, flags)
{
        u64 field_sz = flags & BPF_F_HDR_FIELD_MASK;
        __sum16 *csum;

        if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
                return -EINVAL;
        if (unlikely(offset > 0xffff || offset & 1) ||
            unlikely(bpf_try_make_writable(skb, offset + sizeof(*csum))))
                return -EFAULT;

        /* Only form the pointer after the writable step has succeeded. */
        csum = (__sum16 *)(skb->data + offset);

        if (field_sz == 0) {
                if (unlikely(from != 0))
                        return -EINVAL;
                csum_replace_by_diff(csum, to);
        } else if (field_sz == 2) {
                csum_replace2(csum, from, to);
        } else if (field_sz == 4) {
                csum_replace4(csum, from, to);
        } else {
                return -EINVAL;
        }

        return 0;
}

/* Helper prototype for bpf_l3_csum_replace(): arg1 is the program
 * context, arg2..arg5 are typed ARG_ANYTHING, the return is an integer
 * and gpl_only is false.
 */
static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
        .func                = bpf_l3_csum_replace,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* Patch the 16-bit checksum stored at @offset in the packet using the
 * inet_proto_csum_replace*() family.
 *
 * @flags carries the field size in BPF_F_HDR_FIELD_MASK (0, 2 or 4) plus
 * the optional bits BPF_F_PSEUDO_HDR (forwarded to the replace helpers),
 * BPF_F_MARK_MANGLED_0 and BPF_F_MARK_ENFORCE:
 *   - MARK_MANGLED_0 without MARK_ENFORCE: a stored checksum of 0 is left
 *     alone and 0 is returned.
 *   - MARK_MANGLED_0: a result of 0 is stored as CSUM_MANGLED_0.
 *
 * Returns 0 on success, -EINVAL for unknown flag bits, an unsupported
 * field size or a non-zero @from with size 0, and -EFAULT when @offset
 * is odd, above 0xffff, or the area cannot be made writable.
 */
BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
           u64, from, u64, to, u64, flags)
{
        bool pseudo = flags & BPF_F_PSEUDO_HDR;
        bool mangle0 = flags & BPF_F_MARK_MANGLED_0;
        bool enforce = flags & BPF_F_MARK_ENFORCE;
        u64 field_sz = flags & BPF_F_HDR_FIELD_MASK;
        __sum16 *csum;

        if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
                               BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
                return -EINVAL;
        if (unlikely(offset > 0xffff || offset & 1) ||
            unlikely(bpf_try_make_writable(skb, offset + sizeof(*csum))))
                return -EFAULT;

        csum = (__sum16 *)(skb->data + offset);

        /* A zero checksum is kept as is unless the caller enforces it. */
        if (mangle0 && !enforce && !*csum)
                return 0;

        if (field_sz == 0) {
                if (unlikely(from != 0))
                        return -EINVAL;
                inet_proto_csum_replace_by_diff(csum, skb, to, pseudo);
        } else if (field_sz == 2) {
                inet_proto_csum_replace2(csum, skb, from, to, pseudo);
        } else if (field_sz == 4) {
                inet_proto_csum_replace4(csum, skb, from, to, pseudo);
        } else {
                return -EINVAL;
        }

        if (mangle0 && !*csum)
                *csum = CSUM_MANGLED_0;

        return 0;
}

/* Helper prototype for bpf_l4_csum_replace(): arg1 is the program
 * context, arg2..arg5 are typed ARG_ANYTHING, the return is an integer
 * and gpl_only is false.
 */
static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
        .func                = bpf_l4_csum_replace,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* Compute a checksum over "remove @from, add @to", seeded with @seed.
 *
 * The per-CPU scratchpad is filled with the bitwise complement of every
 * word of @from followed by every word of @to, and csum_partial() is run
 * over the result. Typical uses:
 *
 *   from_size == 0, to_size > 0,  seed := csum --> pushing data
 *   from_size > 0,  to_size == 0, seed := csum --> pulling data
 *   from_size > 0,  to_size > 0,  seed := 0    --> diffing data
 *
 * The two sizes need not be equal, but each must be a multiple of
 * sizeof(__be32) and their sum must fit in the scratchpad; otherwise
 * -EINVAL is returned.
 */
BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
           __be32 *, to, u32, to_size, __wsum, seed)
{
        struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
        u32 total = from_size + to_size;
        u32 nfrom = from_size / sizeof(__be32);
        u32 nto = to_size / sizeof(__be32);
        u32 src, dst = 0;
        __wsum csum;

        if (unlikely((from_size | to_size) & (sizeof(__be32) - 1)))
                return -EINVAL;
        if (unlikely(total > sizeof(sp->diff)))
                return -EINVAL;

        /* The scratchpad is only touched while bh_lock is held. */
        local_lock_nested_bh(&bpf_sp.bh_lock);

        for (src = 0; src < nfrom; src++)
                sp->diff[dst++] = ~from[src];
        for (src = 0; src < nto; src++)
                sp->diff[dst++] = to[src];

        csum = csum_partial(sp->diff, total, seed);

        local_unlock_nested_bh(&bpf_sp.bh_lock);

        return csum;
}

/* Helper prototype for bpf_csum_diff(): two (pointer, size) pairs whose
 * pointers are read-only and may be NULL and whose sizes may be zero,
 * followed by an ARG_ANYTHING seed. pkt_access is set, the return is an
 * integer and gpl_only is false.
 */
static const struct bpf_func_proto bpf_csum_diff_proto = {
        .func                = bpf_csum_diff,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
        .arg2_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg3_type        = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
        .arg4_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg5_type        = ARG_ANYTHING,
};

/* Fold @csum into skb->csum and return the new value.
 *
 * Meant to be paired with bpf_csum_diff() after direct packet writes;
 * csum rotation for alignment and csum_sub() emulation are left to the
 * eBPF program. Only acts when skb->ip_summed is CHECKSUM_COMPLETE,
 * otherwise returns -ENOTSUPP.
 */
BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
{
        if (skb->ip_summed != CHECKSUM_COMPLETE)
                return -ENOTSUPP;

        skb->csum = csum_add(skb->csum, csum);
        return skb->csum;
}

/* Helper prototype for bpf_csum_update(): arg1 is the program context,
 * arg2 is typed ARG_ANYTHING, the return is an integer and gpl_only is
 * false.
 */
static const struct bpf_func_proto bpf_csum_update_proto = {
        .func                = bpf_csum_update,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Adjust or query the CHECKSUM_UNNECESSARY level of @skb.
 *
 * Intended to go together with bpf_skb_adjust_room() for encap/decap of
 * packet headers, for example when BPF_F_ADJ_ROOM_NO_CSUM_RESET is
 * passed as flags.
 *
 * @level is one of BPF_CSUM_LEVEL_{INC,DEC,RESET,QUERY}. QUERY returns
 * skb->csum_level when ip_summed is CHECKSUM_UNNECESSARY and -EACCES
 * otherwise; the other operations return 0. Any other value yields
 * -EINVAL.
 */
BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level)
{
        if (level == BPF_CSUM_LEVEL_QUERY) {
                if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                        return -EACCES;
                return skb->csum_level;
        }

        switch (level) {
        case BPF_CSUM_LEVEL_INC:
                __skb_incr_checksum_unnecessary(skb);
                return 0;
        case BPF_CSUM_LEVEL_DEC:
                __skb_decr_checksum_unnecessary(skb);
                return 0;
        case BPF_CSUM_LEVEL_RESET:
                __skb_reset_checksum_unnecessary(skb);
                return 0;
        default:
                return -EINVAL;
        }
}

/* Helper prototype for bpf_csum_level(): arg1 is the program context,
 * arg2 is typed ARG_ANYTHING, the return is an integer and gpl_only is
 * false.
 */
static const struct bpf_func_proto bpf_csum_level_proto = {
        .func                = bpf_csum_level,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Ingress redirect for devices that carry a MAC header: passes @skb to
 * dev_forward_skb_nomtu() on @dev and returns its result.
 */
static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
        int rc;

        rc = dev_forward_skb_nomtu(dev, skb);
        return rc;
}

/* Ingress redirect for devices without a MAC header: runs
 * ____dev_forward_skb() with its last argument false and, if that
 * returns 0, assigns @dev to the skb and queues it with netif_rx().
 *
 * Returns the first non-zero result, or the netif_rx() result.
 */
static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
                                      struct sk_buff *skb)
{
        int err = ____dev_forward_skb(dev, skb, false);

        if (unlikely(err))
                return err;

        skb->dev = dev;
        return netif_rx(skb);
}

/* Egress redirect: queue @skb for transmission on @dev.
 *
 * If dev_xmit_recursion() reports that the recursion limit has been
 * reached, the skb is freed and -ENETDOWN returned. Otherwise the skb is
 * retargeted to @dev, marked as redirected (recording whether it was at
 * tc ingress), has its timestamp cleared and is passed to
 * dev_queue_xmit() with the recursion counter raised around the call.
 *
 * Returns the dev_queue_xmit() result on the normal path.
 */
static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
{
        int rc;

        if (dev_xmit_recursion()) {
                net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
                kfree_skb(skb);
                return -ENETDOWN;
        }

        skb->dev = dev;
        skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb));
        skb_clear_tstamp(skb);

        dev_xmit_recursion_inc();
        rc = dev_queue_xmit(skb);
        dev_xmit_recursion_dec();

        return rc;
}

/* Redirect @skb to a device that does not transmit a MAC header.
 *
 * Everything in front of the network header is pulled off, then the skb
 * is handed to the rx path (BPF_F_INGRESS set in @flags) or the tx path.
 * An skb with no data beyond the network offset is freed and -ERANGE is
 * returned.
 */
static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
                                 u32 flags)
{
        unsigned int l2_len = skb_network_offset(skb);

        if (unlikely(skb->len <= l2_len)) {
                kfree_skb(skb);
                return -ERANGE;
        }

        if (l2_len) {
                __skb_pull(skb, l2_len);

                /* At ingress the mac header has already been pulled
                 * once. At egress the skb may have originated from
                 * ingress (a forwarded skb), so skb_postpull_rcsum()
                 * is needed to make rcsum start at the network header.
                 */
                if (!skb_at_tc_ingress(skb))
                        skb_postpull_rcsum(skb, skb_mac_header(skb), l2_len);
        }

        skb_pop_mac_header(skb);
        skb_reset_mac_len(skb);

        if (flags & BPF_F_INGRESS)
                return __bpf_rx_skb_no_mac(dev, skb);

        return __bpf_tx_skb(dev, skb);
}

/* Redirect @skb to a device that transmits a MAC header.
 *
 * The skb must carry a link layer header (mac_header strictly before
 * network_header) and be non-empty; otherwise it is freed and -ERANGE is
 * returned. After bpf_push_mac_rcsum() the skb goes to the rx path when
 * BPF_F_INGRESS is set in @flags, else to the tx path.
 */
static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
                                 u32 flags)
{
        bool no_l2_hdr = skb->mac_header >= skb->network_header;

        if (unlikely(no_l2_hdr || skb->len == 0)) {
                kfree_skb(skb);
                return -ERANGE;
        }

        bpf_push_mac_rcsum(skb);

        if (flags & BPF_F_INGRESS)
                return __bpf_rx_skb(dev, skb);

        return __bpf_tx_skb(dev, skb);
}

/* Dispatch a redirect according to dev_is_mac_header_xmit(@dev): the
 * "common" path when it is true, the no-MAC path otherwise.
 */
static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
                          u32 flags)
{
        return dev_is_mac_header_xmit(dev) ?
               __bpf_redirect_common(skb, dev, flags) :
               __bpf_redirect_no_mac(skb, dev, flags);
}

#if IS_ENABLED(CONFIG_IPV6)
/* Transmit @skb on @dev through the IPv6 neighbour layer.
 *
 * @net: namespace used for the OUTNOROUTES statistic
 * @skb: packet to send
 * @dev: egress device
 * @nh:  optional next hop; when NULL the next hop is derived from the
 *       skb's dst and the IPv6 destination address
 *
 * Returns the neigh_output() result when a neighbour entry is found.
 * Returns -ENOMEM when skb_expand_head() fails (the skb is not freed
 * here; NOTE(review): presumably skb_expand_head() consumed it, confirm).
 * Returns -ENETDOWN, with the skb freed, when the xmit recursion limit
 * is hit or the neighbour lookup fails.
 */
static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
                            struct net_device *dev, struct bpf_nh_params *nh)
{
        u32 hh_len = LL_RESERVED_SPACE(dev);
        const struct in6_addr *nexthop;
        struct dst_entry *dst = NULL;
        struct neighbour *neigh;

        if (dev_xmit_recursion()) {
                net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
                goto out_drop;
        }

        skb->dev = dev;
        skb_clear_tstamp(skb);

        /* Grow the headroom if the device's link layer header won't fit. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                skb = skb_expand_head(skb, hh_len);
                if (!skb)
                        return -ENOMEM;
        }

        rcu_read_lock();
        if (!nh) {
                dst = skb_dst(skb);
                nexthop = rt6_nexthop(dst_rt6_info(dst),
                                      &ipv6_hdr(skb)->daddr);
        } else {
                nexthop = &nh->ipv6_nh;
        }
        neigh = ip_neigh_gw6(dev, nexthop);
        if (likely(!IS_ERR(neigh))) {
                int ret;

                sock_confirm_neigh(skb, neigh);
                local_bh_disable();
                dev_xmit_recursion_inc();
                ret = neigh_output(neigh, skb, false);
                dev_xmit_recursion_dec();
                local_bh_enable();
                rcu_read_unlock();
                return ret;
        }
        /* Must pair with the plain rcu_read_lock() above. Using the _bh
         * unlock flavour here would also re-enable bottom halves that
         * this path never disabled.
         */
        rcu_read_unlock();
        if (dst)
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
out_drop:
        kfree_skb(skb);
        return -ENETDOWN;
}

/* IPv6 leg of the neighbour redirect.
 *
 * Without @nh, a route is looked up from the packet's IPv6 header (mark,
 * flow label, output interface, next header, addresses) and attached to
 * the skb. With @nh, only an AF_INET6 next hop is accepted. The packet
 * is then sent through bpf_out_neigh_v6().
 *
 * Returns NET_XMIT_SUCCESS or NET_XMIT_DROP. Every failure bumps the
 * device's tx_errors counter; failures detected before the output step
 * also free the skb here.
 */
static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
                                   struct bpf_nh_params *nh)
{
        const struct ipv6hdr *ip6h = ipv6_hdr(skb);
        struct net *net = dev_net(dev);
        int err;

        if (nh) {
                if (nh->nh_family != AF_INET6)
                        goto drop;
        } else {
                struct dst_entry *dst;
                struct flowi6 fl6 = {
                        .flowi6_flags = FLOWI_FLAG_ANYSRC,
                        .flowi6_mark  = skb->mark,
                        .flowlabel    = ip6_flowinfo(ip6h),
                        .flowi6_oif   = dev->ifindex,
                        .flowi6_proto = ip6h->nexthdr,
                        .daddr              = ip6h->daddr,
                        .saddr              = ip6h->saddr,
                };

                dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
                if (IS_ERR(dst))
                        goto drop;

                skb_dst_set(skb, dst);
        }

        err = bpf_out_neigh_v6(net, skb, dev, nh);
        if (likely(!net_xmit_eval(err)))
                return NET_XMIT_SUCCESS;

        /* The output helper has already disposed of the skb. */
        DEV_STATS_INC(dev, tx_errors);
        return NET_XMIT_DROP;

drop:
        DEV_STATS_INC(dev, tx_errors);
        kfree_skb(skb);
        return NET_XMIT_DROP;
}
#else
/* Variant built when CONFIG_IPV6 is not enabled: free the skb and
 * report NET_XMIT_DROP.
 */
static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
                                   struct bpf_nh_params *nh)
{
        kfree_skb(skb);
        return NET_XMIT_DROP;
}
#endif /* CONFIG_IPV6 */

#if IS_ENABLED(CONFIG_INET)
/* Transmit @skb on @dev through the neighbour layer for an IPv4 packet.
 *
 * @net: namespace (unused in this function's visible body)
 * @skb: packet to send
 * @dev: egress device
 * @nh:  optional next hop; may be AF_INET or AF_INET6. When NULL the
 *       neighbour is derived from the skb's route.
 *
 * Returns the neigh_output() result when a neighbour entry is found.
 * Returns -ENOMEM when skb_expand_head() fails (the skb is not freed
 * here). Returns -ENETDOWN, with the skb freed, when the xmit recursion
 * limit is hit, the next hop family is unsupported, or the neighbour
 * lookup fails.
 */
static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
                            struct net_device *dev, struct bpf_nh_params *nh)
{
        u32 hh_len = LL_RESERVED_SPACE(dev);
        struct neighbour *neigh;
        bool is_v6gw = false;
        int ret;

        if (dev_xmit_recursion()) {
                net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
                goto out_drop;
        }

        skb->dev = dev;
        skb_clear_tstamp(skb);

        /* Grow the headroom if the device's link layer header won't fit. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                skb = skb_expand_head(skb, hh_len);
                if (!skb)
                        return -ENOMEM;
        }

        rcu_read_lock();
        if (nh) {
                switch (nh->nh_family) {
                case AF_INET6:
                        neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
                        is_v6gw = true;
                        break;
                case AF_INET:
                        neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
                        break;
                default:
                        rcu_read_unlock();
                        goto out_drop;
                }
        } else {
                neigh = ip_neigh_for_gw(skb_rtable(skb), skb, &is_v6gw);
        }

        if (unlikely(IS_ERR(neigh))) {
                rcu_read_unlock();
                goto out_drop;
        }

        sock_confirm_neigh(skb, neigh);
        local_bh_disable();
        dev_xmit_recursion_inc();
        ret = neigh_output(neigh, skb, is_v6gw);
        dev_xmit_recursion_dec();
        local_bh_enable();
        rcu_read_unlock();
        return ret;

out_drop:
        kfree_skb(skb);
        return -ENETDOWN;
}

static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
                                   struct bpf_nh_params *nh)
{
        const struct iphdr *ip4h = ip_hdr(skb);
        struct net *net = dev_net(dev);
        int err, ret = NET_XMIT_DROP;

        if (!nh) {
                struct flowi4 fl4 = {
                        .flowi4_flags = FLOWI_FLAG_ANYSRC,
                        .flowi4_mark  = skb->mark,
                        .flowi4_tos   = RT_TOS(ip4h->tos),
                        .flowi4_oif   = dev->ifindex,
                        .flowi4_proto = ip4h->protocol,
                        .daddr              = ip4h->daddr,
                        .saddr              = ip4h->saddr,
                };
                struct rtable *rt;

                rt = ip_route_output_flow(net, &fl4, NULL);
                if (IS_ERR(rt))
                        goto out_drop;
                if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
                        ip_rt_put(rt);
                        goto out_drop;
                }

                skb_dst_set(skb, &rt->dst);
        }

        err = bpf_out_neigh_v4(net, skb, dev, nh);
        if (unlikely(net_xmit_eval(err)))
                DEV_STATS_INC(dev, tx_errors);
        else
                ret = NET_XMIT_SUCCESS;
        goto out_xmit;
out_drop:
        DEV_STATS_INC(dev, tx_errors);
        kfree_skb(skb);
out_xmit:
        return ret;
}
#else
/* Variant built when CONFIG_INET is not enabled: free the skb and
 * report NET_XMIT_DROP.
 */
static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
                                   struct bpf_nh_params *nh)
{
        kfree_skb(skb);
        return NET_XMIT_DROP;
}
#endif /* CONFIG_INET */

/* Redirect @skb via the neighbour subsystem of @dev.
 *
 * The skb must carry a link layer header and must not be addressed to a
 * multicast Ethernet destination. The Ethernet header is then pulled
 * off, the mac header unset and the network header reset, and the packet
 * is handed to the IPv4 or IPv6 leg according to skb->protocol.
 *
 * Any other case frees the skb and returns -ENOTSUPP.
 */
static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
                                struct bpf_nh_params *nh)
{
        struct ethhdr *l2 = eth_hdr(skb);

        if (unlikely(skb->mac_header >= skb->network_header))
                goto unsupported;

        bpf_push_mac_rcsum(skb);
        if (is_multicast_ether_addr(l2->h_dest))
                goto unsupported;

        skb_pull(skb, sizeof(*l2));
        skb_unset_mac_header(skb);
        skb_reset_network_header(skb);

        switch (skb->protocol) {
        case htons(ETH_P_IP):
                return __bpf_redirect_neigh_v4(skb, dev, nh);
        case htons(ETH_P_IPV6):
                return __bpf_redirect_neigh_v6(skb, dev, nh);
        default:
                break;
        }

unsupported:
        kfree_skb(skb);
        return -ENOTSUPP;
}

/* Internal, non-exposed redirect flags.
 *
 * These bits are stored in bpf_redirect_info::flags by
 * bpf_redirect_peer() and bpf_redirect_neigh() and are consumed by
 * skb_do_redirect(). bpf_redirect() and bpf_clone_redirect() reject any
 * of them when they appear in program-supplied flags.
 */
enum {
        BPF_F_NEIGH        = (1ULL << 1),
        BPF_F_PEER        = (1ULL << 2),
        BPF_F_NEXTHOP        = (1ULL << 3),
/* Union of all internal bits, used for the rejection checks. */
#define BPF_F_REDIRECT_INTERNAL        (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
};

/* Clone @skb and redirect the clone to the device with index @ifindex.
 *
 * Only BPF_F_INGRESS is accepted in @flags. Returns -EINVAL for any
 * other flag bit or an unknown device, -ENOMEM when cloning or
 * uncloning fails, and otherwise the result of __bpf_redirect() on the
 * clone.
 */
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
{
        struct net_device *target;
        struct sk_buff *copy;

        if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
                return -EINVAL;

        target = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
        if (unlikely(!target))
                return -EINVAL;

        copy = skb_clone(skb, GFP_ATOMIC);
        if (unlikely(!copy))
                return -ENOMEM;

        /* Direct writes rely on the skbs we deal with being uncloned.
         * If uncloning the original fails, drop the clone we just made
         * so that the original is uncloned once again.
         */
        if (unlikely(bpf_try_make_head_writable(skb))) {
                kfree_skb(copy);
                return -ENOMEM;
        }

        return __bpf_redirect(copy, target, flags);
}

/* Helper prototype for bpf_clone_redirect(): arg1 is the program
 * context, arg2 and arg3 are typed ARG_ANYTHING, the return is an
 * integer and gpl_only is false.
 */
static const struct bpf_func_proto bpf_clone_redirect_proto = {
        .func           = bpf_clone_redirect,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_ANYTHING,
};

/* Return the peer of @dev as reported by its ndo_get_peer_dev()
 * callback, or NULL when the device does not implement that callback.
 */
static struct net_device *skb_get_peer_dev(struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (unlikely(!ops->ndo_get_peer_dev))
                return NULL;

        return INDIRECT_CALL_1(ops->ndo_get_peer_dev, netkit_peer_dev, dev);
}

/* Carry out the redirect recorded in the current bpf_redirect_info.
 *
 * The target index and flags are read and then cleared. Depending on
 * the flags:
 *   - BPF_F_PEER:  the skb must be at tc ingress; it is retargeted to the
 *                  peer device (which must be up and in a different
 *                  netns), rx stats are updated and -EAGAIN is returned.
 *   - BPF_F_NEIGH: the skb is sent through __bpf_redirect_neigh(), with
 *                  the recorded next hop when BPF_F_NEXTHOP is set.
 *   - otherwise:   the skb goes through __bpf_redirect().
 *
 * On a lookup or validation failure the skb is freed and -EINVAL is
 * returned.
 */
int skb_do_redirect(struct sk_buff *skb)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        struct net *net = dev_net(skb->dev);
        u32 flags = ri->flags;
        struct net_device *dev;
        struct bpf_nh_params *nh;

        dev = dev_get_by_index_rcu(net, ri->tgt_index);

        /* Consume the request so it cannot be replayed. */
        ri->tgt_index = 0;
        ri->flags = 0;

        if (unlikely(!dev))
                goto out_drop;

        if (flags & BPF_F_PEER) {
                if (unlikely(!skb_at_tc_ingress(skb)))
                        goto out_drop;

                dev = skb_get_peer_dev(dev);
                if (unlikely(!dev ||
                             !(dev->flags & IFF_UP) ||
                             net_eq(net, dev_net(dev))))
                        goto out_drop;

                skb->dev = dev;
                dev_sw_netstats_rx_add(dev, skb->len);
                return -EAGAIN;
        }

        if (!(flags & BPF_F_NEIGH))
                return __bpf_redirect(skb, dev, flags);

        nh = flags & BPF_F_NEXTHOP ? &ri->nh : NULL;
        return __bpf_redirect_neigh(skb, dev, nh);

out_drop:
        kfree_skb(skb);
        return -EINVAL;
}

/* Record a redirect to @ifindex in the current bpf_redirect_info.
 *
 * Only BPF_F_INGRESS may be set in @flags; any other bit, including the
 * internal redirect bits, yields TC_ACT_SHOT. On success the target and
 * flags are stored and TC_ACT_REDIRECT is returned.
 */
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
        struct bpf_redirect_info *info = bpf_net_ctx_get_ri();

        if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
                return TC_ACT_SHOT;

        info->tgt_index = ifindex;
        info->flags = flags;

        return TC_ACT_REDIRECT;
}

/* Helper prototype for bpf_redirect(): both arguments are typed
 * ARG_ANYTHING, the return is an integer and gpl_only is false.
 */
static const struct bpf_func_proto bpf_redirect_proto = {
        .func           = bpf_redirect,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_ANYTHING,
        .arg2_type      = ARG_ANYTHING,
};

/* Record a peer redirect to @ifindex in the current bpf_redirect_info.
 *
 * @flags must be zero, otherwise TC_ACT_SHOT is returned. On success
 * the internal BPF_F_PEER flag and the target are stored and
 * TC_ACT_REDIRECT is returned.
 */
BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
{
        struct bpf_redirect_info *info = bpf_net_ctx_get_ri();

        if (unlikely(flags))
                return TC_ACT_SHOT;

        info->tgt_index = ifindex;
        info->flags = BPF_F_PEER;

        return TC_ACT_REDIRECT;
}

/* Helper prototype for bpf_redirect_peer(): both arguments are typed
 * ARG_ANYTHING, the return is an integer and gpl_only is false.
 */
static const struct bpf_func_proto bpf_redirect_peer_proto = {
        .func           = bpf_redirect_peer,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_ANYTHING,
        .arg2_type      = ARG_ANYTHING,
};

/* Record a neighbour redirect to @ifindex in the current
 * bpf_redirect_info.
 *
 * @params/@plen optionally supply a next hop: a @plen of zero means
 * none, and a non-zero @plen must cover at least sizeof(*params).
 * @flags must be zero. Invalid input yields TC_ACT_SHOT. On success
 * BPF_F_NEIGH is stored (plus BPF_F_NEXTHOP and a copy of @params when a
 * next hop was given) and TC_ACT_REDIRECT is returned.
 */
BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
           int, plen, u64, flags)
{
        struct bpf_redirect_info *info = bpf_net_ctx_get_ri();

        /* The memcpy() below relies on both layouts having one size. */
        BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));

        if (unlikely((plen && plen < sizeof(*params)) || flags))
                return TC_ACT_SHOT;

        info->tgt_index = ifindex;
        info->flags = BPF_F_NEIGH;

        if (plen) {
                info->flags |= BPF_F_NEXTHOP;
                memcpy(&info->nh, params, sizeof(info->nh));
        }

        return TC_ACT_REDIRECT;
}

/* Helper prototype for bpf_redirect_neigh(): arg1 and arg4 are typed
 * ARG_ANYTHING; arg2/arg3 form a (pointer, size) pair where the pointer
 * is read-only and may be NULL and the size may be zero. The return is
 * an integer and gpl_only is false.
 */
static const struct bpf_func_proto bpf_redirect_neigh_proto = {
        .func                = bpf_redirect_neigh,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_ANYTHING,
        .arg2_type      = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type        = ARG_ANYTHING,
};

/* Store @bytes in msg->apply_bytes. Always returns 0. */
BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
{
        msg->apply_bytes = bytes;
        return 0;
}

/* Helper prototype for bpf_msg_apply_bytes(): arg1 is the program
 * context, arg2 is typed ARG_ANYTHING, the return is an integer and
 * gpl_only is false.
 */
static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
        .func           = bpf_msg_apply_bytes,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
};

/* Store @bytes in msg->cork_bytes. Always returns 0. */
BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
{
        msg->cork_bytes = bytes;
        return 0;
}

/* Recompute msg->sg.curr after the scatterlist ring has been modified.
 *
 * Starting at sg.start, element lengths are accumulated. curr is set to
 * the slot following the element at which the running total reaches
 * sg.size, or to sg.end if the ring is exhausted first. copybreak is
 * reset to 0.
 */
static void sk_msg_reset_curr(struct sk_msg *msg)
{
        u32 idx = msg->sg.start;
        u32 seen = 0;

        for (;;) {
                seen += sk_msg_elem(msg, idx)->length;
                sk_msg_iter_var_next(idx);

                if (seen >= msg->sg.size || idx == msg->sg.end)
                        break;
        }

        msg->sg.curr = idx;
        msg->sg.copybreak = 0;
}

/* Helper prototype for bpf_msg_cork_bytes(): arg1 is the program
 * context, arg2 is typed ARG_ANYTHING, the return is an integer and
 * gpl_only is false.
 */
static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
        .func           = bpf_msg_cork_bytes,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
};

/* Make message bytes [@start, @end) directly accessible through
 * msg->data / msg->data_end.
 *
 * If the range already sits inside one scatterlist element that is not
 * flagged in msg->sg.copy, only the data pointers are updated. Otherwise
 * every element touched by the range is copied into a freshly allocated
 * linear buffer, that buffer replaces the first touched element, and the
 * remaining ring entries are shifted down to close the gap.
 *
 * Returns 0 on success, -EINVAL when @flags is non-zero, @end <= @start
 * or the range is not fully inside the message, and -ENOMEM when the
 * buffer allocation fails.
 */
BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
           u32, end, u64, flags)
{
        u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
        u32 first_sge, last_sge, i, shift, bytes_sg_total;
        struct scatterlist *sge;
        u8 *raw, *to, *from;
        struct page *page;

        if (unlikely(flags || end <= start))
                return -EINVAL;

        /* First find the starting scatterlist element */
        i = msg->sg.start;
        do {
                /* offset is the message offset at which element i begins */
                offset += len;
                len = sk_msg_elem(msg, i)->length;
                if (start < offset + len)
                        break;
                sk_msg_iter_var_next(i);
        } while (i != msg->sg.end);

        /* The loop ran off the end of the ring: start is past the data */
        if (unlikely(start >= offset + len))
                return -EINVAL;

        first_sge = i;
        /* The start may point into the sg element so we need to also
         * account for the headroom.
         */
        bytes_sg_total = start - offset + bytes;
        if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len)
                goto out;

        /* At this point we need to linearize multiple scatterlist
         * elements or a single shared page. Either way we need to
         * copy into a linear buffer exclusively owned by BPF. Then
         * place the buffer in the scatterlist and fixup the original
         * entries by removing the entries now in the linear buffer
         * and shifting the remaining entries. For now we do not try
         * to copy partial entries to avoid complexity of running out
         * of sg_entry slots. The downside is reading a single byte
         * will copy the entire sg entry.
         */
        do {
                copy += sk_msg_elem(msg, i)->length;
                sk_msg_iter_var_next(i);
                if (bytes_sg_total <= copy)
                        break;
        } while (i != msg->sg.end);
        /* last_sge is one past the final element being merged */
        last_sge = i;

        /* The requested range extends beyond the end of the message */
        if (unlikely(bytes_sg_total > copy))
                return -EINVAL;

        page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
                           get_order(copy));
        if (unlikely(!page))
                return -ENOMEM;

        /* Copy each merged element into the new buffer back to back and
         * drop the reference on the page it used to point at.
         */
        raw = page_address(page);
        i = first_sge;
        do {
                sge = sk_msg_elem(msg, i);
                from = sg_virt(sge);
                len = sge->length;
                to = raw + poffset;

                memcpy(to, from, len);
                poffset += len;
                sge->length = 0;
                put_page(sg_page(sge));

                sk_msg_iter_var_next(i);
        } while (i != last_sge);

        sg_set_page(&msg->sg.data[first_sge], page, copy, 0);

        /* To repair sg ring we need to shift entries. If we only
         * had a single entry though we can just replace it and
         * be done. Otherwise walk the ring and shift the entries.
         */
        WARN_ON_ONCE(last_sge == first_sge);
        /* shift = merged elements minus the one slot that is reused;
         * the second form handles last_sge having wrapped around.
         */
        shift = last_sge > first_sge ?
                last_sge - first_sge - 1 :
                NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
        if (!shift)
                goto out;

        i = first_sge;
        sk_msg_iter_var_next(i);
        do {
                u32 move_from;

                /* Source slot is i + shift, modulo the ring size */
                if (i + shift >= NR_MSG_FRAG_IDS)
                        move_from = i + shift - NR_MSG_FRAG_IDS;
                else
                        move_from = i + shift;
                if (move_from == msg->sg.end)
                        break;

                msg->sg.data[i] = msg->sg.data[move_from];
                msg->sg.data[move_from].length = 0;
                msg->sg.data[move_from].page_link = 0;
                msg->sg.data[move_from].offset = 0;
                sk_msg_iter_var_next(i);
        } while (1);

        /* Pull sg.end back by shift; the comparison detects an unsigned
         * wrap below zero, in which case the ring size is added back.
         */
        msg->sg.end = msg->sg.end - shift > msg->sg.end ?
                      msg->sg.end - shift + NR_MSG_FRAG_IDS :
                      msg->sg.end - shift;
out:
        sk_msg_reset_curr(msg);
        /* offset is still the message offset at which first_sge begins */
        msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
        msg->data_end = msg->data + bytes;
        return 0;
}

/* Helper prototype for bpf_msg_pull_data(): arg1 is the program
 * context, arg2..arg4 are typed ARG_ANYTHING, the return is an integer
 * and gpl_only is false.
 */
static const struct bpf_func_proto bpf_msg_pull_data_proto = {
        .func                = bpf_msg_pull_data,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
};

/* Insert @len bytes of newly allocated space at message offset @start.
 *
 * The element containing @start is located first. Then one of three
 * things happens:
 *   - copy path: when there are not enough free ring slots, the whole
 *     element is copied into the new buffer around the gap, and the new
 *     buffer replaces it in place;
 *   - split path: when @start falls inside an element, the element is
 *     cut into a head (kept in place) and a tail (rsge), and the new
 *     buffer is inserted between them;
 *   - aligned path: when @start is the first byte of an element, the new
 *     buffer is inserted in front of it.
 * In the last two cases the following ring entries are shifted up by one
 * or two slots.
 *
 * Returns 0 on success (also for @len == 0, which is a no-op), -EINVAL
 * when @flags is non-zero or @start is not inside the message, and
 * -ENOMEM when the buffer allocation fails.
 */
BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
           u32, len, u64, flags)
{
        struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
        u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
        u8 *raw, *to, *from;
        struct page *page;

        if (unlikely(flags))
                return -EINVAL;

        if (unlikely(len == 0))
                return 0;

        /* First find the starting scatterlist element */
        i = msg->sg.start;
        do {
                /* offset is the message offset at which element i begins */
                offset += l;
                l = sk_msg_elem(msg, i)->length;

                if (start < offset + l)
                        break;
                sk_msg_iter_var_next(i);
        } while (i != msg->sg.end);

        if (start >= offset + l)
                return -EINVAL;

        space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);

        /* If no space available will fallback to copy, we need at
         * least one scatterlist elem available to push data into
         * when start aligns to the beginning of an element or two
         * when it falls inside an element. We handle the start equals
         * offset case because its the common case for inserting a
         * header.
         */
        if (!space || (space == 1 && start != offset))
                copy = msg->sg.data[i].length;

        page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
                           get_order(copy + len));
        if (unlikely(!page))
                return -ENOMEM;

        if (copy) {
                int front, back;

                raw = page_address(page);

                /* Lay out: [front of old elem][len-byte gap][back of old elem] */
                psge = sk_msg_elem(msg, i);
                front = start - offset;
                back = psge->length - front;
                from = sg_virt(psge);

                if (front)
                        memcpy(raw, from, front);

                if (back) {
                        from += front;
                        to = raw + front + len;

                        memcpy(to, from, back);
                }

                put_page(sg_page(psge));
        } else if (start - offset) {
                psge = sk_msg_elem(msg, i);
                rsge = sk_msg_elem_cpy(msg, i);

                /* Split element i: psge keeps the first (start - offset)
                 * bytes, rsge describes the remainder. The remainder
                 * begins exactly psge->length bytes further into the
                 * page, so advance by the element-relative distance
                 * rather than the message-absolute @start.
                 */
                psge->length = start - offset;
                rsge.length -= psge->length;
                rsge.offset += start - offset;

                sk_msg_iter_var_next(i);
                sg_unmark_end(psge);
                sg_unmark_end(&rsge);
                sk_msg_iter_next(msg, end);
        }

        /* Slot(s) to place newly allocated data */
        new = i;

        /* Shift one or two slots as needed */
        if (!copy) {
                sge = sk_msg_elem_cpy(msg, i);

                sk_msg_iter_var_next(i);
                sg_unmark_end(&sge);
                sk_msg_iter_next(msg, end);

                nsge = sk_msg_elem_cpy(msg, i);
                if (rsge.length) {
                        sk_msg_iter_var_next(i);
                        nnsge = sk_msg_elem_cpy(msg, i);
                }

                /* Ripple the saved entries forward until sg.end */
                while (i != msg->sg.end) {
                        msg->sg.data[i] = sge;
                        sge = nsge;
                        sk_msg_iter_var_next(i);
                        if (rsge.length) {
                                nsge = nnsge;
                                nnsge = sk_msg_elem_cpy(msg, i);
                        } else {
                                nsge = sk_msg_elem_cpy(msg, i);
                        }
                }
        }

        /* Place newly allocated data buffer */
        sk_mem_charge(msg->sk, len);
        msg->sg.size += len;
        __clear_bit(new, msg->sg.copy);
        sg_set_page(&msg->sg.data[new], page, len + copy, 0);
        if (rsge.length) {
                /* The tail shares its page with the head, so take an
                 * extra reference for the additional ring entry.
                 */
                get_page(sg_page(&rsge));
                sk_msg_iter_var_next(new);
                msg->sg.data[new] = rsge;
        }

        sk_msg_reset_curr(msg);
        sk_msg_compute_data_pointers(msg);
        return 0;
}

/* Helper descriptor for bpf_msg_push_data(): not GPL-restricted, returns an
 * integer, takes the program context followed by three ARG_ANYTHING
 * arguments.
 */
static const struct bpf_func_proto bpf_msg_push_data_proto = {
        .func                = bpf_msg_push_data,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
};

/* Remove ring slot @i from @msg: every element after @i is copied one slot
 * towards the start and sg.end is moved back by one.
 *
 * The element stored in slot @i is overwritten by the shift, so the page
 * reference it holds is released first; without this the page would be
 * leaked every time a whole element is popped.
 */
static void sk_msg_shift_left(struct sk_msg *msg, int i)
{
        struct scatterlist *sge = sk_msg_elem(msg, i);
        int prev;

        put_page(sg_page(sge));
        do {
                prev = i;
                sk_msg_iter_var_next(i);
                msg->sg.data[prev] = msg->sg.data[i];
        } while (i != msg->sg.end);

        sk_msg_iter_prev(msg, end);
}

static void sk_msg_shift_right(struct sk_msg *msg, int i)
{
        struct scatterlist tmp, sge;

        sk_msg_iter_next(msg, end);
        sge = sk_msg_elem_cpy(msg, i);
        sk_msg_iter_var_next(i);
        tmp = sk_msg_elem_cpy(msg, i);

        while (i != msg->sg.end) {
                msg->sg.data[i] = sge;
                sk_msg_iter_var_next(i);
                sge = tmp;
                tmp = sk_msg_elem_cpy(msg, i);
        }
}

/* BPF helper: remove @len bytes starting at byte offset @start from @msg.
 *
 * @flags must be zero. Returns 0 on success (including the no-op case
 * @len == 0), -EINVAL if @flags is set or the range [start, start + len)
 * is not fully inside the message, and -ENOMEM if a replacement page could
 * not be allocated.
 */
BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
           u32, len, u64, flags)
{
        u32 i = 0, l = 0, space, offset = 0;
        /* Widen before adding so start + len cannot wrap in 32 bits and
         * slip past the bounds check below.
         */
        u64 last = (u64)start + len;
        int pop;

        if (unlikely(flags))
                return -EINVAL;

        /* Nothing to remove; avoid splitting or reallocating an element. */
        if (unlikely(len == 0))
                return 0;

        /* First find the starting scatterlist element */
        i = msg->sg.start;
        do {
                offset += l;
                l = sk_msg_elem(msg, i)->length;

                if (start < offset + l)
                        break;
                sk_msg_iter_var_next(i);
        } while (i != msg->sg.end);

        /* Bounds checks: start and pop must be inside message. A pop that
         * ends exactly at the end of the message (last == size) is valid.
         */
        if (start >= offset + l || last > msg->sg.size)
                return -EINVAL;

        space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);

        pop = len;
        /* --------------| offset
         * -| start      |-------- len -------|
         *
         *  |----- a ----|-------- pop -------|----- b ----|
         *  |______________________________________________| length
         *
         *
         * a:   region at front of scatter element to save
         * b:   region at back of scatter element to save when length > a + pop
         * pop: region to pop from element, same as input 'pop' here will be
         *      decremented below per iteration.
         *
         * Two top-level cases to handle when start != offset, first b is non
         * zero and second b is zero corresponding to when a pop includes more
         * than one element.
         *
         * Then if b is non-zero AND there is no space allocate space and
         * compact a, b regions into page. If there is space shift ring to
         * the right freeing the next element in ring to place b, leaving
         * a untouched except to reduce length.
         */
        if (start != offset) {
                struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
                /* 'start' is relative to the whole message, 'a' must be
                 * relative to this element.
                 */
                int a = start - offset;
                int b = sge->length - pop - a;

                sk_msg_iter_var_next(i);

                if (b > 0) {
                        if (space) {
                                sge->length = a;
                                sk_msg_shift_right(msg, i);
                                nsge = sk_msg_elem(msg, i);
                                /* Both halves now reference the same page. */
                                get_page(sg_page(sge));
                                sg_set_page(nsge,
                                            sg_page(sge),
                                            b, sge->offset + pop + a);
                        } else {
                                struct page *page, *orig;
                                u8 *to, *from;

                                page = alloc_pages(__GFP_NOWARN |
                                                   __GFP_COMP   | GFP_ATOMIC,
                                                   get_order(a + b));
                                if (unlikely(!page))
                                        return -ENOMEM;

                                orig = sg_page(sge);
                                from = sg_virt(sge);
                                to = page_address(page);
                                memcpy(to, from, a);
                                memcpy(to + a, from + a + pop, b);
                                sg_set_page(sge, page, a + b, 0);
                                put_page(orig);
                        }
                        pop = 0;
                } else {
                        pop -= (sge->length - a);
                        sge->length = a;
                }
        }

        /* From above the current layout _must_ be as follows,
         *
         * -| offset
         * -| start
         *
         *  |---- pop ---|---------------- b ------------|
         *  |____________________________________________| length
         *
         * Offset and start of the current msg elem are equal because in the
         * previous case we handled offset != start and either consumed the
         * entire element and advanced to the next element OR pop == 0.
         *
         * Two cases to handle here are first pop is less than the length
         * leaving some remainder b above. Simply adjust the element's layout
         * in this case. Or pop >= length of the element so that b = 0. In this
         * case remove the element; sk_msg_shift_left() moves the following
         * element into slot i, so i must not be advanced afterwards.
         */
        while (pop) {
                struct scatterlist *sge = sk_msg_elem(msg, i);

                if (pop < sge->length) {
                        sge->length -= pop;
                        sge->offset += pop;
                        pop = 0;
                } else {
                        pop -= sge->length;
                        sk_msg_shift_left(msg, i);
                }
        }

        sk_mem_uncharge(msg->sk, len - pop);
        msg->sg.size -= (len - pop);
        sk_msg_reset_curr(msg);
        sk_msg_compute_data_pointers(msg);
        return 0;
}

/* Helper descriptor for bpf_msg_pop_data(): not GPL-restricted, returns an
 * integer, takes the program context followed by three ARG_ANYTHING
 * arguments.
 */
static const struct bpf_func_proto bpf_msg_pop_data_proto = {
        .func                = bpf_msg_pop_data,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
};

#ifdef CONFIG_CGROUP_NET_CLASSID
/* BPF helper taking no arguments: returns __task_get_classid() evaluated on
 * 'current'.
 */
BPF_CALL_0(bpf_get_cgroup_classid_curr)
{
        return __task_get_classid(current);
}

/* Helper descriptor for bpf_get_cgroup_classid_curr(): not GPL-restricted,
 * returns an integer, declares no arguments. Not declared static, unlike
 * the neighbouring descriptors.
 */
const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
        .func                = bpf_get_cgroup_classid_curr,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
};

/* BPF helper: return the cgroup classid recorded in the full socket that
 * owns @skb, or 0 when skb_to_full_sk() yields no socket or the socket is
 * not a full socket.
 */
BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb)
{
        struct sock *owner = skb_to_full_sk(skb);

        if (owner && sk_fullsock(owner))
                return sock_cgroup_classid(&owner->sk_cgrp_data);

        return 0;
}

/* Helper descriptor for bpf_skb_cgroup_classid(): not GPL-restricted,
 * returns an integer, takes only the program context.
 */
static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = {
        .func                = bpf_skb_cgroup_classid,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};
#endif

/* BPF helper: returns task_get_classid() for @skb. */
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
        return task_get_classid(skb);
}

/* Helper descriptor for bpf_get_cgroup_classid(): not GPL-restricted,
 * returns an integer, takes only the program context.
 */
static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
        .func           = bpf_get_cgroup_classid,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* BPF helper: returns dst_tclassid() for @skb. */
BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
{
        return dst_tclassid(skb);
}

/* Helper descriptor for bpf_get_route_realm(): not GPL-restricted, returns
 * an integer, takes only the program context.
 */
static const struct bpf_func_proto bpf_get_route_realm_proto = {
        .func           = bpf_get_route_realm,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* BPF helper: returns skb_get_hash() for @skb. */
BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
{
        /* If skb_clear_hash() was called due to mangling, we can
         * trigger SW recalculation here. Later access to hash
         * can then use the inline skb->hash via context directly
         * instead of calling this helper again.
         */
        return skb_get_hash(skb);
}

/* Helper descriptor for bpf_get_hash_recalc(): not GPL-restricted, returns
 * an integer, takes only the program context.
 */
static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
        .func                = bpf_get_hash_recalc,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* BPF helper: calls skb_clear_hash() on @skb and always returns 0. */
BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
{
        /* After all direct packet write, this can be used once for
         * triggering a lazy recalc on next skb_get_hash() invocation.
         */
        skb_clear_hash(skb);
        return 0;
}

/* Helper descriptor for bpf_set_hash_invalid(): not GPL-restricted, returns
 * an integer, takes only the program context.
 */
static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
        .func                = bpf_set_hash_invalid,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* BPF helper: stores the caller-supplied @hash on @skb through
 * __skb_set_sw_hash() with the L4 flag set to true; always returns 0.
 */
BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash)
{
        /* Set user specified hash as L4(+), so that it gets returned
         * on skb_get_hash() call unless BPF prog later on triggers a
         * skb_clear_hash().
         */
        __skb_set_sw_hash(skb, hash, true);
        return 0;
}

/* Helper descriptor for bpf_set_hash(): not GPL-restricted, returns an
 * integer, takes the program context and one ARG_ANYTHING argument.
 */
static const struct bpf_func_proto bpf_set_hash_proto = {
        .func                = bpf_set_hash,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* BPF helper: push a VLAN tag with protocol @vlan_proto and TCI @vlan_tci
 * onto @skb.
 *
 * A @vlan_proto other than 802.1Q or 802.1AD is silently replaced by
 * 802.1Q. Returns whatever skb_vlan_push() returns; the data pointers are
 * recomputed regardless of the outcome.
 */
BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
           u16, vlan_tci)
{
        const bool known_proto = vlan_proto == htons(ETH_P_8021Q) ||
                                 vlan_proto == htons(ETH_P_8021AD);
        int err;

        if (unlikely(!known_proto))
                vlan_proto = htons(ETH_P_8021Q);

        /* The push is bracketed by the mac rcsum push/pull pair. */
        bpf_push_mac_rcsum(skb);
        err = skb_vlan_push(skb, vlan_proto, vlan_tci);
        bpf_pull_mac_rcsum(skb);
        bpf_compute_data_pointers(skb);

        return err;
}

/* Helper descriptor for bpf_skb_vlan_push(): not GPL-restricted, returns an
 * integer, takes the program context and two ARG_ANYTHING arguments.
 */
static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
        .func           = bpf_skb_vlan_push,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_ANYTHING,
};

/* BPF helper: pop a VLAN tag from @skb. Returns whatever skb_vlan_pop()
 * returns; the data pointers are recomputed regardless of the outcome.
 */
BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
{
        int err;

        /* The pop is bracketed by the mac rcsum push/pull pair. */
        bpf_push_mac_rcsum(skb);
        err = skb_vlan_pop(skb);
        bpf_pull_mac_rcsum(skb);
        bpf_compute_data_pointers(skb);

        return err;
}

/* Helper descriptor for bpf_skb_vlan_pop(): not GPL-restricted, returns an
 * integer, takes only the program context.
 */
static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
        .func           = bpf_skb_vlan_pop,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Insert @len zeroed bytes at offset @off from the start of @skb's data.
 *
 * The data area is extended by @len at the front, the first @off bytes are
 * moved down to the new front, and the gap left behind is cleared. The
 * caller must already have reserved @len bytes of headroom via skb_cow().
 * Always returns 0.
 */
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
        unsigned char *head = skb_push(skb, len);

        memmove(head, head + len, off);
        memset(head + off, 0, len);

        /* skb_postpush_rcsum() is deliberately not called: summing over a
         * block of zeroes leaves skb->csum unchanged for checksum complete.
         */
        return 0;
}

/* Remove @len bytes located at offset @off from the start of @skb's data.
 *
 * Returns -ENOMEM when the first @off + @len bytes cannot be pulled,
 * otherwise 0. The skb is expected to be uncloned already, which is why no
 * skb_ensure_writable() call is made here.
 */
static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
{
        void *front;

        if (unlikely(!pskb_may_pull(skb, off + len)))
                return -ENOMEM;

        /* Remember the old start, drop @len bytes from the front, fix up
         * the checksum for the removed region, then slide the leading
         * @off bytes up against the remaining data.
         */
        front = skb->data;
        __skb_pull(skb, len);
        skb_postpull_rcsum(skb, front + off, len);
        memmove(skb->data, front, off);

        return 0;
}

/* Insert @len bytes at offset @off via bpf_skb_generic_push() and move the
 * mac and network header offsets back by @len. If the transport header
 * offset equalled the network header offset on entry, it is kept equal.
 *
 * No __skb_push()/__skb_pull() pair is needed to reach the mac header, as
 * eBPF programs always start from there.
 */
static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
{
        const bool l4_follows_l3 =
                skb->transport_header == skb->network_header;
        int err = bpf_skb_generic_push(skb, off, len);

        if (unlikely(err))
                return err;

        skb->mac_header -= len;
        skb->network_header -= len;
        if (l4_follows_l3)
                skb->transport_header = skb->network_header;

        return 0;
}

/* Remove @len bytes at offset @off via bpf_skb_generic_pop() and move the
 * mac and network header offsets forward by @len. If the transport header
 * offset equalled the network header offset on entry, it is kept equal.
 *
 * As with the push variant, no __skb_push()/__skb_pull() pair is needed.
 */
static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
{
        const bool l4_follows_l3 =
                skb->transport_header == skb->network_header;
        int err = bpf_skb_generic_pop(skb, off, len);

        if (unlikely(err))
                return err;

        skb->mac_header += len;
        skb->network_header += len;
        if (l4_follows_l3)
                skb->transport_header = skb->network_header;

        return 0;
}

/* Make room for an IPv6 header where an IPv4 header was: grow the packet
 * right after the mac header by sizeof(ipv6hdr) - sizeof(iphdr), switch a
 * TCPv4 GSO type to TCPv6, set skb->protocol to ETH_P_IPV6 and clear the
 * hash. Returns 0 or a negative error from skb_cow()/the header push.
 */
static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
{
        const u32 extra = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
        u32 l2_len = skb_mac_header_len(skb);
        int err;

        err = skb_cow(skb, extra);
        if (unlikely(err < 0))
                return err;

        err = bpf_skb_net_hdr_push(skb, l2_len, extra);
        if (unlikely(err < 0))
                return err;

        if (skb_is_gso(skb)) {
                struct skb_shared_info *si = skb_shinfo(skb);

                /* A TCPv4 GSO packet becomes a TCPv6 GSO packet. */
                if (si->gso_type & SKB_GSO_TCPV4)
                        si->gso_type = (si->gso_type & ~SKB_GSO_TCPV4) |
                                       SKB_GSO_TCPV6;
        }

        skb->protocol = htons(ETH_P_IPV6);
        skb_clear_hash(skb);

        return 0;
}

/* Shrink the room of an IPv6 header down to that of an IPv4 header: remove
 * sizeof(ipv6hdr) - sizeof(iphdr) bytes right after the mac header, switch
 * a TCPv6 GSO type to TCPv4, set skb->protocol to ETH_P_IP and clear the
 * hash. Returns 0 or a negative error from skb_unclone()/the header pop.
 */
static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
{
        const u32 surplus = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
        u32 l2_len = skb_mac_header_len(skb);
        int err;

        err = skb_unclone(skb, GFP_ATOMIC);
        if (unlikely(err < 0))
                return err;

        err = bpf_skb_net_hdr_pop(skb, l2_len, surplus);
        if (unlikely(err < 0))
                return err;

        if (skb_is_gso(skb)) {
                struct skb_shared_info *si = skb_shinfo(skb);

                /* A TCPv6 GSO packet becomes a TCPv4 GSO packet. */
                if (si->gso_type & SKB_GSO_TCPV6)
                        si->gso_type = (si->gso_type & ~SKB_GSO_TCPV6) |
                                       SKB_GSO_TCPV4;
        }

        skb->protocol = htons(ETH_P_IP);
        skb_clear_hash(skb);

        return 0;
}

/* Dispatch a protocol change of @skb to @to_proto. Only IPv4 -> IPv6 and
 * IPv6 -> IPv4 are handled; every other combination yields -ENOTSUPP.
 */
static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
{
        const __be16 cur = skb->protocol;
        const bool v4_to_v6 = cur == htons(ETH_P_IP) &&
                              to_proto == htons(ETH_P_IPV6);
        const bool v6_to_v4 = cur == htons(ETH_P_IPV6) &&
                              to_proto == htons(ETH_P_IP);

        if (v4_to_v6)
                return bpf_skb_proto_4_to_6(skb);
        if (v6_to_v4)
                return bpf_skb_proto_6_to_4(skb);

        return -ENOTSUPP;
}

/* BPF helper: change the L3 protocol of @skb to @proto.
 *
 * @flags is reserved and must be zero (-EINVAL otherwise). Returns the
 * result of bpf_skb_proto_xlat(); the data pointers are recomputed whether
 * or not the translation succeeded.
 */
BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
           u64, flags)
{
        int err;

        if (unlikely(flags))
                return -EINVAL;

        /* This helper only performs the groundwork for a protocol change.
         * The eBPF program is expected to fill in the rest through
         * bpf_skb_store_bytes(), bpf_lX_csum_replace() and other helpers
         * instead of handing a raw buffer to this function.
         *
         * Keeping it minimal avoids dealing with raw packet data here: a
         * program would have to call the bpf_lX_csum_replace() helpers
         * anyway, and stores stay the sole concern of
         * bpf_skb_store_bytes().
         *
         * Additional options and extension header space are currently not
         * supported; the flags register is reserved so that this can be
         * added later. For offloads, we mark packet as dodgy, so that
         * headers need to be verified first.
         */
        err = bpf_skb_proto_xlat(skb, proto);
        bpf_compute_data_pointers(skb);

        return err;
}

/* Helper descriptor for bpf_skb_change_proto(): not GPL-restricted, returns
 * an integer, takes the program context and two ARG_ANYTHING arguments.
 */
static const struct bpf_func_proto bpf_skb_change_proto_proto = {
        .func                = bpf_skb_change_proto,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* BPF helper: set skb->pkt_type to @pkt_type.
 *
 * Only a restricted subset of types may be changed for now: both the
 * current and the requested type must pass skb_pkt_type_ok(), otherwise
 * -EINVAL is returned and the skb is left untouched. Returns 0 on success.
 */
BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
{
        if (likely(skb_pkt_type_ok(skb->pkt_type) &&
                   skb_pkt_type_ok(pkt_type))) {
                skb->pkt_type = pkt_type;
                return 0;
        }

        return -EINVAL;
}

/* Helper descriptor for bpf_skb_change_type(): not GPL-restricted, returns
 * an integer, takes the program context and one ARG_ANYTHING argument.
 */
static const struct bpf_func_proto bpf_skb_change_type_proto = {
        .func                = bpf_skb_change_type,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
{
        switch (skb->protocol) {
        case htons(ETH_P_IP):
                return sizeof(struct iphdr);
        case htons(ETH_P_IPV6):
                return sizeof(struct ipv6hdr);
        default:
                return ~0U;
        }
}

/* Both L3 encapsulation flags (IPv4 and IPv6). */
#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK        (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
                                         BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)

/* Both L3 decapsulation flags (IPv4 and IPv6). */
#define BPF_F_ADJ_ROOM_DECAP_L3_MASK        (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \
                                         BPF_F_ADJ_ROOM_DECAP_L3_IPV6)

/* Union of the adjust-room flags listed here; bpf_skb_adjust_room() rejects
 * any flag outside this mask and BPF_F_ADJ_ROOM_NO_CSUM_RESET.
 */
#define BPF_F_ADJ_ROOM_MASK                (BPF_F_ADJ_ROOM_FIXED_GSO | \
                                         BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
                                         BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
                                         BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
                                         BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
                                         BPF_F_ADJ_ROOM_ENCAP_L2( \
                                          BPF_ADJ_ROOM_ENCAP_L2_MASK) | \
                                         BPF_F_ADJ_ROOM_DECAP_L3_MASK)

/* Grow @skb by @len_diff bytes at offset @off from skb->data. When one of
 * the ENCAP_L3 flags is set, the headers present before the call are
 * recorded as the inner headers of a new encapsulation and the outer
 * network/transport offsets, skb->protocol and GSO type are updated to
 * match.
 *
 * Returns 0 on success. Errors:
 *   -ENOTSUPP  non-TCP GSO that is not (UDP_L4 with FIXED_GSO), or
 *              encapsulation requested on a non-IP packet
 *   -EINVAL    mutually exclusive flags set together, L2_ETH with an inner
 *              mac length below ETH_HLEN, or inner mac length > @len_diff
 *   -EALREADY  skb is already marked as encapsulated
 *   other      propagated from skb_cow_head(), the header push or
 *              skb_linearize()
 */
static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
                            u64 flags)
{
        u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
        bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
        u16 mac_len = 0, inner_net = 0, inner_trans = 0;
        unsigned int gso_type = SKB_GSO_DODGY;
        int ret;

        if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
                /* udp gso_size delineates datagrams, only allow if fixed */
                if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
                    !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
                        return -ENOTSUPP;
        }

        ret = skb_cow_head(skb, len_diff);
        if (unlikely(ret < 0))
                return ret;

        if (encap) {
                if (skb->protocol != htons(ETH_P_IP) &&
                    skb->protocol != htons(ETH_P_IPV6))
                        return -ENOTSUPP;

                if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
                    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
                        return -EINVAL;

                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
                    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
                        return -EINVAL;

                if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
                    inner_mac_len < ETH_HLEN)
                        return -EINVAL;

                if (skb->encapsulation)
                        return -EALREADY;

                /* Snapshot the header offsets before the push moves them;
                 * they describe the inner packet afterwards.
                 */
                mac_len = skb->network_header - skb->mac_header;
                inner_net = skb->network_header;
                if (inner_mac_len > len_diff)
                        return -EINVAL;
                inner_trans = skb->transport_header;
        }

        ret = bpf_skb_net_hdr_push(skb, off, len_diff);
        if (unlikely(ret < 0))
                return ret;

        if (encap) {
                skb->inner_mac_header = inner_net - inner_mac_len;
                skb->inner_network_header = inner_net;
                skb->inner_transport_header = inner_trans;

                if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
                        skb_set_inner_protocol(skb, htons(ETH_P_TEB));
                else
                        skb_set_inner_protocol(skb, skb->protocol);

                skb->encapsulation = 1;
                skb_set_network_header(skb, mac_len);

                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
                        gso_type |= SKB_GSO_UDP_TUNNEL;
                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
                        gso_type |= SKB_GSO_GRE;
                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
                        gso_type |= SKB_GSO_IPXIP6;
                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
                        gso_type |= SKB_GSO_IPXIP4;

                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
                    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
                        int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
                                        sizeof(struct ipv6hdr) :
                                        sizeof(struct iphdr);

                        skb_set_transport_header(skb, mac_len + nh_len);
                }

                /* Match skb->protocol to new outer l3 protocol */
                if (skb->protocol == htons(ETH_P_IP) &&
                    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
                        skb->protocol = htons(ETH_P_IPV6);
                else if (skb->protocol == htons(ETH_P_IPV6) &&
                         flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
                        skb->protocol = htons(ETH_P_IP);
        }

        if (skb_is_gso(skb)) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);

                /* Header must be checked, and gso_segs recomputed. */
                shinfo->gso_type |= gso_type;
                shinfo->gso_segs = 0;

                /* Due to header growth, MSS needs to be downgraded.
                 * Segmenting a frag_list skb whose gso_size no longer
                 * matches the frag_list layout can trip a BUG_ON() in the
                 * segmentation code, so linearize after downgrading.
                 */
                if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) {
                        skb_decrease_gso_size(shinfo, len_diff);
                        if (shinfo->frag_list)
                                return skb_linearize(skb);
                }
        }

        return 0;
}

/* Shrink @skb by @len_diff bytes at offset @off from skb->data.
 *
 * Only FIXED_GSO, the DECAP_L3 flags and NO_CSUM_RESET are accepted
 * (-EINVAL otherwise). Non-TCP GSO packets are refused with -ENOTSUPP
 * unless they are UDP_L4 and FIXED_GSO is set. With a DECAP_L3 flag,
 * skb->protocol is switched to the other IP family. Returns 0 on success
 * or a negative error from skb_unclone()/the header pop.
 */
static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
                              u64 flags)
{
        const u64 allowed = BPF_F_ADJ_ROOM_FIXED_GSO |
                            BPF_F_ADJ_ROOM_DECAP_L3_MASK |
                            BPF_F_ADJ_ROOM_NO_CSUM_RESET;
        const bool fixed_gso = flags & BPF_F_ADJ_ROOM_FIXED_GSO;
        int err;

        if (unlikely(flags & ~allowed))
                return -EINVAL;

        /* udp gso_size delineates datagrams, only allow if fixed */
        if (skb_is_gso(skb) && !skb_is_gso_tcp(skb) &&
            (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || !fixed_gso))
                return -ENOTSUPP;

        err = skb_unclone(skb, GFP_ATOMIC);
        if (unlikely(err < 0))
                return err;

        err = bpf_skb_net_hdr_pop(skb, off, len_diff);
        if (unlikely(err < 0))
                return err;

        /* Match skb->protocol to new outer l3 protocol */
        if (skb->protocol == htons(ETH_P_IP) &&
            flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
                skb->protocol = htons(ETH_P_IPV6);
        else if (skb->protocol == htons(ETH_P_IPV6) &&
                 flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
                skb->protocol = htons(ETH_P_IP);

        if (skb_is_gso(skb)) {
                struct skb_shared_info *si = skb_shinfo(skb);

                /* A shorter header leaves room for a larger MSS. */
                if (!fixed_gso)
                        skb_increase_gso_size(si, len_diff);

                /* Header must be checked, and gso_segs recomputed. */
                si->gso_type |= SKB_GSO_DODGY;
                si->gso_segs = 0;
        }

        return 0;
}

/* Upper bound on skb length used by the room/tail/head adjusting helpers
 * below; an alias for SKB_MAX_ALLOC.
 */
#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC

/* BPF helper: grow (@len_diff > 0) or shrink (@len_diff < 0) @skb at the
 * front of its data.
 *
 * @mode and @flags must both be zero (-EINVAL). |@len_diff| may not exceed
 * 0xfff (-EFAULT). Growing zero-fills the new bytes; shrinking returns
 * -ENOMEM if the bytes cannot be pulled. When the owning socket has a TLS
 * software RX context, the strparser message's full_len is adjusted by
 * @len_diff as well.
 */
BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
           u32, mode, u64, flags)
{
        const bool shrink = len_diff < 0;
        const u32 delta = abs(len_diff);
        int err = 0;

        if (unlikely(flags || mode))
                return -EINVAL;
        if (unlikely(delta > 0xfffU))
                return -EFAULT;

        if (shrink) {
                if (unlikely(!pskb_may_pull(skb, delta)))
                        return -ENOMEM;
                __skb_pull(skb, delta);
        } else {
                err = skb_cow(skb, len_diff);
                if (unlikely(err < 0))
                        return err;
                __skb_push(skb, delta);
                memset(skb->data, 0, delta);
        }

        if (tls_sw_has_ctx_rx(skb->sk))
                strp_msg(skb)->full_len += len_diff;

        return err;
}

/* Helper descriptor for sk_skb_adjust_room(): not GPL-restricted, returns
 * an integer, takes the program context and three ARG_ANYTHING arguments.
 */
static const struct bpf_func_proto sk_skb_adjust_room_proto = {
        .func                = sk_skb_adjust_room,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
};

/* BPF helper: grow (@len_diff > 0) or shrink (@len_diff < 0) @skb at the
 * position selected by @mode: right after the mac header for
 * BPF_ADJ_ROOM_MAC, or after the mac header plus the base network header
 * for BPF_ADJ_ROOM_NET.
 *
 * Errors:
 *   -EINVAL    unknown flags, a DECAP_L3 flag while growing, or both
 *              DECAP_L3 flags at once
 *   -EFAULT    |@len_diff| larger than 0xfff
 *   -ENOTSUPP  non-IP packet, unknown @mode, a shrink that would leave
 *              less than the minimum network header, or a non-GSO grow
 *              beyond BPF_SKB_MAX_LEN
 *   other      propagated from bpf_skb_net_shrink()/bpf_skb_net_grow()
 *
 * On success the checksum-unnecessary state is reset unless
 * BPF_F_ADJ_ROOM_NO_CSUM_RESET is set. The data pointers are recomputed
 * once the grow/shrink step has been attempted.
 */
BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
           u32, mode, u64, flags)
{
        const u64 known_flags = BPF_F_ADJ_ROOM_MASK |
                                BPF_F_ADJ_ROOM_NO_CSUM_RESET;
        const u64 decap = flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK;
        const u32 delta = abs(len_diff);
        const bool shrink = len_diff < 0;
        u32 min_l3_len = bpf_skb_net_base_len(skb);
        const __be16 l3_proto = skb->protocol;
        u32 l3_total, off;
        int err;

        if (unlikely(flags & ~known_flags))
                return -EINVAL;
        if (unlikely(delta > 0xfffU))
                return -EFAULT;
        if (unlikely(l3_proto != htons(ETH_P_IP) &&
                     l3_proto != htons(ETH_P_IPV6)))
                return -ENOTSUPP;

        off = skb_mac_header_len(skb);
        if (mode == BPF_ADJ_ROOM_NET)
                off += bpf_skb_net_base_len(skb);
        else if (mode != BPF_ADJ_ROOM_MAC)
                return -ENOTSUPP;

        if (decap) {
                /* Decapsulation only makes sense when removing bytes. */
                if (!shrink)
                        return -EINVAL;

                /* The minimum that must remain is the header of the
                 * family being decapsulated to.
                 */
                if (decap == BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
                        min_l3_len = sizeof(struct iphdr);
                else if (decap == BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
                        min_l3_len = sizeof(struct ipv6hdr);
                else
                        return -EINVAL;
        }

        l3_total = skb->len - skb_network_offset(skb);
        if (shrink) {
                if (delta >= l3_total || l3_total - delta < min_l3_len)
                        return -ENOTSUPP;
        } else if (skb->len + delta > BPF_SKB_MAX_LEN && !skb_is_gso(skb)) {
                return -ENOTSUPP;
        }

        if (shrink)
                err = bpf_skb_net_shrink(skb, off, delta, flags);
        else
                err = bpf_skb_net_grow(skb, off, delta, flags);

        if (!err && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET))
                __skb_reset_checksum_unnecessary(skb);

        bpf_compute_data_pointers(skb);
        return err;
}

/* Helper descriptor for bpf_skb_adjust_room(): not GPL-restricted, returns
 * an integer, takes the program context and three ARG_ANYTHING arguments.
 */
static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
        .func                = bpf_skb_adjust_room,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
};

/* Compute the smallest length @skb may be trimmed to.
 *
 * The result is the last applicable one of: the network header offset,
 * the transport header offset (if set), and the end of the checksum field
 * for CHECKSUM_PARTIAL packets.
 *
 * Each of these offsets is relative to skb->data and can be negative when
 * skb->data already points past the header in question. A negative value
 * must not be stored into the unsigned result, or it would turn into a
 * huge minimum and make every length look too short. Negative offsets are
 * therefore ignored, leaving the previous candidate (initially 0).
 */
static u32 __bpf_skb_min_len(const struct sk_buff *skb)
{
        int offset = skb_network_offset(skb);
        u32 min_len = 0;

        if (offset > 0)
                min_len = offset;
        if (skb_transport_header_was_set(skb)) {
                offset = skb_transport_offset(skb);
                if (offset > 0)
                        min_len = offset;
        }
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                offset = skb_checksum_start_offset(skb) +
                         skb->csum_offset + sizeof(__sum16);
                if (offset > 0)
                        min_len = offset;
        }
        return min_len;
}

/* Grow @skb to @new_len via __skb_grow_rcsum() and zero the bytes that were
 * added beyond the previous length. Returns 0 or the error from
 * __skb_grow_rcsum(), in which case nothing is cleared.
 */
static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
{
        const unsigned int prev_len = skb->len;
        int err = __skb_grow_rcsum(skb, new_len);

        if (err)
                return err;

        memset(skb->data + prev_len, 0, new_len - prev_len);
        return 0;
}

/* Trim @skb to @new_len; a direct wrapper around __skb_trim_rcsum() whose
 * return value is passed through unchanged.
 */
static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
{
        return __skb_trim_rcsum(skb, new_len);
}

/* Resize @skb to exactly @new_len bytes by growing or trimming its tail.
 *
 * Returns -EINVAL if @flags is non-zero or @new_len lies outside
 * [__bpf_skb_min_len(), BPF_SKB_MAX_LEN], -ENOTSUPP for encapsulated skbs,
 * otherwise 0 or the error from making the skb writable / resizing it.
 * On success a GSO skb has its GSO state reset.
 */
static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
                                        u64 flags)
{
        const u32 upper = BPF_SKB_MAX_LEN;
        const u32 lower = __bpf_skb_min_len(skb);
        int err;

        if (unlikely(flags || new_len > upper || new_len < lower))
                return -EINVAL;
        if (skb->encapsulation)
                return -ENOTSUPP;

        /* This helper only does the work needed to grow or trim an skb;
         * the eBPF program rewrites the contents afterwards with helpers
         * such as bpf_skb_store_bytes() and bpf_lX_csum_replace() rather
         * than passing a raw buffer in. As with bpf_skb_change_proto(),
         * it is kept minimal and free of protocol specifics so that
         * writing buffers remains the job of bpf_skb_store_bytes().
         *
         * It is meant as a slow path operation for control message
         * replies, hence the skb is implicitly linearized and uncloned,
         * and offloads are dropped.
         */
        err = __bpf_try_make_writable(skb, skb->len);
        if (err)
                return err;

        if (new_len > skb->len)
                err = bpf_skb_grow_rcsum(skb, new_len);
        else if (new_len < skb->len)
                err = bpf_skb_trim_rcsum(skb, new_len);
        if (err)
                return err;

        if (skb_is_gso(skb))
                skb_gso_reset(skb);

        return 0;
}

/* BPF helper: resize @skb to @new_len through __bpf_skb_change_tail(), then
 * recompute the data pointers. The pointers are recomputed even when the
 * resize failed; the resize result is what gets returned.
 */
BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
           u64, flags)
{
        int ret = __bpf_skb_change_tail(skb, new_len, flags);

        bpf_compute_data_pointers(skb);
        return ret;
}

/* Helper descriptor for bpf_skb_change_tail(): not GPL-restricted, returns
 * an integer, takes the program context and two ARG_ANYTHING arguments.
 */
static const struct bpf_func_proto bpf_skb_change_tail_proto = {
        .func                = bpf_skb_change_tail,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* BPF helper: resize @skb to @new_len through __bpf_skb_change_tail().
 * Unlike bpf_skb_change_tail(), no bpf_compute_data_pointers() call is
 * made here.
 */
BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
           u64, flags)
{
        return __bpf_skb_change_tail(skb, new_len, flags);
}

/* Helper descriptor for sk_skb_change_tail(): not GPL-restricted, returns
 * an integer, takes the program context and two ARG_ANYTHING arguments.
 */
static const struct bpf_func_proto sk_skb_change_tail_proto = {
        .func                = sk_skb_change_tail,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* Shared implementation of the change_head helpers: push @head_room
 * zeroed bytes in front of the current skb data and make the new start
 * the mac header.
 *
 * Returns -EINVAL when @flags is non-zero, when skb->len + @head_room
 * wraps around u32, or when a non-GSO skb would grow beyond
 * BPF_SKB_MAX_LEN. Otherwise returns the result of skb_cow(), which is
 * 0 on success.
 */
static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
                                        u64 flags)
{
        const u32 len_limit = BPF_SKB_MAX_LEN;
        const u32 grown_len = skb->len + head_room;
        int err;

        /* No flags are defined for this helper. */
        if (unlikely(flags))
                return -EINVAL;

        /* The length cap is only enforced for non-GSO skbs. */
        if (unlikely(!skb_is_gso(skb) && grown_len > len_limit))
                return -EINVAL;

        /* u32 wrap-around of skb->len + head_room. */
        if (unlikely(grown_len < skb->len))
                return -EINVAL;

        err = skb_cow(skb, head_room);
        if (unlikely(err))
                return err;

        /* Idea for this helper is that we currently only allow to
         * expand on mac header. This means that skb->protocol, network
         * header, etc, stay as is. Compared to bpf_skb_change_tail(),
         * we're more flexible due to not needing to linearize or reset
         * GSO. Intention for this helper is to be used by an L3 skb
         * that needs to push mac header for redirection into L2 device.
         */
        __skb_push(skb, head_room);
        memset(skb->data, 0, head_room);
        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        return 0;
}

/* BPF helper entry point: forwards to __bpf_skb_change_head() and then
 * calls bpf_compute_data_pointers() on the skb, whether or not the
 * operation succeeded. Returns the result of __bpf_skb_change_head().
 */
BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
           u64, flags)
{
        int ret = __bpf_skb_change_head(skb, head_room, flags);

        /* Done unconditionally, i.e. also when ret is an error. */
        bpf_compute_data_pointers(skb);
        return ret;
}

/* Prototype for bpf_skb_change_head(): not GPL-only, integer return,
 * a context pointer followed by two ARG_ANYTHING arguments
 * (head_room and flags).
 */
static const struct bpf_func_proto bpf_skb_change_head_proto = {
        .func                = bpf_skb_change_head,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* Variant of the change_head helper that only forwards to
 * __bpf_skb_change_head(); unlike bpf_skb_change_head() it does not
 * call bpf_compute_data_pointers() afterwards.
 */
BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
           u64, flags)
{
        return __bpf_skb_change_head(skb, head_room, flags);
}

/* Prototype for sk_skb_change_head(): not GPL-only, integer return,
 * a context pointer followed by two ARG_ANYTHING arguments
 * (head_room and flags).
 */
static const struct bpf_func_proto sk_skb_change_head_proto = {
        .func                = sk_skb_change_head,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* BPF helper: returns xdp_get_buff_len() for the given xdp_buff. */
BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp)
{
        return xdp_get_buff_len(xdp);
}

/* Prototype for bpf_xdp_get_buff_len() taking the program context as
 * its only argument: not GPL-only, integer return.
 */
static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = {
        .func                = bpf_xdp_get_buff_len,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* Single-entry BTF id list for struct xdp_buff, referenced by the
 * prototype below.
 */
BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff)

/* Second prototype for the same bpf_xdp_get_buff_len() function, taking
 * a BTF-typed struct xdp_buff pointer instead of the program context.
 * Non-static, so it is visible outside this file. Note that .ret_type
 * is not set here, unlike in bpf_xdp_get_buff_len_proto.
 */
const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = {
        .func                = bpf_xdp_get_buff_len,
        .gpl_only        = false,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_xdp_get_buff_len_bpf_ids[0],
};

/* Number of bytes between xdp->data_meta and xdp->data, or 0 when
 * xdp_data_meta_unsupported() reports that the buffer has no metadata
 * support.
 */
static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
{
        if (xdp_data_meta_unsupported(xdp))
                return 0;

        return xdp->data - xdp->data_meta;
}

/* BPF helper: move xdp->data by @offset bytes (negative grows the
 * packet at the front, positive shrinks it). Any metadata in front of
 * the packet is moved along so that it stays adjacent to the new data
 * start.
 *
 * Returns 0 on success, or -EINVAL when the new data start would
 * intrude into the area reserved for struct xdp_frame plus metadata, or
 * would leave fewer than ETH_HLEN bytes before data_end.
 */
BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{
        unsigned long meta_bytes = xdp_get_metalen(xdp);
        void *new_data = xdp->data + offset;
        void *lowest_data;

        /* Headroom must still hold struct xdp_frame and the metadata. */
        lowest_data = xdp->data_hard_start + sizeof(struct xdp_frame) +
                      meta_bytes;

        if (unlikely(new_data < lowest_data))
                return -EINVAL;
        if (unlikely(new_data > xdp->data_end - ETH_HLEN))
                return -EINVAL;

        /* Source and destination may overlap, hence memmove(). */
        if (meta_bytes)
                memmove(xdp->data_meta + offset, xdp->data_meta, meta_bytes);

        xdp->data_meta += offset;
        xdp->data = new_data;

        return 0;
}

/* Prototype for bpf_xdp_adjust_head(): not GPL-only, integer return,
 * a context pointer and one ARG_ANYTHING argument (offset).
 */
static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
        .func                = bpf_xdp_adjust_head,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Copy @len bytes between @buf and the packet data of @xdp, starting
 * @off bytes into the packet. With @flush set the bytes go from @buf
 * into the packet; otherwise they go from the packet into @buf.
 *
 * The range may span the linear area and any number of fragments. This
 * function does not validate @off/@len against the total buffer length;
 * the slow-path loop simply stops once @len is exhausted or there are
 * no more fragments.
 */
void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
                      void *buf, unsigned long len, bool flush)
{
        unsigned long ptr_len, ptr_off = 0;
        skb_frag_t *next_frag, *end_frag;
        struct skb_shared_info *sinfo;
        void *src, *dst;
        u8 *ptr_buf;

        /* Fast path: the whole range lies within the linear area. */
        if (likely(xdp->data_end - xdp->data >= off + len)) {
                src = flush ? buf : xdp->data + off;
                dst = flush ? xdp->data + off : buf;
                memcpy(dst, src, len);
                return;
        }

        sinfo = xdp_get_shared_info_from_buff(xdp);
        end_frag = &sinfo->frags[sinfo->nr_frags];
        next_frag = &sinfo->frags[0];

        /* Start with the linear area as the current chunk; ptr_off is the
         * packet offset at which the current chunk begins.
         */
        ptr_len = xdp->data_end - xdp->data;
        ptr_buf = xdp->data;

        while (true) {
                /* Does the (remaining) range start inside this chunk? */
                if (off < ptr_off + ptr_len) {
                        unsigned long copy_off = off - ptr_off;
                        unsigned long copy_len = min(len, ptr_len - copy_off);

                        src = flush ? buf : ptr_buf + copy_off;
                        dst = flush ? ptr_buf + copy_off : buf;
                        memcpy(dst, src, copy_len);

                        /* Advance past the bytes just copied. */
                        off += copy_len;
                        len -= copy_len;
                        buf += copy_len;
                }

                if (!len || next_frag == end_frag)
                        break;

                /* Move on to the next fragment. */
                ptr_off += ptr_len;
                ptr_buf = skb_frag_address(next_frag);
                ptr_len = skb_frag_size(next_frag);
                next_frag++;
        }
}

/* Return a direct pointer to @len bytes at packet offset @offset when
 * that range lies entirely within one contiguous chunk (the linear area
 * or a single fragment).
 *
 * Returns ERR_PTR(-EFAULT) if @offset or @len exceeds 0xffff,
 * ERR_PTR(-EINVAL) if the range extends past xdp_get_buff_len(), and
 * NULL if the range is valid but straddles a chunk boundary.
 */
void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
{
        u32 chunk_len = xdp->data_end - xdp->data;
        void *chunk = xdp->data;

        if (unlikely(offset > 0xffff || len > 0xffff))
                return ERR_PTR(-EFAULT);

        if (unlikely(offset + len > xdp_get_buff_len(xdp)))
                return ERR_PTR(-EINVAL);

        /* Offset lies beyond the linear area: look in the paged area. */
        if (unlikely(offset >= chunk_len)) {
                struct skb_shared_info *sinfo;
                int idx;

                sinfo = xdp_get_shared_info_from_buff(xdp);
                offset -= chunk_len;
                for (idx = 0; idx < sinfo->nr_frags; idx++) {
                        skb_frag_t *frag = &sinfo->frags[idx];
                        u32 frag_len = skb_frag_size(frag);

                        if (offset < frag_len) {
                                chunk = skb_frag_address(frag);
                                chunk_len = frag_len;
                                break;
                        }
                        offset -= frag_len;
                }
        }

        /* Range crosses the end of the selected chunk. */
        if (offset + len > chunk_len)
                return NULL;

        return chunk + offset;
}

/* BPF helper: copy @len bytes starting at packet offset @offset into
 * @buf. Returns 0 on success or the negative errno reported by
 * bpf_xdp_pointer().
 */
BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset,
           void *, buf, u32, len)
{
        void *src = bpf_xdp_pointer(xdp, offset, len);

        if (IS_ERR(src))
                return PTR_ERR(src);

        if (src)
                memcpy(buf, src, len);
        else
                /* Range spans several chunks: copy piecewise. */
                bpf_xdp_copy_buf(xdp, offset, buf, len, false);

        return 0;
}

/* Prototype for bpf_xdp_load_bytes(): not GPL-only, integer return.
 * Arguments are the context pointer, an ARG_ANYTHING offset, and a
 * destination buffer (ARG_PTR_TO_UNINIT_MEM, the helper writes into it)
 * with its constant size.
 */
static const struct bpf_func_proto bpf_xdp_load_bytes_proto = {
        .func                = bpf_xdp_load_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/* Non-static wrapper that calls the ____bpf_xdp_load_bytes() body with
 * plain C arguments and returns its result.
 */
int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
{
        return ____bpf_xdp_load_bytes(xdp, offset, buf, len);
}

/* BPF helper: copy @len bytes from @buf into the packet starting at
 * packet offset @offset. Returns 0 on success or the negative errno
 * reported by bpf_xdp_pointer().
 */
BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset,
           void *, buf, u32, len)
{
        void *dst = bpf_xdp_pointer(xdp, offset, len);

        if (IS_ERR(dst))
                return PTR_ERR(dst);

        if (dst)
                memcpy(dst, buf, len);
        else
                /* Range spans several chunks: copy piecewise. */
                bpf_xdp_copy_buf(xdp, offset, buf, len, true);

        return 0;
}

static const struct bpf_func_proto bpf_xdp_store_bytes_proto = {
        .func                = bpf_xdp_store_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/* Non-static wrapper that calls the ____bpf_xdp_store_bytes() body with
 * plain C arguments and returns its result.
 */
int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
{
        return ____bpf_xdp_store_bytes(xdp, offset, buf, len);
}

/* Grow the last fragment of a multi-buffer xdp_buff by @offset bytes,
 * zeroing the newly exposed bytes.
 *
 * Returns -EOPNOTSUPP when the rx queue does not advertise a fragment
 * size or advertises one larger than xdp->frame_sz, -EINVAL when
 * @offset exceeds the room left in the last fragment, and 0 on success.
 */
static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
{
        struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
        skb_frag_t *last = &sinfo->frags[sinfo->nr_frags - 1];
        struct xdp_rxq_info *rxq = xdp->rxq;
        unsigned int room;

        if (!rxq->frag_size)
                return -EOPNOTSUPP;
        if (rxq->frag_size > xdp->frame_sz)
                return -EOPNOTSUPP;

        room = rxq->frag_size - skb_frag_size(last) - skb_frag_off(last);
        if (unlikely(offset > room))
                return -EINVAL;

        /* Clear the area being exposed before accounting for it. */
        memset(skb_frag_address(last) + skb_frag_size(last), 0, offset);
        skb_frag_size_add(last, offset);
        sinfo->xdp_frags_size += offset;

        /* XSK pool buffers track the end of the tail buffer separately. */
        if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
                xsk_buff_get_tail(xdp)->data_end += offset;

        return 0;
}

/* XSK buffer pool flavour of shrinking the tail fragment: either trim
 * @shrink bytes off the tail buffer's data_end, or, when @release is
 * set, unlink the tail buffer and hand it back via __xdp_return().
 */
static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
                                   struct xdp_mem_info *mem_info, bool release)
{
        struct xdp_buff *tail = xsk_buff_get_tail(xdp);

        if (!release) {
                tail->data_end -= shrink;
                return;
        }

        xsk_buff_del_tail(tail);
        __xdp_return(NULL, mem_info, false, tail);
}

/* Shrink @frag by @shrink bytes as far as the backing memory is
 * concerned. When @shrink equals the fragment size the fragment's
 * memory is released. The fragment's own size field is not touched
 * here.
 *
 * Returns true when the whole fragment was released, false otherwise.
 */
static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
                                int shrink)
{
        struct xdp_mem_info *mem_info = &xdp->rxq->mem;
        bool whole_frag = skb_frag_size(frag) == shrink;

        if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL)
                bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, whole_frag);
        else if (whole_frag)
                __xdp_return(page_address(skb_frag_page(frag)), mem_info,
                             false, NULL);

        return whole_frag;
}

/* Remove @offset bytes from the end of a multi-buffer xdp_buff, walking
 * the fragments from last to first. Fully consumed fragments are
 * released; a partially consumed one is trimmed and ends the walk. If
 * no fragments remain, the frags flag is cleared and whatever is left
 * of @offset is taken from the linear area.
 *
 * Returns -EINVAL when the shrink would leave fewer than ETH_HLEN
 * bytes in the buffer, 0 otherwise.
 */
static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
{
        struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
        int released_frags = 0;
        int released_bytes = 0;
        int idx;

        if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN))
                return -EINVAL;

        for (idx = sinfo->nr_frags - 1; idx >= 0 && offset > 0; idx--) {
                skb_frag_t *frag = &sinfo->frags[idx];
                int shrink = min_t(int, offset, skb_frag_size(frag));

                released_bytes += shrink;
                offset -= shrink;

                if (!bpf_xdp_shrink_data(xdp, frag, shrink)) {
                        /* Fragment survives, only shorter. */
                        skb_frag_size_sub(frag, shrink);
                        break;
                }
                released_frags++;
        }

        sinfo->nr_frags -= released_frags;
        sinfo->xdp_frags_size -= released_bytes;

        if (unlikely(!sinfo->nr_frags)) {
                xdp_buff_clear_frags_flag(xdp);
                xdp->data_end -= offset;
        }

        return 0;
}

/* BPF helper: move the end of the packet by @offset bytes (positive
 * grows, negative shrinks). Buffers with fragments are delegated to the
 * frags-specific grow/shrink routines.
 *
 * For linear buffers, returns -EINVAL when the new end would pass
 * xdp_data_hard_end() or would leave fewer than ETH_HLEN bytes of data,
 * and 0 on success. Bytes exposed by growing are zeroed.
 */
BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
{
        void *hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
        void *new_end = xdp->data_end + offset;

        /* Non-linear xdp buff */
        if (unlikely(xdp_buff_has_frags(xdp)))
                return offset < 0 ? bpf_xdp_frags_shrink_tail(xdp, -offset) :
                                    bpf_xdp_frags_increase_tail(xdp, offset);

        /* Notice that xdp_data_hard_end has reserved some tailroom */
        if (unlikely(new_end > hard_end))
                return -EINVAL;

        if (unlikely(new_end < xdp->data + ETH_HLEN))
                return -EINVAL;

        /* Clear memory area on grow, can contain uninit kernel memory */
        if (offset > 0)
                memset(xdp->data_end, 0, offset);

        xdp->data_end = new_end;

        return 0;
}

/* Prototype for bpf_xdp_adjust_tail(): not GPL-only, integer return,
 * a context pointer and one ARG_ANYTHING argument (offset).
 */
static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
        .func                = bpf_xdp_adjust_tail,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* BPF helper: move xdp->data_meta by @offset bytes.
 *
 * Returns -ENOTSUPP when the buffer does not support metadata, -EINVAL
 * when the new metadata start would fall inside the area reserved for
 * struct xdp_frame or beyond xdp->data, -EACCES when the resulting
 * metadata length is rejected by xdp_metalen_invalid(), and 0 on
 * success.
 */
BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
{
        void *frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
        void *new_meta = xdp->data_meta + offset;
        unsigned long new_metalen = xdp->data - new_meta;

        if (xdp_data_meta_unsupported(xdp))
                return -ENOTSUPP;

        if (unlikely(new_meta < frame_end))
                return -EINVAL;
        if (unlikely(new_meta > xdp->data))
                return -EINVAL;

        if (unlikely(xdp_metalen_invalid(new_metalen)))
                return -EACCES;

        xdp->data_meta = new_meta;

        return 0;
}

/* Prototype for bpf_xdp_adjust_meta(): not GPL-only, integer return,
 * a context pointer and one ARG_ANYTHING argument (offset).
 */
static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
        .func                = bpf_xdp_adjust_meta,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/**
 * DOC: xdp redirect
 *
 * XDP_REDIRECT works by a three-step process, implemented in the functions
 * below:
 *
 * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
 *    of the redirect and store it (along with some other metadata) in a per-CPU
 *    struct bpf_redirect_info.
 *
 * 2. When the program returns the XDP_REDIRECT return code, the driver will
 *    call xdp_do_redirect() which will use the information in struct
 *    bpf_redirect_info to actually enqueue the frame into a map type-specific
 *    bulk queue structure.
 *
 * 3. Before exiting its NAPI poll loop, the driver will call
 *    xdp_do_flush(), which will flush all the different bulk queues,
 *    thus completing the redirect. Note that xdp_do_flush() must be
 *    called before napi_complete_done() in the driver, as the
 *    XDP_REDIRECT logic relies on being inside a single NAPI instance
 *    through to the xdp_do_flush() call for RCU protection of all
 *    in-kernel data structures.
 */
/*
 * Pointers to the map entries will be kept around for this whole sequence of
 * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
 * the core code; instead, the RCU protection relies on everything happening
 * inside a single NAPI poll sequence, which means it's between a pair of calls
 * to local_bh_disable()/local_bh_enable().
 *
 * The map entries are marked as __rcu and the map code makes sure to
 * dereference those pointers with rcu_dereference_check() in a way that works
 * for both sections that hold an rcu_read_lock() and sections that are
 * called from NAPI without a separate rcu_read_lock(). The code below does not
 * use RCU annotations, but relies on those in the map code.
 */
/* Step 3 of the redirect sequence described above: flush the devmap,
 * cpumap and xskmap bulk queues, in that order.
 */
void xdp_do_flush(void)
{
        __dev_flush();
        __cpu_map_flush();
        __xsk_map_flush();
}
EXPORT_SYMBOL_GPL(xdp_do_flush);

#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
/* Debug aid: query all three per-map-type flush checks and warn once,
 * naming the NAPI poll function, if any of them reports pending work.
 * All three checks are always invoked.
 */
void xdp_do_check_flushed(struct napi_struct *napi)
{
        bool unflushed = dev_check_flush();

        /* Non-short-circuit OR on purpose: every check must run. */
        unflushed |= cpu_map_check_flush();
        unflushed |= xsk_map_check_flush();

        WARN_ONCE(unflushed,
                  "Missing xdp_do_flush() invocation after NAPI by %ps\n",
                  napi->poll);
}
#endif

/* Static key, initially false, exported for GPL modules.
 * NOTE(review): presumably gates the xdp_master_redirect() path below;
 * the users of this key are not visible here - confirm.
 */
DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);

/* Ask the master device of the receiving netdev which slave should
 * transmit this frame. If that slave is a different device, record an
 * ifindex-based redirect to it and return XDP_REDIRECT; otherwise
 * return XDP_TX.
 */
u32 xdp_master_redirect(struct xdp_buff *xdp)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        struct net_device *master, *slave;

        master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
        slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);

        /* No slave chosen, or it is the device we received on. */
        if (!slave || slave == xdp->rxq->dev)
                return XDP_TX;

        /* The target device is different from the receiving device, so
         * redirect it to the new device.
         * Using XDP_REDIRECT gets the correct behaviour from XDP enabled
         * drivers to unmap the packet from their rx ring.
         */
        ri->tgt_index = slave->ifindex;
        ri->map_id = INT_MAX;
        ri->map_type = BPF_MAP_TYPE_UNSPEC;

        return XDP_REDIRECT;
}
EXPORT_SYMBOL_GPL(xdp_master_redirect);

static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri,
                                        struct net_device *dev,
                                        struct xdp_buff *xdp,
                                        struct bpf_prog *xdp_prog)
{
        enum bpf_map_type map_type = ri->map_type;
        void *fwd = ri->tgt_value;
        u32 map_id = ri->map_id;
        int err;

        ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
        ri->map_type = BPF_MAP_TYPE_UNSPEC;

        err = __xsk_map_redirect(fwd, xdp);
        if (unlikely(err))
                goto err;

        _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
        return 0;
err:
        _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
        return err;
}

/* Enqueue @xdpf according to the redirect state recorded in @ri.
 *
 * The map type, target value, map id and flags are snapshotted and the
 * corresponding @ri fields are reset before anything else happens, so
 * the recorded state is consumed on the error paths as well. Emits the
 * success or error redirect tracepoint.
 *
 * Returns 0 on success, -EOVERFLOW when @xdpf is NULL, -ENOENT when a
 * broadcast was requested but ri->map is NULL, -EINVAL when the target
 * ifindex cannot be resolved, -EBADRQC for map types not handled here,
 * or the error from the map-specific enqueue function.
 */
static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
                                                   struct net_device *dev,
                                                   struct xdp_frame *xdpf,
                                                   struct bpf_prog *xdp_prog)
{
        enum bpf_map_type map_type = ri->map_type;
        void *fwd = ri->tgt_value;
        u32 map_id = ri->map_id;
        u32 flags = ri->flags;
        struct bpf_map *map;
        int err;

        ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
        ri->flags = 0;
        ri->map_type = BPF_MAP_TYPE_UNSPEC;

        /* Callers may pass a NULL frame; report it as -EOVERFLOW. */
        if (unlikely(!xdpf)) {
                err = -EOVERFLOW;
                goto err;
        }

        switch (map_type) {
        case BPF_MAP_TYPE_DEVMAP:
                fallthrough;
        case BPF_MAP_TYPE_DEVMAP_HASH:
                if (unlikely(flags & BPF_F_BROADCAST)) {
                        map = READ_ONCE(ri->map);

                        /* The map pointer is cleared when the map is being torn
                         * down by dev_map_free()
                         */
                        if (unlikely(!map)) {
                                err = -ENOENT;
                                break;
                        }

                        WRITE_ONCE(ri->map, NULL);
                        err = dev_map_enqueue_multi(xdpf, dev, map,
                                                    flags & BPF_F_EXCLUDE_INGRESS);
                } else {
                        err = dev_map_enqueue(fwd, xdpf, dev);
                }
                break;
        case BPF_MAP_TYPE_CPUMAP:
                err = cpu_map_enqueue(fwd, xdpf, dev);
                break;
        case BPF_MAP_TYPE_UNSPEC:
                /* UNSPEC together with map_id == INT_MAX marks an
                 * ifindex-based redirect; resolve the device now.
                 */
                if (map_id == INT_MAX) {
                        fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
                        if (unlikely(!fwd)) {
                                err = -EINVAL;
                                break;
                        }
                        err = dev_xdp_enqueue(fwd, xdpf, dev);
                        break;
                }
                /* UNSPEC with any other map_id is treated as unsupported. */
                fallthrough;
        default:
                err = -EBADRQC;
        }

        if (unlikely(err))
                goto err;

        _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
        return 0;
err:
        _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
        return err;
}

/* Step 2 of the redirect sequence: carry out the redirect recorded in
 * the current bpf_redirect_info for @xdp. XSK map targets are handled
 * on the xdp_buff directly; everything else goes through a conversion
 * to an xdp_frame first.
 */
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
                    struct bpf_prog *xdp_prog)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        struct xdp_frame *xdpf;

        if (ri->map_type == BPF_MAP_TYPE_XSKMAP)
                return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);

        /* A NULL result is handled by __xdp_do_redirect_frame(). */
        xdpf = xdp_convert_buff_to_frame(xdp);

        return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog);
}
EXPORT_SYMBOL_GPL(xdp_do_redirect);

/* Same as xdp_do_redirect(), but for callers that already hold an
 * xdp_frame: @xdpf is used as-is for non-XSK targets, while XSK map
 * targets still operate on @xdp.
 */
int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp,
                          struct xdp_frame *xdpf, struct bpf_prog *xdp_prog)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        if (ri->map_type != BPF_MAP_TYPE_XSKMAP)
                return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog);

        return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
}
EXPORT_SYMBOL_GPL(xdp_do_redirect_frame);

/* Map-based redirect for the generic (skb) XDP path. The caller has
 * already snapshotted @fwd, @map_type, @map_id and @flags from the
 * redirect info; only ri->map and ri->tgt_index are read from it here.
 *
 * Emits the success or error redirect tracepoint. Returns 0 on success,
 * -ENOENT when a broadcast was requested but ri->map is NULL, -EBADRQC
 * for map types not handled here, or the error from the map-specific
 * redirect function.
 */
static int xdp_do_generic_redirect_map(struct net_device *dev,
                                       struct sk_buff *skb,
                                       struct xdp_buff *xdp,
                                       struct bpf_prog *xdp_prog, void *fwd,
                                       enum bpf_map_type map_type, u32 map_id,
                                       u32 flags)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        struct bpf_map *map;
        int err;

        switch (map_type) {
        case BPF_MAP_TYPE_DEVMAP:
                fallthrough;
        case BPF_MAP_TYPE_DEVMAP_HASH:
                if (unlikely(flags & BPF_F_BROADCAST)) {
                        map = READ_ONCE(ri->map);

                        /* The map pointer is cleared when the map is being torn
                         * down by dev_map_free()
                         */
                        if (unlikely(!map)) {
                                err = -ENOENT;
                                break;
                        }

                        WRITE_ONCE(ri->map, NULL);
                        err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
                                                     flags & BPF_F_EXCLUDE_INGRESS);
                } else {
                        err = dev_map_generic_redirect(fwd, skb, xdp_prog);
                }
                if (unlikely(err))
                        goto err;
                break;
        case BPF_MAP_TYPE_XSKMAP:
                err = xsk_generic_rcv(fwd, xdp);
                if (err)
                        goto err;
                /* The skb is consumed here only after a successful
                 * xsk_generic_rcv().
                 */
                consume_skb(skb);
                break;
        case BPF_MAP_TYPE_CPUMAP:
                err = cpu_map_generic_redirect(fwd, skb);
                if (unlikely(err))
                        goto err;
                break;
        default:
                err = -EBADRQC;
                goto err;
        }

        _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
        return 0;
err:
        _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
        return err;
}

/* Redirect entry point for the generic (skb) XDP path.
 *
 * Snapshots and resets the recorded redirect state, then either
 * performs an ifindex-based redirect by retargeting the skb and
 * transmitting it via generic_xdp_tx(), or hands over to
 * xdp_do_generic_redirect_map() for map-based targets.
 *
 * Returns 0 on success, -EINVAL when the target ifindex cannot be
 * resolved, the error from xdp_ok_fwd_dev(), or the result of
 * xdp_do_generic_redirect_map().
 */
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
                            struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        enum bpf_map_type map_type = ri->map_type;
        void *fwd = ri->tgt_value;
        u32 map_id = ri->map_id;
        u32 flags = ri->flags;
        int err;

        ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
        ri->flags = 0;
        ri->map_type = BPF_MAP_TYPE_UNSPEC;

        /* UNSPEC together with map_id == INT_MAX marks an ifindex-based
         * redirect.
         */
        if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
                fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
                if (unlikely(!fwd)) {
                        err = -EINVAL;
                        goto err;
                }

                err = xdp_ok_fwd_dev(fwd, skb->len);
                if (unlikely(err))
                        goto err;

                skb->dev = fwd;
                _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
                generic_xdp_tx(skb, xdp_prog);
                return 0;
        }

        /* Map-based targets emit their own tracepoints. */
        return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags);
err:
        _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
        return err;
}

/* BPF helper: record an ifindex-based redirect target in the current
 * bpf_redirect_info. Returns XDP_ABORTED when @flags is non-zero,
 * XDP_REDIRECT otherwise. Nothing is enqueued here; the recorded state
 * is consumed later by the xdp_do_*redirect() functions.
 */
BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        /* No flags are accepted by this helper. */
        if (unlikely(flags))
                return XDP_ABORTED;

        /* NB! Map type UNSPEC and map_id == INT_MAX (never generated
         * by map_idr) is used for ifindex based XDP redirect.
         */
        ri->tgt_index = ifindex;
        ri->map_id = INT_MAX;
        ri->map_type = BPF_MAP_TYPE_UNSPEC;

        return XDP_REDIRECT;
}

/* Prototype for bpf_xdp_redirect(): not GPL-only, integer return, two
 * ARG_ANYTHING arguments (ifindex and flags); no context argument.
 */
static const struct bpf_func_proto bpf_xdp_redirect_proto = {
        .func           = bpf_xdp_redirect,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_ANYTHING,
        .arg2_type      = ARG_ANYTHING,
};

/* BPF helper: dispatch to the map's own map_redirect() operation and
 * return its result.
 */
BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key,
           u64, flags)
{
        return map->ops->map_redirect(map, key, flags);
}

/* Prototype for bpf_xdp_redirect_map(): not GPL-only, integer return,
 * a constant map pointer followed by two ARG_ANYTHING arguments
 * (key and flags).
 */
static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
        .func           = bpf_xdp_redirect_map,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_CONST_MAP_PTR,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_ANYTHING,
};

/* Copy callback passed to bpf_event_output(): place @len bytes of skb
 * data starting at @off into @dst_buff.
 *
 * Returns 0 on success, or @len when skb_header_pointer() fails.
 */
static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
                                  unsigned long off, unsigned long len)
{
        void *src = skb_header_pointer(skb, off, len, dst_buff);

        if (unlikely(!src))
                return len;

        /* skb_header_pointer() may already have filled dst_buff itself. */
        if (src == dst_buff)
                return 0;

        memcpy(dst_buff, src, len);
        return 0;
}

/* BPF helper: emit an event made of @meta plus, optionally, the leading
 * bytes of the skb. The number of skb bytes is encoded in the
 * BPF_F_CTXLEN_MASK bits of @flags.
 *
 * Returns -EINVAL when @flags has bits outside BPF_F_CTXLEN_MASK and
 * BPF_F_INDEX_MASK, -EFAULT when @skb is NULL or the requested length
 * exceeds skb->len, otherwise the result of bpf_event_output().
 */
BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
           u64, flags, void *, meta, u64, meta_size)
{
        u64 ctx_len;

        if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
                return -EINVAL;

        if (unlikely(!skb))
                return -EFAULT;

        /* Requested skb byte count lives in the upper 32 bits of flags. */
        ctx_len = (flags & BPF_F_CTXLEN_MASK) >> 32;
        if (unlikely(ctx_len > skb->len))
                return -EFAULT;

        return bpf_event_output(map, flags, meta, meta_size, skb, ctx_len,
                                bpf_skb_copy);
}

/* Prototype for bpf_skb_event_output() taking the program context:
 * GPL-only, integer return. Arguments are the context pointer, a
 * constant map pointer, ARG_ANYTHING flags, and a read-only metadata
 * buffer whose size may be zero.
 */
static const struct bpf_func_proto bpf_skb_event_output_proto = {
        .func                = bpf_skb_event_output,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* Single-entry BTF id list for struct sk_buff, referenced by the
 * prototype below.
 */
BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff)

/* Second prototype for the same bpf_skb_event_output() function, taking
 * a BTF-typed struct sk_buff pointer instead of the program context.
 * Non-static, so it is visible outside this file. Remaining arguments
 * match bpf_skb_event_output_proto.
 */
const struct bpf_func_proto bpf_skb_output_proto = {
        .func                = bpf_skb_event_output,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_skb_output_btf_ids[0],
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* Map the tunnel helper flags to an address family: AF_INET6 when
 * BPF_F_TUNINFO_IPV6 is set, AF_INET otherwise.
 */
static unsigned short bpf_tunnel_key_af(u64 flags)
{
        if (flags & BPF_F_TUNINFO_IPV6)
                return AF_INET6;

        return AF_INET;
}

/* BPF helper: fill @to with the tunnel key taken from the skb's tunnel
 * info.
 *
 * @size is normally sizeof(struct bpf_tunnel_key). A fixed set of
 * shorter sizes, matching older layouts of the structure, is also
 * accepted: in that case a full-size on-stack copy is filled and only
 * the first @size bytes are copied back to the caller.
 *
 * Returns 0 on success. On failure the caller's buffer is zeroed and
 * the result is -EINVAL (no tunnel info, unknown flags, unsupported
 * @size) or -EPROTO (address family of the tunnel info does not match
 * the one requested through @flags).
 */
BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
           u32, size, u64, flags)
{
        const struct ip_tunnel_info *info = skb_tunnel_info(skb);
        u8 compat[sizeof(struct bpf_tunnel_key)];
        void *to_orig = to;
        int err;

        if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 |
                                         BPF_F_TUNINFO_FLAGS)))) {
                err = -EINVAL;
                goto err_clear;
        }
        if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
                err = -EPROTO;
                goto err_clear;
        }
        if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
                /* Default error for every rejected size below. */
                err = -EINVAL;
                switch (size) {
                case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
                case offsetof(struct bpf_tunnel_key, tunnel_label):
                case offsetof(struct bpf_tunnel_key, tunnel_ext):
                        goto set_compat;
                case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
                        /* Fixup deprecated structure layouts here, so we have
                         * a common path later on.
                         */
                        /* This shortest layout is only accepted for IPv4
                         * tunnel info.
                         */
                        if (ip_tunnel_info_af(info) != AF_INET)
                                goto err_clear;
set_compat:
                        /* Work on the full-size stack copy from here on. */
                        to = (struct bpf_tunnel_key *)compat;
                        break;
                default:
                        goto err_clear;
                }
        }

        to->tunnel_id = be64_to_cpu(info->key.tun_id);
        to->tunnel_tos = info->key.tos;
        to->tunnel_ttl = info->key.ttl;
        /* Tunnel flags are only reported when explicitly requested. */
        if (flags & BPF_F_TUNINFO_FLAGS)
                to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags);
        else
                to->tunnel_ext = 0;

        if (flags & BPF_F_TUNINFO_IPV6) {
                memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
                       sizeof(to->remote_ipv6));
                memcpy(to->local_ipv6, &info->key.u.ipv6.dst,
                       sizeof(to->local_ipv6));
                to->tunnel_label = be32_to_cpu(info->key.label);
        } else {
                /* IPv4: zero the three remaining words of each address
                 * field after writing the IPv4 address.
                 */
                to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
                memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
                to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst);
                memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3);
                to->tunnel_label = 0;
        }

        /* Compat path: hand back only as many bytes as the caller asked for. */
        if (unlikely(size != sizeof(struct bpf_tunnel_key)))
                memcpy(to_orig, to, size);

        return 0;
err_clear:
        memset(to_orig, 0, size);
        return err;
}

/* Prototype for bpf_skb_get_tunnel_key(): not GPL-only, integer return.
 * Arguments are the context pointer, an output buffer
 * (ARG_PTR_TO_UNINIT_MEM, the helper writes into it) with its constant
 * size, and ARG_ANYTHING flags.
 */
static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
        .func                = bpf_skb_get_tunnel_key,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/* BPF helper: copy the tunnel options of the skb's tunnel info into
 * @to and zero whatever remains of the @size byte buffer.
 *
 * Returns the options length on success. On failure the whole buffer
 * is zeroed and the result is -ENOENT (no tunnel info, or no options
 * present) or -ENOMEM (@size is smaller than the options).
 */
BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
{
        const struct ip_tunnel_info *info = skb_tunnel_info(skb);
        int err = 0;

        if (unlikely(!info ||
                     !ip_tunnel_is_options_present(info->key.tun_flags)))
                err = -ENOENT;
        else if (unlikely(size < info->options_len))
                err = -ENOMEM;

        if (err) {
                memset(to, 0, size);
                return err;
        }

        ip_tunnel_info_opts_get(to, info);

        /* Pad the tail of the caller's buffer with zeroes. */
        if (size > info->options_len)
                memset(to + info->options_len, 0, size - info->options_len);

        return info->options_len;
}

/* Prototype for bpf_skb_get_tunnel_opt(): not GPL-only, integer return.
 * Arguments are the context pointer and an output buffer
 * (ARG_PTR_TO_UNINIT_MEM, the helper writes into it) with its constant
 * size.
 */
static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
        .func                = bpf_skb_get_tunnel_opt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type        = ARG_CONST_SIZE,
};

/* Per-CPU metadata dst shared by the tunnel "set" helpers.  It is allocated
 * on first use by bpf_get_skb_set_tunnel_proto(); bpf_skb_set_tunnel_key()
 * attaches this CPU's instance to the skb and bpf_skb_set_tunnel_opt() only
 * accepts skbs whose tunnel info points into it.
 */
static struct metadata_dst __percpu *md_dst;

/* Replace @skb's dst with this CPU's metadata dst and fill its TX tunnel
 * info from the BPF-supplied key @from.
 *
 * @size is either sizeof(struct bpf_tunnel_key) or one of the shorter,
 * deprecated layouts; a short key is copied into a zero-padded on-stack
 * buffer so the rest of the function sees a full-size structure.
 *
 * Returns 0 on success, or -EINVAL when @flags has bits outside the
 * accepted set, @size matches no known layout, tunnel_ext is non-zero, or
 * tunnel_label is set without BPF_F_TUNINFO_IPV6.
 */
BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
           const struct bpf_tunnel_key *, from, u32, size, u64, flags)
{
        const u64 accepted = BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
                             BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER |
                             BPF_F_NO_TUNNEL_KEY;
        const bool want_v6 = flags & BPF_F_TUNINFO_IPV6;
        struct metadata_dst *md = this_cpu_ptr(md_dst);
        struct dst_entry *dst = (struct dst_entry *)md;
        struct ip_tunnel_info *tun = &md->u.tun_info;
        u8 padded[sizeof(struct bpf_tunnel_key)];

        if (unlikely(flags & ~accepted))
                return -EINVAL;

        if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
                switch (size) {
                case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
                case offsetof(struct bpf_tunnel_key, tunnel_ext):
                case offsetof(struct bpf_tunnel_key, tunnel_label):
                case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
                        break;
                default:
                        return -EINVAL;
                }

                /* Deprecated, shorter layout: widen it to the current
                 * structure with the missing tail zeroed, so everything
                 * below works on one common format.
                 */
                memcpy(padded, from, size);
                memset(padded + size, 0, sizeof(padded) - size);
                from = (const struct bpf_tunnel_key *)padded;
        }

        if (unlikely(from->tunnel_ext))
                return -EINVAL;
        if (unlikely(!want_v6 && from->tunnel_label))
                return -EINVAL;

        /* Swap the skb's current dst for the per-CPU metadata dst. */
        skb_dst_drop(skb);
        dst_hold(dst);
        skb_dst_set(skb, dst);

        memset(tun, 0, sizeof(*tun));
        tun->mode = IP_TUNNEL_INFO_TX;

        __set_bit(IP_TUNNEL_NOCACHE_BIT, tun->key.tun_flags);
        __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun->key.tun_flags,
                     flags & BPF_F_DONT_FRAGMENT);
        /* Checksum and key bits are on unless explicitly opted out. */
        __assign_bit(IP_TUNNEL_CSUM_BIT, tun->key.tun_flags,
                     !(flags & BPF_F_ZERO_CSUM_TX));
        __assign_bit(IP_TUNNEL_SEQ_BIT, tun->key.tun_flags,
                     flags & BPF_F_SEQ_NUMBER);
        __assign_bit(IP_TUNNEL_KEY_BIT, tun->key.tun_flags,
                     !(flags & BPF_F_NO_TUNNEL_KEY));

        tun->key.tun_id = cpu_to_be64(from->tunnel_id);
        tun->key.tos = from->tunnel_tos;
        tun->key.ttl = from->tunnel_ttl;

        if (!want_v6) {
                tun->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
                tun->key.u.ipv4.src = cpu_to_be32(from->local_ipv4);
                tun->key.flow_flags = FLOWI_FLAG_ANYSRC;
                return 0;
        }

        tun->mode |= IP_TUNNEL_INFO_IPV6;
        memcpy(&tun->key.u.ipv6.dst, from->remote_ipv6,
               sizeof(from->remote_ipv6));
        memcpy(&tun->key.u.ipv6.src, from->local_ipv6,
               sizeof(from->local_ipv6));
        tun->key.label = cpu_to_be32(from->tunnel_label) &
                         IPV6_FLOWLABEL_MASK;

        return 0;
}

/* Prototype descriptor for bpf_skb_set_tunnel_key(): not marked GPL-only,
 * integer return.  Arguments are the program context, a read-only memory
 * region (the tunnel key) with its constant size, and an unconstrained
 * flags scalar.
 */
static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
        .func                = bpf_skb_set_tunnel_key,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/* Store @size bytes of tunnel options from @from in @skb's tunnel info.
 *
 * The skb must already carry this CPU's metadata dst (as installed by
 * bpf_skb_set_tunnel_key()).
 *
 * Returns 0 on success, -EINVAL if the skb's tunnel info is not the per-CPU
 * one or @size is not a multiple of four bytes, -ENOMEM if @size exceeds
 * IP_TUNNEL_OPTS_MAX.
 */
BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
           const u8 *, from, u32, size)
{
        const struct metadata_dst *md = this_cpu_ptr(md_dst);
        struct ip_tunnel_info *tun = skb_tunnel_info(skb);
        IP_TUNNEL_DECLARE_FLAGS(opt_flags) = { };

        if (unlikely(tun != &md->u.tun_info))
                return -EINVAL;
        if (unlikely(size % sizeof(u32)))
                return -EINVAL;
        if (unlikely(size > IP_TUNNEL_OPTS_MAX))
                return -ENOMEM;

        ip_tunnel_set_options_present(opt_flags);
        ip_tunnel_info_opts_set(tun, from, size, opt_flags);

        return 0;
}

/* Prototype descriptor for bpf_skb_set_tunnel_opt(): not marked GPL-only,
 * integer return.  Arguments are the program context and a read-only memory
 * region (the options) with its constant size.
 */
static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
        .func                = bpf_skb_set_tunnel_opt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
};

static const struct bpf_func_proto *
bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
{
        if (!md_dst) {
                struct metadata_dst __percpu *tmp;

                tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
                                                METADATA_IP_TUNNEL,
                                                GFP_KERNEL);
                if (!tmp)
                        return NULL;
                if (cmpxchg(&md_dst, NULL, tmp))
                        metadata_dst_free_percpu(tmp);
        }

        switch (which) {
        case BPF_FUNC_skb_set_tunnel_key:
                return &bpf_skb_set_tunnel_key_proto;
        case BPF_FUNC_skb_set_tunnel_opt:
                return &bpf_skb_set_tunnel_opt_proto;
        default:
                return NULL;
        }
}

/* Check whether the socket owning @skb sits under the cgroup stored in slot
 * @idx of the array @map.
 *
 * Returns the result of sk_under_cgroup_hierarchy(), or:
 *   -ENOENT  the skb has no full socket
 *   -E2BIG   @idx is outside the map
 *   -EAGAIN  the slot at @idx is empty
 */
BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
           u32, idx)
{
        struct bpf_array *cgrp_array = container_of(map, struct bpf_array, map);
        struct sock *owner = skb_to_full_sk(skb);
        struct cgroup *target;

        if (!owner)
                return -ENOENT;
        if (!sk_fullsock(owner))
                return -ENOENT;

        if (unlikely(idx >= map->max_entries))
                return -E2BIG;

        target = READ_ONCE(cgrp_array->ptrs[idx]);
        if (unlikely(!target))
                return -EAGAIN;

        return sk_under_cgroup_hierarchy(owner, target);
}

/* Prototype descriptor for bpf_skb_under_cgroup(): not marked GPL-only,
 * integer return.  Arguments are the program context, a constant map
 * pointer and an unconstrained index.
 */
static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
        .func                = bpf_skb_under_cgroup,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
};

#ifdef CONFIG_SOCK_CGROUP_DATA
/* Return the id of the cgroup recorded in @sk's sk_cgrp_data, after mapping
 * @sk to its full socket.  Returns 0 when there is no full socket.
 */
static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
{
        struct sock *full = sk_to_full_sk(sk);

        if (!full)
                return 0;
        if (!sk_fullsock(full))
                return 0;

        return cgroup_id(sock_cgroup_ptr(&full->sk_cgrp_data));
}

/* BPF helper: cgroup id of the socket attached to @skb (0 if none). */
BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
{
        struct sock *owner = skb->sk;

        return __bpf_sk_cgroup_id(owner);
}

/* Prototype descriptor for bpf_skb_cgroup_id(): not marked GPL-only,
 * integer return, single program-context argument.
 */
static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
        .func           = bpf_skb_cgroup_id,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Return the id of the ancestor, at @ancestor_level, of the cgroup recorded
 * in @sk's sk_cgrp_data.  Returns 0 when @sk has no full socket or
 * cgroup_ancestor() finds no ancestor for that level.
 */
static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
                                              int ancestor_level)
{
        struct cgroup *anc;

        sk = sk_to_full_sk(sk);
        if (!sk)
                return 0;
        if (!sk_fullsock(sk))
                return 0;

        anc = cgroup_ancestor(sock_cgroup_ptr(&sk->sk_cgrp_data),
                              ancestor_level);

        return anc ? cgroup_id(anc) : 0;
}

/* BPF helper: id of the @ancestor_level ancestor of the cgroup of the socket
 * attached to @skb (0 if unavailable).
 */
BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
           ancestor_level)
{
        struct sock *owner = skb->sk;

        return __bpf_sk_ancestor_cgroup_id(owner, ancestor_level);
}

/* Prototype descriptor for bpf_skb_ancestor_cgroup_id(): not marked
 * GPL-only, integer return.  Arguments are the program context and an
 * unconstrained ancestor level.
 */
static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
        .func           = bpf_skb_ancestor_cgroup_id,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
};

/* BPF helper: cgroup id of @sk (0 if it has no full socket). */
BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
{
        u64 id = __bpf_sk_cgroup_id(sk);

        return id;
}

/* Prototype descriptor for bpf_sk_cgroup_id(): not marked GPL-only, integer
 * return, single BTF sock_common pointer argument.
 */
static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
        .func           = bpf_sk_cgroup_id,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
};

/* BPF helper: id of the @ancestor_level ancestor of @sk's cgroup (0 if
 * unavailable).
 */
BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
{
        u64 id = __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);

        return id;
}

/* Prototype descriptor for bpf_sk_ancestor_cgroup_id(): not marked GPL-only,
 * integer return.  Arguments are a BTF sock_common pointer and an
 * unconstrained ancestor level.
 */
static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
        .func           = bpf_sk_ancestor_cgroup_id,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type      = ARG_ANYTHING,
};
#endif

/* Copy callback handed to bpf_event_output() by bpf_xdp_event_output():
 * copies @len bytes starting at offset @off of the xdp_buff @ctx into @dst.
 * Always returns 0.
 */
static unsigned long bpf_xdp_copy(void *dst, const void *ctx,
                                  unsigned long off, unsigned long len)
{
        /* The callback signature is const-qualified; bpf_xdp_copy_buf()
         * takes a non-const xdp_buff.
         */
        bpf_xdp_copy_buf((struct xdp_buff *)ctx, off, dst, len, false);

        return 0;
}

/* Emit an event carrying @meta plus, optionally, leading bytes of the XDP
 * buffer.  The number of buffer bytes to append is encoded in the
 * BPF_F_CTXLEN_MASK bits of @flags.
 *
 * Returns the result of bpf_event_output(), -EINVAL if @flags has bits
 * outside BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK, or -EFAULT if @xdp is NULL
 * or the requested length exceeds the buffer length.
 */
BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
           u64, flags, void *, meta, u64, meta_size)
{
        const u64 ctx_len = (flags & BPF_F_CTXLEN_MASK) >> 32;

        if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
                return -EINVAL;

        if (unlikely(!xdp))
                return -EFAULT;
        if (unlikely(ctx_len > xdp_get_buff_len(xdp)))
                return -EFAULT;

        return bpf_event_output(map, flags, meta, meta_size, xdp,
                                ctx_len, bpf_xdp_copy);
}

/* Prototype descriptor for bpf_xdp_event_output() when called with the
 * program context: GPL-only, integer return.  Arguments are the context, a
 * constant map pointer, unconstrained flags, and a read-only metadata
 * buffer whose constant size may be zero.
 */
static const struct bpf_func_proto bpf_xdp_event_output_proto = {
        .func                = bpf_xdp_event_output,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* BTF id of struct xdp_buff, used as the type of arg1 below. */
BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff)

/* Non-static variant of the bpf_xdp_event_output() prototype: same function
 * and argument layout as bpf_xdp_event_output_proto, except that the first
 * argument is a BTF-typed pointer to struct xdp_buff rather than the program
 * context.
 */
const struct bpf_func_proto bpf_xdp_output_proto = {
        .func                = bpf_xdp_event_output,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_xdp_output_btf_ids[0],
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* BPF helper: cookie of the socket attached to @skb as produced by
 * __sock_gen_cookie(), or 0 when the skb has no socket.
 */
BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
{
        if (!skb->sk)
                return 0;

        return __sock_gen_cookie(skb->sk);
}

/* Prototype descriptor for bpf_get_socket_cookie(): not marked GPL-only,
 * integer return, single program-context argument.
 */
static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
        .func           = bpf_get_socket_cookie,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* BPF helper (sock_addr context): cookie of @ctx->sk as produced by
 * __sock_gen_cookie().
 */
BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
{
        struct sock *sk = ctx->sk;

        return __sock_gen_cookie(sk);
}

/* Prototype descriptor for bpf_get_socket_cookie_sock_addr(): not marked
 * GPL-only, integer return, single program-context argument.
 */
static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
        .func                = bpf_get_socket_cookie_sock_addr,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* BPF helper (sock context): the context itself is the socket; return its
 * cookie as produced by __sock_gen_cookie().
 */
BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
{
        struct sock *sk = ctx;

        return __sock_gen_cookie(sk);
}

/* Prototype descriptor for bpf_get_socket_cookie_sock(): not marked
 * GPL-only, integer return, single program-context argument.
 */
static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
        .func                = bpf_get_socket_cookie_sock,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* BPF helper: cookie of @sk as produced by sock_gen_cookie(), or 0 when @sk
 * is NULL.
 */
BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk)
{
        if (!sk)
                return 0;

        return sock_gen_cookie(sk);
}

/* Prototype descriptor for bpf_get_socket_ptr_cookie(): not marked GPL-only,
 * integer return.  The single argument is a BTF sock_common pointer that
 * may be NULL; the helper itself handles the NULL case.
 */
const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = {
        .func                = bpf_get_socket_ptr_cookie,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL,
};

/* BPF helper (sock_ops context): cookie of @ctx->sk as produced by
 * __sock_gen_cookie().
 */
BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
{
        struct sock *sk = ctx->sk;

        return __sock_gen_cookie(sk);
}

/* Prototype descriptor for bpf_get_socket_cookie_sock_ops(): not marked
 * GPL-only, integer return, single program-context argument.
 */
static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
        .func                = bpf_get_socket_cookie_sock_ops,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* Return the net_cookie of @sk's network namespace, or of init_net when
 * @sk is NULL.
 */
static u64 __bpf_get_netns_cookie(struct sock *sk)
{
        const struct net *ns;

        if (sk)
                ns = sock_net(sk);
        else
                ns = &init_net;

        return ns->net_cookie;
}

/* BPF helper (sock context): netns cookie of the socket @ctx; a NULL @ctx
 * yields init_net's cookie.
 */
BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
{
        struct sock *sk = ctx;

        return __bpf_get_netns_cookie(sk);
}

/* Prototype descriptor for bpf_get_netns_cookie_sock(): not marked GPL-only,
 * integer return; the single argument is the program context or NULL.
 */
static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = {
        .func                = bpf_get_netns_cookie_sock,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX_OR_NULL,
};

/* BPF helper (sock_addr context): netns cookie of @ctx->sk; a NULL @ctx
 * yields init_net's cookie.
 */
BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
{
        struct sock *sk = NULL;

        if (ctx)
                sk = ctx->sk;

        return __bpf_get_netns_cookie(sk);
}

/* Prototype descriptor for bpf_get_netns_cookie_sock_addr(): not marked
 * GPL-only, integer return; the single argument is the program context or
 * NULL.
 */
static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
        .func                = bpf_get_netns_cookie_sock_addr,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX_OR_NULL,
};

/* BPF helper (sock_ops context): netns cookie of @ctx->sk; a NULL @ctx
 * yields init_net's cookie.
 */
BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
{
        struct sock *sk = NULL;

        if (ctx)
                sk = ctx->sk;

        return __bpf_get_netns_cookie(sk);
}

/* Prototype descriptor for bpf_get_netns_cookie_sock_ops(): not marked
 * GPL-only, integer return; the single argument is the program context or
 * NULL.
 */
static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = {
        .func                = bpf_get_netns_cookie_sock_ops,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX_OR_NULL,
};

/* BPF helper (sk_msg context): netns cookie of @ctx->sk; a NULL @ctx yields
 * init_net's cookie.
 */
BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx)
{
        struct sock *sk = NULL;

        if (ctx)
                sk = ctx->sk;

        return __bpf_get_netns_cookie(sk);
}

/* Prototype descriptor for bpf_get_netns_cookie_sk_msg(): not marked
 * GPL-only, integer return; the single argument is the program context or
 * NULL.
 */
static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = {
        .func                = bpf_get_netns_cookie_sk_msg,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX_OR_NULL,
};

/* BPF helper: uid owning the socket attached to @skb, translated with
 * from_kuid_munged() into the user namespace of the socket's netns.
 * Returns overflowuid when the skb has no full socket.
 */
BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
{
        struct sock *full = sk_to_full_sk(skb->sk);
        struct net *ns;
        kuid_t owner;

        if (!full)
                return overflowuid;
        if (!sk_fullsock(full))
                return overflowuid;

        ns = sock_net(full);
        owner = sock_net_uid(ns, full);

        return from_kuid_munged(ns->user_ns, owner);
}

/* Prototype descriptor for bpf_get_socket_uid(): not marked GPL-only,
 * integer return, single program-context argument.
 */
static const struct bpf_func_proto bpf_get_socket_uid_proto = {
        .func           = bpf_get_socket_uid,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Handle a SOL_SOCKET option on behalf of the BPF get/setsockopt helpers.
 *
 * Only the options listed below are accepted.  All of them except
 * SO_BINDTODEVICE require *@optlen == sizeof(int); SO_BINDTODEVICE can be
 * set but not read.  @optval and @optlen are kernel pointers.
 *
 * Returns -EINVAL for a rejected request, otherwise whatever
 * sk_getsockopt() / sk_setsockopt() returns.
 */
static int sol_socket_sockopt(struct sock *sk, int optname,
                              char *optval, int *optlen,
                              bool getopt)
{
        switch (optname) {
        case SO_BINDTODEVICE:
                /* Set-only: there is no getter for the bound device here. */
                if (getopt)
                        return -EINVAL;
                break;
        case SO_REUSEADDR:
        case SO_SNDBUF:
        case SO_RCVBUF:
        case SO_KEEPALIVE:
        case SO_PRIORITY:
        case SO_REUSEPORT:
        case SO_RCVLOWAT:
        case SO_MARK:
        case SO_MAX_PACING_RATE:
        case SO_BINDTOIFINDEX:
        case SO_TXREHASH:
                if (*optlen != sizeof(int))
                        return -EINVAL;
                break;
        default:
                return -EINVAL;
        }

        if (!getopt)
                return sk_setsockopt(sk, SOL_SOCKET, optname,
                                     KERNEL_SOCKPTR(optval), *optlen);

        return sk_getsockopt(sk, SOL_SOCKET, optname,
                             KERNEL_SOCKPTR(optval),
                             KERNEL_SOCKPTR(optlen));
}

static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
                                  char *optval, int optlen)
{
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned long timeout;
        int val;

        if (optlen != sizeof(int))
                return -EINVAL;

        val = *(int *)optval;

        /* Only some options are supported */
        switch (optname) {
        case TCP_BPF_IW:
                if (val <= 0 || tp->data_segs_out > tp->syn_data)
                        return -EINVAL;
                tcp_snd_cwnd_set(tp, val);
                break;
        case TCP_BPF_SNDCWND_CLAMP:
                if (val <= 0)
                        return -EINVAL;
                tp->snd_cwnd_clamp = val;
                tp->snd_ssthresh = val;
                break;
        case TCP_BPF_DELACK_MAX:
                timeout = usecs_to_jiffies(val);
                if (timeout > TCP_DELACK_MAX ||
                    timeout < TCP_TIMEOUT_MIN)
                        return -EINVAL;
                inet_csk(sk)->icsk_delack_max = timeout;
                break;
        case TCP_BPF_RTO_MIN:
                timeout = usecs_to_jiffies(val);
                if (timeout > TCP_RTO_MIN ||
                    timeout < TCP_TIMEOUT_MIN)
                        return -EINVAL;
                inet_csk(sk)->icsk_rto_min = timeout;
                break;
        default:
                return -EINVAL;
        }

        return 0;
}

/* Get or set TCP_CONGESTION on @sk for the BPF sockopt helpers.
 *
 * *@optlen must be at least 2.  On get, the last byte of the buffer is
 * reserved for a terminating NUL.  On set, "cdg" is refused with -ENOTSUPP
 * and a nested change (one issued while another is still in progress on the
 * same socket) is refused with -EBUSY.  Otherwise returns -EINVAL or the
 * result of do_tcp_getsockopt() / do_tcp_setsockopt().
 */
static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval,
                                      int *optlen, bool getopt)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int err;

        if (*optlen < 2)
                return -EINVAL;

        if (getopt) {
                if (!inet_csk(sk)->icsk_ca_ops)
                        return -EINVAL;

                /* BPF expects a NUL-terminated cc name: give up the last
                 * byte of the buffer for the terminator and let the getter
                 * fill only the rest.
                 */
                *optlen -= 1;
                optval[*optlen] = '\0';

                return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
                                         KERNEL_SOCKPTR(optval),
                                         KERNEL_SOCKPTR(optlen));
        }

        /* "cdg" is the only cc that allocates a pointer in the inet_csk_ca
         * area.  A bpf-tcp-cc could overwrite that pointer after switching
         * to cdg, so the switch is not allowed.
         */
        if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen))
                return -ENOTSUPP;

        /* Break this recursion:
         *
         *   .init => bpf_setsockopt(tcp_cc) => .init =>
         *   bpf_setsockopt(tcp_cc) => .init => ...
         *
         * A second bpf_setsockopt(tcp_cc) issued from within the first is
         * refused, which stops the loop when both .init callbacks are the
         * same bpf prog.  The refusal applies even if the second call would
         * not loop: only the first .init may use
         * bpf_setsockopt(TCP_CONGESTION) to pick a fallback cc (e.g. when
         * the peer does not support ECN); the second .init cannot fall back
         * again.
         */
        if (tp->bpf_chg_cc_inprogress)
                return -EBUSY;

        tp->bpf_chg_cc_inprogress = 1;
        err = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
                                KERNEL_SOCKPTR(optval), *optlen);
        tp->bpf_chg_cc_inprogress = 0;

        return err;
}

/* Handle a SOL_TCP option on behalf of the BPF get/setsockopt helpers.
 *
 * @sk must be a TCP socket.  The integer options listed below require
 * *@optlen == sizeof(int); TCP_CONGESTION has its own handler;
 * TCP_SAVED_SYN needs at least one byte and, on get, is copied straight
 * from tp->saved_syn.  Any other option is set-only and is passed to
 * bpf_sol_tcp_setsockopt().
 *
 * Returns -EINVAL for a rejected request, otherwise the result of the
 * handler that was dispatched to.
 */
static int sol_tcp_sockopt(struct sock *sk, int optname,
                           char *optval, int *optlen,
                           bool getopt)
{
        struct tcp_sock *tp;

        if (sk->sk_protocol != IPPROTO_TCP)
                return -EINVAL;

        switch (optname) {
        case TCP_CONGESTION:
                return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt);
        case TCP_SAVED_SYN:
                if (*optlen < 1)
                        return -EINVAL;
                break;
        case TCP_NODELAY:
        case TCP_MAXSEG:
        case TCP_KEEPIDLE:
        case TCP_KEEPINTVL:
        case TCP_KEEPCNT:
        case TCP_SYNCNT:
        case TCP_WINDOW_CLAMP:
        case TCP_THIN_LINEAR_TIMEOUTS:
        case TCP_USER_TIMEOUT:
        case TCP_NOTSENT_LOWAT:
        case TCP_SAVE_SYN:
                if (*optlen != sizeof(int))
                        return -EINVAL;
                break;
        default:
                /* Everything else is a set-only BPF-specific option. */
                if (getopt)
                        return -EINVAL;
                return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
        }

        if (!getopt)
                return do_tcp_setsockopt(sk, SOL_TCP, optname,
                                         KERNEL_SOCKPTR(optval), *optlen);

        if (optname != TCP_SAVED_SYN)
                return do_tcp_getsockopt(sk, SOL_TCP, optname,
                                         KERNEL_SOCKPTR(optval),
                                         KERNEL_SOCKPTR(optlen));

        tp = tcp_sk(sk);
        if (!tp->saved_syn)
                return -EINVAL;
        if (*optlen > tcp_saved_syn_len(tp->saved_syn))
                return -EINVAL;

        /* tp->saved_syn is deliberately not freed here: this path cannot
         * know whether user space still needs it.
         */
        memcpy(optval, tp->saved_syn->data, *optlen);

        return 0;
}

/* Handle a SOL_IP option on behalf of the BPF get/setsockopt helpers.
 *
 * Only IP_TOS on an AF_INET socket with *@optlen == sizeof(int) is
 * accepted; anything else returns -EINVAL.  Otherwise returns the result of
 * do_ip_getsockopt() / do_ip_setsockopt().
 */
static int sol_ip_sockopt(struct sock *sk, int optname,
                          char *optval, int *optlen,
                          bool getopt)
{
        if (sk->sk_family != AF_INET)
                return -EINVAL;
        if (optname != IP_TOS)
                return -EINVAL;
        if (*optlen != sizeof(int))
                return -EINVAL;

        if (!getopt)
                return do_ip_setsockopt(sk, SOL_IP, optname,
                                        KERNEL_SOCKPTR(optval), *optlen);

        return do_ip_getsockopt(sk, SOL_IP, optname,
                                KERNEL_SOCKPTR(optval),
                                KERNEL_SOCKPTR(optlen));
}

/* Handle a SOL_IPV6 option on behalf of the BPF get/setsockopt helpers.
 *
 * Only IPV6_TCLASS and IPV6_AUTOFLOWLABEL on an AF_INET6 socket with
 * *@optlen == sizeof(int) are accepted; anything else returns -EINVAL.
 * The actual work is done through the ipv6_bpf_stub operations.
 */
static int sol_ipv6_sockopt(struct sock *sk, int optname,
                            char *optval, int *optlen,
                            bool getopt)
{
        bool supported;

        if (sk->sk_family != AF_INET6)
                return -EINVAL;

        supported = optname == IPV6_TCLASS || optname == IPV6_AUTOFLOWLABEL;
        if (!supported)
                return -EINVAL;
        if (*optlen != sizeof(int))
                return -EINVAL;

        if (!getopt)
                return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname,
                                                      KERNEL_SOCKPTR(optval),
                                                      *optlen);

        return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname,
                                              KERNEL_SOCKPTR(optval),
                                              KERNEL_SOCKPTR(optlen));
}

/* Dispatch a BPF setsockopt request to the handler for @level.
 *
 * Returns -EINVAL if @sk is not a full socket, @level is unsupported, or
 * the matching protocol family is not built in; otherwise the handler's
 * result.  Performs no socket-ownership check of its own.
 */
static int __bpf_setsockopt(struct sock *sk, int level, int optname,
                            char *optval, int optlen)
{
        if (!sk_fullsock(sk))
                return -EINVAL;

        switch (level) {
        case SOL_SOCKET:
                return sol_socket_sockopt(sk, optname, optval, &optlen, false);
        case SOL_IP:
                if (IS_ENABLED(CONFIG_INET))
                        return sol_ip_sockopt(sk, optname, optval, &optlen,
                                              false);
                break;
        case SOL_IPV6:
                if (IS_ENABLED(CONFIG_IPV6))
                        return sol_ipv6_sockopt(sk, optname, optval, &optlen,
                                                false);
                break;
        case SOL_TCP:
                if (IS_ENABLED(CONFIG_INET))
                        return sol_tcp_sockopt(sk, optname, optval, &optlen,
                                               false);
                break;
        default:
                break;
        }

        return -EINVAL;
}

static int _bpf_setsockopt(struct sock *sk, int level, int optname,
                           char *optval, int optlen)
{
        if (sk_fullsock(sk))
                sock_owned_by_me(sk);
        return __bpf_setsockopt(sk, level, optname, optval, optlen);
}

/* Dispatch a BPF getsockopt request to the handler for @level.
 *
 * Whatever part of the @optlen byte buffer the handler did not fill is
 * zeroed before returning; on any error the whole buffer is zeroed.
 * Returns -EINVAL if @sk is not a full socket, @level is unsupported, or
 * the matching protocol family is not built in; otherwise the handler's
 * result.  Performs no socket-ownership check of its own.
 */
static int __bpf_getsockopt(struct sock *sk, int level, int optname,
                            char *optval, int optlen)
{
        const int buf_len = optlen;
        int err = -EINVAL;

        if (sk_fullsock(sk)) {
                switch (level) {
                case SOL_SOCKET:
                        err = sol_socket_sockopt(sk, optname, optval,
                                                 &optlen, true);
                        break;
                case SOL_TCP:
                        if (IS_ENABLED(CONFIG_INET))
                                err = sol_tcp_sockopt(sk, optname, optval,
                                                      &optlen, true);
                        break;
                case SOL_IP:
                        if (IS_ENABLED(CONFIG_INET))
                                err = sol_ip_sockopt(sk, optname, optval,
                                                     &optlen, true);
                        break;
                case SOL_IPV6:
                        if (IS_ENABLED(CONFIG_IPV6))
                                err = sol_ipv6_sockopt(sk, optname, optval,
                                                       &optlen, true);
                        break;
                default:
                        break;
                }
        }

        /* The handlers may have shrunk optlen to the bytes they wrote; on
         * failure nothing in the buffer is to be trusted.
         */
        if (err)
                optlen = 0;
        if (optlen < buf_len)
                memset(optval + optlen, 0, buf_len - optlen);

        return err;
}

static int _bpf_getsockopt(struct sock *sk, int level, int optname,
                           char *optval, int optlen)
{
        if (sk_fullsock(sk))
                sock_owned_by_me(sk);
        return __bpf_getsockopt(sk, level, optname, optval, optlen);
}

/* BPF helper: setsockopt on @sk through _bpf_setsockopt(), i.e. with the
 * socket-ownership check.
 */
BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
           int, optname, char *, optval, int, optlen)
{
        int ret = _bpf_setsockopt(sk, level, optname, optval, optlen);

        return ret;
}

/* Prototype descriptor for bpf_sk_setsockopt(): not marked GPL-only,
 * integer return.  Arguments are a BTF sock_common pointer, unconstrained
 * level and optname, and a read-only option buffer with its constant size.
 */
const struct bpf_func_proto bpf_sk_setsockopt_proto = {
        .func                = bpf_sk_setsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE,
};

/* BPF helper: getsockopt on @sk through _bpf_getsockopt(), i.e. with the
 * socket-ownership check.
 */
BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level,
           int, optname, char *, optval, int, optlen)
{
        int ret = _bpf_getsockopt(sk, level, optname, optval, optlen);

        return ret;
}

/* Prototype descriptor for bpf_sk_getsockopt(): not marked GPL-only,
 * integer return.  Arguments are a BTF sock_common pointer, unconstrained
 * level and optname, and an uninitialized output buffer with its constant
 * size.
 */
const struct bpf_func_proto bpf_sk_getsockopt_proto = {
        .func                = bpf_sk_getsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg5_type        = ARG_CONST_SIZE,
};

/* BPF helper: setsockopt on @sk through __bpf_setsockopt() directly, i.e.
 * without the sock_owned_by_me() check done by _bpf_setsockopt().
 */
BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level,
           int, optname, char *, optval, int, optlen)
{
        int ret = __bpf_setsockopt(sk, level, optname, optval, optlen);

        return ret;
}

/* Prototype descriptor for bpf_unlocked_sk_setsockopt(): same argument
 * layout as bpf_sk_setsockopt_proto (BTF sock_common pointer, unconstrained
 * level and optname, read-only buffer with constant size), bound to the
 * variant that skips the ownership check.
 */
const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = {
        .func                = bpf_unlocked_sk_setsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE,
};

/* BPF helper: getsockopt on @sk through __bpf_getsockopt() directly, i.e.
 * without the sock_owned_by_me() check done by _bpf_getsockopt().
 */
BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level,
           int, optname, char *, optval, int, optlen)
{
        int ret = __bpf_getsockopt(sk, level, optname, optval, optlen);

        return ret;
}

/* Prototype descriptor for bpf_unlocked_sk_getsockopt(): same argument
 * layout as bpf_sk_getsockopt_proto (BTF sock_common pointer, unconstrained
 * level and optname, uninitialized buffer with constant size), bound to the
 * variant that skips the ownership check.
 */
const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = {
        .func                = bpf_unlocked_sk_getsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg5_type        = ARG_CONST_SIZE,
};

/* BPF helper (sock_addr context): setsockopt on @ctx->sk through
 * _bpf_setsockopt().
 */
BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
           int, level, int, optname, char *, optval, int, optlen)
{
        struct sock *sk = ctx->sk;

        return _bpf_setsockopt(sk, level, optname, optval, optlen);
}

/* Prototype descriptor for bpf_sock_addr_setsockopt(): not marked GPL-only,
 * integer return.  Arguments are the program context, unconstrained level
 * and optname, and a read-only option buffer with its constant size.
 */
static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
        .func                = bpf_sock_addr_setsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE,
};

/* BPF helper (sock_addr context): getsockopt on @ctx->sk through
 * _bpf_getsockopt().
 */
BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx,
           int, level, int, optname, char *, optval, int, optlen)
{
        struct sock *sk = ctx->sk;

        return _bpf_getsockopt(sk, level, optname, optval, optlen);
}

/* Prototype descriptor for bpf_sock_addr_getsockopt(): not marked GPL-only,
 * integer return.  Arguments are the program context, unconstrained level
 * and optname, and an uninitialized output buffer with its constant size.
 */
static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
        .func                = bpf_sock_addr_getsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg5_type        = ARG_CONST_SIZE,
};

/* BPF helper (sock_ops context): setsockopt on @bpf_sock->sk through
 * _bpf_setsockopt().
 */
BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
           int, level, int, optname, char *, optval, int, optlen)
{
        struct sock *sk = bpf_sock->sk;

        return _bpf_setsockopt(sk, level, optname, optval, optlen);
}

/* Prototype descriptor for bpf_sock_ops_setsockopt(): not marked GPL-only,
 * integer return.  Arguments are the program context, unconstrained level
 * and optname, and a read-only option buffer with its constant size.
 */
static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
        .func                = bpf_sock_ops_setsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE,
};

/* Locate the SYN headers requested by @optname for a sock_ops program.
 *
 * @optname selects where the returned span starts: TCP_BPF_SYN (TCP
 * header), TCP_BPF_SYN_IP (network header) or, for any other value,
 * the MAC header (the caller only passes TCP_BPF_SYN_MAC there).  The span
 * always runs to the end of the TCP header.
 *
 * The headers come from bpf_sock->syn_skb when it is set, otherwise from
 * the saved SYN of the socket (or of the request socket while in
 * TCP_NEW_SYN_RECV).
 *
 * On success stores the start of the span in *@start and returns its length
 * in bytes.  Returns -ENOENT, leaving *@start untouched, when no SYN was
 * saved or the MAC header was requested but not saved.
 */
static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
                                int optname, const u8 **start)
{
        struct sk_buff *syn_skb = bpf_sock->syn_skb;
        struct saved_syn *saved;
        struct sock *sk;

        if (syn_skb) {
                /* sk is a request_sock here */
                switch (optname) {
                case TCP_BPF_SYN:
                        *start = syn_skb->data;
                        return tcp_hdrlen(syn_skb);
                case TCP_BPF_SYN_IP:
                        *start = skb_network_header(syn_skb);
                        return skb_network_header_len(syn_skb) +
                                tcp_hdrlen(syn_skb);
                default:
                        /* optname == TCP_BPF_SYN_MAC */
                        *start = skb_mac_header(syn_skb);
                        return skb_mac_header_len(syn_skb) +
                                skb_network_header_len(syn_skb) +
                                tcp_hdrlen(syn_skb);
                }
        }

        sk = bpf_sock->sk;
        if (sk->sk_state == TCP_NEW_SYN_RECV) {
                /* SYN-ACK retransmit: bpf_sock->syn_skb is not available,
                 * so fall back to the request socket's saved SYN (if one
                 * was saved).
                 */
                saved = inet_reqsk(sk)->saved_syn;
        } else {
                saved = tcp_sk(sk)->saved_syn;
        }

        if (!saved)
                return -ENOENT;

        switch (optname) {
        case TCP_BPF_SYN:
                *start = saved->data + saved->mac_hdrlen +
                         saved->network_hdrlen;
                return saved->tcp_hdrlen;
        case TCP_BPF_SYN_IP:
                *start = saved->data + saved->mac_hdrlen;
                return saved->network_hdrlen + saved->tcp_hdrlen;
        default:
                /* optname == TCP_BPF_SYN_MAC */

                /* TCP_SAVE_SYN may not have saved the mac hdr */
                if (!saved->mac_hdrlen)
                        return -ENOENT;

                *start = saved->data;
                return saved->mac_hdrlen + saved->network_hdrlen +
                        saved->tcp_hdrlen;
        }
}

/* BPF helper (sock_ops context): getsockopt on @bpf_sock->sk.
 *
 * SOL_TCP options in the range TCP_BPF_SYN..TCP_BPF_SYN_MAC are served here
 * from the SYN headers: as many header bytes as fit are copied to @optval
 * and the rest of the buffer is zeroed.  The return value is then the full
 * header length, -ENOSPC if the buffer was too small (a truncated copy is
 * still made), or the error from bpf_sock_ops_get_syn().
 *
 * Everything else is passed to _bpf_getsockopt().
 */
BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
           int, level, int, optname, char *, optval, int, optlen)
{
        const u8 *hdr;
        int copied = 0;
        int ret;

        if (!IS_ENABLED(CONFIG_INET) || level != SOL_TCP ||
            optname < TCP_BPF_SYN || optname > TCP_BPF_SYN_MAC)
                return _bpf_getsockopt(bpf_sock->sk, level, optname, optval,
                                       optlen);

        ret = bpf_sock_ops_get_syn(bpf_sock, optname, &hdr);
        if (ret > 0) {
                if (ret > optlen) {
                        copied = optlen;
                        ret = -ENOSPC;
                } else {
                        copied = ret;
                }

                memcpy(optval, hdr, copied);
        }

        /* Zero out whatever part of the buffer was not written. */
        memset(optval + copied, 0, optlen - copied);

        return ret;
}

/* Prototype of bpf_sock_ops_getsockopt():
 * (ctx, level, optname, uninitialized output buffer, buffer size) -> integer.
 */
static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = {
        .func = bpf_sock_ops_getsockopt,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_ANYTHING,
        .arg4_type = ARG_PTR_TO_UNINIT_MEM,
        .arg5_type = ARG_CONST_SIZE,
};

/* BPF helper: replace the sock_ops callback flags of a full socket.
 *
 * The bits of argval covered by BPF_SOCK_OPS_ALL_CB_FLAGS are stored in
 * tcp_sk(sk)->bpf_sock_ops_cb_flags.  Returns the bits of argval that lie
 * outside that mask (0 when every requested flag is known), or -EINVAL when
 * CONFIG_INET is disabled or sk is not a full socket.
 */
BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
           int, argval)
{
        struct sock *sk = bpf_sock->sk;

        if (!IS_ENABLED(CONFIG_INET))
                return -EINVAL;
        if (!sk_fullsock(sk))
                return -EINVAL;

        tcp_sk(sk)->bpf_sock_ops_cb_flags = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;

        /* Report back the flags that were requested but are not supported */
        return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
}

/* Prototype of bpf_sock_ops_cb_flags_set(): (ctx, flags value) -> integer. */
static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = {
        .func = bpf_sock_ops_cb_flags_set,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
};

/* Table of IPv6 entry points used by the BPF helpers in this file (for
 * example inet6_bind, ipv6_dev_get_saddr and udp6_lib_lookup).  Exported
 * GPL-only.  NOTE(review): presumably filled in by the IPv6 code when it
 * is loaded -- the assignment site is not in this part of the file.
 */
const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
EXPORT_SYMBOL_GPL(ipv6_bpf_stub);

/* BPF helper: bind the socket of a sock_addr context to addr.
 *
 * Only AF_INET and (with CONFIG_IPV6) AF_INET6 addresses are handled.  A
 * zero port adds BIND_FORCE_ADDRESS_NO_PORT to the bind flags.
 *
 * Returns -EINVAL when addr_len is too short for the address family,
 * -EAFNOSUPPORT for any other family (or when CONFIG_INET is disabled),
 * otherwise the result of __inet_bind() / ipv6_bpf_stub->inet6_bind().
 */
BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
           int, addr_len)
{
#ifdef CONFIG_INET
        struct sock *sk = ctx->sk;
        u32 flags = BIND_FROM_BPF;
        int err;

        err = -EINVAL;
        /* sa_family must be readable before it can be inspected */
        if (addr_len < offsetofend(struct sockaddr, sa_family))
                return err;
        if (addr->sa_family == AF_INET) {
                if (addr_len < sizeof(struct sockaddr_in))
                        return err;
                /* Port 0: bind the address only, without reserving a port */
                if (((struct sockaddr_in *)addr)->sin_port == htons(0))
                        flags |= BIND_FORCE_ADDRESS_NO_PORT;
                return __inet_bind(sk, addr, addr_len, flags);
#if IS_ENABLED(CONFIG_IPV6)
        } else if (addr->sa_family == AF_INET6) {
                if (addr_len < SIN6_LEN_RFC2133)
                        return err;
                /* Port 0: bind the address only, without reserving a port */
                if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0))
                        flags |= BIND_FORCE_ADDRESS_NO_PORT;
                /* ipv6_bpf_stub cannot be NULL, since it's called from
                 * bpf_cgroup_inet6_connect hook and ipv6 is already loaded
                 */
                return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags);
#endif /* CONFIG_IPV6 */
        }
#endif /* CONFIG_INET */

        /* Unhandled address family, or no INET support compiled in */
        return -EAFNOSUPPORT;
}

/* Prototype of bpf_bind(): (ctx, read-only sockaddr, its size) -> integer. */
static const struct bpf_func_proto bpf_bind_proto = {
        .func = bpf_bind,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type = ARG_CONST_SIZE,
};

#ifdef CONFIG_XFRM

#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
    (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))

/* Per-CPU metadata_dst pointer, exported GPL-only.  It is only built when
 * CONFIG_XFRM_INTERFACE is available together with the matching BTF option
 * (see the enclosing #if).  NOTE(review): nothing in this part of the file
 * uses it; presumably consumed by the xfrm interface BPF code -- confirm.
 */
struct metadata_dst __percpu *xfrm_bpf_md_dst;
EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst);

#endif

/* BPF helper: copy a summary of the xfrm state at position index of the
 * skb's security path into the caller's struct bpf_xfrm_state.
 *
 * Fails with -EINVAL, after zeroing size bytes of the output buffer, when
 * the skb has no security path, index is out of range, flags is non-zero or
 * size is not sizeof(struct bpf_xfrm_state).  Returns 0 on success.
 */
BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
           struct bpf_xfrm_state *, to, u32, size, u64, flags)
{
        const struct sec_path *sp = skb_sec_path(skb);
        const struct xfrm_state *x;

        if (!sp)
                goto err_clear;
        if (unlikely(index >= sp->len || flags))
                goto err_clear;
        if (unlikely(size != sizeof(struct bpf_xfrm_state)))
                goto err_clear;

        x = sp->xvec[index];

        to->reqid = x->props.reqid;
        to->spi = x->id.spi;
        to->family = x->props.family;
        to->ext = 0;

        if (to->family != AF_INET6) {
                /* IPv4 address goes in the first word; clear the other three */
                to->remote_ipv4 = x->props.saddr.a4;
                memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
        } else {
                memcpy(to->remote_ipv6, x->props.saddr.a6,
                       sizeof(to->remote_ipv6));
        }

        return 0;

err_clear:
        /* Never leave the uninitialized output buffer untouched on error */
        memset(to, 0, size);
        return -EINVAL;
}

/* Prototype of bpf_skb_get_xfrm_state():
 * (ctx, index, uninitialized output buffer, buffer size, flags) -> integer.
 */
static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
        .func = bpf_skb_get_xfrm_state,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type = ARG_CONST_SIZE,
        .arg5_type = ARG_ANYTHING,
};
#endif

#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
/* Final step of a FIB lookup: clear the VLAN output fields and, when a
 * non-zero mtu was computed, report it through params->mtu_result.
 * Always returns 0.
 */
static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu)
{
        params->h_vlan_proto = 0;
        params->h_vlan_TCI = 0;

        if (mtu != 0)
                params->mtu_result = mtu; /* union with tot_len */

        return 0;
}
#endif

#if IS_ENABLED(CONFIG_INET)
/* IPv4 FIB lookup used by bpf_xdp_fib_lookup() and bpf_skb_fib_lookup().
 *
 * @net:       namespace in which params->ifindex is resolved and the lookup
 *             is performed
 * @params:    in/out lookup parameters
 * @flags:     BPF_FIB_LOOKUP_* flags
 * @check_mtu: when true, compare params->tot_len with the route MTU
 *
 * Returns -ENODEV when params->ifindex names no device in @net, a
 * BPF_FIB_LKUP_RET_* code when the packet cannot be forwarded, or the
 * result of bpf_fib_set_fwd_params() (0) otherwise.  In the latter case
 * params holds the egress ifindex, the route metric, the gateway address
 * (when the route has one) and, unless BPF_FIB_LOOKUP_SKIP_NEIGH is set,
 * the destination and source MAC addresses.
 */
static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
                               u32 flags, bool check_mtu)
{
        struct fib_nh_common *nhc;
        struct in_device *in_dev;
        struct neighbour *neigh;
        struct net_device *dev;
        struct fib_result res;
        struct flowi4 fl4;
        u32 mtu = 0;
        int err;

        dev = dev_get_by_index_rcu(net, params->ifindex);
        if (unlikely(!dev))
                return -ENODEV;

        /* verify forwarding is enabled on this interface */
        in_dev = __in_dev_get_rcu(dev);
        if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
                return BPF_FIB_LKUP_RET_FWD_DISABLED;

        /* params->ifindex is the egress device for OUTPUT lookups and the
         * ingress device otherwise.
         */
        if (flags & BPF_FIB_LOOKUP_OUTPUT) {
                fl4.flowi4_iif = 1;
                fl4.flowi4_oif = params->ifindex;
        } else {
                fl4.flowi4_iif = params->ifindex;
                fl4.flowi4_oif = 0;
        }
        fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
        fl4.flowi4_flags = 0;

        fl4.flowi4_proto = params->l4_protocol;
        fl4.daddr = params->ipv4_dst;
        fl4.saddr = params->ipv4_src;
        fl4.fl4_sport = params->sport;
        fl4.fl4_dport = params->dport;
        fl4.flowi4_multipath_hash = 0;

        if (flags & BPF_FIB_LOOKUP_DIRECT) {
                /* Look up a single table directly: the device's l3mdev
                 * table if it has one, else the main table, unless the
                 * caller supplied a table id.
                 */
                u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
                struct fib_table *tb;

                if (flags & BPF_FIB_LOOKUP_TBID) {
                        tbid = params->tbid;
                        /* zero out for vlan output */
                        params->tbid = 0;
                }

                tb = fib_get_table(net, tbid);
                if (unlikely(!tb))
                        return BPF_FIB_LKUP_RET_NOT_FWDED;

                err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
        } else {
                /* Lookup through fib_lookup(); the remaining flow fields it
                 * may consult are filled in first.
                 */
                if (flags & BPF_FIB_LOOKUP_MARK)
                        fl4.flowi4_mark = params->mark;
                else
                        fl4.flowi4_mark = 0;
                fl4.flowi4_secid = 0;
                fl4.flowi4_tun_key.tun_id = 0;
                fl4.flowi4_uid = sock_net_uid(net, NULL);

                err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
        }

        if (err) {
                /* map fib lookup errors to RTN_ type */
                if (err == -EINVAL)
                        return BPF_FIB_LKUP_RET_BLACKHOLE;
                if (err == -EHOSTUNREACH)
                        return BPF_FIB_LKUP_RET_UNREACHABLE;
                if (err == -EACCES)
                        return BPF_FIB_LKUP_RET_PROHIBIT;

                return BPF_FIB_LKUP_RET_NOT_FWDED;
        }

        /* only unicast routes are forwarded */
        if (res.type != RTN_UNICAST)
                return BPF_FIB_LKUP_RET_NOT_FWDED;

        /* multipath route: pick one of the paths */
        if (fib_info_num_path(res.fi) > 1)
                fib_select_path(net, &res, &fl4, NULL);

        if (check_mtu) {
                mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
                if (params->tot_len > mtu) {
                        params->mtu_result = mtu; /* union with tot_len */
                        return BPF_FIB_LKUP_RET_FRAG_NEEDED;
                }
        }

        nhc = res.nhc;

        /* do not handle lwt encaps right now */
        if (nhc->nhc_lwtstate)
                return BPF_FIB_LKUP_RET_UNSUPP_LWT;

        dev = nhc->nhc_dev;

        params->rt_metric = res.fi->fib_priority;
        params->ifindex = dev->ifindex;

        if (flags & BPF_FIB_LOOKUP_SRC)
                params->ipv4_src = fib_result_prefsrc(net, &res);

        /* xdp and cls_bpf programs are run in RCU-bh so
         * rcu_read_lock_bh is not needed here
         */
        if (likely(nhc->nhc_gw_family != AF_INET6)) {
                /* IPv4 gateway, if any, replaces the destination */
                if (nhc->nhc_gw_family)
                        params->ipv4_dst = nhc->nhc_gw.ipv4;
        } else {
                /* IPv6 gateway for an IPv4 route: switch the reported
                 * family and return the IPv6 next hop.
                 */
                struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst;

                params->family = AF_INET6;
                *dst = nhc->nhc_gw.ipv6;
        }

        if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
                goto set_fwd_params;

        if (likely(nhc->nhc_gw_family != AF_INET6))
                neigh = __ipv4_neigh_lookup_noref(dev,
                                                  (__force u32)params->ipv4_dst);
        else
                neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst);

        /* MAC addresses are only reported for a valid neighbour entry */
        if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
                return BPF_FIB_LKUP_RET_NO_NEIGH;
        memcpy(params->dmac, neigh->ha, ETH_ALEN);
        memcpy(params->smac, dev->dev_addr, ETH_ALEN);

set_fwd_params:
        return bpf_fib_set_fwd_params(params, mtu);
}
#endif

#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 FIB lookup used by bpf_xdp_fib_lookup() and bpf_skb_fib_lookup().
 *
 * @net:       namespace in which params->ifindex is resolved and the lookup
 *             is performed
 * @params:    in/out lookup parameters
 * @flags:     BPF_FIB_LOOKUP_* flags
 * @check_mtu: when true, compare params->tot_len with the route MTU
 *
 * Returns -ENODEV when params->ifindex names no device in @net, a
 * BPF_FIB_LKUP_RET_* code when the packet cannot be forwarded, or the
 * result of bpf_fib_set_fwd_params() (0) otherwise.  In the latter case
 * params holds the egress ifindex, the route metric, the gateway address
 * (when the next hop has one) and, unless BPF_FIB_LOOKUP_SKIP_NEIGH is set,
 * the destination and source MAC addresses.
 */
static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
                               u32 flags, bool check_mtu)
{
        struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
        struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
        struct fib6_result res = {};
        struct neighbour *neigh;
        struct net_device *dev;
        struct inet6_dev *idev;
        struct flowi6 fl6;
        int strict = 0;
        int oif, err;
        u32 mtu = 0;

        /* link local addresses are never forwarded */
        if (rt6_need_strict(dst) || rt6_need_strict(src))
                return BPF_FIB_LKUP_RET_NOT_FWDED;

        dev = dev_get_by_index_rcu(net, params->ifindex);
        if (unlikely(!dev))
                return -ENODEV;

        /* forwarding must be enabled on this interface */
        idev = __in6_dev_get_safely(dev);
        if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding)))
                return BPF_FIB_LKUP_RET_FWD_DISABLED;

        /* params->ifindex is the egress device for OUTPUT lookups and the
         * ingress device otherwise; either way it is passed as oif below.
         */
        if (flags & BPF_FIB_LOOKUP_OUTPUT) {
                fl6.flowi6_iif = 1;
                oif = fl6.flowi6_oif = params->ifindex;
        } else {
                oif = fl6.flowi6_iif = params->ifindex;
                fl6.flowi6_oif = 0;
                strict = RT6_LOOKUP_F_HAS_SADDR;
        }
        fl6.flowlabel = params->flowinfo;
        fl6.flowi6_scope = 0;
        fl6.flowi6_flags = 0;
        fl6.mp_hash = 0;

        fl6.flowi6_proto = params->l4_protocol;
        fl6.daddr = *dst;
        fl6.saddr = *src;
        fl6.fl6_sport = params->sport;
        fl6.fl6_dport = params->dport;

        if (flags & BPF_FIB_LOOKUP_DIRECT) {
                /* Look up a single table directly: the device's l3mdev
                 * table if it has one, else the main table, unless the
                 * caller supplied a table id.
                 */
                u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
                struct fib6_table *tb;

                if (flags & BPF_FIB_LOOKUP_TBID) {
                        tbid = params->tbid;
                        /* zero out for vlan output */
                        params->tbid = 0;
                }

                tb = ipv6_stub->fib6_get_table(net, tbid);
                if (unlikely(!tb))
                        return BPF_FIB_LKUP_RET_NOT_FWDED;

                err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,
                                                   strict);
        } else {
                /* Lookup through fib6_lookup(); the remaining flow fields
                 * it may consult are filled in first.
                 */
                if (flags & BPF_FIB_LOOKUP_MARK)
                        fl6.flowi6_mark = params->mark;
                else
                        fl6.flowi6_mark = 0;
                fl6.flowi6_secid = 0;
                fl6.flowi6_tun_key.tun_id = 0;
                fl6.flowi6_uid = sock_net_uid(net, NULL);

                err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict);
        }

        /* lookup failure, no entry, or the namespace's null entry */
        if (unlikely(err || IS_ERR_OR_NULL(res.f6i) ||
                     res.f6i == net->ipv6.fib6_null_entry))
                return BPF_FIB_LKUP_RET_NOT_FWDED;

        switch (res.fib6_type) {
        /* only unicast is forwarded */
        case RTN_UNICAST:
                break;
        case RTN_BLACKHOLE:
                return BPF_FIB_LKUP_RET_BLACKHOLE;
        case RTN_UNREACHABLE:
                return BPF_FIB_LKUP_RET_UNREACHABLE;
        case RTN_PROHIBIT:
                return BPF_FIB_LKUP_RET_PROHIBIT;
        default:
                return BPF_FIB_LKUP_RET_NOT_FWDED;
        }

        ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif,
                                    fl6.flowi6_oif != 0, NULL, strict);

        if (check_mtu) {
                mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
                if (params->tot_len > mtu) {
                        params->mtu_result = mtu; /* union with tot_len */
                        return BPF_FIB_LKUP_RET_FRAG_NEEDED;
                }
        }

        /* lwt encaps on the next hop are not handled */
        if (res.nh->fib_nh_lws)
                return BPF_FIB_LKUP_RET_UNSUPP_LWT;

        /* a gateway, if any, replaces the destination in params */
        if (res.nh->fib_nh_gw_family)
                *dst = res.nh->fib_nh_gw6;

        dev = res.nh->fib_nh_dev;
        params->rt_metric = res.f6i->fib6_metric;
        params->ifindex = dev->ifindex;

        if (flags & BPF_FIB_LOOKUP_SRC) {
                /* Prefer the route's configured source address; otherwise
                 * ask for one on the egress device for the original
                 * destination.
                 */
                if (res.f6i->fib6_prefsrc.plen) {
                        *src = res.f6i->fib6_prefsrc.addr;
                } else {
                        err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev,
                                                                &fl6.daddr, 0,
                                                                src);
                        if (err)
                                return BPF_FIB_LKUP_RET_NO_SRC_ADDR;
                }
        }

        if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
                goto set_fwd_params;

        /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
         * not needed here.
         */
        neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
        /* MAC addresses are only reported for a valid neighbour entry */
        if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
                return BPF_FIB_LKUP_RET_NO_NEIGH;
        memcpy(params->dmac, neigh->ha, ETH_ALEN);
        memcpy(params->smac, dev->dev_addr, ETH_ALEN);

set_fwd_params:
        return bpf_fib_set_fwd_params(params, mtu);
}
#endif

/* Every flag accepted by the fib_lookup helpers; bpf_xdp_fib_lookup() and
 * bpf_skb_fib_lookup() reject any other bit with -EINVAL.
 */
#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
                             BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \
                             BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK)

/* BPF helper: FIB lookup for XDP programs.
 *
 * The lookup runs in the namespace of the receiving device and always
 * performs the MTU check.  Returns -EINVAL when plen is smaller than
 * struct bpf_fib_lookup or flags has bits outside BPF_FIB_LOOKUP_MASK,
 * -EAFNOSUPPORT for a family that is unknown or not compiled in, otherwise
 * the result of the per-family lookup.
 */
BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
           struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
        if (plen < sizeof(*params) || (flags & ~BPF_FIB_LOOKUP_MASK))
                return -EINVAL;

        switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
        case AF_INET:
                return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
                                           flags, true);
#endif
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
                                           flags, true);
#endif
        default:
                return -EAFNOSUPPORT;
        }
}

/* Prototype of bpf_xdp_fib_lookup() (GPL-only):
 * (ctx, params buffer, its size, flags) -> integer.
 */
static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
        .func = bpf_xdp_fib_lookup,
        .gpl_only = true,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_MEM,
        .arg3_type = ARG_CONST_SIZE,
        .arg4_type = ARG_ANYTHING,
};

/* BPF helper: FIB lookup for skb-based programs.
 *
 * The lookup runs in the namespace of skb->dev.  When the caller supplies a
 * non-zero params->tot_len, the per-family lookup checks it against the
 * route MTU.  Otherwise, after a successful lookup, the skb itself is
 * checked against the egress device with is_skb_forwardable() and the
 * device MTU is reported in params->mtu_result.
 *
 * Returns -EINVAL when plen is smaller than struct bpf_fib_lookup or flags
 * has bits outside BPF_FIB_LOOKUP_MASK, -EAFNOSUPPORT for a family that is
 * unknown or not compiled in, -ENODEV when the egress ifindex can no longer
 * be resolved, otherwise the result of the per-family lookup (possibly
 * downgraded to BPF_FIB_LKUP_RET_FRAG_NEEDED).
 */
BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
           struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
        struct net *net = dev_net(skb->dev);
        int rc = -EAFNOSUPPORT;
        bool check_mtu = false;

        if (plen < sizeof(*params))
                return -EINVAL;

        if (flags & ~BPF_FIB_LOOKUP_MASK)
                return -EINVAL;

        /* A caller-supplied tot_len requests the route-MTU check */
        if (params->tot_len)
                check_mtu = true;

        switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
        case AF_INET:
                rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu);
                break;
#endif
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu);
                break;
#endif
        }

        if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {
                struct net_device *dev;

                /* When tot_len isn't provided by user, check skb
                 * against MTU of FIB lookup resulting net_device
                 */
                dev = dev_get_by_index_rcu(net, params->ifindex);
                /* The ifindex lookup may come back empty; never pass a
                 * NULL device to the checks below.
                 */
                if (unlikely(!dev))
                        return -ENODEV;

                if (!is_skb_forwardable(dev, skb))
                        rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;

                params->mtu_result = dev->mtu; /* union with tot_len */
        }

        return rc;
}

/* Prototype of bpf_skb_fib_lookup() (GPL-only):
 * (ctx, params buffer, its size, flags) -> integer.
 */
static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
        .func = bpf_skb_fib_lookup,
        .gpl_only = true,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_MEM,
        .arg3_type = ARG_CONST_SIZE,
        .arg4_type = ARG_ANYTHING,
};

/* Resolve the device an MTU check applies to: dev_curr itself when ifindex
 * is 0 (which avoids a lookup), otherwise the device with that ifindex in
 * dev_curr's namespace, or NULL when there is none.
 */
static struct net_device *__dev_via_ifindex(struct net_device *dev_curr,
                                            u32 ifindex)
{
        struct net *net = dev_net(dev_curr);

        return ifindex ? dev_get_by_index_rcu(net, ifindex) : dev_curr;
}

/* BPF helper: check a packet length against a device MTU (skb variant).
 *
 * The device is skb->dev, or the one named by ifindex when non-zero.  The
 * length checked is *mtu_len plus the device's hard header length when
 * *mtu_len is non-zero, else skb->len; len_diff is added to it.  On every
 * path that reaches the comparison, *mtu_len is overwritten with the
 * device MTU.
 *
 * Returns -EINVAL for unknown flags, or when BPF_MTU_CHK_SEGS is combined
 * with a non-zero len_diff or *mtu_len; -ENODEV when the device cannot be
 * found; otherwise a BPF_MTU_CHK_RET_* code.
 */
BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
           u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
{
        struct net_device *dev;
        int skb_len, dev_len;
        int mtu, ret;

        if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
                return -EINVAL;

        if (unlikely((flags & BPF_MTU_CHK_SEGS) && (len_diff || *mtu_len)))
                return -EINVAL;

        dev = __dev_via_ifindex(skb->dev, ifindex);
        if (unlikely(!dev))
                return -ENODEV;

        mtu = READ_ONCE(dev->mtu);

        /* The device MTU is an L3 size; add the L2 header for comparison */
        dev_len = mtu + dev->hard_header_len;

        /* A non-zero *mtu_len is taken as the L3 length to check */
        if (*mtu_len)
                skb_len = *mtu_len + dev->hard_header_len;
        else
                skb_len = skb->len;

        skb_len += len_diff; /* a negative len_diff can make the check pass */

        if (skb_len <= dev_len) {
                ret = BPF_MTU_CHK_RET_SUCCESS;
        } else if (!skb_is_gso(skb)) {
                ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
        } else if ((flags & BPF_MTU_CHK_SEGS) &&
                   !skb_gso_validate_network_len(skb, mtu)) {
                /* skb->len covers all GSO segments; the caller asked for
                 * the individual segments to be validated and they do not
                 * fit.
                 */
                ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
        } else {
                /* GSO skb: it may be re-segmented on transmit, so pass */
                ret = BPF_MTU_CHK_RET_SUCCESS;
        }

        /* BPF verifier guarantees valid pointer */
        *mtu_len = mtu;

        return ret;
}

/* BPF helper: check a packet length against a device MTU (XDP variant).
 *
 * The device is the receiving one, or the one named by ifindex when
 * non-zero.  The length checked is *mtu_len plus the device's hard header
 * length when *mtu_len is non-zero, else data_end - data; len_diff is added
 * to it.  *mtu_len is overwritten with the device MTU.
 *
 * Returns -EINVAL when any flag is set, -ENODEV when the device cannot be
 * found, otherwise BPF_MTU_CHK_RET_SUCCESS or BPF_MTU_CHK_RET_FRAG_NEEDED.
 */
BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
           u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
{
        struct net_device *dev = xdp->rxq->dev;
        int pkt_len = xdp->data_end - xdp->data;
        int mtu, limit;

        /* XDP variant doesn't support multi-buffer segment check (yet) */
        if (unlikely(flags))
                return -EINVAL;

        dev = __dev_via_ifindex(dev, ifindex);
        if (unlikely(!dev))
                return -ENODEV;

        mtu = READ_ONCE(dev->mtu);

        /* The device MTU is an L3 size; add the L2 header for comparison */
        limit = mtu + dev->hard_header_len;

        /* A non-zero *mtu_len is taken as the L3 length to check */
        if (*mtu_len)
                pkt_len = *mtu_len + dev->hard_header_len;

        pkt_len += len_diff; /* a negative len_diff can make the check pass */

        /* BPF verifier guarantees valid pointer */
        *mtu_len = mtu;

        if (pkt_len > limit)
                return BPF_MTU_CHK_RET_FRAG_NEEDED;

        return BPF_MTU_CHK_RET_SUCCESS;
}

/* Prototype of bpf_skb_check_mtu() (GPL-only):
 * (ctx, ifindex, pointer to int, len_diff, flags) -> integer.
 */
static const struct bpf_func_proto bpf_skb_check_mtu_proto = {
        .func = bpf_skb_check_mtu,
        .gpl_only = true,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_INT,
        .arg4_type = ARG_ANYTHING,
        .arg5_type = ARG_ANYTHING,
};

/* Prototype of bpf_xdp_check_mtu() (GPL-only):
 * (ctx, ifindex, pointer to int, len_diff, flags) -> integer.
 */
static const struct bpf_func_proto bpf_xdp_check_mtu_proto = {
        .func = bpf_xdp_check_mtu,
        .gpl_only = true,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_INT,
        .arg4_type = ARG_ANYTHING,
        .arg5_type = ARG_ANYTHING,
};

#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
/* Push the Segment Routing Header in hdr (len bytes) onto skb, either
 * inline (BPF_LWT_ENCAP_SEG6_INLINE, IPv6 packets only) or as an outer
 * encapsulation (BPF_LWT_ENCAP_SEG6), then look up the next hop.
 *
 * Returns -EINVAL when the SRH fails validation or type is unknown,
 * -EBADMSG for an inline request on a non-IPv6 skb, the error from the
 * seg6_do_srh_*() call, or the result of seg6_lookup_nexthop().
 */
static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
{
        struct ipv6_sr_hdr *srh = hdr;
        int rc;

        if (!seg6_validate_srh(srh, len, false))
                return -EINVAL;

        if (type == BPF_LWT_ENCAP_SEG6_INLINE) {
                if (skb->protocol != htons(ETH_P_IPV6))
                        return -EBADMSG;

                rc = seg6_do_srh_inline(skb, srh);
        } else if (type == BPF_LWT_ENCAP_SEG6) {
                skb_reset_inner_headers(skb);
                skb->encapsulation = 1;
                rc = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6);
        } else {
                return -EINVAL;
        }

        /* Recomputed whether or not the push succeeded */
        bpf_compute_data_pointers(skb);
        if (rc)
                return rc;

        skb_set_transport_header(skb, sizeof(struct ipv6hdr));

        return seg6_lookup_nexthop(skb, NULL, 0);
}
#endif /* CONFIG_IPV6_SEG6_BPF */

#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
/* Forward an IP encapsulation request unchanged to bpf_lwt_push_ip_encap()
 * and return its result.
 */
static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
                             bool ingress)
{
        return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
}
#endif

/* BPF helper (ingress variant): push an encapsulation header onto skb.
 *
 * Dispatches on type: the two SEG6 modes go to bpf_push_seg6_encap() (when
 * CONFIG_IPV6_SEG6_BPF is enabled) and BPF_LWT_ENCAP_IP goes to
 * bpf_push_ip_encap() with ingress set (when CONFIG_LWTUNNEL_BPF is
 * enabled).  Any other type, including one whose support is compiled out,
 * yields -EINVAL.
 */
BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
           u32, len)
{
        switch (type) {
#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
        case BPF_LWT_ENCAP_SEG6:
        case BPF_LWT_ENCAP_SEG6_INLINE:
                return bpf_push_seg6_encap(skb, type, hdr, len);
#endif
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
        case BPF_LWT_ENCAP_IP:
                return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
#endif
        default:
                return -EINVAL;
        }
}

/* BPF helper (egress variant): push an encapsulation header onto skb.
 *
 * Only BPF_LWT_ENCAP_IP is accepted here, and only when CONFIG_LWTUNNEL_BPF
 * is enabled; it goes to bpf_push_ip_encap() with ingress cleared.  Any
 * other type yields -EINVAL.
 */
BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
           void *, hdr, u32, len)
{
        switch (type) {
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
        case BPF_LWT_ENCAP_IP:
                return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
#endif
        default:
                return -EINVAL;
        }
}

/* Prototype of bpf_lwt_in_push_encap():
 * (ctx, type, read-only header buffer, its size) -> integer.
 */
static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
        .func = bpf_lwt_in_push_encap,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type = ARG_CONST_SIZE,
};

/* Prototype of bpf_lwt_xmit_push_encap():
 * (ctx, type, read-only header buffer, its size) -> integer.
 */
static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
        .func = bpf_lwt_xmit_push_encap,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type = ARG_CONST_SIZE,
};

#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
/* BPF helper: overwrite len bytes at skb->data + offset inside the Segment
 * Routing Header tracked in this CPU's seg6_bpf_srh_states.
 *
 * Two windows are writable:
 *   - the TLV area, from the end of the segment list to the end of the SRH;
 *     a write there marks the cached SRH state as not validated;
 *   - the fixed header fields from srh->flags up to (not including) the
 *     segment list.
 * Anything else, in particular the segment list itself, is refused.
 *
 * Returns 0 on success, -EINVAL when no SRH is tracked or it can no longer
 * be found after the skb was made writable, -EFAULT when the range is
 * outside both windows or the skb cannot be made writable.
 */
BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
           const void *, from, u32, len)
{
        struct seg6_bpf_srh_state *srh_state =
                this_cpu_ptr(&seg6_bpf_srh_states);
        struct ipv6_sr_hdr *srh = srh_state->srh;
        void *srh_tlvs, *srh_end, *ptr;
        int srhoff = 0;

        lockdep_assert_held(&srh_state->bh_lock);
        if (srh == NULL)
                return -EINVAL;

        /* TLVs start after the fixed header and the (first_segment + 1)
         * 16-byte segments, exactly as computed in
         * bpf_lwt_seg6_adjust_srh().  Leaving out sizeof(*srh) would open
         * the TLV window early and expose the tail of the last segment.
         */
        srh_tlvs = (void *)((char *)srh + sizeof(*srh) +
                            ((srh->first_segment + 1) << 4));
        srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);

        ptr = skb->data + offset;
        if (ptr >= srh_tlvs && ptr + len <= srh_end)
                srh_state->valid = false; /* TLVs changed: needs revalidation */
        else if (ptr < (void *)&srh->flags ||
                 ptr + len > (void *)&srh->segments)
                return -EFAULT;

        if (unlikely(bpf_try_make_writable(skb, offset + len)))
                return -EFAULT;
        /* skb->data may have changed above: locate the SRH again */
        if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
                return -EINVAL;
        srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);

        memcpy(skb->data + offset, from, len);
        return 0;
}

/* Prototype of bpf_lwt_seg6_store_bytes():
 * (ctx, offset, read-only source buffer, its size) -> integer.
 */
static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
        .func = bpf_lwt_seg6_store_bytes,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type = ARG_CONST_SIZE,
};

/* Refresh this CPU's cached SRH state from skb: when the packet carries a
 * routing header, record its location and length and mark it valid;
 * otherwise only clear the cached pointer.
 */
static void bpf_update_srh_state(struct sk_buff *skb)
{
        struct seg6_bpf_srh_state *state = this_cpu_ptr(&seg6_bpf_srh_states);
        int rthdr_off = 0;

        if (ipv6_find_hdr(skb, &rthdr_off, IPPROTO_ROUTING, NULL, NULL) < 0) {
                state->srh = NULL;
                return;
        }

        state->srh = (struct ipv6_sr_hdr *)(skb->data + rthdr_off);
        /* the header's hdrlen field counts 8-byte units */
        state->hdrlen = state->srh->hdrlen << 3;
        state->valid = true;
}

/* BPF helper: apply a seg6local action to skb.
 *
 * @action selects the behaviour and fixes what @param must contain:
 *   END_X          - struct in6_addr passed to seg6_lookup_nexthop()
 *   END_T          - int passed as the last argument of
 *                    seg6_lookup_nexthop()
 *   END_DT6        - int as for END_T; the outer headers up to the inner
 *                    IPv6 header are pulled first
 *   END_B6         - SRH pushed inline via bpf_push_seg6_encap()
 *   END_B6_ENCAP   - SRH pushed as an encapsulation via
 *                    bpf_push_seg6_encap()
 *
 * Returns -EBADMSG when the current SRH fails seg6_bpf_has_valid_srh() (or,
 * for END_DT6, no inner IPv6 header can be found/pulled), -EINVAL for a
 * wrong param_len or an unknown action, otherwise the result of the
 * underlying seg6 call.
 */
BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
           u32, action, void *, param, u32, param_len)
{
        struct seg6_bpf_srh_state *srh_state =
                this_cpu_ptr(&seg6_bpf_srh_states);
        int hdroff = 0;
        int err;

        lockdep_assert_held(&srh_state->bh_lock);
        switch (action) {
        case SEG6_LOCAL_ACTION_END_X:
                if (!seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;
                if (param_len != sizeof(struct in6_addr))
                        return -EINVAL;
                return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
        case SEG6_LOCAL_ACTION_END_T:
                if (!seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;
                if (param_len != sizeof(int))
                        return -EINVAL;
                return seg6_lookup_nexthop(skb, NULL, *(int *)param);
        case SEG6_LOCAL_ACTION_END_DT6:
                if (!seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;
                if (param_len != sizeof(int))
                        return -EINVAL;

                /* Strip everything in front of the inner IPv6 header */
                if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0)
                        return -EBADMSG;
                if (!pskb_pull(skb, hdroff))
                        return -EBADMSG;

                skb_postpull_rcsum(skb, skb_network_header(skb), hdroff);
                skb_reset_network_header(skb);
                skb_reset_transport_header(skb);
                skb->encapsulation = 0;

                /* The packet start moved: refresh the data pointers and
                 * the cached SRH state before the lookup.
                 */
                bpf_compute_data_pointers(skb);
                bpf_update_srh_state(skb);
                return seg6_lookup_nexthop(skb, NULL, *(int *)param);
        case SEG6_LOCAL_ACTION_END_B6:
                /* An SRH is optional here, but if present it must be valid */
                if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;
                err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE,
                                          param, param_len);
                if (!err)
                        bpf_update_srh_state(skb);

                return err;
        case SEG6_LOCAL_ACTION_END_B6_ENCAP:
                /* An SRH is optional here, but if present it must be valid */
                if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;
                err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6,
                                          param, param_len);
                if (!err)
                        bpf_update_srh_state(skb);

                return err;
        default:
                return -EINVAL;
        }
}

/* Prototype of bpf_lwt_seg6_action():
 * (ctx, action, read-only parameter buffer, its size) -> integer.
 */
static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
        .func = bpf_lwt_seg6_action,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type = ARG_CONST_SIZE,
};

/* BPF helper: grow (len > 0) or shrink (len < 0) the TLV area of the
 * tracked Segment Routing Header by |len| bytes at skb->data + offset.
 *
 * offset must fall inside the TLV area (between the end of the segment
 * list and the end of the SRH), and a shrink must not extend past the end
 * of the SRH.  On success the IPv6 payload length is rewritten, the cached
 * SRH pointer and length are updated, and the cached state is marked as
 * not validated.
 *
 * Returns 0 on success, -EINVAL when no SRH is tracked or it cannot be
 * found after the resize, -EFAULT when the range is not acceptable, or the
 * error from the skb resize calls.
 */
BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
           s32, len)
{
        struct seg6_bpf_srh_state *srh_state =
                this_cpu_ptr(&seg6_bpf_srh_states);
        struct ipv6_sr_hdr *srh = srh_state->srh;
        void *srh_end, *srh_tlvs, *ptr;
        struct ipv6hdr *hdr;
        int srhoff = 0;
        int ret;

        lockdep_assert_held(&srh_state->bh_lock);
        if (unlikely(srh == NULL))
                return -EINVAL;

        /* TLVs follow the fixed header and (first_segment + 1) segments of
         * 16 bytes each.
         */
        srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) +
                        ((srh->first_segment + 1) << 4));
        srh_end = (void *)((unsigned char *)srh + sizeof(*srh) +
                        srh_state->hdrlen);
        ptr = skb->data + offset;

        if (unlikely(ptr < srh_tlvs || ptr > srh_end))
                return -EFAULT;
        /* len is negative here, so ptr - len is the end of the removed span */
        if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end))
                return -EFAULT;

        if (len > 0) {
                ret = skb_cow_head(skb, len);
                if (unlikely(ret < 0))
                        return ret;

                ret = bpf_skb_net_hdr_push(skb, offset, len);
        } else {
                ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len);
        }

        /* Recomputed whether or not the resize succeeded */
        bpf_compute_data_pointers(skb);
        if (unlikely(ret < 0))
                return ret;

        hdr = (struct ipv6hdr *)skb->data;
        hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));

        /* skb->data may have changed above: locate the SRH again */
        if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
                return -EINVAL;
        srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
        srh_state->hdrlen += len;
        srh_state->valid = false;
        return 0;
}

/* Prototype of bpf_lwt_seg6_adjust_srh(): (ctx, offset, delta) -> integer. */
static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
        .func = bpf_lwt_seg6_adjust_srh,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_ANYTHING,
};
#endif /* CONFIG_IPV6_SEG6_BPF */

#ifdef CONFIG_INET
/* Find the TCP or UDP socket in @net that matches @tuple.
 *
 * @family selects which half of @tuple is used: AF_INET takes the IPv4
 * fields, anything else takes the IPv6 fields when IPv6 support is enabled.
 * @proto equal to IPPROTO_TCP performs a TCP lookup, any other value a UDP
 * lookup.
 *
 * Returns the matching socket, or NULL when nothing matched or when the
 * match is neither refcounted nor flagged SOCK_RCU_FREE.
 */
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
                              int dif, int sdif, u8 family, u8 proto)
{
        struct inet_hashinfo *tcp_hash = net->ipv4.tcp_death_row.hashinfo;
        struct sock *found = NULL;
        bool has_ref = false;

        if (family == AF_INET) {
                if (proto == IPPROTO_TCP) {
                        found = __inet_lookup(net, tcp_hash, NULL, 0,
                                              tuple->ipv4.saddr,
                                              tuple->ipv4.sport,
                                              tuple->ipv4.daddr,
                                              tuple->ipv4.dport,
                                              dif, sdif, &has_ref);
                } else {
                        found = __udp4_lib_lookup(net,
                                                  tuple->ipv4.saddr,
                                                  tuple->ipv4.sport,
                                                  tuple->ipv4.daddr,
                                                  tuple->ipv4.dport,
                                                  dif, sdif,
                                                  net->ipv4.udp_table, NULL);
                }
#if IS_ENABLED(CONFIG_IPV6)
        } else {
                struct in6_addr *saddr6 = (struct in6_addr *)&tuple->ipv6.saddr;
                struct in6_addr *daddr6 = (struct in6_addr *)&tuple->ipv6.daddr;

                if (proto == IPPROTO_TCP) {
                        /* This is the only lookup here that is handed the
                         * destination port after ntohs().
                         */
                        found = __inet6_lookup(net, tcp_hash, NULL, 0,
                                               saddr6, tuple->ipv6.sport,
                                               daddr6,
                                               ntohs(tuple->ipv6.dport),
                                               dif, sdif, &has_ref);
                } else if (likely(ipv6_bpf_stub)) {
                        found = ipv6_bpf_stub->udp6_lib_lookup(net,
                                                               saddr6,
                                                               tuple->ipv6.sport,
                                                               daddr6,
                                                               tuple->ipv6.dport,
                                                               dif, sdif,
                                                               net->ipv4.udp_table,
                                                               NULL);
                }
#endif
        }

        /* Anything handed back must be refcounted or SOCK_RCU_FREE. */
        if (likely(!found || has_ref || sock_flag(found, SOCK_RCU_FREE)))
                return found;

        WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
        return NULL;
}

/* Core socket lookup shared by the bpf_sk*_lookup_*() helpers. As with the
 * lookups it wraps, a reference is taken on the socket if it doesn't have
 * the flag SOCK_RCU_FREE.
 *
 * @len selects the address family: it must equal the size of either the
 * IPv4 or the IPv6 member of @tuple, otherwise NULL is returned. A non-zero
 * @flags, or a @netns_id that is neither negative as an s32 nor at most
 * S32_MAX, also yields NULL. A negative @sdif is replaced by the value
 * obtained from @skb via inet_sdif()/inet6_sdif().
 *
 * When @netns_id is negative as an s32 the lookup runs in @caller_net;
 * otherwise the namespace is resolved with get_net_ns_by_id() and dropped
 * with put_net() once the lookup has completed.
 */
static struct sock *
__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
                 struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
                 u64 flags, int sdif)
{
        const bool in_caller_ns = (s32)netns_id < 0;
        struct sock *sk;
        struct net *net;
        u8 family;

        switch (len) {
        case sizeof(tuple->ipv4):
                family = AF_INET;
                break;
        case sizeof(tuple->ipv6):
                family = AF_INET6;
                break;
        default:
                return NULL;
        }

        if (unlikely(flags))
                return NULL;
        if (unlikely(!in_caller_ns && netns_id > S32_MAX))
                return NULL;

        if (sdif < 0)
                sdif = family == AF_INET ? inet_sdif(skb) : inet6_sdif(skb);

        if (in_caller_ns)
                return sk_lookup(caller_net, tuple, ifindex, sdif, family,
                                 proto);

        net = get_net_ns_by_id(caller_net, netns_id);
        if (unlikely(!net))
                return NULL;

        sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
        put_net(net);

        return sk;
}

/* Like __bpf_skc_lookup(), but only ever hands back a full socket.
 *
 * The looked-up socket is mapped through sk_to_full_sk(). If that yields a
 * different socket (or no full socket at all), the original is released with
 * sock_gen_put() and the replacement is returned only when it is flagged
 * SOCK_RCU_FREE; otherwise a warning fires and NULL is returned.
 */
static struct sock *
__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
                struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
                u64 flags, int sdif)
{
        struct sock *full;
        struct sock *sk;

        sk = __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
                              netns_id, flags, sdif);
        if (!sk)
                return NULL;

        full = sk_to_full_sk(sk);
        if (!sk_fullsock(full))
                full = NULL;
        if (full == sk)
                return sk;

        /* sk_to_full_sk() may have returned (sk)->rsk_listener; drop the
         * reference on the original sk so the request_sock is not leaked.
         */
        sock_gen_put(sk);

        /* The replacement carries no reference of its own, so it is only
         * usable when no refcnt bump is needed.
         */
        if (unlikely(full && !sock_flag(full, SOCK_RCU_FREE))) {
                WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
                return NULL;
        }

        return full;
}

/* skb-based front end to __bpf_skc_lookup().
 *
 * The namespace and ifindex come from skb->dev when the skb has a device;
 * otherwise the namespace comes from skb->sk and the ifindex is 0. sdif is
 * passed as -1 so that __bpf_skc_lookup() derives it from the skb.
 */
static struct sock *
bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
               u8 proto, u64 netns_id, u64 flags)
{
        struct net_device *dev = skb->dev;
        struct net *caller_net = dev ? dev_net(dev) : sock_net(skb->sk);
        int ifindex = dev ? dev->ifindex : 0;

        return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
                                netns_id, flags, -1);
}

/* skb-based lookup that only ever hands back a full socket.
 *
 * The namespace and ifindex are taken from skb->dev when the skb has a
 * device, otherwise from skb->sk with ifindex 0, and sdif is passed as -1 so
 * it is derived from the skb. The lookup itself and the conversion to a full
 * socket (sk_to_full_sk(), releasing the original when a different socket is
 * returned, and the SOCK_RCU_FREE sanity check) are done by
 * __bpf_sk_lookup().
 */
static struct sock *
bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
              u8 proto, u64 netns_id, u64 flags)
{
        struct net_device *dev = skb->dev;
        struct net *caller_net = dev ? dev_net(dev) : sock_net(skb->sk);
        int ifindex = dev ? dev->ifindex : 0;

        return __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, proto,
                               netns_id, flags, -1);
}

/* BPF helper: TCP lookup through bpf_skc_lookup(); the resulting socket
 * pointer (or NULL) is returned as an unsigned long.
 */
BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct sock *sk;

        sk = bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);

        return (unsigned long)sk;
}

/* Prototype of the bpf_skc_lookup_tcp() helper: not GPL-only, flagged
 * pkt_access, returns a sock_common pointer or NULL. Arguments are the
 * program context, a read-only memory region (the tuple) with its constant
 * size, and two unconstrained scalars (netns_id and flags).
 */
static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
        .func                = bpf_skc_lookup_tcp,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_PTR_TO_SOCK_COMMON_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper: TCP lookup through bpf_sk_lookup(); the resulting socket
 * pointer (or NULL) is returned as an unsigned long.
 */
BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct sock *sk;

        sk = bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);

        return (unsigned long)sk;
}

/* Prototype of the bpf_sk_lookup_tcp() helper: not GPL-only, flagged
 * pkt_access, returns a socket pointer or NULL. Arguments are the program
 * context, a read-only memory region (the tuple) with its constant size, and
 * two unconstrained scalars (netns_id and flags).
 */
static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
        .func                = bpf_sk_lookup_tcp,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper: UDP lookup through bpf_sk_lookup(); the resulting socket
 * pointer (or NULL) is returned as an unsigned long.
 */
BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct sock *sk;

        sk = bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags);

        return (unsigned long)sk;
}

/* Prototype of the bpf_sk_lookup_udp() helper: not GPL-only, flagged
 * pkt_access, returns a socket pointer or NULL. Arguments are the program
 * context, a read-only memory region (the tuple) with its constant size, and
 * two unconstrained scalars (netns_id and flags).
 */
static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
        .func                = bpf_sk_lookup_udp,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper: TCP lookup through __bpf_skc_lookup(), with the namespace,
 * ifindex and sdif (via dev_sdif()) all taken from skb->dev. The device
 * pointer is dereferenced without a NULL check.
 */
BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net_device *dev = skb->dev;
        struct sock *sk;

        sk = __bpf_skc_lookup(skb, tuple, len, dev_net(dev), dev->ifindex,
                              IPPROTO_TCP, netns_id, flags, dev_sdif(dev));

        return (unsigned long)sk;
}

/* Prototype of the bpf_tc_skc_lookup_tcp() helper: not GPL-only, flagged
 * pkt_access, returns a sock_common pointer or NULL. Arguments are the
 * program context, a read-only memory region (the tuple) with its constant
 * size, and two unconstrained scalars (netns_id and flags).
 */
static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = {
        .func                = bpf_tc_skc_lookup_tcp,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_PTR_TO_SOCK_COMMON_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper: TCP lookup through __bpf_sk_lookup(), with the namespace,
 * ifindex and sdif (via dev_sdif()) all taken from skb->dev. The device
 * pointer is dereferenced without a NULL check.
 */
BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net_device *dev = skb->dev;
        struct sock *sk;

        sk = __bpf_sk_lookup(skb, tuple, len, dev_net(dev), dev->ifindex,
                             IPPROTO_TCP, netns_id, flags, dev_sdif(dev));

        return (unsigned long)sk;
}

/* Prototype of the bpf_tc_sk_lookup_tcp() helper: not GPL-only, flagged
 * pkt_access, returns a socket pointer or NULL. Arguments are the program
 * context, a read-only memory region (the tuple) with its constant size, and
 * two unconstrained scalars (netns_id and flags).
 */
static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = {
        .func                = bpf_tc_sk_lookup_tcp,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper: UDP lookup through __bpf_sk_lookup(), with the namespace,
 * ifindex and sdif (via dev_sdif()) all taken from skb->dev. The device
 * pointer is dereferenced without a NULL check.
 */
BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net_device *dev = skb->dev;
        struct sock *sk;

        sk = __bpf_sk_lookup(skb, tuple, len, dev_net(dev), dev->ifindex,
                             IPPROTO_UDP, netns_id, flags, dev_sdif(dev));

        return (unsigned long)sk;
}

/* Prototype of the bpf_tc_sk_lookup_udp() helper: not GPL-only, flagged
 * pkt_access, returns a socket pointer or NULL. Arguments are the program
 * context, a read-only memory region (the tuple) with its constant size, and
 * two unconstrained scalars (netns_id and flags).
 */
static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = {
        .func                = bpf_tc_sk_lookup_udp,
        .gpl_only        = false,
        .pkt_access        = true,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper: drop the reference on @sk with sock_gen_put(), but only when
 * @sk is non-NULL and sk_is_refcounted() says it holds one. Always returns 0.
 */
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
        if (!sk || !sk_is_refcounted(sk))
                return 0;

        sock_gen_put(sk);

        return 0;
}

/* Prototype of the bpf_sk_release() helper: not GPL-only, returns an
 * integer, and takes a single BTF-identified sock_common pointer that is
 * marked OBJ_RELEASE.
 */
static const struct bpf_func_proto bpf_sk_release_proto = {
        .func                = bpf_sk_release,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE,
};

/* BPF helper for XDP: UDP lookup through __bpf_sk_lookup() with no skb. The
 * namespace, ifindex and sdif (via dev_sdif()) come from the receive queue's
 * device, ctx->rxq->dev.
 */
BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
        struct net_device *dev = ctx->rxq->dev;
        struct sock *sk;

        sk = __bpf_sk_lookup(NULL, tuple, len, dev_net(dev), dev->ifindex,
                             IPPROTO_UDP, netns_id, flags, dev_sdif(dev));

        return (unsigned long)sk;
}

/* Prototype of the bpf_xdp_sk_lookup_udp() helper: not GPL-only, flagged
 * pkt_access, returns a socket pointer or NULL. Arguments are the program
 * context, a read-only memory region (the tuple) with its constant size, and
 * two unconstrained scalars (netns_id and flags).
 */
static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
        .func           = bpf_xdp_sk_lookup_udp,
        .gpl_only       = false,
        .pkt_access     = true,
        .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type      = ARG_CONST_SIZE,
        .arg4_type      = ARG_ANYTHING,
        .arg5_type      = ARG_ANYTHING,
};

/* BPF helper for XDP: TCP lookup through __bpf_skc_lookup() with no skb. The
 * namespace, ifindex and sdif (via dev_sdif()) come from the receive queue's
 * device, ctx->rxq->dev.
 */
BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
        struct net_device *dev = ctx->rxq->dev;
        struct sock *sk;

        sk = __bpf_skc_lookup(NULL, tuple, len, dev_net(dev), dev->ifindex,
                              IPPROTO_TCP, netns_id, flags, dev_sdif(dev));

        return (unsigned long)sk;
}

/* Prototype of the bpf_xdp_skc_lookup_tcp() helper: not GPL-only, flagged
 * pkt_access, returns a sock_common pointer or NULL. Arguments are the
 * program context, a read-only memory region (the tuple) with its constant
 * size, and two unconstrained scalars (netns_id and flags).
 */
static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
        .func           = bpf_xdp_skc_lookup_tcp,
        .gpl_only       = false,
        .pkt_access     = true,
        .ret_type       = RET_PTR_TO_SOCK_COMMON_OR_NULL,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type      = ARG_CONST_SIZE,
        .arg4_type      = ARG_ANYTHING,
        .arg5_type      = ARG_ANYTHING,
};

/* BPF helper for XDP: TCP lookup through __bpf_sk_lookup() with no skb. The
 * namespace, ifindex and sdif (via dev_sdif()) come from the receive queue's
 * device, ctx->rxq->dev.
 */
BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
        struct net_device *dev = ctx->rxq->dev;
        struct sock *sk;

        sk = __bpf_sk_lookup(NULL, tuple, len, dev_net(dev), dev->ifindex,
                             IPPROTO_TCP, netns_id, flags, dev_sdif(dev));

        return (unsigned long)sk;
}

/* Prototype of the bpf_xdp_sk_lookup_tcp() helper: not GPL-only, flagged
 * pkt_access, returns a socket pointer or NULL. Arguments are the program
 * context, a read-only memory region (the tuple) with its constant size, and
 * two unconstrained scalars (netns_id and flags).
 */
static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
        .func           = bpf_xdp_sk_lookup_tcp,
        .gpl_only       = false,
        .pkt_access     = true,
        .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type      = ARG_CONST_SIZE,
        .arg4_type      = ARG_ANYTHING,
        .arg5_type      = ARG_ANYTHING,
};

/* BPF helper for sock_addr programs: TCP lookup through __bpf_skc_lookup()
 * with no skb, in the namespace of ctx->sk, with ifindex 0 and sdif -1.
 */
BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net *caller_net = sock_net(ctx->sk);
        struct sock *sk;

        sk = __bpf_skc_lookup(NULL, tuple, len, caller_net, 0, IPPROTO_TCP,
                              netns_id, flags, -1);

        return (unsigned long)sk;
}

/* Prototype of the bpf_sock_addr_skc_lookup_tcp() helper: not GPL-only,
 * returns a sock_common pointer or NULL. Arguments are the program context,
 * a read-only memory region (the tuple) with its constant size, and two
 * unconstrained scalars (netns_id and flags).
 */
static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
        .func                = bpf_sock_addr_skc_lookup_tcp,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_SOCK_COMMON_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper for sock_addr programs: TCP lookup through __bpf_sk_lookup()
 * with no skb, in the namespace of ctx->sk, with ifindex 0 and sdif -1.
 */
BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net *caller_net = sock_net(ctx->sk);
        struct sock *sk;

        sk = __bpf_sk_lookup(NULL, tuple, len, caller_net, 0, IPPROTO_TCP,
                             netns_id, flags, -1);

        return (unsigned long)sk;
}

/* Prototype of the bpf_sock_addr_sk_lookup_tcp() helper: not GPL-only,
 * returns a socket pointer or NULL. Arguments are the program context, a
 * read-only memory region (the tuple) with its constant size, and two
 * unconstrained scalars (netns_id and flags).
 */
static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
        .func                = bpf_sock_addr_sk_lookup_tcp,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* BPF helper for sock_addr programs: UDP lookup through __bpf_sk_lookup()
 * with no skb, in the namespace of ctx->sk, with ifindex 0 and sdif -1.
 */
BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net *caller_net = sock_net(ctx->sk);
        struct sock *sk;

        sk = __bpf_sk_lookup(NULL, tuple, len, caller_net, 0, IPPROTO_UDP,
                             netns_id, flags, -1);

        return (unsigned long)sk;
}

/* Prototype of the bpf_sock_addr_sk_lookup_udp() helper: not GPL-only,
 * returns a socket pointer or NULL. Arguments are the program context, a
 * read-only memory region (the tuple) with its constant size, and two
 * unconstrained scalars (netns_id and flags).
 */
static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
        .func                = bpf_sock_addr_sk_lookup_udp,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
        .arg5_type        = ARG_ANYTHING,
};

/* Decide whether an access of @size bytes at offset @off into
 * struct bpf_tcp_sock is permitted.
 *
 * The offset must lie within the struct up to the end of icsk_retransmits
 * and be a multiple of @size. bytes_received and bytes_acked must be
 * accessed as __u64; every other offset must be accessed as __u32.
 * @type and @info are not consulted.
 */
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                                  struct bpf_insn_access_aux *info)
{
        if (off < 0)
                return false;
        if (off >= offsetofend(struct bpf_tcp_sock, icsk_retransmits))
                return false;
        if (off % size != 0)
                return false;

        if (off == offsetof(struct bpf_tcp_sock, bytes_received) ||
            off == offsetof(struct bpf_tcp_sock, bytes_acked))
                return size == sizeof(__u64);

        return size == sizeof(__u32);
}

/* Translate a load from struct bpf_tcp_sock at si->off into a load from the
 * corresponding field of the kernel socket object.
 *
 * At most one BPF_LDX_MEM instruction is written to @insn_buf, using
 * si->dst_reg and si->src_reg; the number of instructions written is
 * returned (0 when si->off matches no known field). @type, @prog and
 * @target_size are not used.
 */
u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
                                    const struct bpf_insn *si,
                                    struct bpf_insn *insn_buf,
                                    struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

/* Load FIELD from struct tcp_sock; the build fails if the kernel field is
 * wider than its struct bpf_tcp_sock counterpart.
 */
#define BPF_TCP_SOCK_GET_COMMON(FIELD)                                        \
        do {                                                                \
                BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) >        \
                             sizeof_field(struct bpf_tcp_sock, FIELD));        \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\
                                      si->dst_reg, si->src_reg,                \
                                      offsetof(struct tcp_sock, FIELD)); \
        } while (0)

/* Same as above, for fields that live in struct inet_connection_sock. */
#define BPF_INET_SOCK_GET_COMMON(FIELD)                                        \
        do {                                                                \
                BUILD_BUG_ON(sizeof_field(struct inet_connection_sock,        \
                                          FIELD) >                        \
                             sizeof_field(struct bpf_tcp_sock, FIELD));        \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                        \
                                        struct inet_connection_sock,        \
                                        FIELD),                                \
                                      si->dst_reg, si->src_reg,                \
                                      offsetof(                                \
                                        struct inet_connection_sock,        \
                                        FIELD));                        \
        } while (0)

/* One switch arm for a field read straight out of struct tcp_sock. */
#define BPF_TCP_SOCK_CASE(FIELD)                                        \
        case offsetof(struct bpf_tcp_sock, FIELD):                        \
                BPF_TCP_SOCK_GET_COMMON(FIELD);                                \
                break

        BTF_TYPE_EMIT(struct bpf_tcp_sock);

        switch (si->off) {
        case offsetof(struct bpf_tcp_sock, rtt_min):
                /* rtt_min is a struct minmax in the kernel; read the 32-bit
                 * value at offsetof(struct minmax_sample, v) inside it.
                 */
                BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
                             sizeof(struct minmax));
                BUILD_BUG_ON(sizeof(struct minmax) <
                             sizeof(struct minmax_sample));

                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      offsetof(struct tcp_sock, rtt_min) +
                                      offsetof(struct minmax_sample, v));
                break;
        BPF_TCP_SOCK_CASE(snd_cwnd);
        BPF_TCP_SOCK_CASE(srtt_us);
        BPF_TCP_SOCK_CASE(snd_ssthresh);
        BPF_TCP_SOCK_CASE(rcv_nxt);
        BPF_TCP_SOCK_CASE(snd_nxt);
        BPF_TCP_SOCK_CASE(snd_una);
        BPF_TCP_SOCK_CASE(mss_cache);
        BPF_TCP_SOCK_CASE(ecn_flags);
        BPF_TCP_SOCK_CASE(rate_delivered);
        BPF_TCP_SOCK_CASE(rate_interval_us);
        BPF_TCP_SOCK_CASE(packets_out);
        BPF_TCP_SOCK_CASE(retrans_out);
        BPF_TCP_SOCK_CASE(total_retrans);
        BPF_TCP_SOCK_CASE(segs_in);
        BPF_TCP_SOCK_CASE(data_segs_in);
        BPF_TCP_SOCK_CASE(segs_out);
        BPF_TCP_SOCK_CASE(data_segs_out);
        BPF_TCP_SOCK_CASE(lost_out);
        BPF_TCP_SOCK_CASE(sacked_out);
        BPF_TCP_SOCK_CASE(bytes_received);
        BPF_TCP_SOCK_CASE(bytes_acked);
        BPF_TCP_SOCK_CASE(dsack_dups);
        BPF_TCP_SOCK_CASE(delivered);
        BPF_TCP_SOCK_CASE(delivered_ce);
        case offsetof(struct bpf_tcp_sock, icsk_retransmits):
                BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
                break;
        }

#undef BPF_TCP_SOCK_CASE

        return insn - insn_buf;
}

/* BPF helper: hand @sk back when it is a full socket whose protocol is
 * IPPROTO_TCP, otherwise return NULL.
 */
BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
{
        if (!sk_fullsock(sk) || sk->sk_protocol != IPPROTO_TCP)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Prototype of the bpf_tcp_sock() helper (not file-local): not GPL-only,
 * takes a sock_common pointer and returns a tcp_sock pointer or NULL.
 */
const struct bpf_func_proto bpf_tcp_sock_proto = {
        .func                = bpf_tcp_sock,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_TCP_SOCK_OR_NULL,
        .arg1_type        = ARG_PTR_TO_SOCK_COMMON,
};

/* BPF helper: map @sk through sk_to_full_sk() and return the result only
 * when it is in TCP_LISTEN state and flagged SOCK_RCU_FREE; otherwise NULL.
 */
BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
{
        struct sock *full = sk_to_full_sk(sk);

        if (full->sk_state != TCP_LISTEN || !sock_flag(full, SOCK_RCU_FREE))
                return (unsigned long)NULL;

        return (unsigned long)full;
}

/* Prototype of the bpf_get_listener_sock() helper: not GPL-only, takes a
 * sock_common pointer and returns a socket pointer or NULL.
 */
static const struct bpf_func_proto bpf_get_listener_sock_proto = {
        .func                = bpf_get_listener_sock,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_SOCK_COMMON,
};

/* BPF helper: apply INET_ECN_set_ce() to an IPv4 or IPv6 @skb and return its
 * result.
 *
 * Returns 0 without touching the packet when the protocol is neither IPv4
 * nor IPv6, when the linear area is shorter than the fixed IP header, or
 * when the skb is cloned and that header is not writable.
 */
BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
{
        __be16 proto = skb_protocol(skb, true);
        unsigned int hdr_len;

        if (proto == cpu_to_be16(ETH_P_IP))
                hdr_len = sizeof(struct iphdr);
        else if (proto == cpu_to_be16(ETH_P_IPV6))
                hdr_len = sizeof(struct ipv6hdr);
        else
                return 0;

        if (skb_headlen(skb) < hdr_len)
                return 0;

        if (skb_cloned(skb) && !skb_clone_writable(skb, hdr_len))
                return 0;

        return INET_ECN_set_ce(skb);
}

/* Decide whether an access of @size bytes at offset @off into
 * struct bpf_xdp_sock is permitted.
 *
 * The offset must lie within the struct up to the end of queue_id and be a
 * multiple of @size, and the access must be exactly __u32 wide.
 * @type and @info are not consulted.
 */
bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                                  struct bpf_insn_access_aux *info)
{
        if (off < 0)
                return false;
        if (off >= offsetofend(struct bpf_xdp_sock, queue_id))
                return false;
        if (off % size != 0)
                return false;

        return size == sizeof(__u32);
}

/* Translate a load from struct bpf_xdp_sock at si->off into a load from the
 * corresponding field of struct xdp_sock.
 *
 * Only queue_id is handled. The number of instructions written to @insn_buf
 * is returned (0 for any other offset). @type, @prog and @target_size are
 * not used.
 */
u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
                                    const struct bpf_insn *si,
                                    struct bpf_insn *insn_buf,
                                    struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

/* Load FIELD from struct xdp_sock; the build fails if the kernel field is
 * wider than its struct bpf_xdp_sock counterpart.
 */
#define BPF_XDP_SOCK_GET(FIELD)                                                \
        do {                                                                \
                BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) >        \
                             sizeof_field(struct bpf_xdp_sock, FIELD));        \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
                                      si->dst_reg, si->src_reg,                \
                                      offsetof(struct xdp_sock, FIELD)); \
        } while (0)

        if (si->off == offsetof(struct bpf_xdp_sock, queue_id))
                BPF_XDP_SOCK_GET(queue_id);

        return insn - insn_buf;
}

/* Prototype of the bpf_skb_ecn_set_ce() helper: not GPL-only, returns an
 * integer, and takes only the program context.
 */
static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
        .func           = bpf_skb_ecn_set_ce,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* BPF helper: check whether the ACK described by @iph/@th carries a valid
 * SYN cookie for listener @sk.
 *
 * Returns 0 when the cookie check succeeds, otherwise:
 *   -EINVAL          @sk is NULL, a header length is too short, @sk is not
 *                    a listening TCP socket, syncookies are disabled by
 *                    sysctl, or the IP version does not fit the socket
 *   -ENOENT          the segment is not a plain ACK, the listener saw no
 *                    recent SYN queue overflow, or the cookie check failed
 *   -EPROTONOSUPPORT the IP version is not handled
 *   -ENOTSUPP        built without CONFIG_SYN_COOKIES
 */
BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
           struct tcphdr *, th, u32, th_len)
{
#ifdef CONFIG_SYN_COOKIES
        int cookie_ok;

        if (unlikely(!sk || th_len < sizeof(*th)))
                return -EINVAL;

        /* sk_state is compared directly: sk_listener() would also accept
         * TCP_NEW_SYN_RECV, which makes no sense here.
         */
        if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN ||
            !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
                return -EINVAL;

        if (th->syn || th->rst || !th->ack)
                return -ENOENT;

        if (unlikely(iph_len < sizeof(struct iphdr)))
                return -EINVAL;

        if (tcp_synq_no_recent_overflow(sk))
                return -ENOENT;

        /* struct iphdr and struct ipv6hdr keep the version field at the same
         * offset, so the shorter struct iphdr can be used to read it.
         */
        switch (((struct iphdr *)iph)->version) {
        case 4:
                if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
                        return -EINVAL;

                cookie_ok = __cookie_v4_check(iph, th);
                break;

#if IS_BUILTIN(CONFIG_IPV6)
        case 6:
                if (unlikely(iph_len < sizeof(struct ipv6hdr)))
                        return -EINVAL;

                if (sk->sk_family != AF_INET6)
                        return -EINVAL;

                cookie_ok = __cookie_v6_check(iph, th);
                break;
#endif /* CONFIG_IPV6 */

        default:
                return -EPROTONOSUPPORT;
        }

        return cookie_ok > 0 ? 0 : -ENOENT;
#else
        return -ENOTSUPP;
#endif
}

/* Prototype of the bpf_tcp_check_syncookie() helper: GPL-only, flagged
 * pkt_access, returns an integer. Arguments are a BTF-identified sock_common
 * pointer followed by two read-only memory regions, each paired with its
 * constant size (the IP header and the TCP header).
 */
static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
        .func                = bpf_tcp_check_syncookie,
        .gpl_only        = true,
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE,
};

/* BPF helper: generate a SYN cookie for the SYN described by @iph/@th on
 * listener @sk.
 *
 * On success the 32-bit cookie is returned in the low half of the result and
 * the MSS value in the bits above it. Otherwise:
 *   -EINVAL          @sk is NULL, a header length is wrong (@th_len must
 *                    equal th->doff * 4), @sk is not a listening TCP socket,
 *                    the segment is not a plain SYN, or the IP version does
 *                    not fit the socket
 *   -ENOENT          syncookies are disabled by sysctl, or the reported MSS
 *                    is 0
 *   -EPROTONOSUPPORT the IP version is not handled
 *   -EOPNOTSUPP      built without CONFIG_SYN_COOKIES
 */
BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
           struct tcphdr *, th, u32, th_len)
{
#ifdef CONFIG_SYN_COOKIES
        u32 cookie;
        u16 mss;

        if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4))
                return -EINVAL;

        if (sk->sk_state != TCP_LISTEN || sk->sk_protocol != IPPROTO_TCP)
                return -EINVAL;

        if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
                return -ENOENT;

        /* Only a bare SYN qualifies. */
        if (th->ack || th->fin || th->rst || !th->syn)
                return -EINVAL;

        if (unlikely(iph_len < sizeof(struct iphdr)))
                return -EINVAL;

        /* struct iphdr and struct ipv6hdr keep the version field at the same
         * offset, so the shorter struct iphdr can be used to read it.
         */
        switch (((struct iphdr *)iph)->version) {
        case 4:
                if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
                        return -EINVAL;

                mss = tcp_v4_get_syncookie(sk, iph, th, &cookie);
                break;

#if IS_BUILTIN(CONFIG_IPV6)
        case 6:
                if (unlikely(iph_len < sizeof(struct ipv6hdr)))
                        return -EINVAL;

                if (sk->sk_family != AF_INET6)
                        return -EINVAL;

                mss = tcp_v6_get_syncookie(sk, iph, th, &cookie);
                break;
#endif /* CONFIG_IPV6 */

        default:
                return -EPROTONOSUPPORT;
        }

        if (!mss)
                return -ENOENT;

        return ((u64)mss << 32) | cookie;
#else
        return -EOPNOTSUPP;
#endif /* CONFIG_SYN_COOKIES */
}

/* Prototype of the bpf_tcp_gen_syncookie() helper: GPL-only, flagged
 * pkt_access, returns an integer. Arguments are a BTF-identified sock_common
 * pointer followed by two read-only memory regions, each paired with its
 * constant size (the IP header and the TCP header).
 */
static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
        .func                = bpf_tcp_gen_syncookie,
        .gpl_only        = true, /* __cookie_v*_init_sequence() is GPL */
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE,
};

/* BPF helper: attach @sk to @skb.
 *
 * On success the skb is orphaned, skb->sk is set to @sk and sock_pfree is
 * installed as the destructor; 0 is returned. A reference is taken first
 * when sk_is_refcounted() reports that @sk is refcounted.
 *
 * Errors:
 *   -EINVAL      @sk is NULL or @flags is non-zero
 *   -EOPNOTSUPP  the skb is not at tc ingress, or @sk is unhashed
 *   -ENETUNREACH @sk belongs to a different namespace than skb->dev
 *   -ENOENT      the socket's refcount had already dropped to zero
 */
BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
{
        if (!sk || flags)
                return -EINVAL;

        if (!skb_at_tc_ingress(skb))
                return -EOPNOTSUPP;

        if (unlikely(dev_net(skb->dev) != sock_net(sk)))
                return -ENETUNREACH;

        if (sk_unhashed(sk))
                return -EOPNOTSUPP;

        if (sk_is_refcounted(sk)) {
                if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
                        return -ENOENT;
        }

        skb_orphan(skb);
        skb->sk = sk;
        skb->destructor = sock_pfree;

        return 0;
}

/* Verifier-facing signature of bpf_sk_assign(): (ctx, socket, flags). */
static const struct bpf_func_proto bpf_sk_assign_proto = {
        .func = bpf_sk_assign,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg3_type = ARG_ANYTHING,
};

/* Walk the TCP option bytes in [@op, @opend) looking for @search_kind.
 *
 * When @magic_len is non-zero the first @magic_len data bytes of a
 * candidate option must also equal @magic.
 *
 * @eol is set to true only when the walk stopped on TCPOPT_EOL.
 *
 * Return: pointer to the kind byte of the matching option, or
 *         ERR_PTR(-ENOMSG) when nothing matched (including a same-kind
 *         option too short to hold the magic), or
 *         ERR_PTR(-EFAULT) when an option's length byte is inconsistent
 *         with the remaining space.
 */
static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend,
                                    u8 search_kind, const u8 *magic,
                                    u8 magic_len, bool *eol)
{
        const u8 *cur = op;

        *eol = false;

        while (cur < opend) {
                u8 opt_kind = cur[0];
                u8 opt_len;

                switch (opt_kind) {
                case TCPOPT_EOL:
                        *eol = true;
                        return ERR_PTR(-ENOMSG);
                case TCPOPT_NOP:
                        /* Single-byte padding: step over it. */
                        cur++;
                        continue;
                default:
                        break;
                }

                /* Something is wrong in the received header.
                 * Follow the TCP stack's tcp_parse_options()
                 * and just bail here.  The length byte is only
                 * read once it is known to lie inside the buffer.
                 */
                if (opend - cur < 2)
                        return ERR_PTR(-EFAULT);

                opt_len = cur[1];
                if (opend - cur < opt_len || opt_len < 2)
                        return ERR_PTR(-EFAULT);

                if (opt_kind == search_kind) {
                        if (!magic_len)
                                return cur;

                        /* Option payload cannot hold the magic. */
                        if (magic_len > opt_len - 2)
                                return ERR_PTR(-ENOMSG);

                        if (memcmp(&cur[2], magic, magic_len) == 0)
                                return cur;
                }

                cur += opt_len;
        }

        return ERR_PTR(-ENOMSG);
}

/* Look up a TCP header option and copy it into @search_res.
 *
 * On entry @search_res[0] is the kind to search for and @search_res[1]
 * its search length: 4 or 6 for the experimental kinds (a 2 or 4 byte
 * magic follows), 0 for every other kind.  With
 * BPF_LOAD_HDR_OPT_TCP_SYN set the options come from the buffer returned
 * by bpf_sock_ops_get_syn(), otherwise from the current skb.
 *
 * Return: the option's kind-length on success, -ENOSPC when the option
 *         was truncated to @len bytes, -EINVAL on bad arguments, -EPERM
 *         when the current op has no usable skb, or the error reported
 *         by bpf_sock_ops_get_syn() / bpf_search_tcp_opt().
 */
BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
           void *, search_res, u32, len, u64, flags)
{
        const bool from_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN;
        const u8 *opt, *opt_end, *magic = NULL;
        const u8 *req = search_res;
        u8 req_kind, req_len, ncopy, magic_len = 0;
        bool eol;
        int ret;

        if (flags & ~BPF_LOAD_HDR_OPT_TCP_SYN)
                return -EINVAL;

        /* 2 byte is the minimal option len except TCPOPT_NOP and
         * TCPOPT_EOL which are useless for the bpf prog to learn
         * and this helper disallow loading them also.
         */
        if (len < 2)
                return -EINVAL;

        req_kind = req[0];
        req_len = req[1];

        if (req_kind == TCPOPT_NOP || req_kind == TCPOPT_EOL ||
            req_len > len)
                return -EINVAL;

        if (req_kind == TCPOPT_EXP || req_kind == 253) {
                /* 16 or 32 bit magic.  +2 for kind and kind length */
                if (req_len != 4 && req_len != 6)
                        return -EINVAL;
                magic_len = req_len - 2;
                magic = &req[2];
        } else if (req_len) {
                /* Regular kinds are matched by kind alone. */
                return -EINVAL;
        }

        if (from_syn) {
                ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &opt);
                if (ret < 0)
                        return ret;

                opt_end = opt + ret;
                opt += sizeof(struct tcphdr);
        } else {
                /* This bpf_sock->op cannot call this helper */
                if (!bpf_sock->skb ||
                    bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB)
                        return -EPERM;

                opt = bpf_sock->skb->data + sizeof(struct tcphdr);
                opt_end = bpf_sock->skb_data_end;
        }

        opt = bpf_search_tcp_opt(opt, opt_end, req_kind, magic, magic_len,
                                 &eol);
        if (IS_ERR(opt))
                return PTR_ERR(opt);

        /* Copy as much of the option as fits; report truncation. */
        ncopy = opt[1];
        ret = ncopy;
        if (ncopy > len) {
                ncopy = len;
                ret = -ENOSPC;
        }

        memcpy(search_res, opt, ncopy);
        return ret;
}

/* Verifier-facing signature of bpf_sock_ops_load_hdr_opt():
 * (ctx, writable buffer, buffer size, flags).
 */
static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
        .func = bpf_sock_ops_load_hdr_opt,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_MEM,
        .arg3_type = ARG_CONST_SIZE,
        .arg4_type = ARG_ANYTHING,
};

/* Append one TCP header option to the header being built.
 *
 * @bpf_sock: sock_ops context; its op must be BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
 * @from:     option to store, laid out as kind, kind-length, data.
 * @len:      size of the @from buffer, at least 2.
 * @flags:    must be 0.
 *
 * Return: 0 on success,
 *         -EPERM  when called from any other sock_ops op,
 *         -EINVAL on bad arguments (non-zero flags, buffer shorter than
 *                 2 bytes, kind-length below 2 or above @len, NOP/EOL
 *                 kind, experimental kind shorter than 4 bytes),
 *         -ENOSPC when the option does not fit in the remaining option
 *                 space or the options were already terminated by EOL,
 *         -EEXIST when the same option is already present,
 *         -EFAULT when the options written so far are malformed.
 */
BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
           const void *, from, u32, len, u64, flags)
{
        u8 new_kind, new_kind_len, magic_len = 0, *opend;
        const u8 *op, *new_op, *magic = NULL;
        struct sk_buff *skb;
        bool eol;

        if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
                return -EPERM;

        if (len < 2 || flags)
                return -EINVAL;

        new_op = from;
        new_kind = new_op[0];
        new_kind_len = new_op[1];

        /* The kind-length byte counts the kind and length bytes
         * themselves, so anything below 2 cannot describe a valid
         * option.  Accepting it would either store nothing yet report
         * success (0), or emit a lone kind byte (1) that later option
         * walks reject as malformed.
         */
        if (new_kind_len < 2 || new_kind_len > len ||
            new_kind == TCPOPT_NOP || new_kind == TCPOPT_EOL)
                return -EINVAL;

        if (new_kind_len > bpf_sock->remaining_opt_len)
                return -ENOSPC;

        /* 253 is another experimental kind */
        if (new_kind == TCPOPT_EXP || new_kind == 253)  {
                if (new_kind_len < 4)
                        return -EINVAL;
                /* Match for the 2 byte magic also.
                 * RFC 6994: the magic could be 2 or 4 bytes.
                 * Hence, matching by 2 byte only is on the
                 * conservative side but it is the right
                 * thing to do for the 'search-for-duplication'
                 * purpose.
                 */
                magic = &new_op[2];
                magic_len = 2;
        }

        /* Check for duplication */
        skb = bpf_sock->skb;
        op = skb->data + sizeof(struct tcphdr);
        opend = bpf_sock->skb_data_end;

        op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len,
                                &eol);
        if (!IS_ERR(op))
                return -EEXIST;

        /* -ENOMSG means "not found"; anything else is a real error. */
        if (PTR_ERR(op) != -ENOMSG)
                return PTR_ERR(op);

        if (eol)
                /* The option has been ended.  Treat it as no more
                 * header option can be written.
                 */
                return -ENOSPC;

        /* No duplication found.  Store the header option. */
        memcpy(opend, from, new_kind_len);

        /* Account for the bytes just written. */
        bpf_sock->remaining_opt_len -= new_kind_len;
        bpf_sock->skb_data_end += new_kind_len;

        return 0;
}

/* Verifier-facing signature of bpf_sock_ops_store_hdr_opt():
 * (ctx, read-only buffer, buffer size, flags).
 */
static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
        .func = bpf_sock_ops_store_hdr_opt,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type = ARG_CONST_SIZE,
        .arg4_type = ARG_ANYTHING,
};

/* Set aside @len bytes of the remaining TCP option space.
 *
 * Return: 0 on success,
 *         -EPERM  unless the current op is BPF_SOCK_OPS_HDR_OPT_LEN_CB,
 *         -EINVAL if @flags is non-zero or @len is below 2,
 *         -ENOSPC if fewer than @len bytes remain.
 */
BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
           u32, len, u64, flags)
{
        if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB)
                return -EPERM;

        if (len < 2 || flags)
                return -EINVAL;

        if (bpf_sock->remaining_opt_len < len)
                return -ENOSPC;

        bpf_sock->remaining_opt_len -= len;
        return 0;
}

/* Verifier-facing signature of bpf_sock_ops_reserve_hdr_opt():
 * (ctx, length, flags).
 */
static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
        .func = bpf_sock_ops_reserve_hdr_opt,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_ANYTHING,
};

/* Store @tstamp in the skb and record which clock it refers to.
 *
 * Return: 0 on success,
 *         -EOPNOTSUPP for anything but IPv4/IPv6 packets,
 *         -EINVAL for an unknown @tstamp_type, or for a zero @tstamp
 *                 with the monotonic or TAI clock.
 */
BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
           u64, tstamp, u32, tstamp_type)
{
        u8 clock;

        /* skb_clear_delivery_time() is done for inet protocol */
        if (skb->protocol != htons(ETH_P_IP) &&
            skb->protocol != htons(ETH_P_IPV6))
                return -EOPNOTSUPP;

        /* Translate the BPF-visible clock id to the skb one. */
        switch (tstamp_type) {
        case BPF_SKB_CLOCK_REALTIME:
                clock = SKB_CLOCK_REALTIME;
                break;
        case BPF_SKB_CLOCK_MONOTONIC:
                clock = SKB_CLOCK_MONOTONIC;
                break;
        case BPF_SKB_CLOCK_TAI:
                clock = SKB_CLOCK_TAI;
                break;
        default:
                return -EINVAL;
        }

        /* Only the realtime clock accepts a zero timestamp. */
        if (!tstamp && tstamp_type != BPF_SKB_CLOCK_REALTIME)
                return -EINVAL;

        skb->tstamp = tstamp;
        skb->tstamp_type = clock;

        return 0;
}

/* Verifier-facing signature of bpf_skb_set_tstamp():
 * (ctx, timestamp, timestamp type).
 */
static const struct bpf_func_proto bpf_skb_set_tstamp_proto = {
        .func = bpf_skb_set_tstamp,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_ANYTHING,
};

#ifdef CONFIG_SYN_COOKIES
/* Generate an IPv4 SYN cookie straight from raw IP and TCP headers.
 *
 * @th_len must be at least sizeof(struct tcphdr) and equal to the header
 * length announced by th->doff.
 *
 * Return: the cookie in the low 32 bits with the MSS in the bits above
 *         it, or -EINVAL when @th_len fails the checks above.
 */
BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph,
           struct tcphdr *, th, u32, th_len)
{
        u32 cookie;
        u16 mss;

        /* Size check first: th->doff is only read once it passed. */
        if (unlikely(th_len < sizeof(*th)))
                return -EINVAL;
        if (unlikely(th_len != th->doff * 4))
                return -EINVAL;

        /* Use the default MSS when the option parser reports 0. */
        mss = tcp_parse_mss_option(th, 0);
        if (!mss)
                mss = TCP_MSS_DEFAULT;

        cookie = __cookie_v4_init_sequence(iph, th, &mss);

        return ((u64)mss << 32) | cookie;
}

/* Verifier-facing signature of bpf_tcp_raw_gen_syncookie_ipv4():
 * a fixed-size struct iphdr, then a (TCP header, length) pair.
 */
static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = {
        .func = bpf_tcp_raw_gen_syncookie_ipv4,
        /* __cookie_v4_init_sequence() is GPL */
        .gpl_only = true,
        .pkt_access = true,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg1_size = sizeof(struct iphdr),
        .arg2_type = ARG_PTR_TO_MEM,
        .arg3_type = ARG_CONST_SIZE_OR_ZERO,
};

/* Generate an IPv6 SYN cookie straight from raw IPv6 and TCP headers.
 *
 * @th_len must be at least sizeof(struct tcphdr) and equal to the header
 * length announced by th->doff.
 *
 * Return: the cookie in the low 32 bits with the MSS in the bits above
 *         it, -EINVAL when @th_len fails the checks above, or
 *         -EPROTONOSUPPORT when IPv6 is not built in.
 */
BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph,
           struct tcphdr *, th, u32, th_len)
{
#if IS_BUILTIN(CONFIG_IPV6)
        /* Fallback MSS: minimum IPv6 MTU less the fixed IPv6 and TCP
         * header sizes.
         */
        const u16 fallback_mss = IPV6_MIN_MTU - sizeof(struct ipv6hdr) -
                sizeof(struct tcphdr);
        u32 cookie;
        u16 mss;

        /* Size check first: th->doff is only read once it passed. */
        if (unlikely(th_len < sizeof(*th)))
                return -EINVAL;
        if (unlikely(th_len != th->doff * 4))
                return -EINVAL;

        mss = tcp_parse_mss_option(th, 0);
        if (!mss)
                mss = fallback_mss;

        cookie = __cookie_v6_init_sequence(iph, th, &mss);

        return ((u64)mss << 32) | cookie;
#else
        return -EPROTONOSUPPORT;
#endif
}

/* Verifier-facing signature of bpf_tcp_raw_gen_syncookie_ipv6():
 * a fixed-size struct ipv6hdr, then a (TCP header, length) pair.
 */
static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = {
        .func = bpf_tcp_raw_gen_syncookie_ipv6,
        /* __cookie_v6_init_sequence() is GPL */
        .gpl_only = true,
        .pkt_access = true,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg1_size = sizeof(struct ipv6hdr),
        .arg2_type = ARG_PTR_TO_MEM,
        .arg3_type = ARG_CONST_SIZE_OR_ZERO,
};

/* Validate an IPv4 SYN cookie from raw headers.
 *
 * Return: 0 when __cookie_v4_check() yields a positive value,
 *         -EACCES otherwise.
 */
BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph,
           struct tcphdr *, th)
{
        return __cookie_v4_check(iph, th) > 0 ? 0 : -EACCES;
}

/* Verifier-facing signature of bpf_tcp_raw_check_syncookie_ipv4():
 * a fixed-size struct iphdr and a fixed-size struct tcphdr.
 */
static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = {
        .func = bpf_tcp_raw_check_syncookie_ipv4,
        /* __cookie_v4_check is GPL */
        .gpl_only = true,
        .pkt_access = true,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg1_size = sizeof(struct iphdr),
        .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg2_size = sizeof(struct tcphdr),
};

/* Validate an IPv6 SYN cookie from raw headers.
 *
 * Return: 0 when __cookie_v6_check() yields a positive value,
 *         -EACCES otherwise, or -EPROTONOSUPPORT when IPv6 is not
 *         built in.
 */
BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph,
           struct tcphdr *, th)
{
#if IS_BUILTIN(CONFIG_IPV6)
        return __cookie_v6_check(iph, th) > 0 ? 0 : -EACCES;
#else
        return -EPROTONOSUPPORT;
#endif
}

/* Verifier-facing signature of bpf_tcp_raw_check_syncookie_ipv6():
 * a fixed-size struct ipv6hdr and a fixed-size struct tcphdr.
 */
static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = {
        .func = bpf_tcp_raw_check_syncookie_ipv6,
        /* __cookie_v6_check is GPL */
        .gpl_only = true,
        .pkt_access = true,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg1_size = sizeof(struct ipv6hdr),
        .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg2_size = sizeof(struct tcphdr),
};
#endif /* CONFIG_SYN_COOKIES */

#endif /* CONFIG_INET */

/* Tell whether @func is one of the helpers listed below as changing
 * packet data.
 *
 * Return: true when @func matches any listed helper, false otherwise.
 */
bool bpf_helper_changes_pkt_data(void *func)
{
        /* skb helpers */
        if (func == bpf_skb_vlan_push ||
            func == bpf_skb_vlan_pop ||
            func == bpf_skb_store_bytes ||
            func == bpf_skb_change_proto ||
            func == bpf_skb_change_head ||
            func == bpf_skb_change_tail ||
            func == bpf_skb_adjust_room ||
            func == bpf_skb_pull_data ||
            func == bpf_clone_redirect ||
            func == bpf_l3_csum_replace ||
            func == bpf_l4_csum_replace)
                return true;

        /* sk_skb variants */
        if (func == sk_skb_change_head ||
            func == sk_skb_change_tail ||
            func == sk_skb_adjust_room ||
            func == sk_skb_pull_data)
                return true;

        /* XDP helpers */
        if (func == bpf_xdp_adjust_head ||
            func == bpf_xdp_adjust_meta ||
            func == bpf_xdp_adjust_tail)
                return true;

        /* sk_msg helpers */
        if (func == bpf_msg_pull_data ||
            func == bpf_msg_push_data ||
            func == bpf_msg_pop_data)
                return true;

#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
        if (func == bpf_lwt_seg6_store_bytes ||
            func == bpf_lwt_seg6_adjust_srh ||
            func == bpf_lwt_seg6_action)
                return true;
#endif

#ifdef CONFIG_INET
        if (func == bpf_sock_ops_store_hdr_opt)
                return true;
#endif

        /* LWT encap helpers */
        return func == bpf_lwt_in_push_encap ||
               func == bpf_lwt_xmit_push_encap;
}

/* Weak declarations of helper protos referenced by the dispatch tables
 * below.  NOTE(review): presumably the strong definitions live in other
 * files and are only linked in under the matching config - confirm.
 */
const struct bpf_func_proto bpf_event_output_data_proto __weak;
const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak;

/* Resolve @func_id to a helper proto for @prog.
 *
 * Lookup order: cgroup_common_func_proto(), then
 * cgroup_current_func_proto(), then the local table, and finally
 * bpf_base_func_proto() for every id not listed here.
 */
static const struct bpf_func_proto *
sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        /* The second lookup only runs when the first found nothing. */
        proto = cgroup_common_func_proto(func_id, prog);
        if (!proto)
                proto = cgroup_current_func_proto(func_id, prog);
        if (proto)
                return proto;

        switch (func_id) {
        /* Cookies */
        case BPF_FUNC_get_netns_cookie:
                return &bpf_get_netns_cookie_sock_proto;
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_sock_proto;
        /* Socket storage */
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_cg_sock_proto;
        /* Miscellaneous */
        case BPF_FUNC_ktime_get_coarse_ns:
                return &bpf_ktime_get_coarse_ns_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
        default:
                return bpf_base_func_proto(func_id, prog);
        }
}

static const struct bpf_func_proto *
sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *func_proto;

        func_proto = cgroup_common_func_proto(func_id, prog);
        if (func_proto)
                return func_proto;

        func_proto = cgroup_current_func_proto(func_id, prog);
        if (func_proto)
                return func_proto;

        switch (func_id) {
        case BPF_FUNC_bind:
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
                        return &bpf_bind_proto;
                default:
                        return NULL;
                }
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_sock_addr_proto;
        case BPF_FUNC_get_netns_cookie:
                return &bpf_get_netns_cookie_sock_addr_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_sock_addr_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_sock_addr_sk_lookup_udp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_sock_addr_skc_lookup_tcp_proto;
#endif /* CONFIG_INET */
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
        case BPF_FUNC_setsockopt:
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_BIND:
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
                case BPF_CGROUP_UNIX_CONNECT:
                case BPF_CGROUP_UDP4_RECVMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                case BPF_CGROUP_UNIX_RECVMSG:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UNIX_SENDMSG:
                case BPF_CGROUP_INET4_GETPEERNAME:
                case BPF_CGROUP_INET6_GETPEERNAME:
                case BPF_CGROUP_UNIX_GETPEERNAME:
                case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UNIX_GETSOCKNAME:
                        return &bpf_sock_addr_setsockopt_proto;
                default:
                        return NULL;
                }
        case BPF_FUNC_getsockopt:
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_BIND:
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
                case BPF_CGROUP_UNIX_CONNECT:
                case BPF_CGROUP_UDP4_RECVMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                case BPF_CGROUP_UNIX_RECVMSG:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UNIX_SENDMSG:
                case BPF_CGROUP_INET4_GETPEERNAME:
                case BPF_CGROUP_INET6_GETPEERNAME:
                case BPF_CGROUP_UNIX_GETPEERNAME:
                case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UNIX_GETSOCKNAME:
                        return &bpf_sock_addr_getsockopt_proto;
                default:
                        return NULL;
                }
        default:
                return bpf_sk_base_func_proto(func_id, prog);
        }
}

/* Resolve @func_id to a helper proto for @prog; ids not listed here are
 * delegated to bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        /* Packet reads */
        case BPF_FUNC_skb_load_bytes_relative:
                return &bpf_skb_load_bytes_relative_proto;
        case BPF_FUNC_skb_load_bytes:
                return &bpf_skb_load_bytes_proto;
        /* Socket identity */
        case BPF_FUNC_get_socket_uid:
                return &bpf_get_socket_uid_proto;
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_proto;
        /* Event output */
        case BPF_FUNC_perf_event_output:
                return &bpf_skb_event_output_proto;
        default:
                return bpf_sk_base_func_proto(func_id, prog);
        }
}

/* Weak declarations of the socket-storage helper protos used by the
 * dispatch tables in this file.  NOTE(review): presumably overridden by
 * strong definitions elsewhere when socket storage is built - confirm.
 */
const struct bpf_func_proto bpf_sk_storage_get_proto __weak;
const struct bpf_func_proto bpf_sk_storage_delete_proto __weak;

/* Resolve @func_id to a helper proto for @prog.
 *
 * cgroup_common_func_proto() is consulted first, then the local table;
 * everything else is delegated to sk_filter_func_proto().
 */
static const struct bpf_func_proto *
cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        proto = cgroup_common_func_proto(func_id, prog);
        if (proto)
                return proto;

        switch (func_id) {
        /* Socket storage and full-socket access */
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
        case BPF_FUNC_sk_fullsock:
                return &bpf_sk_fullsock_proto;
        /* Event output */
        case BPF_FUNC_perf_event_output:
                return &bpf_skb_event_output_proto;
#ifdef CONFIG_SOCK_CGROUP_DATA
        /* Cgroup ids of the skb and of the socket */
        case BPF_FUNC_skb_cgroup_id:
                return &bpf_skb_cgroup_id_proto;
        case BPF_FUNC_sk_cgroup_id:
                return &bpf_sk_cgroup_id_proto;
        case BPF_FUNC_skb_ancestor_cgroup_id:
                return &bpf_skb_ancestor_cgroup_id_proto;
        case BPF_FUNC_sk_ancestor_cgroup_id:
                return &bpf_sk_ancestor_cgroup_id_proto;
#endif
#ifdef CONFIG_INET
        /* Socket lookup and release */
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_sk_lookup_udp_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_skc_lookup_tcp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        /* TCP socket access */
        case BPF_FUNC_tcp_sock:
                return &bpf_tcp_sock_proto;
        case BPF_FUNC_get_listener_sock:
                return &bpf_get_listener_sock_proto;
        /* ECN marking */
        case BPF_FUNC_skb_ecn_set_ce:
                return &bpf_skb_ecn_set_ce_proto;
#endif
        default:
                return sk_filter_func_proto(func_id, prog);
        }
}

/* Resolve @func_id to a helper proto for @prog.
 *
 * Each case returns the address of the matching proto, except the two
 * tunnel "set" helpers, which are resolved through
 * bpf_get_skb_set_tunnel_proto().  Ids that are not listed (or whose
 * config section is compiled out) fall through to
 * bpf_sk_base_func_proto().
 *
 * NOTE(review): judging by the name this is the table for tc
 * classifier/action programs - confirm against the verifier ops that
 * reference it.
 */
static const struct bpf_func_proto *
tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_skb_store_bytes:
                return &bpf_skb_store_bytes_proto;
        case BPF_FUNC_skb_load_bytes:
                return &bpf_skb_load_bytes_proto;
        case BPF_FUNC_skb_load_bytes_relative:
                return &bpf_skb_load_bytes_relative_proto;
        case BPF_FUNC_skb_pull_data:
                return &bpf_skb_pull_data_proto;
        case BPF_FUNC_csum_diff:
                return &bpf_csum_diff_proto;
        case BPF_FUNC_csum_update:
                return &bpf_csum_update_proto;
        case BPF_FUNC_csum_level:
                return &bpf_csum_level_proto;
        case BPF_FUNC_l3_csum_replace:
                return &bpf_l3_csum_replace_proto;
        case BPF_FUNC_l4_csum_replace:
                return &bpf_l4_csum_replace_proto;
        case BPF_FUNC_clone_redirect:
                return &bpf_clone_redirect_proto;
        case BPF_FUNC_get_cgroup_classid:
                return &bpf_get_cgroup_classid_proto;
        case BPF_FUNC_skb_vlan_push:
                return &bpf_skb_vlan_push_proto;
        case BPF_FUNC_skb_vlan_pop:
                return &bpf_skb_vlan_pop_proto;
        case BPF_FUNC_skb_change_proto:
                return &bpf_skb_change_proto_proto;
        case BPF_FUNC_skb_change_type:
                return &bpf_skb_change_type_proto;
        case BPF_FUNC_skb_adjust_room:
                return &bpf_skb_adjust_room_proto;
        case BPF_FUNC_skb_change_tail:
                return &bpf_skb_change_tail_proto;
        case BPF_FUNC_skb_change_head:
                return &bpf_skb_change_head_proto;
        case BPF_FUNC_skb_get_tunnel_key:
                return &bpf_skb_get_tunnel_key_proto;
        case BPF_FUNC_skb_set_tunnel_key:
                /* Resolved by a function call rather than a fixed proto. */
                return bpf_get_skb_set_tunnel_proto(func_id);
        case BPF_FUNC_skb_get_tunnel_opt:
                return &bpf_skb_get_tunnel_opt_proto;
        case BPF_FUNC_skb_set_tunnel_opt:
                /* Resolved by a function call rather than a fixed proto. */
                return bpf_get_skb_set_tunnel_proto(func_id);
        case BPF_FUNC_redirect:
                return &bpf_redirect_proto;
        case BPF_FUNC_redirect_neigh:
                return &bpf_redirect_neigh_proto;
        case BPF_FUNC_redirect_peer:
                return &bpf_redirect_peer_proto;
        case BPF_FUNC_get_route_realm:
                return &bpf_get_route_realm_proto;
        case BPF_FUNC_get_hash_recalc:
                return &bpf_get_hash_recalc_proto;
        case BPF_FUNC_set_hash_invalid:
                return &bpf_set_hash_invalid_proto;
        case BPF_FUNC_set_hash:
                return &bpf_set_hash_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_skb_event_output_proto;
        case BPF_FUNC_get_smp_processor_id:
                return &bpf_get_smp_processor_id_proto;
        case BPF_FUNC_skb_under_cgroup:
                return &bpf_skb_under_cgroup_proto;
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_proto;
        case BPF_FUNC_get_socket_uid:
                return &bpf_get_socket_uid_proto;
        case BPF_FUNC_fib_lookup:
                return &bpf_skb_fib_lookup_proto;
        case BPF_FUNC_check_mtu:
                return &bpf_skb_check_mtu_proto;
        case BPF_FUNC_sk_fullsock:
                return &bpf_sk_fullsock_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
#ifdef CONFIG_XFRM
        case BPF_FUNC_skb_get_xfrm_state:
                return &bpf_skb_get_xfrm_state_proto;
#endif
#ifdef CONFIG_CGROUP_NET_CLASSID
        case BPF_FUNC_skb_cgroup_classid:
                return &bpf_skb_cgroup_classid_proto;
#endif
#ifdef CONFIG_SOCK_CGROUP_DATA
        case BPF_FUNC_skb_cgroup_id:
                return &bpf_skb_cgroup_id_proto;
        case BPF_FUNC_skb_ancestor_cgroup_id:
                return &bpf_skb_ancestor_cgroup_id_proto;
#endif
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_tc_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_tc_sk_lookup_udp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        case BPF_FUNC_tcp_sock:
                return &bpf_tcp_sock_proto;
        case BPF_FUNC_get_listener_sock:
                return &bpf_get_listener_sock_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_tc_skc_lookup_tcp_proto;
        case BPF_FUNC_tcp_check_syncookie:
                return &bpf_tcp_check_syncookie_proto;
        case BPF_FUNC_skb_ecn_set_ce:
                return &bpf_skb_ecn_set_ce_proto;
        case BPF_FUNC_tcp_gen_syncookie:
                return &bpf_tcp_gen_syncookie_proto;
        case BPF_FUNC_sk_assign:
                return &bpf_sk_assign_proto;
        case BPF_FUNC_skb_set_tstamp:
                return &bpf_skb_set_tstamp_proto;
#ifdef CONFIG_SYN_COOKIES
        /* The raw syncookie helpers need both CONFIG_INET and
         * CONFIG_SYN_COOKIES.
         */
        case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
                return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
        case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
                return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
        case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
                return &bpf_tcp_raw_check_syncookie_ipv4_proto;
        case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
                return &bpf_tcp_raw_check_syncookie_ipv6_proto;
#endif
#endif
        default:
                return bpf_sk_base_func_proto(func_id, prog);
        }
}

/* Resolve @func_id to a helper proto for @prog.
 *
 * Ids that are not listed (or whose config section is compiled out)
 * fall through to bpf_sk_base_func_proto().  Every path through the
 * switch returns, so the BTF_TYPE_EMIT() statement after it is never
 * reached at run time.
 */
static const struct bpf_func_proto *
xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_perf_event_output:
                return &bpf_xdp_event_output_proto;
        case BPF_FUNC_get_smp_processor_id:
                return &bpf_get_smp_processor_id_proto;
        case BPF_FUNC_csum_diff:
                return &bpf_csum_diff_proto;
        case BPF_FUNC_xdp_adjust_head:
                return &bpf_xdp_adjust_head_proto;
        case BPF_FUNC_xdp_adjust_meta:
                return &bpf_xdp_adjust_meta_proto;
        case BPF_FUNC_redirect:
                return &bpf_xdp_redirect_proto;
        case BPF_FUNC_redirect_map:
                return &bpf_xdp_redirect_map_proto;
        case BPF_FUNC_xdp_adjust_tail:
                return &bpf_xdp_adjust_tail_proto;
        case BPF_FUNC_xdp_get_buff_len:
                return &bpf_xdp_get_buff_len_proto;
        case BPF_FUNC_xdp_load_bytes:
                return &bpf_xdp_load_bytes_proto;
        case BPF_FUNC_xdp_store_bytes:
                return &bpf_xdp_store_bytes_proto;
        case BPF_FUNC_fib_lookup:
                return &bpf_xdp_fib_lookup_proto;
        case BPF_FUNC_check_mtu:
                return &bpf_xdp_check_mtu_proto;
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_xdp_sk_lookup_udp_proto;
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_xdp_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_xdp_skc_lookup_tcp_proto;
        case BPF_FUNC_tcp_check_syncookie:
                return &bpf_tcp_check_syncookie_proto;
        case BPF_FUNC_tcp_gen_syncookie:
                return &bpf_tcp_gen_syncookie_proto;
#ifdef CONFIG_SYN_COOKIES
        /* The raw syncookie helpers need both CONFIG_INET and
         * CONFIG_SYN_COOKIES.
         */
        case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
                return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
        case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
                return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
        case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
                return &bpf_tcp_raw_check_syncookie_ipv4_proto;
        case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
                return &bpf_tcp_raw_check_syncookie_ipv6_proto;
#endif
#endif
        default:
                return bpf_sk_base_func_proto(func_id, prog);
        }

#if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)
        /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The
         * kfuncs are defined in two different modules, and we want to be able
         * to use them interchangeably with the same BTF type ID. Because modules
         * can't de-duplicate BTF IDs between each other, we need the type to be
         * referenced in the vmlinux BTF or the verifier will get confused about
         * the different types. So we add this dummy type reference which will
         * be included in vmlinux BTF, allowing both modules to refer to the
         * same type ID.
         */
        BTF_TYPE_EMIT(struct nf_conn___init);
#endif
}

/* Weak declarations of the sockmap/sockhash update helper protos.
 * NOTE(review): presumably overridden by strong definitions elsewhere
 * when sockmap support is built - confirm.
 */
const struct bpf_func_proto bpf_sock_map_update_proto __weak;
const struct bpf_func_proto bpf_sock_hash_update_proto __weak;

/* Resolve @func_id to a helper proto for @prog.
 *
 * cgroup_common_func_proto() is consulted first, then the local table;
 * everything else is delegated to bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        proto = cgroup_common_func_proto(func_id, prog);
        if (proto)
                return proto;

        switch (func_id) {
        /* Socket options and callback flags */
        case BPF_FUNC_getsockopt:
                return &bpf_sock_ops_getsockopt_proto;
        case BPF_FUNC_setsockopt:
                return &bpf_sock_ops_setsockopt_proto;
        case BPF_FUNC_sock_ops_cb_flags_set:
                return &bpf_sock_ops_cb_flags_set_proto;
        /* Sockmap / sockhash updates */
        case BPF_FUNC_sock_hash_update:
                return &bpf_sock_hash_update_proto;
        case BPF_FUNC_sock_map_update:
                return &bpf_sock_map_update_proto;
        /* Cookies */
        case BPF_FUNC_get_netns_cookie:
                return &bpf_get_netns_cookie_sock_ops_proto;
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_sock_ops_proto;
        /* Socket storage */
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        /* Event output */
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
#ifdef CONFIG_INET
        /* TCP header options and TCP socket access */
        case BPF_FUNC_reserve_hdr_opt:
                return &bpf_sock_ops_reserve_hdr_opt_proto;
        case BPF_FUNC_store_hdr_opt:
                return &bpf_sock_ops_store_hdr_opt_proto;
        case BPF_FUNC_load_hdr_opt:
                return &bpf_sock_ops_load_hdr_opt_proto;
        case BPF_FUNC_tcp_sock:
                return &bpf_tcp_sock_proto;
#endif /* CONFIG_INET */
        default:
                return bpf_sk_base_func_proto(func_id, prog);
        }
}

/* Weak definitions of the message redirect helper prototypes that
 * sk_msg_func_proto() hands out.
 * NOTE(review): presumably overridden by strong definitions elsewhere when
 * the providing code is built -- confirm against the defining file.
 */
const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;

/* Helper prototype lookup for sk_msg.
 *
 * Helper IDs listed below map to their dedicated prototypes; every other
 * ID is delegated to bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_msg_redirect_map:
                proto = &bpf_msg_redirect_map_proto;
                break;
        case BPF_FUNC_msg_redirect_hash:
                proto = &bpf_msg_redirect_hash_proto;
                break;
        case BPF_FUNC_msg_apply_bytes:
                proto = &bpf_msg_apply_bytes_proto;
                break;
        case BPF_FUNC_msg_cork_bytes:
                proto = &bpf_msg_cork_bytes_proto;
                break;
        case BPF_FUNC_msg_pull_data:
                proto = &bpf_msg_pull_data_proto;
                break;
        case BPF_FUNC_msg_push_data:
                proto = &bpf_msg_push_data_proto;
                break;
        case BPF_FUNC_msg_pop_data:
                proto = &bpf_msg_pop_data_proto;
                break;
        case BPF_FUNC_perf_event_output:
                proto = &bpf_event_output_data_proto;
                break;
        case BPF_FUNC_get_current_uid_gid:
                proto = &bpf_get_current_uid_gid_proto;
                break;
        case BPF_FUNC_sk_storage_get:
                proto = &bpf_sk_storage_get_proto;
                break;
        case BPF_FUNC_sk_storage_delete:
                proto = &bpf_sk_storage_delete_proto;
                break;
        case BPF_FUNC_get_netns_cookie:
                proto = &bpf_get_netns_cookie_sk_msg_proto;
                break;
#ifdef CONFIG_CGROUP_NET_CLASSID
        case BPF_FUNC_get_cgroup_classid:
                proto = &bpf_get_cgroup_classid_curr_proto;
                break;
#endif
        default:
                proto = bpf_sk_base_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Weak definitions of the socket redirect helper prototypes that
 * sk_skb_func_proto() hands out.
 * NOTE(review): presumably overridden by strong definitions elsewhere when
 * the providing code is built -- confirm against the defining file.
 */
const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;

/* Helper prototype lookup for sk_skb.
 *
 * Note that the pull_data/change_tail/change_head/adjust_room helpers
 * resolve to the sk_skb_* variants of those prototypes. IDs not listed
 * here are delegated to bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_skb_store_bytes:
                proto = &bpf_skb_store_bytes_proto;
                break;
        case BPF_FUNC_skb_load_bytes:
                proto = &bpf_skb_load_bytes_proto;
                break;
        case BPF_FUNC_skb_pull_data:
                proto = &sk_skb_pull_data_proto;
                break;
        case BPF_FUNC_skb_change_tail:
                proto = &sk_skb_change_tail_proto;
                break;
        case BPF_FUNC_skb_change_head:
                proto = &sk_skb_change_head_proto;
                break;
        case BPF_FUNC_skb_adjust_room:
                proto = &sk_skb_adjust_room_proto;
                break;
        case BPF_FUNC_get_socket_cookie:
                proto = &bpf_get_socket_cookie_proto;
                break;
        case BPF_FUNC_get_socket_uid:
                proto = &bpf_get_socket_uid_proto;
                break;
        case BPF_FUNC_sk_redirect_map:
                proto = &bpf_sk_redirect_map_proto;
                break;
        case BPF_FUNC_sk_redirect_hash:
                proto = &bpf_sk_redirect_hash_proto;
                break;
        case BPF_FUNC_perf_event_output:
                proto = &bpf_skb_event_output_proto;
                break;
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_tcp:
                proto = &bpf_sk_lookup_tcp_proto;
                break;
        case BPF_FUNC_sk_lookup_udp:
                proto = &bpf_sk_lookup_udp_proto;
                break;
        case BPF_FUNC_sk_release:
                proto = &bpf_sk_release_proto;
                break;
        case BPF_FUNC_skc_lookup_tcp:
                proto = &bpf_skc_lookup_tcp_proto;
                break;
#endif
        default:
                proto = bpf_sk_base_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Helper prototype lookup for the flow dissector: skb_load_bytes gets a
 * dedicated prototype, all other IDs go to bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        if (func_id == BPF_FUNC_skb_load_bytes)
                return &bpf_flow_dissector_load_bytes_proto;

        return bpf_sk_base_func_proto(func_id, prog);
}

/* Helper prototype lookup for lwt_out. It also serves as the fallback table
 * for lwt_in_func_proto(), lwt_xmit_func_proto() and
 * lwt_seg6local_func_proto(). IDs not listed here are delegated to
 * bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_skb_load_bytes:
                proto = &bpf_skb_load_bytes_proto;
                break;
        case BPF_FUNC_skb_pull_data:
                proto = &bpf_skb_pull_data_proto;
                break;
        case BPF_FUNC_csum_diff:
                proto = &bpf_csum_diff_proto;
                break;
        case BPF_FUNC_get_cgroup_classid:
                proto = &bpf_get_cgroup_classid_proto;
                break;
        case BPF_FUNC_get_route_realm:
                proto = &bpf_get_route_realm_proto;
                break;
        case BPF_FUNC_get_hash_recalc:
                proto = &bpf_get_hash_recalc_proto;
                break;
        case BPF_FUNC_perf_event_output:
                proto = &bpf_skb_event_output_proto;
                break;
        case BPF_FUNC_get_smp_processor_id:
                proto = &bpf_get_smp_processor_id_proto;
                break;
        case BPF_FUNC_skb_under_cgroup:
                proto = &bpf_skb_under_cgroup_proto;
                break;
        default:
                proto = bpf_sk_base_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Helper prototype lookup for lwt_in: adds the lwt_in flavour of
 * lwt_push_encap on top of everything lwt_out_func_proto() provides.
 */
static const struct bpf_func_proto *
lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        if (func_id == BPF_FUNC_lwt_push_encap)
                return &bpf_lwt_in_push_encap_proto;

        return lwt_out_func_proto(func_id, prog);
}

/* Helper prototype lookup for lwt_xmit.
 *
 * Both tunnel setter IDs are resolved through
 * bpf_get_skb_set_tunnel_proto(); IDs not listed here fall back to
 * lwt_out_func_proto().
 */
static const struct bpf_func_proto *
lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_skb_set_tunnel_key:
        case BPF_FUNC_skb_set_tunnel_opt:
                proto = bpf_get_skb_set_tunnel_proto(func_id);
                break;
        case BPF_FUNC_skb_get_tunnel_key:
                proto = &bpf_skb_get_tunnel_key_proto;
                break;
        case BPF_FUNC_skb_get_tunnel_opt:
                proto = &bpf_skb_get_tunnel_opt_proto;
                break;
        case BPF_FUNC_redirect:
                proto = &bpf_redirect_proto;
                break;
        case BPF_FUNC_clone_redirect:
                proto = &bpf_clone_redirect_proto;
                break;
        case BPF_FUNC_skb_change_tail:
                proto = &bpf_skb_change_tail_proto;
                break;
        case BPF_FUNC_skb_change_head:
                proto = &bpf_skb_change_head_proto;
                break;
        case BPF_FUNC_skb_store_bytes:
                proto = &bpf_skb_store_bytes_proto;
                break;
        case BPF_FUNC_csum_update:
                proto = &bpf_csum_update_proto;
                break;
        case BPF_FUNC_csum_level:
                proto = &bpf_csum_level_proto;
                break;
        case BPF_FUNC_l3_csum_replace:
                proto = &bpf_l3_csum_replace_proto;
                break;
        case BPF_FUNC_l4_csum_replace:
                proto = &bpf_l4_csum_replace_proto;
                break;
        case BPF_FUNC_set_hash_invalid:
                proto = &bpf_set_hash_invalid_proto;
                break;
        case BPF_FUNC_lwt_push_encap:
                proto = &bpf_lwt_xmit_push_encap_proto;
                break;
        default:
                proto = lwt_out_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Helper prototype lookup for lwt_seg6local.
 *
 * The seg6 helpers are only offered when CONFIG_IPV6_SEG6_BPF is enabled;
 * everything else falls back to lwt_out_func_proto().
 */
static const struct bpf_func_proto *
lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
        case BPF_FUNC_lwt_seg6_store_bytes:
                proto = &bpf_lwt_seg6_store_bytes_proto;
                break;
        case BPF_FUNC_lwt_seg6_action:
                proto = &bpf_lwt_seg6_action_proto;
                break;
        case BPF_FUNC_lwt_seg6_adjust_srh:
                proto = &bpf_lwt_seg6_adjust_srh_proto;
                break;
#endif
        default:
                proto = lwt_out_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Common validation of a context access to struct __sk_buff.
 *
 * @off/@size describe the access, @type says whether it is a read or a
 * write. On success @info may be updated with the register type (for the
 * sk pointer) or the recorded field size (for narrow reads).
 *
 * Returns true when the access is permitted by the generic rules; callers
 * apply their own per-program-type restrictions before getting here.
 */
static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
                                    const struct bpf_prog *prog,
                                    struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        if (off < 0 || off >= sizeof(struct __sk_buff))
                return false;

        /* The verifier guarantees that size > 0. */
        if (off % size != 0)
                return false;

        switch (off) {
        case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                /* Any size is fine as long as it stays inside cb[]. */
                return off + size <= offsetofend(struct __sk_buff, cb[4]);
        case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]):
        case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]):
        case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
        case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
        case bpf_ctx_range(struct __sk_buff, data):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, data_end):
                /* Full-width accesses only. */
                return size == size_default;
        case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
                return false;
        case bpf_ctx_range(struct __sk_buff, hwtstamp):
                /* Read-only 64-bit field. */
                return type != BPF_WRITE && size == sizeof(__u64);
        case bpf_ctx_range(struct __sk_buff, tstamp):
                return size == sizeof(__u64);
        case offsetof(struct __sk_buff, sk):
                if (type == BPF_WRITE || size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
                return true;
        case offsetof(struct __sk_buff, tstamp_type):
                return false;
        case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1:
                /* Explicitly prohibit access to padding in __sk_buff. */
                return false;
        default:
                /* Only narrow read access allowed for now. */
                if (type == BPF_WRITE)
                        return size == size_default;

                bpf_ctx_record_field_size(info, size_default);
                return bpf_ctx_narrow_access_ok(off, size, size_default);
        }
}

/* Context access validation for sk_filter on top of the generic
 * __sk_buff rules: a set of fields is hidden entirely, and writes are
 * confined to cb[0..4].
 */
static bool sk_filter_is_valid_access(int off, int size,
                                      enum bpf_access_type type,
                                      const struct bpf_prog *prog,
                                      struct bpf_insn_access_aux *info)
{
        /* Fields that are rejected for both reads and writes. */
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range(struct __sk_buff, data):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, data_end):
        case bpf_ctx_range_till(struct __sk_buff, family, local_port):
        case bpf_ctx_range(struct __sk_buff, tstamp):
        case bpf_ctx_range(struct __sk_buff, wire_len):
        case bpf_ctx_range(struct __sk_buff, hwtstamp):
                return false;
        }

        if (type != BPF_WRITE)
                return bpf_skb_is_valid_access(off, size, type, prog, info);

        /* Writes: nothing but the cb[] area may be modified. */
        switch (off) {
        case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                return bpf_skb_is_valid_access(off, size, type, prog, info);
        default:
                return false;
        }
}

/* Context access validation for cg_skb on top of the generic __sk_buff
 * rules in bpf_skb_is_valid_access().
 *
 * The checks run in three stages: fields that are hidden or gated on
 * CAP_BPF, the list of writable fields, and finally the register type
 * assignment for the packet pointers. The register type is only assigned
 * once the earlier stages have accepted the access.
 */
static bool cg_skb_is_valid_access(int off, int size,
                                   enum bpf_access_type type,
                                   const struct bpf_prog *prog,
                                   struct bpf_insn_access_aux *info)
{
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, wire_len):
                /* Never accessible here. */
                return false;
        case bpf_ctx_range(struct __sk_buff, data):
        case bpf_ctx_range(struct __sk_buff, data_end):
                /* Packet pointers require CAP_BPF via the program's token. */
                if (!bpf_token_capable(prog->aux->token, CAP_BPF))
                        return false;
                break;
        }

        if (type == BPF_WRITE) {
                switch (off) {
                case bpf_ctx_range(struct __sk_buff, mark):
                case bpf_ctx_range(struct __sk_buff, priority):
                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                        break;
                case bpf_ctx_range(struct __sk_buff, tstamp):
                        /* Writing tstamp additionally requires CAP_BPF. */
                        if (!bpf_token_capable(prog->aux->token, CAP_BPF))
                                return false;
                        break;
                default:
                        /* Every other field is read-only. */
                        return false;
                }
        }

        /* Tell the verifier what kind of pointer these loads produce. */
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

static bool lwt_is_valid_access(int off, int size,
                                enum bpf_access_type type,
                                const struct bpf_prog *prog,
                                struct bpf_insn_access_aux *info)
{
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range_till(struct __sk_buff, family, local_port):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, tstamp):
        case bpf_ctx_range(struct __sk_buff, wire_len):
        case bpf_ctx_range(struct __sk_buff, hwtstamp):
                return false;
        }

        if (type == BPF_WRITE) {
                switch (off) {
                case bpf_ctx_range(struct __sk_buff, mark):
                case bpf_ctx_range(struct __sk_buff, priority):
                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                        break;
                default:
                        return false;
                }
        }

        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/* Attach type specific access rules for struct bpf_sock fields.
 *
 * - bound_dev_if, mark, priority: read/write, but only for the
 *   INET_SOCK_CREATE and INET_SOCK_RELEASE attach types.
 * - src_ip4 / src_ip6 / src_port: read-only, and only for the matching
 *   POST_BIND attach type(s).
 * - any other offset: read-only regardless of attach type.
 */
static bool __sock_filter_check_attach_type(int off,
                                            enum bpf_access_type access_type,
                                            enum bpf_attach_type attach_type)
{
        const bool is_read = access_type == BPF_READ;

        switch (off) {
        case offsetof(struct bpf_sock, bound_dev_if):
        case offsetof(struct bpf_sock, mark):
        case offsetof(struct bpf_sock, priority):
                return attach_type == BPF_CGROUP_INET_SOCK_CREATE ||
                       attach_type == BPF_CGROUP_INET_SOCK_RELEASE;
        case bpf_ctx_range(struct bpf_sock, src_ip4):
                return attach_type == BPF_CGROUP_INET4_POST_BIND && is_read;
        case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
                return attach_type == BPF_CGROUP_INET6_POST_BIND && is_read;
        case bpf_ctx_range(struct bpf_sock, src_port):
                return (attach_type == BPF_CGROUP_INET4_POST_BIND ||
                        attach_type == BPF_CGROUP_INET6_POST_BIND) && is_read;
        default:
                return is_read;
        }
}

/* Like bpf_sock_is_valid_access(), except that the bpf_sock fields from
 * type through priority are rejected outright.
 */
bool bpf_sock_common_is_valid_access(int off, int size,
                                     enum bpf_access_type type,
                                     struct bpf_insn_access_aux *info)
{
        switch (off) {
        case bpf_ctx_range_till(struct bpf_sock, type, priority):
                return false;
        default:
                break;
        }

        return bpf_sock_is_valid_access(off, size, type, info);
}

/* Validate a context access to struct bpf_sock.
 *
 * Most listed fields permit narrow accesses relative to a 4-byte field.
 * dst_port is judged against 4 bytes for a 4-byte access and against its
 * own field size otherwise; the bytes between dst_port and dst_ip4 are
 * off limits. Remaining offsets must be accessed with exactly 4 bytes.
 *
 * @info receives the recorded field size where narrow access applies.
 */
bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                              struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);
        int field_size;

        /* Out-of-struct or misaligned accesses are never valid. */
        if (off < 0 || off >= sizeof(struct bpf_sock) || off % size != 0)
                return false;

        switch (off) {
        case offsetof(struct bpf_sock, state):
        case offsetof(struct bpf_sock, family):
        case offsetof(struct bpf_sock, type):
        case offsetof(struct bpf_sock, protocol):
        case offsetof(struct bpf_sock, src_port):
        case offsetof(struct bpf_sock, rx_queue_mapping):
        case bpf_ctx_range(struct bpf_sock, src_ip4):
        case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
        case bpf_ctx_range(struct bpf_sock, dst_ip4):
        case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
                bpf_ctx_record_field_size(info, size_default);
                return bpf_ctx_narrow_access_ok(off, size, size_default);
        case bpf_ctx_range(struct bpf_sock, dst_port):
                if (size == size_default)
                        field_size = size_default;
                else
                        field_size = sizeof_field(struct bpf_sock, dst_port);
                bpf_ctx_record_field_size(info, field_size);
                return bpf_ctx_narrow_access_ok(off, size, field_size);
        case offsetofend(struct bpf_sock, dst_port) ...
             offsetof(struct bpf_sock, dst_ip4) - 1:
                return false;
        default:
                return size == size_default;
        }
}

/* A bpf_sock access is valid when it passes the generic bpf_sock checks
 * and, after that, the rules tied to the program's expected attach type.
 */
static bool sock_filter_is_valid_access(int off, int size,
                                        enum bpf_access_type type,
                                        const struct bpf_prog *prog,
                                        struct bpf_insn_access_aux *info)
{
        return bpf_sock_is_valid_access(off, size, type, info) &&
               __sock_filter_check_attach_type(off, type,
                                               prog->expected_attach_type);
}

/* Prologue generator that emits nothing: no set-up is needed before either
 * direct reads or direct writes, so zero instructions are reported.
 */
static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write,
                             const struct bpf_prog *prog)
{
        const int emitted = 0;

        return emitted;
}

/* Emit a prologue that deals with a possibly cloned skb before a program
 * doing direct packet writes starts running.
 *
 * @insn_buf:     buffer the prologue instructions are written to
 * @direct_write: when false, no prologue is emitted
 * @prog:         program being prepared; its first instruction is copied to
 *                the end of the prologue
 * @drop_verdict: value placed in R0 before exiting when the
 *                bpf_skb_pull_data() call reports failure
 *
 * Returns the number of instructions written to @insn_buf (0 if none).
 */
static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
                                const struct bpf_prog *prog, int drop_verdict)
{
        struct bpf_insn *insn = insn_buf;

        if (!direct_write)
                return 0;

        /* if (!skb->cloned)
         *       goto start;
         *
         * (Fast-path, otherwise approximation that we might be
         *  a clone, do the rest in helper.)
         *
         * The jump offset of 7 skips the remaining prologue instructions
         * and lands on the copy of the program's first instruction.
         */
        *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET);
        *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
        *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);

        /* ret = bpf_skb_pull_data(skb, 0);
         *
         * R1 (ctx) is stashed in R6 first so it can be restored after the
         * call; R2 is zeroed by XOR-ing it with itself.
         */
        *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
        *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
        *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
                               BPF_FUNC_skb_pull_data);
        /* if (!ret)
         *      goto restore;
         * return drop_verdict;
         */
        *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
        *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict);
        *insn++ = BPF_EXIT_INSN();

        /* restore: put ctx back into R1 */
        *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
        /* start: the program's original first instruction */
        *insn++ = prog->insnsi[0];

        return insn - insn_buf;
}

/* Expand an absolute/indirect packet load instruction into a call to one
 * of the bpf_skb_load_helper_*_no_cache() helpers.
 *
 * @orig:     the instruction being replaced; its mode selects absolute vs.
 *            indirect addressing and its size selects the helper
 * @insn_buf: buffer receiving the replacement instructions
 *
 * Emitted sequence: R2 = offset, R1 = ctx, call helper, and if the signed
 * result in R0 is negative, set R0 to 0 and exit the program.
 *
 * Returns the number of instructions written to @insn_buf.
 */
static int bpf_gen_ld_abs(const struct bpf_insn *orig,
                          struct bpf_insn *insn_buf)
{
        bool indirect = BPF_MODE(orig->code) == BPF_IND;
        struct bpf_insn *insn = insn_buf;

        /* R2 = packet offset: imm alone, or src_reg (+ imm if non-zero). */
        if (!indirect) {
                *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
        } else {
                *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
                if (orig->imm)
                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
        }
        /* We're guaranteed here that CTX is in R6. */
        *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);

        /* Pick the helper matching the load width. No call is emitted for
         * any other size.
         * NOTE(review): presumably other sizes are rejected before this
         * point -- confirm against the verifier.
         */
        switch (BPF_SIZE(orig->code)) {
        case BPF_B:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache);
                break;
        case BPF_H:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache);
                break;
        case BPF_W:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache);
                break;
        }

        /* if ((s64)R0 >= 0) skip the next two instructions;
         * otherwise: R0 = 0; exit.
         */
        *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2);
        *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
        *insn++ = BPF_EXIT_INSN();

        return insn - insn_buf;
}

/* Prologue for tc_cls_act: the shared unclone prologue, with TC_ACT_SHOT
 * as the value returned when un-cloning fails.
 */
static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
                               const struct bpf_prog *prog)
{
        const int drop_verdict = TC_ACT_SHOT;

        return bpf_unclone_prologue(insn_buf, direct_write, prog, drop_verdict);
}

static bool tc_cls_act_is_valid_access(int off, int size,
                                       enum bpf_access_type type,
                                       const struct bpf_prog *prog,
                                       struct bpf_insn_access_aux *info)
{
        if (type == BPF_WRITE) {
                switch (off) {
                case bpf_ctx_range(struct __sk_buff, mark):
                case bpf_ctx_range(struct __sk_buff, tc_index):
                case bpf_ctx_range(struct __sk_buff, priority):
                case bpf_ctx_range(struct __sk_buff, tc_classid):
                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                case bpf_ctx_range(struct __sk_buff, tstamp):
                case bpf_ctx_range(struct __sk_buff, queue_mapping):
                        break;
                default:
                        return false;
                }
        }

        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        case bpf_ctx_range(struct __sk_buff, data_meta):
                info->reg_type = PTR_TO_PACKET_META;
                break;
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        case bpf_ctx_range_till(struct __sk_buff, family, local_port):
                return false;
        case offsetof(struct __sk_buff, tstamp_type):
                /* The convert_ctx_access() on reading and writing
                 * __sk_buff->tstamp depends on whether the bpf prog
                 * has used __sk_buff->tstamp_type or not.
                 * Thus, we need to set prog->tstamp_type_access
                 * earlier during is_valid_access() here.
                 */
                ((struct bpf_prog *)prog)->tstamp_type_access = 1;
                return size == sizeof(__u8);
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/* Serializes use of the nfct_btf_struct_access hook below: both
 * tc_cls_act_btf_struct_access() and xdp_btf_struct_access() hold this
 * mutex while testing and calling the pointer.
 */
DEFINE_MUTEX(nf_conn_btf_access_lock);
EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock);

/* Optional callback that decides on BTF struct accesses; it has no
 * initializer, so it is NULL until something assigns it.
 * NOTE(review): presumably installed and cleared by a module while holding
 * nf_conn_btf_access_lock -- confirm against the code that sets it.
 */
int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
                              const struct bpf_reg_state *reg,
                              int off, int size);
EXPORT_SYMBOL_GPL(nfct_btf_struct_access);

/* BTF struct access check for tc_cls_act: forwards to the
 * nfct_btf_struct_access hook under nf_conn_btf_access_lock, or returns
 * -EACCES when no hook is installed.
 */
static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
                                        const struct bpf_reg_state *reg,
                                        int off, int size)
{
        int err;

        mutex_lock(&nf_conn_btf_access_lock);
        err = nfct_btf_struct_access ?
                nfct_btf_struct_access(log, reg, off, size) : -EACCES;
        mutex_unlock(&nf_conn_btf_access_lock);

        return err;
}

/* Basic shape check for an xdp_md access: the offset must lie inside the
 * struct and be a multiple of the access size, and the access must be
 * exactly 4 bytes wide.
 */
static bool __is_valid_xdp_access(int off, int size)
{
        return off >= 0 && off < sizeof(struct xdp_md) &&
               off % size == 0 &&
               size == sizeof(__u32);
}

/* Context access validation for XDP.
 *
 * egress_ifindex is hidden unless the expected attach type is
 * BPF_XDP_DEVMAP. The only accepted write is to rx_queue_index, and only
 * for offloaded programs. Loads of data, data_meta and data_end yield
 * packet pointers.
 */
static bool xdp_is_valid_access(int off, int size,
                                enum bpf_access_type type,
                                const struct bpf_prog *prog,
                                struct bpf_insn_access_aux *info)
{
        if (prog->expected_attach_type != BPF_XDP_DEVMAP &&
            off == (int)offsetof(struct xdp_md, egress_ifindex))
                return false;

        if (type == BPF_WRITE) {
                if (bpf_prog_is_offloaded(prog->aux) &&
                    off == (int)offsetof(struct xdp_md, rx_queue_index))
                        return __is_valid_xdp_access(off, size);

                return false;
        }

        /* Tell the verifier what kind of pointer these loads produce. */
        if (off == (int)offsetof(struct xdp_md, data))
                info->reg_type = PTR_TO_PACKET;
        else if (off == (int)offsetof(struct xdp_md, data_meta))
                info->reg_type = PTR_TO_PACKET_META;
        else if (off == (int)offsetof(struct xdp_md, data_end))
                info->reg_type = PTR_TO_PACKET_END;

        return __is_valid_xdp_access(off, size);
}

/* Emit a one-time warning about an XDP return value that will not be
 * acted on.
 *
 * @dev:  device the program ran on; may be NULL, reported as "N/A"
 * @prog: program that returned the value
 * @act:  the returned action; values above XDP_REDIRECT are reported as
 *        "Illegal", all others as "Driver unsupported"
 */
void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act)
{
        const u32 act_max = XDP_REDIRECT;
        const char *verdict = "Driver unsupported";
        const char *ifname = "N/A";

        if (act > act_max)
                verdict = "Illegal";
        if (dev)
                ifname = dev->name;

        pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n",
                     verdict, act, prog->aux->name, prog->aux->id, ifname);
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);

/* BTF struct access check for XDP: the decision is left to the
 * nfct_btf_struct_access hook, called with nf_conn_btf_access_lock held.
 * Without a hook the access is refused with -EACCES.
 */
static int xdp_btf_struct_access(struct bpf_verifier_log *log,
                                 const struct bpf_reg_state *reg,
                                 int off, int size)
{
        int verdict;

        mutex_lock(&nf_conn_btf_access_lock);
        if (!nfct_btf_struct_access)
                verdict = -EACCES;
        else
                verdict = nfct_btf_struct_access(log, reg, off, size);
        mutex_unlock(&nf_conn_btf_access_lock);

        return verdict;
}

/* Validate a context access to struct bpf_sock_addr.
 *
 * @off/@size: offset and width of the access
 * @type:      BPF_READ or BPF_WRITE
 * @prog:      program being verified; its expected_attach_type decides
 *             which address fields are visible
 * @info:      receives the recorded field size (reads of address/port
 *             fields) or the register type (the sk pointer)
 *
 * Validation happens in two passes over @off: first the address fields are
 * filtered by attach type, then size and read/write rules are applied.
 *
 * Returns true when the access is allowed.
 */
static bool sock_addr_is_valid_access(int off, int size,
                                      enum bpf_access_type type,
                                      const struct bpf_prog *prog,
                                      struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        if (off < 0 || off >= sizeof(struct bpf_sock_addr))
                return false;
        if (off % size != 0)
                return false;

        /* Disallow access to fields not belonging to the attach type's address
         * family.
         */
        switch (off) {
        case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET4_GETPEERNAME:
                case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP4_RECVMSG:
                        break;
                default:
                        return false;
                }
                break;
        case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET6_CONNECT:
                case BPF_CGROUP_INET6_GETPEERNAME:
                case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                        break;
                default:
                        return false;
                }
                break;
        case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
                /* The message source address is only exposed to sendmsg. */
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_UDP4_SENDMSG:
                        break;
                default:
                        return false;
                }
                break;
        case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
                                msg_src_ip6[3]):
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_UDP6_SENDMSG:
                        break;
                default:
                        return false;
                }
                break;
        }

        /* Second pass: size and read/write rules per field. */
        switch (off) {
        case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
        case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
        case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
        case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
                                msg_src_ip6[3]):
        case bpf_ctx_range(struct bpf_sock_addr, user_port):
                if (type == BPF_READ) {
                        /* The field size is recorded before the wide-access
                         * checks, so it is set even when one of them
                         * accepts the access.
                         */
                        bpf_ctx_record_field_size(info, size_default);

                        if (bpf_ctx_wide_access_ok(off, size,
                                                   struct bpf_sock_addr,
                                                   user_ip6))
                                return true;

                        if (bpf_ctx_wide_access_ok(off, size,
                                                   struct bpf_sock_addr,
                                                   msg_src_ip6))
                                return true;

                        /* Otherwise reads may be narrower than 4 bytes. */
                        if (!bpf_ctx_narrow_access_ok(off, size, size_default))
                                return false;
                } else {
                        if (bpf_ctx_wide_access_ok(off, size,
                                                   struct bpf_sock_addr,
                                                   user_ip6))
                                return true;

                        if (bpf_ctx_wide_access_ok(off, size,
                                                   struct bpf_sock_addr,
                                                   msg_src_ip6))
                                return true;

                        /* Apart from wide accesses, writes must be exactly
                         * 4 bytes.
                         */
                        if (size != size_default)
                                return false;
                }
                break;
        case offsetof(struct bpf_sock_addr, sk):
                /* Read-only 8-byte pointer, exposed as PTR_TO_SOCKET. */
                if (type != BPF_READ)
                        return false;
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCKET;
                break;
        default:
                /* Everything else: 4-byte reads only, no writes. */
                if (type == BPF_READ) {
                        if (size != size_default)
                                return false;
                } else {
                        return false;
                }
        }

        return true;
}

/*
 * Decide whether a sock_ops program may access struct bpf_sock_ops at byte
 * offset @off with an access that is @size bytes wide.
 *
 * Writes are accepted only for the reply and sk_txhash fields and only as
 * full __u32 stores.  Reads must be __u64 wide for the bytes_received ..
 * bytes_acked range, sk, skb_data, skb_data_end and skb_hwtstamp; for
 * skb_tcp_flags the decision is delegated to bpf_ctx_narrow_access_ok();
 * every other field must be read as a full __u32.  For sk, skb_data and
 * skb_data_end the register type of the loaded value is stored in @info.
 *
 * Returns true when the access is allowed, false otherwise.
 */
static bool sock_ops_is_valid_access(int off, int size,
                                     enum bpf_access_type type,
                                     const struct bpf_prog *prog,
                                     struct bpf_insn_access_aux *info)
{
        const int u32_size = sizeof(__u32);

        if (off < 0 || off >= sizeof(struct bpf_sock_ops))
                return false;

        /* size is positive here: the verifier guarantees it. */
        if (off % size)
                return false;

        if (type == BPF_WRITE) {
                switch (off) {
                case offsetof(struct bpf_sock_ops, reply):
                case offsetof(struct bpf_sock_ops, sk_txhash):
                        return size == u32_size;
                default:
                        return false;
                }
        }

        switch (off) {
        case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
                                bytes_acked):
        case offsetof(struct bpf_sock_ops, skb_hwtstamp):
                /* plain 64-bit values, no register type to record */
                return size == sizeof(__u64);

        case offsetof(struct bpf_sock_ops, sk):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCKET_OR_NULL;
                return true;

        case offsetof(struct bpf_sock_ops, skb_data):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_PACKET;
                return true;

        case offsetof(struct bpf_sock_ops, skb_data_end):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_PACKET_END;
                return true;

        case offsetof(struct bpf_sock_ops, skb_tcp_flags):
                bpf_ctx_record_field_size(info, u32_size);
                return bpf_ctx_narrow_access_ok(off, size, u32_size);

        default:
                return size == u32_size;
        }
}

/*
 * Prologue generator for sk_skb programs: forwards to
 * bpf_unclone_prologue() with SK_DROP as the verdict argument and returns
 * whatever that helper returns.
 */
static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
                           const struct bpf_prog *prog)
{
        const int drop_verdict = SK_DROP;

        return bpf_unclone_prologue(insn_buf, direct_write, prog,
                                    drop_verdict);
}

static bool sk_skb_is_valid_access(int off, int size,
                                   enum bpf_access_type type,
                                   const struct bpf_prog *prog,
                                   struct bpf_insn_access_aux *info)
{
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, tstamp):
        case bpf_ctx_range(struct __sk_buff, wire_len):
        case bpf_ctx_range(struct __sk_buff, hwtstamp):
                return false;
        }

        if (type == BPF_WRITE) {
                switch (off) {
                case bpf_ctx_range(struct __sk_buff, tc_index):
                case bpf_ctx_range(struct __sk_buff, priority):
                        break;
                default:
                        return false;
                }
        }

        switch (off) {
        case bpf_ctx_range(struct __sk_buff, mark):
                return false;
        case bpf_ctx_range(struct __sk_buff, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/*
 * Decide whether an sk_msg program may access struct sk_msg_md at @off.
 *
 * The context is read-only and accesses must be aligned to their own size.
 * data, data_end and sk must be read as __u64 and get a register type
 * recorded in @info; the address, port, family and size fields must be
 * read as __u32.  Any other offset is rejected.
 */
static bool sk_msg_is_valid_access(int off, int size,
                                   enum bpf_access_type type,
                                   const struct bpf_prog *prog,
                                   struct bpf_insn_access_aux *info)
{
        if (type == BPF_WRITE)
                return false;

        if (off % size)
                return false;

        switch (off) {
        case offsetof(struct sk_msg_md, data):
                /* the register type is recorded before the size check */
                info->reg_type = PTR_TO_PACKET;
                return size == sizeof(__u64);

        case offsetof(struct sk_msg_md, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                return size == sizeof(__u64);

        case offsetof(struct sk_msg_md, sk):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCKET;
                return true;

        case bpf_ctx_range(struct sk_msg_md, family):
        case bpf_ctx_range(struct sk_msg_md, remote_ip4):
        case bpf_ctx_range(struct sk_msg_md, local_ip4):
        case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]):
        case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]):
        case bpf_ctx_range(struct sk_msg_md, remote_port):
        case bpf_ctx_range(struct sk_msg_md, local_port):
        case bpf_ctx_range(struct sk_msg_md, size):
                return size == sizeof(__u32);

        default:
                return false;
        }
}

/*
 * Decide whether a flow dissector program may access struct __sk_buff at
 * @off.
 *
 * Only reads are allowed, and only of three fields: data and data_end
 * (as __u32) and flow_keys (as __u64).  For an accepted access the
 * register type of the loaded value is recorded in @info.
 */
static bool flow_dissector_is_valid_access(int off, int size,
                                           enum bpf_access_type type,
                                           const struct bpf_prog *prog,
                                           struct bpf_insn_access_aux *info)
{
        const int u32_size = sizeof(__u32);

        if (type == BPF_WRITE)
                return false;

        if (off < 0 || off >= sizeof(struct __sk_buff))
                return false;

        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data):
                if (size == u32_size) {
                        info->reg_type = PTR_TO_PACKET;
                        return true;
                }
                break;
        case bpf_ctx_range(struct __sk_buff, data_end):
                if (size == u32_size) {
                        info->reg_type = PTR_TO_PACKET_END;
                        return true;
                }
                break;
        case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
                if (size == sizeof(__u64)) {
                        info->reg_type = PTR_TO_FLOW_KEYS;
                        return true;
                }
                break;
        default:
                break;
        }

        return false;
}

/*
 * Rewrite a flow dissector context load on struct __sk_buff into a load of
 * the same-named member of struct bpf_flow_dissector.
 *
 * Handles data, data_end and flow_keys; each becomes a single load from
 * si->src_reg into si->dst_reg.  Returns the number of instructions
 * written to @insn_buf, which is zero when si->off matches none of them.
 */
static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
                                             const struct bpf_insn *si,
                                             struct bpf_insn *insn_buf,
                                             struct bpf_prog *prog,
                                             u32 *target_size)

{
        const __u8 dst = si->dst_reg;
        const __u8 src = si->src_reg;
        u32 cnt = 0;

        switch (si->off) {
        case offsetof(struct __sk_buff, data):
                insn_buf[cnt++] =
                        BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data),
                                    dst, src,
                                    offsetof(struct bpf_flow_dissector, data));
                break;

        case offsetof(struct __sk_buff, data_end):
                insn_buf[cnt++] =
                        BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end),
                                    dst, src,
                                    offsetof(struct bpf_flow_dissector, data_end));
                break;

        case offsetof(struct __sk_buff, flow_keys):
                insn_buf[cnt++] =
                        BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys),
                                    dst, src,
                                    offsetof(struct bpf_flow_dissector, flow_keys));
                break;
        }

        return cnt;
}

/*
 * Emit the instructions that read __sk_buff->tstamp_type: load the byte at
 * SKB_BF_MONO_TC_OFFSET from the skb, mask it with SKB_TSTAMP_TYPE_MASK
 * and, on big-endian bitfield layouts, shift the result right by
 * SKB_TSTAMP_TYPE_RSHIFT.
 *
 * Returns the position just past the last emitted instruction.
 */
static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
                                                     struct bpf_insn *insn)
{
        const __u8 dst = si->dst_reg;
        const __u8 skb_r = si->src_reg;

        /* Build-time checks tying the SKB_CLOCK_* and BPF_SKB_CLOCK_* values together. */
        BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI);
        BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME);
        BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC);
        BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI);

        insn[0] = BPF_LDX_MEM(BPF_B, dst, skb_r, SKB_BF_MONO_TC_OFFSET);
        insn[1] = BPF_ALU32_IMM(BPF_AND, dst, SKB_TSTAMP_TYPE_MASK);
#ifdef __BIG_ENDIAN_BITFIELD
        insn[2] = BPF_ALU32_IMM(BPF_RSH, dst, SKB_TSTAMP_TYPE_RSHIFT);
        return insn + 3;
#else
        /* No shift is emitted here, so the mask must include bit 0. */
        BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1));
        return insn + 2;
#endif
}

/*
 * Emit the instructions that leave skb_shinfo(skb) in @dst_reg, where
 * @skb_reg holds the skb pointer.
 *
 * With NET_SKBUFF_DATA_USES_OFFSET this is skb->head + skb->end and uses
 * BPF_REG_AX as scratch; otherwise skb->end is loaded directly.
 *
 * Returns the position just past the last emitted instruction.
 */
static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg,
                                                  struct bpf_insn *insn)
{
#ifdef NET_SKBUFF_DATA_USES_OFFSET
        /* AX = skb->end is loaded first, before dst_reg is overwritten */
        insn[0] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
                              BPF_REG_AX, skb_reg,
                              offsetof(struct sk_buff, end));
        /* dst_reg = skb->head */
        insn[1] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
                              dst_reg, skb_reg,
                              offsetof(struct sk_buff, head));
        /* dst_reg += AX */
        insn[2] = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX);

        return insn + 3;
#else
        /* dst_reg = skb->end */
        insn[0] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
                              dst_reg, skb_reg,
                              offsetof(struct sk_buff, end));

        return insn + 1;
#endif
}

/*
 * Emit the instructions that read __sk_buff->tstamp.
 *
 * @prog: program being rewritten; with CONFIG_NET_XGRESS its
 *        tstamp_type_access flag selects between the guarded read and the
 *        plain read.
 * @si:   original instruction; dst_reg receives the value and src_reg holds
 *        the skb pointer.
 * @insn: position in the output buffer to start writing at.
 *
 * Returns the position just past the last emitted instruction.
 */
static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
                                                const struct bpf_insn *si,
                                                struct bpf_insn *insn)
{
        __u8 value_reg = si->dst_reg;
        __u8 skb_reg = si->src_reg;

#ifdef CONFIG_NET_XGRESS
        /* If the tstamp_type is read,
         * the bpf prog is aware the tstamp could have delivery time.
         * Thus, read skb->tstamp as is if tstamp_type_access is true.
         */
        if (!prog->tstamp_type_access) {
                /* AX is needed because src_reg and dst_reg could be the same */
                __u8 tmp_reg = BPF_REG_AX;

                *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
                /* check if ingress mask bits is set */
                *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
                /* not at ingress: jump to the plain load at the end */
                *insn++ = BPF_JMP_A(4);
                /* at ingress: check whether any tstamp_type bit is set */
                *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1);
                /* tstamp_type is 0: jump to the plain load at the end */
                *insn++ = BPF_JMP_A(2);
                /* skb->tc_at_ingress && skb->tstamp_type,
                 * read 0 as the (rcv) timestamp.
                 */
                *insn++ = BPF_MOV64_IMM(value_reg, 0);
                /* skip the plain load so the 0 is what the program sees */
                *insn++ = BPF_JMP_A(1);
        }
#endif

        /* plain load: value_reg = skb->tstamp */
        *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg,
                              offsetof(struct sk_buff, tstamp));
        return insn;
}

/*
 * Emit the instructions that write __sk_buff->tstamp.
 *
 * @prog: program being rewritten; with CONFIG_NET_XGRESS its
 *        tstamp_type_access flag selects whether the tstamp_type clearing
 *        sequence is emitted ahead of the store.
 * @si:   original store instruction; dst_reg holds the skb pointer and
 *        src_reg / imm carry the value to store.
 * @insn: position in the output buffer to start writing at.
 *
 * Returns the position just past the last emitted instruction.
 */
static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
                                                 const struct bpf_insn *si,
                                                 struct bpf_insn *insn)
{
        __u8 value_reg = si->src_reg;
        __u8 skb_reg = si->dst_reg;

#ifdef CONFIG_NET_XGRESS
        /* If the tstamp_type is read,
         * the bpf prog is aware the tstamp could have delivery time.
         * Thus, write skb->tstamp as is if tstamp_type_access is true.
         * Otherwise, writing at ingress will have to clear the
         * skb->tstamp_type bit also.
         */
        if (!prog->tstamp_type_access) {
                /* scratch register for the byte read from SKB_BF_MONO_TC_OFFSET */
                __u8 tmp_reg = BPF_REG_AX;

                *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
                /* Writing __sk_buff->tstamp as ingress, goto <clear> */
                *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
                /* goto <store> */
                *insn++ = BPF_JMP_A(2);
                /* <clear>: skb->tstamp_type */
                *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK);
                /* write the masked byte back to the skb */
                *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
        }
#endif

        /* <store>: skb->tstamp = tstamp */
        *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM,
                               skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm);
        return insn;
}

/*
 * Build a store instruction from the original instruction @si: the
 * instruction class, both registers and the immediate are taken from @si,
 * while the access width is @size and the target offset is @off.
 */
#define BPF_EMIT_STORE(size, si, off)                                        \
        BPF_RAW_INSN(BPF_CLASS((si)->code) | (size) | BPF_MEM,                \
                     (si)->dst_reg, (si)->src_reg, (off), (si)->imm)

/*
 * Rewrite one context access on struct __sk_buff into instructions that
 * operate on the underlying struct sk_buff (and on objects reachable from
 * it, such as the device, the socket and the shared info).
 *
 * @type:        BPF_WRITE selects the store form for the fields that have
 *               one; any other value selects the load form.
 * @si:          the original instruction; si->off selects the __sk_buff
 *               field being accessed.
 * @insn_buf:    output buffer receiving the replacement instructions.
 * @prog:        program being rewritten; cb_access is set on it when cb[]
 *               is accessed, and it is passed on to the tstamp helpers.
 * @target_size: location for the width of the kernel-side field; it is
 *               either assigned directly or handed to bpf_target_off().
 *
 * Returns the number of instructions written to @insn_buf, which is zero
 * when si->off matches none of the cases.
 */
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
                                  const struct bpf_insn *si,
                                  struct bpf_insn *insn_buf,
                                  struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        int off;

        switch (si->off) {
        case offsetof(struct __sk_buff, len):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, len, 4,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, protocol):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, protocol, 2,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, vlan_proto):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, vlan_proto, 2,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, priority):
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si,
                                                 bpf_target_off(struct sk_buff, priority, 4,
                                                                target_size));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff, priority, 4,
                                                             target_size));
                break;

        case offsetof(struct __sk_buff, ingress_ifindex):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, skb_iif, 4,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, ifindex):
                /* dst = skb->dev; a zero dev skips the dereference, leaving 0 */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, dev));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct net_device, ifindex, 4,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, hash):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, hash, 4,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, mark):
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si,
                                                 bpf_target_off(struct sk_buff, mark, 4,
                                                                target_size));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff, mark, 4,
                                                             target_size));
                break;

        case offsetof(struct __sk_buff, pkt_type):
                /* byte load at PKT_TYPE_OFFSET, masked with PKT_TYPE_MAX */
                *target_size = 1;
                *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
                                      PKT_TYPE_OFFSET);
                *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
                *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
#endif
                break;

        case offsetof(struct __sk_buff, queue_mapping):
                if (type == BPF_WRITE) {
                        u32 off = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size);

                        /* immediate store of a value >= NO_QUEUE_MAPPING: emit no store */
                        if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) {
                                *insn++ = BPF_JMP_A(0); /* noop */
                                break;
                        }

                        /* register store: skip the store at run time when
                         * the register value is >= NO_QUEUE_MAPPING
                         */
                        if (BPF_CLASS(si->code) == BPF_STX)
                                *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1);
                        *insn++ = BPF_EMIT_STORE(BPF_H, si, off);
                } else {
                        *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff,
                                                             queue_mapping,
                                                             2, target_size));
                }
                break;

        case offsetof(struct __sk_buff, vlan_present):
                /* load vlan_all and turn any non-zero value into 1 */
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff,
                                                     vlan_all, 4, target_size));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1);
                break;

        case offsetof(struct __sk_buff, vlan_tci):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, vlan_tci, 2,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, cb[0]) ...
             offsetofend(struct __sk_buff, cb[4]) - 1:
                BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20);
                BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
                              offsetof(struct qdisc_skb_cb, data)) %
                             sizeof(__u64));

                /* translate the offset within __sk_buff->cb[] into an
                 * offset within qdisc_skb_cb->data inside skb->cb, and
                 * keep the access width of the original instruction
                 */
                prog->cb_access = 1;
                off  = si->off;
                off -= offsetof(struct __sk_buff, cb[0]);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct qdisc_skb_cb, data);
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off);
                else
                        *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
                                              si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, tc_classid):
                BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2);

                off  = si->off;
                off -= offsetof(struct __sk_buff, tc_classid);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct qdisc_skb_cb, tc_classid);
                *target_size = 2;
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_H, si, off);
                else
                        *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
                                              si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, data):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, data));
                break;

        case offsetof(struct __sk_buff, data_meta):
                /* read from the bpf_skb_data_end layout placed over skb->cb */
                off  = si->off;
                off -= offsetof(struct __sk_buff, data_meta);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct bpf_skb_data_end, data_meta);
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
                                      si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, data_end):
                /* read from the bpf_skb_data_end layout placed over skb->cb */
                off  = si->off;
                off -= offsetof(struct __sk_buff, data_end);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct bpf_skb_data_end, data_end);
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
                                      si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_H, si,
                                                 bpf_target_off(struct sk_buff, tc_index, 2,
                                                                target_size));
                else
                        *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff, tc_index, 2,
                                                             target_size));
#else
                /* without CONFIG_NET_SCHED: a write becomes a register
                 * self-move and a read yields 0
                 */
                *target_size = 2;
                if (type == BPF_WRITE)
                        *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
                else
                        *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct __sk_buff, napi_id):
#if defined(CONFIG_NET_RX_BUSY_POLL)
                /* values below MIN_NAPI_ID are reported as 0 */
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, napi_id, 4,
                                                     target_size));
                *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1);
                *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#else
                *target_size = 4;
                *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
                break;
        case offsetof(struct __sk_buff, family):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);

                /* dst = skb->sk, then dst = the socket's skc_family */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_family,
                                                     2, target_size));
                break;
        case offsetof(struct __sk_buff, remote_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_daddr,
                                                     4, target_size));
                break;
        case offsetof(struct __sk_buff, local_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_rcv_saddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_rcv_saddr,
                                                     4, target_size));
                break;
        case offsetof(struct __sk_buff, remote_ip6[0]) ...
             offsetof(struct __sk_buff, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_daddr.s6_addr32[0]) != 4);

                /* byte offset of the accessed word within remote_ip6[] */
                off = si->off;
                off -= offsetof(struct __sk_buff, remote_ip6[0]);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_daddr.s6_addr32[0]) +
                                      off);
#else
                /* IPv6 disabled: the field reads as 0 */
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;
        case offsetof(struct __sk_buff, local_ip6[0]) ...
             offsetof(struct __sk_buff, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_rcv_saddr.s6_addr32[0]) != 4);

                /* byte offset of the accessed word within local_ip6[] */
                off = si->off;
                off -= offsetof(struct __sk_buff, local_ip6[0]);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_rcv_saddr.s6_addr32[0]) +
                                      off);
#else
                /* IPv6 disabled: the field reads as 0 */
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct __sk_buff, remote_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_dport,
                                                     2, target_size));
                /* unless __BIG_ENDIAN_BITFIELD is defined, move the 16-bit
                 * value into the upper half of the 32-bit result
                 */
#ifndef __BIG_ENDIAN_BITFIELD
                *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
                break;

        case offsetof(struct __sk_buff, local_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_num, 2, target_size));
                break;

        case offsetof(struct __sk_buff, tstamp):
                BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8);

                /* the read/write sequences are built by dedicated helpers */
                if (type == BPF_WRITE)
                        insn = bpf_convert_tstamp_write(prog, si, insn);
                else
                        insn = bpf_convert_tstamp_read(prog, si, insn);
                break;

        case offsetof(struct __sk_buff, tstamp_type):
                insn = bpf_convert_tstamp_type_read(si, insn);
                break;

        case offsetof(struct __sk_buff, gso_segs):
                /* dst = skb_shinfo(skb), then dst = its gso_segs */
                insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
                                      si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct skb_shared_info,
                                                     gso_segs, 2,
                                                     target_size));
                break;
        case offsetof(struct __sk_buff, gso_size):
                /* dst = skb_shinfo(skb), then dst = its gso_size */
                insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size),
                                      si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct skb_shared_info,
                                                     gso_size, 2,
                                                     target_size));
                break;
        case offsetof(struct __sk_buff, wire_len):
                BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4);

                /* wire_len is read from qdisc_skb_cb->pkt_len inside skb->cb */
                off = si->off;
                off -= offsetof(struct __sk_buff, wire_len);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct qdisc_skb_cb, pkt_len);
                *target_size = 4;
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, sk):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                break;
        case offsetof(struct __sk_buff, hwtstamp):
                BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8);
                BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0);

                /* hwtstamp sits at offset 0 of skb_shared_hwtstamps (checked
                 * above), so the load uses the offset of the hwtstamps member
                 */
                insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
                *insn++ = BPF_LDX_MEM(BPF_DW,
                                      si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct skb_shared_info,
                                                     hwtstamps, 8,
                                                     target_size));
                break;
        }

        return insn - insn_buf;
}

u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
                                const struct bpf_insn *si,
                                struct bpf_insn *insn_buf,
                                struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        int off;

        switch (si->off) {
        case offsetof(struct bpf_sock, bound_dev_if):
                BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4);

                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si,
                                                 offsetof(struct sock, sk_bound_dev_if));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      offsetof(struct sock, sk_bound_dev_if));
                break;

        case offsetof(struct bpf_sock, mark):
                BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4);

                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si,
                                                 offsetof(struct sock, sk_mark));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      offsetof(struct sock, sk_mark));
                break;

        case offsetof(struct bpf_sock, priority):
                BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4);

                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si,
                                                 offsetof(struct sock, sk_priority));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      offsetof(struct sock, sk_priority));
                break;

        case offsetof(struct bpf_sock, family):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock_common, skc_family),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common,
                                       skc_family,
                                       sizeof_field(struct sock_common,
                                                    skc_family),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, type):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock, sk_type),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock, sk_type,
                                       sizeof_field(struct sock, sk_type),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, protocol):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock, sk_protocol),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock, sk_protocol,
                                       sizeof_field(struct sock, sk_protocol),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, src_ip4):
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_rcv_saddr,
                                       sizeof_field(struct sock_common,
                                                    skc_rcv_saddr),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, dst_ip4):
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_daddr,
                                       sizeof_field(struct sock_common,
                                                    skc_daddr),
                                       target_size));
                break;

        case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                off = si->off;
                off -= offsetof(struct bpf_sock, src_ip6[0]);
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(
                                struct sock_common,
                                skc_v6_rcv_saddr.s6_addr32[0],
                                sizeof_field(struct sock_common,
                                             skc_v6_rcv_saddr.s6_addr32[0]),
                                target_size) + off);
#else
                (void)off;
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                off = si->off;
                off -= offsetof(struct bpf_sock, dst_ip6[0]);
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common,
                                       skc_v6_daddr.s6_addr32[0],
                                       sizeof_field(struct sock_common,
                                                    skc_v6_daddr.s6_addr32[0]),
                                       target_size) + off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
                *target_size = 4;
#endif
                break;

        case offsetof(struct bpf_sock, src_port):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock_common, skc_num),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_num,
                                       sizeof_field(struct sock_common,
                                                    skc_num),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, dst_port):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock_common, skc_dport),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_dport,
                                       sizeof_field(struct sock_common,
                                                    skc_dport),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, state):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock_common, skc_state),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_state,
                                       sizeof_field(struct sock_common,
                                                    skc_state),
                                       target_size));
                break;
        case offsetof(struct bpf_sock, rx_queue_mapping):
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock, sk_rx_queue_mapping,
                                       sizeof_field(struct sock,
                                                    sk_rx_queue_mapping),
                                       target_size));
                *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING,
                                      1);
                *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
#else
                *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
                *target_size = 2;
#endif
                break;
        }

        return insn - insn_buf;
}

/* Context-access rewriter that special-cases __sk_buff::ifindex.
 *
 * An access to the ifindex field is expanded into two loads: the sk_buff's
 * dev pointer, followed by that net_device's ifindex. Any other offset is
 * delegated untouched to bpf_convert_ctx_access().
 *
 * Returns the number of instructions written to insn_buf.
 */
static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
                                         const struct bpf_insn *si,
                                         struct bpf_insn *insn_buf,
                                         struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *out = insn_buf;

        /* Everything except ifindex is handled by the generic converter. */
        if (si->off != offsetof(struct __sk_buff, ifindex))
                return bpf_convert_ctx_access(type, si, insn_buf, prog,
                                              target_size);

        /* dst = skb->dev */
        *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
                             si->dst_reg, si->src_reg,
                             offsetof(struct sk_buff, dev));
        /* dst = dev->ifindex (32-bit load through the pointer just fetched) */
        *out++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                             bpf_target_off(struct net_device, ifindex, 4,
                                            target_size));

        return out - insn_buf;
}

/* Rewrite an access to struct xdp_md into loads from struct xdp_buff.
 *
 * data, data_meta and data_end map to a single load of the same-named
 * xdp_buff member. ingress_ifindex, rx_queue_index and egress_ifindex are
 * reached by chasing the rxq/txq pointer (and, for the ifindex fields, the
 * dev pointer behind it), reusing the destination register as the
 * intermediate pointer.
 *
 * Offsets not listed below emit nothing, so 0 is returned for them.
 *
 * Returns the number of instructions written to insn_buf.
 */
static u32 xdp_convert_ctx_access(enum bpf_access_type type,
                                  const struct bpf_insn *si,
                                  struct bpf_insn *insn_buf,
                                  struct bpf_prog *prog, u32 *target_size)
{
        const int dst = si->dst_reg;
        const int ctx = si->src_reg;
        u32 cnt = 0;

        switch (si->off) {
        case offsetof(struct xdp_md, data):
                insn_buf[cnt++] = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct xdp_buff, data), dst, ctx,
                        offsetof(struct xdp_buff, data));
                break;

        case offsetof(struct xdp_md, data_meta):
                insn_buf[cnt++] = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), dst, ctx,
                        offsetof(struct xdp_buff, data_meta));
                break;

        case offsetof(struct xdp_md, data_end):
                insn_buf[cnt++] = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct xdp_buff, data_end), dst, ctx,
                        offsetof(struct xdp_buff, data_end));
                break;

        case offsetof(struct xdp_md, ingress_ifindex):
                /* dst = xdp->rxq; dst = rxq->dev; dst = dev->ifindex */
                insn_buf[cnt++] = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct xdp_buff, rxq), dst, ctx,
                        offsetof(struct xdp_buff, rxq));
                insn_buf[cnt++] = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), dst, dst,
                        offsetof(struct xdp_rxq_info, dev));
                insn_buf[cnt++] = BPF_LDX_MEM(
                        BPF_W, dst, dst,
                        offsetof(struct net_device, ifindex));
                break;

        case offsetof(struct xdp_md, rx_queue_index):
                /* dst = xdp->rxq; dst = rxq->queue_index */
                insn_buf[cnt++] = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct xdp_buff, rxq), dst, ctx,
                        offsetof(struct xdp_buff, rxq));
                insn_buf[cnt++] = BPF_LDX_MEM(
                        BPF_W, dst, dst,
                        offsetof(struct xdp_rxq_info, queue_index));
                break;

        case offsetof(struct xdp_md, egress_ifindex):
                /* dst = xdp->txq; dst = txq->dev; dst = dev->ifindex */
                insn_buf[cnt++] = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct xdp_buff, txq), dst, ctx,
                        offsetof(struct xdp_buff, txq));
                insn_buf[cnt++] = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), dst, dst,
                        offsetof(struct xdp_txq_info, dev));
                insn_buf[cnt++] = BPF_LDX_MEM(
                        BPF_W, dst, dst,
                        offsetof(struct net_device, ifindex));
                break;

        default:
                break;
        }

        return cnt;
}

/* SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() loads the Nested Field S.F->NF, where
 * S is the type of the context Structure, F is the Field in the context
 * structure that holds a pointer to a Nested Structure of type NS, and NF is
 * the field of NS to read.
 *
 * Two instructions are appended through 'insn':
 *   1. load the pointer S.F from the context (si->src_reg) into si->dst_reg;
 *   2. load SIZE bytes through that pointer, from the offset of NF (as
 *      computed by bpf_target_off(), which is also handed 'target_size')
 *      plus OFF, back into si->dst_reg.
 *
 * SIZE encodes the load size (BPF_B, BPF_H, etc). It is up to the caller to
 * make sure that SIZE is not greater than the actual size of NF.
 *
 * OFF is added to the offset of NF; pass 0 to load from the start of NF.
 *
 * The macro expects 'insn', 'si' and 'target_size' to be defined in the scope
 * where it is expanded.
 */
#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF)               \
        do {                                                                       \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg,     \
                                      si->src_reg, offsetof(S, F));               \
                *insn++ = BPF_LDX_MEM(                                               \
                        SIZE, si->dst_reg, si->dst_reg,                               \
                        bpf_target_off(NS, NF, sizeof_field(NS, NF),               \
                                       target_size)                               \
                                + OFF);                                               \
        } while (0)

/* Convenience form of SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(): the load size is
 * derived from the nested field itself via BPF_FIELD_SIZEOF(NS, NF) and the
 * extra offset is 0.
 */
#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF)                               \
        SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF,                       \
                                             BPF_FIELD_SIZEOF(NS, NF), 0)

/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantics similar to
 * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(), but for a store operation.
 *
 * In addition it uses the Temporary Field TF (a member of struct S) as a
 * spill slot for a 3rd "register", since the two registers available in
 * convert_ctx_access are not enough: we can override neither SRC, since it
 * contains the value to store, nor DST, since it contains the pointer to the
 * context, which may be used by later instructions. But we need a temporary
 * place to hold the pointer to the nested structure whose field we want to
 * store to.
 *
 * The scratch register starts at BPF_REG_9 and is decremented (at most twice)
 * until it differs from both si->src_reg and si->dst_reg. Four instructions
 * are then appended through 'insn':
 *   1. save the scratch register into S.TF (addressed through si->dst_reg);
 *   2. load the pointer S.F into the scratch register;
 *   3. store SIZE bytes to the offset of NF (from bpf_target_off()) plus OFF
 *      through the scratch register; the instruction class is taken from
 *      si->code and si->src_reg / si->imm are passed through unchanged;
 *   4. restore the scratch register from S.TF.
 *
 * The macro expects 'insn', 'si' and 'target_size' to be defined in the scope
 * where it is expanded.
 */
#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF)               \
        do {                                                                       \
                int tmp_reg = BPF_REG_9;                                       \
                if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)               \
                        --tmp_reg;                                               \
                if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)               \
                        --tmp_reg;                                               \
                *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg,               \
                                      offsetof(S, TF));                               \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg,               \
                                      si->dst_reg, offsetof(S, F));               \
                *insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code),   \
                                       tmp_reg, si->src_reg,                       \
                        bpf_target_off(NS, NF, sizeof_field(NS, NF),               \
                                       target_size)                               \
                                       + OFF,                                       \
                                       si->imm);                               \
                *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg,               \
                                      offsetof(S, TF));                               \
        } while (0)

/* Pick the store or the load expansion for a nested field depending on the
 * access 'type' visible in the expanding scope: BPF_WRITE expands to
 * SOCK_ADDR_STORE_NESTED_FIELD_OFF(), anything else to
 * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(). TF is only used on the store path.
 */
#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \
                                                      TF)                       \
        do {                                                                       \
                if (type == BPF_WRITE) {                                       \
                        SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE,   \
                                                         OFF, TF);               \
                } else {                                                       \
                        SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(                       \
                                S, NS, F, NF, SIZE, OFF);  \
                }                                                               \
        } while (0)

/* Convenience form of SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(): the
 * access size is derived from the nested field via BPF_FIELD_SIZEOF(NS, NF)
 * and the extra offset is 0.
 */
#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF)                       \
        SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(                               \
                S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF)

/* Translate an access to struct bpf_sock_addr (the program-visible context)
 * into instructions that operate on struct bpf_sock_addr_kern.
 *
 * Apart from 'sk', every handled field lives behind a pointer stored in the
 * kernel context (uaddr, sk or t_ctx), so the access is expanded by one of the
 * SOCK_ADDR_*_NESTED_FIELD*() macros. Those macros append through 'insn' and
 * read 'si', 'type' and 'target_size' from this scope. The 'sk' field is a
 * plain pointer load from the kernel context.
 *
 * Offsets not listed in the switch emit nothing.
 *
 * Returns the number of instructions written to insn_buf.
 */
static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
                                        const struct bpf_insn *si,
                                        struct bpf_insn *insn_buf,
                                        struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        int port_bytes = sizeof_field(struct sockaddr_in6, sin6_port);
        int delta;

        /* The user_port case below treats the sockaddr as sockaddr_in6
         * without looking at sa_family. That is only valid while the port
         * has the same offset and size in sockaddr_in and sockaddr_in6, so
         * enforce it at build time.
         */
        BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) !=
                     offsetof(struct sockaddr_in6, sin6_port));
        BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) !=
                     sizeof_field(struct sockaddr_in6, sin6_port));

        switch (si->off) {
        case offsetof(struct bpf_sock_addr, user_family):
                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
                                            struct sockaddr, uaddr, sa_family);
                break;

        case offsetof(struct bpf_sock_addr, user_ip4):
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct sockaddr_in, uaddr,
                        sin_addr, BPF_SIZE(si->code), 0, tmp_reg);
                break;

        case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
                /* Byte distance of the access from the first address word. */
                delta = si->off - offsetof(struct bpf_sock_addr, user_ip6[0]);
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
                        sin6_addr.s6_addr32[0], BPF_SIZE(si->code), delta,
                        tmp_reg);
                break;

        case offsetof(struct bpf_sock_addr, user_port):
                /* Account for sin6_port being smaller than user_port. */
                port_bytes = min(port_bytes, BPF_LDST_BYTES(si));
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
                        sin6_port, bytes_to_bpf_size(port_bytes), 0, tmp_reg);
                break;

        case offsetof(struct bpf_sock_addr, family):
                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
                                            struct sock, sk, sk_family);
                break;

        case offsetof(struct bpf_sock_addr, type):
                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
                                            struct sock, sk, sk_type);
                break;

        case offsetof(struct bpf_sock_addr, protocol):
                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
                                            struct sock, sk, sk_protocol);
                break;

        case offsetof(struct bpf_sock_addr, msg_src_ip4):
                /* t_ctx is interpreted as a struct in_addr here. */
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct in_addr, t_ctx,
                        s_addr, BPF_SIZE(si->code), 0, tmp_reg);
                break;

        case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
                                msg_src_ip6[3]):
                /* t_ctx is interpreted as a struct in6_addr here. */
                delta = si->off -
                        offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
                        s6_addr32[0], BPF_SIZE(si->code), delta, tmp_reg);
                break;

        case offsetof(struct bpf_sock_addr, sk):
                /* The socket pointer is read straight from the kernel ctx. */
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk),
                        si->dst_reg, si->src_reg,
                        offsetof(struct bpf_sock_addr_kern, sk));
                break;
        }

        return insn - insn_buf;
}

static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
                                       const struct bpf_insn *si,
                                       struct bpf_insn *insn_buf,
                                       struct bpf_prog *prog,
                                       u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        int off;

/* Helper macro for adding read access to tcp_sock or sock fields. */
#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)                              \
        do {                                                                      \
                int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2;     \
                BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >                      \
                             sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                if (si->dst_reg == si->src_reg) {                              \
                        *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,              \
                                          offsetof(struct bpf_sock_ops_kern,  \
                                          temp));                              \
                        fullsock_reg = reg;                                      \
                        jmp += 2;                                              \
                }                                                              \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern,     \
                                                is_fullsock),                      \
                                      fullsock_reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               is_fullsock));                      \
                *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp);              \
                if (si->dst_reg == si->src_reg)                                      \
                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                      temp));                                      \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern, sk),\
                                      si->dst_reg, si->src_reg,                      \
                                      offsetof(struct bpf_sock_ops_kern, sk));\
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ,                      \
                                                       OBJ_FIELD),              \
                                      si->dst_reg, si->dst_reg,                      \
                                      offsetof(OBJ, OBJ_FIELD));              \
                if (si->dst_reg == si->src_reg)        {                              \
                        *insn++ = BPF_JMP_A(1);                                      \
                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                      temp));                                      \
                }                                                              \
        } while (0)

#define SOCK_OPS_GET_SK()                                                              \
        do {                                                                      \
                int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1;     \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                if (si->dst_reg == si->src_reg) {                              \
                        *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,              \
                                          offsetof(struct bpf_sock_ops_kern,  \
                                          temp));                              \
                        fullsock_reg = reg;                                      \
                        jmp += 2;                                              \
                }                                                              \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern,     \
                                                is_fullsock),                      \
                                      fullsock_reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               is_fullsock));                      \
                *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp);              \
                if (si->dst_reg == si->src_reg)                                      \
                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                      temp));                                      \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern, sk),\
                                      si->dst_reg, si->src_reg,                      \
                                      offsetof(struct bpf_sock_ops_kern, sk));\
                if (si->dst_reg == si->src_reg)        {                              \
                        *insn++ = BPF_JMP_A(1);                                      \
                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                      temp));                                      \
                }                                                              \
        } while (0)

#define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \
                SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock)

/* Helper macro for adding write access to tcp_sock or sock fields.
 * The macro is called with two registers: dst_reg, which contains a pointer
 * to ctx (context), and src_reg, which contains the value that should be
 * stored. However, we need an additional register since we cannot overwrite
 * dst_reg because it may be used later in the program.
 * Instead we "borrow" one of the other registers. We first save its value
 * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
 * it at the end of the macro.
 */
#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)                              \
        do {                                                                      \
                int reg = BPF_REG_9;                                              \
                BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >                      \
                             sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg,                      \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               temp));                              \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern,     \
                                                is_fullsock),                      \
                                      reg, si->dst_reg,                              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               is_fullsock));                      \
                *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2);                      \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern, sk),\
                                      reg, si->dst_reg,                              \
                                      offsetof(struct bpf_sock_ops_kern, sk));\
                *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) |     \
                                       BPF_MEM | BPF_CLASS(si->code),              \
                                       reg, si->src_reg,                      \
                                       offsetof(OBJ, OBJ_FIELD),              \
                                       si->imm);                              \
                *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg,                      \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               temp));                              \
        } while (0)

#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE)              \
        do {                                                                      \
                if (TYPE == BPF_WRITE)                                              \
                        SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);              \
                else                                                              \
                        SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);              \
        } while (0)

        switch (si->off) {
        case offsetof(struct bpf_sock_ops, op):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       op),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, op));
                break;

        case offsetof(struct bpf_sock_ops, replylong[0]) ...
             offsetof(struct bpf_sock_ops, replylong[3]):
                BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) !=
                             sizeof_field(struct bpf_sock_ops_kern, reply));
                BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) !=
                             sizeof_field(struct bpf_sock_ops_kern, replylong));
                off = si->off;
                off -= offsetof(struct bpf_sock_ops, replylong[0]);
                off += offsetof(struct bpf_sock_ops_kern, replylong[0]);
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si, off);
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                              off);
                break;

        case offsetof(struct bpf_sock_ops, family):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                              struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_family));
                break;

        case offsetof(struct bpf_sock_ops, remote_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_daddr));
                break;

        case offsetof(struct bpf_sock_ops, local_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_rcv_saddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                              struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_rcv_saddr));
                break;

        case offsetof(struct bpf_sock_ops, remote_ip6[0]) ...
             offsetof(struct bpf_sock_ops, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_daddr.s6_addr32[0]) != 4);

                off = si->off;
                off -= offsetof(struct bpf_sock_ops, remote_ip6[0]);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_daddr.s6_addr32[0]) +
                                      off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct bpf_sock_ops, local_ip6[0]) ...
             offsetof(struct bpf_sock_ops, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_rcv_saddr.s6_addr32[0]) != 4);

                off = si->off;
                off -= offsetof(struct bpf_sock_ops, local_ip6[0]);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_rcv_saddr.s6_addr32[0]) +
                                      off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct bpf_sock_ops, remote_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_dport));
#ifndef __BIG_ENDIAN_BITFIELD
                *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
                break;

        case offsetof(struct bpf_sock_ops, local_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_num));
                break;

        case offsetof(struct bpf_sock_ops, is_fullsock):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern,
                                                is_fullsock),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               is_fullsock));
                break;

        case offsetof(struct bpf_sock_ops, state):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_state));
                break;

        case offsetof(struct bpf_sock_ops, rtt_min):
                BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
                             sizeof(struct minmax));
                BUILD_BUG_ON(sizeof(struct minmax) <
                             sizeof(struct minmax_sample));

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct tcp_sock, rtt_min) +
                                      sizeof_field(struct minmax_sample, t));
                break;

        case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
                SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
                                   struct tcp_sock);
                break;

        case offsetof(struct bpf_sock_ops, sk_txhash):
                SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
                                          struct sock, type);
                break;
        case offsetof(struct bpf_sock_ops, snd_cwnd):
                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd);
                break;
        case offsetof(struct bpf_sock_ops, srtt_us):
                SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us);
                break;
        case offsetof(struct bpf_sock_ops, snd_ssthresh):
                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh);
                break;
        case offsetof(struct bpf_sock_ops, rcv_nxt):
                SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt);
                break;
        case offsetof(struct bpf_sock_ops, snd_nxt):
                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt);
                break;
        case offsetof(struct bpf_sock_ops, snd_una):
                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una);
                break;
        case offsetof(struct bpf_sock_ops, mss_cache):
                SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache);
                break;
        case offsetof(struct bpf_sock_ops, ecn_flags):
                SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags);
                break;
        case offsetof(struct bpf_sock_ops, rate_delivered):
                SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered);
                break;
        case offsetof(struct bpf_sock_ops, rate_interval_us):
                SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us);
                break;
        case offsetof(struct bpf_sock_ops, packets_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out);
                break;
        case offsetof(struct bpf_sock_ops, retrans_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out);
                break;
        case offsetof(struct bpf_sock_ops, total_retrans):
                SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans);
                break;
        case offsetof(struct bpf_sock_ops, segs_in):
                SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in);
                break;
        case offsetof(struct bpf_sock_ops, data_segs_in):
                SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in);
                break;
        case offsetof(struct bpf_sock_ops, segs_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out);
                break;
        case offsetof(struct bpf_sock_ops, data_segs_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out);
                break;
        case offsetof(struct bpf_sock_ops, lost_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out);
                break;
        case offsetof(struct bpf_sock_ops, sacked_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out);
                break;
        case offsetof(struct bpf_sock_ops, bytes_received):
                SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received);
                break;
        case offsetof(struct bpf_sock_ops, bytes_acked):
                SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked);
                break;
        case offsetof(struct bpf_sock_ops, sk):
                SOCK_OPS_GET_SK();
                break;
        case offsetof(struct bpf_sock_ops, skb_data_end):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb_data_end),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb_data_end));
                break;
        case offsetof(struct bpf_sock_ops, skb_data):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
                                      si->dst_reg, si->dst_reg,
                                      offsetof(struct sk_buff, data));
                break;
        case offsetof(struct bpf_sock_ops, skb_len):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
                                      si->dst_reg, si->dst_reg,
                                      offsetof(struct sk_buff, len));
                break;
        case offsetof(struct bpf_sock_ops, skb_tcp_flags):
                off = offsetof(struct sk_buff, cb);
                off += offsetof(struct tcp_skb_cb, tcp_flags);
                *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb,
                                                       tcp_flags),
                                      si->dst_reg, si->dst_reg, off);
                break;
        case offsetof(struct bpf_sock_ops, skb_hwtstamp): {
                struct bpf_insn *jmp_on_null_skb;

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb));
                /* Reserve one insn to test skb == NULL */
                jmp_on_null_skb = insn++;
                insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn);
                *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct skb_shared_info,
                                                     hwtstamps, 8,
                                                     target_size));
                *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0,
                                               insn - jmp_on_null_skb - 1);
                break;
        }
        }
        return insn - insn_buf;
}

/*
 * Emit the instructions that compute
 *
 *      data_end = skb->data + skb->len - skb->data_len
 *
 * (i.e. skb->data + skb_headlen()) for the context load described by @si.
 * Instructions are appended at @insn; the return value points just past
 * the last one written.
 *
 * The sum is accumulated in a work register, with BPF_REG_AX carrying the
 * second operand of each step.  Normally the work register is simply
 * si->dst_reg.  When @si names the same register as source and
 * destination, that register still holds the skb pointer needed by the
 * later loads, so another register (picked by stepping down from R9) is
 * borrowed instead: its value is stored into sk_skb_cb::temp_reg inside
 * skb->cb first and loaded back once the result sits in dst_reg.
 */
static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
                                                    struct bpf_insn *insn)
{
        const int temp_reg_off = offsetof(struct sk_buff, cb) +
                                 offsetof(struct sk_skb_cb, temp_reg);
        const bool borrow = si->src_reg == si->dst_reg;
        int reg;

        if (!borrow) {
                reg = si->dst_reg;
        } else {
                /* Step down from R9 past any register the insn itself uses. */
                reg = BPF_REG_9;
                if (reg == si->src_reg || reg == si->dst_reg)
                        reg--;
                if (reg == si->src_reg || reg == si->dst_reg)
                        reg--;
                /* Park the borrowed register's current value in skb->cb. */
                *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off);
        }

        /* reg = skb->data */
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), reg,
                              si->src_reg, offsetof(struct sk_buff, data));

        /* AX = skb->len, then reg = skb->data + skb->len */
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), BPF_REG_AX,
                              si->src_reg, offsetof(struct sk_buff, len));
        *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX);

        /* AX = skb->data_len, then reg = skb->data + skb->len - skb->data_len */
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
                              BPF_REG_AX, si->src_reg,
                              offsetof(struct sk_buff, data_len));
        *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX);

        if (borrow) {
                /*
                 * Keep the skb pointer in AX so the parked value can still
                 * be reached after dst_reg (== src_reg) receives the result.
                 */
                *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg);
                *insn++ = BPF_MOV64_REG(si->dst_reg, reg);
                *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off);
        }

        return insn;
}

/*
 * Rewrite one context access (@si) into instructions written to @insn_buf.
 *
 * Two fields are handled here:
 *  - data_end, which is computed by bpf_convert_data_end_access();
 *  - cb[0]..cb[4], which are redirected to sk_skb_cb::data inside skb->cb
 *    (as a store for BPF_WRITE, a load otherwise) and flag the program via
 *    prog->cb_access.
 * Every other offset is delegated to bpf_convert_ctx_access().
 *
 * Returns the number of instructions written.
 */
static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
                                     const struct bpf_insn *si,
                                     struct bpf_insn *insn_buf,
                                     struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *out = insn_buf;
        int cb_off;

        switch (si->off) {
        case offsetof(struct __sk_buff, cb[0]) ...
             offsetofend(struct __sk_buff, cb[4]) - 1:
                /*
                 * sk_skb_cb::data must hold at least 20 bytes and must start
                 * on a __u64 boundary relative to the start of the sk_buff.
                 */
                BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20);
                BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
                              offsetof(struct sk_skb_cb, data)) %
                             sizeof(__u64));

                prog->cb_access = 1;

                /* Translate the __sk_buff cb[] offset into an sk_buff one. */
                cb_off = si->off - offsetof(struct __sk_buff, cb[0]) +
                         offsetof(struct sk_buff, cb) +
                         offsetof(struct sk_skb_cb, data);

                if (type != BPF_WRITE)
                        *out++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
                                             si->src_reg, cb_off);
                else
                        *out++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si,
                                                cb_off);
                break;

        case offsetof(struct __sk_buff, data_end):
                out = bpf_convert_data_end_access(si, out);
                break;

        default:
                return bpf_convert_ctx_access(type, si, insn_buf, prog,
                                              target_size);
        }

        return out - insn_buf;
}

/*
 * Rewrite one struct sk_msg_md field access (@si) into loads from the
 * underlying struct sk_msg, writing the result to @insn_buf.
 *
 * data, data_end, size and sk are single loads relative to the context
 * pointer.  The address/port/family fields take two loads: first the
 * msg->sk pointer, then the matching struct sock_common member through it.
 * Without CONFIG_IPV6 the IPv6 address words are replaced by a constant 0.
 * Offsets not listed emit nothing.
 *
 * Returns the number of instructions written.
 */
static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
                                     const struct bpf_insn *si,
                                     struct bpf_insn *insn_buf,
                                     struct bpf_prog *prog, u32 *target_size)
{
        /* dst_reg = msg->sk: first step of every read that goes via the sock */
        const struct bpf_insn load_sk =
                BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk),
                            si->dst_reg, si->src_reg,
                            offsetof(struct sk_msg, sk));
        struct bpf_insn *out = insn_buf;
#if IS_ENABLED(CONFIG_IPV6)
        int word_off;
#endif

        /* convert ctx uses the fact sg element is first in struct */
        BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0);

        switch (si->off) {
        /* Fields read straight off the context pointer. */
        case offsetof(struct sk_msg_md, data):
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
                                     si->dst_reg, si->src_reg,
                                     offsetof(struct sk_msg, data));
                break;

        case offsetof(struct sk_msg_md, data_end):
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
                                     si->dst_reg, si->src_reg,
                                     offsetof(struct sk_msg, data_end));
                break;

        case offsetof(struct sk_msg_md, size):
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size),
                                     si->dst_reg, si->src_reg,
                                     offsetof(struct sk_msg_sg, size));
                break;

        case offsetof(struct sk_msg_md, sk):
                *out++ = load_sk;
                break;

        /* Fields read through msg->sk from struct sock_common. */
        case offsetof(struct sk_msg_md, family):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);

                *out++ = load_sk;
                *out++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                     offsetof(struct sock_common, skc_family));
                break;

        case offsetof(struct sk_msg_md, remote_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);

                *out++ = load_sk;
                *out++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                     offsetof(struct sock_common, skc_daddr));
                break;

        case offsetof(struct sk_msg_md, local_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_rcv_saddr) != 4);

                *out++ = load_sk;
                *out++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                     offsetof(struct sock_common,
                                              skc_rcv_saddr));
                break;

        case offsetof(struct sk_msg_md, remote_ip6[0]) ...
             offsetof(struct sk_msg_md, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_daddr.s6_addr32[0]) != 4);

                /* Byte offset of the requested word within remote_ip6[]. */
                word_off = si->off - offsetof(struct sk_msg_md, remote_ip6[0]);
                *out++ = load_sk;
                *out++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                     offsetof(struct sock_common,
                                              skc_v6_daddr.s6_addr32[0]) +
                                     word_off);
#else
                *out++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct sk_msg_md, local_ip6[0]) ...
             offsetof(struct sk_msg_md, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_rcv_saddr.s6_addr32[0]) != 4);

                /* Byte offset of the requested word within local_ip6[]. */
                word_off = si->off - offsetof(struct sk_msg_md, local_ip6[0]);
                *out++ = load_sk;
                *out++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                     offsetof(struct sock_common,
                                              skc_v6_rcv_saddr.s6_addr32[0]) +
                                     word_off);
#else
                *out++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct sk_msg_md, remote_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);

                *out++ = load_sk;
                *out++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                     offsetof(struct sock_common, skc_dport));
#ifndef __BIG_ENDIAN_BITFIELD
                /* Move the 16-bit port into the upper half of the word. */
                *out++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
                break;

        case offsetof(struct sk_msg_md, local_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);

                *out++ = load_sk;
                *out++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                     offsetof(struct sock_common, skc_num));
                break;
        }

        return out - insn_buf;
}

/*
 * Verifier hooks registered under the sk_filter name.  Context accesses go
 * through the generic bpf_convert_ctx_access(); gen_ld_abs is
 * bpf_gen_ld_abs().
 */
const struct bpf_verifier_ops sk_filter_verifier_ops = {
        .get_func_proto     = sk_filter_func_proto,
        .is_valid_access    = sk_filter_is_valid_access,
        .convert_ctx_access = bpf_convert_ctx_access,
        .gen_ld_abs         = bpf_gen_ld_abs,
};

/* Program ops for sk_filter: test runs use bpf_prog_test_run_skb(). */
const struct bpf_prog_ops sk_filter_prog_ops = {
        .test_run = bpf_prog_test_run_skb,
};

/*
 * Verifier hooks registered under the tc_cls_act name.  Besides its own
 * access checks and context rewriter, this table supplies a prologue
 * generator, bpf_gen_ld_abs() and a BTF struct access hook.
 */
const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
        .get_func_proto     = tc_cls_act_func_proto,
        .is_valid_access    = tc_cls_act_is_valid_access,
        .convert_ctx_access = tc_cls_act_convert_ctx_access,
        .gen_prologue       = tc_cls_act_prologue,
        .gen_ld_abs         = bpf_gen_ld_abs,
        .btf_struct_access  = tc_cls_act_btf_struct_access,
};

/* Program ops for tc_cls_act: test runs use bpf_prog_test_run_skb(). */
const struct bpf_prog_ops tc_cls_act_prog_ops = {
        .test_run = bpf_prog_test_run_skb,
};

/*
 * Verifier hooks registered under the xdp name.  The prologue hook is
 * bpf_noop_prologue(); a BTF struct access hook is provided as well.
 */
const struct bpf_verifier_ops xdp_verifier_ops = {
        .get_func_proto     = xdp_func_proto,
        .is_valid_access    = xdp_is_valid_access,
        .convert_ctx_access = xdp_convert_ctx_access,
        .gen_prologue       = bpf_noop_prologue,
        .btf_struct_access  = xdp_btf_struct_access,
};

/* Program ops for xdp: test runs use bpf_prog_test_run_xdp(). */
const struct bpf_prog_ops xdp_prog_ops = {
        .test_run = bpf_prog_test_run_xdp,
};

/*
 * Verifier hooks registered under the cg_skb name.  Context accesses go
 * through the generic bpf_convert_ctx_access().
 */
const struct bpf_verifier_ops cg_skb_verifier_ops = {
        .get_func_proto     = cg_skb_func_proto,
        .is_valid_access    = cg_skb_is_valid_access,
        .convert_ctx_access = bpf_convert_ctx_access,
};

/* Program ops for cg_skb: test runs use bpf_prog_test_run_skb(). */
const struct bpf_prog_ops cg_skb_prog_ops = {
        .test_run = bpf_prog_test_run_skb,
};

/*
 * Verifier hooks registered under the lwt_in name.  Access validation is
 * the shared lwt_is_valid_access(); context accesses go through the
 * generic bpf_convert_ctx_access().
 */
const struct bpf_verifier_ops lwt_in_verifier_ops = {
        .get_func_proto     = lwt_in_func_proto,
        .is_valid_access    = lwt_is_valid_access,
        .convert_ctx_access = bpf_convert_ctx_access,
};

/* Program ops for lwt_in: test runs use bpf_prog_test_run_skb(). */
const struct bpf_prog_ops lwt_in_prog_ops = {
        .test_run = bpf_prog_test_run_skb,
};

/*
 * Verifier hooks registered under the lwt_out name.  Access validation is
 * the shared lwt_is_valid_access(); context accesses go through the
 * generic bpf_convert_ctx_access().
 */
const struct bpf_verifier_ops lwt_out_verifier_ops = {
        .get_func_proto     = lwt_out_func_proto,
        .is_valid_access    = lwt_is_valid_access,
        .convert_ctx_access = bpf_convert_ctx_access,
};

/* Program ops for lwt_out: test runs use bpf_prog_test_run_skb(). */
const struct bpf_prog_ops lwt_out_prog_ops = {
        .test_run = bpf_prog_test_run_skb,
};

/*
 * Verifier hooks registered under the lwt_xmit name.  Access validation is
 * the shared lwt_is_valid_access(), context accesses go through the
 * generic bpf_convert_ctx_access(), and the prologue generator is
 * tc_cls_act_prologue().
 */
const struct bpf_verifier_ops lwt_xmit_verifier_ops = {
        .get_func_proto     = lwt_xmit_func_proto,
        .is_valid_access    = lwt_is_valid_access,
        .convert_ctx_access = bpf_convert_ctx_access,
        .gen_prologue       = tc_cls_act_prologue,
};

/* Program ops for lwt_xmit: test runs use bpf_prog_test_run_skb(). */
const struct bpf_prog_ops lwt_xmit_prog_ops = {
        .test_run = bpf_prog_test_run_skb,
};

/*
 * Verifier hooks registered under the lwt_seg6local name.  Access
 * validation is the shared lwt_is_valid_access(); context accesses go
 * through the generic bpf_convert_ctx_access().
 */
const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
        .get_func_proto     = lwt_seg6local_func_proto,
        .is_valid_access    = lwt_is_valid_access,
        .convert_ctx_access = bpf_convert_ctx_access,
};

/* Program ops for lwt_seg6local: test runs use bpf_prog_test_run_skb(). */
const struct bpf_prog_ops lwt_seg6local_prog_ops = {
        .test_run = bpf_prog_test_run_skb,
};

/*
 * Verifier hooks registered under the cg_sock name: sock_filter_* helper
 * lookup and access validation, with bpf_sock_convert_ctx_access() as the
 * context rewriter.
 */
const struct bpf_verifier_ops cg_sock_verifier_ops = {
        .get_func_proto     = sock_filter_func_proto,
        .is_valid_access    = sock_filter_is_valid_access,
        .convert_ctx_access = bpf_sock_convert_ctx_access,
};

/* Program ops for cg_sock: no callbacks are set. */
const struct bpf_prog_ops cg_sock_prog_ops = {
};

/*
 * Verifier hooks registered under the cg_sock_addr name; all three
 * callbacks are the sock_addr_* implementations.
 */
const struct bpf_verifier_ops cg_sock_addr_verifier_ops = {
        .get_func_proto     = sock_addr_func_proto,
        .is_valid_access    = sock_addr_is_valid_access,
        .convert_ctx_access = sock_addr_convert_ctx_access,
};

/* Program ops for cg_sock_addr: no callbacks are set. */
const struct bpf_prog_ops cg_sock_addr_prog_ops = {
};

/*
 * Verifier hooks registered under the sock_ops name; all three callbacks
 * are the sock_ops_* implementations.
 */
const struct bpf_verifier_ops sock_ops_verifier_ops = {
        .get_func_proto     = sock_ops_func_proto,
        .is_valid_access    = sock_ops_is_valid_access,
        .convert_ctx_access = sock_ops_convert_ctx_access,
};

/* Program ops for sock_ops: no callbacks are set. */
const struct bpf_prog_ops sock_ops_prog_ops = {
};

/*
 * Verifier hooks registered under the sk_skb name: sk_skb_* helper lookup,
 * access validation, context rewriter and prologue generator.
 */
const struct bpf_verifier_ops sk_skb_verifier_ops = {
        .get_func_proto     = sk_skb_func_proto,
        .is_valid_access    = sk_skb_is_valid_access,
        .convert_ctx_access = sk_skb_convert_ctx_access,
        .gen_prologue       = sk_skb_prologue,
};

/* Program ops for sk_skb: no callbacks are set. */
const struct bpf_prog_ops sk_skb_prog_ops = {
};

/*
 * Verifier hooks registered under the sk_msg name: sk_msg_* helper lookup,
 * access validation and context rewriter, with bpf_noop_prologue() as the
 * prologue hook.
 */
const struct bpf_verifier_ops sk_msg_verifier_ops = {
        .get_func_proto     = sk_msg_func_proto,
        .is_valid_access    = sk_msg_is_valid_access,
        .convert_ctx_access = sk_msg_convert_ctx_access,
        .gen_prologue       = bpf_noop_prologue,
};

/* Program ops for sk_msg: no callbacks are set. */
const struct bpf_prog_ops sk_msg_prog_ops = {
};

/*
 * Verifier hooks registered under the flow_dissector name; all three
 * callbacks are the flow_dissector_* implementations.
 */
const struct bpf_verifier_ops flow_dissector_verifier_ops = {
        .get_func_proto     = flow_dissector_func_proto,
        .is_valid_access    = flow_dissector_is_valid_access,
        .convert_ctx_access = flow_dissector_convert_ctx_access,
};

/*
 * Program ops for flow_dissector: test runs use
 * bpf_prog_test_run_flow_dissector().
 */
const struct bpf_prog_ops flow_dissector_prog_ops = {
        .test_run = bpf_prog_test_run_flow_dissector,
};

/*
 * Remove the filter currently attached to @sk.
 *
 * The filter pointer is read with rcu_dereference_protected() against
 * lockdep_sock_is_held(), so the caller is expected to hold the socket
 * lock.
 *
 * Return: 0 when a filter was detached and uncharged, -EPERM when the
 * socket has SOCK_FILTER_LOCKED set, -ENOENT when no filter is attached.
 */
int sk_detach_filter(struct sock *sk)
{
        struct sk_filter *old;

        if (sock_flag(sk, SOCK_FILTER_LOCKED))
                return -EPERM;

        old = rcu_dereference_protected(sk->sk_filter,
                                        lockdep_sock_is_held(sk));
        if (!old)
                return -ENOENT;

        /* Unpublish first, then give back what the filter was charged. */
        RCU_INIT_POINTER(sk->sk_filter, NULL);
        sk_filter_uncharge(sk, old);

        return 0;
}
EXPORT_SYMBOL_GPL(sk_detach_filter);

/*
 * Copy the originally attached classic filter of @sk out to @optval.
 *
 * @len is the capacity of @optval counted in filter blocks; passing 0 only
 * queries how many blocks the filter has.  The whole operation runs between
 * sockopt_lock_sock() and sockopt_release_sock().
 *
 * Return: the number of filter blocks (0 when no filter is attached), or
 *   -EACCES  the attached program has no saved original program,
 *   -EINVAL  @len is non-zero but smaller than the filter,
 *   -EFAULT  copying to @optval failed.
 */
int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len)
{
        struct sock_fprog_kern *fprog;
        struct sk_filter *filter;
        int ret = 0;

        sockopt_lock_sock(sk);

        filter = rcu_dereference_protected(sk->sk_filter,
                                           lockdep_sock_is_held(sk));
        if (!filter)
                goto unlock;

        /* We're copying the filter that has been originally attached,
         * so no conversion/decode needed anymore. eBPF programs that
         * have no original program cannot be dumped through this.
         */
        fprog = filter->prog->orig_prog;
        if (!fprog) {
                ret = -EACCES;
                goto unlock;
        }

        /* Instead of bytes, the API requests to return the number
         * of filter blocks.
         */
        ret = fprog->len;

        /* User space only enquires number of filter blocks. */
        if (!len)
                goto unlock;

        if (len < fprog->len) {
                ret = -EINVAL;
                goto unlock;
        }

        if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog)))
                ret = -EFAULT;

unlock:
        sockopt_release_sock(sk);
        return ret;
}

#ifdef CONFIG_INET
/*
 * Fill in @reuse_kern, the context handed to a reuseport program.
 *
 * The caller's arguments are stored as given, selected_sk starts out NULL,
 * data_end is set to skb->data + skb_headlen(skb), and reuseport_id and
 * bind_inany are copied from @reuse.
 */
static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
                                    struct sock_reuseport *reuse,
                                    struct sock *sk, struct sk_buff *skb,
                                    struct sock *migrating_sk,
                                    u32 hash)
{
        /* Nothing has been selected yet. */
        reuse_kern->selected_sk = NULL;

        /* Values handed in by the caller. */
        reuse_kern->sk = sk;
        reuse_kern->migrating_sk = migrating_sk;
        reuse_kern->hash = hash;

        /* Packet and the end of its directly addressable data. */
        reuse_kern->skb = skb;
        reuse_kern->data_end = skb->data + skb_headlen(skb);

        /* Properties copied from the reuseport group. */
        reuse_kern->reuseport_id = reuse->reuseport_id;
        reuse_kern->bind_inany = reuse->bind_inany;
}

/*
 * Run the reuseport program @prog on a freshly initialised context and
 * report its choice.
 *
 * Return: the context's selected_sk (which is still NULL if the program
 * never set it) when the program returns SK_PASS, otherwise
 * ERR_PTR(-ECONNREFUSED).
 */
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                                  struct bpf_prog *prog, struct sk_buff *skb,
                                  struct sock *migrating_sk,
                                  u32 hash)
{
        struct sk_reuseport_kern ctx;
        enum sk_action verdict;

        bpf_init_reuseport_kern(&ctx, reuse, sk, skb, migrating_sk, hash);
        verdict = bpf_prog_run(prog, &ctx);

        if (verdict != SK_PASS)
                return ERR_PTR(-ECONNREFUSED);

        return ctx.selected_sk;
}

/*
 * Helper: look up @key in @map and, if the socket found belongs to the
 * same reuseport group as the context, record it as the selected socket.
 *
 * Return: 0 on success, or
 *   -ENOENT       no entry for @key, or a reuseport sockarray entry whose
 *                 socket no longer has a reuseport group,
 *   -EINVAL       a socket from another map type without a reuseport group,
 *   -EPROTOTYPE   group mismatch and the protocols differ,
 *   -EAFNOSUPPORT group mismatch and the address families differ,
 *   -EBADFD       any other group mismatch.
 */
BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
           struct bpf_map *, map, void *, key, u32, flags)
{
        struct sock_reuseport *group;
        struct sock *candidate;
        struct sock *self;

        candidate = map->ops->map_lookup_elem(map, key);
        if (!candidate)
                return -ENOENT;

        group = rcu_dereference(candidate->sk_reuseport_cb);
        if (!group) {
                /* Lookup in sock_map can return TCP ESTABLISHED sockets. */
                if (sk_is_refcounted(candidate))
                        sock_put(candidate);

                /* reuseport_array has only sk with non NULL sk_reuseport_cb.
                 * The only (!reuse) case here is - the sk has already been
                 * unhashed (e.g. by close()), so treat it as -ENOENT.
                 *
                 * Other maps (e.g. sock_map) do not provide this guarantee and
                 * the sk may never be in the reuseport group to begin with.
                 */
                if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
                        return -ENOENT;
                return -EINVAL;
        }

        if (likely(group->reuseport_id == reuse_kern->reuseport_id)) {
                reuse_kern->selected_sk = candidate;
                return 0;
        }

        /* Different group: tell the program the most specific reason. */
        self = reuse_kern->sk;
        if (self->sk_protocol != candidate->sk_protocol)
                return -EPROTOTYPE;
        if (self->sk_family != candidate->sk_family)
                return -EAFNOSUPPORT;

        /* Catch all. Likely bound to a different sockaddr. */
        return -EBADFD;
}

/*
 * Prototype for sk_select_reuseport(): (ctx, const map pointer, map key,
 * any value) -> integer; not restricted to GPL programs.
 */
static const struct bpf_func_proto sk_select_reuseport_proto = {
        .func      = sk_select_reuseport,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_CONST_MAP_PTR,
        .arg3_type = ARG_PTR_TO_MAP_KEY,
        .arg4_type = ARG_ANYTHING,
};

/*
 * Helper: forward to ____bpf_skb_load_bytes() using the skb stored in the
 * reuseport context, passing @offset, @to and @len through unchanged.
 */
BPF_CALL_4(sk_reuseport_load_bytes,
           const struct sk_reuseport_kern *, reuse_kern, u32, offset,
           void *, to, u32, len)
{
        struct sk_buff *skb = reuse_kern->skb;

        return ____bpf_skb_load_bytes(skb, offset, to, len);
}

/* Prototype of sk_reuseport_load_bytes(): context, arbitrary offset, an
 * uninitialized destination buffer and its constant size; integer result.
 */
static const struct bpf_func_proto sk_reuseport_load_bytes_proto = {
        .func = sk_reuseport_load_bytes,
        .gpl_only = false,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type = ARG_CONST_SIZE,
        .ret_type = RET_INTEGER,
};

/* skb_load_bytes_relative variant for sk_reuseport programs: resolves the
 * packet from reuse_kern->skb and defers to the common
 * ____bpf_skb_load_bytes_relative() with the caller's start_header.
 */
BPF_CALL_5(sk_reuseport_load_bytes_relative,
           const struct sk_reuseport_kern *, reuse_kern, u32, offset,
           void *, to, u32, len, u32, start_header)
{
        struct sk_buff *skb = reuse_kern->skb;

        return ____bpf_skb_load_bytes_relative(skb, offset, to, len,
                                               start_header);
}

/* Prototype of sk_reuseport_load_bytes_relative(): same as the plain
 * load_bytes prototype plus a fifth, unconstrained start_header argument.
 */
static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = {
        .func = sk_reuseport_load_bytes_relative,
        .gpl_only = false,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type = ARG_CONST_SIZE,
        .arg5_type = ARG_ANYTHING,
        .ret_type = RET_INTEGER,
};

/* Map a helper ID to the prototype sk_reuseport programs may use.  IDs that
 * are not special-cased here are resolved by bpf_base_func_proto().
 */
static const struct bpf_func_proto *
sk_reuseport_func_proto(enum bpf_func_id func_id,
                        const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_sk_select_reuseport:
                proto = &sk_select_reuseport_proto;
                break;
        case BPF_FUNC_skb_load_bytes:
                proto = &sk_reuseport_load_bytes_proto;
                break;
        case BPF_FUNC_skb_load_bytes_relative:
                proto = &sk_reuseport_load_bytes_relative_proto;
                break;
        case BPF_FUNC_get_socket_cookie:
                proto = &bpf_get_socket_ptr_cookie_proto;
                break;
        case BPF_FUNC_ktime_get_coarse_ns:
                proto = &bpf_ktime_get_coarse_ns_proto;
                break;
        default:
                proto = bpf_base_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Decide whether a context access at @off of @size bytes is permitted for
 * sk_reuseport programs.  Only reads inside struct sk_reuseport_md that are
 * aligned to their own size are considered.  Pointer-like fields must be
 * read as a full __u64 and get their register type recorded in @info; hash
 * must be read as a full __u32; the remaining scalar fields allow narrow
 * loads.
 */
static bool
sk_reuseport_is_valid_access(int off, int size,
                             enum bpf_access_type type,
                             const struct bpf_prog *prog,
                             struct bpf_insn_access_aux *info)
{
        const u32 word = sizeof(__u32);

        if (off < 0 || off >= sizeof(struct sk_reuseport_md))
                return false;
        if (off % size)
                return false;
        if (type != BPF_READ)
                return false;

        switch (off) {
        /* Pointer fields: record the register type, size is checked below */
        case offsetof(struct sk_reuseport_md, data):
                info->reg_type = PTR_TO_PACKET;
                break;

        case offsetof(struct sk_reuseport_md, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;

        case offsetof(struct sk_reuseport_md, sk):
                info->reg_type = PTR_TO_SOCKET;
                break;

        case offsetof(struct sk_reuseport_md, migrating_sk):
                info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
                break;

        case offsetof(struct sk_reuseport_md, hash):
                return size == word;

        /* Fields that allow narrowing */
        case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
                /* ... but never narrower than skb->protocol itself */
                if (size < sizeof_field(struct sk_buff, protocol))
                        return false;
                fallthrough;
        case bpf_ctx_range(struct sk_reuseport_md, ip_protocol):
        case bpf_ctx_range(struct sk_reuseport_md, bind_inany):
        case bpf_ctx_range(struct sk_reuseport_md, len):
                bpf_ctx_record_field_size(info, word);
                return bpf_ctx_narrow_access_ok(off, size, word);

        default:
                return false;
        }

        /* Only the pointer fields get here; they need a full 64-bit load */
        return size == sizeof(__u64);
}

/* Emit one BPF_LDX_MEM instruction into *insn (advancing it) that loads
 * member F of struct sk_reuseport_kern from the address in si->src_reg into
 * si->dst_reg, using the member's own size.
 *
 * Relies on `insn`, `si` and `target_size` being in scope at the expansion
 * site.  NOTE(review): bpf_target_off() presumably also reports the member
 * size through target_size - confirm against its definition.
 */
#define SK_REUSEPORT_LOAD_FIELD(F) ({                                        \
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
                              si->dst_reg, si->src_reg,                        \
                              bpf_target_off(struct sk_reuseport_kern, F, \
                                             sizeof_field(struct sk_reuseport_kern, F), \
                                             target_size));                \
        })

/* Load SKB_FIELD of the struct sk_buff referenced by the `skb` member of
 * struct sk_reuseport_kern, by delegating to SOCK_ADDR_LOAD_NESTED_FIELD().
 * NOTE(review): the exact instruction sequence is defined by that macro,
 * which is not visible here.
 */
#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD)                                \
        SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern,                \
                                    struct sk_buff,                        \
                                    skb,                                \
                                    SKB_FIELD)

/* Load SK_FIELD of the struct sock referenced by the `sk` member of
 * struct sk_reuseport_kern, by delegating to SOCK_ADDR_LOAD_NESTED_FIELD().
 * NOTE(review): the exact instruction sequence is defined by that macro,
 * which is not visible here.
 */
#define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD)                                \
        SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern,                \
                                    struct sock,                        \
                                    sk,                                        \
                                    SK_FIELD)

/* Rewrite a load from struct sk_reuseport_md (the program-visible context)
 * into instruction(s) that read the matching data reachable from
 * struct sk_reuseport_kern.
 *
 * @type:        access type; not examined in this function
 * @si:          the original instruction; si->off selects the field,
 *               si->src_reg / si->dst_reg are used by the load macros
 * @insn_buf:    output buffer for the replacement instruction(s)
 * @prog:        not examined in this function
 * @target_size: passed to the load macros (used implicitly by name)
 *
 * Returns the number of instructions written to @insn_buf.  An offset with
 * no matching case writes nothing and yields 0.
 */
static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
                                           const struct bpf_insn *si,
                                           struct bpf_insn *insn_buf,
                                           struct bpf_prog *prog,
                                           u32 *target_size)
{
        /* The SK_REUSEPORT_LOAD_* macros append through this cursor */
        struct bpf_insn *insn = insn_buf;

        switch (si->off) {
        /* data, len and eth_protocol are read from the skb */
        case offsetof(struct sk_reuseport_md, data):
                SK_REUSEPORT_LOAD_SKB_FIELD(data);
                break;

        case offsetof(struct sk_reuseport_md, len):
                SK_REUSEPORT_LOAD_SKB_FIELD(len);
                break;

        case offsetof(struct sk_reuseport_md, eth_protocol):
                SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
                break;

        /* ip_protocol is read from the socket's sk_protocol */
        case offsetof(struct sk_reuseport_md, ip_protocol):
                SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol);
                break;

        /* The remaining fields live directly in struct sk_reuseport_kern */
        case offsetof(struct sk_reuseport_md, data_end):
                SK_REUSEPORT_LOAD_FIELD(data_end);
                break;

        case offsetof(struct sk_reuseport_md, hash):
                SK_REUSEPORT_LOAD_FIELD(hash);
                break;

        case offsetof(struct sk_reuseport_md, bind_inany):
                SK_REUSEPORT_LOAD_FIELD(bind_inany);
                break;

        case offsetof(struct sk_reuseport_md, sk):
                SK_REUSEPORT_LOAD_FIELD(sk);
                break;

        case offsetof(struct sk_reuseport_md, migrating_sk):
                SK_REUSEPORT_LOAD_FIELD(migrating_sk);
                break;
        }

        return insn - insn_buf;
}

/* Verifier callbacks for sk_reuseport programs: context access validation,
 * context access rewriting and helper lookup.
 */
const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
        .is_valid_access = sk_reuseport_is_valid_access,
        .convert_ctx_access = sk_reuseport_convert_ctx_access,
        .get_func_proto = sk_reuseport_func_proto,
};

/* Program ops for sk_reuseport programs; no callbacks are set. */
const struct bpf_prog_ops sk_reuseport_prog_ops = {
};

/* Static key, defined in its "false" state and exported to modules.
 * NOTE(review): presumably flipped on while sk_lookup programs are attached;
 * the code that toggles it is not in this part of the file.
 */
DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled);
EXPORT_SYMBOL(bpf_sk_lookup_enabled);

/* Record @sk (which may be NULL) as the result of the socket lookup in @ctx.
 *
 * Returns 0 on success, or:
 *   -EINVAL          unknown bits set in @flags
 *   -ESOCKTNOSUPPORT @sk is refcounted, a TCP socket not in LISTEN, or a
 *                    UDP socket not in CLOSE
 *   -EPROTOTYPE      @sk's protocol differs from the packet's
 *   -EAFNOSUPPORT    @sk's family cannot serve the packet's family
 *   -EEXIST          a socket is already selected and
 *                    BPF_SK_LOOKUP_F_REPLACE was not given
 */
BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx,
           struct sock *, sk, u64, flags)
{
        const u64 known_flags = BPF_SK_LOOKUP_F_REPLACE |
                                BPF_SK_LOOKUP_F_NO_REUSEPORT;

        if (unlikely(flags & ~known_flags))
                return -EINVAL;

        if (sk) {
                /* reject non-RCU freed sockets */
                if (unlikely(sk_is_refcounted(sk)))
                        return -ESOCKTNOSUPPORT;
                /* only accept TCP socket in LISTEN */
                if (unlikely(sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN))
                        return -ESOCKTNOSUPPORT;
                /* only accept UDP socket in CLOSE */
                if (unlikely(sk_is_udp(sk) && sk->sk_state != TCP_CLOSE))
                        return -ESOCKTNOSUPPORT;

                /* The socket has to match the packet's L4 protocol ... */
                if (sk->sk_protocol != ctx->protocol)
                        return -EPROTOTYPE;
                /* ... and its L3 family, unless it is an AF_INET6 socket
                 * that is not IPv6-only.
                 */
                if (sk->sk_family != ctx->family &&
                    (sk->sk_family == AF_INET || ipv6_only_sock(sk)))
                        return -EAFNOSUPPORT;
        }

        /* An earlier selection is only overridden on explicit request */
        if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE))
                return -EEXIST;

        ctx->selected_sk = sk;
        ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT;

        return 0;
}

/* Prototype of bpf_sk_lookup_assign(): context, a socket pointer that may
 * be NULL, and unconstrained flags; integer result.
 */
static const struct bpf_func_proto bpf_sk_lookup_assign_proto = {
        .func = bpf_sk_lookup_assign,
        .gpl_only = false,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL,
        .arg3_type = ARG_ANYTHING,
        .ret_type = RET_INTEGER,
};

/* Map a helper ID to the prototype sk_lookup programs may use.  Anything
 * not special-cased here is resolved by bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_perf_event_output:
                proto = &bpf_event_output_data_proto;
                break;
        case BPF_FUNC_sk_assign:
                proto = &bpf_sk_lookup_assign_proto;
                break;
        case BPF_FUNC_sk_release:
                proto = &bpf_sk_release_proto;
                break;
        default:
                proto = bpf_sk_base_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Decide whether a context access at @off of @size bytes is permitted for
 * sk_lookup programs.  Only size-aligned reads within struct bpf_sk_lookup
 * are considered.  The sk field must be read as a full __u64 and is typed
 * as a socket-or-NULL pointer; the other fields permit narrow loads, with
 * special handling for remote_port and the padding that follows it.
 */
static bool sk_lookup_is_valid_access(int off, int size,
                                      enum bpf_access_type type,
                                      const struct bpf_prog *prog,
                                      struct bpf_insn_access_aux *info)
{
        if (off < 0 || off >= sizeof(struct bpf_sk_lookup) ||
            off % size != 0 || type != BPF_READ)
                return false;

        switch (off) {
        case offsetof(struct bpf_sk_lookup, sk):
                info->reg_type = PTR_TO_SOCKET_OR_NULL;
                return size == sizeof(__u64);

        /* 32-bit fields that may be read in narrower pieces */
        case bpf_ctx_range(struct bpf_sk_lookup, family):
        case bpf_ctx_range(struct bpf_sk_lookup, protocol):
        case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4):
        case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
        case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
        case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
        case bpf_ctx_range(struct bpf_sk_lookup, local_port):
        case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
                bpf_ctx_record_field_size(info, sizeof(__u32));
                return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));

        case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
                /* A 4-byte load of this 2-byte field stays legal for
                 * backward compatibility.
                 */
                if (size == sizeof(__u32))
                        return true;
                bpf_ctx_record_field_size(info, sizeof(__be16));
                return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16));

        case offsetofend(struct bpf_sk_lookup, remote_port) ...
             offsetof(struct bpf_sk_lookup, local_ip4) - 1:
                /* The zero padding after remote_port stays readable for
                 * backward compatibility.
                 */
                bpf_ctx_record_field_size(info, sizeof(__u16));
                return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16));

        default:
                return false;
        }
}

/* Rewrite a load from struct bpf_sk_lookup (the program-visible context)
 * into instruction(s) that read the matching member of
 * struct bpf_sk_lookup_kern.
 *
 * @type:        access type; not examined in this function
 * @si:          the original instruction; si->off selects the field
 * @insn_buf:    output buffer for the replacement instruction(s)
 * @prog:        not examined in this function
 * @target_size: handed to bpf_target_off(), or set directly for the padding
 *               after remote_port
 *
 * Returns the number of instructions written to @insn_buf.  An offset with
 * no matching case writes nothing and yields 0.
 */
static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
                                        const struct bpf_insn *si,
                                        struct bpf_insn *insn_buf,
                                        struct bpf_prog *prog,
                                        u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

        switch (si->off) {
        case offsetof(struct bpf_sk_lookup, sk):
                /* Pointer-sized load of the currently selected socket */
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sk_lookup_kern, selected_sk));
                break;

        case offsetof(struct bpf_sk_lookup, family):
                /* Loaded as a 2-byte (BPF_H) value */
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     family, 2, target_size));
                break;

        case offsetof(struct bpf_sk_lookup, protocol):
                /* Loaded as a 2-byte (BPF_H) value */
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     protocol, 2, target_size));
                break;

        case offsetof(struct bpf_sk_lookup, remote_ip4):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     v4.saddr, 4, target_size));
                break;

        case offsetof(struct bpf_sk_lookup, local_ip4):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     v4.daddr, 4, target_size));
                break;

        case bpf_ctx_range_till(struct bpf_sk_lookup,
                                remote_ip6[0], remote_ip6[3]): {
#if IS_ENABLED(CONFIG_IPV6)
                int off = si->off;

                /* Translate the context offset into an offset of the same
                 * 32-bit word inside struct in6_addr.
                 */
                off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]);
                off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
                /* Load the v6.saddr pointer; when it is NULL, jump over the
                 * dereference so dst_reg keeps the value 0.
                 */
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sk_lookup_kern, v6.saddr));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
#else
                /* Without IPv6 support the field always reads as 0 */
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;
        }
        case bpf_ctx_range_till(struct bpf_sk_lookup,
                                local_ip6[0], local_ip6[3]): {
#if IS_ENABLED(CONFIG_IPV6)
                int off = si->off;

                /* Same scheme as remote_ip6, but through v6.daddr */
                off -= offsetof(struct bpf_sk_lookup, local_ip6[0]);
                off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sk_lookup_kern, v6.daddr));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
#else
                /* Without IPv6 support the field always reads as 0 */
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;
        }
        case offsetof(struct bpf_sk_lookup, remote_port):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     sport, 2, target_size));
                break;

        case offsetofend(struct bpf_sk_lookup, remote_port):
                /* Bytes right after remote_port: a 2-byte field that always
                 * reads as 0, with no memory load emitted.
                 */
                *target_size = 2;
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
                break;

        case offsetof(struct bpf_sk_lookup, local_port):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     dport, 2, target_size));
                break;

        case offsetof(struct bpf_sk_lookup, ingress_ifindex):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     ingress_ifindex, 4, target_size));
                break;
        }

        return insn - insn_buf;
}

/* Program ops for sk_lookup programs; only the test-run hook is provided. */
const struct bpf_prog_ops sk_lookup_prog_ops = {
        .test_run = bpf_prog_test_run_sk_lookup,
};

/* Verifier callbacks for sk_lookup programs: context access validation,
 * context access rewriting and helper lookup.
 */
const struct bpf_verifier_ops sk_lookup_verifier_ops = {
        .is_valid_access = sk_lookup_is_valid_access,
        .convert_ctx_access = sk_lookup_convert_ctx_access,
        .get_func_proto = sk_lookup_func_proto,
};

#endif /* CONFIG_INET */

/* Instantiate the BPF dispatcher named "xdp". */
DEFINE_BPF_DISPATCHER(xdp)

/* Tell the xdp dispatcher that @prev_prog is being replaced by @prog by
 * forwarding both to bpf_dispatcher_change_prog().
 */
void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
{
        bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
}

/* Global BTF ID list btf_sock_ids with MAX_BTF_SOCK_TYPE entries.  While
 * BTF_SOCK_TYPE is temporarily defined, BTF_SOCK_TYPE_xxx expands to one
 * BTF_ID(struct, type) line per entry it lists.
 */
BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE)
#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type)
BTF_SOCK_TYPE_xxx
#undef BTF_SOCK_TYPE

/* Return @sk when it is a full socket using IPPROTO_TCP over AF_INET6,
 * NULL otherwise (including for a NULL @sk).
 */
BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk)
{
        /* tcp6_sock does not end up in dwarf (and therefore btf) on its
         * own, so force the type to be emitted here.
         */
        BTF_TYPE_EMIT(struct tcp6_sock);

        if (!sk || !sk_fullsock(sk))
                return (unsigned long)NULL;
        if (sk->sk_protocol != IPPROTO_TCP || sk->sk_family != AF_INET6)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Prototype of bpf_skc_to_tcp6_sock(): takes a BTF-typed sock_common
 * pointer and returns a BTF pointer of type BTF_SOCK_TYPE_TCP6 or NULL.
 */
const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = {
        .func = bpf_skc_to_tcp6_sock,
        .gpl_only = false,
        .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
        .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6],
};

/* Return @sk when it is a full socket using IPPROTO_TCP, NULL otherwise
 * (including for a NULL @sk).
 */
BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk)
{
        if (!sk || !sk_fullsock(sk))
                return (unsigned long)NULL;
        if (sk->sk_protocol != IPPROTO_TCP)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Prototype of bpf_skc_to_tcp_sock(): takes a BTF-typed sock_common pointer
 * and returns a BTF pointer of type BTF_SOCK_TYPE_TCP or NULL.
 */
const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = {
        .func = bpf_skc_to_tcp_sock,
        .gpl_only = false,
        .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
        .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
};

/* Return @sk when its proto is tcp_prot (CONFIG_INET) or tcpv6_prot
 * (built-in IPv6) and its state is TCP_TIME_WAIT; NULL otherwise.
 */
BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk)
{
        /* With CONFIG_INET=n no BTF would be generated for
         * tcp_timewait_sock and inet_timewait_sock, so emit both types
         * explicitly.
         */
        BTF_TYPE_EMIT(struct inet_timewait_sock);
        BTF_TYPE_EMIT(struct tcp_timewait_sock);

        if (!sk)
                return (unsigned long)NULL;

#ifdef CONFIG_INET
        if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT)
                return (unsigned long)sk;
#endif

#if IS_BUILTIN(CONFIG_IPV6)
        if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT)
                return (unsigned long)sk;
#endif

        return (unsigned long)NULL;
}

/* Prototype of bpf_skc_to_tcp_timewait_sock(): takes a BTF-typed
 * sock_common pointer and returns a BTF pointer of type
 * BTF_SOCK_TYPE_TCP_TW or NULL.
 */
const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = {
        .func = bpf_skc_to_tcp_timewait_sock,
        .gpl_only = false,
        .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
        .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW],
};

/* Return @sk when its proto is tcp_prot (CONFIG_INET) or tcpv6_prot
 * (built-in IPv6) and its state is TCP_NEW_SYN_RECV; NULL otherwise.
 */
BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk)
{
        if (!sk)
                return (unsigned long)NULL;

#ifdef CONFIG_INET
        if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV)
                return (unsigned long)sk;
#endif

#if IS_BUILTIN(CONFIG_IPV6)
        if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV)
                return (unsigned long)sk;
#endif

        return (unsigned long)NULL;
}

/* Prototype of bpf_skc_to_tcp_request_sock(): takes a BTF-typed sock_common
 * pointer and returns a BTF pointer of type BTF_SOCK_TYPE_TCP_REQ or NULL.
 */
const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = {
        .func = bpf_skc_to_tcp_request_sock,
        .gpl_only = false,
        .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
        .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ],
};

/* Return @sk when it is a full SOCK_DGRAM socket using IPPROTO_UDP over
 * AF_INET6, NULL otherwise (including for a NULL @sk).
 */
BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk)
{
        /* udp6_sock does not end up in dwarf (and therefore btf) on its
         * own, so force the type to be emitted here.
         */
        BTF_TYPE_EMIT(struct udp6_sock);

        if (!sk || !sk_fullsock(sk))
                return (unsigned long)NULL;
        if (sk->sk_protocol != IPPROTO_UDP || sk->sk_type != SOCK_DGRAM)
                return (unsigned long)NULL;
        if (sk->sk_family != AF_INET6)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Prototype of bpf_skc_to_udp6_sock(): takes a BTF-typed sock_common
 * pointer and returns a BTF pointer of type BTF_SOCK_TYPE_UDP6 or NULL.
 */
const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = {
        .func = bpf_skc_to_udp6_sock,
        .gpl_only = false,
        .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
        .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6],
};

/* Return @sk when it is a full AF_UNIX socket, NULL otherwise (including
 * for a NULL @sk).
 */
BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk)
{
        /* unix_sock does not end up in dwarf (and therefore btf) on its
         * own, so force the type to be emitted here.
         */
        BTF_TYPE_EMIT(struct unix_sock);

        if (!sk || !sk_fullsock(sk))
                return (unsigned long)NULL;
        if (sk->sk_family != AF_UNIX)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Prototype of bpf_skc_to_unix_sock(): takes a BTF-typed sock_common
 * pointer and returns a BTF pointer of type BTF_SOCK_TYPE_UNIX or NULL.
 */
const struct bpf_func_proto bpf_skc_to_unix_sock_proto = {
        .func = bpf_skc_to_unix_sock,
        .gpl_only = false,
        .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
        .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX],
};

/* Return whatever bpf_mptcp_sock_from_subflow() yields for @sk, cast to
 * unsigned long.  NOTE(review): NULL handling and the subflow check are up
 * to that helper, which is not visible here.
 */
BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk)
{
        /* Make sure struct mptcp_sock is emitted into BTF */
        BTF_TYPE_EMIT(struct mptcp_sock);
        return (unsigned long)bpf_mptcp_sock_from_subflow(sk);
}

/* Prototype of bpf_skc_to_mptcp_sock(): takes a sock_common pointer
 * (ARG_PTR_TO_SOCK_COMMON, unlike the BTF-typed argument of the other
 * skc_to_* helpers) and returns a BTF pointer of type BTF_SOCK_TYPE_MPTCP
 * or NULL.
 */
const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = {
        .func = bpf_skc_to_mptcp_sock,
        .gpl_only = false,
        .arg1_type = ARG_PTR_TO_SOCK_COMMON,
        .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
        .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP],
};

/* Return the result of sock_from_file(@file), cast to unsigned long. */
BPF_CALL_1(bpf_sock_from_file, struct file *, file)
{
        return (unsigned long)sock_from_file(file);
}

/* BTF IDs used by bpf_sock_from_file_proto: entry 0 is struct socket (the
 * return type), entry 1 is struct file (the argument type).
 */
BTF_ID_LIST(bpf_sock_from_file_btf_ids)
BTF_ID(struct, socket)
BTF_ID(struct, file)

/* Prototype of bpf_sock_from_file(): takes a BTF pointer to struct file
 * (btf_ids entry 1) and returns a BTF pointer to struct socket (entry 0)
 * or NULL.
 */
const struct bpf_func_proto bpf_sock_from_file_proto = {
        .func = bpf_sock_from_file,
        .gpl_only = false,
        .arg1_type = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id = &bpf_sock_from_file_btf_ids[1],
        .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
        .ret_btf_id = &bpf_sock_from_file_btf_ids[0],
};

/* Resolve helper IDs shared by socket-related program types.
 *
 * The skc_to_*() cast helpers are only handed out when the program's token
 * is capable of CAP_PERFMON; otherwise NULL is returned for them.
 * ktime_get_coarse_ns is returned unconditionally, and every other ID is
 * resolved by bpf_base_func_proto().
 */
static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *cast_proto;

        switch (func_id) {
        case BPF_FUNC_ktime_get_coarse_ns:
                return &bpf_ktime_get_coarse_ns_proto;
        case BPF_FUNC_skc_to_tcp6_sock:
                cast_proto = &bpf_skc_to_tcp6_sock_proto;
                break;
        case BPF_FUNC_skc_to_tcp_sock:
                cast_proto = &bpf_skc_to_tcp_sock_proto;
                break;
        case BPF_FUNC_skc_to_tcp_timewait_sock:
                cast_proto = &bpf_skc_to_tcp_timewait_sock_proto;
                break;
        case BPF_FUNC_skc_to_tcp_request_sock:
                cast_proto = &bpf_skc_to_tcp_request_sock_proto;
                break;
        case BPF_FUNC_skc_to_udp6_sock:
                cast_proto = &bpf_skc_to_udp6_sock_proto;
                break;
        case BPF_FUNC_skc_to_unix_sock:
                cast_proto = &bpf_skc_to_unix_sock_proto;
                break;
        case BPF_FUNC_skc_to_mptcp_sock:
                cast_proto = &bpf_skc_to_mptcp_sock_proto;
                break;
        default:
                return bpf_base_func_proto(func_id, prog);
        }

        /* Only the cast helpers reach this point */
        return bpf_token_capable(prog->aux->token, CAP_PERFMON) ?
               cast_proto : NULL;
}

__bpf_kfunc_start_defs();
/* Initialise @ptr__uninit as an skb dynptr covering bytes [0, skb->len).
 * @flags must be 0; otherwise the dynptr is set to null and -EINVAL is
 * returned.  Returns 0 on success.
 */
__bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags,
                                    struct bpf_dynptr_kern *ptr__uninit)
{
        if (!flags) {
                bpf_dynptr_init(ptr__uninit, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);
                return 0;
        }

        bpf_dynptr_set_null(ptr__uninit);

        return -EINVAL;
}

/* Initialise @ptr__uninit as an xdp dynptr covering bytes
 * [0, xdp_get_buff_len(xdp)).  @flags must be 0; otherwise the dynptr is
 * set to null and -EINVAL is returned.  Returns 0 on success.
 */
__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_buff *xdp, u64 flags,
                                    struct bpf_dynptr_kern *ptr__uninit)
{
        if (!flags) {
                bpf_dynptr_init(ptr__uninit, xdp, BPF_DYNPTR_TYPE_XDP, 0,
                                xdp_get_buff_len(xdp));
                return 0;
        }

        bpf_dynptr_set_null(ptr__uninit);

        return -EINVAL;
}

/* Overwrite the sun_path of the unix address held in @sa_kern with the
 * first @sun_path__sz bytes of @sun_path and update the stored address
 * length to match.
 *
 * Returns 0 on success, or -EINVAL when the socket is not AF_UNIX or when
 * @sun_path__sz is 0 or exceeds UNIX_PATH_MAX.
 */
__bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
                                           const u8 *sun_path, u32 sun_path__sz)
{
        struct sockaddr_un *addr;

        /* Besides requiring an AF_UNIX socket, neither an unnamed address
         * nor one longer than a unix sockaddr can hold is accepted.
         */
        if (sa_kern->sk->sk_family != AF_UNIX ||
            sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX)
                return -EINVAL;

        addr = (struct sockaddr_un *)sa_kern->uaddr;
        memcpy(addr->sun_path, sun_path, sun_path__sz);
        sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz;

        return 0;
}

/* Allocate a TCP request socket for listener @sk, fill it from @attrs and
 * attach it to @skb.
 *
 * @skb:       packet to attach the request socket to; must be at tc ingress
 * @sk:        listening socket; must be SOCK_STREAM in TCP_LISTEN, not MPTCP,
 *             and in the same netns as skb->dev
 * @attrs:     connection attributes (mss, window scaling, SACK, timestamps,
 *             ECN); the three reserved entries must be 0
 * @attrs__sz: must equal sizeof(*attrs)
 *
 * Returns 0 on success, or:
 *   -EINVAL      bad @attrs__sz or non-zero reserved fields, @skb not at tc
 *                ingress, unsupported skb->protocol, unsuitable @sk, mss
 *                below the per-family minimum, a window scale above
 *                TCP_MAX_WSCALE, or an option requested while the matching
 *                sysctl is off
 *   -ENETUNREACH @skb's device and @sk are in different network namespaces
 *   -ENOMEM      the request socket could not be allocated
 *   -EOPNOTSUPP  built without CONFIG_SYN_COOKIES
 */
__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct sk_buff *skb, struct sock *sk,
                                        struct bpf_tcp_req_attrs *attrs, int attrs__sz)
{
#if IS_ENABLED(CONFIG_SYN_COOKIES)
        const struct request_sock_ops *ops;
        struct inet_request_sock *ireq;
        struct tcp_request_sock *treq;
        struct request_sock *req;
        struct net *net;
        __u16 min_mss;
        u32 tsoff = 0;

        if (attrs__sz != sizeof(*attrs) ||
            attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2])
                return -EINVAL;

        if (!skb_at_tc_ingress(skb))
                return -EINVAL;

        net = dev_net(skb->dev);
        if (net != sock_net(sk))
                return -ENETUNREACH;

        /* Pick the request sock ops and the minimum acceptable mss by the
         * packet's L3 protocol; IPv6 is only handled when built in.
         */
        switch (skb->protocol) {
        case htons(ETH_P_IP):
                ops = &tcp_request_sock_ops;
                min_mss = 536;
                break;
#if IS_BUILTIN(CONFIG_IPV6)
        case htons(ETH_P_IPV6):
                ops = &tcp6_request_sock_ops;
                min_mss = IPV6_MIN_MTU - 60;
                break;
#endif
        default:
                return -EINVAL;
        }

        if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN ||
            sk_is_mptcp(sk))
                return -EINVAL;

        if (attrs->mss < min_mss)
                return -EINVAL;

        /* Each requested TCP option must be enabled by its netns sysctl */
        if (attrs->wscale_ok) {
                if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling))
                        return -EINVAL;

                if (attrs->snd_wscale > TCP_MAX_WSCALE ||
                    attrs->rcv_wscale > TCP_MAX_WSCALE)
                        return -EINVAL;
        }

        if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack))
                return -EINVAL;

        if (attrs->tstamp_ok) {
                if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps))
                        return -EINVAL;

                /* Timestamp offset: echoed value minus the current clock,
                 * in the unit selected by usec_ts_ok.
                 */
                tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns());
        }

        req = inet_reqsk_alloc(ops, sk, false);
        if (!req)
                return -ENOMEM;

        ireq = inet_rsk(req);
        treq = tcp_rsk(req);

        req->rsk_listener = sk;
        req->syncookie = 1;
        req->mss = attrs->mss;
        req->ts_recent = attrs->rcv_tsval;

        ireq->snd_wscale = attrs->snd_wscale;
        ireq->rcv_wscale = attrs->rcv_wscale;
        ireq->tstamp_ok        = !!attrs->tstamp_ok;
        ireq->sack_ok = !!attrs->sack_ok;
        ireq->wscale_ok = !!attrs->wscale_ok;
        ireq->ecn_ok = !!attrs->ecn_ok;

        treq->req_usec_ts = !!attrs->usec_ts_ok;
        treq->ts_off = tsoff;

        /* Drop any previous owner of the skb and attach the request socket,
         * with sock_pfree as the skb destructor.
         */
        skb_orphan(skb);
        skb->sk = req_to_sk(req);
        skb->destructor = sock_pfree;

        return 0;
#else
        return -EOPNOTSUPP;
#endif
}

__bpf_kfunc_end_defs();

/* Same as bpf_dynptr_from_skb(), but on success the resulting dynptr is
 * additionally marked read-only.  Returns 0 on success or the error from
 * bpf_dynptr_from_skb().
 */
int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
                               struct bpf_dynptr_kern *ptr__uninit)
{
        int err = bpf_dynptr_from_skb(skb, flags, ptr__uninit);

        if (!err)
                bpf_dynptr_set_rdonly(ptr__uninit);

        return err;
}

/* kfunc ID set holding bpf_dynptr_from_skb */
BTF_KFUNCS_START(bpf_kfunc_check_set_skb)
BTF_ID_FLAGS(func, bpf_dynptr_from_skb)
BTF_KFUNCS_END(bpf_kfunc_check_set_skb)

/* kfunc ID set holding bpf_dynptr_from_xdp */
BTF_KFUNCS_START(bpf_kfunc_check_set_xdp)
BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
BTF_KFUNCS_END(bpf_kfunc_check_set_xdp)

/* kfunc ID set holding bpf_sock_addr_set_sun_path */
BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr)
BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr)

/* kfunc ID set holding bpf_sk_assign_tcp_reqsk, flagged KF_TRUSTED_ARGS */
BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk)
BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk)

/* Registration record for the skb kfunc ID set */
static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
        .set   = &bpf_kfunc_check_set_skb,
        .owner = THIS_MODULE,
};

/* Registration record for the xdp kfunc ID set */
static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
        .set   = &bpf_kfunc_check_set_xdp,
        .owner = THIS_MODULE,
};

/* Registration record for the sock_addr kfunc ID set */
static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = {
        .set   = &bpf_kfunc_check_set_sock_addr,
        .owner = THIS_MODULE,
};

/* Registration record for the tcp_reqsk kfunc ID set */
static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = {
        .set   = &bpf_kfunc_check_set_tcp_reqsk,
        .owner = THIS_MODULE,
};

/* Register the kfunc ID sets defined above with the program types that may
 * call them.  Registration stops at the first failure and that error is
 * returned; 0 means every registration succeeded.
 */
static int __init bpf_kfunc_init(void)
{
        /* Program types that get the skb kfunc set, in registration order */
        const enum bpf_prog_type skb_prog_types[] = {
                BPF_PROG_TYPE_SCHED_CLS,
                BPF_PROG_TYPE_SCHED_ACT,
                BPF_PROG_TYPE_SK_SKB,
                BPF_PROG_TYPE_SOCKET_FILTER,
                BPF_PROG_TYPE_CGROUP_SKB,
                BPF_PROG_TYPE_LWT_OUT,
                BPF_PROG_TYPE_LWT_IN,
                BPF_PROG_TYPE_LWT_XMIT,
                BPF_PROG_TYPE_LWT_SEG6LOCAL,
                BPF_PROG_TYPE_NETFILTER,
        };
        unsigned int i;
        int err;

        for (i = 0; i < ARRAY_SIZE(skb_prog_types); i++) {
                err = register_btf_kfunc_id_set(skb_prog_types[i],
                                                &bpf_kfunc_set_skb);
                if (err)
                        return err;
        }

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
        if (err)
                return err;

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
                                        &bpf_kfunc_set_sock_addr);
        if (err)
                return err;

        return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
                                         &bpf_kfunc_set_tcp_reqsk);
}
late_initcall(bpf_kfunc_init);

__bpf_kfunc_start_defs();

/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
 *
 * The function expects a non-NULL pointer to a socket, and invokes the
 * protocol specific socket destroy handler (sk_prot->diag_destroy).
 *
 * The helper can only be called from BPF contexts that have acquired the socket
 * locks.
 *
 * Parameters:
 * @sock: Pointer to socket to be destroyed
 *
 * Return:
 * -EOPNOTSUPP if the socket's protocol has no diag_destroy handler, or if
 * the socket is neither IPPROTO_TCP nor IPPROTO_UDP.
 * Otherwise the return value of the protocol's diag_destroy handler.
 */
__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
{
        struct sock *sk = (struct sock *)sock;

        /* The locking semantics that allow for synchronous execution of the
         * destroy handlers are only supported for TCP and UDP.
         * Supporting protocols will need to acquire sock lock in the BPF context
         * prior to invoking this kfunc.
         */
        if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP &&
                                           sk->sk_protocol != IPPROTO_UDP))
                return -EOPNOTSUPP;

        return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
}

__bpf_kfunc_end_defs();

/* kfunc ID set holding bpf_sock_destroy, flagged KF_TRUSTED_ARGS */
BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids)
BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids)

/* Filter callback for bpf_sk_iter_kfunc_set.
 *
 * kfuncs listed in bpf_sk_iter_kfunc_ids may only be used by programs whose
 * expected attach type is BPF_TRACE_ITER; any other kfunc id is let through.
 *
 * Return: 0 if the call is allowed, -EACCES otherwise.
 */
static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
        /* Not one of the socket iterator kfuncs: no restriction applies. */
        if (!btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id))
                return 0;

        return prog->expected_attach_type == BPF_TRACE_ITER ? 0 : -EACCES;
}

/* kfunc set tying bpf_sk_iter_kfunc_ids to the tracing_iter_filter()
 * callback; registered by init_subsystem().
 */
static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = {
        .filter = tracing_iter_filter,
        .set    = &bpf_sk_iter_kfunc_ids,
        .owner  = THIS_MODULE,
};

static int init_subsystem(void)
{
        return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set);
}
late_initcall(init_subsystem);
































































































    1 












    1 






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2002-2004, Instant802 Networks, Inc.
 * Copyright 2008, Jouni Malinen <j@w1.fi>
 * Copyright (C) 2016-2017 Intel Deutschland GmbH
 * Copyright (C) 2020-2023 Intel Corporation
 */

#include <linux/netdevice.h>
#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/compiler.h>
#include <linux/ieee80211.h>
#include <linux/gfp.h>
#include <asm/unaligned.h>
#include <net/mac80211.h>
#include <crypto/aes.h>
#include <crypto/utils.h>

#include "ieee80211_i.h"
#include "michael.h"
#include "tkip.h"
#include "aes_ccm.h"
#include "aes_cmac.h"
#include "aes_gmac.h"
#include "aes_gcm.h"
#include "wpa.h"

/*
 * TX handler that appends the TKIP Michael MIC to the frame in tx->skb.
 *
 * Returns TX_CONTINUE when there is nothing to do (no TKIP key, frame
 * shorter than 24 bytes, no data present, or the hardware-accelerated path
 * needs neither a software MIC nor MIC space) and after the MIC, or zeroed
 * MIC space, has been appended. Returns TX_DROP when the frame is shorter
 * than its 802.11 header or the skb lacks the required head/tail room.
 */
ieee80211_tx_result
ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
{
        u8 *data, *key, *mic;
        size_t data_len;
        unsigned int hdrlen;
        struct ieee80211_hdr *hdr;
        struct sk_buff *skb = tx->skb;
        struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
        int tail;

        hdr = (struct ieee80211_hdr *)skb->data;
        if (!tx->key || tx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP ||
            skb->len < 24 || !ieee80211_is_data_present(hdr->frame_control))
                return TX_CONTINUE;

        hdrlen = ieee80211_hdrlen(hdr->frame_control);
        if (skb->len < hdrlen)
                return TX_DROP;

        /* The MIC covers everything after the 802.11 header */
        data = skb->data + hdrlen;
        data_len = skb->len - hdrlen;

        if (unlikely(info->flags & IEEE80211_TX_INTFL_TKIP_MIC_FAILURE)) {
                /* Need to use software crypto for the test */
                info->control.hw_key = NULL;
        }

        if (info->control.hw_key &&
            (info->flags & IEEE80211_TX_CTL_DONTFRAG ||
             ieee80211_hw_check(&tx->local->hw, SUPPORTS_TX_FRAG)) &&
            !(tx->key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC |
                                     IEEE80211_KEY_FLAG_PUT_MIC_SPACE))) {
                /* hwaccel - with no need for SW-generated MMIC or MIC space */
                return TX_CONTINUE;
        }

        /*
         * Tailroom to verify: the MIC itself and, without a hardware key,
         * the TKIP ICV as well. Only the MIC is actually appended below.
         */
        tail = MICHAEL_MIC_LEN;
        if (!info->control.hw_key)
                tail += IEEE80211_TKIP_ICV_LEN;

        if (WARN(skb_tailroom(skb) < tail ||
                 skb_headroom(skb) < IEEE80211_TKIP_IV_LEN,
                 "mmic: not enough head/tail (%d/%d,%d/%d)\n",
                 skb_headroom(skb), IEEE80211_TKIP_IV_LEN,
                 skb_tailroom(skb), tail))
                return TX_DROP;

        mic = skb_put(skb, MICHAEL_MIC_LEN);

        if (tx->key->conf.flags & IEEE80211_KEY_FLAG_PUT_MIC_SPACE) {
                /* Zeroed MIC can help with debug */
                memset(mic, 0, MICHAEL_MIC_LEN);
                return TX_CONTINUE;
        }

        key = &tx->key->conf.key[NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY];
        michael_mic(key, hdr, data, data_len, mic);
        /* Deliberately corrupt the MIC when a MIC failure test is requested */
        if (unlikely(info->flags & IEEE80211_TX_INTFL_TKIP_MIC_FAILURE))
                mic[0]++;

        return TX_CONTINUE;
}


/*
 * RX handler that verifies and strips the TKIP Michael MIC of a data frame.
 *
 * Returns RX_CONTINUE when no check applies or the MIC matched; on the
 * success path the stored TKIP RX IV is updated from rx->tkip. Returns an
 * RX_DROP_U_* code for short frames, linearize failure, non-zero key index
 * frames on an AP interface, or a MIC failure. MIC failures are reported
 * through cfg80211_michael_mic_failure().
 */
ieee80211_rx_result
ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
{
        u8 *data, *key = NULL;
        size_t data_len;
        unsigned int hdrlen;
        u8 mic[MICHAEL_MIC_LEN];
        struct sk_buff *skb = rx->skb;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;

        /*
         * it makes no sense to check for MIC errors on anything other
         * than data frames.
         */
        if (!ieee80211_is_data_present(hdr->frame_control))
                return RX_CONTINUE;

        /*
         * No way to verify the MIC if the hardware stripped it or
         * the IV with the key index. In this case we have to rely solely
         * on the driver to set RX_FLAG_MMIC_ERROR in the event of a
         * MIC failure report.
         */
        if (status->flag & (RX_FLAG_MMIC_STRIPPED | RX_FLAG_IV_STRIPPED)) {
                if (status->flag & RX_FLAG_MMIC_ERROR)
                        goto mic_fail_no_key;

                if (!(status->flag & RX_FLAG_IV_STRIPPED) && rx->key &&
                    rx->key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)
                        goto update_iv;

                return RX_CONTINUE;
        }

        /*
         * Some hardware seems to generate Michael MIC failure reports even
         * though the frame was not encrypted with TKIP and therefore has no
         * MIC. Ignore such reports to avoid triggering countermeasures.
         */
        if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP ||
            !(status->flag & RX_FLAG_DECRYPTED))
                return RX_CONTINUE;

        if (rx->sdata->vif.type == NL80211_IFTYPE_AP && rx->key->conf.keyidx) {
                /*
                 * APs with pairwise keys should never receive Michael MIC
                 * errors for non-zero keyidx because these are reserved for
                 * group keys and only the AP is sending real multicast
                 * frames in the BSS.
                 */
                return RX_DROP_U_AP_RX_GROUPCAST;
        }

        if (status->flag & RX_FLAG_MMIC_ERROR)
                goto mic_fail;

        hdrlen = ieee80211_hdrlen(hdr->frame_control);
        if (skb->len < hdrlen + MICHAEL_MIC_LEN)
                return RX_DROP_U_SHORT_MMIC;

        if (skb_linearize(rx->skb))
                return RX_DROP_U_OOM;
        /* reload hdr after linearizing the skb */
        hdr = (void *)skb->data;

        /* Recompute the MIC over the payload and compare with the trailer */
        data = skb->data + hdrlen;
        data_len = skb->len - hdrlen - MICHAEL_MIC_LEN;
        key = &rx->key->conf.key[NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY];
        michael_mic(key, hdr, data, data_len, mic);
        if (crypto_memneq(mic, data + data_len, MICHAEL_MIC_LEN))
                goto mic_fail;

        /* remove Michael MIC from payload */
        skb_trim(skb, skb->len - MICHAEL_MIC_LEN);

update_iv:
        /* update IV in key information to be able to detect replays */
        rx->key->u.tkip.rx[rx->security_idx].iv32 = rx->tkip.iv32;
        rx->key->u.tkip.rx[rx->security_idx].iv16 = rx->tkip.iv16;

        return RX_CONTINUE;

mic_fail:
        /* only reached with a TKIP key present, so rx->key is valid here */
        rx->key->u.tkip.mic_failures++;

mic_fail_no_key:
        /*
         * In some cases the key can be unset - e.g. a multicast packet, in
         * a driver that supports HW encryption. Send up the key idx only if
         * the key is set.
         */
        cfg80211_michael_mic_failure(rx->sdata->dev, hdr->addr2,
                                     is_multicast_ether_addr(hdr->addr1) ?
                                     NL80211_KEYTYPE_GROUP :
                                     NL80211_KEYTYPE_PAIRWISE,
                                     rx->key ? rx->key->conf.keyidx : -1,
                                     NULL, GFP_ATOMIC);
        return RX_DROP_U_MMIC_FAIL;
}

/*
 * Insert the TKIP IV into one frame and, when no hardware key is in use,
 * encrypt it in software.
 *
 * Returns 0 when the frame is ready (possibly leaving work to the hardware),
 * -1 when the skb lacks the required head/tail room, or the result of
 * ieee80211_tkip_encrypt_data() for software encryption.
 */
static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
{
        struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
        struct ieee80211_key *key = tx->key;
        unsigned int hdrlen;
        int payload_len, icv_room;
        u8 *iv_pos;
        u64 pn;

        /* hwaccel - with no need for software-generated IV or IV space */
        if (info->control.hw_key &&
            !(info->control.hw_key->flags & (IEEE80211_KEY_FLAG_GENERATE_IV |
                                             IEEE80211_KEY_FLAG_PUT_IV_SPACE)))
                return 0;

        /* Payload length must be taken before the IV is pushed in front */
        hdrlen = ieee80211_hdrlen(hdr->frame_control);
        payload_len = skb->len - hdrlen;

        /* Only software encryption appends the ICV */
        icv_room = info->control.hw_key ? 0 : IEEE80211_TKIP_ICV_LEN;

        if (WARN_ON(skb_headroom(skb) < IEEE80211_TKIP_IV_LEN ||
                    skb_tailroom(skb) < icv_room))
                return -1;

        /* Open a gap for the IV by moving the 802.11 header forward */
        iv_pos = skb_push(skb, IEEE80211_TKIP_IV_LEN);
        memmove(iv_pos, iv_pos + IEEE80211_TKIP_IV_LEN, hdrlen);
        iv_pos += hdrlen;

        if (info->control.hw_key) {
                /* the HW only needs room for the IV, but not the actual IV */
                if (info->control.hw_key->flags &
                    IEEE80211_KEY_FLAG_PUT_IV_SPACE)
                        return 0;
        }

        /* Take the next packet number and write the IV for this frame */
        pn = atomic64_inc_return(&key->conf.tx_pn);
        iv_pos = ieee80211_tkip_add_iv(iv_pos, &key->conf, pn);

        /* hwaccel - with software IV */
        if (info->control.hw_key)
                return 0;

        /* Software path: reserve the ICV and encrypt the payload */
        skb_put(skb, IEEE80211_TKIP_ICV_LEN);

        return ieee80211_tkip_encrypt_data(&tx->local->wep_tx_ctx,
                                           key, skb, iv_pos, payload_len);
}


/*
 * TX handler: mark the transmission as protected and run
 * tkip_encrypt_skb() on every skb queued in tx->skbs.
 *
 * Returns TX_DROP as soon as one skb fails, TX_CONTINUE otherwise.
 */
ieee80211_tx_result
ieee80211_crypto_tkip_encrypt(struct ieee80211_tx_data *tx)
{
        struct sk_buff *frag;
        int err;

        ieee80211_tx_set_protected(tx);

        skb_queue_walk(&tx->skbs, frag) {
                err = tkip_encrypt_skb(tx, frag);
                if (err < 0)
                        return TX_DROP;
        }

        return TX_CONTINUE;
}


/*
 * RX handler for TKIP: verify the IV (and decrypt in software unless the
 * frame carries RX_FLAG_DECRYPTED), then strip the ICV and the IV.
 *
 * Returns RX_CONTINUE for non-data frames and on success,
 * RX_DROP_U_SHORT_TKIP when there is no station or the frame is too short,
 * RX_DROP_U_OOM when the skb cannot be linearized and RX_DROP_U_TKIP_FAIL
 * when ieee80211_tkip_decrypt_data() does not return TKIP_DECRYPT_OK.
 */
ieee80211_rx_result
ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx)
{
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) rx->skb->data;
        int hdrlen, res, hwaccel = 0;
        struct ieee80211_key *key = rx->key;
        struct sk_buff *skb = rx->skb;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);

        hdrlen = ieee80211_hdrlen(hdr->frame_control);

        if (!ieee80211_is_data(hdr->frame_control))
                return RX_CONTINUE;

        /*
         * Compare additively: skb->len is unsigned, so "skb->len - hdrlen"
         * would wrap around to a huge value for a frame shorter than its
         * own header and slip past this length check.
         */
        if (!rx->sta || skb->len < hdrlen + 12)
                return RX_DROP_U_SHORT_TKIP;

        /* it may be possible to optimize this a bit more */
        if (skb_linearize(rx->skb))
                return RX_DROP_U_OOM;
        /* reload hdr after linearizing the skb */
        hdr = (void *)skb->data;

        /*
         * Let TKIP code verify IV, but skip decryption.
         * In the case where hardware checks the IV as well,
         * we don't even get here, see ieee80211_rx_h_decrypt()
         */
        if (status->flag & RX_FLAG_DECRYPTED)
                hwaccel = 1;

        res = ieee80211_tkip_decrypt_data(&rx->local->wep_rx_ctx,
                                          key, skb->data + hdrlen,
                                          skb->len - hdrlen, rx->sta->sta.addr,
                                          hdr->addr1, hwaccel, rx->security_idx,
                                          &rx->tkip.iv32,
                                          &rx->tkip.iv16);
        if (res != TKIP_DECRYPT_OK)
                return RX_DROP_U_TKIP_FAIL;

        /* Trim ICV */
        if (!(status->flag & RX_FLAG_ICV_STRIPPED))
                skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN);

        /* Remove IV */
        memmove(skb->data + IEEE80211_TKIP_IV_LEN, skb->data, hdrlen);
        skb_pull(skb, IEEE80211_TKIP_IV_LEN);

        return RX_CONTINUE;
}

/*
 * Build the AAD for CCMP/GCMP from the 802.11 header of @skb into @aad.
 *
 * Returns the masked first QoS control byte (0 for non-QoS frames), since
 * CCMP needs it again for b_0. With @spp_amsdu the A-MSDU Present bit is
 * kept in addition to the TID bits.
 */
static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad, bool spp_amsdu)
{
        struct ieee80211_hdr *hdr = (void *)skb->data;
        __le16 fc = hdr->frame_control;
        bool has_a4 = ieee80211_has_a4(fc);
        __le16 masked_fc = fc;
        u16 aad_len = 22;
        u8 tid = 0;

        /*
         * Mask FC: zero subtype b4 b5 b6 (if not mgmt)
         * Retry, PwrMgt, MoreData, Order (if Qos Data); set Protected
         */
        masked_fc &= ~cpu_to_le16(IEEE80211_FCTL_RETRY |
                                  IEEE80211_FCTL_PM | IEEE80211_FCTL_MOREDATA);
        if (!ieee80211_is_mgmt(fc))
                masked_fc &= ~cpu_to_le16(0x0070);
        masked_fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);

        if (has_a4)
                aad_len += 6;

        if (ieee80211_is_data_qos(fc)) {
                u8 keep = IEEE80211_QOS_CTL_TID_MASK;

                if (spp_amsdu)
                        keep |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;

                tid = *ieee80211_get_qos_ctl(hdr) & keep;
                masked_fc &= ~cpu_to_le16(IEEE80211_FCTL_ORDER);
                aad_len += 2;
        }

        /* AAD (extra authenticate-only data) / masked 802.11 header
         * FC | A1 | A2 | A3 | SC | [A4] | [QC] */
        put_unaligned_be16(aad_len, &aad[0]);
        put_unaligned(masked_fc, (__le16 *)&aad[2]);
        memcpy(&aad[4], &hdr->addrs, 3 * ETH_ALEN);

        /* Mask Seq#, leave Frag# */
        aad[22] = *((u8 *) &hdr->seq_ctrl) & 0x0f;
        aad[23] = 0;

        if (has_a4) {
                /* A4 first, then the QoS control bytes */
                memcpy(&aad[24], hdr->addr4, ETH_ALEN);
                aad[30] = tid;
                aad[31] = 0;
                return tid;
        }

        /* No A4: QoS control follows SC directly, rest is zero padding */
        memset(&aad[24], 0, ETH_ALEN + IEEE80211_QOS_CTL_LEN);
        aad[24] = tid;

        return tid;
}

/*
 * Fill the CCM block @b_0 and the AAD for the frame in @skb, using the
 * packet number @pn. The AAD is produced by ccmp_gcmp_aad().
 */
static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *b_0, u8 *aad,
                                bool spp_amsdu)
{
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
        u8 nonce_flags;

        /* Nonce: Nonce Flags | A2 | PN
         * Nonce Flags: Priority (b0..b3) | Management (b4) | Reserved (b5..b7)
         *
         * NOTE(review): with spp_amsdu set, ccmp_gcmp_aad() may also return
         * the A-MSDU Present bit, which would then land in the bits listed
         * as reserved above - confirm this is intended.
         */
        nonce_flags = ccmp_gcmp_aad(skb, aad, spp_amsdu);
        nonce_flags |= ieee80211_is_mgmt(hdr->frame_control) << 4;

        /* In CCM, the initial vectors (IV) used for CTR mode encryption and CBC
         * mode authentication are not allowed to collide, yet both are derived
         * from this vector b_0. We only set L := 1 here to indicate that the
         * data size can be represented in (L+1) bytes. The CCM layer will take
         * care of storing the data length in the top (L+1) bytes and setting
         * and clearing the other bits as is required to derive the two IVs.
         */
        b_0[0] = 0x1;
        b_0[1] = nonce_flags;
        memcpy(&b_0[2], hdr->addr2, ETH_ALEN);
        memcpy(&b_0[8], pn, IEEE80211_CCMP_PN_LEN);
}

/*
 * Write the 8-byte CCMP header for packet number @pn and key index @key_id
 * to @hdr. pn[0] is the most significant PN byte; the header stores the PN
 * least significant byte first, with byte 2 zero and byte 3 holding 0x20
 * combined with the key index in its top two bits.
 *
 * @pn is only read, hence const (matching gcmp_pn2hdr()).
 */
static inline void ccmp_pn2hdr(u8 *hdr, const u8 *pn, int key_id)
{
        hdr[0] = pn[5];
        hdr[1] = pn[4];
        hdr[2] = 0;
        hdr[3] = 0x20 | (key_id << 6);
        hdr[4] = pn[3];
        hdr[5] = pn[2];
        hdr[6] = pn[1];
        hdr[7] = pn[0];
}


/*
 * Extract the 6-byte packet number from the CCMP header at @hdr into @pn,
 * most significant byte first. Header bytes 2 and 3 are not part of the PN.
 *
 * @hdr is only read, hence const (matching gcmp_hdr2pn()).
 */
static inline void ccmp_hdr2pn(u8 *pn, const u8 *hdr)
{
        pn[0] = hdr[7];
        pn[1] = hdr[6];
        pn[2] = hdr[5];
        pn[3] = hdr[4];
        pn[4] = hdr[1];
        pn[5] = hdr[0];
}


/*
 * Add the CCMP header to one frame and, when no hardware key is in use,
 * encrypt it in software.
 *
 * @mic_len is the MIC size appended for software encryption.
 *
 * Returns 0 when the frame is ready (possibly leaving work to the hardware),
 * -1 when the skb lacks the required head/tail room, or the result of
 * ieee80211_aes_ccm_encrypt() for software encryption.
 *
 * NOTE(review): unlike gcmp_encrypt_skb(), this function does not adjust
 * the network header offset after pushing the crypto header - confirm
 * whether that difference is intentional.
 */
static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb,
                            unsigned int mic_len)
{
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
        struct ieee80211_key *key = tx->key;
        struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
        int hdrlen, len, tail;
        u8 *pos;
        u8 pn[6];
        u64 pn64;
        u8 aad[CCM_AAD_LEN];
        u8 b_0[AES_BLOCK_SIZE];

        if (info->control.hw_key &&
            !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) &&
            !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) &&
            !((info->control.hw_key->flags &
               IEEE80211_KEY_FLAG_GENERATE_IV_MGMT) &&
              ieee80211_is_mgmt(hdr->frame_control))) {
                /*
                 * hwaccel has no need for preallocated room for CCMP
                 * header or MIC fields
                 */
                return 0;
        }

        /* payload length, taken before the CCMP header is pushed */
        hdrlen = ieee80211_hdrlen(hdr->frame_control);
        len = skb->len - hdrlen;

        /* only software encryption appends the MIC */
        if (info->control.hw_key)
                tail = 0;
        else
                tail = mic_len;

        if (WARN_ON(skb_tailroom(skb) < tail ||
                    skb_headroom(skb) < IEEE80211_CCMP_HDR_LEN))
                return -1;

        /* open a gap for the CCMP header by moving the 802.11 header forward */
        pos = skb_push(skb, IEEE80211_CCMP_HDR_LEN);
        memmove(pos, pos + IEEE80211_CCMP_HDR_LEN, hdrlen);

        /* the HW only needs room for the IV, but not the actual IV */
        if (info->control.hw_key &&
            (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))
                return 0;

        pos += hdrlen;

        pn64 = atomic64_inc_return(&key->conf.tx_pn);

        /* split the low 48 bits of the PN, most significant byte in pn[0] */
        pn[5] = pn64;
        pn[4] = pn64 >> 8;
        pn[3] = pn64 >> 16;
        pn[2] = pn64 >> 24;
        pn[1] = pn64 >> 32;
        pn[0] = pn64 >> 40;

        ccmp_pn2hdr(pos, pn, key->conf.keyidx);

        /* hwaccel - with software CCMP header */
        if (info->control.hw_key)
                return 0;

        /* software path: encrypt the payload and append the MIC */
        pos += IEEE80211_CCMP_HDR_LEN;
        ccmp_special_blocks(skb, pn, b_0, aad,
                            key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU);
        return ieee80211_aes_ccm_encrypt(key->u.ccmp.tfm, b_0, aad, pos, len,
                                         skb_put(skb, mic_len));
}


/*
 * TX handler: mark the transmission as protected and run
 * ccmp_encrypt_skb() with @mic_len on every skb queued in tx->skbs.
 *
 * Returns TX_DROP as soon as one skb fails, TX_CONTINUE otherwise.
 */
ieee80211_tx_result
ieee80211_crypto_ccmp_encrypt(struct ieee80211_tx_data *tx,
                              unsigned int mic_len)
{
        struct sk_buff *frag;
        int err;

        ieee80211_tx_set_protected(tx);

        skb_queue_walk(&tx->skbs, frag) {
                err = ccmp_encrypt_skb(tx, frag, mic_len);
                if (err < 0)
                        return TX_DROP;
        }

        return TX_CONTINUE;
}


/*
 * RX handler for CCMP: replay-check the packet number, decrypt and verify
 * the MIC in software unless the frame carries RX_FLAG_DECRYPTED, then
 * strip the CCMP header and the MIC.
 *
 * @mic_len is the MIC length; it is treated as 0 when the frame was
 * decrypted already and RX_FLAG_MIC_STRIPPED is set.
 *
 * Returns RX_CONTINUE on success and for frames that are neither data nor
 * robust management frames, an RX_DROP_U_* code otherwise.
 */
ieee80211_rx_result
ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx,
                              unsigned int mic_len)
{
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
        int hdrlen;
        struct ieee80211_key *key = rx->key;
        struct sk_buff *skb = rx->skb;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        u8 pn[IEEE80211_CCMP_PN_LEN];
        int data_len;
        int queue;

        hdrlen = ieee80211_hdrlen(hdr->frame_control);

        if (!ieee80211_is_data(hdr->frame_control) &&
            !ieee80211_is_robust_mgmt_frame(skb))
                return RX_CONTINUE;

        if (status->flag & RX_FLAG_DECRYPTED) {
                /* only the 802.11 and CCMP headers need to be linear */
                if (!pskb_may_pull(rx->skb, hdrlen + IEEE80211_CCMP_HDR_LEN))
                        return RX_DROP_U_SHORT_CCMP;
                if (status->flag & RX_FLAG_MIC_STRIPPED)
                        mic_len = 0;
        } else {
                /* software decryption works on the whole frame */
                if (skb_linearize(rx->skb))
                        return RX_DROP_U_OOM;
        }

        /* reload hdr - skb might have been reallocated */
        hdr = (void *)rx->skb->data;

        data_len = skb->len - hdrlen - IEEE80211_CCMP_HDR_LEN - mic_len;
        if (!rx->sta || data_len < 0)
                return RX_DROP_U_SHORT_CCMP;

        if (!(status->flag & RX_FLAG_PN_VALIDATED)) {
                int res;

                ccmp_hdr2pn(pn, skb->data + hdrlen);

                queue = rx->security_idx;

                /*
                 * Replay check: the PN must be larger than the last one
                 * accepted on this queue, or equal to it when
                 * RX_FLAG_ALLOW_SAME_PN is set.
                 */
                res = memcmp(pn, key->u.ccmp.rx_pn[queue],
                             IEEE80211_CCMP_PN_LEN);
                if (res < 0 ||
                    (!res && !(status->flag & RX_FLAG_ALLOW_SAME_PN))) {
                        key->u.ccmp.replays++;
                        return RX_DROP_U_REPLAY;
                }

                if (!(status->flag & RX_FLAG_DECRYPTED)) {
                        u8 aad[2 * AES_BLOCK_SIZE];
                        u8 b_0[AES_BLOCK_SIZE];
                        /* hardware didn't decrypt/verify MIC */
                        ccmp_special_blocks(skb, pn, b_0, aad,
                                            key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU);

                        if (ieee80211_aes_ccm_decrypt(
                                    key->u.ccmp.tfm, b_0, aad,
                                    skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN,
                                    data_len,
                                    skb->data + skb->len - mic_len))
                                return RX_DROP_U_MIC_FAIL;
                }

                /* remember the accepted PN only after the frame passed */
                memcpy(key->u.ccmp.rx_pn[queue], pn, IEEE80211_CCMP_PN_LEN);
                if (unlikely(ieee80211_is_frag(hdr)))
                        memcpy(rx->ccm_gcm.pn, pn, IEEE80211_CCMP_PN_LEN);
        }

        /* Remove CCMP header and MIC */
        if (pskb_trim(skb, skb->len - mic_len))
                return RX_DROP_U_SHORT_CCMP_MIC;
        memmove(skb->data + IEEE80211_CCMP_HDR_LEN, skb->data, hdrlen);
        skb_pull(skb, IEEE80211_CCMP_HDR_LEN);

        return RX_CONTINUE;
}

/*
 * Fill the GCM block @j_0 and the AAD for the frame in @skb, using the
 * packet number @pn.
 *
 * j_0 layout: A2 | PN | 32-bit counter set to 1. All four counter bytes are
 * written so that no byte of the AES_BLOCK_SIZE buffer is left
 * uninitialized. The AAD is produced by ccmp_gcmp_aad().
 */
static void gcmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *j_0, u8 *aad,
                                bool spp_amsdu)
{
        struct ieee80211_hdr *hdr = (void *)skb->data;

        memcpy(j_0, hdr->addr2, ETH_ALEN);
        memcpy(&j_0[ETH_ALEN], pn, IEEE80211_GCMP_PN_LEN);
        /* counter bytes 12..15: 0x00000001 */
        j_0[12] = 0;
        j_0[13] = 0;
        j_0[14] = 0;
        j_0[AES_BLOCK_SIZE - 1] = 0x01;

        ccmp_gcmp_aad(skb, aad, spp_amsdu);
}

/*
 * Write the 8-byte GCMP header for packet number @pn and key index @key_id
 * to @hdr. pn[0] is the most significant PN byte; the header stores the PN
 * least significant byte first, with byte 2 zero and byte 3 holding 0x20
 * combined with the key index in its top two bits.
 */
static inline void gcmp_pn2hdr(u8 *hdr, const u8 *pn, int key_id)
{
        int i;

        /* two low PN bytes, then the reserved and key id bytes */
        hdr[0] = pn[5];
        hdr[1] = pn[4];
        hdr[2] = 0;
        hdr[3] = 0x20 | (key_id << 6);

        /* remaining four PN bytes, in reverse order */
        for (i = 0; i < 4; i++)
                hdr[4 + i] = pn[3 - i];
}

/*
 * Extract the 6-byte packet number from the GCMP header at @hdr into @pn,
 * most significant byte first. Header bytes 2 and 3 are not part of the PN.
 */
static inline void gcmp_hdr2pn(u8 *pn, const u8 *hdr)
{
        int i;

        /* four high PN bytes live in header bytes 7..4 */
        for (i = 0; i < 4; i++)
                pn[i] = hdr[7 - i];

        /* two low PN bytes live in header bytes 1 and 0 */
        pn[4] = hdr[1];
        pn[5] = hdr[0];
}

/*
 * Add the GCMP header to one frame and, when no hardware key is in use,
 * encrypt it in software.
 *
 * Returns 0 when the frame is ready (possibly leaving work to the hardware),
 * -1 when the skb lacks the required head/tail room, or the result of
 * ieee80211_aes_gcm_encrypt() for software encryption.
 */
static int gcmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
{
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
        struct ieee80211_key *key = tx->key;
        struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
        int hdrlen, len, tail;
        u8 *pos;
        u8 pn[6];
        u64 pn64;
        u8 aad[GCM_AAD_LEN];
        u8 j_0[AES_BLOCK_SIZE];

        if (info->control.hw_key &&
            !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) &&
            !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) &&
            !((info->control.hw_key->flags &
               IEEE80211_KEY_FLAG_GENERATE_IV_MGMT) &&
              ieee80211_is_mgmt(hdr->frame_control))) {
                /* hwaccel has no need for preallocated room for GCMP
                 * header or MIC fields
                 */
                return 0;
        }

        /* payload length, taken before the GCMP header is pushed */
        hdrlen = ieee80211_hdrlen(hdr->frame_control);
        len = skb->len - hdrlen;

        /* only software encryption appends the MIC */
        if (info->control.hw_key)
                tail = 0;
        else
                tail = IEEE80211_GCMP_MIC_LEN;

        if (WARN_ON(skb_tailroom(skb) < tail ||
                    skb_headroom(skb) < IEEE80211_GCMP_HDR_LEN))
                return -1;

        /* open a gap for the GCMP header by moving the 802.11 header forward */
        pos = skb_push(skb, IEEE80211_GCMP_HDR_LEN);
        memmove(pos, pos + IEEE80211_GCMP_HDR_LEN, hdrlen);
        /* the payload moved relative to skb->data, so shift the offset too */
        skb_set_network_header(skb, skb_network_offset(skb) +
                                    IEEE80211_GCMP_HDR_LEN);

        /* the HW only needs room for the IV, but not the actual IV */
        if (info->control.hw_key &&
            (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE))
                return 0;

        pos += hdrlen;

        pn64 = atomic64_inc_return(&key->conf.tx_pn);

        /* split the low 48 bits of the PN, most significant byte in pn[0] */
        pn[5] = pn64;
        pn[4] = pn64 >> 8;
        pn[3] = pn64 >> 16;
        pn[2] = pn64 >> 24;
        pn[1] = pn64 >> 32;
        pn[0] = pn64 >> 40;

        gcmp_pn2hdr(pos, pn, key->conf.keyidx);

        /* hwaccel - with software GCMP header */
        if (info->control.hw_key)
                return 0;

        /* software path: encrypt the payload and append the MIC */
        pos += IEEE80211_GCMP_HDR_LEN;
        gcmp_special_blocks(skb, pn, j_0, aad,
                            key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU);
        return ieee80211_aes_gcm_encrypt(key->u.gcmp.tfm, j_0, aad, pos, len,
                                         skb_put(skb, IEEE80211_GCMP_MIC_LEN));
}

/*
 * TX handler: mark the transmission as protected and run
 * gcmp_encrypt_skb() on every skb queued in tx->skbs.
 *
 * Returns TX_DROP as soon as one skb fails, TX_CONTINUE otherwise.
 */
ieee80211_tx_result
ieee80211_crypto_gcmp_encrypt(struct ieee80211_tx_data *tx)
{
        struct sk_buff *frag;
        int err;

        ieee80211_tx_set_protected(tx);

        skb_queue_walk(&tx->skbs, frag) {
                err = gcmp_encrypt_skb(tx, frag);
                if (err < 0)
                        return TX_DROP;
        }

        return TX_CONTINUE;
}

/*
 * RX handler for GCMP: replay-check the packet number, decrypt and verify
 * the MIC in software unless the frame carries RX_FLAG_DECRYPTED, then
 * strip the GCMP header and the MIC.
 *
 * The MIC length is IEEE80211_GCMP_MIC_LEN, or 0 when the frame was
 * decrypted already and RX_FLAG_MIC_STRIPPED is set.
 *
 * Returns RX_CONTINUE on success and for frames that are neither data nor
 * robust management frames, an RX_DROP_U_* code otherwise.
 */
ieee80211_rx_result
ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx)
{
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
        int hdrlen;
        struct ieee80211_key *key = rx->key;
        struct sk_buff *skb = rx->skb;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        u8 pn[IEEE80211_GCMP_PN_LEN];
        int data_len, queue, mic_len = IEEE80211_GCMP_MIC_LEN;

        hdrlen = ieee80211_hdrlen(hdr->frame_control);

        if (!ieee80211_is_data(hdr->frame_control) &&
            !ieee80211_is_robust_mgmt_frame(skb))
                return RX_CONTINUE;

        if (status->flag & RX_FLAG_DECRYPTED) {
                /* only the 802.11 and GCMP headers need to be linear */
                if (!pskb_may_pull(rx->skb, hdrlen + IEEE80211_GCMP_HDR_LEN))
                        return RX_DROP_U_SHORT_GCMP;
                if (status->flag & RX_FLAG_MIC_STRIPPED)
                        mic_len = 0;
        } else {
                /* software decryption works on the whole frame */
                if (skb_linearize(rx->skb))
                        return RX_DROP_U_OOM;
        }

        /* reload hdr - skb might have been reallocated */
        hdr = (void *)rx->skb->data;

        data_len = skb->len - hdrlen - IEEE80211_GCMP_HDR_LEN - mic_len;
        if (!rx->sta || data_len < 0)
                return RX_DROP_U_SHORT_GCMP;

        if (!(status->flag & RX_FLAG_PN_VALIDATED)) {
                int res;

                gcmp_hdr2pn(pn, skb->data + hdrlen);

                queue = rx->security_idx;

                /*
                 * Replay check: the PN must be larger than the last one
                 * accepted on this queue, or equal to it when
                 * RX_FLAG_ALLOW_SAME_PN is set.
                 */
                res = memcmp(pn, key->u.gcmp.rx_pn[queue],
                             IEEE80211_GCMP_PN_LEN);
                if (res < 0 ||
                    (!res && !(status->flag & RX_FLAG_ALLOW_SAME_PN))) {
                        key->u.gcmp.replays++;
                        return RX_DROP_U_REPLAY;
                }

                if (!(status->flag & RX_FLAG_DECRYPTED)) {
                        u8 aad[2 * AES_BLOCK_SIZE];
                        u8 j_0[AES_BLOCK_SIZE];
                        /* hardware didn't decrypt/verify MIC */
                        gcmp_special_blocks(skb, pn, j_0, aad,
                                            key->conf.flags & IEEE80211_KEY_FLAG_SPP_AMSDU);

                        /* mic_len is still IEEE80211_GCMP_MIC_LEN on this path */
                        if (ieee80211_aes_gcm_decrypt(
                                    key->u.gcmp.tfm, j_0, aad,
                                    skb->data + hdrlen + IEEE80211_GCMP_HDR_LEN,
                                    data_len,
                                    skb->data + skb->len -
                                    IEEE80211_GCMP_MIC_LEN))
                                return RX_DROP_U_MIC_FAIL;
                }

                /* remember the accepted PN only after the frame passed */
                memcpy(key->u.gcmp.rx_pn[queue], pn, IEEE80211_GCMP_PN_LEN);
                /*
                 * NOTE(review): this copy is sized with the CCMP constant
                 * while pn[] is sized with IEEE80211_GCMP_PN_LEN; presumably
                 * both are equal - confirm.
                 */
                if (unlikely(ieee80211_is_frag(hdr)))
                        memcpy(rx->ccm_gcm.pn, pn, IEEE80211_CCMP_PN_LEN);
        }

        /* Remove GCMP header and MIC */
        if (pskb_trim(skb, skb->len - mic_len))
                return RX_DROP_U_SHORT_GCMP_MIC;
        memmove(skb->data + IEEE80211_GCMP_HDR_LEN, skb->data, hdrlen);
        skb_pull(skb, IEEE80211_GCMP_HDR_LEN);

        return RX_CONTINUE;
}

/*
 * Build the BIP AAD for the frame in @skb into @aad:
 *
 *      FC (masked, 2 bytes) || A1 || A2 || A3 (3 * ETH_ALEN bytes)
 *
 * The Retry, PwrMgt and MoreData bits of the Frame Control field are
 * cleared before it is copied.  @aad may be unaligned.
 */
static void bip_aad(struct sk_buff *skb, u8 *aad)
{
        const struct ieee80211_hdr *hdr = (const struct ieee80211_hdr *)skb->data;
        __le16 fc;

        /* Frame Control with the Retry/PwrMgt/MoreData flags forced to zero */
        fc = hdr->frame_control &
             ~cpu_to_le16(IEEE80211_FCTL_RETRY | IEEE80211_FCTL_PM |
                          IEEE80211_FCTL_MOREDATA);
        put_unaligned(fc, (__le16 *)aad);

        /* the three addresses follow directly after the 2-byte FC */
        memcpy(&aad[2], &hdr->addrs, 3 * ETH_ALEN);
}


/*
 * Write the low 48 bits of @pn into the 6-byte buffer @d, least
 * significant byte first.  Bits 48 and above of @pn are ignored.
 */
static inline void bip_ipn_set64(u8 *d, u64 pn)
{
        int i;

        for (i = 0; i < 6; i++)
                d[i] = pn >> (8 * i);
}

/*
 * Copy the 6 bytes at @s to @d in reverse byte order, i.e. d[0] = s[5]
 * through d[5] = s[0].
 */
static inline void bip_ipn_swap(u8 *d, const u8 *s)
{
        int i;

        for (i = 0; i < 6; i++)
                d[i] = s[5 - i];
}


/*
 * TX handler for BIP with AES-128-CMAC: append an MMIE to the single frame
 * queued in @tx, fill in the key index and the incremented packet number
 * and, when no hardware key is attached to the frame, compute the MIC in
 * software.
 *
 * Returns TX_DROP when @tx does not hold exactly one skb or the skb has no
 * tailroom for the MMIE, TX_CONTINUE otherwise.
 */
ieee80211_tx_result
ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx)
{
        struct ieee80211_key *key = tx->key;
        struct ieee80211_tx_info *info;
        struct ieee80211_mmie *mmie;
        struct sk_buff *skb;
        u8 aad[20];

        if (WARN_ON(skb_queue_len(&tx->skbs) != 1))
                return TX_DROP;

        skb = skb_peek(&tx->skbs);
        info = IEEE80211_SKB_CB(skb);

        /*
         * With a hardware key the MMIE is only built here when
         * IEEE80211_KEY_FLAG_GENERATE_MMIE is set on the key.
         */
        if (info->control.hw_key) {
                if (!(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE))
                        return TX_CONTINUE;
        }

        if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie)))
                return TX_DROP;

        mmie = skb_put(skb, sizeof(*mmie));
        mmie->element_id = WLAN_EID_MMIE;
        mmie->length = sizeof(*mmie) - 2;
        mmie->key_id = cpu_to_le16(key->conf.keyidx);

        /* PN = PN + 1, stored least significant byte first */
        bip_ipn_set64(mmie->sequence_number,
                      atomic64_inc_return(&key->conf.tx_pn));

        if (!info->control.hw_key) {
                bip_aad(skb, aad);

                /*
                 * MIC = AES-128-CMAC(IGTK, AAD || Management Frame Body || MMIE, 64)
                 */
                ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad,
                                   skb->data + 24, skb->len - 24, mmie->mic);
        }

        return TX_CONTINUE;
}

/*
 * TX handler for BIP with AES-256-CMAC: append a 16-byte-MIC MMIE
 * (struct ieee80211_mmie_16) to the single frame queued in @tx, fill in the
 * key index and the incremented packet number and, when no hardware key is
 * attached to the frame, compute the MIC in software.
 *
 * Returns TX_DROP when @tx does not hold exactly one skb or the skb has no
 * tailroom for the MMIE, TX_CONTINUE otherwise.
 */
ieee80211_tx_result
ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx)
{
        struct ieee80211_key *key = tx->key;
        struct ieee80211_tx_info *info;
        struct ieee80211_mmie_16 *mmie;
        struct sk_buff *skb;
        u8 aad[20];

        if (WARN_ON(skb_queue_len(&tx->skbs) != 1))
                return TX_DROP;

        skb = skb_peek(&tx->skbs);
        info = IEEE80211_SKB_CB(skb);

        /*
         * With a hardware key the MMIE is only built here when
         * IEEE80211_KEY_FLAG_GENERATE_MMIE is set on the key.
         */
        if (info->control.hw_key) {
                if (!(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE))
                        return TX_CONTINUE;
        }

        if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie)))
                return TX_DROP;

        mmie = skb_put(skb, sizeof(*mmie));
        mmie->element_id = WLAN_EID_MMIE;
        mmie->length = sizeof(*mmie) - 2;
        mmie->key_id = cpu_to_le16(key->conf.keyidx);

        /* PN = PN + 1, stored least significant byte first */
        bip_ipn_set64(mmie->sequence_number,
                      atomic64_inc_return(&key->conf.tx_pn));

        if (!info->control.hw_key) {
                bip_aad(skb, aad);

                /*
                 * MIC = AES-256-CMAC(IGTK, AAD || Management Frame Body || MMIE, 128)
                 */
                ieee80211_aes_cmac_256(key->u.aes_cmac.tfm, aad,
                                       skb->data + 24, skb->len - 24,
                                       mmie->mic);
        }

        return TX_CONTINUE;
}

/*
 * RX handler for BIP with AES-128-CMAC.  Non-management frames are passed
 * through untouched.  For management frames the trailing MMIE is located,
 * its IPN is checked against the last accepted one, the MIC is verified in
 * software unless RX_FLAG_DECRYPTED is set, and finally the stored IPN is
 * updated and the MMIE is trimmed off the skb.
 *
 * Returns RX_CONTINUE on success or one of the RX_DROP_U_* codes when the
 * frame is too short, the MMIE is malformed, the IPN is not newer than the
 * stored one, or the MIC does not match.
 */
ieee80211_rx_result
ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx)
{
        struct sk_buff *skb = rx->skb;
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        struct ieee80211_key *key = rx->key;
        struct ieee80211_mmie *mmie;
        u8 rx_ipn[6];

        if (!ieee80211_is_mgmt(hdr->frame_control))
                return RX_CONTINUE;

        /* management frames are already linear */

        if (skb->len < 24 + sizeof(*mmie))
                return RX_DROP_U_SHORT_CMAC;

        /* the MMIE occupies the last sizeof(*mmie) bytes of the frame */
        mmie = (struct ieee80211_mmie *)(skb->data + skb->len - sizeof(*mmie));
        if (mmie->element_id != WLAN_EID_MMIE)
                return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */
        if (mmie->length != sizeof(*mmie) - 2)
                return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */

        /* byte-reverse the IPN so that memcmp() orders it numerically */
        bip_ipn_swap(rx_ipn, mmie->sequence_number);

        if (memcmp(rx_ipn, key->u.aes_cmac.rx_pn, 6) <= 0) {
                key->u.aes_cmac.replays++;
                return RX_DROP_U_REPLAY;
        }

        if (!(status->flag & RX_FLAG_DECRYPTED)) {
                u8 aad[20], calc_mic[8];

                /* hardware didn't decrypt/verify MIC */
                bip_aad(skb, aad);
                ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad,
                                   skb->data + 24, skb->len - 24, calc_mic);
                if (crypto_memneq(calc_mic, mmie->mic, sizeof(mmie->mic))) {
                        key->u.aes_cmac.icverrors++;
                        return RX_DROP_U_MIC_FAIL;
                }
        }

        /* frame accepted: remember its IPN for the next replay check */
        memcpy(key->u.aes_cmac.rx_pn, rx_ipn, 6);

        /* Remove MMIE */
        skb_trim(skb, skb->len - sizeof(*mmie));

        return RX_CONTINUE;
}

/*
 * RX handler for BIP with AES-256-CMAC (16-byte MIC MMIE).  Non-management
 * frames are passed through untouched.  For management frames the trailing
 * MMIE is located, its IPN is checked against the last accepted one, the
 * MIC is verified in software unless RX_FLAG_DECRYPTED is set, and finally
 * the stored IPN is updated and the MMIE is trimmed off the skb.
 *
 * Returns RX_CONTINUE on success or one of the RX_DROP_U_* codes when the
 * frame is too short, the MMIE is malformed, the IPN is not newer than the
 * stored one, or the MIC does not match.
 */
ieee80211_rx_result
ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx)
{
        struct sk_buff *skb = rx->skb;
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        struct ieee80211_key *key = rx->key;
        struct ieee80211_mmie_16 *mmie;
        u8 rx_ipn[6];

        if (!ieee80211_is_mgmt(hdr->frame_control))
                return RX_CONTINUE;

        /* management frames are already linear */

        if (skb->len < 24 + sizeof(*mmie))
                return RX_DROP_U_SHORT_CMAC256;

        /* the MMIE occupies the last sizeof(*mmie) bytes of the frame */
        mmie = (struct ieee80211_mmie_16 *)(skb->data + skb->len - sizeof(*mmie));
        if (mmie->element_id != WLAN_EID_MMIE)
                return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */
        if (mmie->length != sizeof(*mmie) - 2)
                return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */

        /* byte-reverse the IPN so that memcmp() orders it numerically */
        bip_ipn_swap(rx_ipn, mmie->sequence_number);

        if (memcmp(rx_ipn, key->u.aes_cmac.rx_pn, 6) <= 0) {
                key->u.aes_cmac.replays++;
                return RX_DROP_U_REPLAY;
        }

        if (!(status->flag & RX_FLAG_DECRYPTED)) {
                u8 aad[20], calc_mic[16];

                /* hardware didn't decrypt/verify MIC */
                bip_aad(skb, aad);
                ieee80211_aes_cmac_256(key->u.aes_cmac.tfm, aad,
                                       skb->data + 24, skb->len - 24,
                                       calc_mic);
                if (crypto_memneq(calc_mic, mmie->mic, sizeof(mmie->mic))) {
                        key->u.aes_cmac.icverrors++;
                        return RX_DROP_U_MIC_FAIL;
                }
        }

        /* frame accepted: remember its IPN for the next replay check */
        memcpy(key->u.aes_cmac.rx_pn, rx_ipn, 6);

        /* Remove MMIE */
        skb_trim(skb, skb->len - sizeof(*mmie));

        return RX_CONTINUE;
}

/*
 * TX handler for BIP with AES-GMAC: append a 16-byte-MIC MMIE to the single
 * frame queued in @tx, fill in the key index and the incremented packet
 * number and, when no hardware key is attached to the frame, compute the
 * GMAC MIC in software.
 *
 * Returns TX_DROP when @tx does not hold exactly one skb, the skb has no
 * tailroom for the MMIE, or ieee80211_aes_gmac() reports an error;
 * TX_CONTINUE otherwise.
 */
ieee80211_tx_result
ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx)
{
        struct ieee80211_key *key = tx->key;
        struct ieee80211_tx_info *info;
        struct ieee80211_mmie_16 *mmie;
        struct ieee80211_hdr *hdr;
        struct sk_buff *skb;
        u8 nonce[GMAC_NONCE_LEN];
        u8 aad[GMAC_AAD_LEN];

        if (WARN_ON(skb_queue_len(&tx->skbs) != 1))
                return TX_DROP;

        skb = skb_peek(&tx->skbs);
        info = IEEE80211_SKB_CB(skb);

        /*
         * With a hardware key the MMIE is only built here when
         * IEEE80211_KEY_FLAG_GENERATE_MMIE is set on the key.
         */
        if (info->control.hw_key) {
                if (!(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE))
                        return TX_CONTINUE;
        }

        if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie)))
                return TX_DROP;

        mmie = skb_put(skb, sizeof(*mmie));
        mmie->element_id = WLAN_EID_MMIE;
        mmie->length = sizeof(*mmie) - 2;
        mmie->key_id = cpu_to_le16(key->conf.keyidx);

        /* PN = PN + 1, stored least significant byte first */
        bip_ipn_set64(mmie->sequence_number,
                      atomic64_inc_return(&key->conf.tx_pn));

        if (info->control.hw_key)
                return TX_CONTINUE;

        /* nonce = addr2 || IPN in reversed byte order */
        hdr = (struct ieee80211_hdr *)skb->data;
        memcpy(nonce, hdr->addr2, ETH_ALEN);
        bip_ipn_swap(nonce + ETH_ALEN, mmie->sequence_number);

        bip_aad(skb, aad);

        /* MIC = AES-GMAC(IGTK, AAD || Management Frame Body || MMIE, 128) */
        return ieee80211_aes_gmac(key->u.aes_gmac.tfm, aad, nonce,
                                  skb->data + 24, skb->len - 24,
                                  mmie->mic) < 0 ? TX_DROP : TX_CONTINUE;
}

/*
 * RX handler for BIP with AES-GMAC.  Non-management frames are passed
 * through untouched.  For management frames the trailing MMIE is located,
 * its IPN is checked against the last accepted one, the MIC is verified in
 * software unless RX_FLAG_DECRYPTED is set, and finally the stored IPN is
 * updated and the MMIE is trimmed off the skb.
 *
 * Returns RX_CONTINUE on success, RX_DROP_U_OOM when the temporary MIC
 * buffer cannot be allocated, or another RX_DROP_U_* code when the frame is
 * too short, the MMIE is malformed, the IPN is not newer than the stored
 * one, or the MIC check fails.
 */
ieee80211_rx_result
ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx)
{
        struct sk_buff *skb = rx->skb;
        struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
        struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
        struct ieee80211_key *key = rx->key;
        struct ieee80211_mmie_16 *mmie;
        u8 rx_ipn[6];

        if (!ieee80211_is_mgmt(hdr->frame_control))
                return RX_CONTINUE;

        /* management frames are already linear */

        if (skb->len < 24 + sizeof(*mmie))
                return RX_DROP_U_SHORT_GMAC;

        /* the MMIE occupies the last sizeof(*mmie) bytes of the frame */
        mmie = (struct ieee80211_mmie_16 *)(skb->data + skb->len - sizeof(*mmie));
        if (mmie->element_id != WLAN_EID_MMIE)
                return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */
        if (mmie->length != sizeof(*mmie) - 2)
                return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */

        /* byte-reverse the IPN so that memcmp() orders it numerically */
        bip_ipn_swap(rx_ipn, mmie->sequence_number);

        if (memcmp(rx_ipn, key->u.aes_gmac.rx_pn, 6) <= 0) {
                key->u.aes_gmac.replays++;
                return RX_DROP_U_REPLAY;
        }

        if (!(status->flag & RX_FLAG_DECRYPTED)) {
                u8 aad[GMAC_AAD_LEN], nonce[GMAC_NONCE_LEN];
                u8 *calc_mic;
                int mic_bad;

                /* hardware didn't decrypt/verify MIC */
                bip_aad(skb, aad);

                /* nonce = addr2 || IPN (already byte-reversed above) */
                memcpy(nonce, hdr->addr2, ETH_ALEN);
                memcpy(nonce + ETH_ALEN, rx_ipn, 6);

                /*
                 * NOTE(review): the computed MIC lives on the heap rather
                 * than the stack; presumably ieee80211_aes_gmac() needs
                 * that for its output buffer -- confirm before changing.
                 */
                calc_mic = kmalloc(GMAC_MIC_LEN, GFP_ATOMIC);
                if (!calc_mic)
                        return RX_DROP_U_OOM;

                mic_bad = ieee80211_aes_gmac(key->u.aes_gmac.tfm, aad, nonce,
                                             skb->data + 24, skb->len - 24,
                                             calc_mic) < 0 ||
                          crypto_memneq(calc_mic, mmie->mic, sizeof(mmie->mic));
                kfree(calc_mic);

                if (mic_bad) {
                        key->u.aes_gmac.icverrors++;
                        return RX_DROP_U_MIC_FAIL;
                }
        }

        /* frame accepted: remember its IPN for the next replay check */
        memcpy(key->u.aes_gmac.rx_pn, rx_ipn, 6);

        /* Remove MMIE */
        skb_trim(skb, skb->len - sizeof(*mmie));

        return RX_CONTINUE;
}































































































































































































































































































































































    1 



    5 
    7 




    1 






    1 




















































    6 












    8 

















































































































































































































































































































































































    1 




    1 

















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CGROUP_H
#define _LINUX_CGROUP_H
/*
 *  cgroup interface
 *
 *  Copyright (C) 2003 BULL SA
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 */

#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/rculist.h>
#include <linux/cgroupstats.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/kernfs.h>
#include <linux/jump_label.h>
#include <linux/types.h>
#include <linux/ns_common.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>
#include <linux/kernel_stat.h>

#include <linux/cgroup-defs.h>

struct kernel_clone_args;

#ifdef CONFIG_CGROUPS

/*
 * All weight knobs on the default hierarchy should use the following min,
 * default and max values.  The default value is the logarithmic center of
 * MIN and MAX and allows 100x to be expressed in both directions.
 */
#define CGROUP_WEIGHT_MIN                1
#define CGROUP_WEIGHT_DFL                100
#define CGROUP_WEIGHT_MAX                10000

/*
 * CSS_TASK_ITER_* flag bits.  Presumably these are the values accepted as
 * @flags by css_task_iter_start() and kept in css_task_iter->flags --
 * confirm against the iterator implementation.
 */
enum {
        CSS_TASK_ITER_PROCS    = (1U << 0),  /* walk only threadgroup leaders */
        CSS_TASK_ITER_THREADED = (1U << 1),  /* walk all threaded css_sets in the domain */
        CSS_TASK_ITER_SKIPPED  = (1U << 16), /* internal flags */
};

/*
 * a css_task_iter should be treated as an opaque object
 *
 * A pointer to one is what css_task_iter_start(), css_task_iter_next() and
 * css_task_iter_end() take as @it; callers should not touch the fields
 * directly.
 */
struct css_task_iter {
        struct cgroup_subsys                *ss;
        unsigned int                        flags;        /* presumably CSS_TASK_ITER_* bits -- confirm */

        struct list_head                *cset_pos;
        struct list_head                *cset_head;

        struct list_head                *tcset_pos;
        struct list_head                *tcset_head;

        struct list_head                *task_pos;

        struct list_head                *cur_tasks_head;
        struct css_set                        *cur_cset;
        struct css_set                        *cur_dcset;
        struct task_struct                *cur_task;
        struct list_head                iters_node;        /* css_set->task_iters */
};

extern struct file_system_type cgroup_fs_type;
extern struct cgroup_root cgrp_dfl_root;
extern struct css_set init_css_set;
extern spinlock_t css_set_lock;

#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
#include <linux/cgroup_subsys.h>
#undef SUBSYS

#define SUBSYS(_x)                                                                \
        extern struct static_key_true _x ## _cgrp_subsys_enabled_key;                \
        extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key;
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/**
 * cgroup_subsys_enabled - fast test on whether a subsys is enabled
 * @ss: subsystem in question
 *
 * @ss must be a bare identifier: the macro token-pastes "_enabled_key" onto
 * it and passes the address of the resulting static key to
 * static_branch_likely().  For a subsystem declared through SUBSYS(), passing
 * <name>_cgrp_subsys therefore resolves to <name>_cgrp_subsys_enabled_key.
 */
#define cgroup_subsys_enabled(ss)                                                \
        static_branch_likely(&ss ## _enabled_key)

/**
 * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy
 * @ss: subsystem in question
 *
 * @ss must be a bare identifier: the macro token-pastes "_on_dfl_key" onto
 * it and passes the address of the resulting static key to
 * static_branch_likely().  For a subsystem declared through SUBSYS(), passing
 * <name>_cgrp_subsys therefore resolves to <name>_cgrp_subsys_on_dfl_key.
 */
#define cgroup_subsys_on_dfl(ss)                                                \
        static_branch_likely(&ss ## _on_dfl_key)

bool css_has_online_children(struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup,
                                         struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
                                             struct cgroup_subsys *ss);
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
                                                       struct cgroup_subsys *ss);

struct cgroup *cgroup_get_from_path(const char *path);
struct cgroup *cgroup_get_from_fd(int fd);
struct cgroup *cgroup_v1v2_get_from_fd(int fd);

int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);

int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_rm_cftypes(struct cftype *cfts);
void cgroup_file_notify(struct cgroup_file *cfile);
void cgroup_file_show(struct cgroup_file *cfile, bool show);

int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk);

void cgroup_fork(struct task_struct *p);
extern int cgroup_can_fork(struct task_struct *p,
                           struct kernel_clone_args *kargs);
extern void cgroup_cancel_fork(struct task_struct *p,
                               struct kernel_clone_args *kargs);
extern void cgroup_post_fork(struct task_struct *p,
                             struct kernel_clone_args *kargs);
void cgroup_exit(struct task_struct *p);
void cgroup_release(struct task_struct *p);
void cgroup_free(struct task_struct *p);

int cgroup_init_early(void);
int cgroup_init(void);

int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v);

/*
 * Iteration helpers and macros.
 */

struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
                                           struct cgroup_subsys_state *parent);
struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos,
                                                    struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos);
struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos,
                                                     struct cgroup_subsys_state *css);

struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
                                         struct cgroup_subsys_state **dst_cssp);
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
                                        struct cgroup_subsys_state **dst_cssp);

void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                         struct css_task_iter *it);
struct task_struct *css_task_iter_next(struct css_task_iter *it);
void css_task_iter_end(struct css_task_iter *it);

/**
 * css_for_each_child - iterate through children of a css
 * @pos: the css * to use as the loop cursor
 * @parent: css whose children to walk
 *
 * Walk @parent's children.  Must be called under rcu_read_lock().
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * It is allowed to temporarily drop RCU read lock during iteration.  The
 * caller is responsible for ensuring that @pos remains accessible until
 * the start of the next iteration by, for example, bumping the css refcnt.
 *
 * This expands to a for statement driven by css_next_child(), starting from
 * a NULL cursor and stopping when it returns NULL.  @pos and @parent are
 * evaluated on every step, so they must be free of side effects.
 */
#define css_for_each_child(pos, parent)                                        \
        for ((pos) = css_next_child(NULL, (parent)); (pos);                \
             (pos) = css_next_child((pos), (parent)))

/**
 * css_for_each_descendant_pre - pre-order walk of a css's descendants
 * @pos: the css * to use as the loop cursor
 * @css: css whose descendants to walk
 *
 * Walk @css's descendants.  @css is included in the iteration and the
 * first node to be visited.  Must be called under rcu_read_lock().
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * For example, the following guarantees that a descendant can't escape
 * state updates of its ancestors.
 *
 * my_online(@css)
 * {
 *        Lock @css's parent and @css;
 *        Inherit state from the parent;
 *        Unlock both.
 * }
 *
 * my_update_state(@css)
 * {
 *        css_for_each_descendant_pre(@pos, @css) {
 *                Lock @pos;
 *                if (@pos == @css)
 *                        Update @css's state;
 *                else
 *                        Verify @pos is alive and inherit state from its parent;
 *                Unlock @pos;
 *        }
 * }
 *
 * As long as the inheriting step, including checking the parent state, is
 * enclosed inside @pos locking, double-locking the parent isn't necessary
 * while inheriting.  The state update to the parent is guaranteed to be
 * visible by walking order and, as long as inheriting operations to the
 * same @pos are atomic to each other, multiple updates racing each other
 * still result in the correct state.  It's guaranteed that at least one
 * inheritance happens for any css after the latest update to its parent.
 *
 * If checking parent's state requires locking the parent, each inheriting
 * iteration should lock and unlock both @pos->parent and @pos.
 *
 * Alternatively, a subsystem may choose to use a single global lock to
 * synchronize ->css_online() and ->css_offline() against tree-walking
 * operations.
 *
 * It is allowed to temporarily drop RCU read lock during iteration.  The
 * caller is responsible for ensuring that @pos remains accessible until
 * the start of the next iteration by, for example, bumping the css refcnt.
 */
#define css_for_each_descendant_pre(pos, css)                                \
        for ((pos) = css_next_descendant_pre(NULL, (css)); (pos);        \
             (pos) = css_next_descendant_pre((pos), (css)))

/**
 * css_for_each_descendant_post - post-order walk of a css's descendants
 * @pos: the css * to use as the loop cursor
 * @css: css whose descendants to walk
 *
 * Similar to css_for_each_descendant_pre() but performs post-order
 * traversal instead.  @css is included in the iteration and the last
 * node to be visited.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Note that the walk visibility guarantee example described in pre-order
 * walk doesn't apply the same to post-order walks.
 */
#define css_for_each_descendant_post(pos, css)                                \
        for ((pos) = css_next_descendant_post(NULL, (css)); (pos);        \
             (pos) = css_next_descendant_post((pos), (css)))

/**
 * cgroup_taskset_for_each - iterate cgroup_taskset
 * @task: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 *
 * @tset may contain multiple tasks and they may belong to multiple
 * processes.
 *
 * On the v2 hierarchy, there may be tasks from multiple processes and they
 * may not share the source or destination csses.
 *
 * On traditional hierarchies, when there are multiple tasks in @tset, if a
 * task of a process is in @tset, all tasks of the process are in @tset.
 * Also, all are guaranteed to share the same source and destination csses.
 *
 * Iteration is not in any specific order.
 *
 * @dst_css must be an lvalue: its address is handed to
 * cgroup_taskset_first() and cgroup_taskset_next() on every step.  The loop
 * ends when one of them returns NULL for @task.
 */
#define cgroup_taskset_for_each(task, dst_css, tset)                        \
        for ((task) = cgroup_taskset_first((tset), &(dst_css));                \
             (task);                                                        \
             (task) = cgroup_taskset_next((tset), &(dst_css)))

/**
 * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
 * @leader: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 *
 * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
 * may not contain any.
 *
 * The walk itself visits every task in @tset; the trailing "if ... ; else"
 * makes the statement that follows the macro run only for tasks that are
 * their own ->group_leader.  Because the macro ends in a dangling "else",
 * it must be followed by exactly one statement or braced block.
 */
#define cgroup_taskset_for_each_leader(leader, dst_css, tset)                \
        for ((leader) = cgroup_taskset_first((tset), &(dst_css));        \
             (leader);                                                        \
             (leader) = cgroup_taskset_next((tset), &(dst_css)))        \
                if ((leader) != (leader)->group_leader)                        \
                        ;                                                \
                else

/*
 * Inline functions.
 */

#ifdef CONFIG_DEBUG_CGROUP_REF
void css_get(struct cgroup_subsys_state *css);
void css_get_many(struct cgroup_subsys_state *css, unsigned int n);
bool css_tryget(struct cgroup_subsys_state *css);
bool css_tryget_online(struct cgroup_subsys_state *css);
void css_put(struct cgroup_subsys_state *css);
void css_put_many(struct cgroup_subsys_state *css, unsigned int n);
#else
#define CGROUP_REF_FN_ATTRS        static inline
#define CGROUP_REF_EXPORT(fn)
#include <linux/cgroup_refcnt.h>
#endif

/* Return the ID of @cgrp, read from cgrp->kn->id. */
static inline u64 cgroup_id(const struct cgroup *cgrp)
{
        u64 id = cgrp->kn->id;

        return id;
}

/**
 * css_is_dying - test whether the specified css is dying
 * @css: target css
 *
 * Report whether @css is being offlined or is already offline.  The
 * ->css_online() and ->css_offline() callbacks are enough for most users,
 * but the actual offline operations are RCU delayed; this test also returns
 * %true while @css is merely scheduled to be offlined.
 *
 * That matters when a use case needs behavior that is synchronous with
 * cgroup removal: removal schedules css offlining, yet the css can still
 * look alive while the operation is pending.  If that delay would leak into
 * user visible semantics, use this test to resolve it.
 *
 * A css with CSS_NO_REF set is never reported as dying.
 */
static inline bool css_is_dying(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return false;

        return percpu_ref_is_dying(&css->refcnt);
}

/* Take a reference on @cgrp by calling css_get() on its ->self css. */
static inline void cgroup_get(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *self_css = &cgrp->self;

        css_get(self_css);
}

/*
 * Try to take a reference on @cgrp by calling css_tryget() on its ->self
 * css; the result of css_tryget() is returned unchanged.
 */
static inline bool cgroup_tryget(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *self_css = &cgrp->self;

        return css_tryget(self_css);
}

/* Drop a reference on @cgrp by calling css_put() on its ->self css. */
static inline void cgroup_put(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *self_css = &cgrp->self;

        css_put(self_css);
}

extern struct mutex cgroup_mutex;

/* Acquire the global cgroup_mutex; pair with cgroup_unlock(). */
static inline void cgroup_lock(void)
{
        mutex_lock(&cgroup_mutex);
}

/* Release the global cgroup_mutex taken by cgroup_lock(). */
static inline void cgroup_unlock(void)
{
        mutex_unlock(&cgroup_mutex);
}

/**
 * task_css_set_check - obtain a task's css_set with extra access conditions
 * @task: the task to obtain css_set for
 * @__c: extra condition expression to be passed to rcu_dereference_check()
 *
 * A task's css_set is RCU protected, initialized and exited while holding
 * task_lock(), and can only be modified while holding both cgroup_mutex
 * and task_lock() while the task is alive.  This macro verifies that the
 * caller is inside proper critical section and returns @task's css_set.
 *
 * The caller can also specify additional allowed conditions via @__c, such
 * as locks used during the cgroup_subsys::attach() methods.
 *
 * The verification is only compiled in with CONFIG_PROVE_RCU.  Without it
 * the macro is a plain rcu_dereference() and @__c is dropped without being
 * evaluated, so @__c must not have side effects the caller relies on.
 */
#ifdef CONFIG_PROVE_RCU
#define task_css_set_check(task, __c)                                        \
        rcu_dereference_check((task)->cgroups,                                \
                rcu_read_lock_sched_held() ||                                \
                lockdep_is_held(&cgroup_mutex) ||                        \
                lockdep_is_held(&css_set_lock) ||                        \
                ((task)->flags & PF_EXITING) || (__c))
#else
#define task_css_set_check(task, __c)                                        \
        rcu_dereference((task)->cgroups)
#endif

/**
 * task_css_check - obtain css for (task, subsys) w/ extra access conds
 * @task: the target task
 * @subsys_id: the target subsystem ID
 * @__c: extra condition expression to be passed to rcu_dereference_check()
 *
 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair.  The
 * synchronization rules are the same as task_css_set_check().
 *
 * Expands to an index into the ->subsys[] array of the css_set obtained
 * from task_css_set_check(); @subsys_id is not range-checked here.
 */
#define task_css_check(task, subsys_id, __c)                                \
        task_css_set_check((task), (__c))->subsys[(subsys_id)]

/**
 * task_css_set - obtain a task's css_set
 * @task: the task to obtain css_set for
 *
 * Same as task_css_set_check() with no extra access condition; see there
 * for the synchronization rules.
 */
static inline struct css_set *task_css_set(struct task_struct *task)
{
        struct css_set *cset = task_css_set_check(task, false);

        return cset;
}

/**
 * task_css - obtain css for (task, subsys)
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * Shorthand for task_css_check() with no extra access condition
 * (the condition argument is %false).
 */
static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
                                                   int subsys_id)
{
        struct cgroup_subsys_state *css;

        css = task_css_check(task, subsys_id, false);
        return css;
}

/**
 * task_get_css - find and get the css for (task, subsys)
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * Look up the css for the (@task, @subsys_id) pair under rcu_read_lock()
 * and take a reference on it, retrying the lookup until css_tryget()
 * succeeds.  A valid css is always returned; it may already be offline.
 */
static inline struct cgroup_subsys_state *
task_get_css(struct task_struct *task, int subsys_id)
{
        struct cgroup_subsys_state *css;

        rcu_read_lock();
        for (;;) {
                css = task_css(task, subsys_id);
                /*
                 * css_tryget_online() is deliberately not used: a task with
                 * PF_EXITING set may remain associated with an offline css,
                 * and for such a task css_tryget_online() would never
                 * succeed.
                 */
                if (likely(css_tryget(css))) {
                        rcu_read_unlock();
                        return css;
                }
                cpu_relax();
        }
}

/**
 * task_css_is_root - test whether a task belongs to the root css
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * Compare @task's css for @subsys_id with the one recorded in init_css_set.
 * The lookup passes %true as the extra access condition, so this may be
 * invoked in any context.
 */
static inline bool task_css_is_root(struct task_struct *task, int subsys_id)
{
        struct cgroup_subsys_state *root_css = init_css_set.subsys[subsys_id];

        return root_css == task_css_check(task, subsys_id, true);
}

static inline struct cgroup *task_cgroup(struct task_struct *task,
                                         int subsys_id)
{
        return task_css(task, subsys_id)->cgroup;
}

static inline struct cgroup *task_dfl_cgroup(struct task_struct *task)
{
        return task_css_set(task)->dfl_cgrp;
}

/*
 * Return the cgroup whose embedded ->self css is the parent of @cgrp's own
 * ->self css, or NULL when @cgrp->self has no parent.
 */
static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pcss = cgrp->self.parent;

        if (!pcss)
                return NULL;
        return container_of(pcss, struct cgroup, self);
}

/**
 * cgroup_is_descendant - test ancestry
 * @cgrp: the cgroup to be tested
 * @ancestor: possible ancestor of @cgrp
 *
 * Return %true when @cgrp is @ancestor itself or lies below it.  The two
 * must share a root and @cgrp must not sit at a shallower level; the answer
 * is then read from @cgrp's ancestors[] table at @ancestor's level.  Safe
 * to call as long as @cgrp and @ancestor are accessible.
 */
static inline bool cgroup_is_descendant(struct cgroup *cgrp,
                                        struct cgroup *ancestor)
{
        if (cgrp->root != ancestor->root)
                return false;
        if (cgrp->level < ancestor->level)
                return false;

        return cgrp->ancestors[ancestor->level] == ancestor;
}

/**
 * cgroup_ancestor - find ancestor of cgroup
 * @cgrp: cgroup to find ancestor of
 * @ancestor_level: level of ancestor to find starting from root
 *
 * Return the entry of @cgrp's ancestors[] table at @ancestor_level, or NULL
 * when @ancestor_level is negative or deeper than @cgrp's own level.
 *
 * This function is safe to call as long as @cgrp is accessible.
 */
static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp,
                                             int ancestor_level)
{
        if (ancestor_level >= 0 && ancestor_level <= cgrp->level)
                return cgrp->ancestors[ancestor_level];

        return NULL;
}

/**
 * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry
 * @task: the task to be tested
 * @ancestor: possible ancestor of @task's cgroup
 *
 * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
 * It follows all the same rules as cgroup_is_descendant, and only applies
 * to the default hierarchy.
 */
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                               struct cgroup *ancestor)
{
        struct css_set *cset = task_css_set(task);

        return cgroup_is_descendant(cset->dfl_cgrp, ancestor);
}

/*
 * True when the sum of @cgrp's three populated counters is non-zero.
 * No synchronization is performed, so the result can only be used as a hint.
 */
static inline bool cgroup_is_populated(struct cgroup *cgrp)
{
        return (cgrp->nr_populated_csets +
                cgrp->nr_populated_domain_children +
                cgrp->nr_populated_threaded_children) != 0;
}

/* Return the inode number kernfs_ino() reports for @cgrp's kernfs node. */
static inline ino_t cgroup_ino(struct cgroup *cgrp)
{
        struct kernfs_node *kn = cgrp->kn;

        return kernfs_ino(kn);
}

/* cft/css accessors for cftype->write() operation */
static inline struct cftype *of_cft(struct kernfs_open_file *of)
{
        return of->kn->priv;
}

struct cgroup_subsys_state *of_css(struct kernfs_open_file *of);

/*
 * cft/css accessors for cftype->seq_*() operations.
 * seq_cft() hands @seq->private to of_cft().
 */
static inline struct cftype *seq_cft(struct seq_file *seq)
{
        struct kernfs_open_file *of = seq->private;

        return of_cft(of);
}

/* Hand @seq->private to of_css() and return the css it yields. */
static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
{
        struct kernfs_open_file *of = seq->private;

        return of_css(of);
}

/*
 * Name / path handling functions.  All are thin wrappers around the kernfs
 * counterparts and can be called under any context.
 */

/* Forward to kernfs_name() for @cgrp's kernfs node; returns its result. */
static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
{
        struct kernfs_node *kn = cgrp->kn;

        return kernfs_name(kn, buf, buflen);
}

/* Forward to kernfs_path() for @cgrp's kernfs node; returns its result. */
static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen)
{
        struct kernfs_node *kn = cgrp->kn;

        return kernfs_path(kn, buf, buflen);
}

/* Forward @cgrp's kernfs node to pr_cont_kernfs_name(). */
static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
{
        struct kernfs_node *kn = cgrp->kn;

        pr_cont_kernfs_name(kn);
}

/* Forward @cgrp's kernfs node to pr_cont_kernfs_path(). */
static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
{
        struct kernfs_node *kn = cgrp->kn;

        pr_cont_kernfs_path(kn);
}

bool cgroup_psi_enabled(void);

static inline void cgroup_init_kthreadd(void)
{
        /*
         * kthreadd is inherited by all kthreads, keep it in the root so
         * that the new kthreads are guaranteed to stay in the root until
         * initialization is finished.
         */
        current->no_cgroup_migration = 1;
}

static inline void cgroup_kthread_ready(void)
{
        /*
         * This kthread finished initialization.  The creator should have
         * set PF_NO_SETAFFINITY if this kthread should stay in the root.
         */
        current->no_cgroup_migration = 0;
}

void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen);
struct cgroup *cgroup_get_from_id(u64 id);
#else /* !CONFIG_CGROUPS */

/*
 * Stubs for kernels built without CONFIG_CGROUPS.  Only forward
 * declarations of the cgroup types are provided; the functions below do
 * nothing or return a fixed value.
 */
struct cgroup_subsys_state;
struct cgroup;

/* every cgroup reports the constant ID 1 */
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
static inline void cgroup_lock(void) {}
static inline void cgroup_unlock(void) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
                                         struct task_struct *t) { return 0; }
/* the only stub here that reports failure: always -EINVAL */
static inline int cgroupstats_build(struct cgroupstats *stats,
                                    struct dentry *dentry) { return -EINVAL; }

/* fork/exit hooks: no-ops; cgroup_can_fork() always returns 0 */
static inline void cgroup_fork(struct task_struct *p) {}
static inline int cgroup_can_fork(struct task_struct *p,
                                  struct kernel_clone_args *kargs) { return 0; }
static inline void cgroup_cancel_fork(struct task_struct *p,
                                      struct kernel_clone_args *kargs) {}
static inline void cgroup_post_fork(struct task_struct *p,
                                    struct kernel_clone_args *kargs) {}
static inline void cgroup_exit(struct task_struct *p) {}
static inline void cgroup_release(struct task_struct *p) {}
static inline void cgroup_free(struct task_struct *p) {}

/* init hooks: both return 0; the kthread helpers are no-ops */
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_init_kthreadd(void) {}
static inline void cgroup_kthread_ready(void) {}

/* no hierarchy exists, so no cgroup has a parent */
static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
        return NULL;
}

static inline bool cgroup_psi_enabled(void)
{
        return false;
}

/* always reports membership: returns true for any (@task, @ancestor) */
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                               struct cgroup *ancestor)
{
        return true;
}

/* leaves @buf untouched */
static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
{}
#endif /* !CONFIG_CGROUPS */

#ifdef CONFIG_CGROUPS
/*
 * cgroup scalable recursive statistics.
 */
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
void cgroup_rstat_flush(struct cgroup *cgrp);
void cgroup_rstat_flush_hold(struct cgroup *cgrp);
void cgroup_rstat_flush_release(struct cgroup *cgrp);

/*
 * Basic resource stats.
 */
#ifdef CONFIG_CGROUP_CPUACCT
/* Implemented outside this header when CONFIG_CGROUP_CPUACCT is enabled. */
void cpuacct_charge(struct task_struct *tsk, u64 cputime);
void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
#else
/* CONFIG_CGROUP_CPUACCT disabled: both accounting hooks are no-ops. */
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
static inline void cpuacct_account_field(struct task_struct *tsk, int index,
                                         u64 val) {}
#endif

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec);

/*
 * Pass @delta_exec to cpuacct_charge() for @task, then to
 * __cgroup_account_cputime() for @task's default-hierarchy cgroup.  The
 * second step is skipped when that cgroup has no parent.
 */
static inline void cgroup_account_cputime(struct task_struct *task,
                                          u64 delta_exec)
{
        struct cgroup *dfl;

        cpuacct_charge(task, delta_exec);

        dfl = task_dfl_cgroup(task);
        if (!cgroup_parent(dfl))
                return;

        __cgroup_account_cputime(dfl, delta_exec);
}

/*
 * Pass (@index, @delta_exec) to cpuacct_account_field() for @task, then to
 * __cgroup_account_cputime_field() for @task's default-hierarchy cgroup.
 * The second step is skipped when that cgroup has no parent.
 */
static inline void cgroup_account_cputime_field(struct task_struct *task,
                                                enum cpu_usage_stat index,
                                                u64 delta_exec)
{
        struct cgroup *dfl;

        cpuacct_account_field(task, index, delta_exec);

        dfl = task_dfl_cgroup(task);
        if (!cgroup_parent(dfl))
                return;

        __cgroup_account_cputime_field(dfl, index, delta_exec);
}

#else        /* !CONFIG_CGROUPS */

/* Without cgroup support the CPU-time accounting hooks do nothing. */
static inline void cgroup_account_cputime(struct task_struct *task,
                                          u64 delta_exec) {}
static inline void cgroup_account_cputime_field(struct task_struct *task,
                                                enum cpu_usage_stat index,
                                                u64 delta_exec) {}

#endif        /* CONFIG_CGROUPS */

/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
void cgroup_sk_clone(struct sock_cgroup_data *skcd);
void cgroup_sk_free(struct sock_cgroup_data *skcd);

/* Return the cgroup pointer stored in @skcd. */
static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
{
        struct cgroup *cgrp = skcd->cgroup;

        return cgrp;
}

#else        /* !CONFIG_SOCK_CGROUP_DATA */

/* Socket cgroup data compiled out: alloc/clone/free are no-ops. */
static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {}
static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {}
static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}

#endif        /* CONFIG_SOCK_CGROUP_DATA */

/*
 * Cgroup namespace object.
 * NOTE(review): apart from ns.count, the field roles noted below are
 * inferred from names only -- confirm against the namespace implementation.
 */
struct cgroup_namespace {
        /* ns.count is the refcount used by get_cgroup_ns()/put_cgroup_ns() */
        struct ns_common        ns;
        /* presumably the user namespace that owns this one -- verify */
        struct user_namespace        *user_ns;
        /* presumably the ucounts entry charged for this namespace -- verify */
        struct ucounts                *ucounts;
        /* presumably the css_set acting as this namespace's root -- verify */
        struct css_set          *root_cset;
};

extern struct cgroup_namespace init_cgroup_ns;

#ifdef CONFIG_CGROUPS

void free_cgroup_ns(struct cgroup_namespace *ns);

struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
                                        struct user_namespace *user_ns,
                                        struct cgroup_namespace *old_ns);

int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
                   struct cgroup_namespace *ns);

#else /* !CONFIG_CGROUPS */

/* Without cgroup support there is nothing to free. */
static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
/* Without cgroup support no copy is made: @old_ns is returned unchanged. */
static inline struct cgroup_namespace *
copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
               struct cgroup_namespace *old_ns)
{
        return old_ns;
}

#endif /* !CONFIG_CGROUPS */

/* Take a reference on @ns (increments ns->ns.count); a NULL @ns is ignored. */
static inline void get_cgroup_ns(struct cgroup_namespace *ns)
{
        if (!ns)
                return;

        refcount_inc(&ns->ns.count);
}

/*
 * Drop a reference on @ns and call free_cgroup_ns() when the count reaches
 * zero; a NULL @ns is ignored.
 */
static inline void put_cgroup_ns(struct cgroup_namespace *ns)
{
        if (!ns)
                return;

        if (refcount_dec_and_test(&ns->ns.count))
                free_cgroup_ns(ns);
}

#ifdef CONFIG_CGROUPS

void cgroup_enter_frozen(void);
void cgroup_leave_frozen(bool always_leave);
void cgroup_update_frozen(struct cgroup *cgrp);
void cgroup_freeze(struct cgroup *cgrp, bool freeze);
void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src,
                                 struct cgroup *dst);

/* Report whether @task's ->frozen flag is set. */
static inline bool cgroup_task_frozen(struct task_struct *task)
{
        return task->frozen != 0;
}

#else /* !CONFIG_CGROUPS */

/* Without cgroup support entering/leaving the frozen state is a no-op. */
static inline void cgroup_enter_frozen(void) { }
static inline void cgroup_leave_frozen(bool always_leave) { }
/* No task is ever reported as frozen. */
static inline bool cgroup_task_frozen(struct task_struct *task)
{
        return false;
}

#endif /* !CONFIG_CGROUPS */

#ifdef CONFIG_CGROUP_BPF
/* Take a reference on @cgrp's bpf.refcnt via percpu_ref_get(). */
static inline void cgroup_bpf_get(struct cgroup *cgrp)
{
        struct percpu_ref *ref = &cgrp->bpf.refcnt;

        percpu_ref_get(ref);
}

/* Drop a reference on @cgrp's bpf.refcnt via percpu_ref_put(). */
static inline void cgroup_bpf_put(struct cgroup *cgrp)
{
        struct percpu_ref *ref = &cgrp->bpf.refcnt;

        percpu_ref_put(ref);
}

#else /* !CONFIG_CGROUP_BPF */

/* cgroup BPF support compiled out: get/put are no-ops. */
static inline void cgroup_bpf_get(struct cgroup *cgrp) {}
static inline void cgroup_bpf_put(struct cgroup *cgrp) {}

#endif /* CONFIG_CGROUP_BPF */

struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);

#endif /* _LINUX_CGROUP_H */

















    1 



    2 

    2 





















































    1 





































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/bitops.h>
#include <linux/string.h>
#include <asm/unaligned.h>
#include <crypto/chacha.h>

/*
 * chacha_permute - run the ChaCha rounds over a 16-word state, in place
 * @x: the 16 state words; overwritten with the permuted state
 * @nrounds: number of rounds; the loop advances two rounds per iteration
 *
 * Only additions, XORs and 32-bit left rotations (by 16, 12, 8 and 7) are
 * used.  No final addition of the input state happens here; callers that
 * need it do it themselves.
 */
static void chacha_permute(u32 *x, int nrounds)
{
        int i;

        /* whitelist the allowed round counts */
        /* (warns once on any other value, but the rounds below still run) */
        WARN_ON_ONCE(nrounds != 20 && nrounds != 12);

        for (i = 0; i < nrounds; i += 2) {
                /*
                 * First round of the pair: mixes the index groups
                 * (0,4,8,12), (1,5,9,13), (2,6,10,14), (3,7,11,15), i.e.
                 * the columns when x[] is viewed as a 4x4 matrix.
                 */
                x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],  16);
                x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],  16);
                x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],  16);
                x[3]  += x[7];    x[15] = rol32(x[15] ^ x[3],  16);

                x[8]  += x[12];   x[4]  = rol32(x[4]  ^ x[8],  12);
                x[9]  += x[13];   x[5]  = rol32(x[5]  ^ x[9],  12);
                x[10] += x[14];   x[6]  = rol32(x[6]  ^ x[10], 12);
                x[11] += x[15];   x[7]  = rol32(x[7]  ^ x[11], 12);

                x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],   8);
                x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],   8);
                x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],   8);
                x[3]  += x[7];    x[15] = rol32(x[15] ^ x[3],   8);

                x[8]  += x[12];   x[4]  = rol32(x[4]  ^ x[8],   7);
                x[9]  += x[13];   x[5]  = rol32(x[5]  ^ x[9],   7);
                x[10] += x[14];   x[6]  = rol32(x[6]  ^ x[10],  7);
                x[11] += x[15];   x[7]  = rol32(x[7]  ^ x[11],  7);

                /*
                 * Second round of the pair: same operations on the index
                 * groups (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14),
                 * i.e. the diagonals of the 4x4 view.
                 */
                x[0]  += x[5];    x[15] = rol32(x[15] ^ x[0],  16);
                x[1]  += x[6];    x[12] = rol32(x[12] ^ x[1],  16);
                x[2]  += x[7];    x[13] = rol32(x[13] ^ x[2],  16);
                x[3]  += x[4];    x[14] = rol32(x[14] ^ x[3],  16);

                x[10] += x[15];   x[5]  = rol32(x[5]  ^ x[10], 12);
                x[11] += x[12];   x[6]  = rol32(x[6]  ^ x[11], 12);
                x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],  12);
                x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],  12);

                x[0]  += x[5];    x[15] = rol32(x[15] ^ x[0],   8);
                x[1]  += x[6];    x[12] = rol32(x[12] ^ x[1],   8);
                x[2]  += x[7];    x[13] = rol32(x[13] ^ x[2],   8);
                x[3]  += x[4];    x[14] = rol32(x[14] ^ x[3],   8);

                x[10] += x[15];   x[5]  = rol32(x[5]  ^ x[10],  7);
                x[11] += x[12];   x[6]  = rol32(x[6]  ^ x[11],  7);
                x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],   7);
                x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],   7);
        }
}

/**
 * chacha_block_generic - generate one keystream block and increment block counter
 * @state: input state matrix (16 32-bit words)
 * @stream: output keystream block (64 bytes)
 * @nrounds: number of rounds (20 or 12; 20 is recommended)
 *
 * Permute a copy of @state, add the original @state back word by word and
 * store each sum to @stream as an unaligned little-endian 32-bit value.
 * The caller has already converted the endianness of the input.  Word 12
 * of @state (the block counter) is incremented before returning.
 */
void chacha_block_generic(u32 *state, u8 *stream, int nrounds)
{
        u32 x[16];
        u8 *out = stream;
        int i;

        memcpy(x, state, sizeof(x));
        chacha_permute(x, nrounds);

        for (i = 0; i < ARRAY_SIZE(x); i++, out += sizeof(u32))
                put_unaligned_le32(x[i] + state[i], out);

        state[12] += 1;
}
EXPORT_SYMBOL(chacha_block_generic);

/**
 * hchacha_block_generic - abbreviated ChaCha core, for XChaCha
 * @state: input state matrix (16 32-bit words)
 * @stream: output (8 32-bit words)
 * @nrounds: number of rounds (20 or 12; 20 is recommended)
 *
 * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
 * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf).  A
 * copy of @state is permuted; the initial state is not added back, and only
 * words 0-3 and 12-15 of the result are written to @stream.  It should not
 * be used for streaming directly.
 */
void hchacha_block_generic(const u32 *state, u32 *stream, int nrounds)
{
        u32 x[16];

        memcpy(x, state, sizeof(x));
        chacha_permute(x, nrounds);

        /* first four output words: x[0..3]; last four: x[12..15] */
        memcpy(stream, x, 4 * sizeof(u32));
        memcpy(stream + 4, x + 12, 4 * sizeof(u32));
}
EXPORT_SYMBOL(hchacha_block_generic);






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






    1 
















    1 







    1 






    1 







    1 













    1 
















    1 





























    1 

    1 
    1 

    1 








    1 












    1 







    1 














    1 


    1 
    1 






















































    1 
























    1 

    1 




































    1 
    1 



























































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "SMP alternatives: " fmt

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/perf_event.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/bsearch.h>
#include <linux/sync_core.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/fixmap.h>
#include <asm/paravirt.h>
#include <asm/asm-prototypes.h>
#include <asm/cfi.h>

int __read_mostly alternatives_patched;

EXPORT_SYMBOL_GPL(alternatives_patched);

#define MAX_PATCH_LEN (255-1)

#define DA_ALL                (~0)
#define DA_ALT                0x01
#define DA_RET                0x02
#define DA_RETPOLINE        0x04
#define DA_ENDBR        0x08
#define DA_SMP                0x10

static unsigned int debug_alternative;

/*
 * Handle the "debug-alternative" boot parameter.
 *
 * An optional leading '=' is skipped and the remainder is parsed with
 * kstrtouint() (base 0) into debug_alternative, which DPRINTK()/DUMP_BYTES()
 * test against the DA_* bits. When there is no string at all, or it does not
 * parse, every debug category is enabled (DA_ALL).
 *
 * Always returns 1.
 */
static int __init debug_alt(char *str)
{
        bool parsed = false;

        if (str) {
                /* Step over the '=' between the option name and its value. */
                if (str[0] == '=')
                        str++;

                parsed = !kstrtouint(str, 0, &debug_alternative);
        }

        if (!parsed)
                debug_alternative = DA_ALL;

        return 1;
}
__setup("debug-alternative", debug_alt);

static int noreplace_smp;

/*
 * Handle the "noreplace-smp" boot parameter by setting noreplace_smp.
 * Any value passed in @str is ignored. Always returns 1.
 */
static int __init setup_noreplace_smp(char *str)
{
        noreplace_smp = 1;
        return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);

/*
 * Conditional debug print: emits a KERN_DEBUG line (pr_fmt() prefix applied,
 * newline appended) only when the DA_<type> bit is set in debug_alternative.
 * @type is the suffix of a DA_* flag, e.g. DPRINTK(ALT, ...) tests DA_ALT.
 */
#define DPRINTK(type, fmt, args...)                                        \
do {                                                                        \
        if (debug_alternative & DA_##type)                                \
                printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);                \
} while (0)

/*
 * Conditional hex dump: when the DA_<type> bit is set in debug_alternative,
 * print the @fmt/@args prefix followed by @len bytes of @buf as two-digit hex
 * values on a single KERN_DEBUG line. Nothing at all is printed for a @len of
 * zero (the "break" leaves the enclosing do/while).
 *
 * @buf is subscripted without extra parentheses (buf[j]), so callers passing
 * a cast must parenthesise it themselves, e.g. ((u8*)addr).
 */
#define DUMP_BYTES(type, buf, len, fmt, args...)                        \
do {                                                                        \
        if (unlikely(debug_alternative & DA_##type)) {                        \
                int j;                                                        \
                                                                        \
                if (!(len))                                                \
                        break;                                                \
                                                                        \
                printk(KERN_DEBUG pr_fmt(fmt), ##args);                        \
                for (j = 0; j < (len) - 1; j++)                                \
                        printk(KERN_CONT "%02hhx ", buf[j]);                \
                printk(KERN_CONT "%02hhx\n", buf[j]);                        \
        }                                                                \
} while (0)

/*
 * Back-to-back storage for the NOP encodings BYTES_NOP1 .. BYTES_NOP8, plus
 * BYTES_NOP9 .. BYTES_NOP11 on CONFIG_64BIT. The individual NOPs are located
 * through the x86_nops[] pointer table.
 */
static const unsigned char x86nops[] =
{
        BYTES_NOP1,
        BYTES_NOP2,
        BYTES_NOP3,
        BYTES_NOP4,
        BYTES_NOP5,
        BYTES_NOP6,
        BYTES_NOP7,
        BYTES_NOP8,
#ifdef CONFIG_64BIT
        BYTES_NOP9,
        BYTES_NOP10,
        BYTES_NOP11,
#endif
};

/*
 * x86_nops[n] points at the n-byte NOP inside x86nops[]. The offset of entry
 * n is 1 + 2 + ... + (n - 1), i.e. the combined length of all shorter NOPs
 * stored before it; this relies on every BYTES_NOPk expanding to exactly k
 * bytes (defined outside this file - confirm there). Entry 0 is NULL since
 * there is no zero-length NOP.
 */
const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
{
        NULL,
        x86nops,
        x86nops + 1,
        x86nops + 1 + 2,
        x86nops + 1 + 2 + 3,
        x86nops + 1 + 2 + 3 + 4,
        x86nops + 1 + 2 + 3 + 4 + 5,
        x86nops + 1 + 2 + 3 + 4 + 5 + 6,
        x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
#ifdef CONFIG_64BIT
        x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
        x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
        x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10,
#endif
};

/*
 * Nomenclature for variable names to simplify and clarify this code and ease
 * any potential staring at it:
 *
 * @instr: source address of the original instructions in the kernel text as
 * generated by the compiler.
 *
 * @buf: temporary buffer on which the patching operates. This buffer is
 * eventually text-poked into the kernel image.
 *
 * @replacement/@repl: pointer to the opcodes which are replacing @instr, located
 * in the .altinstr_replacement section.
 */

/*
 * Overwrite @len bytes at @buf so that they act as one effective instruction.
 *
 * A run of single-byte NOPs would get one ORC stack depth tracking CFI (Call
 * Frame Info) entry per NOP. To end up with a single entry, use the one NOP
 * of exactly @len bytes whenever @len <= ASM_NOP_MAX. Longer regions get a
 * JMP to their own end (JMP.d8 for @len < 128, JMP.d32 otherwise) and the
 * bytes being jumped over are filled with INT3, instead of executing long
 * and daft NOPs.
 *
 * A @len of zero leaves @buf untouched.
 */
static void add_nop(u8 *buf, unsigned int len)
{
        u8 *end = buf + len;
        u8 *pad;

        if (!len)
                return;

        if (len <= ASM_NOP_MAX) {
                memcpy(buf, x86_nops[len], len);
                return;
        }

        /* Too long for one NOP: branch from @buf straight to @end. */
        if (len < 128) {
                __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, end, JMP8_INSN_SIZE);
                pad = buf + JMP8_INSN_SIZE;
        } else {
                __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, end, JMP32_INSN_SIZE);
                pad = buf + JMP32_INSN_SIZE;
        }

        /* Everything between the JMP and its target is INT3. */
        memset(pad, INT3_INSN_OPCODE, end - pad);
}

extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern s32 __return_sites[], __return_sites_end[];
extern s32 __cfi_sites[], __cfi_sites_end[];
extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);

/*
 * Tell whether the decoded @insn is a NOP this code knows about.
 *
 * Only opcode 0x90 (unless its first prefix byte is 0xF3, i.e. REP NOP) and
 * the two-byte NOPL opcode 0x0F 0x1F are matched, not any of the other
 * possible NOPs.
 */
static bool insn_is_nop(struct insn *insn)
{
        switch (insn->opcode.bytes[0]) {
        case 0x90:
                /* Anything NOP, but no REP NOP */
                return !insn->prefixes.nbytes ||
                       insn->prefixes.bytes[0] != 0xF3;

        case 0x0F:
                /* NOPL */
                return insn->opcode.bytes[1] == 0x1F;

        default:
                /* TODO: more nops */
                return false;
        }
}

/*
 * Starting at @offset in @buf, step over consecutive NOP instructions and
 * return the offset of the first instruction that is not a NOP or that
 * cannot be decoded. Decoding is only attempted while the offset is below
 * @len; the result can exceed @len if the last NOP straddles it.
 */
static int skip_nops(u8 *buf, int offset, int len)
{
        struct insn insn;

        while (offset < len) {
                if (insn_decode_kernel(&insn, &buf[offset]) ||
                    !insn_is_nop(&insn))
                        break;

                offset += insn.length;
        }

        return offset;
}

/*
 * Collapse runs of NOPs in @buf into single effective instructions.
 *
 * @buf is decoded one instruction at a time over [0, @len). Every NOP found,
 * together with the NOPs directly following it (see skip_nops()), is
 * rewritten by add_nop() as one instruction spanning the whole run. The walk
 * ends silently at the first instruction that fails to decode.
 *
 * @instr is only used to label the debug dump.
 *
 * "noinline" to cause control flow change and thus invalidate I$ and
 * cause refetch after modification.
 */
static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len)
{
        for (int next, i = 0; i < len; i = next) {
                struct insn insn;

                if (insn_decode_kernel(&insn, &buf[i]))
                        return;

                next = i + insn.length;

                if (insn_is_nop(&insn)) {
                        /* Offset at which the current run of NOPs begins. */
                        int nop = i;

                        /*
                         * Has the NOP already been optimized? A single NOP
                         * ending exactly at @len has nothing after it to be
                         * merged with.
                         */
                        if (i + insn.length == len)
                                return;

                        /* Extend the run across all directly following NOPs. */
                        next = skip_nops(buf, next, len);

                        add_nop(buf + nop, next - nop);
                        DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next);
                }
        }
}

/*
 * In this context, "source" is where the instructions are placed in the
 * section .altinstr_replacement, for example during kernel build by the
 * toolchain.
 * "Destination" is where the instructions are being patched in by this
 * machinery.
 *
 * The source offset is:
 *
 *   src_imm = target - src_next_ip                  (1)
 *
 * and the target offset is:
 *
 *   dst_imm = target - dst_next_ip                  (2)
 *
 * so rework (1) as an expression for target like:
 *
 *   target = src_imm + src_next_ip                  (1a)
 *
 * and substitute in (2) to get:
 *
 *   dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
 *
 * Now, since the instruction stream is 'identical' at src and dst (it
 * is being copied after all) it can be stated that:
 *
 *   src_next_ip = src + ip_offset
 *   dst_next_ip = dst + ip_offset                   (4)
 *
 * Substitute (4) in (3) and observe ip_offset being cancelled out to
 * obtain:
 *
 *   dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
 *           = src_imm + src - dst + ip_offset - ip_offset
 *           = src_imm + src - dst                   (5)
 *
 * IOW, only the relative displacement of the code block matters.
 */

/*
 * Add the delta @d_ to the signed @n_-bit integer stored at @p_.
 *
 * The sum is formed in an s32. The BUG_ON() fires when the result is not
 * representable as a signed @n_-bit value, i.e. when the bits from @n_-1
 * upwards are not all copies of the sign bit.
 */
#define apply_reloc_n(n_, p_, d_)                                \
        do {                                                        \
                s32 v = *(s##n_ *)(p_);                                \
                v += (d_);                                        \
                BUG_ON((v >> 31) != (v >> (n_-1)));                \
                *(s##n_ *)(p_) = (s##n_)v;                        \
        } while (0)


/*
 * Add @diff to the signed integer of @n bytes stored at @ptr.
 *
 * @n must be 1, 2 or 4; any other width is a BUG(). Overflow of the patched
 * field is caught by apply_reloc_n().
 */
static __always_inline
void apply_reloc(int n, void *ptr, uintptr_t diff)
{
        if (n == 1)
                apply_reloc_n(8, ptr, diff);
        else if (n == 2)
                apply_reloc_n(16, ptr, diff);
        else if (n == 4)
                apply_reloc_n(32, ptr, diff);
        else
                BUG();
}

/*
 * Decide whether a relative operand has to be adjusted when the block at
 * @src (@src_len bytes long) is copied elsewhere.
 *
 * @offset is the operand's target expressed as an offset from @src. A target
 * within [@src, @src + @src_len] - the end address included - is relative to
 * the block itself and moves along with it, so no relocation is needed.
 * Everything else does need one.
 */
static __always_inline
bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
{
        u8 *block_end = src + src_len;
        u8 *target = src + offset;

        return !(target >= src && target <= block_end);
}

/*
 * Fix up the position-relative operands in @buf for code that was copied
 * from @repl and is going to be placed at @instr.
 *
 * @buf:      working copy, decoded and modified in place
 * @instr:    address the contents of @buf are destined for
 * @instrlen: number of bytes of @buf to walk
 * @repl:     address the code was copied from
 * @repl_len: length of the code at @repl
 *
 * Branch immediates (Jcc.d8/.d32, JMP.d8/.d32, CALL) and RIP-relative
 * displacements whose target lies outside the copied block get
 * (@repl - @instr) added, see equation (5) above. Targets inside the block
 * are left alone.
 *
 * The walk stops, with a one-time warning, at the first instruction that
 * fails to decode.
 */
static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
{
        for (int next, i = 0; i < instrlen; i = next) {
                struct insn insn;

                if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
                        return;

                next = i + insn.length;

                switch (insn.opcode.bytes[0]) {
                case 0x0f:
                        /* Of the 0x0f-escaped opcodes only 0x80..0x8f are handled. */
                        if (insn.opcode.bytes[1] < 0x80 ||
                            insn.opcode.bytes[1] > 0x8f)
                                break;

                        fallthrough;        /* Jcc.d32 */
                case 0x70 ... 0x7f:        /* Jcc.d8 */
                case JMP8_INSN_OPCODE:
                case JMP32_INSN_OPCODE:
                case CALL_INSN_OPCODE:
                        /* next + immediate is the branch target as an offset from the block start. */
                        if (need_reloc(next + insn.immediate.value, repl, repl_len)) {
                                apply_reloc(insn.immediate.nbytes,
                                            buf + i + insn_offset_immediate(&insn),
                                            repl - instr);
                        }

                        /*
                         * Where possible, convert JMP.d32 into JMP.d8.
                         *
                         * The shorter instruction ends JMP32_INSN_SIZE -
                         * JMP8_INSN_SIZE bytes earlier, so its displacement
                         * grows by that amount. The conversion is done only
                         * when the result fits a signed 8-bit value; the
                         * now unused tail of the old instruction becomes
                         * INT3.
                         *
                         * NOTE(review): the (repl - instr) delta is added
                         * here even when need_reloc() reported the target as
                         * block-internal - confirm that this is intended.
                         */
                        if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
                                s32 imm = insn.immediate.value;
                                imm += repl - instr;
                                imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
                                if ((imm >> 31) == (imm >> 7)) {
                                        buf[i+0] = JMP8_INSN_OPCODE;
                                        buf[i+1] = (s8)imm;

                                        memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
                                }
                        }
                        break;
                }

                /* RIP-relative memory operands are position dependent as well. */
                if (insn_rip_relative(&insn)) {
                        if (need_reloc(next + insn.displacement.value, repl, repl_len)) {
                                apply_reloc(insn.displacement.nbytes,
                                            buf + i + insn_offset_displacement(&insn),
                                            repl - instr);
                        }
                }
        }
}

/*
 * Prepare the patch buffer @buf for being placed at @instr: first relocate
 * its position-relative operands (__apply_relocation()), then merge runs of
 * NOPs over the first @instrlen bytes (optimize_nops()).
 */
void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
{
        __apply_relocation(buf, instr, instrlen, repl, repl_len);
        optimize_nops(instr, buf, instrlen);
}

/* Low-level backend functions usable from alternative code replacements. */
DEFINE_ASM_FUNC(nop_func, "", .entry.text);
EXPORT_SYMBOL_GPL(nop_func);

/*
 * Out-of-line BUG(). alt_replace_call() treats the address of this function
 * as the placeholder target of "call BUG_func" replacements and falls back
 * to it when the indirect call slot it reads is NULL.
 */
noinstr void BUG_func(void)
{
        BUG();
}
EXPORT_SYMBOL(BUG_func);

#define CALL_RIP_REL_OPCODE        0xff
#define CALL_RIP_REL_MODRM        0x15

/*
 * Rewrite the "call BUG_func" replacement to point to the target of the
 * indirect pv_ops call "call *disp(%ip)".
 *
 * @instr:     the original indirect call site
 * @insn_buff: copy of the replacement, modified in place
 * @a:         the alternative entry being applied
 *
 * BUG()s unless the replacement is a 5-byte CALL and the original is the
 * 6-byte "ff 15 <disp32>" indirect call.
 *
 * Returns the number of valid bytes in @insn_buff: 5 for the rewritten
 * direct CALL, or 0 when the target is nop_func, in which case no call is
 * emitted at all.
 */
static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
{
        void *target, *bug = &BUG_func;
        s32 disp;

        if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
                pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
                BUG();
        }

        if (a->instrlen != 6 ||
            instr[0] != CALL_RIP_REL_OPCODE ||
            instr[1] != CALL_RIP_REL_MODRM) {
                pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
                BUG();
        }

        /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
        disp = *(s32 *)(instr + 2);
#ifdef CONFIG_X86_64
        /* ff 15 00 00 00 00   call   *0x0(%rip) */
        /* target address is stored at "next instruction + disp". */
        target = *(void **)(instr + a->instrlen + disp);
#else
        /* ff 15 00 00 00 00   call   *0x0 */
        /* target address is stored at disp. */
        target = *(void **)disp;
#endif
        /* An empty slot keeps the call pointing at BUG_func. */
        if (!target)
                target = bug;

        /* (BUG_func - .) + (target - BUG_func) := target - . */
        *(s32 *)(insn_buff + 1) += target - bug;

        /* Calling nop_func is pointless: report zero bytes of replacement. */
        if (target == &nop_func)
                return 0;

        return 5;
}

/*
 * Replace instructions with better alternatives for this CPU type. This runs
 * before SMP is initialized to avoid SMP problems with self modifying code.
 * This implies that asymmetric systems where APs have less capabilities than
 * the boot processor are not handled. Tough. Make sure you disable such
 * features by hand.
 *
 * @start, @end: the table of alternative entries to process, [@start, @end).
 *
 * For every entry the patch is assembled in a local buffer and then written
 * over the original instructions with text_poke_early().
 *
 * Marked "noinline" to cause control flow change and thus insn cache
 * to refetch changed I$ lines.
 */
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
                                                  struct alt_instr *end)
{
        u8 insn_buff[MAX_PATCH_LEN];
        u8 *instr, *replacement;
        struct alt_instr *a;

        DPRINTK(ALT, "alt table %px, -> %px", start, end);

        /*
         * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using
         * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
         * During the process, KASAN becomes confused seeing partial LA57
         * conversion and triggers a false-positive out-of-bound report.
         *
         * Disable KASAN until the patching is complete.
         */
        kasan_disable_current();

        /*
         * The scan order should be from start to end. A later scanned
         * alternative code can overwrite previously scanned alternative code.
         * Some kernel functions (e.g. memcpy, memset, etc) use this order to
         * patch code.
         *
         * So be careful if you want to change the scan order to any other
         * order.
         */
        for (a = start; a < end; a++) {
                int insn_buff_sz = 0;

                /* Both offsets are relative to the address of their own field. */
                instr = (u8 *)&a->instr_offset + a->instr_offset;
                replacement = (u8 *)&a->repl_offset + a->repl_offset;
                /*
                 * NOTE(review): only instrlen is checked against the buffer
                 * size; replacementlen is presumably guaranteed not to
                 * exceed instrlen at build time - confirm.
                 */
                BUG_ON(a->instrlen > sizeof(insn_buff));
                BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);

                /*
                 * Patch if either:
                 * - feature is present
                 * - feature not present but ALT_FLAG_NOT is set to mean,
                 *   patch if feature is *NOT* present.
                 *
                 * The branch below is the opposite case: the replacement is
                 * not wanted, so the original instructions stay and only
                 * their NOPs get optimized.
                 */
                if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
                        memcpy(insn_buff, instr, a->instrlen);
                        optimize_nops(instr, insn_buff, a->instrlen);
                        text_poke_early(instr, insn_buff, a->instrlen);
                        continue;
                }

                DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
                        a->cpuid >> 5,
                        a->cpuid & 0x1f,
                        instr, instr, a->instrlen,
                        replacement, a->replacementlen, a->flags);

                memcpy(insn_buff, replacement, a->replacementlen);
                insn_buff_sz = a->replacementlen;

                if (a->flags & ALT_FLAG_DIRECT_CALL) {
                        insn_buff_sz = alt_replace_call(instr, insn_buff, a);
                        /*
                         * NOTE(review): alt_replace_call() as defined in this
                         * file returns 0 or 5, so this looks unreachable.
                         */
                        if (insn_buff_sz < 0)
                                continue;
                }

                /* Pad with single-byte NOPs up to the length of the original. */
                for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
                        insn_buff[insn_buff_sz] = 0x90;

                apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);

                DUMP_BYTES(ALT, instr, a->instrlen, "%px:   old_insn: ", instr);
                DUMP_BYTES(ALT, replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
                DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);

                text_poke_early(instr, insn_buff, insn_buff_sz);
        }

        kasan_enable_current();
}

static inline bool is_jcc32(struct insn *insn)
{
        /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
        return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
}

#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)

/*
 * Encode "CALL *%\reg" or "JMP *%\reg" into @bytes.
 *
 * @op:    CALL_INSN_OPCODE or JMP32_INSN_OPCODE, selecting CALL vs. JMP
 * @reg:   register number; 8 and above is encoded with a REX.B prefix
 * @bytes: output buffer, receives 2 bytes, or 3 with the prefix
 *
 * Returns the number of bytes written, or -1 (after a one-time warning) for
 * any other @op.
 */
static int emit_indirect(int op, int reg, u8 *bytes)
{
        u8 *out = bytes;
        u8 modrm;

        /* ModRM: Mod = 3 (register operand), Reg field selects the operation. */
        if (op == CALL_INSN_OPCODE) {
                modrm = 0xc0 | 0x10; /* Reg = 2; CALL r/m */
        } else if (op == JMP32_INSN_OPCODE) {
                modrm = 0xc0 | 0x20; /* Reg = 4; JMP r/m */
        } else {
                WARN_ON_ONCE(1);
                return -1;
        }

        if (reg >= 8) {
                *out++ = 0x41; /* REX.B prefix */
                reg -= 8;
        }

        *out++ = 0xff; /* opcode */
        *out++ = modrm + reg;

        return out - bytes;
}

/*
 * Re-emit the thunk branch at @addr so that it targets the per-register
 * entry in __x86_indirect_call_thunk_array[] (for CALL) or
 * __x86_indirect_jump_thunk_array[] (for JMP and Jcc).
 *
 * @addr:  address of the instruction being rewritten
 * @insn:  decoded form of that instruction
 * @reg:   index into the thunk arrays
 * @bytes: output buffer for the new encoding
 *
 * Returns the number of bytes written, or -1 after a warning when the
 * opcode is neither CALL, JMP.d32 nor Jcc.d32. A one-time warning is also
 * raised if the new encoding differs in length from the original.
 */
static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
{
        u8 op = insn->opcode.bytes[0];
        int i = 0;

        /*
         * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
         * tail-calls. Deal with them.
         */
        if (is_jcc32(insn)) {
                /* Keep the 0x0f escape byte; the second byte becomes the opcode. */
                bytes[i++] = op;
                op = insn->opcode.bytes[1];
                goto clang_jcc;
        }

        /* A 6-byte CALL/JMP carries one extra byte: reproduce it as a CS prefix. */
        if (insn->length == 6)
                bytes[i++] = 0x2e; /* CS-prefix */

        switch (op) {
        case CALL_INSN_OPCODE:
                __text_gen_insn(bytes+i, op, addr+i,
                                __x86_indirect_call_thunk_array[reg],
                                CALL_INSN_SIZE);
                i += CALL_INSN_SIZE;
                break;

        case JMP32_INSN_OPCODE:
clang_jcc:
                __text_gen_insn(bytes+i, op, addr+i,
                                __x86_indirect_jump_thunk_array[reg],
                                JMP32_INSN_SIZE);
                i += JMP32_INSN_SIZE;
                break;

        default:
                WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
                return -1;
        }

        WARN_ON_ONCE(i != insn->length);

        return i;
}

/*
 * Rewrite the compiler generated retpoline thunk calls.
 *
 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
 * indirect instructions, avoiding the extra indirection.
 *
 * For example, convert:
 *
 *   CALL __x86_indirect_thunk_\reg
 *
 * into:
 *
 *   CALL *%\reg
 *
 * It also tries to inline spectre_v2=retpoline,lfence when size permits.
 *
 * @addr:  address of the thunk call/jump
 * @insn:  decoded form of the instruction at @addr
 * @bytes: output buffer for the replacement encoding
 *
 * Returns the number of bytes placed in @bytes, or -1 when the site is to be
 * left as it is. The caller only applies the result when it equals
 * @insn->length.
 */
static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
{
        retpoline_thunk_t *target;
        int reg, ret, i = 0;
        u8 op, cc;

        /* The index of the branch target within the thunk array is the register. */
        target = addr + insn->length + insn->immediate.value;
        reg = target - __x86_indirect_thunk_array;

        if (WARN_ON_ONCE(reg & ~0xf))
                return -1;

        /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
        BUG_ON(reg == 4);

        /*
         * Retpolines without LFENCE keep going through a thunk: either the
         * one chosen by emit_call_track_retpoline() when
         * X86_FEATURE_CALL_DEPTH is enabled, or the existing one (-1).
         */
        if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
            !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
                if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
                        return emit_call_track_retpoline(addr, insn, reg, bytes);

                return -1;
        }

        op = insn->opcode.bytes[0];

        /*
         * Convert:
         *
         *   Jcc.d32 __x86_indirect_thunk_\reg
         *
         * into:
         *
         *   Jncc.d8 1f
         *   [ LFENCE ]
         *   JMP *%\reg
         *   [ NOP ]
         * 1:
         */
        if (is_jcc32(insn)) {
                cc = insn->opcode.bytes[1] & 0xf;
                cc ^= 1; /* invert condition */

                /* The Jncc.d8 skips the remainder of the original instruction. */
                bytes[i++] = 0x70 + cc;        /* Jcc.d8 */
                bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */

                /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
                op = JMP32_INSN_OPCODE;
        }

        /*
         * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
         */
        if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
                bytes[i++] = 0x0f;
                bytes[i++] = 0xae;
                bytes[i++] = 0xe8; /* LFENCE */
        }

        ret = emit_indirect(op, reg, bytes + i);
        if (ret < 0)
                return ret;
        i += ret;

        /*
         * The compiler is supposed to EMIT an INT3 after every unconditional
         * JMP instruction due to AMD BTC. However, if the compiler is too old
         * or MITIGATION_SLS isn't enabled, we still need an INT3 after
         * indirect JMPs even on Intel.
         */
        if (op == JMP32_INSN_OPCODE && i < insn->length)
                bytes[i++] = INT3_INSN_OPCODE;

        /* Fill whatever is left of the original instruction with 1-byte NOPs. */
        for (; i < insn->length;)
                bytes[i++] = BYTES_NOP1;

        return i;
}

/*
 * Generated by 'objtool --retpoline'.
 *
 * Walk the site table [@start, @end). Every s32 entry holds the offset from
 * the entry itself to an instruction to patch, which has to be a CALL,
 * JMP.d32 or Jcc.d32; anything else, or an undecodable instruction, is
 * skipped with a one-time warning.
 *
 * Each site is handed to patch_retpoline() and rewritten in place only when
 * the replacement has exactly the length of the original instruction.
 */
void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
{
        s32 *s;

        for (s = start; s < end; s++) {
                void *addr = (void *)s + *s;
                struct insn insn;
                int len, ret;
                u8 bytes[16];
                u8 op1, op2;

                ret = insn_decode_kernel(&insn, addr);
                if (WARN_ON_ONCE(ret < 0))
                        continue;

                op1 = insn.opcode.bytes[0];
                op2 = insn.opcode.bytes[1];

                switch (op1) {
                case CALL_INSN_OPCODE:
                case JMP32_INSN_OPCODE:
                        break;

                case 0x0f: /* escape */
                        /* Jcc.d32 */
                        if (op2 >= 0x80 && op2 <= 0x8f)
                                break;
                        fallthrough;
                default:
                        WARN_ON_ONCE(1);
                        continue;
                }

                DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
                        addr, addr, insn.length,
                        addr + insn.length + insn.immediate.value);

                len = patch_retpoline(addr, &insn, bytes);
                /* -1 or any other length mismatch: leave the site as it is. */
                if (len == insn.length) {
                        optimize_nops(addr, bytes, len);
                        DUMP_BYTES(RETPOLINE, ((u8*)addr),  len, "%px: orig: ", addr);
                        DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
                        text_poke_early(addr, bytes, len);
                }
        }
}

#ifdef CONFIG_MITIGATION_RETHUNK

/*
 * Rewrite the compiler generated return thunk tail-calls.
 *
 * With X86_FEATURE_RETHUNK the site becomes a JMP to the currently selected
 * x86_return_thunk. Without it, the thunk is not needed and
 *
 *   JMP __x86_return_thunk
 *
 * is turned into:
 *
 *   RET
 *
 * In both cases the remaining bytes of the original instruction at @addr,
 * described by @insn, are filled with INT3.
 *
 * Returns the number of bytes written to @bytes.
 */
static int patch_return(void *addr, struct insn *insn, u8 *bytes)
{
        int used;

        if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
                /* Patch the custom return thunks... */
                __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk,
                                JMP32_INSN_SIZE);
                used = JMP32_INSN_SIZE;
        } else {
                /* ... or patch them out if not needed. */
                bytes[0] = RET_INSN_OPCODE;
                used = 1;
        }

        while (used < insn->length)
                bytes[used++] = INT3_INSN_OPCODE;

        return used;
}

/*
 * Patch all return thunk sites listed in [@start, @end).
 *
 * Every s32 entry holds the offset from the entry itself to the instruction
 * to patch. A site is skipped when it cannot be decoded, when
 * __static_call_fixup() reports having dealt with it, or - with a one-time
 * warning - when it is not a JMP.d32 to __x86_return_thunk. All other sites
 * are rewritten through patch_return(), provided the replacement has exactly
 * the length of the original instruction.
 *
 * With X86_FEATURE_RETHUNK enabled, static_call_force_reinit() is called
 * before the walk.
 */
void __init_or_module noinline apply_returns(s32 *start, s32 *end)
{
        s32 *s;

        if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
                static_call_force_reinit();

        for (s = start; s < end; s++) {
                void *dest = NULL, *addr = (void *)s + *s;
                struct insn insn;
                int len, ret;
                u8 bytes[16];
                u8 op;

                ret = insn_decode_kernel(&insn, addr);
                if (WARN_ON_ONCE(ret < 0))
                        continue;

                /* dest stays NULL for anything but a JMP.d32. */
                op = insn.opcode.bytes[0];
                if (op == JMP32_INSN_OPCODE)
                        dest = addr + insn.length + insn.immediate.value;

                if (__static_call_fixup(addr, op, dest) ||
                    WARN_ONCE(dest != &__x86_return_thunk,
                              "missing return thunk: %pS-%pS: %*ph",
                              addr, dest, 5, addr))
                        continue;

                DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
                        addr, addr, insn.length,
                        addr + insn.length + insn.immediate.value);

                len = patch_return(addr, &insn, bytes);
                if (len == insn.length) {
                        DUMP_BYTES(RET, ((u8*)addr),  len, "%px: orig: ", addr);
                        DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
                        text_poke_early(addr, bytes, len);
                }
        }
}
#else
void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
#endif /* CONFIG_MITIGATION_RETHUNK */

#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */

void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }

#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */

#ifdef CONFIG_X86_KERNEL_IBT

static void poison_cfi(void *addr);

/*
 * Overwrite the 4-byte ENDBR instruction at @addr with the value returned by
 * gen_endbr_poison().
 *
 * Nothing is written when @addr cannot be read (one-time warning) or when
 * it does not hold an ENDBR; in the latter case a one-time warning is raised
 * only if @warn is true.
 */
static void __init_or_module poison_endbr(void *addr, bool warn)
{
        u32 endbr, poison = gen_endbr_poison();

        if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
                return;

        if (!is_endbr(endbr)) {
                WARN_ON_ONCE(warn);
                return;
        }

        DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);

        /*
         * When we have IBT, the lack of ENDBR will trigger #CP
         */
        DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
        DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
        text_poke_early(addr, &poison, 4);
}

/*
 * Generated by: objtool --ibt
 *
 * Seal the functions listed in [@start, @end) against indirect calls by
 * clobbering their ENDBR instructions and, with CONFIG_FINEIBT, the kCFI hash
 * value as well.
 */
void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
{
        for (s32 *site = start; site < end; site++) {
                /* Each entry is the offset from itself to the ENDBR. */
                void *func = (void *)site + *site;

                poison_endbr(func, true);

                if (!IS_ENABLED(CONFIG_FINEIBT))
                        continue;

                /* poison_cfi() is given the address 16 bytes ahead of the ENDBR. */
                poison_cfi(func - 16);
        }
}

#else

void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }

#endif /* CONFIG_X86_KERNEL_IBT */

#ifdef CONFIG_FINEIBT
#define __CFI_DEFAULT        CFI_DEFAULT
#elif defined(CONFIG_CFI_CLANG)
#define __CFI_DEFAULT        CFI_KCFI
#else
#define __CFI_DEFAULT        CFI_OFF
#endif

enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;

#ifdef CONFIG_CFI_CLANG
struct bpf_insn;

/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
extern unsigned int __bpf_prog_runX(const void *ctx,
                                    const struct bpf_insn *insn);

/*
 * Force a reference to the external symbol so the compiler generates
 * __kcfi_typeid.
 */
__ADDRESSABLE(__bpf_prog_runX);

/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
asm (
"        .pushsection        .data..ro_after_init,\"aw\",@progbits        \n"
"        .type        cfi_bpf_hash,@object                                \n"
"        .globl        cfi_bpf_hash                                        \n"
"        .p2align        2, 0x0                                        \n"
"cfi_bpf_hash:                                                        \n"
"        .long        __kcfi_typeid___bpf_prog_runX                        \n"
"        .size        cfi_bpf_hash, 4                                        \n"
"        .popsection                                                \n"
);

/* Must match bpf_callback_t */
extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);

__ADDRESSABLE(__bpf_callback_fn);

/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
asm (
"        .pushsection        .data..ro_after_init,\"aw\",@progbits        \n"
"        .type        cfi_bpf_subprog_hash,@object                        \n"
"        .globl        cfi_bpf_subprog_hash                                \n"
"        .p2align        2, 0x0                                        \n"
"cfi_bpf_subprog_hash:                                                \n"
"        .long        __kcfi_typeid___bpf_callback_fn                        \n"
"        .size        cfi_bpf_subprog_hash, 4                                \n"
"        .popsection                                                \n"
);

/*
 * Fetch the 32-bit CFI hash stored ahead of @func.
 *
 * The hash is read from the area starting cfi_get_offset() bytes before
 * @func: 7 bytes into it in FineIBT mode, 1 byte into it in kCFI mode.
 *
 * Returns the hash, or 0 when cfi_mode is neither of the two or when the
 * location cannot be read.
 */
u32 cfi_get_func_hash(void *func)
{
        void *hash_addr = func - cfi_get_offset();
        u32 hash;

        if (cfi_mode == CFI_FINEIBT)
                hash_addr += 7;
        else if (cfi_mode == CFI_KCFI)
                hash_addr += 1;
        else
                return 0;

        /* The address may not be mapped, hence the fault-tolerant read. */
        return get_kernel_nofault(hash, hash_addr) ? 0 : hash;
}
#endif

#ifdef CONFIG_FINEIBT

static bool cfi_rand __ro_after_init = true;
static u32  cfi_seed __ro_after_init;

/*
 * Mix the boot-time cfi_seed into the CFI @hash and return the result.
 *
 * Neither the result nor its negation may be a valid ENDBR instruction. As
 * long as one of them is, the value is stepped on: shifted right by one bit
 * and, if the bit shifted out was set, XORed with 0x80200003.
 */
static u32 cfi_rehash(u32 hash)
{
        for (hash ^= cfi_seed; unlikely(is_endbr(hash) || is_endbr(-hash)); ) {
                u32 feedback = (hash & 1) ? 0x80200003 : 0;

                hash = (hash >> 1) ^ feedback;
        }

        return hash;
}

/*
 * Parse the "cfi=" early parameter, a comma separated list of options:
 *
 *   auto    - select CFI_DEFAULT (resolved later in __apply_fineibt())
 *   off     - select CFI_OFF and turn hash randomization off
 *   kcfi    - select CFI_KCFI
 *   fineibt - select CFI_FINEIBT
 *   norand  - turn hash randomization off
 *
 * Options are applied left to right, so a later mode overrides an earlier
 * one.  Unknown options are reported and skipped.  @str is modified in place:
 * every ',' separator is overwritten with a NUL.
 *
 * Returns 0, or -EINVAL when the parameter was given without a value.
 */
static __init int cfi_parse_cmdline(char *str)
{
        if (!str)
                return -EINVAL;

        while (str) {
                /* Terminate the current option and remember where the next one starts. */
                char *next = strchr(str, ',');
                if (next) {
                        *next = 0;
                        next++;
                }

                if (!strcmp(str, "auto")) {
                        cfi_mode = CFI_DEFAULT;
                } else if (!strcmp(str, "off")) {
                        cfi_mode = CFI_OFF;
                        cfi_rand = false;
                } else if (!strcmp(str, "kcfi")) {
                        cfi_mode = CFI_KCFI;
                } else if (!strcmp(str, "fineibt")) {
                        cfi_mode = CFI_FINEIBT;
                } else if (!strcmp(str, "norand")) {
                        cfi_rand = false;
                } else {
                        /* Terminate the record so it is not left open for continuation. */
                        pr_err("Ignoring unknown cfi option (%s).\n", str);
                }

                str = next;
        }

        return 0;
}
early_param("cfi", cfi_parse_cmdline);

/*
 * kCFI                                                FineIBT
 *
 * __cfi_\func:                                        __cfi_\func:
 *        movl   $0x12345678,%eax                // 5             endbr64                        // 4
 *        nop                                             subl   $0x12345678,%r10d   // 7
 *        nop                                             jz     1f                        // 2
 *        nop                                             ud2                        // 2
 *        nop                                        1:   nop                        // 1
 *        nop
 *        nop
 *        nop
 *        nop
 *        nop
 *        nop
 *        nop
 *
 *
 * caller:                                        caller:
 *        movl        $(-0x12345678),%r10d         // 6             movl   $0x12345678,%r10d        // 6
 *        addl        $-15(%r11),%r10d         // 4             sub    $16,%r11                // 4
 *        je        1f                         // 2             nop4                        // 4
 *        ud2                                 // 2
 * 1:        call        __x86_indirect_thunk_r11 // 5             call   *%r11; nop2;        // 5
 *
 */

/*
 * Template of the FineIBT preamble, kept in .rodata.  cfi_rewrite_preamble()
 * copies it over a __cfi_ stub and then replaces the 0x12345678 placeholder
 * with the real hash.
 */
asm(        ".pushsection .rodata                        \n"
        "fineibt_preamble_start:                \n"
        "        endbr64                                \n"
        "        subl        $0x12345678, %r10d        \n"
        "        je        fineibt_preamble_end        \n"
        "        ud2                                \n"
        "        nop                                \n"
        "fineibt_preamble_end:                        \n"
        ".popsection\n"
);

extern u8 fineibt_preamble_start[];
extern u8 fineibt_preamble_end[];

/* Length of the template in bytes; __apply_fineibt() insists on 16. */
#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
/* Byte offset of the hash immediate inside the template (checked with WARN_ON when patching). */
#define fineibt_preamble_hash 7

/*
 * Template of the FineIBT caller sequence, kept in .rodata.
 * cfi_rewrite_callers() copies it over the kCFI caller sequence and then
 * replaces the 0x12345678 placeholder with the real hash.
 */
asm(        ".pushsection .rodata                        \n"
        "fineibt_caller_start:                        \n"
        "        movl        $0x12345678, %r10d        \n"
        "        sub        $16, %r11                \n"
        ASM_NOP4
        "fineibt_caller_end:                        \n"
        ".popsection                                \n"
);

extern u8 fineibt_caller_start[];
extern u8 fineibt_caller_end[];

/* Length of the caller template in bytes. */
#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
/* Byte offset of the hash immediate inside the caller sequence. */
#define fineibt_caller_hash 2

/* Displacement of the 2-byte JMP.d8 that cfi_disable_callers() puts at the start of the sequence. */
#define fineibt_caller_jmp (fineibt_caller_size - 2)

/*
 * Extract the hash from a kCFI preamble at @addr.
 *
 * The preamble is expected to start with
 *
 *   b8 78 56 34 12          mov    $0x12345678,%eax
 *
 * Returns the 32-bit immediate, or 0 (an invalid hash value) when the first
 * byte is not that MOV opcode.
 */
static u32 decode_preamble_hash(void *addr)
{
        const u8 *insn = addr;

        if (insn[0] != 0xb8)
                return 0;

        return *(u32 *)(addr + 1);
}

static u32 decode_caller_hash(void *addr)
{
        u8 *p = addr;

        /* 41 ba 78 56 34 12       mov    $0x12345678,%r10d */
        if (p[0] == 0x41 && p[1] == 0xba)
                return -*(u32 *)(addr + 2);

        /* e8 0c 78 56 34 12           jmp.d8  +12 */
        if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
                return -*(u32 *)(addr + 2);

        return 0; /* invalid hash value */
}

/* .retpoline_sites */
/*
 * Turn off the kCFI check at every call site listed in [start, end).
 *
 * Each entry is a self-relative offset; the caller sequence begins
 * fineibt_caller_size bytes before the address it designates.  The first two
 * bytes of the sequence are replaced with a JMP.d8, which leaves the hash
 * immediate intact for later use, see decode_caller_hash() and
 * cfi_rewrite_callers().  Sites without a decodable hash (nocfi callers) are
 * left untouched.
 *
 * Always returns 0.
 */
static int cfi_disable_callers(s32 *start, s32 *end)
{
        const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
        s32 *site;

        for (site = start; site < end; site++) {
                void *caller = (void *)site + *site - fineibt_caller_size;

                if (decode_caller_hash(caller))
                        text_poke_early(caller, jmp, 2);
        }

        return 0;
}

/*
 * Re-enable kCFI at every call site listed in [start, end): the inverse of
 * cfi_disable_callers().
 *
 * The first two bytes of each caller sequence are rewritten to 41 ba, the
 * start of the MOV-to-%r10d encoding that decode_caller_hash() recognises.
 * Sites without a decodable hash (nocfi callers) are left untouched.
 *
 * Always returns 0.
 */
static int cfi_enable_callers(s32 *start, s32 *end)
{
        const u8 mov[] = { 0x41, 0xba };
        s32 *site;

        for (site = start; site < end; site++) {
                void *caller = (void *)site + *site - fineibt_caller_size;

                if (decode_caller_hash(caller))
                        text_poke_early(caller, mov, 2);
        }

        return 0;
}

/* .cfi_sites */
/*
 * Re-hash the hash immediate of every preamble listed in [start, end).
 *
 * Each entry is a self-relative offset to a preamble.  The decoded hash is
 * run through cfi_rehash() and written back over the 4 bytes following the
 * opcode byte.
 *
 * Returns 0, or -EINVAL (after a WARN) on the first preamble that has no
 * decodable hash; preambles before it have already been modified.
 */
static int cfi_rand_preamble(s32 *start, s32 *end)
{
        s32 *site = start;

        while (site < end) {
                void *stub = (void *)site + *site;
                u32 hash = decode_preamble_hash(stub);

                if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
                         stub, stub, 5, stub))
                        return -EINVAL;

                hash = cfi_rehash(hash);
                text_poke_early(stub + 1, &hash, 4);
                site++;
        }

        return 0;
}

/*
 * Replace every preamble listed in [start, end) with the FineIBT preamble.
 *
 * The hash is decoded first, since copying the template destroys the old
 * encoding.  After the copy the template placeholder is verified and then
 * overwritten with the decoded hash.
 *
 * Returns 0, or -EINVAL (after a WARN) on the first preamble that has no
 * decodable hash; preambles before it have already been rewritten.
 */
static int cfi_rewrite_preamble(s32 *start, s32 *end)
{
        s32 *site;

        for (site = start; site < end; site++) {
                void *stub = (void *)site + *site;
                void *imm = stub + fineibt_preamble_hash;
                u32 hash = decode_preamble_hash(stub);

                if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
                         stub, stub, 5, stub))
                        return -EINVAL;

                text_poke_early(stub, fineibt_preamble_start, fineibt_preamble_size);
                WARN_ON(*(u32 *)imm != 0x12345678);
                text_poke_early(imm, &hash, 4);
        }

        return 0;
}

/*
 * For every preamble listed in [start, end), call poison_endbr() on the
 * address 16 bytes past the preamble start.
 */
static void cfi_rewrite_endbr(s32 *start, s32 *end)
{
        s32 *site = start;

        while (site < end) {
                void *stub = (void *)site + *site;

                poison_endbr(stub + 16, false);
                site++;
        }
}

/* .retpoline_sites */
/*
 * Re-hash the hash immediate of every caller sequence listed in [start, end).
 *
 * decode_caller_hash() hands back the negated immediate; that value is run
 * through cfi_rehash(), negated again and stored at offset 2 of the
 * sequence.  Sites without a decodable hash are skipped.
 *
 * Always returns 0.
 */
static int cfi_rand_callers(s32 *start, s32 *end)
{
        s32 *site;

        for (site = start; site < end; site++) {
                void *caller = (void *)site + *site - fineibt_caller_size;
                u32 hash = decode_caller_hash(caller);

                if (!hash)
                        continue;

                hash = -cfi_rehash(hash);
                text_poke_early(caller + 2, &hash, 4);
        }

        return 0;
}

/*
 * Replace every caller sequence listed in [start, end) with the FineIBT
 * caller sequence.
 *
 * The hash is decoded before the template is copied in; afterwards the
 * template placeholder is verified and overwritten with that hash.  Sites
 * without a decodable hash are skipped.  The indirect call itself is not
 * touched here: rely on apply_retpolines() for that.
 *
 * Always returns 0.
 */
static int cfi_rewrite_callers(s32 *start, s32 *end)
{
        s32 *site;

        for (site = start; site < end; site++) {
                void *caller = (void *)site + *site - fineibt_caller_size;
                void *imm = caller + fineibt_caller_hash;
                u32 hash = decode_caller_hash(caller);

                if (!hash)
                        continue;

                text_poke_early(caller, fineibt_caller_start, fineibt_caller_size);
                WARN_ON(*(u32 *)imm != 0x12345678);
                text_poke_early(imm, &hash, 4);
        }

        return 0;
}

/*
 * Select and install the CFI scheme for one set of call sites and preambles.
 *
 * @start_retpoline, @end_retpoline: self-relative offsets of the indirect
 *                                   call sites (.retpoline_sites)
 * @start_cfi, @end_cfi:             self-relative offsets of the __cfi_
 *                                   preambles (.cfi_sites)
 * @builtin: true when called for the core kernel; only then is the seed
 *           drawn, are the BPF hashes re-hashed and is the chosen mode
 *           printed.
 *
 * Steps, in order: resolve CFI_DEFAULT to a concrete mode, disable all
 * callers, optionally re-hash preambles and callers, then finish according
 * to cfi_mode.  Every successful path returns from inside the switch; the
 * error message at the end is reached on a failed step or when cfi_mode is
 * none of the handled values.
 */
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
                            s32 *start_cfi, s32 *end_cfi, bool builtin)
{
        int ret;

        if (WARN_ONCE(fineibt_preamble_size != 16,
                      "FineIBT preamble wrong size: %ld", fineibt_preamble_size))
                return;

        /* "auto": kCFI unless kernel IBT is built in and enabled on this CPU. */
        if (cfi_mode == CFI_DEFAULT) {
                cfi_mode = CFI_KCFI;
                if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
                        cfi_mode = CFI_FINEIBT;
        }

        /*
         * Rewrite the callers to not use the __cfi_ stubs, such that we might
         * rewrite them. This disables all CFI. If this succeeds but any of the
         * later stages fails, we're without CFI.
         */
        ret = cfi_disable_callers(start_retpoline, end_retpoline);
        if (ret)
                goto err;

        if (cfi_rand) {
                /* The seed is drawn once, for the core kernel; later calls reuse it. */
                if (builtin) {
                        cfi_seed = get_random_u32();
                        cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
                        cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
                }

                ret = cfi_rand_preamble(start_cfi, end_cfi);
                if (ret)
                        goto err;

                ret = cfi_rand_callers(start_retpoline, end_retpoline);
                if (ret)
                        goto err;
        }

        switch (cfi_mode) {
        case CFI_OFF:
                /* Callers stay disabled from the step above. */
                if (builtin)
                        pr_info("Disabling CFI\n");
                return;

        case CFI_KCFI:
                ret = cfi_enable_callers(start_retpoline, end_retpoline);
                if (ret)
                        goto err;

                if (builtin)
                        pr_info("Using kCFI\n");
                return;

        case CFI_FINEIBT:
                /* place the FineIBT preamble at func()-16 */
                ret = cfi_rewrite_preamble(start_cfi, end_cfi);
                if (ret)
                        goto err;

                /* rewrite the callers to target func()-16 */
                ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
                if (ret)
                        goto err;

                /* now that nobody targets func()+0, remove ENDBR there */
                cfi_rewrite_endbr(start_cfi, end_cfi);

                if (builtin)
                        pr_info("Using FineIBT CFI\n");
                return;

        default:
                break;
        }

err:
        pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
}

static inline void poison_hash(void *addr)
{
        *(u32 *)addr = 0;
}

/*
 * Poison the CFI preamble at @addr according to the active cfi_mode.
 * Any mode other than FineIBT or kCFI leaves the preamble untouched.
 */
static void poison_cfi(void *addr)
{
        if (cfi_mode == CFI_FINEIBT) {
                /*
                 * ENDBR is poisoned and the hash zeroed, leaving:
                 *
                 * __cfi_\func:
                 *        osp nopl (%rax)
                 *        subl        $0, %r10d
                 *        jz        1f
                 *        ud2
                 * 1:        nop
                 */
                poison_endbr(addr, false);
                poison_hash(addr + fineibt_preamble_hash);
        } else if (cfi_mode == CFI_KCFI) {
                /*
                 * Only the hash is zeroed, leaving:
                 *
                 * __cfi_\func:
                 *        movl        $0, %eax
                 *        .skip        11, 0x90
                 */
                poison_hash(addr + 1);
        }
}

#else

/* Empty variant used when CONFIG_FINEIBT is not set: nothing is patched. */
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
                            s32 *start_cfi, s32 *end_cfi, bool builtin)
{
}

#ifdef CONFIG_X86_KERNEL_IBT
/* Empty variant used when CONFIG_FINEIBT is not set. */
static void poison_cfi(void *addr) { }
#endif

#endif

/*
 * Non-static entry point: same as __apply_fineibt() with builtin == false,
 * so no seed is drawn and no mode message is printed.
 */
void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
                   s32 *start_cfi, s32 *end_cfi)
{
        __apply_fineibt(start_retpoline, end_retpoline, start_cfi, end_cfi,
                        /* .builtin = */ false);
}

#ifdef CONFIG_SMP
/*
 * Switch the recorded prefix bytes to their SMP form.
 *
 * [start, end) holds self-relative offsets.  An entry is skipped when its
 * offset is 0 or when the address it designates lies outside
 * [text, text_end).  A DS segment override prefix (0x3e) found at the
 * address is replaced with a LOCK prefix (0xf0).
 */
static void alternatives_smp_lock(const s32 *start, const s32 *end,
                                  u8 *text, u8 *text_end)
{
        const u8 lock_prefix[] = { 0xf0 };
        const s32 *poff;

        for (poff = start; poff < end; poff++) {
                u8 *insn = (u8 *)poff + *poff;

                if (*poff && insn >= text && insn < text_end && *insn == 0x3e)
                        text_poke(insn, lock_prefix, 1);
        }
}

/*
 * Switch the recorded prefix bytes to their UP form.
 *
 * [start, end) holds self-relative offsets.  An entry is skipped when its
 * offset is 0 or when the address it designates lies outside
 * [text, text_end).  A LOCK prefix (0xf0) found at the address is replaced
 * with a DS segment override prefix (0x3e).
 */
static void alternatives_smp_unlock(const s32 *start, const s32 *end,
                                    u8 *text, u8 *text_end)
{
        const u8 ds_prefix[] = { 0x3e };
        const s32 *poff;

        for (poff = start; poff < end; poff++) {
                u8 *insn = (u8 *)poff + *poff;

                if (*poff && insn >= text && insn < text_end && *insn == 0xf0)
                        text_poke(insn, ds_prefix, 1);
        }
}

/*
 * One registered set of lock-prefix locations, linked on smp_alt_modules so
 * that alternatives_enable_smp() can restore the LOCK prefixes later.
 */
struct smp_alt_module {
        /* owner (NULL is passed for the core kernel) and a name used in debug output */
        struct module        *mod;
        char                *name;

        /* ptrs to lock prefixes */
        const s32        *locks;
        const s32        *locks_end;

        /* .text segment, needed to avoid patching init code ;) */
        u8                *text;
        u8                *text_end;

        /* linkage on smp_alt_modules */
        struct list_head next;
};
/* All registered struct smp_alt_module entries. */
static LIST_HEAD(smp_alt_modules);
/* Set once the lock prefixes were patched to UP form; cleared by alternatives_enable_smp(). */
static bool uniproc_patched = false;        /* protected by text_mutex */

/*
 * Patch the lock prefixes of a module (or the core kernel) to UP form and
 * remember them so they can be restored later.
 *
 * @mod:              owning module, stored in the list entry
 * @name:             name stored in the entry and printed in debug output
 * @locks, @locks_end: self-relative offsets of the lock prefixes
 * @text, @text_end:   text range the prefixes must lie in to be patched
 *
 * Nothing happens unless uniproc_patched is set.  With a single possible CPU
 * the prefixes are patched without recording an entry.  If the entry cannot
 * be allocated the prefixes are left in SMP form.
 */
void __init_or_module alternatives_smp_module_add(struct module *mod,
                                                  char *name,
                                                  void *locks, void *locks_end,
                                                  void *text,  void *text_end)
{
        struct smp_alt_module *smp;

        mutex_lock(&text_mutex);
        if (!uniproc_patched)
                goto unlock;

        if (num_possible_cpus() == 1)
                /* Don't bother remembering, we'll never have to undo it. */
                goto smp_unlock;

        smp = kzalloc(sizeof(*smp), GFP_KERNEL);
        if (NULL == smp)
                /* we'll run the (safe but slow) SMP code then ... */
                goto unlock;

        smp->mod        = mod;
        smp->name        = name;
        smp->locks        = locks;
        smp->locks_end        = locks_end;
        smp->text        = text;
        smp->text_end        = text_end;
        DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n",
                smp->locks, smp->locks_end,
                smp->text, smp->text_end, smp->name);

        list_add_tail(&smp->next, &smp_alt_modules);
        /* Falls through: a recorded entry is patched to UP form as well. */
smp_unlock:
        alternatives_smp_unlock(locks, locks_end, text, text_end);
unlock:
        mutex_unlock(&text_mutex);
}

/*
 * Drop the first smp_alt_modules entry that belongs to @mod, if there is
 * one, and free it.  Takes text_mutex for the duration of the walk.
 */
void __init_or_module alternatives_smp_module_del(struct module *mod)
{
        struct smp_alt_module *entry;

        mutex_lock(&text_mutex);
        list_for_each_entry(entry, &smp_alt_modules, next) {
                if (entry->mod == mod) {
                        list_del(&entry->next);
                        kfree(entry);
                        /* The iterator is gone; the walk must stop here. */
                        break;
                }
        }
        mutex_unlock(&text_mutex);
}

void alternatives_enable_smp(void)
{
        struct smp_alt_module *mod;

        /* Why bother if there are no other CPUs? */
        BUG_ON(num_possible_cpus() == 1);

        mutex_lock(&text_mutex);

        if (uniproc_patched) {
                pr_info("switching to SMP code\n");
                BUG_ON(num_online_cpus() != 1);
                clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
                clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
                list_for_each_entry(mod, &smp_alt_modules, next)
                        alternatives_smp_lock(mod->locks, mod->locks_end,
                                              mod->text, mod->text_end);
                uniproc_patched = false;
        }
        mutex_unlock(&text_mutex);
}

/*
 * Tell whether [start, end) contains a location reserved for
 * SMP-alternatives, i.e. a recorded lock prefix of a registered entry.
 *
 * Returns 1 if so, 0 otherwise.  Must hold text_mutex.
 */
int alternatives_text_reserved(void *start, void *end)
{
        u8 *range_start = start;
        u8 *range_end = end;
        struct smp_alt_module *mod;

        lockdep_assert_held(&text_mutex);

        list_for_each_entry(mod, &smp_alt_modules, next) {
                const s32 *poff;

                /* Entries whose text range cannot overlap are not scanned. */
                if (mod->text > range_end || mod->text_end < range_start)
                        continue;

                for (poff = mod->locks; poff < mod->locks_end; poff++) {
                        const u8 *lock = (const u8 *)poff + *poff;

                        if (lock >= range_start && lock < range_end)
                                return 1;
                }
        }

        return 0;
}
#endif /* CONFIG_SMP */

/*
 * Self-test for the INT3 based CALL emulation code.
 *
 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
 * properly and that there is a stack gap between the INT3 frame and the
 * previous context. Without this gap doing a virtual PUSH on the interrupted
 * stack would corrupt the INT3 IRET frame.
 *
 * See entry_{32,64}.S for more details.
 */

/*
 * We define the int3_magic() function in assembly to control the calling
 * convention such that we can 'call' it from assembly.
 *
 * It stores 1 through the pointer in the first argument register and
 * returns.  The code is emitted into .init.text.
 */

extern void int3_magic(unsigned int *ptr); /* defined in asm */

asm (
"        .pushsection        .init.text, \"ax\", @progbits\n"
"        .type                int3_magic, @function\n"
"int3_magic:\n"
        ANNOTATE_NOENDBR
"        movl        $1, (%" _ASM_ARG1 ")\n"
        ASM_RET
"        .size                int3_magic, .-int3_magic\n"
"        .popsection\n"
);

extern void int3_selftest_ip(void); /* defined in asm below */

/*
 * Die notifier used by int3_selftest().
 *
 * Only a kernel-mode DIE_INT3 whose INT3 instruction sits at
 * int3_selftest_ip is handled: it is turned into an emulated CALL of
 * int3_magic() and NOTIFY_STOP is returned.  Everything else is passed on
 * with NOTIFY_DONE.
 */
static int __init
int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
        struct die_args *die = data;
        struct pt_regs *regs = die->regs;
        unsigned long selftest_ip = (unsigned long)&int3_selftest_ip;

        OPTIMIZER_HIDE_VAR(selftest_ip);

        if (!regs || user_mode(regs) || val != DIE_INT3)
                return NOTIFY_DONE;

        /* regs->ip points past the INT3; step back to the instruction itself. */
        if (regs->ip - INT3_INSN_SIZE != selftest_ip)
                return NOTIFY_DONE;

        int3_emulate_call(regs, (unsigned long)&int3_magic);
        return NOTIFY_STOP;
}

/*
 * Run the INT3 CALL-emulation self-test: register the notifier, execute an
 * INT3 at int3_selftest_ip with &val in the first argument register, and BUG
 * unless the emulated call to int3_magic() set val to 1.
 *
 * Must be noinline to ensure uniqueness of int3_selftest_ip.
 */
static noinline void __init int3_selftest(void)
{
        static __initdata struct notifier_block int3_exception_nb = {
                .notifier_call        = int3_exception_notify,
                .priority        = INT_MAX-1, /* last */
        };
        unsigned int val = 0;

        BUG_ON(register_die_notifier(&int3_exception_nb));

        /*
         * Basically: int3_magic(&val); but really complicated :-)
         *
         * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
         * notifier above will emulate CALL for us.
         */
        asm volatile ("int3_selftest_ip:\n\t"
                      ANNOTATE_NOENDBR
                      "    int3; nop; nop; nop; nop\n\t"
                      : ASM_CALL_CONSTRAINT
                      : __ASM_SEL_RAW(a, D) (&val)
                      : "memory");

        /* int3_magic() stores 1 through its argument. */
        BUG_ON(val != 1);

        unregister_die_notifier(&int3_exception_nb);
}

/* Only the address of this object is used, as the expected argument below. */
static __initdata int __alt_reloc_selftest_addr;

/*
 * Target of the CALL placed by alt_reloc_selftest(); warns unless @arg is
 * the address of __alt_reloc_selftest_addr.
 */
extern void __init __alt_reloc_selftest(void *arg);
__visible noinline void __init __alt_reloc_selftest(void *arg)
{
        WARN_ON(arg != &__alt_reloc_selftest_addr);
}

/*
 * Self-test for relocation handling in alternatives: an always-applied
 * alternative loads the address of __alt_reloc_selftest_addr into the first
 * argument register and calls __alt_reloc_selftest(), which checks it.
 */
static noinline void __init alt_reloc_selftest(void)
{
        /*
         * Tests apply_relocation().
         *
         * This has a relative immediate (CALL) in a place other than the first
         * instruction and additionally on x86_64 we get a RIP-relative LEA:
         *
         *   lea    0x0(%rip),%rdi  # 5d0: R_X86_64_PC32    .init.data+0x5566c
         *   call   +0              # 5d5: R_X86_64_PLT32   __alt_reloc_selftest-0x4
         *
         * Getting this wrong will either crash and burn or tickle the WARN
         * above.
         */
        asm_inline volatile (
                ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
                : /* output */
                : [mem] "m" (__alt_reloc_selftest_addr)
                : _ASM_ARG1
        );
}

/*
 * Boot-time driver for all code patching of the core kernel.
 *
 * Runs the INT3 self-test, then, with NMIs stopped, applies in this order:
 * paravirt capability setup, CFI selection, retpolines and returns,
 * alternatives, call thunks and ENDBR sealing, followed by the optional
 * switch to UP lock prefixes.  Finally NMIs are restarted,
 * alternatives_patched is set and the relocation self-test is run.
 */
void __init alternative_instructions(void)
{
        int3_selftest();

        /*
         * The patching is not fully atomic, so try to avoid local
         * interruptions that might execute the to be patched code.
         * Other CPUs are not running.
         */
        stop_nmi();

        /*
         * Don't stop machine check exceptions while patching.
         * MCEs only happen when something got corrupted and in this
         * case we must do something about the corruption.
         * Ignoring it is worse than an unlikely patching race.
         * Also machine checks tend to be broadcast and if one CPU
         * goes into machine check the others follow quickly, so we don't
         * expect a machine check to cause undue problems during code
         * patching.
         */

        /*
         * Make sure to set (artificial) features depending on used paravirt
         * functions which can later influence alternative patching.
         */
        paravirt_set_cap();

        __apply_fineibt(__retpoline_sites, __retpoline_sites_end,
                        __cfi_sites, __cfi_sites_end, true);

        /*
         * Rewrite the retpolines, must be done before alternatives since
         * those can rewrite the retpoline thunks.
         */
        apply_retpolines(__retpoline_sites, __retpoline_sites_end);
        apply_returns(__return_sites, __return_sites_end);

        apply_alternatives(__alt_instructions, __alt_instructions_end);

        /*
         * Now all calls are established. Apply the call thunks if
         * required.
         */
        callthunks_patch_builtin_calls();

        /*
         * Seal all functions that do not have their address taken.
         */
        apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);

#ifdef CONFIG_SMP
        /* Patch to UP if other cpus not imminent. */
        if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
                uniproc_patched = true;
                alternatives_smp_module_add(NULL, "core kernel",
                                            __smp_locks, __smp_locks_end,
                                            _text, _etext);
        }

        /*
         * The lock table is only needed to switch back to SMP later; free it
         * when no UP patching was done or only one CPU is possible.
         */
        if (!uniproc_patched || num_possible_cpus() == 1) {
                free_init_pages("SMP alternatives",
                                (unsigned long)__smp_locks,
                                (unsigned long)__smp_locks_end);
        }
#endif

        restart_nmi();
        alternatives_patched = 1;

        alt_reloc_selftest();
}

/**
 * text_poke_early - Update instructions on a live kernel at boot time
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread must be currently preempted in the middle of these
 * instructions. And on the local CPU you need to be protected against NMI or
 * MCE handlers seeing an inconsistent instruction while you patch.
 */
void __init_or_module text_poke_early(void *addr, const void *opcode,
                                      size_t len)
{
        unsigned long flags;

        /*
         * Modules text is marked initially as non-executable, so the
         * code cannot be running and speculative code-fetches are
         * prevented. Just change the code.
         */
        if (boot_cpu_has(X86_FEATURE_NX) &&
            is_module_text_address((unsigned long)addr)) {
                memcpy(addr, opcode, len);
                return;
        }

        /* Everything else: copy with interrupts off, then serialize this CPU. */
        local_irq_save(flags);
        memcpy(addr, opcode, len);
        sync_core();
        local_irq_restore(flags);

        /*
         * Could also do a CLFLUSH here to speed up CPU recovery; but
         * that causes hangs on some VIA CPUs.
         */
}

/*
 * State saved by use_temporary_mm() and consumed by unuse_temporary_mm():
 * the mm that was loaded on this CPU before the temporary one.
 */
typedef struct {
        struct mm_struct *mm;
} temp_mm_state_t;

/*
 * Using a temporary mm allows to set temporary mappings that are not accessible
 * by other CPUs. Such mappings are needed to perform sensitive memory writes
 * that override the kernel memory protections (e.g., W^X), without exposing the
 * temporary page-table mappings that are required for these write operations to
 * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
 * mapping is torn down.
 *
 * Context: The temporary mm needs to be used exclusively by a single core. To
 *          harden security IRQs must be disabled while the temporary mm is
 *          loaded, thereby preventing interrupt handler bugs from overriding
 *          the kernel memory protection.
 *
 * Return: the previously loaded mm, to be passed to unuse_temporary_mm().
 */
static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
{
        temp_mm_state_t temp_state;

        lockdep_assert_irqs_disabled();

        /*
         * Make sure not to be in TLB lazy mode, as otherwise we'll end up
         * with a stale address space WITHOUT being in lazy mode after
         * restoring the previous mm.
         */
        if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
                leave_mm();

        /* Remember what was loaded before switching to @mm. */
        temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
        switch_mm_irqs_off(NULL, mm, current);

        /*
         * If breakpoints are enabled, disable them while the temporary mm is
         * used. Userspace might set up watchpoints on addresses that are used
         * in the temporary mm, which would lead to wrong signals being sent or
         * crashes.
         *
         * Note that breakpoints are not disabled selectively, which also causes
         * kernel breakpoints (e.g., perf's) to be disabled. This might be
         * undesirable, but still seems reasonable as the code that runs in the
         * temporary mm should be short.
         */
        if (hw_breakpoint_active())
                hw_breakpoint_disable();

        return temp_state;
}

/*
 * Counterpart of use_temporary_mm(): switch back to the mm recorded in
 * @prev_state and restore hardware breakpoints.  IRQs must be disabled.
 */
static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
        lockdep_assert_irqs_disabled();
        switch_mm_irqs_off(NULL, prev_state.mm, current);

        /*
         * Restore the breakpoints if they were disabled before the temporary mm
         * was loaded.
         */
        if (hw_breakpoint_active())
                hw_breakpoint_restore();
}

/* The temporary mm that __text_poke() loads while it writes. */
__ro_after_init struct mm_struct *poking_mm;
/* Address inside poking_mm at which __text_poke() maps the target page(s). */
__ro_after_init unsigned long poking_addr;

/* text_poke_f implementation that copies @len bytes from @src to @dst. */
static void text_poke_memcpy(void *dst, const void *src, size_t len)
{
        (void)memcpy(dst, src, len);
}

/*
 * text_poke_f implementation that fills @len bytes at @dst.
 * @src points to an int holding the fill value, not to a byte buffer.
 */
static void text_poke_memset(void *dst, const void *src, size_t len)
{
        const int *fill = src;

        memset(dst, *fill, len);
}

/* Write primitive handed to __text_poke(); implemented by text_poke_memcpy() and text_poke_memset(). */
typedef void text_poke_f(void *dst, const void *src, size_t len);

/*
 * Write to kernel or module text through a temporary mapping.
 *
 * @func: write primitive, called as func(alias of @addr, @src, @len)
 * @addr: address of the text to modify
 * @src:  argument forwarded to @func
 * @len:  number of bytes to write; the range may touch at most two pages
 *
 * The page(s) backing @addr are mapped at poking_addr inside poking_mm, the
 * write is done through that alias while poking_mm is loaded with IRQs
 * disabled, and the mapping is then cleared and flushed.
 *
 * Returns @addr.  BUGs if called before after_bootmem is set, if a backing
 * page cannot be found, or if a memcpy-style write does not read back.
 */
static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
{
        bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
        struct page *pages[2] = {NULL};
        temp_mm_state_t prev;
        unsigned long flags;
        pte_t pte, *ptep;
        spinlock_t *ptl;
        pgprot_t pgprot;

        /*
         * While boot memory allocator is running we cannot use struct pages as
         * they are not yet initialized. There is no way to recover.
         */
        BUG_ON(!after_bootmem);

        /* Anything that is not core kernel text is looked up as vmalloc memory. */
        if (!core_kernel_text((unsigned long)addr)) {
                pages[0] = vmalloc_to_page(addr);
                if (cross_page_boundary)
                        pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
        } else {
                pages[0] = virt_to_page(addr);
                WARN_ON(!PageReserved(pages[0]));
                if (cross_page_boundary)
                        pages[1] = virt_to_page(addr + PAGE_SIZE);
        }
        /*
         * If something went wrong, crash and burn since recovery paths are not
         * implemented.
         */
        BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));

        /*
         * Map the page without the global bit, as TLB flushing is done with
         * flush_tlb_mm_range(), which is intended for non-global PTEs.
         */
        pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);

        /*
         * The lock is not really needed, but this allows to avoid open-coding.
         */
        ptep = get_locked_pte(poking_mm, poking_addr, &ptl);

        /*
         * This must not fail; preallocated in poking_init().
         */
        VM_BUG_ON(!ptep);

        local_irq_save(flags);

        pte = mk_pte(pages[0], pgprot);
        set_pte_at(poking_mm, poking_addr, ptep, pte);

        if (cross_page_boundary) {
                pte = mk_pte(pages[1], pgprot);
                set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
        }

        /*
         * Loading the temporary mm behaves as a compiler barrier, which
         * guarantees that the PTE will be set at the time memcpy() is done.
         */
        prev = use_temporary_mm(poking_mm);

        /* Write through the alias, at the same offset within the page as @addr. */
        kasan_disable_current();
        func((u8 *)poking_addr + offset_in_page(addr), src, len);
        kasan_enable_current();

        /*
         * Ensure that the PTE is only cleared after the instructions of memcpy
         * were issued by using a compiler barrier.
         */
        barrier();

        pte_clear(poking_mm, poking_addr, ptep);
        if (cross_page_boundary)
                pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);

        /*
         * Loading the previous page-table hierarchy requires a serializing
         * instruction that already allows the core to see the updated version.
         * Xen-PV is assumed to serialize execution in a similar manner.
         */
        unuse_temporary_mm(prev);

        /*
         * Flushing the TLB might involve IPIs, which would require enabled
         * IRQs, but not if the mm is not used, as it is in this point.
         */
        flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
                           (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
                           PAGE_SHIFT, false);

        /* Only a copy can be verified; for the memset variant @src is not the written data. */
        if (func == text_poke_memcpy) {
                /*
                 * If the text does not match what we just wrote then something is
                 * fundamentally screwy; there's nothing we can really do about that.
                 */
                BUG_ON(memcmp(addr, src, len));
        }

        local_irq_restore(flags);
        pte_unmap_unlock(ptep, ptl);
        return addr;
}

/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Note that the caller must ensure that if the modified code is part of a
 * module, the module would not be removed during poking. This can be achieved
 * by registering a module notifier, and ordering module removal and patching
 * through a mutex.
 */
void *text_poke(void *addr, const void *opcode, size_t len)
{
        void *patched;

        /* Callers serialize through text_mutex. */
        lockdep_assert_held(&text_mutex);

        patched = __text_poke(text_poke_memcpy, addr, opcode, len);
        return patched;
}

/**
 * text_poke_kgdb - Update instructions on a live kernel by kgdb
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Context: should only be used by kgdb, which ensures no other core is running,
 *            despite the fact it does not hold the text_mutex.
 */
void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
{
        /*
         * Same copy as text_poke(), but without asserting that text_mutex
         * is held.
         */
        void *patched = __text_poke(text_poke_memcpy, addr, opcode, len);

        return patched;
}

void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
                            bool core_ok)
{
        unsigned long start = (unsigned long)addr;
        size_t patched = 0;

        if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
                return NULL;

        while (patched < len) {
                unsigned long ptr = start + patched;
                size_t s;

                s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);

                __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
                patched += s;
        }
        return addr;
}

/**
 * text_poke_copy - Copy instructions into (an unused part of) RX memory
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy, could be more than 2x PAGE_SIZE
 *
 * Not safe against concurrent execution; useful for JITs to dump
 * new code blocks into unused regions of RX memory. Can be used in
 * conjunction with synchronize_rcu_tasks() to wait for existing
 * execution to quiesce after having made sure no existing functions
 * pointers are live.
 */
void *text_poke_copy(void *addr, const void *opcode, size_t len)
{
        void *ret;

        /* Take text_mutex around the locked variant; core text is refused. */
        mutex_lock(&text_mutex);
        ret = text_poke_copy_locked(addr, opcode, len, false);
        mutex_unlock(&text_mutex);

        return ret;
}

/**
 * text_poke_set - memset into (an unused part of) RX memory
 * @addr: address to modify
 * @c: the byte to fill the area with
 * @len: length to copy, could be more than 2x PAGE_SIZE
 *
 * This is useful to overwrite unused regions of RX memory with illegal
 * instructions.
 */
void *text_poke_set(void *addr, int c, size_t len)
{
        unsigned long start = (unsigned long)addr;
        size_t patched = 0;

        if (WARN_ON_ONCE(core_kernel_text(start)))
                return NULL;

        mutex_lock(&text_mutex);
        while (patched < len) {
                unsigned long ptr = start + patched;
                size_t s;

                s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);

                __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
                patched += s;
        }
        mutex_unlock(&text_mutex);
        return addr;
}

/*
 * Per-CPU callback used by text_poke_sync(): calls sync_core() on the CPU it
 * runs on. @info is unused.
 */
static void do_sync_core(void *info)
{
        sync_core();
}

/*
 * Run do_sync_core() on each CPU via on_each_cpu().
 * NOTE(review): the final argument (1) is presumably the "wait" flag, i.e.
 * return only once every CPU has run the callback -- confirm against
 * on_each_cpu().
 */
void text_poke_sync(void)
{
        on_each_cpu(do_sync_core, NULL, 1);
}

/*
 * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of
 * this thing. When len == 6 everything is prefixed with 0x0f and we map
 * opcode to Jcc.d8, using len to distinguish.
 */
struct text_poke_loc {
        /* addr := _stext + rel_addr */
        s32 rel_addr;
        /* displacement used by poke_int3_handler() to emulate CALL/JMP/Jcc */
        s32 disp;
        /* number of bytes patched at addr; 6 marks the Jcc.d32 case */
        u8 len;
        /* opcode byte poke_int3_handler() switches on when emulating */
        u8 opcode;
        /* new instruction bytes; for len == 6 the leading 0x0f is not stored */
        const u8 text[POKE_MAX_OPCODE_SIZE];
        /* byte found at addr before INT3 was written; see text_poke_bp_batch() */
        u8 old;
};

/*
 * Describes the vector of pokes that text_poke_bp_batch() is currently
 * applying and that poke_int3_handler() searches.
 */
struct bp_patching_desc {
        /* entries being patched; looked up by address in poke_int3_handler() */
        struct text_poke_loc *vec;
        /* number of valid entries in vec */
        int nr_entries;
        /*
         * Set to 1 by text_poke_bp_batch() for the duration of a batch;
         * try_get_desc() only succeeds while it is non-zero.
         */
        atomic_t refs;
};

/* The single descriptor instance shared by the patcher and the INT3 handler. */
static struct bp_patching_desc bp_desc;

static __always_inline
struct bp_patching_desc *try_get_desc(void)
{
        struct bp_patching_desc *desc = &bp_desc;

        /* Take a reference only while the count is still non-zero. */
        return raw_atomic_inc_not_zero(&desc->refs) ? desc : NULL;
}

static __always_inline void put_desc(void)
{
        struct bp_patching_desc *desc = &bp_desc;

        smp_mb__before_atomic();
        raw_atomic_dec(&desc->refs);
}

static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
{
        /* Entries store their address as an offset from _stext. */
        void *addr = _stext + tp->rel_addr;

        return addr;
}

static __always_inline int patch_cmp(const void *key, const void *elt)
{
        struct text_poke_loc *tp = (struct text_poke_loc *) elt;
        const void *target = text_poke_addr(tp);

        /* Three-way comparison of the searched address against the entry. */
        if (key == target)
                return 0;

        return key < target ? -1 : 1;
}

/*
 * poke_int3_handler - emulate the instruction hidden behind a patching INT3
 * @regs: register state at the time of the breakpoint
 *
 * Looks up the trapping address in the vector published through bp_desc and,
 * when an entry matches, emulates the instruction recorded in that entry
 * (RET, CALL, JMP or Jcc) on @regs.
 *
 * Return: 1 if the breakpoint was emulated here; 0 if it came from user mode,
 * no descriptor reference could be taken, no entry matches the address, or
 * the matching entry is an explicitly poked INT3.
 */
noinstr int poke_int3_handler(struct pt_regs *regs)
{
        struct bp_patching_desc *desc;
        struct text_poke_loc *tp;
        int ret = 0;
        void *ip;

        if (user_mode(regs))
                return 0;

        /*
         * Having observed our INT3 instruction, we now must observe
         * bp_desc with non-zero refcount:
         *
         *        bp_desc.refs = 1                INT3
         *        WMB                                RMB
         *        write INT3                        if (bp_desc.refs != 0)
         */
        smp_rmb();

        desc = try_get_desc();
        if (!desc)
                return 0;

        /*
         * Discount the INT3. See text_poke_bp_batch().
         */
        ip = (void *) regs->ip - INT3_INSN_SIZE;

        /*
         * Skip the binary search if there is a single member in the vector.
         */
        if (unlikely(desc->nr_entries > 1)) {
                tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
                                      sizeof(struct text_poke_loc),
                                      patch_cmp);
                if (!tp)
                        goto out_put;
        } else {
                tp = desc->vec;
                if (text_poke_addr(tp) != ip)
                        goto out_put;
        }

        /* From here on ip is the address just past the patched instruction. */
        ip += tp->len;

        switch (tp->opcode) {
        case INT3_INSN_OPCODE:
                /*
                 * Someone poked an explicit INT3, they'll want to handle it,
                 * do not consume.
                 */
                goto out_put;

        case RET_INSN_OPCODE:
                int3_emulate_ret(regs);
                break;

        case CALL_INSN_OPCODE:
                int3_emulate_call(regs, (long)ip + tp->disp);
                break;

        case JMP32_INSN_OPCODE:
        case JMP8_INSN_OPCODE:
                int3_emulate_jmp(regs, (long)ip + tp->disp);
                break;

        case 0x70 ... 0x7f: /* Jcc */
                /* The low nibble of the opcode is passed as the condition code. */
                int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
                break;

        default:
                BUG();
        }

        ret = 1;

out_put:
        /* Drop the reference taken by try_get_desc() on every exit past it. */
        put_desc();
        return ret;
}

/*
 * Queue of pending pokes: filled by text_poke_queue() and handed to
 * text_poke_bp_batch() by text_poke_flush(). It holds as many entries as fit
 * in PAGE_SIZE bytes; tp_vec_nr counts the entries currently queued.
 */
#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
static struct text_poke_loc tp_vec[TP_VEC_MAX];
static int tp_vec_nr;

/**
 * text_poke_bp_batch() -- update instructions on live kernel on SMP
 * @tp:                        vector of instructions to patch
 * @nr_entries:                number of entries in the vector
 *
 * Modify multi-byte instruction by using int3 breakpoint on SMP.
 * We completely avoid stop_machine() here, and achieve the
 * synchronization using int3 breakpoint.
 *
 * The way it is done:
 *        - For each entry in the vector:
 *                - add an int3 trap to the address that will be patched
 *        - sync cores
 *        - For each entry in the vector:
 *                - update all but the first byte of the patched range
 *        - sync cores
 *        - For each entry in the vector:
 *                - replace the first byte (int3) by the first byte of
 *                  replacing opcode
 *        - sync cores
 */
static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
{
        unsigned char int3 = INT3_INSN_OPCODE;
        unsigned int i;
        int do_sync;

        lockdep_assert_held(&text_mutex);

        /* Publish the vector that poke_int3_handler() will search. */
        bp_desc.vec = tp;
        bp_desc.nr_entries = nr_entries;

        /*
         * Corresponds to the implicit memory barrier in try_get_desc() to
         * ensure reading a non-zero refcount provides up to date bp_desc data.
         */
        atomic_set_release(&bp_desc.refs, 1);

        /*
         * Function tracing can enable thousands of places that need to be
         * updated. This can take quite some time, and with full kernel debugging
         * enabled, this could cause the softlockup watchdog to trigger.
         * This function gets called every 256 entries added to be patched.
         * Call cond_resched() here to make sure that other tasks can get scheduled
         * while processing all the functions being patched.
         */
        cond_resched();

        /*
         * Corresponding read barrier in int3 notifier for making sure the
         * nr_entries and handler are correctly ordered wrt. patching.
         */
        smp_wmb();

        /*
         * First step: add an int3 trap to the address that will be patched.
         */
        for (i = 0; i < nr_entries; i++) {
                /* Save the original first byte before it is overwritten. */
                tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
                text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
        }

        text_poke_sync();

        /*
         * Second step: update all but the first byte of the patched range.
         */
        for (do_sync = 0, i = 0; i < nr_entries; i++) {
                /* old[] starts with the saved first byte; the tail is read below. */
                u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
                u8 _new[POKE_MAX_OPCODE_SIZE+1];
                const u8 *new = tp[i].text;
                int len = tp[i].len;

                if (len - INT3_INSN_SIZE > 0) {
                        memcpy(old + INT3_INSN_SIZE,
                               text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
                               len - INT3_INSN_SIZE);

                        /*
                         * len == 6: tp->text lacks the leading 0x0f (see
                         * struct text_poke_loc); rebuild the full encoding.
                         */
                        if (len == 6) {
                                _new[0] = 0x0f;
                                memcpy(_new + 1, new, 5);
                                new = _new;
                        }

                        text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
                                  new + INT3_INSN_SIZE,
                                  len - INT3_INSN_SIZE);

                        do_sync++;
                }

                /*
                 * Emit a perf event to record the text poke, primarily to
                 * support Intel PT decoding which must walk the executable code
                 * to reconstruct the trace. The flow up to here is:
                 *   - write INT3 byte
                 *   - IPI-SYNC
                 *   - write instruction tail
                 * At this point the actual control flow will be through the
                 * INT3 and handler and not hit the old or new instruction.
                 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
                 * can still be decoded. Subsequently:
                 *   - emit RECORD_TEXT_POKE with the new instruction
                 *   - IPI-SYNC
                 *   - write first byte
                 *   - IPI-SYNC
                 * So before the text poke event timestamp, the decoder will see
                 * either the old instruction flow or FUP/TIP of INT3. After the
                 * text poke event timestamp, the decoder will see either the
                 * new instruction flow or FUP/TIP of INT3. Thus decoders can
                 * use the timestamp as the point at which to modify the
                 * executable code.
                 * The old instruction is recorded so that the event can be
                 * processed forwards or backwards.
                 */
                perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
        }

        if (do_sync) {
                /*
                 * According to Intel, this core syncing is very likely
                 * not necessary and we'd be safe even without it. But
                 * better safe than sorry (plus there's not only Intel).
                 */
                text_poke_sync();
        }

        /*
         * Third step: replace the first byte (int3) by the first byte of
         * replacing opcode.
         */
        for (do_sync = 0, i = 0; i < nr_entries; i++) {
                u8 byte = tp[i].text[0];

                /* For len == 6 the real first byte is the 0x0f not kept in text. */
                if (tp[i].len == 6)
                        byte = 0x0f;

                /* The INT3 from step one is already the wanted first byte. */
                if (byte == INT3_INSN_OPCODE)
                        continue;

                text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
                do_sync++;
        }

        if (do_sync)
                text_poke_sync();

        /*
         * Remove and wait for refs to be zero.
         */
        if (!atomic_dec_and_test(&bp_desc.refs))
                atomic_cond_read_acquire(&bp_desc.refs, !VAL);
}

/*
 * Fill in @tp so that @len bytes at @addr can be replaced by @opcode.
 *
 * The instruction at @emulate (or at @opcode when @emulate is NULL) is
 * decoded to set tp->opcode and tp->disp, the fields poke_int3_handler()
 * uses for emulation. Any input this function does not understand ends in
 * BUG()/BUG_ON().
 */
static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
                               const void *opcode, size_t len, const void *emulate)
{
        struct insn insn;
        int ret, i = 0;

        /*
         * A 6-byte poke is stored without its first byte; see the comment on
         * struct text_poke_loc.
         */
        if (len == 6)
                i = 1;
        memcpy((void *)tp->text, opcode+i, len-i);
        if (!emulate)
                emulate = opcode;

        ret = insn_decode_kernel(&insn, emulate);
        BUG_ON(ret < 0);

        tp->rel_addr = addr - (void *)_stext;
        tp->len = len;
        tp->opcode = insn.opcode.bytes[0];

        if (is_jcc32(&insn)) {
                /*
                 * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
                 */
                tp->opcode = insn.opcode.bytes[1] - 0x10;
        }

        /* Validate that @len agrees with the decoded instruction length. */
        switch (tp->opcode) {
        case RET_INSN_OPCODE:
        case JMP32_INSN_OPCODE:
        case JMP8_INSN_OPCODE:
                /*
                 * Control flow instructions without implied execution of the
                 * next instruction can be padded with INT3.
                 */
                for (i = insn.length; i < len; i++)
                        BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
                break;

        default:
                BUG_ON(len != insn.length);
        }

        /* Record what poke_int3_handler() needs to emulate the instruction. */
        switch (tp->opcode) {
        case INT3_INSN_OPCODE:
        case RET_INSN_OPCODE:
                break;

        case CALL_INSN_OPCODE:
        case JMP32_INSN_OPCODE:
        case JMP8_INSN_OPCODE:
        case 0x70 ... 0x7f: /* Jcc */
                tp->disp = insn.immediate.value;
                break;

        default: /* assume NOP */
                switch (len) {
                case 2: /* NOP2 -- emulate as JMP8+0 */
                        BUG_ON(memcmp(emulate, x86_nops[len], len));
                        tp->opcode = JMP8_INSN_OPCODE;
                        tp->disp = 0;
                        break;

                case 5: /* NOP5 -- emulate as JMP32+0 */
                        BUG_ON(memcmp(emulate, x86_nops[len], len));
                        tp->opcode = JMP32_INSN_OPCODE;
                        tp->disp = 0;
                        break;

                default: /* unknown instruction */
                        BUG();
                }
                break;
        }
}

/*
 * We hard rely on the tp_vec being ordered; ensure this is so by flushing
 * early if needed.
 */
static bool tp_order_fail(void *addr)
{
        struct text_poke_loc *last;

        /* Nothing queued: nothing can be out of order. */
        if (tp_vec_nr == 0)
                return false;

        /* A NULL address forces a flush of whatever is queued. */
        if (!addr)
                return true;

        /* Order is broken when the last queued entry lies above @addr. */
        last = &tp_vec[tp_vec_nr - 1];

        return (unsigned long)text_poke_addr(last) > (unsigned long)addr;
}

static void text_poke_flush(void *addr)
{
        /* Keep queueing while there is room and @addr preserves the order. */
        if (tp_vec_nr != TP_VEC_MAX && !tp_order_fail(addr))
                return;

        text_poke_bp_batch(tp_vec, tp_vec_nr);
        tp_vec_nr = 0;
}

/*
 * Flush any pokes still queued in tp_vec; passing NULL makes tp_order_fail()
 * request the flush whenever the queue is non-empty.
 */
void text_poke_finish(void)
{
        text_poke_flush(NULL);
}

void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
{
        int slot;

        /* Drain the queue first if it is full or @addr would break ordering. */
        text_poke_flush(addr);

        /* Claim the next free slot and describe the poke in it. */
        slot = tp_vec_nr++;
        text_poke_loc_init(&tp_vec[slot], addr, opcode, len, emulate);
}

/**
 * text_poke_bp() -- update instructions on live kernel on SMP
 * @addr:        address to patch
 * @opcode:        opcode of new instruction
 * @len:        length to copy
 * @emulate:        instruction to be emulated
 *
 * Update a single instruction with the vector in the stack, avoiding
 * dynamically allocated memory. This function should be used when it is
 * not possible to allocate memory.
 */
void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
{
        /* One-element vector kept on the stack. */
        struct text_poke_loc single;

        text_poke_loc_init(&single, addr, opcode, len, emulate);
        text_poke_bp_batch(&single, 1);
}





























    1 


    1 
    1 








































































































































































































































    1 






    1 











    1 





















    1 
    1 
    1 




    1 



    1 





















    1 


    1 
















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Generic INET6 transport hashtables
 *
 * Authors:        Lotsa people, from code originally in tcp, generalised here
 *                by Arnaldo Carvalho de Melo <acme@mandriva.com>
 */

#include <linux/module.h>
#include <linux/random.h>

#include <net/addrconf.h>
#include <net/hotdata.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>
#include <net/sock_reuseport.h>
#include <net/tcp.h>

u32 inet6_ehashfn(const struct net *net,
                  const struct in6_addr *laddr, const u16 lport,
                  const struct in6_addr *faddr, const __be16 fport)
{
        u32 local_hash, foreign_hash, seed;

        /* Initialise both hash secrets via net_get_random_once(). */
        net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret));
        net_get_random_once(&tcp_ipv6_hash_secret, sizeof(tcp_ipv6_hash_secret));

        /* Only the last 32-bit word of the local address is hashed. */
        local_hash = (__force u32)laddr->s6_addr32[3];
        foreign_hash = __ipv6_addr_jhash(faddr, tcp_ipv6_hash_secret);

        /* The seed combines the ehash secret with the per-netns mix. */
        seed = inet6_ehash_secret + net_hash_mix(net);

        return __inet6_ehashfn(local_hash, lport, foreign_hash, fport, seed);
}
EXPORT_SYMBOL_GPL(inet6_ehashfn);

/*
 * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * The sockhash lock must be held as a reader here.
 */
struct sock *__inet6_lookup_established(struct net *net,
                                        struct inet_hashinfo *hashinfo,
                                           const struct in6_addr *saddr,
                                           const __be16 sport,
                                           const struct in6_addr *daddr,
                                           const u16 hnum,
                                           const int dif, const int sdif)
{
        struct sock *sk;
        const struct hlist_nulls_node *node;
        const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
        /* Optimize here for direct hit, only listening connections can
         * have wildcards anyways.
         */
        unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
        unsigned int slot = hash & hashinfo->ehash_mask;
        struct inet_ehash_bucket *head = &hashinfo->ehash[slot];


begin:
        sk_nulls_for_each_rcu(sk, node, &head->chain) {
                if (sk->sk_hash != hash)
                        continue;
                if (!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))
                        continue;
                /* A matching socket whose refcount already hit zero ends the lookup. */
                if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
                        goto out;

                /*
                 * Match again now that a reference is held; on mismatch drop
                 * the reference and restart the walk.
                 * NOTE(review): presumably this guards against the socket
                 * being reused for another connection between the first match
                 * and the refcount increment -- confirm.
                 */
                if (unlikely(!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))) {
                        sock_gen_put(sk);
                        goto begin;
                }
                goto found;
        }
        /*
         * The walk ended on a nulls marker; restart if its value is not this
         * slot.
         * NOTE(review): presumably that means the walk drifted onto another
         * chain -- confirm.
         */
        if (get_nulls_value(node) != slot)
                goto begin;
out:
        sk = NULL;
found:
        return sk;
}
EXPORT_SYMBOL(__inet6_lookup_established);

static inline int compute_score(struct sock *sk, struct net *net,
                                const unsigned short hnum,
                                const struct in6_addr *daddr,
                                const int dif, const int sdif)
{
        int score;

        /* Namespace, local port and address family must all match. */
        if (!net_eq(sock_net(sk), net) || inet_sk(sk)->inet_num != hnum ||
            sk->sk_family != PF_INET6)
                return -1;

        if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
                return -1;

        if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
                return -1;

        /* A device-bound socket scores higher than an unbound one. */
        score = sk->sk_bound_dev_if ? 2 : 1;

        /* One extra point when the socket's incoming CPU is the current CPU. */
        if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
                score++;

        return score;
}

/**
 * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary.
 * @net: network namespace.
 * @sk: AF_INET6 socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP.
 * @skb: context for a potential SK_REUSEPORT program.
 * @doff: header offset.
 * @saddr: source address.
 * @sport: source port.
 * @daddr: destination address.
 * @hnum: destination port in host byte order.
 * @ehashfn: hash function used to generate the fallback hash.
 *
 * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
 *         the selected sock or an error.
 */
struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk,
                                    struct sk_buff *skb, int doff,
                                    const struct in6_addr *saddr,
                                    __be16 sport,
                                    const struct in6_addr *daddr,
                                    unsigned short hnum,
                                    inet6_ehashfn_t *ehashfn)
{
        u32 phash;

        /* Without SO_REUSEPORT there is no group to select from. */
        if (!sk->sk_reuseport)
                return NULL;

        phash = INDIRECT_CALL_INET(ehashfn, udp6_ehashfn, inet6_ehashfn,
                                   net, daddr, hnum, saddr, sport);

        return reuseport_select_sock(sk, phash, skb, doff);
}
EXPORT_SYMBOL_GPL(inet6_lookup_reuseport);

/* called with rcu_read_lock() */
static struct sock *inet6_lhash2_lookup(struct net *net,
                struct inet_listen_hashbucket *ilb2,
                struct sk_buff *skb, int doff,
                const struct in6_addr *saddr,
                const __be16 sport, const struct in6_addr *daddr,
                const unsigned short hnum, const int dif, const int sdif)
{
        struct sock *best = NULL;
        struct hlist_nulls_node *node;
        struct sock *sk;
        int best_score = 0;

        sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
                struct sock *reuse;
                int score = compute_score(sk, net, hnum, daddr, dif, sdif);

                /* Only a strictly better score replaces the current candidate. */
                if (score <= best_score)
                        continue;

                /* A reuseport selection wins immediately. */
                reuse = inet6_lookup_reuseport(net, sk, skb, doff,
                                               saddr, sport, daddr, hnum, inet6_ehashfn);
                if (reuse)
                        return reuse;

                best = sk;
                best_score = score;
        }

        return best;
}

struct sock *inet6_lookup_run_sk_lookup(struct net *net,
                                        int protocol,
                                        struct sk_buff *skb, int doff,
                                        const struct in6_addr *saddr,
                                        const __be16 sport,
                                        const struct in6_addr *daddr,
                                        const u16 hnum, const int dif,
                                        inet6_ehashfn_t *ehashfn)
{
        struct sock *selected, *picked;

        /* A true return means reuseport selection must be skipped. */
        if (bpf_sk_lookup_run_v6(net, protocol, saddr, sport,
                                 daddr, hnum, dif, &selected))
                return selected;

        /* Nothing selected, or an error pointer: hand it back unchanged. */
        if (IS_ERR_OR_NULL(selected))
                return selected;

        picked = inet6_lookup_reuseport(net, selected, skb, doff,
                                        saddr, sport, daddr, hnum, ehashfn);

        return picked ? picked : selected;
}
EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup);

struct sock *inet6_lookup_listener(struct net *net,
                struct inet_hashinfo *hashinfo,
                struct sk_buff *skb, int doff,
                const struct in6_addr *saddr,
                const __be16 sport, const struct in6_addr *daddr,
                const unsigned short hnum, const int dif, const int sdif)
{
        struct inet_listen_hashbucket *bucket;
        struct sock *found = NULL;

        /* Lookup redirect from BPF */
        if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
            hashinfo == net->ipv4.tcp_death_row.hashinfo)
                found = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
                                                   saddr, sport, daddr, hnum, dif,
                                                   inet6_ehashfn);

        /* Next, the lhash2 bucket for the exact destination address. */
        if (!found) {
                bucket = inet_lhash2_bucket(hashinfo,
                                            ipv6_portaddr_hash(net, daddr, hnum));
                found = inet6_lhash2_lookup(net, bucket, skb, doff,
                                            saddr, sport, daddr, hnum,
                                            dif, sdif);
        }

        /* Lookup lhash2 with in6addr_any */
        if (!found) {
                bucket = inet_lhash2_bucket(hashinfo,
                                            ipv6_portaddr_hash(net, &in6addr_any, hnum));
                found = inet6_lhash2_lookup(net, bucket, skb, doff,
                                            saddr, sport, &in6addr_any, hnum,
                                            dif, sdif);
        }

        /* Error pointers are reported to the caller as "not found". */
        return IS_ERR(found) ? NULL : found;
}
EXPORT_SYMBOL_GPL(inet6_lookup_listener);

struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
                          struct sk_buff *skb, int doff,
                          const struct in6_addr *saddr, const __be16 sport,
                          const struct in6_addr *daddr, const __be16 dport,
                          const int dif)
{
        bool refcounted;
        struct sock *sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport,
                                         daddr, ntohs(dport), dif, 0,
                                         &refcounted);

        if (!sk)
                return NULL;

        /* The lookup already holds a reference for us. */
        if (refcounted)
                return sk;

        /* Otherwise take one here; a failed increment counts as "not found". */
        return refcount_inc_not_zero(&sk->sk_refcnt) ? sk : NULL;
}
EXPORT_SYMBOL_GPL(inet6_lookup);

/*
 * Check, under the ehash bucket lock, whether another socket in the
 * established hash already matches @sk's addresses and ports with @lport as
 * the local port. If none does, record @lport in @sk and insert it into the
 * bucket.
 *
 * A matching TIME_WAIT socket does not count as a conflict when @sk is TCP
 * and tcp_twsk_unique() accepts it; that timewait socket is then removed from
 * the chain and either returned through @twp or, if @twp is NULL, released
 * with inet_twsk_deschedule_put().
 *
 * Return: 0 on success, -EADDRNOTAVAIL if a conflicting socket exists.
 */
static int __inet6_check_established(struct inet_timewait_death_row *death_row,
                                     struct sock *sk, const __u16 lport,
                                     struct inet_timewait_sock **twp)
{
        struct inet_hashinfo *hinfo = death_row->hashinfo;
        struct inet_sock *inet = inet_sk(sk);
        /*
         * Named as an incoming packet would see them: our receive address is
         * its destination, our peer address is its source.
         */
        const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr;
        const struct in6_addr *saddr = &sk->sk_v6_daddr;
        const int dif = sk->sk_bound_dev_if;
        struct net *net = sock_net(sk);
        const int sdif = l3mdev_master_ifindex_by_index(net, dif);
        const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
        const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr,
                                                inet->inet_dport);
        struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
        spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
        struct sock *sk2;
        const struct hlist_nulls_node *node;
        struct inet_timewait_sock *tw = NULL;

        spin_lock(lock);

        sk_nulls_for_each(sk2, node, &head->chain) {
                if (sk2->sk_hash != hash)
                        continue;

                if (likely(inet6_match(net, sk2, saddr, daddr, ports,
                                       dif, sdif))) {
                        if (sk2->sk_state == TCP_TIME_WAIT) {
                                tw = inet_twsk(sk2);
                                /* An accepted TIME_WAIT match is not a conflict. */
                                if (sk->sk_protocol == IPPROTO_TCP &&
                                    tcp_twsk_unique(sk, sk2, twp))
                                        break;
                        }
                        goto not_unique;
                }
        }

        /* Must record num and sport now. Otherwise we will see
         * in hash table socket with a funny identity.
         */
        inet->inet_num = lport;
        inet->inet_sport = htons(lport);
        sk->sk_hash = hash;
        WARN_ON(!sk_unhashed(sk));
        __sk_nulls_add_node_rcu(sk, &head->chain);
        if (tw) {
                /* The new socket takes over from the timewait socket. */
                sk_nulls_del_node_init_rcu((struct sock *)tw);
                __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
        }
        spin_unlock(lock);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

        if (twp) {
                *twp = tw;
        } else if (tw) {
                /* Silly. Should hash-dance instead... */
                inet_twsk_deschedule_put(tw);
        }
        return 0;

not_unique:
        spin_unlock(lock);
        return -EADDRNOTAVAIL;
}

static u64 inet6_sk_port_offset(const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);

        return secure_ipv6_port_ephemeral(sk->sk_v6_rcv_saddr.s6_addr32,
                                          sk->sk_v6_daddr.s6_addr32,
                                          inet->inet_dport);
}

int inet6_hash_connect(struct inet_timewait_death_row *death_row,
                       struct sock *sk)
{
        /* The port offset is only computed when no local port is set yet. */
        const u64 port_offset = inet_sk(sk)->inet_num ?
                                0 : inet6_sk_port_offset(sk);

        return __inet_hash_connect(death_row, sk, port_offset,
                                   __inet6_check_established);
}
EXPORT_SYMBOL_GPL(inet6_hash_connect);

int inet6_hash(struct sock *sk)
{
        /* Sockets in TCP_CLOSE state are not hashed; report success. */
        if (sk->sk_state == TCP_CLOSE)
                return 0;

        return __inet_hash(sk, NULL);
}
EXPORT_SYMBOL_GPL(inet6_hash);





































































    3 








    3 








    3 






















































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SIGNAL_H
#define _LINUX_SIGNAL_H

#include <linux/bug.h>
#include <linux/list.h>
#include <linux/signal_types.h>
#include <linux/string.h>

struct task_struct;

/* for sysctl */
extern int print_fatal_signals;

/* Copy the whole kernel siginfo record at @from into @to. */
static inline void copy_siginfo(kernel_siginfo_t *to,
                                const kernel_siginfo_t *from)
{
        const size_t nbytes = sizeof(*to);

        memcpy(to, from, nbytes);
}

/* Zero every byte of the kernel siginfo record at @info. */
static inline void clear_siginfo(kernel_siginfo_t *info)
{
        const size_t nbytes = sizeof(*info);

        memset(info, 0, nbytes);
}

/* Number of bytes by which struct siginfo exceeds struct kernel_siginfo. */
#define SI_EXPANSION_SIZE (sizeof(struct siginfo) - sizeof(struct kernel_siginfo))

/*
 * Fill the external siginfo at @to from the kernel siginfo at @from:
 * sizeof(*from) bytes are copied verbatim, then the SI_EXPANSION_SIZE bytes
 * starting at offset sizeof(struct kernel_siginfo) are zeroed.
 */
static inline void copy_siginfo_to_external(siginfo_t *to,
                                            const kernel_siginfo_t *from)
{
        char *tail = (char *)to + sizeof(struct kernel_siginfo);

        memcpy(to, from, sizeof(*from));
        memset(tail, 0, SI_EXPANSION_SIZE);
}

int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from);
int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from);

/*
 * Layout selector returned by siginfo_layout().  Values are consecutive,
 * starting at zero with SIL_KILL.
 * NOTE(review): each constant presumably names the siginfo union member in
 * use for a given signal/si_code pair -- confirm against siginfo_layout().
 */
enum siginfo_layout {
        SIL_KILL = 0,
        SIL_TIMER,
        SIL_POLL,
        SIL_FAULT,
        SIL_FAULT_TRAPNO,
        SIL_FAULT_MCEERR,
        SIL_FAULT_BNDERR,
        SIL_FAULT_PKUERR,
        SIL_FAULT_PERF_EVENT,
        SIL_CHLD,
        SIL_RT,
        SIL_SYS,
};

enum siginfo_layout siginfo_layout(unsigned sig, int si_code);

/*
 * Define some primitives to manipulate sigset_t.
 */

#ifndef __HAVE_ARCH_SIG_BITOPS
#include <linux/bitops.h>

/* We don't use <linux/bitops.h> for these because there is no need to
   be atomic.  */
/*
 * Set the bit for signal @_sig (1-based) in @set.  Plain, non-atomic
 * read-modify-write of the word holding the bit.
 */
static inline void sigaddset(sigset_t *set, int _sig)
{
        const unsigned long bit = _sig - 1;

        if (_NSIG_WORDS != 1) {
                set->sig[bit / _NSIG_BPW] |= 1UL << (bit % _NSIG_BPW);
                return;
        }

        /* Single-word set: the bit index is used directly. */
        set->sig[0] |= 1UL << bit;
}

/*
 * Clear the bit for signal @_sig (1-based) in @set.  Plain, non-atomic
 * read-modify-write of the word holding the bit.
 */
static inline void sigdelset(sigset_t *set, int _sig)
{
        const unsigned long bit = _sig - 1;

        if (_NSIG_WORDS != 1) {
                set->sig[bit / _NSIG_BPW] &= ~(1UL << (bit % _NSIG_BPW));
                return;
        }

        /* Single-word set: the bit index is used directly. */
        set->sig[0] &= ~(1UL << bit);
}

static inline int sigismember(sigset_t *set, int _sig)
{
        unsigned long sig = _sig - 1;
        if (_NSIG_WORDS == 1)
                return 1 & (set->sig[0] >> sig);
        else
                return 1 & (set->sig[sig / _NSIG_BPW] >> (sig % _NSIG_BPW));
}

#endif /* __HAVE_ARCH_SIG_BITOPS */

static inline int sigisemptyset(sigset_t *set)
{
        switch (_NSIG_WORDS) {
        case 4:
                return (set->sig[3] | set->sig[2] |
                        set->sig[1] | set->sig[0]) == 0;
        case 2:
                return (set->sig[1] | set->sig[0]) == 0;
        case 1:
                return set->sig[0] == 0;
        default:
                BUILD_BUG();
                return 0;
        }
}

/*
 * Compare two signal sets word by word, highest word first.
 *
 * Returns 1 when all words match, 0 when any differs.  For a word count other
 * than 1, 2 or 4 the function returns 0.
 */
static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2)
{
        switch (_NSIG_WORDS) {
        case 1:
                return set1->sig[0] == set2->sig[0];
        case 2:
                return (set1->sig[1] == set2->sig[1]) &&
                       (set1->sig[0] == set2->sig[0]);
        case 4:
                return (set1->sig[3] == set2->sig[3]) &&
                       (set1->sig[2] == set2->sig[2]) &&
                       (set1->sig[1] == set2->sig[1]) &&
                       (set1->sig[0] == set2->sig[0]);
        default:
                return 0;
        }
}

/* Single-word bit mask for the 1-based signal number @sig: bit (sig - 1). */
#define sigmask(sig)        (1UL << ((sig) - 1))

#ifndef __HAVE_ARCH_SIG_SETOPS

/*
 * Generate "static inline void name(r, a, b)" which stores op(a, b), computed
 * word by word, into r.  The switch on _NSIG_WORDS starts at the widest
 * supported layout and falls through down to word 0; a word count other than
 * 1, 2 or 4 fails the build.
 */
#define _SIG_SET_BINOP(name, op) \
static inline void name(sigset_t *r, const sigset_t *a, const sigset_t *b) \
{ \
        unsigned long a0, a1, a2, a3, b0, b1, b2, b3; \
 \
        switch (_NSIG_WORDS) { \
        case 4: \
                a3 = a->sig[3]; \
                a2 = a->sig[2]; \
                b3 = b->sig[3]; \
                b2 = b->sig[2]; \
                r->sig[3] = op(a3, b3); \
                r->sig[2] = op(a2, b2); \
                fallthrough; \
        case 2: \
                a1 = a->sig[1]; \
                b1 = b->sig[1]; \
                r->sig[1] = op(a1, b1); \
                fallthrough; \
        case 1: \
                a0 = a->sig[0]; \
                b0 = b->sig[0]; \
                r->sig[0] = op(a0, b0); \
                break; \
        default: \
                BUILD_BUG(); \
        } \
}

/* sigorsets(r, a, b): r = a | b */
#define _sig_or(x,y)        ((x) | (y))
_SIG_SET_BINOP(sigorsets, _sig_or)

/* sigandsets(r, a, b): r = a & b */
#define _sig_and(x,y)        ((x) & (y))
_SIG_SET_BINOP(sigandsets, _sig_and)

/* sigandnsets(r, a, b): r = a & ~b */
#define _sig_andn(x,y)        ((x) & ~(y))
_SIG_SET_BINOP(sigandnsets, _sig_andn)

/* The generator and its per-word operators are private to this header. */
#undef _SIG_SET_BINOP
#undef _sig_or
#undef _sig_and
#undef _sig_andn

/*
 * Generate "static inline void name(set)" which replaces every word of @set
 * with op(word).  Falls through from the widest supported layout down to
 * word 0; a word count other than 1, 2 or 4 fails the build.
 */
#define _SIG_SET_OP(name, op) \
static inline void name(sigset_t *set) \
{ \
        switch (_NSIG_WORDS) { \
        case 4: \
                set->sig[3] = op(set->sig[3]); \
                set->sig[2] = op(set->sig[2]); \
                fallthrough; \
        case 2: \
                set->sig[1] = op(set->sig[1]); \
                fallthrough; \
        case 1: \
                set->sig[0] = op(set->sig[0]); \
                break; \
        default: \
                BUILD_BUG(); \
        } \
}

/* signotset(set): complement every word of the set in place. */
#define _sig_not(x)        (~(x))
_SIG_SET_OP(signotset, _sig_not)

#undef _SIG_SET_OP
#undef _sig_not

/*
 * Clear every signal in @set.  One- and two-word sets are cleared with direct
 * stores; any other word count uses memset() over the whole sigset_t.
 */
static inline void sigemptyset(sigset_t *set)
{
        switch (_NSIG_WORDS) {
        case 1:
                set->sig[0] = 0;
                break;
        case 2:
                set->sig[1] = 0;
                set->sig[0] = 0;
                break;
        default:
                memset(set, 0, sizeof(sigset_t));
                break;
        }
}

/*
 * Set every bit in @set.  One- and two-word sets are filled with direct
 * stores of -1; any other word count uses memset() over the whole sigset_t.
 */
static inline void sigfillset(sigset_t *set)
{
        switch (_NSIG_WORDS) {
        case 1:
                set->sig[0] = -1;
                break;
        case 2:
                set->sig[1] = -1;
                set->sig[0] = -1;
                break;
        default:
                memset(set, -1, sizeof(sigset_t));
                break;
        }
}

/* Some extensions for manipulating the low 32 signals in particular.  */

/* OR @mask into the first word of @set; other words are untouched. */
static inline void sigaddsetmask(sigset_t *set, unsigned long mask)
{
        set->sig[0] = set->sig[0] | mask;
}

/* Clear the bits of @mask in the first word of @set; other words are untouched. */
static inline void sigdelsetmask(sigset_t *set, unsigned long mask)
{
        set->sig[0] = set->sig[0] & ~mask;
}

static inline int sigtestsetmask(sigset_t *set, unsigned long mask)
{
        return (set->sig[0] & mask) != 0;
}

/*
 * Initialise @set so that its first word equals @mask and every remaining
 * word is zero.
 */
static inline void siginitset(sigset_t *set, unsigned long mask)
{
        set->sig[0] = mask;

        switch (_NSIG_WORDS) {
        case 1:
                /* No further words to initialise. */
                break;
        case 2:
                set->sig[1] = 0;
                break;
        default:
                memset(&set->sig[1], 0, sizeof(long)*(_NSIG_WORDS-1));
                break;
        }
}

/*
 * Initialise @set so that its first word equals ~@mask and every remaining
 * word has all bits set.
 */
static inline void siginitsetinv(sigset_t *set, unsigned long mask)
{
        set->sig[0] = ~mask;

        switch (_NSIG_WORDS) {
        case 1:
                /* No further words to initialise. */
                break;
        case 2:
                set->sig[1] = -1;
                break;
        default:
                memset(&set->sig[1], -1, sizeof(long)*(_NSIG_WORDS-1));
                break;
        }
}

#endif /* __HAVE_ARCH_SIG_SETOPS */

static inline void init_sigpending(struct sigpending *sig)
{
        sigemptyset(&sig->signal);
        INIT_LIST_HEAD(&sig->list);
}

extern void flush_sigqueue(struct sigpending *queue);

/*
 * Test whether 'sig' is a valid signal number.  Use this instead of comparing
 * against _NSIG directly.
 *
 * Returns 1 for every value from 0 up to and including _NSIG, 0 otherwise.
 */
static inline int valid_signal(unsigned long sig)
{
        if (sig > _NSIG)
                return 0;

        return 1;
}

struct timespec;
struct pt_regs;
enum pid_type;

extern int next_signal(struct sigpending *pending, sigset_t *mask);
extern int do_send_sig_info(int sig, struct kernel_siginfo *info,
                                struct task_struct *p, enum pid_type type);
extern int group_send_sig_info(int sig, struct kernel_siginfo *info,
                               struct task_struct *p, enum pid_type type);
extern int send_signal_locked(int sig, struct kernel_siginfo *info,
                              struct task_struct *p, enum pid_type type);
extern int sigprocmask(int, sigset_t *, sigset_t *);
extern void set_current_blocked(sigset_t *);
extern void __set_current_blocked(const sigset_t *);
extern int show_unhandled_signals;

extern bool get_signal(struct ksignal *ksig);
extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping);
extern void exit_signals(struct task_struct *tsk);
extern void kernel_sigaction(int, __sighandler_t);

/*
 * Handler values passed to kernel_sigaction() by allow_signal()
 * (SIG_KTHREAD) and allow_kernel_signal() (SIG_KTHREAD_KERNEL).
 */
#define SIG_KTHREAD ((__force __sighandler_t)2)
#define SIG_KTHREAD_KERNEL ((__force __sighandler_t)3)

/*
 * Kernel threads handle their own signals.  Installing SIG_KTHREAD for @sig
 * tells the signal code that the signal will be handled, so that it is not
 * converted to SIGKILL or silently dropped.
 */
static inline void allow_signal(int sig)
{
        kernel_sigaction(sig, SIG_KTHREAD);
}

/*
 * Kernel threads handle their own signals.  Installing SIG_KTHREAD_KERNEL for
 * @sig tells the signal code that signals sent by the kernel will be handled,
 * so that they are not silently dropped.
 */
static inline void allow_kernel_signal(int sig)
{
        kernel_sigaction(sig, SIG_KTHREAD_KERNEL);
}

/* Install SIG_IGN as the action for @sig via kernel_sigaction(). */
static inline void disallow_signal(int sig)
{
        kernel_sigaction(sig, SIG_IGN);
}

extern struct kmem_cache *sighand_cachep;

extern bool unhandled_signal(struct task_struct *tsk, int sig);

/*
 * In POSIX a signal is sent either to a specific thread (Linux task)
 * or to the process as a whole (Linux thread group).  How the signal
 * is sent determines whether it's to one thread or the whole group,
 * which determines which signal mask(s) are involved in blocking it
 * from being delivered until later.  When the signal is delivered,
 * either it's caught or ignored by a user handler or it has a default
 * effect that applies to the whole thread group (POSIX process).
 *
 * The possible effects an unblocked signal set to SIG_DFL can have are:
 *   ignore        - Nothing Happens
 *   terminate        - kill the process, i.e. all threads in the group,
 *                   similar to exit_group.  The group leader (only) reports
 *                  WIFSIGNALED status to its parent.
 *   coredump        - write a core dump file describing all threads using
 *                  the same mm and then kill all those threads
 *   stop         - stop all the threads in the group, i.e. TASK_STOPPED state
 *
 * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
 * Other signals, when not blocked and set to SIG_DFL, behave as follows.
 * The job control signals also have other special effects.
 *
 *        +--------------------+------------------+
 *        |  POSIX signal      |  default action  |
 *        +--------------------+------------------+
 *        |  SIGHUP            |  terminate        |
 *        |  SIGINT            |        terminate        |
 *        |  SIGQUIT           |        coredump         |
 *        |  SIGILL            |        coredump         |
 *        |  SIGTRAP           |        coredump         |
 *        |  SIGABRT/SIGIOT    |        coredump         |
 *        |  SIGBUS            |        coredump         |
 *        |  SIGFPE            |        coredump         |
 *        |  SIGKILL           |        terminate(+)        |
 *        |  SIGUSR1           |        terminate        |
 *        |  SIGSEGV           |        coredump         |
 *        |  SIGUSR2           |        terminate        |
 *        |  SIGPIPE           |        terminate        |
 *        |  SIGALRM           |        terminate        |
 *        |  SIGTERM           |        terminate        |
 *        |  SIGCHLD           |        ignore           |
 *        |  SIGCONT           |        ignore(*)        |
 *        |  SIGSTOP           |        stop(*)(+)          |
 *        |  SIGTSTP           |        stop(*)          |
 *        |  SIGTTIN           |        stop(*)          |
 *        |  SIGTTOU           |        stop(*)          |
 *        |  SIGURG            |        ignore           |
 *        |  SIGXCPU           |        coredump         |
 *        |  SIGXFSZ           |        coredump         |
 *        |  SIGVTALRM         |        terminate        |
 *        |  SIGPROF           |        terminate        |
 *        |  SIGPOLL/SIGIO     |        terminate        |
 *        |  SIGSYS/SIGUNUSED  |        coredump         |
 *        |  SIGSTKFLT         |        terminate        |
 *        |  SIGWINCH          |        ignore           |
 *        |  SIGPWR            |        terminate        |
 *        |  SIGRTMIN-SIGRTMAX |        terminate       |
 *        +--------------------+------------------+
 *        |  non-POSIX signal  |  default action  |
 *        +--------------------+------------------+
 *        |  SIGEMT            |  coredump        |
 *        +--------------------+------------------+
 *
 * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
 * (*) Special job control effects:
 * When SIGCONT is sent, it resumes the process (all threads in the group)
 * from TASK_STOPPED state and also clears any pending/queued stop signals
 * (any of those marked with "stop(*)").  This happens regardless of blocking,
 * catching, or ignoring SIGCONT.  When any stop signal is sent, it clears
 * any pending/queued SIGCONT signals; this happens regardless of blocking,
 * catching, or ignoring the stop signal, though (except for SIGSTOP) the
 * default action of stopping the process may happen later or never.
 */

/*
 * rt_sigmask(sig): bit mask for the 1-based signal number @sig.  When SIGRTMIN
 * exceeds BITS_PER_LONG the mask is built from an unsigned long long;
 * otherwise it is plain sigmask().
 */
#if SIGRTMIN > BITS_PER_LONG
#define rt_sigmask(sig)        (1ULL << ((sig)-1))
#else
#define rt_sigmask(sig)        sigmask(sig)
#endif

/* Mask bit for SIGEMT where that signal exists, 0 where it does not. */
#ifdef SIGEMT
#define SIGEMT_MASK        rt_sigmask(SIGEMT)
#else
#define SIGEMT_MASK        0
#endif

/*
 * True when @sig lies strictly between 0 and SIGRTMIN and its rt_sigmask()
 * bit is present in @mask.  @sig is evaluated more than once, so it must not
 * have side effects.
 */
#define siginmask(sig, mask) \
        ((sig) > 0 && (sig) < SIGRTMIN && (rt_sigmask(sig) & (mask)))

/* SIGKILL and SIGSTOP: the signals that cannot be caught, blocked or ignored. */
#define SIG_KERNEL_ONLY_MASK ( \
        rt_sigmask(SIGKILL) | rt_sigmask(SIGSTOP))

/* Signals whose default action is "stop". */
#define SIG_KERNEL_STOP_MASK ( \
        rt_sigmask(SIGSTOP) | rt_sigmask(SIGTSTP) | \
        rt_sigmask(SIGTTIN) | rt_sigmask(SIGTTOU))

/* Signals whose default action is "coredump". */
#define SIG_KERNEL_COREDUMP_MASK ( \
        rt_sigmask(SIGQUIT) | rt_sigmask(SIGILL)  | \
        rt_sigmask(SIGTRAP) | rt_sigmask(SIGABRT) | \
        rt_sigmask(SIGFPE)  | rt_sigmask(SIGSEGV) | \
        rt_sigmask(SIGBUS)  | rt_sigmask(SIGSYS)  | \
        rt_sigmask(SIGXCPU) | rt_sigmask(SIGXFSZ) | \
        SIGEMT_MASK)

/* Signals whose default action is "ignore". */
#define SIG_KERNEL_IGNORE_MASK ( \
        rt_sigmask(SIGCONT)  | rt_sigmask(SIGCHLD) | \
        rt_sigmask(SIGWINCH) | rt_sigmask(SIGURG))

/*
 * NOTE(review): presumably the signals that carry signal-specific si_code
 * values -- confirm against the users of sig_specific_sicodes().
 */
#define SIG_SPECIFIC_SICODES_MASK ( \
        rt_sigmask(SIGILL)  | rt_sigmask(SIGFPE)  | \
        rt_sigmask(SIGSEGV) | rt_sigmask(SIGBUS)  | \
        rt_sigmask(SIGTRAP) | rt_sigmask(SIGCHLD) | \
        rt_sigmask(SIGPOLL) | rt_sigmask(SIGSYS)  | \
        SIGEMT_MASK)

/* Membership tests of @sig against each of the SIG_*_MASK sets. */
#define sig_kernel_only(sig)      siginmask(sig, SIG_KERNEL_ONLY_MASK)
#define sig_kernel_coredump(sig)  siginmask(sig, SIG_KERNEL_COREDUMP_MASK)
#define sig_kernel_ignore(sig)    siginmask(sig, SIG_KERNEL_IGNORE_MASK)
#define sig_kernel_stop(sig)      siginmask(sig, SIG_KERNEL_STOP_MASK)
#define sig_specific_sicodes(sig) siginmask(sig, SIG_SPECIFIC_SICODES_MASK)

/*
 * True when @signr is neither in the ignore mask nor in the stop mask and
 * task @t still has SIG_DFL installed as its handler for it.  @signr is
 * evaluated more than once.
 */
#define sig_fatal(t, signr) \
        (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
         (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)

void signals_init(void);

int restore_altstack(const stack_t __user *);
int __save_altstack(stack_t __user *, unsigned long);

/*
 * Store the current task's sas_ss_sp, sas_ss_flags and sas_ss_size into the
 * ss_sp, ss_flags and ss_size fields of the user-space stack_t at @uss, using
 * unsafe_put_user().
 *
 * @uss:   user pointer to the stack_t to fill; evaluated exactly once
 * @sp:    accepted but not used by this macro
 * @label: passed through to every unsafe_put_user() call
 *         NOTE(review): presumably the label jumped to when a store
 *         faults -- confirm against unsafe_put_user()
 *
 * The expansion has no trailing semicolon, so the macro behaves as a single
 * statement and can be the body of an unbraced if/else; the caller supplies
 * the terminating ';'.
 */
#define unsafe_save_altstack(uss, sp, label) do { \
        stack_t __user *__uss = (uss); \
        struct task_struct *__t = current; \
        unsafe_put_user((void __user *)__t->sas_ss_sp, &__uss->ss_sp, label); \
        unsafe_put_user(__t->sas_ss_flags, &__uss->ss_flags, label); \
        unsafe_put_user(__t->sas_ss_size, &__uss->ss_size, label); \
} while (0)

#ifdef CONFIG_DYNAMIC_SIGFRAME
bool sigaltstack_size_valid(size_t ss_size);
#else
/* Without CONFIG_DYNAMIC_SIGFRAME every alternate stack size is accepted. */
static inline bool sigaltstack_size_valid(size_t size)
{
        return true;
}
#endif /* !CONFIG_DYNAMIC_SIGFRAME */

#ifdef CONFIG_PROC_FS
struct seq_file;
extern void render_sigset_t(struct seq_file *, const char *, sigset_t *);
#endif

#ifndef arch_untagged_si_addr
/*
 * Given a fault address plus the signal and si_code that select the _sigfault
 * union member, return the address that must appear in si_addr when the
 * signal handler does not have SA_EXPOSE_TAGBITS set in sa_flags.
 *
 * This fallback, used when no arch_untagged_si_addr is already defined,
 * returns @addr unchanged; @sig and @si_code are not consulted.
 */
static inline void __user *arch_untagged_si_addr(void __user *addr,
                                                 unsigned long sig,
                                                 unsigned long si_code)
{
        return addr;
}
#endif

#endif /* _LINUX_SIGNAL_H */






























































    1 







    1 










    1 



    1 



















    1 















    1 






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PERCPU_RWSEM_H
#define _LINUX_PERCPU_RWSEM_H

#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/rcuwait.h>
#include <linux/wait.h>
#include <linux/rcu_sync.h>
#include <linux/lockdep.h>

/*
 * Per-CPU reader/writer semaphore.  Field notes below describe only what the
 * inline helpers in this header do with each member.
 */
struct percpu_rw_semaphore {
        /* Tested with rcu_sync_is_idle() to select the reader fast path. */
        struct rcu_sync                rss;
        /* Per-CPU counter: incremented by down_read, decremented by up_read. */
        unsigned int __percpu        *read_count;
        /* Woken with rcuwait_wake_up() on the percpu_up_read() slow path. */
        struct rcuwait                writer;
        /* Wait queue; not touched by the inline helpers in this header. */
        wait_queue_head_t        waiters;
        /* Read by percpu_is_write_locked(); non-zero reports write-locked. */
        atomic_t                block;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* Lockdep state, present only with CONFIG_DEBUG_LOCK_ALLOC. */
        struct lockdep_map        dep_map;
#endif
};

/* Initialiser fragment for .dep_map; empty without CONFIG_DEBUG_LOCK_ALLOC. */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)        .dep_map = { .name = #lockname },
#else
#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)
#endif

/*
 * Define a statically initialised percpu_rw_semaphore called @name together
 * with its backing per-CPU counter __percpu_rwsem_rc_<name>.  @is_static is
 * pasted in front of the semaphore definition (either "static" or nothing).
 */
#define __DEFINE_PERCPU_RWSEM(name, is_static) \
static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name); \
is_static struct percpu_rw_semaphore name = { \
        .rss = __RCU_SYNC_INITIALIZER(name.rss), \
        .read_count = &__percpu_rwsem_rc_##name, \
        .writer = __RCUWAIT_INITIALIZER(name.writer), \
        .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters), \
        .block = ATOMIC_INIT(0), \
        __PERCPU_RWSEM_DEP_MAP_INIT(name) \
}

/* Externally visible and file-local variants of the definition above. */
#define DEFINE_PERCPU_RWSEM(name) \
        __DEFINE_PERCPU_RWSEM(name, /* not static */)
#define DEFINE_STATIC_PERCPU_RWSEM(name) \
        __DEFINE_PERCPU_RWSEM(name, static)

extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);

/*
 * Acquire @sem for reading.  May sleep (see might_sleep()).
 *
 * The lockdep read acquisition is recorded first.  Then, with preemption
 * disabled, the fast path just increments this CPU's read_count when
 * rcu_sync_is_idle(&sem->rss) holds; otherwise __percpu_down_read(sem, false)
 * is called.
 */
static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
{
        might_sleep();

        rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);

        preempt_disable();
        /*
         * We are in an RCU-sched read-side critical section, so the writer
         * cannot both change sem->state from readers_fast and start checking
         * counters while we are here. So if we see !sem->state, we know that
         * the writer won't be checking until we're past the preempt_enable()
         * and that once the synchronize_rcu() is done, the writer will see
         * anything we did within this RCU-sched read-side critical section.
         *
         * NOTE(review): "sem->state" / "readers_fast" do not appear in the
         * structure as declared in this header; the condition tested below
         * is rcu_sync_is_idle(&sem->rss). The wording above looks stale.
         */
        if (likely(rcu_sync_is_idle(&sem->rss)))
                this_cpu_inc(*sem->read_count);
        else
                __percpu_down_read(sem, false); /* Unconditional memory barrier */
        /*
         * The preempt_enable() prevents the compiler from
         * bleeding the critical section out.
         */
        preempt_enable();
}

/*
 * Attempt to acquire @sem for reading.
 *
 * Fast path is identical to percpu_down_read(); on the slow path the result
 * of __percpu_down_read(sem, true) decides the outcome.  The lockdep read
 * acquisition is recorded only when the lock was actually taken.
 *
 * Returns true if the read lock was acquired, false otherwise.
 */
static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
        bool ret = true;

        preempt_disable();
        /*
         * Same as in percpu_down_read().
         */
        if (likely(rcu_sync_is_idle(&sem->rss)))
                this_cpu_inc(*sem->read_count);
        else
                ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
        preempt_enable();
        /*
         * The barrier() from preempt_enable() prevents the compiler from
         * bleeding the critical section out.
         */

        if (ret)
                rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);

        return ret;
}

/*
 * Release a read lock on @sem.
 *
 * The lockdep release is recorded first.  With preemption disabled, the fast
 * path only decrements this CPU's read_count; the slow path issues a full
 * memory barrier before the decrement and then wakes the writer.
 */
static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
{
        rwsem_release(&sem->dep_map, _RET_IP_);

        preempt_disable();
        /*
         * Same as in percpu_down_read().
         */
        if (likely(rcu_sync_is_idle(&sem->rss))) {
                this_cpu_dec(*sem->read_count);
        } else {
                /*
                 * slowpath; reader will only ever wake a single blocked
                 * writer.
                 */
                smp_mb(); /* B matches C */
                /*
                 * In other words, if they see our decrement (presumably to
                 * aggregate zero, as that is the only time it matters) they
                 * will also see our critical section.
                 */
                this_cpu_dec(*sem->read_count);
                rcuwait_wake_up(&sem->writer);
        }
        preempt_enable();
}

extern bool percpu_is_read_locked(struct percpu_rw_semaphore *);
extern void percpu_down_write(struct percpu_rw_semaphore *);
extern void percpu_up_write(struct percpu_rw_semaphore *);

/* Report whether @sem->block currently reads as non-zero. */
static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem)
{
        return atomic_read(&sem->block) != 0;
}

extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
                                const char *, struct lock_class_key *);

extern void percpu_free_rwsem(struct percpu_rw_semaphore *);

/*
 * Initialise @sem at run time.  Each expansion site gets its own static
 * lock_class_key; the stringified @sem expression is passed as the name.
 * Evaluates to the return value of __percpu_init_rwsem().
 */
#define percpu_init_rwsem(sem) \
({ \
        static struct lock_class_key rwsem_key; \
        __percpu_init_rwsem(sem, #sem, &rwsem_key); \
})

#define percpu_rwsem_is_held(sem)        lockdep_is_held(sem)
#define percpu_rwsem_assert_held(sem)        lockdep_assert_held(sem)

/*
 * Tell lockdep that @sem has been released, attributing the release to @ip.
 * @read is accepted but not used here.
 */
static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
                                        bool read, unsigned long ip)
{
        struct lockdep_map *map = &sem->dep_map;

        lock_release(map, ip);
}

/*
 * Tell lockdep that @sem has been acquired, attributing the acquisition to
 * @ip.  @read is forwarded to lock_acquire() as its "read" argument.
 */
static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
                                        bool read, unsigned long ip)
{
        struct lockdep_map *map = &sem->dep_map;

        lock_acquire(map, 0, 1, read, 1, NULL, ip);
}

#endif


























   27 






















   27 

   26 











    2 












   26 












    4 
    5 



    5 













   13 
   26 

















   10 

    9 


















    8 
   20 



   18 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * AppArmor security module
 *
 * This file contains AppArmor contexts used to associate "labels" to objects.
 *
 * Copyright (C) 1998-2008 Novell/SUSE
 * Copyright 2009-2010 Canonical Ltd.
 */

#ifndef __AA_CONTEXT_H
#define __AA_CONTEXT_H

#include <linux/cred.h>
#include <linux/slab.h>
#include <linux/sched.h>

#include "label.h"
#include "policy_ns.h"
#include "task.h"

/*
 * Read the label pointer stored in @cred's security blob, at the offset
 * given by apparmor_blob_sizes.lbs_cred.  No reference is taken.
 */
static inline struct aa_label *cred_label(const struct cred *cred)
{
        struct aa_label **slot = cred->security + apparmor_blob_sizes.lbs_cred;

        AA_BUG(!slot);

        return *slot;
}

/*
 * Store @label in @cred's security blob, at the offset given by
 * apparmor_blob_sizes.lbs_cred.  Reference counts are not adjusted here.
 */
static inline void set_cred_label(const struct cred *cred,
                                  struct aa_label *label)
{
        struct aa_label **slot = cred->security + apparmor_blob_sizes.lbs_cred;

        AA_BUG(!slot);

        *slot = label;
}

/**
 * aa_cred_raw_label - obtain cred's label
 * @cred: cred to obtain label from  (NOT NULL)
 *
 * Returns: the confining label stored on @cred
 *
 * The reference count is NOT incremented.  A NULL label trips AA_BUG().
 */
static inline struct aa_label *aa_cred_raw_label(const struct cred *cred)
{
        struct aa_label *raw = cred_label(cred);

        AA_BUG(!raw);

        return raw;
}

/**
 * aa_get_newest_cred_label - obtain the newest label on a cred
 * @cred: cred to obtain label from (NOT NULL)
 *
 * Returns: newest version of the confining label, as produced by
 *          aa_get_newest_label() on the cred's raw label
 */
static inline struct aa_label *aa_get_newest_cred_label(const struct cred *cred)
{
        struct aa_label *raw = aa_cred_raw_label(cred);

        return aa_get_newest_label(raw);
}

/**
 * aa_current_raw_label - find the current task's confining label
 *
 * Returns: up to date confining label or the ns unconfined label (NOT NULL)
 *
 * This fn does not update the task's cred to the most up to date version of
 * the label, so it is safe to call when inside of locks.
 */
static inline struct aa_label *aa_current_raw_label(void)
{
        const struct cred *cred = current_cred();

        return aa_cred_raw_label(cred);
}

/**
 * aa_get_current_label - get the newest version of the current task's label
 *
 * Returns: newest version of confining label (NOT NULL)
 *
 * This fn does not update the task's cred, so it is safe inside of locks.
 *
 * The returned reference must be put with aa_put_label()
 */
static inline struct aa_label *aa_get_current_label(void)
{
        struct aa_label *cur = aa_current_raw_label();

        /* A stale label is replaced by its newest version. */
        return label_is_stale(cur) ? aa_get_newest_label(cur)
                                   : aa_get_label(cur);
}

/* The __ variant is an alias: both begin flavours are ended the same way. */
#define __end_current_label_crit_section(X) end_current_label_crit_section(X)

/**
 * end_current_label_crit_section - put a reference found with begin_current_label..
 * @label: label reference to put
 *
 * Should only be used with a reference obtained with
 * begin_current_label_crit_section and never used in situations where the
 * task cred may be updated.
 *
 * Nothing is put when @label is the current task's raw label.
 */
static inline void end_current_label_crit_section(struct aa_label *label)
{
        if (label == aa_current_raw_label())
                return;

        aa_put_label(label);
}

/**
 * __begin_current_label_crit_section - current's confining label
 *
 * Returns: up to date confining label or the ns unconfined label (NOT NULL)
 *
 * safe to call inside locks
 *
 * The returned reference must be put with __end_current_label_crit_section()
 * This must NOT be used if the task cred could be updated within the
 * critical section between __begin_current_label_crit_section() ..
 * __end_current_label_crit_section()
 */
static inline struct aa_label *__begin_current_label_crit_section(void)
{
        struct aa_label *cur = aa_current_raw_label();

        /* A label that is not stale is handed back as-is. */
        if (!label_is_stale(cur))
                return cur;

        return aa_get_newest_label(cur);
}

/**
 * begin_current_label_crit_section - current's confining label and update it
 *
 * Returns: up to date confining label or the ns unconfined label (NOT NULL)
 *
 * Not safe to call inside locks
 *
 * The returned reference must be put with end_current_label_crit_section()
 * This must NOT be used if the task cred could be updated within the
 * critical section between begin_current_label_crit_section() ..
 * end_current_label_crit_section()
 */
static inline struct aa_label *begin_current_label_crit_section(void)
{
        struct aa_label *cur = aa_current_raw_label();

        might_sleep();

        if (!label_is_stale(cur))
                return cur;

        cur = aa_get_newest_label(cur);

        /* On successful replacement the task cred keeps the reference. */
        if (aa_replace_current_label(cur) == 0)
                aa_put_label(cur);

        return cur;
}

/*
 * Return the namespace of the current task's label, obtained with
 * aa_get_ns() inside a __begin/__end current-label critical section.
 */
static inline struct aa_ns *aa_get_current_ns(void)
{
        struct aa_label *cur = __begin_current_label_crit_section();
        struct aa_ns *ns = aa_get_ns(labels_ns(cur));

        __end_current_label_crit_section(cur);

        return ns;
}

#endif /* __AA_CONTEXT_H */



























































































































































































































































    5 















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
/* SPDX-License-Identifier: GPL-2.0 */
/* thread_info.h: common low-level thread information accessors
 *
 * Copyright (C) 2002  David Howells (dhowells@redhat.com)
 * - Incorporating suggestions made by Linus Torvalds
 */

#ifndef _LINUX_THREAD_INFO_H
#define _LINUX_THREAD_INFO_H

#include <linux/types.h>
#include <linux/limits.h>
#include <linux/bug.h>
#include <linux/restart_block.h>
#include <linux/errno.h>

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the
 * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels,
 * including <asm/current.h> can cause a circular dependency on some platforms.
 */
#include <asm/current.h>
#define current_thread_info() ((struct thread_info *)current)
#endif

#include <linux/bitops.h>

/*
 * For per-arch arch_within_stack_frames() implementations, defined in
 * asm/thread_info.h.
 */
enum {
        /* the only negative result code */
        BAD_STACK = -1,
        /* zero; also what the generic arch_within_stack_frames() stub returns */
        NOT_STACK = 0,
        /* implicitly 1 */
        GOOD_FRAME,
        /* implicitly 2 */
        GOOD_STACK,
};

#ifdef CONFIG_GENERIC_ENTRY
/*
 * Bit numbers used with set_bit()/test_bit()/clear_bit() on the
 * syscall_work word of struct thread_info (see the *_syscall_work()
 * helpers later in this header).  Numbering starts at 0 and follows
 * declaration order.
 */
enum syscall_work_bit {
        SYSCALL_WORK_BIT_SECCOMP,
        SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT,
        SYSCALL_WORK_BIT_SYSCALL_TRACE,
        SYSCALL_WORK_BIT_SYSCALL_EMU,
        SYSCALL_WORK_BIT_SYSCALL_AUDIT,
        SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
        SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP,
};

/* Mask form of each bit above: SYSCALL_WORK_x == BIT(SYSCALL_WORK_BIT_x). */
#define SYSCALL_WORK_SECCOMP                BIT(SYSCALL_WORK_BIT_SECCOMP)
#define SYSCALL_WORK_SYSCALL_TRACEPOINT        BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
#define SYSCALL_WORK_SYSCALL_TRACE        BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
#define SYSCALL_WORK_SYSCALL_EMU        BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
#define SYSCALL_WORK_SYSCALL_AUDIT        BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
#define SYSCALL_WORK_SYSCALL_EXIT_TRAP        BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
#endif

#include <asm/thread_info.h>

#ifdef __KERNEL__

#ifndef arch_set_restart_data
#define arch_set_restart_data(restart) do { } while (0)
#endif

/*
 * Record @fn as the restart callback of @restart, let the architecture hook
 * arch_set_restart_data() see the block (a no-op unless the architecture
 * overrides it), and hand back -ERESTART_RESTARTBLOCK.
 */
static inline long set_restart_fn(struct restart_block *restart,
                                        long (*fn)(struct restart_block *))
{
        const long restart_code = -ERESTART_RESTARTBLOCK;

        restart->fn = fn;
        arch_set_restart_data(restart);

        return restart_code;
}

#ifndef THREAD_ALIGN
#define THREAD_ALIGN        THREAD_SIZE
#endif

#define THREADINFO_GFP                (GFP_KERNEL_ACCOUNT | __GFP_ZERO)

/*
 * flag set/clear/test wrappers
 * - pass TIF_xxxx constants to these functions
 */

/* Set bit @flag (a TIF_xxxx constant) in @ti->flags via set_bit(). */
static inline void set_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *flags_word = (unsigned long *)&ti->flags;

        set_bit(flag, flags_word);
}

/* Clear bit @flag (a TIF_xxxx constant) in @ti->flags via clear_bit(). */
static inline void clear_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *flags_word = (unsigned long *)&ti->flags;

        clear_bit(flag, flags_word);
}

/*
 * Force bit @flag in @ti->flags to match @value: set it when @value is
 * true, clear it otherwise.
 */
static inline void update_ti_thread_flag(struct thread_info *ti, int flag,
                                         bool value)
{
        if (!value) {
                clear_ti_thread_flag(ti, flag);
                return;
        }

        set_ti_thread_flag(ti, flag);
}

/*
 * Set bit @flag in @ti->flags and return what test_and_set_bit() reports
 * for it.
 */
static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *flags_word = (unsigned long *)&ti->flags;

        return test_and_set_bit(flag, flags_word);
}

/*
 * Clear bit @flag in @ti->flags and return what test_and_clear_bit()
 * reports for it.
 */
static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *flags_word = (unsigned long *)&ti->flags;

        return test_and_clear_bit(flag, flags_word);
}

/* Return the test_bit() result for bit @flag of @ti->flags. */
static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *flags_word = (unsigned long *)&ti->flags;

        return test_bit(flag, flags_word);
}

/*
 * This may be used in noinstr code, and needs to be __always_inline to prevent
 * inadvertent instrumentation.
 */
static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti)
{
        /* one READ_ONCE() load of the whole flags word */
        const unsigned long snapshot = READ_ONCE(ti->flags);

        return snapshot;
}

/*
 * Convenience forms of the *_ti_thread_flag() helpers that operate on
 * current_thread_info(), i.e. on the calling context's own thread_info.
 */
#define set_thread_flag(flag) \
        set_ti_thread_flag(current_thread_info(), flag)
#define clear_thread_flag(flag) \
        clear_ti_thread_flag(current_thread_info(), flag)
#define update_thread_flag(flag, value) \
        update_ti_thread_flag(current_thread_info(), flag, value)
#define test_and_set_thread_flag(flag) \
        test_and_set_ti_thread_flag(current_thread_info(), flag)
#define test_and_clear_thread_flag(flag) \
        test_and_clear_ti_thread_flag(current_thread_info(), flag)
#define test_thread_flag(flag) \
        test_ti_thread_flag(current_thread_info(), flag)
#define read_thread_flags() \
        read_ti_thread_flags(current_thread_info())

/* Same read, but for the thread_info of an arbitrary task @t. */
#define read_task_thread_flags(t) \
        read_ti_thread_flags(task_thread_info(t))

/*
 * Syscall-work flag accessors.  @fl is the bare flag name (e.g. SECCOMP);
 * token pasting turns it into SYSCALL_WORK_BIT_<fl> or TIF_<fl>.
 *
 * With CONFIG_GENERIC_ENTRY the bit lives in thread_info::syscall_work;
 * without it the same names fall back to the TIF_<fl> bit in
 * thread_info::flags through the *_ti_thread_flag() helpers.
 */
#ifdef CONFIG_GENERIC_ENTRY
#define set_syscall_work(fl) \
        set_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)
#define test_syscall_work(fl) \
        test_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)
#define clear_syscall_work(fl) \
        clear_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)

/* Variants acting on task @t instead of the current thread. */
#define set_task_syscall_work(t, fl) \
        set_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)
#define test_task_syscall_work(t, fl) \
        test_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)
#define clear_task_syscall_work(t, fl) \
        clear_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)

#else /* CONFIG_GENERIC_ENTRY */

#define set_syscall_work(fl)                                                \
        set_ti_thread_flag(current_thread_info(), TIF_##fl)
#define test_syscall_work(fl) \
        test_ti_thread_flag(current_thread_info(), TIF_##fl)
#define clear_syscall_work(fl) \
        clear_ti_thread_flag(current_thread_info(), TIF_##fl)

/* Variants acting on task @t instead of the current thread. */
#define set_task_syscall_work(t, fl) \
        set_ti_thread_flag(task_thread_info(t), TIF_##fl)
#define test_task_syscall_work(t, fl) \
        test_ti_thread_flag(task_thread_info(t), TIF_##fl)
#define clear_task_syscall_work(t, fl) \
        clear_ti_thread_flag(task_thread_info(t), TIF_##fl)
#endif /* !CONFIG_GENERIC_ENTRY */

#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H

/*
 * Report whether TIF_NEED_RESCHED is set for the current thread.  This
 * variant goes through arch_test_bit() rather than test_bit().
 */
static __always_inline bool tif_need_resched(void)
{
        unsigned long *flags_word =
                (unsigned long *)(&current_thread_info()->flags);

        return arch_test_bit(TIF_NEED_RESCHED, flags_word);
}

#else

/*
 * Report whether TIF_NEED_RESCHED is set for the current thread, using
 * plain test_bit().
 */
static __always_inline bool tif_need_resched(void)
{
        unsigned long *flags_word =
                (unsigned long *)(&current_thread_info()->flags);

        return test_bit(TIF_NEED_RESCHED, flags_word);
}

#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */

#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
/*
 * Generic fallback for architectures that do not provide their own
 * arch_within_stack_frames().  It ignores all four arguments and always
 * returns 0, which is the value of NOT_STACK in the result enum above.
 */
static inline int arch_within_stack_frames(const void * const stack,
                                           const void * const stackend,
                                           const void *obj, unsigned long len)
{
        return 0;
}
#endif

#ifdef CONFIG_HARDENED_USERCOPY
extern void __check_object_size(const void *ptr, unsigned long n,
                                        bool to_user);

/*
 * Hand (@ptr, @n, @to_user) to __check_object_size(), but only when @n is
 * not a compile-time constant; constant sizes skip the call entirely.
 */
static __always_inline void check_object_size(const void *ptr, unsigned long n,
                                              bool to_user)
{
        if (__builtin_constant_p(n))
                return;

        __check_object_size(ptr, n, to_user);
}
#else
/* !CONFIG_HARDENED_USERCOPY: object size checking compiles to nothing. */
static inline void check_object_size(const void *ptr, unsigned long n,
                                     bool to_user)
{ }
#endif /* CONFIG_HARDENED_USERCOPY */

extern void __compiletime_error("copy source size is too small")
__bad_copy_from(void);
extern void __compiletime_error("copy destination size is too small")
__bad_copy_to(void);

void __copy_overflow(int size, unsigned long count);

/*
 * Forward an overflowing copy (@size available, @count requested) to
 * __copy_overflow().  Does nothing when CONFIG_BUG is disabled.
 */
static inline void copy_overflow(int size, unsigned long count)
{
        if (!IS_ENABLED(CONFIG_BUG))
                return;

        __copy_overflow(size, count);
}

/*
 * check_copy_size - sanity-check a copy of @bytes bytes to/from @addr
 * @addr:      kernel-side buffer of the copy
 * @bytes:     number of bytes about to be copied
 * @is_source: true when @addr is the source of the copy, false when it is
 *             the destination; selects which compile-time error is used and
 *             is passed on to check_object_size()
 *
 * Returns true when the copy may proceed, false when it must be refused.
 */
static __always_inline __must_check bool
check_copy_size(const void *addr, size_t bytes, bool is_source)
{
        /*
         * Compiler's view of the object size.  Per the GCC documentation
         * the builtin yields (size_t)-1 when the size is unknown, which
         * lands here as a negative int and is filtered by the sz >= 0 test.
         */
        int sz = __builtin_object_size(addr, 0);
        if (unlikely(sz >= 0 && sz < bytes)) {
                /* runtime-sized copy: report through copy_overflow() */
                if (!__builtin_constant_p(bytes))
                        copy_overflow(sz, bytes);
                /*
                 * constant-sized copy: the calls below are declared with
                 * __compiletime_error, so reaching them breaks the build
                 */
                else if (is_source)
                        __bad_copy_from();
                else
                        __bad_copy_to();
                return false;
        }
        /* refuse (and warn once about) sizes above INT_MAX */
        if (WARN_ON_ONCE(bytes > INT_MAX))
                return false;
        check_object_size(addr, bytes, is_source);
        return true;
}

#ifndef arch_setup_new_exec
/* Default no-op, used when the architecture does not define its own hook. */
static inline void arch_setup_new_exec(void) { }
#endif

void arch_task_cache_init(void); /* for CONFIG_SH */
void arch_release_task_struct(struct task_struct *tsk);
int arch_dup_task_struct(struct task_struct *dst,
                                struct task_struct *src);

#endif        /* __KERNEL__ */

#endif /* _LINUX_THREAD_INFO_H */





















































































































    1 







    1 


    1 







































































    1 






























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * AEAD: Authenticated Encryption with Associated Data
 *
 * This file provides API support for AEAD algorithms.
 *
 * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/internal/aead.h>
#include <linux/cryptouser.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <net/netlink.h>

#include "internal.h"

/*
 * Slow path of crypto_aead_setkey() for a key pointer that does not satisfy
 * the transform's alignment mask: copy the key into a freshly allocated,
 * suitably aligned bounce buffer, call the algorithm's ->setkey() on the
 * copy, then release the buffer with kfree_sensitive().
 *
 * Returns -ENOMEM if the bounce buffer cannot be allocated, otherwise the
 * algorithm's ->setkey() result.
 */
static int setkey_unaligned(struct crypto_aead *tfm, const u8 *key,
                            unsigned int keylen)
{
        unsigned long mask = crypto_aead_alignmask(tfm);
        /* room for the key plus worst-case alignment slack */
        unsigned long bounce_len = keylen + mask;
        u8 *bounce, *aligned;
        int err;

        bounce = kmalloc(bounce_len, GFP_ATOMIC);
        if (!bounce)
                return -ENOMEM;

        aligned = (u8 *)ALIGN((unsigned long)bounce, mask + 1);
        memcpy(aligned, key, keylen);
        err = crypto_aead_alg(tfm)->setkey(tfm, aligned, keylen);

        kfree_sensitive(bounce);
        return err;
}

int crypto_aead_setkey(struct crypto_aead *tfm,
                       const u8 *key, unsigned int keylen)
{
        unsigned long alignmask = crypto_aead_alignmask(tfm);
        int err;

        if ((unsigned long)key & alignmask)
                err = setkey_unaligned(tfm, key, keylen);
        else
                err = crypto_aead_alg(tfm)->setkey(tfm, key, keylen);

        if (unlikely(err)) {
                crypto_aead_set_flags(tfm, CRYPTO_TFM_NEED_KEY);
                return err;
        }

        crypto_aead_clear_flags(tfm, CRYPTO_TFM_NEED_KEY);
        return 0;
}
EXPORT_SYMBOL_GPL(crypto_aead_setkey);

/**
 * crypto_aead_setauthsize() - choose the authentication tag size
 * @tfm:      transform to configure
 * @authsize: requested tag size
 *
 * @authsize must not exceed the algorithm's maximum, and may only be zero
 * when that maximum is itself zero.  If the algorithm has a ->setauthsize()
 * hook it is consulted first; @tfm->authsize is updated only when
 * everything succeeded.
 *
 * Return: 0 on success, -EINVAL for an unacceptable size, or the hook's
 * error code.
 */
int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
{
        const unsigned int limit = crypto_aead_maxauthsize(tfm);

        if (authsize > limit)
                return -EINVAL;
        if (!authsize && limit)
                return -EINVAL;

        if (crypto_aead_alg(tfm)->setauthsize) {
                int err = crypto_aead_alg(tfm)->setauthsize(tfm, authsize);

                if (err)
                        return err;
        }

        tfm->authsize = authsize;
        return 0;
}
EXPORT_SYMBOL_GPL(crypto_aead_setauthsize);

int crypto_aead_encrypt(struct aead_request *req)
{
        struct crypto_aead *aead = crypto_aead_reqtfm(req);

        if (crypto_aead_get_flags(aead) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        return crypto_aead_alg(aead)->encrypt(req);
}
EXPORT_SYMBOL_GPL(crypto_aead_encrypt);

int crypto_aead_decrypt(struct aead_request *req)
{
        struct crypto_aead *aead = crypto_aead_reqtfm(req);

        if (crypto_aead_get_flags(aead) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        if (req->cryptlen < crypto_aead_authsize(aead))
                return -EINVAL;

        return crypto_aead_alg(aead)->decrypt(req);
}
EXPORT_SYMBOL_GPL(crypto_aead_decrypt);

static void crypto_aead_exit_tfm(struct crypto_tfm *tfm)
{
        struct crypto_aead *aead = __crypto_aead_cast(tfm);
        struct aead_alg *alg = crypto_aead_alg(aead);

        alg->exit(aead);
}

/*
 * Generic tfm init hook for AEADs: mark the transform as needing a key,
 * default its authsize to the algorithm's maximum, wire up the exit hook
 * if the algorithm has one, then run the algorithm's ->init() if present.
 *
 * Returns the ->init() result, or 0 when the algorithm has no ->init().
 */
static int crypto_aead_init_tfm(struct crypto_tfm *tfm)
{
        struct crypto_aead *aead_tfm = __crypto_aead_cast(tfm);
        struct aead_alg *impl = crypto_aead_alg(aead_tfm);

        crypto_aead_set_flags(aead_tfm, CRYPTO_TFM_NEED_KEY);
        aead_tfm->authsize = impl->maxauthsize;

        if (impl->exit)
                aead_tfm->base.exit = crypto_aead_exit_tfm;

        return impl->init ? impl->init(aead_tfm) : 0;
}

/*
 * ->report hook: describe @alg as a CRYPTOCFGA_REPORT_AEAD netlink
 * attribute appended to @skb.  Returns the nla_put() result.
 */
static int __maybe_unused crypto_aead_report(
        struct sk_buff *skb, struct crypto_alg *alg)
{
        struct aead_alg *impl = container_of(alg, struct aead_alg, base);
        struct crypto_report_aead report;

        /* zero the whole struct first; it is copied out verbatim below */
        memset(&report, 0, sizeof(report));

        strscpy(report.type, "aead", sizeof(report.type));
        strscpy(report.geniv, "<none>", sizeof(report.geniv));

        report.blocksize = alg->cra_blocksize;
        report.maxauthsize = impl->maxauthsize;
        report.ivsize = impl->ivsize;

        return nla_put(skb, CRYPTOCFGA_REPORT_AEAD, sizeof(report), &report);
}

static void crypto_aead_show(struct seq_file *m, struct crypto_alg *alg)
        __maybe_unused;
static void crypto_aead_show(struct seq_file *m, struct crypto_alg *alg)
{
        struct aead_alg *aead = container_of(alg, struct aead_alg, base);

        seq_printf(m, "type         : aead\n");
        seq_printf(m, "async        : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ?
                                             "yes" : "no");
        seq_printf(m, "blocksize    : %u\n", alg->cra_blocksize);
        seq_printf(m, "ivsize       : %u\n", aead->ivsize);
        seq_printf(m, "maxauthsize  : %u\n", aead->maxauthsize);
        seq_printf(m, "geniv        : <none>\n");
}

/* ->free hook: dispatch to the AEAD instance's own ->free() callback. */
static void crypto_aead_free_instance(struct crypto_instance *inst)
{
        struct aead_instance *aead_inst = aead_instance(inst);

        aead_inst->free(aead_inst);
}

/*
 * Frontend type descriptor shared by every AEAD algorithm: it is installed
 * as cra_type by aead_prepare_alg() and used as the frontend for spawn
 * grabbing, tfm allocation and algorithm lookup in this file.
 */
static const struct crypto_type crypto_aead_type = {
        .extsize = crypto_alg_extsize,
        .init_tfm = crypto_aead_init_tfm,
        .free = crypto_aead_free_instance,
        /* optional hooks; the callbacks are __maybe_unused for this reason */
#ifdef CONFIG_PROC_FS
        .show = crypto_aead_show,
#endif
#if IS_ENABLED(CONFIG_CRYPTO_USER)
        .report = crypto_aead_report,
#endif
        .maskclear = ~CRYPTO_ALG_TYPE_MASK,
        .maskset = CRYPTO_ALG_TYPE_MASK,
        .type = CRYPTO_ALG_TYPE_AEAD,
        .tfmsize = offsetof(struct crypto_aead, base),
};

/**
 * crypto_grab_aead() - grab an AEAD algorithm for use by a template instance
 * @spawn: AEAD spawn to fill in
 * @inst:  instance the spawn belongs to
 * @name:  algorithm name to look up
 * @type:  algorithm type bits passed to the lookup
 * @mask:  algorithm mask bits passed to the lookup
 *
 * Tags the spawn with the AEAD frontend and defers to crypto_grab_spawn().
 *
 * Return: the crypto_grab_spawn() result.
 */
int crypto_grab_aead(struct crypto_aead_spawn *spawn,
                     struct crypto_instance *inst,
                     const char *name, u32 type, u32 mask)
{
        struct crypto_spawn *base = &spawn->base;

        base->frontend = &crypto_aead_type;

        return crypto_grab_spawn(base, inst, name, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_grab_aead);

/**
 * crypto_alloc_aead() - allocate an AEAD transform by algorithm name
 * @alg_name: name of the algorithm to instantiate
 * @type:     algorithm type bits passed to the lookup
 * @mask:     algorithm mask bits passed to the lookup
 *
 * Thin wrapper around crypto_alloc_tfm() with the AEAD frontend type.
 *
 * Return: whatever crypto_alloc_tfm() returns.
 */
struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask)
{
        return crypto_alloc_tfm(alg_name, &crypto_aead_type, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_alloc_aead);

/**
 * crypto_has_aead() - query whether an AEAD algorithm is available
 * @alg_name: name of the algorithm to look for
 * @type:     algorithm type bits passed to the lookup
 * @mask:     algorithm mask bits passed to the lookup
 *
 * Thin wrapper around crypto_type_has_alg() with the AEAD frontend type.
 *
 * Return: whatever crypto_type_has_alg() returns.
 */
int crypto_has_aead(const char *alg_name, u32 type, u32 mask)
{
        return crypto_type_has_alg(alg_name, &crypto_aead_type, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_has_aead);

/*
 * Validate and normalise @alg before registration.
 *
 * Rejects algorithms whose maxauthsize, ivsize or chunksize exceeds
 * PAGE_SIZE / 8, defaults a zero chunksize to the block size, and stamps
 * the base algorithm with the AEAD frontend type and type flag.
 *
 * Returns 0 on success or -EINVAL for oversized parameters.
 */
static int aead_prepare_alg(struct aead_alg *alg)
{
        struct crypto_alg *base = &alg->base;
        const unsigned int size_limit = PAGE_SIZE / 8;

        if (max3(alg->maxauthsize, alg->ivsize, alg->chunksize) > size_limit)
                return -EINVAL;

        if (!alg->chunksize)
                alg->chunksize = base->cra_blocksize;

        base->cra_type = &crypto_aead_type;
        /* replace whatever type bits were there with the AEAD type */
        base->cra_flags = (base->cra_flags & ~CRYPTO_ALG_TYPE_MASK) |
                          CRYPTO_ALG_TYPE_AEAD;

        return 0;
}

/**
 * crypto_register_aead() - register one AEAD algorithm
 * @alg: algorithm to register
 *
 * Return: the aead_prepare_alg() error if preparation fails, otherwise the
 * crypto_register_alg() result.
 */
int crypto_register_aead(struct aead_alg *alg)
{
        int err = aead_prepare_alg(alg);

        if (!err)
                err = crypto_register_alg(&alg->base);

        return err;
}
EXPORT_SYMBOL_GPL(crypto_register_aead);

/**
 * crypto_unregister_aead() - unregister one AEAD algorithm
 * @alg: algorithm to unregister
 *
 * Passes the embedded base algorithm to crypto_unregister_alg().
 */
void crypto_unregister_aead(struct aead_alg *alg)
{
        crypto_unregister_alg(&alg->base);
}
EXPORT_SYMBOL_GPL(crypto_unregister_aead);

/**
 * crypto_register_aeads() - register an array of AEAD algorithms
 * @algs:  array of algorithms
 * @count: number of entries in @algs
 *
 * Registers the entries in order.  If one of them fails, every entry that
 * was already registered is unregistered again in reverse order.
 *
 * Return: 0 when all entries were registered, otherwise the error of the
 * first failing registration.
 */
int crypto_register_aeads(struct aead_alg *algs, int count)
{
        int i, ret;

        for (i = 0; i < count; i++) {
                ret = crypto_register_aead(&algs[i]);
                if (!ret)
                        continue;

                /* roll back entries i-1 .. 0 */
                while (i-- > 0)
                        crypto_unregister_aead(&algs[i]);

                return ret;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_aeads);

/**
 * crypto_unregister_aeads() - unregister an array of AEAD algorithms
 * @algs:  array of algorithms
 * @count: number of entries in @algs
 *
 * Entries are unregistered from the last one down to the first.
 */
void crypto_unregister_aeads(struct aead_alg *algs, int count)
{
        int remaining = count;

        while (remaining-- > 0)
                crypto_unregister_aead(&algs[remaining]);
}
EXPORT_SYMBOL_GPL(crypto_unregister_aeads);

/**
 * aead_register_instance() - register an AEAD template instance
 * @tmpl: template the instance was created from
 * @inst: instance to register; must provide a ->free() callback
 *
 * Return: -EINVAL (with a WARN) when @inst has no ->free(), the
 * aead_prepare_alg() error if preparation fails, otherwise the
 * crypto_register_instance() result.
 */
int aead_register_instance(struct crypto_template *tmpl,
                           struct aead_instance *inst)
{
        int err;

        if (WARN_ON(!inst->free))
                return -EINVAL;

        err = aead_prepare_alg(&inst->alg);
        if (!err)
                err = crypto_register_instance(tmpl,
                                               aead_crypto_instance(inst));

        return err;
}
EXPORT_SYMBOL_GPL(aead_register_instance);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Authenticated Encryption with Associated Data (AEAD)");




































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
// SPDX-License-Identifier: GPL-2.0
/*
  File: fs/ext4/xattr.h

  On-disk format of extended attributes for the ext4 filesystem.

  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/

#include <linux/xattr.h>

/* Magic value in attribute blocks */
#define EXT4_XATTR_MAGIC                0xEA020000

/* Maximum number of references to one attribute block */
#define EXT4_XATTR_REFCOUNT_MAX                1024

/* Name indexes */
#define EXT4_XATTR_INDEX_USER                        1
#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS        2
#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT        3
#define EXT4_XATTR_INDEX_TRUSTED                4
#define        EXT4_XATTR_INDEX_LUSTRE                        5
#define EXT4_XATTR_INDEX_SECURITY                6
#define EXT4_XATTR_INDEX_SYSTEM                        7
#define EXT4_XATTR_INDEX_RICHACL                8
#define EXT4_XATTR_INDEX_ENCRYPTION                9
#define EXT4_XATTR_INDEX_HURD                        10 /* Reserved for Hurd */

/*
 * Header at the start of an extended attribute block; BHDR() overlays it
 * on a buffer_head's b_data, and BFIRST() places the first entry directly
 * after it.
 */
struct ext4_xattr_header {
        __le32        h_magic;        /* magic number for identification */
        __le32        h_refcount;        /* reference count */
        __le32        h_blocks;        /* number of disk blocks used */
        __le32        h_hash;                /* hash value of all attributes */
        __le32        h_checksum;        /* crc32c(uuid+id+xattrblock) */
                                /* id = inum if refcount=1, blknum otherwise */
        __u32        h_reserved[3];        /* zero right now */
};

/*
 * Header of the in-inode xattr area.  IHDR() locates it inside the raw
 * inode at EXT4_GOOD_OLD_INODE_SIZE + i_extra_isize, and IFIRST() places
 * the first entry directly after it.
 */
struct ext4_xattr_ibody_header {
        __le32        h_magic;        /* magic number for identification */
};

/*
 * One attribute entry.  The fixed part is followed by e_name_len bytes of
 * name; EXT4_XATTR_LEN() gives the padded size of the whole entry and
 * EXT4_XATTR_NEXT() steps to the following one.  IS_LAST_ENTRY() treats an
 * entry whose first 32 bits are zero as the end of the list.
 */
struct ext4_xattr_entry {
        __u8        e_name_len;        /* length of name */
        __u8        e_name_index;        /* attribute name index */
        __le16        e_value_offs;        /* offset in disk block of value */
        __le32        e_value_inum;        /* inode in which the value is stored */
        __le32        e_value_size;        /* size of attribute value */
        __le32        e_hash;                /* hash value of name and value */
        char        e_name[];        /* attribute name */
};

/*
 * Padding arithmetic for xattr entries and values.  Sizes are rounded up
 * to a multiple of EXT4_XATTR_PAD (1 << 2 = 4 bytes); EXT4_XATTR_ROUND is
 * the corresponding low-bit mask (3).
 */
#define EXT4_XATTR_PAD_BITS                2
#define EXT4_XATTR_PAD                (1<<EXT4_XATTR_PAD_BITS)
#define EXT4_XATTR_ROUND                (EXT4_XATTR_PAD-1)
/* padded on-disk size of an entry whose name is name_len bytes long */
#define EXT4_XATTR_LEN(name_len) \
        (((name_len) + EXT4_XATTR_ROUND + \
        sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
/* entry that immediately follows @entry (evaluates @entry twice) */
#define EXT4_XATTR_NEXT(entry) \
        ((struct ext4_xattr_entry *)( \
         (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)))
/* @size rounded up to the next multiple of EXT4_XATTR_PAD */
#define EXT4_XATTR_SIZE(size) \
        (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)

/*
 * IHDR - locate the in-inode xattr header within a raw inode.
 * @inode:     in-memory inode, used only to read EXT4_I(inode)->i_extra_isize
 * @raw_inode: pointer to the start of the raw inode
 *
 * The header sits EXT4_GOOD_OLD_INODE_SIZE + i_extra_isize bytes past
 * @raw_inode.  @raw_inode is parenthesized before the cast so that an
 * expression argument (e.g. "p + 1") is evaluated as a whole instead of
 * having only its first operand cast to void *.  Byte-wise arithmetic on
 * void * is a GNU C extension.
 */
#define IHDR(inode, raw_inode) \
        ((struct ext4_xattr_ibody_header *) \
                ((void *)(raw_inode) + \
                EXT4_GOOD_OLD_INODE_SIZE + \
                EXT4_I(inode)->i_extra_isize))
#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))

/*
 * XATTR_SIZE_MAX is currently 64k, but for the purposes of checking
 * for file system consistency errors, we use a somewhat bigger value.
 * This allows XATTR_SIZE_MAX to grow in the future, but by using this
 * instead of INT_MAX for certain consistency checks, we don't need to
 * worry about arithmetic overflows.  (Actually XATTR_SIZE_MAX is
 * defined in include/uapi/linux/limits.h, so changing it is not
 * going to be trivial....)
 */
#define EXT4_XATTR_SIZE_MAX (1 << 24)

/*
 * The minimum size of EA value when you start storing it in an external inode
 * size of block - size of header - size of 1 entry - 4 null bytes
 */
#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b)                                        \
        ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4)

/* xattr block header stored at the start of buffer_head @bh's data */
#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
/* view an arbitrary pointer as an xattr entry */
#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
/* first entry of an xattr block: directly after the block header */
#define BFIRST(bh) ENTRY(BHDR(bh)+1)
/* list terminator test: the first 32 bits of the entry are all zero */
#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)

#define EXT4_ZERO_XATTR_VALUE ((void *)-1)

/*
 * If we want to add an xattr to the inode, we should make sure that
 * i_extra_isize is not 0 and that the inode size is not less than
 * EXT4_GOOD_OLD_INODE_SIZE + extra_isize + pad.
 *   EXT4_GOOD_OLD_INODE_SIZE   extra_isize header   entry   pad  data
 * |--------------------------|------------|------|---------|---|-------|
 */
#define EXT4_INODE_HAS_XATTR_SPACE(inode)                                \
        ((EXT4_I(inode)->i_extra_isize != 0) &&                                \
         (EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize +        \
          sizeof(struct ext4_xattr_ibody_header) + EXT4_XATTR_PAD <=        \
          EXT4_INODE_SIZE((inode)->i_sb)))

/*
 * Description of one attribute as passed to the ibody find/set helpers
 * declared below.  NOTE(review): field meanings are inferred from their
 * names and the matching on-disk fields; confirm against xattr.c.
 */
struct ext4_xattr_info {
        const char *name;        /* attribute name */
        const void *value;        /* attribute value buffer */
        size_t value_len;        /* length of @value in bytes */
        int name_index;                /* presumably an EXT4_XATTR_INDEX_* value */
        int in_inode;                /* presumably: value kept in a separate inode */
};

/*
 * Cursor over a region holding xattr entries.  NOTE(review): the roles
 * below are inferred from the field names; confirm against the users in
 * xattr.c.
 */
struct ext4_xattr_search {
        struct ext4_xattr_entry *first;        /* presumably first entry of the region */
        void *base;                        /* presumably base for value offsets */
        void *end;                        /* presumably end of the region */
        struct ext4_xattr_entry *here;        /* presumably current/matching entry */
        int not_found;                        /* presumably non-zero when lookup failed */
};

/*
 * State for ext4_xattr_ibody_find()/ext4_xattr_ibody_set(): a search
 * cursor plus an ext4_iloc (presumably locating the inode being
 * searched; confirm against xattr.c).
 */
struct ext4_xattr_ibody_find {
        struct ext4_xattr_search s;
        struct ext4_iloc iloc;
};

/*
 * Counted array of inode pointers; filled through
 * ext4_xattr_delete_inode() and released with
 * ext4_xattr_inode_array_free().
 */
struct ext4_xattr_inode_array {
        unsigned int count;                /* # of used items in the array */
        struct inode *inodes[];
};

extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
extern const struct xattr_handler ext4_xattr_security_handler;
extern const struct xattr_handler ext4_xattr_hurd_handler;

#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"

/*
 * The EXT4_STATE_NO_EXPAND is overloaded and used for two purposes.
 * The first is to signal that the inline xattrs and data are
 * taking up so much space that we might as well not keep trying to
 * expand it.  The second is that xattr_sem is taken for writing, so
 * we shouldn't try to recurse into the inode expansion.  For this
 * second case, we need to make sure that we save and restore the
 * NO_EXPAND state flag appropriately.
 */
/*
 * Take @inode's xattr_sem for writing, store the previous
 * EXT4_STATE_NO_EXPAND state in *@save, and set the flag.
 */
static inline void ext4_write_lock_xattr(struct inode *inode, int *save)
{
        struct rw_semaphore *sem = &EXT4_I(inode)->xattr_sem;

        down_write(sem);

        /* remember the old flag so the unlock side can restore it */
        *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
        ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
}

/*
 * Non-blocking form of ext4_write_lock_xattr().
 *
 * Returns 0 without touching *@save or the inode state when xattr_sem
 * could not be taken, 1 when the lock was acquired (with *@save and
 * EXT4_STATE_NO_EXPAND handled as in the blocking variant).
 */
static inline int ext4_write_trylock_xattr(struct inode *inode, int *save)
{
        struct rw_semaphore *sem = &EXT4_I(inode)->xattr_sem;

        if (!down_write_trylock(sem))
                return 0;

        *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
        ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);

        return 1;
}

/*
 * Undo ext4_write_lock_xattr(): clear EXT4_STATE_NO_EXPAND again unless
 * *@save says it was already set before locking, then release xattr_sem.
 */
static inline void ext4_write_unlock_xattr(struct inode *inode, int *save)
{
        const int was_set = *save;

        if (!was_set)
                ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);

        up_write(&EXT4_I(inode)->xattr_sem);
}

extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);

extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
                                  bool is_create, int *credits);
extern int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
                                struct buffer_head *block_bh, size_t value_len,
                                bool is_create);

extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
                                   struct ext4_xattr_inode_array **array,
                                   int extra_credits);
extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);

extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
                            struct ext4_inode *raw_inode, handle_t *handle);
extern void ext4_evict_ea_inode(struct inode *inode);

extern const struct xattr_handler * const ext4_xattr_handlers[];

extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
                                 struct ext4_xattr_ibody_find *is);
extern int ext4_xattr_ibody_get(struct inode *inode, int name_index,
                                const char *name,
                                void *buffer, size_t buffer_size);
extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
                                struct ext4_xattr_info *i,
                                struct ext4_xattr_ibody_find *is);

extern struct mb_cache *ext4_xattr_create_cache(void);
extern void ext4_xattr_destroy_cache(struct mb_cache *);

#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
                              struct inode *dir, const struct qstr *qstr);
#else
/* !CONFIG_EXT4_FS_SECURITY: nothing to initialise, always reports success. */
static inline int ext4_init_security(handle_t *handle, struct inode *inode,
                                     struct inode *dir, const struct qstr *qstr)
{
        return 0;
}
#endif

#ifdef CONFIG_LOCKDEP
extern void ext4_xattr_inode_set_class(struct inode *ea_inode);
#else
/* !CONFIG_LOCKDEP: empty stub. */
static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { }
#endif

extern int ext4_get_inode_usage(struct inode *inode, qsize_t *usage);




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






    1 





    1 












    1 







    1 







    1 

















































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:        Implementation of BSD Unix domain sockets.
 *
 * Authors:        Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *                Linus Torvalds        :        Assorted bug cures.
 *                Niibe Yutaka        :        async I/O support.
 *                Carsten Paeth        :        PF_UNIX check, address fixes.
 *                Alan Cox        :        Limit size of allocated blocks.
 *                Alan Cox        :        Fixed the stupid socketpair bug.
 *                Alan Cox        :        BSD compatibility fine tuning.
 *                Alan Cox        :        Fixed a bug in connect when interrupted.
 *                Alan Cox        :        Sorted out a proper draft version of
 *                                        file descriptor passing hacked up from
 *                                        Mike Shaver's work.
 *                Marty Leisner        :        Fixes to fd passing
 *                Nick Nevin        :        recvmsg bugfix.
 *                Alan Cox        :        Started proper garbage collector
 *                Heiko EiBfeldt        :        Missing verify_area check
 *                Alan Cox        :        Started POSIXisms
 *                Andreas Schwab        :        Replace inode by dentry for proper
 *                                        reference counting
 *                Kirk Petersen        :        Made this a module
 *            Christoph Rohland        :        Elegant non-blocking accept/connect algorithm.
 *                                        Lots of bug fixes.
 *             Alexey Kuznetsov        :        Repaired (I hope) bugs introduced
 *                                        by the above two patches.
 *             Andrea Arcangeli        :        If possible we block in connect(2)
 *                                        if the max backlog of the listen socket
 *                                        has been reached. This won't break
 *                                        old apps and it will avoid a huge number
 *                                        of socks hashed (this is for unix_gc()
 *                                        performance reasons).
 *                                        Security fix that limits the max
 *                                        number of socks to 2*max_files and
 *                                        the number of skb queueable in the
 *                                        dgram receiver.
 *                Artur Skawina   :        Hash function optimizations
 *             Alexey Kuznetsov   :        Full scale SMP. Lot of bugs are introduced 8)
 *              Malcolm Beattie   :        Set peercred for socketpair
 *             Michal Ostrowski   :       Module initialization cleanup.
 *             Arnaldo C. Melo        :        Remove MOD_{INC,DEC}_USE_COUNT,
 *                                             the core infrastructure is doing that
 *                                             for all net proto families now (2.5.69+)
 *
 * Known differences from reference BSD that was tested:
 *
 *        [TO FIX]
 *        ECONNREFUSED is not returned from one end of a connected() socket to the
 *                other the moment one end closes.
 *        fstat() doesn't return st_dev=0, and give the blksize as high water mark
 *                and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *        [NOT TO FIX]
 *        accept() returns a path name even if the connecting socket has closed
 *                in the meantime (BSD loses the path and gives up).
 *        accept() returns 0 length path for an unbound connector. BSD returns 16
 *                and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *        socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *        BSD af_unix apparently has connect forgetting to block properly.
 *                (need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *        Bug fixes and improvements.
 *                - client shutdown killed server socket.
 *                - removed all useless cli/sti pairs.
 *
 *        Semantic changes/extensions.
 *                - generic control message passing.
 *                - SCM_CREDENTIALS control message.
 *                - "Abstract" (not FS based) socket bindings.
 *                  Abstract names are sequences of bytes (not zero terminated)
 *                  started by 0, so that this name space does not intersect
 *                  with BSD names.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/af_unix.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/rtnetlink.h>
#include <linux/mount.h>
#include <net/checksum.h>
#include <linux/security.h>
#include <linux/splice.h>
#include <linux/freezer.h>
#include <linux/file.h>
#include <linux/btf_ids.h>
#include <linux/bpf-cgroup.h>

/* Atomic counter; presumably the number of AF_UNIX sockets currently
 * allocated (its updates are outside this part of the file - TODO confirm).
 */
static atomic_long_t unix_nr_socks;
/* Hash buckets that sockets are linked into through sk_bind_node,
 * indexed by sk->sk_hash (see unix_insert_bsd_socket()).
 */
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
/* One spinlock per bucket of bsd_socket_buckets[], same index. */
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */
#ifdef CONFIG_PROVE_LOCKING
/* Three-way comparison: evaluates to -1, 0 or 1 when l is below, equal to
 * or above r.  Both arguments are evaluated twice.
 */
#define cmp_ptr(l, r)        (((l) > (r)) - ((l) < (r)))

/* Lock comparison callback: orders two lockdep maps purely by their
 * addresses.  Returns -1, 0 or 1 for a below, equal to or above b.
 */
static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
                                  const struct lockdep_map *b)
{
        const int order = cmp_ptr(a, b);

        return order;
}

/* Lock comparison callback for the per-socket state lock (unix_sock::lock).
 *
 * @_a: lockdep map embedded in the lock of the first unix_sock.
 * @_b: lockdep map embedded in the lock of the second unix_sock.
 *
 * Returns a negative value when a is ordered before b, a positive value
 * when a is ordered after b, and 0 when the pair has no valid ordering.
 */
static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
                                  const struct lockdep_map *_b)
{
        const struct unix_sock *a, *b;

        a = container_of(_a, struct unix_sock, lock.dep_map);
        b = container_of(_b, struct unix_sock, lock.dep_map);

        if (a->sk.sk_state == TCP_LISTEN) {
                /* unix_stream_connect(): Before the 2nd unix_state_lock(),
                 *
                 *   1. a is TCP_LISTEN.
                 *   2. b is not a.
                 *   3. concurrent connect(b -> a) must fail.
                 *
                 * Except for 2. & 3., the b's state can be any possible
                 * value due to concurrent connect() or listen().
                 *
                 * 2. is detected in debug_spin_lock_before(), and 3. cannot
                 * be expressed as lock_cmp_fn.
                 */
                switch (b->sk.sk_state) {
                case TCP_CLOSE:
                case TCP_ESTABLISHED:
                case TCP_LISTEN:
                        return -1;
                default:
                        /* Invalid case. */
                        return 0;
                }
        }

        /* Should never happen.  Just to be symmetric.
         *
         * b is known to be TCP_LISTEN here, so the state that decides the
         * result is a's.  a cannot be TCP_LISTEN any more (handled above),
         * which is why only the two remaining states are listed.
         */
        if (b->sk.sk_state == TCP_LISTEN) {
                switch (a->sk.sk_state) {
                case TCP_CLOSE:
                case TCP_ESTABLISHED:
                        return 1;
                default:
                        /* Invalid case. */
                        return 0;
                }
        }

        /* unix_state_double_lock(): ascending address order. */
        return cmp_ptr(a, b);
}

/* Lock comparison callback for the receive-queue lock
 * (sk_receive_queue.lock) of two sockets.
 *
 * Returns -1 when a is a listener and b is one of its embryos, 1 for the
 * mirrored pair, and 0 for every other combination.
 */
static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
                                  const struct lockdep_map *_b)
{
        const struct sock *first, *second;
        int order = 0;

        first = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
        second = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);

        if (first->sk_state == TCP_LISTEN && unix_sk(second)->listener == first) {
                /* unix_collect_skb(): listener -> embryo order. */
                order = -1;
        } else if (second->sk_state == TCP_LISTEN &&
                   unix_sk(first)->listener == second) {
                /* Should never happen.  Just to be symmetric. */
                order = 1;
        }

        return order;
}
#endif

/* Hash for a socket without a bound name: fold the socket's own address,
 * mix in the socket type and reduce the result with UNIX_HASH_MOD.
 */
static unsigned int unix_unbound_hash(struct sock *sk)
{
        unsigned long addr = (unsigned long)sk;
        unsigned long fold16 = addr ^ (addr >> 16);
        unsigned long fold8 = fold16 ^ (fold16 >> 8);

        return (fold8 ^ sk->sk_type) & UNIX_HASH_MOD;
}

/* Hash for a filesystem-bound socket: the inode number masked with
 * UNIX_HASH_MOD.
 */
static unsigned int unix_bsd_hash(struct inode *i)
{
        unsigned int bucket = i->i_ino & UNIX_HASH_MOD;

        return bucket;
}

/* Hash for an abstract address: checksum the first addr_len bytes of
 * sunaddr, fold the checksum, mix in the socket type, and map the result
 * into the range starting at UNIX_HASH_MOD + 1.
 */
static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
                                       int addr_len, int type)
{
        unsigned int folded, mixed;

        folded = (__force unsigned int)csum_fold(csum_partial(sunaddr, addr_len, 0));
        mixed = folded ^ (folded >> 8);
        mixed ^= type;

        return UNIX_HASH_MOD + 1 + (mixed & UNIX_HASH_MOD);
}

/* Take the table locks of two buckets.  The lower index is always locked
 * first; when both indices are equal the lock is taken only once.
 */
static void unix_table_double_lock(struct net *net,
                                   unsigned int hash1, unsigned int hash2)
{
        /* Normalise so that hash1 <= hash2. */
        if (hash1 > hash2)
                swap(hash1, hash2);

        spin_lock(&net->unx.table.locks[hash1]);

        if (hash1 != hash2)
                spin_lock(&net->unx.table.locks[hash2]);
}

/* Release the table locks of two buckets; a single unlock is done when
 * both indices name the same bucket.
 */
static void unix_table_double_unlock(struct net *net,
                                     unsigned int hash1, unsigned int hash2)
{
        spin_unlock(&net->unx.table.locks[hash1]);

        if (hash1 != hash2)
                spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
/* Copy the security ID held in the scm cookie into the skb's UNIXCB. */
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
        UNIXCB(skb).secid = scm->secid;
}

/* Copy the security ID stored in the skb's UNIXCB into the scm cookie. */
static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
        scm->secid = UNIXCB(skb).secid;
}

/* Return true when the scm cookie and the skb's UNIXCB carry the same
 * security ID.
 */
static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
        if (scm->secid != UNIXCB(skb).secid)
                return false;

        return true;
}
#else
/* Variant for builds without CONFIG_SECURITY_NETWORK: does nothing. */
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

/* Variant for builds without CONFIG_SECURITY_NETWORK: does nothing. */
static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

/* Variant for builds without CONFIG_SECURITY_NETWORK: there is no security
 * ID to compare, so every pair is reported as equal.
 */
static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
        return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

/* Return 1 when osk's peer is sk, 0 otherwise. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
        if (unix_peer(osk) != sk)
                return 0;

        return 1;
}

/* Return non-zero when osk either has no peer at all or has sk as its
 * peer; 0 when osk is paired with some other socket.
 */
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
        /* A socket without a peer qualifies unconditionally. */
        if (!unix_peer(osk))
                return 1;

        return unix_our_peer(sk, osk);
}

/* Return 1 when the receive queue holds more skbs than
 * sk->sk_max_ack_backlog, 0 otherwise.  The queue length is read with
 * skb_queue_len_lockless().
 */
static inline int unix_recvq_full_lockless(const struct sock *sk)
{
        if (skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog)
                return 1;

        return 0;
}

/* Look up the peer of s under s's state lock.
 *
 * Returns the peer with an extra reference taken by sock_hold(), or NULL
 * when s has no peer.
 */
struct sock *unix_peer_get(struct sock *s)
{
        struct sock *held;

        unix_state_lock(s);

        held = unix_peer(s);
        /* Pin the peer before the lock is dropped. */
        if (held)
                sock_hold(held);

        unix_state_unlock(s);

        return held;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

/* Allocate a unix_address holding a copy of the first addr_len bytes of
 * sunaddr.
 *
 * Returns the new address with its reference count set to 1, or NULL when
 * the allocation fails.
 */
static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
                                             int addr_len)
{
        struct unix_address *addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);

        if (addr) {
                refcount_set(&addr->refcnt, 1);
                addr->len = addr_len;
                memcpy(addr->name, sunaddr, addr_len);
        }

        return addr;
}

/* Drop one reference on addr and free it when that was the last one. */
static inline void unix_release_addr(struct unix_address *addr)
{
        if (!refcount_dec_and_test(&addr->refcnt))
                return;

        kfree(addr);
}

/*
 *        Check unix socket name:
 *                - it must not be zero length.
 *                - if it starts with a non-zero byte, it must be NUL-terminated (FS object)
 *                - if it starts with a zero byte, it is an abstract name.
 */

/* Basic sanity check of a user-supplied AF_UNIX address.
 *
 * Returns 0 when addr_len is larger than the offset of sun_path, no larger
 * than sizeof(struct sockaddr_un), and sun_family is AF_UNIX.  Returns
 * -EINVAL otherwise.  sun_family is only read once the length is accepted.
 */
static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
        int valid;

        valid = addr_len > offsetof(struct sockaddr_un, sun_path) &&
                addr_len <= sizeof(*sunaddr) &&
                sunaddr->sun_family == AF_UNIX;

        return valid ? 0 : -EINVAL;
}

/* NUL-terminate a filesystem socket path in place and return the resulting
 * address length: offset of the path + strlen() of the path + 1 for the
 * terminator.
 */
static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
        short path_off = offsetof(struct sockaddr_storage, __data);
        struct sockaddr_storage *storage;

        /* Viewing sunaddr as a sockaddr_storage is only sound while both
         * layouts start their payload at the same offset.
         */
        BUILD_BUG_ON(path_off != offsetof(struct sockaddr_un, sun_path));

        storage = (struct sockaddr_storage *)sunaddr;

        /* Not the off-by-one it appears to be.  The longest valid
         * AF_UNIX path for a binding is 108 bytes, and sun_path[108]
         * does not exist as such.  In kernel space the location is
         * nevertheless valid memory in our kernel address buffer, since
         * syscall functions always pass a pointer to a struct
         * sockaddr_storage, whose buffer is bigger than 108.  The
         * terminator is also required by strlen() in getname_kernel().
         */
        storage->__data[addr_len - path_off] = 0;

        /* strlen() must be given the sockaddr_storage buffer, not
         * sunaddr->sun_path: with CONFIG_FORTIFY_SOURCE=y a 108-byte
         * path would otherwise panic.  This lets __fortify_strlen() see
         * the actual buffer.
         */
        return strlen(storage->__data) + path_off + 1;
}

/* Unlink sk from the hash list it is on via sk_del_node_init().  Takes no
 * lock itself; unix_remove_socket() wraps it with the bucket lock.
 */
static void __unix_remove_socket(struct sock *sk)
{
        sk_del_node_init(sk);
}

/* Link sk into the per-netns bucket selected by sk->sk_hash.  Takes no lock
 * itself.  A debug warning fires once if sk is already hashed.
 */
static void __unix_insert_socket(struct net *net, struct sock *sk)
{
        struct hlist_head *bucket = &net->unx.table.buckets[sk->sk_hash];

        DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
        sk_add_node(sk, bucket);
}

/* Give @sk the address @addr and move it to bucket @hash: unhash, publish
 * the address, update sk_hash, rehash.  The callers visible in this file
 * invoke it between unix_table_double_lock() and
 * unix_table_double_unlock() on the old and new hash.
 */
static void __unix_set_addr_hash(struct net *net, struct sock *sk,
                                 struct unix_address *addr, unsigned int hash)
{
        __unix_remove_socket(sk);
        /* Release store: writes made before this point are ordered before
         * the new address pointer becomes visible.
         * NOTE(review): lockless readers presumably pair this with an
         * acquire load of ->addr - confirm.
         */
        smp_store_release(&unix_sk(sk)->addr, addr);

        sk->sk_hash = hash;
        __unix_insert_socket(net, sk);
}

static void unix_remove_socket(struct net *net, struct sock *sk)
{
        spin_lock(&net->unx.table.locks[sk->sk_hash]);
        __unix_remove_socket(sk);
        spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
{
        spin_lock(&net->unx.table.locks[sk->sk_hash]);
        __unix_insert_socket(net, sk);
        spin_unlock(&net->unx.table.locks[sk->sk_hash]);
}

static void unix_insert_bsd_socket(struct sock *sk)
{
        spin_lock(&bsd_socket_locks[sk->sk_hash]);
        sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
        spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}

/* Undo unix_insert_bsd_socket(): take @sk off its bsd_socket_buckets[]
 * chain, if it is on one, and reinitialise the bind node.
 */
static void unix_remove_bsd_socket(struct sock *sk)
{
        /* Never inserted (or already removed): nothing to do. */
        if (hlist_unhashed(&sk->sk_bind_node))
                return;

        spin_lock(&bsd_socket_locks[sk->sk_hash]);
        __sk_del_bind_node(sk);
        spin_unlock(&bsd_socket_locks[sk->sk_hash]);

        sk_node_init(&sk->sk_bind_node);
}

/* Scan bucket @hash of @net's unix table for a socket whose address has
 * length @len and the same first @len bytes as @sunname.
 *
 * Returns the matching socket, without taking a reference, or NULL.  No
 * lock is taken here.
 */
static struct sock *__unix_find_socket_byname(struct net *net,
                                              struct sockaddr_un *sunname,
                                              int len, unsigned int hash)
{
        struct sock *cand;

        sk_for_each(cand, &net->unx.table.buckets[hash]) {
                struct unix_address *bound = unix_sk(cand)->addr;

                if (bound->len != len)
                        continue;
                if (memcmp(bound->name, sunname, len))
                        continue;

                return cand;
        }

        return NULL;
}

/* Locked variant of __unix_find_socket_byname(): performs the lookup under
 * the lock of bucket @hash and takes a reference (sock_hold()) on the
 * socket that is returned.  Returns NULL when nothing matches.
 */
static inline struct sock *unix_find_socket_byname(struct net *net,
                                                   struct sockaddr_un *sunname,
                                                   int len, unsigned int hash)
{
        struct sock *found;

        spin_lock(&net->unx.table.locks[hash]);

        found = __unix_find_socket_byname(net, sunname, len, hash);
        if (found)
                sock_hold(found);

        spin_unlock(&net->unx.table.locks[hash]);

        return found;
}

/* Find the socket bound to inode @i.
 *
 * Walks the bsd_socket_buckets[] chain selected by unix_bsd_hash(@i) under
 * the matching lock and returns, with a reference held, the first socket
 * whose path dentry is backed by @i.  Returns NULL when there is none.
 */
static struct sock *unix_find_socket_byinode(struct inode *i)
{
        unsigned int hash = unix_bsd_hash(i);
        struct sock *found = NULL;
        struct sock *s;

        spin_lock(&bsd_socket_locks[hash]);
        sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
                struct dentry *dentry = unix_sk(s)->path.dentry;

                if (!dentry || d_backing_inode(dentry) != i)
                        continue;

                sock_hold(s);
                found = s;
                break;
        }
        spin_unlock(&bsd_socket_locks[hash]);

        return found;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hit the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
 */

/* Wake function of a socket's peer_wake entry (installed in unix_create1()
 * with init_waitqueue_func_entry()).
 *
 * @q is the peer_wake entry embedded in the client's unix_sock.  The entry
 * removes itself from the peer_wait queue of the socket recorded in
 * peer_wake.private, clears that association, and forwards the wake-up
 * (with the poll bits derived from @key) to sleepers on the client's own
 * wait queue.  @mode and @flags are unused.  Always returns 0.
 *
 * NOTE(review): the unlocked __remove_wait_queue() suggests this runs with
 * the peer_wait queue lock already held by the waker - confirm.
 */
static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
                                      void *key)
{
        struct unix_sock *u;
        wait_queue_head_t *u_sleep;

        /* Recover the client socket that owns this wait queue entry. */
        u = container_of(q, struct unix_sock, peer_wake);

        /* One-shot: break the association before relaying. */
        __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
                            q);
        u->peer_wake.private = NULL;

        /* relaying can only happen while the wq still exists */
        u_sleep = sk_sleep(&u->sk);
        if (u_sleep)
                wake_up_interruptible_poll(u_sleep, key_to_poll(key));

        return 0;
}

/* Enqueue @sk's peer_wake entry on @other's peer_wait queue, unless @sk
 * already has an association recorded in peer_wake.private.
 *
 * Returns 1 when the entry was enqueued by this call, 0 when an
 * association already existed.  Runs under @other's peer_wait queue lock.
 */
static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
{
        struct unix_sock *u = unix_sk(sk);
        struct unix_sock *u_other = unix_sk(other);
        int queued = 0;

        spin_lock(&u_other->peer_wait.lock);
        if (!u->peer_wake.private) {
                u->peer_wake.private = other;
                __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
                queued = 1;
        }
        spin_unlock(&u_other->peer_wait.lock);

        return queued;
}

/* Dequeue @sk's peer_wake entry from @other's peer_wait queue, but only if
 * the recorded association (peer_wake.private) is with @other.  Runs under
 * @other's peer_wait queue lock.
 */
static void unix_dgram_peer_wake_disconnect(struct sock *sk,
                                            struct sock *other)
{
        struct unix_sock *u = unix_sk(sk);
        struct unix_sock *u_other = unix_sk(other);

        spin_lock(&u_other->peer_wait.lock);
        if (u->peer_wake.private == other) {
                __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
                u->peer_wake.private = NULL;
        }
        spin_unlock(&u_other->peer_wait.lock);
}

/* Break the wake-up association between @sk and @other, then wake
 * sleepers on @sk's own wait queue with the "writable" poll bits.
 */
static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
                                                   struct sock *other)
{
        unix_dgram_peer_wake_disconnect(sk, other);

        wake_up_interruptible_poll(sk_sleep(sk),
                                   EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
}

/* Register @sk for a relayed wake-up from @other, if that is still needed.
 *
 * preconditions:
 *        - unix_peer(sk) == other
 *        - association is stable
 *
 * Returns 1 when @other's receive queue is full and @other is not
 * SOCK_DEAD; the wake-up registration is then left in place.  Returns 0
 * otherwise, after undoing a registration made by this call.
 */
static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
{
        int connected;

        /* Register first, test the queue afterwards. */
        connected = unix_dgram_peer_wake_connect(sk, other);

        /* If other is SOCK_DEAD, we want to make sure we signal
         * POLLOUT, such that a subsequent write() can get a
         * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
         * to other and its full, we will hang waiting for POLLOUT.
         */
        if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
                return 1;

        /* No need to wait: drop only a registration made just above. */
        if (connected)
                unix_dgram_peer_wake_disconnect(sk, other);

        return 0;
}

/* Report whether @sk counts as writable for the given socket @state.
 *
 * A socket in TCP_LISTEN is never writable; otherwise it is writable while
 * four times its sk_wmem_alloc count does not exceed sk_sndbuf.
 * Returns non-zero when writable, 0 when not.
 */
static int unix_writable(const struct sock *sk, unsigned char state)
{
        if (state == TCP_LISTEN)
                return 0;

        return (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
}

/* sk_write_space callback (installed in unix_create1()): when @sk is
 * writable, wake poll sleepers on its wait queue with the "writable" bits
 * and send the SOCK_WAKE_SPACE async notification.  The whole check runs
 * inside an RCU read-side section.
 */
static void unix_write_space(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();

        if (!unix_writable(sk, READ_ONCE(sk->sk_state)))
                goto unlock;

        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_sync_poll(&wq->wait,
                        EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
        sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);

unlock:
        rcu_read_unlock();
}

/* Called when a dgram socket disconnects from (or changes) its peer: the
 * packets that arrived from the previous peer are dropped from the receive
 * queue.  First, this allows flow control to be based on wmem_alloc only;
 * second, a socket connected to a peer may receive messages from that
 * peer only.
 */
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
        /* Nothing queued, nothing lost. */
        if (skb_queue_empty(&sk->sk_receive_queue))
                return;

        skb_queue_purge(&sk->sk_receive_queue);
        wake_up_interruptible_all(&unix_sk(sk)->peer_wait);

        /* Messages were lost.  If this was one link of a bidirectional
         * dgram pipe, signal an error to the other end; do not do so
         * when the peer was not connected to us.
         */
        if (sock_flag(other, SOCK_DEAD) || unix_peer(other) != sk)
                return;

        WRITE_ONCE(other->sk_err, ECONNRESET);
        sk_error_report(other);
}

/* sk_destruct callback (installed in unix_create1()).
 *
 * Purges the receive queue, sanity-checks that the socket has no write
 * memory outstanding, is unhashed and has no struct socket attached, then
 * releases the bound address and updates the global and per-protocol
 * socket counters.  A socket that is not SOCK_DEAD is only reported and
 * left otherwise untouched.
 */
static void unix_sock_destructor(struct sock *sk)
{
        struct unix_sock *u = unix_sk(sk);

        skb_queue_purge(&sk->sk_receive_queue);

        DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
        DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
        DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
        if (!sock_flag(sk, SOCK_DEAD)) {
                /* Bail out before the address release and accounting. */
                pr_info("Attempt to release alive unix socket: %p\n", sk);
                return;
        }

        if (u->addr)
                unix_release_addr(u->addr);

        /* Balances the accounting done in unix_create1(). */
        atomic_long_dec(&unix_nr_socks);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
#ifdef UNIX_REFCNT_DEBUG
        pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
                atomic_long_read(&unix_nr_socks));
#endif
}

/* Tear down a unix socket.
 *
 * Unhashes @sk, orphans it and marks it shut down and closed, detaches it
 * from its peer and from its filesystem path, flushes its receive queue
 * and finally drops a reference on @sk.
 *
 * @embrion: non-zero when @sk is a socket found in the receive queue of a
 *           listening socket being released (the recursive call below
 *           passes 1).  For a stream/seqpacket peer this forces
 *           ECONNRESET even when @sk's receive queue is empty.
 */
static void unix_release_sock(struct sock *sk, int embrion)
{
        struct unix_sock *u = unix_sk(sk);
        struct sock *skpair;
        struct sk_buff *skb;
        struct path path;
        int state;

        unix_remove_socket(sock_net(sk), sk);
        unix_remove_bsd_socket(sk);

        /* Clear state */
        unix_state_lock(sk);
        sock_orphan(sk);
        WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
        /* Take over the path reference; it is put after the lock is dropped. */
        path             = u->path;
        u->path.dentry = NULL;
        u->path.mnt = NULL;
        /* Remember the pre-close state: the queue flush below needs it. */
        state = sk->sk_state;
        WRITE_ONCE(sk->sk_state, TCP_CLOSE);

        skpair = unix_peer(sk);
        unix_peer(sk) = NULL;

        unix_state_unlock(sk);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
        if (u->oob_skb) {
                kfree_skb(u->oob_skb);
                u->oob_skb = NULL;
        }
#endif

        wake_up_interruptible_all(&u->peer_wait);

        if (skpair != NULL) {
                if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
                        unix_state_lock(skpair);
                        /* No more writes */
                        WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
                        /* Unread data (or an embryo) is lost: report a reset. */
                        if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
                                WRITE_ONCE(skpair->sk_err, ECONNRESET);
                        unix_state_unlock(skpair);
                        skpair->sk_state_change(skpair);
                        sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
                }

                unix_dgram_peer_wake_disconnect(sk, skpair);
                sock_put(skpair); /* It may now die */
        }

        /* Try to flush out this socket. Throw out buffers at least */

        while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                /* For a listener, each queued skb's ->sk is released as an
                 * embryo.
                 */
                if (state == TCP_LISTEN)
                        unix_release_sock(skb->sk, 1);

                /* passed fds are erased in the kfree_skb hook              */
                kfree_skb(skb);
        }

        if (path.dentry)
                path_put(&path);

        sock_put(sk);

        /* ---- Socket is dead now and most probably destroyed ---- */

        /*
         * Fixme: BSD difference: In BSD all sockets connected to us get
         *          ECONNRESET and we die on the spot. In Linux we behave
         *          like files and pipes do and wait for the last
         *          dereference.
         *
         * Can't we simply set sock->err?
         *
         *          What the above comment does talk about? --ANK(980817)
         */

        if (READ_ONCE(unix_tot_inflight))
                unix_gc();                /* Garbage collect fds */
}

/* Record the current task's thread-group pid and credentials as the peer
 * identity of @sk (get_pid() / get_current_cred()).  Any previous values
 * are overwritten without being released and no lock is taken here;
 * update_peercred() handles both around its call.
 */
static void init_peercred(struct sock *sk)
{
        sk->sk_peer_pid = get_pid(task_tgid(current));
        sk->sk_peer_cred = get_current_cred();
}

/* Replace the peer pid and credentials of @sk with those of the current
 * task.  The swap is done under sk->sk_peer_lock; the old pid and cred are
 * released only after the lock has been dropped.
 */
static void update_peercred(struct sock *sk)
{
        const struct cred *old_cred;
        struct pid *old_pid;

        spin_lock(&sk->sk_peer_lock);
        /* Save the old values so they can be put outside the lock. */
        old_pid = sk->sk_peer_pid;
        old_cred = sk->sk_peer_cred;
        init_peercred(sk);
        spin_unlock(&sk->sk_peer_lock);

        put_pid(old_pid);
        put_cred(old_cred);
}

/* Copy the peer pid and credentials recorded on @peersk into @sk, via
 * get_pid() / get_cred().
 *
 * The caller must hold @peersk's unix state lock (asserted with lockdep);
 * @sk's fields are written under sk->sk_peer_lock.  Previous values of
 * @sk's fields are overwritten without being released.
 */
static void copy_peercred(struct sock *sk, struct sock *peersk)
{
        lockdep_assert_held(&unix_sk(peersk)->lock);

        spin_lock(&sk->sk_peer_lock);
        sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
        sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
        spin_unlock(&sk->sk_peer_lock);
}

/* listen() handler for stream and seqpacket unix sockets.
 *
 * Returns 0 on success, -EOPNOTSUPP for any other socket type, and
 * -EINVAL when the socket is unbound or is in a state other than
 * TCP_CLOSE / TCP_LISTEN.  Calling it again on a listening socket just
 * updates the backlog.
 */
static int unix_listen(struct socket *sock, int backlog)
{
        int err;
        struct sock *sk = sock->sk;
        struct unix_sock *u = unix_sk(sk);

        err = -EOPNOTSUPP;
        if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
                goto out;        /* Only stream/seqpacket sockets accept */
        err = -EINVAL;
        if (!READ_ONCE(u->addr))
                goto out;        /* No listens on an unbound socket */
        unix_state_lock(sk);
        if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
                goto out_unlock;
        /* The backlog grows: wake everything sleeping on peer_wait.
         * NOTE(review): presumably connectors blocked on a full backlog -
         * confirm.
         */
        if (backlog > sk->sk_max_ack_backlog)
                wake_up_interruptible_all(&u->peer_wait);
        sk->sk_max_ack_backlog        = backlog;
        WRITE_ONCE(sk->sk_state, TCP_LISTEN);

        /* set credentials so connect can copy them */
        update_peercred(sk);
        err = 0;

out_unlock:
        unix_state_unlock(sk);
out:
        return err;
}

/* Forward declarations of the handlers referenced by the proto_ops tables
 * below (unix_stream_ops, unix_dgram_ops, unix_seqpacket_ops).
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
                               int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
static int unix_getname(struct socket *, struct sockaddr *, int);
static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
                                    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
                                       struct pipe_inode_info *, size_t size,
                                       unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
                              int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
                                  int);

#ifdef CONFIG_PROC_FS
static int unix_count_nr_fds(struct sock *sk)
{
        struct sk_buff *skb;
        struct unix_sock *u;
        int nr_fds = 0;

        spin_lock(&sk->sk_receive_queue.lock);
        skb = skb_peek(&sk->sk_receive_queue);
        while (skb) {
                u = unix_sk(skb->sk);
                nr_fds += atomic_read(&u->scm_stat.nr_fds);
                skb = skb_peek_next(skb, &sk->sk_receive_queue);
        }
        spin_unlock(&sk->sk_receive_queue.lock);

        return nr_fds;
}

static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
        struct sock *sk = sock->sk;
        unsigned char s_state;
        struct unix_sock *u;
        int nr_fds = 0;

        if (sk) {
                s_state = READ_ONCE(sk->sk_state);
                u = unix_sk(sk);

                /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
                 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
                 * SOCK_DGRAM is ordinary. So, no lock is needed.
                 */
                if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
                        nr_fds = atomic_read(&u->scm_stat.nr_fds);
                else if (s_state == TCP_LISTEN)
                        nr_fds = unix_count_nr_fds(sk);

                seq_printf(m, "scm_fds: %u\n", nr_fds);
        }
}
#else
#define unix_show_fdinfo NULL
#endif

/* Socket operations for SOCK_STREAM unix sockets (selected in
 * unix_create()).  The only table here that provides ->splice_read, and
 * it uses the stream-specific connect/sendmsg/recvmsg/read_skb handlers.
 */
static const struct proto_ops unix_stream_ops = {
        .family =        PF_UNIX,
        .owner =        THIS_MODULE,
        .release =        unix_release,
        .bind =                unix_bind,
        .connect =        unix_stream_connect,
        .socketpair =        unix_socketpair,
        .accept =        unix_accept,
        .getname =        unix_getname,
        .poll =                unix_poll,
        .ioctl =        unix_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl =        unix_compat_ioctl,
#endif
        .listen =        unix_listen,
        .shutdown =        unix_shutdown,
        .sendmsg =        unix_stream_sendmsg,
        .recvmsg =        unix_stream_recvmsg,
        .read_skb =        unix_stream_read_skb,
        .mmap =                sock_no_mmap,
        .splice_read =        unix_stream_splice_read,
        .set_peek_off =        sk_set_peek_off,
        .show_fdinfo =        unix_show_fdinfo,
};

/* Socket operations for SOCK_DGRAM unix sockets (selected in
 * unix_create(), which also maps SOCK_RAW to SOCK_DGRAM).  accept() and
 * listen() are wired to the sock_no_*() stubs.
 */
static const struct proto_ops unix_dgram_ops = {
        .family =        PF_UNIX,
        .owner =        THIS_MODULE,
        .release =        unix_release,
        .bind =                unix_bind,
        .connect =        unix_dgram_connect,
        .socketpair =        unix_socketpair,
        .accept =        sock_no_accept,
        .getname =        unix_getname,
        .poll =                unix_dgram_poll,
        .ioctl =        unix_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl =        unix_compat_ioctl,
#endif
        .listen =        sock_no_listen,
        .shutdown =        unix_shutdown,
        .sendmsg =        unix_dgram_sendmsg,
        .read_skb =        unix_read_skb,
        .recvmsg =        unix_dgram_recvmsg,
        .mmap =                sock_no_mmap,
        .set_peek_off =        sk_set_peek_off,
        .show_fdinfo =        unix_show_fdinfo,
};

/* Socket operations for SOCK_SEQPACKET unix sockets (selected in
 * unix_create()).  Connection handling (connect/accept/listen) is shared
 * with the stream table, polling uses unix_dgram_poll, and no ->read_skb
 * or ->splice_read is provided.
 */
static const struct proto_ops unix_seqpacket_ops = {
        .family =        PF_UNIX,
        .owner =        THIS_MODULE,
        .release =        unix_release,
        .bind =                unix_bind,
        .connect =        unix_stream_connect,
        .socketpair =        unix_socketpair,
        .accept =        unix_accept,
        .getname =        unix_getname,
        .poll =                unix_dgram_poll,
        .ioctl =        unix_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl =        unix_compat_ioctl,
#endif
        .listen =        unix_listen,
        .shutdown =        unix_shutdown,
        .sendmsg =        unix_seqpacket_sendmsg,
        .recvmsg =        unix_seqpacket_recvmsg,
        .mmap =                sock_no_mmap,
        .set_peek_off =        sk_set_peek_off,
        .show_fdinfo =        unix_show_fdinfo,
};

/* Intentionally empty ->close() for the unix protos; unix_release() calls
 * it through sk->sk_prot->close().  @sk and @timeout are unused.
 */
static void unix_close(struct sock *sk, long timeout)
{
        /* Nothing to do here, unix socket does not need a ->close().
         * This is merely for sockmap.
         */
}

/* Intentionally empty ->unhash(), set only in unix_stream_proto.
 * @sk is unused.
 */
static void unix_unhash(struct sock *sk)
{
        /* Nothing to do here, unix socket does not need a ->unhash().
         * This is merely for sockmap.
         */
}

/* ->bpf_bypass_getsockopt() hook of the unix protos.
 *
 * Returns true only for SO_PEERPIDFD at level SOL_SOCKET, false for every
 * other level/option pair.
 */
static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
        return level == SOL_SOCKET && optname == SO_PEERPIDFD;
}

/* struct proto used by unix_create1() for every socket type other than
 * SOCK_STREAM (dgram and seqpacket).  Unlike unix_stream_proto it sets no
 * ->unhash.
 */
struct proto unix_dgram_proto = {
        .name                        = "UNIX",
        .owner                        = THIS_MODULE,
        .obj_size                = sizeof(struct unix_sock),
        .close                        = unix_close,
        .bpf_bypass_getsockopt        = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
        .psock_update_sk_prot        = unix_dgram_bpf_update_proto,
#endif
};

/* struct proto used by unix_create1() for SOCK_STREAM sockets.  Differs
 * from unix_dgram_proto in its name, the extra ->unhash stub and the
 * stream-specific psock_update_sk_prot hook.
 */
struct proto unix_stream_proto = {
        .name                        = "UNIX-STREAM",
        .owner                        = THIS_MODULE,
        .obj_size                = sizeof(struct unix_sock),
        .close                        = unix_close,
        .unhash                        = unix_unhash,
        .bpf_bypass_getsockopt        = unix_bpf_bypass_getsockopt,
#ifdef CONFIG_BPF_SYSCALL
        .psock_update_sk_prot        = unix_stream_bpf_update_proto,
#endif
};

/* Allocate and initialise a unix socket of @type in @net and hash it as
 * an unbound socket.
 *
 * @sock: struct socket handed to sock_init_data().
 * @kern: passed through to sk_alloc().
 *
 * Returns the new sock, or ERR_PTR(-ENFILE) when the global socket count
 * exceeds 2 * get_max_files(), or ERR_PTR(-ENOMEM) when sk_alloc() fails.
 */
static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
{
        struct unix_sock *u;
        struct sock *sk;
        int err;

        /* Count first, then check; the error path undoes the increment. */
        atomic_long_inc(&unix_nr_socks);
        if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
                err = -ENFILE;
                goto err;
        }

        if (type == SOCK_STREAM)
                sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
        else /*dgram and  seqpacket */
                sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);

        if (!sk) {
                err = -ENOMEM;
                goto err;
        }

        sock_init_data(sock, sk);

        sk->sk_hash                = unix_unbound_hash(sk);
        sk->sk_allocation        = GFP_KERNEL_ACCOUNT;
        sk->sk_write_space        = unix_write_space;
        sk->sk_max_ack_backlog        = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
        sk->sk_destruct                = unix_sock_destructor;
        lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);

        u = unix_sk(sk);
        u->listener = NULL;
        u->vertex = NULL;
        u->path.dentry = NULL;
        u->path.mnt = NULL;
        spin_lock_init(&u->lock);
        lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
        mutex_init(&u->iolock); /* single task reading lock */
        mutex_init(&u->bindlock); /* single task binding lock */
        init_waitqueue_head(&u->peer_wait);
        /* peer_wake relays wake-ups through unix_dgram_peer_wake_relay(). */
        init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
        memset(&u->scm_stat, 0, sizeof(struct scm_stat));
        /* Hash the socket only once it is fully initialised. */
        unix_insert_unbound_socket(net, sk);

        sock_prot_inuse_add(net, sk->sk_prot, 1);

        return sk;

err:
        atomic_long_dec(&unix_nr_socks);
        return ERR_PTR(err);
}

/* Socket-creation entry point for PF_UNIX.
 *
 * Picks the proto_ops table matching sock->type (SOCK_RAW is turned into
 * SOCK_DGRAM) and creates the underlying sock with unix_create1().
 *
 * Returns 0 on success, -EPROTONOSUPPORT for a non-zero @protocol other
 * than PF_UNIX, -ESOCKTNOSUPPORT for an unsupported socket type, or the
 * error reported by unix_create1().
 */
static int unix_create(struct net *net, struct socket *sock, int protocol,
                       int kern)
{
        const struct proto_ops *ops;
        struct sock *sk;

        if (protocol && protocol != PF_UNIX)
                return -EPROTONOSUPPORT;

        sock->state = SS_UNCONNECTED;

        switch (sock->type) {
        case SOCK_STREAM:
                ops = &unix_stream_ops;
                break;
        case SOCK_RAW:
                /* Believe it or not, BSD has AF_UNIX, SOCK_RAW, though
                 * nothing uses it.  Treat it as SOCK_DGRAM.
                 */
                sock->type = SOCK_DGRAM;
                fallthrough;
        case SOCK_DGRAM:
                ops = &unix_dgram_ops;
                break;
        case SOCK_SEQPACKET:
                ops = &unix_seqpacket_ops;
                break;
        default:
                return -ESOCKTNOSUPPORT;
        }
        sock->ops = ops;

        sk = unix_create1(net, sock, kern, sock->type);

        return IS_ERR(sk) ? PTR_ERR(sk) : 0;
}

/* ->release() handler shared by all unix proto_ops tables.
 *
 * When a sock is attached, calls its proto's ->close(), tears it down
 * with unix_release_sock() and clears sock->sk.  Always returns 0.
 */
static int unix_release(struct socket *sock)
{
        struct sock *sk = sock->sk;

        if (sk) {
                sk->sk_prot->close(sk, 0);
                unix_release_sock(sk, 0);
                sock->sk = NULL;
        }

        return 0;
}

/* Look up the socket bound to the filesystem path in @sunaddr.
 *
 * Returns the socket with a reference held, or an ERR_PTR():
 *   - the error from kern_path() or path_permission(MAY_WRITE);
 *   - -ECONNREFUSED when the path is not a socket inode or no socket is
 *     bound to it;
 *   - -EPROTOTYPE when the bound socket's type differs from @type.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
                                  int type)
{
        struct inode *inode;
        struct path path;
        struct sock *sk;
        int err;

        /* NUL-terminate the path in place; the length is not needed. */
        unix_mkname_bsd(sunaddr, addr_len);

        err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
        if (err)
                return ERR_PTR(err);

        err = path_permission(&path, MAY_WRITE);
        if (err)
                goto out_path;

        inode = d_backing_inode(path.dentry);
        err = -ECONNREFUSED;
        if (!S_ISSOCK(inode->i_mode))
                goto out_path;

        sk = unix_find_socket_byinode(inode);
        if (!sk)
                goto out_path;

        if (sk->sk_type != type) {
                err = -EPROTOTYPE;
                sock_put(sk);
                goto out_path;
        }

        touch_atime(&path);
        path_put(&path);

        return sk;

out_path:
        path_put(&path);
        return ERR_PTR(err);
}

/* Look up the socket bound to the abstract name in @sunaddr within @net.
 *
 * Returns the socket with a reference held (taken by
 * unix_find_socket_byname()), or ERR_PTR(-ECONNREFUSED) when no socket
 * has that name.  If the socket found has a path dentry, its atime is
 * touched.
 */
static struct sock *unix_find_abstract(struct net *net,
                                       struct sockaddr_un *sunaddr,
                                       int addr_len, int type)
{
        unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
        struct sock *sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);

        if (!sk)
                return ERR_PTR(-ECONNREFUSED);

        if (unix_sk(sk)->path.dentry)
                touch_atime(&unix_sk(sk)->path);

        return sk;
}

/* Resolve @sunaddr to a socket.  A name whose first byte is non-zero is
 * looked up as a filesystem path, otherwise as an abstract name in @net.
 * Returns whatever the chosen lookup returns (a socket or an ERR_PTR()).
 */
static struct sock *unix_find_other(struct net *net,
                                    struct sockaddr_un *sunaddr,
                                    int addr_len, int type)
{
        if (sunaddr->sun_path[0])
                return unix_find_bsd(sunaddr, addr_len, type);

        return unix_find_abstract(net, sunaddr, addr_len, type);
}

/* Bind @sk to an automatically chosen abstract name.
 *
 * The name is a leading zero byte (from kzalloc()) followed by five hex
 * digits; candidates are tried in increasing order, modulo 2^20, starting
 * just after a random value, until an unused one is found.
 *
 * Returns 0 on success or when the socket is already bound, the error
 * from mutex_lock_interruptible(), -ENOMEM on allocation failure, or
 * -ENOSPC when every candidate name is in use.
 */
static int unix_autobind(struct sock *sk)
{
        struct unix_sock *u = unix_sk(sk);
        unsigned int new_hash, old_hash;
        struct net *net = sock_net(sk);
        struct unix_address *addr;
        u32 lastnum, ordernum;
        int err;

        err = mutex_lock_interruptible(&u->bindlock);
        if (err)
                return err;

        /* Already bound: err is still 0 here, so this reports success. */
        if (u->addr)
                goto out;

        err = -ENOMEM;
        addr = kzalloc(sizeof(*addr) +
                       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
        if (!addr)
                goto out;

        /* 6 name bytes: the leading zero byte plus five hex digits. */
        addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
        addr->name->sun_family = AF_UNIX;
        refcount_set(&addr->refcnt, 1);

        old_hash = sk->sk_hash;
        ordernum = get_random_u32();
        /* Remember the start so a full wrap-around can be detected. */
        lastnum = ordernum & 0xFFFFF;
retry:
        ordernum = (ordernum + 1) & 0xFFFFF;
        sprintf(addr->name->sun_path + 1, "%05x", ordernum);

        new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
        unix_table_double_lock(net, old_hash, new_hash);

        if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
                unix_table_double_unlock(net, old_hash, new_hash);

                /* __unix_find_socket_byname() may take long time if many names
                 * are already in use.
                 */
                cond_resched();

                if (ordernum == lastnum) {
                        /* Give up if all names seems to be in use. */
                        err = -ENOSPC;
                        unix_release_addr(addr);
                        goto out;
                }

                goto retry;
        }

        /* Name is free: claim it while both bucket locks are still held. */
        __unix_set_addr_hash(net, sk, addr, new_hash);
        unix_table_double_unlock(net, old_hash, new_hash);
        err = 0;

out:        mutex_unlock(&u->bindlock);
        return err;
}

/* Bind @sk to the filesystem path in @sunaddr.
 *
 * Creates the socket inode with vfs_mknod(), then, under u->bindlock and
 * both hash bucket locks, records the path on the socket and rehashes it
 * by inode; finally the socket is added to the bsd_socket_buckets[] table.
 *
 * Returns 0 on success, -ENOMEM when the address cannot be allocated,
 * -EINVAL when the socket is already bound, or the error from path
 * creation, the security hook, vfs_mknod() or the interruptible mutex
 * lock, with -EEXIST reported as -EADDRINUSE.  If binding fails after the
 * node was created, the node is unlinked again.
 */
static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
                         int addr_len)
{
        /* Socket node mode: the socket inode's mode bits minus the umask. */
        umode_t mode = S_IFSOCK |
               (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
        struct unix_sock *u = unix_sk(sk);
        unsigned int new_hash, old_hash;
        struct net *net = sock_net(sk);
        struct mnt_idmap *idmap;
        struct unix_address *addr;
        struct dentry *dentry;
        struct path parent;
        int err;

        /* NUL-terminate the path and use its real length from here on. */
        addr_len = unix_mkname_bsd(sunaddr, addr_len);
        addr = unix_create_addr(sunaddr, addr_len);
        if (!addr)
                return -ENOMEM;

        /*
         * Get the parent directory, calculate the hash for last
         * component.
         */
        dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
        if (IS_ERR(dentry)) {
                err = PTR_ERR(dentry);
                goto out;
        }

        /*
         * All right, let's create it.
         */
        idmap = mnt_idmap(parent.mnt);
        err = security_path_mknod(&parent, dentry, mode, 0);
        if (!err)
                err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
        if (err)
                goto out_path;
        err = mutex_lock_interruptible(&u->bindlock);
        if (err)
                goto out_unlink;
        /* Already bound; err is set to -EINVAL at out_unlock. */
        if (u->addr)
                goto out_unlock;

        old_hash = sk->sk_hash;
        new_hash = unix_bsd_hash(d_backing_inode(dentry));
        unix_table_double_lock(net, old_hash, new_hash);
        /* Set the path before __unix_set_addr_hash() publishes the address. */
        u->path.mnt = mntget(parent.mnt);
        u->path.dentry = dget(dentry);
        __unix_set_addr_hash(net, sk, addr, new_hash);
        unix_table_double_unlock(net, old_hash, new_hash);
        unix_insert_bsd_socket(sk);
        mutex_unlock(&u->bindlock);
        done_path_create(&parent, dentry);
        return 0;

out_unlock:
        mutex_unlock(&u->bindlock);
        err = -EINVAL;
out_unlink:
        /* failed after successful mknod?  unlink what we'd created... */
        vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
out_path:
        done_path_create(&parent, dentry);
out:
        unix_release_addr(addr);
        return err == -EEXIST ? -EADDRINUSE : err;
}

/* Bind @sk to the abstract name in @sunaddr.
 *
 * Returns 0 on success, -ENOMEM when the address cannot be allocated, the
 * error from mutex_lock_interruptible(), -EINVAL when the socket is
 * already bound, or -EADDRINUSE when another socket in the same netns
 * already has this name.  On every failure the new address is released.
 */
static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
                              int addr_len)
{
        struct unix_sock *u = unix_sk(sk);
        unsigned int new_hash, old_hash;
        struct net *net = sock_net(sk);
        struct unix_address *addr;
        int err;

        addr = unix_create_addr(sunaddr, addr_len);
        if (!addr)
                return -ENOMEM;

        err = mutex_lock_interruptible(&u->bindlock);
        if (err)
                goto out;

        if (u->addr) {
                err = -EINVAL;
                goto out_mutex;
        }

        old_hash = sk->sk_hash;
        new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
        unix_table_double_lock(net, old_hash, new_hash);

        /* The uniqueness check and the insertion share one locked section. */
        if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
                goto out_spin;

        __unix_set_addr_hash(net, sk, addr, new_hash);
        unix_table_double_unlock(net, old_hash, new_hash);
        mutex_unlock(&u->bindlock);
        return 0;

out_spin:
        unix_table_double_unlock(net, old_hash, new_hash);
        err = -EADDRINUSE;
out_mutex:
        mutex_unlock(&u->bindlock);
out:
        unix_release_addr(addr);
        return err;
}

/*
 * bind() entry point: dispatch to autobind, the path-based bind or the
 * abstract bind depending on the supplied address.
 *
 * An address that consists of nothing but an AF_UNIX family field triggers
 * unix_autobind().  Otherwise the address is validated and the first byte
 * of sun_path selects between unix_bind_bsd() (non-zero) and
 * unix_bind_abstract() (zero).
 *
 * Returns 0 on success or the negative errno of the helper that failed.
 */
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
        struct sock *sk = sock->sk;
        int err;

        if (addr_len == offsetof(struct sockaddr_un, sun_path)) {
                if (sunaddr->sun_family == AF_UNIX)
                        return unix_autobind(sk);
        }

        err = unix_validate_addr(sunaddr, addr_len);
        if (err)
                return err;

        if (!sunaddr->sun_path[0])
                return unix_bind_abstract(sk, sunaddr, addr_len);

        return unix_bind_bsd(sk, sunaddr, addr_len);
}

/*
 * Take the state lock of @sk1 and @sk2.
 *
 * If @sk2 is NULL or is the same socket as @sk1, only @sk1 is locked.
 * Otherwise both are locked, the one at the lower address first.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
        struct sock *lo = sk1;
        struct sock *hi = sk2;

        if (unlikely(sk1 == sk2) || !sk2) {
                unix_state_lock(sk1);
                return;
        }

        if (lo > hi) {
                lo = sk2;
                hi = sk1;
        }

        unix_state_lock(lo);
        unix_state_lock(hi);
}

/*
 * Release the state lock of @sk1 and, when @sk2 is a distinct non-NULL
 * socket, of @sk2 as well.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
        int only_one = unlikely(sk1 == sk2) || !sk2;

        unix_state_unlock(sk1);
        if (!only_one)
                unix_state_unlock(sk2);
}

/*
 * Connect @sock to the peer named by @addr, or break the current
 * association when @addr carries the family AF_UNSPEC.
 *
 * @sock:  socket being connected
 * @addr:  destination address
 * @alen:  length of @addr in bytes; a pointer to it is handed to the BPF
 *         connect hook
 * @flags: not used in this function
 *
 * Returns 0 on success or a negative errno: -EINVAL when @alen does not
 * cover sa_family, -EPERM when unix_may_send() refuses the peer, or the
 * error produced by unix_validate_addr(), the BPF hook, unix_autobind(),
 * unix_find_other() or security_unix_may_send().
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
                              int alen, int flags)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
        struct sock *sk = sock->sk;
        struct sock *other;
        int err;

        /* sa_family is read below, so @alen has to cover it. */
        err = -EINVAL;
        if (alen < offsetofend(struct sockaddr, sa_family))
                goto out;

        if (addr->sa_family != AF_UNSPEC) {
                err = unix_validate_addr(sunaddr, alen);
                if (err)
                        goto out;

                err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
                if (err)
                        goto out;

                /* With credential passing enabled and no address bound yet,
                 * autobind the socket before connecting.
                 */
                if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
                     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
                    !READ_ONCE(unix_sk(sk)->addr)) {
                        err = unix_autobind(sk);
                        if (err)
                                goto out;
                }

restart:
                other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
                if (IS_ERR(other)) {
                        err = PTR_ERR(other);
                        goto out;
                }

                unix_state_double_lock(sk, other);

                /* Apparently VFS overslept socket death. Retry. */
                if (sock_flag(other, SOCK_DEAD)) {
                        unix_state_double_unlock(sk, other);
                        sock_put(other);
                        goto restart;
                }

                err = -EPERM;
                if (!unix_may_send(sk, other))
                        goto out_unlock;

                err = security_unix_may_send(sk->sk_socket, other->sk_socket);
                if (err)
                        goto out_unlock;

                /* Both state locks are held here. */
                WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
                WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
        } else {
                /*
                 *        1003.1g breaking connected state with AF_UNSPEC
                 */
                other = NULL;
                /* With a NULL second argument only sk is locked. */
                unix_state_double_lock(sk, other);
        }

        /*
         * If it was connected, reconnect.
         */
        if (unix_peer(sk)) {
                struct sock *old_peer = unix_peer(sk);

                unix_peer(sk) = other;
                /* Disconnecting (no new peer): go back to TCP_CLOSE. */
                if (!other)
                        WRITE_ONCE(sk->sk_state, TCP_CLOSE);
                unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);

                unix_state_double_unlock(sk, other);

                if (other != old_peer) {
                        unix_dgram_disconnected(sk, old_peer);

                        /* The old peer returns to TCP_CLOSE only when it has
                         * no peer of its own.
                         */
                        unix_state_lock(old_peer);
                        if (!unix_peer(old_peer))
                                WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
                        unix_state_unlock(old_peer);
                }

                /* Drop the reference on the previous peer. */
                sock_put(old_peer);
        } else {
                unix_peer(sk) = other;
                unix_state_double_unlock(sk, other);
        }

        return 0;

out_unlock:
        unix_state_double_unlock(sk, other);
        sock_put(other);
out:
        return err;
}

/*
 * Wait on @other's peer_wait queue for up to @timeo.
 *
 * The state lock of @other is released here (see the __releases()
 * annotation).  The task is queued on the wait queue before the conditions
 * are sampled, and it only sleeps when @other is not dead, has not shut
 * down its receive side and its receive queue is reported full.
 *
 * Returns the remaining timeout: the result of schedule_timeout() if the
 * task slept, otherwise @timeo unchanged.
 */
static long unix_wait_for_peer(struct sock *other, long timeo)
        __releases(&unix_sk(other)->lock)
{
        struct unix_sock *ou = unix_sk(other);
        int must_sleep = 0;
        DEFINE_WAIT(wait);

        prepare_to_wait_exclusive(&ou->peer_wait, &wait, TASK_INTERRUPTIBLE);

        if (!sock_flag(other, SOCK_DEAD) &&
            !(other->sk_shutdown & RCV_SHUTDOWN) &&
            unix_recvq_full_lockless(other))
                must_sleep = 1;

        unix_state_unlock(other);

        if (must_sleep)
                timeo = schedule_timeout(timeo);

        finish_wait(&ou->peer_wait, &wait);
        return timeo;
}

/*
 * Connect @sock to the listening socket named by @uaddr.
 *
 * A new sock (newsk) is created and paired with sk, and an skb allocated
 * against newsk is appended to the listener's receive queue.
 *
 * @sock:     connecting socket
 * @uaddr:    address of the listener
 * @addr_len: length of @uaddr; a pointer to it is handed to the BPF
 *            connect hook
 * @flags:    only O_NONBLOCK is looked at, when computing the send timeout
 *
 * Returns 0 on success or a negative errno, among them -ENOMEM (skb
 * allocation), -ECONNREFUSED (target not listening or receive side shut
 * down), -EAGAIN (listener queue full and no timeout), -EISCONN / -EINVAL
 * (sk not in TCP_CLOSE), or the error of a helper/security hook.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                               int addr_len, int flags)
{
        struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
        struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
        struct unix_sock *u = unix_sk(sk), *newu, *otheru;
        struct net *net = sock_net(sk);
        struct sk_buff *skb = NULL;
        unsigned char state;
        long timeo;
        int err;

        err = unix_validate_addr(sunaddr, addr_len);
        if (err)
                goto out;

        err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
        if (err)
                goto out;

        /* With credential passing enabled and no address bound yet,
         * autobind the socket first.
         */
        if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
             test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
            !READ_ONCE(u->addr)) {
                err = unix_autobind(sk);
                if (err)
                        goto out;
        }

        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

        /* First of all allocate resources.
           If we will make it after state is locked,
           we will have to recheck all again in any case.
         */

        /* create new sock for complete connection */
        newsk = unix_create1(net, NULL, 0, sock->type);
        if (IS_ERR(newsk)) {
                err = PTR_ERR(newsk);
                /* Keep the cleanup at "out" from releasing an error pointer. */
                newsk = NULL;
                goto out;
        }

        err = -ENOMEM;

        /* Allocate skb for sending to listening sock */
        skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
        if (skb == NULL)
                goto out;

restart:
        /*  Find listening sock. */
        other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
        if (IS_ERR(other)) {
                err = PTR_ERR(other);
                /* Keep the cleanup at "out" from calling sock_put() on it. */
                other = NULL;
                goto out;
        }

        unix_state_lock(other);

        /* Apparently VFS overslept socket death. Retry. */
        if (sock_flag(other, SOCK_DEAD)) {
                unix_state_unlock(other);
                sock_put(other);
                goto restart;
        }

        err = -ECONNREFUSED;
        if (other->sk_state != TCP_LISTEN)
                goto out_unlock;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (unix_recvq_full_lockless(other)) {
                err = -EAGAIN;
                if (!timeo)
                        goto out_unlock;

                /* unix_wait_for_peer() releases other's state lock. */
                timeo = unix_wait_for_peer(other, timeo);

                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        goto out;
                /* Drop the reference and look the listener up again. */
                sock_put(other);
                goto restart;
        }

        /* self connect and simultaneous connect are eliminated
         * by rejecting TCP_LISTEN socket to avoid deadlock.
         */
        state = READ_ONCE(sk->sk_state);
        if (unlikely(state != TCP_CLOSE)) {
                err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
                goto out_unlock;
        }

        unix_state_lock(sk);

        /* Re-check the state now that sk's state lock is held. */
        if (unlikely(sk->sk_state != TCP_CLOSE)) {
                err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
                unix_state_unlock(sk);
                goto out_unlock;
        }

        err = security_unix_stream_connect(sk, other, newsk);
        if (err) {
                unix_state_unlock(sk);
                goto out_unlock;
        }

        /* The way is open! Fastly set all the necessary fields... */

        sock_hold(sk);
        unix_peer(newsk)        = sk;
        newsk->sk_state                = TCP_ESTABLISHED;
        newsk->sk_type                = sk->sk_type;
        init_peercred(newsk);
        newu = unix_sk(newsk);
        newu->listener = other;
        RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
        otheru = unix_sk(other);

        /* copy address information from listening to new sock
         *
         * The contents of *(otheru->addr) and otheru->path
         * are seen fully set up here, since we have found
         * otheru in hash under its lock.  Insertion into the
         * hash chain we'd found it in had been done in an
         * earlier critical area protected by the chain's lock,
         * the same one where we'd set *(otheru->addr) contents,
         * as well as otheru->path and otheru->addr itself.
         *
         * Using smp_store_release() here to set newu->addr
         * is enough to make those stores, as well as stores
         * to newu->path visible to anyone who gets newu->addr
         * by smp_load_acquire().  IOW, the same warranties
         * as for unix_sock instances bound in unix_bind() or
         * in unix_autobind().
         */
        if (otheru->path.dentry) {
                path_get(&otheru->path);
                newu->path = otheru->path;
        }
        refcount_inc(&otheru->addr->refcnt);
        smp_store_release(&newu->addr, otheru->addr);

        /* Set credentials */
        copy_peercred(sk, other);

        sock->state        = SS_CONNECTED;
        WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
        sock_hold(newsk);

        smp_mb__after_atomic();        /* sock_hold() does an atomic_inc() */
        unix_peer(sk)        = newsk;

        unix_state_unlock(sk);

        /* take ten and send info to listening sock */
        spin_lock(&other->sk_receive_queue.lock);
        __skb_queue_tail(&other->sk_receive_queue, skb);
        spin_unlock(&other->sk_receive_queue.lock);
        unix_state_unlock(other);
        other->sk_data_ready(other);
        sock_put(other);
        return 0;

out_unlock:
        if (other)
                unix_state_unlock(other);

out:
        /* Undo whatever was set up before the failure. */
        kfree_skb(skb);
        if (newsk)
                unix_release_sock(newsk, 0);
        if (other)
                sock_put(other);
        return err;
}

/*
 * Wire two sockets together as a connected pair.
 *
 * A reference is taken on each sock, each is made the other's peer, peer
 * credentials are initialised on both, and both the sock and socket states
 * are set to connected.  Always returns 0.
 */
static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
        struct sock *first = socka->sk;
        struct sock *second = sockb->sk;

        /* Take the references, then point the two ends at each other. */
        sock_hold(first);
        sock_hold(second);
        unix_peer(first) = second;
        unix_peer(second) = first;

        init_peercred(first);
        init_peercred(second);

        first->sk_state = TCP_ESTABLISHED;
        second->sk_state = TCP_ESTABLISHED;

        socka->state = SS_CONNECTED;
        sockb->state = SS_CONNECTED;

        return 0;
}

/*
 * Propagate the SOCK_PASSCRED, SOCK_PASSPIDFD and SOCK_PASSSEC flag bits
 * from @old to @new.  Bits are only ever set in @new, never cleared.
 */
static void unix_sock_inherit_flags(const struct socket *old,
                                    struct socket *new)
{
        static const int inherited[] = {
                SOCK_PASSCRED,
                SOCK_PASSPIDFD,
                SOCK_PASSSEC,
        };
        unsigned int i;

        for (i = 0; i < sizeof(inherited) / sizeof(inherited[0]); i++) {
                if (test_bit(inherited[i], &old->flags))
                        set_bit(inherited[i], &new->flags);
        }
}

/*
 * Accept a pending connection on the listening socket @sock and attach it
 * to @newsock.
 *
 * The sock to accept is taken from the ->sk field of the skb dequeued from
 * the listener's receive queue.  @arg->flags supplies O_NONBLOCK and
 * @arg->err receives the error code.
 *
 * Returns 0 on success, otherwise @arg->err: -EOPNOTSUPP for socket types
 * other than SOCK_STREAM/SOCK_SEQPACKET, -EINVAL if the socket is not
 * listening or no skb came back without an error being set, or the error
 * reported by skb_recv_datagram().
 */
static int unix_accept(struct socket *sock, struct socket *newsock,
                       struct proto_accept_arg *arg)
{
        struct sock *sk = sock->sk;
        struct sk_buff *req;
        struct sock *tsk;
        int recv_flags;

        arg->err = -EOPNOTSUPP;
        if (!(sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET))
                return arg->err;

        arg->err = -EINVAL;
        if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
                return arg->err;

        /* If socket state is TCP_LISTEN it cannot change (for now...),
         * so that no locks are necessary.
         */

        recv_flags = (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
        req = skb_recv_datagram(sk, recv_flags, &arg->err);
        if (!req) {
                /* No skb and no error: this means receive shutdown. */
                if (arg->err == 0)
                        arg->err = -EINVAL;
                return arg->err;
        }

        tsk = req->sk;
        skb_free_datagram(sk, req);
        wake_up_interruptible(&unix_sk(sk)->peer_wait);

        /* Attach the accepted sock to the new socket under its state lock. */
        unix_state_lock(tsk);
        unix_update_edges(unix_sk(tsk));
        newsock->state = SS_CONNECTED;
        unix_sock_inherit_flags(sock, newsock);
        sock_graft(tsk, newsock);
        unix_state_unlock(tsk);

        return 0;
}


/*
 * Copy the address of @sock, or of its peer when @peer is non-zero, into
 * @uaddr.
 *
 * A socket without an address reports only the AF_UNIX family with an
 * empty path.  For a socket with an address, the matching cgroup BPF
 * getpeername/getsockname hook is run and is handed a pointer to the
 * length.
 *
 * Returns the address length, or -ENOTCONN when the peer address was
 * requested and there is no peer.
 */
static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
        DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
        struct sock *sk = sock->sk;
        struct unix_address *addr;
        int len;

        /* Either way a reference on the sock being inspected is held. */
        if (peer) {
                sk = unix_peer_get(sk);
                if (!sk)
                        return -ENOTCONN;
        } else {
                sock_hold(sk);
        }

        addr = smp_load_acquire(&unix_sk(sk)->addr);
        if (addr) {
                len = addr->len;
                memcpy(sunaddr, addr->name, addr->len);

                if (peer)
                        BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &len,
                                               CGROUP_UNIX_GETPEERNAME);
                else
                        BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &len,
                                               CGROUP_UNIX_GETSOCKNAME);
        } else {
                sunaddr->sun_family = AF_UNIX;
                sunaddr->sun_path[0] = 0;
                len = offsetof(struct sockaddr_un, sun_path);
        }

        sock_put(sk);
        return len;
}

/* Check whether the current user has too many file descriptors in flight.
 *
 * "user->unix_inflight" is protected by the garbage collection lock but
 * is read here without it, so a limit violation may be noticed slightly
 * late across threads.  That small race is accepted.
 *
 * Returns true only when the in-flight count exceeds @p's RLIMIT_NOFILE
 * and the caller has neither CAP_SYS_RESOURCE nor CAP_SYS_ADMIN.
 */
static inline bool too_many_unix_fds(struct task_struct *p)
{
        struct user_struct *user = current_user();

        if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) {
                /* Over the limit: only a privileged caller may continue. */
                return !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN));
        }

        return false;
}

/*
 * Move the file list from @scm into @skb's control block.
 *
 * Returns -ETOOMANYREFS when too_many_unix_fds() trips (nothing is moved
 * in that case), -ENOMEM when unix_prepare_fpl() fails, 0 otherwise.
 * Once the list has been moved, @scm->fp is NULL even on -ENOMEM.
 */
static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
        if (too_many_unix_fds(current))
                return -ETOOMANYREFS;

        /* The skb takes over the list. */
        UNIXCB(skb).fp = scm->fp;
        scm->fp = NULL;

        return unix_prepare_fpl(UNIXCB(skb).fp) ? -ENOMEM : 0;
}

/*
 * Move the file list from @skb's control block back into @scm, leaving the
 * skb with none, and pass the list to unix_destroy_fpl().
 */
static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
        struct scm_fp_list *fpl = UNIXCB(skb).fp;

        scm->fp = fpl;
        UNIXCB(skb).fp = NULL;

        unix_destroy_fpl(fpl);
}

/*
 * Store in @scm the result of scm_fp_dup() on @skb's file list; the skb's
 * own pointer is left untouched.
 */
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
        struct scm_fp_list *dup = scm_fp_dup(UNIXCB(skb).fp);

        scm->fp = dup;
}

static void unix_destruct_scm(struct sk_buff *skb)
{
        struct scm_cookie scm;

        memset(&scm, 0, sizeof(scm));
        scm.pid  = UNIXCB(skb).pid;
        if (UNIXCB(skb).fp)
                unix_detach_fds(&scm, skb);

        /* Alas, it calls VFS */
        /* So fscking what? fput() had been SMP-safe since the last Summer */
        scm_destroy(&scm);
        sock_wfree(skb);
}

/*
 * Fill @skb's control block from @scm: a reference to the pid, the uid and
 * gid, security data, and (when @send_fds is set and @scm carries files)
 * the file list.  unix_destruct_scm() is installed as the skb destructor
 * whether or not attaching the files succeeded.
 *
 * Returns 0, or the error from unix_attach_fds().
 */
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
{
        int ret = 0;

        UNIXCB(skb).pid = get_pid(scm->pid);
        UNIXCB(skb).uid = scm->creds.uid;
        UNIXCB(skb).gid = scm->creds.gid;
        UNIXCB(skb).fp = NULL;
        unix_get_secdata(scm, skb);

        if (send_fds && scm->fp)
                ret = unix_attach_fds(scm, skb);

        skb->destructor = unix_destruct_scm;

        return ret;
}

static bool unix_passcred_enabled(const struct socket *sock,
                                  const struct sock *other)
{
        return test_bit(SOCK_PASSCRED, &sock->flags) ||
               test_bit(SOCK_PASSPIDFD, &sock->flags) ||
               !other->sk_socket ||
               test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
               test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
}

/*
 * Some applications rely on a plain write() delivering SCM_CREDENTIALS.
 * If the skb carries no pid yet and unix_passcred_enabled() says so for
 * this source/destination pair, stamp it with the current task's tgid
 * and uid/gid.
 */
static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
                            const struct sock *other)
{
        /* Credentials already present, or nobody asked for them. */
        if (UNIXCB(skb).pid || !unix_passcred_enabled(sock, other))
                return;

        UNIXCB(skb).pid = get_pid(task_tgid(current));
        current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
}

/*
 * Compare the credentials recorded in @skb with those in @scm: pid, uid,
 * gid and security data must all match.  Checks stop at the first
 * mismatch.
 */
static bool unix_skb_scm_eq(struct sk_buff *skb,
                            struct scm_cookie *scm)
{
        bool same = UNIXCB(skb).pid == scm->pid;

        if (same)
                same = uid_eq(UNIXCB(skb).uid, scm->creds.uid);
        if (same)
                same = gid_eq(UNIXCB(skb).gid, scm->creds.gid);
        if (same)
                same = unix_secdata_eq(scm, skb);

        return same;
}

/*
 * Account for the files carried by @skb on @sk: add their count to the
 * socket's scm_stat.nr_fds and call unix_add_edges().  Does nothing when
 * the skb has no file list or an empty one.
 */
static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
{
        struct unix_sock *u = unix_sk(sk);
        struct scm_fp_list *fpl = UNIXCB(skb).fp;

        if (!unlikely(fpl && fpl->count))
                return;

        atomic_add(fpl->count, &u->scm_stat.nr_fds);
        unix_add_edges(fpl, u);
}

/*
 * Reverse of scm_stat_add(): subtract the number of files carried by @skb
 * from @sk's scm_stat.nr_fds and call unix_del_edges().  Does nothing when
 * the skb has no file list or an empty one.
 */
static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
{
        struct unix_sock *u = unix_sk(sk);
        struct scm_fp_list *fpl = UNIXCB(skb).fp;

        if (!unlikely(fpl && fpl->count))
                return;

        atomic_sub(fpl->count, &u->scm_stat.nr_fds);
        unix_del_edges(fpl);
}

/*
 *        Send AF_UNIX data.
 */

/*
 * Send one message of @len bytes from @sock, either to the address in
 * @msg->msg_name or, when no address is given, to the connected peer.
 * unix_seqpacket_sendmsg() also ends up here.
 *
 * Returns @len on success (and also when sk_filter() rejects the skb, so
 * the sender sees no error), or a negative errno such as -EOPNOTSUPP
 * (MSG_OOB), -ENOTCONN (no address and no peer), -EMSGSIZE, -ECONNRESET,
 * -EPERM, -EPIPE, -ECONNREFUSED, -EAGAIN, or the error of a helper.
 */
static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
                              size_t len)
{
        DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
        struct sock *sk = sock->sk, *other = NULL;
        struct unix_sock *u = unix_sk(sk);
        struct scm_cookie scm;
        struct sk_buff *skb;
        int data_len = 0;
        int sk_locked;
        long timeo;
        int err;

        err = scm_send(sock, msg, &scm, false);
        if (err < 0)
                return err;

        wait_for_unix_gc(scm.fp);

        err = -EOPNOTSUPP;
        if (msg->msg_flags&MSG_OOB)
                goto out;

        if (msg->msg_namelen) {
                err = unix_validate_addr(sunaddr, msg->msg_namelen);
                if (err)
                        goto out;

                err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
                                                            msg->msg_name,
                                                            &msg->msg_namelen,
                                                            NULL);
                if (err)
                        goto out;
        } else {
                /* No destination given: use the connected peer, taking a
                 * reference on it.
                 */
                sunaddr = NULL;
                err = -ENOTCONN;
                other = unix_peer_get(sk);
                if (!other)
                        goto out;
        }

        /* With credential passing enabled and no address bound yet,
         * autobind the socket first.
         */
        if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
             test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
            !READ_ONCE(u->addr)) {
                err = unix_autobind(sk);
                if (err)
                        goto out;
        }

        err = -EMSGSIZE;
        if (len > READ_ONCE(sk->sk_sndbuf) - 32)
                goto out;

        /* For messages above SKB_MAX_ALLOC, put the excess (capped at
         * MAX_SKB_FRAGS pages and page aligned) into the paged part.
         */
        if (len > SKB_MAX_ALLOC) {
                data_len = min_t(size_t,
                                 len - SKB_MAX_ALLOC,
                                 MAX_SKB_FRAGS * PAGE_SIZE);
                data_len = PAGE_ALIGN(data_len);

                BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
        }

        skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
                                   msg->msg_flags & MSG_DONTWAIT, &err,
                                   PAGE_ALLOC_COSTLY_ORDER);
        if (skb == NULL)
                goto out;

        err = unix_scm_to_skb(&scm, skb, true);
        if (err < 0)
                goto out_free;

        skb_put(skb, len - data_len);
        skb->data_len = data_len;
        skb->len = len;
        err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
        if (err)
                goto out_free;

        timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

restart:
        /* Without a peer reference, look the destination up by address.
         * With neither peer nor address the send fails with -ECONNRESET.
         */
        if (!other) {
                err = -ECONNRESET;
                if (sunaddr == NULL)
                        goto out_free;

                other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
                                        sk->sk_type);
                if (IS_ERR(other)) {
                        err = PTR_ERR(other);
                        /* Keep "out" from calling sock_put() on an error pointer. */
                        other = NULL;
                        goto out_free;
                }
        }

        if (sk_filter(other, skb) < 0) {
                /* Toss the packet but do not return any error to the sender */
                err = len;
                goto out_free;
        }

        /* sk_locked records whether sk's state lock is held in addition to
         * other's.
         */
        sk_locked = 0;
        unix_state_lock(other);
restart_locked:
        err = -EPERM;
        if (!unix_may_send(sk, other))
                goto out_unlock;

        if (unlikely(sock_flag(other, SOCK_DEAD))) {
                /*
                 *        Check with 1003.1g - what should
                 *        datagram error
                 */
                unix_state_unlock(other);
                sock_put(other);

                if (!sk_locked)
                        unix_state_lock(sk);

                err = 0;
                if (sk->sk_type == SOCK_SEQPACKET) {
                        /* We are here only when racing with unix_release_sock()
                         * is clearing @other. Never change state to TCP_CLOSE
                         * unlike SOCK_DGRAM wants.
                         */
                        unix_state_unlock(sk);
                        err = -EPIPE;
                } else if (unix_peer(sk) == other) {
                        /* The dead socket was our connected peer: disconnect
                         * and report -ECONNREFUSED.
                         */
                        unix_peer(sk) = NULL;
                        unix_dgram_peer_wake_disconnect_wakeup(sk, other);

                        WRITE_ONCE(sk->sk_state, TCP_CLOSE);
                        unix_state_unlock(sk);

                        unix_dgram_disconnected(sk, other);
                        sock_put(other);
                        err = -ECONNREFUSED;
                } else {
                        unix_state_unlock(sk);
                }

                /* err == 0 here means: retry with a fresh lookup. */
                other = NULL;
                if (err)
                        goto out_free;
                goto restart;
        }

        err = -EPIPE;
        if (other->sk_shutdown & RCV_SHUTDOWN)
                goto out_unlock;

        if (sk->sk_type != SOCK_SEQPACKET) {
                err = security_unix_may_send(sk->sk_socket, other->sk_socket);
                if (err)
                        goto out_unlock;
        }

        /* other == sk && unix_peer(other) != sk if
         * - unix_peer(sk) == NULL, destination address bound to sk
         * - unix_peer(sk) == sk by time of get but disconnected before lock
         */
        if (other != sk &&
            unlikely(unix_peer(other) != sk &&
            unix_recvq_full_lockless(other))) {
                if (timeo) {
                        /* unix_wait_for_peer() releases other's state lock;
                         * the reference on other is kept for the retry.
                         */
                        timeo = unix_wait_for_peer(other, timeo);

                        err = sock_intr_errno(timeo);
                        if (signal_pending(current))
                                goto out_free;

                        goto restart;
                }

                /* No timeout: take sk's state lock as well before checking
                 * the peer relationship.
                 */
                if (!sk_locked) {
                        unix_state_unlock(other);
                        unix_state_double_lock(sk, other);
                }

                if (unix_peer(sk) != other ||
                    unix_dgram_peer_wake_me(sk, other)) {
                        err = -EAGAIN;
                        sk_locked = 1;
                        goto out_unlock;
                }

                /* First pass with both locks held: redo the checks above. */
                if (!sk_locked) {
                        sk_locked = 1;
                        goto restart_locked;
                }
        }

        if (unlikely(sk_locked))
                unix_state_unlock(sk);

        if (sock_flag(other, SOCK_RCVTSTAMP))
                __net_timestamp(skb);
        maybe_add_creds(skb, sock, other);
        scm_stat_add(other, skb);
        skb_queue_tail(&other->sk_receive_queue, skb);
        unix_state_unlock(other);
        other->sk_data_ready(other);
        sock_put(other);
        scm_destroy(&scm);
        return len;

out_unlock:
        if (sk_locked)
                unix_state_unlock(sk);
        unix_state_unlock(other);
out_free:
        kfree_skb(skb);
out:
        if (other)
                sock_put(other);
        scm_destroy(&scm);
        return err;
}

/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 *
 * The value is PAGE_SIZE shifted left by get_order(32768); it is used in
 * unix_stream_sendmsg() to cap the per-skb size and to pick the maximum
 * page allocation order.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Queue a single out-of-band byte from @msg on @other.
 *
 * @sock:     sending socket
 * @msg:      message whose iterator supplies the byte
 * @other:    receiving sock
 * @scm:      control data to attach to the skb
 * @fds_sent: true when the file descriptors in @scm were already sent with
 *            an earlier skb, in which case they are not attached again
 *
 * Returns 0 on success, -EPIPE if @other is dead or has shut down its
 * receive side, or the error from allocation, unix_scm_to_skb() or the
 * copy from the iterator.
 */
static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
                     struct scm_cookie *scm, bool fds_sent)
{
        struct unix_sock *ousk = unix_sk(other);
        struct sk_buff *skb;
        int err = 0;

        skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);

        if (!skb)
                return err;

        err = unix_scm_to_skb(scm, skb, !fds_sent);
        if (err < 0) {
                kfree_skb(skb);
                return err;
        }
        skb_put(skb, 1);
        err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);

        if (err) {
                kfree_skb(skb);
                return err;
        }

        unix_state_lock(other);

        if (sock_flag(other, SOCK_DEAD) ||
            (other->sk_shutdown & RCV_SHUTDOWN)) {
                unix_state_unlock(other);
                kfree_skb(skb);
                return -EPIPE;
        }

        maybe_add_creds(skb, sock, other);
        /* Extra reference: the skb is both stored in ousk->oob_skb and
         * linked on the receive queue below.
         */
        skb_get(skb);

        scm_stat_add(other, skb);

        spin_lock(&other->sk_receive_queue.lock);
        /* Release the previously recorded OOB skb, if any, and replace it. */
        if (ousk->oob_skb)
                consume_skb(ousk->oob_skb);
        WRITE_ONCE(ousk->oob_skb, skb);
        __skb_queue_tail(&other->sk_receive_queue, skb);
        spin_unlock(&other->sk_receive_queue.lock);

        sk_send_sigurg(other);
        unix_state_unlock(other);
        other->sk_data_ready(other);

        return err;
}
#endif

/*
 * Send @len bytes from @msg on a connected stream socket.
 *
 * The data is split into skbs that are queued one by one on the peer's
 * receive queue.  File descriptors in the control message are attached
 * only to the first skb.  With MSG_OOB (when CONFIG_AF_UNIX_OOB is
 * enabled) the last byte is sent separately through queue_oob().
 *
 * Returns the number of bytes sent if any were sent, otherwise a negative
 * errno: -EOPNOTSUPP (MSG_OOB unsupported or zero length, or an address
 * given on an unconnected socket), -EISCONN (address given on a connected
 * socket), -ENOTCONN (no peer), -EPIPE (send side shut down or peer
 * gone), or the error of a helper.  SIGPIPE is raised on -EPIPE when
 * nothing was sent and MSG_NOSIGNAL is not set.
 */
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
                               size_t len)
{
        struct sock *sk = sock->sk;
        struct sock *other = NULL;
        int err, size;
        struct sk_buff *skb;
        int sent = 0;
        struct scm_cookie scm;
        bool fds_sent = false;
        int data_len;

        err = scm_send(sock, msg, &scm, false);
        if (err < 0)
                return err;

        wait_for_unix_gc(scm.fp);

        err = -EOPNOTSUPP;
        if (msg->msg_flags & MSG_OOB) {
                /* With OOB support, hold back the last byte for queue_oob();
                 * a zero-length OOB send, or OOB without support, fails.
                 */
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
                if (len)
                        len--;
                else
#endif
                        goto out_err;
        }

        /* A destination address is never accepted here. */
        if (msg->msg_namelen) {
                err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
                goto out_err;
        } else {
                err = -ENOTCONN;
                other = unix_peer(sk);
                if (!other)
                        goto out_err;
        }

        if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
                goto pipe_err;

        while (sent < len) {
                size = len - sent;

                if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
                        /* Pages are spliced in below, so no data space is
                         * requested for the skb.
                         */
                        skb = sock_alloc_send_pskb(sk, 0, 0,
                                                   msg->msg_flags & MSG_DONTWAIT,
                                                   &err, 0);
                } else {
                        /* Keep two messages in the pipe so it schedules better */
                        size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);

                        /* allow fallback to order-0 allocations */
                        size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

                        /* Whatever exceeds SKB_MAX_HEAD(0) goes into the
                         * paged part, page aligned but never more than size.
                         */
                        data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

                        data_len = min_t(size_t, size, PAGE_ALIGN(data_len));

                        skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
                                                   msg->msg_flags & MSG_DONTWAIT, &err,
                                                   get_order(UNIX_SKB_FRAGS_SZ));
                }
                if (!skb)
                        goto out_err;

                /* Only send the fds in the first buffer */
                err = unix_scm_to_skb(&scm, skb, !fds_sent);
                if (err < 0) {
                        kfree_skb(skb);
                        goto out_err;
                }
                fds_sent = true;

                if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
                        err = skb_splice_from_iter(skb, &msg->msg_iter, size,
                                                   sk->sk_allocation);
                        if (err < 0) {
                                kfree_skb(skb);
                                goto out_err;
                        }
                        /* The splice result is the byte count of this skb;
                         * add it to the socket's write memory count.
                         */
                        size = err;
                        refcount_add(size, &sk->sk_wmem_alloc);
                } else {
                        skb_put(skb, size - data_len);
                        skb->data_len = data_len;
                        skb->len = size;
                        err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
                        if (err) {
                                kfree_skb(skb);
                                goto out_err;
                        }
                }

                unix_state_lock(other);

                if (sock_flag(other, SOCK_DEAD) ||
                    (other->sk_shutdown & RCV_SHUTDOWN))
                        goto pipe_err_free;

                maybe_add_creds(skb, sock, other);
                scm_stat_add(other, skb);
                skb_queue_tail(&other->sk_receive_queue, skb);
                unix_state_unlock(other);
                other->sk_data_ready(other);
                sent += size;
        }

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
        if (msg->msg_flags & MSG_OOB) {
                err = queue_oob(sock, msg, other, &scm, fds_sent);
                if (err)
                        goto out_err;
                sent++;
        }
#endif

        scm_destroy(&scm);

        return sent;

pipe_err_free:
        unix_state_unlock(other);
        kfree_skb(skb);
pipe_err:
        if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
                send_sig(SIGPIPE, current, 0);
        err = -EPIPE;
out_err:
        scm_destroy(&scm);
        /* A partial send reports the bytes sent rather than the error. */
        return sent ? : err;
}

/*
 * sendmsg for SOCK_SEQPACKET: report any pending socket error, require an
 * established connection, discard a caller-supplied destination address
 * and then defer to unix_dgram_sendmsg().
 *
 * Returns the pending socket error, -ENOTCONN, or whatever
 * unix_dgram_sendmsg() returns.
 */
static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
                                  size_t len)
{
        struct sock *sk = sock->sk;
        int pending = sock_error(sk);

        if (pending)
                return pending;

        if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
                return -ENOTCONN;

        /* Any address passed in is ignored. */
        if (msg->msg_namelen != 0)
                msg->msg_namelen = 0;

        return unix_dgram_sendmsg(sock, msg, len);
}

/*
 * recvmsg() handler for SOCK_SEQPACKET sockets: only a socket in
 * TCP_ESTABLISHED state may receive; anything else yields -ENOTCONN.
 * The actual work is done by the datagram receive path.
 */
static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
                                  size_t size, int flags)
{
        struct sock *sk = sock->sk;

        if (READ_ONCE(sk->sk_state) == TCP_ESTABLISHED)
                return unix_dgram_recvmsg(sock, msg, size, flags);

        return -ENOTCONN;
}

/*
 * Copy the address attached to @sk, if any, into @msg->msg_name and record
 * its length in @msg->msg_namelen.  @msg is left untouched when @sk has no
 * address.
 */
static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
{
        struct unix_address *bound = smp_load_acquire(&unix_sk(sk)->addr);

        if (!bound)
                return;

        msg->msg_namelen = bound->len;
        memcpy(msg->msg_name, bound->name, bound->len);
}

/*
 * Receive a single datagram queued on @sk and copy it into @msg.
 *
 * @sk:    socket whose receive queue is read
 * @msg:   destination message header; msg_name, msg_namelen and msg_flags
 *         may be updated
 * @size:  maximum number of payload bytes to copy
 * @flags: MSG_* receive flags
 *
 * Returns the number of bytes copied, or the remaining datagram length when
 * MSG_TRUNC is set in @flags.  Returns -EOPNOTSUPP when MSG_OOB is
 * requested, 0 for a SOCK_SEQPACKET socket with RCV_SHUTDOWN set that would
 * otherwise report -EAGAIN, and a negative errno on other failures.
 */
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
                         int flags)
{
        struct scm_cookie scm;
        struct socket *sock = sk->sk_socket;
        struct unix_sock *u = unix_sk(sk);
        struct sk_buff *skb, *last;
        long timeo;
        int skip;
        int err;

        /* Out-of-band data is not handled on this path. */
        err = -EOPNOTSUPP;
        if (flags&MSG_OOB)
                goto out;

        timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

        /*
         * Try to dequeue under iolock; the mutex is dropped before waiting
         * for more packets and is still held only when the loop exits with
         * an skb in hand.
         */
        do {
                mutex_lock(&u->iolock);

                skip = sk_peek_offset(sk, flags);
                skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
                                              &skip, &err, &last);
                if (skb) {
                        if (!(flags & MSG_PEEK))
                                scm_stat_del(sk, skb);
                        break;
                }

                mutex_unlock(&u->iolock);

                if (err != -EAGAIN)
                        break;
        } while (timeo &&
                 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
                                              &err, &timeo, last));

        if (!skb) { /* implies iolock unlocked */
                unix_state_lock(sk);
                /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
                if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
                    (sk->sk_shutdown & RCV_SHUTDOWN))
                        err = 0;
                unix_state_unlock(sk);
                goto out;
        }

        /* A datagram was taken: wake anyone sleeping on peer_wait for write space. */
        if (wq_has_sleeper(&u->peer_wait))
                wake_up_interruptible_sync_poll(&u->peer_wait,
                                                EPOLLOUT | EPOLLWRNORM |
                                                EPOLLWRBAND);

        /* Report the sender's address only if the caller supplied a buffer. */
        if (msg->msg_name) {
                unix_copy_addr(msg, skb->sk);

                BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
                                                      msg->msg_name,
                                                      &msg->msg_namelen);
        }

        /* Clamp to what is left after the peek offset; flag truncation. */
        if (size > skb->len - skip)
                size = skb->len - skip;
        else if (size < skb->len - skip)
                msg->msg_flags |= MSG_TRUNC;

        err = skb_copy_datagram_msg(skb, skip, msg, size);
        if (err)
                goto out_free;

        if (sock_flag(sk, SOCK_RCVTSTAMP))
                __sock_recv_timestamp(msg, sk, skb);

        memset(&scm, 0, sizeof(scm));

        scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
        unix_set_secdata(&scm, skb);

        if (!(flags & MSG_PEEK)) {
                /* Real read: passed fds move from the skb to the scm cookie. */
                if (UNIXCB(skb).fp)
                        unix_detach_fds(&scm, skb);

                sk_peek_offset_bwd(sk, skb->len);
        } else {
                /* It is questionable: on PEEK we could:
                   - do not return fds - good, but too simple 8)
                   - return fds, and do not return them on read (old strategy,
                     apparently wrong)
                   - clone fds (I chose it for now, it is the most universal
                     solution)

                   POSIX 1003.1g does not actually define this clearly
                   at all. POSIX 1003.1g doesn't define a lot of things
                   clearly however!

                */

                sk_peek_offset_fwd(sk, size);

                if (UNIXCB(skb).fp)
                        unix_peek_fds(&scm, skb);
        }
        /* With MSG_TRUNC the caller gets the full remaining length, not the copied size. */
        err = (flags & MSG_TRUNC) ? skb->len - skip : size;

        scm_recv_unix(sock, msg, &scm, flags);

out_free:
        skb_free_datagram(sk, skb);
        mutex_unlock(&u->iolock);
out:
        return err;
}

/*
 * recvmsg() entry point for datagram sockets.
 *
 * With CONFIG_BPF_SYSCALL, a socket whose sk_prot is not unix_dgram_proto
 * has the receive routed through that proto's ->recvmsg(); in every other
 * case the native datagram receive path is used.
 */
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                              int flags)
{
        struct sock *sk = sock->sk;

#ifdef CONFIG_BPF_SYSCALL
        const struct proto *active = READ_ONCE(sk->sk_prot);

        if (active != &unix_dgram_proto)
                return active->recvmsg(sk, msg, size, flags, NULL);
#endif

        return __unix_dgram_recvmsg(sk, msg, size, flags);
}

/*
 * Dequeue one skb from @sk without blocking (MSG_DONTWAIT) while holding
 * iolock, then pass it to @recv_actor outside the mutex.  Returns the
 * actor's result, or the dequeue error when no skb was obtained.
 */
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
        struct unix_sock *u = unix_sk(sk);
        struct sk_buff *skb;
        int err;

        mutex_lock(&u->iolock);
        skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
        mutex_unlock(&u->iolock);

        return skb ? recv_actor(sk, skb) : err;
}

/*
 *        Sleep until more data has arrived. But check for races..
 *
 *        @sk:        socket to wait on
 *        @timeo:     remaining timeout handed to schedule_timeout()
 *        @last:      tail skb of the receive queue as seen by the caller
 *        @last_len:  length of @last as seen by the caller
 *        @freezable: when true, TASK_FREEZABLE is added to the sleep state
 *
 *        Returns the timeout value left when the wait ends.
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
                                  struct sk_buff *last, unsigned int last_len,
                                  bool freezable)
{
        unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
        struct sk_buff *tail;
        DEFINE_WAIT(wait);

        unix_state_lock(sk);

        for (;;) {
                prepare_to_wait(sk_sleep(sk), &wait, state);

                /*
                 * Stop waiting as soon as the queue tail differs from what
                 * the caller saw (new skb or changed length), an error or
                 * receive shutdown is set, a signal is pending, or the
                 * timeout is exhausted.
                 */
                tail = skb_peek_tail(&sk->sk_receive_queue);
                if (tail != last ||
                    (tail && tail->len != last_len) ||
                    sk->sk_err ||
                    (sk->sk_shutdown & RCV_SHUTDOWN) ||
                    signal_pending(current) ||
                    !timeo)
                        break;

                /* The state lock is released for the duration of the sleep. */
                sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
                unix_state_unlock(sk);
                timeo = schedule_timeout(timeo);
                unix_state_lock(sk);

                /* Leaves SOCKWQ_ASYNC_WAITDATA set when the socket is dead. */
                if (sock_flag(sk, SOCK_DEAD))
                        break;

                sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        }

        finish_wait(sk_sleep(sk), &wait);
        unix_state_unlock(sk);
        return timeo;
}

/* Number of bytes in @skb beyond the count recorded in UNIXCB(skb).consumed. */
static unsigned int unix_skb_len(const struct sk_buff *skb)
{
        unsigned int already_read = UNIXCB(skb).consumed;

        return skb->len - already_read;
}

/*
 * Parameters of one stream read, shared by the recvmsg and splice-read
 * paths and handed to the per-skb actor.
 */
struct unix_stream_read_state {
        /*
         * Per-skb transfer callback, invoked as (skb, skip, chunk, state).
         * A negative return is treated as failure by the callers; otherwise
         * the return value is the number of bytes handled.
         */
        int (*recv_actor)(struct sk_buff *, int, int,
                          struct unix_stream_read_state *);
        struct socket *socket;          /* socket being read from */
        struct msghdr *msg;             /* recvmsg destination; checked for NULL by the generic reader */
        struct pipe_inode_info *pipe;   /* destination pipe used by the splice actor */
        size_t size;                    /* maximum number of bytes to transfer */
        int flags;                      /* MSG_* receive flags */
        unsigned int splice_flags;      /* flags forwarded to skb_splice_bits() */
};

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/*
 * Deliver the pending out-of-band byte of a stream socket.
 *
 * Returns 1 and sets MSG_OOB in the message flags on success, -EINVAL when
 * SOCK_URGINLINE is set or no OOB skb is pending, and -EFAULT when the
 * receive actor reports a negative result.
 */
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
        struct socket *sock = state->socket;
        struct sock *sk = sock->sk;
        struct unix_sock *u = unix_sk(sk);
        int chunk = 1;
        struct sk_buff *oob_skb;

        /* Lock order here: iolock, then the state lock, then the queue lock. */
        mutex_lock(&u->iolock);
        unix_state_lock(sk);
        spin_lock(&sk->sk_receive_queue.lock);

        if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
                spin_unlock(&sk->sk_receive_queue.lock);
                unix_state_unlock(sk);
                mutex_unlock(&u->iolock);
                return -EINVAL;
        }

        oob_skb = u->oob_skb;

        /*
         * A real read clears u->oob_skb; a peek leaves it in place and takes
         * an extra skb reference instead.  Either way consume_skb() below
         * drops exactly one reference.
         */
        if (!(state->flags & MSG_PEEK))
                WRITE_ONCE(u->oob_skb, NULL);
        else
                skb_get(oob_skb);

        spin_unlock(&sk->sk_receive_queue.lock);
        unix_state_unlock(sk);

        /* Copy a single byte (chunk == 1) starting at offset 0. */
        chunk = state->recv_actor(oob_skb, 0, chunk, state);

        if (!(state->flags & MSG_PEEK))
                UNIXCB(oob_skb).consumed += 1;

        consume_skb(oob_skb);

        mutex_unlock(&u->iolock);

        if (chunk < 0)
                return -EFAULT;

        state->msg->msg_flags |= MSG_OOB;
        return 1;
}

/*
 * Decide which skb the stream reader should process next, given that @skb
 * is the current candidate and may be the pending out-of-band skb.
 *
 * @skb:    candidate skb from the receive queue
 * @sk:     socket owning the queue
 * @flags:  MSG_* receive flags of the read in progress
 * @copied: number of bytes already copied by the read in progress
 *
 * Returns the skb to read from, or NULL when there is nothing to read from
 * at this point.
 */
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
                                  int flags, int copied)
{
        struct unix_sock *u = unix_sk(sk);

        if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
                /* Fully consumed skb on a real read: drop it from the queue. */
                skb_unlink(skb, &sk->sk_receive_queue);
                consume_skb(skb);
                skb = NULL;
        } else {
                struct sk_buff *unlinked_skb = NULL;

                spin_lock(&sk->sk_receive_queue.lock);

                if (skb == u->oob_skb) {
                        if (copied) {
                                /* Data was already copied: stop before the OOB skb. */
                                skb = NULL;
                        } else if (!(flags & MSG_PEEK)) {
                                if (sock_flag(sk, SOCK_URGINLINE)) {
                                        /* Inline mode: clear the OOB marker, drop one reference and keep reading this skb. */
                                        WRITE_ONCE(u->oob_skb, NULL);
                                        consume_skb(skb);
                                } else {
                                        /* Not inline: remove the OOB skb and continue with the new queue head. */
                                        __skb_unlink(skb, &sk->sk_receive_queue);
                                        WRITE_ONCE(u->oob_skb, NULL);
                                        unlinked_skb = skb;
                                        skb = skb_peek(&sk->sk_receive_queue);
                                }
                        } else if (!sock_flag(sk, SOCK_URGINLINE)) {
                                /* Peek without inline mode: step over the OOB skb. */
                                skb = skb_peek_next(skb, &sk->sk_receive_queue);
                        }
                }

                spin_unlock(&sk->sk_receive_queue.lock);

                /* Release the unlinked skb only after the queue lock is dropped. */
                if (unlinked_skb) {
                        WARN_ON_ONCE(skb_unref(unlinked_skb));
                        kfree_skb(unlinked_skb);
                }
        }
        return skb;
}
#endif

/*
 * Stream flavour of unix_read_skb(): refuses with -ENOTCONN unless the
 * socket is in TCP_ESTABLISHED state.
 */
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
        if (likely(READ_ONCE(sk->sk_state) == TCP_ESTABLISHED))
                return unix_read_skb(sk, recv_actor);

        return -ENOTCONN;
}

/*
 * Common implementation of stream reads (recvmsg and splice).
 *
 * @state:     read parameters, including the per-skb actor that moves data
 * @freezable: forwarded to unix_stream_data_wait() when the read sleeps
 *
 * Walks the receive queue under iolock, handing each skb (or part of it) to
 * state->recv_actor until the requested size is satisfied or a stop
 * condition is hit.
 *
 * Returns the number of bytes transferred if non-zero, otherwise the error
 * code (0 when nothing was read and no error was recorded).
 */
static int unix_stream_read_generic(struct unix_stream_read_state *state,
                                    bool freezable)
{
        struct scm_cookie scm;
        struct socket *sock = state->socket;
        struct sock *sk = sock->sk;
        struct unix_sock *u = unix_sk(sk);
        int copied = 0;
        int flags = state->flags;
        int noblock = flags & MSG_DONTWAIT;
        bool check_creds = false;
        int target;
        int err = 0;
        long timeo;
        int skip;
        size_t size = state->size;
        unsigned int last_len;

        if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
                err = -EINVAL;
                goto out;
        }

        if (unlikely(flags & MSG_OOB)) {
                /* -EOPNOTSUPP stands unless OOB support is compiled in. */
                err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
                err = unix_stream_recv_urg(state);
#endif
                goto out;
        }

        target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
        timeo = sock_rcvtimeo(sk, noblock);

        memset(&scm, 0, sizeof(scm));

        /* Lock the socket to prevent queue disordering
         * while the copy to the message sleeps
         */
        mutex_lock(&u->iolock);

        /* A negative peek offset is treated as no offset. */
        skip = max(sk_peek_offset(sk, flags), 0);

        do {
                struct sk_buff *skb, *last;
                int chunk;

redo:
                unix_state_lock(sk);
                if (sock_flag(sk, SOCK_DEAD)) {
                        err = -ECONNRESET;
                        goto unlock;
                }
                /* Remember the queue head and its length for the wait below. */
                last = skb = skb_peek(&sk->sk_receive_queue);
                last_len = last ? last->len : 0;

again:
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
                if (skb) {
                        skb = manage_oob(skb, sk, flags, copied);
                        if (!skb && copied) {
                                unix_state_unlock(sk);
                                break;
                        }
                }
#endif
                if (skb == NULL) {
                        /* Nothing to read right now: finish if the low-water target is met. */
                        if (copied >= target)
                                goto unlock;

                        /*
                         *        POSIX 1003.1g mandates this order.
                         */

                        err = sock_error(sk);
                        if (err)
                                goto unlock;
                        if (sk->sk_shutdown & RCV_SHUTDOWN)
                                goto unlock;

                        unix_state_unlock(sk);
                        if (!timeo) {
                                err = -EAGAIN;
                                break;
                        }

                        /* iolock is not held while sleeping for more data. */
                        mutex_unlock(&u->iolock);

                        timeo = unix_stream_data_wait(sk, timeo, last,
                                                      last_len, freezable);

                        if (signal_pending(current)) {
                                /* Neither lock is held here, so bypass the common exit. */
                                err = sock_intr_errno(timeo);
                                scm_destroy(&scm);
                                goto out;
                        }

                        mutex_lock(&u->iolock);
                        goto redo;
unlock:
                        unix_state_unlock(sk);
                        break;
                }

                /* Skip over skbs that lie entirely within the peek offset. */
                while (skip >= unix_skb_len(skb)) {
                        skip -= unix_skb_len(skb);
                        last = skb;
                        last_len = skb->len;
                        skb = skb_peek_next(skb, &sk->sk_receive_queue);
                        if (!skb)
                                goto again;
                }

                unix_state_unlock(sk);

                if (check_creds) {
                        /* Never glue messages from different writers */
                        if (!unix_skb_scm_eq(skb, &scm))
                                break;
                } else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
                           test_bit(SOCK_PASSPIDFD, &sock->flags)) {
                        /* Copy credentials */
                        scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
                        unix_set_secdata(&scm, skb);
                        check_creds = true;
                }

                /* Copy address just once */
                if (state->msg && state->msg->msg_name) {
                        DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
                                         state->msg->msg_name);
                        unix_copy_addr(state->msg, skb->sk);

                        BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
                                                              state->msg->msg_name,
                                                              &state->msg->msg_namelen);

                        /* NOTE(review): this only clears the block-local pointer; msg_name itself is not reset. */
                        sunaddr = NULL;
                }

                /* Transfer at most what is left in this skb after the offset. */
                chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
                chunk = state->recv_actor(skb, skip, chunk, state);
                if (chunk < 0) {
                        /* Report -EFAULT only when nothing was transferred before. */
                        if (copied == 0)
                                copied = -EFAULT;
                        break;
                }
                copied += chunk;
                size -= chunk;

                /* Mark read part of skb as used */
                if (!(flags & MSG_PEEK)) {
                        UNIXCB(skb).consumed += chunk;

                        sk_peek_offset_bwd(sk, chunk);

                        if (UNIXCB(skb).fp) {
                                scm_stat_del(sk, skb);
                                unix_detach_fds(&scm, skb);
                        }

                        /* Stop here while this skb still has unread bytes. */
                        if (unix_skb_len(skb))
                                break;

                        skb_unlink(skb, &sk->sk_receive_queue);
                        consume_skb(skb);

                        /* Stop once file descriptors have been collected. */
                        if (scm.fp)
                                break;
                } else {
                        /* It is questionable, see note in unix_dgram_recvmsg.
                         */
                        if (UNIXCB(skb).fp)
                                unix_peek_fds(&scm, skb);

                        sk_peek_offset_fwd(sk, chunk);

                        if (UNIXCB(skb).fp)
                                break;

                        /* Peeking: move on to the next skb without dequeuing. */
                        skip = 0;
                        last = skb;
                        last_len = skb->len;
                        unix_state_lock(sk);
                        skb = skb_peek_next(skb, &sk->sk_receive_queue);
                        if (skb)
                                goto again;
                        unix_state_unlock(sk);
                        break;
                }
        } while (size);

        mutex_unlock(&u->iolock);
        /* Without a msghdr there is no recipient for the scm data, so release it. */
        if (state->msg)
                scm_recv_unix(sock, state->msg, &scm, flags);
        else
                scm_destroy(&scm);
out:
        return copied ? : err;
}

/*
 * recvmsg actor: copy @chunk bytes of @skb into state->msg, starting @skip
 * bytes past the already consumed part.  Returns @chunk on success or the
 * non-zero result of the copy.
 */
static int unix_stream_read_actor(struct sk_buff *skb,
                                  int skip, int chunk,
                                  struct unix_stream_read_state *state)
{
        int copy_err;

        copy_err = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
                                         state->msg, chunk);
        if (copy_err)
                return copy_err;

        return chunk;
}

/*
 * Stream receive on a struct sock: builds the read state from @sk's socket
 * and runs the generic stream reader with freezable sleeps.
 */
int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
                          size_t size, int flags)
{
        struct unix_stream_read_state state = {
                .socket = sk->sk_socket,
                .msg = msg,
                .size = size,
                .flags = flags,
                .recv_actor = unix_stream_read_actor,
        };

        return unix_stream_read_generic(&state, true);
}

/*
 * recvmsg() entry point for stream sockets.
 *
 * With CONFIG_BPF_SYSCALL, a socket whose sk_prot is not unix_stream_proto
 * has the receive routed through that proto's ->recvmsg(); in every other
 * case the generic stream reader is used with freezable sleeps.
 */
static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
                               size_t size, int flags)
{
        struct unix_stream_read_state state = {
                .socket = sock,
                .msg = msg,
                .size = size,
                .flags = flags,
                .recv_actor = unix_stream_read_actor,
        };

#ifdef CONFIG_BPF_SYSCALL
        struct sock *sk = sock->sk;
        const struct proto *active = READ_ONCE(sk->sk_prot);

        if (active != &unix_stream_proto)
                return active->recvmsg(sk, msg, size, flags, NULL);
#endif

        return unix_stream_read_generic(&state, true);
}

/*
 * Splice actor: hand @chunk bytes of @skb, starting @skip bytes past the
 * already consumed part, to skb_splice_bits() for state->pipe.
 */
static int unix_stream_splice_actor(struct sk_buff *skb,
                                    int skip, int chunk,
                                    struct unix_stream_read_state *state)
{
        struct sock *sk = state->socket->sk;
        unsigned int start = UNIXCB(skb).consumed + skip;

        return skb_splice_bits(skb, sk, start,
                               state->pipe, chunk, state->splice_flags);
}

/*
 * splice_read() for stream sockets.
 *
 * A non-zero *@ppos is rejected with -ESPIPE.  The read is non-blocking
 * when either the file has O_NONBLOCK set or SPLICE_F_NONBLOCK was passed.
 * Sleeps in the generic reader are not freezable on this path.
 */
static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
                                       struct pipe_inode_info *pipe,
                                       size_t size, unsigned int flags)
{
        struct unix_stream_read_state state = {
                .socket = sock,
                .pipe = pipe,
                .size = size,
                .splice_flags = flags,
                .recv_actor = unix_stream_splice_actor,
        };
        bool nonblock;

        if (unlikely(*ppos))
                return -ESPIPE;

        nonblock = (sock->file->f_flags & O_NONBLOCK) ||
                   (flags & SPLICE_F_NONBLOCK);
        if (nonblock)
                state.flags = MSG_DONTWAIT;

        return unix_stream_read_generic(&state, false);
}

/*
 * shutdown() for AF_UNIX sockets.
 *
 * @mode must be SHUT_RD, SHUT_WR or SHUT_RDWR, otherwise -EINVAL is
 * returned.  The local shutdown bits are always updated.  For stream and
 * seqpacket sockets with a peer, the mirrored bits are also set on the
 * peer: our receive side closing is the peer's send side closing, and the
 * other way round.
 */
static int unix_shutdown(struct socket *sock, int mode)
{
        struct sock *sk = sock->sk;
        struct sock *peer;

        if (mode < SHUT_RD || mode > SHUT_RDWR)
                return -EINVAL;

        /*
         * Adding one converts the SHUT_* constant into the shutdown mask:
         *   SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
         *   SHUT_WR   (1) -> SEND_SHUTDOWN (2)
         *   SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
         */
        mode += 1;

        unix_state_lock(sk);
        WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
        peer = unix_peer(sk);
        if (peer)
                sock_hold(peer);
        unix_state_unlock(sk);
        sk->sk_state_change(sk);

        if (!peer)
                return 0;

        if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
                const struct proto *peer_prot = READ_ONCE(peer->sk_prot);
                int peer_mode = 0;

                if (peer_prot->unhash)
                        peer_prot->unhash(peer);

                if (mode & RCV_SHUTDOWN)
                        peer_mode |= SEND_SHUTDOWN;
                if (mode & SEND_SHUTDOWN)
                        peer_mode |= RCV_SHUTDOWN;

                unix_state_lock(peer);
                WRITE_ONCE(peer->sk_shutdown, peer->sk_shutdown | peer_mode);
                unix_state_unlock(peer);
                peer->sk_state_change(peer);

                if (peer_mode == SHUTDOWN_MASK)
                        sk_wake_async(peer, SOCK_WAKE_WAITD, POLL_HUP);
                else if (peer_mode & RCV_SHUTDOWN)
                        sk_wake_async(peer, SOCK_WAKE_WAITD, POLL_IN);
        }

        /* Drop the reference taken under the state lock above. */
        sock_put(peer);

        return 0;
}

long unix_inq_len(struct sock *sk)
{
        struct sk_buff *skb;
        long amount = 0;

        if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
                return -EINVAL;

        spin_lock(&sk->sk_receive_queue.lock);
        if (sk->sk_type == SOCK_STREAM ||
            sk->sk_type == SOCK_SEQPACKET) {
                skb_queue_walk(&sk->sk_receive_queue, skb)
                        amount += unix_skb_len(skb);
        } else {
                skb = skb_peek(&sk->sk_receive_queue);
                if (skb)
                        amount = skb->len;
        }
        spin_unlock(&sk->sk_receive_queue.lock);

        return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);

/* Outgoing queue size of @sk, as reported by sk_wmem_alloc_get(). */
long unix_outq_len(struct sock *sk)
{
        long pending = sk_wmem_alloc_get(sk);

        return pending;
}
EXPORT_SYMBOL_GPL(unix_outq_len);

/*
 * Open the filesystem object a socket is bound to and return a new
 * O_CLOEXEC file descriptor referring to it (opened with O_PATH).
 *
 * Returns -EPERM without CAP_NET_ADMIN in the socket's user namespace,
 * -ENOENT when the socket has no address or no path dentry, or the error
 * from descriptor allocation / dentry_open().
 */
static int unix_open_file(struct sock *sk)
{
        struct path path;
        struct file *filp;
        int fd;

        if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        if (!smp_load_acquire(&unix_sk(sk)->addr))
                return -ENOENT;

        path = unix_sk(sk)->path;
        if (!path.dentry)
                return -ENOENT;

        /* Hold the path for the duration of the open. */
        path_get(&path);

        fd = get_unused_fd_flags(O_CLOEXEC);
        if (fd >= 0) {
                filp = dentry_open(&path, O_PATH, current_cred());
                if (IS_ERR(filp)) {
                        put_unused_fd(fd);
                        fd = PTR_ERR(filp);
                } else {
                        fd_install(fd, filp);
                }
        }

        path_put(&path);

        return fd;
}

static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        struct sock *sk = sock->sk;
        long amount = 0;
        int err;

        switch (cmd) {
        case SIOCOUTQ:
                amount = unix_outq_len(sk);
                err = put_user(amount, (int __user *)arg);
                break;
        case SIOCINQ:
                amount = unix_inq_len(sk);
                if (amount < 0)
                        err = amount;
                else
                        err = put_user(amount, (int __user *)arg);
                break;
        case SIOCUNIXFILE:
                err = unix_open_file(sk);
                break;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
        case SIOCATMARK:
                {
                        struct sk_buff *skb;
                        int answ = 0;

                        skb = skb_peek(&sk->sk_receive_queue);
                        if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
                                answ = 1;
                        err = put_user(answ, (int __user *)arg);
                }
                break;
#endif
        default:
                err = -ENOIOCTLCMD;
                break;
        }
        return err;
}

#ifdef CONFIG_COMPAT
/* Compat ioctl(): convert @arg with compat_ptr() and reuse unix_ioctl(). */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        unsigned long native_arg = (unsigned long)compat_ptr(arg);

        return unix_ioctl(sock, cmd, native_arg);
}
#endif

/*
 * poll() handler that builds the event mask from the socket's error,
 * shutdown, receive-queue, OOB and connection state, plus writability.
 */
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
        struct sock *sk = sock->sk;
        unsigned char sk_state;
        __poll_t events = 0;
        u8 shut;

        sock_poll_wait(file, sock, wait);
        shut = READ_ONCE(sk->sk_shutdown);
        sk_state = READ_ONCE(sk->sk_state);

        /* Exceptional conditions. */
        if (READ_ONCE(sk->sk_err))
                events |= EPOLLERR;
        if (shut == SHUTDOWN_MASK)
                events |= EPOLLHUP;
        if (shut & RCV_SHUTDOWN)
                events |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

        /* Readability. */
        if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
                events |= EPOLLIN | EPOLLRDNORM;
        if (sk_is_readable(sk))
                events |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
        if (READ_ONCE(unix_sk(sk)->oob_skb))
                events |= EPOLLPRI;
#endif

        /* A connection-based socket in TCP_CLOSE state reports hang-up. */
        if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
            sk_state == TCP_CLOSE)
                events |= EPOLLHUP;

        /*
         * Writable is also reported when the other side has shut down the
         * connection. This prevents stuck sockets.
         */
        if (unix_writable(sk, sk_state))
                events |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

        return events;
}

/*
 * poll() handler with datagram write semantics.
 *
 * The read side mirrors unix_poll().  The write side is evaluated only when
 * the caller asked for write events; the socket is then reported writable
 * unless its peer is not connected back to it, the peer's receive queue is
 * full, and unix_dgram_peer_wake_me() returns non-zero.
 */
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
                                    poll_table *wait)
{
        struct sock *sk = sock->sk, *peer;
        unsigned int can_write;
        unsigned char sk_state;
        __poll_t events = 0;
        u8 shut;

        sock_poll_wait(file, sock, wait);
        shut = READ_ONCE(sk->sk_shutdown);
        sk_state = READ_ONCE(sk->sk_state);

        /* Exceptional conditions. */
        if (READ_ONCE(sk->sk_err) ||
            !skb_queue_empty_lockless(&sk->sk_error_queue)) {
                events |= EPOLLERR;
                if (sock_flag(sk, SOCK_SELECT_ERR_QUEUE))
                        events |= EPOLLPRI;
        }

        if (shut & RCV_SHUTDOWN)
                events |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
        if (shut == SHUTDOWN_MASK)
                events |= EPOLLHUP;

        /* Readability. */
        if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
                events |= EPOLLIN | EPOLLRDNORM;
        if (sk_is_readable(sk))
                events |= EPOLLIN | EPOLLRDNORM;

        /* A seqpacket socket in TCP_CLOSE state reports hang-up. */
        if (sk->sk_type == SOCK_SEQPACKET && sk_state == TCP_CLOSE)
                events |= EPOLLHUP;

        /* No write status requested, avoid expensive OUT tests. */
        if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
                return events;

        can_write = unix_writable(sk, sk_state);
        if (can_write) {
                unix_state_lock(sk);

                peer = unix_peer(sk);
                if (peer && unix_peer(peer) != sk &&
                    unix_recvq_full_lockless(peer) &&
                    unix_dgram_peer_wake_me(sk, peer))
                        can_write = 0;

                unix_state_unlock(sk);
        }

        if (!can_write)
                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
        else
                events |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

        return events;
}

#ifdef CONFIG_PROC_FS

/*
 * The seq_file position packs two values into one word: the low
 * BUCKET_SPACE bits hold an offset within a hash bucket and the bits above
 * them hold the bucket index.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Extract the bucket index / in-bucket offset from a packed position. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
/* Build a packed position from bucket index b and in-bucket offset o. */
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))

/*
 * Return the socket at the 1-based offset encoded in *@pos within the
 * bucket encoded in *@pos, or NULL when the bucket holds fewer sockets.
 */
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
        unsigned long wanted = get_offset(*pos);
        unsigned long bucket = get_bucket(*pos);
        unsigned long seen = 0;
        struct sock *sk;

        sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
        while (sk) {
                if (++seen == wanted)
                        return sk;
                sk = sk_next(sk);
        }

        return NULL;
}

/*
 * Find the socket addressed by *@pos, advancing to the first entry of each
 * following bucket until one is found.
 *
 * On success the lock of the bucket containing the returned socket is left
 * held.  Returns NULL, with no lock held, once every bucket has been tried.
 */
static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
        struct net *net = seq_file_net(seq);
        unsigned long bucket = get_bucket(*pos);
        struct sock *sk;

        for (;;) {
                if (bucket >= UNIX_HASH_SIZE)
                        return NULL;

                spin_lock(&net->unx.table.locks[bucket]);

                sk = unix_from_bucket(seq, pos);
                if (sk)
                        return sk;

                spin_unlock(&net->unx.table.locks[bucket]);

                /* Continue with the first entry of the next bucket. */
                bucket++;
                *pos = set_bucket_offset(bucket, 1);
        }
}

/*
 * Step from @sk to the following socket.  When the current bucket is
 * exhausted, its lock is released and the search continues from the first
 * entry of the next bucket.
 */
static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
                                  loff_t *pos)
{
        struct sock *following = sk_next(sk);
        unsigned long bucket;

        if (following)
                return following;

        bucket = get_bucket(*pos);
        spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);

        *pos = set_bucket_offset(bucket + 1, 1);

        return unix_get_first(seq, pos);
}

/*
 * seq_file ->start(): position zero yields SEQ_START_TOKEN; any other
 * position is resolved to a socket.
 */
static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos == 0)
                return SEQ_START_TOKEN;

        return unix_get_first(seq, pos);
}

/*
 * seq_file ->next(): advance the position by one and return the matching
 * socket.  After SEQ_START_TOKEN the lookup starts from the new position;
 * otherwise it continues from the socket @v.
 */
static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        *pos += 1;

        if (v != SEQ_START_TOKEN)
                return unix_get_next(seq, v, pos);

        return unix_get_first(seq, pos);
}

static void unix_seq_stop(struct seq_file *seq, void *v)
{
        struct sock *sk = v;

        if (sk)
                spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
}

/*
 * seq_file ->show(): print the column header for SEQ_START_TOKEN, or one
 * line describing the socket @v.
 *
 * In the path column, an address whose first byte is zero is shown with a
 * leading '@', and every zero byte inside the printed range is replaced by
 * '@'.  Always returns 0.
 */
static int unix_seq_show(struct seq_file *seq, void *v)
{
        struct unix_sock *u;
        struct sock *s;
        int sock_state;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
                         "Inode Path\n");
                return 0;
        }

        s = v;
        u = unix_sk(s);
        unix_state_lock(s);

        /* Derive the socket-state column from sk_socket and sk_state. */
        if (s->sk_socket)
                sock_state = s->sk_state == TCP_ESTABLISHED ?
                             SS_CONNECTED : SS_UNCONNECTED;
        else
                sock_state = s->sk_state == TCP_ESTABLISHED ?
                             SS_CONNECTING : SS_DISCONNECTING;

        seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
                   s,
                   refcount_read(&s->sk_refcnt),
                   0,
                   s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
                   s->sk_type,
                   sock_state,
                   sock_i_ino(s));

        if (u->addr) {        /* under a hash table lock here */
                const char *name = u->addr->name->sun_path;
                int len = u->addr->len -
                          offsetof(struct sockaddr_un, sun_path);
                int i = 0;

                seq_putc(seq, ' ');

                if (name[0]) {
                        len--;
                } else {
                        seq_putc(seq, '@');
                        i = 1;
                }

                while (i < len) {
                        char c = name[i++];

                        seq_putc(seq, c ? c : '@');
                }
        }

        unix_state_unlock(s);
        seq_putc(seq, '\n');

        return 0;
}

/* seq_file iterator callbacks built from the unix_seq_* functions above. */
static const struct seq_operations unix_seq_ops = {
        .start  = unix_seq_start,       /* resolve the starting position */
        .next   = unix_seq_next,        /* advance to the following socket */
        .stop   = unix_seq_stop,        /* release the bucket lock, if held */
        .show   = unix_seq_show,        /* print the header or one socket line */
};

#ifdef CONFIG_BPF_SYSCALL
/* Private seq_file state of the BPF unix socket iterator. */
struct bpf_unix_iter_state {
        struct seq_net_private p;
        unsigned int cur_sk;    /* index of the next batch entry still to be released */
        unsigned int end_sk;    /* number of entries currently stored in batch */
        unsigned int max_sk;    /* capacity of batch, in entries */
        struct sock **batch;    /* sockets held with a reference each */
        bool st_bucket_done;    /* NOTE(review): presumably set once a whole bucket was batched - confirm against users */
};

/* Context handed to the BPF iterator program by unix_prog_seq_show(). */
struct bpf_iter__unix {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        __bpf_md_ptr(struct unix_sock *, unix_sk);
        uid_t uid __aligned(8);
};

/* Build the iterator context for one socket (@unix_sk may be NULL when the
 * walk has ended) and run the attached BPF program on it.
 *
 * Returns the result of bpf_iter_run_prog().
 */
static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
                              struct unix_sock *unix_sk, uid_t uid)
{
        struct bpf_iter__unix iter_ctx;

        iter_ctx.uid = uid;
        iter_ctx.unix_sk = unix_sk;
        iter_ctx.meta = meta;

        /* skip SEQ_START_TOKEN */
        meta->seq_num -= 1;

        return bpf_iter_run_prog(prog, &iter_ctx);
}

/* Pin @start_sk and as many of the sockets chained after it as fit into
 * iter->batch[], then release the hash-bucket lock of @start_sk
 * (presumably taken by the caller's lookup - confirm against
 * unix_get_first()).
 *
 * Returns the total number of sockets seen on the chain, which may be
 * larger than the number actually stored in the batch.
 */
static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
{
        struct bpf_unix_iter_state *iter = seq->private;
        unsigned int seen;
        struct sock *cur;

        /* The first socket is always stored. */
        sock_hold(start_sk);
        iter->batch[iter->end_sk] = start_sk;
        iter->end_sk++;
        seen = 1;

        cur = sk_next(start_sk);
        while (cur) {
                /* Store only while there is room, but keep counting. */
                if (iter->end_sk < iter->max_sk) {
                        sock_hold(cur);
                        iter->batch[iter->end_sk] = cur;
                        iter->end_sk++;
                }

                seen++;
                cur = sk_next(cur);
        }

        spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);

        return seen;
}

/* Drop the reference on every socket of the batch that has not been
 * consumed yet, i.e. entries cur_sk .. end_sk - 1.
 */
static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
{
        for (; iter->cur_sk < iter->end_sk; iter->cur_sk++)
                sock_put(iter->batch[iter->cur_sk]);
}

/* Replace iter->batch with a fresh array able to hold @new_batch_sz sockets.
 *
 * Sockets still held in the old batch are released and the old array is
 * freed; the new array starts out with undefined contents.  On allocation
 * failure the old batch is left untouched.
 *
 * Returns 0 on success or -ENOMEM.
 */
static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
                                       unsigned int new_batch_sz)
{
        struct sock **new_batch;

        /* kvmalloc_array() checks the element-count multiplication for
         * overflow instead of silently allocating a too-small buffer.
         */
        new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
                                   GFP_USER | __GFP_NOWARN);
        if (!new_batch)
                return -ENOMEM;

        bpf_iter_unix_put_batch(iter);
        kvfree(iter->batch);
        iter->batch = new_batch;
        iter->max_sk = new_batch_sz;

        return 0;
}

/* Collect the next batch of sockets starting at *pos.
 *
 * If the previous batch covered its whole bucket, *pos is first advanced to
 * the start of the following bucket.  When the batch array turns out to be
 * too small, it is grown once (to 1.5 times the number of sockets seen) and
 * the collection is retried; if growing fails, or the retry is still short,
 * the partial batch is used as is.
 *
 * Returns the first socket of the batch, or NULL when there is nothing left.
 */
static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
                                        loff_t *pos)
{
        struct bpf_unix_iter_state *iter = seq->private;
        bool resized = false;
        unsigned int expected;
        struct sock *first;

        if (iter->st_bucket_done)
                *pos = set_bucket_offset(get_bucket(*pos) + 1, 1);

        for (;;) {
                /* Get a new batch */
                iter->cur_sk = 0;
                iter->end_sk = 0;

                first = unix_get_first(seq, pos);
                if (!first)
                        return NULL; /* Done */

                expected = bpf_iter_unix_hold_batch(seq, first);
                if (iter->end_sk == expected) {
                        iter->st_bucket_done = true;
                        return first;
                }

                /* Only one resize attempt; otherwise live with a short batch. */
                if (resized ||
                    bpf_iter_unix_realloc_batch(iter, expected * 3 / 2))
                        return first;

                resized = true;
        }
}

/* seq_file ->start(): hand out the start token at position 0, otherwise
 * fetch a batch of sockets for the current position.
 */
static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
{
        /* bpf iter does not support lseek, so it always
         * continue from where it was stop()-ped.
         */
        if (*pos)
                return bpf_iter_unix_batch(seq, pos);

        return SEQ_START_TOKEN;
}

/* seq_file ->next(): release the socket that was just shown and return the
 * following one, fetching a new batch once the current one is used up.
 */
static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct bpf_unix_iter_state *iter = seq->private;

        /* Whenever seq_next() is called, the iter->cur_sk is
         * done with seq_show(), so advance to the next sk in
         * the batch.
         */
        if (iter->cur_sk < iter->end_sk) {
                sock_put(iter->batch[iter->cur_sk]);
                iter->cur_sk++;
        }

        *pos += 1;

        if (iter->cur_sk >= iter->end_sk)
                return bpf_iter_unix_batch(seq, pos);

        return iter->batch[iter->cur_sk];
}

/* seq_file ->show(): run the BPF program on one socket while holding its
 * socket lock.
 *
 * Returns 0 for the start token, SEQ_SKIP for a socket that has been
 * unhashed in the meantime, otherwise the result of the BPF program.
 */
static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
{
        struct sock *sk = v;
        bool slow;
        int ret;

        if (v == SEQ_START_TOKEN)
                return 0;

        slow = lock_sock_fast(sk);

        if (unlikely(sk_unhashed(sk))) {
                ret = SEQ_SKIP;
        } else {
                struct bpf_iter_meta meta;
                struct bpf_prog *prog;
                uid_t uid;

                uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
                meta.seq = seq;
                prog = bpf_iter_get_info(&meta, false);
                ret = unix_prog_seq_show(prog, &meta, v, uid);
        }

        unlock_sock_fast(sk, slow);
        return ret;
}

/* seq_file ->stop(): when the walk has ended (@v is NULL), give the BPF
 * program one last call with a NULL socket; in every case release the
 * sockets still held in the batch.
 */
static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
{
        struct bpf_unix_iter_state *iter = seq->private;

        if (!v) {
                struct bpf_iter_meta meta;
                struct bpf_prog *prog;

                meta.seq = seq;
                prog = bpf_iter_get_info(&meta, true);
                if (prog)
                        (void)unix_prog_seq_show(prog, &meta, v, 0);
        }

        if (iter->cur_sk >= iter->end_sk)
                return;

        bpf_iter_unix_put_batch(iter);
}

/* seq_file callbacks of the BPF "unix" iterator; referenced from
 * unix_seq_info.
 */
static const struct seq_operations bpf_iter_unix_seq_ops = {
        .start        = bpf_iter_unix_seq_start,
        .next        = bpf_iter_unix_seq_next,
        .stop        = bpf_iter_unix_seq_stop,
        .show        = bpf_iter_unix_seq_show,
};
#endif
#endif

/* Protocol family descriptor for PF_UNIX; registered with sock_register()
 * in af_unix_init().
 */
static const struct net_proto_family unix_family_ops = {
        .family = PF_UNIX,
        .create = unix_create,
        .owner        = THIS_MODULE,
};


/* Per-network-namespace setup for AF_UNIX: registers the sysctl table and
 * (with CONFIG_PROC_FS) the "unix" procfs entry, then allocates and
 * initialises the UNIX_HASH_SIZE-entry lock and bucket arrays in
 * net->unx.table.
 *
 * Returns 0 on success.  Every failure path undoes the steps already taken
 * and returns -ENOMEM.
 */
static int __net_init unix_net_init(struct net *net)
{
        int i;

        net->unx.sysctl_max_dgram_qlen = 10;
        if (unix_sysctl_register(net))
                goto out;

#ifdef CONFIG_PROC_FS
        if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
                             sizeof(struct seq_net_private)))
                goto err_sysctl;
#endif

        net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
                                              sizeof(spinlock_t), GFP_KERNEL);
        if (!net->unx.table.locks)
                goto err_proc;

        net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
                                                sizeof(struct hlist_head),
                                                GFP_KERNEL);
        if (!net->unx.table.buckets)
                goto free_locks;

        /* One spinlock per hash bucket. */
        for (i = 0; i < UNIX_HASH_SIZE; i++) {
                spin_lock_init(&net->unx.table.locks[i]);
                lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
                INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
        }

        return 0;

        /* Unwind in reverse order of setup. */
free_locks:
        kvfree(net->unx.table.locks);
err_proc:
#ifdef CONFIG_PROC_FS
        remove_proc_entry("unix", net->proc_net);
err_sysctl:
#endif
        unix_sysctl_unregister(net);
out:
        return -ENOMEM;
}

/* Per-network-namespace teardown: frees the hash table arrays allocated by
 * unix_net_init(), unregisters the sysctl table and removes the "unix"
 * procfs entry.
 */
static void __net_exit unix_net_exit(struct net *net)
{
        kvfree(net->unx.table.buckets);
        kvfree(net->unx.table.locks);
        unix_sysctl_unregister(net);
        remove_proc_entry("unix", net->proc_net);
}

/* Per-netns init/exit hooks; registered with register_pernet_subsys() in
 * af_unix_init().
 */
static struct pernet_operations unix_net_ops = {
        .init = unix_net_init,
        .exit = unix_net_exit,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
                     struct unix_sock *unix_sk, uid_t uid)

/* Initial capacity (in sockets) of the batch array allocated by
 * bpf_iter_init_unix().
 */
#define INIT_BATCH_SZ 16

/* Initialise the private state of a new BPF "unix" iterator: set up the
 * seq_net part, then allocate the initial socket batch.  The seq_net part
 * is torn down again if the batch cannot be allocated.
 *
 * Returns 0 on success or a negative errno.
 */
static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
{
        struct bpf_unix_iter_state *iter = priv_data;
        int ret;

        ret = bpf_iter_init_seq_net(priv_data, aux);
        if (!ret) {
                ret = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
                if (ret)
                        bpf_iter_fini_seq_net(priv_data);
        }

        return ret;
}

/* Counterpart of bpf_iter_init_unix(): tear down the seq_net part and free
 * the socket batch array.
 */
static void bpf_iter_fini_unix(void *priv_data)
{
        struct bpf_unix_iter_state *iter = priv_data;

        bpf_iter_fini_seq_net(priv_data);
        kvfree(iter->batch);
}

/* seq_file description of the BPF "unix" iterator: callbacks, private-state
 * constructor/destructor and private-state size.
 */
static const struct bpf_iter_seq_info unix_seq_info = {
        .seq_ops                = &bpf_iter_unix_seq_ops,
        .init_seq_private        = bpf_iter_init_unix,
        .fini_seq_private        = bpf_iter_fini_unix,
        .seq_priv_size                = sizeof(struct bpf_unix_iter_state),
};

/* Map a BPF helper id to its prototype for programs attached to the "unix"
 * iterator.  Only the setsockopt and getsockopt helpers are provided here;
 * every other id yields NULL.
 */
static const struct bpf_func_proto *
bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
                             const struct bpf_prog *prog)
{
        if (func_id == BPF_FUNC_setsockopt)
                return &bpf_sk_setsockopt_proto;

        if (func_id == BPF_FUNC_getsockopt)
                return &bpf_sk_getsockopt_proto;

        return NULL;
}

/* Registration record of the BPF iterator target "unix".  The btf_id of the
 * single context argument (unix_sk) is filled in at run time by
 * bpf_iter_register().
 */
static struct bpf_iter_reg unix_reg_info = {
        .target                        = "unix",
        .ctx_arg_info_size        = 1,
        .ctx_arg_info                = {
                { offsetof(struct bpf_iter__unix, unix_sk),
                  PTR_TO_BTF_ID_OR_NULL },
        },
        .get_func_proto         = bpf_iter_unix_get_func_proto,
        .seq_info                = &unix_seq_info,
};

/* Fill in the BTF id of the unix_sk context argument and register the
 * "unix" BPF iterator target.  A registration failure is only logged.
 */
static void __init bpf_iter_register(void)
{
        int err;

        unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];

        err = bpf_iter_reg_target(&unix_reg_info);
        if (err)
                pr_warn("Warning: could not register bpf iterator unix\n");
}
#endif

/* Boot-time initialisation of AF_UNIX.
 *
 * Initialises the BSD socket hash table, registers the datagram and stream
 * protos, the PF_UNIX socket family and the per-netns operations, and
 * finally sets up the BPF proto and (when configured) the BPF iterator.
 *
 * Returns 0 on success.  On failure everything registered so far is
 * unregistered again and the error of the failing step is returned.
 */
static int __init af_unix_init(void)
{
        int i, rc;

        BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));

        for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
                spin_lock_init(&bsd_socket_locks[i]);
                INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
        }

        rc = proto_register(&unix_dgram_proto, 1);
        if (rc != 0) {
                pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
                goto out;
        }

        rc = proto_register(&unix_stream_proto, 1);
        if (rc != 0) {
                pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
                goto err_dgram_proto;
        }

        /* Do not report success when the family or the per-netns state
         * could not be registered.
         */
        rc = sock_register(&unix_family_ops);
        if (rc != 0)
                goto err_stream_proto;

        rc = register_pernet_subsys(&unix_net_ops);
        if (rc != 0)
                goto err_sock;

        unix_bpf_build_proto();

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
        bpf_iter_register();
#endif

        return 0;

err_sock:
        sock_unregister(PF_UNIX);
err_stream_proto:
        proto_unregister(&unix_stream_proto);
err_dgram_proto:
        proto_unregister(&unix_dgram_proto);
out:
        return rc;
}

/* Later than subsys_initcall() because we depend on stuff initialised there */
fs_initcall(af_unix_init);



























































































    2 



    2 

    2 
















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2004 IBM Corporation
 *
 *  Author: Serge Hallyn <serue@us.ibm.com>
 */

#include <linux/export.h>
#include <linux/uts.h>
#include <linux/utsname.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
#include <linux/sched/task.h>

static struct kmem_cache *uts_ns_cache __ro_after_init;

/* Charge one UTS namespace to the current effective uid in @ns.
 * Returns the result of inc_ucount(); clone_uts_ns() treats NULL as
 * "limit reached".
 */
static struct ucounts *inc_uts_namespaces(struct user_namespace *ns)
{
        return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES);
}

/* Undo the charge taken by inc_uts_namespaces(). */
static void dec_uts_namespaces(struct ucounts *ucounts)
{
        dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES);
}

static struct uts_namespace *create_uts_ns(void)
{
        struct uts_namespace *uts_ns;

        uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL);
        if (uts_ns)
                refcount_set(&uts_ns->ns.count, 1);
        return uts_ns;
}

/*
 * Clone a new ns copying an original utsname, setting refcount to 1
 * @user_ns: user namespace that is charged for and will own the new ns
 * @old_ns: namespace to clone
 *
 * Returns the new ns on success.  On failure returns ERR_PTR(-ENOSPC) when
 * the ucount charge is refused, ERR_PTR(-ENOMEM) when the allocation fails,
 * or the error reported by ns_alloc_inum().
 */
static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
                                          struct uts_namespace *old_ns)
{
        struct uts_namespace *new_ns;
        struct ucounts *charge;
        int err;

        charge = inc_uts_namespaces(user_ns);
        if (!charge) {
                err = -ENOSPC;
                goto fail;
        }

        new_ns = create_uts_ns();
        if (!new_ns) {
                err = -ENOMEM;
                goto fail_dec;
        }

        err = ns_alloc_inum(&new_ns->ns);
        if (err)
                goto fail_free;

        new_ns->ucounts = charge;
        new_ns->ns.ops = &utsns_operations;

        /* Copy the name under uts_sem so a concurrent update is not seen
         * half-way.
         */
        down_read(&uts_sem);
        memcpy(&new_ns->name, &old_ns->name, sizeof(new_ns->name));
        new_ns->user_ns = get_user_ns(user_ns);
        up_read(&uts_sem);
        return new_ns;

fail_free:
        kmem_cache_free(uts_ns_cache, new_ns);
fail_dec:
        dec_uts_namespaces(charge);
fail:
        return ERR_PTR(err);
}

/*
 * Copy task tsk's utsname namespace, or clone it if flags
 * specifies CLONE_NEWUTS.  In latter case, changes to the
 * utsname of this process won't be seen by parent, and vice
 * versa.
 *
 * Without CLONE_NEWUTS the result is @old_ns with an extra reference.
 * With CLONE_NEWUTS the result is whatever clone_uts_ns() returns (possibly
 * an ERR_PTR) and @old_ns keeps its original reference count.
 */
struct uts_namespace *copy_utsname(unsigned long flags,
        struct user_namespace *user_ns, struct uts_namespace *old_ns)
{
        BUG_ON(!old_ns);

        /* Hold the old namespace for the duration of the clone. */
        get_uts_ns(old_ns);

        if (flags & CLONE_NEWUTS) {
                struct uts_namespace *cloned;

                cloned = clone_uts_ns(user_ns, old_ns);
                put_uts_ns(old_ns);
                return cloned;
        }

        return old_ns;
}

/* Release everything clone_uts_ns() attached to @ns - the ucount charge,
 * the user namespace reference and the inode number - and free the object.
 * NOTE(review): presumably invoked when the last reference is dropped;
 * confirm against put_uts_ns().
 */
void free_uts_ns(struct uts_namespace *ns)
{
        dec_uts_namespaces(ns->ucounts);
        put_user_ns(ns->user_ns);
        ns_free_inum(&ns->ns);
        kmem_cache_free(uts_ns_cache, ns);
}

/* Convert an embedded ns_common back to its enclosing uts_namespace. */
static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
{
        return container_of(ns, struct uts_namespace, ns);
}

/* proc_ns ->get(): take a reference on @task's UTS namespace.
 *
 * task->nsproxy is sampled under task_lock().  Returns the namespace's
 * ns_common, or NULL when the task has no nsproxy.
 */
static struct ns_common *utsns_get(struct task_struct *task)
{
        struct uts_namespace *uts = NULL;
        struct nsproxy *proxy;

        task_lock(task);
        proxy = task->nsproxy;
        if (proxy != NULL) {
                uts = proxy->uts_ns;
                get_uts_ns(uts);
        }
        task_unlock(task);

        if (!uts)
                return NULL;

        return &uts->ns;
}

/* proc_ns ->put(): drop a reference on the UTS namespace embedding @ns. */
static void utsns_put(struct ns_common *ns)
{
        put_uts_ns(to_uts_ns(ns));
}

/* proc_ns ->install(): make @new the UTS namespace of @nsset's nsproxy.
 *
 * The caller needs CAP_SYS_ADMIN both in the user namespace owning the
 * target UTS namespace and in the user namespace of @nsset's credentials;
 * otherwise -EPERM is returned.  On success the new namespace gains a
 * reference, the previous one loses one, and 0 is returned.
 */
static int utsns_install(struct nsset *nsset, struct ns_common *new)
{
        struct uts_namespace *target = to_uts_ns(new);
        struct nsproxy *proxy = nsset->nsproxy;

        if (!ns_capable(target->user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        get_uts_ns(target);
        put_uts_ns(proxy->uts_ns);
        proxy->uts_ns = target;

        return 0;
}

/* proc_ns ->owner(): return the user namespace recorded in the UTS
 * namespace embedding @ns.  No reference is taken here.
 */
static struct user_namespace *utsns_owner(struct ns_common *ns)
{
        return to_uts_ns(ns)->user_ns;
}

/* Namespace operations for the "uts" namespace type; installed into
 * ns->ns.ops by clone_uts_ns().
 */
const struct proc_ns_operations utsns_operations = {
        .name                = "uts",
        .type                = CLONE_NEWUTS,
        .get                = utsns_get,
        .put                = utsns_put,
        .install        = utsns_install,
        .owner                = utsns_owner,
};

/* Create the slab cache used by create_uts_ns().  Only the 'name' member of
 * struct uts_namespace is declared as the cache's usercopy region.
 */
void __init uts_ns_init(void)
{
        uts_ns_cache = kmem_cache_create_usercopy(
                        "uts_namespace", sizeof(struct uts_namespace), 0,
                        SLAB_PANIC|SLAB_ACCOUNT,
                        offsetof(struct uts_namespace, name),
                        sizeof_field(struct uts_namespace, name),
                        NULL);
}
































































































    1 
    1 






























































































































































































































































































































































































































































































































































































































































































































































    1 



    1 





























































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
// SPDX-License-Identifier: GPL-2.0-only
/*
 * This is a module which is used for logging packets to userspace via
 * nfnetlink.
 *
 * (C) 2005 by Harald Welte <laforge@netfilter.org>
 * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
 *
 * Based on the old ipv4-only ipt_ULOG.c:
 * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/netfilter_bridge.h>
#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_log.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/spinlock.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/netfilter/nf_log.h>
#include <net/netns/generic.h>

#include <linux/atomic.h>
#include <linux/refcount.h>


#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
#include "../bridge/br_private.h"
#endif

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
#endif

#define NFULNL_COPY_DISABLED        0xff
#define NFULNL_NLBUFSIZ_DEFAULT        NLMSG_GOODSIZE
#define NFULNL_TIMEOUT_DEFAULT         100        /* every second */
#define NFULNL_QTHRESH_DEFAULT         100        /* 100 packets */
/* max packet size is limited by 16-bit struct nfattr nfa_len field */
#define NFULNL_COPY_RANGE_MAX        (0xFFFF - NLA_HDRLEN)

/* Rate-limited printk(): the message is emitted only when net_ratelimit()
 * allows it.  Expands to exactly one statement with no trailing semicolon,
 * so "PRINTR(...);" is also safe as the body of an unbraced if/else.
 */
#define PRINTR(x, args...)        do { if (net_ratelimit()) \
                                     printk(x, ## args); } while (0)

/* One logging instance, identified by group_num within a network
 * namespace.  Reference counted through 'use' and freed via RCU.
 */
struct nfulnl_instance {
        struct hlist_node hlist;        /* global list of instances */
        spinlock_t lock;
        refcount_t use;                        /* use count */

        unsigned int qlen;                /* number of nlmsgs in skb */
        struct sk_buff *skb;                /* pre-allocated skb */
        struct timer_list timer;
        struct net *net;
        netns_tracker ns_tracker;
        struct user_namespace *peer_user_ns;        /* User namespace of the peer process */
        u32 peer_portid;                /* PORTID of the peer process */

        /* configurable parameters */
        unsigned int flushtimeout;        /* timeout until queue flush */
        unsigned int nlbufsiz;                /* netlink buffer allocation size */
        unsigned int qthreshold;        /* threshold of the queue */
        u_int32_t copy_range;
        u_int32_t seq;                        /* instance-local sequential counter */
        u_int16_t group_num;                /* number of this queue */
        u_int16_t flags;
        u_int8_t copy_mode;
        struct rcu_head rcu;
};

#define INSTANCE_BUCKETS        16

static unsigned int nfnl_log_net_id __read_mostly;

/* Per-network-namespace state, obtained through nfnl_log_pernet(). */
struct nfnl_log_net {
        spinlock_t instances_lock;        /* held while instances are added or removed */
        struct hlist_head instance_table[INSTANCE_BUCKETS];        /* hashed by instance_hashfn() */
        atomic_t global_seq;
};

/* Return the nfnetlink_log state of @net (net_generic slot nfnl_log_net_id). */
static struct nfnl_log_net *nfnl_log_pernet(struct net *net)
{
        return net_generic(net, nfnl_log_net_id);
}

/* Map a group number to its bucket in instance_table[]: the low byte of
 * @group_num modulo INSTANCE_BUCKETS.
 */
static inline u_int8_t instance_hashfn(u_int16_t group_num)
{
        const u_int16_t low_byte = group_num & 0xff;

        return low_byte % INSTANCE_BUCKETS;
}

/* Find the instance for @group_num in @log's hash table.  The bucket is
 * walked with the RCU list iterator and no reference is taken on the
 * result.  Returns NULL when no instance matches.
 */
static struct nfulnl_instance *
__instance_lookup(const struct nfnl_log_net *log, u16 group_num)
{
        const struct hlist_head *bucket =
                &log->instance_table[instance_hashfn(group_num)];
        struct nfulnl_instance *cur;

        hlist_for_each_entry_rcu(cur, bucket, hlist)
                if (cur->group_num == group_num)
                        return cur;

        return NULL;
}

/* Take an additional reference on @inst. */
static inline void
instance_get(struct nfulnl_instance *inst)
{
        refcount_inc(&inst->use);
}

/* Look up the instance for @group_num and take a reference on it.
 * Returns NULL when there is no such instance or its use count has
 * already dropped to zero.  Performs no RCU locking of its own;
 * instance_lookup_get() is the variant that wraps this in
 * rcu_read_lock().
 */
static struct nfulnl_instance *
instance_lookup_get_rcu(const struct nfnl_log_net *log, u16 group_num)
{
        struct nfulnl_instance *found = __instance_lookup(log, group_num);

        if (!found)
                return NULL;

        if (!refcount_inc_not_zero(&found->use))
                return NULL;

        return found;
}

/* Same as instance_lookup_get_rcu(), but takes the RCU read lock around
 * the lookup itself.
 */
static struct nfulnl_instance *
instance_lookup_get(const struct nfnl_log_net *log, u16 group_num)
{
        struct nfulnl_instance *inst;

        rcu_read_lock();
        inst = instance_lookup_get_rcu(log, group_num);
        rcu_read_unlock();

        return inst;
}

/* RCU callback queued by instance_put(): drops the netns reference, frees
 * the instance and releases the module reference taken in
 * instance_create().
 */
static void nfulnl_instance_free_rcu(struct rcu_head *head)
{
        struct nfulnl_instance *inst =
                container_of(head, struct nfulnl_instance, rcu);

        put_net_track(inst->net, &inst->ns_tracker);
        kfree(inst);
        module_put(THIS_MODULE);
}

/* Drop one reference on @inst (NULL is accepted and ignored).  When the
 * last reference goes away the instance is freed via an RCU callback.
 */
static void
instance_put(struct nfulnl_instance *inst)
{
        if (!inst)
                return;

        if (refcount_dec_and_test(&inst->use))
                call_rcu(&inst->rcu, nfulnl_instance_free_rcu);
}

static void nfulnl_timer(struct timer_list *t);

/* Create and publish a new logging instance for @group_num in @net.
 *
 * Runs entirely under log->instances_lock.  The instance takes a module
 * reference and a tracked netns reference, gets the default parameters and
 * is added to the hash table only after it is fully initialised.
 *
 * Returns the new instance with a use count of 2, or ERR_PTR(-EEXIST) if
 * the group already has an instance, ERR_PTR(-ENOMEM) on allocation
 * failure, ERR_PTR(-EAGAIN) if the module reference cannot be taken.
 */
static struct nfulnl_instance *
instance_create(struct net *net, u_int16_t group_num,
                u32 portid, struct user_namespace *user_ns)
{
        struct nfulnl_instance *inst;
        struct nfnl_log_net *log = nfnl_log_pernet(net);
        int err;

        spin_lock_bh(&log->instances_lock);
        if (__instance_lookup(log, group_num)) {
                err = -EEXIST;
                goto out_unlock;
        }

        /* GFP_ATOMIC: allocating with the spinlock held */
        inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
        if (!inst) {
                err = -ENOMEM;
                goto out_unlock;
        }

        if (!try_module_get(THIS_MODULE)) {
                kfree(inst);
                err = -EAGAIN;
                goto out_unlock;
        }

        INIT_HLIST_NODE(&inst->hlist);
        spin_lock_init(&inst->lock);
        /* needs to be two, since we _put() after creation */
        refcount_set(&inst->use, 2);

        timer_setup(&inst->timer, nfulnl_timer, 0);

        inst->net = get_net_track(net, &inst->ns_tracker, GFP_ATOMIC);
        inst->peer_user_ns = user_ns;
        inst->peer_portid = portid;
        inst->group_num = group_num;

        inst->qthreshold         = NFULNL_QTHRESH_DEFAULT;
        inst->flushtimeout         = NFULNL_TIMEOUT_DEFAULT;
        inst->nlbufsiz                 = NFULNL_NLBUFSIZ_DEFAULT;
        inst->copy_mode         = NFULNL_COPY_PACKET;
        inst->copy_range         = NFULNL_COPY_RANGE_MAX;

        /* publish last, once every field is set up */
        hlist_add_head_rcu(&inst->hlist,
                       &log->instance_table[instance_hashfn(group_num)]);


        spin_unlock_bh(&log->instances_lock);

        return inst;

out_unlock:
        spin_unlock_bh(&log->instances_lock);
        return ERR_PTR(err);
}

static void __nfulnl_flush(struct nfulnl_instance *inst);

/* called with BH disabled */
/* Unlink @inst from the hash table, disable it for further logging, send
 * whatever is still queued, and drop one reference.  instance_destroy()
 * calls this with log->instances_lock held.
 */
static void
__instance_destroy(struct nfulnl_instance *inst)
{
        /* first pull it out of the global list */
        hlist_del_rcu(&inst->hlist);

        /* then flush all pending packets from skb */

        spin_lock(&inst->lock);

        /* lockless readers wont be able to use us */
        inst->copy_mode = NFULNL_COPY_DISABLED;

        if (inst->skb)
                __nfulnl_flush(inst);
        spin_unlock(&inst->lock);

        /* and finally put the refcount */
        instance_put(inst);
}

/* Locked wrapper: run __instance_destroy() under log->instances_lock with
 * bottom halves disabled.
 */
static inline void
instance_destroy(struct nfnl_log_net *log,
                 struct nfulnl_instance *inst)
{
        spin_lock_bh(&log->instances_lock);
        __instance_destroy(inst);
        spin_unlock_bh(&log->instances_lock);
}

/* Set the copy mode of @inst, under inst->lock.
 *
 * NFULNL_COPY_NONE and NFULNL_COPY_META set the copy range to 0.
 * NFULNL_COPY_PACKET uses @range, where 0 means "maximum" and larger values
 * are clamped to NFULNL_COPY_RANGE_MAX.
 *
 * Returns 0, or -EINVAL for an unknown @mode (instance left unchanged).
 */
static int
nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode,
                  unsigned int range)
{
        int status = 0;

        spin_lock_bh(&inst->lock);

        if (mode == NFULNL_COPY_NONE || mode == NFULNL_COPY_META) {
                inst->copy_mode = mode;
                inst->copy_range = 0;
        } else if (mode == NFULNL_COPY_PACKET) {
                unsigned int wanted = range;

                if (wanted == 0)
                        wanted = NFULNL_COPY_RANGE_MAX;

                inst->copy_mode = mode;
                inst->copy_range = min_t(unsigned int,
                                         wanted, NFULNL_COPY_RANGE_MAX);
        } else {
                status = -EINVAL;
        }

        spin_unlock_bh(&inst->lock);

        return status;
}

/* Set the netlink buffer size of @inst, under inst->lock.
 *
 * Accepted values lie between NFULNL_NLBUFSIZ_DEFAULT and 131072
 * inclusive.  Returns 0 on success or -ERANGE when @nlbufsiz is outside
 * that range (the instance is left unchanged).
 */
static int
nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz)
{
        int status = 0;

        spin_lock_bh(&inst->lock);
        if (nlbufsiz < NFULNL_NLBUFSIZ_DEFAULT || nlbufsiz > 131072)
                status = -ERANGE;
        else
                inst->nlbufsiz = nlbufsiz;
        spin_unlock_bh(&inst->lock);

        return status;
}

/* Store @timeout as the flush timeout of @inst, under inst->lock.  The
 * value is not validated here.
 */
static void
nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout)
{
        spin_lock_bh(&inst->lock);
        inst->flushtimeout = timeout;
        spin_unlock_bh(&inst->lock);
}

/* Store @qthresh as the queue threshold of @inst, under inst->lock.  The
 * value is not validated here.
 */
static void
nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh)
{
        spin_lock_bh(&inst->lock);
        inst->qthreshold = qthresh;
        spin_unlock_bh(&inst->lock);
}

/* Store @flags in @inst, under inst->lock.  Always returns 0. */
static int
nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags)
{
        spin_lock_bh(&inst->lock);
        inst->flags = flags;
        spin_unlock_bh(&inst->lock);

        return 0;
}

/* Allocate the skb that collects log messages.
 *
 * First tries the larger of @inst_size and @pkt_size, without an
 * allocation-failure warning.  If that fails and was larger than
 * @pkt_size, falls back to just @pkt_size.  @net and @peer_portid are not
 * used by this function.
 *
 * Returns the skb, or NULL when no allocation succeeded.
 */
static struct sk_buff *
nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size,
                 unsigned int pkt_size)
{
        /* alloc skb which should be big enough for a whole multipart
         * message.  WARNING: has to be <= 128k due to slab restrictions */
        unsigned int want = max(inst_size, pkt_size);
        struct sk_buff *skb;

        skb = alloc_skb(want, GFP_ATOMIC | __GFP_NOWARN);
        if (skb)
                return skb;

        /* try to allocate only as much as we need for current
         * packet */
        if (want > pkt_size)
                skb = alloc_skb(pkt_size, GFP_ATOMIC);

        return skb;
}

/* Hand the instance's pending batch skb to the peer and reset the batch.
 *
 * When more than one message is queued, an NLMSG_DONE trailer carrying a
 * struct nfgenmsg-sized payload is appended first.  If the trailer does
 * not fit, a one-time warning is emitted and the skb is freed instead of
 * being sent.  In every case inst->qlen is reset to 0 and inst->skb to
 * NULL before returning.
 *
 * Callers in this file check inst->skb and hold inst->lock.
 */
static void
__nfulnl_send(struct nfulnl_instance *inst)
{
        int deliver = 1;

        if (inst->qlen > 1) {
                struct nlmsghdr *done;

                done = nlmsg_put(inst->skb, 0, 0, NLMSG_DONE,
                                 sizeof(struct nfgenmsg), 0);
                if (WARN_ONCE(!done, "bad nlskb size: %u, tailroom %d\n",
                              inst->skb->len, skb_tailroom(inst->skb))) {
                        /* trailer did not fit: drop the whole batch */
                        kfree_skb(inst->skb);
                        deliver = 0;
                }
        }

        if (deliver)
                nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid);

        inst->qlen = 0;
        inst->skb = NULL;
}

/* Cancel a pending flush timer and send whatever is currently batched.
 *
 * A pending timer owns one reference on the instance (taken with
 * instance_get() when the timer is armed), so that reference is dropped
 * here when del_timer() reports that it deactivated the timer.  The
 * batch is only sent if an skb is actually present.
 */
static void
__nfulnl_flush(struct nfulnl_instance *inst)
{
        /* timer holds a reference */
        if (del_timer(&inst->timer))
                instance_put(inst);
        if (inst->skb)
                __nfulnl_send(inst);
}

/* Flush-timer callback: send the pending batch, if any, under inst->lock
 * and then drop a reference on the instance.  The code that arms the
 * timer takes a reference with instance_get() beforehand.
 */
static void
nfulnl_timer(struct timer_list *t)
{
        /* recover the instance that embeds this timer */
        struct nfulnl_instance *inst = from_timer(inst, t, timer);

        spin_lock_bh(&inst->lock);
        if (inst->skb)
                __nfulnl_send(inst);
        spin_unlock_bh(&inst->lock);
        instance_put(inst);
}

/* Compute how many bytes of netlink attributes nfulnl_put_bridge() may
 * add for @skb: a nested VLAN attribute holding two u16 values when a
 * VLAN tag is present, plus the bytes between the MAC header and the
 * network header when the latter lies behind the former.
 *
 * Returns 0 when the MAC header was never set.
 */
static u32 nfulnl_get_bridge_size(const struct sk_buff *skb)
{
        u32 total = 0;

        if (!skb_mac_header_was_set(skb))
                return 0;

        if (skb_vlan_tag_present(skb))
                total += nla_total_size(0)              /* nested */
                       + nla_total_size(sizeof(u16))    /* id */
                       + nla_total_size(sizeof(u16));   /* tag */

        if (skb->mac_header < skb->network_header)
                total += nla_total_size(skb->network_header - skb->mac_header);

        return total;
}

/* Append the layer-2 attributes of @skb to the instance's batch skb:
 * a nested NFULA_VLAN attribute (TCI and protocol) when a VLAN tag is
 * present, and NFULA_L2HDR with the bytes from the MAC header up to the
 * network header when the network header lies behind the MAC header.
 *
 * Nothing is added when the MAC header was never set.
 * Returns 0 on success and -1 when an attribute could not be appended.
 */
static int nfulnl_put_bridge(struct nfulnl_instance *inst, const struct sk_buff *skb)
{
        struct sk_buff *out = inst->skb;

        if (!skb_mac_header_was_set(skb))
                return 0;

        if (skb_vlan_tag_present(skb)) {
                struct nlattr *vlan = nla_nest_start(out, NFULA_VLAN);

                if (!vlan)
                        return -1;

                if (nla_put_be16(out, NFULA_VLAN_TCI, htons(skb->vlan_tci)))
                        return -1;

                if (nla_put_be16(out, NFULA_VLAN_PROTO, skb->vlan_proto))
                        return -1;

                nla_nest_end(out, vlan);
        }

        if (skb->network_header > skb->mac_header) {
                int l2len = (int)(skb->network_header - skb->mac_header);

                if (nla_put(out, NFULA_L2HDR, l2len, skb_mac_header(skb)))
                        return -1;
        }

        return 0;
}

/* Append one NFULNL_MSG_PACKET netlink message describing @skb to the
 * instance's batch buffer inst->skb.
 *
 * @log:      per-netns state; supplies the global sequence counter
 * @inst:     logging instance whose batch skb receives the message
 * @skb:      the packet being logged
 * @data_len: number of payload bytes to copy from the start of @skb
 *            (0 means no NFULA_PAYLOAD attribute)
 * @pf:       protocol family, stored in the nfnetlink header and used to
 *            select the bridge-specific attributes
 * @hooknum:  netfilter hook number, stored in the packet header attribute
 * @indev:    input device or NULL
 * @outdev:   output device or NULL
 * @prefix:   log prefix string or NULL
 * @plen:     number of bytes of @prefix to emit
 * @nfnl_ct:  conntrack glue hook, used only when @ct is non-NULL
 * @ct:       conntrack entry or NULL
 * @ctinfo:   conntrack state passed to nfnl_ct->build()
 *
 * Returns 0 after fixing up nlmsg_len, or -1 when the header or any
 * attribute could not be appended.
 *
 * NOTE(review): the failure path only prints and returns; attributes
 * already appended to inst->skb are not trimmed here.  Confirm that the
 * caller's size estimate makes this unreachable in practice.
 *
 * This is an inline function, we don't really care about a long
 * list of arguments */
static inline int
__build_packet_message(struct nfnl_log_net *log,
                        struct nfulnl_instance *inst,
                        const struct sk_buff *skb,
                        unsigned int data_len,
                        u_int8_t pf,
                        unsigned int hooknum,
                        const struct net_device *indev,
                        const struct net_device *outdev,
                        const char *prefix, unsigned int plen,
                        const struct nfnl_ct_hook *nfnl_ct,
                        struct nf_conn *ct, enum ip_conntrack_info ctinfo)
{
        struct nfulnl_msg_packet_hdr pmsg;
        struct nlmsghdr *nlh;
        /* remember where this message starts so its length can be
         * computed from the tail movement at the end */
        sk_buff_data_t old_tail = inst->skb->tail;
        struct sock *sk;
        const unsigned char *hwhdrp;

        nlh = nfnl_msg_put(inst->skb, 0, 0,
                           nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET),
                           0, pf, NFNETLINK_V0, htons(inst->group_num));
        if (!nlh)
                return -1;

        /* zero the whole header struct before filling in the two fields
         * that are set here */
        memset(&pmsg, 0, sizeof(pmsg));
        pmsg.hw_protocol        = skb->protocol;
        pmsg.hook                = hooknum;

        if (nla_put(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg))
                goto nla_put_failure;

        if (prefix &&
            nla_put(inst->skb, NFULA_PREFIX, plen, prefix))
                goto nla_put_failure;

        if (indev) {
#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
                if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV,
                                 htonl(indev->ifindex)))
                        goto nla_put_failure;
#else
                if (pf == PF_BRIDGE) {
                        /* Case 1: indev is physical input device, we need to
                         * look for bridge group (when called from
                         * netfilter_bridge) */
                        if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
                                         htonl(indev->ifindex)) ||
                        /* this is the bridge group "brX" */
                        /* rcu_read_lock()ed by nf_hook_thresh or
                         * nf_log_packet.
                         */
                            nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV,
                                         htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
                                goto nla_put_failure;
                } else {
                        int physinif;

                        /* Case 2: indev is bridge group, we need to look for
                         * physical device (when called from ipv4) */
                        if (nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV,
                                         htonl(indev->ifindex)))
                                goto nla_put_failure;

                        physinif = nf_bridge_get_physinif(skb);
                        if (physinif &&
                            nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
                                         htonl(physinif)))
                                goto nla_put_failure;
                }
#endif
        }

        if (outdev) {
#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
                if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV,
                                 htonl(outdev->ifindex)))
                        goto nla_put_failure;
#else
                if (pf == PF_BRIDGE) {
                        /* Case 1: outdev is physical output device, we need to
                         * look for bridge group (when called from
                         * netfilter_bridge) */
                        if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
                                         htonl(outdev->ifindex)) ||
                        /* this is the bridge group "brX" */
                        /* rcu_read_lock()ed by nf_hook_thresh or
                         * nf_log_packet.
                         */
                            nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV,
                                         htonl(br_port_get_rcu(outdev)->br->dev->ifindex)))
                                goto nla_put_failure;
                } else {
                        struct net_device *physoutdev;

                        /* Case 2: outdev is a bridge group, we need to look
                         * for physical device (when called from ipv4) */
                        if (nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV,
                                         htonl(outdev->ifindex)))
                                goto nla_put_failure;

                        physoutdev = nf_bridge_get_physoutdev(skb);
                        if (physoutdev &&
                            nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
                                         htonl(physoutdev->ifindex)))
                                goto nla_put_failure;
                }
#endif
        }

        /* the mark attribute is only emitted for a non-zero mark */
        if (skb->mark &&
            nla_put_be32(inst->skb, NFULA_MARK, htonl(skb->mark)))
                goto nla_put_failure;

        /* hardware address, as parsed out of the link-layer header */
        if (indev && skb->dev &&
            skb_mac_header_was_set(skb) &&
            skb_mac_header_len(skb) != 0) {
                struct nfulnl_msg_packet_hw phw;
                int len;

                memset(&phw, 0, sizeof(phw));
                len = dev_parse_header(skb, phw.hw_addr);
                if (len > 0) {
                        phw.hw_addrlen = htons(len);
                        if (nla_put(inst->skb, NFULA_HWADDR, sizeof(phw), &phw))
                                goto nla_put_failure;
                }
        }

        /* hardware type, header length and the raw link-layer header.
         * NOTE(review): unlike the block above, skb->dev is dereferenced
         * here without a NULL check -- confirm it is always set when
         * indev is non-NULL and the MAC header was set. */
        if (indev && skb_mac_header_was_set(skb)) {
                if (nla_put_be16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) ||
                    nla_put_be16(inst->skb, NFULA_HWLEN,
                                 htons(skb->dev->hard_header_len)))
                        goto nla_put_failure;

                hwhdrp = skb_mac_header(skb);

                /* for SIT devices the header pointer is moved back by
                 * ETH_HLEN before being copied */
                if (skb->dev->type == ARPHRD_SIT)
                        hwhdrp -= ETH_HLEN;

                /* only copy when the (possibly adjusted) pointer still
                 * lies at or after the start of the skb buffer */
                if (hwhdrp >= skb->head &&
                    nla_put(inst->skb, NFULA_HWHEADER,
                            skb->dev->hard_header_len, hwhdrp))
                        goto nla_put_failure;
        }

        /* timestamp is only emitted for hooks up to NF_INET_FORWARD */
        if (hooknum <= NF_INET_FORWARD) {
                struct timespec64 kts = ktime_to_timespec64(skb_tstamp_cond(skb, true));
                struct nfulnl_msg_packet_timestamp ts;
                ts.sec = cpu_to_be64(kts.tv_sec);
                ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);

                if (nla_put(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts))
                        goto nla_put_failure;
        }

        /* UID */
        sk = skb->sk;
        if (sk && sk_fullsock(sk)) {
                /* sk_callback_lock is held only while reading the socket's
                 * file credentials; it is released before the attributes
                 * are appended */
                read_lock_bh(&sk->sk_callback_lock);
                if (sk->sk_socket && sk->sk_socket->file) {
                        struct file *file = sk->sk_socket->file;
                        const struct cred *cred = file->f_cred;
                        /* ids are translated into the listener's user
                         * namespace */
                        struct user_namespace *user_ns = inst->peer_user_ns;
                        __be32 uid = htonl(from_kuid_munged(user_ns, cred->fsuid));
                        __be32 gid = htonl(from_kgid_munged(user_ns, cred->fsgid));
                        read_unlock_bh(&sk->sk_callback_lock);
                        if (nla_put_be32(inst->skb, NFULA_UID, uid) ||
                            nla_put_be32(inst->skb, NFULA_GID, gid))
                                goto nla_put_failure;
                } else
                        read_unlock_bh(&sk->sk_callback_lock);
        }

        /* local sequence number */
        if ((inst->flags & NFULNL_CFG_F_SEQ) &&
            nla_put_be32(inst->skb, NFULA_SEQ, htonl(inst->seq++)))
                goto nla_put_failure;

        /* global sequence number */
        if ((inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) &&
            nla_put_be32(inst->skb, NFULA_SEQ_GLOBAL,
                         htonl(atomic_inc_return(&log->global_seq))))
                goto nla_put_failure;

        /* conntrack attributes are produced by the conntrack glue hook */
        if (ct && nfnl_ct->build(inst->skb, ct, ctinfo,
                                 NFULA_CT, NFULA_CT_INFO) < 0)
                goto nla_put_failure;

        if ((pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) &&
            nfulnl_put_bridge(inst, skb) < 0)
                goto nla_put_failure;

        if (data_len) {
                struct nlattr *nla;
                int size = nla_attr_size(data_len);

                if (skb_tailroom(inst->skb) < nla_total_size(data_len))
                        goto nla_put_failure;

                /* the payload attribute is built by hand: reserve the
                 * padded space, fill in the attribute header, then copy
                 * the packet bytes straight into the attribute data */
                nla = skb_put(inst->skb, nla_total_size(data_len));
                nla->nla_type = NFULA_PAYLOAD;
                nla->nla_len = size;

                if (skb_copy_bits(skb, 0, nla_data(nla), data_len))
                        BUG();
        }

        /* final message length is how far the tail moved */
        nlh->nlmsg_len = inst->skb->tail - old_tail;
        return 0;

nla_put_failure:
        PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n");
        return -1;
}

/* Fallback parameters used by nfulnl_log_packet() when the caller passes
 * no loginfo of type NF_LOG_TYPE_ULOG: group 0, copy length 0xffff and a
 * queue threshold of 1. */
static const struct nf_loginfo default_loginfo = {
        .type = NF_LOG_TYPE_ULOG,
        .u.ulog = {
                .copy_len = 0xffff,
                .group = 0,
                .qthreshold = 1,
        },
};

/* log handler for internal netfilter logging api
 *
 * Queues one log message for @skb on the instance bound to the group
 * named in the loginfo, flushing the instance's batch to userspace when
 * it runs out of room or reaches its queue threshold.
 *
 * @net:     network namespace the packet belongs to
 * @pf:      protocol family of the hook
 * @hooknum: netfilter hook number
 * @skb:     packet to log
 * @in:      input device or NULL
 * @out:     output device or NULL
 * @li_user: caller-supplied log parameters; default_loginfo is used
 *           instead when this is NULL or not of type NF_LOG_TYPE_ULOG
 * @prefix:  log prefix string or NULL
 *
 * Returns nothing.  The packet is silently not logged when no instance
 * exists for the group, when the instance's copy mode is disabled or
 * unknown, or when no skb can be allocated.
 */
static void
nfulnl_log_packet(struct net *net,
                  u_int8_t pf,
                  unsigned int hooknum,
                  const struct sk_buff *skb,
                  const struct net_device *in,
                  const struct net_device *out,
                  const struct nf_loginfo *li_user,
                  const char *prefix)
{
        size_t size;
        unsigned int data_len;
        struct nfulnl_instance *inst;
        const struct nf_loginfo *li;
        unsigned int qthreshold;
        unsigned int plen = 0;
        struct nfnl_log_net *log = nfnl_log_pernet(net);
        const struct nfnl_ct_hook *nfnl_ct = NULL;
        enum ip_conntrack_info ctinfo = 0;
        struct nf_conn *ct = NULL;

        if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
                li = li_user;
        else
                li = &default_loginfo;

        /* takes a reference that is dropped at unlock_and_release */
        inst = instance_lookup_get_rcu(log, li->u.ulog.group);
        if (!inst)
                return;

        /* the prefix is sent including its terminating NUL */
        if (prefix)
                plen = strlen(prefix) + 1;

        /* FIXME: do we want to make the size calculation conditional based on
         * what is actually present?  way more branches and checks, but more
         * memory efficient... */
        size = nlmsg_total_size(sizeof(struct nfgenmsg))
                + nla_total_size(sizeof(struct nfulnl_msg_packet_hdr))
                + nla_total_size(sizeof(u_int32_t))        /* ifindex */
                + nla_total_size(sizeof(u_int32_t))        /* ifindex */
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
                + nla_total_size(sizeof(u_int32_t))        /* ifindex */
                + nla_total_size(sizeof(u_int32_t))        /* ifindex */
#endif
                + nla_total_size(sizeof(u_int32_t))        /* mark */
                + nla_total_size(sizeof(u_int32_t))        /* uid */
                + nla_total_size(sizeof(u_int32_t))        /* gid */
                + nla_total_size(plen)                        /* prefix */
                + nla_total_size(sizeof(struct nfulnl_msg_packet_hw))
                + nla_total_size(sizeof(struct nfulnl_msg_packet_timestamp))
                /* The NLMSG_DONE trailer is appended by __nfulnl_send()
                 * with nlmsg_put(), i.e. it is a complete netlink message
                 * (header plus struct nfgenmsg payload), not an attribute.
                 * Reserve room for the netlink header as well. */
                + nlmsg_total_size(sizeof(struct nfgenmsg));        /* NLMSG_DONE */

        if (in && skb_mac_header_was_set(skb)) {
                size += nla_total_size(skb->dev->hard_header_len)
                        + nla_total_size(sizeof(u_int16_t))        /* hwtype */
                        + nla_total_size(sizeof(u_int16_t));        /* hwlen */
        }

        /* everything below reads or modifies per-instance state */
        spin_lock_bh(&inst->lock);

        if (inst->flags & NFULNL_CFG_F_SEQ)
                size += nla_total_size(sizeof(u_int32_t));
        if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL)
                size += nla_total_size(sizeof(u_int32_t));
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        if (inst->flags & NFULNL_CFG_F_CONNTRACK) {
                nfnl_ct = rcu_dereference(nfnl_ct_hook);
                if (nfnl_ct != NULL) {
                        ct = nf_ct_get(skb, &ctinfo);
                        if (ct != NULL)
                                size += nfnl_ct->build_size(ct);
                }
        }
#endif
        if (pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE)
                size += nfulnl_get_bridge_size(skb);

        qthreshold = inst->qthreshold;
        /* per-rule qthreshold overrides per-instance, but only to lower it */
        if (li->u.ulog.qthreshold)
                if (qthreshold > li->u.ulog.qthreshold)
                        qthreshold = li->u.ulog.qthreshold;


        switch (inst->copy_mode) {
        case NFULNL_COPY_META:
        case NFULNL_COPY_NONE:
                data_len = 0;
                break;

        case NFULNL_COPY_PACKET:
                /* payload length is the smallest of the instance's copy
                 * range, the per-rule copy length (if requested) and the
                 * packet length */
                data_len = inst->copy_range;
                if ((li->u.ulog.flags & NF_LOG_F_COPY_LEN) &&
                    (li->u.ulog.copy_len < data_len))
                        data_len = li->u.ulog.copy_len;

                if (data_len > skb->len)
                        data_len = skb->len;

                size += nla_total_size(data_len);
                break;

        case NFULNL_COPY_DISABLED:
        default:
                goto unlock_and_release;
        }

        if (inst->skb && size > skb_tailroom(inst->skb)) {
                /* either the queue len is too high or we don't have
                 * enough room in the skb left. flush to userspace. */
                __nfulnl_flush(inst);
        }

        if (!inst->skb) {
                inst->skb = nfulnl_alloc_skb(net, inst->peer_portid,
                                             inst->nlbufsiz, size);
                if (!inst->skb)
                        goto alloc_failure;
        }

        inst->qlen++;

        /* the return value is not checked; see the size estimate above */
        __build_packet_message(log, inst, skb, data_len, pf,
                                hooknum, in, out, prefix, plen,
                                nfnl_ct, ct, ctinfo);

        if (inst->qlen >= qthreshold)
                __nfulnl_flush(inst);
        /* timer_pending always called within inst->lock, so there
         * is no chance of a race here */
        else if (!timer_pending(&inst->timer)) {
                /* the armed timer owns its own reference on the instance */
                instance_get(inst);
                inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100);
                add_timer(&inst->timer);
        }

unlock_and_release:
        spin_unlock_bh(&inst->lock);
        instance_put(inst);
        return;

alloc_failure:
        /* FIXME: statistics */
        goto unlock_and_release;
}

/* Netlink notifier callback.
 *
 * On NETLINK_URELEASE for a NETLINK_NETFILTER socket, walks every bucket
 * of the per-netns instance table under instances_lock and destroys each
 * instance whose peer port id matches the released socket.  All other
 * events are ignored.  Always returns NOTIFY_DONE.
 */
static int
nfulnl_rcv_nl_event(struct notifier_block *this,
                   unsigned long event, void *ptr)
{
        struct netlink_notify *n = ptr;
        struct nfnl_log_net *log = nfnl_log_pernet(n->net);
        int i;

        if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER)
                return NOTIFY_DONE;

        /* destroy all instances for this portid */
        spin_lock_bh(&log->instances_lock);
        for (i = 0; i < INSTANCE_BUCKETS; i++) {
                struct hlist_head *head = &log->instance_table[i];
                struct nfulnl_instance *inst;
                struct hlist_node *tmp;

                /* _safe variant: entries are unlinked while iterating */
                hlist_for_each_entry_safe(inst, tmp, head, hlist) {
                        if (inst->peer_portid == n->portid)
                                __instance_destroy(inst);
                }
        }
        spin_unlock_bh(&log->instances_lock);

        return NOTIFY_DONE;
}

/* Netlink notifier that routes events to nfulnl_rcv_nl_event(); it is
 * registered in nfnetlink_log_init() and unregistered on module exit. */
static struct notifier_block nfulnl_rtnl_notifier = {
        .notifier_call        = nfulnl_rcv_nl_event,
};

/* Callback installed for NFULNL_MSG_PACKET in nfulnl_cb: it ignores its
 * arguments and always returns -ENOTSUPP. */
static int nfulnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info,
                              const struct nlattr * const nfula[])
{
        return -ENOTSUPP;
}

/* Logger descriptor for this module: type NF_LOG_TYPE_ULOG, with
 * nfulnl_log_packet() as the log function.  It is registered with
 * nf_log_register() at module init and bound to a protocol family by
 * the PF_BIND configuration command. */
static struct nf_logger nfulnl_logger __read_mostly = {
        .name        = "nfnetlink_log",
        .type        = NF_LOG_TYPE_ULOG,
        .logfn        = nfulnl_log_packet,
        .me        = THIS_MODULE,
};

/* Attribute policy for NFULNL_MSG_CONFIG messages (referenced from
 * nfulnl_cb).  CMD and MODE carry fixed-size structs; TIMEOUT, QTHRESH
 * and NLBUFSIZ are 32-bit and FLAGS is 16-bit.  nfulnl_recv_config()
 * reads the integer attributes as big-endian values. */
static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = {
        [NFULA_CFG_CMD]                = { .len = sizeof(struct nfulnl_msg_config_cmd) },
        [NFULA_CFG_MODE]        = { .len = sizeof(struct nfulnl_msg_config_mode) },
        [NFULA_CFG_TIMEOUT]        = { .type = NLA_U32 },
        [NFULA_CFG_QTHRESH]        = { .type = NLA_U32 },
        [NFULA_CFG_NLBUFSIZ]        = { .type = NLA_U32 },
        [NFULA_CFG_FLAGS]        = { .type = NLA_U16 },
};

/* Handle an NFULNL_MSG_CONFIG request from userspace.
 *
 * The target group is taken from the res_id field of the nfnetlink
 * header.  Processing order:
 *   1. PF_BIND / PF_UNBIND commands are handled immediately and return.
 *   2. The instance for the group is looked up; a request from a port id
 *      other than the instance's owner is refused with -EPERM.
 *   3. Requested flags are checked before anything is changed: asking for
 *      NFULNL_CFG_F_CONNTRACK without a registered conntrack hook fails
 *      with -EOPNOTSUPP, or with -EAGAIN if loading the module made the
 *      hook appear.
 *   4. BIND creates the instance (-EBUSY if it already exists), UNBIND
 *      destroys it (-ENODEV if it does not exist), any other command
 *      yields -ENOTSUPP.  Without a command an existing instance is
 *      required (-ENODEV otherwise).
 *   5. MODE, TIMEOUT, NLBUFSIZ, QTHRESH and FLAGS attributes, if present,
 *      are applied to the instance.
 *
 * Returns 0 on success or a negative errno as described above.
 */
static int nfulnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
                              const struct nlattr * const nfula[])
{
        struct nfnl_log_net *log = nfnl_log_pernet(info->net);
        u_int16_t group_num = ntohs(info->nfmsg->res_id);
        struct nfulnl_msg_config_cmd *cmd = NULL;
        struct nfulnl_instance *inst;
        u16 flags = 0;
        int ret = 0;

        if (nfula[NFULA_CFG_CMD]) {
                u_int8_t pf = info->nfmsg->nfgen_family;
                cmd = nla_data(nfula[NFULA_CFG_CMD]);

                /* Commands without queue context */
                switch (cmd->command) {
                case NFULNL_CFG_CMD_PF_BIND:
                        return nf_log_bind_pf(info->net, pf, &nfulnl_logger);
                case NFULNL_CFG_CMD_PF_UNBIND:
                        nf_log_unbind_pf(info->net, pf);
                        return 0;
                }
        }

        /* only the port id that owns the instance may reconfigure it */
        inst = instance_lookup_get(log, group_num);
        if (inst && inst->peer_portid != NETLINK_CB(skb).portid) {
                ret = -EPERM;
                goto out_put;
        }

        /* Check if we support these flags in first place, dependencies should
         * be there too not to break atomicity.
         */
        if (nfula[NFULA_CFG_FLAGS]) {
                flags = ntohs(nla_get_be16(nfula[NFULA_CFG_FLAGS]));

                if ((flags & NFULNL_CFG_F_CONNTRACK) &&
                    !rcu_access_pointer(nfnl_ct_hook)) {
#ifdef CONFIG_MODULES
                        /* the subsystem lock is dropped around the module
                         * request and retaken afterwards; if the hook is
                         * now present the request fails with -EAGAIN */
                        nfnl_unlock(NFNL_SUBSYS_ULOG);
                        request_module("ip_conntrack_netlink");
                        nfnl_lock(NFNL_SUBSYS_ULOG);
                        if (rcu_access_pointer(nfnl_ct_hook)) {
                                ret = -EAGAIN;
                                goto out_put;
                        }
#endif
                        ret = -EOPNOTSUPP;
                        goto out_put;
                }
        }

        if (cmd != NULL) {
                switch (cmd->command) {
                case NFULNL_CFG_CMD_BIND:
                        if (inst) {
                                ret = -EBUSY;
                                goto out_put;
                        }

                        inst = instance_create(info->net, group_num,
                                               NETLINK_CB(skb).portid,
                                               sk_user_ns(NETLINK_CB(skb).sk));
                        if (IS_ERR(inst)) {
                                /* inst is an error pointer here, so the
                                 * put at out_put must be skipped */
                                ret = PTR_ERR(inst);
                                goto out;
                        }
                        break;
                case NFULNL_CFG_CMD_UNBIND:
                        if (!inst) {
                                ret = -ENODEV;
                                goto out;
                        }

                        instance_destroy(log, inst);
                        goto out_put;
                default:
                        ret = -ENOTSUPP;
                        goto out_put;
                }
        } else if (!inst) {
                ret = -ENODEV;
                goto out;
        }

        /* from here on inst is a valid instance; apply each attribute
         * that is present.  The return values of nfulnl_set_mode() and
         * nfulnl_set_nlbufsiz() are not checked, so a rejected value does
         * not change ret. */
        if (nfula[NFULA_CFG_MODE]) {
                struct nfulnl_msg_config_mode *params =
                        nla_data(nfula[NFULA_CFG_MODE]);

                nfulnl_set_mode(inst, params->copy_mode,
                                ntohl(params->copy_range));
        }

        if (nfula[NFULA_CFG_TIMEOUT]) {
                __be32 timeout = nla_get_be32(nfula[NFULA_CFG_TIMEOUT]);

                nfulnl_set_timeout(inst, ntohl(timeout));
        }

        if (nfula[NFULA_CFG_NLBUFSIZ]) {
                __be32 nlbufsiz = nla_get_be32(nfula[NFULA_CFG_NLBUFSIZ]);

                nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz));
        }

        if (nfula[NFULA_CFG_QTHRESH]) {
                __be32 qthresh = nla_get_be32(nfula[NFULA_CFG_QTHRESH]);

                nfulnl_set_qthresh(inst, ntohl(qthresh));
        }

        if (nfula[NFULA_CFG_FLAGS])
                nfulnl_set_flags(inst, flags);

out_put:
        /* NOTE(review): some paths reach this label with inst == NULL
         * (e.g. an unknown command for a group with no instance); this
         * relies on instance_put() tolerating NULL -- confirm. */
        instance_put(inst);
out:
        return ret;
}

/* Message dispatch table for the ULOG subsystem, indexed by message type.
 * PACKET messages sent from userspace are rejected by nfulnl_recv_unsupp();
 * CONFIG messages are parsed against nfula_cfg_policy and handled by
 * nfulnl_recv_config().  Both entries use the NFNL_CB_MUTEX callback type. */
static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = {
        [NFULNL_MSG_PACKET]        = {
                .call                = nfulnl_recv_unsupp,
                .type                = NFNL_CB_MUTEX,
                .attr_count        = NFULA_MAX,
        },
        [NFULNL_MSG_CONFIG]        = {
                .call                = nfulnl_recv_config,
                .type                = NFNL_CB_MUTEX,
                .attr_count        = NFULA_CFG_MAX,
                .policy                = nfula_cfg_policy
        },
};

/* nfnetlink subsystem descriptor for NFNL_SUBSYS_ULOG, tying the "log"
 * subsystem to the nfulnl_cb dispatch table.  Registered in
 * nfnetlink_log_init(). */
static const struct nfnetlink_subsystem nfulnl_subsys = {
        .name                = "log",
        .subsys_id        = NFNL_SUBSYS_ULOG,
        .cb_count        = NFULNL_MSG_MAX,
        .cb                = nfulnl_cb,
};

#ifdef CONFIG_PROC_FS
/* Private iterator state for the "nfnetlink_log" seq_file. */
struct iter_state {
        struct seq_net_private p;
        /* index of the instance_table bucket currently being walked */
        unsigned int bucket;
};

/* Find the first entry of the per-netns instance table.
 *
 * Scans the buckets from index 0 and leaves st->bucket at the first
 * non-empty one, returning that bucket's first node.  Returns NULL when
 * @st is NULL or every bucket is empty (st->bucket then equals
 * INSTANCE_BUCKETS).
 */
static struct hlist_node *get_first(struct net *net, struct iter_state *st)
{
        struct nfnl_log_net *log;

        if (!st)
                return NULL;

        log = nfnl_log_pernet(net);

        st->bucket = 0;
        while (st->bucket < INSTANCE_BUCKETS) {
                struct hlist_head *chain = &log->instance_table[st->bucket];

                if (!hlist_empty(chain))
                        return rcu_dereference(hlist_first_rcu(chain));

                st->bucket++;
        }

        return NULL;
}

/* Advance from node @h to the following entry of the instance table.
 *
 * Takes the successor of @h in its chain; when the chain is exhausted,
 * moves st->bucket forward to later buckets until one yields a node.
 * Returns NULL once st->bucket runs past the last bucket.
 */
static struct hlist_node *get_next(struct net *net, struct iter_state *st,
                                   struct hlist_node *h)
{
        struct hlist_node *node = rcu_dereference(hlist_next_rcu(h));

        while (!node) {
                struct nfnl_log_net *log;
                struct hlist_head *chain;

                st->bucket++;
                if (st->bucket >= INSTANCE_BUCKETS)
                        return NULL;

                log = nfnl_log_pernet(net);
                chain = &log->instance_table[st->bucket];
                node = rcu_dereference(hlist_first_rcu(chain));
        }

        return node;
}

/* Return the entry at position @pos of the instance table, counting from
 * the first entry, or NULL when the table holds no entry at that
 * position. */
static struct hlist_node *get_idx(struct net *net, struct iter_state *st,
                                  loff_t pos)
{
        struct hlist_node *node = get_first(net, st);

        /* step forward once per remaining position; a step that runs off
         * the end leaves pos non-zero and ends the walk */
        while (node && pos) {
                node = get_next(net, st, node);
                if (node)
                        pos--;
        }

        return pos ? NULL : node;
}

/* seq_file .start: enter an RCU read-side critical section (left again
 * in seq_stop()) and position the iterator at entry *pos.  Returns the
 * node at that position or NULL when there is none. */
static void *seq_start(struct seq_file *s, loff_t *pos)
        __acquires(rcu)
{
        rcu_read_lock();
        return get_idx(seq_file_net(s), s->private, *pos);
}

/* seq_file .next: bump the position and return the entry following @v,
 * or NULL at the end of the table. */
static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
{
        (*pos)++;
        return get_next(seq_file_net(s), s->private, v);
}

/* seq_file .stop: leave the RCU read-side critical section entered in
 * seq_start(). */
static void seq_stop(struct seq_file *s, void *v)
        __releases(rcu)
{
        rcu_read_unlock();
}

/* seq_file .show: print one line for an instance with, in order, its
 * group number, peer port id, queue length, copy mode, copy range,
 * flush timeout and reference count.  Always returns 0. */
static int seq_show(struct seq_file *s, void *v)
{
        /* NOTE(review): the iterator hands out hlist_node pointers, which
         * are used here directly as the instance; presumably the hlist
         * member is the first field of struct nfulnl_instance -- confirm. */
        const struct nfulnl_instance *inst = v;

        seq_printf(s, "%5u %6u %5u %1u %5u %6u %2u\n",
                   inst->group_num,
                   inst->peer_portid, inst->qlen,
                   inst->copy_mode, inst->copy_range,
                   inst->flushtimeout, refcount_read(&inst->use));

        return 0;
}

/* seq_file operations backing the "nfnetlink_log" proc entry created in
 * nfnl_log_net_init(). */
static const struct seq_operations nful_seq_ops = {
        .start        = seq_start,
        .next        = seq_next,
        .stop        = seq_stop,
        .show        = seq_show,
};
#endif /* PROC_FS */

/* Per-network-namespace initialisation.
 *
 * Initialises every bucket of the instance table and the lock protecting
 * it.  With CONFIG_PROC_FS it also creates the "nfnetlink_log" proc entry
 * (mode 0440) and, when uid 0 and gid 0 of the namespace's user namespace
 * map to valid kernel ids, makes them the owner of that entry.
 *
 * Returns 0 on success or -ENOMEM if the proc entry cannot be created.
 */
static int __net_init nfnl_log_net_init(struct net *net)
{
        unsigned int i;
        struct nfnl_log_net *log = nfnl_log_pernet(net);
#ifdef CONFIG_PROC_FS
        struct proc_dir_entry *proc;
        kuid_t root_uid;
        kgid_t root_gid;
#endif

        for (i = 0; i < INSTANCE_BUCKETS; i++)
                INIT_HLIST_HEAD(&log->instance_table[i]);
        spin_lock_init(&log->instances_lock);

#ifdef CONFIG_PROC_FS
        proc = proc_create_net("nfnetlink_log", 0440, net->nf.proc_netfilter,
                        &nful_seq_ops, sizeof(struct iter_state));
        if (!proc)
                return -ENOMEM;

        /* hand the entry to the namespace's root user, if it maps */
        root_uid = make_kuid(net->user_ns, 0);
        root_gid = make_kgid(net->user_ns, 0);
        if (uid_valid(root_uid) && gid_valid(root_gid))
                proc_set_user(proc, root_uid, root_gid);
#endif
        return 0;
}

/* Per-network-namespace teardown: remove the proc entry (with
 * CONFIG_PROC_FS), unset this logger for the namespace, and warn once if
 * any instance table bucket is still populated. */
static void __net_exit nfnl_log_net_exit(struct net *net)
{
        struct nfnl_log_net *log = nfnl_log_pernet(net);
        unsigned int i;

#ifdef CONFIG_PROC_FS
        remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter);
#endif
        nf_log_unset(net, &nfulnl_logger);
        /* all instances are expected to be gone by now */
        for (i = 0; i < INSTANCE_BUCKETS; i++)
                WARN_ON_ONCE(!hlist_empty(&log->instance_table[i]));
}

/* Per-netns hooks for this module; .id and .size describe the
 * struct nfnl_log_net area that nfnl_log_pernet() hands out. */
static struct pernet_operations nfnl_log_net_ops = {
        .init        = nfnl_log_net_init,
        .exit        = nfnl_log_net_exit,
        .id        = &nfnl_log_net_id,
        .size        = sizeof(struct nfnl_log_net),
};

/* Module entry point.
 *
 * Registers, in order: the per-netns operations, the netlink notifier,
 * the nfnetlink subsystem and the logger.  If a later step fails, the
 * steps already completed are undone in reverse order via the cleanup
 * labels.  The return value of netlink_register_notifier() is not
 * checked.
 *
 * Returns the status of the last registration: negative on failure,
 * otherwise whatever nf_log_register() returned.
 */
static int __init nfnetlink_log_init(void)
{
        int status;

        status = register_pernet_subsys(&nfnl_log_net_ops);
        if (status < 0) {
                pr_err("failed to register pernet ops\n");
                goto out;
        }

        netlink_register_notifier(&nfulnl_rtnl_notifier);
        status = nfnetlink_subsys_register(&nfulnl_subsys);
        if (status < 0) {
                pr_err("failed to create netlink socket\n");
                goto cleanup_netlink_notifier;
        }

        status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger);
        if (status < 0) {
                pr_err("failed to register logger\n");
                goto cleanup_subsys;
        }

        return status;

cleanup_subsys:
        nfnetlink_subsys_unregister(&nfulnl_subsys);
cleanup_netlink_notifier:
        /* this label also undoes the pernet registration */
        netlink_unregister_notifier(&nfulnl_rtnl_notifier);
        unregister_pernet_subsys(&nfnl_log_net_ops);
out:
        return status;
}

/* Module exit: unregister the nfnetlink subsystem, the netlink notifier,
 * the per-netns operations and finally the logger. */
static void __exit nfnetlink_log_fini(void)
{
        nfnetlink_subsys_unregister(&nfulnl_subsys);
        netlink_unregister_notifier(&nfulnl_rtnl_notifier);
        unregister_pernet_subsys(&nfnl_log_net_ops);
        nf_log_unregister(&nfulnl_logger);
}

/* Module metadata. */
MODULE_DESCRIPTION("netfilter userspace logging");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_LICENSE("GPL");
/* Module aliases: one for the ULOG nfnetlink subsystem and one logger
 * alias per protocol family (the second argument is 1 in each). */
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG);
MODULE_ALIAS_NF_LOGGER(AF_INET, 1);
MODULE_ALIAS_NF_LOGGER(AF_INET6, 1);
MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1);
MODULE_ALIAS_NF_LOGGER(3, 1); /* NFPROTO_ARP */
MODULE_ALIAS_NF_LOGGER(5, 1); /* NFPROTO_NETDEV */

/* Module entry and exit points. */
module_init(nfnetlink_log_init);
module_exit(nfnetlink_log_fini);























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 










































































    1 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * linux/include/linux/jbd2.h
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>
 *
 * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved
 *
 * Definitions for transaction data structures for the buffer cache
 * filesystem journaling support.
 */

#ifndef _LINUX_JBD2_H
#define _LINUX_JBD2_H

/* Allow this file to be included directly into e2fsprogs */
#ifndef __KERNEL__
#include "jfs_compat.h"
#define JBD2_DEBUG
#else

#include <linux/types.h>
#include <linux/buffer_head.h>
#include <linux/journal-head.h>
#include <linux/stddef.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/slab.h>
#include <linux/bit_spinlock.h>
#include <linux/blkdev.h>
#include <crypto/hash.h>
#endif

#define journal_oom_retry 1

/*
 * Define JBD2_PARANOID_IOFAIL to cause a kernel BUG() if ext4 finds
 * certain classes of error which can occur due to failed IOs.  Under
 * normal use we want ext4 to continue after such errors, because
 * hardware _can_ fail, but for debugging purposes when running tests on
 * known-good hardware we may want to trap these errors.
 */
#undef JBD2_PARANOID_IOFAIL

/*
 * The default maximum commit age, in seconds.
 */
#define JBD2_DEFAULT_MAX_COMMIT_AGE 5

#ifdef CONFIG_JBD2_DEBUG
/*
 * Define JBD2_EXPENSIVE_CHECKING to enable more expensive internal
 * consistency checks.  By default we don't do this unless
 * CONFIG_JBD2_DEBUG is on.
 */
#define JBD2_EXPENSIVE_CHECKING
void __jbd2_debug(int level, const char *file, const char *func,
                  unsigned int line, const char *fmt, ...);

#define jbd2_debug(n, fmt, a...) \
        __jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a)
#else
#define jbd2_debug(n, fmt, a...)  no_printk(fmt, ##a)
#endif

extern void *jbd2_alloc(size_t size, gfp_t flags);
extern void jbd2_free(void *ptr, size_t size);

#define JBD2_MIN_JOURNAL_BLOCKS 1024
#define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256

#ifdef __KERNEL__

/**
 * typedef handle_t - The handle_t type represents a single atomic update being performed by some process.
 *
 * All filesystem modifications made by the process go
 * through this handle.  Recursive operations (such as quota operations)
 * are gathered into a single update.
 *
 * The buffer credits field is used to account for journaled buffers
 * being modified by the running process.  To ensure that there is
 * enough log space for all outstanding operations, we need to limit the
 * number of outstanding buffers possible at any time.  When the
 * operation completes, any buffer credits not used are credited back to
 * the transaction, so that at all times we know how many buffers the
 * outstanding updates on a transaction might possibly touch.
 *
 * This is an opaque datatype.
 **/
typedef struct jbd2_journal_handle handle_t;        /* Atomic operation type */


/**
 * typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem.
 *
 * journal_t is linked to from the fs superblock structure.
 *
 * We use the journal_t to keep track of all outstanding transaction
 * activity on the filesystem, and to manage the state of the log
 * writing process.
 *
 * This is an opaque datatype.
 **/
typedef struct journal_s        journal_t;        /* Journal control structure */
#endif

/*
 * Internal structures used by the logging mechanism:
 */

#define JBD2_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */

/*
 * On-disk structures
 */

/*
 * Descriptor block types:
 */

#define JBD2_DESCRIPTOR_BLOCK        1
#define JBD2_COMMIT_BLOCK        2
#define JBD2_SUPERBLOCK_V1        3
#define JBD2_SUPERBLOCK_V2        4
#define JBD2_REVOKE_BLOCK        5

/*
 * Standard header for all descriptor blocks:
 */
/*
 * Three big-endian 32-bit words (12 bytes).  This is an on-disk structure:
 * do not reorder or resize the fields.
 */
typedef struct journal_header_s
{
        /* Presumably JBD2_MAGIC_NUMBER for a valid block -- TODO confirm */
        __be32                h_magic;
        /* Presumably one of the JBD2_*_BLOCK / JBD2_SUPERBLOCK_V* types */
        __be32                h_blocktype;
        /* NOTE(review): looks like the transaction sequence number -- confirm */
        __be32                h_sequence;
} journal_header_t;

/*
 * Checksum types.
 */
#define JBD2_CRC32_CHKSUM   1
#define JBD2_MD5_CHKSUM     2
#define JBD2_SHA1_CHKSUM    3
#define JBD2_CRC32C_CHKSUM  4

#define JBD2_CRC32_CHKSUM_SIZE 4

#define JBD2_CHECKSUM_BYTES (32 / sizeof(u32))
/*
 * Commit block header for storing transactional checksums:
 *
 * NOTE: If FEATURE_COMPAT_CHECKSUM (checksum v1) is set, the h_chksum*
 * fields are used to store a checksum of the descriptor and data blocks.
 *
 * If FEATURE_INCOMPAT_CSUM_V2 (checksum v2) is set, then the h_chksum
 * field is used to store crc32c(uuid+commit_block).  Each journal metadata
 * block gets its own checksum, and data block checksums are stored in
 * journal_block_tag (in the descriptor).  The other h_chksum* fields are
 * not used.
 *
 * If FEATURE_INCOMPAT_CSUM_V3 is set, the descriptor block uses
 * journal_block_tag3_t to store a full 32-bit checksum.  Everything else
 * is the same as v2.
 *
 * Checksum v1, v2, and v3 are mutually exclusive features.
 */
/*
 * On-disk commit block header.  The first three fields have the same names,
 * types and order as journal_header_t.  Do not reorder or resize fields.
 */
struct commit_header {
        __be32                h_magic;
        __be32          h_blocktype;
        __be32          h_sequence;
        /* Presumably one of the JBD2_*_CHKSUM type values -- TODO confirm */
        unsigned char   h_chksum_type;
        /* NOTE(review): presumably the checksum length in bytes -- confirm */
        unsigned char   h_chksum_size;
        unsigned char         h_padding[2];
        /*
         * JBD2_CHECKSUM_BYTES is (32 / sizeof(u32)), so despite its name it
         * is the number of 32-bit words in this array, not a byte count.
         */
        __be32                 h_chksum[JBD2_CHECKSUM_BYTES];
        /* NOTE(review): looks like the commit time (seconds + nanoseconds) */
        __be64                h_commit_sec;
        __be32                h_commit_nsec;
};

/*
 * The block tag: used to describe a single buffer in the journal.
 * t_blocknr_high is only used if INCOMPAT_64BIT is set, so this
 * raw struct shouldn't be used for pointer math or sizeof() - use
 * journal_tag_bytes(journal) instead to compute this.
 */
/*
 * Block tag variant with a full 32-bit t_checksum and a 32-bit t_flags word.
 * On-disk structure: do not reorder or resize fields.
 */
typedef struct journal_block_tag3_s
{
        __be32                t_blocknr;        /* The on-disk block number */
        __be32                t_flags;        /* See below */
        __be32                t_blocknr_high; /* most-significant high 32bits. */
        __be32                t_checksum;        /* crc32c(uuid+seq+block) */
} journal_block_tag3_t;

/*
 * Block tag variant with a 16-bit (truncated) t_checksum and a 16-bit
 * t_flags word, followed by the high half of the block number.
 * On-disk structure: do not reorder or resize fields.
 */
typedef struct journal_block_tag_s
{
        __be32                t_blocknr;        /* The on-disk block number */
        __be16                t_checksum;        /* truncated crc32c(uuid+seq+block) */
        __be16                t_flags;        /* See below */
        __be32                t_blocknr_high; /* most-significant high 32bits. */
} journal_block_tag_t;

/* Tail of descriptor or revoke block, for checksumming */
/* Single big-endian 32-bit checksum word; on-disk structure. */
struct jbd2_journal_block_tail {
        __be32                t_checksum;        /* crc32c(uuid+descr_block) */
};

/*
 * The revoke descriptor: used on disk to describe a series of blocks to
 * be revoked from the log
 */
/*
 * Standard journal header followed by a byte count.  On-disk structure:
 * do not reorder or resize fields.
 */
typedef struct jbd2_journal_revoke_header_s
{
        journal_header_t r_header;
        __be32                 r_count;        /* Count of bytes used in the block */
} jbd2_journal_revoke_header_t;

/* Definitions for the journal tag flags word: */
#define JBD2_FLAG_ESCAPE                1        /* on-disk block is escaped */
#define JBD2_FLAG_SAME_UUID        2        /* block has same uuid as previous */
#define JBD2_FLAG_DELETED        4        /* block deleted by this transaction */
#define JBD2_FLAG_LAST_TAG        8        /* last tag in this descriptor block */


/*
 * The journal superblock.  All fields are in big-endian byte order.
 */
/*
 * On-disk structure: the offset markers below record the expected byte
 * offset of the field group that follows, ending at 0x0400 (1024 bytes).
 * Do not reorder or resize fields.
 */
typedef struct journal_superblock_s
{
/* 0x0000 */
        journal_header_t s_header;

/* 0x000C */
        /* Static information describing the journal */
        __be32        s_blocksize;                /* journal device blocksize */
        __be32        s_maxlen;                /* total blocks in journal file */
        __be32        s_first;                /* first block of log information */

/* 0x0018 */
        /* Dynamic information describing the current state of the log */
        __be32        s_sequence;                /* first commit ID expected in log */
        __be32        s_start;                /* blocknr of start of log */

/* 0x0020 */
        /* Error value, as set by jbd2_journal_abort(). */
        __be32        s_errno;

/* 0x0024 */
        /* Remaining fields are only valid in a version-2 superblock */
        __be32        s_feature_compat;        /* compatible feature set */
        __be32        s_feature_incompat;        /* incompatible feature set */
        __be32        s_feature_ro_compat;        /* readonly-compatible feature set */
/* 0x0030 */
        __u8        s_uuid[16];                /* 128-bit uuid for journal */

/* 0x0040 */
        __be32        s_nr_users;                /* Nr of filesystems sharing log */

        __be32        s_dynsuper;                /* Blocknr of dynamic superblock copy*/

/* 0x0048 */
        __be32        s_max_transaction;        /* Limit of journal blocks per trans.*/
        __be32        s_max_trans_data;        /* Limit of data blocks per trans. */

/* 0x0050 */
        __u8        s_checksum_type;        /* checksum type */
        __u8        s_padding2[3];
/* 0x0054 */
        __be32        s_num_fc_blks;                /* Number of fast commit blocks */
        __be32        s_head;                        /* blocknr of head of log, only uptodate
                                         * while the filesystem is clean */
/* 0x005C */
        /*
         * 40 32-bit words (160 bytes) of padding: 0x005C + 0xA0 = 0x00FC, so
         * s_checksum is the last word before the 0x0100 marker.
         */
        __u32        s_padding[40];
        __be32        s_checksum;                /* crc32c(superblock) */

/* 0x0100 */
        /*
         * 16*48 = 768 bytes (0x0300), filling 0x0100..0x0400.
         * NOTE(review): presumably 48 ids of 16 bytes each -- confirm.
         */
        __u8        s_users[16*48];                /* ids of all fs'es sharing the log */
/* 0x0400 */
} journal_superblock_t;

#define JBD2_FEATURE_COMPAT_CHECKSUM                0x00000001

#define JBD2_FEATURE_INCOMPAT_REVOKE                0x00000001
#define JBD2_FEATURE_INCOMPAT_64BIT                0x00000002
#define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT        0x00000004
#define JBD2_FEATURE_INCOMPAT_CSUM_V2                0x00000008
#define JBD2_FEATURE_INCOMPAT_CSUM_V3                0x00000010
#define JBD2_FEATURE_INCOMPAT_FAST_COMMIT        0x00000020

/* See "journal feature predicate functions" below */

/* Features known to this kernel version: */
#define JBD2_KNOWN_COMPAT_FEATURES        JBD2_FEATURE_COMPAT_CHECKSUM
#define JBD2_KNOWN_ROCOMPAT_FEATURES        0
#define JBD2_KNOWN_INCOMPAT_FEATURES        (JBD2_FEATURE_INCOMPAT_REVOKE | \
                                        JBD2_FEATURE_INCOMPAT_64BIT | \
                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \
                                        JBD2_FEATURE_INCOMPAT_CSUM_V2 | \
                                        JBD2_FEATURE_INCOMPAT_CSUM_V3 | \
                                        JBD2_FEATURE_INCOMPAT_FAST_COMMIT)

#ifdef __KERNEL__

#include <linux/fs.h>
#include <linux/sched.h>

/*
 * Bit numbers for journaling state, numbered consecutively starting at
 * BH_PrivateStart.  The order of the enumerators fixes each bit's value, so
 * new bits must not be inserted in the middle.
 */
enum jbd_state_bits {
        BH_JBD                        /* Has an attached ext3 journal_head */
          = BH_PrivateStart,
        BH_JWrite,                /* Being written to log (@@@ DEBUGGING) */
        BH_Freed,                /* Has been freed (truncated) */
        BH_Revoked,                /* Has been revoked from the log */
        BH_RevokeValid,                /* Revoked flag is valid */
        BH_JBDDirty,                /* Is dirty but journaled */
        BH_JournalHead,                /* Pins bh->b_private and jh->b_bh */
        BH_Shadow,                /* IO on shadow buffer is running */
        BH_Verified,                /* Metadata block has been verified ok */
        BH_JBDPrivateStart,        /* First bit available for private use by FS */
};

/*
 * Instantiate helper functions for the jbd_state_bits flags.  BUFFER_FNS and
 * TAS_BUFFER_FNS are defined outside this file.
 * NOTE(review): presumably they generate set/clear/test and
 * test-and-set/test-and-clear helpers respectively -- confirm against the
 * macro definitions.
 */
BUFFER_FNS(JBD, jbd)
BUFFER_FNS(JWrite, jwrite)
BUFFER_FNS(JBDDirty, jbddirty)
TAS_BUFFER_FNS(JBDDirty, jbddirty)
BUFFER_FNS(Revoked, revoked)
TAS_BUFFER_FNS(Revoked, revoked)
BUFFER_FNS(RevokeValid, revokevalid)
TAS_BUFFER_FNS(RevokeValid, revokevalid)
BUFFER_FNS(Freed, freed)
BUFFER_FNS(Shadow, shadow)
BUFFER_FNS(Verified, verified)

/* Map a journal_head to its buffer_head by reading jh->b_bh. */
static inline struct buffer_head *jh2bh(struct journal_head *jh)
{
        struct buffer_head *owner = jh->b_bh;

        return owner;
}

/*
 * Map a buffer_head to its journal_head by reading bh->b_private.
 * No check is made here that a journal_head is actually attached.
 */
static inline struct journal_head *bh2jh(struct buffer_head *bh)
{
        struct journal_head *attached = bh->b_private;

        return attached;
}

/* Take the BH_JournalHead bit spinlock held in the buffer's b_state word. */
static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
{
        unsigned long *state_word = &bh->b_state;

        bit_spin_lock(BH_JournalHead, state_word);
}

/* Release the BH_JournalHead bit spinlock held in the buffer's b_state word. */
static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
{
        unsigned long *state_word = &bh->b_state;

        bit_spin_unlock(BH_JournalHead, state_word);
}

#define J_ASSERT(assert)        BUG_ON(!(assert))

#define J_ASSERT_BH(bh, expr)        J_ASSERT(expr)
#define J_ASSERT_JH(jh, expr)        J_ASSERT(expr)

#if defined(JBD2_PARANOID_IOFAIL)
#define J_EXPECT(expr, why...)                J_ASSERT(expr)
#define J_EXPECT_BH(bh, expr, why...)        J_ASSERT_BH(bh, expr)
#define J_EXPECT_JH(jh, expr, why...)        J_ASSERT_JH(jh, expr)
#else
/*
 * __journal_expect(expr, why...) - non-fatal expectation check.
 *
 * Evaluates @expr once into an int.  If it is zero, prints two KERN_ERR
 * lines: one naming the calling function and the stringified expression,
 * and one containing @why.  The whole statement expression yields the int
 * value of @expr, so callers can branch on the result.
 *
 * @why is pasted directly in front of "\n" as adjacent string literals, so
 * it must be a single string literal (or be omitted); it cannot carry
 * printf-style arguments.
 */
#define __journal_expect(expr, why...)                                             \
        ({                                                                     \
                int val = (expr);                                             \
                if (!val) {                                                     \
                        printk(KERN_ERR                                             \
                               "JBD2 unexpected failure: %s: %s;\n",             \
                               __func__, #expr);                             \
                        printk(KERN_ERR why "\n");                             \
                }                                                             \
                val;                                                             \
        })
#define J_EXPECT(expr, why...)                __journal_expect(expr, ## why)
#define J_EXPECT_BH(bh, expr, why...)        __journal_expect(expr, ## why)
#define J_EXPECT_JH(jh, expr, why...)        __journal_expect(expr, ## why)
#endif

/* Flags in jbd_inode->i_flags */
#define __JI_COMMIT_RUNNING 0
#define __JI_WRITE_DATA 1
#define __JI_WAIT_DATA 2

/*
 * Commit of the inode data in progress. We use this flag to protect us from
 * concurrent deletion of inode. We cannot use reference to inode for this
 * since we cannot afford doing last iput() on behalf of kjournald
 */
#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
/* Write allocated dirty buffers in this inode before commit */
#define JI_WRITE_DATA (1 << __JI_WRITE_DATA)
/* Wait for outstanding data writes for this inode before commit */
#define JI_WAIT_DATA (1 << __JI_WAIT_DATA)

/**
 * struct jbd2_inode - The jbd_inode type is the structure linking inodes in
 * ordered mode present in a transaction so that we can sync them during commit.
 */
struct jbd2_inode {
        /**
         * @i_transaction:
         *
         * Which transaction does this inode belong to? Either the running
         * transaction or the committing one. [j_list_lock]
         */
        transaction_t *i_transaction;

        /**
         * @i_next_transaction:
         *
         * Pointer to the running transaction modifying inode's data in case
         * there is already a committing transaction touching it. [j_list_lock]
         */
        transaction_t *i_next_transaction;

        /**
         * @i_list: List of inodes in the i_transaction [j_list_lock]
         */
        struct list_head i_list;

        /**
         * @i_vfs_inode:
         *
         * VFS inode this inode belongs to [constant for lifetime of structure]
         */
        struct inode *i_vfs_inode;

        /**
         * @i_flags:
         *
         * Flags of inode; see the __JI_* bit numbers and JI_* masks defined
         * for jbd_inode->i_flags. [j_list_lock]
         */
        unsigned long i_flags;

        /**
         * @i_dirty_start:
         *
         * Offset in bytes where the dirty range for this inode starts.
         * [j_list_lock]
         */
        loff_t i_dirty_start;

        /**
         * @i_dirty_end:
         *
         * Inclusive offset in bytes where the dirty range for this inode
         * ends. [j_list_lock]
         */
        loff_t i_dirty_end;
};

struct jbd2_revoke_table_s;

/**
 * struct jbd2_journal_handle - The jbd2_journal_handle type is the concrete
 *     type associated with handle_t.
 * @h_transaction: Which compound transaction is this update a part of?
 * @h_journal: Which journal handle belongs to - used iff h_reserved set.
 * @h_rsv_handle: Handle reserved for finishing the logical operation.
 * @h_total_credits: Number of remaining buffers we are allowed to add to
 *        journal. These are dirty buffers and revoke descriptor blocks.
 * @h_revoke_credits: Number of remaining revoke records available for handle
 * @h_ref: Reference count on this handle.
 * @h_err: Field for caller's use to track errors through large fs operations.
 * @h_sync: Flag for sync-on-close.
 * @h_jdata: Flag to force data journaling.
 * @h_reserved: Flag for handle for reserved credits.
 * @h_aborted: Flag indicating fatal error on handle.
 * @h_type: For handle statistics.
 * @h_line_no: For handle statistics.
 * @h_start_jiffies: Handle Start time.
 * @h_requested_credits: Holds @h_total_credits after handle is started.
 * @h_revoke_credits_requested: Holds @h_revoke_credits after handle is started.
 * @saved_alloc_context: Saved context while transaction is open.
 **/

/* Docbook can't yet cope with the bit fields, but will leave the documentation
 * in so it can be fixed later.
 */

struct jbd2_journal_handle
{
        /*
         * The two pointers share storage; per the note on h_journal, that
         * member is the one in use iff h_reserved is set.
         */
        union {
                transaction_t        *h_transaction;
                /* Which journal handle belongs to - used iff h_reserved set */
                journal_t        *h_journal;
        };

        handle_t                *h_rsv_handle;
        int                        h_total_credits;
        int                        h_revoke_credits;
        int                        h_revoke_credits_requested;
        int                        h_ref;
        int                        h_err;

        /* Flags [no locking] */
        /* The bit-fields below use 1+1+1+1+8+16 = 28 bits in total. */
        unsigned int        h_sync:                1;
        unsigned int        h_jdata:        1;
        unsigned int        h_reserved:        1;
        unsigned int        h_aborted:        1;
        unsigned int        h_type:                8;
        /* 16 bits wide: values above 65535 do not fit. */
        unsigned int        h_line_no:        16;

        unsigned long                h_start_jiffies;
        unsigned int                h_requested_credits;

        unsigned int                saved_alloc_context;
};


/*
 * Some stats for checkpoint phase
 */
/*
 * NOTE(review): field meanings are inferred from their names only (time
 * spent checkpointing, and counts of buffers forced to close / written /
 * dropped) -- confirm against the code that updates them.
 */
struct transaction_chp_stats_s {
        unsigned long                cs_chp_time;
        __u32                        cs_forced_to_close;
        __u32                        cs_written;
        __u32                        cs_dropped;
};

/* The transaction_t type is the guts of the journaling mechanism.  It
 * tracks a compound transaction through its various states:
 *
 * RUNNING:        accepting new updates
 * LOCKED:        Updates still running but we don't accept new ones
 * RUNDOWN:        Updates are tidying up but have finished requesting
 *                new buffers to modify (state not used for now)
 * FLUSH:       All updates complete, but we are still writing to disk
 * COMMIT:      All data on disk, writing commit record
 * FINISHED:        We still have to keep the transaction for checkpointing.
 *
 * The transaction keeps track of all of the buffers modified by a
 * running transaction, and all of the buffers committed but not yet
 * flushed to home for finished transactions.
 * (Locking Documentation improved by LockDoc)
 */

/*
 * Lock ranking:
 *
 *    j_list_lock
 *      ->jbd_lock_bh_journal_head()        (This is "innermost")
 *
 *    j_state_lock
 *    ->b_state_lock
 *
 *    b_state_lock
 *    ->j_list_lock
 *
 *    j_state_lock
 *    ->j_list_lock                        (journal_unmap_buffer)
 *
 */

/*
 * Per-transaction state.  Bracketed tags in the member comments name the
 * lock that protects each member; "[none]" and "[no locking]" mark members
 * documented as needing no lock.
 */
struct transaction_s
{
        /* Pointer to the journal for this transaction. [no locking] */
        journal_t                *t_journal;

        /* Sequence number for this transaction [no locking] */
        tid_t                        t_tid;

        /*
         * Transaction's current state
         * [no locking - only kjournald2 alters this]
         * [j_list_lock] guards transition of a transaction into T_FINISHED
         * state and subsequent call of __jbd2_journal_drop_transaction()
         * FIXME: needs barriers
         * KLUDGE: [use j_state_lock]
         */
        enum {
                T_RUNNING,
                T_LOCKED,
                T_SWITCH,
                T_FLUSH,
                T_COMMIT,
                T_COMMIT_DFLUSH,
                T_COMMIT_JFLUSH,
                T_COMMIT_CALLBACK,
                T_FINISHED
        }                        t_state;

        /*
         * Where in the log does this transaction's commit start? [no locking]
         */
        unsigned long                t_log_start;

        /*
         * Number of buffers on the t_buffers list [j_list_lock, no locks
         * needed for jbd2 thread]
         */
        int                        t_nr_buffers;

        /*
         * Doubly-linked circular list of all buffers reserved but not yet
         * modified by this transaction [j_list_lock, no locks needed for
         * jbd2 thread]
         */
        struct journal_head        *t_reserved_list;

        /*
         * Doubly-linked circular list of all metadata buffers owned by this
         * transaction [j_list_lock, no locks needed for jbd2 thread]
         */
        struct journal_head        *t_buffers;

        /*
         * Doubly-linked circular list of all forget buffers (superseded
         * buffers which we can un-checkpoint once this transaction commits)
         * [j_list_lock]
         */
        struct journal_head        *t_forget;

        /*
         * Doubly-linked circular list of all buffers still to be flushed before
         * this transaction can be checkpointed. [j_list_lock]
         */
        struct journal_head        *t_checkpoint_list;

        /*
         * Doubly-linked circular list of metadata buffers being
         * shadowed by log IO.  The IO buffers on the iobuf list and
         * the shadow buffers on this list match each other one for
         * one at all times. [j_list_lock, no locks needed for jbd2
         * thread]
         */
        struct journal_head        *t_shadow_list;

        /*
         * List of inodes associated with the transaction; e.g., ext4 uses
         * this to track inodes in data=ordered and data=journal mode that
         * need special handling on transaction commit; also used by ocfs2.
         * [j_list_lock]
         */
        struct list_head        t_inode_list;

        /*
         * Longest time some handle had to wait for running transaction
         */
        unsigned long                t_max_wait;

        /*
         * When transaction started
         */
        unsigned long                t_start;

        /*
         * When commit was requested [j_state_lock]
         */
        unsigned long                t_requested;

        /*
         * Checkpointing stats [j_list_lock]
         */
        struct transaction_chp_stats_s t_chp_stats;

        /*
         * Number of outstanding updates running on this transaction
         * [none]
         */
        atomic_t                t_updates;

        /*
         * Number of blocks reserved for this transaction in the journal.
         * This is including all credits reserved when starting transaction
         * handles as well as all journal descriptor blocks needed for this
         * transaction. [none]
         */
        atomic_t                t_outstanding_credits;

        /*
         * Number of revoke records for this transaction added by already
         * stopped handles. [none]
         */
        atomic_t                t_outstanding_revokes;

        /*
         * How many handles used this transaction? [none]
         */
        atomic_t                t_handle_count;

        /*
         * Forward and backward links for the circular list of all transactions
         * awaiting checkpoint. [j_list_lock]
         */
        transaction_t                *t_cpnext, *t_cpprev;

        /*
         * When will the transaction expire (become due for commit), in jiffies?
         * [no locking]
         */
        unsigned long                t_expires;

        /*
         * When this transaction started, in nanoseconds [no locking]
         */
        ktime_t                        t_start_time;

        /*
         * This transaction is being forced and some process is
         * waiting for it to finish.
         */
        unsigned int t_synchronous_commit:1;

        /* Disk flush needs to be sent to fs partition [no locking] */
        int                        t_need_data_flush;

        /*
         * For use by the filesystem to store fs-specific data
         * structures associated with the transaction
         */
        struct list_head        t_private_list;
};

/*
 * Run statistics of a transaction.  Embedded as the "run" member of
 * struct transaction_stats_s.
 */
struct transaction_run_stats_s {
        /*
         * NOTE(review): the unsigned long members look like per-phase time
         * totals (waiting, request delay, running, locked, flushing,
         * logging); the unit is not visible here - presumably jiffies,
         * confirm against the code that fills them in.
         */
        unsigned long                rs_wait;
        unsigned long                rs_request_delay;
        unsigned long                rs_running;
        unsigned long                rs_locked;
        unsigned long                rs_flushing;
        unsigned long                rs_logging;

        /*
         * 32-bit counters.  NOTE(review): presumably the number of handles
         * and the number of blocks (total / actually logged) - confirm.
         */
        __u32                        rs_handle_count;
        __u32                        rs_blocks;
        __u32                        rs_blocks_logged;
};

/*
 * Transaction statistics record: two unsigned long values plus an embedded
 * struct transaction_run_stats_s.  struct journal_s keeps one instance of
 * this type in its j_stats member.
 */
struct transaction_stats_s {
        /* NOTE(review): presumably a transaction ID / count - confirm */
        unsigned long                ts_tid;
        /* NOTE(review): presumably how many transactions were requested - confirm */
        unsigned long                ts_requested;
        /* Run statistics (see struct transaction_run_stats_s) */
        struct transaction_run_stats_s run;
};

/*
 * Return the distance from @start to @end.
 *
 * When @end is not behind @start this is the plain difference.  Otherwise
 * @end is treated as having wrapped around and the distance is measured
 * through MAX_JIFFY_OFFSET.
 */
static inline unsigned long
jbd2_time_diff(unsigned long start, unsigned long end)
{
        unsigned long delta;

        if (end < start)
                delta = end + (MAX_JIFFY_OFFSET - start);
        else
                delta = end - start;

        return delta;
}

/* Number of entries in the j_chkpt_bhs[] array of struct journal_s. */
#define JBD2_NR_BATCH        64

/*
 * Pass identifier; handed to the j_fc_replay_callback hook of
 * struct journal_s as its "pass" argument.
 */
enum passtype {
        PASS_SCAN,
        PASS_REVOKE,
        PASS_REPLAY,
};

/*
 * Non-negative return values of the j_fc_replay_callback hook in
 * struct journal_s: JBD2_FC_REPLAY_CONTINUE means the block was processed
 * correctly and replay should go on, JBD2_FC_REPLAY_STOP indicates the end
 * of replay.  A negative return value indicates an error.
 */
#define JBD2_FC_REPLAY_STOP        0
#define JBD2_FC_REPLAY_CONTINUE        1

/**
 * struct journal_s - The journal_s type is the concrete type associated with
 *     journal_t.
 *
 * Where a member comment ends with a name in square brackets, that is the
 * lock protecting the member.
 */
struct journal_s
{
        /**
         * @j_flags: General journaling state flags [j_state_lock,
         * no lock for quick racy checks]
         */
        unsigned long                j_flags;

        /**
         * @j_errno:
         *
         * Is there an outstanding uncleared error on the journal (from a prior
         * abort)? [j_state_lock]
         */
        int                        j_errno;

        /**
         * @j_abort_mutex: Lock the whole aborting procedure.
         */
        struct mutex                j_abort_mutex;

        /**
         * @j_sb_buffer: The first part of the superblock buffer.
         */
        struct buffer_head        *j_sb_buffer;

        /**
         * @j_superblock: The second part of the superblock buffer.
         */
        journal_superblock_t        *j_superblock;

        /**
         * @j_state_lock: Protect the various scalars in the journal.
         */
        rwlock_t                j_state_lock;

        /**
         * @j_barrier_count:
         *
         * Number of processes waiting to create a barrier lock [j_state_lock,
         * no lock for quick racy checks]
         */
        int                        j_barrier_count;

        /**
         * @j_barrier: The barrier lock itself.
         */
        struct mutex                j_barrier;

        /**
         * @j_running_transaction:
         *
         * Transactions: The current running transaction...
         * [j_state_lock, no lock for quick racy checks] [caller holding
         * open handle]
         */
        transaction_t                *j_running_transaction;

        /**
         * @j_committing_transaction:
         *
         * the transaction we are pushing to disk
         * [j_state_lock] [caller holding open handle]
         */
        transaction_t                *j_committing_transaction;

        /**
         * @j_checkpoint_transactions:
         *
         * ... and a linked circular list of all transactions waiting for
         * checkpointing. [j_list_lock]
         */
        transaction_t                *j_checkpoint_transactions;

        /**
         * @j_wait_transaction_locked:
         *
         * Wait queue for waiting for a locked transaction to start committing,
         * or for a barrier lock to be released.
         */
        wait_queue_head_t        j_wait_transaction_locked;

        /**
         * @j_wait_done_commit: Wait queue for waiting for commit to complete.
         */
        wait_queue_head_t        j_wait_done_commit;

        /**
         * @j_wait_commit: Wait queue to trigger commit.
         */
        wait_queue_head_t        j_wait_commit;

        /**
         * @j_wait_updates: Wait queue to wait for updates to complete.
         */
        wait_queue_head_t        j_wait_updates;

        /**
         * @j_wait_reserved:
         *
         * Wait queue to wait for reserved buffer credits to drop.
         */
        wait_queue_head_t        j_wait_reserved;

        /**
         * @j_fc_wait:
         *
         * Wait queue to wait for completion of async fast commits.
         */
        wait_queue_head_t        j_fc_wait;

        /**
         * @j_checkpoint_mutex:
         *
         * Mutex for locking against concurrent checkpoints.
         */
        struct mutex                j_checkpoint_mutex;

        /**
         * @j_chkpt_bhs:
         *
         * List of buffer heads used by the checkpoint routine
         * (JBD2_NR_BATCH entries).  This was moved from
         * jbd2_log_do_checkpoint() to reduce stack usage.  Access to this
         * array is controlled by the @j_checkpoint_mutex.
         * [j_checkpoint_mutex]
         */
        struct buffer_head        *j_chkpt_bhs[JBD2_NR_BATCH];

        /**
         * @j_shrinker:
         *
         * Journal head shrinker, reclaim buffer's journal head which
         * has been written back.
         */
        struct shrinker                *j_shrinker;

        /**
         * @j_checkpoint_jh_count:
         *
         * Number of journal buffers on the checkpoint list. [j_list_lock]
         */
        struct percpu_counter        j_checkpoint_jh_count;

        /**
         * @j_shrink_transaction:
         *
         * Record next transaction will shrink on the checkpoint list.
         * [j_list_lock]
         */
        transaction_t                *j_shrink_transaction;

        /**
         * @j_head:
         *
         * Journal head: identifies the first unused block in the journal.
         * [j_state_lock]
         */
        unsigned long                j_head;

        /**
         * @j_tail:
         *
         * Journal tail: identifies the oldest still-used block in the journal.
         * [j_state_lock]
         */
        unsigned long                j_tail;

        /**
         * @j_free:
         *
         * Journal free: how many free blocks are there in the journal?
         * [j_state_lock]
         */
        unsigned long                j_free;

        /**
         * @j_first:
         *
         * The block number of the first usable block in the journal
         * [j_state_lock].
         */
        unsigned long                j_first;

        /**
         * @j_last:
         *
         * The block number one beyond the last usable block in the journal
         * [j_state_lock].
         */
        unsigned long                j_last;

        /**
         * @j_fc_first:
         *
         * The block number of the first fast commit block in the journal
         * [j_state_lock].
         */
        unsigned long                j_fc_first;

        /**
         * @j_fc_off:
         *
         * Number of fast commit blocks currently allocated. Accessed only
         * during fast commit. Currently only one process can do fast commit,
         * so this field is not protected by any lock.
         */
        unsigned long                j_fc_off;

        /**
         * @j_fc_last:
         *
         * The block number one beyond the last fast commit block in the journal
         * [j_state_lock].
         */
        unsigned long                j_fc_last;

        /**
         * @j_dev: Device where we store the journal.
         */
        struct block_device        *j_dev;

        /**
         * @j_blocksize: Block size for the location where we store the journal.
         */
        int                        j_blocksize;

        /**
         * @j_blk_offset:
         *
         * Starting block offset into the device where we store the journal.
         */
        unsigned long long        j_blk_offset;

        /**
         * @j_devname: Journal device name.
         */
        char                        j_devname[BDEVNAME_SIZE+24];

        /**
         * @j_fs_dev:
         *
         * Device which holds the client fs.  For internal journal this will be
         * equal to j_dev.
         */
        struct block_device        *j_fs_dev;

        /**
         * @j_fs_dev_wb_err:
         *
         * Records the errseq of the client fs's backing block device.
         */
        errseq_t                j_fs_dev_wb_err;

        /**
         * @j_total_len: Total maximum capacity of the journal region on disk.
         */
        unsigned int                j_total_len;

        /**
         * @j_reserved_credits:
         *
         * Number of buffers reserved from the running transaction.
         */
        atomic_t                j_reserved_credits;

        /**
         * @j_list_lock: Protects the buffer lists and internal buffer state.
         */
        spinlock_t                j_list_lock;

        /**
         * @j_inode:
         *
         * Optional inode where we store the journal.  If present, all
         * journal block numbers are mapped into this inode via bmap().
         */
        struct inode                *j_inode;

        /**
         * @j_tail_sequence:
         *
         * Sequence number of the oldest transaction in the log [j_state_lock]
         */
        tid_t                        j_tail_sequence;

        /**
         * @j_transaction_sequence:
         *
         * Sequence number of the next transaction to grant [j_state_lock]
         */
        tid_t                        j_transaction_sequence;

        /**
         * @j_commit_sequence:
         *
         * Sequence number of the most recently committed transaction
         * [j_state_lock, no lock for quick racy checks]
         */
        tid_t                        j_commit_sequence;

        /**
         * @j_commit_request:
         *
         * Sequence number of the most recent transaction wanting commit
         * [j_state_lock, no lock for quick racy checks]
         */
        tid_t                        j_commit_request;

        /**
         * @j_uuid:
         *
         * Journal uuid: identifies the object (filesystem, LVM volume etc)
         * backed by this journal.  This will eventually be replaced by an array
         * of uuids, allowing us to index multiple devices within a single
         * journal and to perform atomic updates across them.
         */
        __u8                        j_uuid[16];

        /**
         * @j_task: Pointer to the current commit thread for this journal.
         */
        struct task_struct        *j_task;

        /**
         * @j_max_transaction_buffers:
         *
         * Maximum number of metadata buffers to allow in a single compound
         * commit transaction.
         */
        int                        j_max_transaction_buffers;

        /**
         * @j_revoke_records_per_block:
         *
         * Number of revoke records that fit in one descriptor block.
         */
        int                        j_revoke_records_per_block;

        /**
         * @j_commit_interval:
         *
         * What is the maximum transaction lifetime before we begin a commit?
         */
        unsigned long                j_commit_interval;

        /**
         * @j_commit_timer: The timer used to wakeup the commit thread.
         */
        struct timer_list        j_commit_timer;

        /**
         * @j_revoke_lock: Protect the revoke table.
         */
        spinlock_t                j_revoke_lock;

        /**
         * @j_revoke:
         *
         * The revoke table - maintains the list of revoked blocks in the
         * current transaction.
         */
        struct jbd2_revoke_table_s *j_revoke;

        /**
         * @j_revoke_table: Alternate revoke tables for j_revoke.
         */
        struct jbd2_revoke_table_s *j_revoke_table[2];

        /**
         * @j_wbuf: Array of bhs for jbd2_journal_commit_transaction.
         */
        struct buffer_head        **j_wbuf;

        /**
         * @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only
         * during a fast commit. Currently only one process can do fast commit,
         * so this field is not protected by any lock.
         */
        struct buffer_head        **j_fc_wbuf;

        /**
         * @j_wbufsize:
         *
         * Size of @j_wbuf array.
         */
        int                        j_wbufsize;

        /**
         * @j_fc_wbufsize:
         *
         * Size of @j_fc_wbuf array.
         */
        int                        j_fc_wbufsize;

        /**
         * @j_last_sync_writer:
         *
         * The pid of the last person to run a synchronous operation
         * through the journal.
         */
        pid_t                        j_last_sync_writer;

        /**
         * @j_average_commit_time:
         *
         * The average amount of time in nanoseconds it takes to commit a
         * transaction to disk. [j_state_lock]
         */
        u64                        j_average_commit_time;

        /**
         * @j_min_batch_time:
         *
         * Minimum time that we should wait for additional filesystem operations
         * to get batched into a synchronous handle in microseconds.
         */
        u32                        j_min_batch_time;

        /**
         * @j_max_batch_time:
         *
         * Maximum time that we should wait for additional filesystem operations
         * to get batched into a synchronous handle in microseconds.
         */
        u32                        j_max_batch_time;

        /**
         * @j_commit_callback:
         *
         * This function is called when a transaction is closed.
         */
        void                        (*j_commit_callback)(journal_t *,
                                                     transaction_t *);

        /**
         * @j_submit_inode_data_buffers:
         *
         * This function is called for all inodes associated with the
         * committing transaction marked with JI_WRITE_DATA flag
         * before we start to write out the transaction to the journal.
         */
        int                        (*j_submit_inode_data_buffers)
                                        (struct jbd2_inode *);

        /**
         * @j_finish_inode_data_buffers:
         *
         * This function is called for all inodes associated with the
         * committing transaction marked with JI_WAIT_DATA flag
         * after we have written the transaction to the journal
         * but before we write out the commit block.
         */
        int                        (*j_finish_inode_data_buffers)
                                        (struct jbd2_inode *);

        /*
         * Journal statistics
         */

        /**
         * @j_history_lock: Protect the transactions statistics history.
         */
        spinlock_t                j_history_lock;

        /**
         * @j_proc_entry: procfs entry for the jbd statistics directory.
         */
        struct proc_dir_entry        *j_proc_entry;

        /**
         * @j_stats: Overall statistics.
         */
        struct transaction_stats_s j_stats;

        /**
         * @j_failed_commit: Failed journal commit ID.
         */
        unsigned int                j_failed_commit;

        /**
         * @j_private:
         *
         * An opaque pointer to fs-private information.  ext3 puts its
         * superblock pointer here.
         */
        void *j_private;

        /**
         * @j_chksum_driver:
         *
         * Reference to checksum algorithm driver via cryptoapi.
         */
        struct crypto_shash *j_chksum_driver;

        /**
         * @j_csum_seed:
         *
         * Precomputed journal UUID checksum for seeding other checksums.
         */
        __u32 j_csum_seed;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /**
         * @j_trans_commit_map:
         *
         * Lockdep entity to track transaction commit dependencies. Handles
         * hold this "lock" for read, when we wait for commit, we acquire the
         * "lock" for writing. This matches the properties of jbd2 journalling
         * where the running transaction has to wait for all handles to be
         * dropped to commit that transaction and also acquiring a handle may
         * require transaction commit to finish.
         *
         * Only present when CONFIG_DEBUG_LOCK_ALLOC is enabled.
         */
        struct lockdep_map        j_trans_commit_map;
#endif

        /**
         * @j_fc_cleanup_callback:
         *
         * Clean-up after fast commit or full commit. JBD2 calls this function
         * after every commit operation.
         */
        void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid);

        /**
         * @j_fc_replay_callback:
         *
         * File-system specific function that performs replay of a fast
         * commit. JBD2 calls this function for each fast commit block found in
         * the journal. This function should return JBD2_FC_REPLAY_CONTINUE
         * to indicate that the block was processed correctly and more fast
         * commit replay should continue. Return value of JBD2_FC_REPLAY_STOP
         * indicates the end of replay (no more blocks remaining). A negative
         * return value indicates error.
         */
        int (*j_fc_replay_callback)(struct journal_s *journal,
                                    struct buffer_head *bh,
                                    enum passtype pass, int off,
                                    tid_t expected_commit_id);

        /**
         * @j_bmap:
         *
         * Bmap function that should be used instead of the generic
         * VFS bmap function.
         */
        int (*j_bmap)(struct journal_s *journal, sector_t *block);
};

/*
 * Annotate a point where the caller might wait for a transaction commit:
 * acquire and immediately release the j_trans_commit_map lockdep map of
 * journal @j.
 *
 * @j is parenthesized so that any pointer expression (e.g. "p + 1") can be
 * passed; without the parentheses "&j->member" would bind "->" to the last
 * operand of the expression only.
 *
 * NOTE(review): j_trans_commit_map is only declared under
 * CONFIG_DEBUG_LOCK_ALLOC; this macro presumably relies on rwsem_acquire()
 * and rwsem_release() discarding their arguments otherwise - confirm.
 */
#define jbd2_might_wait_for_commit(j) \
        do { \
                rwsem_acquire(&(j)->j_trans_commit_map, 0, 0, _THIS_IP_); \
                rwsem_release(&(j)->j_trans_commit_map, _THIS_IP_); \
        } while (0)

/*
 * Extended superblock features (any of the known, requested ones) can only
 * be supported when the journal superblock is not a version 1 superblock.
 * With a V1 superblock no extended sb feature is supported.
 *
 * Returns false for a V1 superblock block type, true otherwise.
 */
static inline bool jbd2_format_support_feature(journal_t *j)
{
        if (j->j_superblock->s_header.h_blocktype ==
                                        cpu_to_be32(JBD2_SUPERBLOCK_V1))
                return false;

        return true;
}

/*
 * Journal feature predicate functions.
 *
 * JBD2_FEATURE_COMPAT_FUNCS(name, flagname) generates three inline helpers
 * working on the s_feature_compat word of the journal superblock:
 *
 *   jbd2_has_feature_<name>()   - true when JBD2_FEATURE_COMPAT_<flagname>
 *                                 is set; always false when
 *                                 jbd2_format_support_feature() is false
 *   jbd2_set_feature_<name>()   - set the flag
 *   jbd2_clear_feature_<name>() - clear the flag
 */
#define JBD2_FEATURE_COMPAT_FUNCS(name, flagname) \
static inline bool jbd2_has_feature_##name(journal_t *j) \
{ \
        if (!jbd2_format_support_feature(j)) \
                return false; \
        return (j->j_superblock->s_feature_compat & \
                cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname)) != 0; \
} \
static inline void jbd2_set_feature_##name(journal_t *j) \
{ \
        j->j_superblock->s_feature_compat |= \
                cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \
} \
static inline void jbd2_clear_feature_##name(journal_t *j) \
{ \
        j->j_superblock->s_feature_compat &= \
                ~cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \
}

/*
 * JBD2_FEATURE_RO_COMPAT_FUNCS(name, flagname) generates three inline
 * helpers working on the s_feature_ro_compat word of the journal superblock:
 *
 *   jbd2_has_feature_<name>()   - true when JBD2_FEATURE_RO_COMPAT_<flagname>
 *                                 is set; always false when
 *                                 jbd2_format_support_feature() is false
 *   jbd2_set_feature_<name>()   - set the flag
 *   jbd2_clear_feature_<name>() - clear the flag
 */
#define JBD2_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
static inline bool jbd2_has_feature_##name(journal_t *j) \
{ \
        if (!jbd2_format_support_feature(j)) \
                return false; \
        return (j->j_superblock->s_feature_ro_compat & \
                cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname)) != 0; \
} \
static inline void jbd2_set_feature_##name(journal_t *j) \
{ \
        j->j_superblock->s_feature_ro_compat |= \
                cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \
} \
static inline void jbd2_clear_feature_##name(journal_t *j) \
{ \
        j->j_superblock->s_feature_ro_compat &= \
                ~cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \
}

/*
 * JBD2_FEATURE_INCOMPAT_FUNCS(name, flagname) generates three inline helpers
 * working on the s_feature_incompat word of the journal superblock:
 *
 *   jbd2_has_feature_<name>()   - true when JBD2_FEATURE_INCOMPAT_<flagname>
 *                                 is set; always false when
 *                                 jbd2_format_support_feature() is false
 *   jbd2_set_feature_<name>()   - set the flag
 *   jbd2_clear_feature_<name>() - clear the flag
 */
#define JBD2_FEATURE_INCOMPAT_FUNCS(name, flagname) \
static inline bool jbd2_has_feature_##name(journal_t *j) \
{ \
        if (!jbd2_format_support_feature(j)) \
                return false; \
        return (j->j_superblock->s_feature_incompat & \
                cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname)) != 0; \
} \
static inline void jbd2_set_feature_##name(journal_t *j) \
{ \
        j->j_superblock->s_feature_incompat |= \
                cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \
} \
static inline void jbd2_clear_feature_##name(journal_t *j) \
{ \
        j->j_superblock->s_feature_incompat &= \
                ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \
}

/*
 * Instantiate the jbd2_has_feature_*(), jbd2_set_feature_*() and
 * jbd2_clear_feature_*() helpers.  The first argument becomes the function
 * name suffix, the second selects the JBD2_FEATURE_{COMPAT,INCOMPAT}_* flag.
 */

/* Compat feature: tested in s_feature_compat */
JBD2_FEATURE_COMPAT_FUNCS(checksum,                CHECKSUM)

/* Incompat features: tested in s_feature_incompat */
JBD2_FEATURE_INCOMPAT_FUNCS(revoke,                REVOKE)
JBD2_FEATURE_INCOMPAT_FUNCS(64bit,                64BIT)
JBD2_FEATURE_INCOMPAT_FUNCS(async_commit,        ASYNC_COMMIT)
JBD2_FEATURE_INCOMPAT_FUNCS(csum2,                CSUM_V2)
JBD2_FEATURE_INCOMPAT_FUNCS(csum3,                CSUM_V3)
JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,        FAST_COMMIT)

/* Journal high priority write IO operation flags */
#define JBD2_JOURNAL_REQ_FLAGS                (REQ_META | REQ_SYNC | REQ_IDLE)

/*
 * Journal flag definitions
 *
 * NOTE(review): presumably these are the bits stored in the j_flags member
 * of struct journal_s - confirm.
 */
#define JBD2_UNMOUNT        0x001        /* Journal thread is being destroyed */
#define JBD2_ABORT        0x002        /* Journaling has been aborted for errors. */
#define JBD2_ACK_ERR        0x004        /* The errno in the sb has been acked */
#define JBD2_FLUSHED        0x008        /* The journal superblock has been flushed */
#define JBD2_LOADED        0x010        /* The journal superblock has been loaded */
#define JBD2_BARRIER        0x020        /* Use IDE barriers */
#define JBD2_ABORT_ON_SYNCDATA_ERR        0x040        /* Abort the journal on file
                                                 * data write error in ordered
                                                 * mode */
#define JBD2_CYCLE_RECORD                0x080        /* Journal cycled record log on
                                                 * clean and empty filesystem
                                                 * logging area */
#define JBD2_FAST_COMMIT_ONGOING        0x100        /* Fast commit is ongoing */
#define JBD2_FULL_COMMIT_ONGOING        0x200        /* Full commit is ongoing */
/*
 * The JBD2_JOURNAL_FLUSH_* values reuse 0x1 and 0x2, so they form a flag
 * namespace separate from the bits above and must not be mixed with them.
 * NOTE(review): presumably they are meant for the "flags" argument of
 * jbd2_journal_flush() - confirm.  JBD2_JOURNAL_FLUSH_VALID is the mask of
 * all defined flush flags.
 */
#define JBD2_JOURNAL_FLUSH_DISCARD        0x0001
#define JBD2_JOURNAL_FLUSH_ZEROOUT        0x0002
#define JBD2_JOURNAL_FLUSH_VALID        (JBD2_JOURNAL_FLUSH_DISCARD | \
                                        JBD2_JOURNAL_FLUSH_ZEROOUT)

/*
 * Function declarations for the journaling transaction and buffer
 * management
 */

/* Filing buffers */
extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *);
extern bool __jbd2_journal_refile_buffer(struct journal_head *);
extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *);
extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
/*
 * Append buffer head @bh to the tail of the list @head, linking it in
 * through its b_assoc_buffers member.
 */
static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh)
{
        struct list_head *entry = &bh->b_assoc_buffers;

        list_add_tail(entry, head);
}
/*
 * Take buffer head @bh off whatever list its b_assoc_buffers member is
 * linked into, using list_del_init().
 */
static inline void jbd2_unfile_log_bh(struct buffer_head *bh)
{
        struct list_head *entry = &bh->b_assoc_buffers;

        list_del_init(entry);
}

/* Log buffer allocation */
struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int);
void jbd2_descriptor_block_csum_set(journal_t *, struct buffer_head *);
int jbd2_journal_next_log_block(journal_t *, unsigned long long *);
int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
                              unsigned long *block);
int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);

/* Commit management */
extern void jbd2_journal_commit_transaction(journal_t *);

/* Checkpoint list management */
/*
 * Selector passed as the "type" argument of
 * __jbd2_journal_clean_checkpoint_list().
 */
enum jbd2_shrink_type {
        JBD2_SHRINK_DESTROY,
        JBD2_SHRINK_BUSY_STOP,
        JBD2_SHRINK_BUSY_SKIP,
};

void __jbd2_journal_clean_checkpoint_list(journal_t *journal, enum jbd2_shrink_type type);
unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan);
int __jbd2_journal_remove_checkpoint(struct journal_head *);
int jbd2_journal_try_remove_checkpoint(struct journal_head *jh);
void jbd2_journal_destroy_checkpoint(journal_t *journal);
void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);


/*
 * Triggers
 */

/*
 * Set of optional callbacks ("triggers") attached to a buffer.  Each hook
 * receives the trigger type itself and the buffer head it fires for.
 */
struct jbd2_buffer_trigger_type {
        /*
         * Fired at the moment data to write to the journal are known to be
         * stable - so either at the moment b_frozen_data is created or just
         * before a buffer is written to the journal.  mapped_data is a mapped
         * buffer that is the frozen data for commit.
         */
        void (*t_frozen)(struct jbd2_buffer_trigger_type *type,
                         struct buffer_head *bh, void *mapped_data,
                         size_t size);

        /*
         * Fired during journal abort for dirty buffers that will not be
         * committed.
         */
        void (*t_abort)(struct jbd2_buffer_trigger_type *type,
                        struct buffer_head *bh);
};

extern void jbd2_buffer_frozen_trigger(struct journal_head *jh,
                                       void *mapped_data,
                                       struct jbd2_buffer_trigger_type *triggers);
extern void jbd2_buffer_abort_trigger(struct journal_head *jh,
                                      struct jbd2_buffer_trigger_type *triggers);

/* Buffer IO */
extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
                                              struct journal_head *jh_in,
                                              struct buffer_head **bh_out,
                                              sector_t blocknr);

/* Transaction cache support */
extern void jbd2_journal_destroy_transaction_cache(void);
extern int __init jbd2_journal_init_transaction_cache(void);
extern void jbd2_journal_free_transaction(transaction_t *);

/*
 * Journal locking.
 *
 * We need to lock the journal during transaction state changes so that nobody
 * ever tries to take a handle on the running transaction while we are in the
 * middle of moving it to the commit phase.  j_state_lock does this.
 *
 * Note that the locking is completely interrupt unsafe.  We never touch
 * journal structures from interrupts.
 */

/*
 * Return the handle stored in the journal_info field of the current task.
 */
static inline handle_t *journal_current_handle(void)
{
        handle_t *handle = current->journal_info;

        return handle;
}

/* The journaling code user interface:
 *
 * Create and destroy handles
 * Register buffer modifications against the current transaction.
 */

extern handle_t *jbd2_journal_start(journal_t *, int nblocks);
extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks,
                                     int revoke_records, gfp_t gfp_mask,
                                     unsigned int type, unsigned int line_no);
extern int         jbd2_journal_restart(handle_t *, int nblocks);
extern int         jbd2__journal_restart(handle_t *, int nblocks,
                                       int revoke_records, gfp_t gfp_mask);
extern int         jbd2_journal_start_reserved(handle_t *handle,
                                unsigned int type, unsigned int line_no);
extern void         jbd2_journal_free_reserved(handle_t *handle);
extern int         jbd2_journal_extend(handle_t *handle, int nblocks,
                                     int revoke_records);
extern int         jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
extern int         jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
extern int         jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
void                 jbd2_journal_set_triggers(struct buffer_head *,
                                           struct jbd2_buffer_trigger_type *type);
extern int         jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
extern int         jbd2_journal_forget (handle_t *, struct buffer_head *);
int jbd2_journal_invalidate_folio(journal_t *, struct folio *,
                                        size_t offset, size_t length);
bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio);
extern int         jbd2_journal_stop(handle_t *);
extern int         jbd2_journal_flush(journal_t *journal, unsigned int flags);
extern void         jbd2_journal_lock_updates (journal_t *);
extern void         jbd2_journal_unlock_updates (journal_t *);

void jbd2_journal_wait_updates(journal_t *);

extern journal_t * jbd2_journal_init_dev(struct block_device *bdev,
                                struct block_device *fs_dev,
                                unsigned long long start, int len, int bsize);
extern journal_t * jbd2_journal_init_inode (struct inode *);
extern int           jbd2_journal_update_format (journal_t *);
extern int           jbd2_journal_check_used_features
                   (journal_t *, unsigned long, unsigned long, unsigned long);
extern int           jbd2_journal_check_available_features
                   (journal_t *, unsigned long, unsigned long, unsigned long);
extern int           jbd2_journal_set_features
                   (journal_t *, unsigned long, unsigned long, unsigned long);
extern void           jbd2_journal_clear_features
                   (journal_t *, unsigned long, unsigned long, unsigned long);
extern int           jbd2_journal_load       (journal_t *journal);
extern int           jbd2_journal_destroy    (journal_t *);
extern int           jbd2_journal_recover    (journal_t *journal);
extern int           jbd2_journal_wipe       (journal_t *, int);
extern int           jbd2_journal_skip_recovery        (journal_t *);
extern void           jbd2_journal_update_sb_errno(journal_t *);
extern int           jbd2_journal_update_sb_log_tail        (journal_t *, tid_t,
                                unsigned long, blk_opf_t);
extern void           jbd2_journal_abort      (journal_t *, int);
extern int           jbd2_journal_errno      (journal_t *);
extern void           jbd2_journal_ack_err    (journal_t *);
extern int           jbd2_journal_clear_err  (journal_t *);
extern int           jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
extern int           jbd2_journal_force_commit(journal_t *);
extern int           jbd2_journal_force_commit_nested(journal_t *);
extern int           jbd2_journal_inode_ranged_write(handle_t *handle,
                        struct jbd2_inode *inode, loff_t start_byte,
                        loff_t length);
extern int           jbd2_journal_inode_ranged_wait(handle_t *handle,
                        struct jbd2_inode *inode, loff_t start_byte,
                        loff_t length);
extern int           jbd2_journal_finish_inode_data_buffers(
                        struct jbd2_inode *jinode);
extern int           jbd2_journal_begin_ordered_truncate(journal_t *journal,
                                struct jbd2_inode *inode, loff_t new_size);
extern void           jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
extern void           jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);

/*
 * journal_head management
 */
struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh);
struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh);
void jbd2_journal_put_journal_head(struct journal_head *jh);

/*
 * handle management
 */
extern struct kmem_cache *jbd2_handle_cache;

/*
 * Allocate a handle_t from jbd2_handle_cache with the given GFP flags.
 * The allocation goes through kmem_cache_zalloc(), i.e. the zeroing
 * variant; the result is cast to handle_t *.
 */
#define jbd2_alloc_handle(_gfp_flags)        \
                ((handle_t *)kmem_cache_zalloc(jbd2_handle_cache, (_gfp_flags)))

/* Give @handle back to jbd2_handle_cache. */
static inline void jbd2_free_handle(handle_t *handle)
{
        struct kmem_cache *cache = jbd2_handle_cache;

        kmem_cache_free(cache, handle);
}

/*
 * jbd2_inode management (optional, for those file systems that want to use
 * dynamically allocated jbd2_inode structures)
 */
extern struct kmem_cache *jbd2_inode_cache;

/*
 * Allocate a struct jbd2_inode from jbd2_inode_cache with the given GFP
 * flags.  This uses kmem_cache_alloc(), not the zeroing variant, and casts
 * the result to struct jbd2_inode *.
 */
#define jbd2_alloc_inode(_gfp_flags)        \
                ((struct jbd2_inode *)kmem_cache_alloc(jbd2_inode_cache, (_gfp_flags)))

/* Give @jinode back to jbd2_inode_cache. */
static inline void jbd2_free_inode(struct jbd2_inode *jinode)
{
        struct kmem_cache *cache = jbd2_inode_cache;

        kmem_cache_free(cache, jinode);
}

/* Primary revoke support */
/* NOTE(review): presumably the default hash size for jbd2_journal_init_revoke() -- confirm at the call sites. */
#define JOURNAL_REVOKE_DEFAULT_HASH 256
extern int           jbd2_journal_init_revoke(journal_t *, int);
extern void           jbd2_journal_destroy_revoke_record_cache(void);
extern void           jbd2_journal_destroy_revoke_table_cache(void);
/* The two cache initialisers are marked __init. */
extern int __init jbd2_journal_init_revoke_record_cache(void);
extern int __init jbd2_journal_init_revoke_table_cache(void);

extern void           jbd2_journal_destroy_revoke(journal_t *);
extern int           jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *);
extern int           jbd2_journal_cancel_revoke(handle_t *, struct journal_head *);
extern void           jbd2_journal_write_revoke_records(transaction_t *transaction,
                                                     struct list_head *log_bufs);

/* Recovery revoke support */
/* set/test take a journal, an unsigned long long value and a tid_t. */
extern int        jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t);
extern int        jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t);
extern void        jbd2_journal_clear_revoke(journal_t *);
extern void        jbd2_journal_switch_revoke_table(journal_t *journal);
extern void        jbd2_clear_buffer_revoked_flags(journal_t *journal);

/*
 * The log thread user interface:
 *
 * Request space in the current transaction, and force transaction commit
 * transitions on demand.
 *
 * Most entry points identify a transaction by its tid_t; note that
 * jbd2_journal_start_commit() takes a pointer to a tid_t instead.
 */

int jbd2_log_start_commit(journal_t *journal, tid_t tid);
int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
int jbd2_transaction_committed(journal_t *journal, tid_t tid);
int jbd2_complete_transaction(journal_t *journal, tid_t tid);
int jbd2_log_do_checkpoint(journal_t *journal);
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);

/*
 * NOTE(review): locking requirements of the helpers below are not visible
 * from these prototypes -- check the definitions before calling.
 */
void __jbd2_log_wait_for_space(journal_t *journal);
extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
extern int jbd2_cleanup_journal_tail(journal_t *);

/*
 * Fast commit related APIs
 *
 * Prototypes only.  jbd2_fc_get_buf() hands a buffer back through @bh_out;
 * every function here reports its status as an int.
 */
int jbd2_fc_begin_commit(journal_t *journal, tid_t tid);
int jbd2_fc_end_commit(journal_t *journal);
int jbd2_fc_end_commit_fallback(journal_t *journal);
int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out);
int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode);
int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode);
int jbd2_fc_wait_bufs(journal_t *journal, int num_blks);
int jbd2_fc_release_bufs(journal_t *journal);

/*
 * Returns one quarter of (j_total_len - j_fc_wbufsize) for @journal.
 * NOTE(review): presumably the per-transaction buffer limit, going by the
 * function name -- confirm against the callers.
 */
static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal)
{
        return (journal->j_total_len - journal->j_fc_wbufsize) / 4;
}

/*
 * is_journal_aborted
 *
 * Simple test wrapper function to test the JBD2_ABORT state flag.  This
 * bit, when set, indicates that we have had a fatal error somewhere,
 * either inside the journaling layer or indicated to us by the client
 * (eg. ext3), and that we should not commit any further
 * transactions.
 *
 * Returns the masked flag bit itself (non-zero when set), not a
 * normalised 0/1 value.
 */

static inline int is_journal_aborted(journal_t *journal)
{
        return journal->j_flags & JBD2_ABORT;
}

/*
 * A handle counts as aborted when it is flagged h_aborted, when it has no
 * transaction attached, or when the journal of its transaction is aborted.
 */
static inline int is_handle_aborted(handle_t *handle)
{
        if (handle->h_aborted)
                return 1;
        if (!handle->h_transaction)
                return 1;

        return is_journal_aborted(handle->h_transaction->t_journal);
}

/*
 * Mark @handle as aborted by setting its h_aborted flag; is_handle_aborted()
 * reports 1 for the handle from then on.  Nothing else is touched here.
 */
static inline void jbd2_journal_abort_handle(handle_t *handle)
{
        handle->h_aborted = 1;
}

/*
 * Sample the wb_err of the client fs's bdev mapping into
 * journal->j_fs_dev_wb_err.
 */
static inline void jbd2_init_fs_dev_write_error(journal_t *journal)
{
        /*
         * The saved value is the baseline that can later be used to detect
         * asynchronous metadata write errors of the client fs.
         */
        errseq_check_and_advance(&journal->j_fs_dev->bd_mapping->wb_err,
                                 &journal->j_fs_dev_wb_err);
}

/*
 * Compare the current wb_err of the client fs's bdev mapping against the
 * value saved in journal->j_fs_dev_wb_err and return errseq_check()'s result.
 */
static inline int jbd2_check_fs_dev_write_error(journal_t *journal)
{
        return errseq_check(&journal->j_fs_dev->bd_mapping->wb_err,
                            READ_ONCE(journal->j_fs_dev_wb_err));
}

#endif /* __KERNEL__   */

/* Comparison functions for transaction IDs: perform comparisons using
 * modulo arithmetic so that they work over sequence number wraps. */

/* Non-zero when @x is after @y, judged by the sign of the wrapped difference. */
static inline int tid_gt(tid_t x, tid_t y)
{
        const int delta = x - y;

        return delta > 0;
}

/* Non-zero when @x is at or after @y, judged by the sign of the wrapped difference. */
static inline int tid_geq(tid_t x, tid_t y)
{
        const int delta = x - y;

        return delta >= 0;
}

/* Prototypes only; both are defined outside this span. */
extern int jbd2_journal_blocks_per_page(struct inode *inode);
/* NOTE(review): presumably the size in bytes of one journal block tag -- confirm at the definition. */
extern size_t journal_tag_bytes(journal_t *journal);

/* True when journal @j has either the csum2 or the csum3 feature. */
static inline bool jbd2_journal_has_csum_v2or3_feature(journal_t *j)
{
        if (jbd2_has_feature_csum2(j))
                return true;

        return jbd2_has_feature_csum3(j);
}

/*
 * Returns 1 when @journal has a checksum driver attached, 0 otherwise.
 * Warns once if a csum v2/v3 feature is present but no driver is attached.
 */
static inline int jbd2_journal_has_csum_v2or3(journal_t *journal)
{
        const bool has_driver = journal->j_chksum_driver != NULL;

        WARN_ON_ONCE(jbd2_journal_has_csum_v2or3_feature(journal) &&
                     !has_driver);

        return has_driver;
}

/*
 * Number of fast commit blocks recorded in superblock @jsb (stored
 * big-endian); a stored value of zero selects
 * JBD2_DEFAULT_FAST_COMMIT_BLOCKS.
 */
static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb)
{
        const int recorded = be32_to_cpu(jsb->s_num_fc_blks);

        if (recorded)
                return recorded;

        return JBD2_DEFAULT_FAST_COMMIT_BLOCKS;
}

/*
 * Return number of free blocks in the log. Must be called under j_state_lock.
 *
 * The result is j_free less a fixed margin of 32 and less the outstanding
 * credits of any committing transaction, clamped so it never goes below 0.
 */
static inline unsigned long jbd2_log_space_left(journal_t *journal)
{
        /* Allow for rounding errors */
        long avail = journal->j_free - 32;

        if (journal->j_committing_transaction)
                avail -= atomic_read(&journal->
                        j_committing_transaction->t_outstanding_credits);

        if (avail < 0)
                return 0;

        return avail;
}

/*
 * Definitions which augment the buffer_head layer
 */

/* journaling buffer types */
#define BJ_None                0        /* Not journaled */
#define BJ_Metadata        1        /* Normal journaled metadata */
#define BJ_Forget        2        /* Buffer superseded by this transaction */
#define BJ_Shadow        3        /* Buffer contents being shadowed to the log */
#define BJ_Reserved        4        /* Buffer is reserved for access by journal */
/* One past the highest BJ_* value above, i.e. the count of buffer types. */
#define BJ_Types        5

/* JBD uses a CRC32 checksum */
/* Size of the context array that jbd2_chksum() reserves on its stack. */
#define JBD_MAX_CHECKSUM_SIZE 4

/*
 * Run the journal's checksum driver over @length bytes at @address.
 *
 * The on-stack descriptor's context is seeded with @crc, the data is fed
 * through crypto_shash_update(), and the context word is returned.
 * BUGs if the driver's descriptor size exceeds JBD_MAX_CHECKSUM_SIZE or if
 * the update reports an error.
 */
static inline u32 jbd2_chksum(journal_t *journal, u32 crc,
                              const void *address, unsigned int length)
{
        struct {
                struct shash_desc shash;
                char ctx[JBD_MAX_CHECKSUM_SIZE];
        } desc;
        u32 *state = (u32 *)desc.ctx;
        int rc;

        /* The context array above must be able to hold the driver's state. */
        BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) >
                JBD_MAX_CHECKSUM_SIZE);

        desc.shash.tfm = journal->j_chksum_driver;
        *state = crc;

        rc = crypto_shash_update(&desc.shash, address, length);
        BUG_ON(rc);

        return *state;
}

/*
 * Return most recent uncommitted transaction: the tid of the running
 * transaction when there is one, otherwise j_commit_request.  The values
 * are sampled under the read side of j_state_lock.
 */
static inline tid_t  jbd2_get_latest_transaction(journal_t *journal)
{
        tid_t latest;

        read_lock(&journal->j_state_lock);
        if (journal->j_running_transaction)
                latest = journal->j_running_transaction->t_tid;
        else
                latest = journal->j_commit_request;
        read_unlock(&journal->j_state_lock);

        return latest;
}

/*
 * Credits of @handle left over once the blocks needed for its requested
 * revoke credits are deducted: h_total_credits minus
 * h_revoke_credits_requested divided (rounding up) by the journal's
 * j_revoke_records_per_block.
 *
 * The journal is taken from h_journal for a reserved handle and from the
 * handle's transaction otherwise.
 */
static inline int jbd2_handle_buffer_credits(handle_t *handle)
{
        journal_t *journal = handle->h_reserved ?
                        handle->h_journal :
                        handle->h_transaction->t_journal;

        return handle->h_total_credits -
                DIV_ROUND_UP(handle->h_revoke_credits_requested,
                             journal->j_revoke_records_per_block);
}

#ifdef __KERNEL__

/*
 * Buffer tracing hooks.  Every macro here expands to an empty statement, so
 * call sites compile to nothing and their arguments are not evaluated.
 */
#define buffer_trace_init(bh)        do {} while (0)
#define print_buffer_fields(bh)        do {} while (0)
#define print_buffer_trace(bh)        do {} while (0)
#define BUFFER_TRACE(bh, info)        do {} while (0)
#define BUFFER_TRACE2(bh, bh2, info)        do {} while (0)
#define JBUFFER_TRACE(jh, info)        do {} while (0)

#endif        /* __KERNEL__ */

/* Filesystem-flavoured aliases for existing errno values. */
#define EFSBADCRC        EBADMSG                /* Bad CRC detected */
#define EFSCORRUPTED        EUCLEAN                /* Filesystem is corrupted */

#endif        /* _LINUX_JBD2_H */


















    2 


    3 


























































































































































    1 
























































































































































    1 




    1 














    2 









































































































    1 














    1 














    1 














    8 



























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/tomoyo.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include <linux/lsm_hooks.h>
#include <uapi/linux/lsm.h>
#include "common.h"

/**
 * tomoyo_domain - Get "struct tomoyo_domain_info" for current thread.
 *
 * Returns pointer to "struct tomoyo_domain_info" for current thread.
 */
struct tomoyo_domain_info *tomoyo_domain(void)
{
        struct tomoyo_task *s = tomoyo_task(current);

        if (s->old_domain_info && !current->in_execve) {
                atomic_dec(&s->old_domain_info->users);
                s->old_domain_info = NULL;
        }
        return s->domain_info;
}

/**
 * tomoyo_cred_prepare - Target for security_prepare_creds().
 *
 * @new: Pointer to "struct cred". Not examined.
 * @old: Pointer to "struct cred". Not examined.
 * @gfp: Memory allocation flags. Not examined.
 *
 * Only the TOMOYO state of the current task is touched: when a domain saved
 * by a previous execve() request is still recorded and the task is not
 * inside execve(), that saved domain is made the current one again.
 *
 * Returns 0.
 */
static int tomoyo_cred_prepare(struct cred *new, const struct cred *old,
                               gfp_t gfp)
{
        struct tomoyo_task *task_blob = tomoyo_task(current);

        if (!task_blob->old_domain_info || current->in_execve)
                return 0;

        /* Restore old_domain_info saved by previous execve() request. */
        atomic_dec(&task_blob->domain_info->users);
        task_blob->domain_info = task_blob->old_domain_info;
        task_blob->old_domain_info = NULL;
        return 0;
}

/**
 * tomoyo_bprm_committed_creds - Target for security_bprm_committed_creds().
 *
 * @bprm: Pointer to "struct linux_binprm".
 */
static void tomoyo_bprm_committed_creds(const struct linux_binprm *bprm)
{
        /* Clear old_domain_info saved by execve() request. */
        struct tomoyo_task *s = tomoyo_task(current);

        atomic_dec(&s->old_domain_info->users);
        s->old_domain_info = NULL;
}

#ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER
/**
 * tomoyo_bprm_creds_for_exec - Target for security_bprm_creds_for_exec().
 *
 * @bprm: Pointer to "struct linux_binprm".
 *
 * While no policy has been loaded yet, hands the requested filename to
 * tomoyo_load_policy().
 *
 * Returns 0.
 */
static int tomoyo_bprm_creds_for_exec(struct linux_binprm *bprm)
{
        if (tomoyo_policy_loaded)
                return 0;

        /*
         * Load policy if /sbin/tomoyo-init exists and /sbin/init is requested
         * for the first time.
         */
        tomoyo_load_policy(bprm->filename);
        return 0;
}
#endif

/**
 * tomoyo_bprm_check_security - Target for security_bprm_check().
 *
 * @bprm: Pointer to "struct linux_binprm".
 *
 * Whether a saved domain is already recorded decides which of the two
 * checks below is performed.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_bprm_check_security(struct linux_binprm *bprm)
{
        struct tomoyo_task *task_blob = tomoyo_task(current);
        int idx;
        int err;

        /*
         * Read permission is checked against interpreters using next domain.
         */
        if (task_blob->old_domain_info)
                return tomoyo_check_open_permission(task_blob->domain_info,
                                                    &bprm->file->f_path,
                                                    O_RDONLY);

        /*
         * Execute permission is checked against pathname passed to execve()
         * using current domain.
         */
        idx = tomoyo_read_lock();
        err = tomoyo_find_next_domain(bprm);
        tomoyo_read_unlock(idx);
        return err;
}

/**
 * tomoyo_inode_getattr - Target for security_inode_getattr().
 *
 * @path: Pointer to "struct path".
 *
 * Delegates to tomoyo_path_perm() with TOMOYO_TYPE_GETATTR and no extra
 * argument.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_inode_getattr(const struct path *path)
{
        return tomoyo_path_perm(TOMOYO_TYPE_GETATTR, path, NULL);
}

/**
 * tomoyo_path_truncate - Target for security_path_truncate().
 *
 * @path: Pointer to "struct path".
 *
 * Delegates to tomoyo_path_perm() with TOMOYO_TYPE_TRUNCATE. Also used by
 * tomoyo_file_truncate().
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_truncate(const struct path *path)
{
        return tomoyo_path_perm(TOMOYO_TYPE_TRUNCATE, path, NULL);
}

/**
 * tomoyo_file_truncate - Target for security_file_truncate().
 *
 * @file: Pointer to "struct file".
 *
 * Applies the same check as tomoyo_path_truncate() to the path of @file.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_file_truncate(struct file *file)
{
        return tomoyo_path_truncate(&file->f_path);
}

/**
 * tomoyo_path_unlink - Target for security_path_unlink().
 *
 * @parent: Pointer to "struct path".
 * @dentry: Pointer to "struct dentry".
 *
 * Checks TOMOYO_TYPE_UNLINK on the path made of @parent's mount and @dentry.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_unlink(const struct path *parent, struct dentry *dentry)
{
        /* The target itself: @dentry on the mount of its parent directory. */
        struct path path = { .mnt = parent->mnt, .dentry = dentry };

        return tomoyo_path_perm(TOMOYO_TYPE_UNLINK, &path, NULL);
}

/**
 * tomoyo_path_mkdir - Target for security_path_mkdir().
 *
 * @parent: Pointer to "struct path".
 * @dentry: Pointer to "struct dentry".
 * @mode:   DAC permission mode.
 *
 * Checks TOMOYO_TYPE_MKDIR on the path made of @parent's mount and @dentry,
 * passing only the S_IALLUGO bits of @mode as the number argument.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_mkdir(const struct path *parent, struct dentry *dentry,
                             umode_t mode)
{
        /* The target itself: @dentry on the mount of its parent directory. */
        struct path path = { .mnt = parent->mnt, .dentry = dentry };

        return tomoyo_path_number_perm(TOMOYO_TYPE_MKDIR, &path,
                                       mode & S_IALLUGO);
}

/**
 * tomoyo_path_rmdir - Target for security_path_rmdir().
 *
 * @parent: Pointer to "struct path".
 * @dentry: Pointer to "struct dentry".
 *
 * Checks TOMOYO_TYPE_RMDIR on the path made of @parent's mount and @dentry.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_rmdir(const struct path *parent, struct dentry *dentry)
{
        /* The target itself: @dentry on the mount of its parent directory. */
        struct path path = { .mnt = parent->mnt, .dentry = dentry };

        return tomoyo_path_perm(TOMOYO_TYPE_RMDIR, &path, NULL);
}

/**
 * tomoyo_path_symlink - Target for security_path_symlink().
 *
 * @parent:   Pointer to "struct path".
 * @dentry:   Pointer to "struct dentry".
 * @old_name: Symlink's content.
 *
 * Checks TOMOYO_TYPE_SYMLINK on the path made of @parent's mount and
 * @dentry; @old_name is forwarded as the extra argument of
 * tomoyo_path_perm().
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_symlink(const struct path *parent, struct dentry *dentry,
                               const char *old_name)
{
        /* The link being created: @dentry on the mount of its parent. */
        struct path path = { .mnt = parent->mnt, .dentry = dentry };

        return tomoyo_path_perm(TOMOYO_TYPE_SYMLINK, &path, old_name);
}

/**
 * tomoyo_path_mknod - Target for security_path_mknod().
 *
 * @parent: Pointer to "struct path".
 * @dentry: Pointer to "struct dentry".
 * @mode:   DAC permission mode.
 * @dev:    Device attributes.
 *
 * The file type bits of @mode select the operation: character and block
 * devices are checked together with @dev, FIFOs and sockets have their own
 * operations, and every other type is treated as a plain create.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_mknod(const struct path *parent, struct dentry *dentry,
                             umode_t mode, unsigned int dev)
{
        struct path path = { .mnt = parent->mnt, .dentry = dentry };
        const unsigned int perm = mode & S_IALLUGO;

        switch (mode & S_IFMT) {
        case S_IFCHR:
                return tomoyo_mkdev_perm(TOMOYO_TYPE_MKCHAR, &path, perm, dev);
        case S_IFBLK:
                return tomoyo_mkdev_perm(TOMOYO_TYPE_MKBLOCK, &path, perm, dev);
        case S_IFIFO:
                return tomoyo_path_number_perm(TOMOYO_TYPE_MKFIFO, &path, perm);
        case S_IFSOCK:
                return tomoyo_path_number_perm(TOMOYO_TYPE_MKSOCK, &path, perm);
        default:
                return tomoyo_path_number_perm(TOMOYO_TYPE_CREATE, &path, perm);
        }
}

/**
 * tomoyo_path_link - Target for security_path_link().
 *
 * @old_dentry: Pointer to "struct dentry".
 * @new_dir:    Pointer to "struct path".
 * @new_dentry: Pointer to "struct dentry".
 *
 * Checks TOMOYO_TYPE_LINK with the existing entry as the first path and the
 * new entry as the second.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_link(struct dentry *old_dentry, const struct path *new_dir,
                            struct dentry *new_dentry)
{
        /* Both paths use @new_dir's mount: the hook supplies no other one. */
        struct path path1 = { .mnt = new_dir->mnt, .dentry = old_dentry };
        struct path path2 = { .mnt = new_dir->mnt, .dentry = new_dentry };

        return tomoyo_path2_perm(TOMOYO_TYPE_LINK, &path1, &path2);
}

/**
 * tomoyo_path_rename - Target for security_path_rename().
 *
 * @old_parent: Pointer to "struct path".
 * @old_dentry: Pointer to "struct dentry".
 * @new_parent: Pointer to "struct path".
 * @new_dentry: Pointer to "struct dentry".
 * @flags: Rename options.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_rename(const struct path *old_parent,
                              struct dentry *old_dentry,
                              const struct path *new_parent,
                              struct dentry *new_dentry,
                              const unsigned int flags)
{
        struct path path1 = { .mnt = old_parent->mnt, .dentry = old_dentry };
        struct path path2 = { .mnt = new_parent->mnt, .dentry = new_dentry };

        if (flags & RENAME_EXCHANGE) {
                const int err = tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path2,
                                &path1);

                if (err)
                        return err;
        }
        return tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path1, &path2);
}

/**
 * tomoyo_file_fcntl - Target for security_file_fcntl().
 *
 * @file: Pointer to "struct file".
 * @cmd:  Command for fcntl().
 * @arg:  Argument for @cmd.
 *
 * Only an F_SETFL request that changes the O_APPEND bit of @file is
 * checked; everything else is allowed without consulting the policy.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_file_fcntl(struct file *file, unsigned int cmd,
                             unsigned long arg)
{
        if (cmd != F_SETFL)
                return 0;
        /* O_APPEND is the same in @arg and in the current flags. */
        if (!((arg ^ file->f_flags) & O_APPEND))
                return 0;

        return tomoyo_check_open_permission(tomoyo_domain(), &file->f_path,
                                            O_WRONLY | (arg & O_APPEND));
}

/**
 * tomoyo_file_open - Target for security_file_open().
 *
 * @f: Pointer to "struct file".
 *
 * Files opened with __FMODE_EXEC set are let through here; every other
 * open is checked against the current domain using the file's flags.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_file_open(struct file *f)
{
        /* Illogically, FMODE_EXEC is in f_flags, not f_mode. */
        if (!(f->f_flags & __FMODE_EXEC))
                return tomoyo_check_open_permission(tomoyo_domain(),
                                                    &f->f_path, f->f_flags);

        /* Don't check read permission here if called from execve(). */
        return 0;
}

/**
 * tomoyo_file_ioctl - Target for security_file_ioctl().
 *
 * @file: Pointer to "struct file".
 * @cmd:  Command for ioctl().
 * @arg:  Argument for @cmd. Not examined here.
 *
 * Checks TOMOYO_TYPE_IOCTL on the path of @file with @cmd as the number
 * argument. Registered for both the file_ioctl and file_ioctl_compat hooks.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_file_ioctl(struct file *file, unsigned int cmd,
                             unsigned long arg)
{
        return tomoyo_path_number_perm(TOMOYO_TYPE_IOCTL, &file->f_path, cmd);
}

/**
 * tomoyo_path_chmod - Target for security_path_chmod().
 *
 * @path: Pointer to "struct path".
 * @mode: DAC permission mode.
 *
 * Checks TOMOYO_TYPE_CHMOD on @path, passing only the S_IALLUGO bits of
 * @mode as the number argument.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_chmod(const struct path *path, umode_t mode)
{
        return tomoyo_path_number_perm(TOMOYO_TYPE_CHMOD, path,
                                       mode & S_IALLUGO);
}

/**
 * tomoyo_path_chown - Target for security_path_chown().
 *
 * @path: Pointer to "struct path".
 * @uid:  Owner ID.
 * @gid:  Group ID.
 *
 * A valid @uid is checked as TOMOYO_TYPE_CHOWN and a valid @gid as
 * TOMOYO_TYPE_CHGRP, each translated through init_user_ns. The group check
 * is skipped when the owner check fails.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
{
        if (uid_valid(uid)) {
                const int error = tomoyo_path_number_perm(TOMOYO_TYPE_CHOWN,
                                                path,
                                                from_kuid(&init_user_ns, uid));

                if (error)
                        return error;
        }
        if (gid_valid(gid))
                return tomoyo_path_number_perm(TOMOYO_TYPE_CHGRP, path,
                                               from_kgid(&init_user_ns, gid));
        return 0;
}

/**
 * tomoyo_path_chroot - Target for security_path_chroot().
 *
 * @path: Pointer to "struct path".
 *
 * Delegates to tomoyo_path_perm() with TOMOYO_TYPE_CHROOT and no extra
 * argument.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_chroot(const struct path *path)
{
        return tomoyo_path_perm(TOMOYO_TYPE_CHROOT, path, NULL);
}

/**
 * tomoyo_sb_mount - Target for security_sb_mount().
 *
 * @dev_name: Name of device file. Maybe NULL.
 * @path:     Pointer to "struct path".
 * @type:     Name of filesystem type. Maybe NULL.
 * @flags:    Mount options.
 * @data:     Optional data. Maybe NULL.
 *
 * All arguments are forwarded unchanged to tomoyo_mount_permission().
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_sb_mount(const char *dev_name, const struct path *path,
                           const char *type, unsigned long flags, void *data)
{
        return tomoyo_mount_permission(dev_name, path, type, flags, data);
}

/**
 * tomoyo_sb_umount - Target for security_sb_umount().
 *
 * @mnt:   Pointer to "struct vfsmount".
 * @flags: Unmount options. Not examined here.
 *
 * Checks TOMOYO_TYPE_UMOUNT on the root of @mnt.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_sb_umount(struct vfsmount *mnt, int flags)
{
        /* The checked path is the mount's own root dentry. */
        struct path path = { .mnt = mnt, .dentry = mnt->mnt_root };

        return tomoyo_path_perm(TOMOYO_TYPE_UMOUNT, &path, NULL);
}

/**
 * tomoyo_sb_pivotroot - Target for security_sb_pivotroot().
 *
 * @old_path: Pointer to "struct path".
 * @new_path: Pointer to "struct path".
 *
 * Checks TOMOYO_TYPE_PIVOT_ROOT. Note the argument order: @new_path is
 * passed to tomoyo_path2_perm() first and @old_path second.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_sb_pivotroot(const struct path *old_path, const struct path *new_path)
{
        return tomoyo_path2_perm(TOMOYO_TYPE_PIVOT_ROOT, new_path, old_path);
}

/**
 * tomoyo_socket_listen - Check permission for listen().
 *
 * @sock:    Pointer to "struct socket".
 * @backlog: Backlog parameter. Not examined here.
 *
 * Registered as the socket_listen hook; delegates to
 * tomoyo_socket_listen_permission().
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_socket_listen(struct socket *sock, int backlog)
{
        return tomoyo_socket_listen_permission(sock);
}

/**
 * tomoyo_socket_connect - Check permission for connect().
 *
 * @sock:     Pointer to "struct socket".
 * @addr:     Pointer to "struct sockaddr".
 * @addr_len: Size of @addr.
 *
 * Registered as the socket_connect hook; all arguments are forwarded to
 * tomoyo_socket_connect_permission().
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_socket_connect(struct socket *sock, struct sockaddr *addr,
                                 int addr_len)
{
        return tomoyo_socket_connect_permission(sock, addr, addr_len);
}

/**
 * tomoyo_socket_bind - Check permission for bind().
 *
 * @sock:     Pointer to "struct socket".
 * @addr:     Pointer to "struct sockaddr".
 * @addr_len: Size of @addr.
 *
 * Registered as the socket_bind hook; all arguments are forwarded to
 * tomoyo_socket_bind_permission().
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_socket_bind(struct socket *sock, struct sockaddr *addr,
                              int addr_len)
{
        return tomoyo_socket_bind_permission(sock, addr, addr_len);
}

/**
 * tomoyo_socket_sendmsg - Check permission for sendmsg().
 *
 * @sock: Pointer to "struct socket".
 * @msg:  Pointer to "struct msghdr".
 * @size: Size of message.
 *
 * Registered as the socket_sendmsg hook; all arguments are forwarded to
 * tomoyo_socket_sendmsg_permission().
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_socket_sendmsg(struct socket *sock, struct msghdr *msg,
                                 int size)
{
        return tomoyo_socket_sendmsg_permission(sock, msg, size);
}

/*
 * Per-object blob sizes requested by TOMOYO: only a per-task blob holding a
 * "struct tomoyo_task". Referenced from DEFINE_LSM(tomoyo) via .blobs.
 */
struct lsm_blob_sizes tomoyo_blob_sizes __ro_after_init = {
        .lbs_task = sizeof(struct tomoyo_task),
};

/**
 * tomoyo_task_alloc - Target for security_task_alloc().
 *
 * @task:        Pointer to "struct task_struct".
 * @clone_flags: clone() flags.
 *
 * Returns 0.
 */
static int tomoyo_task_alloc(struct task_struct *task,
                             unsigned long clone_flags)
{
        struct tomoyo_task *old = tomoyo_task(current);
        struct tomoyo_task *new = tomoyo_task(task);

        new->domain_info = old->domain_info;
        atomic_inc(&new->domain_info->users);
        new->old_domain_info = NULL;
        return 0;
}

/**
 * tomoyo_task_free - Target for security_task_free().
 *
 * @task: Pointer to "struct task_struct".
 */
static void tomoyo_task_free(struct task_struct *task)
{
        struct tomoyo_task *s = tomoyo_task(task);

        if (s->domain_info) {
                atomic_dec(&s->domain_info->users);
                s->domain_info = NULL;
        }
        if (s->old_domain_info) {
                atomic_dec(&s->old_domain_info->users);
                s->old_domain_info = NULL;
        }
}

/* LSM identity (name and LSM_ID_TOMOYO) passed to security_add_hooks(). */
static const struct lsm_id tomoyo_lsmid = {
        .name = "tomoyo",
        .id = LSM_ID_TOMOYO,
};

/*
 * tomoyo_hooks lists every LSM hook that TOMOYO implements, as an array of
 * "struct security_hook_list". tomoyo_init() hands it to
 * security_add_hooks() to register TOMOYO.
 */
static struct security_hook_list tomoyo_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(cred_prepare, tomoyo_cred_prepare),
        LSM_HOOK_INIT(bprm_committed_creds, tomoyo_bprm_committed_creds),
        LSM_HOOK_INIT(task_alloc, tomoyo_task_alloc),
        LSM_HOOK_INIT(task_free, tomoyo_task_free),
#ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER
        LSM_HOOK_INIT(bprm_creds_for_exec, tomoyo_bprm_creds_for_exec),
#endif
        LSM_HOOK_INIT(bprm_check_security, tomoyo_bprm_check_security),
        LSM_HOOK_INIT(file_fcntl, tomoyo_file_fcntl),
        LSM_HOOK_INIT(file_open, tomoyo_file_open),
        LSM_HOOK_INIT(file_truncate, tomoyo_file_truncate),
        LSM_HOOK_INIT(path_truncate, tomoyo_path_truncate),
        LSM_HOOK_INIT(path_unlink, tomoyo_path_unlink),
        LSM_HOOK_INIT(path_mkdir, tomoyo_path_mkdir),
        LSM_HOOK_INIT(path_rmdir, tomoyo_path_rmdir),
        LSM_HOOK_INIT(path_symlink, tomoyo_path_symlink),
        LSM_HOOK_INIT(path_mknod, tomoyo_path_mknod),
        LSM_HOOK_INIT(path_link, tomoyo_path_link),
        LSM_HOOK_INIT(path_rename, tomoyo_path_rename),
        LSM_HOOK_INIT(inode_getattr, tomoyo_inode_getattr),
        /* The native and compat ioctl hooks share one handler. */
        LSM_HOOK_INIT(file_ioctl, tomoyo_file_ioctl),
        LSM_HOOK_INIT(file_ioctl_compat, tomoyo_file_ioctl),
        LSM_HOOK_INIT(path_chmod, tomoyo_path_chmod),
        LSM_HOOK_INIT(path_chown, tomoyo_path_chown),
        LSM_HOOK_INIT(path_chroot, tomoyo_path_chroot),
        LSM_HOOK_INIT(sb_mount, tomoyo_sb_mount),
        LSM_HOOK_INIT(sb_umount, tomoyo_sb_umount),
        LSM_HOOK_INIT(sb_pivotroot, tomoyo_sb_pivotroot),
        LSM_HOOK_INIT(socket_bind, tomoyo_socket_bind),
        LSM_HOOK_INIT(socket_connect, tomoyo_socket_connect),
        LSM_HOOK_INIT(socket_listen, tomoyo_socket_listen),
        LSM_HOOK_INIT(socket_sendmsg, tomoyo_socket_sendmsg),
};

/* Lock for GC. */
DEFINE_SRCU(tomoyo_ss);

/* Enable flag referenced by DEFINE_LSM(tomoyo) via .enabled; defaults to 1. */
int tomoyo_enabled __ro_after_init = 1;

/**
 * tomoyo_init - Register TOMOYO Linux as a LSM module.
 *
 * Registers tomoyo_hooks, places the current task in tomoyo_kernel_domain
 * (taking a reference on it) and then calls tomoyo_mm_init().
 *
 * Returns 0.
 */
static int __init tomoyo_init(void)
{
        struct tomoyo_task *s = tomoyo_task(current);

        /* register ourselves with the security framework */
        security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks),
                           &tomoyo_lsmid);
        pr_info("TOMOYO Linux initialized\n");
        /* The task running this initialiser starts in the kernel domain. */
        s->domain_info = &tomoyo_kernel_domain;
        atomic_inc(&tomoyo_kernel_domain.users);
        s->old_domain_info = NULL;
        tomoyo_mm_init();

        return 0;
}

/*
 * LSM descriptor for TOMOYO: ties together the module name, the
 * tomoyo_enabled flag, the blob sizes and the tomoyo_init() entry point.
 */
DEFINE_LSM(tomoyo) = {
        .name = "tomoyo",
        .enabled = &tomoyo_enabled,
        .flags = LSM_FLAG_LEGACY_MAJOR,
        .blobs = &tomoyo_blob_sizes,
        .init = tomoyo_init,
};




































    1 

    1 

    1 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Generic Timer-queue
 *
 *  Manages a simple queue of timers, ordered by expiration time.
 *  Uses rbtrees for quick list adds and expiration.
 *
 *  NOTE: All of the following functions need to be serialized
 *  to avoid races. No locking is done by this library code.
 */

#include <linux/bug.h>
#include <linux/timerqueue.h>
#include <linux/rbtree.h>
#include <linux/export.h>

#define __node_2_tq(_n) \
        rb_entry((_n), struct timerqueue_node, node)

static inline bool __timerqueue_less(struct rb_node *a, const struct rb_node *b)
{
        return __node_2_tq(a)->expires < __node_2_tq(b)->expires;
}

/**
 * timerqueue_add - Adds timer to timerqueue.
 *
 * @head: head of timerqueue
 * @node: timer node to be added
 *
 * Adds the timer node to the timerqueue, sorted by the node's expires
 * value. Returns true if the newly added timer is the first expiring timer in
 * the queue.
 *
 * Warns once if @node does not look empty, i.e. appears to be queued
 * already; the insertion is still attempted in that case.
 */
bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
{
        /* Make sure we don't add nodes that are already added */
        WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node));

        /* The result of rb_add_cached() is converted to bool on return. */
        return rb_add_cached(&node->node, &head->rb_root, __timerqueue_less);
}
EXPORT_SYMBOL_GPL(timerqueue_add);

/**
 * timerqueue_del - Removes a timer from the timerqueue.
 *
 * @head: head of timerqueue
 * @node: timer node to be removed
 *
 * Unlinks @node from the queue and marks its rb_node as empty so that it can
 * be added again later. Warns once if @node was not queued. Returns true if
 * the queue is not empty after the remove.
 */
bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
{
        struct rb_node *rb = &node->node;

        WARN_ON_ONCE(RB_EMPTY_NODE(rb));

        rb_erase_cached(rb, &head->rb_root);
        RB_CLEAR_NODE(rb);

        return !RB_EMPTY_ROOT(&head->rb_root.rb_root);
}
EXPORT_SYMBOL_GPL(timerqueue_del);

/**
 * timerqueue_iterate_next - Returns the timer after the provided timer
 *
 * @node: Pointer to a timer.
 *
 * Returns the timer that follows @node in the queue, or NULL when @node is
 * NULL or has no successor. This is used, when necessary, to iterate through
 * the list of timers in a timer list without modifying the list.
 */
struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node)
{
        struct rb_node *succ = node ? rb_next(&node->node) : NULL;

        return succ ? container_of(succ, struct timerqueue_node, node) : NULL;
}
EXPORT_SYMBOL_GPL(timerqueue_iterate_next);


























    1 






    1 










    1 

    1 
















    1 



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/once.h>
#include <linux/random.h>
#include <linux/module.h>

/*
 * One deferred request to disable a static key. Allocated and queued by
 * once_disable_jump(), consumed and freed by once_deferred().
 */
struct once_work {
        struct work_struct work;        /* work item whose handler is once_deferred() */
        struct static_key_true *key;    /* static key that once_deferred() disables */
        struct module *module;          /* reference taken when queued, dropped by once_deferred() */
};

/*
 * Work handler for a queued once_work: disables the static key, drops the
 * module reference held for the request and frees the request itself.
 * BUGs if the key is not enabled when the handler runs.
 */
static void once_deferred(struct work_struct *w)
{
        struct once_work *req = container_of(w, struct once_work, work);

        BUG_ON(!static_key_enabled(req->key));
        static_branch_disable(req->key);
        module_put(req->module);
        kfree(req);
}

/*
 * Queue work that will disable @key. A reference on @mod is taken here and
 * released by once_deferred(). The request is allocated with GFP_ATOMIC; if
 * that allocation fails, nothing is queued and the key is left as it is.
 */
static void once_disable_jump(struct static_key_true *key, struct module *mod)
{
        struct once_work *req = kmalloc(sizeof(*req), GFP_ATOMIC);

        if (req) {
                INIT_WORK(&req->work, once_deferred);
                req->key = key;
                req->module = mod;
                __module_get(mod);
                schedule_work(&req->work);
        }
}

/*
 * Taken by __do_once_start(); released either there (when *done is already
 * set) or by __do_once_done().
 */
static DEFINE_SPINLOCK(once_lock);

/*
 * Begin a once-section: take once_lock with interrupts disabled, saving the
 * IRQ state in *@flags.
 *
 * Returns true with the lock still held when *@done is not yet set; the
 * caller is then expected to finish with __do_once_done(). Returns false
 * with the lock already released when *@done is set.
 */
bool __do_once_start(bool *done, unsigned long *flags)
        __acquires(once_lock)
{
        spin_lock_irqsave(&once_lock, *flags);
        if (!*done)
                return true;

        spin_unlock_irqrestore(&once_lock, *flags);
        /*
         * Sparse only: rebalance the lock count for this early-exit path,
         * where __do_once_done() is never called because the DO_ONCE()
         * macro returns early.
         */
        __acquire(once_lock);
        return false;
}
EXPORT_SYMBOL(__do_once_start);

/*
 * Finish a once-section for which __do_once_start() returned true.
 *
 * @done:     flag to set so later __do_once_start() calls return false
 * @once_key: static key to be disabled from deferred work
 * @flags:    IRQ state saved by __do_once_start()
 * @mod:      module passed on to once_disable_jump()
 *
 * *@done is written while once_lock is still held; the lock is dropped
 * before the key-disabling work is queued.
 */
void __do_once_done(bool *done, struct static_key_true *once_key,
                    unsigned long *flags, struct module *mod)
        __releases(once_lock)
{
        *done = true;
        spin_unlock_irqrestore(&once_lock, *flags);
        once_disable_jump(once_key, mod);
}
EXPORT_SYMBOL(__do_once_done);

/*
 * Taken by __do_once_sleepable_start(); released either there (when *done is
 * already set) or by __do_once_sleepable_done().
 */
static DEFINE_MUTEX(once_mutex);

/*
 * Sleepable variant of the once-section entry: takes once_mutex.
 *
 * Returns true with the mutex still held when *@done is not yet set; the
 * caller is then expected to finish with __do_once_sleepable_done(). Returns
 * false with the mutex already released when *@done is set.
 */
bool __do_once_sleepable_start(bool *done)
        __acquires(once_mutex)
{
        mutex_lock(&once_mutex);
        if (!*done)
                return true;

        mutex_unlock(&once_mutex);
        /*
         * Sparse only: rebalance the lock count for this early-exit path,
         * where __do_once_sleepable_done() is never called because the
         * DO_ONCE_SLEEPABLE() macro returns early.
         */
        __acquire(once_mutex);
        return false;
}
EXPORT_SYMBOL(__do_once_sleepable_start);

/*
 * Finish a once-section for which __do_once_sleepable_start() returned true.
 *
 * @done:     flag to set so later start calls return false
 * @once_key: static key to be disabled from deferred work
 * @mod:      module passed on to once_disable_jump()
 *
 * *@done is written while once_mutex is still held; the mutex is dropped
 * before the key-disabling work is queued.
 */
void __do_once_sleepable_done(bool *done, struct static_key_true *once_key,
                         struct module *mod)
        __releases(once_mutex)
{
        *done = true;
        mutex_unlock(&once_mutex);
        once_disable_jump(once_key, mod);
}
EXPORT_SYMBOL(__do_once_sleepable_done);



























































































    1 

















    1 



    1 







































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
 * Copyright (C) 2002 Andi Kleen
 *
 * This handles calls from both 32bit and 64bit mode.
 *
 * Lock order:
 *        context.ldt_usr_sem
 *          mmap_lock
 *            context.lock
 */

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>

#include <asm/ldt.h>
#include <asm/tlb.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/pgtable_areas.h>

#include <xen/xen.h>

/*
 * Distance in bytes between two consecutive LDT alias slots (see
 * ldt_slot_va()): room for LDT_ENTRIES descriptors of LDT_ENTRY_SIZE bytes.
 * This is a multiple of PAGE_SIZE.
 */
#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)

/*
 * Virtual address at which alias slot @slot starts: LDT_BASE_ADDR plus @slot
 * strides of LDT_SLOT_STRIDE bytes.
 */
static inline void *ldt_slot_va(int slot)
{
        unsigned long addr = LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot;

        return (void *)addr;
}

/*
 * Load @mm's LDT into this CPU, or clear the CPU's LDT when @mm has none.
 */
void load_mm_ldt(struct mm_struct *mm)
{
        /* READ_ONCE synchronizes with smp_store_release */
        struct ldt_struct *ldt = READ_ONCE(mm->context.ldt);

        /*
         * Any change to mm->context.ldt is followed by an IPI to all
         * CPUs with the mm active, and the LDT is not freed until that
         * IPI has been handled by all such CPUs.  So even if the
         * ldt_struct changes before we return, the values read here are
         * safe to use, and the new values will be loaded before any user
         * code runs.
         *
         * NB: don't try to convert this to use RCU without extreme care.
         * IRQs would still have to be off, because the local LDT must not
         * be changed after an IPI has loaded a newer value than the one
         * visible here.
         */
        if (likely(!ldt)) {
                clear_LDT();
                return;
        }

        if (!static_cpu_has(X86_FEATURE_PTI)) {
                set_ldt(ldt->entries, ldt->nr_entries);
                return;
        }

        if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
                /*
                 * Whoops -- either the new LDT isn't mapped
                 * (if slot == -1) or is mapped into a bogus
                 * slot (if slot > 1).
                 */
                clear_LDT();
                return;
        }

        /*
         * With page table isolation enabled, ldt->entries is not mapped
         * in the userspace pagetables, so make the CPU access the LDT
         * through the alias at ldt_slot_va(ldt->slot) instead.
         */
        set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
}

/*
 * Reload the LDT for @next when switching from @prev, unless neither mm has
 * an LDT.
 */
void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
{
        /*
         * Load the LDT if either the old or new mm had an LDT.
         *
         * An mm will never go from having an LDT to not having an LDT.  Two
         * mms never share an LDT, so nothing is gained by checking whether
         * the LDT changed.  There's also no guarantee that
         * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
         * then prev->context.ldt will also be non-NULL.
         *
         * If we really cared, the case where prev == next and we're exiting
         * lazy mode could be optimized.  Most of the time LDTR does not need
         * to be reloaded then, but modify_ldt() is mostly used by legacy
         * code and emulators where this level of performance is not needed.
         *
         * The two pointers are combined with | instead of || because it
         * generates better code.
         */
        unsigned long any_ldt = (unsigned long)prev->context.ldt |
                                (unsigned long)next->context.ldt;

        if (unlikely(any_ldt))
                load_mm_ldt(next);

        DEBUG_LOCKS_WARN_ON(preemptible());
}

/*
 * On CONFIG_X86_64 builds, reload DS and ES with their current selectors when
 * those selectors refer to the LDT. On other builds the body is compiled out
 * and this is a no-op.
 */
static void refresh_ldt_segments(void)
{
#ifdef CONFIG_X86_64
        unsigned short sel;

        /*
         * Make sure that the cached DS and ES descriptors match the updated
         * LDT.
         */
        savesegment(ds, sel);
        if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
                loadsegment(ds, sel);

        savesegment(es, sel);
        if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
                loadsegment(es, sel);
#endif
}

/*
 * Cross-CPU callback used by install_ldt(); @__mm is the mm whose LDT was
 * changed. context.lock is held by the task which issued the smp function
 * call. CPUs whose currently loaded mm is a different one do nothing.
 */
static void flush_ldt(void *__mm)
{
        struct mm_struct *mm = __mm;

        if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
                load_mm_ldt(mm);
                refresh_ldt_segments();
        }
}

/*
 * Allocate an ldt_struct with room for @num_entries descriptors, all zeroed.
 * The caller must call finalize_ldt_struct on the result.
 *
 * Returns NULL when @num_entries exceeds LDT_ENTRIES or an allocation fails.
 * The returned LDT has slot == -1, i.e. it has no PTI alias yet.
 */
static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
{
        struct ldt_struct *ldt;
        unsigned int nbytes;

        if (num_entries > LDT_ENTRIES)
                return NULL;

        ldt = kmalloc(sizeof(*ldt), GFP_KERNEL_ACCOUNT);
        if (!ldt)
                return NULL;

        BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
        nbytes = num_entries * LDT_ENTRY_SIZE;

        /*
         * Xen is very picky: it requires a page-aligned LDT that has no
         * trailing nonzero bytes in any page that contains LDT descriptors.
         * Keep it simple: zero the whole allocation and never allocate less
         * than PAGE_SIZE.
         */
        if (nbytes <= PAGE_SIZE)
                ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
        else
                ldt->entries = __vmalloc(nbytes, GFP_KERNEL_ACCOUNT | __GFP_ZERO);

        if (!ldt->entries) {
                kfree(ldt);
                return NULL;
        }

        ldt->nr_entries = num_entries;
        /* The new LDT isn't aliased for PTI yet. */
        ldt->slot = -1;

        return ldt;
}

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION

/*
 * Warn when the observed state of the LDT page-table entries does not match
 * whether @mm already has an LDT. The user-mapping checks are only made when
 * X86_FEATURE_PTI is set.
 */
static void do_sanity_check(struct mm_struct *mm,
                            bool had_kernel_mapping,
                            bool had_user_mapping)
{
        if (!mm->context.ldt) {
                /*
                 * This is the first time we're mapping an LDT for this
                 * process, so neither mapping should exist yet.
                 */
                WARN_ON(had_kernel_mapping);
                if (boot_cpu_has(X86_FEATURE_PTI))
                        WARN_ON(had_user_mapping);
                return;
        }

        /*
         * We already had an LDT.  The top-level entry should already
         * have been allocated and synchronized with the usermode
         * tables.
         */
        WARN_ON(!had_kernel_mapping);
        if (boot_cpu_has(X86_FEATURE_PTI))
                WARN_ON(!had_user_mapping);
}

#ifdef CONFIG_X86_PAE

/*
 * Walk from @pgd down to the PMD entry covering @va. Returns NULL when the
 * PGD entry is zero or the P4D or PUD entry on the way is none.
 */
static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va)
{
        p4d_t *p4d;
        pud_t *pud;

        if (!pgd->pgd)
                return NULL;

        p4d = p4d_offset(pgd, va);
        if (p4d_none(*p4d))
                return NULL;

        pud = pud_offset(p4d, va);

        return pud_none(*pud) ? NULL : pmd_offset(pud, va);
}

/*
 * PAE variant: when PTI is enabled and @mm has no LDT yet, copy the kernel
 * PMD entry covering LDT_BASE_ADDR into the user page table.
 */
static void map_ldt_struct_to_user(struct mm_struct *mm)
{
        pgd_t *kern_pgd = pgd_offset(mm, LDT_BASE_ADDR);
        pmd_t *kern_pmd = pgd_to_pmd_walk(kern_pgd, LDT_BASE_ADDR);
        pmd_t *user_pmd = pgd_to_pmd_walk(kernel_to_user_pgdp(kern_pgd),
                                          LDT_BASE_ADDR);

        if (!boot_cpu_has(X86_FEATURE_PTI) || mm->context.ldt)
                return;

        set_pmd(user_pmd, *kern_pmd);
}

/*
 * PAE variant: read the kernel and user PMD entries covering LDT_BASE_ADDR
 * and hand their "non-zero" state to do_sanity_check().
 */
static void sanity_check_ldt_mapping(struct mm_struct *mm)
{
        pgd_t *kern_pgd = pgd_offset(mm, LDT_BASE_ADDR);
        pmd_t *kern_pmd = pgd_to_pmd_walk(kern_pgd, LDT_BASE_ADDR);
        pmd_t *user_pmd = pgd_to_pmd_walk(kernel_to_user_pgdp(kern_pgd),
                                          LDT_BASE_ADDR);
        bool kernel_mapped = kern_pmd->pmd != 0;
        bool user_mapped = user_pmd->pmd != 0;

        do_sanity_check(mm, kernel_mapped, user_mapped);
}

#else /* !CONFIG_X86_PAE */

/*
 * Non-PAE variant: when PTI is enabled and @mm has no LDT yet, copy the
 * kernel PGD entry covering LDT_BASE_ADDR into the user page table.
 */
static void map_ldt_struct_to_user(struct mm_struct *mm)
{
        pgd_t *kern_pgd = pgd_offset(mm, LDT_BASE_ADDR);

        if (!boot_cpu_has(X86_FEATURE_PTI) || mm->context.ldt)
                return;

        set_pgd(kernel_to_user_pgdp(kern_pgd), *kern_pgd);
}

/*
 * Non-PAE variant: read the kernel and user PGD entries covering
 * LDT_BASE_ADDR and hand their "non-zero" state to do_sanity_check().
 */
static void sanity_check_ldt_mapping(struct mm_struct *mm)
{
        pgd_t *kern_pgd = pgd_offset(mm, LDT_BASE_ADDR);
        bool kernel_mapped = kern_pgd->pgd != 0;
        bool user_mapped = kernel_to_user_pgdp(kern_pgd)->pgd != 0;

        do_sanity_check(mm, kernel_mapped, user_mapped);
}

#endif /* CONFIG_X86_PAE */

/*
 * If PTI is enabled, this maps the LDT into the kernelmode and
 * usermode tables for the given mm.
 *
 * @mm:   address space to map into
 * @ldt:  LDT to map; expected to be unmapped so far (slot == -1)
 * @slot: alias slot whose address range (see ldt_slot_va()) receives the pages
 *
 * Returns 0 on success or when PTI is not enabled, -ENOMEM when a PTE could
 * not be obtained. On failure, PTEs installed by earlier loop iterations are
 * left in place and ldt->slot is not updated; cleanup is up to the caller.
 */
static int
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
{
        unsigned long va;
        bool is_vmalloc;
        spinlock_t *ptl;
        int i, nr_pages;

        if (!boot_cpu_has(X86_FEATURE_PTI))
                return 0;

        /*
         * Any given ldt_struct should have map_ldt_struct() called at most
         * once.
         */
        WARN_ON(ldt->slot != -1);

        /* Check if the current mappings are sane */
        sanity_check_ldt_mapping(mm);

        /* Decides below how the backing pfn of each page is looked up. */
        is_vmalloc = is_vmalloc_addr(ldt->entries);

        nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);

        for (i = 0; i < nr_pages; i++) {
                unsigned long offset = i << PAGE_SHIFT;
                const void *src = (char *)ldt->entries + offset;
                unsigned long pfn;
                pgprot_t pte_prot;
                pte_t pte, *ptep;

                va = (unsigned long)ldt_slot_va(slot) + offset;
                pfn = is_vmalloc ? vmalloc_to_pfn(src) :
                        page_to_pfn(virt_to_page(src));
                /*
                 * Treat the PTI LDT range as a *userspace* range.
                 * get_locked_pte() will allocate all needed pagetables
                 * and account for them in this mm.
                 */
                ptep = get_locked_pte(mm, va, &ptl);
                if (!ptep)
                        return -ENOMEM;
                /*
                 * Map it RO so the easy to find address is not a primary
                 * target via some kernel interface which misses a
                 * permission check.
                 */
                pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL);
                /* Filter out unsupported __PAGE_KERNEL* bits: */
                pgprot_val(pte_prot) &= __supported_pte_mask;
                pte = pfn_pte(pfn, pte_prot);
                set_pte_at(mm, va, ptep, pte);
                pte_unmap_unlock(ptep, ptl);
        }

        /* Propagate LDT mapping to the user page-table */
        map_ldt_struct_to_user(mm);

        /* Record the slot only once every page has been mapped. */
        ldt->slot = slot;
        return 0;
}

/*
 * Undo map_ldt_struct() for @ldt in @mm: clear the PTEs of the alias slot
 * recorded in ldt->slot and flush that range from the TLB. Does nothing when
 * @ldt is NULL or PTI is not enabled.
 */
static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
{
        unsigned long base, va;
        int nr_pages, i;

        /* LDT map/unmap is only required for PTI */
        if (!ldt || !boot_cpu_has(X86_FEATURE_PTI))
                return;

        nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);
        base = (unsigned long)ldt_slot_va(ldt->slot);

        for (i = 0; i < nr_pages; i++) {
                unsigned long offset = i << PAGE_SHIFT;
                spinlock_t *ptl;
                pte_t *ptep;

                va = base + offset;
                ptep = get_locked_pte(mm, va, &ptl);
                /* Warn once and skip the page if no PTE could be obtained. */
                if (WARN_ON_ONCE(!ptep))
                        continue;

                pte_clear(mm, va, ptep);
                pte_unmap_unlock(ptep, ptl);
        }

        flush_tlb_mm_range(mm, base, base + nr_pages * PAGE_SIZE, PAGE_SHIFT, false);
}

#else /* !CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */

/*
 * Stub used when CONFIG_MITIGATION_PAGE_TABLE_ISOLATION is not set: there is
 * nothing to map, so always report success.
 */
static int
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
{
        return 0;
}

/*
 * Stub used when CONFIG_MITIGATION_PAGE_TABLE_ISOLATION is not set: nothing
 * was mapped, so there is nothing to unmap.
 */
static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
{
}
#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */

/*
 * Free the page tables covering [LDT_BASE_ADDR, LDT_END_ADDR) in @mm.
 * Compiled to an empty function without
 * CONFIG_MITIGATION_PAGE_TABLE_ISOLATION, and a no-op at run time when PTI
 * is not enabled.
 */
static void free_ldt_pgtables(struct mm_struct *mm)
{
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
        struct mmu_gather gather;

        if (!boot_cpu_has(X86_FEATURE_PTI))
                return;

        /*
         * Although free_pgd_range() is intended for freeing user
         * page-tables, it also works out for kernel mappings on x86.
         * tlb_gather_mmu_fullmm() is used to avoid confusing the
         * range-tracking logic in __tlb_adjust_range().
         */
        tlb_gather_mmu_fullmm(&gather, mm);
        free_pgd_range(&gather, LDT_BASE_ADDR, LDT_END_ADDR,
                       LDT_BASE_ADDR, LDT_END_ADDR);
        tlb_finish_mmu(&gather);
#endif
}

/*
 * Pass the filled-in descriptor array of @ldt to paravirt_alloc_ldt().
 * After calling this, the LDT is immutable.
 */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
        paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
}

/*
 * Publish @ldt as the LDT of @mm and make every CPU in mm_cpumask(@mm) run
 * flush_ldt(). The whole sequence runs under mm->context.lock, and the
 * cross-CPU call is made with wait == true.
 */
static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
{
        mutex_lock(&mm->context.lock);

        /* Synchronizes with READ_ONCE in load_mm_ldt. */
        smp_store_release(&mm->context.ldt, ldt);

        /* Activate the LDT for all CPUs using current's mm. */
        on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);

        mutex_unlock(&mm->context.lock);
}

/*
 * Release @ldt and its descriptor array; NULL is accepted and ignored.
 * The array is freed the way alloc_ldt_struct() allocated it: as a single
 * page when it fits in PAGE_SIZE, otherwise as a vmalloc area.
 */
static void free_ldt_struct(struct ldt_struct *ldt)
{
        if (likely(!ldt))
                return;

        paravirt_free_ldt(ldt->entries, ldt->nr_entries);

        if (ldt->nr_entries * LDT_ENTRY_SIZE <= PAGE_SIZE)
                free_page((unsigned long)ldt->entries);
        else
                vfree_atomic(ldt->entries);

        kfree(ldt);
}

/*
 * Called on fork from arch_dup_mmap(). Just copy the current LDT state,
 * the new task is not running, so nothing can be installed.
 *
 * Returns 0 when @old_mm is NULL, has no LDT, or the copy succeeded;
 * -ENOMEM when the new LDT cannot be allocated; otherwise the error from
 * map_ldt_struct(). The copy is mapped into alias slot 0 of @mm.
 */
int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
{
        struct ldt_struct *src, *copy;
        int retval = 0;

        if (!old_mm)
                return 0;

        mutex_lock(&old_mm->context.lock);

        src = old_mm->context.ldt;
        if (!src)
                goto out_unlock;

        copy = alloc_ldt_struct(src->nr_entries);
        if (!copy) {
                retval = -ENOMEM;
                goto out_unlock;
        }

        memcpy(copy->entries, src->entries,
               copy->nr_entries * LDT_ENTRY_SIZE);
        finalize_ldt_struct(copy);

        retval = map_ldt_struct(mm, copy, 0);
        if (retval) {
                /* Mapping failed: drop partial page tables and the copy. */
                free_ldt_pgtables(mm);
                free_ldt_struct(copy);
        } else {
                mm->context.ldt = copy;
        }

out_unlock:
        mutex_unlock(&old_mm->context.lock);
        return retval;
}

/*
 * Free @mm's LDT, if any, and clear the pointer to it.
 *
 * No need to lock the MM as we are the last user
 *
 * 64bit: Don't touch the LDT register - we're already in the next thread.
 */
void destroy_context_ldt(struct mm_struct *mm)
{
        free_ldt_struct(mm->context.ldt);
        mm->context.ldt = NULL;
}

/* Free the page tables that back the LDT alias range of @mm. */
void ldt_arch_exit_mmap(struct mm_struct *mm)
{
        free_ldt_pgtables(mm);
}

/*
 * Copy the current task's LDT to user memory at @ptr.
 *
 * @bytecount is capped at the size of a full LDT. If the LDT is shorter than
 * the (capped) @bytecount, the remainder of the user buffer is zero-filled.
 *
 * Returns 0 when the mm has no LDT, the capped @bytecount on success, or
 * -EFAULT when writing to user memory fails. Runs under a read lock on
 * context.ldt_usr_sem.
 */
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
        struct mm_struct *mm = current->mm;
        struct ldt_struct *ldt;
        unsigned long entries_size;
        int retval = 0;

        down_read(&mm->context.ldt_usr_sem);

        ldt = mm->context.ldt;
        if (!ldt)
                goto out_unlock;

        if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
                bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;

        entries_size = ldt->nr_entries * LDT_ENTRY_SIZE;
        if (entries_size > bytecount)
                entries_size = bytecount;

        retval = -EFAULT;
        if (copy_to_user(ptr, ldt->entries, entries_size))
                goto out_unlock;

        /* Zero-fill the rest and pretend we read bytecount bytes. */
        if (entries_size != bytecount &&
            clear_user(ptr + entries_size, bytecount - entries_size))
                goto out_unlock;

        retval = bytecount;

out_unlock:
        up_read(&mm->context.ldt_usr_sem);
        return retval;
}

/*
 * Zero up to a fixed number of bytes at @ptr: five descriptors on
 * CONFIG_X86_32, 128 bytes otherwise. Returns the number of bytes cleared,
 * or -EFAULT when the user buffer cannot be written.
 */
static int read_default_ldt(void __user *ptr, unsigned long bytecount)
{
        /* CHECKME: Can we use _one_ random number ? */
        unsigned long limit;

#ifdef CONFIG_X86_32
        limit = 5 * sizeof(struct desc_struct);
#else
        limit = 128;
#endif
        if (bytecount > limit)
                bytecount = limit;

        if (clear_user(ptr, bytecount))
                return -EFAULT;

        return bytecount;
}

/*
 * Report whether 16-bit segments may be installed: false when
 * CONFIG_X86_16BIT is not enabled, and false (with a one-time log message)
 * in a Xen PV domain.
 */
static bool allow_16bit_segments(void)
{
        bool allowed = IS_ENABLED(CONFIG_X86_16BIT);

#ifdef CONFIG_XEN_PV
        /*
         * Xen PV does not implement ESPFIX64, which means that 16-bit
         * segments will not work correctly.  Until either Xen PV implements
         * ESPFIX64 and can signal this fact to the guest or unless someone
         * provides compelling evidence that allowing broken 16-bit segments
         * is worthwhile, disallow 16-bit segments under Xen PV.
         */
        if (allowed && xen_pv_domain()) {
                pr_info_once("Warning: 16-bit segments do not work correctly in a Xen PV guest\n");
                allowed = false;
        }
#endif

        return allowed;
}

/*
 * Set or clear one LDT entry of the current task from a struct user_desc
 * supplied by user space.
 *
 * @ptr:       user pointer to a struct user_desc
 * @bytecount: must equal sizeof(struct user_desc)
 * @oldmode:   non-zero for modify_ldt() func 1, zero for func 0x11
 *
 * The LDT is never modified in place: a new ldt_struct large enough for the
 * entry is allocated, the old contents are copied over, the entry is
 * written, and the new LDT is then mapped, installed and the old one freed.
 *
 * Returns 0 on success, -EINVAL for a bad size, entry number or descriptor,
 * -EFAULT when @ptr cannot be read, -EINTR when waiting for ldt_usr_sem is
 * interrupted by a fatal signal, -ENOMEM when the new LDT cannot be
 * allocated, or the error from map_ldt_struct().
 */
static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
        struct mm_struct *mm = current->mm;
        struct ldt_struct *new_ldt, *old_ldt;
        unsigned int old_nr_entries, new_nr_entries;
        struct user_desc ldt_info;
        struct desc_struct ldt;
        int error;

        error = -EINVAL;
        if (bytecount != sizeof(ldt_info))
                goto out;
        error = -EFAULT;
        if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
                goto out;

        error = -EINVAL;
        if (ldt_info.entry_number >= LDT_ENTRIES)
                goto out;
        /* contents == 3 is rejected in old mode and must be not-present otherwise. */
        if (ldt_info.contents == 3) {
                if (oldmode)
                        goto out;
                if (ldt_info.seg_not_present == 0)
                        goto out;
        }

        if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) ||
            LDT_empty(&ldt_info)) {
                /* The user wants to clear the entry. */
                memset(&ldt, 0, sizeof(ldt));
        } else {
                if (!ldt_info.seg_32bit && !allow_16bit_segments()) {
                        error = -EINVAL;
                        goto out;
                }

                fill_ldt(&ldt, &ldt_info);
                /* Old mode always stores the descriptor with AVL cleared. */
                if (oldmode)
                        ldt.avl = 0;
        }

        if (down_write_killable(&mm->context.ldt_usr_sem))
                return -EINTR;

        old_ldt       = mm->context.ldt;
        old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
        /* The new LDT never shrinks: keep at least the old entry count. */
        new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);

        error = -ENOMEM;
        new_ldt = alloc_ldt_struct(new_nr_entries);
        if (!new_ldt)
                goto out_unlock;

        if (old_ldt)
                memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);

        new_ldt->entries[ldt_info.entry_number] = ldt;
        finalize_ldt_struct(new_ldt);

        /*
         * If we are using PTI, map the new LDT into the userspace pagetables.
         * If there is already an LDT, use the other slot so that other CPUs
         * will continue to use the old LDT until install_ldt() switches
         * them over to the new LDT.
         */
        error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
        if (error) {
                /*
                 * This only can fail for the first LDT setup. If an LDT is
                 * already installed then the PTE page is already
                 * populated. Mop up a half populated page table.
                 */
                if (!WARN_ON_ONCE(old_ldt))
                        free_ldt_pgtables(mm);
                free_ldt_struct(new_ldt);
                goto out_unlock;
        }

        /* Switch all CPUs to the new LDT before tearing down the old one. */
        install_ldt(mm, new_ldt);
        unmap_ldt_struct(mm, old_ldt);
        free_ldt_struct(old_ldt);
        error = 0;

out_unlock:
        up_write(&mm->context.ldt_usr_sem);
out:
        return error;
}

/*
 * modify_ldt() system call: dispatch on @func.
 *
 *   0    - read the LDT into @ptr
 *   1    - write one LDT entry, old mode
 *   2    - read the (all-zero) default LDT into @ptr
 *   0x11 - write one LDT entry, new mode
 *
 * Any other @func yields -ENOSYS.
 */
SYSCALL_DEFINE3(modify_ldt, int, func, void __user *, ptr,
                unsigned long, bytecount)
{
        int ret;

        switch (func) {
        case 0:
                ret = read_ldt(ptr, bytecount);
                break;
        case 1:
                ret = write_ldt(ptr, bytecount, 1);
                break;
        case 2:
                ret = read_default_ldt(ptr, bytecount);
                break;
        case 0x11:
                ret = write_ldt(ptr, bytecount, 0);
                break;
        default:
                ret = -ENOSYS;
                break;
        }

        /*
         * The SYSCALL_DEFINE() macros give us an 'unsigned long'
         * return type, but the ABI for sys_modify_ldt() expects
         * 'int'.  This cast gives us an int-sized value in %rax
         * for the return code.  The 'unsigned' is necessary so
         * the compiler does not try to sign-extend the negative
         * return codes into the high half of the register when
         * taking the value from int->long.
         */
        return (unsigned int)ret;
}












































































































































































































































































































































    1 


    1 














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/super.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/parser.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/vfs.h>
#include <linux/random.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
#include <linux/ctype.h>
#include <linux/log2.h>
#include <linux/crc16.h>
#include <linux/dax.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
#include <linux/part_stat.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fsnotify.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>

#include "ext4.h"
#include "ext4_extents.h"        /* Needed for trace points definition */
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "mballoc.h"
#include "fsmap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>

/*
 * File-scope state.
 * NOTE(review): judging by the names only, ext4_li_info is the shared
 * lazy-init bookkeeping, ext4_li_mtx serializes access to it, and
 * ext4_mount_msg_ratelimit throttles mount-time messages -- confirm against
 * their users, which are not part of this excerpt.
 */
static struct ext4_lazy_init *ext4_li_info;
static DEFINE_MUTEX(ext4_li_mtx);
static struct ratelimit_state ext4_mount_msg_ratelimit;

/*
 * Prototypes for file-local functions, plus a declaration of the mount
 * parameter spec table, so they can be referenced ahead of their definitions.
 */
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                             unsigned long journal_devnum);
static int ext4_show_options(struct seq_file *seq, struct dentry *root);
static void ext4_update_super(struct super_block *sb);
static int ext4_commit_super(struct super_block *sb);
static int ext4_mark_recovery_complete(struct super_block *sb,
                                        struct ext4_super_block *es);
static int ext4_clear_journal_err(struct super_block *sb,
                                  struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
static inline int ext2_feature_set_ok(struct super_block *sb);
static inline int ext3_feature_set_ok(struct super_block *sb);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
static struct inode *ext4_get_journal_inode(struct super_block *sb,
                                            unsigned int journal_inum);
static int ext4_validate_options(struct fs_context *fc);
static int ext4_check_opt_consistency(struct fs_context *fc,
                                      struct super_block *sb);
static void ext4_apply_options(struct fs_context *fc, struct super_block *sb);
static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param);
static int ext4_get_tree(struct fs_context *fc);
static int ext4_reconfigure(struct fs_context *fc);
static void ext4_fc_free(struct fs_context *fc);
static int ext4_init_fs_context(struct fs_context *fc);
static void ext4_kill_sb(struct super_block *sb);
static const struct fs_parameter_spec ext4_param_specs[];

/*
 * Lock ordering
 *
 * page fault path:
 * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start
 *   -> page lock -> i_data_sem (rw)
 *
 * buffered write path:
 * sb_start_write -> i_mutex -> mmap_lock
 * sb_start_write -> i_mutex -> transaction start -> page lock ->
 *   i_data_sem (rw)
 *
 * truncate:
 * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
 *   page lock
 * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start ->
 *   i_data_sem (rw)
 *
 * direct IO:
 * sb_start_write -> i_mutex -> mmap_lock
 * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
 *
 * writepages:
 * transaction start -> page lock(s) -> i_data_sem (rw)
 */

/*
 * fs_context callbacks supplied by ext4: mount parameter parsing, superblock
 * acquisition (get_tree), reconfiguration and context teardown.
 */
static const struct fs_context_operations ext4_context_ops = {
        .parse_param        = ext4_parse_param,
        .get_tree        = ext4_get_tree,
        .reconfigure        = ext4_reconfigure,
        .free                = ext4_fc_free,
};


/*
 * When no native ext2 driver is configured (neither built in nor as a module)
 * and CONFIG_EXT4_USE_FOR_EXT2 is set, define an "ext2" file system type that
 * uses the ext4 context-init, parameter table and kill_sb hooks.
 * IS_EXT2_SB() then reports whether a superblock belongs to that type; in all
 * other configurations it is the constant 0.
 */
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static struct file_system_type ext2_fs_type = {
        .owner                        = THIS_MODULE,
        .name                        = "ext2",
        .init_fs_context        = ext4_init_fs_context,
        .parameters                = ext4_param_specs,
        .kill_sb                = ext4_kill_sb,
        .fs_flags                = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext2");
MODULE_ALIAS("ext2");
#define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type)
#else
#define IS_EXT2_SB(sb) (0)
#endif


/*
 * "ext3" file system type backed by the ext4 context-init, parameter table
 * and kill_sb hooks.  IS_EXT3_SB() reports whether a superblock belongs to
 * this type.
 */
static struct file_system_type ext3_fs_type = {
        .owner                        = THIS_MODULE,
        .name                        = "ext3",
        .init_fs_context        = ext4_init_fs_context,
        .parameters                = ext4_param_specs,
        .kill_sb                = ext4_kill_sb,
        .fs_flags                = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext3");
MODULE_ALIAS("ext3");
#define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type)


/*
 * Submit a read for @bh.
 * @bh:       buffer to read
 * @op_flags: request flags OR-ed into REQ_OP_READ
 * @end_io:   completion handler; end_buffer_read_sync is used when NULL
 */
static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
                                  bh_end_io_t *end_io)
{
        /*
         * The verified bit cannot be trusted once the buffer is re-read from
         * disk (e.g. after a write-out error), so drop it and force the
         * contents to be checked again.
         */
        clear_buffer_verified(bh);

        if (!end_io)
                end_io = end_buffer_read_sync;
        bh->b_end_io = end_io;

        get_bh(bh);
        submit_bh(REQ_OP_READ | op_flags, bh);
}

/*
 * Start reading @bh without waiting for completion.  The buffer must already
 * be locked (BUG otherwise).  A buffer that is already up to date is simply
 * unlocked; otherwise the read is submitted with @op_flags and @end_io.
 */
void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
                         bh_end_io_t *end_io)
{
        BUG_ON(!buffer_locked(bh));

        if (!ext4_buffer_uptodate(bh)) {
                __ext4_read_bh(bh, op_flags, end_io);
                return;
        }
        unlock_buffer(bh);
}

/*
 * Read @bh and wait for the I/O to finish.  The buffer must already be locked
 * (BUG otherwise).
 *
 * Returns 0 if the buffer was already up to date (it is then just unlocked)
 * or is up to date after the read, and -EIO otherwise.
 */
int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io)
{
        BUG_ON(!buffer_locked(bh));

        if (ext4_buffer_uptodate(bh)) {
                unlock_buffer(bh);
                return 0;
        }

        __ext4_read_bh(bh, op_flags, end_io);
        wait_on_buffer(bh);

        return buffer_uptodate(bh) ? 0 : -EIO;
}

/*
 * Lock @bh and read it with the default completion handler.
 * @wait: when true, wait for the read and return its status; when false,
 *        only start the read and return 0.
 */
int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
{
        lock_buffer(bh);
        if (wait)
                return ext4_read_bh(bh, op_flags, NULL);

        ext4_read_bh_nowait(bh, op_flags, NULL);
        return 0;
}

/*
 * This works like __bread_gfp() except it uses ERR_PTR for error
 * returns.  Currently with sb_bread it's impossible to distinguish
 * between ENOMEM and EIO situations (since both result in a NULL
 * return).
 *
 * Returns the buffer head on success, ERR_PTR(-ENOMEM) when no buffer could
 * be obtained, or ERR_PTR() of the read error; the buffer reference is
 * dropped before a read error is returned.
 */
static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
                                               sector_t block,
                                               blk_opf_t op_flags, gfp_t gfp)
{
        struct buffer_head *bh = sb_getblk_gfp(sb, block, gfp);
        int err;

        if (!bh)
                return ERR_PTR(-ENOMEM);

        if (!ext4_buffer_uptodate(bh)) {
                err = ext4_read_bh_lock(bh, REQ_META | op_flags, true);
                if (err) {
                        put_bh(bh);
                        return ERR_PTR(err);
                }
        }
        return bh;
}

/*
 * Read @block of @sb using the block device mapping's gfp constraint with
 * __GFP_FS masked out and __GFP_MOVABLE added.  Returns a buffer head or an
 * ERR_PTR(), as __ext4_sb_bread_gfp() does.
 */
struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
                                   blk_opf_t op_flags)
{
        gfp_t gfp;

        gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS);
        gfp |= __GFP_MOVABLE;

        return __ext4_sb_bread_gfp(sb, block, op_flags, gfp);
}

/*
 * Same as ext4_sb_bread() with no extra request flags, except that
 * __GFP_MOVABLE is not added to the allocation mask.
 */
struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
                                            sector_t block)
{
        return __ext4_sb_bread_gfp(sb, block, 0,
                        mapping_gfp_constraint(sb->s_bdev->bd_mapping,
                                               ~__GFP_FS));
}

/*
 * Best-effort read-ahead of @block.  The buffer is looked up with
 * GFP_NOWAIT | __GFP_NOWARN, and a REQ_RAHEAD read is started only if the
 * buffer lock can be taken without blocking.  Nothing is reported to the
 * caller if either step fails.
 */
void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
        struct buffer_head *bh;

        bh = bdev_getblk(sb->s_bdev, block, sb->s_blocksize,
                         GFP_NOWAIT | __GFP_NOWARN);

        if (likely(bh)) {
                /* skip the read if someone else holds the buffer lock */
                if (trylock_buffer(bh))
                        ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL);
                brelse(bh);
        }
}

/*
 * Returns non-zero when the superblock checksum type is acceptable: either
 * the metadata_csum feature is off, or s_checksum_type is
 * EXT4_CRC32C_CHKSUM.
 */
static int ext4_verify_csum_type(struct super_block *sb,
                                 struct ext4_super_block *es)
{
        if (ext4_has_feature_metadata_csum(sb))
                return es->s_checksum_type == EXT4_CRC32C_CHKSUM;

        return 1;
}

/*
 * Compute the checksum of @es over every byte that precedes the s_checksum
 * field, seeded with ~0, and return it in little-endian form.
 */
__le32 ext4_superblock_csum(struct super_block *sb,
                            struct ext4_super_block *es)
{
        int offset = offsetof(struct ext4_super_block, s_checksum);

        return cpu_to_le32(ext4_chksum(EXT4_SB(sb), ~0, (char *)es, offset));
}

/*
 * Returns non-zero when the superblock checksum is acceptable: either
 * metadata checksums are not in use, or the stored s_checksum equals the
 * freshly computed one.
 */
static int ext4_superblock_csum_verify(struct super_block *sb,
                                       struct ext4_super_block *es)
{
        if (ext4_has_metadata_csum(sb))
                return es->s_checksum == ext4_superblock_csum(sb, es);

        return 1;
}

/*
 * Recompute and store s_checksum in the in-memory superblock of @sb.  Does
 * nothing when metadata checksums are not in use.
 */
void ext4_superblock_csum_set(struct super_block *sb)
{
        struct ext4_super_block *es;

        if (ext4_has_metadata_csum(sb)) {
                es = EXT4_SB(sb)->s_es;
                es->s_checksum = ext4_superblock_csum(sb, es);
        }
}

ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
                               struct ext4_group_desc *bg)
{
        return le32_to_cpu(bg->bg_block_bitmap_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
}

ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
                               struct ext4_group_desc *bg)
{
        return le32_to_cpu(bg->bg_inode_bitmap_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
}

ext4_fsblk_t ext4_inode_table(struct super_block *sb,
                              struct ext4_group_desc *bg)
{
        return le32_to_cpu(bg->bg_inode_table_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
}

__u32 ext4_free_group_clusters(struct super_block *sb,
                               struct ext4_group_desc *bg)
{
        return le16_to_cpu(bg->bg_free_blocks_count_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
}

__u32 ext4_free_inodes_count(struct super_block *sb,
                              struct ext4_group_desc *bg)
{
        return le16_to_cpu(bg->bg_free_inodes_count_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
}

__u32 ext4_used_dirs_count(struct super_block *sb,
                              struct ext4_group_desc *bg)
{
        return le16_to_cpu(bg->bg_used_dirs_count_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
}

__u32 ext4_itable_unused_count(struct super_block *sb,
                              struct ext4_group_desc *bg)
{
        return le16_to_cpu(bg->bg_itable_unused_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
}

/*
 * Store @blk as the block bitmap location of @bg.  The low 32 bits are
 * always written; the high 32 bits are written only when the descriptor size
 * is at least EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_block_bitmap_set(struct super_block *sb,
                           struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
        bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
}

/*
 * Store @blk as the inode bitmap location of @bg.  The low 32 bits are
 * always written; the high 32 bits are written only when the descriptor size
 * is at least EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_inode_bitmap_set(struct super_block *sb,
                           struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
        bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
}

/*
 * Store @blk as the inode table location of @bg.  The low 32 bits are always
 * written; the high 32 bits are written only when the descriptor size is at
 * least EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_inode_table_set(struct super_block *sb,
                          struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
        bg->bg_inode_table_lo = cpu_to_le32((u32)blk);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
}

/*
 * Store @count into bg_free_blocks_count_{lo,hi} of @bg.  The low 16 bits
 * are always written; the high 16 bits are written only when the descriptor
 * size is at least EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_free_group_clusters_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, __u32 count)
{
        bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
}

/*
 * Store @count into bg_free_inodes_count_{lo,hi} of @bg.  The low 16 bits
 * are always written; the high 16 bits are written only when the descriptor
 * size is at least EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_free_inodes_set(struct super_block *sb,
                          struct ext4_group_desc *bg, __u32 count)
{
        bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
}

/*
 * Store @count into bg_used_dirs_count_{lo,hi} of @bg.  The low 16 bits are
 * always written; the high 16 bits are written only when the descriptor size
 * is at least EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_used_dirs_set(struct super_block *sb,
                          struct ext4_group_desc *bg, __u32 count)
{
        bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
}

/*
 * Store @count into bg_itable_unused_{lo,hi} of @bg.  The low 16 bits are
 * always written; the high 16 bits are written only when the descriptor size
 * is at least EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_itable_unused_set(struct super_block *sb,
                          struct ext4_group_desc *bg, __u32 count)
{
        bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
}

/*
 * Store timestamp @now into a split on-disk field pair.
 * @lo:  receives the low 32 bits, little-endian
 * @hi:  receives the bits above bit 31 (one byte)
 * @now: seconds value to store
 *
 * The value is first clamped to [0, 2^40 - 1], i.e. to what 32 + 8 bits can
 * hold.
 */
static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now)
{
        now = clamp_val(now, 0, (1ull << 40) - 1);

        *lo = cpu_to_le32(lower_32_bits(now));
        *hi = upper_32_bits(now);
}

/*
 * Rebuild a timestamp from its split representation: *hi supplies the bits
 * above bit 31 and *lo the little-endian low 32 bits.
 */
static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
{
        time64_t high = (time64_t)(*hi) << 32;

        return high + le32_to_cpu(*lo);
}
/*
 * ext4_update_tstamp(es, tstamp): store the current wall-clock seconds into
 * the superblock field pair tstamp / tstamp_hi of @es.
 */
#define ext4_update_tstamp(es, tstamp) \
        __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \
                             ktime_get_real_seconds())
/* ext4_get_tstamp(es, tstamp): read back the tstamp / tstamp_hi pair of @es. */
#define ext4_get_tstamp(es, tstamp) \
        __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)

/* Thresholds consulted by ext4_maybe_update_superblock(). */
#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */
#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */

/*
 * ext4_maybe_update_superblock() - schedule a superblock refresh when due
 * @sb: The superblock
 *
 * Rate-limits on-disk superblock updates so that the disk is neither written
 * excessively nor woken from sleep without need.  Nothing happens for a
 * read-only or inactive superblock, when there is no journal, or when the
 * journal has JBD2_UNMOUNT set.  Otherwise the superblock update work is
 * scheduled only if both of the following hold:
 * 1. at least EXT4_SB_REFRESH_INTERVAL_SEC seconds have elapsed since the
 *    s_wtime stamp of the superblock, and
 * 2. more than EXT4_SB_REFRESH_INTERVAL_KB kilobytes have been written beyond
 *    the s_kbytes_written value recorded in the superblock.
 */
static void ext4_maybe_update_superblock(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        journal_t *journal = sbi->s_journal;
        time64_t now;
        __u64 last_update;
        __u64 written_kb;
        __u64 pending_kb;

        if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE))
                return;
        if (!journal || (journal->j_flags & JBD2_UNMOUNT))
                return;

        now = ktime_get_real_seconds();
        last_update = ext4_get_tstamp(es, s_wtime);
        if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC))
                return;

        /*
         * Lifetime total: the count carried in sbi plus whatever the device
         * has written since s_sectors_written_start (the >> 1 turns sectors
         * into kilobytes).
         */
        written_kb = sbi->s_kbytes_written +
                ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
                  sbi->s_sectors_written_start) >> 1);

        /*
         * Kilobytes not yet reflected in the on-disk statistics.  Comparing
         * them with 16MB keeps superblock commits to at most one per 16MB
         * when less than that was written within the hour.
         */
        pending_kb = written_kb - le64_to_cpu(es->s_kbytes_written);
        if (pending_kb <= EXT4_SB_REFRESH_INTERVAL_KB)
                return;

        schedule_work(&sbi->s_sb_upd_work);
}

/*
 * Per-transaction commit hook.
 * @journal: journal whose j_private points at the ext4 super block
 * @txn:     the transaction being processed; must not be in T_FINISHED state
 *
 * Processes the data freed by the transaction, gives the superblock a chance
 * to be refreshed, and then runs every callback queued on the transaction's
 * t_private_list, handing each the journal's aborted status as @error.
 */
static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
{
        struct super_block                *sb = journal->j_private;
        struct ext4_sb_info                *sbi = EXT4_SB(sb);
        int                                error = is_journal_aborted(journal);
        struct ext4_journal_cb_entry        *jce;

        BUG_ON(txn->t_state == T_FINISHED);

        ext4_process_freed_data(sb, txn->t_tid);
        ext4_maybe_update_superblock(sb);

        /*
         * s_md_lock covers the list manipulation only: each entry is unlinked
         * under the lock, and the lock is dropped while its callback runs.
         */
        spin_lock(&sbi->s_md_lock);
        while (!list_empty(&txn->t_private_list)) {
                jce = list_entry(txn->t_private_list.next,
                                 struct ext4_journal_cb_entry, jce_list);
                list_del_init(&jce->jce_list);
                spin_unlock(&sbi->s_md_lock);
                jce->jce_func(sb, jce, error);
                spin_lock(&sbi->s_md_lock);
        }
        spin_unlock(&sbi->s_md_lock);
}

/*
 * Per-folio callback handed to write_cache_pages() on the data-journalling
 * path; it deals with a few cases after page cleaning.
 *
 * write_cache_pages() has already checked that the page is dirty and called
 * clear_page_dirty_for_io() on it, which is what we want because it write
 * protects the page.  Some pages must be made dirty again, though (see the
 * loop below).
 *
 * @data is the transaction the buffers are checked against.  Always returns
 * AOP_WRITEPAGE_ACTIVATE.
 */
static int ext4_journalled_writepage_callback(struct folio *folio,
                                              struct writeback_control *wbc,
                                              void *data)
{
        transaction_t *txn = data;
        struct buffer_head *first, *bh;
        struct journal_head *jh;

        first = folio_buffers(folio);
        bh = first;
        do {
                /*
                 * The page has to be redirtied when:
                 * 1) the buffer is dirty, which means the page was dirty
                 * because it holds a buffer that still needs checkpointing;
                 * keeping the dirty bit lets checkpointing write the buffer
                 * properly.
                 * 2) the buffer does not belong to the committing transaction
                 * (inode range tracking is not exact, so we may have come
                 * across it by accident), or the running transaction already
                 * contains it too; keeping the dirty bit makes sure the
                 * buffer gets write protected when the running transaction
                 * commits.
                 */
                jh = bh2jh(bh);
                if (buffer_dirty(bh) ||
                    (jh && (jh->b_transaction != txn ||
                            jh->b_next_transaction))) {
                        folio_redirty_for_writepage(wbc, folio);
                        break;
                }
                bh = bh->b_this_page;
        } while (bh != first);

        return AOP_WRITEPAGE_ACTIVATE;
}

/*
 * Walk the dirty range recorded in @jinode (i_dirty_start..i_dirty_end) with
 * write_cache_pages() in WB_SYNC_ALL mode, calling
 * ext4_journalled_writepage_callback() with the jinode's transaction for each
 * folio.  Returns the result of write_cache_pages().
 */
static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
                .nr_to_write = LONG_MAX,
                .range_start = jinode->i_dirty_start,
                .range_end = jinode->i_dirty_end,
        };

        return write_cache_pages(jinode->i_vfs_inode->i_mapping, &wbc,
                                 ext4_journalled_writepage_callback,
                                 jinode->i_transaction);
}

/*
 * Submit the data buffers of @jinode, choosing the journalled variant when
 * the inode journals its data and the normal variant otherwise.  Returns the
 * chosen helper's result.
 */
static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
        if (ext4_should_journal_data(jinode->i_vfs_inode))
                return ext4_journalled_submit_inode_data_buffers(jinode);

        return ext4_normal_submit_inode_data_buffers(jinode);
}

/*
 * Finish the data buffers of @jinode.  Inodes that journal their data need
 * no work here and yield 0; for all others the result of
 * jbd2_journal_finish_inode_data_buffers() is returned.
 */
static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
{
        if (ext4_should_journal_data(jinode->i_vfs_inode))
                return 0;

        return jbd2_journal_finish_inode_data_buffers(jinode);
}

/* True while system_state says the machine is halting, powering off or restarting. */
static bool system_going_down(void)
{
        switch (system_state) {
        case SYSTEM_HALT:
        case SYSTEM_POWER_OFF:
        case SYSTEM_RESTART:
                return true;
        default:
                return false;
        }
}

/* One row of the errno -> EXT4_ERR_* translation table. */
struct ext4_err_translation {
        int code;       /* EXT4_ERR_* value */
        int errno;      /* matching errno value */
};

/* Build a table row pairing EXT4_ERR_<err> with the errno constant <err>. */
#define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err }

/*
 * errno values that have a dedicated EXT4_ERR_* code; searched linearly by
 * ext4_errno_to_code().
 */
static struct ext4_err_translation err_translation[] = {
        EXT4_ERR_TRANSLATE(EIO),
        EXT4_ERR_TRANSLATE(ENOMEM),
        EXT4_ERR_TRANSLATE(EFSBADCRC),
        EXT4_ERR_TRANSLATE(EFSCORRUPTED),
        EXT4_ERR_TRANSLATE(ENOSPC),
        EXT4_ERR_TRANSLATE(ENOKEY),
        EXT4_ERR_TRANSLATE(EROFS),
        EXT4_ERR_TRANSLATE(EFBIG),
        EXT4_ERR_TRANSLATE(EEXIST),
        EXT4_ERR_TRANSLATE(ERANGE),
        EXT4_ERR_TRANSLATE(EOVERFLOW),
        EXT4_ERR_TRANSLATE(EBUSY),
        EXT4_ERR_TRANSLATE(ENOTDIR),
        EXT4_ERR_TRANSLATE(ENOTEMPTY),
        EXT4_ERR_TRANSLATE(ESHUTDOWN),
        EXT4_ERR_TRANSLATE(EFAULT),
};

/*
 * Map @errno to its EXT4_ERR_* code using err_translation[].  Returns
 * EXT4_ERR_UNKNOWN when the table has no entry for @errno.
 */
static int ext4_errno_to_code(int errno)
{
        int idx = 0;

        while (idx < ARRAY_SIZE(err_translation)) {
                if (err_translation[idx].errno == errno)
                        return err_translation[idx].code;
                idx++;
        }

        return EXT4_ERR_UNKNOWN;
}

/*
 * Record an error in the in-memory superblock info of @sb.
 * @error: error code; 0 is recorded as EFSCORRUPTED
 * @ino:   inode number associated with the error
 * @block: block number associated with the error
 * @func:  name of the reporting function
 * @line:  line number of the report
 *
 * Under s_error_lock this bumps s_add_error_count and overwrites the
 * "last error" fields.  The "first error" fields are filled in only while
 * s_first_error_time is still zero.
 */
static void save_error_info(struct super_block *sb, int error,
                            __u32 ino, __u64 block,
                            const char *func, unsigned int line)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* We default to EFSCORRUPTED error... */
        if (error == 0)
                error = EFSCORRUPTED;

        spin_lock(&sbi->s_error_lock);
        sbi->s_add_error_count++;
        sbi->s_last_error_code = error;
        sbi->s_last_error_line = line;
        sbi->s_last_error_ino = ino;
        sbi->s_last_error_block = block;
        sbi->s_last_error_func = func;
        sbi->s_last_error_time = ktime_get_real_seconds();
        /* Only the first error seen populates the s_first_error_* fields. */
        if (!sbi->s_first_error_time) {
                sbi->s_first_error_code = error;
                sbi->s_first_error_line = line;
                sbi->s_first_error_ino = ino;
                sbi->s_first_error_block = block;
                sbi->s_first_error_func = func;
                sbi->s_first_error_time = sbi->s_last_error_time;
        }
        spin_unlock(&sbi->s_error_lock);
}

/* Deal with the reporting of failure conditions on a filesystem such as
 * inconsistencies detected or read IO failures.
 *
 * On ext2, we can store the error state of the filesystem in the
 * superblock.  That is not possible on ext4, because we may have other
 * write ordering constraints on the superblock which prevent us from
 * writing it out straight away; and given that the journal is about to
 * be aborted, we can't rely on the current, or future, transactions to
 * write out the superblock safely.
 *
 * We'll just use the jbd2_journal_abort() error code to record an error in
 * the journal instead.  On recovery, the journal will complain about
 * that error until we've noted it down and cleared it.
 *
 * If force_ro is set, we unconditionally force the filesystem into an
 * ABORT|READONLY state, unless the error response on the fs has been set to
 * panic in which case we take the easy way out and panic immediately. This is
 * used to deal with unrecoverable failures such as journal IO errors or ENOMEM
 * at a critical moment in log management.
 *
 * @sb:       superblock the error was detected on
 * @force_ro: when true, the ERRORS_CONT mount option is ignored
 * @error:    error code handed to save_error_info()
 * @ino:      inode number recorded with the error (callers pass 0 for none)
 * @block:    block number recorded with the error (callers pass 0 for none)
 * @func:     name of the function reporting the error
 * @line:     source line reporting the error
 */
static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
                              __u32 ino, __u64 block,
                              const char *func, unsigned int line)
{
        journal_t *journal = EXT4_SB(sb)->s_journal;
        /* Keep the fs running only if the caller allows it and errors=continue */
        bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT);

        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
        if (test_opt(sb, WARN_ON_ERROR))
                WARN_ON_ONCE(1);

        /* Not continuing on a writable fs: flag shutdown and abort the journal */
        if (!continue_fs && !sb_rdonly(sb)) {
                set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
                if (journal)
                        jbd2_journal_abort(journal, -EIO);
        }

        /* Only record the error if the underlying device can be written */
        if (!bdev_read_only(sb->s_bdev)) {
                save_error_info(sb, error, ino, block, func, line);
                /*
                 * In case the fs should keep running, we need to writeout
                 * superblock through the journal. Due to lock ordering
                 * constraints, it may not be safe to do it right here so we
                 * defer superblock flushing to a workqueue.
                 */
                if (continue_fs && journal)
                        schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
                else
                        ext4_commit_super(sb);
        }

        /*
         * We force ERRORS_RO behavior when system is rebooting. Otherwise we
         * could panic during 'reboot -f' as the underlying device got already
         * disabled.
         */
        if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
                panic("EXT4-fs (device %s): panic forced after error\n",
                        sb->s_id);
        }

        /* Nothing more to do if already read-only or allowed to continue */
        if (sb_rdonly(sb) || continue_fs)
                return;

        ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
        /*
         * Make sure updated value of ->s_mount_flags will be visible before
         * ->s_flags update
         *
         * NOTE(review): the shutdown flag set above lives in ->s_ext4_flags;
         * confirm whether "->s_mount_flags" here is a stale field name.
         */
        smp_wmb();
        sb->s_flags |= SB_RDONLY;
}

/*
 * Work handler for s_sb_upd_work: write the superblock (including saved
 * error information) to disk.  The write goes through the journal when the
 * filesystem is writable and has a journal; otherwise, or when any journal
 * step fails, the superblock is committed directly.
 */
static void update_super_work(struct work_struct *work)
{
        struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
                                                s_sb_upd_work);
        journal_t *journal = sbi->s_journal;
        handle_t *handle;

        /*
         * If the journal is still running, we have to write out superblock
         * through the journal to avoid collisions of other journalled sb
         * updates.
         *
         * We use directly jbd2 functions here to avoid recursing back into
         * ext4 error handling code during handling of previous errors.
         */
        if (!sb_rdonly(sbi->s_sb) && journal) {
                struct buffer_head *sbh = sbi->s_sbh;
                bool call_notify_err = false;

                handle = jbd2_journal_start(journal, 1);
                if (IS_ERR(handle))
                        goto write_directly;
                if (jbd2_journal_get_write_access(handle, sbh)) {
                        jbd2_journal_stop(handle);
                        goto write_directly;
                }

                /*
                 * Sample the pending error count before ext4_update_super()
                 * runs, to decide whether sysfs needs a notification.
                 */
                if (sbi->s_add_error_count > 0)
                        call_notify_err = true;

                ext4_update_super(sbi->s_sb);
                if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
                        ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
                                 "superblock detected");
                        clear_buffer_write_io_error(sbh);
                        set_buffer_uptodate(sbh);
                }

                if (jbd2_journal_dirty_metadata(handle, sbh)) {
                        jbd2_journal_stop(handle);
                        goto write_directly;
                }
                jbd2_journal_stop(handle);

                if (call_notify_err)
                        ext4_notify_error_sysfs(sbi);

                return;
        }
write_directly:
        /*
         * Either the filesystem is read-only / has no journal, or the write
         * through the journal failed. Write sb directly to get error info
         * out and hope for the best.
         */
        ext4_commit_super(sbi->s_sb);
        ext4_notify_error_sysfs(sbi);
}

/*
 * Evaluates to the result of ___ratelimit() on the per-superblock error
 * ratelimit state; callers in this file print an "EXT4-fs error" message
 * only when it is nonzero.
 */
#define ext4_error_ratelimit(sb)                                        \
                ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state),        \
                             "EXT4-fs error")

/*
 * Report an error against a whole filesystem.
 *
 * Does nothing once the filesystem has been forcibly shut down.  Otherwise
 * the error is traced, printed (subject to the error ratelimit), passed to
 * fsnotify, and finally handed to ext4_handle_error() with no inode number.
 * A zero @error is reported to fsnotify as EFSCORRUPTED.
 */
void __ext4_error(struct super_block *sb, const char *function,
                  unsigned int line, bool force_ro, int error, __u64 block,
                  const char *fmt, ...)
{
        int notify_code = error;

        if (unlikely(ext4_forced_shutdown(sb)))
                return;

        trace_ext4_error(sb, function, line);

        if (ext4_error_ratelimit(sb)) {
                struct va_format msg;
                va_list ap;

                va_start(ap, fmt);
                msg.fmt = fmt;
                msg.va = &ap;
                printk(KERN_CRIT
                       "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
                       sb->s_id, function, line, current->comm, &msg);
                va_end(ap);
        }

        if (!notify_code)
                notify_code = EFSCORRUPTED;
        fsnotify_sb_error(sb, NULL, notify_code);

        ext4_handle_error(sb, force_ro, error, 0, block, function, line);
}

/*
 * Report an error attributed to a specific inode.
 *
 * The printed message includes the block number only when @block is
 * non-zero.  The error is always handled with force_ro == false, and a
 * zero @error is reported to fsnotify as EFSCORRUPTED.
 */
void __ext4_error_inode(struct inode *inode, const char *function,
                        unsigned int line, ext4_fsblk_t block, int error,
                        const char *fmt, ...)
{
        struct super_block *sb = inode->i_sb;
        int notify_code = error;

        if (unlikely(ext4_forced_shutdown(sb)))
                return;

        trace_ext4_error(sb, function, line);

        if (ext4_error_ratelimit(sb)) {
                struct va_format msg;
                va_list ap;

                va_start(ap, fmt);
                msg.fmt = fmt;
                msg.va = &ap;
                if (!block)
                        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
                               "inode #%lu: comm %s: %pV\n",
                               sb->s_id, function, line, inode->i_ino,
                               current->comm, &msg);
                else
                        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
                               "inode #%lu: block %llu: comm %s: %pV\n",
                               sb->s_id, function, line, inode->i_ino,
                               block, current->comm, &msg);
                va_end(ap);
        }

        if (!notify_code)
                notify_code = EFSCORRUPTED;
        fsnotify_sb_error(sb, inode, notify_code);

        ext4_handle_error(sb, false, error, inode->i_ino, block,
                          function, line);
}

/*
 * Report an error attributed to an open file.
 *
 * Like __ext4_error_inode(), but the message also carries the file's path
 * (or "(unknown)" if the path cannot be resolved into the 80-byte buffer),
 * and the error code is always EFSCORRUPTED.
 */
void __ext4_error_file(struct file *file, const char *function,
                       unsigned int line, ext4_fsblk_t block,
                       const char *fmt, ...)
{
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;

        if (unlikely(ext4_forced_shutdown(sb)))
                return;

        trace_ext4_error(sb, function, line);

        if (ext4_error_ratelimit(sb)) {
                char pathname[80];
                char *path;
                struct va_format msg;
                va_list ap;

                path = file_path(file, pathname, sizeof(pathname));
                if (IS_ERR(path))
                        path = "(unknown)";

                va_start(ap, fmt);
                msg.fmt = fmt;
                msg.va = &ap;
                if (!block)
                        printk(KERN_CRIT
                               "EXT4-fs error (device %s): %s:%d: inode #%lu: "
                               "comm %s: path %s: %pV\n",
                               sb->s_id, function, line, inode->i_ino,
                               current->comm, path, &msg);
                else
                        printk(KERN_CRIT
                               "EXT4-fs error (device %s): %s:%d: inode #%lu: "
                               "block %llu: comm %s: path %s: %pV\n",
                               sb->s_id, function, line, inode->i_ino,
                               block, current->comm, path, &msg);
                va_end(ap);
        }

        fsnotify_sb_error(sb, inode, EFSCORRUPTED);

        ext4_handle_error(sb, false, EFSCORRUPTED, inode->i_ino, block,
                          function, line);
}

const char *ext4_decode_error(struct super_block *sb, int errno,
                              char nbuf[16])
{
        char *errstr = NULL;

        switch (errno) {
        case -EFSCORRUPTED:
                errstr = "Corrupt filesystem";
                break;
        case -EFSBADCRC:
                errstr = "Filesystem failed CRC";
                break;
        case -EIO:
                errstr = "IO failure";
                break;
        case -ENOMEM:
                errstr = "Out of memory";
                break;
        case -EROFS:
                if (!sb || (EXT4_SB(sb)->s_journal &&
                            EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
                        errstr = "Journal has aborted";
                else
                        errstr = "Readonly filesystem";
                break;
        default:
                /* If the caller passed in an extra buffer for unknown
                 * errors, textualise them now.  Else we just return
                 * NULL. */
                if (nbuf) {
                        /* Check for truncated error codes... */
                        if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
                                errstr = nbuf;
                }
                break;
        }

        return errstr;
}

/*
 * __ext4_std_error decodes a (negative) errno, typically coming back from
 * the journaling functions, prints it subject to the error ratelimit and
 * invokes the standard error response via ext4_handle_error().
 */
void __ext4_std_error(struct super_block *sb, const char *function,
                      unsigned int line, int errno)
{
        if (unlikely(ext4_forced_shutdown(sb)))
                return;

        /*
         * -EROFS outside of any transaction on a filesystem that is already
         * read-only is not worth logging.
         */
        if (errno == -EROFS && !journal_current_handle() && sb_rdonly(sb))
                return;

        if (ext4_error_ratelimit(sb)) {
                char nbuf[16];
                const char *errstr = ext4_decode_error(sb, errno, nbuf);

                printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
                       sb->s_id, function, line, errstr);
        }

        fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED);
        ext4_handle_error(sb, false, -errno, 0, 0, function, line);
}

void __ext4_msg(struct super_block *sb,
                const char *prefix, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        if (sb) {
                atomic_inc(&EXT4_SB(sb)->s_msg_count);
                if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state),
                                  "EXT4-fs"))
                        return;
        }

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        if (sb)
                printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
        else
                printk("%sEXT4-fs: %pV\n", prefix, &vaf);
        va_end(args);
}

static int ext4_warning_ratelimit(struct super_block *sb)
{
        atomic_inc(&EXT4_SB(sb)->s_warning_count);
        return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
                            "EXT4-fs warning");
}

void __ext4_warning(struct super_block *sb, const char *function,
                    unsigned int line, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        if (!ext4_warning_ratelimit(sb))
                return;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
               sb->s_id, function, line, &vaf);
        va_end(args);
}

void __ext4_warning_inode(const struct inode *inode, const char *function,
                          unsigned int line, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        if (!ext4_warning_ratelimit(inode->i_sb))
                return;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
               "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id,
               function, line, inode->i_ino, current->comm, &vaf);
        va_end(args);
}

/*
 * Report an error against block group @grp.  The __releases/__acquires
 * annotations indicate the function may drop and retake a bit lock;
 * presumably the caller holds the group lock on entry (confirm against
 * callers).
 *
 * With errors=continue the error is recorded and the superblock update is
 * deferred to s_sb_upd_work without touching the group lock.  Otherwise the
 * group lock is dropped around ext4_handle_error() and retaken before
 * returning.
 */
void __ext4_grp_locked_error(const char *function, unsigned int line,
                             struct super_block *sb, ext4_group_t grp,
                             unsigned long ino, ext4_fsblk_t block,
                             const char *fmt, ...)
__releases(bitlock)
__acquires(bitlock)
{
        struct va_format vaf;
        va_list args;

        if (unlikely(ext4_forced_shutdown(sb)))
                return;

        trace_ext4_error(sb, function, line);
        if (ext4_error_ratelimit(sb)) {
                va_start(args, fmt);
                vaf.fmt = fmt;
                vaf.va = &args;
                /* The message is assembled from several KERN_CONT pieces */
                printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
                       sb->s_id, function, line, grp);
                if (ino)
                        printk(KERN_CONT "inode %lu: ", ino);
                if (block)
                        printk(KERN_CONT "block %llu:",
                               (unsigned long long) block);
                printk(KERN_CONT "%pV\n", &vaf);
                va_end(args);
        }

        if (test_opt(sb, ERRORS_CONT)) {
                if (test_opt(sb, WARN_ON_ERROR))
                        WARN_ON_ONCE(1);
                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
                if (!bdev_read_only(sb->s_bdev)) {
                        save_error_info(sb, EFSCORRUPTED, ino, block, function,
                                        line);
                        schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
                }
                return;
        }
        ext4_unlock_group(sb, grp);
        ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line);
        /*
         * We only get here when ERRORS_CONT is not set; relocking the group
         * may be dangerous, but nothing bad will happen since the
         * filesystem will have already been marked read/only and the
         * journal has been aborted.
         *
         * Note that this function returns void, so callers cannot use a
         * return value to distinguish the ERRORS_CONT case from the other
         * error policies.
         */
        ext4_lock_group(sb, grp);
        return;
}

/*
 * Mark the block and/or inode bitmap of @group as corrupted.
 *
 * @flags selects which bitmaps to mark (EXT4_GROUP_INFO_BBITMAP_CORRUPT
 * and/or EXT4_GROUP_INFO_IBITMAP_CORRUPT).  The first time a bitmap is
 * marked, the group's free clusters / free inodes are subtracted from the
 * filesystem-wide counters; repeated marking leaves the counters alone.
 * Nothing is done if the group info or group descriptor cannot be obtained.
 */
void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
                                     ext4_group_t group,
                                     unsigned int flags)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
        int ret;

        /* Both pointers are guaranteed non-NULL below this check. */
        if (!grp || !gdp)
                return;

        if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) {
                ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
                                            &grp->bb_state);
                /* Only adjust the counter on the 0 -> 1 transition */
                if (!ret)
                        percpu_counter_sub(&sbi->s_freeclusters_counter,
                                           grp->bb_free);
        }

        if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) {
                ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
                                            &grp->bb_state);
                /*
                 * gdp was already validated by the early return above, so no
                 * second NULL test is needed here.
                 */
                if (!ret) {
                        int count;

                        count = ext4_free_inodes_count(sb, gdp);
                        percpu_counter_sub(&sbi->s_freeinodes_counter,
                                           count);
                }
        }
}

/*
 * Upgrade an EXT4_GOOD_OLD_REV superblock to EXT4_DYNAMIC_REV in memory.
 * Superblocks whose revision is already newer are left untouched.
 */
void ext4_update_dynamic_rev(struct super_block *sb)
{
        struct ext4_super_block *disk_sb = EXT4_SB(sb)->s_es;

        if (le32_to_cpu(disk_sb->s_rev_level) <= EXT4_GOOD_OLD_REV) {
                ext4_warning(sb,
                             "updating to rev %d because of new feature flag, "
                             "running e2fsck is recommended",
                             EXT4_DYNAMIC_REV);

                disk_sb->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
                disk_sb->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
                disk_sb->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);

                /*
                 * The s_feature_*compat flags are left alone, and s_uuid is
                 * left for e2fsck to fill in if it is empty.
                 *
                 * The remaining superblock fields should be zero; if they
                 * are not, they are likely already in use, so they are not
                 * touched here and any inconsistency is left to e2fsck.
                 */
        }
}

static inline struct inode *orphan_list_entry(struct list_head *l)
{
        return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
}

/*
 * Debug helper: log the on-disk orphan list head followed by every inode on
 * the in-memory orphan list of @sbi.
 */
static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
{
        struct list_head *pos;

        ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
                 le32_to_cpu(sbi->s_es->s_last_orphan));

        printk(KERN_ERR "sb_info orphan list:\n");
        list_for_each(pos, &sbi->s_orphan) {
                struct inode *orphan = orphan_list_entry(pos);

                printk(KERN_ERR
                       "  inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
                       orphan->i_sb->s_id, orphan->i_ino, orphan,
                       orphan->i_mode, orphan->i_nlink,
                       NEXT_ORPHAN(orphan));
        }
}

#ifdef CONFIG_QUOTA
static int ext4_quota_off(struct super_block *sb, int type);

/*
 * Turn off quota types type-1 down to 0, in descending order.  @type must
 * not exceed EXT4_MAXQUOTAS.
 */
static inline void ext4_quotas_off(struct super_block *sb, int type)
{
        BUG_ON(type > EXT4_MAXQUOTAS);

        /* Use our quota_off function to clear inode flags etc. */
        while (--type >= 0)
                ext4_quota_off(sb, type);
}

/*
 * This is a helper function which is used in the mount/remount
 * codepaths (which holds s_umount) to fetch the quota file name.
 *
 * Returns the s_qf_names[] entry for quota @type; the RCU-protected pointer
 * is read under the lockdep assertion that sb->s_umount is held.
 */
static inline char *get_qf_name(struct super_block *sb,
                                struct ext4_sb_info *sbi,
                                int type)
{
        return rcu_dereference_protected(sbi->s_qf_names[type],
                                         lockdep_is_held(&sb->s_umount));
}
#else
/* Quota support is compiled out (!CONFIG_QUOTA): nothing to turn off. */
static inline void ext4_quotas_off(struct super_block *sb, int type)
{
}
#endif

/*
 * Initialise the per-cpu counters and the writepages rwsem of @sbi.
 *
 * The free-cluster and free-inode totals are recounted from the filesystem
 * and also stored into the in-memory superblock.  Initialisation stops at
 * the first failure; the error is logged and returned.  Nothing that was
 * already set up is torn down here.
 */
static int ext4_percpu_param_init(struct ext4_sb_info *sbi)
{
        ext4_fsblk_t block;
        unsigned long freei;
        int err;

        block = ext4_count_free_clusters(sbi->s_sb);
        ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block));
        err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
                                  GFP_KERNEL);
        if (err)
                goto out;

        freei = ext4_count_free_inodes(sbi->s_sb);
        sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
        err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
                                  GFP_KERNEL);
        if (err)
                goto out;

        err = percpu_counter_init(&sbi->s_dirs_counter,
                                  ext4_count_dirs(sbi->s_sb), GFP_KERNEL);
        if (err)
                goto out;

        err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
                                  GFP_KERNEL);
        if (err)
                goto out;

        err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0,
                                  GFP_KERNEL);
        if (err)
                goto out;

        err = percpu_init_rwsem(&sbi->s_writepages_rwsem);

out:
        if (err)
                ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory");

        return err;
}

/*
 * Release everything ext4_percpu_param_init() sets up: the five per-cpu
 * counters and the writepages per-cpu rwsem.
 */
static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi)
{
        percpu_counter_destroy(&sbi->s_freeclusters_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
        percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
        percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
        percpu_free_rwsem(&sbi->s_writepages_rwsem);
}

/*
 * Drop the buffer-head reference of each of the s_gdb_count group
 * descriptor blocks and free the array that holds them.
 */
static void ext4_group_desc_free(struct ext4_sb_info *sbi)
{
        struct buffer_head **gd_bhs;
        int idx = 0;

        rcu_read_lock();
        gd_bhs = rcu_dereference(sbi->s_group_desc);
        while (idx < sbi->s_gdb_count) {
                brelse(gd_bhs[idx]);
                idx++;
        }
        kvfree(gd_bhs);
        rcu_read_unlock();
}

/*
 * Free every allocated flex-group entry and then the flex-group pointer
 * array itself.  Does nothing when the array was never set up.
 */
static void ext4_flex_groups_free(struct ext4_sb_info *sbi)
{
        struct flex_groups **fgs;
        int idx;

        rcu_read_lock();
        fgs = rcu_dereference(sbi->s_flex_groups);
        if (!fgs)
                goto out_unlock;

        for (idx = 0; idx < sbi->s_flex_groups_allocated; idx++)
                kvfree(fgs[idx]);
        kvfree(fgs);

out_unlock:
        rcu_read_unlock();
}

/*
 * ->put_super handler: tear down all ext4 per-superblock state at unmount.
 *
 * The order of the steps below matters (see the individual comments); in
 * particular sysfs is unregistered first, the journal is destroyed before
 * the final superblock write, and sbi itself is freed last.
 */
static void ext4_put_super(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        int aborted = 0;
        int err;

        /*
         * Unregister sysfs before destroying jbd2 journal.
         * Since we could still access attr_journal_task attribute via sysfs
         * path which could have sbi->s_journal->j_task as NULL
         * Unregister sysfs before flush sbi->s_sb_upd_work.
         * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If
         * read metadata verify failed then will queue error work.
         * update_super_work will call start_this_handle may trigger
         * BUG_ON.
         */
        ext4_unregister_sysfs(sb);

        if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount"))
                ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.",
                         &sb->s_uuid);

        ext4_unregister_li_request(sb);
        ext4_quotas_off(sb, EXT4_MAXQUOTAS);

        /* Let any pending superblock-update work finish before teardown */
        flush_work(&sbi->s_sb_upd_work);
        destroy_workqueue(sbi->rsv_conversion_wq);
        ext4_release_orphan_info(sb);

        if (sbi->s_journal) {
                /* Remember the abort state; the journal is gone after destroy */
                aborted = is_journal_aborted(sbi->s_journal);
                err = jbd2_journal_destroy(sbi->s_journal);
                sbi->s_journal = NULL;
                if ((err < 0) && !aborted) {
                        ext4_abort(sb, -err, "Couldn't clean up the journal");
                }
        }

        ext4_es_unregister_shrinker(sbi);
        timer_shutdown_sync(&sbi->s_err_report);
        ext4_release_system_zone(sb);
        ext4_mb_release(sb);
        ext4_ext_release(sb);

        /*
         * On a clean (writable, non-aborted) unmount, clear the recovery and
         * orphan-present features and store the mount state in the on-disk
         * superblock image.
         */
        if (!sb_rdonly(sb) && !aborted) {
                ext4_clear_feature_journal_needs_recovery(sb);
                ext4_clear_feature_orphan_present(sb);
                es->s_state = cpu_to_le16(sbi->s_mount_state);
        }
        if (!sb_rdonly(sb))
                ext4_commit_super(sb);

        ext4_group_desc_free(sbi);
        ext4_flex_groups_free(sbi);
        ext4_percpu_param_destroy(sbi);
#ifdef CONFIG_QUOTA
        for (int i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(get_qf_name(sb, sbi, i));
#endif

        /* Debugging code just in case the in-memory inode orphan list
         * isn't empty.  The on-disk one can be non-empty if we've
         * detected an error and taken the fs readonly, but the
         * in-memory list had better be clean by this point. */
        if (!list_empty(&sbi->s_orphan))
                dump_orphan_list(sb, sbi);
        ASSERT(list_empty(&sbi->s_orphan));

        sync_blockdev(sb->s_bdev);
        invalidate_bdev(sb->s_bdev);
        if (sbi->s_journal_bdev_file) {
                /*
                 * Invalidate the journal device's buffers.  We don't want them
                 * floating about in memory - the physical journal device may
                 * be hotswapped, and it breaks the `ro-after' testing code.
                 */
                sync_blockdev(file_bdev(sbi->s_journal_bdev_file));
                invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
        }

        ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
        sbi->s_ea_inode_cache = NULL;

        ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
        sbi->s_ea_block_cache = NULL;

        ext4_stop_mmpd(sbi);

        brelse(sbi->s_sbh);
        sb->s_fs_info = NULL;
        /*
         * Now that we are completely done shutting down the
         * superblock, we need to actually destroy the kobject.
         */
        kobject_put(&sbi->s_kobj);
        /* Wait until the kobject signals s_kobj_unregister before freeing sbi */
        wait_for_completion(&sbi->s_kobj_unregister);
        if (sbi->s_chksum_driver)
                crypto_free_shash(sbi->s_chksum_driver);
        kfree(sbi->s_blockgroup_lock);
        fs_put_dax(sbi->s_daxdev, NULL);
        fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#if IS_ENABLED(CONFIG_UNICODE)
        utf8_unload(sb->s_encoding);
#endif
        kfree(sbi);
}

/*
 * Slab cache for struct ext4_inode_info objects; created by
 * init_inodecache() and destroyed by destroy_inodecache().
 */
static struct kmem_cache *ext4_inode_cachep;

/*
 * Allocate an ext4 in-core inode from ext4_inode_cachep and reset its
 * per-use state.  Returns the embedded VFS inode, or NULL on allocation
 * failure.
 *
 * Called inside transaction, so use GFP_NOFS
 */
static struct inode *ext4_alloc_inode(struct super_block *sb)
{
        struct ext4_inode_info *ei;
        struct inode *vfs_inode;

        ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS);
        if (!ei)
                return NULL;
        vfs_inode = &ei->vfs_inode;

        inode_set_iversion(vfs_inode, 1);
        ei->i_flags = 0;
        spin_lock_init(&ei->i_raw_lock);

        /* Preallocation state */
        ei->i_prealloc_node = RB_ROOT;
        atomic_set(&ei->i_prealloc_active, 0);
        rwlock_init(&ei->i_prealloc_lock);

        /* Extent status tree and its shrinker bookkeeping */
        ext4_es_init_tree(&ei->i_es_tree);
        rwlock_init(&ei->i_es_lock);
        INIT_LIST_HEAD(&ei->i_es_list);
        ei->i_es_all_nr = 0;
        ei->i_es_shk_nr = 0;
        ei->i_es_shrink_lblk = 0;

        /* Reserved-block accounting */
        ei->i_reserved_data_blocks = 0;
        spin_lock_init(&(ei->i_block_reservation_lock));
        ext4_init_pending_tree(&ei->i_pending_tree);
#ifdef CONFIG_QUOTA
        ei->i_reserved_quota = 0;
        memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
#endif

        /* Journalling and I/O completion state */
        ei->jinode = NULL;
        INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
        spin_lock_init(&ei->i_completed_io_lock);
        ei->i_sync_tid = 0;
        ei->i_datasync_tid = 0;
        atomic_set(&ei->i_unwritten, 0);
        INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);

        /* Fast-commit state */
        ext4_fc_init_inode(vfs_inode);
        mutex_init(&ei->i_fc_lock);

        return vfs_inode;
}

/*
 * ->drop_inode handler: ask the generic helper first and, only if it says
 * to keep the inode, let fscrypt decide.  The final verdict is traced and
 * returned.
 */
static int ext4_drop_inode(struct inode *inode)
{
        int verdict;

        verdict = generic_drop_inode(inode);
        if (verdict == 0)
                verdict = fscrypt_drop_inode(inode);

        trace_ext4_drop_inode(inode, verdict);
        return verdict;
}

/*
 * ->free_inode handler: release fscrypt state and return the in-core inode
 * to ext4_inode_cachep.  A warning is logged if the inode is still linked
 * on a fast-commit list at this point.
 */
static void ext4_free_in_core_inode(struct inode *inode)
{
        fscrypt_free_inode(inode);
        if (!list_empty(&(EXT4_I(inode)->i_fc_list))) {
                /*
                 * The inode number is printed with %lu, as in every other
                 * message in this file, and the message is newline-terminated
                 * so it is emitted as a complete log line.
                 */
                pr_warn("%s: inode %lu still in fc list\n",
                        __func__, inode->i_ino);
        }
        kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
}

static void ext4_destroy_inode(struct inode *inode)
{
        if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
                ext4_msg(inode->i_sb, KERN_ERR,
                         "Inode %lu (%p): orphan list check failed!",
                         inode->i_ino, EXT4_I(inode));
                print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
                                EXT4_I(inode), sizeof(struct ext4_inode_info),
                                true);
                dump_stack();
        }

        if (EXT4_I(inode)->i_reserved_data_blocks)
                ext4_msg(inode->i_sb, KERN_ERR,
                         "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
                         inode->i_ino, EXT4_I(inode),
                         EXT4_I(inode)->i_reserved_data_blocks);
}

/*
 * Shutdown callback: force the filesystem down with the
 * EXT4_GOING_FLAGS_NOLOGFLUSH flag.
 */
static void ext4_shutdown(struct super_block *sb)
{
       ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH);
}

/*
 * Slab constructor passed to kmem_cache_create_usercopy() in
 * init_inodecache(): initialises the orphan list head, the xattr and data
 * semaphores, the embedded VFS inode and the fast-commit state of the
 * object at @foo.
 */
static void init_once(void *foo)
{
        struct ext4_inode_info *ei = foo;

        INIT_LIST_HEAD(&ei->i_orphan);
        init_rwsem(&ei->xattr_sem);
        init_rwsem(&ei->i_data_sem);
        inode_init_once(&ei->vfs_inode);
        ext4_fc_init_inode(&ei->vfs_inode);
}

/*
 * Create the ext4 inode slab cache.  Only the i_data field of
 * struct ext4_inode_info is whitelisted for usercopy.
 *
 * Returns 0 on success or -ENOMEM if the cache could not be created.
 */
static int __init init_inodecache(void)
{
        ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
                                sizeof(struct ext4_inode_info), 0,
                                SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
                                offsetof(struct ext4_inode_info, i_data),
                                sizeof_field(struct ext4_inode_info, i_data),
                                init_once);

        return ext4_inode_cachep ? 0 : -ENOMEM;
}

/* Destroy the ext4 inode slab cache created by init_inodecache(). */
static void destroy_inodecache(void)
{
        /*
         * Make sure all delayed rcu free inodes are flushed before we
         * destroy cache.
         */
        rcu_barrier();
        kmem_cache_destroy(ext4_inode_cachep);
}

void ext4_clear_inode(struct inode *inode)
{
        ext4_fc_del(inode);
        invalidate_inode_buffers(inode);
        clear_inode(inode);
        ext4_discard_preallocations(inode);
        ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
        dquot_drop(inode);
        if (EXT4_I(inode)->jinode) {
                jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
                                               EXT4_I(inode)->jinode);
                jbd2_free_inode(EXT4_I(inode)->jinode);
                EXT4_I(inode)->jinode = NULL;
        }
        fscrypt_put_encryption_info(inode);
        fsverity_cleanup_inode(inode);
}

/*
 * Look up an inode for file-handle decoding.
 *
 * Returns the inode on success, the ext4_iget() error on lookup failure,
 * or ERR_PTR(-ESTALE) when a non-zero @generation does not match the
 * inode's generation.
 */
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
                                        u64 ino, u32 generation)
{
        struct inode *found;

        found = ext4_iget(sb, ino, EXT4_IGET_HANDLE);
        if (IS_ERR(found))
                return ERR_CAST(found);

        /*
         * Currently we don't know the generation for parent directory, so
         * a generation of 0 means "accept any"
         */
        if (!generation || found->i_generation == generation)
                return found;

        iput(found);
        return ERR_PTR(-ESTALE);
}

/*
 * Decode a file handle into a dentry by delegating to
 * generic_fh_to_dentry() with ext4_nfs_get_inode() as the inode lookup
 * callback.
 */
static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
                                        int fh_len, int fh_type)
{
        return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
                                    ext4_nfs_get_inode);
}

/*
 * Decode the parent part of a file handle by delegating to
 * generic_fh_to_parent() with ext4_nfs_get_inode() as the inode lookup
 * callback.
 */
static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
                                        int fh_len, int fh_type)
{
        return generic_fh_to_parent(sb, fid, fh_len, fh_type,
                                    ext4_nfs_get_inode);
}

static int ext4_nfs_commit_metadata(struct inode *inode)
{
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL
        };

        trace_ext4_nfs_commit_metadata(inode);
        return ext4_write_inode(inode, &wbc);
}

#ifdef CONFIG_QUOTA
/* Quota type names, indexed by quota type; QTYPE2NAME() looks one up. */
static const char * const quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])

/*
 * Forward declarations of the ext4 quota callbacks referenced by the
 * operation tables below.
 */
static int ext4_write_dquot(struct dquot *dquot);
static int ext4_acquire_dquot(struct dquot *dquot);
static int ext4_release_dquot(struct dquot *dquot);
static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
                         const struct path *path);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
                               size_t len, loff_t off);
static ssize_t ext4_quota_write(struct super_block *sb, int type,
                                const char *data, size_t len, loff_t off);
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
                             unsigned int flags);

/*
 * get_dquots hook of ext4_sops: hand back the i_dquot array stored in the
 * ext4-private part of the inode (as obtained through EXT4_I()).
 */
static struct dquot __rcu **ext4_get_dquots(struct inode *inode)
{
        struct dquot __rcu **dquots = EXT4_I(inode)->i_dquot;

        return dquots;
}

/*
 * dquot operations for ext4.  Entries named ext4_* are ext4's own
 * implementations; alloc_dquot, destroy_dquot and get_next_id use the
 * dquot_* helpers directly.
 */
static const struct dquot_operations ext4_quota_operations = {
        .get_reserved_space        = ext4_get_reserved_space,
        .write_dquot                = ext4_write_dquot,
        .acquire_dquot                = ext4_acquire_dquot,
        .release_dquot                = ext4_release_dquot,
        .mark_dirty                = ext4_mark_dquot_dirty,
        .write_info                = ext4_write_info,
        .alloc_dquot                = dquot_alloc,
        .destroy_dquot                = dquot_destroy,
        .get_projid                = ext4_get_projid,
        .get_inode_usage        = ext4_get_inode_usage,
        .get_next_id                = dquot_get_next_id,
};

/*
 * quotactl operations for ext4.  Only quota_on and quota_off are ext4
 * specific; every other entry points at a dquot_* helper.
 */
static const struct quotactl_ops ext4_qctl_operations = {
        .quota_on        = ext4_quota_on,
        .quota_off        = ext4_quota_off,
        .quota_sync        = dquot_quota_sync,
        .get_state        = dquot_get_state,
        .set_info        = dquot_set_dqinfo,
        .get_dqblk        = dquot_get_dqblk,
        .set_dqblk        = dquot_set_dqblk,
        .get_nextdqblk        = dquot_get_next_dqblk,
};
#endif

/*
 * Superblock operations for ext4.  The quota_read, quota_write and
 * get_dquots hooks are only present when CONFIG_QUOTA is enabled.
 */
static const struct super_operations ext4_sops = {
        .alloc_inode        = ext4_alloc_inode,
        .free_inode        = ext4_free_in_core_inode,
        .destroy_inode        = ext4_destroy_inode,
        .write_inode        = ext4_write_inode,
        .dirty_inode        = ext4_dirty_inode,
        .drop_inode        = ext4_drop_inode,
        .evict_inode        = ext4_evict_inode,
        .put_super        = ext4_put_super,
        .sync_fs        = ext4_sync_fs,
        .freeze_fs        = ext4_freeze,
        .unfreeze_fs        = ext4_unfreeze,
        .statfs                = ext4_statfs,
        .show_options        = ext4_show_options,
        .shutdown        = ext4_shutdown,
#ifdef CONFIG_QUOTA
        .quota_read        = ext4_quota_read,
        .quota_write        = ext4_quota_write,
        .get_dquots        = ext4_get_dquots,
#endif
};

/*
 * Export (file handle) operations for ext4.  File handles are encoded by
 * generic_encode_ino32_fh and decoded by the ext4_fh_to_* wrappers;
 * commit_metadata writes the inode out via ext4_nfs_commit_metadata.
 */
static const struct export_operations ext4_export_ops = {
        .encode_fh = generic_encode_ino32_fh,
        .fh_to_dentry = ext4_fh_to_dentry,
        .fh_to_parent = ext4_fh_to_parent,
        .get_parent = ext4_get_parent,
        .commit_metadata = ext4_nfs_commit_metadata,
};

/*
 * Mount option tokens.
 *
 * These values are used as the option id in ext4_param_specs[], as the
 * lookup key (token) in ext4_mount_opts[], and as the case labels in
 * ext4_parse_param().  Opt_data_err_abort/ignore and Opt_dax_always/inode/
 * never additionally serve as the values of the "data_err" and "dax" enum
 * parameters.  Opt_err terminates ext4_mount_opts[].
 *
 * The two Opt_fc_debug_* tokens only exist with CONFIG_EXT4_DEBUG.
 */
enum {
        Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
        Opt_resgid, Opt_resuid, Opt_sb,
        Opt_nouid32, Opt_debug, Opt_removed,
        Opt_user_xattr, Opt_acl,
        Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
        Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
        Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
        Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
        Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
        Opt_inlinecrypt,
        Opt_usrjquota, Opt_grpjquota, Opt_quota,
        Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
        Opt_usrquota, Opt_grpquota, Opt_prjquota,
        Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
        Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
        Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize,
        Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
        Opt_inode_readahead_blks, Opt_journal_ioprio,
        Opt_dioread_nolock, Opt_dioread_lock,
        Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
        Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
        Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
        Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
#ifdef CONFIG_EXT4_DEBUG
        Opt_fc_debug_max_replay, Opt_fc_debug_force
#endif
};

/*
 * Accepted values of the "errors=" option, mapped to the EXT4_MOUNT_ERRORS_*
 * bit that ext4_parse_param() sets after clearing EXT4_MOUNT_ERRORS_MASK.
 */
static const struct constant_table ext4_param_errors[] = {
        {"continue",        EXT4_MOUNT_ERRORS_CONT},
        {"panic",        EXT4_MOUNT_ERRORS_PANIC},
        {"remount-ro",        EXT4_MOUNT_ERRORS_RO},
        {}
};

/*
 * Accepted values of the "data=" option, mapped to the EXT4_MOUNT_*_DATA
 * bit that ext4_parse_param() sets after clearing EXT4_MOUNT_DATA_FLAGS.
 */
static const struct constant_table ext4_param_data[] = {
        {"journal",        EXT4_MOUNT_JOURNAL_DATA},
        {"ordered",        EXT4_MOUNT_ORDERED_DATA},
        {"writeback",        EXT4_MOUNT_WRITEBACK_DATA},
        {}
};

/*
 * Accepted values of the "data_err=" option.  The mapped values are Opt_*
 * tokens, which ext4_parse_param() compares against to decide whether to
 * set or clear the option's mount_opt bit.
 */
static const struct constant_table ext4_param_data_err[] = {
        {"abort",        Opt_data_err_abort},
        {"ignore",        Opt_data_err_ignore},
        {}
};

/*
 * Accepted values of the "jqfmt=" option, mapped to the QFMT_* quota format
 * identifier that ext4_parse_param() stores in s_jquota_fmt.
 */
static const struct constant_table ext4_param_jqfmt[] = {
        {"vfsold",        QFMT_VFS_OLD},
        {"vfsv0",        QFMT_VFS_V0},
        {"vfsv1",        QFMT_VFS_V1},
        {}
};

/*
 * Accepted values of the "dax=" option.  The mapped values are Opt_dax_*
 * tokens, which ext4_parse_param() switches on when CONFIG_FS_DAX is set.
 */
static const struct constant_table ext4_param_dax[] = {
        {"always",        Opt_dax_always},
        {"inode",        Opt_dax_inode},
        {"never",        Opt_dax_never},
        {}
};

/*
 * Mount option specification
 * We don't use fsparam_flag_no because of the way we set the
 * options and the way we show them in _ext4_show_options(). To
 * keep the changes to a minimum, let's keep the negative options
 * separate for now.
 *
 * Each entry maps an option name to an Opt_* token; this table is what
 * ext4_parse_param() passes to fs_parse().  Options that have been dropped
 * map to Opt_removed, for which ext4_parse_param() only logs a warning.
 *
 * Some names ("barrier", "dax", "auto_da_alloc", "init_itable",
 * "test_dummy_encryption") are listed twice with different parameter
 * types -- presumably so the option is accepted both bare and with a
 * value; NOTE(review): confirm against the fs_parser matching rules.
 *
 * The table is terminated by an empty entry.
 */
static const struct fs_parameter_spec ext4_param_specs[] = {
        fsparam_flag        ("bsddf",                Opt_bsd_df),
        fsparam_flag        ("minixdf",                Opt_minix_df),
        fsparam_flag        ("grpid",                Opt_grpid),
        fsparam_flag        ("bsdgroups",                Opt_grpid),
        fsparam_flag        ("nogrpid",                Opt_nogrpid),
        fsparam_flag        ("sysvgroups",                Opt_nogrpid),
        fsparam_u32        ("resgid",                Opt_resgid),
        fsparam_u32        ("resuid",                Opt_resuid),
        fsparam_u32        ("sb",                        Opt_sb),
        fsparam_enum        ("errors",                Opt_errors, ext4_param_errors),
        fsparam_flag        ("nouid32",                Opt_nouid32),
        fsparam_flag        ("debug",                Opt_debug),
        fsparam_flag        ("oldalloc",                Opt_removed),
        fsparam_flag        ("orlov",                Opt_removed),
        fsparam_flag        ("user_xattr",                Opt_user_xattr),
        fsparam_flag        ("acl",                        Opt_acl),
        fsparam_flag        ("norecovery",                Opt_noload),
        fsparam_flag        ("noload",                Opt_noload),
        fsparam_flag        ("bh",                        Opt_removed),
        fsparam_flag        ("nobh",                Opt_removed),
        fsparam_u32        ("commit",                Opt_commit),
        fsparam_u32        ("min_batch_time",        Opt_min_batch_time),
        fsparam_u32        ("max_batch_time",        Opt_max_batch_time),
        fsparam_u32        ("journal_dev",                Opt_journal_dev),
        fsparam_bdev        ("journal_path",        Opt_journal_path),
        fsparam_flag        ("journal_checksum",        Opt_journal_checksum),
        fsparam_flag        ("nojournal_checksum",        Opt_nojournal_checksum),
        fsparam_flag        ("journal_async_commit",Opt_journal_async_commit),
        fsparam_flag        ("abort",                Opt_abort),
        fsparam_enum        ("data",                Opt_data, ext4_param_data),
        fsparam_enum        ("data_err",                Opt_data_err,
                                                ext4_param_data_err),
        fsparam_string_empty
                        ("usrjquota",                Opt_usrjquota),
        fsparam_string_empty
                        ("grpjquota",                Opt_grpjquota),
        fsparam_enum        ("jqfmt",                Opt_jqfmt, ext4_param_jqfmt),
        fsparam_flag        ("grpquota",                Opt_grpquota),
        fsparam_flag        ("quota",                Opt_quota),
        fsparam_flag        ("noquota",                Opt_noquota),
        fsparam_flag        ("usrquota",                Opt_usrquota),
        fsparam_flag        ("prjquota",                Opt_prjquota),
        fsparam_flag        ("barrier",                Opt_barrier),
        fsparam_u32        ("barrier",                Opt_barrier),
        fsparam_flag        ("nobarrier",                Opt_nobarrier),
        fsparam_flag        ("i_version",                Opt_removed),
        fsparam_flag        ("dax",                        Opt_dax),
        fsparam_enum        ("dax",                        Opt_dax_type, ext4_param_dax),
        fsparam_u32        ("stripe",                Opt_stripe),
        fsparam_flag        ("delalloc",                Opt_delalloc),
        fsparam_flag        ("nodelalloc",                Opt_nodelalloc),
        fsparam_flag        ("warn_on_error",        Opt_warn_on_error),
        fsparam_flag        ("nowarn_on_error",        Opt_nowarn_on_error),
        fsparam_u32        ("debug_want_extra_isize",
                                                Opt_debug_want_extra_isize),
        fsparam_flag        ("mblk_io_submit",        Opt_removed),
        fsparam_flag        ("nomblk_io_submit",        Opt_removed),
        fsparam_flag        ("block_validity",        Opt_block_validity),
        fsparam_flag        ("noblock_validity",        Opt_noblock_validity),
        fsparam_u32        ("inode_readahead_blks",
                                                Opt_inode_readahead_blks),
        fsparam_u32        ("journal_ioprio",        Opt_journal_ioprio),
        fsparam_u32        ("auto_da_alloc",        Opt_auto_da_alloc),
        fsparam_flag        ("auto_da_alloc",        Opt_auto_da_alloc),
        fsparam_flag        ("noauto_da_alloc",        Opt_noauto_da_alloc),
        fsparam_flag        ("dioread_nolock",        Opt_dioread_nolock),
        fsparam_flag        ("nodioread_nolock",        Opt_dioread_lock),
        fsparam_flag        ("dioread_lock",        Opt_dioread_lock),
        fsparam_flag        ("discard",                Opt_discard),
        fsparam_flag        ("nodiscard",                Opt_nodiscard),
        fsparam_u32        ("init_itable",                Opt_init_itable),
        fsparam_flag        ("init_itable",                Opt_init_itable),
        fsparam_flag        ("noinit_itable",        Opt_noinit_itable),
#ifdef CONFIG_EXT4_DEBUG
        fsparam_flag        ("fc_debug_force",        Opt_fc_debug_force),
        fsparam_u32        ("fc_debug_max_replay",        Opt_fc_debug_max_replay),
#endif
        fsparam_u32        ("max_dir_size_kb",        Opt_max_dir_size_kb),
        fsparam_flag        ("test_dummy_encryption",
                                                Opt_test_dummy_encryption),
        fsparam_string        ("test_dummy_encryption",
                                                Opt_test_dummy_encryption),
        fsparam_flag        ("inlinecrypt",                Opt_inlinecrypt),
        fsparam_flag        ("nombcache",                Opt_nombcache),
        fsparam_flag        ("no_mbcache",                Opt_nombcache),        /* for backward compatibility */
        fsparam_flag        ("prefetch_block_bitmaps",
                                                Opt_removed),
        fsparam_flag        ("no_prefetch_block_bitmaps",
                                                Opt_no_prefetch_block_bitmaps),
        fsparam_s32        ("mb_optimize_scan",        Opt_mb_optimize_scan),
        fsparam_string        ("check",                Opt_removed),        /* mount option from ext2/3 */
        fsparam_flag        ("nocheck",                Opt_removed),        /* mount option from ext2/3 */
        fsparam_flag        ("reservation",                Opt_removed),        /* mount option from ext2/3 */
        fsparam_flag        ("noreservation",        Opt_removed),        /* mount option from ext2/3 */
        fsparam_u32        ("journal",                Opt_removed),        /* mount option from ext2/3 */
        {}
};

/* Default journal I/O priority: best-effort class, level 3. */
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))

/*
 * Flag bits for the "flags" member of struct mount_opts.
 *
 * MOPT_SET / MOPT_CLEAR mark entries whose mount_opt bits are to be set or
 * cleared (compare e.g. the discard / nodiscard entries in
 * ext4_mount_opts[]).  MOPT_NOSUPPORT makes ext4_parse_param() log
 * "option not supported" and ignore the option.  MOPT_EXPLICIT makes
 * ext4_parse_param() additionally record an EXT4_MOUNT2_EXPLICIT_* bit.
 *
 * Without CONFIG_QUOTA, MOPT_Q and MOPT_QFMT collapse to MOPT_NOSUPPORT, so
 * all quota options are reported as unsupported.
 *
 * MOPT_NO_EXT2 / MOPT_NO_EXT3 presumably mark options that are rejected when
 * the filesystem is mounted as ext2 / ext3, and MOPT_2 is used in
 * ext4_mount_opts[] on entries whose mount_opt value is an EXT4_MOUNT2_*
 * bit -- NOTE(review): the code consuming these flags is outside this
 * excerpt; confirm there.
 */
#define MOPT_SET        0x0001
#define MOPT_CLEAR        0x0002
#define MOPT_NOSUPPORT        0x0004
#define MOPT_EXPLICIT        0x0008
#ifdef CONFIG_QUOTA
#define MOPT_Q                0
#define MOPT_QFMT        0x0010
#else
#define MOPT_Q                MOPT_NOSUPPORT
#define MOPT_QFMT        MOPT_NOSUPPORT
#endif
#define MOPT_NO_EXT2        0x0020
#define MOPT_NO_EXT3        0x0040
#define MOPT_EXT4_ONLY        (MOPT_NO_EXT2 | MOPT_NO_EXT3)
#define MOPT_SKIP        0x0080
#define        MOPT_2                0x0100

/*
 * Per-token properties of mount options.
 *
 *   token     - Opt_* identifier this entry describes
 *   mount_opt - EXT4_MOUNT_* / EXT4_MOUNT2_* bit(s) tied to the option,
 *               or 0 if the option carries no such bit
 *   flags     - combination of MOPT_* flags
 *
 * ext4_parse_param() scans this table linearly for the parsed token and
 * stops at the terminating Opt_err entry; a token that is not listed
 * therefore ends up with the terminator's mount_opt and flags of 0.
 */
static const struct mount_opts {
        int        token;
        int        mount_opt;
        int        flags;
} ext4_mount_opts[] = {
        {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
        {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
        {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
        {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
        {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
        {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
        {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
         MOPT_EXT4_ONLY | MOPT_SET},
        {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
         MOPT_EXT4_ONLY | MOPT_CLEAR},
        {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
        {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
        {Opt_delalloc, EXT4_MOUNT_DELALLOC,
         MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
        {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
         MOPT_EXT4_ONLY | MOPT_CLEAR},
        {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
        {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
        {Opt_commit, 0, MOPT_NO_EXT2},
        {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
         MOPT_EXT4_ONLY | MOPT_CLEAR},
        {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
         MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
        {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
                                    EXT4_MOUNT_JOURNAL_CHECKSUM),
         MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
        {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
        {Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2},
        {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
        {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
        {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
        {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
        {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
        {Opt_dax_type, 0, MOPT_EXT4_ONLY},
        {Opt_journal_dev, 0, MOPT_NO_EXT2},
        {Opt_journal_path, 0, MOPT_NO_EXT2},
        {Opt_journal_ioprio, 0, MOPT_NO_EXT2},
        {Opt_data, 0, MOPT_NO_EXT2},
        {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
#ifdef CONFIG_EXT4_FS_POSIX_ACL
        {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
#else
        {Opt_acl, 0, MOPT_NOSUPPORT},
#endif
        {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
        {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
        {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
        {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
                                                        MOPT_SET | MOPT_Q},
        {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
                                                        MOPT_SET | MOPT_Q},
        {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA,
                                                        MOPT_SET | MOPT_Q},
        {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
                       EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
                                                        MOPT_CLEAR | MOPT_Q},
        {Opt_usrjquota, 0, MOPT_Q},
        {Opt_grpjquota, 0, MOPT_Q},
        {Opt_jqfmt, 0, MOPT_QFMT},
        {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
        {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS,
         MOPT_SET},
#ifdef CONFIG_EXT4_DEBUG
        {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
         MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
#endif
        {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2},
        {Opt_err, 0, 0}
};

#if IS_ENABLED(CONFIG_UNICODE)
/*
 * Known filename encodings.
 *
 *   magic   - value compared against the superblock's s_encoding field by
 *             ext4_sb_read_encoding()
 *   name    - encoding name
 *   version - encoding version, built with UNICODE_AGE()
 */
static const struct ext4_sb_encodings {
        __u16 magic;
        char *name;
        unsigned int version;
} ext4_sb_encoding_map[] = {
        {EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)},
};

static const struct ext4_sb_encodings *
ext4_sb_read_encoding(const struct ext4_super_block *es)
{
        __u16 magic = le16_to_cpu(es->s_encoding);
        int i;

        for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++)
                if (magic == ext4_sb_encoding_map[i].magic)
                        return &ext4_sb_encoding_map[i];

        return NULL;
}
#endif

/*
 * Bits for the "spec" member of struct ext4_fs_context.  ext4_parse_param()
 * ORs in the matching bit whenever it stores the corresponding option value
 * in the context, so that a set bit means the value was given explicitly.
 * Bit 6 is not assigned.
 */
#define EXT4_SPEC_JQUOTA                        (1 <<  0)
#define EXT4_SPEC_JQFMT                                (1 <<  1)
#define EXT4_SPEC_DATAJ                                (1 <<  2)
#define EXT4_SPEC_SB_BLOCK                        (1 <<  3)
#define EXT4_SPEC_JOURNAL_DEV                        (1 <<  4)
#define EXT4_SPEC_JOURNAL_IOPRIO                (1 <<  5)
#define EXT4_SPEC_s_want_extra_isize                (1 <<  7)
#define EXT4_SPEC_s_max_batch_time                (1 <<  8)
#define EXT4_SPEC_s_min_batch_time                (1 <<  9)
#define EXT4_SPEC_s_inode_readahead_blks        (1 << 10)
#define EXT4_SPEC_s_li_wait_mult                (1 << 11)
#define EXT4_SPEC_s_max_dir_size_kb                (1 << 12)
#define EXT4_SPEC_s_stripe                        (1 << 13)
#define EXT4_SPEC_s_resuid                        (1 << 14)
#define EXT4_SPEC_s_resgid                        (1 << 15)
#define EXT4_SPEC_s_commit_interval                (1 << 16)
#define EXT4_SPEC_s_fc_debug_max_replay                (1 << 17)
#define EXT4_SPEC_s_sb_block                        (1 << 18)
#define EXT4_SPEC_mb_optimize_scan                (1 << 19)

/*
 * Per-mount parsing state, allocated by ext4_init_fs_context(), stored in
 * fc->fs_private and released by ext4_fc_free().  ext4_parse_param() fills
 * it in one option at a time.
 *
 * The vals_s_* and mask_s_* members come in pairs that are updated through
 * the ctx_set_* and ctx_clear_* helpers: mask_* records which bits an option
 * touched, vals_* holds the value those bits should take.
 */
struct ext4_fs_context {
        /* Journaled quota file names, one per quota type; owned (kfree'd) */
        char                *s_qf_names[EXT4_MAXQUOTAS];
        struct fscrypt_dummy_policy dummy_enc_policy;
        int                s_jquota_fmt;        /* Format of quota to use */
#ifdef CONFIG_EXT4_DEBUG
        int s_fc_debug_max_replay;
#endif
        /* Bit (1 << qtype) set when s_qf_names[qtype] was set or cleared */
        unsigned short        qname_spec;
        unsigned long        vals_s_flags;        /* Bits to set in s_flags */
        unsigned long        mask_s_flags;        /* Bits changed in s_flags */
        unsigned long        journal_devnum;
        /* Commit interval; stored as HZ * the "commit=" value */
        unsigned long        s_commit_interval;
        unsigned long        s_stripe;
        unsigned int        s_inode_readahead_blks;
        unsigned int        s_want_extra_isize;
        unsigned int        s_li_wait_mult;
        unsigned int        s_max_dir_size_kb;
        unsigned int        journal_ioprio;
        unsigned int        vals_s_mount_opt;
        unsigned int        mask_s_mount_opt;
        unsigned int        vals_s_mount_opt2;
        unsigned int        mask_s_mount_opt2;
        unsigned int        opt_flags;        /* MOPT flags */
        /* EXT4_SPEC_* bits: which of the values here were given explicitly */
        unsigned int        spec;
        u32                s_max_batch_time;
        u32                s_min_batch_time;
        kuid_t                s_resuid;
        kgid_t                s_resgid;
        ext4_fsblk_t        s_sb_block;
};

static void ext4_fc_free(struct fs_context *fc)
{
        struct ext4_fs_context *ctx = fc->fs_private;
        int i;

        if (!ctx)
                return;

        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(ctx->s_qf_names[i]);

        fscrypt_free_dummy_policy(&ctx->dummy_enc_policy);
        kfree(ctx);
}

int ext4_init_fs_context(struct fs_context *fc)
{
        struct ext4_fs_context *ctx;

        ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        fc->fs_private = ctx;
        fc->ops = &ext4_context_ops;

        return 0;
}

#ifdef CONFIG_QUOTA
/*
 * Remember the journaled quota file name given for quota type @qtype.
 *
 * The name must be non-empty and must not contain a '/'.  Repeating a name
 * identical to the one already stored is accepted and changes nothing; a
 * different name is rejected.  On first use a NUL-terminated copy of the
 * name is stored in the context and the option is marked as specified.
 *
 * Returns 0 on success, -EINVAL for an invalid or conflicting name and
 * -ENOMEM if the copy cannot be allocated.
 */
static int note_qf_name(struct fs_context *fc, int qtype,
                       struct fs_parameter *param)
{
        struct ext4_fs_context *ctx = fc->fs_private;
        const char *current_name;
        char *copy;

        if (param->size < 1) {
                ext4_msg(NULL, KERN_ERR, "Missing quota name");
                return -EINVAL;
        }

        if (strchr(param->string, '/') != NULL) {
                ext4_msg(NULL, KERN_ERR,
                         "quotafile must be on filesystem root");
                return -EINVAL;
        }

        current_name = ctx->s_qf_names[qtype];
        if (current_name != NULL) {
                /* Same name again: nothing to do */
                if (strcmp(current_name, param->string) == 0)
                        return 0;

                ext4_msg(NULL, KERN_ERR,
                         "%s quota file already specified",
                         QTYPE2NAME(qtype));
                return -EINVAL;
        }

        copy = kmemdup_nul(param->string, param->size, GFP_KERNEL);
        if (copy == NULL) {
                ext4_msg(NULL, KERN_ERR,
                         "Not enough memory for storing quotafile name");
                return -ENOMEM;
        }

        ctx->s_qf_names[qtype] = copy;
        ctx->spec |= EXT4_SPEC_JQUOTA;
        ctx->qname_spec |= 1 << qtype;

        return 0;
}

/*
 * Forget the journaled quota file name of quota type @qtype: free any
 * stored name, reset the slot to NULL and mark the option as specified so
 * that the cleared state is recorded.  Always returns 0.
 */
static int unnote_qf_name(struct fs_context *fc, int qtype)
{
        struct ext4_fs_context *ctx = fc->fs_private;
        char **slot = &ctx->s_qf_names[qtype];

        kfree(*slot);
        *slot = NULL;

        ctx->spec |= EXT4_SPEC_JQUOTA;
        ctx->qname_spec |= 1 << qtype;

        return 0;
}
#endif

/*
 * Handle the test_dummy_encryption mount option.
 *
 * Without CONFIG_FS_ENCRYPTION the option is rejected with -EINVAL.
 * Otherwise the parameter is passed to
 * fscrypt_parse_test_dummy_encryption(), which fills in the dummy policy of
 * the context.  Its -EINVAL and -EEXIST results are logged; -EEXIST is
 * reported to the caller as -EINVAL, every other result is returned as is.
 */
static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param,
                                            struct ext4_fs_context *ctx)
{
        int err;

        if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) {
                ext4_msg(NULL, KERN_WARNING,
                         "test_dummy_encryption option not supported");
                return -EINVAL;
        }

        err = fscrypt_parse_test_dummy_encryption(param,
                                                  &ctx->dummy_enc_policy);
        switch (err) {
        case -EINVAL:
                ext4_msg(NULL, KERN_WARNING,
                         "Value of option \"%s\" is unrecognized", param->key);
                break;
        case -EEXIST:
                ext4_msg(NULL, KERN_WARNING,
                         "Conflicting test_dummy_encryption options");
                err = -EINVAL;
                break;
        default:
                break;
        }

        return err;
}

/*
 * Generators for small accessors on the vals_s_<name> / mask_s_<name>
 * member pairs of struct ext4_fs_context.
 *
 * EXT4_SET_CTX(name) defines ctx_set_<name>(ctx, flag): records flag in the
 * mask and turns it on in the values.
 */
#define EXT4_SET_CTX(name)                                                \
static inline void ctx_set_##name(struct ext4_fs_context *ctx,                \
                                  unsigned long flag)                        \
{                                                                        \
        ctx->mask_s_##name |= flag;                                        \
        ctx->vals_s_##name |= flag;                                        \
}

/*
 * EXT4_CLEAR_CTX(name) defines ctx_clear_<name>(ctx, flag): records flag in
 * the mask and turns it off in the values.
 */
#define EXT4_CLEAR_CTX(name)                                                \
static inline void ctx_clear_##name(struct ext4_fs_context *ctx,        \
                                    unsigned long flag)                        \
{                                                                        \
        ctx->mask_s_##name |= flag;                                        \
        ctx->vals_s_##name &= ~flag;                                        \
}

/*
 * EXT4_TEST_CTX(name) defines ctx_test_<name>(ctx, flag): returns the bits
 * of flag that are currently on in the values (the mask is not consulted).
 */
#define EXT4_TEST_CTX(name)                                                \
static inline unsigned long                                                \
ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag)        \
{                                                                        \
        return (ctx->vals_s_##name & flag);                                \
}

/* Instantiate the accessors for s_flags, s_mount_opt and s_mount_opt2. */
EXT4_SET_CTX(flags); /* set only */
EXT4_SET_CTX(mount_opt);
EXT4_CLEAR_CTX(mount_opt);
EXT4_TEST_CTX(mount_opt);
EXT4_SET_CTX(mount_opt2);
EXT4_CLEAR_CTX(mount_opt2);
EXT4_TEST_CTX(mount_opt2);

static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
        struct ext4_fs_context *ctx = fc->fs_private;
        struct fs_parse_result result;
        const struct mount_opts *m;
        int is_remount;
        kuid_t uid;
        kgid_t gid;
        int token;

        token = fs_parse(fc, ext4_param_specs, param, &result);
        if (token < 0)
                return token;
        is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;

        for (m = ext4_mount_opts; m->token != Opt_err; m++)
                if (token == m->token)
                        break;

        ctx->opt_flags |= m->flags;

        if (m->flags & MOPT_EXPLICIT) {
                if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
                        ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC);
                } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
                        ctx_set_mount_opt2(ctx,
                                       EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM);
                } else
                        return -EINVAL;
        }

        if (m->flags & MOPT_NOSUPPORT) {
                ext4_msg(NULL, KERN_ERR, "%s option not supported",
                         param->key);
                return 0;
        }

        switch (token) {
#ifdef CONFIG_QUOTA
        case Opt_usrjquota:
                if (!*param->string)
                        return unnote_qf_name(fc, USRQUOTA);
                else
                        return note_qf_name(fc, USRQUOTA, param);
        case Opt_grpjquota:
                if (!*param->string)
                        return unnote_qf_name(fc, GRPQUOTA);
                else
                        return note_qf_name(fc, GRPQUOTA, param);
#endif
        case Opt_sb:
                if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
                        ext4_msg(NULL, KERN_WARNING,
                                 "Ignoring %s option on remount", param->key);
                } else {
                        ctx->s_sb_block = result.uint_32;
                        ctx->spec |= EXT4_SPEC_s_sb_block;
                }
                return 0;
        case Opt_removed:
                ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option",
                         param->key);
                return 0;
        case Opt_inlinecrypt:
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
                ctx_set_flags(ctx, SB_INLINECRYPT);
#else
                ext4_msg(NULL, KERN_ERR, "inline encryption not supported");
#endif
                return 0;
        case Opt_errors:
                ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK);
                ctx_set_mount_opt(ctx, result.uint_32);
                return 0;
#ifdef CONFIG_QUOTA
        case Opt_jqfmt:
                ctx->s_jquota_fmt = result.uint_32;
                ctx->spec |= EXT4_SPEC_JQFMT;
                return 0;
#endif
        case Opt_data:
                ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
                ctx_set_mount_opt(ctx, result.uint_32);
                ctx->spec |= EXT4_SPEC_DATAJ;
                return 0;
        case Opt_commit:
                if (result.uint_32 == 0)
                        result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE;
                else if (result.uint_32 > INT_MAX / HZ) {
                        ext4_msg(NULL, KERN_ERR,
                                 "Invalid commit interval %d, "
                                 "must be smaller than %d",
                                 result.uint_32, INT_MAX / HZ);
                        return -EINVAL;
                }
                ctx->s_commit_interval = HZ * result.uint_32;
                ctx->spec |= EXT4_SPEC_s_commit_interval;
                return 0;
        case Opt_debug_want_extra_isize:
                if ((result.uint_32 & 1) || (result.uint_32 < 4)) {
                        ext4_msg(NULL, KERN_ERR,
                                 "Invalid want_extra_isize %d", result.uint_32);
                        return -EINVAL;
                }
                ctx->s_want_extra_isize = result.uint_32;
                ctx->spec |= EXT4_SPEC_s_want_extra_isize;
                return 0;
        case Opt_max_batch_time:
                ctx->s_max_batch_time = result.uint_32;
                ctx->spec |= EXT4_SPEC_s_max_batch_time;
                return 0;
        case Opt_min_batch_time:
                ctx->s_min_batch_time = result.uint_32;
                ctx->spec |= EXT4_SPEC_s_min_batch_time;
                return 0;
        case Opt_inode_readahead_blks:
                if (result.uint_32 &&
                    (result.uint_32 > (1 << 30) ||
                     !is_power_of_2(result.uint_32))) {
                        ext4_msg(NULL, KERN_ERR,
                                 "EXT4-fs: inode_readahead_blks must be "
                                 "0 or a power of 2 smaller than 2^31");
                        return -EINVAL;
                }
                ctx->s_inode_readahead_blks = result.uint_32;
                ctx->spec |= EXT4_SPEC_s_inode_readahead_blks;
                return 0;
        case Opt_init_itable:
                ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE);
                ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
                if (param->type == fs_value_is_string)
                        ctx->s_li_wait_mult = result.uint_32;
                ctx->spec |= EXT4_SPEC_s_li_wait_mult;
                return 0;
        case Opt_max_dir_size_kb:
                ctx->s_max_dir_size_kb = result.uint_32;
                ctx->spec |= EXT4_SPEC_s_max_dir_size_kb;
                return 0;
#ifdef CONFIG_EXT4_DEBUG
        case Opt_fc_debug_max_replay:
                ctx->s_fc_debug_max_replay = result.uint_32;
                ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay;
                return 0;
#endif
        case Opt_stripe:
                ctx->s_stripe = result.uint_32;
                ctx->spec |= EXT4_SPEC_s_stripe;
                return 0;
        case Opt_resuid:
                uid = make_kuid(current_user_ns(), result.uint_32);
                if (!uid_valid(uid)) {
                        ext4_msg(NULL, KERN_ERR, "Invalid uid value %d",
                                 result.uint_32);
                        return -EINVAL;
                }
                ctx->s_resuid = uid;
                ctx->spec |= EXT4_SPEC_s_resuid;
                return 0;
        case Opt_resgid:
                gid = make_kgid(current_user_ns(), result.uint_32);
                if (!gid_valid(gid)) {
                        ext4_msg(NULL, KERN_ERR, "Invalid gid value %d",
                                 result.uint_32);
                        return -EINVAL;
                }
                ctx->s_resgid = gid;
                ctx->spec |= EXT4_SPEC_s_resgid;
                return 0;
        case Opt_journal_dev:
                if (is_remount) {
                        ext4_msg(NULL, KERN_ERR,
                                 "Cannot specify journal on remount");
                        return -EINVAL;
                }
                ctx->journal_devnum = result.uint_32;
                ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
                return 0;
        case Opt_journal_path:
        {
                struct inode *journal_inode;
                struct path path;
                int error;

                if (is_remount) {
                        ext4_msg(NULL, KERN_ERR,
                                 "Cannot specify journal on remount");
                        return -EINVAL;
                }

                error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path);
                if (error) {
                        ext4_msg(NULL, KERN_ERR, "error: could not find "
                                 "journal device path");
                        return -EINVAL;
                }

                journal_inode = d_inode(path.dentry);
                ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev);
                ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
                path_put(&path);
                return 0;
        }
        case Opt_journal_ioprio:
                if (result.uint_32 > 7) {
                        ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority"
                                 " (must be 0-7)");
                        return -EINVAL;
                }
                ctx->journal_ioprio =
                        IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32);
                ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO;
                return 0;
        case Opt_test_dummy_encryption:
                return ext4_parse_test_dummy_encryption(param, ctx);
        case Opt_dax:
        case Opt_dax_type:
#ifdef CONFIG_FS_DAX
        {
                int type = (token == Opt_dax) ?
                           Opt_dax : result.uint_32;

                switch (type) {
                case Opt_dax:
                case Opt_dax_always:
                        ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
                        ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
                        break;
                case Opt_dax_never:
                        ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
                        ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
                        break;
                case Opt_dax_inode:
                        ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
                        ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
                        /* Strictly for printing options */
                        ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE);
                        break;
                }
                return 0;
        }
#else
                ext4_msg(NULL, KERN_INFO, "dax option not supported");
                return -EINVAL;
#endif
        case Opt_data_err:
                if (result.uint_32 == Opt_data_err_abort)
                        ctx_set_mount_opt(ctx, m->mount_opt);
                else if (result.uint_32 == Opt_data_err_ignore)
                        ctx_clear_mount_opt(ctx, m->mount_opt);
                return 0;
        case Opt_mb_optimize_scan:
                if (result.int_32 == 1) {
                        ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
                        ctx->spec |= EXT4_SPEC_mb_optimize_scan;
                } else if (result.int_32 == 0) {
                        ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
                        ctx->spec |= EXT4_SPEC_mb_optimize_scan;
                } else {
                        ext4_msg(NULL, KERN_WARNING,
                                 "mb_optimize_scan should be set to 0 or 1.");
                        return -EINVAL;
                }
                return 0;
        }

        /*
         * At this point we should only be getting options requiring MOPT_SET,
         * or MOPT_CLEAR. Anything else is a bug
         */
        if (m->token == Opt_err) {
                ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s",
                         param->key);
                WARN_ON(1);
                return -EINVAL;
        }

        else {
                unsigned int set = 0;

                if ((param->type == fs_value_is_flag) ||
                    result.uint_32 > 0)
                        set = 1;

                if (m->flags & MOPT_CLEAR)
                        set = !set;
                else if (unlikely(!(m->flags & MOPT_SET))) {
                        ext4_msg(NULL, KERN_WARNING,
                                 "buggy handling of option %s",
                                 param->key);
                        WARN_ON(1);
                        return -EINVAL;
                }
                if (m->flags & MOPT_2) {
                        if (set != 0)
                                ctx_set_mount_opt2(ctx, m->mount_opt);
                        else
                                ctx_clear_mount_opt2(ctx, m->mount_opt);
                } else {
                        if (set != 0)
                                ctx_set_mount_opt(ctx, m->mount_opt);
                        else
                                ctx_clear_mount_opt(ctx, m->mount_opt);
                }
        }

        return 0;
}

/*
 * Parse a comma-separated mount option string into the context attached
 * to @fc.
 *
 * @options is modified in place: strsep() cuts it at every ',' and the '='
 * of each "key=value" token is overwritten with a NUL.  Every non-empty
 * token is handed to ext4_parse_param(), either as a flag parameter (no
 * '=') or as a string parameter carrying a private NUL-terminated copy of
 * the value.
 *
 * Returns 0 on success (a NULL @options counts as success), -ENOMEM if the
 * value copy cannot be allocated, or the first negative value returned by
 * ext4_parse_param() or ext4_validate_options().
 */
static int parse_options(struct fs_context *fc, char *options)
{
        struct fs_parameter param;
        char *token;
        int err;

        if (!options)
                return 0;

        for (token = strsep(&options, ","); token != NULL;
             token = strsep(&options, ",")) {
                size_t val_len = 0;
                char *eq;

                /* Empty token, e.g. from ",," or a trailing comma. */
                if (!*token)
                        continue;

                /* A token of the form "=value" has no key: ignore it. */
                eq = strchr(token, '=');
                if (eq == token)
                        continue;

                param.type = fs_value_is_flag;
                param.string = NULL;

                if (eq) {
                        /* Terminate the key and duplicate the value. */
                        *eq++ = 0;
                        val_len = strlen(eq);
                        param.string = kmemdup_nul(eq, val_len, GFP_KERNEL);
                        if (!param.string)
                                return -ENOMEM;
                        param.type = fs_value_is_string;
                }

                param.key = token;
                param.size = val_len;

                err = ext4_parse_param(fc, &param);
                /* The copy is only needed for the duration of the call. */
                kfree(param.string);
                if (err < 0)
                        return err;
        }

        /* Non-negative results from the validator all mean success here. */
        err = ext4_validate_options(fc);
        return err < 0 ? err : 0;
}

/*
 * Parse the mount options stored in the on-disk superblock
 * (s_es->s_mount_opts) and apply them to @sb.
 *
 * A temporary fs_context / ext4_fs_context pair is allocated just for this
 * parse.  If the stored string fails to parse or fails the consistency
 * check, a warning is logged and the stored options are ignored; the
 * function still returns 0 in that case.  On success, a journal device
 * number and journal I/O priority found in the stored options are copied
 * into @m_ctx before the options are applied.
 *
 * Returns 0, or -ENOMEM if one of the temporary allocations fails.
 */
static int parse_apply_sb_mount_options(struct super_block *sb,
                                        struct ext4_fs_context *m_ctx)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fs_context *s_ctx;
        struct fs_context *fc = NULL;
        char *s_mount_opts;
        int ret;

        /* Nothing stored in the superblock. */
        if (!sbi->s_es->s_mount_opts[0])
                return 0;

        /* Work on a bounded, NUL-terminated copy; parsing edits the string. */
        s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
                                sizeof(sbi->s_es->s_mount_opts),
                                GFP_KERNEL);
        if (!s_mount_opts)
                return -ENOMEM;

        ret = -ENOMEM;

        fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
        if (!fc)
                goto out_free;

        s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
        if (!s_ctx)
                goto out_free;

        fc->fs_private = s_ctx;
        fc->s_fs_info = sbi;

        ret = parse_options(fc, s_mount_opts);
        if (ret >= 0)
                ret = ext4_check_opt_consistency(fc, sb);
        if (ret < 0) {
                /* Bad stored options are not fatal: warn and carry on. */
                ext4_msg(sb, KERN_WARNING,
                         "failed to parse options in superblock: %s",
                         s_mount_opts);
                ret = 0;
                goto out_free;
        }

        /* Hand journal settings back to the caller's context. */
        if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV)
                m_ctx->journal_devnum = s_ctx->journal_devnum;
        if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)
                m_ctx->journal_ioprio = s_ctx->journal_ioprio;

        ext4_apply_options(fc, sb);
        ret = 0;

out_free:
        if (fc) {
                ext4_fc_free(fc);
                kfree(fc);
        }
        kfree(s_mount_opts);
        return ret;
}

/*
 * Move the journaled quota settings collected in the parse context into the
 * superblock info.
 *
 * Does nothing when the filesystem has the quota feature, or when the
 * kernel is built without CONFIG_QUOTA.  For every quota type named in
 * ctx->qname_spec the file name held by the context (possibly NULL) is
 * swapped into sbi->s_qf_names[] and the previous name is freed after an
 * RCU grace period.
 */
static void ext4_apply_quota_options(struct fs_context *fc,
                                     struct super_block *sb)
{
#ifdef CONFIG_QUOTA
        struct ext4_fs_context *ctx = fc->fs_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int type;

        if (ext4_has_feature_quota(sb))
                return;

        if (ctx->spec & EXT4_SPEC_JQUOTA) {
                for (type = 0; type < EXT4_MAXQUOTAS; type++) {
                        char *new_name, *old_name;

                        if (!(ctx->qname_spec & (1 << type)))
                                continue;

                        /* May be NULL, meaning "remove the quota file". */
                        new_name = ctx->s_qf_names[type];
                        if (new_name)
                                set_opt(sb, QUOTA);

                        /* Ownership of the string moves to the sbi. */
                        ctx->s_qf_names[type] = NULL;
                        old_name = rcu_replace_pointer(sbi->s_qf_names[type],
                                                new_name,
                                                lockdep_is_held(&sb->s_umount));
                        if (old_name)
                                kfree_rcu_mightsleep(old_name);
                }
        }

        if (ctx->spec & EXT4_SPEC_JQFMT)
                sbi->s_jquota_fmt = ctx->s_jquota_fmt;
#endif
}

/*
 * Check quota settings consistency.
 *
 * Validates the quota-related options collected in the parse context
 * (fc->fs_private) against the current state of @sb.  Every rejection logs a
 * message through ext4_msg() and returns -EINVAL; otherwise 0 is returned.
 * Without CONFIG_QUOTA the function always returns 0.
 *
 * Side effect: when a journaled quota file name exists for user or group
 * quota (either already on the superblock or in the context), the matching
 * USRQUOTA/GRPQUOTA mount option is cleared in the context.
 */
static int ext4_check_quota_consistency(struct fs_context *fc,
                                        struct super_block *sb)
{
#ifdef CONFIG_QUOTA
        struct ext4_fs_context *ctx = fc->fs_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        bool quota_feature = ext4_has_feature_quota(sb);
        bool quota_loaded = sb_any_quota_loaded(sb);
        bool usr_qf_name, grp_qf_name, usrquota, grpquota;
        int quota_flags, i;

        /*
         * We do the test below only for project quotas. 'usrquota' and
         * 'grpquota' mount options are allowed even without quota feature
         * to support legacy quotas in quota files.
         */
        if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) &&
            !ext4_has_feature_project(sb)) {
                ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. "
                         "Cannot enable project quota enforcement.");
                return -EINVAL;
        }

        /*
         * With quota already loaded, reject a request that touches one of
         * the quota mount flags (mask bit set) without leaving any of them
         * set.  NOTE(review): this reads as "trying to turn quota off";
         * confirm against the ctx_test_mount_opt() helper.
         */
        quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
                      EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA;
        if (quota_loaded &&
            ctx->mask_s_mount_opt & quota_flags &&
            !ctx_test_mount_opt(ctx, quota_flags))
                goto err_quota_change;

        if (ctx->spec & EXT4_SPEC_JQUOTA) {

                /* Only quota types explicitly named in the options matter. */
                for (i = 0; i < EXT4_MAXQUOTAS; i++) {
                        if (!(ctx->qname_spec & (1 << i)))
                                continue;

                        /*
                         * With quota loaded, a journaled quota file may be
                         * neither added nor removed.
                         */
                        if (quota_loaded &&
                            !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i])
                                goto err_jquota_change;

                        /*
                         * A name given again must match the one in use.
                         * The error label below reads 'i' to name the type.
                         */
                        if (sbi->s_qf_names[i] && ctx->s_qf_names[i] &&
                            strcmp(get_qf_name(sb, sbi, i),
                                   ctx->s_qf_names[i]) != 0)
                                goto err_jquota_specified;
                }

                /* Returning here skips all remaining checks below. */
                if (quota_feature) {
                        ext4_msg(NULL, KERN_INFO,
                                 "Journaled quota options ignored when "
                                 "QUOTA feature is enabled");
                        return 0;
                }
        }

        if (ctx->spec & EXT4_SPEC_JQFMT) {
                /* The journaled quota format cannot change while loaded. */
                if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded)
                        goto err_jquota_change;
                if (quota_feature) {
                        ext4_msg(NULL, KERN_INFO, "Quota format mount options "
                                 "ignored when QUOTA feature is enabled");
                        return 0;
                }
        }

        /* Make sure we don't mix old and new quota format */
        usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) ||
                       ctx->s_qf_names[USRQUOTA]);
        grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) ||
                       ctx->s_qf_names[GRPQUOTA]);

        /* Old-style options count if set in the context or on the sb. */
        usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
                    test_opt(sb, USRQUOTA));

        grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) ||
                    test_opt(sb, GRPQUOTA));

        /*
         * A journaled quota file name for a type overrides the old-style
         * option for that same type: clear it rather than reject it.
         */
        if (usr_qf_name) {
                ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);
                usrquota = false;
        }
        if (grp_qf_name) {
                ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);
                grpquota = false;
        }

        if (usr_qf_name || grp_qf_name) {
                /*
                 * Still true only when one type uses a journaled file name
                 * while the other type uses the old-style option.
                 */
                if (usrquota || grpquota) {
                        ext4_msg(NULL, KERN_ERR, "old and new quota "
                                 "format mixing");
                        return -EINVAL;
                }

                /* Journaled quota needs a format from the ctx or the sbi. */
                if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) {
                        ext4_msg(NULL, KERN_ERR, "journaled quota format "
                                 "not specified");
                        return -EINVAL;
                }
        }

        return 0;

err_quota_change:
        ext4_msg(NULL, KERN_ERR,
                 "Cannot change quota options when quota turned on");
        return -EINVAL;
err_jquota_change:
        ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota "
                 "options when quota turned on");
        return -EINVAL;
err_jquota_specified:
        /* Only reached from inside the loop above, so 'i' is valid. */
        ext4_msg(NULL, KERN_ERR, "%s quota file already specified",
                 QTYPE2NAME(i));
        return -EINVAL;
#else
        return 0;
#endif
}

/*
 * Validate a test_dummy_encryption request held in the parse context.
 *
 * Returns 0 if no dummy policy was requested or the request is acceptable,
 * and -EINVAL (after logging a warning) if the filesystem lacks the encrypt
 * feature, if a remount would set or change the policy, or if the request
 * conflicts with a policy already recorded on the superblock info.
 */
static int ext4_check_test_dummy_encryption(const struct fs_context *fc,
                                            struct super_block *sb)
{
        const struct ext4_fs_context *ctx = fc->fs_private;
        const struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* Option not given: nothing to check. */
        if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy))
                return 0;

        if (!ext4_has_feature_encrypt(sb)) {
                ext4_msg(NULL, KERN_WARNING,
                         "test_dummy_encryption requires encrypt feature");
                return -EINVAL;
        }

        /*
         * This is a testing-only option, so the extra machinery (e.g. RCU
         * protection) needed to set or change it on remount is not
         * implemented.  Specifying it on remount is tolerated only when it
         * matches what is already in effect.
         */
        if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
                if (!fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
                                                  &ctx->dummy_enc_policy)) {
                        ext4_msg(NULL, KERN_WARNING,
                                 "Can't set or change test_dummy_encryption on remount");
                        return -EINVAL;
                }
                return 0;
        }

        /* A value already taken from s_mount_opts must not conflict. */
        if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy) &&
            !fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
                                          &ctx->dummy_enc_policy)) {
                ext4_msg(NULL, KERN_WARNING,
                         "Conflicting test_dummy_encryption options");
                return -EINVAL;
        }

        return 0;
}

/*
 * Transfer the dummy encryption policy from the parse context to the
 * superblock info.
 *
 * Nothing happens if the context carries no policy, or if the superblock
 * info already has one (in which case the two were previously verified to
 * be the same).  After the transfer the context's copy is zeroed.
 */
static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx,
                                             struct super_block *sb)
{
        if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy))
                return;

        /* Already set: it was already verified to be the same. */
        if (fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy))
                return;

        EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy;
        memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy));
        ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
}

static int ext4_check_opt_consistency(struct fs_context *fc,
                                      struct super_block *sb)
{
        struct ext4_fs_context *ctx = fc->fs_private;
        struct ext4_sb_info *sbi = fc->s_fs_info;
        int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
        int err;

        if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
                ext4_msg(NULL, KERN_ERR,
                         "Mount option(s) incompatible with ext2");
                return -EINVAL;
        }
        if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
                ext4_msg(NULL, KERN_ERR,
                         "Mount option(s) incompatible with ext3");
                return -EINVAL;
        }

        if (ctx->s_want_extra_isize >
            (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) {
                ext4_msg(NULL, KERN_ERR,
                         "Invalid want_extra_isize %d",
                         ctx->s_want_extra_isize);
                return -EINVAL;
        }

        err = ext4_check_test_dummy_encryption(fc, sb);
        if (err)
                return err;

        if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) {
                if (!sbi->s_journal) {
                        ext4_msg(NULL, KERN_WARNING,
                                 "Remounting file system with no journal "
                                 "so ignoring journalled data option");
                        ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
                } else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) !=
                           test_opt(sb, DATA_FLAGS)) {
                        ext4_msg(NULL, KERN_ERR, "Cannot change data mode "
                                 "on remount");
                        return -EINVAL;
                }
        }

        if (is_remount) {
                if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
                    (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
                        ext4_msg(NULL, KERN_ERR, "can't mount with "
                                 "both data=journal and dax");
                        return -EINVAL;
                }

                if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
                    (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
                     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
fail_dax_change_remount:
                        ext4_msg(NULL, KERN_ERR, "can't change "
                                 "dax mount option while remounting");
                        return -EINVAL;
                } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) &&
                         (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
                          (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) {
                        goto fail_dax_change_remount;
                } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) &&
                           ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
                            (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
                            !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) {
                        goto fail_dax_change_remount;
                }
        }

        return ext4_check_quota_consistency(fc, sb);
}

/*
 * Commit the options gathered in the parse context to the superblock.
 *
 * For each of s_mount_opt, s_mount_opt2 and sb->s_flags, the bits named in
 * the context's mask are cleared and the context's values are then OR-ed
 * in.  Scalar settings are copied only when their EXT4_SPEC_* bit shows
 * they were explicitly specified.  Quota options and the dummy encryption
 * policy are applied last through their own helpers.
 */
static void ext4_apply_options(struct fs_context *fc, struct super_block *sb)
{
        struct ext4_fs_context *ctx = fc->fs_private;
        struct ext4_sb_info *sbi = fc->s_fs_info;

        /* Replace only the bits that the parsed options touched. */
        sbi->s_mount_opt = (sbi->s_mount_opt & ~ctx->mask_s_mount_opt) |
                           ctx->vals_s_mount_opt;
        sbi->s_mount_opt2 = (sbi->s_mount_opt2 & ~ctx->mask_s_mount_opt2) |
                            ctx->vals_s_mount_opt2;
        sb->s_flags = (sb->s_flags & ~ctx->mask_s_flags) |
                      ctx->vals_s_flags;

        /* Copy field X from the context to the sbi if it was specified. */
#define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; })
        APPLY(s_commit_interval);
        APPLY(s_stripe);
        APPLY(s_max_batch_time);
        APPLY(s_min_batch_time);
        APPLY(s_want_extra_isize);
        APPLY(s_inode_readahead_blks);
        APPLY(s_max_dir_size_kb);
        APPLY(s_li_wait_mult);
        APPLY(s_resgid);
        APPLY(s_resuid);

#ifdef CONFIG_EXT4_DEBUG
        APPLY(s_fc_debug_max_replay);
#endif

        ext4_apply_quota_options(fc, sb);
        ext4_apply_test_dummy_encryption(ctx, sb);
}


/*
 * Sanity-check the quota options in the parse context once the whole option
 * string has been parsed.
 *
 * When a journaled quota file name was given for a type, the old-style
 * usrquota/grpquota option for that same type is dropped from the context.
 * If an old-style option then remains set for the other type, the two quota
 * styles are being mixed and -EINVAL is returned.
 *
 * Returns 1 on success (note: not 0), -EINVAL on error.  Without
 * CONFIG_QUOTA the function always returns 1.
 */
static int ext4_validate_options(struct fs_context *fc)
{
#ifdef CONFIG_QUOTA
        struct ext4_fs_context *ctx = fc->fs_private;
        const char *usr_name = ctx->s_qf_names[USRQUOTA];
        const char *grp_name = ctx->s_qf_names[GRPQUOTA];

        /* No journaled quota file names: nothing can conflict. */
        if (!usr_name && !grp_name)
                return 1;

        if (usr_name && ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA))
                ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);

        if (grp_name && ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA))
                ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);

        if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
            ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) {
                ext4_msg(NULL, KERN_ERR, "old and new quota "
                         "format mixing");
                return -EINVAL;
        }
#endif
        return 1;
}

/*
 * Append the journaled quota options of @sb to @seq: ",jqfmt=<name>" when a
 * journaled quota format is set, followed by the usrjquota / grpjquota file
 * names that are present.  Emits nothing without CONFIG_QUOTA.
 */
static inline void ext4_show_quota_options(struct seq_file *seq,
                                           struct super_block *sb)
{
#if defined(CONFIG_QUOTA)
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        char *usr_name, *grp_name;

        if (sbi->s_jquota_fmt) {
                char *label;

                /* A format value not listed here prints an empty name. */
                if (sbi->s_jquota_fmt == QFMT_VFS_OLD)
                        label = "vfsold";
                else if (sbi->s_jquota_fmt == QFMT_VFS_V0)
                        label = "vfsv0";
                else if (sbi->s_jquota_fmt == QFMT_VFS_V1)
                        label = "vfsv1";
                else
                        label = "";

                seq_printf(seq, ",jqfmt=%s", label);
        }

        /* The name strings are RCU-managed; read them under the lock. */
        rcu_read_lock();
        usr_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]);
        grp_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]);
        if (usr_name)
                seq_show_option(seq, "usrjquota", usr_name);
        if (grp_name)
                seq_show_option(seq, "grpjquota", grp_name);
        rcu_read_unlock();
#endif
}

/*
 * Map an option token to its name by scanning ext4_param_specs for the
 * first entry whose opt equals @token and whose type field is unset.
 *
 * If no entry matches, the scan stops on the table's terminating entry and
 * that entry's name, NULL, is returned.
 */
static const char *token2str(int token)
{
        const struct fs_parameter_spec *entry = ext4_param_specs;

        while (entry->name != NULL &&
               (entry->opt != token || entry->type))
                entry++;

        return entry->name;
}

/*
 * Show an option if
 *  - it's set to a non-default value OR
 *  - if the per-sb default is different from the global default
 *
 * Writes the mount options of @sb to @seq and always returns 0.
 *
 * @nodefs: when non-zero, the "same as default" filters below are bypassed
 *          and each option is preceded by '\n' instead of ','.
 */
static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
                              int nodefs)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        int def_errors;
        const struct mount_opts *m;
        char sep = nodefs ? '\n' : ',';

/* Both helpers emit the separator first, then the option text. */
#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)

        if (sbi->s_sb_block != 1)
                SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);

        /* Table-driven part: simple set/clear flag options. */
        for (m = ext4_mount_opts; m->token != Opt_err; m++) {
                int want_set = m->flags & MOPT_SET;
                int opt_2 = m->flags & MOPT_2;
                unsigned int mount_opt, def_mount_opt;

                /* Only plain set/clear entries not marked MOPT_SKIP. */
                if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
                    m->flags & MOPT_SKIP)
                        continue;

                /* MOPT_2 entries live in the second option word. */
                if (opt_2) {
                        mount_opt = sbi->s_mount_opt2;
                        def_mount_opt = sbi->s_def_mount_opt2;
                } else {
                        mount_opt = sbi->s_mount_opt;
                        def_mount_opt = sbi->s_def_mount_opt;
                }
                /* skip if same as the default */
                if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt)))
                        continue;
                /* select Opt_noFoo vs Opt_Foo */
                if ((want_set &&
                     (mount_opt & m->mount_opt) != m->mount_opt) ||
                    (!want_set && (mount_opt & m->mount_opt)))
                        continue;
                SEQ_OPTS_PRINT("%s", token2str(m->token));
        }

        /*
         * resuid/resgid are shown when they differ from the built-in
         * default, or when the on-disk default itself is non-standard.
         */
        if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
            le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
                SEQ_OPTS_PRINT("resuid=%u",
                                from_kuid_munged(&init_user_ns, sbi->s_resuid));
        if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
            le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
                SEQ_OPTS_PRINT("resgid=%u",
                                from_kgid_munged(&init_user_ns, sbi->s_resgid));
        /*
         * With nodefs the default is forced to -1; presumably no
         * EXT4_ERRORS_* value equals -1, so the active errors= behaviour
         * is always printed in that mode -- TODO confirm.
         */
        def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
        if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
                SEQ_OPTS_PUTS("errors=remount-ro");
        if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
                SEQ_OPTS_PUTS("errors=continue");
        if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
                SEQ_OPTS_PUTS("errors=panic");
        /* s_commit_interval is divided by HZ before being printed. */
        if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
                SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
        if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
                SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
        if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
                SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
        if (nodefs || sbi->s_stripe)
                SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
        /* Data mode: shown when it differs from the per-sb default. */
        if (nodefs || EXT4_MOUNT_DATA_FLAGS &
                        (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
                if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
                        SEQ_OPTS_PUTS("data=journal");
                else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
                        SEQ_OPTS_PUTS("data=ordered");
                else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
                        SEQ_OPTS_PUTS("data=writeback");
        }
        if (nodefs ||
            sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
                SEQ_OPTS_PRINT("inode_readahead_blks=%u",
                               sbi->s_inode_readahead_blks);

        /* init_itable is only shown while INIT_INODE_TABLE is set. */
        if (test_opt(sb, INIT_INODE_TABLE) && (nodefs ||
                       (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
                SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
        if (nodefs || sbi->s_max_dir_size_kb)
                SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
        if (test_opt(sb, DATA_ERR_ABORT))
                SEQ_OPTS_PUTS("data_err=abort");

        fscrypt_show_test_dummy_encryption(seq, sep, sb);

        if (sb->s_flags & SB_INLINECRYPT)
                SEQ_OPTS_PUTS("inlinecrypt");

        /* An ext2 superblock gets plain "dax" rather than "dax=always". */
        if (test_opt(sb, DAX_ALWAYS)) {
                if (IS_EXT2_SB(sb))
                        SEQ_OPTS_PUTS("dax");
                else
                        SEQ_OPTS_PUTS("dax=always");
        } else if (test_opt2(sb, DAX_NEVER)) {
                SEQ_OPTS_PUTS("dax=never");
        } else if (test_opt2(sb, DAX_INODE)) {
                SEQ_OPTS_PUTS("dax=inode");
        }

        /*
         * mb_optimize_scan is printed only when its state is the opposite
         * of what the group count alone would suggest: off at or above
         * MB_DEFAULT_LINEAR_SCAN_THRESHOLD groups, on below it.
         */
        if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
                        !test_opt2(sb, MB_OPTIMIZE_SCAN)) {
                SEQ_OPTS_PUTS("mb_optimize_scan=0");
        } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
                        test_opt2(sb, MB_OPTIMIZE_SCAN)) {
                SEQ_OPTS_PUTS("mb_optimize_scan=1");
        }

        ext4_show_quota_options(seq, sb);
        return 0;
}

/*
 * Show the mount options for the filesystem that @root belongs to, with
 * nodefs set to 0 so the default-value filters stay active.
 */
static int ext4_show_options(struct seq_file *seq, struct dentry *root)
{
        struct super_block *sb = root->d_sb;

        return _ext4_show_options(seq, sb, 0);
}

/*
 * seq_file show routine: prints "ro" or "rw", then the options of the
 * superblock stored in seq->private with nodefs set to 1, then a final
 * newline.  Returns the result of _ext4_show_options().
 */
int ext4_seq_options_show(struct seq_file *seq, void *offset)
{
        struct super_block *sb = seq->private;
        int ret;

        if (sb_rdonly(sb))
                seq_puts(seq, "ro");
        else
                seq_puts(seq, "rw");

        ret = _ext4_show_options(seq, sb, 1);
        seq_puts(seq, "\n");

        return ret;
}

/*
 * Update the on-disk superblock @es for a read-write mount and write it
 * out.
 *
 * @read_only: when non-zero, no superblock field is modified and nothing is
 *             committed; only the optional debug line at the end is
 *             printed.
 *
 * Returns -EROFS if the superblock revision level is above
 * EXT4_MAX_SUPP_REV, 0 when @read_only is set, and otherwise the result of
 * ext4_commit_super().
 */
static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                            int read_only)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int err = 0;

        if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
                ext4_msg(sb, KERN_ERR, "revision level too high, "
                         "forcing read-only mode");
                err = -EROFS;
                goto done;
        }
        if (read_only)
                goto done;
        /*
         * Advisory warnings only: at most one is printed (first match
         * wins) and none of them stops the mount.
         */
        if (!(sbi->s_mount_state & EXT4_VALID_FS))
                ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
                         "running e2fsck is recommended");
        else if (sbi->s_mount_state & EXT4_ERROR_FS)
                ext4_msg(sb, KERN_WARNING,
                         "warning: mounting fs with errors, "
                         "running e2fsck is recommended");
        /* s_max_mnt_count is read as signed: only a positive limit counts. */
        else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
                 le16_to_cpu(es->s_mnt_count) >=
                 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
                ext4_msg(sb, KERN_WARNING,
                         "warning: maximal mount count reached, "
                         "running e2fsck is recommended");
        /* A zero s_checkinterval disables the time-based warning. */
        else if (le32_to_cpu(es->s_checkinterval) &&
                 (ext4_get_tstamp(es, s_lastcheck) +
                  le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
                ext4_msg(sb, KERN_WARNING,
                         "warning: checktime reached, "
                         "running e2fsck is recommended");
        /* Without a journal, clear the VALID_FS bit in the on-disk state. */
        if (!sbi->s_journal)
                es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
        /* A zero maximum mount count is replaced by the default. */
        if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
                es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
        le16_add_cpu(&es->s_mnt_count, 1);
        ext4_update_tstamp(es, s_mtime);
        /*
         * With a journal, set the needs_recovery feature, plus
         * orphan_present when the filesystem has an orphan file.
         */
        if (sbi->s_journal) {
                ext4_set_feature_journal_needs_recovery(sb);
                if (ext4_has_feature_orphan_file(sb))
                        ext4_set_feature_orphan_present(sb);
        }

        err = ext4_commit_super(sb);
done:
        /* Reached on every path, including the early exits above. */
        if (test_opt(sb, DEBUG))
                printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
                                "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
                        sb->s_blocksize,
                        sbi->s_groups_count,
                        EXT4_BLOCKS_PER_GROUP(sb),
                        EXT4_INODES_PER_GROUP(sb),
                        sbi->s_mount_opt, sbi->s_mount_opt2);
        return err;
}

/*
 * Make sure sbi->s_flex_groups has an entry for every flex group needed to
 * cover block groups 0 .. @ngroup - 1.
 *
 * A larger pointer array is allocated, new flex_groups structures are
 * allocated for the added slots, the existing pointers are copied over and
 * the new array is published with rcu_assign_pointer().  The old pointer
 * array (not the structures it points to) is then released through
 * ext4_kvfree_array_rcu().
 *
 * Returns 0 on success or when nothing needs to grow (flex groups disabled
 * or already large enough), -ENOMEM on allocation failure; in the failure
 * case everything allocated by this call is freed again.
 */
int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct flex_groups **prev_tbl, **new_tbl;
        int count, idx, undo;

        if (!sbi->s_log_groups_per_flex)
                return 0;

        /* Number of flex groups needed to cover the last block group. */
        count = ext4_flex_group(sbi, ngroup - 1) + 1;
        if (count <= sbi->s_flex_groups_allocated)
                return 0;

        new_tbl = kvzalloc(roundup_pow_of_two(count *
                           sizeof(*sbi->s_flex_groups)), GFP_KERNEL);
        if (!new_tbl) {
                ext4_msg(sb, KERN_ERR,
                         "not enough memory for %d flex group pointers", count);
                return -ENOMEM;
        }

        /* Only slots beyond the currently allocated ones get new entries. */
        for (idx = sbi->s_flex_groups_allocated; idx < count; idx++) {
                new_tbl[idx] = kvzalloc(roundup_pow_of_two(
                                        sizeof(struct flex_groups)),
                                        GFP_KERNEL);
                if (new_tbl[idx])
                        continue;

                /* Undo the entries allocated so far by this call. */
                for (undo = sbi->s_flex_groups_allocated; undo < idx; undo++)
                        kvfree(new_tbl[undo]);
                kvfree(new_tbl);
                ext4_msg(sb, KERN_ERR,
                         "not enough memory for %d flex groups", count);
                return -ENOMEM;
        }

        /* Carry the existing entries over into the new table. */
        rcu_read_lock();
        prev_tbl = rcu_dereference(sbi->s_flex_groups);
        if (prev_tbl)
                memcpy(new_tbl, prev_tbl,
                       (sbi->s_flex_groups_allocated *
                        sizeof(struct flex_groups *)));
        rcu_read_unlock();

        rcu_assign_pointer(sbi->s_flex_groups, new_tbl);
        sbi->s_flex_groups_allocated = count;
        if (prev_tbl)
                ext4_kvfree_array_rcu(prev_tbl);

        return 0;
}

/*
 * Load the flex_bg configuration from the on-disk superblock and accumulate
 * the per-flex-group counters (free inodes, free clusters, used directories)
 * from every group descriptor.
 *
 * Returns 1 on success, which includes the case where the on-disk
 * s_log_groups_per_flex is outside 1..31 and flex groups are simply turned
 * off. Returns 0 if the flex group array could not be allocated.
 */
static int ext4_fill_flex_info(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int grp;

        sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
        if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
                /* Out-of-range value: run without flex groups */
                sbi->s_log_groups_per_flex = 0;
                return 1;
        }

        if (ext4_alloc_flex_bg_array(sb, sbi->s_groups_count))
                return 0;

        for (grp = 0; grp < sbi->s_groups_count; grp++) {
                struct ext4_group_desc *desc;
                struct flex_groups *flex;
                ext4_group_t flex_idx;

                desc = ext4_get_group_desc(sb, grp, NULL);
                flex_idx = ext4_flex_group(sbi, grp);
                flex = sbi_array_rcu_deref(sbi, s_flex_groups, flex_idx);

                atomic_add(ext4_free_inodes_count(sb, desc),
                           &flex->free_inodes);
                atomic64_add(ext4_free_group_clusters(sb, desc),
                             &flex->free_clusters);
                atomic_add(ext4_used_dirs_count(sb, desc), &flex->used_dirs);
        }

        return 1;
}

/*
 * Compute the checksum for group descriptor @gdp of group @block_group.
 *
 * With metadata_csum the result is the low 16 bits of an ext4_chksum() run
 * seeded with s_csum_seed over the little-endian group number and the
 * descriptor, with the bg_checksum field replaced by zero.  Otherwise, if
 * the gdt_csum feature is set, a crc16 over the filesystem UUID, the group
 * number and the descriptor (skipping bg_checksum) is used.  Without either
 * feature the function returns 0.
 */
static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
                                   struct ext4_group_desc *gdp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        __le32 group_le = cpu_to_le32(block_group);
        int pos = offsetof(struct ext4_group_desc, bg_checksum);
        __u16 crc;

        if (ext4_has_metadata_csum(sbi->s_sb)) {
                /* Use new metadata_csum algorithm */
                __u16 zero_csum = 0;
                __u32 sum;

                sum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&group_le,
                                  sizeof(group_le));
                sum = ext4_chksum(sbi, sum, (__u8 *)gdp, pos);
                /* Feed zeroes in place of the stored checksum field */
                sum = ext4_chksum(sbi, sum, (__u8 *)&zero_csum,
                                  sizeof(zero_csum));
                pos += sizeof(zero_csum);
                if (pos < sbi->s_desc_size)
                        sum = ext4_chksum(sbi, sum, (__u8 *)gdp + pos,
                                          sbi->s_desc_size - pos);

                crc = sum & 0xFFFF;
                return cpu_to_le16(crc);
        }

        /* old crc16 code */
        if (!ext4_has_feature_gdt_csum(sb))
                return 0;

        crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
        crc = crc16(crc, (__u8 *)&group_le, sizeof(group_le));
        crc = crc16(crc, (__u8 *)gdp, pos);
        /* step over the checksum field itself */
        pos += sizeof(gdp->bg_checksum);
        /* the tail of the descriptor is only covered on 64bit filesystems */
        if (ext4_has_feature_64bit(sb) && pos < sbi->s_desc_size)
                crc = crc16(crc, (__u8 *)gdp + pos,
                            sbi->s_desc_size - pos);

        return cpu_to_le16(crc);
}

/*
 * Check the stored checksum of group descriptor @gdp.
 *
 * Returns 1 when group descriptor checksums are not in use or when the
 * stored bg_checksum matches the freshly computed one, 0 on mismatch.
 */
int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
                                struct ext4_group_desc *gdp)
{
        if (!ext4_has_group_desc_csum(sb))
                return 1;

        if (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp))
                return 0;

        return 1;
}

/*
 * Recompute and store bg_checksum for group descriptor @gdp.  Does nothing
 * when group descriptor checksums are not in use on this filesystem.
 */
void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
                              struct ext4_group_desc *gdp)
{
        if (ext4_has_group_desc_csum(sb))
                gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp);
}

/*
 * Called at mount-time, super-block is locked.
 *
 * Walk every group descriptor and sanity-check the locations of its block
 * bitmap, inode bitmap and inode table, then verify the descriptor checksum.
 *
 * For each of the three metadata locations:
 *  - overlapping the superblock block (@sb_block) or the group descriptor
 *    blocks that follow it is logged, and is fatal unless the filesystem is
 *    read-only;
 *  - lying outside [first_block, last_block] is logged and always fatal.
 * A checksum mismatch is logged and is fatal unless the filesystem is
 * read-only.
 *
 * If @first_not_zeroed is non-NULL it receives the index of the first group
 * whose descriptor lacks EXT4_BG_INODE_ZEROED, or s_groups_count if every
 * group has the flag.
 *
 * Returns 1 if the descriptors are acceptable, 0 otherwise.
 */
static int ext4_check_descriptors(struct super_block *sb,
                                  ext4_fsblk_t sb_block,
                                  ext4_group_t *first_not_zeroed)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
        ext4_fsblk_t last_block;
        /* Last block of the descriptor area that starts right after sb_block */
        ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0);
        ext4_fsblk_t block_bitmap;
        ext4_fsblk_t inode_bitmap;
        ext4_fsblk_t inode_table;
        int flexbg_flag = 0;
        /* grp == s_groups_count means "no un-zeroed group found yet" */
        ext4_group_t i, grp = sbi->s_groups_count;

        if (ext4_has_feature_flex_bg(sb))
                flexbg_flag = 1;

        ext4_debug("Checking group descriptors");

        for (i = 0; i < sbi->s_groups_count; i++) {
                struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);

                /*
                 * With flex_bg (or for the last group) the allowed range
                 * extends to the end of the filesystem; otherwise it is
                 * limited to the group's own blocks.
                 */
                if (i == sbi->s_groups_count - 1 || flexbg_flag)
                        last_block = ext4_blocks_count(sbi->s_es) - 1;
                else
                        last_block = first_block +
                                (EXT4_BLOCKS_PER_GROUP(sb) - 1);

                /* Remember the first group without EXT4_BG_INODE_ZEROED */
                if ((grp == sbi->s_groups_count) &&
                   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
                        grp = i;

                /* --- block bitmap --- */
                block_bitmap = ext4_block_bitmap(sb, gdp);
                if (block_bitmap == sb_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Block bitmap for group %u overlaps "
                                 "superblock", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (block_bitmap >= sb_block + 1 &&
                    block_bitmap <= last_bg_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Block bitmap for group %u overlaps "
                                 "block group descriptors", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (block_bitmap < first_block || block_bitmap > last_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Block bitmap for group %u not in group "
                               "(block %llu)!", i, block_bitmap);
                        return 0;
                }
                /* --- inode bitmap --- */
                inode_bitmap = ext4_inode_bitmap(sb, gdp);
                if (inode_bitmap == sb_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Inode bitmap for group %u overlaps "
                                 "superblock", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (inode_bitmap >= sb_block + 1 &&
                    inode_bitmap <= last_bg_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Inode bitmap for group %u overlaps "
                                 "block group descriptors", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (inode_bitmap < first_block || inode_bitmap > last_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Inode bitmap for group %u not in group "
                               "(block %llu)!", i, inode_bitmap);
                        return 0;
                }
                /* --- inode table (spans s_itb_per_group blocks) --- */
                inode_table = ext4_inode_table(sb, gdp);
                if (inode_table == sb_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Inode table for group %u overlaps "
                                 "superblock", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (inode_table >= sb_block + 1 &&
                    inode_table <= last_bg_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Inode table for group %u overlaps "
                                 "block group descriptors", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (inode_table < first_block ||
                    inode_table + sbi->s_itb_per_group - 1 > last_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Inode table for group %u not in group "
                               "(block %llu)!", i, inode_table);
                        return 0;
                }
                /* The checksum is verified with the group lock held */
                ext4_lock_group(sb, i);
                if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Checksum for group %u failed (%u!=%u)",
                                 i, le16_to_cpu(ext4_group_desc_csum(sb, i,
                                     gdp)), le16_to_cpu(gdp->bg_checksum));
                        if (!sb_rdonly(sb)) {
                                ext4_unlock_group(sb, i);
                                return 0;
                        }
                }
                ext4_unlock_group(sb, i);
                /* Without flex_bg the lower bound advances group by group */
                if (!flexbg_flag)
                        first_block += EXT4_BLOCKS_PER_GROUP(sb);
        }
        if (NULL != first_not_zeroed)
                *first_not_zeroed = grp;
        return 1;
}

/*
 * Maximal extent format file size.
 *
 * The logical block number at s_maxbytes has to fit the on-disk extent
 * containers, a sector_t, and the vfs i_blocks.  The ext4 inode has 48 bits
 * of i_block in fsblock units, so that is not the limiting factor.
 *
 * The limiting factor is that extents are stored as (start block, length):
 * the length of an extent covering the whole maximum-size file must fit its
 * container too.  Because a length is one larger than the highest block
 * index (block 0 counts), s_maxbytes is lowered by one filesystem block.
 *
 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
 */
static loff_t ext4_max_size(int blkbits, int has_huge_files)
{
        const loff_t max_u32 = (1LL << 32) - 1;
        loff_t cap = MAX_LFS_FILESIZE;
        loff_t extent_limit;

        BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64));

        /*
         * Without huge files the cap comes from 2^32 - 1 512-byte sectors:
         * convert that to filesystem blocks, then to bytes.
         */
        if (!has_huge_files)
                cap = (max_u32 >> (blkbits - 9)) << blkbits;

        /*
         * 32-bit extent-start container, ee_block.  Using 2^32 - 1 blocks
         * (one block short) lets ee_len cover the extent of a maximum-size
         * file.
         */
        extent_limit = max_u32 << blkbits;

        /* Sanity check against vm- & vfs- imposed limits */
        return extent_limit > cap ? cap : extent_limit;
}

/*
 * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
 * We need to be 1 filesystem block less than the 2^48 sector limit.
 *
 * The result is the smaller of what the block tree can address and what the
 * i_blocks limit allows once indirect (metadata) blocks are accounted for,
 * converted to bytes and clamped to MAX_LFS_FILESIZE.
 */
static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
        loff_t upper_limit, res = EXT4_NDIR_BLOCKS;
        int meta_blocks;
        /* Block pointers per block (4-byte pointers, 2^bits-byte blocks) */
        unsigned int ppb = 1 << (bits - 2);

        /*
         * This is calculated to be the largest file size for a dense, block
         * mapped file such that the file's total number of 512-byte sectors,
         * including data and all indirect blocks, does not exceed (2^48 - 1).
         *
         * __u32 i_blocks_lo and __u16 i_blocks_high represent the total
         * number of 512-byte sectors of the file.
         */
        if (!has_huge_files) {
                /*
                 * Without huge_file support the inode's block count is
                 * limited to (2^32 - 1) 512-byte sectors.
                 */
                upper_limit = (1LL << 32) - 1;

                /* total blocks in file system block size */
                upper_limit >>= (bits - 9);

        } else {
                /*
                 * We use 48 bit ext4_inode i_blocks
                 * With EXT4_HUGE_FILE_FL set the i_blocks
                 * represent total number of blocks in
                 * file system block size
                 */
                upper_limit = (1LL << 48) - 1;

        }

        /* Compute how many blocks we can address by block tree */
        res += ppb;
        res += ppb * ppb;
        res += ((loff_t)ppb) * ppb * ppb;
        /* Compute how many metadata blocks are needed */
        meta_blocks = 1;
        meta_blocks += 1 + ppb;
        meta_blocks += 1 + ppb + ppb * ppb;
        /* Does block tree limit file size? */
        if (res + meta_blocks <= upper_limit)
                goto check_lfs;

        /*
         * The i_blocks limit is the binding one: start from upper_limit and
         * subtract the indirect blocks needed to map that many blocks.
         */
        res = upper_limit;
        /* How many metadata blocks are needed for addressing upper_limit? */
        upper_limit -= EXT4_NDIR_BLOCKS;
        /* indirect blocks */
        meta_blocks = 1;
        upper_limit -= ppb;
        /* double indirect blocks */
        if (upper_limit < ppb * ppb) {
                meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb);
                res -= meta_blocks;
                goto check_lfs;
        }
        meta_blocks += 1 + ppb;
        upper_limit -= ppb * ppb;
        /* triple indirect blocks for the rest */
        meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) +
                DIV_ROUND_UP_ULL(upper_limit, ppb*ppb);
        res -= meta_blocks;
check_lfs:
        /* Convert from filesystem blocks to bytes */
        res <<= bits;
        if (res > MAX_LFS_FILESIZE)
                res = MAX_LFS_FILESIZE;

        return res;
}

/*
 * Return the block number holding group descriptor block @nr.
 *
 * Without meta_bg, or for descriptor blocks below s_first_meta_bg, the
 * descriptor blocks follow the superblock block consecutively.  Otherwise
 * the descriptor block sits at the start of the first group it describes,
 * after that group's superblock backup if it has one.
 */
static ext4_fsblk_t descriptor_loc(struct super_block *sb,
                                   ext4_fsblk_t logical_sb_block, int nr)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
        ext4_group_t bg;
        int skip;

        if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
                return logical_sb_block + nr + 1;

        bg = sbi->s_desc_per_block * nr;
        skip = ext4_bg_has_super(sb, bg) ? 1 : 0;

        /*
         * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
         * block 2, not 1.  If s_first_data_block == 0 (bigalloc is enabled
         * on modern mke2fs or blksize > 1k on older mke2fs) then we must
         * compensate.
         */
        if (sb->s_blocksize == 1024 && nr == 0 &&
            le32_to_cpu(sbi->s_es->s_first_data_block) == 0)
                skip++;

        return skip + ext4_group_first_block_no(sb, bg);
}

/**
 * ext4_get_stripe_size: Get the stripe size.
 * @sbi: In memory super block info
 *
 * Candidates are tried in order: the in-memory s_stripe value, then the
 * superblock's s_raid_stripe_width, then its s_raid_stride.  The first one
 * that is non-zero and no larger than the blocks per group is used, since
 * the allocator needs the stripe to fit within a group.  If none qualifies,
 * or the chosen value is 1, 0 is returned.
 *
 */
static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
{
        unsigned long bpg = sbi->s_blocks_per_group;
        unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
        unsigned long stripe_width =
                        le32_to_cpu(sbi->s_es->s_raid_stripe_width);
        int chosen = 0;

        if (sbi->s_stripe && sbi->s_stripe <= bpg)
                chosen = sbi->s_stripe;
        else if (stripe_width && stripe_width <= bpg)
                chosen = stripe_width;
        else if (stride && stride <= bpg)
                chosen = stride;

        /*
         * A stripe of a single block is meaningless; report 0 so that the
         * stripe handling code stays switched off.
         */
        return chosen <= 1 ? 0 : chosen;
}

/*
 * Check whether this filesystem can be mounted based on
 * the features present and the RDONLY/RDWR mount requested.
 *
 * @sb:       superblock whose on-disk feature flags are examined
 * @readonly: non-zero if a read-only mount was requested
 *
 * Unknown incompat features (and casefold without CONFIG_UNICODE) reject
 * any mount.  The remaining checks only apply to read-write mounts.  If the
 * filesystem carries the readonly feature, SB_RDONLY is set on @sb and the
 * mount is accepted.
 *
 * Returns 1 if this filesystem can be mounted as requested,
 * 0 if it cannot be.
 */
int ext4_feature_set_ok(struct super_block *sb, int readonly)
{
        if (ext4_has_unknown_ext4_incompat_features(sb)) {
                ext4_msg(sb, KERN_ERR,
                        "Couldn't mount because of "
                        "unsupported optional features (%x)",
                        (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
                        ~EXT4_FEATURE_INCOMPAT_SUPP));
                return 0;
        }

#if !IS_ENABLED(CONFIG_UNICODE)
        if (ext4_has_feature_casefold(sb)) {
                ext4_msg(sb, KERN_ERR,
                         "Filesystem with casefold feature cannot be "
                         "mounted without CONFIG_UNICODE");
                return 0;
        }
#endif

        /* Everything below only matters for a read-write mount */
        if (readonly)
                return 1;

        if (ext4_has_feature_readonly(sb)) {
                ext4_msg(sb, KERN_INFO, "filesystem is read-only");
                sb->s_flags |= SB_RDONLY;
                return 1;
        }

        /* Check that feature set is OK for a read-write mount */
        if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
                ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
                         "unsupported optional features (%x)",
                         (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
                                ~EXT4_FEATURE_RO_COMPAT_SUPP));
                return 0;
        }
        if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
                /* No trailing newline, matching the other ext4_msg() calls */
                ext4_msg(sb, KERN_ERR,
                         "Can't support bigalloc feature without "
                         "extents feature");
                return 0;
        }

#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
        /* readonly is known to be zero here, so no need to test it again */
        if (ext4_has_feature_quota(sb) || ext4_has_feature_project(sb)) {
                ext4_msg(sb, KERN_ERR,
                         "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2");
                return 0;
        }
#endif  /* !CONFIG_QUOTA || !CONFIG_QFMT_V2 */
        return 1;
}

/*
 * Timer callback that runs once a day while errors are logged on the
 * file system.  It reports the error count and the details of the first and
 * last recorded errors from the superblock, then re-arms the timer for
 * another 24 hours.
 */
static void print_daily_error_info(struct timer_list *t)
{
        struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report);
        struct ext4_super_block *es = sbi->s_es;
        struct super_block *sb = sbi->s_sb;

        /* fsck newer than v1.41.13 is needed to clean this condition. */
        if (es->s_error_count) {
                ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
                         le32_to_cpu(es->s_error_count));
        }

        /* First recorded error: function:line, then optional inode/block */
        if (es->s_first_error_time) {
                printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d",
                       sb->s_id,
                       ext4_get_tstamp(es, s_first_error_time),
                       (int) sizeof(es->s_first_error_func),
                       es->s_first_error_func,
                       le32_to_cpu(es->s_first_error_line));
                if (es->s_first_error_ino) {
                        printk(KERN_CONT ": inode %u",
                               le32_to_cpu(es->s_first_error_ino));
                }
                if (es->s_first_error_block) {
                        printk(KERN_CONT ": block %llu", (unsigned long long)
                               le64_to_cpu(es->s_first_error_block));
                }
                printk(KERN_CONT "\n");
        }

        /* Most recent recorded error, same layout */
        if (es->s_last_error_time) {
                printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d",
                       sb->s_id,
                       ext4_get_tstamp(es, s_last_error_time),
                       (int) sizeof(es->s_last_error_func),
                       es->s_last_error_func,
                       le32_to_cpu(es->s_last_error_line));
                if (es->s_last_error_ino) {
                        printk(KERN_CONT ": inode %u",
                               le32_to_cpu(es->s_last_error_ino));
                }
                if (es->s_last_error_block) {
                        printk(KERN_CONT ": block %llu", (unsigned long long)
                               le64_to_cpu(es->s_last_error_block));
                }
                printk(KERN_CONT "\n");
        }

        /* Once a day */
        mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);
}

/* Find next suitable group and run ext4_init_inode_table */
static int ext4_run_li_request(struct ext4_li_request *elr)
{
        struct ext4_group_desc *gdp = NULL;
        struct super_block *sb = elr->lr_super;
        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
        ext4_group_t group = elr->lr_next_group;
        unsigned int prefetch_ios = 0;
        int ret = 0;
        int nr = EXT4_SB(sb)->s_mb_prefetch;
        u64 start_time;

        if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
                elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios);
                ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr);
                trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr);
                if (group >= elr->lr_next_group) {
                        ret = 1;
                        if (elr->lr_first_not_zeroed != ngroups &&
                            !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
                                elr->lr_next_group = elr->lr_first_not_zeroed;
                                elr->lr_mode = EXT4_LI_MODE_ITABLE;
                                ret = 0;
                        }
                }
                return ret;
        }

        for (; group < ngroups; group++) {
                gdp = ext4_get_group_desc(sb, group, NULL);
                if (!gdp) {
                        ret = 1;
                        break;
                }

                if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
                        break;
        }

        if (group >= ngroups)
                ret = 1;

        if (!ret) {
                start_time = ktime_get_real_ns();
                ret = ext4_init_inode_table(sb, group,
                                            elr->lr_timeout ? 0 : 1);
                trace_ext4_lazy_itable_init(sb, group);
                if (elr->lr_timeout == 0) {
                        elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
                                EXT4_SB(elr->lr_super)->s_li_wait_mult);
                }
                elr->lr_next_sched = jiffies + elr->lr_timeout;
                elr->lr_next_group = group + 1;
        }
        return ret;
}

/*
 * Unlink @elr from the request list, clear the owning superblock's
 * s_li_request pointer and free the request.  A NULL @elr is ignored.
 * Should be called with li_list_mtx held.
 */
static void ext4_remove_li_request(struct ext4_li_request *elr)
{
        if (elr) {
                list_del(&elr->lr_request);
                EXT4_SB(elr->lr_super)->s_li_request = NULL;
                kfree(elr);
        }
}

/*
 * Drop the lazy-init request belonging to @sb, if any.  Takes ext4_li_mtx
 * and, when the global lazy-init state exists, its li_list_mtx around the
 * removal.
 */
static void ext4_unregister_li_request(struct super_block *sb)
{
        mutex_lock(&ext4_li_mtx);
        if (ext4_li_info) {
                mutex_lock(&ext4_li_info->li_list_mtx);
                ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
                mutex_unlock(&ext4_li_info->li_list_mtx);
        }
        mutex_unlock(&ext4_li_mtx);
}

static struct task_struct *ext4_lazyinit_task;

/*
 * This is the function where ext4lazyinit thread lives. It walks
 * through the request list searching for next scheduled filesystem.
 * When such a fs is found, run the lazy initialization request
 * (ext4_rn_li_request) and keep track of the time spend in this
 * function. Based on that time we compute next schedule time of
 * the request. When walking through the list is complete, compute
 * next waking time and put itself into sleep.
 */
static int ext4_lazyinit_thread(void *arg)
{
        struct ext4_lazy_init *eli = arg;
        struct list_head *pos, *n;
        struct ext4_li_request *elr;
        unsigned long next_wakeup, cur;

        BUG_ON(NULL == eli);
        set_freezable();

cont_thread:
        while (true) {
                next_wakeup = MAX_JIFFY_OFFSET;

                mutex_lock(&eli->li_list_mtx);
                if (list_empty(&eli->li_request_list)) {
                        mutex_unlock(&eli->li_list_mtx);
                        goto exit_thread;
                }
                list_for_each_safe(pos, n, &eli->li_request_list) {
                        int err = 0;
                        int progress = 0;
                        elr = list_entry(pos, struct ext4_li_request,
                                         lr_request);

                        if (time_before(jiffies, elr->lr_next_sched)) {
                                if (time_before(elr->lr_next_sched, next_wakeup))
                                        next_wakeup = elr->lr_next_sched;
                                continue;
                        }
                        if (down_read_trylock(&elr->lr_super->s_umount)) {
                                if (sb_start_write_trylock(elr->lr_super)) {
                                        progress = 1;
                                        /*
                                         * We hold sb->s_umount, sb can not
                                         * be removed from the list, it is
                                         * now safe to drop li_list_mtx
                                         */
                                        mutex_unlock(&eli->li_list_mtx);
                                        err = ext4_run_li_request(elr);
                                        sb_end_write(elr->lr_super);
                                        mutex_lock(&eli->li_list_mtx);
                                        n = pos->next;
                                }
                                up_read((&elr->lr_super->s_umount));
                        }
                        /* error, remove the lazy_init job */
                        if (err) {
                                ext4_remove_li_request(elr);
                                continue;
                        }
                        if (!progress) {
                                elr->lr_next_sched = jiffies +
                                        get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
                        }
                        if (time_before(elr->lr_next_sched, next_wakeup))
                                next_wakeup = elr->lr_next_sched;
                }
                mutex_unlock(&eli->li_list_mtx);

                try_to_freeze();

                cur = jiffies;
                if ((time_after_eq(cur, next_wakeup)) ||
                    (MAX_JIFFY_OFFSET == next_wakeup)) {
                        cond_resched();
                        continue;
                }

                schedule_timeout_interruptible(next_wakeup - cur);

                if (kthread_should_stop()) {
                        ext4_clear_request_list();
                        goto exit_thread;
                }
        }

exit_thread:
        /*
         * It looks like the request list is empty, but we need
         * to check it under the li_list_mtx lock, to prevent any
         * additions into it, and of course we should lock ext4_li_mtx
         * to atomically free the list and ext4_li_info, because at
         * this point another ext4 filesystem could be registering
         * new one.
         */
        mutex_lock(&ext4_li_mtx);
        mutex_lock(&eli->li_list_mtx);
        if (!list_empty(&eli->li_request_list)) {
                mutex_unlock(&eli->li_list_mtx);
                mutex_unlock(&ext4_li_mtx);
                goto cont_thread;
        }
        mutex_unlock(&eli->li_list_mtx);
        kfree(ext4_li_info);
        ext4_li_info = NULL;
        mutex_unlock(&ext4_li_mtx);

        return 0;
}

static void ext4_clear_request_list(void)
{
        struct list_head *pos, *n;
        struct ext4_li_request *elr;

        mutex_lock(&ext4_li_info->li_list_mtx);
        list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
                elr = list_entry(pos, struct ext4_li_request,
                                 lr_request);
                ext4_remove_li_request(elr);
        }
        mutex_unlock(&ext4_li_info->li_list_mtx);
}

/*
 * Start the "ext4lazyinit" kernel thread for ext4_li_info.
 *
 * On success EXT4_LAZYINIT_RUNNING is set in li_state and 0 is returned.
 * On failure all queued requests are dropped, ext4_li_info is freed and
 * reset to NULL, and the error from kthread_run() is returned.
 */
static int ext4_run_lazyinit_thread(void)
{
        int err;

        ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
                                         ext4_li_info, "ext4lazyinit");
        if (!IS_ERR(ext4_lazyinit_task)) {
                ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
                return 0;
        }

        err = PTR_ERR(ext4_lazyinit_task);
        ext4_clear_request_list();
        kfree(ext4_li_info);
        ext4_li_info = NULL;
        printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
                         "initialization thread\n",
                         err);
        return err;
}

/*
 * Decide whether running the inode table init thread makes sense.
 *
 * Returns the number of the first group whose descriptor does not have
 * EXT4_BG_INODE_ZEROED set.  If there is no such group, or group descriptor
 * checksums are not in use, the total number of groups is returned.
 * Groups whose descriptor cannot be obtained are skipped.
 */
static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
{
        ext4_group_t total = EXT4_SB(sb)->s_groups_count;
        ext4_group_t idx;

        if (!ext4_has_group_desc_csum(sb))
                return total;

        for (idx = 0; idx < total; idx++) {
                struct ext4_group_desc *desc;

                desc = ext4_get_group_desc(sb, idx, NULL);
                if (desc &&
                    !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
                        return idx;
        }

        return total;
}

/*
 * Allocate and initialize the global lazy-init state (empty request list,
 * list mutex, EXT4_LAZYINIT_QUIT set) and store it in ext4_li_info.
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int ext4_li_info_new(void)
{
        struct ext4_lazy_init *eli = kzalloc(sizeof(*eli), GFP_KERNEL);

        if (!eli)
                return -ENOMEM;

        mutex_init(&eli->li_list_mtx);
        INIT_LIST_HEAD(&eli->li_request_list);
        eli->li_state |= EXT4_LAZYINIT_QUIT;

        ext4_li_info = eli;
        return 0;
}

/*
 * Allocate a lazy-init request for @sb.  @start is recorded as the first
 * group whose inode table is not zeroed.  The request begins in bitmap
 * prefetch mode unless NO_PREFETCH_BLOCK_BITMAPS is set, in which case it
 * goes straight to inode table mode starting at @start.
 * Returns the new request, or NULL if the allocation fails.
 */
static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
                                            ext4_group_t start)
{
        struct ext4_li_request *req = kzalloc(sizeof(*req), GFP_KERNEL);

        if (!req)
                return NULL;

        req->lr_super = sb;
        req->lr_first_not_zeroed = start;
        if (!test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) {
                req->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
        } else {
                req->lr_mode = EXT4_LI_MODE_ITABLE;
                req->lr_next_group = start;
        }

        /*
         * Pick a random delay for the first run so that requests from
         * different filesystems do not all fire at once.
         */
        req->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
        return req;
}

/*
 * Register a lazy-init request for @sb, creating the global lazy-init
 * info and starting the lazy-init thread when they do not exist yet.
 *
 * If @sb already has a request, only its timeout is reset.  Nothing is
 * registered for a read-only filesystem, nor when block bitmap
 * prefetching is disabled and there is no inode table work to do.
 *
 * Returns 0 on success (including the "nothing to do" cases) or a
 * negative errno.
 */
int ext4_register_li_request(struct super_block *sb,
                             ext4_group_t first_not_zeroed)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t ngroups = sbi->s_groups_count;
        struct ext4_li_request *req = NULL;
        int err = 0;

        mutex_lock(&ext4_li_mtx);

        if (sbi->s_li_request) {
                /*
                 * s_li_wait_mult might have changed, so clear the
                 * timeout and let it be computed again.
                 */
                sbi->s_li_request->lr_timeout = 0;
                goto unlock;
        }

        if (sb_rdonly(sb))
                goto unlock;
        if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
            (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))
                goto unlock;

        req = ext4_li_request_new(sb, first_not_zeroed);
        if (!req) {
                err = -ENOMEM;
                goto unlock;
        }

        if (!ext4_li_info) {
                err = ext4_li_info_new();
                if (err)
                        goto unlock;
        }

        mutex_lock(&ext4_li_info->li_list_mtx);
        list_add(&req->lr_request, &ext4_li_info->li_request_list);
        mutex_unlock(&ext4_li_info->li_list_mtx);

        sbi->s_li_request = req;
        /*
         * The request now lives on the request list; its removal and
         * freeing are handled by ext4_clear_request_list from here on,
         * so drop our reference to keep the error path from freeing it.
         */
        req = NULL;

        if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING))
                err = ext4_run_lazyinit_thread();

unlock:
        mutex_unlock(&ext4_li_mtx);
        if (err)
                kfree(req);
        return err;
}

/*
 * Stop the lazy-init kthread if it is still around.  No locking is
 * needed here since this is called on module unload.
 */
static void ext4_destroy_lazyinit_thread(void)
{
        /* If the thread exited earlier there is nothing to be done. */
        if (ext4_li_info && ext4_lazyinit_task)
                kthread_stop(ext4_lazyinit_task);
}

/*
 * Bring the journal's checksum/async-commit feature flags in line with
 * the filesystem features and mount options.
 *
 * All journal checksum flags are cleared first.  The checksum flavour
 * (v3 with metadata_csum, v1 otherwise) is then set again when either
 * JOURNAL_ASYNC_COMMIT or JOURNAL_CHECKSUM is requested; the async
 * commit flag is set only for JOURNAL_ASYNC_COMMIT and cleared in every
 * other case.
 *
 * Returns the result of jbd2_journal_set_features() when it was called,
 * otherwise 1.
 */
static int set_journal_csum_feature_set(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int csum_compat = 0, csum_incompat = 0;
        int rc = 1;

        if (ext4_has_metadata_csum(sb))
                csum_incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; /* v3 */
        else
                csum_compat = JBD2_FEATURE_COMPAT_CHECKSUM;    /* v1 */

        jbd2_journal_clear_features(sbi->s_journal,
                        JBD2_FEATURE_COMPAT_CHECKSUM, 0,
                        JBD2_FEATURE_INCOMPAT_CSUM_V3 |
                        JBD2_FEATURE_INCOMPAT_CSUM_V2);

        if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
                return jbd2_journal_set_features(sbi->s_journal,
                                csum_compat, 0,
                                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
                                csum_incompat);

        if (test_opt(sb, JOURNAL_CHECKSUM))
                rc = jbd2_journal_set_features(sbi->s_journal,
                                csum_compat, 0,
                                csum_incompat);

        /* Async commit was not requested: make sure the flag is off. */
        jbd2_journal_clear_features(sbi->s_journal, 0, 0,
                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);

        return rc;
}

/*
 * Note: calculating the overhead so we can be compatible with
 * historical BSD practice is quite difficult in the face of
 * clusters/bigalloc.  This is because multiple metadata blocks from
 * different block groups can end up in the same allocation cluster.
 * Calculating the exact overhead in the face of clustered allocation
 * requires either O(all block bitmaps) in memory or O(number of block
 * groups**2) in time.  We will still calculate the overhead for
 * older file systems --- and if we come across a bigalloc file
 * system with zero in s_overhead_clusters the estimate will be close to
 * correct especially for very large cluster sizes --- but for newer
 * file systems, it's better to calculate this figure once at mkfs
 * time, and store it in the superblock.  If the superblock value is
 * present (even for non-bigalloc file systems), we will use it.
 *
 * @sb:  the filesystem being examined
 * @grp: the block group whose overhead is counted
 * @buf: scratch bitmap in which overhead clusters of @grp are marked;
 *       only used for bigalloc (presumably the caller passes it in
 *       zeroed -- the only caller in view clears it after each non-zero
 *       result)
 *
 * Without bigalloc the result is a plain sum: superblock copy (if any),
 * group descriptor blocks, reserved GDT blocks (only when the group has
 * a superblock copy), inode table blocks, plus two for the bitmaps.
 *
 * With bigalloc every group descriptor is walked and each bitmap or
 * inode table block that falls inside @grp is marked in @buf, followed
 * by @grp's own superblock copy and descriptor blocks.  The return value
 * is then the number of clusters marked in @buf, or 0 if nothing was
 * marked.
 */
static int count_overhead(struct super_block *sb, ext4_group_t grp,
                          char *buf)
{
        struct ext4_sb_info        *sbi = EXT4_SB(sb);
        struct ext4_group_desc        *gdp;
        ext4_fsblk_t                first_block, last_block, b;
        ext4_group_t                i, ngroups = ext4_get_groups_count(sb);
        int                        s, j, count = 0;
        int                        has_super = ext4_bg_has_super(sb, grp);

        if (!ext4_has_feature_bigalloc(sb))
                return (has_super + ext4_bg_num_gdb(sb, grp) +
                        (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) +
                        sbi->s_itb_per_group + 2);

        /*
         * Widen before multiplying: the group number times the blocks
         * per group must be computed in ext4_fsblk_t, otherwise the
         * product can wrap where unsigned long is only 32 bits wide.
         */
        first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
                ((ext4_fsblk_t)grp * EXT4_BLOCKS_PER_GROUP(sb));
        last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                /* Block bitmap of group i, if it is located inside grp. */
                b = ext4_block_bitmap(sb, gdp);
                if (b >= first_block && b <= last_block) {
                        ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
                        count++;
                }
                /* Inode bitmap of group i, if it is located inside grp. */
                b = ext4_inode_bitmap(sb, gdp);
                if (b >= first_block && b <= last_block) {
                        ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
                        count++;
                }
                /* Inode table of group i, if it lies inside grp. */
                b = ext4_inode_table(sb, gdp);
                if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
                        for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
                                int c = EXT4_B2C(sbi, b - first_block);
                                ext4_set_bit(c, buf);
                                count++;
                        }
                /* The remaining accounting applies to grp itself only. */
                if (i != grp)
                        continue;
                s = 0;
                if (ext4_bg_has_super(sb, grp)) {
                        ext4_set_bit(s++, buf);
                        count++;
                }
                j = ext4_bg_num_gdb(sb, grp);
                if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
                        ext4_error(sb, "Invalid number of block group "
                                   "descriptor blocks: %d", j);
                        /* Clamp so the marking below stays inside the group. */
                        j = EXT4_BLOCKS_PER_GROUP(sb) - s;
                }
                count += j;
                for (; j > 0; j--)
                        ext4_set_bit(EXT4_B2C(sbi, s++), buf);
        }
        if (!count)
                return 0;
        /* Clusters marked in buf == clusters per group minus clear bits. */
        return EXT4_CLUSTERS_PER_GROUP(sb) -
                ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
}

/*
 * Compute the filesystem overhead and stash it in sbi->s_overhead.
 *
 * The total is made up of everything before the first data block, the
 * per-group overhead reported by count_overhead(), and the blocks of an
 * internal journal.
 *
 * Returns 0 on success or -ENOMEM if the scratch page cannot be
 * allocated.
 */
int ext4_calculate_overhead(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        unsigned int journal_ino = le32_to_cpu(es->s_journal_inum);
        ext4_group_t nr_groups = ext4_get_groups_count(sb);
        ext4_fsblk_t total;
        ext4_group_t grp;
        char *bitmap;

        bitmap = (char *) get_zeroed_page(GFP_NOFS);
        if (!bitmap)
                return -ENOMEM;

        /*
         * The overhead (FS structures) is constant for a given
         * filesystem unless the number of block groups changes, so the
         * previous value is cached until it does.
         */

        /* Everything before first_data_block counts as overhead. */
        total = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));

        /* Accumulate the overhead found in each block group. */
        for (grp = 0; grp < nr_groups; grp++) {
                int grp_overhead = count_overhead(sb, grp, bitmap);

                total += grp_overhead;
                /* Wipe the scratch bitmap again before the next group. */
                if (grp_overhead)
                        memset(bitmap, 0, PAGE_SIZE);
                cond_resched();
        }

        /*
         * The internal journal blocks are added whether the journal has
         * been loaded or not.
         */
        if (sbi->s_journal && !sbi->s_journal_bdev_file) {
                total += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
        } else if (ext4_has_feature_journal(sb) && !sbi->s_journal &&
                   journal_ino) {
                /* A non-zero journal inode number means internal journal. */
                struct inode *jinode = ext4_get_journal_inode(sb, journal_ino);

                if (IS_ERR(jinode)) {
                        ext4_msg(sb, KERN_ERR, "can't get journal size");
                } else {
                        unsigned int jblocks =
                                jinode->i_size >> sb->s_blocksize_bits;

                        total += EXT4_NUM_B2C(sbi, jblocks);
                        iput(jinode);
                }
        }

        sbi->s_overhead = total;
        smp_wmb();
        free_page((unsigned long) bitmap);
        return 0;
}

/*
 * Initialise sbi->s_resv_clusters with the default reservation:
 * 2% of the filesystem's clusters, capped at 4096.
 */
static void ext4_set_resv_clusters(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t reserve;

        /*
         * Without extents nothing has to be reserved: the space
         * estimates are exact, there are no unwritten extents, hole
         * punching doesn't need new metadata...  This matters especially
         * for keeping ext2/3 backward compatibility.
         */
        if (!ext4_has_feature_extents(sb))
                return;

        /*
         * The reservation covers situations where running out of space
         * cannot be afforded, for example punch hole or converting
         * unwritten extents in the delalloc path.  Such allocations
         * mostly need 1 or 2 blocks; higher numbers are very rare.
         */
        reserve = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
        do_div(reserve, 50);
        if (reserve > 4096)
                reserve = 4096;

        atomic64_set(&sbi->s_resv_clusters, reserve);
}

/*
 * Return a short string naming the quota mode of @sb: "none",
 * "journalled" or "writeback" when CONFIG_QUOTA is enabled, and
 * "disabled" otherwise.
 */
static const char *ext4_quota_mode(struct super_block *sb)
{
#ifdef CONFIG_QUOTA
        if (!ext4_quota_capable(sb))
                return "none";

        /* Journalled quota needs both a journal and journalled quota files. */
        return (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) ?
                "journalled" : "writeback";
#else
        return "disabled";
#endif
}

/*
 * Install @trigger as the t_frozen callback of the journal trigger slot
 * @type of @sb, and record @sb in that slot.
 */
static void ext4_setup_csum_trigger(struct super_block *sb,
                                    enum ext4_journal_trigger_type type,
                                    void (*trigger)(
                                        struct jbd2_buffer_trigger_type *type,
                                        struct buffer_head *bh,
                                        void *mapped_data,
                                        size_t size))
{
        EXT4_SB(sb)->s_journal_triggers[type].tr_triggers.t_frozen = trigger;
        EXT4_SB(sb)->s_journal_triggers[type].sb = sb;
}

/*
 * Release an ext4_sb_info: its blockgroup lock, its DAX device
 * reference and the structure itself.  A NULL @sbi is accepted and
 * ignored.
 */
static void ext4_free_sbi(struct ext4_sb_info *sbi)
{
        if (sbi) {
                kfree(sbi->s_blockgroup_lock);
                fs_put_dax(sbi->s_daxdev, NULL);
                kfree(sbi);
        }
}

/*
 * Allocate a zeroed ext4_sb_info for @sb together with its blockgroup
 * lock, look up the DAX device for the backing block device, and link
 * the two structures to each other.
 *
 * Returns the new structure, or NULL if an allocation fails (in which
 * case nothing stays allocated and @sb is left untouched).
 */
static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
{
        struct ext4_sb_info *info = kzalloc(sizeof(*info), GFP_KERNEL);

        if (!info)
                return NULL;

        info->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &info->s_dax_part_off,
                                            NULL, NULL);

        info->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock),
                                          GFP_KERNEL);
        if (!info->s_blockgroup_lock) {
                /* Undo the DAX lookup before giving up. */
                fs_put_dax(info->s_daxdev, NULL);
                kfree(info);
                return NULL;
        }

        info->s_sb = sb;
        sb->s_fs_info = info;
        return info;
}

static void ext4_set_def_opts(struct super_block *sb,
                              struct ext4_super_block *es)
{
        unsigned long def_mount_opts;

        /* Set defaults before we parse the mount options */
        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
        set_opt(sb, INIT_INODE_TABLE);
        if (def_mount_opts & EXT4_DEFM_DEBUG)
                set_opt(sb, DEBUG);
        if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
                set_opt(sb, GRPID);
        if (def_mount_opts & EXT4_DEFM_UID16)
                set_opt(sb, NO_UID32);
        /* xattr user namespace & acls are now defaulted on */
        set_opt(sb, XATTR_USER);
#ifdef CONFIG_EXT4_FS_POSIX_ACL
        set_opt(sb, POSIX_ACL);
#endif
        if (ext4_has_feature_fast_commit(sb))
                set_opt2(sb, JOURNAL_FAST_COMMIT);
        /* don't forget to enable journal_csum when metadata_csum is enabled. */
        if (ext4_has_metadata_csum(sb))
                set_opt(sb, JOURNAL_CHECKSUM);

        if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
                set_opt(sb, JOURNAL_DATA);
        else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
                set_opt(sb, ORDERED_DATA);
        else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
                set_opt(sb, WRITEBACK_DATA);

        if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC)
                set_opt(sb, ERRORS_PANIC);
        else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE)
                set_opt(sb, ERRORS_CONT);
        else
                set_opt(sb, ERRORS_RO);
        /* block_validity enabled by default; disable with noblock_validity */
        set_opt(sb, BLOCK_VALIDITY);
        if (def_mount_opts & EXT4_DEFM_DISCARD)
                set_opt(sb, DISCARD);

        if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
                set_opt(sb, BARRIER);

        /*
         * enable delayed allocation by default
         * Use -o nodelalloc to turn it off
         */
        if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
            ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
                set_opt(sb, DELALLOC);

        if (sb->s_blocksize <= PAGE_SIZE)
                set_opt(sb, DIOREAD_NOLOCK);
}

/*
 * Validate the cluster geometry recorded in the superblock and derive
 * s_cluster_bits, s_clusters_per_group and s_cluster_ratio from it.
 * Also sets the STD_GROUP_SIZE option when a group holds exactly
 * clustersize * 8 blocks.
 *
 * Returns 0 on success or -EINVAL (after logging the reason) when the
 * on-disk values are inconsistent.
 */
static int ext4_handle_clustersize(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        unsigned long ratio;
        int clustersize;

        clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);

        if (!ext4_has_feature_bigalloc(sb)) {
                /* No bigalloc: a cluster is exactly one block. */
                if (clustersize != sb->s_blocksize) {
                        ext4_msg(sb, KERN_ERR,
                                 "fragment/cluster size (%d) != block size (%lu)",
                                 clustersize, sb->s_blocksize);
                        return -EINVAL;
                }
                if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
                        ext4_msg(sb, KERN_ERR,
                                 "#blocks per group too big: %lu",
                                 sbi->s_blocks_per_group);
                        return -EINVAL;
                }
                sbi->s_cluster_bits = 0;
        } else {
                if (clustersize < sb->s_blocksize) {
                        ext4_msg(sb, KERN_ERR,
                                 "cluster size (%d) smaller than block size (%lu)",
                                 clustersize, sb->s_blocksize);
                        return -EINVAL;
                }
                sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
                        le32_to_cpu(es->s_log_block_size);
        }

        sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group);
        if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
                ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu",
                         sbi->s_clusters_per_group);
                return -EINVAL;
        }

        /* Blocks per cluster; blocks and clusters per group must agree. */
        ratio = clustersize / sb->s_blocksize;
        if (sbi->s_blocks_per_group != (sbi->s_clusters_per_group * ratio)) {
                ext4_msg(sb, KERN_ERR,
                         "blocks per group (%lu) and clusters per group (%lu) inconsistent",
                         sbi->s_blocks_per_group, sbi->s_clusters_per_group);
                return -EINVAL;
        }
        sbi->s_cluster_ratio = ratio;

        /* Is this the standard group size of clustersize * 8 blocks? */
        if (sbi->s_blocks_per_group == clustersize << 3)
                set_opt2(sb, STD_GROUP_SIZE);

        return 0;
}

/*
 * Put all fast-commit state held in the ext4 superblock info of @sb
 * into its initial, empty condition.
 */
static void ext4_fast_commit_init(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* Lock, counters and statistics. */
        spin_lock_init(&sbi->s_fc_lock);
        atomic_set(&sbi->s_fc_subtid, 0);
        sbi->s_fc_bytes = 0;
        sbi->s_fc_ineligible_tid = 0;
        ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
        memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));

        /* Main and staging queues. */
        INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
        INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
        INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
        INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);

        /* Replay bookkeeping starts out with no regions and no inodes. */
        sbi->s_fc_replay_state.fc_regions = NULL;
        sbi->s_fc_replay_state.fc_regions_size = 0;
        sbi->s_fc_replay_state.fc_regions_used = 0;
        sbi->s_fc_replay_state.fc_regions_valid = 0;
        sbi->s_fc_replay_state.fc_modified_inodes = NULL;
        sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
        sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
}

static int ext4_inode_info_init(struct super_block *sb,
                                struct ext4_super_block *es)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
                sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
                sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
        } else {
                sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
                sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
                if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
                        ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
                                 sbi->s_first_ino);
                        return -EINVAL;
                }
                if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
                    (!is_power_of_2(sbi->s_inode_size)) ||
                    (sbi->s_inode_size > sb->s_blocksize)) {
                        ext4_msg(sb, KERN_ERR,
                               "unsupported inode size: %d",
                               sbi->s_inode_size);
                        ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize);
                        return -EINVAL;
                }
                /*
                 * i_atime_extra is the last extra field available for
                 * [acm]times in struct ext4_inode. Checking for that
                 * field should suffice to ensure we have extra space
                 * for all three.
                 */
                if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) +
                        sizeof(((struct ext4_inode *)0)->i_atime_extra)) {
                        sb->s_time_gran = 1;
                        sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX;
                } else {
                        sb->s_time_gran = NSEC_PER_SEC;
                        sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX;
                }
                sb->s_time_min = EXT4_TIMESTAMP_MIN;
        }

        if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
                sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
                        EXT4_GOOD_OLD_INODE_SIZE;
                if (ext4_has_feature_extra_isize(sb)) {
                        unsigned v, max = (sbi->s_inode_size -
                                           EXT4_GOOD_OLD_INODE_SIZE);

                        v = le16_to_cpu(es->s_want_extra_isize);
                        if (v > max) {
                                ext4_msg(sb, KERN_ERR,
                                         "bad s_want_extra_isize: %d", v);
                                return -EINVAL;
                        }
                        if (sbi->s_want_extra_isize < v)
                                sbi->s_want_extra_isize = v;

                        v = le16_to_cpu(es->s_min_extra_isize);
                        if (v > max) {
                                ext4_msg(sb, KERN_ERR,
                                         "bad s_min_extra_isize: %d", v);
                                return -EINVAL;
                        }
                        if (sbi->s_want_extra_isize < v)
                                sbi->s_want_extra_isize = v;
                }
        }

        return 0;
}

#if IS_ENABLED(CONFIG_UNICODE)
/*
 * Load the filename encoding named by the superblock @es and attach it
 * to @sb.  Nothing is done when the casefold feature is off or when
 * @sb already has an encoding.
 *
 * Returns 0 on success (or when there is nothing to do) and -EINVAL if
 * the encoding is unknown or cannot be loaded.
 */
static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
{
        const struct ext4_sb_encodings *info;
        struct unicode_map *map;
        __u16 flags = le16_to_cpu(es->s_encoding_flags);

        if (!ext4_has_feature_casefold(sb) || sb->s_encoding)
                return 0;

        info = ext4_sb_read_encoding(es);
        if (!info) {
                ext4_msg(sb, KERN_ERR,
                        "Encoding requested by superblock is unknown");
                return -EINVAL;
        }

        map = utf8_load(info->version);
        if (IS_ERR(map)) {
                ext4_msg(sb, KERN_ERR,
                        "can't mount with superblock charset: %s-%u.%u.%u "
                        "not supported by the kernel. flags: 0x%x.",
                        info->name,
                        unicode_major(info->version),
                        unicode_minor(info->version),
                        unicode_rev(info->version),
                        flags);
                return -EINVAL;
        }

        ext4_msg(sb, KERN_INFO,
                "Using encoding defined by superblock: "
                "%s-%u.%u.%u with flags 0x%hx",
                info->name,
                unicode_major(info->version),
                unicode_minor(info->version),
                unicode_rev(info->version),
                flags);

        sb->s_encoding_flags = flags;
        sb->s_encoding = map;

        return 0;
}
#else
/* Without CONFIG_UNICODE there is no encoding to set up. */
static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
{
        return 0;
}
#endif

/*
 * Set up metadata checksumming for @sb: check the checksum algorithm
 * named by the superblock @es, install the orphan-file journal trigger,
 * load the crc32c driver, verify the superblock checksum and precompute
 * the checksum seed.
 *
 * Returns 0 on success, -EINVAL for an unknown checksum algorithm,
 * -EFSBADCRC for a bad superblock checksum, or the error reported when
 * the crc32c driver cannot be loaded.
 */
static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* metadata_csum and gdt_csum together only deserve a warning. */
        if (ext4_has_feature_metadata_csum(sb) &&
            ext4_has_feature_gdt_csum(sb))
                ext4_warning(sb,
                             "metadata_csum and uninit_bg are redundant flags; please run fsck.");

        /* The checksum algorithm has to be one we know. */
        if (!ext4_verify_csum_type(sb, es)) {
                ext4_msg(sb, KERN_ERR,
                         "VFS: Found ext4 filesystem with unknown checksum algorithm.");
                return -EINVAL;
        }

        ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
                                ext4_orphan_file_block_trigger);

        /* Load the checksum driver; never leave an error pointer behind. */
        sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
        if (IS_ERR(sbi->s_chksum_driver)) {
                int err = PTR_ERR(sbi->s_chksum_driver);

                ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
                sbi->s_chksum_driver = NULL;
                return err;
        }

        /* Verify the superblock's own checksum. */
        if (!ext4_superblock_csum_verify(sb, es)) {
                ext4_msg(sb, KERN_ERR,
                         "VFS: Found ext4 filesystem with invalid superblock checksum.  Run e2fsck?");
                return -EFSBADCRC;
        }

        /*
         * Precompute the checksum seed for all metadata: taken from the
         * superblock when csum_seed is set, otherwise derived from the
         * UUID when metadata_csum or ea_inode needs it.
         */
        if (ext4_has_feature_csum_seed(sb)) {
                sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
                return 0;
        }

        if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
                sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
                                               sizeof(es->s_uuid));

        return 0;
}

/*
 * Check that the feature flags of the on-disk superblock @es can be
 * supported for this mount of @sb.
 *
 * @silent: when non-zero, stay quiet if an ext2/ext3 mount attempt is
 *          rejected but the filesystem looks mountable as ext4 (used
 *          while probing).
 *
 * Besides validating, this sets the HURD_COMPAT option for Hurd-created
 * filesystems and the EXT4_FLAGS_BDEV_IS_DAX flag when a DAX device is
 * present and the block size equals PAGE_SIZE.
 *
 * Returns 0 if the filesystem can be mounted, -EINVAL otherwise.
 */
static int ext4_check_feature_compatibility(struct super_block *sb,
                                            struct ext4_super_block *es,
                                            int silent)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* Feature flags on a revision 0 filesystem only warrant a warning. */
        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
            (ext4_has_compat_features(sb) ||
             ext4_has_ro_compat_features(sb) ||
             ext4_has_incompat_features(sb)))
                ext4_msg(sb, KERN_WARNING,
                       "feature flags set on rev 0 fs, "
                       "running e2fsck is recommended");

        if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
                set_opt2(sb, HURD_COMPAT);
                if (ext4_has_feature_64bit(sb)) {
                        ext4_msg(sb, KERN_ERR,
                                 "The Hurd can't support 64-bit file systems");
                        return -EINVAL;
                }

                /*
                 * ea_inode feature uses l_i_version field which is not
                 * available in HURD_COMPAT mode.
                 */
                if (ext4_has_feature_ea_inode(sb)) {
                        ext4_msg(sb, KERN_ERR,
                                 "ea_inode feature is not supported for Hurd");
                        return -EINVAL;
                }
        }

        if (IS_EXT2_SB(sb)) {
                if (ext2_feature_set_ok(sb))
                        ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
                                 "using the ext4 subsystem");
                else {
                        /*
                         * If we're probing be silent, if this looks like
                         * it's actually an ext[34] filesystem.
                         */
                        if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
                                return -EINVAL;
                        ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
                                 "to feature incompatibilities");
                        return -EINVAL;
                }
        }

        if (IS_EXT3_SB(sb)) {
                if (ext3_feature_set_ok(sb))
                        ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
                                 "using the ext4 subsystem");
                else {
                        /*
                         * If we're probing be silent, if this looks like
                         * it's actually an ext4 filesystem.
                         */
                        if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
                                return -EINVAL;
                        ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
                                 "to feature incompatibilities");
                        return -EINVAL;
                }
        }

        /*
         * Check feature flags regardless of the revision level, since we
         * previously didn't change the revision level when setting the flags,
         * so there is a chance incompat flags are set on a rev 0 filesystem.
         */
        if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
                return -EINVAL;

        /*
         * A DAX device is usable only when the block size equals the
         * page size.  A mismatch is merely reported here; it becomes
         * fatal below if DAX was explicitly requested.
         */
        if (sbi->s_daxdev) {
                if (sb->s_blocksize == PAGE_SIZE)
                        set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
                else
                        /* No trailing newline, like every other ext4_msg() call here. */
                        ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX");
        }

        if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
                if (ext4_has_feature_inline_data(sb)) {
                        ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
                                        " that may contain inline data");
                        return -EINVAL;
                }
                if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) {
                        ext4_msg(sb, KERN_ERR,
                                "DAX unsupported by block device.");
                        return -EINVAL;
                }
        }

        /* Only encryption level 0 is accepted. */
        if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
                ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
                         es->s_encryption_level);
                return -EINVAL;
        }

        return 0;
}

static int ext4_check_geometry(struct super_block *sb,
                               struct ext4_super_block *es)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        __u64 blocks_count;
        int err;

        if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) {
                ext4_msg(sb, KERN_ERR,
                         "Number of reserved GDT blocks insanely large: %d",
                         le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
                return -EINVAL;
        }
        /*
         * Test whether we have more sectors than will fit in sector_t,
         * and whether the max offset is addressable by the page cache.
         */
        err = generic_check_addressable(sb->s_blocksize_bits,
                                        ext4_blocks_count(es));
        if (err) {
                ext4_msg(sb, KERN_ERR, "filesystem"
                         " too large to mount safely on this system");
                return err;
        }

        /* check blocks count against device size */
        blocks_count = sb_bdev_nr_blocks(sb);
        if (blocks_count && ext4_blocks_count(es) > blocks_count) {
                ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
                       "exceeds size of device (%llu blocks)",
                       ext4_blocks_count(es), blocks_count);
                return -EINVAL;
        }

        /*
         * It makes no sense for the first data block to be beyond the end
         * of the filesystem.
         */
        if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
                ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
                         "block %u is beyond end of filesystem (%llu)",
                         le32_to_cpu(es->s_first_data_block),
                         ext4_blocks_count(es));
                return -EINVAL;
        }
        if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
            (sbi->s_cluster_ratio == 1)) {
                ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
                         "block is 0 with a 1k block and cluster size");
                return -EINVAL;
        }

        blocks_count = (ext4_blocks_count(es) -
                        le32_to_cpu(es->s_first_data_block) +
                        EXT4_BLOCKS_PER_GROUP(sb) - 1);
        do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
        if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
                ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
                       "(block count %llu, first data block %u, "
                       "blocks per group %lu)", blocks_count,
                       ext4_blocks_count(es),
                       le32_to_cpu(es->s_first_data_block),
                       EXT4_BLOCKS_PER_GROUP(sb));
                return -EINVAL;
        }
        sbi->s_groups_count = blocks_count;
        sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
                        (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
        if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
            le32_to_cpu(es->s_inodes_count)) {
                ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
                         le32_to_cpu(es->s_inodes_count),
                         ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
                return -EINVAL;
        }

        return 0;
}

/*
 * Allocate the array of group descriptor buffer heads and fill it by reading
 * every group descriptor block.
 *
 * The number of descriptor blocks is derived from sbi->s_groups_count.  When
 * the meta_bg feature is enabled, an s_first_meta_bg larger than that count
 * is rejected.  All descriptor blocks are first queued for readahead and are
 * then read one at a time; if a read fails, sbi->s_gdb_count is set to the
 * number of buffers stored so far.  Once everything is loaded the descriptors
 * are validated with ext4_check_descriptors().
 *
 * Returns 0 on success, -EINVAL for a bad s_first_meta_bg, -ENOMEM if the
 * array cannot be allocated, the read error for an unreadable descriptor
 * block, or -EFSCORRUPTED if the descriptor check fails.
 */
static int ext4_group_desc_init(struct super_block *sb,
                                struct ext4_super_block *es,
                                ext4_fsblk_t logical_sb_block,
                                ext4_group_t *first_not_zeroed)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned int nr_gdb;
        int idx;

        /* Round up: a partially filled descriptor block still counts. */
        nr_gdb = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
                 EXT4_DESC_PER_BLOCK(sb);

        if (ext4_has_feature_meta_bg(sb) &&
            le32_to_cpu(es->s_first_meta_bg) > nr_gdb) {
                ext4_msg(sb, KERN_WARNING,
                         "first meta block group too large: %u "
                         "(group descriptor block count %u)",
                         le32_to_cpu(es->s_first_meta_bg), nr_gdb);
                return -EINVAL;
        }

        rcu_assign_pointer(sbi->s_group_desc,
                           kvmalloc_array(nr_gdb,
                                          sizeof(struct buffer_head *),
                                          GFP_KERNEL));
        if (!sbi->s_group_desc) {
                ext4_msg(sb, KERN_ERR, "not enough memory");
                return -ENOMEM;
        }

        bgl_lock_init(sbi->s_blockgroup_lock);

        /* First pass: start readahead on every descriptor block. */
        for (idx = 0; idx < nr_gdb; idx++)
                ext4_sb_breadahead_unmovable(sb,
                                descriptor_loc(sb, logical_sb_block, idx));

        /* Second pass: read each block and remember its buffer head. */
        for (idx = 0; idx < nr_gdb; idx++) {
                struct buffer_head *gd_bh;

                gd_bh = ext4_sb_bread_unmovable(sb,
                                descriptor_loc(sb, logical_sb_block, idx));
                if (IS_ERR(gd_bh)) {
                        ext4_msg(sb, KERN_ERR,
                               "can't read group descriptor %d", idx);
                        /* Only the first idx slots hold valid buffers. */
                        sbi->s_gdb_count = idx;
                        return PTR_ERR(gd_bh);
                }
                rcu_read_lock();
                rcu_dereference(sbi->s_group_desc)[idx] = gd_bh;
                rcu_read_unlock();
        }
        sbi->s_gdb_count = nr_gdb;

        if (ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed))
                return 0;

        ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
        return -EFSCORRUPTED;
}

/*
 * Load the journal and configure it for this mount.
 *
 * After ext4_load_journal() succeeds, the 64-bit, checksum and fast-commit
 * journal features are enabled as required, the data journaling mode is
 * chosen or validated against the journal's capabilities, and the journal
 * thread's I/O priority and inode data buffer callbacks are installed.
 *
 * Returns 0 on success or the error from ext4_load_journal().  Any failure
 * after the journal was loaded destroys the journal, clears sbi->s_journal
 * and returns -EINVAL.
 */
static int ext4_load_and_init_journal(struct super_block *sb,
                                      struct ext4_super_block *es,
                                      struct ext4_fs_context *ctx)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned int data_mode;
        int ret;

        ret = ext4_load_journal(sb, es, ctx->journal_devnum);
        if (ret)
                return ret;

        if (ext4_has_feature_64bit(sb) &&
            !jbd2_journal_set_features(sbi->s_journal, 0, 0,
                                       JBD2_FEATURE_INCOMPAT_64BIT)) {
                ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
                goto destroy_journal;
        }

        if (!set_journal_csum_feature_set(sb)) {
                ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
                         "feature set");
                goto destroy_journal;
        }

        if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
            !jbd2_journal_set_features(sbi->s_journal, 0, 0,
                                       JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
                ext4_msg(sb, KERN_ERR,
                        "Failed to set fast commit journal feature");
                goto destroy_journal;
        }

        /*
         * The journal features are final at this point, so the data
         * journaling mode can be validated against them.
         */
        data_mode = test_opt(sb, DATA_FLAGS);
        if (data_mode == 0) {
                /*
                 * No mode requested: default to ORDERED_DATA when the
                 * journal supports revoke, otherwise to JOURNAL_DATA.
                 */
                if (jbd2_journal_check_available_features
                    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
                        set_opt(sb, ORDERED_DATA);
                        sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
                } else {
                        set_opt(sb, JOURNAL_DATA);
                        sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
                }
        } else if (data_mode == EXT4_MOUNT_ORDERED_DATA ||
                   data_mode == EXT4_MOUNT_WRITEBACK_DATA) {
                if (!jbd2_journal_check_available_features
                    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
                        ext4_msg(sb, KERN_ERR, "Journal does not support "
                               "requested data journaling mode");
                        goto destroy_journal;
                }
        }

        /* Re-read the mode: the default selection above may have set it. */
        if (test_opt(sb, JOURNAL_ASYNC_COMMIT) &&
            test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
                ext4_msg(sb, KERN_ERR, "can't mount with "
                        "journal_async_commit in data=ordered mode");
                goto destroy_journal;
        }

        set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);

        sbi->s_journal->j_submit_inode_data_buffers =
                ext4_journal_submit_inode_data_buffers;
        sbi->s_journal->j_finish_inode_data_buffers =
                ext4_journal_finish_inode_data_buffers;

        return 0;

destroy_journal:
        /* s_sb_upd_work must be flushed before the journal goes away. */
        flush_work(&sbi->s_sb_upd_work);
        jbd2_journal_destroy(sbi->s_journal);
        sbi->s_journal = NULL;
        return -EINVAL;
}

/*
 * Reconcile the mount options with the selected data journaling mode.
 *
 * For any mode other than data=journal, only SB_I_CGROUPWB is set on the
 * superblock.  For data=journal, a one-time warning is printed, the
 * dioread_nolock and fast-commit options are cleared, explicit delalloc and
 * dax are rejected, and delayed allocation is switched off.
 *
 * Returns 0 on success or -EINVAL for an incompatible option combination.
 */
static int ext4_check_journal_data_mode(struct super_block *sb)
{
        if (test_opt(sb, DATA_FLAGS) != EXT4_MOUNT_JOURNAL_DATA) {
                sb->s_iflags |= SB_I_CGROUPWB;
                return 0;
        }

        printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with "
                    "data=journal disables delayed allocation, "
                    "dioread_nolock, O_DIRECT and fast_commit support!\n");

        /* data=journal cannot be combined with dioread_nolock. */
        clear_opt(sb, DIOREAD_NOLOCK);
        clear_opt2(sb, JOURNAL_FAST_COMMIT);

        if (test_opt2(sb, EXPLICIT_DELALLOC)) {
                ext4_msg(sb, KERN_ERR, "can't mount with "
                         "both data=journal and delalloc");
                return -EINVAL;
        }

        if (test_opt(sb, DAX_ALWAYS)) {
                ext4_msg(sb, KERN_ERR, "can't mount with "
                         "both data=journal and dax");
                return -EINVAL;
        }

        if (ext4_has_feature_encrypt(sb))
                ext4_msg(sb, KERN_WARNING,
                         "encrypted files will use data=ordered "
                         "instead of data journaling mode");

        /* Delalloc that was not requested explicitly is dropped silently. */
        if (test_opt(sb, DELALLOC))
                clear_opt(sb, DELALLOC);

        return 0;
}

/*
 * Read the on-disk superblock and settle on the filesystem block size.
 *
 * The superblock is first read with the minimum block size the device
 * allows.  After the magic number and the log block/cluster sizes have been
 * checked, the real block size is taken from the superblock; if it differs
 * from the one currently set on @sb, the buffer is dropped, the block size is
 * switched and the superblock is read again.
 *
 * On success, *lsb holds the block number containing the superblock,
 * sbi->s_sbh holds its buffer head, sbi->s_es points at the superblock inside
 * that buffer, and 0 is returned.  On failure a negative errno is returned
 * (-EINVAL unless a read error supplies a more specific code) and the buffer
 * is released.
 */
static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
                           int silent)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es;
        struct buffer_head *sb_bh;
        ext4_fsblk_t sb_blk;
        unsigned long sb_off = 0;
        int bsize;
        int ret = -EINVAL;

        bsize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
        if (!bsize) {
                ext4_msg(sb, KERN_ERR, "unable to set blocksize");
                return -EINVAL;
        }

        /*
         * Only with 1kB blocks does the superblock start at a buffer
         * boundary; for anything larger work out which block holds it and
         * the byte offset inside that block.
         */
        if (bsize == EXT4_MIN_BLOCK_SIZE) {
                sb_blk = sbi->s_sb_block;
        } else {
                sb_blk = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
                sb_off = do_div(sb_blk, bsize);
        }

        sb_bh = ext4_sb_bread_unmovable(sb, sb_blk);
        if (IS_ERR(sb_bh)) {
                ext4_msg(sb, KERN_ERR, "unable to read superblock");
                return PTR_ERR(sb_bh);
        }

        /*
         * s_es is published right away because some ext4 macros depend on
         * its value.
         */
        es = (struct ext4_super_block *) (sb_bh->b_data + sb_off);
        sbi->s_es = es;
        sb->s_magic = le16_to_cpu(es->s_magic);
        if (sb->s_magic != EXT4_SUPER_MAGIC) {
                if (!silent)
                        ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
                goto release;
        }

        if (le32_to_cpu(es->s_log_block_size) >
            (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
                ext4_msg(sb, KERN_ERR,
                         "Invalid log block size: %u",
                         le32_to_cpu(es->s_log_block_size));
                goto release;
        }
        if (le32_to_cpu(es->s_log_cluster_size) >
            (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
                ext4_msg(sb, KERN_ERR,
                         "Invalid log cluster size: %u",
                         le32_to_cpu(es->s_log_cluster_size));
                goto release;
        }

        bsize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);

        /*
         * When the block size used for the first read is not the real one,
         * the superblock has to be read again with the right size.
         */
        if (sb->s_blocksize != bsize) {
                /*
                 * The buffer must be dropped before kill_bdev() runs (it is
                 * called by sb_set_blocksize()); otherwise neither the
                 * buffer nor its page would be freed.
                 */
                brelse(sb_bh);
                /* Validate the filesystem blocksize */
                if (!sb_set_blocksize(sb, bsize)) {
                        ext4_msg(sb, KERN_ERR, "bad block size %d",
                                        bsize);
                        sb_bh = NULL;
                        goto release;
                }

                sb_blk = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
                sb_off = do_div(sb_blk, bsize);
                sb_bh = ext4_sb_bread_unmovable(sb, sb_blk);
                if (IS_ERR(sb_bh)) {
                        ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try");
                        ret = PTR_ERR(sb_bh);
                        sb_bh = NULL;
                        goto release;
                }
                es = (struct ext4_super_block *)(sb_bh->b_data + sb_off);
                sbi->s_es = es;
                if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
                        ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!");
                        goto release;
                }
        }

        *lsb = sb_blk;
        sbi->s_sbh = sb_bh;
        return 0;

release:
        brelse(sb_bh);
        return ret;
}

/*
 * Copy the directory hash parameters from the on-disk superblock into the
 * in-memory superblock info.
 *
 * The four hash seed words and the default hash version are always copied.
 * With the dir_index feature, the signedness of the hash is taken from the
 * superblock flags.  If neither signedness flag is recorded, the build's
 * native char signedness is used and, unless the mount is read-only, the
 * matching flag is set in the in-memory copy of the superblock.
 */
static void ext4_hash_info_init(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        unsigned int flags;
        unsigned int n;

        for (n = 0; n < 4; n++)
                sbi->s_hash_seed[n] = le32_to_cpu(es->s_hash_seed[n]);

        sbi->s_def_hash_version = es->s_def_hash_version;

        if (!ext4_has_feature_dir_index(sb))
                return;

        flags = le32_to_cpu(es->s_flags);
        if (flags & EXT2_FLAGS_UNSIGNED_HASH) {
                sbi->s_hash_unsigned = 3;
                return;
        }

        /* Signedness already recorded as signed: nothing to do. */
        if ((flags & EXT2_FLAGS_SIGNED_HASH) != 0)
                return;

        /* No flag recorded yet: fall back to this build's char signedness. */
#ifdef __CHAR_UNSIGNED__
        if (!sb_rdonly(sb))
                es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
        sbi->s_hash_unsigned = 3;
#else
        if (!sb_rdonly(sb))
                es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
#endif
}

/*
 * Derive the per-group and per-block layout parameters from the superblock.
 *
 * Sets the maximum file sizes (s_bitmap_maxbytes and sb->s_maxbytes), the
 * group descriptor size, blocks/inodes per group, inodes per block, inode
 * table blocks per group, descriptors per block, the mount state (with
 * EXT4_FC_REPLAY masked out) and the log2 values of addresses and
 * descriptors per block.
 *
 * @sb:     superblock being mounted; sbi->s_es must already be set
 * @silent: when non-zero, suppress the "Can't find ext4 filesystem" message
 *
 * Returns 0 on success, or -EINVAL when the descriptor size is unsupported
 * (64bit feature only), when inodes per block or blocks per group is zero,
 * or when inodes per group is out of range.
 */
static int ext4_block_group_meta_init(struct super_block *sb, int silent)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        int has_huge_files;

        has_huge_files = ext4_has_feature_huge_file(sb);
        sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
                                                      has_huge_files);
        sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);

        sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
        if (ext4_has_feature_64bit(sb)) {
                if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
                    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
                    !is_power_of_2(sbi->s_desc_size)) {
                        ext4_msg(sb, KERN_ERR,
                               "unsupported descriptor size %lu",
                               sbi->s_desc_size);
                        return -EINVAL;
                }
        } else {
                /* Without 64bit the on-disk value is ignored. */
                sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
        }

        sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
        sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);

        sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb);
        /* Both values are used as divisors later; zero means a bogus sb. */
        if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) {
                if (!silent)
                        ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
                return -EINVAL;
        }
        /*
         * A group needs at least one block's worth of inodes and at most
         * blocksize * 8 of them (presumably the number of bits in a single
         * bitmap block -- confirm against the inode bitmap layout).
         */
        if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
            sbi->s_inodes_per_group > sb->s_blocksize * 8) {
                /* No trailing newline: matches every other ext4_msg() call. */
                ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu",
                         sbi->s_inodes_per_group);
                return -EINVAL;
        }
        sbi->s_itb_per_group = sbi->s_inodes_per_group /
                                        sbi->s_inodes_per_block;
        sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb);
        sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY;
        sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
        sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));

        return 0;
}

/*
 * Core of the mount-time superblock setup.
 *
 * Loads and validates the on-disk superblock, applies the default and the
 * user-supplied mount options, checks feature compatibility and geometry,
 * reads the group descriptors, loads the journal when one is to be used (or
 * rejects journal-only options otherwise), creates the xattr caches, reads
 * the root inode and builds the root dentry, initializes mballoc and the
 * remaining per-superblock state, runs orphan cleanup and finally registers
 * the filesystem in sysfs.
 *
 * Returns 0 on success or a negative errno.  On failure the failed_mount*
 * labels at the end of the function undo the setup in reverse order of
 * initialization; every failure path ends at out_fail, which clears
 * sb->s_fs_info.  The ext4_sb_info itself is not freed here.
 */
static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
{
        struct ext4_super_block *es = NULL;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t logical_sb_block;
        struct inode *root;
        int needs_recovery;
        int err;
        ext4_group_t first_not_zeroed;
        struct ext4_fs_context *ctx = fc->fs_private;
        int silent = fc->sb_flags & SB_SILENT;

        /* Set defaults for the variables that will be set during parsing */
        if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
                ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;

        sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
        sbi->s_sectors_written_start =
                part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);

        /* Nothing to unwind yet if the superblock cannot be loaded. */
        err = ext4_load_super(sb, &logical_sb_block, silent);
        if (err)
                goto out_fail;

        es = sbi->s_es;
        sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);

        err = ext4_init_metadata_csum(sb, es);
        if (err)
                goto failed_mount;

        ext4_set_def_opts(sb, es);

        sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
        sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
        sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
        sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
        sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;

        /*
         * set default s_li_wait_mult for lazyinit, for the case there is
         * no mount option specified.
         */
        sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;

        err = ext4_inode_info_init(sb, es);
        if (err)
                goto failed_mount;

        err = parse_apply_sb_mount_options(sb, ctx);
        if (err < 0)
                goto failed_mount;

        /*
         * Snapshot the option words before the options from the fs context
         * are applied; the no-journal branch below compares against this
         * snapshot to detect an explicitly requested data= mode.
         */
        sbi->s_def_mount_opt = sbi->s_mount_opt;
        sbi->s_def_mount_opt2 = sbi->s_mount_opt2;

        err = ext4_check_opt_consistency(fc, sb);
        if (err < 0)
                goto failed_mount;

        ext4_apply_options(fc, sb);

        err = ext4_encoding_init(sb, es);
        if (err)
                goto failed_mount;

        err = ext4_check_journal_data_mode(sb);
        if (err)
                goto failed_mount;

        sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
                (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);

        /* i_version is always enabled now */
        sb->s_flags |= SB_I_VERSION;

        err = ext4_check_feature_compatibility(sb, es, silent);
        if (err)
                goto failed_mount;

        err = ext4_block_group_meta_init(sb, silent);
        if (err)
                goto failed_mount;

        ext4_hash_info_init(sb);

        err = ext4_handle_clustersize(sb);
        if (err)
                goto failed_mount;

        err = ext4_check_geometry(sb, es);
        if (err)
                goto failed_mount;

        /*
         * From here on failures must go through failed_mount3 or later,
         * which flush s_sb_upd_work and stop the s_err_report timer.
         */
        timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
        spin_lock_init(&sbi->s_error_lock);
        INIT_WORK(&sbi->s_sb_upd_work, update_super_work);

        err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
        if (err)
                goto failed_mount3;

        err = ext4_es_register_shrinker(sbi);
        if (err)
                goto failed_mount3;

        sbi->s_stripe = ext4_get_stripe_size(sbi);
        /*
         * It's hard to get stripe aligned blocks if stripe is not aligned with
         * cluster, just disable stripe and alert user to simplify code and avoid
         * stripe aligned allocation which will rarely succeed.
         */
        if (sbi->s_stripe > 0 && sbi->s_cluster_ratio > 1 &&
            sbi->s_stripe % sbi->s_cluster_ratio != 0) {
                ext4_msg(sb, KERN_WARNING,
                         "stripe (%lu) is not aligned with cluster size (%u), "
                         "stripe is disabled",
                         sbi->s_stripe, sbi->s_cluster_ratio);
                sbi->s_stripe = 0;
        }
        sbi->s_extent_max_zeroout_kb = 32;

        /*
         * set up enough so that it can read an inode
         */
        sb->s_op = &ext4_sops;
        sb->s_export_op = &ext4_export_ops;
        sb->s_xattr = ext4_xattr_handlers;
#ifdef CONFIG_FS_ENCRYPTION
        sb->s_cop = &ext4_cryptops;
#endif
#ifdef CONFIG_FS_VERITY
        sb->s_vop = &ext4_verityops;
#endif
#ifdef CONFIG_QUOTA
        sb->dq_op = &ext4_quota_operations;
        if (ext4_has_feature_quota(sb))
                sb->s_qcop = &dquot_quotactl_sysfile_ops;
        else
                sb->s_qcop = &ext4_qctl_operations;
        sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
#endif
        super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid));
        super_set_sysfs_name_bdev(sb);

        INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
        mutex_init(&sbi->s_orphan_lock);

        ext4_fast_commit_init(sb);

        sb->s_root = NULL;

        /*
         * Recovery is pending if there is an orphan list, an orphan file
         * marked present, or a journal flagged as needing recovery.
         */
        needs_recovery = (es->s_last_orphan != 0 ||
                          ext4_has_feature_orphan_present(sb) ||
                          ext4_has_feature_journal_needs_recovery(sb));

        if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) {
                err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block));
                if (err)
                        goto failed_mount3a;
        }

        /*
         * Default error code for the branches below that jump to
         * failed_mount3a after printing a message without setting err.
         */
        err = -EINVAL;
        /*
         * The first inode we look at is the journal inode.  Don't try
         * root first: it may be modified in the journal!
         */
        if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
                err = ext4_load_and_init_journal(sb, es, ctx);
                if (err)
                        goto failed_mount3a;
        } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
                   ext4_has_feature_journal_needs_recovery(sb)) {
                ext4_msg(sb, KERN_ERR, "required journal recovery "
                       "suppressed and not mounted read-only");
                goto failed_mount3a;
        } else {
                /* Nojournal mode, all journal mount options are illegal */
                if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "journal_async_commit, fs mounted w/o journal");
                        goto failed_mount3a;
                }

                if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "journal_checksum, fs mounted w/o journal");
                        goto failed_mount3a;
                }
                if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "commit=%lu, fs mounted w/o journal",
                                 sbi->s_commit_interval / HZ);
                        goto failed_mount3a;
                }
                /* Data mode bits differ from the pre-option snapshot. */
                if (EXT4_MOUNT_DATA_FLAGS &
                    (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "data=, fs mounted w/o journal");
                        goto failed_mount3a;
                }
                sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
                clear_opt(sb, JOURNAL_CHECKSUM);
                clear_opt(sb, DATA_FLAGS);
                clear_opt2(sb, JOURNAL_FAST_COMMIT);
                sbi->s_journal = NULL;
                needs_recovery = 0;
        }

        if (!test_opt(sb, NO_MBCACHE)) {
                sbi->s_ea_block_cache = ext4_xattr_create_cache();
                if (!sbi->s_ea_block_cache) {
                        ext4_msg(sb, KERN_ERR,
                                 "Failed to create ea_block_cache");
                        err = -EINVAL;
                        goto failed_mount_wq;
                }

                if (ext4_has_feature_ea_inode(sb)) {
                        sbi->s_ea_inode_cache = ext4_xattr_create_cache();
                        if (!sbi->s_ea_inode_cache) {
                                ext4_msg(sb, KERN_ERR,
                                         "Failed to create ea_inode_cache");
                                err = -EINVAL;
                                goto failed_mount_wq;
                        }
                }
        }

        /*
         * Get the # of file system overhead blocks from the
         * superblock if present.
         */
        sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
        /* ignore the precalculated value if it is ridiculous */
        if (sbi->s_overhead > ext4_blocks_count(es))
                sbi->s_overhead = 0;
        /*
         * If the bigalloc feature is not enabled recalculating the
         * overhead doesn't take long, so we might as well just redo
         * it to make sure we are using the correct value.
         */
        if (!ext4_has_feature_bigalloc(sb))
                sbi->s_overhead = 0;
        if (sbi->s_overhead == 0) {
                err = ext4_calculate_overhead(sb);
                if (err)
                        goto failed_mount_wq;
        }

        /*
         * The maximum number of concurrent works can be high and
         * concurrency isn't really necessary.  Limit it to 1.
         */
        EXT4_SB(sb)->rsv_conversion_wq =
                alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
        if (!EXT4_SB(sb)->rsv_conversion_wq) {
                printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
                err = -ENOMEM;
                goto failed_mount4;
        }

        /*
         * The jbd2_journal_load will have done any necessary log recovery,
         * so we can safely mount the rest of the filesystem now.
         */

        root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
        if (IS_ERR(root)) {
                ext4_msg(sb, KERN_ERR, "get root inode failed");
                err = PTR_ERR(root);
                root = NULL;
                goto failed_mount4;
        }
        if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
                ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
                iput(root);
                err = -EFSCORRUPTED;
                goto failed_mount4;
        }

        generic_set_sb_d_ops(sb);
        sb->s_root = d_make_root(root);
        if (!sb->s_root) {
                ext4_msg(sb, KERN_ERR, "get root dentry failed");
                err = -ENOMEM;
                goto failed_mount4;
        }

        /* -EROFS is not fatal here: the mount continues read-only. */
        err = ext4_setup_super(sb, es, sb_rdonly(sb));
        if (err == -EROFS) {
                sb->s_flags |= SB_RDONLY;
        } else if (err)
                goto failed_mount4a;

        ext4_set_resv_clusters(sb);

        if (test_opt(sb, BLOCK_VALIDITY)) {
                err = ext4_setup_system_zone(sb);
                if (err) {
                        ext4_msg(sb, KERN_ERR, "failed to initialize system "
                                 "zone (%d)", err);
                        goto failed_mount4a;
                }
        }
        ext4_fc_replay_cleanup(sb);

        ext4_ext_init(sb);

        /*
         * Enable optimize_scan if number of groups is > threshold. This can be
         * turned off by passing "mb_optimize_scan=0". This can also be
         * turned on forcefully by passing "mb_optimize_scan=1".
         */
        if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) {
                if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
                        set_opt2(sb, MB_OPTIMIZE_SCAN);
                else
                        clear_opt2(sb, MB_OPTIMIZE_SCAN);
        }

        err = ext4_mb_init(sb);
        if (err) {
                ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
                         err);
                goto failed_mount5;
        }

        /*
         * We can only set up the journal commit callback once
         * mballoc is initialized
         */
        if (sbi->s_journal)
                sbi->s_journal->j_commit_callback =
                        ext4_journal_commit_callback;

        err = ext4_percpu_param_init(sbi);
        if (err)
                goto failed_mount6;

        if (ext4_has_feature_flex_bg(sb))
                if (!ext4_fill_flex_info(sb)) {
                        ext4_msg(sb, KERN_ERR,
                               "unable to initialize "
                               "flex_bg meta info!");
                        err = -ENOMEM;
                        goto failed_mount6;
                }

        err = ext4_register_li_request(sb, first_not_zeroed);
        if (err)
                goto failed_mount6;

        err = ext4_init_orphan_info(sb);
        if (err)
                goto failed_mount7;
#ifdef CONFIG_QUOTA
        /* Enable quota usage during mount. */
        if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
                err = ext4_enable_quotas(sb);
                if (err)
                        goto failed_mount8;
        }
#endif  /* CONFIG_QUOTA */

        /*
         * Save the original bdev mapping's wb_err value which could be
         * used to detect the metadata async write error.
         */
        spin_lock_init(&sbi->s_bdev_wb_lock);
        errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err,
                                 &sbi->s_bdev_wb_err);
        /* EXT4_ORPHAN_FS is set only for the duration of orphan cleanup. */
        EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
        ext4_orphan_cleanup(sb, es);
        EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
        /*
         * Update the checksum after updating free space/inode counters and
         * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect
         * checksum in the buffer cache until it is written out and
         * e2fsprogs programs trying to open a file system immediately
         * after it is mounted can fail.
         */
        ext4_superblock_csum_set(sb);
        if (needs_recovery) {
                ext4_msg(sb, KERN_INFO, "recovery complete");
                err = ext4_mark_recovery_complete(sb, es);
                if (err)
                        goto failed_mount9;
        }

        if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev))
                ext4_msg(sb, KERN_WARNING,
                         "mounting with \"discard\" option, but the device does not support discard");

        /* Errors were recorded in the superblock: arm the error report timer. */
        if (es->s_error_count)
                mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */

        /* Enable message ratelimiting. Default is 10 messages per 5 secs. */
        ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
        ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
        ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
        atomic_set(&sbi->s_warning_count, 0);
        atomic_set(&sbi->s_msg_count, 0);

        /* Register sysfs after all initializations are complete. */
        err = ext4_register_sysfs(sb);
        if (err)
                goto failed_mount9;

        return 0;

        /*
         * Error unwinding.  Each label releases what was set up just before
         * the corresponding jump and then falls through to the labels for
         * earlier stages.
         */
failed_mount9:
        ext4_quotas_off(sb, EXT4_MAXQUOTAS);
        /* The only jump to failed_mount8 is inside #ifdef CONFIG_QUOTA. */
failed_mount8: __maybe_unused
        ext4_release_orphan_info(sb);
failed_mount7:
        ext4_unregister_li_request(sb);
failed_mount6:
        ext4_mb_release(sb);
        ext4_flex_groups_free(sbi);
        ext4_percpu_param_destroy(sbi);
failed_mount5:
        ext4_ext_release(sb);
        ext4_release_system_zone(sb);
failed_mount4a:
        dput(sb->s_root);
        sb->s_root = NULL;
failed_mount4:
        ext4_msg(sb, KERN_ERR, "mount failed");
        if (EXT4_SB(sb)->rsv_conversion_wq)
                destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
failed_mount_wq:
        ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
        sbi->s_ea_inode_cache = NULL;

        ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
        sbi->s_ea_block_cache = NULL;

        if (sbi->s_journal) {
                /* flush s_sb_upd_work before journal destroy. */
                flush_work(&sbi->s_sb_upd_work);
                jbd2_journal_destroy(sbi->s_journal);
                sbi->s_journal = NULL;
        }
failed_mount3a:
        ext4_es_unregister_shrinker(sbi);
failed_mount3:
        /* flush s_sb_upd_work before sbi destroy */
        flush_work(&sbi->s_sb_upd_work);
        del_timer_sync(&sbi->s_err_report);
        ext4_stop_mmpd(sbi);
        ext4_group_desc_free(sbi);
failed_mount:
        if (sbi->s_chksum_driver)
                crypto_free_shash(sbi->s_chksum_driver);

#if IS_ENABLED(CONFIG_UNICODE)
        utf8_unload(sb->s_encoding);
#endif

#ifdef CONFIG_QUOTA
        for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(get_qf_name(sb, sbi, i));
#endif
        fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
        brelse(sbi->s_sbh);
        if (sbi->s_journal_bdev_file) {
                invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
                bdev_fput(sbi->s_journal_bdev_file);
        }
out_fail:
        invalidate_bdev(sb->s_bdev);
        /* Detach sbi from the superblock; it is not freed in this function. */
        sb->s_fs_info = NULL;
        return err;
}

/*
 * Fill-super callback handed to get_tree_bdev() by ext4_get_tree().
 *
 * Allocates the ext4_sb_info for @sb, publishes it through fc->s_fs_info,
 * picks the superblock location (block 1 unless the mount context carries
 * an explicit s_sb_block) and lets __ext4_fill_super() do the actual mount.
 * A successful mount is announced with a rate-limited KERN_INFO message and
 * followed by ext4_update_overhead().
 *
 * Returns 0 on success, -ENOMEM when the sb_info cannot be allocated, or the
 * negative value reported by __ext4_fill_super(); in the latter case the
 * sb_info is freed again and fc->s_fs_info is reset to NULL.
 */
static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
{
        struct ext4_fs_context *ctx = fc->fs_private;
        struct ext4_sb_info *sbi = ext4_alloc_sbi(sb);
        const char *descr = "out journal";
        int ret;

        if (!sbi)
                return -ENOMEM;

        fc->s_fs_info = sbi;

        /* Cleanup superblock name */
        strreplace(sb->s_id, '/', '!');

        /* Default super block location is block 1 */
        sbi->s_sb_block = (ctx->spec & EXT4_SPEC_s_sb_block) ?
                                ctx->s_sb_block : 1;

        ret = __ext4_fill_super(fc, sb);
        if (ret < 0) {
                ext4_free_sbi(sbi);
                fc->s_fs_info = NULL;
                return ret;
        }

        /* The text is appended to "with", hence the odd-looking strings */
        if (sbi->s_journal) {
                switch (test_opt(sb, DATA_FLAGS)) {
                case EXT4_MOUNT_JOURNAL_DATA:
                        descr = " journalled data mode";
                        break;
                case EXT4_MOUNT_ORDERED_DATA:
                        descr = " ordered data mode";
                        break;
                default:
                        descr = " writeback data mode";
                        break;
                }
        }

        if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
                ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. "
                         "Quota mode: %s.", &sb->s_uuid,
                         sb_rdonly(sb) ? "ro" : "r/w", descr,
                         ext4_quota_mode(sb));

        /* Update the s_overhead_clusters if necessary */
        ext4_update_overhead(sb, false);
        return 0;
}

/*
 * Thin wrapper: calls get_tree_bdev() for @fc, passing ext4_fill_super() as
 * the callback, and returns whatever get_tree_bdev() returns.
 */
static int ext4_get_tree(struct fs_context *fc)
{
        return get_tree_bdev(fc, ext4_fill_super);
}

/*
 * Push the per-fs journal parameters into @journal.  This is done on the
 * initial mount, once the journal has been initialised but before any
 * recovery has run, and again on any subsequent remount.
 *
 * The commit interval and batch times are copied from the ext4_sb_info,
 * ext4_fc_init() is called, and then, under j_state_lock, the barrier and
 * abort-on-syncdata-error journal flags are made to mirror the BARRIER and
 * DATA_ERR_ABORT mount options.
 */
static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        journal->j_max_batch_time = sbi->s_max_batch_time;
        journal->j_min_batch_time = sbi->s_min_batch_time;
        journal->j_commit_interval = sbi->s_commit_interval;
        ext4_fc_init(sb, journal);

        write_lock(&journal->j_state_lock);

        if (!test_opt(sb, BARRIER))
                journal->j_flags &= ~JBD2_BARRIER;
        else
                journal->j_flags |= JBD2_BARRIER;

        if (!test_opt(sb, DATA_ERR_ABORT))
                journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
        else
                journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;

        /*
         * The journal cycle record option is always switched on, so that
         * the journal records log transactions continuously between each
         * mount.
         */
        journal->j_flags |= JBD2_CYCLE_RECORD;

        write_unlock(&journal->j_state_lock);
}

/*
 * Look up inode @journal_inum on @sb and check that it can serve as the
 * journal: it must still be linked, be a regular file and not be encrypted.
 *
 * Returns the inode (the caller owns the reference obtained by ext4_iget())
 * or an ERR_PTR(): the ext4_iget() error as-is, or -EFSCORRUPTED when one of
 * the checks fails.  On a failed check the reference is dropped here.
 */
static struct inode *ext4_get_journal_inode(struct super_block *sb,
                                             unsigned int journal_inum)
{
        /*
         * Test for the existence of a valid inode on disk.  Bad things
         * happen if we iget() an unused inode, as the subsequent iput()
         * will try to delete it.
         */
        struct inode *inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);

        if (IS_ERR(inode)) {
                ext4_msg(sb, KERN_ERR, "no journal found");
                return ERR_CAST(inode);
        }

        if (inode->i_nlink == 0) {
                /* Mark it bad before the final iput() (see comment above) */
                make_bad_inode(inode);
                iput(inode);
                ext4_msg(sb, KERN_ERR, "journal inode is deleted");
                return ERR_PTR(-EFSCORRUPTED);
        }

        if (!(S_ISREG(inode->i_mode) && !IS_ENCRYPTED(inode))) {
                ext4_msg(sb, KERN_ERR, "invalid journal inode");
                iput(inode);
                return ERR_PTR(-EFSCORRUPTED);
        }

        ext4_debug("Journal inode found at %p: %lld bytes\n",
                  inode, inode->i_size);
        return inode;
}

/*
 * Block-mapping callback for inode-backed journals; installed as
 * journal->j_bmap by ext4_open_inode_journal().
 *
 * @journal: journal whose j_inode holds the journal blocks
 * @block:   in: logical block within the journal inode;
 *           out: the mapped physical block (only written on success)
 *
 * Returns 0 on success (also when the journal has no inode, in which case
 * *@block is left untouched) or a negative errno.  Any mapping failure
 * aborts the journal.
 *
 * ext4_map_blocks() returning 0 is a failure here just like a negative
 * return: nothing was mapped for the requested block.  It is reported as
 * -EIO, the same code passed to jbd2_journal_abort(), rather than as 0,
 * which would tell the caller that *@block had been translated.
 */
static int ext4_journal_bmap(journal_t *journal, sector_t *block)
{
        struct ext4_map_blocks map;
        int ret;

        if (journal->j_inode == NULL)
                return 0;

        map.m_lblk = *block;
        map.m_len = 1;
        ret = ext4_map_blocks(NULL, journal->j_inode, &map, 0);
        if (ret <= 0) {
                /* ret == 0: no mapping found; treat as an I/O error */
                int err = ret ? ret : -EIO;

                ext4_msg(journal->j_inode->i_sb, KERN_CRIT,
                         "journal bmap failed: block %llu ret %d\n",
                         *block, ret);
                jbd2_journal_abort(journal, err);
                return err;
        }
        *block = map.m_pblk;
        return 0;
}

/*
 * Open the journal that lives in inode @journal_inum of @sb.
 *
 * The inode is validated by ext4_get_journal_inode() and handed to
 * jbd2_journal_init_inode().  The resulting journal gets @sb as its private
 * data, ext4_journal_bmap() as its block-mapping callback and the per-fs
 * journal parameters.
 *
 * Returns the journal or an ERR_PTR(); if jbd2_journal_init_inode() fails
 * the inode reference is dropped here.
 */
static journal_t *ext4_open_inode_journal(struct super_block *sb,
                                          unsigned int journal_inum)
{
        journal_t *journal;
        struct inode *inode = ext4_get_journal_inode(sb, journal_inum);

        if (IS_ERR(inode))
                return ERR_CAST(inode);

        journal = jbd2_journal_init_inode(inode);
        if (!IS_ERR(journal)) {
                journal->j_private = sb;
                journal->j_bmap = ext4_journal_bmap;
                ext4_init_journal_params(sb, journal);
                return journal;
        }

        ext4_msg(sb, KERN_ERR, "Could not load journal inode");
        iput(inode);
        return ERR_CAST(journal);
}

/*
 * Open the external journal block device @j_dev and validate the ext4
 * superblock stored on it.
 *
 * The device is opened for read and write with BLK_OPEN_RESTRICT_WRITES,
 * passing @sb and fs_holder_ops.  Its block size is set to the filesystem's
 * block size, and the superblock found at byte offset EXT4_MIN_BLOCK_SIZE
 * must:
 *   - carry EXT4_SUPER_MAGIC and the JOURNAL_DEV incompat feature,
 *   - have a matching checksum if it has the METADATA_CSUM feature, and
 *   - have a UUID equal to s_journal_uuid of the filesystem being mounted.
 *
 * @j_start: out: first block after the block holding the superblock
 * @j_len:   out: block count recorded in the journal device's superblock
 *
 * Returns the opened file on success.  On failure an ERR_PTR() is returned,
 * the device (if it was opened) is released again, and *@j_start / *@j_len
 * are left untouched.
 */
static struct file *ext4_get_journal_blkdev(struct super_block *sb,
                                        dev_t j_dev, ext4_fsblk_t *j_start,
                                        ext4_fsblk_t *j_len)
{
        struct buffer_head *bh;
        struct block_device *bdev;
        struct file *bdev_file;
        int hblock, blocksize;
        ext4_fsblk_t sb_block;
        unsigned long offset;
        struct ext4_super_block *es;
        int errno;

        bdev_file = bdev_file_open_by_dev(j_dev,
                BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
                sb, &fs_holder_ops);
        if (IS_ERR(bdev_file)) {
                ext4_msg(sb, KERN_ERR,
                         "failed to open journal device unknown-block(%u,%u) %ld",
                         MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file));
                /* Nothing was opened, so there is nothing to release */
                return bdev_file;
        }

        bdev = file_bdev(bdev_file);
        blocksize = sb->s_blocksize;
        hblock = bdev_logical_block_size(bdev);
        /* The fs block size must be at least the device's logical block size */
        if (blocksize < hblock) {
                ext4_msg(sb, KERN_ERR,
                        "blocksize too small for journal device");
                errno = -EINVAL;
                goto out_bdev;
        }

        /*
         * The superblock starts EXT4_MIN_BLOCK_SIZE bytes into the device;
         * express that as a block number plus an offset within that block.
         */
        sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
        offset = EXT4_MIN_BLOCK_SIZE % blocksize;
        /* NOTE(review): the return value of set_blocksize() is not checked */
        set_blocksize(bdev_file, blocksize);
        bh = __bread(bdev, sb_block, blocksize);
        if (!bh) {
                ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
                       "external journal");
                errno = -EINVAL;
                goto out_bdev;
        }

        es = (struct ext4_super_block *) (bh->b_data + offset);
        /* Must be an ext4 superblock that is flagged as a journal device */
        if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
            !(le32_to_cpu(es->s_feature_incompat) &
              EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
                ext4_msg(sb, KERN_ERR, "external journal has bad superblock");
                errno = -EFSCORRUPTED;
                goto out_bh;
        }

        /* Only verify the checksum if the journal superblock claims to have one */
        if ((le32_to_cpu(es->s_feature_ro_compat) &
             EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
            es->s_checksum != ext4_superblock_csum(sb, es)) {
                ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock");
                errno = -EFSCORRUPTED;
                goto out_bh;
        }

        /* The journal device must be the one this filesystem refers to */
        if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
                ext4_msg(sb, KERN_ERR, "journal UUID does not match");
                errno = -EFSCORRUPTED;
                goto out_bh;
        }

        *j_start = sb_block + 1;
        *j_len = ext4_blocks_count(es);
        brelse(bh);
        return bdev_file;

out_bh:
        brelse(bh);
out_bdev:
        bdev_fput(bdev_file);
        return ERR_PTR(errno);
}

/*
 * Open the journal that lives on the external block device @j_dev.
 *
 * The device is opened and checked by ext4_get_journal_blkdev(), which also
 * supplies the start block and length passed to jbd2_journal_init_dev().
 * A journal whose superblock reports anything other than exactly one user
 * is refused with -EINVAL.
 *
 * On success the opened device file is remembered in
 * EXT4_SB(sb)->s_journal_bdev_file and the journal is returned.  On failure
 * an ERR_PTR() is returned and everything acquired here is released.
 */
static journal_t *ext4_open_dev_journal(struct super_block *sb,
                                        dev_t j_dev)
{
        ext4_fsblk_t start;
        ext4_fsblk_t nr_blocks;
        struct file *bdev_file;
        journal_t *journal;
        int err = 0;

        bdev_file = ext4_get_journal_blkdev(sb, j_dev, &start, &nr_blocks);
        if (IS_ERR(bdev_file))
                return ERR_CAST(bdev_file);

        journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, start,
                                        nr_blocks, sb->s_blocksize);
        if (IS_ERR(journal)) {
                ext4_msg(sb, KERN_ERR, "failed to create device journal");
                err = PTR_ERR(journal);
                goto put_bdev;
        }

        if (be32_to_cpu(journal->j_superblock->s_nr_users) == 1) {
                journal->j_private = sb;
                EXT4_SB(sb)->s_journal_bdev_file = bdev_file;
                ext4_init_journal_params(sb, journal);
                return journal;
        }

        ext4_msg(sb, KERN_ERR, "External journal has more than one "
                                "user (unsupported) - %d",
                be32_to_cpu(journal->j_superblock->s_nr_users));
        err = -EINVAL;
        jbd2_journal_destroy(journal);

put_bdev:
        bdev_fput(bdev_file);
        return ERR_PTR(err);
}

/*
 * Locate, open and load the journal of @sb.
 *
 * @es:             on-disk superblock; supplies s_journal_inum and
 *                  s_journal_dev
 * @journal_devnum: if non-zero and different from es->s_journal_dev, used as
 *                  the journal device number instead of the recorded one
 *
 * The journal is opened from the journal inode when s_journal_inum is set,
 * otherwise from the journal device; having both is rejected with -EINVAL.
 * If the needs_recovery feature is not set the journal is wiped before it
 * is loaded.  The superblock's error-information region is preserved across
 * jbd2_journal_load(), and any error recorded in the journal is moved to
 * the superblock by ext4_clear_journal_err().
 *
 * Returns 0 with EXT4_SB(sb)->s_journal set on success.  On failure a
 * negative errno is returned; a journal that had already been opened is
 * destroyed and s_journal is left NULL.
 */
static int ext4_load_journal(struct super_block *sb,
                             struct ext4_super_block *es,
                             unsigned long journal_devnum)
{
        journal_t *journal;
        unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
        dev_t journal_dev;
        int err = 0;
        int really_read_only;
        int journal_dev_ro;

        if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
                return -EFSCORRUPTED;

        /* A caller-supplied device number overrides the one in the superblock */
        if (journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                ext4_msg(sb, KERN_INFO, "external journal device major/minor "
                        "numbers have changed");
                journal_dev = new_decode_dev(journal_devnum);
        } else
                journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));

        if (journal_inum && journal_dev) {
                ext4_msg(sb, KERN_ERR,
                         "filesystem has both journal inode and journal device!");
                return -EINVAL;
        }

        if (journal_inum) {
                journal = ext4_open_inode_journal(sb, journal_inum);
                if (IS_ERR(journal))
                        return PTR_ERR(journal);
        } else {
                journal = ext4_open_dev_journal(sb, journal_dev);
                if (IS_ERR(journal))
                        return PTR_ERR(journal);
        }

        /*
         * really_read_only: neither device may be written, i.e. either the
         * filesystem device or the journal device is read-only.
         */
        journal_dev_ro = bdev_read_only(journal->j_dev);
        really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro;

        if (journal_dev_ro && !sb_rdonly(sb)) {
                ext4_msg(sb, KERN_ERR,
                         "journal device read-only, try mounting with '-o ro'");
                err = -EROFS;
                goto err_out;
        }

        /*
         * Are we loading a blank journal or performing recovery after a
         * crash?  For recovery, we need to check in advance whether we
         * can get read-write access to the device.
         */
        if (ext4_has_feature_journal_needs_recovery(sb)) {
                if (sb_rdonly(sb)) {
                        ext4_msg(sb, KERN_INFO, "INFO: recovery "
                                        "required on readonly filesystem");
                        if (really_read_only) {
                                ext4_msg(sb, KERN_ERR, "write access "
                                        "unavailable, cannot proceed "
                                        "(try mounting with noload)");
                                err = -EROFS;
                                goto err_out;
                        }
                        ext4_msg(sb, KERN_INFO, "write access will "
                               "be enabled during recovery");
                }
        }

        if (!(journal->j_flags & JBD2_BARRIER))
                ext4_msg(sb, KERN_INFO, "barriers disabled");

        /* No recovery pending: wipe the journal before loading it */
        if (!ext4_has_feature_journal_needs_recovery(sb))
                err = jbd2_journal_wipe(journal, !really_read_only);
        if (!err) {
                /*
                 * Snapshot the error-information region of the superblock
                 * so it can be put back if jbd2_journal_load() changes it
                 * (presumably by replaying an older copy of the superblock
                 * - confirm).  If the allocation fails the region is simply
                 * not protected.
                 */
                char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
                __le16 orig_state;
                bool changed = false;

                if (save)
                        memcpy(save, ((char *) es) +
                               EXT4_S_ERR_START, EXT4_S_ERR_LEN);
                err = jbd2_journal_load(journal);
                if (save && memcmp(((char *) es) + EXT4_S_ERR_START,
                                   save, EXT4_S_ERR_LEN)) {
                        memcpy(((char *) es) + EXT4_S_ERR_START,
                               save, EXT4_S_ERR_LEN);
                        changed = true;
                }
                kfree(save);
                /* Carry the in-memory error state over to the on-disk state */
                orig_state = es->s_state;
                es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state &
                                           EXT4_ERROR_FS);
                if (orig_state != es->s_state)
                        changed = true;
                /* Write out restored error information to the superblock */
                if (changed && !really_read_only) {
                        int err2;
                        err2 = ext4_commit_super(sb);
                        /* A journal load error takes precedence over err2 */
                        err = err ? : err2;
                }
        }

        if (err) {
                ext4_msg(sb, KERN_ERR, "error loading journal");
                goto err_out;
        }

        /* ext4_clear_journal_err() reads the journal from s_journal */
        EXT4_SB(sb)->s_journal = journal;
        err = ext4_clear_journal_err(sb, es);
        if (err) {
                EXT4_SB(sb)->s_journal = NULL;
                jbd2_journal_destroy(journal);
                return err;
        }

        /* Record a changed journal device number in the superblock */
        if (!really_read_only && journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                es->s_journal_dev = cpu_to_le32(journal_devnum);
                ext4_commit_super(sb);
        }
        /*
         * NOTE(review): journal_inum was read from @es on entry, so this
         * only triggers if es->s_journal_inum changed in the meantime
         * (e.g. while the journal was being loaded).
         */
        if (!really_read_only && journal_inum &&
            journal_inum != le32_to_cpu(es->s_journal_inum)) {
                es->s_journal_inum = cpu_to_le32(journal_inum);
                ext4_commit_super(sb);
        }

        return 0;

err_out:
        jbd2_journal_destroy(journal);
        return err;
}

/*
 * Copy state of EXT4_SB(sb) into buffer for on-disk superblock.
 *
 * With the superblock buffer locked this refreshes the write time (unless
 * the fs is read-only), the kbytes-written statistic, the free block and
 * free inode counts (only if the respective per-cpu counter has been
 * initialised) and any error information accumulated in the ext4_sb_info,
 * then recomputes the superblock checksum.  Nothing is written to disk
 * here; ext4_commit_super() does that.
 */
static void ext4_update_super(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        struct buffer_head *sbh = sbi->s_sbh;

        lock_buffer(sbh);
        /*
         * If the file system is mounted read-only, don't update the
         * superblock write time.  This avoids updating the superblock
         * write time when we are mounting the root file system
         * read/only but we need to replay the journal; at that point,
         * for people who are east of GMT and who make their clock
         * tick in localtime for Windows bug-for-bug compatibility,
         * the clock is set in the future, and this will cause e2fsck
         * to complain and force a full file system check.
         */
        if (!sb_rdonly(sb))
                ext4_update_tstamp(es, s_wtime);
        /*
         * Add what was written since s_sectors_written_start was sampled;
         * ">> 1" turns sectors into kbytes (assumes 512-byte sectors).
         */
        es->s_kbytes_written =
                cpu_to_le64(sbi->s_kbytes_written +
                    ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
                      sbi->s_sectors_written_start) >> 1));
        if (percpu_counter_initialized(&sbi->s_freeclusters_counter))
                ext4_free_blocks_count_set(es,
                        EXT4_C2B(sbi, percpu_counter_sum_positive(
                                &sbi->s_freeclusters_counter)));
        if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
                es->s_free_inodes_count =
                        cpu_to_le32(percpu_counter_sum_positive(
                                &sbi->s_freeinodes_counter));
        /* Copy error information to the on-disk superblock */
        spin_lock(&sbi->s_error_lock);
        if (sbi->s_add_error_count > 0) {
                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
                /* The "first error" fields are only filled in once */
                if (!es->s_first_error_time && !es->s_first_error_time_hi) {
                        __ext4_update_tstamp(&es->s_first_error_time,
                                             &es->s_first_error_time_hi,
                                             sbi->s_first_error_time);
                        strtomem_pad(es->s_first_error_func,
                                     sbi->s_first_error_func, 0);
                        es->s_first_error_line =
                                cpu_to_le32(sbi->s_first_error_line);
                        es->s_first_error_ino =
                                cpu_to_le32(sbi->s_first_error_ino);
                        es->s_first_error_block =
                                cpu_to_le64(sbi->s_first_error_block);
                        es->s_first_error_errcode =
                                ext4_errno_to_code(sbi->s_first_error_code);
                }
                /* The "last error" fields are overwritten every time */
                __ext4_update_tstamp(&es->s_last_error_time,
                                     &es->s_last_error_time_hi,
                                     sbi->s_last_error_time);
                strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0);
                es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line);
                es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino);
                es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block);
                es->s_last_error_errcode =
                                ext4_errno_to_code(sbi->s_last_error_code);
                /*
                 * Start the daily error reporting function if it hasn't been
                 * started already
                 */
                if (!es->s_error_count)
                        mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);
                /* Fold the pending count into the superblock and reset it */
                le32_add_cpu(&es->s_error_count, sbi->s_add_error_count);
                sbi->s_add_error_count = 0;
        }
        spin_unlock(&sbi->s_error_lock);

        ext4_superblock_csum_set(sb);
        unlock_buffer(sbh);
}

/*
 * Refresh the superblock buffer via ext4_update_super() and write it out,
 * waiting for the write to finish.
 *
 * Returns 0 on success, -EINVAL if there is no superblock buffer, or -EIO
 * if the buffer is no longer mapped or the write reports an I/O error.
 */
static int ext4_commit_super(struct super_block *sb)
{
        struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;

        if (!sbh)
                return -EINVAL;

        ext4_update_super(sb);

        /*
         * The buffer is locked for the write; on the success path it is not
         * unlocked here (presumably the end_io handler does that - confirm).
         */
        lock_buffer(sbh);
        /* Buffer got discarded which means block device got invalidated */
        if (!buffer_mapped(sbh)) {
                unlock_buffer(sbh);
                return -EIO;
        }

        if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
                /*
                 * Oh, dear.  A previous attempt to write the
                 * superblock failed.  This could happen because the
                 * USB device was yanked out.  Or it could happen to
                 * be a transient write error and maybe the block will
                 * be remapped.  Nothing we can do but to retry the
                 * write and hope for the best.
                 */
                ext4_msg(sb, KERN_ERR, "previous I/O error to "
                       "superblock detected");
                clear_buffer_write_io_error(sbh);
                set_buffer_uptodate(sbh);
        }
        /*
         * Extra reference for the I/O in flight (presumably dropped by
         * end_buffer_write_sync() - confirm).
         */
        get_bh(sbh);
        /* Clear potential dirty bit if it was journalled update */
        clear_buffer_dirty(sbh);
        sbh->b_end_io = end_buffer_write_sync;
        /* REQ_FUA is only added when the BARRIER mount option is set */
        submit_bh(REQ_OP_WRITE | REQ_SYNC |
                  (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh);
        wait_on_buffer(sbh);
        if (buffer_write_io_error(sbh)) {
                ext4_msg(sb, KERN_ERR, "I/O error while writing "
                       "superblock");
                /* Reset the buffer state so that a later commit can retry */
                clear_buffer_write_io_error(sbh);
                set_buffer_uptodate(sbh);
                return -EIO;
        }
        return 0;
}

/*
 * Have we just finished recovery?  If so, and if we are mounting (or
 * remounting) the filesystem readonly, then we will end up with a
 * consistent fs on disk.  Record that fact.
 *
 * With journal updates locked, the journal is flushed.  On a read-only fs
 * that still has needs_recovery or orphan_present set, both features are
 * cleared and the superblock is committed - unless the orphan file is not
 * empty, which is reported as corruption.
 *
 * Returns 0 on success (including the journal-less case), the negative
 * result of the flush, or -EFSCORRUPTED.  @es is not used.
 */
static int ext4_mark_recovery_complete(struct super_block *sb,
                                       struct ext4_super_block *es)
{
        journal_t *journal = EXT4_SB(sb)->s_journal;
        int err;

        if (!ext4_has_feature_journal(sb)) {
                /* No journal feature: there must not be a journal either */
                if (!journal)
                        return 0;
                ext4_error(sb, "Journal got removed while the fs was "
                           "mounted!");
                return -EFSCORRUPTED;
        }

        jbd2_journal_lock_updates(journal);

        err = jbd2_journal_flush(journal, 0);
        if (err < 0)
                goto unlock;

        /* Only a read-only fs can be recorded as consistent */
        if (!sb_rdonly(sb))
                goto unlock;
        if (!ext4_has_feature_journal_needs_recovery(sb) &&
            !ext4_has_feature_orphan_present(sb))
                goto unlock;

        if (!ext4_orphan_file_empty(sb)) {
                ext4_error(sb, "Orphan file not empty on read-only fs.");
                err = -EFSCORRUPTED;
                goto unlock;
        }

        ext4_clear_feature_journal_needs_recovery(sb);
        ext4_clear_feature_orphan_present(sb);
        ext4_commit_super(sb);

unlock:
        jbd2_journal_unlock_updates(journal);
        return err;
}

/*
 * If we are mounting (or read-write remounting) a filesystem whose journal
 * has recorded an error from a previous lifetime, move that error to the
 * main filesystem now.
 *
 * The error flag is set both in the in-memory mount state and in @es, the
 * superblock is committed, and only after a successful commit is the error
 * cleared from the journal.
 *
 * Returns 0 if there was no recorded error or it was moved successfully,
 * -EFSCORRUPTED if the journal feature is missing, or the error returned by
 * ext4_commit_super().
 */
static int ext4_clear_journal_err(struct super_block *sb,
                                   struct ext4_super_block *es)
{
        journal_t *journal;
        const char *errstr;
        char nbuf[16];
        int j_errno;
        int err;

        if (!ext4_has_feature_journal(sb)) {
                ext4_error(sb, "Journal got removed while the fs was mounted!");
                return -EFSCORRUPTED;
        }

        journal = EXT4_SB(sb)->s_journal;

        /*
         * Now check for any error status which may have been recorded in the
         * journal by a prior ext4_error() or ext4_abort()
         */
        j_errno = jbd2_journal_errno(journal);
        if (!j_errno)
                return 0;

        errstr = ext4_decode_error(sb, j_errno, nbuf);
        ext4_warning(sb, "Filesystem error recorded "
                     "from previous mount: %s", errstr);

        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
        es->s_state |= cpu_to_le16(EXT4_ERROR_FS);

        /* Keep the error in the journal if it could not be persisted */
        err = ext4_commit_super(sb);
        if (err)
                return err;
        ext4_warning(sb, "Marked fs in need of filesystem check.");

        jbd2_journal_clear_err(journal);
        jbd2_journal_update_sb_errno(journal);
        return 0;
}

/*
 * Force the running and committing transactions to commit,
 * and wait on the commit.
 *
 * Thin wrapper: passes the journal of @sb (EXT4_SB(sb)->s_journal) to
 * ext4_journal_force_commit() and returns its result.
 */
int ext4_force_commit(struct super_block *sb)
{
        return ext4_journal_force_commit(EXT4_SB(sb)->s_journal);
}

static int ext4_sync_fs(struct super_block *sb, int wait)
{
        int ret = 0;
        tid_t target;
        bool needs_barrier = false;
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        if (unlikely(ext4_forced_shutdown(sb)))
                return 0;

        trace_ext4_sync_fs(sb, wait);
        flush_workqueue(sbi->rsv_conversion_wq);
        /*
         * Writeback quota in non-journalled quota case - journalled quota has
         * no dirty dquots
         */
        dquot_writeback_dquots(sb, -1);
        /*
         * Data writeback is possible w/o journal transaction, so barrier must
         * being sent at the end of the function. But we can skip it if
         * transaction_commit will do it for us.
         */
        if (sbi->s_journal) {
                target = jbd2_get_latest_transaction(sbi->s_journal);
                if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
                    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
                        needs_barrier = true;

                if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
                        if (wait)
                                ret = jbd2_log_wait_commit(sbi->s_journal,
                                                           target);
                }
        } else if (wait && test_opt(sb, BARRIER))
                needs_barrier = true;
        if (needs_barrier) {
                int err;
                err = blkdev_issue_flush(sb->s_bdev);
                if (!ret)
                        ret = err;
        }

        return ret;
}

/*
 * LVM calls this function before a (read-only) snapshot is created.  This
 * gives us a chance to flush the journal completely and mark the fs clean.
 *
 * Note that this function alone cannot bring a filesystem into a clean
 * state. It relies on the upper layer to stop all data & metadata
 * modifications.
 *
 * Returns the result of ext4_commit_super(), or the negative result of the
 * journal flush if that failed (in which case nothing is committed).
 */
static int ext4_freeze(struct super_block *sb)
{
        journal_t *journal = EXT4_SB(sb)->s_journal;
        int error;

        /* Without a journal only the superblock needs to be written */
        if (!journal)
                return ext4_commit_super(sb);

        /* Now we set up the journal barrier. */
        jbd2_journal_lock_updates(journal);

        /*
         * Don't clear the needs_recovery flag if we failed to
         * flush the journal.
         */
        error = jbd2_journal_flush(journal, 0);
        if (error >= 0) {
                /* Journal blocked and flushed, clear needs_recovery flag. */
                ext4_clear_feature_journal_needs_recovery(sb);
                if (ext4_orphan_file_empty(sb))
                        ext4_clear_feature_orphan_present(sb);

                error = ext4_commit_super(sb);
        }

        /* we rely on upper layer to stop further updates */
        jbd2_journal_unlock_updates(journal);
        return error;
}

/*
 * Called by LVM after the snapshot is done.  We need to reset the RECOVER
 * flag here, even though the filesystem is not technically dirty yet.
 */
static int ext4_unfreeze(struct super_block *sb)
{
        if (ext4_forced_shutdown(sb))
                return 0;

        if (EXT4_SB(sb)->s_journal) {
                /* Reset the needs_recovery flag before the fs is unlocked. */
                ext4_set_feature_journal_needs_recovery(sb);
                if (ext4_has_feature_orphan_file(sb))
                        ext4_set_feature_orphan_present(sb);
        }

        ext4_commit_super(sb);
        return 0;
}

/*
 * Structure to save mount options for ext4_remount's benefit.
 *
 * __ext4_remount() fills one of these from the ext4_sb_info before the new
 * options are applied, so that the original values are still available
 * afterwards (it jumps to a restore_opts label on error).
 */
struct ext4_mount_options {
        /* Copies of the identically named ext4_sb_info fields */
        unsigned long s_mount_opt;
        unsigned long s_mount_opt2;
        kuid_t s_resuid;
        kgid_t s_resgid;
        unsigned long s_commit_interval;
        u32 s_min_batch_time, s_max_batch_time;
#ifdef CONFIG_QUOTA
        int s_jquota_fmt;
        /* kstrdup()ed copies of the quota file names; NULL where none is set */
        char *s_qf_names[EXT4_MAXQUOTAS];
#endif
};

/*
 * Apply the mount options carried by @fc to the already mounted @sb.
 *
 * The option state that can be rolled back is first saved in old_opts.  The
 * new options are then applied and validated, and a change of the read-only
 * state requested through fc->sb_flags (rw -> ro or ro -> rw) is carried
 * out.  Any failure after the options have been applied jumps to
 * restore_opts, which puts the saved state back before returning the error.
 *
 * Returns 0 on success, -ENOMEM if the quota file names cannot be
 * duplicated, or the negative error set by the failing step.
 */
static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
{
        struct ext4_fs_context *ctx = fc->fs_private;
        struct ext4_super_block *es;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned long old_sb_flags;
        struct ext4_mount_options old_opts;
        ext4_group_t g;
        int err = 0;
        int alloc_ctx;
#ifdef CONFIG_QUOTA
        int enable_quota = 0;
        int i, j;
        char *to_free[EXT4_MAXQUOTAS];
#endif


        /* Store the original options */
        old_sb_flags = sb->s_flags;
        old_opts.s_mount_opt = sbi->s_mount_opt;
        old_opts.s_mount_opt2 = sbi->s_mount_opt2;
        old_opts.s_resuid = sbi->s_resuid;
        old_opts.s_resgid = sbi->s_resgid;
        old_opts.s_commit_interval = sbi->s_commit_interval;
        old_opts.s_min_batch_time = sbi->s_min_batch_time;
        old_opts.s_max_batch_time = sbi->s_max_batch_time;
#ifdef CONFIG_QUOTA
        old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                if (sbi->s_qf_names[i]) {
                        char *qf_name = get_qf_name(sb, sbi, i);

                        old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL);
                        if (!old_opts.s_qf_names[i]) {
                                /*
                                 * Only the copies made so far need undoing;
                                 * sb and sbi have not been modified yet.
                                 */
                                for (j = 0; j < i; j++)
                                        kfree(old_opts.s_qf_names[j]);
                                return -ENOMEM;
                        }
                } else
                        old_opts.s_qf_names[i] = NULL;
#endif
        /*
         * If no journal I/O priority was specified, carry over the one the
         * journal task currently has, or fall back to the default.
         */
        if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) {
                if (sbi->s_journal && sbi->s_journal->j_task->io_context)
                        ctx->journal_ioprio =
                                sbi->s_journal->j_task->io_context->ioprio;
                else
                        ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;

        }

        /*
         * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause
         * two calls to ext4_should_dioread_nolock() to return inconsistent
         * values, triggering WARN_ON in ext4_add_complete_io(). We grab
         * s_writepages_rwsem here to avoid a race between writepages ops and
         * remount.
         */
        alloc_ctx = ext4_writepages_down_write(sb);
        ext4_apply_options(fc, sb);
        ext4_writepages_up_write(sb, alloc_ctx);

        /*
         * journal_checksum cannot be changed by a remount: if the bit
         * differs from the saved value, flip it back and carry on.
         */
        if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
            test_opt(sb, JOURNAL_CHECKSUM)) {
                ext4_msg(sb, KERN_ERR, "changing journal_checksum "
                         "during remount not supported; ignoring");
                sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM;
        }

        /* Reject option combinations that are not allowed together */
        if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
                if (test_opt2(sb, EXPLICIT_DELALLOC)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "both data=journal and delalloc");
                        err = -EINVAL;
                        goto restore_opts;
                }
                if (test_opt(sb, DIOREAD_NOLOCK)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "both data=journal and dioread_nolock");
                        err = -EINVAL;
                        goto restore_opts;
                }
        } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
                if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                "journal_async_commit in data=ordered mode");
                        err = -EINVAL;
                        goto restore_opts;
                }
        }

        /* Any change of the NO_MBCACHE bit (in either direction) is refused */
        if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) {
                ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount");
                err = -EINVAL;
                goto restore_opts;
        }

        if (test_opt2(sb, ABORT))
                ext4_abort(sb, ESHUTDOWN, "Abort forced by user");

        /* Make SB_POSIXACL follow the (possibly changed) POSIX_ACL option */
        sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
                (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);

        es = sbi->s_es;

        if (sbi->s_journal) {
                ext4_init_journal_params(sb, sbi->s_journal);
                set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
        }

        /* Flush outstanding errors before changing fs state */
        flush_work(&sbi->s_sb_upd_work);

        /* Only act when the requested read-only state differs from the current one */
        if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) {
                if (ext4_forced_shutdown(sb)) {
                        err = -EROFS;
                        goto restore_opts;
                }

                if (fc->sb_flags & SB_RDONLY) {
                        /* rw -> ro transition */
                        err = sync_filesystem(sb);
                        if (err < 0)
                                goto restore_opts;
                        err = dquot_suspend(sb, -1);
                        if (err < 0)
                                goto restore_opts;

                        /*
                         * First of all, the unconditional stuff we have to do
                         * to disable replay of the journal when we next remount
                         */
                        sb->s_flags |= SB_RDONLY;

                        /*
                         * OK, test if we are remounting a valid rw partition
                         * readonly, and if so set the rdonly flag and then
                         * mark the partition as valid again.
                         */
                        if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
                            (sbi->s_mount_state & EXT4_VALID_FS))
                                es->s_state = cpu_to_le16(sbi->s_mount_state);

                        if (sbi->s_journal) {
                                /*
                                 * We let remount-ro finish even if marking fs
                                 * as clean failed...
                                 */
                                ext4_mark_recovery_complete(sb, es);
                        }
                } else {
                        /* ro -> rw transition */
                        /* Make sure we can mount this feature set readwrite */
                        if (ext4_has_feature_readonly(sb) ||
                            !ext4_feature_set_ok(sb, 0)) {
                                err = -EROFS;
                                goto restore_opts;
                        }
                        /*
                         * Make sure the group descriptor checksums
                         * are sane.  If they aren't, refuse to remount r/w.
                         */
                        for (g = 0; g < sbi->s_groups_count; g++) {
                                struct ext4_group_desc *gdp =
                                        ext4_get_group_desc(sb, g, NULL);

                                if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
                                        ext4_msg(sb, KERN_ERR,
               "ext4_remount: Checksum for group %u failed (%u!=%u)",
                g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
                                               le16_to_cpu(gdp->bg_checksum));
                                        err = -EFSBADCRC;
                                        goto restore_opts;
                                }
                        }

                        /*
                         * If we have an unprocessed orphan list hanging
                         * around from a previously readonly bdev mount,
                         * require a full umount/remount for now.
                         */
                        if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) {
                                ext4_msg(sb, KERN_WARNING, "Couldn't "
                                       "remount RDWR because of unprocessed "
                                       "orphan inode list.  Please "
                                       "umount/remount instead");
                                err = -EINVAL;
                                goto restore_opts;
                        }

                        /*
                         * Mounting a RDONLY partition read-write, so reread
                         * and store the current valid flag.  (It may have
                         * been changed by e2fsck since we originally mounted
                         * the partition.)
                         */
                        if (sbi->s_journal) {
                                err = ext4_clear_journal_err(sb, es);
                                if (err)
                                        goto restore_opts;
                        }
                        sbi->s_mount_state = (le16_to_cpu(es->s_state) &
                                              ~EXT4_FC_REPLAY);

                        err = ext4_setup_super(sb, es, 0);
                        if (err)
                                goto restore_opts;

                        sb->s_flags &= ~SB_RDONLY;
                        if (ext4_has_feature_mmp(sb)) {
                                err = ext4_multi_mount_protect(sb,
                                                le64_to_cpu(es->s_mmp_block));
                                if (err)
                                        goto restore_opts;
                        }
#ifdef CONFIG_QUOTA
                        /* Quota is turned back on further down */
                        enable_quota = 1;
#endif
                }
        }

        /*
         * Handle creation of system zone data early because it can fail.
         * Releasing of existing data is done when we are sure remount will
         * succeed.
         */
        if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) {
                err = ext4_setup_system_zone(sb);
                if (err)
                        goto restore_opts;
        }

        /* No journal and previously mounted r/w: commit the superblock now */
        if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
                err = ext4_commit_super(sb);
                if (err)
                        goto restore_opts;
        }

#ifdef CONFIG_QUOTA
        if (enable_quota) {
                if (sb_any_quota_suspended(sb))
                        dquot_resume(sb, -1);
                else if (ext4_has_feature_quota(sb)) {
                        err = ext4_enable_quotas(sb);
                        if (err)
                                goto restore_opts;
                }
        }
        /* Release old quota file names */
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(old_opts.s_qf_names[i]);
#endif
        if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
                ext4_release_system_zone(sb);

        /*
         * Reinitialize lazy itable initialization thread based on
         * current settings
         */
        if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))
                ext4_unregister_li_request(sb);
        else {
                ext4_group_t first_not_zeroed;
                first_not_zeroed = ext4_has_uninit_itable(sb);
                ext4_register_li_request(sb, first_not_zeroed);
        }

        if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
                ext4_stop_mmpd(sbi);

        return 0;

restore_opts:
        /*
         * If there was a failing r/w to ro transition, we may need to
         * re-enable quota
         */
        if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) &&
            sb_any_quota_suspended(sb))
                dquot_resume(sb, -1);

        /* Put the saved options back under the same lock used to apply them */
        alloc_ctx = ext4_writepages_down_write(sb);
        sb->s_flags = old_sb_flags;
        sbi->s_mount_opt = old_opts.s_mount_opt;
        sbi->s_mount_opt2 = old_opts.s_mount_opt2;
        sbi->s_resuid = old_opts.s_resuid;
        sbi->s_resgid = old_opts.s_resgid;
        sbi->s_commit_interval = old_opts.s_commit_interval;
        sbi->s_min_batch_time = old_opts.s_min_batch_time;
        sbi->s_max_batch_time = old_opts.s_max_batch_time;
        ext4_writepages_up_write(sb, alloc_ctx);

        if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
                ext4_release_system_zone(sb);
#ifdef CONFIG_QUOTA
        sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
        /*
         * Swap the saved quota file names back in.  The names that were
         * installed are collected in to_free and only freed after
         * synchronize_rcu().
         */
        for (i = 0; i < EXT4_MAXQUOTAS; i++) {
                to_free[i] = get_qf_name(sb, sbi, i);
                rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]);
        }
        synchronize_rcu();
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(to_free[i]);
#endif
        if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
                ext4_stop_mmpd(sbi);
        return err;
}

/*
 * Reconfigure an already mounted ext4 filesystem from @fc.
 *
 * Checks the requested options for consistency against the mounted
 * superblock, applies them through __ext4_remount() and logs the resulting
 * state.  Returns 0 on success or the negative error of the failing step.
 */
static int ext4_reconfigure(struct fs_context *fc)
{
        struct super_block *sb = fc->root->d_sb;
        int rc;

        fc->s_fs_info = EXT4_SB(sb);

        rc = ext4_check_opt_consistency(fc, sb);
        if (rc >= 0)
                rc = __ext4_remount(fc, sb);
        if (rc < 0)
                return rc;

        ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.",
                 &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w",
                 ext4_quota_mode(sb));

        return 0;
}

#ifdef CONFIG_QUOTA
/*
 * Clamp the statfs figures in @buf to the quota limits of project @projid.
 *
 * For blocks and for inodes the effective limit is the smaller non-zero
 * value of the soft and hard limit.  When such a limit exists and is below
 * the total already stored in @buf, the total is replaced by the limit and
 * the free count by "limit - current usage" (0 when usage exceeds the
 * limit).  The dquot is read under dq_dqb_lock.
 *
 * Returns 0, or the error from dqget() if the dquot cannot be obtained.
 */
static int ext4_statfs_project(struct super_block *sb,
                               kprojid_t projid, struct kstatfs *buf)
{
        struct dquot *dquot;
        u64 blk_limit;
        u64 ino_limit;

        dquot = dqget(sb, make_kqid_projid(projid));
        if (IS_ERR(dquot))
                return PTR_ERR(dquot);

        spin_lock(&dquot->dq_dqb_lock);

        /* Block limits are kept in bytes; convert to filesystem blocks */
        blk_limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
                                 dquot->dq_dqb.dqb_bhardlimit);
        blk_limit >>= sb->s_blocksize_bits;

        if (blk_limit && buf->f_blocks > blk_limit) {
                u64 used_blocks = (dquot->dq_dqb.dqb_curspace +
                                   dquot->dq_dqb.dqb_rsvspace) >>
                                  sb->s_blocksize_bits;
                u64 left = 0;

                if (blk_limit > used_blocks)
                        left = blk_limit - used_blocks;

                buf->f_blocks = blk_limit;
                buf->f_bfree = left;
                buf->f_bavail = left;
        }

        ino_limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
                                 dquot->dq_dqb.dqb_ihardlimit);

        if (ino_limit && buf->f_files > ino_limit) {
                u64 used_inodes = dquot->dq_dqb.dqb_curinodes;

                buf->f_files = ino_limit;
                if (ino_limit > used_inodes)
                        buf->f_ffree = ino_limit - used_inodes;
                else
                        buf->f_ffree = 0;
        }

        spin_unlock(&dquot->dq_dqb_lock);
        dqput(dquot);
        return 0;
}
#endif

/*
 * Fill @buf with filesystem statistics for the superblock of @dentry.
 *
 * The overhead is subtracted from the block total unless the MINIX_DF
 * option is set.  Free blocks are the free cluster count minus the dirty
 * cluster count (never below zero); available blocks additionally exclude
 * the reserved blocks from the superblock and the reserved clusters.  With
 * CONFIG_QUOTA, a dentry whose inode has EXT4_INODE_PROJINHERIT set gets the
 * numbers clamped to its project quota when project limits are enabled.
 *
 * Always returns 0.
 */
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct super_block *sb = dentry->d_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        ext4_fsblk_t overhead, resv_blocks, unavailable;
        s64 free_clusters;

        resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
        overhead = test_opt(sb, MINIX_DF) ? 0 : sbi->s_overhead;

        buf->f_type = EXT4_SUPER_MAGIC;
        buf->f_bsize = sb->s_blocksize;
        buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);

        free_clusters =
                percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
                percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
        /* The difference may be negative when little free space is left */
        buf->f_bfree = EXT4_C2B(sbi, max_t(s64, free_clusters, 0));

        unavailable = ext4_r_blocks_count(es) + resv_blocks;
        if (buf->f_bfree < unavailable)
                buf->f_bavail = 0;
        else
                buf->f_bavail = buf->f_bfree - unavailable;

        buf->f_files = le32_to_cpu(es->s_inodes_count);
        buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
        buf->f_namelen = EXT4_NAME_LEN;
        buf->f_fsid = uuid_to_fsid(es->s_uuid);

#ifdef CONFIG_QUOTA
        if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
            sb_has_quota_limits_enabled(sb, PRJQUOTA))
                ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf);
#endif
        return 0;
}


#ifdef CONFIG_QUOTA

/*
 * Helper functions so that transaction is started before we acquire dqio_sem
 * to keep correct lock ordering of transaction > dqio_sem
 */
/* Return the quota file inode that backs @dquot's quota type. */
static inline struct inode *dquot_to_inode(struct dquot *dquot)
{
        int type = dquot->dq_id.type;

        return sb_dqopt(dquot->dq_sb)->files[type];
}

/*
 * Commit @dquot inside its own journal handle.
 *
 * A commit failure is reported through ext4_error_err().  Returns the
 * commit result if it is non-zero, otherwise the result of stopping the
 * handle; a failure to start the handle is returned directly.
 */
static int ext4_write_dquot(struct dquot *dquot)
{
        struct super_block *sb = dquot->dq_sb;
        handle_t *handle;
        int commit_err, stop_err;

        handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
                                    EXT4_QUOTA_TRANS_BLOCKS(sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        commit_err = dquot_commit(dquot);
        if (commit_err < 0)
                ext4_error_err(sb, -commit_err,
                               "Failed to commit dquot type %d",
                               dquot->dq_id.type);

        stop_err = ext4_journal_stop(handle);
        return commit_err ? commit_err : stop_err;
}

/*
 * Acquire @dquot inside its own journal handle.
 *
 * An acquire failure is reported through ext4_error_err().  Returns the
 * acquire result if it is non-zero, otherwise the result of stopping the
 * handle; a failure to start the handle is returned directly.
 */
static int ext4_acquire_dquot(struct dquot *dquot)
{
        struct super_block *sb = dquot->dq_sb;
        handle_t *handle;
        int acquire_err, stop_err;

        handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
                                    EXT4_QUOTA_INIT_BLOCKS(sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        acquire_err = dquot_acquire(dquot);
        if (acquire_err < 0)
                ext4_error_err(sb, -acquire_err,
                              "Failed to acquire dquot type %d",
                              dquot->dq_id.type);

        stop_err = ext4_journal_stop(handle);
        return acquire_err ? acquire_err : stop_err;
}

/*
 * Release @dquot inside its own journal handle.
 *
 * If the handle cannot be started the dquot is released regardless (to
 * avoid an endless cycle in dqput()) and the handle error is returned.
 * Otherwise a release failure is reported through ext4_error_err() and the
 * function returns the release result if it is non-zero, else the result of
 * stopping the handle.
 */
static int ext4_release_dquot(struct dquot *dquot)
{
        struct super_block *sb = dquot->dq_sb;
        handle_t *handle;
        int release_err, stop_err;

        handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
                                    EXT4_QUOTA_DEL_BLOCKS(sb));
        if (IS_ERR(handle)) {
                /* Release dquot anyway to avoid endless cycle in dqput() */
                dquot_release(dquot);
                return PTR_ERR(handle);
        }

        release_err = dquot_release(dquot);
        if (release_err < 0)
                ext4_error_err(sb, -release_err,
                               "Failed to release dquot type %d",
                               dquot->dq_id.type);

        stop_err = ext4_journal_stop(handle);
        return release_err ? release_err : stop_err;
}

/*
 * Mark @dquot dirty.  When quota is journalled the dquot is additionally
 * written out right away and the result of that write is returned;
 * otherwise the result of dquot_mark_dquot_dirty() is returned.
 */
static int ext4_mark_dquot_dirty(struct dquot *dquot)
{
        if (!ext4_is_quota_journalled(dquot->dq_sb))
                return dquot_mark_dquot_dirty(dquot);

        dquot_mark_dquot_dirty(dquot);
        return ext4_write_dquot(dquot);
}

/*
 * Commit the quota info of @type for @sb inside a journal handle.
 *
 * Returns the commit result if it is non-zero, otherwise the result of
 * stopping the handle; a failure to start the handle is returned directly.
 */
static int ext4_write_info(struct super_block *sb, int type)
{
        handle_t *handle;
        int commit_err, stop_err;

        /* Credits: data block + inode block */
        handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        commit_err = dquot_commit_info(sb, type);
        stop_err = ext4_journal_stop(handle);

        return commit_err ? commit_err : stop_err;
}

/*
 * Set the lockdep subclass of @inode's i_data_sem to @subclass.
 *
 * Within this file it is called with I_DATA_SEM_QUOTA when an inode is about
 * to be used as a quota file, and with I_DATA_SEM_NORMAL when that use ends
 * or setting it up failed.
 */
static void lockdep_set_quota_inode(struct inode *inode, int subclass)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        /* The first argument of lockdep_set_subclass has to be
         * *exactly* the same as the argument to init_rwsem() --- in
         * this case, in init_once() --- or lockdep gets unhappy
         * because the name of the lock is set using the
         * stringification of the argument to init_rwsem().
         * Do not rename "ei" or rewrite the expression below.
         */
        (void) ei;        /* shut up clang warning if !CONFIG_LOCKDEP */
        lockdep_set_subclass(&ei->i_data_sem, subclass);
}

/*
 * Standard function to be called on quota_on.
 *
 * Turns on quota of @type with format @format_id, using the file at @path.
 *
 * Returns -EINVAL if the QUOTA mount option is not set, -EXDEV if the file
 * lives on another filesystem and -EBUSY if its inode is already marked
 * NOQUOTA.  After dquot_quota_on() succeeds the quota file is flagged
 * noatime + immutable; failing to start the journal handle for that is
 * ignored, but if marking the inode dirty fails quota is turned off again
 * and that error is returned.  On any error after the lockdep subclass was
 * switched to I_DATA_SEM_QUOTA it is switched back to I_DATA_SEM_NORMAL.
 */
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
                         const struct path *path)
{
        struct inode *inode;
        handle_t *handle;
        int err;

        if (!test_opt(sb, QUOTA))
                return -EINVAL;

        /* The quota file must be on this filesystem */
        if (path->dentry->d_sb != sb)
                return -EXDEV;

        /* The file is already in use as a quota file */
        if (IS_NOQUOTA(d_inode(path->dentry)))
                return -EBUSY;

        if (EXT4_SB(sb)->s_qf_names[type]) {
                /* Journaled quota: warn when the file is outside the fs root */
                if (path->dentry->d_parent != sb->s_root)
                        ext4_msg(sb, KERN_WARNING,
                                "Quota file not on filesystem root. "
                                "Journaled quota will not work");
                sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY;
        } else {
                /* The mount options may have changed since the last call */
                sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY;
        }

        lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
        err = dquot_quota_on(sb, type, format_id, path);
        if (err)
                goto reset_lockdep;

        /*
         * Set inode flags to prevent userspace from messing with quota
         * files.  Not being able to start the handle is not treated as a
         * failure: quotas are already enabled at this point.
         */
        inode = d_inode(path->dentry);
        inode_lock(inode);
        handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
        if (!IS_ERR(handle)) {
                EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
                inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
                                S_NOATIME | S_IMMUTABLE);
                err = ext4_mark_inode_dirty(handle, inode);
                ext4_journal_stop(handle);
        }
        inode_unlock(inode);
        if (!err)
                return 0;

        dquot_quota_off(sb, type);
reset_lockdep:
        lockdep_set_quota_inode(path->dentry->d_inode,
                                     I_DATA_SEM_NORMAL);
        return err;
}

/*
 * Check that @qf_inum is an acceptable inode number for a quota file of
 * @type: user and group quota must use their fixed inode numbers, project
 * quota any inode number from EXT4_GOOD_OLD_FIRST_INO upwards.  Any other
 * @type hits BUG().
 */
static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum)
{
        if (type == USRQUOTA)
                return qf_inum == EXT4_USR_QUOTA_INO;
        if (type == GRPQUOTA)
                return qf_inum == EXT4_GRP_QUOTA_INO;
        if (type == PRJQUOTA)
                return qf_inum >= EXT4_GOOD_OLD_FIRST_INO;

        BUG();
}

/*
 * Load the quota inode recorded in the superblock for @type and enable
 * quota on it with format @format_id and dquot @flags.
 *
 * Returns -EPERM when the superblock records no quota inode for @type,
 * -EUCLEAN when the recorded inode number is not acceptable, the error from
 * ext4_iget() when the inode cannot be read, or the result of
 * dquot_load_quota_inode().  The quota feature must be enabled (BUG_ON
 * otherwise).
 */
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
                             unsigned int flags)
{
        unsigned long qf_inums[EXT4_MAXQUOTAS] = {
                le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
                le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
                le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
        };
        unsigned long ino;
        struct inode *qf_inode;
        int err;

        BUG_ON(!ext4_has_feature_quota(sb));

        ino = qf_inums[type];
        if (!ino)
                return -EPERM;

        if (!ext4_check_quota_inum(type, ino)) {
                ext4_error(sb, "Bad quota inum: %lu, type: %d",
                                ino, type);
                return -EUCLEAN;
        }

        qf_inode = ext4_iget(sb, ino, EXT4_IGET_SPECIAL);
        if (IS_ERR(qf_inode)) {
                ext4_error(sb, "Bad quota inode: %lu, type: %d",
                                ino, type);
                return PTR_ERR(qf_inode);
        }

        /* Don't account quota for quota files to avoid recursion */
        qf_inode->i_flags |= S_NOQUOTA;
        lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);

        err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
        if (err)
                lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);

        /* Drop the reference taken by ext4_iget() above */
        iput(qf_inode);

        return err;
}

/*
 * Enable usage tracking for all quota types.
 *
 * Every quota type that has a quota inode recorded in the superblock is
 * enabled with usage tracking; limit enforcement is added for the types
 * whose mount option (usrquota / grpquota / prjquota) is set.  On the first
 * failure a warning is logged, ext4_quotas_off() is called with the failing
 * type and the error is returned.  Returns 0 when no type failed.
 */
int ext4_enable_quotas(struct super_block *sb)
{
        unsigned long qf_inums[EXT4_MAXQUOTAS] = {
                le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
                le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
                le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
        };
        bool quota_mopt[EXT4_MAXQUOTAS] = {
                test_opt(sb, USRQUOTA),
                test_opt(sb, GRPQUOTA),
                test_opt(sb, PRJQUOTA),
        };
        int type;

        sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;

        for (type = 0; type < EXT4_MAXQUOTAS; type++) {
                unsigned int dq_flags = DQUOT_USAGE_ENABLED;
                int err;

                /* No quota inode recorded for this type */
                if (!qf_inums[type])
                        continue;

                if (quota_mopt[type])
                        dq_flags |= DQUOT_LIMITS_ENABLED;

                err = ext4_quota_enable(sb, type, QFMT_VFS_V1, dq_flags);
                if (!err)
                        continue;

                ext4_warning(sb,
                        "Failed to enable quota tracking "
                        "(type=%d, err=%d, ino=%lu). "
                        "Please run e2fsck to fix.", type,
                        err, qf_inums[type]);

                ext4_quotas_off(sb, type);
                return err;
        }

        return 0;
}

/*
 * Turn off quota of @type on @sb.
 *
 * With delayed allocation enabled the filesystem is synced first.  If there
 * is no quota inode, or no reference to it can be grabbed, the result of
 * dquot_quota_off() is returned directly.  Otherwise, when the filesystem
 * does not use the quota feature and is not read-only, the noatime and
 * immutable flags are cleared from the quota file and its mtime/ctime are
 * updated; an error from starting the handle or marking the inode dirty is
 * returned to the caller.  The lockdep subclass of the quota inode is reset
 * to I_DATA_SEM_NORMAL and the grabbed reference is dropped.
 */
static int ext4_quota_off(struct super_block *sb, int type)
{
        struct inode *inode = sb_dqopt(sb)->files[type];
        handle_t *handle;
        int err;

        /* Force all delayed allocation blocks to be allocated.
         * Caller already holds s_umount sem */
        if (test_opt(sb, DELALLOC))
                sync_filesystem(sb);

        if (!inode || !igrab(inode))
                return dquot_quota_off(sb, type);

        err = dquot_quota_off(sb, type);
        /*
         * Nothing more to do on failure or with the quota feature.  When the
         * filesystem was remounted read-only first, we cannot cleanup inode
         * flags here. Bad luck but people should be using QUOTA feature
         * these days anyway.
         */
        if (err || ext4_has_feature_quota(sb) || sb_rdonly(sb))
                goto out_put;

        inode_lock(inode);
        /*
         * Update modification times of quota files when userspace can
         * start looking at them.
         */
        handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
        if (IS_ERR(handle)) {
                err = PTR_ERR(handle);
        } else {
                EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
                inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
                inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
                err = ext4_mark_inode_dirty(handle, inode);
                ext4_journal_stop(handle);
        }
        inode_unlock(inode);

out_put:
        lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL);
        iput(inode);
        return err;
}

/*
 * Read data from quotafile - avoid pagecache and such because we cannot afford
 * acquiring the locks... As quota files are never truncated and quota code
 * itself serializes the operations (and no one else should touch the files)
 * we don't have to be afraid of races.
 *
 * Copies up to @len bytes starting at byte offset @off of the quota file of
 * @type into @data, one block at a time; holes read back as zeroes.  The
 * length is clipped to the file size.  Returns the number of bytes read
 * (0 when @off is beyond the file size) or the error from ext4_bread().
 */
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
                               size_t len, loff_t off)
{
        struct inode *inode = sb_dqopt(sb)->files[type];
        ext4_lblk_t lblk = off >> EXT4_BLOCK_SIZE_BITS(sb);
        int blk_off = off & (sb->s_blocksize - 1);
        loff_t i_size = i_size_read(inode);
        size_t remaining;

        if (off > i_size)
                return 0;
        if (off+len > i_size)
                len = i_size-off;

        for (remaining = len; remaining > 0; lblk++) {
                int chunk = min_t(unsigned long, sb->s_blocksize - blk_off,
                                  remaining);
                struct buffer_head *bh = ext4_bread(NULL, inode, lblk, 0);

                if (IS_ERR(bh))
                        return PTR_ERR(bh);

                if (bh)
                        memcpy(data, bh->b_data+blk_off, chunk);
                else        /* A hole? */
                        memset(data, 0, chunk);
                brelse(bh);

                /* Only the first block can start at a non-zero offset */
                blk_off = 0;
                remaining -= chunk;
                data += chunk;
        }
        return len;
}

/*
 * Write to quotafile (we know the transaction is already started and has
 * enough credits).
 *
 * Writes @len bytes from @data at byte offset @off of the quota file of
 * @type.  The write must fit within a single block and needs a running
 * journal handle; otherwise a warning is logged and -EIO is returned.  The
 * file size is extended when the write ends beyond it.  Returns @len on
 * success or a negative error.
 */
static ssize_t ext4_quota_write(struct super_block *sb, int type,
                                const char *data, size_t len, loff_t off)
{
        struct inode *inode = sb_dqopt(sb)->files[type];
        ext4_lblk_t lblk = off >> EXT4_BLOCK_SIZE_BITS(sb);
        int blk_off = off & (sb->s_blocksize - 1);
        int err = 0;
        int retries = 0;
        struct buffer_head *bh;
        handle_t *handle = journal_current_handle();

        if (!handle) {
                ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
                        " cancelled because transaction is not started",
                        (unsigned long long)off, (unsigned long long)len);
                return -EIO;
        }
        /*
         * Since we account only one data block in transaction credits,
         * then it is impossible to cross a block boundary.
         */
        if (sb->s_blocksize - blk_off < len) {
                ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
                        " cancelled because not block aligned",
                        (unsigned long long)off, (unsigned long long)len);
                return -EIO;
        }

        /* Retry the block lookup/allocation for as long as ENOSPC is retryable */
        for (;;) {
                bh = ext4_bread(handle, inode, lblk,
                                EXT4_GET_BLOCKS_CREATE |
                                EXT4_GET_BLOCKS_METADATA_NOFAIL);
                if (PTR_ERR(bh) != -ENOSPC)
                        break;
                if (!ext4_should_retry_alloc(inode->i_sb, &retries))
                        break;
        }
        if (IS_ERR(bh))
                return PTR_ERR(bh);

        if (bh) {
                BUFFER_TRACE(bh, "get write access");
                err = ext4_journal_get_write_access(handle, sb, bh,
                                                    EXT4_JTR_NONE);
                if (err) {
                        brelse(bh);
                        return err;
                }
                lock_buffer(bh);
                memcpy(bh->b_data+blk_off, data, len);
                flush_dcache_page(bh->b_page);
                unlock_buffer(bh);
                err = ext4_handle_dirty_metadata(handle, NULL, bh);
                brelse(bh);
        }

        /* Extend the file when the write ends beyond the current size */
        if (inode->i_size < off + len) {
                int dirty_err;

                i_size_write(inode, off + len);
                EXT4_I(inode)->i_disksize = inode->i_size;
                dirty_err = ext4_mark_inode_dirty(handle, inode);
                if (unlikely(dirty_err && !err))
                        err = dirty_err;
        }

        if (err)
                return err;
        return len;
}
#endif

#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
/*
 * Register ext2_fs_type as a filesystem.  A failure is only logged; it is
 * not reported to the caller.
 */
static inline void register_as_ext2(void)
{
        int err;

        err = register_filesystem(&ext2_fs_type);
        if (!err)
                return;

        printk(KERN_WARNING
               "EXT4-fs: Unable to register as ext2 (%d)\n", err);
}

/* Undo register_as_ext2(): unregister ext2_fs_type. */
static inline void unregister_as_ext2(void)
{
        unregister_filesystem(&ext2_fs_type);
}

/*
 * Check whether the feature set of @sb can be handled as ext2.
 *
 * Returns 0 if there are unknown ext2 incompat features.  Otherwise returns
 * 1 for a read-only superblock, and for a writable one only if there are no
 * unknown ext2 ro_compat features.
 */
static inline int ext2_feature_set_ok(struct super_block *sb)
{
        if (ext4_has_unknown_ext2_incompat_features(sb))
                return 0;

        /* Unknown ro_compat features only matter when mounting writable */
        return sb_rdonly(sb) ||
               !ext4_has_unknown_ext2_ro_compat_features(sb);
}
#else
/*
 * Stubs for configurations where CONFIG_EXT2_FS or CONFIG_EXT2_FS_MODULE is
 * defined, or CONFIG_EXT4_USE_FOR_EXT2 is not: nothing is registered and no
 * feature set is accepted as ext2.
 */
static inline void register_as_ext2(void) { }
static inline void unregister_as_ext2(void) { }
static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
#endif

/*
 * Register ext3_fs_type as a filesystem.  A failure is only logged; it is
 * not reported to the caller.
 */
static inline void register_as_ext3(void)
{
        int err;

        err = register_filesystem(&ext3_fs_type);
        if (!err)
                return;

        printk(KERN_WARNING
               "EXT4-fs: Unable to register as ext3 (%d)\n", err);
}

/* Undo register_as_ext3(): unregister ext3_fs_type. */
static inline void unregister_as_ext3(void)
{
        unregister_filesystem(&ext3_fs_type);
}

/*
 * Check whether the feature set of @sb can be handled as ext3.
 *
 * Returns 0 if there are unknown ext3 incompat features or the journal
 * feature is missing.  Otherwise returns 1 for a read-only superblock, and
 * for a writable one only if there are no unknown ext3 ro_compat features.
 */
static inline int ext3_feature_set_ok(struct super_block *sb)
{
        if (ext4_has_unknown_ext3_incompat_features(sb) ||
            !ext4_has_feature_journal(sb))
                return 0;

        /* Unknown ro_compat features only matter when mounting writable */
        return sb_rdonly(sb) ||
               !ext4_has_unknown_ext3_ro_compat_features(sb);
}

/*
 * ->kill_sb handler: tear down @sb with kill_block_super() and afterwards
 * drop the journal block device file, if there is one.
 *
 * The file pointer is read before kill_block_super() is called.
 * NOTE(review): presumably the ext4_sb_info must not be touched once
 * kill_block_super() has run -- confirm against the put_super path.
 */
static void ext4_kill_sb(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct file *journal_bdev_file = NULL;

        if (sbi)
                journal_bdev_file = sbi->s_journal_bdev_file;

        kill_block_super(sb);

        if (journal_bdev_file)
                bdev_fput(journal_bdev_file);
}

/* VFS registration record for the "ext4" filesystem type. */
static struct file_system_type ext4_fs_type = {
        .name                   = "ext4",
        .owner                  = THIS_MODULE,
        .fs_flags               = FS_ALLOW_IDMAP | FS_REQUIRES_DEV,
        .init_fs_context        = ext4_init_fs_context,
        .parameters             = ext4_param_specs,
        .kill_sb                = ext4_kill_sb,
};
MODULE_ALIAS_FS("ext4");

/*
 * Wait-queue heads shared across all ext4 file systems.  Every entry is
 * initialised by ext4_init_fs().
 */
wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];

/*
 * Module init: prepare global ext4 state, bring up each ext4 subsystem in
 * turn and finally register the filesystem types.
 *
 * Returns 0 on success.  If a step fails, its error code is returned after
 * the steps that had already succeeded are torn down through the out*
 * label ladder at the bottom of the function.
 */
static int __init ext4_init_fs(void)
{
        int i, err;

        ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
        ext4_li_info = NULL;

        /* Build-time check for flags consistency */
        ext4_check_flag_values();

        for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
                init_waitqueue_head(&ext4__ioend_wq[i]);

        /* First subsystem: nothing to unwind yet if it fails. */
        err = ext4_init_es();
        if (err)
                return err;

        err = ext4_init_pending();
        if (err)
                goto out7;

        err = ext4_init_post_read_processing();
        if (err)
                goto out6;

        err = ext4_init_pageio();
        if (err)
                goto out5;

        err = ext4_init_system_zone();
        if (err)
                goto out4;

        err = ext4_init_sysfs();
        if (err)
                goto out3;

        err = ext4_init_mballoc();
        if (err)
                goto out2;
        err = init_inodecache();
        if (err)
                goto out1;

        err = ext4_fc_init_dentry_cache();
        if (err)
                goto out05;

        /*
         * register_as_ext3()/register_as_ext2() return void, so only a
         * failure to register ext4 itself is treated as an error here.
         */
        register_as_ext3();
        register_as_ext2();
        err = register_filesystem(&ext4_fs_type);
        if (err)
                goto out;

        return 0;
        /*
         * Error unwinding: each label is the jump target for one failed
         * step and falls through, undoing the earlier steps in reverse
         * order of initialisation.
         */
out:
        unregister_as_ext2();
        unregister_as_ext3();
        ext4_fc_destroy_dentry_cache();
out05:
        destroy_inodecache();
out1:
        ext4_exit_mballoc();
out2:
        ext4_exit_sysfs();
out3:
        ext4_exit_system_zone();
out4:
        ext4_exit_pageio();
out5:
        ext4_exit_post_read_processing();
out6:
        ext4_exit_pending();
out7:
        ext4_exit_es();

        return err;
}

/*
 * Module exit: destroy the lazyinit thread, unregister the ext2/ext3/ext4
 * filesystem types, then call the exit/destroy counterpart of every
 * subsystem that ext4_init_fs() brought up.
 *
 * NOTE(review): ext4_exit_es() runs before ext4_exit_pending() here, while
 * the error path of ext4_init_fs() calls them in the opposite order -
 * presumably the two are independent; confirm.
 */
static void __exit ext4_exit_fs(void)
{
        ext4_destroy_lazyinit_thread();
        unregister_as_ext2();
        unregister_as_ext3();
        unregister_filesystem(&ext4_fs_type);
        ext4_fc_destroy_dentry_cache();
        destroy_inodecache();
        ext4_exit_mballoc();
        ext4_exit_sysfs();
        ext4_exit_system_zone();
        ext4_exit_pageio();
        ext4_exit_post_read_processing();
        ext4_exit_es();
        ext4_exit_pending();
}

/*
 * Module metadata, followed by the hooks that make ext4_init_fs() and
 * ext4_exit_fs() the module's init and exit entry points.
 */
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
MODULE_DESCRIPTION("Fourth Extended Filesystem");
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: crc32c");
module_init(ext4_init_fs)
module_exit(ext4_exit_fs)


























































































    1 

    1 




    1 

    1 



















































































































































































































































































































































































































































































































































































































































































































    1 


    1 







    1 




    1 




    1 






    1 





































































































    1 







    1 


























    1 







    1 




















    1 































    1 













    1 











    1 






















































































































































































































































































































































    1 




    1 


































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
#include <linux/watch_queue.h>
#include <linux/sysctl.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * New pipe buffers will be restricted to this size while the user is exceeding
 * their pipe buffer quota. The general pipe use case needs at least two
 * buffers: one for data yet to be read, and one for new data. If this is less
 * than two, then a write to a non-empty pipe may block even if the pipe is not
 * full. This can occur with GNU make jobserver or similar uses of pipes as
 * semaphores: multiple processes may be waiting to write tokens back to the
 * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
 *
 * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
 * own risk, namely: pipe writes to non-full pipes may block until the pipe is
 * emptied.
 */
#define PIPE_MIN_DEF_BUFFERS 2

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size. The default value is
 * 1048576 (1 << 20).
 */
static unsigned int pipe_max_size = 1048576;

/*
 * Maximum allocatable pages per user. The hard limit is unset (zero) by
 * default; the soft limit defaults to PIPE_DEF_BUFFERS * INR_OPEN_CUR.
 */
static unsigned long pipe_user_pages_hard;
static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use head and tail indices that aren't masked off, except at the point of
 * dereference, but rather they're allowed to wrap naturally.  This means there
 * isn't a dead spot in the buffer, but the ring has to be a power of two and
 * <= 2^31.
 * -- David Howells 2019-09-23.
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

/*
 * Three-way comparison: evaluates to -1 if l < r, 0 if l == r and 1 if
 * l > r.  The arguments are parenthesized so that expressions containing
 * operators of lower precedence than the relational ones (e.g. "a & b")
 * compare as written.  Each argument is evaluated twice, so do not pass
 * expressions with side effects.
 */
#define cmp_int(l, r)                (((l) > (r)) - ((l) < (r)))

#ifdef CONFIG_PROVE_LOCKING
/*
 * Order two lockdep maps by their addresses: negative, zero or positive
 * when @a lies below, at, or above @b.
 */
static int pipe_lock_cmp_fn(const struct lockdep_map *a,
                            const struct lockdep_map *b)
{
        unsigned long addr_a = (unsigned long) a;
        unsigned long addr_b = (unsigned long) b;

        return cmp_int(addr_a, addr_b);
}
#endif

/* Take pipe->mutex, but only when pipe->files is non-zero. */
void pipe_lock(struct pipe_inode_info *pipe)
{
        if (!pipe->files)
                return;

        mutex_lock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_lock);

/* Release pipe->mutex, but only when pipe->files is non-zero. */
void pipe_unlock(struct pipe_inode_info *pipe)
{
        if (!pipe->files)
                return;

        mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

/*
 * Lock two distinct pipes.  The pipe at the lower address is always locked
 * first, so the locking order does not depend on the argument order.
 * Passing the same pipe twice is a bug.
 */
void pipe_double_lock(struct pipe_inode_info *pipe1,
                      struct pipe_inode_info *pipe2)
{
        struct pipe_inode_info *first = pipe1;
        struct pipe_inode_info *second = pipe2;

        BUG_ON(pipe1 == pipe2);

        if (first > second) {
                first = pipe2;
                second = pipe1;
        }

        pipe_lock(first);
        pipe_lock(second);
}

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
                                  struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        /*
         * If nobody else uses this page, and we don't already have a
         * temporary page, let's keep track of it as a one-deep
         * allocation cache. (Otherwise just release our reference to it)
         */
        if (page_count(page) == 1 && !pipe->tmp_page)
                pipe->tmp_page = page;
        else
                put_page(page);
}

/*
 * ->try_steal for anonymous pipe buffers.
 *
 * Succeeds only when the page's reference count is exactly one.  In that
 * case the page is uncharged via memcg_kmem_uncharge_page() and marked
 * locked before true is returned; otherwise nothing is touched and false
 * is returned.
 */
static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        struct page *page = buf->page;
        bool stolen = page_count(page) == 1;

        if (stolen) {
                memcg_kmem_uncharge_page(page, 0);
                __SetPageLocked(page);
        }

        return stolen;
}

/**
 * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to attempt to steal
 *
 * Description:
 *        Try to steal the &struct page attached to @buf. The steal only
 *        succeeds when the page's reference count is exactly one, i.e.
 *        the buffer holds the only reference. On success the page has
 *        been locked with lock_page() and the caller may reuse it for
 *        whatever it wishes; the typical use is insertion into a
 *        different file page cache.
 *
 * Return: %true if the page was stolen (and is now locked), %false if it
 *        is still referenced elsewhere.
 */
bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        /* Somebody else holds a reference too: cannot steal. */
        if (page_count(page) != 1)
                return false;

        lock_page(page);
        return true;
}
EXPORT_SYMBOL(generic_pipe_buf_try_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to get a reference to
 *
 * Description:
 *        Grab an extra reference to the page backing @buf. It's used in
 *        the tee() system call, when we duplicate the buffers in one
 *        pipe into another.
 *
 * Return: the result of try_get_page() on the buffer's page.
 */
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        return try_get_page(page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to put a reference to
 *
 * Description:
 *        Drop one reference to the page backing @buf with put_page().
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
                              struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        put_page(page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

/* Buffer operations attached by pipe_write() to the pages it fills. */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
        .get            = generic_pipe_buf_get,
        .try_steal      = anon_pipe_buf_try_steal,
        .release        = anon_pipe_buf_release,
};

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
        unsigned int head = READ_ONCE(pipe->head);
        unsigned int tail = READ_ONCE(pipe->tail);
        unsigned int writers = READ_ONCE(pipe->writers);

        return !pipe_empty(head, tail) || !writers;
}

/*
 * Release @buf and advance the pipe's tail past it.
 *
 * @tail is the caller's copy of the tail index; the incremented value is
 * stored in pipe->tail and returned.
 */
static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
                                            struct pipe_buffer *buf,
                                            unsigned int tail)
{
        unsigned int next = tail + 1;

        pipe_buf_release(pipe, buf);

        /*
         * Without a watch_queue the tail can be bumped without the
         * spinlock - the mutex is enough.
         */
        if (!pipe_has_watch_queue(pipe)) {
                pipe->tail = next;
                return next;
        }

        /*
         * With a watch_queue, notifications get posted holding only
         * rd_wait.lock (no mutex), so the update needs that spinlock.
         */
        spin_lock_irq(&pipe->rd_wait.lock);
#ifdef CONFIG_WATCH_QUEUE
        if (buf->flags & PIPE_BUF_FLAG_LOSS)
                pipe->note_loss = true;
#endif
        pipe->tail = next;
        spin_unlock_irq(&pipe->rd_wait.lock);

        return next;
}

/*
 * pipe_read - read handler for pipes: copy buffered data into @to.
 *
 * Takes pipe->mutex and walks the ring from pipe->tail, copying out of each
 * buffer with copy_page_to_iter() and retiring buffers that become empty
 * via pipe_update_tail().  If nothing has been read yet, writers still
 * exist and neither O_NONBLOCK nor IOCB_NOWAIT is set, the mutex is dropped
 * and the caller sleeps on pipe->rd_wait until pipe_readable().
 *
 * Returns the number of bytes copied; 0 for a zero-length request or when
 * the pipe is empty and has no writers; or, when nothing was copied, a
 * negative errno (-EAGAIN, -EFAULT, -ENOBUFS, -ERESTARTSYS or the error
 * reported by pipe_buf_confirm()).
 */
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
        size_t total_len = iov_iter_count(to);
        struct file *filp = iocb->ki_filp;
        struct pipe_inode_info *pipe = filp->private_data;
        bool was_full, wake_next_reader = false;
        ssize_t ret;

        /* Null read succeeds. */
        if (unlikely(total_len == 0))
                return 0;

        ret = 0;
        mutex_lock(&pipe->mutex);

        /*
         * We only wake up writers if the pipe was full when we started
         * reading in order to avoid unnecessary wakeups.
         *
         * But when we do wake up writers, we do so using a sync wakeup
         * (WF_SYNC), because we want them to get going and generate more
         * data for us.
         */
        was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
        for (;;) {
                /* Read ->head with a barrier vs post_one_notification() */
                unsigned int head = smp_load_acquire(&pipe->head);
                unsigned int tail = pipe->tail;
                unsigned int mask = pipe->ring_size - 1;

#ifdef CONFIG_WATCH_QUEUE
                /*
                 * A loss was recorded: deliver a synthetic loss
                 * notification ahead of any further buffer data.
                 */
                if (pipe->note_loss) {
                        struct watch_notification n;

                        /*
                         * NOTE(review): 8 is presumably
                         * sizeof(struct watch_notification) - confirm.
                         */
                        if (total_len < 8) {
                                if (ret == 0)
                                        ret = -ENOBUFS;
                                break;
                        }

                        n.type = WATCH_TYPE_META;
                        n.subtype = WATCH_META_LOSS_NOTIFICATION;
                        n.info = watch_sizeof(n);
                        if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
                                if (ret == 0)
                                        ret = -EFAULT;
                                break;
                        }
                        ret += sizeof(n);
                        total_len -= sizeof(n);
                        pipe->note_loss = false;
                }
#endif

                if (!pipe_empty(head, tail)) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        size_t chars = buf->len;
                        size_t written;
                        int error;

                        /*
                         * Buffer holds more than was asked for: a WHOLE
                         * buffer must not be split, anything else is read
                         * partially.
                         */
                        if (chars > total_len) {
                                if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
                                        if (ret == 0)
                                                ret = -ENOBUFS;
                                        break;
                                }
                                chars = total_len;
                        }

                        error = pipe_buf_confirm(pipe, buf);
                        if (error) {
                                if (!ret)
                                        ret = error;
                                break;
                        }

                        written = copy_page_to_iter(buf->page, buf->offset, chars, to);
                        if (unlikely(written < chars)) {
                                if (!ret)
                                        ret = -EFAULT;
                                break;
                        }
                        ret += chars;
                        buf->offset += chars;
                        buf->len -= chars;

                        /*
                         * Was it a packet buffer? Clean up and exit:
                         * total_len = chars makes the subtraction below
                         * reach zero, and zeroing buf->len discards the
                         * unread remainder of the packet.
                         */
                        if (buf->flags & PIPE_BUF_FLAG_PACKET) {
                                total_len = chars;
                                buf->len = 0;
                        }

                        if (!buf->len)
                                tail = pipe_update_tail(pipe, buf, tail);
                        total_len -= chars;
                        if (!total_len)
                                break;        /* common path: read succeeded */
                        if (!pipe_empty(head, tail))        /* More to do? */
                                continue;
                }

                /* Ring is empty: decide whether to return or to sleep. */
                if (!pipe->writers)
                        break;
                if (ret)
                        break;
                if ((filp->f_flags & O_NONBLOCK) ||
                    (iocb->ki_flags & IOCB_NOWAIT)) {
                        ret = -EAGAIN;
                        break;
                }
                mutex_unlock(&pipe->mutex);

                /*
                 * We only get here if we didn't actually read anything.
                 *
                 * However, we could have seen (and removed) a zero-sized
                 * pipe buffer, and might have made space in the buffers
                 * that way.
                 *
                 * You can't make zero-sized pipe buffers by doing an empty
                 * write (not even in packet mode), but they can happen if
                 * the writer gets an EFAULT when trying to fill a buffer
                 * that already got allocated and inserted in the buffer
                 * array.
                 *
                 * So we still need to wake up any pending writers in the
                 * _very_ unlikely case that the pipe was full, but we got
                 * no data.
                 */
                if (unlikely(was_full))
                        wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);

                /*
                 * But because we didn't read anything, at this point we can
                 * just return directly with -ERESTARTSYS if we're interrupted,
                 * since we've done any required wakeups and there's no need
                 * to mark anything accessed. And we've dropped the lock.
                 */
                if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
                        return -ERESTARTSYS;

                mutex_lock(&pipe->mutex);
                was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
                wake_next_reader = true;
        }
        /* Nothing left in the ring: no point waking another reader. */
        if (pipe_empty(pipe->head, pipe->tail))
                wake_next_reader = false;
        mutex_unlock(&pipe->mutex);

        if (was_full)
                wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
        if (wake_next_reader)
                wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        if (ret > 0)
                file_accessed(filp);
        return ret;
}

/* Returns 1 when @file has O_DIRECT set in its f_flags, 0 otherwise. */
static inline int is_packetized(struct file *file)
{
        if (file->f_flags & O_DIRECT)
                return 1;

        return 0;
}

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
        unsigned int head = READ_ONCE(pipe->head);
        unsigned int tail = READ_ONCE(pipe->tail);
        unsigned int max_usage = READ_ONCE(pipe->max_usage);

        return !pipe_full(head, tail, max_usage) ||
                !READ_ONCE(pipe->readers);
}

/*
 * pipe_write - write handler for pipes: append the data in @from to the ring.
 *
 * Watch-queue pipes are rejected with -EXDEV before any locking.  Otherwise,
 * under pipe->mutex, the sub-page remainder of the write is first merged
 * into the most recent buffer when that buffer is PIPE_BUF_FLAG_CAN_MERGE
 * and has room; whatever is left is copied a page at a time into new ring
 * slots.  When the ring is full the writer either gives up (O_NONBLOCK,
 * IOCB_NOWAIT or a pending signal) or drops the mutex and sleeps on
 * pipe->wr_wait.
 *
 * Returns the number of bytes written or a negative errno.  With no readers
 * SIGPIPE is sent and -EPIPE is returned unless data was already written.
 * An error from file_update_time() replaces a positive byte count.
 */
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *filp = iocb->ki_filp;
        struct pipe_inode_info *pipe = filp->private_data;
        unsigned int head;
        ssize_t ret = 0;
        size_t total_len = iov_iter_count(from);
        ssize_t chars;
        bool was_empty = false;
        bool wake_next_writer = false;

        /*
         * Reject writing to watch queue pipes before the point where we lock
         * the pipe.
         * Otherwise, lockdep would be unhappy if the caller already has another
         * pipe locked.
         * If we had to support locking a normal pipe and a notification pipe at
         * the same time, we could set up lockdep annotations for that, but
         * since we don't actually need that, it's simpler to just bail here.
         */
        if (pipe_has_watch_queue(pipe))
                return -EXDEV;

        /* Null write succeeds. */
        if (unlikely(total_len == 0))
                return 0;

        mutex_lock(&pipe->mutex);

        if (!pipe->readers) {
                send_sig(SIGPIPE, current, 0);
                ret = -EPIPE;
                goto out;
        }

        /*
         * If it wasn't empty we try to merge new data into
         * the last buffer.
         *
         * That naturally merges small writes, but it also
         * page-aligns the rest of the writes for large writes
         * spanning multiple pages.
         */
        head = pipe->head;
        was_empty = pipe_empty(head, pipe->tail);
        /* chars = the part of the write that is not a whole page */
        chars = total_len & (PAGE_SIZE-1);
        if (chars && !was_empty) {
                unsigned int mask = pipe->ring_size - 1;
                struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
                int offset = buf->offset + buf->len;

                if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
                    offset + chars <= PAGE_SIZE) {
                        ret = pipe_buf_confirm(pipe, buf);
                        if (ret)
                                goto out;

                        ret = copy_page_from_iter(buf->page, offset, chars, from);
                        if (unlikely(ret < chars)) {
                                ret = -EFAULT;
                                goto out;
                        }

                        buf->len += ret;
                        if (!iov_iter_count(from))
                                goto out;
                }
        }

        for (;;) {
                if (!pipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        if (!ret)
                                ret = -EPIPE;
                        break;
                }

                head = pipe->head;
                if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
                        unsigned int mask = pipe->ring_size - 1;
                        struct pipe_buffer *buf;
                        /* Reuse the cached page if there is one. */
                        struct page *page = pipe->tmp_page;
                        int copied;

                        if (!page) {
                                page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
                                if (unlikely(!page)) {
                                        ret = ret ? : -ENOMEM;
                                        break;
                                }
                                pipe->tmp_page = page;
                        }

                        /* Allocate a slot in the ring in advance and attach an
                         * empty buffer.  If we fault or otherwise fail to use
                         * it, either the reader will consume it or it'll still
                         * be there for the next write.
                         */
                        pipe->head = head + 1;

                        /* Insert it into the buffer array */
                        buf = &pipe->bufs[head & mask];
                        buf->page = page;
                        buf->ops = &anon_pipe_buf_ops;
                        buf->offset = 0;
                        buf->len = 0;
                        if (is_packetized(filp))
                                buf->flags = PIPE_BUF_FLAG_PACKET;
                        else
                                buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
                        /* The page now belongs to the ring slot. */
                        pipe->tmp_page = NULL;

                        copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
                        /* A short copy with data still pending is a fault. */
                        if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
                                if (!ret)
                                        ret = -EFAULT;
                                break;
                        }
                        ret += copied;
                        buf->len = copied;

                        if (!iov_iter_count(from))
                                break;
                }

                if (!pipe_full(head, pipe->tail, pipe->max_usage))
                        continue;

                /* Wait for buffer space to become available. */
                if ((filp->f_flags & O_NONBLOCK) ||
                    (iocb->ki_flags & IOCB_NOWAIT)) {
                        if (!ret)
                                ret = -EAGAIN;
                        break;
                }
                if (signal_pending(current)) {
                        if (!ret)
                                ret = -ERESTARTSYS;
                        break;
                }

                /*
                 * We're going to release the pipe lock and wait for more
                 * space. We wake up any readers if necessary, and then
                 * after waiting we need to re-check whether the pipe
                 * became empty while we dropped the lock.
                 */
                mutex_unlock(&pipe->mutex);
                if (was_empty)
                        wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
                mutex_lock(&pipe->mutex);
                was_empty = pipe_empty(pipe->head, pipe->tail);
                wake_next_writer = true;
        }
out:
        /* The ring is full again: no point waking another writer. */
        if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                wake_next_writer = false;
        mutex_unlock(&pipe->mutex);

        /*
         * If we do do a wakeup event, we do a 'sync' wakeup, because we
         * want the reader to start processing things asap, rather than
         * leave the data pending.
         *
         * This is particularly important for small writes, because of
         * how (for example) the GNU make jobserver uses small writes to
         * wake up pending jobs
         *
         * Epoll nonsensically wants a wakeup whether the pipe
         * was already empty or not.
         */
        if (was_empty || pipe->poll_usage)
                wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
        if (wake_next_writer)
                wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
        /* Timestamps are only updated if the sb write lock is free. */
        if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
                int err = file_update_time(filp);
                if (err)
                        ret = err;
                sb_end_write(file_inode(filp)->i_sb);
        }
        return ret;
}

/*
 * ioctl() handler shared by pipes and FIFOs.
 *
 * FIONREAD adds up the length of every buffer queued between the ring's
 * tail and head (under the pipe mutex) and stores the total in the user
 * int that @arg points to. With CONFIG_WATCH_QUEUE the watch-queue size
 * and filter commands are handed to the watch_queue code. Every other
 * command returns -ENOIOCTLCMD.
 */
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
        struct pipe_inode_info *pipe = filp->private_data;
        unsigned int nbytes, head, slot, mask;

        switch (cmd) {
        case FIONREAD:
                nbytes = 0;

                mutex_lock(&pipe->mutex);
                head = pipe->head;
                mask = pipe->ring_size - 1;
                /* The indices are only masked when used to index the ring */
                for (slot = pipe->tail; slot != head; slot++)
                        nbytes += pipe->bufs[slot & mask].len;
                mutex_unlock(&pipe->mutex);

                return put_user(nbytes, (int __user *)arg);

#ifdef CONFIG_WATCH_QUEUE
        case IOC_WATCH_QUEUE_SET_SIZE: {
                int err;

                mutex_lock(&pipe->mutex);
                err = watch_queue_set_size(pipe, arg);
                mutex_unlock(&pipe->mutex);
                return err;
        }

        case IOC_WATCH_QUEUE_SET_FILTER:
                return watch_queue_set_filter(
                        pipe, (struct watch_notification_filter __user *)arg);
#endif

        default:
                return -ENOIOCTLCMD;
        }
}

/*
 * Report poll/epoll readiness for one end of a pipe.
 *
 * No kernel lock held - fine, see the ordering notes in the body.
 */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
        struct pipe_inode_info *pipe = filp->private_data;
        const bool rd_side = (filp->f_mode & FMODE_READ) != 0;
        const bool wr_side = (filp->f_mode & FMODE_WRITE) != 0;
        unsigned int head, tail;
        __poll_t events = 0;

        /* Turn on the historical (nasty) epoll semantics for this pipe */
        WRITE_ONCE(pipe->poll_usage, true);

        /*
         * Only pipe state is read below, so the pipe lock is not taken.
         * That makes the checks racy, which is why the poll table entries
         * have to be added _first_ ..
         */
        if (rd_side)
                poll_wait(filp, &pipe->rd_wait, wait);
        if (wr_side)
                poll_wait(filp, &pipe->wr_wait, wait);

        /*
         * .. and the racy snapshot is taken only afterwards. If the state
         * changes behind our back, the poll table entry wakes the caller
         * up and the result gets corrected.
         */
        head = READ_ONCE(pipe->head);
        tail = READ_ONCE(pipe->tail);

        if (rd_side) {
                if (!pipe_empty(head, tail))
                        events |= EPOLLIN | EPOLLRDNORM;
                if (!pipe->writers && filp->f_version != pipe->w_counter)
                        events |= EPOLLHUP;
        }

        if (wr_side) {
                if (!pipe_full(head, tail, pipe->max_usage))
                        events |= EPOLLOUT | EPOLLWRNORM;
                /*
                 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
                 * behave exactly like pipes for poll().
                 */
                if (!pipe->readers)
                        events |= EPOLLERR;
        }

        return events;
}

/*
 * Drop one file reference on @pipe. When the count reaches zero the pipe is
 * detached from @inode (under i_lock) and then freed.
 */
static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
        bool last_ref;

        spin_lock(&inode->i_lock);
        last_ref = (--pipe->files == 0);
        if (last_ref)
                inode->i_pipe = NULL;
        spin_unlock(&inode->i_lock);

        /* The actual teardown happens after the spinlock is released */
        if (last_ref)
                free_pipe_info(pipe);
}

/*
 * ->release() for pipes and FIFOs: drop this file's reader/writer counts,
 * notify the other side if one end just went away, and release the file's
 * reference on the pipe. Always returns 0.
 */
static int
pipe_release(struct inode *inode, struct file *file)
{
        struct pipe_inode_info *pipe = file->private_data;
        bool no_readers, no_writers;

        mutex_lock(&pipe->mutex);
        if (file->f_mode & FMODE_READ)
                pipe->readers--;
        if (file->f_mode & FMODE_WRITE)
                pipe->writers--;

        no_readers = pipe->readers == 0;
        no_writers = pipe->writers == 0;

        /* Exactly one side is now gone: wake and signal whoever is left */
        if (no_readers != no_writers) {
                wake_up_interruptible_all(&pipe->rd_wait);
                wake_up_interruptible_all(&pipe->wr_wait);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }
        mutex_unlock(&pipe->mutex);

        put_pipe_info(inode, pipe);
        return 0;
}

/*
 * ->fasync() for pipes and FIFOs: (un)register @filp on the reader and/or
 * writer fasync lists, depending on the modes it was opened with.
 *
 * Returns the last fasync_helper() result, or 0 if the file is neither
 * readable nor writable.
 */
static int
pipe_fasync(int fd, struct file *filp, int on)
{
        struct pipe_inode_info *pipe = filp->private_data;
        const bool rd_side = (filp->f_mode & FMODE_READ) != 0;
        const bool wr_side = (filp->f_mode & FMODE_WRITE) != 0;
        int retval = 0;

        mutex_lock(&pipe->mutex);

        if (rd_side) {
                retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
                if (retval < 0)
                        goto unlock;
        }

        if (wr_side) {
                retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
                /*
                 * Failure here can happen only if on == T; undo the reader
                 * registration made just above so both lists stay in step.
                 */
                if (retval < 0 && rd_side)
                        fasync_helper(-1, filp, 0, &pipe->fasync_readers);
        }

unlock:
        mutex_unlock(&pipe->mutex);
        return retval;
}

/*
 * Move @user's accounted pipe-buffer total from @old to @new buffers for
 * one pipe and return the user's resulting total.
 */
unsigned long account_pipe_buffers(struct user_struct *user,
                                   unsigned long old, unsigned long new)
{
        /* Unsigned wrap-around turns this into a negative delta if new < old */
        unsigned long delta = new - old;

        return atomic_long_add_return(delta, &user->pipe_bufs);
}

/*
 * True when @user_bufs exceeds the pipe_user_pages_soft limit. A limit of
 * zero disables the check.
 */
bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
        const unsigned long limit = READ_ONCE(pipe_user_pages_soft);

        if (!limit)
                return false;
        return user_bufs > limit;
}

/*
 * True when @user_bufs exceeds the pipe_user_pages_hard limit. A limit of
 * zero disables the check.
 */
bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
        const unsigned long limit = READ_ONCE(pipe_user_pages_hard);

        if (!limit)
                return false;
        return user_bufs > limit;
}

/*
 * True when the current task holds neither CAP_SYS_RESOURCE nor
 * CAP_SYS_ADMIN. CAP_SYS_ADMIN is only queried if CAP_SYS_RESOURCE is
 * missing.
 */
bool pipe_is_unprivileged_user(void)
{
        if (capable(CAP_SYS_RESOURCE))
                return false;

        return !capable(CAP_SYS_ADMIN);
}

/*
 * Allocate and initialise a pipe_inode_info together with its buffer ring.
 *
 * The ring starts at PIPE_DEF_BUFFERS slots. It is reduced to fit
 * pipe_max_size for callers without CAP_SYS_RESOURCE, and dropped to
 * PIPE_MIN_DEF_BUFFERS when an unprivileged user is over the soft limit.
 * Allocation is refused when an unprivileged user is over the hard limit.
 *
 * A reference on the current user is taken for accounting; it is stored in
 * pipe->user on success and dropped on failure.
 *
 * Returns the new pipe, or NULL on allocation failure or hard-limit refusal.
 */
struct pipe_inode_info *alloc_pipe_info(void)
{
        struct pipe_inode_info *pipe;
        unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
        struct user_struct *user = get_current_user();
        unsigned long user_bufs;
        unsigned int max_size = READ_ONCE(pipe_max_size);

        pipe = kzalloc(sizeof(*pipe), GFP_KERNEL_ACCOUNT);
        if (!pipe)
                goto out_free_uid;

        if (pipe_bufs * PAGE_SIZE > max_size) {
                if (!capable(CAP_SYS_RESOURCE))
                        pipe_bufs = max_size >> PAGE_SHIFT;
        }

        user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

        /* Over the soft limit: fall back to the minimum default ring */
        if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
                user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
                pipe_bufs = PIPE_MIN_DEF_BUFFERS;
        }

        /* Over the hard limit: no pipe at all */
        if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
                goto out_revert_acct;

        pipe->bufs = kcalloc(pipe_bufs, sizeof(*pipe->bufs),
                             GFP_KERNEL_ACCOUNT);
        if (!pipe->bufs)
                goto out_revert_acct;

        init_waitqueue_head(&pipe->rd_wait);
        init_waitqueue_head(&pipe->wr_wait);
        pipe->r_counter = 1;
        pipe->w_counter = 1;
        pipe->ring_size = pipe_bufs;
        pipe->max_usage = pipe_bufs;
        pipe->nr_accounted = pipe_bufs;
        pipe->user = user;
        mutex_init(&pipe->mutex);
        lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL);
        return pipe;

out_revert_acct:
        (void) account_pipe_buffers(user, pipe_bufs, 0);
        kfree(pipe);
out_free_uid:
        free_uid(user);
        return NULL;
}

/*
 * Tear down a pipe: clear the watch queue (if any), give back the user's
 * buffer accounting and reference, release every ring slot that still has
 * buffer ops attached, and free the temporary page, the ring and the pipe.
 */
void free_pipe_info(struct pipe_inode_info *pipe)
{
        struct pipe_buffer *buf;
        unsigned int slot;

#ifdef CONFIG_WATCH_QUEUE
        if (pipe->watch_queue)
                watch_queue_clear(pipe->watch_queue);
#endif

        (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
        free_uid(pipe->user);

        for (slot = 0; slot < pipe->ring_size; slot++) {
                buf = &pipe->bufs[slot];
                /* Slots without ops hold nothing that needs releasing */
                if (!buf->ops)
                        continue;
                pipe_buf_release(pipe, buf);
        }

#ifdef CONFIG_WATCH_QUEUE
        if (pipe->watch_queue)
                put_watch_queue(pipe->watch_queue);
#endif

        if (pipe->tmp_page)
                __free_page(pipe->tmp_page);

        kfree(pipe->bufs);
        kfree(pipe);
}

/*
 * Internal pipefs mount. Set by init_pipe_fs() from kern_mount() and used
 * when creating pipe inodes and files.
 */
static struct vfsmount *pipe_mnt __ro_after_init;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
        return dynamic_dname(buffer, buflen, "pipe:[%lu]",
                                d_inode(dentry)->i_ino);
}

/* Dentry operations for pipefs: only the dynamic "pipe:[ino]" name hook. */
static const struct dentry_operations pipefs_dentry_operations = {
        .d_dname        = pipefs_dname,
};

/*
 * Create a pipefs inode with a freshly allocated pipe attached. The pipe
 * starts with two file references, one reader and one writer.
 *
 * Returns the inode, or NULL if the inode or the pipe cannot be allocated.
 */
static struct inode * get_pipe_inode(void)
{
        struct pipe_inode_info *pipe;
        struct inode *inode;

        inode = new_inode_pseudo(pipe_mnt->mnt_sb);
        if (!inode)
                return NULL;

        inode->i_ino = get_next_ino();

        pipe = alloc_pipe_info();
        if (!pipe) {
                iput(inode);
                return NULL;
        }

        pipe->files = 2;
        pipe->readers = 1;
        pipe->writers = 1;
        inode->i_pipe = pipe;
        inode->i_fop = &pipefifo_fops;

        /*
         * Start out marked dirty: "mark_inode_dirty()" then believes the
         * inode already _is_ on the dirty list and never moves it there.
         */
        inode->i_state = I_DIRTY;
        inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
        simple_inode_init_ts(inode);

        return inode;
}

int create_pipe_files(struct file **res, int flags)
{
        struct inode *inode = get_pipe_inode();
        struct file *f;
        int error;

        if (!inode)
                return -ENFILE;

        if (flags & O_NOTIFICATION_PIPE) {
                error = watch_queue_init(inode->i_pipe);
                if (error) {
                        free_pipe_info(inode->i_pipe);
                        iput(inode);
                        return error;
                }
        }

        f = alloc_file_pseudo(inode, pipe_mnt, "",
                                O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
                                &pipefifo_fops);
        if (IS_ERR(f)) {
                free_pipe_info(inode->i_pipe);
                iput(inode);
                return PTR_ERR(f);
        }

        f->private_data = inode->i_pipe;

        res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
                                  &pipefifo_fops);
        if (IS_ERR(res[0])) {
                put_pipe_info(inode, inode->i_pipe);
                fput(f);
                return PTR_ERR(res[0]);
        }
        res[0]->private_data = inode->i_pipe;
        res[1] = f;
        stream_open(inode, res[0]);
        stream_open(inode, res[1]);
        return 0;
}

/*
 * Create a pipe and reserve two descriptors for it without installing them.
 *
 * On success fd[0]/fd[1] hold the reserved read/write descriptor numbers
 * and files[0]/files[1] the matching files; the caller does the
 * fd_install(). Returns 0, -EINVAL for unsupported @flags, or the error
 * from pipe creation or descriptor reservation.
 */
static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
        int fdr, fdw;
        int error;

        if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
                return -EINVAL;

        error = create_pipe_files(files, flags);
        if (error)
                return error;

        fdr = get_unused_fd_flags(flags);
        if (fdr < 0) {
                error = fdr;
                goto err_read_pipe;
        }

        fdw = get_unused_fd_flags(flags);
        if (fdw < 0) {
                error = fdw;
                goto err_fdr;
        }

        audit_fd_pair(fdr, fdw);
        fd[0] = fdr;
        fd[1] = fdw;
        /* pipe groks IOCB_NOWAIT */
        files[0]->f_mode |= FMODE_NOWAIT;
        files[1]->f_mode |= FMODE_NOWAIT;
        return 0;

err_fdr:
        put_unused_fd(fdr);
err_read_pipe:
        fput(files[0]);
        fput(files[1]);
        return error;
}

/*
 * Create a pipe and install both ends in the current file table.
 * @fd receives the read (fd[0]) and write (fd[1]) descriptors.
 * Returns 0 on success or the error from __do_pipe_flags().
 */
int do_pipe_flags(int *fd, int flags)
{
        struct file *files[2];
        int error;

        error = __do_pipe_flags(fd, files, flags);
        if (error)
                return error;

        fd_install(fd[0], files[0]);
        fd_install(fd[1], files[1]);
        return 0;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 *
 * Common body of pipe(2) and pipe2(2): create the pipe, copy the two
 * descriptor numbers to @fildes, and install the files only once that copy
 * has succeeded. Returns 0, -EFAULT if the copy fails, or the error from
 * __do_pipe_flags().
 */
static int do_pipe2(int __user *fildes, int flags)
{
        struct file *files[2];
        int fd[2];
        int error;

        error = __do_pipe_flags(fd, files, flags);
        if (error)
                return error;

        if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
                /* Nothing was installed yet: drop files and reserved fds */
                fput(files[0]);
                fput(files[1]);
                put_unused_fd(fd[0]);
                put_unused_fd(fd[1]);
                return -EFAULT;
        }

        fd_install(fd[0], files[0]);
        fd_install(fd[1], files[1]);
        return 0;
}

/* pipe2() system call: create a pipe with the given @flags via do_pipe2(). */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
        return do_pipe2(fildes, flags);
}

/* pipe() system call: same as pipe2() with no flags set. */
SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
        return do_pipe2(fildes, 0);
}

/*
 * This is the stupid "wait for pipe to be readable or writable"
 * model.
 *
 * See pipe_read/write() for the proper kind of exclusive wait,
 * but that requires that we wake up any other readers/writers
 * if we then do not end up reading everything (ie the whole
 * "wake_next_reader/writer" logic in pipe_read/write()).
 *
 * pipe_wait_readable() drops the pipe lock, sleeps interruptibly on
 * rd_wait until pipe_readable() holds, then re-takes the lock. The result
 * of the interruptible wait is discarded, so the function does not report
 * whether the wait ended because of a signal.
 */
void pipe_wait_readable(struct pipe_inode_info *pipe)
{
        pipe_unlock(pipe);
        wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
        pipe_lock(pipe);
}

/*
 * Drop the pipe lock, sleep interruptibly on wr_wait until pipe_writable()
 * holds, then re-take the lock. The result of the interruptible wait is
 * discarded, so the function does not report whether the wait ended
 * because of a signal.
 */
void pipe_wait_writable(struct pipe_inode_info *pipe)
{
        pipe_unlock(pipe);
        wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
        pipe_lock(pipe);
}

/*
 * This depends on both the wait (here) and the wakeup (wake_up_partner)
 * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
 * race with the count check and waitqueue prep.
 *
 * Normally in order to avoid races, you'd do the prepare_to_wait() first,
 * then check the condition you're waiting for, and only then sleep. But
 * because of the pipe lock, we can check the condition before being on
 * the wait queue.
 *
 * We use the 'rd_wait' waitqueue for pipe partner waiting.
 *
 * @pipe: the pipe being opened; its lock is dropped while sleeping and
 *        re-taken before returning.
 * @cnt:  counter expected to change when the partner shows up (fifo_open()
 *        passes &pipe->w_counter or &pipe->r_counter).
 *
 * Returns 0 once *cnt differs from its value on entry, or -ERESTARTSYS if
 * a signal is pending and *cnt is still unchanged.
 */
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
        DEFINE_WAIT(rdwait);
        /* Snapshot of the counter; any change means a partner arrived */
        int cur = *cnt;

        while (cur == *cnt) {
                /* Queue ourselves before dropping the lock (see above) */
                prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
                pipe_unlock(pipe);
                schedule();
                finish_wait(&pipe->rd_wait, &rdwait);
                pipe_lock(pipe);
                if (signal_pending(current))
                        break;
        }
        /* A counter change wins even if a signal is pending as well */
        return cur == *cnt ? -ERESTARTSYS : 0;
}

/*
 * Wake every task sleeping interruptibly on rd_wait, which is the queue
 * wait_for_partner() sleeps on.
 */
static void wake_up_partner(struct pipe_inode_info *pipe)
{
        wake_up_interruptible_all(&pipe->rd_wait);
}

/*
 * ->open() for FIFOs and pipes.
 *
 * Attaches the inode's pipe_inode_info to @filp (allocating it on first
 * open), takes a file reference on it, and updates the reader/writer
 * counts according to the open mode. For inodes that are not on pipefs
 * the FIFO open rules apply: a read-only or write-only open may block
 * until a partner shows up, or fail with -ENXIO for a non-blocking
 * write-only open without readers.
 *
 * Returns 0 on success, -ENOMEM if the pipe cannot be allocated, -ENXIO,
 * -EINVAL for a file opened for neither reading nor writing, or
 * -ERESTARTSYS when waiting for the partner was interrupted.
 */
static int fifo_open(struct inode *inode, struct file *filp)
{
        struct pipe_inode_info *pipe;
        /* Inodes on pipefs skip the FIFO partner rules below */
        bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
        int ret;

        filp->f_version = 0;

        spin_lock(&inode->i_lock);
        if (inode->i_pipe) {
                pipe = inode->i_pipe;
                pipe->files++;
                spin_unlock(&inode->i_lock);
        } else {
                /*
                 * Allocate with i_lock dropped, then re-check: another
                 * opener may have attached a pipe in the meantime.
                 */
                spin_unlock(&inode->i_lock);
                pipe = alloc_pipe_info();
                if (!pipe)
                        return -ENOMEM;
                pipe->files = 1;
                spin_lock(&inode->i_lock);
                if (unlikely(inode->i_pipe)) {
                        /* Lost the race: use the winner's pipe, free ours */
                        inode->i_pipe->files++;
                        spin_unlock(&inode->i_lock);
                        free_pipe_info(pipe);
                        pipe = inode->i_pipe;
                } else {
                        inode->i_pipe = pipe;
                        spin_unlock(&inode->i_lock);
                }
        }
        filp->private_data = pipe;
        /* OK, we have a pipe and it's pinned down */

        mutex_lock(&pipe->mutex);

        /* We can only do regular read/write on fifos */
        stream_open(inode, filp);

        switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
        case FMODE_READ:
        /*
         *  O_RDONLY
         *  POSIX.1 says that O_NONBLOCK means return with the FIFO
         *  opened, even when there is no process writing the FIFO.
         */
                pipe->r_counter++;
                /* First reader: wake writers blocked in their open */
                if (pipe->readers++ == 0)
                        wake_up_partner(pipe);

                if (!is_pipe && !pipe->writers) {
                        if ((filp->f_flags & O_NONBLOCK)) {
                                /* suppress EPOLLHUP until we have
                                 * seen a writer */
                                filp->f_version = pipe->w_counter;
                        } else {
                                if (wait_for_partner(pipe, &pipe->w_counter))
                                        goto err_rd;
                        }
                }
                break;

        case FMODE_WRITE:
        /*
         *  O_WRONLY
         *  POSIX.1 says that O_NONBLOCK means return -1 with
         *  errno=ENXIO when there is no process reading the FIFO.
         */
                ret = -ENXIO;
                if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
                        goto err;

                pipe->w_counter++;
                /* First writer: wake readers blocked in their open */
                if (!pipe->writers++)
                        wake_up_partner(pipe);

                if (!is_pipe && !pipe->readers) {
                        if (wait_for_partner(pipe, &pipe->r_counter))
                                goto err_wr;
                }
                break;

        case FMODE_READ | FMODE_WRITE:
        /*
         *  O_RDWR
         *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
         *  This implementation will NEVER block on a O_RDWR open, since
         *  the process can at least talk to itself.
         */

                pipe->readers++;
                pipe->writers++;
                pipe->r_counter++;
                pipe->w_counter++;
                if (pipe->readers == 1 || pipe->writers == 1)
                        wake_up_partner(pipe);
                break;

        default:
                ret = -EINVAL;
                goto err;
        }

        /* Ok! */
        mutex_unlock(&pipe->mutex);
        return 0;

        /* Interrupted read-side open: undo the reader count taken above */
err_rd:
        if (!--pipe->readers)
                wake_up_interruptible(&pipe->wr_wait);
        ret = -ERESTARTSYS;
        goto err;

        /* Interrupted write-side open: undo the writer count taken above */
err_wr:
        if (!--pipe->writers)
                wake_up_interruptible_all(&pipe->rd_wait);
        ret = -ERESTARTSYS;
        goto err;

        /* Common failure exit: drop the file reference taken at the top */
err:
        mutex_unlock(&pipe->mutex);

        put_pipe_info(inode, pipe);
        return ret;
}

/*
 * File operations shared by pipes and FIFOs. get_pipe_info() compares a
 * file's f_op against this table to recognise pipe files.
 */
const struct file_operations pipefifo_fops = {
        .open                = fifo_open,
        .llseek                = no_llseek,
        .read_iter        = pipe_read,
        .write_iter        = pipe_write,
        .poll                = pipe_poll,
        .unlocked_ioctl        = pipe_ioctl,
        .release        = pipe_release,
        .fasync                = pipe_fasync,
        .splice_write        = iter_file_splice_write,
};

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages.
 *
 * Rounds @size up to a power of two, with PAGE_SIZE as the minimum
 * (as required by POSIX). Returns 0 on error, i.e. when @size is
 * larger than 2^31.
 */
unsigned int round_pipe_size(unsigned int size)
{
        const unsigned int limit = 1U << 31;

        if (size > limit)
                return 0;

        return size < PAGE_SIZE ? PAGE_SIZE : roundup_pow_of_two(size);
}

/*
 * Resize the pipe ring to a number of slots.
 *
 * Note the pipe can be reduced in capacity, but only if the current
 * occupancy doesn't exceed nr_slots; if it does, EBUSY will be
 * returned instead.
 *
 * The new array is allocated before rd_wait.lock is taken; the old
 * contents are then copied and the arrays swapped with that lock held and
 * interrupts disabled. pipe_set_size() reaches this with pipe->mutex held
 * (taken in pipe_fcntl()).
 *
 * Returns 0 on success, -ENOMEM if the new array cannot be allocated, or
 * -EBUSY if the queued buffers do not fit into @nr_slots.
 */
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
        struct pipe_buffer *bufs;
        unsigned int head, tail, mask, n;

        bufs = kcalloc(nr_slots, sizeof(*bufs),
                       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
        if (unlikely(!bufs))
                return -ENOMEM;

        spin_lock_irq(&pipe->rd_wait.lock);
        mask = pipe->ring_size - 1;
        head = pipe->head;
        tail = pipe->tail;

        /* Number of slots currently in use */
        n = pipe_occupancy(head, tail);
        if (nr_slots < n) {
                spin_unlock_irq(&pipe->rd_wait.lock);
                kfree(bufs);
                return -EBUSY;
        }

        /*
         * The pipe array wraps around, so just start the new one at zero
         * and adjust the indices.
         */
        if (n > 0) {
                unsigned int h = head & mask;
                unsigned int t = tail & mask;
                if (h > t) {
                        /* Used slots are contiguous: [t, h) */
                        memcpy(bufs, pipe->bufs + t,
                               n * sizeof(struct pipe_buffer));
                } else {
                        /*
                         * Used slots wrap: [t, ring_size) goes to the front
                         * of the new array, followed by [0, h).
                         */
                        unsigned int tsize = pipe->ring_size - t;
                        if (h > 0)
                                memcpy(bufs + tsize, pipe->bufs,
                                       h * sizeof(struct pipe_buffer));
                        memcpy(bufs, pipe->bufs + t,
                               tsize * sizeof(struct pipe_buffer));
                }
        }

        /* In the new array the data occupies slots [0, n) */
        head = n;
        tail = 0;

        kfree(pipe->bufs);
        pipe->bufs = bufs;
        pipe->ring_size = nr_slots;
        if (pipe->max_usage > nr_slots)
                pipe->max_usage = nr_slots;
        pipe->tail = tail;
        pipe->head = head;

        /*
         * Without a watch queue the usage limit and the accounted size
         * follow the ring size; with one, max_usage is only clamped (above)
         * and nr_accounted is left alone.
         */
        if (!pipe_has_watch_queue(pipe)) {
                pipe->max_usage = nr_slots;
                pipe->nr_accounted = nr_slots;
        }

        spin_unlock_irq(&pipe->rd_wait.lock);

        /* This might have made more room for writers */
        wake_up_interruptible(&pipe->wr_wait);
        return 0;
}

/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or return -ERROR on error.
 *
 * @pipe: pipe to resize; pipe_fcntl() calls this with pipe->mutex held.
 * @arg:  requested capacity in bytes; rounded by round_pipe_size().
 *
 * Errors: -EBUSY for pipes with a watch queue (or when the queued data
 * does not fit the new ring), -EINVAL if @arg cannot be rounded to a valid
 * size, -EPERM if an unprivileged growth exceeds a limit, -ENOMEM if the
 * new ring cannot be allocated.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
{
        unsigned long user_bufs;
        unsigned int nr_slots, size;
        long ret = 0;

        if (pipe_has_watch_queue(pipe))
                return -EBUSY;

        size = round_pipe_size(arg);
        nr_slots = size >> PAGE_SHIFT;

        if (!nr_slots)
                return -EINVAL;

        /*
         * If trying to increase the pipe capacity, check that an
         * unprivileged user is not trying to exceed various limits
         * (pipe-max-size check here, per-user soft and hard limit
         * checks just below).
         * Decreasing the pipe capacity is always permitted, even
         * if the user is currently over a limit.
         *
         * pipe_max_size can be changed through sysctl at any time, so
         * read it with READ_ONCE() as alloc_pipe_info() does.
         */
        if (nr_slots > pipe->max_usage &&
                        size > READ_ONCE(pipe_max_size) &&
                        !capable(CAP_SYS_RESOURCE))
                return -EPERM;

        /* Charge the new size up front; reverted below on any failure */
        user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);

        if (nr_slots > pipe->max_usage &&
                        (too_many_pipe_buffers_hard(user_bufs) ||
                         too_many_pipe_buffers_soft(user_bufs)) &&
                        pipe_is_unprivileged_user()) {
                ret = -EPERM;
                goto out_revert_acct;
        }

        ret = pipe_resize_ring(pipe, nr_slots);
        if (ret < 0)
                goto out_revert_acct;

        return pipe->max_usage * PAGE_SIZE;

out_revert_acct:
        (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
        return ret;
}

/*
 * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is
 * not enough to verify that this is a pipe.
 *
 * Returns the pipe behind @file, or NULL if @file does not use
 * pipefifo_fops, has no pipe attached, or - when @for_splice is set -
 * the pipe has a watch queue.
 */
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
        struct pipe_inode_info *pipe;

        if (file->f_op != &pipefifo_fops)
                return NULL;

        pipe = file->private_data;
        if (!pipe)
                return NULL;

        if (for_splice && pipe_has_watch_queue(pipe))
                return NULL;

        return pipe;
}

/*
 * fcntl() backend for F_SETPIPE_SZ and F_GETPIPE_SZ.
 *
 * Returns the pipe capacity in bytes for both commands on success, -EBADF
 * if @file is not a pipe, -EINVAL for any other @cmd, or the error from
 * pipe_set_size(). The pipe mutex is held while the command runs.
 */
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
{
        struct pipe_inode_info *pipe = get_pipe_info(file, false);
        long ret;

        if (!pipe)
                return -EBADF;

        mutex_lock(&pipe->mutex);

        if (cmd == F_SETPIPE_SZ)
                ret = pipe_set_size(pipe, arg);
        else if (cmd == F_GETPIPE_SZ)
                ret = pipe->max_usage * PAGE_SIZE;
        else
                ret = -EINVAL;

        mutex_unlock(&pipe->mutex);
        return ret;
}

/* Superblock operations for pipefs; installed by pipefs_init_fs_context(). */
static const struct super_operations pipefs_ops = {
        .destroy_inode = free_inode_nonrcu,
        .statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - it would be a security
 * hassle with no real gain. So we don't need any operations on the root
 * directory. However, we need a non-trivial d_name - pipe: will go nicely
 * and kill the special-casing in procfs.
 *
 * Sets up the pseudo filesystem context with the pipefs super and dentry
 * operations. Returns 0, or -ENOMEM if the context cannot be created.
 */
static int pipefs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx;

        ctx = init_pseudo(fc, PIPEFS_MAGIC);
        if (!ctx)
                return -ENOMEM;

        ctx->dops = &pipefs_dentry_operations;
        ctx->ops = &pipefs_ops;
        return 0;
}

/* The "pipefs" filesystem type; registered and mounted by init_pipe_fs(). */
static struct file_system_type pipe_fs_type = {
        .name                = "pipefs",
        .init_fs_context = pipefs_init_fs_context,
        .kill_sb        = kill_anon_super,
};

#ifdef CONFIG_SYSCTL
/*
 * Conversion callback for the pipe-max-size sysctl.
 *
 * @lvalp: value parsed from / reported to userspace.
 * @valp:  the unsigned int sysctl variable.
 * @write: non-zero when userspace is writing the sysctl.
 * @data:  unused.
 *
 * On write the value is rounded by round_pipe_size() and stored in @valp;
 * on read @valp is widened into @lvalp. Returns 0 on success or -EINVAL if
 * the written value is out of range.
 */
static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
                                        unsigned int *valp,
                                        int write, void *data)
{
        if (write) {
                unsigned int val;

                /*
                 * round_pipe_size() takes an unsigned int. Reject anything
                 * wider here, otherwise the value would be silently
                 * truncated and a too-large request could be accepted as a
                 * small one.
                 */
                if (*lvalp > UINT_MAX)
                        return -EINVAL;

                val = round_pipe_size(*lvalp);
                if (val == 0)
                        return -EINVAL;

                *valp = val;
        } else {
                unsigned int val = *valp;
                *lvalp = (unsigned long) val;
        }

        return 0;
}

/*
 * proc handler for the pipe-max-size sysctl: the generic unsigned int
 * vector handler with do_proc_dopipe_max_size_conv() as the conversion
 * step.
 */
static int proc_dopipe_max_size(struct ctl_table *table, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        return do_proc_douintvec(table, write, buffer, lenp, ppos,
                                 do_proc_dopipe_max_size_conv, NULL);
}

/*
 * Pipe tunables, registered under the "fs" sysctl directory by
 * init_pipe_fs():
 *   pipe-max-size        - backed by pipe_max_size, rounded on write
 *   pipe-user-pages-hard - backed by pipe_user_pages_hard
 *   pipe-user-pages-soft - backed by pipe_user_pages_soft
 */
static struct ctl_table fs_pipe_sysctls[] = {
        {
                .procname        = "pipe-max-size",
                .data                = &pipe_max_size,
                .maxlen                = sizeof(pipe_max_size),
                .mode                = 0644,
                .proc_handler        = proc_dopipe_max_size,
        },
        {
                .procname        = "pipe-user-pages-hard",
                .data                = &pipe_user_pages_hard,
                .maxlen                = sizeof(pipe_user_pages_hard),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
        },
        {
                .procname        = "pipe-user-pages-soft",
                .data                = &pipe_user_pages_soft,
                .maxlen                = sizeof(pipe_user_pages_soft),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
        },
};
#endif

/*
 * Boot-time setup: register pipefs, create its internal mount (stored in
 * pipe_mnt) and, with CONFIG_SYSCTL, register the pipe sysctls.
 *
 * Returns 0 on success or the registration/mount error. The filesystem is
 * unregistered again if the mount fails. The sysctls are registered
 * regardless of the outcome.
 */
static int __init init_pipe_fs(void)
{
        int err;

        err = register_filesystem(&pipe_fs_type);
        if (err == 0) {
                struct vfsmount *mnt = kern_mount(&pipe_fs_type);

                pipe_mnt = mnt;
                if (IS_ERR(mnt)) {
                        err = PTR_ERR(mnt);
                        unregister_filesystem(&pipe_fs_type);
                }
        }

#ifdef CONFIG_SYSCTL
        register_sysctl_init("fs", fs_pipe_sysctls);
#endif
        return err;
}

fs_initcall(init_pipe_fs);



































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _ASM_X86_NOSPEC_BRANCH_H_
#define _ASM_X86_NOSPEC_BRANCH_H_

#include <linux/static_key.h>
#include <linux/objtool.h>
#include <linux/linkage.h>

#include <asm/alternative.h>
#include <asm/cpufeatures.h>
#include <asm/msr-index.h>
#include <asm/unwind_hints.h>
#include <asm/percpu.h>
#include <asm/current.h>

/*
 * Call depth tracking for Intel SKL CPUs to address the RSB underflow
 * issue in software.
 *
 * The tracking does not use a counter. It uses arithmetic shift
 * right on call entry and logical shift left on return.
 *
 * The depth tracking variable is initialized to 0x8000.... when the call
 * depth is zero. The arithmetic shift right sign extends the MSB and
 * saturates after the 12th call. The shift count is 5 for both directions
 * so the tracking covers 12 nested calls.
 *
 *  Call
 *  0: 0x8000000000000000        0x0000000000000000
 *  1: 0xfc00000000000000        0xf000000000000000
 * ...
 * 11: 0xfffffffffffffff8        0xfffffffffffffc00
 * 12: 0xffffffffffffffff        0xffffffffffffffe0
 *
 * After a return buffer fill the depth is credited 12 calls before the
 * next stuffing has to take place.
 *
 * There is an inaccuracy for situations like this:
 *
 *  10 calls
 *   5 returns
 *   3 calls
 *   4 returns
 *   3 calls
 *   ....
 *
 * The shift count might cause this to be off by one in either direction,
 * but there is still a cushion vs. the RSB depth. The algorithm does not
 * claim to be perfect and it can be speculated around by the CPU, but it
 * is considered that it obfuscates the problem enough to make exploitation
 * extremely difficult.
 */
/* Shift count applied to the depth tracking variable per call/return. */
#define RET_DEPTH_SHIFT                        5
/*
 * NOTE(review): presumably the iteration count used when the return
 * buffer is stuffed; the user of this constant is not in this header -
 * confirm against the return thunk implementation.
 */
#define RSB_RET_STUFF_LOOPS                16
/* Tracking value for call depth zero: only the MSB is set. */
#define RET_DEPTH_INIT                        0x8000000000000000ULL
/*
 * RET_DEPTH_INIT after one arithmetic shift right by RET_DEPTH_SHIFT,
 * i.e. the tracking value for call depth one.
 */
#define RET_DEPTH_INIT_FROM_CALL        0xfc00000000000000ULL
/* Saturated (all bits set) tracking value. */
#define RET_DEPTH_CREDIT                0xffffffffffffffffULL

/*
 * Debug accounting helpers for the call thunks.
 *
 * With CONFIG_CALL_THUNKS_DEBUG each macro expands to a single incq on
 * the matching per-CPU counter (calls, returns, stuffs, context
 * switches). Without it they expand to nothing, so they can be dropped
 * into assembly sequences unconditionally.
 */
#ifdef CONFIG_CALL_THUNKS_DEBUG
# define CALL_THUNKS_DEBUG_INC_CALLS                                \
        incq        PER_CPU_VAR(__x86_call_count);
# define CALL_THUNKS_DEBUG_INC_RETS                                \
        incq        PER_CPU_VAR(__x86_ret_count);
# define CALL_THUNKS_DEBUG_INC_STUFFS                                \
        incq        PER_CPU_VAR(__x86_stuffs_count);
# define CALL_THUNKS_DEBUG_INC_CTXSW                                \
        incq        PER_CPU_VAR(__x86_ctxsw_count);
#else
# define CALL_THUNKS_DEBUG_INC_CALLS
# define CALL_THUNKS_DEBUG_INC_RETS
# define CALL_THUNKS_DEBUG_INC_STUFFS
# define CALL_THUNKS_DEBUG_INC_CTXSW
#endif

/*
 * Assembly fragments that update the per-CPU call depth tracking
 * variable at pcpu_hot + X86_call_depth. They are only real code when
 * call depth tracking is configured and the asm-offsets are not being
 * generated; otherwise every macro expands to nothing.
 */
#if defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS)

#include <asm/asm-offsets.h>

/* Store -1 (all bits set, the same value as RET_DEPTH_CREDIT). */
#define CREDIT_CALL_DEPTH                                        \
        movq        $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);

/*
 * Store a value with only bit 63 set (the same value as RET_DEPTH_INIT):
 * %eax is zeroed, then bit 63 of %rax is set. Clobbers %rax.
 */
#define RESET_CALL_DEPTH                                        \
        xor        %eax, %eax;                                        \
        bts        $63, %rax;                                        \
        movq        %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth);

/*
 * Store 0xfc in the top byte and zero below (the same value as
 * RET_DEPTH_INIT_FROM_CALL): 0xfc is loaded into %al and shifted left
 * by 56, which pushes every other bit of %rax out. Clobbers %rax. Also
 * bumps the debug call counter when that is enabled.
 */
#define RESET_CALL_DEPTH_FROM_CALL                                \
        movb        $0xfc, %al;                                        \
        shl        $56, %rax;                                        \
        movq        %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth);        \
        CALL_THUNKS_DEBUG_INC_CALLS

/*
 * Account one call: arithmetic shift right of the tracking variable by
 * 5 (the value of RET_DEPTH_SHIFT), plus the optional debug call counter.
 */
#define INCREMENT_CALL_DEPTH                                        \
        sarq        $5, PER_CPU_VAR(pcpu_hot + X86_call_depth);        \
        CALL_THUNKS_DEBUG_INC_CALLS

#else
#define CREDIT_CALL_DEPTH
#define RESET_CALL_DEPTH
#define RESET_CALL_DEPTH_FROM_CALL
#define INCREMENT_CALL_DEPTH
#endif

/*
 * Fill the CPU return stack buffer.
 *
 * Each entry in the RSB, if used for a speculative 'ret', contains an
 * infinite 'pause; lfence; jmp' loop to capture speculative execution.
 *
 * This is required in various cases for retpoline and IBRS-based
 * mitigations for the Spectre variant 2 vulnerability. Sometimes to
 * eliminate potentially bogus entries from the RSB, and sometimes
 * purely to ensure that it doesn't get empty, which on some CPUs would
 * allow predictions from other (unwanted!) sources to be used.
 *
 * We define a CPP macro such that it can be used from both .S files and
 * inline assembly. It's possible to do a .macro and then include that
 * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
 */

/* Size in bytes of one retpoline thunk (element size of retpoline_thunk_t). */
#define RETPOLINE_THUNK_SIZE        32
#define RSB_CLEAR_LOOPS                32        /* To forcibly overwrite all entries */

/*
 * Common helper for __FILL_RETURN_BUFFER and __FILL_ONE_RETURN.
 *
 * Emits a CALL to the local label 772 which sits right behind an int3,
 * so the return address pushed by the CALL points at that int3. The
 * pushed return address is left on the stack; the users of this helper
 * adjust the stack pointer afterwards.
 */
#define __FILL_RETURN_SLOT                        \
        ANNOTATE_INTRA_FUNCTION_CALL;                \
        call        772f;                                \
        int3;                                        \
772:

/*
 * Stuff the entire RSB.
 *
 * Google experimented with loop-unrolling and this turned out to be
 * the optimal version - two calls, each with their own speculation
 * trap should their return address end up getting used, in a loop.
 *
 * @reg: scratch register used as the loop counter on 64-bit (clobbered)
 * @nr:  number of return slots to fill
 *
 * The 64-bit variant runs nr/2 iterations of two slots each and drops
 * the two pushed return addresses from the stack in every iteration.
 * After the loop it issues an LFENCE, credits the call depth tracking
 * and bumps the context switch debug counter (the latter two expand to
 * nothing when the respective config options are off).
 */
#ifdef CONFIG_X86_64
#define __FILL_RETURN_BUFFER(reg, nr)                        \
        mov        $(nr/2), reg;                                \
771:                                                        \
        __FILL_RETURN_SLOT                                \
        __FILL_RETURN_SLOT                                \
        add        $(BITS_PER_LONG/8) * 2, %_ASM_SP;        \
        dec        reg;                                        \
        jnz        771b;                                        \
        /* barrier for jnz misprediction */                \
        lfence;                                                \
        CREDIT_CALL_DEPTH                                \
        CALL_THUNKS_DEBUG_INC_CTXSW
#else
/*
 * i386 doesn't unconditionally have LFENCE, as such it can't
 * do a loop.
 *
 * Instead the slot is repeated nr times via .rept and the stack pointer
 * is adjusted once for all nr pushed return addresses. @reg is unused.
 */
#define __FILL_RETURN_BUFFER(reg, nr)                        \
        .rept nr;                                        \
        __FILL_RETURN_SLOT;                                \
        .endr;                                                \
        add        $(BITS_PER_LONG/8) * nr, %_ASM_SP;
#endif

/*
 * Stuff a single RSB slot.
 *
 * To mitigate Post-Barrier RSB speculation, one CALL instruction must be
 * forced to retire before letting a RET instruction execute.
 *
 * On PBRSB-vulnerable CPUs, it is not safe for a RET to be executed
 * before this point.
 *
 * Expands to one __FILL_RETURN_SLOT, removes the single pushed return
 * address from the stack and finishes with an LFENCE.
 */
#define __FILL_ONE_RETURN                                \
        __FILL_RETURN_SLOT                                \
        add        $(BITS_PER_LONG/8), %_ASM_SP;                \
        lfence;

#ifdef __ASSEMBLY__

/*
 * This should be used immediately before an indirect jump/call. It tells
 * objtool the subsequent indirect jump/call is vouched safe for retpoline
 * builds.
 *
 * Implementation: a macro-local label is placed at the current location
 * and its address is recorded as a .long in the .discard.retpoline_safe
 * section. No instruction bytes are emitted into the current section.
 */
.macro ANNOTATE_RETPOLINE_SAFE
.Lhere_\@:
        .pushsection .discard.retpoline_safe
        .long .Lhere_\@
        .popsection
.endm

/*
 * (ab)use RETPOLINE_SAFE on RET to annotate away 'bare' RET instructions
 * vs RETBleed validation.
 *
 * This is a plain alias, it expands to ANNOTATE_RETPOLINE_SAFE.
 */
#define ANNOTATE_UNRET_SAFE ANNOTATE_RETPOLINE_SAFE

/*
 * Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should
 * eventually turn into its own annotation.
 *
 * Emits the annotated NOP only when CONFIG_NOINSTR_VALIDATION is set
 * together with CONFIG_MITIGATION_UNRET_ENTRY or CONFIG_MITIGATION_SRSO;
 * in every other configuration the macro is empty.
 */
.macro VALIDATE_UNRET_END
#if defined(CONFIG_NOINSTR_VALIDATION) && \
        (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO))
        ANNOTATE_RETPOLINE_SAFE
        nop
#endif
.endm

/*
 * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call
 * to the retpoline thunk with a CS prefix when the register requires
 * a REX prefix byte to encode. Also see apply_retpolines().
 *
 * @reg: register name without the leading '%'
 *
 * The macro compares @reg textually against r8..r15 and emits the byte
 * 0x2e once on a match. For any other register nothing is emitted.
 */
.macro __CS_PREFIX reg:req
        .irp rs,r8,r9,r10,r11,r12,r13,r14,r15
        .ifc \reg,\rs
        .byte 0x2e
        .endif
        .endr
.endm

/*
 * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
 * indirect jmp/call which may be susceptible to the Spectre variant 2
 * attack.
 *
 * NOTE: these do not take kCFI into account and are thus not comparable to C
 * indirect calls, take care when using. The target of these should be an ENDBR
 * instruction irrespective of kCFI.
 *
 * @reg: register holding the branch target, named without the leading '%'
 *
 * With CONFIG_MITIGATION_RETPOLINE the jump goes to the per-register
 * thunk __x86_indirect_thunk_<reg>, preceded by the optional CS prefix.
 * Without it a plain indirect jump followed by an int3 is emitted.
 */
.macro JMP_NOSPEC reg:req
#ifdef CONFIG_MITIGATION_RETPOLINE
        __CS_PREFIX \reg
        jmp        __x86_indirect_thunk_\reg
#else
        jmp        *%\reg
        int3
#endif
.endm

/*
 * Call counterpart of JMP_NOSPEC.
 *
 * @reg: register holding the call target, named without the leading '%'
 *
 * With CONFIG_MITIGATION_RETPOLINE the call goes to the per-register
 * thunk __x86_indirect_thunk_<reg>, preceded by the optional CS prefix.
 * Without it a plain indirect call is emitted.
 */
.macro CALL_NOSPEC reg:req
#ifdef CONFIG_MITIGATION_RETPOLINE
        __CS_PREFIX \reg
        call        __x86_indirect_thunk_\reg
#else
        call        *%\reg
#endif
.endm

/*
 * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
 * monstrosity above, manually.
 *
 * @reg:  scratch register handed to __FILL_RETURN_BUFFER
 * @nr:   number of return slots handed to __FILL_RETURN_BUFFER
 * @ftr:  feature selecting the full return buffer fill
 * @ftr2: feature selecting the single slot fill (two NOPs followed by
 *        __FILL_ONE_RETURN); defaults to ALT_NOT(X86_FEATURE_ALWAYS)
 *
 * The unpatched instruction is a jump to the local skip label placed
 * right behind the alternative, i.e. nothing is filled.
 */
.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2=ALT_NOT(X86_FEATURE_ALWAYS)
        ALTERNATIVE_2 "jmp .Lskip_rsb_\@", \
                __stringify(__FILL_RETURN_BUFFER(\reg,\nr)), \ftr, \
                __stringify(nop;nop;__FILL_ONE_RETURN), \ftr2

.Lskip_rsb_\@:
.endm

/*
 * The CALL to srso_alias_untrain_ret() must be patched in directly at
 * the spot where untraining must be done, ie., srso_alias_untrain_ret()
 * must be the target of a CALL instruction instead of indirectly
 * jumping to a wrapper which then calls it. Therefore, this macro is
 * called outside of __UNTRAIN_RET below, for the time being, before the
 * kernel can support nested alternatives with arbitrary nesting.
 *
 * The unpatched alternative is empty. X86_FEATURE_UNRET selects a call
 * to entry_untrain_ret, X86_FEATURE_SRSO_ALIAS a call to
 * srso_alias_untrain_ret. The macro is empty unless
 * CONFIG_MITIGATION_UNRET_ENTRY or CONFIG_MITIGATION_SRSO is set.
 */
.macro CALL_UNTRAIN_RET
#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)
        ALTERNATIVE_2 "", "call entry_untrain_ret", X86_FEATURE_UNRET, \
                          "call srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS
#endif
.endm

/*
 * Mitigate RETBleed for AMD/Hygon Zen uarch. Requires KERNEL CR3 because the
 * return thunk isn't mapped into the userspace tables (then again, AMD
 * typically has NO_MELTDOWN).
 *
 * While retbleed_untrain_ret() doesn't clobber anything but requires stack,
 * entry_ibpb() will clobber AX, CX, DX.
 *
 * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point
 * where we have a stack but before any RET instruction.
 *
 * @ibpb_feature:     feature bit which selects the call to entry_ibpb
 * @call_depth_insns: instruction sequence patched in for
 *                    X86_FEATURE_CALL_DEPTH
 *
 * Expands to VALIDATE_UNRET_END, CALL_UNTRAIN_RET and the alternative
 * described by the two arguments. The macro is empty unless
 * CONFIG_MITIGATION_RETHUNK or CONFIG_MITIGATION_IBPB_ENTRY is set.
 */
.macro __UNTRAIN_RET ibpb_feature, call_depth_insns
#if defined(CONFIG_MITIGATION_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY)
        VALIDATE_UNRET_END
        CALL_UNTRAIN_RET
        ALTERNATIVE_2 "",                                                \
                      "call entry_ibpb", \ibpb_feature,                        \
                     __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH
#endif
.endm

/*
 * Ready made __UNTRAIN_RET invocations. They differ in the feature bit
 * which selects the IBPB call and in the call depth sequence:
 *
 *   UNTRAIN_RET            X86_FEATURE_ENTRY_IBPB,     RESET_CALL_DEPTH
 *   UNTRAIN_RET_VM         X86_FEATURE_IBPB_ON_VMEXIT, RESET_CALL_DEPTH
 *   UNTRAIN_RET_FROM_CALL  X86_FEATURE_ENTRY_IBPB,     RESET_CALL_DEPTH_FROM_CALL
 */
#define UNTRAIN_RET \
        __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH)

#define UNTRAIN_RET_VM \
        __UNTRAIN_RET X86_FEATURE_IBPB_ON_VMEXIT, __stringify(RESET_CALL_DEPTH)

#define UNTRAIN_RET_FROM_CALL \
        __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH_FROM_CALL)


/*
 * Account one call in the call depth tracking variable.
 *
 * The unpatched alternative is empty; INCREMENT_CALL_DEPTH is selected
 * by X86_FEATURE_CALL_DEPTH. Without
 * CONFIG_MITIGATION_CALL_DEPTH_TRACKING the macro is empty.
 */
.macro CALL_DEPTH_ACCOUNT
#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
        ALTERNATIVE "",                                                        \
                    __stringify(INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
#endif
.endm

/*
 * Macro to execute the VERW instruction, which mitigates transient data
 * sampling attacks such as MDS. On affected systems a microcode update
 * overloaded the VERW instruction to also clear the CPU buffers. VERW
 * clobbers EFLAGS.ZF.
 *
 * Note: Only the memory operand variant of VERW clears the CPU buffers.
 *
 * The unpatched alternative is empty; the VERW on mds_verw_sel is
 * selected by X86_FEATURE_CLEAR_CPU_BUF.
 */
.macro CLEAR_CPU_BUFFERS
        ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF
.endm

/*
 * Branch history clearing hooks.
 *
 * On 64-bit both macros patch in a call to clear_bhb_loop; they differ
 * only in the feature bit which selects it (X86_FEATURE_CLEAR_BHB_LOOP
 * vs. X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT). The unpatched alternative
 * is empty. On 32-bit both are defined empty.
 */
#ifdef CONFIG_X86_64
.macro CLEAR_BRANCH_HISTORY
        ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP
.endm

.macro CLEAR_BRANCH_HISTORY_VMEXIT
        ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT
.endm
#else
#define CLEAR_BRANCH_HISTORY
#define CLEAR_BRANCH_HISTORY_VMEXIT
#endif

#else /* __ASSEMBLY__ */

/*
 * Inline asm string variant of the retpoline-safe annotation: places
 * the numeric local label 999 at the current location and records its
 * address as a .long in the .discard.retpoline_safe section. Meant to
 * be pasted in front of the indirect branch inside an asm() template.
 */
#define ANNOTATE_RETPOLINE_SAFE                                        \
        "999:\n\t"                                                \
        ".pushsection .discard.retpoline_safe\n\t"                \
        ".long 999b\n\t"                                        \
        ".popsection\n\t"

/* One retpoline thunk: an opaque blob of RETPOLINE_THUNK_SIZE bytes. */
typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
extern retpoline_thunk_t __x86_indirect_thunk_array[];
extern retpoline_thunk_t __x86_indirect_call_thunk_array[];
extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];

/*
 * Return thunks. When the respective mitigation is configured out, an
 * empty inline stub is provided so that references still compile.
 */
#ifdef CONFIG_MITIGATION_RETHUNK
extern void __x86_return_thunk(void);
#else
static inline void __x86_return_thunk(void) {}
#endif

#ifdef CONFIG_MITIGATION_UNRET_ENTRY
extern void retbleed_return_thunk(void);
#else
static inline void retbleed_return_thunk(void) {}
#endif

extern void srso_alias_untrain_ret(void);

#ifdef CONFIG_MITIGATION_SRSO
extern void srso_return_thunk(void);
extern void srso_alias_return_thunk(void);
#else
static inline void srso_return_thunk(void) {}
static inline void srso_alias_return_thunk(void) {}
#endif

/*
 * NOTE(review): the three declarations below repeat the ones made in the
 * conditional blocks above and look redundant - confirm before removing.
 */
extern void retbleed_return_thunk(void);
extern void srso_return_thunk(void);
extern void srso_alias_return_thunk(void);

extern void entry_untrain_ret(void);
extern void entry_ibpb(void);

#ifdef CONFIG_X86_64
extern void clear_bhb_loop(void);
#endif

/* Function pointer to a return thunk; assigned outside this header. */
extern void (*x86_return_thunk)(void);

extern void __warn_thunk(void);

/*
 * C side of call depth tracking.
 *
 * With CONFIG_MITIGATION_CALL_DEPTH_TRACKING, CALL_DEPTH_ACCOUNT is an
 * alternative (usable inside an asm() template) whose unpatched form is
 * empty and which selects INCREMENT_CALL_DEPTH for
 * X86_FEATURE_CALL_DEPTH. The per-CPU debug counters are only declared
 * with CONFIG_CALL_THUNKS_DEBUG.
 *
 * Without the config option CALL_DEPTH_ACCOUNT is the empty string and
 * call_depth_return_thunk() is an empty inline stub.
 */
#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
extern void call_depth_return_thunk(void);

#define CALL_DEPTH_ACCOUNT                                        \
        ALTERNATIVE("",                                                \
                    __stringify(INCREMENT_CALL_DEPTH),                \
                    X86_FEATURE_CALL_DEPTH)

#ifdef CONFIG_CALL_THUNKS_DEBUG
DECLARE_PER_CPU(u64, __x86_call_count);
DECLARE_PER_CPU(u64, __x86_ret_count);
DECLARE_PER_CPU(u64, __x86_stuffs_count);
DECLARE_PER_CPU(u64, __x86_ctxsw_count);
#endif
#else /* !CONFIG_MITIGATION_CALL_DEPTH_TRACKING */

static inline void call_depth_return_thunk(void) {}
#define CALL_DEPTH_ACCOUNT ""

#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */

#ifdef CONFIG_MITIGATION_RETPOLINE

/*
 * Declare the per-register thunks. <asm/GEN-for-each-reg.h> is included
 * three times, each time with a different GEN() definition, which yields
 * one extern declaration per GEN(reg) invocation in that header for the
 * plain, the call and the jump thunk flavour respectively.
 */
#define GEN(reg) \
        extern retpoline_thunk_t __x86_indirect_thunk_ ## reg;
#include <asm/GEN-for-each-reg.h>
#undef GEN

#define GEN(reg)                                                \
        extern retpoline_thunk_t __x86_indirect_call_thunk_ ## reg;
#include <asm/GEN-for-each-reg.h>
#undef GEN

#define GEN(reg)                                                \
        extern retpoline_thunk_t __x86_indirect_jump_thunk_ ## reg;
#include <asm/GEN-for-each-reg.h>
#undef GEN

#ifdef CONFIG_X86_64

/*
 * Inline asm uses the %V modifier which is only in newer GCC
 * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined.
 *
 * CALL_NOSPEC is an asm template for an indirect call through the
 * operand named thunk_target:
 *  - unpatched: annotated plain indirect call
 *  - X86_FEATURE_RETPOLINE: call to the per-register indirect thunk
 *  - X86_FEATURE_RETPOLINE_LFENCE: LFENCE followed by an annotated
 *    plain indirect call
 *
 * THUNK_TARGET(addr) supplies the matching input operand; on 64-bit the
 * target has to live in a register ("r" constraint).
 */
# define CALL_NOSPEC                                                \
        ALTERNATIVE_2(                                                \
        ANNOTATE_RETPOLINE_SAFE                                        \
        "call *%[thunk_target]\n",                                \
        "call __x86_indirect_thunk_%V[thunk_target]\n",                \
        X86_FEATURE_RETPOLINE,                                        \
        "lfence;\n"                                                \
        ANNOTATE_RETPOLINE_SAFE                                        \
        "call *%[thunk_target]\n",                                \
        X86_FEATURE_RETPOLINE_LFENCE)

# define THUNK_TARGET(addr) [thunk_target] "r" (addr)

#else /* CONFIG_X86_32 */
/*
 * For i386 we use the original ret-equivalent retpoline, because
 * otherwise we'll run out of registers. We don't care about CET
 * here, anyway.
 *
 * The X86_FEATURE_RETPOLINE sequence is open coded: the call at 904
 * pushes the real return address and enters 901, whose call to 903
 * leaves a return address pointing at the pause/lfence loop at 902.
 * At 903 that return address is dropped from the stack, the real target
 * is pushed in its place and the RET branches to it.
 *
 * The unpatched and the X86_FEATURE_RETPOLINE_LFENCE variants are the
 * same as on 64-bit. THUNK_TARGET() accepts a register or a memory
 * operand ("rm" constraint).
 */
# define CALL_NOSPEC                                                \
        ALTERNATIVE_2(                                                \
        ANNOTATE_RETPOLINE_SAFE                                        \
        "call *%[thunk_target]\n",                                \
        "       jmp    904f;\n"                                        \
        "       .align 16\n"                                        \
        "901:        call   903f;\n"                                        \
        "902:        pause;\n"                                        \
        "            lfence;\n"                                        \
        "       jmp    902b;\n"                                        \
        "       .align 16\n"                                        \
        "903:        lea    4(%%esp), %%esp;\n"                        \
        "       pushl  %[thunk_target];\n"                        \
        "       ret;\n"                                                \
        "       .align 16\n"                                        \
        "904:        call   901b;\n",                                \
        X86_FEATURE_RETPOLINE,                                        \
        "lfence;\n"                                                \
        ANNOTATE_RETPOLINE_SAFE                                        \
        "call *%[thunk_target]\n",                                \
        X86_FEATURE_RETPOLINE_LFENCE)

# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
#endif
#else /* No retpoline for C / inline asm */
/*
 * Fallback: a plain indirect call through the thunk_target operand,
 * which may be a register or a memory operand.
 */
# define CALL_NOSPEC "call *%[thunk_target]\n"
# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
#endif

/*
 * The Spectre V2 mitigation variants
 *
 * Enumerators are numbered implicitly in declaration order, so
 * SPECTRE_V2_NONE is 0.
 */
enum spectre_v2_mitigation {
        SPECTRE_V2_NONE,
        SPECTRE_V2_RETPOLINE,
        SPECTRE_V2_LFENCE,
        SPECTRE_V2_EIBRS,
        SPECTRE_V2_EIBRS_RETPOLINE,
        SPECTRE_V2_EIBRS_LFENCE,
        SPECTRE_V2_IBRS,
};

/*
 * The indirect branch speculation control variants
 *
 * Enumerators are numbered implicitly in declaration order, so
 * SPECTRE_V2_USER_NONE is 0.
 */
enum spectre_v2_user_mitigation {
        SPECTRE_V2_USER_NONE,
        SPECTRE_V2_USER_STRICT,
        SPECTRE_V2_USER_STRICT_PREFERRED,
        SPECTRE_V2_USER_PRCTL,
        SPECTRE_V2_USER_SECCOMP,
};

/*
 * The Speculative Store Bypass disable variants
 *
 * Enumerators are numbered implicitly in declaration order, so
 * SPEC_STORE_BYPASS_NONE is 0.
 */
enum ssb_mitigation {
        SPEC_STORE_BYPASS_NONE,
        SPEC_STORE_BYPASS_DISABLE,
        SPEC_STORE_BYPASS_PRCTL,
        SPEC_STORE_BYPASS_SECCOMP,
};

/*
 * alternative_msr_write - WRMSR which is only patched in for a feature
 * @msr:     MSR index, passed in ECX
 * @val:     64-bit value; the low half goes to EAX, the high half to EDX
 * @feature: feature bit selecting the WRMSR; passed as an immediate
 *           operand, so it has to be a compile time constant
 *
 * The unpatched alternative is empty, i.e. nothing is written unless
 * the alternative for @feature is selected. The asm carries a "memory"
 * clobber.
 */
static __always_inline
void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
{
        asm volatile(ALTERNATIVE("", "wrmsr", %c[feature])
                : : "c" (msr),
                    "a" ((u32)val),
                    "d" ((u32)(val >> 32)),
                    [feature] "i" (feature)
                : "memory");
}

/* Value written to MSR_IA32_PRED_CMD by the barrier below. */
extern u64 x86_pred_cmd;

/*
 * Write x86_pred_cmd to MSR_IA32_PRED_CMD. The write is only patched in
 * for X86_FEATURE_USE_IBPB; otherwise this is a no-op.
 */
static inline void indirect_branch_prediction_barrier(void)
{
        alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_USE_IBPB);
}

/* The Intel SPEC CTRL MSR base value cache */
extern u64 x86_spec_ctrl_base;
/* Per-CPU SPEC_CTRL value; NOTE(review): presumably what spec_ctrl_current() returns - confirm. */
DECLARE_PER_CPU(u64, x86_spec_ctrl_current);
extern void update_spec_ctrl_cond(u64 val);
extern u64 spec_ctrl_current(void);

/*
 * With retpoline, we must use IBRS to restrict branch prediction
 * before calling into firmware.
 *
 * (Implemented as CPP macros due to header hell.)
 *
 * _start() disables preemption, writes spec_ctrl_current() with
 * SPEC_CTRL_IBRS or-ed in to MSR_IA32_SPEC_CTRL (patched in for
 * X86_FEATURE_USE_IBRS_FW) and then writes PRED_CMD_IBPB to
 * MSR_IA32_PRED_CMD (patched in for X86_FEATURE_USE_IBPB_FW).
 *
 * _end() writes spec_ctrl_current() back to MSR_IA32_SPEC_CTRL (patched
 * in for X86_FEATURE_USE_IBRS_FW) and re-enables preemption.
 *
 * The two have to be used as a pair around the firmware call, as
 * _start() leaves preemption disabled.
 */
#define firmware_restrict_branch_speculation_start()                        \
do {                                                                        \
        preempt_disable();                                                \
        alternative_msr_write(MSR_IA32_SPEC_CTRL,                        \
                              spec_ctrl_current() | SPEC_CTRL_IBRS,        \
                              X86_FEATURE_USE_IBRS_FW);                        \
        alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB,                \
                              X86_FEATURE_USE_IBPB_FW);                        \
} while (0)

#define firmware_restrict_branch_speculation_end()                        \
do {                                                                        \
        alternative_msr_write(MSR_IA32_SPEC_CTRL,                        \
                              spec_ctrl_current(),                        \
                              X86_FEATURE_USE_IBRS_FW);                        \
        preempt_enable();                                                \
} while (0)

/*
 * Static keys (default false) declared here and defined elsewhere.
 * Only mds_idle_clear is consumed in this header, by
 * mds_idle_clear_cpu_buffers().
 */
DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);

DECLARE_STATIC_KEY_FALSE(mds_idle_clear);

DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);

DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear);

/* 16-bit memory operand used by the VERW in the CLEAR_CPU_BUFFERS asm macro. */
extern u16 mds_verw_sel;

#include <asm/segment.h>

/**
 * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
 *
 * This uses the otherwise unused and obsolete VERW instruction in
 * combination with microcode which triggers a CPU buffer flush when the
 * instruction is executed.
 *
 * The VERW is issued unconditionally on a function-local static const
 * u16 holding __KERNEL_DS.
 */
static __always_inline void mds_clear_cpu_buffers(void)
{
        static const u16 ds = __KERNEL_DS;

        /*
         * Has to be the memory-operand variant because only that
         * guarantees the CPU buffer flush functionality according to
         * documentation. The register-operand variant does not.
         * Works with any segment selector, but a valid writable
         * data segment is the fastest variant.
         *
         * "cc" clobber is required because VERW modifies ZF.
         */
        asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
}

/**
 * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability
 *
 * Clear CPU buffers if the corresponding static key is enabled
 *
 * The static key is mds_idle_clear and the branch is annotated as
 * likely; when the key is off nothing is executed.
 */
static __always_inline void mds_idle_clear_cpu_buffers(void)
{
        if (static_branch_likely(&mds_idle_clear))
                mds_clear_cpu_buffers();
}

#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */













































































































































































































































































































































































































































































































































































































































    4 



    5 

    4 

















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
/* netfilter.c: look after the filters for various protocols.
 * Heavily influenced by the old firewall.c by David Bonn and Alan Cox.
 *
 * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
 * way.
 *
 * This code is GPL.
 */
#include <linux/kernel.h>
#include <linux/netfilter.h>
#include <net/protocol.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/wait.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/if.h>
#include <linux/netdevice.h>
#include <linux/netfilter_ipv6.h>
#include <linux/inetdevice.h>
#include <linux/proc_fs.h>
#include <linux/mutex.h>
#include <linux/mm.h>
#include <linux/rcupdate.h>
#include <net/net_namespace.h>
#include <net/netfilter/nf_queue.h>
#include <net/sock.h>

#include "nf_internals.h"

/* RCU-annotated pointer to the IPv6 ops table; assigned outside this chunk. */
const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
EXPORT_SYMBOL_GPL(nf_ipv6_ops);

/* Per-CPU boolean flag, exported for use by other modules. */
DEFINE_PER_CPU(bool, nf_skb_duplicated);
EXPORT_SYMBOL_GPL(nf_skb_duplicated);

#ifdef CONFIG_JUMP_LABEL
/* One static key per (protocol family, hook number) pair. */
struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
EXPORT_SYMBOL(nf_hooks_needed);
#endif

/* Mutex referenced by nf_entry_dereference() below. */
static DEFINE_MUTEX(nf_hook_mutex);

/* max hooks per family/hooknum */
#define MAX_HOOK_COUNT                1024

/*
 * Dereference an RCU protected hook blob pointer on the update side;
 * lockdep verifies that nf_hook_mutex is held.
 */
#define nf_entry_dereference(e) \
        rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex))

/*
 * Allocate a zeroed hook blob with room for @num hooks.
 *
 * The allocation covers the blob header, @num hook entries, @num
 * nf_hook_ops pointers and one nf_hook_entries_rcu_head. On success
 * num_hook_entries is set to @num.
 *
 * Returns NULL when @num is zero or when the allocation fails.
 */
static struct nf_hook_entries *allocate_hook_entries_size(u16 num)
{
        struct nf_hook_entries *blob;
        size_t bytes;

        if (!num)
                return NULL;

        bytes = sizeof(*blob);
        bytes += num * sizeof(struct nf_hook_entry);
        bytes += num * sizeof(struct nf_hook_ops *);
        bytes += sizeof(struct nf_hook_entries_rcu_head);

        blob = kvzalloc(bytes, GFP_KERNEL_ACCOUNT);
        if (!blob)
                return NULL;

        blob->num_hook_entries = num;
        return blob;
}

/*
 * RCU callback: @h is embedded in a struct nf_hook_entries_rcu_head
 * whose ->allocation member points at the memory to release.
 */
static void __nf_hook_entries_free(struct rcu_head *h)
{
        struct nf_hook_entries_rcu_head *trailer =
                container_of(h, struct nf_hook_entries_rcu_head, head);

        kvfree(trailer->allocation);
}

static void nf_hook_entries_free(struct nf_hook_entries *e)
{
        struct nf_hook_entries_rcu_head *head;
        struct nf_hook_ops **ops;
        unsigned int num;

        if (!e)
                return;

        num = e->num_hook_entries;
        ops = nf_hook_entries_get_hook_ops(e);
        head = (void *)&ops[num];
        head->allocation = e;
        call_rcu(&head->head, __nf_hook_entries_free);
}

/*
 * Hook function which ignores all of its arguments and always returns
 * NF_ACCEPT.
 */
static unsigned int accept_all(void *priv,
                               struct sk_buff *skb,
                               const struct nf_hook_state *state)
{
        return NF_ACCEPT; /* ACCEPT makes nf_hook_slow call next hook */
}

/*
 * Placeholder ops: the functions in this file identify removed hooks by
 * comparing the ops pointer against &dummy_ops. The hook is accept_all
 * and the priority is INT_MIN.
 */
static const struct nf_hook_ops dummy_ops = {
        .hook = accept_all,
        .priority = INT_MIN,
};

/*
 * Build a new hook blob which holds all live hooks of @old plus @reg.
 *
 * @old: current blob, may be NULL
 * @reg: hook to add
 *
 * Dummy entries of @old are dropped. @reg is placed in front of the
 * first live hook whose priority is not lower than its own, or at the
 * end when there is none. @old is left untouched.
 *
 * Returns the new blob or an ERR_PTR():
 *   -EBUSY  @reg is a BPF hook and an entry of @old has the same priority
 *   -E2BIG  the result would exceed MAX_HOOK_COUNT hooks
 *   -ENOMEM the allocation failed
 */
static struct nf_hook_entries *
nf_hook_entries_grow(const struct nf_hook_entries *old,
                     const struct nf_hook_ops *reg)
{
        unsigned int src, dst = 0, needed = 1, old_count = 0;
        struct nf_hook_ops **old_ops = NULL;
        struct nf_hook_ops **ops_out;
        struct nf_hook_entries *grown;
        bool placed = false;

        if (old) {
                old_count = old->num_hook_entries;
                old_ops = nf_hook_entries_get_hook_ops(old);
        }

        for (src = 0; src < old_count; src++) {
                if (old_ops[src] != &dummy_ops)
                        needed++;

                /* A BPF hook must not share its priority with an
                 * existing entry at attach time.
                 *
                 * This avoids ordering issues between two different
                 * bpf programs. It does not stop a normal hook from
                 * attaching at the priority of a bpf one (defrag,
                 * conntrack, iptables etc. must stay attachable).
                 */
                if (reg->hook_ops_type == NF_HOOK_OP_BPF &&
                    reg->priority == old_ops[src]->priority)
                        return ERR_PTR(-EBUSY);
        }

        if (needed > MAX_HOOK_COUNT)
                return ERR_PTR(-E2BIG);

        grown = allocate_hook_entries_size(needed);
        if (!grown)
                return ERR_PTR(-ENOMEM);

        ops_out = nf_hook_entries_get_hook_ops(grown);

        for (src = 0; src < old_count; ) {
                if (old_ops[src] == &dummy_ops) {
                        src++;
                        continue;
                }

                if (!placed && reg->priority <= old_ops[src]->priority) {
                        /* @reg goes first, the old hook is retried */
                        ops_out[dst] = (void *)reg;
                        grown->hooks[dst].hook = reg->hook;
                        grown->hooks[dst].priv = reg->priv;
                        placed = true;
                } else {
                        ops_out[dst] = (void *)old_ops[src];
                        grown->hooks[dst] = old->hooks[src];
                        src++;
                }
                dst++;
        }

        if (!placed) {
                ops_out[dst] = (void *)reg;
                grown->hooks[dst].hook = reg->hook;
                grown->hooks[dst].priv = reg->priv;
        }

        return grown;
}

/*
 * Debug check (CONFIG_DEBUG_MISC only): warn whenever a live hook in
 * @hooks has a lower priority than a live hook in front of it. Dummy
 * entries are ignored. Without the config option this does nothing.
 */
static void hooks_validate(const struct nf_hook_entries *hooks)
{
#ifdef CONFIG_DEBUG_MISC
        struct nf_hook_ops **ops = nf_hook_entries_get_hook_ops(hooks);
        int highest = INT_MIN;
        size_t idx;

        for (idx = 0; idx < hooks->num_hook_entries; idx++) {
                const struct nf_hook_ops *cur = ops[idx];

                if (cur != &dummy_ops) {
                        WARN_ON(cur->priority < highest);

                        if (cur->priority > highest)
                                highest = cur->priority;
                }
        }
#endif
}

/*
 * Add @reg to the hook blob stored at @pp.
 *
 * A grown copy of the current blob is built, validated and published
 * with rcu_assign_pointer(); the previous blob is then handed to
 * nf_hook_entries_free().
 *
 * Returns 0 on success or the negative errno reported by
 * nf_hook_entries_grow(), in which case @pp is left unchanged.
 */
int nf_hook_entries_insert_raw(struct nf_hook_entries __rcu **pp,
                                const struct nf_hook_ops *reg)
{
        struct nf_hook_entries *prev = rcu_dereference_raw(*pp);
        struct nf_hook_entries *grown = nf_hook_entries_grow(prev, reg);

        if (IS_ERR(grown))
                return PTR_ERR(grown);

        hooks_validate(grown);
        rcu_assign_pointer(*pp, grown);

        BUG_ON(prev == grown);
        nf_hook_entries_free(prev);

        return 0;
}
EXPORT_SYMBOL_GPL(nf_hook_entries_insert_raw);

/*
 * __nf_hook_entries_try_shrink - try to shrink hook array
 *
 * @old -- current hook blob at @pp
 * @pp -- location of hook blob
 *
 * Hook unregistration must always succeed, so to-be-removed hooks
 * are replaced by a dummy one that will just move to next hook.
 *
 * This counts the dummy hooks in @old. When every entry is a dummy,
 * @pp is set to NULL. When only some are, a smaller blob holding just
 * the live hooks is allocated and published at @pp.
 *
 * Returns the address to free (@old) when @pp was replaced. Returns
 * NULL when nothing was changed: @old is NULL, @old has no dummy
 * entries, or the allocation of the smaller blob failed.
 */
static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old,
                                          struct nf_hook_entries __rcu **pp)
{
        unsigned int src, dst = 0, dead = 0, total;
        struct nf_hook_entries *compact = NULL;
        struct nf_hook_ops **old_ops;
        struct nf_hook_ops **ops_out;

        if (WARN_ON_ONCE(!old))
                return NULL;

        total = old->num_hook_entries;
        old_ops = nf_hook_entries_get_hook_ops(old);

        for (src = 0; src < total; src++) {
                if (old_ops[src] == &dummy_ops)
                        dead++;
        }

        /* dead == total means that all hooks have been removed */
        if (dead != total) {
                if (!dead)
                        return NULL;

                compact = allocate_hook_entries_size(total - dead);
                if (!compact)
                        return NULL;

                ops_out = nf_hook_entries_get_hook_ops(compact);
                for (src = 0; src < total; src++) {
                        if (old_ops[src] == &dummy_ops)
                                continue;

                        compact->hooks[dst] = old->hooks[src];
                        ops_out[dst] = (void *)old_ops[src];
                        dst++;
                }
                hooks_validate(compact);
        }

        rcu_assign_pointer(*pp, compact);
        return old;
}

/*
 * nf_hook_entry_head - find where the hook blob pointer for a hook is stored
 *
 * @net: network namespace holding the per-family hook arrays
 * @pf: protocol family (NFPROTO_*)
 * @hooknum: hook number within @pf
 * @dev: device, used for the per-device ingress/egress hooks
 *
 * Returns the address of the matching __rcu blob pointer, or NULL (after a
 * one-time warning) when the family is not handled, @hooknum is out of range
 * for the family's array, or the required device is missing or does not
 * belong to @net.
 */
static struct nf_hook_entries __rcu **
nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
                   struct net_device *dev)
{
        switch (pf) {
        case NFPROTO_NETDEV:
                /* per-device hooks, resolved after the switch */
                break;
#ifdef CONFIG_NETFILTER_FAMILY_ARP
        case NFPROTO_ARP:
                if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_arp) <= hooknum))
                        return NULL;
                return net->nf.hooks_arp + hooknum;
#endif
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
        case NFPROTO_BRIDGE:
                if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_bridge) <= hooknum))
                        return NULL;
                return net->nf.hooks_bridge + hooknum;
#endif
#ifdef CONFIG_NETFILTER_INGRESS
        case NFPROTO_INET:
                /* the only inet hook with its own blob is ingress, which
                 * shares the per-device ingress pointer
                 */
                if (WARN_ON_ONCE(hooknum != NF_INET_INGRESS))
                        return NULL;
                if (!dev || dev_net(dev) != net) {
                        WARN_ON_ONCE(1);
                        return NULL;
                }
                return &dev->nf_hooks_ingress;
#endif
        case NFPROTO_IPV4:
                if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= hooknum))
                        return NULL;
                return net->nf.hooks_ipv4 + hooknum;
        case NFPROTO_IPV6:
                if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= hooknum))
                        return NULL;
                return net->nf.hooks_ipv6 + hooknum;
        default:
                WARN_ON_ONCE(1);
                return NULL;
        }

        /* only NFPROTO_NETDEV gets here */
#ifdef CONFIG_NETFILTER_INGRESS
        if (hooknum == NF_NETDEV_INGRESS) {
                if (dev && dev_net(dev) == net)
                        return &dev->nf_hooks_ingress;
        }
#endif
#ifdef CONFIG_NETFILTER_EGRESS
        if (hooknum == NF_NETDEV_EGRESS) {
                if (dev && dev_net(dev) == net)
                        return &dev->nf_hooks_egress;
        }
#endif
        /* unsupported netdev hook, or device missing / in another netns */
        WARN_ON_ONCE(1);
        return NULL;
}

/*
 * nf_ingress_check - validate an ingress hook registration
 *
 * Returns -EOPNOTSUPP when @reg asks for @hooknum but the kernel was built
 * without CONFIG_NETFILTER_INGRESS, -EINVAL when @reg targets a different
 * hook number, has no device, or its device is not in @net, and 0 otherwise.
 */
static int nf_ingress_check(struct net *net, const struct nf_hook_ops *reg,
                            int hooknum)
{
#ifndef CONFIG_NETFILTER_INGRESS
        if (reg->hooknum == hooknum)
                return -EOPNOTSUPP;
#endif
        if (reg->hooknum != hooknum)
                return -EINVAL;

        /* ingress hooks are per device, and the device must live in @net */
        if (!reg->dev)
                return -EINVAL;
        if (dev_net(reg->dev) != net)
                return -EINVAL;

        return 0;
}

/* True when @reg, registered for family @pf, is an ingress hook: either
 * NF_NETDEV_INGRESS in the netdev family or NF_INET_INGRESS in the inet
 * family.
 */
static inline bool __maybe_unused nf_ingress_hook(const struct nf_hook_ops *reg,
                                                  int pf)
{
        switch (pf) {
        case NFPROTO_NETDEV:
                return reg->hooknum == NF_NETDEV_INGRESS;
        case NFPROTO_INET:
                return reg->hooknum == NF_INET_INGRESS;
        default:
                return false;
        }
}

/* True when @reg, registered for family @pf, is the netdev egress hook. */
static inline bool __maybe_unused nf_egress_hook(const struct nf_hook_ops *reg,
                                                 int pf)
{
        if (pf != NFPROTO_NETDEV)
                return false;

        return reg->hooknum == NF_NETDEV_EGRESS;
}

/* Bump the nf_hooks_needed static key for the hook @reg was registered on.
 * inet ingress is accounted on the netdev ingress key.  Does nothing when
 * CONFIG_JUMP_LABEL is not set.
 */
static void nf_static_key_inc(const struct nf_hook_ops *reg, int pf)
{
#ifdef CONFIG_JUMP_LABEL
        int hooknum = reg->hooknum;

        if (pf == NFPROTO_INET && hooknum == NF_INET_INGRESS) {
                pf = NFPROTO_NETDEV;
                hooknum = NF_NETDEV_INGRESS;
        }

        static_key_slow_inc(&nf_hooks_needed[pf][hooknum]);
#endif
}

/* Drop the nf_hooks_needed static key for the hook @reg was registered on.
 * inet ingress is accounted on the netdev ingress key.  Does nothing when
 * CONFIG_JUMP_LABEL is not set.
 */
static void nf_static_key_dec(const struct nf_hook_ops *reg, int pf)
{
#ifdef CONFIG_JUMP_LABEL
        int hooknum = reg->hooknum;

        if (pf == NFPROTO_INET && hooknum == NF_INET_INGRESS) {
                pf = NFPROTO_NETDEV;
                hooknum = NF_NETDEV_INGRESS;
        }

        static_key_slow_dec(&nf_hooks_needed[pf][hooknum]);
#endif
}

/*
 * __nf_register_net_hook - add @reg to the hook blob selected by @pf
 *
 * @net: network namespace to register in
 * @pf: protocol family whose blob is modified (may differ from reg->pf)
 * @reg: hook to add
 *
 * Validates netdev and inet-ingress registrations, looks up the blob slot
 * with nf_hook_entry_head() and, under nf_hook_mutex, publishes the blob
 * returned by nf_hook_entries_grow().  On success the ingress/egress queue
 * counters and the static key are bumped and the previous blob is handed to
 * nf_hook_entries_free().
 *
 * Returns 0 on success, -EOPNOTSUPP or -EINVAL for rejected registrations,
 * or the error carried by the pointer nf_hook_entries_grow() returned.
 */
static int __nf_register_net_hook(struct net *net, int pf,
                                  const struct nf_hook_ops *reg)
{
        struct nf_hook_entries *p, *new_hooks;
        struct nf_hook_entries __rcu **pp;
        int err;

        switch (pf) {
        case NFPROTO_NETDEV:
                /* hooks compiled out are reported as unsupported ... */
#ifndef CONFIG_NETFILTER_INGRESS
                if (reg->hooknum == NF_NETDEV_INGRESS)
                        return -EOPNOTSUPP;
#endif
#ifndef CONFIG_NETFILTER_EGRESS
                if (reg->hooknum == NF_NETDEV_EGRESS)
                        return -EOPNOTSUPP;
#endif
                /* ... anything else must be ingress/egress on a device
                 * that belongs to @net
                 */
                if ((reg->hooknum != NF_NETDEV_INGRESS &&
                     reg->hooknum != NF_NETDEV_EGRESS) ||
                    !reg->dev || dev_net(reg->dev) != net)
                        return -EINVAL;
                break;
        case NFPROTO_INET:
                if (reg->hooknum != NF_INET_INGRESS)
                        break;

                err = nf_ingress_check(net, reg, NF_INET_INGRESS);
                if (err < 0)
                        return err;
                break;
        }

        pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev);
        if (!pp)
                return -EINVAL;

        mutex_lock(&nf_hook_mutex);

        p = nf_entry_dereference(*pp);
        new_hooks = nf_hook_entries_grow(p, reg);

        /* publish only a valid blob; on error *pp keeps pointing at p */
        if (!IS_ERR(new_hooks)) {
                hooks_validate(new_hooks);
                rcu_assign_pointer(*pp, new_hooks);
        }

        mutex_unlock(&nf_hook_mutex);
        if (IS_ERR(new_hooks))
                return PTR_ERR(new_hooks);

#ifdef CONFIG_NETFILTER_INGRESS
        if (nf_ingress_hook(reg, pf))
                net_inc_ingress_queue();
#endif
#ifdef CONFIG_NETFILTER_EGRESS
        if (nf_egress_hook(reg, pf))
                net_inc_egress_queue();
#endif
        nf_static_key_inc(reg, pf);

        /* the grown blob must be a different object than the one released */
        BUG_ON(p == new_hooks);
        nf_hook_entries_free(p);
        return 0;
}

/*
 * nf_remove_net_hook - neutralise one hook inside a blob
 *
 * @old: hook blob to search
 * @unreg: hook ops to take out
 *
 * Removal has to succeed unconditionally, so nothing is reallocated here:
 * the first slot whose ops pointer equals @unreg gets its hook function
 * replaced by accept_all and its ops pointer replaced by &dummy_ops.
 *
 * Returns true when a matching slot was found, false otherwise.
 */
static bool nf_remove_net_hook(struct nf_hook_entries *old,
                               const struct nf_hook_ops *unreg)
{
        struct nf_hook_ops **ops = nf_hook_entries_get_hook_ops(old);
        unsigned int slot;

        for (slot = 0; slot < old->num_hook_entries; slot++) {
                if (ops[slot] == unreg) {
                        /* hook function first, ops pointer second */
                        WRITE_ONCE(old->hooks[slot].hook, accept_all);
                        WRITE_ONCE(ops[slot], (void *)&dummy_ops);
                        return true;
                }
        }

        return false;
}

/*
 * __nf_unregister_net_hook - remove @reg from the hook blob selected by @pf
 *
 * @net: network namespace to unregister from
 * @pf: protocol family whose blob is modified (may differ from reg->pf)
 * @reg: hook to remove
 *
 * Under nf_hook_mutex the hook is replaced by a dummy entry and the blob is
 * shrunk if possible.  Counters and the static key are only dropped when the
 * hook was actually found; a missing hook triggers a one-time warning.
 */
static void __nf_unregister_net_hook(struct net *net, int pf,
                                     const struct nf_hook_ops *reg)
{
        struct nf_hook_entries __rcu **pp;
        struct nf_hook_entries *p;

        pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev);
        if (!pp)
                return;

        mutex_lock(&nf_hook_mutex);

        p = nf_entry_dereference(*pp);
        if (WARN_ON_ONCE(!p)) {
                mutex_unlock(&nf_hook_mutex);
                return;
        }

        if (nf_remove_net_hook(p, reg)) {
#ifdef CONFIG_NETFILTER_INGRESS
                if (nf_ingress_hook(reg, pf))
                        net_dec_ingress_queue();
#endif
#ifdef CONFIG_NETFILTER_EGRESS
                if (nf_egress_hook(reg, pf))
                        net_dec_egress_queue();
#endif
                nf_static_key_dec(reg, pf);
        } else {
                WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum);
        }

        /* p becomes the blob to free, or NULL if the blob was kept */
        p = __nf_hook_entries_try_shrink(p, pp);
        mutex_unlock(&nf_hook_mutex);
        if (!p)
                return;

        /* NOTE(review): presumably drops queued packets that still refer to
         * the old hooks before the blob is released - confirm against
         * nf_queue_nf_hook_drop()
         */
        nf_queue_nf_hook_drop(net);
        nf_hook_entries_free(p);
}

/*
 * nf_unregister_net_hook - unregister @reg from @net
 *
 * Hooks of a single family are removed from that family.  NFPROTO_INET hooks
 * are removed from the inet ingress blob when they are NF_INET_INGRESS hooks
 * and from both the IPv4 and the IPv6 blobs otherwise.
 */
void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
        if (reg->pf != NFPROTO_INET) {
                __nf_unregister_net_hook(net, reg->pf, reg);
                return;
        }

        if (reg->hooknum == NF_INET_INGRESS) {
                __nf_unregister_net_hook(net, NFPROTO_INET, reg);
                return;
        }

        __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
        __nf_unregister_net_hook(net, NFPROTO_IPV6, reg);
}
EXPORT_SYMBOL(nf_unregister_net_hook);

/*
 * nf_hook_entries_delete_raw - remove @reg from the blob stored at @pp
 *
 * When @reg is found in the blob it is replaced by a dummy entry, the blob
 * is shrunk if possible and whatever the shrink step returned is passed to
 * nf_hook_entries_free().  Nothing happens when @reg is not in the blob.
 */
void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp,
                                const struct nf_hook_ops *reg)
{
        struct nf_hook_entries *blob = rcu_dereference_raw(*pp);

        if (!nf_remove_net_hook(blob, reg))
                return;

        blob = __nf_hook_entries_try_shrink(blob, pp);
        nf_hook_entries_free(blob);
}
EXPORT_SYMBOL_GPL(nf_hook_entries_delete_raw);

/*
 * nf_register_net_hook - register @reg in @net
 *
 * Hooks of a single family are added to that family.  NFPROTO_INET hooks are
 * added to the inet ingress blob when they are NF_INET_INGRESS hooks and to
 * both the IPv4 and the IPv6 blobs otherwise; if the IPv6 half fails, the
 * IPv4 half is unregistered again.
 *
 * Returns 0 on success or the negative error of the failing registration.
 */
int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
        int ret;

        if (reg->pf != NFPROTO_INET) {
                ret = __nf_register_net_hook(net, reg->pf, reg);
                return ret < 0 ? ret : 0;
        }

        if (reg->hooknum == NF_INET_INGRESS) {
                ret = __nf_register_net_hook(net, NFPROTO_INET, reg);
                return ret < 0 ? ret : 0;
        }

        ret = __nf_register_net_hook(net, NFPROTO_IPV4, reg);
        if (ret < 0)
                return ret;

        ret = __nf_register_net_hook(net, NFPROTO_IPV6, reg);
        if (ret < 0) {
                /* undo the IPv4 half so the registration is all-or-nothing */
                __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
                return ret;
        }

        return 0;
}
EXPORT_SYMBOL(nf_register_net_hook);

/*
 * nf_register_net_hooks - register an array of @n hooks in @net
 *
 * Hooks are registered in array order.  On the first failure the hooks that
 * were already registered are unregistered again and the error is returned.
 *
 * Returns 0 when all hooks were registered (or @n is 0).
 */
int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
                          unsigned int n)
{
        unsigned int done;
        int err = 0;

        for (done = 0; done < n; done++) {
                err = nf_register_net_hook(net, &reg[done]);
                if (!err)
                        continue;

                /* roll back reg[0] .. reg[done - 1] */
                if (done > 0)
                        nf_unregister_net_hooks(net, reg, done);
                return err;
        }

        return err;
}
EXPORT_SYMBOL(nf_register_net_hooks);

/* Unregister the first @hookcount hooks of the array @reg from @net, in
 * array order.
 */
void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
                             unsigned int hookcount)
{
        unsigned int idx = 0;

        while (idx < hookcount) {
                nf_unregister_net_hook(net, &reg[idx]);
                idx++;
        }
}
EXPORT_SYMBOL(nf_unregister_net_hooks);

/* Run the hooks of @e on @skb, starting at index @s.
 *
 * Returns 1 if okfn() needs to be executed by the caller (every hook
 * accepted the packet).  On NF_DROP the skb is freed and the error encoded
 * in the verdict is returned, or -EPERM if none is encoded.  On NF_QUEUE the
 * result of nf_queue() is returned unless it is 1, in which case traversal
 * continues with the next hook.  NF_STOLEN returns the error encoded in the
 * verdict.  Any other verdict warns once and returns 0.
 *
 * Caller must hold rcu_read_lock.
 */
int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
                 const struct nf_hook_entries *e, unsigned int s)
{
        int ret;

        for (; s < e->num_hook_entries; s++) {
                unsigned int verdict, action;

                verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state);
                action = verdict & NF_VERDICT_MASK;

                if (action == NF_ACCEPT)
                        continue;

                if (action == NF_DROP) {
                        kfree_skb_reason(skb,
                                         SKB_DROP_REASON_NETFILTER_DROP);
                        ret = NF_DROP_GETERR(verdict);
                        return ret ? ret : -EPERM;
                }

                if (action == NF_QUEUE) {
                        ret = nf_queue(skb, state, s, verdict);
                        if (ret != 1)
                                return ret;
                        continue;
                }

                if (action == NF_STOLEN)
                        return NF_DROP_GETERR(verdict);

                WARN_ON_ONCE(1);
                return 0;
        }

        return 1;
}
EXPORT_SYMBOL(nf_hook_slow);

void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state,
                       const struct nf_hook_entries *e)
{
        struct sk_buff *skb, *next;
        struct list_head sublist;
        int ret;

        INIT_LIST_HEAD(&sublist);

        list_for_each_entry_safe(skb, next, head, list) {
                skb_list_del_init(skb);
                ret = nf_hook_slow(skb, state, e, 0);
                if (ret == 1)
                        list_add_tail(&skb->list, &sublist);
        }
        /* Put passed packets back on main list */
        list_splice(&sublist, head);
}
EXPORT_SYMBOL(nf_hook_slow_list);

/* This needs to be compiled in any case to avoid dependencies between the
 * nfnetlink_queue code and nf_conntrack.
 *
 * The pointers below are __rcu annotated and exported; this part of the file
 * only defines them.
 * NOTE(review): presumably they are set and cleared by the modules providing
 * the respective functionality - confirm against those modules.
 */
const struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
EXPORT_SYMBOL_GPL(nfnl_ct_hook);

/* Conntrack callbacks; read with rcu_dereference() under rcu_read_lock() by
 * the nf_ct_*() / nf_conntrack_destroy() wrappers in this file.
 */
const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_hook);

/* Defragmentation hooks, one pointer per IP version. */
const struct nf_defrag_hook __rcu *nf_defrag_v4_hook __read_mostly;
EXPORT_SYMBOL_GPL(nf_defrag_v4_hook);

const struct nf_defrag_hook __rcu *nf_defrag_v6_hook __read_mostly;
EXPORT_SYMBOL_GPL(nf_defrag_v6_hook);

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* Exported flag, only defined here.
 * NOTE(review): the name suggests it records whether ctnetlink has event
 * listeners - confirm against the users of this symbol.
 */
u8 nf_ctnetlink_has_listener;
EXPORT_SYMBOL_GPL(nf_ctnetlink_has_listener);

/* __rcu annotated pointer to the NAT callbacks; only defined here. */
const struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
EXPORT_SYMBOL_GPL(nf_nat_hook);

/* This does not really belong here, but locally generated errors need it
 * while connection tracking is in use: without it the connection may not be
 * in the hash table, so manufactured ICMP or RST packets would not be
 * associated with it.
 *
 * Calls the attach callback of nf_ct_hook for @new and @skb when
 * skb->_nfct is set and a hook is registered.
 */
void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
{
        const struct nf_ct_hook *hook;

        if (!skb->_nfct)
                return;

        rcu_read_lock();
        hook = rcu_dereference(nf_ct_hook);
        if (hook)
                hook->attach(new, skb);
        rcu_read_unlock();
}
EXPORT_SYMBOL(nf_ct_attach);

void nf_conntrack_destroy(struct nf_conntrack *nfct)
{
        const struct nf_ct_hook *ct_hook;

        rcu_read_lock();
        ct_hook = rcu_dereference(nf_ct_hook);
        if (ct_hook)
                ct_hook->destroy(nfct);
        rcu_read_unlock();

        WARN_ON(!ct_hook);
}
EXPORT_SYMBOL(nf_conntrack_destroy);

void nf_ct_set_closing(struct nf_conntrack *nfct)
{
        const struct nf_ct_hook *ct_hook;

        if (!nfct)
                return;

        rcu_read_lock();
        ct_hook = rcu_dereference(nf_ct_hook);
        if (ct_hook)
                ct_hook->set_closing(nfct);

        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_ct_set_closing);

bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
                         const struct sk_buff *skb)
{
        const struct nf_ct_hook *ct_hook;
        bool ret = false;

        rcu_read_lock();
        ct_hook = rcu_dereference(nf_ct_hook);
        if (ct_hook)
                ret = ct_hook->get_tuple_skb(dst_tuple, skb);
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(nf_ct_get_tuple_skb);

/* Built-in default zone used e.g. by modules: a constant, exported zone
 * initialised with the default zone id and the default zone direction.
 */
const struct nf_conntrack_zone nf_ct_zone_dflt = {
        .id        = NF_CT_DEFAULT_ZONE_ID,
        .dir        = NF_CT_DEFAULT_ZONE_DIR,
};
EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
#endif /* CONFIG_NF_CONNTRACK */

/* Set the first @max hook blob pointers of the array @e to NULL. */
static void __net_init
__netfilter_net_init(struct nf_hook_entries __rcu **e, int max)
{
        int slot = 0;

        while (slot < max) {
                RCU_INIT_POINTER(e[slot], NULL);
                slot++;
        }
}

/*
 * netfilter_net_init - per-namespace initialisation of the netfilter core
 *
 * Clears the hook blob pointers of every compiled-in family and, with
 * CONFIG_PROC_FS, creates the "netfilter" directory below net->proc_net.
 *
 * Returns 0 on success or -ENOMEM when the proc directory cannot be created.
 */
static int __net_init netfilter_net_init(struct net *net)
{
        __netfilter_net_init(net->nf.hooks_ipv4, ARRAY_SIZE(net->nf.hooks_ipv4));
        __netfilter_net_init(net->nf.hooks_ipv6, ARRAY_SIZE(net->nf.hooks_ipv6));
#ifdef CONFIG_NETFILTER_FAMILY_ARP
        __netfilter_net_init(net->nf.hooks_arp, ARRAY_SIZE(net->nf.hooks_arp));
#endif
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
        __netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge));
#endif
#ifdef CONFIG_PROC_FS
        net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
                                                net->proc_net);
        if (!net->nf.proc_netfilter) {
                /* The failure is only logged for namespaces other than
                 * init_net.  The message is newline-terminated so that the
                 * log record is complete and cannot be extended by a later
                 * continuation fragment.
                 */
                if (!net_eq(net, &init_net))
                        pr_err("cannot create netfilter proc entry\n");

                return -ENOMEM;
        }
#endif

        return 0;
}

/* Per-namespace teardown: remove the "netfilter" entry below net->proc_net
 * that netfilter_net_init() creates when CONFIG_PROC_FS is set.
 */
static void __net_exit netfilter_net_exit(struct net *net)
{
        remove_proc_entry("netfilter", net->proc_net);
}

/* Per-namespace init/exit callbacks, registered by netfilter_init(). */
static struct pernet_operations netfilter_net_ops = {
        .init = netfilter_net_init,
        .exit = netfilter_net_exit,
};

/*
 * netfilter_init - boot-time initialisation of the netfilter core
 *
 * Registers the per-namespace operations, then (with CONFIG_LWTUNNEL) runs
 * netfilter_lwtunnel_init(), then netfilter_log_init().  When a step fails,
 * the steps that already succeeded are undone in reverse order.
 *
 * Returns 0 on success or the negative error of the failing step.
 */
int __init netfilter_init(void)
{
        int ret;

        ret = register_pernet_subsys(&netfilter_net_ops);
        if (ret < 0)
                goto err;

#ifdef CONFIG_LWTUNNEL
        ret = netfilter_lwtunnel_init();
        if (ret < 0)
                goto err_lwtunnel_pernet;
#endif
        ret = netfilter_log_init();
        if (ret < 0)
                goto err_log_pernet;

        return 0;
        /* unwind: each label undoes what succeeded before the failing step */
err_log_pernet:
#ifdef CONFIG_LWTUNNEL
        netfilter_lwtunnel_fini();
err_lwtunnel_pernet:
#endif
        unregister_pernet_subsys(&netfilter_net_ops);
err:
        return ret;
}


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    5 
    5 
    5 







































    3 
    1 
    1 

































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/bpf_perf_event.h>
#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/kprobes.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/error-injection.h>
#include <linux/btf_ids.h>
#include <linux/bpf_lsm.h>
#include <linux/fprobe.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <linux/key.h>
#include <linux/verification.h>
#include <linux/namei.h>
#include <linux/fileattr.h>

#include <net/bpf_sk_storage.h>

#include <uapi/linux/bpf.h>
#include <uapi/linux/btf.h>

#include <asm/tlb.h>

#include "trace_probe.h"
#include "trace.h"

#define CREATE_TRACE_POINTS
#include "bpf_trace.h"

#define bpf_event_rcu_dereference(p)                                        \
        rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))

#define MAX_UPROBE_MULTI_CNT (1U << 20)
#define MAX_KPROBE_MULTI_CNT (1U << 20)

#ifdef CONFIG_MODULES
/*
 * One entry of the bpf_trace_modules list: ties a list node to the module
 * whose bpf_raw_events table is scanned by bpf_get_raw_tracepoint_module().
 */
struct bpf_trace_module {
        struct module *module;
        struct list_head list;
};

/* List of bpf_trace_module entries; walked with bpf_module_mutex held. */
static LIST_HEAD(bpf_trace_modules);
static DEFINE_MUTEX(bpf_module_mutex);

/*
 * Look up a raw tracepoint called @name in the bpf_raw_events tables of the
 * modules recorded on bpf_trace_modules.
 *
 * The search stops at the first name match. A reference on the owning module
 * is taken with try_module_get(); if that fails the lookup still stops and
 * NULL is returned.
 *
 * Return: the matching bpf_raw_event_map, or NULL when no entry matched or
 * the module reference could not be obtained.
 */
static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
{
        struct bpf_raw_event_map *found = NULL;
        struct bpf_trace_module *entry;

        mutex_lock(&bpf_module_mutex);
        list_for_each_entry(entry, &bpf_trace_modules, list) {
                struct module *mod = entry->module;
                unsigned int idx;

                for (idx = 0; idx < mod->num_bpf_raw_events; idx++) {
                        struct bpf_raw_event_map *cand = &mod->bpf_raw_events[idx];

                        if (strcmp(cand->tp->name, name) != 0)
                                continue;

                        /* First name match ends the search either way. */
                        if (try_module_get(mod))
                                found = cand;
                        goto unlock;
                }
        }
unlock:
        mutex_unlock(&bpf_module_mutex);
        return found;
}
#else
/* Stub used when CONFIG_MODULES is not set: never finds a tracepoint. */
static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
{
        return NULL;
}
#endif /* CONFIG_MODULES */

/* Prototypes of the stack helpers; both take five generic u64 arguments. */
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);

/* Forward declarations of static helpers referenced before their definitions. */
static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
                                  u64 flags, const struct btf **btf,
                                  s32 *btf_id);
static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx);
static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx);

static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx);
static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx);

/**
 * trace_call_bpf - invoke BPF program
 * @call: tracepoint event
 * @ctx: opaque context pointer
 *
 * kprobe handlers execute BPF programs via this helper.
 * Can be used from static tracepoints in the future.
 *
 * Return: BPF programs always return an integer which is interpreted by
 * kprobe handler as:
 * 0 - return from kprobe (event is filtered out)
 * 1 - store kprobe event into ring buffer
 * Other values are reserved and currently alias to 1
 */
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
        unsigned int ret;

        cant_sleep();

        if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
                /*
                 * since some bpf program is already running on this cpu,
                 * don't call into another bpf program (same or different)
                 * and don't send kprobe event into ring-buffer,
                 * so return zero here
                 */
                rcu_read_lock();
                bpf_prog_inc_misses_counters(rcu_dereference(call->prog_array));
                rcu_read_unlock();
                ret = 0;
                goto out;
        }

        /*
         * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
         * to all call sites, we did a bpf_prog_array_valid() there to check
         * whether call->prog_array is empty or not, which is
         * a heuristic to speed up execution.
         *
         * If bpf_prog_array_valid() fetched prog_array was
         * non-NULL, we go into trace_call_bpf() and do the actual
         * proper rcu_dereference() under RCU lock.
         * If it turns out that prog_array is NULL then, we bail out.
         * For the opposite, if the bpf_prog_array_valid() fetched pointer
         * was NULL, you'll skip the prog_array with the risk of missing
         * out of events when it was updated in between this and the
         * rcu_dereference() which is accepted risk.
         */
        rcu_read_lock();
        ret = bpf_prog_run_array(rcu_dereference(call->prog_array),
                                 ctx, bpf_prog_run);
        rcu_read_unlock();

 out:
        __this_cpu_dec(bpf_prog_active);

        return ret;
}

#ifdef CONFIG_BPF_KPROBE_OVERRIDE
/*
 * bpf_override_return - helper that stores @rc as the return value in @regs
 * via regs_set_return_value() and then calls override_function_with_return()
 * on the same registers. Always returns 0.
 */
BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
{
        regs_set_return_value(regs, rc);
        override_function_with_return(regs);
        return 0;
}

/*
 * Descriptor for bpf_override_return(): GPL-only, integer return; arg1 is the
 * program context (the pt_regs), arg2 is an unchecked value (the return code).
 */
static const struct bpf_func_proto bpf_override_return_proto = {
        .func                = bpf_override_return,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};
#endif

/*
 * Copy @size bytes from the user address @unsafe_ptr into @dst using
 * copy_from_user_nofault(). On a negative result the whole destination is
 * zeroed so that callers ignoring the return code never see stale bytes.
 *
 * Return: the value returned by copy_from_user_nofault().
 */
static __always_inline int
bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr)
{
        int err = copy_from_user_nofault(dst, unsafe_ptr, size);

        if (likely(err >= 0))
                return err;

        memset(dst, 0, size);
        return err;
}

/* BPF helper entry point: thin wrapper around bpf_probe_read_user_common(). */
BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size,
           const void __user *, unsafe_ptr)
{
        return bpf_probe_read_user_common(dst, size, unsafe_ptr);
}

/*
 * Descriptor for bpf_probe_read_user(): GPL-only, integer return; arguments
 * are an uninitialized destination buffer, its size (zero allowed) and an
 * unchecked value carrying the user address.
 */
const struct bpf_func_proto bpf_probe_read_user_proto = {
        .func                = bpf_probe_read_user,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg2_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg3_type        = ARG_ANYTHING,
};

/*
 * Copy a NUL-terminated string of at most @size bytes from the user address
 * @unsafe_ptr into @dst with strncpy_from_user_nofault(). On a negative
 * result the whole destination is zeroed.
 *
 * Return: the value returned by strncpy_from_user_nofault().
 */
static __always_inline int
bpf_probe_read_user_str_common(void *dst, u32 size,
                               const void __user *unsafe_ptr)
{
        int copied;

        /*
         * NB: this depends on strncpy_from_user() never leaving junk after
         * the NUL terminator in `dst`.
         *
         * Its fast path works in long-sized strides. Were the bytes that
         * follow the NUL in `unsafe_ptr` not masked out, junk could end up
         * behind the NUL in `dst`; a user keying a hash map with `dst` would
         * then get several map entries for semantically identical strings.
         */
        copied = strncpy_from_user_nofault(dst, unsafe_ptr, size);
        if (likely(copied >= 0))
                return copied;

        memset(dst, 0, size);
        return copied;
}

/* BPF helper entry point: thin wrapper around bpf_probe_read_user_str_common(). */
BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
           const void __user *, unsafe_ptr)
{
        return bpf_probe_read_user_str_common(dst, size, unsafe_ptr);
}

/*
 * Descriptor for bpf_probe_read_user_str(): GPL-only, integer return;
 * arguments are an uninitialized destination buffer, its size (zero allowed)
 * and an unchecked value carrying the user address.
 */
const struct bpf_func_proto bpf_probe_read_user_str_proto = {
        .func                = bpf_probe_read_user_str,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg2_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg3_type        = ARG_ANYTHING,
};

/*
 * BPF helper entry point: forwards to bpf_probe_read_kernel_common(), which
 * is defined outside this file section.
 */
BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size,
           const void *, unsafe_ptr)
{
        return bpf_probe_read_kernel_common(dst, size, unsafe_ptr);
}

/*
 * Descriptor for bpf_probe_read_kernel(): GPL-only, integer return; arguments
 * are an uninitialized destination buffer, its size (zero allowed) and an
 * unchecked value carrying the kernel address.
 */
const struct bpf_func_proto bpf_probe_read_kernel_proto = {
        .func                = bpf_probe_read_kernel,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg2_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg3_type        = ARG_ANYTHING,
};

/*
 * Copy a NUL-terminated string of at most @size bytes from the kernel address
 * @unsafe_ptr into @dst with strncpy_from_kernel_nofault(). On a negative
 * result the whole destination is zeroed.
 *
 * Return: the value returned by strncpy_from_kernel_nofault().
 */
static __always_inline int
bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr)
{
        int copied;

        /*
         * strncpy_from_kernel_nofault() will usually not fill the whole
         * buffer. That is fine here: arbitrary memory is being probed anyway,
         * just like with bpf_probe_read_*(), so we might as well probe the
         * stack. The buffer is therefore cleared only in the error case, so
         * that improper users who ignore the return code don't copy garbage;
         * on success the string length is returned, which can be used for
         * bpf_perf_event_output() et al.
         */
        copied = strncpy_from_kernel_nofault(dst, unsafe_ptr, size);
        if (likely(copied >= 0))
                return copied;

        memset(dst, 0, size);
        return copied;
}

/* BPF helper entry point: thin wrapper around bpf_probe_read_kernel_str_common(). */
BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size,
           const void *, unsafe_ptr)
{
        return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr);
}

/*
 * Descriptor for bpf_probe_read_kernel_str(): GPL-only, integer return;
 * arguments are an uninitialized destination buffer, its size (zero allowed)
 * and an unchecked value carrying the kernel address.
 */
const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
        .func                = bpf_probe_read_kernel_str,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg2_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg3_type        = ARG_ANYTHING,
};

#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
/*
 * bpf_probe_read_compat - read @size bytes from @unsafe_ptr into @dst,
 * choosing the address space from the pointer value: addresses below
 * TASK_SIZE are read as user memory, everything else as kernel memory.
 */
BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size,
           const void *, unsafe_ptr)
{
        if ((unsigned long)unsafe_ptr >= TASK_SIZE)
                return bpf_probe_read_kernel_common(dst, size, unsafe_ptr);

        return bpf_probe_read_user_common(dst, size,
                        (__force void __user *)unsafe_ptr);
}

/*
 * Descriptor for bpf_probe_read_compat(): GPL-only, integer return; arguments
 * are an uninitialized destination buffer, its size (zero allowed) and an
 * unchecked value carrying the address to read.
 */
static const struct bpf_func_proto bpf_probe_read_compat_proto = {
        .func                = bpf_probe_read_compat,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg2_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg3_type        = ARG_ANYTHING,
};

/*
 * bpf_probe_read_compat_str - string variant of bpf_probe_read_compat():
 * addresses below TASK_SIZE are read as user strings, everything else as
 * kernel strings.
 */
BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size,
           const void *, unsafe_ptr)
{
        if ((unsigned long)unsafe_ptr >= TASK_SIZE)
                return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr);

        return bpf_probe_read_user_str_common(dst, size,
                        (__force void __user *)unsafe_ptr);
}

/*
 * Descriptor for bpf_probe_read_compat_str(): GPL-only, integer return;
 * arguments are an uninitialized destination buffer, its size (zero allowed)
 * and an unchecked value carrying the address to read.
 */
static const struct bpf_func_proto bpf_probe_read_compat_str_proto = {
        .func                = bpf_probe_read_compat_str,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg2_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg3_type        = ARG_ANYTHING,
};
#endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */

/*
 * bpf_probe_write_user - copy @size bytes from @src to the user address
 * @unsafe_ptr.
 *
 * Returns -EPERM when the calling context is not acceptable (see the checks
 * below); otherwise returns the result of copy_to_user_nofault().
 */
BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
           u32, size)
{
        /*
         * Make sure we are in user context, which is safe for the helper to
         * run; it has no business in a kthread.
         *
         * access_ok() should prevent writing to non-user memory, but in some
         * situations (nommu, temporary switch, etc) it does not provide
         * enough validation, so the context is vetted explicitly first.
         */
        if (unlikely(in_interrupt()))
                return -EPERM;
        if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
                return -EPERM;

        /*
         * nmi_uaccess_okay() ensures the probe is not run in an interim
         * state, when the task or mm are switched. This is specifically
         * required to prevent the use of temporary mm.
         */
        if (unlikely(!nmi_uaccess_okay()))
                return -EPERM;

        return copy_to_user_nofault(unsafe_ptr, src, size);
}

/*
 * Descriptor for bpf_probe_write_user(): GPL-only, integer return; arg1 is an
 * unchecked value carrying the user address, arg2 a read-only source buffer
 * and arg3 its (non-zero) size.
 */
static const struct bpf_func_proto bpf_probe_write_user_proto = {
        .func                = bpf_probe_write_user,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_ANYTHING,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
};

/*
 * bpf_get_probe_write_proto - hand out the bpf_probe_write_user() descriptor
 *
 * Callers without CAP_SYS_ADMIN get NULL. Otherwise a rate-limited warning
 * naming the current task (comm and pid) is logged, since the helper may
 * corrupt user memory, and the descriptor is returned.
 *
 * Return: &bpf_probe_write_user_proto, or NULL without CAP_SYS_ADMIN.
 */
static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
{
        if (!capable(CAP_SYS_ADMIN))
                return NULL;

        /* The trailing '\n' terminates the log record on its own. */
        pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!\n",
                            current->comm, task_pid_nr(current));

        return &bpf_probe_write_user_proto;
}

#define MAX_TRACE_PRINTK_VARARGS        3
#define BPF_TRACE_PRINTK_SIZE                1024

/*
 * bpf_trace_printk - format @fmt with up to three u64 arguments and emit the
 * result through the bpf_trace_printk trace event.
 *
 * Returns a negative error from bpf_bprintf_prepare() if preparing the format
 * fails, otherwise the value returned by bstr_printf().
 */
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
           u64, arg2, u64, arg3)
{
        struct bpf_bprintf_data bprintf = {
                .get_bin_args        = true,
                .get_buf        = true,
        };
        u64 varargs[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 };
        int written;

        written = bpf_bprintf_prepare(fmt, fmt_size, varargs,
                                      MAX_TRACE_PRINTK_VARARGS, &bprintf);
        if (written < 0)
                return written;

        written = bstr_printf(bprintf.buf, MAX_BPRINTF_BUF, fmt,
                              bprintf.bin_args);

        trace_bpf_trace_printk(bprintf.buf);

        /* Pairs with the successful bpf_bprintf_prepare() above. */
        bpf_bprintf_cleanup(&bprintf);

        return written;
}

/*
 * Descriptor for bpf_trace_printk(): GPL-only, integer return; arg1 is the
 * read-only format buffer and arg2 its (non-zero) size. The types of the
 * three value arguments are left unset in this initializer.
 */
static const struct bpf_func_proto bpf_trace_printk_proto = {
        .func                = bpf_trace_printk,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg2_type        = ARG_CONST_SIZE,
};

/*
 * Enable the bpf_trace/bpf_trace_printk trace event, logging a rate-limited
 * warning when trace_set_clr_event() reports failure.
 */
static void __set_printk_clr_event(void)
{
        /*
         * This program might be calling bpf_trace_printk,
         * so enable the associated bpf_trace/bpf_trace_printk event.
         * Repeat this each time as it is possible a user has
         * disabled bpf_trace_printk events.  By loading a program
         * calling bpf_trace_printk() however the user has expressed
         * the intent to see such events.
         */
        if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1))
                /* The trailing '\n' terminates the log record on its own. */
                pr_warn_ratelimited("could not enable bpf_trace_printk events\n");
}

/*
 * Return the bpf_trace_printk() descriptor, first (re-)enabling the
 * bpf_trace_printk trace event via __set_printk_clr_event().
 */
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
{
        __set_printk_clr_event();
        return &bpf_trace_printk_proto;
}

/*
 * bpf_trace_vprintk - like bpf_trace_printk(), but the arguments come as an
 * array @args of @data_len bytes, consumed in 8-byte slots.
 *
 * Returns -EINVAL when @data_len is not a multiple of 8, exceeds
 * MAX_BPRINTF_VARARGS slots, or is non-zero while @args is NULL; a negative
 * error from bpf_bprintf_prepare(); otherwise the value returned by
 * bstr_printf().
 */
BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, args,
           u32, data_len)
{
        struct bpf_bprintf_data bprintf = {
                .get_bin_args        = true,
                .get_buf        = true,
        };
        int nr_args, written;

        if (data_len & 7)
                return -EINVAL;
        if (data_len > MAX_BPRINTF_VARARGS * 8)
                return -EINVAL;
        if (data_len && !args)
                return -EINVAL;

        nr_args = data_len / 8;

        written = bpf_bprintf_prepare(fmt, fmt_size, args, nr_args, &bprintf);
        if (written < 0)
                return written;

        written = bstr_printf(bprintf.buf, MAX_BPRINTF_BUF, fmt,
                              bprintf.bin_args);

        trace_bpf_trace_printk(bprintf.buf);

        /* Pairs with the successful bpf_bprintf_prepare() above. */
        bpf_bprintf_cleanup(&bprintf);

        return written;
}

/*
 * Descriptor for bpf_trace_vprintk(): GPL-only, integer return; arg1/arg2 are
 * the read-only format buffer and its (non-zero) size, arg3/arg4 the
 * read-only, possibly NULL argument array and its size (zero allowed).
 */
static const struct bpf_func_proto bpf_trace_vprintk_proto = {
        .func                = bpf_trace_vprintk,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg2_type        = ARG_CONST_SIZE,
        .arg3_type        = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
        .arg4_type        = ARG_CONST_SIZE_OR_ZERO,
};

/*
 * Return the bpf_trace_vprintk() descriptor, first (re-)enabling the
 * bpf_trace_printk trace event via __set_printk_clr_event().
 */
const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void)
{
        __set_printk_clr_event();
        return &bpf_trace_vprintk_proto;
}

/*
 * bpf_seq_printf - format @fmt with the 8-byte argument slots in @args
 * (@data_len bytes in total) and append the result to seq_file @m.
 *
 * Returns -EINVAL when @data_len is not a multiple of 8, exceeds
 * MAX_BPRINTF_VARARGS slots, or is non-zero while @args is NULL; a negative
 * error from bpf_bprintf_prepare(); -EOVERFLOW when @m has overflowed after
 * the write; 0 otherwise.
 */
BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
           const void *, args, u32, data_len)
{
        struct bpf_bprintf_data bprintf = {
                .get_bin_args        = true,
        };
        int nr_args, err;

        if (data_len & 7)
                return -EINVAL;
        if (data_len > MAX_BPRINTF_VARARGS * 8)
                return -EINVAL;
        if (data_len && !args)
                return -EINVAL;

        nr_args = data_len / 8;

        err = bpf_bprintf_prepare(fmt, fmt_size, args, nr_args, &bprintf);
        if (err < 0)
                return err;

        seq_bprintf(m, fmt, bprintf.bin_args);

        /* Pairs with the successful bpf_bprintf_prepare() above. */
        bpf_bprintf_cleanup(&bprintf);

        if (seq_has_overflowed(m))
                return -EOVERFLOW;

        return 0;
}

BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file)

/*
 * Descriptor for bpf_seq_printf(): GPL-only, integer return; arg1 is a BTF
 * pointer to struct seq_file, arg2/arg3 the read-only format buffer and its
 * (non-zero) size, arg4/arg5 the read-only, possibly NULL argument array and
 * its size (zero allowed).
 */
static const struct bpf_func_proto bpf_seq_printf_proto = {
        .func                = bpf_seq_printf,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &btf_seq_file_ids[0],
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type      = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
        .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
};

/*
 * bpf_seq_write - append @len bytes from @data to seq_file @m.
 * Returns -EOVERFLOW when seq_write() returns non-zero, 0 otherwise.
 */
BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
{
        return seq_write(m, data, len) ? -EOVERFLOW : 0;
}

/*
 * Descriptor for bpf_seq_write(): GPL-only, integer return; arg1 is a BTF
 * pointer to struct seq_file, arg2/arg3 the read-only data buffer and its
 * size (zero allowed).
 */
static const struct bpf_func_proto bpf_seq_write_proto = {
        .func                = bpf_seq_write,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &btf_seq_file_ids[0],
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
};

/*
 * bpf_seq_printf_btf - show the object referenced by @ptr in seq_file @m,
 * using BTF type information.
 *
 * bpf_btf_printf_prepare() validates the request and resolves the BTF object
 * and type id; a non-zero result from it is returned unchanged. Otherwise
 * the result of btf_type_seq_show_flags() is returned.
 */
BPF_CALL_4(bpf_seq_printf_btf, struct seq_file *, m, struct btf_ptr *, ptr,
           u32, btf_ptr_size, u64, flags)
{
        const struct btf *type_btf;
        s32 type_id;
        int err;

        err = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags,
                                     &type_btf, &type_id);
        if (err)
                return err;

        return btf_type_seq_show_flags(type_btf, type_id, ptr->ptr, m, flags);
}

/*
 * Descriptor for bpf_seq_printf_btf(): GPL-only, integer return; arg1 is a
 * BTF pointer to struct seq_file, arg2/arg3 the read-only struct btf_ptr
 * buffer and its size (zero allowed), arg4 the unchecked flags value.
 */
static const struct bpf_func_proto bpf_seq_printf_btf_proto = {
        .func                = bpf_seq_printf_btf,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &btf_seq_file_ids[0],
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type        = ARG_ANYTHING,
};

/*
 * Read the perf event stored in slot (@flags & BPF_F_INDEX_MASK) of the
 * array map @map; BPF_F_CURRENT_CPU selects the slot of the executing CPU.
 *
 * @value, @enabled and @running are passed straight to
 * perf_event_read_local().
 *
 * Return: -EINVAL if @flags has bits outside BPF_F_INDEX_MASK, -E2BIG if the
 * slot is beyond max_entries, -ENOENT if the slot is empty, otherwise the
 * result of perf_event_read_local().
 */
static __always_inline int
get_map_perf_counter(struct bpf_map *map, u64 flags,
                     u64 *value, u64 *enabled, u64 *running)
{
        struct bpf_array *perf_array = container_of(map, struct bpf_array, map);
        unsigned int this_cpu = smp_processor_id();
        struct bpf_event_entry *entry;
        u64 slot;

        if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
                return -EINVAL;

        slot = flags & BPF_F_INDEX_MASK;
        if (slot == BPF_F_CURRENT_CPU)
                slot = this_cpu;
        if (unlikely(slot >= perf_array->map.max_entries))
                return -E2BIG;

        entry = READ_ONCE(perf_array->ptrs[slot]);
        if (!entry)
                return -ENOENT;

        return perf_event_read_local(entry->event, value, enabled, running);
}

/*
 * bpf_perf_event_read - return the counter value of the perf event selected
 * by @flags in @map, or the negative error from get_map_perf_counter().
 */
BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
{
        u64 counter = 0;
        int err = get_map_perf_counter(map, flags, &counter, NULL, NULL);

        /*
         * Errors and counter values share the return value, so valid
         * counter values in the [-22..-2] range cannot be told apart from
         * errors. Ugly, but that's uapi.
         */
        if (!err)
                return counter;

        return err;
}

/*
 * Descriptor for bpf_perf_event_read(): GPL-only, integer return; arg1 is a
 * constant map pointer, arg2 the unchecked flags value.
 */
static const struct bpf_func_proto bpf_perf_event_read_proto = {
        .func                = bpf_perf_event_read,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_CONST_MAP_PTR,
        .arg2_type        = ARG_ANYTHING,
};

/*
 * bpf_perf_event_read_value - fill @buf with the counter, enabled and running
 * values of the perf event selected by @flags in @map.
 *
 * @size must equal sizeof(struct bpf_perf_event_value), else -EINVAL. On any
 * failure the first @size bytes of @buf are zeroed and the error is returned;
 * on success 0 is returned.
 */
BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
           struct bpf_perf_event_value *, buf, u32, size)
{
        int err;

        if (likely(size == sizeof(struct bpf_perf_event_value))) {
                err = get_map_perf_counter(map, flags, &buf->counter,
                                           &buf->enabled, &buf->running);
                if (likely(!err))
                        return 0;
        } else {
                err = -EINVAL;
        }

        /* Never hand back a partially filled or untouched buffer. */
        memset(buf, 0, size);
        return err;
}

/*
 * Descriptor for bpf_perf_event_read_value(): GPL-only, integer return; arg1
 * is a constant map pointer, arg2 the unchecked flags value, arg3/arg4 the
 * uninitialized output buffer and its (non-zero) size.
 */
static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
        .func                = bpf_perf_event_read_value,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_CONST_MAP_PTR,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/*
 * Emit the sample @sd with registers @regs on the perf event stored in slot
 * (@flags & BPF_F_INDEX_MASK) of the array map @map; BPF_F_CURRENT_CPU
 * selects the slot of the executing CPU.
 *
 * Return: -E2BIG if the slot is beyond max_entries, -ENOENT if it is empty,
 * -EINVAL if the event is not a PERF_COUNT_SW_BPF_OUTPUT software event,
 * -EOPNOTSUPP if the event is not on the executing CPU, otherwise the result
 * of perf_event_output().
 */
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
                        u64 flags, struct perf_sample_data *sd)
{
        struct bpf_array *perf_array = container_of(map, struct bpf_array, map);
        unsigned int this_cpu = smp_processor_id();
        u64 slot = flags & BPF_F_INDEX_MASK;
        struct bpf_event_entry *entry;
        struct perf_event *target;

        if (slot == BPF_F_CURRENT_CPU)
                slot = this_cpu;
        if (unlikely(slot >= perf_array->map.max_entries))
                return -E2BIG;

        entry = READ_ONCE(perf_array->ptrs[slot]);
        if (!entry)
                return -ENOENT;

        target = entry->event;
        if (unlikely(target->attr.type != PERF_TYPE_SOFTWARE ||
                     target->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
                return -EINVAL;

        if (unlikely(target->oncpu != this_cpu))
                return -EOPNOTSUPP;

        return perf_event_output(target, sd, regs);
}

/*
 * Support executing tracepoints in normal, irq, and nmi context that each call
 * bpf_perf_event_output: one perf_sample_data slot per nesting level.
 */
struct bpf_trace_sample_data {
        struct perf_sample_data sds[3];
};

/* Per-CPU sample slots and current nesting depth for bpf_perf_event_output(). */
static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
static DEFINE_PER_CPU(int, bpf_trace_nest_level);
/*
 * bpf_perf_event_output - emit @size bytes at @data as raw sample data on the
 * perf event selected by @flags in @map, using @regs as the register set.
 *
 * A per-CPU nesting counter picks one of the per-CPU perf_sample_data slots.
 * Returns -EBUSY (with a one-time warning) when nesting exceeds the number of
 * slots, -EINVAL when @flags has bits outside BPF_F_INDEX_MASK, otherwise the
 * result of __bpf_perf_event_output().
 */
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
           u64, flags, void *, data, u64, size)
{
        struct bpf_trace_sample_data *sds;
        /* Single raw fragment describing the caller's buffer. */
        struct perf_raw_record raw = {
                .frag = {
                        .size = size,
                        .data = data,
                },
        };
        struct perf_sample_data *sd;
        int nest_level, err;

        /* Stay on this CPU while its per-CPU slot and counter are in use. */
        preempt_disable();
        sds = this_cpu_ptr(&bpf_trace_sds);
        nest_level = this_cpu_inc_return(bpf_trace_nest_level);

        if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
                err = -EBUSY;
                goto out;
        }

        /* nest_level is 1-based after the increment. */
        sd = &sds->sds[nest_level - 1];

        if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
                err = -EINVAL;
                goto out;
        }

        perf_sample_data_init(sd, 0, 0);
        perf_sample_save_raw_data(sd, &raw);

        err = __bpf_perf_event_output(regs, map, flags, sd);
out:
        /* Every exit path undoes the increment and the preempt_disable(). */
        this_cpu_dec(bpf_trace_nest_level);
        preempt_enable();
        return err;
}

/*
 * Descriptor for bpf_perf_event_output(): GPL-only, integer return; arg1 is
 * the program context, arg2 a constant map pointer, arg3 the unchecked flags
 * value, arg4/arg5 the read-only data buffer and its size (zero allowed).
 */
static const struct bpf_func_proto bpf_perf_event_output_proto = {
        .func                = bpf_perf_event_output,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* Per-CPU nesting depth of bpf_event_output(). */
static DEFINE_PER_CPU(int, bpf_event_output_nest_level);
/* One pt_regs slot per nesting level of bpf_event_output(). */
struct bpf_nested_pt_regs {
        struct pt_regs regs[3];
};
/* Per-CPU register and sample slots, indexed by nesting level. */
static DEFINE_PER_CPU(struct bpf_nested_pt_regs, bpf_pt_regs);
static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds);

/*
 * bpf_event_output - emit a raw perf sample made of @meta followed by an
 * optional context fragment.
 * @map: perf event array map selecting the target event
 * @flags: passed to __bpf_perf_event_output() to select the event slot
 * @meta: first fragment of the raw record
 * @meta_size: size of @meta in bytes
 * @ctx: data of the second fragment
 * @ctx_size: size of the second fragment; when 0 no second fragment is linked
 * @ctx_copy: copy callback stored in the second fragment
 *
 * The caller's registers are captured with perf_fetch_caller_regs() into a
 * per-CPU slot selected by the nesting level.
 *
 * Return: -EBUSY (with a one-time warning) when nesting exceeds the number of
 * per-CPU slots, otherwise the result of __bpf_perf_event_output().
 */
u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
                     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
        struct perf_raw_frag frag = {
                .copy                = ctx_copy,
                .size                = ctx_size,
                .data                = ctx,
        };
        struct perf_raw_record raw = {
                .frag = {
                        {
                                /* Chain the ctx fragment only if it is non-empty. */
                                .next        = ctx_size ? &frag : NULL,
                        },
                        .size        = meta_size,
                        .data        = meta,
                },
        };
        struct perf_sample_data *sd;
        struct pt_regs *regs;
        int nest_level;
        u64 ret;

        /* Stay on this CPU while its per-CPU slots and counter are in use. */
        preempt_disable();
        nest_level = this_cpu_inc_return(bpf_event_output_nest_level);

        if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) {
                ret = -EBUSY;
                goto out;
        }
        /* nest_level is 1-based after the increment. */
        sd = this_cpu_ptr(&bpf_misc_sds.sds[nest_level - 1]);
        regs = this_cpu_ptr(&bpf_pt_regs.regs[nest_level - 1]);

        perf_fetch_caller_regs(regs);
        perf_sample_data_init(sd, 0, 0);
        perf_sample_save_raw_data(sd, &raw);

        ret = __bpf_perf_event_output(regs, map, flags, sd);
out:
        /* Every exit path undoes the increment and the preempt_disable(). */
        this_cpu_dec(bpf_event_output_nest_level);
        preempt_enable();
        return ret;
}

/* bpf_get_current_task - return the address of `current`, cast to long. */
BPF_CALL_0(bpf_get_current_task)
{
        return (long) current;
}

/* Descriptor for bpf_get_current_task(): GPL-only, no arguments, integer return. */
const struct bpf_func_proto bpf_get_current_task_proto = {
        .func                = bpf_get_current_task,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
};

/*
 * bpf_get_current_task_btf - return the address of `current`, cast to
 * unsigned long; the matching proto types the result as a BTF pointer.
 */
BPF_CALL_0(bpf_get_current_task_btf)
{
        return (unsigned long) current;
}

/*
 * Descriptor for bpf_get_current_task_btf(): GPL-only, no arguments; the
 * return value is a trusted BTF pointer of the task type taken from
 * btf_tracing_ids[BTF_TRACING_TYPE_TASK].
 */
const struct bpf_func_proto bpf_get_current_task_btf_proto = {
        .func                = bpf_get_current_task_btf,
        .gpl_only        = true,
        .ret_type        = RET_PTR_TO_BTF_ID_TRUSTED,
        .ret_btf_id        = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};

/* bpf_task_pt_regs - return task_pt_regs(@task) as an unsigned long. */
BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task)
{
        return (unsigned long) task_pt_regs(task);
}

BTF_ID_LIST(bpf_task_pt_regs_ids)
BTF_ID(struct, pt_regs)

/*
 * Descriptor for bpf_task_pt_regs(): GPL-only; arg1 is a BTF pointer of the
 * task type from btf_tracing_ids[BTF_TRACING_TYPE_TASK], the return value a
 * BTF pointer to struct pt_regs.
 */
const struct bpf_func_proto bpf_task_pt_regs_proto = {
        .func                = bpf_task_pt_regs,
        .gpl_only        = true,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
        .ret_type        = RET_PTR_TO_BTF_ID,
        .ret_btf_id        = &bpf_task_pt_regs_ids[0],
};

/*
 * bpf_current_task_under_cgroup - test `current` against the cgroup stored at
 * index @idx of the array map @map.
 *
 * Returns -E2BIG if @idx is beyond max_entries, -EAGAIN if the slot is empty,
 * otherwise the result of task_under_cgroup_hierarchy().
 */
BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
{
        struct bpf_array *cgrp_array = container_of(map, struct bpf_array, map);
        struct cgroup *ancestor;

        if (unlikely(idx >= cgrp_array->map.max_entries))
                return -E2BIG;

        ancestor = READ_ONCE(cgrp_array->ptrs[idx]);
        if (unlikely(ancestor == NULL))
                return -EAGAIN;

        return task_under_cgroup_hierarchy(current, ancestor);
}

/* Prototype of bpf_current_task_under_cgroup(): constant map pointer plus
 * an unconstrained index, integer return, not GPL-only.
 */
static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
        .func           = bpf_current_task_under_cgroup,
        .arg1_type      = ARG_CONST_MAP_PTR,
        .arg2_type      = ARG_ANYTHING,
        .ret_type       = RET_INTEGER,
        .gpl_only       = false,
};

/* A signal request deferred through irq_work: filled in by
 * bpf_send_signal_common() and consumed by do_bpf_send_signal().
 */
struct send_signal_irq_work {
        struct irq_work irq_work;       /* embedded work item, container_of() anchor */
        struct task_struct *task;       /* signal target; reference dropped by the handler */
        u32 sig;                        /* signal number */
        enum pid_type type;             /* passed through to group_send_sig_info() */
};

/* One deferred-signal slot per CPU; bpf_send_signal_common() returns
 * -EBUSY while the local slot's irq_work is still busy.
 */
static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);

/* irq_work callback: deliver the signal recorded in the enclosing
 * send_signal_irq_work, then drop the task reference taken when the
 * request was queued.
 */
static void do_bpf_send_signal(struct irq_work *entry)
{
        struct send_signal_irq_work *pending =
                container_of(entry, struct send_signal_irq_work, irq_work);
        struct task_struct *target = pending->task;

        group_send_sig_info(pending->sig, SEND_SIG_PRIV, target, pending->type);
        put_task_struct(target);
}

/* Send @sig to the current task, scoped by @type.
 *
 * Returns -EPERM when the current task must not be signalled, -EINVAL or
 * -EBUSY on the deferred path (see below), 0 once a deferred request is
 * queued, otherwise the result of group_send_sig_info().
 */
static int bpf_send_signal_common(u32 sig, enum pid_type type)
{
        struct send_signal_irq_work *deferred;

        /* As with bpf_probe_write_user, the task has to be in a sound
         * state and kernel memory access has to be permitted. pid 1 is
         * refused as well, to avoid a kernel panic.
         */
        if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)) ||
            unlikely(!nmi_uaccess_okay()) ||
            unlikely(is_global_init(current)))
                return -EPERM;

        /* With interrupts enabled the signal is sent directly. */
        if (!irqs_disabled())
                return group_send_sig_info(sig, SEND_SIG_PRIV, current, type);

        /* Validate the signal up front: an error raised later inside the
         * deferred irq_work would be lost.
         */
        if (unlikely(!valid_signal(sig)))
                return -EINVAL;

        deferred = this_cpu_ptr(&send_signal_work);
        if (irq_work_is_busy(&deferred->irq_work))
                return -EBUSY;

        /* Record the current task as the target now; by the time queued
         * irq works run, the current task may be a different one.
         */
        deferred->task = get_task_struct(current);
        deferred->sig = sig;
        deferred->type = type;
        irq_work_queue(&deferred->irq_work);

        return 0;
}

BPF_CALL_1(bpf_send_signal, u32, sig)
{
        return bpf_send_signal_common(sig, PIDTYPE_TGID);
}

/* Prototype of bpf_send_signal(): one unconstrained argument, integer
 * return, not GPL-only.
 */
static const struct bpf_func_proto bpf_send_signal_proto = {
        .func           = bpf_send_signal,
        .arg1_type      = ARG_ANYTHING,
        .ret_type       = RET_INTEGER,
        .gpl_only       = false,
};

BPF_CALL_1(bpf_send_signal_thread, u32, sig)
{
        return bpf_send_signal_common(sig, PIDTYPE_PID);
}

/* Prototype of bpf_send_signal_thread(): one unconstrained argument,
 * integer return, not GPL-only.
 */
static const struct bpf_func_proto bpf_send_signal_thread_proto = {
        .func           = bpf_send_signal_thread,
        .arg1_type      = ARG_ANYTHING,
        .ret_type       = RET_INTEGER,
        .gpl_only       = false,
};

/* Helper: render @path with d_path() into @buf (@sz bytes) and move the
 * result to the start of @buf.
 *
 * Returns 0 when @sz is 0, a negative error from the copy or from
 * d_path(), otherwise the number of bytes moved to the front of @buf.
 */
BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz)
{
        struct path snapshot;
        char *start;
        long ret;

        if (sz == 0)
                return 0;

        /*
         * The verifier already treats the path pointer as trusted, but
         * copy it with a fault-tolerant read anyway as a safety net
         * against a potentially broken verifier.
         */
        ret = copy_from_kernel_nofault(&snapshot, path, sizeof(*path));
        if (ret < 0)
                return ret;

        start = d_path(&snapshot, buf, sz);
        if (IS_ERR(start))
                return PTR_ERR(start);

        /* The string runs from @start to the end of @buf; shift it down. */
        ret = buf + sz - start;
        memmove(buf, start, ret);

        return ret;
}

/* BTF IDs of the functions checked by bpf_d_path_allowed(): a program
 * that is neither an iterator nor an LSM program may use bpf_d_path()
 * only when its attach_btf_id is in this set. Some entries depend on the
 * security configuration options below.
 */
BTF_SET_START(btf_allowlist_d_path)
#ifdef CONFIG_SECURITY
BTF_ID(func, security_file_permission)
BTF_ID(func, security_inode_getattr)
BTF_ID(func, security_file_open)
#endif
#ifdef CONFIG_SECURITY_PATH
BTF_ID(func, security_path_truncate)
#endif
BTF_ID(func, vfs_truncate)
BTF_ID(func, vfs_fallocate)
BTF_ID(func, dentry_open)
BTF_ID(func, vfs_getattr)
BTF_ID(func, filp_close)
BTF_SET_END(btf_allowlist_d_path)

/* Decide whether @prog may call bpf_d_path():
 *  - tracing programs attached as iterators: always;
 *  - LSM programs: only when attached to a sleepable hook;
 *  - anything else: only when the attach target is in
 *    btf_allowlist_d_path.
 */
static bool bpf_d_path_allowed(const struct bpf_prog *prog)
{
        switch (prog->type) {
        case BPF_PROG_TYPE_TRACING:
                if (prog->expected_attach_type == BPF_TRACE_ITER)
                        return true;
                break;
        case BPF_PROG_TYPE_LSM:
                return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
        default:
                break;
        }

        return btf_id_set_contains(&btf_allowlist_d_path,
                                   prog->aux->attach_btf_id);
}

/* BTF ID of struct path; used as the BTF ID of bpf_d_path()'s first argument. */
BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path)

/* Prototype of bpf_d_path(): BTF pointer to struct path, output memory
 * and its size (zero permitted); availability is gated per program by
 * bpf_d_path_allowed().
 */
static const struct bpf_func_proto bpf_d_path_proto = {
        .func           = bpf_d_path,
        .allowed        = bpf_d_path_allowed,
        .arg1_type      = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id    = &bpf_d_path_btf_ids[0],
        .arg2_type      = ARG_PTR_TO_MEM,
        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
        .ret_type       = RET_INTEGER,
        .gpl_only       = false,
};

/* Union of every BTF_F_* flag accepted by bpf_btf_printf_prepare(). */
#define BTF_F_ALL        (BTF_F_COMPACT | \
                         BTF_F_NONAME  | \
                         BTF_F_PTR_RAW | \
                         BTF_F_ZERO)

/* Validate the arguments of a BTF-based print request and resolve them.
 *
 * @ptr:          caller-supplied descriptor; its type_id selects the type
 * @btf_ptr_size: must equal sizeof(struct btf_ptr)
 * @flags:        may only contain bits from BTF_F_ALL
 * @btf:          out: the vmlinux BTF object
 * @btf_id:       out: the type ID taken from @ptr
 *
 * Returns 0 on success; -EINVAL for bad flags, a bad size, an unavailable
 * BTF object (or the error encoded in it) or a non-positive type_id;
 * -ENOENT when the ID is not positive as an s32 or no such type exists.
 */
static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
                                  u64 flags, const struct btf **btf,
                                  s32 *btf_id)
{
        const struct btf_type *type;

        if (unlikely(flags & ~(BTF_F_ALL)))
                return -EINVAL;

        if (btf_ptr_size != sizeof(struct btf_ptr))
                return -EINVAL;

        *btf = bpf_get_btf_vmlinux();
        if (IS_ERR_OR_NULL(*btf))
                return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL;

        if (!(ptr->type_id > 0))
                return -EINVAL;
        *btf_id = ptr->type_id;

        /* The stored s32 is re-checked before it is used for the lookup. */
        if (*btf_id <= 0)
                return -ENOENT;

        type = btf_type_by_id(*btf, *btf_id);
        if (!type)
                return -ENOENT;

        return 0;
}

/* Helper: format the object described by @ptr into @str (@str_size
 * bytes) according to its BTF type.
 *
 * Returns the error from bpf_btf_printf_prepare() when validation fails,
 * otherwise the result of btf_type_snprintf_show().
 */
BPF_CALL_5(bpf_snprintf_btf, char *, str, u32, str_size, struct btf_ptr *, ptr,
           u32, btf_ptr_size, u64, flags)
{
        const struct btf *btf;
        s32 btf_id;
        int err;

        err = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id);
        if (err)
                return err;

        return btf_type_snprintf_show(btf, btf_id, ptr->ptr, str, str_size,
                                      flags);
}

/* Prototype of bpf_snprintf_btf(): output buffer and size, read-only
 * struct btf_ptr memory and its size, then unconstrained flags.
 */
const struct bpf_func_proto bpf_snprintf_btf_proto = {
        .func           = bpf_snprintf_btf,
        .arg1_type      = ARG_PTR_TO_MEM,
        .arg2_type      = ARG_CONST_SIZE,
        .arg3_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type      = ARG_CONST_SIZE,
        .arg5_type      = ARG_ANYTHING,
        .ret_type       = RET_INTEGER,
        .gpl_only       = false,
};

/* Helper: return the u64 stored two slots before @ctx. */
BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx)
{
        /* This helper call is inlined by verifier. */
        u64 *ctx_words = ctx;

        return ctx_words[-2];
}

/* Prototype of bpf_get_func_ip_tracing(): GPL-only, takes the program
 * context, integer return.
 */
static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
        .func           = bpf_get_func_ip_tracing,
        .arg1_type      = ARG_PTR_TO_CTX,
        .ret_type       = RET_INTEGER,
        .gpl_only       = true,
};

#ifdef CONFIG_X86_KERNEL_IBT
/* Translate @fentry_ip into the function entry address: when the
 * ENDBR_INSN_SIZE bytes preceding @fentry_ip hold an ENDBR instruction,
 * the entry is that instruction; otherwise @fentry_ip is returned as is.
 */
static unsigned long get_entry_ip(unsigned long fentry_ip)
{
        u32 instr;

        /* We want to be extra safe in case entry ip is on the page edge,
         * but otherwise we need to avoid get_kernel_nofault()'s overhead.
         */
        if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) {
                /* The preceding bytes are on the previous page: read them
                 * with the fault-tolerant accessor and give up on failure.
                 */
                if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE)))
                        return fentry_ip;
        } else {
                instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE);
        }
        if (is_endbr(instr))
                fentry_ip -= ENDBR_INSN_SIZE;
        return fentry_ip;
}
#else
/* Without IBT the entry address is the given address. The expansion is
 * parenthesized so that the macro behaves like a function call inside a
 * larger expression.
 */
#define get_entry_ip(fentry_ip) (fentry_ip)
#endif

/* Helper: return the address of the probed function for a kprobe or
 * uprobe program.
 *
 * For a uprobe run context the breakpoint address recorded in the
 * dispatch data is returned. For kprobes the result is 0 unless a kprobe
 * is running and it is flagged as placed on function entry.
 */
BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
{
        struct bpf_trace_run_ctx *run_ctx __maybe_unused;
        struct kprobe *kp;

#ifdef CONFIG_UPROBES
        run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
        if (run_ctx->is_uprobe) {
                struct uprobe_dispatch_data *udd;

                udd = (struct uprobe_dispatch_data *)current->utask->vaddr;
                return udd->bp_addr;
        }
#endif

        kp = kprobe_running();
        if (kp && (kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY))
                return get_entry_ip((uintptr_t)kp->addr);

        return 0;
}

/* Prototype of bpf_get_func_ip_kprobe(): GPL-only, takes the program
 * context, integer return.
 */
static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
        .func           = bpf_get_func_ip_kprobe,
        .arg1_type      = ARG_PTR_TO_CTX,
        .ret_type       = RET_INTEGER,
        .gpl_only       = true,
};

/* Helper: return bpf_kprobe_multi_entry_ip() for the current task's BPF
 * run context; @regs is not used.
 */
BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs)
{
        struct bpf_run_ctx *run_ctx = current->bpf_ctx;

        return bpf_kprobe_multi_entry_ip(run_ctx);
}

/* Prototype of bpf_get_func_ip_kprobe_multi(): takes the program
 * context, integer return, not GPL-only.
 */
static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = {
        .func           = bpf_get_func_ip_kprobe_multi,
        .arg1_type      = ARG_PTR_TO_CTX,
        .ret_type       = RET_INTEGER,
        .gpl_only       = false,
};

/* Helper: return bpf_kprobe_multi_cookie() for the current task's BPF
 * run context; @regs is not used.
 */
BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs)
{
        struct bpf_run_ctx *run_ctx = current->bpf_ctx;

        return bpf_kprobe_multi_cookie(run_ctx);
}

/* Prototype of bpf_get_attach_cookie_kprobe_multi(): takes the program
 * context, integer return, not GPL-only.
 */
static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = {
        .func           = bpf_get_attach_cookie_kprobe_multi,
        .arg1_type      = ARG_PTR_TO_CTX,
        .ret_type       = RET_INTEGER,
        .gpl_only       = false,
};

/* Helper: return bpf_uprobe_multi_entry_ip() for the current task's BPF
 * run context; @regs is not used.
 */
BPF_CALL_1(bpf_get_func_ip_uprobe_multi, struct pt_regs *, regs)
{
        struct bpf_run_ctx *run_ctx = current->bpf_ctx;

        return bpf_uprobe_multi_entry_ip(run_ctx);
}

/* Prototype of bpf_get_func_ip_uprobe_multi(): takes the program
 * context, integer return, not GPL-only.
 */
static const struct bpf_func_proto bpf_get_func_ip_proto_uprobe_multi = {
        .func           = bpf_get_func_ip_uprobe_multi,
        .arg1_type      = ARG_PTR_TO_CTX,
        .ret_type       = RET_INTEGER,
        .gpl_only       = false,
};

/* Helper: return bpf_uprobe_multi_cookie() for the current task's BPF
 * run context; @regs is not used.
 */
BPF_CALL_1(bpf_get_attach_cookie_uprobe_multi, struct pt_regs *, regs)
{
        struct bpf_run_ctx *run_ctx = current->bpf_ctx;

        return bpf_uprobe_multi_cookie(run_ctx);
}

/* Prototype of bpf_get_attach_cookie_uprobe_multi(): takes the program
 * context, integer return, not GPL-only.
 */
static const struct bpf_func_proto bpf_get_attach_cookie_proto_umulti = {
        .func           = bpf_get_attach_cookie_uprobe_multi,
        .arg1_type      = ARG_PTR_TO_CTX,
        .ret_type       = RET_INTEGER,
        .gpl_only       = false,
};

/* Helper: return the bpf_cookie of the bpf_trace_run_ctx that embeds the
 * current task's BPF run context; @ctx is not used.
 */
BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx)
{
        struct bpf_trace_run_ctx *trace_ctx =
                container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);

        return trace_ctx->bpf_cookie;
}

/* Prototype of bpf_get_attach_cookie_trace(): takes the program context,
 * integer return, not GPL-only.
 */
static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = {
        .func           = bpf_get_attach_cookie_trace,
        .arg1_type      = ARG_PTR_TO_CTX,
        .ret_type       = RET_INTEGER,
        .gpl_only       = false,
};

/* Helper: return the bpf_cookie of the perf event referenced by @ctx. */
BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx)
{
        struct perf_event *event = ctx->event;

        return event->bpf_cookie;
}

/* Prototype of bpf_get_attach_cookie_pe(): takes the program context,
 * integer return, not GPL-only.
 */
static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = {
        .func           = bpf_get_attach_cookie_pe,
        .arg1_type      = ARG_PTR_TO_CTX,
        .ret_type       = RET_INTEGER,
        .gpl_only       = false,
};

/* Helper: return the bpf_cookie of the bpf_trace_run_ctx that embeds the
 * current task's BPF run context; @ctx is not used.
 */
BPF_CALL_1(bpf_get_attach_cookie_tracing, void *, ctx)
{
        return container_of(current->bpf_ctx, struct bpf_trace_run_ctx,
                            run_ctx)->bpf_cookie;
}

/* Prototype of bpf_get_attach_cookie_tracing(): takes the program
 * context, integer return, not GPL-only.
 */
static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = {
        .func           = bpf_get_attach_cookie_tracing,
        .arg1_type      = ARG_PTR_TO_CTX,
        .ret_type       = RET_INTEGER,
        .gpl_only       = false,
};

/* Helper: capture a branch-stack snapshot into @buf.
 *
 * @size is converted to a whole number of struct perf_branch_entry
 * records. Returns -EINVAL when @flags is non-zero, -ENOENT when no
 * entries were captured, otherwise the number of bytes written.
 */
BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
{
        static const u32 br_entry_size = sizeof(struct perf_branch_entry);
        u32 entry_cnt = size / br_entry_size;

        /* NOTE(review): the snapshot is taken before @flags is checked,
         * presumably so that as few branches as possible are executed
         * before capture - confirm before reordering these statements.
         */
        entry_cnt = static_call(perf_snapshot_branch_stack)(buf, entry_cnt);

        if (unlikely(flags))
                return -EINVAL;

        if (!entry_cnt)
                return -ENOENT;

        return entry_cnt * br_entry_size;
}

/* Prototype of bpf_get_branch_snapshot(): GPL-only, uninitialized output
 * memory plus its size (zero permitted), integer return.
 */
static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
        .func           = bpf_get_branch_snapshot,
        .arg1_type      = ARG_PTR_TO_UNINIT_MEM,
        .arg2_type      = ARG_CONST_SIZE_OR_ZERO,
        .ret_type       = RET_INTEGER,
        .gpl_only       = true,
};

/* Helper: store the @n-th u64 slot of @ctx in @value.
 *
 * The slot count is read from the u64 immediately before @ctx. Returns
 * -EINVAL when @n is not below that count, 0 otherwise.
 */
BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value)
{
        /* This helper call is inlined by verifier. */
        u64 *slots = ctx;
        u64 nr_args = slots[-1];

        if ((u64) n >= nr_args)
                return -EINVAL;

        *value = slots[n];
        return 0;
}

/* Prototype of get_func_arg(): program context, unconstrained index and
 * a pointer to a long for the result; integer return.
 */
static const struct bpf_func_proto bpf_get_func_arg_proto = {
        .func           = get_func_arg,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_PTR_TO_LONG,
        .ret_type       = RET_INTEGER,
};

/* Helper: store in @value the u64 slot of @ctx that follows the
 * arguments, i.e. the slot indexed by the count kept just before @ctx.
 * Always returns 0.
 */
BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
{
        /* This helper call is inlined by verifier. */
        u64 *slots = ctx;
        u64 nr_args = slots[-1];

        *value = slots[nr_args];
        return 0;
}

/* Prototype of get_func_ret(): program context and a pointer to a long
 * for the result; integer return.
 */
static const struct bpf_func_proto bpf_get_func_ret_proto = {
        .func           = get_func_ret,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_LONG,
        .ret_type       = RET_INTEGER,
};

/* Helper: return the u64 stored immediately before @ctx, which
 * get_func_arg() treats as the argument count.
 */
BPF_CALL_1(get_func_arg_cnt, void *, ctx)
{
        /* This helper call is inlined by verifier. */
        u64 *slots = ctx;

        return slots[-1];
}

/* Prototype of get_func_arg_cnt(): program context in, integer out. */
static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
        .func           = get_func_arg_cnt,
        .arg1_type      = ARG_PTR_TO_CTX,
        .ret_type       = RET_INTEGER,
};

#ifdef CONFIG_KEYS
__bpf_kfunc_start_defs();

/**
 * bpf_lookup_user_key - lookup a key by its serial
 * @serial: key handle serial number
 * @flags: lookup-specific flags
 *
 * Look up the key identified by *serial*, honouring *flags*. When the key
 * is found, the reference obtained by the lookup is stored in a newly
 * allocated bpf_key structure, which is returned.
 *
 * The caller must hand the bpf_key structure to bpf_key_put() when it is
 * no longer needed, so that the key reference is dropped and the
 * structure is freed.
 *
 * Permission checks are deferred to the time the key is used by one of
 * the available key-specific kfuncs.
 *
 * Pass KEY_LOOKUP_CREATE in *flags* to attempt creating a requested
 * special keyring (e.g. session keyring) if it doesn't yet exist.
 * Pass KEY_LOOKUP_PARTIAL in *flags* to look up a key without waiting for
 * the key construction, and to retrieve uninstantiated keys (keys
 * without data attached to them).
 *
 * Return: a bpf_key pointer with a valid key pointer if the key is found, a
 *         NULL pointer otherwise (including unknown flags and allocation
 *         failure).
 */
__bpf_kfunc struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
{
        struct bpf_key *bkey;
        key_ref_t key_ref;
        struct key *key;

        if (flags & ~KEY_LOOKUP_ALL)
                return NULL;

        /*
         * The caller's intent is unknown at this point, so the permission
         * check is left to whoever ends up using the key.
         */
        key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
        if (IS_ERR(key_ref))
                return NULL;

        key = key_ref_to_ptr(key_ref);

        bkey = kmalloc(sizeof(*bkey), GFP_KERNEL);
        if (!bkey) {
                /* Do not leak the reference taken by the lookup. */
                key_put(key);
                return NULL;
        }

        bkey->key = key;
        bkey->has_ref = true;

        return bkey;
}

/**
 * bpf_lookup_system_key - lookup a key by a system-defined ID
 * @id: key ID
 *
 * Obtain a bpf_key structure whose key pointer is set to the passed key
 * ID. The key pointer is marked as invalid, to prevent bpf_key_put() from
 * attempting to decrement the key reference count on that pointer. A key
 * pointer set in this way is currently understood only by
 * verify_pkcs7_signature().
 *
 * Set *id* to one of the values defined in include/linux/verification.h:
 * 0 for the primary keyring (immutable keyring of system keys);
 * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
 * (where keys can be added only if they are vouched for by existing keys
 * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
 * keyring (primarily used by the integrity subsystem to verify a kexec'ed
 * kernel image and, possibly, the initramfs signature).
 *
 * Return: a bpf_key pointer with an invalid key pointer set from the
 *         pre-determined ID on success, a NULL pointer otherwise
 */
__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
{
        struct bpf_key *bkey;

        if (system_keyring_id_check(id) < 0)
                return NULL;

        bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC);
        if (bkey) {
                /* The ID itself is stored in place of a key pointer. */
                bkey->key = (struct key *)(unsigned long)id;
                bkey->has_ref = false;
        }

        return bkey;
}

/**
 * bpf_key_put - decrement key reference count if key is valid and free bpf_key
 * @bkey: bpf_key structure
 *
 * Decrement the reference count of the key inside *bkey*, if the pointer
 * is valid, and free *bkey*.
 */
__bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
{
        if (bkey->has_ref)
                key_put(bkey->key);

        kfree(bkey);
}

#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
/**
 * bpf_verify_pkcs7_signature - verify a PKCS#7 signature
 * @data_ptr: data to verify
 * @sig_ptr: signature of the data
 * @trusted_keyring: keyring with keys trusted for signature verification
 *
 * Check the PKCS#7 signature in *sig_ptr* against the data in *data_ptr*
 * using the keys of the keyring referenced by *trusted_keyring*.
 *
 * Return: 0 on success, a negative value on error.
 */
__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
                               struct bpf_dynptr_kern *sig_ptr,
                               struct bpf_key *trusted_keyring)
{
        struct key *keyring = trusted_keyring->key;
        const void *data, *sig;
        u32 data_len, sig_len;

        if (trusted_keyring->has_ref) {
                int ret;

                /*
                 * Perform the permission check that bpf_lookup_user_key()
                 * deferred; see that function for details.
                 *
                 * key_task_permission() is not called here because
                 * keyring_search(), reached through find_asymmetric_key(),
                 * already does it.
                 */
                ret = key_validate(keyring);
                if (ret < 0)
                        return ret;
        }

        data_len = __bpf_dynptr_size(data_ptr);
        data = __bpf_dynptr_data(data_ptr, data_len);
        sig_len = __bpf_dynptr_size(sig_ptr);
        sig = __bpf_dynptr_data(sig_ptr, sig_len);

        return verify_pkcs7_signature(data, data_len, sig, sig_len, keyring,
                                      VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
                                      NULL);
}
#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */

__bpf_kfunc_end_defs();

/* kfunc ID set for the key helpers above, with per-kfunc flags:
 * both lookups acquire a reference and may return NULL, bpf_key_put()
 * releases it, and the user-key lookup and signature verification are
 * marked sleepable.
 */
BTF_KFUNCS_START(key_sig_kfunc_set)
BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
#endif
BTF_KFUNCS_END(key_sig_kfunc_set)

/* Registration record wrapping key_sig_kfunc_set. */
static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = {
        .set   = &key_sig_kfunc_set,
        .owner = THIS_MODULE,
};

/* Register the key kfunc set for BPF_PROG_TYPE_TRACING programs; run as
 * a late initcall. Returns the registration result.
 */
static int __init bpf_key_sig_kfuncs_init(void)
{
        int err;

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
                                        &bpf_key_sig_kfunc_set);
        return err;
}

late_initcall(bpf_key_sig_kfuncs_init);
#endif /* CONFIG_KEYS */

/* filesystem kfuncs */
__bpf_kfunc_start_defs();

/**
 * bpf_get_file_xattr - get xattr of a file
 * @file: file to get xattr from
 * @name__str: name of the xattr
 * @value_ptr: output buffer of the xattr value
 *
 * Read the xattr *name__str* of *file* into the buffer described by
 * *value_ptr*.
 *
 * For security reasons, only *name__str* with prefix "user." is allowed;
 * any other name yields -EPERM. A dynptr without writable data yields
 * -EINVAL, and a failed read-permission check on the inode is returned
 * unchanged.
 *
 * Return: the result of __vfs_getxattr() on success, a negative value on
 * error. NOTE(review): that result is presumably the length of the xattr
 * value rather than 0 - confirm against __vfs_getxattr().
 */
__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
                                   struct bpf_dynptr_kern *value_ptr)
{
        struct dentry *dentry;
        struct inode *inode;
        u32 value_len;
        void *value;
        int err;

        if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
                return -EPERM;

        value_len = __bpf_dynptr_size(value_ptr);
        value = __bpf_dynptr_data_rw(value_ptr, value_len);
        if (!value)
                return -EINVAL;

        dentry = file_dentry(file);
        inode = dentry->d_inode;

        err = inode_permission(&nop_mnt_idmap, inode, MAY_READ);
        if (err)
                return err;

        return __vfs_getxattr(dentry, inode, name__str, value, value_len);
}

__bpf_kfunc_end_defs();

/* kfunc ID set for the filesystem kfuncs; bpf_get_file_xattr() is marked
 * sleepable and requires trusted arguments.
 */
BTF_KFUNCS_START(fs_kfunc_set_ids)
BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
BTF_KFUNCS_END(fs_kfunc_set_ids)

/* Filter for fs_kfunc_set_ids: kfuncs outside the set are not restricted
 * here (0); kfuncs in the set are allowed (0) for LSM programs and
 * refused with -EACCES for every other program type.
 */
static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
        if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id))
                return 0;

        /* Only allow to attach from LSM hooks, to avoid recursion */
        if (prog->type == BPF_PROG_TYPE_LSM)
                return 0;

        return -EACCES;
}

/* Registration record wrapping fs_kfunc_set_ids together with its
 * per-program filter.
 */
static const struct btf_kfunc_id_set bpf_fs_kfunc_set = {
        .set    = &fs_kfunc_set_ids,
        .filter = bpf_get_file_xattr_filter,
        .owner  = THIS_MODULE,
};

/* Register the filesystem kfunc set for BPF_PROG_TYPE_LSM programs; run
 * as a late initcall. Returns the registration result.
 */
static int __init bpf_fs_kfuncs_init(void)
{
        int err;

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set);
        return err;
}

late_initcall(bpf_fs_kfuncs_init);

/* Map a helper ID to its prototype for tracing-type programs.
 *
 * @func_id: helper being requested
 * @prog:    program requesting it; consulted for the task-storage helpers
 *           and passed on to bpf_base_func_proto()
 *
 * Returns the matching prototype, NULL when a lockdown check refuses the
 * helper, or whatever bpf_base_func_proto() returns for IDs not listed
 * here. kprobe_prog_func_proto() and tp_prog_func_proto() fall back to
 * this function.
 */
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_map_lookup_elem:
                return &bpf_map_lookup_elem_proto;
        case BPF_FUNC_map_update_elem:
                return &bpf_map_update_elem_proto;
        case BPF_FUNC_map_delete_elem:
                return &bpf_map_delete_elem_proto;
        case BPF_FUNC_map_push_elem:
                return &bpf_map_push_elem_proto;
        case BPF_FUNC_map_pop_elem:
                return &bpf_map_pop_elem_proto;
        case BPF_FUNC_map_peek_elem:
                return &bpf_map_peek_elem_proto;
        case BPF_FUNC_map_lookup_percpu_elem:
                return &bpf_map_lookup_percpu_elem_proto;
        case BPF_FUNC_ktime_get_ns:
                return &bpf_ktime_get_ns_proto;
        case BPF_FUNC_ktime_get_boot_ns:
                return &bpf_ktime_get_boot_ns_proto;
        case BPF_FUNC_tail_call:
                return &bpf_tail_call_proto;
        case BPF_FUNC_get_current_task:
                return &bpf_get_current_task_proto;
        case BPF_FUNC_get_current_task_btf:
                return &bpf_get_current_task_btf_proto;
        case BPF_FUNC_task_pt_regs:
                return &bpf_task_pt_regs_proto;
        case BPF_FUNC_get_current_uid_gid:
                return &bpf_get_current_uid_gid_proto;
        case BPF_FUNC_get_current_comm:
                return &bpf_get_current_comm_proto;
        case BPF_FUNC_trace_printk:
                return bpf_get_trace_printk_proto();
        case BPF_FUNC_get_smp_processor_id:
                return &bpf_get_smp_processor_id_proto;
        case BPF_FUNC_get_numa_node_id:
                return &bpf_get_numa_node_id_proto;
        case BPF_FUNC_perf_event_read:
                return &bpf_perf_event_read_proto;
        case BPF_FUNC_current_task_under_cgroup:
                return &bpf_current_task_under_cgroup_proto;
        case BPF_FUNC_get_prandom_u32:
                return &bpf_get_prandom_u32_proto;
        /* The user-write and kernel-read helpers below are withheld (NULL)
         * when the corresponding security_locked_down() check returns a
         * negative value.
         */
        case BPF_FUNC_probe_write_user:
                return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
                       NULL : bpf_get_probe_write_proto();
        case BPF_FUNC_probe_read_user:
                return &bpf_probe_read_user_proto;
        case BPF_FUNC_probe_read_kernel:
                return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
                       NULL : &bpf_probe_read_kernel_proto;
        case BPF_FUNC_probe_read_user_str:
                return &bpf_probe_read_user_str_proto;
        case BPF_FUNC_probe_read_kernel_str:
                return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
                       NULL : &bpf_probe_read_kernel_str_proto;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
        /* The compat probe_read helpers are only offered under this
         * architecture option.
         */
        case BPF_FUNC_probe_read:
                return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
                       NULL : &bpf_probe_read_compat_proto;
        case BPF_FUNC_probe_read_str:
                return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
                       NULL : &bpf_probe_read_compat_str_proto;
#endif
#ifdef CONFIG_CGROUPS
        case BPF_FUNC_cgrp_storage_get:
                return &bpf_cgrp_storage_get_proto;
        case BPF_FUNC_cgrp_storage_delete:
                return &bpf_cgrp_storage_delete_proto;
#endif
        case BPF_FUNC_send_signal:
                return &bpf_send_signal_proto;
        case BPF_FUNC_send_signal_thread:
                return &bpf_send_signal_thread_proto;
        case BPF_FUNC_perf_event_read_value:
                return &bpf_perf_event_read_value_proto;
        case BPF_FUNC_ringbuf_output:
                return &bpf_ringbuf_output_proto;
        case BPF_FUNC_ringbuf_reserve:
                return &bpf_ringbuf_reserve_proto;
        case BPF_FUNC_ringbuf_submit:
                return &bpf_ringbuf_submit_proto;
        case BPF_FUNC_ringbuf_discard:
                return &bpf_ringbuf_discard_proto;
        case BPF_FUNC_ringbuf_query:
                return &bpf_ringbuf_query_proto;
        case BPF_FUNC_jiffies64:
                return &bpf_jiffies64_proto;
        case BPF_FUNC_get_task_stack:
                return &bpf_get_task_stack_proto;
        case BPF_FUNC_copy_from_user:
                return &bpf_copy_from_user_proto;
        case BPF_FUNC_copy_from_user_task:
                return &bpf_copy_from_user_task_proto;
        case BPF_FUNC_snprintf_btf:
                return &bpf_snprintf_btf_proto;
        case BPF_FUNC_per_cpu_ptr:
                return &bpf_per_cpu_ptr_proto;
        case BPF_FUNC_this_cpu_ptr:
                return &bpf_this_cpu_ptr_proto;
        /* Task storage: the *_recur_proto variants are selected when
         * bpf_prog_check_recur() is true for this program.
         */
        case BPF_FUNC_task_storage_get:
                if (bpf_prog_check_recur(prog))
                        return &bpf_task_storage_get_recur_proto;
                return &bpf_task_storage_get_proto;
        case BPF_FUNC_task_storage_delete:
                if (bpf_prog_check_recur(prog))
                        return &bpf_task_storage_delete_recur_proto;
                return &bpf_task_storage_delete_proto;
        case BPF_FUNC_for_each_map_elem:
                return &bpf_for_each_map_elem_proto;
        case BPF_FUNC_snprintf:
                return &bpf_snprintf_proto;
        case BPF_FUNC_get_func_ip:
                return &bpf_get_func_ip_proto_tracing;
        case BPF_FUNC_get_branch_snapshot:
                return &bpf_get_branch_snapshot_proto;
        case BPF_FUNC_find_vma:
                return &bpf_find_vma_proto;
        case BPF_FUNC_trace_vprintk:
                return bpf_get_trace_vprintk_proto();
        default:
                /* Anything not listed above is left to the base helper set. */
                return bpf_base_func_proto(func_id, prog);
        }
}

static bool is_kprobe_multi(const struct bpf_prog *prog)
{
        return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ||
               prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
}

static inline bool is_kprobe_session(const struct bpf_prog *prog)
{
        return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
}

/* Map a helper ID to its prototype for kprobe programs.
 *
 * The func-ip and attach-cookie helpers have kprobe-multi, uprobe-multi
 * and plain variants, chosen from @prog's expected attach type. IDs not
 * handled here are resolved by bpf_tracing_func_proto().
 */
static const struct bpf_func_proto *
kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_perf_event_output:
                return &bpf_perf_event_output_proto;
        case BPF_FUNC_get_stackid:
                return &bpf_get_stackid_proto;
        case BPF_FUNC_get_stack:
                return &bpf_get_stack_proto;
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
        case BPF_FUNC_override_return:
                return &bpf_override_return_proto;
#endif
        case BPF_FUNC_get_func_ip:
                if (is_kprobe_multi(prog))
                        return &bpf_get_func_ip_proto_kprobe_multi;
                return prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI ?
                       &bpf_get_func_ip_proto_uprobe_multi :
                       &bpf_get_func_ip_proto_kprobe;
        case BPF_FUNC_get_attach_cookie:
                if (is_kprobe_multi(prog))
                        return &bpf_get_attach_cookie_proto_kmulti;
                return prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI ?
                       &bpf_get_attach_cookie_proto_umulti :
                       &bpf_get_attach_cookie_proto_trace;
        default:
                return bpf_tracing_func_proto(func_id, prog);
        }
}

/* bpf+kprobe programs can access fields of 'struct pt_regs' */
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
                                        const struct bpf_prog *prog,
                                        struct bpf_insn_access_aux *info)
{
        if (off < 0 || off >= sizeof(struct pt_regs))
                return false;
        if (type != BPF_READ)
                return false;
        if (off % size != 0)
                return false;
        /*
         * Assertion for 32 bit to make sure last 8 byte access
         * (BPF_DW) to the last 4 byte member is disallowed.
         */
        if (off + size > sizeof(struct pt_regs))
                return false;

        return true;
}

/* Verifier callbacks for kprobe programs: context access validation and
 * helper lookup.
 */
const struct bpf_verifier_ops kprobe_verifier_ops = {
        .is_valid_access = kprobe_prog_is_valid_access,
        .get_func_proto  = kprobe_prog_func_proto,
};

/* Program ops for kprobe programs; no callbacks are set here. */
const struct bpf_prog_ops kprobe_prog_ops = {
};

/* Helper: tracepoint flavour of the perf event output helper.
 *
 * r1 points to the perf tracepoint buffer, whose first 8 bytes are hidden
 * from the bpf program and hold a pointer to 'struct pt_regs'. That
 * pointer is fetched and the common bpf_perf_event_output() body is
 * called inline with it.
 */
BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
           u64, flags, void *, data, u64, size)
{
        struct pt_regs **hidden_regs = tp_buff;

        return ____bpf_perf_event_output(*hidden_regs, map, flags, data, size);
}

/* Prototype of bpf_perf_event_output_tp(): GPL-only; context, constant
 * map pointer, unconstrained flags, read-only data and its size (zero
 * permitted); integer return.
 */
static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
        .func           = bpf_perf_event_output_tp,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_CONST_MAP_PTR,
        .arg3_type      = ARG_ANYTHING,
        .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
        .ret_type       = RET_INTEGER,
        .gpl_only       = true,
};

/* Helper: tracepoint flavour of bpf_get_stackid().
 *
 * The 'struct pt_regs' pointer is fetched from the start of the
 * tracepoint buffer as in bpf_perf_event_output_tp(). Here the other
 * helper is external and its body cannot be inlined, so the raw helper
 * function is called instead.
 */
BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
           u64, flags)
{
        struct pt_regs **hidden_regs = tp_buff;
        unsigned long regs_arg = (unsigned long) *hidden_regs;
        unsigned long map_arg = (unsigned long) map;

        return bpf_get_stackid(regs_arg, map_arg, flags, 0, 0);
}

/*
 * Prototype of bpf_get_stackid_tp() for the verifier:
 * (ctx, map, flags) -> integer. GPL-only.
 */
static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
        .func                = bpf_get_stackid_tp,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
};

BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
           u64, flags)
{
        /*
         * The 'struct pt_regs' pointer is read from the start of the
         * tracepoint buffer and passed, together with the caller's buffer,
         * to the raw bpf_get_stack() helper. The unused fifth argument is 0.
         */
        struct pt_regs **hidden_regs = tp_buff;
        unsigned long regs_arg = (unsigned long) *hidden_regs;
        unsigned long buf_arg = (unsigned long) buf;
        unsigned long size_arg = (unsigned long) size;

        return bpf_get_stack(regs_arg, buf_arg, size_arg, flags, 0);
}

/*
 * Prototype of bpf_get_stack_tp() for the verifier:
 * (ctx, buf, size, flags) -> integer. GPL-only. The buffer is typed
 * ARG_PTR_TO_UNINIT_MEM and its size may be zero.
 */
static const struct bpf_func_proto bpf_get_stack_proto_tp = {
        .func                = bpf_get_stack_tp,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type        = ARG_ANYTHING,
};

/*
 * Resolve a helper id to its prototype for tracepoint programs. Four helpers
 * have tracepoint-specific variants; every other id is delegated to
 * bpf_tracing_func_proto().
 */
static const struct bpf_func_proto *
tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_perf_event_output:
                proto = &bpf_perf_event_output_proto_tp;
                break;
        case BPF_FUNC_get_stackid:
                proto = &bpf_get_stackid_proto_tp;
                break;
        case BPF_FUNC_get_stack:
                proto = &bpf_get_stack_proto_tp;
                break;
        case BPF_FUNC_get_attach_cookie:
                proto = &bpf_get_attach_cookie_proto_trace;
                break;
        default:
                proto = bpf_tracing_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/*
 * Context access check for tracepoint programs. An access is accepted only
 * if it starts at or after the first sizeof(void *) bytes, starts below
 * PERF_MAX_TRACE_SIZE, is a read, and its offset is a multiple of its size.
 */
static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
                                    const struct bpf_prog *prog,
                                    struct bpf_insn_access_aux *info)
{
        /* Compile-time check: the trace size limit is a multiple of 8 bytes. */
        BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64));

        return !(off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) &&
               type == BPF_READ &&
               off % size == 0;
}

/*
 * Verifier callbacks for tracepoint programs: helper lookup via
 * tp_prog_func_proto(), context access checks via tp_prog_is_valid_access().
 */
const struct bpf_verifier_ops tracepoint_verifier_ops = {
        .get_func_proto  = tp_prog_func_proto,
        .is_valid_access = tp_prog_is_valid_access,
};

/* Program ops for tracepoint programs; no callbacks are set. */
const struct bpf_prog_ops tracepoint_prog_ops = {
};

/*
 * Read counter, enabled and running values of ctx->event into @buf.
 *
 * Returns 0 on success. If @size is not exactly
 * sizeof(struct bpf_perf_event_value) the result is -EINVAL; if
 * perf_event_read_local() fails its error is returned. On any failure the
 * first @size bytes of @buf are zeroed.
 */
BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx,
           struct bpf_perf_event_value *, buf, u32, size)
{
        int rc;

        if (likely(size == sizeof(struct bpf_perf_event_value))) {
                rc = perf_event_read_local(ctx->event, &buf->counter,
                                           &buf->enabled, &buf->running);
                if (likely(!rc))
                        return 0;
        } else {
                rc = -EINVAL;
        }

        /* Failure: do not leave stale or partial data in the caller's buffer. */
        memset(buf, 0, size);
        return rc;
}

/*
 * Prototype of bpf_perf_prog_read_value() for the verifier:
 * (ctx, buf, size) -> integer. GPL-only. The buffer is typed
 * ARG_PTR_TO_UNINIT_MEM and the size is typed ARG_CONST_SIZE.
 */
static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
         .func           = bpf_perf_prog_read_value,
         .gpl_only       = true,
         .ret_type       = RET_INTEGER,
         .arg1_type      = ARG_PTR_TO_CTX,
         .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
         .arg3_type      = ARG_CONST_SIZE,
};

/*
 * Copy branch records of the current perf sample into @buf.
 *
 * With BPF_F_GET_BRANCH_RECORDS_SIZE set in @flags, nothing is copied and
 * the number of bytes needed for all records is returned instead.
 *
 * Returns the number of bytes copied (or required), -EINVAL for unknown
 * flags, a NULL @buf or a @size that is not a multiple of the entry size,
 * and -ENOENT when the sample carries no branch stack.
 */
BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
           void *, buf, u32, size, u64, flags)
{
        static const u32 entry_sz = sizeof(struct perf_branch_entry);
        struct perf_branch_stack *stack = ctx->data->br_stack;
        u32 nbytes;

        if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE))
                return -EINVAL;

        if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK)) ||
            unlikely(!stack))
                return -ENOENT;

        /* Size query: report what a full copy would need. */
        if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE)
                return stack->nr * entry_sz;

        if (!buf)
                return -EINVAL;
        if (size % entry_sz != 0)
                return -EINVAL;

        /* Copy whole entries only, limited by the caller's buffer size. */
        nbytes = min_t(u32, stack->nr * entry_sz, size);
        memcpy(buf, stack->entries, nbytes);

        return nbytes;
}

/*
 * Prototype of bpf_read_branch_records() for the verifier:
 * (ctx, buf, size, flags) -> integer. GPL-only. The buffer may be NULL
 * (ARG_PTR_TO_MEM_OR_NULL) and the size may be zero.
 */
static const struct bpf_func_proto bpf_read_branch_records_proto = {
        .func           = bpf_read_branch_records,
        .gpl_only       = true,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_MEM_OR_NULL,
        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type      = ARG_ANYTHING,
};

/*
 * Resolve a helper id to its prototype for perf_event programs. Ids without
 * a dedicated entry here are delegated to bpf_tracing_func_proto().
 */
static const struct bpf_func_proto *
pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_perf_event_output:
                proto = &bpf_perf_event_output_proto_tp;
                break;
        case BPF_FUNC_get_stackid:
                proto = &bpf_get_stackid_proto_pe;
                break;
        case BPF_FUNC_get_stack:
                proto = &bpf_get_stack_proto_pe;
                break;
        case BPF_FUNC_perf_prog_read_value:
                proto = &bpf_perf_prog_read_value_proto;
                break;
        case BPF_FUNC_read_branch_records:
                proto = &bpf_read_branch_records_proto;
                break;
        case BPF_FUNC_get_attach_cookie:
                proto = &bpf_get_attach_cookie_proto_pe;
                break;
        default:
                proto = bpf_tracing_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/*
 * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
 * to avoid potential recursive reuse issue when/if tracepoints are added
 * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
 *
 * Since raw tracepoints run despite bpf_prog_active, support concurrent usage
 * in normal, irq, and nmi context.
 */
struct bpf_raw_tp_regs {
        /* One pt_regs slot per nesting level (three levels supported). */
        struct pt_regs regs[3];
};
/* Per-CPU scratch pt_regs slots handed out by get_bpf_raw_tp_regs(). */
static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
/* Per-CPU count of slots currently in use; indexes the next free slot. */
static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
/*
 * Claim the next free per-CPU pt_regs slot for a raw tracepoint helper.
 *
 * Increments this CPU's nesting level and returns the matching slot. If all
 * slots are already taken, warns once, undoes the increment and returns
 * ERR_PTR(-EBUSY). A successful call must be paired with
 * put_bpf_raw_tp_regs().
 */
static struct pt_regs *get_bpf_raw_tp_regs(void)
{
        struct bpf_raw_tp_regs *slots = this_cpu_ptr(&bpf_raw_tp_regs);
        int depth = this_cpu_inc_return(bpf_raw_tp_nest_level);

        if (!WARN_ON_ONCE(depth > ARRAY_SIZE(slots->regs)))
                return &slots->regs[depth - 1];

        /* Nested too deeply: give the level back before failing. */
        this_cpu_dec(bpf_raw_tp_nest_level);
        return ERR_PTR(-EBUSY);
}

/*
 * Release the slot taken by a successful get_bpf_raw_tp_regs() by
 * decrementing this CPU's nesting level.
 */
static void put_bpf_raw_tp_regs(void)
{
        this_cpu_dec(bpf_raw_tp_nest_level);
}

/*
 * bpf_perf_event_output() for raw tracepoint programs. The pt_regs are not
 * taken from the context; a per-CPU scratch slot is filled with the caller's
 * registers instead. Returns the error from get_bpf_raw_tp_regs() if no slot
 * is available, otherwise the result of the inlined output helper.
 */
BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
           struct bpf_map *, map, u64, flags, void *, data, u64, size)
{
        struct pt_regs *scratch = get_bpf_raw_tp_regs();
        int rc;

        if (!IS_ERR(scratch)) {
                perf_fetch_caller_regs(scratch);
                rc = ____bpf_perf_event_output(scratch, map, flags, data, size);
                put_bpf_raw_tp_regs();
        } else {
                rc = PTR_ERR(scratch);
        }

        return rc;
}

/*
 * Prototype of bpf_perf_event_output_raw_tp() for the verifier:
 * (ctx, map, flags, data, size) -> integer. GPL-only. The data argument is
 * declared as read-only memory (MEM_RDONLY) and the size may be zero.
 */
static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
        .func                = bpf_perf_event_output_raw_tp,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/*
 * Helper prototypes defined outside this block; they are returned by
 * tracing_prog_func_proto() when CONFIG_NET is enabled.
 */
extern const struct bpf_func_proto bpf_skb_output_proto;
extern const struct bpf_func_proto bpf_xdp_output_proto;
extern const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto;

/*
 * bpf_get_stackid() for raw tracepoint programs. Works like
 * bpf_get_stackid_tp() except that the pt_regs come from a per-CPU scratch
 * slot filled with the caller's registers. Returns the error from
 * get_bpf_raw_tp_regs() if no slot is available.
 */
BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
           struct bpf_map *, map, u64, flags)
{
        struct pt_regs *scratch = get_bpf_raw_tp_regs();
        int rc;

        if (!IS_ERR(scratch)) {
                perf_fetch_caller_regs(scratch);
                rc = bpf_get_stackid((unsigned long) scratch,
                                     (unsigned long) map, flags, 0, 0);
                put_bpf_raw_tp_regs();
        } else {
                rc = PTR_ERR(scratch);
        }

        return rc;
}

/*
 * Prototype of bpf_get_stackid_raw_tp() for the verifier:
 * (ctx, map, flags) -> integer. GPL-only.
 */
static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
        .func                = bpf_get_stackid_raw_tp,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
};

/*
 * bpf_get_stack() for raw tracepoint programs. The pt_regs come from a
 * per-CPU scratch slot filled with the caller's registers; @buf and @size
 * are forwarded to the raw bpf_get_stack() helper. Returns the error from
 * get_bpf_raw_tp_regs() if no slot is available.
 */
BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
           void *, buf, u32, size, u64, flags)
{
        struct pt_regs *scratch = get_bpf_raw_tp_regs();
        int rc;

        if (!IS_ERR(scratch)) {
                perf_fetch_caller_regs(scratch);
                rc = bpf_get_stack((unsigned long) scratch,
                                   (unsigned long) buf,
                                   (unsigned long) size, flags, 0);
                put_bpf_raw_tp_regs();
        } else {
                rc = PTR_ERR(scratch);
        }

        return rc;
}

static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
        .func                = bpf_get_stack_raw_tp,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type        = ARG_ANYTHING,
};

/*
 * Resolve a helper id to its prototype for raw tracepoint programs. Four
 * helpers have raw-tracepoint-specific variants; every other id is delegated
 * to bpf_tracing_func_proto().
 */
static const struct bpf_func_proto *
raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_perf_event_output:
                proto = &bpf_perf_event_output_proto_raw_tp;
                break;
        case BPF_FUNC_get_stackid:
                proto = &bpf_get_stackid_proto_raw_tp;
                break;
        case BPF_FUNC_get_stack:
                proto = &bpf_get_stack_proto_raw_tp;
                break;
        case BPF_FUNC_get_attach_cookie:
                proto = &bpf_get_attach_cookie_proto_tracing;
                break;
        default:
                proto = bpf_tracing_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/*
 * Resolve a helper id to its prototype for BPF_PROG_TYPE_TRACING programs.
 *
 * Some helpers are only offered for particular attach types (the seq_*
 * helpers for BPF_TRACE_ITER) or when the program has a trampoline (the
 * get_func_* helpers); NULL is returned for them otherwise. Ids not listed
 * here fall back to raw_tp_prog_func_proto() and, for iterator programs
 * only, to bpf_iter_get_func_proto().
 */
const struct bpf_func_proto *
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
#ifdef CONFIG_NET
        case BPF_FUNC_skb_output:
                return &bpf_skb_output_proto;
        case BPF_FUNC_xdp_output:
                return &bpf_xdp_output_proto;
        case BPF_FUNC_skc_to_tcp6_sock:
                return &bpf_skc_to_tcp6_sock_proto;
        case BPF_FUNC_skc_to_tcp_sock:
                return &bpf_skc_to_tcp_sock_proto;
        case BPF_FUNC_skc_to_tcp_timewait_sock:
                return &bpf_skc_to_tcp_timewait_sock_proto;
        case BPF_FUNC_skc_to_tcp_request_sock:
                return &bpf_skc_to_tcp_request_sock_proto;
        case BPF_FUNC_skc_to_udp6_sock:
                return &bpf_skc_to_udp6_sock_proto;
        case BPF_FUNC_skc_to_unix_sock:
                return &bpf_skc_to_unix_sock_proto;
        case BPF_FUNC_skc_to_mptcp_sock:
                return &bpf_skc_to_mptcp_sock_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_tracing_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_tracing_proto;
        case BPF_FUNC_sock_from_file:
                return &bpf_sock_from_file_proto;
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_ptr_cookie_proto;
        case BPF_FUNC_xdp_get_buff_len:
                return &bpf_xdp_get_buff_len_trace_proto;
#endif
        /* seq_* helpers: iterator programs only. */
        case BPF_FUNC_seq_printf:
                if (prog->expected_attach_type != BPF_TRACE_ITER)
                        return NULL;
                return &bpf_seq_printf_proto;
        case BPF_FUNC_seq_write:
                if (prog->expected_attach_type != BPF_TRACE_ITER)
                        return NULL;
                return &bpf_seq_write_proto;
        case BPF_FUNC_seq_printf_btf:
                if (prog->expected_attach_type != BPF_TRACE_ITER)
                        return NULL;
                return &bpf_seq_printf_btf_proto;
        case BPF_FUNC_d_path:
                return &bpf_d_path_proto;
        /* get_func_* helpers: only for programs that have a trampoline. */
        case BPF_FUNC_get_func_arg:
                if (!bpf_prog_has_trampoline(prog))
                        return NULL;
                return &bpf_get_func_arg_proto;
        case BPF_FUNC_get_func_ret:
                if (!bpf_prog_has_trampoline(prog))
                        return NULL;
                return &bpf_get_func_ret_proto;
        case BPF_FUNC_get_func_arg_cnt:
                if (!bpf_prog_has_trampoline(prog))
                        return NULL;
                return &bpf_get_func_arg_cnt_proto;
        case BPF_FUNC_get_attach_cookie:
                /* Allowed for raw_tp tracing programs or with a trampoline. */
                if ((prog->type == BPF_PROG_TYPE_TRACING &&
                     prog->expected_attach_type == BPF_TRACE_RAW_TP) ||
                    bpf_prog_has_trampoline(prog))
                        return &bpf_get_attach_cookie_proto_tracing;
                return NULL;
        default:
                break;
        }

        proto = raw_tp_prog_func_proto(func_id, prog);
        if (proto || prog->expected_attach_type != BPF_TRACE_ITER)
                return proto;

        return bpf_iter_get_func_proto(func_id, prog);
}

/*
 * Context access check for raw tracepoint programs; the decision is made
 * entirely by bpf_tracing_ctx_access(). @prog and @info are unused.
 */
static bool raw_tp_prog_is_valid_access(int off, int size,
                                        enum bpf_access_type type,
                                        const struct bpf_prog *prog,
                                        struct bpf_insn_access_aux *info)
{
        return bpf_tracing_ctx_access(off, size, type);
}

/*
 * Context access check for tracing programs; the decision is made entirely
 * by bpf_tracing_btf_ctx_access().
 */
static bool tracing_prog_is_valid_access(int off, int size,
                                         enum bpf_access_type type,
                                         const struct bpf_prog *prog,
                                         struct bpf_insn_access_aux *info)
{
        return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
}

/*
 * Weak default for the tracing test_run callback: always returns -ENOTSUPP.
 * Being __weak, it is replaced when another definition of the same symbol
 * is linked in.
 */
int __weak bpf_prog_test_run_tracing(struct bpf_prog *prog,
                                     const union bpf_attr *kattr,
                                     union bpf_attr __user *uattr)
{
        return -ENOTSUPP;
}

/*
 * Verifier callbacks for raw tracepoint programs: helper lookup via
 * raw_tp_prog_func_proto(), context access checks via
 * raw_tp_prog_is_valid_access().
 */
const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
        .get_func_proto  = raw_tp_prog_func_proto,
        .is_valid_access = raw_tp_prog_is_valid_access,
};

/*
 * Program ops for raw tracepoint programs. A test_run callback is only
 * provided when CONFIG_NET is enabled.
 */
const struct bpf_prog_ops raw_tracepoint_prog_ops = {
#ifdef CONFIG_NET
        .test_run = bpf_prog_test_run_raw_tp,
#endif
};

/*
 * Verifier callbacks for tracing programs: helper lookup via
 * tracing_prog_func_proto(), context access checks via
 * tracing_prog_is_valid_access().
 */
const struct bpf_verifier_ops tracing_verifier_ops = {
        .get_func_proto  = tracing_prog_func_proto,
        .is_valid_access = tracing_prog_is_valid_access,
};

/* Program ops for tracing programs; test_run is bpf_prog_test_run_tracing(). */
const struct bpf_prog_ops tracing_prog_ops = {
        .test_run = bpf_prog_test_run_tracing,
};

/*
 * Context access check for writable raw tracepoint programs.
 *
 * Offset 0 is special: it may only be read as a full u64, and the loaded
 * value is typed PTR_TO_TP_BUFFER for the verifier. Every access, including
 * an accepted one at offset 0, must also pass raw_tp_prog_is_valid_access().
 */
static bool raw_tp_writable_prog_is_valid_access(int off, int size,
                                                 enum bpf_access_type type,
                                                 const struct bpf_prog *prog,
                                                 struct bpf_insn_access_aux *info)
{
        const bool is_buf_slot = (off == 0);

        if (is_buf_slot && !(size == sizeof(u64) && type == BPF_READ))
                return false;
        if (is_buf_slot)
                info->reg_type = PTR_TO_TP_BUFFER;

        return raw_tp_prog_is_valid_access(off, size, type, prog, info);
}

/*
 * Verifier callbacks for writable raw tracepoint programs: same helper
 * lookup as plain raw tracepoints, with context access checked by
 * raw_tp_writable_prog_is_valid_access().
 */
const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = {
        .get_func_proto  = raw_tp_prog_func_proto,
        .is_valid_access = raw_tp_writable_prog_is_valid_access,
};

/* Program ops for writable raw tracepoint programs; no callbacks are set. */
const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = {
};

/*
 * Context access check for perf_event programs.
 *
 * Only reads inside struct bpf_perf_event_data are allowed. The offset must
 * be a multiple of the access size, except that when unsigned long is
 * 4 bytes wide an 8-byte access at an offset that is 4 modulo 8 is also
 * accepted. The sample_period and addr fields are u64 and may be read
 * narrowly; everything else must be read as exactly one long.
 */
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
                                    const struct bpf_prog *prog,
                                    struct bpf_insn_access_aux *info)
{
        const int field_size = sizeof(u64);

        if (off < 0 || off >= sizeof(struct bpf_perf_event_data) ||
            type != BPF_READ)
                return false;

        /* Misaligned: tolerated only in the 32-bit long, 8-byte access case. */
        if (off % size != 0 &&
            (sizeof(unsigned long) != 4 || size != 8 || off % size != 4))
                return false;

        switch (off) {
        case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
        case bpf_ctx_range(struct bpf_perf_event_data, addr):
                bpf_ctx_record_field_size(info, field_size);
                return bpf_ctx_narrow_access_ok(off, size, field_size);
        default:
                return size == sizeof(long);
        }
}

/*
 * Rewrite a program's load from struct bpf_perf_event_data into loads from
 * the kernel-side struct bpf_perf_event_data_kern.
 *
 * Two instructions are written to @insn_buf in every case: the first loads a
 * pointer out of the kernel context (data or regs), the second loads the
 * requested value through that pointer into si->dst_reg.
 *
 * Returns the number of instructions emitted.
 */
static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
                                      const struct bpf_insn *si,
                                      struct bpf_insn *insn_buf,
                                      struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

        switch (si->off) {
        case offsetof(struct bpf_perf_event_data, sample_period):
                /* dst = ctx->data; dst = dst->period (8-byte load) */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
                                                       data), si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_perf_event_data_kern, data));
                *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct perf_sample_data, period, 8,
                                                     target_size));
                break;
        case offsetof(struct bpf_perf_event_data, addr):
                /* dst = ctx->data; dst = dst->addr (8-byte load) */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
                                                       data), si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_perf_event_data_kern, data));
                *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct perf_sample_data, addr, 8,
                                                     target_size));
                break;
        default:
                /* dst = ctx->regs; dst = one long at the original offset */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
                                                       regs), si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_perf_event_data_kern, regs));
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg,
                                      si->off);
                break;
        }

        return insn - insn_buf;
}

/*
 * Verifier callbacks for perf_event programs: helper lookup, context access
 * validation and rewriting of context accesses into kernel-side loads.
 */
const struct bpf_verifier_ops perf_event_verifier_ops = {
        .get_func_proto                = pe_prog_func_proto,
        .is_valid_access        = pe_prog_is_valid_access,
        .convert_ctx_access        = pe_prog_convert_ctx_access,
};

/* Program ops for perf_event programs; no callbacks are set. */
const struct bpf_prog_ops perf_event_prog_ops = {
};

/*
 * Held by the attach, detach and query functions below while they read or
 * replace an event's tp_event->prog_array.
 */
static DEFINE_MUTEX(bpf_event_mutex);

/* Upper bound on programs per prog_array and on ids returned by a query. */
#define BPF_TRACE_MAX_PROGS 64

/*
 * Attach @prog, with @bpf_cookie, to the program array of @event's trace
 * event.
 *
 * Returns 0 on success, -EINVAL if a kprobe-override program is attached to
 * an event that is not on a function entry or not error-injectable, -EEXIST
 * if the event already has a program, -E2BIG if the array already holds
 * BPF_TRACE_MAX_PROGS programs, or the negative error returned by
 * bpf_prog_array_copy().
 */
int perf_event_attach_bpf_prog(struct perf_event *event,
                               struct bpf_prog *prog,
                               u64 bpf_cookie)
{
        struct bpf_prog_array *old_array;
        struct bpf_prog_array *new_array;
        int ret = -EEXIST;

        /*
         * Kprobe override only works if they are on the function entry,
         * and only if they are on the opt-in list.
         */
        if (prog->kprobe_override &&
            (!trace_kprobe_on_func_entry(event->tp_event) ||
             !trace_kprobe_error_injectable(event->tp_event)))
                return -EINVAL;

        mutex_lock(&bpf_event_mutex);

        /* ret is still -EEXIST here: one program per event. */
        if (event->prog)
                goto unlock;

        old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
        if (old_array &&
            bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
                ret = -E2BIG;
                goto unlock;
        }

        /* Build a new array that additionally contains prog. */
        ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array);
        if (ret < 0)
                goto unlock;

        /* set the new array to event->tp_event and set event->prog */
        event->prog = prog;
        event->bpf_cookie = bpf_cookie;
        rcu_assign_pointer(event->tp_event->prog_array, new_array);
        bpf_prog_array_free_sleepable(old_array);

unlock:
        mutex_unlock(&bpf_event_mutex);
        return ret;
}

/*
 * Detach the program attached to @event from its trace event's program
 * array and drop the event's reference to the program.
 *
 * Nothing is done if the event has no program, or if bpf_prog_array_copy()
 * reports -ENOENT; in the latter case event->prog is left set.
 */
void perf_event_detach_bpf_prog(struct perf_event *event)
{
        struct bpf_prog_array *old_array;
        struct bpf_prog_array *new_array;
        int ret;

        mutex_lock(&bpf_event_mutex);

        if (!event->prog)
                goto unlock;

        old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
        /* Build a new array without event->prog. */
        ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array);
        if (ret == -ENOENT)
                goto unlock;
        if (ret < 0) {
                /* No new array available: remove the program from the old one. */
                bpf_prog_array_delete_safe(old_array, event->prog);
        } else {
                rcu_assign_pointer(event->tp_event->prog_array, new_array);
                bpf_prog_array_free_sleepable(old_array);
        }

        bpf_prog_put(event->prog);
        event->prog = NULL;

unlock:
        mutex_unlock(&bpf_event_mutex);
}

/*
 * Report the ids of the BPF programs attached to @event's trace event.
 *
 * @info points to a user-space struct perf_event_query_bpf. Its ids_len
 * field gives the capacity of its ids array; prog_cnt and ids are written
 * back.
 *
 * Returns the result of bpf_prog_array_copy_info(), or -EPERM without
 * perfmon capability, -EINVAL for non-tracepoint events, -E2BIG when
 * ids_len exceeds BPF_TRACE_MAX_PROGS, -ENOMEM on allocation failure and
 * -EFAULT when user memory cannot be accessed.
 */
int perf_event_query_prog_array(struct perf_event *event, void __user *info)
{
        struct perf_event_query_bpf __user *user_query = info;
        struct perf_event_query_bpf kquery = {};
        struct bpf_prog_array *array;
        u32 nr_progs, nr_ids;
        u32 *id_buf;
        int err;

        if (!perfmon_capable())
                return -EPERM;
        if (event->attr.type != PERF_TYPE_TRACEPOINT)
                return -EINVAL;
        if (copy_from_user(&kquery, user_query, sizeof(kquery)))
                return -EFAULT;

        nr_ids = kquery.ids_len;
        if (nr_ids > BPF_TRACE_MAX_PROGS)
                return -E2BIG;

        /*
         * For nr_ids == 0 kcalloc returns ZERO_SIZE_PTR rather than NULL.
         * That case is needed when the user only wants prog_cnt, and needs
         * no special handling because bpf_prog_array_copy_info deals with
         * it gracefully.
         */
        id_buf = kcalloc(nr_ids, sizeof(u32), GFP_USER | __GFP_NOWARN);
        if (!id_buf)
                return -ENOMEM;

        mutex_lock(&bpf_event_mutex);
        array = bpf_event_rcu_dereference(event->tp_event->prog_array);
        err = bpf_prog_array_copy_info(array, id_buf, nr_ids, &nr_progs);
        mutex_unlock(&bpf_event_mutex);

        if (copy_to_user(&user_query->prog_cnt, &nr_progs, sizeof(nr_progs)))
                err = -EFAULT;
        else if (copy_to_user(user_query->ids, id_buf, nr_ids * sizeof(u32)))
                err = -EFAULT;

        kfree(id_buf);
        return err;
}

/* Bounds of the built-in table of raw tracepoint maps. */
extern struct bpf_raw_event_map __start__bpf_raw_tp[];
extern struct bpf_raw_event_map __stop__bpf_raw_tp[];

/*
 * Look up a raw tracepoint map by tracepoint name. The built-in table is
 * scanned first; if no entry matches, the result of
 * bpf_get_raw_tracepoint_module() is returned.
 */
struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)
{
        struct bpf_raw_event_map *cur;

        for (cur = __start__bpf_raw_tp; cur < __stop__bpf_raw_tp; cur++) {
                if (strcmp(cur->tp->name, name) == 0)
                        return cur;
        }

        return bpf_get_raw_tracepoint_module(name);
}

/*
 * Drop a module reference for @btp: with preemption disabled, the address
 * of @btp is passed to __module_address() and the result to module_put().
 */
void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
{
        preempt_disable();
        module_put(__module_address((unsigned long)btp));
        preempt_enable();
}

/*
 * Run the program of a raw tracepoint link with @args as its context.
 *
 * The per-CPU prog->active counter guards against re-entering the same
 * program on this CPU: if the program is already active, the run is skipped
 * and counted as a miss. The link's cookie is made available through the
 * run context for the duration of the run.
 */
static __always_inline
void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
{
        struct bpf_prog *prog = link->link.prog;
        struct bpf_trace_run_ctx ctx;
        struct bpf_run_ctx *saved_ctx;

        cant_sleep();

        if (likely(this_cpu_inc_return(*(prog->active)) == 1)) {
                ctx.bpf_cookie = link->cookie;
                saved_ctx = bpf_set_run_ctx(&ctx.run_ctx);

                rcu_read_lock();
                (void) bpf_prog_run(prog, args);
                rcu_read_unlock();

                bpf_reset_run_ctx(saved_ctx);
        } else {
                bpf_prog_inc_misses_counter(prog);
        }

        /* Balances the increment above on both paths. */
        this_cpu_dec(*(prog->active));
}

/*
 * Preprocessor helpers for generating bpf_trace_run1() .. bpf_trace_run12().
 *
 * REPEAT(N, FN, DL, list) expands to FN applied to the first N elements of
 * list, separated by the token wrapped in DL. DL is passed in parentheses,
 * e.g. (,) or (;), and UNPACK strips them.
 */
#define UNPACK(...)                        __VA_ARGS__
#define REPEAT_1(FN, DL, X, ...)        FN(X)
#define REPEAT_2(FN, DL, X, ...)        FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__)
#define REPEAT_3(FN, DL, X, ...)        FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__)
#define REPEAT_4(FN, DL, X, ...)        FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__)
#define REPEAT_5(FN, DL, X, ...)        FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__)
#define REPEAT_6(FN, DL, X, ...)        FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__)
#define REPEAT_7(FN, DL, X, ...)        FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__)
#define REPEAT_8(FN, DL, X, ...)        FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__)
#define REPEAT_9(FN, DL, X, ...)        FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__)
#define REPEAT_10(FN, DL, X, ...)        FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__)
#define REPEAT_11(FN, DL, X, ...)        FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__)
#define REPEAT_12(FN, DL, X, ...)        FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__)
#define REPEAT(X, FN, DL, ...)                REPEAT_##X(FN, DL, __VA_ARGS__)

/* SARG(n) declares parameter "u64 argn"; COPY(n) stores argn into args[n]. */
#define SARG(X)                u64 arg##X
#define COPY(X)                args[X] = arg##X

/* Parenthesized separators for REPEAT: comma and semicolon. */
#define __DL_COM        (,)
#define __DL_SEM        (;)

/* Index list consumed by REPEAT; supports up to 12 arguments. */
#define __SEQ_0_11        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11

/*
 * Define and export bpf_trace_run<x>(link, arg0, ..., arg<x-1>): it packs
 * its x u64 arguments into an on-stack array and passes that array to
 * __bpf_trace_run() as the program context.
 */
#define BPF_TRACE_DEFN_x(x)                                                \
        void bpf_trace_run##x(struct bpf_raw_tp_link *link,                \
                              REPEAT(x, SARG, __DL_COM, __SEQ_0_11))        \
        {                                                                \
                u64 args[x];                                                \
                REPEAT(x, COPY, __DL_SEM, __SEQ_0_11);                        \
                __bpf_trace_run(link, args);                                \
        }                                                                \
        EXPORT_SYMBOL_GPL(bpf_trace_run##x)
BPF_TRACE_DEFN_x(1);
BPF_TRACE_DEFN_x(2);
BPF_TRACE_DEFN_x(3);
BPF_TRACE_DEFN_x(4);
BPF_TRACE_DEFN_x(5);
BPF_TRACE_DEFN_x(6);
BPF_TRACE_DEFN_x(7);
BPF_TRACE_DEFN_x(8);
BPF_TRACE_DEFN_x(9);
BPF_TRACE_DEFN_x(10);
BPF_TRACE_DEFN_x(11);
BPF_TRACE_DEFN_x(12);

/*
 * Register @btp->bpf_func as a probe on @btp's tracepoint, with @link as
 * the probe data.
 *
 * Returns -EINVAL if the link's program reads context beyond the
 * tracepoint's num_args u64 arguments, or accesses more of the tracepoint
 * buffer than writable_size; otherwise the result of
 * tracepoint_probe_register_may_exist().
 */
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
{
        struct bpf_prog *prog = link->link.prog;

        if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64) ||
            prog->aux->max_tp_access > btp->writable_size)
                return -EINVAL;

        return tracepoint_probe_register_may_exist(btp->tp,
                                                   (void *)btp->bpf_func, link);
}

/*
 * Unregister the probe (@btp->bpf_func with data @link) from @btp's
 * tracepoint. Returns the result of tracepoint_probe_unregister().
 */
int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
{
        return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, link);
}

/*
 * Describe the BPF program attached to a perf event.
 *
 * @prog_id and @buf are always written on the tracepoint path and must not
 * be NULL; @fd_type, @probe_offset and @probe_addr may be NULL there. For
 * kprobe and uprobe events the out-parameters are passed on to
 * bpf_get_kprobe_info() / bpf_get_uprobe_info().
 *
 * Returns 0 on success, -ENOENT if the event has no program, -EOPNOTSUPP
 * for BPF_PROG_TYPE_PERF_EVENT programs or unsupported event kinds, or the
 * error from the kprobe/uprobe info helpers.
 */
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
                            u32 *fd_type, const char **buf,
                            u64 *probe_offset, u64 *probe_addr,
                            unsigned long *missed)
{
        bool is_tracepoint, is_syscall_tp;
        struct bpf_prog *prog;
        int flags, err = 0;

        prog = event->prog;
        if (!prog)
                return -ENOENT;

        /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
        if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
                return -EOPNOTSUPP;

        *prog_id = prog->aux->id;
        flags = event->tp_event->flags;
        is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
        is_syscall_tp = is_syscall_trace_event(event->tp_event);

        if (is_tracepoint || is_syscall_tp) {
                /* Name comes from the tracepoint itself or from the trace event. */
                *buf = is_tracepoint ? event->tp_event->tp->name
                                     : event->tp_event->name;
                /* We allow NULL pointer for tracepoint */
                if (fd_type)
                        *fd_type = BPF_FD_TYPE_TRACEPOINT;
                if (probe_offset)
                        *probe_offset = 0x0;
                if (probe_addr)
                        *probe_addr = 0x0;
        } else {
                /* kprobe/uprobe */
                /* Stays -EOPNOTSUPP unless one of the branches below applies. */
                err = -EOPNOTSUPP;
#ifdef CONFIG_KPROBE_EVENTS
                if (flags & TRACE_EVENT_FL_KPROBE)
                        err = bpf_get_kprobe_info(event, fd_type, buf,
                                                  probe_offset, probe_addr, missed,
                                                  event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
#ifdef CONFIG_UPROBE_EVENTS
                if (flags & TRACE_EVENT_FL_UPROBE)
                        err = bpf_get_uprobe_info(event, fd_type, buf,
                                                  probe_offset, probe_addr,
                                                  event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
        }

        return err;
}

/*
 * Initialize the per-CPU send_signal_work irq_work on every possible CPU so
 * that it runs do_bpf_send_signal(). Always returns 0.
 */
static int __init send_signal_irq_work_init(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                init_irq_work(&per_cpu_ptr(&send_signal_work, cpu)->irq_work,
                              do_bpf_send_signal);
        }

        return 0;
}

subsys_initcall(send_signal_irq_work_init);

#ifdef CONFIG_MODULES
/*
 * Module notifier callback that keeps bpf_trace_modules in sync with loaded
 * modules that have raw BPF events.
 *
 * On MODULE_STATE_COMING an entry for the module is added to the list; on
 * MODULE_STATE_GOING the module's entry is removed and freed. Modules
 * without raw BPF events and all other notifications are ignored.
 *
 * Returns notifier_from_errno() of 0, or of -ENOMEM if the list entry could
 * not be allocated.
 */
static int bpf_event_notify(struct notifier_block *nb, unsigned long op,
                            void *module)
{
        struct bpf_trace_module *entry, *next;
        struct module *mod = module;
        int err = 0;

        if (mod->num_bpf_raw_events == 0)
                return notifier_from_errno(0);
        if (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)
                return notifier_from_errno(0);

        mutex_lock(&bpf_module_mutex);

        if (op == MODULE_STATE_COMING) {
                entry = kzalloc(sizeof(*entry), GFP_KERNEL);
                if (!entry) {
                        err = -ENOMEM;
                } else {
                        entry->module = module;
                        list_add(&entry->list, &bpf_trace_modules);
                }
        } else {
                /* MODULE_STATE_GOING: drop the first entry for this module. */
                list_for_each_entry_safe(entry, next, &bpf_trace_modules, list) {
                        if (entry->module != module)
                                continue;
                        list_del(&entry->list);
                        kfree(entry);
                        break;
                }
        }

        mutex_unlock(&bpf_module_mutex);

        return notifier_from_errno(err);
}

/* Notifier block that routes module state changes to bpf_event_notify(). */
static struct notifier_block bpf_module_nb = {
        .notifier_call = bpf_event_notify,
};

/*
 * Register bpf_module_nb as a module notifier at fs_initcall time. The
 * return value of register_module_notifier() is not checked; this function
 * always returns 0.
 */
static int __init bpf_event_init(void)
{
        register_module_notifier(&bpf_module_nb);
        return 0;
}

fs_initcall(bpf_event_init);
#endif /* CONFIG_MODULES */

/*
 * Run context for session programs. It embeds the generic bpf_run_ctx as
 * its first member and adds an is_return flag and a data pointer.
 * NOTE(review): the exact meaning of is_return and data is defined by code
 * outside this chunk - confirm against the users of this struct.
 */
struct bpf_session_run_ctx {
        struct bpf_run_ctx run_ctx;
        bool is_return;
        void *data;
};

#ifdef CONFIG_FPROBE
/*
 * A kprobe-multi link: a bpf_link combined with the fprobe that implements
 * it and the per-link arrays it owns.
 */
struct bpf_kprobe_multi_link {
        struct bpf_link link;
        /* Unregistered in bpf_kprobe_multi_link_release(). */
        struct fprobe fp;
        /* addrs and cookies are released with kvfree() when the link is deallocated. */
        unsigned long *addrs;
        u64 *cookies;
        u32 cnt;
        /* mods holds mods_cnt module references, dropped on link release. */
        u32 mods_cnt;
        struct module **mods;
        u32 flags;
};

/*
 * Run context for kprobe-multi programs: a session run context extended
 * with the owning link and an entry_ip value.
 * NOTE(review): entry_ip is presumably the address of the probed function
 * for the current invocation - confirm against the code that sets it.
 */
struct bpf_kprobe_multi_run_ctx {
        struct bpf_session_run_ctx session_ctx;
        struct bpf_kprobe_multi_link *link;
        unsigned long entry_ip;
};

/*
 * Symbol names copied from user space by copy_user_syms(): syms is an array
 * of pointers into buf, which holds the NUL-terminated strings back to
 * back. Both are released by free_user_syms().
 */
struct user_syms {
        const char **syms;
        char *buf;
};

/*
 * Copy @cnt symbol names from user space.
 *
 * @usyms is a user array of @cnt pointer values, each referring to a
 * NUL-terminated symbol name. The names are packed back to back into a single
 * buffer and @us->syms[i] is pointed at the i-th name inside it. On success
 * the caller owns both allocations and releases them with free_user_syms().
 *
 * Returns 0 on success, -ENOMEM if an allocation fails, -EFAULT if a pointer
 * value cannot be read, -E2BIG if a name fills all KSYM_NAME_LEN bytes, or the
 * negative error reported by strncpy_from_user(). Nothing stays allocated on
 * failure and @us is left untouched.
 */
static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt)
{
        /* Kernel-side copy of one user-supplied pointer value, hence no __user. */
        unsigned long usymbol;
        const char **syms = NULL;
        char *buf = NULL, *p;
        int err = -ENOMEM;
        unsigned int i;

        syms = kvmalloc_array(cnt, sizeof(*syms), GFP_KERNEL);
        if (!syms)
                goto error;

        /* Worst case: every name needs the full KSYM_NAME_LEN bytes. */
        buf = kvmalloc_array(cnt, KSYM_NAME_LEN, GFP_KERNEL);
        if (!buf)
                goto error;

        for (p = buf, i = 0; i < cnt; i++) {
                if (__get_user(usymbol, usyms + i)) {
                        err = -EFAULT;
                        goto error;
                }
                err = strncpy_from_user(p, (const char __user *) usymbol, KSYM_NAME_LEN);
                /* A full-length result means no terminator fit: name too long. */
                if (err == KSYM_NAME_LEN)
                        err = -E2BIG;
                if (err < 0)
                        goto error;
                syms[i] = p;
                /* Step past the copied name and its terminating NUL. */
                p += err + 1;
        }

        us->syms = syms;
        us->buf = buf;
        return 0;

error:
        /* Every jump here carries a non-zero err; either pointer may be NULL. */
        kvfree(syms);
        kvfree(buf);
        return err;
}

/* Release one module reference for each of the first @cnt entries of @mods. */
static void kprobe_multi_put_modules(struct module **mods, u32 cnt)
{
        u32 idx = 0;

        while (idx < cnt) {
                module_put(mods[idx]);
                idx++;
        }
}

/* Free both allocations held by @us: the pointer array and the name buffer. */
static void free_user_syms(struct user_syms *us)
{
        kvfree(us->syms);
        kvfree(us->buf);
}

/*
 * Link ->release callback: unregister the fprobe, then drop the module
 * references recorded in the link.
 */
static void bpf_kprobe_multi_link_release(struct bpf_link *link)
{
        struct bpf_kprobe_multi_link *kml =
                container_of(link, struct bpf_kprobe_multi_link, link);

        unregister_fprobe(&kml->fp);
        kprobe_multi_put_modules(kml->mods, kml->mods_cnt);
}

/*
 * Link ->dealloc_deferred callback: free the address, cookie and module
 * arrays and finally the link object itself.
 */
static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
{
        struct bpf_kprobe_multi_link *kml =
                container_of(link, struct bpf_kprobe_multi_link, link);

        kvfree(kml->addrs);
        kvfree(kml->cookies);
        kfree(kml->mods);
        kfree(kml);
}

static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
                                                struct bpf_link_info *info)
{
        u64 __user *ucookies = u64_to_user_ptr(info->kprobe_multi.cookies);
        u64 __user *uaddrs = u64_to_user_ptr(info->kprobe_multi.addrs);
        struct bpf_kprobe_multi_link *kmulti_link;
        u32 ucount = info->kprobe_multi.count;
        int err = 0, i;

        if (!uaddrs ^ !ucount)
                return -EINVAL;
        if (ucookies && !ucount)
                return -EINVAL;

        kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
        info->kprobe_multi.count = kmulti_link->cnt;
        info->kprobe_multi.flags = kmulti_link->flags;
        info->kprobe_multi.missed = kmulti_link->fp.nmissed;

        if (!uaddrs)
                return 0;
        if (ucount < kmulti_link->cnt)
                err = -ENOSPC;
        else
                ucount = kmulti_link->cnt;

        if (ucookies) {
                if (kmulti_link->cookies) {
                        if (copy_to_user(ucookies, kmulti_link->cookies, ucount * sizeof(u64)))
                                return -EFAULT;
                } else {
                        for (i = 0; i < ucount; i++) {
                                if (put_user(0, ucookies + i))
                                        return -EFAULT;
                        }
                }
        }

        if (kallsyms_show_value(current_cred())) {
                if (copy_to_user(uaddrs, kmulti_link->addrs, ucount * sizeof(u64)))
                        return -EFAULT;
        } else {
                for (i = 0; i < ucount; i++) {
                        if (put_user(0, uaddrs + i))
                                return -EFAULT;
                }
        }
        return err;
}

/* Link operations installed on kprobe multi links by bpf_link_init(). */
static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
        .release = bpf_kprobe_multi_link_release,
        .dealloc_deferred = bpf_kprobe_multi_link_dealloc,
        .fill_link_info = bpf_kprobe_multi_link_fill_link_info,
};

/*
 * sort_r() swap callback: exchange two entries of link->addrs and the two
 * cookies stored at the same indices, so both arrays stay index-aligned.
 * @priv is the bpf_kprobe_multi_link being sorted; @size is unused.
 */
static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv)
{
        const struct bpf_kprobe_multi_link *kml = priv;
        unsigned long *lhs = a, *rhs = b;
        u64 *lhs_cookie = &kml->cookies[lhs - kml->addrs];
        u64 *rhs_cookie = &kml->cookies[rhs - kml->addrs];

        swap(*lhs, *rhs);
        swap(*lhs_cookie, *rhs_cookie);
}

/*
 * Three-way comparison of two unsigned long values referenced by @a and @b.
 * Returns -1, 0 or 1 when *a is smaller than, equal to or greater than *b.
 */
static int bpf_kprobe_multi_addrs_cmp(const void *a, const void *b)
{
        const unsigned long lhs = *(const unsigned long *)a;
        const unsigned long rhs = *(const unsigned long *)b;

        if (lhs < rhs)
                return -1;
        if (lhs > rhs)
                return 1;
        return 0;
}

/*
 * sort_r() comparison callback: adapts bpf_kprobe_multi_addrs_cmp() to the
 * three-argument signature; @priv is ignored.
 */
static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv)
{
        return bpf_kprobe_multi_addrs_cmp(a, b);
}

/*
 * Return the cookie associated with the function currently being probed.
 *
 * The run context is recovered from current->bpf_ctx and the entry address is
 * looked up with a binary search in the link's address array; the cookie at
 * the same index is returned. Returns 0 when @ctx is NULL (with a one-time
 * warning), when the link has no cookies, or when the address is not found.
 */
static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
{
        struct bpf_kprobe_multi_run_ctx *mctx;
        struct bpf_kprobe_multi_link *kml;
        unsigned long *hit;
        u64 ip;

        if (WARN_ON_ONCE(!ctx))
                return 0;

        mctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx,
                            session_ctx.run_ctx);
        kml = mctx->link;
        if (!kml->cookies)
                return 0;

        ip = mctx->entry_ip;
        hit = bsearch(&ip, kml->addrs, kml->cnt, sizeof(ip),
                      bpf_kprobe_multi_addrs_cmp);

        return hit ? kml->cookies[hit - kml->addrs] : 0;
}

/*
 * Return the entry address stored in the kprobe multi run context found via
 * current->bpf_ctx. @ctx itself is not consulted.
 */
static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
{
        struct bpf_kprobe_multi_run_ctx *mctx =
                container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx,
                             session_ctx.run_ctx);

        return mctx->entry_ip;
}

/*
 * Run the program attached through @link for one probe hit.
 *
 * @entry_ip:  entry address recorded in the run context
 * @regs:      register state passed to the program
 * @is_return: recorded in the session context for bpf_session_is_return()
 * @data:      recorded in the session context for bpf_session_cookie()
 *
 * The per-CPU bpf_prog_active counter is used as a recursion guard: if it is
 * already non-zero the program is not run, a miss is counted and 0 is
 * returned. Otherwise the program's return value is returned.
 */
static int
kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
                           unsigned long entry_ip, struct pt_regs *regs,
                           bool is_return, void *data)
{
        struct bpf_kprobe_multi_run_ctx run_ctx = {
                .session_ctx = {
                        .is_return = is_return,
                        .data = data,
                },
                .link = link,
                .entry_ip = entry_ip,
        };
        struct bpf_run_ctx *saved_ctx;
        int ret = 0;

        if (likely(__this_cpu_inc_return(bpf_prog_active) == 1)) {
                migrate_disable();
                rcu_read_lock();
                saved_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
                ret = bpf_prog_run(link->link.prog, regs);
                bpf_reset_run_ctx(saved_ctx);
                rcu_read_unlock();
                migrate_enable();
        } else {
                /* Another program is already active on this CPU. */
                bpf_prog_inc_misses_counter(link->link.prog);
        }

        __this_cpu_dec(bpf_prog_active);
        return ret;
}

/*
 * fprobe entry handler: run the link's program for a function entry.
 * The program's return value is propagated only for session programs; every
 * other program type makes this handler return 0. @ret_ip is unused.
 */
static int
kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
                          unsigned long ret_ip, struct pt_regs *regs,
                          void *data)
{
        struct bpf_kprobe_multi_link *kml =
                container_of(fp, struct bpf_kprobe_multi_link, fp);
        int ret;

        ret = kprobe_multi_link_prog_run(kml, get_entry_ip(fentry_ip), regs, false, data);
        if (!is_kprobe_session(kml->link.prog))
                return 0;
        return ret;
}

/*
 * fprobe exit handler: run the link's program for a function return.
 * The program's return value is discarded. @ret_ip is unused.
 */
static void
kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip,
                               unsigned long ret_ip, struct pt_regs *regs,
                               void *data)
{
        struct bpf_kprobe_multi_link *kml =
                container_of(fp, struct bpf_kprobe_multi_link, fp);

        kprobe_multi_link_prog_run(kml, get_entry_ip(fentry_ip), regs, true, data);
}

/*
 * sort_r() comparison callback for an array of C strings: @a and @b point to
 * array slots, and the strings they hold are compared with strcmp().
 * @priv is unused.
 */
static int symbols_cmp_r(const void *a, const void *b, const void *priv)
{
        const char *const *lhs = a;
        const char *const *rhs = b;

        return strcmp(*lhs, *rhs);
}

/* Private data handed to symbols_swap_r() while sorting symbol names. */
struct multi_symbols_sort {
        /* base of the symbol-name array being sorted */
        const char **funcs;
        /* optional cookies index-aligned with funcs; NULL when not supplied */
        u64 *cookies;
};

/*
 * sort_r() swap callback for the symbol-name array. Exchanges the two name
 * pointers and, when cookies were supplied, the cookies stored at the same
 * two indices. @priv is a struct multi_symbols_sort; @size is unused.
 */
static void symbols_swap_r(void *a, void *b, int size, const void *priv)
{
        const struct multi_symbols_sort *sort = priv;
        const char **lhs = a, **rhs = b;

        swap(*lhs, *rhs);

        /* No cookies to keep in step with the names. */
        if (!sort->cookies)
                return;

        swap(sort->cookies[lhs - sort->funcs], sort->cookies[rhs - sort->funcs]);
}

/* Growable array of module pointers, filled by add_module(). */
struct modules_array {
        /* backing storage, grown with krealloc_array() */
        struct module **mods;
        /* number of entries currently stored */
        int mods_cnt;
        /* number of entries the storage can hold */
        int mods_cap;
};

/*
 * Append @mod to @arr, growing the backing storage when it is full.
 *
 * The capacity grows by a factor of 1.5 with a minimum of 16 entries. The new
 * capacity is only recorded once the reallocation has succeeded, so on
 * failure @arr still describes the storage it actually owns.
 *
 * Returns 0 on success or -ENOMEM if the array could not be grown.
 */
static int add_module(struct modules_array *arr, struct module *mod)
{
        if (arr->mods_cnt == arr->mods_cap) {
                int new_cap = max(16, arr->mods_cap * 3 / 2);
                struct module **mods;

                mods = krealloc_array(arr->mods, new_cap, sizeof(*mods), GFP_KERNEL);
                if (!mods)
                        return -ENOMEM;
                arr->mods = mods;
                arr->mods_cap = new_cap;
        }

        arr->mods[arr->mods_cnt] = mod;
        arr->mods_cnt++;
        return 0;
}

/*
 * Report whether @mod is already stored in @arr. The array is scanned from
 * the most recently added entry backwards.
 */
static bool has_module(struct modules_array *arr, struct module *mod)
{
        int idx = arr->mods_cnt;

        while (idx-- > 0) {
                if (arr->mods[idx] == mod)
                        return true;
        }
        return false;
}

/*
 * Collect the distinct modules that contain any of the @addrs_cnt addresses
 * in @addrs, and take a reference on each of them.
 *
 * On success the array is stored in *@mods (it may be NULL when no address
 * belongs to a module) and the number of modules is returned; the caller
 * owns the array and the references. On failure a negative errno is
 * returned, all references taken so far are dropped and *@mods is left
 * untouched: -EINVAL if a module reference cannot be taken, -ENOMEM if the
 * array cannot be grown.
 */
static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u32 addrs_cnt)
{
        struct modules_array arr = {};
        /* Signed: holds negative errno values and is returned as int. */
        int err = 0;
        u32 i;

        for (i = 0; i < addrs_cnt; i++) {
                struct module *mod;

                /*
                 * Preemption stays disabled from the lookup until the
                 * reference has been taken.
                 */
                preempt_disable();
                mod = __module_address(addrs[i]);
                /* Either no module or it's already stored */
                if (!mod || has_module(&arr, mod)) {
                        preempt_enable();
                        continue;
                }
                if (!try_module_get(mod))
                        err = -EINVAL;
                preempt_enable();
                if (err)
                        break;
                err = add_module(&arr, mod);
                if (err) {
                        /* Not stored, so drop the reference taken above. */
                        module_put(mod);
                        break;
                }
        }

        /* We return either err < 0 in case of error, ... */
        if (err) {
                kprobe_multi_put_modules(arr.mods, arr.mods_cnt);
                kfree(arr.mods);
                return err;
        }

        /* or number of modules found if everything is ok. */
        *mods = arr.mods;
        return arr.mods_cnt;
}

/*
 * Check that every one of the @cnt addresses in @addrs is within the error
 * injection list. Returns 0 if all are, -EINVAL at the first one that is not.
 */
static int addrs_check_error_injection_list(unsigned long *addrs, u32 cnt)
{
        u32 idx = 0;

        while (idx < cnt) {
                if (!within_error_injection_list(addrs[idx]))
                        return -EINVAL;
                idx++;
        }
        return 0;
}

/*
 * Create a kprobe multi link attaching @prog to a set of kernel functions.
 *
 * User space passes, through @attr->link_create.kprobe_multi, either an array
 * of addresses or an array of symbol names (exactly one of the two), the
 * number of entries, optional per-entry cookies and flags. Only
 * BPF_F_KPROBE_MULTI_RETURN is accepted as a flag.
 *
 * Returns the value of bpf_link_settle() on success or a negative errno:
 * -EOPNOTSUPP on 32-bit, -EINVAL for invalid arguments, -E2BIG for too many
 * entries, -ENOMEM / -EFAULT for allocation and user-copy failures, or the
 * error of the failing helper.
 */
int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
        struct bpf_kprobe_multi_link *link = NULL;
        struct bpf_link_primer link_primer;
        void __user *ucookies;
        unsigned long *addrs;
        u32 flags, cnt, size;
        void __user *uaddrs;
        u64 *cookies = NULL;
        void __user *usyms;
        int err;

        /* no support for 32bit archs yet */
        if (sizeof(u64) != sizeof(void *))
                return -EOPNOTSUPP;

        if (!is_kprobe_multi(prog))
                return -EINVAL;

        flags = attr->link_create.kprobe_multi.flags;
        if (flags & ~BPF_F_KPROBE_MULTI_RETURN)
                return -EINVAL;

        /* Exactly one of addrs and syms must be supplied. */
        uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs);
        usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms);
        if (!!uaddrs == !!usyms)
                return -EINVAL;

        cnt = attr->link_create.kprobe_multi.cnt;
        if (!cnt)
                return -EINVAL;
        if (cnt > MAX_KPROBE_MULTI_CNT)
                return -E2BIG;

        size = cnt * sizeof(*addrs);
        addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
        if (!addrs)
                return -ENOMEM;

        ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
        if (ucookies) {
                /* Size the cookie array by its own element type. */
                cookies = kvmalloc_array(cnt, sizeof(*cookies), GFP_KERNEL);
                if (!cookies) {
                        err = -ENOMEM;
                        goto error;
                }
                if (copy_from_user(cookies, ucookies, cnt * sizeof(*cookies))) {
                        err = -EFAULT;
                        goto error;
                }
        }

        if (uaddrs) {
                if (copy_from_user(addrs, uaddrs, size)) {
                        err = -EFAULT;
                        goto error;
                }
        } else {
                struct multi_symbols_sort data = {
                        .cookies = cookies,
                };
                struct user_syms us;

                err = copy_user_syms(&us, usyms, cnt);
                if (err)
                        goto error;

                if (cookies)
                        data.funcs = us.syms;

                /* Sort the names; cookies, if any, are moved along with them. */
                sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r,
                       symbols_swap_r, &data);

                err = ftrace_lookup_symbols(us.syms, cnt, addrs);
                free_user_syms(&us);
                if (err)
                        goto error;
        }

        if (prog->kprobe_override && addrs_check_error_injection_list(addrs, cnt)) {
                err = -EINVAL;
                goto error;
        }

        link = kzalloc(sizeof(*link), GFP_KERNEL);
        if (!link) {
                err = -ENOMEM;
                goto error;
        }

        bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI,
                      &bpf_kprobe_multi_link_lops, prog);

        err = bpf_link_prime(&link->link, &link_primer);
        if (err)
                goto error;

        if (!(flags & BPF_F_KPROBE_MULTI_RETURN))
                link->fp.entry_handler = kprobe_multi_link_handler;
        if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog))
                link->fp.exit_handler = kprobe_multi_link_exit_handler;
        if (is_kprobe_session(prog))
                link->fp.entry_data_size = sizeof(u64);

        /* From here on the link owns addrs and cookies. */
        link->addrs = addrs;
        link->cookies = cookies;
        link->cnt = cnt;
        link->flags = flags;

        if (cookies) {
                /*
                 * Sorting addresses will trigger sorting cookies as well
                 * (check bpf_kprobe_multi_cookie_swap). This way we can
                 * find cookie based on the address in bpf_get_attach_cookie
                 * helper.
                 */
                sort_r(addrs, cnt, sizeof(*addrs),
                       bpf_kprobe_multi_cookie_cmp,
                       bpf_kprobe_multi_cookie_swap,
                       link);
        }

        err = get_modules_for_addrs(&link->mods, addrs, cnt);
        if (err < 0) {
                /* The primed link is torn down by bpf_link_cleanup(). */
                bpf_link_cleanup(&link_primer);
                return err;
        }
        link->mods_cnt = err;

        err = register_fprobe_ips(&link->fp, addrs, cnt);
        if (err) {
                kprobe_multi_put_modules(link->mods, link->mods_cnt);
                bpf_link_cleanup(&link_primer);
                return err;
        }

        return bpf_link_settle(&link_primer);

error:
        /* Only reached before the link was primed; link may still be NULL. */
        kfree(link);
        kvfree(addrs);
        kvfree(cookies);
        return err;
}
#else /* !CONFIG_FPROBE */
/* Stub used when CONFIG_FPROBE is not set: attaching is not supported. */
int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
        return -EOPNOTSUPP;
}
/* Stub used when CONFIG_FPROBE is not set: always reports cookie 0. */
static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
{
        return 0;
}
/* Stub used when CONFIG_FPROBE is not set: always reports entry ip 0. */
static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
{
        return 0;
}
#endif

#ifdef CONFIG_UPROBES
struct bpf_uprobe_multi_link;

/* One probe location belonging to a uprobe multi link. */
struct bpf_uprobe {
        /* owning link */
        struct bpf_uprobe_multi_link *link;
        /* probe offset passed to uprobe_register_refctr()/uprobe_unregister() */
        loff_t offset;
        /* reference counter offset passed to uprobe_register_refctr() */
        unsigned long ref_ctr_offset;
        /* user-supplied cookie returned by bpf_uprobe_multi_cookie() */
        u64 cookie;
        /* consumer carrying the handler/ret_handler/filter callbacks */
        struct uprobe_consumer consumer;
};

/* BPF link attaching one program to several uprobes within a single file. */
struct bpf_uprobe_multi_link {
        /* path of the probed file; released with path_put() on link release */
        struct path path;
        struct bpf_link link;
        /* number of entries in uprobes */
        u32 cnt;
        /* attach flags supplied by user space */
        u32 flags;
        /* array of cnt probe descriptors */
        struct bpf_uprobe *uprobes;
        /* task to filter on, or NULL when no pid was given; reference held */
        struct task_struct *task;
};

/*
 * Per-invocation run context for a uprobe multi program. It is built on the
 * stack by uprobe_prog_run() and recovered from current->bpf_ctx by the
 * entry-ip and cookie helpers.
 */
struct bpf_uprobe_multi_run_ctx {
        struct bpf_run_ctx run_ctx;
        /* address reported by bpf_uprobe_multi_entry_ip() */
        unsigned long entry_ip;
        /* probe that fired; source of the cookie */
        struct bpf_uprobe *uprobe;
};

/*
 * Unregister the consumers of the first @cnt entries of @uprobes from the
 * inode behind @path.
 */
static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes,
                                  u32 cnt)
{
        u32 idx = 0;

        while (idx < cnt) {
                struct bpf_uprobe *up = &uprobes[idx];

                uprobe_unregister(d_real_inode(path->dentry), up->offset,
                                  &up->consumer);
                idx++;
        }
}

/*
 * Link ->release callback: unregister all uprobes, then drop the task
 * reference (if any) and the path reference held by the link.
 */
static void bpf_uprobe_multi_link_release(struct bpf_link *link)
{
        struct bpf_uprobe_multi_link *uml =
                container_of(link, struct bpf_uprobe_multi_link, link);

        bpf_uprobe_unregister(&uml->path, uml->uprobes, uml->cnt);
        if (uml->task)
                put_task_struct(uml->task);
        path_put(&uml->path);
}

/*
 * Link ->dealloc_deferred callback: free the probe array and the link
 * object itself.
 */
static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
{
        struct bpf_uprobe_multi_link *uml =
                container_of(link, struct bpf_uprobe_multi_link, link);

        kvfree(uml->uprobes);
        kfree(uml);
}

static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
                                                struct bpf_link_info *info)
{
        u64 __user *uref_ctr_offsets = u64_to_user_ptr(info->uprobe_multi.ref_ctr_offsets);
        u64 __user *ucookies = u64_to_user_ptr(info->uprobe_multi.cookies);
        u64 __user *uoffsets = u64_to_user_ptr(info->uprobe_multi.offsets);
        u64 __user *upath = u64_to_user_ptr(info->uprobe_multi.path);
        u32 upath_size = info->uprobe_multi.path_size;
        struct bpf_uprobe_multi_link *umulti_link;
        u32 ucount = info->uprobe_multi.count;
        int err = 0, i;
        long left;

        if (!upath ^ !upath_size)
                return -EINVAL;

        if ((uoffsets || uref_ctr_offsets || ucookies) && !ucount)
                return -EINVAL;

        umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
        info->uprobe_multi.count = umulti_link->cnt;
        info->uprobe_multi.flags = umulti_link->flags;
        info->uprobe_multi.pid = umulti_link->task ?
                                 task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;

        if (upath) {
                char *p, *buf;

                upath_size = min_t(u32, upath_size, PATH_MAX);

                buf = kmalloc(upath_size, GFP_KERNEL);
                if (!buf)
                        return -ENOMEM;
                p = d_path(&umulti_link->path, buf, upath_size);
                if (IS_ERR(p)) {
                        kfree(buf);
                        return PTR_ERR(p);
                }
                upath_size = buf + upath_size - p;
                left = copy_to_user(upath, p, upath_size);
                kfree(buf);
                if (left)
                        return -EFAULT;
                info->uprobe_multi.path_size = upath_size;
        }

        if (!uoffsets && !ucookies && !uref_ctr_offsets)
                return 0;

        if (ucount < umulti_link->cnt)
                err = -ENOSPC;
        else
                ucount = umulti_link->cnt;

        for (i = 0; i < ucount; i++) {
                if (uoffsets &&
                    put_user(umulti_link->uprobes[i].offset, uoffsets + i))
                        return -EFAULT;
                if (uref_ctr_offsets &&
                    put_user(umulti_link->uprobes[i].ref_ctr_offset, uref_ctr_offsets + i))
                        return -EFAULT;
                if (ucookies &&
                    put_user(umulti_link->uprobes[i].cookie, ucookies + i))
                        return -EFAULT;
        }

        return err;
}

/* Link operations installed on uprobe multi links by bpf_link_init(). */
static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
        .release = bpf_uprobe_multi_link_release,
        .dealloc_deferred = bpf_uprobe_multi_link_dealloc,
        .fill_link_info = bpf_uprobe_multi_link_fill_link_info,
};

/*
 * Run the program of @uprobe's link for one probe hit.
 *
 * @entry_ip is recorded in the run context for bpf_uprobe_multi_entry_ip().
 * When the link is bound to a task, hits from a different mm are ignored and
 * 0 is returned. Sleepable programs run under rcu_read_lock_trace(), all
 * others under rcu_read_lock(). Returns the program's return value.
 */
static int uprobe_prog_run(struct bpf_uprobe *uprobe,
                           unsigned long entry_ip,
                           struct pt_regs *regs)
{
        struct bpf_uprobe_multi_link *uml = uprobe->link;
        struct bpf_prog *prog = uml->link.prog;
        const bool sleepable = prog->sleepable;
        struct bpf_uprobe_multi_run_ctx run_ctx = {
                .entry_ip = entry_ip,
                .uprobe = uprobe,
        };
        struct bpf_run_ctx *saved_ctx;
        int ret;

        if (uml->task && uml->task->mm != current->mm)
                return 0;

        if (!sleepable)
                rcu_read_lock();
        else
                rcu_read_lock_trace();

        migrate_disable();

        saved_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
        ret = bpf_prog_run(prog, regs);
        bpf_reset_run_ctx(saved_ctx);

        migrate_enable();

        if (!sleepable)
                rcu_read_unlock();
        else
                rcu_read_unlock_trace();

        return ret;
}

/*
 * uprobe consumer filter: accept @mm only if it is the mm of the task the
 * link is bound to. The link's task pointer is dereferenced unconditionally;
 * the attach path installs this filter only when a pid was given.
 * @ctx is unused.
 */
static bool
uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx,
                         struct mm_struct *mm)
{
        struct bpf_uprobe *up = container_of(con, struct bpf_uprobe, consumer);

        return up->link->task->mm == mm;
}

/*
 * uprobe entry handler: run the program with the current instruction pointer
 * taken from @regs as the entry ip.
 */
static int
uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs)
{
        struct bpf_uprobe *up = container_of(con, struct bpf_uprobe, consumer);

        return uprobe_prog_run(up, instruction_pointer(regs), regs);
}

/* uprobe return handler: run the program with @func as the entry ip. */
static int
uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs)
{
        struct bpf_uprobe *up = container_of(con, struct bpf_uprobe, consumer);

        return uprobe_prog_run(up, func, regs);
}

/*
 * Return the entry ip stored in the uprobe multi run context found via
 * current->bpf_ctx. @ctx itself is not consulted.
 */
static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
{
        struct bpf_uprobe_multi_run_ctx *uctx =
                container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx);

        return uctx->entry_ip;
}

/*
 * Return the cookie of the uprobe recorded in the run context found via
 * current->bpf_ctx. @ctx itself is not consulted.
 */
static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
{
        struct bpf_uprobe_multi_run_ctx *uctx =
                container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx);

        return uctx->uprobe->cookie;
}

/*
 * Create a uprobe multi link attaching @prog to several offsets of one file.
 *
 * User space passes, through @attr->link_create.uprobe_multi, the file path,
 * an array of offsets and their count (all mandatory), plus optional
 * ref_ctr_offsets and cookies arrays, an optional pid to filter on and
 * flags. Only BPF_F_UPROBE_MULTI_RETURN is accepted as a flag.
 *
 * Returns the value of bpf_link_settle() on success or a negative errno:
 * -EOPNOTSUPP on 32-bit, -EINVAL for invalid arguments, -E2BIG for too many
 * entries, -EBADF if the path is not a regular file, -ESRCH if the pid has no
 * task, -ENOMEM / -EFAULT for allocation and user-copy failures, or the error
 * of the failing helper. On failure every uprobe registered so far is
 * unregistered before its consumer is freed.
 */
int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
        struct bpf_uprobe_multi_link *link = NULL;
        unsigned long __user *uref_ctr_offsets;
        struct bpf_link_primer link_primer;
        struct bpf_uprobe *uprobes = NULL;
        struct task_struct *task = NULL;
        unsigned long __user *uoffsets;
        u64 __user *ucookies;
        void __user *upath;
        u32 flags, cnt, i;
        struct path path;
        char *name;
        pid_t pid;
        int err;

        /* no support for 32bit archs yet */
        if (sizeof(u64) != sizeof(void *))
                return -EOPNOTSUPP;

        if (prog->expected_attach_type != BPF_TRACE_UPROBE_MULTI)
                return -EINVAL;

        flags = attr->link_create.uprobe_multi.flags;
        if (flags & ~BPF_F_UPROBE_MULTI_RETURN)
                return -EINVAL;

        /*
         * path, offsets and cnt are mandatory,
         * ref_ctr_offsets and cookies are optional
         */
        upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path);
        uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets);
        cnt = attr->link_create.uprobe_multi.cnt;
        pid = attr->link_create.uprobe_multi.pid;

        if (!upath || !uoffsets || !cnt || pid < 0)
                return -EINVAL;
        if (cnt > MAX_UPROBE_MULTI_CNT)
                return -E2BIG;

        uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets);
        ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies);

        name = strndup_user(upath, PATH_MAX);
        if (IS_ERR(name)) {
                err = PTR_ERR(name);
                return err;
        }

        err = kern_path(name, LOOKUP_FOLLOW, &path);
        kfree(name);
        if (err)
                return err;

        if (!d_is_reg(path.dentry)) {
                err = -EBADF;
                goto error_path_put;
        }

        if (pid) {
                task = get_pid_task(find_vpid(pid), PIDTYPE_TGID);
                if (!task) {
                        err = -ESRCH;
                        goto error_path_put;
                }
        }

        err = -ENOMEM;

        link = kzalloc(sizeof(*link), GFP_KERNEL);
        uprobes = kvcalloc(cnt, sizeof(*uprobes), GFP_KERNEL);

        if (!uprobes || !link)
                goto error_free;

        for (i = 0; i < cnt; i++) {
                if (__get_user(uprobes[i].offset, uoffsets + i)) {
                        err = -EFAULT;
                        goto error_free;
                }
                if (uprobes[i].offset < 0) {
                        err = -EINVAL;
                        goto error_free;
                }
                if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) {
                        err = -EFAULT;
                        goto error_free;
                }
                if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) {
                        err = -EFAULT;
                        goto error_free;
                }

                uprobes[i].link = link;

                if (flags & BPF_F_UPROBE_MULTI_RETURN)
                        uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler;
                else
                        uprobes[i].consumer.handler = uprobe_multi_link_handler;

                /* The filter dereferences link->task, so it needs a pid. */
                if (pid)
                        uprobes[i].consumer.filter = uprobe_multi_link_filter;
        }

        link->cnt = cnt;
        link->uprobes = uprobes;
        link->path = path;
        link->task = task;
        link->flags = flags;

        bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI,
                      &bpf_uprobe_multi_link_lops, prog);

        for (i = 0; i < cnt; i++) {
                err = uprobe_register_refctr(d_real_inode(link->path.dentry),
                                             uprobes[i].offset,
                                             uprobes[i].ref_ctr_offset,
                                             &uprobes[i].consumer);
                /* On failure i is the number of uprobes already registered. */
                if (err)
                        goto error_unregister;
        }

        /*
         * All cnt uprobes are registered here (i == cnt). If priming fails
         * they must be unregistered before the consumers are freed below.
         */
        err = bpf_link_prime(&link->link, &link_primer);
        if (err)
                goto error_unregister;

        return bpf_link_settle(&link_primer);

error_unregister:
        bpf_uprobe_unregister(&path, uprobes, i);
error_free:
        kvfree(uprobes);
        kfree(link);
        if (task)
                put_task_struct(task);
error_path_put:
        path_put(&path);
        return err;
}
#else /* !CONFIG_UPROBES */
/* Stub used when CONFIG_UPROBES is not set: attaching is not supported. */
int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
        return -EOPNOTSUPP;
}
/* Stub used when CONFIG_UPROBES is not set: always reports cookie 0. */
static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
{
        return 0;
}
/* Stub used when CONFIG_UPROBES is not set: always reports entry ip 0. */
static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
{
        return 0;
}
#endif /* CONFIG_UPROBES */

__bpf_kfunc_start_defs();

/*
 * kfunc: report the is_return flag of the session run context found via
 * current->bpf_ctx.
 */
__bpf_kfunc bool bpf_session_is_return(void)
{
        struct bpf_session_run_ctx *sctx =
                container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx);

        return sctx->is_return;
}

/*
 * kfunc: return the data pointer of the session run context found via
 * current->bpf_ctx.
 */
__bpf_kfunc __u64 *bpf_session_cookie(void)
{
        struct bpf_session_run_ctx *sctx =
                container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx);

        return sctx->data;
}

__bpf_kfunc_end_defs();

/* BTF id set listing the session kfuncs defined above. */
BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids)
BTF_ID_FLAGS(func, bpf_session_is_return)
BTF_ID_FLAGS(func, bpf_session_cookie)
BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids)

/*
 * kfunc filter: the kfuncs in kprobe_multi_kfunc_set_ids may only be used by
 * kprobe session programs. Returns -EACCES when @kfunc_id is in the set and
 * @prog is not a session program, 0 otherwise.
 */
static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
        bool session_only = btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id);

        if (session_only && !is_kprobe_session(prog))
                return -EACCES;

        return 0;
}

/* kfunc set tying the session kfunc ids to bpf_kprobe_multi_filter(). */
static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = {
        .owner = THIS_MODULE,
        .set = &kprobe_multi_kfunc_set_ids,
        .filter = bpf_kprobe_multi_filter,
};

/*
 * Register the session kfunc set for BPF_PROG_TYPE_KPROBE programs and
 * return the result of register_btf_kfunc_id_set().
 */
static int __init bpf_kprobe_multi_kfuncs_init(void)
{
        return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set);
}

/* Run bpf_kprobe_multi_kfuncs_init() through the late_initcall() hook. */
late_initcall(bpf_kprobe_multi_kfuncs_init);





































































    1 



















    1 
    1 



























    1 


































    1 






































































    1 
    1 



















































































































    1 




    1 

































































































































































































































































































































































































































































































    1 























    1 
    1 

























































































































    1 





























































































































































































    1 

    1 








    1 



    1 


    1 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 Nokia, Inc.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 *
 * These functions manipulate an sctp event.   The struct ulpevent is used
 * to carry notifications and data to the ULP (sockets).
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    Jon Grimm             <jgrimm@us.ibm.com>
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Ardelle Fan            <ardelle.fan@intel.com>
 *    Sridhar Samudrala     <sri@us.ibm.com>
 */

#include <linux/slab.h>
#include <linux/types.h>
#include <linux/skbuff.h>
#include <net/sctp/structs.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

/* Forward declarations of file-local helpers.  sctp_ulpevent_receive_data()
 * is called from sctp_ulpevent_make_rcvmsg() below; the definitions of all
 * three are not in this part of the file (presumably further down).
 */
static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event,
                                       struct sctp_association *asoc);
static void sctp_ulpevent_release_data(struct sctp_ulpevent *event);
static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event);


/* Reset an ULP event: every field is zeroed, then the message flags and
 * the length used for receive-memory accounting (rmem_len) are recorded.
 */
static void sctp_ulpevent_init(struct sctp_ulpevent *event,
                               __u16 msg_flags,
                               unsigned int len)
{
        memset(event, 0, sizeof(*event));
        event->rmem_len = len;
        event->msg_flags = msg_flags;
}

/* Allocate an skb of 'size' bytes and initialize the ULP event embedded
 * in it with the given message flags.
 *
 * Returns the event, or NULL when the skb allocation fails.
 */
static struct sctp_ulpevent *sctp_ulpevent_new(int size, __u16 msg_flags,
                                               gfp_t gfp)
{
        struct sk_buff *skb = alloc_skb(size, gfp);
        struct sctp_ulpevent *event;

        if (!skb)
                return NULL;

        /* The event is accounted with the full truesize of its skb.  */
        event = sctp_skb2event(skb);
        sctp_ulpevent_init(event, msg_flags, skb->truesize);

        return event;
}

/* Tell whether MSG_NOTIFICATION is set in the event's msg_flags.
 * Returns 1 if it is, 0 otherwise.
 */
int sctp_ulpevent_is_notification(const struct sctp_ulpevent *event)
{
        return (event->msg_flags & MSG_NOTIFICATION) == MSG_NOTIFICATION;
}

/* Hold the association in case the msg_name needs read out of
 * the association.
 */
static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event,
                                           const struct sctp_association *asoc)
{
        struct sctp_chunk *chunk = event->chunk;
        struct sk_buff *skb;

        /* Cast away the const, as we are just wanting to
         * bump the reference count.
         */
        sctp_association_hold((struct sctp_association *)asoc);
        skb = sctp_event2skb(event);
        event->asoc = (struct sctp_association *)asoc;
        atomic_add(event->rmem_len, &event->asoc->rmem_alloc);
        sctp_skb_set_owner_r(skb, asoc->base.sk);
        if (chunk && chunk->head_skb && !chunk->head_skb->sk)
                chunk->head_skb->sk = asoc->base.sk;
}

/* Undo sctp_ulpevent_set_owner()'s accounting: subtract the event's
 * rmem_len from the association's rmem_alloc and drop the reference
 * held on the association.
 */
static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event)
{
        struct sctp_association *owner = event->asoc;

        atomic_sub(event->rmem_len, &owner->rmem_alloc);
        sctp_association_put(owner);
}

/* Build an SCTP_ASSOC_CHANGE notification.
 *
 * Socket Extensions for SCTP
 * 5.3.1.1 SCTP_ASSOC_CHANGE
 *
 * Communication notifications inform the ULP that an SCTP association
 * has either begun or ended.  The identifier for a new association is
 * provided by this notification.
 *
 * When the lower layer passes in @chunk (it will be an ABORT), the chunk
 * data follows the fixed notification header so that it ends up in
 * sac_info; without a chunk only the fixed header is emitted.
 *
 * Note: there is no field checking here.  @flags is accepted but not
 * used; sac_flags is always written as 0.
 *
 * Returns the new event, or NULL if the skb could not be allocated.
 */
struct sctp_ulpevent  *sctp_ulpevent_make_assoc_change(
        const struct sctp_association *asoc,
        __u16 flags, __u16 state, __u16 error, __u16 outbound,
        __u16 inbound, struct sctp_chunk *chunk, gfp_t gfp)
{
        struct sctp_assoc_change *sac;
        struct sctp_ulpevent *event;
        struct sk_buff *skb;

        if (!chunk) {
                /* No chunk: a fresh skb holding just the header.  */
                event = sctp_ulpevent_new(sizeof(*sac), MSG_NOTIFICATION,
                                          gfp);
                if (!event)
                        return NULL;

                skb = sctp_event2skb(event);
                sac = skb_put(skb, sizeof(*sac));
        } else {
                /* Duplicate the chunk data into a new skb whose headroom
                 * is large enough to hold the notification header.
                 */
                skb = skb_copy_expand(chunk->skb, sizeof(*sac), 0, gfp);
                if (!skb)
                        return NULL;

                /* The event fields live inside the copied skb.  */
                event = sctp_skb2event(skb);
                sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize);

                /* Prepend the notification structure ...  */
                sac = skb_push(skb, sizeof(*sac));

                /* ... and cut the buffer down to the header plus the
                 * chunk length minus the chunk header.
                 */
                skb_trim(skb, sizeof(*sac) +
                              ntohs(chunk->chunk_hdr->length) -
                              sizeof(struct sctp_chunkhdr));
        }

        /* sac_type: it should be SCTP_ASSOC_CHANGE.
         * sac_flags: 16 bits (unsigned integer), currently unused.
         */
        sac->sac_type = SCTP_ASSOC_CHANGE;
        sac->sac_flags = 0;

        /* sac_length: total length of the notification data, including
         * the notification header.
         */
        sac->sac_length = skb->len;

        /* sac_state: one of a number of values that communicate the
         * event that happened to the association.
         *
         * sac_error: if the state was reached due to an error condition
         * (e.g. COMMUNICATION_LOST), any relevant error information is
         * available in this field.  This corresponds to the protocol
         * error codes defined in [SCTP].
         */
        sac->sac_state = state;
        sac->sac_error = error;

        /* sac_outbound_streams / sac_inbound_streams: the maximum number
         * of streams allowed in each direction.
         */
        sac->sac_outbound_streams = outbound;
        sac->sac_inbound_streams = inbound;

        /* sac_assoc_id: the identifier for the association.  All
         * notifications for a given association have the same association
         * identifier.  For TCP style socket, this field is ignored.
         */
        sctp_ulpevent_set_owner(event, asoc);
        sac->sac_assoc_id = sctp_assoc2id(asoc);

        return event;
}

/* Build an SCTP_PEER_ADDR_CHANGE notification.
 *
 * Socket Extensions for SCTP - draft-01
 * 5.3.1.2 SCTP_PEER_ADDR_CHANGE
 *
 * When a destination address on a multi-homed peer encounters a change
 * an interface details event is sent.
 *
 * @aaddr is copied in full (sizeof(struct sockaddr_storage)) into the
 * notification.  @flags is accepted but not used; spc_flags is always
 * written as 0.
 *
 * Returns the new event, or NULL if the allocation fails.
 */
static struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
        const struct sctp_association *asoc,
        const struct sockaddr_storage *aaddr,
        int flags, int state, int error, gfp_t gfp)
{
        struct sctp_paddr_change *spc;
        struct sctp_ulpevent *event;
        struct sk_buff *skb;

        event = sctp_ulpevent_new(sizeof(*spc), MSG_NOTIFICATION, gfp);
        if (!event)
                return NULL;

        skb = sctp_event2skb(event);
        spc = skb_put(skb, sizeof(*spc));

        /* spc_type: it should be SCTP_PEER_ADDR_CHANGE.
         * spc_flags: 16 bits (unsigned integer), currently unused.
         * spc_length: total length of the notification data, including
         * the notification header.
         */
        spc->spc_type = SCTP_PEER_ADDR_CHANGE;
        spc->spc_flags = 0;
        spc->spc_length = sizeof(*spc);

        /* spc_state: one of a number of values that communicate the event
         * that happened to the address.
         *
         * spc_error: if the state was reached due to any error condition
         * (e.g. ADDRESS_UNREACHABLE), any relevant error information is
         * available in this field.
         */
        spc->spc_state = state;
        spc->spc_error = error;

        /* spc_assoc_id: the identifier for the association.  All
         * notifications for a given association have the same association
         * identifier.  For TCP style socket, this field is ignored.
         */
        sctp_ulpevent_set_owner(event, asoc);
        spc->spc_assoc_id = sctp_assoc2id(asoc);

        /* spc_aaddr: the remote peer's address that is encountering the
         * change of state.
         */
        memcpy(&spc->spc_aaddr, aaddr, sizeof(struct sockaddr_storage));

        /* Map ipv4 address into v4-mapped-on-v6 address.  */
        sctp_get_pf_specific(asoc->base.sk->sk_family)->addr_to_user(
                                        sctp_sk(asoc->base.sk),
                                        (union sctp_addr *)&spc->spc_aaddr);

        return event;
}

/* Queue an SCTP_PEER_ADDR_CHANGE notification for @transport's address.
 *
 * Nothing is done while the association's state is below
 * SCTP_STATE_ESTABLISHED.  The event is allocated with GFP_ATOMIC; if the
 * allocation fails the notification is silently not delivered.
 */
void sctp_ulpevent_notify_peer_addr_change(struct sctp_transport *transport,
                                           int state, int error)
{
        struct sctp_association *asoc = transport->asoc;
        struct sctp_ulpevent *event;
        struct sockaddr_storage addr;

        if (asoc->state < SCTP_STATE_ESTABLISHED)
                return;

        /* Zero the whole storage first: only sockaddr_len bytes of it are
         * filled from the transport address, but the full structure is
         * copied into the notification.
         */
        memset(&addr, 0, sizeof(addr));
        memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);

        event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, 0, state,
                                                    error, GFP_ATOMIC);
        if (!event)
                return;

        asoc->stream.si->enqueue_event(&asoc->ulpq, event);
}

/* Build an SCTP_REMOTE_ERROR notification.
 *
 * Note: this assumes that chunk->skb->data already points to the
 * operation error payload.
 *
 * Socket Extensions for SCTP - draft-01
 * 5.3.1.3 SCTP_REMOTE_ERROR
 *
 * A remote peer may send an Operational Error message to its peer.
 * This message indicates a variety of error conditions on an
 * association.  The entire error TLV as it appears on the wire is
 * included in a SCTP_REMOTE_ERROR event.  Please refer to the SCTP
 * specification [SCTP] and any extensions for a list of possible
 * error formats.
 *
 * chunk->skb is advanced past the whole cause (header plus padded body)
 * whether or not the copy succeeds.  @flags is accepted but not used;
 * sre_flags is always 0.
 *
 * Returns the new event, or NULL if the skb copy fails.
 */
struct sctp_ulpevent *
sctp_ulpevent_make_remote_error(const struct sctp_association *asoc,
                                struct sctp_chunk *chunk, __u16 flags,
                                gfp_t gfp)
{
        struct sctp_errhdr *ch = (struct sctp_errhdr *)(chunk->skb->data);
        struct sctp_remote_error *sre;
        struct sctp_ulpevent *event;
        struct sk_buff *skb;
        __be16 cause;
        int elen;

        /* Remember what is needed from the error header before it is
         * pulled off: the cause code and the padded length of its body.
         */
        cause = ch->cause;
        elen = SCTP_PAD4(ntohs(ch->length)) - sizeof(*ch);

        /* Step over the ERROR header.  */
        skb_pull(chunk->skb, sizeof(*ch));

        /* Duplicate what is left into a new skb that has headroom for the
         * notification header.
         */
        skb = skb_copy_expand(chunk->skb, sizeof(*sre), 0, gfp);

        /* Consume the rest of the cause TLV from the chunk; this is done
         * before looking at the result of the copy.
         */
        skb_pull(chunk->skb, elen);
        if (!skb)
                return NULL;

        /* The event fields live inside the copied skb.  */
        event = sctp_skb2event(skb);
        sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize);

        sre = skb_push(skb, sizeof(*sre));

        /* Keep only the header and this cause's body.  */
        skb_trim(skb, sizeof(*sre) + elen);

        /* RFC6458, Section 6.1.3. SCTP_REMOTE_ERROR */
        memset(sre, 0, sizeof(*sre));
        sre->sre_type = SCTP_REMOTE_ERROR;
        sre->sre_flags = 0;
        sre->sre_length = skb->len;
        sre->sre_error = cause;

        sctp_ulpevent_set_owner(event, asoc);
        sre->sre_assoc_id = sctp_assoc2id(asoc);

        return event;
}

/* Build an SCTP_SEND_FAILED notification.
 *
 * Socket Extensions for SCTP - draft-01
 * 5.3.1.4 SCTP_SEND_FAILED
 *
 * The notification consists of the fixed struct sctp_send_failed header
 * followed by the chunk's payload, i.e. the chunk data with the common
 * chunk header and DATA header removed.
 *
 * Returns the new event, or NULL if the skb copy fails.
 */
struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
        const struct sctp_association *asoc, struct sctp_chunk *chunk,
        __u16 flags, __u32 error, gfp_t gfp)
{
        struct sctp_send_failed *ssf;
        struct sctp_ulpevent *event;
        struct sk_buff *skb;
        int len;

        /* Length taken from the chunk header; the final skb_trim() below
         * cuts the copy down to this length (less the DATA header).
         */
        len = ntohs(chunk->chunk_hdr->length);

        /* Copy the chunk into an skb with enough headroom to prepend the
         * notification; no extra tailroom is requested.
         */
        skb = skb_copy_expand(chunk->skb, sizeof(*ssf), 0, gfp);
        if (!skb)
                return NULL;

        /* Drop the common chunk header and DATA header.  */
        skb_pull(skb, sctp_datachk_len(&asoc->stream));
        len -= sctp_datachk_len(&asoc->stream);

        /* The event fields live inside the copied skb.  */
        event = sctp_skb2event(skb);
        sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize);

        ssf = skb_push(skb, sizeof(*ssf));

        /* ssf_type: it should be SCTP_SEND_FAILED.  */
        ssf->ssf_type = SCTP_SEND_FAILED;

        /* ssf_flags: 16 bits (unsigned integer), one of:
         *
         * SCTP_DATA_UNSENT - Indicates that the data was never put on
         *                    the wire.
         *
         * SCTP_DATA_SENT   - Indicates that the data was put on the wire.
         *                    Note that this does not necessarily mean that
         *                    the data was (or was not) successfully
         *                    delivered.
         */
        ssf->ssf_flags = flags;

        /* ssf_length: total length of the notification data, including
         * the notification header.  The skb is trimmed to match.
         */
        ssf->ssf_length = sizeof(*ssf) + len;
        skb_trim(skb, ssf->ssf_length);

        /* ssf_error: the reason why the send failed; if set, a SCTP
         * protocol error code as defined in [SCTP] section 3.3.10.
         */
        ssf->ssf_error = error;

        /* ssf_info: the original send information associated with the
         * undelivered message.
         */
        memcpy(&ssf->ssf_info, &chunk->sinfo, sizeof(struct sctp_sndrcvinfo));

        /* Per TSVWG discussion with Randy. Allow the application to
         * reassemble a fragmented message.
         */
        ssf->ssf_info.sinfo_flags = chunk->chunk_hdr->flags;

        /* ssf_assoc_id: the identifier for the association.  All
         * notifications for a given association have the same association
         * identifier.  For TCP style socket, this field is ignored.
         */
        sctp_ulpevent_set_owner(event, asoc);
        ssf->ssf_assoc_id = sctp_assoc2id(asoc);

        return event;
}

/* Build an SCTP_SEND_FAILED_EVENT notification.
 *
 * The notification is the fixed struct sctp_send_failed_event header
 * followed by the chunk's payload (chunk data with the data chunk header
 * removed).  The send information is taken from chunk->sinfo, except for
 * snd_flags, which carries the chunk header flags.
 *
 * Returns the new event, or NULL if the skb copy fails.
 */
struct sctp_ulpevent *sctp_ulpevent_make_send_failed_event(
        const struct sctp_association *asoc, struct sctp_chunk *chunk,
        __u16 flags, __u32 error, gfp_t gfp)
{
        struct sctp_send_failed_event *ssf;
        struct sctp_ulpevent *event;
        struct sk_buff *skb;
        int len;

        /* Copy with headroom for the notification header.  */
        skb = skb_copy_expand(chunk->skb, sizeof(*ssf), 0, gfp);
        if (!skb)
                return NULL;

        /* Payload length: chunk length less the data chunk header.  */
        len = ntohs(chunk->chunk_hdr->length);
        len -= sctp_datachk_len(&asoc->stream);

        skb_pull(skb, sctp_datachk_len(&asoc->stream));

        /* The event fields live inside the copied skb.  */
        event = sctp_skb2event(skb);
        sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize);

        ssf = skb_push(skb, sizeof(*ssf));

        /* Fixed header; the skb is trimmed to the advertised length.  */
        ssf->ssf_type = SCTP_SEND_FAILED_EVENT;
        ssf->ssf_flags = flags;
        ssf->ssf_error = error;
        ssf->ssf_length = sizeof(*ssf) + len;
        skb_trim(skb, ssf->ssf_length);

        /* Original send parameters of the failed message.  */
        ssf->ssfe_info.snd_sid = chunk->sinfo.sinfo_stream;
        ssf->ssfe_info.snd_flags = chunk->chunk_hdr->flags;
        ssf->ssfe_info.snd_ppid = chunk->sinfo.sinfo_ppid;
        ssf->ssfe_info.snd_context = chunk->sinfo.sinfo_context;
        ssf->ssfe_info.snd_assoc_id = chunk->sinfo.sinfo_assoc_id;

        sctp_ulpevent_set_owner(event, asoc);
        ssf->ssf_assoc_id = sctp_assoc2id(asoc);

        return event;
}

/* Build an SCTP_SHUTDOWN_EVENT notification.
 *
 * Socket Extensions for SCTP - draft-01
 * 5.3.1.5 SCTP_SHUTDOWN_EVENT
 *
 * @flags is accepted but not used; sse_flags is always written as 0.
 *
 * Returns the new event, or NULL if the allocation fails.
 */
struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
        const struct sctp_association *asoc,
        __u16 flags, gfp_t gfp)
{
        struct sctp_shutdown_event *sse;
        struct sctp_ulpevent *event;
        struct sk_buff *skb;

        event = sctp_ulpevent_new(sizeof(*sse), MSG_NOTIFICATION, gfp);
        if (!event)
                return NULL;

        skb = sctp_event2skb(event);
        sse = skb_put(skb, sizeof(*sse));

        /* sse_type: it should be SCTP_SHUTDOWN_EVENT.
         * sse_flags: 16 bits (unsigned integer), currently unused.
         * sse_length: total length of the notification data, including
         * the notification header.
         */
        sse->sse_type = SCTP_SHUTDOWN_EVENT;
        sse->sse_flags = 0;
        sse->sse_length = sizeof(*sse);

        /* sse_assoc_id: the identifier for the association.  All
         * notifications for a given association have the same association
         * identifier.  For TCP style socket, this field is ignored.
         */
        sctp_ulpevent_set_owner(event, asoc);
        sse->sse_assoc_id = sctp_assoc2id(asoc);

        return event;
}

/* Build an SCTP_ADAPTATION_INDICATION notification.
 *
 * Socket Extensions for SCTP
 * 5.3.1.6 SCTP_ADAPTATION_INDICATION
 *
 * The indication value reported is the one stored for the peer in
 * asoc->peer.adaptation_ind.
 *
 * Returns the new event, or NULL if the allocation fails.
 */
struct sctp_ulpevent *sctp_ulpevent_make_adaptation_indication(
        const struct sctp_association *asoc, gfp_t gfp)
{
        struct sctp_adaptation_event *sai;
        struct sctp_ulpevent *event;
        struct sk_buff *skb;

        event = sctp_ulpevent_new(sizeof(*sai), MSG_NOTIFICATION, gfp);
        if (!event)
                return NULL;

        skb = sctp_event2skb(event);
        sai = skb_put(skb, sizeof(*sai));

        sai->sai_type = SCTP_ADAPTATION_INDICATION;
        sai->sai_length = sizeof(*sai);
        sai->sai_flags = 0;
        sai->sai_adaptation_ind = asoc->peer.adaptation_ind;

        sctp_ulpevent_set_owner(event, asoc);
        sai->sai_assoc_id = sctp_assoc2id(asoc);

        return event;
}

/* A message has been received.  Package it as an event to pass to the
 * upper layers.  The sndrcvinfo is calculated even if it is filtered out
 * later.
 *
 * Socket Extensions for SCTP
 * 5.2.2 SCTP Header Information Structure (SCTP_SNDRCV)
 *
 * Returns the new event, or NULL when the receive buffer limit has been
 * reached, receive memory cannot be scheduled, the skb cannot be cloned,
 * or the TSN cannot be marked in the peer's tsn_map.
 */
struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
                                                struct sctp_chunk *chunk,
                                                gfp_t gfp)
{
        struct sock *sk = asoc->base.sk;
        struct sctp_ulpevent *event;
        size_t datalen, padding;
        struct sk_buff *skb;
        int rx_count;

        /* See whether there is room for this new skb; drop the frame if
         * not.  A non-zero rcvbuf_policy on the endpoint makes the
         * association's own counter the one that is compared against the
         * socket's receive buffer; otherwise the socket's counter is used.
         */
        rx_count = asoc->ep->rcvbuf_policy ?
                        atomic_read(&asoc->rmem_alloc) :
                        atomic_read(&sk->sk_rmem_alloc);

        datalen = ntohs(chunk->chunk_hdr->length);

        if (rx_count >= sk->sk_rcvbuf)
                return NULL;
        if (!sk_rmem_schedule(sk, chunk->skb, datalen))
                return NULL;

        /* Clone the original skb, sharing the data.  */
        skb = skb_clone(chunk->skb, gfp);
        if (!skb)
                return NULL;

        /* All memory allocations for this chunk have succeeded, so it can
         * now be marked as received and the tsn_map is updated correctly.
         */
        if (sctp_tsnmap_mark(&asoc->peer.tsn_map,
                             ntohl(chunk->subh.data_hdr->tsn),
                             chunk->transport)) {
                kfree_skb(skb);
                return NULL;
        }

        /* Work out the padding first, so the wrong length is not passed
         * up to the user by accident.
         *
         * RFC 2960 - Section 3.2  Chunk Field Descriptions
         *
         * The total length of a chunk(including Type, Length and Value
         * fields) MUST be a multiple of 4 bytes.  If the length of the
         * chunk is not a multiple of 4 bytes, the sender MUST pad the
         * chunk with all zero bytes and this padding is not included in
         * the chunk length field.  The sender should never pad with more
         * than 3 bytes.  The receiver MUST ignore the padding bytes.
         */
        padding = SCTP_PAD4(datalen) - datalen;

        /* Restrict the clone to just this chunk's data.  */
        skb_trim(skb, chunk->chunk_end - padding - skb->data);

        /* The event fields live inside the cloned skb.  */
        event = sctp_skb2event(skb);

        /* Start with msg_flags 0.  Because this is a clone of the original
         * skb, only the data of this chunk is accounted for here; other
         * chunks are accounted separately.
         */
        sctp_ulpevent_init(event, 0, skb->len + sizeof(struct sk_buff));

        /* Keep the chunk around: it is needed for getting the IP headers
         * later in recvmsg.
         */
        sctp_chunk_hold(chunk);
        event->chunk = chunk;

        sctp_ulpevent_receive_data(event, asoc);

        event->stream = ntohs(chunk->subh.data_hdr->stream);
        event->tsn = ntohl(chunk->subh.data_hdr->tsn);
        event->msg_flags |= chunk->chunk_hdr->flags;

        if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) {
                event->flags |= SCTP_UNORDERED;
                event->cumtsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
        }

        return event;
}

/* Build a partial delivery notification.
 *
 * 5.3.1.7 SCTP_PARTIAL_DELIVERY_EVENT
 *
 *   When a receiver is engaged in a partial delivery of a
 *   message this notification will be used to indicate
 *   various events.
 *
 * Returns the new event, or NULL if the allocation fails.
 */
struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
                                        const struct sctp_association *asoc,
                                        __u32 indication, __u32 sid, __u32 seq,
                                        __u32 flags, gfp_t gfp)
{
        struct sctp_pdapi_event *pd;
        struct sctp_ulpevent *event;
        struct sk_buff *skb;

        event = sctp_ulpevent_new(sizeof(*pd), MSG_NOTIFICATION, gfp);
        if (!event)
                return NULL;

        skb = sctp_event2skb(event);
        pd = skb_put(skb, sizeof(*pd));

        /* pdapi_type: it should be SCTP_PARTIAL_DELIVERY_EVENT.
         *
         * pdapi_length: 32 bits (unsigned integer)
         *   Total length of the notification data, including the
         *   notification header.  It will generally be sizeof (struct
         *   sctp_pdapi_event).
         */
        pd->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT;
        pd->pdapi_length = sizeof(*pd);

        /* pdapi_flags: 16 bits (unsigned integer); filled from the
         * caller's flags argument.
         */
        pd->pdapi_flags = flags;

        /* pdapi_indication: 32 bits (unsigned integer)
         *   The indication being sent to the application.
         */
        pd->pdapi_indication = indication;

        /* Stream and sequence values supplied by the caller.  */
        pd->pdapi_stream = sid;
        pd->pdapi_seq = seq;

        /* pdapi_assoc_id: sizeof (sctp_assoc_t)
         *   The identifier for the association.
         */
        sctp_ulpevent_set_owner(event, asoc);
        pd->pdapi_assoc_id = sctp_assoc2id(asoc);

        return event;
}

/* Build an SCTP_AUTHENTICATION_EVENT notification.
 *
 * @key_id is reported as auth_keynumber and @indication as
 * auth_indication; auth_flags and auth_altkeynumber are always zero.
 * Returns the new event with @asoc set as its owner, or NULL when
 * sctp_ulpevent_new() fails.
 */
struct sctp_ulpevent *sctp_ulpevent_make_authkey(
        const struct sctp_association *asoc, __u16 key_id,
        __u32 indication, gfp_t gfp)
{
        struct sctp_authkey_event *ak;
        struct sctp_ulpevent *event;

        event = sctp_ulpevent_new(sizeof(*ak), MSG_NOTIFICATION, gfp);
        if (!event)
                return NULL;

        ak = skb_put(sctp_event2skb(event), sizeof(*ak));

        /* Notification header. */
        ak->auth_type = SCTP_AUTHENTICATION_EVENT;
        ak->auth_flags = 0;
        ak->auth_length = sizeof(*ak);

        /* Key information and the indication being reported. */
        ak->auth_keynumber = key_id;
        ak->auth_altkeynumber = 0;
        ak->auth_indication = indication;

        /* Attach the event to the association and publish its identifier. */
        sctp_ulpevent_set_owner(event, asoc);
        ak->auth_assoc_id = sctp_assoc2id(asoc);

        return event;
}

/*
 * Socket Extensions for SCTP
 * 6.3.10. SCTP_SENDER_DRY_EVENT
 *
 * Build a sender-dry notification for @asoc.  The flags field is always
 * zero.  Returns NULL when sctp_ulpevent_new() fails.
 */
struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event(
        const struct sctp_association *asoc, gfp_t gfp)
{
        struct sctp_sender_dry_event *dry;
        struct sctp_ulpevent *ev;

        ev = sctp_ulpevent_new(sizeof(*dry), MSG_NOTIFICATION, gfp);
        if (ev == NULL)
                return NULL;

        dry = skb_put(sctp_event2skb(ev), sizeof(*dry));

        /* Notification header. */
        dry->sender_dry_type = SCTP_SENDER_DRY_EVENT;
        dry->sender_dry_flags = 0;
        dry->sender_dry_length = sizeof(*dry);

        /* Attach the event to the association and publish its identifier. */
        sctp_ulpevent_set_owner(ev, asoc);
        dry->sender_dry_assoc_id = sctp_assoc2id(asoc);

        return ev;
}

/* Build an SCTP_STREAM_RESET_EVENT notification.
 *
 * The notification carries @stream_num stream identifiers after the
 * fixed part; each entry of @stream_list is converted with ntohs()
 * before being stored.  Returns NULL when sctp_ulpevent_new() fails.
 */
struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event(
        const struct sctp_association *asoc, __u16 flags, __u16 stream_num,
        __be16 *stream_list, gfp_t gfp)
{
        struct sctp_stream_reset_event *sreset;
        struct sctp_ulpevent *event;
        int total, idx;

        /* Fixed part plus two bytes per listed stream. */
        total = sizeof(*sreset) + stream_num * 2;

        event = sctp_ulpevent_new(total, MSG_NOTIFICATION, gfp);
        if (event == NULL)
                return NULL;

        sreset = skb_put(sctp_event2skb(event), total);

        /* Notification header. */
        sreset->strreset_type = SCTP_STREAM_RESET_EVENT;
        sreset->strreset_flags = flags;
        sreset->strreset_length = total;

        /* Attach the event to the association and publish its identifier. */
        sctp_ulpevent_set_owner(event, asoc);
        sreset->strreset_assoc_id = sctp_assoc2id(asoc);

        idx = 0;
        while (idx < stream_num) {
                sreset->strreset_stream_list[idx] = ntohs(stream_list[idx]);
                idx++;
        }

        return event;
}

/* Build an SCTP_ASSOC_RESET_EVENT notification carrying @flags and the
 * given local and remote TSN values.  Returns NULL when
 * sctp_ulpevent_new() fails.
 */
struct sctp_ulpevent *sctp_ulpevent_make_assoc_reset_event(
        const struct sctp_association *asoc, __u16 flags, __u32 local_tsn,
        __u32 remote_tsn, gfp_t gfp)
{
        struct sctp_assoc_reset_event *ar;
        struct sctp_ulpevent *ev;

        ev = sctp_ulpevent_new(sizeof(*ar), MSG_NOTIFICATION, gfp);
        if (ev == NULL)
                return NULL;

        ar = skb_put(sctp_event2skb(ev), sizeof(*ar));

        /* Notification header. */
        ar->assocreset_type = SCTP_ASSOC_RESET_EVENT;
        ar->assocreset_flags = flags;
        ar->assocreset_length = sizeof(*ar);

        /* Attach the event to the association and publish its identifier. */
        sctp_ulpevent_set_owner(ev, asoc);
        ar->assocreset_assoc_id = sctp_assoc2id(asoc);

        /* TSN values supplied by the caller. */
        ar->assocreset_local_tsn = local_tsn;
        ar->assocreset_remote_tsn = remote_tsn;

        return ev;
}

/* Build an SCTP_STREAM_CHANGE_EVENT notification carrying @flags and the
 * given inbound/outbound stream counts.  Returns NULL when
 * sctp_ulpevent_new() fails.
 */
struct sctp_ulpevent *sctp_ulpevent_make_stream_change_event(
        const struct sctp_association *asoc, __u16 flags,
        __u32 strchange_instrms, __u32 strchange_outstrms, gfp_t gfp)
{
        struct sctp_stream_change_event *sc;
        struct sctp_ulpevent *ev;

        ev = sctp_ulpevent_new(sizeof(*sc), MSG_NOTIFICATION, gfp);
        if (ev == NULL)
                return NULL;

        sc = skb_put(sctp_event2skb(ev), sizeof(*sc));

        /* Notification header. */
        sc->strchange_type = SCTP_STREAM_CHANGE_EVENT;
        sc->strchange_flags = flags;
        sc->strchange_length = sizeof(*sc);

        /* Attach the event to the association and publish its identifier. */
        sctp_ulpevent_set_owner(ev, asoc);
        sc->strchange_assoc_id = sctp_assoc2id(asoc);

        /* Stream counts supplied by the caller. */
        sc->strchange_instrms = strchange_instrms;
        sc->strchange_outstrms = strchange_outstrms;

        return ev;
}

/* Return the notification type, assuming this is a notification
 * event.
 */
__u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event)
{
        union sctp_notification *notification;
        struct sk_buff *skb;

        skb = sctp_event2skb(event);
        notification = (union sctp_notification *) skb->data;
        return notification->sn_header.sn_type;
}

/* RFC6458, Section 5.3.2. SCTP Header Information Structure
 * (SCTP_SNDRCV, DEPRECATED)
 */
void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event,
                                   struct msghdr *msghdr)
{
        struct sctp_sndrcvinfo sinfo;

        if (sctp_ulpevent_is_notification(event))
                return;

        memset(&sinfo, 0, sizeof(sinfo));
        sinfo.sinfo_stream = event->stream;
        sinfo.sinfo_ssn = event->ssn;
        sinfo.sinfo_ppid = event->ppid;
        sinfo.sinfo_flags = event->flags;
        sinfo.sinfo_tsn = event->tsn;
        sinfo.sinfo_cumtsn = event->cumtsn;
        sinfo.sinfo_assoc_id = sctp_assoc2id(event->asoc);
        /* Context value that is set via SCTP_CONTEXT socket option. */
        sinfo.sinfo_context = event->asoc->default_rcv_context;
        /* These fields are not used while receiving. */
        sinfo.sinfo_timetolive = 0;

        put_cmsg(msghdr, IPPROTO_SCTP, SCTP_SNDRCV,
                 sizeof(sinfo), &sinfo);
}

/* RFC6458, Section 5.3.5 SCTP Receive Information Structure
 * (SCTP_SNDRCV)
 */
void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event,
                                struct msghdr *msghdr)
{
        struct sctp_rcvinfo rinfo;

        if (sctp_ulpevent_is_notification(event))
                return;

        memset(&rinfo, 0, sizeof(struct sctp_rcvinfo));
        rinfo.rcv_sid = event->stream;
        rinfo.rcv_ssn = event->ssn;
        rinfo.rcv_ppid = event->ppid;
        rinfo.rcv_flags = event->flags;
        rinfo.rcv_tsn = event->tsn;
        rinfo.rcv_cumtsn = event->cumtsn;
        rinfo.rcv_assoc_id = sctp_assoc2id(event->asoc);
        rinfo.rcv_context = event->asoc->default_rcv_context;

        put_cmsg(msghdr, IPPROTO_SCTP, SCTP_RCVINFO,
                 sizeof(rinfo), &rinfo);
}

/* RFC6458, Section 5.3.6. SCTP Next Receive Information Structure
 * (SCTP_NXTINFO)
 */
static void __sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event,
                                         struct msghdr *msghdr,
                                         const struct sk_buff *skb)
{
        struct sctp_nxtinfo nxtinfo;

        memset(&nxtinfo, 0, sizeof(nxtinfo));
        nxtinfo.nxt_sid = event->stream;
        nxtinfo.nxt_ppid = event->ppid;
        nxtinfo.nxt_flags = event->flags;
        if (sctp_ulpevent_is_notification(event))
                nxtinfo.nxt_flags |= SCTP_NOTIFICATION;
        nxtinfo.nxt_length = skb->len;
        nxtinfo.nxt_assoc_id = sctp_assoc2id(event->asoc);

        put_cmsg(msghdr, IPPROTO_SCTP, SCTP_NXTINFO,
                 sizeof(nxtinfo), &nxtinfo);
}

/* Peek at the next skb queued on @sk without blocking and, when there is
 * one, describe it in @msghdr as an SCTP_NXTINFO control message.
 * @event is not used; the description comes from the peeked skb.
 */
void sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event,
                                struct msghdr *msghdr,
                                struct sock *sk)
{
        struct sk_buff *next;
        int err;

        next = sctp_skb_recv_datagram(sk, MSG_PEEK | MSG_DONTWAIT, &err);
        if (!next)
                return;

        __sctp_ulpevent_read_nxtinfo(sctp_skb2event(next), msghdr, next);

        /* Just release refcount here. */
        kfree_skb(next);
}

/* Do accounting for bytes received and hold a reference to the association
 * for each skb.
 */
static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event,
                                       struct sctp_association *asoc)
{
        struct sk_buff *skb, *frag;

        skb = sctp_event2skb(event);
        /* Set the owner and charge rwnd for bytes received.  */
        sctp_ulpevent_set_owner(event, asoc);
        sctp_assoc_rwnd_decrease(asoc, skb_headlen(skb));

        if (!skb->data_len)
                return;

        /* Note:  Not clearing the entire event struct as this is just a
         * fragment of the real event.  However, we still need to do rwnd
         * accounting.
         * In general, the skb passed from IP can have only 1 level of
         * fragments. But we allow multiple levels of fragments.
         */
        skb_walk_frags(skb, frag)
                sctp_ulpevent_receive_data(sctp_skb2event(frag), asoc);
}

/* Account for bytes the user has just read: release every fragment,
 * give the skb's full length back to the association's rwnd, then drop
 * the chunk and owner references held by @event.
 *
 * The receive buffer is tracked per socket, but a UDP-style socket may
 * carry several associations, so the owning association is taken from
 * the event stored in the skb's private area.
 */
static void sctp_ulpevent_release_data(struct sctp_ulpevent *event)
{
        struct sk_buff *head = sctp_event2skb(event);
        struct sk_buff *piece;
        unsigned int total = head->len;

        if (head->data_len) {
                /* Fragment lists can nest: IP delivers a single level,
                 * but SCTP reassembly can add more, so each fragment is
                 * released recursively.
                 */
                skb_walk_frags(head, piece) {
                        sctp_ulpevent_release_frag_data(sctp_skb2event(piece));
                }
        }

        sctp_assoc_rwnd_increase(event->asoc, total);
        sctp_chunk_put(event->chunk);
        sctp_ulpevent_release_owner(event);
}

static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event)
{
        struct sk_buff *skb, *frag;

        skb = sctp_event2skb(event);

        if (!skb->data_len)
                goto done;

        /* Don't forget the fragments. */
        skb_walk_frags(skb, frag) {
                /* NOTE:  skb_shinfos are recursive. Although IP returns
                 * skb's with only 1 level of fragments, SCTP reassembly can
                 * increase the levels.
                 */
                sctp_ulpevent_release_frag_data(sctp_skb2event(frag));
        }

done:
        sctp_chunk_put(event->chunk);
        sctp_ulpevent_release_owner(event);
}

/* Free an owned ulpevent.  Data events go through the full release path
 * (rwnd update, chunk and owner references); notifications only drop the
 * owner reference.  The backing skb is freed in both cases.
 */
void sctp_ulpevent_free(struct sctp_ulpevent *event)
{
        if (!sctp_ulpevent_is_notification(event))
                sctp_ulpevent_release_data(event);
        else
                sctp_ulpevent_release_owner(event);

        kfree_skb(sctp_event2skb(event));
}

/* Drain @list, freeing every ulpevent on it.
 *
 * Returns the summed skb->len of the data (non-notification) events
 * that were discarded.
 */
unsigned int sctp_queue_purge_ulpevents(struct sk_buff_head *list)
{
        unsigned int unread = 0;
        struct sctp_ulpevent *ev;
        struct sk_buff *skb;

        for (skb = skb_dequeue(list); skb; skb = skb_dequeue(list)) {
                ev = sctp_skb2event(skb);

                /* Read the length before the event (and its skb) is freed. */
                if (!sctp_ulpevent_is_notification(ev))
                        unread += skb->len;

                sctp_ulpevent_free(ev);
        }

        return unread;
}


































































































































































































































    1 












    1 


















    1 

















    1 








































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
// SPDX-License-Identifier: GPL-2.0-only
/*
 * File: af_phonet.c
 *
 * Phonet protocols family
 *
 * Copyright (C) 2008 Nokia Corporation.
 *
 * Authors: Sakari Ailus <sakari.ailus@nokia.com>
 *          Rémi Denis-Courmont
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <asm/unaligned.h>
#include <net/sock.h>

#include <linux/if_phonet.h>
#include <linux/phonet.h>
#include <net/phonet/phonet.h>
#include <net/phonet/pn_dev.h>

/* Transport protocol registration */
static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly;

/* Look up the transport protocol registered in slot @protocol and take a
 * reference on its owning module.
 *
 * Returns NULL when @protocol is out of range, the slot is empty, or the
 * module reference cannot be taken.  A non-NULL result must be released
 * with phonet_proto_put().
 */
static const struct phonet_protocol *phonet_proto_get(unsigned int protocol)
{
        const struct phonet_protocol *found = NULL;

        if (protocol < PHONET_NPROTO) {
                rcu_read_lock();
                found = rcu_dereference(proto_tab[protocol]);
                if (found != NULL && !try_module_get(found->prot->owner))
                        found = NULL;
                rcu_read_unlock();
        }

        return found;
}

/* Drop the module reference taken by phonet_proto_get(). */
static inline void phonet_proto_put(const struct phonet_protocol *pp)
{
        struct module *owner = pp->prot->owner;

        module_put(owner);
}

/* protocol family functions */

/* Create a PF_PHONET socket (the .create hook of phonet_proto_family).
 *
 * The caller needs CAP_SYS_ADMIN.  When @protocol is 0 it is derived
 * from the socket type: SOCK_DGRAM selects PN_PROTO_PHONET and
 * SOCK_SEQPACKET selects PN_PROTO_PIPE.  If the protocol is not yet
 * registered, one attempt is made to load a module for it.
 *
 * Returns 0 on success, -EPERM without the capability,
 * -EPROTONOSUPPORT for an unknown protocol or a socket type that does
 * not match it, and -ENOMEM if the sock cannot be allocated.
 */
static int pn_socket_create(struct net *net, struct socket *sock, int protocol,
                            int kern)
{
        const struct phonet_protocol *pnp;
        struct pn_sock *pn;
        struct sock *sk;
        int err = -EPROTONOSUPPORT;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (protocol == 0) {
                /* Default protocol selection */
                if (sock->type == SOCK_DGRAM)
                        protocol = PN_PROTO_PHONET;
                else if (sock->type == SOCK_SEQPACKET)
                        protocol = PN_PROTO_PIPE;
                else
                        return -EPROTONOSUPPORT;
        }

        pnp = phonet_proto_get(protocol);
        if (!pnp) {
                /* Not registered yet: try loading the module once. */
                if (request_module("net-pf-%d-proto-%d", PF_PHONET,
                                   protocol) != 0)
                        return -EPROTONOSUPPORT;

                pnp = phonet_proto_get(protocol);
                if (!pnp)
                        return -EPROTONOSUPPORT;
        }

        /* err is still -EPROTONOSUPPORT for a socket type mismatch. */
        if (sock->type != pnp->sock_type)
                goto out;

        sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot, kern);
        if (!sk) {
                err = -ENOMEM;
                goto out;
        }

        sock_init_data(sock, sk);
        sock->state = SS_UNCONNECTED;
        sock->ops = pnp->ops;
        sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
        sk->sk_protocol = protocol;

        /* Start with no source/destination object and no resource. */
        pn = pn_sk(sk);
        pn->sobject = 0;
        pn->dobject = 0;
        pn->resource = 0;

        sk->sk_prot->init(sk);
        err = 0;

out:
        /* Drop the module reference taken by phonet_proto_get(). */
        phonet_proto_put(pnp);
        return err;
}

/* Socket family descriptor for PF_PHONET; registered with sock_register()
 * in phonet_init().  Socket creation is handled by pn_socket_create().
 */
static const struct net_proto_family phonet_proto_family = {
        .family = PF_PHONET,
        .create = pn_socket_create,
        .owner = THIS_MODULE,
};

/* Phonet device header operations */

/* Prepend the one-byte media header to @skb.
 *
 * The byte is taken from @saddr, or from the device address when @saddr
 * is NULL.  @daddr and @len are not used.  Returns 1 on success and -1
 * when @type is not ETH_P_PHONET; note that the byte has already been
 * pushed onto the skb when the type check fails.
 */
static int pn_header_create(struct sk_buff *skb, struct net_device *dev,
                                unsigned short type, const void *daddr,
                                const void *saddr, unsigned int len)
{
        u8 *media = skb_push(skb, 1);
        const u8 *from;

        if (type != ETH_P_PHONET)
                return -1;

        from = saddr ? saddr : dev->dev_addr;
        *media = *from;

        return 1;
}

static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
        const u8 *media = skb_mac_header(skb);
        *haddr = *media;
        return 1;
}

/* Link-layer header operations for Phonet devices: a one-byte media
 * header built by pn_header_create() and read by pn_header_parse().
 * Exported for use outside this file.
 */
const struct header_ops phonet_header_ops = {
        .create = pn_header_create,
        .parse = pn_header_parse,
};
EXPORT_SYMBOL(phonet_header_ops);

/*
 * Prepends an ISI header and sends a datagram.
 *
 * @skb:  payload; the Phonet header is pushed in front of it
 * @dev:  device the datagram is sent on (or looped back through)
 * @dst:  destination object, split into device and object header fields
 * @src:  source object, split the same way
 * @res:  resource ID stored in the header
 *
 * Returns 0 or a negative errno.  On the error paths taken via the
 * "drop" label the skb is freed here; otherwise it is handed to
 * netif_rx() (loopback) or dev_queue_xmit().
 */
static int pn_send(struct sk_buff *skb, struct net_device *dev,
                        u16 dst, u16 src, u8 res)
{
        struct phonethdr *ph;
        int err;

        /* Reject payloads that overflow the 16-bit length field or that
         * would not fit the device MTU once the header is added.
         */
        if (skb->len + 2 > 0xffff /* Phonet length field limit */ ||
            skb->len + sizeof(struct phonethdr) > dev->mtu) {
                err = -EMSGSIZE;
                goto drop;
        }

        /* Broadcast sending is not implemented */
        if (pn_addr(dst) == PNADDR_BROADCAST) {
                err = -EOPNOTSUPP;
                goto drop;
        }

        /* The transport header is the current data start; the network
         * header is the Phonet header pushed right in front of it.
         */
        skb_reset_transport_header(skb);
        WARN_ON(skb_headroom(skb) & 1); /* HW assumes word alignment */
        skb_push(skb, sizeof(struct phonethdr));
        skb_reset_network_header(skb);
        ph = pn_hdr(skb);
        ph->pn_rdev = pn_dev(dst);
        ph->pn_sdev = pn_dev(src);
        ph->pn_res = res;
        /* skb->len now includes the header: the stored length is the
         * payload length plus 2.
         */
        ph->pn_length = __cpu_to_be16(skb->len + 2 - sizeof(*ph));
        ph->pn_robj = pn_obj(dst);
        ph->pn_sobj = pn_obj(src);

        skb->protocol = htons(ETH_P_PHONET);
        skb->priority = 0;
        skb->dev = dev;

        if (skb->pkt_type == PACKET_LOOPBACK) {
                /* Local delivery: feed the packet back into the RX path. */
                skb_reset_mac_header(skb);
                skb_orphan(skb);
                err = netif_rx(skb) ? -ENOBUFS : 0;
        } else {
                err = dev_hard_header(skb, dev, ntohs(skb->protocol),
                                        NULL, NULL, skb->len);
                if (err < 0) {
                        err = -EHOSTUNREACH;
                        goto drop;
                }
                err = dev_queue_xmit(skb);
                /* Positive return codes are translated to an errno. */
                if (unlikely(err > 0))
                        err = net_xmit_errno(err);
        }

        return err;
drop:
        kfree_skb(skb);
        return err;
}

/* Send @len bytes from @data as a Phonet datagram.
 *
 * A fresh skb is allocated with GFP_ATOMIC, the payload is copied in and
 * the result is passed to pn_send().  The packet is marked for loopback
 * when phonet_address_lookup() reports the destination address as known.
 * Returns -ENOMEM if the skb cannot be allocated, otherwise whatever
 * pn_send() returns.
 */
static int pn_raw_send(const void *data, int len, struct net_device *dev,
                        u16 dst, u16 src, u8 res)
{
        struct sk_buff *skb;

        skb = alloc_skb(MAX_PHONET_HEADER + len, GFP_ATOMIC);
        if (!skb)
                return -ENOMEM;

        if (!phonet_address_lookup(dev_net(dev), pn_addr(dst)))
                skb->pkt_type = PACKET_LOOPBACK;

        /* Leave headroom for the headers, then append the payload. */
        skb_reserve(skb, MAX_PHONET_HEADER);
        __skb_put(skb, len);
        skb_copy_to_linear_data(skb, data, len);

        return pn_send(skb, dev, dst, src, res);
}

/*
 * Create a Phonet header for the skb and send it out. Returns
 * non-zero error code if failed. The skb is freed then.
 *
 * The destination object and resource come from @target when it is
 * given, otherwise from the socket.  The output device is, in order of
 * preference: the device the socket is bound to, a Phonet device for
 * loopback when the destination address is known locally or (for
 * destination object 0) a local socket is bound to the resource, or the
 * device chosen by the routing table.
 */
int pn_skb_send(struct sock *sk, struct sk_buff *skb,
                const struct sockaddr_pn *target)
{
        struct pn_sock *pn = pn_sk(sk);
        struct net *net = sock_net(sk);
        struct net_device *dev;
        u16 src = pn->sobject;
        u16 dst;
        u8 daddr, saddr, res;
        int err = -EHOSTUNREACH;

        if (target) {
                dst = pn_sockaddr_get_object(target);
                res = pn_sockaddr_get_resource(target);
        } else {
                dst = pn->dobject;
                res = pn->resource;
        }
        daddr = pn_addr(dst);

        if (sk->sk_bound_dev_if) {
                dev = dev_get_by_index(net, sk->sk_bound_dev_if);
        } else if (phonet_address_lookup(net, daddr) == 0) {
                dev = phonet_device_get(net);
                skb->pkt_type = PACKET_LOOPBACK;
        } else if (dst == 0) {
                /* Resource routing (small race until phonet_rcv()) */
                struct sock *res_sk = pn_find_sock_by_res(net, res);

                if (res_sk) {
                        sock_put(res_sk);
                        dev = phonet_device_get(net);
                        skb->pkt_type = PACKET_LOOPBACK;
                } else {
                        dev = phonet_route_output(net, daddr);
                }
        } else {
                dev = phonet_route_output(net, daddr);
        }

        /* No usable device: fail with -EHOSTUNREACH. */
        if (!dev || !(dev->flags & IFF_UP))
                goto drop;

        saddr = phonet_address_get(dev, daddr);
        if (saddr == PN_NO_ADDR)
                goto drop;

        /* Fill in the source address if the socket has none yet. */
        if (!pn_addr(src))
                src = pn_object(saddr, pn_obj(src));

        err = pn_send(skb, dev, dst, src, res);
        goto put_dev;

drop:
        kfree_skb(skb);
put_dev:
        dev_put(dev);
        return err;
}
EXPORT_SYMBOL(pn_skb_send);

/* Do not send an error message in response to an error message */
static inline int can_respond(struct sk_buff *skb)
{
        const struct phonethdr *ph;
        const struct phonetmsg *pm;
        u8 submsg_id;

        if (!pskb_may_pull(skb, 3))
                return 0;

        ph = pn_hdr(skb);
        if (ph->pn_res == PN_PREFIX && !pskb_may_pull(skb, 5))
                return 0;
        if (ph->pn_res == PN_COMMGR) /* indications */
                return 0;

        ph = pn_hdr(skb); /* re-acquires the pointer */
        pm = pn_msg(skb);
        if (pm->pn_msg_id != PN_COMMON_MESSAGE)
                return 1;
        submsg_id = (ph->pn_res == PN_PREFIX)
                ? pm->pn_e_submsg_id : pm->pn_submsg_id;
        if (submsg_id != PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP &&
                pm->pn_e_submsg_id != PN_COMM_SERVICE_NOT_IDENTIFIED_RESP)
                return 1;
        return 0;
}

static int send_obj_unreachable(struct sk_buff *rskb)
{
        const struct phonethdr *oph = pn_hdr(rskb);
        const struct phonetmsg *opm = pn_msg(rskb);
        struct phonetmsg resp;

        memset(&resp, 0, sizeof(resp));
        resp.pn_trans_id = opm->pn_trans_id;
        resp.pn_msg_id = PN_COMMON_MESSAGE;
        if (oph->pn_res == PN_PREFIX) {
                resp.pn_e_res_id = opm->pn_e_res_id;
                resp.pn_e_submsg_id = PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP;
                resp.pn_e_orig_msg_id = opm->pn_msg_id;
                resp.pn_e_status = 0;
        } else {
                resp.pn_submsg_id = PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP;
                resp.pn_orig_msg_id = opm->pn_msg_id;
                resp.pn_status = 0;
        }
        return pn_raw_send(&resp, sizeof(resp), rskb->dev,
                                pn_object(oph->pn_sdev, oph->pn_sobj),
                                pn_object(oph->pn_rdev, oph->pn_robj),
                                oph->pn_res);
}

static int send_reset_indications(struct sk_buff *rskb)
{
        struct phonethdr *oph = pn_hdr(rskb);
        static const u8 data[4] = {
                0x00 /* trans ID */, 0x10 /* subscribe msg */,
                0x00 /* subscription count */, 0x00 /* dummy */
        };

        return pn_raw_send(data, sizeof(data), rskb->dev,
                                pn_object(oph->pn_sdev, 0x00),
                                pn_object(oph->pn_rdev, oph->pn_robj),
                                PN_COMMGR);
}


/* packet type functions */

/*
 * Stuff received packets to associated sockets.
 * On error, returns non-zero and releases the skb.
 *
 * @skb:      received packet
 * @dev:      device the packet arrived on
 * @pkttype:  packet type handler this was dispatched through (unused)
 * @orig_dev: original receiving device (unused)
 *
 * A valid packet is, in order: delivered to broadcast listeners,
 * delivered to the socket bound to its resource (destination object 0),
 * delivered to the socket matching its destination address when that
 * address is local, or forwarded to the device given by the Phonet
 * routing table.
 */
static int phonet_rcv(struct sk_buff *skb, struct net_device *dev,
                        struct packet_type *pkttype,
                        struct net_device *orig_dev)
{
        struct net *net = dev_net(dev);
        struct phonethdr *ph;
        struct sockaddr_pn sa;
        u16 len;

        /* Work on a private skb, since it is modified below. */
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (!skb)
                return NET_RX_DROP;

        /* check we have at least a full Phonet header */
        if (!pskb_pull(skb, sizeof(struct phonethdr)))
                goto out;

        /* check that the advertised length is correct */
        ph = pn_hdr(skb);
        len = get_unaligned_be16(&ph->pn_length);
        if (len < 2)
                goto out;
        /* The length field is the payload length plus 2 (see pn_send()). */
        len -= 2;
        if ((len > skb->len) || pskb_trim(skb, len))
                goto out;
        skb_reset_transport_header(skb);

        pn_skb_get_dst_sockaddr(skb, &sa);

        /* check if this is broadcasted */
        if (pn_sockaddr_get_addr(&sa) == PNADDR_BROADCAST) {
                pn_deliver_sock_broadcast(net, skb);
                goto out;
        }

        /* resource routing */
        if (pn_sockaddr_get_object(&sa) == 0) {
                struct sock *sk = pn_find_sock_by_res(net, sa.spn_resource);
                if (sk)
                        return sk_receive_skb(sk, skb, 0);
        }

        /* check if we are the destination */
        if (phonet_address_lookup(net, pn_sockaddr_get_addr(&sa)) == 0) {
                /* Phonet packet input */
                struct sock *sk = pn_find_sock_by_sa(net, &sa);

                if (sk)
                        return sk_receive_skb(sk, skb, 0);

                /* No socket: report the failure unless the packet is
                 * itself an error message or an indication.
                 */
                if (can_respond(skb)) {
                        send_obj_unreachable(skb);
                        send_reset_indications(skb);
                }
        } else if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
                goto out; /* Race between address deletion and loopback */
        else {
                /* Phonet packet routing */
                struct net_device *out_dev;

                out_dev = phonet_route_output(net, pn_sockaddr_get_addr(&sa));
                if (!out_dev) {
                        net_dbg_ratelimited("No Phonet route to %02X\n",
                                            pn_sockaddr_get_addr(&sa));
                        goto out;
                }

                /* Restore the Phonet header pulled at the top. */
                __skb_push(skb, sizeof(struct phonethdr));
                skb->dev = out_dev;
                /* Refuse to send the packet back out the way it came in. */
                if (out_dev == dev) {
                        net_dbg_ratelimited("Phonet loop to %02X on %s\n",
                                            pn_sockaddr_get_addr(&sa),
                                            dev->name);
                        goto out_dev;
                }
                /* Some drivers (e.g. TUN) do not allocate HW header space */
                if (skb_cow_head(skb, out_dev->hard_header_len))
                        goto out_dev;

                if (dev_hard_header(skb, out_dev, ETH_P_PHONET, NULL, NULL,
                                        skb->len) < 0)
                        goto out_dev;
                dev_queue_xmit(skb);
                dev_put(out_dev);
                return NET_RX_SUCCESS;
out_dev:
                /* Drop the device reference, then fall through to free
                 * the skb.
                 */
                dev_put(out_dev);
        }

out:
        kfree_skb(skb);
        return NET_RX_DROP;
}

/* Packet handler for ETH_P_PHONET frames, dispatching to phonet_rcv().
 * Added with dev_add_pack() in phonet_init() and removed with
 * dev_remove_pack() on exit.
 */
static struct packet_type phonet_packet_type __read_mostly = {
        .type = cpu_to_be16(ETH_P_PHONET),
        .func = phonet_rcv,
};

/* Serialises writers of proto_tab[]; readers use RCU. */
static DEFINE_MUTEX(proto_tab_lock);

/* Register transport protocol @pp in slot @protocol.
 *
 * The underlying struct proto is registered first, then @pp is published
 * in proto_tab[] under proto_tab_lock.
 *
 * Returns 0 on success, -EINVAL if @protocol is out of range, the error
 * from proto_register() if that fails, or -EBUSY if the slot is already
 * taken.  On -EBUSY the struct proto is unregistered again, so a failed
 * call leaves nothing registered.
 */
int __init_or_module phonet_proto_register(unsigned int protocol,
                                const struct phonet_protocol *pp)
{
        int err;

        if (protocol >= PHONET_NPROTO)
                return -EINVAL;

        err = proto_register(pp->prot, 1);
        if (err)
                return err;

        mutex_lock(&proto_tab_lock);
        if (proto_tab[protocol])
                err = -EBUSY;
        else
                rcu_assign_pointer(proto_tab[protocol], pp);
        mutex_unlock(&proto_tab_lock);

        /* Slot already taken: undo proto_register() instead of leaving
         * the struct proto registered after a failed call.
         */
        if (err)
                proto_unregister(pp->prot);

        return err;
}
EXPORT_SYMBOL(phonet_proto_register);

/* Remove transport protocol @pp from slot @protocol.
 *
 * The slot must currently hold @pp (enforced with BUG_ON).  The slot is
 * cleared under proto_tab_lock, then synchronize_rcu() waits for RCU
 * readers such as phonet_proto_get() before the struct proto is
 * unregistered.
 */
void phonet_proto_unregister(unsigned int protocol,
                        const struct phonet_protocol *pp)
{
        mutex_lock(&proto_tab_lock);
        BUG_ON(proto_tab[protocol] != pp);
        RCU_INIT_POINTER(proto_tab[protocol], NULL);
        mutex_unlock(&proto_tab_lock);
        /* Wait for in-flight RCU readers before tearing down the proto. */
        synchronize_rcu();
        proto_unregister(pp->prot);
}
EXPORT_SYMBOL(phonet_proto_unregister);

/* Module registration */

/* Bring up the Phonet stack: device support, socket layer, the PF_PHONET
 * socket family, the packet handler, sysctls and finally the ISI
 * protocol.  Returns 0 on success or the first error encountered, after
 * undoing the steps handled by the error labels below.
 */
static int __init phonet_init(void)
{
        int err;

        err = phonet_device_init();
        if (err)
                return err;

        pn_sock_init();
        err = sock_register(&phonet_proto_family);
        if (err) {
                printk(KERN_ALERT
                        "phonet protocol family initialization failed\n");
                goto err_sock;
        }

        dev_add_pack(&phonet_packet_type);
        phonet_sysctl_init();

        err = isi_register();
        if (err)
                goto err;
        return 0;

err:
        /* ISI registration failed: undo sysctl, family and packet handler. */
        phonet_sysctl_exit();
        sock_unregister(PF_PHONET);
        dev_remove_pack(&phonet_packet_type);
err_sock:
        phonet_device_exit();
        return err;
}

/* Tear down everything phonet_init() registered: ISI protocol, sysctls,
 * socket family, packet handler and device support.
 */
static void __exit phonet_exit(void)
{
        isi_unregister();
        phonet_sysctl_exit();
        sock_unregister(PF_PHONET);
        dev_remove_pack(&phonet_packet_type);
        phonet_device_exit();
}

/* Module entry/exit points and metadata.  The NETPROTO alias is declared
 * for the PF_PHONET family.
 */
module_init(phonet_init);
module_exit(phonet_exit);
MODULE_DESCRIPTION("Phonet protocol stack for Linux");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PHONET);







































































































































    1 
    1 












    1 















































































































































































    1 



















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/buffer_head.h
 *
 * Everything to do with buffer_heads.
 */

#ifndef _LINUX_BUFFER_HEAD_H
#define _LINUX_BUFFER_HEAD_H

#include <linux/types.h>
#include <linux/blk_types.h>
#include <linux/fs.h>
#include <linux/linkage.h>
#include <linux/pagemap.h>
#include <linux/wait.h>
#include <linux/atomic.h>

/*
 * Bit numbers within buffer_head.b_state.  Accessor helpers for most of
 * these bits are generated by BUFFER_FNS()/TAS_BUFFER_FNS() later in this
 * file.
 */
enum bh_state_bits {
        BH_Uptodate,        /* Contains valid data */
        BH_Dirty,        /* Is dirty */
        BH_Lock,        /* Is locked */
        BH_Req,                /* Has been submitted for I/O */

        BH_Mapped,        /* Has a disk mapping */
        BH_New,                /* Disk mapping was newly created by get_block */
        BH_Async_Read,        /* Is under end_buffer_async_read I/O */
        BH_Async_Write,        /* Is under end_buffer_async_write I/O */
        BH_Delay,        /* Buffer is not yet allocated on disk */
        BH_Boundary,        /* Block is followed by a discontiguity */
        BH_Write_EIO,        /* I/O error on write */
        BH_Unwritten,        /* Buffer is allocated on disk but not written */
        BH_Quiet,        /* Buffer error printks are to be quiet */
        BH_Meta,        /* Buffer contains metadata */
        BH_Prio,        /* Buffer should be submitted with REQ_PRIO */
        BH_Defer_Completion, /* Defer AIO completion to workqueue */

        BH_PrivateStart,/* not a state bit, but the first bit available
                         * for private allocation by other entities
                         */
};

/*
 * Number of 512-byte units in one page.
 * NOTE(review): presumably 512 bytes is the smallest buffer size - confirm.
 */
#define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)

/* Forward declarations used by the typedef and struct that follow. */
struct page;
struct buffer_head;
struct address_space;
/* Type of the I/O completion callback stored in buffer_head.b_end_io. */
typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);

/*
 * Historically, a buffer_head was used to map a single block
 * within a page, and of course as the unit of I/O through the
 * filesystem and block layers.  Nowadays the basic I/O unit
 * is the bio, and buffer_heads are used for extracting block
 * mappings (via a get_block_t call), for tracking state within
 * a page (via a page_mapping) and for wrapping bio submission
 * for backward compatibility reasons (e.g. submit_bh).
 */
struct buffer_head {
        unsigned long b_state;                /* buffer state bitmap (see above) */
        struct buffer_head *b_this_page;/* circular list of page's buffers */
        /* b_page and b_folio share the same storage */
        union {
                struct page *b_page;        /* the page this bh is mapped to */
                struct folio *b_folio;        /* the folio this bh is mapped to */
        };

        sector_t b_blocknr;                /* start block number */
        size_t b_size;                        /* size of mapping */
        char *b_data;                        /* pointer to data within the page */

        /* block device of the mapping; map_bh() sets it from sb->s_bdev */
        struct block_device *b_bdev;
        bh_end_io_t *b_end_io;                /* I/O completion */
         void *b_private;                /* reserved for b_end_io */
        struct list_head b_assoc_buffers; /* associated with another mapping */
        struct address_space *b_assoc_map;        /* mapping this buffer is
                                                   associated with */
        atomic_t b_count;                /* users using this buffer_head */
        spinlock_t b_uptodate_lock;        /* Used by the first bh in a page, to
                                         * serialise IO completion of other
                                         * buffers in the page */
};

/*
 * Macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
 * and buffer_foo() functions for the state bit BH_<bit>.
 *
 * set_buffer_foo() tests the flag first and only sets it when it is
 * clear: re-setting a flag that is already set causes a costly cache
 * line transition.
 */
#define BUFFER_FNS(bit, name)                                                \
static __always_inline void set_buffer_##name(struct buffer_head *bh)        \
{                                                                        \
        if (!test_bit(BH_##bit, &(bh)->b_state))                        \
                set_bit(BH_##bit, &(bh)->b_state);                        \
}                                                                        \
static __always_inline void clear_buffer_##name(struct buffer_head *bh)        \
{                                                                        \
        clear_bit(BH_##bit, &(bh)->b_state);                                \
}                                                                        \
static __always_inline int buffer_##name(const struct buffer_head *bh)        \
{                                                                        \
        return test_bit(BH_##bit, &(bh)->b_state);                        \
}

/*
 * test_set_buffer_foo() and test_clear_buffer_foo()
 *
 * Each helper returns the result of test_and_set_bit() or
 * test_and_clear_bit() on the BH_<bit> bit of bh->b_state.
 */
#define TAS_BUFFER_FNS(bit, name)                                        \
static __always_inline int test_set_buffer_##name(struct buffer_head *bh) \
{                                                                        \
        return test_and_set_bit(BH_##bit, &(bh)->b_state);                \
}                                                                        \
static __always_inline int test_clear_buffer_##name(struct buffer_head *bh) \
{                                                                        \
        return test_and_clear_bit(BH_##bit, &(bh)->b_state);                \
}                                                                        \

/*
 * Emit the buffer bitops functions.   Note that there are also functions
 * of the form "mark_buffer_foo()".  These are higher-level functions which
 * do something in addition to setting a b_state bit.
 *
 * BH_Uptodate is not instantiated here: its accessors are written out by
 * hand further down in this file.
 */
BUFFER_FNS(Dirty, dirty)
TAS_BUFFER_FNS(Dirty, dirty)
BUFFER_FNS(Lock, locked)
BUFFER_FNS(Req, req)
TAS_BUFFER_FNS(Req, req)
BUFFER_FNS(Mapped, mapped)
BUFFER_FNS(New, new)
BUFFER_FNS(Async_Read, async_read)
BUFFER_FNS(Async_Write, async_write)
BUFFER_FNS(Delay, delay)
BUFFER_FNS(Boundary, boundary)
BUFFER_FNS(Write_EIO, write_io_error)
BUFFER_FNS(Unwritten, unwritten)
BUFFER_FNS(Meta, meta)
BUFFER_FNS(Prio, prio)
BUFFER_FNS(Defer_Completion, defer_completion)

/*
 * Set BH_Uptodate on @bh.  Does nothing if the bit is already set;
 * otherwise a barrier is issued before the bit is set.
 */
static __always_inline void set_buffer_uptodate(struct buffer_head *bh)
{
        /*
         * If somebody else already set this uptodate, they will
         * have done the memory barrier, and a reader will thus
         * see *some* valid buffer state.
         *
         * Any other serialization (with IO errors or whatever that
         * might clear the bit) has to come from other state (eg BH_Lock).
         */
        if (test_bit(BH_Uptodate, &bh->b_state))
                return;

        /*
         * make it consistent with folio_mark_uptodate
         * pairs with test_bit_acquire in buffer_uptodate
         */
        smp_mb__before_atomic();
        set_bit(BH_Uptodate, &bh->b_state);
}

/* Clear BH_Uptodate on @bh; no barrier is issued here. */
static __always_inline void clear_buffer_uptodate(struct buffer_head *bh)
{
        clear_bit(BH_Uptodate, &bh->b_state);
}

/* Test BH_Uptodate on @bh using an acquire-ordered bit test. */
static __always_inline int buffer_uptodate(const struct buffer_head *bh)
{
        /*
         * make it consistent with folio_test_uptodate
         * pairs with smp_mb__before_atomic in set_buffer_uptodate
         */
        return test_bit_acquire(BH_Uptodate, &bh->b_state);
}

/*
 * Offset of @bh's data inside its page: the low bits of b_data, masked
 * with page_size(bh->b_page) - 1.
 */
static inline unsigned long bh_offset(const struct buffer_head *bh)
{
        unsigned long in_page_mask = page_size(bh->b_page) - 1;
        unsigned long data_addr = (unsigned long)bh->b_data;

        return data_addr & in_page_mask;
}

/*
 * If we *know* page->private refers to buffer_heads.
 * page_buffers() BUG()s when PagePrivate is not set on @page, then yields
 * page_private(page) cast to a buffer_head pointer.
 */
#define page_buffers(page)                                        \
        ({                                                        \
                BUG_ON(!PagePrivate(page));                        \
                ((struct buffer_head *)page_private(page));        \
        })
/* Only tests PagePrivate; does not inspect the private pointer itself. */
#define page_has_buffers(page)        PagePrivate(page)
/* Folio counterpart: returns folio_get_private() without a BUG_ON check. */
#define folio_buffers(folio)                folio_get_private(folio)

/*
 * Implemented outside this header.  Results are reported through the
 * @dirty and @writeback out-parameters.
 */
void buffer_check_dirty_writeback(struct folio *folio,
                                     bool *dirty, bool *writeback);

/*
 * Declarations
 */

/* Buffer state marking helpers (prototypes only; defined elsewhere). */
void mark_buffer_dirty(struct buffer_head *bh);
void mark_buffer_write_io_error(struct buffer_head *bh);
void touch_buffer(struct buffer_head *bh);
/* Attaching buffers to, and allocating buffers for, a folio or page. */
void folio_set_bh(struct buffer_head *bh, struct folio *folio,
                  unsigned long offset);
struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
                                        gfp_t gfp);
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
                bool retry);
struct buffer_head *create_empty_buffers(struct folio *folio,
                unsigned long blocksize, unsigned long b_state);
/* Both match the bh_end_io_t signature, so they can be used as b_end_io. */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
void end_buffer_write_sync(struct buffer_head *bh, int uptodate);

/* Things to do with buffers at mapping->private_list */
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
                                  bool datasync);
int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
                          bool datasync);
void clean_bdev_aliases(struct block_device *bdev, sector_t block,
                        sector_t len);
/*
 * Convenience wrapper: calls clean_bdev_aliases() on @bh's device for a
 * range of length 1 starting at @bh's block number.
 */
static inline void clean_bdev_bh_alias(struct buffer_head *bh)
{
        clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1);
}

/*
 * Core buffer_head primitives.  Only prototypes live here; several of the
 * double-underscore functions back the inline wrappers defined later in
 * this header (brelse(), bforget(), lock_buffer(), wait_on_buffer(),
 * bh_read() and friends).
 */
void mark_buffer_async_write(struct buffer_head *bh);
void __wait_on_buffer(struct buffer_head *);
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
                        unsigned size);
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
                unsigned size, gfp_t gfp);
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size);
struct buffer_head *__bread_gfp(struct block_device *,
                                sector_t block, unsigned size, gfp_t gfp);
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
void free_buffer_head(struct buffer_head * bh);
void unlock_buffer(struct buffer_head *bh);
void __lock_buffer(struct buffer_head *bh);
int sync_dirty_buffer(struct buffer_head *bh);
int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
void submit_bh(blk_opf_t, struct buffer_head *);
void write_boundary_block(struct block_device *bdev,
                        sector_t bblock, unsigned blocksize);
int bh_uptodate_or_lock(struct buffer_head *bh);
int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
void __bh_read_batch(int nr, struct buffer_head *bhs[],
                     blk_opf_t op_flags, bool force_lock);

/*
 * Generic address_space_operations implementations for buffer_head-backed
 * address_spaces.
 */
void block_invalidate_folio(struct folio *folio, size_t offset, size_t length);
/*
 * NOTE(review): get_block is taken as void * here, unlike the get_block_t *
 * used by the neighbouring prototypes - confirm this is intentional.
 */
int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
                void *get_block);
int __block_write_full_folio(struct inode *inode, struct folio *folio,
                get_block_t *get_block, struct writeback_control *wbc);
int block_read_full_folio(struct folio *, get_block_t *);
bool block_is_partially_uptodate(struct folio *, size_t from, size_t count);
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
                struct page **pagep, get_block_t *get_block);
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                get_block_t *get_block);
int block_write_end(struct file *, struct address_space *,
                                loff_t, unsigned, unsigned,
                                struct page *, void *);
int generic_write_end(struct file *, struct address_space *,
                                loff_t, unsigned, unsigned,
                                struct page *, void *);
void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to);
int cont_write_begin(struct file *, struct address_space *, loff_t,
                        unsigned, struct page **, void **,
                        get_block_t *, loff_t *);
int generic_cont_expand_simple(struct inode *inode, loff_t size);
void block_commit_write(struct page *page, unsigned int from, unsigned int to);
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
                                get_block_t get_block);
sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
int block_truncate_page(struct address_space *, loff_t, get_block_t *);

/*
 * Folio migration callbacks.  With CONFIG_MIGRATION disabled both names
 * are defined as NULL instead of being declared as functions.
 */
#ifdef CONFIG_MIGRATION
extern int buffer_migrate_folio(struct address_space *,
                struct folio *dst, struct folio *src, enum migrate_mode);
extern int buffer_migrate_folio_norefs(struct address_space *,
                struct folio *dst, struct folio *src, enum migrate_mode);
#else
#define buffer_migrate_folio NULL
#define buffer_migrate_folio_norefs NULL
#endif

/*
 * inline definitions
 */

/* Take a reference on @bh by atomically incrementing b_count. */
static inline void get_bh(struct buffer_head *bh)
{
        atomic_inc(&bh->b_count);
}

/*
 * Drop a reference on @bh by atomically decrementing b_count.  A barrier
 * is issued before the decrement.
 */
static inline void put_bh(struct buffer_head *bh)
{
        smp_mb__before_atomic();
        atomic_dec(&bh->b_count);
}

/**
 * brelse - Release a buffer.
 * @bh: The buffer to release; may be NULL.
 *
 * Drops one reference on @bh by handing it to __brelse().  A NULL @bh
 * is accepted and makes this function a no-op.
 *
 * Once every buffer on a folio has a zero reference count, is clean
 * and unlocked, and the folio itself is unlocked and not under
 * writeback, try_to_free_buffers() may strip the buffers from the folio
 * in preparation for freeing it (sometimes, rarely, buffers are removed
 * from a folio but it ends up not being freed, and buffers may later
 * be reattached).
 *
 * Context: Any context.
 */
static inline void brelse(struct buffer_head *bh)
{
        if (!bh)
                return;

        __brelse(bh);
}

/**
 * bforget - Discard any dirty data in a buffer.
 * @bh: The buffer to forget; may be NULL.
 *
 * Use this in place of brelse() when the data written to a buffer no
 * longer needs to reach disk.  The buffer's dirty flag is cleared so
 * that writeback of this buffer will be skipped.  A NULL @bh makes
 * this function a no-op.
 *
 * Context: Any context.
 */
static inline void bforget(struct buffer_head *bh)
{
        if (!bh)
                return;

        __bforget(bh);
}

/*
 * Read block @block from @sb's block device using the superblock's block
 * size; __GFP_MOVABLE is passed as the gfp argument to __bread_gfp().
 */
static inline struct buffer_head *
sb_bread(struct super_block *sb, sector_t block)
{
        struct block_device *bdev = sb->s_bdev;

        return __bread_gfp(bdev, block, sb->s_blocksize, __GFP_MOVABLE);
}

/*
 * Same as sb_bread() except that no extra gfp flags (0) are passed to
 * __bread_gfp().
 */
static inline struct buffer_head *
sb_bread_unmovable(struct super_block *sb, sector_t block)
{
        struct block_device *bdev = sb->s_bdev;

        return __bread_gfp(bdev, block, sb->s_blocksize, 0);
}

/*
 * Call __breadahead() for block @block of @sb's block device, using the
 * superblock's block size.
 */
static inline void
sb_breadahead(struct super_block *sb, sector_t block)
{
        struct block_device *bdev = sb->s_bdev;

        __breadahead(bdev, block, sb->s_blocksize);
}

/*
 * Get the buffer for (@bdev, @block, @size) via bdev_getblk().  The gfp
 * mask is the bdev mapping's mask constrained by ~__GFP_FS, with
 * __GFP_NOFAIL added (and without __GFP_MOVABLE, unlike __getblk()).
 */
static inline struct buffer_head *getblk_unmovable(struct block_device *bdev,
                sector_t block, unsigned size)
{
        gfp_t gfp = mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS) |
                    __GFP_NOFAIL;

        return bdev_getblk(bdev, block, size, gfp);
}

/*
 * Get the buffer for (@bdev, @block, @size) via bdev_getblk().  The gfp
 * mask is the bdev mapping's mask constrained by ~__GFP_FS, with
 * __GFP_MOVABLE and __GFP_NOFAIL added.
 */
static inline struct buffer_head *__getblk(struct block_device *bdev,
                sector_t block, unsigned size)
{
        gfp_t gfp = mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS) |
                    __GFP_MOVABLE | __GFP_NOFAIL;

        return bdev_getblk(bdev, block, size, gfp);
}

/* __getblk() on @sb's block device with the superblock's block size. */
static inline struct buffer_head *sb_getblk(struct super_block *sb,
                sector_t block)
{
        struct block_device *bdev = sb->s_bdev;

        return __getblk(bdev, block, sb->s_blocksize);
}

/*
 * bdev_getblk() on @sb's block device with the superblock's block size;
 * the caller-supplied @gfp is passed through unmodified.
 */
static inline struct buffer_head *sb_getblk_gfp(struct super_block *sb,
                sector_t block, gfp_t gfp)
{
        struct block_device *bdev = sb->s_bdev;

        return bdev_getblk(bdev, block, sb->s_blocksize, gfp);
}

/*
 * __find_get_block() on @sb's block device with the superblock's block
 * size.
 */
static inline struct buffer_head *
sb_find_get_block(struct super_block *sb, sector_t block)
{
        struct block_device *bdev = sb->s_bdev;

        return __find_get_block(bdev, block, sb->s_blocksize);
}

/*
 * Record a disk mapping in @bh: set BH_Mapped, then fill in the device,
 * block number and size from @sb and @block.
 */
static inline void
map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
{
        set_buffer_mapped(bh);
        bh->b_bdev = sb->s_bdev;
        bh->b_blocknr = block;
        bh->b_size = sb->s_blocksize;        /* mapping covers one fs block */
}

/*
 * Wait on @bh if BH_Lock is currently set.  might_sleep() is called
 * unconditionally; __wait_on_buffer() is only called when the buffer
 * is seen locked.
 */
static inline void wait_on_buffer(struct buffer_head *bh)
{
        might_sleep();

        if (!buffer_locked(bh))
                return;

        __wait_on_buffer(bh);
}

/*
 * Try to take the buffer lock without waiting.  Returns non-zero when
 * test_and_set_bit_lock() found BH_Lock clear (the bit is now set), and
 * 0 when the bit was already set.  Success is hinted as the likely case.
 */
static inline int trylock_buffer(struct buffer_head *bh)
{
        return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state));
}

/*
 * Lock @bh.  trylock_buffer() is attempted first; __lock_buffer() is
 * called only when that attempt fails.  might_sleep() is called
 * unconditionally.
 */
static inline void lock_buffer(struct buffer_head *bh)
{
        might_sleep();

        if (trylock_buffer(bh))
                return;

        __lock_buffer(bh);
}

/*
 * Opportunistic read of @bh.  Nothing happens when the buffer is already
 * uptodate or its lock cannot be taken without waiting.  After taking
 * the lock the uptodate state is re-checked: if still not uptodate,
 * __bh_read() is called with wait == false; otherwise the buffer is
 * simply unlocked again.
 */
static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags)
{
        if (buffer_uptodate(bh) || !trylock_buffer(bh))
                return;

        /* Became uptodate before we got the lock: just drop the lock. */
        if (buffer_uptodate(bh)) {
                unlock_buffer(bh);
                return;
        }

        __bh_read(bh, op_flags, false);
}

/*
 * Call __bh_read() with wait == false unless bh_uptodate_or_lock()
 * returns non-zero for @bh.
 */
static inline void bh_read_nowait(struct buffer_head *bh, blk_opf_t op_flags)
{
        if (bh_uptodate_or_lock(bh))
                return;

        __bh_read(bh, op_flags, false);
}

/*
 * Returns 1 if bh_uptodate_or_lock() reports the buffer as uptodate;
 * otherwise returns the result of __bh_read() called with wait == true
 * (0 on success, and -EIO on error).
 */
static inline int bh_read(struct buffer_head *bh, blk_opf_t op_flags)
{
        if (!bh_uptodate_or_lock(bh))
                return __bh_read(bh, op_flags, true);

        return 1;
}

/*
 * Call __bh_read_batch() on the @nr buffers in @bhs with op_flags == 0
 * and force_lock == true.
 */
static inline void bh_read_batch(int nr, struct buffer_head *bhs[])
{
        __bh_read_batch(nr, bhs, 0, true);
}

/*
 * Call __bh_read_batch() on the @nr buffers in @bhs with the caller's
 * @op_flags and force_lock == false.
 */
static inline void bh_readahead_batch(int nr, struct buffer_head *bhs[],
                                      blk_opf_t op_flags)
{
        __bh_read_batch(nr, bhs, op_flags, false);
}

/**
 * __bread() - Read a block.
 * @bdev: The block device to read from.
 * @block: Block number in units of block size.
 * @size: The block size of this device in bytes.
 *
 * Read a specified block, and return the buffer head that refers
 * to it.  The memory is allocated from the movable area so that it can
 * be migrated.  The returned buffer head has its refcount increased.
 * The caller should call brelse() when it has finished with the buffer.
 *
 * This is a wrapper around __bread_gfp() that passes __GFP_MOVABLE.
 *
 * Context: May sleep waiting for I/O.
 * Return: NULL if the block was unreadable.
 */
static inline struct buffer_head *__bread(struct block_device *bdev,
                sector_t block, unsigned size)
{
        return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
}

/**
 * get_nth_bh - Get a reference on the n'th buffer after this one.
 * @bh: The buffer to start counting from.
 * @count: How many buffers to skip.
 *
 * Follows the b_this_page link @count times starting at @bh and takes
 * a reference on the buffer reached.  The typical use is locating the
 * nth buffer in a folio: pass the head buffer and the byte offset in
 * the folio divided by the block size.  Other uses are possible, but
 * the walk wraps at the end of the folio rather than returning NULL or
 * moving on to the next folio.
 *
 * Return: The requested buffer with an elevated refcount.
 */
static inline __must_check
struct buffer_head *get_nth_bh(struct buffer_head *bh, unsigned int count)
{
        unsigned int skipped;

        for (skipped = 0; skipped < count; skipped++)
                bh = bh->b_this_page;

        get_bh(bh);
        return bh;
}

/* Declared regardless of CONFIG_BUFFER_HEAD. */
bool block_dirty_folio(struct address_space *mapping, struct folio *folio);

#ifdef CONFIG_BUFFER_HEAD

/* Real implementations live outside this header. */
void buffer_init(void);
bool try_to_free_buffers(struct folio *folio);
int inode_has_buffers(struct inode *inode);
void invalidate_inode_buffers(struct inode *inode);
int remove_inode_buffers(struct inode *inode);
int sync_mapping_buffers(struct address_space *mapping);
void invalidate_bh_lrus(void);
void invalidate_bh_lrus_cpu(void);
bool has_bh_in_lru(int cpu, void *dummy);
extern int buffer_heads_over_limit;

#else /* CONFIG_BUFFER_HEAD */

/*
 * Stubs for builds without CONFIG_BUFFER_HEAD: each one does nothing and
 * returns a fixed value, and buffer_heads_over_limit is the constant 0.
 */
static inline void buffer_init(void) {}
static inline bool try_to_free_buffers(struct folio *folio) { return true; }
static inline int inode_has_buffers(struct inode *inode) { return 0; }
static inline void invalidate_inode_buffers(struct inode *inode) {}
static inline int remove_inode_buffers(struct inode *inode) { return 1; }
static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
static inline void invalidate_bh_lrus(void) {}
static inline void invalidate_bh_lrus_cpu(void) {}
static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; }
#define buffer_heads_over_limit 0

#endif /* CONFIG_BUFFER_HEAD */
#endif /* _LINUX_BUFFER_HEAD_H */















    1 





    1 









































































    1 




    1 
































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
// SPDX-License-Identifier: GPL-2.0-or-later
/* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
 */

#include "ipvlan.h"

/* Per-net id; registered through ipvlan_net_ops.id, used with net_generic(). */
static unsigned int ipvlan_netid __read_mostly;

/* Per-network-namespace state for the L3S netfilter hooks. */
struct ipvlan_netns {
        /* 0 means the hooks in ipvl_nfops are not registered for this netns */
        unsigned int ipvl_nf_hook_refcnt;
};

/*
 * Look up the ipvlan address entry matching @skb as received on @dev.
 *
 * Returns NULL when @dev is NULL or not an ipvlan port, when the port is
 * missing or not in L3S mode, or when no L3 header can be extracted from
 * @skb.  Otherwise returns the result of ipvlan_addr_lookup().
 */
static struct ipvl_addr *ipvlan_skb_to_addr(struct sk_buff *skb,
                                            struct net_device *dev)
{
        struct ipvl_port *port;
        void *l3_hdr;
        int l3_type;

        if (!dev || !netif_is_ipvlan_port(dev))
                return NULL;

        port = ipvlan_port_get_rcu(dev);
        if (!port || port->mode != IPVLAN_MODE_L3S)
                return NULL;

        l3_hdr = ipvlan_get_L3_hdr(port, skb, &l3_type);
        if (!l3_hdr)
                return NULL;

        return ipvlan_addr_lookup(port, l3_hdr, l3_type, true);
}

/*
 * l3mdev receive hook (installed via ipvl_l3mdev_ops).
 *
 * When @skb matches an ipvlan address on @dev, a route lookup is done
 * with the owning slave device as input device: IPv4 goes through
 * ip_route_input_noref(), IPv6 replaces the skb's dst with the result of
 * ip6_route_input_lookup().  @skb is returned in every case, including
 * when no address matches or the IPv4 lookup reports an error.
 */
static struct sk_buff *ipvlan_l3_rcv(struct net_device *dev,
                                     struct sk_buff *skb, u16 proto)
{
        struct ipvl_addr *addr = ipvlan_skb_to_addr(skb, dev);
        struct net_device *sdev;

        if (!addr)
                return skb;

        sdev = addr->master->dev;
        switch (proto) {
        case AF_INET:
        {
                struct iphdr *ip4h = ip_hdr(skb);
                int err = ip_route_input_noref(skb, ip4h->daddr, ip4h->saddr,
                                               ip4h->tos, sdev);

                if (unlikely(err))
                        return skb;
                break;
        }
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
        {
                struct ipv6hdr *ip6h = ipv6_hdr(skb);
                struct flowi6 fl6 = {
                        .flowi6_iif   = sdev->ifindex,
                        .daddr        = ip6h->daddr,
                        .saddr        = ip6h->saddr,
                        .flowlabel    = ip6_flowinfo(ip6h),
                        .flowi6_mark  = skb->mark,
                        .flowi6_proto = ip6h->nexthdr,
                };
                struct dst_entry *dst;

                /* Drop whatever dst the skb carried and attach the new one. */
                skb_dst_drop(skb);
                dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6,
                                             skb, RT6_LOOKUP_F_HAS_SADDR);
                skb_dst_set(skb, dst);
                break;
        }
#endif
        default:
                break;
        }

        return skb;
}

/* l3mdev ops installed on the port device by ipvlan_l3s_register(). */
static const struct l3mdev_ops ipvl_l3mdev_ops = {
        .l3mdev_l3_rcv = ipvlan_l3_rcv,
};

static unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb,
                                    const struct nf_hook_state *state)
{
        struct ipvl_addr *addr;
        unsigned int len;

        addr = ipvlan_skb_to_addr(skb, skb->dev);
        if (!addr)
                goto out;

        skb->dev = addr->master->dev;
        skb->skb_iif = skb->dev->ifindex;
#if IS_ENABLED(CONFIG_IPV6)
        if (addr->atype == IPVL_IPV6)
                IP6CB(skb)->iif = skb->dev->ifindex;
#endif
        len = skb->len + ETH_HLEN;
        ipvlan_count_rx(addr->master, len, true, false);
out:
        return NF_ACCEPT;
}

/*
 * Netfilter hooks for L3S mode: ipvlan_nf_input() at NF_INET_LOCAL_IN for
 * IPv4 and, when IPv6 is enabled, for IPv6.
 * NOTE(review): priority INT_MAX presumably makes the hook run after all
 * other LOCAL_IN hooks - confirm against the netfilter priority ordering.
 */
static const struct nf_hook_ops ipvl_nfops[] = {
        {
                .hook     = ipvlan_nf_input,
                .pf       = NFPROTO_IPV4,
                .hooknum  = NF_INET_LOCAL_IN,
                .priority = INT_MAX,
        },
#if IS_ENABLED(CONFIG_IPV6)
        {
                .hook     = ipvlan_nf_input,
                .pf       = NFPROTO_IPV6,
                .hooknum  = NF_INET_LOCAL_IN,
                .priority = INT_MAX,
        },
#endif
};

/*
 * Take a reference on the L3S netfilter hooks of @net.
 *
 * The first user registers ipvl_nfops and, on success, sets the per-netns
 * refcount to 1; later users only bump the refcount.
 *
 * Returns 0 on success or the error from nf_register_net_hooks().
 */
static int ipvlan_register_nf_hook(struct net *net)
{
        struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid);
        int err;

        if (vnet->ipvl_nf_hook_refcnt) {
                vnet->ipvl_nf_hook_refcnt++;
                return 0;
        }

        err = nf_register_net_hooks(net, ipvl_nfops, ARRAY_SIZE(ipvl_nfops));
        if (!err)
                vnet->ipvl_nf_hook_refcnt = 1;

        return err;
}

/*
 * Drop a reference on the L3S netfilter hooks of @net and unregister
 * ipvl_nfops when the last reference goes away.  Warns and returns if the
 * refcount is already zero.
 */
static void ipvlan_unregister_nf_hook(struct net *net)
{
        struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid);

        if (WARN_ON(!vnet->ipvl_nf_hook_refcnt))
                return;

        if (--vnet->ipvl_nf_hook_refcnt)
                return;

        nf_unregister_net_hooks(net, ipvl_nfops, ARRAY_SIZE(ipvl_nfops));
}

/*
 * Move one L3S hook reference from @oldnet to @newnet.  Does nothing when
 * @oldnet holds no hook reference.  Must be called with RTNL held.
 */
void ipvlan_migrate_l3s_hook(struct net *oldnet, struct net *newnet)
{
        struct ipvlan_netns *old_vnet;

        ASSERT_RTNL();

        old_vnet = net_generic(oldnet, ipvlan_netid);
        if (!old_vnet->ipvl_nf_hook_refcnt)
                return;

        /*
         * NOTE(review): the return value of ipvlan_register_nf_hook() is
         * ignored, so a registration failure in @newnet is not reported.
         */
        ipvlan_register_nf_hook(newnet);
        ipvlan_unregister_nf_hook(oldnet);
}

/*
 * Pernet exit handler.  A non-zero hook refcount at this point triggers a
 * one-time warning; the refcount is then reset to zero and ipvl_nfops is
 * unregistered from @net.
 */
static void ipvlan_ns_exit(struct net *net)
{
        struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid);

        if (!WARN_ON_ONCE(vnet->ipvl_nf_hook_refcnt))
                return;

        vnet->ipvl_nf_hook_refcnt = 0;
        nf_unregister_net_hooks(net, ipvl_nfops, ARRAY_SIZE(ipvl_nfops));
}

/*
 * Pernet registration: one struct ipvlan_netns per namespace, identified
 * by ipvlan_netid, with ipvlan_ns_exit() as the exit callback.
 */
static struct pernet_operations ipvlan_net_ops = {
        .id   = &ipvlan_netid,
        .size = sizeof(struct ipvlan_netns),
        .exit = ipvlan_ns_exit,
};

/*
 * Register the L3S pernet operations.  Returns the result of
 * register_pernet_subsys().
 */
int ipvlan_l3s_init(void)
{
        return register_pernet_subsys(&ipvlan_net_ops);
}

/* Unregister the pernet operations registered by ipvlan_l3s_init(). */
void ipvlan_l3s_cleanup(void)
{
        unregister_pernet_subsys(&ipvlan_net_ops);
}

/*
 * Enable L3S receive handling on @port.  Takes a hook reference in the
 * port's network namespace and, on success, installs ipvl_l3mdev_ops on
 * the port device and sets IFF_L3MDEV_RX_HANDLER.  Must be called with
 * RTNL held.
 *
 * Returns 0 on success or the error from ipvlan_register_nf_hook(), in
 * which case the device is left untouched.
 */
int ipvlan_l3s_register(struct ipvl_port *port)
{
        struct net_device *dev = port->dev;
        int ret;

        ASSERT_RTNL();

        ret = ipvlan_register_nf_hook(read_pnet(&port->pnet));
        if (ret)
                return ret;

        dev->l3mdev_ops = &ipvl_l3mdev_ops;
        dev->priv_flags |= IFF_L3MDEV_RX_HANDLER;

        return 0;
}

/*
 * Undo ipvlan_l3s_register() for @port.  Must be called with RTNL held.
 * The statement order is kept as written: the rx-handler flag is cleared
 * first, then the hook reference is dropped, then l3mdev_ops is reset.
 */
void ipvlan_l3s_unregister(struct ipvl_port *port)
{
        struct net_device *dev = port->dev;

        ASSERT_RTNL();

        dev->priv_flags &= ~IFF_L3MDEV_RX_HANDLER;
        ipvlan_unregister_nf_hook(read_pnet(&port->pnet));
        dev->l3mdev_ops = NULL;
}























































































































































    1 









































































































































































































































































































































    1 


















    1 

























    1 



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



    1 



    1 








































    1 















    1 





























    1 









    1 


    1 










    1 





    1 
    1 





    1 






    1 




    1 














    1 









    1 



    1 


    1 






    1 


















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */

#include <linux/anon_inodes.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/user.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/kmsan.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/tty.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
#include <linux/stackleak.h>
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
#include <linux/stackprotector.h>
#include <linux/user_events.h>
#include <linux/iommu.h>
#include <linux/rseq.h>
#include <uapi/linux/pidfd.h>
#include <linux/pidfs.h>

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include <trace/events/sched.h>

#define CREATE_TRACE_POINTS
#include <trace/events/task.h>

/*
 * Minimum number of threads to boot the kernel
 */
#define MIN_THREADS 20

/*
 * Maximum number of threads.
 * NOTE(review): defined as FUTEX_TID_MASK, presumably so that every thread
 * ID fits in the TID bits of a futex word - confirm against the futex code.
 */
#define MAX_THREADS FUTEX_TID_MASK

/*
 * Counters protected by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;        /* Handle normal Linux uptimes. */
int nr_threads;                        /* The idle threads do not count.. */

static int max_threads;                /* tunable limit on nr_threads */

/*
 * Designated-initializer helper: stores the stringified name of the
 * constant @x at array index @x.
 */
#define NAMED_ARRAY_INDEX(x)        [x] = __stringify(x)

/* Printable names of the MM_* resident page counters, indexed by counter. */
static const char * const resident_page_types[] = {
        NAMED_ARRAY_INDEX(MM_FILEPAGES),
        NAMED_ARRAY_INDEX(MM_ANONPAGES),
        NAMED_ARRAY_INDEX(MM_SWAPENTS),
        NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
};

/* Per-CPU process counter; nr_processes() sums it over all possible CPUs. */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;

/* Global reader/writer lock, placed on its own cache line. */
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */

#ifdef CONFIG_PROVE_RCU
/*
 * Return lockdep's answer to "is tasklist_lock currently held?".
 * Only built when CONFIG_PROVE_RCU is enabled.
 */
int lockdep_tasklist_lock_is_held(void)
{
        int held = lockdep_is_held(&tasklist_lock);

        return held;
}
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif /* #ifdef CONFIG_PROVE_RCU */

int nr_processes(void)
{
        int cpu;
        int total = 0;

        for_each_possible_cpu(cpu)
                total += per_cpu(process_counts, cpu);

        return total;
}

/*
 * Default no-op hook invoked by free_task().  It is declared __weak so a
 * stronger definition elsewhere replaces this one at link time.
 */
void __weak arch_release_task_struct(struct task_struct *tsk)
{
}

/* Slab cache from which task_struct objects are allocated and freed. */
static struct kmem_cache *task_struct_cachep;

/*
 * Allocate a task_struct from task_struct_cachep with GFP_KERNEL, passing
 * @node to the node-aware slab allocator.  Returns the allocator's result
 * unchanged (NULL on failure).
 */
static inline struct task_struct *alloc_task_struct_node(int node)
{
        struct task_struct *tsk;

        tsk = kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
        return tsk;
}

/* Return @tsk to task_struct_cachep; counterpart of alloc_task_struct_node(). */
static inline void free_task_struct(struct task_struct *tsk)
{
        kmem_cache_free(task_struct_cachep, tsk);
}

/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmemcache based allocator.
 */
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)

#  ifdef CONFIG_VMAP_STACK
/*
 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
 * flush.  Try to minimize the number of calls by caching stacks.
 */
#define NR_CACHED_STACKS 2
/* Per-CPU slots holding free stacks; a NULL slot is empty. */
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);

/*
 * Bookkeeping for RCU-deferred freeing of a vmalloc'ed stack.
 * thread_stack_delayed_free() overlays this structure on the stack memory
 * itself (it is assigned from tsk->stack), so the dying stack carries its
 * own rcu_head and a copy of the task's stack_vm_area pointer.
 */
struct vm_stack {
        struct rcu_head rcu;
        struct vm_struct *stack_vm_area;
};

/*
 * Try to park the stack area @vm in one of this CPU's cached_stacks slots.
 *
 * Each slot is claimed with a cmpxchg from NULL, so only an empty slot can
 * take the stack.  Returns true if a slot accepted @vm, false if all
 * NR_CACHED_STACKS slots were occupied.
 */
static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
{
        unsigned int slot;

        for (slot = 0; slot < NR_CACHED_STACKS; slot++) {
                if (this_cpu_cmpxchg(cached_stacks[slot], NULL, vm) == NULL)
                        return true;
        }

        return false;
}

/*
 * RCU callback queued by thread_stack_delayed_free().
 *
 * @rh is embedded in a struct vm_stack that lives in the stack memory being
 * released.  The stack is offered to the per-CPU cache first; only if no
 * slot is free is the memory handed to vfree().
 */
static void thread_stack_free_rcu(struct rcu_head *rh)
{
        struct vm_stack *dead = container_of(rh, struct vm_stack, rcu);

        if (!try_release_thread_stack_to_cache(dead->stack_vm_area))
                vfree(dead);
}

/*
 * Schedule @tsk's vmalloc'ed stack for release via call_rcu().
 *
 * The stack memory is reused as a struct vm_stack: the task's stack_vm_area
 * is copied into it before queueing, so the callback does not need @tsk.
 */
static void thread_stack_delayed_free(struct task_struct *tsk)
{
        struct vm_stack *rec = tsk->stack;

        rec->stack_vm_area = tsk->stack_vm_area;
        call_rcu(&rec->rcu, thread_stack_free_rcu);
}

static int free_vm_stack_cache(unsigned int cpu)
{
        struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
        int i;

        for (i = 0; i < NR_CACHED_STACKS; i++) {
                struct vm_struct *vm_stack = cached_vm_stacks[i];

                if (!vm_stack)
                        continue;

                vfree(vm_stack->addr);
                cached_vm_stacks[i] = NULL;
        }

        return 0;
}

/*
 * Charge each page backing the stack area @vm via memcg_kmem_charge_page().
 *
 * The area must consist of exactly THREAD_SIZE / PAGE_SIZE pages (enforced
 * with BUG_ON).  If charging a page fails, the pages charged so far are
 * uncharged again, so the operation is all-or-nothing.
 *
 * Returns 0 on success, otherwise the error from memcg_kmem_charge_page().
 */
static int memcg_charge_kernel_stack(struct vm_struct *vm)
{
        int idx;
        int undo;
        int ret;

        BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);

        for (idx = 0; idx < THREAD_SIZE / PAGE_SIZE; idx++) {
                ret = memcg_kmem_charge_page(vm->pages[idx], GFP_KERNEL, 0);
                if (!ret)
                        continue;

                /* Roll back pages [0, idx) in the order they were charged. */
                for (undo = 0; undo < idx; undo++)
                        memcg_kmem_uncharge_page(vm->pages[undo], 0);
                return ret;
        }

        return 0;
}

/*
 * Give @tsk a vmalloc-backed kernel stack.
 *
 * A stack is taken from this CPU's cached_stacks slots if one is available;
 * it is KASAN-unpoisoned, zeroed and charged before use.  Otherwise a new
 * THREAD_SIZE area is allocated with @node as the node argument and charged.
 *
 * Returns 0 with tsk->stack and tsk->stack_vm_area set, or -ENOMEM.  When
 * the charge fails the stack is vfree()d (not put back in the cache) and
 * -ENOMEM is returned regardless of the charge's own error code.
 */
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
        struct vm_struct *vm;
        void *stack;
        int i;

        for (i = 0; i < NR_CACHED_STACKS; i++) {
                struct vm_struct *s;

                /* Atomically take whatever is in the slot, leaving it empty. */
                s = this_cpu_xchg(cached_stacks[i], NULL);

                if (!s)
                        continue;

                /* Reset stack metadata. */
                kasan_unpoison_range(s->addr, THREAD_SIZE);

                stack = kasan_reset_tag(s->addr);

                /* Clear stale pointers from reused stack. */
                memset(stack, 0, THREAD_SIZE);

                if (memcg_charge_kernel_stack(s)) {
                        vfree(s->addr);
                        return -ENOMEM;
                }

                tsk->stack_vm_area = s;
                tsk->stack = stack;
                return 0;
        }

        /*
         * Allocated stacks are cached and later reused by new threads,
         * so memcg accounting is performed manually on assigning/releasing
         * stacks to tasks. Drop __GFP_ACCOUNT.
         */
        stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
                                     VMALLOC_START, VMALLOC_END,
                                     THREADINFO_GFP & ~__GFP_ACCOUNT,
                                     PAGE_KERNEL,
                                     0, node, __builtin_return_address(0));
        if (!stack)
                return -ENOMEM;

        vm = find_vm_area(stack);
        if (memcg_charge_kernel_stack(vm)) {
                vfree(stack);
                return -ENOMEM;
        }
        /*
         * We can't call find_vm_area() in interrupt context, and
         * free_thread_stack() can be called in interrupt context,
         * so cache the vm_struct.
         */
        tsk->stack_vm_area = vm;
        stack = kasan_reset_tag(stack);
        tsk->stack = stack;
        return 0;
}

/*
 * Detach and release @tsk's vmalloc'ed stack.
 *
 * The stack goes straight into the per-CPU cache when a slot is free;
 * otherwise it is freed after an RCU grace period.  Either way the task's
 * stack pointers are cleared afterwards.
 */
static void free_thread_stack(struct task_struct *tsk)
{
        struct vm_struct *area = tsk->stack_vm_area;

        if (!try_release_thread_stack_to_cache(area))
                thread_stack_delayed_free(tsk);

        tsk->stack = NULL;
        tsk->stack_vm_area = NULL;
}

#  else /* !CONFIG_VMAP_STACK */

/*
 * RCU callback for page-allocated stacks.  @rh is the stack address itself
 * (thread_stack_delayed_free() passes tsk->stack), so its page is looked up
 * and the THREAD_SIZE_ORDER allocation is freed.
 */
static void thread_stack_free_rcu(struct rcu_head *rh)
{
        __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
}

/*
 * Queue @tsk's page-allocated stack for freeing through call_rcu().  The
 * stack memory itself is used as the rcu_head.
 */
static void thread_stack_delayed_free(struct task_struct *tsk)
{
        struct rcu_head *head;

        head = tsk->stack;
        call_rcu(head, thread_stack_free_rcu);
}

/*
 * Allocate a THREAD_SIZE_ORDER block of pages for @tsk's stack, passing
 * @node to the page allocator.  On success tsk->stack is set to the block's
 * address with the KASAN tag reset.
 *
 * Returns 0 on success or -ENOMEM if no pages could be allocated.
 */
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
        struct page *page;
        int err = -ENOMEM;

        page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
        if (likely(page)) {
                tsk->stack = kasan_reset_tag(page_address(page));
                err = 0;
        }

        return err;
}

/*
 * Release @tsk's page-allocated stack after an RCU grace period and clear
 * the task's stack pointer.
 */
static void free_thread_stack(struct task_struct *tsk)
{
        /* Must run before the pointer is cleared: it reads tsk->stack. */
        thread_stack_delayed_free(tsk);
        tsk->stack = NULL;
}

#  endif /* CONFIG_VMAP_STACK */
# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */

/* Slab cache for thread stacks; created by thread_stack_cache_init(). */
static struct kmem_cache *thread_stack_cache;

/*
 * RCU callback for slab-allocated stacks.  @rh is the stack object itself
 * (thread_stack_delayed_free() passes tsk->stack), so it is returned to
 * thread_stack_cache directly.
 */
static void thread_stack_free_rcu(struct rcu_head *rh)
{
        kmem_cache_free(thread_stack_cache, rh);
}

/*
 * Queue @tsk's slab-allocated stack for freeing through call_rcu().  The
 * stack memory itself is used as the rcu_head.
 */
static void thread_stack_delayed_free(struct task_struct *tsk)
{
        struct rcu_head *head;

        head = tsk->stack;
        call_rcu(head, thread_stack_free_rcu);
}

/*
 * Allocate @tsk's stack from thread_stack_cache, passing @node to the
 * node-aware slab allocator.
 *
 * tsk->stack is assigned the tag-reset result even when the allocation
 * failed.  Returns 0 on success, -ENOMEM when the resulting pointer is NULL.
 */
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
        unsigned long *mem;

        mem = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
        mem = kasan_reset_tag(mem);
        tsk->stack = mem;

        if (!mem)
                return -ENOMEM;
        return 0;
}

/*
 * Release @tsk's slab-allocated stack after an RCU grace period and clear
 * the task's stack pointer.
 */
static void free_thread_stack(struct task_struct *tsk)
{
        /* Must run before the pointer is cleared: it reads tsk->stack. */
        thread_stack_delayed_free(tsk);
        tsk->stack = NULL;
}

/*
 * Create the "thread_stack" slab cache used when stacks are smaller than a
 * page.  THREAD_SIZE is passed as the object size, the alignment and the
 * usercopy region size (offset 0); no flags or constructor are used.
 * Failure to create the cache is fatal (BUG_ON).
 */
void thread_stack_cache_init(void)
{
        thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
                                                        THREAD_SIZE,
                                                        THREAD_SIZE,
                                                        0, 0,
                                                        THREAD_SIZE, NULL);
        BUG_ON(!thread_stack_cache);
}

# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */

/*
 * Slab caches for the objects a task points to.  The pointers declared
 * without 'static' have external linkage; the static ones are private to
 * this file.
 */

/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
struct kmem_cache *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
struct kmem_cache *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;

/* SLAB cache for vm_area_struct structures */
static struct kmem_cache *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;

#ifdef CONFIG_PER_VMA_LOCK

/* SLAB cache for vm_area_struct.lock */
static struct kmem_cache *vma_lock_cachep;

/*
 * Allocate and initialise the per-VMA lock of @vma.
 *
 * vma->vm_lock receives the allocation result (NULL on failure).  On
 * success the rwsem is initialised and vm_lock_seq is set to -1.
 *
 * Returns true on success, false if the allocation failed.
 */
static bool vma_lock_alloc(struct vm_area_struct *vma)
{
        vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
        if (vma->vm_lock) {
                init_rwsem(&vma->vm_lock->lock);
                vma->vm_lock_seq = -1;
                return true;
        }

        return false;
}

/* Return @vma's lock object to vma_lock_cachep; counterpart of vma_lock_alloc(). */
static inline void vma_lock_free(struct vm_area_struct *vma)
{
        kmem_cache_free(vma_lock_cachep, vma->vm_lock);
}

#else /* CONFIG_PER_VMA_LOCK */

/*
 * Without CONFIG_PER_VMA_LOCK there is no lock object: allocation trivially
 * succeeds and freeing does nothing.
 */
static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
static inline void vma_lock_free(struct vm_area_struct *vma) {}

#endif /* CONFIG_PER_VMA_LOCK */

/*
 * Allocate a vm_area_struct for @mm.
 *
 * The object comes from vm_area_cachep, is set up with vma_init() and then
 * gets its per-VMA lock.  If the lock cannot be allocated the VMA is
 * returned to the cache.
 *
 * Returns the new VMA, or NULL if either allocation failed.
 */
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
        struct vm_area_struct *area;

        area = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
        if (!area)
                return NULL;

        vma_init(area, mm);
        if (vma_lock_alloc(area))
                return area;

        /* No lock available: give the half-built VMA back. */
        kmem_cache_free(vm_area_cachep, area);
        return NULL;
}

/*
 * Create a copy of the VMA @orig.
 *
 * The new object starts as a byte copy of @orig and is then given its own
 * per-VMA lock, an empty anon_vma_chain list, fresh NUMA-balancing state and
 * a duplicate of @orig's anon VMA name.
 *
 * Returns the copy, or NULL if the VMA or its lock could not be allocated.
 */
struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
        struct vm_area_struct *copy;

        copy = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
        if (!copy)
                return NULL;

        ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
        ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
        /*
         * orig->shared.rb may be modified concurrently, but the clone
         * will be reinitialized.
         */
        data_race(memcpy(copy, orig, sizeof(*copy)));

        if (vma_lock_alloc(copy)) {
                INIT_LIST_HEAD(&copy->anon_vma_chain);
                vma_numab_state_init(copy);
                dup_anon_vma_name(orig, copy);
                return copy;
        }

        /* No lock available: discard the copy. */
        kmem_cache_free(vm_area_cachep, copy);
        return NULL;
}

/*
 * Free @vma immediately: release its NUMA-balancing state, anon VMA name
 * and per-VMA lock, then return the structure to vm_area_cachep.
 */
void __vm_area_free(struct vm_area_struct *vma)
{
        vma_numab_state_free(vma);
        free_anon_vma_name(vma);
        vma_lock_free(vma);
        /* The attached objects are gone; now free the VMA itself. */
        kmem_cache_free(vm_area_cachep, vma);
}

#ifdef CONFIG_PER_VMA_LOCK
/*
 * RCU callback queued by vm_area_free(): recover the VMA from its embedded
 * vm_rcu head, check that its lock is not held, and free it.
 */
static void vm_area_free_rcu_cb(struct rcu_head *head)
{
        struct vm_area_struct *dead;

        dead = container_of(head, struct vm_area_struct, vm_rcu);

        /* The vma should not be locked while being destroyed. */
        VM_BUG_ON_VMA(rwsem_is_locked(&dead->vm_lock->lock), dead);
        __vm_area_free(dead);
}
#endif

/*
 * Free @vma.  With CONFIG_PER_VMA_LOCK the free is deferred through
 * call_rcu() and done in vm_area_free_rcu_cb(); otherwise the VMA is freed
 * immediately.
 */
void vm_area_free(struct vm_area_struct *vma)
{
#ifdef CONFIG_PER_VMA_LOCK
        call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
#else
        __vm_area_free(vma);
#endif
}

/*
 * Adjust the NR_KERNEL_STACK_KB statistic for @tsk's stack.
 *
 * @account is a multiplier applied to the stack size in KiB;
 * exit_task_stack_account() passes -1 to subtract it.  With
 * CONFIG_VMAP_STACK the adjustment is made page by page, otherwise once for
 * the whole THREAD_SIZE stack.
 */
static void account_kernel_stack(struct task_struct *tsk, int account)
{
        struct vm_struct *area;
        int pg;

        if (!IS_ENABLED(CONFIG_VMAP_STACK)) {
                /* All stack pages are in the same node. */
                mod_lruvec_kmem_state(task_stack_page(tsk), NR_KERNEL_STACK_KB,
                                      account * (THREAD_SIZE / 1024));
                return;
        }

        area = task_stack_vm_area(tsk);
        for (pg = 0; pg < THREAD_SIZE / PAGE_SIZE; pg++)
                mod_lruvec_page_state(area->pages[pg], NR_KERNEL_STACK_KB,
                                      account * (PAGE_SIZE / 1024));
}

/*
 * Undo the accounting done for @tsk's stack: subtract it from
 * NR_KERNEL_STACK_KB and, with CONFIG_VMAP_STACK, uncharge each of its
 * pages.
 */
void exit_task_stack_account(struct task_struct *tsk)
{
        struct vm_struct *area;
        int pg;

        account_kernel_stack(tsk, -1);

        if (!IS_ENABLED(CONFIG_VMAP_STACK))
                return;

        area = task_stack_vm_area(tsk);
        for (pg = 0; pg < THREAD_SIZE / PAGE_SIZE; pg++)
                memcg_kmem_uncharge_page(area->pages[pg], 0);
}

/*
 * Free @tsk's stack, but only if the task state is TASK_DEAD.  For any
 * other state a warning is emitted and the stack is left alone.
 */
static void release_task_stack(struct task_struct *tsk)
{
        /* Better to leak the stack than to free prematurely */
        if (!WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
                free_thread_stack(tsk);
}

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * Drop one reference on @tsk's stack; the stack is released when the last
 * reference goes away.
 */
void put_task_stack(struct task_struct *tsk)
{
        if (!refcount_dec_and_test(&tsk->stack_refcount))
                return;

        release_task_stack(tsk);
}
#endif

/*
 * Final teardown of @tsk: release the remaining per-task resources and
 * return the task_struct to its slab cache.
 *
 * Warns once if a seccomp filter is still attached.  Without
 * CONFIG_THREAD_INFO_IN_TASK the stack is released here; with it, the stack
 * reference count is expected to be zero already.
 */
void free_task(struct task_struct *tsk)
{
#ifdef CONFIG_SECCOMP
        WARN_ON_ONCE(tsk->seccomp.filter);
#endif
        release_user_cpus_ptr(tsk);
        scs_release(tsk);

#ifndef CONFIG_THREAD_INFO_IN_TASK
        /*
         * The task is finally done with both the stack and thread_info,
         * so free both.
         */
        release_task_stack(tsk);
#else
        /*
         * If the task had a separate stack allocation, it should be gone
         * by now.
         */
        WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
#endif
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
        arch_release_task_struct(tsk);
        /* Only tasks flagged PF_KTHREAD have a kthread struct to free. */
        if (tsk->flags & PF_KTHREAD)
                free_kthread_struct(tsk);
        bpf_task_storage_free(tsk);
        /* Last step: the task_struct itself. */
        free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);

/*
 * Make @mm refer to the same executable file as @oldmm. The reference
 * obtained from get_mm_exe_file() is stored directly in @mm->exe_file.
 */
static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
{
        struct file *exe = get_mm_exe_file(oldmm);

        RCU_INIT_POINTER(mm->exe_file, exe);
        if (!exe)
                return;

        /*
         * oldmm is expected to have denied write access to this file
         * already, so a failure here is only worth a one-time warning.
         */
        if (deny_write_access(exe))
                pr_warn_once("deny_write_access() failed in %s\n", __func__);
}

#ifdef CONFIG_MMU
/*
 * dup_mmap() - populate @mm with a copy of @oldmm's mappings.
 *
 * Takes @oldmm's mmap lock for writing (killable) and then @mm's mmap lock
 * nested inside it. The VMA maple tree is duplicated wholesale with
 * __mt_dup(); the loop then walks the new tree and replaces each entry with
 * a freshly duplicated VMA, or clears the entry for VM_DONTCOPY mappings.
 *
 * Returns 0 on success. On failure it returns -EINTR (lock acquisition
 * failed or a fatal signal is pending), -ENOMEM (every path through the
 * fail_nomem* labels) or the error code of the helper that failed
 * (ksm_fork(), __mt_dup(), vma_iter_clear_gfp(), copy_page_range(),
 * arch_dup_mmap()).
 */
static __latent_entropy int dup_mmap(struct mm_struct *mm,
                                        struct mm_struct *oldmm)
{
        struct vm_area_struct *mpnt, *tmp;
        int retval;
        unsigned long charge = 0;
        LIST_HEAD(uf);
        VMA_ITERATOR(vmi, mm, 0);

        uprobe_start_dup_mmap();
        if (mmap_write_lock_killable(oldmm)) {
                retval = -EINTR;
                goto fail_uprobe_end;
        }
        flush_cache_dup_mm(oldmm);
        uprobe_dup_mmap(oldmm, mm);
        /*
         * Not linked in yet - no deadlock potential:
         */
        mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);

        /* No ordering required: file already has been exposed. */
        dup_mm_exe_file(mm, oldmm);

        /* Start from the parent's totals; VM_DONTCOPY VMAs are subtracted below. */
        mm->total_vm = oldmm->total_vm;
        mm->data_vm = oldmm->data_vm;
        mm->exec_vm = oldmm->exec_vm;
        mm->stack_vm = oldmm->stack_vm;

        retval = ksm_fork(mm, oldmm);
        if (retval)
                goto out;
        khugepaged_fork(mm, oldmm);

        /* Use __mt_dup() to efficiently build an identical maple tree. */
        retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
        if (unlikely(retval))
                goto out;

        mt_clear_in_rcu(vmi.mas.tree);
        for_each_vma(vmi, mpnt) {
                struct file *file;

                vma_start_write(mpnt);
                if (mpnt->vm_flags & VM_DONTCOPY) {
                        /* Not inherited: drop the entry from the new tree. */
                        retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
                                                    mpnt->vm_end, GFP_KERNEL);
                        if (retval)
                                goto loop_out;

                        vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
                        continue;
                }
                /* Per-VMA charge; undone by vm_unacct_memory() at fail_nomem. */
                charge = 0;
                /*
                 * Don't duplicate many vmas if we've been oom-killed (for
                 * example)
                 */
                if (fatal_signal_pending(current)) {
                        retval = -EINTR;
                        goto loop_out;
                }
                if (mpnt->vm_flags & VM_ACCOUNT) {
                        unsigned long len = vma_pages(mpnt);

                        if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
                                goto fail_nomem;
                        charge = len;
                }
                tmp = vm_area_dup(mpnt);
                if (!tmp)
                        goto fail_nomem;
                retval = vma_dup_policy(mpnt, tmp);
                if (retval)
                        goto fail_nomem_policy;
                tmp->vm_mm = mm;
                retval = dup_userfaultfd(tmp, &uf);
                if (retval)
                        goto fail_nomem_anon_vma_fork;
                if (tmp->vm_flags & VM_WIPEONFORK) {
                        /*
                         * VM_WIPEONFORK gets a clean slate in the child.
                         * Don't prepare anon_vma until fault since we don't
                         * copy page for current vma.
                         */
                        tmp->anon_vma = NULL;
                } else if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
                vm_flags_clear(tmp, VM_LOCKED_MASK);
                /*
                 * Copy/update hugetlb private vma information.
                 */
                if (is_vm_hugetlb_page(tmp))
                        hugetlb_dup_vma_private(tmp);

                /*
                 * Link the vma into the MT. After using __mt_dup(), memory
                 * allocation is not necessary here, so it cannot fail.
                 */
                vma_iter_bulk_store(&vmi, tmp);

                mm->map_count++;

                if (tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                file = tmp->vm_file;
                if (file) {
                        struct address_space *mapping = file->f_mapping;

                        get_file(file);
                        i_mmap_lock_write(mapping);
                        if (vma_is_shared_maywrite(tmp))
                                mapping_allow_writable(mapping);
                        flush_dcache_mmap_lock(mapping);
                        /* insert tmp into the share list, just after mpnt */
                        vma_interval_tree_insert_after(tmp, mpnt,
                                        &mapping->i_mmap);
                        flush_dcache_mmap_unlock(mapping);
                        i_mmap_unlock_write(mapping);
                }

                /* retval is 0 here, so a skipped copy counts as success. */
                if (!(tmp->vm_flags & VM_WIPEONFORK))
                        retval = copy_page_range(tmp, mpnt);

                if (retval) {
                        /*
                         * tmp is already stored in the tree, so step to the
                         * following VMA: loop_out uses mpnt as the position
                         * of the failure marker.
                         */
                        mpnt = vma_next(&vmi);
                        goto loop_out;
                }
        }
        /* a new mm has just been created */
        retval = arch_dup_mmap(oldmm, mm);
loop_out:
        vma_iter_free(&vmi);
        if (!retval) {
                mt_set_in_rcu(vmi.mas.tree);
        } else if (mpnt) {
                /*
                 * The entire maple tree has already been duplicated. If the
                 * mmap duplication fails, mark the failure point with
                 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
                 * stop releasing VMAs that have not been duplicated after this
                 * point.
                 */
                mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
                mas_store(&vmi.mas, XA_ZERO_ENTRY);
        }
out:
        mmap_write_unlock(mm);
        flush_tlb_mm(oldmm);
        mmap_write_unlock(oldmm);
        dup_userfaultfd_complete(&uf);
fail_uprobe_end:
        uprobe_end_dup_mmap();
        return retval;

        /*
         * Error unwinding for a partially set up tmp. All three labels end
         * up reporting -ENOMEM, whatever value retval held on entry.
         */
fail_nomem_anon_vma_fork:
        mpol_put(vma_policy(tmp));
fail_nomem_policy:
        vm_area_free(tmp);
fail_nomem:
        retval = -ENOMEM;
        vm_unacct_memory(charge);
        goto loop_out;
}

/*
 * Allocate the top-level page directory for @mm and store it in mm->pgd.
 * Returns 0 on success or -ENOMEM if pgd_alloc() returned NULL.
 */
static inline int mm_alloc_pgd(struct mm_struct *mm)
{
        mm->pgd = pgd_alloc(mm);

        return unlikely(!mm->pgd) ? -ENOMEM : 0;
}

/* Counterpart of mm_alloc_pgd(): hand @mm's page directory to pgd_free(). */
static inline void mm_free_pgd(struct mm_struct *mm)
{
        pgd_free(mm, mm->pgd);
}
#else
/*
 * !CONFIG_MMU variant of dup_mmap(): only the exe_file reference is
 * duplicated, under @oldmm's mmap write lock. Always returns 0.
 */
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
        mmap_write_lock(oldmm);
        dup_mm_exe_file(mm, oldmm);
        mmap_write_unlock(oldmm);
        return 0;
}
#define mm_alloc_pgd(mm)        (0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */

/*
 * Sanity-check an mm that is about to be freed: every RSS counter and the
 * page-table byte count are expected to be zero. Non-zero values are
 * reported with pr_alert() but do not stop the teardown.
 */
static void check_mm(struct mm_struct *mm)
{
        int type;

        BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
                         "Please make sure 'struct resident_page_types[]' is updated as well");

        for (type = 0; type < NR_MM_COUNTERS; type++) {
                long leftover = percpu_counter_sum(&mm->rss_stat[type]);

                if (unlikely(leftover))
                        pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
                                 mm, resident_page_types[type], leftover);
        }

        if (mm_pgtables_bytes(mm))
                pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
                                mm_pgtables_bytes(mm));

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        /* A deposited huge-pmd page table must not be left behind either. */
        VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}

/* mm_struct objects are allocated from and returned to the mm_cachep slab cache. */
#define allocate_mm()        (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm)        (kmem_cache_free(mm_cachep, (mm)))

/*
 * Cross-CPU callback (see cleanup_lazy_tlbs()): warn once if the task
 * running on this CPU still has the mm passed in @arg as its active_mm.
 */
static void do_check_lazy_tlb(void *arg)
{
        struct mm_struct *dying_mm = arg;

        WARN_ON_ONCE(dying_mm == current->active_mm);
}

/*
 * Cross-CPU callback (see cleanup_lazy_tlbs()): if the current task on this
 * CPU is using the mm passed in @arg as its active_mm, switch it over to
 * init_mm.
 */
static void do_shoot_lazy_tlb(void *arg)
{
        struct mm_struct *dying_mm = arg;

        if (current->active_mm != dying_mm)
                return;

        /* Having current->mm set at this point is unexpected; warn once. */
        WARN_ON_ONCE(current->mm);
        current->active_mm = &init_mm;
        switch_mm(dying_mm, &init_mm, current);
}

/*
 * Make sure no CPU keeps using @mm as a lazy tlb mm before it is freed.
 * Called from __mmdrop().
 */
static void cleanup_lazy_tlbs(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
                /*
                 * In this case, lazy tlb mms are refcounted and would not reach
                 * __mmdrop until all CPUs have switched away and mmdrop()ed.
                 */
                return;
        }

        /*
         * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
         * requires lazy mm users to switch to another mm when the refcount
         * drops to zero, before the mm is freed. This requires IPIs here to
         * switch kernel threads to init_mm.
         *
         * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
         * switch with the final userspace teardown TLB flush which leaves the
         * mm lazy on this CPU but no others, reducing the need for additional
         * IPIs here. There are cases where a final IPI is still required here,
         * such as the final mmdrop being performed on a different CPU than the
         * one exiting, or kernel threads using the mm when userspace exits.
         *
         * IPI overheads have not been found to be expensive, but they could be
         * reduced in a number of possible ways, for example (roughly
         * increasing order of complexity):
         * - The last lazy reference created by exit_mm() could instead switch
         *   to init_mm, however it's probable this will run on the same CPU
         *   immediately afterwards, so this may not reduce IPIs much.
         * - A batch of mms requiring IPIs could be gathered and freed at once.
         * - CPUs store active_mm where it can be remotely checked without a
         *   lock, to filter out false-positives in the cpumask.
         * - After mm_users or mm_count reaches zero, switching away from the
         *   mm could clear mm_cpumask to reduce some IPIs, perhaps together
         *   with some batching or delaying of the final IPIs.
         * - A delayed freeing and RCU-like quiescing sequence based on mm
         *   switching to avoid IPIs completely.
         */
        on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
        /* Debug option: verify on every CPU that the shootdown worked. */
        if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
                on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
}

/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
void __mmdrop(struct mm_struct *mm)
{
        /* init_mm must never be freed; the caller's own mm is unexpected. */
        BUG_ON(mm == &init_mm);
        WARN_ON_ONCE(mm == current->mm);

        /* Ensure no CPUs are using this as their lazy tlb mm */
        cleanup_lazy_tlbs(mm);

        WARN_ON_ONCE(mm == current->active_mm);
        /* Undo what mm_init() set up, then give the mm back to its cache. */
        mm_free_pgd(mm);
        destroy_context(mm);
        mmu_notifier_subscriptions_destroy(mm);
        check_mm(mm);
        put_user_ns(mm->user_ns);
        mm_pasid_drop(mm);
        mm_destroy_cid(mm);
        percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);

        free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);

/*
 * Work handler queued by mmdrop_async(): @work is the async_put_work
 * member embedded in the mm that has to be dropped.
 */
static void mmdrop_async_fn(struct work_struct *work)
{
        __mmdrop(container_of(work, struct mm_struct, async_put_work));
}

/*
 * Drop one mm_count reference on @mm. If it was the last one, the actual
 * __mmdrop() is deferred to a work item instead of running in the
 * caller's context.
 */
static void mmdrop_async(struct mm_struct *mm)
{
        if (!unlikely(atomic_dec_and_test(&mm->mm_count)))
                return;

        INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
        schedule_work(&mm->async_put_work);
}

/*
 * Release everything attached to @sig and return it to signal_cachep.
 */
static inline void free_signal_struct(struct signal_struct *sig)
{
        struct mm_struct *oom_mm;

        taskstats_tgid_free(sig);
        sched_autogroup_exit(sig);

        /*
         * On x86 __mmdrop() must not run in softirq context because of
         * pgd_dtor, hence the reference is dropped asynchronously.
         */
        oom_mm = sig->oom_mm;
        if (oom_mm)
                mmdrop_async(oom_mm);

        kmem_cache_free(signal_cachep, sig);
}

/* Drop one sigcnt reference on @sig; the last reference frees it. */
static inline void put_signal_struct(struct signal_struct *sig)
{
        if (!refcount_dec_and_test(&sig->sigcnt))
                return;

        free_signal_struct(sig);
}

/*
 * Release a task_struct whose usage count has dropped to zero: tear down
 * the subsystem state hanging off @tsk, then free the task with free_task().
 */
void __put_task_struct(struct task_struct *tsk)
{
        /*
         * Sanity checks: the task is expected to have exited, to have no
         * remaining usage references, and not to be the running task.
         */
        WARN_ON(!tsk->exit_state);
        WARN_ON(refcount_read(&tsk->usage));
        WARN_ON(tsk == current);

        io_uring_free(tsk);
        cgroup_free(tsk);
        task_numa_free(tsk, true);
        security_task_free(tsk);
        exit_creds(tsk);
        delayacct_tsk_free(tsk);
        put_signal_struct(tsk->signal);
        sched_core_free(tsk);
        free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);

/*
 * RCU-callback flavour of __put_task_struct(): @rhp is the rcu member
 * embedded in the task_struct to release.
 */
void __put_task_struct_rcu_cb(struct rcu_head *rhp)
{
        __put_task_struct(container_of(rhp, struct task_struct, rcu));
}
EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);

/*
 * Weak default that does nothing; called from fork_init() right after the
 * task_struct cache has been created.
 */
void __init __weak arch_task_cache_init(void) { }

/*
 * set_max_threads
 */
static void set_max_threads(unsigned int max_threads_suggested)
{
        u64 threads;
        unsigned long nr_pages = totalram_pages();

        /*
         * The number of threads shall be limited such that the thread
         * structures may only consume a small part of the available memory.
         */
        if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
                threads = MAX_THREADS;
        else
                threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
                                    (u64) THREAD_SIZE * 8UL);

        if (threads > max_threads_suggested)
                threads = max_threads_suggested;

        max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
}

#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/*
 * Initialized by the architecture; fork_init() uses it as the object size
 * of the task_struct slab cache.
 */
int arch_task_struct_size __read_mostly;
#endif

/*
 * Compute the usercopy whitelist window of task_struct.
 * @offset: out, start of the window relative to struct task_struct
 * @size:   out, length of the window (0 means there is none)
 */
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
        /* The architecture reports the window relative to thread_struct. */
        arch_thread_struct_whitelist(offset, size);

        /* Zero-sized whitelist or empty thread_struct: no window at all. */
        if (unlikely(*size == 0)) {
                *offset = 0;
                return;
        }

        /* Rebase the offset onto the position of 'thread' in task_struct. */
        *offset += offsetof(struct task_struct, thread);
}

/*
 * Boot-time initialisation for fork: creates the task_struct slab cache,
 * computes max_threads, and derives init_task's default RLIMIT_NPROC /
 * RLIMIT_SIGPENDING and the initial user namespace's ucount limits from it.
 */
void __init fork_init(void)
{
        int i;
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN        0
#endif
        /* Align task_structs to at least a cache line. */
        int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
        unsigned long useroffset, usersize;

        /* create a slab on which task_structs can be allocated */
        task_struct_whitelist(&useroffset, &usersize);
        task_struct_cachep = kmem_cache_create_usercopy("task_struct",
                        arch_task_struct_size, align,
                        SLAB_PANIC|SLAB_ACCOUNT,
                        useroffset, usersize, NULL);

        /* do the arch specific task caches init */
        arch_task_cache_init();

        set_max_threads(MAX_THREADS);

        /* By default a single user may own at most half of all threads. */
        init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
        init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
        init_task.signal->rlim[RLIMIT_SIGPENDING] =
                init_task.signal->rlim[RLIMIT_NPROC];

        for (i = 0; i < UCOUNT_COUNTS; i++)
                init_user_ns.ucount_max[i] = max_threads/2;

        set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC,      RLIM_INFINITY);
        set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE,   RLIM_INFINITY);
        set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
        set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK,    RLIM_INFINITY);

#ifdef CONFIG_VMAP_STACK
        /* Registers free_vm_stack_cache() as a CPU hotplug callback. */
        cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
                          NULL, free_vm_stack_cache);
#endif

        scs_init();

        lockdep_init_task(&init_task);
        uprobes_init();
}

/*
 * Weak default for copying a task_struct: a plain structure assignment of
 * *@src to *@dst. Always returns 0; dup_task_struct() treats a non-zero
 * return as failure.
 */
int __weak arch_dup_task_struct(struct task_struct *dst,
                                               struct task_struct *src)
{
        *dst = *src;
        return 0;
}

/*
 * Write STACK_END_MAGIC into the word returned by end_of_stack() for @tsk,
 * so that a stack overflow overwriting it can be detected.
 */
void set_task_stack_end_magic(struct task_struct *tsk)
{
        unsigned long *magic_slot = end_of_stack(tsk);

        *magic_slot = STACK_END_MAGIC;
}

/*
 * dup_task_struct() - allocate a task_struct that starts out as a copy of @orig.
 * @orig: the task to copy from.
 * @node: NUMA node to allocate on; NUMA_NO_NODE lets tsk_fork_get_node()
 *        pick one.
 *
 * The new task gets its own kernel stack and has the per-task fields that
 * must not be inherited reset below.
 *
 * Return: the new task, or NULL if any allocation/preparation step failed
 * (everything allocated so far is released again).
 */
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
        struct task_struct *tsk;
        int err;

        if (node == NUMA_NO_NODE)
                node = tsk_fork_get_node(orig);
        tsk = alloc_task_struct_node(node);
        if (!tsk)
                return NULL;

        err = arch_dup_task_struct(tsk, orig);
        if (err)
                goto free_tsk;

        err = alloc_thread_stack_node(tsk, node);
        if (err)
                goto free_tsk;

#ifdef CONFIG_THREAD_INFO_IN_TASK
        refcount_set(&tsk->stack_refcount, 1);
#endif
        /* Undone by exit_task_stack_account() on the free_stack path. */
        account_kernel_stack(tsk, 1);

        err = scs_prepare(tsk, node);
        if (err)
                goto free_stack;

#ifdef CONFIG_SECCOMP
        /*
         * We must handle setting up seccomp filters once we're under
         * the sighand lock in case orig has changed between now and
         * then. Until then, filter must be NULL to avoid messing up
         * the usage counts on the error path calling free_task.
         */
        tsk->seccomp.filter = NULL;
#endif

        setup_thread_stack(tsk, orig);
        clear_user_return_notifier(tsk);
        clear_tsk_need_resched(tsk);
        set_task_stack_end_magic(tsk);
        clear_syscall_work_syscall_user_dispatch(tsk);

#ifdef CONFIG_STACKPROTECTOR
        tsk->stack_canary = get_random_canary();
#endif
        /*
         * The struct copy left cpus_ptr pointing into @orig; if @orig used
         * its own embedded mask, point the copy at its own mask as well.
         */
        if (orig->cpus_ptr == &orig->cpus_mask)
                tsk->cpus_ptr = &tsk->cpus_mask;
        dup_user_cpus_ptr(tsk, orig, node);

        /*
         * One for the user space visible state that goes away when reaped.
         * One for the scheduler.
         */
        refcount_set(&tsk->rcu_users, 2);
        /* One for the rcu users */
        refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
        tsk->btrace_seq = 0;
#endif
        /* Reset state copied from @orig that the new task must not share. */
        tsk->splice_pipe = NULL;
        tsk->task_frag.page = NULL;
        tsk->wake_q.next = NULL;
        tsk->worker_private = NULL;

        kcov_task_init(tsk);
        kmsan_task_create(tsk);
        kmap_local_fork(tsk);

#ifdef CONFIG_FAULT_INJECTION
        tsk->fail_nth = 0;
#endif

#ifdef CONFIG_BLK_CGROUP
        tsk->throttle_disk = NULL;
        tsk->use_memdelay = 0;
#endif

#ifdef CONFIG_ARCH_HAS_CPU_PASID
        tsk->pasid_activated = 0;
#endif

#ifdef CONFIG_MEMCG
        tsk->active_memcg = NULL;
#endif

#ifdef CONFIG_CPU_SUP_INTEL
        tsk->reported_split_lock = 0;
#endif

#ifdef CONFIG_SCHED_MM_CID
        tsk->mm_cid = -1;
        tsk->last_mm_cid = -1;
        tsk->mm_cid_active = 0;
        tsk->migrate_from_cpu = -1;
#endif
        return tsk;

        /* Error unwinding, in reverse order of setup. */
free_stack:
        exit_task_stack_account(tsk);
        free_thread_stack(tsk);
free_tsk:
        free_task_struct(tsk);
        return NULL;
}

/* Taken by __mmput() around the removal of an mm from its mmlist. */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);

/*
 * Initial mm->flags for an mm created by a task without an mm (see
 * mm_init()); can be overridden with the "coredump_filter=" boot parameter.
 */
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;

/*
 * Handler for the "coredump_filter=" boot parameter: parse @s as a number
 * (base auto-detected), move it into the dump-filter bit range and store
 * the result in default_dump_filter. Always returns 1.
 */
static int __init coredump_filter_setup(char *s)
{
        unsigned long requested = simple_strtoul(s, NULL, 0);

        default_dump_filter = (requested << MMF_DUMP_FILTER_SHIFT) &
                              MMF_DUMP_FILTER_MASK;
        return 1;
}

__setup("coredump_filter=", coredump_filter_setup);

#include <linux/init_task.h>

/* Reset the AIO state of a new mm; a no-op without CONFIG_AIO. */
static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
        mm->ioctx_table = NULL;
        spin_lock_init(&mm->ioctx_lock);
#endif
}

/*
 * Clear @mm's owner, but only if it currently is @p.
 * A no-op without CONFIG_MEMCG.
 */
static __always_inline void mm_clear_owner(struct mm_struct *mm,
                                           struct task_struct *p)
{
#ifdef CONFIG_MEMCG
        if (mm->owner != p)
                return;

        WRITE_ONCE(mm->owner, NULL);
#endif
}

/* Record @p as the owner of @mm; a no-op without CONFIG_MEMCG. */
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
        mm->owner = p;
#endif
}

/* Start a new mm without an xol_area; a no-op without CONFIG_UPROBES. */
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
        mm->uprobes_state.xol_area = NULL;
#endif
}

/*
 * mm_init() - initialise an mm_struct obtained from allocate_mm().
 * @mm:      the mm to set up (zeroed by mm_alloc(), or a raw copy of the
 *           parent in dup_mm()).
 * @p:       task recorded as the mm's owner and passed to init_new_context().
 * @user_ns: user namespace the mm takes a reference on.
 *
 * Returns @mm on success. On failure everything set up so far is undone,
 * @mm itself is freed with free_mm() and NULL is returned, so the caller
 * must not free @mm again.
 */
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        struct user_namespace *user_ns)
{
        mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
        mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
        /* The caller holds the first mm_users and mm_count references. */
        atomic_set(&mm->mm_users, 1);
        atomic_set(&mm->mm_count, 1);
        seqcount_init(&mm->write_protect_seq);
        mmap_init_lock(mm);
        INIT_LIST_HEAD(&mm->mmlist);
#ifdef CONFIG_PER_VMA_LOCK
        mm->mm_lock_seq = 0;
#endif
        mm_pgtables_bytes_init(mm);
        mm->map_count = 0;
        mm->locked_vm = 0;
        atomic64_set(&mm->pinned_vm, 0);
        memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
        spin_lock_init(&mm->page_table_lock);
        spin_lock_init(&mm->arg_lock);
        mm_init_cpumask(mm);
        mm_init_aio(mm);
        mm_init_owner(mm, p);
        mm_pasid_init(mm);
        RCU_INIT_POINTER(mm->exe_file, NULL);
        mmu_notifier_subscriptions_init(mm);
        init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        mm->pmd_huge_pte = NULL;
#endif
        mm_init_uprobes_state(mm);
        hugetlb_count_init(mm);

        /*
         * Flags are derived from the calling task's mm when it has one;
         * otherwise the boot-time default dump filter is used.
         */
        if (current->mm) {
                mm->flags = mmf_init_flags(current->mm->flags);
                mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
        } else {
                mm->flags = default_dump_filter;
                mm->def_flags = 0;
        }

        /* From here on each step can fail; unwind in reverse order below. */
        if (mm_alloc_pgd(mm))
                goto fail_nopgd;

        if (init_new_context(p, mm))
                goto fail_nocontext;

        if (mm_alloc_cid(mm))
                goto fail_cid;

        if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
                                     NR_MM_COUNTERS))
                goto fail_pcpu;

        mm->user_ns = get_user_ns(user_ns);
        lru_gen_init_mm(mm);
        return mm;

fail_pcpu:
        mm_destroy_cid(mm);
fail_cid:
        destroy_context(mm);
fail_nocontext:
        mm_free_pgd(mm);
fail_nopgd:
        free_mm(mm);
        return NULL;
}

/*
 * Allocate a zeroed mm_struct and initialise it for the current task and
 * its user namespace. Returns NULL if the allocation or mm_init() fails.
 */
struct mm_struct *mm_alloc(void)
{
        struct mm_struct *new_mm = allocate_mm();

        if (new_mm) {
                memset(new_mm, 0, sizeof(*new_mm));
                /* mm_init() frees new_mm itself when it fails. */
                new_mm = mm_init(new_mm, current, current_user_ns());
        }

        return new_mm;
}

/*
 * Tear down the user-visible part of @mm once mm_users has reached zero,
 * then drop the mm_count reference with mmdrop().
 */
static inline void __mmput(struct mm_struct *mm)
{
        VM_BUG_ON(atomic_read(&mm->mm_users));

        uprobe_clear_state(mm);
        exit_aio(mm);
        ksm_exit(mm);
        khugepaged_exit(mm); /* must run before exit_mmap */
        exit_mmap(mm);
        mm_put_huge_zero_folio(mm);
        /* Drops the mm's reference on its executable file. */
        set_mm_exe_file(mm, NULL);
        if (!list_empty(&mm->mmlist)) {
                spin_lock(&mmlist_lock);
                list_del(&mm->mmlist);
                spin_unlock(&mmlist_lock);
        }
        /* Release the binfmt module reference, if the mm holds one. */
        if (mm->binfmt)
                module_put(mm->binfmt->module);
        lru_gen_del_mm(mm);
        mmdrop(mm);
}

/*
 * Drop one mm_users reference on @mm. The caller that drops the last one
 * tears the address space down via __mmput(). May sleep.
 */
void mmput(struct mm_struct *mm)
{
        might_sleep();

        if (!atomic_dec_and_test(&mm->mm_users))
                return;

        __mmput(mm);
}
EXPORT_SYMBOL_GPL(mmput);

#ifdef CONFIG_MMU
/*
 * Work handler queued by mmput_async(): @work is the async_put_work member
 * embedded in the mm to tear down.
 */
static void mmput_async_fn(struct work_struct *work)
{
        __mmput(container_of(work, struct mm_struct, async_put_work));
}

/*
 * Like mmput(), but when the last mm_users reference is dropped the
 * teardown is handed to a work item instead of running in the caller.
 */
void mmput_async(struct mm_struct *mm)
{
        if (!atomic_dec_and_test(&mm->mm_users))
                return;

        INIT_WORK(&mm->async_put_work, mmput_async_fn);
        schedule_work(&mm->async_put_work);
}
EXPORT_SYMBOL_GPL(mmput_async);
#endif

/**
 * set_mm_exe_file - change a reference to the mm's executable file
 * @mm: The mm to change.
 * @new_exe_file: The new file to use.
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
 *
 * Main users are mmput() and sys_execve(). Callers prevent concurrent
 * invocations: in mmput() nobody alive left, in execve it happens before
 * the new mm is made visible to anyone.
 *
 * Can only fail if new_exe_file != NULL.
 */
int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
        struct file *prev_exe;

        /*
         * No RCU protection is needed for this read: callers guarantee
         * that nobody else can reach this mm (see the function comment).
         */
        prev_exe = rcu_dereference_raw(mm->exe_file);

        if (new_exe_file) {
                /*
                 * The caller (i.e. sys_execve) is expected to have denied
                 * write access already, so failure is unlikely.
                 */
                if (unlikely(deny_write_access(new_exe_file)))
                        return -EACCES;
                get_file(new_exe_file);
        }
        rcu_assign_pointer(mm->exe_file, new_exe_file);

        if (!prev_exe)
                return 0;

        /* Undo the write denial and the reference held for the old file. */
        allow_write_access(prev_exe);
        fput(prev_exe);
        return 0;
}

/**
 * replace_mm_exe_file - replace a reference to the mm's executable file
 * @mm: The mm to change.
 * @new_exe_file: The new file to use.
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
 *
 * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
 */
int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
        struct file *cur_exe = get_mm_exe_file(mm);
        struct vm_area_struct *vma;
        int ret = 0;

        /* The change is refused while the old file is still mapped. */
        if (cur_exe) {
                VMA_ITERATOR(vmi, mm, 0);

                mmap_read_lock(mm);
                for_each_vma(vmi, vma) {
                        if (vma->vm_file &&
                            path_equal(&vma->vm_file->f_path,
                                       &cur_exe->f_path)) {
                                ret = -EBUSY;
                                break;
                        }
                }
                mmap_read_unlock(mm);
                /* Drop the temporary reference taken for the scan. */
                fput(cur_exe);
                if (ret)
                        return ret;
        }

        if (deny_write_access(new_exe_file))
                return -EACCES;
        get_file(new_exe_file);

        /* Swap the pointer under the mmap write lock. */
        mmap_write_lock(mm);
        cur_exe = rcu_dereference_raw(mm->exe_file);
        rcu_assign_pointer(mm->exe_file, new_exe_file);
        mmap_write_unlock(mm);

        if (!cur_exe)
                return 0;

        /* Release what the mm held on the file it pointed to before. */
        allow_write_access(cur_exe);
        fput(cur_exe);
        return 0;
}

/**
 * get_mm_exe_file - acquire a reference to the mm's executable file
 * @mm: The mm of interest.
 *
 * Returns %NULL if mm has no associated executable file.
 * User must release file via fput().
 */
struct file *get_mm_exe_file(struct mm_struct *mm)
{
        struct file *exe;

        /* The lookup and reference grab happen inside an RCU read section. */
        rcu_read_lock();
        exe = get_file_rcu(&mm->exe_file);
        rcu_read_unlock();

        return exe;
}

/**
 * get_task_exe_file - acquire a reference to the task's executable file
 * @task: The task.
 *
 * Returns %NULL if task's mm (if any) has no associated executable file or
 * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
 * User must release file via fput().
 */
struct file *get_task_exe_file(struct task_struct *task)
{
        struct file *exe = NULL;
        struct mm_struct *mm;

        task_lock(task);
        mm = task->mm;
        /* Kernel threads (PF_KTHREAD) never report an exe file. */
        if (mm && !(task->flags & PF_KTHREAD))
                exe = get_mm_exe_file(mm);
        task_unlock(task);

        return exe;
}

/**
 * get_task_mm - acquire a reference to the task's mm
 * @task: The task.
 *
 * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
 * this kernel workthread has transiently adopted a user mm with use_mm,
 * to do its AIO) is not set and if so returns a reference to it, after
 * bumping up the use count.  User must release the mm via mmput()
 * after use.  Typically used by /proc and ptrace.
 */
struct mm_struct *get_task_mm(struct task_struct *task)
{
        struct mm_struct *mm;

        task_lock(task);
        mm = task->mm;
        /* An mm attached to a kernel thread (PF_KTHREAD) is not handed out. */
        if (mm && (task->flags & PF_KTHREAD))
                mm = NULL;
        if (mm)
                mmget(mm);
        task_unlock(task);

        return mm;
}
EXPORT_SYMBOL_GPL(get_task_mm);

/*
 * Get a reference to @task's mm if the caller may access it with @mode.
 *
 * Returns the mm (release with mmput()), NULL if get_task_mm() yields no
 * mm, or an ERR_PTR: the error from down_read_killable(), or -EACCES when
 * ptrace_may_access() refuses. The caller's own mm skips the ptrace check.
 */
struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
{
        struct mm_struct *mm;
        int err;

        err = down_read_killable(&task->signal->exec_update_lock);
        if (err)
                return ERR_PTR(err);

        mm = get_task_mm(task);
        if (mm && mm != current->mm) {
                if (!ptrace_may_access(task, mode)) {
                        mmput(mm);
                        mm = ERR_PTR(-EACCES);
                }
        }
        up_read(&task->signal->exec_update_lock);

        return mm;
}

/*
 * Under task_lock(), detach @tsk's vfork_done completion (if it is still
 * set) and signal it.
 */
static void complete_vfork_done(struct task_struct *tsk)
{
        struct completion *done;

        task_lock(tsk);
        done = tsk->vfork_done;
        if (likely(done)) {
                tsk->vfork_done = NULL;
                complete(done);
        }
        task_unlock(tsk);
}

/*
 * Wait (killable, freezable) for @vfork to be completed and then drop a
 * task reference on @child. Returns the result of
 * wait_for_completion_state(); non-zero means the wait was cut short.
 */
static int wait_for_vfork_done(struct task_struct *child,
                                struct completion *vfork)
{
        int interrupted;

        cgroup_enter_frozen();
        interrupted = wait_for_completion_state(vfork,
                                                TASK_KILLABLE|TASK_FREEZABLE);
        cgroup_leave_frozen(false);

        if (interrupted) {
                /*
                 * Detach the completion from the child so that a later
                 * complete_vfork_done() finds nothing to signal.
                 */
                task_lock(child);
                child->vfork_done = NULL;
                task_unlock(child);
        }

        put_task_struct(child);
        return interrupted;
}

/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error success whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        uprobe_free_utask(tsk);

        /* Get rid of any cached register state */
        deactivate_mm(tsk, mm);

        /*
         * Signal userspace if we're not exiting with a core dump
         * because we want to leave the value intact for debugging
         * purposes.
         */
        if (tsk->clear_child_tid) {
                /* Only write and wake when the mm still has other users. */
                if (atomic_read(&mm->mm_users) > 1) {
                        /*
                         * We don't check the error code - if userspace has
                         * not set up a proper pointer then tough luck.
                         */
                        put_user(0, tsk->clear_child_tid);
                        do_futex(tsk->clear_child_tid, FUTEX_WAKE,
                                        1, NULL, NULL, 0, 0);
                }
                tsk->clear_child_tid = NULL;
        }

        /*
         * All done, finally we can wake up parent and return this mm to him.
         * Also kthread_stop() uses this completion for synchronization.
         */
        if (tsk->vfork_done)
                complete_vfork_done(tsk);
}

/*
 * exit_mm_release - release @mm from @tsk, futex_exit_release() variant
 * @tsk: task giving up the mm
 * @mm:  the mm being released
 *
 * Calls futex_exit_release() for @tsk and then performs the common
 * mm_release() steps.
 */
void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        futex_exit_release(tsk);
        mm_release(tsk, mm);
}

/*
 * exec_mm_release - release @mm from @tsk, futex_exec_release() variant
 * @tsk: task giving up the mm
 * @mm:  the mm being released
 *
 * Calls futex_exec_release() for @tsk and then performs the common
 * mm_release() steps.
 */
void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        futex_exec_release(tsk);
        mm_release(tsk, mm);
}

/**
 * dup_mm() - build a private copy of an existing mm structure
 * @tsk: the task_struct the new mm is initialised for (handed to mm_init()).
 * @oldmm: the mm to copy from.
 *
 * A fresh mm is allocated, seeded with a byte-for-byte copy of @oldmm,
 * re-initialised through mm_init() and then filled in by dup_mmap(). The
 * high-water marks are reset to the new mm's current usage, and a reference
 * on the binfmt module (if a binfmt is set) is taken before returning.
 *
 * Return: the duplicated mm, or NULL if any step failed.
 */
static struct mm_struct *dup_mm(struct task_struct *tsk,
                                struct mm_struct *oldmm)
{
        struct mm_struct *new_mm = allocate_mm();

        if (!new_mm)
                return NULL;

        /* Begin with a verbatim copy; mm_init() runs on top of it. */
        memcpy(new_mm, oldmm, sizeof(*new_mm));

        /*
         * No cleanup is done here when mm_init() fails.
         * NOTE(review): presumably mm_init() disposes of the mm itself on
         * failure - confirm against its definition.
         */
        if (!mm_init(new_mm, tsk, new_mm->user_ns))
                return NULL;

        if (dup_mmap(new_mm, oldmm))
                goto drop_new_mm;

        new_mm->hiwater_rss = get_mm_rss(new_mm);
        new_mm->hiwater_vm = new_mm->total_vm;

        if (new_mm->binfmt && !try_module_get(new_mm->binfmt->module))
                goto drop_new_mm;

        return new_mm;

drop_new_mm:
        /* The module reference was never obtained, so hide binfmt from mmput(). */
        new_mm->binfmt = NULL;
        mm_init_owner(new_mm, NULL);
        mmput(new_mm);
        return NULL;
}

/*
 * copy_mm - set up the address space of the new task @tsk
 * @clone_flags: clone flags; only CLONE_VM is examined here
 * @tsk:         the task being created
 *
 * Resets the fault and context-switch counters of @tsk, then either shares
 * the current task's mm (CLONE_VM, taking a reference with mmget()) or
 * duplicates it with dup_mm(). When the current task has no mm, @tsk is
 * left with mm and active_mm set to NULL.
 *
 * Return: 0 on success, -ENOMEM when dup_mm() fails.
 */
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
        struct mm_struct *parent_mm;
        struct mm_struct *child_mm;

        /* The new task starts with clean statistics. */
        tsk->maj_flt = 0;
        tsk->min_flt = 0;
        tsk->nivcsw = 0;
        tsk->nvcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
        tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
        tsk->last_switch_time = 0;
#endif

        tsk->mm = NULL;
        tsk->active_mm = NULL;

        /*
         * A parent without an mm means we are cloning a kernel thread;
         * there is nothing to copy or share in that case.
         */
        parent_mm = current->mm;
        if (!parent_mm)
                return 0;

        if (!(clone_flags & CLONE_VM)) {
                /* Private address space: make a full duplicate. */
                child_mm = dup_mm(tsk, parent_mm);
                if (!child_mm)
                        return -ENOMEM;
        } else {
                /* Shared address space: just pin the parent's mm. */
                mmget(parent_mm);
                child_mm = parent_mm;
        }

        tsk->mm = child_mm;
        tsk->active_mm = child_mm;
        sched_mm_cid_fork(tsk);
        return 0;
}

/*
 * copy_fs - give the new task @tsk its fs_struct
 * @clone_flags: clone flags; only CLONE_FS is examined here
 * @tsk:         the task being created
 *
 * Without CLONE_FS the current task's fs_struct is copied for @tsk.
 * With CLONE_FS the existing one is shared: its user count is bumped under
 * fs->lock, unless an exec is in progress on it.
 *
 * Return: 0 on success, -ENOMEM if the copy fails, -EAGAIN if the shared
 * fs_struct is marked in_exec.
 */
static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
        struct fs_struct *fs = current->fs;
        int ret = 0;

        if (!(clone_flags & CLONE_FS)) {
                tsk->fs = copy_fs_struct(fs);
                return tsk->fs ? 0 : -ENOMEM;
        }

        /*
         * Sharing: tsk->fs is already what we want, only the accounting
         * needs updating. "users" and "in_exec" are protected by fs->lock
         * for the benefit of check_unsafe_exec().
         */
        spin_lock(&fs->lock);
        if (fs->in_exec)
                ret = -EAGAIN;
        else
                fs->users++;
        spin_unlock(&fs->lock);

        return ret;
}

static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
                      int no_files)
{
        struct files_struct *oldf, *newf;
        int error = 0;

        /*
         * A background process may not have any files ...
         */
        oldf = current->files;
        if (!oldf)
                goto out;

        if (no_files) {
                tsk->files = NULL;
                goto out;
        }

        if (clone_flags & CLONE_FILES) {
                atomic_inc(&oldf->count);
                goto out;
        }

        newf = dup_fd(oldf, NR_OPEN_MAX, &error);
        if (!newf)
                goto out;

        tsk->files = newf;
        error = 0;
out:
        return error;
}

static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
        struct sighand_struct *sig;

        if (clone_flags & CLONE_SIGHAND) {
                refcount_inc(&current->sighand->count);
                return 0;
        }
        sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
        RCU_INIT_POINTER(tsk->sighand, sig);
        if (!sig)
                return -ENOMEM;

        refcount_set(&sig->count, 1);
        spin_lock_irq(&current->sighand->siglock);
        memcpy(sig->action, current->sighand->action, sizeof(sig->action));
        spin_unlock_irq(&current->sighand->siglock);

        /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
        if (clone_flags & CLONE_CLEAR_SIGHAND)
                flush_signal_handlers(tsk, 0);

        return 0;
}

/*
 * __cleanup_sighand - drop one reference on @sighand
 * @sighand: the signal handler table to release
 *
 * When the last reference goes away, signalfd_cleanup() is run and the
 * structure is returned to sighand_cachep.
 */
void __cleanup_sighand(struct sighand_struct *sighand)
{
        /* Somebody else still holds a reference: nothing more to do. */
        if (!refcount_dec_and_test(&sighand->count))
                return;

        signalfd_cleanup(sighand);

        /*
         * sighand_cachep is SLAB_TYPESAFE_BY_RCU, so the object can be
         * freed without waiting for an RCU grace period; see
         * __lock_task_sighand().
         */
        kmem_cache_free(sighand_cachep, sighand);
}

/*
 * Set up POSIX CPU timer handling for a thread group: the group's
 * posix_cputimers are initialised with the current RLIMIT_CPU soft limit
 * found in @sig.
 */
static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
        unsigned long cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);

        posix_cputimers_group_init(&sig->posix_cputimers, cpu_limit);
}

/*
 * copy_signal - set up the signal_struct of the new task @tsk
 * @clone_flags: clone flags; only CLONE_THREAD is examined here
 * @tsk:         the task being created
 *
 * With CLONE_THREAD nothing is allocated and @tsk->signal is left as it is.
 * Otherwise a zeroed signal_struct is allocated and initialised for a group
 * of exactly one thread, inheriting resource limits and OOM score
 * adjustments from the current task.
 *
 * Return: 0 on success, -ENOMEM if the allocation fails.
 */
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
        struct signal_struct *sig;

        if (clone_flags & CLONE_THREAD)
                return 0;

        sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
        /* Assigned before the NULL check, so tsk->signal is NULL on failure. */
        tsk->signal = sig;
        if (!sig)
                return -ENOMEM;

        /* The new group starts out with @tsk as its only thread. */
        sig->nr_threads = 1;
        sig->quick_threads = 1;
        atomic_set(&sig->live, 1);
        refcount_set(&sig->sigcnt, 1);

        /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
        sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
        tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);

        init_waitqueue_head(&sig->wait_chldexit);
        sig->curr_target = tsk;
        init_sigpending(&sig->shared_pending);
        INIT_HLIST_HEAD(&sig->multiprocess);
        seqlock_init(&sig->stats_lock);
        prev_cputime_init(&sig->prev_cputime);

#ifdef CONFIG_POSIX_TIMERS
        INIT_LIST_HEAD(&sig->posix_timers);
        hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        sig->real_timer.function = it_real_fn;
#endif

        /* Copy the resource limits while holding the group leader's task lock. */
        task_lock(current->group_leader);
        memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
        task_unlock(current->group_leader);

        /* Uses the RLIMIT_CPU value copied just above. */
        posix_cpu_timers_init_group(sig);

        tty_audit_fork(sig);
        sched_autogroup_fork(sig);

        sig->oom_score_adj = current->signal->oom_score_adj;
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;

        mutex_init(&sig->cred_guard_mutex);
        init_rwsem(&sig->exec_update_lock);

        return 0;
}

/*
 * copy_seccomp - propagate the current task's seccomp state to new task @p
 * @p: the task being created
 *
 * Compiles to an empty function without CONFIG_SECCOMP.
 */
static void copy_seccomp(struct task_struct *p)
{
#ifdef CONFIG_SECCOMP
        /*
         * Must be called with sighand->siglock held, which is common to
         * all threads in the group. Holding cred_guard_mutex is not
         * needed because this new task is not yet running and cannot
         * be racing exec.
         */
        assert_spin_locked(&current->sighand->siglock);

        /* Ref-count the new filter user, and assign it. */
        get_seccomp_filter(current);
        p->seccomp = current->seccomp;

        /*
         * Explicitly enable no_new_privs here in case it got set
         * between the task_struct being duplicated and holding the
         * sighand lock. The seccomp state and nnp must be in sync.
         */
        if (task_no_new_privs(current))
                task_set_no_new_privs(p);

        /*
         * If the parent gained a seccomp mode after the thread flags
         * were copied but before we took the sighand lock, we have
         * to manually enable the seccomp thread flag here.
         */
        if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
                set_task_syscall_work(p, SECCOMP);
#endif
}

/*
 * set_tid_address - record the calling task's clear_child_tid pointer
 * @tidptr: user-space address to store in current->clear_child_tid
 *
 * The pointer is stored as-is, without validation; mm_release() later
 * writes 0 through it and wakes a futex waiter on it.
 *
 * Return: the value of task_pid_vnr() for the calling task.
 */
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
        current->clear_child_tid = tidptr;

        return task_pid_vnr(current);
}

/*
 * rt_mutex_init_task - initialise the priority-inheritance state of task @p
 *
 * pi_lock is always initialised; the remaining pi_* fields are only reset
 * when CONFIG_RT_MUTEXES is enabled.
 */
static void rt_mutex_init_task(struct task_struct *p)
{
        raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
        /* Start with no PI waiters, no top task and not blocked on anything. */
        p->pi_waiters = RB_ROOT_CACHED;
        p->pi_top_task = NULL;
        p->pi_blocked_on = NULL;
#endif
}

static inline void init_task_pid_links(struct task_struct *task)
{
        enum pid_type type;

        for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_NODE(&task->pid_links[type]);
}

/*
 * init_task_pid - record @pid as the pid of kind @type for @task
 *
 * PIDTYPE_PID is stored in the task itself (thread_pid); every other type
 * is stored in the pids[] array of the task's signal_struct.
 */
static inline void
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
{
        if (type != PIDTYPE_PID) {
                task->signal->pids[type] = pid;
                return;
        }

        task->thread_pid = pid;
}

/*
 * rcu_copy_process - reset the RCU bookkeeping fields of new task @p
 *
 * Each group of fields is only touched when the corresponding RCU flavour
 * is configured in; with none of them enabled this function is empty.
 */
static inline void rcu_copy_process(struct task_struct *p)
{
#ifdef CONFIG_PREEMPT_RCU
        /* Not inside any RCU read-side critical section, not queued anywhere. */
        p->rcu_read_lock_nesting = 0;
        p->rcu_read_unlock_special.s = 0;
        p->rcu_blocked_node = NULL;
        INIT_LIST_HEAD(&p->rcu_node_entry);
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
        p->rcu_tasks_holdout = false;
        INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
        p->rcu_tasks_idle_cpu = -1;
        INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
        p->trc_reader_nesting = 0;
        p->trc_reader_special.s = 0;
        INIT_LIST_HEAD(&p->trc_holdout_list);
        INIT_LIST_HEAD(&p->trc_blkd_node);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}

/**
 * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
 * @pid:   the struct pid for which to create a pidfd
 * @flags: flags of the new @pidfd
 * @ret:   where to store the file backing the pidfd
 *
 * Reserves a file descriptor number in the caller's file descriptor table
 * and allocates a file that stashes @pid. The descriptor is only reserved,
 * not installed.
 *
 * No checks are performed on @pid, which makes this helper suitable for
 * CLONE_PIDFD, where @pid has no task attached yet at the time the pidfd
 * and its file are prepared.
 *
 * On success the caller owns both the reserved descriptor and the file and
 * must either install them with fd_install(), or undo the reservation with
 * put_unused_fd() and drop the file with fput(). This lets a caller reserve
 * a pidfd early while still being able to back out on a later failure
 * without leaking the pidfd into its file descriptor table.
 *
 * Return: On success, the reserved pidfd, with the new pidfd file stored in
 *         *@ret. On error, a negative error code; *@ret is left unchanged.
 */
static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
        struct file *file;
        int fd = get_unused_fd_flags(O_CLOEXEC);

        if (fd < 0)
                return fd;

        file = pidfs_alloc_file(pid, flags | O_RDWR);
        if (IS_ERR(file)) {
                /* Give the reserved descriptor number back. */
                put_unused_fd(fd);
                return PTR_ERR(file);
        }

        /*
         * PIDFD_THREAD is set on the file by hand. The earlier comment here
         * explained that anon_inode_getfile() ignores everything outside of
         * the O_ACCMODE | O_NONBLOCK mask.
         * NOTE(review): the file now comes from pidfs_alloc_file() - confirm
         * the same masking still applies.
         */
        file->f_flags |= (flags & PIDFD_THREAD);
        *ret = file;
        return fd;
}

/**
 * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
 * @pid:   the struct pid for which to create a pidfd
 * @flags: flags of the new @pidfd
 * @ret: Where to return the pidfd.
 *
 * Allocate a new file that stashes @pid and reserve a new pidfd number in the
 * caller's file descriptor table. The pidfd is reserved but not installed yet.
 *
 * The helper verifies that @pid is still in use, without PIDFD_THREAD the
 * task identified by @pid must be a thread-group leader.
 *
 * If this function returns successfully the caller is responsible to either
 * call fd_install() passing the returned pidfd and pidfd file as arguments in
 * order to install the pidfd into its file descriptor table or they must use
 * put_unused_fd() and fput() on the returned pidfd and pidfd file
 * respectively.
 *
 * This function is useful when a pidfd must already be reserved but there
 * might still be points of failure afterwards and the caller wants to ensure
 * that no pidfd is leaked into its file descriptor table.
 *
 * Return: On success, a reserved pidfd is returned from the function and a new
 *         pidfd file is returned in the last argument to the function. On
 *         error, a negative error code is returned from the function and the
 *         last argument remains unchanged.
 */
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
        bool thread = flags & PIDFD_THREAD;

        if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
                return -EINVAL;

        return __pidfd_prepare(pid, flags, ret);
}

/*
 * RCU callback used by delayed_free_task(): maps the rcu_head embedded in
 * a task_struct back to its task and frees that task.
 */
static void __delayed_free_task(struct rcu_head *rhp)
{
        struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);

        free_task(tsk);
}

/*
 * delayed_free_task - free @tsk, deferring through RCU when CONFIG_MEMCG
 * is enabled
 *
 * Without CONFIG_MEMCG the task is freed immediately; with it, the free is
 * queued via call_rcu() and carried out by __delayed_free_task().
 */
static __always_inline void delayed_free_task(struct task_struct *tsk)
{
        if (!IS_ENABLED(CONFIG_MEMCG)) {
                free_task(tsk);
                return;
        }

        call_rcu(&tsk->rcu, __delayed_free_task);
}

/*
 * copy_oom_score_adj - re-sync OOM score adjustments for an mm-sharing child
 * @clone_flags: clone flags of the fork in progress
 * @tsk:         the newly created task
 *
 * Only acts when @tsk has an mm and was created with CLONE_VM but with
 * neither CLONE_THREAD nor CLONE_VFORK. In that case the mm is flagged
 * MMF_MULTIPROCESS and the oom_score_adj values are copied again from the
 * current task, all under oom_adj_mutex.
 */
static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
{
        const u64 relevant = CLONE_VM | CLONE_THREAD | CLONE_VFORK;

        /*
         * Nothing to do for kernel threads (no mm), nor for anything other
         * than a plain CLONE_VM child: threads, vfork children and
         * children with their own mm are all skipped.
         */
        if (!tsk->mm || (clone_flags & relevant) != CLONE_VM)
                return;

        /* We need to synchronize with __set_oom_adj */
        mutex_lock(&oom_adj_mutex);

        set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);

        /* Pick up any change made since copy_signal() took its snapshot. */
        tsk->signal->oom_score_adj = current->signal->oom_score_adj;
        tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;

        mutex_unlock(&oom_adj_mutex);
}

#ifdef CONFIG_RV
/*
 * rv_task_fork - switch off every per-task RV monitor slot of new task @p
 *
 * Clears da_mon.monitoring in each of the RV_PER_TASK_MONITORS entries of
 * p->rv[]. Without CONFIG_RV this is a no-op macro.
 */
static void rv_task_fork(struct task_struct *p)
{
        int slot;

        for (slot = 0; slot < RV_PER_TASK_MONITORS; slot++) {
                p->rv[slot].da_mon.monitoring = false;
        }
}
#else
#define rv_task_fork(p) do {} while (0)
#endif

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 *
 * @pid:   pass &init_struct_pid to use that pid as-is; for any other value
 *         a new struct pid is allocated with alloc_pid() and replaces it.
 * @trace: OR-ed with CLONE_PTRACE to form the second argument of
 *         ptrace_init_task().
 * @node:  handed through to dup_task_struct().
 * @args:  clone arguments; args->flags supplies the clone flags.
 *
 * Returns the new task on success, or an ERR_PTR()-encoded error. On
 * failure, whatever had been set up so far is undone through the chain of
 * bad_fork_* labels at the end of the function.
 */
__latent_entropy struct task_struct *copy_process(
                                        struct pid *pid,
                                        int trace,
                                        int node,
                                        struct kernel_clone_args *args)
{
        int pidfd = -1, retval;
        struct task_struct *p;
        struct multiprocess_signals delayed;
        struct file *pidfile = NULL;
        const u64 clone_flags = args->flags;
        struct nsproxy *nsp = current->nsproxy;

        /*
         * Don't allow sharing the root directory with processes in a different
         * namespace
         */
        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                return ERR_PTR(-EINVAL);

        if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
                return ERR_PTR(-EINVAL);

        /*
         * Thread groups must share signals as well, and detached threads
         * can only be started up within the thread group.
         */
        if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
                return ERR_PTR(-EINVAL);

        /*
         * Shared signal handlers imply shared VM. By way of the above,
         * thread groups also imply shared VM. Blocking this case allows
         * for various simplifications in other code.
         */
        if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
                return ERR_PTR(-EINVAL);

        /*
         * Siblings of global init remain as zombies on exit since they are
         * not reaped by their parent (swapper). To solve this and to avoid
         * multi-rooted process trees, prevent global and container-inits
         * from creating siblings.
         */
        if ((clone_flags & CLONE_PARENT) &&
                                current->signal->flags & SIGNAL_UNKILLABLE)
                return ERR_PTR(-EINVAL);

        /*
         * If the new process will be in a different pid or user namespace
         * do not allow it to share a thread group with the forking task.
         */
        if (clone_flags & CLONE_THREAD) {
                if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
                    (task_active_pid_ns(current) != nsp->pid_ns_for_children))
                        return ERR_PTR(-EINVAL);
        }

        if (clone_flags & CLONE_PIDFD) {
                /*
                 * - CLONE_DETACHED is blocked so that we can potentially
                 *   reuse it later for CLONE_PIDFD.
                 */
                if (clone_flags & CLONE_DETACHED)
                        return ERR_PTR(-EINVAL);
        }

        /*
         * Force any signals received before this point to be delivered
         * before the fork happens.  Collect up signals sent to multiple
         * processes that happen during the fork and delay them so that
         * they appear to happen after the fork.
         */
        sigemptyset(&delayed.signal);
        INIT_HLIST_NODE(&delayed.node);

        spin_lock_irq(&current->sighand->siglock);
        if (!(clone_flags & CLONE_THREAD))
                hlist_add_head(&delayed.node, &current->signal->multiprocess);
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);
        retval = -ERESTARTNOINTR;
        if (task_sigpending(current))
                goto fork_out;

        retval = -ENOMEM;
        p = dup_task_struct(current, node);
        if (!p)
                goto fork_out;
        /* Task-kind flags come from @args, not from the parent's copy. */
        p->flags &= ~PF_KTHREAD;
        if (args->kthread)
                p->flags |= PF_KTHREAD;
        if (args->user_worker) {
                /*
                 * Mark us a user worker, and block any signal that isn't
                 * fatal or STOP
                 */
                p->flags |= PF_USER_WORKER;
                siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
        }
        if (args->io_thread)
                p->flags |= PF_IO_WORKER;

        if (args->name)
                strscpy_pad(p->comm, args->name, sizeof(p->comm));

        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
        /*
         * Clear TID on mm_release()?
         */
        p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;

        ftrace_graph_init_task(p);

        rt_mutex_init_task(p);

        lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
        retval = copy_creds(p, clone_flags);
        if (retval < 0)
                goto bad_fork_free;

        retval = -EAGAIN;
        if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
                if (p->real_cred->user != INIT_USER &&
                    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
                        goto bad_fork_cleanup_count;
        }
        current->flags &= ~PF_NPROC_EXCEEDED;

        /*
         * If multiple threads are within copy_process(), then this check
         * triggers too late. This doesn't hurt, the check is only there
         * to stop root fork bombs.
         */
        retval = -EAGAIN;
        if (data_race(nr_threads >= max_threads))
                goto bad_fork_cleanup_count;

        delayacct_tsk_init(p);        /* Must remain after dup_task_struct() */
        p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
        p->flags |= PF_FORKNOEXEC;
        INIT_LIST_HEAD(&p->children);
        INIT_LIST_HEAD(&p->sibling);
        rcu_copy_process(p);
        p->vfork_done = NULL;
        spin_lock_init(&p->alloc_lock);

        init_sigpending(&p->pending);

        p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
        p->utimescaled = p->stimescaled = 0;
#endif
        prev_cputime_init(&p->prev_cputime);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
        seqcount_init(&p->vtime.seqcount);
        p->vtime.starttime = 0;
        p->vtime.state = VTIME_INACTIVE;
#endif

#ifdef CONFIG_IO_URING
        p->io_uring = NULL;
#endif

        p->default_timer_slack_ns = current->timer_slack_ns;

#ifdef CONFIG_PSI
        p->psi_flags = 0;
#endif

        task_io_accounting_init(&p->ioac);
        acct_clear_integrals(p);

        posix_cputimers_init(&p->posix_cputimers);

        p->io_context = NULL;
        audit_set_context(p, NULL);
        cgroup_fork(p);
        if (args->kthread) {
                /* Note: retval still holds -EAGAIN from above on this path. */
                if (!set_kthread_struct(p))
                        goto bad_fork_cleanup_delayacct;
        }
#ifdef CONFIG_NUMA
        p->mempolicy = mpol_dup(p->mempolicy);
        if (IS_ERR(p->mempolicy)) {
                retval = PTR_ERR(p->mempolicy);
                p->mempolicy = NULL;
                goto bad_fork_cleanup_delayacct;
        }
#endif
#ifdef CONFIG_CPUSETS
        p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
        p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
        seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
        memset(&p->irqtrace, 0, sizeof(p->irqtrace));
        p->irqtrace.hardirq_disable_ip        = _THIS_IP_;
        p->irqtrace.softirq_enable_ip        = _THIS_IP_;
        p->softirqs_enabled                = 1;
        p->softirq_context                = 0;
#endif

        p->pagefault_disabled = 0;

#ifdef CONFIG_LOCKDEP
        lockdep_init_task(p);
#endif

#ifdef CONFIG_DEBUG_MUTEXES
        p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
        p->sequential_io        = 0;
        p->sequential_io_avg        = 0;
#endif
#ifdef CONFIG_BPF_SYSCALL
        RCU_INIT_POINTER(p->bpf_storage, NULL);
        p->bpf_ctx = NULL;
#endif
        p->bpf_net_context =  NULL;

        /* Perform scheduler related setup. Assign this task to a CPU. */
        retval = sched_fork(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_policy;

        retval = perf_event_init_task(p, clone_flags);
        if (retval)
                goto bad_fork_cleanup_policy;
        retval = audit_alloc(p);
        if (retval)
                goto bad_fork_cleanup_perf;
        /*
         * copy all the process information; each failing step jumps to the
         * label that undoes the steps which already succeeded
         */
        shm_init_task(p);
        retval = security_task_alloc(p, clone_flags);
        if (retval)
                goto bad_fork_cleanup_audit;
        retval = copy_semundo(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_security;
        retval = copy_files(clone_flags, p, args->no_files);
        if (retval)
                goto bad_fork_cleanup_semundo;
        retval = copy_fs(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_files;
        retval = copy_sighand(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_fs;
        retval = copy_signal(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_sighand;
        retval = copy_mm(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_signal;
        retval = copy_namespaces(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_mm;
        retval = copy_io(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_namespaces;
        retval = copy_thread(p, args);
        if (retval)
                goto bad_fork_cleanup_io;

        stackleak_task_init(p);

        /* Anything but the preallocated init_struct_pid gets a fresh pid. */
        if (pid != &init_struct_pid) {
                pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
                                args->set_tid_size);
                if (IS_ERR(pid)) {
                        retval = PTR_ERR(pid);
                        goto bad_fork_cleanup_thread;
                }
        }

        /*
         * This has to happen after we've potentially unshared the file
         * descriptor table (so that the pidfd doesn't leak into the child
         * if the fd table isn't shared).
         */
        if (clone_flags & CLONE_PIDFD) {
                int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;

                /* Note that no task has been attached to @pid yet. */
                retval = __pidfd_prepare(pid, flags, &pidfile);
                if (retval < 0)
                        goto bad_fork_free_pid;
                pidfd = retval;

                /* The fd is only reserved here; it is installed after the last failure point. */
                retval = put_user(pidfd, args->pidfd);
                if (retval)
                        goto bad_fork_put_pidfd;
        }

#ifdef CONFIG_BLOCK
        p->plug = NULL;
#endif
        futex_init_task(p);

        /*
         * sigaltstack should be cleared when sharing the same VM
         */
        if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
                sas_ss_reset(p);

        /*
         * Syscall tracing and stepping should be turned off in the
         * child regardless of CLONE_PTRACE.
         */
        user_disable_single_step(p);
        clear_task_syscall_work(p, SYSCALL_TRACE);
#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
        clear_task_syscall_work(p, SYSCALL_EMU);
#endif
        clear_tsk_latency_tracing(p);

        /* ok, now we should be set up.. */
        p->pid = pid_nr(pid);
        if (clone_flags & CLONE_THREAD) {
                p->group_leader = current->group_leader;
                p->tgid = current->tgid;
        } else {
                p->group_leader = p;
                p->tgid = p->pid;
        }

        p->nr_dirtied = 0;
        p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
        p->dirty_paused_when = 0;

        p->pdeath_signal = 0;
        p->task_works = NULL;
        clear_posix_cputimers_work(p);

#ifdef CONFIG_KRETPROBES
        p->kretprobe_instances.first = NULL;
#endif
#ifdef CONFIG_RETHOOK
        p->rethooks.first = NULL;
#endif

        /*
         * Ensure that the cgroup subsystem policies allow the new process to be
         * forked. It should be noted that the new process's css_set can be changed
         * between here and cgroup_post_fork() if an organisation operation is in
         * progress.
         */
        retval = cgroup_can_fork(p, args);
        if (retval)
                goto bad_fork_put_pidfd;

        /*
         * Now that the cgroups are pinned, re-clone the parent cgroup and put
         * the new task on the correct runqueue. All this *before* the task
         * becomes visible.
         *
         * This isn't part of ->can_fork() because while the re-cloning is
         * cgroup specific, it unconditionally needs to place the task on a
         * runqueue.
         */
        sched_cgroup_fork(p, args);

        /*
         * From this point on we must avoid any synchronous user-space
         * communication until we take the tasklist-lock. In particular, we do
         * not want user-space to be able to predict the process start-time by
         * stalling fork(2) after we recorded the start_time but before it is
         * visible to the system.
         */

        p->start_time = ktime_get_ns();
        p->start_boottime = ktime_get_boottime_ns();

        /*
         * Make it visible to the rest of the system, but dont wake it up yet.
         * Need tasklist lock for parent etc handling!
         */
        write_lock_irq(&tasklist_lock);

        /* CLONE_PARENT re-uses the old parent */
        if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
                p->real_parent = current->real_parent;
                p->parent_exec_id = current->parent_exec_id;
                if (clone_flags & CLONE_THREAD)
                        p->exit_signal = -1;
                else
                        p->exit_signal = current->group_leader->exit_signal;
        } else {
                p->real_parent = current;
                p->parent_exec_id = current->self_exec_id;
                p->exit_signal = args->exit_signal;
        }

        klp_copy_process(p);

        sched_core_fork(p);

        /* Nested inside tasklist_lock; both are dropped together below. */
        spin_lock(&current->sighand->siglock);

        rv_task_fork(p);

        rseq_fork(p, clone_flags);

        /* Don't start children in a dying pid namespace */
        if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
                retval = -ENOMEM;
                goto bad_fork_cancel_cgroup;
        }

        /* Let kill terminate clone/fork in the middle */
        if (fatal_signal_pending(current)) {
                retval = -EINTR;
                goto bad_fork_cancel_cgroup;
        }

        /* No more failure paths after this point. */

        /*
         * Copy seccomp details explicitly here, in case they were changed
         * before holding sighand lock.
         */
        copy_seccomp(p);

        init_task_pid_links(p);
        if (likely(p->pid)) {
                ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

                init_task_pid(p, PIDTYPE_PID, pid);
                if (thread_group_leader(p)) {
                        init_task_pid(p, PIDTYPE_TGID, pid);
                        init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
                        init_task_pid(p, PIDTYPE_SID, task_session(current));

                        if (is_child_reaper(pid)) {
                                ns_of_pid(pid)->child_reaper = p;
                                p->signal->flags |= SIGNAL_UNKILLABLE;
                        }
                        /* Hand over the signals collected in @delayed during the fork. */
                        p->signal->shared_pending.signal = delayed.signal;
                        p->signal->tty = tty_kref_get(current->signal->tty);
                        /*
                         * Inherit has_child_subreaper flag under the same
                         * tasklist_lock with adding child to the process tree
                         * for propagate_has_child_subreaper optimization.
                         */
                        p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
                                                         p->real_parent->signal->is_child_subreaper;
                        list_add_tail(&p->sibling, &p->real_parent->children);
                        list_add_tail_rcu(&p->tasks, &init_task.tasks);
                        attach_pid(p, PIDTYPE_TGID);
                        attach_pid(p, PIDTYPE_PGID);
                        attach_pid(p, PIDTYPE_SID);
                        __this_cpu_inc(process_counts);
                } else {
                        /* New thread in the current group: account for it there. */
                        current->signal->nr_threads++;
                        current->signal->quick_threads++;
                        atomic_inc(&current->signal->live);
                        refcount_inc(&current->signal->sigcnt);
                        task_join_group_stop(p);
                        list_add_tail_rcu(&p->thread_node,
                                          &p->signal->thread_head);
                }
                attach_pid(p, PIDTYPE_PID);
                nr_threads++;
        }
        total_forks++;
        hlist_del_init(&delayed.node);
        spin_unlock(&current->sighand->siglock);
        syscall_tracepoint_update(p);
        write_unlock_irq(&tasklist_lock);

        /* Safe to expose the pidfd now that nothing below can fail. */
        if (pidfile)
                fd_install(pidfd, pidfile);

        proc_fork_connector(p);
        sched_post_fork(p);
        cgroup_post_fork(p, args);
        perf_event_fork(p);

        trace_task_newtask(p, clone_flags);
        uprobe_copy_process(p, clone_flags);
        user_events_fork(p, clone_flags);

        copy_oom_score_adj(clone_flags, p);

        return p;

        /*
         * Error unwinding. The labels are laid out in reverse order of the
         * setup above and fall through into one another, so jumping to a
         * label undoes that step and every step that preceded it.
         */
bad_fork_cancel_cgroup:
        sched_core_free(p);
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
        cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
        if (clone_flags & CLONE_PIDFD) {
                fput(pidfile);
                put_unused_fd(pidfd);
        }
bad_fork_free_pid:
        if (pid != &init_struct_pid)
                free_pid(pid);
bad_fork_cleanup_thread:
        exit_thread(p);
bad_fork_cleanup_io:
        if (p->io_context)
                exit_io_context(p);
bad_fork_cleanup_namespaces:
        exit_task_namespaces(p);
bad_fork_cleanup_mm:
        if (p->mm) {
                mm_clear_owner(p->mm, p);
                mmput(p->mm);
        }
bad_fork_cleanup_signal:
        /* Threads share the parent's signal_struct; only free one we allocated. */
        if (!(clone_flags & CLONE_THREAD))
                free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
        __cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
bad_fork_cleanup_files:
        exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
        exit_sem(p);
bad_fork_cleanup_security:
        security_task_free(p);
bad_fork_cleanup_audit:
        audit_free(p);
bad_fork_cleanup_perf:
        perf_event_free_task(p);
bad_fork_cleanup_policy:
        lockdep_free_task(p);
#ifdef CONFIG_NUMA
        mpol_put(p->mempolicy);
#endif
bad_fork_cleanup_delayacct:
        delayacct_tsk_free(p);
bad_fork_cleanup_count:
        dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        exit_creds(p);
bad_fork_free:
        WRITE_ONCE(p->__state, TASK_DEAD);
        exit_task_stack_account(p);
        put_task_stack(p);
        delayed_free_task(p);
fork_out:
        /* Take the delayed-signal node back off the multiprocess list. */
        spin_lock_irq(&current->sighand->siglock);
        hlist_del_init(&delayed.node);
        spin_unlock_irq(&current->sighand->siglock);
        return ERR_PTR(retval);
}

static inline void init_idle_pids(struct task_struct *idle)
{
        enum pid_type type;

        for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
                INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
                init_task_pid(idle, type, &init_struct_pid);
        }
}

/*
 * Placeholder thread function handed to copy_process() by fork_idle().
 * It is never called; it only exists so that args.fn is non-NULL.
 */
static int idle_dummy(void *dummy)
{
        (void)dummy;

        return 0;
}

/*
 * Create the idle task for @cpu.
 *
 * The task is built with copy_process() on init_struct_pid and, on success,
 * has its pid links and idle state initialised. On failure the ERR_PTR
 * returned by copy_process() is handed back unchanged.
 */
struct task_struct * __init fork_idle(int cpu)
{
        struct kernel_clone_args args = {
                .idle                = 1,
                .kthread        = 1,
                .fn_arg                = NULL,
                .fn                = &idle_dummy,
                .flags                = CLONE_VM,
        };
        struct task_struct *idle;

        idle = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
        if (IS_ERR(idle))
                return idle;

        init_idle_pids(idle);
        init_idle(idle, cpu);

        return idle;
}

/*
 * This is like kernel_clone(), but shaved down and tailored to just
 * creating io_uring workers. It returns a created task, or an error pointer.
 * The returned task is inactive, and the caller must fire it up through
 * wake_up_new_task(p). All signals are blocked in the created task.
 */
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
        unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
                                CLONE_IO;
        struct kernel_clone_args args = {
                .flags                = ((lower_32_bits(flags) | CLONE_VM |
                                    CLONE_UNTRACED) & ~CSIGNAL),
                .exit_signal        = (lower_32_bits(flags) & CSIGNAL),
                .fn                = fn,
                .fn_arg                = arg,
                .io_thread        = 1,
                .user_worker        = 1,
        };

        return copy_process(NULL, 0, node, &args);
}

/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * args->exit_signal is expected to be checked for sanity by the caller.
 */
pid_t kernel_clone(struct kernel_clone_args *args)
{
        u64 clone_flags = args->flags;
        struct completion vfork;
        struct pid *pid;
        struct task_struct *p;
        int trace = 0;
        pid_t nr;

        /*
         * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
         * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
         * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
         * field in struct clone_args and it still doesn't make sense to have
         * them both point at the same memory location. Performing this check
         * here has the advantage that we don't need to have a separate helper
         * to check for legacy clone().
         */
        if ((clone_flags & CLONE_PIDFD) &&
            (clone_flags & CLONE_PARENT_SETTID) &&
            (args->pidfd == args->parent_tid))
                return -EINVAL;

        /*
         * Determine whether and which event to report to ptracer.  When
         * called from kernel_thread or CLONE_UNTRACED is explicitly
         * requested, no event is reported; otherwise, report if the event
         * for the type of forking is enabled.
         */
        if (!(clone_flags & CLONE_UNTRACED)) {
                if (clone_flags & CLONE_VFORK)
                        trace = PTRACE_EVENT_VFORK;
                else if (args->exit_signal != SIGCHLD)
                        trace = PTRACE_EVENT_CLONE;
                else
                        trace = PTRACE_EVENT_FORK;

                /* Drop the event again unless it is enabled for current. */
                if (likely(!ptrace_event_enabled(current, trace)))
                        trace = 0;
        }

        p = copy_process(NULL, trace, NUMA_NO_NODE, args);
        add_latent_entropy();

        /* copy_process() reports failure as an ERR_PTR; pass the errno on. */
        if (IS_ERR(p))
                return PTR_ERR(p);

        /*
         * Do this prior to waking up the new thread - the thread pointer
         * might get invalid after that point, if the thread exits quickly.
         */
        trace_sched_process_fork(current, p);

        /*
         * The pid obtained here is used after the wakeup below instead of
         * @p and is released by put_pid() at the end of the function.
         */
        pid = get_task_pid(p, PIDTYPE_PID);
        nr = pid_vnr(pid);

        /* The put_user() result is not checked. */
        if (clone_flags & CLONE_PARENT_SETTID)
                put_user(nr, args->parent_tid);

        /*
         * The completion lives on this stack frame and is published via
         * p->vfork_done before the child is woken. The extra task reference
         * presumably keeps @p usable for wait_for_vfork_done() below -
         * NOTE(review): confirm who drops it.
         */
        if (clone_flags & CLONE_VFORK) {
                p->vfork_done = &vfork;
                init_completion(&vfork);
                get_task_struct(p);
        }

        if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
                /* lock the task to synchronize with memcg migration */
                task_lock(p);
                lru_gen_add_mm(p->mm);
                task_unlock(p);
        }

        wake_up_new_task(p);

        /* forking complete and child started to run, tell ptracer */
        if (unlikely(trace))
                ptrace_event_pid(trace, pid);

        /* VFORK_DONE is only reported when wait_for_vfork_done() returns 0. */
        if (clone_flags & CLONE_VFORK) {
                if (!wait_for_vfork_done(p, &vfork))
                        ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
        }

        put_pid(pid);
        return nr;
}

/*
 * Create a kernel thread.
 */
pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
                    unsigned long flags)
{
        struct kernel_clone_args args = {
                .flags                = ((lower_32_bits(flags) | CLONE_VM |
                                    CLONE_UNTRACED) & ~CSIGNAL),
                .exit_signal        = (lower_32_bits(flags) & CSIGNAL),
                .fn                = fn,
                .fn_arg                = arg,
                .name                = name,
                .kthread        = 1,
        };

        return kernel_clone(&args);
}

/*
 * Create a user mode thread.
 */
pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
        struct kernel_clone_args args = {
                .flags                = ((lower_32_bits(flags) | CLONE_VM |
                                    CLONE_UNTRACED) & ~CSIGNAL),
                .exit_signal        = (lower_32_bits(flags) & CSIGNAL),
                .fn                = fn,
                .fn_arg                = arg,
        };

        return kernel_clone(&args);
}

#ifdef __ARCH_WANT_SYS_FORK
/*
 * fork(): a plain kernel_clone() with no clone flags and SIGCHLD as the
 * exit signal. Without CONFIG_MMU the call always fails with -EINVAL.
 */
SYSCALL_DEFINE0(fork)
{
#ifndef CONFIG_MMU
        /* can not support in nommu mode */
        return -EINVAL;
#else
        struct kernel_clone_args args = {
                .exit_signal = SIGCHLD,
        };

        return kernel_clone(&args);
#endif
}
#endif

#ifdef __ARCH_WANT_SYS_VFORK
/*
 * vfork(): kernel_clone() with CLONE_VFORK | CLONE_VM and SIGCHLD as the
 * exit signal.
 */
SYSCALL_DEFINE0(vfork)
{
        struct kernel_clone_args args = {
                .exit_signal        = SIGCHLD,
                .flags                = CLONE_VM | CLONE_VFORK,
        };

        return kernel_clone(&args);
}
#endif

#ifdef __ARCH_WANT_SYS_CLONE
/*
 * Legacy clone() entry point.
 *
 * The order (and, for CONFIG_CLONE_BACKWARDS3, the number) of the syscall
 * parameters depends on which CONFIG_CLONE_BACKWARDS* option is set; every
 * variant ends up in the same body below. The extra stack_size parameter of
 * the CONFIG_CLONE_BACKWARDS3 variant is not used by that body.
 */
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                 int __user *, parent_tidptr,
                 unsigned long, tls,
                 int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
                 int __user *, parent_tidptr,
                 int __user *, child_tidptr,
                 unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
                int, stack_size,
                int __user *, parent_tidptr,
                int __user *, child_tidptr,
                unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                 int __user *, parent_tidptr,
                 int __user *, child_tidptr,
                 unsigned long, tls)
#endif
{
        /*
         * Only the low 32 bits of clone_flags are used: the CSIGNAL bits
         * become the exit signal, the remaining bits the clone flags.
         * pidfd and parent_tid both point at parent_tidptr, which is why
         * kernel_clone() rejects CLONE_PIDFD together with
         * CLONE_PARENT_SETTID when the two pointers are equal.
         */
        struct kernel_clone_args args = {
                .flags                = (lower_32_bits(clone_flags) & ~CSIGNAL),
                .pidfd                = parent_tidptr,
                .child_tid        = child_tidptr,
                .parent_tid        = parent_tidptr,
                .exit_signal        = (lower_32_bits(clone_flags) & CSIGNAL),
                .stack                = newsp,
                .tls                = tls,
        };

        return kernel_clone(&args);
}
#endif

#ifdef __ARCH_WANT_SYS_CLONE3

/*
 * Copy a userspace struct clone_args of @usize bytes into @kargs.
 *
 * On entry kargs->set_tid must already point at a caller-provided pid_t
 * buffer; the user's set_tid array is copied into that buffer and the
 * pointer is put back into @kargs before returning.
 *
 * Returns 0 on success, -E2BIG if @usize exceeds PAGE_SIZE, -EINVAL for
 * a too-small @usize or inconsistent arguments, -EFAULT if the set_tid
 * array cannot be copied, or the error from copy_struct_from_user().
 */
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
                                              struct clone_args __user *uargs,
                                              size_t usize)
{
        int err;
        struct clone_args args;
        /* Saved now: the struct assignment to *kargs below overwrites it. */
        pid_t *kset_tid = kargs->set_tid;

        /* Compile-time check that the versioned sizes match the layout. */
        BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
                     CLONE_ARGS_SIZE_VER0);
        BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
                     CLONE_ARGS_SIZE_VER1);
        BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
                     CLONE_ARGS_SIZE_VER2);
        BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);

        if (unlikely(usize > PAGE_SIZE))
                return -E2BIG;
        if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
                return -EINVAL;

        err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
        if (err)
                return err;

        /* Bounds the copy_from_user() into the caller's buffer below. */
        if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
                return -EINVAL;

        /* set_tid and set_tid_size must be both set or both unset. */
        if (unlikely(!args.set_tid && args.set_tid_size > 0))
                return -EINVAL;

        if (unlikely(args.set_tid && args.set_tid_size == 0))
                return -EINVAL;

        /*
         * Verify that higher 32bits of exit_signal are unset and that
         * it is a valid signal
         */
        if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
                     !valid_signal(args.exit_signal)))
                return -EINVAL;

        /*
         * CLONE_INTO_CGROUP needs a struct large enough to contain the
         * cgroup field and a cgroup value that fits in an int.
         */
        if ((args.flags & CLONE_INTO_CGROUP) &&
            (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
                return -EINVAL;

        /* Fields not named here, set_tid included, are zeroed. */
        *kargs = (struct kernel_clone_args){
                .flags                = args.flags,
                .pidfd                = u64_to_user_ptr(args.pidfd),
                .child_tid        = u64_to_user_ptr(args.child_tid),
                .parent_tid        = u64_to_user_ptr(args.parent_tid),
                .exit_signal        = args.exit_signal,
                .stack                = args.stack,
                .stack_size        = args.stack_size,
                .tls                = args.tls,
                .set_tid_size        = args.set_tid_size,
                .cgroup                = args.cgroup,
        };

        if (args.set_tid &&
                copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
                        (kargs->set_tid_size * sizeof(pid_t))))
                return -EFAULT;

        /* Restore the buffer pointer that the assignment above cleared. */
        kargs->set_tid = kset_tid;

        return 0;
}

/**
 * clone3_stack_valid - check and prepare stack
 * @kargs: kernel clone args
 *
 * Verify that the stack arguments userspace gave us are sane.
 * In addition, set the stack direction for userspace since it's easy for us to
 * determine.
 */
static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
{
        /* Without a stack address only a zero size is acceptable. */
        if (kargs->stack == 0)
                return kargs->stack_size == 0;

        /* A stack address must come with a non-zero size ... */
        if (kargs->stack_size == 0)
                return false;

        /* ... and the whole range must pass access_ok(). */
        if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
                return false;

#if !defined(CONFIG_STACK_GROWSUP)
        /* Unless the stack grows up, move the pointer to the range's end. */
        kargs->stack += kargs->stack_size;
#endif

        return true;
}

static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
        /* Verify that no unknown flags are passed along. */
        if (kargs->flags &
            ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
                return false;

        /*
         * - make the CLONE_DETACHED bit reusable for clone3
         * - make the CSIGNAL bits reusable for clone3
         */
        if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME))))
                return false;

        if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
            (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
                return false;

        if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
            kargs->exit_signal)
                return false;

        if (!clone3_stack_valid(kargs))
                return false;

        return true;
}

/**
 * sys_clone3 - create a new process with specific properties
 * @uargs: argument structure
 * @size:  size of @uargs
 *
 * clone3() is the extensible successor to clone()/clone2().
 * It takes a struct as argument that is versioned by its size.
 *
 * Return: On success, a positive PID for the child process.
 *         On error, a negative errno number.
 */
SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
{
        pid_t set_tid[MAX_PID_NS_LEVEL];
        struct kernel_clone_args kargs;
        int err;

        /*
         * Only set_tid is initialised here; copy_clone_args_from_user()
         * fills in everything else and keeps this buffer pointer.
         */
        kargs.set_tid = set_tid;

        err = copy_clone_args_from_user(&kargs, uargs, size);
        if (err)
                return err;

        if (clone3_args_valid(&kargs))
                return kernel_clone(&kargs);

        return -EINVAL;
}
#endif

/*
 * Walk the children of the thread group of @top, calling @visitor on each
 * child with @data, all under read_lock(&tasklist_lock).
 *
 * The visitor's return value steers the walk:
 *   < 0  stop the whole walk,
 *   > 0  descend into the children of the child just visited,
 *   == 0 continue with the next sibling.
 *
 * The walk is iterative: "down" restarts the loops for a new leader, and
 * "up" jumps back into the sibling loop to resume after a finished subtree.
 */
void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
{
        struct task_struct *leader, *parent, *child;
        int res;

        read_lock(&tasklist_lock);
        /* Always start from the group leader of @top. */
        leader = top = top->group_leader;
down:
        for_each_thread(leader, parent) {
                list_for_each_entry(child, &parent->children, sibling) {
                        res = visitor(child, data);
                        if (res) {
                                if (res < 0)
                                        goto out;
                                leader = child;
                                goto down;
                        }
up:
                        /* re-entry point when coming back from a subtree */
                        ;
                }
        }

        /*
         * All children of the current leader are done. Unless that was the
         * top group, restore child/parent/leader one level up and resume
         * the sibling iteration right after child.
         */
        if (leader != top) {
                child = leader;
                parent = child->real_parent;
                leader = parent->group_leader;
                goto up;
        }
out:
        read_unlock(&tasklist_lock);
}

#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif

/*
 * Constructor passed to kmem_cache_create() for "sighand_cache": sets up
 * the siglock and the signalfd wait queue head of the object at @data.
 */
static void sighand_ctor(void *data)
{
        struct sighand_struct *sh = (struct sighand_struct *)data;

        spin_lock_init(&sh->siglock);
        init_waitqueue_head(&sh->signalfd_wqh);
}

/*
 * Create the "mm_struct" slab cache (mm_cachep). Only the saved_auxv
 * field is declared as the usercopy region.
 */
void __init mm_cache_init(void)
{
        /*
         * The mm_cpumask is located at the end of mm_struct, and is
         * dynamically sized based on the maximum CPU number this system
         * can have, taking hotplug into account (nr_cpu_ids).
         */
        const unsigned int mm_size = sizeof(struct mm_struct) +
                                     cpumask_size() + mm_cid_size();

        mm_cachep = kmem_cache_create_usercopy("mm_struct",
                        mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
                        offsetof(struct mm_struct, saved_auxv),
                        sizeof_field(struct mm_struct, saved_auxv),
                        NULL);
}

/*
 * Create the slab caches for sighand, signal, files, fs and vm_area
 * structures (plus vma_lock with CONFIG_PER_VMA_LOCK), then run
 * mmap_init() and nsproxy_cache_init().
 */
void __init proc_caches_init(void)
{
        /* flags common to every kmem_cache_create() call below */
        const slab_flags_t common = SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                    SLAB_ACCOUNT;

        /* the only cache here with a constructor and SLAB_TYPESAFE_BY_RCU */
        sighand_cachep = kmem_cache_create("sighand_cache",
                        sizeof(struct sighand_struct), 0,
                        common | SLAB_TYPESAFE_BY_RCU, sighand_ctor);
        signal_cachep = kmem_cache_create("signal_cache",
                        sizeof(struct signal_struct), 0,
                        common, NULL);
        files_cachep = kmem_cache_create("files_cache",
                        sizeof(struct files_struct), 0,
                        common, NULL);
        fs_cachep = kmem_cache_create("fs_cache",
                        sizeof(struct fs_struct), 0,
                        common, NULL);

        vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC | SLAB_ACCOUNT);
#ifdef CONFIG_PER_VMA_LOCK
        vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC | SLAB_ACCOUNT);
#endif

        mmap_init();
        nsproxy_cache_init();
}

/*
 * Check constraints on flags passed to the unshare system call.
 */
static int check_unshare_flags(unsigned long unshare_flags)
{
        /* Anything outside the supported set is rejected outright. */
        if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
                                CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
                                CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
                                CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
                                CLONE_NEWTIME))
                return -EINVAL;

        /*
         * Not implemented, but pretend it works if there is nothing
         * to unshare.  Note that unsharing the address space or the
         * signal handlers also need to unshare the signal queues (aka
         * CLONE_THREAD).
         */
        if ((unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) &&
            !thread_group_empty(current))
                return -EINVAL;

        /* the sighand must not have more than one reference */
        if ((unshare_flags & (CLONE_SIGHAND | CLONE_VM)) &&
            refcount_read(&current->sighand->count) > 1)
                return -EINVAL;

        if ((unshare_flags & CLONE_VM) && !current_is_single_threaded())
                return -EINVAL;

        return 0;
}

/*
 * Unshare the filesystem structure if it is being shared
 */
static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
{
        struct fs_struct *fs = current->fs;
        struct fs_struct *copy;

        /* Nothing to do without CLONE_FS or without an fs_struct. */
        if (!(unshare_flags & CLONE_FS) || !fs)
                return 0;

        /* don't need lock here; in the worst case we'll do useless copy */
        if (fs->users == 1)
                return 0;

        /* *new_fsp is written even when the copy failed (NULL). */
        copy = copy_fs_struct(fs);
        *new_fsp = copy;

        return copy ? 0 : -ENOMEM;
}

/*
 * Unshare file descriptor table if it is being shared
 */
int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
               struct files_struct **new_fdp)
{
        struct files_struct *fd = current->files;
        int error = 0;

        /* *new_fdp is left untouched unless a copy is attempted. */
        if (!(unshare_flags & CLONE_FILES))
                return 0;

        /* no table, or only one reference to it: nothing to unshare */
        if (!fd || atomic_read(&fd->count) <= 1)
                return 0;

        *new_fdp = dup_fd(fd, max_fds, &error);

        /* dup_fd() reports its failure reason through @error */
        return *new_fdp ? 0 : error;
}

/*
 * unshare allows a process to 'unshare' part of the process
 * context which was originally shared using clone.  copy_*
 * functions used by kernel_clone() cannot be used here directly
 * because they modify an inactive task_struct that is being
 * constructed. Here we are modifying the current, active,
 * task_struct.
 */
int ksys_unshare(unsigned long unshare_flags)
{
        struct fs_struct *fs, *new_fs = NULL;
        struct files_struct *new_fd = NULL;
        struct cred *new_cred = NULL;
        struct nsproxy *new_nsproxy = NULL;
        int do_sysvsem = 0;
        int err;

        /*
         * If unsharing a user namespace must also unshare the thread group
         * and unshare the filesystem root and working directories.
         */
        if (unshare_flags & CLONE_NEWUSER)
                unshare_flags |= CLONE_THREAD | CLONE_FS;
        /*
         * If unsharing vm, must also unshare signal handlers.
         */
        if (unshare_flags & CLONE_VM)
                unshare_flags |= CLONE_SIGHAND;
        /*
         * If unsharing signal handlers, must also unshare the signal queues.
         */
        if (unshare_flags & CLONE_SIGHAND)
                unshare_flags |= CLONE_THREAD;
        /*
         * If unsharing namespace, must also unshare filesystem information.
         */
        if (unshare_flags & CLONE_NEWNS)
                unshare_flags |= CLONE_FS;

        /* Validate the flag set after the implied flags have been added. */
        err = check_unshare_flags(unshare_flags);
        if (err)
                goto bad_unshare_out;
        /*
         * CLONE_NEWIPC must also detach from the undolist: after switching
         * to a new ipc namespace, the semaphore arrays from the old
         * namespace are unreachable.
         */
        if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
                do_sysvsem = 1;
        /*
         * Preparation phase: build the replacement objects first. Each
         * failure jumps to the label that releases what was built so far.
         */
        err = unshare_fs(unshare_flags, &new_fs);
        if (err)
                goto bad_unshare_out;
        err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
        if (err)
                goto bad_unshare_cleanup_fs;
        err = unshare_userns(unshare_flags, &new_cred);
        if (err)
                goto bad_unshare_cleanup_fd;
        err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
                                         new_cred, new_fs);
        if (err)
                goto bad_unshare_cleanup_cred;

        if (new_cred) {
                err = set_cred_ucounts(new_cred);
                if (err)
                        goto bad_unshare_cleanup_cred;
        }

        /* Commit phase: nothing below this point sets err. */
        if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
                if (do_sysvsem) {
                        /*
                         * CLONE_SYSVSEM is equivalent to sys_exit().
                         */
                        exit_sem(current);
                }
                if (unshare_flags & CLONE_NEWIPC) {
                        /* Orphan segments in old ns (see sem above). */
                        exit_shm(current);
                        shm_init_task(current);
                }

                if (new_nsproxy)
                        switch_task_namespaces(current, new_nsproxy);

                task_lock(current);

                /*
                 * Install the new fs_struct. Afterwards new_fs holds the
                 * old fs_struct if this dropped its last user (so that it
                 * is freed below), or NULL if it still has other users.
                 */
                if (new_fs) {
                        fs = current->fs;
                        spin_lock(&fs->lock);
                        current->fs = new_fs;
                        if (--fs->users)
                                new_fs = NULL;
                        else
                                new_fs = fs;
                        spin_unlock(&fs->lock);
                }

                /* After the swap new_fd is the old table; it is put below. */
                if (new_fd)
                        swap(current->files, new_fd);

                task_unlock(current);

                if (new_cred) {
                        /* Install the new user namespace */
                        commit_creds(new_cred);
                        new_cred = NULL;
                }
        }

        perf_event_namespaces(current);

        /*
         * The success path falls through the labels below as well: err is
         * 0 there, and the pointers that are still non-NULL are the
         * objects left to release.
         */
bad_unshare_cleanup_cred:
        if (new_cred)
                put_cred(new_cred);
bad_unshare_cleanup_fd:
        if (new_fd)
                put_files_struct(new_fd);

bad_unshare_cleanup_fs:
        if (new_fs)
                free_fs_struct(new_fs);

bad_unshare_out:
        return err;
}

/*
 * unshare() system call: passes @unshare_flags straight to ksys_unshare()
 * and returns its result.
 */
SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
        return ksys_unshare(unshare_flags);
}

/*
 *        Helper to unshare the files of the current task.
 *        We don't want to expose copy_files internals to
 *        the exec layer of the kernel.
 */

int unshare_files(void)
{
        struct task_struct *task = current;
        struct files_struct *old, *copy = NULL;
        int error;

        error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
        if (error || !copy)
                return error;

        old = task->files;
        task_lock(task);
        task->files = copy;
        task_unlock(task);
        put_files_struct(old);
        return 0;
}

/*
 * sysctl handler for max_threads.
 *
 * Works on a local copy of @table that points at a temporary int, bounded
 * to the range 1..MAX_THREADS. max_threads is updated only after a
 * successful write; otherwise the proc_dointvec_minmax() result is
 * returned unchanged.
 */
int sysctl_max_threads(struct ctl_table *table, int write,
                       void *buffer, size_t *lenp, loff_t *ppos)
{
        int threads = max_threads;
        int min = 1;
        int max = MAX_THREADS;
        struct ctl_table t = *table;
        int ret;

        t.data = &threads;
        t.extra1 = &min;
        t.extra2 = &max;

        ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
        if (!ret && write)
                max_threads = threads;

        return ret;
}


















































    3 













































    3 





























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM signal

#if !defined(_TRACE_SIGNAL_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SIGNAL_H

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/tracepoint.h>

/*
 * Store the errno and code of a signal into a trace entry.
 *
 * @__entry: pointer to a structure with 'errno' and 'code' members
 * @info:    SEND_SIG_NOINFO, SEND_SIG_PRIV, or a pointer to a structure
 *           with 'si_errno' and 'si_code' members
 *
 * For the two special values no structure is dereferenced: errno is
 * stored as 0 and code as SI_USER (SEND_SIG_NOINFO) or SI_KERNEL
 * (SEND_SIG_PRIV).
 *
 * Both arguments are parenthesized so that compound expressions such as
 * '&entry' or 'cond ? a : b' expand correctly. Note that @info is still
 * evaluated more than once, so it must not have side effects.
 */
#define TP_STORE_SIGINFO(__entry, info)                                \
        do {                                                        \
                if ((info) == SEND_SIG_NOINFO) {                \
                        (__entry)->errno        = 0;                \
                        (__entry)->code                = SI_USER;        \
                } else if ((info) == SEND_SIG_PRIV) {                \
                        (__entry)->errno        = 0;                \
                        (__entry)->code                = SI_KERNEL;        \
                } else {                                        \
                        (__entry)->errno        = (info)->si_errno;        \
                        (__entry)->code                = (info)->si_code;        \
                }                                                \
        } while (0)

/*
 * TRACE_SIGNAL_* values, the documented range of the 'result' argument of
 * the signal_generate tracepoint. The enum is skipped when
 * TRACE_HEADER_MULTI_READ is defined, i.e. on passes where the header
 * guard lets this file be processed again.
 */
#ifndef TRACE_HEADER_MULTI_READ
enum {
        TRACE_SIGNAL_DELIVERED,
        TRACE_SIGNAL_IGNORED,
        TRACE_SIGNAL_ALREADY_PENDING,
        TRACE_SIGNAL_OVERFLOW_FAIL,
        TRACE_SIGNAL_LOSE_INFO,
};
#endif

/**
 * signal_generate - called when a signal is generated
 * @sig: signal number
 * @info: pointer to struct kernel_siginfo, or SEND_SIG_NOINFO/SEND_SIG_PRIV
 * @task: pointer to struct task_struct
 * @group: shared or private
 * @result: TRACE_SIGNAL_*
 *
 * Current process sends a 'sig' signal to 'task' process with
 * 'info' siginfo. If 'info' is SEND_SIG_NOINFO or SEND_SIG_PRIV,
 * 'info' is not a pointer and its fields cannot be accessed. Instead,
 * SEND_SIG_NOINFO means that si_code is SI_USER, and SEND_SIG_PRIV
 * means that si_code is SI_KERNEL; errno is recorded as 0 in both cases.
 */
TRACE_EVENT(signal_generate,

        TP_PROTO(int sig, struct kernel_siginfo *info, struct task_struct *task,
                        int group, int result),

        TP_ARGS(sig, info, task, group, result),

        TP_STRUCT__entry(
                __field(        int,        sig                        )
                __field(        int,        errno                        )
                __field(        int,        code                        )
                __array(        char,        comm,        TASK_COMM_LEN        )
                __field(        pid_t,        pid                        )
                __field(        int,        group                        )
                __field(        int,        result                        )
        ),

        TP_fast_assign(
                __entry->sig        = sig;
                /* fills errno and code, handling the special info values */
                TP_STORE_SIGINFO(__entry, info);
                /* comm and pid describe the target task, not the sender */
                memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
                __entry->pid        = task->pid;
                __entry->group        = group;
                __entry->result        = result;
        ),

        TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d grp=%d res=%d",
                  __entry->sig, __entry->errno, __entry->code,
                  __entry->comm, __entry->pid, __entry->group,
                  __entry->result)
);

/**
 * signal_deliver - called when a signal is delivered
 * @sig: signal number
 * @info: pointer to struct kernel_siginfo
 * @ka: pointer to struct k_sigaction
 *
 * A 'sig' signal is delivered to current process with 'info' siginfo,
 * and it will be handled by 'ka'. ka->sa.sa_handler can be SIG_IGN or
 * SIG_DFL.
 * Note that some signals reported by signal_generate tracepoint can be
 * lost, ignored or modified (by debugger) before hitting this tracepoint.
 * This tracepoint therefore shows which signals are actually delivered,
 * but matching generated signals against delivered signals may not be
 * accurate.
 */
TRACE_EVENT(signal_deliver,

        TP_PROTO(int sig, struct kernel_siginfo *info, struct k_sigaction *ka),

        TP_ARGS(sig, info, ka),

        TP_STRUCT__entry(
                __field(        int,                sig                )
                __field(        int,                errno                )
                __field(        int,                code                )
                __field(        unsigned long,        sa_handler        )
                __field(        unsigned long,        sa_flags        )
        ),

        TP_fast_assign(
                __entry->sig        = sig;
                /* fills errno and code, handling the special info values */
                TP_STORE_SIGINFO(__entry, info);
                /* the handler is recorded as an integer and printed in hex */
                __entry->sa_handler        = (unsigned long)ka->sa.sa_handler;
                __entry->sa_flags        = ka->sa.sa_flags;
        ),

        TP_printk("sig=%d errno=%d code=%d sa_handler=%lx sa_flags=%lx",
                  __entry->sig, __entry->errno, __entry->code,
                  __entry->sa_handler, __entry->sa_flags)
);

#endif /* _TRACE_SIGNAL_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 

















    1 
    3 











    1 
    1 







    2 

    2 








    1 


















    2 


















    2 































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        Linux INET6 implementation
 *        Forwarding Information Database
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 *
 *        Changes:
 *        Yuji SEKIYA @USAGI:        Support default route on router node;
 *                                remove ip6_null_entry from the top of
 *                                routing table.
 *        Ville Nuorvala:                Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/bpf.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>

#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/lwtunnel.h>
#include <net/fib_notifier.h>

#include <net/ip_fib.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>

static struct kmem_cache *fib6_node_kmem __read_mostly;

/*
 *        Walker context that pairs a tree walker with a per-route callback.
 *        NOTE(review): the code that fills in and consumes this structure is
 *        outside this part of the file; the field notes below are based on
 *        the declarations only - confirm against the users.
 */
struct fib6_cleaner {
        /* embedded tree walker */
        struct fib6_walker w;
        /* network namespace associated with the walk */
        struct net *net;
        /* callback taking a route and the opaque 'arg' below */
        int (*func)(struct fib6_info *, void *arg);
        /* serial number; presumably stamped on visited nodes - confirm */
        int sernum;
        /* opaque pointer handed to 'func' */
        void *arg;
        /* presumably suppresses notifications during the walk - confirm */
        bool skip_notify;
};

/*
 *        Initial walker state: FWS_S when routing subtrees are compiled in
 *        (CONFIG_IPV6_SUBTREES), FWS_L otherwise.
 */
#ifdef CONFIG_IPV6_SUBTREES
#define FWS_INIT FWS_S
#else
#define FWS_INIT FWS_L
#endif

static struct fib6_info *fib6_find_prefix(struct net *net,
                                         struct fib6_table *table,
                                         struct fib6_node *fn);
static struct fib6_node *fib6_repair_tree(struct net *net,
                                          struct fib6_table *table,
                                          struct fib6_node *fn);
static int fib6_walk(struct net *net, struct fib6_walker *w);
static int fib6_walk_continue(struct fib6_walker *w);

/*
 *        A routing update causes an increase of the serial number on the
 *        affected subtree. This allows for cached routes to be asynchronously
 *        tested when modifications are made to the destination cache as a
 *        result of redirects, path MTU changes, etc.
 */

static void fib6_gc_timer_cb(struct timer_list *t);

/*
 *        Iterate 'w' over every walker linked on net->ipv6.fib6_walkers
 *        (walkers are chained through their 'lh' member).
 *        NOTE(review): presumably callers hold fib6_walker_lock while
 *        iterating - confirm at the call sites.
 */
#define FOR_WALKERS(net, w) \
        list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)

/*
 *        Add walker @w to the per-namespace walker list, holding the
 *        walker lock for writing.
 */
static void fib6_walker_link(struct net *net, struct fib6_walker *w)
{
        struct list_head *walkers = &net->ipv6.fib6_walkers;

        write_lock_bh(&net->ipv6.fib6_walker_lock);
        list_add(&w->lh, walkers);
        write_unlock_bh(&net->ipv6.fib6_walker_lock);
}

static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
{
        write_lock_bh(&net->ipv6.fib6_walker_lock);
        list_del(&w->lh);
        write_unlock_bh(&net->ipv6.fib6_walker_lock);
}

static int fib6_new_sernum(struct net *net)
{
        int new, old = atomic_read(&net->ipv6.fib6_sernum);

        do {
                new = old < INT_MAX ? old + 1 : 1;
        } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new));

        return new;
}

/*
 *        Reserved serial number value.
 *        NOTE(review): fib6_new_sernum() wraps from INT_MAX to 1 rather than
 *        to 0, presumably so that this value is never handed out as a real
 *        serial number - confirm against the users of this constant.
 */
enum {
        FIB6_NO_SERNUM_CHANGE = 0,
};

/*
 *        Stamp the tree node that holds @f6i with a freshly generated
 *        serial number. Does nothing when the route is not attached to a
 *        node. The lockdep annotation expects the route's table lock
 *        (tb6_lock) to be held by the caller.
 */
void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
{
        struct fib6_node *fn = rcu_dereference_protected(f6i->fib6_node,
                                lockdep_is_held(&f6i->fib6_table->tb6_lock));

        if (!fn)
                return;

        WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net));
}

/*
 *        Auxiliary address test functions for the radix tree.
 *
 *        These assume a 32bit processor (although it will work on
 *        64bit processors)
 */

/*
 *        test bit
 */
/*
 *        XOR mask applied to a bit index by addr_bit_set() so that a plain
 *        host-order shift yields a big-endian bit mask: 0x1F & ~7 (= 0x18)
 *        on little-endian hosts, 0 (no adjustment) otherwise.
 */
#if defined(__LITTLE_ENDIAN)
# define BITOP_BE32_SWIZZLE        (0x1F & ~7)
#else
# define BITOP_BE32_SWIZZLE        0
#endif

/*
 *        Test bit @fn_bit of the big-endian word array at @token.
 *        Returns a non-zero __be32 when the bit is set, 0 otherwise.
 */
static __be32 addr_bit_set(const void *token, int fn_bit)
{
        const __be32 *words = token;
        /*
         * (~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f is the optimized form of
         * the shift needed for htonl(1 << ((~fn_bit) & 0x1F));
         * see include/asm-generic/bitops/le.h.
         */
        int shift = (~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f;
        __be32 mask = (__force __be32)(1 << shift);

        /* fn_bit >> 5 selects the 32-bit word that contains the bit. */
        return mask & words[fn_bit >> 5];
}

/*
 *        Allocate a zeroed fib6_info, with extra room for one struct fib6_nh
 *        when @with_fib6_nh is true. The entry is returned with a reference
 *        count of 1, or NULL if the allocation fails.
 */
struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
{
        size_t alloc_size = sizeof(struct fib6_info);
        struct fib6_info *info;

        if (with_fib6_nh)
                alloc_size += sizeof(struct fib6_nh);

        info = kzalloc(alloc_size, gfp_flags);
        if (info) {
                /* fib6_siblings shares a union with nh_list: this sets up both */
                INIT_LIST_HEAD(&info->fib6_siblings);
                refcount_set(&info->fib6_ref, 1);
                INIT_HLIST_NODE(&info->gc_link);
        }

        return info;
}

/*
 *        RCU callback that tears down a fib6_info: drops either the nexthop
 *        object reference or the entry's own fib6_nh, releases the metrics
 *        and frees the entry. Warns if the entry is still attached to a node.
 */
void fib6_info_destroy_rcu(struct rcu_head *head)
{
        struct fib6_info *info = container_of(head, struct fib6_info, rcu);

        WARN_ON(info->fib6_node);

        if (!info->nh)
                fib6_nh_release(info->fib6_nh);
        else
                nexthop_put(info->nh);

        ip_fib_metrics_put(info->fib6_metrics);
        kfree(info);
}
EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);

static struct fib6_node *node_alloc(struct net *net)
{
        struct fib6_node *fn;

        fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
        if (fn)
                net->ipv6.rt6_stats->fib_nodes++;

        return fn;
}

/*
 *        Release a tree node right away (no RCU grace period) and drop it
 *        from the namespace node count.
 */
static void node_free_immediate(struct net *net, struct fib6_node *fn)
{
        net->ipv6.rt6_stats->fib_nodes--;
        kmem_cache_free(fib6_node_kmem, fn);
}

static void node_free_rcu(struct rcu_head *head)
{
        struct fib6_node *fn = container_of(head, struct fib6_node, rcu);

        kmem_cache_free(fib6_node_kmem, fn);
}

/*
 *        Drop a tree node from the namespace node count and queue it to be
 *        freed by node_free_rcu() via call_rcu().
 */
static void node_free(struct net *net, struct fib6_node *fn)
{
        net->ipv6.rt6_stats->fib_nodes--;
        call_rcu(&fn->rcu, node_free_rcu);
}

/*
 *        Invalidate the table's inet peer tree, then free the table itself.
 */
static void fib6_free_table(struct fib6_table *table)
{
        struct inet_peer_base *peers = &table->tb6_peers;

        inetpeer_invalidate_tree(peers);
        kfree(table);
}

/*
 *        Initialize the table lock and publish @tb in the namespace's table
 *        hash, keyed by the low bits of its table id.
 */
static void fib6_link_table(struct net *net, struct fib6_table *tb)
{
        struct hlist_head *bucket;

        /*
         * The lock is initialized here, and only here, so that lockdep
         * gets a single key; a table is not visible to anyone before it
         * is linked below.
         */
        spin_lock_init(&tb->tb6_lock);

        bucket = &net->ipv6.fib_table_hash[tb->tb6_id & (FIB6_TABLE_HASHSZ - 1)];

        /*
         * No protection is needed: this is the only operation that
         * mutates the list, and tables never disappear once they exist.
         */
        hlist_add_head_rcu(&tb->tb6_hlist, bucket);
}

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/*
 *        Allocate (GFP_ATOMIC) and initialize a routing table with id @id.
 *        The root node starts out pointing at the namespace's null entry.
 *        The table is not linked into the hash here. Returns NULL on
 *        allocation failure.
 */
static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
{
        struct fib6_table *tbl = kzalloc(sizeof(*tbl), GFP_ATOMIC);

        if (!tbl)
                return NULL;

        tbl->tb6_id = id;
        rcu_assign_pointer(tbl->tb6_root.leaf, net->ipv6.fib6_null_entry);
        tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
        inet_peer_base_init(&tbl->tb6_peers);
        INIT_HLIST_HEAD(&tbl->tb6_gc_hlist);

        return tbl;
}

/*
 *        Return the table with id @id, creating and linking it first if it
 *        does not exist yet. An id of 0 means RT6_TABLE_MAIN.
 *        Returns NULL only if a new table could not be allocated.
 */
struct fib6_table *fib6_new_table(struct net *net, u32 id)
{
        struct fib6_table *table;

        if (!id)
                id = RT6_TABLE_MAIN;

        table = fib6_get_table(net, id);
        if (!table) {
                table = fib6_alloc_table(net, id);
                if (table)
                        fib6_link_table(net, table);
        }

        return table;
}
EXPORT_SYMBOL_GPL(fib6_new_table);

/*
 *        Look up the table with id @id in the namespace's table hash.
 *        An id of 0 means RT6_TABLE_MAIN. The hash chain is walked under
 *        the RCU read lock; no reference is taken on the result.
 *        Returns NULL when no such table exists.
 */
struct fib6_table *fib6_get_table(struct net *net, u32 id)
{
        struct fib6_table *found = NULL;
        struct hlist_head *bucket;
        struct fib6_table *tb;

        if (!id)
                id = RT6_TABLE_MAIN;

        rcu_read_lock();
        bucket = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)];
        hlist_for_each_entry_rcu(tb, bucket, tb6_hlist) {
                if (tb->tb6_id == id) {
                        found = tb;
                        break;
                }
        }
        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL_GPL(fib6_get_table);

/*
 *        Link the namespace's built-in tables (main first, then local)
 *        into the table hash.
 */
static void __net_init fib6_tables_init(struct net *net)
{
        struct fib6_table *builtin[] = {
                net->ipv6.fib6_main_tbl,
                net->ipv6.fib6_local_tbl,
        };
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(builtin); i++)
                fib6_link_table(net, builtin[i]);
}
#else

/*
 *        Variant built without CONFIG_IPV6_MULTIPLE_TABLES: nothing is ever
 *        created here, the request is simply forwarded to fib6_get_table().
 */
struct fib6_table *fib6_new_table(struct net *net, u32 id)
{
        return fib6_get_table(net, id);
}

/*
 *        Variant built without CONFIG_IPV6_MULTIPLE_TABLES: @id is ignored
 *        and the namespace's main table is always returned.
 */
struct fib6_table *fib6_get_table(struct net *net, u32 id)
{
          return net->ipv6.fib6_main_tbl;
}

/*
 *        Variant built without CONFIG_IPV6_MULTIPLE_TABLES: run @lookup on
 *        the main table. If the result carries -EAGAIN, it is released and
 *        replaced by the namespace's null entry; a reference is taken on
 *        the null entry unless RT6_LOOKUP_F_DST_NOREF is set in @flags.
 *        Returns the dst embedded in the selected route.
 */
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
                                   const struct sk_buff *skb,
                                   int flags, pol_lookup_t lookup)
{
        struct rt6_info *rt = pol_lookup_func(lookup,
                        net, net->ipv6.fib6_main_tbl, fl6, skb, flags);

        if (rt->dst.error != -EAGAIN)
                return &rt->dst;

        ip6_rt_put_flags(rt, flags);
        rt = net->ipv6.ip6_null_entry;
        if (!(flags & RT6_LOOKUP_F_DST_NOREF))
                dst_hold(&rt->dst);

        return &rt->dst;
}

/*
 * Variant built without CONFIG_IPV6_MULTIPLE_TABLES: look up @fl6 in the
 * main table via fib6_table_lookup() and return its result.
 *
 * Called with rcu lock held; no reference taken on fib6_info.
 */
int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
                struct fib6_result *res, int flags)
{
        return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6,
                                 res, flags);
}

/*
 *        Variant built without CONFIG_IPV6_MULTIPLE_TABLES: only the main
 *        table is linked into the table hash.
 */
static void __net_init fib6_tables_init(struct net *net)
{
        fib6_link_table(net, net->ipv6.fib6_main_tbl);
}

#endif

/*
 *        Return the sum of the fib_seq change counters of every table in
 *        the namespace's table hash.
 *
 *        The tables are visited holding only the RCU read lock, which does
 *        not exclude the notifier helpers that increment fib_seq. Each
 *        counter is therefore read with READ_ONCE() so the access is a
 *        marked, tear-free load rather than a plain data race.
 */
unsigned int fib6_tables_seq_read(struct net *net)
{
        unsigned int h, fib_seq = 0;

        rcu_read_lock();
        for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
                struct hlist_head *head = &net->ipv6.fib_table_hash[h];
                struct fib6_table *tb;

                hlist_for_each_entry_rcu(tb, head, tb6_hlist)
                        fib_seq += READ_ONCE(tb->fib_seq);
        }
        rcu_read_unlock();

        return fib_seq;
}

static int call_fib6_entry_notifier(struct notifier_block *nb,
                                    enum fib_event_type event_type,
                                    struct fib6_info *rt,
                                    struct netlink_ext_ack *extack)
{
        struct fib6_entry_notifier_info info = {
                .info.extack = extack,
                .rt = rt,
        };

        return call_fib6_notifier(nb, event_type, &info.info);
}

static int call_fib6_multipath_entry_notifier(struct notifier_block *nb,
                                              enum fib_event_type event_type,
                                              struct fib6_info *rt,
                                              unsigned int nsiblings,
                                              struct netlink_ext_ack *extack)
{
        struct fib6_entry_notifier_info info = {
                .info.extack = extack,
                .rt = rt,
                .nsiblings = nsiblings,
        };

        return call_fib6_notifier(nb, event_type, &info.info);
}

/*
 *        Bump the change counter of the table that owns @rt, then pass
 *        @event_type for @rt on through call_fib6_notifiers().
 *        Returns whatever call_fib6_notifiers() returns.
 */
int call_fib6_entry_notifiers(struct net *net,
                              enum fib_event_type event_type,
                              struct fib6_info *rt,
                              struct netlink_ext_ack *extack)
{
        struct fib6_entry_notifier_info info = {
                .info.extack = extack,
                .rt = rt,
        };

        /*
         * fib6_tables_seq_read() sums fib_seq holding only the RCU read
         * lock, so store the new value with WRITE_ONCE() instead of a
         * plain increment.
         */
        WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
        return call_fib6_notifiers(net, event_type, &info.info);
}

/*
 *        Bump the change counter of the table that owns @rt, then pass
 *        @event_type for the multipath route @rt (with @nsiblings siblings)
 *        on through call_fib6_notifiers().
 *        Returns whatever call_fib6_notifiers() returns.
 */
int call_fib6_multipath_entry_notifiers(struct net *net,
                                        enum fib_event_type event_type,
                                        struct fib6_info *rt,
                                        unsigned int nsiblings,
                                        struct netlink_ext_ack *extack)
{
        struct fib6_entry_notifier_info info = {
                .info.extack = extack,
                .rt = rt,
                .nsiblings = nsiblings,
        };

        /*
         * fib6_tables_seq_read() sums fib_seq holding only the RCU read
         * lock, so store the new value with WRITE_ONCE() instead of a
         * plain increment.
         */
        WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
        return call_fib6_notifiers(net, event_type, &info.info);
}

/*
 *        Bump the change counter of the table that owns @rt, then send
 *        FIB_EVENT_ENTRY_REPLACE for @rt (with its current sibling count)
 *        through call_fib6_notifiers(). No extack is attached.
 *        Returns whatever call_fib6_notifiers() returns.
 */
int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt)
{
        struct fib6_entry_notifier_info info = {
                .rt = rt,
                .nsiblings = rt->fib6_nsiblings,
        };

        /*
         * fib6_tables_seq_read() sums fib_seq holding only the RCU read
         * lock, so store the new value with WRITE_ONCE() instead of a
         * plain increment.
         */
        WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
        return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info);
}

/*
 *        Arguments handed to fib6_rt_dump() through the walker's 'args'
 *        pointer when fib6_tables_dump() replays routes to a notifier.
 */
struct fib6_dump_arg {
        /* namespace being dumped; used to recognize its null entry */
        struct net *net;
        /* notifier block that receives the replayed events */
        struct notifier_block *nb;
        /* extended ack passed along with each event */
        struct netlink_ext_ack *extack;
};

static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
{
        enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE;
        int err;

        if (!rt || rt == arg->net->ipv6.fib6_null_entry)
                return 0;

        if (rt->fib6_nsiblings)
                err = call_fib6_multipath_entry_notifier(arg->nb, fib_event,
                                                         rt,
                                                         rt->fib6_nsiblings,
                                                         arg->extack);
        else
                err = call_fib6_entry_notifier(arg->nb, fib_event, rt,
                                               arg->extack);

        return err;
}

/*
 *        Walker callback used by fib6_tables_dump(): replay the walker's
 *        current leaf through fib6_rt_dump(), then clear the leaf pointer.
 *        Returns the result of fib6_rt_dump().
 */
static int fib6_node_dump(struct fib6_walker *w)
{
        const int rc = fib6_rt_dump(w->leaf, w->args);

        w->leaf = NULL;

        return rc;
}

static int fib6_table_dump(struct net *net, struct fib6_table *tb,
                           struct fib6_walker *w)
{
        int err;

        w->root = &tb->tb6_root;
        spin_lock_bh(&tb->tb6_lock);
        err = fib6_walk(net, w);
        spin_unlock_bh(&tb->tb6_lock);
        return err;
}

/* Called with rcu_read_lock() */
int fib6_tables_dump(struct net *net, struct notifier_block *nb,
                     struct netlink_ext_ack *extack)
{
        struct fib6_dump_arg arg;
        struct fib6_walker *w;
        unsigned int h;
        int err = 0;

        w = kzalloc(sizeof(*w), GFP_ATOMIC);
        if (!w)
                return -ENOMEM;

        w->func = fib6_node_dump;
        arg.net = net;
        arg.nb = nb;
        arg.extack = extack;
        w->args = &arg;

        for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
                struct hlist_head *head = &net->ipv6.fib_table_hash[h];
                struct fib6_table *tb;

                hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
                        err = fib6_table_dump(net, tb, w);
                        if (err)
                                goto out;
                }
        }

out:
        kfree(w);

        /* The tree traversal function should never return a positive value. */
        return err > 0 ? -EINVAL : err;
}

/*
 *        Walker callback (installed as w->func by inet6_dump_fib()) that
 *        passes the routes at the walker's current position to
 *        rt6_dump_route().
 *
 *        Returns 1 after recording where to resume when rt6_dump_route()
 *        returns a non-negative value, or 0 once every route has been
 *        handled and w->leaf has been cleared.
 *
 *        NOTE(review): for_each_fib6_walker_rt() is defined elsewhere and
 *        appears to use the local variable 'rt' as its cursor - do not
 *        rename it without checking the macro.
 */
static int fib6_dump_node(struct fib6_walker *w)
{
        int res;
        struct fib6_info *rt;

        for_each_fib6_walker_rt(w) {
                res = rt6_dump_route(rt, w->args, w->skip_in_node);
                if (res >= 0) {
                        /* Frame is full, suspend walking */
                        w->leaf = rt;

                        /* We'll restart from this node, so if some routes were
                         * already dumped, skip them next time.
                         */
                        w->skip_in_node += res;

                        return 1;
                }
                /* This route was handled: nothing to skip for the next one. */
                w->skip_in_node = 0;

                /* Multipath routes are dumped in one route with the
                 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
                 * last sibling of this route (no need to dump the
                 * sibling routes again)
                 */
                if (rt->fib6_nsiblings)
                        rt = list_last_entry(&rt->fib6_siblings,
                                             struct fib6_info,
                                             fib6_siblings);
        }
        w->leaf = NULL;
        return 0;
}

/*
 *        Tear down the dump state kept in @cb: unlink the walker if
 *        cb->args[4] says it is still linked, free it, clear its slot in
 *        cb->args[2], then restore the original ->done callback saved in
 *        cb->args[3] and set cb->args[1] to 3.
 */
static void fib6_dump_end(struct netlink_callback *cb)
{
        struct net *net = sock_net(cb->skb->sk);
        struct fib6_walker *walker = (void *)cb->args[2];

        if (!walker)
                goto restore;

        if (cb->args[4]) {
                cb->args[4] = 0;
                fib6_walker_unlink(net, walker);
        }
        cb->args[2] = 0;
        kfree(walker);

restore:
        cb->done = (void *)cb->args[3];
        cb->args[1] = 3;
}

/*
 *        ->done hook installed by inet6_dump_fib(): release the dump state,
 *        then chain to the original ->done callback if there was one.
 *        Returns that callback's result, or 0 when there is none.
 */
static int fib6_dump_done(struct netlink_callback *cb)
{
        fib6_dump_end(cb);

        if (!cb->done)
                return 0;

        return cb->done(cb);
}

/*
 *        Dump one table for a netlink dump request, either starting a new
 *        walk or resuming a suspended one.
 *
 *        State kept in cb->args:
 *          [2] pointer to the walker
 *          [4] non-zero while a walk of this table is suspended
 *          [5] root fn_sernum recorded when the walk was suspended or
 *              last restarted
 *
 *        Returns the result of fib6_walk() / fib6_walk_continue().
 */
static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
                           struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        struct fib6_walker *w;
        int res;

        w = (void *)cb->args[2];
        w->root = &table->tb6_root;

        if (cb->args[4] == 0) {
                /* No walk in progress: start from scratch. */
                w->count = 0;
                w->skip = 0;
                w->skip_in_node = 0;

                spin_lock_bh(&table->tb6_lock);
                res = fib6_walk(net, w);
                spin_unlock_bh(&table->tb6_lock);
                if (res > 0) {
                        /* Walk suspended: remember that and the root sernum. */
                        cb->args[4] = 1;
                        cb->args[5] = READ_ONCE(w->root->fn_sernum);
                }
        } else {
                int sernum = READ_ONCE(w->root->fn_sernum);
                if (cb->args[5] != sernum) {
                        /* Begin at the root if the tree changed */
                        cb->args[5] = sernum;
                        w->state = FWS_INIT;
                        w->node = w->root;
                        /* skip as many entries as were already counted */
                        w->skip = w->count;
                        w->skip_in_node = 0;
                } else
                        w->skip = 0;

                spin_lock_bh(&table->tb6_lock);
                res = fib6_walk_continue(w);
                spin_unlock_bh(&table->tb6_lock);
                if (res <= 0) {
                        /*
                         * Walk finished or failed: unlink the walker.
                         * NOTE(review): presumably it was left linked when
                         * the walk was suspended - confirm in fib6_walk().
                         */
                        fib6_walker_unlink(net, w);
                        cb->args[4] = 0;
                }
        }

        return res;
}

/*
 * Netlink dump handler for the IPv6 FIB.
 *
 * Runs under rcu_read_lock(). On the first call of a dump it allocates the
 * walker (stored in cb->args[2]) and swaps cb->done for fib6_dump_done(),
 * saving the previous handler in cb->args[3]. It then dumps either the
 * single table selected by the request filter or every table in the
 * per-netns hash, using cb->args[0]/cb->args[1] as the resume position.
 *
 * Returns the last dump result; when that is <= 0 the dump state is torn
 * down through fib6_dump_end().
 */
static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct rt6_rtnl_dump_arg arg = {
                .filter.dump_exceptions = true,
                .filter.dump_routes = true,
                .filter.rtnl_held = false,
        };
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
        unsigned int h, s_h;
        unsigned int e = 0, s_e;
        struct hlist_head *head;
        struct fib6_table *tb;
        struct fib6_walker *w;
        int err = 0;

        rcu_read_lock();

        if (cb->strict_check) {
                err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb);
                if (err < 0)
                        goto unlock;
        } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
                struct rtmsg *rtm = nlmsg_data(nlh);

                if (rtm->rtm_flags & RTM_F_PREFIX)
                        arg.filter.flags = RTM_F_PREFIX;
        }

        w = (void *)cb->args[2];
        if (!w) {
                /* First call of this dump: set up the walker ... */
                w = kzalloc(sizeof(*w), GFP_ATOMIC);
                if (!w) {
                        err = -ENOMEM;
                        goto unlock;
                }
                w->func = fib6_dump_node;
                cb->args[2] = (long)w;

                /* ... and hook our destructor, keeping the old one. */
                cb->args[3] = (long)cb->done;
                cb->done = fib6_dump_done;
        }

        /* arg lives on this stack frame, so re-attach it on every call. */
        arg.skb = skb;
        arg.cb = cb;
        arg.net = net;
        w->args = &arg;

        if (arg.filter.table_id) {
                /* Request is restricted to one table. */
                tb = fib6_get_table(net, arg.filter.table_id);
                if (tb) {
                        if (!cb->args[0]) {
                                err = fib6_dump_table(tb, skb, cb);
                                if (!err)
                                        cb->args[0] = 1;
                        }
                } else if (rtnl_msg_family(cb->nlh) == PF_INET6) {
                        /* Only an explicit PF_INET6 request gets an error
                         * for a missing table.
                         */
                        NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
                        err = -ENOENT;
                }
                goto unlock;
        }

        s_h = cb->args[0];
        s_e = cb->args[1];

        for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
                e = 0;
                head = &net->ipv6.fib_table_hash[h];
                hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
                        /* Entries before the resume index are only counted. */
                        if (e >= s_e) {
                                err = fib6_dump_table(tb, skb, cb);
                                if (err)
                                        goto out;
                        }
                        e++;
                }
        }
out:
        /* Record where to pick up on the next call. */
        cb->args[1] = e;
        cb->args[0] = h;

unlock:
        rcu_read_unlock();
        if (err <= 0)
                fib6_dump_end(cb);
        return err;
}

/*
 * Store @val as metric number @metric (1-based) of @f6i.
 *
 * If @f6i still points at the shared dst_default_metrics, a private,
 * zero-filled dst_metrics with a reference count of 1 is allocated first.
 * A NULL @f6i or a failed allocation makes this a silent no-op.
 */
void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
{
        struct dst_metrics *own;

        if (!f6i)
                return;

        if (f6i->fib6_metrics == &dst_default_metrics) {
                own = kzalloc(sizeof(*own), GFP_ATOMIC);
                if (!own)
                        return;

                refcount_set(&own->refcnt, 1);
                f6i->fib6_metrics = own;
        }

        f6i->fib6_metrics->metrics[metric - 1] = val;
}

/*
 *        Routing Table
 *
 *        Return the appropriate node for a routing tree "add" operation,
 *        either by creating and inserting a new node or by returning an
 *        existing one.
 *
 *        The descent starts at @root and follows the bits of @addr. @plen
 *        is the prefix length being added and @offset is the byte offset,
 *        inside each node's leaf fib6_info, of the struct rt6key that is
 *        compared against @addr.
 *
 *        @allow_create and @replace_required only influence the "no
 *        matching node" paths: with @allow_create clear and
 *        @replace_required set the function fails with -ENOENT; with both
 *        clear it warns and creates the node anyway.
 *
 *        Returns the node on success, ERR_PTR(-ENOENT) as described above,
 *        or ERR_PTR(-ENOMEM) when a node allocation fails.
 *
 *        The rcu_dereference_protected() annotations expect
 *        table->tb6_lock to be held.
 */

static struct fib6_node *fib6_add_1(struct net *net,
                                    struct fib6_table *table,
                                    struct fib6_node *root,
                                    struct in6_addr *addr, int plen,
                                    int offset, int allow_create,
                                    int replace_required,
                                    struct netlink_ext_ack *extack)
{
        struct fib6_node *fn, *in, *ln;
        struct fib6_node *pn = NULL;
        struct rt6key *key;
        int        bit;
        __be32        dir = 0;

        /* insert node in tree */

        fn = root;

        do {
                struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
                                            lockdep_is_held(&table->tb6_lock));
                /* Key (address/prefix) of this node's leaf at @offset. */
                key = (struct rt6key *)((u8 *)leaf + offset);

                /*
                 *        Prefix match: if the new prefix is shorter than
                 *        this node's, or differs within the node's first
                 *        fn_bit bits, the new node belongs above @fn.
                 */
                if (plen < fn->fn_bit ||
                    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
                        if (!allow_create) {
                                if (replace_required) {
                                        NL_SET_ERR_MSG(extack,
                                                       "Can not replace route - no match found");
                                        pr_warn("Can't replace route, no match found\n");
                                        return ERR_PTR(-ENOENT);
                                }
                                pr_warn("NLM_F_CREATE should be set when creating new route\n");
                        }
                        goto insert_above;
                }

                /*
                 *        Exact match ?
                 */

                if (plen == fn->fn_bit) {
                        /* clean up an intermediate node: drop the leaf
                         * reference it holds so the caller can attach a
                         * real route
                         */
                        if (!(fn->fn_flags & RTN_RTINFO)) {
                                RCU_INIT_POINTER(fn->leaf, NULL);
                                fib6_info_release(leaf);
                        /* remove null_entry in the root node */
                        } else if (fn->fn_flags & RTN_TL_ROOT &&
                                   rcu_access_pointer(fn->leaf) ==
                                   net->ipv6.fib6_null_entry) {
                                RCU_INIT_POINTER(fn->leaf, NULL);
                        }

                        return fn;
                }

                /*
                 *        We have more bits to go
                 */

                /* Try to walk down on tree: bit fn_bit of @addr selects
                 * the right (set) or left (clear) child.
                 */
                dir = addr_bit_set(addr, fn->fn_bit);
                pn = fn;
                fn = dir ?
                     rcu_dereference_protected(fn->right,
                                        lockdep_is_held(&table->tb6_lock)) :
                     rcu_dereference_protected(fn->left,
                                        lockdep_is_held(&table->tb6_lock));
        } while (fn);

        if (!allow_create) {
                /* We should not create new node because
                 * NLM_F_REPLACE was specified without NLM_F_CREATE
                 * I assume it is safe to require NLM_F_CREATE when
                 * REPLACE flag is used! Later we may want to remove the
                 * check for replace_required, because according
                 * to netlink specification, NLM_F_CREATE
                 * MUST be specified if new route is created.
                 * That would keep IPv6 consistent with IPv4
                 */
                if (replace_required) {
                        NL_SET_ERR_MSG(extack,
                                       "Can not replace route - no match found");
                        pr_warn("Can't replace route, no match found\n");
                        return ERR_PTR(-ENOENT);
                }
                pr_warn("NLM_F_CREATE should be set when creating new route\n");
        }
        /*
         *        We walked to the bottom of tree.
         *        Create new leaf node without children and hang it off
         *        the last visited node (pn) on the side chosen by dir.
         */

        ln = node_alloc(net);

        if (!ln)
                return ERR_PTR(-ENOMEM);
        ln->fn_bit = plen;
        RCU_INIT_POINTER(ln->parent, pn);

        if (dir)
                rcu_assign_pointer(pn->right, ln);
        else
                rcu_assign_pointer(pn->left, ln);

        return ln;


insert_above:
        /*
         * split since we don't have a common prefix anymore or
         * we have a less significant route.
         * we've to insert an intermediate node on the list
         * this new node will point to the one we need to create
         * and the current
         */

        pn = rcu_dereference_protected(fn->parent,
                                       lockdep_is_held(&table->tb6_lock));

        /* find 1st bit in difference between the 2 addrs.
         *
         * See comment in __ipv6_addr_diff: bit may be an invalid value,
         * but if it is >= plen, the value is ignored in any case.
         */

        bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));

        /*
         *                (intermediate)[in]
         *                  /           \
         *        (new leaf node)[ln] (old node)[fn]
         */
        if (plen > bit) {
                in = node_alloc(net);
                ln = node_alloc(net);

                /* Need both nodes; free whichever one was obtained. */
                if (!in || !ln) {
                        if (in)
                                node_free_immediate(net, in);
                        if (ln)
                                node_free_immediate(net, ln);
                        return ERR_PTR(-ENOMEM);
                }

                /*
                 * new intermediate node.
                 * RTN_RTINFO will
                 * be off since that an address that chooses one of
                 * the branches would not match less specific routes
                 * in the other branch
                 */

                in->fn_bit = bit;

                RCU_INIT_POINTER(in->parent, pn);
                /* The intermediate node shares fn's leaf and takes its
                 * own reference on it.
                 */
                in->leaf = fn->leaf;
                fib6_info_hold(rcu_dereference_protected(in->leaf,
                                lockdep_is_held(&table->tb6_lock)));

                /* update parent pointer */
                if (dir)
                        rcu_assign_pointer(pn->right, in);
                else
                        rcu_assign_pointer(pn->left, in);

                ln->fn_bit = plen;

                RCU_INIT_POINTER(ln->parent, in);
                rcu_assign_pointer(fn->parent, in);

                /* The differing bit of @addr decides which side the new
                 * leaf goes on; the old node takes the other side.
                 */
                if (addr_bit_set(addr, bit)) {
                        rcu_assign_pointer(in->right, ln);
                        rcu_assign_pointer(in->left, fn);
                } else {
                        rcu_assign_pointer(in->left, ln);
                        rcu_assign_pointer(in->right, fn);
                }
        } else { /* plen <= bit */

                /*
                 *                (new leaf node)[ln]
                 *                  /           \
                 *             (old node)[fn] NULL
                 */

                ln = node_alloc(net);

                if (!ln)
                        return ERR_PTR(-ENOMEM);

                ln->fn_bit = plen;

                RCU_INIT_POINTER(ln->parent, pn);

                /* Bit plen of the old node's key picks the child slot
                 * under the new node.
                 */
                if (addr_bit_set(&key->addr, plen))
                        RCU_INIT_POINTER(ln->right, fn);
                else
                        RCU_INIT_POINTER(ln->left, fn);

                rcu_assign_pointer(fn->parent, ln);

                /* Link the new node into the parent last. */
                if (dir)
                        rcu_assign_pointer(pn->right, ln);
                else
                        rcu_assign_pointer(pn->left, ln);
        }
        return ln;
}

/*
 * For every possible CPU, look at the per-cpu cached route of @fib6_nh and,
 * if its 'from' pointer is @match, clear that pointer and release the
 * fib6_info reference it held. Does nothing when @fib6_nh has no per-cpu
 * route storage. @table is not used here.
 */
static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
                                  const struct fib6_info *match,
                                  const struct fib6_table *table)
{
        int cpu;

        if (!fib6_nh->rt6i_pcpu)
                return;

        rcu_read_lock();
        for_each_possible_cpu(cpu) {
                struct rt6_info **slot = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
                /* Paired with xchg() in rt6_get_pcpu_route() */
                struct rt6_info *cached = READ_ONCE(*slot);
                struct fib6_info *from;

                if (!cached)
                        continue;

                /* Only touch cached routes that reference 'match'. The
                 * cached ->from only ever goes from a fib6_info to NULL
                 * (ip6_dst_destroy), never from one fib6_info to another.
                 */
                if (rcu_access_pointer(cached->from) != match)
                        continue;

                from = unrcu_pointer(xchg(&cached->from, NULL));
                fib6_info_release(from);
        }
        rcu_read_unlock();
}

/* Argument bundle handed to fib6_nh_drop_pcpu_from() via its void *arg. */
struct fib6_nh_pcpu_arg {
        struct fib6_info        *from;  /* fib entry to match against cached routes */
        const struct fib6_table *table; /* forwarded to __fib6_drop_pcpu_from() */
};

static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg)
{
        struct fib6_nh_pcpu_arg *arg = _arg;

        __fib6_drop_pcpu_from(nh, arg->from, arg->table);
        return 0;
}

/*
 * Mark @f6i as being destroyed and release the references that its cached
 * per-cpu routes hold on it, for either its nexthop object's fib6_nh
 * entries or its own embedded fib6_nh.
 */
static void fib6_drop_pcpu_from(struct fib6_info *f6i,
                                const struct fib6_table *table)
{
        struct fib6_nh_pcpu_arg arg;

        /* Make sure rt6_make_pcpu_route() wont add other percpu routes
         * while we are cleaning them here.
         */
        f6i->fib6_destroying = 1;
        mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */

        if (!f6i->nh) {
                /* No nexthop object: only the route's own fib6_nh. */
                __fib6_drop_pcpu_from(f6i->fib6_nh, f6i, table);
                return;
        }

        arg.from = f6i;
        arg.table = table;
        nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, &arg);
}

/*
 * Release everything that still hangs off @rt once it has been taken out
 * of node @fn: exception-table entries, per-cpu cached routes, its
 * nexthop list linkage, dummy leaf references in ancestor nodes, and its
 * expiry/GC-list state.
 */
static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
                          struct net *net)
{
        struct fib6_table *table = rt->fib6_table;
        struct fib6_info *cur, *repl;

        /* Flush all cached dst in exception table */
        rt6_flush_exceptions(rt);
        fib6_drop_pcpu_from(rt, table);

        if (rt->nh && !list_empty(&rt->nh_list))
                list_del_init(&rt->nh_list);

        if (refcount_read(&rt->fib6_ref) != 1) {
                /* Extra references mean this route serves as the dummy
                 * address holder of some split nodes. It is not leaked,
                 * but it pins other resources that must go in time, so
                 * walk up the ancestors and point any such dummy leaf at
                 * a route that is still alive.
                 */
                for (; fn;
                     fn = rcu_dereference_protected(fn->parent,
                                    lockdep_is_held(&table->tb6_lock))) {
                        cur = rcu_dereference_protected(fn->leaf,
                                    lockdep_is_held(&table->tb6_lock));

                        if ((fn->fn_flags & RTN_RTINFO) || cur != rt)
                                continue;

                        repl = fib6_find_prefix(net, table, fn);
                        fib6_info_hold(repl);

                        rcu_assign_pointer(fn->leaf, repl);
                        fib6_info_release(rt);
                }
        }

        fib6_clean_expires(rt);
        fib6_remove_gc_list(rt);
}

/*
 *        Insert routing information in a node.
 *
 *        Links @rt into the list of routes hanging off @fn->leaf, ahead of
 *        the first entry whose fib6_metric is larger, or replaces an
 *        existing entry when NLM_F_REPLACE is set in @info->nlh. Routes of
 *        equal metric that both qualify for ECMP are additionally tied
 *        together through their fib6_siblings lists.
 *
 *        Returns 0 on success, -EEXIST for a duplicate (or when NLM_F_EXCL
 *        was requested and the metric is already present), -ENOENT when a
 *        replace finds nothing and NLM_F_CREATE is absent, or the error
 *        reported by the FIB entry notifiers.
 *
 *        The rcu_dereference_protected() annotations expect
 *        rt->fib6_table->tb6_lock to be held.
 */

static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
                            struct nl_info *info,
                            struct netlink_ext_ack *extack)
{
        struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
        struct fib6_info *iter = NULL;
        /* Link slot (fn->leaf or some ->fib6_next) where @rt will be stored */
        struct fib6_info __rcu **ins;
        /* First same-metric slot seen whose ECMP-ability differs from @rt's */
        struct fib6_info __rcu **fallback_ins = NULL;
        int replace = (info->nlh &&
                       (info->nlh->nlmsg_flags & NLM_F_REPLACE));
        /* A request without a netlink header is treated as a create. */
        int add = (!info->nlh ||
                   (info->nlh->nlmsg_flags & NLM_F_CREATE));
        int found = 0;
        bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
        bool notify_sibling_rt = false;
        u16 nlflags = NLM_F_EXCL;
        int err;

        if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
                nlflags |= NLM_F_APPEND;

        ins = &fn->leaf;

        for (iter = leaf; iter;
             iter = rcu_dereference_protected(iter->fib6_next,
                                lockdep_is_held(&rt->fib6_table->tb6_lock))) {
                /*
                 *        Search for duplicates
                 */

                if (iter->fib6_metric == rt->fib6_metric) {
                        /*
                         *        Same priority level
                         */
                        if (info->nlh &&
                            (info->nlh->nlmsg_flags & NLM_F_EXCL))
                                return -EEXIST;

                        nlflags &= ~NLM_F_EXCL;
                        if (replace) {
                                /* Prefer replacing an entry with the same
                                 * ECMP-ability; otherwise remember the
                                 * first same-metric slot as a fallback.
                                 */
                                if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
                                        found++;
                                        break;
                                }
                                fallback_ins = fallback_ins ?: ins;
                                goto next_iter;
                        }

                        if (rt6_duplicate_nexthop(iter, rt)) {
                                if (rt->fib6_nsiblings)
                                        rt->fib6_nsiblings = 0;
                                if (!(iter->fib6_flags & RTF_EXPIRES))
                                        return -EEXIST;
                                /* The existing entry expires: refresh its
                                 * expiry (and MTU below) from @rt, but
                                 * still report the duplicate.
                                 */
                                if (!(rt->fib6_flags & RTF_EXPIRES)) {
                                        fib6_clean_expires(iter);
                                        fib6_remove_gc_list(iter);
                                } else {
                                        fib6_set_expires(iter, rt->expires);
                                        fib6_add_gc_list(iter);
                                }

                                if (rt->fib6_pmtu)
                                        fib6_metric_set(iter, RTAX_MTU,
                                                        rt->fib6_pmtu);
                                return -EEXIST;
                        }
                        /* If we have the same destination and the same metric,
                         * but not the same gateway, then the route we try to
                         * add is sibling to this route, increment our counter
                         * of siblings, and later we will add our route to the
                         * list.
                         * Only static routes (which don't have flag
                         * RTF_EXPIRES) are used for ECMPv6.
                         *
                         * To avoid long list, we only had siblings if the
                         * route have a gateway.
                         */
                        if (rt_can_ecmp &&
                            rt6_qualify_for_ecmp(iter))
                                rt->fib6_nsiblings++;
                }

                /* Stop in front of the first entry with a larger metric. */
                if (iter->fib6_metric > rt->fib6_metric)
                        break;

next_iter:
                ins = &iter->fib6_next;
        }

        if (fallback_ins && !found) {
                /* No matching route with same ecmp-able-ness found, replace
                 * first matching route
                 */
                ins = fallback_ins;
                iter = rcu_dereference_protected(*ins,
                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
                found++;
        }

        /* Reset round-robin state, if necessary */
        if (ins == &fn->leaf)
                fn->rr_ptr = NULL;

        /* Link this route to others same route. */
        if (rt->fib6_nsiblings) {
                unsigned int fib6_nsiblings;
                struct fib6_info *sibling, *temp_sibling;

                /* Find the first route that have the same metric.
                 * notify_sibling_rt stays true only when that route is the
                 * very first entry of the node.
                 */
                sibling = leaf;
                notify_sibling_rt = true;
                while (sibling) {
                        if (sibling->fib6_metric == rt->fib6_metric &&
                            rt6_qualify_for_ecmp(sibling)) {
                                list_add_tail(&rt->fib6_siblings,
                                              &sibling->fib6_siblings);
                                break;
                        }
                        sibling = rcu_dereference_protected(sibling->fib6_next,
                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
                        notify_sibling_rt = false;
                }
                /* For each sibling in the list, increment the counter of
                 * siblings. BUG() if counters does not match, list of siblings
                 * is broken!
                 */
                fib6_nsiblings = 0;
                list_for_each_entry_safe(sibling, temp_sibling,
                                         &rt->fib6_siblings, fib6_siblings) {
                        sibling->fib6_nsiblings++;
                        BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
                        fib6_nsiblings++;
                }
                BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
                rt6_multipath_rebalance(temp_sibling);
        }

        /*
         *        insert node
         */
        if (!replace) {
                if (!add)
                        pr_warn("NLM_F_CREATE should be set when creating new route\n");

                /* Also entered from the replace branch below when nothing
                 * was found but NLM_F_CREATE allows creating the route.
                 */
add:
                nlflags |= NLM_F_CREATE;

                /* The route should only be notified if it is the first
                 * route in the node or if it is added as a sibling
                 * route to the first route in the node.
                 */
                if (!info->skip_notify_kernel &&
                    (notify_sibling_rt || ins == &fn->leaf)) {
                        enum fib_event_type fib_event;

                        if (notify_sibling_rt)
                                fib_event = FIB_EVENT_ENTRY_APPEND;
                        else
                                fib_event = FIB_EVENT_ENTRY_REPLACE;
                        err = call_fib6_entry_notifiers(info->nl_net,
                                                        fib_event, rt,
                                                        extack);
                        if (err) {
                                struct fib6_info *sibling, *next_sibling;

                                /* If the route has siblings, then it first
                                 * needs to be unlinked from them.
                                 */
                                if (!rt->fib6_nsiblings)
                                        return err;

                                /* Undo the sibling accounting done above. */
                                list_for_each_entry_safe(sibling, next_sibling,
                                                         &rt->fib6_siblings,
                                                         fib6_siblings)
                                        sibling->fib6_nsiblings--;
                                rt->fib6_nsiblings = 0;
                                list_del_init(&rt->fib6_siblings);
                                rt6_multipath_rebalance(next_sibling);
                                return err;
                        }
                }

                /* Fill in @rt completely, then store it into the slot. */
                rcu_assign_pointer(rt->fib6_next, iter);
                fib6_info_hold(rt);
                rcu_assign_pointer(rt->fib6_node, fn);
                rcu_assign_pointer(*ins, rt);
                if (!info->skip_notify)
                        inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
                info->nl_net->ipv6.rt6_stats->fib_rt_entries++;

                /* First real route in this node: it now carries RTINFO. */
                if (!(fn->fn_flags & RTN_RTINFO)) {
                        info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
                        fn->fn_flags |= RTN_RTINFO;
                }

        } else {
                int nsiblings;

                if (!found) {
                        if (add)
                                goto add;
                        pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
                        return -ENOENT;
                }

                if (!info->skip_notify_kernel && ins == &fn->leaf) {
                        err = call_fib6_entry_notifiers(info->nl_net,
                                                        FIB_EVENT_ENTRY_REPLACE,
                                                        rt, extack);
                        if (err)
                                return err;
                }

                /* Put @rt in the place of iter, then retire iter. */
                fib6_info_hold(rt);
                rcu_assign_pointer(rt->fib6_node, fn);
                rt->fib6_next = iter->fib6_next;
                rcu_assign_pointer(*ins, rt);
                if (!info->skip_notify)
                        inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
                if (!(fn->fn_flags & RTN_RTINFO)) {
                        info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
                        fn->fn_flags |= RTN_RTINFO;
                }
                /* Read the sibling count before the old route is purged. */
                nsiblings = iter->fib6_nsiblings;
                iter->fib6_node = NULL;
                fib6_purge_rt(iter, fn, info->nl_net);
                if (rcu_access_pointer(fn->rr_ptr) == iter)
                        fn->rr_ptr = NULL;
                fib6_info_release(iter);

                if (nsiblings) {
                        /* Replacing an ECMP route, remove all siblings */
                        ins = &rt->fib6_next;
                        iter = rcu_dereference_protected(*ins,
                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
                        while (iter) {
                                if (iter->fib6_metric > rt->fib6_metric)
                                        break;
                                if (rt6_qualify_for_ecmp(iter)) {
                                        /* Unlink in place; ins keeps
                                         * pointing at the same slot.
                                         */
                                        *ins = iter->fib6_next;
                                        iter->fib6_node = NULL;
                                        fib6_purge_rt(iter, fn, info->nl_net);
                                        if (rcu_access_pointer(fn->rr_ptr) == iter)
                                                fn->rr_ptr = NULL;
                                        fib6_info_release(iter);
                                        nsiblings--;
                                        info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
                                } else {
                                        ins = &iter->fib6_next;
                                }
                                iter = rcu_dereference_protected(*ins,
                                        lockdep_is_held(&rt->fib6_table->tb6_lock));
                        }
                        /* Every counted sibling should have been removed. */
                        WARN_ON(nsiblings != 0);
                }
        }

        return 0;
}

/*
 * Arm the per-netns FIB timer ip6_rt_gc_interval jiffies from now, but
 * only if it is not already pending and @rt carries RTF_EXPIRES.
 */
static void fib6_start_gc(struct net *net, struct fib6_info *rt)
{
        if (timer_pending(&net->ipv6.ip6_fib_timer))
                return;

        if (!(rt->fib6_flags & RTF_EXPIRES))
                return;

        mod_timer(&net->ipv6.ip6_fib_timer,
                  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
}

/*
 * Arm the per-netns FIB timer ip6_rt_gc_interval jiffies from now unless
 * it is already pending.
 */
void fib6_force_start_gc(struct net *net)
{
        if (timer_pending(&net->ipv6.ip6_fib_timer))
                return;

        mod_timer(&net->ipv6.ip6_fib_timer,
                  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
}

/*
 * Write @sernum into fn_sernum of the node holding @rt and of every
 * ancestor reached through ->parent. The rcu_dereference_protected()
 * annotations expect rt->fib6_table->tb6_lock to be held.
 */
static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
                                           int sernum)
{
        struct fib6_node *fn;

        fn = rcu_dereference_protected(rt->fib6_node,
                        lockdep_is_held(&rt->fib6_table->tb6_lock));

        /* paired with smp_rmb() in fib6_get_cookie_safe() */
        smp_wmb();

        for (; fn;
             fn = rcu_dereference_protected(fn->parent,
                        lockdep_is_held(&rt->fib6_table->tb6_lock)))
                WRITE_ONCE(fn->fn_sernum, sernum);
}

/*
 * Obtain a new serial number for @net and stamp it on @rt's node and all
 * of that node's ancestors.
 */
void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
{
        int sernum = fib6_new_sernum(net);

        __fib6_update_sernum_upto_root(rt, sernum);
}

/* allow ipv4 to update sernum via ipv6_stub */
void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
{
        spin_lock_bh(&f6i->fib6_table->tb6_lock);
        fib6_update_sernum_upto_root(net, f6i);
        spin_unlock_bh(&f6i->fib6_table->tb6_lock);
}

/*
 *        Add routing information to the routing tree.
 *        <destination addr>/<source addr>
 *        with source addr info in sub-trees
 *        Need to own table->tb6_lock
 */

int fib6_add(struct fib6_node *root, struct fib6_info *rt,
             struct nl_info *info, struct netlink_ext_ack *extack)
{
        struct fib6_table *table = rt->fib6_table;
        struct fib6_node *fn;
#ifdef CONFIG_IPV6_SUBTREES
        struct fib6_node *pn = NULL;
#endif
        int err = -ENOMEM;
        int allow_create = 1;
        int replace_required = 0;

        if (info->nlh) {
                if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
                        allow_create = 0;
                if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
                        replace_required = 1;
        }
        if (!allow_create && !replace_required)
                pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");

        fn = fib6_add_1(info->nl_net, table, root,
                        &rt->fib6_dst.addr, rt->fib6_dst.plen,
                        offsetof(struct fib6_info, fib6_dst), allow_create,
                        replace_required, extack);
        if (IS_ERR(fn)) {
                err = PTR_ERR(fn);
                fn = NULL;
                goto out;
        }

#ifdef CONFIG_IPV6_SUBTREES
        pn = fn;

        if (rt->fib6_src.plen) {
                struct fib6_node *sn;

                if (!rcu_access_pointer(fn->subtree)) {
                        struct fib6_node *sfn;

                        /*
                         * Create subtree.
                         *
                         *                fn[main tree]
                         *                |
                         *                sfn[subtree root]
                         *                   \
                         *                    sn[new leaf node]
                         */

                        /* Create subtree root node */
                        sfn = node_alloc(info->nl_net);
                        if (!sfn)
                                goto failure;

                        fib6_info_hold(info->nl_net->ipv6.fib6_null_entry);
                        rcu_assign_pointer(sfn->leaf,
                                           info->nl_net->ipv6.fib6_null_entry);
                        sfn->fn_flags = RTN_ROOT;

                        /* Now add the first leaf node to new subtree */

                        sn = fib6_add_1(info->nl_net, table, sfn,
                                        &rt->fib6_src.addr, rt->fib6_src.plen,
                                        offsetof(struct fib6_info, fib6_src),
                                        allow_create, replace_required, extack);

                        if (IS_ERR(sn)) {
                                /* If it is failed, discard just allocated
                                   root, and then (in failure) stale node
                                   in main tree.
                                 */
                                node_free_immediate(info->nl_net, sfn);
                                err = PTR_ERR(sn);
                                goto failure;
                        }

                        /* Now link new subtree to main tree */
                        rcu_assign_pointer(sfn->parent, fn);
                        rcu_assign_pointer(fn->subtree, sfn);
                } else {
                        sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
                                        &rt->fib6_src.addr, rt->fib6_src.plen,
                                        offsetof(struct fib6_info, fib6_src),
                                        allow_create, replace_required, extack);

                        if (IS_ERR(sn)) {
                                err = PTR_ERR(sn);
                                goto failure;
                        }
                }

                if (!rcu_access_pointer(fn->leaf)) {
                        if (fn->fn_flags & RTN_TL_ROOT) {
                                /* put back null_entry for root node */
                                rcu_assign_pointer(fn->leaf,
                                            info->nl_net->ipv6.fib6_null_entry);
                        } else {
                                fib6_info_hold(rt);
                                rcu_assign_pointer(fn->leaf, rt);
                        }
                }
                fn = sn;
        }
#endif

        err = fib6_add_rt2node(fn, rt, info, extack);
        if (!err) {
                if (rt->nh)
                        list_add(&rt->nh_list, &rt->nh->f6i_list);
                __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net));

                if (rt->fib6_flags & RTF_EXPIRES)
                        fib6_add_gc_list(rt);

                fib6_start_gc(info->nl_net, rt);
        }

out:
        if (err) {
#ifdef CONFIG_IPV6_SUBTREES
                /*
                 * If fib6_add_1 has cleared the old leaf pointer in the
                 * super-tree leaf node we have to find a new one for it.
                 */
                if (pn != fn) {
                        struct fib6_info *pn_leaf =
                                rcu_dereference_protected(pn->leaf,
                                    lockdep_is_held(&table->tb6_lock));
                        if (pn_leaf == rt) {
                                pn_leaf = NULL;
                                RCU_INIT_POINTER(pn->leaf, NULL);
                                fib6_info_release(rt);
                        }
                        if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
                                pn_leaf = fib6_find_prefix(info->nl_net, table,
                                                           pn);
                                if (!pn_leaf)
                                        pn_leaf =
                                            info->nl_net->ipv6.fib6_null_entry;
                                fib6_info_hold(pn_leaf);
                                rcu_assign_pointer(pn->leaf, pn_leaf);
                        }
                }
#endif
                goto failure;
        } else if (fib6_requires_src(rt)) {
                fib6_routes_require_src_inc(info->nl_net);
        }
        return err;

failure:
        /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if:
         * 1. fn is an intermediate node and we failed to add the new
         * route to it in both subtree creation failure and fib6_add_rt2node()
         * failure case.
         * 2. fn is the root node in the table and we fail to add the first
         * default route to it.
         */
        if (fn &&
            (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) ||
             (fn->fn_flags & RTN_TL_ROOT &&
              !rcu_access_pointer(fn->leaf))))
                fib6_repair_tree(info->nl_net, table, fn);
        return err;
}

/*
 *        Routing tree lookup
 *
 */

/*
 * One step of a lookup performed by fib6_node_lookup_1().  Callers pass
 * an array of these: when the search recurses into a subtree it moves
 * on to the next element (args + 1), and an element whose offset is 0
 * terminates the array.
 */
struct lookup_args {
        int                        offset;                /* byte offset of the rt6key inside struct fib6_info; 0 = sentinel */
        const struct in6_addr        *addr;                /* address compared against that key */
};

/*
 * Longest-prefix lookup of args->addr in the tree rooted at root.
 *
 * The address bits are first followed down to the deepest reachable
 * node; the walk then climbs back through the parents looking for a
 * node whose leaf key (found at args->offset inside the leaf) matches
 * the address.  With CONFIG_IPV6_SUBTREES, a matching node that carries
 * a subtree is searched recursively using the next element of args.
 *
 * Returns the matching RTN_RTINFO node, or NULL if args is the sentinel
 * entry or the climb reaches an RTN_ROOT node without a match.
 */
static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
                                            struct lookup_args *args)
{
        struct fib6_node *node = root;
        struct fib6_node *child;

        /* offset 0 marks the end of the lookup_args array */
        if (unlikely(!args->offset))
                return NULL;

        /*
         *        Descend on a tree
         */
        for (;;) {
                if (addr_bit_set(args->addr, node->fn_bit))
                        child = rcu_dereference(node->right);
                else
                        child = rcu_dereference(node->left);

                if (!child)
                        break;
                node = child;
        }

        /* Climb back up until some node matches the address. */
        for (; node; node = rcu_dereference(node->parent)) {
                struct fib6_node *subtree = FIB6_SUBTREE(node);
                struct fib6_info *leaf;
                struct rt6key *key;

                /* Only nodes with routes or a subtree can match. */
                if (!subtree && !(node->fn_flags & RTN_RTINFO))
                        goto backtrack;

                leaf = rcu_dereference(node->leaf);
                if (!leaf)
                        goto backtrack;

                key = (struct rt6key *)((u8 *)leaf + args->offset);
                if (!ipv6_prefix_equal(&key->addr, args->addr, key->plen))
                        goto backtrack;

#ifdef CONFIG_IPV6_SUBTREES
                if (subtree) {
                        struct fib6_node *sub_match;

                        sub_match = fib6_node_lookup_1(subtree, args + 1);
                        if (!sub_match)
                                goto backtrack;
                        node = sub_match;
                }
#endif
                if (node->fn_flags & RTN_RTINFO)
                        return node;

backtrack:
                /* Never climb above the root of the current (sub)tree. */
                if (node->fn_flags & RTN_ROOT)
                        break;
        }

        return NULL;
}

/*
 * Look up the node below root that best matches daddr and, when
 * CONFIG_IPV6_SUBTREES is enabled, saddr.  When daddr is NULL the
 * lookup starts from the second lookup_args entry instead of the
 * destination entry.  Falls back to root when nothing matches or when
 * the match is flagged RTN_TL_ROOT.
 *
 * Called with rcu_read_lock() held.
 */
struct fib6_node *fib6_node_lookup(struct fib6_node *root,
                                   const struct in6_addr *daddr,
                                   const struct in6_addr *saddr)
{
        struct lookup_args args[] = {
                {
                        .offset = offsetof(struct fib6_info, fib6_dst),
                        .addr = daddr,
                },
#ifdef CONFIG_IPV6_SUBTREES
                {
                        .offset = offsetof(struct fib6_info, fib6_src),
                        .addr = saddr,
                },
#endif
                {
                        .offset = 0,        /* sentinel */
                }
        };
        struct lookup_args *first = daddr ? &args[0] : &args[1];
        struct fib6_node *match = fib6_node_lookup_1(root, first);

        if (match && !(match->fn_flags & RTN_TL_ROOT))
                return match;

        return root;
}

/*
 *        Find the node for prefix addr/plen in the tree rooted at root,
 *        comparing against the rt6key located "offset" bytes into each
 *        node's leaf.
 *
 *        A node whose fn_bit equals plen and whose key matches is always
 *        returned.  If there is no such node:
 *        exact_match == true means NULL is returned;
 *        exact_match == false means the last RTN_RTINFO node passed on the
 *        way down (longest prefix match) is returned, or NULL if there was
 *        none. This is useful for finding fn for a cached route, as it is
 *        stored in the exception table under the node with the longest
 *        prefix length.
 */
static struct fib6_node *fib6_locate_1(struct fib6_node *root,
                                       const struct in6_addr *addr,
                                       int plen, int offset,
                                       bool exact_match)
{
        struct fib6_node *best = NULL;
        struct fib6_node *cur = root;

        while (cur) {
                struct fib6_info *leaf = rcu_dereference(cur->leaf);

                if (leaf) {
                        struct rt6key *key;

                        key = (struct rt6key *)((u8 *)leaf + offset);

                        /*
                         *        Prefix match
                         */
                        if (plen < cur->fn_bit ||
                            !ipv6_prefix_equal(&key->addr, addr, cur->fn_bit))
                                break;

                        if (plen == cur->fn_bit)
                                return cur;

                        if (cur->fn_flags & RTN_RTINFO)
                                best = cur;
                } else if (plen <= cur->fn_bit) {
                        /* This node is being deleted; no deeper node can match */
                        break;
                }

                /*
                 *        We have more bits to go
                 */
                cur = addr_bit_set(addr, cur->fn_bit) ?
                        rcu_dereference(cur->right) :
                        rcu_dereference(cur->left);
        }

        return exact_match ? NULL : best;
}

/*
 * Locate the node for destination prefix daddr/dst_len below root.
 * With CONFIG_IPV6_SUBTREES and a non-zero src_len, the search then
 * continues with saddr/src_len in the subtree of the node found, if it
 * has one.  exact_match is handed through to fib6_locate_1().
 *
 * Returns the node only if it is flagged RTN_RTINFO, otherwise NULL.
 */
struct fib6_node *fib6_locate(struct fib6_node *root,
                              const struct in6_addr *daddr, int dst_len,
                              const struct in6_addr *saddr, int src_len,
                              bool exact_match)
{
        struct fib6_node *found;

        found = fib6_locate_1(root, daddr, dst_len,
                              offsetof(struct fib6_info, fib6_dst),
                              exact_match);

#ifdef CONFIG_IPV6_SUBTREES
        if (src_len) {
                struct fib6_node *subtree;

                WARN_ON(saddr == NULL);

                subtree = found ? FIB6_SUBTREE(found) : NULL;
                if (subtree)
                        found = fib6_locate_1(subtree, saddr, src_len,
                                              offsetof(struct fib6_info, fib6_src),
                                              exact_match);
        }
#endif

        if (!found || !(found->fn_flags & RTN_RTINFO))
                return NULL;

        return found;
}


/*
 *        Deletion
 *
 */

/*
 * Pick a leaf for node fn from its surroundings.
 *
 * An RTN_ROOT node always gets the null entry.  Otherwise the leaf of
 * the left child is returned, or that of the right child when there is
 * no left one; a node without children makes the search continue in
 * its subtree.  Returns NULL when no child is found.
 *
 * The pointer reads assert that table->tb6_lock is held.
 */
static struct fib6_info *fib6_find_prefix(struct net *net,
                                         struct fib6_table *table,
                                         struct fib6_node *fn)
{
        if (fn->fn_flags & RTN_ROOT)
                return net->ipv6.fib6_null_entry;

        for (; fn; fn = FIB6_SUBTREE(fn)) {
                struct fib6_node *left, *right, *child;

                left = rcu_dereference_protected(fn->left,
                                    lockdep_is_held(&table->tb6_lock));
                right = rcu_dereference_protected(fn->right,
                                    lockdep_is_held(&table->tb6_lock));

                /* prefer the left child, fall back to the right one */
                child = left ? left : right;
                if (child)
                        return rcu_dereference_protected(child->leaf,
                                        lockdep_is_held(&table->tb6_lock));
        }

        return NULL;
}

/*
 *        Called to trim the tree of intermediate nodes when possible. "fn"
 *        is the node we want to try and remove.
 *        Need to own table->tb6_lock
 *
 *        For the top-level root the leaf is simply reset to the null entry
 *        and fn itself is returned.  Otherwise nodes are removed bottom-up
 *        for as long as they are not needed, and the parent of the last
 *        node examined is returned.
 */

static struct fib6_node *fib6_repair_tree(struct net *net,
                                          struct fib6_table *table,
                                          struct fib6_node *fn)
{
        int children;
        int nstate;
        struct fib6_node *child;
        struct fib6_walker *w;
        int iter = 0;

        /* Set fn->leaf to null_entry for root node. */
        if (fn->fn_flags & RTN_TL_ROOT) {
                rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry);
                return fn;
        }

        /* Each iteration handles one node, then moves up to its parent. */
        for (;;) {
                struct fib6_node *fn_r = rcu_dereference_protected(fn->right,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_node *fn_l = rcu_dereference_protected(fn->left,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_node *pn = rcu_dereference_protected(fn->parent,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_node *pn_r = rcu_dereference_protected(pn->right,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_info *new_fn_leaf;

                pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
                iter++;

                /* fn is expected to be a route-less, leaf-less, non-top-level node */
                WARN_ON(fn->fn_flags & RTN_RTINFO);
                WARN_ON(fn->fn_flags & RTN_TL_ROOT);
                WARN_ON(fn_leaf);

                /*
                 * children is a bitmask: 1 = right child present,
                 * 2 = left child present.  "child" ends up as the left
                 * child if there is one, otherwise the right one (or NULL).
                 */
                children = 0;
                child = NULL;
                if (fn_r) {
                        child = fn_r;
                        children |= 1;
                }
                if (fn_l) {
                        child = fn_l;
                        children |= 2;
                }

                /*
                 * The node has to stay (two children, a subtree, or a
                 * subtree root with a child): give it a new leaf and stop.
                 */
                if (children == 3 || FIB6_SUBTREE(fn)
#ifdef CONFIG_IPV6_SUBTREES
                    /* Subtree root (i.e. fn) may have one child */
                    || (children && fn->fn_flags & RTN_ROOT)
#endif
                    ) {
                        new_fn_leaf = fib6_find_prefix(net, table, fn);
                        /*
                         * NOTE(review): without RT6_DEBUG >= 2 a NULL result
                         * is passed straight to fib6_info_hold(); presumably
                         * that cannot happen for the nodes reaching this
                         * branch - confirm.
                         */
#if RT6_DEBUG >= 2
                        if (!new_fn_leaf) {
                                WARN_ON(!new_fn_leaf);
                                new_fn_leaf = net->ipv6.fib6_null_entry;
                        }
#endif
                        fib6_info_hold(new_fn_leaf);
                        rcu_assign_pointer(fn->leaf, new_fn_leaf);
                        return pn;
                }

                /*
                 * Unlink fn from its parent: either it was the parent's
                 * subtree, or it is replaced in the parent's left/right
                 * slot by its only child (possibly NULL).  nstate is the
                 * state given to walkers that get moved up to the parent.
                 */
#ifdef CONFIG_IPV6_SUBTREES
                if (FIB6_SUBTREE(pn) == fn) {
                        WARN_ON(!(fn->fn_flags & RTN_ROOT));
                        RCU_INIT_POINTER(pn->subtree, NULL);
                        nstate = FWS_L;
                } else {
                        WARN_ON(fn->fn_flags & RTN_ROOT);
#endif
                        if (pn_r == fn)
                                rcu_assign_pointer(pn->right, child);
                        else if (pn_l == fn)
                                rcu_assign_pointer(pn->left, child);
#if RT6_DEBUG >= 2
                        else
                                WARN_ON(1);
#endif
                        if (child)
                                rcu_assign_pointer(child->parent, pn);
                        nstate = FWS_R;
#ifdef CONFIG_IPV6_SUBTREES
                }
#endif

                /*
                 * Move every walker that is positioned on fn: to the
                 * parent when fn had no child, otherwise to the child
                 * that took fn's place, with the walker state adjusted.
                 */
                read_lock(&net->ipv6.fib6_walker_lock);
                FOR_WALKERS(net, w) {
                        if (!child) {
                                if (w->node == fn) {
                                        pr_debug("W %p adjusted by delnode 1, s=%d/%d\n",
                                                 w, w->state, nstate);
                                        w->node = pn;
                                        w->state = nstate;
                                }
                        } else {
                                if (w->node == fn) {
                                        w->node = child;
                                        if (children&2) {
                                                pr_debug("W %p adjusted by delnode 2, s=%d\n",
                                                         w, w->state);
                                                w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
                                        } else {
                                                pr_debug("W %p adjusted by delnode 2, s=%d\n",
                                                         w, w->state);
                                                w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
                                        }
                                }
                        }
                }
                read_unlock(&net->ipv6.fib6_walker_lock);

                node_free(net, fn);
                /* A parent with routes or a subtree is still needed: done. */
                if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
                        return pn;

                /*
                 * The parent is an intermediate node too: clear its leaf,
                 * release the entry it pointed to and try to trim it next.
                 */
                RCU_INIT_POINTER(pn->leaf, NULL);
                fib6_info_release(pn_leaf);
                fn = pn;
        }
}

/*
 * Unlink the route that *rtp points to from node fn.
 *
 * Besides unlinking, this updates the per-netns route statistics,
 * detaches the route from its multipath siblings, advances any tree
 * walker currently positioned on it, repairs the tree if the node lost
 * its last route, and finally sends the notifications selected by
 * info->skip_notify_kernel / info->skip_notify.
 *
 * The pointer reads assert that table->tb6_lock is held.
 */
static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
                           struct fib6_info __rcu **rtp, struct nl_info *info)
{
        struct fib6_info *leaf, *replace_rt = NULL;
        struct fib6_walker *w;
        struct fib6_info *rt = rcu_dereference_protected(*rtp,
                                    lockdep_is_held(&table->tb6_lock));
        struct net *net = info->nl_net;
        bool notify_del = false;

        /* If the deleted route is the first in the node and it is not part of
         * a multipath route, then we need to replace it with the next route
         * in the node, if exists.
         */
        leaf = rcu_dereference_protected(fn->leaf,
                                         lockdep_is_held(&table->tb6_lock));
        if (leaf == rt && !rt->fib6_nsiblings) {
                if (rcu_access_pointer(rt->fib6_next))
                        replace_rt = rcu_dereference_protected(rt->fib6_next,
                                            lockdep_is_held(&table->tb6_lock));
                else
                        notify_del = true;
        }

        /* Unlink it */
        *rtp = rt->fib6_next;
        rt->fib6_node = NULL;
        net->ipv6.rt6_stats->fib_rt_entries--;
        net->ipv6.rt6_stats->fib_discarded_routes++;

        /* Reset round-robin state, if necessary */
        if (rcu_access_pointer(fn->rr_ptr) == rt)
                fn->rr_ptr = NULL;

        /* Remove this entry from other siblings */
        if (rt->fib6_nsiblings) {
                struct fib6_info *sibling, *next_sibling;

                /* The route is deleted from a multipath route. If this
                 * multipath route is the first route in the node, then we need
                 * to emit a delete notification. Otherwise, we need to skip
                 * the notification.
                 */
                if (rt->fib6_metric == leaf->fib6_metric &&
                    rt6_qualify_for_ecmp(leaf))
                        notify_del = true;
                /* Every remaining sibling now has one sibling less. */
                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->fib6_siblings, fib6_siblings)
                        sibling->fib6_nsiblings--;
                rt->fib6_nsiblings = 0;
                list_del_init(&rt->fib6_siblings);
                rt6_multipath_rebalance(next_sibling);
        }

        /* Adjust walkers */
        read_lock(&net->ipv6.fib6_walker_lock);
        FOR_WALKERS(net, w) {
                /*
                 * A walker that is about to process rt is moved on to the
                 * next route; if there is none it is told to go up.
                 */
                if (w->state == FWS_C && w->leaf == rt) {
                        pr_debug("walker %p adjusted by delroute\n", w);
                        w->leaf = rcu_dereference_protected(rt->fib6_next,
                                            lockdep_is_held(&table->tb6_lock));
                        if (!w->leaf)
                                w->state = FWS_U;
                }
        }
        read_unlock(&net->ipv6.fib6_walker_lock);

        /* If it was last route, call fib6_repair_tree() to:
         * 1. For root node, put back null_entry as how the table was created.
         * 2. For other nodes, expunge its radix tree node.
         */
        if (!rcu_access_pointer(fn->leaf)) {
                if (!(fn->fn_flags & RTN_TL_ROOT)) {
                        fn->fn_flags &= ~RTN_RTINFO;
                        net->ipv6.rt6_stats->fib_route_nodes--;
                }
                /* fn now refers to the node returned by the repair */
                fn = fib6_repair_tree(net, table, fn);
        }

        fib6_purge_rt(rt, fn, net);

        /*
         * In-kernel FIB notifier chain: either a delete event for rt or a
         * replace event announcing the route that took its place.
         */
        if (!info->skip_notify_kernel) {
                if (notify_del)
                        call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
                                                  rt, NULL);
                else if (replace_rt)
                        call_fib6_entry_notifiers_replace(net, replace_rt);
        }
        if (!info->skip_notify)
                inet6_rt_notify(RTM_DELROUTE, rt, info, 0);

        /* presumably drops the reference the tree held on rt - confirm */
        fib6_info_release(rt);
}

/* Need to own table->tb6_lock */
int fib6_del(struct fib6_info *rt, struct nl_info *info)
{
        struct net *net = info->nl_net;
        struct fib6_info __rcu **rtp;
        struct fib6_info __rcu **rtp_next;
        struct fib6_table *table;
        struct fib6_node *fn;

        if (rt == net->ipv6.fib6_null_entry)
                return -ENOENT;

        table = rt->fib6_table;
        fn = rcu_dereference_protected(rt->fib6_node,
                                       lockdep_is_held(&table->tb6_lock));
        if (!fn)
                return -ENOENT;

        WARN_ON(!(fn->fn_flags & RTN_RTINFO));

        /*
         *        Walk the leaf entries looking for ourself
         */

        for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
                struct fib6_info *cur = rcu_dereference_protected(*rtp,
                                        lockdep_is_held(&table->tb6_lock));
                if (rt == cur) {
                        if (fib6_requires_src(cur))
                                fib6_routes_require_src_dec(info->nl_net);
                        fib6_del_route(table, fn, rtp, info);
                        return 0;
                }
                rtp_next = &cur->fib6_next;
        }
        return -ENOENT;
}

/*
 *        Tree traversal function.
 *
 *        Certainly, it is not interrupt safe.
 *        However, it is internally re-entrant wrt itself and fib6_add/fib6_del.
 *        It means that we can modify the tree while walking it
 *        and use this function for garbage collection, clone pruning,
 *        cleaning tree when a device goes down etc. etc.
 *
 *        It guarantees that every node will be traversed,
 *        and that it will be traversed only once.
 *
 *        The walk is a state machine driven by w->state and w->node:
 *        FWS_S (CONFIG_IPV6_SUBTREES only) -> enter the node's subtree,
 *        FWS_L -> go to the left child,
 *        FWS_R -> go to the right child,
 *        FWS_C -> run the callback on the node,
 *        FWS_U -> go back up to the parent.
 *
 *        Callback function w->func may return:
 *        0 -> continue walking.
 *        positive value -> walking is suspended (used by tree dumps,
 *        and probably by gc, if it will be split to several slices)
 *        negative value -> terminate walking.
 *
 *        The function itself returns:
 *        0   -> walk is complete.
 *        >0  -> walk is incomplete (i.e. suspended)
 *        <0  -> walk is terminated by an error.
 *
 *        This function is called with tb6_lock held.
 */

static int fib6_walk_continue(struct fib6_walker *w)
{
        struct fib6_node *fn, *pn, *left, *right;

        /* w->root should always be table->tb6_root */
        WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT));

        for (;;) {
                fn = w->node;
                if (!fn)
                        return 0;

                switch (w->state) {
#ifdef CONFIG_IPV6_SUBTREES
                case FWS_S:
                        /* Enter the subtree first, if the node has one. */
                        if (FIB6_SUBTREE(fn)) {
                                w->node = FIB6_SUBTREE(fn);
                                continue;
                        }
                        w->state = FWS_L;
                        fallthrough;
#endif
                case FWS_L:
                        /* Go down to the left child and start over there. */
                        left = rcu_dereference_protected(fn->left, 1);
                        if (left) {
                                w->node = left;
                                w->state = FWS_INIT;
                                continue;
                        }
                        w->state = FWS_R;
                        fallthrough;
                case FWS_R:
                        /* Then the right child. */
                        right = rcu_dereference_protected(fn->right, 1);
                        if (right) {
                                w->node = right;
                                w->state = FWS_INIT;
                                continue;
                        }
                        /* No children left to visit: process this node. */
                        w->state = FWS_C;
                        w->leaf = rcu_dereference_protected(fn->leaf, 1);
                        fallthrough;
                case FWS_C:
                        /* Only nodes that carry routes are handed to the callback. */
                        if (w->leaf && fn->fn_flags & RTN_RTINFO) {
                                int err;

                                /* w->skip nodes are passed over without calling w->func */
                                if (w->skip) {
                                        w->skip--;
                                        goto skip;
                                }

                                err = w->func(w);
                                if (err)
                                        return err;

                                /*
                                 * Loop again in the same state; the callback
                                 * is responsible for advancing w->leaf.
                                 */
                                w->count++;
                                continue;
                        }
skip:
                        w->state = FWS_U;
                        fallthrough;
                case FWS_U:
                        /* Back at the walk's root: the traversal is complete. */
                        if (fn == w->root)
                                return 0;
                        pn = rcu_dereference_protected(fn->parent, 1);
                        left = rcu_dereference_protected(pn->left, 1);
                        right = rcu_dereference_protected(pn->right, 1);
                        w->node = pn;
                        /*
                         * The next state of the parent depends on where we
                         * came from: subtree -> left, left -> right,
                         * right -> process the parent itself.
                         */
#ifdef CONFIG_IPV6_SUBTREES
                        if (FIB6_SUBTREE(pn) == fn) {
                                WARN_ON(!(fn->fn_flags & RTN_ROOT));
                                w->state = FWS_L;
                                continue;
                        }
#endif
                        if (left == fn) {
                                w->state = FWS_R;
                                continue;
                        }
                        if (right == fn) {
                                w->state = FWS_C;
                                w->leaf = rcu_dereference_protected(w->node->leaf, 1);
                                continue;
                        }
#if RT6_DEBUG >= 2
                        WARN_ON(1);
#endif
                }
        }
}

/*
 * Start a walk at w->root: reset the walker's position, register the
 * walker and run it.  The walker is unregistered again unless the walk
 * was suspended (positive result).
 *
 * Returns the result of fib6_walk_continue().
 */
static int fib6_walk(struct net *net, struct fib6_walker *w)
{
        int ret;

        w->node = w->root;
        w->state = FWS_INIT;

        fib6_walker_link(net, w);

        ret = fib6_walk_continue(w);
        if (ret > 0)
                return ret;

        fib6_walker_unlink(net, w);
        return ret;
}

/*
 * Walker callback behind fib6_clean_tree().
 *
 * Optionally stamps the current node with the cleaner's serial number,
 * then runs c->func on every route from w->leaf onwards.  A verdict of
 * -1 deletes the route (and ends this invocation once the deletion
 * succeeded), -2 jumps to the last sibling of a multipath route, and 0
 * moves on.  Without c->func only the serial number is updated.
 *
 * Always returns 0.
 */
static int fib6_clean_node(struct fib6_walker *w)
{
        struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
        struct nl_info info = {
                .nl_net = c->net,
                .skip_notify = c->skip_notify,
        };
        struct fib6_info *rt;
        int verdict;
        int err;

        if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
            READ_ONCE(w->node->fn_sernum) != c->sernum)
                WRITE_ONCE(w->node->fn_sernum, c->sernum);

        if (!c->func) {
                WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
                w->leaf = NULL;
                return 0;
        }

        for_each_fib6_walker_rt(w) {
                verdict = c->func(rt, c->arg);

                switch (verdict) {
                case -1:
                        /* remember our position before the route goes away */
                        w->leaf = rt;
                        err = fib6_del(rt, &info);
                        if (!err)
                                return 0;
#if RT6_DEBUG >= 2
                        pr_debug("%s: del failed: rt=%p@%p err=%d\n",
                                 __func__, rt,
                                 rcu_access_pointer(rt->fib6_node),
                                 err);
#endif
                        continue;
                case -2:
                        if (WARN_ON(!rt->fib6_nsiblings))
                                continue;
                        rt = list_last_entry(&rt->fib6_siblings,
                                             struct fib6_info, fib6_siblings);
                        continue;
                default:
                        WARN_ON(verdict != 0);
                        break;
                }
        }
        w->leaf = rt;
        return 0;
}

/*
 *        Convenient frontend to tree walker.
 *
 *        func is called on each route.
 *                It may return -2 -> skip multipath route.
 *                              -1 -> delete this route.
 *                              0  -> continue walking
 */

static void fib6_clean_tree(struct net *net, struct fib6_node *root,
                            int (*func)(struct fib6_info *, void *arg),
                            int sernum, void *arg, bool skip_notify)
{
        struct fib6_cleaner c;

        c.w.root = root;
        c.w.func = fib6_clean_node;
        c.w.count = 0;
        c.w.skip = 0;
        c.w.skip_in_node = 0;
        c.func = func;
        c.sernum = sernum;
        c.arg = arg;
        c.net = net;
        c.skip_notify = skip_notify;

        fib6_walk(net, &c.w);
}

/*
 * Run fib6_clean_tree() on every table found in net's table hash.
 * The hash chains are traversed under rcu_read_lock(), and each table's
 * tb6_lock is taken with spin_lock_bh() around the walk of its tree.
 */
static void __fib6_clean_all(struct net *net,
                             int (*func)(struct fib6_info *, void *),
                             int sernum, void *arg, bool skip_notify)
{
        unsigned int bucket;

        rcu_read_lock();
        for (bucket = 0; bucket < FIB6_TABLE_HASHSZ; bucket++) {
                struct hlist_head *chain = &net->ipv6.fib_table_hash[bucket];
                struct fib6_table *tb;

                hlist_for_each_entry_rcu(tb, chain, tb6_hlist) {
                        spin_lock_bh(&tb->tb6_lock);
                        fib6_clean_tree(net, &tb->tb6_root,
                                        func, sernum, arg, skip_notify);
                        spin_unlock_bh(&tb->tb6_lock);
                }
        }
        rcu_read_unlock();
}

/*
 * Call func(rt, arg) on every route of every table in net.  Node serial
 * numbers are left alone (FIB6_NO_SERNUM_CHANGE) and skip_notify is
 * false for any route that gets deleted.
 */
void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
                    void *arg)
{
        const bool skip_notify = false;

        __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, skip_notify);
}

/*
 * Same walk as fib6_clean_all(), but with skip_notify set to true for
 * any route that gets deleted.
 */
void fib6_clean_all_skip_notify(struct net *net,
                                int (*func)(struct fib6_info *, void *),
                                void *arg)
{
        const bool skip_notify = true;

        __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, skip_notify);
}

/*
 * Walk all tables of net with a serial number freshly obtained from
 * fib6_new_sernum() and no per-route callback, so that the nodes handed
 * to fib6_clean_node() get stamped with the new serial number.
 */
static void fib6_flush_trees(struct net *net)
{
        __fib6_clean_all(net, NULL, fib6_new_sernum(net), NULL, false);
}

/*
 *        Garbage collection
 */

/*
 * GC verdict for a single route.
 *
 * Returns -1 when the route has RTF_EXPIRES set together with a
 * non-zero expiry time that has already passed, so the caller should
 * delete it.  Otherwise returns 0: a route that is still waiting for
 * its expiry bumps gc_args->more, and the route's exception table is
 * aged via rt6_age_exceptions().
 */
static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args)
{
        unsigned long now = jiffies;
        bool may_expire = (rt->fib6_flags & RTF_EXPIRES) && rt->expires;

        /*
         *        check addrconf expiration here.
         *        Routes are expired even if they are in use.
         */
        if (may_expire && time_after(now, rt->expires)) {
                pr_debug("expiring %p\n", rt);
                return -1;
        }
        if (may_expire)
                gc_args->more++;

        /*        Also age clones in the exception table.
         *        Note, that clones are aged out
         *        only if they are not in use now.
         */
        rt6_age_exceptions(rt, gc_args, now);

        return 0;
}

static void fib6_gc_table(struct net *net,
                          struct fib6_table *tb6,
                          struct fib6_gc_args *gc_args)
{
        struct fib6_info *rt;
        struct hlist_node *n;
        struct nl_info info = {
                .nl_net = net,
                .skip_notify = false,
        };

        hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link)
                if (fib6_age(rt, gc_args) == -1)
                        fib6_del(rt, &info);
}

/*
 * Run fib6_gc_table() on every table found in net's table hash.  The
 * hash chains are traversed under rcu_read_lock(), and each table's
 * tb6_lock is taken with spin_lock_bh() while its GC list is processed.
 */
static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args)
{
        unsigned int bucket;

        rcu_read_lock();
        for (bucket = 0; bucket < FIB6_TABLE_HASHSZ; bucket++) {
                struct hlist_head *chain = &net->ipv6.fib_table_hash[bucket];
                struct fib6_table *tb;

                hlist_for_each_entry_rcu(tb, chain, tb6_hlist) {
                        spin_lock_bh(&tb->tb6_lock);
                        fib6_gc_table(net, tb, gc_args);
                        spin_unlock_bh(&tb->tb6_lock);
                }
        }
        rcu_read_unlock();
}

void fib6_run_gc(unsigned long expires, struct net *net, bool force)
{
        struct fib6_gc_args gc_args;
        unsigned long now;

        if (force) {
                spin_lock_bh(&net->ipv6.fib6_gc_lock);
        } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) {
                mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
                return;
        }
        gc_args.timeout = expires ? (int)expires :
                          net->ipv6.sysctl.ip6_rt_gc_interval;
        gc_args.more = 0;

        fib6_gc_all(net, &gc_args);
        now = jiffies;
        net->ipv6.ip6_rt_last_gc = now;

        if (gc_args.more)
                mod_timer(&net->ipv6.ip6_fib_timer,
                          round_jiffies(now
                                        + net->ipv6.sysctl.ip6_rt_gc_interval));
        else
                del_timer(&net->ipv6.ip6_fib_timer);
        spin_unlock_bh(&net->ipv6.fib6_gc_lock);
}

/*
 * GC timer callback: recover the struct net embedding the timer and run a
 * forced GC pass with expires == 0 (i.e. the sysctl interval is used).
 */
static void fib6_gc_timer_cb(struct timer_list *t)
{
        struct net *net = from_timer(net, t, ipv6.ip6_fib_timer);

        fib6_run_gc(0, net, true);
}

/*
 * Per-netns initialisation of the IPv6 FIB: sets up the notifier, the GC
 * lock and timer and the walker list, then allocates rt6_stats, the table
 * hash and the main table (plus the local table when
 * CONFIG_IPV6_MULTIPLE_TABLES is set).
 *
 * Returns 0 on success, the fib6_notifier_init() error if that call fails,
 * or -ENOMEM if any allocation fails; in the latter case everything
 * allocated so far is freed and the notifier is torn down.
 */
static int __net_init fib6_net_init(struct net *net)
{
        size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
        int err;

        err = fib6_notifier_init(net);
        if (err)
                return err;

        /* Default to 3-tuple */
        net->ipv6.sysctl.multipath_hash_fields =
                FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;

        spin_lock_init(&net->ipv6.fib6_gc_lock);
        rwlock_init(&net->ipv6.fib6_walker_lock);
        INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
        timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0);

        net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
        if (!net->ipv6.rt6_stats)
                goto out_notifier;

        /* Avoid false sharing : Use at least a full cache line */
        size = max_t(size_t, size, L1_CACHE_BYTES);

        net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
        if (!net->ipv6.fib_table_hash)
                goto out_rt6_stats;

        net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
                                          GFP_KERNEL);
        if (!net->ipv6.fib6_main_tbl)
                goto out_fib_table_hash;

        /* The main table's root initially points at the null entry. */
        net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
        rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
                           net->ipv6.fib6_null_entry);
        net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
                RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
        inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
        INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        /* Same setup for the local table, which only exists in this config. */
        net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
                                           GFP_KERNEL);
        if (!net->ipv6.fib6_local_tbl)
                goto out_fib6_main_tbl;
        net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
        rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
                           net->ipv6.fib6_null_entry);
        net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
                RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
        inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
        INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist);
#endif
        fib6_tables_init(net);

        return 0;

        /* Unwind in reverse order of allocation. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_fib6_main_tbl:
        kfree(net->ipv6.fib6_main_tbl);
#endif
out_fib_table_hash:
        kfree(net->ipv6.fib_table_hash);
out_rt6_stats:
        kfree(net->ipv6.rt6_stats);
out_notifier:
        fib6_notifier_exit(net);
        return -ENOMEM;
}

/*
 * Per-netns teardown: stop the GC timer (synchronously), unlink and free
 * every table found in the table hash, then release the hash, the stats
 * block and the notifier.
 */
static void fib6_net_exit(struct net *net)
{
        struct hlist_node *next;
        struct fib6_table *tb;
        unsigned int bucket;

        del_timer_sync(&net->ipv6.ip6_fib_timer);

        for (bucket = 0; bucket < FIB6_TABLE_HASHSZ; bucket++) {
                /* _safe iteration: each table is unlinked as we go. */
                hlist_for_each_entry_safe(tb, next,
                                          &net->ipv6.fib_table_hash[bucket],
                                          tb6_hlist) {
                        hlist_del(&tb->tb6_hlist);
                        fib6_free_table(tb);
                }
        }

        kfree(net->ipv6.fib_table_hash);
        kfree(net->ipv6.rt6_stats);
        fib6_notifier_exit(net);
}

/* Per-netns hooks for the IPv6 FIB; registered from fib6_init(). */
static struct pernet_operations fib6_net_ops = {
        .init = fib6_net_init,
        .exit = fib6_net_exit,
};

/*
 * Module-level init for the IPv6 FIB: create the fib6_node slab cache,
 * register the per-netns operations and the RTM_GETROUTE dump handler, and
 * finally publish fib6_flush_trees through __fib6_flush_trees.
 *
 * Returns 0 on success, -ENOMEM if the cache cannot be created, or the
 * error of the failing registration (earlier steps are undone).
 */
int __init fib6_init(void)
{
        int ret;

        fib6_node_kmem = KMEM_CACHE(fib6_node,
                                    SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT);
        if (!fib6_node_kmem)
                return -ENOMEM;

        ret = register_pernet_subsys(&fib6_net_ops);
        if (ret)
                goto err_destroy_cache;

        ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL,
                                   inet6_dump_fib, RTNL_FLAG_DUMP_UNLOCKED |
                                   RTNL_FLAG_DUMP_SPLIT_NLM_DONE);
        if (ret)
                goto err_unregister_pernet;

        __fib6_flush_trees = fib6_flush_trees;
        return 0;

err_unregister_pernet:
        unregister_pernet_subsys(&fib6_net_ops);
err_destroy_cache:
        kmem_cache_destroy(fib6_node_kmem);
        return ret;
}

/*
 * Undo fib6_init(): unregister the per-netns operations, then destroy the
 * fib6_node slab cache.
 */
void fib6_gc_cleanup(void)
{
        unregister_pernet_subsys(&fib6_net_ops);
        kmem_cache_destroy(fib6_node_kmem);
}

#ifdef CONFIG_PROC_FS
/*
 * Print one text line for route @v: destination address/prefix length,
 * source address/prefix length (all zeroes without CONFIG_IPV6_SUBTREES),
 * gateway (all zeroes when the nexthop has no gateway family), metric,
 * reference count, a constant 0, flags and device name.
 *
 * Clears iter->w.leaf and returns 0.
 */
static int ipv6_route_native_seq_show(struct seq_file *seq, void *v)
{
        struct ipv6_route_iter *iter = seq->private;
        struct fib6_info *rt = v;
        unsigned int flags = rt->fib6_flags;
        const struct net_device *dev;
        const char *devname;
        struct fib6_nh *nh;

        /* Use the nexthop object's fib6_nh when the route has one. */
        nh = rt->nh ? nexthop_fib6_nh(rt->nh) : rt->fib6_nh;

        seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);

#ifdef CONFIG_IPV6_SUBTREES
        seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
#else
        seq_puts(seq, "00000000000000000000000000000000 00 ");
#endif
        if (!nh->fib_nh_gw_family) {
                seq_puts(seq, "00000000000000000000000000000000");
        } else {
                /* RTF_GATEWAY is reported only in the printed flags. */
                flags |= RTF_GATEWAY;
                seq_printf(seq, "%pi6", &nh->fib_nh_gw6);
        }

        dev = nh->fib_nh_dev;
        devname = dev ? dev->name : "";
        seq_printf(seq, " %08x %08x %08x %08x %8s\n",
                   rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
                   flags, devname);

        iter->w.leaf = NULL;
        return 0;
}

/*
 * Walker callback installed by ipv6_route_seq_setup_walk().
 *
 * Returns 1 immediately when iter->skip is zero.  Otherwise it advances
 * iter->w.leaf along the fib6_next chain, decrementing iter->skip for each
 * step: it returns 1 as soon as skip reaches zero with a leaf still
 * present, and 0 once the chain runs out.
 */
static int ipv6_route_yield(struct fib6_walker *w)
{
        struct ipv6_route_iter *iter = w->args;

        if (!iter->skip)
                return 1;

        do {
                /* The lockdep expression documents that tb6_lock is held. */
                iter->w.leaf = rcu_dereference_protected(
                                iter->w.leaf->fib6_next,
                                lockdep_is_held(&iter->tbl->tb6_lock));
                iter->skip--;
                if (!iter->skip && iter->w.leaf)
                        return 1;
        } while (iter->w.leaf);

        return 0;
}

static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
                                      struct net *net)
{
        memset(&iter->w, 0, sizeof(iter->w));
        iter->w.func = ipv6_route_yield;
        iter->w.root = &iter->tbl->tb6_root;
        iter->w.state = FWS_INIT;
        iter->w.node = iter->w.root;
        iter->w.args = iter;
        iter->sernum = READ_ONCE(iter->w.root->fn_sernum);
        INIT_LIST_HEAD(&iter->w.lh);
        fib6_walker_link(net, &iter->w);
}

/*
 * Return the table following @tbl in @net's table hash, or the first table
 * when @tbl is NULL.  Returns NULL when no further table exists.
 *
 * The search first looks at @tbl's successor in its own hash chain and then
 * at the heads of the subsequent buckets.
 */
static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
                                                    struct net *net)
{
        struct hlist_node *node = NULL;
        unsigned int h = 0;

        if (tbl) {
                /* Continue after @tbl: same chain first, next bucket after. */
                h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
                node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist));
        }

        for (; !node && h < FIB6_TABLE_HASHSZ; h++)
                node = rcu_dereference(
                        hlist_first_rcu(&net->ipv6.fib_table_hash[h]));

        return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
}

/*
 * If the root's serial number differs from the one recorded in @iter,
 * restart the walk from the root and ask the walker to skip the entries
 * already counted (w.skip = w.count).  Warns if w.skip was still non-zero.
 */
static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
{
        int sernum = READ_ONCE(iter->w.root->fn_sernum);

        if (iter->sernum == sernum)
                return;

        iter->sernum = sernum;
        iter->w.state = FWS_INIT;
        iter->w.node = iter->w.root;
        WARN_ON(iter->w.skip);
        iter->w.skip = iter->w.count;
}

/*
 * seq_file ->next: advance *@pos and return the next route, or NULL at the
 * end of the last table (or when the walk reports an error).
 *
 * When @v is a route with a fib6_next sibling, that sibling is returned
 * directly.  Otherwise the table walk is continued under tb6_lock; when a
 * table is exhausted the walker is unlinked and the walk is restarted on
 * the next table.
 */
static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct ipv6_route_iter *iter = seq->private;
        struct net *net = seq_file_net(seq);
        struct fib6_info *n;
        int r;

        ++(*pos);
        if (v) {
                n = rcu_dereference(((struct fib6_info *)v)->fib6_next);
                if (n)
                        return n;
        }

        for (;;) {
                ipv6_route_check_sernum(iter);

                spin_lock_bh(&iter->tbl->tb6_lock);
                r = fib6_walk_continue(&iter->w);
                spin_unlock_bh(&iter->tbl->tb6_lock);

                if (r > 0)
                        return iter->w.leaf;

                /* r <= 0: this walk is over, either on error or at the end. */
                fib6_walker_unlink(net, &iter->w);
                if (r < 0)
                        return NULL;

                iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
                if (!iter->tbl)
                        return NULL;

                ipv6_route_seq_setup_walk(iter, net);
        }
}

/*
 * seq_file ->start: take rcu_read_lock(), pick the first table and set up a
 * walk that skips *@pos entries.  Returns the first route to show, or NULL
 * when there is no table.  The RCU read lock stays held on return.
 */
static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(RCU)
{
        struct ipv6_route_iter *iter = seq->private;
        struct net *net = seq_file_net(seq);
        loff_t p = 0;

        rcu_read_lock();
        iter->tbl = ipv6_route_seq_next_table(NULL, net);
        iter->skip = *pos;

        if (!iter->tbl)
                return NULL;

        ipv6_route_seq_setup_walk(iter, net);
        /* A scratch position is passed so the caller's *pos is not bumped. */
        return ipv6_route_seq_next(seq, NULL, &p);
}

static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
{
        struct fib6_walker *w = &iter->w;
        return w->node && !(w->state == FWS_U && w->node == w->root);
}

/*
 * seq_file ->stop helper: unlink the walker if ipv6_route_iter_active()
 * says it is still in progress, then drop the RCU read lock taken in
 * ipv6_route_seq_start().
 */
static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        struct net *net = seq_file_net(seq);
        struct ipv6_route_iter *iter = seq->private;

        if (ipv6_route_iter_active(iter))
                fib6_walker_unlink(net, &iter->w);

        rcu_read_unlock();
}

#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
/*
 * Run BPF iterator program @prog with a context holding @meta and the route
 * @v (which may be NULL); returns bpf_iter_run_prog()'s result.
 */
static int ipv6_route_prog_seq_show(struct bpf_prog *prog,
                                    struct bpf_iter_meta *meta,
                                    void *v)
{
        struct bpf_iter__ipv6_route ctx;

        ctx.meta = meta;
        ctx.rt = v;
        return bpf_iter_run_prog(prog, &ctx);
}

/*
 * seq_file ->show (BPF-capable build): if a BPF iterator program is
 * attached, run it on @v and clear iter->w.leaf; otherwise fall back to the
 * native text output.
 */
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
        struct ipv6_route_iter *iter = seq->private;
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;
        int ret;

        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, false);
        if (prog) {
                ret = ipv6_route_prog_seq_show(prog, &meta, v);
                iter->w.leaf = NULL;
                return ret;
        }

        return ipv6_route_native_seq_show(seq, v);
}

/*
 * seq_file ->stop (BPF-capable build).  When @v is NULL and a BPF program
 * is returned by bpf_iter_get_info(&meta, true), the program is run once
 * with a NULL route; its return value is deliberately ignored.  The native
 * stop handler is then always run.
 */
static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
{
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;

        if (!v) {
                meta.seq = seq;
                prog = bpf_iter_get_info(&meta, true);
                if (prog)
                        (void)ipv6_route_prog_seq_show(prog, &meta, v);
        }

        ipv6_route_native_seq_stop(seq, v);
}
#else
/* seq_file ->show without BPF iterator support: native output only. */
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
        return ipv6_route_native_seq_show(seq, v);
}

/* seq_file ->stop without BPF iterator support: native stop only. */
static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
{
        ipv6_route_native_seq_stop(seq, v);
}
#endif

/* seq_file operations for iterating over the IPv6 routes. */
const struct seq_operations ipv6_route_seq_ops = {
        .start        = ipv6_route_seq_start,
        .next        = ipv6_route_seq_next,
        .stop        = ipv6_route_seq_stop,
        .show        = ipv6_route_seq_show
};
#endif /* CONFIG_PROC_FS */



















































    1 
















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Synchronous Cryptographic Hash operations.
 *
 * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/scatterwalk.h>
#include <linux/cryptouser.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <net/netlink.h>

#include "hash.h"

/*
 * Placeholder ->setkey that always fails with -ENOSYS; installed by
 * shash_prepare_alg() for algorithms that provide no setkey of their own.
 */
int shash_no_setkey(struct crypto_shash *tfm, const u8 *key,
                    unsigned int keylen)
{
        return -ENOSYS;
}
EXPORT_SYMBOL_GPL(shash_no_setkey);

/* Flag @tfm as CRYPTO_TFM_NEED_KEY when @alg reports that it needs a key. */
static void shash_set_needkey(struct crypto_shash *tfm, struct shash_alg *alg)
{
        if (!crypto_shash_alg_needs_key(alg))
                return;

        crypto_shash_set_flags(tfm, CRYPTO_TFM_NEED_KEY);
}

/*
 * Set the key of @tfm through the algorithm's ->setkey.
 *
 * On success CRYPTO_TFM_NEED_KEY is cleared and 0 is returned.  On failure
 * the flag is re-evaluated via shash_set_needkey() and the ->setkey error
 * is returned.
 */
int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key,
                        unsigned int keylen)
{
        struct shash_alg *alg = crypto_shash_alg(tfm);
        int err = alg->setkey(tfm, key, keylen);

        if (unlikely(err))
                shash_set_needkey(tfm, alg);
        else
                crypto_shash_clear_flags(tfm, CRYPTO_TFM_NEED_KEY);

        return err;
}
EXPORT_SYMBOL_GPL(crypto_shash_setkey);

int crypto_shash_update(struct shash_desc *desc, const u8 *data,
                        unsigned int len)
{
        return crypto_shash_alg(desc->tfm)->update(desc, data, len);
}
EXPORT_SYMBOL_GPL(crypto_shash_update);

/* Call the algorithm's ->final for @desc, passing @out through. */
int crypto_shash_final(struct shash_desc *desc, u8 *out)
{
        struct shash_alg *alg = crypto_shash_alg(desc->tfm);

        return alg->final(desc, out);
}
EXPORT_SYMBOL_GPL(crypto_shash_final);

/*
 * Default ->finup: ->update followed by ->final.  ->final is skipped and
 * the error returned if ->update fails.
 */
static int shash_default_finup(struct shash_desc *desc, const u8 *data,
                               unsigned int len, u8 *out)
{
        struct shash_alg *alg = crypto_shash_alg(desc->tfm);
        int err;

        err = alg->update(desc, data, len);
        if (err)
                return err;

        return alg->final(desc, out);
}

/* Call the algorithm's ->finup for @desc with @data/@len and @out. */
int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
                       unsigned int len, u8 *out)
{
        struct shash_alg *alg = crypto_shash_alg(desc->tfm);

        return alg->finup(desc, data, len, out);
}
EXPORT_SYMBOL_GPL(crypto_shash_finup);

/*
 * Default ->digest: ->init followed by ->finup.  ->finup is skipped and the
 * error returned if ->init fails.
 */
static int shash_default_digest(struct shash_desc *desc, const u8 *data,
                                unsigned int len, u8 *out)
{
        struct shash_alg *alg = crypto_shash_alg(desc->tfm);
        int err;

        err = alg->init(desc);
        if (err)
                return err;

        return alg->finup(desc, data, len, out);
}

/*
 * One-shot digest of @data via the algorithm's ->digest.
 *
 * Returns -ENOKEY if the tfm is still flagged CRYPTO_TFM_NEED_KEY,
 * otherwise the ->digest result.
 */
int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
                        unsigned int len, u8 *out)
{
        struct crypto_shash *hash = desc->tfm;
        struct shash_alg *alg;

        if (crypto_shash_get_flags(hash) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        alg = crypto_shash_alg(hash);
        return alg->digest(desc, data, len, out);
}
EXPORT_SYMBOL_GPL(crypto_shash_digest);

/*
 * Convenience one-shot digest that needs only a tfm: a descriptor is
 * declared with SHASH_DESC_ON_STACK(), used for crypto_shash_digest(), and
 * passed to shash_desc_zero() before returning.
 *
 * Returns the crypto_shash_digest() result.
 */
int crypto_shash_tfm_digest(struct crypto_shash *tfm, const u8 *data,
                            unsigned int len, u8 *out)
{
        SHASH_DESC_ON_STACK(desc, tfm);
        int err;

        desc->tfm = tfm;

        err = crypto_shash_digest(desc, data, len, out);

        /* Zero the descriptor regardless of the digest result. */
        shash_desc_zero(desc);

        return err;
}
EXPORT_SYMBOL_GPL(crypto_shash_tfm_digest);

/*
 * Export the hash state of @desc into @out.
 *
 * Uses the algorithm's ->export when provided; otherwise the descriptor
 * context (crypto_shash_descsize() bytes) is copied verbatim and 0 is
 * returned.
 */
int crypto_shash_export(struct shash_desc *desc, void *out)
{
        struct crypto_shash *tfm = desc->tfm;
        struct shash_alg *alg = crypto_shash_alg(tfm);

        if (!alg->export) {
                memcpy(out, shash_desc_ctx(desc), crypto_shash_descsize(tfm));
                return 0;
        }

        return alg->export(desc, out);
}
EXPORT_SYMBOL_GPL(crypto_shash_export);

/*
 * Import a previously exported hash state from @in into @desc.
 *
 * Returns -ENOKEY if the tfm is still flagged CRYPTO_TFM_NEED_KEY.  Uses
 * the algorithm's ->import when provided; otherwise
 * crypto_shash_descsize() bytes are copied into the descriptor context and
 * 0 is returned.
 */
int crypto_shash_import(struct shash_desc *desc, const void *in)
{
        struct crypto_shash *tfm = desc->tfm;
        struct shash_alg *alg = crypto_shash_alg(tfm);

        if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        if (!alg->import) {
                memcpy(shash_desc_ctx(desc), in, crypto_shash_descsize(tfm));
                return 0;
        }

        return alg->import(desc, in);
}
EXPORT_SYMBOL_GPL(crypto_shash_import);

static void crypto_shash_exit_tfm(struct crypto_tfm *tfm)
{
        struct crypto_shash *hash = __crypto_shash_cast(tfm);
        struct shash_alg *alg = crypto_shash_alg(hash);

        alg->exit_tfm(hash);
}

/*
 * Type-level tfm initialisation for shash.
 *
 * Copies the algorithm's descsize into the tfm, sets CRYPTO_TFM_NEED_KEY if
 * required, installs the exit hook when the algorithm has ->exit_tfm, and
 * finally runs the optional ->init_tfm.
 *
 * Returns 0 on success, the ->init_tfm error, or -EINVAL if ->init_tfm left
 * descsize above HASH_MAX_DESCSIZE.
 */
static int crypto_shash_init_tfm(struct crypto_tfm *tfm)
{
        struct crypto_shash *hash = __crypto_shash_cast(tfm);
        struct shash_alg *alg = crypto_shash_alg(hash);
        int err;

        hash->descsize = alg->descsize;

        shash_set_needkey(hash, alg);

        if (alg->exit_tfm)
                tfm->exit = crypto_shash_exit_tfm;

        if (!alg->init_tfm)
                return 0;

        err = alg->init_tfm(hash);
        if (err)
                return err;

        /* ->init_tfm() may have increased the descsize. */
        if (WARN_ON_ONCE(hash->descsize > HASH_MAX_DESCSIZE)) {
                /* ->init_tfm() succeeded, so undo it before failing. */
                if (alg->exit_tfm)
                        alg->exit_tfm(hash);
                return -EINVAL;
        }

        return 0;
}

/*
 * Type-level ->free: hand the instance to its own ->free callback.
 * shash_register_instance() refuses instances without one.
 */
static void crypto_shash_free_instance(struct crypto_instance *inst)
{
        struct shash_instance *shash = shash_instance(inst);

        shash->free(shash);
}

/*
 * Fill a crypto_report_hash (type "shash", block size, digest size) for
 * @alg and append it to @skb as a CRYPTOCFGA_REPORT_HASH attribute.
 * Returns the nla_put() result.
 */
static int __maybe_unused crypto_shash_report(
        struct sk_buff *skb, struct crypto_alg *alg)
{
        struct crypto_report_hash rhash;
        struct shash_alg *salg = __crypto_shash_alg(alg);

        /* Zero the whole struct first: all sizeof(rhash) bytes are copied out. */
        memset(&rhash, 0, sizeof(rhash));

        strscpy(rhash.type, "shash", sizeof(rhash.type));

        rhash.blocksize = alg->cra_blocksize;
        rhash.digestsize = salg->digestsize;

        return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash);
}

static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
        __maybe_unused;
static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
{
        struct shash_alg *salg = __crypto_shash_alg(alg);

        seq_printf(m, "type         : shash\n");
        seq_printf(m, "blocksize    : %u\n", alg->cra_blocksize);
        seq_printf(m, "digestsize   : %u\n", salg->digestsize);
}

/*
 * Frontend type descriptor for synchronous hashes.  ->show and ->report are
 * only wired up when procfs / the crypto user interface are configured.
 */
const struct crypto_type crypto_shash_type = {
        .extsize = crypto_alg_extsize,
        .init_tfm = crypto_shash_init_tfm,
        .free = crypto_shash_free_instance,
#ifdef CONFIG_PROC_FS
        .show = crypto_shash_show,
#endif
#if IS_ENABLED(CONFIG_CRYPTO_USER)
        .report = crypto_shash_report,
#endif
        .maskclear = ~CRYPTO_ALG_TYPE_MASK,
        .maskset = CRYPTO_ALG_TYPE_MASK,
        .type = CRYPTO_ALG_TYPE_SHASH,
        .tfmsize = offsetof(struct crypto_shash, base),
};

/*
 * Set @spawn's frontend to the shash type and grab the algorithm @name for
 * @inst via crypto_grab_spawn(); returns that call's result.
 */
int crypto_grab_shash(struct crypto_shash_spawn *spawn,
                      struct crypto_instance *inst,
                      const char *name, u32 type, u32 mask)
{
        spawn->base.frontend = &crypto_shash_type;
        return crypto_grab_spawn(&spawn->base, inst, name, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_grab_shash);

/*
 * Allocate a shash tfm for @alg_name with the given @type/@mask; thin
 * wrapper around crypto_alloc_tfm() using crypto_shash_type.
 */
struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
                                        u32 mask)
{
        return crypto_alloc_tfm(alg_name, &crypto_shash_type, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_alloc_shash);

/*
 * Query whether a shash algorithm @alg_name matching @type/@mask exists;
 * returns the crypto_type_has_alg() result.
 */
int crypto_has_shash(const char *alg_name, u32 type, u32 mask)
{
        return crypto_type_has_alg(alg_name, &crypto_shash_type, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_has_shash);

/*
 * Clone @hash.
 *
 * For algorithms without a setkey, crypto_tfm_get() is called on the
 * existing tfm and @hash itself is returned (presumably sharing the tfm by
 * reference -- confirm against crypto_tfm_get()).  Otherwise a new tfm is
 * created with crypto_clone_tfm(), descsize is copied over and the
 * algorithm's optional ->clone_tfm is run.
 *
 * Returns the clone or an ERR_PTR: -ENOSYS when the algorithm has an
 * init_tfm/cra_init but no ->clone_tfm, or the error from a failing step.
 */
struct crypto_shash *crypto_clone_shash(struct crypto_shash *hash)
{
        struct crypto_tfm *tfm = crypto_shash_tfm(hash);
        struct shash_alg *alg = crypto_shash_alg(hash);
        struct crypto_shash *nhash;
        int err;

        if (!crypto_shash_alg_has_setkey(alg)) {
                tfm = crypto_tfm_get(tfm);
                if (IS_ERR(tfm))
                        return ERR_CAST(tfm);

                return hash;
        }

        /* Init hooks exist but there is no ->clone_tfm to mirror them. */
        if (!alg->clone_tfm && (alg->init_tfm || alg->base.cra_init))
                return ERR_PTR(-ENOSYS);

        nhash = crypto_clone_tfm(&crypto_shash_type, tfm);
        if (IS_ERR(nhash))
                return nhash;

        nhash->descsize = hash->descsize;

        if (alg->clone_tfm) {
                err = alg->clone_tfm(nhash, hash);
                if (err) {
                        crypto_free_shash(nhash);
                        return ERR_PTR(err);
                }
        }

        return nhash;
}
EXPORT_SYMBOL_GPL(crypto_clone_shash);

/*
 * Common validation for hash algorithms before registration.
 *
 * Returns -EINVAL if digestsize exceeds HASH_MAX_DIGESTSIZE or if an
 * alignmask is set; otherwise clears the type bits in cra_flags and
 * returns 0.
 */
int hash_prepare_alg(struct hash_alg_common *alg)
{
        /* alignmask is not useful for hashes, so it is not supported. */
        if (alg->digestsize > HASH_MAX_DIGESTSIZE || alg->base.cra_alignmask)
                return -EINVAL;

        alg->base.cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
        return 0;
}

/*
 * Validate @alg and fill in defaults before registration.
 *
 * Returns -EINVAL if descsize exceeds HASH_MAX_DESCSIZE or if only one of
 * ->export/->import is provided, or the hash_prepare_alg() error;
 * otherwise 0.
 */
static int shash_prepare_alg(struct shash_alg *alg)
{
        int err;

        if (alg->descsize > HASH_MAX_DESCSIZE)
                return -EINVAL;

        /* ->export and ->import must be provided together or not at all. */
        if (!alg->export != !alg->import)
                return -EINVAL;

        err = hash_prepare_alg(&alg->halg);
        if (err)
                return err;

        alg->halg.base.cra_type = &crypto_shash_type;
        alg->halg.base.cra_flags |= CRYPTO_ALG_TYPE_SHASH;

        /*
         * Optional callbacks.  A missing one is either replaced by a
         * default here, or left NULL and tested for in crypto_shash_*(),
         * which saves an indirect call when the default behaviour is
         * wanted.  ->finup and ->digest get defaults, since algorithms
         * should implement them anyway for best performance.  ->import
         * and ->export stay NULL: the common and fastest case is a plain
         * memcpy of the shash_desc_ctx, done without an indirect call.
         */
        if (!alg->finup)
                alg->finup = shash_default_finup;
        if (!alg->digest)
                alg->digest = shash_default_digest;
        if (!alg->export)
                alg->halg.statesize = alg->descsize;
        if (!alg->setkey)
                alg->setkey = shash_no_setkey;

        return 0;
}

/*
 * Prepare and register a single shash algorithm.  Returns the
 * shash_prepare_alg() error if preparation fails, otherwise the
 * crypto_register_alg() result.
 */
int crypto_register_shash(struct shash_alg *alg)
{
        int err;

        err = shash_prepare_alg(alg);
        if (!err)
                err = crypto_register_alg(&alg->base);

        return err;
}
EXPORT_SYMBOL_GPL(crypto_register_shash);

/* Unregister a single shash algorithm via crypto_unregister_alg(). */
void crypto_unregister_shash(struct shash_alg *alg)
{
        crypto_unregister_alg(&alg->base);
}
EXPORT_SYMBOL_GPL(crypto_unregister_shash);

/*
 * Register @count algorithms from @algs in order.  If one fails, those
 * already registered are unregistered in reverse order and the error is
 * returned; otherwise returns 0.
 */
int crypto_register_shashes(struct shash_alg *algs, int count)
{
        int i, ret;

        for (i = 0; i < count; i++) {
                ret = crypto_register_shash(&algs[i]);
                if (!ret)
                        continue;

                /* Roll back entries [0, i) from the last one down. */
                while (--i >= 0)
                        crypto_unregister_shash(&algs[i]);
                return ret;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_shashes);

/* Unregister @count algorithms from @algs, last entry first. */
void crypto_unregister_shashes(struct shash_alg *algs, int count)
{
        int remaining;

        for (remaining = count; remaining > 0; remaining--)
                crypto_unregister_shash(&algs[remaining - 1]);
}
EXPORT_SYMBOL_GPL(crypto_unregister_shashes);

/*
 * Prepare and register a shash template instance.
 *
 * Returns -EINVAL (with a warning) if @inst has no ->free callback, the
 * shash_prepare_alg() error if preparation fails, otherwise the
 * crypto_register_instance() result.
 */
int shash_register_instance(struct crypto_template *tmpl,
                            struct shash_instance *inst)
{
        int err;

        if (WARN_ON(!inst->free))
                return -EINVAL;

        err = shash_prepare_alg(&inst->alg);
        if (!err)
                err = crypto_register_instance(tmpl,
                                               shash_crypto_instance(inst));

        return err;
}
EXPORT_SYMBOL_GPL(shash_register_instance);

/*
 * ->free helper for instances whose context is a single spawn: drop the
 * spawn stored in the instance context, then free the instance itself.
 */
void shash_free_singlespawn_instance(struct shash_instance *inst)
{
        crypto_drop_spawn(shash_instance_ctx(inst));
        kfree(inst);
}
EXPORT_SYMBOL_GPL(shash_free_singlespawn_instance);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Synchronous cryptographic hash type");
































































    1 

















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* SCTP kernel implementation
 * (C) Copyright 2007 Hewlett-Packard Development Company, L.P.
 *
 * This file is part of the SCTP kernel implementation
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *   Vlad Yasevich     <vladislav.yasevich@hp.com>
 */

#ifndef __sctp_auth_h__
#define __sctp_auth_h__

#include <linux/list.h>
#include <linux/refcount.h>

struct sctp_endpoint;
struct sctp_association;
struct sctp_authkey;
struct sctp_hmacalgo;
struct crypto_shash;

/*
 * Define a generic struct that will hold all the info
 * necessary for an HMAC transform
 */
struct sctp_hmac {
        __u16 hmac_id;                /* HMAC identifier -- NOTE(review): the id list is not defined in this header */
        char *hmac_name;        /* name for loading */
        __u16 hmac_len;                /* length of the signature */
};

/* This is a generic structure that contains the authentication bytes used
 * as keying material.  It is what is referred to as a byte-vector
 * throughout SCTP-AUTH.
 */
struct sctp_auth_bytes {
        refcount_t refcnt;        /* reference count; see sctp_auth_key_hold() */
        __u32 len;
        __u8  data[];                /* flexible array member */
};

/* Definition for a shared key, whether endpoint or association */
struct sctp_shared_key {
        struct list_head key_list;        /* linkage used by key_for_each*() */
        struct sctp_auth_bytes *key;
        refcount_t refcnt;
        __u16 key_id;
        __u8 deactivated;
};

/* Iterate over a list of keys linked through their key_list member. */
#define key_for_each(__key, __list_head) \
        list_for_each_entry(__key, __list_head, key_list)

/* Same, using the _safe list iterator with @__tmp as the lookahead cursor. */
#define key_for_each_safe(__key, __tmp, __list_head) \
        list_for_each_entry_safe(__key, __tmp, __list_head, key_list)

/* Take a reference on @key; a NULL @key is silently ignored. */
static inline void sctp_auth_key_hold(struct sctp_auth_bytes *key)
{
        if (key)
                refcount_inc(&key->refcnt);
}

void sctp_auth_key_put(struct sctp_auth_bytes *key);
struct sctp_shared_key *sctp_auth_shkey_create(__u16 key_id, gfp_t gfp);
void sctp_auth_destroy_keys(struct list_head *keys);
int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp);
struct sctp_shared_key *sctp_auth_get_shkey(
                                const struct sctp_association *asoc,
                                __u16 key_id);
int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep,
                                struct sctp_association *asoc,
                                gfp_t gfp);
int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp);
void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[]);
struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id);
struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc);
void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc,
                                     struct sctp_hmac_algo_param *hmacs);
int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc,
                                    __be16 hmac_id);
int sctp_auth_send_cid(enum sctp_cid chunk,
                       const struct sctp_association *asoc);
int sctp_auth_recv_cid(enum sctp_cid chunk,
                       const struct sctp_association *asoc);
void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
                              struct sk_buff *skb, struct sctp_auth_chunk *auth,
                              struct sctp_shared_key *ep_key, gfp_t gfp);
void sctp_auth_shkey_release(struct sctp_shared_key *sh_key);
void sctp_auth_shkey_hold(struct sctp_shared_key *sh_key);

/* API Helpers */
int sctp_auth_ep_add_chunkid(struct sctp_endpoint *ep, __u8 chunk_id);
int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep,
                            struct sctp_hmacalgo *hmacs);
int sctp_auth_set_key(struct sctp_endpoint *ep, struct sctp_association *asoc,
                      struct sctp_authkey *auth_key);
int sctp_auth_set_active_key(struct sctp_endpoint *ep,
                             struct sctp_association *asoc, __u16 key_id);
int sctp_auth_del_key_id(struct sctp_endpoint *ep,
                         struct sctp_association *asoc, __u16 key_id);
int sctp_auth_deact_key_id(struct sctp_endpoint *ep,
                           struct sctp_association *asoc, __u16 key_id);
int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp);
void sctp_auth_free(struct sctp_endpoint *ep);

#endif














































































































































































































    1 








    1 



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* delayacct.h - per-task delay accounting
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 */

#ifndef _LINUX_DELAYACCT_H
#define _LINUX_DELAYACCT_H

#include <uapi/linux/taskstats.h>

#ifdef CONFIG_TASK_DELAY_ACCT
/*
 * Per-task delay accounting record.
 *
 * Every tracked delay source XXX has a u64 XXX_delay accumulator and a u32
 * XXX_count; all sources except irq also have a u64 XXX_start field.
 */
struct task_delay_info {
        raw_spinlock_t        lock;

        /* For each stat XXX, add following, aligned appropriately
         *
         * u64 XXX_start;
         * u64 XXX_delay;
         * u32 XXX_count;
         *
         * Atomicity of updates to XXX_delay, XXX_count protected by
         * single lock above (split into XXX_lock if contention is an issue).
         */

        /*
         * XXX_count is incremented on every XXX operation, the delay
         * associated with the operation is added to XXX_delay.
         * XXX_delay contains the accumulated delay time in nanoseconds.
         */
        u64 blkio_start;
        u64 blkio_delay;        /* wait for sync block io completion */
        u64 swapin_start;
        u64 swapin_delay;        /* wait for swapin */
        u32 blkio_count;        /* total count of the number of sync block */
                                /* io operations performed */
        u32 swapin_count;        /* total count of swapin */

        u64 freepages_start;
        u64 freepages_delay;        /* wait for memory reclaim */

        u64 thrashing_start;
        u64 thrashing_delay;        /* wait for thrashing page */

        u64 compact_start;
        u64 compact_delay;        /* wait for memory compact */

        u64 wpcopy_start;
        u64 wpcopy_delay;        /* wait for write-protect copy */

        /* irq has no _start field: __delayacct_irq() is handed a delta. */
        u64 irq_delay;        /* wait for IRQ/SOFTIRQ */

        /* The remaining 32-bit counters are grouped together. */
        u32 freepages_count;        /* total count of memory reclaim */
        u32 thrashing_count;        /* total count of thrash waits */
        u32 compact_count;        /* total count of memory compact */
        u32 wpcopy_count;        /* total count of write-protect copy */
        u32 irq_count;        /* total count of IRQ/SOFTIRQ */
};
#endif

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/jump_label.h>

#ifdef CONFIG_TASK_DELAY_ACCT
/* Static branch tested by most inline wrappers below before doing any work. */
DECLARE_STATIC_KEY_FALSE(delayacct_key);
extern int delayacct_on;        /* Delay accounting turned on/off */
/* Cache that delayacct_tsk_free() returns tsk->delays to. */
extern struct kmem_cache *delayacct_cache;
extern void delayacct_init(void);

/*
 * Out-of-line workers. Except for delayacct_add_tsk(), each "__" function
 * is reached through the inline wrapper of the same name without the "__"
 * prefix, which first checks that accounting applies to the task.
 */
extern void __delayacct_tsk_init(struct task_struct *);
extern void __delayacct_tsk_exit(struct task_struct *);
extern void __delayacct_blkio_start(void);
extern void __delayacct_blkio_end(struct task_struct *);
extern int delayacct_add_tsk(struct taskstats *, struct task_struct *);
extern __u64 __delayacct_blkio_ticks(struct task_struct *);
extern void __delayacct_freepages_start(void);
extern void __delayacct_freepages_end(void);
extern void __delayacct_thrashing_start(bool *in_thrashing);
extern void __delayacct_thrashing_end(bool *in_thrashing);
extern void __delayacct_swapin_start(void);
extern void __delayacct_swapin_end(void);
extern void __delayacct_compact_start(void);
extern void __delayacct_compact_end(void);
extern void __delayacct_wpcopy_start(void);
extern void __delayacct_wpcopy_end(void);
extern void __delayacct_irq(struct task_struct *task, u32 delta);

/*
 * Clear @tsk->delays and, when delayacct_on is non-zero, let
 * __delayacct_tsk_init() set the task up.
 */
static inline void delayacct_tsk_init(struct task_struct *tsk)
{
        /* Drop any non-NULL pointer that was dup'ed from the parent. */
        tsk->delays = NULL;

        if (!delayacct_on)
                return;

        __delayacct_tsk_init(tsk);
}

/*
 * Release @tsk->delays back to delayacct_cache and clear the pointer.
 * Called from bad fork and __put_task_struct, where nothing else can be
 * accessing tsk->delays.
 */
static inline void delayacct_tsk_free(struct task_struct *tsk)
{
        struct task_delay_info *info = tsk->delays;

        if (!info)
                return;        /* already NULL, nothing to release */

        kmem_cache_free(delayacct_cache, info);
        tsk->delays = NULL;
}

/*
 * Forward to __delayacct_blkio_start() when delayacct_key is enabled and
 * the current task has a delays record.
 */
static inline void delayacct_blkio_start(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_blkio_start();
}

/*
 * Forward to __delayacct_blkio_end() for @p when delayacct_key is enabled
 * and @p has a delays record. Unlike the _start hook this one acts on the
 * task passed in, not on current.
 */
static inline void delayacct_blkio_end(struct task_struct *p)
{
        if (static_branch_unlikely(&delayacct_key) && p->delays)
                __delayacct_blkio_end(p);
}

/*
 * Return __delayacct_blkio_ticks() for @tsk, or 0 when @tsk has no delays
 * record. The static key is not consulted here.
 */
static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
{
        return tsk->delays ? __delayacct_blkio_ticks(tsk) : 0;
}

/*
 * Forward to __delayacct_freepages_start() when delayacct_key is enabled
 * and the current task has a delays record.
 */
static inline void delayacct_freepages_start(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_freepages_start();
}

/*
 * Forward to __delayacct_freepages_end() when delayacct_key is enabled
 * and the current task has a delays record.
 */
static inline void delayacct_freepages_end(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_freepages_end();
}

/*
 * Forward @in_thrashing to __delayacct_thrashing_start() when
 * delayacct_key is enabled and the current task has a delays record.
 * @in_thrashing is left untouched otherwise.
 */
static inline void delayacct_thrashing_start(bool *in_thrashing)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_thrashing_start(in_thrashing);
}

/*
 * Forward @in_thrashing to __delayacct_thrashing_end() when delayacct_key
 * is enabled and the current task has a delays record. @in_thrashing is
 * left untouched otherwise.
 */
static inline void delayacct_thrashing_end(bool *in_thrashing)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_thrashing_end(in_thrashing);
}

/*
 * Forward to __delayacct_swapin_start() when delayacct_key is enabled and
 * the current task has a delays record.
 */
static inline void delayacct_swapin_start(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_swapin_start();
}

/*
 * Forward to __delayacct_swapin_end() when delayacct_key is enabled and
 * the current task has a delays record.
 */
static inline void delayacct_swapin_end(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_swapin_end();
}

/*
 * Forward to __delayacct_compact_start() when delayacct_key is enabled
 * and the current task has a delays record.
 */
static inline void delayacct_compact_start(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_compact_start();
}

/*
 * Forward to __delayacct_compact_end() when delayacct_key is enabled and
 * the current task has a delays record.
 */
static inline void delayacct_compact_end(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_compact_end();
}

/*
 * Forward to __delayacct_wpcopy_start() when delayacct_key is enabled and
 * the current task has a delays record.
 */
static inline void delayacct_wpcopy_start(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_wpcopy_start();
}

/*
 * Forward to __delayacct_wpcopy_end() when delayacct_key is enabled and
 * the current task has a delays record.
 */
static inline void delayacct_wpcopy_end(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_wpcopy_end();
}

/*
 * Forward @task and @delta to __delayacct_irq() when delayacct_key is
 * enabled and @task has a delays record.
 */
static inline void delayacct_irq(struct task_struct *task, u32 delta)
{
        if (static_branch_unlikely(&delayacct_key) && task->delays)
                __delayacct_irq(task, delta);
}

#else
/*
 * CONFIG_TASK_DELAY_ACCT is not set: every hook is an empty inline and
 * every query returns 0.
 */
static inline void delayacct_init(void) { }
static inline void delayacct_tsk_init(struct task_struct *tsk) { }
static inline void delayacct_tsk_free(struct task_struct *tsk) { }
static inline void delayacct_blkio_start(void) { }
static inline void delayacct_blkio_end(struct task_struct *p) { }

static inline int delayacct_add_tsk(struct taskstats *d,
                                    struct task_struct *tsk)
{
        return 0;
}

static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
{
        return 0;
}

static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
{
        return 0;
}

static inline void delayacct_freepages_start(void) { }
static inline void delayacct_freepages_end(void) { }
static inline void delayacct_thrashing_start(bool *in_thrashing) { }
static inline void delayacct_thrashing_end(bool *in_thrashing) { }
static inline void delayacct_swapin_start(void) { }
static inline void delayacct_swapin_end(void) { }
static inline void delayacct_compact_start(void) { }
static inline void delayacct_compact_end(void) { }
static inline void delayacct_wpcopy_start(void) { }
static inline void delayacct_wpcopy_end(void) { }
static inline void delayacct_irq(struct task_struct *task, u32 delta) { }

#endif /* CONFIG_TASK_DELAY_ACCT */

#endif




































































































































































































































    2 



    1 

    2 























































































































































































































































































































































































































    3 



































    3 



    3 
    3 































































































































































































































































    2 






    2 















































    1 






    3 

    3 


    3 














    3 



    1 
    2 




    3 













    3 








    3 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/util.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include <linux/slab.h>
#include <linux/rculist.h>

#include "common.h"

/* Lock for protecting policy. */
DEFINE_MUTEX(tomoyo_policy_lock);

/* Has /sbin/init started? */
bool tomoyo_policy_loaded;

/*
 * Mapping table from "enum tomoyo_mac_index" to
 * "enum tomoyo_mac_category_index".
 *
 * The table is indexed by the MAC index and yields the category the index
 * belongs to: file, network or misc. Designated initializers are used, so
 * the order of the entries below does not matter.
 */
const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX] = {
        /* CONFIG::file group */
        [TOMOYO_MAC_FILE_EXECUTE]    = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_OPEN]       = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_CREATE]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_UNLINK]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_GETATTR]    = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MKDIR]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_RMDIR]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MKFIFO]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MKSOCK]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_TRUNCATE]   = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_SYMLINK]    = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MKBLOCK]    = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MKCHAR]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_LINK]       = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_RENAME]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_CHMOD]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_CHOWN]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_CHGRP]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_IOCTL]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_CHROOT]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MOUNT]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_UMOUNT]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_PIVOT_ROOT] = TOMOYO_MAC_CATEGORY_FILE,
        /* CONFIG::network group */
        [TOMOYO_MAC_NETWORK_INET_STREAM_BIND]       =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN]     =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT]    =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_DGRAM_BIND]        =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_DGRAM_SEND]        =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_RAW_BIND]          =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_RAW_SEND]          =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND]       =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN]     =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT]    =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND]        =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND]        =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND]    =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN]  =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT] =
        TOMOYO_MAC_CATEGORY_NETWORK,
        /* CONFIG::misc group */
        [TOMOYO_MAC_ENVIRON]         = TOMOYO_MAC_CATEGORY_MISC,
};

/**
 * tomoyo_convert_time - Break a time64_t down into calendar fields.
 *
 * @time64: Seconds since 1970/01/01 00:00:00.
 * @stamp:  Pointer to "struct tomoyo_time" that receives the result.
 *
 * The conversion is done by time64_to_tm() with a zero offset. The month
 * is stored 1-based and the year as a full year (tm_year + 1900).
 *
 * Returns nothing.
 */
void tomoyo_convert_time(time64_t time64, struct tomoyo_time *stamp)
{
        struct tm broken_down;

        time64_to_tm(time64, 0, &broken_down);

        /* Date part. */
        stamp->year = broken_down.tm_year + 1900;
        stamp->month = broken_down.tm_mon + 1;
        stamp->day = broken_down.tm_mday;

        /* Time-of-day part. */
        stamp->hour = broken_down.tm_hour;
        stamp->min = broken_down.tm_min;
        stamp->sec = broken_down.tm_sec;
}

/**
 * tomoyo_permstr - Find permission keywords.
 *
 * @string:  String representation for permissions in foo/bar/buz format.
 * @keyword: Keyword to find in @string.
 *
 * Returns true if the first occurrence of @keyword in @string starts at the
 * beginning of @string or directly after a '/', false otherwise.
 *
 * Only the first occurrence is examined, so this function assumes that
 * strncmp(w1, w2, strlen(w1)) != 0 if w1 != w2.
 */
bool tomoyo_permstr(const char *string, const char *keyword)
{
        const char *hit = strstr(string, keyword);

        if (!hit)
                return false;
        if (hit == string)
                return true;
        /* Not at the start: it must begin a '/'-separated component. */
        return hit[-1] == '/';
}

/**
 * tomoyo_read_token - Read a word from a line.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Cuts the word at the first ' ' in @param->data by overwriting that space
 * with '\0', and advances @param->data past it. When there is no space,
 * @param->data is moved to the terminating '\0'.
 *
 * Returns the word. To allow the caller to skip NULL check, this function
 * returns "" rather than NULL if there are no more words to read.
 */
char *tomoyo_read_token(struct tomoyo_acl_param *param)
{
        char *token = param->data;
        char *rest = strchr(token, ' ');

        if (!rest)
                rest = token + strlen(token);        /* last word on the line */
        else
                *rest++ = '\0';
        param->data = rest;
        return token;
}

static bool tomoyo_correct_path2(const char *filename, const size_t len);

/**
 * tomoyo_get_domainname - Read a domainname from a line.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * The domainname runs from @param->data up to the first space whose
 * following word does not pass tomoyo_correct_path2(); that space is
 * overwritten with '\0' and @param->data is moved just past it. If no such
 * space exists, the domainname is the rest of the line.
 *
 * Returns a domainname on success, NULL otherwise.
 */
const struct tomoyo_path_info *tomoyo_get_domainname
(struct tomoyo_acl_param *param)
{
        char *start = param->data;
        char *cursor = start;

        while (*cursor) {
                const char ch = *cursor++;

                if (ch != ' ')
                        continue;
                /* A pathname after the space still belongs to the domain. */
                if (tomoyo_correct_path2(cursor,
                                         strchrnul(cursor, ' ') - cursor))
                        continue;
                cursor[-1] = '\0';
                break;
        }
        param->data = cursor;
        if (!tomoyo_correct_domain(start))
                return NULL;
        return tomoyo_get_name(start);
}

/**
 * tomoyo_parse_ulong - Parse an "unsigned long" value.
 *
 * @result: Pointer to "unsigned long".
 * @str:    Pointer to string to parse.
 *
 * A leading "0x"/"0X" selects hexadecimal, a leading "0" followed by an
 * octal digit selects octal, anything else is parsed as decimal.
 *
 * Returns one of values in "enum tomoyo_value_type".
 *
 * The @str is updated to point the first character after the value
 * on success.
 */
u8 tomoyo_parse_ulong(unsigned long *result, char **str)
{
        const char *digits = *str;
        char *end;
        int radix = 10;

        if (digits[0] == '0') {
                const char next = digits[1];

                if (next == 'x' || next == 'X') {
                        radix = 16;
                        digits += 2;
                } else if (next >= '0' && next <= '7') {
                        radix = 8;
                        digits++;
                }
        }
        *result = simple_strtoul(digits, &end, radix);
        if (end == digits)
                return TOMOYO_VALUE_TYPE_INVALID;        /* no digit consumed */
        *str = end;
        if (radix == 16)
                return TOMOYO_VALUE_TYPE_HEXADECIMAL;
        if (radix == 8)
                return TOMOYO_VALUE_TYPE_OCTAL;
        return TOMOYO_VALUE_TYPE_DECIMAL;
}

/**
 * tomoyo_print_ulong - Print an "unsigned long" value.
 *
 * @buffer:     Pointer to buffer.
 * @buffer_len: Size of @buffer.
 * @value:      An "unsigned long" value.
 * @type:       Type of @value.
 *
 * Decimal, octal ("0" prefix) and hexadecimal ("0x" prefix, upper case
 * digits) are printed in their own notation; any other @type is printed
 * as "type(N)" instead of the value.
 *
 * Returns nothing.
 */
void tomoyo_print_ulong(char *buffer, const int buffer_len,
                        const unsigned long value, const u8 type)
{
        switch (type) {
        case TOMOYO_VALUE_TYPE_DECIMAL:
                snprintf(buffer, buffer_len, "%lu", value);
                break;
        case TOMOYO_VALUE_TYPE_OCTAL:
                snprintf(buffer, buffer_len, "0%lo", value);
                break;
        case TOMOYO_VALUE_TYPE_HEXADECIMAL:
                snprintf(buffer, buffer_len, "0x%lX", value);
                break;
        default:
                snprintf(buffer, buffer_len, "type(%u)", type);
                break;
        }
}

/**
 * tomoyo_parse_name_union - Parse a tomoyo_name_union.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 * @ptr:   Pointer to "struct tomoyo_name_union".
 *
 * A word starting with '@' is looked up as a path group and stored in
 * @ptr->group; any other word must pass tomoyo_correct_word() and is
 * stored in @ptr->filename.
 *
 * Returns true on success, false otherwise.
 */
bool tomoyo_parse_name_union(struct tomoyo_acl_param *param,
                             struct tomoyo_name_union *ptr)
{
        char *word;

        if (param->data[0] != '@') {
                word = tomoyo_read_token(param);
                if (!tomoyo_correct_word(word))
                        return false;
                ptr->filename = tomoyo_get_name(word);
                return ptr->filename != NULL;
        }

        /* "@name": skip the marker and resolve the group. */
        param->data++;
        ptr->group = tomoyo_get_group(param, TOMOYO_PATH_GROUP);
        return ptr->group != NULL;
}

/**
 * tomoyo_parse_number_union - Parse a tomoyo_number_union.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 * @ptr:   Pointer to "struct tomoyo_number_union".
 *
 * @ptr is zeroed first. A word starting with '@' is looked up as a number
 * group. Otherwise the word is either a single value, stored as both ends
 * of the range, or "min-max" where min must not be greater than max.
 *
 * Returns true on success, false otherwise.
 */
bool tomoyo_parse_number_union(struct tomoyo_acl_param *param,
                               struct tomoyo_number_union *ptr)
{
        unsigned long value;
        char *word;
        u8 kind;

        memset(ptr, 0, sizeof(*ptr));
        if (param->data[0] == '@') {
                param->data++;
                ptr->group = tomoyo_get_group(param, TOMOYO_NUMBER_GROUP);
                return ptr->group != NULL;
        }

        /* Lower bound. */
        word = tomoyo_read_token(param);
        kind = tomoyo_parse_ulong(&value, &word);
        if (kind == TOMOYO_VALUE_TYPE_INVALID)
                return false;
        ptr->values[0] = value;
        ptr->value_type[0] = kind;

        if (*word != '\0') {
                /* Upper bound: "-" followed by a value and nothing else. */
                if (*word != '-')
                        return false;
                word++;
                kind = tomoyo_parse_ulong(&value, &word);
                if (kind == TOMOYO_VALUE_TYPE_INVALID)
                        return false;
                if (*word != '\0' || value < ptr->values[0])
                        return false;
        }
        /* For a single value, the upper bound repeats the lower bound. */
        ptr->values[1] = value;
        ptr->value_type[1] = kind;
        return true;
}

/**
 * tomoyo_byte_range - Check whether the string is a \ooo style octal value.
 *
 * @str: Pointer to the string (the three digits after the backslash).
 *
 * Returns true if @str starts with three octal digits in the range
 * "000" - "377", false otherwise.
 *
 * TOMOYO uses \ooo style representation for 0x01 - 0x20 and 0x7F - 0xFF.
 * This function verifies that \ooo fits in one byte. Characters after the
 * first mismatch are not read.
 */
static inline bool tomoyo_byte_range(const char *str)
{
        if (str[0] < '0' || str[0] > '3')
                return false;
        if (str[1] < '0' || str[1] > '7')
                return false;
        return str[2] >= '0' && str[2] <= '7';
}

/**
 * tomoyo_alphabet_char - Check whether the character is an alphabet.
 *
 * @c: The character to check.
 *
 * Returns true if @c is in 'A' - 'Z' or 'a' - 'z', false otherwise.
 */
static inline bool tomoyo_alphabet_char(const char c)
{
        if (c >= 'a' && c <= 'z')
                return true;
        return c >= 'A' && c <= 'Z';
}

/**
 * tomoyo_make_byte - Make byte value from three octal characters.
 *
 * @c1: The first character (most significant octal digit).
 * @c2: The second character.
 * @c3: The third character (least significant octal digit).
 *
 * The characters are not validated here.
 *
 * Returns byte value.
 */
static inline u8 tomoyo_make_byte(const u8 c1, const u8 c2, const u8 c3)
{
        const int high = (c1 - '0') << 6;
        const int middle = (c2 - '0') << 3;
        const int low = c3 - '0';

        return high + middle + low;
}

/**
 * tomoyo_valid - Check whether the character is a valid char.
 *
 * @c: The character to check.
 *
 * Returns true if @c is in '!' - '~' (0x21 - 0x7E), false otherwise.
 */
static inline bool tomoyo_valid(const unsigned char c)
{
        return c >= '!' && c <= '~';
}

/**
 * tomoyo_invalid - Check whether the character is an invalid char.
 *
 * @c: The character to check.
 *
 * Returns true if @c is non-NUL and outside 0x21 - 0x7E, false otherwise.
 * The terminating '\0' is therefore neither valid nor invalid.
 */
static inline bool tomoyo_invalid(const unsigned char c)
{
        if (c == '\0')
                return false;
        return !(c > ' ' && c < 127);
}

/**
 * tomoyo_str_starts - Check whether the given string starts with the given keyword.
 *
 * @src:  Pointer to pointer to the string.
 * @find: Pointer to the keyword.
 *
 * Returns true if @src starts with @find, false otherwise.
 *
 * The @src is updated to point the first character after the @find
 * if @src starts with @find. It is left unchanged otherwise.
 */
bool tomoyo_str_starts(char **src, const char *find)
{
        const int prefix_len = strlen(find);

        if (strncmp(*src, find, prefix_len) != 0)
                return false;
        *src += prefix_len;
        return true;
}

/**
 * tomoyo_normalize_line - Format string.
 *
 * @buffer: The line to normalize.
 *
 * The line is rewritten in place: leading and trailing runs of characters
 * rejected by tomoyo_valid() are removed and every such run between words
 * is replaced by a single space.
 *
 * Returns nothing.
 */
void tomoyo_normalize_line(unsigned char *buffer)
{
        unsigned char *in = buffer;
        unsigned char *out = buffer;

        for (;;) {
                /* Skip the separator run in front of the next word. */
                while (tomoyo_invalid(*in))
                        in++;
                if (!*in)
                        break;
                /* Words after the first one are joined with one space. */
                if (out != buffer)
                        *out++ = ' ';
                /* *in is a valid character here, so copy at least one. */
                do {
                        *out++ = *in++;
                } while (tomoyo_valid(*in));
        }
        *out = '\0';
}

/**
 * tomoyo_correct_word2 - Validate a string.
 *
 * @string: The string to check. Maybe non-'\0'-terminated.
 * @len:    Length of @string.
 *
 * Check whether the given string follows the naming rules:
 *
 *  - it is not empty;
 *  - every unescaped byte is in 0x21 - 0x7E;
 *  - a backslash is followed by three octal digits encoding a byte outside
 *    0x21 - 0x7E, or by one of the escape characters listed in the switch
 *    statements below;
 *  - "\{" directly follows a '/', "\}" is directly followed by a '/'
 *    inside @string, the two are balanced, and no '/' appears in between;
 *  - at most 20 pattern escapes are used.
 *
 * No byte at or beyond @string + @len is read.
 *
 * Returns true if @string follows the naming rules, false otherwise.
 */
static bool tomoyo_correct_word2(const char *string, size_t len)
{
        u8 recursion = 20;
        const char *const start = string;
        bool in_repetition = false;

        if (!len)
                goto out;
        while (len--) {
                unsigned char c = *string++;

                if (c == '\\') {
                        if (!len--)
                                goto out;
                        c = *string++;
                        if (c >= '0' && c <= '3') {
                                unsigned char d;
                                unsigned char e;

                                if (!len-- || !len--)
                                        goto out;
                                d = *string++;
                                e = *string++;
                                if (d < '0' || d > '7' || e < '0' || e > '7')
                                        goto out;
                                c = tomoyo_make_byte(c, d, e);
                                /* Only bytes that need escaping may use \ooo. */
                                if (c <= ' ' || c >= 127)
                                        continue;
                                goto out;
                        }
                        /* Escapes that do not count against the budget. */
                        switch (c) {
                        case '\\':  /* "\\" */
                        case '+':   /* "\+" */
                        case '?':   /* "\?" */
                        case 'x':   /* "\x" */
                        case 'a':   /* "\a" */
                        case '-':   /* "\-" */
                                continue;
                        }
                        if (!recursion--)
                                goto out;
                        switch (c) {
                        case '*':   /* "\*" */
                        case '@':   /* "\@" */
                        case '$':   /* "\$" */
                        case 'X':   /* "\X" */
                        case 'A':   /* "\A" */
                                continue;
                        case '{':   /* "/\{" */
                                /*
                                 * Compare offsets rather than forming a
                                 * pointer that may lie before @start.
                                 */
                                if (string - start < 3 ||
                                    *(string - 3) != '/')
                                        goto out;
                                in_repetition = true;
                                continue;
                        case '}':   /* "\}/" */
                                /*
                                 * The '/' must be part of @string: do not
                                 * look past @len.
                                 */
                                if (!len || *string != '/')
                                        goto out;
                                if (!in_repetition)
                                        goto out;
                                in_repetition = false;
                                continue;
                        }
                        goto out;
                } else if (in_repetition && c == '/') {
                        goto out;
                } else if (c <= ' ' || c >= 127) {
                        goto out;
                }
        }
        if (in_repetition)
                goto out;
        return true;
 out:
        return false;
}

/**
 * tomoyo_correct_word - Validate a string.
 *
 * @string: The '\0'-terminated string to check.
 *
 * Check whether the whole string follows the naming rules enforced by
 * tomoyo_correct_word2().
 * Returns true if @string follows the naming rules, false otherwise.
 */
bool tomoyo_correct_word(const char *string)
{
        const size_t length = strlen(string);

        return tomoyo_correct_word2(string, length);
}

/**
 * tomoyo_correct_path2 - Check whether the given pathname follows the naming rules.
 *
 * @filename: The pathname to check.
 * @len:      Length of @filename.
 *
 * The first @len bytes must contain a '/', must not contain a '.' before
 * the first '/', and must pass tomoyo_correct_word2().
 *
 * Returns true if @filename follows the naming rules, false otherwise.
 */
static bool tomoyo_correct_path2(const char *filename, const size_t len)
{
        const char *first_slash = memchr(filename, '/', len);
        const char *first_dot;

        if (!first_slash)
                return false;
        first_dot = memchr(filename, '.', len);
        if (first_dot && first_dot < first_slash)
                return false;
        return tomoyo_correct_word2(filename, len);
}

/**
 * tomoyo_correct_path - Validate a pathname.
 *
 * @filename: The '\0'-terminated pathname to check.
 *
 * Check whether the whole pathname follows the naming rules enforced by
 * tomoyo_correct_path2().
 * Returns true if @filename follows the naming rules, false otherwise.
 */
bool tomoyo_correct_path(const char *filename)
{
        const size_t length = strlen(filename);

        return tomoyo_correct_path2(filename, length);
}

/**
 * tomoyo_correct_domain - Check whether the given domainname follows the naming rules.
 *
 * @domainname: The domainname to check. May be NULL.
 *
 * The first space-separated word must pass tomoyo_domain_def(); every
 * following word must pass the pathname check.
 *
 * Returns true if @domainname follows the naming rules, false otherwise.
 */
bool tomoyo_correct_domain(const unsigned char *domainname)
{
        const unsigned char *space;

        if (!domainname || !tomoyo_domain_def(domainname))
                return false;
        domainname = strchr(domainname, ' ');
        if (!domainname)
                return true;        /* namespace word only */
        domainname++;
        /* Check every pathname that is followed by another one. */
        while ((space = strchr(domainname, ' ')) != NULL) {
                if (!tomoyo_correct_path2(domainname, space - domainname))
                        return false;
                domainname = space + 1;
        }
        /* The last pathname runs up to the terminating '\0'. */
        return tomoyo_correct_path(domainname);
}

/**
 * tomoyo_domain_def - Check whether the given token can be a domainname.
 *
 * @buffer: The token to check.
 *
 * Only the first space-separated word of @buffer is examined. It must have
 * the form "<word>", where the text between the angle brackets passes
 * tomoyo_correct_word2().
 *
 * Returns true if @buffer possibly be a domainname, false otherwise.
 */
bool tomoyo_domain_def(const unsigned char *buffer)
{
        const unsigned char *space;
        int len;

        if (*buffer != '<')
                return false;
        space = strchr(buffer, ' ');
        if (space)
                len = space - buffer;
        else
                len = strlen(buffer);
        /* len is at least 1 here because buffer[0] is '<'. */
        if (buffer[len - 1] != '>')
                return false;
        return tomoyo_correct_word2(buffer + 1, len - 2);
}

/**
 * tomoyo_find_domain - Find a domain by the given name.
 *
 * @domainname: The domainname to find.
 *
 * Returns pointer to "struct tomoyo_domain_info" if found, NULL otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname)
{
        struct tomoyo_domain_info *domain;
        struct tomoyo_path_info name;

        name.name = domainname;
        tomoyo_fill_path_info(&name);
        list_for_each_entry_rcu(domain, &tomoyo_domain_list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                if (!domain->is_deleted &&
                    !tomoyo_pathcmp(&name, domain->domainname))
                        return domain;
        }
        return NULL;
}

/**
 * tomoyo_const_part_length - Evaluate the initial length without a pattern in a token.
 *
 * @filename: The string to evaluate. May be NULL.
 *
 * Counts bytes from the start of @filename until the first backslash escape
 * that is neither "\\" nor a "\ooo" octal sequence (first digit 0-3, the
 * other two 0-7). Those two escapes count as 2 and 4 bytes respectively.
 *
 * Returns the initial length without a pattern in @filename.
 */
static int tomoyo_const_part_length(const char *filename)
{
        const char *p = filename;
        int len = 0;

        if (!p)
                return 0;
        while (*p != '\0') {
                /* Plain byte. */
                if (*p != '\\') {
                        p++;
                        len++;
                        continue;
                }
                /* "\\" */
                if (p[1] == '\\') {
                        p += 2;
                        len += 2;
                        continue;
                }
                /* "\ooo"; later bytes are read only if the earlier ones fit. */
                if (p[1] >= '0' && p[1] <= '3' &&
                    p[2] >= '0' && p[2] <= '7' &&
                    p[3] >= '0' && p[3] <= '7') {
                        p += 4;
                        len += 4;
                        continue;
                }
                /* Any other escape starts the patterned part. */
                break;
        }
        return len;
}

/**
 * tomoyo_fill_path_info - Fill in "struct tomoyo_path_info" members.
 *
 * @ptr: Pointer to "struct tomoyo_path_info" to fill in.
 *
 * The caller sets "struct tomoyo_path_info"->name. This function derives
 * const_len, is_patterned, is_dir and hash from that string.
 */
void tomoyo_fill_path_info(struct tomoyo_path_info *ptr)
{
        const char *path = ptr->name;
        const int total = strlen(path);

        ptr->const_len = tomoyo_const_part_length(path);
        /* Patterned when the constant prefix does not cover the whole name. */
        ptr->is_patterned = (ptr->const_len < total);
        /* A directory name is non-empty and ends with '/'. */
        ptr->is_dir = total && (path[total - 1] == '/');
        ptr->hash = full_name_hash(NULL, path, total);
}

/**
 * tomoyo_file_matches_pattern2 - Pattern matching without '/' character and "\-" pattern.
 *
 * @filename:     The start of string to check.
 * @filename_end: The end of string to check.
 * @pattern:      The start of pattern to compare.
 * @pattern_end:  The end of pattern to compare.
 *
 * Walks @filename and @pattern in parallel. A pattern byte other than '\\'
 * must equal the current filename byte. A '\\' in the pattern introduces an
 * escape which is dispatched on the byte that follows it; the repetition
 * escapes ("\*", "\@", "\$", "\X", "\A") recurse on the remainder of both
 * strings.
 *
 * Returns true if @filename matches @pattern, false otherwise.
 */
static bool tomoyo_file_matches_pattern2(const char *filename,
                                         const char *filename_end,
                                         const char *pattern,
                                         const char *pattern_end)
{
        while (filename < filename_end && pattern < pattern_end) {
                char c;
                int i;
                int j;

                /* Ordinary pattern byte: must match literally. */
                if (*pattern != '\\') {
                        if (*filename++ != *pattern++)
                                return false;
                        continue;
                }
                /* c is the current filename byte; *pattern becomes the escape code. */
                c = *filename;
                pattern++;
                switch (*pattern) {
                case '?':
                        /*
                         * One byte other than '/'. An escaped byte in
                         * @filename ("\\" or a sequence accepted by
                         * tomoyo_byte_range()) is consumed as a whole;
                         * the last byte of it is skipped by the common
                         * filename++ below.
                         */
                        if (c == '/') {
                                return false;
                        } else if (c == '\\') {
                                if (filename[1] == '\\')
                                        filename++;
                                else if (tomoyo_byte_range(filename + 1))
                                        filename += 3;
                                else
                                        return false;
                        }
                        break;
                case '\\':
                        /* "\\" in the pattern requires "\\" in the filename. */
                        if (c != '\\')
                                return false;
                        if (*++filename != '\\')
                                return false;
                        break;
                case '+':
                        /* Exactly one decimal digit. */
                        if (!isdigit(c))
                                return false;
                        break;
                case 'x':
                        /* Exactly one hexadecimal digit. */
                        if (!isxdigit(c))
                                return false;
                        break;
                case 'a':
                        /* Exactly one character accepted by tomoyo_alphabet_char(). */
                        if (!tomoyo_alphabet_char(c))
                                return false;
                        break;
                case '0':
                case '1':
                case '2':
                case '3':
                        /*
                         * "\ooo" in the pattern: the filename must hold the
                         * same three bytes after its own backslash.
                         */
                        if (c == '\\' && tomoyo_byte_range(filename + 1)
                            && strncmp(filename + 1, pattern, 3) == 0) {
                                filename += 3;
                                pattern += 2;
                                break;
                        }
                        return false; /* Not matched. */
                case '*':
                case '@':
                        /*
                         * Zero or more bytes: try to match the rest of the
                         * pattern at every offset i. For "\@" the scan stops
                         * once a '.' has been stepped over. Escaped bytes in
                         * the filename are stepped over as a unit.
                         */
                        for (i = 0; i <= filename_end - filename; i++) {
                                if (tomoyo_file_matches_pattern2(
                                                    filename + i, filename_end,
                                                    pattern + 1, pattern_end))
                                        return true;
                                c = filename[i];
                                if (c == '.' && *pattern == '@')
                                        break;
                                if (c != '\\')
                                        continue;
                                if (filename[i + 1] == '\\')
                                        i++;
                                else if (tomoyo_byte_range(filename + i + 1))
                                        i += 3;
                                else
                                        break; /* Bad pattern. */
                        }
                        return false; /* Not matched. */
                default:
                        /*
                         * "\$", "\X" and "\A": j is the length of the run of
                         * matching characters at @filename; each prefix of
                         * length 1..j is tried. Any other escape code leaves
                         * j at 0, so the loop does not run and the result is
                         * false.
                         */
                        j = 0;
                        c = *pattern;
                        if (c == '$') {
                                while (isdigit(filename[j]))
                                        j++;
                        } else if (c == 'X') {
                                while (isxdigit(filename[j]))
                                        j++;
                        } else if (c == 'A') {
                                while (tomoyo_alphabet_char(filename[j]))
                                        j++;
                        }
                        for (i = 1; i <= j; i++) {
                                if (tomoyo_file_matches_pattern2(
                                                    filename + i, filename_end,
                                                    pattern + 1, pattern_end))
                                        return true;
                        }
                        return false; /* Not matched or bad pattern. */
                }
                /* Single-unit escapes: step past the matched byte and the escape code. */
                filename++;
                pattern++;
        }
        /* Trailing "\*" and "\@" may match nothing, so skip them. */
        while (*pattern == '\\' &&
               (*(pattern + 1) == '*' || *(pattern + 1) == '@'))
                pattern += 2;
        /* Matched only if both strings were consumed completely. */
        return filename == filename_end && pattern == pattern_end;
}

/**
 * tomoyo_file_matches_pattern - Pattern matching without '/' character.
 *
 * @filename:     The start of string to check.
 * @filename_end: The end of string to check.
 * @pattern:      The start of pattern to compare.
 * @pattern_end:  The end of pattern to compare.
 *
 * Splits @pattern at every "\-" operator. @filename must match the first
 * segment and must not match any of the following segments.
 *
 * Returns true if @filename matches @pattern, false otherwise.
 */
static bool tomoyo_file_matches_pattern(const char *filename,
                                        const char *filename_end,
                                        const char *pattern,
                                        const char *pattern_end)
{
        const char *segment = pattern;
        bool first = true;

        while (pattern < pattern_end - 1) {
                bool matched;

                if (pattern[0] != '\\') {
                        pattern++;
                        continue;
                }
                /* Step over the escape pair; only "\-" splits the pattern. */
                pattern += 2;
                if (pattern[-1] != '-')
                        continue;
                matched = tomoyo_file_matches_pattern2(filename,
                                                       filename_end,
                                                       segment,
                                                       pattern - 2);
                /* First segment must match; subtracted segments must not. */
                if (matched != first)
                        return false;
                first = false;
                segment = pattern;
        }
        /* The last segment follows the same rule as the earlier ones. */
        return tomoyo_file_matches_pattern2(filename, filename_end,
                                            segment, pattern_end) == first;
}

/**
 * tomoyo_path_matches_pattern2 - Do pathname pattern matching.
 *
 * @f: The start of string to check.
 * @p: The start of pattern to compare.
 *
 * Compares @f and @p one '/'-separated component at a time using
 * tomoyo_file_matches_pattern(). A pattern component that starts with "\{"
 * is handled at the "recursive" label, which matches one or more
 * repetitions of that component and recurses for the rest of the path.
 *
 * Returns true if @f matches @p, false otherwise.
 */
static bool tomoyo_path_matches_pattern2(const char *f, const char *p)
{
        const char *f_delimiter;
        const char *p_delimiter;

        while (*f && *p) {
                /* Find the end of the current component in both strings. */
                f_delimiter = strchr(f, '/');
                if (!f_delimiter)
                        f_delimiter = f + strlen(f);
                p_delimiter = strchr(p, '/');
                if (!p_delimiter)
                        p_delimiter = p + strlen(p);
                /* A component starting with "\{" needs repetition handling. */
                if (*p == '\\' && *(p + 1) == '{')
                        goto recursive;
                if (!tomoyo_file_matches_pattern(f, f_delimiter, p,
                                                 p_delimiter))
                        return false;
                /* Advance both cursors past the component and its '/'. */
                f = f_delimiter;
                if (*f)
                        f++;
                p = p_delimiter;
                if (*p)
                        p++;
        }
        /* Ignore trailing "\*" and "\@" in @pattern. */
        while (*p == '\\' &&
               (*(p + 1) == '*' || *(p + 1) == '@'))
                p += 2;
        /* Matched only if both strings are exhausted. */
        return !*f && !*p;
 recursive:
        /*
         * The "\{" pattern is permitted only after '/' character.
         * This guarantees that below "*(p - 1)" is safe.
         * Also, the "\}" pattern is permitted only before '/' character
         * so that "\{" + "\}" pair will not break the "\-" operator.
         */
        if (*(p - 1) != '/' || p_delimiter <= p + 3 || *p_delimiter != '/' ||
            *(p_delimiter - 1) != '}' || *(p_delimiter - 2) != '\\')
                return false; /* Bad pattern. */
        /*
         * Each pass consumes one component of @f that matches the text
         * between "\{" and "\}", then tries to match the remainder of @f
         * against the pattern that follows the "\}/".
         */
        do {
                /* Compare current component with pattern. */
                if (!tomoyo_file_matches_pattern(f, f_delimiter, p + 2,
                                                 p_delimiter - 2))
                        break;
                /* Proceed to next component. */
                f = f_delimiter;
                if (!*f)
                        break;
                f++;
                /* Continue comparison. */
                if (tomoyo_path_matches_pattern2(f, p_delimiter + 1))
                        return true;
                /* Otherwise try one more repetition, if another '/' exists. */
                f_delimiter = strchr(f, '/');
        } while (f_delimiter);
        return false; /* Not matched. */
}

/**
 * tomoyo_path_matches_pattern - Check whether the given filename matches the given pattern.
 *
 * @filename: The filename to check.
 * @pattern:  The pattern to compare.
 *
 * Returns true if matches, false otherwise.
 *
 * The following patterns are available.
 *   \\     \ itself.
 *   \ooo   Octal representation of a byte.
 *   \*     Zero or more repetitions of characters other than '/'.
 *   \@     Zero or more repetitions of characters other than '/' or '.'.
 *   \?     1 byte character other than '/'.
 *   \$     One or more repetitions of decimal digits.
 *   \+     1 decimal digit.
 *   \X     One or more repetitions of hexadecimal digits.
 *   \x     1 hexadecimal digit.
 *   \A     One or more repetitions of alphabet characters.
 *   \a     1 alphabet character.
 *
 *   \-     Subtraction operator.
 *
 *   /\{dir\}/   '/' + 'One or more repetitions of dir/' (e.g. /dir/ /dir/dir/
 *               /dir/dir/dir/ ).
 */
bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename,
                                 const struct tomoyo_path_info *pattern)
{
        const int prefix = pattern->const_len;

        /* A pattern without wildcards is compared as a plain name. */
        if (!pattern->is_patterned)
                return !tomoyo_pathcmp(filename, pattern);
        /* A directory never matches a non-directory. */
        if (filename->is_dir != pattern->is_dir)
                return false;
        /* The constant prefix of the pattern must match byte for byte. */
        if (strncmp(filename->name, pattern->name, prefix))
                return false;
        /* Hand the remainder of both strings to the pattern matcher. */
        return tomoyo_path_matches_pattern2(filename->name + prefix,
                                            pattern->name + prefix);
}

/**
 * tomoyo_get_exe - Get tomoyo_realpath() of current process.
 *
 * Resolves the executable file of current->mm to a pathname via
 * tomoyo_realpath_from_path() and drops the file reference again.
 *
 * Returns the tomoyo_realpath() of current process on success, NULL otherwise.
 *
 * This function uses kzalloc(), so the caller must call kfree()
 * if this function didn't return NULL.
 */
const char *tomoyo_get_exe(void)
{
        struct mm_struct *mm = current->mm;
        struct file *exe;
        const char *path;

        /* No mm means there is no executable to report. */
        exe = mm ? get_mm_exe_file(mm) : NULL;
        if (!exe)
                return NULL;

        path = tomoyo_realpath_from_path(&exe->f_path);
        /* Release the reference taken by get_mm_exe_file(). */
        fput(exe);
        return path;
}

/**
 * tomoyo_get_mode - Get MAC mode.
 *
 * @ns:      Pointer to "struct tomoyo_policy_namespace".
 * @profile: Profile number.
 * @index:   Index number of functionality.
 *
 * Looks up the per-functionality setting first, then the setting of its
 * category, then the profile-wide default, stopping at the first one that
 * is not TOMOYO_CONFIG_USE_DEFAULT.
 *
 * Returns mode.
 */
int tomoyo_get_mode(const struct tomoyo_policy_namespace *ns, const u8 profile,
                    const u8 index)
{
        struct tomoyo_profile *prof;
        u8 mode;

        if (!tomoyo_policy_loaded)
                return TOMOYO_CONFIG_DISABLED;
        prof = tomoyo_profile(ns, profile);
        mode = prof->config[index];
        if (mode == TOMOYO_CONFIG_USE_DEFAULT) {
                /* Fall back to the category entry, stored after the MAC entries. */
                mode = prof->config[tomoyo_index2category[index]
                                    + TOMOYO_MAX_MAC_INDEX];
                /* Still unset: use the profile-wide default. */
                if (mode == TOMOYO_CONFIG_USE_DEFAULT)
                        mode = prof->default_config;
        }
        /* Only the low two bits are returned as the mode. */
        return mode & 3;
}

/**
 * tomoyo_init_request_info - Initialize "struct tomoyo_request_info" members.
 *
 * @r:      Pointer to "struct tomoyo_request_info" to initialize.
 * @domain: Pointer to "struct tomoyo_domain_info". NULL for tomoyo_domain().
 * @index:  Index number of functionality.
 *
 * Zeroes @r, then records the domain, its profile, @index and the mode
 * computed by tomoyo_get_mode().
 *
 * Returns mode.
 */
int tomoyo_init_request_info(struct tomoyo_request_info *r,
                             struct tomoyo_domain_info *domain, const u8 index)
{
        u8 profile;

        memset(r, 0, sizeof(*r));
        /* NULL selects the domain returned by tomoyo_domain(). */
        r->domain = domain ? domain : tomoyo_domain();
        domain = r->domain;
        profile = domain->profile;
        r->type = index;
        r->profile = profile;
        r->mode = tomoyo_get_mode(domain->ns, profile, index);
        return r->mode;
}

/**
 * tomoyo_domain_quota_is_ok - Check for domain's quota.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Adds up the permission bits set in every non-deleted ACL entry of the
 * domain and compares the sum with the profile's
 * TOMOYO_PREF_MAX_LEARNING_ENTRY preference. When the limit is reached the
 * domain's TOMOYO_DIF_QUOTA_WARNED flag is set and the event is logged.
 *
 * Returns true if the domain is not exceeded quota, false otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r)
{
        struct tomoyo_domain_info *domain = r->domain;
        struct tomoyo_acl_info *acl;
        unsigned int used = 0;

        /* The quota applies in learning mode only. */
        if (r->mode != TOMOYO_CONFIG_LEARNING)
                return false;
        if (!domain)
                return true;
        /* Already over quota: do not count (or warn) again. */
        if (READ_ONCE(domain->flags[TOMOYO_DIF_QUOTA_WARNED]))
                return false;
        list_for_each_entry_rcu(acl, &domain->acl_info_list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                u16 bits;

                if (acl->is_deleted)
                        continue;
                /*
                 * Reading perm bitmap might race with tomoyo_merge_*() because
                 * caller does not hold tomoyo_policy_lock mutex. But exceeding
                 * max_learning_entry parameter by a few entries does not harm.
                 */
                switch (acl->type) {
                case TOMOYO_TYPE_MANUAL_TASK_ACL:
                        /* Not counted against the quota. */
                        bits = 0;
                        break;
                case TOMOYO_TYPE_PATH_ACL:
                        bits = data_race(container_of(acl, struct tomoyo_path_acl, head)->perm);
                        break;
                case TOMOYO_TYPE_PATH2_ACL:
                        bits = data_race(container_of(acl, struct tomoyo_path2_acl, head)->perm);
                        break;
                case TOMOYO_TYPE_PATH_NUMBER_ACL:
                        bits = data_race(container_of(acl, struct tomoyo_path_number_acl, head)
                                  ->perm);
                        break;
                case TOMOYO_TYPE_MKDEV_ACL:
                        bits = data_race(container_of(acl, struct tomoyo_mkdev_acl, head)->perm);
                        break;
                case TOMOYO_TYPE_INET_ACL:
                        bits = data_race(container_of(acl, struct tomoyo_inet_acl, head)->perm);
                        break;
                case TOMOYO_TYPE_UNIX_ACL:
                        bits = data_race(container_of(acl, struct tomoyo_unix_acl, head)->perm);
                        break;
                default:
                        /* Entries without a perm bitmap count as one. */
                        bits = 1;
                        break;
                }
                used += hweight16(bits);
        }
        if (used >= tomoyo_profile(domain->ns, domain->profile)->
            pref[TOMOYO_PREF_MAX_LEARNING_ENTRY]) {
                /* Remember that the warning was issued, then report it. */
                WRITE_ONCE(domain->flags[TOMOYO_DIF_QUOTA_WARNED], true);
                tomoyo_write_log(r, "%s", tomoyo_dif[TOMOYO_DIF_QUOTA_WARNED]);
#ifndef CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING
                pr_warn("WARNING: Domain '%s' has too many ACLs to hold. Stopped learning mode.\n",
                        domain->domainname->name);
#endif
                return false;
        }
        return true;
}



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 










    1 




































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * KVM paravirt_ops implementation
 *
 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright IBM Corporation, 2007
 *   Authors: Anthony Liguori <aliguori@us.ibm.com>
 */

#define pr_fmt(fmt) "kvm-guest: " fmt

#include <linux/context_tracking.h>
#include <linux/init.h>
#include <linux/irq.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
#include <linux/nmi.h>
#include <linux/swait.h>
#include <linux/syscore_ops.h>
#include <linux/cc_platform.h>
#include <linux/efi.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
#include <asm/tlb.h>
#include <asm/cpuidle_haltpoll.h>
#include <asm/ptrace.h>
#include <asm/reboot.h>
#include <asm/svm.h>
#include <asm/e820/api.h>

/* Static key, defined in its "false" state. */
DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled);

/* Defaults to 1; cleared by the "no-kvmapf" early parameter below. */
static int kvmapf = 1;

/*
 * Handler for the "no-kvmapf" early parameter: clears kvmapf.
 * @arg is ignored. Always returns 0.
 */
static int __init parse_no_kvmapf(char *arg)
{
        kvmapf = 0;
        return 0;
}

early_param("no-kvmapf", parse_no_kvmapf);

/* Defaults to 1; cleared by the "no-steal-acc" early parameter below. */
static int steal_acc = 1;
/*
 * Handler for the "no-steal-acc" early parameter: clears steal_acc.
 * @arg is ignored. Always returns 0.
 */
static int __init parse_no_stealacc(char *arg)
{
        steal_acc = 0;
        return 0;
}

early_param("no-steal-acc", parse_no_stealacc);

/* Per-CPU flag; tested by kvm_read_and_reset_apf_flags() on the local CPU. */
static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);
/*
 * Per-CPU, 64-byte aligned areas.
 * NOTE(review): presumably shared with the host for async page fault and
 * steal time reporting - confirm against the code that registers them.
 */
static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
/* Starts at 0; set elsewhere in this file (not in this part). */
static int has_steal_clock = 0;

/* Starts at 0; set elsewhere in this file (not in this part). */
static int has_guest_poll = 0;
/*
 * No need for any "IO delay" on KVM, so this function is intentionally
 * empty.
 */
static void kvm_io_delay(void)
{
}

/* The sleeper hash table has 2^KVM_TASK_SLEEP_HASHBITS buckets. */
#define KVM_TASK_SLEEP_HASHBITS 8
#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)

/* One entry per async page fault token, kept in async_pf_sleepers. */
struct kvm_task_sleep_node {
        /* Linkage in the hash bucket; unhashed once the entry has been woken. */
        struct hlist_node link;
        /* Wait queue the faulting task sleeps on. */
        struct swait_queue_head wq;
        /* Token identifying this entry; also the hash key. */
        u32 token;
        /* CPU that queued the entry (smp_processor_id() at queue time). */
        int cpu;
};

/* Hash table of sleepers, indexed by hash_32(token, KVM_TASK_SLEEP_HASHBITS). */
static struct kvm_task_sleep_head {
        /* Held around every lookup and modification of @list in this file. */
        raw_spinlock_t lock;
        struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];

/*
 * Look up the entry for @token in bucket @b.
 * Returns the first matching node, or NULL when the bucket has none.
 */
static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
                                                  u32 token)
{
        struct kvm_task_sleep_node *found = NULL;
        struct hlist_node *pos;

        hlist_for_each(pos, &b->list) {
                struct kvm_task_sleep_node *cur =
                        hlist_entry(pos, typeof(*cur), link);

                if (cur->token != token)
                        continue;
                found = cur;
                break;
        }

        return found;
}

/*
 * Queue @n as the sleeper for @token.
 *
 * If an entry for @token is already present, the wake-up was delivered
 * ahead of the page fault: that entry is removed and freed, and false is
 * returned. Otherwise @n is initialised, added to the bucket and true is
 * returned.
 */
static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
{
        u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
        struct kvm_task_sleep_head *bucket = &async_pf_sleepers[key];
        struct kvm_task_sleep_node *early;

        raw_spin_lock(&bucket->lock);
        early = _find_apf_task(bucket, token);
        if (!early) {
                n->token = token;
                n->cpu = smp_processor_id();
                init_swait_queue_head(&n->wq);
                hlist_add_head(&n->link, &bucket->list);
                raw_spin_unlock(&bucket->lock);
                return true;
        }

        /* dummy entry exist -> wake up was delivered ahead of PF */
        hlist_del(&early->link);
        raw_spin_unlock(&bucket->lock);
        kfree(early);
        return false;
}

/*
 * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
 * @token:        Token to identify the sleep node entry
 *
 * Invoked from the async pagefault handling code or from the VM exit page
 * fault handler. In both cases RCU is watching.
 *
 * Must be called with interrupts disabled; they are enabled only around
 * schedule() and are disabled again on return.
 */
void kvm_async_pf_task_wait_schedule(u32 token)
{
        struct kvm_task_sleep_node n;
        DECLARE_SWAITQUEUE(wait);

        lockdep_assert_irqs_disabled();

        /* false: the wake-up for @token already arrived, nothing to wait for. */
        if (!kvm_async_pf_queue_task(token, &n))
                return;

        for (;;) {
                prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
                /* The waker unhashes the node (hlist_del_init) to signal completion. */
                if (hlist_unhashed(&n.link))
                        break;

                local_irq_enable();
                schedule();
                local_irq_disable();
        }
        finish_swait(&n.wq, &wait);
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);

/*
 * Unhash @n and wake the task sleeping on it, if there is one.
 * Callers in this file hold the lock of the bucket that contains @n.
 */
static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
        hlist_del_init(&n->link);
        if (!swq_has_sleeper(&n->wq))
                return;
        swake_up_one(&n->wq);
}

static void apf_task_wake_all(void)
{
        int i;

        for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
                struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
                struct kvm_task_sleep_node *n;
                struct hlist_node *p, *next;

                raw_spin_lock(&b->lock);
                hlist_for_each_safe(p, next, &b->list) {
                        n = hlist_entry(p, typeof(*n), link);
                        if (n->cpu == smp_processor_id())
                                apf_task_wake_one(n);
                }
                raw_spin_unlock(&b->lock);
        }
}

/*
 * kvm_async_pf_task_wake - Wake the sleeper registered for @token
 * @token:        Token identifying the sleep node entry; ~0 wakes every
 *                sleeper recorded for the current CPU
 *
 * If no node for @token exists in its hash bucket yet, a dummy node is
 * inserted into the bucket instead (presumably consumed by a later
 * kvm_async_pf_queue_task() call for the same token - confirm there).
 */
void kvm_async_pf_task_wake(u32 token)
{
        u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
        struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
        struct kvm_task_sleep_node *n, *dummy = NULL;

        if (token == ~0) {
                apf_task_wake_all();
                return;
        }

again:
        raw_spin_lock(&b->lock);
        n = _find_apf_task(b, token);
        if (!n) {
                /*
                 * Async #PF not yet handled, add a dummy entry for the token.
                 * Allocating the token must be done outside of the raw lock
                 * as the allocator is preemptible on PREEMPT_RT kernels.
                 */
                if (!dummy) {
                        raw_spin_unlock(&b->lock);
                        dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC);

                        /*
                         * Continue looping on allocation failure, eventually
                         * the async #PF will be handled and allocating a new
                         * node will be unnecessary.
                         */
                        if (!dummy)
                                cpu_relax();

                        /*
                         * Recheck for async #PF completion before enqueueing
                         * the dummy token to avoid duplicate list entries.
                         */
                        goto again;
                }
                dummy->token = token;
                dummy->cpu = smp_processor_id();
                init_swait_queue_head(&dummy->wq);
                hlist_add_head(&dummy->link, &b->list);
                /* Ownership moved to the list; do not free it below. */
                dummy = NULL;
        } else {
                apf_task_wake_one(n);
        }
        raw_spin_unlock(&b->lock);

        /* A dummy token might be allocated and ultimately not used.  */
        kfree(dummy);
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);

/*
 * Fetch this CPU's apf_reason.flags and zero them in one go.
 *
 * Returns 0 without touching apf_reason when async PF is not enabled on
 * the current CPU, otherwise the flags value that was read.
 */
noinstr u32 kvm_read_and_reset_apf_flags(void)
{
        u32 reason;

        if (!__this_cpu_read(async_pf_enabled))
                return 0;

        reason = __this_cpu_read(apf_reason.flags);
        __this_cpu_write(apf_reason.flags, 0);

        return reason;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);

/*
 * __kvm_handle_async_pf - Handle a fault that may be an async #PF event
 * @regs:        Register state at the time of the fault
 * @token:        Token handed to kvm_async_pf_task_wait_schedule()
 *
 * Returns false when the per-CPU async PF flags read as zero (nothing was
 * handled here), true once the event has been processed.
 */
noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
        u32 flags = kvm_read_and_reset_apf_flags();
        irqentry_state_t state;

        if (!flags)
                return false;

        state = irqentry_enter(regs);
        instrumentation_begin();

        /*
         * If the host managed to inject an async #PF into an interrupt
         * disabled region, then die hard as this is not going to end well
         * and the host side is seriously broken.
         */
        if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
                panic("Host injected async #PF in interrupt disabled region\n");

        if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
                /* Only faults taken from user mode are accepted here. */
                if (unlikely(!(user_mode(regs))))
                        panic("Host injected async #PF in kernel mode\n");
                /* Page is swapped out by the host. */
                kvm_async_pf_task_wait_schedule(token);
        } else {
                WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags);
        }

        instrumentation_end();
        irqentry_exit(regs, state);
        return true;
}

/*
 * Interrupt handler for the async PF notification vector: wakes the
 * sleeper identified by this CPU's apf_reason.token, then clears the token
 * and acknowledges the event through MSR_KVM_ASYNC_PF_ACK.
 */
DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
{
        struct pt_regs *old_regs = set_irq_regs(regs);
        u32 token;

        apic_eoi();

        inc_irq_stat(irq_hv_callback_count);

        if (__this_cpu_read(async_pf_enabled)) {
                /* Read the token first; it is zeroed only after the wake. */
                token = __this_cpu_read(apf_reason.token);
                kvm_async_pf_task_wake(token);
                __this_cpu_write(apf_reason.token, 0);
                wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
        }

        set_irq_regs(old_regs);
}

/*
 * Name the paravirt backend "KVM", install kvm_io_delay when the host
 * advertises KVM_FEATURE_NOP_IO_DELAY, and set no_timer_check on IO-APIC
 * builds.
 */
static void __init paravirt_ops_setup(void)
{
        pv_info.name = "KVM";

        if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
                pv_ops.cpu.io_delay = kvm_io_delay;

#ifdef CONFIG_X86_IO_APIC
        no_timer_check = 1;
#endif
}

/*
 * Register the current CPU's steal_time area with the host by writing its
 * physical address, OR-ed with KVM_MSR_ENABLED, to MSR_KVM_STEAL_TIME.
 *
 * Does nothing when has_steal_clock is not set.
 */
static void kvm_register_steal_time(void)
{
        int cpu = smp_processor_id();
        struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
        unsigned long long pa;

        if (!has_steal_clock)
                return;

        /*
         * Translate once and reuse the result, so the debug message reports
         * exactly the address that went into the MSR and the translation is
         * not repeated.
         */
        pa = slow_virt_to_phys(st);

        wrmsrl(MSR_KVM_STEAL_TIME, (pa | KVM_MSR_ENABLED));
        pr_debug("stealtime: cpu %d, msr %llx\n", cpu, pa);
}

/* Per-CPU PV EOI word; its address is handed to the host via MSR_KVM_PV_EOI_EN. */
static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;

/*
 * EOI callback used when KVM_FEATURE_PV_EOI is available. If KVM_PV_EOI_BIT
 * was set in this CPU's kvm_apic_eoi word, clearing it is all that happens
 * and no APIC write is issued; otherwise fall back to apic_native_eoi().
 */
static notrace __maybe_unused void kvm_guest_apic_eoi_write(void)
{
        /**
         * This relies on __test_and_clear_bit to modify the memory
         * in a way that is atomic with respect to the local CPU.
         * The hypervisor only accesses this memory from the local CPU so
         * there's no need for lock or memory barriers.
         * An optimization barrier is implied in apic write.
         */
        if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
                return;
        apic_native_eoi();
}

/*
 * Enable the per-CPU KVM paravirt features on the current CPU: async page
 * faults delivered as interrupts, PV EOI and steal time, each gated on its
 * own feature check.
 */
static void kvm_guest_cpu_init(void)
{
        if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
                u64 pa;

                WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));

                /* The enable/delivery flags are OR-ed into apf_reason's physical address. */
                pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
                pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;

                if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
                        pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;

                /* Program the notification vector before enabling async PF. */
                wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);

                wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
                __this_cpu_write(async_pf_enabled, true);
                pr_debug("setup async PF for cpu %d\n", smp_processor_id());
        }

        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
                unsigned long pa;

                /* Size alignment is implied but just to make it explicit. */
                BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
                /* Start from a cleared word before handing its address to the host. */
                __this_cpu_write(kvm_apic_eoi, 0);
                pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
                        | KVM_MSR_ENABLED;
                wrmsrl(MSR_KVM_PV_EOI_EN, pa);
        }

        if (has_steal_clock)
                kvm_register_steal_time();
}

/*
 * Turn async PF off on the current CPU, if it is on: write 0 to
 * MSR_KVM_ASYNC_PF_EN and clear the per-CPU async_pf_enabled flag.
 */
static void kvm_pv_disable_apf(void)
{
        if (__this_cpu_read(async_pf_enabled)) {
                wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
                __this_cpu_write(async_pf_enabled, false);

                pr_debug("disable async PF for cpu %d\n", smp_processor_id());
        }
}

/* Write zero to MSR_KVM_STEAL_TIME, provided the steal clock is in use. */
static void kvm_disable_steal_time(void)
{
        if (has_steal_clock)
                wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}

/*
 * Return the steal value published in @cpu's steal_time area.
 *
 * The read is retried until the version field is even and unchanged across
 * the read of ->steal (presumably an odd version marks an update in
 * progress on the host side - confirm against the host ABI).
 */
static u64 kvm_steal_clock(int cpu)
{
        u64 steal;
        struct kvm_steal_time *src;
        int version;

        src = &per_cpu(steal_time, cpu);
        do {
                version = src->version;
                /* Order the version read before the payload read. */
                virt_rmb();
                steal = src->steal;
                /* Order the payload read before re-checking the version. */
                virt_rmb();
        } while ((version & 1) || (version != src->version));

        return steal;
}

/* Mark @size bytes starting at @ptr as decrypted via early_set_memory_decrypted(). */
static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
{
        unsigned long vaddr = (unsigned long) ptr;

        early_set_memory_decrypted(vaddr, size);
}

/*
 * Mark the per-CPU apf_reason, steal_time and kvm_apic_eoi areas of every
 * possible CPU as decrypted in a single pass.
 *
 * Covering all possible CPUs, rather than only the online ones, means a
 * CPU that is hotplugged later already has its per-cpu variables mapped
 * as decrypted.
 *
 * Nothing is done unless the CC vendor is AMD and guest memory encryption
 * is active.
 */
static void __init sev_map_percpu_data(void)
{
        int cpu;

        if (cc_vendor != CC_VENDOR_AMD)
                return;

        if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
                return;

        for_each_possible_cpu(cpu) {
                __set_percpu_decrypted(&per_cpu(apf_reason, cpu),
                                       sizeof(apf_reason));
                __set_percpu_decrypted(&per_cpu(steal_time, cpu),
                                       sizeof(steal_time));
                __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu),
                                       sizeof(kvm_apic_eoi));
        }
}

/*
 * Tear down the per-CPU paravirt features on the current CPU: steal time,
 * PV EOI, migration control, async PF and kvmclock.
 *
 * @shutdown: when false, sleepers recorded for this CPU are additionally
 *            woken through apf_task_wake_all(); when true that step is
 *            skipped.
 */
static void kvm_guest_cpu_offline(bool shutdown)
{
        kvm_disable_steal_time();
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                wrmsrl(MSR_KVM_PV_EOI_EN, 0);
        if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
                wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0);
        kvm_pv_disable_apf();
        if (!shutdown)
                apf_task_wake_all();
        kvmclock_disable();
}

/*
 * Run kvm_guest_cpu_init() with local interrupts disabled.
 * @cpu is not used. Always returns 0.
 */
static int kvm_cpu_online(unsigned int cpu)
{
        unsigned long irq_state;

        local_irq_save(irq_state);
        kvm_guest_cpu_init();
        local_irq_restore(irq_state);

        return 0;
}

#ifdef CONFIG_SMP

/*
 * Per-CPU scratch cpumask used by kvm_send_ipi_mask_allbutself() and
 * kvm_flush_tlb_multi(); allocated in kvm_alloc_cpumask().
 */
static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);

static bool pv_tlb_flush_supported(void)
{
        return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
                !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
                kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
                !boot_cpu_has(X86_FEATURE_MWAIT) &&
                (num_possible_cpus() != 1));
}

static bool pv_ipi_supported(void)
{
        return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) &&
               (num_possible_cpus() != 1));
}

static bool pv_sched_yield_supported(void)
{
        return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
                !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
            kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
            !boot_cpu_has(X86_FEATURE_MWAIT) &&
            (num_possible_cpus() != 1));
}

/* Width in bits of the APIC ID window one KVM_HC_SEND_IPI bitmap covers. */
#define KVM_IPI_CLUSTER_SIZE        (2 * BITS_PER_LONG)

/*
 * Send @vector to every CPU in @mask using KVM_HC_SEND_IPI hypercalls.
 *
 * APIC IDs are gathered into a bitmap whose bit 0 corresponds to @min. An
 * ID that does not fit the current window causes the pending bitmap to be
 * sent and a new window to be started. Runs with local interrupts disabled.
 */
static void __send_ipi_mask(const struct cpumask *mask, int vector)
{
        unsigned long flags;
        int cpu, min = 0, max = 0;
#ifdef CONFIG_X86_64
        __uint128_t ipi_bitmap = 0;
#else
        u64 ipi_bitmap = 0;
#endif
        u32 apic_id, icr;
        long ret;

        if (cpumask_empty(mask))
                return;

        local_irq_save(flags);

        /* NMI_VECTOR selects NMI delivery; anything else is a fixed vector. */
        switch (vector) {
        default:
                icr = APIC_DM_FIXED | vector;
                break;
        case NMI_VECTOR:
                icr = APIC_DM_NMI;
                break;
        }

        for_each_cpu(cpu, mask) {
                apic_id = per_cpu(x86_cpu_to_apicid, cpu);
                if (!ipi_bitmap) {
                        /* Empty bitmap: open a new window at this APIC ID. */
                        min = max = apic_id;
                } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
                        /* Below the window but still in range: shift bits up, lower min. */
                        ipi_bitmap <<= min - apic_id;
                        min = apic_id;
                } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
                        /* Inside the window: only the upper bound may grow. */
                        max = apic_id < max ? max : apic_id;
                } else {
                        /* Does not fit: send what is pending and restart the window. */
                        ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
                                (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
                        WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
                                  ret);
                        min = max = apic_id;
                        ipi_bitmap = 0;
                }
                __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
        }

        /* Send whatever is left in the last window. */
        if (ipi_bitmap) {
                ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
                        (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
                WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
                          ret);
        }

        local_irq_restore(flags);
}

/* send_IPI_mask callback: forwards @mask and @vector to __send_ipi_mask(). */
static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
{
        __send_ipi_mask(mask, vector);
}

/*
 * send_IPI_mask_allbutself callback: copy @mask into this CPU's scratch
 * mask, drop the current CPU from the copy, and send @vector to the rest.
 */
static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
{
        unsigned int self = smp_processor_id();
        struct cpumask *scratch = this_cpu_cpumask_var_ptr(__pv_cpu_mask);

        cpumask_copy(scratch, mask);
        cpumask_clear_cpu(self, scratch);

        __send_ipi_mask(scratch, vector);
}

/*
 * Late initcall: on an EFI-booted guest with memory encryption and
 * KVM_FEATURE_MIGRATION_CONTROL, read the "SevLiveMigrationEnabled" EFI
 * variable and, if it is non-zero, write KVM_MIGRATION_READY to
 * MSR_KVM_MIGRATION_CONTROL.
 *
 * Returns 1 after the MSR has been written, 0 in every other case.
 */
static int __init setup_efi_kvm_sev_migration(void)
{
        efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled";
        efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID;
        efi_status_t status;
        unsigned long size;
        bool enabled;

        if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) ||
            !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
                return 0;

        /* The non-EFI case is handled in kvm_init_platform(). */
        if (!efi_enabled(EFI_BOOT))
                return 0;

        if (!efi_enabled(EFI_RUNTIME_SERVICES)) {
                pr_info("%s : EFI runtime services are not enabled\n", __func__);
                return 0;
        }

        size = sizeof(enabled);

        /* Get variable contents into buffer */
        status = efi.get_variable(efi_sev_live_migration_enabled,
                                  &efi_variable_guid, NULL, &size, &enabled);

        if (status == EFI_NOT_FOUND) {
                pr_info("%s : EFI live migration variable not found\n", __func__);
                return 0;
        }

        if (status != EFI_SUCCESS) {
                pr_info("%s : EFI variable retrieval failed\n", __func__);
                return 0;
        }

        if (enabled == 0) {
                pr_info("%s: live migration disabled in EFI\n", __func__);
                return 0;
        }

        pr_info("%s : live migration enabled in EFI\n", __func__);
        wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY);

        return 1;
}

late_initcall(setup_efi_kvm_sev_migration);

/*
 * Set the IPI entry points
 *
 * Replaces the send_IPI_mask and send_IPI_mask_allbutself APIC callbacks
 * with the hypercall-based kvm_send_ipi_mask*() implementations.
 */
static __init void kvm_setup_pv_ipi(void)
{
        apic_update_callback(send_IPI_mask, kvm_send_ipi_mask);
        apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself);
        pr_info("setup PV IPIs\n");
}

/*
 * Send the call-function IPI natively, then issue one KVM_HC_SCHED_YIELD
 * hypercall aimed at the first CPU in @mask that is not idle and is
 * reported as preempted.
 */
static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
{
        int target;

        native_send_call_func_ipi(mask);

        /* Make sure other vCPUs get a chance to run if they need to. */
        for_each_cpu(target, mask) {
                if (idle_cpu(target) || !vcpu_is_preempted(target))
                        continue;

                kvm_hypercall1(KVM_HC_SCHED_YIELD,
                               per_cpu(x86_cpu_to_apicid, target));
                break;
        }
}

/*
 * flush_tlb_multi replacement. For every vCPU in @cpumask whose
 * steal_time.preempted has KVM_VCPU_PREEMPTED set, try to set
 * KVM_VCPU_FLUSH_TLB in that field; on success the vCPU is removed from
 * the mask. The remaining CPUs are flushed via native_flush_tlb_multi().
 */
static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
                        const struct flush_tlb_info *info)
{
        u8 state;
        int cpu;
        struct kvm_steal_time *src;
        struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);

        /* Work on a private copy; the caller's mask is const. */
        cpumask_copy(flushmask, cpumask);
        /*
         * We have to call flush only on online vCPUs. And
         * queue flush_on_enter for pre-empted vCPUs
         */
        for_each_cpu(cpu, flushmask) {
                /*
                 * The local vCPU is never preempted, so we do not explicitly
                 * skip check for local vCPU - it will never be cleared from
                 * flushmask.
                 */
                src = &per_cpu(steal_time, cpu);
                state = READ_ONCE(src->preempted);
                if ((state & KVM_VCPU_PREEMPTED)) {
                        /* If the cmpxchg fails the vCPU stays in flushmask. */
                        if (try_cmpxchg(&src->preempted, &state,
                                        state | KVM_VCPU_FLUSH_TLB))
                                __cpumask_clear_cpu(cpu, flushmask);
                }
        }

        native_flush_tlb_multi(flushmask, info);
}

/*
 * Arch initcall: allocate the zeroed per-CPU __pv_cpu_mask for every
 * possible CPU, on that CPU's node, when PV TLB flush or PV IPIs are
 * supported. Skipped entirely when KVM paravirt is unavailable or "nopv"
 * is set. Always returns 0.
 */
static __init int kvm_alloc_cpumask(void)
{
        int cpu;

        if (!kvm_para_available() || nopv)
                return 0;

        if (!pv_tlb_flush_supported() && !pv_ipi_supported())
                return 0;

        for_each_possible_cpu(cpu) {
                zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
                                        GFP_KERNEL, cpu_to_node(cpu));
        }

        return 0;
}
arch_initcall(kvm_alloc_cpumask);

/*
 * smp_prepare_boot_cpu hook: decrypt the shared per-cpu data, enable the
 * PV features on the boot CPU, run the native hook and set up PV spinlocks.
 */
static void __init kvm_smp_prepare_boot_cpu(void)
{
        /*
         * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
         * shares the guest physical address with the hypervisor.
         */
        sev_map_percpu_data();

        kvm_guest_cpu_init();
        native_smp_prepare_boot_cpu();
        kvm_spinlock_init();
}

/*
 * Run kvm_guest_cpu_offline(false) with local interrupts disabled.
 * @cpu is not used. Always returns 0.
 */
static int kvm_cpu_down_prepare(unsigned int cpu)
{
        unsigned long irq_state;

        local_irq_save(irq_state);
        kvm_guest_cpu_offline(false);
        local_irq_restore(irq_state);

        return 0;
}

#endif

/*
 * Syscore suspend hook: take the PV features offline on the current CPU.
 * With CONFIG_ARCH_CPUIDLE_HALTPOLL, has_guest_poll records whether bit 0
 * of MSR_KVM_POLL_CONTROL was clear (val stays 0 if the feature is absent).
 * Always returns 0.
 */
static int kvm_suspend(void)
{
        u64 val = 0;

        kvm_guest_cpu_offline(false);

#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
        if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
                rdmsrl(MSR_KVM_POLL_CONTROL, val);
        has_guest_poll = !(val & 1);
#endif
        return 0;
}

/*
 * Syscore resume hook: re-enable the PV features on the current CPU and,
 * with CONFIG_ARCH_CPUIDLE_HALTPOLL, write 0 to MSR_KVM_POLL_CONTROL again
 * if has_guest_poll was set by kvm_suspend().
 */
static void kvm_resume(void)
{
        kvm_cpu_online(raw_smp_processor_id());

#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
        if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
                wrmsrl(MSR_KVM_POLL_CONTROL, 0);
#endif
}

/* Suspend/resume hooks; registered from kvm_guest_init(). */
static struct syscore_ops kvm_syscore_ops = {
        .suspend        = kvm_suspend,
        .resume                = kvm_resume,
};

/* Per-CPU reboot callback: take the PV features offline in shutdown mode. */
static void kvm_pv_guest_cpu_reboot(void *unused)
{
        kvm_guest_cpu_offline(true);
}

/*
 * Reboot notifier: on SYS_RESTART, run kvm_pv_guest_cpu_reboot() on every
 * CPU and wait for completion. Other codes are ignored. Always returns
 * NOTIFY_DONE.
 */
static int kvm_pv_reboot_notify(struct notifier_block *nb,
                                unsigned long code, void *unused)
{
        if (code != SYS_RESTART)
                return NOTIFY_DONE;

        on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);

        return NOTIFY_DONE;
}

/* Reboot notifier block; registered from kvm_guest_init(). */
static struct notifier_block kvm_pv_reboot_nb = {
        .notifier_call = kvm_pv_reboot_notify,
};

/*
 * After a PV feature is registered, the host will keep writing to the
 * registered memory location. If the guest happens to shut down, this
 * memory won't be valid. In cases like kexec, in which you install a new
 * kernel, this means a random memory location will keep being written.
 *
 * The crash shutdown hook therefore takes the PV features offline on the
 * current CPU before handing over to the native crash shutdown.
 */
#ifdef CONFIG_CRASH_DUMP
static void kvm_crash_shutdown(struct pt_regs *regs)
{
        kvm_guest_cpu_offline(true);
        native_machine_crash_shutdown(regs);
}
#endif

#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)
bool __kvm_vcpu_is_preempted(long cpu);

/*
 * C version (32-bit or !SMP builds): report whether KVM_VCPU_PREEMPTED is
 * set in @cpu's steal_time.preempted field.
 */
__visible bool __kvm_vcpu_is_preempted(long cpu)
{
        struct kvm_steal_time *src = &per_cpu(steal_time, cpu);

        return !!(src->preempted & KVM_VCPU_PREEMPTED);
}
PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);

#else

#include <asm/asm-offsets.h>

extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);

/*
 * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
 * restoring to/from the stack.
 *
 * Note: the asm compares the whole preempted byte against zero, whereas
 * the C version above tests only the KVM_VCPU_PREEMPTED bit.
 */
#define PV_VCPU_PREEMPTED_ASM                                                     \
 "movq   __per_cpu_offset(,%rdi,8), %rax\n\t"                                     \
 "cmpb   $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
 "setne  %al\n\t"

DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted,
                PV_VCPU_PREEMPTED_ASM, .text);
#endif

/*
 * Guest late-init hook (see x86_hyper_kvm): installs the paravirt
 * callbacks, notifiers and hotplug/syscore hooks for whichever KVM
 * features the host advertises.
 */
static void __init kvm_guest_init(void)
{
        int i;

        paravirt_ops_setup();
        register_reboot_notifier(&kvm_pv_reboot_nb);
        /* Initialise the lock of every async PF sleeper hash bucket. */
        for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
                raw_spin_lock_init(&async_pf_sleepers[i].lock);

        if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
                has_steal_clock = 1;
                static_call_update(pv_steal_clock, kvm_steal_clock);

                pv_ops.lock.vcpu_is_preempted =
                        PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
        }

        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                apic_update_callback(eoi, kvm_guest_apic_eoi_write);

        if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
                static_branch_enable(&kvm_async_pf_enabled);
                sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
        }

#ifdef CONFIG_SMP
        if (pv_tlb_flush_supported()) {
                pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
                pv_ops.mmu.tlb_remove_table = tlb_remove_table;
                pr_info("KVM setup pv remote TLB flush\n");
        }

        /* On SMP the boot CPU's per-CPU init runs from this hook. */
        smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
        if (pv_sched_yield_supported()) {
                smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
                pr_info("setup PV sched yield\n");
        }
        if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
                                      kvm_cpu_online, kvm_cpu_down_prepare) < 0)
                pr_err("failed to install cpu hotplug callbacks\n");
#else
        /* Without SMP, do the boot CPU's per-CPU init right here. */
        sev_map_percpu_data();
        kvm_guest_cpu_init();
#endif

#ifdef CONFIG_CRASH_DUMP
        machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif

        register_syscore_ops(&kvm_syscore_ops);

        /*
         * Hard lockup detection is enabled by default. Disable it, as guests
         * can get false positives too easily, for example if the host is
         * overcommitted.
         */
        hardlockup_detector_disable();
}

/*
 * Look up the KVM CPUID base leaf.
 *
 * Returns 0 when the boot CPU has a negative cpuid_level or lacks
 * X86_FEATURE_HYPERVISOR; otherwise whatever hypervisor_cpuid_base()
 * returns for KVM_SIGNATURE.
 */
static noinline uint32_t __kvm_cpuid_base(void)
{
        /* So we don't blow up on old processors */
        if (boot_cpu_data.cpuid_level < 0)
                return 0;

        if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
                return 0;

        return hypervisor_cpuid_base(KVM_SIGNATURE, 0);
}

/*
 * Cached wrapper around __kvm_cpuid_base(): the lookup is performed while
 * the cached value still holds its initial -1, and the stored result is
 * returned afterwards.
 */
static inline uint32_t kvm_cpuid_base(void)
{
        static int cached_base = -1;

        if (cached_base != -1)
                return cached_base;

        cached_base = __kvm_cpuid_base();

        return cached_base;
}

bool kvm_para_available(void)
{
        return kvm_cpuid_base() != 0;
}
EXPORT_SYMBOL_GPL(kvm_para_available);

/* Return EAX of the KVM features CPUID leaf (base | KVM_CPUID_FEATURES). */
unsigned int kvm_arch_para_features(void)
{
        unsigned int leaf = kvm_cpuid_base() | KVM_CPUID_FEATURES;

        return cpuid_eax(leaf);
}

/* Return EDX of the KVM features CPUID leaf (base | KVM_CPUID_FEATURES). */
unsigned int kvm_arch_para_hints(void)
{
        unsigned int leaf = kvm_cpuid_base() | KVM_CPUID_FEATURES;

        return cpuid_edx(leaf);
}
EXPORT_SYMBOL_GPL(kvm_arch_para_hints);

/* Hypervisor .detect hook: returns the KVM CPUID base (0 if none found). */
static uint32_t __init kvm_detect(void)
{
        return kvm_cpuid_base();
}

/*
 * apic_post_init hook (installed by kvm_init_platform()): switch to PV
 * IPIs on SMP builds when they are supported.
 */
static void __init kvm_apic_init(void)
{
#ifdef CONFIG_SMP
        if (pv_ipi_supported())
                kvm_setup_pv_ipi();
#endif
}

/* Hypervisor .msi_ext_dest_id hook: true if KVM_FEATURE_MSI_EXT_DEST_ID is advertised. */
static bool __init kvm_msi_ext_dest_id(void)
{
        return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID);
}

/*
 * Report an encryption status change to the host through the
 * KVM_HC_MAP_GPA_RANGE hypercall.
 *
 * @pfn:    first page frame of the range (converted to an address here)
 * @npages: number of 4K pages in the range
 * @enc:    new status, encoded via KVM_MAP_GPA_RANGE_ENC_STAT()
 */
static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc)
{
        unsigned long gpa = pfn << PAGE_SHIFT;

        kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, gpa, npages,
                           KVM_MAP_GPA_RANGE_ENC_STAT(enc) |
                           KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
}

/*
 * Hypervisor .init_platform hook. With guest memory encryption and
 * KVM_FEATURE_MIGRATION_CONTROL it installs the page-encryption-status
 * notifier and resets the host's view of RAM to "encrypted"; it then
 * initialises kvmclock and installs the apic_post_init hook.
 */
static void __init kvm_init_platform(void)
{
        if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
            kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
                unsigned long nr_pages;
                int i;

                pv_ops.mmu.notify_page_enc_status_changed =
                        kvm_sev_hc_page_enc_status;

                /*
                 * Reset the host's shared pages list related to kernel
                 * specific page encryption status settings before we load a
                 * new kernel by kexec. Reset the page encryption status
                 * during early boot instead of just before kexec to avoid SMP
                 * races during kvm_pv_guest_cpu_reboot().
                 * NOTE: We cannot reset the complete shared pages list
                 * here as we need to retain the UEFI/OVMF firmware
                 * specific settings.
                 */

                for (i = 0; i < e820_table->nr_entries; i++) {
                        struct e820_entry *entry = &e820_table->entries[i];

                        /* Only RAM ranges are reported to the host. */
                        if (entry->type != E820_TYPE_RAM)
                                continue;

                        nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);

                        kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr,
                                       nr_pages,
                                       KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
                }

                /*
                 * Ensure that _bss_decrypted section is marked as decrypted in the
                 * shared pages list.
                 */
                early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted,
                                                __end_bss_decrypted - __start_bss_decrypted, 0);

                /*
                 * If not booted using EFI, enable Live migration support.
                 */
                if (!efi_enabled(EFI_BOOT))
                        wrmsrl(MSR_KVM_MIGRATION_CONTROL,
                               KVM_MIGRATION_READY);
        }
        kvmclock_init();
        x86_platform.apic_post_init = kvm_apic_init;
}

#if defined(CONFIG_AMD_MEM_ENCRYPT)
/* SEV-ES hypercall prepare hook: copy RBX, RCX, RDX and RSI from @regs into @ghcb. */
static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
{
        /* RAX and CPL are already in the GHCB */
        ghcb_set_rbx(ghcb, regs->bx);
        ghcb_set_rcx(ghcb, regs->cx);
        ghcb_set_rdx(ghcb, regs->dx);
        ghcb_set_rsi(ghcb, regs->si);
}

/* SEV-ES hypercall finish hook: unconditionally reports success; @ghcb and @regs are unused. */
static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
{
        /* No checking of the return state needed */
        return true;
}
#endif

/*
 * Hypervisor descriptor for KVM guests: wires up the detection and init
 * hooks defined in this file and, with CONFIG_AMD_MEM_ENCRYPT, the SEV-ES
 * hypercall hooks.
 */
const __initconst struct hypervisor_x86 x86_hyper_kvm = {
        .name                                = "KVM",
        .detect                                = kvm_detect,
        .type                                = X86_HYPER_KVM,
        .init.guest_late_init                = kvm_guest_init,
        .init.x2apic_available                = kvm_para_available,
        .init.msi_ext_dest_id                = kvm_msi_ext_dest_id,
        .init.init_platform                = kvm_init_platform,
#if defined(CONFIG_AMD_MEM_ENCRYPT)
        .runtime.sev_es_hcall_prepare        = kvm_sev_es_hcall_prepare,
        .runtime.sev_es_hcall_finish        = kvm_sev_es_hcall_finish,
#endif
};

/*
 * Arch initcall: when the steal clock is in use, enable the
 * paravirt_steal_enabled key and, if steal_acc is set, the
 * paravirt_steal_rq_enabled key as well. Always returns 0.
 */
static __init int activate_jump_labels(void)
{
        if (!has_steal_clock)
                return 0;

        static_key_slow_inc(&paravirt_steal_enabled);

        if (steal_acc)
                static_key_slow_inc(&paravirt_steal_rq_enabled);

        return 0;
}
arch_initcall(activate_jump_labels);

#ifdef CONFIG_PARAVIRT_SPINLOCKS

/*
 * Kick @cpu, addressed by its APIC ID, through the KVM_HC_KICK_CPU
 * hypercall. Used to wake up a halted vcpu.
 */
static void kvm_kick_cpu(int cpu)
{
        u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
        unsigned long flags = 0;

        kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
}

#include <asm/qspinlock.h>

/*
 * PV spinlock wait callback: halt the vCPU while *@ptr still equals @val.
 * Returns immediately in NMI context. The value is re-read with interrupts
 * disabled right before halting.
 */
static void kvm_wait(u8 *ptr, u8 val)
{
        if (in_nmi())
                return;

        /*
         * halt until it's our turn and kicked. Note that we do safe halt
         * for irq enabled case to avoid hang when lock info is overwritten
         * in irq spinlock slowpath and no spurious interrupt occur to save us.
         */
        if (irqs_disabled()) {
                if (READ_ONCE(*ptr) == val)
                        halt();
        } else {
                local_irq_disable();

                /* safe_halt() will enable IRQ */
                if (READ_ONCE(*ptr) == val)
                        safe_halt();
                else
                        local_irq_enable();
        }
}

/*
 * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
 *
 * Without PV_UNHALT nothing is changed (virt_spin_lock_key stays as it
 * is). In every other case virt_spin_lock_key is disabled, and the PV
 * qspinlock callbacks are installed unless the REALTIME hint, a single
 * possible CPU or the "nopvspin" parameter rules them out.
 */
void __init kvm_spinlock_init(void)
{
        /*
         * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
         * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
         * preferred over native qspinlock when vCPU is preempted.
         */
        if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
                pr_info("PV spinlocks disabled, no host support\n");
                return;
        }

        /*
         * Disable PV spinlocks and use native qspinlock when dedicated pCPUs
         * are available.
         */
        if (kvm_para_has_hint(KVM_HINTS_REALTIME)) {
                pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
                goto out;
        }

        if (num_possible_cpus() == 1) {
                pr_info("PV spinlocks disabled, single CPU\n");
                goto out;
        }

        if (nopvspin) {
                pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
                goto out;
        }

        pr_info("PV spinlocks enabled\n");

        __pv_init_lock_hash();
        pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
        pv_ops.lock.queued_spin_unlock =
                PV_CALLEE_SAVE(__pv_queued_spin_unlock);
        pv_ops.lock.wait = kvm_wait;
        pv_ops.lock.kick = kvm_kick_cpu;

        /*
         * When PV spinlock is enabled which is preferred over
         * virt_spin_lock(), virt_spin_lock_key's value is meaningless.
         * Just disable it anyway.
         */
out:
        static_branch_disable(&virt_spin_lock_key);
}

#endif        /* CONFIG_PARAVIRT_SPINLOCKS */

#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL

/* Cross-call helper: write 0 to MSR_KVM_POLL_CONTROL on the executing CPU; @i is unused. */
static void kvm_disable_host_haltpoll(void *i)
{
        wrmsrl(MSR_KVM_POLL_CONTROL, 0);
}

/* Cross-call helper: write 1 to MSR_KVM_POLL_CONTROL on the executing CPU; @i is unused. */
static void kvm_enable_host_haltpoll(void *i)
{
        wrmsrl(MSR_KVM_POLL_CONTROL, 1);
}

/*
 * Enable guest halt polling for @cpu by running
 * kvm_disable_host_haltpoll() on it and waiting for completion. If the
 * host lacks KVM_FEATURE_POLL_CONTROL, only a one-time error is logged.
 */
void arch_haltpoll_enable(unsigned int cpu)
{
        if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
                /* Enable guest halt poll disables host halt poll */
                smp_call_function_single(cpu, kvm_disable_host_haltpoll,
                                         NULL, 1);
                return;
        }

        pr_err_once("host does not support poll control\n");
        pr_err_once("host upgrade recommended\n");
}
EXPORT_SYMBOL_GPL(arch_haltpoll_enable);

/*
 * Disable guest halt polling for @cpu by running
 * kvm_enable_host_haltpoll() on it and waiting for completion. Does
 * nothing if the host lacks KVM_FEATURE_POLL_CONTROL.
 */
void arch_haltpoll_disable(unsigned int cpu)
{
        /* Disable guest halt poll enables host halt poll */
        if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
                smp_call_function_single(cpu, kvm_enable_host_haltpoll,
                                         NULL, 1);
}
EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
#endif







































































    1 





    1 




    1 
























    1 
    1 
    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/interval_tree.c - interval tree for mapping->i_mmap
 *
 * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
 */

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rmap.h>
#include <linux/interval_tree_generic.h>

/* Interval start for @v: its vm_pgoff. */
static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
{
        return v->vm_pgoff;
}

/* Interval end (inclusive) for @v: vm_pgoff plus its page count, minus one. */
static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
{
        unsigned long first = v->vm_pgoff;

        return first + vma_pages(v) - 1;
}

/*
 * Instantiate the vma_interval_tree_* helpers: nodes are vm_area_structs
 * linked through shared.rb, keyed by [vma_start_pgoff, vma_last_pgoff],
 * with the subtree maximum kept in shared.rb_subtree_last.
 */
INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
                     unsigned long, shared.rb_subtree_last,
                     vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree)

/*
 * Insert node immediately after prev in the interval tree
 *
 * @node and @prev must have the same start pgoff (checked with
 * VM_BUG_ON_VMA). @node becomes either prev's right child or the leftmost
 * node of prev's right subtree.
 */
void vma_interval_tree_insert_after(struct vm_area_struct *node,
                                    struct vm_area_struct *prev,
                                    struct rb_root_cached *root)
{
        struct rb_node **link;
        struct vm_area_struct *parent;
        unsigned long last = vma_last_pgoff(node);

        VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);

        if (!prev->shared.rb.rb_right) {
                /* No right subtree: hang node directly off prev. */
                parent = prev;
                link = &prev->shared.rb.rb_right;
        } else {
                /*
                 * Walk down the left spine of prev's right subtree, raising
                 * rb_subtree_last on each node passed so it covers the new
                 * node's last pgoff.
                 */
                parent = rb_entry(prev->shared.rb.rb_right,
                                  struct vm_area_struct, shared.rb);
                if (parent->shared.rb_subtree_last < last)
                        parent->shared.rb_subtree_last = last;
                while (parent->shared.rb.rb_left) {
                        parent = rb_entry(parent->shared.rb.rb_left,
                                struct vm_area_struct, shared.rb);
                        if (parent->shared.rb_subtree_last < last)
                                parent->shared.rb_subtree_last = last;
                }
                link = &parent->shared.rb.rb_left;
        }

        /* A freshly linked node has no children, so its subtree max is its own last. */
        node->shared.rb_subtree_last = last;
        rb_link_node(&node->shared.rb, &parent->shared.rb, link);
        rb_insert_augmented(&node->shared.rb, &root->rb_root,
                            &vma_interval_tree_augment);
}

/* Interval start for @avc: the start page offset of the vma it points to. */
static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
{
        return vma_start_pgoff(avc->vma);
}

/* Interval end for @avc (inclusive): the last page offset of its vma. */
static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
{
        return vma_last_pgoff(avc->vma);
}

/*
 * Instantiate the generic interval tree for anon_vma_chains linked through
 * their rb member, with rb_subtree_last as the augmented value and
 * [avc_start_pgoff(), avc_last_pgoff()] as the interval.  The generated
 * functions are "static inline" and prefixed __anon_vma_interval_tree; the
 * non-static anon_vma_interval_tree_*() functions below wrap them.
 */
INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
                     avc_start_pgoff, avc_last_pgoff,
                     static inline, __anon_vma_interval_tree)

/*
 * Insert @node into the anon_vma interval tree rooted at @root.
 *
 * With CONFIG_DEBUG_VM_RB the interval bounds at insertion time are cached
 * in @node so anon_vma_interval_tree_verify() can compare them against the
 * current bounds later.
 */
void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
                                   struct rb_root_cached *root)
{
#ifdef CONFIG_DEBUG_VM_RB
        node->cached_vma_start = avc_start_pgoff(node);
        node->cached_vma_last = avc_last_pgoff(node);
#endif
        __anon_vma_interval_tree_insert(node, root);
}

/* Remove @node from the anon_vma interval tree rooted at @root. */
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
                                   struct rb_root_cached *root)
{
        __anon_vma_interval_tree_remove(node, root);
}

/*
 * Start an iteration over the tree rooted at @root for the query range
 * [@first, @last]; forwards to the generated
 * __anon_vma_interval_tree_iter_first() and returns its result.
 */
struct anon_vma_chain *
anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
                                  unsigned long first, unsigned long last)
{
        return __anon_vma_interval_tree_iter_first(root, first, last);
}

/*
 * Continue an iteration from @node for the query range [@first, @last];
 * forwards to the generated __anon_vma_interval_tree_iter_next() and
 * returns its result.
 */
struct anon_vma_chain *
anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
                                 unsigned long first, unsigned long last)
{
        return __anon_vma_interval_tree_iter_next(node, first, last);
}

#ifdef CONFIG_DEBUG_VM_RB
/*
 * Debug check: warn (once per call site) if @node's current interval bounds
 * differ from the ones cached by anon_vma_interval_tree_insert().
 */
void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
{
        WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
        WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
}
#endif















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 










    2 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/blkdev.h>
#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/kthread.h>
#include <linux/backing-dev.h>
#include <linux/blk-cgroup.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>
#include "internal.h"

/*
 * Global backing_dev_info instance, exported to GPL modules.
 * NOTE(review): presumably the placeholder bdi used where no real backing
 * device exists -- its users are outside this part of the file, confirm.
 */
struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

/* Fallback name string; NOTE(review): its users are outside this chunk. */
static const char *bdi_unknown_name = "(unknown)";

/*
 * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
 * reader side locking.
 */
DEFINE_SPINLOCK(bdi_lock);
/* NOTE(review): presumably the allocator state for bdi ids -- confirm. */
static u64 bdi_id_cursor;
static struct rb_root bdi_tree = RB_ROOT;
LIST_HEAD(bdi_list);

/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

/*
 * Writeback statistics gathered by collect_wb_stats() for the debugfs
 * "stats" and "wb_stats" files.  Every field except dirty_thresh is
 * accumulated with "+=" / "++", so one zero-initialised instance can sum
 * over several wbs.
 */
struct wb_stats {
        unsigned long nr_dirty;         /* inodes on wb->b_dirty */
        unsigned long nr_io;            /* inodes on wb->b_io */
        unsigned long nr_more_io;       /* inodes on wb->b_more_io */
        unsigned long nr_dirty_time;    /* b_dirty_time inodes with I_DIRTY_TIME */
        unsigned long nr_writeback;     /* wb_stat(wb, WB_WRITEBACK) */
        unsigned long nr_reclaimable;   /* wb_stat(wb, WB_RECLAIMABLE) */
        unsigned long nr_dirtied;       /* wb_stat(wb, WB_DIRTIED) */
        unsigned long nr_written;       /* wb_stat(wb, WB_WRITTEN) */
        unsigned long dirty_thresh;     /* input: passed to wb_calc_thresh() */
        unsigned long wb_thresh;        /* wb_calc_thresh(wb, dirty_thresh) */
};

/* Top-level "bdi" debugfs directory; parent of every per-bdi directory. */
static struct dentry *bdi_debug_root;

/* Create the top-level "bdi" directory in debugfs. */
static void bdi_debug_init(void)
{
        bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

static void collect_wb_stats(struct wb_stats *stats,
                             struct bdi_writeback *wb)
{
        struct inode *inode;

        spin_lock(&wb->list_lock);
        list_for_each_entry(inode, &wb->b_dirty, i_io_list)
                stats->nr_dirty++;
        list_for_each_entry(inode, &wb->b_io, i_io_list)
                stats->nr_io++;
        list_for_each_entry(inode, &wb->b_more_io, i_io_list)
                stats->nr_more_io++;
        list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
                if (inode->i_state & I_DIRTY_TIME)
                        stats->nr_dirty_time++;
        spin_unlock(&wb->list_lock);

        stats->nr_writeback += wb_stat(wb, WB_WRITEBACK);
        stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE);
        stats->nr_dirtied += wb_stat(wb, WB_DIRTIED);
        stats->nr_written += wb_stat(wb, WB_WRITTEN);
        stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh);
}

#ifdef CONFIG_CGROUP_WRITEBACK
/*
 * Sum the statistics of every wb on @bdi's wb_list into @stats.
 *
 * The list is walked under rcu_read_lock(); a wb is only accounted if a
 * reference can be taken with wb_tryget(), and that reference is dropped
 * again right after collecting.
 */
static void bdi_collect_stats(struct backing_dev_info *bdi,
                              struct wb_stats *stats)
{
        struct bdi_writeback *wb;

        rcu_read_lock();
        list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
                if (wb_tryget(wb)) {
                        collect_wb_stats(stats, wb);
                        wb_put(wb);
                }
        }
        rcu_read_unlock();
}
#else
/*
 * !CONFIG_CGROUP_WRITEBACK variant: only the wb embedded in @bdi is
 * accounted into @stats.
 */
static void bdi_collect_stats(struct backing_dev_info *bdi,
                              struct wb_stats *stats)
{
        collect_wb_stats(stats, &bdi->wb);
}
#endif

static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
        struct backing_dev_info *bdi = m->private;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        struct wb_stats stats;
        unsigned long tot_bw;

        global_dirty_limits(&background_thresh, &dirty_thresh);

        memset(&stats, 0, sizeof(stats));
        stats.dirty_thresh = dirty_thresh;
        bdi_collect_stats(bdi, &stats);
        tot_bw = atomic_long_read(&bdi->tot_write_bandwidth);

        seq_printf(m,
                   "BdiWriteback:       %10lu kB\n"
                   "BdiReclaimable:     %10lu kB\n"
                   "BdiDirtyThresh:     %10lu kB\n"
                   "DirtyThresh:        %10lu kB\n"
                   "BackgroundThresh:   %10lu kB\n"
                   "BdiDirtied:         %10lu kB\n"
                   "BdiWritten:         %10lu kB\n"
                   "BdiWriteBandwidth:  %10lu kBps\n"
                   "b_dirty:            %10lu\n"
                   "b_io:               %10lu\n"
                   "b_more_io:          %10lu\n"
                   "b_dirty_time:       %10lu\n"
                   "bdi_list:           %10u\n"
                   "state:              %10lx\n",
                   K(stats.nr_writeback),
                   K(stats.nr_reclaimable),
                   K(stats.wb_thresh),
                   K(dirty_thresh),
                   K(background_thresh),
                   K(stats.nr_dirtied),
                   K(stats.nr_written),
                   K(tot_bw),
                   stats.nr_dirty,
                   stats.nr_io,
                   stats.nr_more_io,
                   stats.nr_dirty_time,
                   !list_empty(&bdi->bdi_list), bdi->wb.state);

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);

/*
 * Print one record of per-wb statistics to @m, followed by an empty line.
 *
 * @stats holds the values collected for @wb; the average write bandwidth
 * and the state word are read from @wb directly.  The first field is the
 * cgroup inode number of the wb's memcg css when CONFIG_CGROUP_WRITEBACK is
 * set and the constant 1 otherwise.
 */
static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb,
                          struct wb_stats *stats)
{

        seq_printf(m,
                   "WbCgIno:           %10lu\n"
                   "WbWriteback:       %10lu kB\n"
                   "WbReclaimable:     %10lu kB\n"
                   "WbDirtyThresh:     %10lu kB\n"
                   "WbDirtied:         %10lu kB\n"
                   "WbWritten:         %10lu kB\n"
                   "WbWriteBandwidth:  %10lu kBps\n"
                   "b_dirty:           %10lu\n"
                   "b_io:              %10lu\n"
                   "b_more_io:         %10lu\n"
                   "b_dirty_time:      %10lu\n"
                   "state:             %10lx\n\n",
#ifdef CONFIG_CGROUP_WRITEBACK
                   cgroup_ino(wb->memcg_css->cgroup),
#else
                   1ul,
#endif
                   K(stats->nr_writeback),
                   K(stats->nr_reclaimable),
                   K(stats->wb_thresh),
                   K(stats->nr_dirtied),
                   K(stats->nr_written),
                   K(wb->avg_write_bandwidth),
                   stats->nr_dirty,
                   stats->nr_io,
                   stats->nr_more_io,
                   stats->nr_dirty_time,
                   wb->state);
}

/*
 * seq_file show callback behind the per-bdi debugfs "wb_stats" file.
 *
 * Walks the bdi's wb_list under RCU and prints one wb_stats_show() record
 * for every wb on which a reference could be taken.  Each wb gets a fresh
 * stats structure seeded with the global dirty threshold.  Always returns 0.
 */
static int cgwb_debug_stats_show(struct seq_file *m, void *v)
{
        struct backing_dev_info *bdi = m->private;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        struct bdi_writeback *wb;

        global_dirty_limits(&background_thresh, &dirty_thresh);

        rcu_read_lock();
        list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
                struct wb_stats stats = { .dirty_thresh = dirty_thresh };

                /* skip wbs whose reference can no longer be taken */
                if (!wb_tryget(wb))
                        continue;

                collect_wb_stats(&stats, wb);

                /*
                 * Calculate thresh of wb in writeback cgroup which is min of
                 * thresh in global domain and thresh in cgroup domain. Drop
                 * rcu lock because cgwb_calc_thresh may sleep in
                 * cgroup_rstat_flush. We can do so here because we have a ref.
                 */
                if (mem_cgroup_wb_domain(wb)) {
                        rcu_read_unlock();
                        stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
                        rcu_read_lock();
                }

                wb_stats_show(m, wb, &stats);

                /* drop the reference taken by wb_tryget() above */
                wb_put(wb);
        }
        rcu_read_unlock();

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);

/*
 * Create the debugfs directory @name for @bdi below the top-level "bdi"
 * directory and populate it with the read-only "stats" and "wb_stats"
 * files.  The directory is remembered in bdi->debug_dir.
 */
static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
        struct dentry *dir = debugfs_create_dir(name, bdi_debug_root);

        bdi->debug_dir = dir;

        debugfs_create_file("stats", 0444, dir, bdi, &bdi_debug_stats_fops);
        debugfs_create_file("wb_stats", 0444, dir, bdi,
                            &cgwb_debug_stats_fops);
}

/* Remove @bdi's debugfs directory together with everything below it. */
static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
        debugfs_remove_recursive(bdi->debug_dir);
}
#else /* CONFIG_DEBUG_FS */
/* Empty stand-ins for the debugfs helpers when CONFIG_DEBUG_FS is off. */
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
                                      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif /* CONFIG_DEBUG_FS */

/*
 * sysfs store handler for "read_ahead_kb".
 *
 * Parses @buf as an unsigned decimal number of kilobytes and stores it in
 * bdi->ra_pages after shifting right by (PAGE_SHIFT - 10), i.e. converted
 * to pages and rounded down.  Returns @count on success or the negative
 * error from kstrtoul().
 */
static ssize_t read_ahead_kb_store(struct device *dev,
                                  struct device_attribute *attr,
                                  const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned long kb;
        int err = kstrtoul(buf, 10, &kb);

        if (err < 0)
                return err;

        bdi->ra_pages = kb >> (PAGE_SHIFT - 10);
        return count;
}

/*
 * BDI_SHOW - define a sysfs show handler and a read-write device attribute
 * @name: attribute name; generates name##_show() and, through
 *        DEVICE_ATTR_RW(), the attribute itself (a matching name##_store()
 *        must be defined before the macro is used)
 * @expr: expression evaluated with a local "bdi" (the device's drvdata) in
 *        scope; its value is emitted as a signed decimal followed by '\n'
 *
 * @expr is wrapped in parentheses before the cast so that the complete
 * expression is evaluated first and only the result is converted to
 * long long, whatever operators the caller uses.
 */
#define BDI_SHOW(name, expr)                                                \
static ssize_t name##_show(struct device *dev,                                \
                           struct device_attribute *attr, char *buf)        \
{                                                                        \
        struct backing_dev_info *bdi = dev_get_drvdata(dev);                \
                                                                        \
        return sysfs_emit(buf, "%lld\n", (long long)(expr));                \
}                                                                        \
static DEVICE_ATTR_RW(name);

/* read_ahead_kb shows bdi->ra_pages passed through K(). */
BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))

/*
 * sysfs store handler for "min_ratio": parse @buf as an unsigned decimal
 * and pass it to bdi_set_min_ratio().  Returns @count on success, otherwise
 * the parse error or the non-zero result of bdi_set_min_ratio().
 */
static ssize_t min_ratio_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int val;
        int err;

        err = kstrtouint(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_min_ratio(bdi, val);
        if (err)
                return err;

        return count;
}
/* Shows bdi->min_ratio divided by BDI_RATIO_SCALE. */
BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE)

/*
 * sysfs store handler for "min_ratio_fine": parse @buf as an unsigned
 * decimal and pass it to bdi_set_min_ratio_no_scale().  Returns @count on
 * success, otherwise the parse error or the non-zero result of the setter.
 */
static ssize_t min_ratio_fine_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int val;
        int err;

        err = kstrtouint(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_min_ratio_no_scale(bdi, val);
        if (err)
                return err;

        return count;
}
/* Shows bdi->min_ratio as stored, without dividing by BDI_RATIO_SCALE. */
BDI_SHOW(min_ratio_fine, bdi->min_ratio)

/*
 * sysfs store handler for "max_ratio": parse @buf as an unsigned decimal
 * and pass it to bdi_set_max_ratio().  Returns @count on success, otherwise
 * the parse error or the non-zero result of bdi_set_max_ratio().
 */
static ssize_t max_ratio_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int val;
        int err;

        err = kstrtouint(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_max_ratio(bdi, val);
        if (err)
                return err;

        return count;
}
/* Shows bdi->max_ratio divided by BDI_RATIO_SCALE. */
BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE)

/*
 * sysfs store handler for "max_ratio_fine": parse @buf as an unsigned
 * decimal and pass it to bdi_set_max_ratio_no_scale().  Returns @count on
 * success, otherwise the parse error or the non-zero result of the setter.
 */
static ssize_t max_ratio_fine_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int val;
        int err;

        err = kstrtouint(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_max_ratio_no_scale(bdi, val);
        if (err)
                return err;

        return count;
}
/* Shows bdi->max_ratio as stored, without dividing by BDI_RATIO_SCALE. */
BDI_SHOW(max_ratio_fine, bdi->max_ratio)

/*
 * sysfs show handler for "min_bytes": emit the value returned by
 * bdi_get_min_bytes() as an unsigned decimal followed by a newline.
 */
static ssize_t min_bytes_show(struct device *dev,
                              struct device_attribute *attr,
                              char *buf)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);

        return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi));
}

/*
 * sysfs store handler for "min_bytes": parse @buf as an unsigned 64-bit
 * decimal and pass it to bdi_set_min_bytes().  Returns @count on success,
 * otherwise the parse error or the non-zero result of bdi_set_min_bytes().
 */
static ssize_t min_bytes_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        u64 val;
        int err;

        err = kstrtoull(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_min_bytes(bdi, val);
        if (err)
                return err;

        return count;
}
static DEVICE_ATTR_RW(min_bytes);

/*
 * sysfs show handler for "max_bytes": emit the value returned by
 * bdi_get_max_bytes() as an unsigned decimal followed by a newline.
 */
static ssize_t max_bytes_show(struct device *dev,
                              struct device_attribute *attr,
                              char *buf)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);

        return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi));
}

/*
 * sysfs store handler for "max_bytes": parse @buf as an unsigned 64-bit
 * decimal and pass it to bdi_set_max_bytes().  Returns @count on success,
 * otherwise the parse error or the non-zero result of bdi_set_max_bytes().
 */
static ssize_t max_bytes_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        u64 val;
        int err;

        err = kstrtoull(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_max_bytes(bdi, val);
        if (err)
                return err;

        return count;
}
static DEVICE_ATTR_RW(max_bytes);

/*
 * sysfs show handler for the read-only "stable_pages_required" attribute.
 * It no longer reports real state: it warns once per device that the
 * attribute has been removed and always emits "0".
 */
static ssize_t stable_pages_required_show(struct device *dev,
                                          struct device_attribute *attr,
                                          char *buf)
{
        dev_warn_once(dev,
                "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
        return sysfs_emit(buf, "%d\n", 0);
}
static DEVICE_ATTR_RO(stable_pages_required);

/*
 * sysfs store handler for "strict_limit": parse @buf as an unsigned decimal
 * and pass it to bdi_set_strict_limit().  Returns @count on success,
 * otherwise the parse error or the non-zero result of the setter.
 */
static ssize_t strict_limit_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int val;
        int err;

        err = kstrtouint(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_strict_limit(bdi, val);
        if (err)
                return err;

        return count;
}

/*
 * sysfs show handler for "strict_limit": emit 1 if BDI_CAP_STRICTLIMIT is
 * set in bdi->capabilities and 0 otherwise.
 */
static ssize_t strict_limit_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        int enabled = !!(bdi->capabilities & BDI_CAP_STRICTLIMIT);

        return sysfs_emit(buf, "%d\n", enabled);
}
static DEVICE_ATTR_RW(strict_limit);

/*
 * NULL-terminated list of the device attributes defined above.
 * ATTRIBUTE_GROUPS() wraps it into bdi_dev_groups, which bdi_class installs
 * as its dev_groups.
 */
static struct attribute *bdi_dev_attrs[] = {
        &dev_attr_read_ahead_kb.attr,
        &dev_attr_min_ratio.attr,
        &dev_attr_min_ratio_fine.attr,
        &dev_attr_max_ratio.attr,
        &dev_attr_max_ratio_fine.attr,
        &dev_attr_min_bytes.attr,
        &dev_attr_max_bytes.attr,
        &dev_attr_stable_pages_required.attr,
        &dev_attr_strict_limit.attr,
        NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);

/* Device class "bdi"; its devices get the attribute groups in bdi_dev_groups. */
static const struct class bdi_class = {
        .name                = "bdi",
        .dev_groups        = bdi_dev_groups,
};

/*
 * postcore initcall: register the "bdi" device class and, if that worked,
 * create the top-level debugfs directory.  Returns 0 on success or the
 * error from class_register().
 */
static __init int bdi_class_init(void)
{
        int ret = class_register(&bdi_class);

        if (!ret)
                bdi_debug_init();

        return ret;
}
postcore_initcall(bdi_class_init);

/*
 * subsys initcall: allocate the "writeback" workqueue (bdi_wq) with the
 * WQ_MEM_RECLAIM, WQ_UNBOUND and WQ_SYSFS flags.  Returns 0 on success or
 * -ENOMEM if the allocation failed.
 */
static int __init default_bdi_init(void)
{
        bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
                                 WQ_SYSFS, 0);

        return bdi_wq ? 0 : -ENOMEM;
}
subsys_initcall(default_bdi_init);

/*
 * Work function of wb->bw_dwork: recover the owning wb from @work and call
 * wb_update_bandwidth() on it.
 */
static void wb_update_bandwidth_workfn(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct bdi_writeback *wb;

        wb = container_of(dwork, struct bdi_writeback, bw_dwork);
        wb_update_bandwidth(wb);
}

/*
 * Initial write bandwidth: 100 MB/s, expressed in pages per second
 * (100 << 20 bytes divided by the page size).  wb_init() uses it to seed
 * the bandwidth and dirty ratelimit fields of a new wb.
 */
#define INIT_BW                (100 << (20 - PAGE_SHIFT))

/*
 * Initialise @wb as a writeback context belonging to @bdi.
 *
 * @wb is zeroed first, then its lists, locks, timestamps, bandwidth
 * estimates (seeded with INIT_BW) and delayed works are set up.  The percpu
 * parts (completions and the stat counters) are allocated with @gfp.
 *
 * Returns 0 on success or the error from the failing percpu
 * initialisation; on failure nothing allocated here is left behind.
 */
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
                   gfp_t gfp)
{
        int ret;

        memset(wb, 0, sizeof(*wb));

        wb->bdi = bdi;
        wb->last_old_flush = jiffies;

        /* inode lists and the lock taken around them */
        INIT_LIST_HEAD(&wb->b_dirty);
        INIT_LIST_HEAD(&wb->b_io);
        INIT_LIST_HEAD(&wb->b_more_io);
        INIT_LIST_HEAD(&wb->b_dirty_time);
        spin_lock_init(&wb->list_lock);

        /* bandwidth estimation state */
        atomic_set(&wb->writeback_inodes, 0);
        wb->bw_time_stamp = jiffies;
        wb->balanced_dirty_ratelimit = INIT_BW;
        wb->dirty_ratelimit = INIT_BW;
        wb->write_bandwidth = INIT_BW;
        wb->avg_write_bandwidth = INIT_BW;

        /* work list and the two delayed works */
        spin_lock_init(&wb->work_lock);
        INIT_LIST_HEAD(&wb->work_list);
        INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
        INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);

        ret = fprop_local_init_percpu(&wb->completions, gfp);
        if (ret)
                return ret;

        ret = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS);
        if (!ret)
                return 0;

        /* counters failed: undo the completions allocation */
        fprop_local_destroy_percpu(&wb->completions);
        return ret;
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);

/*
 * Remove bdi from the global list and shutdown any threads we have running
 */
static void wb_shutdown(struct bdi_writeback *wb)
{
        /* Make sure nobody queues further work */
        spin_lock_irq(&wb->work_lock);
        if (!test_and_clear_bit(WB_registered, &wb->state)) {
                spin_unlock_irq(&wb->work_lock);
                return;
        }
        spin_unlock_irq(&wb->work_lock);

        cgwb_remove_from_bdi_list(wb);
        /*
         * Drain work list and shutdown the delayed_work.  !WB_registered
         * tells wb_workfn() that @wb is dying and its work_list needs to
         * be drained no matter what.
         */
        mod_delayed_work(bdi_wq, &wb->dwork, 0);
        flush_delayed_work(&wb->dwork);
        WARN_ON(!list_empty(&wb->work_list));
        flush_delayed_work(&wb->bw_dwork);
}

/*
 * Release the percpu resources allocated by wb_init() for @wb (the stat
 * counters and the completions).  Warns if wb->dwork is still pending.
 */
static void wb_exit(struct bdi_writeback *wb)
{
        WARN_ON(delayed_work_pending(&wb->dwork));
        percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS);
        fprop_local_destroy_percpu(&wb->completions);
}

#ifdef CONFIG_CGROUP_WRITEBACK

#include <linux/memcontrol.h>

/*
 * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
 * memcg->cgwb_list.  bdi->cgwb_tree is also RCU protected.
 */
static DEFINE_SPINLOCK(cgwb_lock);
/* Workqueue on which cgwb_release() queues each wb's release_work. */
static struct workqueue_struct *cgwb_release_wq;

/* wbs that cgwb_kill() has killed; cgwb_release_workfn() unlinks them. */
static LIST_HEAD(offline_cgwbs);
static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);

/*
 * RCU callback queued by cgwb_release_workfn(): tear down the wb's percpu
 * refcount and free the wb itself.
 */
static void cgwb_free_rcu(struct rcu_head *rcu_head)
{
        struct bdi_writeback *wb;

        wb = container_of(rcu_head, struct bdi_writeback, rcu);

        percpu_ref_exit(&wb->refcnt);
        kfree(wb);
}

/*
 * Work function of wb->release_work, queued by cgwb_release() once the
 * wb's percpu refcount has been released.
 *
 * Shuts the wb down and drops its memcg/blkcg css references under the
 * bdi's cgwb_release_mutex, then unpins the blkcg, destroys the memcg
 * completions, unlinks the wb from offline_cgwbs, releases what wb_init()
 * allocated, drops the bdi reference taken in cgwb_create() and finally
 * frees the wb through an RCU callback.
 */
static void cgwb_release_workfn(struct work_struct *work)
{
        struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
                                                release_work);
        struct backing_dev_info *bdi = wb->bdi;

        mutex_lock(&wb->bdi->cgwb_release_mutex);
        wb_shutdown(wb);

        css_put(wb->memcg_css);
        css_put(wb->blkcg_css);
        mutex_unlock(&wb->bdi->cgwb_release_mutex);

        /* triggers blkg destruction if no online users left */
        blkcg_unpin_online(wb->blkcg_css);

        fprop_local_destroy_percpu(&wb->memcg_completions);

        /* cgwb_kill() put the wb on offline_cgwbs; take it off again */
        spin_lock_irq(&cgwb_lock);
        list_del(&wb->offline_node);
        spin_unlock_irq(&cgwb_lock);

        wb_exit(wb);
        bdi_put(bdi);
        WARN_ON_ONCE(!list_empty(&wb->b_attached));
        /* the actual kfree() happens in cgwb_free_rcu() */
        call_rcu(&wb->rcu, cgwb_free_rcu);
}

/*
 * Release callback of wb->refcnt (installed by percpu_ref_init() in
 * cgwb_create()): defer the real teardown to cgwb_release_workfn() by
 * queueing the wb's release_work on cgwb_release_wq.
 */
static void cgwb_release(struct percpu_ref *refcnt)
{
        struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
                                                refcnt);
        queue_work(cgwb_release_wq, &wb->release_work);
}

/*
 * Take @wb out of service.  The caller must hold cgwb_lock.
 *
 * @wb is removed from its bdi's cgwb_tree (a missing entry triggers a
 * warning) and from the memcg and blkcg lists, is added to offline_cgwbs,
 * and its percpu refcount is killed.
 */
static void cgwb_kill(struct bdi_writeback *wb)
{
        lockdep_assert_held(&cgwb_lock);

        WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
        list_del(&wb->memcg_node);
        list_del(&wb->blkcg_node);
        list_add(&wb->offline_node, &offline_cgwbs);
        percpu_ref_kill(&wb->refcnt);
}

/*
 * Unlink @wb from its bdi's wb_list.  The removal is done with
 * list_del_rcu() under cgwb_lock because the list is walked under RCU
 * elsewhere in this file.
 */
static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
        spin_lock_irq(&cgwb_lock);
        list_del_rcu(&wb->bdi_node);
        spin_unlock_irq(&cgwb_lock);
}

/*
 * Create the wb for @memcg_css on @bdi unless a usable one already exists.
 *
 * An existing wb whose blkcg association no longer matches is killed and
 * replaced.  A new wb is only linked into the bdi, memcg and blkcg lists
 * while the bdi's root wb is still registered and both cgroup lists are
 * still online.
 *
 * Returns 0 if a wb now exists (created here, already present, or inserted
 * concurrently by another caller), -ENOMEM if the wb could not be
 * allocated, -ENODEV if it could not be linked, or the error from one of
 * the initialisation steps.
 */
static int cgwb_create(struct backing_dev_info *bdi,
                       struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
        struct mem_cgroup *memcg;
        struct cgroup_subsys_state *blkcg_css;
        struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
        struct bdi_writeback *wb;
        unsigned long flags;
        int ret = 0;

        memcg = mem_cgroup_from_css(memcg_css);
        /* the reference obtained here is dropped at out_put */
        blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
        memcg_cgwb_list = &memcg->cgwb_list;
        blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css);

        /* look up again under lock and discard on blkcg mismatch */
        spin_lock_irqsave(&cgwb_lock, flags);
        wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
        if (wb && wb->blkcg_css != blkcg_css) {
                cgwb_kill(wb);
                wb = NULL;
        }
        spin_unlock_irqrestore(&cgwb_lock, flags);
        /* a matching wb already exists: nothing to create, ret is still 0 */
        if (wb)
                goto out_put;

        /* need to create a new one */
        wb = kmalloc(sizeof(*wb), gfp);
        if (!wb) {
                ret = -ENOMEM;
                goto out_put;
        }

        ret = wb_init(wb, bdi, gfp);
        if (ret)
                goto err_free;

        ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
        if (ret)
                goto err_wb_exit;

        ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
        if (ret)
                goto err_ref_exit;

        wb->memcg_css = memcg_css;
        wb->blkcg_css = blkcg_css;
        INIT_LIST_HEAD(&wb->b_attached);
        INIT_WORK(&wb->release_work, cgwb_release_workfn);
        set_bit(WB_registered, &wb->state);
        /* dropped in cgwb_release_workfn(), or below on failure */
        bdi_get(bdi);

        /*
         * The root wb determines the registered state of the whole bdi and
         * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
         * whether they're still online.  Don't link @wb if any is dead.
         * See wb_memcg_offline() and wb_blkcg_offline().
         */
        ret = -ENODEV;
        spin_lock_irqsave(&cgwb_lock, flags);
        if (test_bit(WB_registered, &bdi->wb.state) &&
            blkcg_cgwb_list->next && memcg_cgwb_list->next) {
                /* we might have raced another instance of this function */
                ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
                if (!ret) {
                        list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
                        list_add(&wb->memcg_node, memcg_cgwb_list);
                        list_add(&wb->blkcg_node, blkcg_cgwb_list);
                        blkcg_pin_online(blkcg_css);
                        /* references held by the linked wb */
                        css_get(memcg_css);
                        css_get(blkcg_css);
                }
        }
        spin_unlock_irqrestore(&cgwb_lock, flags);
        if (ret) {
                /* lost the race: report success but discard our copy */
                if (ret == -EEXIST)
                        ret = 0;
                goto err_fprop_exit;
        }
        goto out_put;

        /* unwind in reverse order of setup */
err_fprop_exit:
        bdi_put(bdi);
        fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
        percpu_ref_exit(&wb->refcnt);
err_wb_exit:
        wb_exit(wb);
err_free:
        kfree(wb);
out_put:
        css_put(blkcg_css);
        return ret;
}

/**
 * wb_get_lookup - get wb for a given memcg
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 *
 * Try to get the wb for @memcg_css on @bdi.  The returned wb has its
 * refcount incremented.
 *
 * This function uses css_get() on @memcg_css and thus expects its refcnt
 * to be positive on invocation.  IOW, rcu_read_lock() protection on
 * @memcg_css isn't enough.  try_get it before calling this function.
 *
 * A wb is keyed by its associated memcg.  As blkcg implicitly enables
 * memcg on the default hierarchy, memcg association is guaranteed to be
 * more specific (equal or descendant to the associated blkcg) and thus can
 * identify both the memcg and blkcg associations.
 *
 * Because the blkcg associated with a memcg may change as blkcg is enabled
 * and disabled closer to root in the hierarchy, each wb keeps track of
 * both the memcg and blkcg associated with it and verifies the blkcg on
 * each lookup.  On mismatch, the existing wb is discarded and a new one is
 * created.
 */
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
                                    struct cgroup_subsys_state *memcg_css)
{
        struct cgroup_subsys_state *cur_blkcg_css;
        struct bdi_writeback *found;

        /* a memcg css without a parent maps to the wb embedded in the bdi */
        if (!memcg_css->parent)
                return &bdi->wb;

        rcu_read_lock();

        found = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
        if (!found)
                goto out_unlock;

        /*
         * Re-resolve the blkcg css for this memcg's cgroup and compare it
         * with the one recorded in the wb.  The wb is only handed out when
         * they still match and a reference could be taken on it.
         */
        cur_blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
        if (unlikely(found->blkcg_css != cur_blkcg_css))
                found = NULL;
        else if (unlikely(!wb_tryget(found)))
                found = NULL;
        css_put(cur_blkcg_css);

out_unlock:
        rcu_read_unlock();
        return found;
}

/**
 * wb_get_create - get wb for a given memcg, create if necessary
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 * @gfp: allocation mask to use
 *
 * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
 * create one.  See wb_get_lookup() for more details.
 */
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
                                    struct cgroup_subsys_state *memcg_css,
                                    gfp_t gfp)
{
        might_alloc(gfp);

        /*
         * Alternate between looking the wb up and trying to create it.
         * The loop ends as soon as a lookup succeeds, or with NULL when
         * cgwb_create() reports a failure (non-zero return).
         */
        for (;;) {
                struct bdi_writeback *found = wb_get_lookup(bdi, memcg_css);

                if (found)
                        return found;
                if (cgwb_create(bdi, memcg_css, gfp))
                        return NULL;
        }
}

static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
        int err;

        /* per-bdi cgroup writeback bookkeeping */
        INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
        mutex_init(&bdi->cgwb_release_mutex);
        init_rwsem(&bdi->wb_switch_rwsem);

        err = wb_init(&bdi->wb, bdi, GFP_KERNEL);
        if (err)
                return err;

        /* the embedded wb is associated with the root memcg and root blkcg */
        bdi->wb.memcg_css = &root_mem_cgroup->css;
        bdi->wb.blkcg_css = blkcg_root_css;
        return 0;
}

static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
        struct radix_tree_iter iter;
        void **slot;
        struct bdi_writeback *wb;

        WARN_ON(test_bit(WB_registered, &bdi->wb.state));

        spin_lock_irq(&cgwb_lock);
        radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
                cgwb_kill(*slot);
        spin_unlock_irq(&cgwb_lock);

        mutex_lock(&bdi->cgwb_release_mutex);
        spin_lock_irq(&cgwb_lock);
        while (!list_empty(&bdi->wb_list)) {
                wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
                                      bdi_node);
                spin_unlock_irq(&cgwb_lock);
                wb_shutdown(wb);
                spin_lock_irq(&cgwb_lock);
        }
        spin_unlock_irq(&cgwb_lock);
        mutex_unlock(&bdi->cgwb_release_mutex);
}

/*
 * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
 *
 * Try to release dying cgwbs by switching attached inodes to the nearest
 * living ancestor's writeback. Processed wbs are placed at the end
 * of the list to guarantee the forward progress.
 */
static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
{
        struct bdi_writeback *wb;
        LIST_HEAD(processed);

        spin_lock_irq(&cgwb_lock);

        while (!list_empty(&offline_cgwbs)) {
                wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
                                      offline_node);
                list_move(&wb->offline_node, &processed);

                /*
                 * If wb is dirty, cleaning up the writeback by switching
                 * attached inodes will result in an effective removal of any
                 * bandwidth restrictions, which isn't the goal.  Instead,
                 * it can be postponed until the next time, when all io
                 * will be likely completed.  If in the meantime some inodes
                 * will get re-dirtied, they should be eventually switched to
                 * a new cgwb.
                 */
                if (wb_has_dirty_io(wb))
                        continue;

                if (!wb_tryget(wb))
                        continue;

                spin_unlock_irq(&cgwb_lock);
                while (cleanup_offline_cgwb(wb))
                        cond_resched();
                spin_lock_irq(&cgwb_lock);

                wb_put(wb);
        }

        if (!list_empty(&processed))
                list_splice_tail(&processed, &offline_cgwbs);

        spin_unlock_irq(&cgwb_lock);
}

/**
 * wb_memcg_offline - kill all wb's associated with a memcg being offlined
 * @memcg: memcg being offlined
 *
 * Also prevents creation of any new wb's associated with @memcg.
 */
void wb_memcg_offline(struct mem_cgroup *memcg)
{
        struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
        struct bdi_writeback *wb, *next;

        spin_lock_irq(&cgwb_lock);
        list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
                cgwb_kill(wb);
        memcg_cgwb_list->next = NULL;        /* prevent new wb's */
        spin_unlock_irq(&cgwb_lock);

        queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
}

/**
 * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
 * @css: blkcg being offlined
 *
 * Also prevents creation of any new wb's associated with @blkcg.
 */
void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
        struct bdi_writeback *wb, *next;
        struct list_head *list = blkcg_get_cgwb_list(css);

        spin_lock_irq(&cgwb_lock);
        list_for_each_entry_safe(wb, next, list, blkcg_node)
                cgwb_kill(wb);
        list->next = NULL;        /* prevent new wb's */
        spin_unlock_irq(&cgwb_lock);
}

/*
 * Cgroup-writeback variant: append the bdi's embedded wb to bdi->wb_list
 * while holding cgwb_lock (taken with interrupts disabled).
 */
static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
        spin_lock_irq(&cgwb_lock);
        list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
        spin_unlock_irq(&cgwb_lock);
}

static int __init cgwb_init(void)
{
        /*
         * Release work items get their own workqueue with max_active set
         * to 1: there can be many of them at once, they would otherwise
         * overwhelm system_wq, and there's no point in executing many of
         * them in parallel.
         */
        cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);

        return cgwb_release_wq ? 0 : -ENOMEM;
}
subsys_initcall(cgwb_init);

#else        /* CONFIG_CGROUP_WRITEBACK */

/*
 * !CONFIG_CGROUP_WRITEBACK variant: only the bdi's embedded wb exists, so
 * initialisation reduces to wb_init() on it.  Returns wb_init()'s result.
 */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
        return wb_init(&bdi->wb, bdi, GFP_KERNEL);
}

/* !CONFIG_CGROUP_WRITEBACK variant: intentionally empty, nothing to do. */
static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }

/*
 * !CONFIG_CGROUP_WRITEBACK variant: append the bdi's embedded wb to
 * bdi->wb_list.  Unlike the cgroup variant, no lock is taken here.
 */
static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
        list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: unlink @wb from the list it is on via
 * its bdi_node, using the RCU-aware list removal.
 */
static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
        list_del_rcu(&wb->bdi_node);
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

int bdi_init(struct backing_dev_info *bdi)
{
        /* list linkage and wait queue */
        INIT_LIST_HEAD(&bdi->bdi_list);
        INIT_LIST_HEAD(&bdi->wb_list);
        init_waitqueue_head(&bdi->wb_waitq);

        /* default ratio limits */
        bdi->min_ratio = 0;
        bdi->max_ratio = 100 * BDI_RATIO_SCALE;
        bdi->max_prop_frac = FPROP_FRAC_BASE;

        /* no device attached yet; reference count starts via kref_init() */
        bdi->dev = NULL;
        kref_init(&bdi->refcnt);
        bdi->last_bdp_sleep = jiffies;

        /* finish with the (possibly cgroup-aware) writeback setup */
        return cgwb_bdi_init(bdi);
}

struct backing_dev_info *bdi_alloc(int node_id)
{
        struct backing_dev_info *new_bdi;

        /* zeroed allocation on the requested NUMA node */
        new_bdi = kzalloc_node(sizeof(*new_bdi), GFP_KERNEL, node_id);
        if (!new_bdi)
                return NULL;

        if (bdi_init(new_bdi))
                goto err_free;

        /* defaults applied to every freshly allocated bdi */
        new_bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
        new_bdi->ra_pages = VM_READAHEAD_PAGES;
        new_bdi->io_pages = VM_READAHEAD_PAGES;
        timer_setup(&new_bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
        return new_bdi;

err_free:
        kfree(new_bdi);
        return NULL;
}
EXPORT_SYMBOL(bdi_alloc);

/*
 * Walk bdi_tree looking for @id.  Returns the link that either points at
 * the matching node or is the empty slot where a node with @id would be
 * attached.  When @parentp is non-NULL it receives the last node visited
 * (the matching node itself on a hit, NULL for an empty tree).
 * Caller must hold bdi_lock.
 */
static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
{
        struct rb_node **link = &bdi_tree.rb_node;
        struct rb_node *last = NULL;

        lockdep_assert_held(&bdi_lock);

        for (; *link; ) {
                struct backing_dev_info *cur;

                last = *link;
                cur = rb_entry(last, struct backing_dev_info, rb_node);

                if (cur->id == id)
                        break;

                /* smaller ids live to the left, larger ones to the right */
                if (cur->id > id)
                        link = &last->rb_left;
                else
                        link = &last->rb_right;
        }

        if (parentp)
                *parentp = last;
        return link;
}

/**
 * bdi_get_by_id - lookup and get bdi from its id
 * @id: bdi id to lookup
 *
 * Find bdi matching @id and get it.  Returns NULL if the matching bdi
 * doesn't exist or is already unregistered.
 */
struct backing_dev_info *bdi_get_by_id(u64 id)
{
        struct backing_dev_info *found = NULL;
        struct rb_node *node;

        spin_lock_bh(&bdi_lock);
        /* the returned link is NULL-valued when no bdi carries @id */
        node = *bdi_lookup_rb_node(id, NULL);
        if (node) {
                found = rb_entry(node, struct backing_dev_info, rb_node);
                bdi_get(found);
        }
        spin_unlock_bh(&bdi_lock);

        return found;
}

/*
 * bdi_register_va - register @bdi under a printf-style formatted name
 * @bdi: bdi to register
 * @fmt: printf-style format for the device name
 * @args: arguments for @fmt
 *
 * Formats the name into bdi->dev_name, creates the backing class device,
 * marks the embedded wb as registered, assigns the next bdi id and links
 * the bdi into bdi_tree and bdi_list.
 *
 * Returns 0 on success or when @bdi already has a device attached, and
 * the error encoded in the device_create() result otherwise.
 */
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
        struct device *dev;
        struct rb_node *parent, **p;

        if (bdi->dev)        /* The driver needs to use separate queues per device */
                return 0;

        vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
        /*
         * dev_name is already fully formatted.  Hand it over as data via
         * "%s" rather than as the format string itself, so that a '%'
         * that ended up in the name is not interpreted a second time.
         */
        dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, "%s",
                            bdi->dev_name);
        if (IS_ERR(dev))
                return PTR_ERR(dev);

        cgwb_bdi_register(bdi);
        bdi->dev = dev;

        bdi_debug_register(bdi, dev_name(dev));
        set_bit(WB_registered, &bdi->wb.state);

        spin_lock_bh(&bdi_lock);

        /* ids are handed out from a monotonically incremented cursor */
        bdi->id = ++bdi_id_cursor;

        p = bdi_lookup_rb_node(bdi->id, &parent);
        rb_link_node(&bdi->rb_node, parent, p);
        rb_insert_color(&bdi->rb_node, &bdi_tree);

        list_add_tail_rcu(&bdi->bdi_list, &bdi_list);

        spin_unlock_bh(&bdi_lock);

        trace_writeback_bdi_register(bdi);
        return 0;
}

/*
 * Variadic front end for bdi_register_va(): packs the trailing arguments
 * into a va_list, forwards them together with @fmt, and returns whatever
 * bdi_register_va() returned.
 */
int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
        va_list args;
        int ret;

        va_start(args, fmt);
        ret = bdi_register_va(bdi, fmt, args);
        va_end(args);
        return ret;
}
EXPORT_SYMBOL(bdi_register);

/*
 * Record @owner as the bdi's owning device and take a device reference on
 * it (dropped again in bdi_unregister()).  Warns once if an owner was
 * already set; the old value is overwritten regardless.
 */
void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
{
        WARN_ON_ONCE(bdi->owner);
        bdi->owner = owner;
        get_device(owner);
}

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
        /* unlink from both the id tree and the global list under bdi_lock */
        spin_lock_bh(&bdi_lock);
        rb_erase(&bdi->rb_node, &bdi_tree);
        list_del_rcu(&bdi->bdi_list);
        spin_unlock_bh(&bdi_lock);

        /* wait for an (expedited) RCU grace period before returning */
        synchronize_rcu_expedited();
}

/*
 * Tear down a registered bdi: stop its laptop-mode timer, unlink it from
 * the global lookup structures, shut down its writeback structures, reset
 * a non-zero min ratio, unregister the class device and drop the owner
 * device reference.  The device and owner steps are skipped when the
 * corresponding pointer is NULL.
 */
void bdi_unregister(struct backing_dev_info *bdi)
{
        /* wait for the laptop-mode timer to be gone before tearing down */
        del_timer_sync(&bdi->laptop_mode_wb_timer);

        /* make sure nobody finds us on the bdi_list anymore */
        bdi_remove_from_list(bdi);
        /* embedded wb first, then whatever cgwb_bdi_unregister() handles */
        wb_shutdown(&bdi->wb);
        cgwb_bdi_unregister(bdi);

        /*
         * If this BDI's min ratio has been set, use bdi_set_min_ratio() to
         * update the global bdi_min_ratio.
         */
        if (bdi->min_ratio)
                bdi_set_min_ratio(bdi, 0);

        if (bdi->dev) {
                bdi_debug_unregister(bdi);
                device_unregister(bdi->dev);
                bdi->dev = NULL;
        }

        /* drop the device reference taken in bdi_set_owner() */
        if (bdi->owner) {
                put_device(bdi->owner);
                bdi->owner = NULL;
        }
}
EXPORT_SYMBOL(bdi_unregister);

/*
 * kref release callback passed to kref_put() by bdi_put().  Warns once
 * each if the bdi still looks registered (WB_registered set or a device
 * still attached), then tears down the embedded wb and frees the bdi.
 */
static void release_bdi(struct kref *ref)
{
        /* recover the enclosing bdi from its embedded refcount */
        struct backing_dev_info *bdi =
                        container_of(ref, struct backing_dev_info, refcnt);

        WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
        WARN_ON_ONCE(bdi->dev);
        wb_exit(&bdi->wb);
        kfree(bdi);
}

/*
 * Drop one reference on @bdi through kref_put(), with release_bdi() as
 * the release callback.
 */
void bdi_put(struct backing_dev_info *bdi)
{
        kref_put(&bdi->refcnt, release_bdi);
}
EXPORT_SYMBOL(bdi_put);

/*
 * Map an inode to its backing_dev_info.  A NULL inode yields
 * &noop_backing_dev_info.  With CONFIG_BLOCK, an inode on the block-device
 * superblock resolves through its block device's disk; every other inode
 * uses its superblock's s_bdi.
 */
struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
        if (inode) {
                struct super_block *sb = inode->i_sb;

#ifdef CONFIG_BLOCK
                if (sb_is_blkdev_sb(sb))
                        return I_BDEV(inode)->bd_disk->bdi;
#endif
                return sb->s_bdi;
        }

        return &noop_backing_dev_info;
}
EXPORT_SYMBOL(inode_to_bdi);

/*
 * Name to use for @bdi in messages: the stored dev_name when @bdi is
 * non-NULL and has a device attached, bdi_unknown_name otherwise.
 */
const char *bdi_dev_name(struct backing_dev_info *bdi)
{
        if (bdi && bdi->dev)
                return bdi->dev_name;

        return bdi_unknown_name;
}
EXPORT_SYMBOL_GPL(bdi_dev_name);




































































































































































    1 













































    1 































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM jbd2

#if !defined(_TRACE_JBD2_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_JBD2_H

#include <linux/jbd2.h>
#include <linux/tracepoint.h>

struct transaction_chp_stats_s;
struct transaction_run_stats_s;

/*
 * jbd2_checkpoint: records the device number of the journal's filesystem
 * device (journal->j_fs_dev->bd_dev) and the integer @result supplied by
 * the caller.  Printed as "dev <major>,<minor> result <result>".
 */
TRACE_EVENT(jbd2_checkpoint,

        TP_PROTO(journal_t *journal, int result),

        TP_ARGS(journal, result),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        int,        result                        )
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->result                = result;
        ),

        TP_printk("dev %d,%d result %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->result)
);

/*
 * jbd2_commit: event class shared by the commit-phase events defined
 * below.  Each record holds the journal's device number, the transaction's
 * t_synchronous_commit flag (stored in a char) and its tid.
 */
DECLARE_EVENT_CLASS(jbd2_commit,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        char,        sync_commit                  )
                __field(        tid_t,        transaction                  )
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->sync_commit = commit_transaction->t_synchronous_commit;
                __entry->transaction        = commit_transaction->t_tid;
        ),

        TP_printk("dev %d,%d transaction %u sync %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->transaction, __entry->sync_commit)
);

/* jbd2_start_commit: event of class jbd2_commit (journal, commit_transaction). */
DEFINE_EVENT(jbd2_commit, jbd2_start_commit,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction)
);

/* jbd2_commit_locking: event of class jbd2_commit (journal, commit_transaction). */
DEFINE_EVENT(jbd2_commit, jbd2_commit_locking,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction)
);

/* jbd2_commit_flushing: event of class jbd2_commit (journal, commit_transaction). */
DEFINE_EVENT(jbd2_commit, jbd2_commit_flushing,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction)
);

/* jbd2_commit_logging: event of class jbd2_commit (journal, commit_transaction). */
DEFINE_EVENT(jbd2_commit, jbd2_commit_logging,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction)
);

/* jbd2_drop_transaction: event of class jbd2_commit (journal, commit_transaction). */
DEFINE_EVENT(jbd2_commit, jbd2_drop_transaction,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction)
);

/*
 * jbd2_end_commit: same fields as the jbd2_commit class plus "head".
 * Note that the "head" field is filled from journal->j_tail_sequence.
 */
TRACE_EVENT(jbd2_end_commit,
        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        char,        sync_commit                  )
                __field(        tid_t,        transaction                  )
                __field(        tid_t,        head                            )
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->sync_commit = commit_transaction->t_synchronous_commit;
                __entry->transaction        = commit_transaction->t_tid;
                __entry->head                = journal->j_tail_sequence;
        ),

        TP_printk("dev %d,%d transaction %u sync %d head %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->transaction, __entry->sync_commit, __entry->head)
);

/*
 * jbd2_submit_inode_data: records the device of the inode's superblock
 * (inode->i_sb->s_dev) and the inode number.  The inode number is cast to
 * unsigned long for printing with %lu.
 */
TRACE_EVENT(jbd2_submit_inode_data,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
        ),

        TP_printk("dev %d,%d ino %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino)
);

/*
 * jbd2_handle_start_class: event class for the handle start/restart
 * events defined below.  All five arguments (dev, tid, type, line_no,
 * requested_blocks) are copied into the record unchanged.
 */
DECLARE_EVENT_CLASS(jbd2_handle_start_class,
        TP_PROTO(dev_t dev, tid_t tid, unsigned int type,
                 unsigned int line_no, int requested_blocks),

        TP_ARGS(dev, tid, type, line_no, requested_blocks),

        TP_STRUCT__entry(
                __field(                dev_t,        dev                )
                __field(                tid_t,        tid                )
                __field(         unsigned int,        type                )
                __field(         unsigned int,        line_no                )
                __field(                  int,        requested_blocks)
        ),

        TP_fast_assign(
                __entry->dev                  = dev;
                __entry->tid                  = tid;
                __entry->type                  = type;
                __entry->line_no          = line_no;
                __entry->requested_blocks = requested_blocks;
        ),

        TP_printk("dev %d,%d tid %u type %u line_no %u "
                  "requested_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
                  __entry->type, __entry->line_no, __entry->requested_blocks)
);

/* jbd2_handle_start: event of class jbd2_handle_start_class. */
DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_start,
        TP_PROTO(dev_t dev, tid_t tid, unsigned int type,
                 unsigned int line_no, int requested_blocks),

        TP_ARGS(dev, tid, type, line_no, requested_blocks)
);

/* jbd2_handle_restart: event of class jbd2_handle_start_class. */
DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_restart,
        TP_PROTO(dev_t dev, tid_t tid, unsigned int type,
                 unsigned int line_no, int requested_blocks),

        TP_ARGS(dev, tid, type, line_no, requested_blocks)
);

/*
 * jbd2_handle_extend: carries the same fields as jbd2_handle_start_class
 * plus buffer_credits.  All six arguments are copied into the record
 * unchanged.
 */
TRACE_EVENT(jbd2_handle_extend,
        TP_PROTO(dev_t dev, tid_t tid, unsigned int type,
                 unsigned int line_no, int buffer_credits,
                 int requested_blocks),

        TP_ARGS(dev, tid, type, line_no, buffer_credits, requested_blocks),

        TP_STRUCT__entry(
                __field(                dev_t,        dev                )
                __field(                tid_t,        tid                )
                __field(         unsigned int,        type                )
                __field(         unsigned int,        line_no                )
                __field(                  int,        buffer_credits  )
                __field(                  int,        requested_blocks)
        ),

        TP_fast_assign(
                __entry->dev                  = dev;
                __entry->tid                  = tid;
                __entry->type                  = type;
                __entry->line_no          = line_no;
                __entry->buffer_credits   = buffer_credits;
                __entry->requested_blocks = requested_blocks;
        ),

        TP_printk("dev %d,%d tid %u type %u line_no %u "
                  "buffer_credits %d requested_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
                  __entry->type, __entry->line_no, __entry->buffer_credits,
                  __entry->requested_blocks)
);

/*
 * jbd2_handle_stats: per-handle statistics.  The eight arguments (dev,
 * tid, type, line_no, interval, sync, requested_blocks, dirtied_blocks)
 * are copied into the record unchanged and printed in that order.
 */
TRACE_EVENT(jbd2_handle_stats,
        TP_PROTO(dev_t dev, tid_t tid, unsigned int type,
                 unsigned int line_no, int interval, int sync,
                 int requested_blocks, int dirtied_blocks),

        TP_ARGS(dev, tid, type, line_no, interval, sync,
                requested_blocks, dirtied_blocks),

        TP_STRUCT__entry(
                __field(                dev_t,        dev                )
                __field(                tid_t,        tid                )
                __field(         unsigned int,        type                )
                __field(         unsigned int,        line_no                )
                __field(                  int,        interval        )
                __field(                  int,        sync                )
                __field(                  int,        requested_blocks)
                __field(                  int,        dirtied_blocks        )
        ),

        TP_fast_assign(
                __entry->dev                  = dev;
                __entry->tid                  = tid;
                __entry->type                  = type;
                __entry->line_no          = line_no;
                __entry->interval          = interval;
                __entry->sync                  = sync;
                __entry->requested_blocks = requested_blocks;
                __entry->dirtied_blocks          = dirtied_blocks;
        ),

        TP_printk("dev %d,%d tid %u type %u line_no %u interval %d "
                  "sync %d requested_blocks %d dirtied_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
                  __entry->type, __entry->line_no, __entry->interval,
                  __entry->sync, __entry->requested_blocks,
                  __entry->dirtied_blocks)
);

/*
 * jbd2_run_stats: snapshot of a struct transaction_run_stats_s.  The six
 * time fields (wait, request_delay, running, locked, flushing, logging)
 * are stored as the raw rs_* values and only converted with
 * jiffies_to_msecs() when the record is printed; the three counters
 * (handle_count, blocks, blocks_logged) are stored and printed as-is.
 */
TRACE_EVENT(jbd2_run_stats,
        TP_PROTO(dev_t dev, tid_t tid,
                 struct transaction_run_stats_s *stats),

        TP_ARGS(dev, tid, stats),

        TP_STRUCT__entry(
                __field(                dev_t,        dev                )
                __field(                tid_t,        tid                )
                __field(        unsigned long,        wait                )
                __field(        unsigned long,        request_delay        )
                __field(        unsigned long,        running                )
                __field(        unsigned long,        locked                )
                __field(        unsigned long,        flushing        )
                __field(        unsigned long,        logging                )
                __field(                __u32,        handle_count        )
                __field(                __u32,        blocks                )
                __field(                __u32,        blocks_logged        )
        ),

        TP_fast_assign(
                __entry->dev                = dev;
                __entry->tid                = tid;
                __entry->wait                = stats->rs_wait;
                __entry->request_delay        = stats->rs_request_delay;
                __entry->running        = stats->rs_running;
                __entry->locked                = stats->rs_locked;
                __entry->flushing        = stats->rs_flushing;
                __entry->logging        = stats->rs_logging;
                __entry->handle_count        = stats->rs_handle_count;
                __entry->blocks                = stats->rs_blocks;
                __entry->blocks_logged        = stats->rs_blocks_logged;
        ),

        TP_printk("dev %d,%d tid %u wait %u request_delay %u running %u "
                  "locked %u flushing %u logging %u handle_count %u "
                  "blocks %u blocks_logged %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
                  jiffies_to_msecs(__entry->wait),
                  jiffies_to_msecs(__entry->request_delay),
                  jiffies_to_msecs(__entry->running),
                  jiffies_to_msecs(__entry->locked),
                  jiffies_to_msecs(__entry->flushing),
                  jiffies_to_msecs(__entry->logging),
                  __entry->handle_count, __entry->blocks,
                  __entry->blocks_logged)
);

/*
 * jbd2_checkpoint_stats: snapshot of a struct transaction_chp_stats_s.
 * chp_time is stored as the raw cs_chp_time value and converted with
 * jiffies_to_msecs() at print time; forced_to_close, written and dropped
 * are stored and printed as-is.
 */
TRACE_EVENT(jbd2_checkpoint_stats,
        TP_PROTO(dev_t dev, tid_t tid,
                 struct transaction_chp_stats_s *stats),

        TP_ARGS(dev, tid, stats),

        TP_STRUCT__entry(
                __field(                dev_t,        dev                )
                __field(                tid_t,        tid                )
                __field(        unsigned long,        chp_time        )
                __field(                __u32,        forced_to_close        )
                __field(                __u32,        written                )
                __field(                __u32,        dropped                )
        ),

        TP_fast_assign(
                __entry->dev                = dev;
                __entry->tid                = tid;
                __entry->chp_time        = stats->cs_chp_time;
                __entry->forced_to_close= stats->cs_forced_to_close;
                __entry->written        = stats->cs_written;
                __entry->dropped        = stats->cs_dropped;
        ),

        TP_printk("dev %d,%d tid %u chp_time %u forced_to_close %u "
                  "written %u dropped %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
                  jiffies_to_msecs(__entry->chp_time),
                  __entry->forced_to_close, __entry->written, __entry->dropped)
);

/*
 * jbd2_update_log_tail: records the journal's current j_tail_sequence
 * (printed after "from"), the @first_tid argument (printed after "to"),
 * @block_nr (printed as "offset") and @freed.
 */
TRACE_EVENT(jbd2_update_log_tail,

        TP_PROTO(journal_t *journal, tid_t first_tid,
                 unsigned long block_nr, unsigned long freed),

        TP_ARGS(journal, first_tid, block_nr, freed),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        tid_t,        tail_sequence                )
                __field(        tid_t,        first_tid                )
                __field(unsigned long,        block_nr                )
                __field(unsigned long,        freed                        )
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->tail_sequence        = journal->j_tail_sequence;
                __entry->first_tid        = first_tid;
                __entry->block_nr        = block_nr;
                __entry->freed                = freed;
        ),

        TP_printk("dev %d,%d from %u to %u offset %lu freed %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->tail_sequence, __entry->first_tid,
                  __entry->block_nr, __entry->freed)
);

/*
 * jbd2_write_superblock: records the journal's device number and the
 * @write_flags argument.  The flags are stored as blk_opf_t and printed
 * in hex after a __force cast to u32.
 */
TRACE_EVENT(jbd2_write_superblock,

        TP_PROTO(journal_t *journal, blk_opf_t write_flags),

        TP_ARGS(journal, write_flags),

        TP_STRUCT__entry(
                __field(        dev_t,  dev                        )
                __field(    blk_opf_t,  write_flags                )
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->write_flags        = write_flags;
        ),

        TP_printk("dev %d,%d write_flags %x", MAJOR(__entry->dev),
                  MINOR(__entry->dev), (__force u32)__entry->write_flags)
);

/*
 * jbd2_lock_buffer_stall: records the @dev and @stall_ms arguments
 * unchanged.  Printed as "dev <major>,<minor> stall_ms <stall_ms>".
 */
TRACE_EVENT(jbd2_lock_buffer_stall,

        TP_PROTO(dev_t dev, unsigned long stall_ms),

        TP_ARGS(dev, stall_ms),

        TP_STRUCT__entry(
                __field(        dev_t, dev        )
                __field(unsigned long, stall_ms        )
        ),

        TP_fast_assign(
                __entry->dev                = dev;
                __entry->stall_ms        = stall_ms;
        ),

        TP_printk("dev %d,%d stall_ms %lu",
                MAJOR(__entry->dev), MINOR(__entry->dev),
                __entry->stall_ms)
);

/*
 * jbd2_journal_shrink: event class for the shrink count / scan-enter
 * events defined below.  Records the journal's device number together
 * with the @nr_to_scan and @count arguments.
 */
DECLARE_EVENT_CLASS(jbd2_journal_shrink,

        TP_PROTO(journal_t *journal, unsigned long nr_to_scan,
                 unsigned long count),

        TP_ARGS(journal, nr_to_scan, count),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(unsigned long, nr_to_scan)
                __field(unsigned long, count)
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->nr_to_scan        = nr_to_scan;
                __entry->count                = count;
        ),

        TP_printk("dev %d,%d nr_to_scan %lu count %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->nr_to_scan, __entry->count)
);

/* jbd2_shrink_count: event of class jbd2_journal_shrink (journal, nr_to_scan, count). */
DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_count,

        TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count),

        TP_ARGS(journal, nr_to_scan, count)
);

/* jbd2_shrink_scan_enter: event of class jbd2_journal_shrink (journal, nr_to_scan, count). */
DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_scan_enter,

        TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count),

        TP_ARGS(journal, nr_to_scan, count)
);

/*
 * jbd2_shrink_scan_exit: carries the jbd2_journal_shrink fields plus
 * nr_shrunk.  Records the journal's device number and the @nr_to_scan,
 * @nr_shrunk and @count arguments.
 */
TRACE_EVENT(jbd2_shrink_scan_exit,

        TP_PROTO(journal_t *journal, unsigned long nr_to_scan,
                 unsigned long nr_shrunk, unsigned long count),

        TP_ARGS(journal, nr_to_scan, nr_shrunk, count),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(unsigned long, nr_to_scan)
                __field(unsigned long, nr_shrunk)
                __field(unsigned long, count)
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->nr_to_scan        = nr_to_scan;
                __entry->nr_shrunk        = nr_shrunk;
                __entry->count                = count;
        ),

        TP_printk("dev %d,%d nr_to_scan %lu nr_shrunk %lu count %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->nr_to_scan, __entry->nr_shrunk,
                  __entry->count)
);

/*
 * jbd2_shrink_checkpoint_list: records the journal's device number and
 * the @first_tid, @tid, @last_tid, @nr_freed and @next_tid arguments.
 * Printed as "shrink transaction <first_tid>-<tid>(<last_tid>) freed
 * <nr_freed> next transaction <next_tid>".
 */
TRACE_EVENT(jbd2_shrink_checkpoint_list,

        TP_PROTO(journal_t *journal, tid_t first_tid, tid_t tid, tid_t last_tid,
                 unsigned long nr_freed, tid_t next_tid),

        TP_ARGS(journal, first_tid, tid, last_tid, nr_freed, next_tid),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(tid_t, first_tid)
                __field(tid_t, tid)
                __field(tid_t, last_tid)
                __field(unsigned long, nr_freed)
                __field(tid_t, next_tid)
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->first_tid        = first_tid;
                __entry->tid                = tid;
                __entry->last_tid        = last_tid;
                __entry->nr_freed        = nr_freed;
                __entry->next_tid        = next_tid;
        ),

        TP_printk("dev %d,%d shrink transaction %u-%u(%u) freed %lu "
                  "next transaction %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->first_tid, __entry->tid, __entry->last_tid,
                  __entry->nr_freed, __entry->next_tid)
);

#endif /* _TRACE_JBD2_H */

/* This part must be outside protection */
#include <trace/define_trace.h>










































































































































    1 























































































    1 
    1 


































    1 

    2 
    2 


    2 



    2 

























    2 
































    2 
    1 

    1 
    1 


    2 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_IP6_ROUTE_H
#define _NET_IP6_ROUTE_H

#include <net/addrconf.h>
#include <net/flow.h>
#include <net/ip6_fib.h>
#include <net/sock.h>
#include <net/lwtunnel.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/route.h>
#include <net/nexthop.h>

/*
 * Packed option layout: three byte-sized header fields, one byte holding a
 * 2-bit route preference surrounded by reserved bits, a big-endian 32-bit
 * lifetime and a variable-length prefix.
 *
 * NOTE(review): the field set looks like the RFC 4191 Route Information
 * Option - confirm against the parser (presumably rt6_route_rcv()).
 */
struct route_info {
        __u8                        type;
        __u8                        length;
        __u8                        prefix_len;
        /*
         * The bitfield declaration order is mirrored between the two
         * bitfield endiannesses; route_pref is the middle 2-bit field in
         * both variants.
         */
#if defined(__BIG_ENDIAN_BITFIELD)
        __u8                        reserved_h:3,
                                route_pref:2,
                                reserved_l:3;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
        __u8                        reserved_l:3,
                                route_pref:2,
                                reserved_h:3;
#endif
        __be32                        lifetime;
        /* flexible array member; presumably the size is in bytes - confirm */
        __u8                        prefix[];        /* 0,8 or 16 */
};

/*
 * RT6_LOOKUP_F_* flag bits, one bit per flag.
 *
 * The three SRCPREF bits (0x08, 0x10, 0x20) are the IPV6_PREFER_SRC_TMP,
 * _PUBLIC and _COA values (0x1, 0x2, 0x4) shifted left by 3;
 * rt6_srcprefs2flags() and rt6_flags2srcprefs() depend on that layout.
 * RT6_LOOKUP_F_DST_NOREF is tested by ip6_rt_put_flags().
 */
#define RT6_LOOKUP_F_IFACE                0x00000001
#define RT6_LOOKUP_F_REACHABLE                0x00000002
#define RT6_LOOKUP_F_HAS_SADDR                0x00000004
#define RT6_LOOKUP_F_SRCPREF_TMP        0x00000008
#define RT6_LOOKUP_F_SRCPREF_PUBLIC        0x00000010
#define RT6_LOOKUP_F_SRCPREF_COA        0x00000020
#define RT6_LOOKUP_F_IGNORE_LINKSTATE        0x00000040
#define RT6_LOOKUP_F_DST_NOREF                0x00000080

/* We do not (yet ?) support IPv6 jumbograms (RFC 2675)
 * Unlike IPv4, hdr->seg_len doesn't include the IPv6 header
 */
#define IP6_MAX_MTU (0xFFFF + sizeof(struct ipv6hdr))

/*
 * rt6_srcprefs2flags() and rt6_flags2srcprefs() translate
 * between IPV6_ADDR_PREFERENCES socket option values
 *        IPV6_PREFER_SRC_TMP    = 0x1
 *        IPV6_PREFER_SRC_PUBLIC = 0x2
 *        IPV6_PREFER_SRC_COA    = 0x4
 * and above RT6_LOOKUP_F_SRCPREF_xxx flags.
 */
static inline int rt6_srcprefs2flags(unsigned int srcprefs)
{
        /* keep only the IPV6_PREFER_SRC_* bits, then move them up by 3 */
        unsigned int prefs = srcprefs & IPV6_PREFER_SRC_MASK;

        return prefs << 3;
}

/* Inverse of rt6_srcprefs2flags(): shift @flags right by 3 and keep only
 * the IPV6_PREFER_SRC_MASK bits.
 */
static inline unsigned int rt6_flags2srcprefs(int flags)
{
        int shifted = flags >> 3;

        return shifted & IPV6_PREFER_SRC_MASK;
}

/* Return true when ipv6_addr_type(@daddr) reports any of the multicast,
 * link-local or loopback type bits.
 */
static inline bool rt6_need_strict(const struct in6_addr *daddr)
{
        const unsigned int strict_types = IPV6_ADDR_MULTICAST |
                                          IPV6_ADDR_LINKLOCAL |
                                          IPV6_ADDR_LOOPBACK;

        return (ipv6_addr_type(daddr) & strict_types) != 0;
}

/* Tell whether @f6i may be coalesced into a multipath route.
 *
 * The entry qualifies only when RTF_ADDRCONF is clear, it does not use a
 * nexthop object (f6i->nh is NULL) and its fib6_nh has a non-zero gateway
 * family.
 */
static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
{
        /* the RTF_ADDRCONF test filters out RA's */
        if (f6i->fib6_flags & RTF_ADDRCONF)
                return false;

        /* fib entries using a nexthop object can not be coalesced */
        if (f6i->nh)
                return false;

        return f6i->fib6_nh->fib_nh_gw_family != 0;
}

void ip6_route_input(struct sk_buff *skb);
struct dst_entry *ip6_route_input_lookup(struct net *net,
                                         struct net_device *dev,
                                         struct flowi6 *fl6,
                                         const struct sk_buff *skb, int flags);

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
                                         struct flowi6 *fl6, int flags);

/* Convenience wrapper: ip6_route_output_flags() with @flags set to 0. */
static inline struct dst_entry *ip6_route_output(struct net *net,
                                                 const struct sock *sk,
                                                 struct flowi6 *fl6)
{
        struct dst_entry *result;

        result = ip6_route_output_flags(net, sk, fl6, 0);
        return result;
}

/* Conditionally release @rt.
 *
 * The put is skipped only when @flags carries RT6_LOOKUP_F_DST_NOREF and
 * rt->dst.rt_uncached is an empty list; in every other case ip6_rt_put()
 * is called.
 */
static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags)
{
        if ((flags & RT6_LOOKUP_F_DST_NOREF) &&
            list_empty(&rt->dst.rt_uncached))
                return;

        ip6_rt_put(rt);
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
                                   const struct sk_buff *skb, int flags);
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                               int ifindex, struct flowi6 *fl6,
                               const struct sk_buff *skb, int flags);

void ip6_route_init_special_entries(void);
int ip6_route_init(void);
void ip6_route_cleanup(void);

int ipv6_route_ioctl(struct net *net, unsigned int cmd,
                struct in6_rtmsg *rtmsg);

int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
                  struct netlink_ext_ack *extack);
int ip6_ins_rt(struct net *net, struct fib6_info *f6i);
int ip6_del_rt(struct net *net, struct fib6_info *f6i, bool skip_notify);

void rt6_flush_exceptions(struct fib6_info *f6i);
void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args,
                        unsigned long now);

/* Pick a source address for @daddr and write it to @saddr.
 *
 * When @f6i is non-NULL and has a preferred source (fib6_prefsrc.plen is
 * non-zero), that address is copied and 0 is returned.  Otherwise the
 * return value of ipv6_dev_get_saddr() is passed through; it is called
 * with the device from fib6_info_nh_dev(@f6i), or with NULL when @f6i is
 * NULL.
 */
static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i,
                                      const struct in6_addr *daddr,
                                      unsigned int prefs,
                                      struct in6_addr *saddr)
{
        struct net_device *nh_dev = NULL;

        if (f6i) {
                if (f6i->fib6_prefsrc.plen) {
                        *saddr = f6i->fib6_prefsrc.addr;
                        return 0;
                }
                nh_dev = fib6_info_nh_dev(f6i);
        }

        return ipv6_dev_get_saddr(net, nh_dev, daddr, prefs, saddr);
}

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
                            const struct in6_addr *saddr, int oif,
                            const struct sk_buff *skb, int flags);
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
                       const struct sk_buff *skb, struct flow_keys *hkeys);

struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);

void fib6_force_start_gc(struct net *net);

struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev,
                                     const struct in6_addr *addr, bool anycast,
                                     gfp_t gfp_flags, struct netlink_ext_ack *extack);

struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
                               int flags);

/*
 *        support functions for ND
 *
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
                                     const struct in6_addr *addr,
                                     struct net_device *dev);
struct fib6_info *rt6_add_dflt_router(struct net *net,
                                     const struct in6_addr *gwaddr,
                                     struct net_device *dev, unsigned int pref,
                                     u32 defrtr_usr_metric,
                                     int lifetime);

void rt6_purge_dflt_routers(struct net *net);

int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr);

void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif,
                     u32 mark, kuid_t uid);
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu);
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
                  kuid_t uid);
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif);
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk);

struct netlink_callback;

/*
 * Bundle of state for a route dump: an skb, a netlink callback, a network
 * namespace and a dump filter (embedded by value, not by pointer).
 *
 * NOTE(review): presumably this is what rt6_dump_route() receives through
 * its void *p_arg parameter - confirm against the implementation.
 */
struct rt6_rtnl_dump_arg {
        struct sk_buff *skb;
        struct netlink_callback *cb;
        struct net *net;
        struct fib_dump_filter filter;
};

int rt6_dump_route(struct fib6_info *f6i, void *p_arg, unsigned int skip);
void rt6_mtu_change(struct net_device *dev, unsigned int mtu);
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
void rt6_sync_up(struct net_device *dev, unsigned char nh_flags);
void rt6_disable_ip(struct net_device *dev, unsigned long event);
void rt6_sync_down_dev(struct net_device *dev, unsigned long event);
void rt6_multipath_rebalance(struct fib6_info *f6i);

void rt6_uncached_list_add(struct rt6_info *rt);
void rt6_uncached_list_del(struct rt6_info *rt);

/* Return dst_rt6_info() of the dst attached to @skb, or NULL when
 * skb_dst() returns NULL.
 */
static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
{
        const struct dst_entry *entry = skb_dst(skb);

        if (!entry)
                return NULL;

        return dst_rt6_info(entry);
}

/*
 *        Store a destination cache entry in a socket.
 *
 *        Saves the route cookie of @dst in the socket's ipv6_pinfo, hands
 *        @dst to sk_setup_caps(), then remembers the @daddr pointer in
 *        daddr_cache.  With CONFIG_IPV6_SUBTREES the @saddr pointer is
 *        remembered in saddr_cache as well.  Only the pointers are stored;
 *        the addresses are not copied.
 */
static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
                                 const struct in6_addr *daddr,
                                 const struct in6_addr *saddr)
{
        struct ipv6_pinfo *pinfo = inet6_sk(sk);
        struct rt6_info *route = dst_rt6_info(dst);

        pinfo->dst_cookie = rt6_get_cookie(route);
        sk_setup_caps(sk, dst);
        pinfo->daddr_cache = daddr;
#ifdef CONFIG_IPV6_SUBTREES
        pinfo->saddr_cache = saddr;
#endif
}

void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
                           const struct flowi6 *fl6);

/* Return true when the route behind @skb's dst has RTF_LOCAL set.
 * The result of skb_dst() is used without a NULL check.
 */
static inline bool ipv6_unicast_destination(const struct sk_buff *skb)
{
        const struct dst_entry *entry = skb_dst(skb);
        const struct rt6_info *route = dst_rt6_info(entry);

        return (route->rt6i_flags & RTF_LOCAL) != 0;
}

/* Decide whether @daddr is an anycast destination for the route of @dst.
 *
 * True when the route has RTF_ANYCAST set.  Otherwise true only if the
 * route prefix length is below 127, neither RTF_GATEWAY nor RTF_NONEXTHOP
 * is set, and @daddr equals the route's destination address.
 */
static inline bool ipv6_anycast_destination(const struct dst_entry *dst,
                                            const struct in6_addr *daddr)
{
        const struct rt6_info *route = dst_rt6_info(dst);

        if (route->rt6i_flags & RTF_ANYCAST)
                return true;

        if (route->rt6i_dst.plen >= 127)
                return false;

        if (route->rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP))
                return false;

        return ipv6_addr_equal(&route->rt6i_dst.addr, daddr);
}

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *));

/* Compute the MTU to use for @skb's dst.
 *
 * The socket's ipv6_pinfo is consulted only when @skb has a socket and
 * dev_recursion_level() is zero.  If its pmtudisc is at least
 * IPV6_PMTUDISC_PROBE, the result is the dst device MTU minus the
 * lwtunnel headroom; in all other cases it is dst_mtu().
 */
static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb)
{
        const struct ipv6_pinfo *pinfo = NULL;
        const struct dst_entry *dst;
        unsigned int dev_mtu;

        if (skb->sk && !dev_recursion_level())
                pinfo = inet6_sk(skb->sk);

        dst = skb_dst(skb);

        if (!pinfo || READ_ONCE(pinfo->pmtudisc) < IPV6_PMTUDISC_PROBE)
                return dst_mtu(dst);

        dev_mtu = READ_ONCE(dst->dev->mtu);
        return dev_mtu - lwtunnel_headroom(dst->lwtstate, dev_mtu);
}

/* Return false when the socket's pmtudisc setting is
 * IPV6_PMTUDISC_INTERFACE or IPV6_PMTUDISC_OMIT, true otherwise.
 */
static inline bool ip6_sk_accept_pmtu(const struct sock *sk)
{
        const u8 mode = READ_ONCE(inet6_sk(sk)->pmtudisc);

        switch (mode) {
        case IPV6_PMTUDISC_INTERFACE:
        case IPV6_PMTUDISC_OMIT:
                return false;
        default:
                return true;
        }
}

/* Return true when the socket's pmtudisc setting is IPV6_PMTUDISC_OMIT
 * or is numerically below IPV6_PMTUDISC_DO.
 */
static inline bool ip6_sk_ignore_df(const struct sock *sk)
{
        const u8 mode = READ_ONCE(inet6_sk(sk)->pmtudisc);

        if (mode == IPV6_PMTUDISC_OMIT)
                return true;

        return mode < IPV6_PMTUDISC_DO;
}

/* Select the next-hop address for @rt.
 *
 * Returns the route's gateway when RTF_GATEWAY is set, else the route's
 * own destination address when RTF_CACHE is set (annotated as the
 * unlikely case), else @daddr unchanged.
 */
static inline const struct in6_addr *rt6_nexthop(const struct rt6_info *rt,
                                                 const struct in6_addr *daddr)
{
        if (rt->rt6i_flags & RTF_GATEWAY)
                return &rt->rt6i_gateway;

        if (unlikely(rt->rt6i_flags & RTF_CACHE))
                return &rt->rt6i_dst.addr;

        return daddr;
}

/* Tell whether fib entries @a and @b have the same nexthop.
 *
 * If either entry uses a nexthop object, the answer is
 * nexthop_cmp(a->nh, b->nh).  Otherwise the embedded fib6_nh of both
 * entries must agree on device and gateway address, and
 * lwtunnel_cmp_encap() must return 0 for their encap state.
 */
static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b)
{
        struct fib6_nh *lhs, *rhs;

        if (a->nh || b->nh)
                return nexthop_cmp(a->nh, b->nh);

        lhs = a->fib6_nh;
        rhs = b->fib6_nh;

        if (lhs->fib_nh_dev != rhs->fib_nh_dev)
                return false;

        if (!ipv6_addr_equal(&lhs->fib_nh_gw6, &rhs->fib_nh_gw6))
                return false;

        return !lwtunnel_cmp_encap(lhs->fib_nh_lws, rhs->fib_nh_lws);
}

/* Compute the MTU for @dst, less the lwtunnel headroom.
 *
 * The raw RTAX_MTU metric is used when it is non-zero and either
 * @forwarding is false or the metric is locked.  Otherwise the value
 * starts at IPV6_MIN_MTU and is replaced by the inet6 device's cnf.mtu6
 * when dst->dev has an inet6_dev; that lookup runs under
 * rcu_read_lock().
 */
static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst,
                                                     bool forwarding)
{
        unsigned int mtu = 0;

        if (!forwarding || dst_metric_locked(dst, RTAX_MTU))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        /* no usable route metric: fall back to the device setting */
        if (!mtu) {
                struct inet6_dev *idev;

                mtu = IPV6_MIN_MTU;
                rcu_read_lock();
                idev = __in6_dev_get(dst->dev);
                if (idev)
                        mtu = READ_ONCE(idev->cnf.mtu6);
                rcu_read_unlock();
        }

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

u32 ip6_mtu_from_fib6(const struct fib6_result *res,
                      const struct in6_addr *daddr,
                      const struct in6_addr *saddr);

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
                                   struct net_device *dev, struct sk_buff *skb,
                                   const void *daddr);
#endif







































































































































































































































































































































    1 






    1 

















































































































    1 



















    1 




















    1 

    1 










































































































    1 
    1 
    1 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






















    1 


    1 



    1 























    1 






    1 
































    1 







    1 




















    1 


    1 

















    1 

    1 
































































    1 

















    1 









    1 





































































































































































































































































    1 





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



    1 
    1 
    1 
    1 
    1 
























    1 



    1 

    1 

    1 
















    1 





    1 
    1 


    1 



    1 
    1 



    1 



    1 














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 






    1 

























    1 



    1 


    1 

    1 

    1 

    1 

    1 

    1 


    1 


















































































































































































































































































    1 

































































    1 


    1 






    1 




    1 




















































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 

    1 













    1 


    1 



    1 



    1 



    1 



    1 

    1 



    1 








































    1 



















    1 



    1 




































































    1 











    1 


    1 







    1 





    1 


    1 





    1 







    1 

























    1 

    1 















    1 











    1 




    1 



















    1 










    1 































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    5 


    6 










    5 
    4 








    5 




































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
/*
 *  Generic process-grouping system.
 *
 *  Based originally on the cpuset system, extracted by Paul Menage
 *  Copyright (C) 2006 Google, Inc
 *
 *  Notifications support
 *  Copyright (C) 2009 Nokia Corporation
 *  Author: Kirill A. Shutemov
 *
 *  Copyright notices from the original cpuset code:
 *  --------------------------------------------------
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  ---------------------------------------------------
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "cgroup-internal.h"

#include <linux/bpf-cgroup.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/hashtable.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/psi.h>
#include <net/sock.h>

#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>

/*
 * Size bound for a cgroup file name: MAX_CGROUP_TYPE_NAMELEN plus
 * MAX_CFTYPE_NAME plus two extra bytes.
 * NOTE(review): the "+ 2" presumably covers a separator and the terminating
 * NUL - confirm against the code that builds the file names.
 */
#define CGROUP_FILE_NAME_MAX                (MAX_CGROUP_TYPE_NAMELEN +        \
                                         MAX_CFTYPE_NAME + 2)
/*
 * Let's not notify more than 100 times per second.  The interval is HZ / 100
 * rounded up, so it never evaluates to zero for a positive HZ.
 */
#define CGROUP_FILE_NOTIFY_MIN_INTV        DIV_ROUND_UP(HZ, 100)

/*
 * Constant expression that is true when at least one cgroup subsystem is
 * configured.  Code that indexes a per-subsystem array can test it first
 * so that, when CGROUP_SUBSYS_COUNT == 0, the compiler is not confused by
 * (and does not warn about) an access to what would be a 0-element array.
 */
#define CGROUP_HAS_SUBSYS_CONFIG        (CGROUP_SUBSYS_COUNT > 0)

/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * css_set_lock protects task->cgroups pointer, the list of css_set
 * objects, and the chain of tasks off each css_set.
 *
 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
 * cgroup.h can use them for lockdep annotations.
 */
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif

/*
 * Global buffer of TRACE_CGROUP_PATH_LEN bytes and the spinlock declared
 * alongside it.
 * NOTE(review): presumably the lock serializes use of the shared buffer by
 * the cgroup tracepoints - confirm at the users.
 */
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
/* NOTE(review): looks like a debug on/off switch; where it is set is not visible here */
static bool cgroup_debug __read_mostly;

/*
 * Protects cgroup_idr and css_idr so that IDs can be released without
 * grabbing cgroup_mutex.
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
 * against file removal/re-creation across css hiding.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

/*
 * Per-CPU reader/writer semaphore.
 * NOTE(review): going by its name it guards thread-group membership against
 * concurrent cgroup operations - confirm against the lock's users.
 */
DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);

/*
 * Lockdep assertion: complain with "cgroup_mutex or RCU read lock required"
 * unless the caller is inside an RCU read-side critical section or holds
 * cgroup_mutex.  Note that the expansion already ends in a semicolon.
 */
#define cgroup_assert_mutex_or_rcu_locked()                                \
        RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                        \
                           !lockdep_is_held(&cgroup_mutex),                \
                           "cgroup_mutex or RCU read lock required");

/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Use a separate workqueue so that cgroup
 * destruction work items don't end up filling up max_active of system_wq
 * which may lead to deadlock.
 */
static struct workqueue_struct *cgroup_destroy_wq;

/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of cgroup subsystem names */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
#define SUBSYS(_x)                                                                \
        DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);                        \
        DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);                        \
        EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);                        \
        EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/*
 * array of pointers to the <name>_cgrp_subsys_enabled_key static keys,
 * indexed by subsystem ID (<name>_cgrp_id); one entry is generated for each
 * SUBSYS() line expanded from linux/cgroup_subsys.h
 */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/*
 * array of pointers to the <name>_cgrp_subsys_on_dfl_key static keys,
 * indexed by subsystem ID in the same way as the array above
 */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/*
 * statically defined per-CPU rstat storage; the default root's cgroup is
 * pointed at it by the initializer below
 */
static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);

/* the default hierarchy; exported (GPL-only) for use outside this file */
struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);

/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time.  This is for backward compatibility.
 */
static bool cgrp_dfl_visible;

/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;

/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;

/* some controllers can be threaded on the default hierarchy */
static u16 cgrp_dfl_threaded_ss_mask;

/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
/* NOTE(review): presumably the number of roots on cgroup_roots - confirm at the update sites */
static int cgroup_root_count;

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);

/*
 * Assign a monotonically increasing serial number to csses.  It guarantees
 * cgroups with bigger numbers are newer than those with smaller numbers.
 * Also, as csses are always appended to the parent's ->children list, it
 * guarantees that sibling csses are always sorted in the ascending serial
 * number order on the list.  Protected by cgroup_mutex.
 */
static u64 css_serial_nr_next = 1;

/*
 * These bitmasks identify subsystems with specific features to avoid
 * having to do iterative checks repeatedly.
 */
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;

/*
 * Initial value comes from the build configuration: true iff
 * CONFIG_CGROUP_FAVOR_DYNMODS is enabled.  Marked __ro_after_init, so any
 * override has to happen during init.
 * NOTE(review): what the flag switches on is not visible in this chunk.
 */
static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);

/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
        /* NOTE(review): starts out with two references; the holders are not visible here */
        .ns.count        = REFCOUNT_INIT(2),
        /* tied to the initial user namespace */
        .user_ns        = &init_user_ns,
        .ns.ops                = &cgroupns_operations,
        /* inode number taken from the PROC_CGROUP_INIT_INO constant */
        .ns.inum        = PROC_CGROUP_INIT_INO,
        /* the namespace root is the initial css_set */
        .root_cset        = &init_css_set,
};

/*
 * Tentative definitions so that code before the initialized definitions can
 * refer to these file-local objects.
 * NOTE(review): the full definitions are expected later in this file; they
 * are not visible in this chunk.
 */
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];
static struct cftype cgroup_psi_files[];

/*
 * cgroup optional features
 *
 * OPT_FEATURE_PRESSURE only exists when CONFIG_PSI is set.
 * OPT_FEATURE_COUNT is always last, so it equals the number of features
 * compiled in (0 without CONFIG_PSI).
 */
enum cgroup_opt_features {
#ifdef CONFIG_PSI
        OPT_FEATURE_PRESSURE,
#endif
        OPT_FEATURE_COUNT
};

/*
 * name of each optional feature, listed in the same order and under the
 * same config guards as enum cgroup_opt_features
 */
static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
#ifdef CONFIG_PSI
        "pressure",
#endif
};

/* NOTE(review): presumably one bit per enum cgroup_opt_features value - confirm at the users */
static u16 cgroup_feature_disable_mask __read_mostly;

/* forward declarations of static helpers referenced before their definitions */
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
                               struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
                                              struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
                              struct cgroup *cgrp, struct cftype cfts[],
                              bool is_add);

/*
 * With CONFIG_DEBUG_CGROUP_REF, <linux/cgroup_refcnt.h> is included here
 * with CGROUP_REF_FN_ATTRS set to noinline and CGROUP_REF_EXPORT() mapped to
 * EXPORT_SYMBOL_GPL() - presumably so the refcount helpers the header
 * defines are emitted out of line and exported from this file.
 */
#ifdef CONFIG_DEBUG_CGROUP_REF
#define CGROUP_REF_FN_ATTRS        noinline
#define CGROUP_REF_EXPORT(fn)        EXPORT_SYMBOL_GPL(fn);
#include <linux/cgroup_refcnt.h>
#endif

/**
 * cgroup_ssid_enabled - test whether a subsystem is enabled, by subsys ID
 * @ssid: subsys ID of interest
 *
 * cgroup_subsys_enabled() only accepts literal subsystem names, which is
 * fine for individual subsystems but unusable from cgroup core.  This is
 * the slower, static_key_enabled() based variant indexed by @ssid.
 *
 * Return: %false when no subsystem is configured at all; otherwise the
 * state of the enable key belonging to @ssid.
 */
bool cgroup_ssid_enabled(int ssid)
{
        return CGROUP_HAS_SUBSYS_CONFIG &&
               static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}

/**
 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 * @cgrp: the cgroup of interest
 *
 * The default hierarchy is the v2 interface of cgroup and this function
 * can be used to test whether a cgroup is on the default hierarchy for
 * cases where a subsystem should behave differently depending on the
 * interface version.
 *
 * List of changed behaviors:
 *
 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
 *   and "name" are disallowed.
 *
 * - When mounting an existing superblock, mount options should match.
 *
 * - rename(2) is disallowed.
 *
 * - "tasks" is removed.  Everything should be at process granularity.  Use
 *   "cgroup.procs" instead.
 *
 * - "cgroup.procs" is not sorted.  pids will be unique unless they got
 *   recycled in-between reads.
 *
 * - "release_agent" and "notify_on_release" are removed.  Replacement
 *   notification mechanism will be implemented.
 *
 * - "cgroup.clone_children" is removed.
 *
 * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
 *   and its descendants contain no task; otherwise, 1.  The file also
 *   generates kernfs notification which can be monitored through poll and
 *   [di]notify when the value of the file changes.
 *
 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
 *   take masks of ancestors with non-empty cpus/mems, instead of being
 *   moved to an ancestor.
 *
 * - cpuset: a task can be moved into an empty cpuset, and again it takes
 *   masks of ancestors.
 *
 * - blkcg: blk-throttle becomes properly hierarchical.
 */
bool cgroup_on_dfl(const struct cgroup *cgrp)
{
        return cgrp->root == &cgrp_dfl_root;
}

/*
 * IDR wrapper which synchronizes using cgroup_idr_lock.
 *
 * Preloads with the caller's @gfp_mask, then allocates an ID in
 * [@start, @end) while holding the lock.  __GFP_DIRECT_RECLAIM is stripped
 * from the mask passed to idr_alloc() since that call runs under the
 * spinlock.  Returns whatever idr_alloc() returns.
 */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
                            gfp_t gfp_mask)
{
        const gfp_t locked_gfp = gfp_mask & ~__GFP_DIRECT_RECLAIM;
        int id;

        idr_preload(gfp_mask);

        spin_lock_bh(&cgroup_idr_lock);
        id = idr_alloc(idr, ptr, start, end, locked_gfp);
        spin_unlock_bh(&cgroup_idr_lock);

        idr_preload_end();

        return id;
}

/*
 * IDR wrapper which synchronizes using cgroup_idr_lock: install @ptr at @id
 * in @idr and hand back idr_replace()'s result.
 */
static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
        void *prev;

        spin_lock_bh(&cgroup_idr_lock);
        prev = idr_replace(idr, ptr, id);
        spin_unlock_bh(&cgroup_idr_lock);

        return prev;
}

/*
 * IDR wrapper which synchronizes using cgroup_idr_lock: drop @id from @idr
 * while holding the lock.
 */
static void cgroup_idr_remove(struct idr *idr, int id)
{
        spin_lock_bh(&cgroup_idr_lock);
        idr_remove(idr, id);
        spin_unlock_bh(&cgroup_idr_lock);
}

/* does @cgrp itself have any populated css_set attached? */
static bool cgroup_has_tasks(struct cgroup *cgrp)
{
        return cgrp->nr_populated_csets != 0;
}

/* @cgrp is threaded when its domain cgroup is something other than itself */
static bool cgroup_is_threaded(struct cgroup *cgrp)
{
        struct cgroup *dom = cgrp->dom_cgrp;

        return dom != cgrp;
}

/* can @cgrp host both domain and threaded children? */
static bool cgroup_is_mixable(struct cgroup *cgrp)
{
        /*
         * Root isn't under domain level resource control exempting it from
         * the no-internal-process constraint, so it can serve as a thread
         * root and a parent of resource domains at the same time.
         */
        return !cgroup_parent(cgrp);
}

/*
 * Can @cgrp become a thread root?  Should always be true for a thread root.
 *
 * A mixable cgroup always can.  Any other cgroup must satisfy all of:
 *  - it isn't threaded itself (domain roots can't nest under threaded),
 *  - it has no populated domain children (children are either domain or
 *    threaded, never both),
 *  - its subtree_control enables nothing but threaded controllers.
 */
static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
{
        if (cgroup_is_mixable(cgrp))
                return true;

        return !cgroup_is_threaded(cgrp) &&
               !cgrp->nr_populated_domain_children &&
               !(cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask);
}

/*
 * Is @cgrp the root of a threaded subtree?
 *
 * A thread root is always a domain, so a threaded cgroup never is one.  A
 * domain counts as a thread root when it has threaded children, or when it
 * has tasks and at least one threaded controller enabled in its
 * subtree_control.
 */
static bool cgroup_is_thread_root(struct cgroup *cgrp)
{
        if (cgroup_is_threaded(cgrp))
                return false;

        return cgrp->nr_threaded_children ||
               (cgroup_has_tasks(cgrp) &&
                (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask));
}

/*
 * A domain which isn't connected to the root without breakage can't be
 * used.
 *
 * @cgrp itself may be a thread root but must not be threaded.  Each of its
 * ancestors must be neither threaded nor a non-mixable thread root.
 */
static bool cgroup_is_valid_domain(struct cgroup *cgrp)
{
        struct cgroup *ancestor;

        if (cgroup_is_threaded(cgrp))
                return false;

        for (ancestor = cgroup_parent(cgrp); ancestor;
             ancestor = cgroup_parent(ancestor)) {
                /* a thread root among the ancestors is fine only if mixable */
                if (!cgroup_is_mixable(ancestor) &&
                    cgroup_is_thread_root(ancestor))
                        return false;

                if (cgroup_is_threaded(ancestor))
                        return false;
        }

        return true;
}

/*
 * Subsystems visibly enabled on a cgroup.
 *
 * For a non-root cgroup this is the parent's subtree_control, limited to
 * threaded controllers when @cgrp is threaded.  For a root it is the root's
 * subsys_mask, minus the inhibited and implicit controllers when the root
 * is the default hierarchy.
 */
static u16 cgroup_control(struct cgroup *cgrp)
{
        struct cgroup *parent = cgroup_parent(cgrp);
        u16 mask;

        if (!parent) {
                mask = cgrp->root->subsys_mask;
                if (cgroup_on_dfl(cgrp))
                        mask &= ~(cgrp_dfl_inhibit_ss_mask |
                                  cgrp_dfl_implicit_ss_mask);
                return mask;
        }

        mask = parent->subtree_control;
        /* threaded cgroups can only have threaded controllers */
        if (cgroup_is_threaded(cgrp))
                mask &= cgrp_dfl_threaded_ss_mask;

        return mask;
}

/*
 * Subsystems enabled on a cgroup.
 *
 * A root reports its hierarchy's subsys_mask.  Any other cgroup reports
 * the parent's subtree_ss_mask, limited to threaded controllers when @cgrp
 * is threaded.
 */
static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
        struct cgroup *parent = cgroup_parent(cgrp);
        u16 mask;

        if (!parent)
                return cgrp->root->subsys_mask;

        mask = parent->subtree_ss_mask;
        /* threaded cgroups can only have threaded controllers */
        if (cgroup_is_threaded(cgrp))
                mask &= cgrp_dfl_threaded_ss_mask;

        return mask;
}

/**
 * cgroup_css - look up a cgroup's css for a given subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Returns the css (cgroup_subsys_state) @cgrp has for @ss.  Callers must
 * hold either cgroup_mutex or rcu_read_lock(), and must pin the returned
 * css themselves if they want to keep using it after dropping those.  The
 * result may be %NULL when @cgrp doesn't have @ss enabled.
 */
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
                                              struct cgroup_subsys *ss)
{
        /* no subsystem requested (or none configured): the cgroup's own css */
        if (!CGROUP_HAS_SUBSYS_CONFIG || !ss)
                return &cgrp->self;

        return rcu_dereference_check(cgrp->subsys[ss->id],
                                     lockdep_is_held(&cgroup_mutex));
}

/**
 * cgroup_e_css_by_mask - look up a cgroup's effective css via ss masks
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Like cgroup_css() but yields the effective css: the matching css of the
 * nearest ancestor, @cgrp included, that has @ss enabled.  As long as @ss
 * is associated with the hierarchy @cgrp is on, the result is guaranteed
 * to be non-NULL.  %NULL is returned when no cgroup up to and including
 * the root has @ss in its ss mask.
 *
 * Must be called with cgroup_mutex held.
 */
static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
                                                        struct cgroup_subsys *ss)
{
        lockdep_assert_held(&cgroup_mutex);

        if (!ss)
                return &cgrp->self;

        /*
         * This runs while css associations are being updated, so the csses
         * themselves can't be tested.  Walk up using the ss masks instead.
         */
        do {
                if (cgroup_ss_mask(cgrp) & (1 << ss->id))
                        return cgroup_css(cgrp, ss);

                cgrp = cgroup_parent(cgrp);
        } while (cgrp);

        return NULL;
}

/**
 * cgroup_e_css - look up a cgroup's effective css for a subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find the effective css of @cgrp for @ss, i.e. the matching css of the
 * nearest ancestor, @cgrp included, which has @ss enabled.  When @ss is not
 * mounted on the hierarchy @cgrp is on, the root css taken from
 * init_css_set is returned instead, so a valid css always comes back
 * (except that %NULL is returned when no subsystem is configured at all).
 *
 * No reference is taken here.  The returned css is not guaranteed to be
 * online either, so it is the caller's responsibility to try to get a
 * reference to it.
 */
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
                                         struct cgroup_subsys *ss)
{
        struct cgroup *pos = cgrp;

        if (!CGROUP_HAS_SUBSYS_CONFIG)
                return NULL;

        for (;;) {
                struct cgroup_subsys_state *css = cgroup_css(pos, ss);

                if (css)
                        return css;

                pos = cgroup_parent(pos);
                if (!pos)
                        break;
        }

        /* nothing on the path to the root: fall back to the root css */
        return init_css_set.subsys[ss->id];
}

/**
 * cgroup_get_e_css - get a reference to a cgroup's effective css
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get the effective css of @cgrp for @ss: the matching css of the
 * nearest ancestor, @cgrp included, which has @ss enabled and on which an
 * online reference could be obtained.  If none is found on the way up to
 * the root, the root css from init_css_set is used and a reference is
 * taken on it, so a valid css always comes back (except that %NULL is
 * returned when no subsystem is configured at all).
 *
 * The returned css must be put using css_put().
 */
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
                                             struct cgroup_subsys *ss)
{
        struct cgroup_subsys_state *css;

        if (!CGROUP_HAS_SUBSYS_CONFIG)
                return NULL;

        rcu_read_lock();

        for (;;) {
                css = cgroup_css(cgrp, ss);
                if (css && css_tryget_online(css))
                        break;

                cgrp = cgroup_parent(cgrp);
                if (!cgrp) {
                        /* ran past the root: pin the root css instead */
                        css = init_css_set.subsys[ss->id];
                        css_get(css);
                        break;
                }
        }

        rcu_read_unlock();
        return css;
}
EXPORT_SYMBOL_GPL(cgroup_get_e_css);

/*
 * Take a reference on @cgrp, which the caller expects to be alive.  A dead
 * cgroup triggers a one-time warning; the reference is taken regardless.
 */
static void cgroup_get_live(struct cgroup *cgrp)
{
        WARN_ON_ONCE(cgroup_is_dead(cgrp));
        cgroup_get(cgrp);
}

/**
 * __cgroup_task_count - count the number of tasks in a cgroup. The caller
 * is responsible for taking the css_set_lock.
 * @cgrp: the cgroup in question
 */
int __cgroup_task_count(const struct cgroup *cgrp)
{
        int count = 0;
        struct cgrp_cset_link *link;

        lockdep_assert_held(&css_set_lock);

        list_for_each_entry(link, &cgrp->cset_links, cset_link)
                count += link->cset->nr_tasks;

        return count;
}

/**
 * cgroup_task_count - count the number of tasks in a cgroup.
 * @cgrp: the cgroup in question
 *
 * Locked wrapper around __cgroup_task_count(): takes css_set_lock (with
 * IRQs disabled) around the count.
 */
int cgroup_task_count(const struct cgroup *cgrp)
{
        int nr_tasks;

        spin_lock_irq(&css_set_lock);
        nr_tasks = __cgroup_task_count(cgrp);
        spin_unlock_irq(&css_set_lock);

        return nr_tasks;
}

/*
 * of_css - css an open kernfs file belongs to
 *
 * The cgroup is taken from the priv pointer of the file's parent kernfs
 * node.  If the file's cftype names a subsystem, the cgroup's css for that
 * subsystem is returned; otherwise the cgroup's own css is.
 */
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
        struct cftype *cft = of_cft(of);
        struct cgroup *cgrp = of->kn->parent->priv;

        if (!CGROUP_HAS_SUBSYS_CONFIG || !cft->ss)
                return &cgrp->self;

        /*
         * This is an open-coded, unprotected variant of cgroup_css().
         * of_css() is only called from a kernfs file operation which has
         * an active reference on the file.  Because all the subsystem
         * files are drained before a css is disassociated with a cgroup,
         * the matching css from the cgroup's subsys table is guaranteed to
         * be and stay valid until the enclosing operation is complete.
         */
        return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
}
EXPORT_SYMBOL_GPL(of_css);

/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_mutex.
 *
 * Subsystem slots whose css pointer is NULL are skipped.  The macro ends in
 * a dangling "else", so the loop body is the statement (or block) that
 * directly follows the macro invocation.
 */
#define for_each_css(css, ssid, cgrp)                                        \
        for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)        \
                if (!((css) = rcu_dereference_check(                        \
                                (cgrp)->subsys[(ssid)],                        \
                                lockdep_is_held(&cgroup_mutex)))) { }        \
                else

/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * The block will only run for cases where the ssid-th bit (1 << ssid) of
 * @ss_mask is set.
 *
 * Must be closed with while_each_subsys_mask(): this macro opens a
 * do { ... } while (false) wrapper, a for_each_set_bit() loop and an inner
 * block which the closing macro terminates.  @ss_mask is evaluated once
 * into a local.  When no subsystem is configured the body is skipped
 * entirely and @ssid is set to 0.
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {                        \
        unsigned long __ss_mask = (ss_mask);                                \
        if (!CGROUP_HAS_SUBSYS_CONFIG) {                                \
                (ssid) = 0;                                                \
                break;                                                        \
        }                                                                \
        for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {        \
                (ss) = cgroup_subsys[ssid];                                \
                {

/* closes the block, loop and do-while opened by do_each_subsys_mask() */
#define while_each_subsys_mask()                                        \
                }                                                        \
        }                                                                \
} while (false)

/*
 * iterate over child cgrps, lock should be held throughout iteration
 *
 * Walks @cgrp's ->self.children list and skips children for which
 * cgroup_is_dead() is true.  Holding cgroup_mutex is asserted on every
 * step.  The macro ends in a dangling "else": the loop body is the
 * statement that follows the invocation.
 */
#define cgroup_for_each_live_child(child, cgrp)                                \
        list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
                if (({ lockdep_assert_held(&cgroup_mutex);                \
                       cgroup_is_dead(child); }))                        \
                        ;                                                \
                else

/*
 * walk live descendants in pre order
 *
 * @d_css is the css cursor of the underlying css_for_each_descendant_pre()
 * walk rooted at @cgrp's own css; @dsct is set to the cgroup of each
 * visited css.  Dead cgroups are skipped and cgroup_mutex is asserted on
 * every step.  The body is the statement following the invocation.
 */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)                \
        css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))        \
                if (({ lockdep_assert_held(&cgroup_mutex);                \
                       (dsct) = (d_css)->cgroup;                        \
                       cgroup_is_dead(dsct); }))                        \
                        ;                                                \
                else

/*
 * walk live descendants in postorder
 *
 * @d_css is the css cursor of the underlying css_for_each_descendant_post()
 * walk rooted at @cgrp's own css; @dsct is set to the cgroup of each
 * visited css.  Dead cgroups are skipped and cgroup_mutex is asserted on
 * every step.  The body is the statement following the invocation.
 */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)                \
        css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL))        \
                if (({ lockdep_assert_held(&cgroup_mutex);                \
                       (dsct) = (d_css)->cgroup;                        \
                       cgroup_is_dead(dsct); }))                        \
                        ;                                                \
                else

/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 *
 * ->dom_cset points back at the set itself, so css_set_threaded() is false
 * for it.  All list heads start out empty.
 */
struct css_set init_css_set = {
        .refcount                = REFCOUNT_INIT(1),
        .dom_cset                = &init_css_set,
        .tasks                        = LIST_HEAD_INIT(init_css_set.tasks),
        .mg_tasks                = LIST_HEAD_INIT(init_css_set.mg_tasks),
        .dying_tasks                = LIST_HEAD_INIT(init_css_set.dying_tasks),
        .task_iters                = LIST_HEAD_INIT(init_css_set.task_iters),
        .threaded_csets                = LIST_HEAD_INIT(init_css_set.threaded_csets),
        .cgrp_links                = LIST_HEAD_INIT(init_css_set.cgrp_links),
        .mg_src_preload_node        = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
        .mg_dst_preload_node        = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
        .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),

        /*
         * The following field is re-initialized when this cset gets linked
         * in cgroup_init().  However, let's initialize the field
         * statically too so that the default cgroup can be accessed safely
         * early during boot.
         */
        .dfl_cgrp                = &cgrp_dfl_root.cgrp,
};

/* number of css_sets in existence; decremented in put_css_set_locked() */
static int css_set_count        = 1;        /* 1 for init_css_set */

/* a css_set is threaded when its domain cset is a different css_set */
static bool css_set_threaded(struct css_set *cset)
{
        struct css_set *dom = cset->dom_cset;

        return dom != cset;
}

/**
 * css_set_populated - does a css_set contain any tasks?
 * @cset: target css_set
 *
 * css_set_populated() should be the same as !!cset->nr_tasks at steady
 * state. However, css_set_populated() can be called while a task is being
 * added to or removed from the linked list before the nr_tasks is
 * properly updated. Hence, we can't just look at ->nr_tasks here.
 */
static bool css_set_populated(struct css_set *cset)
{
        lockdep_assert_held(&css_set_lock);

        return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
}

/**
 * cgroup_update_populated - update the populated count of a cgroup
 * @cgrp: the target cgroup
 * @populated: inc or dec populated count
 *
 * One of the css_sets associated with @cgrp is either getting its first
 * task or losing its last one.  Adjust @cgrp->nr_populated_csets, then
 * propagate towards the root: every ancestor whose child just flipped
 * state gets its nr_populated_threaded_children or
 * nr_populated_domain_children adjusted, depending on whether that child is
 * threaded.  Propagation stops at the first cgroup whose own populated
 * state does not change.
 *
 * For each cgroup whose populated state does change,
 * cgroup1_check_for_release() is invoked, a notify_populated trace event
 * is emitted and the cgroup's events file is notified so that userland
 * can detect when @cgrp and its descendants become populated or empty.
 *
 * Must be called with css_set_lock held.
 */
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
        const int delta = populated ? 1 : -1;
        struct cgroup *from = NULL;        /* cgroup the change came up from */

        lockdep_assert_held(&css_set_lock);

        do {
                bool before = cgroup_is_populated(cgrp);

                if (!from)
                        cgrp->nr_populated_csets += delta;
                else if (cgroup_is_threaded(from))
                        cgrp->nr_populated_threaded_children += delta;
                else
                        cgrp->nr_populated_domain_children += delta;

                /* state unchanged here means it is unchanged further up too */
                if (cgroup_is_populated(cgrp) == before)
                        break;

                cgroup1_check_for_release(cgrp);
                TRACE_CGROUP_PATH(notify_populated, cgrp,
                                  cgroup_is_populated(cgrp));
                cgroup_file_notify(&cgrp->events_file);

                from = cgrp;
                cgrp = cgroup_parent(cgrp);
        } while (cgrp);
}

/**
 * css_set_update_populated - update populated state of a css_set
 * @cset: target css_set
 * @populated: whether @cset is populated or depopulated
 *
 * @cset is either getting its first task or losing its last.  Run
 * cgroup_update_populated() on every cgroup @cset is linked to.
 *
 * Must be called with css_set_lock held.
 */
static void css_set_update_populated(struct css_set *cset, bool populated)
{
        struct cgrp_cset_link *pos;

        lockdep_assert_held(&css_set_lock);

        list_for_each_entry(pos, &cset->cgrp_links, cgrp_link) {
                cgroup_update_populated(pos->cgrp, populated);
        }
}

/*
 * @task is leaving, advance task iterators which are pointing to it so
 * that they can resume at the next position.  Advancing an iterator might
 * remove it from the list, use safe walk.  See css_task_iter_skip() for
 * details.
 */
static void css_set_skip_task_iters(struct css_set *cset,
                                    struct task_struct *task)
{
        struct css_task_iter *it, *pos;

        list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
                css_task_iter_skip(it, task);
}

/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
 * @from_cset: css_set @task currently belongs to (may be NULL)
 * @to_cset: new css_set @task is being moved to (may be NULL)
 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 *
 * Move @task from @from_cset to @to_cset.  @from_cset may be NULL when
 * @task didn't belong to any css_set, and @to_cset may be NULL when @task
 * is being disassociated rather than moved.
 *
 * Populated counter updates and css_task_iter adjustments are handled
 * here; managing the reference counts of @from_cset and @to_cset is left
 * to the caller.  Must be called with css_set_lock held.
 */
static void css_set_move_task(struct task_struct *task,
                              struct css_set *from_cset, struct css_set *to_cset,
                              bool use_mg_tasks)
{
        lockdep_assert_held(&css_set_lock);

        /* the destination is about to receive its first task */
        if (to_cset && !css_set_populated(to_cset))
                css_set_update_populated(to_cset, true);

        if (!from_cset) {
                WARN_ON_ONCE(!list_empty(&task->cg_list));
        } else {
                WARN_ON_ONCE(list_empty(&task->cg_list));

                css_set_skip_task_iters(from_cset, task);
                list_del_init(&task->cg_list);

                /* that was the source's last task */
                if (!css_set_populated(from_cset))
                        css_set_update_populated(from_cset, false);
        }

        if (!to_cset)
                return;

        /*
         * We are synchronized through cgroup_threadgroup_rwsem
         * against PF_EXITING setting such that we can't race
         * against cgroup_exit()/cgroup_free() dropping the css_set.
         */
        WARN_ON_ONCE(task->flags & PF_EXITING);

        cgroup_move_task(task, to_cset);

        if (use_mg_tasks)
                list_add_tail(&task->cg_list, &to_cset->mg_tasks);
        else
                list_add_tail(&task->cg_list, &to_cset->tasks);
}

/*
 * hash table for cgroup groups. This improves the performance to find
 * an existing css_set. This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 *
 * Entries are looked up with the key computed by css_set_hash() and are
 * removed in put_css_set_locked().
 */
#define CSS_SET_HASH_BITS        7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

static unsigned long css_set_hash(struct cgroup_subsys_state **css)
{
        unsigned long key = 0UL;
        struct cgroup_subsys *ss;
        int i;

        for_each_subsys(ss, i)
                key += (unsigned long)css[i];
        key = (key >> 16) ^ key;

        return key;
}

/*
 * Drop a reference on @cset with css_set_lock already held.  When the last
 * reference goes away the css_set is torn down: it is unlinked from the
 * per-subsystem e_cset lists and the hash table, its css references are
 * put, all of its cgrp_cset_links are freed and the structure itself is
 * released through kfree_rcu().
 */
void put_css_set_locked(struct css_set *cset)
{
        struct cgrp_cset_link *link, *tmp_link;
        struct cgroup_subsys *ss;
        int ssid;

        lockdep_assert_held(&css_set_lock);

        /* not the last reference - nothing more to do */
        if (!refcount_dec_and_test(&cset->refcount))
                return;

        WARN_ON_ONCE(!list_empty(&cset->threaded_csets));

        /* This css_set is dead. Unlink it and release cgroup and css refs */
        for_each_subsys(ss, ssid) {
                list_del(&cset->e_cset_node[ssid]);
                css_put(cset->subsys[ssid]);
        }
        hash_del(&cset->hlist);
        css_set_count--;

        list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
                list_del(&link->cset_link);
                list_del(&link->cgrp_link);
                /* link_css_set() only takes a ref on cgroups with a parent */
                if (cgroup_parent(link->cgrp))
                        cgroup_put(link->cgrp);
                kfree(link);
        }

        /* a threaded cset also drops its reference on the domain cset */
        if (css_set_threaded(cset)) {
                list_del(&cset->threaded_csets_node);
                put_css_set_locked(cset->dom_cset);
        }

        kfree_rcu(cset, rcu_head);
}

/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 *
 * Three things are checked in turn: the css pointers against @template,
 * the domain cgroup on the default hierarchy, and finally the cgroup of
 * every hierarchy via the two sets' cgrp_links lists.
 */
static bool compare_css_sets(struct css_set *cset,
                             struct css_set *old_cset,
                             struct cgroup *new_cgrp,
                             struct cgroup_subsys_state *template[])
{
        struct list_head *cand_pos = &cset->cgrp_links;
        struct list_head *old_pos = &old_cset->cgrp_links;
        struct cgroup *new_dfl_cgrp;

        /*
         * On the default hierarchy, there can be csets which are
         * associated with the same set of cgroups but different csses.
         * Make sure the csses match before anything else.
         */
        if (memcmp(template, cset->subsys, sizeof(cset->subsys)) != 0)
                return false;

        /* @cset's domain should match the default cgroup's */
        new_dfl_cgrp = cgroup_on_dfl(new_cgrp) ? new_cgrp : old_cset->dfl_cgrp;
        if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
                return false;

        /*
         * Compare cgroup pointers in order to distinguish between
         * different cgroups in hierarchies.  As different cgroups may
         * share the same effective css, this comparison is always
         * necessary.
         */
        for (;;) {
                struct cgroup *cand_cgrp, *old_cgrp, *wanted;

                cand_pos = cand_pos->next;
                old_pos = old_pos->next;

                /* Both lists have the same length and must end together. */
                if (cand_pos == &cset->cgrp_links) {
                        BUG_ON(old_pos != &old_cset->cgrp_links);
                        return true;
                }
                BUG_ON(old_pos == &old_cset->cgrp_links);

                /* Locate the cgroups associated with these links. */
                cand_cgrp = list_entry(cand_pos, struct cgrp_cset_link,
                                       cgrp_link)->cgrp;
                old_cgrp = list_entry(old_pos, struct cgrp_cset_link,
                                      cgrp_link)->cgrp;

                /* Hierarchies should be linked in the same order. */
                BUG_ON(cand_cgrp->root != old_cgrp->root);

                /*
                 * On the hierarchy of the cgroup that's changing, the
                 * candidate has to point to the new cgroup; on any other
                 * hierarchy it has to point to the same cgroup as the old
                 * css_set does.
                 */
                wanted = (cand_cgrp->root == new_cgrp->root) ? new_cgrp
                                                             : old_cgrp;
                if (cand_cgrp != wanted)
                        return false;
        }
}

/**
 * find_existing_css_set - init css array and find the matching css_set
 * @old_cset: the css_set that we're using before the cgroup transition
 * @cgrp: the cgroup that we're moving into
 * @template: out param for the new set of csses, should be clear on entry
 *
 * Fills @template with the csses the wanted css_set must have, then
 * searches css_set_table for a css_set that compare_css_sets() accepts.
 *
 * Return: the matching css_set, or %NULL if there is none.  No reference
 * is taken here.
 */
static struct css_set *find_existing_css_set(struct css_set *old_cset,
                                        struct cgroup *cgrp,
                                        struct cgroup_subsys_state **template)
{
        struct cgroup_root *root = cgrp->root;
        struct cgroup_subsys *ss;
        struct css_set *cand;
        unsigned long key;
        int ssid;

        /*
         * Build the set of subsystem state objects that we want to see in the
         * new css_set. While subsystems can change globally, the entries here
         * won't change, so no need for locking.
         *
         * A subsystem that is part of @cgrp's hierarchy contributes the
         * effective css of @cgrp; any other subsystem keeps the css
         * @old_cset already has.
         */
        for_each_subsys(ss, ssid) {
                if (root->subsys_mask & (1UL << ssid))
                        template[ssid] = cgroup_e_css_by_mask(cgrp, ss);
                else
                        template[ssid] = old_cset->subsys[ssid];
        }

        key = css_set_hash(template);
        hash_for_each_possible(css_set_table, cand, hlist, key) {
                /* This css_set matches what we need */
                if (compare_css_sets(cand, old_cset, cgrp, template))
                        return cand;
        }

        /* No existing cgroup group matched */
        return NULL;
}

static void free_cgrp_cset_links(struct list_head *links_to_free)
{
        struct cgrp_cset_link *link, *tmp_link;

        list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
                list_del(&link->cset_link);
                kfree(link);
        }
}

/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * Initializes @tmp_links and chains @count zeroed cgrp_cset_link
 * structures on it through ->cset_link.  If any allocation fails, the
 * links allocated so far are freed again.
 *
 * Return: 0 on success or -ENOMEM.
 */
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
        int remaining;

        INIT_LIST_HEAD(tmp_links);

        for (remaining = count; remaining > 0; remaining--) {
                struct cgrp_cset_link *link;

                link = kzalloc(sizeof(*link), GFP_KERNEL);
                if (!link) {
                        /* undo the partial allocation */
                        free_cgrp_cset_links(tmp_links);
                        return -ENOMEM;
                }

                list_add(&link->cset_link, tmp_links);
        }

        return 0;
}

/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 * @cset: the css_set to be linked
 * @cgrp: the destination cgroup
 *
 * Consumes the first link on @tmp_links (which must not be empty) and uses
 * it to tie @cset and @cgrp together.  If @cgrp is on the default
 * hierarchy it also becomes @cset->dfl_cgrp.  A reference is taken on
 * @cgrp unless it is a root (has no parent).
 */
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
                         struct cgroup *cgrp)
{
        struct cgrp_cset_link *new_link;

        BUG_ON(list_empty(tmp_links));

        new_link = list_first_entry(tmp_links, struct cgrp_cset_link,
                                    cset_link);
        new_link->cset = cset;
        new_link->cgrp = cgrp;

        if (cgroup_on_dfl(cgrp))
                cset->dfl_cgrp = cgrp;

        /*
         * Links always go to the tail of both lists so that the lists
         * stay in chronological order.
         */
        list_move_tail(&new_link->cset_link, &cgrp->cset_links);
        list_add_tail(&new_link->cgrp_link, &cset->cgrp_links);

        if (cgroup_parent(cgrp))
                cgroup_get_live(cgrp);
}

/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 * substituted into the appropriate hierarchy.
 *
 * A matching css_set that is already hashed in css_set_table is reused;
 * otherwise a new one is allocated, linked to its cgroups and hashed.
 * Either way the returned css_set carries a reference for the caller.
 * Returns %NULL if a memory allocation fails.  Must be called with
 * cgroup_mutex held.
 */
static struct css_set *find_css_set(struct css_set *old_cset,
                                    struct cgroup *cgrp)
{
        /* filled in by find_existing_css_set() and reused for the new cset */
        struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
        struct css_set *cset;
        struct list_head tmp_links;
        struct cgrp_cset_link *link;
        struct cgroup_subsys *ss;
        unsigned long key;
        int ssid;

        lockdep_assert_held(&cgroup_mutex);

        /*
         * First see if we already have a css_set that matches the
         * desired set.  The reference is taken under css_set_lock.
         */
        spin_lock_irq(&css_set_lock);
        cset = find_existing_css_set(old_cset, cgrp, template);
        if (cset)
                get_css_set(cset);
        spin_unlock_irq(&css_set_lock);

        if (cset)
                return cset;

        /*
         * No match: build a new css_set.  The GFP_KERNEL allocations are
         * done here, before css_set_lock is taken again.
         */
        cset = kzalloc(sizeof(*cset), GFP_KERNEL);
        if (!cset)
                return NULL;

        /* Allocate all the cgrp_cset_link objects that we'll need */
        if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
                kfree(cset);
                return NULL;
        }

        /* the initial reference belongs to the caller */
        refcount_set(&cset->refcount, 1);
        /* a cset is its own domain cset unless it turns out to be threaded */
        cset->dom_cset = cset;
        INIT_LIST_HEAD(&cset->tasks);
        INIT_LIST_HEAD(&cset->mg_tasks);
        INIT_LIST_HEAD(&cset->dying_tasks);
        INIT_LIST_HEAD(&cset->task_iters);
        INIT_LIST_HEAD(&cset->threaded_csets);
        INIT_HLIST_NODE(&cset->hlist);
        INIT_LIST_HEAD(&cset->cgrp_links);
        INIT_LIST_HEAD(&cset->mg_src_preload_node);
        INIT_LIST_HEAD(&cset->mg_dst_preload_node);
        INIT_LIST_HEAD(&cset->mg_node);

        /*
         * Copy the set of subsystem state objects generated in
         * find_existing_css_set()
         */
        memcpy(cset->subsys, template, sizeof(cset->subsys));

        spin_lock_irq(&css_set_lock);
        /*
         * Add reference counts and links from the new css_set: mirror the
         * links of @old_cset, substituting @cgrp for the cgroup that is on
         * @cgrp's hierarchy.
         */
        list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
                struct cgroup *c = link->cgrp;

                if (c->root == cgrp->root)
                        c = cgrp;
                link_css_set(&tmp_links, cset, c);
        }

        /* every preallocated link is expected to have been consumed */
        BUG_ON(!list_empty(&tmp_links));

        css_set_count++;

        /* Add @cset to the hash table */
        key = css_set_hash(cset->subsys);
        hash_add(css_set_table, &cset->hlist, key);

        /*
         * Put @cset on the e_csets list of the cgroup owning each of its
         * csses and take a reference on that css.
         */
        for_each_subsys(ss, ssid) {
                struct cgroup_subsys_state *css = cset->subsys[ssid];

                list_add_tail(&cset->e_cset_node[ssid],
                              &css->cgroup->e_csets[ssid]);
                css_get(css);
        }

        spin_unlock_irq(&css_set_lock);

        /*
         * If @cset should be threaded, look up the matching dom_cset and
         * link them up.  We first fully initialize @cset then look for the
         * dom_cset.  It's simpler this way and safe as @cset is guaranteed
         * to stay empty until we return.
         */
        if (cgroup_is_threaded(cset->dfl_cgrp)) {
                struct css_set *dcset;

                /* recursive lookup (or creation) for the domain cgroup */
                dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
                if (!dcset) {
                        /* drop the initial reference taken above */
                        put_css_set(cset);
                        return NULL;
                }

                spin_lock_irq(&css_set_lock);
                cset->dom_cset = dcset;
                list_add_tail(&cset->threaded_csets_node,
                              &dcset->threaded_csets);
                spin_unlock_irq(&css_set_lock);
        }

        return cset;
}

struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
        struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv;

        return root_cgrp->root;
}

/*
 * Switch %CGRP_ROOT_FAVOR_DYNMODS on @root on or off according to @favor.
 * The flag and the rcu_sync state of cgroup_threadgroup_rwsem are only
 * touched when the requested state differs from the current one.
 */
void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
{
        bool currently_favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;

        if (favor == currently_favoring)
                return;

        /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
        if (favor) {
                rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
                root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
        } else {
                rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
                root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
        }
}

/*
 * Allocate a hierarchy ID for @root from cgroup_hierarchy_idr and store it
 * in @root->hierarchy_id.  Returns 0 on success or the negative error from
 * idr_alloc_cyclic().  Requires cgroup_mutex.
 */
static int cgroup_init_root_id(struct cgroup_root *root)
{
        int ret;

        lockdep_assert_held(&cgroup_mutex);

        ret = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
        if (ret >= 0) {
                root->hierarchy_id = ret;
                ret = 0;
        }

        return ret;
}

/*
 * Give @root's hierarchy ID back to cgroup_hierarchy_idr.  Requires
 * cgroup_mutex.
 */
static void cgroup_exit_root_id(struct cgroup_root *root)
{
        int hierarchy_id;

        lockdep_assert_held(&cgroup_mutex);

        hierarchy_id = root->hierarchy_id;
        idr_remove(&cgroup_hierarchy_idr, hierarchy_id);
}

/*
 * Free @root through kfree_rcu(), using the rcu member embedded in the
 * cgroup_root, rather than with an immediate kfree().
 */
void cgroup_free_root(struct cgroup_root *root)
{
        kfree_rcu(root, rcu);
}

/*
 * Tear down the hierarchy @root.
 *
 * The subsystems bound to @root are rebound to the default hierarchy, the
 * css_set links of its root cgroup are freed, the root is taken off the
 * root list and its hierarchy ID is released.  After cgroup_unlock() the
 * rstat state, the kernfs root and finally @root itself are destroyed.
 *
 * BUG()s if @root->nr_cgrps is non-zero or if the root cgroup still has
 * children.
 */
static void cgroup_destroy_root(struct cgroup_root *root)
{
        struct cgroup *cgrp = &root->cgrp;
        struct cgrp_cset_link *link, *tmp_link;

        trace_cgroup_destroy_root(root);

        /* paired with the cgroup_unlock() further down */
        cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

        BUG_ON(atomic_read(&root->nr_cgrps));
        BUG_ON(!list_empty(&cgrp->self.children));

        /* Rebind all subsystems back to the default hierarchy */
        WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));

        /*
         * Release all the links from cset_links to this hierarchy's
         * root cgroup
         */
        spin_lock_irq(&css_set_lock);

        list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
                /* unhook from both the cgroup's and the css_set's list */
                list_del(&link->cset_link);
                list_del(&link->cgrp_link);
                kfree(link);
        }

        spin_unlock_irq(&css_set_lock);

        /* an empty root_list node is unexpected here and is only warned about */
        WARN_ON_ONCE(list_empty(&root->root_list));
        list_del_rcu(&root->root_list);
        cgroup_root_count--;

        /* drop this root's favordynmods state unless have_favordynmods is set */
        if (!have_favordynmods)
                cgroup_favor_dynmods(root, false);

        cgroup_exit_root_id(root);

        cgroup_unlock();

        cgroup_rstat_exit(cgrp);
        kernfs_destroy_root(root->kf_root);
        cgroup_free_root(root);
}

/*
 * Find the cgroup that @cset is associated with on the hierarchy @root.
 *
 * Returned cgroup is without refcount but it's valid as long as cset pins it.
 *
 * The result can be NULL: if cgroup_mutex is not held, the cgrp_cset_link
 * will be freed before the cgroup root is removed from the root_list, so a
 * lookup on such a root may no longer find its link.  Holding cgroup_mutex
 * guarantees a non-NULL result; callers that don't hold it must do the
 * NULL check themselves.
 */
static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
                                            struct cgroup_root *root)
{
        struct cgrp_cset_link *link;

        /* init_css_set maps to the root cgroup of whatever hierarchy is asked for */
        if (cset == &init_css_set)
                return &root->cgrp;

        /* the default hierarchy's cgroup is cached in the css_set */
        if (root == &cgrp_dfl_root)
                return cset->dfl_cgrp;

        /* any other hierarchy requires walking the css_set's links */
        lockdep_assert_held(&css_set_lock);

        list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
                if (link->cgrp->root == root)
                        return link->cgrp;
        }

        return NULL;
}

/*
 * Look up the cgroup that the root css_set of current's cgroup namespace
 * is associated with on the hierarchy @root.  Must be called with
 * css_set_lock held.
 */
static struct cgroup *
current_cgns_cgroup_from_root(struct cgroup_root *root)
{
        struct cgroup *ns_cgrp;

        lockdep_assert_held(&css_set_lock);

        rcu_read_lock();
        ns_cgrp = __cset_cgroup_from_root(current->nsproxy->cgroup_ns->root_cset,
                                          root);
        rcu_read_unlock();

        /*
         * current holds the namespace_sem, so the root cgroup can't be
         * umounted and the lookup is expected to succeed.
         */
        WARN_ON_ONCE(!ns_cgrp);

        return ns_cgrp;
}

/*
 * Look up the cgroup that current's cgroup namespace is rooted at on the
 * default hierarchy.
 *
 * In contrast to current_cgns_cgroup_from_root(), no locking is needed:
 * - No rcu pointers are dereferenced, so an internal rcu_read_lock is
 *   unnecessary.
 * - Only cset->dfl_cgrp is read, which doesn't need css_set_lock.
 * - As a bonus, the returned cgrp is pinned with current because current
 *   cannot switch cgroup_ns asynchronously.
 */
static struct cgroup *current_cgns_cgroup_dfl(void)
{
        /*
         * NOTE: This function may be called from bpf_cgroup_from_id()
         * on a task which has already passed exit_task_namespaces() and
         * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all
         * cgroups visible for lookups.
         */
        if (!current->nsproxy)
                return &cgrp_dfl_root.cgrp;

        return __cset_cgroup_from_root(current->nsproxy->cgroup_ns->root_cset,
                                       &cgrp_dfl_root);
}

/*
 * Locked variant of __cset_cgroup_from_root(): returns the cgroup @cset is
 * associated with on hierarchy @root and asserts that css_set_lock is held.
 */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
                                            struct cgroup_root *root)
{
        struct cgroup *found;

        lockdep_assert_held(&css_set_lock);

        found = __cset_cgroup_from_root(cset, root);
        return found;
}

/*
 * Return the cgroup that "task" belongs to on the hierarchy "root".
 *
 * css_set_lock must be held so that the task's groups can't be modified,
 * and either cgroup_mutex or the rcu read lock must be held so that the
 * cgroup root can't be destroyed.
 */
struct cgroup *task_cgroup_from_root(struct task_struct *task,
                                     struct cgroup_root *root)
{
        /*
         * The task itself needn't be locked: with css_set_lock held it
         * can't change groups.
         */
        struct css_set *task_cset = task_css_set(task);

        return cset_cgroup_from_root(task_cset, root);
}

/*
 * A task must hold cgroup_mutex to modify cgroups.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding cgroup_mutex can't rely on the count
 * field not changing.  However, if the count goes to zero, then only
 * cgroup_attach_task() can increment it again.  Because a count of zero
 * means that no tasks are currently attached, therefore there is no
 * way a task attached to that cgroup can fork (the other way to
 * increment the count).  So code holding cgroup_mutex can safely
 * assume that if the count is zero, it will stay zero. Similarly, if
 * a task holds cgroup_mutex on a cgroup with zero count, it
 * knows that the cgroup won't be removed, as cgroup_rmdir()
 * needs that mutex.
 *
 * A cgroup can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cgroups is empty.  Since all
 * tasks in the system use _some_ cgroup, and since there is always at
 * least one task in the system (init, pid == 1), therefore, root cgroup
 * always has either children cgroups and/or using tasks.  So we don't
 * need a special hack to ensure that root cgroup cannot be deleted.
 *
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task()
 */

/*
 * Forward declaration: cgroup_setup_root() takes the address of this ops
 * table.  Its initialiser is not part of this section of the file.
 */
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;

/*
 * Build the file name of control file @cft in @cgrp into @buf, which must
 * hold at least CGROUP_FILE_NAME_MAX bytes, and return @buf.
 *
 * The name is "<subsys>.<cft->name>", preceded by ".__DEBUG__." for
 * %CFTYPE_DEBUG files.  The bare cft->name is used instead when the file has
 * no subsystem, is flagged %CFTYPE_NO_PREFIX, or the hierarchy has
 * %CGRP_ROOT_NOPREFIX set.
 */
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
                              char *buf)
{
        struct cgroup_subsys *ss = cft->ss;
        const char *debug_tag;
        const char *ss_name;

        if (!cft->ss || (cft->flags & CFTYPE_NO_PREFIX) ||
            (cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
                strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
                return buf;
        }

        debug_tag = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
        /* the default hierarchy uses ->name, other hierarchies ->legacy_name */
        ss_name = cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name;

        snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
                 debug_tag, ss_name, cft->name);
        return buf;
}

/**
 * cgroup_file_mode - deduce file mode of a control file
 * @cft: the control file in question
 *
 * S_IRUGO for read, S_IWUSR for write.
 */
static umode_t cgroup_file_mode(const struct cftype *cft)
{
        umode_t mode = 0;

        if (cft->read_u64 || cft->read_s64 || cft->seq_show)
                mode |= S_IRUGO;

        if (cft->write_u64 || cft->write_s64 || cft->write) {
                if (cft->flags & CFTYPE_WORLD_WRITABLE)
                        mode |= S_IWUGO;
                else
                        mode |= S_IWUSR;
        }

        return mode;
}

/**
 * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
 * @subtree_control: the new subtree_control mask to consider
 * @this_ss_mask: available subsystems
 *
 * On the default hierarchy, a subsystem may pull in other subsystems through
 * its ->depends_on mask, so more subsystems than were written to
 * "cgroup.subtree_control" may end up enabled.
 *
 * Starting from @subtree_control plus the implicit controllers, repeatedly
 * add the dependencies of every subsystem in the mask and clamp the result
 * to @this_ss_mask until the mask no longer changes.  Returns that mask.
 */
static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
        u16 ss_mask = subtree_control;
        u16 prev_ss_mask;
        struct cgroup_subsys *ss;
        int ssid;

        lockdep_assert_held(&cgroup_mutex);

        ss_mask |= cgrp_dfl_implicit_ss_mask;

        do {
                prev_ss_mask = ss_mask;

                do_each_subsys_mask(ss, ssid, prev_ss_mask) {
                        ss_mask |= ss->depends_on;
                } while_each_subsys_mask();

                /*
                 * Drop subsystems which aren't available.  That only
                 * happens if some depended-upon subsystems were bound
                 * to non-default hierarchies.
                 */
                ss_mask &= this_ss_mask;
        } while (ss_mask != prev_ss_mask);

        return ss_mask;
}

/**
 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 *
 * Counterpart of cgroup_kn_lock_live(); call it before the method finishes
 * if locking succeeded.  Once this function has returned, the cgroup that
 * cgroup_kn_lock_live() returned may become inaccessible at any time, so a
 * caller that wants to keep using the cgroup has to pin it beforehand.
 */
void cgroup_kn_unlock(struct kernfs_node *kn)
{
        /* a directory node is the cgroup itself, a file belongs to its parent */
        struct cgroup *cgrp = (kernfs_type(kn) == KERNFS_DIR) ?
                              kn->priv : kn->parent->priv;

        cgroup_unlock();

        kernfs_unbreak_active_protection(kn);
        cgroup_put(cgrp);
}

/**
 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 * @drain_offline: perform offline draining on the cgroup
 *
 * For use by a cgroup kernfs method that is currently servicing @kn.  The
 * helper pins the associated cgroup, breaks the kernfs active protection,
 * takes the cgroup lock and then checks that the cgroup is still alive.
 * Returns the cgroup if it is alive; otherwise, %NULL, with everything
 * undone again.  A successful return must be paired with
 * cgroup_kn_unlock().  If @drain_offline is %true, the cgroup is drained of
 * offlining csses before return.
 *
 * Any cgroup kernfs method implementation which requires locking the
 * associated cgroup should use this helper.  It avoids nesting cgroup
 * locking under kernfs active protection and allows all kernfs operations
 * including self-removal.
 */
struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
{
        /* a directory node is the cgroup itself, a file belongs to its parent */
        struct cgroup *cgrp = (kernfs_type(kn) == KERNFS_DIR) ?
                              kn->priv : kn->parent->priv;

        /*
         * cgroup_mutex, which we're about to take, nests outside kernfs
         * active_ref.  The cgroup liveliness check alone is sufficient
         * protection against removal, so pin @cgrp to keep it accessible
         * and break the active_ref protection.
         */
        if (!cgroup_tryget(cgrp))
                return NULL;
        kernfs_break_active_protection(kn);

        if (drain_offline)
                cgroup_lock_and_drain_offline(cgrp);
        else
                cgroup_lock();

        if (cgroup_is_dead(cgrp)) {
                /* too late: undo the lock, the protection break and the pin */
                cgroup_kn_unlock(kn);
                return NULL;
        }

        return cgrp;
}

/*
 * Remove the kernfs file of control file @cft from @cgrp's directory.  If
 * @cft has a cgroup_file embedded in its css (non-zero ->file_offset), that
 * cgroup_file is detached from the kernfs node and its notify timer is
 * stopped first.  Requires cgroup_mutex.
 */
static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
        char fname[CGROUP_FILE_NAME_MAX];

        lockdep_assert_held(&cgroup_mutex);

        if (cft->file_offset) {
                struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
                /* the cgroup_file lives ->file_offset bytes into the css */
                struct cgroup_file *cfile = (void *)css + cft->file_offset;

                spin_lock_irq(&cgroup_file_kn_lock);
                cfile->kn = NULL;
                spin_unlock_irq(&cgroup_file_kn_lock);

                del_timer_sync(&cfile->notify_timer);
        }

        cgroup_file_name(cgrp, cft, fname);
        kernfs_remove_by_name(cgrp->kn, fname);
}

/**
 * css_clear_dir - remove subsys files in a cgroup directory
 * @css: target css
 *
 * Does nothing unless @css is marked %CSS_VISIBLE.  Otherwise the flag is
 * cleared and the files of @css are removed: all cftype sets of the
 * subsystem for a subsystem css, or the base files (plus the psi files on
 * the default hierarchy when psi is enabled) for a cgroup's self css.
 */
static void css_clear_dir(struct cgroup_subsys_state *css)
{
        struct cgroup *cgrp = css->cgroup;
        struct cftype *cfts;

        if (!(css->flags & CSS_VISIBLE))
                return;

        css->flags &= ~CSS_VISIBLE;

        /* subsystem css: drop every registered cftype set */
        if (css->ss) {
                list_for_each_entry(cfts, &css->ss->cfts, node)
                        cgroup_addrm_files(css, cgrp, cfts, false);
                return;
        }

        /* self css outside the default hierarchy */
        if (!cgroup_on_dfl(cgrp)) {
                cgroup_addrm_files(css, cgrp, cgroup1_base_files, false);
                return;
        }

        /* self css on the default hierarchy */
        cgroup_addrm_files(css, cgrp, cgroup_base_files, false);
        if (cgroup_psi_enabled())
                cgroup_addrm_files(css, cgrp, cgroup_psi_files, false);
}

/**
 * css_populate_dir - create subsys files in a cgroup directory
 * @css: target css
 *
 * On failure, no file is added.
 */
static int css_populate_dir(struct cgroup_subsys_state *css)
{
        struct cgroup *cgrp = css->cgroup;
        struct cftype *cfts, *failed_cfts;
        int ret;

        if (css->flags & CSS_VISIBLE)
                return 0;

        if (!css->ss) {
                if (cgroup_on_dfl(cgrp)) {
                        ret = cgroup_addrm_files(css, cgrp,
                                                 cgroup_base_files, true);
                        if (ret < 0)
                                return ret;

                        if (cgroup_psi_enabled()) {
                                ret = cgroup_addrm_files(css, cgrp,
                                                         cgroup_psi_files, true);
                                if (ret < 0)
                                        return ret;
                        }
                } else {
                        ret = cgroup_addrm_files(css, cgrp,
                                                 cgroup1_base_files, true);
                        if (ret < 0)
                                return ret;
                }
        } else {
                list_for_each_entry(cfts, &css->ss->cfts, node) {
                        ret = cgroup_addrm_files(css, cgrp, cfts, true);
                        if (ret < 0) {
                                failed_cfts = cfts;
                                goto err;
                        }
                }
        }

        css->flags |= CSS_VISIBLE;

        return 0;
err:
        list_for_each_entry(cfts, &css->ss->cfts, node) {
                if (cfts == failed_cfts)
                        break;
                cgroup_addrm_files(css, cgrp, cfts, false);
        }
        return ret;
}

/*
 * rebind_subsystems - move the subsystems in @ss_mask to hierarchy @dst_root
 *
 * A first pass validates the request and returns -EBUSY, without having
 * changed anything, if a non-implicit subsystem still has child csses below
 * its current root or if the move would be between two non-default
 * hierarchies.  The subsystems are then disabled on their source hierarchy,
 * their root css is re-pointed at @dst_root's root cgroup and they are
 * enabled on the destination.
 *
 * Returns 0 once the validation pass has succeeded; a failure to apply
 * control on the destination is only reported with pr_warn().  Must be
 * called with cgroup_mutex held.
 */
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
        struct cgroup *dcgrp = &dst_root->cgrp;
        struct cgroup_subsys *ss;
        int ssid, ret;
        u16 dfl_disable_ss_mask = 0;

        lockdep_assert_held(&cgroup_mutex);

        /* pass 1: validate, and note which subsystems leave the dfl hierarchy */
        do_each_subsys_mask(ss, ssid, ss_mask) {
                /*
                 * If @ss has non-root csses attached to it, can't move.
                 * If @ss is an implicit controller, it is exempt from this
                 * rule and can be stolen.
                 */
                if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
                    !ss->implicit_on_dfl)
                        return -EBUSY;

                /* can't move between two non-dummy roots either */
                if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
                        return -EBUSY;

                /*
                 * Collect ssid's that need to be disabled from default
                 * hierarchy.
                 */
                if (ss->root == &cgrp_dfl_root)
                        dfl_disable_ss_mask |= 1 << ssid;

        } while_each_subsys_mask();

        if (dfl_disable_ss_mask) {
                struct cgroup *scgrp = &cgrp_dfl_root.cgrp;

                /*
                 * Controllers from default hierarchy that need to be rebound
                 * are all disabled together in one go.
                 */
                cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
                WARN_ON(cgroup_apply_control(scgrp));
                cgroup_finalize_control(scgrp, 0);
        }

        /* pass 2: move each subsystem from its source root to @dst_root */
        do_each_subsys_mask(ss, ssid, ss_mask) {
                struct cgroup_root *src_root = ss->root;
                struct cgroup *scgrp = &src_root->cgrp;
                struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
                struct css_set *cset, *cset_pos;
                struct css_task_iter *it;

                /* the source must have the css and the destination must not */
                WARN_ON(!css || cgroup_css(dcgrp, ss));

                /* default-hierarchy sources were already disabled above */
                if (src_root != &cgrp_dfl_root) {
                        /* disable from the source */
                        src_root->subsys_mask &= ~(1 << ssid);
                        WARN_ON(cgroup_apply_control(scgrp));
                        cgroup_finalize_control(scgrp, 0);
                }

                /* rebind */
                RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
                rcu_assign_pointer(dcgrp->subsys[ssid], css);
                ss->root = dst_root;
                css->cgroup = dcgrp;

                spin_lock_irq(&css_set_lock);
                WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
                list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
                                         e_cset_node[ss->id]) {
                        list_move_tail(&cset->e_cset_node[ss->id],
                                       &dcgrp->e_csets[ss->id]);
                        /*
                         * All css_sets of scgrp are moved to dcgrp together
                         * and in the same order, so in-flight iterators only
                         * need patching to keep iterating correctly.  Since
                         * an iterator is always advanced right away and is
                         * finished when it->cset_pos meets it->cset_head,
                         * updating it->cset_head alone is enough here.
                         */
                        list_for_each_entry(it, &cset->task_iters, iters_node)
                                if (it->cset_head == &scgrp->e_csets[ss->id])
                                        it->cset_head = &dcgrp->e_csets[ss->id];
                }
                spin_unlock_irq(&css_set_lock);

                /*
                 * Subsystems with a css_rstat_flush callback have their css
                 * moved to the destination's rstat_css_list; synchronize_rcu()
                 * separates the removal from the re-insertion.
                 */
                if (ss->css_rstat_flush) {
                        list_del_rcu(&css->rstat_css_node);
                        synchronize_rcu();
                        list_add_rcu(&css->rstat_css_node,
                                     &dcgrp->rstat_css_list);
                }

                /* default hierarchy doesn't enable controllers by default */
                dst_root->subsys_mask |= 1 << ssid;
                if (dst_root == &cgrp_dfl_root) {
                        static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
                } else {
                        dcgrp->subtree_control |= 1 << ssid;
                        static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
                }

                /* a failure here is reported but does not abort the rebind */
                ret = cgroup_apply_control(dcgrp);
                if (ret)
                        pr_warn("partial failure to rebind %s controller (err=%d)\n",
                                ss->name, ret);

                /* optional subsystem callback, invoked after the move */
                if (ss->bind)
                        ss->bind(css);
        } while_each_subsys_mask();

        kernfs_activate(dcgrp->kn);
        return 0;
}

/*
 * Emit into @sf the path of @kf_node relative to the cgroup that current's
 * cgroup namespace is rooted at on @kf_root's hierarchy.  Space, tab,
 * newline and backslash are escaped.
 *
 * Returns 0 when a path was written, -ENOMEM if the path buffer can't be
 * allocated, -ERANGE if kernfs_path_from_node() reports -E2BIG, and any
 * other result of kernfs_path_from_node() that is not positive as is.
 */
int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
                     struct kernfs_root *kf_root)
{
        struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
        struct cgroup *ns_cgroup;
        char *path;
        int ret;

        path = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!path)
                return -ENOMEM;

        spin_lock_irq(&css_set_lock);
        ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
        ret = kernfs_path_from_node(kf_node, ns_cgroup->kn, path, PATH_MAX);
        spin_unlock_irq(&css_set_lock);

        if (ret == -E2BIG) {
                ret = -ERANGE;
        } else if (ret > 0) {
                seq_escape(sf, path, " \t\n\\");
                ret = 0;
        }

        kfree(path);
        return ret;
}

/*
 * Identifiers of the cgroup2 mount options.  They are the option values
 * used in cgroup2_fs_parameters[] and dispatched on in
 * cgroup2_parse_param().
 */
enum cgroup2_param {
        Opt_nsdelegate,
        Opt_favordynmods,
        Opt_memory_localevents,
        Opt_memory_recursiveprot,
        Opt_memory_hugetlb_accounting,
        nr__cgroup2_params      /* count of the options above; keep last */
};

/*
 * Mount option table handed to fs_parse() by cgroup2_parse_param().  Every
 * option is a plain flag; the empty entry terminates the table.
 */
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
        fsparam_flag("nsdelegate",                Opt_nsdelegate),
        fsparam_flag("favordynmods",                Opt_favordynmods),
        fsparam_flag("memory_localevents",        Opt_memory_localevents),
        fsparam_flag("memory_recursiveprot",        Opt_memory_recursiveprot),
        fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
        {}
};

/*
 * Parse one cgroup2 mount parameter and set the matching CGRP_ROOT_* flag
 * in the cgroup fs context.  Returns 0 on success, the negative error from
 * fs_parse(), or -EINVAL for an option value that isn't handled here.
 */
static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
        struct fs_parse_result result;
        unsigned int root_flag;
        int opt;

        opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
        if (opt < 0)
                return opt;

        switch (opt) {
        case Opt_nsdelegate:
                root_flag = CGRP_ROOT_NS_DELEGATE;
                break;
        case Opt_favordynmods:
                root_flag = CGRP_ROOT_FAVOR_DYNMODS;
                break;
        case Opt_memory_localevents:
                root_flag = CGRP_ROOT_MEMORY_LOCAL_EVENTS;
                break;
        case Opt_memory_recursiveprot:
                root_flag = CGRP_ROOT_MEMORY_RECURSIVE_PROT;
                break;
        case Opt_memory_hugetlb_accounting:
                root_flag = CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
                break;
        default:
                return -EINVAL;
        }

        ctx->flags |= root_flag;
        return 0;
}

/*
 * Make the flags of the default hierarchy match @root_flags for the
 * nsdelegate, favordynmods and memory_* options: each one is set when
 * present in @root_flags and cleared otherwise.  Nothing happens unless
 * current is in the init cgroup namespace.
 */
static void apply_cgroup_root_flags(unsigned int root_flags)
{
        if (current->nsproxy->cgroup_ns != &init_cgroup_ns)
                return;

        if (root_flags & CGRP_ROOT_NS_DELEGATE)
                cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
        else
                cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;

        /* this flag is only ever changed through its helper */
        cgroup_favor_dynmods(&cgrp_dfl_root,
                             root_flags & CGRP_ROOT_FAVOR_DYNMODS);

        if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
                cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
        else
                cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;

        if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
                cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
        else
                cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;

        if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
                cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
        else
                cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
}

static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
{
        if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
                seq_puts(seq, ",nsdelegate");
        if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)
                seq_puts(seq, ",favordynmods");
        if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
                seq_puts(seq, ",memory_localevents");
        if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
                seq_puts(seq, ",memory_recursiveprot");
        if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
                seq_puts(seq, ",memory_hugetlb_accounting");
        return 0;
}

static int cgroup_reconfigure(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);

        apply_cgroup_root_flags(ctx->flags);
        return 0;
}

/*
 * Initialise the bookkeeping members of @cgrp: its self css, list heads,
 * the pidlist mutex, default limits, the prev_cputime state, the offline
 * waitqueue and the release-agent work item.
 */
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
        struct cgroup_subsys *ss;
        int ssid;

        /* the cgroup's own css points back at it and starts out online */
        cgrp->self.cgroup = cgrp;
        cgrp->self.flags |= CSS_ONLINE;
        INIT_LIST_HEAD(&cgrp->self.sibling);
        INIT_LIST_HEAD(&cgrp->self.children);

        /* a cgroup is its own domain cgroup to begin with */
        cgrp->dom_cgrp = cgrp;

        /* limits start out at INT_MAX */
        cgrp->max_descendants = INT_MAX;
        cgrp->max_depth = INT_MAX;

        /* list heads */
        INIT_LIST_HEAD(&cgrp->cset_links);
        INIT_LIST_HEAD(&cgrp->pidlists);
        INIT_LIST_HEAD(&cgrp->rstat_css_list);
        for_each_subsys(ss, ssid)
                INIT_LIST_HEAD(&cgrp->e_csets[ssid]);

        /* locks, accounting state, waitqueue and deferred work */
        mutex_init(&cgrp->pidlist_mutex);
        prev_cputime_init(&cgrp->prev_cputime);
        init_waitqueue_head(&cgrp->offline_waitq);
        INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}

/*
 * Initialise the cgroup_root referenced by @ctx: its root list node, the
 * cgroup count (starting at 1), the embedded root cgroup, and the flags,
 * release agent path, name and clone_children setting taken from @ctx.
 */
void init_cgroup_root(struct cgroup_fs_context *ctx)
{
        struct cgroup_root *root = ctx->root;
        struct cgroup *top_cgrp = &root->cgrp;

        INIT_LIST_HEAD_RCU(&root->root_list);
        atomic_set(&root->nr_cgrps, 1);

        top_cgrp->root = root;
        init_cgroup_housekeeping(top_cgrp);

        /* DYNMODS must be modified through cgroup_favor_dynmods() */
        root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;

        if (ctx->release_agent)
                strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);

        if (ctx->name)
                strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);

        if (ctx->cpuset_clone_children)
                set_bit(CGRP_CPUSET_CLONE_CHILDREN, &top_cgrp->flags);
}

/*
 * cgroup_setup_root - bring a cgroup hierarchy root online
 * @root: root to set up
 * @ss_mask: subsystem mask handed to rebind_subsystems()
 *
 * Must be called with cgroup_mutex held.  Initialises the root cgroup's
 * refcount, preallocates css_set links, assigns a root id, creates and
 * populates the kernfs root, initialises rstat, rebinds the requested
 * subsystems and finally publishes the root on cgroup_roots and links it
 * into every existing css_set.
 *
 * Returns 0 on success or the error of the step that failed.  On failure
 * the steps already completed are undone in reverse order via the labels
 * at the bottom of the function.
 */
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
        LIST_HEAD(tmp_links);
        struct cgroup *root_cgrp = &root->cgrp;
        struct kernfs_syscall_ops *kf_sops;
        struct css_set *cset;
        int i, ret;

        lockdep_assert_held(&cgroup_mutex);

        ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
                              0, GFP_KERNEL);
        if (ret)
                goto out;

        /*
         * We're accessing css_set_count without locking css_set_lock here,
         * but that's OK - it can only be increased by someone holding
         * cgroup_lock, and that's us.  Later rebinding may disable
         * controllers on the default hierarchy and thus create new csets,
         * which can't be more than the existing ones.  Allocate 2x.
         */
        ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
        if (ret)
                goto cancel_ref;

        ret = cgroup_init_root_id(root);
        if (ret)
                goto cancel_ref;

        /* the default hierarchy and the other roots use different kernfs ops */
        kf_sops = root == &cgrp_dfl_root ?
                &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;

        root->kf_root = kernfs_create_root(kf_sops,
                                           KERNFS_ROOT_CREATE_DEACTIVATED |
                                           KERNFS_ROOT_SUPPORT_EXPORTOP |
                                           KERNFS_ROOT_SUPPORT_USER_XATTR,
                                           root_cgrp);
        if (IS_ERR(root->kf_root)) {
                ret = PTR_ERR(root->kf_root);
                goto exit_root_id;
        }
        root_cgrp->kn = kernfs_root_to_node(root->kf_root);
        WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
        /* the root cgroup is its own level-0 ancestor */
        root_cgrp->ancestors[0] = root_cgrp;

        ret = css_populate_dir(&root_cgrp->self);
        if (ret)
                goto destroy_root;

        ret = cgroup_rstat_init(root_cgrp);
        if (ret)
                goto destroy_root;

        ret = rebind_subsystems(root, ss_mask);
        if (ret)
                goto exit_stats;

        /* a failure here only triggers a warning; setup continues */
        ret = cgroup_bpf_inherit(root_cgrp);
        WARN_ON_ONCE(ret);

        trace_cgroup_setup_root(root);

        /*
         * There must be no failure case after here, since rebinding takes
         * care of subsystems' refcounts, which are explicitly dropped in
         * the failure exit path.
         */
        list_add_rcu(&root->root_list, &cgroup_roots);
        cgroup_root_count++;

        /*
         * Link the root cgroup in this hierarchy into all the css_set
         * objects.
         */
        spin_lock_irq(&css_set_lock);
        hash_for_each(css_set_table, i, cset, hlist) {
                link_css_set(&tmp_links, cset, root_cgrp);
                if (css_set_populated(cset))
                        cgroup_update_populated(root_cgrp, true);
        }
        spin_unlock_irq(&css_set_lock);

        /* a freshly set up root has no children and counts exactly one cgroup */
        BUG_ON(!list_empty(&root_cgrp->self.children));
        BUG_ON(atomic_read(&root->nr_cgrps) != 1);

        ret = 0;
        goto out;

        /* error unwinding: each label undoes one setup step and falls through */
exit_stats:
        cgroup_rstat_exit(root_cgrp);
destroy_root:
        kernfs_destroy_root(root->kf_root);
        root->kf_root = NULL;
exit_root_id:
        cgroup_exit_root_id(root);
cancel_ref:
        percpu_ref_exit(&root_cgrp->self.refcnt);
out:
        /* reached on success too: releases whatever is still on tmp_links */
        free_cgrp_cset_links(&tmp_links);
        return ret;
}

/*
 * Obtain the superblock and root dentry for the cgroup mount described by
 * @fc.  ctx->root must already point at the hierarchy being mounted, since
 * its kernfs root is used right away.
 *
 * Returns the result of kernfs_get_tree(), or the error from
 * kernfs_node_dentry() if the namespace-relative dentry cannot be obtained.
 */
int cgroup_do_get_tree(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
        int ret;

        ctx->kfc.root = ctx->root->kf_root;
        /* cgroup2 and the other cgroup filesystem types use distinct magic numbers */
        if (fc->fs_type == &cgroup2_fs_type)
                ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
        else
                ctx->kfc.magic = CGROUP_SUPER_MAGIC;
        ret = kernfs_get_tree(fc);

        /*
         * In non-init cgroup namespace, instead of root cgroup's dentry,
         * we return the dentry corresponding to the cgroupns->root_cgrp.
         */
        if (!ret && ctx->ns != &init_cgroup_ns) {
                struct dentry *nsdentry;
                struct super_block *sb = fc->root->d_sb;
                struct cgroup *cgrp;

                /* both locks are held only for the css_set -> cgroup lookup */
                cgroup_lock();
                spin_lock_irq(&css_set_lock);

                cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);

                spin_unlock_irq(&css_set_lock);
                cgroup_unlock();

                nsdentry = kernfs_node_dentry(cgrp->kn, sb);
                /* the root dentry from kernfs_get_tree() is replaced below */
                dput(fc->root);
                if (IS_ERR(nsdentry)) {
                        /* lookup failed: give the superblock back and report the error */
                        deactivate_locked_super(sb);
                        ret = PTR_ERR(nsdentry);
                        nsdentry = NULL;
                }
                fc->root = nsdentry;
        }

        /*
         * No new superblock was created: drop a reference on the root
         * cgroup.  NOTE(review): presumably the one the ->get_tree()
         * caller took before calling in (cgroup_get_tree() uses
         * cgroup_get_live()) - confirm against the cgroup1 caller.
         */
        if (!ctx->kfc.new_sb_created)
                cgroup_put(&ctx->root->cgrp);

        return ret;
}

/*
 * Destroy a cgroup filesystem context: drop the cgroup namespace
 * reference, free the owned strings, tear down the kernfs part and
 * finally free the context itself.
 */
static void cgroup_fs_context_free(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);

        put_cgroup_ns(ctx->ns);
        kfree(ctx->release_agent);
        kfree(ctx->name);

        /*
         * fc->fs_private points at ctx->kfc, which lives inside @ctx, so
         * the kernfs teardown runs before @ctx is freed.
         */
        kernfs_free_fs_context(fc);
        kfree(ctx);
}

/*
 * ->get_tree() callback installed in cgroup_fs_context_ops: mounts the
 * default hierarchy (cgrp_dfl_root).  Returns 0 or the error from
 * cgroup_do_get_tree().
 */
static int cgroup_get_tree(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
        int err;

        WRITE_ONCE(cgrp_dfl_visible, true);
        cgroup_get_live(&cgrp_dfl_root.cgrp);
        ctx->root = &cgrp_dfl_root;

        err = cgroup_do_get_tree(fc);
        if (err)
                return err;

        /* root flags are applied only once the tree has been obtained */
        apply_cgroup_root_flags(ctx->flags);
        return 0;
}

/* fs_context callbacks selected when the context is for cgroup2_fs_type */
static const struct fs_context_operations cgroup_fs_context_ops = {
        .parse_param    = cgroup2_parse_param,
        .get_tree       = cgroup_get_tree,
        .reconfigure    = cgroup_reconfigure,
        .free           = cgroup_fs_context_free,
};

/* fs_context callbacks selected for any type other than cgroup2_fs_type */
static const struct fs_context_operations cgroup1_fs_context_ops = {
        .parse_param    = cgroup1_parse_param,
        .get_tree       = cgroup1_get_tree,
        .reconfigure    = cgroup1_reconfigure,
        .free           = cgroup_fs_context_free,
};

/*
 * Allocate and attach the cgroup-specific part of a filesystem
 * creation/reconfiguration context.  The context pins the calling task's
 * cgroup namespace, and @fc adopts that namespace's user namespace.
 *
 * Returns 0 on success or -ENOMEM if the context cannot be allocated.
 */
static int cgroup_init_fs_context(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return -ENOMEM;

        /* select the namespace we're going to use and hold on to it */
        ctx->ns = current->nsproxy->cgroup_ns;
        get_cgroup_ns(ctx->ns);

        fc->fs_private = &ctx->kfc;
        fc->ops = (fc->fs_type == &cgroup2_fs_type) ?
                &cgroup_fs_context_ops : &cgroup1_fs_context_ops;

        /* swap @fc's user namespace for the cgroup namespace's one */
        put_user_ns(fc->user_ns);
        fc->user_ns = get_user_ns(ctx->ns->user_ns);
        fc->global = true;

        if (have_favordynmods)
                ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;

        return 0;
}

static void cgroup_kill_sb(struct super_block *sb)
{
        struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
        struct cgroup_root *root = cgroup_root_from_kf(kf_root);

        /*
         * If @root doesn't have any children, start killing it.
         * This prevents new mounts by disabling percpu_ref_tryget_live().
         *
         * And don't kill the default root.
         */
        if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
            !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
                cgroup_bpf_offline(&root->cgrp);
                percpu_ref_kill(&root->cgrp.self.refcnt);
        }
        cgroup_put(&root->cgrp);
        kernfs_kill_sb(sb);
}

/* the "cgroup" filesystem type; uses the cgroup1 parameter table */
struct file_system_type cgroup_fs_type = {
        .name                   = "cgroup",
        .fs_flags               = FS_USERNS_MOUNT,
        .parameters             = cgroup1_fs_parameters,
        .init_fs_context        = cgroup_init_fs_context,
        .kill_sb                = cgroup_kill_sb,
};

/* the "cgroup2" filesystem type; uses the cgroup2 parameter table */
static struct file_system_type cgroup2_fs_type = {
        .name                   = "cgroup2",
        .fs_flags               = FS_USERNS_MOUNT,
        .parameters             = cgroup2_fs_parameters,
        .init_fs_context        = cgroup_init_fs_context,
        .kill_sb                = cgroup_kill_sb,
};

#ifdef CONFIG_CPUSETS
/* fs_context callbacks for the legacy "cpuset" type; only get_tree and free */
static const struct fs_context_operations cpuset_fs_context_ops = {
        .free           = cgroup_fs_context_free,
        .get_tree       = cgroup1_get_tree,
};

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users.  If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead, preconfigured with the
 * cpuset subsystem, the NOPREFIX flag and a default release agent.
 *
 * Returns 0 on success or the error from cgroup_init_fs_context().
 */
static int cpuset_init_fs_context(struct fs_context *fc)
{
        /*
         * NOTE(review): the kstrdup() result is not checked; on allocation
         * failure ctx->release_agent is simply left NULL.
         */
        char *agent_path = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
        struct cgroup_fs_context *ctx;
        int ret;

        ret = cgroup_init_fs_context(fc);
        if (ret)
                goto free_agent;

        fc->ops = &cpuset_fs_context_ops;

        ctx = cgroup_fc2context(fc);
        ctx->subsys_mask = 1 << cpuset_cgrp_id;
        ctx->flags |= CGRP_ROOT_NOPREFIX;
        /* the context takes ownership of the string from here on */
        ctx->release_agent = agent_path;

        /* retarget the context at the "cgroup" filesystem type */
        get_filesystem(&cgroup_fs_type);
        put_filesystem(fc->fs_type);
        fc->fs_type = &cgroup_fs_type;

        return 0;

free_agent:
        kfree(agent_path);
        return ret;
}

/* legacy "cpuset" filesystem type; its init hook redirects to "cgroup" */
static struct file_system_type cpuset_fs_type = {
        .name                   = "cpuset",
        .fs_flags               = FS_USERNS_MOUNT,
        .init_fs_context        = cpuset_init_fs_context,
};
#endif

/*
 * Write into @buf the path of @cgrp relative to the cgroup that @ns's root
 * css_set maps to in @cgrp's hierarchy.  Takes no locks itself;
 * cgroup_path_ns() wraps it with cgroup_lock() and css_set_lock.
 *
 * Returns whatever kernfs_path_from_node() returns.
 */
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
                          struct cgroup_namespace *ns)
{
        struct cgroup *ns_root_cgrp;

        ns_root_cgrp = cset_cgroup_from_root(ns->root_cset, cgrp->root);

        return kernfs_path_from_node(cgrp->kn, ns_root_cgrp->kn, buf, buflen);
}

/*
 * Locking wrapper around cgroup_path_ns_locked(): takes cgroup_lock() and
 * css_set_lock around the call and returns its result unchanged.
 */
int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
                   struct cgroup_namespace *ns)
{
        int path_ret;

        cgroup_lock();
        spin_lock_irq(&css_set_lock);
        path_ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
        spin_unlock_irq(&css_set_lock);
        cgroup_unlock();

        return path_ret;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);

/**
 * cgroup_attach_lock - Lock for ->attach()
 * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
 *
 * cgroup migration sometimes needs to stabilize threadgroups against forks and
 * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
 * implementations (e.g. cpuset), also need to disable CPU hotplug.
 * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
 * lead to deadlocks.
 *
 * Bringing up a CPU may involve creating and destroying tasks which requires
 * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
 * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
 * write-locking threadgroup_rwsem, the locking order is reversed and we end up
 * waiting for an on-going CPU hotplug operation which in turn is waiting for
 * the threadgroup_rwsem to be released to create new tasks. For more details:
 *
 *   http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
 *
 * Resolve the situation by always acquiring cpus_read_lock() before optionally
 * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
 * CPU hotplug is disabled on entry.
 */
void cgroup_attach_lock(bool lock_threadgroup)
{
        /* the CPU hotplug read lock is taken unconditionally, and first */
        cpus_read_lock();

        if (!lock_threadgroup)
                return;

        percpu_down_write(&cgroup_threadgroup_rwsem);
}

/**
 * cgroup_attach_unlock - Undo cgroup_attach_lock()
 * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
 *
 * Releases the locks in the reverse order of cgroup_attach_lock().
 * @lock_threadgroup must match the value passed when locking.
 */
void cgroup_attach_unlock(bool lock_threadgroup)
{
        if (lock_threadgroup) {
                /* innermost lock goes first */
                percpu_up_write(&cgroup_threadgroup_rwsem);
        }

        cpus_read_unlock();
}

/**
 * cgroup_migrate_add_task - add a migration target task to a migration context
 * @task: target task
 * @mgctx: target migration context
 *
 * Add @task, which is a migration target, to @mgctx->tset.  This function
 * becomes noop if @task doesn't need to be migrated.  @task's css_set
 * should have been added as a migration source and @task->cg_list will be
 * moved from the css_set's tasks list to mg_tasks one.
 */
static void cgroup_migrate_add_task(struct task_struct *task,
                                    struct cgroup_mgctx *mgctx)
{
        struct css_set *cset;

        lockdep_assert_held(&css_set_lock);

        /* @task either already exited or can't exit until the end */
        if (task->flags & PF_EXITING)
                return;

        /* cgroup_threadgroup_rwsem protects racing against forks */
        WARN_ON_ONCE(list_empty(&task->cg_list));

        cset = task_css_set(task);
        if (!cset->mg_src_cgrp)
                return;

        mgctx->tset.nr_tasks++;

        list_move_tail(&task->cg_list, &cset->mg_tasks);
        if (list_empty(&cset->mg_node))
                list_add_tail(&cset->mg_node,
                              &mgctx->tset.src_csets);
        if (list_empty(&cset->mg_dst_cset->mg_node))
                list_add_tail(&cset->mg_dst_cset->mg_node,
                              &mgctx->tset.dst_csets);
}

/**
 * cgroup_taskset_first - reset taskset and return the first task
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * @tset iteration is initialized and the first task is returned.
 */
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
                                         struct cgroup_subsys_state **dst_cssp)
{
        /* rewind: no current task, current css_set is the first on the list */
        tset->cur_task = NULL;
        tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);

        /* the iterator proper does the rest */
        return cgroup_taskset_next(tset, dst_cssp);
}

/**
 * cgroup_taskset_next - iterate to the next task in taskset
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * Return the next task in @tset.  Iteration must have been initialized
 * with cgroup_taskset_first().
 */
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
                                        struct cgroup_subsys_state **dst_cssp)
{
        struct css_set *cur = tset->cur_cset;
        struct task_struct *pos = tset->cur_task;

        /* walk the css_sets until the list head comes around again */
        for (; CGROUP_HAS_SUBSYS_CONFIG && &cur->mg_node != tset->csets;
             cur = list_next_entry(cur, mg_node), pos = NULL) {
                /* first task of a fresh css_set, else successor of the last */
                pos = pos ? list_next_entry(pos, cg_list) :
                            list_first_entry(&cur->mg_tasks,
                                             struct task_struct, cg_list);

                /* ran off the end of this css_set's tasks: try the next one */
                if (&pos->cg_list == &cur->mg_tasks)
                        continue;

                tset->cur_cset = cur;
                tset->cur_task = pos;

                /*
                 * This function may be called both before and after
                 * cgroup_migrate_execute().  The two cases can be
                 * distinguished by looking at whether @cur has its
                 * ->mg_dst_cset set.
                 */
                *dst_cssp = cur->mg_dst_cset ?
                        cur->mg_dst_cset->subsys[tset->ssid] :
                        cur->subsys[tset->ssid];

                return pos;
        }

        return NULL;
}

/**
 * cgroup_migrate_execute - migrate a taskset
 * @mgctx: migration context
 *
 * Migrate tasks in @mgctx as setup by migration preparation functions.
 * This function fails iff one of the ->can_attach callbacks fails and
 * guarantees that either all or none of the tasks in @mgctx are migrated.
 * @mgctx is consumed regardless of success.
 */
static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
{
        struct cgroup_taskset *tset = &mgctx->tset;
        struct cgroup_subsys *ss;
        struct task_struct *task, *tmp_task;
        struct css_set *cset, *tmp_cset;
        int ssid, failed_ssid, ret;

        /* check that we can legitimately attach to the cgroup */
        if (tset->nr_tasks) {
                do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
                        if (ss->can_attach) {
                                /* tell the taskset iterators which subsystem is asking */
                                tset->ssid = ssid;
                                ret = ss->can_attach(tset);
                                if (ret) {
                                        /* remembered so the rollback knows where to stop */
                                        failed_ssid = ssid;
                                        goto out_cancel_attach;
                                }
                        }
                } while_each_subsys_mask();
        }

        /*
         * Now that we're guaranteed success, proceed to move all tasks to
         * the new cgroup.  There are no failure cases after here, so this
         * is the commit point.
         */
        spin_lock_irq(&css_set_lock);
        list_for_each_entry(cset, &tset->src_csets, mg_node) {
                list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
                        struct css_set *from_cset = task_css_set(task);
                        struct css_set *to_cset = cset->mg_dst_cset;

                        /* the task's new css_set gains a reference and a task count */
                        get_css_set(to_cset);
                        to_cset->nr_tasks++;
                        css_set_move_task(task, from_cset, to_cset, true);
                        from_cset->nr_tasks--;
                        /*
                         * If the source or destination cgroup is frozen,
                         * the task might require to change its state.
                         */
                        cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
                                                    to_cset->dfl_cgrp);
                        /* the old css_set loses the reference held on the task's behalf */
                        put_css_set_locked(from_cset);

                }
        }
        spin_unlock_irq(&css_set_lock);

        /*
         * Migration is committed, all target tasks are now on dst_csets.
         * Nothing is sensitive to fork() after this point.  Notify
         * controllers that migration is complete.
         */
        tset->csets = &tset->dst_csets;

        if (tset->nr_tasks) {
                do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
                        if (ss->attach) {
                                tset->ssid = ssid;
                                ss->attach(tset);
                        }
                } while_each_subsys_mask();
        }

        ret = 0;
        goto out_release_tset;

out_cancel_attach:
        /*
         * Roll back: call ->cancel_attach() on the subsystems visited
         * before the one whose ->can_attach() failed.  The failing
         * subsystem itself is not cancelled.
         */
        if (tset->nr_tasks) {
                do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
                        if (ssid == failed_ssid)
                                break;
                        if (ss->cancel_attach) {
                                tset->ssid = ssid;
                                ss->cancel_attach(tset);
                        }
                } while_each_subsys_mask();
        }
out_release_tset:
        /*
         * Common exit for success and failure: fold dst_csets into
         * src_csets, return every task still on an ->mg_tasks list to its
         * css_set's ->tasks list and unlink the css_set from the taskset.
         */
        spin_lock_irq(&css_set_lock);
        list_splice_init(&tset->dst_csets, &tset->src_csets);
        list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
                list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
                list_del_init(&cset->mg_node);
        }
        spin_unlock_irq(&css_set_lock);

        /*
         * Re-initialize the cgroup_taskset structure in case it is reused
         * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
         * iteration.
         */
        tset->nr_tasks = 0;
        tset->csets    = &tset->src_csets;
        return ret;
}

/**
 * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
 * @dst_cgrp: destination cgroup to test
 *
 * On the default hierarchy, except for the mixable, (possible) thread root
 * and threaded cgroups, subtree_control must be zero for migration
 * destination cgroups with tasks so that child cgroups don't compete
 * against tasks.
 */
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
{
        /* v1 doesn't have any restriction */
        if (!cgroup_on_dfl(dst_cgrp))
                return 0;

        /* verify @dst_cgrp can host resources */
        if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
                return -EOPNOTSUPP;

        /*
         * If @dst_cgrp is already or can become a thread root or is
         * threaded, it doesn't matter.
         */
        if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
                return 0;

        /* apply no-internal-process constraint */
        if (dst_cgrp->subtree_control)
                return -EBUSY;

        return 0;
}

/**
 * cgroup_migrate_finish - cleanup after attach
 * @mgctx: migration context
 *
 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
 * those functions for details.
 */
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
        struct css_set *pos, *next;

        lockdep_assert_held(&cgroup_mutex);

        spin_lock_irq(&css_set_lock);

        /* unlink every preloaded source, clear its migration state, unpin it */
        list_for_each_entry_safe(pos, next, &mgctx->preloaded_src_csets,
                                 mg_src_preload_node) {
                list_del_init(&pos->mg_src_preload_node);
                pos->mg_dst_cset = NULL;
                pos->mg_dst_cgrp = NULL;
                pos->mg_src_cgrp = NULL;
                put_css_set_locked(pos);
        }

        /* same treatment for the preloaded destinations */
        list_for_each_entry_safe(pos, next, &mgctx->preloaded_dst_csets,
                                 mg_dst_preload_node) {
                list_del_init(&pos->mg_dst_preload_node);
                pos->mg_dst_cset = NULL;
                pos->mg_dst_cgrp = NULL;
                pos->mg_src_cgrp = NULL;
                put_css_set_locked(pos);
        }

        spin_unlock_irq(&css_set_lock);
}

/**
 * cgroup_migrate_add_src - add a migration source css_set
 * @src_cset: the source css_set to add
 * @dst_cgrp: the destination cgroup
 * @mgctx: migration context
 *
 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
 * @src_cset and add it to @mgctx->src_csets, which should later be cleaned
 * up by cgroup_migrate_finish().
 *
 * This function may be called without holding cgroup_threadgroup_rwsem
 * even if the target is a process.  Threads may be created and destroyed
 * but as long as cgroup_mutex is not dropped, no new css_set can be put
 * into play and the preloaded css_sets are guaranteed to cover all
 * migrations.
 */
void cgroup_migrate_add_src(struct css_set *src_cset,
                            struct cgroup *dst_cgrp,
                            struct cgroup_mgctx *mgctx)
{
        struct cgroup *from_cgrp;

        lockdep_assert_held(&cgroup_mutex);
        lockdep_assert_held(&css_set_lock);

        /*
         * Two reasons to do nothing:
         *
         * - If ->dead, @src_cset is associated with one or more dead
         *   cgroups and doesn't contain any migratable tasks.  Ignore it
         *   early so that the rest of migration path doesn't get confused
         *   by it.
         *
         * - @src_cset is already on a preload list.
         */
        if (src_cset->dead || !list_empty(&src_cset->mg_src_preload_node))
                return;

        from_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);

        /* a css_set being preloaded must not carry stale migration state */
        WARN_ON(src_cset->mg_src_cgrp);
        WARN_ON(src_cset->mg_dst_cgrp);
        WARN_ON(!list_empty(&src_cset->mg_tasks));
        WARN_ON(!list_empty(&src_cset->mg_node));

        /* record both ends of the migration, then pin and queue the css_set */
        src_cset->mg_src_cgrp = from_cgrp;
        src_cset->mg_dst_cgrp = dst_cgrp;
        get_css_set(src_cset);
        list_add_tail(&src_cset->mg_src_preload_node,
                      &mgctx->preloaded_src_csets);
}

/**
 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
 * @mgctx: migration context
 *
 * Tasks are about to be moved and all the source css_sets have been
 * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
 * pins all destination css_sets, links each to its source, and appends
 * them to @mgctx->preloaded_dst_csets.  Subsystems whose css differs
 * between a source and its destination are accumulated in
 * @mgctx->ss_mask.
 *
 * This function must be called after cgroup_migrate_add_src() has been
 * called on each migration source css_set.  After migration is performed
 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
 * @mgctx.
 *
 * Return: 0 on success, -ENOMEM if a destination css_set can't be found
 * or created.
 */
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{
        struct css_set *src, *next;

        lockdep_assert_held(&cgroup_mutex);

        /* look up the dst cset for each src cset and link it to src */
        list_for_each_entry_safe(src, next, &mgctx->preloaded_src_csets,
                                 mg_src_preload_node) {
                struct cgroup_subsys *ss;
                struct css_set *dst;
                int ssid;

                dst = find_css_set(src, src->mg_dst_cgrp);
                if (!dst)
                        return -ENOMEM;

                WARN_ON_ONCE(src->mg_dst_cset || dst->mg_dst_cset);

                /*
                 * If src cset equals dst, it's noop.  Drop the src.
                 * cgroup_migrate() will skip the cset too.  Note that we
                 * can't handle src == dst as some nodes are used by both.
                 */
                if (src == dst) {
                        src->mg_src_cgrp = NULL;
                        src->mg_dst_cgrp = NULL;
                        list_del_init(&src->mg_src_preload_node);
                        put_css_set(src);
                        put_css_set(dst);
                        continue;
                }

                src->mg_dst_cset = dst;

                /*
                 * A destination is queued once.  If it is already on the
                 * preload list, drop a reference instead (presumably the
                 * one find_css_set() just returned).
                 */
                if (!list_empty(&dst->mg_dst_preload_node))
                        put_css_set(dst);
                else
                        list_add_tail(&dst->mg_dst_preload_node,
                                      &mgctx->preloaded_dst_csets);

                /* note every subsystem whose css changes in this move */
                for_each_subsys(ss, ssid) {
                        if (src->subsys[ssid] != dst->subsys[ssid])
                                mgctx->ss_mask |= 1 << ssid;
                }
        }

        return 0;
}

/**
 * cgroup_migrate - migrate a process or task to a cgroup
 * @leader: the leader of the process or the task to migrate
 * @threadgroup: whether @leader points to the whole process or a single task
 * @mgctx: migration context
 *
 * Migrate a process or task denoted by @leader.  If migrating a process,
 * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
 * responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
 *
 * As long as a controller's ->can_attach() doesn't fail, this function is
 * guaranteed to succeed.  This means that, excluding ->can_attach()
 * failure, when migrating multiple targets, the success or failure can be
 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
 * actually starting migrating.
 *
 * Return: the result of cgroup_migrate_execute().
 */
int cgroup_migrate(struct task_struct *leader, bool threadgroup,
                   struct cgroup_mgctx *mgctx)
{
        struct task_struct *task;

        /*
         * The following thread iteration should be inside an RCU critical
         * section to prevent tasks from being freed while taking the snapshot.
         * spin_lock_irq() implies RCU critical section here.
         */
        spin_lock_irq(&css_set_lock);
        if (!threadgroup) {
                /* single task: only @leader itself is a target */
                cgroup_migrate_add_task(leader, mgctx);
        } else {
                /* whole process: every thread in @leader's group */
                task = leader;
                do {
                        cgroup_migrate_add_task(task, mgctx);
                } while_each_thread(leader, task);
        }
        spin_unlock_irq(&css_set_lock);

        return cgroup_migrate_execute(mgctx);
}

/**
 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
 * @dst_cgrp: the cgroup to attach to
 * @leader: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
 */
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
                       bool threadgroup)
{
        DEFINE_CGROUP_MGCTX(mgctx);
        struct task_struct *task;
        int ret;

        /* look up all src csets */
        spin_lock_irq(&css_set_lock);
        rcu_read_lock();
        if (!threadgroup) {
                cgroup_migrate_add_src(task_css_set(leader), dst_cgrp, &mgctx);
        } else {
                task = leader;
                do {
                        cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
                                               &mgctx);
                } while_each_thread(leader, task);
        }
        rcu_read_unlock();
        spin_unlock_irq(&css_set_lock);

        /* prepare dst csets and commit */
        ret = cgroup_migrate_prepare_dst(&mgctx);
        if (!ret)
                ret = cgroup_migrate(leader, threadgroup, &mgctx);

        /* the preloaded csets are released whether or not migration worked */
        cgroup_migrate_finish(&mgctx);

        if (ret)
                return ret;

        TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);
        return 0;
}

/*
 * cgroup_procs_write_start - resolve and lock the target of a procs write
 * @buf: string holding the PID to migrate; 0 selects the current task
 * @threadgroup: resolve the target to its thread group leader
 * @threadgroup_locked: out: whether cgroup_threadgroup_rwsem was
 *                      write-locked by this call
 *
 * Must be called with cgroup_mutex held.
 *
 * On success returns the target task with a reference held and the attach
 * locks taken; the reference is released by cgroup_procs_write_finish().
 *
 * On failure returns an ERR_PTR():
 *   -EINVAL  @buf isn't a non-negative integer (nothing was locked and
 *            *@threadgroup_locked is not written), or the task may not be
 *            migrated;
 *   -ESRCH   no task with that PID was found.
 * On the latter two failures the attach locks are dropped again and
 * *@threadgroup_locked is reset to false.
 */
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
                                             bool *threadgroup_locked)
{
        struct task_struct *tsk;
        pid_t pid;

        if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
                return ERR_PTR(-EINVAL);

        /*
         * If we migrate a single thread, we don't care about threadgroup
         * stability. If the thread is `current`, it won't exit(2) under our
         * hands or change PID through exec(2). We exclude
         * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
         * callers by cgroup_mutex.
         * Therefore, we can skip the global lock.
         */
        lockdep_assert_held(&cgroup_mutex);
        /* only "current as a single thread" (pid == 0, !threadgroup) skips the rwsem */
        *threadgroup_locked = pid || threadgroup;
        cgroup_attach_lock(*threadgroup_locked);

        rcu_read_lock();
        if (pid) {
                tsk = find_task_by_vpid(pid);
                if (!tsk) {
                        tsk = ERR_PTR(-ESRCH);
                        goto out_unlock_threadgroup;
                }
        } else {
                tsk = current;
        }

        if (threadgroup)
                tsk = tsk->group_leader;

        /*
         * kthreads may acquire PF_NO_SETAFFINITY during initialization.
         * If userland migrates such a kthread to a non-root cgroup, it can
         * become trapped in a cpuset, or RT kthread may be born in a
         * cgroup with no rt_runtime allocated.  Just say no.
         */
        if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
                tsk = ERR_PTR(-EINVAL);
                goto out_unlock_threadgroup;
        }

        /* success: pin the task; the attach locks stay held for the caller */
        get_task_struct(tsk);
        goto out_unlock_rcu;

out_unlock_threadgroup:
        /* failure after locking: undo cgroup_attach_lock() and tell the caller */
        cgroup_attach_unlock(*threadgroup_locked);
        *threadgroup_locked = false;
out_unlock_rcu:
        rcu_read_unlock();
        return tsk;
}

/*
 * Counterpart of cgroup_procs_write_start(): drops the task reference,
 * releases the attach locks and then runs every subsystem's
 * ->post_attach() callback, if it has one.
 */
void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
{
        struct cgroup_subsys *subsys;
        int idx;

        /* release reference from cgroup_procs_write_start() */
        put_task_struct(task);

        cgroup_attach_unlock(threadgroup_locked);

        /* post_attach callbacks run after the attach locks are gone */
        for_each_subsys(subsys, idx) {
                if (!subsys->post_attach)
                        continue;
                subsys->post_attach();
        }
}

/*
 * Print the names of the subsystems selected by @ss_mask to @seq,
 * separated by single spaces and terminated by a newline.  Prints nothing
 * at all if no subsystem is selected.
 */
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
{
        struct cgroup_subsys *ss;
        bool any = false;
        int ssid;

        do_each_subsys_mask(ss, ssid, ss_mask) {
                /* a separator goes before every name except the first */
                if (any)
                        seq_putc(seq, ' ');
                else
                        any = true;
                seq_puts(seq, ss->name);
        } while_each_subsys_mask();

        if (any)
                seq_putc(seq, '\n');
}

/*
 * seq_file show handler: print the controllers which are enabled from the
 * parent, i.e. the mask returned by cgroup_control().  Always returns 0.
 */
static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
        cgroup_print_ss_mask(seq, cgroup_control(seq_css(seq)->cgroup));

        return 0;
}

/*
 * seq_file show handler: print the controllers which are enabled for a
 * given cgroup's children, i.e. its ->subtree_control mask.  Always
 * returns 0.
 */
static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
        cgroup_print_ss_mask(seq, seq_css(seq)->cgroup->subtree_control);

        return 0;
}

/**
 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
 * @cgrp: root of the subtree to update csses for
 *
 * @cgrp's control masks have changed and its subtree's css associations
 * need to be updated accordingly.  This function looks up all css_sets
 * which are attached to the subtree, creates the matching updated css_sets
 * and migrates the tasks to the new ones.
 *
 * The caller must hold cgroup_mutex.
 *
 * Return: 0 on success, otherwise the error returned by
 * cgroup_migrate_prepare_dst() or cgroup_migrate_execute().
 */
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
        DEFINE_CGROUP_MGCTX(mgctx);
        struct cgroup_subsys_state *d_css;
        struct cgroup *dsct;
        struct css_set *src_cset;
        bool has_tasks;
        int ret;

        lockdep_assert_held(&cgroup_mutex);

        /* look up all csses currently attached to @cgrp's subtree */
        spin_lock_irq(&css_set_lock);
        cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
                struct cgrp_cset_link *link;

                /*
                 * As cgroup_update_dfl_csses() is only called by
                 * cgroup_apply_control(). The csses associated with the
                 * given cgrp will not be affected by changes made to
                 * its subtree_control file. We can skip them.
                 */
                if (dsct == cgrp)
                        continue;

                /* register every css_set linked to this descendant as a source */
                list_for_each_entry(link, &dsct->cset_links, cset_link)
                        cgroup_migrate_add_src(link->cset, dsct, &mgctx);
        }
        spin_unlock_irq(&css_set_lock);

        /*
         * We need to write-lock threadgroup_rwsem while migrating tasks.
         * However, if there are no source csets for @cgrp, changing its
         * controllers isn't gonna produce any task migrations and the
         * write-locking can be skipped safely.
         */
        has_tasks = !list_empty(&mgctx.preloaded_src_csets);
        cgroup_attach_lock(has_tasks);

        /* NULL dst indicates self on default hierarchy */
        ret = cgroup_migrate_prepare_dst(&mgctx);
        if (ret)
                goto out_finish;

        /* queue every task of every preloaded source css_set for migration */
        spin_lock_irq(&css_set_lock);
        list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
                            mg_src_preload_node) {
                struct task_struct *task, *ntask;

                /* all tasks in src_csets need to be migrated */
                list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
                        cgroup_migrate_add_task(task, &mgctx);
        }
        spin_unlock_irq(&css_set_lock);

        ret = cgroup_migrate_execute(&mgctx);
out_finish:
        /* common exit: reached on success as well as on prepare failure */
        cgroup_migrate_finish(&mgctx);
        /* must mirror the cgroup_attach_lock(has_tasks) call above */
        cgroup_attach_unlock(has_tasks);
        return ret;
}

/**
 * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
 * @cgrp: root of the target subtree
 *
 * Because css offlining is asynchronous, userland may try to re-enable a
 * controller while the previous css is still around.  This function grabs
 * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
 *
 * The function returns only after a full pass over the subtree found no
 * css whose refcnt is dying; the lock taken by cgroup_lock() is still held
 * at that point.
 */
void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
        __acquires(&cgroup_mutex)
{
        struct cgroup *dsct;
        struct cgroup_subsys_state *d_css;
        struct cgroup_subsys *ss;
        int ssid;

restart:
        cgroup_lock();

        cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
                for_each_subsys(ss, ssid) {
                        struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
                        DEFINE_WAIT(wait);

                        /* nothing to wait for unless this css is being killed */
                        if (!css || !percpu_ref_is_dying(&css->refcnt))
                                continue;

                        /*
                         * Pin @dsct and queue ourselves on its offline
                         * waitqueue before the lock is dropped.
                         */
                        cgroup_get_live(dsct);
                        prepare_to_wait(&dsct->offline_waitq, &wait,
                                        TASK_UNINTERRUPTIBLE);

                        cgroup_unlock();
                        schedule();
                        finish_wait(&dsct->offline_waitq, &wait);

                        /* the lock was dropped, so rescan from the top */
                        cgroup_put(dsct);
                        goto restart;
                }
        }
}

/**
 * cgroup_save_control - save control masks and dom_cgrp of a subtree
 * @cgrp: root of the target subtree
 *
 * For @cgrp and each of its live descendants, copy ->subtree_control,
 * ->subtree_ss_mask and ->dom_cgrp into the matching old_ prefixed fields.
 * cgroup_restore_control() reads those fields back.
 */
static void cgroup_save_control(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pos_css;
        struct cgroup *pos;

        cgroup_for_each_live_descendant_pre(pos, pos_css, cgrp) {
                pos->old_dom_cgrp = pos->dom_cgrp;
                pos->old_subtree_ss_mask = pos->subtree_ss_mask;
                pos->old_subtree_control = pos->subtree_control;
        }
}

/**
 * cgroup_propagate_control - refresh control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp and its live descendants in pre-order.  Each cgroup has its
 * ->subtree_control trimmed to what cgroup_control() reports for it, and
 * its ->subtree_ss_mask recomputed from the trimmed value, so that
 * descendants don't have unavailable controllers enabled.
 */
static void cgroup_propagate_control(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pos_css;
        struct cgroup *pos;

        cgroup_for_each_live_descendant_pre(pos, pos_css, cgrp) {
                /* drop anything cgroup_control() does not offer */
                pos->subtree_control &= cgroup_control(pos);

                /* recompute the effective mask from the trimmed value */
                pos->subtree_ss_mask = cgroup_calc_subtree_ss_mask(
                                                pos->subtree_control,
                                                cgroup_ss_mask(pos));
        }
}

/**
 * cgroup_restore_control - restore control masks and dom_cgrp of a subtree
 * @cgrp: root of the target subtree
 *
 * For @cgrp and each of its live descendants, visited in post-order, copy
 * the old_ prefixed fields back into ->subtree_control, ->subtree_ss_mask
 * and ->dom_cgrp.  This is the inverse of cgroup_save_control().
 */
static void cgroup_restore_control(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pos_css;
        struct cgroup *pos;

        cgroup_for_each_live_descendant_post(pos, pos_css, cgrp) {
                pos->dom_cgrp = pos->old_dom_cgrp;
                pos->subtree_ss_mask = pos->old_subtree_ss_mask;
                pos->subtree_control = pos->old_subtree_control;
        }
}

/*
 * Decide whether @css should be visible in its cgroup.  It is visible when
 * its subsystem bit is set in cgroup_control(), or when the bit is set in
 * cgroup_ss_mask() and the subsystem is implicit_on_dfl on a cgroup for
 * which cgroup_on_dfl() holds.
 */
static bool css_visible(struct cgroup_subsys_state *css)
{
        struct cgroup *cgrp = css->cgroup;
        struct cgroup_subsys *ss = css->ss;
        int bit = 1 << ss->id;

        if (cgroup_control(cgrp) & bit)
                return true;

        return (cgroup_ss_mask(cgrp) & bit) &&
               cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
}

/**
 * cgroup_apply_control_enable - enable or show csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree in pre-order.  For every subsystem present in a
 * cgroup's cgroup_ss_mask(), create the css if it is missing and populate
 * its directory if css_visible() says it should be shown.  A css is created
 * invisible if it's being implicitly enabled through dependency and is made
 * visible when the userland explicitly enables it.
 *
 * Returns 0 on success, -errno on failure.  On failure, csses which have
 * been processed already aren't cleaned up.  The caller is responsible for
 * cleaning up with cgroup_apply_control_disable().
 */
static int cgroup_apply_control_enable(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pos_css;
        struct cgroup_subsys *ss;
        struct cgroup *pos;
        int ssid;

        cgroup_for_each_live_descendant_pre(pos, pos_css, cgrp) {
                for_each_subsys(ss, ssid) {
                        struct cgroup_subsys_state *css = cgroup_css(pos, ss);
                        int err;

                        /* subsystem not wanted on this cgroup */
                        if (!(cgroup_ss_mask(pos) & (1 << ss->id)))
                                continue;

                        if (!css) {
                                css = css_create(pos, ss);
                                if (IS_ERR(css))
                                        return PTR_ERR(css);
                        }

                        WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));

                        if (!css_visible(css))
                                continue;

                        err = css_populate_dir(css);
                        if (err)
                                return err;
                }
        }

        return 0;
}

/**
 * cgroup_apply_control_disable - kill or hide csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree in post-order.  A css which has a parent and whose
 * subsystem is no longer in cgroup_ss_mask() is killed; any other css which
 * css_visible() reports as not visible has its directory cleared and is
 * reset through ->css_reset(), when the subsystem provides it.
 *
 * A css is hidden when the userland requests it to be disabled while other
 * subsystems are still depending on it.  The css must not actively control
 * resources and be in the vanilla state if it's made visible again later.
 * Controllers which may be depended upon should provide ->css_reset() for
 * this purpose.
 */
static void cgroup_apply_control_disable(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pos_css;
        struct cgroup_subsys *ss;
        struct cgroup *pos;
        int ssid;

        cgroup_for_each_live_descendant_post(pos, pos_css, cgrp) {
                for_each_subsys(ss, ssid) {
                        struct cgroup_subsys_state *css = cgroup_css(pos, ss);

                        if (!css)
                                continue;

                        WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));

                        /* no longer wanted at all: kill it */
                        if (css->parent &&
                            !(cgroup_ss_mask(pos) & (1 << ss->id))) {
                                kill_css(css);
                                continue;
                        }

                        if (css_visible(css))
                                continue;

                        /* still needed but must be hidden: clear and reset */
                        css_clear_dir(css);
                        if (ss->css_reset)
                                ss->css_reset(css);
                }
        }
}

/**
 * cgroup_apply_control - apply control mask updates to the subtree
 * @cgrp: root of the target subtree
 *
 * subsystems can be enabled and disabled in a subtree using the following
 * steps.
 *
 * 1. Call cgroup_save_control() to stash the current state.
 * 2. Update ->subtree_control masks in the subtree as desired.
 * 3. Call cgroup_apply_control() to apply the changes.
 * 4. Optionally perform other related operations.
 * 5. Call cgroup_finalize_control() to finish up.
 *
 * This function implements step 3: it propagates the mask changes
 * throughout @cgrp's subtree, enables csses accordingly and then performs
 * process migrations.
 *
 * Returns 0 on success, otherwise the error from
 * cgroup_apply_control_enable() or cgroup_update_dfl_csses().
 */
static int cgroup_apply_control(struct cgroup *cgrp)
{
        int err;

        cgroup_propagate_control(cgrp);

        err = cgroup_apply_control_enable(cgrp);

        /*
         * Once enabling succeeded, cgroup_e_css_by_mask() results reflect
         * the new csses, so cgroup_update_dfl_csses() can properly update
         * css associations of all tasks in the subtree.
         */
        if (!err)
                err = cgroup_update_dfl_csses(cgrp);

        return err;
}

/**
 * cgroup_finalize_control - finalize control mask update
 * @cgrp: root of the target subtree
 * @ret: the result of the update
 *
 * Last step of a control mask update; see cgroup_apply_control() for the
 * whole sequence.  When @ret is non-zero the saved masks are restored and
 * propagated again first.  In both cases cgroup_apply_control_disable() is
 * then run on the subtree.
 */
static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
{
        /* the update failed: roll the masks back before the disable pass */
        if (ret != 0) {
                cgroup_restore_control(cgrp);
                cgroup_propagate_control(cgrp);
        }

        cgroup_apply_control_disable(cgrp);
}

/*
 * Check whether the controllers in @enable may be turned on in @cgrp's
 * subtree_control.
 *
 * Returns 0 if allowed, -EOPNOTSUPP if @cgrp's domain is invalid or domain
 * controllers are requested inside a thread subtree, and -EBUSY if @cgrp
 * has tasks which would compete against its children.
 */
static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
{
        /* the requested controllers which are not threaded ones */
        u16 domain_bits = enable & ~cgrp_dfl_threaded_ss_mask;

        /* if nothing is getting enabled, nothing to worry about */
        if (!enable)
                return 0;

        /* can @cgrp host any resources? */
        if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
                return -EOPNOTSUPP;

        /* mixables don't care */
        if (cgroup_is_mixable(cgrp))
                return 0;

        if (domain_bits) {
                /* can't enable domain controllers inside a thread subtree */
                if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
                        return -EOPNOTSUPP;
        } else if (cgroup_can_be_thread_root(cgrp) ||
                   cgroup_is_threaded(cgrp)) {
                /*
                 * Threaded controllers can handle internal competitions
                 * and are always allowed inside a (prospective) thread
                 * subtree.
                 */
                return 0;
        }

        /*
         * Controllers can't be enabled for a cgroup with tasks to avoid
         * child cgroups competing against tasks.
         */
        return cgroup_has_tasks(cgrp) ? -EBUSY : 0;
}

/*
 * change the enabled child controllers for a cgroup in the default hierarchy
 *
 * Write handler for the subtree_control file.  @buf holds a space separated
 * list of controller names, each prefixed with '+' (enable) or '-'
 * (disable).  When a name appears more than once the last occurrence wins.
 *
 * Returns @nbytes on success.  Errors produced here: -EINVAL for a token
 * with a bad prefix or one for which the subsystem loop ends without a
 * match, -ENODEV if cgroup_kn_lock_live() fails, -ENOENT if a controller to
 * enable is not in cgroup_control(), -EBUSY if a controller to disable is
 * still set in a live child's subtree_control.  Errors from
 * cgroup_vet_subtree_control_enable() and cgroup_apply_control() are passed
 * through.
 */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                            char *buf, size_t nbytes,
                                            loff_t off)
{
        u16 enable = 0, disable = 0;
        struct cgroup *cgrp, *child;
        struct cgroup_subsys *ss;
        char *tok;
        int ssid, ret;

        /*
         * Parse input - space separated list of subsystem names prefixed
         * with either + or -.
         */
        buf = strstrip(buf);
        while ((tok = strsep(&buf, " "))) {
                /* consecutive separators yield empty tokens; ignore them */
                if (tok[0] == '\0')
                        continue;
                do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
                        /* tok + 1 skips the one-character +/- prefix */
                        if (!cgroup_ssid_enabled(ssid) ||
                            strcmp(tok + 1, ss->name))
                                continue;

                        /* a later token overrides an earlier opposite one */
                        if (*tok == '+') {
                                enable |= 1 << ssid;
                                disable &= ~(1 << ssid);
                        } else if (*tok == '-') {
                                disable |= 1 << ssid;
                                enable &= ~(1 << ssid);
                        } else {
                                return -EINVAL;
                        }
                        break;
                } while_each_subsys_mask();
                /* the loop ended without breaking out on a match */
                if (ssid == CGROUP_SUBSYS_COUNT)
                        return -EINVAL;
        }

        cgrp = cgroup_kn_lock_live(of->kn, true);
        if (!cgrp)
                return -ENODEV;

        /* drop requests which are already satisfied and validate the rest */
        for_each_subsys(ss, ssid) {
                if (enable & (1 << ssid)) {
                        /* already enabled, nothing to do for this one */
                        if (cgrp->subtree_control & (1 << ssid)) {
                                enable &= ~(1 << ssid);
                                continue;
                        }

                        /* can't hand out what cgroup_control() doesn't offer */
                        if (!(cgroup_control(cgrp) & (1 << ssid))) {
                                ret = -ENOENT;
                                goto out_unlock;
                        }
                } else if (disable & (1 << ssid)) {
                        /* already disabled, nothing to do for this one */
                        if (!(cgrp->subtree_control & (1 << ssid))) {
                                disable &= ~(1 << ssid);
                                continue;
                        }

                        /* a child has it enabled? */
                        cgroup_for_each_live_child(child, cgrp) {
                                if (child->subtree_control & (1 << ssid)) {
                                        ret = -EBUSY;
                                        goto out_unlock;
                                }
                        }
                }
        }

        /* everything requested is already in effect */
        if (!enable && !disable) {
                ret = 0;
                goto out_unlock;
        }

        ret = cgroup_vet_subtree_control_enable(cgrp, enable);
        if (ret)
                goto out_unlock;

        /* save and update control masks and prepare csses */
        cgroup_save_control(cgrp);

        cgrp->subtree_control |= enable;
        cgrp->subtree_control &= ~disable;

        /* finalize runs on failure too, with the failing @ret */
        ret = cgroup_apply_control(cgrp);
        cgroup_finalize_control(cgrp, ret);
        if (ret)
                goto out_unlock;

        kernfs_activate(cgrp->kn);
out_unlock:
        cgroup_kn_unlock(of->kn);
        return ret ?: nbytes;
}

/**
 * cgroup_enable_threaded - make @cgrp threaded
 * @cgrp: the target cgroup
 *
 * Called when "threaded" is written to the cgroup.type interface file and
 * tries to make @cgrp threaded and join the parent's resource domain.
 * This function is never called on the root cgroup as cgroup.type doesn't
 * exist on it.
 */
static int cgroup_enable_threaded(struct cgroup *cgrp)
{
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup *dom_cgrp = parent->dom_cgrp;
        struct cgroup *dsct;
        struct cgroup_subsys_state *d_css;
        int ret;

        lockdep_assert_held(&cgroup_mutex);

        /* noop if already threaded */
        if (cgroup_is_threaded(cgrp))
                return 0;

        /*
         * If @cgroup is populated or has domain controllers enabled, it
         * can't be switched.  While the below cgroup_can_be_thread_root()
         * test can catch the same conditions, that's only when @parent is
         * not mixable, so let's check it explicitly.
         */
        if (cgroup_is_populated(cgrp) ||
            cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
                return -EOPNOTSUPP;

        /* we're joining the parent's domain, ensure its validity */
        if (!cgroup_is_valid_domain(dom_cgrp) ||
            !cgroup_can_be_thread_root(dom_cgrp))
                return -EOPNOTSUPP;

        /*
         * The following shouldn't cause actual migrations and should
         * always succeed.
         */
        cgroup_save_control(cgrp);

        cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
                if (dsct == cgrp || cgroup_is_threaded(dsct))
                        dsct->dom_cgrp = dom_cgrp;

        ret = cgroup_apply_control(cgrp);
        if (!ret)
                parent->nr_threaded_children++;

        cgroup_finalize_control(cgrp, ret);
        return ret;
}

/*
 * seq_file show handler for the cgroup type: prints exactly one of
 * "threaded", "domain invalid", "domain threaded" or "domain".
 */
static int cgroup_type_show(struct seq_file *seq, void *v)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        const char *type;

        /* the checks are ordered; the first one that matches wins */
        if (cgroup_is_threaded(cgrp))
                type = "threaded\n";
        else if (!cgroup_is_valid_domain(cgrp))
                type = "domain invalid\n";
        else if (cgroup_is_thread_root(cgrp))
                type = "domain threaded\n";
        else
                type = "domain\n";

        seq_puts(seq, type);
        return 0;
}

/*
 * Write handler for the cgroup type.  The only accepted input is
 * "threaded" (surrounding whitespace is stripped), which switches the
 * cgroup through cgroup_enable_threaded().
 *
 * Returns @nbytes on success, -EINVAL for any other input, -ENOENT if
 * cgroup_kn_lock_live() fails, or the error from cgroup_enable_threaded().
 */
static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
                                 size_t nbytes, loff_t off)
{
        struct cgroup *cgrp;
        int err;

        /* only switching to threaded mode is supported */
        if (strcmp(strstrip(buf), "threaded") != 0)
                return -EINVAL;

        /* drain dying csses before we re-apply (threaded) subtree control */
        cgrp = cgroup_kn_lock_live(of->kn, true);
        if (!cgrp)
                return -ENOENT;

        /* threaded can only be enabled */
        err = cgroup_enable_threaded(cgrp);

        cgroup_kn_unlock(of->kn);

        if (err)
                return err;
        return nbytes;
}

/*
 * seq_file show handler for ->max_descendants: prints "max" when the value
 * is INT_MAX and the decimal number otherwise.
 */
static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
{
        int limit = READ_ONCE(seq_css(seq)->cgroup->max_descendants);

        if (limit != INT_MAX) {
                seq_printf(seq, "%d\n", limit);
                return 0;
        }

        seq_puts(seq, "max\n");
        return 0;
}

/*
 * Write handler for ->max_descendants.  Accepts "max", which is stored as
 * INT_MAX, or an integer parsed by kstrtoint() with base 0.
 *
 * Returns @nbytes on success, the kstrtoint() error for unparsable input,
 * -ERANGE for a negative value and -ENOENT if cgroup_kn_lock_live() fails.
 */
static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes, loff_t off)
{
        char *val = strstrip(buf);
        struct cgroup *cgrp;
        int limit = INT_MAX;
        int err;

        /* anything other than the literal "max" has to parse as a number */
        if (strcmp(val, "max") != 0) {
                err = kstrtoint(val, 0, &limit);
                if (err)
                        return err;
        }

        if (limit < 0)
                return -ERANGE;

        cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENOENT;

        cgrp->max_descendants = limit;

        cgroup_kn_unlock(of->kn);

        return nbytes;
}

/*
 * seq_file show handler for ->max_depth: prints "max" when the value is
 * INT_MAX and the decimal number otherwise.
 */
static int cgroup_max_depth_show(struct seq_file *seq, void *v)
{
        int limit = READ_ONCE(seq_css(seq)->cgroup->max_depth);

        if (limit != INT_MAX) {
                seq_printf(seq, "%d\n", limit);
                return 0;
        }

        seq_puts(seq, "max\n");
        return 0;
}

/*
 * Write handler for ->max_depth.  Accepts "max", which is stored as
 * INT_MAX, or an integer parsed by kstrtoint() with base 0.
 *
 * Returns @nbytes on success, the kstrtoint() error for unparsable input,
 * -ERANGE for a negative value and -ENOENT if cgroup_kn_lock_live() fails.
 */
static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
                                      char *buf, size_t nbytes, loff_t off)
{
        char *val = strstrip(buf);
        struct cgroup *cgrp;
        int limit = INT_MAX;
        int err;

        /* anything other than the literal "max" has to parse as a number */
        if (strcmp(val, "max") != 0) {
                err = kstrtoint(val, 0, &limit);
                if (err)
                        return err;
        }

        if (limit < 0)
                return -ERANGE;

        cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENOENT;

        cgrp->max_depth = limit;

        cgroup_kn_unlock(of->kn);

        return nbytes;
}

/*
 * seq_file show handler for the events file: one "populated" line from
 * cgroup_is_populated() and one "frozen" line from the CGRP_FROZEN flag.
 */
static int cgroup_events_show(struct seq_file *seq, void *v)
{
        struct cgroup *cg = seq_css(seq)->cgroup;

        seq_printf(seq, "populated %d\n",
                   cgroup_is_populated(cg));
        seq_printf(seq, "frozen %d\n",
                   test_bit(CGRP_FROZEN, &cg->flags));

        return 0;
}

/*
 * seq_file show handler for the stat file: prints the cgroup's
 * ->nr_descendants and ->nr_dying_descendants counters, one per line.
 */
static int cgroup_stat_show(struct seq_file *seq, void *v)
{
        struct cgroup *cg = seq_css(seq)->cgroup;

        seq_printf(seq, "nr_descendants %d\n", cg->nr_descendants);
        seq_printf(seq, "nr_dying_descendants %d\n", cg->nr_dying_descendants);

        return 0;
}

#ifdef CONFIG_CGROUP_SCHED
/**
 * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Look up @cgrp's css for @ss under the RCU read lock and take a reference
 * on it with css_tryget_online().  Returns the css on success; %NULL if
 * there is no css or the tryget failed.
 */
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
                                                     struct cgroup_subsys *ss)
{
        struct cgroup_subsys_state *found = NULL;
        struct cgroup_subsys_state *css;

        rcu_read_lock();
        css = cgroup_css(cgrp, ss);
        if (css && css_tryget_online(css))
                found = css;
        rcu_read_unlock();

        return found;
}

/*
 * Let subsystem @ssid append its extra stats for the seq_file's cgroup.
 * Returns 0 without printing anything when the subsystem has no
 * ->css_extra_stat_show() or cgroup_tryget_css() finds no css; otherwise
 * returns the callback's result.
 */
static int cgroup_extra_stat_show(struct seq_file *seq, int ssid)
{
        struct cgroup_subsys *ss = cgroup_subsys[ssid];
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        struct cgroup_subsys_state *css;
        int ret = 0;

        if (ss->css_extra_stat_show) {
                css = cgroup_tryget_css(cgrp, ss);
                if (css) {
                        ret = ss->css_extra_stat_show(seq, css);
                        /* drop the reference taken by cgroup_tryget_css() */
                        css_put(css);
                }
        }

        return ret;
}

/*
 * Let subsystem @ssid print its local stats for @cgrp.  Returns 0 without
 * printing anything when the subsystem has no ->css_local_stat_show() or
 * cgroup_tryget_css() finds no css; otherwise returns the callback's
 * result.
 */
static int cgroup_local_stat_show(struct seq_file *seq,
                                  struct cgroup *cgrp, int ssid)
{
        struct cgroup_subsys *ss = cgroup_subsys[ssid];
        struct cgroup_subsys_state *css;
        int ret = 0;

        if (ss->css_local_stat_show) {
                css = cgroup_tryget_css(cgrp, ss);
                if (css) {
                        ret = ss->css_local_stat_show(seq, css);
                        /* drop the reference taken by cgroup_tryget_css() */
                        css_put(css);
                }
        }

        return ret;
}
#endif

/*
 * seq_file show handler for cpu.stat: the base cputime stats always, plus
 * the cpu controller's extra stats when CONFIG_CGROUP_SCHED is set.
 */
static int cpu_stat_show(struct seq_file *seq, void *v)
{
        cgroup_base_stat_cputime_show(seq);

#ifdef CONFIG_CGROUP_SCHED
        return cgroup_extra_stat_show(seq, cpu_cgrp_id);
#else
        return 0;
#endif
}

/*
 * seq_file show handler for the cpu controller's local stats.  Prints
 * nothing and returns 0 unless CONFIG_CGROUP_SCHED is set.
 */
static int cpu_local_stat_show(struct seq_file *seq, void *v)
{
        struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;

#ifdef CONFIG_CGROUP_SCHED
        return cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id);
#else
        return 0;
#endif
}

#ifdef CONFIG_PSI
/* seq_file show handler: PSI_IO pressure of the seq_file's cgroup */
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
        return psi_show(seq, cgroup_psi(seq_css(seq)->cgroup), PSI_IO);
}
/* seq_file show handler: PSI_MEM pressure of the seq_file's cgroup */
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
        return psi_show(seq, cgroup_psi(seq_css(seq)->cgroup), PSI_MEM);
}
/* seq_file show handler: PSI_CPU pressure of the seq_file's cgroup */
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
        return psi_show(seq, cgroup_psi(seq_css(seq)->cgroup), PSI_CPU);
}

/*
 * Common write handler behind the per-resource pressure files.  Creates a
 * psi trigger for resource @res from @buf and publishes it in the open
 * file's context.
 *
 * Returns @nbytes on success, -ENODEV if cgroup_kn_lock_live() fails,
 * -EBUSY if this open file already has a trigger, or the error encoded in
 * the pointer returned by psi_trigger_create().
 */
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
                              size_t nbytes, enum psi_res res)
{
        struct cgroup_file_ctx *ctx = of->priv;
        struct psi_trigger *trigger;
        struct cgroup *cgrp;
        ssize_t ret = nbytes;

        cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENODEV;

        /* take our own reference, then drop the kn lock for the rest */
        cgroup_get(cgrp);
        cgroup_kn_unlock(of->kn);

        /* Allow only one trigger per file descriptor */
        if (ctx->psi.trigger) {
                ret = -EBUSY;
                goto out_put;
        }

        trigger = psi_trigger_create(cgroup_psi(cgrp), buf, res, of->file, of);
        if (IS_ERR(trigger)) {
                ret = PTR_ERR(trigger);
                goto out_put;
        }

        smp_store_release(&ctx->psi.trigger, trigger);
out_put:
        cgroup_put(cgrp);
        return ret;
}

/* Write handler: forwards to pressure_write() for PSI_IO; @off is unused. */
static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
                                          char *buf, size_t nbytes,
                                          loff_t off)
{
        return pressure_write(of, buf, nbytes, PSI_IO);
}

/* Write handler: forwards to pressure_write() for PSI_MEM; @off is unused. */
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
                                          char *buf, size_t nbytes,
                                          loff_t off)
{
        return pressure_write(of, buf, nbytes, PSI_MEM);
}

/* Write handler: forwards to pressure_write() for PSI_CPU; @off is unused. */
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
                                          char *buf, size_t nbytes,
                                          loff_t off)
{
        return pressure_write(of, buf, nbytes, PSI_CPU);
}

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/* seq_file show handler: PSI_IRQ pressure of the seq_file's cgroup */
static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
{
        return psi_show(seq, cgroup_psi(seq_css(seq)->cgroup), PSI_IRQ);
}

/* Write handler: forwards to pressure_write() for PSI_IRQ; @off is unused. */
static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
                                         char *buf, size_t nbytes,
                                         loff_t off)
{
        return pressure_write(of, buf, nbytes, PSI_IRQ);
}
#endif

/* seq_file show handler: print the ->enabled state of the cgroup's psi group */
static int cgroup_pressure_show(struct seq_file *seq, void *v)
{
        seq_printf(seq, "%d\n", cgroup_psi(seq_css(seq)->cgroup)->enabled);

        return 0;
}

/*
 * Write handler which turns pressure accounting for a cgroup on (1) or
 * off (0).  When the state actually changes, the per-resource pressure
 * files are shown or hidden to match and, on enable, the psi group is
 * restarted.
 *
 * Returns @nbytes on success, the kstrtoint() error for unparsable input,
 * -ERANGE for values other than 0 and 1, and -ENOENT if
 * cgroup_kn_lock_live() fails.
 */
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
                                     char *buf, size_t nbytes,
                                     loff_t off)
{
        struct psi_group *psi;
        struct cgroup *cgrp;
        int enable;
        int err;
        int i;

        err = kstrtoint(strstrip(buf), 0, &enable);
        if (err)
                return err;

        if (enable != 0 && enable != 1)
                return -ERANGE;

        cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENOENT;

        psi = cgroup_psi(cgrp);

        /* already in the requested state */
        if (psi->enabled == enable)
                goto out_unlock;

        /* show or hide {cpu,memory,io,irq}.pressure files */
        for (i = 0; i < NR_PSI_RESOURCES; i++)
                cgroup_file_show(&cgrp->psi_files[i], enable);

        psi->enabled = enable;
        if (enable)
                psi_cgroup_restart(psi);

out_unlock:
        cgroup_kn_unlock(of->kn);

        return nbytes;
}

/*
 * Poll handler for the pressure files: delegates to psi_trigger_poll() on
 * the trigger slot stored in this open file's context.
 */
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
                                          poll_table *pt)
{
        struct cgroup_file_ctx *ctx = of->priv;

        return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}

/*
 * Release handler for the pressure files: hands the trigger stored in this
 * open file's context to psi_trigger_destroy().
 */
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
        struct cgroup_file_ctx *ctx = of->priv;

        psi_trigger_destroy(ctx->psi.trigger);
}

/*
 * Report whether cgroup pressure tracking is in use: false when the
 * psi_disabled static branch is taken, otherwise true unless the
 * OPT_FEATURE_PRESSURE bit is set in cgroup_feature_disable_mask.
 */
bool cgroup_psi_enabled(void)
{
        if (static_branch_likely(&psi_disabled))
                return false;

        return !(cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE));
}

#else /* CONFIG_PSI */
/* Variant built when CONFIG_PSI is not set: always reports false. */
bool cgroup_psi_enabled(void)
{
        return false;
}

#endif /* CONFIG_PSI */

/* seq_file show handler: print the cgroup's ->freezer.freeze setting */
static int cgroup_freeze_show(struct seq_file *seq, void *v)
{
        seq_printf(seq, "%d\n", seq_css(seq)->cgroup->freezer.freeze);

        return 0;
}

/*
 * Write handler for the freeze setting: parses 0 or 1 from @buf and passes
 * it to cgroup_freeze().
 *
 * Returns @nbytes on success, the kstrtoint() error for unparsable input,
 * -ERANGE for values other than 0 and 1, and -ENOENT if
 * cgroup_kn_lock_live() fails.
 */
static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
                                   char *buf, size_t nbytes, loff_t off)
{
        struct cgroup *cgrp;
        int freeze;
        int err;

        err = kstrtoint(strstrip(buf), 0, &freeze);
        if (err)
                return err;

        if (freeze != 0 && freeze != 1)
                return -ERANGE;

        cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENOENT;

        cgroup_freeze(cgrp, freeze);

        cgroup_kn_unlock(of->kn);

        return nbytes;
}

static void __cgroup_kill(struct cgroup *cgrp)
{
        struct css_task_iter it;
        struct task_struct *task;

        lockdep_assert_held(&cgroup_mutex);

        spin_lock_irq(&css_set_lock);
        set_bit(CGRP_KILL, &cgrp->flags);
        spin_unlock_irq(&css_set_lock);

        css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
        while ((task = css_task_iter_next(&it))) {
                /* Ignore kernel threads here. */
                if (task->flags & PF_KTHREAD)
                        continue;

                /* Skip tasks that are already dying. */
                if (__fatal_signal_pending(task))
                        continue;

                send_sig(SIGKILL, task, 0);
        }
        css_task_iter_end(&it);

        spin_lock_irq(&css_set_lock);
        clear_bit(CGRP_KILL, &cgrp->flags);
        spin_unlock_irq(&css_set_lock);
}

static void cgroup_kill(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *css;
        struct cgroup *dsct;

        lockdep_assert_held(&cgroup_mutex);

        cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
                __cgroup_kill(dsct);
}

/*
 * Write handler for the kill file.  The only accepted value is 1, which
 * kills the cgroup's subtree through cgroup_kill().
 *
 * Returns @nbytes on success, the kstrtoint() error for unparsable input,
 * -ERANGE for any value other than 1, -ENOENT if cgroup_kn_lock_live()
 * fails and -EOPNOTSUPP when the cgroup is threaded.
 */
static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
                                 size_t nbytes, loff_t off)
{
        ssize_t ret = nbytes;
        struct cgroup *cgrp;
        int kill;
        int err;

        err = kstrtoint(strstrip(buf), 0, &kill);
        if (err)
                return err;

        if (kill != 1)
                return -ERANGE;

        cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENOENT;

        /*
         * Killing is a process directed operation, i.e. the whole thread-group
         * is taken down so act like we do for cgroup.procs and only make this
         * writable in non-threaded cgroups.
         */
        if (cgroup_is_threaded(cgrp))
                ret = -EOPNOTSUPP;
        else
                cgroup_kill(cgrp);

        cgroup_kn_unlock(of->kn);

        return ret;
}

/*
 * Open handler shared by all cgroup files.
 *
 * Allocates the per-open cgroup_file_ctx, records (and takes a reference
 * on) the opener's cgroup namespace, stores the context in @of->priv and
 * then calls the cftype's ->open() callback if there is one.  If that
 * callback fails, the namespace reference and the context are released
 * again.
 *
 * Returns 0 on success, -ENOMEM if the context cannot be allocated, or
 * the error returned by ->open().
 */
static int cgroup_file_open(struct kernfs_open_file *of)
{
        struct cftype *cft = of_cft(of);
        struct cgroup_file_ctx *ctx;
        int err = 0;

        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        ctx->ns = current->nsproxy->cgroup_ns;
        get_cgroup_ns(ctx->ns);
        of->priv = ctx;

        if (cft->open)
                err = cft->open(of);
        if (!err)
                return 0;

        /* ->open() failed: undo what was set up above */
        put_cgroup_ns(ctx->ns);
        kfree(ctx);
        return err;
}

/*
 * Release handler shared by all cgroup files: run the cftype's ->release()
 * callback if present, then drop the cgroup namespace reference held by
 * the per-open context and free the context.
 */
static void cgroup_file_release(struct kernfs_open_file *of)
{
        struct cftype *cft = of_cft(of);
        struct cgroup_file_ctx *file_ctx = of->priv;

        if (cft->release) {
                cft->release(of);
        }

        put_cgroup_ns(file_ctx->ns);
        kfree(file_ctx);
}

/*
 * Write handler shared by all cgroup files.
 *
 * A zero-length write returns 0 immediately.  Otherwise the write is
 * dispatched to the cftype's ->write() if set, else parsed as an integer
 * and handed to ->write_u64() or ->write_s64().  Returns @nbytes on
 * success, -EPERM when namespace delegation forbids the write, -EINVAL
 * when the cftype has no write method, or the parser's/callback's error.
 */
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
                                 size_t nbytes, loff_t off)
{
        struct cgroup_file_ctx *ctx = of->priv;
        struct cgroup *cgrp = of->kn->parent->priv;
        struct cftype *cft = of_cft(of);
        struct cgroup_subsys_state *css;
        int err;

        if (!nbytes)
                return 0;

        /*
         * If namespaces are delegation boundaries, disallow writes to
         * files in a non-init namespace root from inside the namespace
         * except for the files explicitly marked delegatable -
         * cgroup.procs and cgroup.subtree_control.
         */
        if (cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) {
                bool delegatable = cft->flags & CFTYPE_NS_DELEGATABLE;

                if (!delegatable && ctx->ns != &init_cgroup_ns &&
                    ctx->ns->root_cset->dfl_cgrp == cgrp)
                        return -EPERM;
        }

        if (cft->write)
                return cft->write(of, buf, nbytes, off);

        /*
         * kernfs guarantees that a file isn't deleted with operations in
         * flight, which means that the matching css is and stays alive and
         * doesn't need to be pinned.  The RCU locking is not necessary
         * either.  It's just for the convenience of using cgroup_css().
         */
        rcu_read_lock();
        css = cgroup_css(cgrp, cft->ss);
        rcu_read_unlock();

        if (cft->write_u64) {
                unsigned long long val;

                err = kstrtoull(buf, 0, &val);
                if (!err)
                        err = cft->write_u64(css, cft, val);
        } else if (cft->write_s64) {
                long long val;

                err = kstrtoll(buf, 0, &val);
                if (!err)
                        err = cft->write_s64(css, cft, val);
        } else {
                err = -EINVAL;
        }

        if (err)
                return err;
        return nbytes;
}

/*
 * Poll handler shared by all cgroup files: use the cftype's ->poll() when
 * it provides one, otherwise fall back to kernfs_generic_poll().
 */
static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
{
        struct cftype *cft = of_cft(of);

        return cft->poll ? cft->poll(of, pt) : kernfs_generic_poll(of, pt);
}

/* seq_file ->start: forward unconditionally to the cftype's ->seq_start() */
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
        struct cftype *cft = seq_cft(seq);

        return cft->seq_start(seq, ppos);
}

/* seq_file ->next: forward unconditionally to the cftype's ->seq_next() */
static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
        struct cftype *cft = seq_cft(seq);

        return cft->seq_next(seq, v, ppos);
}

static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
        if (seq_cft(seq)->seq_stop)
                seq_cft(seq)->seq_stop(seq, v);
}

/*
 * seq_file ->show shared by all cgroup files.
 *
 * Prefers the cftype's ->seq_show().  Without one, prints the value
 * returned by ->read_u64() or ->read_s64() followed by a newline.
 * Returns -EINVAL if the cftype has none of the three methods.
 */
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
        struct cftype *cft = seq_cft(m);
        struct cgroup_subsys_state *css = seq_css(m);

        if (cft->seq_show)
                return cft->seq_show(m, arg);

        if (cft->read_u64) {
                seq_printf(m, "%llu\n", cft->read_u64(css, cft));
                return 0;
        }

        if (cft->read_s64) {
                seq_printf(m, "%lld\n", cft->read_s64(css, cft));
                return 0;
        }

        return -EINVAL;
}

/*
 * kernfs_ops used for cftypes that do not provide ->seq_start (see the
 * selection in cgroup_init_cftypes()); only ->seq_show is wired up on the
 * read side.
 */
static struct kernfs_ops cgroup_kf_single_ops = {
        /* lifetime */
        .open                   = cgroup_file_open,
        .release                = cgroup_file_release,
        /* read side */
        .seq_show               = cgroup_seqfile_show,
        /* write side */
        .write                  = cgroup_file_write,
        .atomic_write_len       = PAGE_SIZE,
        /* notification */
        .poll                   = cgroup_file_poll,
};

/*
 * kernfs_ops used for cftypes that provide ->seq_start (see the selection
 * in cgroup_init_cftypes()); the full seq_file start/next/stop/show set is
 * wired up on the read side.
 */
static struct kernfs_ops cgroup_kf_ops = {
        /* lifetime */
        .open                   = cgroup_file_open,
        .release                = cgroup_file_release,
        /* read side */
        .seq_start              = cgroup_seqfile_start,
        .seq_next               = cgroup_seqfile_next,
        .seq_stop               = cgroup_seqfile_stop,
        .seq_show               = cgroup_seqfile_show,
        /* write side */
        .write                  = cgroup_file_write,
        .atomic_write_len       = PAGE_SIZE,
        /* notification */
        .poll                   = cgroup_file_poll,
};

/*
 * Timer callback for cgroup_file->notify_timer: recover the owning
 * cgroup_file from @timer and run cgroup_file_notify() on it.
 */
static void cgroup_file_notify_timer(struct timer_list *timer)
{
        struct cgroup_file *cfile;

        cfile = container_of(timer, struct cgroup_file, notify_timer);
        cgroup_file_notify(cfile);
}

/*
 * Create the kernfs file described by @cft in @cgrp's directory.
 *
 * The file is created with the current fsuid/fsgid as owner and @cft as
 * its private data.  If @cft has a non-zero file_offset, the cgroup_file
 * located at that offset inside @css gets its notify timer initialized and
 * its ->kn pointed at the new node (under cgroup_file_kn_lock).
 *
 * Returns 0 on success or the error encoded in the pointer returned by
 * __kernfs_create_file().
 */
static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
                           struct cftype *cft)
{
        char namebuf[CGROUP_FILE_NAME_MAX];
        struct lock_class_key *lockdep_key = NULL;
        struct cgroup_file *cfile;
        struct kernfs_node *node;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        lockdep_key = &cft->lockdep_key;
#endif
        node = __kernfs_create_file(cgrp->kn,
                                    cgroup_file_name(cgrp, cft, namebuf),
                                    cgroup_file_mode(cft),
                                    current_fsuid(), current_fsgid(),
                                    0, cft->kf_ops, cft,
                                    NULL, lockdep_key);
        if (IS_ERR(node))
                return PTR_ERR(node);

        /* no cgroup_file to hook up for this cftype */
        if (!cft->file_offset)
                return 0;

        cfile = (void *)css + cft->file_offset;

        timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);

        spin_lock_irq(&cgroup_file_kn_lock);
        cfile->kn = node;
        spin_unlock_irq(&cgroup_file_kn_lock);

        return 0;
}

/**
 * cgroup_addrm_files - add or remove files to a cgroup directory
 * @css: the target css
 * @cgrp: the target cgroup (usually css->cgroup)
 * @cfts: array of cftypes to be added, terminated by an empty name
 * @is_add: whether to add or remove
 *
 * Walk @cfts and, for every entry whose flags allow it on @cgrp, either
 * create or remove the corresponding file.  If creating a file fails, a
 * warning is logged, the walk is restarted in removal mode over the
 * entries preceding the failed one, and the original error is returned.
 * A pure removal pass always returns 0.
 *
 * Must be called with cgroup_mutex held.
 */
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
                              struct cgroup *cgrp, struct cftype cfts[],
                              bool is_add)
{
        struct cftype *cft, *cft_end = NULL;
        int ret = 0;

        lockdep_assert_held(&cgroup_mutex);

restart:
        for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
                /* does cft->flags tell us to skip this file on @cgrp? */
                if (((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) ||
                    ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) ||
                    ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) ||
                    ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) ||
                    ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug))
                        continue;

                if (!is_add) {
                        cgroup_rm_file(cgrp, cft);
                        continue;
                }

                ret = cgroup_add_file(css, cgrp, cft);
                if (!ret)
                        continue;

                /* roll back: remove everything added before the failed entry */
                pr_warn("%s: failed to add %s, err=%d\n",
                        __func__, cft->name, ret);
                cft_end = cft;
                is_add = false;
                goto restart;
        }

        return ret;
}

/*
 * Add or remove the files described by @cfts on every already existing
 * css of the owning subsystem that has CSS_VISIBLE set.
 *
 * The walk stops at the first cgroup_addrm_files() failure and that error
 * is returned.  After a fully successful add pass, kernfs_activate() is
 * called on the hierarchy root's kernfs node.
 *
 * Must be called with cgroup_mutex held.
 */
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
        struct cgroup_subsys *ss = cfts[0].ss;
        struct cgroup *root = &ss->root->cgrp;
        struct cgroup_subsys_state *css;
        int ret = 0;

        lockdep_assert_held(&cgroup_mutex);

        /* add/rm files for all cgroups created before */
        css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
                if (css->flags & CSS_VISIBLE) {
                        ret = cgroup_addrm_files(css, css->cgroup, cfts,
                                                 is_add);
                        if (ret)
                                break;
                }
        }

        if (ret || !is_add)
                return ret;

        kernfs_activate(root->kn);
        return 0;
}

static void cgroup_exit_cftypes(struct cftype *cfts)
{
        struct cftype *cft;

        for (cft = cfts; cft->name[0] != '\0'; cft++) {
                /* free copy for custom atomic_write_len, see init_cftypes() */
                if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
                        kfree(cft->kf_ops);
                cft->kf_ops = NULL;
                cft->ss = NULL;

                /* revert flags set by cgroup core while adding @cfts */
                cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL |
                                __CFTYPE_ADDED);
        }
}

/*
 * Prepare every entry of @cfts for use with subsystem @ss: choose the
 * kernfs_ops table, record @ss and mark the entry __CFTYPE_ADDED.
 *
 * Returns 0 on success, -EBUSY if an entry is already marked
 * __CFTYPE_ADDED, or -ENOMEM if a private kernfs_ops copy cannot be
 * allocated.  On failure cgroup_exit_cftypes() is run on the whole array.
 */
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
        struct cftype *cft;
        int ret = 0;

        for (cft = cfts; cft->name[0] != '\0'; cft++) {
                struct kernfs_ops *ops;

                WARN_ON(cft->ss || cft->kf_ops);

                if (cft->flags & __CFTYPE_ADDED) {
                        ret = -EBUSY;
                        break;
                }

                ops = cft->seq_start ? &cgroup_kf_ops : &cgroup_kf_single_ops;

                /*
                 * Ugh... if @cft wants a custom max_write_len, we need to
                 * make a copy of kf_ops to set its atomic_write_len.
                 */
                if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
                        ops = kmemdup(ops, sizeof(*ops), GFP_KERNEL);
                        if (!ops) {
                                ret = -ENOMEM;
                                break;
                        }
                        ops->atomic_write_len = cft->max_write_len;
                }

                cft->kf_ops = ops;
                cft->ss = ss;
                cft->flags |= __CFTYPE_ADDED;
        }

        if (!ret)
                return 0;

        cgroup_exit_cftypes(cfts);
        return ret;
}

/*
 * Unregister @cfts with cgroup_mutex already held: unlink the array from
 * the list it was added to, remove its files from existing cgroups and
 * then reset the per-cftype state set up by cgroup_init_cftypes().
 */
static void cgroup_rm_cftypes_locked(struct cftype *cfts)
{
        lockdep_assert_held(&cgroup_mutex);

        list_del(&cfts->node);
        /* needs cft->ss, so it must run before cgroup_exit_cftypes() clears it */
        cgroup_apply_cftypes(cfts, false);
        cgroup_exit_cftypes(cfts);
}

/**
 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Unregister @cfts.  Files described by @cfts are removed from all
 * existing cgroups and all future cgroups won't have them either.  This
 * function can be called anytime whether @cfts' subsys is attached or not.
 *
 * A %NULL or empty @cfts is a no-op.  Takes the cgroup lock internally.
 *
 * Returns 0 on successful unregistration (or when there is nothing to
 * do), -ENOENT if the first entry of @cfts is not marked as registered.
 */
int cgroup_rm_cftypes(struct cftype *cfts)
{
        int ret = 0;

        if (cfts && cfts[0].name[0] != '\0') {
                if (cfts[0].flags & __CFTYPE_ADDED) {
                        cgroup_lock();
                        cgroup_rm_cftypes_locked(cfts);
                        cgroup_unlock();
                } else {
                        ret = -ENOENT;
                }
        }

        return ret;
}

/**
 * cgroup_add_cftypes - add an array of cftypes to a subsystem
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Register @cfts to @ss.  Files described by @cfts are created for all
 * existing cgroups to which @ss is attached and all future cgroups will
 * have them too.  This function can be called anytime whether @ss is
 * attached or not.
 *
 * Nothing is done, and 0 is returned, when @ss is disabled or @cfts is
 * %NULL or empty.
 *
 * Returns 0 on successful registration, -errno on failure.  If creating
 * the files on existing cgroups fails, @cfts is unregistered again and the
 * error is returned.
 */
static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
        int err;

        if (!cgroup_ssid_enabled(ss->id) || !cfts || cfts[0].name[0] == '\0')
                return 0;

        err = cgroup_init_cftypes(ss, cfts);
        if (err)
                return err;

        cgroup_lock();

        list_add_tail(&cfts->node, &ss->cfts);

        err = cgroup_apply_cftypes(cfts, true);
        if (err) {
                /* file creation failed somewhere: back out the registration */
                cgroup_rm_cftypes_locked(cfts);
        }

        cgroup_unlock();

        return err;
}

/**
 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes (may be %NULL)
 *
 * Marks every entry of @cfts with __CFTYPE_ONLY_ON_DFL and then registers
 * the array through cgroup_add_cftypes(), so the added files are only used
 * for the default hierarchy.
 *
 * Returns the result of cgroup_add_cftypes().
 */
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
        struct cftype *cft;

        if (cfts) {
                for (cft = cfts; cft->name[0] != '\0'; cft++) {
                        cft->flags |= __CFTYPE_ONLY_ON_DFL;
                }
        }

        return cgroup_add_cftypes(ss, cfts);
}

/**
 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes (may be %NULL)
 *
 * Marks every entry of @cfts with __CFTYPE_NOT_ON_DFL and then registers
 * the array through cgroup_add_cftypes(), so the added files are only used
 * for the legacy hierarchies.
 *
 * Returns the result of cgroup_add_cftypes().
 */
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
        struct cftype *cft;

        if (cfts) {
                for (cft = cfts; cft->name[0] != '\0'; cft++) {
                        cft->flags |= __CFTYPE_NOT_ON_DFL;
                }
        }

        return cgroup_add_cftypes(ss, cfts);
}

/**
 * cgroup_file_notify - generate a file modified event for a cgroup_file
 * @cfile: target cgroup_file
 *
 * @cfile must have been obtained by setting cftype->file_offset.
 *
 * Notifications are rate limited: if the previous one was sent less than
 * CGROUP_FILE_NOTIFY_MIN_INTV ago, the notify timer is armed for the end
 * of that interval instead of notifying right away.  Nothing happens when
 * @cfile has no kernfs node attached.
 */
void cgroup_file_notify(struct cgroup_file *cfile)
{
        unsigned long irqflags;
        unsigned long prev, earliest;

        spin_lock_irqsave(&cgroup_file_kn_lock, irqflags);

        if (!cfile->kn)
                goto out_unlock;

        prev = cfile->notified_at;
        earliest = prev + CGROUP_FILE_NOTIFY_MIN_INTV;

        if (!time_in_range(jiffies, prev, earliest)) {
                /* outside the rate-limit window: notify now */
                kernfs_notify(cfile->kn);
                cfile->notified_at = jiffies;
        } else {
                /* too soon: defer to the timer */
                timer_reduce(&cfile->notify_timer, earliest);
        }

out_unlock:
        spin_unlock_irqrestore(&cgroup_file_kn_lock, irqflags);
}

/**
 * cgroup_file_show - show or hide a hidden cgroup file
 * @cfile: target cgroup_file obtained by setting cftype->file_offset
 * @show: whether to show or hide
 *
 * The kernfs node is looked up and pinned under cgroup_file_kn_lock;
 * kernfs_show() itself is called after the lock has been dropped.
 */
void cgroup_file_show(struct cgroup_file *cfile, bool show)
{
        struct kernfs_node *node;

        /* grab a reference so the node stays valid once the lock is released */
        spin_lock_irq(&cgroup_file_kn_lock);
        node = cfile->kn;
        kernfs_get(node);
        spin_unlock_irq(&cgroup_file_kn_lock);

        if (node) {
                kernfs_show(node, show);
        }

        /* NOTE(review): get/put run even for a NULL node - presumably both accept NULL */
        kernfs_put(node);
}

/**
 * css_next_child - find the next child of a given css
 * @pos: the current position (%NULL to initiate traversal)
 * @parent: css whose children to walk
 *
 * This function returns the next child of @parent and should be called
 * under either cgroup_mutex or RCU read lock.  The only requirement is
 * that @parent and @pos are accessible.  The next sibling is guaranteed to
 * be returned regardless of their states.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Return: the next child of @parent after @pos, or %NULL once the end of
 * @parent's children list has been reached.
 */
struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
                                           struct cgroup_subsys_state *parent)
{
        struct cgroup_subsys_state *next;

        cgroup_assert_mutex_or_rcu_locked();

        /*
         * @pos could already have been unlinked from the sibling list.
         * Once a cgroup is removed, its ->sibling.next is no longer
         * updated when its next sibling changes.  CSS_RELEASED is set when
         * @pos is taken off list, at which time its next pointer is valid,
         * and, as releases are serialized, the one pointed to by the next
         * pointer is guaranteed to not have started release yet.  This
         * implies that if we observe !CSS_RELEASED on @pos in this RCU
         * critical section, the one pointed to by its next pointer is
         * guaranteed to not have finished its RCU grace period even if we
         * have dropped rcu_read_lock() in-between iterations.
         *
         * If @pos has CSS_RELEASED set, its next pointer can't be
         * dereferenced; however, as each css is given a monotonically
         * increasing unique serial number and always appended to the
         * sibling list, the next one can be found by walking the parent's
         * children until the first css with higher serial number than
         * @pos's.  While this path can be slower, it happens iff iteration
         * races against release and the race window is very small.
         */
        if (!pos) {
                /* start of traversal: take the first entry of @parent's list */
                next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
        } else if (likely(!(pos->flags & CSS_RELEASED))) {
                /* common case: @pos is still linked, follow its next pointer */
                next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
        } else {
                /* @pos was released: rescan for the first css newer than @pos */
                list_for_each_entry_rcu(next, &parent->children, sibling,
                                        lockdep_is_held(&cgroup_mutex))
                        if (next->serial_nr > pos->serial_nr)
                                break;
        }

        /*
         * @next, if not pointing to the head, can be dereferenced and is
         * the next sibling.
         */
        if (&next->sibling != &parent->children)
                return next;
        return NULL;
}

/**
 * css_next_descendant_pre - find the next descendant for pre-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be used by css_for_each_descendant_pre().  Find the next descendant
 * to visit for pre-order traversal of @root's descendants.  @root is
 * included in the iteration and the first node to be visited.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @root are accessible and @pos is a descendant of @root.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Return: the next css in pre-order, or %NULL when the walk is complete.
 */
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
                        struct cgroup_subsys_state *root)
{
        struct cgroup_subsys_state *candidate;

        cgroup_assert_mutex_or_rcu_locked();

        /* if first iteration, visit @root */
        if (!pos)
                return root;

        /* descend: visit the first child if there is one */
        candidate = css_next_child(NULL, pos);
        if (candidate)
                return candidate;

        /* no child: climb until some ancestor (or @pos) has a next sibling */
        for (; pos != root; pos = pos->parent) {
                candidate = css_next_child(pos, pos->parent);
                if (candidate)
                        return candidate;
        }

        return NULL;
}
EXPORT_SYMBOL_GPL(css_next_descendant_pre);

/**
 * css_rightmost_descendant - return the rightmost descendant of a css
 * @pos: css of interest
 *
 * Return the rightmost descendant of @pos.  If there's no descendant, @pos
 * is returned.  This can be used during pre-order traversal to skip
 * subtree of @pos.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct rightmost descendant as
 * long as @pos is accessible.
 */
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
{
        struct cgroup_subsys_state *last, *tmp;

        cgroup_assert_mutex_or_rcu_locked();

        do {
                last = pos;
                /* ->prev isn't RCU safe, walk ->next till the end */
                pos = NULL;
                css_for_each_child(tmp, last)
                        pos = tmp;
        } while (pos);

        return last;
}

/*
 * Follow first-child links from @pos until a css without children is
 * reached and return it.  Returns @pos itself when it has no children.
 */
static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
{
        struct cgroup_subsys_state *first_child;

        while ((first_child = css_next_child(NULL, pos)))
                pos = first_child;

        return pos;
}

/**
 * css_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be used by css_for_each_descendant_post().  Find the next descendant
 * to visit for post-order traversal of @root's descendants.  @root is
 * included in the iteration and the last node to be visited.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @root are accessible and @pos is a descendant of
 * @root.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Return: the next css in post-order, or %NULL once @root has been visited.
 */
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
                         struct cgroup_subsys_state *root)
{
        struct cgroup_subsys_state *sibling;

        cgroup_assert_mutex_or_rcu_locked();

        /* if first iteration, visit leftmost descendant which may be @root */
        if (!pos)
                return css_leftmost_descendant(root);

        /* if we visited @root, we're done */
        if (pos == root)
                return NULL;

        /*
         * An unvisited sibling means its subtree comes next, starting at
         * its leftmost descendant; otherwise it is the parent's turn.
         */
        sibling = css_next_child(pos, pos->parent);

        return sibling ? css_leftmost_descendant(sibling) : pos->parent;
}

/**
 * css_has_online_children - does a css have online children
 * @css: the target css
 *
 * Returns %true if @css has any online children; otherwise, %false.  This
 * function can be called from any context but the caller is responsible
 * for synchronizing against on/offlining as necessary.
 *
 * The children are scanned under the RCU read lock, which is taken here.
 */
bool css_has_online_children(struct cgroup_subsys_state *css)
{
        struct cgroup_subsys_state *child;
        bool found = false;

        rcu_read_lock();
        css_for_each_child(child, css) {
                if (!(child->flags & CSS_ONLINE))
                        continue;

                found = true;
                break;
        }
        rcu_read_unlock();

        return found;
}

/*
 * Return the next css_set @it should walk, or NULL when the css_set list
 * is exhausted (in which case it->cset_pos is cleared).
 *
 * While a threaded walk is in progress (it->tcset_pos set), the entries of
 * the current threaded_csets list are returned first.  Otherwise the
 * iterator moves to the next entry of the css_set list it was started on.
 * With CSS_TASK_ITER_THREADED, that css_set is additionally pinned as
 * it->cur_dcset (dropping the previously pinned one) and its
 * threaded_csets list is set up for the following calls.
 *
 * Must be called with css_set_lock held.
 */
static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
{
        struct list_head *l;
        struct cgrp_cset_link *link;
        struct css_set *cset;

        lockdep_assert_held(&css_set_lock);

        /* find the next threaded cset */
        if (it->tcset_pos) {
                l = it->tcset_pos->next;

                if (l != it->tcset_head) {
                        it->tcset_pos = l;
                        return container_of(l, struct css_set,
                                            threaded_csets_node);
                }

                /* wrapped around to the head: this threaded list is done */
                it->tcset_pos = NULL;
        }

        /* find the next cset */
        l = it->cset_pos;
        l = l->next;
        if (l == it->cset_head) {
                it->cset_pos = NULL;
                return NULL;
        }

        /* the list node is embedded differently depending on how @it was started */
        if (it->ss) {
                cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
        } else {
                link = list_entry(l, struct cgrp_cset_link, cset_link);
                cset = link->cset;
        }

        it->cset_pos = l;

        /* initialize threaded css_set walking */
        if (it->flags & CSS_TASK_ITER_THREADED) {
                if (it->cur_dcset)
                        put_css_set_locked(it->cur_dcset);
                it->cur_dcset = cset;
                get_css_set(cset);

                it->tcset_head = &cset->threaded_csets;
                it->tcset_pos = &cset->threaded_csets;
        }

        return cset;
}

/**
 * css_task_iter_advance_css_set - advance a task iterator to the next css_set
 * @it: the iterator to advance
 *
 * Advance @it to the next css_set to walk.  css_sets whose tasks, mg_tasks
 * and dying_tasks lists are all empty are skipped.  On return,
 * @it->task_pos points at the first entry of the first non-empty list of
 * the chosen css_set, or is %NULL if no css_set with tasks is left.
 *
 * Must be called with css_set_lock held.
 */
static void css_task_iter_advance_css_set(struct css_task_iter *it)
{
        struct css_set *cset;

        lockdep_assert_held(&css_set_lock);

        /* Advance to the next non-empty css_set and find first non-empty tasks list */
        while ((cset = css_task_iter_next_css_set(it))) {
                if (!list_empty(&cset->tasks)) {
                        it->cur_tasks_head = &cset->tasks;
                        break;
                } else if (!list_empty(&cset->mg_tasks)) {
                        it->cur_tasks_head = &cset->mg_tasks;
                        break;
                } else if (!list_empty(&cset->dying_tasks)) {
                        it->cur_tasks_head = &cset->dying_tasks;
                        break;
                }
        }
        if (!cset) {
                /* ran out of css_sets: mark the iteration as finished */
                it->task_pos = NULL;
                return;
        }
        it->task_pos = it->cur_tasks_head->next;

        /*
         * We don't keep css_sets locked across iteration steps and thus
         * need to take steps to ensure that iteration can be resumed after
         * the lock is re-acquired.  Iteration is performed at two levels -
         * css_sets and tasks in them.
         *
         * Once created, a css_set never leaves its cgroup lists, so a
         * pinned css_set is guaranteed to stay put and we can resume
         * iteration afterwards.
         *
         * Tasks may leave @cset across iteration steps.  This is resolved
         * by registering each iterator with the css_set currently being
         * walked and making css_set_move_task() advance iterators whose
         * next task is leaving.
         */
        if (it->cur_cset) {
                /* unregister from and unpin the css_set walked so far */
                list_del(&it->iters_node);
                put_css_set_locked(it->cur_cset);
        }
        get_css_set(cset);
        it->cur_cset = cset;
        list_add(&it->iters_node, &cset->task_iters);
}

/*
 * If @it currently points at @task, step it to the following list entry
 * and flag the iterator with CSS_TASK_ITER_SKIPPED so that the next
 * advance knows the position has already been moved.
 *
 * Must be called with css_set_lock held.
 */
static void css_task_iter_skip(struct css_task_iter *it,
                               struct task_struct *task)
{
        lockdep_assert_held(&css_set_lock);

        if (it->task_pos != &task->cg_list)
                return;

        it->task_pos = it->task_pos->next;
        it->flags |= CSS_TASK_ITER_SKIPPED;
}

/*
 * Move @it to the next task that should be reported, leaving it->task_pos
 * NULL when there is none.
 *
 * Within a css_set the tasks, mg_tasks and dying_tasks lists are walked in
 * that order; once all three are consumed the iterator moves on to the
 * next css_set.  Candidates are then filtered: with CSS_TASK_ITER_PROCS,
 * tasks that are not thread group leaders, and leaders on dying_tasks
 * whose signal->live count is zero, are skipped; without it, everything on
 * dying_tasks is skipped.
 *
 * Must be called with css_set_lock held.
 */
static void css_task_iter_advance(struct css_task_iter *it)
{
        struct task_struct *task;

        lockdep_assert_held(&css_set_lock);
repeat:
        if (it->task_pos) {
                /*
                 * Advance iterator to find next entry. We go through cset
                 * tasks, mg_tasks and dying_tasks, when consumed we move onto
                 * the next cset.
                 */
                /* a pending skip already moved task_pos; just consume the flag */
                if (it->flags & CSS_TASK_ITER_SKIPPED)
                        it->flags &= ~CSS_TASK_ITER_SKIPPED;
                else
                        it->task_pos = it->task_pos->next;

                /* reaching a list head means that list is done: chain to the next */
                if (it->task_pos == &it->cur_cset->tasks) {
                        it->cur_tasks_head = &it->cur_cset->mg_tasks;
                        it->task_pos = it->cur_tasks_head->next;
                }
                if (it->task_pos == &it->cur_cset->mg_tasks) {
                        it->cur_tasks_head = &it->cur_cset->dying_tasks;
                        it->task_pos = it->cur_tasks_head->next;
                }
                if (it->task_pos == &it->cur_cset->dying_tasks)
                        css_task_iter_advance_css_set(it);
        } else {
                /* called from start, proceed to the first cset */
                css_task_iter_advance_css_set(it);
        }

        if (!it->task_pos)
                return;

        task = list_entry(it->task_pos, struct task_struct, cg_list);

        if (it->flags & CSS_TASK_ITER_PROCS) {
                /* if PROCS, skip over tasks which aren't group leaders */
                if (!thread_group_leader(task))
                        goto repeat;

                /* and dying leaders w/o live member threads */
                if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
                    !atomic_read(&task->signal->live))
                        goto repeat;
        } else {
                /* skip all dying ones */
                if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
                        goto repeat;
        }
}

/**
 * css_task_iter_start - initiate task iteration
 * @css: the css to walk tasks of
 * @flags: CSS_TASK_ITER_* flags
 * @it: the task iterator to use
 *
 * Initiate iteration through the tasks of @css.  The caller can call
 * css_task_iter_next() to walk through the tasks until the function
 * returns NULL.  On completion of iteration, css_task_iter_end() must be
 * called.
 *
 * @it is zeroed and then positioned on the first task under css_set_lock.
 */
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                         struct css_task_iter *it)
{
        unsigned long irq_state;

        memset(it, 0, sizeof(*it));

        spin_lock_irqsave(&css_set_lock, irq_state);

        it->flags = flags;
        it->ss = css->ss;

        /* walk the cgroup's cset links unless a per-subsystem list applies */
        if (!CGROUP_HAS_SUBSYS_CONFIG || !it->ss)
                it->cset_pos = &css->cgroup->cset_links;
        else
                it->cset_pos = &css->cgroup->e_csets[css->ss->id];

        it->cset_head = it->cset_pos;

        css_task_iter_advance(it);

        spin_unlock_irqrestore(&css_set_lock, irq_state);
}

/**
 * css_task_iter_next - return the next task for the iterator
 * @it: the task iterator being iterated
 *
 * The "next" function for task iteration.  @it should have been
 * initialized via css_task_iter_start().  Returns NULL when the iteration
 * reaches the end.
 *
 * The returned task is pinned with a task_struct reference that is held by
 * @it and dropped on the next call to this function or in
 * css_task_iter_end().
 */
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
        unsigned long irqflags;

        /* release the task handed out by the previous call */
        if (it->cur_task) {
                put_task_struct(it->cur_task);
                it->cur_task = NULL;
        }

        spin_lock_irqsave(&css_set_lock, irqflags);

        /* @it may be half-advanced by skips, finish advancing */
        if (it->flags & CSS_TASK_ITER_SKIPPED)
                css_task_iter_advance(it);

        if (it->task_pos) {
                it->cur_task = list_entry(it->task_pos, struct task_struct,
                                          cg_list);
                /* pin the task before the lock is dropped */
                get_task_struct(it->cur_task);
                /* pre-position @it on the task for the following call */
                css_task_iter_advance(it);
        }

        spin_unlock_irqrestore(&css_set_lock, irqflags);

        return it->cur_task;
}

/**
 * css_task_iter_end - finish task iteration
 * @it: the task iterator to finish
 *
 * Finish task iteration started by css_task_iter_start().
 */
void css_task_iter_end(struct css_task_iter *it)
{
        unsigned long irqflags;

        /*
         * If the iterator is still positioned on a css_set, unlink it via
         * its iters_node and drop the cur_cset reference.  Both happen
         * under css_set_lock, hence the _locked put variant.
         */
        if (it->cur_cset) {
                spin_lock_irqsave(&css_set_lock, irqflags);
                list_del(&it->iters_node);
                put_css_set_locked(it->cur_cset);
                spin_unlock_irqrestore(&css_set_lock, irqflags);
        }

        /* cur_dcset is put outside the lock with the unlocked variant */
        if (it->cur_dcset)
                put_css_set(it->cur_dcset);

        /* drop the ref taken in css_task_iter_next() on the last returned task */
        if (it->cur_task)
                put_task_struct(it->cur_task);
}

/* ->release() for cgroup.procs / cgroup.threads: end the task iteration, if one was started */
static void cgroup_procs_release(struct kernfs_open_file *of)
{
        struct cgroup_file_ctx *file_ctx = of->priv;

        /* the file was never read, so there is no iterator to tear down */
        if (!file_ctx->procs.started)
                return;

        css_task_iter_end(&file_ctx->procs.iter);
}

/* ->seq_next(): bump *@pos when given and return the next task from the iterator */
static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
        struct kernfs_open_file *open_file = s->private;
        struct cgroup_file_ctx *file_ctx = open_file->priv;

        /* @pos is optional; __cgroup_procs_start() calls us with NULL */
        if (pos != NULL)
                ++*pos;

        return css_task_iter_next(&file_ctx->procs.iter);
}

/*
 * Common ->seq_start() for cgroup.procs and cgroup.threads.  Starts (or
 * restarts) the task iterator with @iter_flags and returns the first task,
 * or returns the current task when resuming at a non-zero position.
 */
static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
                                  unsigned int iter_flags)
{
        struct kernfs_open_file *of = s->private;
        struct cgroup_file_ctx *ctx = of->priv;
        struct css_task_iter *it = &ctx->procs.iter;
        struct cgroup *cgrp = seq_css(s)->cgroup;

        /*
         * A seeked seq_file is always re-traversed sequentially from
         * position 0, so a non-zero *pos on a started iterator just means
         * "carry on from where we were".
         */
        if (ctx->procs.started) {
                if (*pos)
                        return it->cur_task;

                /* rewind: tear the old iteration down and begin anew */
                css_task_iter_end(it);
                css_task_iter_start(&cgrp->self, iter_flags, it);
        } else {
                /* the very first start must be at position 0 */
                if (WARN_ON_ONCE((*pos)))
                        return ERR_PTR(-EINVAL);

                css_task_iter_start(&cgrp->self, iter_flags, it);
                ctx->procs.started = true;
        }

        return cgroup_procs_next(s, NULL, NULL);
}

/* ->seq_start() for cgroup.procs */
static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
{
        const unsigned int iter_flags = CSS_TASK_ITER_PROCS |
                                        CSS_TASK_ITER_THREADED;

        /*
         * Every process of a threaded subtree belongs to the subtree's
         * domain cgroup; only threads can be spread across the subtree.
         * cgroup.procs inside the subtree proper would always be empty,
         * so reject the read outright.
         */
        if (cgroup_is_threaded(seq_css(s)->cgroup))
                return ERR_PTR(-EOPNOTSUPP);

        return __cgroup_procs_start(s, pos, iter_flags);
}

/* ->seq_show(): print one line with the task's pid as returned by task_pid_vnr() */
static int cgroup_procs_show(struct seq_file *s, void *v)
{
        /* @v is the task handed back by the seq_start/seq_next callbacks */
        struct task_struct *task = v;

        seq_printf(s, "%d\n", task_pid_vnr(task));

        return 0;
}

static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
{
        int ret;
        struct inode *inode;

        lockdep_assert_held(&cgroup_mutex);

        inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
        if (!inode)
                return -ENOMEM;

        ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE);
        iput(inode);
        return ret;
}

/*
 * Check whether a migration from @src_cgrp to @dst_cgrp is permitted:
 * the common ancestor's cgroup.procs must be writable and, with
 * namespace delegation enabled, both cgroups must lie inside @ns.
 * Returns 0 when allowed, -errno otherwise.
 */
static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
                                         struct cgroup *dst_cgrp,
                                         struct super_block *sb,
                                         struct cgroup_namespace *ns)
{
        struct cgroup *ancestor;
        int err;

        lockdep_assert_held(&cgroup_mutex);

        /* climb from @src_cgrp until reaching a cgroup that also holds @dst_cgrp */
        for (ancestor = src_cgrp;
             !cgroup_is_descendant(dst_cgrp, ancestor);
             ancestor = cgroup_parent(ancestor))
                ;

        /* %current should be authorized to migrate to the common ancestor */
        err = cgroup_may_write(ancestor, sb);
        if (err)
                return err;

        /* namespaces only act as delegation boundaries when asked to */
        if (!(cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE))
                return 0;

        /* %current must see both ends of the migration from its namespace */
        if (cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) &&
            cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp))
                return 0;

        return -ENOENT;
}

/*
 * Full permission check for attaching to @dst_cgrp: write permission,
 * destination vetting and, for single-thread migration, the requirement
 * that source and destination share the same domain cgroup.
 */
static int cgroup_attach_permissions(struct cgroup *src_cgrp,
                                     struct cgroup *dst_cgrp,
                                     struct super_block *sb, bool threadgroup,
                                     struct cgroup_namespace *ns)
{
        int err;

        err = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
        if (err)
                return err;

        err = cgroup_migrate_vet_dst(dst_cgrp);
        if (err)
                return err;

        /* whole-process moves may cross domains; lone threads may not */
        if (threadgroup || src_cgrp->dom_cgrp == dst_cgrp->dom_cgrp)
                return 0;

        return -EOPNOTSUPP;
}

/*
 * Common write handler behind cgroup.procs (@threadgroup == true) and
 * cgroup.threads (@threadgroup == false).  Attaches the task that
 * cgroup_procs_write_start() resolves from @buf to the cgroup owning @of.
 *
 * Returns 0 on success, -ENODEV if the destination cgroup can't be locked
 * live, or the -errno from lookup, the permission check or the attach.
 * The callers translate 0 into the number of bytes written.
 */
static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
                                    bool threadgroup)
{
        struct cgroup_file_ctx *ctx = of->priv;
        struct cgroup *src_cgrp, *dst_cgrp;
        struct task_struct *task;
        const struct cred *saved_cred;
        ssize_t ret;
        bool threadgroup_locked;

        /* paired with cgroup_kn_unlock() at out_unlock */
        dst_cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!dst_cgrp)
                return -ENODEV;

        /* paired with cgroup_procs_write_finish() at out_finish */
        task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
        ret = PTR_ERR_OR_ZERO(task);
        if (ret)
                goto out_unlock;

        /* find the source cgroup */
        spin_lock_irq(&css_set_lock);
        src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
        spin_unlock_irq(&css_set_lock);

        /*
         * Process and thread migrations follow same delegation rule. Check
         * permissions using the credentials from file open to protect against
         * inherited fd attacks.
         */
        saved_cred = override_creds(of->file->f_cred);
        ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
                                        of->file->f_path.dentry->d_sb,
                                        threadgroup, ctx->ns);
        /* restore the caller's creds before acting on the result */
        revert_creds(saved_cred);
        if (ret)
                goto out_finish;

        ret = cgroup_attach_task(dst_cgrp, task, threadgroup);

out_finish:
        cgroup_procs_write_finish(task, threadgroup_locked);
out_unlock:
        cgroup_kn_unlock(of->kn);

        return ret;
}

/* ->write() for cgroup.procs: migrate a whole process (threadgroup == true) */
static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
                                  char *buf, size_t nbytes, loff_t off)
{
        ssize_t err = __cgroup_procs_write(of, buf, true);

        /* on success report the entire write as consumed */
        return err ? err : nbytes;
}

/* ->seq_start() for cgroup.threads */
static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
{
        /* unlike cgroup.procs, no CSS_TASK_ITER_* flags are requested */
        const unsigned int iter_flags = 0;

        return __cgroup_procs_start(s, pos, iter_flags);
}

/* ->write() for cgroup.threads: migrate a single thread (threadgroup == false) */
static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off)
{
        ssize_t err = __cgroup_procs_write(of, buf, false);

        /* on success report the entire write as consumed */
        return err ? err : nbytes;
}

/*
 * cgroup core interface files for the default hierarchy.
 *
 * Each entry names one file and wires up its callbacks.  Entries flagged
 * CFTYPE_NOT_ON_ROOT carry that flag to exclude the root cgroup; entries
 * with .file_offset name a member of struct cgroup associated with the
 * file.  The array is terminated by an empty entry.
 */
static struct cftype cgroup_base_files[] = {
        {
                .name = "cgroup.type",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = cgroup_type_show,
                .write = cgroup_type_write,
        },
        {
                /* shares next/show/release with cgroup.threads below */
                .name = "cgroup.procs",
                .flags = CFTYPE_NS_DELEGATABLE,
                .file_offset = offsetof(struct cgroup, procs_file),
                .release = cgroup_procs_release,
                .seq_start = cgroup_procs_start,
                .seq_next = cgroup_procs_next,
                .seq_show = cgroup_procs_show,
                .write = cgroup_procs_write,
        },
        {
                /* differs from cgroup.procs only in seq_start and write */
                .name = "cgroup.threads",
                .flags = CFTYPE_NS_DELEGATABLE,
                .release = cgroup_procs_release,
                .seq_start = cgroup_threads_start,
                .seq_next = cgroup_procs_next,
                .seq_show = cgroup_procs_show,
                .write = cgroup_threads_write,
        },
        {
                /* read-only: no .write handler */
                .name = "cgroup.controllers",
                .seq_show = cgroup_controllers_show,
        },
        {
                .name = "cgroup.subtree_control",
                .flags = CFTYPE_NS_DELEGATABLE,
                .seq_show = cgroup_subtree_control_show,
                .write = cgroup_subtree_control_write,
        },
        {
                /* read-only: no .write handler */
                .name = "cgroup.events",
                .flags = CFTYPE_NOT_ON_ROOT,
                .file_offset = offsetof(struct cgroup, events_file),
                .seq_show = cgroup_events_show,
        },
        {
                .name = "cgroup.max.descendants",
                .seq_show = cgroup_max_descendants_show,
                .write = cgroup_max_descendants_write,
        },
        {
                .name = "cgroup.max.depth",
                .seq_show = cgroup_max_depth_show,
                .write = cgroup_max_depth_write,
        },
        {
                /* read-only: no .write handler */
                .name = "cgroup.stat",
                .seq_show = cgroup_stat_show,
        },
        {
                .name = "cgroup.freeze",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = cgroup_freeze_show,
                .write = cgroup_freeze_write,
        },
        {
                /* write-only: no .seq_show handler */
                .name = "cgroup.kill",
                .flags = CFTYPE_NOT_ON_ROOT,
                .write = cgroup_kill_write,
        },
        {
                /* read-only: no .write handler */
                .name = "cpu.stat",
                .seq_show = cpu_stat_show,
        },
        {
                /* read-only: no .write handler */
                .name = "cpu.stat.local",
                .seq_show = cpu_local_stat_show,
        },
        { }        /* terminate */
};

/*
 * Pressure interface files.  All entries are compiled in only with
 * CONFIG_PSI; "irq.pressure" additionally requires
 * CONFIG_IRQ_TIME_ACCOUNTING.  Without CONFIG_PSI the array contains just
 * the terminating empty entry.
 *
 * The per-resource *.pressure files share the same poll and release
 * callbacks and each points .file_offset at its slot in
 * cgroup->psi_files[]; "cgroup.pressure" has only show and write handlers.
 */
static struct cftype cgroup_psi_files[] = {
#ifdef CONFIG_PSI
        {
                .name = "io.pressure",
                .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
                .seq_show = cgroup_io_pressure_show,
                .write = cgroup_io_pressure_write,
                .poll = cgroup_pressure_poll,
                .release = cgroup_pressure_release,
        },
        {
                .name = "memory.pressure",
                .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
                .seq_show = cgroup_memory_pressure_show,
                .write = cgroup_memory_pressure_write,
                .poll = cgroup_pressure_poll,
                .release = cgroup_pressure_release,
        },
        {
                .name = "cpu.pressure",
                .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
                .seq_show = cgroup_cpu_pressure_show,
                .write = cgroup_cpu_pressure_write,
                .poll = cgroup_pressure_poll,
                .release = cgroup_pressure_release,
        },
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
        {
                .name = "irq.pressure",
                .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
                .seq_show = cgroup_irq_pressure_show,
                .write = cgroup_irq_pressure_write,
                .poll = cgroup_pressure_poll,
                .release = cgroup_pressure_release,
        },
#endif
        {
                .name = "cgroup.pressure",
                .seq_show = cgroup_pressure_show,
                .write = cgroup_pressure_write,
        },
#endif /* CONFIG_PSI */
        { }        /* terminate */
};

/*
 * css destruction is a four-stage process.
 *
 * 1. Destruction starts.  Killing of the percpu_ref is initiated.
 *    Implemented in kill_css().
 *
 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
 *    and thus css_tryget_online() is guaranteed to fail, the css can be
 *    offlined by invoking offline_css().  After offlining, the base ref is
 *    put.  Implemented in css_killed_work_fn().
 *
 * 3. When the percpu_ref reaches zero, the only possible remaining
 *    accessors are inside RCU read sections.  css_release() schedules the
 *    RCU callback.
 *
 * 4. After the grace period, the css can be freed.  Implemented in
 *    css_free_rwork_fn().
 *
 * It is actually hairier because both step 2 and 4 require process context
 * and thus involve punting to css->destroy_work adding two additional
 * steps to the already complex sequence.
 */
/*
 * Final stage of css destruction, run from an rcu_work item.  Handles two
 * cases: a subsystem css (css->ss set) and a cgroup's own self css
 * (css->ss NULL), in which case the cgroup itself is torn down.
 */
static void css_free_rwork_fn(struct work_struct *work)
{
        struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
                                struct cgroup_subsys_state, destroy_rwork);
        struct cgroup_subsys *ss = css->ss;
        struct cgroup *cgrp = css->cgroup;

        percpu_ref_exit(&css->refcnt);

        if (ss) {
                /* css free path */
                /*
                 * parent and id are read before ->css_free() is invoked;
                 * presumably @css must not be touched afterwards - confirm.
                 */
                struct cgroup_subsys_state *parent = css->parent;
                int id = css->id;

                ss->css_free(css);
                cgroup_idr_remove(&ss->css_idr, id);
                /* pairs with cgroup_get_live() in init_and_link_css() */
                cgroup_put(cgrp);

                /* pairs with css_get(css->parent) in init_and_link_css() */
                if (parent)
                        css_put(parent);
        } else {
                /* cgroup free path */
                atomic_dec(&cgrp->root->nr_cgrps);
                if (!cgroup_on_dfl(cgrp))
                        cgroup1_pidlist_destroy_all(cgrp);
                cancel_work_sync(&cgrp->release_agent_work);
                bpf_cgrp_storage_free(cgrp);

                if (cgroup_parent(cgrp)) {
                        /*
                         * We get a ref to the parent, and put the ref when
                         * this cgroup is being freed, so it's guaranteed
                         * that the parent won't be destroyed before its
                         * children.
                         */
                        cgroup_put(cgroup_parent(cgrp));
                        /* pairs with the extra kernfs_get() in cgroup_mkdir() */
                        kernfs_put(cgrp->kn);
                        psi_cgroup_free(cgrp);
                        cgroup_rstat_exit(cgrp);
                        kfree(cgrp);
                } else {
                        /*
                         * This is root cgroup's refcnt reaching zero,
                         * which indicates that the root should be
                         * released.
                         */
                        cgroup_destroy_root(cgrp->root);
                }
        }
}

/*
 * Work item queued by css_release() once the css refcnt has reached zero.
 * Marks @css released and unlinks it under cgroup_mutex, then queues
 * css_free_rwork_fn() as rcu_work to do the actual freeing.
 */
static void css_release_work_fn(struct work_struct *work)
{
        struct cgroup_subsys_state *css =
                container_of(work, struct cgroup_subsys_state, destroy_work);
        struct cgroup_subsys *ss = css->ss;
        struct cgroup *cgrp = css->cgroup;

        cgroup_lock();

        css->flags |= CSS_RELEASED;
        /* unlink from the parent's ->children list */
        list_del_rcu(&css->sibling);

        if (ss) {
                /* css release path */
                /* flush once more before dropping off the cgroup's rstat list */
                if (!list_empty(&css->rstat_css_node)) {
                        cgroup_rstat_flush(cgrp);
                        list_del_rcu(&css->rstat_css_node);
                }

                /* keep the id allocated but stop it from resolving to @css */
                cgroup_idr_replace(&ss->css_idr, NULL, css->id);
                if (ss->css_released)
                        ss->css_released(css);
        } else {
                struct cgroup *tcgrp;

                /* cgroup release path */
                TRACE_CGROUP_PATH(release, cgrp);

                cgroup_rstat_flush(cgrp);

                /* undo the nr_dying_descendants++ done in cgroup_destroy_locked() */
                spin_lock_irq(&css_set_lock);
                for (tcgrp = cgroup_parent(cgrp); tcgrp;
                     tcgrp = cgroup_parent(tcgrp))
                        tcgrp->nr_dying_descendants--;
                spin_unlock_irq(&css_set_lock);

                /*
                 * There are two control paths which try to determine
                 * cgroup from dentry without going through kernfs -
                 * cgroupstats_build() and css_tryget_online_from_dir().
                 * Those are supported by RCU protecting clearing of
                 * cgrp->kn->priv backpointer.
                 */
                if (cgrp->kn)
                        RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
                                         NULL);
        }

        cgroup_unlock();

        /* hand over to the free stage */
        INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
        queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
}

/*
 * percpu_ref release callback for css's (installed via percpu_ref_init()
 * in css_create() and cgroup_create()).  Only bounces the real work to
 * css_release_work_fn() on cgroup_destroy_wq.
 */
static void css_release(struct percpu_ref *ref)
{
        struct cgroup_subsys_state *css =
                container_of(ref, struct cgroup_subsys_state, refcnt);

        INIT_WORK(&css->destroy_work, css_release_work_fn);
        queue_work(cgroup_destroy_wq, &css->destroy_work);
}

/*
 * Initialize a freshly allocated @css for the @cgrp - @ss pair and take
 * the references it holds: one on @cgrp and, when @cgrp has a parent, one
 * on the parent's css.  The refcnt and id are not set up here.
 */
static void init_and_link_css(struct cgroup_subsys_state *css,
                              struct cgroup_subsys *ss, struct cgroup *cgrp)
{
        lockdep_assert_held(&cgroup_mutex);

        /* dropped by cgroup_put() in css_free_rwork_fn() */
        cgroup_get_live(cgrp);

        memset(css, 0, sizeof(*css));
        css->cgroup = cgrp;
        css->ss = ss;
        /* no id yet; callers assign one afterwards */
        css->id = -1;
        INIT_LIST_HEAD(&css->sibling);
        INIT_LIST_HEAD(&css->children);
        INIT_LIST_HEAD(&css->rstat_css_node);
        css->serial_nr = css_serial_nr_next++;
        atomic_set(&css->online_cnt, 0);

        if (cgroup_parent(cgrp)) {
                css->parent = cgroup_css(cgroup_parent(cgrp), ss);
                /* dropped by css_put(parent) in css_free_rwork_fn() */
                css_get(css->parent);
        }

        /* only subsystems with a flush callback go on the rstat css list */
        if (ss->css_rstat_flush)
                list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);

        /* cgroup_css(cgrp, ss) must still be NULL at this point */
        BUG_ON(cgroup_css(cgrp, ss));
}

/* invoke ->css_online() on a new CSS and mark it online if successful */
static int online_css(struct cgroup_subsys_state *css)
{
        struct cgroup_subsys *ss = css->ss;
        int err = 0;

        lockdep_assert_held(&cgroup_mutex);

        /* the subsystem callback is optional and may veto going online */
        if (ss->css_online) {
                err = ss->css_online(css);
                if (err)
                        return err;
        }

        /* mark online and publish @css in its cgroup's subsys[] slot */
        css->flags |= CSS_ONLINE;
        rcu_assign_pointer(css->cgroup->subsys[ss->id], css);

        /* count @css itself and account it as an online child of its parent */
        atomic_inc(&css->online_cnt);
        if (css->parent)
                atomic_inc(&css->parent->online_cnt);

        return 0;
}

/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
static void offline_css(struct cgroup_subsys_state *css)
{
        struct cgroup_subsys *ss = css->ss;

        lockdep_assert_held(&cgroup_mutex);

        if (!(css->flags & CSS_ONLINE))
                return;

        if (ss->css_offline)
                ss->css_offline(css);

        css->flags &= ~CSS_ONLINE;
        RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);

        wake_up_all(&css->cgroup->offline_waitq);
}

/**
 * css_create - create a cgroup_subsys_state
 * @cgrp: the cgroup new css will be associated with
 * @ss: the subsys of new css
 *
 * Create a new css associated with @cgrp - @ss pair.  On success, the new
 * css is online and installed in @cgrp.  This function doesn't create the
 * interface files.
 *
 * Must be called with cgroup_mutex held.
 *
 * Returns the new css on success, or an ERR_PTR() on failure: the error
 * from ->css_alloc() (a NULL return is treated as -ENOMEM), or the -errno
 * from refcnt init, id allocation or ->css_online().
 */
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
                                              struct cgroup_subsys *ss)
{
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
        struct cgroup_subsys_state *css;
        int err;

        lockdep_assert_held(&cgroup_mutex);

        css = ss->css_alloc(parent_css);
        if (!css)
                css = ERR_PTR(-ENOMEM);
        if (IS_ERR(css))
                return css;

        /* from here on, failures must go through the err_* unwind below */
        init_and_link_css(css, ss, cgrp);

        err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
        if (err)
                goto err_free_css;

        /* reserve the id with a NULL pointer first; @css is installed below */
        err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
        if (err < 0)
                goto err_free_css;
        css->id = err;

        /* @css is ready to be brought online now, make it visible */
        list_add_tail_rcu(&css->sibling, &parent_css->children);
        cgroup_idr_replace(&ss->css_idr, css, css->id);

        err = online_css(css);
        if (err)
                goto err_list_del;

        return css;

err_list_del:
        /* only reached after @css was linked into the parent's children */
        list_del_rcu(&css->sibling);
err_free_css:
        /* undo the rstat linkage and free through the regular rcu_work path */
        list_del_rcu(&css->rstat_css_node);
        INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
        queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
        return ERR_PTR(err);
}

/*
 * The returned cgroup is fully initialized including its control mask, but
 * it doesn't have the control mask applied.
 */
/*
 * Allocate and set up a new child cgroup named @name under @parent and
 * link it into the hierarchy.  Returns the new cgroup, or an ERR_PTR() if
 * any allocation / setup step fails (everything done so far is unwound).
 */
static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
                                    umode_t mode)
{
        struct cgroup_root *root = parent->root;
        struct cgroup *cgrp, *tcgrp;
        struct kernfs_node *kn;
        int level = parent->level + 1;
        int ret;

        /*
         * Allocate the cgroup with room for level + 1 entries in its
         * trailing ancestors[] array (indices 0..level, filled in below).
         */
        cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
        if (!cgrp)
                return ERR_PTR(-ENOMEM);

        ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
        if (ret)
                goto out_free_cgrp;

        ret = cgroup_rstat_init(cgrp);
        if (ret)
                goto out_cancel_ref;

        /* create the directory */
        kn = kernfs_create_dir_ns(parent->kn, name, mode,
                                  current_fsuid(), current_fsgid(),
                                  cgrp, NULL);
        if (IS_ERR(kn)) {
                ret = PTR_ERR(kn);
                goto out_stat_exit;
        }
        cgrp->kn = kn;

        init_cgroup_housekeeping(cgrp);

        cgrp->self.parent = &parent->self;
        cgrp->root = root;
        cgrp->level = level;

        ret = psi_cgroup_alloc(cgrp);
        if (ret)
                goto out_kernfs_remove;

        /* last step that can fail; nothing below needs unwinding */
        ret = cgroup_bpf_inherit(cgrp);
        if (ret)
                goto out_psi_free;

        /*
         * New cgroup inherits effective freeze counter, and
         * if the parent has to be frozen, the child has too.
         */
        cgrp->freezer.e_freeze = parent->freezer.e_freeze;
        if (cgrp->freezer.e_freeze) {
                /*
                 * Set the CGRP_FREEZE flag, so when a process will be
                 * attached to the child cgroup, it will become frozen.
                 * At this point the new cgroup is unpopulated, so we can
                 * consider it frozen immediately.
                 */
                set_bit(CGRP_FREEZE, &cgrp->flags);
                set_bit(CGRP_FROZEN, &cgrp->flags);
        }

        /*
         * Walk from the new cgroup up to the root: record every cgroup on
         * the way (including @cgrp itself) in ancestors[] by level and
         * bump the descendant counters of the real ancestors.
         */
        spin_lock_irq(&css_set_lock);
        for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
                cgrp->ancestors[tcgrp->level] = tcgrp;

                if (tcgrp != cgrp) {
                        tcgrp->nr_descendants++;

                        /*
                         * If the new cgroup is frozen, all ancestor cgroups
                         * get a new frozen descendant, but their state can't
                         * change because of this.
                         */
                        if (cgrp->freezer.e_freeze)
                                tcgrp->freezer.nr_frozen_descendants++;
                }
        }
        spin_unlock_irq(&css_set_lock);

        /* inherit the notify_on_release and clone_children settings */
        if (notify_on_release(parent))
                set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

        if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
                set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);

        cgrp->self.serial_nr = css_serial_nr_next++;

        /* allocation complete, commit to creation */
        list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
        atomic_inc(&root->nr_cgrps);
        /* dropped by cgroup_put(cgroup_parent(cgrp)) in css_free_rwork_fn() */
        cgroup_get_live(parent);

        /*
         * On the default hierarchy, a child doesn't automatically inherit
         * subtree_control from the parent.  Each is configured manually.
         */
        if (!cgroup_on_dfl(cgrp))
                cgrp->subtree_control = cgroup_control(cgrp);

        cgroup_propagate_control(cgrp);

        return cgrp;

        /* error unwind: each label undoes one setup step, in reverse order */
out_psi_free:
        psi_cgroup_free(cgrp);
out_kernfs_remove:
        kernfs_remove(cgrp->kn);
out_stat_exit:
        cgroup_rstat_exit(cgrp);
out_cancel_ref:
        percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
        kfree(cgrp);
        return ERR_PTR(ret);
}

/*
 * Check whether one more child may be created under @parent without
 * violating the max_descendants / max_depth limit of @parent or of any
 * of its ancestors.  Returns true when creation is allowed.
 */
static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
{
        struct cgroup *pos;
        int depth = 1;

        lockdep_assert_held(&cgroup_mutex);

        /* @depth is how deep the new child would sit below @pos */
        for (pos = parent; pos; pos = cgroup_parent(pos), depth++) {
                if (pos->nr_descendants >= pos->max_descendants)
                        return false;

                if (depth > pos->max_depth)
                        return false;
        }

        return true;
}

/*
 * kernfs ->mkdir() handler: create a child cgroup @name under the cgroup
 * behind @parent_kn.
 *
 * Returns 0 on success, -EINVAL if @name contains a newline, -ENODEV if
 * the parent can't be locked live, -EAGAIN if a hierarchy limit would be
 * exceeded, or the -errno from creating / populating the new cgroup.
 */
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
        struct cgroup *parent, *cgrp;
        int ret;

        /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
        if (strchr(name, '\n'))
                return -EINVAL;

        /* paired with cgroup_kn_unlock() at out_unlock */
        parent = cgroup_kn_lock_live(parent_kn, false);
        if (!parent)
                return -ENODEV;

        if (!cgroup_check_hierarchy_limits(parent)) {
                ret = -EAGAIN;
                goto out_unlock;
        }

        cgrp = cgroup_create(parent, name, mode);
        if (IS_ERR(cgrp)) {
                ret = PTR_ERR(cgrp);
                goto out_unlock;
        }

        /*
         * This extra ref will be put in css_free_rwork_fn() and guarantees
         * that @cgrp->kn is always accessible.
         */
        kernfs_get(cgrp->kn);

        ret = css_populate_dir(&cgrp->self);
        if (ret)
                goto out_destroy;

        /* presumably where css's get created and onlined - TODO confirm */
        ret = cgroup_apply_control_enable(cgrp);
        if (ret)
                goto out_destroy;

        TRACE_CGROUP_PATH(mkdir, cgrp);

        /* everything is set up, activate the new directory's kernfs node */
        kernfs_activate(cgrp->kn);

        ret = 0;
        goto out_unlock;

out_destroy:
        /* the cgroup is already linked, so take the regular destroy path */
        cgroup_destroy_locked(cgrp);
out_unlock:
        cgroup_kn_unlock(parent_kn);
        return ret;
}

/*
 * This is called when the refcnt of a css is confirmed to be killed.
 * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
 * initiate destruction and put the css ref from kill_css().
 */
static void css_killed_work_fn(struct work_struct *work)
{
        struct cgroup_subsys_state *css =
                container_of(work, struct cgroup_subsys_state, destroy_work);

        cgroup_lock();

        /*
         * Offline @css, then keep climbing: each parent whose online_cnt
         * drops to zero because of this is offlined in turn.
         */
        for (;;) {
                offline_css(css);
                css_put(css);

                /* @css can't go away while we're holding cgroup_mutex */
                css = css->parent;

                if (!css || !atomic_dec_and_test(&css->online_cnt))
                        break;
        }

        cgroup_unlock();
}

/* css kill confirmation processing requires process context, bounce */
static void css_killed_ref_fn(struct percpu_ref *ref)
{
        struct cgroup_subsys_state *css =
                container_of(ref, struct cgroup_subsys_state, refcnt);

        /* online_cnt has not reached zero yet, so nothing to offline now */
        if (!atomic_dec_and_test(&css->online_cnt))
                return;

        /* online_cnt hit zero: run the offlining from the destroy workqueue */
        INIT_WORK(&css->destroy_work, css_killed_work_fn);
        queue_work(cgroup_destroy_wq, &css->destroy_work);
}

/**
 * kill_css - destroy a css
 * @css: css to destroy
 *
 * This function initiates destruction of @css by removing cgroup interface
 * files and putting its base reference.  ->css_offline() will be invoked
 * asynchronously once css_tryget_online() is guaranteed to fail and when
 * the reference count reaches zero, @css will be released.
 */
static void kill_css(struct cgroup_subsys_state *css)
{
        lockdep_assert_held(&cgroup_mutex);

        /* killing is one-shot: a css already marked dying is left alone */
        if (css->flags & CSS_DYING)
                return;

        css->flags |= CSS_DYING;

        /*
         * This must happen before css is disassociated with its cgroup.
         * See seq_css() for details.
         */
        css_clear_dir(css);

        /*
         * Killing would put the base ref, but we need to keep it alive
         * until after ->css_offline().  The matching css_put() is in
         * css_killed_work_fn().
         */
        css_get(css);

        /*
         * cgroup core guarantees that, by the time ->css_offline() is
         * invoked, no new css reference will be given out via
         * css_tryget_online().  We can't simply call percpu_ref_kill() and
         * proceed to offlining css's because percpu_ref_kill() doesn't
         * guarantee that the ref is seen as killed on all CPUs on return.
         *
         * Use percpu_ref_kill_and_confirm() to get notifications as each
         * css is confirmed to be seen as killed on all CPUs.
         */
        percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}

/**
 * cgroup_destroy_locked - the first stage of cgroup destruction
 * @cgrp: cgroup to be destroyed
 *
 * css's make use of percpu refcnts whose killing latency shouldn't be
 * exposed to userland and are RCU protected.  Also, cgroup core needs to
 * guarantee that css_tryget_online() won't succeed by the time
 * ->css_offline() is invoked.  To satisfy all the requirements,
 * destruction is implemented in the following two steps.
 *
 * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
 *     userland visible parts and start killing the percpu refcnts of
 *     css's.  Set up so that the next stage will be kicked off once all
 *     the percpu refcnts are confirmed to be killed.
 *
 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
 *     rest of destruction.  Once all cgroup references are gone, the
 *     cgroup is RCU-freed.
 *
 * This function implements s1.  After this step, @cgrp is gone as far as
 * the userland is concerned and a new cgroup with the same name may be
 * created.  As cgroup doesn't care about the names internally, this
 * doesn't cause any problem.
 */
static int cgroup_destroy_locked(struct cgroup *cgrp)
        __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
        struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
        struct cgroup_subsys_state *css;
        struct cgrp_cset_link *link;
        int ssid;

        lockdep_assert_held(&cgroup_mutex);

        /*
         * Only migration can raise populated from zero and we're already
         * holding cgroup_mutex.
         */
        if (cgroup_is_populated(cgrp))
                return -EBUSY;

        /*
         * Make sure there are no live children.  We can't test emptiness of
         * ->self.children as dead children linger on it while being
         * drained; otherwise, "rmdir parent/child parent" may fail.
         */
        if (css_has_online_children(&cgrp->self))
                return -EBUSY;

        /*
         * Mark @cgrp and the associated csets dead.  The former prevents
         * further task migration and child creation by disabling
         * cgroup_kn_lock_live().  The latter makes the csets ignored by
         * the migration path.
         */
        cgrp->self.flags &= ~CSS_ONLINE;

        spin_lock_irq(&css_set_lock);
        list_for_each_entry(link, &cgrp->cset_links, cset_link)
                link->cset->dead = true;
        spin_unlock_irq(&css_set_lock);

        /* initiate massacre of all css's */
        for_each_css(css, ssid, cgrp)
                kill_css(css);

        /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
        css_clear_dir(&cgrp->self);
        kernfs_remove(cgrp->kn);

        if (cgroup_is_threaded(cgrp))
                parent->nr_threaded_children--;

        /*
         * Move @cgrp from "descendant" to "dying descendant" on every
         * ancestor; the dying count is dropped again in
         * css_release_work_fn().
         */
        spin_lock_irq(&css_set_lock);
        for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
                tcgrp->nr_descendants--;
                tcgrp->nr_dying_descendants++;
                /*
                 * If the dying cgroup is frozen, decrease frozen descendants
                 * counters of ancestor cgroups.
                 */
                if (test_bit(CGRP_FROZEN, &cgrp->flags))
                        tcgrp->freezer.nr_frozen_descendants--;
        }
        spin_unlock_irq(&css_set_lock);

        cgroup1_check_for_release(parent);

        cgroup_bpf_offline(cgrp);

        /* put the base reference */
        percpu_ref_kill(&cgrp->self.refcnt);

        return 0;
}

int cgroup_rmdir(struct kernfs_node *kn)
{
        struct cgroup *cgrp;
        int ret = 0;

        cgrp = cgroup_kn_lock_live(kn, false);
        if (!cgrp)
                return 0;

        ret = cgroup_destroy_locked(cgrp);
        if (!ret)
                TRACE_CGROUP_PATH(rmdir, cgrp);

        cgroup_kn_unlock(kn);
        return ret;
}

/*
 * Callbacks handed to kernfs: directory creation/removal plus the hooks
 * used to display mount options and paths.
 */
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
        /* directory operations */
        .mkdir = cgroup_mkdir,
        .rmdir = cgroup_rmdir,

        /* display helpers */
        .show_options = cgroup_show_options,
        .show_path = cgroup_show_path,
};

/*
 * Bring up the root css of @ss and publish it through init_css_set.
 *
 * @early tells whether this runs during early boot: in that case the css id
 * is hard-coded to 1 because allocation can't be done safely yet, otherwise
 * the id is taken from the subsystem's idr.  Every failure here is fatal
 * (BUG_ON).
 */
static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
        struct cgroup_subsys_state *root_css;

        pr_debug("Initializing cgroup subsys %s\n", ss->name);

        cgroup_lock();

        idr_init(&ss->css_idr);
        INIT_LIST_HEAD(&ss->cfts);

        /* the subsystem starts out attached to the default root */
        ss->root = &cgrp_dfl_root;
        root_css = ss->css_alloc(NULL);
        /* there is no graceful way to recover this early */
        BUG_ON(IS_ERR(root_css));
        init_and_link_css(root_css, ss, &cgrp_dfl_root.cgrp);

        /*
         * A root css never goes away and percpu_ref can't be set up during
         * early init, so reference counting is switched off for it.
         */
        root_css->flags |= CSS_NO_REF;

        if (!early) {
                root_css->id = cgroup_idr_alloc(&ss->css_idr, root_css, 1, 2,
                                                GFP_KERNEL);
                BUG_ON(root_css->id < 0);
        } else {
                /* allocation can't be done safely during early init */
                root_css->id = 1;
        }

        /*
         * The subsystem was just registered, so every task - and therefore
         * init_css_set - sits in its root cgroup.  Point init_css_set at
         * the new root state.
         */
        init_css_set.subsys[ss->id] = root_css;

        /* record which optional per-task callbacks the subsystem provides */
        if (ss->fork)
                have_fork_callback |= 1 << ss->id;
        if (ss->exit)
                have_exit_callback |= 1 << ss->id;
        if (ss->release)
                have_release_callback |= 1 << ss->id;
        if (ss->can_fork)
                have_canfork_callback |= 1 << ss->id;

        /*
         * Nothing has been forked yet at this point of boot, so there is no
         * need to run the fork callbacks for existing tasks.
         */
        BUG_ON(!list_empty(&init_task.tasks));

        BUG_ON(online_css(root_css));

        cgroup_unlock();
}

/**
 * cgroup_init_early - first stage of cgroup initialization at boot
 *
 * Sets up the default root, points init_task at init_css_set, assigns
 * every subsystem its id and name, and fully initializes the subsystems
 * that asked for early init.  Always returns 0.
 */
int __init cgroup_init_early(void)
{
        static struct cgroup_fs_context __initdata ctx;
        struct cgroup_subsys *ss;
        int ssid;

        ctx.root = &cgrp_dfl_root;
        init_cgroup_root(&ctx);
        cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;

        RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

        for_each_subsys(ss, ssid) {
                const char *builtin_name = cgroup_subsys_name[ssid];

                /* callbacks are mandatory; id and name must still be unset */
                WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
                     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
                     ssid, builtin_name, ss->css_alloc, ss->css_free,
                     ss->id, ss->name);
                WARN(strlen(builtin_name) > MAX_CGROUP_TYPE_NAMELEN,
                     "cgroup_subsys_name %s too long\n", builtin_name);

                ss->id = ssid;
                ss->name = builtin_name;
                /* the legacy name defaults to the regular one */
                if (!ss->legacy_name)
                        ss->legacy_name = builtin_name;

                if (ss->early_init)
                        cgroup_init_subsys(ss, true);
        }
        return 0;
}

/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 *
 * Also sets up the default root, computes the default hierarchy's
 * subsystem masks and registers each enabled subsystem's control files.
 * Failures of the mandatory steps are fatal (BUG_ON); the registration
 * steps at the end only warn.  Always returns 0.
 */
int __init cgroup_init(void)
{
        struct cgroup_subsys *ss;
        int ssid;

        /*
         * NOTE(review): the limit of 16 presumably matches the width of the
         * per-subsystem bitmasks used in this file - confirm.
         */
        BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));

        cgroup_rstat_boot();

        get_user_ns(init_cgroup_ns.user_ns);

        cgroup_lock();

        /*
         * Add init_css_set to the hash table so that dfl_root can link to
         * it during init.
         */
        hash_add(css_set_table, &init_css_set.hlist,
                 css_set_hash(init_css_set.subsys));

        BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));

        cgroup_unlock();

        for_each_subsys(ss, ssid) {
                if (ss->early_init) {
                        /*
                         * cgroup_init_subsys() hard-coded id 1 for early-init
                         * subsystems; now that allocation works, claim that
                         * id (range [1, 2)) in the idr.
                         */
                        struct cgroup_subsys_state *css =
                                init_css_set.subsys[ss->id];

                        css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
                                                   GFP_KERNEL);
                        BUG_ON(css->id < 0);
                } else {
                        cgroup_init_subsys(ss, false);
                }

                /* done for every subsystem, including disabled ones */
                list_add_tail(&init_css_set.e_cset_node[ssid],
                              &cgrp_dfl_root.cgrp.e_csets[ssid]);

                /*
                 * Setting dfl_root subsys_mask needs to consider the
                 * disabled flag and cftype registration needs kmalloc,
                 * both of which aren't available during early_init.
                 */
                if (!cgroup_ssid_enabled(ssid))
                        continue;

                if (cgroup1_ssid_disabled(ssid))
                        pr_info("Disabling %s control group subsystem in v1 mounts\n",
                                ss->legacy_name);

                cgrp_dfl_root.subsys_mask |= 1 << ss->id;

                /* implicit controllers must be threaded too */
                WARN_ON(ss->implicit_on_dfl && !ss->threaded);

                /*
                 * A subsystem is either implicit on the default hierarchy
                 * or, when it has no default-hierarchy files, inhibited.
                 */
                if (ss->implicit_on_dfl)
                        cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
                else if (!ss->dfl_cftypes)
                        cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;

                if (ss->threaded)
                        cgrp_dfl_threaded_ss_mask |= 1 << ss->id;

                /* one shared file set is registered once, otherwise twice */
                if (ss->dfl_cftypes == ss->legacy_cftypes) {
                        WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
                } else {
                        WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
                        WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
                }

                if (ss->bind)
                        ss->bind(init_css_set.subsys[ssid]);

                cgroup_lock();
                css_populate_dir(init_css_set.subsys[ssid]);
                cgroup_unlock();
        }

        /* init_css_set.subsys[] has been updated, re-hash */
        hash_del(&init_css_set.hlist);
        hash_add(css_set_table, &init_css_set.hlist,
                 css_set_hash(init_css_set.subsys));

        /* user-visible registrations; a failure only triggers a warning */
        WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
        WARN_ON(register_filesystem(&cgroup_fs_type));
        WARN_ON(register_filesystem(&cgroup2_fs_type));
        WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
#ifdef CONFIG_CPUSETS
        WARN_ON(register_filesystem(&cpuset_fs_type));
#endif

        return 0;
}

/*
 * Allocate the workqueue used by the cgroup destruction path.
 *
 * Destruction is largely serialized by cgroup_mutex, so running it in
 * parallel buys little; @max_active is therefore 1.
 *
 * This would ideally live in cgroup_init(), but that runs before
 * init_workqueues(), hence the separate initcall.
 */
static int __init cgroup_wq_init(void)
{
        cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
        BUG_ON(cgroup_destroy_wq == NULL);

        return 0;
}
core_initcall(cgroup_wq_init);

/*
 * Write the kernfs path of the default-hierarchy node with id @id into
 * @buf (at most @buflen bytes).  @buf is left untouched when no node with
 * that id is found.
 */
void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
{
        struct kernfs_node *node;

        node = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
        if (node) {
                kernfs_path(node, buf, buflen);
                kernfs_put(node);
        }
}

/*
 * cgroup_get_from_id : get the cgroup associated with cgroup id
 * @id: cgroup id
 *
 * Looks @id up on the default hierarchy and takes a reference on the
 * matching cgroup.  Returns the cgroup on success and ERR_PTR(-ENOENT)
 * when the id is unknown, names a non-directory, the cgroup can't be
 * pinned, or the cgroup doesn't pass cgroup_is_descendant() against the
 * root cgroup of the current task's cgroup namespace.
 */
struct cgroup *cgroup_get_from_id(u64 id)
{
        struct cgroup *cgrp = NULL;
        struct kernfs_node *kn;

        kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
        if (!kn)
                return ERR_PTR(-ENOENT);

        /* only directories stand for cgroups */
        if (kernfs_type(kn) == KERNFS_DIR) {
                rcu_read_lock();

                cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
                if (cgrp && !cgroup_tryget(cgrp))
                        cgrp = NULL;

                rcu_read_unlock();
        }
        kernfs_put(kn);

        if (!cgrp)
                return ERR_PTR(-ENOENT);

        /* hide cgroups outside of the caller's cgroup namespace */
        if (!cgroup_is_descendant(cgrp, current_cgns_cgroup_dfl())) {
                cgroup_put(cgrp);
                return ERR_PTR(-ENOENT);
        }

        return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_id);

/*
 * proc_cgroup_show()
 *  - Backs /proc/<pid>/cgroup.
 *  - Emits one "<hierarchy id>:<controllers,name=...>:<path>" line per
 *    hierarchy @tsk belongs to.
 *  - Returns 0 on success, -ENOMEM if the path buffer can't be allocated
 *    or the negative error from path generation (-E2BIG is reported as
 *    -ENAMETOOLONG).
 */
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk)
{
        struct cgroup_root *root;
        char *path_buf;
        int err;

        path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!path_buf)
                return -ENOMEM;

        rcu_read_lock();
        spin_lock_irq(&css_set_lock);

        for_each_root(root) {
                struct cgroup_subsys *ss;
                struct cgroup *cgrp;
                int ssid, nr_printed = 0;

                /* the default hierarchy stays hidden until made visible */
                if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
                        continue;

                cgrp = task_cgroup_from_root(tsk, root);
                /* The root has already been unmounted. */
                if (!cgrp)
                        continue;

                seq_printf(m, "%d:", root->hierarchy_id);
                if (root != &cgrp_dfl_root) {
                        for_each_subsys(ss, ssid) {
                                if (!(root->subsys_mask & (1 << ssid)))
                                        continue;
                                seq_printf(m, "%s%s", nr_printed++ ? "," : "",
                                           ss->legacy_name);
                        }
                }
                if (strlen(root->name))
                        seq_printf(m, "%sname=%s", nr_printed ? "," : "",
                                   root->name);
                seq_putc(m, ':');

                /*
                 * Zombies on traditional hierarchies are always reported in
                 * the root cgroup.  On the default hierarchy a zombie is
                 * missing from "cgroup.procs" and can't be migrated, but
                 * /proc/PID/cgroup keeps naming the cgroup it lived in when
                 * it exited; " (deleted)" is appended if that cgroup was
                 * removed before the zombie got reaped.
                 */
                if (!cgroup_on_dfl(cgrp) && (tsk->flags & PF_EXITING)) {
                        seq_puts(m, "/");
                } else {
                        err = cgroup_path_ns_locked(cgrp, path_buf, PATH_MAX,
                                                current->nsproxy->cgroup_ns);
                        if (err == -E2BIG)
                                err = -ENAMETOOLONG;
                        if (err < 0)
                                goto unlock;

                        seq_puts(m, path_buf);
                }

                if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
                        seq_puts(m, " (deleted)\n");
                else
                        seq_putc(m, '\n');
        }

        err = 0;
unlock:
        spin_unlock_irq(&css_set_lock);
        rcu_read_unlock();
        kfree(path_buf);
        return err;
}

/**
 * cgroup_fork - initialize cgroup related fields during copy_process()
 * @child: task_struct whose cgroup fields are being initialized
 *
 * Points @child at init_css_set and resets its cg_list.  The task stays
 * associated with init_css_set until cgroup_post_fork() attaches it to
 * the target css_set.
 */
void cgroup_fork(struct task_struct *child)
{
        INIT_LIST_HEAD(&child->cg_list);
        RCU_INIT_POINTER(child->cgroups, &init_css_set);
}

/**
 * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer
 * @f: file corresponding to cgroup_dir
 *
 * Resolves the cgroup behind the directory @f refers to by pinning its
 * self css.  Returns the cgroup on success; the ERR_PTR from
 * css_tryget_online_from_dir() is passed through on failure.
 */
static struct cgroup *cgroup_v1v2_get_from_file(struct file *f)
{
        struct cgroup_subsys_state *self_css;

        self_css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
        if (!IS_ERR(self_css))
                return self_css->cgroup;

        return ERR_CAST(self_css);
}

/**
 * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports
 * cgroup2.
 * @f: file corresponding to cgroup2_dir
 *
 * A cgroup that is not on the default hierarchy is released again and
 * ERR_PTR(-EBADF) is returned instead.
 */
static struct cgroup *cgroup_get_from_file(struct file *f)
{
        struct cgroup *cgrp = cgroup_v1v2_get_from_file(f);

        if (IS_ERR(cgrp))
                return ERR_CAST(cgrp);

        if (cgroup_on_dfl(cgrp))
                return cgrp;

        /* not a cgroup2 directory: drop the reference we were handed */
        cgroup_put(cgrp);
        return ERR_PTR(-EBADF);
}

/**
 * cgroup_css_set_fork - find or create a css_set for a child process
 * @kargs: the arguments passed to create the child process
 *
 * This function finds or creates a new css_set which the child
 * process will be attached to in cgroup_post_fork(). By default,
 * the child process will be given the same css_set as its parent.
 *
 * If CLONE_INTO_CGROUP is specified this function will try to find an
 * existing css_set which includes the requested cgroup and if not create
 * a new css_set that the child will be attached to later. If this function
 * succeeds it will hold cgroup_threadgroup_rwsem on return. If
 * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
 * before grabbing cgroup_threadgroup_rwsem and will hold a reference
 * to the target cgroup.
 *
 * Returns 0 on success with @kargs->cset (and, for CLONE_INTO_CGROUP,
 * @kargs->cgrp) filled in.  On failure a negative errno is returned and
 * every lock and reference taken here has been dropped again.
 */
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
        __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
        int ret;
        struct cgroup *dst_cgrp = NULL;
        struct css_set *cset;
        struct super_block *sb;
        struct file *f;

        if (kargs->flags & CLONE_INTO_CGROUP)
                cgroup_lock();

        cgroup_threadgroup_change_begin(current);

        /* pin the parent's css_set */
        spin_lock_irq(&css_set_lock);
        cset = task_css_set(current);
        get_css_set(cset);
        spin_unlock_irq(&css_set_lock);

        /* common case: the child simply inherits the parent's css_set */
        if (!(kargs->flags & CLONE_INTO_CGROUP)) {
                kargs->cset = cset;
                return 0;
        }

        /* everything below runs with CLONE_INTO_CGROUP and cgroup_mutex held */
        f = fget_raw(kargs->cgroup);
        if (!f) {
                ret = -EBADF;
                goto err;
        }
        sb = f->f_path.dentry->d_sb;

        dst_cgrp = cgroup_get_from_file(f);
        if (IS_ERR(dst_cgrp)) {
                ret = PTR_ERR(dst_cgrp);
                /* keep the error path from putting an ERR_PTR */
                dst_cgrp = NULL;
                goto err;
        }

        if (cgroup_is_dead(dst_cgrp)) {
                ret = -ENODEV;
                goto err;
        }

        /*
         * Verify that the target cgroup is writable for us. This is
         * usually done by the vfs layer but since we're not going through
         * the vfs layer here we need to do it "manually".
         */
        ret = cgroup_may_write(dst_cgrp, sb);
        if (ret)
                goto err;

        /*
         * Spawning a task directly into a cgroup works by passing a file
         * descriptor to the target cgroup directory. This can even be an O_PATH
         * file descriptor. But it can never be a cgroup.procs file descriptor.
         * This was done on purpose so spawning into a cgroup could be
         * conceptualized as an atomic
         *
         *   fd = openat(dfd_cgroup, "cgroup.procs", ...);
         *   write(fd, <child-pid>, ...);
         *
         * sequence, i.e. it's a shorthand for the caller opening and writing
         * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
         * to always use the caller's credentials.
         */
        ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
                                        !(kargs->flags & CLONE_THREAD),
                                        current->nsproxy->cgroup_ns);
        if (ret)
                goto err;

        kargs->cset = find_css_set(cset, dst_cgrp);
        if (!kargs->cset) {
                ret = -ENOMEM;
                goto err;
        }

        /*
         * Success: the parent's css_set and the file are no longer needed.
         * The reference on dst_cgrp is handed over to @kargs; ret is 0 here.
         */
        put_css_set(cset);
        fput(f);
        kargs->cgrp = dst_cgrp;
        return ret;

err:
        /*
         * Only reachable with CLONE_INTO_CGROUP set (the other path returned
         * above), so cgroup_mutex is held and must be released.
         */
        cgroup_threadgroup_change_end(current);
        cgroup_unlock();
        if (f)
                fput(f);
        if (dst_cgrp)
                cgroup_put(dst_cgrp);
        put_css_set(cset);
        if (kargs->cset)
                put_css_set(kargs->cset);
        return ret;
}

/**
 * cgroup_css_set_put_fork - drop references we took during fork
 * @kargs: the arguments passed to create the child process
 *
 * Ends the threadgroup change section and puts the prepared css_set.  If
 * CLONE_INTO_CGROUP was requested, cgroup_mutex is released and the
 * reference on the target cgroup is dropped as well.
 */
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
        __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
        struct css_set *prepared = kargs->cset;
        struct cgroup *target = kargs->cgrp;

        cgroup_threadgroup_change_end(current);

        if (prepared) {
                put_css_set(prepared);
                kargs->cset = NULL;
        }

        /* the rest only applies when spawning into a specific cgroup */
        if (!(kargs->flags & CLONE_INTO_CGROUP))
                return;

        cgroup_unlock();
        if (!target)
                return;

        cgroup_put(target);
        kargs->cgrp = NULL;
}

/**
 * cgroup_can_fork - called on a new task before the process is exposed
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * Prepares the css_set @child will be attached to in cgroup_post_fork()
 * and then runs the can_fork() callback of every subsystem that has one.
 * If a callback returns an error, cancel_fork() is run for the
 * subsystems ordered before the failing one, the prepared css_set is
 * released and the fork aborts with that error code.  This lets a
 * subsystem conditionally allow or deny new forks.
 */
int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
{
        struct cgroup_subsys *ss;
        int failed, undo, err;

        err = cgroup_css_set_fork(kargs);
        if (err)
                return err;

        do_each_subsys_mask(ss, failed, have_canfork_callback) {
                err = ss->can_fork(child, kargs->cset);
                if (err)
                        goto revert;
        } while_each_subsys_mask();

        return 0;

revert:
        /* undo only the subsystems that precede the one that refused */
        for_each_subsys(ss, undo) {
                if (undo >= failed)
                        break;
                if (!ss->cancel_fork)
                        continue;
                ss->cancel_fork(child, kargs->cset);
        }

        cgroup_css_set_put_fork(kargs);

        return err;
}

/**
 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * This calls the cancel_fork() callbacks if a fork failed *after*
 * cgroup_can_fork() succeeded and cleans up references we took to
 * prepare a new css_set for the child process in cgroup_can_fork().
 */
void cgroup_cancel_fork(struct task_struct *child,
                        struct kernel_clone_args *kargs)
{
        struct cgroup_subsys *ss;
        int i;

        for_each_subsys(ss, i)
                if (ss->cancel_fork)
                        ss->cancel_fork(child, kargs->cset);

        cgroup_css_set_put_fork(kargs);
}

/**
 * cgroup_post_fork - finalize cgroup setup for the child process
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * Attach the child process to its css_set calling the subsystem fork()
 * callbacks.
 *
 * Also propagates a pending freeze or kill of the target cgroup to the
 * child and finally releases what cgroup_css_set_fork() acquired via
 * cgroup_css_set_put_fork().
 */
void cgroup_post_fork(struct task_struct *child,
                      struct kernel_clone_args *kargs)
        __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
        unsigned long cgrp_flags = 0;
        bool kill = false;
        struct cgroup_subsys *ss;
        struct css_set *cset;
        int i;

        /*
         * Take the css_set over from @kargs so that the final
         * cgroup_css_set_put_fork() doesn't put it.
         */
        cset = kargs->cset;
        kargs->cset = NULL;

        spin_lock_irq(&css_set_lock);

        /* init tasks are special, only link regular threads */
        if (likely(child->pid)) {
                /* snapshot the flags of the cgroup the child ends up in */
                if (kargs->cgrp)
                        cgrp_flags = kargs->cgrp->flags;
                else
                        cgrp_flags = cset->dfl_cgrp->flags;

                WARN_ON_ONCE(!list_empty(&child->cg_list));
                cset->nr_tasks++;
                css_set_move_task(child, NULL, cset, false);
        } else {
                /* pid 0: not linked, so the css_set reference is dropped */
                put_css_set(cset);
                cset = NULL;
        }

        if (!(child->flags & PF_KTHREAD)) {
                if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
                        /*
                         * If the cgroup has to be frozen, the new task has
                         * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
                         * get the task into the frozen state.
                         */
                        spin_lock(&child->sighand->siglock);
                        WARN_ON_ONCE(child->frozen);
                        child->jobctl |= JOBCTL_TRAP_FREEZE;
                        spin_unlock(&child->sighand->siglock);

                        /*
                         * Calling cgroup_update_frozen() isn't required here,
                         * because it will be called anyway a bit later from
                         * do_freezer_trap(). So we avoid cgroup's transient
                         * switch from the frozen state and back.
                         */
                }

                /*
                 * If the cgroup is to be killed notice it now and take the
                 * child down right after we finished preparing it for
                 * userspace.
                 */
                kill = test_bit(CGRP_KILL, &cgrp_flags);
        }

        spin_unlock_irq(&css_set_lock);

        /*
         * Call ss->fork().  This must happen after @child is linked on
         * css_set; otherwise, @child might change state between ->fork()
         * and addition to css_set.
         */
        do_each_subsys_mask(ss, i, have_fork_callback) {
                ss->fork(child);
        } while_each_subsys_mask();

        /* Make the new cset the root_cset of the new cgroup namespace. */
        if (kargs->flags & CLONE_NEWCGROUP) {
                struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;

                /*
                 * NOTE(review): cset is NULL here when child->pid is 0 (see
                 * above); presumably CLONE_NEWCGROUP is never combined with
                 * such a task - confirm.
                 */
                get_css_set(cset);
                child->nsproxy->cgroup_ns->root_cset = cset;
                put_css_set(rcset);
        }

        /* Cgroup has to be killed so take down child immediately. */
        if (unlikely(kill))
                do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);

        cgroup_css_set_put_fork(kargs);
}

/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 *
 * Moves @tsk off its css_set's task list and onto the set's dying_tasks
 * list, updates the related accounting and then runs the exit()
 * callbacks of the subsystems that provide one.
 */
void cgroup_exit(struct task_struct *tsk)
{
        struct cgroup_subsys *ss;
        struct css_set *from;
        int ssid;

        spin_lock_irq(&css_set_lock);

        WARN_ON_ONCE(list_empty(&tsk->cg_list));
        from = task_css_set(tsk);
        css_set_move_task(tsk, from, NULL, false);
        list_add_tail(&tsk->cg_list, &from->dying_tasks);
        from->nr_tasks--;

        if (dl_task(tsk))
                dec_dl_tasks_cs(tsk);

        WARN_ON_ONCE(cgroup_task_frozen(tsk));
        /* a freezing cgroup has to re-evaluate its state without @tsk */
        if (!(tsk->flags & PF_KTHREAD)) {
                if (unlikely(test_bit(CGRP_FREEZE,
                                      &task_dfl_cgroup(tsk)->flags)))
                        cgroup_update_frozen(task_dfl_cgroup(tsk));
        }

        spin_unlock_irq(&css_set_lock);

        /* see cgroup_post_fork() for details */
        do_each_subsys_mask(ss, ssid, have_exit_callback) {
                ss->exit(tsk);
        } while_each_subsys_mask();
}

/*
 * Run the release() callbacks of the subsystems that provide one for
 * @task, then unlink @task from its cg_list under css_set_lock after
 * making task iterators skip it.
 */
void cgroup_release(struct task_struct *task)
{
        struct cgroup_subsys *ss;
        int i;

        do_each_subsys_mask(ss, i, have_release_callback) {
                ss->release(task);
        } while_each_subsys_mask();

        spin_lock_irq(&css_set_lock);

        css_set_skip_task_iters(task_css_set(task), task);
        list_del_init(&task->cg_list);

        spin_unlock_irq(&css_set_lock);
}

/* Drop @task's reference on its css_set. */
void cgroup_free(struct task_struct *task)
{
        put_css_set(task_css_set(task));
}

/*
 * Handler for the "cgroup_disable=" boot parameter.
 *
 * @str is a comma separated list; empty entries are skipped.  Each entry
 * is matched against the name and legacy name of every subsystem and
 * against the optional feature names; matches are disabled and logged.
 * Always returns 1.
 */
static int __init cgroup_disable(char *str)
{
        struct cgroup_subsys *ss;
        char *name;
        int i;

        for (name = strsep(&str, ","); name; name = strsep(&str, ",")) {
                if (*name == '\0')
                        continue;

                for_each_subsys(ss, i) {
                        if (!strcmp(name, ss->name) ||
                            !strcmp(name, ss->legacy_name)) {
                                static_branch_disable(cgroup_subsys_enabled_key[i]);
                                pr_info("Disabling %s control group subsystem\n",
                                        ss->name);
                        }
                }

                /* at most one optional feature can match an entry */
                for (i = 0; i < OPT_FEATURE_COUNT; i++) {
                        if (strcmp(name, cgroup_opt_feature_names[i]) == 0) {
                                cgroup_feature_disable_mask |= 1 << i;
                                pr_info("Disabling %s control group feature\n",
                                        cgroup_opt_feature_names[i]);
                                break;
                        }
                }
        }
        return 1;
}
__setup("cgroup_disable=", cgroup_disable);

/* Weak no-op default; a stronger definition elsewhere overrides it. */
void __init __weak enable_debug_cgroup(void)
{
}

/*
 * Handler for the "cgroup_debug" boot parameter: sets cgroup_debug and
 * calls enable_debug_cgroup().  Always returns 1.
 */
static int __init enable_cgroup_debug(char *str)
{
        cgroup_debug = true;
        enable_debug_cgroup();

        return 1;
}
__setup("cgroup_debug", enable_cgroup_debug);

/*
 * Handler for the "cgroup_favordynmods=" boot parameter.  Parses @str as
 * a boolean into have_favordynmods; returns 1 if parsing succeeded and 0
 * otherwise.
 */
static int __init cgroup_favordynmods_setup(char *str)
{
        int err = kstrtobool(str, &have_favordynmods);

        return err ? 0 : 1;
}
__setup("cgroup_favordynmods=", cgroup_favordynmods_setup);

/**
 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
 * @dentry: directory dentry of interest
 * @ss: subsystem of interest
 *
 * If @dentry is a directory of a cgroup filesystem, look up the css that
 * cgroup_css() reports for @ss on that cgroup and try to pin it.  Returns
 * the pinned css, ERR_PTR(-EBADF) if @dentry isn't a cgroup directory, or
 * ERR_PTR(-ENOENT) if there is no such css or it can't be pinned.
 */
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
                                                       struct cgroup_subsys *ss)
{
        struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
        struct file_system_type *fstype = dentry->d_sb->s_type;
        struct cgroup_subsys_state *css = NULL;
        struct cgroup *cgrp;

        /* @dentry has to live on one of the cgroup filesystems ... */
        if (fstype != &cgroup_fs_type && fstype != &cgroup2_fs_type)
                return ERR_PTR(-EBADF);

        /* ... and has to be a kernfs directory */
        if (!kn || kernfs_type(kn) != KERNFS_DIR)
                return ERR_PTR(-EBADF);

        rcu_read_lock();

        /*
         * We did not get here through kernfs, so @kn may be removed - or
         * already have been removed - at any time.  @kn->priv is RCU
         * protected for this access.  See css_release_work_fn() for details.
         */
        cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
        if (cgrp)
                css = cgroup_css(cgrp, ss);

        if (!css || !css_tryget_online(css))
                css = ERR_PTR(-ENOENT);

        rcu_read_unlock();
        return css;
}

/**
 * css_from_id - lookup css by id
 * @id: the cgroup id
 * @ss: cgroup subsys to be looked into
 *
 * Looks @id up in @ss's css idr and returns what is stored there, or NULL
 * when nothing is registered under @id.  Should be called under
 * rcu_read_lock(); a warning is emitted once otherwise.
 */
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
        struct cgroup_subsys_state *found;

        WARN_ON_ONCE(!rcu_read_lock_held());

        found = idr_find(&ss->css_idr, id);
        return found;
}

/**
 * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
 * @path: path on the default hierarchy
 *
 * Walks @path starting from the root cgroup of the current cgroup
 * namespace, takes a reference on the cgroup found there and returns it.
 * Returns ERR_PTR(-ENOENT) if @path doesn't exist or the cgroup can no
 * longer be pinned, and ERR_PTR(-ENOTDIR) if @path names a non-directory.
 */
struct cgroup *cgroup_get_from_path(const char *path)
{
        struct cgroup *root_cgrp = current_cgns_cgroup_dfl();
        struct kernfs_node *kn;
        struct cgroup *cgrp;

        kn = kernfs_walk_and_get(root_cgrp->kn, path);
        if (!kn)
                return ERR_PTR(-ENOENT);

        if (kernfs_type(kn) == KERNFS_DIR) {
                rcu_read_lock();

                cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
                if (!cgrp || !cgroup_tryget(cgrp))
                        cgrp = ERR_PTR(-ENOENT);

                rcu_read_unlock();
        } else {
                cgrp = ERR_PTR(-ENOTDIR);
        }

        kernfs_put(kn);
        return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);

/**
 * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd
 * @fd: fd obtained by open(cgroup_dir)
 *
 * Resolves @fd, which should refer to an opened cgroup directory, to its
 * cgroup.  Returns the cgroup on success, ERR_PTR(-EBADF) if @fd isn't an
 * open file, or the ERR_PTR produced by the file based lookup.
 */
struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
        struct cgroup *cgrp = ERR_PTR(-EBADF);
        struct fd f = fdget_raw(fd);

        if (f.file) {
                cgrp = cgroup_v1v2_get_from_file(f.file);
                fdput(f);
        }

        return cgrp;
}

/**
 * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports
 * cgroup2.
 * @fd: fd obtained by open(cgroup2_dir)
 *
 * A cgroup that is not on the default hierarchy is released again and
 * ERR_PTR(-EBADF) is returned instead.
 */
struct cgroup *cgroup_get_from_fd(int fd)
{
        struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd);

        if (IS_ERR(cgrp))
                return ERR_CAST(cgrp);

        if (cgroup_on_dfl(cgrp))
                return cgrp;

        /* not cgroup2: give the reference back */
        cgroup_put(cgrp);
        return ERR_PTR(-EBADF);
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);

/* Return 10 raised to @power; @power is expected to be non-negative. */
static u64 power_of_ten(int power)
{
        u64 result = 1;

        for (; power; power--)
                result *= 10;

        return result;
}

/**
 * cgroup_parse_float - parse a floating number
 * @input: input string
 * @dec_shift: number of decimal digits to shift
 * @v: output
 *
 * Parse a decimal floating point number in @input and store the result in
 * @v with decimal point right shifted @dec_shift times.  For example, if
 * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12346 (the
 * fraction is rounded to the closest value).  The sign of the input
 * applies to the fraction as well: "-1.5" with @dec_shift 1 yields -15
 * and "-0.5" yields -5.
 *
 * Returns 0 on success, -EINVAL if no integer part could be parsed or the
 * fractional part is negative.
 *
 * There's nothing cgroup specific about this function except that it's
 * currently the only user.
 */
int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
{
        const char *sign = input;
        s64 whole, frac = 0;
        int fstart = 0, fend = 0, flen;

        /* without at least the integer part converted, @whole is unset */
        if (sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend) < 1)
                return -EINVAL;
        if (frac < 0)
                return -EINVAL;

        /* scale the fraction to exactly @dec_shift digits */
        flen = fend > fstart ? fend - fstart : 0;
        if (flen < dec_shift)
                frac *= power_of_ten(dec_shift - flen);
        else
                frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));

        /*
         * The fraction has to move the result away from zero for negative
         * inputs.  @whole can't tell us the sign ("-0.5" parses to 0), so
         * look at the text itself, skipping the leading white space that
         * sscanf() ignored.
         */
        while (*sign == ' ' || (*sign >= '\t' && *sign <= '\r'))
                sign++;
        if (*sign == '-')
                frac = -frac;

        *v = whole * power_of_ten(dec_shift) + frac;
        return 0;
}

/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

/*
 * Pick the cgroup for a new socket and record it in @skcd->cgroup.
 *
 * In interrupt context the root cgroup of cgrp_dfl_root is used; otherwise
 * the dfl_cgrp of current's css_set is used.  Either way a cgroup reference
 * is taken (cgroup_get()/cgroup_tryget()) followed by cgroup_bpf_get(); both
 * are dropped again in cgroup_sk_free().
 */
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
        struct cgroup *cgroup;

        rcu_read_lock();
        /* Don't associate the sock with unrelated interrupted task's cgroup. */
        if (in_interrupt()) {
                cgroup = &cgrp_dfl_root.cgrp;
                cgroup_get(cgroup);
                goto out;
        }

        /*
         * Re-read current's css_set and retry until cgroup_tryget() on its
         * dfl_cgrp succeeds.
         */
        while (true) {
                struct css_set *cset;

                cset = task_css_set(current);
                if (likely(cgroup_tryget(cset->dfl_cgrp))) {
                        cgroup = cset->dfl_cgrp;
                        break;
                }
                cpu_relax();
        }
out:
        skcd->cgroup = cgroup;
        cgroup_bpf_get(cgroup);
        rcu_read_unlock();
}

/*
 * Take the extra references needed when a socket's cgroup data is cloned:
 * one cgroup reference and one cgroup_bpf reference on the cgroup that
 * @skcd points to.
 */
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
        struct cgroup *cgrp;

        cgrp = sock_cgroup_ptr(skcd);

        /*
         * The socket being cloned may sit in an empty cgroup that has
         * already been rmdir'd, so cgroup_get_live() must not be used
         * here; a plain cgroup_get() is what we want.
         */
        cgroup_get(cgrp);
        cgroup_bpf_get(cgrp);
}

/*
 * Drop the references held through @skcd: first the cgroup_bpf reference,
 * then the cgroup reference itself.
 */
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
        struct cgroup *cgrp;

        cgrp = sock_cgroup_ptr(skcd);

        cgroup_bpf_put(cgrp);
        cgroup_put(cgrp);
}

#endif        /* CONFIG_SOCK_CGROUP_DATA */

#ifdef CONFIG_SYSFS
/*
 * Append the names of all CFTYPE_NS_DELEGATABLE files in @files to @buf, one
 * per line, each optionally preceded by "@prefix.".
 *
 * @files:  cftype array terminated by an entry with an empty name; may be NULL
 * @buf:    output buffer
 * @size:   number of bytes available at @buf
 * @prefix: optional prefix printed as "<prefix>." in front of each name
 *
 * Returns the number of bytes actually stored in @buf, not counting the
 * terminating NUL.  The result is always smaller than @size when @size is
 * positive, so callers may keep accumulating it and passing the remaining
 * space to the next call.
 */
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
                                      ssize_t size, const char *prefix)
{
        struct cftype *cft;
        ssize_t ret = 0;

        /* An exhausted buffer cannot even hold the terminating NUL. */
        if (size <= 0)
                return 0;

        for (cft = files; cft && cft->name[0] != '\0'; cft++) {
                if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
                        continue;

                if (prefix)
                        ret += snprintf(buf + ret, size - ret, "%s.", prefix);

                /*
                 * snprintf() returns the length the output would have had,
                 * so a truncated prefix leaves ret >= size.  Never pass the
                 * resulting negative remainder on as a (huge) size.
                 */
                if (ret < size)
                        ret += snprintf(buf + ret, size - ret, "%s\n",
                                        cft->name);

                if (WARN_ON(ret >= size)) {
                        /* Report only what really fits in the buffer. */
                        ret = size - 1;
                        break;
                }
        }

        return ret;
}

static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
                              char *buf)
{
        struct cgroup_subsys *ss;
        int ssid;
        ssize_t ret = 0;

        ret = show_delegatable_files(cgroup_base_files, buf + ret,
                                     PAGE_SIZE - ret, NULL);
        if (cgroup_psi_enabled())
                ret += show_delegatable_files(cgroup_psi_files, buf + ret,
                                              PAGE_SIZE - ret, NULL);

        for_each_subsys(ss, ssid)
                ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
                                              PAGE_SIZE - ret,
                                              cgroup_subsys_name[ssid]);

        return ret;
}
static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);

/*
 * sysfs show() for the "features" attribute: prints a fixed list of feature
 * names, one per line.
 */
static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
                             char *buf)
{
        int len;

        len = snprintf(buf, PAGE_SIZE,
                       "nsdelegate\n"
                       "favordynmods\n"
                       "memory_localevents\n"
                       "memory_recursiveprot\n"
                       "memory_hugetlb_accounting\n");

        return len;
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);

/* NULL-terminated list of the attributes exposed by cgroup_sysfs_attr_group. */
static struct attribute *cgroup_sysfs_attrs[] = {
        &cgroup_delegate_attr.attr,
        &cgroup_features_attr.attr,
        NULL,
};

/*
 * Attribute group named "cgroup"; cgroup_sysfs_init() creates it under
 * kernel_kobj.
 */
static const struct attribute_group cgroup_sysfs_attr_group = {
        .attrs = cgroup_sysfs_attrs,
        .name = "cgroup",
};

/*
 * Register the "cgroup" attribute group under kernel_kobj.  Returns whatever
 * sysfs_create_group() returns.
 */
static int __init cgroup_sysfs_init(void)
{
        int err;

        err = sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);

        return err;
}
subsys_initcall(cgroup_sysfs_init);

#endif /* CONFIG_SYSFS */





















































































































































































    1 


















































































    1 







    1 








































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the BSD Socket
 *                interface as the means of communication with the user level.
 *
 * Authors:        Lotsa people, from code originally in tcp
 */

#ifndef _INET_HASHTABLES_H
#define _INET_HASHTABLES_H


#include <linux/interrupt.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/wait.h>

#include <net/inet_connection_sock.h>
#include <net/inet_sock.h>
#include <net/ip.h>
#include <net/sock.h>
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/netns/hash.h>

#include <linux/refcount.h>
#include <asm/byteorder.h>

/* One bucket of the 'e' hash table (see inet_hashinfo::ehash).
 *
 * This is for all connections with a full identity, no wildcards.
 * The 'e' prefix stands for Establish, but we really put all sockets
 * but LISTEN ones.
 */
struct inet_ehash_bucket {
        /* Head of this bucket's nulls-terminated hlist. */
        struct hlist_nulls_head chain;
};

/* There are a few simple rules, which allow for local port reuse by
 * an application.  In essence:
 *
 *        1) Sockets bound to different interfaces may share a local port.
 *           Failing that, goto test 2.
 *        2) If all sockets have sk->sk_reuse set, and none of them are in
 *           TCP_LISTEN state, the port may be shared.
 *           Failing that, goto test 3.
 *        3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local
 *           address, and none of them are the same, the port may be
 *           shared.
 *           Failing this, the port cannot be shared.
 *
 * The interesting point, is test #2.  This is what an FTP server does
 * all day.  To optimize this case we use a specific flag bit defined
 * below.  As we add sockets to a bind bucket list, we perform a
 * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN))
 * As long as all sockets added to a bind bucket pass this test,
 * the flag bit will be set.
 * The resulting situation is that tcp_v[46]_verify_bind() can just check
 * for this flag bit, if it is set and the socket trying to bind has
 * sk->sk_reuse set, we don't even have to walk the owners list at all,
 * we return that it is ok to bind this socket to the requested local port.
 *
 * Sounds like a lot of work, but it is worth it.  In a more naive
 * implementation (ie. current FreeBSD etc.) the entire list of ports
 * must be walked for each data port opened by an ftp server.  Needless
 * to say, this does not scale at all.  With a couple thousand FTP
 * users logged onto your box, isn't it nice to know that new data
 * ports are created in O(1) time?  I thought so. ;-)        -DaveM
 */
/* NOTE(review): presumably the non-zero values stored in
 * inet_bind_bucket::fastreuseport; the precise ANY vs STRICT semantics are
 * defined by the users outside this header - confirm there.
 */
#define FASTREUSEPORT_ANY        1
#define FASTREUSEPORT_STRICT        2

/* Bind bucket living in the port-hashed bind table (inet_hashinfo::bhash).
 * The fast* members belong to the fast-reuse optimisation described in the
 * comment above; how they are maintained is not visible in this header.
 */
struct inet_bind_bucket {
        /* Network namespace; read through ib_net(). */
        possible_net_t                ib_net;
        int                        l3mdev;
        unsigned short                port;
        signed char                fastreuse;
        signed char                fastreuseport;
        kuid_t                        fastuid;
#if IS_ENABLED(CONFIG_IPV6)
        struct in6_addr                fast_v6_rcv_saddr;
#endif
        __be32                        fast_rcv_saddr;
        unsigned short                fast_sk_family;
        bool                        fast_ipv6_only;
        /* Chain linkage walked by inet_bind_bucket_for_each(). */
        struct hlist_node        node;
        /* NOTE(review): presumably the inet_bind2_bucket entries linked via
         * their bhash_node member - confirm against the .c users.
         */
        struct hlist_head        bhash2;
};

/* Bind bucket for the second bind table (inet_hashinfo::bhash2), which is
 * hashed by local port and receive address.
 */
struct inet_bind2_bucket {
        /* Network namespace; read through ib2_net(). */
        possible_net_t                ib_net;
        int                        l3mdev;
        unsigned short                port;
#if IS_ENABLED(CONFIG_IPV6)
        unsigned short                addr_type;
        struct in6_addr                v6_rcv_saddr;
/* With IPv6 enabled, rcv_saddr aliases the last 32-bit word of v6_rcv_saddr.
 * Note that this macro is not limited to this struct: it rewrites every later
 * use of the identifier rcv_saddr in the translation unit.
 */
#define rcv_saddr                v6_rcv_saddr.s6_addr32[3]
#else
        __be32                        rcv_saddr;
#endif
        /* Node in the bhash2 inet_bind_hashbucket chain */
        struct hlist_node        node;
        /* NOTE(review): presumably links this bucket into
         * inet_bind_bucket::bhash2 - confirm against the .c users.
         */
        struct hlist_node        bhash_node;
        /* List of sockets hashed to this bucket */
        struct hlist_head        owners;
};

static inline struct net *ib_net(const struct inet_bind_bucket *ib)
{
        return read_pnet(&ib->ib_net);
}

static inline struct net *ib2_net(const struct inet_bind2_bucket *ib)
{
        return read_pnet(&ib->ib_net);
}

/* Iterate @tb over every inet_bind_bucket linked on hlist @head through its
 * 'node' member.
 */
#define inet_bind_bucket_for_each(tb, head) \
        hlist_for_each_entry(tb, head, node)

/* One slot of a bind hash table (inet_hashinfo::bhash and ::bhash2). */
struct inet_bind_hashbucket {
        /* NOTE(review): presumably serialises access to @chain - confirm. */
        spinlock_t                lock;
        /* Bind buckets hashed to this slot. */
        struct hlist_head        chain;
};

/* Sockets can be hashed in established or listening table.
 * We must use different 'nulls' end-of-chain value for all hash buckets :
 * A socket might transition from ESTABLISH to LISTEN state without
 * RCU grace period. A lookup in ehash table needs to handle this case.
 *
 * LISTENING_NULLS_BASE is the base for the listening table's 'nulls' values.
 */
#define LISTENING_NULLS_BASE (1U << 29)
/* One slot of the listener hash table (inet_hashinfo::lhash2). */
struct inet_listen_hashbucket {
        /* NOTE(review): presumably serialises updates of @nulls_head - confirm. */
        spinlock_t                lock;
        struct hlist_nulls_head        nulls_head;
};

/* This is for listening sockets, thus all sockets which possess wildcards.
 * NOTE(review): nothing in this header references the constant; its users
 * live elsewhere - confirm before changing the value.
 */
#define INET_LHTABLE_SIZE        32        /* Yes, really, this is all you need. */

/* Collection of the hash tables used to look up and bind inet sockets:
 * the 'e' table, the two bind tables and the listener table.
 */
struct inet_hashinfo {
        /* This is for sockets with full identity only.  Sockets here will
         * always be without wildcards and will have the following invariant:
         *
         *          TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
         *
         */
        struct inet_ehash_bucket        *ehash;
        /* Lock array indexed by (hash & ehash_locks_mask), see
         * inet_ehash_lockp().
         */
        spinlock_t                        *ehash_locks;
        /* Mask applied to a hash to index @ehash, see inet_ehash_bucket(). */
        unsigned int                        ehash_mask;
        unsigned int                        ehash_locks_mask;

        /* Ok, let's try this, I give up, we do need a local binding
         * TCP hash as well as the others for fast bind/connect.
         */
        struct kmem_cache                *bind_bucket_cachep;
        /* This bind table is hashed by local port */
        struct inet_bind_hashbucket        *bhash;
        struct kmem_cache                *bind2_bucket_cachep;
        /* This bind table is hashed by local port and sk->sk_rcv_saddr (ipv4)
         * or sk->sk_v6_rcv_saddr (ipv6). This 2nd bind table is used
         * primarily for expediting bind conflict resolution.
         */
        struct inet_bind_hashbucket        *bhash2;
        /* Used as (bhash_size - 1) mask by inet_bhashfn_portaddr(), which
         * only works for a power-of-two size.
         */
        unsigned int                        bhash_size;

        /* The 2nd listener table hashed by local port and address */
        /* Mask applied to a hash to index @lhash2, see inet_lhash2_bucket(). */
        unsigned int                        lhash2_mask;
        struct inet_listen_hashbucket        *lhash2;

        /* NOTE(review): presumably set for hashinfo instances allocated per
         * network namespace (cf. inet_pernet_hashinfo_alloc()) - confirm.
         */
        bool                                pernet;
} ____cacheline_aligned_in_smp;

/* Return the hashinfo to use for @sk.  With CONFIG_IP_DCCP enabled the
 * protocol's own sk_prot->h.hashinfo wins when it is non-NULL; in every other
 * case the hashinfo of the socket's netns tcp_death_row is used.
 */
static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IP_DCCP)
        struct inet_hashinfo *proto_hinfo = sk->sk_prot->h.hashinfo;

        if (proto_hinfo)
                return proto_hinfo;
#endif
        return sock_net(sk)->ipv4.tcp_death_row.hashinfo;
}

/* Return the lhash2 slot selected by @hash (masked with lhash2_mask). */
static inline struct inet_listen_hashbucket *
inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash)
{
        const u32 slot = hash & h->lhash2_mask;

        return h->lhash2 + slot;
}

/* Return the ehash bucket selected by @hash (masked with ehash_mask). */
static inline struct inet_ehash_bucket *inet_ehash_bucket(
        struct inet_hashinfo *hashinfo,
        unsigned int hash)
{
        const unsigned int slot = hash & hashinfo->ehash_mask;

        return hashinfo->ehash + slot;
}

/* Return the ehash lock selected by @hash (masked with ehash_locks_mask). */
static inline spinlock_t *inet_ehash_lockp(
        struct inet_hashinfo *hashinfo,
        unsigned int hash)
{
        const unsigned int slot = hash & hashinfo->ehash_locks_mask;

        return hashinfo->ehash_locks + slot;
}

int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo);

/* kfree() the lhash2 table of @h and clear the stale pointer. */
static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h)
{
        struct inet_listen_hashbucket *table = h->lhash2;

        kfree(table);
        h->lhash2 = NULL;
}

static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
{
        kvfree(hashinfo->ehash_locks);
        hashinfo->ehash_locks = NULL;
}

struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
                                                 unsigned int ehash_entries);
void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo);

struct inet_bind_bucket *
inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
                        struct inet_bind_hashbucket *head,
                        const unsigned short snum, int l3mdev);
void inet_bind_bucket_destroy(struct kmem_cache *cachep,
                              struct inet_bind_bucket *tb);

bool inet_bind_bucket_match(const struct inet_bind_bucket *tb,
                            const struct net *net, unsigned short port,
                            int l3mdev);

struct inet_bind2_bucket *
inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
                         struct inet_bind_hashbucket *head,
                         struct inet_bind_bucket *tb,
                         const struct sock *sk);

void inet_bind2_bucket_destroy(struct kmem_cache *cachep,
                               struct inet_bind2_bucket *tb);

struct inet_bind2_bucket *
inet_bind2_bucket_find(const struct inet_bind_hashbucket *head,
                       const struct net *net,
                       unsigned short port, int l3mdev,
                       const struct sock *sk);

bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb,
                                      const struct net *net, unsigned short port,
                                      int l3mdev, const struct sock *sk);

/* Hash a local port into a bind-table index: the port is mixed with the
 * netns hash and masked with (@bhash_size - 1).
 */
static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
                               const u32 bhash_size)
{
        const u32 mask = bhash_size - 1;

        return (lport + net_hash_mix(net)) & mask;
}

/* Return the bhash2 slot for @sk's receive address and @port.  AF_INET6
 * sockets are hashed on sk_v6_rcv_saddr (when IPv6 is enabled), all others
 * on sk_rcv_saddr.
 */
static inline struct inet_bind_hashbucket *
inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk,
                      const struct net *net, unsigned short port)
{
        u32 hash;

#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == AF_INET6) {
                hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port);
                return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
        }
#endif
        hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port);
        return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
}

struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port);

/* This should be called whenever a socket's sk_rcv_saddr (ipv4) or
 * sk_v6_rcv_saddr (ipv6) changes after the socket has been bound. The
 * socket's rcv_saddr field should already have been updated when this is
 * called.
 */
int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family);
void inet_bhash2_reset_saddr(struct sock *sk);

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
                    struct inet_bind2_bucket *tb2, unsigned short port);

/* Caller must disable local BH processing. */
int __inet_inherit_port(const struct sock *sk, struct sock *child);

void inet_put_port(struct sock *sk);

void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
                         unsigned long numentries, int scale,
                         unsigned long low_limit,
                         unsigned long high_limit);
int inet_hashinfo2_init_mod(struct inet_hashinfo *h);

bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk);
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk,
                         bool *found_dup_sk);
int __inet_hash(struct sock *sk, struct sock *osk);
int inet_hash(struct sock *sk);
void inet_unhash(struct sock *sk);

struct sock *__inet_lookup_listener(struct net *net,
                                    struct inet_hashinfo *hashinfo,
                                    struct sk_buff *skb, int doff,
                                    const __be32 saddr, const __be16 sport,
                                    const __be32 daddr,
                                    const unsigned short hnum,
                                    const int dif, const int sdif);

/* Wrapper around __inet_lookup_listener() taking the destination port in
 * network byte order; it is converted with ntohs() before the lookup.
 */
static inline struct sock *inet_lookup_listener(struct net *net,
                struct inet_hashinfo *hashinfo,
                struct sk_buff *skb, int doff,
                __be32 saddr, __be16 sport,
                __be32 daddr, __be16 dport, int dif, int sdif)
{
        const unsigned short hnum = ntohs(dport);

        return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
                                      daddr, hnum, dif, sdif);
}

/* Socket demux engine toys. */
/* What happens here is ugly; there's a pair of adjacent fields in
   struct inet_sock; __be16 dport followed by __u16 num.  We want to
   search by pair, so we combine the keys into a single 32bit value
   and compare with 32bit value read from &...->dport.  Let's at least
   make sure that it's not mixed with anything else...
   On 64bit targets we combine comparisons with pair of adjacent __be32
   fields in the same way.
*/
/* INET_COMBINED_PORTS(): build the 32bit __portpair key.  @__sport is
 * force-cast from __be16 while @__dport is taken as a plain integer.  On big
 * endian @__sport lands in the upper 16 bits, on little endian in the lower
 * 16 bits.  inet_match() compares the result with sk->sk_portpair.
 */
#ifdef __BIG_ENDIAN
#define INET_COMBINED_PORTS(__sport, __dport) \
        ((__force __portpair)(((__force __u32)(__be16)(__sport) << 16) | (__u32)(__dport)))
#else /* __LITTLE_ENDIAN */
#define INET_COMBINED_PORTS(__sport, __dport) \
        ((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport)))
#endif

/* INET_ADDR_COOKIE(): declare a const __addrpair named @__name holding both
 * __be32 addresses in one 64bit value.  On big endian @__saddr occupies the
 * upper 32 bits, on little endian the lower 32 bits.  inet_match() compares
 * such a cookie with sk->sk_addrpair.
 */
#ifdef __BIG_ENDIAN
#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
        const __addrpair __name = (__force __addrpair) ( \
                                   (((__force __u64)(__be32)(__saddr)) << 32) | \
                                   ((__force __u64)(__be32)(__daddr)))
#else /* __LITTLE_ENDIAN */
#define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
        const __addrpair __name = (__force __addrpair) ( \
                                   (((__force __u64)(__be32)(__daddr)) << 32) | \
                                   ((__force __u64)(__be32)(__saddr)))
#endif /* __BIG_ENDIAN */

/* Check whether @sk matches a lookup key: same netns, same combined port
 * pair, same combined address pair, and a bound device accepted by
 * inet_sk_bound_dev_eq() for @dif/@sdif.
 */
static inline bool inet_match(struct net *net, const struct sock *sk,
                              const __addrpair cookie, const __portpair ports,
                              int dif, int sdif)
{
        if (!net_eq(sock_net(sk), net))
                return false;
        if (sk->sk_portpair != ports)
                return false;
        if (sk->sk_addrpair != cookie)
                return false;

        /* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */
        return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif,
                                    sdif);
}

/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
 * not check it for lookups anymore, thanks Alexey. -DaveM
 */
struct sock *__inet_lookup_established(struct net *net,
                                       struct inet_hashinfo *hashinfo,
                                       const __be32 saddr, const __be16 sport,
                                       const __be32 daddr, const u16 hnum,
                                       const int dif, const int sdif);

typedef u32 (inet_ehashfn_t)(const struct net *net,
                              const __be32 laddr, const __u16 lport,
                              const __be32 faddr, const __be16 fport);

inet_ehashfn_t inet_ehashfn;

INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn);

struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk,
                                   struct sk_buff *skb, int doff,
                                   __be32 saddr, __be16 sport,
                                   __be32 daddr, unsigned short hnum,
                                   inet_ehashfn_t *ehashfn);

struct sock *inet_lookup_run_sk_lookup(struct net *net,
                                       int protocol,
                                       struct sk_buff *skb, int doff,
                                       __be32 saddr, __be16 sport,
                                       __be32 daddr, u16 hnum, const int dif,
                                       inet_ehashfn_t *ehashfn);

static inline struct sock *
        inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo,
                                const __be32 saddr, const __be16 sport,
                                const __be32 daddr, const __be16 dport,
                                const int dif)
{
        return __inet_lookup_established(net, hashinfo, saddr, sport, daddr,
                                         ntohs(dport), dif, 0);
}

/* Look up a socket for the given 4-tuple: first among established sockets,
 * then among listeners.  *@refcounted is set to true when the result comes
 * from the established lookup and to false otherwise.
 */
static inline struct sock *__inet_lookup(struct net *net,
                                         struct inet_hashinfo *hashinfo,
                                         struct sk_buff *skb, int doff,
                                         const __be32 saddr, const __be16 sport,
                                         const __be32 daddr, const __be16 dport,
                                         const int dif, const int sdif,
                                         bool *refcounted)
{
        const u16 hnum = ntohs(dport);
        struct sock *sk;

        sk = __inet_lookup_established(net, hashinfo, saddr, sport,
                                       daddr, hnum, dif, sdif);
        if (sk) {
                *refcounted = true;
                return sk;
        }

        *refcounted = false;
        return __inet_lookup_listener(net, hashinfo, skb, doff, saddr,
                                      sport, daddr, hnum, dif, sdif);
}

/* Like __inet_lookup() with sdif 0, but a result that __inet_lookup()
 * reported as not refcounted gets a reference via refcount_inc_not_zero();
 * NULL is returned if that fails.
 */
static inline struct sock *inet_lookup(struct net *net,
                                       struct inet_hashinfo *hashinfo,
                                       struct sk_buff *skb, int doff,
                                       const __be32 saddr, const __be16 sport,
                                       const __be32 daddr, const __be16 dport,
                                       const int dif)
{
        bool refcounted;
        struct sock *sk;

        sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
                           dport, dif, 0, &refcounted);

        if (!sk || refcounted)
                return sk;

        if (!refcount_inc_not_zero(&sk->sk_refcnt))
                return NULL;

        return sk;
}

/* Take the socket attached to @skb (skb_steal_sock()).  If it was prefetched,
 * is a full socket, and is a TCP socket in TCP_LISTEN or a UDP socket in
 * TCP_CLOSE, give inet_lookup_reuseport() the chance to substitute another
 * socket; in every other case the stolen socket is returned as is.
 * Returns NULL when @skb carries no socket.
 */
static inline
struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff,
                             const __be32 saddr, const __be16 sport,
                             const __be32 daddr, const __be16 dport,
                             bool *refcounted, inet_ehashfn_t *ehashfn)
{
        struct sock *stolen, *selected;
        bool prefetched;

        stolen = skb_steal_sock(skb, refcounted, &prefetched);
        if (!stolen)
                return NULL;

        if (!prefetched || !sk_fullsock(stolen))
                return stolen;

        switch (stolen->sk_protocol) {
        case IPPROTO_TCP:
                if (stolen->sk_state != TCP_LISTEN)
                        return stolen;
                break;
        case IPPROTO_UDP:
                if (stolen->sk_state != TCP_CLOSE)
                        return stolen;
                break;
        default:
                return stolen;
        }

        selected = inet_lookup_reuseport(net, stolen, skb, doff,
                                         saddr, sport, daddr, ntohs(dport),
                                         ehashfn);
        if (selected) {
                /* We've chosen a new reuseport sock which is never
                 * refcounted. This implies that the stolen sock also isn't
                 * refcounted.
                 */
                WARN_ON_ONCE(*refcounted);
                return selected;
        }

        return stolen;
}

/* Find the socket for @skb: use the one already attached to the skb if
 * inet_steal_sock() yields one, otherwise fall back to __inet_lookup() on
 * the addresses from the IP header, the given ports and inet_iif(skb).
 * The netns is taken from the device of the skb's dst entry.
 * *@refcounted is filled in by whichever helper produced the result.
 */
static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
                                             struct sk_buff *skb,
                                             int doff,
                                             const __be16 sport,
                                             const __be16 dport,
                                             const int sdif,
                                             bool *refcounted)
{
        struct net *net = dev_net(skb_dst(skb)->dev);
        const struct iphdr *iph = ip_hdr(skb);
        struct sock *sk;

        sk = inet_steal_sock(net, skb, doff, iph->saddr, sport, iph->daddr, dport,
                             refcounted, inet_ehashfn);
        /* NOTE(review): inet_steal_sock() as defined in this header creates no
         * ERR_PTR itself; an error pointer could only originate from
         * skb_steal_sock() or inet_lookup_reuseport() - confirm.
         */
        if (IS_ERR(sk))
                return NULL;
        if (sk)
                return sk;

        return __inet_lookup(net, hashinfo, skb,
                             doff, iph->saddr, sport,
                             iph->daddr, dport, inet_iif(skb), sdif,
                             refcounted);
}

/* Set the IPv4 destination address of @sk.  With IPv6 enabled,
 * sk_v6_daddr is updated as well through ipv6_addr_set_v4mapped().
 */
static inline void sk_daddr_set(struct sock *sk, __be32 addr)
{
        sk->sk_daddr = addr; /* alias of inet_daddr */
#if IS_ENABLED(CONFIG_IPV6)
        ipv6_addr_set_v4mapped(addr, &sk->sk_v6_daddr);
#endif
}

/* Set the IPv4 receive (local) address of @sk.  With IPv6 enabled,
 * sk_v6_rcv_saddr is updated as well through ipv6_addr_set_v4mapped().
 */
static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr)
{
        sk->sk_rcv_saddr = addr; /* alias of inet_rcv_saddr */
#if IS_ENABLED(CONFIG_IPV6)
        ipv6_addr_set_v4mapped(addr, &sk->sk_v6_rcv_saddr);
#endif
}

int __inet_hash_connect(struct inet_timewait_death_row *death_row,
                        struct sock *sk, u64 port_offset,
                        int (*check_established)(struct inet_timewait_death_row *,
                                                 struct sock *, __u16,
                                                 struct inet_timewait_sock **));

int inet_hash_connect(struct inet_timewait_death_row *death_row,
                      struct sock *sk);
#endif /* _INET_HASHTABLES_H */





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2002-2005, Instant802 Networks, Inc.
 * Copyright 2005, Devicescape Software, Inc.
 * Copyright 2006-2007        Jiri Benc <jbenc@suse.cz>
 * Copyright 2007-2010        Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2015  Intel Mobile Communications GmbH
 * Copyright (C) 2018-2024 Intel Corporation
 */

#ifndef IEEE80211_I_H
#define IEEE80211_I_H

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/if_ether.h>
#include <linux/interrupt.h>
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/etherdevice.h>
#include <linux/leds.h>
#include <linux/idr.h>
#include <linux/rhashtable.h>
#include <linux/rbtree.h>
#include <kunit/visibility.h>
#include <net/ieee80211_radiotap.h>
#include <net/cfg80211.h>
#include <net/mac80211.h>
#include <net/fq.h>
#include "key.h"
#include "sta_info.h"
#include "debug.h"
#include "drop.h"

/* cfg80211 operations table; only declared here, the definition lives in
 * another translation unit.
 */
extern const struct cfg80211_ops mac80211_config_ops;

/* Forward declarations: only the type names are introduced here. */
struct ieee80211_local;
struct ieee80211_mesh_fast_tx;

/* Maximum number of broadcast/multicast frames to buffer when some of the
 * associated stations are using power saving. */
#define AP_MAX_BC_BUFFER 128

/* Maximum number of frames buffered to all STAs, including multicast frames.
 * Note: increasing this limit increases the potential memory requirement. Each
 * frame can be up to about 2 kB long. */
#define TOTAL_MAX_TX_BUFFER 512

/* Required encryption head and tailroom (values are byte counts; TODO confirm
 * the unit against the users of these macros)
 */
#define IEEE80211_ENCRYPT_HEADROOM 8
#define IEEE80211_ENCRYPT_TAILROOM 18

/* power level hasn't been configured (or set to automatic) */
#define IEEE80211_UNSET_POWER_LEVEL        INT_MIN

/*
 * Some APs experience problems when working with U-APSD. Decreasing the
 * probability of that happening by using legacy mode for all ACs but VO isn't
 * enough.
 *
 * Cisco 4410N originally forced us to enable VO by default only because it
 * treated non-VO ACs as legacy.
 *
 * However some APs (notably Netgear R7000) silently reclassify packets to
 * different ACs. Since u-APSD ACs require trigger frames for frame retrieval
 * clients would never see some frames (e.g. ARP responses) or would fetch them
 * accidentally after a long time.
 *
 * It makes little sense to enable u-APSD queues by default because it needs
 * userspace applications to be aware of it to actually take advantage of the
 * possible additional powersavings. Implicitly depending on driver autotrigger
 * frame support doesn't make much sense.
 *
 * Hence the default below is 0: no u-APSD queues enabled.
 */
#define IEEE80211_DEFAULT_UAPSD_QUEUES 0

/* Default maximum service period length; expands to the "SP_ALL" QoS-info
 * constant. NOTE(review): presumably the initial value of uapsd_max_sp_len in
 * struct ieee80211_if_managed -- confirm against the code that sets it.
 */
#define IEEE80211_DEFAULT_MAX_SP_LEN                \
        IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL

/* Lookup table with one u8 entry per access category (IEEE80211_NUM_ACS
 * entries); declared here, defined elsewhere.
 */
extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS];

/* Length of a deauthentication frame: 24-byte header plus 2-byte reason. */
#define IEEE80211_DEAUTH_FRAME_LEN        (24 /* hdr */ + 2 /* reason */)

/* Upper bound used for NAN instance IDs. */
#define IEEE80211_MAX_NAN_INSTANCE_ID 255

/*
 * Layout of a packed "status data" value: the low four bits
 * (IEEE80211_STATUS_TYPE_MASK) hold one of the IEEE80211_STATUS_TYPE_* values
 * and bits 4..12 (IEEE80211_STATUS_SUBDATA_MASK) hold type-specific sub-data.
 * NOTE(review): where this value is stored is not visible in this part of the
 * file.
 */
enum ieee80211_status_data {
        IEEE80211_STATUS_TYPE_MASK        = 0x00f,
        IEEE80211_STATUS_TYPE_INVALID        = 0,
        IEEE80211_STATUS_TYPE_SMPS        = 1,
        IEEE80211_STATUS_TYPE_NEG_TTLM        = 2,
        IEEE80211_STATUS_SUBDATA_MASK        = 0x1ff0,
};

/*
 * Decide whether @sta's queues for access category @ac should remain on the
 * active list for deficit accounting purposes.
 *
 * Returns true while the current jiffies value is not later than 100ms
 * (HZ / 10 ticks) after the last_active stamp recorded for that access
 * category, i.e. the station was active or queued during the last 100ms.
 */
static inline bool
ieee80211_sta_keep_active(struct sta_info *sta, u8 ac)
{
        unsigned long keep_until = sta->airtime[ac].last_active + HZ / 10;

        return time_before_eq(jiffies, keep_until);
}

/*
 * Per-BSS bookkeeping. The corrupt_data and valid_data fields below tie the
 * contents to received beacon/probe response frames (see
 * enum ieee80211_bss_corrupt_data_flags and
 * enum ieee80211_bss_valid_data_flags).
 */
struct ieee80211_bss {
        /* NOTE(review): presumably device timestamps of the last beacon and
         * probe response, judging by the names -- confirm against the users.
         */
        u32 device_ts_beacon, device_ts_presp;

        bool wmm_used;
        bool uapsd_supported;

        /* supp_rates[] holds up to IEEE80211_MAX_SUPP_RATES entries;
         * supp_rates_len is the number in use (presumably -- confirm).
         */
#define IEEE80211_MAX_SUPP_RATES 32
        u8 supp_rates[IEEE80211_MAX_SUPP_RATES];
        size_t supp_rates_len;
        struct ieee80211_rate *beacon_rate;

        u32 vht_cap_info;

        /*
         * During association, we save an ERP value from a probe response so
         * that we can feed ERP info to the driver when handling the
         * association completes. These fields probably won't be up-to-date
         * otherwise, you probably don't want to use them.
         */
        bool has_erp_value;
        u8 erp_value;

        /* Keep track of the corruption of the last beacon/probe response
         * (IEEE80211_BSS_CORRUPT_* bits).
         */
        u8 corrupt_data;

        /* Keep track of what bits of information we have valid info for
         * (IEEE80211_BSS_VALID_* bits).
         */
        u8 valid_data;
};

/**
 * enum ieee80211_bss_corrupt_data_flags - BSS data corruption flags
 * @IEEE80211_BSS_CORRUPT_BEACON: last beacon frame received was corrupted
 * @IEEE80211_BSS_CORRUPT_PROBE_RESP: last probe response received was corrupted
 *
 * These are BSS flags that are stored as a bitmask in the
 * @corrupt_data field of &struct ieee80211_bss.
 */
enum ieee80211_bss_corrupt_data_flags {
        IEEE80211_BSS_CORRUPT_BEACON                = BIT(0),
        IEEE80211_BSS_CORRUPT_PROBE_RESP        = BIT(1)
};

/**
 * enum ieee80211_bss_valid_data_flags - BSS valid data flags
 * @IEEE80211_BSS_VALID_WMM: WMM/UAPSD data was gathered from non-corrupt IE
 * @IEEE80211_BSS_VALID_RATES: Supported rates were gathered from non-corrupt IE
 * @IEEE80211_BSS_VALID_ERP: ERP flag was gathered from non-corrupt IE
 *
 * These are BSS flags that are stored as a bitmask in the
 * @valid_data field of &struct ieee80211_bss.  They show which parts
 * of the data structure were received as a result of an un-corrupted
 * beacon/probe response.
 *
 * Note that the values start at BIT(1); BIT(0) is not assigned here.
 */
enum ieee80211_bss_valid_data_flags {
        IEEE80211_BSS_VALID_WMM                        = BIT(1),
        IEEE80211_BSS_VALID_RATES                = BIT(2),
        IEEE80211_BSS_VALID_ERP                        = BIT(3)
};

/*
 * Result type with three possible values (continue, drop, queued). It is
 * declared __bitwise and the constants use __force casts, so the values form
 * a distinct annotated type rather than plain unsigned integers.
 */
typedef unsigned __bitwise ieee80211_tx_result;
#define TX_CONTINUE        ((__force ieee80211_tx_result) 0u)
#define TX_DROP                ((__force ieee80211_tx_result) 1u)
#define TX_QUEUED        ((__force ieee80211_tx_result) 2u)

/* Flag bits; NOTE(review): presumably for the flags member of
 * struct ieee80211_tx_data -- confirm. BIT(0) is not assigned here.
 */
#define IEEE80211_TX_UNICAST                BIT(1)
#define IEEE80211_TX_PS_BUFFERED        BIT(2)

/*
 * Bundle of per-transmission context: a single skb plus an skb queue, the
 * owning local/sdata, and the station, key and rate associated with it.
 * NOTE(review): field roles are inferred from names and types only.
 */
struct ieee80211_tx_data {
        struct sk_buff *skb;
        struct sk_buff_head skbs;
        struct ieee80211_local *local;
        struct ieee80211_sub_if_data *sdata;
        struct sta_info *sta;
        struct ieee80211_key *key;
        struct ieee80211_tx_rate rate;

        /* presumably IEEE80211_TX_* bits -- confirm */
        unsigned int flags;
};

/**
 * enum ieee80211_packet_rx_flags - packet RX flags
 * @IEEE80211_RX_AMSDU: a-MSDU packet
 * @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed
 * @IEEE80211_RX_DEFERRED_RELEASE: frame was subjected to receive reordering
 *
 * These are per-frame flags that are attached to a frame in the
 * @rx_flags field of &struct ieee80211_rx_status.
 *
 * Note that the values start at BIT(3); bits 0-2 are not assigned here.
 */
enum ieee80211_packet_rx_flags {
        IEEE80211_RX_AMSDU                        = BIT(3),
        IEEE80211_RX_MALFORMED_ACTION_FRM        = BIT(4),
        IEEE80211_RX_DEFERRED_RELEASE                = BIT(5),
};

/**
 * enum ieee80211_rx_flags - RX data flags
 *
 * @IEEE80211_RX_CMNTR: received on cooked monitor already
 * @IEEE80211_RX_BEACON_REPORTED: This frame was already reported
 *        to cfg80211_report_obss_beacon().
 *
 * These flags are used across handling multiple interfaces
 * for a single frame. They are distinct from the per-frame
 * &enum ieee80211_packet_rx_flags.
 */
enum ieee80211_rx_flags {
        IEEE80211_RX_CMNTR                = BIT(0),
        IEEE80211_RX_BEACON_REPORTED        = BIT(1),
};

/*
 * Bundle of per-reception context: the skb being handled, the owning
 * local/sdata/link, the station and link-station, the key, and index values
 * used to select sequence-number and IV/PN slots.
 * NOTE(review): pointer field roles are inferred from names and types only.
 */
struct ieee80211_rx_data {
        struct list_head *list;
        struct sk_buff *skb;
        struct ieee80211_local *local;
        struct ieee80211_sub_if_data *sdata;
        struct ieee80211_link_data *link;
        struct sta_info *sta;
        struct link_sta_info *link_sta;
        struct ieee80211_key *key;

        /* presumably enum ieee80211_rx_flags bits -- confirm */
        unsigned int flags;

        /*
         * Index into sequence numbers array, 0..16
         * since the last (16) is used for non-QoS,
         * will be 16 on non-QoS frames.
         */
        int seqno_idx;

        /*
         * Index into the security IV/PN arrays, 0..16
         * since the last (16) is used for CCMP-encrypted
         * management frames, will be set to 16 on mgmt
         * frames and 0 on non-QoS frames.
         */
        int security_idx;

        int link_id;

        /* Per-cipher state; the two members share storage (anonymous union),
         * so only the one matching the cipher in use is meaningful.
         */
        union {
                struct {
                        u32 iv32;
                        u16 iv16;
                } tkip;
                struct {
                        u8 pn[IEEE80211_CCMP_PN_LEN];
                } ccm_gcm;
        };
};

/*
 * Counter-offset arrays for the beacon and the probe response, each with its
 * own element count, plus a count value.
 * NOTE(review): "csa" presumably stands for channel switch announcement and
 * the offsets presumably locate countdown fields in the templates -- confirm.
 */
struct ieee80211_csa_settings {
        const u16 *counter_offsets_beacon;
        const u16 *counter_offsets_presp;

        /* number of entries in the corresponding arrays above */
        int n_counter_offsets_beacon;
        int n_counter_offsets_presp;

        u8 count;
};

/*
 * Color-change counterpart of struct ieee80211_csa_settings, but with a
 * single counter offset each for the beacon and the probe response instead
 * of arrays.
 */
struct ieee80211_color_change_settings {
        u16 counter_offset_beacon;
        u16 counter_offset_presp;
        u8 count;
};

/*
 * Beacon template data: head and tail buffers with their lengths, countdown
 * counter offsets and current counter, and optional MBSSID/RNR element sets.
 * Contains an rcu_head, so instances are presumably freed via RCU (it is
 * referenced through an __rcu pointer in struct ieee80211_if_ibss).
 */
struct beacon_data {
        u8 *head, *tail;
        int head_len, tail_len;
        struct ieee80211_meshconf_ie *meshconf;
        u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM];
        u8 cntdwn_current_counter;
        struct cfg80211_mbssid_elems *mbssid_ies;
        struct cfg80211_rnr_elems *rnr_ies;
        struct rcu_head rcu_head;
};

/*
 * Probe response template: countdown counter offsets followed by the frame
 * bytes in the flexible array member data[] (presumably len bytes long --
 * confirm against the allocation site).
 */
struct probe_resp {
        struct rcu_head rcu_head;
        int len;
        u16 cntdwn_counter_offsets[IEEE80211_MAX_CNTDWN_COUNTERS_NUM];
        u8 data[];
};

/*
 * FILS discovery template: an rcu_head, a length, and the bytes in the
 * flexible array member data[] (presumably len bytes long -- confirm).
 */
struct fils_discovery_data {
        struct rcu_head rcu_head;
        int len;
        u8 data[];
};

/*
 * Unsolicited broadcast probe response template; same layout as
 * struct fils_discovery_data: rcu_head, length, then the bytes in the
 * flexible array member data[] (presumably len bytes long -- confirm).
 */
struct unsol_bcast_probe_resp_data {
        struct rcu_head rcu_head;
        int len;
        u8 data[];
};

/*
 * Power-save state: the TIM bitmap, the buffered broadcast/multicast frame
 * queue, the count of stations in PS mode and DTIM bookkeeping.
 */
struct ps_data {
        /* yes, this looks ugly, but guarantees that we can later use
         * bitmap_empty :)
         * (the array is sized in whole unsigned longs covering
         * IEEE80211_MAX_AID + 1 bits and aligned like unsigned long)
         * NB: don't touch this bitmap, use sta_info_{set,clear}_tim_bit */
        u8 tim[sizeof(unsigned long) * BITS_TO_LONGS(IEEE80211_MAX_AID + 1)]
                        __aligned(__alignof__(unsigned long));
        struct sk_buff_head bc_buf;
        atomic_t num_sta_ps; /* number of stations in PS mode */
        int dtim_count;
        bool dtim_bc_mc;
};

/*
 * AP interface state: the list of VLANs, power-save data, and multicast
 * bookkeeping.
 */
struct ieee80211_if_ap {
        struct list_head vlans; /* write-protected with RTNL and local->mtx */

        struct ps_data ps;
        atomic_t num_mcast_sta; /* number of stations receiving multicast */

        bool multicast_to_unicast;
        bool active;
};

/*
 * VLAN interface state. The list member is presumably the link into
 * struct ieee80211_if_ap.vlans -- confirm.
 */
struct ieee80211_if_vlan {
        struct list_head list; /* write-protected with RTNL and local->mtx */

        /* used for all tx if the VLAN is configured to 4-addr mode */
        struct sta_info __rcu *sta;
        atomic_t num_mcast_sta; /* number of stations receiving multicast */
};

/* Mesh forwarding and drop counters. */
struct mesh_stats {
        __u32 fwded_mcast;                /* Mesh forwarded multicast frames */
        __u32 fwded_unicast;                /* Mesh forwarded unicast frames */
        __u32 fwded_frames;                /* Mesh total forwarded frames */
        __u32 dropped_frames_ttl;        /* Not transmitted since mesh_ttl == 0 */
        __u32 dropped_frames_no_route;        /* Not transmitted, no route found */
};

/* Flag bits; NOTE(review): presumably for mesh_preq_queue.flags -- confirm. */
#define PREQ_Q_F_START                0x1
#define PREQ_Q_F_REFRESH        0x2
/* List node carrying a destination MAC address and a flags byte. */
struct mesh_preq_queue {
        struct list_head list;
        u8 dst[ETH_ALEN];
        u8 flags;
};

/*
 * One queued item of "roc" work for an interface on a given channel, with
 * progress flags, timing, an optional frame and cookies.
 * NOTE(review): "roc" presumably means remain-on-channel; field roles are
 * inferred from names only -- confirm against the users.
 */
struct ieee80211_roc_work {
        struct list_head list;

        struct ieee80211_sub_if_data *sdata;

        struct ieee80211_channel *chan;

        /* progress/state flags of this item */
        bool started, abort, hw_begun, notified;
        bool on_channel;

        unsigned long start_time;

        u32 duration, req_duration;
        struct sk_buff *frame;
        u64 cookie, mgmt_tx_cookie;
        enum ieee80211_roc_type type;
};

/* flags used in struct ieee80211_if_managed.flags
 *
 * The bit positions are sparse (1, 2, 6, 7, 8 and 15); the remaining bits
 * are not assigned here.
 */
enum ieee80211_sta_flags {
        IEEE80211_STA_CONNECTION_POLL        = BIT(1),
        IEEE80211_STA_CONTROL_PORT        = BIT(2),
        IEEE80211_STA_MFP_ENABLED        = BIT(6),
        IEEE80211_STA_UAPSD_ENABLED        = BIT(7),
        IEEE80211_STA_NULLFUNC_ACKED        = BIT(8),
        IEEE80211_STA_ENABLE_RRM        = BIT(15),
};

/*
 * Connection modes, numbered implicitly from 0 in declaration order.
 * IEEE80211_CONN_MODE_HIGHEST aliases the last enumerator (EHT); when adding
 * a mode after it, the alias presumably needs updating as well.
 */
enum ieee80211_conn_mode {
        IEEE80211_CONN_MODE_S1G,
        IEEE80211_CONN_MODE_LEGACY,
        IEEE80211_CONN_MODE_HT,
        IEEE80211_CONN_MODE_VHT,
        IEEE80211_CONN_MODE_HE,
        IEEE80211_CONN_MODE_EHT,
};

#define IEEE80211_CONN_MODE_HIGHEST        IEEE80211_CONN_MODE_EHT

/*
 * Connection bandwidth limits, numbered implicitly from 0 in increasing
 * order (the numeric suffixes are presumably MHz).
 */
enum ieee80211_conn_bw_limit {
        IEEE80211_CONN_BW_LIMIT_20,
        IEEE80211_CONN_BW_LIMIT_40,
        IEEE80211_CONN_BW_LIMIT_80,
        IEEE80211_CONN_BW_LIMIT_160, /* also 80+80 */
        IEEE80211_CONN_BW_LIMIT_320,
};

/* A connection mode paired with a bandwidth limit. */
struct ieee80211_conn_settings {
        enum ieee80211_conn_mode mode;
        enum ieee80211_conn_bw_limit bw_limit;
};

/* Constant settings instance, defined elsewhere; by its name it presumably
 * holds the highest mode and widest bandwidth -- confirm at the definition.
 */
extern const struct ieee80211_conn_settings ieee80211_conn_settings_unlimited;

/*
 * State of an authentication attempt: target BSS, timeout and try counter,
 * algorithm and expected transaction number, key material, progress flags,
 * AP address, SAE fields, and trailing extra data.
 * NOTE(review): field roles are inferred from names only -- confirm.
 */
struct ieee80211_mgd_auth_data {
        struct cfg80211_bss *bss;
        unsigned long timeout;
        int tries;
        u16 algorithm, expected_transaction;

        /* key buffer sized for WEP104 (WLAN_KEY_LEN_WEP104 bytes) */
        u8 key[WLAN_KEY_LEN_WEP104];
        u8 key_len, key_idx;
        bool done, waiting;
        bool peer_confirmed;
        bool timeout_started;
        int link_id;

        u8 ap_addr[ETH_ALEN] __aligned(2);

        u16 sae_trans, sae_status;
        /* data[] is a flexible array member; data_len is presumably its
         * length in bytes -- confirm against the allocation site.
         */
        size_t data_len;
        u8 data[];
};

/*
 * State of an association attempt: a per-link array (one slot per possible
 * MLD link, IEEE80211_MLD_MAX_NUM_LINKS entries) followed by data common to
 * the whole attempt and a trailing element buffer ie[].
 * NOTE(review): field roles not covered by an existing comment are inferred
 * from names only -- confirm.
 */
struct ieee80211_mgd_assoc_data {
        struct {
                struct cfg80211_bss *bss;

                u8 addr[ETH_ALEN] __aligned(2);

                u8 ap_ht_param;

                struct ieee80211_vht_cap ap_vht_cap;

                size_t elems_len;
                u8 *elems; /* pointing to inside ie[] below */

                struct ieee80211_conn_settings conn;

                u16 status;

                bool disabled;
        } link[IEEE80211_MLD_MAX_NUM_LINKS];

        u8 ap_addr[ETH_ALEN] __aligned(2);

        /* this is for a workaround, so we use it only for non-MLO */
        const u8 *supp_rates;
        u8 supp_rates_len;

        unsigned long timeout;
        int tries;

        u8 prev_ap_addr[ETH_ALEN];
        u8 ssid[IEEE80211_MAX_SSID_LEN];
        u8 ssid_len;
        bool wmm, uapsd;
        bool need_beacon;
        bool synced;
        bool timeout_started;
        bool comeback; /* whether the AP has requested association comeback */
        bool s1g;
        bool spp_amsdu;

        unsigned int assoc_link_id;

        /* room for two nonces of FILS_NONCE_LEN bytes each */
        u8 fils_nonces[2 * FILS_NONCE_LEN];
        u8 fils_kek[FILS_MAX_KEK_LEN];
        size_t fils_kek_len;

        size_t ie_len;
        u8 *ie_pos; /* used to fill ie[] with link[].elems */
        u8 ie[];
};

/*
 * Per-TSPEC transmit accounting: admitted time versus the TX time consumed
 * in the current time slice, plus the pending action and downgrade state.
 */
struct ieee80211_sta_tx_tspec {
        /* timestamp of the first packet in the time slice */
        unsigned long time_slice_start;

        u32 admitted_time; /* in usecs, unlike over the air */
        u8 tsid;
        s8 up; /* signed to be able to invalidate with -1 during teardown */

        /* consumed TX time in microseconds in the time slice */
        u32 consumed_tx_time;
        /* pending action; TX_TSPEC_ACTION_NONE is 0 */
        enum {
                TX_TSPEC_ACTION_NONE = 0,
                TX_TSPEC_ACTION_DOWNGRADE,
                TX_TSPEC_ACTION_STOP_DOWNGRADE,
        } action;
        bool downgraded;
};

/* Advertised TID-to-link mapping (T2L map / TTLM) info; times are in TUs */
struct ieee80211_adv_ttlm_info {
        /* time in TUs at which the new mapping is established, or 0 if there is
         * no planned advertised TID-to-link mapping
         */
        u16 switch_time;
        u32 duration; /* duration of the planned T2L map in TUs */
        u16 map; /* map of usable links for all TIDs */
        bool active; /* whether the advertised mapping is active or not */
};

/* Instantiate the EWMA helpers from <linux/average.h> under the name
 * beacon_signal. NOTE(review): per that header the two arguments are the
 * precision bits and the reciprocal of the weight (both 4 here) -- confirm.
 */
DECLARE_EWMA(beacon_signal, 4, 4)

/*
 * State of a managed-mode interface: timers and work items, connection
 * monitoring, authentication/association attempts in progress, power save,
 * MFP, U-APSD, capability overrides, TDLS, WMM-AC TSPEC, and multi-link
 * (reconfiguration and TID-to-link mapping) state.
 * NOTE(review): fields without an existing comment are described by name
 * only; confirm their roles against the code that uses them.
 */
struct ieee80211_if_managed {
        struct timer_list timer;
        struct timer_list conn_mon_timer;
        struct timer_list bcn_mon_timer;
        struct wiphy_work monitor_work;
        struct wiphy_work beacon_connection_loss_work;
        struct wiphy_work csa_connection_drop_work;

        unsigned long beacon_timeout;
        unsigned long probe_timeout;
        int probe_send_count;
        bool nullfunc_failed;
        /* four one-bit flags packed into a single u8 */
        u8 connection_loss:1,
           driver_disconnect:1,
           reconnect:1,
           associated:1;

        /* authentication/association attempt state; see the struct
         * definitions for details
         */
        struct ieee80211_mgd_auth_data *auth_data;
        struct ieee80211_mgd_assoc_data *assoc_data;

        bool powersave; /* powersave requested for this iface */
        bool broken_ap; /* AP is broken -- turn off powersave */

        /* enum ieee80211_sta_flags bits */
        unsigned int flags;

        u16 mcast_seq_last;

        bool status_acked;
        bool status_received;
        __le16 status_fc;

        enum {
                IEEE80211_MFP_DISABLED,
                IEEE80211_MFP_OPTIONAL,
                IEEE80211_MFP_REQUIRED
        } mfp; /* management frame protection */

        /*
         * Bitmask of enabled u-apsd queues,
         * IEEE80211_WMM_IE_STA_QOSINFO_AC_BE & co. Needs a new association
         * to take effect.
         */
        unsigned int uapsd_queues;

        /*
         * Maximum number of buffered frames AP can deliver during a
         * service period, IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL or similar.
         * Needs a new association to take effect.
         */
        unsigned int uapsd_max_sp_len;

        u8 use_4addr;

        /*
         * State variables for keeping track of RSSI of the AP currently
         * connected to and informing driver when RSSI has gone
         * below/above a certain threshold.
         */
        int rssi_min_thold, rssi_max_thold;

        struct ieee80211_ht_cap ht_capa; /* configured ht-cap over-rides */
        struct ieee80211_ht_cap ht_capa_mask; /* Valid parts of ht_capa */
        struct ieee80211_vht_cap vht_capa; /* configured VHT overrides */
        struct ieee80211_vht_cap vht_capa_mask; /* Valid parts of vht_capa */
        struct ieee80211_s1g_cap s1g_capa; /* configured S1G overrides */
        struct ieee80211_s1g_cap s1g_capa_mask; /* valid s1g_capa bits */

        /* TDLS support */
        u8 tdls_peer[ETH_ALEN] __aligned(2);
        struct wiphy_delayed_work tdls_peer_del_work;
        struct sk_buff *orig_teardown_skb; /* The original teardown skb */
        struct sk_buff *teardown_skb; /* A copy to send through the AP */
        spinlock_t teardown_lock; /* To lock changing teardown_skb */
        bool tdls_wider_bw_prohibited;

        /* WMM-AC TSPEC support (one entry per access category) */
        struct ieee80211_sta_tx_tspec tx_tspec[IEEE80211_NUM_ACS];
        /* Use a separate work struct so that we can do something here
         * while the sdata->work is flushing the queues, for example.
         * otherwise, in scenarios where we hardly get any traffic out
         * on the BE queue, but there's a lot of VO traffic, we might
         * get stuck in a downgraded situation and flush takes forever.
         */
        struct wiphy_delayed_work tx_tspec_wk;

        /* Information elements from the last transmitted (Re)Association
         * Request frame.
         */
        u8 *assoc_req_ies;
        size_t assoc_req_ies_len;

        struct wiphy_delayed_work ml_reconf_work;
        u16 removed_links;

        /* TID-to-link mapping support */
        struct wiphy_delayed_work ttlm_work;
        struct ieee80211_adv_ttlm_info ttlm_info;
        struct wiphy_work teardown_ttlm_work;

        /* dialog token enumerator for neg TTLM request */
        u8 dialog_token_alloc;
        struct wiphy_delayed_work neg_ttlm_timeout_work;
};

/*
 * State of an IBSS-mode interface: timer and work item, configuration flags,
 * BSSID/SSID/extra IEs, channel definition, the RCU-managed probe
 * response/beacon template, HT capability overrides, the list of incomplete
 * stations, and the MLME state.
 * NOTE(review): fields without an existing comment are described by name
 * only -- confirm against the users.
 */
struct ieee80211_if_ibss {
        struct timer_list timer;
        struct wiphy_work csa_connection_drop_work;

        unsigned long last_scan_completed;

        u32 basic_rates;

        bool fixed_bssid;
        bool fixed_channel;
        bool privacy;

        bool control_port;
        bool userspace_handles_dfs;

        u8 bssid[ETH_ALEN] __aligned(2);
        u8 ssid[IEEE80211_MAX_SSID_LEN];
        u8 ssid_len, ie_len;
        u8 *ie;
        struct cfg80211_chan_def chandef;

        unsigned long ibss_join_req;
        /* probe response/beacon for IBSS */
        struct beacon_data __rcu *presp;

        struct ieee80211_ht_cap ht_capa; /* configured ht-cap over-rides */
        struct ieee80211_ht_cap ht_capa_mask; /* Valid parts of ht_capa */

        /* presumably incomplete_lock protects incomplete_stations, as
         * documented for the same pair in struct ieee80211_if_ocb -- confirm
         */
        spinlock_t incomplete_lock;
        struct list_head incomplete_stations;

        enum {
                IEEE80211_IBSS_MLME_SEARCH,
                IEEE80211_IBSS_MLME_JOINED,
        } state;
};

/**
 * struct ieee80211_if_ocb - OCB mode state
 *
 * @housekeeping_timer: timer for periodic invocation of a housekeeping task
 * @wrkq_flags: OCB deferred task action (stored as flag bits in an
 *        unsigned long)
 * @incomplete_lock: delayed STA insertion lock
 * @incomplete_stations: list of STAs waiting for delayed insertion
 * @joined: indication if the interface is connected to an OCB network
 */
struct ieee80211_if_ocb {
        struct timer_list housekeeping_timer;
        unsigned long wrkq_flags;

        spinlock_t incomplete_lock;
        struct list_head incomplete_stations;

        bool joined;
};

/**
 * struct ieee80211_mesh_sync_ops - Extensible synchronization framework interface
 *
 * These declarations define the interface which enables
 * vendor-specific mesh synchronization.
 *
 * @rx_bcn_presp: beacon/probe response was received; called with the
 *        receiving interface, the frame subtype, the management frame and
 *        its length, the mesh configuration element and the RX status
 * @adjust_tsf: TSF adjustment method; called with the interface and its
 *        beacon data
 */
struct ieee80211_mesh_sync_ops {
        void (*rx_bcn_presp)(struct ieee80211_sub_if_data *sdata, u16 stype,
                             struct ieee80211_mgmt *mgmt, unsigned int len,
                             const struct ieee80211_meshconf_ie *mesh_cfg,
                             struct ieee80211_rx_status *rx_status);

        /* should be called with beacon_data under RCU read lock */
        void (*adjust_tsf)(struct ieee80211_sub_if_data *sdata,
                           struct beacon_data *beacon);
        /* add other framework functions here */
};

/*
 * cfg80211 CSA settings bundled with an rcu_head. struct ieee80211_if_mesh
 * refers to an instance through its __rcu 'csa' pointer.
 * NOTE(review): the rcu_head is presumably used for deferred freeing - confirm.
 */
struct mesh_csa_settings {
        struct rcu_head rcu_head;
        struct cfg80211_csa_settings settings;
};

/**
 * struct mesh_table - mesh hash table
 *
 * struct ieee80211_if_mesh embeds two of these: mesh_paths and mpp_paths.
 *
 * @known_gates: list of known mesh gates and their mpaths by the station.
 *        The gate's mpath may or may not be resolved and active.
 * @gates_lock: protects updates to known_gates
 * @rhead: the rhashtable containing struct mesh_paths, keyed by dest addr
 * @walk_head: linked list containing all mesh_path objects
 * @walk_lock: lock protecting walk_head
 * @entries: number of entries in the table (up to MAX_MESH_NEIGHBOURS)
 */
struct mesh_table {
        struct hlist_head known_gates;
        spinlock_t gates_lock;
        struct rhashtable rhead;
        struct hlist_head walk_head;
        spinlock_t walk_lock;
        atomic_t entries;                /* Up to MAX_MESH_NEIGHBOURS */
};

/**
 * struct mesh_tx_cache - mesh fast xmit header cache
 *
 * Embedded in struct ieee80211_if_mesh as the tx_cache member.
 *
 * @rht: hash table containing struct ieee80211_mesh_fast_tx, using skb DA as key
 * @walk_head: linked list containing all ieee80211_mesh_fast_tx objects
 * @walk_lock: lock protecting both @walk_head and @rht
 */
struct mesh_tx_cache {
        struct rhashtable rht;
        struct hlist_head walk_head;
        spinlock_t walk_lock;
};

/*
 * Mesh state of a virtual interface; this is the 'mesh' member of the
 * per-interface-type union in struct ieee80211_sub_if_data.
 */
struct ieee80211_if_mesh {
        struct timer_list housekeeping_timer;
        struct timer_list mesh_path_timer;
        struct timer_list mesh_path_root_timer;

        unsigned long wrkq_flags;
        /* 64 bits worth of flags, stored as an array of longs */
        unsigned long mbss_changed[64 / BITS_PER_LONG];

        bool userspace_handles_dfs;

        u8 mesh_id[IEEE80211_MAX_MESH_ID_LEN];
        size_t mesh_id_len;
        /* Active Path Selection Protocol Identifier */
        u8 mesh_pp_id;
        /* Active Path Selection Metric Identifier */
        u8 mesh_pm_id;
        /* Congestion Control Mode Identifier */
        u8 mesh_cc_id;
        /* Synchronization Protocol Identifier */
        u8 mesh_sp_id;
        /* Authentication Protocol Identifier */
        u8 mesh_auth_id;
        /* Local mesh Sequence Number */
        u32 sn;
        /* Last used PREQ ID */
        u32 preq_id;
        atomic_t mpaths;
        /* Timestamp of last SN update */
        unsigned long last_sn_update;
        /* Time when it's ok to send next PERR */
        unsigned long next_perr;
        /* Timestamp of last PREQ sent */
        unsigned long last_preq;
        struct mesh_rmc *rmc;
        spinlock_t mesh_preq_queue_lock;
        struct mesh_preq_queue preq_queue;
        int preq_queue_len;
        struct mesh_stats mshstats;
        struct mesh_config mshcfg;
        atomic_t estab_plinks;
        atomic_t mesh_seqnum;
        bool accepting_plinks;
        int num_gates;
        struct beacon_data __rcu *beacon;
        const u8 *ie;
        u8 ie_len;
        /* security state; the values are distinct bits (0x1, 0x2) */
        enum {
                IEEE80211_MESH_SEC_NONE = 0x0,
                IEEE80211_MESH_SEC_AUTHED = 0x1,
                IEEE80211_MESH_SEC_SECURED = 0x2,
        } security;
        bool user_mpm;
        /* Extensible Synchronization Framework */
        const struct ieee80211_mesh_sync_ops *sync_ops;
        s64 sync_offset_clockdrift_max;
        spinlock_t sync_offset_lock;
        /* mesh power save */
        enum nl80211_mesh_power_mode nonpeer_pm;
        int ps_peers_light_sleep;
        int ps_peers_deep_sleep;
        struct ps_data ps;
        /* Channel Switching Support */
        struct mesh_csa_settings __rcu *csa;
        enum {
                IEEE80211_MESH_CSA_ROLE_NONE,
                IEEE80211_MESH_CSA_ROLE_INIT,
                IEEE80211_MESH_CSA_ROLE_REPEATER,
        } csa_role;
        u8 chsw_ttl;
        u16 pre_value;

        /* offset from skb->data while building IE */
        int meshconf_offset;

        struct mesh_table mesh_paths;
        struct mesh_table mpp_paths; /* Store paths for MPP&MAP */
        int mesh_paths_generation;
        int mpp_paths_generation;
        struct mesh_tx_cache tx_cache;
};

/*
 * Increment the counter @name inside (msh)->mshstats.
 * Without CONFIG_MAC80211_MESH this expands to an empty statement and
 * neither argument is evaluated.
 */
#ifndef CONFIG_MAC80211_MESH
#define IEEE80211_IFSTA_MESH_CTR_INC(msh, name)        \
        do { } while (0)
#else
#define IEEE80211_IFSTA_MESH_CTR_INC(msh, name)        \
        do {                                        \
                (msh)->mshstats.name++;                \
        } while (0)
#endif

/**
 * enum ieee80211_sub_if_data_flags - virtual interface flags
 *
 * Each flag is a single-bit mask. Bits 1 and 2 are not assigned to any
 * flag in this enum.
 *
 * @IEEE80211_SDATA_ALLMULTI: interface wants all multicast packets
 * @IEEE80211_SDATA_DONT_BRIDGE_PACKETS: bridge packets between
 *        associated stations and deliver multicast frames both
 *        back to wireless media and to the local net stack.
 * @IEEE80211_SDATA_DISCONNECT_RESUME: Disconnect after resume.
 * @IEEE80211_SDATA_IN_DRIVER: indicates interface was added to driver
 * @IEEE80211_SDATA_DISCONNECT_HW_RESTART: Disconnect after hardware restart
 *        recovery
 */
enum ieee80211_sub_if_data_flags {
        IEEE80211_SDATA_ALLMULTI                = BIT(0),
        IEEE80211_SDATA_DONT_BRIDGE_PACKETS        = BIT(3),
        IEEE80211_SDATA_DISCONNECT_RESUME        = BIT(4),
        IEEE80211_SDATA_IN_DRIVER                = BIT(5),
        IEEE80211_SDATA_DISCONNECT_HW_RESTART        = BIT(6),
};

/**
 * enum ieee80211_sdata_state_bits - virtual interface state bits
 *
 * @SDATA_STATE_RUNNING: the virtual interface is up & running. Mirrors
 *        netif_running(), but is kept separately so that interface type
 *        changes can be handled while the interface is up.
 * @SDATA_STATE_OFFCHANNEL: the interface is currently in offchannel
 *        mode, so its queues are stopped.
 * @SDATA_STATE_OFFCHANNEL_BEACON_STOPPED: beaconing was stopped because of
 *        offchannel; reset when offchannel returns.
 */
enum ieee80211_sdata_state_bits {
        SDATA_STATE_RUNNING                        = 0,
        SDATA_STATE_OFFCHANNEL                        = 1,
        SDATA_STATE_OFFCHANNEL_BEACON_STOPPED        = 2,
};

/**
 * enum ieee80211_chanctx_mode - channel context configuration mode
 *
 * @IEEE80211_CHANCTX_SHARED: the channel context may be used by
 *        multiple interfaces.
 * @IEEE80211_CHANCTX_EXCLUSIVE: the channel context can be used by a
 *        single interface only, e.g. for non-fixed channel IBSS.
 */
enum ieee80211_chanctx_mode {
        IEEE80211_CHANCTX_SHARED        = 0,
        IEEE80211_CHANCTX_EXCLUSIVE        = 1,
};

/**
 * enum ieee80211_chanctx_replace_state - channel context replacement state
 *
 * Used for in-place channel context reservations that require a channel
 * context switch/swap.
 *
 * @IEEE80211_CHANCTX_REPLACE_NONE: no replacement is taking place
 * @IEEE80211_CHANCTX_WILL_BE_REPLACED: this channel context is going to be
 *        replaced by a (not yet registered) channel context pointed to by
 *        %replace_ctx.
 * @IEEE80211_CHANCTX_REPLACES_OTHER: this (not yet registered) channel
 *        context replaces an existing channel context pointed to by
 *        %replace_ctx.
 */
enum ieee80211_chanctx_replace_state {
        IEEE80211_CHANCTX_REPLACE_NONE                = 0,
        IEEE80211_CHANCTX_WILL_BE_REPLACED        = 1,
        IEEE80211_CHANCTX_REPLACES_OTHER        = 2,
};

/*
 * Channel context: wraps a struct ieee80211_chanctx_conf (conf) together
 * with list linkage, the links assigned/reserved to it and its
 * replacement state.
 */
struct ieee80211_chanctx {
        struct list_head list;
        struct rcu_head rcu_head;

        struct list_head assigned_links;
        struct list_head reserved_links;

        /*
         * replace_ctx is the other context involved in a replacement; which
         * side this context is on is given by replace_state (see
         * enum ieee80211_chanctx_replace_state).
         */
        enum ieee80211_chanctx_replace_state replace_state;
        struct ieee80211_chanctx *replace_ctx;

        enum ieee80211_chanctx_mode mode;
        bool driver_present;

        /* temporary data for search algorithm etc. */
        struct ieee80211_chan_req req;

        struct ieee80211_chanctx_conf conf;
};

/*
 * cfg80211 QoS map bundled with an rcu_head. struct ieee80211_sub_if_data
 * refers to an instance through its __rcu 'qos_map' pointer.
 * NOTE(review): the rcu_head is presumably used for deferred freeing - confirm.
 */
struct mac80211_qos_map {
        struct cfg80211_qos_map qos_map;
        struct rcu_head rcu_head;
};

/*
 * Flags kept in the 'flags' member of struct txq_info.
 * NOTE(review): the values start at 0, so these are presumably bit numbers
 * rather than masks - confirm against the users.
 */
enum txq_info_flags {
        IEEE80211_TXQ_STOP        = 0,
        IEEE80211_TXQ_AMPDU        = 1,
        IEEE80211_TXQ_NO_AMSDU        = 2,
        IEEE80211_TXQ_DIRTY        = 3,
};

/**
 * struct txq_info - per tid queue
 *
 * @tin: contains packets split into multiple flows
 * @def_cvars: codel vars for the @tin's default_flow
 * @cstats: codel statistics for this queue
 * @frags: used to keep fragments created after dequeue
 * @schedule_order: used with ieee80211_local->active_txqs
 * @schedule_round: counter to prevent infinite loops on TXQ scheduling
 * @flags: TXQ flags from &enum txq_info_flags
 * @txq: the driver visible part; must remain the last member
 */
struct txq_info {
        struct fq_tin tin;
        struct codel_vars def_cvars;
        struct codel_stats cstats;

        u16 schedule_round;
        struct list_head schedule_order;

        struct sk_buff_head frags;

        unsigned long flags;

        /* keep last! */
        struct ieee80211_txq txq;
};

/*
 * Monitor state of a virtual interface; this is the 'mntr' member of the
 * per-interface-type union in struct ieee80211_sub_if_data.
 */
struct ieee80211_if_mntr {
        u32 flags;
        /* NOTE(review): presumably the MU-MIMO follow address - confirm */
        u8 mu_follow_addr[ETH_ALEN] __aligned(2);

        struct list_head list;
};

/**
 * struct ieee80211_if_nan - NAN state
 *
 * @conf: current NAN configuration
 * @func_lock: lock for @function_inst_ids
 * @function_inst_ids: IDR of function instance IDs
 */
struct ieee80211_if_nan {
        struct cfg80211_nan_conf conf;

        /* protects function_inst_ids */
        spinlock_t func_lock;
        struct idr function_inst_ids;
};

/*
 * Per-link state for the managed case; this is the 'mgd' member of the
 * union in struct ieee80211_link_data.
 */
struct ieee80211_link_data_managed {
        u8 bssid[ETH_ALEN] __aligned(2);

        u8 dtim_period;
        enum ieee80211_smps_mode req_smps, /* requested smps mode */
                                 driver_smps_mode; /* smps mode request */

        struct ieee80211_conn_settings conn;

        s16 p2p_noa_index;

        bool tdls_chan_switch_prohibited;

        bool have_beacon;
        bool tracking_signal_avg;
        bool disable_wmm_tracking;
        bool operating_11g_mode;

        /* channel switch (CSA) state for this link */
        struct {
                struct wiphy_delayed_work switch_work;
                struct cfg80211_chan_def ap_chandef;
                struct ieee80211_parsed_tpe tpe;
                unsigned long time;
                bool waiting_bcn;
                bool ignored_same_chan;
                bool blocked_tx;
        } csa;

        struct wiphy_work request_smps_work;
        /* used to reconfigure hardware SM PS */
        struct wiphy_work recalc_smps;

        bool beacon_crc_valid;
        u32 beacon_crc;
        struct ewma_beacon_signal ave_beacon_signal;
        int last_ave_beacon_signal;

        /*
         * Number of Beacon frames used in ave_beacon_signal. This can be used
         * to avoid generating less reliable cqm events that would be based
         * only on couple of received frames.
         */
        unsigned int count_beacon_signal;

        /* Number of times beacon loss was invoked. */
        unsigned int beacon_loss_count;

        /*
         * Last Beacon frame signal strength average (ave_beacon_signal / 16)
         * that triggered a cqm event. 0 indicates that no event has been
         * generated for the current association.
         */
        int last_cqm_event_signal;

        int wmm_last_param_set;
        int mu_edca_last_param_set;

        u8 bss_param_ch_cnt;
};

/*
 * Per-link state for the AP case; this is the 'ap' member of the union in
 * struct ieee80211_link_data. All frame templates except next_beacon are
 * __rcu pointers.
 */
struct ieee80211_link_data_ap {
        struct beacon_data __rcu *beacon;
        struct probe_resp __rcu *probe_resp;
        struct fils_discovery_data __rcu *fils_discovery;
        struct unsol_bcast_probe_resp_data __rcu *unsol_bcast_probe_resp;

        /* to be used after channel switch. */
        struct cfg80211_beacon_data *next_beacon;
};

/*
 * Per-link state of a virtual interface. struct ieee80211_sub_if_data
 * embeds one instance (deflink) and holds an array of __rcu pointers to
 * further instances (link[]).
 */
struct ieee80211_link_data {
        /* interface this link belongs to, and the link's ID */
        struct ieee80211_sub_if_data *sdata;
        unsigned int link_id;

        struct list_head assigned_chanctx_list; /* protected by wiphy mutex */
        struct list_head reserved_chanctx_list; /* protected by wiphy mutex */

        /* multicast keys only */
        struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS +
                                        NUM_DEFAULT_MGMT_KEYS +
                                        NUM_DEFAULT_BEACON_KEYS];
        struct ieee80211_key __rcu *default_multicast_key;
        struct ieee80211_key __rcu *default_mgmt_key;
        struct ieee80211_key __rcu *default_beacon_key;


        bool operating_11g_mode;

        /* channel switch (CSA) finalization work and requested channel */
        struct {
                struct wiphy_work finalize_work;
                struct ieee80211_chan_req chanreq;
        } csa;

        struct wiphy_work color_change_finalize_work;
        struct delayed_work color_collision_detect_work;
        u64 color_bitmap;

        /* context reservation -- protected with wiphy mutex */
        struct ieee80211_chanctx *reserved_chanctx;
        struct ieee80211_chan_req reserved;
        bool reserved_radar_required;
        bool reserved_ready;

        u8 needed_rx_chains;
        enum ieee80211_smps_mode smps_mode;

        int user_power_level; /* in dBm */
        int ap_power_level; /* in dBm */

        bool radar_required;

        /* interface-type specific per-link state */
        union {
                struct ieee80211_link_data_managed mgd;
                struct ieee80211_link_data_ap ap;
        } u;

        struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS];

        struct ieee80211_bss_conf *conf;

#ifdef CONFIG_MAC80211_DEBUGFS
        struct dentry *debugfs_dir;
#endif
};

/*
 * State of one virtual interface. Interface-type specific data lives in
 * the union 'u', per-link data in 'deflink' and 'link[]'. The struct
 * ieee80211_vif is embedded as the last member and is mapped back to this
 * structure by vif_to_sdata().
 */
struct ieee80211_sub_if_data {
        struct list_head list;

        struct wireless_dev wdev;

        /* keys */
        struct list_head key_list;

        /* count for keys needing tailroom space allocation */
        int crypto_tx_tailroom_needed_cnt;
        int crypto_tx_tailroom_pending_dec;
        struct wiphy_delayed_work dec_tailroom_needed_wk;

        struct net_device *dev;
        struct ieee80211_local *local;

        unsigned int flags;

        unsigned long state;

        bool csa_blocked_queues;

        char name[IFNAMSIZ];

        struct ieee80211_fragment_cache frags;

        /* TID bitmap for NoAck policy */
        u16 noack_map;

        /* bit field of ACM bits (BIT(802.1D tag)) */
        u8 wmm_acm;

        struct ieee80211_key __rcu *keys[NUM_DEFAULT_KEYS];
        struct ieee80211_key __rcu *default_unicast_key;

        u16 sequence_number;
        u16 mld_mcast_seq;
        __be16 control_port_protocol;
        bool control_port_no_encrypt;
        bool control_port_no_preauth;
        bool control_port_over_nl80211;

        atomic_t num_tx_queued;
        struct mac80211_qos_map __rcu *qos_map;

        struct wiphy_work work;
        struct sk_buff_head skb_queue;
        struct sk_buff_head status_queue;

        /*
         * AP this belongs to: self in AP mode and
         * corresponding AP in VLAN mode, NULL for
         * all others (might be needed later in IBSS)
         */
        struct ieee80211_if_ap *bss;

        /* bitmap of allowed (non-MCS) rate indexes for rate control */
        u32 rc_rateidx_mask[NUM_NL80211_BANDS];

        bool rc_has_mcs_mask[NUM_NL80211_BANDS];
        u8  rc_rateidx_mcs_mask[NUM_NL80211_BANDS][IEEE80211_HT_MCS_MASK_LEN];

        bool rc_has_vht_mcs_mask[NUM_NL80211_BANDS];
        u16 rc_rateidx_vht_mcs_mask[NUM_NL80211_BANDS][NL80211_VHT_NSS_MAX];

        /* Beacon frame (non-MCS) rate (as a bitmap) */
        u32 beacon_rateidx_mask[NUM_NL80211_BANDS];
        bool beacon_rate_set;

        /* interface-type specific state */
        union {
                struct ieee80211_if_ap ap;
                struct ieee80211_if_vlan vlan;
                struct ieee80211_if_managed mgd;
                struct ieee80211_if_ibss ibss;
                struct ieee80211_if_mesh mesh;
                struct ieee80211_if_ocb ocb;
                struct ieee80211_if_mntr mntr;
                struct ieee80211_if_nan nan;
        } u;

        /* embedded link plus __rcu pointers to links, indexed by link ID */
        struct ieee80211_link_data deflink;
        struct ieee80211_link_data __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS];

        struct wiphy_delayed_work dfs_cac_timer_work;

        /* for ieee80211_set_active_links_async() */
        struct wiphy_work activate_links_work;
        u16 desired_active_links;

        u16 restart_active_links;

#ifdef CONFIG_MAC80211_DEBUGFS
        struct {
                struct dentry *subdir_stations;
                struct dentry *default_unicast_key;
                struct dentry *default_multicast_key;
                struct dentry *default_mgmt_key;
                struct dentry *default_beacon_key;
        } debugfs;
#endif

        /* must be last, dynamically sized area in this! */
        struct ieee80211_vif vif;
};

/*
 * Map a struct ieee80211_vif back to the struct ieee80211_sub_if_data it is
 * embedded in (as the 'vif' member).
 */
static inline
struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p)
{
        struct ieee80211_sub_if_data *sdata;

        sdata = container_of(p, struct ieee80211_sub_if_data, vif);

        return sdata;
}

/*
 * Dereference the pointer @p via wiphy_dereference(), using the wiphy of
 * the hardware @sdata belongs to.
 *
 * @sdata is parenthesized so that any pointer expression (e.g. "arr + 1")
 * can be passed, not just a plain identifier.
 */
#define sdata_dereference(p, sdata) \
        wiphy_dereference((sdata)->local->hw.wiphy, p)

/*
 * Iterate over every non-NULL link of every running interface on
 * (_local)->interfaces, assigning each link to @_link in turn.
 *
 * Every use of the first argument goes through the @_local parameter, so
 * the macro does not rely on the caller having a variable named "local".
 * Links are fetched with wiphy_dereference() against (_local)->hw.wiphy.
 */
#define for_each_sdata_link(_local, _link)                                \
        /* outer loop just to define the variables ... */                \
        for (struct ieee80211_sub_if_data *___sdata = NULL;                \
             !___sdata;                                                        \
             ___sdata = (void *)~0 /* always stop */)                        \
        list_for_each_entry(___sdata, &(_local)->interfaces, list)        \
        if (ieee80211_sdata_running(___sdata))                                \
        for (int ___link_id = 0;                                        \
             ___link_id < ARRAY_SIZE(___sdata->link);                        \
             ___link_id++)                                                \
        if (((_link) = wiphy_dereference((_local)->hw.wiphy,                \
                                         ___sdata->link[___link_id])))

/**
 * ieee80211_get_mbssid_beacon_len - length of MBSSID/RNR elements
 * @elems: MBSSID elements, may be %NULL
 * @rnr_elems: RNR elements, may be %NULL
 * @i: index of the element to account for, or @elems->cnt to account for
 *        all elements
 *
 * For @i < @elems->cnt the result is the length of MBSSID element @i plus,
 * when @rnr_elems is given, the length of RNR element @i (if there is one)
 * and of all RNR elements with an index >= @elems->cnt.
 *
 * For @i == @elems->cnt the result is the total length of all MBSSID
 * elements plus, when @rnr_elems is given, all RNR elements.
 *
 * Return: the computed length, or 0 if @elems is %NULL or empty or if
 * @i > @elems->cnt.
 */
static inline int
ieee80211_get_mbssid_beacon_len(struct cfg80211_mbssid_elems *elems,
                                struct cfg80211_rnr_elems *rnr_elems,
                                u8 i)
{
        unsigned int idx;
        int len = 0;

        if (!elems || !elems->cnt || i > elems->cnt)
                return 0;

        if (i < elems->cnt) {
                len = elems->elem[i].len;
                if (rnr_elems) {
                        /* never index past the RNR elements actually present */
                        if (i < rnr_elems->cnt)
                                len += rnr_elems->elem[i].len;
                        for (idx = elems->cnt; idx < rnr_elems->cnt; idx++)
                                len += rnr_elems->elem[idx].len;
                }
                return len;
        }

        /* i == elems->cnt, calculate total length of all MBSSID elements */
        for (idx = 0; idx < elems->cnt; idx++)
                len += elems->elem[idx].len;

        if (rnr_elems) {
                for (idx = 0; idx < rnr_elems->cnt; idx++)
                        len += rnr_elems->elem[idx].len;
        }

        return len;
}

/*
 * Message type values.
 * NOTE(review): presumably these tag queued skbs as received frames vs.
 * TX status reports - confirm against the users.
 */
enum {
        IEEE80211_RX_MSG        = 1,
        IEEE80211_TX_STATUS_MSG        = 2,
};

/*
 * Reasons for which a queue can be stopped.
 * IEEE80211_QUEUE_STOP_REASONS is not a reason itself: it follows the last
 * reason and therefore equals the number of reasons (it sizes the second
 * dimension of ieee80211_local::q_stop_reasons).
 */
enum queue_stop_reason {
        IEEE80211_QUEUE_STOP_REASON_DRIVER                = 0,
        IEEE80211_QUEUE_STOP_REASON_PS                        = 1,
        IEEE80211_QUEUE_STOP_REASON_CSA                        = 2,
        IEEE80211_QUEUE_STOP_REASON_AGGREGATION                = 3,
        IEEE80211_QUEUE_STOP_REASON_SUSPEND                = 4,
        IEEE80211_QUEUE_STOP_REASON_SKB_ADD                = 5,
        IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL                = 6,
        IEEE80211_QUEUE_STOP_REASON_FLUSH                = 7,
        IEEE80211_QUEUE_STOP_REASON_TDLS_TEARDOWN        = 8,
        IEEE80211_QUEUE_STOP_REASON_RESERVE_TID                = 9,
        IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE        = 10,

        /* keep last: number of reasons above */
        IEEE80211_QUEUE_STOP_REASONS,
};

#ifdef CONFIG_MAC80211_LEDS
/*
 * State of a LED trigger driven by a blink table; struct ieee80211_local
 * points to one via its tpt_led_trigger member.
 * NOTE(review): "tpt" presumably stands for throughput - confirm.
 */
struct tpt_led_trigger {
        char name[32];
        /* blink table and its number of entries */
        const struct ieee80211_tpt_blink *blink_table;
        unsigned int blink_table_len;
        struct timer_list timer;
        struct ieee80211_local *local;
        unsigned long prev_traffic;
        unsigned long tx_bytes, rx_bytes;
        unsigned int active, want;
        bool running;
};
#endif

/**
 * enum mac80211_scan_flags - currently active scan mode
 *
 * @SCAN_SW_SCANNING: a software scan is in progress; we may nevertheless
 *        be on the operating channel
 * @SCAN_HW_SCANNING: the hardware is scanning for us, so there is no way
 *        to tell whether we are on the operating channel or not
 * @SCAN_ONCHANNEL_SCANNING: software scan restricted to the current
 *        operating channel. This should not interrupt normal traffic.
 * @SCAN_COMPLETED: set for our scan work function when the driver reported
 *        that the scan completed.
 * @SCAN_ABORTED: set for our scan work function when the driver reported
 *        a scan complete for an aborted scan.
 * @SCAN_HW_CANCELLED: set for our scan work function when the scan is being
 *        cancelled.
 * @SCAN_BEACON_WAIT: set whenever we're passive scanning because of
 *        radar/no-IR and could send a probe request after receiving a beacon.
 * @SCAN_BEACON_DONE: beacon received, a probe request can now be sent
 */
enum mac80211_scan_flags {
        SCAN_SW_SCANNING        = 0,
        SCAN_HW_SCANNING        = 1,
        SCAN_ONCHANNEL_SCANNING        = 2,
        SCAN_COMPLETED                = 3,
        SCAN_ABORTED                = 4,
        SCAN_HW_CANCELLED        = 5,
        SCAN_BEACON_WAIT        = 6,
        SCAN_BEACON_DONE        = 7,
};

/**
 * enum mac80211_scan_state - scan state machine states
 *
 * @SCAN_DECISION: main entry point to the scan state machine; this state
 *        decides whether to keep on scanning or to switch back to the
 *        operating channel
 * @SCAN_SET_CHANNEL: set the next channel to be scanned
 * @SCAN_SEND_PROBE: send probe requests and wait for probe responses
 * @SCAN_SUSPEND: suspend the scan and go back to the operating channel to
 *        send out data
 * @SCAN_RESUME: resume the scan and scan the next channel
 * @SCAN_ABORT: abort the scan and go back to the operating channel
 */
enum mac80211_scan_state {
        SCAN_DECISION                = 0,
        SCAN_SET_CHANNEL        = 1,
        SCAN_SEND_PROBE                = 2,
        SCAN_SUSPEND                = 3,
        SCAN_RESUME                = 4,
        SCAN_ABORT                = 5,
};

/*
 * Declaration of the static key aql_disable (false by default).
 * NOTE(review): presumably switches off the AQL limits configured via the
 * aql_* members of struct ieee80211_local - confirm against the users.
 */
DECLARE_STATIC_KEY_FALSE(aql_disable);

/*
 * Per-device mac80211 state. The struct ieee80211_hw is embedded as the
 * first member; the list of virtual interfaces hangs off 'interfaces'.
 */
struct ieee80211_local {
        /* embed the driver visible part.
         * don't cast (use the static inlines below), but we keep
         * it first anyway so they become a no-op */
        struct ieee80211_hw hw;

        struct fq fq;
        struct codel_vars *cvars;
        struct codel_params cparams;

        /* protects active_txqs and txqi->schedule_order */
        spinlock_t active_txq_lock[IEEE80211_NUM_ACS];
        struct list_head active_txqs[IEEE80211_NUM_ACS];
        u16 schedule_round[IEEE80211_NUM_ACS];

        /* serializes ieee80211_handle_wake_tx_queue */
        spinlock_t handle_wake_tx_queue_lock;

        u16 airtime_flags;
        u32 aql_txq_limit_low[IEEE80211_NUM_ACS];
        u32 aql_txq_limit_high[IEEE80211_NUM_ACS];
        u32 aql_threshold;
        atomic_t aql_total_pending_airtime;
        atomic_t aql_ac_pending_airtime[IEEE80211_NUM_ACS];

        const struct ieee80211_ops *ops;

        /*
         * private workqueue to mac80211. mac80211 makes this accessible
         * via ieee80211_queue_work()
         */
        struct workqueue_struct *workqueue;

        unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES];
        int q_stop_reasons[IEEE80211_MAX_QUEUES][IEEE80211_QUEUE_STOP_REASONS];
        /*
         * also used to protect ampdu_ac_queue and ampdu_ac_stop_refcnt
         * NOTE(review): neither of those two members exists in this
         * struct - the remark may be stale.
         */
        spinlock_t queue_stop_reason_lock;

        int open_count;
        int monitors, cooked_mntrs;
        /* number of interfaces with corresponding FIF_ flags */
        int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll,
            fif_probe_req;
        bool probe_req_reg;
        bool rx_mcast_action_reg;
        unsigned int filter_flags; /* FIF_* */

        bool wiphy_ciphers_allocated;

        struct cfg80211_chan_def dflt_chandef;
        bool emulate_chanctx;

        /* protects the aggregated multicast list and filter calls */
        spinlock_t filter_lock;

        /* used for uploading changed mc list */
        struct wiphy_work reconfig_filter;

        /* aggregated multicast list */
        struct netdev_hw_addr_list mc_list;

        bool tim_in_locked_section; /* see ieee80211_beacon_get() */

        /*
         * suspended is true if we finished all the suspend _and_ we have
         * not yet come up from resume. This is to be used by mac80211
         * to ensure driver sanity during suspend and mac80211's own
         * sanity. It can eventually be used for WoW as well.
         */
        bool suspended;

        /* suspending is true during the whole suspend process */
        bool suspending;

        /*
         * Resuming is true while suspended, but when we're reprogramming the
         * hardware -- at that time it's allowed to use ieee80211_queue_work()
         * again even though some other parts of the stack are still suspended
         * and we still drop received frames to avoid waking the stack.
         */
        bool resuming;

        /*
         * quiescing is true during the suspend process _only_ to
         * ease timer cancelling etc.
         */
        bool quiescing;

        /* device is started */
        bool started;

        /* device is during a HW reconfig */
        bool in_reconfig;

        /* reconfiguration failed ... suppress some warnings etc. */
        bool reconfig_failure;

        /* wowlan is enabled -- don't reconfig on resume */
        bool wowlan;

        struct wiphy_work radar_detected_work;

        /* number of RX chains the hardware has */
        u8 rx_chains;

        /* bitmap of which sbands were copied */
        u8 sband_allocated;

        int tx_headroom; /* required headroom for hardware/radiotap */

        /* Tasklet and skb queue to process calls from IRQ mode. All frames
         * added to skb_queue will be processed, but frames in
         * skb_queue_unreliable may be dropped if the total length of these
         * queues increases over the limit. */
#define IEEE80211_IRQSAFE_QUEUE_LIMIT 128
        struct tasklet_struct tasklet;
        struct sk_buff_head skb_queue;
        struct sk_buff_head skb_queue_unreliable;

        spinlock_t rx_path_lock;

        /* Station data */
        /*
         * The list, hash table and counter are protected
         * by the wiphy mutex, reads are done with RCU.
         */
        spinlock_t tim_lock;
        unsigned long num_sta;
        struct list_head sta_list;
        struct rhltable sta_hash;
        struct rhltable link_sta_hash;
        struct timer_list sta_cleanup;
        int sta_generation;

        struct sk_buff_head pending[IEEE80211_MAX_QUEUES];
        struct tasklet_struct tx_pending_tasklet;
        struct tasklet_struct wake_txqs_tasklet;

        atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES];

        /* number of interfaces with allmulti RX */
        atomic_t iff_allmultis;

        struct rate_control_ref *rate_ctrl;

        struct arc4_ctx wep_tx_ctx;
        struct arc4_ctx wep_rx_ctx;
        u32 wep_iv;

        /* see iface.c */
        struct list_head interfaces;
        struct list_head mon_list; /* only that are IFF_UP && !cooked */
        struct mutex iflist_mtx;

        /* Scanning and BSS list */
        unsigned long scanning;
        struct cfg80211_ssid scan_ssid;
        struct cfg80211_scan_request *int_scan_req;
        struct cfg80211_scan_request __rcu *scan_req;
        struct ieee80211_scan_request *hw_scan_req;
        struct cfg80211_chan_def scan_chandef;
        enum nl80211_band hw_scan_band;
        int scan_channel_idx;
        int scan_ies_len;
        int hw_scan_ies_bufsize;
        struct cfg80211_scan_info scan_info;

        struct wiphy_work sched_scan_stopped_work;
        struct ieee80211_sub_if_data __rcu *sched_scan_sdata;
        struct cfg80211_sched_scan_request __rcu *sched_scan_req;
        u8 scan_addr[ETH_ALEN];

        unsigned long leave_oper_channel_time;
        enum mac80211_scan_state next_scan_state;
        struct wiphy_delayed_work scan_work;
        struct ieee80211_sub_if_data __rcu *scan_sdata;

        /* Temporary remain-on-channel for off-channel operations */
        struct ieee80211_channel *tmp_channel;

        /* channel contexts */
        struct list_head chanctx_list;

#ifdef CONFIG_MAC80211_LEDS
        struct led_trigger tx_led, rx_led, assoc_led, radio_led;
        struct led_trigger tpt_led;
        atomic_t tx_led_active, rx_led_active, assoc_led_active;
        atomic_t radio_led_active, tpt_led_active;
        struct tpt_led_trigger *tpt_led_trigger;
#endif

#ifdef CONFIG_MAC80211_DEBUG_COUNTERS
        /* SNMP counters */
        /* dot11CountersTable */
        u32 dot11TransmittedFragmentCount;
        u32 dot11MulticastTransmittedFrameCount;
        u32 dot11FailedCount;
        u32 dot11RetryCount;
        u32 dot11MultipleRetryCount;
        u32 dot11FrameDuplicateCount;
        u32 dot11ReceivedFragmentCount;
        u32 dot11MulticastReceivedFrameCount;
        u32 dot11TransmittedFrameCount;

        /* TX/RX handler statistics */
        unsigned int tx_handlers_drop;
        unsigned int tx_handlers_queued;
        unsigned int tx_handlers_drop_wep;
        unsigned int tx_handlers_drop_not_assoc;
        unsigned int tx_handlers_drop_unauth_port;
        unsigned int rx_handlers_drop;
        unsigned int rx_handlers_queued;
        unsigned int rx_handlers_drop_nullfunc;
        unsigned int rx_handlers_drop_defrag;
        unsigned int tx_expand_skb_head;
        unsigned int tx_expand_skb_head_cloned;
        unsigned int rx_expand_skb_head_defrag;
        unsigned int rx_handlers_fragments;
        unsigned int tx_status_drop;
        /* increments a debug counter; a no-op without the config option */
#define I802_DEBUG_INC(c) (c)++
#else /* CONFIG_MAC80211_DEBUG_COUNTERS */
#define I802_DEBUG_INC(c) do { } while (0)
#endif /* CONFIG_MAC80211_DEBUG_COUNTERS */


        int total_ps_buffered; /* total number of all buffered unicast and
                                * multicast packets for power saving stations
                                */

        bool pspolling;
        /*
         * PS can only be enabled when we have exactly one managed
         * interface (and monitors) in PS, this then points there.
         */
        struct ieee80211_sub_if_data *ps_sdata;
        struct wiphy_work dynamic_ps_enable_work;
        struct wiphy_work dynamic_ps_disable_work;
        struct timer_list dynamic_ps_timer;
        struct notifier_block ifa_notifier;
        struct notifier_block ifa6_notifier;

        /*
         * The dynamic ps timeout configured from user space via WEXT -
         * this will override whatever chosen by mac80211 internally.
         */
        int dynamic_ps_forced_timeout;

        int user_power_level; /* in dBm, for all interfaces */

        struct work_struct restart_work;

#ifdef CONFIG_MAC80211_DEBUGFS
        struct local_debugfsdentries {
                struct dentry *rcdir;
                struct dentry *keys;
        } debugfs;
        bool force_tx_status;
#endif

        /*
         * Remain-on-channel support
         */
        struct wiphy_delayed_work roc_work;
        struct list_head roc_list;
        struct wiphy_work hw_roc_start, hw_roc_done;
        unsigned long hw_roc_start_time;
        u64 roc_cookie_counter;

        struct idr ack_status_frames;
        spinlock_t ack_status_lock;

        struct ieee80211_sub_if_data __rcu *p2p_sdata;

        /* virtual monitor interface */
        struct ieee80211_sub_if_data __rcu *monitor_sdata;
        struct ieee80211_chan_req monitor_chanreq;

        /* extended capabilities provided by mac80211 */
        u8 ext_capa[8];

        bool wbrf_supported;
};

/* Return the netdev_priv() area of @dev as the interface's sdata. */
static inline struct ieee80211_sub_if_data *
IEEE80211_DEV_TO_SUB_IF(const struct net_device *dev)
{
        struct ieee80211_sub_if_data *sdata = netdev_priv(dev);

        return sdata;
}

/*
 * Map a struct wireless_dev back to the struct ieee80211_sub_if_data it is
 * embedded in (as the 'wdev' member).
 */
static inline struct ieee80211_sub_if_data *
IEEE80211_WDEV_TO_SUB_IF(struct wireless_dev *wdev)
{
        struct ieee80211_sub_if_data *sdata;

        sdata = container_of(wdev, struct ieee80211_sub_if_data, wdev);

        return sdata;
}

/*
 * Look up the supported-band entry for the band of the channel context
 * currently assigned to the interface's bss_conf.
 *
 * Warns if the interface is an MLD. Returns NULL when no channel context
 * is assigned.
 */
static inline struct ieee80211_supported_band *
ieee80211_get_sband(struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_supported_band *sband = NULL;
        struct ieee80211_chanctx_conf *ctx;

        WARN_ON(ieee80211_vif_is_mld(&sdata->vif));

        rcu_read_lock();
        ctx = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
        if (ctx)
                sband = sdata->local->hw.wiphy->bands[ctx->def.chan->band];
        rcu_read_unlock();

        return sband;
}

/*
 * Per-link variant of the band lookup: return the supported-band entry
 * for the band of the channel context assigned to @link's configuration,
 * or NULL when the link has no channel context.
 */
static inline struct ieee80211_supported_band *
ieee80211_get_link_sband(struct ieee80211_link_data *link)
{
        struct ieee80211_supported_band *sband = NULL;
        struct ieee80211_chanctx_conf *ctx;

        rcu_read_lock();
        ctx = rcu_dereference(link->conf->chanctx_conf);
        if (ctx)
                sband = link->sdata->local->hw.wiphy->bands[ctx->def.chan->band];
        rcu_read_unlock();

        return sband;
}

/*
 * This struct holds the values parsed from the channel switch IE(s).
 * It is the output argument of ieee80211_parse_ch_switch_ie(), whose
 * kerneldoc describes it as carrying count, mode, chandef and mesh ttl.
 */
struct ieee80211_csa_ie {
        struct ieee80211_chan_req chanreq;
        u8 mode;
        u8 count;
        u8 ttl; /* mesh TTL, per the ieee80211_parse_ch_switch_ie() kerneldoc */
        u16 pre_value;
        u16 reason_code;
        u32 max_switch_time;
};

/*
 * Element parsing error flags. Each value is a distinct bit, so several
 * errors can be reported together in one mask.
 * NOTE(review): presumably these are OR-ed into
 * struct ieee802_11_elems::parse_error - confirm against the parser.
 */
enum ieee80211_elems_parse_error {
        IEEE80211_PARSE_ERR_INVALID_END                = BIT(0),
        IEEE80211_PARSE_ERR_DUP_ELEM                = BIT(1),
        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE        = BIT(2),
        IEEE80211_PARSE_ERR_UNEXPECTED_ELEM        = BIT(3),
        IEEE80211_PARSE_ERR_DUP_NEST_ML_BASIC        = BIT(4),
};

/*
 * Parsed Information Elements
 *
 * Result type of ieee802_11_parse_elems_full() and its inline wrappers:
 * one pointer (or value) per recognised element, plus the matching
 * lengths for elements whose size is variable.
 */
struct ieee802_11_elems {
        const u8 *ie_start;
        size_t total_len;
        u32 crc;

        /* pointers to IEs */
        const struct ieee80211_tdls_lnkie *lnk_id;
        const struct ieee80211_ch_switch_timing *ch_sw_timing;
        const u8 *ext_capab;
        const u8 *ssid;
        const u8 *supp_rates;
        const u8 *ds_params;
        const struct ieee80211_tim_ie *tim;
        const u8 *rsn;
        const u8 *rsnx;
        const u8 *erp_info;
        const u8 *ext_supp_rates;
        const u8 *wmm_info;
        const u8 *wmm_param;
        const struct ieee80211_ht_cap *ht_cap_elem;
        const struct ieee80211_ht_operation *ht_operation;
        const struct ieee80211_vht_cap *vht_cap_elem;
        const struct ieee80211_vht_operation *vht_operation;
        const struct ieee80211_meshconf_ie *mesh_config;
        const u8 *he_cap;
        const struct ieee80211_he_operation *he_operation;
        const struct ieee80211_he_spr *he_spr;
        const struct ieee80211_mu_edca_param_set *mu_edca_param_set;
        const struct ieee80211_he_6ghz_capa *he_6ghz_capa;
        const u8 *uora_element;
        const u8 *mesh_id;
        const u8 *peering;
        const __le16 *awake_window;
        const u8 *preq;
        const u8 *prep;
        const u8 *perr;
        const struct ieee80211_rann_ie *rann;
        const struct ieee80211_channel_sw_ie *ch_switch_ie;
        const struct ieee80211_ext_chansw_ie *ext_chansw_ie;
        const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie;
        const u8 *max_channel_switch_time;
        const u8 *country_elem;
        const u8 *pwr_constr_elem;
        const u8 *cisco_dtpc_elem;
        const struct ieee80211_timeout_interval_ie *timeout_int;
        const u8 *opmode_notif;
        const struct ieee80211_sec_chan_offs_ie *sec_chan_offs;
        struct ieee80211_mesh_chansw_params_ie *mesh_chansw_params_ie;
        const struct ieee80211_bss_max_idle_period_ie *max_idle_period_ie;
        const struct ieee80211_multiple_bssid_configuration *mbssid_config_ie;
        const struct ieee80211_bssid_index *bssid_index;
        u8 max_bssid_indicator;
        u8 dtim_count;
        u8 dtim_period;
        const struct ieee80211_addba_ext_ie *addba_ext_ie;
        const struct ieee80211_s1g_cap *s1g_capab;
        const struct ieee80211_s1g_oper_ie *s1g_oper;
        const struct ieee80211_s1g_bcn_compat_ie *s1g_bcn_compat;
        const struct ieee80211_aid_response_ie *aid_resp;
        const struct ieee80211_eht_cap_elem *eht_cap;
        const struct ieee80211_eht_operation *eht_operation;
        const struct ieee80211_multi_link_elem *ml_basic;
        const struct ieee80211_multi_link_elem *ml_reconf;
        const struct ieee80211_bandwidth_indication *bandwidth_indication;
        const struct ieee80211_ttlm_elem *ttlm[IEEE80211_TTLM_MAX_CNT];

        /* note: the order of the PSD values is per element, not per chandef */
        struct ieee80211_parsed_tpe tpe;
        struct ieee80211_parsed_tpe csa_tpe;

        /* lengths of the correspondingly named element pointers above */
        u8 ext_capab_len;
        u8 ssid_len;
        u8 supp_rates_len;
        u8 tim_len;
        u8 rsn_len;
        u8 rsnx_len;
        u8 ext_supp_rates_len;
        u8 wmm_info_len;
        u8 wmm_param_len;
        u8 he_cap_len;
        u8 mesh_id_len;
        u8 peering_len;
        u8 preq_len;
        u8 prep_len;
        u8 perr_len;
        u8 country_elem_len;
        u8 bssid_index_len;
        u8 eht_cap_len;

        /*
         * A multi-link element can be de-fragmented, so a u8 is not
         * sufficient to hold its length.
         */
        size_t ml_basic_len;
        size_t ml_reconf_len;

        /* NOTE(review): presumably the number of valid entries in ttlm[] - confirm */
        u8 ttlm_num;

        /*
         * store the per station profile pointer and length in case that the
         * parsing also handled Multi-Link element parsing for a specific link
         * ID.
         */
        struct ieee80211_mle_per_sta_profile *prof;
        size_t sta_prof_len;

        /* whether/which parse error occurred while retrieving these elements */
        u8 parse_error;
};

/* Map an ieee80211_hw back to the ieee80211_local that embeds it as 'hw'. */
static inline struct ieee80211_local *hw_to_local(struct ieee80211_hw *hw)
{
        struct ieee80211_local *local;

        local = container_of(hw, struct ieee80211_local, hw);

        return local;
}

/* Map an ieee80211_txq back to the txq_info that embeds it as 'txq'. */
static inline struct txq_info *to_txq_info(struct ieee80211_txq *txq)
{
        struct txq_info *txqi;

        txqi = container_of(txq, struct txq_info, txq);

        return txqi;
}

/*
 * Tell whether @txq holds anything: true if its fragment queue is
 * non-empty or its tin reports backlog packets.
 */
static inline bool txq_has_queue(struct ieee80211_txq *txq)
{
        struct txq_info *info = to_txq_info(txq);

        return !skb_queue_empty(&info->frags) || info->tin.backlog_packets;
}

/* True when any RX_FLAG_MACTIME bit is set in the RX status flags. */
static inline bool
ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status)
{
        return (status->flag & RX_FLAG_MACTIME) != 0;
}

void ieee80211_vif_inc_num_mcast(struct ieee80211_sub_if_data *sdata);
void ieee80211_vif_dec_num_mcast(struct ieee80211_sub_if_data *sdata);

/* Return the number of multicast stations connected to this interface.
 * The count is only tracked for AP interfaces and for AP_VLAN interfaces
 * that are not using 4addr (no vlan.sta set); in every other case -1 is
 * returned to signal "not tracked".
 */
static inline int
ieee80211_vif_get_num_mcast_if(struct ieee80211_sub_if_data *sdata)
{
        switch (sdata->vif.type) {
        case NL80211_IFTYPE_AP:
                return atomic_read(&sdata->u.ap.num_mcast_sta);
        case NL80211_IFTYPE_AP_VLAN:
                if (!sdata->u.vlan.sta)
                        return atomic_read(&sdata->u.vlan.num_mcast_sta);
                break;
        default:
                break;
        }

        return -1;
}

u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
                                     struct ieee80211_rx_status *status,
                                     unsigned int mpdu_len,
                                     unsigned int mpdu_offset);
int ieee80211_hw_config(struct ieee80211_local *local, u32 changed);
int ieee80211_hw_conf_chan(struct ieee80211_local *local);
void ieee80211_hw_conf_init(struct ieee80211_local *local);
void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx);
void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
                                      u64 changed);
void ieee80211_vif_cfg_change_notify(struct ieee80211_sub_if_data *sdata,
                                     u64 changed);
void ieee80211_link_info_change_notify(struct ieee80211_sub_if_data *sdata,
                                       struct ieee80211_link_data *link,
                                       u64 changed);
void ieee80211_configure_filter(struct ieee80211_local *local);
u64 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);

void ieee80211_handle_queued_frames(struct ieee80211_local *local);

u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local);
int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb,
                             u64 *cookie, gfp_t gfp);

void ieee80211_check_fast_rx(struct sta_info *sta);
void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata);
void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata);
void ieee80211_clear_fast_rx(struct sta_info *sta);

bool ieee80211_is_our_addr(struct ieee80211_sub_if_data *sdata,
                           const u8 *addr, int *out_link_id);

/* STA code */
void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata);
int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
                       struct cfg80211_auth_request *req);
int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
                        struct cfg80211_assoc_request *req);
int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
                         struct cfg80211_deauth_request *req);
int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata,
                           struct cfg80211_disassoc_request *req);
void ieee80211_send_pspoll(struct ieee80211_local *local,
                           struct ieee80211_sub_if_data *sdata);
void ieee80211_recalc_ps(struct ieee80211_local *local);
void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
                                  struct sk_buff *skb);
void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata,
                                 struct sk_buff *skb);
void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata);
void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata);
void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata,
                                  __le16 fc, bool acked);
void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata,
                                   u8 reason, bool tx);
void ieee80211_mgd_setup_link(struct ieee80211_link_data *link);
void ieee80211_mgd_stop_link(struct ieee80211_link_data *link);
void ieee80211_mgd_set_link_qos_params(struct ieee80211_link_data *link);

/* IBSS code */
void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local);
void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata);
void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata,
                              const u8 *bssid, const u8 *addr, u32 supp_rates);
int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
                        struct cfg80211_ibss_params *params);
int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata);
void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata);
void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
                                   struct sk_buff *skb);
int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata,
                              struct cfg80211_csa_settings *csa_settings,
                              u64 *changed);
int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata,
                              u64 *changed);
void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata);

/* OCB code */
void ieee80211_ocb_work(struct ieee80211_sub_if_data *sdata);
void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata,
                             const u8 *bssid, const u8 *addr, u32 supp_rates);
void ieee80211_ocb_setup_sdata(struct ieee80211_sub_if_data *sdata);
int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata,
                       struct ocb_setup *setup);
int ieee80211_ocb_leave(struct ieee80211_sub_if_data *sdata);

/* mesh code */
void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata);
void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
                                   struct sk_buff *skb);
int ieee80211_mesh_csa_beacon(struct ieee80211_sub_if_data *sdata,
                              struct cfg80211_csa_settings *csa_settings,
                              u64 *changed);
int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata,
                              u64 *changed);

/* scan/BSS handling */
void ieee80211_scan_work(struct wiphy *wiphy, struct wiphy_work *work);
int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata,
                                const u8 *ssid, u8 ssid_len,
                                struct ieee80211_channel **channels,
                                unsigned int n_channels);
int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
                           struct cfg80211_scan_request *req);
void ieee80211_scan_cancel(struct ieee80211_local *local);
void ieee80211_run_deferred_scan(struct ieee80211_local *local);
void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb);

void ieee80211_inform_bss(struct wiphy *wiphy, struct cfg80211_bss *bss,
                          const struct cfg80211_bss_ies *ies, void *data);

void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local);
struct ieee80211_bss *
ieee80211_bss_info_update(struct ieee80211_local *local,
                          struct ieee80211_rx_status *rx_status,
                          struct ieee80211_mgmt *mgmt,
                          size_t len,
                          struct ieee80211_channel *channel);
void ieee80211_rx_bss_put(struct ieee80211_local *local,
                          struct ieee80211_bss *bss);

/* scheduled scan handling */
int
__ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
                                     struct cfg80211_sched_scan_request *req);
int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
                                       struct cfg80211_sched_scan_request *req);
int ieee80211_request_sched_scan_stop(struct ieee80211_local *local);
void ieee80211_sched_scan_end(struct ieee80211_local *local);
void ieee80211_sched_scan_stopped_work(struct wiphy *wiphy,
                                       struct wiphy_work *work);

/* off-channel/mgmt-tx */
void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local);
void ieee80211_offchannel_return(struct ieee80211_local *local);
void ieee80211_roc_setup(struct ieee80211_local *local);
void ieee80211_start_next_roc(struct ieee80211_local *local);
void ieee80211_reconfig_roc(struct ieee80211_local *local);
void ieee80211_roc_purge(struct ieee80211_local *local,
                         struct ieee80211_sub_if_data *sdata);
int ieee80211_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev,
                                struct ieee80211_channel *chan,
                                unsigned int duration, u64 *cookie);
int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy,
                                       struct wireless_dev *wdev, u64 cookie);
int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
                      struct cfg80211_mgmt_tx_params *params, u64 *cookie);
int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy,
                                  struct wireless_dev *wdev, u64 cookie);

/* channel switch handling */
void ieee80211_csa_finalize_work(struct wiphy *wiphy, struct wiphy_work *work);
int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
                             struct cfg80211_csa_settings *params);

/* color change handling */
void ieee80211_color_change_finalize_work(struct wiphy *wiphy,
                                          struct wiphy_work *work);
void ieee80211_color_collision_detection_work(struct work_struct *work);

/* interface handling */
/*
 * NETIF_F_* feature masks, split by direction:
 *  - _TX: checksum, scatter/gather, highdma, software GSO and HW TC bits
 *  - _RX: RX checksum only
 *  - MAC80211_SUPPORTED_FEATURES: the union of the two masks above
 */
#define MAC80211_SUPPORTED_FEATURES_TX        (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \
                                         NETIF_F_HW_CSUM | NETIF_F_SG | \
                                         NETIF_F_HIGHDMA | NETIF_F_GSO_SOFTWARE | \
                                         NETIF_F_HW_TC)
#define MAC80211_SUPPORTED_FEATURES_RX        (NETIF_F_RXCSUM)
#define MAC80211_SUPPORTED_FEATURES        (MAC80211_SUPPORTED_FEATURES_TX | \
                                         MAC80211_SUPPORTED_FEATURES_RX)

int ieee80211_iface_init(void);
void ieee80211_iface_exit(void);
int ieee80211_if_add(struct ieee80211_local *local, const char *name,
                     unsigned char name_assign_type,
                     struct wireless_dev **new_wdev, enum nl80211_iftype type,
                     struct vif_params *params);
int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
                             enum nl80211_iftype type);
void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata);
void ieee80211_remove_interfaces(struct ieee80211_local *local);
u32 ieee80211_idle_off(struct ieee80211_local *local);
void ieee80211_recalc_idle(struct ieee80211_local *local);
void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
                                    const int offset);
int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up);
void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata);
int ieee80211_add_virtual_monitor(struct ieee80211_local *local);
void ieee80211_del_virtual_monitor(struct ieee80211_local *local);

bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata);
void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata,
                              bool update_bss);
void ieee80211_recalc_offload(struct ieee80211_local *local);

/* Report whether SDATA_STATE_RUNNING is set in the interface's state bits. */
static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata)
{
        const bool running = test_bit(SDATA_STATE_RUNNING, &sdata->state);

        return running;
}

/* link handling */
void ieee80211_link_setup(struct ieee80211_link_data *link);
void ieee80211_link_init(struct ieee80211_sub_if_data *sdata,
                         int link_id,
                         struct ieee80211_link_data *link,
                         struct ieee80211_bss_conf *link_conf);
void ieee80211_link_stop(struct ieee80211_link_data *link);
int ieee80211_vif_set_links(struct ieee80211_sub_if_data *sdata,
                            u16 new_links, u16 dormant_links);
static inline void ieee80211_vif_clear_links(struct ieee80211_sub_if_data *sdata)
{
        ieee80211_vif_set_links(sdata, 0, 0);
}

/* tx handling */
void ieee80211_clear_tx_pending(struct ieee80211_local *local);
void ieee80211_tx_pending(struct tasklet_struct *t);
netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
                                         struct net_device *dev);
netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
                                       struct net_device *dev);
netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb,
                                            struct net_device *dev);
void __ieee80211_subif_start_xmit(struct sk_buff *skb,
                                  struct net_device *dev,
                                  u32 info_flags,
                                  u32 ctrl_flags,
                                  u64 *cookie);
void ieee80211_purge_tx_queue(struct ieee80211_hw *hw,
                              struct sk_buff_head *skbs);
struct sk_buff *
ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata,
                              struct sk_buff *skb, u32 info_flags);
void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb,
                          int retry_count, bool send_to_cooked,
                          struct ieee80211_tx_status *status);

void ieee80211_check_fast_xmit(struct sta_info *sta);
void ieee80211_check_fast_xmit_all(struct ieee80211_local *local);
void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata);
void ieee80211_clear_fast_xmit(struct sta_info *sta);
int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
                              const u8 *buf, size_t len,
                              const u8 *dest, __be16 proto, bool unencrypted,
                              int link_id, u64 *cookie);
int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev,
                              const u8 *buf, size_t len);
void __ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
                           struct sta_info *sta,
                           struct ieee80211_fast_tx *fast_tx,
                           struct sk_buff *skb, bool ampdu,
                           const u8 *da, const u8 *sa);
void ieee80211_aggr_check(struct ieee80211_sub_if_data *sdata,
                          struct sta_info *sta, struct sk_buff *skb);

/* HT */
void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata,
                                     struct ieee80211_sta_ht_cap *ht_cap);
bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
                                       struct ieee80211_supported_band *sband,
                                       const struct ieee80211_ht_cap *ht_cap_ie,
                                       struct link_sta_info *link_sta);
void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
                          const u8 *da, u16 tid,
                          u16 initiator, u16 reason_code);
int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
                               enum ieee80211_smps_mode smps, const u8 *da,
                               const u8 *bssid, int link_id);
bool ieee80211_smps_is_restrictive(enum ieee80211_smps_mode smps_mode_old,
                                   enum ieee80211_smps_mode smps_mode_new);

void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
                                    u16 initiator, u16 reason, bool stop);
void __ieee80211_start_rx_ba_session(struct sta_info *sta,
                                     u8 dialog_token, u16 timeout,
                                     u16 start_seq_num, u16 ba_policy, u16 tid,
                                     u16 buf_size, bool tx, bool auto_seq,
                                     const struct ieee80211_addba_ext_ie *addbaext);
void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta,
                                         enum ieee80211_agg_stop_reason reason);
void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
                             struct sta_info *sta,
                             struct ieee80211_mgmt *mgmt, size_t len);
void ieee80211_process_addba_resp(struct ieee80211_local *local,
                                  struct sta_info *sta,
                                  struct ieee80211_mgmt *mgmt,
                                  size_t len);
void ieee80211_process_addba_request(struct ieee80211_local *local,
                                     struct sta_info *sta,
                                     struct ieee80211_mgmt *mgmt,
                                     size_t len);

int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
                                   enum ieee80211_agg_stop_reason reason);
void ieee80211_start_tx_ba_cb(struct sta_info *sta, int tid,
                              struct tid_ampdu_tx *tid_tx);
void ieee80211_stop_tx_ba_cb(struct sta_info *sta, int tid,
                             struct tid_ampdu_tx *tid_tx);
void ieee80211_ba_session_work(struct wiphy *wiphy, struct wiphy_work *work);
void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid);
void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid);

u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs);
enum nl80211_smps_mode
ieee80211_smps_mode_to_smps_mode(enum ieee80211_smps_mode smps);

/* VHT */
void
ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
                                    struct ieee80211_supported_band *sband,
                                    const struct ieee80211_vht_cap *vht_cap_ie,
                                    const struct ieee80211_vht_cap *vht_cap_ie2,
                                    struct link_sta_info *link_sta);
enum ieee80211_sta_rx_bandwidth
ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta);
enum ieee80211_sta_rx_bandwidth
ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta);
void ieee80211_sta_init_nss(struct link_sta_info *link_sta);
enum ieee80211_sta_rx_bandwidth
ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width);
enum nl80211_chan_width
ieee80211_sta_cap_chan_bw(struct link_sta_info *link_sta);
void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
                                 struct ieee80211_link_data *link,
                                 struct ieee80211_mgmt *mgmt);
u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
                                  struct link_sta_info *sta,
                                  u8 opmode, enum nl80211_band band);
void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
                                 struct link_sta_info *sta,
                                 u8 opmode, enum nl80211_band band);
void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata,
                                      struct ieee80211_sta_vht_cap *vht_cap);
void ieee80211_get_vht_mask_from_cap(__le16 vht_cap,
                                     u16 vht_mask[NL80211_VHT_NSS_MAX]);
enum nl80211_chan_width
ieee80211_sta_rx_bw_to_chan_width(struct link_sta_info *sta);

/* HE */
void
ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata,
                                  struct ieee80211_supported_band *sband,
                                  const u8 *he_cap_ie, u8 he_cap_len,
                                  const struct ieee80211_he_6ghz_capa *he_6ghz_capa,
                                  struct link_sta_info *link_sta);
void
ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif,
                                const struct ieee80211_he_spr *he_spr_ie_elem);

void
ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif,
                        const struct ieee80211_he_operation *he_op_ie_elem);

/* S1G */
void ieee80211_s1g_sta_rate_init(struct sta_info *sta);
bool ieee80211_s1g_is_twt_setup(struct sk_buff *skb);
void ieee80211_s1g_rx_twt_action(struct ieee80211_sub_if_data *sdata,
                                 struct sk_buff *skb);
void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata,
                                     struct sk_buff *skb);

/* Spectrum management */
void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
                                       struct ieee80211_mgmt *mgmt,
                                       size_t len);
/**
 * ieee80211_parse_ch_switch_ie - parses channel switch IEs
 * @sdata: the sdata of the interface which has received the frame
 * @elems: parsed 802.11 elements received with the frame
 * @current_band: indicates the current band
 * @vht_cap_info: VHT capabilities of the transmitter
 * @conn: contains information about own capabilities and restrictions
 *        to decide which channel switch announcements can be accepted
 * @bssid: the currently connected bssid (for reporting)
 * @csa_ie: parsed 802.11 CSA elements: count, mode, chandef and mesh TTL.
 *        These fields are filled in only on success.
 * Return: 0 on success, <0 on error and >0 if there is nothing to parse.
 */
int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
                                 struct ieee802_11_elems *elems,
                                 enum nl80211_band current_band,
                                 u32 vht_cap_info,
                                 struct ieee80211_conn_settings *conn,
                                 u8 *bssid,
                                 struct ieee80211_csa_ie *csa_ie);

/* Suspend/resume and hw reconfiguration */
int ieee80211_reconfig(struct ieee80211_local *local);
void ieee80211_stop_device(struct ieee80211_local *local);

int __ieee80211_suspend(struct ieee80211_hw *hw,
                        struct cfg80211_wowlan *wowlan);

static inline int __ieee80211_resume(struct ieee80211_hw *hw)
{
        struct ieee80211_local *local = hw_to_local(hw);

        WARN(test_bit(SCAN_HW_SCANNING, &local->scanning) &&
             !test_bit(SCAN_COMPLETED, &local->scanning),
                "%s: resume with hardware scan still in progress\n",
                wiphy_name(hw->wiphy));

        return ieee80211_reconfig(hw_to_local(hw));
}

/* utility functions/constants */
extern const void *const mac80211_wiphy_privid; /* for wiphy privid */
const char *ieee80211_conn_mode_str(enum ieee80211_conn_mode mode);
enum ieee80211_conn_bw_limit
ieee80211_min_bw_limit_from_chandef(struct cfg80211_chan_def *chandef);
int ieee80211_frame_duration(enum nl80211_band band, size_t len,
                             int rate, int erp, int short_preamble);
void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata,
                                           struct ieee80211_tx_queue_params *qparam,
                                           int ac);
void ieee80211_clear_tpe(struct ieee80211_parsed_tpe *tpe);
void ieee80211_set_wmm_default(struct ieee80211_link_data *link,
                               bool bss_notify, bool enable_qos);
void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
                    struct sta_info *sta, struct sk_buff *skb);

void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
                                 struct sk_buff *skb, int tid, int link_id,
                                 enum nl80211_band band);

/* sta_out needs to be checked for ERR_PTR() before using */
int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata,
                            struct sk_buff *skb,
                            struct sta_info **sta_out);

/*
 * Wrapper around __ieee80211_tx_skb_tid_band() that supplies -1 as the
 * link ID and brackets the call in an RCU read-side critical section.
 */
static inline void
ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
                          struct sk_buff *skb, int tid,
                          enum nl80211_band band)
{
        const int link_id = -1;

        rcu_read_lock();
        __ieee80211_tx_skb_tid_band(sdata, skb, tid, link_id, band);
        rcu_read_unlock();
}

void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata,
                          struct sk_buff *skb, int tid, int link_id);

/* Transmit @skb via ieee80211_tx_skb_tid() with TID 7 and link ID -1. */
static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata,
                                    struct sk_buff *skb)
{
        /* All internal mgmt frames are sent on VO, hence TID 7. */
        const int mgmt_tid = 7;

        ieee80211_tx_skb_tid(sdata, skb, mgmt_tid, -1);
}

/**
 * struct ieee80211_elems_parse_params - element parsing parameters
 * @mode: connection mode for parsing
 * @start: pointer to the elements
 * @len: length of the elements
 * @action: %true if the elements came from an action frame
 * @filter: bitmap of element IDs to filter out while calculating
 *        the element CRC
 * @crc: CRC starting value
 * @bss: the BSS to parse this as, for multi-BSSID cases this can
 *        represent a non-transmitting BSS in which case the data
 *        for that non-transmitting BSS is returned
 * @link_id: the link ID to parse elements for, if a STA profile
 *        is present in the multi-link element, or -1 to ignore;
 *        note that the code currently assumes parsing an association
 *        (or re-association) response frame if this is given
 * @from_ap: frame is received from an AP (currently used only
 *        for EHT capabilities parsing)
 *
 * This is the single argument of ieee802_11_parse_elems_full(). The
 * inline wrapper ieee802_11_parse_elems_crc() fills it with @mode set to
 * %IEEE80211_CONN_MODE_HIGHEST and @link_id set to -1, and leaves
 * @from_ap zero-initialised.
 */
struct ieee80211_elems_parse_params {
        enum ieee80211_conn_mode mode;
        const u8 *start;
        size_t len;
        bool action;
        u64 filter;
        u32 crc;
        struct cfg80211_bss *bss;
        int link_id;
        bool from_ap;
};

struct ieee802_11_elems *
ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params);

/*
 * Parse the @len bytes of elements at @start through
 * ieee802_11_parse_elems_full(), passing @action, @filter, @crc and
 * @bss straight through.
 *
 * The helper itself picks IEEE80211_CONN_MODE_HIGHEST as the parsing
 * mode and -1 as the link ID; from_ap is not named in the initializer
 * and therefore stays false.
 *
 * Returns whatever ieee802_11_parse_elems_full() returns.
 */
static inline struct ieee802_11_elems *
ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
                           u64 filter, u32 crc,
                           struct cfg80211_bss *bss)
{
        struct ieee80211_elems_parse_params args = {
                /* what to parse */
                .start = start,
                .len = len,
                .action = action,
                .bss = bss,
                /* CRC inputs */
                .filter = filter,
                .crc = crc,
                /* fixed choices made by this helper */
                .mode = IEEE80211_CONN_MODE_HIGHEST,
                .link_id = -1,
        };

        return ieee802_11_parse_elems_full(&args);
}

/*
 * Shorthand for ieee802_11_parse_elems_crc() with both the CRC filter
 * bitmap and the CRC starting value set to 0.
 */
static inline struct ieee802_11_elems *
ieee802_11_parse_elems(const u8 *start, size_t len, bool action,
                       struct cfg80211_bss *bss)
{
        const u64 no_filter = 0;
        const u32 crc_seed = 0;

        return ieee802_11_parse_elems_crc(start, len, action,
                                          no_filter, crc_seed, bss);
}

extern const int ieee802_1d_to_ac[8];

/*
 * Return the ieee802_1d_to_ac[] entry selected by the low three bits
 * of @tid.  The mask keeps the index within the 8-entry table for any
 * value of @tid.
 */
static inline int ieee80211_ac_from_tid(int tid)
{
        const int idx = tid & 7;

        return ieee802_1d_to_ac[idx];
}

void ieee80211_dynamic_ps_enable_work(struct wiphy *wiphy,
                                      struct wiphy_work *work);
void ieee80211_dynamic_ps_disable_work(struct wiphy *wiphy,
                                       struct wiphy_work *work);
void ieee80211_dynamic_ps_timer(struct timer_list *t);
void ieee80211_send_nullfunc(struct ieee80211_local *local,
                             struct ieee80211_sub_if_data *sdata,
                             bool powersave);
void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local,
                                   struct ieee80211_sub_if_data *sdata);
void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
                             struct ieee80211_hdr *hdr, bool ack, u16 tx_time);

void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw,
                                     unsigned long queues,
                                     enum queue_stop_reason reason,
                                     bool refcounted);
void ieee80211_stop_vif_queues(struct ieee80211_local *local,
                               struct ieee80211_sub_if_data *sdata,
                               enum queue_stop_reason reason);
void ieee80211_wake_vif_queues(struct ieee80211_local *local,
                               struct ieee80211_sub_if_data *sdata,
                               enum queue_stop_reason reason);
void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw,
                                     unsigned long queues,
                                     enum queue_stop_reason reason,
                                     bool refcounted);
void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
                                    enum queue_stop_reason reason,
                                    bool refcounted);
void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue,
                                    enum queue_stop_reason reason,
                                    bool refcounted);
void ieee80211_add_pending_skb(struct ieee80211_local *local,
                               struct sk_buff *skb);
void ieee80211_add_pending_skbs(struct ieee80211_local *local,
                                struct sk_buff_head *skbs);
void ieee80211_flush_queues(struct ieee80211_local *local,
                            struct ieee80211_sub_if_data *sdata, bool drop);
void __ieee80211_flush_queues(struct ieee80211_local *local,
                              struct ieee80211_sub_if_data *sdata,
                              unsigned int queues, bool drop);

/*
 * Tell a worker whether it may run right now: false while @local is
 * in reconfig, quiescing or suspended, true otherwise.
 */
static inline bool ieee80211_can_run_worker(struct ieee80211_local *local)
{
        /*
         * Each of the three flags vetoes the worker on its own:
         *
         * in_reconfig - it is unsafe to do any work during the
         *      reconfigure flow; the work is requeued when it ends.
         *
         * quiescing - we are racing with __ieee80211_suspend, which
         *      flushes the workers after setting quiescing, while new
         *      workers are only enqueued after checking quiescing /
         *      suspended.  Abort the worker to avoid the race below.
         *
         * suspended - may already be set if this scenario occurs:
         *
         *      __ieee80211_suspend             Control path
         *
         *                                      if (local->quiescing)
         *                                              return;
         *      local->quiescing = true;
         *      flush_workqueue();
         *                                      queue_work(...);
         *      local->suspended = true;
         *      local->quiescing = false;
         *                                      worker starts running...
         */
        return !(local->in_reconfig || local->quiescing || local->suspended);
}

int ieee80211_txq_setup_flows(struct ieee80211_local *local);
void ieee80211_txq_set_params(struct ieee80211_local *local);
void ieee80211_txq_teardown_flows(struct ieee80211_local *local);
void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
                        struct sta_info *sta,
                        struct txq_info *txq, int tid);
void ieee80211_txq_purge(struct ieee80211_local *local,
                         struct txq_info *txqi);
void ieee80211_purge_sta_txqs(struct sta_info *sta);
void ieee80211_txq_remove_vlan(struct ieee80211_local *local,
                               struct ieee80211_sub_if_data *sdata);
void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats,
                              struct txq_info *txqi);
void ieee80211_wake_txqs(struct tasklet_struct *t);
void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
                         u16 transaction, u16 auth_alg, u16 status,
                         const u8 *extra, size_t extra_len, const u8 *bssid,
                         const u8 *da, const u8 *key, u8 key_len, u8 key_idx,
                         u32 tx_flags);
void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
                                    const u8 *da, const u8 *bssid,
                                    u16 stype, u16 reason,
                                    bool send_frame, u8 *frame_buf);

/*
 * IEEE80211_PROBE_FLAG_* bit flags.  Each constant occupies its own bit
 * (BIT(0)..BIT(2)), so they can be combined in one mask.
 *
 * NOTE(review): presumably these are the values accepted by the u32
 * "flags" argument of ieee80211_build_preq_ies() and
 * ieee80211_build_probe_req() -- confirm against the callers.
 */
enum {
        IEEE80211_PROBE_FLAG_DIRECTED                = BIT(0),
        IEEE80211_PROBE_FLAG_MIN_CONTENT        = BIT(1),
        IEEE80211_PROBE_FLAG_RANDOM_SN                = BIT(2),
};

int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer,
                             size_t buffer_len,
                             struct ieee80211_scan_ies *ie_desc,
                             const u8 *ie, size_t ie_len,
                             u8 bands_used, u32 *rate_masks,
                             struct cfg80211_chan_def *chandef,
                             u32 flags);
struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
                                          const u8 *src, const u8 *dst,
                                          u32 ratemask,
                                          struct ieee80211_channel *chan,
                                          const u8 *ssid, size_t ssid_len,
                                          const u8 *ie, size_t ie_len,
                                          u32 flags);
u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
                            struct ieee802_11_elems *elems,
                            enum nl80211_band band, u32 *basic_rates);
int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
                                 struct ieee80211_link_data *link,
                                 enum ieee80211_smps_mode smps_mode);
void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata,
                           struct ieee80211_link_data *link);
void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata,
                                  int link_id);

size_t ieee80211_ie_split_vendor(const u8 *ies, size_t ielen, size_t offset);
u8 *ieee80211_ie_build_ht_cap(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
                              u16 cap);
u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
                               const struct cfg80211_chan_def *chandef,
                               u16 prot_mode, bool rifs_mode);
void ieee80211_ie_build_wide_bw_cs(u8 *pos,
                                   const struct cfg80211_chan_def *chandef);
u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
                               u32 cap);
u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
                                const struct cfg80211_chan_def *chandef);
u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata);
u8 *ieee80211_ie_build_he_oper(u8 *pos, struct cfg80211_chan_def *chandef);
u8 *ieee80211_ie_build_eht_oper(u8 *pos, struct cfg80211_chan_def *chandef,
                                const struct ieee80211_sta_eht_cap *eht_cap);
int ieee80211_parse_bitrates(enum nl80211_chan_width width,
                             const struct ieee80211_supported_band *sband,
                             const u8 *srates, int srates_len, u32 *rates);
u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo);
void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata,
                                struct ieee80211_sta_s1g_cap *caps,
                                struct sk_buff *skb);
void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata,
                                  struct sk_buff *skb);

/* element building in SKBs */
int ieee80211_put_srates_elem(struct sk_buff *skb,
                              const struct ieee80211_supported_band *sband,
                              u32 basic_rates, u32 rate_flags, u32 masked_rates,
                              u8 element_id);
int ieee80211_put_he_cap(struct sk_buff *skb,
                         struct ieee80211_sub_if_data *sdata,
                         const struct ieee80211_supported_band *sband,
                         const struct ieee80211_conn_settings *conn);
int ieee80211_put_he_6ghz_cap(struct sk_buff *skb,
                              struct ieee80211_sub_if_data *sdata,
                              enum ieee80211_smps_mode smps_mode);
int ieee80211_put_eht_cap(struct sk_buff *skb,
                          struct ieee80211_sub_if_data *sdata,
                          const struct ieee80211_supported_band *sband,
                          const struct ieee80211_conn_settings *conn);

/* channel management */
bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper,
                               struct cfg80211_chan_def *chandef);
bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info,
                                const struct ieee80211_vht_operation *oper,
                                const struct ieee80211_ht_operation *htop,
                                struct cfg80211_chan_def *chandef);
void ieee80211_chandef_eht_oper(const struct ieee80211_eht_operation_info *info,
                                struct cfg80211_chan_def *chandef);
bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local,
                                    const struct ieee80211_he_operation *he_oper,
                                    const struct ieee80211_eht_operation *eht_oper,
                                    struct cfg80211_chan_def *chandef);
bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper,
                                struct cfg80211_chan_def *chandef);
void ieee80211_chandef_downgrade(struct cfg80211_chan_def *chandef,
                                 struct ieee80211_conn_settings *conn);
/*
 * Run ieee80211_chandef_downgrade() on the operating chandef of
 * @chanreq and then, if @conn->mode is below IEEE80211_CONN_MODE_EHT,
 * clear the AP channel pointer of the request.
 *
 * A NULL @conn triggers a WARN_ON() and skips the AP channel handling.
 * Note that the downgrade helper is called first and therefore still
 * receives the NULL @conn in that case.
 */
static inline void
ieee80211_chanreq_downgrade(struct ieee80211_chan_req *chanreq,
                            struct ieee80211_conn_settings *conn)
{
        ieee80211_chandef_downgrade(&chanreq->oper, conn);

        if (!WARN_ON(!conn) && conn->mode < IEEE80211_CONN_MODE_EHT)
                chanreq->ap.chan = NULL;
}

bool ieee80211_chanreq_identical(const struct ieee80211_chan_req *a,
                                 const struct ieee80211_chan_req *b);

int __must_check
_ieee80211_link_use_channel(struct ieee80211_link_data *link,
                            const struct ieee80211_chan_req *req,
                            enum ieee80211_chanctx_mode mode,
                            bool assign_on_failure);

/*
 * Shorthand for _ieee80211_link_use_channel() with assign_on_failure
 * set to false.  The return value is passed through unchanged and must
 * be checked by the caller.
 */
static inline int __must_check
ieee80211_link_use_channel(struct ieee80211_link_data *link,
                           const struct ieee80211_chan_req *req,
                           enum ieee80211_chanctx_mode mode)
{
        const bool assign_on_failure = false;

        return _ieee80211_link_use_channel(link, req, mode,
                                           assign_on_failure);
}

int __must_check
ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link,
                               const struct ieee80211_chan_req *req,
                               enum ieee80211_chanctx_mode mode,
                               bool radar_required);
int __must_check
ieee80211_link_use_reserved_context(struct ieee80211_link_data *link);
int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link);

int __must_check
ieee80211_link_change_chanreq(struct ieee80211_link_data *link,
                              const struct ieee80211_chan_req *req,
                              u64 *changed);
void __ieee80211_link_release_channel(struct ieee80211_link_data *link,
                                      bool skip_idle_recalc);
void ieee80211_link_release_channel(struct ieee80211_link_data *link);
void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link);
void ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link,
                                          bool clear);
int ieee80211_chanctx_refcount(struct ieee80211_local *local,
                               struct ieee80211_chanctx *ctx);

void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
                                   struct ieee80211_chanctx *chanctx);
void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local,
                                      struct ieee80211_chanctx *ctx,
                                      struct ieee80211_link_data *rsvd_for);
bool ieee80211_is_radar_required(struct ieee80211_local *local);

void ieee80211_dfs_cac_timer_work(struct wiphy *wiphy, struct wiphy_work *work);
void ieee80211_dfs_cac_cancel(struct ieee80211_local *local);
void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy,
                                       struct wiphy_work *work);
int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata,
                              struct cfg80211_csa_settings *csa_settings);

void ieee80211_recalc_dtim(struct ieee80211_local *local,
                           struct ieee80211_sub_if_data *sdata);
int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata,
                                 const struct cfg80211_chan_def *chandef,
                                 enum ieee80211_chanctx_mode chanmode,
                                 u8 radar_detect);
int ieee80211_max_num_channels(struct ieee80211_local *local);
void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
                                       struct ieee80211_chanctx *ctx);

/* TDLS */
int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev,
                        const u8 *peer, int link_id,
                        u8 action_code, u8 dialog_token, u16 status_code,
                        u32 peer_capability, bool initiator,
                        const u8 *extra_ies, size_t extra_ies_len);
int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
                        const u8 *peer, enum nl80211_tdls_operation oper);
void ieee80211_tdls_peer_del_work(struct wiphy *wiphy, struct wiphy_work *wk);
int ieee80211_tdls_channel_switch(struct wiphy *wiphy, struct net_device *dev,
                                  const u8 *addr, u8 oper_class,
                                  struct cfg80211_chan_def *chandef);
void ieee80211_tdls_cancel_channel_switch(struct wiphy *wiphy,
                                          struct net_device *dev,
                                          const u8 *addr);
void ieee80211_teardown_tdls_peers(struct ieee80211_link_data *link);
void ieee80211_tdls_handle_disconnect(struct ieee80211_sub_if_data *sdata,
                                      const u8 *peer, u16 reason);
void
ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata,
                                      struct sk_buff *skb);


const char *ieee80211_get_reason_code_string(u16 reason_code);
u16 ieee80211_encode_usf(int val);
u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
                        enum nl80211_iftype type);

extern const struct ethtool_ops ieee80211_ethtool_ops;

u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw,
                                       struct ieee80211_vif *vif,
                                       struct ieee80211_sta *pubsta,
                                       int len, bool ampdu);
/*
 * debug_noinline expands to noinline when CONFIG_MAC80211_NOINLINE is
 * defined and to nothing otherwise.
 */
#ifdef CONFIG_MAC80211_NOINLINE
#define debug_noinline noinline
#else
#define debug_noinline
#endif

void ieee80211_init_frag_cache(struct ieee80211_fragment_cache *cache);
void ieee80211_destroy_frag_cache(struct ieee80211_fragment_cache *cache);

u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata);

void
ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata,
                                    struct ieee80211_supported_band *sband,
                                    const u8 *he_cap_ie, u8 he_cap_len,
                                    const struct ieee80211_eht_cap_elem *eht_cap_ie_elem,
                                    u8 eht_cap_len,
                                    struct link_sta_info *link_sta);
void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata,
                                    struct ieee80211_mgmt *mgmt, size_t len);
void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata,
                                    struct ieee80211_mgmt *mgmt, size_t len);
int ieee80211_req_neg_ttlm(struct ieee80211_sub_if_data *sdata,
                           struct cfg80211_ttlm_params *params);

void ieee80211_check_wbrf_support(struct ieee80211_local *local);
void ieee80211_add_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef);
void ieee80211_remove_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef);

/*
 * KUnit test visibility helpers.
 *
 * With CONFIG_MAC80211_KUNIT_TEST enabled, EXPORT_SYMBOL_IF_MAC80211_KUNIT()
 * forwards to EXPORT_SYMBOL_IF_KUNIT(), VISIBLE_IF_MAC80211_KUNIT expands
 * to nothing, and the three prototypes below are declared.
 *
 * Without it, the export macro expands to nothing, VISIBLE_IF_MAC80211_KUNIT
 * becomes "static", and the prototypes are not declared here.
 */
#if IS_ENABLED(CONFIG_MAC80211_KUNIT_TEST)
#define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) EXPORT_SYMBOL_IF_KUNIT(sym)
#define VISIBLE_IF_MAC80211_KUNIT
ieee80211_rx_result
ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx);
int ieee80211_calc_chandef_subchan_offset(const struct cfg80211_chan_def *ap,
                                          u8 n_partial_subchans);
void ieee80211_rearrange_tpe_psd(struct ieee80211_parsed_tpe_psd *psd,
                                 const struct cfg80211_chan_def *ap,
                                 const struct cfg80211_chan_def *used);
#else
#define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym)
#define VISIBLE_IF_MAC80211_KUNIT static
#endif

#endif /* IEEE80211_I_H */





















































































    1 



























   22 



































    1 

    1 
























































   22 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Latched RB-trees
 *
 * Copyright (C) 2015 Intel Corp., Peter Zijlstra <peterz@infradead.org>
 *
 * Since RB-trees have non-atomic modifications they're not immediately suited
 * for RCU/lockless queries. Even though we made RB-tree lookups non-fatal for
 * lockless lookups; we cannot guarantee they return a correct result.
 *
 * The simplest solution is a seqlock + RB-tree, this will allow lockless
 * lookups; but has the constraint (inherent to the seqlock) that read sides
 * cannot nest in write sides.
 *
 * If we need to allow unconditional lookups (say as required for NMI context
 * usage) we need a more complex setup; this data structure provides this by
 * employing the latch technique -- see @raw_write_seqcount_latch -- to
 * implement a latched RB-tree which does allow for unconditional lookups by
 * virtue of always having (at least) one stable copy of the tree.
 *
 * However, while we have the guarantee that there is at all times one stable
 * copy, this does not guarantee an iteration will not observe modifications.
 * What might have been a stable copy at the start of the iteration, need not
 * remain so for the duration of the iteration.
 *
 * Therefore, this does require a lockless RB-tree iteration to be non-fatal;
 * see the comment in lib/rbtree.c. Note however that we only require the first
 * condition -- not seeing partial stores -- because the latch thing isolates
 * us from loops. If we were to interrupt a modification the lookup would be
 * pointed at the stable tree and complete while the modification was halted.
 */

#ifndef RB_TREE_LATCH_H
#define RB_TREE_LATCH_H

#include <linux/rbtree.h>
#include <linux/seqlock.h>
#include <linux/rcupdate.h>

/*
 * Per-element linkage for a latch tree: two rb_node instances, where
 * node[idx] is the one linked into tree[idx] of struct latch_tree_root.
 */
struct latch_tree_node {
        struct rb_node node[2];
};

/*
 * Root of a latch tree: a latch sequence counter plus two rb-tree roots.
 * latch_tree_find() uses the low bit of the sequence value to choose
 * which of the two trees to search.
 */
struct latch_tree_root {
        seqcount_latch_t        seq;
        struct rb_root                tree[2];
};

/**
 * struct latch_tree_ops - operators to define the tree order
 * @less: used for insertion; provides the (partial) order between two elements.
 *        Returns true when @a sorts before @b; elements for which it returns
 *        false are placed to the right of @b.
 * @comp: used for lookups; provides the order between the search key and an
 *        element. A negative result continues the search in the left subtree,
 *        a positive result in the right subtree, and 0 reports a match.
 *
 * The operators are related like:
 *
 *        comp(a->key,b) < 0  := less(a,b)
 *        comp(a->key,b) > 0  := less(b,a)
 *        comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
 *
 * If these operators define a partial order on the elements we make no
 * guarantee on which of the elements matching the key is found. See
 * latch_tree_find().
 */
struct latch_tree_ops {
        bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b);
        int  (*comp)(void *key,                 struct latch_tree_node *b);
};

/*
 * Map @node, which must be the node[@idx] member of a
 * struct latch_tree_node, back to that containing structure.
 */
static __always_inline struct latch_tree_node *
__lt_from_rb(struct rb_node *node, int idx)
{
        struct latch_tree_node *owner;

        owner = container_of(node, struct latch_tree_node, node[idx]);

        return owner;
}

/*
 * Insert @ltn into tree[@idx] of @ltr, ordered by @less.
 *
 * The descent goes left when @ltn is "less" than the element at the
 * current position and right otherwise, so an element that compares
 * equal to an existing one ends up in that element's right subtree.
 * The new node is linked with rb_link_node_rcu() and the tree is then
 * rebalanced with rb_insert_color().
 */
static __always_inline void
__lt_insert(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx,
            bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b))
{
        struct rb_root *tree = &ltr->tree[idx];
        struct rb_node *fresh = &ltn->node[idx];
        struct rb_node **slot = &tree->rb_node;
        struct rb_node *above = NULL;

        /* Walk down to the empty slot where the new node belongs. */
        while (*slot) {
                above = *slot;
                slot = less(ltn, __lt_from_rb(above, idx)) ?
                        &above->rb_left : &above->rb_right;
        }

        rb_link_node_rcu(fresh, above, slot);
        rb_insert_color(fresh, tree);
}

/* Unlink node[@idx] of @ltn from tree[@idx] of @ltr via rb_erase(). */
static __always_inline void
__lt_erase(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx)
{
        struct rb_node *victim = &ltn->node[idx];
        struct rb_root *tree = &ltr->tree[idx];

        rb_erase(victim, tree);
}

/*
 * Search tree[@idx] of @ltr for an element matching @key.
 *
 * @comp decides the direction at every step: a negative result
 * descends left, a positive result descends right, and 0 ends the
 * search with the current element.  Every pointer followed on the way
 * down is loaded with rcu_dereference_raw().
 *
 * Returns the matching element, or NULL when the descent runs off the
 * tree without a match.
 */
static __always_inline struct latch_tree_node *
__lt_find(void *key, struct latch_tree_root *ltr, int idx,
          int (*comp)(void *key, struct latch_tree_node *node))
{
        struct rb_node *cur = rcu_dereference_raw(ltr->tree[idx].rb_node);

        while (cur) {
                struct latch_tree_node *entry = __lt_from_rb(cur, idx);
                int order = comp(key, entry);

                if (!order)
                        return entry;

                if (order < 0)
                        cur = rcu_dereference_raw(cur->rb_left);
                else
                        cur = rcu_dereference_raw(cur->rb_right);
        }

        return NULL;
}

/**
 * latch_tree_insert() - insert @node into the trees @root
 * @node: nodes to insert
 * @root: trees to insert @node into
 * @ops: operators defining the node order
 *
 * It inserts @node into @root in an ordered fashion such that we can always
 * observe one complete tree. See the comment for raw_write_seqcount_latch().
 *
 * The inserts use rcu_assign_pointer() to publish the element such that the
 * tree structure is stored before we can observe the new @node.
 *
 * All modifications (latch_tree_insert, latch_tree_remove) are assumed to be
 * serialized.
 */
static __always_inline void
latch_tree_insert(struct latch_tree_node *node,
                  struct latch_tree_root *root,
                  const struct latch_tree_ops *ops)
{
        raw_write_seqcount_latch(&root->seq);
        __lt_insert(node, root, 0, ops->less);
        raw_write_seqcount_latch(&root->seq);
        __lt_insert(node, root, 1, ops->less);
}

/**
 * latch_tree_erase() - removes @node from the trees @root
 * @node: nodes to remote
 * @root: trees to remove @node from
 * @ops: operators defining the node order
 *
 * Removes @node from the trees @root in an ordered fashion such that we can
 * always observe one complete tree. See the comment for
 * raw_write_seqcount_latch().
 *
 * It is assumed that @node will observe one RCU quiescent state before being
 * reused of freed.
 *
 * All modifications (latch_tree_insert, latch_tree_remove) are assumed to be
 * serialized.
 */
static __always_inline void
latch_tree_erase(struct latch_tree_node *node,
                 struct latch_tree_root *root,
                 const struct latch_tree_ops *ops)
{
        raw_write_seqcount_latch(&root->seq);
        __lt_erase(node, root, 0);
        raw_write_seqcount_latch(&root->seq);
        __lt_erase(node, root, 1);
}

/**
 * latch_tree_find() - find the node matching @key in the trees @root
 * @key: search key
 * @root: trees to search for @key
 * @ops: operators defining the node order
 *
 * Does a lockless lookup in the trees @root for the node matching @key.
 * The low bit of the latch sequence value selects which of the two trees
 * is searched, and the lookup is repeated for as long as
 * raw_read_seqcount_latch_retry() asks for a retry.
 *
 * It is assumed that this is called while holding the appropriate RCU read
 * side lock.
 *
 * If the operators define a partial order on the elements (there are multiple
 * elements which have the same key value) it is undefined which of these
 * elements will be found. Nor is it possible to iterate the tree to find
 * further elements with the same key value.
 *
 * Returns: a pointer to the node matching @key or NULL.
 */
static __always_inline struct latch_tree_node *
latch_tree_find(void *key, struct latch_tree_root *root,
                const struct latch_tree_ops *ops)
{
        for (;;) {
                unsigned int snap = raw_read_seqcount_latch(&root->seq);
                struct latch_tree_node *hit;

                hit = __lt_find(key, root, snap & 1, ops->comp);

                /* Done unless the sequence moved under the lookup. */
                if (!raw_read_seqcount_latch_retry(&root->seq, snap))
                        return hit;
        }
}

#endif /* RB_TREE_LATCH_H */






























































































    1 






    2 










    2 






    2 









    2 











    2 

























































    2 
    2 






    2 







    2 





    2 






    2 
    2 



    2 

    2 
    2 




















    1 






































    2 




    2 






    1 




    2 



    2 


    2 










































































































    2 
    2 




    2 
























    2 











    2 
    2 

    2 





    2 






















    2 


    2 







    2 



    2 





    2 











    2 
    2 











    2 





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
// SPDX-License-Identifier: GPL-2.0-only
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                The Internet Protocol (IP) output module.
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Donald Becker, <becker@super.org>
 *                Alan Cox, <Alan.Cox@linux.org>
 *                Richard Underwood
 *                Stefan Becker, <stefanb@yello.ping.de>
 *                Jorge Cwik, <jorge@laser.satlink.net>
 *                Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *                Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *        See ip_input.c for original log
 *
 *        Fixes:
 *                Alan Cox        :        Missing nonblock feature in ip_build_xmit.
 *                Mike Kilburn        :        htons() missing in ip_build_xmit.
 *                Bradford Johnson:        Fix faulty handling of some frames when
 *                                        no route is found.
 *                Alexander Demenshin:        Missing sk/skb free in ip_queue_xmit
 *                                        (in case if packet not accepted by
 *                                        output firewall rules)
 *                Mike McLagan        :        Routing by source
 *                Alexey Kuznetsov:        use new route cache
 *                Andi Kleen:                Fix broken PMTU recovery and remove
 *                                        some redundant tests.
 *        Vitaly E. Lavrov        :        Transparent proxy revived after year coma.
 *                Andi Kleen        :         Replace ip_reply with ip_send_reply.
 *                Andi Kleen        :        Split fast and slow ip_build_xmit path
 *                                        for decreased register pressure on x86
 *                                        and more readability.
 *                Marc Boucher        :        When call_out_firewall returns FW_QUEUE,
 *                                        silently drop skb instead of failing with -EPERM.
 *                Detlev Wengorz        :        Copy protocol for fragments.
 *                Hirokazu Takahashi:        HW checksumming for outgoing UDP
 *                                        datagrams.
 *                Hirokazu Takahashi:        sendfile() on UDP works now.
 */

#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/gso.h>
#include <net/inetpeer.h>
#include <net/inet_ecn.h>
#include <net/lwtunnel.h>
#include <linux/bpf-cgroup.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

/* Forward declaration: ip_fragment() is defined further down in this file,
 * but the output-finishing helpers that precede its definition call it.
 * @output is the transmit routine that ip_fragment() hands on to
 * ip_do_fragment().
 */
static int
ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
            unsigned int mtu,
            int (*output)(struct net *, struct sock *, struct sk_buff *));

/* Generate a checksum for an outgoing IP datagram. */
void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);

/*
 * Finalise a locally generated IPv4 packet and run it through the
 * NF_INET_LOCAL_OUT netfilter hook.
 *
 * The total-length field is set from skb->len and the header checksum is
 * refreshed before the packet is offered to l3mdev_ip_out().
 *
 * Returns 0 when l3mdev_ip_out() hands back no skb, otherwise whatever
 * nf_hook() returns.
 */
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *out_dev;
        struct iphdr *hdr;

        IP_INC_STATS(net, IPSTATS_MIB_OUTREQUESTS);

        hdr = ip_hdr(skb);
        iph_set_totlen(hdr, skb->len);
        ip_send_check(hdr);

        /* An egress device enslaved to an L3 master device gets its
         * handler run here; it may keep the skb for itself.
         */
        skb = l3mdev_ip_out(sk, skb);
        if (unlikely(!skb))
                return 0;

        skb->protocol = htons(ETH_P_IP);
        out_dev = skb_dst(skb)->dev;

        return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb,
                       NULL, out_dev, dst_output);
}

/*
 * Send a locally generated IPv4 packet.
 *
 * Runs __ip_local_out() and, only when that returns 1, continues with
 * dst_output(). Any other value from __ip_local_out() is returned as is.
 */
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int rc = __ip_local_out(net, sk, skb);

        if (unlikely(rc != 1))
                return rc;

        return dst_output(net, sk, skb);
}
EXPORT_SYMBOL_GPL(ip_local_out);

/*
 * Pick the TTL for an outgoing packet: the socket's uc_ttl when it is
 * non-negative, otherwise the hop limit reported by ip4_dst_hoplimit()
 * for @dst.
 */
static inline int ip_select_ttl(const struct inet_sock *inet,
                                const struct dst_entry *dst)
{
        int sock_ttl = READ_ONCE(inet->uc_ttl);

        return sock_ttl < 0 ? ip4_dst_hoplimit(dst) : sock_ttl;
}

/*
 *        Prepend an IPv4 header (and the IP options in @opt, if any) to
 *        @skb and send it with ip_local_out().
 *
 *        The route is taken from the skb itself (skb_rtable()); @saddr,
 *        @daddr and @tos fill the matching header fields. When @opt carries
 *        a source route, opt->opt.faddr is written as the header's
 *        destination instead of @daddr.
 *
 *        Returns the result of ip_local_out().
 */
int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
                          __be32 saddr, __be32 daddr, struct ip_options_rcu *opt,
                          u8 tos)
{
        const struct inet_sock *inet = inet_sk(sk);
        struct rtable *rt = skb_rtable(skb);
        struct net *net = sock_net(sk);
        unsigned int optlen = opt ? opt->opt.optlen : 0;
        struct iphdr *iph;

        /* Open up room for the fixed header plus options and fill it in. */
        skb_push(skb, sizeof(struct iphdr) + optlen);
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);

        iph->version  = 4;
        iph->ihl      = 5;
        iph->tos      = tos;
        iph->protocol = sk->sk_protocol;
        iph->ttl      = ip_select_ttl(inet, &rt->dst);
        iph->saddr    = saddr;
        if (opt && opt->opt.srr)
                iph->daddr = opt->opt.faddr;
        else
                iph->daddr = daddr;

        if (skb->len > IPV4_MIN_MTU && !ip_dont_fragment(sk, &rt->dst)) {
                iph->frag_off = 0;
                /* TCP packets here are SYNACK with fat IPv4/TCP options.
                 * Avoid using the hashed IP ident generator.
                 */
                if (sk->sk_protocol == IPPROTO_TCP)
                        iph->id = (__force __be16)get_random_u16();
                else
                        __ip_select_ident(net, iph, 1);
        } else {
                /* Small packets (eg SYNACK) and DF packets get no IPID. */
                iph->frag_off = htons(IP_DF);
                iph->id = 0;
        }

        if (optlen) {
                /* ihl counts 32-bit words, optlen counts bytes. */
                iph->ihl += optlen >> 2;
                ip_options_build(skb, &opt->opt, daddr, rt);
        }

        skb->priority = READ_ONCE(sk->sk_priority);
        /* A mark already present on the skb wins over the socket's. */
        if (!skb->mark)
                skb->mark = READ_ONCE(sk->sk_mark);

        return ip_local_out(net, skb->sk, skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

/*
 * Last IPv4 output step: update the output counters, make sure there is
 * headroom for the link-layer header, let a lightweight tunnel take the
 * packet over if the route asks for it, then look up the next-hop
 * neighbour and pass the skb to neigh_output().
 *
 * Returns -ENOMEM when the head cannot be expanded, the lwtunnel_xmit()
 * result when it is not LWTUNNEL_XMIT_CONTINUE, the neigh_output() result
 * on the normal path, or the error encoded in the neighbour pointer (the
 * skb is freed here in that case).
 */
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct rtable *rt = dst_rtable(dst);
        struct net_device *dev = dst->dev;
        unsigned int hh_len = LL_RESERVED_SPACE(dev);
        struct neighbour *neigh;
        bool is_v6gw = false;
        int res;

        switch (rt->rt_type) {
        case RTN_MULTICAST:
                IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
                break;
        case RTN_BROADCAST:
                IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
                break;
        default:
                break;
        }

        /* OUTOCTETS should be counted after fragment */
        IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);

        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                skb = skb_expand_head(skb, hh_len);
                if (!skb)
                        return -ENOMEM;
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                res = lwtunnel_xmit(skb);
                if (res != LWTUNNEL_XMIT_CONTINUE)
                        return res;
        }

        rcu_read_lock();
        neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
        if (IS_ERR(neigh)) {
                rcu_read_unlock();

                net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
                                    __func__);
                kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
                return PTR_ERR(neigh);
        }

        sock_confirm_neigh(skb, neigh);
        /* if crossing protocols, can not use the cached header */
        res = neigh_output(neigh, skb, is_v6gw);
        rcu_read_unlock();

        return res;
}

/*
 * Output path for GSO skbs.
 *
 * When every segment would fit in @mtu the skb goes straight to
 * ip_finish_output2(). Otherwise it is segmented in software and each
 * segment is passed to ip_fragment() individually.
 *
 * Returns the ip_finish_output2() result on the fast path, -ENOMEM when
 * segmentation fails (the skb is freed), or the first non-zero error
 * reported for any segment (0 if none failed).
 */
static int ip_finish_output_gso(struct net *net, struct sock *sk,
                                struct sk_buff *skb, unsigned int mtu)
{
        struct sk_buff *seg_list, *seg, *next_seg;
        netdev_features_t features;
        int first_err = 0;

        /* Common case: the segment length is within the MTU. */
        if (skb_gso_validate_network_len(skb, mtu))
                return ip_finish_output2(net, sk, skb);

        /* Slow path - the GSO segment length exceeds the egress MTU.
         *
         * Situations that lead here:
         *  - a forwarded TCP GRO skb without the DF flag;
         *  - a forwarded skb that came in on a virtualization interface
         *    (virtio-net/vhost/tap) whose TSO/GSO size was chosen by
         *    another network stack;
         *  - a local GSO skb sent on an NETIF_F_TSO tunnel stacked over an
         *    interface with a smaller MTU;
         *  - an arriving GRO skb (or GSO skb in a virtualized environment)
         *    bridged to an NETIF_F_TSO tunnel stacked over an interface
         *    with an insufficient MTU.
         */
        BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET);
        features = netif_skb_features(skb) & ~NETIF_F_GSO_MASK;
        seg_list = skb_gso_segment(skb, features);
        if (IS_ERR_OR_NULL(seg_list)) {
                kfree_skb(skb);
                return -ENOMEM;
        }

        consume_skb(skb);

        skb_list_walk_safe(seg_list, seg, next_seg) {
                int err;

                skb_mark_not_on_list(seg);
                err = ip_fragment(net, sk, seg, mtu, ip_finish_output2);

                /* Keep going after a failure, but remember the first one. */
                if (!first_err)
                        first_err = err;
        }

        return first_err;
}

/*
 * Choose how to finish transmitting @skb: re-enter dst_output() when the
 * dst has an xfrm attached (only with CONFIG_NETFILTER and CONFIG_XFRM),
 * take the GSO path for GSO skbs, fragment when the packet exceeds the MTU
 * or has a frag_max_size recorded, and otherwise transmit directly.
 */
static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm) {
                IPCB(skb)->flags |= IPSKB_REROUTED;
                return dst_output(net, sk, skb);
        }
#endif
        mtu = ip_skb_dst_mtu(sk, skb);

        if (skb_is_gso(skb))
                return ip_finish_output_gso(net, sk, skb, mtu);

        if (skb->len <= mtu && !IPCB(skb)->frag_max_size)
                return ip_finish_output2(net, sk, skb);

        return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
}

/*
 * Run the cgroup BPF egress program and, unless it rejects the packet,
 * continue with __ip_finish_output().
 *
 * Verdict handling:
 *  - NET_XMIT_SUCCESS: return the __ip_finish_output() result;
 *  - NET_XMIT_CN: return the __ip_finish_output() result when it is
 *    non-zero, otherwise NET_XMIT_CN;
 *  - anything else: free the skb and return the verdict.
 */
static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int verdict, err;

        verdict = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        if (verdict != NET_XMIT_SUCCESS && verdict != NET_XMIT_CN) {
                kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
                return verdict;
        }

        err = __ip_finish_output(net, sk, skb);
        if (verdict == NET_XMIT_CN && !err)
                return verdict;

        return err;
}

/*
 * Finish output for a looped-back multicast/broadcast copy.
 *
 * The cgroup BPF egress program runs first; any verdict other than
 * NET_XMIT_SUCCESS or NET_XMIT_CN frees the skb and is returned. The skb
 * is then handed to dev_loopback_xmit().
 *
 * Returns the dev_loopback_xmit() result, except that a NET_XMIT_CN
 * verdict replaces a non-zero result.
 */
static int ip_mc_finish_output(struct net *net, struct sock *sk,
                               struct sk_buff *skb)
{
        struct rtable *loop_rt;
        bool congested;
        int verdict, err;

        verdict = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        if (verdict != NET_XMIT_SUCCESS && verdict != NET_XMIT_CN) {
                kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
                return verdict;
        }
        congested = (verdict == NET_XMIT_CN);

        /* Reset rt_iif so that inet_iif() will return skb->skb_iif. Setting
         * this to non-zero causes ipi_ifindex in in_pktinfo to be overwritten,
         * see ipv4_pktinfo_prepare().
         */
        loop_rt = rt_dst_clone(net->loopback_dev, skb_rtable(skb));
        if (loop_rt) {
                loop_rt->rt_iif = 0;
                skb_dst_drop(skb);
                skb_dst_set(skb, &loop_rt->dst);
        }

        err = dev_loopback_xmit(net, sk, skb);
        if (congested && err)
                return verdict;

        return err;
}

/*
 * Output routine for routes flagged multicast and/or broadcast.
 *
 * Besides sending @skb through the NF_INET_POST_ROUTING hook towards
 * ip_finish_output(), this may clone the skb and push the clone through the
 * same hook towards ip_mc_finish_output(), which loops it back locally.
 *
 * Returns 0 when a multicast packet with TTL 0 is dropped here, otherwise
 * the result of NF_HOOK_COND() for the original skb.
 */
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = rt->dst.dev;

        /*
         *        Record the route's output device and the protocol on the skb.
         */
        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        /*
         *        Multicasts are looped back for other local users
         */

        if (rt->rt_flags&RTCF_MULTICAST) {
                if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loop back non-local frames
                   that returned after forwarding; they will be dropped
                   by ip_mr_input in any case.
                   Note that local frames are looped back to be delivered
                   to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                    &&
                    ((rt->rt_flags & RTCF_LOCAL) ||
                     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
                   ) {
                        /* A failed clone just skips the loopback copy. */
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        ip_mc_finish_output);
                }

                /* Multicasts with ttl 0 must not go beyond the host */

                if (ip_hdr(skb)->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        /* Broadcast routes always get a looped-back copy (when the clone
         * succeeds); there is no sk_mc_loop() check on this path.
         */
        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
                                net, sk, newskb, NULL, newskb->dev,
                                ip_mc_finish_output);
        }

        /* The condition argument is false for skbs already flagged
         * IPSKB_REROUTED.
         */
        return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, skb->dev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

/*
 * Standard IPv4 output routine: point the skb at the route's device, then
 * run the NF_INET_POST_ROUTING hook with ip_finish_output() as the
 * continuation. The device the skb carried on entry is passed to the hook
 * as the input device. The hook's condition argument is false for skbs
 * already flagged IPSKB_REROUTED.
 */
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *indev = skb->dev;
        struct net_device *outdev = skb_dst(skb)->dev;

        skb->dev = outdev;
        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
                            net, sk, skb, indev, outdev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}
EXPORT_SYMBOL(ip_output);

/*
 * Copy the source and destination addresses from the flow into the IP
 * header, possibly using 64bit load/stores.
 *
 * The build-time check guarantees that daddr immediately follows saddr
 * in struct flowi4.
 */
static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
        BUILD_BUG_ON(offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr) !=
                     offsetof(typeof(*fl4), daddr));

        iph->daddr = fl4->daddr;
        iph->saddr = fl4->saddr;
}

/*
 * Route (if necessary), build the IPv4 header for, and transmit @skb on
 * behalf of socket @sk, using @tos for the header's TOS byte.
 *
 * @fl is used through its IPv4 member: it is handed to the route lookup
 * when one is performed, and its saddr/daddr are copied into the header.
 *
 * Returns the result of ip_local_out(), or -EHOSTUNREACH when no usable
 * route exists (the skb is freed and IPSTATS_MIB_OUTNOROUTES is bumped).
 *
 * Note: skb->sk can be different from sk, in case of tunnels
 */
int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
                    __u8 tos)
{
        struct inet_sock *inet = inet_sk(sk);
        struct net *net = sock_net(sk);
        struct ip_options_rcu *inet_opt;
        struct flowi4 *fl4;
        struct rtable *rt;
        struct iphdr *iph;
        int res;

        /* Skip the route lookup if the packet is already routed,
         * f.e. by something like SCTP.
         */
        /* The RCU read section covers inet_opt until ip_local_out() returns. */
        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        fl4 = &fl->u.ip4;
        rt = skb_rtable(skb);
        if (rt)
                goto packet_routed;

        /* Make sure we can route this packet. */
        rt = dst_rtable(__sk_dst_check(sk, 0));
        if (!rt) {
                __be32 daddr;

                /* Use correct destination address if we have options. */
                daddr = inet->inet_daddr;
                if (inet_opt && inet_opt->opt.srr)
                        daddr = inet_opt->opt.faddr;

                /* If this fails, retransmit mechanism of transport layer will
                 * keep trying until route appears or the connection times
                 * itself out.
                 */
                rt = ip_route_output_ports(net, fl4, sk,
                                           daddr, inet->inet_saddr,
                                           inet->inet_dport,
                                           inet->inet_sport,
                                           sk->sk_protocol,
                                           RT_TOS(tos),
                                           sk->sk_bound_dev_if);
                if (IS_ERR(rt))
                        goto no_route;
                sk_setup_caps(sk, &rt->dst);
        }
        skb_dst_set_noref(skb, &rt->dst);

packet_routed:
        /* A strict source route cannot be honoured through a gateway. */
        if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
                goto no_route;

        /* OK, we know where to send it, allocate and build IP header. */
        skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        /* One 16-bit store sets version = 4, ihl = 5 and the TOS byte. */
        *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
        if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->dst);
        iph->protocol = sk->sk_protocol;
        ip_copy_addrs(iph, fl4);

        /* Transport layer set skb->h.foo itself. */

        if (inet_opt && inet_opt->opt.optlen) {
                /* ihl counts 32-bit words, optlen counts bytes. */
                iph->ihl += inet_opt->opt.optlen >> 2;
                ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt);
        }

        /* Pass the skb's gso_segs count, or 1 when that count is zero. */
        ip_select_ident_segs(net, skb, sk,
                             skb_shinfo(skb)->gso_segs ?: 1);

        /* TODO : should we use skb->sk here instead of sk ? */
        skb->priority = READ_ONCE(sk->sk_priority);
        skb->mark = READ_ONCE(sk->sk_mark);

        res = ip_local_out(net, sk, skb);
        rcu_read_unlock();
        return res;

no_route:
        /* Reached with the RCU read lock still held. */
        rcu_read_unlock();
        IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
        kfree_skb_reason(skb, SKB_DROP_REASON_IP_OUTNOROUTES);
        return -EHOSTUNREACH;
}
EXPORT_SYMBOL(__ip_queue_xmit);

int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
        return __ip_queue_xmit(sk, skb, fl, READ_ONCE(inet_sk(sk)->tos));
}
EXPORT_SYMBOL(ip_queue_xmit);

/*
 * Give fragment @to the same per-packet metadata as @from: route, device,
 * classification fields, hash, netfilter state, skb extensions and
 * security mark.
 */
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        /* Drop whatever dst @to holds before copying @from's. */
        skb_dst_drop(to);
        skb_dst_copy(to, from);

        to->dev = from->dev;
        to->skb_iif = from->skb_iif;
        to->protocol = from->protocol;
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->mark = from->mark;
#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#if IS_ENABLED(CONFIG_IP_VS)
        to->ipvs_property = from->ipvs_property;
#endif

        skb_copy_hash(to, from);
        nf_copy(to, from);
        skb_ext_copy(to, from);
        skb_copy_secmark(to, from);
}

static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                       unsigned int mtu,
                       int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct iphdr *iph = ip_hdr(skb);

        if ((iph->frag_off & htons(IP_DF)) == 0)
                return ip_do_fragment(net, sk, skb, output);

        if (unlikely(!skb->ignore_df ||
                     (IPCB(skb)->frag_max_size &&
                      IPCB(skb)->frag_max_size > mtu))) {
                IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(mtu));
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        return ip_do_fragment(net, sk, skb, output);
}

/*
 * Start frag_list based fragmentation of @skb.
 *
 * The frag_list is detached from @skb and remembered in @iter together
 * with the header pointer and header length. @skb itself is trimmed to
 * its head plus page fragments and becomes the first fragment: its
 * tot_len is updated, frag_off is set to IP_MF, and the header checksum
 * is recomputed.
 */
void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
                      unsigned int hlen, struct ip_fraglist_iter *iter)
{
        const unsigned int first_len = skb_pagelen(skb);
        /* Taken before skb->len and skb->data_len are rewritten below. */
        const unsigned int paged_len = first_len - skb_headlen(skb);

        iter->iph = iph;
        iter->hlen = hlen;
        iter->offset = 0;
        iter->frag = skb_shinfo(skb)->frag_list;
        skb_frag_list_init(skb);

        skb->data_len = paged_len;
        skb->len = first_len;

        iph->tot_len = htons(first_len);
        iph->frag_off = htons(IP_MF);
        ip_send_check(iph);
}
EXPORT_SYMBOL(ip_fraglist_init);

void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
{
        unsigned int hlen = iter->hlen;
        struct iphdr *iph = iter->iph;
        struct sk_buff *frag;

        frag = iter->frag;
        frag->ip_summed = CHECKSUM_NONE;
        skb_reset_transport_header(frag);
        __skb_push(frag, hlen);
        skb_reset_network_header(frag);
        memcpy(skb_network_header(frag), iph, hlen);
        iter->iph = ip_hdr(frag);
        iph = iter->iph;
        iph->tot_len = htons(frag->len);
        ip_copy_metadata(frag, skb);
        iter->offset += skb->len - hlen;
        iph->frag_off = htons(iter->offset >> 3);
        if (frag->next)
                iph->frag_off |= htons(IP_MF);
        /* Ready, complete checksum */
        ip_send_check(iph);
}
EXPORT_SYMBOL(ip_fraglist_prepare);

/*
 * Initialise @state for copy-based fragmentation of @skb.
 *
 * @hlen, @ll_rs, @mtu and @DF are stored unchanged. The remaining fields
 * are derived from the skb: the number of bytes after the header still to
 * be sent, the read position inside the skb, and the starting fragment
 * offset and "more fragments" bit taken from the existing IP header.
 */
void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
                  unsigned int ll_rs, unsigned int mtu, bool DF,
                  struct ip_frag_state *state)
{
        const struct iphdr *iph = ip_hdr(skb);

        state->hlen = hlen;
        state->ll_rs = ll_rs;
        state->mtu = mtu;
        state->DF = DF;

        /* Copying starts right after the header and covers the rest. */
        state->ptr = hlen;
        state->left = skb->len - hlen;

        /* The header stores the offset in 8-byte units. */
        state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        state->not_last_frag = iph->frag_off & htons(IP_MF);
}
EXPORT_SYMBOL(ip_frag_init);

/*
 * Propagate the IPCB flags from @from to fragment @to and, for the first
 * fragment only, run ip_options_fragment() on @from.
 */
static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
                         bool first_frag)
{
        /* Every fragment carries the same flags as the original. */
        IPCB(to)->flags = IPCB(from)->flags;

        if (!first_frag)
                return;

        /* ANK: dirty, but effective trick. Upgrade options only if
         * the segment to be fragmented was THE FIRST (otherwise,
         * options are already fixed) and make it ONCE
         * on the initial skb, so that all the following fragments
         * will inherit fixed options.
         */
        ip_options_fragment(from);
}

/*
 * Build the next fragment of @skb by copying.
 *
 * A new skb is allocated, the IP header and the next block of payload are
 * copied into it, and @state is advanced (left, ptr, offset) past the
 * copied block. The fragment's frag_off, tot_len and checksum are filled
 * in before it is returned.
 *
 * Returns the new fragment, or ERR_PTR(-ENOMEM) when the allocation fails
 * (@state is untouched in that case).
 */
struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
{
        unsigned int len = state->left;
        struct sk_buff *skb2;
        struct iphdr *iph;

        /* IF: it doesn't fit, use 'mtu' - the data space left */
        if (len > state->mtu)
                len = state->mtu;
        /* IF: we are not sending up to and including the packet end
           then align the next start on an eight byte boundary */
        if (len < state->left)        {
                len &= ~7;
        }

        /* Allocate buffer: payload + IP header + link-layer reserve */
        skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC);
        if (!skb2)
                return ERR_PTR(-ENOMEM);

        /*
         *        Set up data on packet: the link-layer reserve becomes
         *        headroom, followed by the IP header and the payload.
         */

        ip_copy_metadata(skb2, skb);
        skb_reserve(skb2, state->ll_rs);
        skb_put(skb2, len + state->hlen);
        skb_reset_network_header(skb2);
        skb2->transport_header = skb2->network_header + state->hlen;

        /*
         *        Charge the memory for the fragment to any owner
         *        it might possess
         */

        if (skb->sk)
                skb_set_owner_w(skb2, skb->sk);

        /*
         *        Copy the packet header into the new buffer.
         */

        skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen);

        /*
         *        Copy a block of the IP datagram. A failed copy is treated
         *        as a fatal bug.
         */
        if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len))
                BUG();
        state->left -= len;

        /*
         *        Fill in the new header fields. frag_off is expressed in
         *        8-byte units.
         */
        iph = ip_hdr(skb2);
        iph->frag_off = htons((state->offset >> 3));
        if (state->DF)
                iph->frag_off |= htons(IP_DF);

        /*
         *        Added AC : If we are fragmenting a fragment that's not the
         *                   last fragment then keep MF on each fragment
         */
        if (state->left > 0 || state->not_last_frag)
                iph->frag_off |= htons(IP_MF);
        state->ptr += len;
        state->offset += len;

        iph->tot_len = htons(len + state->hlen);

        ip_send_check(iph);

        return skb2;
}
EXPORT_SYMBOL(ip_frag_next);

/*
 *        This IP datagram is too large to be sent in one piece.  Break it up into
 *        smaller pieces (each of size equal to IP header plus
 *        a block of the data of the original IP data part) that will yet fit in a
 *        single device frame, and queue such a frame for sending.
 *
 *        @net, @sk: passed through unchanged to @output (and used for MIB counters).
 *        @skb:      the datagram to fragment; its IP header is read via ip_hdr().
 *        @output:   callback invoked once per generated fragment.
 *
 *        Two strategies are used:
 *         - fast path: if @skb already carries a frag_list whose geometry is
 *           usable as-is, the head skb and each frag_list member are sent as
 *           individual fragments without copying payload;
 *         - slow path: new skbs are built with ip_frag_next(), each one is
 *           sent, and the original @skb is released afterwards.
 *
 *        Returns 0 on success, otherwise the first error reported by
 *        skb_checksum_help(), ip_frag_next() or @output.
 */

int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                   int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct iphdr *iph;
        struct sk_buff *skb2;
        u8 tstamp_type = skb->tstamp_type;
        struct rtable *rt = skb_rtable(skb);
        unsigned int mtu, hlen, ll_rs;
        struct ip_fraglist_iter iter;
        ktime_t tstamp = skb->tstamp;
        struct ip_frag_state state;
        int err = 0;

        /* for offloaded checksums cleanup checksum before fragmentation */
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        /*
         *        Point into the IP datagram header.
         */

        iph = ip_hdr(skb);

        /* Use the path MTU, clamped to frag_max_size when that is set and smaller. */
        mtu = ip_skb_dst_mtu(sk, skb);
        if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
                mtu = IPCB(skb)->frag_max_size;

        /*
         *        Setup starting values.
         */

        hlen = iph->ihl * 4;
        mtu = mtu - hlen;        /* Size of data space */
        IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
        /* Headroom each fragment needs, per LL_RESERVED_SPACE() of the output device. */
        ll_rs = LL_RESERVED_SPACE(rt->dst.dev);

        /* When frag_list is given, use it. First, check its validity:
         * some transformers could create wrong frag_list or break existing
         * one, it is not prohibited. In this case fall back to copying.
         *
         * LATER: this step can be merged to real generation of fragments,
         * we can switch to copy when see the first bad fragment.
         */
        if (skb_has_frag_list(skb)) {
                struct sk_buff *frag, *frag2;
                unsigned int first_len = skb_pagelen(skb);

                /* The head must fit the MTU, carry a multiple of 8 payload
                 * bytes, not already be a fragment, not be cloned and have
                 * enough headroom; otherwise copy.
                 */
                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    ip_is_fragment(iph) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < ll_rs)
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen + ll_rs)
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        /* Hand socket ownership to the frag and move its
                         * truesize off the head; slow_path_clean undoes this.
                         */
                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                /* Everything is OK. Generate! */
                ip_fraglist_init(skb, iph, hlen, &iter);

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (iter.frag) {
                                bool first_frag = (iter.offset == 0);

                                IPCB(iter.frag)->flags = IPCB(skb)->flags;
                                ip_fraglist_prepare(skb, &iter);
                                if (first_frag && IPCB(skb)->opt.optlen) {
                                        /* ipcb->opt is not populated for frags
                                         * coming from __ip_make_skb(),
                                         * ip_options_fragment() needs optlen
                                         */
                                        IPCB(iter.frag)->opt.optlen =
                                                IPCB(skb)->opt.optlen;
                                        ip_options_fragment(iter.frag);
                                        ip_send_check(iter.iph);
                                }
                        }

                        /* Every fragment carries the original delivery time. */
                        skb_set_delivery_time(skb, tstamp, tstamp_type);
                        err = output(net, sk, skb);

                        if (!err)
                                IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
                        if (err || !iter.frag)
                                break;

                        skb = ip_fraglist_next(&iter);
                }

                if (err == 0) {
                        IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                /* Drop the fragments that were never handed to output(). */
                kfree_skb_list(iter.frag);

                IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
                return err;

slow_path_clean:
                /* Revert the ownership/truesize changes made to every frag
                 * visited before the one that failed validation.
                 */
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        /*
         *        Fragment the datagram.
         */

        ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU,
                     &state);

        /*
         *        Keep copying data until we run out.
         */

        while (state.left > 0) {
                /* Sampled before ip_frag_next() advances state.offset. */
                bool first_frag = (state.offset == 0);

                skb2 = ip_frag_next(skb, &state);
                if (IS_ERR(skb2)) {
                        err = PTR_ERR(skb2);
                        goto fail;
                }
                ip_frag_ipcb(skb, skb2, first_frag);

                /*
                 *        Put this fragment into the sending queue.
                 */
                skb_set_delivery_time(skb2, tstamp, tstamp_type);
                err = output(net, sk, skb2);
                if (err)
                        goto fail;

                IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
        }
        /* All data was copied into new skbs; the original is no longer needed. */
        consume_skb(skb);
        IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
        return err;
}
EXPORT_SYMBOL(ip_do_fragment);

/*
 *        Default getfrag callback: @from is a struct msghdr whose iterator
 *        supplies @len bytes that are copied to @to.  Unless the skb is marked
 *        CHECKSUM_PARTIAL, the copied bytes are also folded into skb->csum
 *        (with @odd passed as the block offset).
 *
 *        Returns 0 on success or -EFAULT if the copy could not be completed.
 */
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
        struct msghdr *msg = from;
        __wsum partial = 0;

        /* Checksum is left to a later stage: a plain copy is enough. */
        if (skb->ip_summed == CHECKSUM_PARTIAL)
                return copy_from_iter_full(to, len, &msg->msg_iter) ? 0 : -EFAULT;

        if (!csum_and_copy_from_iter_full(to, len, &partial, &msg->msg_iter))
                return -EFAULT;

        skb->csum = csum_block_add(skb->csum, partial, odd);
        return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);

/*
 *        Append @length bytes, fetched through @getfrag from @from, to the
 *        chain of skbs on @queue.  Data is first added to the tail skb; when
 *        that one is full a new skb is allocated and queued.  Every skb is
 *        laid out with room for an IP header of fragheaderlen bytes so it can
 *        later be sent as one IP fragment.
 *
 *        @cork holds the state shared between calls (accumulated length,
 *        fragment size, options, GSO size, timestamp flags).
 *        @pfrag is the page fragment used for paged (non-linear) copies.
 *        @transhdrlen is non-zero only for the first skb and is the number
 *        of bytes reserved for the transport header.
 *        @flags are the MSG_* flags consulted below (MSG_MORE, MSG_ZEROCOPY,
 *        MSG_SPLICE_PAGES, MSG_DONTWAIT, MSG_CONFIRM).
 *
 *        Returns 0 on success or a negative errno.  On error, the part of
 *        @length that was not appended is subtracted from cork->length again.
 */
static int __ip_append_data(struct sock *sk,
                            struct flowi4 *fl4,
                            struct sk_buff_head *queue,
                            struct inet_cork *cork,
                            struct page_frag *pfrag,
                            int getfrag(void *from, char *to, int offset,
                                        int len, int odd, struct sk_buff *skb),
                            void *from, int length, int transhdrlen,
                            unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ubuf_info *uarg = NULL;
        struct sk_buff *skb;
        struct ip_options *opt = cork->opt;
        int hh_len;
        int exthdrlen;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        bool zc = false;
        unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
        int csummode = CHECKSUM_NONE;
        struct rtable *rt = dst_rtable(cork->dst);
        bool paged, hold_tskey, extra_uref = false;
        unsigned int wmem_alloc_delta = 0;
        u32 tskey = 0;

        skb = skb_peek_tail(queue);

        /* Extension header space is only reserved in the very first skb. */
        exthdrlen = !skb ? rt->dst.header_len : 0;
        mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
        paged = !!cork->gso_size;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        /* maxfraglen: IP header plus the largest payload that is a multiple
         * of 8 bytes and still fits in mtu.
         */
        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
        maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu;

        if (cork->length + length > maxnonfragsize - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
                               mtu - (opt ? opt->optlen : 0));
                return -EMSGSIZE;
        }

        /*
         * transhdrlen > 0 means that this is the first fragment and we wish
         * it won't be fragmented in the future.
         */
        if (transhdrlen &&
            length + fragheaderlen <= mtu &&
            rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
            (!(flags & MSG_MORE) || cork->gso_size) &&
            (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
                csummode = CHECKSUM_PARTIAL;

        if ((flags & MSG_ZEROCOPY) && length) {
                struct msghdr *msg = from;

                /* Caller-supplied ubuf (msg_ubuf) versus socket-level
                 * SOCK_ZEROCOPY; true zerocopy (zc) additionally needs SG
                 * and CHECKSUM_PARTIAL.
                 */
                if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
                        if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
                                return -EINVAL;

                        /* Leave uarg NULL if can't zerocopy, callers should
                         * be able to handle it.
                         */
                        if ((rt->dst.dev->features & NETIF_F_SG) &&
                            csummode == CHECKSUM_PARTIAL) {
                                paged = true;
                                zc = true;
                                uarg = msg->msg_ubuf;
                        }
                } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
                        uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
                        if (!uarg)
                                return -ENOBUFS;
                        extra_uref = !skb_zcopy(skb);        /* only ref on new uarg */
                        if (rt->dst.dev->features & NETIF_F_SG &&
                            csummode == CHECKSUM_PARTIAL) {
                                paged = true;
                                zc = true;
                        } else {
                                uarg_to_msgzc(uarg)->zerocopy = 0;
                                skb_zcopy_set(skb, uarg, &extra_uref);
                        }
                }
        } else if ((flags & MSG_SPLICE_PAGES) && length) {
                if (inet_test_bit(HDRINCL, sk))
                        return -EPERM;
                if (rt->dst.dev->features & NETIF_F_SG &&
                    getfrag == ip_generic_getfrag)
                        /* We need an empty buffer to attach stuff to */
                        paged = true;
                else
                        flags &= ~MSG_SPLICE_PAGES;
        }

        cork->length += length;

        /* Reserve one timestamp key for this call; it is given back on error. */
        hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP &&
                     READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID;
        if (hold_tskey)
                tskey = atomic_inc_return(&sk->sk_tskey) - 1;

        /* So, what's going on in the loop below?
         *
         * We use calculated fragment length to generate chained skb,
         * each of segments is IP fragment ready for sending to network after
         * adding appropriate IP header.
         */

        /* Empty queue: enter the loop body directly at the allocation step. */
        if (!skb)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = mtu - skb->len;
                /* If it does not all fit, fill only up to maxfraglen. */
                if (copy < length)
                        copy = maxfraglen - skb->len;
                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen, alloc_extra;
                        unsigned int pagedlen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        /* fraggap: bytes the previous skb holds beyond
                         * maxfraglen; they are moved into the new skb below.
                         */
                        skb_prev = skb;
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > mtu - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;
                        fraglen = datalen + fragheaderlen;
                        pagedlen = 0;

                        alloc_extra = hh_len + 15;
                        alloc_extra += exthdrlen;

                        /* The last fragment gets additional space at tail.
                         * Note, with MSG_MORE we overallocate on fragments,
                         * because we have no idea what fragment will be
                         * the last.
                         */
                        if (datalen == length + fraggap)
                                alloc_extra += rt->dst.trailer_len;

                        /* Choose the linear size: a full mtu (MSG_MORE
                         * without SG), the whole fragment, or just the
                         * headers with the payload left for page frags.
                         */
                        if ((flags & MSG_MORE) &&
                            !(rt->dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else if (!paged &&
                                 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
                                  !(rt->dst.dev->features & NETIF_F_SG)))
                                alloclen = fraglen;
                        else {
                                alloclen = fragheaderlen + transhdrlen;
                                pagedlen = datalen - transhdrlen;
                        }

                        alloclen += alloc_extra;

                        /* The first skb is allocated through the socket
                         * helper; later ones use alloc_skb() as long as
                         * write memory stays within twice sk_sndbuf.
                         */
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk, alloclen,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
                                    2 * sk->sk_sndbuf)
                                        skb = alloc_skb(alloclen,
                                                        sk->sk_allocation);
                                if (unlikely(!skb))
                                        err = -ENOBUFS;
                        }
                        if (!skb)
                                goto error;

                        /*
                         *        Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *        Find where to start putting bytes.
                         */
                        data = skb_put(skb, fraglen + exthdrlen - pagedlen);
                        skb_set_network_header(skb, exthdrlen);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        data += fragheaderlen + exthdrlen;

                        /* Move the overhang from the previous skb into this
                         * one, fix up both checksums and trim the previous
                         * skb back to maxfraglen.
                         */
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        copy = datalen - transhdrlen - fraggap - pagedlen;
                        /* [!] NOTE: copy will be negative if pagedlen>0
                         * because then the equation reduces to -fraggap.
                         */
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        } else if (flags & MSG_SPLICE_PAGES) {
                                copy = 0;
                        }

                        /* Header-only state applies to the first skb only. */
                        offset += copy;
                        length -= copy + transhdrlen;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /* only the initial fragment is time stamped */
                        skb_shinfo(skb)->tx_flags = cork->tx_flags;
                        cork->tx_flags = 0;
                        skb_shinfo(skb)->tskey = tskey;
                        tskey = 0;
                        skb_zcopy_set(skb, uarg, &extra_uref);

                        if ((flags & MSG_CONFIRM) && !skb_prev)
                                skb_set_dst_pending_confirm(skb, 1);

                        /*
                         * Put the packet on the pending queue.
                         */
                        /* skbs without a destructor are charged to the
                         * socket here; the total is applied on return.
                         */
                        if (!skb->destructor) {
                                skb->destructor = sock_wfree;
                                skb->sk = sk;
                                wmem_alloc_delta += skb->truesize;
                        }
                        __skb_queue_tail(queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                /* Four ways to add data to the tail skb: linear copy,
                 * page splice, copy into a page fragment, or zerocopy.
                 */
                if (!(rt->dst.dev->features&NETIF_F_SG) &&
                    skb_tailroom(skb) >= copy) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                        offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else if (flags & MSG_SPLICE_PAGES) {
                        struct msghdr *msg = from;

                        err = -EIO;
                        if (WARN_ON_ONCE(copy > msg->msg_iter.count))
                                goto error;

                        err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
                                                   sk->sk_allocation);
                        if (err < 0)
                                goto error;
                        copy = err;
                        wmem_alloc_delta += copy;
                } else if (!zc) {
                        int i = skb_shinfo(skb)->nr_frags;

                        err = -ENOMEM;
                        if (!sk_page_frag_refill(sk, pfrag))
                                goto error;

                        skb_zcopy_downgrade_managed(skb);
                        /* Start a new, initially empty frag descriptor unless
                         * the data can be merged into the last one.
                         */
                        if (!skb_can_coalesce(skb, i, pfrag->page,
                                              pfrag->offset)) {
                                err = -EMSGSIZE;
                                if (i == MAX_SKB_FRAGS)
                                        goto error;

                                __skb_fill_page_desc(skb, i, pfrag->page,
                                                     pfrag->offset, 0);
                                skb_shinfo(skb)->nr_frags = ++i;
                                get_page(pfrag->page);
                        }
                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
                        if (getfrag(from,
                                    page_address(pfrag->page) + pfrag->offset,
                                    offset, copy, skb->len, skb) < 0)
                                goto error_efault;

                        pfrag->offset += copy;
                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
                        skb_len_add(skb, copy);
                        wmem_alloc_delta += copy;
                } else {
                        err = skb_zerocopy_iter_dgram(skb, from, copy);
                        if (err < 0)
                                goto error;
                }
                offset += copy;
                length -= copy;
        }

        /* Apply the write-memory charge accumulated above in one step. */
        if (wmem_alloc_delta)
                refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
        return 0;

error_efault:
        err = -EFAULT;
error:
        /* Data already queued stays queued: only the unappended remainder
         * is removed from cork->length, and memory charged so far is still
         * accounted.
         */
        net_zcopy_put_abort(uarg, extra_uref);
        cork->length -= length;
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
        refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
        if (hold_tskey)
                atomic_dec(&sk->sk_tskey);
        return err;
}

static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
                         struct ipcm_cookie *ipc, struct rtable **rtp)
{
        struct ip_options_rcu *opt;
        struct rtable *rt;

        rt = *rtp;
        if (unlikely(!rt))
                return -EFAULT;

        cork->fragsize = ip_sk_use_pmtu(sk) ?
                         dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);

        if (!inetdev_valid_mtu(cork->fragsize))
                return -ENETUNREACH;

        /*
         * setup for corking.
         */
        opt = ipc->opt;
        if (opt) {
                if (!cork->opt) {
                        cork->opt = kmalloc(sizeof(struct ip_options) + 40,
                                            sk->sk_allocation);
                        if (unlikely(!cork->opt))
                                return -ENOBUFS;
                }
                memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
                cork->flags |= IPCORK_OPT;
                cork->addr = ipc->addr;
        }

        cork->gso_size = ipc->gso_size;

        cork->dst = &rt->dst;
        /* We stole this route, caller should not release it. */
        *rtp = NULL;

        cork->length = 0;
        cork->ttl = ipc->ttl;
        cork->tos = ipc->tos;
        cork->mark = ipc->sockc.mark;
        cork->priority = ipc->priority;
        cork->transmit_time = ipc->sockc.transmit_time;
        cork->tx_flags = 0;
        sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags);

        return 0;
}

/*
 *        ip_append_data() can make one large IP datagram from many pieces of
 *        data.  Each piece will be held on the socket until
 *        ip_push_pending_frames() is called. Each piece can be a page or
 *        non-page data.
 *
 *        Not only UDP, other transport protocols - e.g. raw sockets - can use
 *        this interface potentially.
 *
 *        LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
                   struct ipcm_cookie *ipc, struct rtable **rtp,
                   unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        int err;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue)) {
                err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
                if (err)
                        return err;
        } else {
                transhdrlen = 0;
        }

        return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
                                sk_page_frag(sk), getfrag,
                                from, length, transhdrlen, flags);
}

/*
 *        Drop the resources held by @cork: the copied IP options and the
 *        route reference.  Both pointers are left NULL and IPCORK_OPT is
 *        cleared.
 */
static void ip_cork_release(struct inet_cork *cork)
{
        struct ip_options *old_opt = cork->opt;
        struct dst_entry *old_dst = cork->dst;

        cork->flags &= ~IPCORK_OPT;
        cork->opt = NULL;
        cork->dst = NULL;

        kfree(old_opt);
        dst_release(old_dst);
}

/*
 *        Combined all pending IP fragments on the socket as one IP datagram
 *        and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
                              struct flowi4 *fl4,
                              struct sk_buff_head *queue,
                              struct inet_cork *cork)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct inet_sock *inet = inet_sk(sk);
        struct net *net = sock_net(sk);
        struct ip_options *opt = NULL;
        struct rtable *rt = dst_rtable(cork->dst);
        struct iphdr *iph;
        u8 pmtudisc, ttl;
        __be16 df = 0;

        skb = __skb_dequeue(queue);
        if (!skb)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
         * to fragment the frame generated here. No matter, what transforms
         * how transforms change size of the packet, it will come out.
         */
        skb->ignore_df = ip_sk_ignore_df(sk);

        /* DF bit is set when we want to see DF on outgoing frames.
         * If ignore_df is set too, we still allow to fragment this frame
         * locally. */
        pmtudisc = READ_ONCE(inet->pmtudisc);
        if (pmtudisc == IP_PMTUDISC_DO ||
            pmtudisc == IP_PMTUDISC_PROBE ||
            (skb->len <= dst_mtu(&rt->dst) &&
             ip_dont_fragment(sk, &rt->dst)))
                df = htons(IP_DF);

        if (cork->flags & IPCORK_OPT)
                opt = cork->opt;

        if (cork->ttl != 0)
                ttl = cork->ttl;
        else if (rt->rt_type == RTN_MULTICAST)
                ttl = READ_ONCE(inet->mc_ttl);
        else
                ttl = ip_select_ttl(inet, &rt->dst);

        iph = ip_hdr(skb);
        iph->version = 4;
        iph->ihl = 5;
        iph->tos = (cork->tos != -1) ? cork->tos : READ_ONCE(inet->tos);
        iph->frag_off = df;
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
        ip_copy_addrs(iph, fl4);
        ip_select_ident(net, skb, sk);

        if (opt) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, cork->addr, rt);
        }

        skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority);
        skb->mark = cork->mark;
        if (sk_is_tcp(sk))
                skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC);
        else
                skb_set_delivery_type_by_clockid(skb, cork->transmit_time, sk->sk_clockid);
        /*
         * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
         * on dst refcount
         */
        cork->dst = NULL;
        skb_dst_set(skb, &rt->dst);

        if (iph->protocol == IPPROTO_ICMP) {
                u8 icmp_type;

                /* For such sockets, transhdrlen is zero when do ip_append_data(),
                 * so icmphdr does not in skb linear region and can not get icmp_type
                 * by icmp_hdr(skb)->type.
                 */
                if (sk->sk_type == SOCK_RAW &&
                    !(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH))
                        icmp_type = fl4->fl4_icmp_type;
                else
                        icmp_type = icmp_hdr(skb)->type;
                icmp_out_count(net, icmp_type);
        }

        ip_cork_release(cork);
out:
        return skb;
}

/*
 *        Hand @skb to ip_local_out().  A positive return code is translated
 *        with net_xmit_errno(); if an error remains after that, the
 *        OUTDISCARDS counter is bumped.  Returns 0 or the resulting error.
 */
int ip_send_skb(struct net *net, struct sk_buff *skb)
{
        int rc = ip_local_out(net, skb->sk, skb);

        if (!rc)
                return 0;

        if (rc > 0)
                rc = net_xmit_errno(rc);
        if (rc)
                IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);

        return rc;
}

/*
 *        Finish the datagram pending on @sk with ip_finish_skb() and send it.
 *        Returns 0 when nothing was pending, else the result of ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
        struct sk_buff *skb = ip_finish_skb(sk, fl4);

        /* Netfilter gets whole the not fragmented skb. */
        return skb ? ip_send_skb(sock_net(sk), skb) : 0;
}

/*
 *        Discard everything pending on @queue (skbs are removed from the
 *        tail and freed) and release @cork.
 */
static void __ip_flush_pending_frames(struct sock *sk,
                                      struct sk_buff_head *queue,
                                      struct inet_cork *cork)
{
        struct sk_buff *victim;

        for (;;) {
                victim = __skb_dequeue_tail(queue);
                if (victim == NULL)
                        break;
                kfree_skb(victim);
        }

        ip_cork_release(cork);
}

/* Discard the data pending on the write queue of @sk and release its cork. */
void ip_flush_pending_frames(struct sock *sk)
{
        struct inet_cork *cork = &inet_sk(sk)->cork.base;

        __ip_flush_pending_frames(sk, &sk->sk_write_queue, cork);
}

/*
 *        Build a complete datagram in one go, using a private queue and the
 *        caller-provided @cork instead of the socket write queue.
 *
 *        Returns the finished skb, NULL for MSG_PROBE requests, or an
 *        ERR_PTR() carrying the error from ip_setup_cork() or
 *        __ip_append_data().
 */
struct sk_buff *ip_make_skb(struct sock *sk,
                            struct flowi4 *fl4,
                            int getfrag(void *from, char *to, int offset,
                                        int len, int odd, struct sk_buff *skb),
                            void *from, int length, int transhdrlen,
                            struct ipcm_cookie *ipc, struct rtable **rtp,
                            struct inet_cork *cork, unsigned int flags)
{
        struct sk_buff_head queue;
        int err;

        if (flags & MSG_PROBE)
                return NULL;

        __skb_queue_head_init(&queue);

        /* Start from a clean cork: no flags, address or option buffer. */
        cork->flags = 0;
        cork->addr = 0;
        cork->opt = NULL;

        err = ip_setup_cork(sk, cork, ipc, rtp);
        if (err)
                return ERR_PTR(err);

        err = __ip_append_data(sk, fl4, &queue, cork,
                               &current->task_frag, getfrag,
                               from, length, transhdrlen, flags);
        if (!err)
                return __ip_make_skb(sk, fl4, &queue, cork);

        /* Appending failed: drop whatever was queued and release the cork. */
        __ip_flush_pending_frames(sk, &queue, cork);
        return ERR_PTR(err);
}

/*
 *        getfrag callback for replies: copy @len bytes from the kernel buffer
 *        @dptr (starting at @offset) to @to and fold their checksum into
 *        skb->csum.  Always returns 0.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
                              int len, int odd, struct sk_buff *skb)
{
        __wsum chunk_csum = csum_partial_copy_nocheck(dptr + offset, to, len);

        skb->csum = csum_block_add(skb->csum, chunk_csum, odd);

        return 0;
}

/*
 *        Generic function to send a packet as reply to another packet.
 *        Used to send some TCP resets/acks so far.
 *
 *        @sk:            socket the reply is queued on and pushed from; its tos,
 *                        sk_protocol, sk_bound_dev_if and sk_sndbuf are
 *                        overwritten here
 *        @skb:           the packet being answered (source of mark, iif,
 *                        protocol and the port pair used for the route lookup)
 *        @sopt:          IP options passed to __ip_options_echo()
 *        @daddr/@saddr:  destination and source address of the reply
 *        @arg:           reply description: payload iovec, checksum seed and
 *                        offset, tos, bound device, uid
 *        @len:           number of payload bytes taken from arg->iov
 *        @transmit_time: stored in the ipcm cookie; when non-zero the reply's
 *                        tstamp_type is set to SKB_CLOCK_MONOTONIC
 *        @txhash:        when non-zero, installed as the reply's L4 hash
 *
 *        Returns silently if echoing the options, the route lookup or the
 *        data append fails.
 */
void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
                           const struct ip_options *sopt,
                           __be32 daddr, __be32 saddr,
                           const struct ip_reply_arg *arg,
                           unsigned int len, u64 transmit_time, u32 txhash)
{
        struct ip_options_data replyopts;
        struct ipcm_cookie ipc;
        struct flowi4 fl4;
        struct rtable *rt = skb_rtable(skb);
        struct net *net = sock_net(sk);
        struct sk_buff *nskb;
        int err;
        int oif;

        if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
                return;

        ipcm_init(&ipc);
        /* ipc.addr keeps the original destination even if daddr is rewritten below. */
        ipc.addr = daddr;
        ipc.sockc.transmit_time = transmit_time;

        if (replyopts.opt.opt.optlen) {
                ipc.opt = &replyopts.opt;

                /* With a source route, the route lookup targets the option's faddr. */
                if (replyopts.opt.opt.srr)
                        daddr = replyopts.opt.opt.faddr;
        }

        /* Prefer the bound device; otherwise use the ingress ifindex if it is an L3 master. */
        oif = arg->bound_dev_if;
        if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
                oif = skb->skb_iif;

        flowi4_init_output(&fl4, oif,
                           IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
                           RT_TOS(arg->tos),
                           RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
                           ip_reply_arg_flowi_flags(arg),
                           daddr, saddr,
                           tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
                           arg->uid);
        security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
        rt = ip_route_output_flow(net, &fl4, sk);
        if (IS_ERR(rt))
                return;

        inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK;

        sk->sk_protocol = ip_hdr(skb)->protocol;
        sk->sk_bound_dev_if = arg->bound_dev_if;
        sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
        ipc.sockc.mark = fl4.flowi4_mark;
        err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
                             len, 0, &ipc, &rt, MSG_DONTWAIT);
        if (unlikely(err)) {
                ip_flush_pending_frames(sk);
                goto out;
        }

        nskb = skb_peek(&sk->sk_write_queue);
        if (nskb) {
                /*
                 * csumoffset counts 16-bit words from the transport header
                 * (the pointer is a __sum16 *); a negative value means no
                 * checksum field is written.
                 */
                if (arg->csumoffset >= 0)
                        *((__sum16 *)skb_transport_header(nskb) +
                          arg->csumoffset) = csum_fold(csum_add(nskb->csum,
                                                                arg->csum));
                nskb->ip_summed = CHECKSUM_NONE;
                if (transmit_time)
                        nskb->tstamp_type = SKB_CLOCK_MONOTONIC;
                if (txhash)
                        skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4);
                ip_push_pending_frames(sk, &fl4);
        }
out:
        /* Drop the route reference on both the success and the append-failure path. */
        ip_rt_put(rt);
}

/*
 *        Boot-time IP initialisation: routing first, then the inet peer
 *        storage, then IGMP when multicast support is configured.
 */
void __init ip_init(void)
{
        ip_rt_init();
        inet_initpeers();

#ifdef CONFIG_IP_MULTICAST
        igmp_mc_init();
#endif
}















































































































































































































































































































































































































































































































































































    2 

    2 
























































    1 
    2 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_VMSTAT_H
#define _LINUX_VMSTAT_H

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/mmzone.h>
#include <linux/vm_event_item.h>
#include <linux/atomic.h>
#include <linux/static_key.h>
#include <linux/mmdebug.h>

/* Defined elsewhere; presumably the vmstat refresh interval - confirm at the definition. */
extern int sysctl_stat_interval;

#ifdef CONFIG_NUMA
/* Values for sysctl_vm_numa_stat (names suggest on/off - confirm in the handler). */
#define ENABLE_NUMA_STAT   1
#define DISABLE_NUMA_STAT   0
extern int sysctl_vm_numa_stat;
/* Static key declared with a default of true. */
DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key);
/* sysctl handler for sysctl_vm_numa_stat; the parameter list follows the usual proc handler shape. */
int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
                void *buffer, size_t *length, loff_t *ppos);
#endif

/*
 * Bundle of unsigned counters describing reclaim activity.  The field
 * names suggest per-pass page counts (dirty, writeback, activated, ...);
 * the exact meaning of each is defined by the code that fills the
 * structure in, which is not part of this header.
 */
struct reclaim_stat {
        unsigned nr_dirty;
        unsigned nr_unqueued_dirty;
        unsigned nr_congested;
        unsigned nr_writeback;
        unsigned nr_immediate;
        unsigned nr_pageout;
        unsigned nr_activate[ANON_AND_FILE];    /* one slot per anon/file type */
        unsigned nr_ref_keep;
        unsigned nr_unmap_fail;
        unsigned nr_lazyfree_fail;
};

/*
 * Writeback threshold items.  writeback_stat_name() uses these values as
 * offsets into vmstat_text[] after the zone, NUMA-event and node items;
 * NR_VM_WRITEBACK_STAT_ITEMS is the item count and must stay last.
 */
enum writeback_stat_item {
        NR_DIRTY_THRESHOLD,
        NR_DIRTY_BG_THRESHOLD,
        NR_VM_WRITEBACK_STAT_ITEMS,
};

#ifdef CONFIG_VM_EVENT_COUNTERS
/*
 * Light weight per cpu counter implementation.
 *
 * Counters should only be incremented and no critical kernel component
 * should rely on the counter values.
 *
 * Counters are handled completely inline. On many platforms the code
 * generated will simply be the increment of a global address.
 */

/* One counter per vm_event_item; an instance exists per CPU (see below). */
struct vm_event_state {
        unsigned long event[NR_VM_EVENT_ITEMS];
};

/* Per-cpu storage updated by the count_vm_event*() helpers. */
DECLARE_PER_CPU(struct vm_event_state, vm_event_states);

/*
 * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the
 * local_irq_disable overhead.
 *
 * Increments this CPU's counter for @item by one using raw_cpu_inc().
 */
static inline void __count_vm_event(enum vm_event_item item)
{
        raw_cpu_inc(vm_event_states.event[item]);
}

/*
 * Increment this CPU's counter for @item by one.  Unlike
 * __count_vm_event() this uses the this_cpu_inc() flavour of the per-cpu
 * operation rather than the raw one.
 */
static inline void count_vm_event(enum vm_event_item item)
{
        this_cpu_inc(vm_event_states.event[item]);
}

/* Add @delta to this CPU's counter for @item using the raw per-cpu add. */
static inline void __count_vm_events(enum vm_event_item item, long delta)
{
        raw_cpu_add(vm_event_states.event[item], delta);
}

/* Add @delta to this CPU's counter for @item using this_cpu_add(). */
static inline void count_vm_events(enum vm_event_item item, long delta)
{
        this_cpu_add(vm_event_states.event[item], delta);
}

/*
 * Implemented outside this header.  all_vm_events() takes an output array
 * (presumably NR_VM_EVENT_ITEMS entries - confirm at the definition).
 */
extern void all_vm_events(unsigned long *);

/* Implemented outside this header; operates on the counters of @cpu. */
extern void vm_events_fold_cpu(int cpu);

#else

/*
 * Disable counters: with CONFIG_VM_EVENT_COUNTERS off every helper below
 * is an empty inline, so callers need no #ifdefs.  Note that
 * all_vm_events() leaves the caller's array untouched in this
 * configuration.
 */
static inline void count_vm_event(enum vm_event_item item)
{
}
static inline void count_vm_events(enum vm_event_item item, long delta)
{
}
static inline void __count_vm_event(enum vm_event_item item)
{
}
static inline void __count_vm_events(enum vm_event_item item, long delta)
{
}
static inline void all_vm_events(unsigned long *ret)
{
}
static inline void vm_events_fold_cpu(int cpu)
{
}

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * NUMA balancing events are only counted when CONFIG_NUMA_BALANCING is
 * set.  The disabled count_vm_numa_events() still evaluates @y (cast to
 * void) so the argument counts as used; @x is not evaluated at all.
 */
#ifdef CONFIG_NUMA_BALANCING
#define count_vm_numa_event(x)     count_vm_event(x)
#define count_vm_numa_events(x, y) count_vm_events(x, y)
#else
#define count_vm_numa_event(x) do {} while (0)
#define count_vm_numa_events(x, y) do { (void)(y); } while (0)
#endif /* CONFIG_NUMA_BALANCING */

/*
 * TLB flush events are only counted when CONFIG_DEBUG_TLBFLUSH is set.
 * As above, the disabled count_vm_tlb_events() still evaluates @y.
 */
#ifdef CONFIG_DEBUG_TLBFLUSH
#define count_vm_tlb_event(x)           count_vm_event(x)
#define count_vm_tlb_events(x, y)  count_vm_events(x, y)
#else
#define count_vm_tlb_event(x)     do {} while (0)
#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
#endif

/* Per-VMA lock events are only counted when CONFIG_PER_VMA_LOCK_STATS is set. */
#ifdef CONFIG_PER_VMA_LOCK_STATS
#define count_vm_vma_lock_event(x) count_vm_event(x)
#else
#define count_vm_vma_lock_event(x) do {} while (0)
#endif

/*
 * Count @delta events against the per-zone variant of event @item.
 *
 * The event index is computed relative to the <item>_NORMAL enumerator:
 * @zid is a zone index and ZONE_NORMAL is subtracted so that
 * zid == ZONE_NORMAL selects <item>_NORMAL itself.  @zid is parenthesized
 * so that an expression argument (e.g. "cond ? a : b") cannot re-associate
 * with the surrounding arithmetic.
 */
#define __count_zid_vm_events(item, zid, delta) \
        __count_vm_events(item##_NORMAL - ZONE_NORMAL + (zid), delta)

/*
 * Zone and node-based page accounting with per cpu differentials.
 *
 * These are the global totals; the helpers below update them together
 * with the matching per-zone / per-node counter.
 */
extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];
extern atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];

#ifdef CONFIG_NUMA
static inline void zone_numa_event_add(long x, struct zone *zone,
                                enum numa_stat_item item)
{
        atomic_long_add(x, &zone->vm_numa_event[item]);
        atomic_long_add(x, &vm_numa_event[item]);
}

static inline unsigned long zone_numa_event_state(struct zone *zone,
                                        enum numa_stat_item item)
{
        return atomic_long_read(&zone->vm_numa_event[item]);
}

static inline unsigned long
global_numa_event_state(enum numa_stat_item item)
{
        return atomic_long_read(&vm_numa_event[item]);
}
#endif /* CONFIG_NUMA */

static inline void zone_page_state_add(long x, struct zone *zone,
                                 enum zone_stat_item item)
{
        atomic_long_add(x, &zone->vm_stat[item]);
        atomic_long_add(x, &vm_zone_stat[item]);
}

static inline void node_page_state_add(long x, struct pglist_data *pgdat,
                                 enum node_stat_item item)
{
        atomic_long_add(x, &pgdat->vm_stat[item]);
        atomic_long_add(x, &vm_node_stat[item]);
}

/*
 * Global total of zone stat @item.
 *
 * On CONFIG_SMP a negative reading is reported as 0 (presumably a
 * transient caused by per-cpu differentials not folded in yet).
 */
static inline unsigned long global_zone_page_state(enum zone_stat_item item)
{
        long count = atomic_long_read(&vm_zone_stat[item]);

#ifdef CONFIG_SMP
        if (count < 0)
                return 0;
#endif
        return count;
}

/*
 * Global total of node stat @item as stored, without the "item is kept
 * in bytes" sanity check done by global_node_page_state().
 *
 * On CONFIG_SMP a negative reading is reported as 0.
 */
static inline
unsigned long global_node_page_state_pages(enum node_stat_item item)
{
        long count = atomic_long_read(&vm_node_stat[item]);

#ifdef CONFIG_SMP
        if (count < 0)
                return 0;
#endif
        return count;
}

/*
 * Global total of node stat @item.  Warns once if @item is one that
 * vmstat_item_in_bytes() reports as byte-based, then returns the same
 * value as global_node_page_state_pages().
 */
static inline unsigned long global_node_page_state(enum node_stat_item item)
{
        unsigned long pages;

        VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
        pages = global_node_page_state_pages(item);

        return pages;
}

/*
 * Value of zone stat @item as recorded in @zone itself.
 *
 * On CONFIG_SMP a negative reading is reported as 0.
 */
static inline unsigned long zone_page_state(struct zone *zone,
                                        enum zone_stat_item item)
{
        long count = atomic_long_read(&zone->vm_stat[item]);

#ifdef CONFIG_SMP
        if (count < 0)
                return 0;
#endif
        return count;
}

/*
 * More accurate variant of zone_page_state(): on SMP it also adds the
 * vm_stat_diff[] delta still pending on every online CPU.  The CPUs are
 * walked without any synchronization, so the result is still only an
 * approximation.  A negative sum is reported as 0.
 */
static inline unsigned long zone_page_state_snapshot(struct zone *zone,
                                        enum zone_stat_item item)
{
        long total = atomic_long_read(&zone->vm_stat[item]);

#ifdef CONFIG_SMP
        int cpu;

        for_each_online_cpu(cpu) {
                struct per_cpu_zonestat *pzstats =
                        per_cpu_ptr(zone->per_cpu_zonestats, cpu);

                total += pzstats->vm_stat_diff[item];
        }

        if (total < 0)
                total = 0;
#endif
        return total;
}

#ifdef CONFIG_NUMA
/*
 * See __count_vm_event comment on why raw_cpu_inc is used.
 *
 * Increments NUMA event @item in @zone's per-cpu zonestat for this CPU.
 */
static inline void
__count_numa_event(struct zone *zone, enum numa_stat_item item)
{
        struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;

        raw_cpu_inc(pzstats->vm_numa_event[item]);
}

/*
 * Add @delta to NUMA event @item in @zone's per-cpu zonestat for this
 * CPU, using the raw per-cpu add like __count_numa_event().
 */
static inline void
__count_numa_events(struct zone *zone, enum numa_stat_item item, long delta)
{
        struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;

        raw_cpu_add(pzstats->vm_numa_event[item], delta);
}

/*
 * NUMA-only accessors implemented outside this header.  The non-NUMA
 * build replaces the first three with macros that fall back to the
 * global counters.
 */
extern unsigned long sum_zone_node_page_state(int node,
                                              enum zone_stat_item item);
extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item);
extern unsigned long node_page_state(struct pglist_data *pgdat,
                                                enum node_stat_item item);
extern unsigned long node_page_state_pages(struct pglist_data *pgdat,
                                           enum node_stat_item item);
extern void fold_vm_numa_events(void);
#else
/*
 * Without CONFIG_NUMA the per-node accessors ignore their node argument
 * and read the global counters; fold_vm_numa_events() has nothing to do.
 */
#define sum_zone_node_page_state(node, item) global_zone_page_state(item)
#define node_page_state(node, item) global_node_page_state(item)
#define node_page_state_pages(node, item) global_node_page_state_pages(item)
static inline void fold_vm_numa_events(void)
{
}
#endif /* CONFIG_NUMA */

#ifdef CONFIG_SMP
/*
 * SMP: the counter update primitives are out-of-line functions.  Each
 * operation comes in a double-underscore and a plain flavour; the non-SMP
 * branch below provides inline equivalents under the same names.
 */
void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long);
void __inc_zone_page_state(struct page *, enum zone_stat_item);
void __dec_zone_page_state(struct page *, enum zone_stat_item);

void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long);
void __inc_node_page_state(struct page *, enum node_stat_item);
void __dec_node_page_state(struct page *, enum node_stat_item);

void mod_zone_page_state(struct zone *, enum zone_stat_item, long);
void inc_zone_page_state(struct page *, enum zone_stat_item);
void dec_zone_page_state(struct page *, enum zone_stat_item);

void mod_node_page_state(struct pglist_data *, enum node_stat_item, long);
void inc_node_page_state(struct page *, enum node_stat_item);
void dec_node_page_state(struct page *, enum node_stat_item);

extern void inc_node_state(struct pglist_data *, enum node_stat_item);
extern void __inc_zone_state(struct zone *, enum zone_stat_item);
extern void __inc_node_state(struct pglist_data *, enum node_stat_item);
extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_node_state(struct pglist_data *, enum node_stat_item);

/* Maintenance entry points; all are empty inlines in the non-SMP branch. */
void quiet_vmstat(void);
void cpu_vm_stats_fold(int cpu);
void refresh_zone_stat_thresholds(void);

/* Forward declaration so the sysctl handler prototype needs no extra include. */
struct ctl_table;
int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp,
                loff_t *ppos);

void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *);

/* Per-cpu threshold calculation; set_pgdat_percpu_threshold() takes one of the calculators as callback. */
int calculate_pressure_threshold(struct zone *zone);
int calculate_normal_threshold(struct zone *zone);
void set_pgdat_percpu_threshold(pg_data_t *pgdat,
                                int (*calculate_pressure)(struct zone *));
#else /* CONFIG_SMP */

/*
 * We do not maintain differentials in a single processor configuration.
 * The functions directly modify the zone and global counters.
 */
/* UP: apply @delta for zone stat @item directly to the zone and global counters. */
static inline void __mod_zone_page_state(struct zone *zone,
                        enum zone_stat_item item, long delta)
{
        zone_page_state_add(delta, zone, item);
}

/*
 * UP: apply @delta for node stat @item directly to the node and global
 * counters.
 *
 * @delta is a long, matching the CONFIG_SMP prototype of this function
 * and the UP __mod_zone_page_state(); the folio helpers pass long page
 * counts, which an int parameter would silently truncate.
 *
 * For items that vmstat_item_in_bytes() reports as byte-based, @delta is
 * given in bytes and converted to pages before it is stored.
 */
static inline void __mod_node_page_state(struct pglist_data *pgdat,
                        enum node_stat_item item, long delta)
{
        if (vmstat_item_in_bytes(item)) {
                /*
                 * Only cgroups use subpage accounting right now; at
                 * the global level, these items still change in
                 * multiples of whole pages. Store them as pages
                 * internally to keep the per-cpu counters compact.
                 */
                VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
                delta >>= PAGE_SHIFT;
        }

        node_page_state_add(delta, pgdat, item);
}

static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
        atomic_long_inc(&zone->vm_stat[item]);
        atomic_long_inc(&vm_zone_stat[item]);
}

static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
        atomic_long_inc(&pgdat->vm_stat[item]);
        atomic_long_inc(&vm_node_stat[item]);
}

static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
        atomic_long_dec(&zone->vm_stat[item]);
        atomic_long_dec(&vm_zone_stat[item]);
}

static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
        atomic_long_dec(&pgdat->vm_stat[item]);
        atomic_long_dec(&vm_node_stat[item]);
}

/* UP: bump zone stat @item for the zone that @page belongs to. */
static inline void __inc_zone_page_state(struct page *page,
                        enum zone_stat_item item)
{
        struct zone *zone = page_zone(page);

        __inc_zone_state(zone, item);
}

/* UP: bump node stat @item for the node that @page belongs to. */
static inline void __inc_node_page_state(struct page *page,
                        enum node_stat_item item)
{
        struct pglist_data *pgdat = page_pgdat(page);

        __inc_node_state(pgdat, item);
}


/* UP: drop zone stat @item for the zone that @page belongs to. */
static inline void __dec_zone_page_state(struct page *page,
                        enum zone_stat_item item)
{
        struct zone *zone = page_zone(page);

        __dec_zone_state(zone, item);
}

/* UP: drop node stat @item for the node that @page belongs to. */
static inline void __dec_node_page_state(struct page *page,
                        enum node_stat_item item)
{
        struct pglist_data *pgdat = page_pgdat(page);

        __dec_node_state(pgdat, item);
}


/*
 * We only use atomic operations to update counters. So there is no need to
 * disable interrupts.
 *
 * On UP the plain names are therefore simple aliases of the
 * double-underscore inlines defined above.
 */
#define inc_zone_page_state __inc_zone_page_state
#define dec_zone_page_state __dec_zone_page_state
#define mod_zone_page_state __mod_zone_page_state

#define inc_node_page_state __inc_node_page_state
#define dec_node_page_state __dec_node_page_state
#define mod_node_page_state __mod_node_page_state

#define inc_zone_state __inc_zone_state
#define inc_node_state __inc_node_state
#define dec_zone_state __dec_zone_state

/* Expands to an empty block; neither argument is evaluated on UP. */
#define set_pgdat_percpu_threshold(pgdat, callback) { }

/* UP: empty stand-ins for the SMP maintenance entry points declared above. */
static inline void refresh_zone_stat_thresholds(void) { }
static inline void cpu_vm_stats_fold(int cpu) { }
static inline void quiet_vmstat(void) { }

static inline void drain_zonestat(struct zone *zone,
                        struct per_cpu_zonestat *pzstats) { }
#endif                /* CONFIG_SMP */

/* Adjust zone stat @item of @folio's zone by @nr via __mod_zone_page_state(). */
static inline void __zone_stat_mod_folio(struct folio *folio,
                enum zone_stat_item item, long nr)
{
        struct zone *zone = folio_zone(folio);

        __mod_zone_page_state(zone, item, nr);
}

/* Add @folio's page count to zone stat @item via __mod_zone_page_state(). */
static inline void __zone_stat_add_folio(struct folio *folio,
                enum zone_stat_item item)
{
        struct zone *zone = folio_zone(folio);
        long nr = folio_nr_pages(folio);

        __mod_zone_page_state(zone, item, nr);
}

/* Subtract @folio's page count from zone stat @item via __mod_zone_page_state(). */
static inline void __zone_stat_sub_folio(struct folio *folio,
                enum zone_stat_item item)
{
        struct zone *zone = folio_zone(folio);
        long nr = folio_nr_pages(folio);

        __mod_zone_page_state(zone, item, -nr);
}

/* Adjust zone stat @item of @folio's zone by @nr via mod_zone_page_state(). */
static inline void zone_stat_mod_folio(struct folio *folio,
                enum zone_stat_item item, long nr)
{
        struct zone *zone = folio_zone(folio);

        mod_zone_page_state(zone, item, nr);
}

/* Add @folio's page count to zone stat @item via mod_zone_page_state(). */
static inline void zone_stat_add_folio(struct folio *folio,
                enum zone_stat_item item)
{
        struct zone *zone = folio_zone(folio);
        long nr = folio_nr_pages(folio);

        mod_zone_page_state(zone, item, nr);
}

/* Subtract @folio's page count from zone stat @item via mod_zone_page_state(). */
static inline void zone_stat_sub_folio(struct folio *folio,
                enum zone_stat_item item)
{
        struct zone *zone = folio_zone(folio);
        long nr = folio_nr_pages(folio);

        mod_zone_page_state(zone, item, -nr);
}

/* Adjust node stat @item of @folio's node by @nr via __mod_node_page_state(). */
static inline void __node_stat_mod_folio(struct folio *folio,
                enum node_stat_item item, long nr)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        __mod_node_page_state(pgdat, item, nr);
}

/* Add @folio's page count to node stat @item via __mod_node_page_state(). */
static inline void __node_stat_add_folio(struct folio *folio,
                enum node_stat_item item)
{
        struct pglist_data *pgdat = folio_pgdat(folio);
        long nr = folio_nr_pages(folio);

        __mod_node_page_state(pgdat, item, nr);
}

/* Subtract @folio's page count from node stat @item via __mod_node_page_state(). */
static inline void __node_stat_sub_folio(struct folio *folio,
                enum node_stat_item item)
{
        struct pglist_data *pgdat = folio_pgdat(folio);
        long nr = folio_nr_pages(folio);

        __mod_node_page_state(pgdat, item, -nr);
}

/* Adjust node stat @item of @folio's node by @nr via mod_node_page_state(). */
static inline void node_stat_mod_folio(struct folio *folio,
                enum node_stat_item item, long nr)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        mod_node_page_state(pgdat, item, nr);
}

/* Add @folio's page count to node stat @item via mod_node_page_state(). */
static inline void node_stat_add_folio(struct folio *folio,
                enum node_stat_item item)
{
        struct pglist_data *pgdat = folio_pgdat(folio);
        long nr = folio_nr_pages(folio);

        mod_node_page_state(pgdat, item, nr);
}

/* Subtract @folio's page count from node stat @item via mod_node_page_state(). */
static inline void node_stat_sub_folio(struct folio *folio,
                enum node_stat_item item)
{
        struct pglist_data *pgdat = folio_pgdat(folio);
        long nr = folio_nr_pages(folio);

        mod_node_page_state(pgdat, item, -nr);
}

/*
 * Table of stat names, defined elsewhere.  As the accessors below show,
 * it is laid out as consecutive sections: zone items, NUMA event items,
 * node items, writeback items, then vm event items.
 */
extern const char * const vmstat_text[];

/* Name of zone stat @item; zone items occupy the start of vmstat_text[]. */
static inline const char *zone_stat_name(enum zone_stat_item item)
{
        return vmstat_text[item];
}

#ifdef CONFIG_NUMA
/* Name of NUMA event @item; the NUMA section follows the zone items. */
static inline const char *numa_stat_name(enum numa_stat_item item)
{
        return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
                           item];
}
#endif /* CONFIG_NUMA */

/* Name of node stat @item; the node section follows the zone and NUMA event items. */
static inline const char *node_stat_name(enum node_stat_item item)
{
        return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
                           NR_VM_NUMA_EVENT_ITEMS +
                           item];
}

/*
 * Name of LRU list @lru: the name of node stat NR_LRU_BASE + @lru with
 * its first three characters (the "nr_" prefix) skipped.  The returned
 * pointer points into the vmstat_text[] string itself.
 */
static inline const char *lru_list_name(enum lru_list lru)
{
        return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
}

/* Name of writeback stat @item; the writeback section follows the zone, NUMA event and node items. */
static inline const char *writeback_stat_name(enum writeback_stat_item item)
{
        return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
                           NR_VM_NUMA_EVENT_ITEMS +
                           NR_VM_NODE_STAT_ITEMS +
                           item];
}

#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
/* Name of vm event @item; the event section comes last, after the writeback items. */
static inline const char *vm_event_name(enum vm_event_item item)
{
        return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
                           NR_VM_NUMA_EVENT_ITEMS +
                           NR_VM_NODE_STAT_ITEMS +
                           NR_VM_WRITEBACK_STAT_ITEMS +
                           item];
}
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */

#ifdef CONFIG_MEMCG

/* MEMCG: implemented outside this header. */
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
                        int val);

/*
 * Same update as __mod_lruvec_state(), performed with local interrupts
 * disabled for the duration of the call.
 */
static inline void mod_lruvec_state(struct lruvec *lruvec,
                                    enum node_stat_item idx, int val)
{
        unsigned long flags;

        local_irq_save(flags);
        __mod_lruvec_state(lruvec, idx, val);
        local_irq_restore(flags);
}

/* MEMCG: implemented outside this header. */
void __lruvec_stat_mod_folio(struct folio *folio,
                             enum node_stat_item idx, int val);

/*
 * Same update as __lruvec_stat_mod_folio(), performed with local
 * interrupts disabled for the duration of the call.
 */
static inline void lruvec_stat_mod_folio(struct folio *folio,
                                         enum node_stat_item idx, int val)
{
        unsigned long flags;

        local_irq_save(flags);
        __lruvec_stat_mod_folio(folio, idx, val);
        local_irq_restore(flags);
}

/* Page flavour of lruvec_stat_mod_folio(): operates on the folio that @page belongs to. */
static inline void mod_lruvec_page_state(struct page *page,
                                         enum node_stat_item idx, int val)
{
        struct folio *folio = page_folio(page);

        lruvec_stat_mod_folio(folio, idx, val);
}

#else

/* !MEMCG: lruvec stats are plain node stats of the lruvec's node. */
static inline void __mod_lruvec_state(struct lruvec *lruvec,
                                      enum node_stat_item idx, int val)
{
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);

        __mod_node_page_state(pgdat, idx, val);
}

/* !MEMCG: forwards to mod_node_page_state() for the lruvec's node. */
static inline void mod_lruvec_state(struct lruvec *lruvec,
                                    enum node_stat_item idx, int val)
{
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);

        mod_node_page_state(pgdat, idx, val);
}

/* !MEMCG: forwards to __mod_node_page_state() for the folio's node. */
static inline void __lruvec_stat_mod_folio(struct folio *folio,
                                         enum node_stat_item idx, int val)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        __mod_node_page_state(pgdat, idx, val);
}

/* !MEMCG: forwards to mod_node_page_state() for the folio's node. */
static inline void lruvec_stat_mod_folio(struct folio *folio,
                                         enum node_stat_item idx, int val)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        mod_node_page_state(pgdat, idx, val);
}

/* !MEMCG: forwards to mod_node_page_state() for the page's node. */
static inline void mod_lruvec_page_state(struct page *page,
                                         enum node_stat_item idx, int val)
{
        struct pglist_data *pgdat = page_pgdat(page);

        mod_node_page_state(pgdat, idx, val);
}

#endif /* CONFIG_MEMCG */

/* Add @folio's page count to lruvec stat @idx via __lruvec_stat_mod_folio(). */
static inline void __lruvec_stat_add_folio(struct folio *folio,
                                           enum node_stat_item idx)
{
        long nr = folio_nr_pages(folio);

        __lruvec_stat_mod_folio(folio, idx, nr);
}

/* Subtract @folio's page count from lruvec stat @idx via __lruvec_stat_mod_folio(). */
static inline void __lruvec_stat_sub_folio(struct folio *folio,
                                           enum node_stat_item idx)
{
        long nr = folio_nr_pages(folio);

        __lruvec_stat_mod_folio(folio, idx, -nr);
}

/* Add @folio's page count to lruvec stat @idx via lruvec_stat_mod_folio(). */
static inline void lruvec_stat_add_folio(struct folio *folio,
                                         enum node_stat_item idx)
{
        long nr = folio_nr_pages(folio);

        lruvec_stat_mod_folio(folio, idx, nr);
}

/* Subtract @folio's page count from lruvec stat @idx via lruvec_stat_mod_folio(). */
static inline void lruvec_stat_sub_folio(struct folio *folio,
                                         enum node_stat_item idx)
{
        long nr = folio_nr_pages(folio);

        lruvec_stat_mod_folio(folio, idx, -nr);
}
#endif /* _LINUX_VMSTAT_H */

























































































































































    1 








    1 












































    1 



    1 


























    1 
























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Connection tracking protocol helper module for GRE.
 *
 * GRE is a generic encapsulation protocol, which is generally not very
 * suited for NAT, as it has no protocol-specific part as port numbers.
 *
 * It has an optional key field, which may help us distinguishing two
 * connections between the same two hosts.
 *
 * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
 *
 * PPTP is built on top of a modified version of GRE, and has a mandatory
 * field called "CallID", which serves us for the same purpose as the key
 * field in plain GRE.
 *
 * Documentation about PPTP can be found in RFC 2637
 *
 * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
 *
 * Development of this code funded by Astaro AG (http://www.astaro.com/)
 *
 * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/list.h>
#include <linux/seq_file.h>
#include <linux/in.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <linux/netfilter/nf_conntrack_proto_gre.h>
#include <linux/netfilter/nf_conntrack_pptp.h>

/* Timeouts in jiffies, indexed by GRE_CT_*: 30 s while unreplied, 180 s once replied. */
static const unsigned int gre_timeouts[GRE_CT_MAX] = {
        [GRE_CT_UNREPLIED]        = 30*HZ,
        [GRE_CT_REPLIED]        = 180*HZ,
};

/*
 * Taken (BH-disabling) around every modification of a per-netns
 * keymap_list: insertion in nf_ct_gre_keymap_add() and removal in
 * nf_ct_gre_keymap_destroy().  Lookups walk the list with the RCU
 * iterator and do not take it.
 */
static DEFINE_SPINLOCK(keymap_lock);

/* GRE conntrack state of @net; it is embedded in the netns, so this never returns NULL. */
static inline struct nf_gre_net *gre_pernet(struct net *net)
{
        return &net->ct.nf_ct_proto.gre;
}

/*
 * Return non-zero when keymap entry @km describes the same flow as @t:
 * same l3 protocol number, source and destination address, destination
 * protocol number and destination key.  The source key is deliberately
 * not compared - it is what a lookup is trying to find.
 */
static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
                                const struct nf_conntrack_tuple *t)
{
        const struct nf_conntrack_tuple *mine = &km->tuple;

        if (mine->src.l3num != t->src.l3num)
                return 0;
        if (memcmp(&mine->src.u3, &t->src.u3, sizeof(t->src.u3)))
                return 0;
        if (memcmp(&mine->dst.u3, &t->dst.u3, sizeof(t->dst.u3)))
                return 0;
        if (mine->dst.protonum != t->dst.protonum)
                return 0;

        return mine->dst.u.all == t->dst.u.all;
}

/* look up the source key for a given tuple */
static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t)
{
        struct nf_gre_net *net_gre = gre_pernet(net);
        struct nf_ct_gre_keymap *km;
        __be16 key = 0;

        list_for_each_entry_rcu(km, &net_gre->keymap_list, list) {
                if (gre_key_cmpfn(km, t)) {
                        key = km->tuple.src.u.gre.key;
                        break;
                }
        }

        pr_debug("lookup src key 0x%x for ", key);
        nf_ct_dump_tuple(t);

        return key;
}

/*
 * Add a single keymap entry for direction @dir and associate it with the
 * master conntrack @ct.
 *
 * Returns 0 on success, or when @ct already owns an entry for @dir that
 * matches @t (treated as a retransmission); -EEXIST when @ct already owns
 * a different entry for @dir; -ENOMEM when the allocation fails.
 */
int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir,
                         struct nf_conntrack_tuple *t)
{
        struct net *net = nf_ct_net(ct);
        struct nf_gre_net *net_gre = gre_pernet(net);
        struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
        struct nf_ct_gre_keymap **kmp, *km;

        kmp = &ct_pptp_info->keymap[dir];
        if (*kmp) {
                /* check whether it's a retransmission */
                list_for_each_entry_rcu(km, &net_gre->keymap_list, list) {
                        if (gre_key_cmpfn(km, t) && km == *kmp)
                                return 0;
                }
                pr_debug("trying to override keymap_%s for ct %p\n",
                         dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct);
                return -EEXIST;
        }

        /* GFP_ATOMIC: this allocation must not sleep. */
        km = kmalloc(sizeof(*km), GFP_ATOMIC);
        if (!km)
                return -ENOMEM;
        memcpy(&km->tuple, t, sizeof(*t));
        *kmp = km;

        pr_debug("adding new entry %p: ", km);
        nf_ct_dump_tuple(&km->tuple);

        /*
         * Readers walk keymap_list with list_for_each_entry_rcu() and no
         * lock, so the entry has to be published with the RCU list
         * primitive: it orders the initialisation of *km before the
         * pointer that makes it reachable.
         */
        spin_lock_bh(&keymap_lock);
        list_add_tail_rcu(&km->list, &net_gre->keymap_list);
        spin_unlock_bh(&keymap_lock);

        return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add);

/* destroy the keymap entries associated with specified master ct */
void nf_ct_gre_keymap_destroy(struct nf_conn *ct)
{
        struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
        enum ip_conntrack_dir dir;

        pr_debug("entering for ct %p\n", ct);

        spin_lock_bh(&keymap_lock);
        for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) {
                if (ct_pptp_info->keymap[dir]) {
                        pr_debug("removing %p from list\n",
                                 ct_pptp_info->keymap[dir]);
                        list_del_rcu(&ct_pptp_info->keymap[dir]->list);
                        kfree_rcu(ct_pptp_info->keymap[dir], rcu);
                        ct_pptp_info->keymap[dir] = NULL;
                }
        }
        spin_unlock_bh(&keymap_lock);
}
EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy);

/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */

/* gre hdr info to tuple */
bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
                      struct net *net, struct nf_conntrack_tuple *tuple)
{
        const struct pptp_gre_header *pgrehdr;
        struct pptp_gre_header _pgrehdr;
        __be16 srckey;
        const struct gre_base_hdr *grehdr;
        struct gre_base_hdr _grehdr;

        /* first only delinearize old RFC1701 GRE header */
        grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr);
        if (!grehdr || (grehdr->flags & GRE_VERSION) != GRE_VERSION_1) {
                /* try to behave like "nf_conntrack_proto_generic" */
                tuple->src.u.all = 0;
                tuple->dst.u.all = 0;
                return true;
        }

        /* PPTP header is variable length, only need up to the call_id field */
        pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr);
        if (!pgrehdr)
                return true;

        if (grehdr->protocol != GRE_PROTO_PPP) {
                pr_debug("Unsupported GRE proto(0x%x)\n", ntohs(grehdr->protocol));
                return false;
        }

        tuple->dst.u.gre.key = pgrehdr->call_id;
        srckey = gre_keymap_lookup(net, tuple);
        tuple->src.u.gre.key = srckey;

        return true;
}

#ifdef CONFIG_NF_CONNTRACK_PROCFS
/*
 * Print the GRE-private conntrack data of @ct into seq_file @s.
 *
 * Emits the per-connection "timeout" and "stream_timeout" values, each
 * divided by HZ (presumably converting jiffies to seconds).
 */
static void gre_print_conntrack(struct seq_file *s, struct nf_conn *ct)
{
        seq_printf(s, "timeout=%u, stream_timeout=%u ",
                   (ct->proto.gre.timeout / HZ),
                   (ct->proto.gre.stream_timeout / HZ));
}
#endif

/* Return the GRE timeout array stored in the per-netns GRE data of @net. */
static unsigned int *gre_get_timeouts(struct net *net)
{
        return gre_pernet(net)->timeouts;
}

/*
 * Per-packet GRE conntrack handler.
 *
 * Seeds the per-connection timeouts on the first (unconfirmed) packet,
 * refreshes the conntrack timer and, once a reply has been seen, marks the
 * connection ASSURED (except for NAT-clash entries).  Always returns
 * NF_ACCEPT.
 */
int nf_conntrack_gre_packet(struct nf_conn *ct,
                            struct sk_buff *skb,
                            unsigned int dataoff,
                            enum ip_conntrack_info ctinfo,
                            const struct nf_hook_state *state)
{
        unsigned long status;

        if (!nf_ct_is_confirmed(ct)) {
                unsigned int *timeouts;

                /* Prefer a timeout policy attached to ct, else netns defaults. */
                timeouts = nf_ct_timeout_lookup(ct);
                if (!timeouts)
                        timeouts = gre_get_timeouts(nf_ct_net(ct));

                /* Sane starting values; a helper (e.g. pptp) may raise them. */
                ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED];
                ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED];
        }

        status = READ_ONCE(ct->status);

        /* One-way traffic only so far: use the short timeout and stop. */
        if (!(status & IPS_SEEN_REPLY)) {
                nf_ct_refresh_acct(ct, ctinfo, skb,
                                   ct->proto.gre.timeout);
                return NF_ACCEPT;
        }

        /* Traffic seen in both directions: this is a GRE connection. */
        nf_ct_refresh_acct(ct, ctinfo, skb,
                           ct->proto.gre.stream_timeout);

        /* IPS_NAT_CLASH entries time out soon, never mark them ASSURED. */
        if (unlikely((status & IPS_NAT_CLASH)))
                return NF_ACCEPT;

        /* Report the transition to ASSURED only the first time. */
        if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
                nf_conntrack_event_cache(IPCT_ASSURED, ct);

        return NF_ACCEPT;
}

#ifdef CONFIG_NF_CONNTRACK_TIMEOUT

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>

/*
 * Translate cttimeout netlink attributes @tb into a GRE timeout array.
 *
 * @data points at the destination array; when it is NULL the per-netns
 * array of @net is updated instead.  Each slot is first loaded from the
 * per-netns value and then overridden by the matching attribute, if
 * present (attribute value multiplied by HZ).  Always returns 0.
 */
static int gre_timeout_nlattr_to_obj(struct nlattr *tb[],
                                     struct net *net, void *data)
{
        struct nf_gre_net *gn = gre_pernet(net);
        unsigned int *dst = data;

        if (!dst)
                dst = gre_get_timeouts(net);

        /* Start from the GRE defaults of this netns. */
        dst[GRE_CT_UNREPLIED] = gn->timeouts[GRE_CT_UNREPLIED];
        dst[GRE_CT_REPLIED] = gn->timeouts[GRE_CT_REPLIED];

        if (tb[CTA_TIMEOUT_GRE_UNREPLIED])
                dst[GRE_CT_UNREPLIED] =
                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_UNREPLIED])) * HZ;

        if (tb[CTA_TIMEOUT_GRE_REPLIED])
                dst[GRE_CT_REPLIED] =
                        ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_REPLIED])) * HZ;

        return 0;
}

/*
 * Dump a GRE timeout array (@data) into @skb as cttimeout attributes.
 *
 * Values are divided by HZ and emitted in network byte order.
 * Returns 0 on success, -ENOSPC if either attribute could not be added.
 */
static int
gre_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
{
        const unsigned int *timeouts = data;

        if (nla_put_be32(skb, CTA_TIMEOUT_GRE_UNREPLIED,
                         htonl(timeouts[GRE_CT_UNREPLIED] / HZ)))
                return -ENOSPC;

        if (nla_put_be32(skb, CTA_TIMEOUT_GRE_REPLIED,
                         htonl(timeouts[GRE_CT_REPLIED] / HZ)))
                return -ENOSPC;

        return 0;
}

/* Netlink policy for the GRE cttimeout attributes: both are 32-bit values. */
static const struct nla_policy
gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX + 1] = {
        [CTA_TIMEOUT_GRE_UNREPLIED] = { .type = NLA_U32 },
        [CTA_TIMEOUT_GRE_REPLIED]   = { .type = NLA_U32 },
};
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */

/*
 * Initialise the per-netns GRE conntrack state of @net: an empty keymap
 * list and a copy of the global gre_timeouts[] defaults.
 */
void nf_conntrack_gre_init_net(struct net *net)
{
        struct nf_gre_net *gn = gre_pernet(net);
        int idx;

        INIT_LIST_HEAD(&gn->keymap_list);

        for (idx = 0; idx < GRE_CT_MAX; idx++)
                gn->timeouts[idx] = gre_timeouts[idx];
}

/*
 * L4 protocol descriptor for GRE.
 *
 * Optional members are only populated when the corresponding kernel
 * feature (procfs output, ctnetlink, cttimeout) is configured.
 */
const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre = {
        .l4proto = IPPROTO_GRE,
        .allow_clash = true,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
        .print_conntrack = gre_print_conntrack,
#endif
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
        /* The tuple is (de)serialised with the generic port helpers. */
        .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
        .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
        .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
        .nla_policy = nf_ct_port_nla_policy,
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
        .ctnl_timeout = {
                .nlattr_to_obj = gre_timeout_nlattr_to_obj,
                .obj_to_nlattr = gre_timeout_obj_to_nlattr,
                .nlattr_max = CTA_TIMEOUT_GRE_MAX,
                .obj_size = sizeof(unsigned int) * GRE_CT_MAX,
                .nla_policy = gre_timeout_nla_policy,
        },
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
};










   14 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_KERNEL_FPU_INTERNAL_H
#define __X86_KERNEL_FPU_INTERNAL_H

extern struct fpstate init_fpstate;

/* CPU feature check wrappers */
/* Returns true when cpu_feature_enabled() reports X86_FEATURE_XSAVE. */
static __always_inline __pure bool use_xsave(void)
{
        return cpu_feature_enabled(X86_FEATURE_XSAVE);
}

/* Returns true when cpu_feature_enabled() reports X86_FEATURE_FXSR. */
static __always_inline __pure bool use_fxsr(void)
{
        return cpu_feature_enabled(X86_FEATURE_FXSR);
}

/*
 * FPU debugging assertion.
 *
 * With CONFIG_X86_DEBUG_FPU this is WARN_ON_ONCE(x).  Without it the
 * condition is still evaluated (cast to void) but the expression always
 * yields 0, so it can be used in the same contexts either way.
 */
#ifdef CONFIG_X86_DEBUG_FPU
# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
#else
# define WARN_ON_FPU(x) ({ (void)(x); 0; })
#endif

/* Used in init.c */
extern void fpstate_init_user(struct fpstate *fpstate);
extern void fpstate_reset(struct fpu *fpu);

#endif






















































    1 





    1 




























































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/proc/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/cache.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/kernel.h>
#include <linux/pid_namespace.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/completion.h>
#include <linux/poll.h>
#include <linux/printk.h>
#include <linux/file.h>
#include <linux/limits.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/mount.h>
#include <linux/bug.h>

#include "internal.h"

/*
 * ->evict_inode for procfs: drop the page cache of @inode, clear it, and
 * detach it from the pid and sysctl tracking it may be part of.
 */
static void proc_evict_inode(struct inode *inode)
{
        struct proc_inode *pi = PROC_I(inode);
        struct ctl_table_header *sysctl_head;

        truncate_inode_pages_final(&inode->i_data);
        clear_inode(inode);

        /* Stop tracking associated processes */
        if (pi->pid)
                proc_pid_evict_inode(pi);

        sysctl_head = pi->sysctl;
        if (!sysctl_head)
                return;

        /* Clear the back pointer before handing the header to proc_sys. */
        RCU_INIT_POINTER(pi->sysctl, NULL);
        proc_sys_evict_inode(inode, sysctl_head);
}

/*
 * Slab caches set up once by proc_init_kmemcache():
 * proc_inode_cachep holds struct proc_inode objects, pde_opener_cache
 * holds struct pde_opener objects.
 */
static struct kmem_cache *proc_inode_cachep __ro_after_init;
static struct kmem_cache *pde_opener_cache __ro_after_init;

/*
 * ->alloc_inode for procfs: allocate a struct proc_inode from the proc
 * inode cache and reset all proc-specific fields.
 *
 * Returns the embedded VFS inode, or NULL if the allocation failed.
 */
static struct inode *proc_alloc_inode(struct super_block *sb)
{
        struct proc_inode *pi = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL);

        if (!pi)
                return NULL;

        /* Start out attached to nothing: no pid, pde, sysctl or namespace. */
        pi->pid = NULL;
        pi->fd = 0;
        pi->op.proc_get_link = NULL;
        pi->pde = NULL;
        pi->sysctl = NULL;
        pi->sysctl_entry = NULL;
        INIT_HLIST_NODE(&pi->sibling_inodes);
        pi->ns_ops = NULL;

        return &pi->vfs_inode;
}

/*
 * ->free_inode for procfs: release the pid and proc_dir_entry references
 * held by the inode, then return the proc_inode to its slab cache.
 */
static void proc_free_inode(struct inode *inode)
{
        struct proc_inode *pi = PROC_I(inode);

        if (pi->pid)
                put_pid(pi->pid);

        /* Let go of any associated proc directory entry */
        if (pi->pde)
                pde_put(pi->pde);

        kmem_cache_free(proc_inode_cachep, pi);
}

/*
 * Slab constructor for proc_inode_cachep: one-time initialisation of the
 * VFS inode embedded in the freshly constructed object @foo.
 */
static void init_once(void *foo)
{
        struct proc_inode *pi = foo;

        inode_init_once(&pi->vfs_inode);
}

/*
 * Create the slab caches used by procfs: proc inodes, pde openers and
 * proc_dir_entry objects.  All caches are created with SLAB_PANIC.
 */
void __init proc_init_kmemcache(void)
{
        /* struct proc_dir_entry must be strictly smaller than SIZEOF_PDE. */
        BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE);

        proc_inode_cachep =
                kmem_cache_create("proc_inode_cache",
                                  sizeof(struct proc_inode), 0,
                                  (SLAB_RECLAIM_ACCOUNT|
                                   SLAB_ACCOUNT|
                                   SLAB_PANIC),
                                  init_once);

        pde_opener_cache =
                kmem_cache_create("pde_opener",
                                  sizeof(struct pde_opener), 0,
                                  SLAB_ACCOUNT|SLAB_PANIC,
                                  NULL);

        /* Only the inline name region is whitelisted for usercopy. */
        proc_dir_entry_cache =
                kmem_cache_create_usercopy("proc_dir_entry",
                                           SIZEOF_PDE, 0, SLAB_PANIC,
                                           offsetof(struct proc_dir_entry, inline_name),
                                           SIZEOF_PDE_INLINE_NAME,
                                           NULL);
}

/*
 * Invalidate the dentries of every proc inode linked on @inodes.
 *
 * @inodes: hlist of struct proc_inode, linked through ->sibling_inodes.
 * @lock:   spinlock taken around each removal from @inodes.
 *
 * Entries are popped one at a time from the head of the list until it is
 * empty.  For each inode a superblock ->s_active reference and an inode
 * reference (igrab()) are taken while still inside the RCU read-side
 * section; the RCU section is then left before the dentries are
 * invalidated and re-entered before looking at the list again.
 */
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
{
        struct hlist_node *node;
        /* Superblock whose ->s_active reference is currently held, if any. */
        struct super_block *old_sb = NULL;

        rcu_read_lock();
        while ((node = hlist_first_rcu(inodes))) {
                struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes);
                struct super_block *sb;
                struct inode *inode;

                /* Unlink first, so the loop makes progress even when the inode is skipped. */
                spin_lock(lock);
                hlist_del_init_rcu(&ei->sibling_inodes);
                spin_unlock(lock);

                inode = &ei->vfs_inode;
                sb = inode->i_sb;
                /*
                 * A new ->s_active reference is only needed when the
                 * superblock differs from the one already held; skip the
                 * inode if that count has already dropped to zero.
                 */
                if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active))
                        continue;
                inode = igrab(inode);
                rcu_read_unlock();
                /* Switched superblocks: drop the previous reference, remember the new one. */
                if (sb != old_sb) {
                        if (old_sb)
                                deactivate_super(old_sb);
                        old_sb = sb;
                }
                /* igrab() failed: nothing to invalidate, re-enter RCU and move on. */
                if (unlikely(!inode)) {
                        rcu_read_lock();
                        continue;
                }

                if (S_ISDIR(inode->i_mode)) {
                        /* Directories: invalidate the single alias returned, if any. */
                        struct dentry *dir = d_find_any_alias(inode);
                        if (dir) {
                                d_invalidate(dir);
                                dput(dir);
                        }
                } else {
                        /* Non-directories: keep going until d_find_alias() finds no more. */
                        struct dentry *dentry;
                        while ((dentry = d_find_alias(inode))) {
                                d_invalidate(dentry);
                                dput(dentry);
                        }
                }
                iput(inode);

                rcu_read_lock();
        }
        rcu_read_unlock();
        /* Release the superblock reference kept across iterations. */
        if (old_sb)
                deactivate_super(old_sb);
}

/*
 * Map a hidepid setting to the string shown in the mount options.
 *
 * Values outside the known set trigger a one-time warning and are
 * reported as "unknown".
 */
static inline const char *hidepid2str(enum proc_hidepid v)
{
        switch (v) {
        case HIDEPID_OFF:
                return "off";
        case HIDEPID_NO_ACCESS:
                return "noaccess";
        case HIDEPID_INVISIBLE:
                return "invisible";
        case HIDEPID_NOT_PTRACEABLE:
                return "ptraceable";
        }

        WARN_ONCE(1, "bad hide_pid value: %d\n", v);
        return "unknown";
}

/*
 * ->show_options for procfs: append every mount option that differs from
 * its default (gid=, hidepid=, subset=pid) to @seq.  Always returns 0.
 */
static int proc_show_options(struct seq_file *seq, struct dentry *root)
{
        struct proc_fs_info *info = proc_sb_info(root->d_sb);

        if (!gid_eq(info->pid_gid, GLOBAL_ROOT_GID)) {
                seq_printf(seq, ",gid=%u",
                           from_kgid_munged(&init_user_ns, info->pid_gid));
        }

        if (info->hide_pid != HIDEPID_OFF) {
                seq_printf(seq, ",hidepid=%s",
                           hidepid2str(info->hide_pid));
        }

        if (info->pidonly != PROC_PIDONLY_OFF) {
                seq_printf(seq, ",subset=pid");
        }

        return 0;
}

/* Superblock operations for procfs. */
const struct super_operations proc_sops = {
        .alloc_inode = proc_alloc_inode,
        .free_inode = proc_free_inode,
        .drop_inode = generic_delete_inode,
        .evict_inode = proc_evict_inode,
        .statfs = simple_statfs,
        .show_options = proc_show_options,
};

/*
 * Value added to ->in_use by proc_entry_rundown(); unuse_pde() compares
 * the decremented counter against it to detect the last in-flight user.
 */
enum { BIAS = -1U << 31 };

/*
 * Try to take a usage reference on @pde.
 *
 * Increments ->in_use unless it is negative and returns nonzero on
 * success; a zero return means the counter was negative and no reference
 * was taken.  A successful call is paired with unuse_pde().
 */
static inline int use_pde(struct proc_dir_entry *pde)
{
        return likely(atomic_inc_unless_negative(&pde->in_use));
}

/*
 * Drop a usage reference on @pde taken with use_pde().
 *
 * If the decremented counter equals BIAS, signal ->pde_unload_completion.
 */
static void unuse_pde(struct proc_dir_entry *pde)
{
        int remaining = atomic_dec_return(&pde->in_use);

        if (unlikely(remaining == BIAS))
                complete(pde->pde_unload_completion);
}

/*
 * Run the ->proc_release hook for one opener of @pde exactly once.
 *
 * At most 2 contexts can enter this function: the one doing the last
 * close on the descriptor and whoever is deleting PDE itself.
 *
 * First to enter calls ->proc_release hook and signals its completion
 * to the second one which waits and then does nothing.
 *
 * PDE is locked on entry, unlocked on exit: the caller must hold
 * pde->pde_unload_lock, and it is always released before returning.
 * The first entrant also unlinks @pdeo from the opener list and frees it.
 */
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
        __releases(&pde->pde_unload_lock)
{
        /*
         * close() (proc_reg_release()) can't delete an entry and proceed:
         * ->release hook needs to be available at the right moment.
         *
         * rmmod (remove_proc_entry() et al) can't delete an entry and proceed:
         * "struct file" needs to be available at the right moment.
         */
        if (pdeo->closing) {
                /* somebody else is doing that, just wait */
                DECLARE_COMPLETION_ONSTACK(c);
                /* Publish the completion while still holding the lock. */
                pdeo->c = &c;
                spin_unlock(&pde->pde_unload_lock);
                wait_for_completion(&c);
        } else {
                struct file *file;
                struct completion *c;

                /* Claim the close under the lock, then call the hook unlocked. */
                pdeo->closing = true;
                spin_unlock(&pde->pde_unload_lock);

                file = pdeo->file;
                pde->proc_ops->proc_release(file_inode(file), file);

                spin_lock(&pde->pde_unload_lock);
                /* Strictly after ->proc_release, see above. */
                list_del(&pdeo->lh);
                /* Pick up a waiter's completion, if one registered meanwhile. */
                c = pdeo->c;
                spin_unlock(&pde->pde_unload_lock);
                if (unlikely(c))
                        complete(c);
                kmem_cache_free(pde_opener_cache, pdeo);
        }
}

/*
 * Quiesce @de before it goes away.
 *
 * Adds BIAS to ->in_use and, unless the result is exactly BIAS, waits for
 * the completion that unuse_pde() signals when the counter reaches BIAS.
 * Afterwards every remaining opener on ->pde_openers is closed through
 * close_pdeo().
 */
void proc_entry_rundown(struct proc_dir_entry *de)
{
        DECLARE_COMPLETION_ONSTACK(c);
        /* Wait until all existing callers into module are done. */
        de->pde_unload_completion = &c;
        if (atomic_add_return(BIAS, &de->in_use) != BIAS)
                wait_for_completion(&c);

        /* ->pde_openers list can't grow from now on. */

        spin_lock(&de->pde_unload_lock);
        while (!list_empty(&de->pde_openers)) {
                struct pde_opener *pdeo;
                pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
                /* close_pdeo() returns with the lock dropped; take it again. */
                close_pdeo(de, pdeo);
                spin_lock(&de->pde_unload_lock);
        }
        spin_unlock(&de->pde_unload_lock);
}

/*
 * ->llseek for regular proc files: forward to the entry's ->proc_lseek.
 *
 * Permanent entries are called directly; others are called between
 * use_pde()/unuse_pde().  Returns -EINVAL if the entry can no longer be
 * used.
 */
static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        loff_t pos;

        if (pde_is_permanent(pde))
                return pde->proc_ops->proc_lseek(file, offset, whence);

        if (!use_pde(pde))
                return -EINVAL;

        pos = pde->proc_ops->proc_lseek(file, offset, whence);
        unuse_pde(pde);
        return pos;
}

/*
 * ->read_iter for regular proc files: forward to the entry's
 * ->proc_read_iter.
 *
 * Permanent entries are called directly; others are called between
 * use_pde()/unuse_pde().  Returns -EIO if the entry can no longer be used.
 */
static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp));
        ssize_t nread = -EIO;

        if (pde_is_permanent(pde)) {
                nread = pde->proc_ops->proc_read_iter(iocb, iter);
        } else if (use_pde(pde)) {
                nread = pde->proc_ops->proc_read_iter(iocb, iter);
                unuse_pde(pde);
        }
        return nread;
}

/* Call @pde's ->proc_read hook, or return -EIO when the entry has none. */
static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        typeof_member(struct proc_ops, proc_read) hook = pde->proc_ops->proc_read;

        if (!hook)
                return -EIO;

        return hook(file, buf, count, ppos);
}

/*
 * ->read for regular proc files.
 *
 * Permanent entries are read directly; others are read between
 * use_pde()/unuse_pde().  Returns -EIO if the entry can no longer be used.
 */
static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        ssize_t nread;

        if (pde_is_permanent(pde))
                return pde_read(pde, file, buf, count, ppos);

        if (!use_pde(pde))
                return -EIO;

        nread = pde_read(pde, file, buf, count, ppos);
        unuse_pde(pde);
        return nread;
}

/* Call @pde's ->proc_write hook, or return -EIO when the entry has none. */
static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
        typeof_member(struct proc_ops, proc_write) hook = pde->proc_ops->proc_write;

        if (!hook)
                return -EIO;

        return hook(file, buf, count, ppos);
}

/*
 * ->write for regular proc files.
 *
 * Permanent entries are written directly; others are written between
 * use_pde()/unuse_pde().  Returns -EIO if the entry can no longer be used.
 */
static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        ssize_t nwritten;

        if (pde_is_permanent(pde))
                return pde_write(pde, file, buf, count, ppos);

        if (!use_pde(pde))
                return -EIO;

        nwritten = pde_write(pde, file, buf, count, ppos);
        unuse_pde(pde);
        return nwritten;
}

static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts)
{
        typeof_member(struct proc_ops, proc_poll) poll;

        poll = pde->proc_ops->proc_poll;
        if (poll)
                return poll(file, pts);
        return DEFAULT_POLLMASK;
}

/*
 * ->poll for regular proc files.
 *
 * Permanent entries are polled directly; others are polled between
 * use_pde()/unuse_pde().  Returns DEFAULT_POLLMASK if the entry can no
 * longer be used.
 */
static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        __poll_t mask;

        if (pde_is_permanent(pde))
                return pde_poll(pde, file, pts);

        if (!use_pde(pde))
                return DEFAULT_POLLMASK;

        mask = pde_poll(pde, file, pts);
        unuse_pde(pde);
        return mask;
}

static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
{
        typeof_member(struct proc_ops, proc_ioctl) ioctl;

        ioctl = pde->proc_ops->proc_ioctl;
        if (ioctl)
                return ioctl(file, cmd, arg);
        return -ENOTTY;
}

/*
 * ->unlocked_ioctl for regular proc files.
 *
 * Permanent entries are called directly; others are called between
 * use_pde()/unuse_pde().  Returns -ENOTTY if the entry can no longer be
 * used.
 */
static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        long ret;

        if (pde_is_permanent(pde))
                return pde_ioctl(pde, file, cmd, arg);

        if (!use_pde(pde))
                return -ENOTTY;

        ret = pde_ioctl(pde, file, cmd, arg);
        unuse_pde(pde);
        return ret;
}

#ifdef CONFIG_COMPAT
static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
{
        typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;

        compat_ioctl = pde->proc_ops->proc_compat_ioctl;
        if (compat_ioctl)
                return compat_ioctl(file, cmd, arg);
        return -ENOTTY;
}

/*
 * ->compat_ioctl for regular proc files.
 *
 * Permanent entries are called directly; others are called between
 * use_pde()/unuse_pde().  Returns -ENOTTY if the entry can no longer be
 * used.
 */
static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        long ret;

        if (pde_is_permanent(pde))
                return pde_compat_ioctl(pde, file, cmd, arg);

        if (!use_pde(pde))
                return -ENOTTY;

        ret = pde_compat_ioctl(pde, file, cmd, arg);
        unuse_pde(pde);
        return ret;
}
#endif

static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma)
{
        typeof_member(struct proc_ops, proc_mmap) mmap;

        mmap = pde->proc_ops->proc_mmap;
        if (mmap)
                return mmap(file, vma);
        return -EIO;
}

/*
 * ->mmap for regular proc files.
 *
 * Permanent entries are called directly; others are called between
 * use_pde()/unuse_pde().  Returns -EIO if the entry can no longer be used.
 */
static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        int ret;

        if (pde_is_permanent(pde))
                return pde_mmap(pde, file, vma);

        if (!use_pde(pde))
                return -EIO;

        ret = pde_mmap(pde, file, vma);
        unuse_pde(pde);
        return ret;
}

/*
 * Pick an unmapped area for a proc file mapping.
 *
 * Uses the entry's ->proc_get_unmapped_area hook when present.  Otherwise
 * falls back to mm_get_unmapped_area() on CONFIG_MMU kernels, and to
 * returning @orig_addr unchanged on kernels without CONFIG_MMU.
 */
static unsigned long
pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr,
                           unsigned long len, unsigned long pgoff,
                           unsigned long flags)
{
        if (pde->proc_ops->proc_get_unmapped_area) {
                return pde->proc_ops->proc_get_unmapped_area(file, orig_addr,
                                                             len, pgoff, flags);
        }

#ifdef CONFIG_MMU
        return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags);
#else
        return orig_addr;
#endif
}

/*
 * ->get_unmapped_area for regular proc files.
 *
 * Permanent entries are called directly; others are called between
 * use_pde()/unuse_pde().  Returns -EIO (as unsigned long) if the entry can
 * no longer be used.
 */
static unsigned long
proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
                           unsigned long len, unsigned long pgoff,
                           unsigned long flags)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        unsigned long addr;

        if (pde_is_permanent(pde))
                return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);

        if (!use_pde(pde))
                return -EIO;

        addr = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
        unuse_pde(pde);
        return addr;
}

/*
 * ->open for regular proc files.
 *
 * Calls the entry's ->proc_open hook (if any) and, for non-permanent
 * entries that have a ->proc_release hook, records the opened file on
 * ->pde_openers so the release hook can later be run for it.
 *
 * Returns the hook's result (0 if there is no hook), -ENOENT if the entry
 * can no longer be used, or -ENOMEM if the opener record cannot be
 * allocated.
 */
static int proc_reg_open(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *pde = PDE(inode);
        int rv = 0;
        typeof_member(struct proc_ops, proc_open) open;
        typeof_member(struct proc_ops, proc_release) release;
        struct pde_opener *pdeo;

        /* Entries without a ->proc_lseek hook are not seekable. */
        if (!pde->proc_ops->proc_lseek)
                file->f_mode &= ~FMODE_LSEEK;

        /* Permanent entries: no use counting and no opener tracking. */
        if (pde_is_permanent(pde)) {
                open = pde->proc_ops->proc_open;
                if (open)
                        rv = open(inode, file);
                return rv;
        }

        /*
         * Ensure that
         * 1) PDE's ->release hook will be called no matter what
         *    either normally by close()/->release, or forcefully by
         *    rmmod/remove_proc_entry.
         *
         * 2) rmmod isn't blocked by opening file in /proc and sitting on
         *    the descriptor (including "rmmod foo </proc/foo" scenario).
         *
         * Save every "struct file" with custom ->release hook.
         */
        if (!use_pde(pde))
                return -ENOENT;

        /* Allocate the opener record up front, before ->proc_open runs. */
        release = pde->proc_ops->proc_release;
        if (release) {
                pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL);
                if (!pdeo) {
                        rv = -ENOMEM;
                        goto out_unuse;
                }
        }

        open = pde->proc_ops->proc_open;
        if (open)
                rv = open(inode, file);

        /* pdeo is only valid (and only touched) when a release hook exists. */
        if (release) {
                if (rv == 0) {
                        /* To know what to release. */
                        pdeo->file = file;
                        pdeo->closing = false;
                        pdeo->c = NULL;
                        spin_lock(&pde->pde_unload_lock);
                        list_add(&pdeo->lh, &pde->pde_openers);
                        spin_unlock(&pde->pde_unload_lock);
                } else
                        kmem_cache_free(pde_opener_cache, pdeo);
        }

out_unuse:
        unuse_pde(pde);
        return rv;
}

/*
 * ->release for regular proc files.
 *
 * Permanent entries call their ->proc_release hook directly and return
 * its result.  For other entries the opener record matching @file is
 * looked up on ->pde_openers and closed through close_pdeo(); the return
 * value is then always 0.
 */
static int proc_reg_release(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *pde = PDE(inode);
        struct pde_opener *pdeo;

        if (pde_is_permanent(pde)) {
                typeof_member(struct proc_ops, proc_release) release;

                release = pde->proc_ops->proc_release;
                if (release) {
                        return release(inode, file);
                }
                return 0;
        }

        spin_lock(&pde->pde_unload_lock);
        list_for_each_entry(pdeo, &pde->pde_openers, lh) {
                if (pdeo->file == file) {
                        /* close_pdeo() drops pde_unload_lock, so no unlock here. */
                        close_pdeo(pde, pdeo);
                        return 0;
                }
        }
        /* No opener record for this file: nothing to release. */
        spin_unlock(&pde->pde_unload_lock);
        return 0;
}

/* File operations for regular proc files using ->proc_read (no read_iter). */
static const struct file_operations proc_reg_file_ops = {
        .llseek = proc_reg_llseek,
        .read = proc_reg_read,
        .write = proc_reg_write,
        .poll = proc_reg_poll,
        .unlocked_ioctl = proc_reg_unlocked_ioctl,
        .mmap = proc_reg_mmap,
        .get_unmapped_area = proc_reg_get_unmapped_area,
        .open = proc_reg_open,
        .release = proc_reg_release,
};

/* File operations for regular proc files that provide ->proc_read_iter. */
static const struct file_operations proc_iter_file_ops = {
        .llseek = proc_reg_llseek,
        .read_iter = proc_reg_read_iter,
        .write = proc_reg_write,
        .splice_read = copy_splice_read,
        .poll = proc_reg_poll,
        .unlocked_ioctl = proc_reg_unlocked_ioctl,
        .mmap = proc_reg_mmap,
        .get_unmapped_area = proc_reg_get_unmapped_area,
        .open = proc_reg_open,
        .release = proc_reg_release,
};

#ifdef CONFIG_COMPAT
/* Same as proc_reg_file_ops, plus a ->compat_ioctl handler. */
static const struct file_operations proc_reg_file_ops_compat = {
        .llseek = proc_reg_llseek,
        .read = proc_reg_read,
        .write = proc_reg_write,
        .poll = proc_reg_poll,
        .unlocked_ioctl = proc_reg_unlocked_ioctl,
        .compat_ioctl = proc_reg_compat_ioctl,
        .mmap = proc_reg_mmap,
        .get_unmapped_area = proc_reg_get_unmapped_area,
        .open = proc_reg_open,
        .release = proc_reg_release,
};

/* Same as proc_iter_file_ops, plus a ->compat_ioctl handler. */
static const struct file_operations proc_iter_file_ops_compat = {
        .llseek = proc_reg_llseek,
        .read_iter = proc_reg_read_iter,
        .write = proc_reg_write,
        .splice_read = copy_splice_read,
        .poll = proc_reg_poll,
        .unlocked_ioctl = proc_reg_unlocked_ioctl,
        .compat_ioctl = proc_reg_compat_ioctl,
        .mmap = proc_reg_mmap,
        .get_unmapped_area = proc_reg_get_unmapped_area,
        .open = proc_reg_open,
        .release = proc_reg_release,
};
#endif

/*
 * Delayed-call callback registered by proc_get_link(): drops the usage
 * reference on the proc_dir_entry passed in @p.
 */
static void proc_put_link(void *p)
{
        struct proc_dir_entry *pde = p;

        unuse_pde(pde);
}

/*
 * ->get_link for proc symlinks.
 *
 * Takes a usage reference on the entry, arranges for it to be dropped via
 * @done, and returns the entry's ->data as the link target.  Returns
 * ERR_PTR(-EINVAL) if the entry can no longer be used.
 */
static const char *proc_get_link(struct dentry *dentry,
                                 struct inode *inode,
                                 struct delayed_call *done)
{
        struct proc_dir_entry *pde = PDE(inode);

        if (use_pde(pde)) {
                set_delayed_call(done, proc_put_link, pde);
                return pde->data;
        }

        return ERR_PTR(-EINVAL);
}

/* Inode operations for proc symlinks: only ->get_link is provided. */
const struct inode_operations proc_link_inode_operations = {
        .get_link = proc_get_link,
};

/*
 * Allocate an inode on @sb for proc entry @de and initialise it from the
 * entry.
 *
 * If new_inode() fails, @de is released with pde_put() and NULL is returned.
 * An entry for which is_empty_pde() is true yields an empty-directory inode.
 * Otherwise mode/uid/gid, size and link count are copied from @de when they
 * are non-zero, and the inode/file operations are chosen by file type; any
 * type other than regular file, directory or symlink hits BUG().
 */
struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
{
        struct inode *inode;

        inode = new_inode(sb);
        if (!inode) {
                pde_put(de);
                return NULL;
        }

        inode->i_ino = de->low_ino;
        inode->i_private = de->data;
        simple_inode_init_ts(inode);
        PROC_I(inode)->pde = de;

        if (is_empty_pde(de)) {
                make_empty_dir_inode(inode);
                return inode;
        }

        /* Only override the fresh inode's attributes the entry specifies. */
        if (de->mode) {
                inode->i_mode = de->mode;
                inode->i_uid = de->uid;
                inode->i_gid = de->gid;
        }
        if (de->size)
                inode->i_size = de->size;
        if (de->nlink)
                set_nlink(inode, de->nlink);

        if (S_ISDIR(inode->i_mode)) {
                inode->i_op = de->proc_iops;
                inode->i_fop = de->proc_dir_ops;
        } else if (S_ISLNK(inode->i_mode)) {
                inode->i_op = de->proc_iops;
                inode->i_fop = NULL;
        } else if (S_ISREG(inode->i_mode)) {
                /* Entries providing ->proc_read_iter get the iter tables. */
                int has_read_iter = de->proc_ops->proc_read_iter != NULL;

                inode->i_op = de->proc_iops;
                inode->i_fop = has_read_iter ? &proc_iter_file_ops
                                             : &proc_reg_file_ops;
#ifdef CONFIG_COMPAT
                /* Entries with a compat ioctl need the *_compat tables. */
                if (de->proc_ops->proc_compat_ioctl)
                        inode->i_fop = has_read_iter ?
                                        &proc_iter_file_ops_compat :
                                        &proc_reg_file_ops_compat;
#endif
        } else {
                BUG();
        }
        return inode;
}












































































































































    1 































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002        Andrew Morton
 *                Initial version.
 */

/**
 * DOC: Readahead Overview
 *
 * Readahead is used to read content into the page cache before it is
 * explicitly requested by the application.  Readahead only ever
 * attempts to read folios that are not yet in the page cache.  If a
 * folio is present but not up-to-date, readahead will not try to read
 * it. In that case a simple ->read_folio() will be requested.
 *
 * Readahead is triggered when an application read request (whether a
 * system call or a page fault) finds that the requested folio is not in
 * the page cache, or that it is in the page cache and has the
 * readahead flag set.  This flag indicates that the folio was read
 * as part of a previous readahead request and now that it has been
 * accessed, it is time for the next readahead.
 *
 * Each readahead request is partly a synchronous read and partly an async
 * readahead.  This is reflected in the struct file_ra_state which
 * contains ->size being the total number of pages, and ->async_size
 * which is the number of pages in the async section.  The readahead
 * flag will be set on the first folio in this async section to trigger
 * a subsequent readahead.  Once a series of sequential reads has been
 * established, there should be no need for a synchronous component and
 * all readahead requests will be fully asynchronous.
 *
 * When either of the triggers causes a readahead, three numbers need
 * to be determined: the start of the region to read, the size of the
 * region, and the size of the async tail.
 *
 * The start of the region is simply the first page address at or after
 * the accessed address, which is not currently populated in the page
 * cache.  This is found with a simple search in the page cache.
 *
 * The size of the async tail is determined by subtracting the size that
 * was explicitly requested from the determined request size, unless
 * this would be less than zero - then zero is used.  NOTE THIS
 * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED
 * PAGE.  ALSO THIS CALCULATION IS NOT USED CONSISTENTLY.
 *
 * The size of the region is normally determined from the size of the
 * previous readahead which loaded the preceding pages.  This may be
 * discovered from the struct file_ra_state for simple sequential reads,
 * or from examining the state of the page cache when multiple
 * sequential reads are interleaved.  Specifically: where the readahead
 * was triggered by the readahead flag, the size of the previous
 * readahead is assumed to be the number of pages from the triggering
 * page to the start of the new readahead.  In these cases, the size of
 * the previous readahead is scaled, often doubled, for the new
 * readahead, though see get_next_ra_size() for details.
 *
 * If the size of the previous read cannot be determined, the number of
 * preceding pages in the page cache is used to estimate the size of
 * a previous read.  This estimate could easily be misled by random
 * reads being coincidentally adjacent, so it is ignored unless it is
 * larger than the current request, and it is not scaled up, unless it
 * is at the start of file.
 *
 * In general readahead is accelerated at the start of the file, as
 * reads from there are often sequential.  There are other minor
 * adjustments to the readahead size in various special cases and these
 * are best discovered by reading the code.
 *
 * The above calculation, based on the previous readahead size,
 * determines the size of the readahead, to which any requested read
 * size may be added.
 *
 * Readahead requests are sent to the filesystem using the ->readahead()
 * address space operation, for which mpage_readahead() is a canonical
 * implementation.  ->readahead() should normally initiate reads on all
 * folios, but may fail to read any or all folios without causing an I/O
 * error.  The page cache reading code will issue a ->read_folio() request
 * for any folio which ->readahead() did not read, and only an error
 * from this will be final.
 *
 * ->readahead() will generally call readahead_folio() repeatedly to get
 * each folio from those prepared for readahead.  It may fail to read a
 * folio by:
 *
 * * not calling readahead_folio() sufficiently many times, effectively
 *   ignoring some folios, as might be appropriate if the path to
 *   storage is congested.
 *
 * * failing to actually submit a read request for a given folio,
 *   possibly due to insufficient resources, or
 *
 * * getting an error during subsequent processing of a request.
 *
 * In the last two cases, the folio should be unlocked by the filesystem
 * to indicate that the read attempt has failed.  In the first case the
 * folio will be unlocked by the VFS.
 *
 * Those folios not in the final ``async_size`` of the request should be
 * considered to be important and ->readahead() should not fail them due
 * to congestion or temporary resource unavailability, but should wait
 * for necessary resources (e.g.  memory or indexing information) to
 * become available.  Folios in the final ``async_size`` may be
 * considered less urgent and failure to read them is more acceptable.
 * In this case it is best to use filemap_remove_folio() to remove the
 * folios from the page cache as is automatically done for folios that
 * were not fetched with readahead_folio().  This will allow a
 * subsequent synchronous readahead request to try them again.  If they
 * are left in the page cache, then they will be read individually using
 * ->read_folio() which may be less efficient.
 */

#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagemap.h>
#include <linux/psi.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>

#include "internal.h"

/*
 * Set up the readahead state embedded in a struct file.  The caller is
 * expected to have zeroed *ra beforehand; only the two fields that need a
 * different starting value are written here: ->ra_pages is taken from the
 * backing device of @mapping's host inode and ->prev_pos starts at -1.
 */
void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
        ra->prev_pos = -1;
        ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

/*
 * Submit the folios currently queued in @rac for I/O.
 *
 * Does nothing when readahead_count() reports an empty batch.  Otherwise the
 * batch is handed to the mapping's ->readahead() if it has one, or fed folio
 * by folio to ->read_folio().  The submission is wrapped in a block plug,
 * and in a PSI memstall section when rac->_workingset is set.  On return
 * readahead_count() must be zero (enforced with BUG_ON()) and
 * rac->_workingset has been cleared.
 */
static void read_pages(struct readahead_control *rac)
{
        const struct address_space_operations *aops = rac->mapping->a_ops;
        struct folio *folio;
        struct blk_plug plug;

        if (!readahead_count(rac))
                return;

        if (unlikely(rac->_workingset))
                psi_memstall_enter(&rac->_pflags);
        blk_start_plug(&plug);

        if (aops->readahead) {
                aops->readahead(rac);
                /*
                 * Clean up the remaining folios.  The sizes in ->ra
                 * may be used to size the next readahead, so make sure
                 * they accurately reflect what happened.
                 */
                while ((folio = readahead_folio(rac)) != NULL) {
                        unsigned long nr = folio_nr_pages(folio);

                        /* Hold our own reference until after the unlock. */
                        folio_get(folio);
                        rac->ra->size -= nr;
                        /*
                         * Only a folio that still fits in the async tail is
                         * deducted from it and removed from the page cache;
                         * otherwise it stays cached and is merely unlocked.
                         */
                        if (rac->ra->async_size >= nr) {
                                rac->ra->async_size -= nr;
                                filemap_remove_folio(folio);
                        }
                        folio_unlock(folio);
                        folio_put(folio);
                }
        } else {
                /* No ->readahead(): read each folio individually. */
                while ((folio = readahead_folio(rac)) != NULL)
                        aops->read_folio(rac->file, folio);
        }

        blk_finish_plug(&plug);
        if (unlikely(rac->_workingset))
                psi_memstall_leave(&rac->_pflags);
        rac->_workingset = false;

        /* Every queued folio must have been consumed by now. */
        BUG_ON(readahead_count(rac));
}

/**
 * page_cache_ra_unbounded - Start unchecked readahead.
 * @ractl: Readahead control.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead: the folio added at
 *        offset @nr_to_read - @lookahead_size from the start of the request
 *        gets the readahead flag.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size.  This is almost certainly
 * not the function you want to call.  Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 *
 * Order-0 folios are allocated and added to the page cache one index at a
 * time.  Whenever an index cannot be filled (a folio is already present, or
 * adding fails with anything other than -ENOMEM) the batch gathered so far
 * is submitted and a new batch is started after that index.  Allocation
 * failure or -ENOMEM ends the loop early; whatever was gathered is still
 * submitted.
 *
 * Context: File is referenced by caller.  Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_ra_unbounded(struct readahead_control *ractl,
                unsigned long nr_to_read, unsigned long lookahead_size)
{
        struct address_space *mapping = ractl->mapping;
        unsigned long index = readahead_index(ractl);
        gfp_t gfp_mask = readahead_gfp_mask(mapping);
        unsigned long i;

        /*
         * Partway through the readahead operation, we will have added
         * locked pages to the page cache, but will not yet have submitted
         * them for I/O.  Adding another page may need to allocate memory,
         * which can trigger memory reclaim.  Telling the VM we're in
         * the middle of a filesystem operation will cause it to not
         * touch file-backed pages, preventing a deadlock.  Most (all?)
         * filesystems already specify __GFP_NOFS in their mapping's
         * gfp_mask, but let's be explicit here.
         */
        unsigned int nofs = memalloc_nofs_save();

        filemap_invalidate_lock_shared(mapping);
        /*
         * Preallocate as many pages as we will need.
         */
        for (i = 0; i < nr_to_read; i++) {
                struct folio *folio = xa_load(&mapping->i_pages, index + i);
                int ret;

                if (folio && !xa_is_value(folio)) {
                        /*
                         * Page already present?  Kick off the current batch
                         * of contiguous pages before continuing with the
                         * next batch.  This page may be the one we would
                         * have intended to mark as Readahead, but we don't
                         * have a stable reference to this page, and it's
                         * not worth getting one just for that.
                         */
                        read_pages(ractl);
                        /* Step the batch start over the occupied index. */
                        ractl->_index++;
                        /*
                         * Resynchronise the loop counter with the control
                         * block: after the loop's i++, index + i equals
                         * ractl->_index + ractl->_nr_pages.
                         */
                        i = ractl->_index + ractl->_nr_pages - index - 1;
                        continue;
                }

                folio = filemap_alloc_folio(gfp_mask, 0);
                if (!folio)
                        break;

                ret = filemap_add_folio(mapping, folio, index + i, gfp_mask);
                if (ret < 0) {
                        folio_put(folio);
                        /* Out of memory: stop adding, submit what we have. */
                        if (ret == -ENOMEM)
                                break;
                        /*
                         * Any other failure is handled like an occupied
                         * index: flush the batch and skip past it.
                         */
                        read_pages(ractl);
                        ractl->_index++;
                        i = ractl->_index + ractl->_nr_pages - index - 1;
                        continue;
                }
                /* Flag the folio whose access should trigger the next round. */
                if (i == nr_to_read - lookahead_size)
                        folio_set_readahead(folio);
                ractl->_workingset |= folio_test_workingset(folio);
                ractl->_nr_pages++;
        }

        /*
         * Now start the IO.  We ignore I/O errors - if the folio is not
         * uptodate then the caller will launch read_folio again, and
         * will then handle the error.
         */
        read_pages(ractl);
        filemap_invalidate_unlock_shared(mapping);
        memalloc_nofs_restore(nofs);
}
EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);

/*
 * do_page_cache_ra() is the i_size-bounded front end to
 * page_cache_ra_unbounded().  Pages are allocated first and only then
 * submitted for I/O, which avoids the very bad behaviour that would occur
 * if page allocations caused VM writeback: reads and writes should not be
 * intermingled like that.
 *
 * Nothing is read for an empty file or when the request starts beyond the
 * last page of the file; otherwise @nr_to_read is trimmed so the request
 * ends no later than the page containing the file's last byte.
 */
static void do_page_cache_ra(struct readahead_control *ractl,
                unsigned long nr_to_read, unsigned long lookahead_size)
{
        unsigned long first = readahead_index(ractl);
        loff_t file_size = i_size_read(ractl->mapping->host);
        pgoff_t last;                /* index of the page holding the final byte */
        unsigned long room;        /* pages between @first and @last */

        if (!file_size)
                return;

        last = (file_size - 1) >> PAGE_SHIFT;
        if (first > last)
                return;

        /* Clamp the request so that it stops at the file's last page. */
        room = last - first;
        if (nr_to_read > room)
                nr_to_read = room + 1;

        page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}

/*
 * Read @nr_to_read pages starting at the current readahead index without
 * consulting the readahead heuristics.  The request is capped at the larger
 * of the device's io_pages and the file's ra_pages, and is issued in chunks
 * of at most 2 megabytes so that we don't pin too much memory at once.
 * Mappings with neither ->read_folio() nor ->readahead() are ignored.
 */
void force_page_cache_ra(struct readahead_control *ractl,
                unsigned long nr_to_read)
{
        struct address_space *mapping = ractl->mapping;
        struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
        struct file_ra_state *ra = ractl->ra;
        const unsigned long chunk_max = (2 * 1024 * 1024) / PAGE_SIZE;
        unsigned long pos, cap, left;

        if (unlikely(!(mapping->a_ops->read_folio || mapping->a_ops->readahead)))
                return;

        /*
         * If the request exceeds the readahead window, allow the read to
         * be up to the optimal hardware IO size
         */
        pos = readahead_index(ractl);
        cap = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
        left = min_t(unsigned long, nr_to_read, cap);

        while (left) {
                unsigned long step = left < chunk_max ? left : chunk_max;

                ractl->_index = pos;
                do_page_cache_ra(ractl, step, 0);

                pos += step;
                left -= step;
        }
}

/*
 * Pick the initial readahead window for a request of @size pages, given a
 * maximum window of @max pages.  @size is first rounded up to a power of
 * two; the result is then
 *   - multiplied by 4 if it is at most max / 32,
 *   - multiplied by 2 if it is at most max / 4,
 *   - replaced by @max otherwise.
 *
 * Example for max = 32 pages (128k with 4k pages):
 * 1-2 pages -> 16k, 3-4 pages -> 32k, 5-8 pages -> 64k, > 8 pages -> 128k.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
        unsigned long rounded = roundup_pow_of_two(size);

        if (rounded > max / 4)
                return max;
        if (rounded > max / 32)
                return rounded * 2;
        return rounded * 4;
}

/*
 * Compute the next window size from the previous one in ra->size:
 * quadruple it while it is below max / 16, double it while it is at most
 * max / 2, and otherwise jump straight to @max.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
                                      unsigned long max)
{
        unsigned long prev = ra->size;
        unsigned long next;

        if (prev < max / 16)
                next = 4 * prev;
        else if (prev <= max / 2)
                next = 2 * prev;
        else
                next = max;

        return next;
}

/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window. Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * be invalidating each other's readahead state. So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as readahead
 * indicator. The flag won't be set on already cached pages, to avoid the
 * readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads. Note that the readahead algorithm checks loosely
 * for sequential patterns. Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special-case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial size
 * based on I/O request size and the max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */

/*
 * Count the contiguously cached pages immediately before @index, looking
 * back from @index-1 over at most @max pages.  The count is a conservative
 * estimation of
 *         - length of the sequential read sequence, or
 *         - thrashing threshold in memory tight systems
 */
static pgoff_t count_history_pages(struct address_space *mapping,
                                   pgoff_t index, unsigned long max)
{
        pgoff_t last = index - 1;
        pgoff_t miss;

        rcu_read_lock();
        miss = page_cache_prev_miss(mapping, last, max);
        rcu_read_unlock();

        return last - miss;
}

/*
 * Page cache context based readahead.
 *
 * Uses the number of cached pages directly preceding @index as evidence of
 * a sequential stream.  Returns 1 after filling in @ra when the evidence is
 * good enough, or 0 (leaving @ra untouched) when it is not.
 */
static int try_context_readahead(struct address_space *mapping,
                                 struct file_ra_state *ra,
                                 pgoff_t index,
                                 unsigned long req_size,
                                 unsigned long max)
{
        pgoff_t history = count_history_pages(mapping, index, max);

        /* Not more history than the request itself: may be a random read. */
        if (history <= req_size)
                return 0;

        /*
         * History reaching back to the start of the file strongly suggests
         * a long-running stream (or a whole-file read), so count it double.
         */
        if (history >= index)
                history *= 2;

        ra->start = index;
        ra->size = min(history + req_size, max);
        ra->async_size = 1;
        return 1;
}

/*
 * Allocate a folio of the given @order, insert it into the page cache at
 * @index and account it in @ractl.  The folio gets the readahead flag when
 * @index equals @mark rounded down to the folio size.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, or the error from
 * filemap_add_folio(); on error the folio is released and @ractl is left
 * unchanged.
 */
static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
                pgoff_t mark, unsigned int order, gfp_t gfp)
{
        struct folio *new_folio;
        int ret;

        new_folio = filemap_alloc_folio(gfp, order);
        if (!new_folio)
                return -ENOMEM;

        if (round_down(mark, 1UL << order) == index)
                folio_set_readahead(new_folio);

        ret = filemap_add_folio(ractl->mapping, new_folio, index, gfp);
        if (ret) {
                folio_put(new_folio);
                return ret;
        }

        ractl->_nr_pages += 1UL << order;
        ractl->_workingset |= folio_test_workingset(new_folio);
        return 0;
}

/*
 * Issue the readahead described by @ra, using large folios where possible.
 *
 * @ractl:     readahead control; its index is the first page to read.
 * @ra:        window to read (->size pages, the last ->async_size of which
 *             form the async tail).
 * @new_order: folio order to build on, supplied by the caller.
 *
 * When the mapping has no large folio support or the window is smaller than
 * four pages, the whole request is passed to do_page_cache_ra().  The same
 * fallback is taken after the large-folio pass if adding a folio failed.
 */
void page_cache_ra_order(struct readahead_control *ractl,
                struct file_ra_state *ra, unsigned int new_order)
{
        struct address_space *mapping = ractl->mapping;
        pgoff_t index = readahead_index(ractl);
        /* Index of the page holding the last byte of the file. */
        pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
        /* First index of the async tail: where the readahead flag goes. */
        pgoff_t mark = index + ra->size - ra->async_size;
        unsigned int nofs;
        int err = 0;
        gfp_t gfp = readahead_gfp_mask(mapping);

        if (!mapping_large_folio_support(mapping) || ra->size < 4)
                goto fallback;

        /* Stop at EOF or at the end of the window, whichever comes first. */
        limit = min(limit, index + ra->size - 1);

        /*
         * Raise the order by two, capped by MAX_PAGECACHE_ORDER and by
         * the largest order that still fits in the window.
         */
        if (new_order < MAX_PAGECACHE_ORDER) {
                new_order += 2;
                new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order);
                new_order = min_t(unsigned int, new_order, ilog2(ra->size));
        }

        /* See comment in page_cache_ra_unbounded() */
        nofs = memalloc_nofs_save();
        filemap_invalidate_lock_shared(mapping);
        while (index <= limit) {
                unsigned int order = new_order;

                /* Align with smaller pages if needed */
                if (index & ((1UL << order) - 1))
                        order = __ffs(index);
                /* Don't allocate pages past EOF */
                while (index + (1UL << order) - 1 > limit)
                        order--;
                err = ra_alloc_folio(ractl, index, mark, order, gfp);
                if (err)
                        break;
                index += 1UL << order;
        }

        /*
         * Add any pages placed beyond @limit to the window sizes.
         * NOTE(review): with the trimming loop above, index does not appear
         * to get past limit + 1, so this looks like it adds zero - confirm.
         */
        if (index > limit) {
                ra->size += index - limit - 1;
                ra->async_size += index - limit - 1;
        }

        read_pages(ractl);
        filemap_invalidate_unlock_shared(mapping);
        memalloc_nofs_restore(nofs);

        /*
         * If there were already pages in the page cache, then we may have
         * left some gaps.  Let the regular readahead code take care of this
         * situation.
         */
        if (!err)
                return;
fallback:
        do_page_cache_ra(ractl, ra->size, ra->async_size);
}

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 *
 * @ractl:    readahead control; its index is the page being accessed.
 * @folio:    the folio that triggered the call, or NULL.  In this file
 *            page_cache_async_ra() passes the folio it was given and
 *            page_cache_sync_ra() passes NULL.
 * @req_size: size of the read request, in pages.
 *
 * Decides on a new window (ra->start, ra->size, ra->async_size) from the
 * access pattern and hands it to page_cache_ra_order().  Two paths bypass
 * that: a marked folio for which no end of the cached run is found within
 * max_pages returns without reading, and a standalone small random read is
 * issued directly through do_page_cache_ra() without updating @ra.
 */
static void ondemand_readahead(struct readahead_control *ractl,
                struct folio *folio, unsigned long req_size)
{
        struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
        struct file_ra_state *ra = ractl->ra;
        unsigned long max_pages = ra->ra_pages;
        unsigned long add_pages;
        pgoff_t index = readahead_index(ractl);
        pgoff_t expected, prev_index;
        unsigned int order = folio ? folio_order(folio) : 0;

        /*
         * If the request exceeds the readahead window, allow the read to
         * be up to the optimal hardware IO size
         */
        if (req_size > max_pages && bdi->io_pages > max_pages)
                max_pages = min(req_size, bdi->io_pages);

        /*
         * start of file
         */
        if (!index)
                goto initial_readahead;

        /*
         * It's the expected callback index, assume sequential access.
         * Ramp up sizes, and push forward the readahead window.
         * The marker position is rounded down to the triggering folio's
         * size before it is compared with @index.
         */
        expected = round_down(ra->start + ra->size - ra->async_size,
                        1UL << order);
        if (index == expected || index == (ra->start + ra->size)) {
                ra->start += ra->size;
                ra->size = get_next_ra_size(ra, max_pages);
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * Hit a marked folio without valid readahead state.
         * E.g. interleaved reads.
         * Query the pagecache for async_size, which normally equals to
         * readahead size. Ramp it up and use it as the new readahead size.
         */
        if (folio) {
                pgoff_t start;

                rcu_read_lock();
                start = page_cache_next_miss(ractl->mapping, index + 1,
                                max_pages);
                rcu_read_unlock();

                /* No usable gap found within max_pages: nothing to do. */
                if (!start || start - index > max_pages)
                        return;

                ra->start = start;
                ra->size = start - index;        /* old async_size */
                ra->size += req_size;
                ra->size = get_next_ra_size(ra, max_pages);
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * oversize read
         */
        if (req_size > max_pages)
                goto initial_readahead;

        /*
         * sequential cache miss
         * trivial case: (index - prev_index) == 1
         * unaligned reads: (index - prev_index) == 0
         */
        prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
        if (index - prev_index <= 1UL)
                goto initial_readahead;

        /*
         * Query the page cache and look for the traces (cached history
         * pages) that a sequential stream would leave behind.
         */
        if (try_context_readahead(ractl->mapping, ra, index, req_size,
                        max_pages))
                goto readit;

        /*
         * standalone, small random read
         * Read as is, and do not pollute the readahead state.
         */
        do_page_cache_ra(ractl, req_size, 0);
        return;

initial_readahead:
        ra->start = index;
        ra->size = get_init_ra_size(req_size, max_pages);
        /* Async tail is what exceeds the request, or the whole window. */
        ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
        /*
         * Will this read hit the readahead marker made by itself?
         * If so, trigger the readahead marker hit now, and merge
         * the resulted next readahead window into the current one.
         * Take care of maximum IO pages as above.
         */
        if (index == ra->start && ra->size == ra->async_size) {
                add_pages = get_next_ra_size(ra, max_pages);
                if (ra->size + add_pages <= max_pages) {
                        ra->async_size = add_pages;
                        ra->size += add_pages;
                } else {
                        ra->size = max_pages;
                        ra->async_size = max_pages >> 1;
                }
        }

        ractl->_index = ra->start;
        page_cache_ra_order(ractl, ra, order);
}

/*
 * Synchronous readahead entry point: called with the number of pages
 * (@req_count) the caller wants starting at the index held in @ractl.
 */
void page_cache_sync_ra(struct readahead_control *ractl,
                unsigned long req_count)
{
        /*
         * Even if readahead is disabled, issue this request as readahead
         * as we'll need it to satisfy the requested range. The forced
         * readahead will do the right thing and limit the read to just the
         * requested range, which is a single page for this case.  Without
         * a file there is nothing to do.
         */
        if (!ractl->ra->ra_pages || blk_cgroup_congested()) {
                if (ractl->file)
                        force_page_cache_ra(ractl, 1);
                return;
        }

        /* be dumb for files opened with FMODE_RANDOM */
        if (ractl->file && (ractl->file->f_mode & FMODE_RANDOM))
                force_page_cache_ra(ractl, req_count);
        else
                ondemand_readahead(ractl, NULL, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);

/*
 * Asynchronous readahead entry point, called for a @folio carrying the
 * readahead flag.  Clears the flag and, unless the block cgroup is
 * congested, runs the on-demand algorithm for @req_count pages.
 */
void page_cache_async_ra(struct readahead_control *ractl,
                struct folio *folio, unsigned long req_count)
{
        /*
         * Nothing to do when readahead is disabled.  A folio under
         * writeback is skipped too, because the same bit is used for
         * PG_readahead and PG_reclaim.
         */
        if (!ractl->ra->ra_pages || folio_test_writeback(folio))
                return;

        folio_clear_readahead(folio);

        if (!blk_cgroup_congested())
                ondemand_readahead(ractl, folio, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);

/*
 * Kernel-side implementation of readahead(): validates @fd and forwards the
 * request as POSIX_FADV_WILLNEED advice.
 *
 * Returns -EBADF if @fd is not an open file readable by the caller, -EINVAL
 * if the file cannot do readahead, and otherwise whatever vfs_fadvise()
 * returns.
 */
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
        struct fd f = fdget(fd);
        ssize_t ret;

        if (!f.file || !(f.file->f_mode & FMODE_READ)) {
                ret = -EBADF;
        } else if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
                   (!S_ISREG(file_inode(f.file)->i_mode) &&
                    !S_ISBLK(file_inode(f.file)->i_mode))) {
                /*
                 * The readahead() syscall is intended to run only on files
                 * that can execute readahead. If readahead is not possible
                 * on this file, then we must return -EINVAL.
                 */
                ret = -EINVAL;
        } else {
                ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
        }

        fdput(f);
        return ret;
}

/* readahead system call: forwards its arguments to ksys_readahead(). */
SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
        return ksys_readahead(fd, offset, count);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_READAHEAD)
/*
 * Compat variant of the readahead syscall, built only when both
 * CONFIG_COMPAT and __ARCH_WANT_COMPAT_READAHEAD are defined.  The offset
 * is declared through compat_arg_u64_dual() and passed through
 * compat_arg_u64_glue() before reaching ksys_readahead().
 * NOTE(review): presumably the 64-bit offset arrives as two 32-bit halves
 * that the glue macro recombines -- confirm against the macro definitions.
 */
COMPAT_SYSCALL_DEFINE4(readahead, int, fd, compat_arg_u64_dual(offset), size_t, count)
{
        return ksys_readahead(fd, compat_arg_u64_glue(offset), count);
}
#endif

/**
 * readahead_expand - Expand a readahead request
 * @ractl: The request to be expanded
 * @new_start: The revised start
 * @new_len: The revised size of the request
 *
 * Attempt to expand a readahead request outwards from the current size to the
 * specified size by inserting locked pages before and after the current window
 * to increase the size to the new window.  This may involve the insertion of
 * THPs, in which case the window may get expanded even beyond what was
 * requested.
 *
 * The algorithm will stop if it encounters a conflicting page already in the
 * pagecache and leave a smaller expansion than requested.
 *
 * The caller must check for this by examining the revised @ractl object for a
 * different expansion than was requested.
 */
void readahead_expand(struct readahead_control *ractl,
                      loff_t new_start, size_t new_len)
{
        struct address_space *mapping = ractl->mapping;
        struct file_ra_state *ra = ractl->ra;
        pgoff_t new_index, new_nr_pages;
        gfp_t gfp_mask = readahead_gfp_mask(mapping);

        new_index = new_start / PAGE_SIZE;

        /* Expand the leading edge downwards */
        while (ractl->_index > new_index) {
                unsigned long index = ractl->_index - 1;
                struct folio *folio = xa_load(&mapping->i_pages, index);

                if (folio && !xa_is_value(folio))
                        return; /* Folio apparently present */

                folio = filemap_alloc_folio(gfp_mask, 0);
                if (!folio)
                        return;
                if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
                        folio_put(folio);
                        return;
                }
                if (unlikely(folio_test_workingset(folio)) &&
                                !ractl->_workingset) {
                        ractl->_workingset = true;
                        psi_memstall_enter(&ractl->_pflags);
                }
                ractl->_nr_pages++;
                ractl->_index = folio->index;
        }

        new_len += new_start - readahead_pos(ractl);
        new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE);

        /* Expand the trailing edge upwards */
        while (ractl->_nr_pages < new_nr_pages) {
                unsigned long index = ractl->_index + ractl->_nr_pages;
                struct folio *folio = xa_load(&mapping->i_pages, index);

                if (folio && !xa_is_value(folio))
                        return; /* Folio apparently present */

                folio = filemap_alloc_folio(gfp_mask, 0);
                if (!folio)
                        return;
                if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
                        folio_put(folio);
                        return;
                }
                if (unlikely(folio_test_workingset(folio)) &&
                                !ractl->_workingset) {
                        ractl->_workingset = true;
                        psi_memstall_enter(&ractl->_pflags);
                }
                ractl->_nr_pages++;
                if (ra) {
                        ra->size++;
                        ra->async_size++;
                }
        }
}
EXPORT_SYMBOL(readahead_expand);



































    2 








    1 
















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_UTSNAME_H
#define _LINUX_UTSNAME_H


#include <linux/sched.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/err.h>
#include <uapi/linux/utsname.h>

/*
 * Selector passed to uts_proc_notify().
 * NOTE(review): each value presumably identifies one utsname field exposed
 * through sysctl -- confirm against the uts_proc_notify() implementation.
 */
enum uts_proc {
        UTS_PROC_ARCH,
        UTS_PROC_OSTYPE,
        UTS_PROC_OSRELEASE,
        UTS_PROC_VERSION,
        UTS_PROC_HOSTNAME,
        UTS_PROC_DOMAINNAME,
};

struct user_namespace;
extern struct user_namespace init_user_ns;

/*
 * One UTS namespace instance.
 *
 * @name is the structure handed out by utsname() and init_utsname().
 * @ns.count is the reference count that get_uts_ns() and put_uts_ns()
 * operate on when CONFIG_UTS_NS is enabled.
 */
struct uts_namespace {
        struct new_utsname name;
        /* NOTE(review): presumably the owning user namespace -- confirm */
        struct user_namespace *user_ns;
        struct ucounts *ucounts;
        struct ns_common ns;
} __randomize_layout;
extern struct uts_namespace init_uts_ns;

#ifdef CONFIG_UTS_NS
/* Take a reference on @ns by incrementing ns->ns.count. */
static inline void get_uts_ns(struct uts_namespace *ns)
{
        refcount_inc(&ns->ns.count);
}

extern struct uts_namespace *copy_utsname(unsigned long flags,
        struct user_namespace *user_ns, struct uts_namespace *old_ns);
extern void free_uts_ns(struct uts_namespace *ns);

/*
 * Drop a reference on @ns.  When refcount_dec_and_test() reports that this
 * was the last reference, the namespace is handed to free_uts_ns().
 */
static inline void put_uts_ns(struct uts_namespace *ns)
{
        if (!refcount_dec_and_test(&ns->ns.count))
                return;

        free_uts_ns(ns);
}

void uts_ns_init(void);
#else
/* CONFIG_UTS_NS disabled: taking a reference is a no-op. */
static inline void get_uts_ns(struct uts_namespace *ns)
{
}

/* CONFIG_UTS_NS disabled: dropping a reference is a no-op. */
static inline void put_uts_ns(struct uts_namespace *ns)
{
}

/*
 * CONFIG_UTS_NS disabled: a request carrying CLONE_NEWUTS in @flags fails
 * with ERR_PTR(-EINVAL); any other request gets @old_ns back unchanged.
 * @user_ns is unused in this configuration.
 */
static inline struct uts_namespace *copy_utsname(unsigned long flags,
        struct user_namespace *user_ns, struct uts_namespace *old_ns)
{
        return (flags & CLONE_NEWUTS) ? ERR_PTR(-EINVAL) : old_ns;
}

/* CONFIG_UTS_NS disabled: nothing to initialise. */
static inline void uts_ns_init(void)
{
}
#endif

#ifdef CONFIG_PROC_SYSCTL
extern void uts_proc_notify(enum uts_proc proc);
#else
/* CONFIG_PROC_SYSCTL disabled: notification is a no-op. */
static inline void uts_proc_notify(enum uts_proc proc)
{
}
#endif

static inline struct new_utsname *utsname(void)
{
        return &current->nsproxy->uts_ns->name;
}

/* Return the new_utsname of init_uts_ns, independent of the current task. */
static inline struct new_utsname *init_utsname(void)
{
        return &init_uts_ns.name;
}

extern struct rw_semaphore uts_sem;

#endif /* _LINUX_UTSNAME_H */





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 









    3 













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
#include <linux/blk-cgroup.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>
#include <linux/completion.h>
#include <linux/suspend.h>
#include <linux/zswap.h>
#include <linux/plist.h>

#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
#include "internal.h"
#include "swap.h"

static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
                                 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);

static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
 * Some modules use swappable objects and may try to swap them out under
 * memory pressure (via the shrinker). Before doing so, they may wish to
 * check to see if any swap space is available.
 */
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority = -1;
unsigned long swapfile_maximum_size;
#ifdef CONFIG_MIGRATION
bool swap_migration_ad_supported;
#endif        /* CONFIG_MIGRATION */

static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/*
 * all active swap_info_structs
 * protected with swap_lock, and ordered by priority.
 */
static PLIST_HEAD(swap_active_head);

/*
 * all available (active, not full) swap_info_structs
 * protected with swap_avail_lock, ordered by priority.
 * This is used by folio_alloc_swap() instead of swap_active_head
 * because swap_active_head includes all swap_info_structs,
 * but folio_alloc_swap() doesn't need to look at full ones.
 * This uses its own lock instead of swap_lock because when a
 * swap_info_struct changes between not-full/full, it needs to
 * add/remove itself to/from this list, but the swap_info_struct->lock
 * is held and the locking order requires swap_lock to be taken
 * before any swap_info_struct->lock.
 */
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);

static struct swap_info_struct *swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);

atomic_t nr_rotate_swap = ATOMIC_INIT(0);

/*
 * Look up the swap_info_struct registered for swap type @type.
 *
 * Returns NULL when @type lies outside [0, MAX_SWAPFILES); otherwise returns
 * whatever pointer is currently stored in swap_info[type], read once with
 * READ_ONCE().
 */
static struct swap_info_struct *swap_type_to_swap_info(int type)
{
        /*
         * @type is a signed int, so check the lower bound as well: a
         * negative value would otherwise index before swap_info[].
         */
        if (type < 0 || type >= MAX_SWAPFILES)
                return NULL;

        return READ_ONCE(swap_info[type]); /* rcu_dereference() */
}

/* Return @ent with the SWAP_HAS_CACHE bit(s) masked off. */
static inline unsigned char swap_count(unsigned char ent)
{
        return ent & ~SWAP_HAS_CACHE;        /* may include COUNT_CONTINUED flag */
}

/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY                0x1
/*
 * Reclaim the swap entry if there are no more mappings of the
 * corresponding page
 */
#define TTRS_UNMAPPED                0x2
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL                0x4

/*
 * Try to free the swap of the swap-cache folio backing slot @offset of @si,
 * subject to the TTRS_* bits in @flags.
 *
 * Return: 0 if no folio is associated with the swap entry; the number of
 * pages in the folio if it was reclaimed; the negated number of pages in
 * the folio if it was found but not reclaimed.
 */
static int __try_to_reclaim_swap(struct swap_info_struct *si,
                                 unsigned long offset, unsigned long flags)
{
        swp_entry_t entry = swp_entry(si->type, offset);
        struct folio *folio;
        int freed = 0;
        long nr;

        folio = filemap_get_folio(swap_address_space(entry), offset);
        if (IS_ERR(folio))
                return 0;

        /*
         * This can be reached from scan_swap_map_slots() while vmscan.c is
         * reclaiming folios, i.e. with a folio lock already held, so only
         * trylock is safe here; blocking could deadlock.  This is a special
         * case: ordinary callers should take an explicit folio_lock() and
         * call folio_free_swap().
         */
        if (!folio_trylock(folio))
                goto out;

        if ((flags & TTRS_ANYWAY) ||
            ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
            ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)))
                freed = folio_free_swap(folio);

        folio_unlock(folio);
out:
        /* Read the size while our reference still pins the folio. */
        nr = folio_nr_pages(folio);
        folio_put(folio);
        return freed ? nr : -nr;
}

/*
 * Return the swap_extent embedding the first node of @sis's extent rb-tree.
 * The result of rb_first() is not checked, so the tree must not be empty.
 */
static inline struct swap_extent *first_se(struct swap_info_struct *sis)
{
        return rb_entry(rb_first(&sis->swap_extent_root),
                        struct swap_extent, rb_node);
}

/*
 * Return the swap_extent that follows @se in rb-tree order, or NULL when
 * @se is the last one.
 */
static inline struct swap_extent *next_se(struct swap_extent *se)
{
        struct rb_node *succ = rb_next(&se->rb_node);

        if (!succ)
                return NULL;

        return rb_entry(succ, struct swap_extent, rb_node);
}

/*
 * swapon tell device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static int discard_swap(struct swap_info_struct *si)
{
        struct swap_extent *se;
        sector_t start_block;
        sector_t nr_blocks;
        int err = 0;

        /* Do not discard the swap header page! */
        se = first_se(si);
        start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
        nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
        if (nr_blocks) {
                err = blkdev_issue_discard(si->bdev, start_block,
                                nr_blocks, GFP_KERNEL);
                if (err)
                        return err;
                cond_resched();
        }

        for (se = next_se(se); se; se = next_se(se)) {
                start_block = se->start_block << (PAGE_SHIFT - 9);
                nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

                err = blkdev_issue_discard(si->bdev, start_block,
                                nr_blocks, GFP_KERNEL);
                if (err)
                        break;

                cond_resched();
        }
        return err;                /* That will often be -EOPNOTSUPP */
}

/*
 * Find the swap_extent of @sis whose page range
 * [start_page, start_page + nr_pages) contains @offset.
 *
 * The extent is required to exist: falling out of the tree walk is a BUG().
 */
static struct swap_extent *
offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
{
        struct rb_node *node = sis->swap_extent_root.rb_node;

        while (node) {
                struct swap_extent *se;

                se = rb_entry(node, struct swap_extent, rb_node);
                if (offset < se->start_page) {
                        node = node->rb_left;
                        continue;
                }
                if (offset >= se->start_page + se->nr_pages) {
                        node = node->rb_right;
                        continue;
                }
                return se;
        }
        /* It *must* be present */
        BUG();
}

/*
 * Translate the swap entry stored in @folio->swap into a sector number:
 * the block backing the entry's offset within its swap extent, shifted by
 * (PAGE_SHIFT - 9).
 */
sector_t swap_folio_sector(struct folio *folio)
{
        struct swap_info_struct *sis = swp_swap_info(folio->swap);
        pgoff_t offset = swp_offset(folio->swap);
        struct swap_extent *se = offset_to_swap_extent(sis, offset);
        sector_t block = se->start_block + (offset - se->start_page);

        return block << (PAGE_SHIFT - 9);
}

/*
 * On swap allocation, tell the device that a cluster of swap can now be
 * discarded, to allow the swap device to optimize its wear-levelling.
 *
 * Discards @nr_pages pages starting at swap offset @start_page, walking
 * from the extent that contains @start_page through the following extents.
 * The walk stops early at the first blkdev_issue_discard() failure.
 */
static void discard_swap_cluster(struct swap_info_struct *si,
                                 pgoff_t start_page, pgoff_t nr_pages)
{
        struct swap_extent *se;

        for (se = offset_to_swap_extent(si, start_page); nr_pages;
             se = next_se(se)) {
                pgoff_t skip = start_page - se->start_page;
                sector_t first = se->start_block + skip;
                sector_t count = se->nr_pages - skip;

                /* Clamp to what is still wanted from this extent. */
                if (count > nr_pages)
                        count = nr_pages;
                start_page += count;
                nr_pages -= count;

                /* Convert from pages to 512-byte sectors. */
                first <<= PAGE_SHIFT - 9;
                count <<= PAGE_SHIFT - 9;
                if (blkdev_issue_discard(si->bdev, first, count, GFP_NOIO))
                        break;
        }
}

/*
 * SWAPFILE_CLUSTER is the number of swap slots per cluster: swap offsets are
 * divided by it to obtain a cluster index.
 */
#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER        HPAGE_PMD_NR

#define swap_entry_order(order)        (order)
#else
#define SWAPFILE_CLUSTER        256

/*
 * Define swap_entry_order() as a constant to let the compiler optimize
 * out some code if !CONFIG_THP_SWAP
 */
#define swap_entry_order(order)        0
#endif
/* Initial value of the latency_ration budget used by scan_swap_map_slots() */
#define LATENCY_LIMIT                256

/* Overwrite the flags of @info with @flag; the data field is left untouched. */
static inline void cluster_set_flag(struct swap_cluster_info *info,
        unsigned int flag)
{
        info->flags = flag;
}

/*
 * Read the data field of @info as a usage count.  The same field is read
 * as a "next" index by cluster_next().
 */
static inline unsigned int cluster_count(struct swap_cluster_info *info)
{
        return info->data;
}

/* Store usage count @c in the data field of @info; flags are left untouched. */
static inline void cluster_set_count(struct swap_cluster_info *info,
                                     unsigned int c)
{
        info->data = c;
}

/* Set both the flags (@f) and the usage count (@c) of @info. */
static inline void cluster_set_count_flag(struct swap_cluster_info *info,
                                         unsigned int c, unsigned int f)
{
        cluster_set_flag(info, f);
        cluster_set_count(info, c);
}

/*
 * Read the data field of @info as the index of the next cluster in a list.
 * The same field is read as a usage count by cluster_count().
 */
static inline unsigned int cluster_next(struct swap_cluster_info *info)
{
        return info->data;
}

/* Store next-cluster index @n in the data field of @info; flags are left untouched. */
static inline void cluster_set_next(struct swap_cluster_info *info,
                                    unsigned int n)
{
        info->data = n;
}

/* Set both the flags (@f) and the next-cluster index (@n) of @info. */
static inline void cluster_set_next_flag(struct swap_cluster_info *info,
                                         unsigned int n, unsigned int f)
{
        cluster_set_flag(info, f);
        cluster_set_next(info, n);
}

/* True when @info has CLUSTER_FLAG_FREE set. */
static inline bool cluster_is_free(struct swap_cluster_info *info)
{
        return (info->flags & CLUSTER_FLAG_FREE) != 0;
}

/* True when @info has CLUSTER_FLAG_NEXT_NULL set, i.e. it links to nothing. */
static inline bool cluster_is_null(struct swap_cluster_info *info)
{
        return (info->flags & CLUSTER_FLAG_NEXT_NULL) != 0;
}

/* Mark @info as linking to nothing: flags = CLUSTER_FLAG_NEXT_NULL, data = 0. */
static inline void cluster_set_null(struct swap_cluster_info *info)
{
        cluster_set_next_flag(info, 0, CLUSTER_FLAG_NEXT_NULL);
}

static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
                                                     unsigned long offset)
{
        struct swap_cluster_info *ci;

        ci = si->cluster_info;
        if (ci) {
                ci += offset / SWAPFILE_CLUSTER;
                spin_lock(&ci->lock);
        }
        return ci;
}

/* Release the lock taken by lock_cluster(); a NULL @ci is a no-op. */
static inline void unlock_cluster(struct swap_cluster_info *ci)
{
        if (!ci)
                return;

        spin_unlock(&ci->lock);
}

/*
 * Determine the locking method in use for this device.  Return
 * swap_cluster_info if SSD-style cluster-based locking is in place.
 */
static inline struct swap_cluster_info *lock_cluster_or_swap_info(
                struct swap_info_struct *si, unsigned long offset)
{
        struct swap_cluster_info *ci;

        /* Try to use fine-grained SSD-style locking if available: */
        ci = lock_cluster(si, offset);
        /* Otherwise, fall back to traditional, coarse locking: */
        if (!ci)
                spin_lock(&si->lock);

        return ci;
}

/*
 * Undo lock_cluster_or_swap_info(): drop the cluster lock when @ci is
 * non-NULL, otherwise drop si->lock.
 */
static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
                                               struct swap_cluster_info *ci)
{
        if (!ci) {
                spin_unlock(&si->lock);
                return;
        }

        unlock_cluster(ci);
}

/* A cluster list is empty when its head carries CLUSTER_FLAG_NEXT_NULL. */
static inline bool cluster_list_empty(struct swap_cluster_list *list)
{
        return cluster_is_null(&list->head);
}

/*
 * Return the cluster index stored in the head of @list.  The list is not
 * checked for emptiness here.
 */
static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
{
        return cluster_next(&list->head);
}

/* Reset @list to the empty state by nulling both its head and its tail. */
static void cluster_list_init(struct swap_cluster_list *list)
{
        cluster_set_null(&list->head);
        cluster_set_null(&list->tail);
}

/*
 * Append cluster @idx to the tail of @list.
 *
 * @ci is the base of the cluster_info array; the list is threaded through
 * the "next" fields of its elements.
 */
static void cluster_list_add_tail(struct swap_cluster_list *list,
                                  struct swap_cluster_info *ci,
                                  unsigned int idx)
{
        if (cluster_list_empty(list)) {
                /* First element: the head points at it too. */
                cluster_set_next_flag(&list->head, idx, 0);
        } else {
                struct swap_cluster_info *old_tail;

                old_tail = ci + cluster_next(&list->tail);
                /*
                 * Nested cluster lock, but both cluster locks are
                 * only acquired when we held swap_info_struct->lock
                 */
                spin_lock_nested(&old_tail->lock, SINGLE_DEPTH_NESTING);
                cluster_set_next(old_tail, idx);
                spin_unlock(&old_tail->lock);
        }

        /* In both cases @idx becomes the new tail. */
        cluster_set_next_flag(&list->tail, idx, 0);
}

/*
 * Remove the first cluster from @list and return its index.
 *
 * @ci is the base of the cluster_info array.  If the removed cluster was
 * also the tail, the list becomes empty; otherwise the head advances to
 * the removed cluster's "next" index.
 */
static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
                                           struct swap_cluster_info *ci)
{
        unsigned int first = cluster_next(&list->head);

        if (cluster_next(&list->tail) != first) {
                cluster_set_next_flag(&list->head,
                                      cluster_next(&ci[first]), 0);
                return first;
        }

        /* Head and tail were the same cluster: the list is now empty. */
        cluster_set_null(&list->head);
        cluster_set_null(&list->tail);
        return first;
}

/*
 * Add cluster @idx of @si to the discard list and schedule the discard
 * work that will process it.
 */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
                unsigned int idx)
{
        unsigned long first_slot = idx * SWAPFILE_CLUSTER;

        /*
         * If scan_swap_map_slots() can't find a free cluster, it will check
         * si->swap_map directly. To make sure the discarding cluster isn't
         * taken by scan_swap_map_slots(), mark the swap entries bad
         * (occupied). They will be cleared after the discard.
         */
        memset(si->swap_map + first_slot, SWAP_MAP_BAD, SWAPFILE_CLUSTER);

        cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);

        schedule_work(&si->discard_work);
}

/*
 * Flag cluster @idx of @si as CLUSTER_FLAG_FREE and append it to the free
 * cluster list.
 */
static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
{
        struct swap_cluster_info *table = si->cluster_info;

        cluster_set_flag(&table[idx], CLUSTER_FLAG_FREE);
        cluster_list_add_tail(&si->free_clusters, table, idx);
}

/*
 * Actually perform the discards. Each cluster on the discard list is
 * discarded and then added to the free cluster list, with its swap_map
 * entries cleared. The caller should hold si->lock; it is dropped around
 * each discard and held again on return.
 */
static void swap_do_scheduled_discard(struct swap_info_struct *si)
{
        struct swap_cluster_info *table = si->cluster_info;

        for (;;) {
                struct swap_cluster_info *ci;
                unsigned long first_slot;
                unsigned int idx;

                if (cluster_list_empty(&si->discard_clusters))
                        break;

                idx = cluster_list_del_first(&si->discard_clusters, table);
                first_slot = idx * SWAPFILE_CLUSTER;

                /* The discard itself runs without si->lock. */
                spin_unlock(&si->lock);
                discard_swap_cluster(si, first_slot, SWAPFILE_CLUSTER);
                spin_lock(&si->lock);

                ci = lock_cluster(si, first_slot);
                __free_cluster(si, idx);
                memset(si->swap_map + first_slot, 0, SWAPFILE_CLUSTER);
                unlock_cluster(ci);
        }
}

/*
 * Work handler for swap_info_struct::discard_work: run the scheduled
 * discards of the owning swap_info_struct under its si->lock.
 */
static void swap_discard_work(struct work_struct *work)
{
        struct swap_info_struct *si =
                container_of(work, struct swap_info_struct, discard_work);

        spin_lock(&si->lock);
        swap_do_scheduled_discard(si);
        spin_unlock(&si->lock);
}

/*
 * percpu_ref callback for swap_info_struct::users: signal the completion
 * embedded in the owning swap_info_struct.
 */
static void swap_users_ref_free(struct percpu_ref *ref)
{
        struct swap_info_struct *si =
                container_of(ref, struct swap_info_struct, users);

        complete(&si->comp);
}

/*
 * Take cluster @idx off the free cluster list and reset its count and
 * flags to 0.  @idx must be the first entry of the free list.
 */
static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
{
        struct swap_cluster_info *table = si->cluster_info;

        VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
        cluster_list_del_first(&si->free_clusters, table);
        cluster_set_count_flag(&table[idx], 0, 0);
}

/*
 * Release cluster @idx of @si, whose usage count must already be 0.
 *
 * If the swap is discardable (both SWP_WRITEOK and SWP_PAGE_DISCARD set),
 * the cluster is queued for discard instead of being freed immediately;
 * it will be freed after the discard.
 */
static void free_cluster(struct swap_info_struct *si, unsigned long idx)
{
        const unsigned long discard_mask = SWP_WRITEOK | SWP_PAGE_DISCARD;

        VM_BUG_ON(cluster_count(&si->cluster_info[idx]) != 0);

        if ((si->flags & discard_mask) != discard_mask) {
                __free_cluster(si, idx);
                return;
        }

        swap_cluster_schedule_discard(si, idx);
}

/*
 * The cluster corresponding to @page_nr will be used. If it is currently
 * free it is removed from the free cluster list, and its usage counter is
 * increased by @count. Does nothing when @cluster_info is NULL.
 */
static void add_cluster_info_page(struct swap_info_struct *p,
        struct swap_cluster_info *cluster_info, unsigned long page_nr,
        unsigned long count)
{
        unsigned long idx = page_nr / SWAPFILE_CLUSTER;
        struct swap_cluster_info *ci;

        if (!cluster_info)
                return;

        ci = &cluster_info[idx];
        if (cluster_is_free(ci))
                alloc_cluster(p, idx);

        /* A cluster can never hold more than SWAPFILE_CLUSTER pages. */
        VM_BUG_ON(cluster_count(ci) + count > SWAPFILE_CLUSTER);
        cluster_set_count(ci, cluster_count(ci) + count);
}

/*
 * The cluster corresponding to page_nr will be used. The cluster will be
 * removed from free cluster list and its usage counter will be increased by 1.
 * This is add_cluster_info_page() with a count of 1.
 */
static void inc_cluster_info_page(struct swap_info_struct *p,
        struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
        add_cluster_info_page(p, cluster_info, page_nr, 1);
}

/*
 * Drop one usage from the cluster corresponding to @page_nr. If the usage
 * counter becomes 0, no page in the cluster is in use any more, so the
 * cluster is handed to free_cluster(), which either discards it first or
 * adds it to the free cluster list. Does nothing when @cluster_info is NULL.
 */
static void dec_cluster_info_page(struct swap_info_struct *p,
        struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
        unsigned long idx = page_nr / SWAPFILE_CLUSTER;
        struct swap_cluster_info *ci;

        if (!cluster_info)
                return;

        ci = &cluster_info[idx];
        VM_BUG_ON(cluster_count(ci) == 0);
        cluster_set_count(ci, cluster_count(ci) - 1);

        if (cluster_count(ci) != 0)
                return;

        free_cluster(p, idx);
}

/*
 * scan_swap_map_slots() may land on a free cluster that sits in the middle
 * of the free cluster list. Using it would corrupt the list, so detect
 * that case here.
 *
 * Returns true when the cluster containing @offset is free but is not the
 * first entry of a non-empty free list. In that case this CPU's cached
 * next position for @order is also invalidated.
 */
static bool
scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
        unsigned long offset, int order)
{
        unsigned long idx = offset / SWAPFILE_CLUSTER;

        if (cluster_list_empty(&si->free_clusters))
                return false;
        if (idx == cluster_list_first(&si->free_clusters))
                return false;
        if (!cluster_is_free(&si->cluster_info[idx]))
                return false;

        this_cpu_ptr(si->percpu_cluster)->next[order] = SWAP_NEXT_INVALID;
        return true;
}

/*
 * Check whether @nr_pages consecutive entries of @swap_map, beginning at
 * index @start, are all zero. An empty range (@nr_pages == 0) counts as
 * empty.
 */
static inline bool swap_range_empty(char *swap_map, unsigned int start,
                                    unsigned int nr_pages)
{
        unsigned int pos = start;
        unsigned int left = nr_pages;

        while (left--) {
                if (swap_map[pos++])
                        return false;
        }

        return true;
}

/*
 * Try to get swap entries with specified order from current cpu's swap entry
 * pool (a cluster). This might involve allocating a new cluster for current CPU
 * too.
 *
 * On success, *offset and *scan_base are both set to the first slot of a
 * run of 1 << order slots whose swap_map entries were all zero when checked
 * under the cluster lock, this CPU's cached next position for @order is
 * advanced past the run, and true is returned.
 *
 * Returns false when this CPU has no cached position for @order and both
 * the free cluster list and the discard cluster list are empty.
 *
 * This may call swap_do_scheduled_discard(), which drops and retakes
 * si->lock, so si->lock is expected to be held by the caller.
 */
static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
        unsigned long *offset, unsigned long *scan_base, int order)
{
        unsigned int nr_pages = 1 << order;
        struct percpu_cluster *cluster;
        struct swap_cluster_info *ci;
        unsigned int tmp, max;

new_cluster:
        /* Re-read the per-cpu state on every retry. */
        cluster = this_cpu_ptr(si->percpu_cluster);
        tmp = cluster->next[order];
        if (tmp == SWAP_NEXT_INVALID) {
                if (!cluster_list_empty(&si->free_clusters)) {
                        /* Start at the first slot of the first free cluster. */
                        tmp = cluster_next(&si->free_clusters.head) *
                                        SWAPFILE_CLUSTER;
                } else if (!cluster_list_empty(&si->discard_clusters)) {
                        /*
                         * we don't have free cluster but have some clusters in
                         * discarding, do discard now and reclaim them, then
                         * reread cluster_next_cpu since we dropped si->lock
                         */
                        swap_do_scheduled_discard(si);
                        *scan_base = this_cpu_read(*si->cluster_next_cpu);
                        *offset = *scan_base;
                        goto new_cluster;
                } else
                        return false;
        }

        /*
         * Other CPUs can use our cluster if they can't find a free cluster,
         * check if there is still free entry in the cluster, maintaining
         * natural alignment.
         */
        /* max is the end of tmp's cluster, clamped to si->max. */
        max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER));
        if (tmp < max) {
                ci = lock_cluster(si, tmp);
                /* Step through the cluster in units of nr_pages slots. */
                while (tmp < max) {
                        if (swap_range_empty(si->swap_map, tmp, nr_pages))
                                break;
                        tmp += nr_pages;
                }
                unlock_cluster(ci);
        }
        if (tmp >= max) {
                /* Nothing usable left in this cluster: pick another one. */
                cluster->next[order] = SWAP_NEXT_INVALID;
                goto new_cluster;
        }
        *offset = tmp;
        *scan_base = tmp;
        tmp += nr_pages;
        /* Remember where to continue, unless the cluster is now exhausted. */
        cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID;
        return true;
}

static void __del_from_avail_list(struct swap_info_struct *p)
{
        int nid;

        assert_spin_locked(&p->lock);
        for_each_node(nid)
                plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
}

/*
 * Remove @p from every node's available list, taking swap_avail_lock
 * around __del_from_avail_list(). That helper asserts p->lock is held,
 * so the caller must already hold it.
 */
static void del_from_avail_list(struct swap_info_struct *p)
{
        spin_lock(&swap_avail_lock);
        __del_from_avail_list(p);
        spin_unlock(&swap_avail_lock);
}

/*
 * Account for @nr_entries slots starting at @offset having been allocated
 * from @si.
 *
 * lowest_bit/highest_bit are moved inwards when the range sits exactly at
 * one of them, and inuse_pages is increased. When the device becomes
 * completely full, lowest_bit/highest_bit are reset to si->max and 0 and
 * the device is taken off the available lists.
 */
static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
                             unsigned int nr_entries)
{
        unsigned int last = offset + nr_entries - 1;

        if (offset == si->lowest_bit)
                si->lowest_bit += nr_entries;
        if (last == si->highest_bit)
                WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
        WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);

        if (si->inuse_pages != si->pages)
                return;

        /* Every page is in use now: the device is full. */
        si->lowest_bit = si->max;
        si->highest_bit = 0;
        del_from_avail_list(si);
}

/* Add @p to every node's swap_avail_heads list, under swap_avail_lock. */
static void add_to_avail_list(struct swap_info_struct *p)
{
        int nid;

        spin_lock(&swap_avail_lock);
        for_each_node(nid)
                plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
        spin_unlock(&swap_avail_lock);
}

/*
 * Account for @nr_entries slots starting at @offset of @si having been
 * freed.
 *
 * lowest_bit/highest_bit are widened to include the range, and the device
 * is put back on the available lists if it was full (highest_bit was 0)
 * and is still SWP_WRITEOK. Each slot is then invalidated in the arch
 * layer and, for block devices, reported through the disk's
 * swap_slot_free_notify hook if there is one. Finally the shadow entries
 * of the range are cleared and the global and per-device counters updated.
 */
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
                            unsigned int nr_entries)
{
        const unsigned long first = offset;
        const unsigned long last = offset + nr_entries - 1;
        void (*notify)(struct block_device *, unsigned long) = NULL;
        unsigned long pos;

        if (first < si->lowest_bit)
                si->lowest_bit = first;
        if (last > si->highest_bit) {
                bool was_full = !si->highest_bit;

                WRITE_ONCE(si->highest_bit, last);
                if (was_full && (si->flags & SWP_WRITEOK))
                        add_to_avail_list(si);
        }

        /* Only block devices can have a slot-free notifier. */
        if (si->flags & SWP_BLKDEV)
                notify = si->bdev->bd_disk->fops->swap_slot_free_notify;

        for (pos = first; pos <= last; pos++) {
                arch_swap_invalidate_page(si->type, pos);
                if (notify)
                        notify(si->bdev, pos);
        }
        clear_shadow_from_swap_cache(si->type, first, last);

        /*
         * Make sure that try_to_unuse() observes si->inuse_pages reaching 0
         * only after the above cleanups are done.
         */
        smp_wmb();
        atomic_long_add(nr_entries, &nr_swap_pages);
        WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
}

static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
{
        unsigned long prev;

        if (!(si->flags & SWP_SOLIDSTATE)) {
                si->cluster_next = next;
                return;
        }

        prev = this_cpu_read(*si->cluster_next_cpu);
        /*
         * Cross the swap address space size aligned trunk, choose
         * another trunk randomly to avoid lock contention on swap
         * address space if possible.
         */
        if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
            (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
                /* No free swap slots available */
                if (si->highest_bit <= si->lowest_bit)
                        return;
                next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit);
                next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
                next = max_t(unsigned int, next, si->lowest_bit);
        }
        this_cpu_write(*si->cluster_next_cpu, next);
}

/*
 * Lockless check of whether the slot at @offset looks usable: either its
 * swap_map byte is zero, or swap is full and the byte is exactly
 * SWAP_HAS_CACHE.
 *
 * Returns true with si->lock taken when the slot looks usable, and false
 * with no lock taken otherwise. The check runs without the lock, so the
 * caller has to look at the slot again once it holds it.
 */
static bool swap_offset_available_and_locked(struct swap_info_struct *si,
                                             unsigned long offset)
{
        bool usable;

        usable = data_race(!si->swap_map[offset]) ||
                 (vm_swap_full() &&
                  READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE);
        if (!usable)
                return false;

        spin_lock(&si->lock);
        return true;
}

/*
 * Allocate up to @nr swap entries from @si, each covering 1 << @order
 * consecutive slots, and store them in @slots.
 *
 * @usage is the byte written into si->swap_map for every allocated slot.
 *
 * Returns the number of entries stored in @slots, which may be 0.
 *
 * The function is entered and left with si->lock held: it unlocks and
 * relocks si->lock internally without taking it first. While it runs,
 * SWP_SCANNING is added to si->flags; every exit after that point
 * subtracts it again.
 */
static int scan_swap_map_slots(struct swap_info_struct *si,
                               unsigned char usage, int nr,
                               swp_entry_t slots[], int order)
{
        struct swap_cluster_info *ci;
        unsigned long offset;
        unsigned long scan_base;
        unsigned long last_in_cluster = 0;
        int latency_ration = LATENCY_LIMIT;
        unsigned int nr_pages = 1 << order;
        int n_ret = 0;
        bool scanned_many = false;

        /*
         * We try to cluster swap pages by allocating them sequentially
         * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
         * way, however, we resort to first-free allocation, starting
         * a new cluster.  This prevents us from scattering swap pages
         * all over the entire swap partition, so that we reduce
         * overall disk seek times between swap pages.  -- sct
         * But we do now try to find an empty cluster.  -Andrea
         * And we let swap pages go all over an SSD partition.  Hugh
         */

        if (order > 0) {
                /*
                 * Should not even be attempting large allocations when huge
                 * page swap is disabled.  Warn and fail the allocation.
                 */
                if (!IS_ENABLED(CONFIG_THP_SWAP) ||
                    nr_pages > SWAPFILE_CLUSTER) {
                        VM_WARN_ON_ONCE(1);
                        return 0;
                }

                /*
                 * Swapfile is not block device or not using clusters so unable
                 * to allocate large entries.
                 */
                if (!(si->flags & SWP_BLKDEV) || !si->cluster_info)
                        return 0;
        }

        /* Undone by the matching subtraction at done: and no_page: */
        si->flags += SWP_SCANNING;
        /*
         * Use percpu scan base for SSD to reduce lock contention on
         * cluster and swap cache.  For HDD, sequential access is more
         * important.
         */
        if (si->flags & SWP_SOLIDSTATE)
                scan_base = this_cpu_read(*si->cluster_next_cpu);
        else
                scan_base = si->cluster_next;
        offset = scan_base;

        /* SSD algorithm */
        if (si->cluster_info) {
                if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) {
                        /* Large entries are only taken from clusters. */
                        if (order > 0)
                                goto no_page;
                        goto scan;
                }
        } else if (unlikely(!si->cluster_nr--)) {
                /* The current sequential run is used up: look for a new one. */
                if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
                        si->cluster_nr = SWAPFILE_CLUSTER - 1;
                        goto checks;
                }

                spin_unlock(&si->lock);

                /*
                 * If seek is expensive, start searching for new cluster from
                 * start of partition, to minimize the span of allocated swap.
                 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
                 * case, just handled by scan_swap_map_try_ssd_cluster() above.
                 */
                scan_base = offset = si->lowest_bit;
                last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

                /* Locate the first empty (unaligned) cluster */
                for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) {
                        if (si->swap_map[offset])
                                last_in_cluster = offset + SWAPFILE_CLUSTER;
                        else if (offset == last_in_cluster) {
                                /* Found SWAPFILE_CLUSTER free slots in a row. */
                                spin_lock(&si->lock);
                                offset -= SWAPFILE_CLUSTER - 1;
                                si->cluster_next = offset;
                                si->cluster_nr = SWAPFILE_CLUSTER - 1;
                                goto checks;
                        }
                        if (unlikely(--latency_ration < 0)) {
                                cond_resched();
                                latency_ration = LATENCY_LIMIT;
                        }
                }

                /* No empty run found: fall back to the original start. */
                offset = scan_base;
                spin_lock(&si->lock);
                si->cluster_nr = SWAPFILE_CLUSTER - 1;
        }

checks:
        /* Reached with si->lock held; validates the candidate at offset. */
        if (si->cluster_info) {
                while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) {
                /* take a break if we already got some slots */
                        if (n_ret)
                                goto done;
                        if (!scan_swap_map_try_ssd_cluster(si, &offset,
                                                        &scan_base, order)) {
                                if (order > 0)
                                        goto no_page;
                                goto scan;
                        }
                }
        }
        if (!(si->flags & SWP_WRITEOK))
                goto no_page;
        if (!si->highest_bit)
                goto no_page;
        /* Wrap around to the lowest candidate once past the highest one. */
        if (offset > si->highest_bit)
                scan_base = offset = si->lowest_bit;

        ci = lock_cluster(si, offset);
        /* reuse swap entry of cache-only swap if not busy. */
        if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
                int swap_was_freed;
                /* Both locks are dropped around the reclaim attempt. */
                unlock_cluster(ci);
                spin_unlock(&si->lock);
                swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
                spin_lock(&si->lock);
                /* entry was freed successfully, try to use this again */
                if (swap_was_freed > 0)
                        goto checks;
                goto scan; /* check next one */
        }

        if (si->swap_map[offset]) {
                /* Slot is in use: keep scanning, or settle for what we have. */
                unlock_cluster(ci);
                if (!n_ret)
                        goto scan;
                else
                        goto done;
        }
        /* Claim the slots and account for them. */
        memset(si->swap_map + offset, usage, nr_pages);
        add_cluster_info_page(si, si->cluster_info, offset, nr_pages);
        unlock_cluster(ci);

        swap_range_alloc(si, offset, nr_pages);
        slots[n_ret++] = swp_entry(si->type, offset);

        /* got enough slots or reach max slots? */
        if ((n_ret == nr) || (offset >= si->highest_bit))
                goto done;

        /* search for next available slot */

        /* time to take a break? */
        if (unlikely(--latency_ration < 0)) {
                if (n_ret)
                        goto done;
                spin_unlock(&si->lock);
                cond_resched();
                spin_lock(&si->lock);
                latency_ration = LATENCY_LIMIT;
        }

        /* try to get more slots in cluster */
        if (si->cluster_info) {
                if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order))
                        goto checks;
                if (order > 0)
                        goto done;
        } else if (si->cluster_nr && !si->swap_map[++offset]) {
                /* non-ssd case, still more slots in cluster? */
                --si->cluster_nr;
                goto checks;
        }

        /*
         * Even if there's no free clusters available (fragmented),
         * try to scan a little more quickly with lock held unless we
         * have scanned too many slots already.
         */
        if (!scanned_many) {
                unsigned long scan_limit;

                if (offset < scan_base)
                        scan_limit = scan_base;
                else
                        scan_limit = si->highest_bit;
                for (; offset <= scan_limit && --latency_ration > 0;
                     offset++) {
                        if (!si->swap_map[offset])
                                goto checks;
                }
        }

done:
        /* Only order-0 allocations update the next scan position. */
        if (order == 0)
                set_cluster_next(si, offset + 1);
        si->flags -= SWP_SCANNING;
        return n_ret;

scan:
        /*
         * Slow path: scan without si->lock, first from offset + 1 up to
         * highest_bit, then from lowest_bit up to scan_base.
         * swap_offset_available_and_locked() retakes si->lock when it
         * reports a hit.
         */
        VM_WARN_ON(order > 0);
        spin_unlock(&si->lock);
        while (++offset <= READ_ONCE(si->highest_bit)) {
                if (unlikely(--latency_ration < 0)) {
                        cond_resched();
                        latency_ration = LATENCY_LIMIT;
                        scanned_many = true;
                }
                if (swap_offset_available_and_locked(si, offset))
                        goto checks;
        }
        offset = si->lowest_bit;
        while (offset < scan_base) {
                if (unlikely(--latency_ration < 0)) {
                        cond_resched();
                        latency_ration = LATENCY_LIMIT;
                        scanned_many = true;
                }
                if (swap_offset_available_and_locked(si, offset))
                        goto checks;
                offset++;
        }
        /* Nothing found: retake the lock before leaving. */
        spin_lock(&si->lock);

no_page:
        si->flags -= SWP_SCANNING;
        return n_ret;
}

/*
 * Free all SWAPFILE_CLUSTER slots of cluster @idx of @si at once: clear
 * their swap_map bytes, reset the cluster's count and flags, hand the
 * cluster to free_cluster() and then do the range-free accounting.
 */
static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
{
        const unsigned long first_slot = idx * SWAPFILE_CLUSTER;
        struct swap_cluster_info *ci = lock_cluster(si, first_slot);

        memset(si->swap_map + first_slot, 0, SWAPFILE_CLUSTER);
        cluster_set_count_flag(ci, 0, 0);
        free_cluster(si, idx);
        unlock_cluster(ci);

        /* Accounting is done after the cluster lock has been dropped. */
        swap_range_free(si, first_slot, SWAPFILE_CLUSTER);
}

/*
 * Try to allocate up to @n_goal swap entries of order @entry_order (after
 * filtering through swap_entry_order()) into @swp_entries[].
 *
 * The request is clamped to SWAP_BATCH and to what nr_swap_pages reports as
 * available, and that amount is subtracted from nr_swap_pages up front;
 * whatever was reserved but not obtained is added back at check_out.
 *
 * Devices are tried in the order of the current NUMA node's
 * swap_avail_heads plist.  Returns the value of the last
 * scan_swap_map_slots() call (presumably the number of entries stored in
 * @swp_entries -- confirm against scan_swap_map_slots()), or 0 when nothing
 * is available.
 */
int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
{
        int order = swap_entry_order(entry_order);
        unsigned long size = 1 << order;
        struct swap_info_struct *si, *next;
        long avail_pgs;
        int n_ret = 0;
        int node;

        spin_lock(&swap_avail_lock);

        /* Number of order-sized units that the global counter can cover. */
        avail_pgs = atomic_long_read(&nr_swap_pages) / size;
        if (avail_pgs <= 0) {
                spin_unlock(&swap_avail_lock);
                goto noswap;
        }

        n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);

        /* Reserve optimistically; the unused part is returned at check_out. */
        atomic_long_sub(n_goal * size, &nr_swap_pages);

start_over:
        node = numa_node_id();
        plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
                /* requeue si to after same-priority siblings */
                plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
                /* Switch from the list lock to the per-device lock. */
                spin_unlock(&swap_avail_lock);
                spin_lock(&si->lock);
                if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
                        /*
                         * Device is unusable.  Retake swap_avail_lock (nextsi
                         * expects it held) and, if si is still on the list,
                         * warn and remove it.
                         */
                        spin_lock(&swap_avail_lock);
                        if (plist_node_empty(&si->avail_lists[node])) {
                                spin_unlock(&si->lock);
                                goto nextsi;
                        }
                        WARN(!si->highest_bit,
                             "swap_info %d in list but !highest_bit\n",
                             si->type);
                        WARN(!(si->flags & SWP_WRITEOK),
                             "swap_info %d in list but !SWP_WRITEOK\n",
                             si->type);
                        __del_from_avail_list(si);
                        spin_unlock(&si->lock);
                        goto nextsi;
                }
                n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
                                            n_goal, swp_entries, order);
                spin_unlock(&si->lock);
                /*
                 * Stop on success.  For size > 1 stop even on failure: only
                 * the first usable device is tried for large entries.
                 */
                if (n_ret || size > 1)
                        goto check_out;
                cond_resched();

                spin_lock(&swap_avail_lock);
nextsi:
                /*
                 * if we got here, it's likely that si was almost full before,
                 * and since scan_swap_map_slots() can drop the si->lock,
                 * multiple callers probably all tried to get a page from the
                 * same si and it filled up before we could get one; or, the si
                 * filled up between us dropping swap_avail_lock and taking
                 * si->lock. Since we dropped the swap_avail_lock, the
                 * swap_avail_head list may have been modified; so if next is
                 * still in the swap_avail_head list then try it, otherwise
                 * start over if we have not gotten any slots.
                 */
                if (plist_node_empty(&next->avail_lists[node]))
                        goto start_over;
        }

        spin_unlock(&swap_avail_lock);

check_out:
        /* Give back the part of the up-front reservation that went unused. */
        if (n_ret < n_goal)
                atomic_long_add((long)(n_goal - n_ret) * size,
                                &nr_swap_pages);
noswap:
        return n_ret;
}

/*
 * Map @entry to its swap_info_struct and sanity-check it.
 *
 * A zero entry yields NULL silently.  Otherwise the checks are, in order:
 * a swap_info exists, it has SWP_USED set, the offset is below ->max, and
 * the swap_map byte for the offset is non-zero.  The first failing check is
 * logged with pr_err() and NULL is returned.  This function itself takes no
 * lock; the flag and swap_map reads are annotated with data_race().
 */
static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
{
        struct swap_info_struct *si;
        const char *reason;

        if (!entry.val)
                return NULL;

        si = swp_swap_info(entry);
        if (!si)
                reason = Bad_file;
        else if (data_race(!(si->flags & SWP_USED)))
                reason = Unused_file;
        else if (swp_offset(entry) >= si->max)
                reason = Bad_offset;
        else if (data_race(!si->swap_map[swp_offset(entry)]))
                reason = Unused_offset;
        else
                return si;

        pr_err("%s: %s%08lx\n", __func__, reason, entry.val);
        return NULL;
}

/*
 * Validate @entry and move the held device lock from @q to the entry's
 * device.  When the device is the same as @q nothing is locked or unlocked.
 * Otherwise @q's lock is released (if @q is non-NULL) and the new device's
 * lock is taken (if the lookup succeeded).  Returns whatever
 * _swap_info_get() returned, which may be NULL.
 */
static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
                                        struct swap_info_struct *q)
{
        struct swap_info_struct *cur = _swap_info_get(entry);

        if (cur == q)
                return cur;

        if (q)
                spin_unlock(&q->lock);
        if (cur)
                spin_lock(&cur->lock);

        return cur;
}

/*
 * Drop one reference of kind @usage from the swap_map byte at @offset and
 * return the resulting value (0 means neither a count nor the cache flag
 * remains).
 *
 * @usage == SWAP_HAS_CACHE clears the cache flag; any other value acts on
 * the count part, with special cases for SWAP_MAP_SHMEM and for continued
 * counts.  When the result is 0 the byte is stored as SWAP_HAS_CACHE rather
 * than 0, while 0 is still what is returned.
 *
 * NOTE(review): nothing here takes or asserts a lock; the callers visible
 * in this file hold lock_cluster_or_swap_info() around the call.
 */
static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
                                              unsigned long offset,
                                              unsigned char usage)
{
        unsigned char count;
        unsigned char has_cache;

        count = p->swap_map[offset];

        /* Split the byte into the cache flag and the remaining count bits. */
        has_cache = count & SWAP_HAS_CACHE;
        count &= ~SWAP_HAS_CACHE;

        if (usage == SWAP_HAS_CACHE) {
                VM_BUG_ON(!has_cache);
                has_cache = 0;
        } else if (count == SWAP_MAP_SHMEM) {
                /*
                 * Or we could insist on shmem.c using a special
                 * swap_shmem_free() and free_shmem_swap_and_cache()...
                 */
                count = 0;
        } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
                if (count == COUNT_CONTINUED) {
                        /*
                         * In-map count is zero with the continuation flag
                         * set.  swap_count_continued() decides whether the
                         * flag stays (presumably it takes one from the
                         * continuation pages -- confirm there).
                         */
                        if (swap_count_continued(p, offset, count))
                                count = SWAP_MAP_MAX | COUNT_CONTINUED;
                        else
                                count = SWAP_MAP_MAX;
                } else
                        count--;
        }

        usage = count | has_cache;
        if (usage)
                WRITE_ONCE(p->swap_map[offset], usage);
        else
                /*
                 * Fully released: leave the slot marked SWAP_HAS_CACHE.
                 * swap_entry_free() in this file expects exactly that value
                 * before it zeroes the slot.
                 */
                WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);

        return usage;
}

/*
 * When we get a swap entry, if there aren't some other ways to
 * prevent swapoff, such as the folio in swap cache is locked, RCU
 * reader side is locked, etc., the swap entry may become invalid
 * because of swapoff.  Then, we need to enclose all swap related
 * functions with get_swap_device() and put_swap_device(), unless the
 * swap functions call get/put_swap_device() by themselves.
 *
 * RCU reader side lock (including any spinlock) is sufficient to
 * prevent swapoff, because synchronize_rcu() is called in swapoff()
 * before freeing data structures.
 *
 * Check whether the swap entry is valid in the swap device.  If so,
 * return a pointer to the swap_info_struct, and keep the swap entry
 * valid by preventing the swap device from being swapped off until
 * put_swap_device() is called.  Otherwise return NULL.
 *
 * Notice that swapoff or swapoff+swapon can still happen before the
 * percpu_ref_tryget_live() in get_swap_device() or after the
 * percpu_ref_put() in put_swap_device() if there isn't any other way
 * to prevent swapoff.  The caller must be prepared for that.  For
 * example, the following situation is possible.
 *
 *   CPU1                                CPU2
 *   do_swap_page()
 *     ...                                swapoff+swapon
 *     __read_swap_cache_async()
 *       swapcache_prepare()
 *         __swap_duplicate()
 *           // check swap_map
 *     // verify PTE not changed
 *
 * In __swap_duplicate(), the swap_map needs to be checked before it is
 * changed, partly because the specified swap entry may belong to another
 * swap device which has been swapped off.  And in do_swap_page(), after
 * the page is read from the swap device, the PTE is verified to be
 * unchanged with the page table locked, to check whether the swap device
 * has been through swapoff or swapoff+swapon.
 */
struct swap_info_struct *get_swap_device(swp_entry_t entry)
{
        struct swap_info_struct *si;

        /* A zero entry is rejected silently. */
        if (!entry.val)
                return NULL;

        si = swp_swap_info(entry);
        if (!si) {
                pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
                return NULL;
        }

        /* No live reference available: fail without logging. */
        if (!percpu_ref_tryget_live(&si->users))
                return NULL;

        /*
         * Order the si->users check before any read of the other
         * swap_info_struct fields.
         *
         * Pairs with the spin_unlock() that follows setup_swap_info() in
         * enable_swap_info().
         */
        smp_rmb();

        if (swp_offset(entry) < si->max)
                return si;

        /* Offset out of range: log, then drop the reference taken above. */
        pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
        percpu_ref_put(&si->users);
        return NULL;
}

/*
 * Drop one map-count reference on @entry under the cluster (or swap_info)
 * lock.  If the slot ends up completely unused, pass the entry to
 * free_swap_slot() after the lock is released.  Returns the value reported
 * by __swap_entry_free_locked().
 */
static unsigned char __swap_entry_free(struct swap_info_struct *p,
                                       swp_entry_t entry)
{
        unsigned long slot = swp_offset(entry);
        struct swap_cluster_info *cluster;
        unsigned char remaining;

        cluster = lock_cluster_or_swap_info(p, slot);
        remaining = __swap_entry_free_locked(p, slot, 1);
        unlock_cluster_or_swap_info(p, cluster);

        if (remaining == 0)
                free_swap_slot(entry);

        return remaining;
}

/*
 * Final release of a single slot: under the cluster lock, zero the
 * swap_map byte (which must be exactly SWAP_HAS_CACHE at this point) and
 * update the cluster's page accounting.  Afterwards uncharge the swap
 * cgroup and report the one-slot range as free.
 */
static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
{
        unsigned long slot = swp_offset(entry);
        struct swap_cluster_info *cluster;
        unsigned char cur;

        cluster = lock_cluster(p, slot);
        cur = p->swap_map[slot];
        VM_BUG_ON(cur != SWAP_HAS_CACHE);
        p->swap_map[slot] = 0;
        dec_cluster_info_page(p, p->cluster_info, slot);
        unlock_cluster(cluster);

        mem_cgroup_uncharge_swap(entry, 1);
        swap_range_free(p, slot, 1);
}

/*
 * Drop one reference on @entry.
 *
 * The caller guarantees that the swap device backing @entry is still
 * around and has not been recycled.  Entries rejected by _swap_info_get()
 * are ignored.
 */
void swap_free(swp_entry_t entry)
{
        struct swap_info_struct *si = _swap_info_get(entry);

        if (!si)
                return;

        __swap_entry_free(si, entry);
}

/*
 * Called after dropping swapcache to decrease refcnt to swap entries.
 *
 * Drops the SWAP_HAS_CACHE reference on each of the folio's swap slots,
 * starting at @entry.  The number of slots is derived from the folio order
 * via swap_entry_order().  Slots that become completely unused are handed
 * to free_swap_slot().  When the folio covers a whole cluster and every slot
 * holds nothing but SWAP_HAS_CACHE, the cluster is freed in one step.
 */
void put_swap_folio(struct folio *folio, swp_entry_t entry)
{
        unsigned long offset = swp_offset(entry);
        unsigned long idx = offset / SWAPFILE_CLUSTER;
        struct swap_cluster_info *ci;
        struct swap_info_struct *si;
        unsigned char *map;
        unsigned int i, free_entries = 0;
        unsigned char val;
        int size = 1 << swap_entry_order(folio_order(folio));

        si = _swap_info_get(entry);
        if (!si)
                return;

        ci = lock_cluster_or_swap_info(si, offset);
        if (size == SWAPFILE_CLUSTER) {
                /* Count slots whose only remaining user is the swap cache. */
                map = si->swap_map + offset;
                for (i = 0; i < SWAPFILE_CLUSTER; i++) {
                        val = map[i];
                        VM_BUG_ON(!(val & SWAP_HAS_CACHE));
                        if (val == SWAP_HAS_CACHE)
                                free_entries++;
                }
                if (free_entries == SWAPFILE_CLUSTER) {
                        /*
                         * Whole cluster is cache-only: switch to si->lock and
                         * free the cluster at once.
                         */
                        unlock_cluster_or_swap_info(si, ci);
                        spin_lock(&si->lock);
                        mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
                        swap_free_cluster(si, idx);
                        spin_unlock(&si->lock);
                        return;
                }
        }
        /* Per-slot path; entry.val advances in step with i. */
        for (i = 0; i < size; i++, entry.val++) {
                if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
                        /* free_swap_slot() is called with the lock dropped. */
                        unlock_cluster_or_swap_info(si, ci);
                        free_swap_slot(entry);
                        if (i == size - 1)
                                return;
                        /*
                         * NOTE(review): the return value is not stored;
                         * presumably it equals ci since @offset is unchanged
                         * -- confirm.
                         */
                        lock_cluster_or_swap_info(si, offset);
                }
        }
        unlock_cluster_or_swap_info(si, ci);
}

/*
 * sort() comparator for swp_entry_t: orders entries by swap type only, so
 * entries of the same device end up adjacent.
 */
static int swp_entry_cmp(const void *ent1, const void *ent2)
{
        const swp_entry_t *lhs = ent1;
        const swp_entry_t *rhs = ent2;
        int lhs_type = (int)swp_type(*lhs);
        int rhs_type = (int)swp_type(*rhs);

        return lhs_type - rhs_type;
}

/*
 * Free @n swap entries from @entries[].  The array may be reordered in
 * place.  Device locks are handed over from one entry to the next by
 * swap_info_get_cont(), so consecutive entries of the same device share a
 * single lock acquisition.
 */
void swapcache_free_entries(swp_entry_t *entries, int n)
{
        /* Device whose lock is currently held, or NULL. */
        struct swap_info_struct *held = NULL;
        int i;

        if (n <= 0)
                return;

        /*
         * Group entries by swap device so that each device lock is taken
         * only once.  nr_swapfiles is not an exact indicator here, but
         * sort() is cheap enough that refining the test is not worthwhile.
         */
        if (nr_swapfiles > 1)
                sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);

        for (i = 0; i < n; ++i) {
                held = swap_info_get_cont(entries[i], held);
                if (held)
                        swap_entry_free(held, entries[i]);
        }

        /* Release the lock left over from the last valid entry, if any. */
        if (held)
                spin_unlock(&held->lock);
}

/*
 * Return swap_count() of the swap_map byte for @entry.  No lock is taken
 * and the swap_info pointer is not checked for NULL here.
 */
int __swap_count(swp_entry_t entry)
{
        struct swap_info_struct *si = swp_swap_info(entry);
        pgoff_t slot = swp_offset(entry);
        unsigned char map_byte = si->swap_map[slot];

        return swap_count(map_byte);
}

/*
 * Number of swapped-out references to @entry, read under the cluster (or
 * swap_info) lock.  The answer is not exact when the swap count is
 * continued, but the high COUNT_CONTINUED flag is kept in the result so
 * callers can tell.
 */
int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
{
        pgoff_t slot = swp_offset(entry);
        struct swap_cluster_info *cluster;
        int refs;

        cluster = lock_cluster_or_swap_info(si, slot);
        refs = swap_count(si->swap_map[slot]);
        unlock_cluster_or_swap_info(si, cluster);

        return refs;
}

/*
 * How many references to @entry are currently swapped out?
 * This considers COUNT_CONTINUED so it returns exact answer.
 *
 * Returns 0 when _swap_info_get() rejects @entry.  Otherwise the count is
 * assembled under the cluster (or swap_info) lock from the swap_map byte
 * plus, if COUNT_CONTINUED is set, the bytes at the same in-page offset on
 * the pages chained after the swap_map page.
 */
int swp_swapcount(swp_entry_t entry)
{
        int count, tmp_count, n;
        struct swap_info_struct *p;
        struct swap_cluster_info *ci;
        struct page *page;
        pgoff_t offset;
        unsigned char *map;

        p = _swap_info_get(entry);
        if (!p)
                return 0;

        offset = swp_offset(entry);

        ci = lock_cluster_or_swap_info(p, offset);

        count = swap_count(p->swap_map[offset]);
        if (!(count & COUNT_CONTINUED))
                goto out;

        count &= ~COUNT_CONTINUED;
        /* Weight of one unit in the first continuation byte. */
        n = SWAP_MAP_MAX + 1;

        page = vmalloc_to_page(p->swap_map + offset);
        /* From here on, offset is the byte position within a page. */
        offset &= ~PAGE_MASK;
        VM_BUG_ON(page_private(page) != SWP_CONTINUED);

        do {
                /* Step to the next page on the lru-linked chain. */
                page = list_next_entry(page, lru);
                map = kmap_local_page(page);
                tmp_count = map[offset];
                kunmap_local(map);

                /* Each further byte is a higher "digit" of the count. */
                count += (tmp_count & ~COUNT_CONTINUED) * n;
                n *= (SWAP_CONT_MAX + 1);
        } while (tmp_count & COUNT_CONTINUED);
out:
        unlock_cluster_or_swap_info(p, ci);
        return count;
}

/*
 * Report whether any slot of the naturally aligned, 2^@order sized range
 * containing @entry has a non-zero swap count.  When no cluster info is
 * returned by the lock helper, or the range is a single page, only the
 * slot of @entry itself is examined.
 */
static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
                                         swp_entry_t entry, int order)
{
        unsigned char *map = si->swap_map;
        unsigned int nr_pages = 1 << order;
        unsigned long roffset = swp_offset(entry);
        unsigned long base = round_down(roffset, nr_pages);
        struct swap_cluster_info *ci;
        bool swapped = false;
        int i;

        ci = lock_cluster_or_swap_info(si, base);
        if (!ci || nr_pages == 1) {
                swapped = swap_count(map[roffset]) != 0;
        } else {
                /* Stop at the first slot that is still swapped out. */
                for (i = 0; i < nr_pages && !swapped; i++)
                        swapped = swap_count(map[base + i]) != 0;
        }
        unlock_cluster_or_swap_info(si, ci);

        return swapped;
}

/*
 * Does any swap slot of @folio still carry a swap count?  Returns false
 * when the folio's swap entry fails _swap_info_get().  Large folios (with
 * CONFIG_THP_SWAP enabled) are checked across their whole range; everything
 * else only looks at the single entry.
 */
static bool folio_swapped(struct folio *folio)
{
        swp_entry_t entry = folio->swap;
        struct swap_info_struct *si = _swap_info_get(entry);

        if (!si)
                return false;

        if (IS_ENABLED(CONFIG_THP_SWAP) && unlikely(folio_test_large(folio)))
                return swap_page_trans_huge_swapped(si, entry,
                                                    folio_order(folio));

        return swap_swapcount(si, entry) != 0;
}

/**
 * folio_free_swap() - Free the swap space used for this folio.
 * @folio: The folio to remove.
 *
 * Meant to be called when swap is getting full, or when there are no more
 * mappings of this folio.  The folio must be locked.
 *
 * Return: true if we were able to release the swap space.
 */
bool folio_free_swap(struct folio *folio)
{
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        /* Not in swap cache, under writeback, or still swapped: keep it. */
        if (!folio_test_swapcache(folio) ||
            folio_test_writeback(folio) ||
            folio_swapped(folio))
                return false;

        /*
         * Hibernation hazard: once the memory image is being created, a
         * call to folio_free_swap() -- most likely from
         * __try_to_reclaim_swap() while hibernation allocates its own swap
         * pages for the image, but conceivably from ordinary reclaim too --
         * could free the swap of a folio that the image has already
         * recorded as a clean swapcache folio, and that swap could then be
         * reused for another page of the image.  After resume, the original
         * folio might be freed under memory pressure and later read back
         * from swap with the wrong data.
         *
         * Hibernation suspends storage while it writes the image to disk,
         * so that is what gets checked here.
         */
        if (pm_suspended_storage())
                return false;

        delete_from_swap_cache(folio);
        folio_set_dirty(folio);
        return true;
}

/**
 * free_swap_and_cache_nr() - Release reference on range of swap entries and
 *                            reclaim their cache if no more references remain.
 * @entry: First entry of range.
 * @nr: Number of entries in range.
 *
 * For each swap entry in the contiguous range, release a reference. If any swap
 * entries become free, try to reclaim their underlying folios, if present. The
 * offset range is defined by [entry.offset, entry.offset + nr).
 *
 * Non-swap entries, and entries for which get_swap_device() fails, are
 * ignored.  A range extending past the device's ->max triggers a warning and
 * nothing is freed.
 */
void free_swap_and_cache_nr(swp_entry_t entry, int nr)
{
        const unsigned long start_offset = swp_offset(entry);
        const unsigned long end_offset = start_offset + nr;
        unsigned int type = swp_type(entry);
        struct swap_info_struct *si;
        bool any_only_cache = false;
        unsigned long offset;
        unsigned char count;

        if (non_swap_entry(entry))
                return;

        /* Holds a device reference until put_swap_device() at out. */
        si = get_swap_device(entry);
        if (!si)
                return;

        if (WARN_ON(end_offset > si->max))
                goto out;

        /*
         * First free all entries in the range.
         */
        for (offset = start_offset; offset < end_offset; offset++) {
                if (data_race(si->swap_map[offset])) {
                        count = __swap_entry_free(si, swp_entry(type, offset));
                        /* Only the swap cache still references this slot. */
                        if (count == SWAP_HAS_CACHE)
                                any_only_cache = true;
                } else {
                        /* Slot already unused: unexpected, warn once. */
                        WARN_ON_ONCE(1);
                }
        }

        /*
         * Short-circuit the below loop if none of the entries had their
         * reference drop to zero.
         */
        if (!any_only_cache)
                goto out;

        /*
         * Now go back over the range trying to reclaim the swap cache. This is
         * more efficient for large folios because we will only try to reclaim
         * the swap once per folio in the common case. If we do
         * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
         * latter will get a reference and lock the folio for every individual
         * page but will only succeed once the swap slot for every subpage is
         * zero.
         */
        /* From here on @nr is reused as the per-iteration stride. */
        for (offset = start_offset; offset < end_offset; offset += nr) {
                nr = 1;
                if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
                        /*
                         * Folios are always naturally aligned in swap so
                         * advance forward to the next boundary. Zero means no
                         * folio was found for the swap entry, so advance by 1
                         * in this case. Negative value means folio was found
                         * but could not be reclaimed. Here we can still advance
                         * to the next boundary.
                         */
                        nr = __try_to_reclaim_swap(si, offset,
                                              TTRS_UNMAPPED | TTRS_FULL);
                        if (nr == 0)
                                nr = 1;
                        else if (nr < 0)
                                nr = -nr;
                        /* Distance from offset to the next nr-aligned slot. */
                        nr = ALIGN(offset + 1, nr) - offset;
                }
        }

out:
        put_swap_device(si);
}

#ifdef CONFIG_HIBERNATION

/*
 * Allocate one swap entry from the device of the given @type.  Returns a
 * zero entry when the type has no swap_info or the device is not
 * SWP_WRITEOK; otherwise returns @entry as left by scan_swap_map_slots().
 * nr_swap_pages is decremented when that scan reports a non-zero result.
 */
swp_entry_t get_swap_page_of_type(int type)
{
        swp_entry_t entry = {0};
        struct swap_info_struct *si;

        si = swap_type_to_swap_info(type);
        if (si) {
                /* This is called for allocating swap entry, not cache */
                spin_lock(&si->lock);
                if (si->flags & SWP_WRITEOK) {
                        if (scan_swap_map_slots(si, 1, 1, &entry, 0))
                                atomic_long_dec(&nr_swap_pages);
                }
                spin_unlock(&si->lock);
        }

        return entry;
}

/*
 * Find the swap type that corresponds to the given device (if any).
 *
 * @offset - number of the PAGE_SIZE-sized block of the device, starting
 * from 0, in which the swap header is expected to be located.
 *
 * Returns the matching type, -1 when @device is zero, or -ENODEV when no
 * writable swap area on @device starts at @offset.
 *
 * This is needed for the suspend to disk (aka swsusp).
 */
int swap_type_of(dev_t device, sector_t offset)
{
        int found = -ENODEV;
        int type;

        if (!device)
                return -1;

        spin_lock(&swap_lock);
        for (type = 0; type < nr_swapfiles; type++) {
                struct swap_info_struct *sis = swap_info[type];

                if (!(sis->flags & SWP_WRITEOK))
                        continue;
                if (device != sis->bdev->bd_dev)
                        continue;

                /* Same device: the first extent must start at @offset. */
                if (first_se(sis)->start_block == offset) {
                        found = type;
                        break;
                }
        }
        spin_unlock(&swap_lock);

        return found;
}

/*
 * Find the lowest-numbered swap type that is SWP_WRITEOK.  On success store
 * its device number in *@device and return the type; otherwise return
 * -ENODEV and leave *@device untouched.
 */
int find_first_swap(dev_t *device)
{
        int found = -ENODEV;
        int type;

        spin_lock(&swap_lock);
        for (type = 0; type < nr_swapfiles; type++) {
                struct swap_info_struct *sis = swap_info[type];

                if (sis->flags & SWP_WRITEOK) {
                        *device = sis->bdev->bd_dev;
                        found = type;
                        break;
                }
        }
        spin_unlock(&swap_lock);

        return found;
}

/*
 * Translate page @offset of the swap area with index @type (swap type) into
 * the corresponding PAGE_SIZE block on the swap device.  Returns 0 when the
 * type has no swap_info or the area is not SWP_WRITEOK.
 */
sector_t swapdev_block(int type, pgoff_t offset)
{
        struct swap_info_struct *si = swap_type_to_swap_info(type);
        struct swap_extent *extent;

        if (!si)
                return 0;
        if (!(si->flags & SWP_WRITEOK))
                return 0;

        extent = offset_to_swap_extent(si, offset);
        return extent->start_block + (offset - extent->start_page);
}

/*
 * Return the total number of swap pages of the given @type, or, when @free
 * is non-zero, the number of pages of that type not currently in use.
 * Returns 0 for an out-of-range type or an area that is not SWP_WRITEOK.
 *
 * This is needed for software suspend.
 */
unsigned int count_swap_pages(int type, int free)
{
        struct swap_info_struct *sis;
        unsigned int total = 0;

        spin_lock(&swap_lock);
        if ((unsigned int)type < nr_swapfiles) {
                sis = swap_info[type];

                spin_lock(&sis->lock);
                if (sis->flags & SWP_WRITEOK)
                        total = free ? sis->pages - sis->inuse_pages
                                     : sis->pages;
                spin_unlock(&sis->lock);
        }
        spin_unlock(&swap_lock);

        return total;
}
#endif /* CONFIG_HIBERNATION */

/*
 * Compare @pte against @swp_pte after passing @pte through
 * pte_swp_clear_flags().  Returns the result of pte_same().
 */
static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
{
        pte_t stripped = pte_swp_clear_flags(pte);

        return pte_same(stripped, swp_pte);
}

/*
 * Replace the swap PTE for @entry at @addr in @vma with a mapping of
 * @folio, then drop the swap reference with swap_free().
 *
 * No need to decide whether this PTE shares the swap entry with others,
 * just let do_wp_page work it out if a write is requested later - to
 * force COW, vm_page_prot omits write permission from any private vma.
 *
 * Returns 1 when the page was mapped, 0 when the PTE no longer matched
 * @entry or a poison marker was installed instead, and -ENOMEM when
 * ksm_might_need_to_copy() returned NULL.
 */
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, swp_entry_t entry, struct folio *folio)
{
        struct page *page;
        struct folio *swapcache;
        spinlock_t *ptl;
        pte_t *pte, new_pte, old_pte;
        bool hwpoisoned = false;
        int ret = 1;

        /* Remember the folio passed in; KSM may hand back a different one. */
        swapcache = folio;
        folio = ksm_might_need_to_copy(folio, vma, addr);
        if (unlikely(!folio))
                return -ENOMEM;
        else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
                hwpoisoned = true;
                folio = swapcache;
        }

        page = folio_file_page(folio, swp_offset(entry));
        if (PageHWPoison(page))
                hwpoisoned = true;

        /* Re-check under the PTE lock that the PTE still holds @entry. */
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
                                                swp_entry_to_pte(entry)))) {
                ret = 0;
                goto out;
        }

        old_pte = ptep_get(pte);

        if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) {
                swp_entry_t swp_entry;

                /*
                 * The data is unusable: install a poison marker in place of
                 * the swap entry instead of mapping the page.
                 */
                dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
                if (hwpoisoned) {
                        swp_entry = make_hwpoison_entry(page);
                } else {
                        swp_entry = make_poisoned_swp_entry();
                }
                new_pte = swp_entry_to_pte(swp_entry);
                ret = 0;
                goto setpte;
        }

        /*
         * Some architectures may have to restore extra metadata to the page
         * when reading from swap. This metadata may be indexed by swap entry
         * so this must be called before swap_free().
         */
        arch_swap_restore(folio_swap(entry, folio), folio);

        /* One swap entry becomes one anonymous page in the mm counters. */
        dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
        inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
        folio_get(folio);
        if (folio == swapcache) {
                rmap_t rmap_flags = RMAP_NONE;

                /*
                 * See do_swap_page(): writeback would be problematic.
                 * However, we do a folio_wait_writeback() just before this
                 * call and have the folio locked.
                 */
                VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
                if (pte_swp_exclusive(old_pte))
                        rmap_flags |= RMAP_EXCLUSIVE;

                folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags);
        } else { /* ksm created a completely new copy */
                folio_add_new_anon_rmap(folio, vma, addr);
                folio_add_lru_vma(folio, vma);
        }
        /* Build the present PTE, carrying over soft-dirty and uffd-wp. */
        new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
        if (pte_swp_soft_dirty(old_pte))
                new_pte = pte_mksoft_dirty(new_pte);
        if (pte_swp_uffd_wp(old_pte))
                new_pte = pte_mkuffd_wp(new_pte);
setpte:
        set_pte_at(vma->vm_mm, addr, pte, new_pte);
        swap_free(entry);
out:
        if (pte)
                pte_unmap_unlock(pte, ptl);
        /*
         * A folio substituted by KSM is unlocked and released here; the
         * caller's folio is left alone.  NOTE(review): this assumes
         * ksm_might_need_to_copy() returns its copy locked and referenced --
         * confirm.
         */
        if (folio != swapcache) {
                folio_unlock(folio);
                folio_put(folio);
        }
        return ret;
}

/*
 * Walk the PTEs covering [@addr, @end) under @pmd and, for every swap PTE
 * that refers to swap type @type, bring the folio in (from the swap cache
 * or via swapin_readahead()) and map it with unuse_pte().
 *
 * Returns 0 on completion, -ENOMEM when no folio could be obtained for a
 * slot that is still in use, or a negative value from unuse_pte().
 */
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                        unsigned long addr, unsigned long end,
                        unsigned int type)
{
        pte_t *pte = NULL;
        struct swap_info_struct *si;

        si = swap_info[type];
        do {
                struct folio *folio;
                unsigned long offset;
                unsigned char swp_count;
                swp_entry_t entry;
                int ret;
                pte_t ptent;

                /*
                 * pte == NULL means nothing is mapped (first pass, or it was
                 * unmapped below), so map the PTE for addr.  Otherwise the
                 * post-increment has already moved pte to the next entry.
                 */
                if (!pte++) {
                        pte = pte_offset_map(pmd, addr);
                        if (!pte)
                                break;
                }

                ptent = ptep_get_lockless(pte);

                if (!is_swap_pte(ptent))
                        continue;

                entry = pte_to_swp_entry(ptent);
                if (swp_type(entry) != type)
                        continue;

                offset = swp_offset(entry);
                /* Drop the mapping before the swap-in work below. */
                pte_unmap(pte);
                pte = NULL;

                folio = swap_cache_get_folio(entry, vma, addr);
                if (!folio) {
                        struct page *page;
                        struct vm_fault vmf = {
                                .vma = vma,
                                .address = addr,
                                .real_address = addr,
                                .pmd = pmd,
                        };

                        page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
                                                &vmf);
                        if (page)
                                folio = page_folio(page);
                }
                if (!folio) {
                        /*
                         * No folio: harmless if the slot is now free or
                         * marked bad, otherwise report -ENOMEM.
                         */
                        swp_count = READ_ONCE(si->swap_map[offset]);
                        if (swp_count == 0 || swp_count == SWAP_MAP_BAD)
                                continue;
                        return -ENOMEM;
                }

                folio_lock(folio);
                folio_wait_writeback(folio);
                ret = unuse_pte(vma, pmd, addr, entry, folio);
                if (ret < 0) {
                        folio_unlock(folio);
                        folio_put(folio);
                        return ret;
                }

                /* Try to release the swap space; the result is not checked. */
                folio_free_swap(folio);
                folio_unlock(folio);
                folio_put(folio);
        } while (addr += PAGE_SIZE, addr != end);

        if (pte)
                pte_unmap(pte);
        return 0;
}

/*
 * Walk the PMD entries under @pud that cover [@addr, @end) and run
 * unuse_pte_range() on each sub-range.  Stops at, and returns, the first
 * non-zero result; returns 0 otherwise.  Calls cond_resched() once per PMD.
 */
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                unsigned int type)
{
        pmd_t *pmd = pmd_offset(pud, addr);
        unsigned long boundary;
        int err;

        do {
                cond_resched();
                boundary = pmd_addr_end(addr, end);
                err = unuse_pte_range(vma, pmd, addr, boundary, type);
                if (err)
                        return err;
                pmd++;
                addr = boundary;
        } while (addr != end);

        return 0;
}

/*
 * Walk the PUD entries under @p4d that cover [@addr, @end).  Entries for
 * which pud_none_or_clear_bad() is true are skipped; the rest are passed to
 * unuse_pmd_range().  Stops at, and returns, the first non-zero result;
 * returns 0 otherwise.
 */
static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
                                unsigned long addr, unsigned long end,
                                unsigned int type)
{
        pud_t *pud = pud_offset(p4d, addr);
        unsigned long boundary;
        int err;

        do {
                boundary = pud_addr_end(addr, end);
                if (!pud_none_or_clear_bad(pud)) {
                        err = unuse_pmd_range(vma, pud, addr, boundary, type);
                        if (err)
                                return err;
                }
                pud++;
                addr = boundary;
        } while (addr != end);

        return 0;
}

/*
 * Walk the P4D entries under @pgd that cover [@addr, @end).  Entries for
 * which p4d_none_or_clear_bad() is true are skipped; the rest are passed to
 * unuse_pud_range().  Stops at, and returns, the first non-zero result;
 * returns 0 otherwise.
 */
static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                unsigned int type)
{
        p4d_t *p4d = p4d_offset(pgd, addr);
        unsigned long boundary;
        int err;

        do {
                boundary = p4d_addr_end(addr, end);
                if (!p4d_none_or_clear_bad(p4d)) {
                        err = unuse_pud_range(vma, p4d, addr, boundary, type);
                        if (err)
                                return err;
                }
                p4d++;
                addr = boundary;
        } while (addr != end);

        return 0;
}

/*
 * Top of the page-table walk for one VMA: cover [vm_start, vm_end) one pgd
 * entry at a time, skipping entries for which pgd_none_or_clear_bad() is
 * true and descending into the rest through unuse_p4d_range().
 *
 * Returns the first non-zero value from unuse_p4d_range(), or 0.
 */
static int unuse_vma(struct vm_area_struct *vma, unsigned int type)
{
        unsigned long addr = vma->vm_start;
        unsigned long end = vma->vm_end;
        pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
        unsigned long next;

        do {
                next = pgd_addr_end(addr, end);
                if (!pgd_none_or_clear_bad(pgd)) {
                        int err = unuse_p4d_range(vma, pgd, addr, next, type);

                        if (err)
                                return err;
                }
                pgd++;
                addr = next;
        } while (addr != end);

        return 0;
}

static int unuse_mm(struct mm_struct *mm, unsigned int type)
{
        struct vm_area_struct *vma;
        int ret = 0;
        VMA_ITERATOR(vmi, mm, 0);

        mmap_read_lock(mm);
        for_each_vma(vmi, vma) {
                if (vma->anon_vma) {
                        ret = unuse_vma(vma, type);
                        if (ret)
                                break;
                }

                cond_resched();
        }
        mmap_read_unlock(mm);
        return ret;
}

/*
 * Starting just after @prev, look through si->swap_map for the next slot
 * whose count is non-zero and is not SWAP_MAP_BAD.
 *
 * Returns that slot's offset, or 0 when the scan reaches si->max without
 * finding one.
 */
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
                                        unsigned int prev)
{
        unsigned int offset = prev + 1;

        /*
         * swap_lock is not taken: the map is only inspected, never
         * modified, and a false hit is harmless.  sys_swapoff() has
         * already stopped new allocations from this area (while holding
         * swap_lock).
         */
        while (offset < si->max) {
                unsigned char count = READ_ONCE(si->swap_map[offset]);

                if (count && swap_count(count) != SWAP_MAP_BAD)
                        break;
                /* Offer to reschedule once per LATENCY_LIMIT slots. */
                if (!(offset % LATENCY_LIMIT))
                        cond_resched();
                offset++;
        }

        /* Reaching the end of the map is reported as 0. */
        return offset == si->max ? 0 : offset;
}

/*
 * Drive every in-use entry of swap area @type out of use.
 *
 * Each pass calls shmem_unuse(), then unuse_mm() on every mm reachable from
 * init_mm.mmlist, and finally calls folio_free_swap() on folios found in
 * the swap address space for slots that still look in use.  Passes repeat
 * until si->inuse_pages reads as 0.
 *
 * Returns 0 on success, -EINTR if a signal is pending while entries are
 * still in use, or the first non-zero value returned by shmem_unuse() or
 * unuse_mm().
 */
static int try_to_unuse(unsigned int type)
{
        struct mm_struct *prev_mm;
        struct mm_struct *mm;
        struct list_head *p;
        int retval = 0;
        struct swap_info_struct *si = swap_info[type];
        struct folio *folio;
        swp_entry_t entry;
        unsigned int i;

        /* Nothing in use: skip straight to the final barrier. */
        if (!READ_ONCE(si->inuse_pages))
                goto success;

retry:
        retval = shmem_unuse(type);
        if (retval)
                return retval;

        /* Start the list walk at init_mm, holding a reference on it. */
        prev_mm = &init_mm;
        mmget(prev_mm);

        spin_lock(&mmlist_lock);
        p = &init_mm.mmlist;
        while (READ_ONCE(si->inuse_pages) &&
               !signal_pending(current) &&
               (p = p->next) != &init_mm.mmlist) {

                mm = list_entry(p, struct mm_struct, mmlist);
                /* Could not take a reference on this mm: move on. */
                if (!mmget_not_zero(mm))
                        continue;
                /*
                 * mmlist_lock is dropped while this mm is processed.  The
                 * reference on the previous mm is released only now, after
                 * the new mm has been pinned.
                 */
                spin_unlock(&mmlist_lock);
                mmput(prev_mm);
                prev_mm = mm;
                retval = unuse_mm(mm, type);
                if (retval) {
                        /* mmlist_lock is not held on this error path. */
                        mmput(prev_mm);
                        return retval;
                }

                /*
                 * Make sure that we aren't completely killing
                 * interactive performance.
                 */
                cond_resched();
                spin_lock(&mmlist_lock);
        }
        spin_unlock(&mmlist_lock);

        mmput(prev_mm);

        /*
         * Second phase: visit the slots find_next_to_unuse() reports and
         * try to free the swap held by any folio found for them.
         */
        i = 0;
        while (READ_ONCE(si->inuse_pages) &&
               !signal_pending(current) &&
               (i = find_next_to_unuse(si, i)) != 0) {

                entry = swp_entry(type, i);
                folio = filemap_get_folio(swap_address_space(entry), i);
                /* No folio for this slot: try the next one. */
                if (IS_ERR(folio))
                        continue;

                /*
                 * It is conceivable that a racing task removed this folio from
                 * swap cache just before we acquired the page lock. The folio
                 * might even be back in swap cache on another swap area. But
                 * that is okay, folio_free_swap() only removes stale folios.
                 */
                folio_lock(folio);
                folio_wait_writeback(folio);
                folio_free_swap(folio);
                folio_unlock(folio);
                folio_put(folio);
        }

        /*
         * Check again whether there are still swap entries in the map.
         * If so, the unuse logic has to be retried.
         * Under global memory pressure, swap entries can be reinserted back
         * into process space after the mmlist loop above passes over them.
         *
         * Limit the number of retries? No: when mmget_not_zero()
         * above fails, that mm is likely to be freeing swap from
         * exit_mmap(), which proceeds at its own independent pace;
         * and even shmem_writepage() could have been preempted after
         * folio_alloc_swap(), temporarily hiding that swap.  It's easy
         * and robust (though cpu-intensive) just to keep retrying.
         */
        if (READ_ONCE(si->inuse_pages)) {
                if (!signal_pending(current))
                        goto retry;
                return -EINTR;
        }

success:
        /*
         * Make sure that further cleanups after try_to_unuse() returns happen
         * after swap_range_free() reduces si->inuse_pages to 0.
         */
        smp_mb();
        return 0;
}

/*
 * After a successful try_to_unuse, if no swap is now in use, we know
 * we can empty the mmlist.  swap_lock must be held on entry and exit.
 * Note that mmlist_lock nests inside swap_lock, and an mm must be
 * added to the mmlist just after page_duplicate - before would be racy.
 */
static void drain_mmlist(void)
{
        struct list_head *pos, *tmp;
        unsigned int type = 0;

        /* Leave the list alone if any swap area still has pages in use. */
        while (type < nr_swapfiles) {
                if (swap_info[type]->inuse_pages)
                        return;
                type++;
        }

        /* Unlink every entry from init_mm.mmlist. */
        spin_lock(&mmlist_lock);
        list_for_each_safe(pos, tmp, &init_mm.mmlist)
                list_del_init(pos);
        spin_unlock(&mmlist_lock);
}

/*
 * Free all of a swapdev's extent information
 */
static void destroy_swap_extents(struct swap_info_struct *sis)
{
        while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
                struct rb_node *rb = sis->swap_extent_root.rb_node;
                struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);

                rb_erase(rb, &sis->swap_extent_root);
                kfree(se);
        }

        if (sis->flags & SWP_ACTIVATED) {
                struct file *swap_file = sis->swap_file;
                struct address_space *mapping = swap_file->f_mapping;

                sis->flags &= ~SWP_ACTIVATED;
                if (mapping->a_ops->swap_deactivate)
                        mapping->a_ops->swap_deactivate(swap_file);
        }
}

/*
 * Add a block range (and the corresponding page range) into this swapdev's
 * extent tree.
 *
 * This function rather assumes that it is called in ascending page order.
 */
int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
                unsigned long nr_pages, sector_t start_block)
{
        struct rb_root *root = &sis->swap_extent_root;
        struct rb_node **slot = &root->rb_node;
        struct rb_node *last = NULL;
        struct swap_extent *ext;

        /*
         * Callers add extents in ascending page order, so the new node
         * always belongs at the far right: follow right links to the end.
         */
        for (; *slot; slot = &last->rb_right)
                last = *slot;

        if (last) {
                struct swap_extent *tail;

                tail = rb_entry(last, struct swap_extent, rb_node);
                /* The new range must start where the last extent ends. */
                BUG_ON(tail->start_page + tail->nr_pages != start_page);
                if (tail->start_block + tail->nr_pages == start_block) {
                        /* Blocks are contiguous too: extend the last extent. */
                        tail->nr_pages += nr_pages;
                        return 0;
                }
        }

        /* Not mergeable: allocate a node and link it at the right edge. */
        ext = kmalloc(sizeof(*ext), GFP_KERNEL);
        if (!ext)
                return -ENOMEM;

        ext->start_page = start_page;
        ext->nr_pages = nr_pages;
        ext->start_block = start_block;

        rb_link_node(&ext->rb_node, last, slot);
        rb_insert_color(&ext->rb_node, root);

        /* 1 = a new extent was inserted, 0 (above) = merged. */
        return 1;
}
EXPORT_SYMBOL_GPL(add_swap_extent);

/*
 * A `swap extent' is a simple thing which maps a contiguous range of pages
 * onto a contiguous range of disk blocks.  A rbtree of swap extents is
 * built at swapon time and is then used at swap_writepage/swap_read_folio
 * time for locating where on disk a page belongs.
 *
 * If the swapfile is an S_ISBLK block device, a single extent is installed.
 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
 * swap files identically.
 *
 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
 * extent rbtree operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
 * swapfiles are handled *identically* after swapon time.
 *
 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
 * and will parse them into a rbtree, in PAGE_SIZE chunks.  If some stray
 * blocks are found which do not fall within the PAGE_SIZE alignment
 * requirements, they are simply tossed out - we will never use those blocks
 * for swapping.
 *
 * For all swap devices we set S_SWAPFILE across the life of the swapon.  This
 * prevents users from writing to the swap device, which will corrupt memory.
 *
 * The amount of disk space which a single swap extent represents varies.
 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
 * extents in the rbtree. - akpm.
 */
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
        struct file *swap_file = sis->swap_file;
        struct address_space *mapping = swap_file->f_mapping;
        struct inode *inode = mapping->host;
        int ret;

        /* Block device: one extent covering pages [0, sis->max). */
        if (S_ISBLK(inode->i_mode)) {
                ret = add_swap_extent(sis, 0, sis->max, 0);
                *span = sis->pages;
                return ret;
        }

        /* No ->swap_activate() hook: use the generic activation path. */
        if (!mapping->a_ops->swap_activate)
                return generic_swapfile_activate(sis, swap_file, span);

        ret = mapping->a_ops->swap_activate(sis, swap_file, span);
        if (ret < 0)
                return ret;

        sis->flags |= SWP_ACTIVATED;
        if (!(sis->flags & SWP_FS_OPS))
                return ret;
        if (sio_pool_init() == 0)
                return ret;

        /* sio_pool_init() failed for an SWP_FS_OPS area: undo activation. */
        destroy_swap_extents(sis);
        return -ENOMEM;
}

/*
 * Return the node_id of the disk behind swap area @p.  p->bdev is used when
 * set, otherwise the s_bdev of the swap file's superblock.  Returns
 * NUMA_NO_NODE when neither yields a block device.
 */
static int swap_node(struct swap_info_struct *p)
{
        struct block_device *bdev = p->bdev;

        if (!bdev)
                bdev = p->swap_file->f_inode->i_sb->s_bdev;
        if (!bdev)
                return NUMA_NO_NODE;

        return bdev->bd_disk->node_id;
}

/*
 * Record the priority, swap map and cluster info for swap area @p.
 *
 * A non-negative @prio is used as given; a negative one means "assign
 * automatically" and takes the next value below least_priority.
 */
static void setup_swap_info(struct swap_info_struct *p, int prio,
                            unsigned char *swap_map,
                            struct swap_cluster_info *cluster_info)
{
        int nid;

        p->prio = (prio >= 0) ? prio : --least_priority;

        /*
         * plists sort low-to-high while swap areas are picked
         * high-to-low, hence the negated priority.
         */
        p->list.prio = -p->prio;

        for_each_node(nid) {
                /*
                 * An auto-prioritised area gets plist prio 1 on the node
                 * swap_node() reports for it; everything else uses the
                 * negated priority.
                 */
                bool own_node = p->prio < 0 && swap_node(p) == nid;

                p->avail_lists[nid].prio = own_node ? 1 : -p->prio;
        }

        p->swap_map = swap_map;
        p->cluster_info = cluster_info;
}

/*
 * Make swap area @p writable, add its pages to the global counters and put
 * it on the active list (and on the available lists unless it is full).
 * swap_lock must be held; this is asserted below.
 */
static void _enable_swap_info(struct swap_info_struct *p)
{
        p->flags |= SWP_WRITEOK;
        atomic_long_add(p->pages, &nr_swap_pages);
        total_swap_pages += p->pages;

        assert_spin_locked(&swap_lock);

        /*
         * Both lists are priority-ordered plists.
         *
         * swap_active_head: swapoff() relies on the ordering, because
         * removing an area with an auto-assigned (negative) priority
         * increments the auto-assigned priority of every lower-priority
         * area.
         *
         * swap_avail_head: folio_alloc_swap() allocates from the highest
         * available priority area.
         */
        plist_add(&p->list, &swap_active_head);

        /* A full device is kept off the available lists. */
        if (!p->highest_bit)
                return;
        add_to_avail_list(p);
}

/*
 * Bring swap area @p into service: store its priority, swap map and cluster
 * info, resurrect the p->users reference, then enable the area.
 *
 * swap_lock and p->lock are taken (in that order) for each of the two
 * locked sections and are not held across percpu_ref_resurrect().
 */
static void enable_swap_info(struct swap_info_struct *p, int prio,
                                unsigned char *swap_map,
                                struct swap_cluster_info *cluster_info)
{
        spin_lock(&swap_lock);
        spin_lock(&p->lock);
        setup_swap_info(p, prio, swap_map, cluster_info);
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
        /*
         * Finished initializing swap device, now it's safe to reference it.
         */
        percpu_ref_resurrect(&p->users);
        /* Re-take both locks for the list and counter updates. */
        spin_lock(&swap_lock);
        spin_lock(&p->lock);
        _enable_swap_info(p);
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
}

/*
 * Put swap area @p back into service with the priority, swap map and
 * cluster info it already carries.  Called from the swapoff error path
 * when try_to_unuse() fails.
 */
static void reinsert_swap_info(struct swap_info_struct *p)
{
        spin_lock(&swap_lock);
        spin_lock(&p->lock);
        /* Re-apply the area's own settings, then re-enable it. */
        setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
        _enable_swap_info(p);
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
}

static bool __has_usable_swap(void)
{
        return !plist_head_empty(&swap_active_head);
}

/*
 * Locked wrapper around __has_usable_swap(): samples the active list under
 * swap_lock and returns the result.
 */
bool has_usable_swap(void)
{
        bool usable;

        spin_lock(&swap_lock);
        usable = __has_usable_swap();
        spin_unlock(&swap_lock);

        return usable;
}

/*
 * swapoff(2): take the swap area backed by @specialfile out of service.
 *
 * The area is removed from the active and available lists, its entries are
 * driven out of use with try_to_unuse(), and its resources are then
 * released.  If try_to_unuse() fails, the area is put back into service.
 *
 * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, -EINVAL if the file is
 * not an active swap area, -ENOMEM if the memory check fails, or an error
 * from getname(), file_open_name() or try_to_unuse().
 */
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
        struct swap_info_struct *p = NULL;
        unsigned char *swap_map;
        struct swap_cluster_info *cluster_info;
        struct file *swap_file, *victim;
        struct address_space *mapping;
        struct inode *inode;
        struct filename *pathname;
        int err, found = 0;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        BUG_ON(!current->mm);

        pathname = getname(specialfile);
        if (IS_ERR(pathname))
                return PTR_ERR(pathname);

        victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
        err = PTR_ERR(victim);
        if (IS_ERR(victim))
                goto out;

        /* Find the writable active area whose file shares victim's mapping. */
        mapping = victim->f_mapping;
        spin_lock(&swap_lock);
        plist_for_each_entry(p, &swap_active_head, list) {
                if (p->flags & SWP_WRITEOK) {
                        if (p->swap_file->f_mapping == mapping) {
                                found = 1;
                                break;
                        }
                }
        }
        if (!found) {
                err = -EINVAL;
                spin_unlock(&swap_lock);
                goto out_dput;
        }
        /*
         * A zero return from security_vm_enough_memory_mm() lets the
         * swapoff proceed; the pages are then unaccounted again.
         */
        if (!security_vm_enough_memory_mm(current->mm, p->pages))
                vm_unacct_memory(p->pages);
        else {
                err = -ENOMEM;
                spin_unlock(&swap_lock);
                goto out_dput;
        }
        spin_lock(&p->lock);
        del_from_avail_list(p);
        if (p->prio < 0) {
                struct swap_info_struct *si = p;
                int nid;

                /*
                 * p had an auto-assigned (negative) priority: raise the
                 * priority of every area after it on the active list by
                 * one and adjust the plist priorities to match.  Per-node
                 * priorities equal to 1 are left untouched.
                 */
                plist_for_each_entry_continue(si, &swap_active_head, list) {
                        si->prio++;
                        si->list.prio--;
                        for_each_node(nid) {
                                if (si->avail_lists[nid].prio != 1)
                                        si->avail_lists[nid].prio--;
                        }
                }
                least_priority++;
        }
        /* Take the area off the active list and out of the global counters. */
        plist_del(&p->list, &swap_active_head);
        atomic_long_sub(p->pages, &nr_swap_pages);
        total_swap_pages -= p->pages;
        p->flags &= ~SWP_WRITEOK;
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);

        disable_swap_slots_cache_lock();

        /* The task is flagged as OOM origin only while try_to_unuse() runs. */
        set_current_oom_origin();
        err = try_to_unuse(p->type);
        clear_current_oom_origin();

        if (err) {
                /* re-insert swap space back into swap_list */
                reinsert_swap_info(p);
                reenable_swap_slots_cache_unlock();
                goto out_dput;
        }

        reenable_swap_slots_cache_unlock();

        /*
         * Wait for swap operations protected by get/put_swap_device()
         * to complete.  Because of synchronize_rcu() here, all swap
         * operations protected by RCU reader side lock (including any
         * spinlock) will be waited too.  This makes it easy to
         * prevent folio_test_swapcache() and the following swap cache
         * operations from racing with swapoff.
         */
        percpu_ref_kill(&p->users);
        synchronize_rcu();
        wait_for_completion(&p->comp);

        flush_work(&p->discard_work);

        destroy_swap_extents(p);
        if (p->flags & SWP_CONTINUED)
                free_swap_count_continuations(p);

        /* Areas with no bdev, or with a rotational one, were counted here. */
        if (!p->bdev || !bdev_nonrot(p->bdev))
                atomic_dec(&nr_rotate_swap);

        mutex_lock(&swapon_mutex);
        spin_lock(&swap_lock);
        spin_lock(&p->lock);
        drain_mmlist();

        /* wait for anyone still in scan_swap_map_slots */
        p->highest_bit = 0;                /* cuts scans short */
        while (p->flags >= SWP_SCANNING) {
                /* Drop both locks while sleeping, then re-check. */
                spin_unlock(&p->lock);
                spin_unlock(&swap_lock);
                schedule_timeout_uninterruptible(1);
                spin_lock(&swap_lock);
                spin_lock(&p->lock);
        }

        /*
         * Detach the file, swap map and cluster info from p under the
         * locks; they are freed or closed below, after the locks are
         * dropped.
         */
        swap_file = p->swap_file;
        p->swap_file = NULL;
        p->max = 0;
        swap_map = p->swap_map;
        p->swap_map = NULL;
        cluster_info = p->cluster_info;
        p->cluster_info = NULL;
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
        arch_swap_invalidate_area(p->type);
        zswap_swapoff(p->type);
        mutex_unlock(&swapon_mutex);
        free_percpu(p->percpu_cluster);
        p->percpu_cluster = NULL;
        free_percpu(p->cluster_next_cpu);
        p->cluster_next_cpu = NULL;
        vfree(swap_map);
        kvfree(cluster_info);
        /* Destroy swap account information */
        swap_cgroup_swapoff(p->type);
        exit_swap_address_space(p->type);

        inode = mapping->host;

        /* Clear S_SWAPFILE on the inode, then close the swap file. */
        inode_lock(inode);
        inode->i_flags &= ~S_SWAPFILE;
        inode_unlock(inode);
        filp_close(swap_file, NULL);

        /*
         * Clear the SWP_USED flag after all resources are freed so that swapon
         * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
         * not hold p->lock after we cleared its SWP_WRITEOK.
         */
        spin_lock(&swap_lock);
        p->flags = 0;
        spin_unlock(&swap_lock);

        err = 0;
        /* Bump the event counter and wake pollers waiting on proc_poll_wait. */
        atomic_inc(&proc_poll_event);
        wake_up_interruptible(&proc_poll_wait);

out_dput:
        filp_close(victim, NULL);
out:
        putname(pathname);
        return err;
}

#ifdef CONFIG_PROC_FS
/*
 * poll() handler for the swaps seq_file.  The file always reports
 * EPOLLIN | EPOLLRDNORM.  EPOLLERR | EPOLLPRI are added when
 * proc_poll_event differs from the value last recorded in this seq_file,
 * which is then updated.
 */
static __poll_t swaps_poll(struct file *file, poll_table *wait)
{
        struct seq_file *seq = file->private_data;
        __poll_t mask = EPOLLIN | EPOLLRDNORM;

        poll_wait(file, &proc_poll_wait, wait);

        if (seq->poll_event != atomic_read(&proc_poll_event)) {
                seq->poll_event = atomic_read(&proc_poll_event);
                mask |= EPOLLERR | EPOLLPRI;
        }

        return mask;
}

/* iterator */
/*
 * seq_file ->start: take swapon_mutex (released in swap_stop()) and return
 * the element at position *pos.  Position 0 is the header token; position
 * N > 0 is the N-th swap_info that has SWP_USED set and a swap_map.
 * Returns NULL when *pos is past the last such entry.
 */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
        struct swap_info_struct *si;
        loff_t remaining = *pos;
        int type;

        mutex_lock(&swapon_mutex);

        if (remaining == 0)
                return SEQ_START_TOKEN;

        for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
                /* Only count entries that are in use and have a swap_map. */
                if ((si->flags & SWP_USED) && si->swap_map && !--remaining)
                        return si;
        }

        return NULL;
}

/*
 * seq_file ->next: advance *pos and return the next swap_info after @v
 * that has SWP_USED set and a swap_map, or NULL when there is none.  The
 * search starts at type 0 when @v is the header token.
 */
static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
        struct swap_info_struct *si = v;
        int type = 0;

        if (v != SEQ_START_TOKEN)
                type = si->type + 1;

        ++(*pos);
        while ((si = swap_type_to_swap_info(type)) != NULL) {
                if ((si->flags & SWP_USED) && si->swap_map)
                        return si;
                type++;
        }

        return NULL;
}

/* seq_file ->stop: release the swapon_mutex taken in swap_start(). */
static void swap_stop(struct seq_file *swap, void *v)
{
        mutex_unlock(&swapon_mutex);
}

/*
 * seq_file ->show: print the column header for the start token, otherwise
 * one line for swap area @v: path, type, size, used and priority.
 * Always returns 0.
 */
static int swap_show(struct seq_file *swap, void *v)
{
        struct swap_info_struct *si = v;
        const char *kind, *size_pad, *used_pad;
        unsigned long bytes, inuse;
        struct file *file;
        int len, name_pad;

        if (si == SEQ_START_TOKEN) {
                seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
                return 0;
        }

        /* Both values are page counts run through K(). */
        bytes = K(si->pages);
        inuse = K(READ_ONCE(si->inuse_pages));

        file = si->swap_file;
        len = seq_file_path(swap, file, " \t\n\\");

        /* Pad the path towards a 40-column field, with at least one space. */
        name_pad = len < 40 ? 40 - len : 1;
        kind = S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t";
        /* Values below 10000000 get an extra tab after them. */
        size_pad = bytes < 10000000 ? "\t" : "";
        used_pad = inuse < 10000000 ? "\t" : "";

        seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
                        name_pad, " ",
                        kind,
                        bytes, size_pad,
                        inuse, used_pad,
                        si->prio);
        return 0;
}

/* seq_file iterator callbacks; installed by swaps_open() via seq_open(). */
static const struct seq_operations swaps_op = {
        .start =        swap_start,
        .next =                swap_next,
        .stop =                swap_stop,
        .show =                swap_show
};

/*
 * open() handler for the swaps file: attach the swaps_op iterator and
 * record the current proc_poll_event in the new seq_file, which
 * swaps_poll() later compares against.  Returns 0, or the error from
 * seq_open().
 */
static int swaps_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int err = seq_open(file, &swaps_op);

        if (err)
                return err;

        seq = file->private_data;
        seq->poll_event = atomic_read(&proc_poll_event);

        return 0;
}

/*
 * proc file operations for the "swaps" entry created in procswaps_init():
 * seq_file read/lseek/release plus the local open and poll handlers.
 */
static const struct proc_ops swaps_proc_ops = {
        .proc_flags        = PROC_ENTRY_PERMANENT,
        .proc_open        = swaps_open,
        .proc_read        = seq_read,
        .proc_lseek        = seq_lseek,
        .proc_release        = seq_release,
        .proc_poll        = swaps_poll,
};

/*
 * Initcall that creates the "swaps" proc entry backed by swaps_proc_ops.
 * Always returns 0.
 * NOTE(review): the return value of proc_create() is not checked.
 */
static int __init procswaps_init(void)
{
        proc_create("swaps", 0, NULL, &swaps_proc_ops);
        return 0;
}
__initcall(procswaps_init);
#endif /* CONFIG_PROC_FS */

#ifdef MAX_SWAPFILES_CHECK
/*
 * Late initcall that runs MAX_SWAPFILES_CHECK(); only built when that macro
 * is defined.  Always returns 0.
 */
static int __init max_swapfiles_check(void)
{
        MAX_SWAPFILES_CHECK();
        return 0;
}
late_initcall(max_swapfiles_check);
#endif

/*
 * Obtain a swap_info_struct for a new swap area.
 *
 * A fresh structure is always allocated first.  It is either published in
 * a new swap_info[] slot, or dropped again when an existing slot without
 * SWP_USED can be reused.
 *
 * Returns the structure with SWP_USED set, or ERR_PTR(-ENOMEM) if an
 * allocation fails, or ERR_PTR(-EPERM) if the chosen slot index is not
 * below MAX_SWAPFILES.
 */
static struct swap_info_struct *alloc_swap_info(void)
{
        struct swap_info_struct *p;
        struct swap_info_struct *defer = NULL;
        unsigned int type;
        int i;

        /* Sized for one avail_lists entry per node id. */
        p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
        if (!p)
                return ERR_PTR(-ENOMEM);

        /* The users ref starts out dead (PERCPU_REF_INIT_DEAD). */
        if (percpu_ref_init(&p->users, swap_users_ref_free,
                            PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
                kvfree(p);
                return ERR_PTR(-ENOMEM);
        }

        spin_lock(&swap_lock);
        /* Look for the first slot that is not marked SWP_USED. */
        for (type = 0; type < nr_swapfiles; type++) {
                if (!(swap_info[type]->flags & SWP_USED))
                        break;
        }
        if (type >= MAX_SWAPFILES) {
                spin_unlock(&swap_lock);
                percpu_ref_exit(&p->users);
                kvfree(p);
                return ERR_PTR(-EPERM);
        }
        if (type >= nr_swapfiles) {
                /* No reusable slot: the new structure takes the next one. */
                p->type = type;
                /*
                 * Publish the swap_info_struct after initializing it.
                 * Note that kvzalloc() above zeroes all its fields.
                 */
                smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */
                nr_swapfiles++;
        } else {
                /*
                 * Reuse the existing structure; the new allocation is
                 * freed below, after swap_lock is dropped.
                 */
                defer = p;
                p = swap_info[type];
                /*
                 * Do not memset this entry: a racing procfs swap_next()
                 * would be relying on p->type to remain valid.
                 */
        }
        p->swap_extent_root = RB_ROOT;
        plist_node_init(&p->list, 0);
        for_each_node(i)
                plist_node_init(&p->avail_lists[i], 0);
        p->flags = SWP_USED;
        spin_unlock(&swap_lock);
        if (defer) {
                percpu_ref_exit(&defer->users);
                kvfree(defer);
        }
        spin_lock_init(&p->lock);
        spin_lock_init(&p->cont_lock);
        init_completion(&p->comp);

        return p;
}

/*
 * Record the block device backing swap area @p according to the type of
 * @inode.
 *
 * Regular file: p->bdev becomes the superblock's s_bdev.
 * Block device: p->bdev becomes the device and SWP_BLKDEV is set, unless
 * the device is zoned, in which case -EINVAL is returned.
 * Anything else leaves @p untouched.
 *
 * Returns 0 unless the zoned-device check fails.
 */
static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
{
        if (S_ISREG(inode->i_mode)) {
                p->bdev = inode->i_sb->s_bdev;
                return 0;
        }

        if (!S_ISBLK(inode->i_mode))
                return 0;

        p->bdev = I_BDEV(inode);
        /*
         * Zones on a zoned block device may only be written
         * sequentially, which makes such devices unsuitable for swap.
         */
        if (bdev_is_zoned(p->bdev))
                return -EINVAL;

        p->flags |= SWP_BLKDEV;
        return 0;
}


/*
 * Find out how many pages are allowed for a single swap device. There
 * are two limiting factors:
 * 1) the number of bits for the swap offset in the swp_entry_t type, and
 * 2) the number of bits in the swap pte, as defined by the different
 * architectures.
 *
 * In order to find the largest possible bit mask, a swap entry with
 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
 * decoded to a swp_entry_t again, and finally the swap offset is
 * extracted.
 *
 * This will mask all the bits from the initial ~0UL mask that can't
 * be encoded in either the swp_entry_t or the architecture definition
 * of a swap pte.
 */
unsigned long generic_max_swapfile_size(void)
{
        return swp_offset(pte_to_swp_entry(
                        swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
}

/*
 * Weak default: returns generic_max_swapfile_size().  An architecture can
 * override this symbol to apply additional checks.
 */
__weak unsigned long arch_max_swapfile_size(void)
{
        return generic_max_swapfile_size();
}

/*
 * Validate the swap header and work out how many pages of the area to use.
 *
 * @p:           swap area being set up; lowest_bit, cluster_next, cluster_nr
 *               and highest_bit are initialised here.
 * @swap_header: header contents; byte-swapped in place when the version
 *               field only reads as 1 after swapping.
 * @inode:       inode of the swap file or device, used for the size and
 *               file-type checks.
 *
 * Returns the page count to use for the area, or 0 when the header is
 * rejected: signature missing, version not 1, last_page zero, area shorter
 * than the header claims, bad pages listed for a regular file, or more
 * than MAX_SWAP_BADPAGES bad pages.
 */
static unsigned long read_swap_header(struct swap_info_struct *p,
                                        union swap_header *swap_header,
                                        struct inode *inode)
{
        /* Unsigned to match nr_badpages, which it is compared against. */
        unsigned int i;
        unsigned long maxpages;
        unsigned long swapfilepages;
        unsigned long last_page;

        if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
                pr_err("Unable to find swap-space signature\n");
                return 0;
        }

        /* swap partition endianness hack... */
        if (swab32(swap_header->info.version) == 1) {
                swab32s(&swap_header->info.version);
                swab32s(&swap_header->info.last_page);
                swab32s(&swap_header->info.nr_badpages);
                /* Bound the count before walking badpages[]. */
                if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
                        return 0;
                for (i = 0; i < swap_header->info.nr_badpages; i++)
                        swab32s(&swap_header->info.badpages[i]);
        }
        /* Check the swap header's sub-version */
        if (swap_header->info.version != 1) {
                /* version is an unsigned 32-bit field, hence %u. */
                pr_warn("Unable to handle swap header version %u\n",
                        swap_header->info.version);
                return 0;
        }

        p->lowest_bit  = 1;
        p->cluster_next = 1;
        p->cluster_nr = 0;

        maxpages = swapfile_maximum_size;
        last_page = swap_header->info.last_page;
        if (!last_page) {
                pr_warn("Empty swap-file\n");
                return 0;
        }
        if (last_page > maxpages) {
                pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
                        K(maxpages), K(last_page));
        }
        /* The header asks for less than the limit: use last_page + 1 pages. */
        if (maxpages > last_page) {
                maxpages = last_page + 1;
                /* p->max is an unsigned int: don't overflow it */
                if ((unsigned int)maxpages == 0)
                        maxpages = UINT_MAX;
        }
        p->highest_bit = maxpages - 1;

        if (!maxpages)
                return 0;
        /* A zero i_size skips the length check below. */
        swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
        if (swapfilepages && maxpages > swapfilepages) {
                pr_warn("Swap area shorter than signature indicates\n");
                return 0;
        }
        /* Bad-page lists are rejected for regular files. */
        if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
                return 0;
        /* Covers headers that did not go through the swab path above. */
        if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
                return 0;

        return maxpages;
}

/*
 * SWAP_CLUSTER_INFO_COLS:  L1_CACHE_BYTES divided by the size of one
 *                          struct swap_cluster_info, rounded up.
 * SWAP_CLUSTER_SPACE_COLS: SWAP_ADDRESS_SPACE_PAGES divided by
 *                          SWAPFILE_CLUSTER, rounded up.
 * SWAP_CLUSTER_COLS:       the larger of the two, as an unsigned int.
 */
#define SWAP_CLUSTER_INFO_COLS                                                \
        DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
#define SWAP_CLUSTER_SPACE_COLS                                                \
        DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
#define SWAP_CLUSTER_COLS                                                \
        max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)

/*
 * setup_swap_map_and_extents - prepare swap_map, cluster_info and extents
 * @p:            swap area being set up; p->max and p->pages are written here
 * @swap_header:  mapped on-disk header supplying nr_badpages/badpages/last_page
 * @swap_map:     per-page usage map, maxpages bytes long
 * @cluster_info: per-cluster array, or NULL when clusters are not used
 * @maxpages:     number of pages (including the header page) to be used
 * @span:         passed through to setup_swap_extents()
 *
 * Marks the header page and every listed bad page as SWAP_MAP_BAD, calls
 * inc_cluster_info_page() for those pages and for the unused tail of the
 * last cluster, builds the extent list, and finally (if @cluster_info is
 * non-NULL) queues every cluster whose count is zero on p->free_clusters.
 *
 * Returns the result of setup_swap_extents() on success; a negative value
 * from setup_swap_extents() on its failure; -EINVAL if a bad-page entry is
 * 0 or larger than last_page, or if no usable pages remain.
 */
static int setup_swap_map_and_extents(struct swap_info_struct *p,
                                        union swap_header *swap_header,
                                        unsigned char *swap_map,
                                        struct swap_cluster_info *cluster_info,
                                        unsigned long maxpages,
                                        sector_t *span)
{
        unsigned int j, k;
        unsigned int nr_good_pages;
        int nr_extents;
        unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
        /* Column in which the free-list fill below starts */
        unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
        unsigned long i, idx;

        nr_good_pages = maxpages - 1;        /* omit header page */

        cluster_list_init(&p->free_clusters);
        cluster_list_init(&p->discard_clusters);

        for (i = 0; i < swap_header->info.nr_badpages; i++) {
                unsigned int page_nr = swap_header->info.badpages[i];
                /* Page 0 is the header; anything past last_page is bogus */
                if (page_nr == 0 || page_nr > swap_header->info.last_page)
                        return -EINVAL;
                /* Bad pages at or beyond maxpages are outside swap_map: ignore */
                if (page_nr < maxpages) {
                        swap_map[page_nr] = SWAP_MAP_BAD;
                        nr_good_pages--;
                        /*
                         * Haven't marked the cluster free yet, no list
                         * operation involved
                         */
                        inc_cluster_info_page(p, cluster_info, page_nr);
                }
        }

        /* Haven't marked the cluster free yet, no list operation involved */
        for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
                inc_cluster_info_page(p, cluster_info, i);

        if (nr_good_pages) {
                swap_map[0] = SWAP_MAP_BAD;
                /*
                 * Not mark the cluster free yet, no list
                 * operation involved
                 */
                inc_cluster_info_page(p, cluster_info, 0);
                p->max = maxpages;
                p->pages = nr_good_pages;
                nr_extents = setup_swap_extents(p, span);
                if (nr_extents < 0)
                        return nr_extents;
                /* Re-read p->pages after setup_swap_extents() has run */
                nr_good_pages = p->pages;
        }
        /* nr_extents is only left unset on this path, which returns early */
        if (!nr_good_pages) {
                pr_warn("Empty swap-file\n");
                return -EINVAL;
        }

        if (!cluster_info)
                return nr_extents;


        /*
         * Reduce false cache line sharing between cluster_info and
         * sharing same address space.
         *
         * Clusters are visited column by column (starting at column 'col'),
         * so consecutive entries added to the free list are
         * SWAP_CLUSTER_COLS cluster indexes apart.
         */
        for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
                j = (k + col) % SWAP_CLUSTER_COLS;
                for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
                        idx = i * SWAP_CLUSTER_COLS + j;
                        /* The last row may be only partially populated */
                        if (idx >= nr_clusters)
                                continue;
                        /* A non-zero count means the cluster is not free */
                        if (cluster_count(&cluster_info[idx]))
                                continue;
                        cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
                        cluster_list_add_tail(&p->free_clusters, cluster_info,
                                              idx);
                }
        }
        return nr_extents;
}

/*
 * swapon - system call that enables swapping on a file or block device
 * @specialfile: user-space pointer to the path of the swap file/device
 * @swap_flags:  SWAP_FLAG_* bits; anything outside SWAP_FLAGS_VALID is
 *               rejected
 *
 * Outline: validate flags and capability, allocate a swap_info_struct,
 * open and claim the file, read and validate the swap header, allocate the
 * swap map (and, for non-rotational block devices, the per-CPU cluster
 * state and cluster_info array), build the extents, optionally discard,
 * set up the swap address space and zswap, and finally publish the area
 * with enable_swap_info() under swapon_mutex.
 *
 * Returns 0 on success or a negative errno. On failure everything that was
 * set up is torn down through the label chain at the end of the function.
 */
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
        struct swap_info_struct *p;
        struct filename *name;
        struct file *swap_file = NULL;
        struct address_space *mapping;
        struct dentry *dentry;
        int prio;
        int error;
        union swap_header *swap_header;
        int nr_extents;
        sector_t span;
        unsigned long maxpages;
        unsigned char *swap_map = NULL;
        struct swap_cluster_info *cluster_info = NULL;
        struct page *page = NULL;
        struct inode *inode = NULL;
        bool inced_nr_rotate_swap = false;

        if (swap_flags & ~SWAP_FLAGS_VALID)
                return -EINVAL;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        /* swapfile_init() could not allocate the per-node heads */
        if (!swap_avail_heads)
                return -ENOMEM;

        p = alloc_swap_info();
        if (IS_ERR(p))
                return PTR_ERR(p);

        INIT_WORK(&p->discard_work, swap_discard_work);

        name = getname(specialfile);
        if (IS_ERR(name)) {
                error = PTR_ERR(name);
                /* Clear it so the common exit path does not putname() it */
                name = NULL;
                goto bad_swap;
        }
        swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0);
        if (IS_ERR(swap_file)) {
                error = PTR_ERR(swap_file);
                /* Clear it so bad_swap does not filp_close() an error pointer */
                swap_file = NULL;
                goto bad_swap;
        }

        p->swap_file = swap_file;
        mapping = swap_file->f_mapping;
        dentry = swap_file->f_path.dentry;
        inode = mapping->host;

        error = claim_swapfile(p, inode);
        if (unlikely(error))
                goto bad_swap;

        /*
         * The inode lock is held from here on; on success it is dropped at
         * 'out', on failure at 'bad_swap_unlock_inode'.
         */
        inode_lock(inode);
        if (d_unlinked(dentry) || cant_mount(dentry)) {
                error = -ENOENT;
                goto bad_swap_unlock_inode;
        }
        if (IS_SWAPFILE(inode)) {
                error = -EBUSY;
                goto bad_swap_unlock_inode;
        }

        /*
         * Read the swap header.
         */
        if (!mapping->a_ops->read_folio) {
                error = -EINVAL;
                goto bad_swap_unlock_inode;
        }
        page = read_mapping_page(mapping, 0, swap_file);
        if (IS_ERR(page)) {
                error = PTR_ERR(page);
                goto bad_swap_unlock_inode;
        }
        /* Stays mapped until the kunmap() at 'out' */
        swap_header = kmap(page);

        /* read_swap_header() returns 0 for any header it rejects */
        maxpages = read_swap_header(p, swap_header, inode);
        if (unlikely(!maxpages)) {
                error = -EINVAL;
                goto bad_swap_unlock_inode;
        }

        /* OK, set up the swap map and apply the bad block list */
        swap_map = vzalloc(maxpages);
        if (!swap_map) {
                error = -ENOMEM;
                goto bad_swap_unlock_inode;
        }

        if (p->bdev && bdev_stable_writes(p->bdev))
                p->flags |= SWP_STABLE_WRITES;

        if (p->bdev && bdev_synchronous(p->bdev))
                p->flags |= SWP_SYNCHRONOUS_IO;

        if (p->bdev && bdev_nonrot(p->bdev)) {
                int cpu, i;
                unsigned long ci, nr_cluster;

                p->flags |= SWP_SOLIDSTATE;
                p->cluster_next_cpu = alloc_percpu(unsigned int);
                if (!p->cluster_next_cpu) {
                        error = -ENOMEM;
                        goto bad_swap_unlock_inode;
                }
                /*
                 * select a random position to start with to help wear leveling
                 * SSD
                 */
                for_each_possible_cpu(cpu) {
                        per_cpu(*p->cluster_next_cpu, cpu) =
                                get_random_u32_inclusive(1, p->highest_bit);
                }
                nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);

                cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
                                        GFP_KERNEL);
                if (!cluster_info) {
                        error = -ENOMEM;
                        goto bad_swap_unlock_inode;
                }

                for (ci = 0; ci < nr_cluster; ci++)
                        spin_lock_init(&((cluster_info + ci)->lock));

                p->percpu_cluster = alloc_percpu(struct percpu_cluster);
                if (!p->percpu_cluster) {
                        error = -ENOMEM;
                        goto bad_swap_unlock_inode;
                }
                /* Start every CPU with no per-order next cluster selected */
                for_each_possible_cpu(cpu) {
                        struct percpu_cluster *cluster;

                        cluster = per_cpu_ptr(p->percpu_cluster, cpu);
                        for (i = 0; i < SWAP_NR_ORDERS; i++)
                                cluster->next[i] = SWAP_NEXT_INVALID;
                }
        } else {
                /* Remembered so the error path can undo the increment */
                atomic_inc(&nr_rotate_swap);
                inced_nr_rotate_swap = true;
        }

        error = swap_cgroup_swapon(p->type, maxpages);
        if (error)
                goto bad_swap_unlock_inode;

        nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
                cluster_info, maxpages, &span);
        if (unlikely(nr_extents < 0)) {
                error = nr_extents;
                goto bad_swap_unlock_inode;
        }

        if ((swap_flags & SWAP_FLAG_DISCARD) &&
            p->bdev && bdev_max_discard_sectors(p->bdev)) {
                /*
                 * When discard is enabled for swap with no particular
                 * policy flagged, we set all swap discard flags here in
                 * order to sustain backward compatibility with older
                 * swapon(8) releases.
                 */
                p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
                             SWP_PAGE_DISCARD);

                /*
                 * By flagging sys_swapon, a sysadmin can tell us to
                 * either do single-time area discards only, or to just
                 * perform discards for released swap page-clusters.
                 * Now it's time to adjust the p->flags accordingly.
                 */
                if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
                        p->flags &= ~SWP_PAGE_DISCARD;
                else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
                        p->flags &= ~SWP_AREA_DISCARD;

                /* issue a swapon-time discard if it's still required */
                if (p->flags & SWP_AREA_DISCARD) {
                        int err = discard_swap(p);
                        /* A failed discard is only logged, not fatal */
                        if (unlikely(err))
                                pr_err("swapon: discard_swap(%p): %d\n",
                                        p, err);
                }
        }

        error = init_swap_address_space(p->type, maxpages);
        if (error)
                goto bad_swap_unlock_inode;

        error = zswap_swapon(p->type, maxpages);
        if (error)
                goto free_swap_address_space;

        /*
         * Flush any pending IO and dirty mappings before we start using this
         * swap device.
         */
        inode->i_flags |= S_SWAPFILE;
        error = inode_drain_writes(inode);
        if (error) {
                inode->i_flags &= ~S_SWAPFILE;
                goto free_swap_zswap;
        }

        mutex_lock(&swapon_mutex);
        /* Default priority is -1 unless SWAP_FLAG_PREFER carries one */
        prio = -1;
        if (swap_flags & SWAP_FLAG_PREFER)
                prio =
                  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
        enable_swap_info(p, prio, swap_map, cluster_info);

        pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s\n",
                K(p->pages), name->name, p->prio, nr_extents,
                K((unsigned long long)span),
                (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
                (p->flags & SWP_DISCARDABLE) ? "D" : "",
                (p->flags & SWP_AREA_DISCARD) ? "s" : "",
                (p->flags & SWP_PAGE_DISCARD) ? "c" : "");

        mutex_unlock(&swapon_mutex);
        atomic_inc(&proc_poll_event);
        wake_up_interruptible(&proc_poll_wait);

        error = 0;
        goto out;
        /*
         * Error unwinding: each label undoes one more setup step and then
         * falls through to the next.
         */
free_swap_zswap:
        zswap_swapoff(p->type);
free_swap_address_space:
        exit_swap_address_space(p->type);
bad_swap_unlock_inode:
        inode_unlock(inode);
bad_swap:
        free_percpu(p->percpu_cluster);
        p->percpu_cluster = NULL;
        free_percpu(p->cluster_next_cpu);
        p->cluster_next_cpu = NULL;
        /* Already unlocked (or never locked): keep 'out' from unlocking it */
        inode = NULL;
        destroy_swap_extents(p);
        swap_cgroup_swapoff(p->type);
        spin_lock(&swap_lock);
        p->swap_file = NULL;
        p->flags = 0;
        spin_unlock(&swap_lock);
        vfree(swap_map);
        kvfree(cluster_info);
        if (inced_nr_rotate_swap)
                atomic_dec(&nr_rotate_swap);
        if (swap_file)
                filp_close(swap_file, NULL);
out:
        /* Common exit for success and failure */
        if (page && !IS_ERR(page)) {
                kunmap(page);
                put_page(page);
        }
        if (name)
                putname(name);
        if (inode)
                inode_unlock(inode);
        if (!error)
                enable_swap_slots_cache();
        return error;
}

/*
 * si_swapinfo - fill in the swap totals of a sysinfo structure
 * @val: structure whose freeswap and totalswap fields are written
 *
 * Under swap_lock, sums inuse_pages over every swap area that has SWP_USED
 * set but SWP_WRITEOK clear, and adds that sum to both the global free
 * page counter and the global total.
 */
void si_swapinfo(struct sysinfo *val)
{
        unsigned long pending_unused = 0;
        unsigned int idx;

        spin_lock(&swap_lock);
        for (idx = 0; idx < nr_swapfiles; idx++) {
                struct swap_info_struct *area = swap_info[idx];

                /* Only areas that are in use but no longer writable count */
                if (!(area->flags & SWP_USED))
                        continue;
                if (area->flags & SWP_WRITEOK)
                        continue;

                pending_unused += READ_ONCE(area->inuse_pages);
        }
        val->freeswap = atomic_long_read(&nr_swap_pages) + pending_unused;
        val->totalswap = total_swap_pages + pending_unused;
        spin_unlock(&swap_lock);
}

/*
 * __swap_duplicate - take one more reference on a swap entry
 * @entry: swap entry whose swap_map slot is updated
 * @usage: SWAP_HAS_CACHE to claim the swap-cache bit, otherwise the amount
 *         added to the map count
 *
 * Runs with the cluster (or swap_info) lock held around the swap_map slot.
 *
 * Returns:
 *   0        success, the swap_map slot has been updated
 *   -ENOENT  the slot is SWAP_MAP_BAD, or the entry is not in use
 *   -EEXIST  SWAP_HAS_CACHE was requested but is already set
 *   -EINVAL  the stored count is already above SWAP_MAP_MAX
 *   -ENOMEM  the count is at SWAP_MAP_MAX and swap_count_continued() could
 *            not carry into a continuation page
 */
static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
        struct swap_info_struct *si = swp_swap_info(entry);
        unsigned long offset = swp_offset(entry);
        struct swap_cluster_info *ci;
        unsigned char count;
        unsigned char has_cache;
        int err = 0;

        ci = lock_cluster_or_swap_info(si, offset);
        count = si->swap_map[offset];

        /*
         * swapin_readahead() does not validate entries, so a SWAP_MAP_BAD
         * slot may be seen here; it has to be checked under the lock.
         */
        if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
                err = -ENOENT;
                goto unlock_out;
        }

        /* Split the slot into its cache bit and its count */
        has_cache = count & SWAP_HAS_CACHE;
        count &= ~SWAP_HAS_CACHE;

        if (usage == SWAP_HAS_CACHE) {
                if (has_cache)
                        err = -EEXIST;          /* someone else added cache */
                else if (!count)
                        err = -ENOENT;          /* no users remaining */
                else
                        has_cache = SWAP_HAS_CACHE;
        } else if (!count && !has_cache) {
                err = -ENOENT;                  /* unused swap entry */
        } else {
                unsigned char low = count & ~COUNT_CONTINUED;

                if (low < SWAP_MAP_MAX)
                        count += usage;
                else if (low > SWAP_MAP_MAX)
                        err = -EINVAL;
                else if (swap_count_continued(si, offset, count))
                        count = COUNT_CONTINUED;
                else
                        err = -ENOMEM;
        }

        if (err == 0)
                WRITE_ONCE(si->swap_map[offset], count | has_cache);

unlock_out:
        unlock_cluster_or_swap_info(si, ci);
        return err;
}

/*
 * swap_shmem_alloc - record that a swap entry belongs to shmem/tmpfs
 * @entry: the swap entry
 *
 * Helps swapoff; for such entries the reference count is never
 * incremented. The result of __swap_duplicate() is deliberately ignored.
 */
void swap_shmem_alloc(swp_entry_t entry)
{
        (void)__swap_duplicate(entry, SWAP_MAP_SHMEM);
}

/*
 * swap_duplicate - raise the reference count of a swap entry by one
 * @entry: the swap entry
 *
 * Whenever __swap_duplicate() reports -ENOMEM, a continuation page is
 * requested with GFP_ATOMIC and the duplicate is retried.
 *
 * Returns 0 on success, or the error from add_swap_count_continuation()
 * when the continuation could not be allocated atomically. Any other
 * failure of __swap_duplicate() (-EINVAL or -ENOENT, which might occur if
 * a page table entry has got corrupted) is also reported as 0.
 */
int swap_duplicate(swp_entry_t entry)
{
        int err;

        for (;;) {
                if (__swap_duplicate(entry, 1) != -ENOMEM)
                        return 0;

                err = add_swap_count_continuation(entry, GFP_ATOMIC);
                if (err)
                        return err;
        }
}

/*
 * swapcache_prepare - claim the swap-cache bit of an existing swap entry
 * @entry: swap entry for which swap cache is being allocated
 *
 * Returns 0 on success or an error code; -EEXIST means the entry already
 * has a swap cache. Note that, unlike swap_duplicate(), the error code of
 * __swap_duplicate() is passed through unchanged.
 */
int swapcache_prepare(swp_entry_t entry)
{
        int ret = __swap_duplicate(entry, SWAP_HAS_CACHE);

        return ret;
}

/*
 * swapcache_clear - drop the SWAP_HAS_CACHE reference of a swap entry
 * @si:    swap area the entry belongs to
 * @entry: the swap entry
 *
 * The slot is updated under the cluster (or swap_info) lock. When
 * __swap_entry_free_locked() reports zero remaining usage, the slot is
 * handed to free_swap_slot() after the lock has been released.
 */
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
{
        unsigned long offset = swp_offset(entry);
        struct swap_cluster_info *ci;
        unsigned char remaining;

        ci = lock_cluster_or_swap_info(si, offset);
        remaining = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE);
        unlock_cluster_or_swap_info(si, ci);

        if (remaining)
                return;

        free_swap_slot(entry);
}

/*
 * swp_swap_info - look up the swap_info_struct for a swap entry
 * @entry: the swap entry
 *
 * Returns whatever swap_type_to_swap_info() yields for the entry's type.
 */
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
        struct swap_info_struct *si = swap_type_to_swap_info(swp_type(entry));

        return si;
}

/*
 * out-of-line methods to avoid include hell.
 */
struct address_space *swapcache_mapping(struct folio *folio)
{
        return swp_swap_info(folio->swap)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(swapcache_mapping);

/*
 * __page_file_index - swap offset of the swap entry associated with a page
 * @page: page whose swap entry is obtained through page_swap_entry()
 */
pgoff_t __page_file_index(struct page *page)
{
        return swp_offset(page_swap_entry(page));
}
EXPORT_SYMBOL_GPL(__page_file_index);

/*
 * add_swap_count_continuation - called when a swap count is duplicated
 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
 * page of the original vmalloc'ed swap_map, to hold the continuation count
 * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
 *
 * These continuation pages are seldom referenced: the common paths all work
 * on the original swap_map, only referring to a continuation page when the
 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
 *
 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
 * can be called after dropping locks.
 *
 * @entry:    swap entry whose count needs a continuation
 * @gfp_mask: allocation flags for the new page (__GFP_HIGHMEM is added)
 *
 * Returns 0 when a page was attached or when none turned out to be needed
 * (device gone, count no longer at SWAP_MAP_MAX, or an existing
 * continuation page can be used); -ENOMEM when a page was needed but the
 * allocation had failed.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
        struct swap_info_struct *si;
        struct swap_cluster_info *ci;
        struct page *head;
        struct page *page;
        struct page *list_page;
        pgoff_t offset;
        unsigned char count;
        int ret = 0;

        /*
         * When debugging, it's easier to use __GFP_ZERO here; but it's better
         * for latency not to zero a page while GFP_ATOMIC and holding locks.
         */
        /* Allocated before any lock is taken; a failure is handled below */
        page = alloc_page(gfp_mask | __GFP_HIGHMEM);

        si = get_swap_device(entry);
        if (!si) {
                /*
                 * An acceptable race has occurred since the failing
                 * __swap_duplicate(): the swap device may be swapoff
                 */
                goto outer;
        }
        spin_lock(&si->lock);

        offset = swp_offset(entry);

        ci = lock_cluster(si, offset);

        count = swap_count(si->swap_map[offset]);

        if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
                /*
                 * The higher the swap count, the more likely it is that tasks
                 * will race to add swap count continuation: we need to avoid
                 * over-provisioning.
                 */
                goto out;
        }

        /* Only now, when a page is really required, does failure matter */
        if (!page) {
                ret = -ENOMEM;
                goto out;
        }

        head = vmalloc_to_page(si->swap_map + offset);
        /* From here on, offset indexes into a continuation page */
        offset &= ~PAGE_MASK;

        spin_lock(&si->cont_lock);
        /*
         * Page allocation does not initialize the page's lru field,
         * but it does always reset its private field.
         */
        if (!page_private(head)) {
                BUG_ON(count & COUNT_CONTINUED);
                INIT_LIST_HEAD(&head->lru);
                set_page_private(head, SWP_CONTINUED);
                si->flags |= SWP_CONTINUED;
        }

        /* Walk the existing continuation pages looking for spare room */
        list_for_each_entry(list_page, &head->lru, lru) {
                unsigned char *map;

                /*
                 * If the previous map said no continuation, but we've found
                 * a continuation page, free our allocation and use this one.
                 */
                if (!(count & COUNT_CONTINUED))
                        goto out_unlock_cont;

                map = kmap_local_page(list_page) + offset;
                count = *map;
                kunmap_local(map);

                /*
                 * If this continuation count now has some space in it,
                 * free our allocation and use this one.
                 */
                if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
                        goto out_unlock_cont;
        }

        /* Every existing continuation page is full: append the new one */
        list_add_tail(&page->lru, &head->lru);
        page = NULL;                        /* now it's attached, don't free it */
out_unlock_cont:
        spin_unlock(&si->cont_lock);
out:
        unlock_cluster(ci);
        spin_unlock(&si->lock);
        put_swap_device(si);
outer:
        /* Still non-NULL means the page was not attached: give it back */
        if (page)
                __free_page(page);
        return ret;
}

/*
 * swap_count_continued - when the original swap_map count is incremented
 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
 * into, carry if so, or else fail until a new continuation page is allocated;
 * when the original swap_map count is decremented from 0 with continuation,
 * borrow from the continuation and report whether it still holds more.
 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
 * lock.
 *
 * @si:     swap area owning the swap_map
 * @offset: offset of the entry within the swap area
 * @count:  current swap_map count of the entry: SWAP_MAP_MAX (with or
 *          without COUNT_CONTINUED) when incrementing, COUNT_CONTINUED when
 *          decrementing
 *
 * Returns, when incrementing, true if the carry was performed and false if
 * a (further) continuation page must be added first; when decrementing,
 * true if the continuation pages still hold a non-zero count.
 */
static bool swap_count_continued(struct swap_info_struct *si,
                                 pgoff_t offset, unsigned char count)
{
        struct page *head;
        struct page *page;
        unsigned char *map;
        bool ret;

        head = vmalloc_to_page(si->swap_map + offset);
        if (page_private(head) != SWP_CONTINUED) {
                BUG_ON(count & COUNT_CONTINUED);
                return false;                /* need to add count continuation */
        }

        spin_lock(&si->cont_lock);
        /* From here on, offset indexes into a continuation page */
        offset &= ~PAGE_MASK;
        page = list_next_entry(head, lru);
        map = kmap_local_page(page) + offset;

        if (count == SWAP_MAP_MAX)        /* initial increment from swap_map */
                goto init_map;                /* jump over SWAP_CONT_MAX checks */

        if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
                /*
                 * Think of how you add 1 to 999
                 */
                /* Skip over digits that are full and continue further */
                while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
                        kunmap_local(map);
                        page = list_next_entry(page, lru);
                        BUG_ON(page == head);
                        map = kmap_local_page(page) + offset;
                }
                /* Last digit in use is full too: a fresh page is needed */
                if (*map == SWAP_CONT_MAX) {
                        kunmap_local(map);
                        page = list_next_entry(page, lru);
                        if (page == head) {
                                ret = false;        /* add count continuation */
                                goto out;
                        }
                        map = kmap_local_page(page) + offset;
init_map:                *map = 0;                /* we didn't zero the page */
                }
                *map += 1;
                kunmap_local(map);
                /* Reset every lower digit on the way back towards head */
                while ((page = list_prev_entry(page, lru)) != head) {
                        map = kmap_local_page(page) + offset;
                        *map = COUNT_CONTINUED;
                        kunmap_local(map);
                }
                ret = true;                        /* incremented */

        } else {                                /* decrementing */
                /*
                 * Think of how you subtract 1 from 1000
                 */
                BUG_ON(count != COUNT_CONTINUED);
                /* Skip over digits that are zero and continue further */
                while (*map == COUNT_CONTINUED) {
                        kunmap_local(map);
                        page = list_next_entry(page, lru);
                        BUG_ON(page == head);
                        map = kmap_local_page(page) + offset;
                }
                BUG_ON(*map == 0);
                *map -= 1;
                /* The borrowed-from digit emptied: lower digits lose the flag */
                if (*map == 0)
                        count = 0;
                kunmap_local(map);
                /* Refill every lower digit on the way back towards head */
                while ((page = list_prev_entry(page, lru)) != head) {
                        map = kmap_local_page(page) + offset;
                        *map = SWAP_CONT_MAX | count;
                        count = COUNT_CONTINUED;
                        kunmap_local(map);
                }
                ret = count == COUNT_CONTINUED;
        }
out:
        spin_unlock(&si->cont_lock);
        return ret;
}

/*
 * free_swap_count_continuations - release all continuation pages at swapoff
 * @si: swap area whose swap_map is about to be vfree'd
 *
 * Called after swap_map is quiesced. For each page of the swap_map whose
 * private field is set, every continuation page linked on its lru list is
 * unlinked and freed.
 */
static void free_swap_count_continuations(struct swap_info_struct *si)
{
        pgoff_t offset = 0;

        while (offset < si->max) {
                struct page *head = vmalloc_to_page(si->swap_map + offset);
                struct page *cont, *tmp;

                offset += PAGE_SIZE;

                /* No continuation was ever attached to this map page */
                if (!page_private(head))
                        continue;

                list_for_each_entry_safe(cont, tmp, &head->lru, lru) {
                        list_del(&cont->lru);
                        __free_page(cont);
                }
        }
}

#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
/*
 * __folio_throttle_swaprate - schedule a blkcg throttle for swap activity
 * @folio: folio whose node selects the list of available swap areas
 * @gfp:   allocation context; nothing is done without __GFP_IO
 *
 * Does nothing unless swap is usable, the block cgroup is congested and
 * the current task has no throttle pending. Otherwise a throttle is
 * scheduled on the disk of the first swap area on the node's list that
 * has a block device.
 */
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
        struct swap_info_struct *area, *tmp;
        int nid = folio_nid(folio);

        if (!(gfp & __GFP_IO) || !__has_usable_swap() ||
            !blk_cgroup_congested())
                return;

        /*
         * A throttle is already scheduled for this task, so there is no
         * need to take the global swap lock.
         */
        if (current->throttle_disk)
                return;

        spin_lock(&swap_avail_lock);
        plist_for_each_entry_safe(area, tmp, &swap_avail_heads[nid],
                                  avail_lists[nid]) {
                if (!area->bdev)
                        continue;

                blkcg_schedule_throttle(area->bdev->bd_disk, true);
                break;
        }
        spin_unlock(&swap_avail_lock);
}
#endif

/*
 * swapfile_init - subsys initcall for the swap-file layer
 *
 * Allocates and initialises one plist head per node id, records the
 * architecture's maximum swap file size and, with CONFIG_MIGRATION, sets
 * swap_migration_ad_supported when that size is at least
 * 1UL << SWP_MIG_TOTAL_BITS.
 *
 * Returns 0 on success, -ENOMEM if the heads cannot be allocated (swapon()
 * then refuses to run because swap_avail_heads stays NULL).
 */
static int __init swapfile_init(void)
{
        int node;

        swap_avail_heads = kmalloc_array(nr_node_ids,
                                         sizeof(struct plist_head),
                                         GFP_KERNEL);
        if (swap_avail_heads == NULL) {
                pr_emerg("Not enough memory for swap heads, swap is disabled\n");
                return -ENOMEM;
        }

        for_each_node(node) {
                plist_head_init(&swap_avail_heads[node]);
        }

        swapfile_maximum_size = arch_max_swapfile_size();

#ifdef CONFIG_MIGRATION
        if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
                swap_migration_ad_supported = true;
#endif        /* CONFIG_MIGRATION */

        return 0;
}
subsys_initcall(swapfile_init);































































































































    3 
























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * connection tracking helpers.
 *
 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
 *        - generalize L3 protocol dependent part.
 *
 * Derived from include/linux/netfilter_ipv4/ip_conntrack_helper.h
 */

#ifndef _NF_CONNTRACK_HELPER_H
#define _NF_CONNTRACK_HELPER_H
#include <linux/refcount.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_expect.h>

/*
 * NAT helper module naming. NF_NAT_HELPER_NAME() prepends "ip_nat_" to
 * @name by string-literal concatenation, so @name must itself be a string
 * literal. MODULE_ALIAS_NF_NAT_HELPER() declares that prefixed name as a
 * module alias.
 */
#define NF_NAT_HELPER_PREFIX                "ip_nat_"
#define NF_NAT_HELPER_NAME(name)        NF_NAT_HELPER_PREFIX name
#define MODULE_ALIAS_NF_NAT_HELPER(name) \
        MODULE_ALIAS(NF_NAT_HELPER_NAME(name))

struct module;

/*
 * Single-bit helper flags.
 * NOTE(review): presumably OR-ed into nf_conntrack_helper.flags — confirm
 * against the users of that field.
 */
enum nf_ct_helper_flags {
        NF_CT_HELPER_F_USERSPACE        = (1 << 0),
        NF_CT_HELPER_F_CONFIGURED        = (1 << 1),
};

#define NF_CT_HELPER_NAME_LEN        16

/*
 * Description of one connection tracking helper: its identity (name,
 * owning module, tuple), the callbacks invoked for tracked connections,
 * and the extra fields used by user-space helpers.
 */
struct nf_conntrack_helper {
        struct hlist_node hnode;        /* Internal use. */

        char name[NF_CT_HELPER_NAME_LEN]; /* name of the module */
        refcount_t refcnt;
        struct module *me;                /* pointer to self */
        const struct nf_conntrack_expect_policy *expect_policy;

        /* Tuple of things we will help (compared against server response) */
        struct nf_conntrack_tuple tuple;

        /* Function to call when data passes; return verdict, or -1 to
           invalidate. */
        int (*help)(struct sk_buff *skb,
                    unsigned int protoff,
                    struct nf_conn *ct,
                    enum ip_conntrack_info conntrackinfo);

        /* NOTE(review): presumably called when @ct is destroyed — confirm */
        void (*destroy)(struct nf_conn *ct);

        /*
         * NOTE(review): presumably (de)serialise the helper's private data
         * from/to netlink attributes — confirm against the callers.
         */
        int (*from_nlattr)(struct nlattr *attr, struct nf_conn *ct);
        int (*to_nlattr)(struct sk_buff *skb, const struct nf_conn *ct);
        unsigned int expect_class_max;

        /* NOTE(review): presumably enum nf_ct_helper_flags bits — confirm */
        unsigned int flags;

        /* For user-space helpers: */
        unsigned int queue_num;
        /* length of userspace private data stored in nf_conn_help->data */
        u16 data_len;
        /* name of NAT helper module */
        char nat_mod_name[NF_CT_HELPER_NAME_LEN];
};

/* Must be kept in sync with the classes defined by helpers */
#define NF_CT_MAX_EXPECT_CLASSES        4

/*
 * nf_conn feature for connections that have a helper.
 *
 * This is the structure returned by nfct_help(); its data[] member is what
 * nfct_help_data() hands out.
 */
struct nf_conn_help {
        /* Helper. if any */
        struct nf_conntrack_helper __rcu *helper;

        struct hlist_head expectations;

        /* Current number of expected connections */
        u8 expecting[NF_CT_MAX_EXPECT_CLASSES];

        /*
         * private helper information; 32 bytes, 8-byte aligned. Helpers can
         * check that their private structure fits with
         * NF_CT_HELPER_BUILD_BUG_ON().
         */
        char data[32] __aligned(8);
};

/*
 * Break the build if @structsize (a size in bytes) is larger than the
 * data[] member of struct nf_conn_help.
 */
#define NF_CT_HELPER_BUILD_BUG_ON(structsize) \
        BUILD_BUG_ON((structsize) > sizeof_field(struct nf_conn_help, data))

/*
 * Helper lookup and reference handling.  Only the declarations are
 * visible here; locking and refcount rules live with the definitions.
 */
struct nf_conntrack_helper *__nf_conntrack_helper_find(const char *name,
                                                       u16 l3num, u8 protonum);

struct nf_conntrack_helper *nf_conntrack_helper_try_module_get(const char *name,
                                                               u16 l3num,
                                                               u8 protonum);
void nf_conntrack_helper_put(struct nf_conntrack_helper *helper);

/*
 * Fill in @helper from the given parameters.
 * @help:        callback taking the skb, protocol offset, conntrack and ctinfo
 * @from_nlattr: callback taking a netlink attribute and the conntrack
 * NOTE(review): @protonum is u16 here but u8 in the lookup functions above.
 */
void nf_ct_helper_init(struct nf_conntrack_helper *helper,
                       u16 l3num, u16 protonum, const char *name,
                       u16 default_port, u16 spec_port, u32 id,
                       const struct nf_conntrack_expect_policy *exp_pol,
                       u32 expect_class_max,
                       int (*help)(struct sk_buff *skb, unsigned int protoff,
                                   struct nf_conn *ct,
                                   enum ip_conntrack_info ctinfo),
                       int (*from_nlattr)(struct nlattr *attr,
                                          struct nf_conn *ct),
                       struct module *module);

/* Register/unregister a single helper. */
int nf_conntrack_helper_register(struct nf_conntrack_helper *);
void nf_conntrack_helper_unregister(struct nf_conntrack_helper *);

/* Array variants; the unsigned int is presumably the element count. */
int nf_conntrack_helpers_register(struct nf_conntrack_helper *, unsigned int);
void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *,
                                     unsigned int);

/* Add the helper extension area to @ct, allocating with @gfp. */
struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp);

int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
                              gfp_t flags);

int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct,
                 enum ip_conntrack_info ctinfo, u16 proto);
/* @hp is an out-parameter (pointer to a helper pointer). */
int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family,
                     u8 proto, bool nat, struct nf_conntrack_helper **hp);

void nf_ct_helper_destroy(struct nf_conn *ct);

/*
 * nfct_help - fetch the helper extension of a conntrack entry
 * @ct: conntrack entry to query
 *
 * Returns whatever nf_ct_ext_find() yields for NF_CT_EXT_HELPER.
 */
static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct)
{
        struct nf_conn_help *ext = nf_ct_ext_find(ct, NF_CT_EXT_HELPER);

        return ext;
}

/*
 * nfct_help_data - pointer to the helper's private data area
 * @ct: conntrack entry to query
 *
 * Returns the address of the data[] member of the NF_CT_EXT_HELPER
 * extension.  The lookup result is not checked before ->data is taken,
 * so the caller must know the extension is present.
 */
static inline void *nfct_help_data(const struct nf_conn *ct)
{
        struct nf_conn_help *ext = nf_ct_ext_find(ct, NF_CT_EXT_HELPER);

        return (void *)ext->data;
}

/* Subsystem setup/teardown entry points (defined elsewhere). */
int nf_conntrack_helper_init(void);
void nf_conntrack_helper_fini(void);

/* Broadcast helper entry point; the unit of @timeout is not visible here. */
int nf_conntrack_broadcast_help(struct sk_buff *skb, struct nf_conn *ct,
                                enum ip_conntrack_info ctinfo,
                                unsigned int timeout);

/*
 * A named expectation callback.  Entries are linked through @head and can
 * be looked up by name or by symbol (see the *_find_by_* declarations).
 */
struct nf_ct_helper_expectfn {
        /* List linkage. */
        struct list_head head;
        /* Name of the callback. */
        const char *name;
        /* The callback itself: receives the conntrack and the expectation. */
        void (*expectfn)(struct nf_conn *ct, struct nf_conntrack_expect *exp);
};

/* printf-style logging: format string is argument 3, varargs start at 4. */
__printf(3,4)
void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct,
                      const char *fmt, ...);

/* Registration and lookup of struct nf_ct_helper_expectfn entries. */
void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n);
void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n);
struct nf_ct_helper_expectfn *
nf_ct_helper_expectfn_find_by_name(const char *name);
struct nf_ct_helper_expectfn *
nf_ct_helper_expectfn_find_by_symbol(const void *symbol);

/* Helper hash table and, presumably, its bucket count; defined elsewhere. */
extern struct hlist_head *nf_ct_helper_hash;
extern unsigned int nf_ct_helper_hsize;

/* Descriptor for a NAT helper module, kept on a list. */
struct nf_conntrack_nat_helper {
        /* List linkage. */
        struct list_head list;
        char mod_name[NF_CT_HELPER_NAME_LEN];        /* module name */
        struct module *module;                        /* pointer to self */
};

/*
 * Static initializer for struct nf_conntrack_nat_helper: sets .mod_name
 * to NF_NAT_HELPER_NAME(name) and .module to THIS_MODULE.  .list is left
 * to the default (zero) initialization.
 */
#define NF_CT_NAT_HELPER_INIT(name) \
        { \
        .mod_name = NF_NAT_HELPER_NAME(name), \
        .module = THIS_MODULE \
        }

/* Registration of NAT helper descriptors (defined elsewhere). */
void nf_nat_helper_register(struct nf_conntrack_nat_helper *nat);
void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat);
/* Module reference handling for the NAT helper matching name/l3num/protonum. */
int nf_nat_helper_try_module_get(const char *name, u16 l3num,
                                 u8 protonum);
/* NOTE(review): takes a conntrack helper, not a nat helper descriptor. */
void nf_nat_helper_put(struct nf_conntrack_helper *helper);
#endif /*_NF_CONNTRACK_HELPER_H*/










































    1 









    1 








    1 





































    1 
    1 









    1 
    1 



    1 
































    2 

    1 


    1 



    1 




    1 



























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Pid namespaces
 *
 * Authors:
 *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
 *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
 *     Many thanks to Oleg Nesterov for comments and help
 *
 */

#include <linux/pid.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/syscalls.h>
#include <linux/cred.h>
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
#include <linux/proc_ns.h>
#include <linux/reboot.h>
#include <linux/export.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/idr.h>
#include <uapi/linux/wait.h>
#include "pid_sysctl.h"

/* Held while a missing entry of pid_cache[] is created. */
static DEFINE_MUTEX(pid_caches_mutex);
/* Slab cache that struct pid_namespace objects are allocated from. */
static struct kmem_cache *pid_ns_cachep;
/* Write once array, filled from the beginning. */
static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL];

/*
 * create_pid_cachep - get (creating on first use) the struct pid cache
 * for a namespace nesting level.
 * @level: pid namespace level; slot @level - 1 of pid_cache[] is used
 *
 * Returns the cache, or NULL if it does not exist and could not be
 * created by anybody so far.
 */

static struct kmem_cache *create_pid_cachep(unsigned int level)
{
        /* Level 0 is init_pid_ns.pid_cachep */
        struct kmem_cache **slot = &pid_cache[level - 1];
        struct kmem_cache *cached = READ_ONCE(*slot);
        char cache_name[4 + 10 + 1];
        unsigned int obj_size;

        /* Lockless fast path: the slot has already been filled. */
        if (cached)
                return cached;

        snprintf(cache_name, sizeof(cache_name), "pid_%u", level + 1);
        obj_size = struct_size_t(struct pid, numbers, level + 1);

        /* Name collision forces to do allocation under mutex. */
        mutex_lock(&pid_caches_mutex);
        if (*slot == NULL)
                *slot = kmem_cache_create(cache_name, obj_size, 0,
                                          SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
                                          NULL);
        mutex_unlock(&pid_caches_mutex);

        /* current can fail, but someone else can succeed. */
        return READ_ONCE(*slot);
}

/*
 * Charge one UCOUNT_PID_NAMESPACES unit in @ns to the caller's effective
 * uid.  Returns what inc_ucount() returns; create_pid_namespace() treats
 * NULL as failure.
 */
static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
{
        struct ucounts *charged;

        charged = inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
        return charged;
}

/* Give back one UCOUNT_PID_NAMESPACES unit taken by inc_pid_namespaces(). */
static void dec_pid_namespaces(struct ucounts *ucounts)
{
        dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
}

/*
 * create_pid_namespace - allocate and set up a child pid namespace
 * @user_ns:       user namespace that will own the new pid namespace
 * @parent_pid_ns: namespace the new one is nested in
 *
 * Returns the new namespace with its refcount set to 1, holding a
 * reference on both @parent_pid_ns and @user_ns, or an ERR_PTR():
 *   -EINVAL  the in_userns(parent_pid_ns->user_ns, user_ns) check fails
 *   -ENOSPC  nesting deeper than MAX_PID_NS_LEVEL, or the ucount charge
 *            was refused
 *   -ENOMEM  the namespace or its pid cache could not be allocated
 *   other    error returned by ns_alloc_inum()
 */
static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
        struct pid_namespace *parent_pid_ns)
{
        struct pid_namespace *ns;
        unsigned int level = parent_pid_ns->level + 1;
        struct ucounts *ucounts;
        int err;

        err = -EINVAL;
        if (!in_userns(parent_pid_ns->user_ns, user_ns))
                goto out;

        /* err stays -ENOSPC for both the depth check and the ucount charge. */
        err = -ENOSPC;
        if (level > MAX_PID_NS_LEVEL)
                goto out;
        ucounts = inc_pid_namespaces(user_ns);
        if (!ucounts)
                goto out;

        /* err stays -ENOMEM for the allocation and the pid cache lookup. */
        err = -ENOMEM;
        ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
        if (ns == NULL)
                goto out_dec;

        idr_init(&ns->idr);

        ns->pid_cachep = create_pid_cachep(level);
        if (ns->pid_cachep == NULL)
                goto out_free_idr;

        err = ns_alloc_inum(&ns->ns);
        if (err)
                goto out_free_idr;
        ns->ns.ops = &pidns_operations;

        /* Nothing below can fail: take the references and publish fields. */
        refcount_set(&ns->ns.count, 1);
        ns->level = level;
        ns->parent = get_pid_ns(parent_pid_ns);
        ns->user_ns = get_user_ns(user_ns);
        ns->ucounts = ucounts;
        ns->pid_allocated = PIDNS_ADDING;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
        ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
#endif
        return ns;

        /* Unwind in reverse order of acquisition. */
out_free_idr:
        idr_destroy(&ns->idr);
        kmem_cache_free(pid_ns_cachep, ns);
out_dec:
        dec_pid_namespaces(ucounts);
out:
        return ERR_PTR(err);
}

/*
 * RCU callback queued by destroy_pid_namespace(): releases the ucount
 * charge and the user namespace reference, then frees the namespace.
 * @p: the rcu_head embedded in the pid_namespace being freed
 */
static void delayed_free_pidns(struct rcu_head *p)
{
        struct pid_namespace *dead;

        dead = container_of(p, struct pid_namespace, rcu);

        dec_pid_namespaces(dead->ucounts);
        put_user_ns(dead->user_ns);

        kmem_cache_free(pid_ns_cachep, dead);
}

/*
 * Tear down @ns: release its inode number and idr now, and hand the
 * structure to call_rcu() so that delayed_free_pidns() frees it later.
 * Called from put_pid_ns() when the reference count drops to zero.
 */
static void destroy_pid_namespace(struct pid_namespace *ns)
{
        ns_free_inum(&ns->ns);

        idr_destroy(&ns->idr);
        call_rcu(&ns->rcu, delayed_free_pidns);
}

/*
 * copy_pid_ns - pick the pid namespace for a new task / unshare
 * @flags:   clone flags; only CLONE_NEWPID is examined
 * @user_ns: owner for a newly created namespace
 * @old_ns:  namespace to share or to use as the parent
 *
 * Without CLONE_NEWPID a new reference on @old_ns is returned.  With it,
 * a child of @old_ns is created, provided @old_ns is the caller's active
 * pid namespace; otherwise ERR_PTR(-EINVAL) is returned.
 */
struct pid_namespace *copy_pid_ns(unsigned long flags,
        struct user_namespace *user_ns, struct pid_namespace *old_ns)
{
        if (flags & CLONE_NEWPID) {
                if (task_active_pid_ns(current) != old_ns)
                        return ERR_PTR(-EINVAL);

                return create_pid_namespace(user_ns, old_ns);
        }

        return get_pid_ns(old_ns);
}

/*
 * put_pid_ns - drop a reference on a pid namespace
 * @ns: namespace to release
 *
 * When the count reaches zero the namespace is destroyed and the
 * reference it held on its parent is dropped in turn, walking upwards
 * until a namespace survives or init_pid_ns is reached.
 */
void put_pid_ns(struct pid_namespace *ns)
{
        while (ns != &init_pid_ns) {
                /* Read the parent first: @ns may go away once we drop it. */
                struct pid_namespace *up = ns->parent;

                if (!refcount_dec_and_test(&ns->ns.count))
                        return;

                destroy_pid_namespace(ns);
                ns = up;
        }
}
EXPORT_SYMBOL_GPL(put_pid_ns);

/*
 * zap_pid_ns_processes - kill and wait for every other task in a pid namespace
 * @pid_ns: the namespace being shut down; the caller (current) runs this
 *          as part of its own exit
 *
 * Steps, in order: stop new pid allocation, make current ignore SIGCHLD,
 * SIGKILL every pid in the idr starting at nr 2, reap children with
 * kernel_wait4() until -ECHILD, then sleep until pid_allocated drops to
 * the expected remainder.  Finally propagate a pending reboot code and
 * call acct_exit_ns().
 */
void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
        int nr;
        int rc;
        struct task_struct *task, *me = current;
        /* Value of pid_allocated at which the final wait loop stops. */
        int init_pids = thread_group_leader(me) ? 1 : 2;
        struct pid *pid;

        /* Don't allow any more processes into the pid namespace */
        disable_pid_allocation(pid_ns);

        /*
         * Ignore SIGCHLD causing any terminated children to autoreap.
         * This speeds up the namespace shutdown, plus see the comment
         * below.
         */
        spin_lock_irq(&me->sighand->siglock);
        me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
        spin_unlock_irq(&me->sighand->siglock);

        /*
         * The last thread in the cgroup-init thread group is terminating.
         * Find remaining pids in the namespace, signal and wait for them
         * to exit.
         *
         * Note:  This signals each thread in the namespace - even those that
         *           belong to the same thread group. To avoid this, we would have
         *           to walk the entire tasklist looking for processes in this
         *           namespace, but that could be unnecessarily expensive if the
         *           pid namespace has just a few processes. Or we need to
         *           maintain a tasklist for each pid namespace.
         *
         */
        rcu_read_lock();
        read_lock(&tasklist_lock);
        /* Start the idr walk at id 2. */
        nr = 2;
        idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
                task = pid_task(pid, PIDTYPE_PID);
                /* Skip tasks that already have a fatal signal pending. */
                if (task && !__fatal_signal_pending(task))
                        group_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_MAX);
        }
        read_unlock(&tasklist_lock);
        rcu_read_unlock();

        /*
         * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
         * kernel_wait4() will also block until our children traced from the
         * parent namespace are detached and become EXIT_DEAD.
         */
        do {
                /* Clear pending-signal flags before each wait attempt. */
                clear_thread_flag(TIF_SIGPENDING);
                clear_thread_flag(TIF_NOTIFY_SIGNAL);
                rc = kernel_wait4(-1, NULL, __WALL, NULL);
        } while (rc != -ECHILD);

        /*
         * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE
         * processes whose parent processes are outside of the pid
         * namespace.  Such processes are created with setns()+fork().
         *
         * If those EXIT_ZOMBIE processes are not reaped by their
         * parents before their parents exit, they will be reparented
         * to pid_ns->child_reaper.  Thus pid_ns->child_reaper needs to
         * stay valid until they all go away.
         *
         * The code relies on the pid_ns->child_reaper ignoring
         * SIGCHLD to cause those EXIT_ZOMBIE processes to be
         * autoreaped if reparented.
         *
         * Semantically it is also desirable to wait for EXIT_ZOMBIE
         * processes before allowing the child_reaper to be reaped, as
         * that gives the invariant that when the init process of a
         * pid namespace is reaped all of the processes in the pid
         * namespace are gone.
         *
         * Once all of the other tasks are gone from the pid_namespace
         * free_pid() will awaken this task.
         */
        for (;;) {
                /* Set the sleep state before testing the condition. */
                set_current_state(TASK_INTERRUPTIBLE);
                if (pid_ns->pid_allocated == init_pids)
                        break;
                /*
                 * Release tasks_rcu_exit_srcu to avoid following deadlock:
                 *
                 * 1) TASK A unshare(CLONE_NEWPID)
                 * 2) TASK A fork() twice -> TASK B (child reaper for new ns)
                 *    and TASK C
                 * 3) TASK B exits, kills TASK C, waits for TASK A to reap it
                 * 4) TASK A calls synchronize_rcu_tasks()
                 *                   -> synchronize_srcu(tasks_rcu_exit_srcu)
                 * 5) *DEADLOCK*
                 *
                 * It is considered safe to release tasks_rcu_exit_srcu here
                 * because we assume the current task can not be concurrently
                 * reaped at this point.
                 */
                exit_tasks_rcu_stop();
                schedule();
                exit_tasks_rcu_start();
        }
        __set_current_state(TASK_RUNNING);

        /* A value stored by reboot_pid_ns() becomes the group exit code. */
        if (pid_ns->reboot)
                current->signal->group_exit_code = pid_ns->reboot;

        acct_exit_ns(pid_ns);
        return;
}

#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * sysctl handler for "ns_last_pid".
 *
 * Exposes (idr cursor - 1) of the caller's active pid namespace as an
 * int.  A write requires checkpoint_restore_ns_capable() on that
 * namespace's user_ns (else -EPERM) and, on success, stores the written
 * value + 1 back as the idr cursor.  Range checking is delegated to
 * proc_dointvec_minmax() through a private copy of @table.
 */
static int pid_ns_ctl_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        struct pid_namespace *active_ns = task_active_pid_ns(current);
        struct ctl_table shadow = *table;
        int last_pid;
        int rc;

        if (write && !checkpoint_restore_ns_capable(active_ns->user_ns))
                return -EPERM;

        /* Point the copied table at a local so the idr is never hit directly. */
        last_pid = idr_get_cursor(&active_ns->idr) - 1;
        shadow.data = &last_pid;

        rc = proc_dointvec_minmax(&shadow, write, buffer, lenp, ppos);
        if (rc || !write)
                return rc;

        idr_set_cursor(&active_ns->idr, last_pid + 1);
        return rc;
}

/* Upper bound for ns_last_pid; defined elsewhere. */
extern int pid_max;
/* sysctl table registered under "kernel" by pid_namespaces_init(). */
static struct ctl_table pid_ns_ctl_table[] = {
        {
                .procname = "ns_last_pid",
                .maxlen = sizeof(int),
                .mode = 0666, /* permissions are checked in the handler */
                .proc_handler = pid_ns_ctl_handler,
                /* Accepted range: 0 .. pid_max. */
                .extra1 = SYSCTL_ZERO,
                .extra2 = &pid_max,
        },
};
#endif        /* CONFIG_CHECKPOINT_RESTORE */

/*
 * reboot_pid_ns - handle a reboot request issued inside a pid namespace
 * @pid_ns: namespace the request applies to
 * @cmd:    LINUX_REBOOT_CMD_* value
 *
 * For init_pid_ns this does nothing and returns 0.  Otherwise the command
 * is mapped to a signal number (RESTART/RESTART2 -> SIGHUP,
 * POWER_OFF/HALT -> SIGINT) stored in pid_ns->reboot, the namespace's
 * child reaper is sent SIGKILL, and the caller exits.  Any other command
 * returns -EINVAL without touching the namespace.
 */
int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
        int exit_sig;

        if (pid_ns == &init_pid_ns)
                return 0;

        if (cmd == LINUX_REBOOT_CMD_RESTART2 ||
            cmd == LINUX_REBOOT_CMD_RESTART)
                exit_sig = SIGHUP;
        else if (cmd == LINUX_REBOOT_CMD_POWER_OFF ||
                 cmd == LINUX_REBOOT_CMD_HALT)
                exit_sig = SIGINT;
        else
                return -EINVAL;

        pid_ns->reboot = exit_sig;

        read_lock(&tasklist_lock);
        send_sig(SIGKILL, pid_ns->child_reaper, 1);
        read_unlock(&tasklist_lock);

        do_exit(0);

        /* Not reached */
        return 0;
}

/* Map an embedded ns_common back to the pid_namespace that contains it. */
static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
{
        struct pid_namespace *pid_ns;

        pid_ns = container_of(ns, struct pid_namespace, ns);
        return pid_ns;
}

/*
 * .get for the "pid" namespace file: return the ns_common of @task's
 * active pid namespace with a reference taken, or NULL if the task has
 * none.  The lookup and the reference grab happen under rcu_read_lock().
 */
static struct ns_common *pidns_get(struct task_struct *task)
{
        struct pid_namespace *active;

        rcu_read_lock();
        active = task_active_pid_ns(task);
        if (active)
                get_pid_ns(active);
        rcu_read_unlock();

        if (!active)
                return NULL;

        return &active->ns;
}

/*
 * .get for the "pid_for_children" namespace file.
 *
 * Takes a reference on task->nsproxy->pid_ns_for_children under
 * task_lock().  The namespace is only handed out if it already has a
 * child_reaper (checked under tasklist_lock); otherwise the reference is
 * dropped again and NULL is returned.  NULL is also returned when the
 * task has no nsproxy.
 */
static struct ns_common *pidns_for_children_get(struct task_struct *task)
{
        struct pid_namespace *child_ns = NULL;

        task_lock(task);
        if (task->nsproxy) {
                child_ns = task->nsproxy->pid_ns_for_children;
                get_pid_ns(child_ns);
        }
        task_unlock(task);

        if (!child_ns)
                return NULL;

        read_lock(&tasklist_lock);
        if (!child_ns->child_reaper) {
                put_pid_ns(child_ns);
                child_ns = NULL;
        }
        read_unlock(&tasklist_lock);

        if (!child_ns)
                return NULL;

        return &child_ns->ns;
}

/* .put callback: drop the reference on the pid namespace embedding @ns. */
static void pidns_put(struct ns_common *ns)
{
        put_pid_ns(to_pid_ns(ns));
}

/*
 * .install callback: make @ns the pid namespace for future children of
 * the nsproxy in @nsset.
 *
 * Returns -EPERM unless the caller has CAP_SYS_ADMIN over both the target
 * namespace's user_ns and nsset->cred->user_ns, -EINVAL if the target is
 * neither the caller's active pid namespace nor a descendant of it, and 0
 * on success.
 */
static int pidns_install(struct nsset *nsset, struct ns_common *ns)
{
        struct pid_namespace *target = to_pid_ns(ns);
        struct pid_namespace *active = task_active_pid_ns(current);
        struct nsproxy *proxy = nsset->nsproxy;
        struct pid_namespace *walk;

        if (!(ns_capable(target->user_ns, CAP_SYS_ADMIN) &&
              ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)))
                return -EPERM;

        /*
         * Only allow entering the current active pid namespace
         * or a child of the current active pid namespace.
         *
         * This is required for fork to return a usable pid value and
         * this maintains the property that processes and their
         * children can not escape their current pid namespace.
         */
        if (target->level < active->level)
                return -EINVAL;

        /* Climb from the target up to the caller's level. */
        for (walk = target; walk->level > active->level; walk = walk->parent)
                ;
        if (walk != active)
                return -EINVAL;

        put_pid_ns(proxy->pid_ns_for_children);
        proxy->pid_ns_for_children = get_pid_ns(target);
        return 0;
}

/*
 * .get_parent callback: return the parent of @ns with a reference taken,
 * but only if that parent is the caller's active pid namespace or one of
 * its descendants.  Otherwise (including when @ns has no parent)
 * ERR_PTR(-EPERM) is returned.
 */
static struct ns_common *pidns_get_parent(struct ns_common *ns)
{
        struct pid_namespace *active = task_active_pid_ns(current);
        struct pid_namespace *parent = to_pid_ns(ns)->parent;
        struct pid_namespace *walk;

        /* See if the parent is in the current namespace */
        for (walk = parent; walk; walk = walk->parent) {
                if (walk == active)
                        return &get_pid_ns(parent)->ns;
        }

        return ERR_PTR(-EPERM);
}

/* .owner callback: user namespace recorded in the pid namespace; no reference is taken here. */
static struct user_namespace *pidns_owner(struct ns_common *ns)
{
        return to_pid_ns(ns)->user_ns;
}

/*
 * Operations for the "pid" namespace entry; .get resolves a task's
 * active pid namespace.  Also installed as ns.ops on every namespace
 * built by create_pid_namespace().
 */
const struct proc_ns_operations pidns_operations = {
        .name                = "pid",
        .type                = CLONE_NEWPID,
        .get                = pidns_get,
        .put                = pidns_put,
        .install        = pidns_install,
        .owner                = pidns_owner,
        .get_parent        = pidns_get_parent,
};

/*
 * Operations for the "pid_for_children" entry.  Identical to
 * pidns_operations except for the name, the extra .real_ns_name and the
 * .get callback, which resolves nsproxy->pid_ns_for_children instead.
 */
const struct proc_ns_operations pidns_for_children_operations = {
        .name                = "pid_for_children",
        .real_ns_name        = "pid",
        .type                = CLONE_NEWPID,
        .get                = pidns_for_children_get,
        .put                = pidns_put,
        .install        = pidns_install,
        .owner                = pidns_owner,
        .get_parent        = pidns_get_parent,
};

/*
 * Boot-time setup: create the slab cache for struct pid_namespace
 * (SLAB_PANIC, so no failure path is needed here) and register the
 * sysctl tables.  Always returns 0.
 */
static __init int pid_namespaces_init(void)
{
        pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);

#ifdef CONFIG_CHECKPOINT_RESTORE
        /* kernel.ns_last_pid only exists with checkpoint/restore support. */
        register_sysctl_init("kernel", pid_ns_ctl_table);
#endif

        register_pid_ns_sysctl_table_vm();
        return 0;
}

__initcall(pid_namespaces_init);




































































































































































































































































































































    2 









































    1 


    1 
    1 

    1 
    1 









































    1 











































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ADDRCONF_H
#define _ADDRCONF_H

/* Router solicitation tunables; intervals are multiples of HZ. */
#define MAX_RTR_SOLICITATIONS                -1                /* unlimited */
#define RTR_SOLICITATION_INTERVAL        (4*HZ)
#define RTR_SOLICITATION_MAX_INTERVAL        (3600*HZ)        /* 1 hour */

/* Lifetimes below are plain second counts (not scaled by HZ). */
#define MIN_VALID_LIFETIME                (2*3600)        /* 2 hours */

#define TEMP_VALID_LIFETIME                (7*86400)       /* 1 week */
#define TEMP_PREFERRED_LIFETIME                (86400)         /* 24 hours */
#define REGEN_MIN_ADVANCE                (2)             /* 2 seconds */
#define REGEN_MAX_RETRY                        (3)
#define MAX_DESYNC_FACTOR                (600)

#define ADDR_CHECK_FREQUENCY                (120*HZ)

#define IPV6_MAX_ADDRESSES                16

/* Timer fuzz values, derived from HZ; FUZZ_MINUS never drops below 1. */
#define ADDRCONF_TIMER_FUZZ_MINUS        (HZ > 50 ? HZ / 50 : 1)
#define ADDRCONF_TIMER_FUZZ                (HZ / 4)
#define ADDRCONF_TIMER_FUZZ_MAX                (HZ)

#define ADDRCONF_NOTIFY_PRIORITY        0

#include <linux/in.h>
#include <linux/in6.h>

/*
 * Prefix Information option as laid out in a received packet.  The
 * layout is fixed: the static_assert below pins the size to 32 bytes.
 */
struct prefix_info {
        __u8                        type;
        __u8                        length;
        __u8                        prefix_len;

        /* One flag byte, accessible whole or as individual bits. */
        union __packed {
                __u8                flags;
                struct __packed {
                        /* Bit order depends on the bitfield endianness. */
#if defined(__BIG_ENDIAN_BITFIELD)
                        __u8        onlink : 1,
                                 autoconf : 1,
                                reserved : 6;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
                        __u8        reserved : 6,
                                autoconf : 1,
                                onlink : 1;
#else
#error "Please fix <asm/byteorder.h>"
#endif
                };
        };
        /* Big-endian 32-bit fields. */
        __be32                        valid;
        __be32                        prefered;
        __be32                        reserved2;

        struct in6_addr                prefix;
};

/* rfc4861 4.6.2: IPv6 PIO is 32 bytes in size */
static_assert(sizeof(struct prefix_info) == 32);

#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <net/if_inet6.h>
#include <net/ipv6.h>

/*
 * Bundle of an address, its inet6 device and a netlink extack pointer.
 * Presumably the payload passed to the inet6addr validator notifier
 * chain declared later in this header -- confirm against the callers.
 */
struct in6_validator_info {
        struct in6_addr                i6vi_addr;
        struct inet6_dev        *i6vi_dev;
        struct netlink_ext_ack        *extack;
};

/* Parameter block describing one IPv6 interface address configuration. */
struct ifa6_config {
        /* Address/prefix and its length. */
        const struct in6_addr        *pfx;
        unsigned int                plen;

        u8                        ifa_proto;

        /* Peer address, if any -- presumably NULL when unused. */
        const struct in6_addr        *peer_pfx;

        u32                        rt_priority;
        u32                        ifa_flags;
        /* Preferred and valid lifetimes; units are not visible here. */
        u32                        preferred_lft;
        u32                        valid_lft;
        u16                        scope;
};

/* Subsystem setup/teardown (defined elsewhere). */
int addrconf_init(void);
void addrconf_cleanup(void);

/* Entry points taking a user-space pointer (@arg is __user). */
int addrconf_add_ifaddr(struct net *net, void __user *arg);
int addrconf_del_ifaddr(struct net *net, void __user *arg);
int addrconf_set_dstaddr(struct net *net, void __user *arg);

/* Address checks. */
int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
                  const struct net_device *dev, int strict);
int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
                            const struct net_device *dev, bool skip_dev_check,
                            int strict, u32 banned_flags);

/* Only declared when Mobile IPv6 is built in or as a module. */
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr);
#endif

int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs,
                          unsigned char nsegs);

bool ipv6_chk_custom_prefix(const struct in6_addr *addr,
                                   const unsigned int prefix_len,
                                   struct net_device *dev);

int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev);

/* Lookups by address. */
struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr,
                                 struct net_device *dev);

struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net,
                                     const struct in6_addr *addr,
                                     struct net_device *dev, int strict);

/* @saddr (non-const) is the output of the source-address selection. */
int ipv6_dev_get_saddr(struct net *net, const struct net_device *dev,
                       const struct in6_addr *daddr, unsigned int srcprefs,
                       struct in6_addr *saddr);
/* @addr (non-const) is the output. */
int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
                    u32 banned_flags);
bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
                          bool match_wildcard);
bool inet_rcv_saddr_any(const struct sock *sk);
/* NOTE(review): join takes a net_device, leave takes an inet6_dev. */
void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);

void addrconf_add_linklocal(struct inet6_dev *idev,
                            const struct in6_addr *addr, u32 flags);

/* NOTE(review): the two lifetimes mix __u32 and u32 spellings. */
int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
                                 const struct prefix_info *pinfo,
                                 struct inet6_dev *in6_dev,
                                 const struct in6_addr *addr, int addr_type,
                                 u32 addr_flags, bool sllao, bool tokenized,
                                 __u32 valid_lft, u32 prefered_lft);

/*
 * Expand a 6-byte address into an 8-byte identifier by inserting
 * 0xFF, 0xFE between the first and the last three bytes.
 * @eui:  destination, 8 bytes are written
 * @addr: source, 6 bytes are read
 */
static inline void addrconf_addr_eui48_base(u8 *eui, const char *const addr)
{
        /* bytes 0..2: first half of the address */
        memcpy(&eui[0], &addr[0], 3);

        /* bytes 3..4: fixed filler */
        eui[3] = 0xFF;
        eui[4] = 0xFE;

        /* bytes 5..7: second half of the address */
        memcpy(&eui[5], &addr[3], 3);
}

/*
 * Same as addrconf_addr_eui48_base(), then flip bit 0x02 of the first
 * byte of the result.
 * @eui:  destination, 8 bytes are written
 * @addr: source, 6 bytes are read
 */
static inline void addrconf_addr_eui48(u8 *eui, const char *const addr)
{
        addrconf_addr_eui48_base(eui, addr);
        eui[0] = eui[0] ^ 2;
}

/*
 * Build an 8-byte interface identifier from a device's hardware address.
 * @eui: destination, 8 bytes are written on success
 * @dev: device whose dev_addr / dev_id are used
 *
 * Returns -1 (leaving @eui untouched) when the device address is not
 * ETH_ALEN bytes long, 0 otherwise.
 */
static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
{
        if (dev->addr_len != ETH_ALEN)
                return -1;

        /*
         * The zSeries OSA network cards can be shared among various
         * OS instances, but the OSA cards have only one MAC address.
         * This leads to duplicate address conflicts in conjunction
         * with IPv6 if more than one instance uses the same card.
         *
         * The driver for these cards can deliver a unique 16-bit
         * identifier for each instance sharing the same card.  It is
         * placed instead of 0xFFFE in the interface identifier.  The
         * "u" bit of the interface identifier is not inverted in this
         * case.  Hence the resulting interface identifier has local
         * scope according to RFC2373.
         */

        addrconf_addr_eui48_base(eui, dev->dev_addr);

        if (!dev->dev_id) {
                /* Ordinary device: invert the "u" bit. */
                eui[0] ^= 2;
                return 0;
        }

        /* Shared card: dev_id replaces the 0xFFFE filler, high byte first. */
        eui[3] = (dev->dev_id >> 8) & 0xFF;
        eui[4] = dev->dev_id & 0xFF;

        return 0;
}

/*
 * Sanitize a 32-bit timeout before it is multiplied by @unit.
 * @timeout: raw value; 0xffffffff means "infinite"
 * @unit:    multiplier the caller is going to apply; must be non-zero
 *
 * Returns ~0UL for the infinite marker, LONG_MAX / @unit when @timeout
 * would overflow a long after scaling, and @timeout itself otherwise.
 */
static inline unsigned long addrconf_timeout_fixup(u32 timeout,
                                                   unsigned int unit)
{
        unsigned long ceiling;

        if (timeout == 0xffffffff)
                return ~0UL;

        /*
         * Avoid arithmetic overflow.
         * Assuming unit is constant and non-zero, this "if" statement
         * will go away on 64bit archs.
         */
        ceiling = LONG_MAX / unit;
        if (0xfffffffe > ceiling && timeout > ceiling)
                return ceiling;

        return timeout;
}

/*
 * addrconf_finite_timeout - tell whether @timeout is a real (finite) timeout
 * @timeout: value to test; ~0UL (all bits set) is the "infinite" marker
 *           produced by addrconf_timeout_fixup()
 *
 * Returns non-zero (1) for every value other than ~0UL and 0 for ~0UL.
 *
 * The comparison is spelled out instead of returning ~timeout: that
 * expression is narrowed from unsigned long to int, so where long is
 * wider than int a finite value whose low bits are all ones
 * (e.g. 0xffffffff) would have been reported as infinite.
 */
static inline int addrconf_finite_timeout(unsigned long timeout)
{
        return timeout != ~0UL;
}

/*
 *        IPv6 Address Label subsystem (addrlabel.c)
 *
 *        Init/cleanup hooks, rtnetlink registration, and the label
 *        lookup, which returns a u32 label for an address.
 */
int ipv6_addr_label_init(void);
void ipv6_addr_label_cleanup(void);
int ipv6_addr_label_rtnl_register(void);
u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr,
                    int type, int ifindex);

/*
 *        multicast prototypes (mcast.c)
 */

/*
 * Check that @len bytes fit inside the transport offset plus the IPv6
 * transport length of @skb, and if so try pskb_may_pull() for that length.
 * Returns false without touching the skb when the length check fails.
 */
static inline bool ipv6_mc_may_pull(struct sk_buff *skb,
                                    unsigned int len)
{
        return skb_transport_offset(skb) + ipv6_transport_len(skb) >= len &&
               pskb_may_pull(skb, len);
}

/* Per-socket multicast membership. */
int ipv6_sock_mc_join(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
void __ipv6_sock_mc_close(struct sock *sk);
void ipv6_sock_mc_close(struct sock *sk);
bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr,
                    const struct in6_addr *src_addr);

/* Per-device multicast state; the __ variant takes the inet6_dev directly. */
int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr);
int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr);
int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr);
void ipv6_mc_up(struct inet6_dev *idev);
void ipv6_mc_down(struct inet6_dev *idev);
void ipv6_mc_unmap(struct inet6_dev *idev);
void ipv6_mc_remap(struct inet6_dev *idev);
void ipv6_mc_init_dev(struct inet6_dev *idev);
void ipv6_mc_destroy_dev(struct inet6_dev *idev);
int ipv6_mc_check_mld(struct sk_buff *skb);
/* NOTE(review): a DAD function, declared in the middle of the mcast group. */
void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp);

bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
                         const struct in6_addr *src_addr);

void ipv6_mc_dad_complete(struct inet6_dev *idev);

/*
 * Identify MLD packets for MLD filter exceptions.
 *
 * @skb:     packet to inspect
 * @nexthdr: protocol number of the header located at @offset
 * @offset:  offset of that header from the network header
 *
 * Returns true only when @nexthdr is IPPROTO_ICMPV6, a full struct icmp6hdr
 * can be pulled at @offset, and its type is one of the MLD query/report/
 * reduction/MLDv2-report types.
 */
static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset)
{
        struct icmp6hdr *icmp6;

        if (nexthdr != IPPROTO_ICMPV6)
                return false;

        /* Make sure the whole ICMPv6 header is accessible before reading it. */
        if (!pskb_network_may_pull(skb, offset + sizeof(struct icmp6hdr)))
                return false;

        icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

        switch (icmp6->icmp6_type) {
        case ICMPV6_MGM_QUERY:
        case ICMPV6_MGM_REPORT:
        case ICMPV6_MGM_REDUCTION:
        case ICMPV6_MLD2_REPORT:
                return true;
        default:
                return false;
        }
}

void addrconf_prefix_rcv(struct net_device *dev,
                         u8 *opt, int len, bool sllao);

/*
 *        anycast prototypes (anycast.c)
 */
int ipv6_sock_ac_join(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
int ipv6_sock_ac_drop(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
void __ipv6_sock_ac_close(struct sock *sk);
void ipv6_sock_ac_close(struct sock *sk);

int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr);
int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr);
void ipv6_ac_destroy_dev(struct inet6_dev *idev);
bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
                         const struct in6_addr *addr);
bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
                             const struct in6_addr *addr);
int ipv6_anycast_init(void);
void ipv6_anycast_cleanup(void);

/* Device notifier */
int register_inet6addr_notifier(struct notifier_block *nb);
int unregister_inet6addr_notifier(struct notifier_block *nb);
int inet6addr_notifier_call_chain(unsigned long val, void *v);

int register_inet6addr_validator_notifier(struct notifier_block *nb);
int unregister_inet6addr_validator_notifier(struct notifier_block *nb);
int inet6addr_validator_notifier_call_chain(unsigned long val, void *v);

void inet6_netconf_notify_devconf(struct net *net, int event, int type,
                                  int ifindex, struct ipv6_devconf *devconf);

/**
 * __in6_dev_get - get inet6_dev pointer from netdevice
 * @dev: network device; dereferenced unconditionally, so it must not be NULL
 *
 * Caller must hold rcu_read_lock or RTNL, because this function
 * does not take a reference on the inet6_dev.
 *
 * Return: the value of @dev->ip6_ptr as read by rcu_dereference_rtnl().
 */
static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev)
{
        return rcu_dereference_rtnl(dev->ip6_ptr);
}

/**
 * __in6_dev_stats_get - get inet6_dev pointer for stats
 * @dev: network device
 * @skb: skb for original incoming interface if neeeded
 *
 * Caller must hold rcu_read_lock or RTNL, because this function
 * does not take a reference on the inet6_dev.
 */
static inline struct inet6_dev *__in6_dev_stats_get(const struct net_device *dev,
                                                    const struct sk_buff *skb)
{
        if (netif_is_l3_master(dev))
                dev = dev_get_by_index_rcu(dev_net(dev), inet6_iif(skb));
        return __in6_dev_get(dev);
}

/**
 * __in6_dev_get_safely - NULL-tolerant lookup of a device's inet6_dev
 * @dev: network device, may be NULL
 *
 * Performs the same rcu_dereference_rtnl() read of @dev->ip6_ptr as
 * __in6_dev_get(), but tolerates a NULL @dev.
 *
 * Return: NULL when @dev is NULL, otherwise the value of @dev->ip6_ptr.
 */
static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev)
{
        if (likely(dev != NULL))
                return rcu_dereference_rtnl(dev->ip6_ptr);

        return NULL;
}

/**
 * in6_dev_get - get inet6_dev pointer from netdevice
 * @dev: network device
 *
 * This version can be used in any context, and takes a reference
 * on the inet6_dev. Callers must use in6_dev_put() later to
 * release this reference.
 */
static inline struct inet6_dev *in6_dev_get(const struct net_device *dev)
{
        struct inet6_dev *idev;

        rcu_read_lock();
        idev = rcu_dereference(dev->ip6_ptr);
        if (idev)
                refcount_inc(&idev->refcnt);
        rcu_read_unlock();
        return idev;
}

/*
 * Return the nd_parms of @dev's inet6_dev, or NULL when @dev has no
 * inet6_dev. The inet6_dev is obtained through __in6_dev_get(), so no
 * reference is taken on it.
 */
static inline struct neigh_parms *__in6_dev_nd_parms_get_rcu(const struct net_device *dev)
{
        struct inet6_dev *in6 = __in6_dev_get(dev);

        if (!in6)
                return NULL;

        return in6->nd_parms;
}

void in6_dev_finish_destroy(struct inet6_dev *idev);

/*
 * Drop one reference on @idev. When refcount_dec_and_test() reports that
 * the count reached zero, @idev is handed to in6_dev_finish_destroy().
 */
static inline void in6_dev_put(struct inet6_dev *idev)
{
        if (!refcount_dec_and_test(&idev->refcnt))
                return;

        in6_dev_finish_destroy(idev);
}

/*
 * Release the inet6_dev reference stored in *@pidev, if any, and reset
 * the slot to NULL. A slot that is already NULL is left untouched.
 */
static inline void in6_dev_put_clear(struct inet6_dev **pidev)
{
        struct inet6_dev *held = *pidev;

        if (!held)
                return;

        in6_dev_put(held);
        *pidev = NULL;
}

/*
 * Decrement @idev->refcnt with refcount_dec(). Unlike in6_dev_put(), this
 * never calls in6_dev_finish_destroy().
 * NOTE(review): presumably only for callers that know another reference is
 * still held - confirm against callers.
 */
static inline void __in6_dev_put(struct inet6_dev *idev)
{
        refcount_dec(&idev->refcnt);
}

/* Take an additional reference on @idev by incrementing idev->refcnt. */
static inline void in6_dev_hold(struct inet6_dev *idev)
{
        refcount_inc(&idev->refcnt);
}

/* called with rcu_read_lock held */
static inline bool ip6_ignore_linkdown(const struct net_device *dev)
{
        const struct inet6_dev *idev = __in6_dev_get(dev);

        if (unlikely(!idev))
                return true;

        return !!READ_ONCE(idev->cnf.ignore_routes_with_linkdown);
}

void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp);

/*
 * Drop one reference on @ifp. When refcount_dec_and_test() reports that
 * the count reached zero, @ifp is handed to inet6_ifa_finish_destroy().
 */
static inline void in6_ifa_put(struct inet6_ifaddr *ifp)
{
        if (!refcount_dec_and_test(&ifp->refcnt))
                return;

        inet6_ifa_finish_destroy(ifp);
}

/*
 * Decrement @ifp->refcnt with refcount_dec(). Unlike in6_ifa_put(), this
 * never calls inet6_ifa_finish_destroy().
 * NOTE(review): presumably only for callers that know another reference is
 * still held - confirm against callers.
 */
static inline void __in6_ifa_put(struct inet6_ifaddr *ifp)
{
        refcount_dec(&ifp->refcnt);
}

/* Take an additional reference on @ifp by incrementing ifp->refcnt. */
static inline void in6_ifa_hold(struct inet6_ifaddr *ifp)
{
        refcount_inc(&ifp->refcnt);
}

/*
 * Try to take a reference on @ifp via refcount_inc_not_zero().
 * Returns the result of that call, i.e. whether the reference was taken.
 */
static inline bool in6_ifa_hold_safe(struct inet6_ifaddr *ifp)
{
        return refcount_inc_not_zero(&ifp->refcnt);
}

/*
 *        Compute the link-local solicited-node multicast address.
 *
 *        @addr:      source address; only its last 32-bit word is read
 *        @solicited: output, set to the words ff020000:0:1:(ff000000 | last
 *                    word of @addr), each in network byte order
 */

static inline void addrconf_addr_solict_mult(const struct in6_addr *addr,
                                             struct in6_addr *solicited)
{
        ipv6_addr_set(solicited,
                      htonl(0xFF020000),
                      0,
                      htonl(0x00000001),
                      addr->s6_addr32[3] | htonl(0xFF000000));
}

/*
 * Return true when @addr is exactly ff02::1: first 32-bit word 0xff020000,
 * middle two words zero, last word 1 (network byte order).
 */
static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        /* Compare the address as two big-endian 64-bit halves. */
        __be64 *half = (__force __be64 *)addr;

        return half[0] == cpu_to_be64(0xff02000000000000UL) &&
               half[1] == cpu_to_be64(1);
#else
        return addr->s6_addr32[0] == htonl(0xff020000) &&
               addr->s6_addr32[1] == 0 &&
               addr->s6_addr32[2] == 0 &&
               addr->s6_addr32[3] == htonl(0x00000001);
#endif
}

/*
 * Return true when @addr is exactly ff02::2: first 32-bit word 0xff020000,
 * middle two words zero, last word 2 (network byte order).
 */
static inline bool ipv6_addr_is_ll_all_routers(const struct in6_addr *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        /* Compare the address as two big-endian 64-bit halves. */
        __be64 *half = (__force __be64 *)addr;

        return half[0] == cpu_to_be64(0xff02000000000000UL) &&
               half[1] == cpu_to_be64(2);
#else
        return addr->s6_addr32[0] == htonl(0xff020000) &&
               addr->s6_addr32[1] == 0 &&
               addr->s6_addr32[2] == 0 &&
               addr->s6_addr32[3] == htonl(0x00000002);
#endif
}

/*
 * Return true when the third 32-bit word of @addr, with the 0x02000000 bit
 * forced on, equals 0x02005EFE (network byte order) - i.e. the word is
 * either 0x00005EFE or 0x02005EFE. The other words are not examined.
 */
static inline bool ipv6_addr_is_isatap(const struct in6_addr *addr)
{
        return htonl(0x02005EFE) == (htonl(0x02000000) | addr->s6_addr32[2]);
}

/*
 * Return true when the first 104 bits of @addr are ff02:0:0:0:0:1:ff, i.e.
 * words 0..2 are 0xff020000, 0, 1 (network byte order) and byte 12 is 0xff.
 * The last three bytes are ignored.
 */
static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        __be64 *half = (__force __be64 *)addr;

        /* The low 24 bits of the second half are masked out of the compare. */
        return half[0] == cpu_to_be64(0xff02000000000000UL) &&
               (half[1] & cpu_to_be64(0xffffffffff000000UL)) ==
                        cpu_to_be64(0x00000001ff000000UL);
#else
        return addr->s6_addr32[0] == htonl(0xff020000) &&
               addr->s6_addr32[1] == 0 &&
               addr->s6_addr32[2] == htonl(0x00000001) &&
               addr->s6_addr[12] == 0xff;
#endif
}

/*
 * Return true when @addr is exactly ff02::6a: first 32-bit word 0xff020000,
 * middle two words zero, last word 0x6a (network byte order).
 */
static inline bool ipv6_addr_is_all_snoopers(const struct in6_addr *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        /* Compare the address as two big-endian 64-bit halves. */
        __be64 *half = (__force __be64 *)addr;

        return half[0] == cpu_to_be64(0xff02000000000000UL) &&
               half[1] == cpu_to_be64(0x6a);
#else
        return addr->s6_addr32[0] == htonl(0xff020000) &&
               addr->s6_addr32[1] == 0 &&
               addr->s6_addr32[2] == 0 &&
               addr->s6_addr32[3] == htonl(0x0000006a);
#endif
}

#ifdef CONFIG_PROC_FS
int if6_proc_init(void);
void if6_proc_exit(void);
#endif

#endif


























































































































































































































































   13 


   10 








































































































































































































































































   11 


    8 
   14 
   12 


















   11 



   11 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/page_ext.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/kmemleak.h>
#include <linux/page_owner.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate.h>
#include <linux/pgalloc_tag.h>

/*
 * struct page extension
 *
 * This is the feature to manage memory for extended data per page.
 *
 * Until now, we had to modify struct page itself to store extra data per page.
 * This requires rebuilding the kernel, which is a really time-consuming process.
 * Sometimes a rebuild is even impossible due to third-party module dependencies.
 * Finally, enlarging struct page could cause unwanted changes in system behaviour.
 *
 * This feature is intended to overcome above mentioned problems. This feature
 * allocates memory for extended data per page in certain place rather than
 * the struct page itself. This memory can be accessed by the accessor
 * functions provided by this code. During the boot process, it checks whether
 * allocation of huge chunk of memory is needed or not. If not, it avoids
 * allocating memory at all. With this advantage, we can include this feature
 * into the kernel in default and can avoid rebuild and solve related problems.
 *
 * To help these things to work well, there are two callbacks for clients. One
 * is the need callback which is mandatory if user wants to avoid useless
 * memory allocation at boot-time. The other is optional, init callback, which
 * is used to do proper initialization after memory is allocated.
 *
 * The need callback is used to decide whether extended memory allocation is
 * needed or not. Sometimes users want to deactivate some features in this
 * boot, and the extra memory would be unnecessary. In this case, to avoid
 * allocating a huge chunk of memory, each client expresses its need for
 * extra memory through the need callback. If one of the need callbacks
 * returns true, it means that someone needs extra memory, so the
 * page extension core should allocate memory for page extension. If
 * none of the need callbacks returns true, memory isn't needed at all in this
 * boot and the page extension core can skip allocating memory. As a result,
 * no memory is wasted.
 *
 * When need callback returns true, page_ext checks if there is a request for
 * extra memory through size in struct page_ext_operations. If it is non-zero,
 * extra space is allocated for each page_ext entry and offset is returned to
 * user through offset in struct page_ext_operations.
 *
 * The init callback is used to do proper initialization after page extension
 * is completely initialized. In sparse memory system, extra memory is
 * allocated some time later than memmap is allocated. In other words, lifetime
 * of memory for page extension isn't same with memmap for struct page.
 * Therefore, clients can't store extra data until page extension is
 * initialized, even if pages are allocated and used freely. This could
 * cause inadequate state of extra data per page, so, to prevent it, client
 * can utilize this callback to initialize the state of it correctly.
 */

#ifdef CONFIG_SPARSEMEM
/*
 * Tag value added to a section's page_ext pointer by __invalidate_page_ext()
 * and tested by page_ext_invalid(); lookup_page_ext() returns NULL for a
 * pointer that carries it.
 */
#define PAGE_EXT_INVALID       (0x1)
#endif

#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
/* need() callback for page_idle_ops: unconditionally requests page_ext. */
static bool need_page_idle(void)
{
        return true;
}
/*
 * page_ext client compiled in only for CONFIG_PAGE_IDLE_FLAG on non-64-bit
 * builds. It asks for the shared struct page_ext (need_shared_flags) and
 * leaves .size unset, so it requests no extra per-entry space of its own.
 */
static struct page_ext_operations page_idle_ops __initdata = {
        .need = need_page_idle,
        .need_shared_flags = true,
};
#endif

/*
 * Table of all page_ext clients compiled into this kernel. It is walked by
 * invoke_need_callbacks() to size each page_ext entry and assign per-client
 * offsets, and by invoke_init_callbacks() to run the optional init hooks.
 * Marked __initdata, so it is only meant to be used during boot.
 */
static struct page_ext_operations *page_ext_ops[] __initdata = {
#ifdef CONFIG_PAGE_OWNER
        &page_owner_ops,
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
        &page_idle_ops,
#endif
#ifdef CONFIG_MEM_ALLOC_PROFILING
        &page_alloc_tagging_ops,
#endif
#ifdef CONFIG_PAGE_TABLE_CHECK
        &page_table_check_ops,
#endif
};

/*
 * Size in bytes of one page_ext entry. Accumulated by
 * invoke_need_callbacks() and used as the stride in get_entry().
 */
unsigned long page_ext_size;

/* Running total of bytes allocated for page_ext tables; logged at init. */
static unsigned long total_usage;

#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
/*
 * To ensure correct allocation tagging for pages, page_ext should be available
 * before the first page allocation. Otherwise early task stacks will be
 * allocated before page_ext initialization and missing tags will be flagged.
 */
bool early_page_ext __meminitdata = true;
#else
/* Defaults to false; set by the "early_page_ext" boot parameter below. */
bool early_page_ext __meminitdata;
#endif
/*
 * Handler for the "early_page_ext" early boot parameter: sets
 * early_page_ext to true. The parameter's value string is ignored.
 */
static int __init setup_early_page_ext(char *str)
{
        early_page_ext = true;
        return 0;
}
early_param("early_page_ext", setup_early_page_ext);

/*
 * Ask every registered page_ext client whether it needs page_ext and lay
 * out the per-entry storage accordingly.
 *
 * Pass 1: if any client that reports a need also sets need_shared_flags,
 * reserve sizeof(struct page_ext) at the start of each entry.
 * Pass 2: for each client reporting a need, record its offset within the
 * entry and grow page_ext_size by the client's size.
 *
 * Returns true if at least one client needs page_ext.
 */
static bool __init invoke_need_callbacks(void)
{
        const int nr_ops = ARRAY_SIZE(page_ext_ops);
        bool needed = false;
        int idx;

        for (idx = 0; idx < nr_ops; idx++) {
                if (!page_ext_ops[idx]->need())
                        continue;
                if (!page_ext_ops[idx]->need_shared_flags)
                        continue;

                page_ext_size = sizeof(struct page_ext);
                break;
        }

        for (idx = 0; idx < nr_ops; idx++) {
                if (!page_ext_ops[idx]->need())
                        continue;

                page_ext_ops[idx]->offset = page_ext_size;
                page_ext_size += page_ext_ops[idx]->size;
                needed = true;
        }

        return needed;
}

/*
 * Run the init hook of every registered page_ext client that provides one.
 * Clients without an init callback are skipped.
 */
static void __init invoke_init_callbacks(void)
{
        const int nr_ops = ARRAY_SIZE(page_ext_ops);
        int idx;

        for (idx = 0; idx < nr_ops; idx++) {
                if (!page_ext_ops[idx]->init)
                        continue;

                page_ext_ops[idx]->init();
        }
}

/*
 * Return the address of entry @index in the table starting at @base.
 * Entries are page_ext_size bytes apart; the arithmetic is done on a
 * void pointer, i.e. in bytes.
 */
static inline struct page_ext *get_entry(void *base, unsigned long index)
{
        return base + page_ext_size * index;
}

#ifndef CONFIG_SPARSEMEM
/*
 * !CONFIG_SPARSEMEM only: run the clients' init callbacks via
 * invoke_init_callbacks().
 */
void __init page_ext_init_flatmem_late(void)
{
        invoke_init_callbacks();
}

/*
 * !CONFIG_SPARSEMEM only: reset @pgdat's page_ext table pointer to NULL.
 * lookup_page_ext() returns NULL for a node in this state.
 */
void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
        pgdat->node_page_ext = NULL;
}

/*
 * !CONFIG_SPARSEMEM lookup: return the page_ext entry for @page from its
 * node's table, or NULL if that table has not been allocated.
 *
 * Warns once if called outside an RCU read-side critical section.
 */
static struct page_ext *lookup_page_ext(const struct page *page)
{
        struct page_ext *table;
        unsigned long first_pfn;

        WARN_ON_ONCE(!rcu_read_lock_held());
        table = NODE_DATA(page_to_nid(page))->node_page_ext;
        /*
         * The sanity checks the page allocator does upon freeing a
         * page can reach here before the page_ext arrays are
         * allocated when feeding a range of pages to the allocator
         * for the first time during bootup or memory hotplug.
         */
        if (unlikely(!table))
                return NULL;

        /* Entry 0 corresponds to the node start rounded down to MAX_ORDER. */
        first_pfn = round_down(node_start_pfn(page_to_nid(page)),
                               MAX_ORDER_NR_PAGES);

        return get_entry(table, page_to_pfn(page) - first_pfn);
}

/*
 * Allocate the page_ext table for node @nid from memblock and publish it
 * in the node's node_page_ext pointer.
 *
 * Returns 0 on success or when the node spans no pages, -ENOMEM if the
 * memblock allocation fails.
 */
static int __init alloc_node_page_ext(int nid)
{
        unsigned long span = NODE_DATA(nid)->node_spanned_pages;
        unsigned long bytes;
        struct page_ext *table;

        if (span == 0)
                return 0;

        /*
         * Need extra space if node range is not aligned with
         * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm
         * checks buddy's status, range could be out of exact node range.
         */
        if (!(IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) &&
              IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)))
                span += MAX_ORDER_NR_PAGES;

        bytes = page_ext_size * span;

        table = memblock_alloc_try_nid(bytes, PAGE_SIZE,
                                       __pa(MAX_DMA_ADDRESS),
                                       MEMBLOCK_ALLOC_ACCESSIBLE, nid);
        if (table == NULL)
                return -ENOMEM;

        NODE_DATA(nid)->node_page_ext = table;
        total_usage += bytes;

        return 0;
}

/*
 * !CONFIG_SPARSEMEM boot-time setup: if any client needs page_ext,
 * allocate one table per online node. Panics if an allocation fails.
 */
void __init page_ext_init_flatmem(void)
{
        int nid, err;

        if (!invoke_need_callbacks())
                return;

        for_each_online_node(nid)  {
                err = alloc_node_page_ext(nid);
                if (err)
                        goto fail;
        }
        /* total_usage is unsigned long, so print it with %lu. */
        pr_info("allocated %lu bytes of page_ext\n", total_usage);
        return;

fail:
        pr_crit("allocation of page_ext failed.\n");
        panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */
static bool page_ext_invalid(struct page_ext *page_ext)
{
        return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
}

static struct page_ext *lookup_page_ext(const struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        struct mem_section *section = __pfn_to_section(pfn);
        struct page_ext *page_ext = READ_ONCE(section->page_ext);

        WARN_ON_ONCE(!rcu_read_lock_held());
        /*
         * The sanity checks the page allocator does upon freeing a
         * page can reach here before the page_ext arrays are
         * allocated when feeding a range of pages to the allocator
         * for the first time during bootup or memory hotplug.
         */
        if (page_ext_invalid(page_ext))
                return NULL;
        return get_entry(page_ext, pfn);
}

/*
 * Allocate @size zeroed bytes for a page_ext table on node @nid.
 *
 * alloc_pages_exact_nid() is tried first (the result is registered with
 * kmemleak); if that fails, vzalloc_node() is used instead. Returns NULL
 * only if both attempts fail.
 */
static void *__meminit alloc_page_ext(size_t size, int nid)
{
        gfp_t gfp = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
        void *table = alloc_pages_exact_nid(nid, size, gfp);

        if (!table)
                return vzalloc_node(size, nid);

        kmemleak_alloc(table, size, 1, gfp);
        return table;
}

/*
 * Allocate and install the page_ext table for the memory section that
 * contains @pfn, on node @nid.
 *
 * Returns 0 if the section already has a page_ext pointer or the table was
 * installed, -ENOMEM if the allocation failed.
 */
static int __meminit init_section_page_ext(unsigned long pfn, int nid)
{
        struct mem_section *ms = __pfn_to_section(pfn);
        unsigned long bytes;
        struct page_ext *table;

        if (ms->page_ext)
                return 0;

        bytes = page_ext_size * PAGES_PER_SECTION;
        table = alloc_page_ext(bytes, nid);

        /*
         * The value stored in section->page_ext is (base - pfn)
         * and it does not point to the memory block allocated above,
         * causing kmemleak false positives.
         */
        kmemleak_not_leak(table);

        if (table == NULL) {
                pr_err("page ext allocation failure\n");
                return -ENOMEM;
        }

        /*
         * The passed "pfn" may not be aligned to SECTION, so mask it down to
         * the section start before biasing the stored pointer.
         */
        ms->page_ext = (void *)table -
                       page_ext_size * (pfn & PAGE_SECTION_MASK);
        total_usage += bytes;

        return 0;
}

/*
 * Release a page_ext table of one section's size.
 *
 * vmalloc addresses are released with vfree(); anything else is released
 * with free_pages_exact() after unregistering it from kmemleak.
 */
static void free_page_ext(void *addr)
{
        if (is_vmalloc_addr(addr)) {
                vfree(addr);
                return;
        }

        BUG_ON(PageReserved(virt_to_page(addr)));
        kmemleak_free(addr);
        free_pages_exact(addr, page_ext_size * PAGES_PER_SECTION);
}

/*
 * Free the page_ext table of the section containing @pfn and clear the
 * section's page_ext pointer. Does nothing when the section has no
 * page_ext pointer.
 */
static void __free_page_ext(unsigned long pfn)
{
        struct mem_section *ms;
        struct page_ext *base;

        ms = __pfn_to_section(pfn);
        if (!ms || !ms->page_ext)
                return;

        base = READ_ONCE(ms->page_ext);
        /*
         * page_ext here can be valid while doing the roll back
         * operation in online_page_ext().
         */
        /* Strip the PAGE_EXT_INVALID tag, if present, to recover the pointer. */
        if (page_ext_invalid(base))
                base = (void *)base - PAGE_EXT_INVALID;
        WRITE_ONCE(ms->page_ext, NULL);

        /*
         * init_section_page_ext() stores the table address minus
         * page_ext_size * (section start pfn); for the section-aligned pfns
         * passed by the callers in this file, indexing by pfn gives back the
         * address that was allocated.
         */
        base = get_entry(base, pfn);
        free_page_ext(base);
}

/*
 * Mark the page_ext pointer of the section containing @pfn as invalid by
 * adding PAGE_EXT_INVALID to it, so that page_ext_invalid() - and thus
 * lookup_page_ext() - rejects it. Does nothing when the section has no
 * page_ext pointer.
 */
static void __invalidate_page_ext(unsigned long pfn)
{
        struct mem_section *ms;
        void *val;

        ms = __pfn_to_section(pfn);
        if (!ms || !ms->page_ext)
                return;
        val = (void *)ms->page_ext + PAGE_EXT_INVALID;
        WRITE_ONCE(ms->page_ext, val);
}

/*
 * Set up page_ext for every section overlapping
 * [@start_pfn, @start_pfn + @nr_pages) on node @nid.
 *
 * If a section fails, the sections preceding it in the range are released
 * again through __free_page_ext() and -ENOMEM is returned; otherwise 0.
 */
static int __meminit online_page_ext(unsigned long start_pfn,
                                unsigned long nr_pages,
                                int nid)
{
        unsigned long first = SECTION_ALIGN_DOWN(start_pfn);
        unsigned long last = SECTION_ALIGN_UP(start_pfn + nr_pages);
        unsigned long pfn;
        int err = 0;

        if (nid == NUMA_NO_NODE) {
                /*
                 * In this case, "nid" already exists and contains valid memory.
                 * "start_pfn" passed to us is a pfn which is an arg for
                 * online_pages(), and start_pfn should exist.
                 */
                nid = pfn_to_nid(start_pfn);
                VM_BUG_ON(!node_online(nid));
        }

        for (pfn = first; pfn < last; pfn += PAGES_PER_SECTION) {
                err = init_section_page_ext(pfn, nid);
                if (err)
                        break;
        }
        if (!err)
                return 0;

        /* rollback: pfn is the section that failed, free everything before it */
        last = pfn;
        for (pfn = first; pfn < last; pfn += PAGES_PER_SECTION)
                __free_page_ext(pfn);

        return -ENOMEM;
}

/*
 * Tear down page_ext for every section overlapping
 * [@start_pfn, @start_pfn + @nr_pages). The range is widened to section
 * boundaries before processing.
 */
static void __meminit offline_page_ext(unsigned long start_pfn,
                                unsigned long nr_pages)
{
        unsigned long start, end, pfn;

        start = SECTION_ALIGN_DOWN(start_pfn);
        end = SECTION_ALIGN_UP(start_pfn + nr_pages);

        /*
         * Freeing of page_ext is done in 3 steps to avoid
         * use-after-free of it:
         * 1) Traverse all the sections and mark their page_ext
         *    as invalid.
         * 2) Wait for all the existing users of page_ext who
         *    started before invalidation to finish.
         * 3) Free the page_ext.
         */
        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __invalidate_page_ext(pfn);

        /* Step 2: readers take rcu_read_lock() in page_ext_get(). */
        synchronize_rcu();

        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_ext(pfn);
}

/*
 * Memory hotplug notifier callback.
 *
 * MEM_GOING_ONLINE allocates page_ext for the range described by @arg;
 * MEM_OFFLINE and MEM_CANCEL_ONLINE release it. All other actions are
 * ignored. The errno result is converted with notifier_from_errno().
 */
static int __meminit page_ext_callback(struct notifier_block *self,
                               unsigned long action, void *arg)
{
        struct memory_notify *notify = arg;
        int err = 0;

        switch (action) {
        case MEM_GOING_ONLINE:
                err = online_page_ext(notify->start_pfn, notify->nr_pages,
                                      notify->status_change_nid);
                break;
        case MEM_OFFLINE:
        case MEM_CANCEL_ONLINE:
                offline_page_ext(notify->start_pfn, notify->nr_pages);
                break;
        default:
                /* MEM_GOING_OFFLINE, MEM_ONLINE, MEM_CANCEL_OFFLINE: nothing to do */
                break;
        }

        return notifier_from_errno(err);
}

/*
 * CONFIG_SPARSEMEM boot-time setup: if any client needs page_ext, allocate
 * a table for every valid section of every node with memory, register the
 * memory hotplug notifier and run the clients' init callbacks.
 * Panics if a section table cannot be allocated.
 */
void __init page_ext_init(void)
{
        unsigned long pfn;
        int nid;

        if (!invoke_need_callbacks())
                return;

        for_each_node_state(nid, N_MEMORY) {
                unsigned long start_pfn, end_pfn;

                start_pfn = node_start_pfn(nid);
                end_pfn = node_end_pfn(nid);
                /*
                 * start_pfn and end_pfn may not be aligned to SECTION and the
                 * page->flags of out of node pages are not initialized.  So we
                 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
                 */
                for (pfn = start_pfn; pfn < end_pfn;
                        pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {

                        if (!pfn_valid(pfn))
                                continue;
                        /*
                         * Nodes' pfns can be overlapping.
                         * We know some arch can have a nodes layout such as
                         * -------------pfn-------------->
                         * N0 | N1 | N2 | N0 | N1 | N2|....
                         */
                        if (pfn_to_nid(pfn) != nid)
                                continue;
                        if (init_section_page_ext(pfn, nid))
                                goto oom;
                        cond_resched();
                }
        }
        hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI);
        /* total_usage is unsigned long, so print it with %lu. */
        pr_info("allocated %lu bytes of page_ext\n", total_usage);
        invoke_init_callbacks();
        return;

oom:
        panic("Out of memory");
}

/*
 * CONFIG_SPARSEMEM variant: intentionally empty. In this configuration the
 * page_ext pointer is kept per memory section (see lookup_page_ext()), so
 * there is nothing to initialize in @pgdat.
 */
void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
}

#endif

/**
 * page_ext_get() - Look up and pin the extended information of a page.
 * @page: The page we're interested in.
 *
 * On success the RCU read lock is left held, which keeps the returned
 * page_ext valid until page_ext_put() is called. On failure the lock is
 * dropped again before returning.
 *
 * Return: NULL if no page_ext exists for this page.
 * Context: Any context.  Caller may not sleep until they have called
 * page_ext_put().
 */
struct page_ext *page_ext_get(const struct page *page)
{
        struct page_ext *ext;

        rcu_read_lock();
        ext = lookup_page_ext(page);
        if (ext)
                return ext;

        rcu_read_unlock();
        return NULL;
}

/**
 * page_ext_put() - Working with page extended information is done.
 * @page_ext: Page extended information received from page_ext_get().
 *
 * The page extended information of the page may not be valid after this
 * function is called.
 *
 * Return: None.
 * Context: Any context with corresponding page_ext_get() is called.
 */
void page_ext_put(struct page_ext *page_ext)
{
        if (unlikely(!page_ext))
                return;

        rcu_read_unlock();
}















































































































































































    3 









    2 































































































































































    4 

























    1 
    3 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Connection state tracking for netfilter.  This is separated from,
 * but required by, the (future) NAT layer; it can also be used by an iptables
 * extension.
 *
 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
 *        - generalize L3 protocol dependent part.
 *
 * Derived from include/linux/netfilter_ipv4/ip_conntrack.h
 */

#ifndef _NF_CONNTRACK_H
#define _NF_CONNTRACK_H

#include <linux/bitops.h>
#include <linux/compiler.h>

#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_conntrack_tcp.h>
#include <linux/netfilter/nf_conntrack_dccp.h>
#include <linux/netfilter/nf_conntrack_sctp.h>
#include <linux/netfilter/nf_conntrack_proto_gre.h>

#include <net/netfilter/nf_conntrack_tuple.h>

/* UDP member of union nf_conntrack_proto (per-conntrack private data). */
struct nf_ct_udp {
        /* NOTE(review): presumably a jiffies timestamp - confirm against users */
        unsigned long        stream_ts;
};

/*
 * per conntrack: protocol private data
 *
 * Embedded as the last member ("proto") of struct nf_conn. Being a union,
 * only one protocol's state is stored per conntrack entry.
 */
union nf_conntrack_proto {
        /* insert conntrack proto private data here */
        struct nf_ct_dccp dccp;
        struct ip_ct_sctp sctp;
        struct ip_ct_tcp tcp;
        struct nf_ct_udp udp;
        struct nf_ct_gre gre;
        /* NOTE(review): name suggests a template-only padding value - confirm */
        unsigned int tmpl_padto;
};

/* Per-expectation protocol private data; currently has no members. */
union nf_conntrack_expect_proto {
        /* insert expect proto private data here */
};

/*
 * Event-cache state embedded in struct nf_conntrack_net when
 * CONFIG_NF_CONNTRACK_EVENTS is enabled.
 */
struct nf_conntrack_net_ecache {
        /* deferred work item */
        struct delayed_work dwork;
        /* NOTE(review): presumably protects dying_list - confirm */
        spinlock_t dying_lock;
        struct hlist_nulls_head dying_list;
};

/*
 * Conntrack bookkeeping: counters first, then configuration-plane fields.
 * NOTE(review): presumably one instance per network namespace - confirm.
 */
struct nf_conntrack_net {
        /* only used when new connection is allocated: */
        atomic_t count;
        unsigned int expect_count;

        /* only used from work queues, configuration plane, and so on: */
        unsigned int users4;
        unsigned int users6;
        unsigned int users_bridge;
#ifdef CONFIG_SYSCTL
        struct ctl_table_header        *sysctl_header;
#endif
#ifdef CONFIG_NF_CONNTRACK_EVENTS
        /* event-cache state, see struct nf_conntrack_net_ecache */
        struct nf_conntrack_net_ecache ecache;
#endif
};

#include <linux/types.h>
#include <linux/skbuff.h>

#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>

/*
 * One tracked connection.  Member order matters: everything after
 * __nfct_init_offset is initialized via memset, and 'proto' must stay last.
 */
struct nf_conn {
        /* Usage count in here is 1 for hash table, 1 per skb,
         * plus 1 for any connection(s) we are `master' for
         *
         * Hint: an skb addresses this struct and its refcount via
         * skb->_nfct and the helpers nf_conntrack_get() and
         * nf_conntrack_put().
         * The helper nf_ct_put() equals nf_conntrack_put() in that it
         * decrements the refcount, except that the latter uses internal
         * indirection and does not result in a conntrack module dependency.
         * Beware: nf_ct_get() is different and does not increment the
         * refcount.
         */
        struct nf_conntrack ct_general;

        spinlock_t        lock;
        /* jiffies32 when this ct is considered dead */
        u32 timeout;

#ifdef CONFIG_NF_CONNTRACK_ZONES
        struct nf_conntrack_zone zone;
#endif
        /* XXX should I move this to the tail ? - Y.K */
        /* These are my tuples; original and reply */
        struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];

        /* Have we seen traffic both ways yet? (bitset) */
        unsigned long status;

        /* network namespace this conntrack belongs to; read via nf_ct_net() */
        possible_net_t ct_net;

#if IS_ENABLED(CONFIG_NF_NAT)
        struct hlist_node        nat_bysource;
#endif
        /* all members below initialized via memset */
        struct { } __nfct_init_offset;

        /* If we were expected by an expectation, this will be it */
        struct nf_conn *master;

#if defined(CONFIG_NF_CONNTRACK_MARK)
        u_int32_t mark;
#endif

#ifdef CONFIG_NF_CONNTRACK_SECMARK
        u_int32_t secmark;
#endif

        /* Extensions */
        struct nf_ct_ext *ext;

        /* Storage reserved for other modules, must be the last member */
        union nf_conntrack_proto proto;
};

/* Map an embedded nf_conntrack (the ct_general member) back to its nf_conn. */
static inline struct nf_conn *
nf_ct_to_nf_conn(const struct nf_conntrack *nfct)
{
        struct nf_conn *ct = container_of(nfct, struct nf_conn, ct_general);

        return ct;
}

/*
 * Map a tuple hash entry back to the conntrack that embeds it.  The entry's
 * own direction (tuple.dst.dir) selects which tuplehash[] slot it occupies.
 */
static inline struct nf_conn *
nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash)
{
        struct nf_conn *ct;

        ct = container_of(hash, struct nf_conn,
                          tuplehash[hash->tuple.dst.dir]);
        return ct;
}

static inline u_int16_t nf_ct_l3num(const struct nf_conn *ct)
{
        return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
}

static inline u_int8_t nf_ct_protonum(const struct nf_conn *ct)
{
        return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
}

/* Pointer to the tuple stored in conntrack @ct for direction @dir. */
#define nf_ct_tuple(ct, dir) (&(ct)->tuplehash[dir].tuple)

/*
 * Get the master conntrack of @conntr (the conntrack whose expectation
 * created it).  The argument is parenthesized so that any pointer-valued
 * expression, e.g. a conditional, expands with the intended precedence.
 */
#define master_ct(conntr) ((conntr)->master)

extern struct net init_net;

/* Network namespace recorded in @ct's ct_net member. */
static inline struct net *nf_ct_net(const struct nf_conn *ct)
{
        struct net *net = read_pnet(&ct->ct_net);

        return net;
}

/* Is this tuple taken? (ignoring any belonging to the given
   conntrack). */
int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
                             const struct nf_conn *ignored_conntrack);

/*
 * Unpack the nfct word carried by @skb: the info bits are stored in *@ctinfo
 * and the conntrack pointer is returned.  No reference count is touched here.
 */
static inline struct nf_conn *
nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
{
        const unsigned long packed = skb_get_nfct(skb);
        struct nf_conn *ct = (struct nf_conn *)(packed & NFCT_PTRMASK);

        *ctinfo = packed & NFCT_INFOMASK;
        return ct;
}

void nf_ct_destroy(struct nf_conntrack *nfct);

void nf_conntrack_tcp_set_closing(struct nf_conn *ct);

/*
 * Drop one reference on @ct (NULL is accepted and ignored); the conntrack is
 * destroyed when the last reference goes away.
 */
static inline void nf_ct_put(struct nf_conn *ct)
{
        if (!ct)
                return;

        if (refcount_dec_and_test(&ct->ct_general.use))
                nf_ct_destroy(&ct->ct_general);
}

/* load module; enable/disable conntrack in this namespace */
int nf_ct_netns_get(struct net *net, u8 nfproto);
void nf_ct_netns_put(struct net *net, u8 nfproto);

/*
 * Allocate a hashtable of hlist_head (if nulls == 0),
 * or hlist_nulls_head (if nulls == 1)
 */
void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls);

int nf_conntrack_hash_check_insert(struct nf_conn *ct);
bool nf_ct_delete(struct nf_conn *ct, u32 pid, int report);

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
                       u_int16_t l3num, struct net *net,
                       struct nf_conntrack_tuple *tuple);

void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                          const struct sk_buff *skb,
                          u32 extra_jiffies, bool do_acct);

/* Refresh @ct for @extra_jiffies with accounting enabled. */
static inline void nf_ct_refresh_acct(struct nf_conn *ct,
                                      enum ip_conntrack_info ctinfo,
                                      const struct sk_buff *skb,
                                      u32 extra_jiffies)
{
        const bool do_acct = true;

        __nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies, do_acct);
}

/* Refresh @ct for @extra_jiffies without accounting (ctinfo is passed as 0). */
static inline void nf_ct_refresh(struct nf_conn *ct,
                                 const struct sk_buff *skb,
                                 u32 extra_jiffies)
{
        const bool do_acct = false;

        __nf_ct_refresh_acct(ct, 0, skb, extra_jiffies, do_acct);
}

/* kill conntrack and do accounting */
bool nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
                     const struct sk_buff *skb);

/* kill conntrack without accounting */
static inline bool nf_ct_kill(struct nf_conn *ct)
{
        return nf_ct_delete(ct, 0, 0);
}

/* Parameter bundle handed to nf_ct_iterate_cleanup_net(). */
struct nf_ct_iter_data {
        struct net *net;
        /* NOTE(review): presumably forwarded to the iter callback as its 'data' argument — confirm */
        void *data;
        /* NOTE(review): presumably passed on to nf_ct_delete() for event reporting — confirm */
        u32 portid;
        int report;
};

/* Iterate over all conntracks: if iter returns true, it's deleted. */
void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
                               const struct nf_ct_iter_data *iter_data);

/* also set unconfirmed conntracks as dying. Only use in module exit path. */
void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data),
                           void *data);

struct nf_conntrack_zone;

void nf_conntrack_free(struct nf_conn *ct);
struct nf_conn *nf_conntrack_alloc(struct net *net,
                                   const struct nf_conntrack_zone *zone,
                                   const struct nf_conntrack_tuple *orig,
                                   const struct nf_conntrack_tuple *repl,
                                   gfp_t gfp);

/* Non-zero when IPS_TEMPLATE_BIT is set in @ct's status word. */
static inline int nf_ct_is_template(const struct nf_conn *ct)
{
        const unsigned long *flags = &ct->status;

        return test_bit(IPS_TEMPLATE_BIT, flags);
}

/* A conntrack is confirmed once it is, or has been, in the hash table. */
static inline int nf_ct_is_confirmed(const struct nf_conn *ct)
{
        const unsigned long *flags = &ct->status;

        return test_bit(IPS_CONFIRMED_BIT, flags);
}

/* Non-zero when IPS_DYING_BIT is set in @ct's status word. */
static inline int nf_ct_is_dying(const struct nf_conn *ct)
{
        const unsigned long *flags = &ct->status;

        return test_bit(IPS_DYING_BIT, flags);
}

/* Packet is received from loopback */
static inline bool nf_is_loopback_packet(const struct sk_buff *skb)
{
        return skb->dev && skb->skb_iif && skb->dev->flags & IFF_LOOPBACK;
}

/*
 * Replace the reply-direction tuple of @ct with @newreply.  Only valid while
 * @ct is unconfirmed (not yet in the hash table); otherwise a warning is
 * raised and nothing is changed.
 */
static inline void nf_conntrack_alter_reply(struct nf_conn *ct,
                                            const struct nf_conntrack_tuple *newreply)
{
        if (!WARN_ON(nf_ct_is_confirmed(ct)))
                ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
}

/* Current time as the low 32 bits of jiffies; same unit as nf_conn::timeout. */
#define nfct_time_stamp ((u32)(jiffies))

/* jiffies until ct expires, 0 if already expired */
static inline unsigned long nf_ct_expires(const struct nf_conn *ct)
{
        s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;

        return max(timeout, 0);
}

static inline bool nf_ct_is_expired(const struct nf_conn *ct)
{
        return (__s32)(READ_ONCE(ct->timeout) - nfct_time_stamp) <= 0;
}

/* use after obtaining a reference count */
static inline bool nf_ct_should_gc(const struct nf_conn *ct)
{
        return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) &&
               !nf_ct_is_dying(ct);
}

/* One day (86400 seconds) expressed in jiffies. */
#define        NF_CT_DAY        (86400 * HZ)

/* Set an arbitrary timeout large enough to never expire in practice; this
 * saves us a check for the IPS_OFFLOAD_BIT in the packet path via
 * nf_ct_is_expired().  The timeout is pushed to one day from now whenever
 * less than half a day remains.
 */
static inline void nf_ct_offload_timeout(struct nf_conn *ct)
{
        if (nf_ct_expires(ct) >= NF_CT_DAY / 2)
                return;

        WRITE_ONCE(ct->timeout, nfct_time_stamp + NF_CT_DAY);
}

struct kernel_param;

int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp);
int nf_conntrack_hash_resize(unsigned int hashsize);

extern struct hlist_nulls_head *nf_conntrack_hash;
extern unsigned int nf_conntrack_htable_size;
extern seqcount_spinlock_t nf_conntrack_generation;
extern unsigned int nf_conntrack_max;

/*
 * Fetch a consistent (table pointer, table size) pair for the global
 * conntrack hash.  Both globals are sampled inside a seqcount read section on
 * nf_conntrack_generation and re-read if the sequence changed meanwhile, so
 * the returned pointer and size always belong together.
 *
 * @hash:  receives the hash table pointer
 * @hsize: receives the matching table size
 *
 * must be called with rcu read lock held
 */
static inline void
nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize)
{
        struct hlist_nulls_head *hptr;
        unsigned int sequence, hsz;

        do {
                sequence = read_seqcount_begin(&nf_conntrack_generation);
                hsz = nf_conntrack_htable_size;
                hptr = nf_conntrack_hash;
        } while (read_seqcount_retry(&nf_conntrack_generation, sequence));

        /* publish only after a retry-free read */
        *hash = hptr;
        *hsize = hsz;
}

struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
                                 const struct nf_conntrack_zone *zone,
                                 gfp_t flags);
void nf_ct_tmpl_free(struct nf_conn *tmpl);

u32 nf_ct_get_id(const struct nf_conn *ct);
u32 nf_conntrack_count(const struct net *net);

/*
 * Attach @ct to @skb: the conntrack pointer and @info are OR-ed into a single
 * word (the inverse of what nf_ct_get() unpacks).
 */
static inline void
nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info)
{
        const unsigned long packed = (unsigned long)ct | info;

        skb_set_nfct(skb, packed);
}

extern unsigned int nf_conntrack_net_id;

/* Look up the conntrack per-netns state of @net via nf_conntrack_net_id. */
static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
{
        struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);

        return cnet;
}

int nf_ct_skb_network_trim(struct sk_buff *skb, int family);
int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
                           u16 zone, u8 family, u8 *proto, u16 *mru);

/*
 * Per-CPU statistics helpers operating on (net)->ct.stat->count.
 * NF_CT_STAT_INC uses __this_cpu_inc(); the _ATOMIC variants use
 * this_cpu_inc()/this_cpu_add().
 * NOTE(review): presumably the non-_ATOMIC form requires the caller to have
 * preemption disabled — confirm against the percpu API.
 */
#define NF_CT_STAT_INC(net, count)          __this_cpu_inc((net)->ct.stat->count)
#define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count)
#define NF_CT_STAT_ADD_ATOMIC(net, count, v) this_cpu_add((net)->ct.stat->count, (v))

#define MODULE_ALIAS_NFCT_HELPER(helper) \
        MODULE_ALIAS("nfct-helper-" helper)

#endif /* _NF_CONNTRACK_H */























    1 









































































    1 


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM vmalloc

#if !defined(_TRACE_VMALLOC_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_VMALLOC_H

#include <linux/tracepoint.h>

/**
 * alloc_vmap_area - called when a new vmap allocation occurs
 * @addr:        the allocated address
 * @size:        the requested size
 * @align:        the requested alignment
 * @vstart:        start of the requested range
 * @vend:        end of the requested range
 * @failed:        whether the allocation failed
 *
 * This event is for debugging purposes. It gives a developer extra
 * information about how often allocations occur and which parameters
 * are passed, for further validation.
 */
TRACE_EVENT(alloc_vmap_area,

        TP_PROTO(unsigned long addr, unsigned long size, unsigned long align,
                unsigned long vstart, unsigned long vend, int failed),

        TP_ARGS(addr, size, align, vstart, vend, failed),

        TP_STRUCT__entry(
                __field(unsigned long, addr)
                __field(unsigned long, size)
                __field(unsigned long, align)
                __field(unsigned long, vstart)
                __field(unsigned long, vend)
                __field(int, failed)
        ),

        TP_fast_assign(
                __entry->addr = addr;
                __entry->size = size;
                __entry->align = align;
                __entry->vstart = vstart;
                __entry->vend = vend;
                __entry->failed = failed;
        ),

        TP_printk("va_start: %lu size=%lu align=%lu vstart=0x%lx vend=0x%lx failed=%d",
                __entry->addr, __entry->size, __entry->align,
                __entry->vstart, __entry->vend, __entry->failed)
);

/**
 * purge_vmap_area_lazy - called when vmap areas were lazily freed
 * @start:                purging start address
 * @end:                purging end address
 * @npurged:        number of purged vmap areas
 *
 * This event is for debugging purposes. It gives some indication
 * of the start:end range and of how many objects are released.
 */
TRACE_EVENT(purge_vmap_area_lazy,

        TP_PROTO(unsigned long start, unsigned long end,
                unsigned int npurged),

        TP_ARGS(start, end, npurged),

        TP_STRUCT__entry(
                __field(unsigned long, start)
                __field(unsigned long, end)
                __field(unsigned int, npurged)
        ),

        TP_fast_assign(
                __entry->start = start;
                __entry->end = end;
                __entry->npurged = npurged;
        ),

        TP_printk("start=0x%lx end=0x%lx num_purged=%u",
                __entry->start, __entry->end, __entry->npurged)
);

/**
 * free_vmap_area_noflush - called when a vmap area is freed
 * @va_start:                start address of the VA
 * @nr_lazy:                current number of lazy pages
 * @nr_lazy_max:        maximum number of lazy pages
 *
 * This event is for debugging purposes. It gives some indication
 * of the VA that is released, the number of currently outstanding
 * areas, and the maximum allowed threshold before all of them are
 * dropped.
 */
TRACE_EVENT(free_vmap_area_noflush,

        TP_PROTO(unsigned long va_start, unsigned long nr_lazy,
                unsigned long nr_lazy_max),

        TP_ARGS(va_start, nr_lazy, nr_lazy_max),

        TP_STRUCT__entry(
                __field(unsigned long, va_start)
                __field(unsigned long, nr_lazy)
                __field(unsigned long, nr_lazy_max)
        ),

        TP_fast_assign(
                __entry->va_start = va_start;
                __entry->nr_lazy = nr_lazy;
                __entry->nr_lazy_max = nr_lazy_max;
        ),

        TP_printk("va_start=0x%lx nr_lazy=%lu nr_lazy_max=%lu",
                __entry->va_start, __entry->nr_lazy, __entry->nr_lazy_max)
);

#endif /*  _TRACE_VMALLOC_H */

/* This part must be outside protection */
#include <trace/define_trace.h>

























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CURRENT_H
#define _ASM_X86_CURRENT_H

#include <linux/build_bug.h>
#include <linux/compiler.h>

#ifndef __ASSEMBLY__

#include <linux/cache.h>
#include <asm/percpu.h>

struct task_struct;

/*
 * Per-CPU data grouped into one fixed-size record.  The anonymous union with
 * pad[64] fixes the size at exactly 64 bytes, which the static_assert below
 * enforces at build time.
 * NOTE(review): 64 bytes presumably matches one cache line — confirm.
 */
struct pcpu_hot {
        union {
                struct {
                        /* task returned by get_current() */
                        struct task_struct        *current_task;
                        int                        preempt_count;
                        int                        cpu_number;
#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
                        u64                        call_depth;
#endif
                        unsigned long                top_of_stack;
                        void                        *hardirq_stack_ptr;
                        u16                        softirq_pending;
#ifdef CONFIG_X86_64
                        bool                        hardirq_stack_inuse;
#else
                        void                        *softirq_stack_ptr;
#endif
                };
                /* forces sizeof(struct pcpu_hot) to 64 */
                u8        pad[64];
        };
};
static_assert(sizeof(struct pcpu_hot) == 64);

DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);

/* const-qualified alias to pcpu_hot, aliased by linker. */
DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override,
                        const_pcpu_hot);

/*
 * Return the task recorded in this CPU's pcpu_hot.current_task.  With
 * CONFIG_USE_X86_SEG_SUPPORT the read goes through the const-qualified alias
 * const_pcpu_hot; otherwise this_cpu_read_stable() on pcpu_hot is used.
 */
static __always_inline struct task_struct *get_current(void)
{
        if (!IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
                return this_cpu_read_stable(pcpu_hot.current_task);

        return this_cpu_read_const(const_pcpu_hot.current_task);
}

#define current get_current()

#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_CURRENT_H */
















































































    3 




    5 







    3 
    3 




























































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause)
/* Copyright (C) 2016-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 *
 * SipHash: a fast short-input PRF
 * https://131002.net/siphash/
 *
 * This implementation is specifically for SipHash2-4 for a secure PRF
 * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
 * hashtables.
 */

#include <linux/siphash.h>
#include <asm/unaligned.h>

#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
#include <linux/dcache.h>
#include <asm/word-at-a-time.h>
#endif

/* One SipHash permutation round over the local state words v0..v3. */
#define SIPROUND SIPHASH_PERMUTATION(v0, v1, v2, v3)

/*
 * Declare and initialize the hashing state in the calling function:
 * v0..v3 start from the SIPHASH_CONST_* values XOR-ed with the two key words,
 * and b is seeded with @len in its top byte (len << 56).  Expects a
 * 'key' pointer in scope.  Because it declares variables, it must appear
 * where declarations are allowed.
 */
#define PREAMBLE(len) \
        u64 v0 = SIPHASH_CONST_0; \
        u64 v1 = SIPHASH_CONST_1; \
        u64 v2 = SIPHASH_CONST_2; \
        u64 v3 = SIPHASH_CONST_3; \
        u64 b = ((u64)(len)) << 56; \
        v3 ^= key->key[1]; \
        v2 ^= key->key[0]; \
        v1 ^= key->key[1]; \
        v0 ^= key->key[0];

/*
 * Absorb the final word b (two rounds), then finalize (XOR 0xff into v2,
 * four rounds) and return the XOR of all four state words.  This contains a
 * 'return' statement, so it must be the last thing in the function.
 */
#define POSTAMBLE \
        v3 ^= b; \
        SIPROUND; \
        SIPROUND; \
        v0 ^= b; \
        v2 ^= 0xff; \
        SIPROUND; \
        SIPROUND; \
        SIPROUND; \
        SIPROUND; \
        return (v0 ^ v1) ^ (v2 ^ v3);

#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
/*
 * SipHash of @len bytes at @data under @key, reading whole words with
 * le64_to_cpup().
 * NOTE(review): presumably @data must be suitably aligned for u64 loads —
 * confirm against the siphash() wrapper in <linux/siphash.h>.
 *
 * Returns the 64-bit hash value.
 */
u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key)
{
        /* end of the last complete 8-byte word */
        const u8 *end = data + len - (len % sizeof(u64));
        /* number of trailing bytes (0..7) after the last complete word */
        const u8 left = len & (sizeof(u64) - 1);
        u64 m;
        PREAMBLE(len)
        /* absorb each complete little-endian 64-bit word with two rounds */
        for (; data != end; data += sizeof(u64)) {
                m = le64_to_cpup(data);
                v3 ^= m;
                SIPROUND;
                SIPROUND;
                v0 ^= m;
        }
        /* here data == end; pack the trailing bytes into the low bytes of b */
#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
        if (left)
                b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
                                                  bytemask_from_count(left)));
#else
        switch (left) {
        case 7: b |= ((u64)end[6]) << 48; fallthrough;
        case 6: b |= ((u64)end[5]) << 40; fallthrough;
        case 5: b |= ((u64)end[4]) << 32; fallthrough;
        case 4: b |= le32_to_cpup(data); break;
        case 3: b |= ((u64)end[2]) << 16; fallthrough;
        case 2: b |= le16_to_cpup(data); break;
        case 1: b |= end[0];
        }
#endif
        POSTAMBLE
}
EXPORT_SYMBOL(__siphash_aligned);
#endif

/*
 * SipHash of @len bytes at @data under @key, reading words with the
 * get_unaligned_le*() helpers so that @data needs no particular alignment.
 *
 * Returns the 64-bit hash value.
 */
u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key)
{
        /* end of the last complete 8-byte word */
        const u8 *end = data + len - (len % sizeof(u64));
        /* number of trailing bytes (0..7) after the last complete word */
        const u8 left = len & (sizeof(u64) - 1);
        u64 m;
        PREAMBLE(len)
        /* absorb each complete little-endian 64-bit word with two rounds */
        for (; data != end; data += sizeof(u64)) {
                m = get_unaligned_le64(data);
                v3 ^= m;
                SIPROUND;
                SIPROUND;
                v0 ^= m;
        }
        /* here data == end; pack the trailing bytes into the low bytes of b */
#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
        if (left)
                b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
                                                  bytemask_from_count(left)));
#else
        switch (left) {
        case 7: b |= ((u64)end[6]) << 48; fallthrough;
        case 6: b |= ((u64)end[5]) << 40; fallthrough;
        case 5: b |= ((u64)end[4]) << 32; fallthrough;
        case 4: b |= get_unaligned_le32(end); break;
        case 3: b |= ((u64)end[2]) << 16; fallthrough;
        case 2: b |= get_unaligned_le16(end); break;
        case 1: b |= end[0];
        }
#endif
        POSTAMBLE
}
EXPORT_SYMBOL(__siphash_unaligned);

/**
 * siphash_1u64 - compute 64-bit siphash PRF value of a u64
 * @first: first u64
 * @key: the siphash key
 *
 * The input is absorbed as one 8-byte word (length 8).
 *
 * Return: the 64-bit hash value.
 */
u64 siphash_1u64(const u64 first, const siphash_key_t *key)
{
        PREAMBLE(8)
        v3 ^= first;
        SIPROUND;
        SIPROUND;
        v0 ^= first;
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_1u64);

/**
 * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64
 * @first: first u64
 * @second: second u64
 * @key: the siphash key
 *
 * The inputs are absorbed in argument order as two 8-byte words (length 16).
 *
 * Return: the 64-bit hash value.
 */
u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key)
{
        PREAMBLE(16)
        v3 ^= first;
        SIPROUND;
        SIPROUND;
        v0 ^= first;
        v3 ^= second;
        SIPROUND;
        SIPROUND;
        v0 ^= second;
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_2u64);

/**
 * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64
 * @first: first u64
 * @second: second u64
 * @third: third u64
 * @key: the siphash key
 *
 * The inputs are absorbed in argument order as three 8-byte words
 * (length 24).
 *
 * Return: the 64-bit hash value.
 */
u64 siphash_3u64(const u64 first, const u64 second, const u64 third,
                 const siphash_key_t *key)
{
        PREAMBLE(24)
        v3 ^= first;
        SIPROUND;
        SIPROUND;
        v0 ^= first;
        v3 ^= second;
        SIPROUND;
        SIPROUND;
        v0 ^= second;
        v3 ^= third;
        SIPROUND;
        SIPROUND;
        v0 ^= third;
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_3u64);

/**
 * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64
 * @first: first u64
 * @second: second u64
 * @third: third u64
 * @forth: fourth u64
 * @key: the siphash key
 *
 * The inputs are absorbed in argument order as four 8-byte words
 * (length 32).
 *
 * Return: the 64-bit hash value.
 */
u64 siphash_4u64(const u64 first, const u64 second, const u64 third,
                 const u64 forth, const siphash_key_t *key)
{
        PREAMBLE(32)
        v3 ^= first;
        SIPROUND;
        SIPROUND;
        v0 ^= first;
        v3 ^= second;
        SIPROUND;
        SIPROUND;
        v0 ^= second;
        v3 ^= third;
        SIPROUND;
        SIPROUND;
        v0 ^= third;
        v3 ^= forth;
        SIPROUND;
        SIPROUND;
        v0 ^= forth;
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_4u64);

/**
 * siphash_1u32 - compute 64-bit siphash PRF value of a u32
 * @first: first u32
 * @key: the siphash key
 *
 * The 4-byte input is shorter than one 8-byte word, so it is placed in the
 * low bits of the final word b (whose top byte holds the length, 4) and only
 * the finalization in POSTAMBLE runs.
 *
 * Return: the 64-bit hash value.
 */
u64 siphash_1u32(const u32 first, const siphash_key_t *key)
{
        PREAMBLE(4)
        b |= first;
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_1u32);

/**
 * siphash_3u32 - compute 64-bit siphash PRF value of 3 u32
 * @first: first u32
 * @second: second u32
 * @third: third u32
 * @key: the siphash key
 *
 * @first (low half) and @second (high half) are combined into one 8-byte
 * word that is absorbed normally; @third goes into the low bits of the final
 * word b together with the length (12).
 *
 * Return: the 64-bit hash value.
 */
u64 siphash_3u32(const u32 first, const u32 second, const u32 third,
                 const siphash_key_t *key)
{
        u64 combined = (u64)second << 32 | first;
        PREAMBLE(12)
        v3 ^= combined;
        SIPROUND;
        SIPROUND;
        v0 ^= combined;
        b |= third;
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_3u32);

#if BITS_PER_LONG == 64
/* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for
 * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3.
 *
 * Consequently the H* macros here reuse the 64-bit round and preamble, and
 * HPOSTAMBLE uses one round to absorb b and three rounds to finalize.  The
 * 64-bit XOR it returns is truncated by the u32 return type of the callers.
 */

#define HSIPROUND SIPROUND
#define HPREAMBLE(len) PREAMBLE(len)
#define HPOSTAMBLE \
        v3 ^= b; \
        HSIPROUND; \
        v0 ^= b; \
        v2 ^= 0xff; \
        HSIPROUND; \
        HSIPROUND; \
        HSIPROUND; \
        return (v0 ^ v1) ^ (v2 ^ v3);

#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
/*
 * 64-bit build: hash @len bytes at @data under @key with one round per
 * 8-byte word, reading words with le64_to_cpup().
 * NOTE(review): presumably @data must be suitably aligned for u64 loads —
 * confirm against the hsiphash() wrapper in <linux/siphash.h>.
 *
 * Returns the hash truncated to 32 bits.
 */
u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
{
        /* end of the last complete 8-byte word */
        const u8 *end = data + len - (len % sizeof(u64));
        /* number of trailing bytes (0..7) after the last complete word */
        const u8 left = len & (sizeof(u64) - 1);
        u64 m;
        HPREAMBLE(len)
        for (; data != end; data += sizeof(u64)) {
                m = le64_to_cpup(data);
                v3 ^= m;
                HSIPROUND;
                v0 ^= m;
        }
        /* here data == end; pack the trailing bytes into the low bytes of b */
#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
        if (left)
                b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
                                                  bytemask_from_count(left)));
#else
        switch (left) {
        case 7: b |= ((u64)end[6]) << 48; fallthrough;
        case 6: b |= ((u64)end[5]) << 40; fallthrough;
        case 5: b |= ((u64)end[4]) << 32; fallthrough;
        case 4: b |= le32_to_cpup(data); break;
        case 3: b |= ((u64)end[2]) << 16; fallthrough;
        case 2: b |= le16_to_cpup(data); break;
        case 1: b |= end[0];
        }
#endif
        HPOSTAMBLE
}
EXPORT_SYMBOL(__hsiphash_aligned);
#endif

/*
 * 64-bit build: hash @len bytes at @data under @key with one round per
 * 8-byte word, reading words with the get_unaligned_le*() helpers so that
 * @data needs no particular alignment.
 *
 * Returns the hash truncated to 32 bits.
 */
u32 __hsiphash_unaligned(const void *data, size_t len,
                         const hsiphash_key_t *key)
{
        /* end of the last complete 8-byte word */
        const u8 *end = data + len - (len % sizeof(u64));
        /* number of trailing bytes (0..7) after the last complete word */
        const u8 left = len & (sizeof(u64) - 1);
        u64 m;
        HPREAMBLE(len)
        for (; data != end; data += sizeof(u64)) {
                m = get_unaligned_le64(data);
                v3 ^= m;
                HSIPROUND;
                v0 ^= m;
        }
        /* here data == end; pack the trailing bytes into the low bytes of b */
#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
        if (left)
                b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
                                                  bytemask_from_count(left)));
#else
        switch (left) {
        case 7: b |= ((u64)end[6]) << 48; fallthrough;
        case 6: b |= ((u64)end[5]) << 40; fallthrough;
        case 5: b |= ((u64)end[4]) << 32; fallthrough;
        case 4: b |= get_unaligned_le32(end); break;
        case 3: b |= ((u64)end[2]) << 16; fallthrough;
        case 2: b |= get_unaligned_le16(end); break;
        case 1: b |= end[0];
        }
#endif
        HPOSTAMBLE
}
EXPORT_SYMBOL(__hsiphash_unaligned);

/**
 * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
 * @first: first u32
 * @key: the hsiphash key
 *
 * 64-bit build: the 4-byte input is placed in the low bits of the final
 * word b (length 4); no full 8-byte word is absorbed.
 *
 * Return: the 32-bit hash value.
 */
u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
{
        HPREAMBLE(4)
        b |= first;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_1u32);

/**
 * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
 * @first: first u32
 * @second: second u32
 * @key: the hsiphash key
 *
 * 64-bit build: @first (low half) and @second (high half) are combined into
 * one 8-byte word that is absorbed with a single round (length 8).
 *
 * Return: the 32-bit hash value.
 */
u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
{
        u64 combined = (u64)second << 32 | first;
        HPREAMBLE(8)
        v3 ^= combined;
        HSIPROUND;
        v0 ^= combined;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_2u32);

/**
 * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
 * @first: first u32
 * @second: second u32
 * @third: third u32
 * @key: the hsiphash key
 *
 * 64-bit build: @first and @second form one absorbed 8-byte word; @third is
 * placed in the low bits of the final word b (length 12).
 *
 * Return: the 32-bit hash value.
 */
u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
                  const hsiphash_key_t *key)
{
        u64 combined = (u64)second << 32 | first;
        HPREAMBLE(12)
        v3 ^= combined;
        HSIPROUND;
        v0 ^= combined;
        b |= third;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_3u32);

/**
 * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
 * @first: first u32
 * @second: second u32
 * @third: third u32
 * @forth: fourth u32
 * @key: the hsiphash key
 *
 * 64-bit build: the inputs are paired into two 8-byte words (@first/@second,
 * then @third/@forth, low half first), each absorbed with a single round
 * (length 16).
 *
 * Return: the 32-bit hash value.
 */
u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
                  const u32 forth, const hsiphash_key_t *key)
{
        u64 combined = (u64)second << 32 | first;
        HPREAMBLE(16)
        v3 ^= combined;
        HSIPROUND;
        v0 ^= combined;
        combined = (u64)forth << 32 | third;
        v3 ^= combined;
        HSIPROUND;
        v0 ^= combined;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_4u32);
#else
/* 32-bit build: one HalfSipHash permutation round over the local state v0..v3. */
#define HSIPROUND HSIPHASH_PERMUTATION(v0, v1, v2, v3)

/*
 * Declare and initialize the 32-bit hashing state in the calling function:
 * v0..v3 start from the HSIPHASH_CONST_* values XOR-ed with the two key
 * words, and b is seeded with @len in its top byte (len << 24).  Expects a
 * 'key' pointer in scope.
 */
#define HPREAMBLE(len) \
        u32 v0 = HSIPHASH_CONST_0; \
        u32 v1 = HSIPHASH_CONST_1; \
        u32 v2 = HSIPHASH_CONST_2; \
        u32 v3 = HSIPHASH_CONST_3; \
        u32 b = ((u32)(len)) << 24; \
        v3 ^= key->key[1]; \
        v2 ^= key->key[0]; \
        v1 ^= key->key[1]; \
        v0 ^= key->key[0];

/*
 * Absorb the final word b (one round), then finalize (XOR 0xff into v2,
 * three rounds) and return v1 ^ v3.  This contains a 'return' statement, so
 * it must be the last thing in the function.
 */
#define HPOSTAMBLE \
        v3 ^= b; \
        HSIPROUND; \
        v0 ^= b; \
        v2 ^= 0xff; \
        HSIPROUND; \
        HSIPROUND; \
        HSIPROUND; \
        return v1 ^ v3;

#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
/*
 * 32-bit build: HalfSipHash of @len bytes at @data under @key, reading
 * 4-byte words with le32_to_cpup().
 * NOTE(review): presumably @data must be suitably aligned for u32 loads —
 * confirm against the hsiphash() wrapper in <linux/siphash.h>.
 *
 * Returns the 32-bit hash value.
 */
u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
{
        /* end of the last complete 4-byte word */
        const u8 *end = data + len - (len % sizeof(u32));
        /* number of trailing bytes (0..3) after the last complete word */
        const u8 left = len & (sizeof(u32) - 1);
        u32 m;
        HPREAMBLE(len)
        for (; data != end; data += sizeof(u32)) {
                m = le32_to_cpup(data);
                v3 ^= m;
                HSIPROUND;
                v0 ^= m;
        }
        /* here data == end; pack the trailing bytes into the low bytes of b */
        switch (left) {
        case 3: b |= ((u32)end[2]) << 16; fallthrough;
        case 2: b |= le16_to_cpup(data); break;
        case 1: b |= end[0];
        }
        HPOSTAMBLE
}
EXPORT_SYMBOL(__hsiphash_aligned);
#endif

/*
 * __hsiphash_unaligned - hash @len bytes at @data under @key, 32-bit-word
 * variant that reads its input through the get_unaligned_le*() helpers.
 *
 * Whole 32-bit words are absorbed one per round; the 0-3 bytes left over are
 * packed little-endian into the low 24 bits of the final word b, whose top
 * byte already carries the length (see HPREAMBLE).
 *
 * Returns the value produced by HPOSTAMBLE (v1 ^ v3 after finalisation).
 */
u32 __hsiphash_unaligned(const void *data, size_t len,
                         const hsiphash_key_t *key)
{
        const u8 tail_len = len % sizeof(u32);
        /* First byte past the last whole word. */
        const u8 *tail = data + (len - tail_len);
        u32 word;
        HPREAMBLE(len)

        /* Absorb every complete 32-bit word. */
        while (data != tail) {
                word = get_unaligned_le32(data);
                v3 ^= word;
                HSIPROUND;
                v0 ^= word;
                data += sizeof(u32);
        }

        /* Fold the trailing bytes into b. */
        if (tail_len == 1) {
                b |= tail[0];
        } else if (tail_len >= 2) {
                if (tail_len == 3)
                        b |= ((u32)tail[2]) << 16;
                b |= get_unaligned_le16(tail);
        }
        HPOSTAMBLE
}
EXPORT_SYMBOL(__hsiphash_unaligned);

/**
 * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
 * @first: first u32
 * @key: the hsiphash key
 *
 * Return: the 32-bit value returned by HPOSTAMBLE (v1 ^ v3) after @first has
 * been absorbed into the state.
 */
u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
{
        /* Declares v0..v3 and b; 4 is the input length in bytes. */
        HPREAMBLE(4)
        /* Absorb the word: XOR into v3, one round, XOR into v0. */
        v3 ^= first;
        HSIPROUND;
        v0 ^= first;
        /* Finalises the state and contains the return statement. */
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_1u32);

/**
 * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
 * @first: first u32
 * @second: second u32
 * @key: the hsiphash key
 */
u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
{
        HPREAMBLE(8)
        v3 ^= first;
        HSIPROUND;
        v0 ^= first;
        v3 ^= second;
        HSIPROUND;
        v0 ^= second;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_2u32);

/**
 * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
 * @first: first u32
 * @second: second u32
 * @third: third u32
 * @key: the hsiphash key
 */
u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
                  const hsiphash_key_t *key)
{
        HPREAMBLE(12)
        v3 ^= first;
        HSIPROUND;
        v0 ^= first;
        v3 ^= second;
        HSIPROUND;
        v0 ^= second;
        v3 ^= third;
        HSIPROUND;
        v0 ^= third;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_3u32);

/**
 * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
 * @first: first u32
 * @second: second u32
 * @third: third u32
 * @forth: forth u32
 * @key: the hsiphash key
 */
u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
                  const u32 forth, const hsiphash_key_t *key)
{
        HPREAMBLE(16)
        v3 ^= first;
        HSIPROUND;
        v0 ^= first;
        v3 ^= second;
        HSIPROUND;
        v0 ^= second;
        v3 ^= third;
        HSIPROUND;
        v0 ^= third;
        v3 ^= forth;
        HSIPROUND;
        v0 ^= forth;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_4u32);
#endif










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 














    1 




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                PACKET - implements raw packet sockets.
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *                Alan Cox        :        verify_area() now used correctly
 *                Alan Cox        :        new skbuff lists, look ma no backlogs!
 *                Alan Cox        :        tidied skbuff lists.
 *                Alan Cox        :        Now uses generic datagram routines I
 *                                        added. Also fixed the peek/read crash
 *                                        from all old Linux datagram code.
 *                Alan Cox        :        Uses the improved datagram code.
 *                Alan Cox        :        Added NULL's for socket options.
 *                Alan Cox        :        Re-commented the code.
 *                Alan Cox        :        Use new kernel side addressing
 *                Rob Janssen        :        Correct MTU usage.
 *                Dave Platt        :        Counter leaks caused by incorrect
 *                                        interrupt locking and some slightly
 *                                        dubious gcc output. Can you read
 *                                        compiler: it said _VOLATILE_
 *        Richard Kooijman        :        Timestamp fixes.
 *                Alan Cox        :        New buffers. Use sk->mac.raw.
 *                Alan Cox        :        sendmsg/recvmsg support.
 *                Alan Cox        :        Protocol setting support
 *        Alexey Kuznetsov        :        Untied from IPv4 stack.
 *        Cyrus Durgin                :        Fixed kerneld for kmod.
 *        Michal Ostrowski        :       Module initialization cleanup.
 *         Ulises Alonso        :       Frame number limit removal and
 *                                      packet_set_ring memory leak.
 *                Eric Biederman        :        Allow for > 8 byte hardware addresses.
 *                                        The convention is that longer addresses
 *                                        will simply extend the hardware address
 *                                        byte arrays at the end of sockaddr_ll
 *                                        and packet_mreq.
 *                Johann Baudy        :        Added TX RING.
 *                Chetan Loke        :        Implemented TPACKET_V3 block abstraction
 *                                        layer.
 *                                        Copyright (C) 2011, <lokec@ccs.neu.edu>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/ethtool.h>
#include <linux/filter.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>
#include <linux/netfilter_netdev.h>

#include "internal.h"

/*
   Assumptions:
   - If the device has no dev->header_ops->create, there is no LL header
     visible above the device. In this case, its hard_header_len should be 0.
     The device may prepend its own header internally. In this case, its
     needed_headroom should be set to the space needed for it to add its
     internal header.
     For example, a WiFi driver pretending to be an Ethernet driver should
     set its hard_header_len to be the Ethernet header length, and set its
     needed_headroom to be (the real WiFi header length - the fake Ethernet
     header length).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev_has_header(dev) == true
   mac_header -> ll header
   data       -> data

Outgoing, dev_has_header(dev) == true
   mac_header -> ll header
   data       -> ll header

Incoming, dev_has_header(dev) == false
   mac_header -> data
     However drivers often make it point to the ll header.
     This is incorrect because the ll header should be invisible to us.
   data       -> data

Outgoing, dev_has_header(dev) == false
   mac_header -> data. ll header is invisible to us.
   data       -> data

Summary
  If dev_has_header(dev) == false, we are unable to restore the ll header
    because it is invisible to us.


On transmit:
------------

dev_has_header(dev) == true
   mac_header -> ll header
   data       -> ll header

dev_has_header(dev) == false (ll header is invisible to us)
   mac_header -> data
   data       -> data

   We should set network_header on output to the correct position,
   packet classifier depends on it.
 */

/* Private packet socket structures. */

/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
        int                mr_ifindex;
        unsigned short        mr_type;
        unsigned short        mr_alen;
        /* Address bytes; the array is sized by MAX_ADDR_LEN. */
        unsigned char        mr_address[MAX_ADDR_LEN];
};

/* One ring-frame pointer viewed as each supported TPACKET header layout.
 * Users in this file store the frame address in ->raw and then pick h1/h2/h3
 * according to the socket's tp_version.
 */
union tpacket_uhdr {
        struct tpacket_hdr  *h1;
        struct tpacket2_hdr *h2;
        struct tpacket3_hdr *h3;
        void *raw;
};

/* Forward declaration; the function is static, so its definition lives
 * elsewhere in this translation unit.
 */
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                int closing, int tx_ring);

/* Alignment (in bytes) used for the TPACKET_V3 block header, the private
 * area and per-packet lengths.
 */
#define V3_ALIGNMENT        (8)

/* Size of struct tpacket_block_desc rounded up to V3_ALIGNMENT. */
#define BLK_HDR_LEN        (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

/* Aligned block header followed by an aligned private area of sz_of_priv
 * bytes.
 */
#define BLK_PLUS_PRIV(sz_of_priv) \
        (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

/* Field accessors; x is a pointer to a struct tpacket_block_desc. */
#define BLOCK_STATUS(x)        ((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)        ((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)                ((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)                ((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)                ((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)        ((x)->offset_to_priv)

/* Forward declarations of static helpers that are referenced before their
 * definitions.
 */
struct packet_sock;
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
/* TPACKET_V3 block ("prb_") helpers. */
static int prb_curr_blk_in_use(struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
                        struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
                struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
                struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(struct timer_list *);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
                struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
                struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
static u16 packet_pick_tx_queue(struct sk_buff *skb);

/* Layout this file imposes on skb->cb; accessed through PACKET_SKB_CB(). */
struct packet_skb_cb {
        union {
                struct sockaddr_pkt pkt;
                union {
                        /* Trick: alias skb original length with
                         * ll.sll_family and ll.protocol in order
                         * to save room.
                         */
                        unsigned int origlen;
                        struct sockaddr_ll ll;
                };
        } sa;
};

/* Shorthand for virtio_legacy_is_little_endian(). */
#define vio_le() virtio_legacy_is_little_endian()

/* View an skb's control buffer (->cb) as a struct packet_skb_cb. */
#define PACKET_SKB_CB(__skb)        ((struct packet_skb_cb *)((__skb)->cb))

/* Block-descriptor queue core embedded in ring buffer x (->prb_bdqc). */
#define GET_PBDQC_FROM_RB(x)        ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
/* Block descriptor at index bid of queue core x. */
#define GET_PBLOCK_DESC(x, bid)        \
        ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
/* Block descriptor of the currently active block (x->kactive_blk_num). */
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)        \
        ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
/* Index following the active block, wrapping to 0 after knum_blocks-1. */
#define GET_NEXT_PRB_BLK_NUM(x) \
        (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
        ((x)->kactive_blk_num+1) : 0)

/* Forward declarations; used by the prot-hook (un)registration helpers when
 * po->fanout is set.
 */
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

#ifdef CONFIG_NETFILTER_EGRESS
/*
 * Pass every skb of the ->next-linked list starting at @skb through
 * nf_hook_egress(). Each skb is taken off the list before the hook runs;
 * skbs for which the hook returns NULL are left out, and the remaining ones
 * are chained back together in their original order.
 *
 * The walk runs inside an RCU read-side critical section.
 *
 * Returns the head of the re-chained list, or NULL if nothing is left.
 */
static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb)
{
        struct sk_buff *head = NULL, *tail, *cur;
        int rc;

        rcu_read_lock();
        while (skb != NULL) {
                cur = skb;
                /* Fetch the successor before cur is marked as off-list. */
                skb = cur->next;
                skb_mark_not_on_list(cur);

                if (!nf_hook_egress(cur, &rc, cur->dev))
                        continue;

                /* tail is only read once head has been set. */
                if (head)
                        tail->next = cur;
                else
                        head = cur;

                tail = cur;
        }
        rcu_read_unlock();

        return head;
}
#endif

/*
 * Transmit @skb on behalf of packet socket @po.
 *
 * Without PACKET_SOCK_QDISC_BYPASS the skb goes through dev_queue_xmit().
 * With the flag set it is sent via dev_direct_xmit() on the queue chosen by
 * packet_pick_tx_queue(); when CONFIG_NETFILTER_EGRESS is enabled and the
 * egress hook is active, the skb is run through nf_hook_direct_egress()
 * first and NET_XMIT_DROP is returned if nothing is left to send.
 */
static int packet_xmit(const struct packet_sock *po, struct sk_buff *skb)
{
        if (packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS)) {
#ifdef CONFIG_NETFILTER_EGRESS
                if (nf_hook_egress_active()) {
                        skb = nf_hook_direct_egress(skb);
                        if (!skb)
                                return NET_XMIT_DROP;
                }
#endif
                return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
        }

        return dev_queue_xmit(skb);
}

/*
 * Read po->cached_dev under rcu_read_lock(), call dev_hold() on it and
 * return it. The pointer is passed to dev_hold() without a NULL check here.
 */
static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
        struct net_device *cached;

        rcu_read_lock();
        cached = rcu_dereference(po->cached_dev);
        dev_hold(cached);
        rcu_read_unlock();

        return cached;
}

/* Publish @dev as the socket's cached device (po->cached_dev) using
 * rcu_assign_pointer(); readers use rcu_dereference() in
 * packet_cached_dev_get().
 */
static void packet_cached_dev_assign(struct packet_sock *po,
                                     struct net_device *dev)
{
        rcu_assign_pointer(po->cached_dev, dev);
}

/* Clear the socket's cached device pointer (po->cached_dev = NULL). */
static void packet_cached_dev_reset(struct packet_sock *po)
{
        RCU_INIT_POINTER(po->cached_dev, NULL);
}

/*
 * Choose the TX queue index for @skb on skb->dev.
 *
 * Records (current CPU % real_num_tx_queues) as the skb's rx queue and, with
 * CONFIG_XPS, stores cpu + 1 in skb->sender_cpu. If the driver provides
 * ndo_select_queue, its answer is passed through netdev_cap_txqueue();
 * otherwise netdev_pick_tx() decides.
 */
static u16 packet_pick_tx_queue(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        const struct net_device_ops *ops = dev->netdev_ops;
        int cpu = raw_smp_processor_id();

#ifdef CONFIG_XPS
        skb->sender_cpu = cpu + 1;
#endif
        skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);

        if (!ops->ndo_select_queue)
                return netdev_pick_tx(dev, skb, NULL);

        return netdev_cap_txqueue(dev, ops->ndo_select_queue(dev, skb, NULL));
}

/* __register_prot_hook must be invoked through register_prot_hook
 * or from a context in which asynchronous accesses to the packet
 * socket are not possible (packet_create()).
 *
 * Does nothing if the socket is already flagged PACKET_SOCK_RUNNING.
 * Otherwise the hook is attached (through the fanout group when po->fanout
 * is set, else via dev_add_pack()), a socket reference is taken and the
 * RUNNING flag is set.
 */
static void __register_prot_hook(struct sock *sk)
{
        struct packet_sock *po = pkt_sk(sk);

        if (packet_sock_flag(po, PACKET_SOCK_RUNNING))
                return;

        if (!po->fanout)
                dev_add_pack(&po->prot_hook);
        else
                __fanout_link(sk, po);

        sock_hold(sk);
        packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1);
}

/* Checked entry point for __register_prot_hook(): asserts via lockdep that
 * the socket's bind_lock is held, then registers the hook.
 */
static void register_prot_hook(struct sock *sk)
{
        lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
        __register_prot_hook(sk);
}

/* Detach the socket's protocol hook; po->bind_lock must be held.
 *
 * If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
        struct packet_sock *po = pkt_sk(sk);

        lockdep_assert_held_once(&po->bind_lock);

        packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0);

        if (!po->fanout)
                __dev_remove_pack(&po->prot_hook);
        else
                __fanout_unlink(sk, po);

        __sock_put(sk);

        if (!sync)
                return;

        /* bind_lock is released around synchronize_net() and retaken. */
        spin_unlock(&po->bind_lock);
        synchronize_net();
        spin_lock(&po->bind_lock);
}

/* Unregister the socket's protocol hook only if the socket is currently
 * flagged PACKET_SOCK_RUNNING; @sync is forwarded to
 * __unregister_prot_hook().
 */
static void unregister_prot_hook(struct sock *sk, bool sync)
{
        struct packet_sock *po = pkt_sk(sk);

        if (packet_sock_flag(po, PACKET_SOCK_RUNNING))
                __unregister_prot_hook(sk, sync);
}

/* Map a ring-buffer address to its struct page, using vmalloc_to_page() for
 * vmalloc addresses and virt_to_page() for everything else.
 */
static inline struct page * __pure pgv_to_page(void *addr)
{
        return is_vmalloc_addr(addr) ? vmalloc_to_page(addr)
                                     : virt_to_page(addr);
}

/*
 * Store @status in the tp_status field of the ring frame at @frame, using
 * the header layout selected by po->tp_version, and flush the data-cache
 * page containing that field. An unknown version hits WARN() and BUG().
 */
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
        union tpacket_uhdr h;

        /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                WRITE_ONCE(h.h1->tp_status, status);
                flush_dcache_page(pgv_to_page(&h.h1->tp_status));
                break;
        case TPACKET_V2:
                WRITE_ONCE(h.h2->tp_status, status);
                flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                break;
        case TPACKET_V3:
                WRITE_ONCE(h.h3->tp_status, status);
                flush_dcache_page(pgv_to_page(&h.h3->tp_status));
                break;
        default:
                WARN(1, "TPACKET version not supported.\n");
                BUG();
        }

        /* Write barrier issued after the status store and cache flush. */
        smp_wmb();
}

/*
 * Return the tp_status field of the ring frame at @frame, read through the
 * header layout selected by po->tp_version. The data-cache page holding the
 * field is flushed before the read. An unknown version hits WARN() and
 * BUG().
 */
static int __packet_get_status(const struct packet_sock *po, void *frame)
{
        union tpacket_uhdr h;

        /* Read barrier issued before the status field is looked at. */
        smp_rmb();

        /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                flush_dcache_page(pgv_to_page(&h.h1->tp_status));
                return READ_ONCE(h.h1->tp_status);
        case TPACKET_V2:
                flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                return READ_ONCE(h.h2->tp_status);
        case TPACKET_V3:
                flush_dcache_page(pgv_to_page(&h.h3->tp_status));
                return READ_ONCE(h.h3->tp_status);
        default:
                WARN(1, "TPACKET version not supported.\n");
                BUG();
                return 0;
        }
}

/*
 * Pick a timestamp for @skb according to the SOF_TIMESTAMPING_* bits in
 * @flags and store it in @ts.
 *
 * The hardware timestamp is tried first (SOF_TIMESTAMPING_RAW_HARDWARE),
 * then the skb's software timestamp (SOF_TIMESTAMPING_SOFTWARE); a source
 * counts only if ktime_to_timespec64_cond() accepts its value.
 *
 * Returns TP_STATUS_TS_RAW_HARDWARE, TP_STATUS_TS_SOFTWARE, or 0 when no
 * timestamp was stored.
 */
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
                                   unsigned int flags)
{
        struct skb_shared_hwtstamps *hw = skb_hwtstamps(skb);

        if (hw && (flags & SOF_TIMESTAMPING_RAW_HARDWARE)) {
                if (ktime_to_timespec64_cond(hw->hwtstamp, ts))
                        return TP_STATUS_TS_RAW_HARDWARE;
        }

        if (flags & SOF_TIMESTAMPING_SOFTWARE) {
                if (ktime_to_timespec64_cond(skb_tstamp(skb), ts))
                        return TP_STATUS_TS_SOFTWARE;
        }

        return 0;
}

/*
 * Write the timestamp of @skb into the ring frame header at @frame.
 *
 * The timestamp source is chosen by tpacket_get_timestamp() from
 * po->tp_tstamp. If none is available the frame is left untouched and 0 is
 * returned; otherwise the seconds/sub-second fields of the header layout
 * selected by po->tp_version are filled in, the page is flushed, and the
 * TP_STATUS_TS_* value describing the source is returned.
 */
static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
                                    struct sk_buff *skb)
{
        union tpacket_uhdr h;
        struct timespec64 ts;
        __u32 ts_status;

        ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp));
        if (ts_status == 0)
                return 0;

        h.raw = frame;
        /*
         * versions 1 through 3 overflow the timestamps in y2106, since they
         * all store the seconds in a 32-bit unsigned integer.
         * If we create a version 4, that should have a 64-bit timestamp,
         * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
         * nanoseconds.
         */
        switch (po->tp_version) {
        case TPACKET_V1:
                /* V1 stores microseconds rather than nanoseconds. */
                h.h1->tp_sec = ts.tv_sec;
                h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
                break;
        case TPACKET_V2:
                h.h2->tp_sec = ts.tv_sec;
                h.h2->tp_nsec = ts.tv_nsec;
                break;
        case TPACKET_V3:
                h.h3->tp_sec = ts.tv_sec;
                h.h3->tp_nsec = ts.tv_nsec;
                break;
        default:
                WARN(1, "TPACKET version not supported.\n");
                BUG();
        }

        /* one flush is safe, as both fields always lie on the same cacheline */
        flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
        smp_wmb();

        return ts_status;
}

/*
 * Locate frame number @position in ring @rb and return its address if the
 * frame's current status equals @status; otherwise return NULL.
 *
 * The frame lives in pg_vec[position / frames_per_block] at byte offset
 * (position % frames_per_block) * frame_size.
 */
static void *packet_lookup_frame(const struct packet_sock *po,
                                 const struct packet_ring_buffer *rb,
                                 unsigned int position,
                                 int status)
{
        unsigned int blk_idx = position / rb->frames_per_block;
        unsigned int idx_in_blk = position % rb->frames_per_block;
        void *frame;

        frame = rb->pg_vec[blk_idx].buffer + (idx_in_blk * rb->frame_size);

        return status == __packet_get_status(po, frame) ? frame : NULL;
}

/* Look up the frame at the ring's head position (rb->head); returns it only
 * if its status equals @status, else NULL (see packet_lookup_frame()).
 */
static void *packet_current_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        return packet_lookup_frame(po, rb, rb->head, status);
}

/* Delete the block-retire timer of @pkc via del_timer_sync(). */
static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
        del_timer_sync(&pkc->retire_blk_timer);
}

/*
 * Stop the rx ring's block-retire timer for good.
 *
 * delete_blk_timer is set under @rb_queue's lock (BH disabled) so that the
 * timer handler, which checks the flag, stops re-arming; the timer is then
 * deleted with prb_del_retire_blk_timer().
 */
static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
                struct sk_buff_head *rb_queue)
{
        struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

        spin_lock_bh(&rb_queue->lock);
        pkc->delete_blk_timer = 1;
        spin_unlock_bh(&rb_queue->lock);

        prb_del_retire_blk_timer(pkc);
}

static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
        struct tpacket_kbdq_core *pkc;

        pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
        timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
                    0);
        pkc->retire_blk_timer.expires = jiffies;
}

/*
 * Derive a block-retire timeout from the link speed of the socket's bound
 * device and the block size.
 *
 * @po:                packet socket; po->ifindex selects the device
 * @blk_size_in_bytes: size of one ring block in bytes
 *
 * Returns DEFAULT_PRB_RETIRE_TOV when the device cannot be found, its link
 * settings cannot be read, or the reported speed is unknown or below
 * SPEED_1000. Otherwise returns (block size in Mibit) / (speed / 1000) + 1.
 *
 * Takes and releases the RTNL lock.
 */
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
                                int blk_size_in_bytes)
{
        struct ethtool_link_ksettings ecmd;
        struct net_device *dev;
        unsigned int mbits, div;
        int err;

        rtnl_lock();
        dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
        if (unlikely(!dev)) {
                rtnl_unlock();
                return DEFAULT_PRB_RETIRE_TOV;
        }
        err = __ethtool_get_link_ksettings(dev, &ecmd);
        rtnl_unlock();
        if (err)
                return DEFAULT_PRB_RETIRE_TOV;

        /* If the link speed is so slow you don't really
         * need to worry about perf anyways
         */
        if (ecmd.base.speed < SPEED_1000 ||
            ecmd.base.speed == SPEED_UNKNOWN)
                return DEFAULT_PRB_RETIRE_TOV;

        div = ecmd.base.speed / 1000;

        /* bytes -> Mibit: (bytes * 8) / 2^20 == bytes / 2^17. Dividing
         * directly avoids the signed overflow of "bytes * 8" for block
         * sizes of 256 MiB and above; the result is unchanged for smaller
         * blocks.
         */
        mbits = (unsigned int)blk_size_in_bytes / (1024 * 1024 / 8);

        if (!div)
                return mbits;

        return mbits / div + 1;
}

/* Copy the requested feature word (req3.tp_feature_req_word) into the
 * queue core; prb_run_all_ft_ops() consults it per packet.
 */
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
                        union tpacket_req_u *req_u)
{
        p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

/*
 * Initialise the TPACKET_V3 block-descriptor queue core embedded in @rb.
 *
 * @po:     owning packet socket (supplies tp_hdrlen / tp_version)
 * @rb:     ring buffer whose prb_bdqc is (re)initialised
 * @pg_vec: block array backing the ring; block 0 is opened first
 * @req_u:  user request; the req3 member provides block size/count,
 *          retire timeout, private-area size and feature word
 *
 * The core is zeroed, filled from the request, its retire timer is set up
 * and block 0 is opened. If the request carries no retire timeout, one is
 * derived with prb_calc_retire_blk_tmo().
 */
static void init_prb_bdqc(struct packet_sock *po,
                        struct packet_ring_buffer *rb,
                        struct pgv *pg_vec,
                        union tpacket_req_u *req_u)
{
        struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
        struct tpacket_block_desc *first_blk;

        memset(pkc, 0x0, sizeof(*pkc));

        first_blk = (struct tpacket_block_desc *)pg_vec[0].buffer;

        pkc->knxt_seq_num = 1;
        pkc->pkbdq = pg_vec;
        pkc->pkblk_start = pg_vec[0].buffer;
        pkc->kblk_size = req_u->req3.tp_block_size;
        pkc->knum_blocks = req_u->req3.tp_block_nr;
        pkc->hdrlen = po->tp_hdrlen;
        pkc->version = po->tp_version;
        pkc->last_kactive_blk_num = 0;
        po->stats.stats3.tp_freeze_q_cnt = 0;

        /* A zero timeout in the request means "compute one for me". */
        if (!req_u->req3.tp_retire_blk_tov)
                pkc->retire_blk_tov = prb_calc_retire_blk_tmo(po,
                                                req_u->req3.tp_block_size);
        else
                pkc->retire_blk_tov = req_u->req3.tp_retire_blk_tov;

        pkc->tov_in_jiffies = msecs_to_jiffies(pkc->retire_blk_tov);
        pkc->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
        rwlock_init(&pkc->blk_fill_in_prog_lock);

        /* Largest frame that fits after the block header and private area. */
        pkc->max_frame_len = pkc->kblk_size -
                             BLK_PLUS_PRIV(pkc->blk_sizeof_priv);
        prb_init_ft_ops(pkc, req_u);
        prb_setup_retire_blk_timer(po);
        prb_open_block(pkc, first_blk);
}

/*  Re-arm the retire timer to fire tov_in_jiffies from now, then record
 *  the currently active block number in last_kactive_blk_num.
 *
 *  Do NOT update the last_blk_num first.
 *  Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
        mod_timer(&pkc->retire_blk_timer,
                        jiffies + pkc->tov_in_jiffies);
        pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *          on packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, lets say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
/* Callback of the rx ring's retire_blk_timer (installed by
 * prb_setup_retire_blk_timer()). Runs with sk_receive_queue.lock held for
 * its whole body and either retires the active block, reopens a released
 * block after a freeze, or simply re-arms the timer.
 */
static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
{
        /* Recover the owning socket from the timer embedded in its rx ring. */
        struct packet_sock *po =
                from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
        struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
        unsigned int frozen;
        struct tpacket_block_desc *pbd;

        spin_lock(&po->sk.sk_receive_queue.lock);

        frozen = prb_queue_frozen(pkc);
        pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

        /* Flag set by prb_shutdown_retire_blk_timer(): leave without
         * re-arming the timer.
         */
        if (unlikely(pkc->delete_blk_timer))
                goto out;

        /* We only need to plug the race when the block is partially filled.
         * tpacket_rcv:
         *                lock(); increment BLOCK_NUM_PKTS; unlock()
         *                copy_bits() is in progress ...
         *                timer fires on other cpu:
         *                we can't retire the current block because copy_bits
         *                is in progress.
         *
         */
        if (BLOCK_NUM_PKTS(pbd)) {
                /* Waiting for skb_copy_bits to finish... */
                write_lock(&pkc->blk_fill_in_prog_lock);
                write_unlock(&pkc->blk_fill_in_prog_lock);
        }

        /* Act only if the active block is still the one recorded at the
         * last timer refresh; otherwise fall through and just re-arm.
         */
        if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
                if (!frozen) {
                        if (!BLOCK_NUM_PKTS(pbd)) {
                                /* An empty block. Just refresh the timer. */
                                goto refresh_timer;
                        }
                        prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
                        /* A successful dispatch opens the next block, which
                         * refreshes the timer itself; on failure the queue
                         * is frozen and the timer is re-armed here.
                         */
                        if (!prb_dispatch_next_block(pkc, po))
                                goto refresh_timer;
                        else
                                goto out;
                } else {
                        /* Case 1. Queue was frozen because user-space was
                         *           lagging behind.
                         */
                        if (prb_curr_blk_in_use(pbd)) {
                                /*
                                 * Ok, user-space is still behind.
                                 * So just refresh the timer.
                                 */
                                goto refresh_timer;
                        } else {
                               /* Case 2. queue was frozen,user-space caught up,
                                * now the link went idle && the timer fired.
                                * We don't have a block to close.So we open this
                                * block and restart the timer.
                                * opening a block thaws the queue,restarts timer
                                * Thawing/timer-refresh is a side effect.
                                */
                                prb_open_block(pkc, pbd);
                                goto out;
                        }
                }
        }

refresh_timer:
        _prb_refresh_rx_retire_blk_timer(pkc);

out:
        spin_unlock(&po->sk.sk_receive_queue.lock);
}

/*
 * Publish block @pbd1 by writing @status into its block_status field.
 *
 * On architectures where ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1, the pages
 * after the first one (up to pkc1->pkblk_end, page aligned) are flushed
 * before the status store, and the first page, which holds the block
 * header, is flushed after it; each flush phase ends with smp_wmb().
 * On other architectures only the status store is performed.
 */
static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
                struct tpacket_block_desc *pbd1, __u32 status)
{
        /* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
        u8 *start, *end;

        start = (u8 *)pbd1;

        /* Skip the block header(we know header WILL fit in 4K) */
        start += PAGE_SIZE;

        end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
        for (; start < end; start += PAGE_SIZE)
                flush_dcache_page(pgv_to_page(start));

        smp_wmb();
#endif

        /* Now update the block status. */

        BLOCK_STATUS(pbd1) = status;

        /* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
        start = (u8 *)pbd1;
        flush_dcache_page(pgv_to_page(start));

        smp_wmb();
#endif
}

/*
 * Close block @pbd1 and hand it to user space.
 *
 * The published status is TP_STATUS_USER | @stat, plus TP_STATUS_LOSING
 * when po->tp_drops is non-zero.
 *
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note: We DON'T refresh the timer on purpose.
 *        Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
                struct tpacket_block_desc *pbd1,
                struct packet_sock *po, unsigned int stat)
{
        __u32 status = TP_STATUS_USER | stat;

        struct tpacket3_hdr *last_pkt;
        struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
        struct sock *sk = &po->sk;

        if (atomic_read(&po->tp_drops))
                status |= TP_STATUS_LOSING;

        /* Terminate the packet chain: the header at pkc1->prev gets a zero
         * next offset.
         */
        last_pkt = (struct tpacket3_hdr *)pkc1->prev;
        last_pkt->tp_next_offset = 0;

        /* Get the ts of the last pkt */
        if (BLOCK_NUM_PKTS(pbd1)) {
                h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
                h1->ts_last_pkt.ts_nsec        = last_pkt->tp_nsec;
        } else {
                /* Ok, we tmo'd - so get the current time.
                 *
                 * It shouldn't really happen as we don't close empty
                 * blocks. See prb_retire_rx_blk_timer_expired().
                 */
                struct timespec64 ts;
                ktime_get_real_ts64(&ts);
                h1->ts_last_pkt.ts_sec = ts.tv_sec;
                h1->ts_last_pkt.ts_nsec        = ts.tv_nsec;
        }

        /* Barrier between the header updates above and the status publish. */
        smp_wmb();

        /* Flush the block */
        prb_flush_block(pkc1, pbd1, status);

        /* Invoke the socket's data-ready callback. */
        sk->sk_data_ready(sk);

        /* Advance to the next block index (wraps to 0 after the last). */
        pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

/* Clear the frozen marker (reset_pending_on_curr_blk) that
 * prb_freeze_queue() sets.
 */
static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
        pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Prepare block @pbd1 for filling: assign the next sequence number, reset
 * its packet count and length, stamp the open time, and point the queue
 * core's cursors (pkblk_start, nxt_offset, prev, pkblk_end) at it.
 *
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
        struct tpacket_block_desc *pbd1)
{
        struct timespec64 ts;
        struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

        smp_rmb();

        /* We could have just memset this but we will lose the
         * flexibility of making the priv area sticky
         */

        BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
        BLOCK_NUM_PKTS(pbd1) = 0;
        /* An empty block's length is just the header plus private area. */
        BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

        /* ts_first_pkt is initialised with the time the block is opened. */
        ktime_get_real_ts64(&ts);

        h1->ts_first_pkt.ts_sec = ts.tv_sec;
        h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

        pkc1->pkblk_start = (char *)pbd1;
        pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

        BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
        BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

        pbd1->version = pkc1->version;
        /* No packet stored yet: prev starts at the first packet slot. */
        pkc1->prev = pkc1->nxt_offset;
        pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

        prb_thaw_queue(pkc1);
        _prb_refresh_rx_retire_blk_timer(pkc1);

        smp_wmb();
}

/*
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 *    it will close block-7,loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires,it will refresh itself so that we can
 *         re-open block-0 in near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
/* Mark the queue frozen (reset_pending_on_curr_blk = 1) and bump the
 * socket's tp_freeze_q_cnt statistic.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
                                  struct packet_sock *po)
{
        pkc->reset_pending_on_curr_blk = 1;
        po->stats.stats3.tp_freeze_q_cnt++;
}

/* Packet length rounded up to V3_ALIGNMENT. */
#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * Try to start filling the currently active block.
 *
 * If the block is free it is opened and the address where the first packet
 * must be stored is returned. If user space still owns it (TP_STATUS_USER
 * set), the queue is frozen and NULL is returned.
 * So, caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
                struct packet_sock *po)
{
        struct tpacket_block_desc *blk;

        smp_rmb();

        blk = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

        if (!(TP_STATUS_USER & BLOCK_STATUS(blk))) {
                /* Free block: open it and hand out the first packet slot. */
                prb_open_block(pkc, blk);
                return (void *)pkc->nxt_offset;
        }

        /* Block is still in use by user space. */
        prb_freeze_queue(pkc, po);
        return NULL;
}

/*
 * Retire (close) the currently active block with the given @status bits.
 * Nothing happens unless the block's status is TP_STATUS_KERNEL.
 */
static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
                struct packet_sock *po, unsigned int status)
{
        struct tpacket_block_desc *blk = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

        if (unlikely(BLOCK_STATUS(blk) != TP_STATUS_KERNEL))
                return;

        /*
         * Plug the case where copy_bits() is in progress on
         * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
         * have space to copy the pkt in the current block and
         * called prb_retire_current_block()
         *
         * We don't need to worry about the TMO case because
         * the timer-handler already handled this case.
         */
        if ((status & TP_STATUS_BLK_TMO) == 0) {
                /* Waiting for skb_copy_bits to finish... */
                write_lock(&pkc->blk_fill_in_prog_lock);
                write_unlock(&pkc->blk_fill_in_prog_lock);
        }

        prb_close_block(pkc, blk, po, status);
}

/* Non-zero when block @pbd has TP_STATUS_USER set in its status. */
static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
{
        return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

/* Non-zero while the queue is frozen (reset_pending_on_curr_blk set by
 * prb_freeze_queue(), cleared by prb_thaw_queue()).
 */
static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
        return pkc->reset_pending_on_curr_blk;
}

/* Release the read side of blk_fill_in_prog_lock for ring @rb; the read
 * lock is taken in prb_fill_curr_block().
 */
static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
        __releases(&pkc->blk_fill_in_prog_lock)
{
        struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);

        read_unlock(&pkc->blk_fill_in_prog_lock);
}

/* Store the hash of pkc->skb (skb_get_hash()) in the packet header. */
static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
                        struct tpacket3_hdr *ppd)
{
        ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

/* Zero the packet header's rxhash field; @pkc is not used. */
static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
                        struct tpacket3_hdr *ppd)
{
        ppd->hv1.tp_rxhash = 0;
}

/*
 * Fill the VLAN fields and tp_status of packet header @ppd from pkc->skb.
 *
 * With a VLAN tag present, the TCI and (host-order) TPID are copied and
 * tp_status is set to TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID.
 * Without one, both fields are zeroed and tp_status becomes
 * TP_STATUS_AVAILABLE. tp_status is overwritten in either case.
 */
static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
                        struct tpacket3_hdr *ppd)
{
        if (!skb_vlan_tag_present(pkc->skb)) {
                ppd->hv1.tp_vlan_tci = 0;
                ppd->hv1.tp_vlan_tpid = 0;
                ppd->tp_status = TP_STATUS_AVAILABLE;
                return;
        }

        ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
        ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
        ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
}

/*
 * Populate the per-packet feature fields of @ppd: padding is zeroed, VLAN
 * info is filled, and the rxhash is either filled (when the ring was set
 * up with TP_FT_REQ_FILL_RXHASH) or cleared.
 */
static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
                        struct tpacket3_hdr *ppd)
{
        ppd->hv1.tp_padding = 0;
        prb_fill_vlan_info(pkc, ppd);

        if (!(pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)) {
                prb_clear_rxhash(pkc, ppd);
                return;
        }

        prb_fill_rxhash(pkc, ppd);
}

/*
 * Account for a new frame of @len bytes placed at @curr inside block @pbd.
 *
 * Advances the core's next-offset and the block's length by the aligned
 * frame size, bumps the packet count, then takes blk_fill_in_prog_lock
 * for reading and fills in the per-frame header extras.  The read lock is
 * still held on return (see the __acquires annotation).
 */
static void prb_fill_curr_block(char *curr,
                                struct tpacket_kbdq_core *pkc,
                                struct tpacket_block_desc *pbd,
                                unsigned int len)
        __acquires(&pkc->blk_fill_in_prog_lock)
{
        struct tpacket3_hdr *hdr = (struct tpacket3_hdr *)curr;
        unsigned int aligned_len = TOTAL_PKT_LEN_INCL_ALIGN(len);

        hdr->tp_next_offset = aligned_len;
        pkc->prev = curr;
        pkc->nxt_offset += aligned_len;
        BLOCK_LEN(pbd) += aligned_len;
        BLOCK_NUM_PKTS(pbd) += 1;

        read_lock(&pkc->blk_fill_in_prog_lock);
        prb_run_all_ft_ops(pkc, hdr);
}

/*
 * Find room for a frame of @len bytes in the TPACKET_V3 block ring.
 *
 * Assumes caller has the sk->rx_queue.lock.
 *
 * Returns a pointer to the start of the frame slot, or NULL when the
 * packet cannot be recorded: either the queue is frozen and the current
 * block is still owned by user space, or the current block is full and
 * no next block could be dispatched.  On success prb_fill_curr_block()
 * has run, which takes blk_fill_in_prog_lock for reading.
 */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
                                            struct sk_buff *skb,
                                            unsigned int len
                                            )
{
        struct tpacket_kbdq_core *pkc;
        struct tpacket_block_desc *pbd;
        char *curr, *end;

        pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
        pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

        /* Queue is frozen when user space is lagging behind */
        if (prb_queue_frozen(pkc)) {
                /*
                 * Check if the last block, which caused the queue to freeze,
                 * is still in use by user space.
                 */
                if (prb_curr_blk_in_use(pbd)) {
                        /* Can't record this packet */
                        return NULL;
                } else {
                        /*
                         * Ok, the block was released by user space.
                         * Now let's open that block.
                         * Opening a block also thaws the queue;
                         * thawing is a side effect.
                         */
                        prb_open_block(pkc, pbd);
                }
        }

        /*
         * NOTE(review): full barrier before nxt_offset is read; the side it
         * pairs with is not visible in this part of the file.
         */
        smp_mb();
        curr = pkc->nxt_offset;
        pkc->skb = skb;
        end = (char *)pbd + pkc->kblk_size;

        /*
         * First try the current block.  The comparison is strict, so a
         * frame that would end exactly at the block end is not placed here.
         */
        if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
                prb_fill_curr_block(curr, pkc, pbd, len);
                return (void *)curr;
        }

        /* Ok, close the current block */
        prb_retire_current_block(pkc, po, 0);

        /* Now, try to dispatch the next block */
        curr = (char *)prb_dispatch_next_block(pkc, po);
        if (curr) {
                /* The active block changed, so re-read its descriptor. */
                pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
                prb_fill_curr_block(curr, pkc, pbd, len);
                return (void *)curr;
        }

        /*
         * No free blocks are available; user space hasn't caught up yet.
         * The queue was just frozen and this packet will be dropped.
         */
        return NULL;
}

/*
 * Return the rx slot the next packet should be written to, or NULL if
 * none is available.  V1/V2 rings look up the frame at the ring head with
 * the requested @status; V3 rings look for @len bytes in the block ring.
 * Any other version is a bug.
 */
static void *packet_current_rx_frame(struct packet_sock *po,
                                            struct sk_buff *skb,
                                            int status, unsigned int len)
{
        switch (po->tp_version) {
        case TPACKET_V3:
                return __packet_lookup_frame_in_block(po, skb, len);
        case TPACKET_V1:
        case TPACKET_V2:
                return packet_lookup_frame(po, &po->rx_ring,
                                           po->rx_ring.head, status);
        default:
                break;
        }

        WARN(1, "TPACKET version not supported\n");
        BUG();
        return NULL;
}

/*
 * Return the descriptor of block @idx if its status word equals @status
 * exactly, NULL otherwise.  @po is not used.
 */
static void *prb_lookup_block(const struct packet_sock *po,
                              const struct packet_ring_buffer *rb,
                              unsigned int idx,
                              int status)
{
        struct tpacket_kbdq_core *core = GET_PBDQC_FROM_RB(rb);
        struct tpacket_block_desc *desc = GET_PBLOCK_DESC(core, idx);

        if (BLOCK_STATUS(desc) == status)
                return desc;

        return NULL;
}

/*
 * Index of the block that precedes the active one, wrapping from block 0
 * back to the last block (knum_blocks - 1).
 */
static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
        if (!rb->prb_bdqc.kactive_blk_num)
                return rb->prb_bdqc.knum_blocks - 1;

        return rb->prb_bdqc.kactive_blk_num - 1;
}

/*
 * Look up the block before the active one and return it only if its
 * status matches @status.  Assumes caller has held the rx_queue.lock.
 */
static void *__prb_previous_block(struct packet_sock *po,
                                         struct packet_ring_buffer *rb,
                                         int status)
{
        return prb_lookup_block(po, rb, prb_previous_blk_num(rb), status);
}

/*
 * Version dispatch for "previous slot" lookups: block-based for versions
 * above TPACKET_V2, frame-based for V1/V2.
 */
static void *packet_previous_rx_frame(struct packet_sock *po,
                                             struct packet_ring_buffer *rb,
                                             int status)
{
        if (po->tp_version > TPACKET_V2)
                return __prb_previous_block(po, rb, status);

        return packet_previous_frame(po, rb, status);
}

/*
 * Advance the rx ring head for frame-based rings (TPACKET_V1/V2).
 *
 * TPACKET_V3 and unknown versions are not supported by this helper and
 * hit WARN + BUG.
 */
static void packet_increment_rx_head(struct packet_sock *po,
                                            struct packet_ring_buffer *rb)
{
        switch (po->tp_version) {
        case TPACKET_V1:
        case TPACKET_V2:
                /*
                 * Call, then return: "return <void expression>;" in a void
                 * function is only accepted as a compiler extension.
                 */
                packet_increment_head(rb);
                return;
        case TPACKET_V3:
        default:
                WARN(1, "TPACKET version not supported.\n");
                BUG();
                return;
        }
}

/*
 * Look up the frame just before the ring head (wrapping from 0 to
 * frame_max) and return it if it has the requested @status.
 */
static void *packet_previous_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        unsigned int idx = rb->frame_max;

        if (rb->head)
                idx = rb->head - 1;

        return packet_lookup_frame(po, rb, idx, status);
}

/* Step the ring head forward by one frame, wrapping to 0 after frame_max. */
static void packet_increment_head(struct packet_ring_buffer *buff)
{
        if (buff->head == buff->frame_max)
                buff->head = 0;
        else
                buff->head = buff->head + 1;
}

/* Increment this CPU's slot of the ring's per-cpu pending counter. */
static void packet_inc_pending(struct packet_ring_buffer *rb)
{
        this_cpu_inc(*rb->pending_refcnt);
}

/*
 * Decrement this CPU's slot of the ring's per-cpu pending counter.  A
 * single slot may wrap below zero; packet_read_pending() sums all slots.
 */
static void packet_dec_pending(struct packet_ring_buffer *rb)
{
        this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
        unsigned int refcnt = 0;
        int cpu;

        /* We don't use pending refcount in rx_ring. */
        if (rb->pending_refcnt == NULL)
                return 0;

        for_each_possible_cpu(cpu)
                refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

        return refcnt;
}

/*
 * Set up the pending counters of a socket: none for the rx ring, a per-cpu
 * unsigned int for the tx ring.
 *
 * Returns 0 on success or -ENOBUFS if the per-cpu allocation fails.
 */
static int packet_alloc_pending(struct packet_sock *po)
{
        po->rx_ring.pending_refcnt = NULL;
        po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);

        if (unlikely(!po->tx_ring.pending_refcnt))
                return -ENOBUFS;

        return 0;
}

/* Release the tx ring's per-cpu pending counter (the rx ring has none). */
static void packet_free_pending(struct packet_sock *po)
{
        free_percpu(po->tx_ring.pending_refcnt);
}

/*
 * Receive-room levels returned by __packet_rcv_has_room().
 *
 * ROOM_POW_OFF is the shift that defines the "normal" threshold: a value
 * is compared against (size >> ROOM_POW_OFF), i.e. a quarter of the
 * receive buffer or a quarter of the ring ahead of the head.
 */
#define ROOM_POW_OFF        2
#define ROOM_NONE        0x0
#define ROOM_LOW        0x1
#define ROOM_NORMAL        0x2

static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
{
        int idx, len;

        len = READ_ONCE(po->rx_ring.frame_max) + 1;
        idx = READ_ONCE(po->rx_ring.head);
        if (pow_off)
                idx += len >> pow_off;
        if (idx >= len)
                idx -= len;
        return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
{
        int idx, len;

        len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
        idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
        if (pow_off)
                idx += len >> pow_off;
        if (idx >= len)
                idx -= len;
        return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

/*
 * Classify how much receive room a packet socket has left.
 *
 * For sockets whose hook is not tpacket_rcv the decision is based on the
 * socket receive buffer: the space left after the already charged memory
 * and (if @skb is given) the skb's truesize.  For ring sockets the ring
 * is probed a quarter ahead (ROOM_NORMAL) and at the current position
 * (ROOM_LOW).
 *
 * Returns ROOM_NORMAL, ROOM_LOW or ROOM_NONE.
 */
static int __packet_rcv_has_room(const struct packet_sock *po,
                                 const struct sk_buff *skb)
{
        if (po->prot_hook.func != tpacket_rcv) {
                const struct sock *sk = &po->sk;
                int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
                int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
                                   - (skb ? skb->truesize : 0);

                if (avail > (rcvbuf >> ROOM_POW_OFF))
                        return ROOM_NORMAL;

                return avail > 0 ? ROOM_LOW : ROOM_NONE;
        }

        if (po->tp_version == TPACKET_V3) {
                if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
                        return ROOM_NORMAL;
                if (__tpacket_v3_has_room(po, 0))
                        return ROOM_LOW;
                return ROOM_NONE;
        }

        if (__tpacket_has_room(po, ROOM_POW_OFF))
                return ROOM_NORMAL;
        if (__tpacket_has_room(po, 0))
                return ROOM_LOW;

        return ROOM_NONE;
}

/*
 * Compute the room level for @skb and keep PACKET_SOCK_PRESSURE in sync:
 * the flag is set whenever the level is anything but ROOM_NORMAL.  The
 * flag is only written when its value actually changes.
 */
static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
        int room = __packet_rcv_has_room(po, skb);
        bool under_pressure = (room != ROOM_NORMAL);

        if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) != under_pressure)
                packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, under_pressure);

        return room;
}

/*
 * Clear PACKET_SOCK_PRESSURE if it is set and the socket is back to
 * ROOM_NORMAL (evaluated without a pending skb).
 */
static void packet_rcv_try_clear_pressure(struct packet_sock *po)
{
        if (!packet_sock_flag(po, PACKET_SOCK_PRESSURE))
                return;

        if (__packet_rcv_has_room(po, NULL) != ROOM_NORMAL)
                return;

        packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, false);
}

/*
 * Socket destructor: purge the error queue, warn if receive or send
 * memory is still accounted to the socket, and complain if the socket is
 * not marked SOCK_DEAD.
 */
static void packet_sock_destruct(struct sock *sk)
{
        skb_queue_purge(&sk->sk_error_queue);

        WARN_ON(atomic_read(&sk->sk_rmem_alloc));
        WARN_ON(refcount_read(&sk->sk_wmem_alloc));

        if (!sock_flag(sk, SOCK_DEAD))
                pr_err("Attempt to release alive packet socket: %p\n", sk);
}

/*
 * Decide whether the flow of @skb dominates this socket's recent history.
 *
 * Counts how many of the ROLLOVER_HLEN history slots hold the skb's hash,
 * then records the hash in one randomly chosen slot.  The flow is "huge"
 * when it occupied more than half of the slots before this update.
 */
static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
        u32 *hist = po->rollover->history;
        u32 hash = skb_get_hash(skb);
        u32 slot;
        int matches = 0;
        int i;

        for (i = 0; i < ROLLOVER_HLEN; i++)
                matches += (READ_ONCE(hist[i]) == hash);

        slot = get_random_u32_below(ROLLOVER_HLEN);

        /* Skip the store when nothing changes, to keep the cache line clean */
        if (READ_ONCE(hist[slot]) != hash)
                WRITE_ONCE(hist[slot], hash);

        return matches > (ROLLOVER_HLEN >> 1);
}

/* PACKET_FANOUT_HASH: scale the skb's symmetric flow hash into [0, @num). */
static unsigned int fanout_demux_hash(struct packet_fanout *f,
                                      struct sk_buff *skb,
                                      unsigned int num)
{
        unsigned int hash = __skb_get_hash_symmetric(skb);

        return reciprocal_scale(hash, num);
}

/*
 * PACKET_FANOUT_LB: round-robin.  Bumps the group's rr_cur counter and
 * reduces the new value modulo @num.
 */
static unsigned int fanout_demux_lb(struct packet_fanout *f,
                                    struct sk_buff *skb,
                                    unsigned int num)
{
        return (unsigned int)atomic_inc_return(&f->rr_cur) % num;
}

/* PACKET_FANOUT_CPU: pick the member by the id of the executing CPU. */
static unsigned int fanout_demux_cpu(struct packet_fanout *f,
                                     struct sk_buff *skb,
                                     unsigned int num)
{
        unsigned int cpu = smp_processor_id();

        return cpu % num;
}

/*
 * PACKET_FANOUT_RND: pick a member index at random via
 * get_random_u32_below(@num).  @f and @skb are unused.
 */
static unsigned int fanout_demux_rnd(struct packet_fanout *f,
                                     struct sk_buff *skb,
                                     unsigned int num)
{
        return get_random_u32_below(num);
}

/*
 * Rollover demux: choose the member index that should receive @skb.
 *
 * @idx:      index of the originally selected socket
 * @try_self: when true, first check whether that socket can take the
 *            packet itself before looking at the others
 * @num:      number of members to consider
 *
 * Returns the index of the chosen socket.  If no other socket qualifies,
 * @idx is returned and the num_failed counter is bumped.
 */
static unsigned int fanout_demux_rollover(struct packet_fanout *f,
                                          struct sk_buff *skb,
                                          unsigned int idx, bool try_self,
                                          unsigned int num)
{
        struct packet_sock *po, *po_next, *po_skip = NULL;
        unsigned int i, j, room = ROOM_NONE;

        po = pkt_sk(rcu_dereference(f->arr[idx]));

        if (try_self) {
                /*
                 * Keep the packet on the original socket when it has normal
                 * room, or low room and the flow is not a "huge" one.
                 */
                room = packet_rcv_has_room(po, skb);
                if (room == ROOM_NORMAL ||
                    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
                        return idx;
                /* Already rejected: do not re-test it in the scan below. */
                po_skip = po;
        }

        /* Start at the last rollover target, clamped to the member range. */
        i = j = min_t(int, po->rollover->sock, num - 1);
        do {
                po_next = pkt_sk(rcu_dereference(f->arr[i]));
                /*
                 * The pressure flag is tested first, so the room check only
                 * runs for sockets not already flagged as under pressure.
                 */
                if (po_next != po_skip &&
                    !packet_sock_flag(po_next, PACKET_SOCK_PRESSURE) &&
                    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
                        /* Remember the new target for the next rollover. */
                        if (i != j)
                                po->rollover->sock = i;
                        atomic_long_inc(&po->rollover->num);
                        /* ROOM_LOW here means the flow was judged huge. */
                        if (room == ROOM_LOW)
                                atomic_long_inc(&po->rollover->num_huge);
                        return i;
                }

                /* Advance with wrap-around until we are back at the start. */
                if (++i == num)
                        i = 0;
        } while (i != j);

        atomic_long_inc(&po->rollover->num_failed);
        return idx;
}

/* PACKET_FANOUT_QM: pick the member by the skb's queue mapping modulo @num. */
static unsigned int fanout_demux_qm(struct packet_fanout *f,
                                    struct sk_buff *skb,
                                    unsigned int num)
{
        unsigned int queue = skb_get_queue_mapping(skb);

        return queue % num;
}

/*
 * PACKET_FANOUT_CBPF/EBPF: run the group's BPF program on @skb under RCU
 * and use its result modulo @num.  Index 0 is used when no program is
 * attached.
 */
static unsigned int fanout_demux_bpf(struct packet_fanout *f,
                                     struct sk_buff *skb,
                                     unsigned int num)
{
        struct bpf_prog *prog;
        unsigned int idx;

        rcu_read_lock();
        prog = rcu_dereference(f->bpf_prog);
        idx = prog ? bpf_prog_run_clear_cb(prog, skb) % num : 0;
        rcu_read_unlock();

        return idx;
}

/*
 * Test a PACKET_FANOUT_FLAG_* value against the group's flags.  The
 * group stores the flags shifted down by 8 bits, so @flag is shifted to
 * match.
 */
static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
        return (f->flags & (flag >> 8)) != 0;
}

/*
 * Packet hook of a fanout group: pick one member socket for @skb and hand
 * the packet to that socket's own receive hook.
 *
 * The skb is freed and 0 returned when the device belongs to another
 * network namespace or the group currently has no members.  Otherwise the
 * return value is whatever the selected socket's hook returns.
 */
static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
                             struct packet_type *pt, struct net_device *orig_dev)
{
        struct packet_fanout *f = pt->af_packet_priv;
        /* Snapshot the member count once; every demux below uses this value. */
        unsigned int num = READ_ONCE(f->num_members);
        struct net *net = read_pnet(&f->net);
        struct packet_sock *po;
        unsigned int idx;

        if (!net_eq(dev_net(dev), net) || !num) {
                kfree_skb(skb);
                return 0;
        }

        if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
                /* A NULL result means there is nothing to deliver right now. */
                skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
                if (!skb)
                        return 0;
        }
        /* Unknown types fall back to hash-based demux. */
        switch (f->type) {
        case PACKET_FANOUT_HASH:
        default:
                idx = fanout_demux_hash(f, skb, num);
                break;
        case PACKET_FANOUT_LB:
                idx = fanout_demux_lb(f, skb, num);
                break;
        case PACKET_FANOUT_CPU:
                idx = fanout_demux_cpu(f, skb, num);
                break;
        case PACKET_FANOUT_RND:
                idx = fanout_demux_rnd(f, skb, num);
                break;
        case PACKET_FANOUT_QM:
                idx = fanout_demux_qm(f, skb, num);
                break;
        case PACKET_FANOUT_ROLLOVER:
                /* Pure rollover mode: start from member 0, no self check. */
                idx = fanout_demux_rollover(f, skb, 0, false, num);
                break;
        case PACKET_FANOUT_CBPF:
        case PACKET_FANOUT_EBPF:
                idx = fanout_demux_bpf(f, skb, num);
                break;
        }

        /* Rollover as a flag: refine the choice made by the base mode. */
        if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
                idx = fanout_demux_rollover(f, skb, idx, true, num);

        po = pkt_sk(rcu_dereference(f->arr[idx]));
        return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

/* Held by fanout_add() and fanout_release() while they use fanout_list. */
DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
/* All fanout groups; linked in by fanout_add(), unlinked by fanout_release(). */
static LIST_HEAD(fanout_list);
/* Id at which fanout_find_new_id() starts searching for a free group id. */
static u16 fanout_next_id;

/*
 * Append @sk to the member array of the socket's fanout group.  When it
 * becomes the first member, the group's packet hook is registered with
 * dev_add_pack().  All of this happens under f->lock.
 */
static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
        struct packet_fanout *f = po->fanout;

        spin_lock(&f->lock);
        /* Store the socket pointer first, then raise the member count. */
        rcu_assign_pointer(f->arr[f->num_members], sk);
        smp_wmb();
        f->num_members++;
        if (f->num_members == 1)
                dev_add_pack(&f->prot_hook);
        spin_unlock(&f->lock);
}

/*
 * Remove @sk from the member array of its fanout group.  The freed slot
 * is refilled with the last member, so member order is not preserved.
 * When the last member leaves, the group's packet hook is unregistered.
 * It is a BUG if @sk is not a member.
 */
static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
        struct packet_fanout *f = po->fanout;
        int i;

        spin_lock(&f->lock);
        /* Linear search for the slot holding @sk. */
        for (i = 0; i < f->num_members; i++) {
                if (rcu_dereference_protected(f->arr[i],
                                              lockdep_is_held(&f->lock)) == sk)
                        break;
        }
        BUG_ON(i >= f->num_members);
        /* Move the last member into the vacated slot, then shrink the count. */
        rcu_assign_pointer(f->arr[i],
                           rcu_dereference_protected(f->arr[f->num_members - 1],
                                                     lockdep_is_held(&f->lock)));
        f->num_members--;
        if (f->num_members == 0)
                __dev_remove_pack(&f->prot_hook);
        spin_unlock(&f->lock);
}

/*
 * id_match callback of a fanout hook: true when @sk is a PF_PACKET socket
 * whose fanout group is the one @ptype belongs to.
 */
static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
        return sk->sk_family == PF_PACKET &&
               ptype->af_packet_priv == pkt_sk(sk)->fanout;
}

/*
 * Initialise the type-specific state of a new fanout group: the
 * round-robin counter for LB, an empty program pointer for CBPF/EBPF.
 * Other types need nothing.
 */
static void fanout_init_data(struct packet_fanout *f)
{
        unsigned int kind = f->type;

        if (kind == PACKET_FANOUT_LB)
                atomic_set(&f->rr_cur, 0);
        else if (kind == PACKET_FANOUT_CBPF || kind == PACKET_FANOUT_EBPF)
                RCU_INIT_POINTER(f->bpf_prog, NULL);
}

/*
 * Install @new (may be NULL) as the group's BPF program under f->lock.
 * A previously installed program is destroyed after synchronize_net().
 */
static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
{
        struct bpf_prog *prev;

        spin_lock(&f->lock);
        prev = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
        rcu_assign_pointer(f->bpf_prog, new);
        spin_unlock(&f->lock);

        if (!prev)
                return;

        synchronize_net();
        bpf_prog_destroy(prev);
}

/*
 * Attach a classic BPF program, supplied by user space as a sock_fprog,
 * to the socket's fanout group.
 *
 * Returns 0 on success, -EPERM when the socket's filter is locked, or the
 * error from copying or creating the program.
 */
static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data,
                                unsigned int len)
{
        struct sock_fprog fprog;
        struct bpf_prog *prog;
        int err;

        if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
                return -EPERM;

        err = copy_bpf_fprog_from_user(&fprog, data, len);
        if (!err)
                err = bpf_prog_create_from_user(&prog, &fprog, NULL, false);
        if (err)
                return err;

        __fanout_set_data_bpf(po->fanout, prog);
        return 0;
}

/*
 * Attach an eBPF socket-filter program, given by file descriptor, to the
 * socket's fanout group.
 *
 * Returns 0 on success, -EPERM when the socket's filter is locked,
 * -EINVAL when @len is not the size of a u32 fd, -EFAULT when the fd
 * cannot be copied in, or the error from looking up the program.
 */
static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data,
                                unsigned int len)
{
        struct bpf_prog *prog;
        u32 fd;

        if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
                return -EPERM;

        if (len != sizeof(fd))
                return -EINVAL;

        /* len equals sizeof(fd) at this point. */
        if (copy_from_sockptr(&fd, data, sizeof(fd)))
                return -EFAULT;

        prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        __fanout_set_data_bpf(po->fanout, prog);
        return 0;
}

/*
 * Set the type-specific data of the socket's fanout group.  Only the
 * CBPF and EBPF group types accept data; every other type yields -EINVAL.
 */
static int fanout_set_data(struct packet_sock *po, sockptr_t data,
                           unsigned int len)
{
        unsigned int kind = po->fanout->type;

        if (kind == PACKET_FANOUT_CBPF)
                return fanout_set_data_cbpf(po, data, len);
        if (kind == PACKET_FANOUT_EBPF)
                return fanout_set_data_ebpf(po, data, len);

        return -EINVAL;
}

/* Drop the BPF program of a CBPF/EBPF group; no-op for other group types. */
static void fanout_release_data(struct packet_fanout *f)
{
        unsigned int kind = f->type;

        if (kind == PACKET_FANOUT_CBPF || kind == PACKET_FANOUT_EBPF)
                __fanout_set_data_bpf(f, NULL);
}

/*
 * True when no group on fanout_list uses @candidate_id in the network
 * namespace of @sk.
 */
static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
{
        struct packet_fanout *f;

        list_for_each_entry(f, &fanout_list, list) {
                if (f->id != candidate_id)
                        continue;
                if (read_pnet(&f->net) == sock_net(sk))
                        return false;
        }

        return true;
}

/*
 * Find an unused fanout group id for the namespace of @sk.
 *
 * The search starts at fanout_next_id and walks the whole u16 space once,
 * wrapping around.  On success the id is stored in @new_id, the next
 * search start is moved past it and true is returned; false means every
 * id is taken.
 */
static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
{
        const u16 start = fanout_next_id;
        u16 id = start;

        do {
                if (__fanout_id_is_free(sk, id)) {
                        *new_id = id;
                        fanout_next_id = id + 1;
                        return true;
                }
        } while (++id != start);

        return false;
}

/*
 * Join @sk to the fanout group described by @args, creating the group if
 * no group with that id exists in the socket's network namespace.
 *
 * args->type_flags carries the fanout type in its low byte and the
 * PACKET_FANOUT_FLAG_* bits in its high byte.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL    unknown type, rollover type combined with the rollover
 *              flag, a non-zero id with UNIQUEID, flags/max_num_members
 *              that disagree with the existing group, max_num_members
 *              above PACKET_FANOUT_MAX, or a socket that is not running
 *              or does not match the group's type/protocol/device
 *   -EALREADY  the socket is already in a fanout group
 *   -ENOMEM    allocation failure or no free id
 *   -ENOSPC    the group is full
 */
static int fanout_add(struct sock *sk, struct fanout_args *args)
{
        struct packet_rollover *rollover = NULL;
        struct packet_sock *po = pkt_sk(sk);
        u16 type_flags = args->type_flags;
        struct packet_fanout *f, *match;
        u8 type = type_flags & 0xff;
        u8 flags = type_flags >> 8;
        u16 id = args->id;
        int err;

        /* Validate the requested type before taking any lock. */
        switch (type) {
        case PACKET_FANOUT_ROLLOVER:
                if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
                        return -EINVAL;
                break;
        case PACKET_FANOUT_HASH:
        case PACKET_FANOUT_LB:
        case PACKET_FANOUT_CPU:
        case PACKET_FANOUT_RND:
        case PACKET_FANOUT_QM:
        case PACKET_FANOUT_CBPF:
        case PACKET_FANOUT_EBPF:
                break;
        default:
                return -EINVAL;
        }

        mutex_lock(&fanout_mutex);

        err = -EALREADY;
        if (po->fanout)
                goto out;

        /* Rollover state is needed for the rollover type and the flag. */
        if (type == PACKET_FANOUT_ROLLOVER ||
            (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
                err = -ENOMEM;
                rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
                if (!rollover)
                        goto out;
                atomic_long_set(&rollover->num, 0);
                atomic_long_set(&rollover->num_huge, 0);
                atomic_long_set(&rollover->num_failed, 0);
        }

        /* UNIQUEID: the caller must pass id 0 and gets a fresh id assigned. */
        if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
                if (id != 0) {
                        err = -EINVAL;
                        goto out;
                }
                if (!fanout_find_new_id(sk, &id)) {
                        err = -ENOMEM;
                        goto out;
                }
                /* ephemeral flag for the first socket in the group: drop it */
                flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
        }

        /* Look for an existing group with this id in our namespace. */
        match = NULL;
        list_for_each_entry(f, &fanout_list, list) {
                if (f->id == id &&
                    read_pnet(&f->net) == sock_net(sk)) {
                        match = f;
                        break;
                }
        }
        err = -EINVAL;
        if (match) {
                /* Joining: flags and (if given) the size limit must agree. */
                if (match->flags != flags)
                        goto out;
                if (args->max_num_members &&
                    args->max_num_members != match->max_num_members)
                        goto out;
        } else {
                /* Creating a new group. */
                if (args->max_num_members > PACKET_FANOUT_MAX)
                        goto out;
                if (!args->max_num_members)
                        /* legacy PACKET_FANOUT_MAX */
                        args->max_num_members = 256;
                err = -ENOMEM;
                match = kvzalloc(struct_size(match, arr, args->max_num_members),
                                 GFP_KERNEL);
                if (!match)
                        goto out;
                write_pnet(&match->net, sock_net(sk));
                match->id = id;
                match->type = type;
                match->flags = flags;
                INIT_LIST_HEAD(&match->list);
                spin_lock_init(&match->lock);
                /* Starts at 0; raised below once the socket really joins. */
                refcount_set(&match->sk_ref, 0);
                fanout_init_data(match);
                /* The group hook mirrors the first socket's protocol/device. */
                match->prot_hook.type = po->prot_hook.type;
                match->prot_hook.dev = po->prot_hook.dev;
                match->prot_hook.func = packet_rcv_fanout;
                match->prot_hook.af_packet_priv = match;
                match->prot_hook.af_packet_net = read_pnet(&match->net);
                match->prot_hook.id_match = match_fanout_group;
                match->max_num_members = args->max_num_members;
                match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING;
                list_add(&match->list, &fanout_list);
        }
        err = -EINVAL;

        spin_lock(&po->bind_lock);
        /* Only a running socket matching the group's type/proto/dev may join. */
        if (packet_sock_flag(po, PACKET_SOCK_RUNNING) &&
            match->type == type &&
            match->prot_hook.type == po->prot_hook.type &&
            match->prot_hook.dev == po->prot_hook.dev) {
                err = -ENOSPC;
                if (refcount_read(&match->sk_ref) < match->max_num_members) {
                        /* The socket's own hook is replaced by the group hook. */
                        __dev_remove_pack(&po->prot_hook);

                        /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */
                        WRITE_ONCE(po->fanout, match);

                        /* Ownership of the rollover state moves to the socket. */
                        po->rollover = rollover;
                        rollover = NULL;
                        /*
                         * NOTE(review): set(read + 1) rather than
                         * refcount_inc(), presumably because sk_ref may
                         * still be 0 here - confirm.
                         */
                        refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
                        __fanout_link(sk, po);
                        err = 0;
                }
        }
        spin_unlock(&po->bind_lock);

        /* Undo a group that ended up without any member. */
        if (err && !refcount_read(&match->sk_ref)) {
                list_del(&match->list);
                kvfree(match);
        }

out:
        /* NULL after a successful hand-over, so this frees only on failure. */
        kfree(rollover);
        mutex_unlock(&fanout_mutex);
        return err;
}

/*
 * Detach @sk from its fanout group, if it has one.
 *
 * The group's sk_ref is dropped; when that was the last reference the
 * group is removed from fanout_list and returned.  In that case it is the
 * responsibility of the caller to call fanout_release_data() and free the
 * returned packet_fanout (after synchronize_net()).  Otherwise NULL is
 * returned.
 */
static struct packet_fanout *fanout_release(struct sock *sk)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_fanout *last_ref = NULL;
        struct packet_fanout *f;

        mutex_lock(&fanout_mutex);
        f = po->fanout;
        if (f) {
                po->fanout = NULL;

                if (refcount_dec_and_test(&f->sk_ref)) {
                        list_del(&f->list);
                        last_ref = f;
                }
        }
        mutex_unlock(&fanout_mutex);

        return last_ref;
}

/*
 * Decide whether a frame may exceed the normal size limit by a VLAN
 * header: only on Ethernet devices, and only when the frame's Ethernet
 * protocol field really is 802.1Q.
 *
 * Earlier code assumed this would be a VLAN packet; this re-checks with
 * the actual packet in hand.  Resets the skb's mac header as a side
 * effect on Ethernet devices.
 */
static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
                                          struct sk_buff *skb)
{
        if (likely(dev->type == ARPHRD_ETHER)) {
                skb_reset_mac_header(skb);
                return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
        }

        return false;
}

/*
 * Forward declarations of the socket operation tables; the initialised
 * definitions are not in this part of the file.
 * NOTE(review): packet_ops_spkt is presumably the table for legacy
 * SOCK_PACKET sockets - confirm against the definition.
 */
static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

/*
 * Receive hook that queues a frame on the socket stored in
 * pt->af_packet_priv, recording a sockaddr_pkt in the skb control block.
 *
 * Loopback packets and packets from a device in another network
 * namespace are dropped.  Always returns 0; the skb is either queued on
 * the socket or freed.
 */
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_pkt *spkt;

        /*
         * When we registered the protocol we saved the socket in the data
         * field for just this event.
         */

        sk = pt->af_packet_priv;

        /*
         * Yank back the headers [hope the device set this
         * right or kerboom...]
         *
         * Incoming packets have the ll header pulled,
         * push it back.
         *
         * For outgoing ones skb->data == skb_mac_header(skb)
         * so that this procedure is a noop.
         */

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto out;

        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto out;

        /* On failure there is no skb left to free, hence the separate label. */
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (skb == NULL)
                goto oom;

        /* drop any routing info */
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset_ct(skb);

        spkt = &PACKET_SKB_CB(skb)->sa.pkt;

        /* Expose the link-layer header again (see the comment above). */
        skb_push(skb, skb->data - skb_mac_header(skb));

        /*
         * The SOCK_PACKET socket receives _all_ frames.
         */

        spkt->spkt_family = dev->type;
        strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
        spkt->spkt_protocol = skb->protocol;

        /*
         * Charge the memory to the socket. This is done specifically
         * to prevent sockets using all the memory up.
         */

        if (sock_queue_rcv_skb(sk, skb) == 0)
                return 0;

out:
        kfree_skb(skb);
oom:
        return 0;
}

/*
 * Prepare header metadata of an outgoing skb.
 *
 * For SOCK_RAW sockets with no protocol set (or ETH_P_ALL), the protocol
 * is derived from the link-layer header.  For VLAN-tagged frames on
 * Ethernet devices the network header is moved past the tag(s).  Finally
 * the transport header is probed.
 */
static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
{
        if (sock->type == SOCK_RAW &&
            (!skb->protocol || skb->protocol == htons(ETH_P_ALL))) {
                skb_reset_mac_header(skb);
                skb->protocol = dev_parse_header_protocol(skb);
        }

        /* Move network header to the right position for VLAN tagged packets */
        if (likely(skb->dev->type == ARPHRD_ETHER) &&
            eth_type_vlan(skb->protocol)) {
                int depth;

                if (vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
                        skb_set_network_header(skb, depth);
        }

        skb_probe_transport_header(skb);
}

/*
 * Output a raw packet to a device layer. This bypasses all the other
 * protocol layers and you must therefore supply it with a complete frame.
 *
 * The destination device is named in the sockaddr_pkt passed as
 * msg->msg_name.  The function makes two passes over the device checks:
 * the first sizes and fills the skb (which needs the RCU read lock
 * dropped for the GFP_KERNEL allocation), the second re-looks the device
 * up under RCU and transmits.
 *
 * Returns @len on success or a negative errno (-EINVAL, -ENOTCONN,
 * -ENODEV, -ENETDOWN, -EPROTONOSUPPORT, -EMSGSIZE, -ENOBUFS, or an error
 * from copying the payload or parsing control messages).
 */

static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
                               size_t len)
{
        struct sock *sk = sock->sk;
        DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
        struct sk_buff *skb = NULL;
        struct net_device *dev;
        struct sockcm_cookie sockc;
        __be16 proto = 0;
        int err;
        int extra_len = 0;

        /*
         * Get and verify the address.
         */

        if (saddr) {
                if (msg->msg_namelen < sizeof(struct sockaddr))
                        return -EINVAL;
                /* The protocol is only taken from a full-size sockaddr_pkt. */
                if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
                        proto = saddr->spkt_protocol;
        } else
                return -ENOTCONN;        /* SOCK_PACKET must be sent giving an address */

        /*
         * Find the device first to size check it
         */

        /* Force NUL termination of the user-supplied device name. */
        saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
        rcu_read_lock();
        dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
        err = -ENODEV;
        if (dev == NULL)
                goto out_unlock;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        /*
         * You may not queue a frame bigger than the mtu. This is the lowest level
         * raw protocol and you must do your own fragmentation at this level.
         */

        if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
                if (!netif_supports_nofcs(dev)) {
                        err = -EPROTONOSUPPORT;
                        goto out_unlock;
                }
                extra_len = 4; /* We're doing our own CRC */
        }

        /* Coarse limit; allows room for one VLAN header (re-checked below). */
        err = -EMSGSIZE;
        if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
                goto out_unlock;

        /* First pass only: allocate and fill the skb, then start over. */
        if (!skb) {
                size_t reserved = LL_RESERVED_SPACE(dev);
                int tlen = dev->needed_tailroom;
                unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

                /* Leave the RCU section before the GFP_KERNEL allocation. */
                rcu_read_unlock();
                skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
                if (skb == NULL)
                        return -ENOBUFS;
                /* FIXME: Save some space for broken drivers that write a hard
                 * header at transmission time by themselves. PPP is the notable
                 * one here. This should really be fixed at the driver level.
                 */
                skb_reserve(skb, reserved);
                skb_reset_network_header(skb);

                /* Try to align data part correctly */
                if (hhlen) {
                        skb->data -= hhlen;
                        skb->tail -= hhlen;
                        if (len < hhlen)
                                skb_reset_network_header(skb);
                }
                err = memcpy_from_msg(skb_put(skb, len), msg, len);
                if (err)
                        goto out_free;
                /* Repeat the lookup and the checks under a fresh RCU lock. */
                goto retry;
        }

        if (!dev_validate_header(dev, skb->data, len) || !skb->len) {
                err = -EINVAL;
                goto out_unlock;
        }
        /* Over the plain limit is only acceptable for a real VLAN frame. */
        if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
            !packet_extra_vlan_len_allowed(dev, skb)) {
                err = -EMSGSIZE;
                goto out_unlock;
        }

        sockcm_init(&sockc, sk);
        if (msg->msg_controllen) {
                err = sock_cmsg_send(sk, msg, &sockc);
                if (unlikely(err))
                        goto out_unlock;
        }

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = READ_ONCE(sk->sk_priority);
        skb->mark = READ_ONCE(sk->sk_mark);
        skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid);
        skb_setup_tx_timestamp(skb, sockc.tsflags);

        if (unlikely(extra_len == 4))
                skb->no_fcs = 1;

        packet_parse_headers(skb, sock);

        /* The skb is handed to the device queue; it is not freed here. */
        dev_queue_xmit(skb);
        rcu_read_unlock();
        return len;

out_unlock:
        rcu_read_unlock();
out_free:
        /* skb may still be NULL here when a first-pass check failed. */
        kfree_skb(skb);
        return err;
}

/*
 * Run the filter attached to @sk (if any) against @skb.
 *
 * Returns the value produced by bpf_prog_run_clear_cb() when a filter is
 * attached; otherwise the caller-supplied default @res is returned as is.
 */
static unsigned int run_filter(struct sk_buff *skb,
                               const struct sock *sk,
                               unsigned int res)
{
        unsigned int verdict = res;
        struct sk_filter *flt;

        /* sk->sk_filter is read with rcu_dereference(), so stay inside an
         * RCU read-side section for as long as the filter is used.
         */
        rcu_read_lock();
        flt = rcu_dereference(sk->sk_filter);
        if (flt)
                verdict = bpf_prog_run_clear_cb(flt->prog, skb);
        rcu_read_unlock();

        return verdict;
}

/*
 * Build a virtio-net header for @skb and copy @vnet_hdr_sz bytes of it
 * into @msg.
 *
 * @len is the space available to the caller; it is reduced by @vnet_hdr_sz
 * before the header is generated. Returns -EINVAL when @len is too small
 * or the header cannot be derived from the skb, otherwise the result of
 * memcpy_to_msg().
 */
static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
                           size_t *len, int vnet_hdr_sz)
{
        /* Start from a zeroed header so bytes the helper does not set are 0. */
        struct virtio_net_hdr_mrg_rxbuf hdr = { .num_buffers = 0 };
        struct virtio_net_hdr *base = (struct virtio_net_hdr *)&hdr;

        if (*len < vnet_hdr_sz)
                return -EINVAL;
        *len -= vnet_hdr_sz;

        if (virtio_net_hdr_from_skb(skb, base, vio_le(), true, 0) != 0)
                return -EINVAL;

        return memcpy_to_msg(msg, (void *)base, vnet_hdr_sz);
}

/*
 * packet_rcv - receive hook that queues a packet on sk->sk_receive_queue.
 *
 * This function clones the skb lazily, in the hope that most packets
 * are discarded by the BPF filter before a clone is ever needed.
 *
 * Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
 * and skb->cb are modified. This works because (and only as long as)
 * packets arriving here are owned by the current CPU. Output packets are
 * cloned by dev_queue_xmit_nit(), and input packets are processed by
 * net_bh sequentially, so if we return the skb to its original state on
 * exit, we will not harm anyone.
 *
 * Every path returns 0. The skb reference passed in is either handed to
 * the receive queue, released with consume_skb() after cloning, or
 * released with sk_skb_reason_drop().
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
{
        enum skb_drop_reason drop_reason = SKB_CONSUMED;
        struct sock *sk = NULL;
        struct sockaddr_ll *sll;
        struct packet_sock *po;
        /* Remember the original data pointer and length so a shared skb can
         * be put back exactly as it was found.
         */
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        /* Only deliver packets seen in the socket's own network namespace. */
        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto drop;

        skb->dev = dev;

        if (dev_has_header(dev)) {
                /* The device has an explicit notion of ll header,
                 * exported to higher levels.
                 *
                 * Otherwise, the device hides details of its frame
                 * structure, so that corresponding packet head is
                 * never delivered to user.
                 */
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        snaplen = skb->len;

        /* A zero filter verdict drops the packet; a non-zero verdict caps
         * how many bytes are kept.
         */
        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        /* Receive buffer already full: count the drop. */
        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
                goto drop_n_acct;

        if (skb_shared(skb)) {
                /* Someone else also holds this skb: queue a private clone,
                 * restore the original's data/len and release our reference
                 * to it.
                 */
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
                if (nskb == NULL)
                        goto drop_n_acct;

                if (skb_head != skb->data) {
                        skb->data = skb_head;
                        skb->len = skb_len;
                }
                consume_skb(skb);
                skb = nskb;
        }

        sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);

        /* Stash the link-layer address information in the skb control block. */
        sll = &PACKET_SKB_CB(skb)->sa.ll;
        sll->sll_hatype = dev->type;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

        /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
         * Use their space for storing the original skb length.
         */
        PACKET_SKB_CB(skb)->sa.origlen = skb->len;

        /* Truncate to the snap length chosen above. */
        if (pskb_trim(skb, snaplen))
                goto drop_n_acct;

        skb_set_owner_r(skb, sk);
        skb->dev = NULL;
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset_ct(skb);

        /* Packet counter and queue insertion are done under the queue lock. */
        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.stats1.tp_packets++;
        sock_skb_set_dropcount(sk, skb);
        skb_clear_delivery_time(skb);
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        spin_unlock(&sk->sk_receive_queue.lock);
        sk->sk_data_ready(sk);
        return 0;

        /* Drop that is counted in both the packet-socket and socket drop
         * counters and reported with a packet-socket error drop reason.
         */
drop_n_acct:
        atomic_inc(&po->tp_drops);
        atomic_inc(&sk->sk_drops);
        drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR;

        /* Undo the push/pull done above, but only when the skb is shared. */
drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        sk_skb_reason_drop(sk, skb, drop_reason);
        return 0;
}

/*
 * tpacket_rcv - receive hook that copies a packet into the socket's RX ring.
 *
 * Runs the socket filter, computes where the MAC and network headers land
 * inside a ring frame (macoff/netoff), clamps the snap length to what the
 * frame can hold, reserves a frame under sk_receive_queue.lock, copies the
 * packet bytes and fills in the version-specific frame header followed by a
 * struct sockaddr_ll. For TPACKET_V1/V2 the frame status is published at the
 * end; for TPACKET_V3 the block fill status is cleared instead.
 *
 * When a V1/V2 packet does not fit in a frame and po->copy_thresh is set,
 * a reference to (or clone of) the full skb is additionally queued on
 * sk_receive_queue and the frame is flagged TP_STATUS_COPY.
 *
 * Like packet_rcv(), this may modify skb->data/skb->len of a shared skb and
 * restores them before returning. Every path returns 0.
 */
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev)
{
        enum skb_drop_reason drop_reason = SKB_CONSUMED;
        struct sock *sk = NULL;
        struct packet_sock *po;
        struct sockaddr_ll *sll;
        union tpacket_uhdr h;
        /* Saved so a shared skb can be restored on exit. */
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;
        unsigned long status = TP_STATUS_USER;
        unsigned short macoff, hdrlen;
        unsigned int netoff;
        struct sk_buff *copy_skb = NULL;
        struct timespec64 ts;
        __u32 ts_status;
        unsigned int slot_id = 0;
        int vnet_hdr_sz = 0;

        /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
         * We may add members to them until current aligned size without forcing
         * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
         */
        BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
        BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        /* Only deliver packets seen in the socket's own network namespace. */
        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto drop;

        if (dev_has_header(dev)) {
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        snaplen = skb->len;

        /* A zero filter verdict drops the packet; a non-zero verdict later
         * caps snaplen.
         */
        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;

        /* If we are flooded, just give up */
        if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
                atomic_inc(&po->tp_drops);
                goto drop_n_restore;
        }

        /* Translate checksum/GSO state of the skb into frame status bits. */
        if (skb->ip_summed == CHECKSUM_PARTIAL)
                status |= TP_STATUS_CSUMNOTREADY;
        else if (skb->pkt_type != PACKET_OUTGOING &&
                 skb_csum_unnecessary(skb))
                status |= TP_STATUS_CSUM_VALID;
        if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
                status |= TP_STATUS_GSO_TCP;

        if (snaplen > res)
                snaplen = res;

        /* Work out the offsets of the MAC and network headers inside the
         * frame. For SOCK_DGRAM both offsets coincide; otherwise room is
         * left for the link-layer header (at least 16 bytes) and, if
         * configured, a virtio-net header in front of the MAC header.
         */
        if (sk->sk_type == SOCK_DGRAM) {
                macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
                                  po->tp_reserve;
        } else {
                unsigned int maclen = skb_network_offset(skb);
                netoff = TPACKET_ALIGN(po->tp_hdrlen +
                                       (maclen < 16 ? 16 : maclen)) +
                                       po->tp_reserve;
                vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
                if (vnet_hdr_sz)
                        netoff += vnet_hdr_sz;
                macoff = netoff - maclen;
        }
        /* netoff is checked against USHRT_MAX before being written to the
         * frame header below.
         */
        if (netoff > USHRT_MAX) {
                atomic_inc(&po->tp_drops);
                goto drop_n_restore;
        }
        if (po->tp_version <= TPACKET_V2) {
                /* V1/V2: the packet must fit in one fixed-size frame. */
                if (macoff + snaplen > po->rx_ring.frame_size) {
                        /* Optionally keep the full packet on the regular
                         * receive queue when it will be truncated in the ring.
                         */
                        if (READ_ONCE(po->copy_thresh) &&
                            atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
                                if (skb_shared(skb)) {
                                        copy_skb = skb_clone(skb, GFP_ATOMIC);
                                } else {
                                        copy_skb = skb_get(skb);
                                        /* Treat the current data pointer as
                                         * the original from now on, so the
                                         * restore step at exit is skipped.
                                         */
                                        skb_head = skb->data;
                                }
                                if (copy_skb) {
                                        memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0,
                                               sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
                                        skb_set_owner_r(copy_skb, sk);
                                }
                        }
                        snaplen = po->rx_ring.frame_size - macoff;
                        /* macoff alone exceeds the frame: copy nothing and
                         * skip the virtio-net header.
                         */
                        if ((int)snaplen < 0) {
                                snaplen = 0;
                                vnet_hdr_sz = 0;
                        }
                }
        } else if (unlikely(macoff + snaplen >
                            GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
                /* V3: clamp to the maximum frame length of the block ring. */
                u32 nval;

                nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
                pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
                            snaplen, nval, macoff);
                snaplen = nval;
                if (unlikely((int)snaplen < 0)) {
                        snaplen = 0;
                        macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
                        vnet_hdr_sz = 0;
                }
        }
        /* Frame reservation and ring bookkeeping happen under the queue lock. */
        spin_lock(&sk->sk_receive_queue.lock);
        h.raw = packet_current_rx_frame(po, skb,
                                        TP_STATUS_KERNEL, (macoff+snaplen));
        if (!h.raw)
                goto drop_n_account;

        if (po->tp_version <= TPACKET_V2) {
                /* Claim the slot in rx_owner_map while it is filled outside
                 * the lock; a slot that is already claimed is treated as a
                 * drop. The bit is cleared again once the status is published.
                 */
                slot_id = po->rx_ring.head;
                if (test_bit(slot_id, po->rx_ring.rx_owner_map))
                        goto drop_n_account;
                __set_bit(slot_id, po->rx_ring.rx_owner_map);
        }

        /* Write the virtio-net header immediately in front of the MAC header. */
        if (vnet_hdr_sz &&
            virtio_net_hdr_from_skb(skb, h.raw + macoff -
                                    sizeof(struct virtio_net_hdr),
                                    vio_le(), true, 0)) {
                if (po->tp_version == TPACKET_V3)
                        prb_clear_blk_fill_status(&po->rx_ring);
                goto drop_n_account;
        }

        if (po->tp_version <= TPACKET_V2) {
                packet_increment_rx_head(po, &po->rx_ring);
                /*
                 * LOSING will be reported till you read the stats,
                 * because it's COR - Clear On Read.
                 * Anyways, moving it for V1/V2 only as V3 doesn't need this
                 * at packet level.
                 */
                if (atomic_read(&po->tp_drops))
                        status |= TP_STATUS_LOSING;
        }

        po->stats.stats1.tp_packets++;
        if (copy_skb) {
                status |= TP_STATUS_COPY;
                skb_clear_delivery_time(copy_skb);
                __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
        }
        spin_unlock(&sk->sk_receive_queue.lock);

        /* Copy the packet bytes into the frame outside the lock. */
        skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

        /* Always timestamp; prefer an existing software timestamp taken
         * closer to the time of capture.
         */
        ts_status = tpacket_get_timestamp(skb, &ts,
                                          READ_ONCE(po->tp_tstamp) |
                                          SOF_TIMESTAMPING_SOFTWARE);
        if (!ts_status)
                ktime_get_real_ts64(&ts);

        status |= ts_status;

        /* Fill in the version-specific frame header. */
        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_len = skb->len;
                h.h1->tp_snaplen = snaplen;
                h.h1->tp_mac = macoff;
                h.h1->tp_net = netoff;
                h.h1->tp_sec = ts.tv_sec;
                h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
                hdrlen = sizeof(*h.h1);
                break;
        case TPACKET_V2:
                h.h2->tp_len = skb->len;
                h.h2->tp_snaplen = snaplen;
                h.h2->tp_mac = macoff;
                h.h2->tp_net = netoff;
                h.h2->tp_sec = ts.tv_sec;
                h.h2->tp_nsec = ts.tv_nsec;
                if (skb_vlan_tag_present(skb)) {
                        h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
                        h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
                        status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
                } else {
                        h.h2->tp_vlan_tci = 0;
                        h.h2->tp_vlan_tpid = 0;
                }
                memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
                hdrlen = sizeof(*h.h2);
                break;
        case TPACKET_V3:
                /* tp_nxt_offset,vlan are already populated above.
                 * So DONT clear those fields here
                 */
                h.h3->tp_status |= status;
                h.h3->tp_len = skb->len;
                h.h3->tp_snaplen = snaplen;
                h.h3->tp_mac = macoff;
                h.h3->tp_net = netoff;
                h.h3->tp_sec  = ts.tv_sec;
                h.h3->tp_nsec = ts.tv_nsec;
                memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
                hdrlen = sizeof(*h.h3);
                break;
        default:
                BUG();
        }

        /* The sockaddr_ll follows the (aligned) frame header. */
        sll = h.raw + TPACKET_ALIGN(hdrlen);
        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        /* Full barrier between filling the frame and updating its status
         * below.
         */
        smp_mb();

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
        if (po->tp_version <= TPACKET_V2) {
                u8 *start, *end;

                /* Flush every page touched by the frame header and payload. */
                end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
                                        macoff + snaplen);

                for (start = h.raw; start < end; start += PAGE_SIZE)
                        flush_dcache_page(pgv_to_page(start));
        }
        smp_wmb();
#endif

        if (po->tp_version <= TPACKET_V2) {
                /* Publish the status and release the slot claimed above. */
                spin_lock(&sk->sk_receive_queue.lock);
                __packet_set_status(po, h.raw, status);
                __clear_bit(slot_id, po->rx_ring.rx_owner_map);
                spin_unlock(&sk->sk_receive_queue.lock);
                sk->sk_data_ready(sk);
        } else if (po->tp_version == TPACKET_V3) {
                prb_clear_blk_fill_status(&po->rx_ring);
        }

        /* Reached on success as well: undo the push/pull on a shared skb and
         * release our reference (drop_reason is still SKB_CONSUMED then).
         */
drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        sk_skb_reason_drop(sk, skb, drop_reason);
        return 0;

        /* Entered with sk_receive_queue.lock held: unlock, count the drop,
         * wake the reader and release the optional full-packet copy.
         */
drop_n_account:
        spin_unlock(&sk->sk_receive_queue.lock);
        atomic_inc(&po->tp_drops);
        drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR;

        sk->sk_data_ready(sk);
        sk_skb_reason_drop(sk, copy_skb, drop_reason);
        goto drop_n_restore;
}

/*
 * skb destructor installed by tpacket_snd() for TX-ring packets.
 *
 * While the TX ring is still mapped, the frame that backed this skb is
 * handed back to user space (status TP_STATUS_AVAILABLE plus any timestamp
 * flags), the ring's pending count is decremented and a waiter on
 * po->skb_completion is woken. The socket write accounting is released in
 * all cases.
 */
static void tpacket_destruct_skb(struct sk_buff *skb)
{
        struct packet_sock *po = pkt_sk(skb->sk);

        if (likely(po->tx_ring.pg_vec)) {
                /* Frame pointer stored on the skb by tpacket_fill_skb(). */
                void *frame = skb_zcopy_get_nouarg(skb);
                __u32 ts_flags;

                packet_dec_pending(&po->tx_ring);

                ts_flags = __packet_set_timestamp(po, frame, skb);
                __packet_set_status(po, frame, TP_STATUS_AVAILABLE | ts_flags);

                complete(&po->skb_completion);
        }

        sock_wfree(skb);
}

/*
 * Sanity-check a virtio-net header supplied on the send path.
 *
 * When a checksum is requested, hdr_len is raised (in place) so that it
 * covers at least csum_start + csum_offset + 2 bytes. Returns -EINVAL if
 * the resulting hdr_len exceeds @len, 0 otherwise.
 */
static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
{
        if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
                /* First byte past the 2-byte checksum field. */
                unsigned int csum_end;

                csum_end = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start);
                csum_end += __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset);
                csum_end += 2;

                if (csum_end > __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len))
                        vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(), csum_end);
        }

        if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
                return -EINVAL;

        return 0;
}

/*
 * Pull the virtio-net header off the front of @msg and validate it.
 *
 * @len is reduced by @vnet_hdr_sz. Only sizeof(struct virtio_net_hdr)
 * bytes are copied into @vnet_hdr; any remaining header bytes are skipped
 * so the iterator ends up at the start of the MAC header.
 *
 * Returns 0 on success, -EINVAL if the message is too short or the header
 * fails validation, -EFAULT if the header cannot be copied.
 */
static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
                                 struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz)
{
        size_t trailing;
        int err;

        if (*len < vnet_hdr_sz)
                return -EINVAL;
        *len -= vnet_hdr_sz;

        if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
                return -EFAULT;

        err = __packet_snd_vnet_parse(vnet_hdr, *len);
        if (err)
                return err;

        /* move iter to point to the start of mac header */
        trailing = vnet_hdr_sz - sizeof(struct virtio_net_hdr);
        if (trailing)
                iov_iter_advance(&msg->msg_iter, trailing);

        return 0;
}

/*
 * tpacket_fill_skb - populate @skb from one TX-ring frame.
 *
 * @po:      packet socket owning the ring
 * @skb:     freshly allocated skb to fill
 * @frame:   start of the ring frame; stored on the skb so the destructor
 *           can find it again
 * @dev:     output device
 * @data:    start of the packet bytes inside the frame
 * @tp_len:  number of packet bytes
 * @proto:   protocol to set on the skb
 * @addr:    destination link-layer address (used for SOCK_DGRAM only)
 * @hlen:    headroom to reserve in the skb
 * @copylen: number of leading bytes to copy into the linear area
 *           (non-SOCK_DGRAM only)
 * @sockc:   per-call control data (transmit time, timestamp flags)
 *
 * For SOCK_DGRAM the link-layer header is built with dev_hard_header().
 * Otherwise up to @copylen leading bytes are copied into the linear area
 * and validated as a link-layer header. The remaining bytes are not copied:
 * the ring pages holding them are attached to the skb as page fragments.
 *
 * Returns @tp_len on success or a negative errno.
 */
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
                void *frame, struct net_device *dev, void *data, int tp_len,
                __be16 proto, unsigned char *addr, int hlen, int copylen,
                const struct sockcm_cookie *sockc)
{
        union tpacket_uhdr ph;
        int to_write, offset, len, nr_frags, len_max;
        struct socket *sock = po->sk.sk_socket;
        struct page *page;
        int err;

        ph.raw = frame;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = READ_ONCE(po->sk.sk_priority);
        skb->mark = READ_ONCE(po->sk.sk_mark);
        skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid);
        skb_setup_tx_timestamp(skb, sockc->tsflags);
        /* Remember the frame; tpacket_destruct_skb() reads it back with
         * skb_zcopy_get_nouarg().
         */
        skb_zcopy_set_nouarg(skb, ph.raw);

        skb_reserve(skb, hlen);
        skb_reset_network_header(skb);

        to_write = tp_len;

        if (sock->type == SOCK_DGRAM) {
                /* Let the device build the link-layer header. */
                err = dev_hard_header(skb, dev, ntohs(proto), addr,
                                NULL, tp_len);
                if (unlikely(err < 0))
                        return -EINVAL;
        } else if (copylen) {
                /* Never copy more than the packet actually contains. */
                int hdrlen = min_t(int, copylen, tp_len);

                /* Make room for copylen bytes in the linear area:
                 * hard_header_len in front of the network header and the
                 * rest behind it.
                 */
                skb_push(skb, dev->hard_header_len);
                skb_put(skb, copylen - dev->hard_header_len);
                err = skb_store_bits(skb, 0, data, hdrlen);
                if (unlikely(err))
                        return err;
                if (!dev_validate_header(dev, skb->data, hdrlen))
                        return -EINVAL;

                data += hdrlen;
                to_write -= hdrlen;
        }

        /* The first fragment may start in the middle of a page. */
        offset = offset_in_page(data);
        len_max = PAGE_SIZE - offset;
        len = ((to_write > len_max) ? len_max : to_write);

        /* Account for the paged bytes up front, on the skb and against the
         * socket's write memory.
         */
        skb->data_len = to_write;
        skb->len += to_write;
        skb->truesize += to_write;
        refcount_add(to_write, &po->sk.sk_wmem_alloc);

        /* Attach the ring pages as fragments, one page at a time. */
        while (likely(to_write)) {
                nr_frags = skb_shinfo(skb)->nr_frags;

                if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
                        pr_err("Packet exceed the number of skb frags(%u)\n",
                               (unsigned int)MAX_SKB_FRAGS);
                        return -EFAULT;
                }

                page = pgv_to_page(data);
                data += len;
                flush_dcache_page(page);
                /* The fragment holds its own reference on the ring page. */
                get_page(page);
                skb_fill_page_desc(skb, nr_frags, page, offset, len);
                to_write -= len;
                /* Every fragment after the first starts at a page boundary. */
                offset = 0;
                len_max = PAGE_SIZE;
                len = ((to_write > len_max) ? len_max : to_write);
        }

        packet_parse_headers(skb, sock);

        return tp_len;
}

/*
 * tpacket_parse_header - read length and data offset from a TX-ring frame.
 *
 * @po:       packet socket owning the ring
 * @frame:    start of the ring frame (header written by user space)
 * @size_max: largest packet length the caller accepts
 * @data:     out: set to the first packet byte inside the frame
 *
 * The length is taken from the version-specific frame header. When the
 * socket has PACKET_SOCK_TX_HAS_OFF set, the data offset is taken from
 * the header as well (tp_net for SOCK_DGRAM, tp_mac otherwise) and must
 * keep the packet inside the frame; otherwise the data is assumed to
 * start at tp_hdrlen - sizeof(struct sockaddr_ll).
 *
 * Returns the packet length (>= 0) on success, -EINVAL for an unsupported
 * TPACKET_V3 layout or an out-of-range offset, and -EMSGSIZE when the
 * length is out of range.
 */
static int tpacket_parse_header(struct packet_sock *po, void *frame,
                                int size_max, void **data)
{
        union tpacket_uhdr ph;
        int tp_len, off;

        ph.raw = frame;

        switch (po->tp_version) {
        case TPACKET_V3:
                if (ph.h3->tp_next_offset != 0) {
                        pr_warn_once("variable sized slot not supported\n");
                        return -EINVAL;
                }
                tp_len = ph.h3->tp_len;
                break;
        case TPACKET_V2:
                tp_len = ph.h2->tp_len;
                break;
        default:
                tp_len = ph.h1->tp_len;
                break;
        }
        /* The length comes from the user-writable frame header and is held
         * in an int here. A value that ends up negative must be rejected
         * explicitly: it would pass the upper-bound test and then be
         * returned to the caller, which treats any negative return as an
         * errno.
         */
        if (unlikely(tp_len < 0 || tp_len > size_max)) {
                pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
                return -EMSGSIZE;
        }

        if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) {
                int off_min, off_max;

                /* The offset may not point into the frame header, and the
                 * packet may not run past the end of the frame.
                 */
                off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
                off_max = po->tx_ring.frame_size - tp_len;
                if (po->sk.sk_type == SOCK_DGRAM) {
                        switch (po->tp_version) {
                        case TPACKET_V3:
                                off = ph.h3->tp_net;
                                break;
                        case TPACKET_V2:
                                off = ph.h2->tp_net;
                                break;
                        default:
                                off = ph.h1->tp_net;
                                break;
                        }
                } else {
                        switch (po->tp_version) {
                        case TPACKET_V3:
                                off = ph.h3->tp_mac;
                                break;
                        case TPACKET_V2:
                                off = ph.h2->tp_mac;
                                break;
                        default:
                                off = ph.h1->tp_mac;
                                break;
                        }
                }
                if (unlikely((off < off_min) || (off_max < off)))
                        return -EINVAL;
        } else {
                off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
        }

        *data = frame + off;
        return tp_len;
}

static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
        struct sk_buff *skb = NULL;
        struct net_device *dev;
        struct virtio_net_hdr *vnet_hdr = NULL;
        struct sockcm_cookie sockc;
        __be16 proto;
        int err, reserve = 0;
        void *ph;
        DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
        bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
        int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
        unsigned char *addr = NULL;
        int tp_len, size_max;
        void *data;
        int len_sum = 0;
        int status = TP_STATUS_AVAILABLE;
        int hlen, tlen, copylen = 0;
        long timeo = 0;

        mutex_lock(&po->pg_vec_lock);

        /* packet_sendmsg() check on tx_ring.pg_vec was lockless,
         * we need to confirm it under protection of pg_vec_lock.
         */
        if (unlikely(!po->tx_ring.pg_vec)) {
                err = -EBUSY;
                goto out;
        }
        if (likely(saddr == NULL)) {
                dev        = packet_cached_dev_get(po);
                proto        = READ_ONCE(po->num);
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen
                                        + offsetof(struct sockaddr_ll,
                                                sll_addr)))
                        goto out;
                proto        = saddr->sll_protocol;
                dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
                if (po->sk.sk_socket->type == SOCK_DGRAM) {
                        if (dev && msg->msg_namelen < dev->addr_len +
                                   offsetof(struct sockaddr_ll, sll_addr))
                                goto out_put;
                        addr = saddr->sll_addr;
                }
        }

        err = -ENXIO;
        if (unlikely(dev == NULL))
                goto out;
        err = -ENETDOWN;
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_put;

        sockcm_init(&sockc, &po->sk);
        if (msg->msg_controllen) {
                err = sock_cmsg_send(&po->sk, msg, &sockc);
                if (unlikely(err))
                        goto out_put;
        }

        if (po->sk.sk_socket->type == SOCK_RAW)
                reserve = dev->hard_header_len;
        size_max = po->tx_ring.frame_size
                - (po->tp_hdrlen - sizeof(struct sockaddr_ll));

        if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz)
                size_max = dev->mtu + reserve + VLAN_HLEN;

        reinit_completion(&po->skb_completion);

        do {
                ph = packet_current_frame(po, &po->tx_ring,
                                          TP_STATUS_SEND_REQUEST);
                if (unlikely(ph == NULL)) {
                        if (need_wait && skb) {
                                timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
                                timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
                                if (timeo <= 0) {
                                        err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
                                        goto out_put;
                                }
                        }
                        /* check for additional frames */
                        continue;
                }

                skb = NULL;
                tp_len = tpacket_parse_header(po, ph, size_max, &data);
                if (tp_len < 0)
                        goto tpacket_error;

                status = TP_STATUS_SEND_REQUEST;
                hlen = LL_RESERVED_SPACE(dev);
                tlen = dev->needed_tailroom;
                if (vnet_hdr_sz) {
                        vnet_hdr = data;
                        data += vnet_hdr_sz;
                        tp_len -= vnet_hdr_sz;
                        if (tp_len < 0 ||
                            __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
                                tp_len = -EINVAL;
                                goto tpacket_error;
                        }
                        copylen = __virtio16_to_cpu(vio_le(),
                                                    vnet_hdr->hdr_len);
                }
                copylen = max_t(int, copylen, dev->hard_header_len);
                skb = sock_alloc_send_skb(&po->sk,
                                hlen + tlen + sizeof(struct sockaddr_ll) +
                                (copylen - dev->hard_header_len),
                                !need_wait, &err);

                if (unlikely(skb == NULL)) {
                        /* we assume the socket was initially writeable ... */
                        if (likely(len_sum > 0))
                                err = len_sum;
                        goto out_status;
                }
                tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
                                          addr, hlen, copylen, &sockc);
                if (likely(tp_len >= 0) &&
                    tp_len > dev->mtu + reserve &&
                    !vnet_hdr_sz &&
                    !packet_extra_vlan_len_allowed(dev, skb))
                        tp_len = -EMSGSIZE;

                if (unlikely(tp_len < 0)) {
tpacket_error:
                        if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) {
                                __packet_set_status(po, ph,
                                                TP_STATUS_AVAILABLE);
                                packet_increment_head(&po->tx_ring);
                                kfree_skb(skb);
                                continue;
                        } else {
                                status = TP_STATUS_WRONG_FORMAT;
                                err = tp_len;
                                goto out_status;
                        }
                }

                if (vnet_hdr_sz) {
                        if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
                                tp_len = -EINVAL;
                                goto tpacket_error;
                        }
                        virtio_net_hdr_set_proto(skb, vnet_hdr);
                }

                skb->destructor = tpacket_destruct_skb;
                __packet_set_status(po, ph, TP_STATUS_SENDING);
                packet_inc_pending(&po->tx_ring);

                status = TP_STATUS_SEND_REQUEST;
                err = packet_xmit(po, skb);
                if (unlikely(err != 0)) {
                        if (err > 0)
                                err = net_xmit_errno(err);
                        if (err && __packet_get_status(po, ph) ==
                                   TP_STATUS_AVAILABLE) {
                                /* skb was destructed already */
                                skb = NULL;
                                goto out_status;
                        }
                        /*
                         * skb was dropped but not destructed yet;
                         * let's treat it like congestion or err < 0
                         */
                        err = 0;
                }
                packet_increment_head(&po->tx_ring);
                len_sum += tp_len;
        } while (likely((ph != NULL) ||
                /* Note: packet_read_pending() might be slow if we have
                 * to call it as it's per_cpu variable, but in fast-path
                 * we already short-circuit the loop with the first
                 * condition, and luckily don't have to go that path
                 * anyway.
                 */
                 (need_wait && packet_read_pending(&po->tx_ring))));

        err = len_sum;
        goto out_put;

out_status:
        __packet_set_status(po, ph, status);
        kfree_skb(skb);
out_put:
        dev_put(dev);
out:
        mutex_unlock(&po->pg_vec_lock);
        return err;
}

/*
 * Allocate a send skb with @linear bytes in the linear area and the rest
 * of @len as paged data.
 *
 * @prepad is extra linear space requested from the allocator in front of
 * the data and @reserve is the headroom reserved afterwards. When the
 * whole packet fits in under a page, or no linear size was requested,
 * everything is made linear. The paged part is capped at
 * MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); any excess is
 * moved into the linear area.
 *
 * Returns the skb, or NULL with *@err set by sock_alloc_send_pskb().
 */
static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
                                        size_t reserve, size_t len,
                                        size_t linear, int noblock,
                                        int *err)
{
        const size_t max_paged = MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
        struct sk_buff *skb;
        size_t paged;

        /* Under a page?  Don't bother with paged skb. */
        if (!linear || prepad + len < PAGE_SIZE)
                linear = len;

        if (len - linear > max_paged)
                linear = len - max_paged;
        paged = len - linear;

        skb = sock_alloc_send_pskb(sk, prepad + linear, paged, noblock,
                                   err, PAGE_ALLOC_COSTLY_ORDER);
        if (!skb)
                return NULL;

        skb_reserve(skb, reserve);
        skb_put(skb, linear);
        /* skb_put() only covers the linear part; add the paged bytes here. */
        skb->data_len = paged;
        skb->len += paged;

        return skb;
}

/*
 * Send one message on a packet socket through the non-ring path
 * (packet_sendmsg() dispatches here when no TX ring is configured).
 *
 * The destination device and protocol come from msg->msg_name when an
 * address is supplied, otherwise from the socket's cached device and
 * po->num. The payload is copied from msg->msg_iter into a freshly
 * allocated skb which is then handed to packet_xmit().
 *
 * Returns the number of bytes sent (with vnet_hdr_sz added back when a
 * virtio-net header was parsed) or a negative errno.
 */
static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        unsigned char *addr = NULL;
        int err, reserve = 0;
        struct sockcm_cookie sockc;
        struct virtio_net_hdr vnet_hdr = { 0 };
        int offset = 0;
        struct packet_sock *po = pkt_sk(sk);
        int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
        int hlen, tlen, linear;
        int extra_len = 0;

        /*
         *        Get and verify the address.
         */

        if (likely(saddr == NULL)) {
                /* No address given: use the socket's cached device/protocol. */
                dev        = packet_cached_dev_get(po);
                proto        = READ_ONCE(po->num);
        } else {
                err = -EINVAL;
                /* dev has not been looked up yet, hence "out" (no dev_put). */
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
                        goto out;
                proto        = saddr->sll_protocol;
                dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
                if (sock->type == SOCK_DGRAM) {
                        /* The address must cover the device's full hw address. */
                        if (dev && msg->msg_namelen < dev->addr_len +
                                   offsetof(struct sockaddr_ll, sll_addr))
                                goto out_unlock;
                        addr = saddr->sll_addr;
                }
        }

        /* From here on every exit path drops the device via dev_put(). */
        err = -ENXIO;
        if (unlikely(dev == NULL))
                goto out_unlock;
        err = -ENETDOWN;
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_unlock;

        /* Start from the socket defaults, then apply any control messages. */
        sockcm_init(&sockc, sk);
        sockc.mark = READ_ONCE(sk->sk_mark);
        if (msg->msg_controllen) {
                err = sock_cmsg_send(sk, msg, &sockc);
                if (unlikely(err))
                        goto out_unlock;
        }

        /* SOCK_RAW payloads include the link-layer header themselves. */
        if (sock->type == SOCK_RAW)
                reserve = dev->hard_header_len;
        if (vnet_hdr_sz) {
                /* NOTE(review): presumably consumes the header and shrinks
                 * len accordingly (len is passed by pointer) - confirm.
                 */
                err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz);
                if (err)
                        goto out_unlock;
        }

        if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
                if (!netif_supports_nofcs(dev)) {
                        err = -EPROTONOSUPPORT;
                        goto out_unlock;
                }
                extra_len = 4; /* We're doing our own CRC */
        }

        /* Early size check for non-GSO packets, with VLAN_HLEN of slack;
         * the stricter check happens once the data has been copied in.
         */
        err = -EMSGSIZE;
        if (!vnet_hdr.gso_type &&
            (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
                goto out_unlock;

        err = -ENOBUFS;
        hlen = LL_RESERVED_SPACE(dev);
        tlen = dev->needed_tailroom;
        /* Linear area: the vnet hdr_len hint, but never less than the
         * link-layer header (or the whole message when that is shorter).
         */
        linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
        linear = max(linear, min_t(int, len, dev->hard_header_len));
        skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
                               msg->msg_flags & MSG_DONTWAIT, &err);
        if (skb == NULL)
                goto out_unlock;

        skb_reset_network_header(skb);

        err = -EINVAL;
        if (sock->type == SOCK_DGRAM) {
                /* Build the link-layer header; offset is where data goes. */
                offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
                if (unlikely(offset < 0))
                        goto out_free;
        } else if (reserve) {
                /* Move the data pointer back by hard_header_len - presumably
                 * so the user-supplied link-layer header lands in front of
                 * the network header marked above (TODO confirm).
                 */
                skb_reserve(skb, -reserve);
                if (len < reserve + sizeof(struct ipv6hdr) &&
                    dev->min_header_len != dev->hard_header_len)
                        skb_reset_network_header(skb);
        }

        /* Returns -EFAULT on error */
        err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
        if (err)
                goto out_free;

        /* Reject empty skbs and, for SOCK_RAW, invalid link-layer headers. */
        if ((sock->type == SOCK_RAW &&
             !dev_validate_header(dev, skb->data, len)) || !skb->len) {
                err = -EINVAL;
                goto out_free;
        }

        skb_setup_tx_timestamp(skb, sockc.tsflags);

        /* Over the MTU without the VLAN slack: only allowed when
         * packet_extra_vlan_len_allowed() accepts this skb.
         */
        if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
            !packet_extra_vlan_len_allowed(dev, skb)) {
                err = -EMSGSIZE;
                goto out_free;
        }

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = READ_ONCE(sk->sk_priority);
        skb->mark = sockc.mark;
        skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid);

        if (unlikely(extra_len == 4))
                skb->no_fcs = 1;

        packet_parse_headers(skb, sock);

        if (vnet_hdr_sz) {
                err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
                if (err)
                        goto out_free;
                /* Report the vnet header as part of the bytes sent. */
                len += vnet_hdr_sz;
                virtio_net_hdr_set_proto(skb, &vnet_hdr);
        }

        err = packet_xmit(po, skb);

        /* Positive codes are translated by net_xmit_errno(); if that yields
         * 0 the send counts as successful. The skb is not freed here on
         * failure - presumably packet_xmit() has already consumed it.
         */
        if (unlikely(err != 0)) {
                if (err > 0)
                        err = net_xmit_errno(err);
                if (err)
                        goto out_unlock;
        }

        dev_put(dev);

        return len;

out_free:
        kfree_skb(skb);
out_unlock:
        dev_put(dev);
out:
        return err;
}

/*
 * sendmsg() entry point: sockets with a TX ring go through tpacket_snd(),
 * everything else through packet_snd().
 */
static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
        struct packet_sock *po = pkt_sk(sock->sk);

        /* This peek at tx_ring.pg_vec is done without pg_vec_lock and is
         * therefore racy; tpacket_snd() repeats the check safely.
         */
        if (!data_race(po->tx_ring.pg_vec))
                return packet_snd(sock, msg, len);

        return tpacket_snd(po, msg);
}

/*
 *        Close a PACKET socket. This is fairly simple. We immediately go
 *        to 'closed' state and remove our protocol entry in the device list.
 *
 *        Teardown order: unlink from the per-netns socket list, unregister
 *        the protocol hook and drop the bound device, flush the multicast
 *        list, tear down any RX/TX rings, leave the fanout group, wait in
 *        synchronize_net(), then free the remaining per-socket data and
 *        drop the socket reference.
 *
 *        Always returns 0.
 */

static int packet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po;
        struct packet_fanout *f;
        struct net *net;
        union tpacket_req_u req_u;

        /* Nothing attached to this socket: nothing to release. */
        if (!sk)
                return 0;

        net = sock_net(sk);
        po = pkt_sk(sk);

        /* Remove the socket from the namespace's packet socket list. */
        mutex_lock(&net->packet.sklist_lock);
        sk_del_node_init_rcu(sk);
        mutex_unlock(&net->packet.sklist_lock);

        sock_prot_inuse_add(net, sk->sk_prot, -1);

        /* Detach from the device under bind_lock. */
        spin_lock(&po->bind_lock);
        unregister_prot_hook(sk, false);
        packet_cached_dev_reset(po);

        if (po->prot_hook.dev) {
                netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);
                po->prot_hook.dev = NULL;
        }
        spin_unlock(&po->bind_lock);

        packet_flush_mclist(sk);

        /* An all-zero request with closing=1 asks packet_set_ring() to tear
         * the ring down; the last argument selects RX (0) or TX (1).
         */
        lock_sock(sk);
        if (po->rx_ring.pg_vec) {
                memset(&req_u, 0, sizeof(req_u));
                packet_set_ring(sk, &req_u, 1, 0);
        }

        if (po->tx_ring.pg_vec) {
                memset(&req_u, 0, sizeof(req_u));
                packet_set_ring(sk, &req_u, 1, 1);
        }
        release_sock(sk);

        /* NOTE(review): a non-NULL return presumably means this socket was
         * the last member and the fanout group must be freed - confirm.
         */
        f = fanout_release(sk);

        /* The frees below happen only after synchronize_net() returns. */
        synchronize_net();

        kfree(po->rollover);
        if (f) {
                fanout_release_data(f);
                kvfree(f);
        }
        /*
         *        Now the socket is dead. No more input will appear.
         */
        sock_orphan(sk);
        sock->sk = NULL;

        /* Purge queues */

        skb_queue_purge(&sk->sk_receive_queue);
        packet_free_pending(po);

        sock_put(sk);
        return 0;
}

/*
 *        Attach a packet hook.
 *
 *        Binds the socket to a device and protocol. The device is looked up
 *        by @name when given, otherwise by @ifindex when non-zero; with
 *        neither, the socket ends up with no device (dev == NULL). A zero
 *        @proto means "keep the socket's current po->num".
 *
 *        Runs under lock_sock(), po->bind_lock and rcu_read_lock().
 *
 *        Returns 0 on success, -EINVAL when the socket has a fanout
 *        (po->fanout set) or -ENODEV when the requested device is not
 *        found. A device that is down or has vanished is not a bind
 *        failure: ENETDOWN is recorded in sk->sk_err and 0 is returned.
 */

static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
                          __be16 proto)
{
        struct packet_sock *po = pkt_sk(sk);
        struct net_device *dev = NULL;
        bool unlisted = false;
        bool need_rehook;
        int ret = 0;

        lock_sock(sk);
        spin_lock(&po->bind_lock);
        if (!proto)
                proto = po->num;

        rcu_read_lock();

        if (po->fanout) {
                ret = -EINVAL;
                goto out_unlock;
        }

        if (name) {
                dev = dev_get_by_name_rcu(sock_net(sk), name);
                if (!dev) {
                        ret = -ENODEV;
                        goto out_unlock;
                }
        } else if (ifindex) {
                dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
                if (!dev) {
                        ret = -ENODEV;
                        goto out_unlock;
                }
        }

        /* Only re-register the hook when protocol or device really change. */
        need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev;

        if (need_rehook) {
                /* Pin dev across the window below where RCU is dropped.
                 * dev may be NULL here (unbound bind); NOTE(review):
                 * dev_hold()/dev_put() are assumed NULL-safe - confirm.
                 */
                dev_hold(dev);
                if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
                        rcu_read_unlock();
                        /* prevents packet_notifier() from calling
                         * register_prot_hook()
                         */
                        WRITE_ONCE(po->num, 0);
                        __unregister_prot_hook(sk, true);
                        rcu_read_lock();
                        /* Re-check that the device is still listed after
                         * the RCU read section was left and re-entered.
                         */
                        if (dev)
                                unlisted = !dev_get_by_index_rcu(sock_net(sk),
                                                                 dev->ifindex);
                }

                BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING));
                WRITE_ONCE(po->num, proto);
                po->prot_hook.type = proto;

                /* Drop the tracked reference on the previously bound device. */
                netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);

                if (unlikely(unlisted)) {
                        /* Device went away meanwhile: mark socket unbound. */
                        po->prot_hook.dev = NULL;
                        WRITE_ONCE(po->ifindex, -1);
                        packet_cached_dev_reset(po);
                } else {
                        netdev_hold(dev, &po->prot_hook.dev_tracker,
                                    GFP_ATOMIC);
                        po->prot_hook.dev = dev;
                        WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
                        packet_cached_dev_assign(po, dev);
                }
                /* Pairs with the dev_hold() at the top of this branch. */
                dev_put(dev);
        }

        /* Protocol 0 or an unchanged binding: nothing to (re)register. */
        if (proto == 0 || !need_rehook)
                goto out_unlock;

        if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
                register_prot_hook(sk);
        } else {
                /* Device is down or gone: report it as a socket error. */
                sk->sk_err = ENETDOWN;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk_error_report(sk);
        }

out_unlock:
        rcu_read_unlock();
        spin_unlock(&po->bind_lock);
        release_sock(sk);
        return ret;
}

/*
 *        Bind a packet socket to a device, identified by the interface name
 *        carried in uaddr->sa_data. The protocol is left unchanged
 *        (packet_do_bind() is called with proto == 0).
 *
 *        Returns -EINVAL unless addr_len is exactly sizeof(struct sockaddr),
 *        otherwise whatever packet_do_bind() returns.
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
                            int addr_len)
{
        char name[sizeof(uaddr->sa_data_min) + 1];

        /* Check legality */
        if (addr_len != sizeof(struct sockaddr))
                return -EINVAL;

        /* sa_data is supplied by userspace and need not be NUL-terminated,
         * so copy it into a buffer one byte larger and terminate it here.
         */
        memcpy(name, uaddr->sa_data, sizeof(name) - 1);
        name[sizeof(name) - 1] = 0;

        return packet_do_bind(sock->sk, name, 0, 0);
}

/*
 * bind() taking a struct sockaddr_ll: bind by interface index and protocol.
 *
 * Returns -EINVAL when the address is shorter than struct sockaddr_ll or
 * its family is not AF_PACKET, otherwise the result of packet_do_bind().
 */
static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;

        /* Check legality; the length test guards the sll_family read. */
        if (addr_len < sizeof(struct sockaddr_ll) ||
            sll->sll_family != AF_PACKET)
                return -EINVAL;

        return packet_do_bind(sock->sk, NULL, sll->sll_ifindex,
                              sll->sll_protocol);
}

/* Protocol descriptor passed to sk_alloc() when a packet socket is created;
 * obj_size makes each allocated sock large enough for a struct packet_sock.
 */
static struct proto packet_proto = {
        .owner    = THIS_MODULE,
        .name     = "PACKET",
        .obj_size = sizeof(struct packet_sock),
};

/*
 *        Create a packet socket (SOCK_RAW, SOCK_DGRAM or SOCK_PACKET).
 *
 *        @net:      network namespace the socket lives in
 *        @sock:     socket being set up
 *        @protocol: protocol number in network byte order, 0 for none
 *        @kern:     passed through to sk_alloc()
 *
 *        Returns 0 on success, -EPERM without CAP_NET_RAW in the
 *        namespace's user namespace, -ESOCKTNOSUPPORT for any other socket
 *        type, -ENOBUFS when the sock cannot be allocated, or the error
 *        from packet_alloc_pending().
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
                         int kern)
{
        struct sock *sk;
        struct packet_sock *po;
        __be16 proto = (__force __be16)protocol; /* weird, but documented */
        int err;

        if (!ns_capable(net->user_ns, CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
            sock->type != SOCK_PACKET)
                return -ESOCKTNOSUPPORT;

        sock->state = SS_UNCONNECTED;

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
        if (sk == NULL)
                goto out;

        sock->ops = &packet_ops;
        if (sock->type == SOCK_PACKET)
                sock->ops = &packet_ops_spkt;

        /*
         * Do the only fallible step before sock_init_data(): that call
         * attaches sk to sock (sock->sk = sk), so failing afterwards and
         * calling sk_free() would leave sock->sk pointing at freed memory.
         */
        po = pkt_sk(sk);
        err = packet_alloc_pending(po);
        if (err)
                goto out_sk_free;

        /* Nothing below can fail, so it is now safe to link sk and sock. */
        sock_init_data(sock, sk);

        init_completion(&po->skb_completion);
        sk->sk_family = PF_PACKET;
        po->num = proto;

        packet_cached_dev_reset(po);

        sk->sk_destruct = packet_sock_destruct;

        /*
         *        Attach a protocol block
         */

        spin_lock_init(&po->bind_lock);
        mutex_init(&po->pg_vec_lock);
        po->rollover = NULL;
        po->prot_hook.func = packet_rcv;

        if (sock->type == SOCK_PACKET)
                po->prot_hook.func = packet_rcv_spkt;

        po->prot_hook.af_packet_priv = sk;
        po->prot_hook.af_packet_net = sock_net(sk);

        /* A non-zero protocol starts receiving immediately. */
        if (proto) {
                po->prot_hook.type = proto;
                __register_prot_hook(sk);
        }

        mutex_lock(&net->packet.sklist_lock);
        sk_add_node_tail_rcu(sk, &net->packet.sklist);
        mutex_unlock(&net->packet.sklist_lock);

        sock_prot_inuse_add(net, &packet_proto, 1);

        return 0;
out_sk_free:
        sk_free(sk);
out:
        return err;
}

/*
 *        Pull a packet from our receive queue and hand it to the user.
 *        If necessary we block.
 *
 *        With MSG_ERRQUEUE the request is served from the socket error
 *        queue via sock_recv_errqueue() instead. Otherwise one skb is
 *        dequeued, optionally preceded by a virtio-net header, copied to
 *        the user buffer (truncating and setting MSG_TRUNC if it does not
 *        fit), and the source address and PACKET_AUXDATA control message
 *        are filled in when requested.
 *
 *        Returns vnet_hdr_len plus the number of bytes copied (or plus the
 *        full skb length when MSG_TRUNC was passed in @flags), or a
 *        negative errno.
 */

static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                          int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int copied, err;
        int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz);
        unsigned int origlen = 0;

        /* Reject any flag outside the supported set. */
        err = -EINVAL;
        if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
                goto out;

#if 0
        /* What error should we return now? EUNATTACH? */
        if (pkt_sk(sk)->ifindex < 0)
                return -ENODEV;
#endif

        if (flags & MSG_ERRQUEUE) {
                err = sock_recv_errqueue(sk, msg, len,
                                         SOL_PACKET, PACKET_TX_TIMESTAMP);
                goto out;
        }

        /*
         *        Call the generic datagram receiver. This handles all sorts
         *        of horrible races and re-entrancy so we can forget about it
         *        in the protocol layers.
         *
         *        Now it will return ENETDOWN, if device have just gone down,
         *        but then it will block.
         */

        skb = skb_recv_datagram(sk, flags, &err);

        /*
         *        An error occurred so return it. Because skb_recv_datagram()
         *        handles the blocking we don't see and worry about blocking
         *        retries.
         */

        if (skb == NULL)
                goto out;

        packet_rcv_try_clear_pressure(pkt_sk(sk));

        if (vnet_hdr_len) {
                /* NOTE(review): presumably writes the virtio-net header to
                 * msg and reduces len (passed by pointer) - confirm.
                 */
                err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
                if (err)
                        goto out_free;
        }

        /* You lose any data beyond the buffer you gave. If it worries
         * a user program they can ask the device for its MTU
         * anyway.
         */
        copied = skb->len;
        if (copied > len) {
                copied = len;
                msg->msg_flags |= MSG_TRUNC;
        }

        err = skb_copy_datagram_msg(skb, 0, msg, copied);
        if (err)
                goto out_free;

        if (sock->type != SOCK_PACKET) {
                struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;

                /* Original length was stored in sockaddr_ll fields */
                /* Read origlen first: the sll writes below reuse that cb area. */
                origlen = PACKET_SKB_CB(skb)->sa.origlen;
                sll->sll_family = AF_PACKET;
                sll->sll_protocol = skb->protocol;
        }

        sock_recv_cmsgs(msg, sk, skb);

        if (msg->msg_name) {
                /* Upper bound on how much of skb->cb may be copied out. */
                const size_t max_len = min(sizeof(skb->cb),
                                           sizeof(struct sockaddr_storage));
                int copy_len;

                /* If the address length field is there to be filled
                 * in, we fill it in now.
                 */
                if (sock->type == SOCK_PACKET) {
                        __sockaddr_check_size(sizeof(struct sockaddr_pkt));
                        msg->msg_namelen = sizeof(struct sockaddr_pkt);
                        copy_len = msg->msg_namelen;
                } else {
                        struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;

                        msg->msg_namelen = sll->sll_halen +
                                offsetof(struct sockaddr_ll, sll_addr);
                        copy_len = msg->msg_namelen;
                        /* Short hardware address: report a full sockaddr_ll
                         * and zero sll_addr so the bytes not covered by the
                         * memcpy() below are not left uninitialised.
                         */
                        if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
                                memset(msg->msg_name +
                                       offsetof(struct sockaddr_ll, sll_addr),
                                       0, sizeof(sll->sll_addr));
                                msg->msg_namelen = sizeof(struct sockaddr_ll);
                        }
                }
                /* Never copy more than max_len out of the control block. */
                if (WARN_ON_ONCE(copy_len > max_len)) {
                        copy_len = max_len;
                        msg->msg_namelen = copy_len;
                }
                memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
        }

        if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) {
                struct tpacket_auxdata aux;

                /* Build the PACKET_AUXDATA cmsg: status bits, lengths,
                 * header offsets and VLAN information for this skb.
                 */
                aux.tp_status = TP_STATUS_USER;
                if (skb->ip_summed == CHECKSUM_PARTIAL)
                        aux.tp_status |= TP_STATUS_CSUMNOTREADY;
                else if (skb->pkt_type != PACKET_OUTGOING &&
                         skb_csum_unnecessary(skb))
                        aux.tp_status |= TP_STATUS_CSUM_VALID;
                if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
                        aux.tp_status |= TP_STATUS_GSO_TCP;

                aux.tp_len = origlen;
                aux.tp_snaplen = skb->len;
                aux.tp_mac = 0;
                aux.tp_net = skb_network_offset(skb);
                if (skb_vlan_tag_present(skb)) {
                        aux.tp_vlan_tci = skb_vlan_tag_get(skb);
                        aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
                        aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
                } else {
                        aux.tp_vlan_tci = 0;
                        aux.tp_vlan_tpid = 0;
                }
                put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
        }

        /*
         *        Free or return the buffer as appropriate. Again this
         *        hides all the races and re-entrancy issues from us.
         */
        /* With MSG_TRUNC requested, report the full packet length. */
        err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

out_free:
        skb_free_datagram(sk, skb);
out:
        return err;
}

/*
 * getname() variant that reports the bound device by name in a plain
 * struct sockaddr: sa_family is AF_PACKET and sa_data holds the interface
 * name, or stays all zeroes when the socket's ifindex matches no device.
 *
 * Returns sizeof(struct sockaddr), or -EOPNOTSUPP when the peer address is
 * requested.
 */
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
                               int peer)
{
        struct sock *sk = sock->sk;
        struct net_device *bound;

        if (peer)
                return -EOPNOTSUPP;

        uaddr->sa_family = AF_PACKET;
        memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min));

        /* The device is only looked at inside the RCU read section. */
        rcu_read_lock();
        bound = dev_get_by_index_rcu(sock_net(sk),
                                     READ_ONCE(pkt_sk(sk)->ifindex));
        if (bound)
                strscpy(uaddr->sa_data, bound->name,
                        sizeof(uaddr->sa_data_min));
        rcu_read_unlock();

        return sizeof(*uaddr);
}

/*
 * getname() reporting the socket's binding as a struct sockaddr_ll:
 * family, ifindex and protocol always; hardware type, address length and
 * address only when the ifindex still resolves to a device.
 *
 * Returns the number of valid bytes in the address (the sockaddr_ll header
 * up to sll_addr plus sll_halen), or -EOPNOTSUPP when the peer address is
 * requested.
 */
static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
                          int peer)
{
        DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        struct net_device *dev;
        int ifindex;

        if (peer)
                return -EOPNOTSUPP;

        ifindex = READ_ONCE(po->ifindex);

        sll->sll_family = AF_PACKET;
        sll->sll_ifindex = ifindex;
        sll->sll_protocol = READ_ONCE(po->num);
        sll->sll_pkttype = 0;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
        if (!dev) {
                /* Bad: we have no ARPHRD_UNSPEC */
                sll->sll_hatype = 0;
                sll->sll_halen = 0;
        } else {
                sll->sll_hatype = dev->type;
                sll->sll_halen = dev->addr_len;

                /* Address the destination through sockaddr_storage so that
                 * __fortify_memcpy_chk() knows the actual buffer size.
                 */
                memcpy(((struct sockaddr_storage *)sll)->__data +
                       offsetof(struct sockaddr_ll, sll_addr) -
                       offsetofend(struct sockaddr_ll, sll_family),
                       dev->dev_addr, dev->addr_len);
        }
        rcu_read_unlock();

        return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
}

static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
                         int what)
{
        switch (i->type) {
        case PACKET_MR_MULTICAST:
                if (i->alen != dev->addr_len)
                        return -EINVAL;
                if (what > 0)
                        return dev_mc_add(dev, i->addr);
                else
                        return dev_mc_del(dev, i->addr);
                break;
        case PACKET_MR_PROMISC:
                return dev_set_promiscuity(dev, what);
        case PACKET_MR_ALLMULTI:
                return dev_set_allmulti(dev, what);
        case PACKET_MR_UNICAST:
                if (i->alen != dev->addr_len)
                        return -EINVAL;
                if (what > 0)
                        return dev_uc_add(dev, i->addr);
                else
                        return dev_uc_del(dev, i->addr);
                break;
        default:
                break;
        }
        return 0;
}

/*
 * Remove from the list headed at *@mlp every entry that refers to @dev
 * (matched by ifindex): undo it on the device, unlink it and free it.
 */
static void packet_dev_mclist_delete(struct net_device *dev,
                                     struct packet_mclist **mlp)
{
        struct packet_mclist *cur = *mlp;

        while (cur) {
                if (cur->ifindex != dev->ifindex) {
                        /* Not ours: step over it. */
                        mlp = &cur->next;
                } else {
                        packet_dev_mc(dev, cur, -1);
                        *mlp = cur->next;
                        kfree(cur);
                }
                cur = *mlp;
        }
}

/*
 * Add a membership described by @mreq to the socket, under rtnl_lock().
 *
 * An identical entry (same ifindex, type, length and address) only has its
 * count bumped. Otherwise a new entry is pushed on po->mclist and applied
 * to the device; if the device rejects it the entry is removed again.
 *
 * Returns 0 on success, -ENODEV for an unknown ifindex, -EINVAL when the
 * address is longer than the device's, -ENOBUFS on allocation failure, or
 * the error from packet_dev_mc().
 */
static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_mclist *entry, *cur;
        struct net_device *dev;
        int err;

        rtnl_lock();

        dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
        if (!dev) {
                err = -ENODEV;
                goto done;
        }

        if (mreq->mr_alen > dev->addr_len) {
                err = -EINVAL;
                goto done;
        }

        /* Allocated up front, before the duplicate search. */
        entry = kmalloc(sizeof(*entry), GFP_KERNEL);
        if (entry == NULL) {
                err = -ENOBUFS;
                goto done;
        }

        for (cur = po->mclist; cur; cur = cur->next) {
                if (cur->ifindex != mreq->mr_ifindex ||
                    cur->type != mreq->mr_type ||
                    cur->alen != mreq->mr_alen ||
                    memcmp(cur->addr, mreq->mr_address, cur->alen) != 0)
                        continue;

                /* Already a member: count it and drop the spare element. */
                cur->count++;
                kfree(entry);
                err = 0;
                goto done;
        }

        entry->type = mreq->mr_type;
        entry->ifindex = mreq->mr_ifindex;
        entry->alen = mreq->mr_alen;
        memcpy(entry->addr, mreq->mr_address, entry->alen);
        /* Zero the unused tail of the address buffer. */
        memset(entry->addr + entry->alen, 0,
               sizeof(entry->addr) - entry->alen);
        entry->count = 1;

        /* Link first, then apply; unlink again if the device refuses. */
        entry->next = po->mclist;
        po->mclist = entry;
        err = packet_dev_mc(dev, entry, 1);
        if (err) {
                po->mclist = entry->next;
                kfree(entry);
        }

done:
        rtnl_unlock();
        return err;
}

/*
 * Drop one reference on the membership matching @mreq, under rtnl_lock().
 * When the count reaches zero the entry is unlinked, undone on the device
 * (if the device still exists) and freed. Always returns 0, including when
 * no entry matches.
 */
static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_mclist **link;
        struct packet_mclist *ml;

        rtnl_lock();

        for (link = &pkt_sk(sk)->mclist; (ml = *link) != NULL;
             link = &ml->next) {
                struct net_device *dev;

                if (ml->ifindex != mreq->mr_ifindex ||
                    ml->type != mreq->mr_type ||
                    ml->alen != mreq->mr_alen ||
                    memcmp(ml->addr, mreq->mr_address, ml->alen) != 0)
                        continue;

                /* Only the first matching entry is considered. */
                if (--ml->count == 0) {
                        *link = ml->next;
                        dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
                        if (dev)
                                packet_dev_mc(dev, ml, -1);
                        kfree(ml);
                }
                break;
        }

        rtnl_unlock();
        return 0;
}

/*
 * Release every membership held by the socket: each entry is popped off
 * po->mclist, undone on its device when that device still exists, and
 * freed. rtnl_lock() is only taken when the list is non-empty.
 */
static void packet_flush_mclist(struct sock *sk)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_mclist *head;

        if (!po->mclist)
                return;

        rtnl_lock();
        for (head = po->mclist; head != NULL; head = po->mclist) {
                struct net_device *dev;

                dev = __dev_get_by_index(sock_net(sk), head->ifindex);
                po->mclist = head->next;
                if (dev)
                        packet_dev_mc(dev, head, -1);
                kfree(head);
        }
        rtnl_unlock();
}

/*
 * setsockopt() handler for SOL_PACKET options.
 *
 * Each case validates optlen, copies the value in from @optval and applies
 * it. Options that affect ring layout (version, reserve, loss, vnet header
 * size) are refused with -EBUSY once an RX or TX ring exists.
 *
 * Returns 0 on success, -ENOPROTOOPT for another level or an unknown
 * option, -EINVAL for a bad length or value, -EFAULT when the copy from
 * @optval fails (the ring options report -EINVAL there instead), -EBUSY as
 * described above, or the result of the helper the option dispatches to.
 */
static int
packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
                  unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        int ret;

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        switch (optname) {
        case PACKET_ADD_MEMBERSHIP:
        case PACKET_DROP_MEMBERSHIP:
        {
                struct packet_mreq_max mreq;
                int len = optlen;
                /* Zero first: a short request leaves the tail well-defined. */
                memset(&mreq, 0, sizeof(mreq));
                if (len < sizeof(struct packet_mreq))
                        return -EINVAL;
                /* Longer requests are truncated to the largest known form. */
                if (len > sizeof(mreq))
                        len = sizeof(mreq);
                if (copy_from_sockptr(&mreq, optval, len))
                        return -EFAULT;
                /* The copied data must cover the whole claimed address. */
                if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
                        return -EINVAL;
                if (optname == PACKET_ADD_MEMBERSHIP)
                        ret = packet_mc_add(sk, &mreq);
                else
                        ret = packet_mc_drop(sk, &mreq);
                return ret;
        }

        case PACKET_RX_RING:
        case PACKET_TX_RING:
        {
                union tpacket_req_u req_u;

                /* The request layout depends on the configured version:
                 * V1/V2 use req, everything else uses req3. ret stays
                 * -EINVAL when optlen is too short for that layout.
                 */
                ret = -EINVAL;
                lock_sock(sk);
                switch (po->tp_version) {
                case TPACKET_V1:
                case TPACKET_V2:
                        if (optlen < sizeof(req_u.req))
                                break;
                        ret = copy_from_sockptr(&req_u.req, optval,
                                                sizeof(req_u.req)) ?
                                                -EINVAL : 0;
                        break;
                case TPACKET_V3:
                default:
                        if (optlen < sizeof(req_u.req3))
                                break;
                        ret = copy_from_sockptr(&req_u.req3, optval,
                                                sizeof(req_u.req3)) ?
                                                -EINVAL : 0;
                        break;
                }
                if (!ret)
                        ret = packet_set_ring(sk, &req_u, 0,
                                              optname == PACKET_TX_RING);
                release_sock(sk);
                return ret;
        }
        case PACKET_COPY_THRESH:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;

                WRITE_ONCE(pkt_sk(sk)->copy_thresh, val);
                return 0;
        }
        case PACKET_VERSION:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;
                /* Only the three known TPACKET versions are accepted. */
                switch (val) {
                case TPACKET_V1:
                case TPACKET_V2:
                case TPACKET_V3:
                        break;
                default:
                        return -EINVAL;
                }
                lock_sock(sk);
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
                        ret = -EBUSY;
                } else {
                        po->tp_version = val;
                        ret = 0;
                }
                release_sock(sk);
                return ret;
        }
        case PACKET_RESERVE:
        {
                unsigned int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;
                /* Values above INT_MAX are rejected. */
                if (val > INT_MAX)
                        return -EINVAL;
                lock_sock(sk);
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
                        ret = -EBUSY;
                } else {
                        po->tp_reserve = val;
                        ret = 0;
                }
                release_sock(sk);
                return ret;
        }
        case PACKET_LOSS:
        {
                unsigned int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;

                lock_sock(sk);
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
                        ret = -EBUSY;
                } else {
                        packet_sock_flag_set(po, PACKET_SOCK_TP_LOSS, val);
                        ret = 0;
                }
                release_sock(sk);
                return ret;
        }
        case PACKET_AUXDATA:
        {
                int val;

                /* Longer buffers are accepted; only the first int is read. */
                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;

                packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val);
                return 0;
        }
        case PACKET_ORIGDEV:
        {
                int val;

                /* Longer buffers are accepted; only the first int is read. */
                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;

                packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val);
                return 0;
        }
        case PACKET_VNET_HDR:
        case PACKET_VNET_HDR_SZ:
        {
                int val, hdr_len;

                /* virtio-net headers are only available on SOCK_RAW. */
                if (sock->type != SOCK_RAW)
                        return -EINVAL;
                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;

                if (optname == PACKET_VNET_HDR_SZ) {
                        /* Explicit size: 0 (off) or one of the two known
                         * header struct sizes.
                         */
                        if (val && val != sizeof(struct virtio_net_hdr) &&
                            val != sizeof(struct virtio_net_hdr_mrg_rxbuf))
                                return -EINVAL;
                        hdr_len = val;
                } else {
                        /* Boolean form selects the basic header size. */
                        hdr_len = val ? sizeof(struct virtio_net_hdr) : 0;
                }
                lock_sock(sk);
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
                        ret = -EBUSY;
                } else {
                        WRITE_ONCE(po->vnet_hdr_sz, hdr_len);
                        ret = 0;
                }
                release_sock(sk);
                return ret;
        }
        case PACKET_TIMESTAMP:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;

                WRITE_ONCE(po->tp_tstamp, val);
                return 0;
        }
        case PACKET_FANOUT:
        {
                /* Zero-initialised so the short (int-sized) form leaves the
                 * remaining fields at 0.
                 */
                struct fanout_args args = { 0 };

                if (optlen != sizeof(int) && optlen != sizeof(args))
                        return -EINVAL;
                if (copy_from_sockptr(&args, optval, optlen))
                        return -EFAULT;

                return fanout_add(sk, &args);
        }
        case PACKET_FANOUT_DATA:
        {
                /* Paired with the WRITE_ONCE() in fanout_add() */
                if (!READ_ONCE(po->fanout))
                        return -EINVAL;

                return fanout_set_data(po, optval, optlen);
        }
        case PACKET_IGNORE_OUTGOING:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;
                /* Strictly boolean: only 0 and 1 are valid. */
                if (val < 0 || val > 1)
                        return -EINVAL;

                WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val);
                return 0;
        }
        case PACKET_TX_HAS_OFF:
        {
                unsigned int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;

                /* Unlike the other ring-dependent options, this one is
                 * silently ignored (still returning 0) once a ring exists.
                 */
                lock_sock(sk);
                if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec)
                        packet_sock_flag_set(po, PACKET_SOCK_TX_HAS_OFF, val);

                release_sock(sk);
                return 0;
        }
        case PACKET_QDISC_BYPASS:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;

                packet_sock_flag_set(po, PACKET_SOCK_QDISC_BYPASS, val);
                return 0;
        }
        default:
                return -ENOPROTOOPT;
        }
}

/*
 * getsockopt() handler for SOL_PACKET options.
 *
 * Reads the caller's buffer length from @optlen, computes the option value
 * into either the local 'val' or one of the larger statistics structures,
 * then copies back at most the size of that value and stores the number of
 * bytes returned in @optlen.
 *
 * Returns 0 on success, -ENOPROTOOPT for a level other than SOL_PACKET or an
 * unknown option, -EINVAL for a negative length or an option-specific
 * validation failure, and -EFAULT when a user-space access fails.
 */
static int packet_getsockopt(struct socket *sock, int level, int optname,
                             char __user *optval, int __user *optlen)
{
        int len;
        /* 'data'/'lv' describe what is copied out; they default to 'val'. */
        int val, lv = sizeof(val);
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        void *data = &val;
        union tpacket_stats_u st;
        struct tpacket_rollover_stats rstats;
        int drops;

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;

        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case PACKET_STATISTICS:
                /*
                 * Reading the statistics also resets them: po->stats is
                 * snapshotted and zeroed under the receive-queue lock, and
                 * the drop counter is fetched and cleared atomically.
                 */
                spin_lock_bh(&sk->sk_receive_queue.lock);
                memcpy(&st, &po->stats, sizeof(st));
                memset(&po->stats, 0, sizeof(po->stats));
                spin_unlock_bh(&sk->sk_receive_queue.lock);
                drops = atomic_xchg(&po->tp_drops, 0);

                /* Drops are reported separately and also added to tp_packets. */
                if (po->tp_version == TPACKET_V3) {
                        lv = sizeof(struct tpacket_stats_v3);
                        st.stats3.tp_drops = drops;
                        st.stats3.tp_packets += drops;
                        data = &st.stats3;
                } else {
                        lv = sizeof(struct tpacket_stats);
                        st.stats1.tp_drops = drops;
                        st.stats1.tp_packets += drops;
                        data = &st.stats1;
                }

                break;
        case PACKET_AUXDATA:
                val = packet_sock_flag(po, PACKET_SOCK_AUXDATA);
                break;
        case PACKET_ORIGDEV:
                val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
                break;
        case PACKET_VNET_HDR:
                /* Boolean view of the same field PACKET_VNET_HDR_SZ returns. */
                val = !!READ_ONCE(po->vnet_hdr_sz);
                break;
        case PACKET_VNET_HDR_SZ:
                val = READ_ONCE(po->vnet_hdr_sz);
                break;
        case PACKET_COPY_THRESH:
                val = READ_ONCE(pkt_sk(sk)->copy_thresh);
                break;
        case PACKET_VERSION:
                val = po->tp_version;
                break;
        case PACKET_HDRLEN:
                /*
                 * In/out option: the user buffer supplies a TPACKET version
                 * and receives the size of that version's frame header.
                 * The buffer must hold at least one int.
                 */
                if (len > sizeof(int))
                        len = sizeof(int);
                if (len < sizeof(int))
                        return -EINVAL;
                if (copy_from_user(&val, optval, len))
                        return -EFAULT;
                switch (val) {
                case TPACKET_V1:
                        val = sizeof(struct tpacket_hdr);
                        break;
                case TPACKET_V2:
                        val = sizeof(struct tpacket2_hdr);
                        break;
                case TPACKET_V3:
                        val = sizeof(struct tpacket3_hdr);
                        break;
                default:
                        return -EINVAL;
                }
                break;
        case PACKET_RESERVE:
                val = po->tp_reserve;
                break;
        case PACKET_LOSS:
                val = packet_sock_flag(po, PACKET_SOCK_TP_LOSS);
                break;
        case PACKET_TIMESTAMP:
                val = READ_ONCE(po->tp_tstamp);
                break;
        case PACKET_FANOUT:
                /*
                 * Pack the fanout id (low bits), type (shifted by 16) and
                 * flags (shifted by 24) into one value; 0 when the socket
                 * has no fanout group.
                 */
                val = (po->fanout ?
                       ((u32)po->fanout->id |
                        ((u32)po->fanout->type << 16) |
                        ((u32)po->fanout->flags << 24)) :
                       0);
                break;
        case PACKET_IGNORE_OUTGOING:
                val = READ_ONCE(po->prot_hook.ignore_outgoing);
                break;
        case PACKET_ROLLOVER_STATS:
                /* Only available when a rollover structure is attached. */
                if (!po->rollover)
                        return -EINVAL;
                rstats.tp_all = atomic_long_read(&po->rollover->num);
                rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
                rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
                data = &rstats;
                lv = sizeof(rstats);
                break;
        case PACKET_TX_HAS_OFF:
                val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF);
                break;
        case PACKET_QDISC_BYPASS:
                val = packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS);
                break;
        default:
                return -ENOPROTOOPT;
        }

        /* Never copy more than the value's size; report the bytes returned. */
        if (len > lv)
                len = lv;
        if (put_user(len, optlen))
                return -EFAULT;
        if (copy_to_user(optval, data, len))
                return -EFAULT;
        return 0;
}

/*
 * Netdevice notifier callback for packet sockets.
 *
 * Walks every packet socket in the device's network namespace under
 * rcu_read_lock() and reacts to NETDEV_UNREGISTER, NETDEV_DOWN and
 * NETDEV_UP for the device carried in @ptr.  Other events are ignored.
 * Always returns NOTIFY_DONE.
 */
static int packet_notifier(struct notifier_block *this,
                           unsigned long msg, void *ptr)
{
        struct sock *sk;
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);

        rcu_read_lock();
        sk_for_each_rcu(sk, &net->packet.sklist) {
                struct packet_sock *po = pkt_sk(sk);

                switch (msg) {
                case NETDEV_UNREGISTER:
                        /*
                         * Drop this device from the socket's mclist (for any
                         * socket, bound to the device or not), then handle
                         * the rest like NETDEV_DOWN.
                         */
                        if (po->mclist)
                                packet_dev_mclist_delete(dev, &po->mclist);
                        fallthrough;

                case NETDEV_DOWN:
                        /* Only sockets bound to this interface are affected. */
                        if (dev->ifindex == po->ifindex) {
                                spin_lock(&po->bind_lock);
                                if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
                                        /* Stop receiving and flag ENETDOWN on the socket. */
                                        __unregister_prot_hook(sk, false);
                                        sk->sk_err = ENETDOWN;
                                        if (!sock_flag(sk, SOCK_DEAD))
                                                sk_error_report(sk);
                                }
                                if (msg == NETDEV_UNREGISTER) {
                                        /*
                                         * The device is going away: forget the
                                         * cached device, mark the socket as
                                         * unbound (ifindex -1) and release the
                                         * device reference held by the hook.
                                         */
                                        packet_cached_dev_reset(po);
                                        WRITE_ONCE(po->ifindex, -1);
                                        netdev_put(po->prot_hook.dev,
                                                   &po->prot_hook.dev_tracker);
                                        po->prot_hook.dev = NULL;
                                }
                                spin_unlock(&po->bind_lock);
                        }
                        break;
                case NETDEV_UP:
                        /* Re-attach the hook when the socket has a protocol set. */
                        if (dev->ifindex == po->ifindex) {
                                spin_lock(&po->bind_lock);
                                if (po->num)
                                        register_prot_hook(sk);
                                spin_unlock(&po->bind_lock);
                        }
                        break;
                }
        }
        rcu_read_unlock();
        return NOTIFY_DONE;
}


/*
 * ioctl() handler for packet sockets.
 *
 * SIOCOUTQ stores sk_wmem_alloc_get() for the socket in the user int at
 * @arg.  SIOCINQ stores the length of the skb at the head of the receive
 * queue, or 0 when the queue is empty.  With CONFIG_INET, a fixed set of
 * routing, ARP and interface-address requests is forwarded to
 * inet_dgram_ops.ioctl().
 *
 * Returns the result of put_user() or of the forwarded handler, or
 * -ENOIOCTLCMD for any command not listed here.  Every switch arm returns,
 * so no statement follows the switch.
 */
static int packet_ioctl(struct socket *sock, unsigned int cmd,
                        unsigned long arg)
{
        struct sock *sk = sock->sk;

        switch (cmd) {
        case SIOCOUTQ:
        {
                int amount = sk_wmem_alloc_get(sk);

                return put_user(amount, (int __user *)arg);
        }
        case SIOCINQ:
        {
                struct sk_buff *skb;
                int amount = 0;

                /* Peek under the queue lock so the skb cannot go away. */
                spin_lock_bh(&sk->sk_receive_queue.lock);
                skb = skb_peek(&sk->sk_receive_queue);
                if (skb)
                        amount = skb->len;
                spin_unlock_bh(&sk->sk_receive_queue.lock);
                return put_user(amount, (int __user *)arg);
        }
#ifdef CONFIG_INET
        case SIOCADDRT:
        case SIOCDELRT:
        case SIOCDARP:
        case SIOCGARP:
        case SIOCSARP:
        case SIOCGIFADDR:
        case SIOCSIFADDR:
        case SIOCGIFBRDADDR:
        case SIOCSIFBRDADDR:
        case SIOCGIFNETMASK:
        case SIOCSIFNETMASK:
        case SIOCGIFDSTADDR:
        case SIOCSIFDSTADDR:
        case SIOCSIFFLAGS:
                return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

        default:
                return -ENOIOCTLCMD;
        }
}

/*
 * poll() handler for packet sockets.
 *
 * Starts from the datagram_poll() mask and adds ring-buffer readiness:
 * readable when an RX ring exists and packet_previous_rx_frame(...,
 * TP_STATUS_KERNEL) returns NULL, writable when a TX ring exists and
 * packet_current_frame(..., TP_STATUS_AVAILABLE) returns a frame.
 * packet_rcv_try_clear_pressure() is called on every poll while the
 * receive-queue lock is held.
 */
static __poll_t packet_poll(struct file *file, struct socket *sock,
                                poll_table *wait)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        __poll_t events = datagram_poll(file, sock, wait);

        /* Receive side, serialized by the receive-queue lock. */
        spin_lock_bh(&sk->sk_receive_queue.lock);
        if (po->rx_ring.pg_vec &&
            !packet_previous_rx_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
                events |= EPOLLIN | EPOLLRDNORM;
        packet_rcv_try_clear_pressure(po);
        spin_unlock_bh(&sk->sk_receive_queue.lock);

        /* Transmit side, serialized by the write-queue lock. */
        spin_lock_bh(&sk->sk_write_queue.lock);
        if (po->tx_ring.pg_vec &&
            packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
                events |= EPOLLOUT | EPOLLWRNORM;
        spin_unlock_bh(&sk->sk_write_queue.lock);

        return events;
}


/* Dirty? Well, I still have not found a better way to account
 * for user mmaps.
 */

/*
 * VMA open callback: counts one more user mapping in the socket's 'mapped'
 * counter.  Does nothing when the socket behind the file has no sock.
 */
static void packet_mm_open(struct vm_area_struct *vma)
{
        struct socket *sock = vma->vm_file->private_data;
        struct sock *sk = sock->sk;

        if (!sk)
                return;

        atomic_long_inc(&pkt_sk(sk)->mapped);
}

/*
 * VMA close callback: drops one user mapping from the socket's 'mapped'
 * counter.  Does nothing when the socket behind the file has no sock.
 */
static void packet_mm_close(struct vm_area_struct *vma)
{
        struct socket *sock = vma->vm_file->private_data;
        struct sock *sk = sock->sk;

        if (!sk)
                return;

        atomic_long_dec(&pkt_sk(sk)->mapped);
}

/*
 * VMA operations installed by packet_mmap(); the open/close callbacks keep
 * the per-socket count of user mappings up to date.
 */
static const struct vm_operations_struct packet_mmap_ops = {
        .open        =        packet_mm_open,
        .close        =        packet_mm_close,
};

/*
 * Release every block buffer of a ring page vector, then the vector itself.
 *
 * @pg_vec: vector of @len entries; entries with a NULL buffer are skipped
 * @order:  page order handed to free_pages() for non-vmalloc buffers
 * @len:    number of entries in @pg_vec
 *
 * Buffers may come from either the page allocator or vmalloc (see
 * alloc_one_pg_vec_page()), so each one is freed with the matching call.
 */
static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
                        unsigned int len)
{
        /* Unsigned to match @len and avoid a signed/unsigned comparison. */
        unsigned int i;

        for (i = 0; i < len; i++) {
                if (likely(pg_vec[i].buffer)) {
                        if (is_vmalloc_addr(pg_vec[i].buffer))
                                vfree(pg_vec[i].buffer);
                        else
                                free_pages((unsigned long)pg_vec[i].buffer,
                                           order);
                        pg_vec[i].buffer = NULL;
                }
        }
        kfree(pg_vec);
}

/*
 * Allocate one zeroed ring block of (1 << order) pages.
 *
 * Three attempts are made, in order:
 *   1. the page allocator with __GFP_NORETRY,
 *   2. vzalloc(),
 *   3. the page allocator again without __GFP_NORETRY.
 *
 * Returns the buffer, or NULL when all three attempts fail.
 */
static char *alloc_one_pg_vec_page(unsigned long order)
{
        const gfp_t base_flags = GFP_KERNEL | __GFP_COMP |
                                 __GFP_ZERO | __GFP_NOWARN;
        char *buf;

        buf = (char *) __get_free_pages(base_flags | __GFP_NORETRY, order);

        /* Page allocator failed without retrying: fall back to vmalloc. */
        if (!buf)
                buf = vzalloc(array_size((1 << order), PAGE_SIZE));

        /* vmalloc failed too: let the page allocator retry this time. */
        if (!buf)
                buf = (char *) __get_free_pages(base_flags, order);

        return buf;
}

/*
 * Allocate a page vector with one block buffer per requested ring block.
 *
 * @req:   ring request; only tp_block_nr is read here
 * @order: page order of each block, passed to alloc_one_pg_vec_page()
 *
 * Returns the zero-initialized vector with every buffer populated, or NULL
 * when the vector or any block cannot be allocated.  On a partial failure
 * the blocks already allocated are released again.
 */
static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
        unsigned int block_nr = req->tp_block_nr;
        struct pgv *pg_vec;
        /* Unsigned to match block_nr and avoid a signed/unsigned comparison. */
        unsigned int i;

        pg_vec = kcalloc(block_nr, sizeof(*pg_vec), GFP_KERNEL | __GFP_NOWARN);
        if (unlikely(!pg_vec))
                return NULL;

        for (i = 0; i < block_nr; i++) {
                pg_vec[i].buffer = alloc_one_pg_vec_page(order);
                if (unlikely(!pg_vec[i].buffer)) {
                        /* Unfilled entries are NULL and skipped by free_pg_vec(). */
                        free_pg_vec(pg_vec, order, block_nr);
                        return NULL;
                }
        }

        return pg_vec;
}

/*
 * Create, replace or tear down the RX or TX ring of a packet socket.
 *
 * @sk:      the packet socket
 * @req_u:   ring request; the 'req' view supplies the block/frame geometry
 *           and the 'req3' view supplies the extra TPACKET_V3 fields
 * @closing: when non-zero, the "ring is mapped" and "reads are pending"
 *           busy checks are skipped
 * @tx_ring: non-zero selects the TX ring and the write queue, zero selects
 *           the RX ring and the receive queue
 *
 * A request with tp_block_nr != 0 validates the geometry and allocates a
 * new ring; tp_block_nr == 0 asks for teardown and requires tp_frame_nr to
 * be 0 as well.  The socket's protocol hook is unregistered while the ring
 * is swapped and registered again afterwards if it was running.
 *
 * Note that @req_u is modified: req->tp_block_nr is swapped with the ring's
 * previous block count.
 *
 * Returns 0 on success, -EBUSY when the ring is mapped, has pending reads
 * or already exists, -EINVAL for invalid geometry, and -ENOMEM when an
 * allocation fails.
 */
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                int closing, int tx_ring)
{
        struct pgv *pg_vec = NULL;
        struct packet_sock *po = pkt_sk(sk);
        unsigned long *rx_owner_map = NULL;
        int was_running, order = 0;
        struct packet_ring_buffer *rb;
        struct sk_buff_head *rb_queue;
        __be16 num;
        int err;
        /* Common view of the request, shared by all TPACKET versions. */
        struct tpacket_req *req = &req_u->req;

        rb = tx_ring ? &po->tx_ring : &po->rx_ring;
        rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

        /* Refuse to touch a ring that is mapped or has reads in flight. */
        err = -EBUSY;
        if (!closing) {
                if (atomic_long_read(&po->mapped))
                        goto out;
                if (packet_read_pending(rb))
                        goto out;
        }

        if (req->tp_block_nr) {
                unsigned int min_frame_size;

                /* Sanity tests and some calculations */
                err = -EBUSY;
                /* A ring must be torn down before a new one can be set. */
                if (unlikely(rb->pg_vec))
                        goto out;

                switch (po->tp_version) {
                case TPACKET_V1:
                        po->tp_hdrlen = TPACKET_HDRLEN;
                        break;
                case TPACKET_V2:
                        po->tp_hdrlen = TPACKET2_HDRLEN;
                        break;
                case TPACKET_V3:
                        po->tp_hdrlen = TPACKET3_HDRLEN;
                        break;
                }

                err = -EINVAL;
                /* Block size must be positive as an int and page aligned. */
                if (unlikely((int)req->tp_block_size <= 0))
                        goto out;
                if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
                        goto out;
                min_frame_size = po->tp_hdrlen + po->tp_reserve;
                /*
                 * For V3, a block must hold BLK_PLUS_PRIV() of the private
                 * area size plus one minimal frame; the u64 cast keeps the
                 * sum from wrapping.
                 */
                if (po->tp_version >= TPACKET_V3 &&
                    req->tp_block_size <
                    BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
                        goto out;
                if (unlikely(req->tp_frame_size < min_frame_size))
                        goto out;
                if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
                        goto out;

                rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
                if (unlikely(rb->frames_per_block == 0))
                        goto out;
                /* Guard the multiplication below against overflow. */
                if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
                        goto out;
                /* The caller's frame count must match the block geometry. */
                if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
                                        req->tp_frame_nr))
                        goto out;

                err = -ENOMEM;
                order = get_order(req->tp_block_size);
                pg_vec = alloc_pg_vec(req, order);
                if (unlikely(!pg_vec))
                        goto out;
                switch (po->tp_version) {
                case TPACKET_V3:
                        /* Block transmit is not supported yet */
                        if (!tx_ring) {
                                init_prb_bdqc(po, rb, pg_vec, req_u);
                        } else {
                                struct tpacket_req3 *req3 = &req_u->req3;

                                /* The V3-only fields must be zero for a TX ring. */
                                if (req3->tp_retire_blk_tov ||
                                    req3->tp_sizeof_priv ||
                                    req3->tp_feature_req_word) {
                                        err = -EINVAL;
                                        goto out_free_pg_vec;
                                }
                        }
                        break;
                default:
                        /* V1/V2 RX rings get a zeroed bitmap, one bit per frame. */
                        if (!tx_ring) {
                                rx_owner_map = bitmap_alloc(req->tp_frame_nr,
                                        GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
                                if (!rx_owner_map)
                                        goto out_free_pg_vec;
                        }
                        break;
                }
        }
        /* Teardown request: no blocks, so there must be no frames either. */
        else {
                err = -EINVAL;
                if (unlikely(req->tp_frame_nr))
                        goto out;
        }


        /* Detach socket from network */
        spin_lock(&po->bind_lock);
        was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING);
        num = po->num;
        if (was_running) {
                WRITE_ONCE(po->num, 0);
                __unregister_prot_hook(sk, false);
        }
        spin_unlock(&po->bind_lock);

        synchronize_net();

        err = -EBUSY;
        mutex_lock(&po->pg_vec_lock);
        if (closing || atomic_long_read(&po->mapped) == 0) {
                err = 0;
                /*
                 * Install the new ring.  After the swaps, pg_vec,
                 * rx_owner_map, order and req->tp_block_nr hold the OLD
                 * ring's values, which are released at out_free_pg_vec.
                 */
                spin_lock_bh(&rb_queue->lock);
                swap(rb->pg_vec, pg_vec);
                if (po->tp_version <= TPACKET_V2)
                        swap(rb->rx_owner_map, rx_owner_map);
                rb->frame_max = (req->tp_frame_nr - 1);
                rb->head = 0;
                rb->frame_size = req->tp_frame_size;
                spin_unlock_bh(&rb_queue->lock);

                swap(rb->pg_vec_order, order);
                swap(rb->pg_vec_len, req->tp_block_nr);

                rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
                /* Receive into the ring when an RX ring exists, else into skbs. */
                po->prot_hook.func = (po->rx_ring.pg_vec) ?
                                                tpacket_rcv : packet_rcv;
                skb_queue_purge(rb_queue);
                if (atomic_long_read(&po->mapped))
                        pr_err("packet_mmap: vma is busy: %ld\n",
                               atomic_long_read(&po->mapped));
        }
        mutex_unlock(&po->pg_vec_lock);

        /* Re-attach the socket if it was running before the swap. */
        spin_lock(&po->bind_lock);
        if (was_running) {
                WRITE_ONCE(po->num, num);
                register_prot_hook(sk);
        }
        spin_unlock(&po->bind_lock);
        if (pg_vec && (po->tp_version > TPACKET_V2)) {
                /* Because we don't support block-based V3 on tx-ring */
                if (!tx_ring)
                        prb_shutdown_retire_blk_timer(po, rb_queue);
        }

out_free_pg_vec:
        /*
         * Frees either the old ring (after a successful swap) or the new,
         * never-installed ring (on failure).
         */
        if (pg_vec) {
                bitmap_free(rx_owner_map);
                free_pg_vec(pg_vec, order, req->tp_block_nr);
        }
out:
        return err;
}

/*
 * mmap() handler: maps the socket's rings into user space, RX ring first,
 * TX ring directly after it.
 *
 * The mapping must start at page offset 0 and its length must equal the
 * combined size of all configured rings.  Pages are inserted one at a time
 * with vm_insert_page().  On success the 'mapped' counter is incremented
 * and packet_mmap_ops is installed on the VMA.
 *
 * Returns 0 on success, -EINVAL for a non-zero offset, a size mismatch or
 * when no ring is configured, or the error from vm_insert_page().
 */
static int packet_mmap(struct file *file, struct socket *sock,
                struct vm_area_struct *vma)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        struct packet_ring_buffer *ring;
        unsigned long rings_size = 0;
        unsigned long uaddr;
        int ret = -EINVAL;
        int blk;

        if (vma->vm_pgoff)
                return -EINVAL;

        mutex_lock(&po->pg_vec_lock);

        /* Total number of bytes covered by the configured rings. */
        for (ring = &po->rx_ring; ring <= &po->tx_ring; ring++) {
                if (!ring->pg_vec)
                        continue;
                rings_size += ring->pg_vec_len
                                        * ring->pg_vec_pages
                                        * PAGE_SIZE;
        }

        /* Nothing to map, or the VMA does not match the rings exactly. */
        if (rings_size == 0 || vma->vm_end - vma->vm_start != rings_size)
                goto unlock;

        uaddr = vma->vm_start;
        for (ring = &po->rx_ring; ring <= &po->tx_ring; ring++) {
                if (!ring->pg_vec)
                        continue;

                for (blk = 0; blk < ring->pg_vec_len; blk++) {
                        void *kaddr = ring->pg_vec[blk].buffer;
                        int pg;

                        for (pg = 0; pg < ring->pg_vec_pages; pg++) {
                                ret = vm_insert_page(vma, uaddr,
                                                     pgv_to_page(kaddr));
                                if (unlikely(ret))
                                        goto unlock;
                                uaddr += PAGE_SIZE;
                                kaddr += PAGE_SIZE;
                        }
                }
        }

        atomic_long_inc(&po->mapped);
        vma->vm_ops = &packet_mmap_ops;
        ret = 0;

unlock:
        mutex_unlock(&po->pg_vec_lock);
        return ret;
}

/*
 * Socket operations for the "spkt" flavour of packet sockets.
 *
 * Compared with packet_ops, this table uses the *_spkt bind/getname/sendmsg
 * variants, the generic datagram_poll(), has no setsockopt/getsockopt
 * entries and does not support mmap.
 * NOTE(review): presumably used for legacy SOCK_PACKET sockets; confirm
 * against packet_create().
 */
static const struct proto_ops packet_ops_spkt = {
        .family =        PF_PACKET,
        .owner =        THIS_MODULE,
        .release =        packet_release,
        .bind =                packet_bind_spkt,
        .connect =        sock_no_connect,
        .socketpair =        sock_no_socketpair,
        .accept =        sock_no_accept,
        .getname =        packet_getname_spkt,
        .poll =                datagram_poll,
        .ioctl =        packet_ioctl,
        .gettstamp =        sock_gettstamp,
        .listen =        sock_no_listen,
        .shutdown =        sock_no_shutdown,
        .sendmsg =        packet_sendmsg_spkt,
        .recvmsg =        packet_recvmsg,
        .mmap =                sock_no_mmap,
};

/*
 * Socket operations for regular packet sockets: ring-aware poll
 * (packet_poll), socket options, and mmap of the RX/TX rings.
 * Connection-oriented operations are wired to the sock_no_*() stubs.
 */
static const struct proto_ops packet_ops = {
        .family =        PF_PACKET,
        .owner =        THIS_MODULE,
        .release =        packet_release,
        .bind =                packet_bind,
        .connect =        sock_no_connect,
        .socketpair =        sock_no_socketpair,
        .accept =        sock_no_accept,
        .getname =        packet_getname,
        .poll =                packet_poll,
        .ioctl =        packet_ioctl,
        .gettstamp =        sock_gettstamp,
        .listen =        sock_no_listen,
        .shutdown =        sock_no_shutdown,
        .setsockopt =        packet_setsockopt,
        .getsockopt =        packet_getsockopt,
        .sendmsg =        packet_sendmsg,
        .recvmsg =        packet_recvmsg,
        .mmap =                packet_mmap,
};

/* PF_PACKET family descriptor passed to sock_register() in packet_init(). */
static const struct net_proto_family packet_family_ops = {
        .family =        PF_PACKET,
        .create =        packet_create,
        .owner        =        THIS_MODULE,
};

/* Routes netdevice events to packet_notifier(); registered in packet_init(). */
static struct notifier_block packet_netdev_notifier = {
        .notifier_call =        packet_notifier,
};

#ifdef CONFIG_PROC_FS

/*
 * seq_file start callback: takes the RCU read lock (released in
 * packet_seq_stop()) and returns the element at position *pos of the
 * namespace's packet socket list, as computed by
 * seq_hlist_start_head_rcu().
 */
static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(RCU)
{
        struct hlist_head *sockets = &seq_file_net(seq)->packet.sklist;

        rcu_read_lock();
        return seq_hlist_start_head_rcu(sockets, *pos);
}

/*
 * seq_file next callback: advances from @v to the following entry of the
 * namespace's packet socket list via seq_hlist_next_rcu().
 */
static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_hlist_next_rcu(v, &seq_file_net(seq)->packet.sklist, pos);
}

/* seq_file stop callback: drops the RCU read lock taken in packet_seq_start(). */
static void packet_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}

/*
 * seq_file show callback for the "packet" proc file.
 *
 * Prints the column header for SEQ_START_TOKEN, otherwise one line for the
 * socket behind @v: pointer, refcount, type, protocol (host order, hex),
 * ifindex, running flag, receive memory, owning uid and inode number.
 * The width of the first header column depends on CONFIG_64BIT.
 * Always returns 0.
 */
static int packet_seq_show(struct seq_file *seq, void *v)
{
        const struct packet_sock *po;
        struct sock *s;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq,
                           "%*sRefCnt Type Proto  Iface R Rmem   User   Inode\n",
                           IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk");
                return 0;
        }

        s = sk_entry(v);
        po = pkt_sk(s);

        seq_printf(seq,
                   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
                   s,
                   refcount_read(&s->sk_refcnt),
                   s->sk_type,
                   ntohs(READ_ONCE(po->num)),
                   READ_ONCE(po->ifindex),
                   packet_sock_flag(po, PACKET_SOCK_RUNNING),
                   atomic_read(&s->sk_rmem_alloc),
                   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
                   sock_i_ino(s));

        return 0;
}

/* Iterator callbacks for the per-namespace "packet" proc file. */
static const struct seq_operations packet_seq_ops = {
        .start        = packet_seq_start,
        .next        = packet_seq_next,
        .stop        = packet_seq_stop,
        .show        = packet_seq_show,
};
#endif

/*
 * Per-namespace setup: initializes the packet socket list and its mutex
 * and, with CONFIG_PROC_FS, creates the "packet" proc entry.
 *
 * Returns 0 on success or -ENOMEM when the proc entry cannot be created.
 */
static int __net_init packet_net_init(struct net *net)
{
        int ret = 0;

        mutex_init(&net->packet.sklist_lock);
        INIT_HLIST_HEAD(&net->packet.sklist);

#ifdef CONFIG_PROC_FS
        if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
                        sizeof(struct seq_net_private)))
                ret = -ENOMEM;
#endif /* CONFIG_PROC_FS */

        return ret;
}

/*
 * Per-namespace teardown: removes the "packet" proc entry and warns once
 * if packet sockets are still on the namespace's list.
 */
static void __net_exit packet_net_exit(struct net *net)
{
        remove_proc_entry("packet", net->proc_net);
        WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
}

/* Per-network-namespace init/exit hooks, registered in packet_init(). */
static struct pernet_operations packet_net_ops = {
        .init = packet_net_init,
        .exit = packet_net_exit,
};


/*
 * Module exit: undoes packet_init() in reverse order of registration
 * (socket family, proto, netdevice notifier, pernet subsystem).
 */
static void __exit packet_exit(void)
{
        sock_unregister(PF_PACKET);
        proto_unregister(&packet_proto);
        unregister_netdevice_notifier(&packet_netdev_notifier);
        unregister_pernet_subsys(&packet_net_ops);
}

/*
 * Module init: registers, in order, the pernet subsystem, the netdevice
 * notifier, the proto and the PF_PACKET socket family.
 *
 * On failure, everything registered so far is unregistered in reverse
 * order and the failing call's error code is returned.  Returns 0 on
 * success.
 */
static int __init packet_init(void)
{
        int err;

        err = register_pernet_subsys(&packet_net_ops);
        if (err)
                return err;

        err = register_netdevice_notifier(&packet_netdev_notifier);
        if (err)
                goto undo_pernet;

        err = proto_register(&packet_proto, 0);
        if (err)
                goto undo_notifier;

        err = sock_register(&packet_family_ops);
        if (err)
                goto undo_proto;

        return 0;

undo_proto:
        proto_unregister(&packet_proto);
undo_notifier:
        unregister_netdevice_notifier(&packet_netdev_notifier);
undo_pernet:
        unregister_pernet_subsys(&packet_net_ops);
        return err;
}

/* Module entry/exit points and metadata. */
module_init(packet_init);
module_exit(packet_exit);
MODULE_DESCRIPTION("Packet socket support (AF_PACKET)");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);















































    1 







    1 










    1 
















    1 


































    1 










    1 
    1 
    1 






    1 

    1 





    1 



    1 








    1 










    1 











































    1 













    1 






    1 





























    1 








    1 



























































































































































































    1 


    1 

    1 







    1 


    1 

















































    1 






































    1 






    1 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2003
 * Copyright (c) Cisco 1999,2000
 * Copyright (c) Motorola 1999,2000,2001
 * Copyright (c) La Monte H.P. Yarroll 2001
 *
 * This file is part of the SCTP kernel implementation.
 *
 * A collection class to handle the storage of transport addresses.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson          <karl@athena.chicago.il.us>
 *    Jon Grimm             <jgrimm@us.ibm.com>
 *    Daisy Chang           <daisyc@us.ibm.com>
 */

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/in.h>
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/if_inet6.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

/* Forward declarations for internal helpers. */
static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest,
                              union sctp_addr *addr, enum sctp_scope scope,
                              gfp_t gfp, int flags);
static void sctp_bind_addr_clean(struct sctp_bind_addr *);

/* First Level Abstractions. */

/* Copy the addresses of 'src' into 'dest', filtered by 'scope'.
 *
 * Each source address is offered to sctp_copy_one_addr() with the requested
 * scope.  If that leaves 'dest' empty and the requested scope is global,
 * a second pass is made with link scope, on the assumption that we are
 * sitting behind a NAT.
 *
 * Returns 0 on success, the error from sctp_copy_one_addr() if it fails,
 * or -ENETUNREACH when no address could be copied.  On any error, the
 * address list of 'dest' is emptied again.
 */
int sctp_bind_addr_copy(struct net *net, struct sctp_bind_addr *dest,
                        const struct sctp_bind_addr *src,
                        enum sctp_scope scope, gfp_t gfp,
                        int flags)
{
        struct sctp_sockaddr_entry *entry;
        int err = 0;

        /* Every address in a bind list uses the same port. */
        dest->port = src->port;

        /* First pass: addresses usable within the requested scope. */
        list_for_each_entry(entry, &src->address_list, list) {
                err = sctp_copy_one_addr(net, dest, &entry->a, scope,
                                         gfp, flags);
                if (err < 0)
                        goto cleanup;
        }

        /* Second pass: nothing matched a global scope, retry with link scope. */
        if (scope == SCTP_SCOPE_GLOBAL && list_empty(&dest->address_list)) {
                list_for_each_entry(entry, &src->address_list, list) {
                        err = sctp_copy_one_addr(net, dest, &entry->a,
                                                 SCTP_SCOPE_LINK, gfp,
                                                 flags);
                        if (err < 0)
                                goto cleanup;
                }
        }

        /* Still no usable address for this scope: report it as an error. */
        if (list_empty(&dest->address_list))
                err = -ENETUNREACH;

cleanup:
        if (err)
                sctp_bind_addr_clean(dest);

        return err;
}

/* Exactly duplicate the address list of 'src' into 'dest'.  This is
 * necessary when doing peel-offs and accepts.  We don't want to put all
 * the current system addresses into the endpoint; that's useless.  But we
 * do want to duplicate the list of bound addresses that the older endpoint
 * used.
 *
 * Returns the result of the last sctp_add_bind_addr() call (0 when 'src'
 * is empty); stops at the first negative result.  Addresses added before a
 * failure are left in 'dest'.
 */
int sctp_bind_addr_dup(struct sctp_bind_addr *dest,
                        const struct sctp_bind_addr *src,
                        gfp_t gfp)
{
        struct sctp_sockaddr_entry *entry;
        int err = 0;

        /* Every address in a bind list uses the same port. */
        dest->port = src->port;

        list_for_each_entry(entry, &src->address_list, list) {
                err = sctp_add_bind_addr(dest, &entry->a, sizeof(entry->a),
                                         1, gfp);
                if (err < 0)
                        return err;
        }

        return err;
}

/* Prepare an SCTP_bind_addr structure for use by either an endpoint or
 * an association: record the port and start with an empty address list.
 */
void sctp_bind_addr_init(struct sctp_bind_addr *bp, __u16 port)
{
        bp->port = port;
        INIT_LIST_HEAD(&bp->address_list);
}

/* Unlink and release every entry on the bind address list of 'bp'. */
static void sctp_bind_addr_clean(struct sctp_bind_addr *bp)
{
        struct sctp_sockaddr_entry *addr, *next;

        /* Entries are unlinked while walking, hence the _safe iterator;
         * the memory itself is handed to kfree_rcu() rather than freed
         * on the spot.
         */
        list_for_each_entry_safe(addr, next, &bp->address_list, list) {
                list_del_rcu(&addr->list);
                kfree_rcu(addr, rcu);
                SCTP_DBG_OBJCNT_DEC(addr);
        }
}

/* Dispose of the contents of an SCTP_bind_addr structure.
 *
 * Only the entries on the address list are released, by delegating to
 * sctp_bind_addr_clean(); 'bp' itself is not freed here.
 */
void sctp_bind_addr_free(struct sctp_bind_addr *bp)
{
        /* Empty the bind address list. */
        sctp_bind_addr_clean(bp);
}

/* Allocate a new entry holding a copy of 'new' and append it to the bind
 * address list of 'bp'.
 *
 * At most sizeof(union sctp_addr) bytes are copied from 'new', even when
 * 'new_size' is larger.  A zero port in the copied address is replaced by
 * bp->port.  'addr_state' becomes the state of the new entry.
 *
 * Returns 0 on success or -ENOMEM when the entry cannot be allocated.
 */
int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
                       int new_size, __u8 addr_state, gfp_t gfp)
{
        struct sctp_sockaddr_entry *addr;
        size_t copy_len = min_t(size_t, sizeof(*new), new_size);

        addr = kzalloc(sizeof(*addr), gfp);
        if (!addr)
                return -ENOMEM;

        memcpy(&addr->a, new, copy_len);

        /* The port sits at the same offset for v4 and v6, so the v4 view
         * is enough to fill in a port that has not been set yet.
         */
        if (!addr->a.v4.sin_port)
                addr->a.v4.sin_port = htons(bp->port);

        addr->valid = 1;
        addr->state = addr_state;

        INIT_LIST_HEAD(&addr->list);

        /* Callers always hold a socket lock here, which serves as the
         * writer-side synchronization for the RCU list.
         */
        list_add_tail_rcu(&addr->list, &bp->address_list);
        SCTP_DBG_OBJCNT_INC(addr);

        return 0;
}

/* Remove the entry that exactly matches 'del_addr' from the bind address
 * list of 'bp'.
 *
 * Returns 0 when an entry was removed, -EINVAL when no entry matched.
 */
int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr)
{
        struct sctp_sockaddr_entry *addr, *next;

        /* The socket lock is held by callers and acts as the writer
         * synchronizing lock for the list.
         */
        list_for_each_entry_safe(addr, next, &bp->address_list, list) {
                if (!sctp_cmp_addr_exact(&addr->a, del_addr))
                        continue;

                /* Exact match: mark it invalid, unlink it and hand the
                 * memory to kfree_rcu().
                 */
                addr->valid = 0;
                list_del_rcu(&addr->list);
                kfree_rcu(addr, rcu);
                SCTP_DBG_OBJCNT_DEC(addr);
                return 0;
        }

        return -EINVAL;
}

/* Build a representation of all the addresses in 'bp', formatted as SCTP
 * address parameters and packed back to back in one buffer.
 *
 * The buffer is obtained with kmalloc(); the returned union carries it in
 * its .v member.  .v is NULL when the list holds exactly one address (no
 * address is embedded in that case) or when the allocation fails.
 *
 * '*addrs_len' receives the number of bytes written into the buffer, which
 * is 0 whenever NULL is returned.
 */
union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp,
                                         int *addrs_len,
                                         gfp_t gfp)
{
        struct sctp_sockaddr_entry *entry;
        union sctp_addr_param rawaddr;
        union sctp_params cursor;
        union sctp_params retval;
        struct list_head *pos;
        struct sctp_af *af;
        int written = 0;
        int len = 0;

        /* Size the buffer for the largest parameter form per address so
         * that a single allocation is enough.
         */
        list_for_each(pos, &bp->address_list)
                len += sizeof(union sctp_addr_param);

        /* A lone address is not worth embedding. */
        if (len == sizeof(union sctp_addr_param))
                retval.v = NULL;
        else
                retval.v = kmalloc(len, gfp);

        if (retval.v) {
                cursor = retval;

                list_for_each_entry(entry, &bp->address_list, list) {
                        af = sctp_get_af_specific(entry->a.v4.sin_family);
                        len = af->to_addr_param(&entry->a, &rawaddr);
                        memcpy(cursor.v, &rawaddr, len);
                        cursor.v += len;
                        written += len;
                }
        }

        *addrs_len = written;
        return retval;
}

/*
 * Create an address list out of the raw address list format (IPv4 and IPv6
 * address parameters).
 *
 * 'raw_addr_list' points at 'addrs_len' bytes of back-to-back address
 * parameters.  Each one is converted with the address family's
 * from_addr_param() using 'port', and added to 'bp' with state
 * SCTP_ADDR_SRC unless sctp_bind_addr_state() already finds it there.
 *
 * Returns 0 on success.  On failure the whole address list of 'bp' is
 * emptied and a negative error is returned: -EINVAL for a malformed or
 * unsupported parameter, or the error from sctp_add_bind_addr().
 */
int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
                           int addrs_len, __u16 port, gfp_t gfp)
{
        union sctp_addr_param *rawaddr;
        struct sctp_paramhdr *param;
        union sctp_addr addr;
        int retval = 0;
        int len;
        struct sctp_af *af;

        /* Convert the raw address to standard address format */
        while (addrs_len > 0) {
                /* A parameter header must fit in what is left before it
                 * may be read.
                 */
                if (addrs_len < (int)sizeof(*param)) {
                        retval = -EINVAL;
                        goto out_err;
                }

                param = (struct sctp_paramhdr *)raw_addr_list;
                rawaddr = (union sctp_addr_param *)raw_addr_list;

                /* The advertised length has to cover at least the header
                 * (otherwise the loop could not advance) and must not run
                 * past the end of the buffer (otherwise addrs_len would go
                 * negative and the walk would continue out of bounds).
                 */
                len = ntohs(param->length);
                if (len < (int)sizeof(*param) || len > addrs_len) {
                        retval = -EINVAL;
                        goto out_err;
                }

                af = sctp_get_af_specific(param_type2af(param->type));
                if (unlikely(!af) ||
                    !af->from_addr_param(&addr, rawaddr, htons(port), 0)) {
                        retval = -EINVAL;
                        goto out_err;
                }

                /* Skip addresses that are already on the list. */
                if (sctp_bind_addr_state(bp, &addr) == -1) {
                        retval = sctp_add_bind_addr(bp, &addr, sizeof(addr),
                                                    SCTP_ADDR_SRC, gfp);
                        if (retval)
                                /* Can't finish building the list, clean up. */
                                goto out_err;
                }

                addrs_len -= len;
                raw_addr_list += len;
        }

        return 0;

out_err:
        sctp_bind_addr_clean(bp);

        return retval;
}

/********************************************************************
 * 2nd Level Abstractions
 ********************************************************************/

/* Report whether 'bp' holds a valid entry that matches 'addr' according
 * to the cmp_addr() of the protocol family of 'opt' (which allows
 * wildcarding).  Returns 1 on a match, 0 otherwise.
 */
int sctp_bind_addr_match(struct sctp_bind_addr *bp,
                         const union sctp_addr *addr,
                         struct sctp_sock *opt)
{
        struct sctp_sockaddr_entry *entry;
        int found = 0;

        rcu_read_lock();
        list_for_each_entry_rcu(entry, &bp->address_list, list) {
                /* Entries flagged invalid are never compared. */
                if (entry->valid &&
                    opt->pf->cmp_addr(&entry->a, addr, opt)) {
                        found = 1;
                        break;
                }
        }
        rcu_read_unlock();

        return found;
}

/* Compare the bind address list of 'sp' against the one of 'sp2'.
 *
 * The addresses of 'sp' are walked in order; each must have a valid
 * counterpart in 'sp2' (both entries valid, equal per the address
 * family's cmp_addr()).  The walk stops at the first address without a
 * counterpart, and the count of matched addresses is reset to 0.
 *
 * Returns 0 when the resulting count equals 'cnt2', -EEXIST when it does
 * not but at least one address matched, and 1 otherwise.
 */
int sctp_bind_addrs_check(struct sctp_sock *sp,
                          struct sctp_sock *sp2, int cnt2)
{
        struct sctp_bind_addr *bp2 = &sp2->ep->base.bind_addr;
        struct sctp_bind_addr *bp = &sp->ep->base.bind_addr;
        struct sctp_sockaddr_entry *laddr, *laddr2;
        bool exist = false;
        int cnt = 0;

        rcu_read_lock();
        list_for_each_entry_rcu(laddr, &bp->address_list, list) {
                bool matched = false;

                list_for_each_entry_rcu(laddr2, &bp2->address_list, list) {
                        if (sp->pf->af->cmp_addr(&laddr->a, &laddr2->a) &&
                            laddr->valid && laddr2->valid) {
                                matched = true;
                                break;
                        }
                }

                if (!matched) {
                        /* One unmatched address voids the whole count. */
                        cnt = 0;
                        break;
                }

                exist = true;
                cnt++;
        }
        rcu_read_unlock();

        if (cnt == cnt2)
                return 0;

        return exist ? -EEXIST : 1;
}

/* Check whether 'addr' conflicts with any valid address in 'bp'.
 *
 * Returns the non-zero result of the first cmp_addr() call that reports
 * a conflict, or 0 when there is none.
 */
int sctp_bind_addr_conflict(struct sctp_bind_addr *bp,
                            const union sctp_addr *addr,
                            struct sctp_sock *bp_sp,
                            struct sctp_sock *addr_sp)
{
        struct sctp_sockaddr_entry *entry;
        struct sctp_sock *sp = bp_sp;
        int conflict = 0;

        /* Compare on behalf of an IPv6 socket when there is one, since it
         * is usually a superset of IPv4.  bp_sp is preferred; addr_sp is
         * used only when it is IPv6 and bp_sp is not.
         */
        if (sctp_opt2sk(bp_sp)->sk_family != AF_INET6 &&
            sctp_opt2sk(addr_sp)->sk_family == AF_INET6)
                sp = addr_sp;

        rcu_read_lock();
        list_for_each_entry_rcu(entry, &bp->address_list, list) {
                if (entry->valid) {
                        conflict = sp->pf->cmp_addr(&entry->a, addr, sp);
                        if (conflict)
                                break;
                }
        }
        rcu_read_unlock();

        return conflict;
}

/* Look up 'addr' in the bind address list of 'bp' and return the state
 * of the first valid entry that the address family's cmp_addr() reports
 * as equal.  Returns -1 when the address family is not supported or no
 * entry matches.
 *
 * NOTE(review): the list is walked with list_for_each_entry_rcu() but no
 * rcu_read_lock() is taken here; presumably callers provide the
 * protection - confirm.
 */
int sctp_bind_addr_state(const struct sctp_bind_addr *bp,
                         const union sctp_addr *addr)
{
        struct sctp_af *af = sctp_get_af_specific(addr->sa.sa_family);
        struct sctp_sockaddr_entry *entry;

        if (unlikely(!af))
                return -1;

        list_for_each_entry_rcu(entry, &bp->address_list, list) {
                if (entry->valid && af->cmp_addr(&entry->a, addr))
                        return entry->state;
        }

        return -1;
}

/* Return the first address on the bind address list of 'bp' that matches
 * none of the 'addrcnt' addresses packed in 'addrs', or NULL when every
 * bound address is accounted for.
 *
 * 'addrs' is a packed array: each element occupies the sockaddr_len of
 * its own address family.
 */
union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr        *bp,
                                        const union sctp_addr        *addrs,
                                        int                        addrcnt,
                                        struct sctp_sock        *opt)
{
        struct sctp_sockaddr_entry        *entry;
        union sctp_addr                        *candidate;
        void                                 *cursor;
        struct sctp_af                        *af;
        int                                idx;

        /* The only caller is sctp_send_asconf_del_ip(), which holds the
         * socket lock on that code path, so the address list cannot
         * change while it is walked.
         */
        list_for_each_entry(entry, &bp->address_list, list) {
                cursor = (void *)addrs;
                for (idx = 0; idx < addrcnt; idx++) {
                        candidate = cursor;
                        af = sctp_get_af_specific(candidate->v4.sin_family);

                        /* Stop at an unknown family or at a match; either
                         * way this bound address is not reported.
                         */
                        if (!af ||
                            opt->pf->cmp_addr(&entry->a, candidate, opt))
                                break;

                        cursor += af->sockaddr_len;
                }

                /* Ran off the end of 'addrs' without a match. */
                if (idx == addrcnt)
                        return &entry->a;
        }

        return NULL;
}

/* Copy one source address into 'dest'.
 *
 * A wildcard address expands to the local address list through
 * sctp_copy_local_addr_list().  Any other address is added only when it
 * is in 'scope' and 'flags' say its family is both allowed locally and
 * supported by the peer.
 *
 * Returns 0 when nothing had to be added, otherwise the result of the
 * copy/add helper that was called.
 */
static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest,
                              union sctp_addr *addr, enum sctp_scope scope,
                              gfp_t gfp, int flags)
{
        bool usable;

        if (sctp_is_any(NULL, addr))
                return sctp_copy_local_addr_list(net, dest, scope, gfp, flags);

        if (!sctp_in_scope(net, addr, scope))
                return 0;

        /* The address is in scope; now make sure its type is supported by
         * the local sock as well as by the remote peer.
         */
        switch (addr->sa.sa_family) {
        case AF_INET:
                usable = (flags & SCTP_ADDR4_ALLOWED) &&
                         (flags & SCTP_ADDR4_PEERSUPP);
                break;
        case AF_INET6:
                usable = (flags & SCTP_ADDR6_ALLOWED) &&
                         (flags & SCTP_ADDR6_PEERSUPP);
                break;
        default:
                usable = false;
                break;
        }

        if (!usable)
                return 0;

        return sctp_add_bind_addr(dest, addr, sizeof(*addr),
                                  SCTP_ADDR_SRC, gfp);
}

/* Is 'addr' a wildcard address?
 *
 * The address family is taken from 'addr' itself unless it is AF_UNSPEC,
 * in which case the family of 'sk' is used when a socket is given.
 * Returns 0 when no address-family operations exist for that family,
 * otherwise whatever the family's is_any() reports.
 */
int sctp_is_any(struct sock *sk, const union sctp_addr *addr)
{
        unsigned short fam;
        struct sctp_af *af;

        if (addr->sa.sa_family != AF_UNSPEC)
                fam = addr->sa.sa_family;
        else
                fam = sk ? sk->sk_family : 0;

        af = sctp_get_af_specific(fam);

        return af ? af->is_any(addr) : 0;
}

/* Is 'addr' valid for 'scope'?  Returns 1 when it is, 0 when it is not.
 *
 * For the INIT and INIT-ACK address list, let L be the level of the
 * requested destination address; sender and receiver SHOULD include all
 * of their addresses with level greater than or equal to L.
 *
 * How strictly this is applied depends on net->sctp.scope_policy, which
 * per the original note is controlled via a sysctl option.
 */
int sctp_in_scope(struct net *net, const union sctp_addr *addr,
                  enum sctp_scope scope)
{
        enum sctp_scope addr_scope = sctp_scope(addr);

        /* Unusable addresses do not belong to any defined scope. */
        if (addr_scope == SCTP_SCOPE_UNUSABLE)
                return 0;

        switch (net->sctp.scope_policy) {
        case SCTP_SCOPE_POLICY_DISABLE:
                /* Scoping is off: every usable address qualifies. */
                return 1;
        case SCTP_SCOPE_POLICY_ENABLE:
                return addr_scope <= scope;
        case SCTP_SCOPE_POLICY_PRIVATE:
                /* As ENABLE, but private addresses always qualify. */
                return addr_scope <= scope ||
                       addr_scope == SCTP_SCOPE_PRIVATE;
        case SCTP_SCOPE_POLICY_LINK:
                /* As ENABLE, but link addresses always qualify. */
                return addr_scope <= scope ||
                       addr_scope == SCTP_SCOPE_LINK;
        default:
                return 0;
        }
}

/* Return 1 when the endpoint of 'sk' is bound to exactly one address and
 * that address is a wildcard according to sctp_is_any(); 0 otherwise.
 */
int sctp_is_ep_boundall(struct sock *sk)
{
        struct sctp_bind_addr *bp = &sctp_sk(sk)->ep->base.bind_addr;
        struct sctp_sockaddr_entry *only;

        if (!sctp_list_single_entry(&bp->address_list))
                return 0;

        only = list_entry(bp->address_list.next,
                          struct sctp_sockaddr_entry, list);

        return sctp_is_any(sk, &only->a) ? 1 : 0;
}

/********************************************************************
 * 3rd Level Abstractions
 ********************************************************************/

/* What is the scope of 'addr'?
 *
 * Delegates to the scope() operation of the address family found for
 * 'addr'; families without address-family operations are reported as
 * SCTP_SCOPE_UNUSABLE.
 */
enum sctp_scope sctp_scope(const union sctp_addr *addr)
{
        struct sctp_af *af = sctp_get_af_specific(addr->sa.sa_family);

        return af ? af->scope((union sctp_addr *)addr) : SCTP_SCOPE_UNUSABLE;
}









































    1 

























    1 






























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
// SPDX-License-Identifier: GPL-2.0
/*
 * fs/sysfs/dir.c - sysfs core and dir operation implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
 *
 * Please see Documentation/filesystems/sysfs.rst for more information.
 */

#define pr_fmt(fmt)        "sysfs: " fmt

#include <linux/fs.h>
#include <linux/kobject.h>
#include <linux/slab.h>
#include "sysfs.h"

/*
 * Taken by sysfs_remove_dir() around clearing kobj->sd so that the
 * disassociation is synchronized against symlink operations.
 */
DEFINE_SPINLOCK(sysfs_symlink_target_lock);

/*
 * Warn that @name already exists under @parent and dump the stack.
 *
 * The path of @parent is rendered into a temporary PATH_MAX buffer; if
 * that buffer cannot be allocated the warning is still emitted, with a
 * NULL pointer passed for the path.
 */
void sysfs_warn_dup(struct kernfs_node *parent, const char *name)
{
        char *path = kzalloc(PATH_MAX, GFP_KERNEL);

        if (path)
                kernfs_path(parent, path, PATH_MAX);

        pr_warn("cannot create duplicate filename '%s/%s'\n", path, name);
        dump_stack();

        kfree(path);
}

/**
 * sysfs_create_dir_ns - create a directory for an object with a namespace tag
 * @kobj: object we're creating directory for
 * @ns: the namespace tag to use
 *
 * The directory is created under the node of @kobj's parent, or under
 * sysfs_root_kn when @kobj has no parent, using the ownership reported by
 * kobject_get_ownership() and mode 0755.  On success the new node is stored
 * in @kobj->sd.
 *
 * Return: 0 on success, -EINVAL if @kobj is NULL, -ENOENT if there is no
 * parent node, or the error from kernfs_create_dir_ns().  A duplicate-name
 * warning is issued when that error is -EEXIST.
 */
int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
{
        struct kernfs_node *parent, *kn;
        kuid_t uid;
        kgid_t gid;

        if (WARN_ON(!kobj))
                return -EINVAL;

        parent = kobj->parent ? kobj->parent->sd : sysfs_root_kn;
        if (!parent)
                return -ENOENT;

        kobject_get_ownership(kobj, &uid, &gid);

        kn = kernfs_create_dir_ns(parent, kobject_name(kobj), 0755, uid, gid,
                                  kobj, ns);
        if (!IS_ERR(kn)) {
                kobj->sd = kn;
                return 0;
        }

        if (PTR_ERR(kn) == -EEXIST)
                sysfs_warn_dup(parent, kobject_name(kobj));

        return PTR_ERR(kn);
}

/**
 *        sysfs_remove_dir - remove an object's directory.
 *        @kobj:        object.
 *
 *        Detaches @kobj from its sysfs node and then removes that node
 *        through kernfs_remove().  Nothing is removed when @kobj has no
 *        node.
 */
void sysfs_remove_dir(struct kobject *kobj)
{
        struct kernfs_node *kn = kobj->sd;

        /*
         * Normally the owner of a kobject makes sure removal does not race
         * with other operations, and sysfs offers no protection of its own.
         * Symlinks are the exception: whoever creates a symlink to @kobj
         * usually does not own it and cannot control when it goes away, so
         * symlink code could end up dereferencing a node that was already
         * freed.
         *
         * Clearing @kobj->sd under sysfs_symlink_target_lock synchronizes
         * the disassociation against symlink operations so that they can
         * safely dereference @kobj->sd.
         */
        spin_lock(&sysfs_symlink_target_lock);
        kobj->sd = NULL;
        spin_unlock(&sysfs_symlink_target_lock);

        if (!kn)
                return;

        WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
        kernfs_remove(kn);
}

/*
 * Rename the directory of @kobj to @new_name with namespace tag @new_ns,
 * keeping it under its current parent.  Returns the result of
 * kernfs_rename_ns().
 */
int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
                        const void *new_ns)
{
        struct kernfs_node *cur_parent = kernfs_get_parent(kobj->sd);
        int err;

        err = kernfs_rename_ns(kobj->sd, cur_parent, new_name, new_ns);

        /* Release the parent obtained from kernfs_get_parent() above. */
        kernfs_put(cur_parent);

        return err;
}

/*
 * Move the directory of @kobj, under its current name, below the node of
 * @new_parent_kobj with namespace tag @new_ns.  sysfs_root_kn becomes the
 * new parent when @new_parent_kobj is NULL or has no node.  Returns the
 * result of kernfs_rename_ns().
 */
int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
                      const void *new_ns)
{
        struct kernfs_node *kn = kobj->sd;
        struct kernfs_node *new_parent = sysfs_root_kn;

        if (new_parent_kobj && new_parent_kobj->sd)
                new_parent = new_parent_kobj->sd;

        return kernfs_rename_ns(kn, new_parent, kn->name, new_ns);
}

/**
 * sysfs_create_mount_point - create an always empty directory
 * @parent_kobj:  kobject that will contain this always empty directory
 * @name: The name of the always empty directory to add
 *
 * Return: 0 on success, otherwise the error from kernfs_create_empty_dir().
 * A duplicate-name warning is issued when that error is -EEXIST.
 */
int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name)
{
        struct kernfs_node *parent = parent_kobj->sd;
        struct kernfs_node *kn = kernfs_create_empty_dir(parent, name);

        if (!IS_ERR(kn))
                return 0;

        if (PTR_ERR(kn) == -EEXIST)
                sysfs_warn_dup(parent, name);

        return PTR_ERR(kn);
}
EXPORT_SYMBOL_GPL(sysfs_create_mount_point);

/**
 *        sysfs_remove_mount_point - remove an always empty directory.
 *        @parent_kobj: kobject that contains this always empty directory
 *        @name: The name of the always empty directory to remove
 *
 *        The entry is looked up by @name under the node of @parent_kobj,
 *        with a NULL namespace tag.
 */
void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name)
{
        kernfs_remove_by_name_ns(parent_kobj->sd, name, NULL);
}
EXPORT_SYMBOL_GPL(sysfs_remove_mount_point);









































































































































































































































































































































































































































































































































































































































































    2 




    3 
    2 








    2 





























































































































































































































    3 






    2 








































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

/*
 * fsnotify inode mark locking/lifetime/and refcnting
 *
 * REFCNT:
 * The group->refcnt and mark->refcnt tell how many "things" in the kernel
 * currently are referencing the objects. Both kinds of objects typically will
 * live inside the kernel with a refcnt of 2, one for its creation and one for
 * the reference a group and a mark hold to each other.
 * If you are holding the appropriate locks, you can take a reference and the
 * object itself is guaranteed to survive until the reference is dropped.
 *
 * LOCKING:
 * There are 3 locks involved with fsnotify inode marks and they MUST be taken
 * in order as follows:
 *
 * group->mark_mutex
 * mark->lock
 * mark->connector->lock
 *
 * group->mark_mutex protects the marks_list anchored inside a given group and
 * each mark is hooked via the g_list.  It also protects the group's private
 * data (i.e. group limits).
 *
 * mark->lock protects the marks attributes like its masks and flags.
 * Furthermore it protects the access to a reference of the group that the mark
 * is assigned to as well as the access to a reference of the inode/vfsmount
 * that is being watched by the mark.
 *
 * mark->connector->lock protects the list of marks anchored inside an
 * inode / vfsmount and each mark is hooked via the i_list.
 *
 * A list of notification marks relating to inode / mnt is contained in
 * fsnotify_mark_connector. That structure is alive as long as there are any
 * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets
 * detached from fsnotify_mark_connector when last reference to the mark is
 * dropped.  Thus having mark reference is enough to protect mark->connector
 * pointer and to make sure fsnotify_mark_connector cannot disappear. Also
 * because we remove mark from g_list before dropping mark reference associated
 * with that, any mark found through g_list is guaranteed to have
 * mark->connector set until we drop group->mark_mutex.
 *
 * LIFETIME:
 * Inode marks survive between when they are added to an inode and when their
 * refcnt==0. Marks are also protected by fsnotify_mark_srcu.
 *
 * The inode mark can be cleared for a number of different reasons including:
 * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
 * - The inode is being evicted from cache. (fsnotify_inode_delete)
 * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
 * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark)
 * - The fsnotify_group associated with the mark is going away and all such marks
 *   need to be cleaned up. (fsnotify_clear_marks_by_group)
 *
 * This has the very interesting property of being able to run concurrently with
 * any (or all) other directions.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>
#include <linux/ratelimit.h>

#include <linux/atomic.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

/* NOTE(review): presumably the delay used when queueing reaper_work - confirm. */
#define FSNOTIFY_REAPER_DELAY        (1)        /* 1 jiffy */

/* SRCU domain that protects marks and mark connectors (see LIFETIME above). */
struct srcu_struct fsnotify_mark_srcu;
/* Slab cache for mark connectors; NOTE(review): set up outside this view. */
struct kmem_cache *fsnotify_mark_connector_cachep;

/*
 * NOTE(review): presumably destroy_lock guards destroy_list and
 * connector_destroy_list, which queue marks and connectors for the
 * deferred workers declared below - confirm against their users.
 */
static DEFINE_SPINLOCK(destroy_lock);
static LIST_HEAD(destroy_list);
static struct fsnotify_mark_connector *connector_destroy_list;

/* Delayed work item that runs fsnotify_mark_destroy_workfn(). */
static void fsnotify_mark_destroy_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);

/* Work item that runs fsnotify_connector_destroy_workfn(). */
static void fsnotify_connector_destroy_workfn(struct work_struct *work);
static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn);

/*
 * Take an additional reference on @mark.
 *
 * Warns once if the refcount reads as zero beforehand; the increment is
 * performed either way.
 */
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
        WARN_ON_ONCE(!refcount_read(&mark->refcnt));
        refcount_inc(&mark->refcnt);
}

/*
 * Return the location of the connector pointer for @obj, which is
 * interpreted according to @obj_type: an inode, a vfsmount or a super
 * block.  Returns NULL for any other object type.
 */
static fsnotify_connp_t *fsnotify_object_connp(void *obj,
                                enum fsnotify_obj_type obj_type)
{
        if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
                struct inode *inode = obj;

                return &inode->i_fsnotify_marks;
        }

        if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT)
                return &real_mount(obj)->mnt_fsnotify_marks;

        if (obj_type == FSNOTIFY_OBJ_TYPE_SB)
                return fsnotify_sb_marks(obj);

        return NULL;
}

/*
 * Return a pointer to the fsnotify mask of the object @conn is attached
 * to (inode, mount or super block), or NULL for any other connector type.
 */
static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn)
{
        switch (conn->type) {
        case FSNOTIFY_OBJ_TYPE_INODE:
                return &fsnotify_conn_inode(conn)->i_fsnotify_mask;
        case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
                return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask;
        case FSNOTIFY_OBJ_TYPE_SB:
                return &fsnotify_conn_sb(conn)->s_fsnotify_mask;
        default:
                return NULL;
        }
}

/*
 * Return the fsnotify mask of the object @conn is attached to.
 *
 * Warns and returns 0 when fsnotify_valid_obj_type() rejects conn->type,
 * which also keeps the NULL result of fsnotify_conn_mask_p() from being
 * dereferenced for such types.
 */
__u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn)
{
        if (WARN_ON(!fsnotify_valid_obj_type(conn->type)))
                return 0;

        return *fsnotify_conn_mask_p(conn);
}

/* Increment the watched objects counter of @sb. */
static void fsnotify_get_sb_watched_objects(struct super_block *sb)
{
        atomic_long_inc(fsnotify_sb_watched_objects(sb));
}

/*
 * Decrement the watched objects counter of @sb and, when it reaches zero,
 * wake up anyone waiting on that counter through wake_up_var().
 */
static void fsnotify_put_sb_watched_objects(struct super_block *sb)
{
        if (atomic_long_dec_and_test(fsnotify_sb_watched_objects(sb)))
                wake_up_var(fsnotify_sb_watched_objects(sb));
}

/*
 * Take a reference on @inode with ihold() and count it as a watched
 * object of its super block.
 */
static void fsnotify_get_inode_ref(struct inode *inode)
{
        ihold(inode);
        fsnotify_get_sb_watched_objects(inode->i_sb);
}

/*
 * Undo fsnotify_get_inode_ref(): drop the watched object count of the
 * inode's super block, then release the inode reference with iput().
 */
static void fsnotify_put_inode_ref(struct inode *inode)
{
        /* inode->i_sb is read before iput() may release the inode. */
        fsnotify_put_sb_watched_objects(inode->i_sb);
        iput(inode);
}

/*
 * Grab or drop watched objects reference depending on whether the connector
 * is attached and has any marks attached.
 *
 * Besides the total counter, the per-priority counters in
 * sbinfo->watched_objects[] are adjusted so that they reflect the priority
 * of the group of the first mark on the connector's list; the priority last
 * accounted for is remembered in conn->prio.
 */
static void fsnotify_update_sb_watchers(struct super_block *sb,
                                        struct fsnotify_mark_connector *conn)
{
        struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
        bool is_watched = conn->flags & FSNOTIFY_CONN_FLAG_IS_WATCHED;
        struct fsnotify_mark *first_mark = NULL;
        unsigned int highest_prio = 0;

        /* A connector without an object is treated as having no marks. */
        if (conn->obj)
                first_mark = hlist_entry_safe(conn->list.first,
                                              struct fsnotify_mark, obj_list);
        /*
         * NOTE(review): taking the first mark's group priority as the
         * highest presumes the list is kept ordered by priority - confirm.
         */
        if (first_mark)
                highest_prio = first_mark->group->priority;
        /* Out-of-range priority: warn and fall back to 0. */
        if (WARN_ON(highest_prio >= __FSNOTIFY_PRIO_NUM))
                highest_prio = 0;

        /*
         * If the highest priority of group watching this object is prio,
         * then watched object has a reference on counters [0..prio].
         * Update priority >= 1 watched objects counters.
         */
        /* Priority went up: count the object on each newly covered level. */
        for (unsigned int p = conn->prio + 1; p <= highest_prio; p++)
                atomic_long_inc(&sbinfo->watched_objects[p]);
        /* Priority went down: uncount it on each level no longer covered. */
        for (unsigned int p = conn->prio; p > highest_prio; p--)
                atomic_long_dec(&sbinfo->watched_objects[p]);
        conn->prio = highest_prio;

        /* Update priority >= 0 (a.k.a total) watched objects counter */
        BUILD_BUG_ON(FSNOTIFY_PRIO_NORMAL != 0);
        /* The IS_WATCHED flag keeps the total counter from being taken or
         * dropped twice for the same connector.
         */
        if (first_mark && !is_watched) {
                conn->flags |= FSNOTIFY_CONN_FLAG_IS_WATCHED;
                fsnotify_get_sb_watched_objects(sb);
        } else if (!first_mark && is_watched) {
                conn->flags &= ~FSNOTIFY_CONN_FLAG_IS_WATCHED;
                fsnotify_put_sb_watched_objects(sb);
        }
}

/*
 * Grab or drop the inode reference held on behalf of the connector.
 *
 * Nothing happens for non-inode connectors or when the connector already is
 * in the wanted state. When the reference is to be dropped, only the HAS_IREF
 * flag is cleared here and the inode is returned; fsnotify_drop_object() is
 * responsible for doing the iput() outside of spinlocks. This happens when
 * the last mark that wanted the inode reference is detached.
 *
 * Returns the inode whose reference the caller must drop, or NULL.
 */
static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn,
                                          bool want_iref)
{
        bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF;
        struct inode *unpinned;

        if (conn->type != FSNOTIFY_OBJ_TYPE_INODE)
                return NULL;
        if (has_iref == want_iref)
                return NULL;

        if (!want_iref) {
                /* Unpin inode after detach of last mark that wanted iref */
                unpinned = fsnotify_conn_inode(conn);
                conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF;
                return unpinned;
        }

        /* Pin inode if any mark wants inode refcount held */
        fsnotify_get_inode_ref(fsnotify_conn_inode(conn));
        conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF;
        return NULL;
}

/*
 * Recompute the event mask of the object behind @conn from all attached marks
 * on the connector's list and store it in the object. Must be called with
 * conn->lock held.
 *
 * Returns whatever fsnotify_update_iref() returns: an inode whose reference
 * the caller has to drop, or NULL. Also returns NULL without touching
 * anything when the connector type is not a valid object type.
 */
static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
        struct fsnotify_mark *mark;
        bool pin_inode = false;
        u32 mask = 0;

        assert_spin_locked(&conn->lock);

        /* We can get detached connector here when inode is getting unlinked. */
        if (!fsnotify_valid_obj_type(conn->type))
                return NULL;

        hlist_for_each_entry(mark, &conn->list, obj_list) {
                /* Only marks that are still attached contribute */
                if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) {
                        mask |= fsnotify_calc_mask(mark);
                        if (conn->type == FSNOTIFY_OBJ_TYPE_INODE &&
                            !(mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
                                pin_inode = true;
                }
        }

        *fsnotify_conn_mask_p(conn) = mask;

        return fsnotify_update_iref(conn, pin_inode);
}

/*
 * Calculate mask of events for a list of marks. The caller must make sure
 * connector and connector->obj cannot disappear under us.  Callers achieve
 * this by holding a mark->lock or mark->group->mark_mutex for a mark on this
 * list.
 *
 * A NULL @conn is accepted and ignored. The value returned by
 * __fsnotify_recalc_mask() is not used here. For inode connectors the child
 * dentry flags of the inode are updated after conn->lock has been released.
 */
void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
        if (!conn)
                return;

        spin_lock(&conn->lock);
        __fsnotify_recalc_mask(conn);
        spin_unlock(&conn->lock);

        if (conn->type != FSNOTIFY_OBJ_TYPE_INODE)
                return;

        __fsnotify_update_child_dentry_flags(fsnotify_conn_inode(conn));
}

/*
 * Work function: free all connectors queued for freeing once the SRCU period
 * ends. The pending list is taken over under destroy_lock, then freed after
 * synchronize_srcu() returns.
 */
static void fsnotify_connector_destroy_workfn(struct work_struct *work)
{
        struct fsnotify_mark_connector *pending, *next;

        /* Detach the whole singly linked list in one go */
        spin_lock(&destroy_lock);
        pending = connector_destroy_list;
        connector_destroy_list = NULL;
        spin_unlock(&destroy_lock);

        synchronize_srcu(&fsnotify_mark_srcu);

        for (; pending; pending = next) {
                /* Read the link before the element is freed */
                next = pending->destroy_next;
                kmem_cache_free(fsnotify_mark_connector_cachep, pending);
        }
}

/*
 * Disconnect @conn from the object it is attached to.
 *
 * The connector type as found on entry is stored in *@type. For an already
 * detached connector nothing else is done and NULL is returned. Otherwise the
 * object's event mask is zeroed, the object's connector pointer is cleared,
 * the connector is marked detached and the superblock watchers accounting is
 * updated.
 *
 * Returns the inode if the connector was of inode type and held an inode
 * reference (HAS_IREF set), so the caller can drop it; NULL otherwise.
 */
static void *fsnotify_detach_connector_from_object(
                                        struct fsnotify_mark_connector *conn,
                                        unsigned int *type)
{
        fsnotify_connp_t *connp = fsnotify_object_connp(conn->obj, conn->type);
        struct super_block *sb = fsnotify_connector_sb(conn);
        struct inode *pinned = NULL;

        *type = conn->type;

        switch (conn->type) {
        case FSNOTIFY_OBJ_TYPE_DETACHED:
                return NULL;
        case FSNOTIFY_OBJ_TYPE_INODE: {
                struct inode *inode = fsnotify_conn_inode(conn);

                inode->i_fsnotify_mask = 0;
                /* Hand the inode back only when the connector pinned it */
                if (conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF)
                        pinned = inode;
                break;
        }
        case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
                fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
                break;
        case FSNOTIFY_OBJ_TYPE_SB:
                fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
                break;
        default:
                break;
        }

        rcu_assign_pointer(*connp, NULL);
        conn->obj = NULL;
        conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;
        fsnotify_update_sb_watchers(sb, conn);

        return pinned;
}

/*
 * Final step of mark destruction: let the owning group free the mark through
 * its free_mark() operation, then drop the group reference.
 *
 * Warns once and does nothing if the mark has no group.
 */
static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
{
        struct fsnotify_group *group = mark->group;

        if (!WARN_ON_ONCE(!group)) {
                group->ops->free_mark(mark);
                fsnotify_put_group(group);
        }
}

/*
 * Drop object reference originally held by a connector.
 *
 * A NULL @objp is ignored. Currently only inode references are passed to be
 * dropped; any other @type triggers a one-time warning and is ignored.
 */
static void fsnotify_drop_object(unsigned int type, void *objp)
{
        if (objp && !WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE))
                fsnotify_put_inode_ref(objp);
}

/*
 * Drop one reference on @mark.
 *
 * If the mark has no connector, the last reference destroys the mark
 * directly. Otherwise, when the last reference goes away, the mark is
 * unlinked from the connector's list under conn->lock, the connector is
 * detached from its object if the list became empty (and queued for
 * freeing), and the mark itself is queued on destroy_list for the delayed
 * reaper work.
 */
void fsnotify_put_mark(struct fsnotify_mark *mark)
{
        struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector);
        void *objp = NULL;
        unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED;
        bool free_conn = false;

        /* Catch marks that were actually never attached to object */
        if (!conn) {
                if (refcount_dec_and_test(&mark->refcnt))
                        fsnotify_final_mark_destroy(mark);
                return;
        }

        /*
         * We have to be careful so that traversals of obj_list under lock can
         * safely grab mark reference.
         */
        if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock))
                return;

        /* Last reference is gone; conn->lock is held from here on */
        hlist_del_init_rcu(&mark->obj_list);
        if (hlist_empty(&conn->list)) {
                /* That was the last mark: detach the connector as well */
                objp = fsnotify_detach_connector_from_object(conn, &type);
                free_conn = true;
        } else {
                struct super_block *sb = fsnotify_connector_sb(conn);

                /* Update watched objects after detaching mark */
                if (sb)
                        fsnotify_update_sb_watchers(sb, conn);
                objp = __fsnotify_recalc_mask(conn);
                type = conn->type;
        }
        WRITE_ONCE(mark->connector, NULL);
        spin_unlock(&conn->lock);

        /* Drop the object reference (if any) outside of conn->lock */
        fsnotify_drop_object(type, objp);

        if (free_conn) {
                /* Push the connector onto the list handled by the reaper */
                spin_lock(&destroy_lock);
                conn->destroy_next = connector_destroy_list;
                connector_destroy_list = conn;
                spin_unlock(&destroy_lock);
                queue_work(system_unbound_wq, &connector_reaper_work);
        }
        /*
         * Note that we didn't update flags telling whether inode cares about
         * what's happening with children. We update these flags from
         * __fsnotify_parent() lazily when next event happens on one of our
         * children.
         */
        /* g_list is reused here to link the mark into destroy_list */
        spin_lock(&destroy_lock);
        list_add(&mark->g_list, &destroy_list);
        spin_unlock(&destroy_lock);
        queue_delayed_work(system_unbound_wq, &reaper_work,
                           FSNOTIFY_REAPER_DELAY);
}
EXPORT_SYMBOL_GPL(fsnotify_put_mark);

/*
 * Get mark reference when we found the mark via lockless traversal of object
 * list. Mark can be already removed from the list by now and on its way to be
 * destroyed once SRCU period ends.
 *
 * Also pin the group so it doesn't disappear under us.
 *
 * Returns true for a NULL @mark and for a mark that was successfully pinned;
 * returns false when the reference count was already zero or the mark is no
 * longer attached (in which case the reference just taken is dropped again).
 */
static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark)
{
        if (!mark)
                return true;

        if (!refcount_inc_not_zero(&mark->refcnt))
                return false;

        spin_lock(&mark->lock);
        if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
                spin_unlock(&mark->lock);
                fsnotify_put_mark(mark);
                return false;
        }

        /* mark is attached, group is still alive then */
        atomic_inc(&mark->group->user_waits);
        spin_unlock(&mark->lock);
        return true;
}

/*
 * Puts marks and wakes up group destruction if necessary.
 *
 * Pairs with fsnotify_get_mark_safe(). A NULL @mark is ignored.
 */
static void fsnotify_put_mark_wake(struct fsnotify_mark *mark)
{
        struct fsnotify_group *group;

        if (!mark)
                return;

        /* Read the group before dropping our mark reference */
        group = mark->group;
        fsnotify_put_mark(mark);

        if (!atomic_dec_and_test(&group->user_waits))
                return;

        /*
         * We abuse notification_waitq on group shutdown for waiting for
         * all marks pinned when waiting for userspace.
         */
        if (group->shutdown)
                wake_up(&group->notification_waitq);
}

/*
 * Pin every mark referenced by @iter_info and then leave the SRCU read-side
 * section identified by iter_info->srcu_idx.
 *
 * Returns true on success. If pinning any mark fails, the marks pinned so far
 * are released again and false is returned; srcu_read_unlock() is not called
 * on that path.
 */
bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info)
        __releases(&fsnotify_mark_srcu)
{
        int type;

        fsnotify_foreach_iter_type(type) {
                /* This can fail if mark is being removed */
                if (!fsnotify_get_mark_safe(iter_info->marks[type])) {
                        __release(&fsnotify_mark_srcu);
                        /* Undo the pins taken for the earlier slots */
                        while (type-- > 0)
                                fsnotify_put_mark_wake(iter_info->marks[type]);
                        return false;
                }
        }

        /*
         * Now that both marks are pinned by refcount in the inode / vfsmount
         * lists, we can drop SRCU lock, and safely resume the list iteration
         * once userspace returns.
         */
        srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx);

        return true;
}

/*
 * Counterpart of fsnotify_prepare_user_wait(): re-enter the SRCU read-side
 * section, recording the new index in @iter_info, and release every mark
 * referenced by @iter_info.
 */
void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info)
        __acquires(&fsnotify_mark_srcu)
{
        int iter_type;

        iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);

        fsnotify_foreach_iter_type(iter_type) {
                fsnotify_put_mark_wake(iter_info->marks[iter_type]);
        }
}

/*
 * Mark mark as detached, remove it from group list. Mark still stays in object
 * list until its last reference is dropped. Note that we rely on mark being
 * removed from group list before corresponding reference to it is dropped. In
 * particular we rely on mark->connector being valid while we hold
 * group->mark_mutex if we found the mark through g_list.
 *
 * Must be called with group->mark_mutex held. The caller must either hold
 * reference to the mark or be protected by fsnotify_mark_srcu.
 */
void fsnotify_detach_mark(struct fsnotify_mark *mark)
{
        fsnotify_group_assert_locked(mark->group);
        WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) &&
                     refcount_read(&mark->refcnt) < 1 +
                        !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED));

        spin_lock(&mark->lock);
        /* something else already called this function on this mark */
        if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
                spin_unlock(&mark->lock);
                return;
        }
        mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
        list_del_init(&mark->g_list);
        spin_unlock(&mark->lock);

        /* Drop mark reference acquired in fsnotify_add_mark_locked() */
        fsnotify_put_mark(mark);
}

/*
 * Free fsnotify mark. The mark is actually only marked as being freed.  The
 * freeing is actually happening only once last reference to the mark is
 * dropped from a workqueue which first waits for srcu period end.
 *
 * Caller must have a reference to the mark or be protected by
 * fsnotify_mark_srcu.
 */
void fsnotify_free_mark(struct fsnotify_mark *mark)
{
        struct fsnotify_group *group = mark->group;

        spin_lock(&mark->lock);
        /* something else already called this function on this mark */
        if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
                spin_unlock(&mark->lock);
                return;
        }
        mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
        spin_unlock(&mark->lock);

        /*
         * Some groups like to know that marks are being freed.  This is a
         * callback to the group function to let it know that this mark
         * is being freed.
         */
        if (group->ops->freeing_mark)
                group->ops->freeing_mark(mark, group);
}

/*
 * Detach @mark from @group's mark list and mark it as being freed.
 *
 * The detach step runs with the group lock held; fsnotify_free_mark() is
 * called after the group lock has been released.
 */
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
                           struct fsnotify_group *group)
{
        fsnotify_group_lock(group);
        fsnotify_detach_mark(mark);
        fsnotify_group_unlock(group);
        fsnotify_free_mark(mark);
}
EXPORT_SYMBOL_GPL(fsnotify_destroy_mark);

/*
 * Sorting function for lists of fsnotify marks.
 *
 * Fanotify supports different notification classes (reflected as priority of
 * notification group). Events shall be passed to notification groups in
 * decreasing priority order. To achieve this marks in notification lists for
 * inodes and vfsmounts are sorted so that priorities of corresponding groups
 * are descending.
 *
 * Furthermore correct handling of the ignore mask requires processing inode
 * and vfsmount marks of each group together. Using the group address as
 * further sort criterion provides a unique sorting order and thus we can
 * merge inode and vfsmount lists of marks in linear time and find groups
 * present in both lists.
 *
 * A NULL group sorts after any non-NULL group.
 *
 * A return value of 1 signifies that b has priority over a.
 * A return value of 0 signifies that the two marks have to be handled together.
 * A return value of -1 signifies that a has priority over b.
 */
int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
{
        if (a == b)
                return 0;
        if (!a)
                return 1;
        if (!b)
                return -1;

        /* Higher priority wins; ties are broken by the group address */
        if (a->priority != b->priority)
                return a->priority < b->priority ? 1 : -1;

        return a < b ? 1 : -1;
}

/*
 * Allocate a zeroed fsnotify_sb_info and install it in sb->s_fsnotify_info
 * unless somebody else installed one concurrently, in which case our copy is
 * freed again.
 *
 * Returns 0 on success (including losing the race) or -ENOMEM.
 */
static int fsnotify_attach_info_to_sb(struct super_block *sb)
{
        struct fsnotify_sb_info *info;

        /* sb info is freed on fsnotify_sb_delete() */
        info = kzalloc(sizeof(*info), GFP_KERNEL);
        if (!info)
                return -ENOMEM;

        /*
         * cmpxchg() provides the barrier so that callers of fsnotify_sb_info()
         * will observe an initialized structure. A non-NULL result means
         * someone else created sbinfo for us.
         */
        if (cmpxchg(&sb->s_fsnotify_info, NULL, info))
                kfree(info);

        return 0;
}

/*
 * Allocate and initialise a connector for @obj of type @obj_type and publish
 * it through *@connp unless a connector is already installed there, in which
 * case the new one is freed again.
 *
 * Returns 0 on success (including losing the race) or -ENOMEM.
 */
static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
                                               void *obj, unsigned int obj_type)
{
        struct fsnotify_mark_connector *new_conn;

        new_conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL);
        if (!new_conn)
                return -ENOMEM;

        /* Fully initialise the connector before it becomes visible */
        new_conn->obj = obj;
        new_conn->type = obj_type;
        new_conn->prio = 0;
        new_conn->flags = 0;
        INIT_HLIST_HEAD(&new_conn->list);
        spin_lock_init(&new_conn->lock);

        /*
         * cmpxchg() provides the barrier so that readers of *connp can see
         * only initialized structure. A non-NULL result means someone else
         * created list structure for us.
         */
        if (cmpxchg(connp, NULL, new_conn))
                kmem_cache_free(fsnotify_mark_connector_cachep, new_conn);

        return 0;
}

/*
 * Get mark connector, make sure it is alive and return with its lock held.
 * This is for users that get connector pointer from inode or mount. Users that
 * hold reference to a mark on the list may directly lock connector->lock as
 * they are sure list cannot go away under them.
 *
 * Returns NULL, with no lock held, when *@connp is NULL or the connector
 * found there is already detached.
 */
static struct fsnotify_mark_connector *fsnotify_grab_connector(
                                                fsnotify_connp_t *connp)
{
        struct fsnotify_mark_connector *conn;
        int idx;

        idx = srcu_read_lock(&fsnotify_mark_srcu);
        conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
        if (conn) {
                spin_lock(&conn->lock);
                /* A detached connector is of no use to the caller */
                if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) {
                        spin_unlock(&conn->lock);
                        conn = NULL;
                }
        }
        srcu_read_unlock(&fsnotify_mark_srcu, idx);

        return conn;
}

/*
 * Add mark into proper place in given list of marks. These marks may be used
 * for the fsnotify backend to determine which event types should be delivered
 * to which group and for which inodes. These marks are ordered according to
 * priority, highest number first, and then by the group's location in memory.
 *
 * Returns 0 on success, -EINVAL for an invalid @obj_type, -EEXIST when an
 * attached mark of the same group is already on the list and the group does
 * not have FSNOTIFY_GROUP_DUPS set, or an error from attaching the sb info or
 * the connector. @add_flags is not used in this function.
 */
static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj,
                                  unsigned int obj_type, int add_flags)
{
        struct super_block *sb = fsnotify_object_sb(obj, obj_type);
        struct fsnotify_mark *lmark, *last = NULL;
        struct fsnotify_mark_connector *conn;
        fsnotify_connp_t *connp;
        int cmp;
        int err = 0;

        if (WARN_ON(!fsnotify_valid_obj_type(obj_type)))
                return -EINVAL;

        /*
         * Attach the sb info before attaching a connector to any object on sb.
         * The sb info will remain attached as long as sb lives.
         */
        if (!fsnotify_sb_info(sb)) {
                err = fsnotify_attach_info_to_sb(sb);
                if (err)
                        return err;
        }

        connp = fsnotify_object_connp(obj, obj_type);
restart:
        spin_lock(&mark->lock);
        /* On success this returns with conn->lock held (nested in mark->lock) */
        conn = fsnotify_grab_connector(connp);
        if (!conn) {
                /* No live connector yet: create one and try again */
                spin_unlock(&mark->lock);
                err = fsnotify_attach_connector_to_object(connp, obj, obj_type);
                if (err)
                        return err;
                goto restart;
        }

        /* is mark the first mark? */
        if (hlist_empty(&conn->list)) {
                hlist_add_head_rcu(&mark->obj_list, &conn->list);
                goto added;
        }

        /* should mark be in the middle of the current list? */
        hlist_for_each_entry(lmark, &conn->list, obj_list) {
                last = lmark;

                /* Refuse a second attached mark of the same group */
                if ((lmark->group == mark->group) &&
                    (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) &&
                    !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) {
                        err = -EEXIST;
                        goto out_err;
                }

                /* Insert before the first entry that does not sort ahead of us */
                cmp = fsnotify_compare_groups(lmark->group, mark->group);
                if (cmp >= 0) {
                        hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list);
                        goto added;
                }
        }

        BUG_ON(last == NULL);
        /* mark should be the last entry.  last is the current last entry */
        hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
added:
        fsnotify_update_sb_watchers(sb, conn);
        /*
         * Since connector is attached to object using cmpxchg() we are
         * guaranteed that connector initialization is fully visible by anyone
         * seeing mark->connector set.
         */
        WRITE_ONCE(mark->connector, conn);
out_err:
        /* Release in reverse order of acquisition */
        spin_unlock(&conn->lock);
        spin_unlock(&mark->lock);
        return err;
}

/*
 * Attach an initialized mark to a given group and fs object.
 * These marks may be used for the fsnotify backend to determine which
 * event types should be delivered to which group.
 *
 * Must be called with the group lock held. Returns 0 on success or the error
 * from fsnotify_add_mark_list(), in which case the mark is taken off the
 * group list again and the reference taken for it is dropped.
 */
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
                             void *obj, unsigned int obj_type,
                             int add_flags)
{
        struct fsnotify_group *group = mark->group;
        int ret;

        fsnotify_group_assert_locked(group);

        /*
         * LOCKING ORDER!!!!
         * group->mark_mutex
         * mark->lock
         * mark->connector->lock
         */
        spin_lock(&mark->lock);
        mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;

        list_add(&mark->g_list, &group->marks_list);
        fsnotify_get_mark(mark); /* for g_list */
        spin_unlock(&mark->lock);

        ret = fsnotify_add_mark_list(mark, obj, obj_type, add_flags);
        if (!ret) {
                fsnotify_recalc_mask(mark->connector);
                return 0;
        }

        /* Attaching to the object failed: undo the group list insertion */
        spin_lock(&mark->lock);
        mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE |
                         FSNOTIFY_MARK_FLAG_ATTACHED);
        list_del_init(&mark->g_list);
        spin_unlock(&mark->lock);

        fsnotify_put_mark(mark);
        return ret;
}

/*
 * Locked wrapper around fsnotify_add_mark_locked(): takes the lock of the
 * mark's group for the duration of the call and returns its result.
 */
int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj,
                      unsigned int obj_type, int add_flags)
{
        struct fsnotify_group *grp = mark->group;
        int err;

        fsnotify_group_lock(grp);
        err = fsnotify_add_mark_locked(mark, obj, obj_type, add_flags);
        fsnotify_group_unlock(grp);

        return err;
}
EXPORT_SYMBOL_GPL(fsnotify_add_mark);

/*
 * Given a list of marks, find the mark associated with given group. If found
 * take a reference to that mark and return it, else return NULL.
 *
 * Only marks that are still attached are considered. NULL is also returned
 * when @obj_type has no connector pointer or no live connector exists.
 */
struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type,
                                         struct fsnotify_group *group)
{
        fsnotify_connp_t *connp = fsnotify_object_connp(obj, obj_type);
        struct fsnotify_mark_connector *conn;
        struct fsnotify_mark *mark, *found = NULL;

        if (!connp)
                return NULL;

        /* Returns with conn->lock held when a live connector exists */
        conn = fsnotify_grab_connector(connp);
        if (!conn)
                return NULL;

        hlist_for_each_entry(mark, &conn->list, obj_list) {
                if (mark->group != group)
                        continue;
                if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED))
                        continue;

                fsnotify_get_mark(mark);
                found = mark;
                break;
        }
        spin_unlock(&conn->lock);

        return found;
}
EXPORT_SYMBOL_GPL(fsnotify_find_mark);

/*
 * Clear any marks in a group with given type mask.
 *
 * With FSNOTIFY_OBJ_TYPE_ANY every mark on the group's list is cleared.
 * Otherwise marks whose connector type equals @obj_type are first moved to a
 * private list and then cleared from there.
 */
void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
                                   unsigned int obj_type)
{
        struct fsnotify_mark *lmark, *mark;
        LIST_HEAD(to_free);
        struct list_head *head = &to_free;

        /* Skip selection step if we want to clear all marks. */
        if (obj_type == FSNOTIFY_OBJ_TYPE_ANY) {
                head = &group->marks_list;
                goto clear;
        }
        /*
         * We have to be really careful here. Anytime we drop mark_mutex, e.g.
         * fsnotify_clear_marks_by_inode() can come and free marks. Even in our
         * to_free list so we have to use mark_mutex even when accessing that
         * list. And freeing mark requires us to drop mark_mutex. So we can
         * reliably free only the first mark in the list. That's why we first
         * move marks to free to to_free list in one go and then free marks in
         * to_free list one by one.
         */
        fsnotify_group_lock(group);
        list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
                if (mark->connector->type == obj_type)
                        list_move(&mark->g_list, &to_free);
        }
        fsnotify_group_unlock(group);

clear:
        /* Take marks off the head of the list one at a time until it is empty */
        while (1) {
                fsnotify_group_lock(group);
                if (list_empty(head)) {
                        fsnotify_group_unlock(group);
                        break;
                }
                mark = list_first_entry(head, struct fsnotify_mark, g_list);
                /* Hold our own reference across the detach and the unlock */
                fsnotify_get_mark(mark);
                fsnotify_detach_mark(mark);
                fsnotify_group_unlock(group);
                fsnotify_free_mark(mark);
                fsnotify_put_mark(mark);
        }
}

/*
 * Destroy all marks attached to an object via connector.
 *
 * Does nothing when *@connp holds no live connector. Otherwise every mark on
 * the connector's list is destroyed, the connector is detached from its
 * object, and the object reference returned by the detach (if any) is
 * dropped.
 */
void fsnotify_destroy_marks(fsnotify_connp_t *connp)
{
        struct fsnotify_mark_connector *conn;
        struct fsnotify_mark *mark, *old_mark = NULL;
        void *objp;
        unsigned int type;

        /* Returns with conn->lock held when a live connector exists */
        conn = fsnotify_grab_connector(connp);
        if (!conn)
                return;
        /*
         * We have to be careful since we can race with e.g.
         * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the
         * list can get modified. However we are holding mark reference and
         * thus our mark cannot be removed from obj_list so we can continue
         * iteration after regaining conn->lock.
         */
        hlist_for_each_entry(mark, &conn->list, obj_list) {
                fsnotify_get_mark(mark);
                spin_unlock(&conn->lock);
                /* The previous mark's reference is released only now */
                if (old_mark)
                        fsnotify_put_mark(old_mark);
                old_mark = mark;
                fsnotify_destroy_mark(mark, mark->group);
                spin_lock(&conn->lock);
        }
        /*
         * Detach list from object now so that we don't pin inode until all
         * mark references get dropped. It would lead to strange results such
         * as delaying inode deletion or blocking unmount.
         */
        objp = fsnotify_detach_connector_from_object(conn, &type);
        spin_unlock(&conn->lock);
        /* Release the reference on the last mark visited, outside conn->lock */
        if (old_mark)
                fsnotify_put_mark(old_mark);
        fsnotify_drop_object(type, objp);
}

/*
 * Nothing fancy, just initialize lists and locks and counters.
 *
 * The mark is zeroed, bound to @group (taking a group reference), given an
 * initial reference count of 1 and left without a connector.
 */
void fsnotify_init_mark(struct fsnotify_mark *mark,
                        struct fsnotify_group *group)
{
        memset(mark, 0, sizeof(*mark));

        mark->group = group;
        fsnotify_get_group(group);

        refcount_set(&mark->refcnt, 1);
        spin_lock_init(&mark->lock);
        WRITE_ONCE(mark->connector, NULL);
}
EXPORT_SYMBOL_GPL(fsnotify_init_mark);

/*
 * Destroy all marks in destroy_list, waits for SRCU period to finish before
 * actually freeing marks.
 */
static void fsnotify_mark_destroy_workfn(struct work_struct *work)
{
        struct fsnotify_mark *mark, *next;
        struct list_head private_destroy_list;

        spin_lock(&destroy_lock);
        /* exchange the list head */
        list_replace_init(&destroy_list, &private_destroy_list);
        spin_unlock(&destroy_lock);

        synchronize_srcu(&fsnotify_mark_srcu);

        list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
                list_del_init(&mark->g_list);
                fsnotify_final_mark_destroy(mark);
        }
}

/*
 * Wait for all marks queued for destruction to be actually destroyed.
 *
 * Implemented by flushing the delayed reaper_work.
 */
void fsnotify_wait_marks_destroyed(void)
{
        flush_delayed_work(&reaper_work);
}
EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed);






































































































































































































































































































































































































































































































































































































































































































































































    1 






    1 












    1 




    1 














































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
// SPDX-License-Identifier: GPL-2.0
/*
 * cfg80211 MLME SAP interface
 *
 * Copyright (c) 2009, Jouni Malinen <j@w1.fi>
 * Copyright (c) 2015                Intel Deutschland GmbH
 * Copyright (C) 2019-2020, 2022-2024 Intel Corporation
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/nl80211.h>
#include <linux/slab.h>
#include <linux/wireless.h>
#include <net/cfg80211.h>
#include <net/iw_handler.h>
#include "core.h"
#include "nl80211.h"
#include "rdev-ops.h"


/*
 * Process a received (re)association response.
 *
 * Builds the connect-result parameters from the frame in data->buf and the
 * per-link information in data->links, gives the SME a chance to look at the
 * status, and then either releases the BSS entries (when the SME asks for the
 * response to be swallowed) or reports the response through nl80211 and
 * completes the connection via __cfg80211_connect_result().
 */
void cfg80211_rx_assoc_resp(struct net_device *dev,
                            const struct cfg80211_rx_assoc_resp_data *data)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct wiphy *wiphy = wdev->wiphy;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)data->buf;
        /*
         * Default to the regular association response layout: the response
         * IEs start at u.assoc_resp.variable and run to the end of the frame.
         * This is overridden below for S1G.
         */
        struct cfg80211_connect_resp_params cr = {
                .timeout_reason = NL80211_TIMEOUT_UNSPECIFIED,
                .req_ie = data->req_ies,
                .req_ie_len = data->req_ies_len,
                .resp_ie = mgmt->u.assoc_resp.variable,
                .resp_ie_len = data->len -
                               offsetof(struct ieee80211_mgmt,
                                        u.assoc_resp.variable),
                .status = le16_to_cpu(mgmt->u.assoc_resp.status_code),
                .ap_mld_addr = data->ap_mld_addr,
        };
        unsigned int link_id;

        /* copy the per-link results into the connect-result parameters */
        for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) {
                cr.links[link_id].status = data->links[link_id].status;
                cr.links[link_id].bss = data->links[link_id].bss;

                /*
                 * A non-success per-link status is only expected together
                 * with an AP MLD address and a BSS for that link.
                 */
                WARN_ON_ONCE(cr.links[link_id].status != WLAN_STATUS_SUCCESS &&
                             (!cr.ap_mld_addr || !cr.links[link_id].bss));

                /* links without a BSS carry no further information */
                if (!cr.links[link_id].bss)
                        continue;
                cr.links[link_id].bssid = data->links[link_id].bss->bssid;
                cr.links[link_id].addr = data->links[link_id].addr;
                /* need to have local link addresses for MLO connections */
                WARN_ON(cr.ap_mld_addr &&
                        !is_valid_ether_addr(cr.links[link_id].addr));

                BUG_ON(!cr.links[link_id].bss->channel);

                /*
                 * On the S1G band the response IEs are taken from the
                 * s1g_assoc_resp layout instead; only link 0 is expected
                 * to be on that band.
                 */
                if (cr.links[link_id].bss->channel->band == NL80211_BAND_S1GHZ) {
                        WARN_ON(link_id);
                        cr.resp_ie = (u8 *)&mgmt->u.s1g_assoc_resp.variable;
                        cr.resp_ie_len = data->len -
                                         offsetof(struct ieee80211_mgmt,
                                                  u.s1g_assoc_resp.variable);
                }

                /* the valid-links bitmap is only filled for MLO connections */
                if (cr.ap_mld_addr)
                        cr.valid_links |= BIT(link_id);
        }

        trace_cfg80211_send_rx_assoc(dev, data);

        /*
         * This is a bit of a hack, we don't notify userspace of
         * a (re-)association reply if we tried to send a reassoc
         * and got a reject -- we only try again with an assoc
         * frame instead of reassoc.
         */
        if (cfg80211_sme_rx_assoc_resp(wdev, cr.status)) {
                /*
                 * The result is not passed on, so undo the hold and drop
                 * the reference on every BSS handed in by the caller.
                 */
                for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) {
                        struct cfg80211_bss *bss = data->links[link_id].bss;

                        if (!bss)
                                continue;

                        cfg80211_unhold_bss(bss_from_pub(bss));
                        cfg80211_put_bss(wiphy, bss);
                }
                return;
        }

        nl80211_send_rx_assoc(rdev, dev, data);
        /* update current_bss etc., consumes the bss reference */
        __cfg80211_connect_result(dev, &cr, cr.status == WLAN_STATUS_SUCCESS);
}
EXPORT_SYMBOL(cfg80211_rx_assoc_resp);

/*
 * Handle an authentication frame: report it to userspace through nl80211
 * first, then pass it to the cfg80211 SME.
 */
static void cfg80211_process_auth(struct wireless_dev *wdev,
                                  const u8 *buf, size_t len)
{
        struct net_device *netdev = wdev->netdev;

        nl80211_send_rx_auth(wiphy_to_rdev(wdev->wiphy), netdev, buf, len,
                             GFP_KERNEL);
        cfg80211_sme_rx_auth(wdev, buf, len);
}

/*
 * Handle a deauthentication frame (received or transmitted).
 *
 * The frame is always reported to userspace. Local connection state is only
 * torn down when we are connected and the frame's BSSID matches the address
 * we are connected to; otherwise the frame is ignored after reporting.
 */
static void cfg80211_process_deauth(struct wireless_dev *wdev,
                                    const u8 *buf, size_t len,
                                    bool reconnect)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        const struct ieee80211_mgmt *frame = (const struct ieee80211_mgmt *)buf;
        struct net_device *netdev = wdev->netdev;
        u16 reason = le16_to_cpu(frame->u.deauth.reason_code);
        /* a source address different from our own is treated as "from AP" */
        bool from_ap = !ether_addr_equal(frame->sa, netdev->dev_addr);

        nl80211_send_deauth(rdev, netdev, buf, len, reconnect, GFP_KERNEL);

        if (!wdev->connected)
                return;
        if (!ether_addr_equal(wdev->u.client.connected_addr, frame->bssid))
                return;

        __cfg80211_disconnected(netdev, NULL, 0, reason, from_ap);
        cfg80211_sme_deauth(wdev);
}

/*
 * Handle a disassociation frame (received or transmitted).
 *
 * The frame is always reported to userspace. Unlike the deauth case, a frame
 * that does not belong to the current connection triggers a warning before
 * being ignored.
 */
static void cfg80211_process_disassoc(struct wireless_dev *wdev,
                                      const u8 *buf, size_t len,
                                      bool reconnect)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        const struct ieee80211_mgmt *frame = (const struct ieee80211_mgmt *)buf;
        struct net_device *netdev = wdev->netdev;
        u16 reason = le16_to_cpu(frame->u.disassoc.reason_code);
        /* a source address different from our own is treated as "from AP" */
        bool from_ap = !ether_addr_equal(frame->sa, netdev->dev_addr);

        nl80211_send_disassoc(rdev, netdev, buf, len, reconnect, GFP_KERNEL);

        if (WARN_ON(!wdev->connected ||
                    !ether_addr_equal(wdev->u.client.connected_addr,
                                      frame->bssid)))
                return;

        __cfg80211_disconnected(netdev, NULL, 0, reason, from_ap);
        cfg80211_sme_disassoc(wdev);
}

/*
 * Dispatch a received MLME management frame by its frame control field.
 *
 * Authentication, deauthentication and disassociation frames are handled;
 * any other frame type is silently ignored. Frames shorter than the two
 * bytes of the frame control field trigger a warning and are dropped.
 * Must be called with the wiphy lock held.
 */
void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        const struct ieee80211_mgmt *frame = (const void *)buf;
        __le16 fc;

        lockdep_assert_wiphy(wdev->wiphy);

        trace_cfg80211_rx_mlme_mgmt(dev, buf, len);

        if (WARN_ON(len < 2))
                return;

        fc = frame->frame_control;

        if (ieee80211_is_auth(fc)) {
                cfg80211_process_auth(wdev, buf, len);
                return;
        }

        if (ieee80211_is_deauth(fc)) {
                cfg80211_process_deauth(wdev, buf, len, false);
                return;
        }

        if (ieee80211_is_disassoc(fc))
                cfg80211_process_disassoc(wdev, buf, len, false);
}
EXPORT_SYMBOL(cfg80211_rx_mlme_mgmt);

/*
 * Report an authentication timeout for @addr: trace it, notify userspace
 * through nl80211 and then inform the SME.
 */
void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);

        trace_cfg80211_send_auth_timeout(dev, addr);

        nl80211_send_auth_timeout(rdev, dev, addr, GFP_KERNEL);
        cfg80211_sme_auth_timeout(wdev);
}
EXPORT_SYMBOL(cfg80211_auth_timeout);

/*
 * Handle a failed association attempt.
 *
 * On timeout, userspace is notified (using the AP MLD address when one is
 * given, otherwise the BSSID of data->bss[0]) and the SME is told about the
 * timeout; otherwise the SME simply abandons the association. In both cases
 * the hold and the reference on every BSS in data->bss are released.
 */
void cfg80211_assoc_failure(struct net_device *dev,
                            struct cfg80211_assoc_failure *data)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct wiphy *wiphy = wdev->wiphy;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        const u8 *addr = data->ap_mld_addr;
        unsigned int idx;

        if (!addr)
                addr = data->bss[0]->bssid;

        trace_cfg80211_send_assoc_failure(dev, data);

        if (!data->timeout) {
                cfg80211_sme_abandon_assoc(wdev);
        } else {
                nl80211_send_assoc_timeout(rdev, dev, addr, GFP_KERNEL);
                cfg80211_sme_assoc_timeout(wdev);
        }

        for (idx = 0; idx < ARRAY_SIZE(data->bss); idx++) {
                struct cfg80211_bss *bss = data->bss[idx];

                if (bss) {
                        cfg80211_unhold_bss(bss_from_pub(bss));
                        cfg80211_put_bss(wiphy, bss);
                }
        }
}
EXPORT_SYMBOL(cfg80211_assoc_failure);

/*
 * Process a transmitted MLME management frame.
 *
 * A deauthentication frame goes to the deauth handler; every other frame is
 * treated as a disassociation. Frames shorter than the two bytes of the
 * frame control field trigger a warning and are dropped.
 * Must be called with the wiphy lock held.
 */
void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len,
                           bool reconnect)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        const struct ieee80211_mgmt *frame = (const void *)buf;

        lockdep_assert_wiphy(wdev->wiphy);

        trace_cfg80211_tx_mlme_mgmt(dev, buf, len, reconnect);

        if (WARN_ON(len < 2))
                return;

        if (!ieee80211_is_deauth(frame->frame_control)) {
                cfg80211_process_disassoc(wdev, buf, len, reconnect);
                return;
        }

        cfg80211_process_deauth(wdev, buf, len, reconnect);
}
EXPORT_SYMBOL(cfg80211_tx_mlme_mgmt);

/*
 * Report a Michael MIC failure.
 *
 * With CONFIG_CFG80211_WEXT, a textual IWEVCUSTOM wireless-extensions event
 * is sent first; if the temporary buffer cannot be allocated that event is
 * skipped. The failure is then traced and reported through nl80211 in all
 * cases.
 *
 * @dev: network device the failure was seen on
 * @addr: address printed in the event and passed on to nl80211
 * @key_type: NL80211_KEYTYPE_GROUP is printed as "broadcast", anything
 *      else as "unicast"
 * @key_id: key index
 * @tsc: passed through to the trace point and nl80211
 * @gfp: allocation flags for the event buffer and the nl80211 message
 */
void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr,
                                  enum nl80211_key_type key_type, int key_id,
                                  const u8 *tsc, gfp_t gfp)
{
        struct wiphy *wiphy = dev->ieee80211_ptr->wiphy;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
#ifdef CONFIG_CFG80211_WEXT
        const size_t buf_len = 128;
        union iwreq_data wrqu;
        char *buf = kmalloc(buf_len, gfp);

        if (buf) {
                memset(&wrqu, 0, sizeof(wrqu));
                /*
                 * Bound the write by the allocation size. scnprintf()
                 * returns the number of characters actually stored, so the
                 * reported length can never exceed the buffer.
                 */
                wrqu.data.length =
                        scnprintf(buf, buf_len, "MLME-MICHAELMICFAILURE."
                                  "indication(keyid=%d %scast addr=%pM)",
                                  key_id, key_type == NL80211_KEYTYPE_GROUP
                                  ? "broad" : "uni", addr);
                wireless_send_event(dev, IWEVCUSTOM, &wrqu, buf);
                kfree(buf);
        }
#endif

        trace_cfg80211_michael_mic_failure(dev, addr, key_type, key_id, tsc);
        nl80211_michael_mic_failure(rdev, dev, addr, key_type, key_id, tsc, gfp);
}
EXPORT_SYMBOL(cfg80211_michael_mic_failure);

/* some MLME handling for userspace SME */
int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev,
                       struct net_device *dev,
                       struct cfg80211_auth_request *req)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;

        lockdep_assert_wiphy(wdev->wiphy);

        if (!req->bss)
                return -ENOENT;

        if (req->link_id >= 0 &&
            !(wdev->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO))
                return -EINVAL;

        if (req->auth_type == NL80211_AUTHTYPE_SHARED_KEY) {
                if (!req->key || !req->key_len ||
                    req->key_idx < 0 || req->key_idx > 3)
                        return -EINVAL;
        }

        if (wdev->connected &&
            ether_addr_equal(req->bss->bssid, wdev->u.client.connected_addr))
                return -EALREADY;

        if (ether_addr_equal(req->bss->bssid, dev->dev_addr) ||
            (req->link_id >= 0 &&
             ether_addr_equal(req->ap_mld_addr, dev->dev_addr)))
                return -EINVAL;

        return rdev_auth(rdev, dev, req);
}

/*  Do a logical ht_capa &= ht_capa_mask.  */
void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa,
                               const struct ieee80211_ht_cap *ht_capa_mask)
{
        int i;
        u8 *p1, *p2;
        if (!ht_capa_mask) {
                memset(ht_capa, 0, sizeof(*ht_capa));
                return;
        }

        p1 = (u8*)(ht_capa);
        p2 = (u8*)(ht_capa_mask);
        for (i = 0; i < sizeof(*ht_capa); i++)
                p1[i] &= p2[i];
}

/*  Do a logical vht_capa &= vht_capa_mask.  */
void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa,
                                const struct ieee80211_vht_cap *vht_capa_mask)
{
        int i;
        u8 *p1, *p2;
        if (!vht_capa_mask) {
                memset(vht_capa, 0, sizeof(*vht_capa));
                return;
        }

        p1 = (u8*)(vht_capa);
        p2 = (u8*)(vht_capa_mask);
        for (i = 0; i < sizeof(*vht_capa); i++)
                p1[i] &= p2[i];
}

/*
 * Check that two multi-link elements describe the same AP MLD.
 *
 * The MLD MAC address, EML medium sync delay, EML capabilities and MLD
 * capabilities/operations must all match. Returns 0 if they do; otherwise
 * sets an extack message naming the first mismatch and returns -EINVAL.
 */
static int
cfg80211_mlme_check_mlo_compat(const struct ieee80211_multi_link_elem *mle_a,
                               const struct ieee80211_multi_link_elem *mle_b,
                               struct netlink_ext_ack *extack)
{
        const struct ieee80211_mle_basic_common_info *info_a =
                (const void *)mle_a->variable;
        const struct ieee80211_mle_basic_common_info *info_b =
                (const void *)mle_b->variable;
        const u8 *raw_a = (const u8 *)mle_a;
        const u8 *raw_b = (const u8 *)mle_b;

        if (memcmp(info_a->mld_mac_addr, info_b->mld_mac_addr, ETH_ALEN) != 0) {
                NL_SET_ERR_MSG(extack, "AP MLD address mismatch");
                return -EINVAL;
        }

        if (ieee80211_mle_get_eml_med_sync_delay(raw_a) !=
            ieee80211_mle_get_eml_med_sync_delay(raw_b)) {
                NL_SET_ERR_MSG(extack, "link EML medium sync delay mismatch");
                return -EINVAL;
        }

        if (ieee80211_mle_get_eml_cap(raw_a) !=
            ieee80211_mle_get_eml_cap(raw_b)) {
                NL_SET_ERR_MSG(extack, "link EML capabilities mismatch");
                return -EINVAL;
        }

        if (ieee80211_mle_get_mld_capa_op(raw_a) !=
            ieee80211_mle_get_mld_capa_op(raw_b)) {
                NL_SET_ERR_MSG(extack, "link MLD capabilities/ops mismatch");
                return -EINVAL;
        }

        return 0;
}

/*
 * Validate the per-link BSS entries of an association request.
 *
 * Nothing is checked when req->link_id is negative. Otherwise the assoc
 * link must have a BSS, and every link with a BSS must
 *  - not use our own address as BSSID,
 *  - carry a valid basic multi-link element whose link ID equals the index
 *    of the link in req->links, and
 *  - be compatible with the assoc link's multi-link element (see
 *    cfg80211_mlme_check_mlo_compat()).
 *
 * Returns 0 on success and -EINVAL on failure. On most failures an extack
 * message is set and the offending link's ->error field is set to -EINVAL.
 */
static int cfg80211_mlme_check_mlo(struct net_device *dev,
                                   struct cfg80211_assoc_request *req,
                                   struct netlink_ext_ack *extack)
{
        /* multi-link element found for each link, indexed like req->links */
        const struct ieee80211_multi_link_elem *mles[ARRAY_SIZE(req->links)] = {};
        int i;

        if (req->link_id < 0)
                return 0;

        if (!req->links[req->link_id].bss) {
                NL_SET_ERR_MSG(extack, "no BSS for assoc link");
                return -EINVAL;
        }

        /*
         * The BSS IEs are RCU protected; the pointers stored in mles[] point
         * into them, so the read lock is held until all checks are done.
         */
        rcu_read_lock();
        for (i = 0; i < ARRAY_SIZE(req->links); i++) {
                const struct cfg80211_bss_ies *ies;
                const struct element *ml;

                if (!req->links[i].bss)
                        continue;

                if (ether_addr_equal(req->links[i].bss->bssid, dev->dev_addr)) {
                        NL_SET_ERR_MSG(extack, "BSSID must not be our address");
                        req->links[i].error = -EINVAL;
                        goto error;
                }

                ies = rcu_dereference(req->links[i].bss->ies);
                ml = cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_MULTI_LINK,
                                            ies->data, ies->len);
                if (!ml) {
                        NL_SET_ERR_MSG(extack, "MLO BSS w/o ML element");
                        req->links[i].error = -EINVAL;
                        goto error;
                }

                /*
                 * The first data byte is skipped here and below --
                 * presumably the extension element ID, since the element
                 * was looked up as an extension element.
                 */
                if (!ieee80211_mle_type_ok(ml->data + 1,
                                           IEEE80211_ML_CONTROL_TYPE_BASIC,
                                           ml->datalen - 1)) {
                        NL_SET_ERR_MSG(extack, "BSS with invalid ML element");
                        req->links[i].error = -EINVAL;
                        goto error;
                }

                mles[i] = (const void *)(ml->data + 1);

                /* the element must advertise the slot it was passed in */
                if (ieee80211_mle_get_link_id((const u8 *)mles[i]) != i) {
                        NL_SET_ERR_MSG(extack, "link ID mismatch");
                        req->links[i].error = -EINVAL;
                        goto error;
                }
        }

        /* the assoc link has a BSS, so the loop above must have filled it */
        if (WARN_ON(!mles[req->link_id]))
                goto error;

        /* compare every other link against the assoc link */
        for (i = 0; i < ARRAY_SIZE(req->links); i++) {
                if (i == req->link_id || !req->links[i].bss)
                        continue;

                if (WARN_ON(!mles[i]))
                        goto error;

                if (cfg80211_mlme_check_mlo_compat(mles[req->link_id], mles[i],
                                                   extack)) {
                        req->links[i].error = -EINVAL;
                        goto error;
                }
        }

        rcu_read_unlock();
        return 0;
error:
        rcu_read_unlock();
        return -EINVAL;
}

/*
 * Validate an association request and pass it to the driver.
 *
 * Returns the error from the MLO checks, -EALREADY when connected and the
 * request is not a reassociation from the currently connected address,
 * -EINVAL when the BSSID / AP MLD address equals our own address, and
 * otherwise the result of rdev_assoc(). When the driver accepts the request,
 * an additional reference and a hold are taken on req->bss and on every
 * per-link BSS.
 *
 * Note: caller must cfg80211_put_bss() regardless of result.
 * Must be called with the wiphy lock held.
 */
int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev,
                        struct net_device *dev,
                        struct cfg80211_assoc_request *req,
                        struct netlink_ext_ack *extack)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        unsigned int idx;
        int err;

        lockdep_assert_wiphy(wdev->wiphy);

        err = cfg80211_mlme_check_mlo(dev, req, extack);
        if (err)
                return err;

        /* while connected, only a reassociation from the current AP is OK */
        if (wdev->connected &&
            !(req->prev_bssid &&
              ether_addr_equal(wdev->u.client.connected_addr,
                               req->prev_bssid)))
                return -EALREADY;

        /* refuse to associate with ourselves */
        if (req->bss && ether_addr_equal(req->bss->bssid, dev->dev_addr))
                return -EINVAL;
        if (req->link_id >= 0 &&
            ether_addr_equal(req->ap_mld_addr, dev->dev_addr))
                return -EINVAL;

        /* restrict the requested overrides to what the wiphy allows */
        cfg80211_oper_and_ht_capa(&req->ht_capa_mask,
                                  rdev->wiphy.ht_capa_mod_mask);
        cfg80211_oper_and_vht_capa(&req->vht_capa_mask,
                                   rdev->wiphy.vht_capa_mod_mask);

        err = rdev_assoc(rdev, dev, req);
        if (err)
                return err;

        if (req->bss) {
                cfg80211_ref_bss(&rdev->wiphy, req->bss);
                cfg80211_hold_bss(bss_from_pub(req->bss));
        }

        for (idx = 0; idx < ARRAY_SIZE(req->links); idx++) {
                struct cfg80211_bss *bss = req->links[idx].bss;

                if (!bss)
                        continue;

                cfg80211_ref_bss(&rdev->wiphy, bss);
                cfg80211_hold_bss(bss_from_pub(bss));
        }

        return 0;
}

/*
 * Ask the driver to deauthenticate from @bssid.
 *
 * A local-only state change for an address we are not connected to is a
 * no-op returning 0. When the request targets the current connection or the
 * recorded disconnect BSSID, the connection owner port ID is cleared before
 * the request goes to the driver. Returns the result of rdev_deauth().
 * Must be called with the wiphy lock held.
 */
int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev,
                         struct net_device *dev, const u8 *bssid,
                         const u8 *ie, int ie_len, u16 reason,
                         bool local_state_change)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_deauth_request req = {
                .bssid = bssid,
                .ie = ie,
                .ie_len = ie_len,
                .reason_code = reason,
                .local_state_change = local_state_change,
        };
        bool to_current_ap;

        lockdep_assert_wiphy(wdev->wiphy);

        to_current_ap = wdev->connected &&
                        ether_addr_equal(wdev->u.client.connected_addr, bssid);

        if (local_state_change && !to_current_ap)
                return 0;

        if (to_current_ap || ether_addr_equal(wdev->disconnect_bssid, bssid))
                wdev->conn_owner_nlportid = 0;

        return rdev_deauth(rdev, dev, &req);
}

/*
 * Ask the driver to disassociate from @ap_addr.
 *
 * Returns -ENOTCONN unless we are connected to exactly that address,
 * otherwise the result of rdev_disassoc(). After a successful call the
 * connection is expected to be gone already; a warning is raised if not.
 * Must be called with the wiphy lock held.
 */
int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev,
                           struct net_device *dev, const u8 *ap_addr,
                           const u8 *ie, int ie_len, u16 reason,
                           bool local_state_change)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_disassoc_request req = {
                .ap_addr = ap_addr,
                .ie = ie,
                .ie_len = ie_len,
                .reason_code = reason,
                .local_state_change = local_state_change,
        };
        int ret;

        lockdep_assert_wiphy(wdev->wiphy);

        if (!wdev->connected ||
            memcmp(wdev->u.client.connected_addr, ap_addr, ETH_ALEN) != 0)
                return -ENOTCONN;

        ret = rdev_disassoc(rdev, dev, &req);
        if (!ret) {
                /* driver should have reported the disassoc */
                WARN_ON(wdev->connected);
        }

        return ret;
}

/*
 * Deauthenticate from the current AP, if any, with reason
 * WLAN_REASON_DEAUTH_LEAVING. Does nothing when the driver has no deauth
 * operation or when not connected.
 * Must be called with the wiphy lock held.
 */
void cfg80211_mlme_down(struct cfg80211_registered_device *rdev,
                        struct net_device *dev)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        u8 bssid[ETH_ALEN];

        lockdep_assert_wiphy(wdev->wiphy);

        if (!rdev->ops->deauth || !wdev->connected)
                return;

        /*
         * Work on a private copy of the address -- presumably because the
         * connection state may be modified while deauth is being processed.
         */
        memcpy(bssid, wdev->u.client.connected_addr, ETH_ALEN);
        cfg80211_mlme_deauth(rdev, dev, bssid, NULL, 0,
                             WLAN_REASON_DEAUTH_LEAVING, false);
}

/*
 * One management frame registration made by a netlink socket on a
 * wireless device.
 */
struct cfg80211_mgmt_registration {
        /* entry in the owning wdev's mgmt_registrations list */
        struct list_head list;
        /* wireless device the registration was made on */
        struct wireless_dev *wdev;

        /* netlink port ID of the registering socket */
        u32 nlportid;

        /* number of bytes stored in match[] */
        int match_len;

        /* frame control type/subtype registered for, little endian */
        __le16 frame_type;

        /* multicast RX flag given at registration time */
        bool multicast_rx;

        /* match data copied from the registration request */
        u8 match[];
};

/*
 * Recompute the management frame registration masks and push them to the
 * driver, if @wdev is flagged as needing an update.
 *
 * The "global" masks cover the registrations of every wireless device on
 * the wiphy, the "interface" masks only those of @wdev. The caller must
 * hold the wiphy mutex.
 */
static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct wireless_dev *tmp;
        struct cfg80211_mgmt_registration *reg;
        struct mgmt_frame_regs upd = {};

        lockdep_assert_held(&rdev->wiphy.mtx);

        /* the flag and the registration lists are protected by this lock */
        spin_lock_bh(&rdev->mgmt_registrations_lock);
        if (!wdev->mgmt_registrations_need_update) {
                spin_unlock_bh(&rdev->mgmt_registrations_lock);
                return;
        }

        rcu_read_lock();
        list_for_each_entry_rcu(tmp, &rdev->wiphy.wdev_list, list) {
                list_for_each_entry(reg, &tmp->mgmt_registrations, list) {
                        /* one bit per frame_type >> 4 value */
                        u32 mask = BIT(le16_to_cpu(reg->frame_type) >> 4);
                        u32 mcast_mask = 0;

                        if (reg->multicast_rx)
                                mcast_mask = mask;

                        upd.global_stypes |= mask;
                        upd.global_mcast_stypes |= mcast_mask;

                        if (tmp == wdev) {
                                upd.interface_stypes |= mask;
                                upd.interface_mcast_stypes |= mcast_mask;
                        }
                }
        }
        rcu_read_unlock();

        wdev->mgmt_registrations_need_update = 0;
        spin_unlock_bh(&rdev->mgmt_registrations_lock);

        /* the driver is called after the spinlock has been dropped */
        rdev_update_mgmt_frame_registrations(rdev, wdev, &upd);
}

void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk)
{
        struct cfg80211_registered_device *rdev;
        struct wireless_dev *wdev;

        rdev = container_of(wk, struct cfg80211_registered_device,
                            mgmt_registrations_update_wk);

        wiphy_lock(&rdev->wiphy);
        list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list)
                cfg80211_mgmt_registrations_update(wdev);
        wiphy_unlock(&rdev->wiphy);
}

/*
 * Register a netlink socket for reception of a management frame type.
 *
 * @wdev: wireless device to register on
 * @snd_portid: netlink port ID of the registering socket
 * @frame_type: frame control type/subtype; must be a management frame with
 *      no bits set outside the type and subtype fields
 * @match_data: bytes stored with the registration and used to detect
 *      duplicate registrations
 * @match_len: length of @match_data
 * @multicast_rx: multicast RX flag stored with the registration
 * @extack: netlink extended ack for error messages
 *
 * If a registration with the same frame type and an overlapping match
 * already exists, only its multicast_rx flag is updated when it differs;
 * when it is the same the call fails with -EALREADY.
 *
 * Returns 0 on success, -EOPNOTSUPP if the wiphy declares no management
 * subtypes, -EINVAL for an invalid or unsupported frame type (or a station
 * auth registration without an algorithm number), -ENOMEM on allocation
 * failure and -EALREADY for a duplicate.
 */
int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
                                u16 frame_type, const u8 *match_data,
                                int match_len, bool multicast_rx,
                                struct netlink_ext_ack *extack)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct cfg80211_mgmt_registration *reg, *nreg;
        int err = 0;
        u16 mgmt_type;
        bool update_multicast = false;

        if (!wdev->wiphy->mgmt_stypes)
                return -EOPNOTSUPP;

        if ((frame_type & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT) {
                NL_SET_ERR_MSG(extack, "frame type not management");
                return -EINVAL;
        }

        if (frame_type & ~(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)) {
                NL_SET_ERR_MSG(extack, "Invalid frame type");
                return -EINVAL;
        }

        /* the subtype must be receivable on this interface type */
        mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4;
        if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].rx & BIT(mgmt_type))) {
                NL_SET_ERR_MSG(extack,
                               "Registration to specific type not supported");
                return -EINVAL;
        }

        /*
         * To support Pre Association Security Negotiation (PASN), registration
         * for authentication frames should be supported. However, as some
         * versions of the user space daemons wrongly register to all types of
         * authentication frames (which might result in unexpected behavior)
         * allow such registration if the request is for a specific
         * authentication algorithm number.
         */
        if (wdev->iftype == NL80211_IFTYPE_STATION &&
            (frame_type & IEEE80211_FCTL_STYPE) == IEEE80211_STYPE_AUTH &&
            !(match_data && match_len >= 2)) {
                NL_SET_ERR_MSG(extack,
                               "Authentication algorithm number required");
                return -EINVAL;
        }

        /* allocate the new entry up front, before the spinlock is taken */
        nreg = kzalloc(sizeof(*reg) + match_len, GFP_KERNEL);
        if (!nreg)
                return -ENOMEM;

        spin_lock_bh(&rdev->mgmt_registrations_lock);

        list_for_each_entry(reg, &wdev->mgmt_registrations, list) {
                /*
                 * Only the common prefix is compared, so two matches where
                 * one is a prefix of the other count as the same.
                 */
                int mlen = min(match_len, reg->match_len);

                if (frame_type != le16_to_cpu(reg->frame_type))
                        continue;

                if (memcmp(reg->match, match_data, mlen) == 0) {
                        if (reg->multicast_rx != multicast_rx) {
                                /* same match, just flip the flag in place */
                                update_multicast = true;
                                reg->multicast_rx = multicast_rx;
                                break;
                        }
                        NL_SET_ERR_MSG(extack, "Match already configured");
                        err = -EALREADY;
                        break;
                }
        }

        if (err)
                goto out;

        if (update_multicast) {
                /* an existing entry was updated, the new one is not needed */
                kfree(nreg);
        } else {
                memcpy(nreg->match, match_data, match_len);
                nreg->match_len = match_len;
                nreg->nlportid = snd_portid;
                nreg->frame_type = cpu_to_le16(frame_type);
                nreg->wdev = wdev;
                nreg->multicast_rx = multicast_rx;
                list_add(&nreg->list, &wdev->mgmt_registrations);
        }
        wdev->mgmt_registrations_need_update = 1;
        spin_unlock_bh(&rdev->mgmt_registrations_lock);

        /* push the changed registration set to the driver */
        cfg80211_mgmt_registrations_update(wdev);

        return 0;

 out:
        kfree(nreg);
        spin_unlock_bh(&rdev->mgmt_registrations_lock);

        return err;
}

/*
 * Drop all per-socket state that netlink port @nlportid holds on @wdev.
 *
 * Every management frame registration of that port is removed and the
 * deferred registration update is scheduled for each removal. If the port
 * owns the critical protocol session it is stopped, and the port is cleared
 * as the receiver of unexpected AP frames if it was registered as such.
 */
void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct cfg80211_mgmt_registration *entry, *next;

        spin_lock_bh(&rdev->mgmt_registrations_lock);

        list_for_each_entry_safe(entry, next, &wdev->mgmt_registrations, list) {
                if (entry->nlportid == nlportid) {
                        list_del(&entry->list);
                        kfree(entry);

                        wdev->mgmt_registrations_need_update = 1;
                        schedule_work(&rdev->mgmt_registrations_update_wk);
                }
        }

        spin_unlock_bh(&rdev->mgmt_registrations_lock);

        if (nlportid && rdev->crit_proto_nlportid == nlportid) {
                rdev->crit_proto_nlportid = 0;
                rdev_crit_proto_stop(rdev, wdev);
        }

        if (wdev->ap_unexpected_nlportid == nlportid)
                wdev->ap_unexpected_nlportid = 0;
}

/*
 * Remove and free every management frame registration of @wdev, then push
 * the resulting (empty) registration set to the driver right away.
 */
void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct cfg80211_mgmt_registration *entry, *next;

        spin_lock_bh(&rdev->mgmt_registrations_lock);

        list_for_each_entry_safe(entry, next, &wdev->mgmt_registrations, list) {
                list_del(&entry->list);
                kfree(entry);
        }

        wdev->mgmt_registrations_need_update = 1;

        spin_unlock_bh(&rdev->mgmt_registrations_lock);

        cfg80211_mgmt_registrations_update(wdev);
}

/*
 * Return true if @addr is one of the addresses of @wdev: the address of any
 * valid link, or the address returned by wdev_address().
 */
static bool cfg80211_allowed_address(struct wireless_dev *wdev, const u8 *addr)
{
        int link_id;

        for_each_valid_link(wdev, link_id) {
                if (ether_addr_equal(addr, wdev->links[link_id].addr))
                        return true;
        }

        /* no link matched, fall back to the wdev's own address */
        return ether_addr_equal(addr, wdev_address(wdev));
}

static bool cfg80211_allowed_random_address(struct wireless_dev *wdev,
                                            const struct ieee80211_mgmt *mgmt)
{
        if (ieee80211_is_auth(mgmt->frame_control) ||
            ieee80211_is_deauth(mgmt->frame_control)) {
                /* Allow random TA to be used with authentication and
                 * deauthentication frames if the driver has indicated support.
                 */
                if (wiphy_ext_feature_isset(
                            wdev->wiphy,
                            NL80211_EXT_FEATURE_AUTH_AND_DEAUTH_RANDOM_TA))
                        return true;
        } else if (ieee80211_is_action(mgmt->frame_control) &&
                   mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) {
                /* Allow random TA to be used with Public Action frames if the
                 * driver has indicated support.
                 */
                if (!wdev->connected &&
                    wiphy_ext_feature_isset(
                            wdev->wiphy,
                            NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA))
                        return true;

                if (wdev->connected &&
                    wiphy_ext_feature_isset(
                            wdev->wiphy,
                            NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED))
                        return true;
        }

        return false;
}

int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
                          struct wireless_dev *wdev,
                          struct cfg80211_mgmt_tx_params *params, u64 *cookie)
{
        const struct ieee80211_mgmt *mgmt;
        u16 stype;

        lockdep_assert_wiphy(&rdev->wiphy);

        if (!wdev->wiphy->mgmt_stypes)
                return -EOPNOTSUPP;

        if (!rdev->ops->mgmt_tx)
                return -EOPNOTSUPP;

        if (params->len < 24 + 1)
                return -EINVAL;

        mgmt = (const struct ieee80211_mgmt *)params->buf;

        if (!ieee80211_is_mgmt(mgmt->frame_control))
                return -EINVAL;

        stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE;
        if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].tx & BIT(stype >> 4)))
                return -EINVAL;

        if (ieee80211_is_action(mgmt->frame_control) &&
            mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) {
                int err = 0;

                switch (wdev->iftype) {
                case NL80211_IFTYPE_ADHOC:
                        /*
                         * check for IBSS DA must be done by driver as
                         * cfg80211 doesn't track the stations
                         */
                        if (!wdev->u.ibss.current_bss ||
                            !ether_addr_equal(wdev->u.ibss.current_bss->pub.bssid,
                                              mgmt->bssid)) {
                                err = -ENOTCONN;
                                break;
                        }
                        break;
                case NL80211_IFTYPE_STATION:
                case NL80211_IFTYPE_P2P_CLIENT:
                        if (!wdev->connected) {
                                err = -ENOTCONN;
                                break;
                        }

                        /* FIXME: MLD may address this differently */

                        if (!ether_addr_equal(wdev->u.client.connected_addr,
                                              mgmt->bssid)) {
                                err = -ENOTCONN;
                                break;
                        }

                        /* for station, check that DA is the AP */
                        if (!ether_addr_equal(wdev->u.client.connected_addr,
                                              mgmt->da)) {
                                err = -ENOTCONN;
                                break;
                        }
                        break;
                case NL80211_IFTYPE_AP:
                case NL80211_IFTYPE_P2P_GO:
                case NL80211_IFTYPE_AP_VLAN:
                        if (!ether_addr_equal(mgmt->bssid, wdev_address(wdev)) &&
                            (params->link_id < 0 ||
                             !ether_addr_equal(mgmt->bssid,
                                               wdev->links[params->link_id].addr)))
                                err = -EINVAL;
                        break;
                case NL80211_IFTYPE_MESH_POINT:
                        if (!ether_addr_equal(mgmt->sa, mgmt->bssid)) {
                                err = -EINVAL;
                                break;
                        }
                        /*
                         * check for mesh DA must be done by driver as
                         * cfg80211 doesn't track the stations
                         */
                        break;
                case NL80211_IFTYPE_P2P_DEVICE:
                        /*
                         * fall through, P2P device only supports
                         * public action frames
                         */
                case NL80211_IFTYPE_NAN:
                default:
                        err = -EOPNOTSUPP;
                        break;
                }

                if (err)
                        return err;
        }

        if (!cfg80211_allowed_address(wdev, mgmt->sa) &&
            !cfg80211_allowed_random_address(wdev, mgmt))
                return -EINVAL;

        /* Transmit the management frame as requested by user space */
        return rdev_mgmt_tx(rdev, wdev, params, cookie);
}

/*
 * Offer a received management frame to the user space registrations on
 * @wdev.
 *
 * A registration matches when its frame type equals the frame's
 * type/subtype bits and its match bytes are a prefix of the data that
 * follows the 802.11 header.  The frame goes to the first matching
 * registration for which nl80211_send_mgmt() succeeds.
 *
 * Returns true if the frame was delivered, false otherwise (including
 * when the subtype is not enabled for RX on this interface type).
 */
bool cfg80211_rx_mgmt_ext(struct wireless_dev *wdev,
                          struct cfg80211_rx_info *info)
{
        struct wiphy *wiphy = wdev->wiphy;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct ieee80211_mgmt *mgmt = (void *)info->buf;
        __le16 fc = mgmt->frame_control;
        __le16 ftype = fc &
                cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE);
        struct cfg80211_mgmt_registration *reg;
        unsigned int hdrlen;
        const u8 *payload;
        int payload_len;
        bool delivered = false;
        u16 stype;

        trace_cfg80211_rx_mgmt(wdev, info);

        stype = (le16_to_cpu(fc) & IEEE80211_FCTL_STYPE) >> 4;
        if (!(wiphy->mgmt_stypes[wdev->iftype].rx & BIT(stype))) {
                trace_cfg80211_return_bool(false);
                return false;
        }

        /* registrations match on what follows the 802.11 header */
        hdrlen = ieee80211_hdrlen(fc);
        payload = info->buf + hdrlen;
        payload_len = info->len - hdrlen;

        spin_lock_bh(&rdev->mgmt_registrations_lock);

        list_for_each_entry(reg, &wdev->mgmt_registrations, list) {
                if (reg->frame_type != ftype ||
                    reg->match_len > payload_len ||
                    memcmp(reg->match, payload, reg->match_len))
                        continue;

                /*
                 * Found a match: indicate the frame to user space.  If
                 * sending fails, keep looking at later registrations.
                 */
                if (!nl80211_send_mgmt(rdev, wdev, reg->nlportid, info,
                                       GFP_ATOMIC)) {
                        delivered = true;
                        break;
                }
        }

        spin_unlock_bh(&rdev->mgmt_registrations_lock);

        trace_cfg80211_return_bool(delivered);
        return delivered;
}
EXPORT_SYMBOL(cfg80211_rx_mgmt_ext);

/*
 * (Re)arm the DFS channel update work so that it runs without delay:
 * cancel a queued instance first, then queue it with a zero timeout.
 */
void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev)
{
        struct delayed_work *dfs_work = &rdev->dfs_update_channels_wk;

        cancel_delayed_work(dfs_work);
        queue_delayed_work(cfg80211_wq, dfs_work, 0);
}

void cfg80211_dfs_channels_update_work(struct work_struct *work)
{
        struct delayed_work *delayed_work = to_delayed_work(work);
        struct cfg80211_registered_device *rdev;
        struct cfg80211_chan_def chandef;
        struct ieee80211_supported_band *sband;
        struct ieee80211_channel *c;
        struct wiphy *wiphy;
        bool check_again = false;
        unsigned long timeout, next_time = 0;
        unsigned long time_dfs_update;
        enum nl80211_radar_event radar_event;
        int bandid, i;

        rdev = container_of(delayed_work, struct cfg80211_registered_device,
                            dfs_update_channels_wk);
        wiphy = &rdev->wiphy;

        rtnl_lock();
        for (bandid = 0; bandid < NUM_NL80211_BANDS; bandid++) {
                sband = wiphy->bands[bandid];
                if (!sband)
                        continue;

                for (i = 0; i < sband->n_channels; i++) {
                        c = &sband->channels[i];

                        if (!(c->flags & IEEE80211_CHAN_RADAR))
                                continue;

                        if (c->dfs_state != NL80211_DFS_UNAVAILABLE &&
                            c->dfs_state != NL80211_DFS_AVAILABLE)
                                continue;

                        if (c->dfs_state == NL80211_DFS_UNAVAILABLE) {
                                time_dfs_update = IEEE80211_DFS_MIN_NOP_TIME_MS;
                                radar_event = NL80211_RADAR_NOP_FINISHED;
                        } else {
                                if (regulatory_pre_cac_allowed(wiphy) ||
                                    cfg80211_any_wiphy_oper_chan(wiphy, c))
                                        continue;

                                time_dfs_update = REG_PRE_CAC_EXPIRY_GRACE_MS;
                                radar_event = NL80211_RADAR_PRE_CAC_EXPIRED;
                        }

                        timeout = c->dfs_state_entered +
                                  msecs_to_jiffies(time_dfs_update);

                        if (time_after_eq(jiffies, timeout)) {
                                c->dfs_state = NL80211_DFS_USABLE;
                                c->dfs_state_entered = jiffies;

                                cfg80211_chandef_create(&chandef, c,
                                                        NL80211_CHAN_NO_HT);

                                nl80211_radar_notify(rdev, &chandef,
                                                     radar_event, NULL,
                                                     GFP_ATOMIC);

                                regulatory_propagate_dfs_state(wiphy, &chandef,
                                                               c->dfs_state,
                                                               radar_event);
                                continue;
                        }

                        if (!check_again)
                                next_time = timeout - jiffies;
                        else
                                next_time = min(next_time, timeout - jiffies);
                        check_again = true;
                }
        }
        rtnl_unlock();

        /* reschedule if there are other channels waiting to be cleared again */
        if (check_again)
                queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk,
                                   next_time);
}


/*
 * Handle a radar detection on @chandef: mark it unavailable, schedule the
 * follow-up work items and notify user space.  When @offchan is set the
 * background CAC abort work is queued as well.
 */
void __cfg80211_radar_event(struct wiphy *wiphy,
                            struct cfg80211_chan_def *chandef,
                            bool offchan, gfp_t gfp)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);

        trace_cfg80211_radar_event(wiphy, chandef, offchan);

        /*
         * Radar may have been seen on just one of the channels the
         * chandef spans, so only the supplied chandef is set to
         * unavailable.
         */
        cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_UNAVAILABLE);

        if (offchan)
                queue_work(cfg80211_wq, &rdev->background_cac_abort_wk);

        cfg80211_sched_dfs_chan_update(rdev);

        nl80211_radar_notify(rdev, chandef, NL80211_RADAR_DETECTED, NULL, gfp);

        /* keep a copy of the chandef for the propagation work item */
        rdev->radar_chandef = *chandef;
        queue_work(cfg80211_wq, &rdev->propagate_radar_detect_wk);
}
EXPORT_SYMBOL(__cfg80211_radar_event);

/*
 * Process a CAC state change reported for @netdev and forward it to user
 * space.  STARTED sets wdev->cac_started; FINISHED and ABORTED clear it.
 * FINISHED additionally marks @chandef as available and queues the
 * CAC-done propagation and DFS channel update work.  Any other event
 * triggers a warning and is dropped.
 */
void cfg80211_cac_event(struct net_device *netdev,
                        const struct cfg80211_chan_def *chandef,
                        enum nl80211_radar_event event, gfp_t gfp)
{
        struct wireless_dev *wdev = netdev->ieee80211_ptr;
        struct wiphy *wiphy = wdev->wiphy;
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        unsigned long timeout;

        /* not yet supported */
        if (wdev->valid_links)
                return;

        trace_cfg80211_cac_event(netdev, event);

        /* without a running CAC only a STARTED event makes sense */
        if (WARN_ON(!wdev->cac_started && event != NL80211_RADAR_CAC_STARTED))
                return;

        switch (event) {
        case NL80211_RADAR_CAC_STARTED:
                wdev->cac_started = true;
                break;
        case NL80211_RADAR_CAC_ABORTED:
                wdev->cac_started = false;
                break;
        case NL80211_RADAR_CAC_FINISHED:
                /* warn if FINISHED arrives before the CAC time elapsed */
                timeout = wdev->cac_start_time +
                          msecs_to_jiffies(wdev->cac_time_ms);
                WARN_ON(!time_after_eq(jiffies, timeout));

                cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_AVAILABLE);
                memcpy(&rdev->cac_done_chandef, chandef,
                       sizeof(struct cfg80211_chan_def));
                queue_work(cfg80211_wq, &rdev->propagate_cac_done_wk);
                cfg80211_sched_dfs_chan_update(rdev);

                wdev->cac_started = false;
                break;
        default:
                WARN_ON(1);
                return;
        }

        nl80211_radar_notify(rdev, chandef, event, netdev, gfp);
}
EXPORT_SYMBOL(cfg80211_cac_event);

/*
 * Report a background (offchannel) CAC event to user space; the wiphy
 * lock must be held.
 *
 * Nothing happens for an invalid @chandef, when no wdev currently owns
 * the background radar chain, for unknown events, or for an ABORTED
 * event when the CAC-done work could not be cancelled.  For FINISHED and
 * ABORTED the notification is attributed to the owning wdev rather than
 * to @wdev.
 */
static void
__cfg80211_background_cac_event(struct cfg80211_registered_device *rdev,
                                struct wireless_dev *wdev,
                                const struct cfg80211_chan_def *chandef,
                                enum nl80211_radar_event event)
{
        struct net_device *ndev = NULL;

        lockdep_assert_wiphy(&rdev->wiphy);

        if (!cfg80211_chandef_valid(chandef) || !rdev->background_radar_wdev)
                return;

        switch (event) {
        case NL80211_RADAR_CAC_STARTED:
                /* reported against the wdev passed in by the caller */
                break;
        case NL80211_RADAR_CAC_ABORTED:
                if (!cancel_delayed_work(&rdev->background_cac_done_wk))
                        return;
                wdev = rdev->background_radar_wdev;
                break;
        case NL80211_RADAR_CAC_FINISHED:
                cfg80211_set_dfs_state(&rdev->wiphy, chandef,
                                       NL80211_DFS_AVAILABLE);
                memcpy(&rdev->cac_done_chandef, chandef, sizeof(*chandef));
                queue_work(cfg80211_wq, &rdev->propagate_cac_done_wk);
                cfg80211_sched_dfs_chan_update(rdev);
                wdev = rdev->background_radar_wdev;
                break;
        default:
                return;
        }

        if (wdev)
                ndev = wdev->netdev;

        nl80211_radar_notify(rdev, chandef, event, ndev, GFP_KERNEL);
}

/*
 * Locked wrapper around __cfg80211_background_cac_event(): takes the
 * wiphy lock and reports @event for the wdev that currently owns the
 * background radar chain.
 */
static void
cfg80211_background_cac_event(struct cfg80211_registered_device *rdev,
                              const struct cfg80211_chan_def *chandef,
                              enum nl80211_radar_event event)
{
        struct wireless_dev *owner;

        wiphy_lock(&rdev->wiphy);

        /* read the owner only once the lock is held */
        owner = rdev->background_radar_wdev;
        __cfg80211_background_cac_event(rdev, owner, chandef, event);

        wiphy_unlock(&rdev->wiphy);
}

/*
 * Delayed work handler: report NL80211_RADAR_CAC_FINISHED for the stored
 * background radar chandef.
 */
void cfg80211_background_cac_done_wk(struct work_struct *work)
{
        struct cfg80211_registered_device *rdev =
                container_of(to_delayed_work(work),
                             struct cfg80211_registered_device,
                             background_cac_done_wk);

        cfg80211_background_cac_event(rdev, &rdev->background_radar_chandef,
                                      NL80211_RADAR_CAC_FINISHED);
}

/*
 * Work handler: report NL80211_RADAR_CAC_ABORTED for the stored
 * background radar chandef.
 */
void cfg80211_background_cac_abort_wk(struct work_struct *work)
{
        struct cfg80211_registered_device *rdev =
                container_of(work, struct cfg80211_registered_device,
                             background_cac_abort_wk);

        cfg80211_background_cac_event(rdev, &rdev->background_radar_chandef,
                                      NL80211_RADAR_CAC_ABORTED);
}

void cfg80211_background_cac_abort(struct wiphy *wiphy)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);

        queue_work(cfg80211_wq, &rdev->background_cac_abort_wk);
}
EXPORT_SYMBOL(cfg80211_background_cac_abort);

/*
 * Start background radar detection (offchannel CAC) on @chandef on behalf
 * of @wdev; the wiphy lock must be held.
 *
 * Returns -EOPNOTSUPP without NL80211_EXT_FEATURE_RADAR_BACKGROUND,
 * -EBUSY if another wdev owns the offchannel chain or this wdev still
 * has a CAC pending, the error from rdev_set_radar_background() if that
 * fails, and 0 on success.  On success @wdev becomes the owner, a
 * CAC_STARTED event is reported and the CAC-done work is queued after
 * the CAC time (IEEE80211_DFS_MIN_CAC_TIME_MS if the chandef yields 0).
 */
int
cfg80211_start_background_radar_detection(struct cfg80211_registered_device *rdev,
                                          struct wireless_dev *wdev,
                                          struct cfg80211_chan_def *chandef)
{
        struct wireless_dev *owner;
        unsigned int cac_time_ms;
        int err;

        lockdep_assert_wiphy(&rdev->wiphy);

        if (!wiphy_ext_feature_isset(&rdev->wiphy,
                                     NL80211_EXT_FEATURE_RADAR_BACKGROUND))
                return -EOPNOTSUPP;

        owner = rdev->background_radar_wdev;

        /* Offchannel chain already locked by another wdev */
        if (owner && owner != wdev)
                return -EBUSY;

        /* CAC already in progress on the offchannel chain */
        if (owner == wdev &&
            delayed_work_pending(&rdev->background_cac_done_wk))
                return -EBUSY;

        err = rdev_set_radar_background(rdev, chandef);
        if (err)
                return err;

        cac_time_ms = cfg80211_chandef_dfs_cac_time(&rdev->wiphy, chandef);
        if (!cac_time_ms)
                cac_time_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;

        /* Get offchain ownership */
        rdev->background_radar_chandef = *chandef;
        rdev->background_radar_wdev = wdev;

        __cfg80211_background_cac_event(rdev, wdev, chandef,
                                        NL80211_RADAR_CAC_STARTED);
        queue_delayed_work(cfg80211_wq, &rdev->background_cac_done_wk,
                           msecs_to_jiffies(cac_time_ms));

        return 0;
}

/*
 * Stop background radar detection if @wdev is the current owner of the
 * offchannel chain; the wiphy lock must be held.  Does nothing for any
 * other wdev.
 */
void cfg80211_stop_background_radar_detection(struct wireless_dev *wdev)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);

        lockdep_assert_wiphy(wdev->wiphy);

        if (rdev->background_radar_wdev != wdev)
                return;

        rdev_set_radar_background(rdev, NULL);

        /* Release offchain ownership */
        rdev->background_radar_wdev = NULL;

        /*
         * NOTE(review): __cfg80211_background_cac_event() returns early
         * when rdev->background_radar_wdev is NULL, and it has just been
         * cleared above - confirm whether the ABORTED notification (and
         * the cancel of the CAC-done work) is meant to be skipped here.
         */
        __cfg80211_background_cac_event(rdev, wdev,
                                        &rdev->background_radar_chandef,
                                        NL80211_RADAR_CAC_ABORTED);
}
















































































































































































































































































    1 








    1 






    1 




    1 












































    1 







    1 


















































































































































































    1 



















































































































































































































    1 














    1 






































































































































    1 



















































    1 








    1 



























    1 











    1 











    1 

    1 

    1 







    1 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2002-2005, Instant802 Networks, Inc.
 * Copyright 2005-2006, Devicescape Software, Inc.
 * Copyright 2006-2007        Jiri Benc <jbenc@suse.cz>
 * Copyright 2007        Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright (C) 2015-2017        Intel Deutschland GmbH
 * Copyright (C) 2018-2024 Intel Corporation
 *
 * element parsing for mac80211
 */

#include <net/mac80211.h>
#include <linux/netdevice.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/skbuff.h>
#include <linux/etherdevice.h>
#include <linux/if_arp.h>
#include <linux/bitmap.h>
#include <linux/crc32.h>
#include <net/net_namespace.h>
#include <net/cfg80211.h>
#include <net/rtnetlink.h>
#include <kunit/visibility.h>

#include "ieee80211_i.h"
#include "driver-ops.h"
#include "rate.h"
#include "mesh.h"
#include "wme.h"
#include "led.h"
#include "wep.h"

/*
 * Parser-side wrapper around struct ieee802_11_elems: the parsed element
 * pointers, the Multi-Link elements seen in the input, and a trailing
 * scratch area sized by @scratch_len.
 */
struct ieee80211_elems_parse {
        /*
         * must be first for kfree to work
         * (presumably a pointer to @elems is what gets freed, so it has
         * to coincide with the start of the allocation - TODO confirm
         * against the allocation/free sites)
         */
        struct ieee802_11_elems elems;

        /* The basic Multi-Link element in the original elements */
        const struct element *ml_basic_elem;

        /* The reconfiguration Multi-Link element in the original elements */
        const struct element *ml_reconf_elem;

        /*
         * scratch buffer that can be used for various element parsing related
         * tasks, e.g., element de-fragmentation etc.
         *
         * @scratch_len is the size of @scratch in bytes (it is the
         * __counted_by bound); @scratch_pos is presumably the next unused
         * position inside @scratch - TODO confirm against its users.
         */
        size_t scratch_len;
        u8 *scratch_pos;
        u8 scratch[] __counted_by(scratch_len);
};

/*
 * Parse a single extension element.
 *
 * The first data byte of @elem selects the extension type; the remaining
 * bytes are the payload.  For each known type the payload is length
 * checked and, if acceptable, a pointer to it is stored in the matching
 * field of elems_parse->elems (Multi-Link elements are recorded in
 * @elems_parse itself instead).  HE and EHT related types are ignored
 * entirely when params->mode is below the corresponding connection mode.
 *
 * @crc: optional running CRC; when non-NULL it is updated with the whole
 *       element for the types that set calc_crc below
 * @elem: the extension element to parse
 * @elems_parse: parse state receiving the results
 * @params: parse parameters (mode and from_ap are used here)
 */
static void
ieee80211_parse_extension_element(u32 *crc,
                                  const struct element *elem,
                                  struct ieee80211_elems_parse *elems_parse,
                                  struct ieee80211_elems_parse_params *params)
{
        struct ieee802_11_elems *elems = &elems_parse->elems;
        /* payload starts right after the extension ID byte */
        const void *data = elem->data + 1;
        bool calc_crc = false;
        u8 len;

        /* no extension ID byte present, nothing to dispatch on */
        if (!elem->datalen)
                return;

        /* payload length, not counting the extension ID byte */
        len = elem->datalen - 1;

        switch (elem->data[0]) {
        case WLAN_EID_EXT_HE_MU_EDCA:
                if (params->mode < IEEE80211_CONN_MODE_HE)
                        break;
                /* included in the CRC even if too short to be stored */
                calc_crc = true;
                if (len >= sizeof(*elems->mu_edca_param_set))
                        elems->mu_edca_param_set = data;
                break;
        case WLAN_EID_EXT_HE_CAPABILITY:
                if (params->mode < IEEE80211_CONN_MODE_HE)
                        break;
                if (ieee80211_he_capa_size_ok(data, len)) {
                        elems->he_cap = data;
                        elems->he_cap_len = len;
                }
                break;
        case WLAN_EID_EXT_HE_OPERATION:
                if (params->mode < IEEE80211_CONN_MODE_HE)
                        break;
                calc_crc = true;
                /*
                 * the fixed part must be present before
                 * ieee80211_he_oper_size() is applied to the payload
                 */
                if (len >= sizeof(*elems->he_operation) &&
                    len >= ieee80211_he_oper_size(data) - 1)
                        elems->he_operation = data;
                break;
        case WLAN_EID_EXT_UORA:
                if (params->mode < IEEE80211_CONN_MODE_HE)
                        break;
                if (len >= 1)
                        elems->uora_element = data;
                break;
        case WLAN_EID_EXT_MAX_CHANNEL_SWITCH_TIME:
                /* accepted only with a payload of exactly 3 bytes */
                if (len == 3)
                        elems->max_channel_switch_time = data;
                break;
        case WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION:
                if (len >= sizeof(*elems->mbssid_config_ie))
                        elems->mbssid_config_ie = data;
                break;
        case WLAN_EID_EXT_HE_SPR:
                if (params->mode < IEEE80211_CONN_MODE_HE)
                        break;
                if (len >= sizeof(*elems->he_spr) &&
                    len >= ieee80211_he_spr_size(data) - 1)
                        elems->he_spr = data;
                break;
        case WLAN_EID_EXT_HE_6GHZ_CAPA:
                if (params->mode < IEEE80211_CONN_MODE_HE)
                        break;
                if (len >= sizeof(*elems->he_6ghz_capa))
                        elems->he_6ghz_capa = data;
                break;
        case WLAN_EID_EXT_EHT_CAPABILITY:
                if (params->mode < IEEE80211_CONN_MODE_EHT)
                        break;
                /*
                 * the size check is given elems->he_cap as stored so far,
                 * i.e. whatever an earlier HE capability element set
                 */
                if (ieee80211_eht_capa_size_ok(elems->he_cap,
                                               data, len,
                                               params->from_ap)) {
                        elems->eht_cap = data;
                        elems->eht_cap_len = len;
                }
                break;
        case WLAN_EID_EXT_EHT_OPERATION:
                if (params->mode < IEEE80211_CONN_MODE_EHT)
                        break;
                if (ieee80211_eht_oper_size_ok(data, len))
                        elems->eht_operation = data;
                calc_crc = true;
                break;
        case WLAN_EID_EXT_EHT_MULTI_LINK:
                if (params->mode < IEEE80211_CONN_MODE_EHT)
                        break;
                calc_crc = true;

                if (ieee80211_mle_size_ok(data, len)) {
                        const struct ieee80211_multi_link_elem *mle =
                                (void *)data;

                        switch (le16_get_bits(mle->control,
                                              IEEE80211_ML_CONTROL_TYPE)) {
                        case IEEE80211_ML_CONTROL_TYPE_BASIC:
                                /*
                                 * keep the first basic ML element; a
                                 * second one is only flagged as an error
                                 */
                                if (elems_parse->ml_basic_elem) {
                                        elems->parse_error |=
                                                IEEE80211_PARSE_ERR_DUP_NEST_ML_BASIC;
                                        break;
                                }
                                elems_parse->ml_basic_elem = elem;
                                break;
                        case IEEE80211_ML_CONTROL_TYPE_RECONF:
                                /* stored unconditionally: the last one wins */
                                elems_parse->ml_reconf_elem = elem;
                                break;
                        default:
                                break;
                        }
                }
                break;
        case WLAN_EID_EXT_BANDWIDTH_INDICATION:
                if (params->mode < IEEE80211_CONN_MODE_EHT)
                        break;
                if (ieee80211_bandwidth_indication_size_ok(data, len))
                        elems->bandwidth_indication = data;
                calc_crc = true;
                break;
        case WLAN_EID_EXT_TID_TO_LINK_MAPPING:
                if (params->mode < IEEE80211_CONN_MODE_EHT)
                        break;
                calc_crc = true;
                /*
                 * collected in order of appearance; elements beyond the
                 * capacity of elems->ttlm are not stored
                 */
                if (ieee80211_tid_to_link_map_size_ok(data, len) &&
                    elems->ttlm_num < ARRAY_SIZE(elems->ttlm)) {
                        elems->ttlm[elems->ttlm_num] = (void *)data;
                        elems->ttlm_num++;
                }
                break;
        }

        /*
         * the CRC covers datalen + 2 bytes starting at the element itself,
         * i.e. the data plus the two bytes in front of it
         */
        if (crc && calc_crc)
                *crc = crc32_be(*crc, (void *)elem, elem->datalen + 2);
}

/*
 * Parse one Transmit Power Envelope element into @tpe.
 *
 * @tpe:  parsed output; the slot that is filled is selected by the
 *        interpretation and category bits of the element's info byte
 * @data: element payload, starting at the info byte
 * @len:  payload length in bytes
 *
 * Elements rejected by ieee80211_valid_tpe_element() are ignored and leave
 * @tpe untouched. For accepted elements the matching slot is marked valid
 * and its power values and count (plus N for the PSD variants) are filled in.
 */
static void ieee80211_parse_tpe(struct ieee80211_parsed_tpe *tpe,
                                const u8 *data, u8 len)
{
        const struct ieee80211_tx_pwr_env *env = (const void *)data;
        u8 count, interpret, category;
        u8 *out, N, *cnt_out = NULL, *N_out = NULL;

        if (!ieee80211_valid_tpe_element(data, len))
                return;

        count = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_COUNT);
        interpret = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_INTERPRET);
        category = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_CATEGORY);

        /* First pick the destination slot for this interpretation/category. */
        switch (interpret) {
        case IEEE80211_TPE_LOCAL_EIRP:
                out = tpe->max_local[category].power;
                cnt_out = &tpe->max_local[category].count;
                tpe->max_local[category].valid = true;
                break;
        case IEEE80211_TPE_REG_CLIENT_EIRP:
                out = tpe->max_reg_client[category].power;
                cnt_out = &tpe->max_reg_client[category].count;
                tpe->max_reg_client[category].valid = true;
                break;
        case IEEE80211_TPE_LOCAL_EIRP_PSD:
                out = tpe->psd_local[category].power;
                cnt_out = &tpe->psd_local[category].count;
                N_out = &tpe->psd_local[category].n;
                tpe->psd_local[category].valid = true;
                break;
        case IEEE80211_TPE_REG_CLIENT_EIRP_PSD:
                out = tpe->psd_reg_client[category].power;
                cnt_out = &tpe->psd_reg_client[category].count;
                N_out = &tpe->psd_reg_client[category].n;
                tpe->psd_reg_client[category].valid = true;
                break;
        }

        /* Then copy the power values according to the encoding. */
        switch (interpret) {
        case IEEE80211_TPE_LOCAL_EIRP:
        case IEEE80211_TPE_REG_CLIENT_EIRP:
                /* count was validated <= 3, plus 320 MHz */
                BUILD_BUG_ON(IEEE80211_TPE_EIRP_ENTRIES_320MHZ < 5);
                memcpy(out, env->variable, count + 1);
                *cnt_out = count + 1;
                /*
                 * Separately take 320 MHz if present. The length check only
                 * guarantees that the single byte directly following the
                 * count + 1 regular entries exists, so that is the byte to
                 * read; indexing any further would pick up the wrong value
                 * and could run past the end of the element.
                 */
                if (count == 3 && len > sizeof(*env) + count + 1) {
                        out[4] = env->variable[count + 1];
                        *cnt_out = 5;
                }
                break;
        case IEEE80211_TPE_LOCAL_EIRP_PSD:
        case IEEE80211_TPE_REG_CLIENT_EIRP_PSD:
                if (!count) {
                        /* a zero count replicates the one value everywhere */
                        memset(out, env->variable[0],
                               IEEE80211_TPE_PSD_ENTRIES_320MHZ);
                        *cnt_out = IEEE80211_TPE_PSD_ENTRIES_320MHZ;
                        break;
                }

                N = 1 << (count - 1);
                memcpy(out, env->variable, N);
                *cnt_out = N;
                *N_out = N;

                /* optional extension: a count byte followed by K values */
                if (len > sizeof(*env) + N) {
                        int K = u8_get_bits(env->variable[N],
                                            IEEE80211_TX_PWR_ENV_EXT_COUNT);

                        /* never write past the end of the output array */
                        K = min(K, IEEE80211_TPE_PSD_ENTRIES_320MHZ - N);
                        memcpy(out + N, env->variable + N + 1, K);
                        (*cnt_out) += K;
                }
                break;
        }
}

/*
 * Walk the elements in [params->start, params->start + params->len) and
 * record pointers/lengths for the ones of interest in elems_parse->elems.
 *
 * @params:        parse parameters (buffer, connection mode, CRC filter, ...)
 * @elems_parse:   parse state; results are stored in its embedded elems
 * @check_inherit: if non-NULL, elements for which
 *                 cfg80211_is_element_inherited() returns false are skipped
 *
 * Size or context problems are accumulated in elems->parse_error rather than
 * aborting the walk. CRC accumulation is only done when params->filter is
 * non-zero.
 *
 * Returns the resulting CRC value (params->crc updated with the selected
 * elements).
 */
static u32
_ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params,
                             struct ieee80211_elems_parse *elems_parse,
                             const struct element *check_inherit)
{
        struct ieee802_11_elems *elems = &elems_parse->elems;
        const struct element *elem;
        bool calc_crc = params->filter != 0;
        DECLARE_BITMAP(seen_elems, 256);
        u32 crc = params->crc;

        bitmap_zero(seen_elems, 256);

        for_each_element(elem, params->start, params->len) {
                const struct element *subelem;
                u8 elem_parse_failed;
                u8 id = elem->id;
                u8 elen = elem->datalen;
                const u8 *pos = elem->data;

                if (check_inherit &&
                    !cfg80211_is_element_inherited(elem,
                                                   check_inherit))
                        continue;

                /*
                 * For the IDs listed here only the first successfully parsed
                 * instance is used; later duplicates are flagged and skipped.
                 */
                switch (id) {
                case WLAN_EID_SSID:
                case WLAN_EID_SUPP_RATES:
                case WLAN_EID_FH_PARAMS:
                case WLAN_EID_DS_PARAMS:
                case WLAN_EID_CF_PARAMS:
                case WLAN_EID_TIM:
                case WLAN_EID_IBSS_PARAMS:
                case WLAN_EID_CHALLENGE:
                case WLAN_EID_RSN:
                case WLAN_EID_ERP_INFO:
                case WLAN_EID_EXT_SUPP_RATES:
                case WLAN_EID_HT_CAPABILITY:
                case WLAN_EID_HT_OPERATION:
                case WLAN_EID_VHT_CAPABILITY:
                case WLAN_EID_VHT_OPERATION:
                case WLAN_EID_MESH_ID:
                case WLAN_EID_MESH_CONFIG:
                case WLAN_EID_PEER_MGMT:
                case WLAN_EID_PREQ:
                case WLAN_EID_PREP:
                case WLAN_EID_PERR:
                case WLAN_EID_RANN:
                case WLAN_EID_CHANNEL_SWITCH:
                case WLAN_EID_EXT_CHANSWITCH_ANN:
                case WLAN_EID_COUNTRY:
                case WLAN_EID_PWR_CONSTRAINT:
                case WLAN_EID_TIMEOUT_INTERVAL:
                case WLAN_EID_SECONDARY_CHANNEL_OFFSET:
                case WLAN_EID_WIDE_BW_CHANNEL_SWITCH:
                case WLAN_EID_CHAN_SWITCH_PARAM:
                case WLAN_EID_EXT_CAPABILITY:
                case WLAN_EID_CHAN_SWITCH_TIMING:
                case WLAN_EID_LINK_ID:
                case WLAN_EID_BSS_MAX_IDLE_PERIOD:
                case WLAN_EID_RSNX:
                case WLAN_EID_S1G_BCN_COMPAT:
                case WLAN_EID_S1G_CAPABILITIES:
                case WLAN_EID_S1G_OPERATION:
                case WLAN_EID_AID_RESPONSE:
                case WLAN_EID_S1G_SHORT_BCN_INTERVAL:
                /*
                 * not listing WLAN_EID_CHANNEL_SWITCH_WRAPPER -- it seems possible
                 * that if the content gets bigger it might be needed more than once
                 */
                        if (test_bit(id, seen_elems)) {
                                elems->parse_error |=
                                        IEEE80211_PARSE_ERR_DUP_ELEM;
                                continue;
                        }
                        break;
                }

                /*
                 * The filter is a 64-bit mask indexed by element ID, so only
                 * IDs below 64 can be selected through it. The CRC covers the
                 * two header bytes (ID, length) as well as the payload.
                 */
                if (calc_crc && id < 64 && (params->filter & (1ULL << id)))
                        crc = crc32_be(crc, pos - 2, elen + 2);

                elem_parse_failed = 0;

                switch (id) {
                case WLAN_EID_LINK_ID:
                        /* the struct includes the 2-byte element header */
                        if (elen + 2 < sizeof(struct ieee80211_tdls_lnkie)) {
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                                break;
                        }
                        elems->lnk_id = (void *)(pos - 2);
                        break;
                case WLAN_EID_CHAN_SWITCH_TIMING:
                        if (elen < sizeof(struct ieee80211_ch_switch_timing)) {
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                                break;
                        }
                        elems->ch_sw_timing = (void *)pos;
                        break;
                case WLAN_EID_EXT_CAPABILITY:
                        elems->ext_capab = pos;
                        elems->ext_capab_len = elen;
                        break;
                case WLAN_EID_SSID:
                        elems->ssid = pos;
                        elems->ssid_len = elen;
                        break;
                case WLAN_EID_SUPP_RATES:
                        elems->supp_rates = pos;
                        elems->supp_rates_len = elen;
                        break;
                case WLAN_EID_DS_PARAMS:
                        if (elen >= 1)
                                elems->ds_params = pos;
                        else
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        break;
                case WLAN_EID_TIM:
                        if (elen >= sizeof(struct ieee80211_tim_ie)) {
                                elems->tim = (void *)pos;
                                elems->tim_len = elen;
                        } else
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        break;
                case WLAN_EID_VENDOR_SPECIFIC:
                        if (elen >= 4 && pos[0] == 0x00 && pos[1] == 0x50 &&
                            pos[2] == 0xf2) {
                                /* Microsoft OUI (00:50:F2) */

                                /* included in the CRC regardless of the filter mask */
                                if (calc_crc)
                                        crc = crc32_be(crc, pos - 2, elen + 2);

                                if (elen >= 5 && pos[3] == 2) {
                                        /* OUI Type 2 - WMM IE */
                                        if (pos[4] == 0) {
                                                elems->wmm_info = pos;
                                                elems->wmm_info_len = elen;
                                        } else if (pos[4] == 1) {
                                                elems->wmm_param = pos;
                                                elems->wmm_param_len = elen;
                                        }
                                }
                        }
                        break;
                case WLAN_EID_RSN:
                        elems->rsn = pos;
                        elems->rsn_len = elen;
                        break;
                case WLAN_EID_ERP_INFO:
                        if (elen >= 1)
                                elems->erp_info = pos;
                        else
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        break;
                case WLAN_EID_EXT_SUPP_RATES:
                        elems->ext_supp_rates = pos;
                        elems->ext_supp_rates_len = elen;
                        break;
                case WLAN_EID_HT_CAPABILITY:
                        /* ignored entirely below the required connection mode */
                        if (params->mode < IEEE80211_CONN_MODE_HT)
                                break;
                        if (elen >= sizeof(struct ieee80211_ht_cap))
                                elems->ht_cap_elem = (void *)pos;
                        else
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        break;
                case WLAN_EID_HT_OPERATION:
                        if (params->mode < IEEE80211_CONN_MODE_HT)
                                break;
                        if (elen >= sizeof(struct ieee80211_ht_operation))
                                elems->ht_operation = (void *)pos;
                        else
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        break;
                case WLAN_EID_VHT_CAPABILITY:
                        if (params->mode < IEEE80211_CONN_MODE_VHT)
                                break;
                        if (elen >= sizeof(struct ieee80211_vht_cap))
                                elems->vht_cap_elem = (void *)pos;
                        else
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        break;
                case WLAN_EID_VHT_OPERATION:
                        if (params->mode < IEEE80211_CONN_MODE_VHT)
                                break;
                        if (elen >= sizeof(struct ieee80211_vht_operation)) {
                                elems->vht_operation = (void *)pos;
                                if (calc_crc)
                                        crc = crc32_be(crc, pos - 2, elen + 2);
                                break;
                        }
                        elem_parse_failed =
                                IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        break;
                case WLAN_EID_OPMODE_NOTIF:
                        if (params->mode < IEEE80211_CONN_MODE_VHT)
                                break;
                        if (elen > 0) {
                                elems->opmode_notif = pos;
                                if (calc_crc)
                                        crc = crc32_be(crc, pos - 2, elen + 2);
                                break;
                        }
                        elem_parse_failed =
                                IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        break;
                case WLAN_EID_MESH_ID:
                        elems->mesh_id = pos;
                        elems->mesh_id_len = elen;
                        break;
                case WLAN_EID_MESH_CONFIG:
                        if (elen >= sizeof(struct ieee80211_meshconf_ie))
                                elems->mesh_config = (void *)pos;
                        else
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        break;
                case WLAN_EID_PEER_MGMT:
                        elems->peering = pos;
                        elems->peering_len = elen;
                        break;
                case WLAN_EID_MESH_AWAKE_WINDOW:
                        /* a too-short element is silently ignored here */
                        if (elen >= 2)
                                elems->awake_window = (void *)pos;
                        break;
                case WLAN_EID_PREQ:
                        elems->preq = pos;
                        elems->preq_len = elen;
                        break;
                case WLAN_EID_PREP:
                        elems->prep = pos;
                        elems->prep_len = elen;
                        break;
                case WLAN_EID_PERR:
                        elems->perr = pos;
                        elems->perr_len = elen;
                        break;
                case WLAN_EID_RANN:
                        if (elen >= sizeof(struct ieee80211_rann_ie))
                                elems->rann = (void *)pos;
                        else
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        break;
                case WLAN_EID_CHANNEL_SWITCH:
                        /* exact size match required */
                        if (elen != sizeof(struct ieee80211_channel_sw_ie)) {
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                                break;
                        }
                        elems->ch_switch_ie = (void *)pos;
                        break;
                case WLAN_EID_EXT_CHANSWITCH_ANN:
                        if (elen != sizeof(struct ieee80211_ext_chansw_ie)) {
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                                break;
                        }
                        elems->ext_chansw_ie = (void *)pos;
                        break;
                case WLAN_EID_SECONDARY_CHANNEL_OFFSET:
                        if (params->mode < IEEE80211_CONN_MODE_HT)
                                break;
                        if (elen != sizeof(struct ieee80211_sec_chan_offs_ie)) {
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                                break;
                        }
                        elems->sec_chan_offs = (void *)pos;
                        break;
                case WLAN_EID_CHAN_SWITCH_PARAM:
                        if (elen <
                            sizeof(*elems->mesh_chansw_params_ie)) {
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                                break;
                        }
                        elems->mesh_chansw_params_ie = (void *)pos;
                        break;
                case WLAN_EID_WIDE_BW_CHANNEL_SWITCH:
                        if (params->mode < IEEE80211_CONN_MODE_VHT)
                                break;

                        /* as a top-level element, only accepted when params->action is set */
                        if (!params->action) {
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_UNEXPECTED_ELEM;
                                break;
                        }

                        if (elen < sizeof(*elems->wide_bw_chansw_ie)) {
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                                break;
                        }
                        elems->wide_bw_chansw_ie = (void *)pos;
                        break;
                case WLAN_EID_CHANNEL_SWITCH_WRAPPER:
                        if (params->mode < IEEE80211_CONN_MODE_VHT)
                                break;
                        /* the wrapper is rejected when params->action is set */
                        if (params->action) {
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_UNEXPECTED_ELEM;
                                break;
                        }
                        /*
                         * This is a bit tricky, but as we only care about
                         * a few elements, parse them out manually.
                         */
                        subelem = cfg80211_find_elem(WLAN_EID_WIDE_BW_CHANNEL_SWITCH,
                                                     pos, elen);
                        if (subelem) {
                                if (subelem->datalen >= sizeof(*elems->wide_bw_chansw_ie))
                                        elems->wide_bw_chansw_ie =
                                                (void *)subelem->data;
                                else
                                        elem_parse_failed =
                                                IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        }

                        /* the remaining sub-elements are only looked at for EHT */
                        if (params->mode < IEEE80211_CONN_MODE_EHT)
                                break;

                        subelem = cfg80211_find_ext_elem(WLAN_EID_EXT_BANDWIDTH_INDICATION,
                                                         pos, elen);
                        if (subelem) {
                                /* skip the extension ID byte at data[0] */
                                const void *edata = subelem->data + 1;
                                u8 edatalen = subelem->datalen - 1;

                                if (ieee80211_bandwidth_indication_size_ok(edata,
                                                                           edatalen))
                                        elems->bandwidth_indication = edata;
                                else
                                        elem_parse_failed =
                                                IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        }

                        /*
                         * NOTE(review): WLAN_EID_TX_POWER_ENVELOPE is looked
                         * up as an extension element here, while the same
                         * constant is handled as a regular element ID in the
                         * top-level case further down -- confirm which form
                         * is expected inside the wrapper.
                         */
                        subelem = cfg80211_find_ext_elem(WLAN_EID_TX_POWER_ENVELOPE,
                                                         pos, elen);
                        if (subelem)
                                ieee80211_parse_tpe(&elems->csa_tpe,
                                                    subelem->data + 1,
                                                    subelem->datalen - 1);
                        break;
                case WLAN_EID_COUNTRY:
                        elems->country_elem = pos;
                        elems->country_elem_len = elen;
                        break;
                case WLAN_EID_PWR_CONSTRAINT:
                        if (elen != 1) {
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                                break;
                        }
                        elems->pwr_constr_elem = pos;
                        break;
                case WLAN_EID_CISCO_VENDOR_SPECIFIC:
                        /* Lots of different options exist, but we only care
                         * about the Dynamic Transmit Power Control element.
                         * First check for the Cisco OUI, then for the DTPC
                         * tag (0x00).
                         */
                        if (elen < 4) {
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                                break;
                        }

                        if (pos[0] != 0x00 || pos[1] != 0x40 ||
                            pos[2] != 0x96 || pos[3] != 0x00)
                                break;

                        if (elen != 6) {
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                                break;
                        }

                        if (calc_crc)
                                crc = crc32_be(crc, pos - 2, elen + 2);

                        elems->cisco_dtpc_elem = pos;
                        break;
                case WLAN_EID_ADDBA_EXT:
                        if (elen < sizeof(struct ieee80211_addba_ext_ie)) {
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                                break;
                        }
                        elems->addba_ext_ie = (void *)pos;
                        break;
                case WLAN_EID_TIMEOUT_INTERVAL:
                        if (elen >= sizeof(struct ieee80211_timeout_interval_ie))
                                elems->timeout_int = (void *)pos;
                        else
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        break;
                case WLAN_EID_BSS_MAX_IDLE_PERIOD:
                        /* a too-short element is silently ignored here */
                        if (elen >= sizeof(*elems->max_idle_period_ie))
                                elems->max_idle_period_ie = (void *)pos;
                        break;
                case WLAN_EID_RSNX:
                        elems->rsnx = pos;
                        elems->rsnx_len = elen;
                        break;
                case WLAN_EID_TX_POWER_ENVELOPE:
                        if (params->mode < IEEE80211_CONN_MODE_HE)
                                break;
                        ieee80211_parse_tpe(&elems->tpe, pos, elen);
                        break;
                case WLAN_EID_EXTENSION:
                        /* the helper updates the CRC itself when given a pointer */
                        ieee80211_parse_extension_element(calc_crc ?
                                                                &crc : NULL,
                                                          elem, elems_parse,
                                                          params);
                        break;
                case WLAN_EID_S1G_CAPABILITIES:
                        /* S1G elements are only taken in S1G mode */
                        if (params->mode != IEEE80211_CONN_MODE_S1G)
                                break;
                        if (elen >= sizeof(*elems->s1g_capab))
                                elems->s1g_capab = (void *)pos;
                        else
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        break;
                case WLAN_EID_S1G_OPERATION:
                        if (params->mode != IEEE80211_CONN_MODE_S1G)
                                break;
                        if (elen == sizeof(*elems->s1g_oper))
                                elems->s1g_oper = (void *)pos;
                        else
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        break;
                case WLAN_EID_S1G_BCN_COMPAT:
                        if (params->mode != IEEE80211_CONN_MODE_S1G)
                                break;
                        if (elen == sizeof(*elems->s1g_bcn_compat))
                                elems->s1g_bcn_compat = (void *)pos;
                        else
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        break;
                case WLAN_EID_AID_RESPONSE:
                        if (params->mode != IEEE80211_CONN_MODE_S1G)
                                break;
                        if (elen == sizeof(struct ieee80211_aid_response_ie))
                                elems->aid_resp = (void *)pos;
                        else
                                elem_parse_failed =
                                        IEEE80211_PARSE_ERR_BAD_ELEM_SIZE;
                        break;
                default:
                        break;
                }

                /*
                 * Only mark the ID as seen when parsing did not fail, so a
                 * later instance of the same element can still be used after
                 * a malformed one.
                 */
                if (elem_parse_failed)
                        elems->parse_error |= elem_parse_failed;
                else
                        __set_bit(id, seen_elems);
        }

        /* the walk did not end exactly at the end of the buffer */
        if (!for_each_element_completed(elem, params->start, params->len))
                elems->parse_error |= IEEE80211_PARSE_ERR_INVALID_END;

        return crc;
}

/*
 * Find the Nontransmitted BSSID Profile that belongs to @bss.
 *
 * @start:                  start of the element buffer to search
 * @len:                    length of that buffer in bytes
 * @elems:                  on a match, bssid_index and bssid_index_len are
 *                          set to point into @nontransmitted_profile
 * @bss:                    BSS to look for; nothing is done if it is NULL or
 *                          has no transmitted_bss
 * @nontransmitted_profile: output buffer, must hold at least @len bytes
 *
 * Each candidate profile in the Multiple BSSID elements is merged into
 * @nontransmitted_profile, and the BSSID derived from its Multiple BSSID
 * Index is compared against @bss->bssid.
 *
 * The search stops at the first match. Continuing to walk further Multiple
 * BSSID elements would overwrite the buffer that @elems now points into and
 * the length that is about to be returned.
 *
 * Returns the merged profile length on a match, 0 otherwise.
 */
static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len,
                                            struct ieee802_11_elems *elems,
                                            struct cfg80211_bss *bss,
                                            u8 *nontransmitted_profile)
{
        const struct element *elem, *sub;
        size_t profile_len;

        if (!bss || !bss->transmitted_bss)
                return 0;

        for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID, start, len) {
                if (elem->datalen < 2)
                        continue;
                /* data[0] is passed on below as the max BSSID indicator */
                if (elem->data[0] < 1 || elem->data[0] > 8)
                        continue;

                for_each_element(sub, elem->data + 1, elem->datalen - 1) {
                        u8 new_bssid[ETH_ALEN];
                        const u8 *index;

                        if (sub->id != 0 || sub->datalen < 4) {
                                /* not a valid BSS profile */
                                continue;
                        }

                        if (sub->data[0] != WLAN_EID_NON_TX_BSSID_CAP ||
                            sub->data[1] != 2) {
                                /* The first element of the
                                 * Nontransmitted BSSID Profile is not
                                 * the Nontransmitted BSSID Capability
                                 * element.
                                 */
                                continue;
                        }

                        memset(nontransmitted_profile, 0, len);
                        profile_len = cfg80211_merge_profile(start, len,
                                                             elem,
                                                             sub,
                                                             nontransmitted_profile,
                                                             len);

                        /* found a Nontransmitted BSSID Profile */
                        index = cfg80211_find_ie(WLAN_EID_MULTI_BSSID_IDX,
                                                 nontransmitted_profile,
                                                 profile_len);
                        if (!index || index[1] < 1 || index[2] == 0) {
                                /* Invalid MBSSID Index element */
                                continue;
                        }

                        cfg80211_gen_new_bssid(bss->transmitted_bss->bssid,
                                               elem->data[0],
                                               index[2],
                                               new_bssid);
                        if (ether_addr_equal(new_bssid, bss->bssid)) {
                                elems->bssid_index_len = index[1];
                                elems->bssid_index = (void *)&index[2];
                                /*
                                 * Return right away instead of only leaving
                                 * the inner loop, so that a later profile
                                 * cannot clobber the one just matched.
                                 */
                                return profile_len;
                        }
                }
        }

        return 0;
}

/*
 * Locate the per-STA profile for @link_id in the basic Multi-Link element
 * referenced by elems->ml_basic and defragment it into the scratch buffer.
 *
 * On success elems->prof and elems->sta_prof_len describe the defragmented
 * profile and the scratch position is advanced past it. Nothing is stored
 * if no matching sub-element exists, if a per-STA profile sub-element fails
 * the size check, if the matching profile is not marked complete, or if
 * defragmentation fails.
 */
static void
ieee80211_mle_get_sta_prof(struct ieee80211_elems_parse *elems_parse,
                           u8 link_id)
{
        struct ieee802_11_elems *elems = &elems_parse->elems;
        u8 *mle_buf = (u8 *)elems->ml_basic;
        ssize_t mle_len = elems->ml_basic_len;
        const struct element *subelem;

        for_each_mle_subelement(subelem, mle_buf, mle_len) {
                struct ieee80211_mle_per_sta_profile *sta_prof;
                const u8 *scratch_end;
                ssize_t defrag_len;
                u16 sta_control;

                /* only per-STA profile sub-elements are of interest */
                if (sub_is_other(subelem))
                        continue;

                /* a malformed per-STA profile ends the search entirely */
                if (!ieee80211_mle_basic_sta_prof_size_ok(subelem->data,
                                                          subelem->datalen))
                        return;

                sta_prof = (void *)subelem->data;
                sta_control = le16_to_cpu(sta_prof->control);

                /* keep looking until the requested link shows up */
                if (u16_get_bits(sta_control,
                                 IEEE80211_MLE_STA_CONTROL_LINK_ID) != link_id)
                        continue;

                /* an incomplete profile for the link is not used */
                if (!(sta_control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE))
                        return;

                /* the sub element can be fragmented */
                scratch_end = elems_parse->scratch + elems_parse->scratch_len;
                defrag_len = cfg80211_defragment_element(subelem,
                                                         mle_buf, mle_len,
                                                         elems_parse->scratch_pos,
                                                         scratch_end -
                                                                elems_parse->scratch_pos,
                                                         IEEE80211_MLE_SUBELEM_FRAGMENT);
                if (defrag_len < 0)
                        return;

                /* hand out the scratch area and consume it */
                elems->prof = (void *)elems_parse->scratch_pos;
                elems->sta_prof_len = defrag_len;
                elems_parse->scratch_pos += defrag_len;

                return;
        }
}

static void ieee80211_mle_parse_link(struct ieee80211_elems_parse *elems_parse,
                                     struct ieee80211_elems_parse_params *params)
{
        struct ieee802_11_elems *elems = &elems_parse->elems;
        struct ieee80211_mle_per_sta_profile *prof;
        struct ieee80211_elems_parse_params sub = {
                .mode = params->mode,
                .action = params->action,
                .from_ap = params->from_ap,
                .link_id = -1,
        };
        ssize_t ml_len = elems->ml_basic_len;
        const struct element *non_inherit = NULL;
        const u8 *end;

        ml_len = cfg80211_defragment_element(elems_parse->ml_basic_elem,
                                             elems->ie_start,
                                             elems->total_len,
                                             elems_parse->scratch_pos,
                                             elems_parse->scratch +
                                                elems_parse->scratch_len -
                                                elems_parse->scratch_pos,
                                             WLAN_EID_FRAGMENT);

        if (ml_len < 0)
                return;

        elems->ml_basic = (const void *)elems_parse->scratch_pos;
        elems->ml_basic_len = ml_len;
        elems_parse->scratch_pos += ml_len;

        if (params->link_id == -1)
                return;

        ieee80211_mle_get_sta_prof(elems_parse, params->link_id);
        prof = elems->prof;

        if (!prof)
                return;

        /* check if we have the 4 bytes for the fixed part in assoc response */
        if (elems->sta_prof_len < sizeof(*prof) + prof->sta_info_len - 1 + 4) {
                elems->prof = NULL;
                elems->sta_prof_len = 0;
                return;
        }

        /*
         * Skip the capability information and the status code that are expected
         * as part of the station profile in association response frames. Note
         * the -1 is because the 'sta_info_len' is accounted to as part of the
         * per-STA profile, but not part of the 'u8 variable[]' portion.
         */
        sub.start = prof->variable + prof->sta_info_len - 1 + 4;
        end = (const u8 *)prof + elems->sta_prof_len;
        sub.len = end - sub.start;

        non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE,
                                             sub.start, sub.len);
        _ieee802_11_parse_elems_full(&sub, elems_parse, non_inherit);
}

/*
 * Defragment the reconfiguration multi-link element into the scratch buffer
 * and record its location and length; leave everything untouched on failure.
 */
static void
ieee80211_mle_defrag_reconf(struct ieee80211_elems_parse *elems_parse)
{
        struct ieee802_11_elems *elems = &elems_parse->elems;
        ssize_t len;

        len = cfg80211_defragment_element(elems_parse->ml_reconf_elem,
                                          elems->ie_start,
                                          elems->total_len,
                                          elems_parse->scratch_pos,
                                          elems_parse->scratch +
                                                elems_parse->scratch_len -
                                                elems_parse->scratch_pos,
                                          WLAN_EID_FRAGMENT);
        if (len >= 0) {
                elems->ml_reconf = (void *)elems_parse->scratch_pos;
                elems->ml_reconf_len = len;
                elems_parse->scratch_pos += len;
        }
}

struct ieee802_11_elems *
ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params)
{
        struct ieee80211_elems_parse *elems_parse;
        struct ieee802_11_elems *elems;
        const struct element *non_inherit = NULL;
        u8 *nontransmitted_profile;
        int nontransmitted_profile_len = 0;
        size_t scratch_len = 3 * params->len;

        BUILD_BUG_ON(offsetof(typeof(*elems_parse), elems) != 0);

        elems_parse = kzalloc(struct_size(elems_parse, scratch, scratch_len),
                              GFP_ATOMIC);
        if (!elems_parse)
                return NULL;

        elems_parse->scratch_len = scratch_len;
        elems_parse->scratch_pos = elems_parse->scratch;

        elems = &elems_parse->elems;
        elems->ie_start = params->start;
        elems->total_len = params->len;

        /* set all TPE entries to unlimited (but invalid) */
        ieee80211_clear_tpe(&elems->tpe);
        ieee80211_clear_tpe(&elems->csa_tpe);

        nontransmitted_profile = elems_parse->scratch_pos;
        nontransmitted_profile_len =
                ieee802_11_find_bssid_profile(params->start, params->len,
                                              elems, params->bss,
                                              nontransmitted_profile);
        elems_parse->scratch_pos += nontransmitted_profile_len;
        non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE,
                                             nontransmitted_profile,
                                             nontransmitted_profile_len);

        elems->crc = _ieee802_11_parse_elems_full(params, elems_parse,
                                                  non_inherit);

        /* Override with nontransmitted profile, if found */
        if (nontransmitted_profile_len) {
                struct ieee80211_elems_parse_params sub = {
                        .mode = params->mode,
                        .start = nontransmitted_profile,
                        .len = nontransmitted_profile_len,
                        .action = params->action,
                        .link_id = params->link_id,
                };

                _ieee802_11_parse_elems_full(&sub, elems_parse, NULL);
        }

        ieee80211_mle_parse_link(elems_parse, params);

        ieee80211_mle_defrag_reconf(elems_parse);

        if (elems->tim && !elems->parse_error) {
                const struct ieee80211_tim_ie *tim_ie = elems->tim;

                elems->dtim_period = tim_ie->dtim_period;
                elems->dtim_count = tim_ie->dtim_count;
        }

        /* Override DTIM period and count if needed */
        if (elems->bssid_index &&
            elems->bssid_index_len >=
            offsetofend(struct ieee80211_bssid_index, dtim_period))
                elems->dtim_period = elems->bssid_index->dtim_period;

        if (elems->bssid_index &&
            elems->bssid_index_len >=
            offsetofend(struct ieee80211_bssid_index, dtim_count))
                elems->dtim_count = elems->bssid_index->dtim_count;

        return elems;
}
EXPORT_SYMBOL_IF_KUNIT(ieee802_11_parse_elems_full);

/*
 * Translate a list of supported-rate bytes into a bitmap of @sband bitrate
 * indices.
 *
 * For each entry of @srates (low 7 bits), the first band bitrate that has
 * all rate flags required by @width and whose bitrate, divided by 5 and
 * rounded up, equals the entry gets its bit set in @rates.
 *
 * Return: the number of @srates entries that were matched.
 */
int ieee80211_parse_bitrates(enum nl80211_chan_width width,
                             const struct ieee80211_supported_band *sband,
                             const u8 *srates, int srates_len, u32 *rates)
{
        const u32 rate_flags = ieee80211_chanwidth_rate_flags(width);
        int matched = 0;
        int i;

        *rates = 0;

        for (i = 0; i < srates_len; i++) {
                const int wanted = srates[i] & 0x7f;
                int j;

                for (j = 0; j < sband->n_bitrates; j++) {
                        const struct ieee80211_rate *br = &sband->bitrates[j];

                        if ((rate_flags & br->flags) != rate_flags ||
                            DIV_ROUND_UP(br->bitrate, 5) != wanted)
                                continue;

                        *rates |= BIT(j);
                        matched++;
                        break;
                }
        }

        return matched;
}



















































































    1 











































    2 



















    2 
    2 




    2 




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_BL_H
#define _LINUX_LIST_BL_H

#include <linux/list.h>
#include <linux/bit_spinlock.h>

/*
 * Special version of lists, where head of the list has a lock in the lowest
 * bit. This is useful for scalable hash tables without increasing memory
 * footprint overhead.
 *
 * For modification operations, the 0 bit of hlist_bl_head->first
 * pointer must be set.
 *
 * With some small modifications, this can easily be adapted to store several
 * arbitrary bits (not just a single lock bit), if the need arises to store
 * some fast and compact auxiliary data.
 */

/*
 * Mask of the lock bit kept in hlist_bl_head->first: bit 0 when CONFIG_SMP
 * or CONFIG_DEBUG_SPINLOCK is set, no bit at all otherwise.
 */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
#define LIST_BL_LOCKMASK        1UL
#else
#define LIST_BL_LOCKMASK        0UL
#endif

/* Debug assertion: BUG_ON() with CONFIG_DEBUG_LIST, expands to nothing otherwise. */
#ifdef CONFIG_DEBUG_LIST
#define LIST_BL_BUG_ON(x) BUG_ON(x)
#else
#define LIST_BL_BUG_ON(x)
#endif


/*
 * List head. The low bit of @first (LIST_BL_LOCKMASK) is used as the lock
 * bit, so the pointer must be masked before it is dereferenced.
 */
struct hlist_bl_head {
        struct hlist_bl_node *first;
};

/*
 * List node: @next is the following node (NULL at the tail), @pprev points
 * at whichever pointer refers to this node (the head's first or the
 * previous node's next).
 */
struct hlist_bl_node {
        struct hlist_bl_node *next;
        struct hlist_bl_node **pprev;
};
/* Initialize a list head to the empty state (first = NULL, lock bit clear). */
#define INIT_HLIST_BL_HEAD(ptr) \
        ((ptr)->first = NULL)

static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
{
        h->next = NULL;
        h->pprev = NULL;
}

/* Get the structure of @type that embeds the node @ptr as its @member. */
#define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member)

static inline bool  hlist_bl_unhashed(const struct hlist_bl_node *h)
{
        return !h->pprev;
}

/* Return the first node of the list, with the lock bit stripped. */
static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h)
{
        unsigned long raw = (unsigned long)h->first;

        return (struct hlist_bl_node *)(raw & ~LIST_BL_LOCKMASK);
}

/*
 * Make @n the first node of @h, storing it with the lock bit set.
 *
 * With CONFIG_DEBUG_LIST this asserts that @n itself has no lock bit set
 * and that the head currently has its lock bit set.
 */
static inline void hlist_bl_set_first(struct hlist_bl_head *h,
                                        struct hlist_bl_node *n)
{
        LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
        LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) !=
                                                        LIST_BL_LOCKMASK);
        h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK);
}

/* True if the list has no nodes; the lock bit in ->first is ignored. */
static inline bool hlist_bl_empty(const struct hlist_bl_head *h)
{
        unsigned long raw = (unsigned long)READ_ONCE(h->first);

        return (raw & ~LIST_BL_LOCKMASK) == 0;
}

/*
 * Insert @n at the front of list @h.
 *
 * The head pointer is updated last, through hlist_bl_set_first(), which
 * stores it with the lock bit set.
 */
static inline void hlist_bl_add_head(struct hlist_bl_node *n,
                                        struct hlist_bl_head *h)
{
        struct hlist_bl_node *first = hlist_bl_first(h);

        n->next = first;
        if (first)
                first->pprev = &n->next;
        n->pprev = &h->first;
        hlist_bl_set_first(h, n);
}

static inline void hlist_bl_add_before(struct hlist_bl_node *n,
                                       struct hlist_bl_node *next)
{
        struct hlist_bl_node **pprev = next->pprev;

        n->pprev = pprev;
        n->next = next;
        next->pprev = &n->next;

        /* pprev may be `first`, so be careful not to lose the lock bit */
        WRITE_ONCE(*pprev,
                   (struct hlist_bl_node *)
                        ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK)));
}

/* Insert @n immediately after @prev. */
static inline void hlist_bl_add_behind(struct hlist_bl_node *n,
                                       struct hlist_bl_node *prev)
{
        struct hlist_bl_node *after = prev->next;

        n->next = after;
        n->pprev = &prev->next;
        prev->next = n;

        /* fix up the back-pointer of the old successor, if there was one */
        if (after)
                after->pprev = &n->next;
}

/*
 * Unlink @n from its list without touching @n's own pointers.
 *
 * The pointer that referred to @n is redirected to @n's successor while
 * keeping its lock bit; the successor's back-pointer is then updated.
 */
static inline void __hlist_bl_del(struct hlist_bl_node *n)
{
        struct hlist_bl_node *next = n->next;
        struct hlist_bl_node **pprev = n->pprev;

        LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);

        /* pprev may be `first`, so be careful not to lose the lock bit */
        WRITE_ONCE(*pprev,
                   (struct hlist_bl_node *)
                        ((unsigned long)next |
                         ((unsigned long)*pprev & LIST_BL_LOCKMASK)));
        if (next)
                next->pprev = pprev;
}

/* Unlink @n and poison its pointers (LIST_POISON1 / LIST_POISON2). */
static inline void hlist_bl_del(struct hlist_bl_node *n)
{
        __hlist_bl_del(n);
        n->next = LIST_POISON1;
        n->pprev = LIST_POISON2;
}

/*
 * Unlink @n if it is currently hashed and reset it to the unhashed state;
 * does nothing for a node that is already unhashed.
 */
static inline void hlist_bl_del_init(struct hlist_bl_node *n)
{
        if (hlist_bl_unhashed(n))
                return;

        __hlist_bl_del(n);
        INIT_HLIST_BL_NODE(n);
}

/* Take the list lock: a bit spinlock on bit 0 of the head. */
static inline void hlist_bl_lock(struct hlist_bl_head *b)
{
        bit_spin_lock(0, (unsigned long *)b);
}

/* Release the list lock held in bit 0 of the head. */
static inline void hlist_bl_unlock(struct hlist_bl_head *b)
{
        __bit_spin_unlock(0, (unsigned long *)b);
}

/* Report whether the bit spinlock in bit 0 of the head is held. */
static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
{
        return bit_spin_is_locked(0, (unsigned long *)b);
}

/**
 * hlist_bl_for_each_entry        - iterate over list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_bl_node to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_bl_node within the struct.
 *
 * The next node is read from @pos after the loop body has run.
 */
#define hlist_bl_for_each_entry(tpos, pos, head, member)                \
        for (pos = hlist_bl_first(head);                                \
             pos &&                                                        \
                ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \
             pos = pos->next)

/**
 * hlist_bl_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_bl_node to use as a loop cursor.
 * @n:                another &struct hlist_bl_node to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the hlist_bl_node within the struct.
 *
 * The next node is saved in @n before the loop body runs.
 */
#define hlist_bl_for_each_entry_safe(tpos, pos, n, head, member)         \
        for (pos = hlist_bl_first(head);                                 \
             pos && ({ n = pos->next; 1; }) &&                                  \
                ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \
             pos = n)

#endif














































































    2 











































































































































































    3 



























    3 















    3 


    3 






    2 

    2 






    2 






    3 



    2 



    2 











    2 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Packet matching code.
 *
 * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
 * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
 * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/capability.h>
#include <linux/in.h>
#include <linux/skbuff.h>
#include <linux/kmod.h>
#include <linux/vmalloc.h>
#include <linux/netdevice.h>
#include <linux/module.h>
#include <linux/poison.h>
#include <net/ipv6.h>
#include <net/compat.h>
#include <linux/uaccess.h>
#include <linux/mutex.h>
#include <linux/proc_fs.h>
#include <linux/err.h>
#include <linux/cpumask.h>

#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/netfilter/x_tables.h>
#include <net/netfilter/nf_log.h>
#include "../../netfilter/xt_repldata.h"

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("IPv6 packet filter");

/*
 * Build the initial ruleset blob for an ip6tables table.
 *
 * The work is done by xt_alloc_initial_table(), which is given the
 * ip6t/IP6T name prefixes.
 * NOTE(review): @info is not referenced here directly; presumably the
 * helper picks it up by name -- confirm against xt_repldata.h.
 */
void *ip6t_alloc_initial_table(const struct xt_table *info)
{
        return xt_alloc_initial_table(ip6t, IP6T);
}
EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table);

/*
 * Check the IPv6-level part of a rule (addresses, interfaces, protocol)
 * against a packet. Returns true if the packet matches.
 *
 * When the rule asks for a protocol, *protoff and *fragoff are filled in
 * from ipv6_find_hdr(); if no header is found and the fragment offset is
 * zero, *hotdrop is set.
 *
 * Performance critical - called for every packet.
 */
static inline bool
ip6_packet_match(const struct sk_buff *skb,
                 const char *indev,
                 const char *outdev,
                 const struct ip6t_ip6 *ip6info,
                 unsigned int *protoff,
                 u16 *fragoff, bool *hotdrop)
{
        const struct ipv6hdr *ipv6 = ipv6_hdr(skb);
        unsigned short frag_off;
        unsigned long diff;
        int protohdr;

        if (NF_INVF(ip6info, IP6T_INV_SRCIP,
                    ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk,
                                         &ip6info->src)))
                return false;

        if (NF_INVF(ip6info, IP6T_INV_DSTIP,
                    ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk,
                                         &ip6info->dst)))
                return false;

        diff = ifname_compare_aligned(indev, ip6info->iniface,
                                      ip6info->iniface_mask);
        if (NF_INVF(ip6info, IP6T_INV_VIA_IN, diff != 0))
                return false;

        diff = ifname_compare_aligned(outdev, ip6info->outiface,
                                      ip6info->outiface_mask);
        if (NF_INVF(ip6info, IP6T_INV_VIA_OUT, diff != 0))
                return false;

        /* ... might want to do something with class and flowlabel here ... */

        /* nothing more to check unless a protocol was requested */
        if (!(ip6info->flags & IP6T_F_PROTO))
                return true;

        /* look for the desired protocol header */
        protohdr = ipv6_find_hdr(skb, protoff, -1, &frag_off, NULL);
        if (protohdr < 0) {
                if (frag_off == 0)
                        *hotdrop = true;
                return false;
        }
        *fragoff = frag_off;

        if (ip6info->proto == protohdr)
                return !(ip6info->invflags & IP6T_INV_PROTO);

        /* We need match for the '-p all', too! */
        return ip6info->proto == 0 ||
               (ip6info->invflags & IP6T_INV_PROTO);
}

/*
 * Validate the IPv6 part of a rule: only flag bits within IP6T_F_MASK and
 * inversion bits within IP6T_INV_MASK are accepted.
 */
static bool
ip6_checkentry(const struct ip6t_ip6 *ipv6)
{
        return !(ipv6->flags & ~IP6T_F_MASK) &&
               !(ipv6->invflags & ~IP6T_INV_MASK);
}

static unsigned int
ip6t_error(struct sk_buff *skb, const struct xt_action_param *par)
{
        net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo);

        return NF_DROP;
}

/* Return the entry located @offset bytes from @base. */
static inline struct ip6t_entry *
get_entry(const void *base, unsigned int offset)
{
        const char *pos = (const char *)base + offset;

        return (struct ip6t_entry *)pos;
}

/* All zeroes == unconditional rule. */
/* Mildly perf critical (only if packet tracing is on) */
static inline bool unconditional(const struct ip6t_entry *e)
{
        static const struct ip6t_ip6 uncond;

        return e->target_offset == sizeof(struct ip6t_entry) &&
               memcmp(&e->ipv6, &uncond, sizeof(uncond)) == 0;
}

/*
 * Const-preserving wrapper around ip6t_get_target(): takes a const entry
 * and returns its target as a const pointer.
 */
static inline const struct xt_entry_target *
ip6t_get_target_c(const struct ip6t_entry *e)
{
        return ip6t_get_target((struct ip6t_entry *)e);
}

#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
/* This cries for unification! */
/* Built-in chain name for each netfilter hook, indexed by hook number. */
static const char *const hooknames[] = {
        [NF_INET_PRE_ROUTING]                = "PREROUTING",
        [NF_INET_LOCAL_IN]                = "INPUT",
        [NF_INET_FORWARD]                = "FORWARD",
        [NF_INET_LOCAL_OUT]                = "OUTPUT",
        [NF_INET_POST_ROUTING]                = "POSTROUTING",
};

/* Indices into comments[]: what kind of rule a TRACE line refers to. */
enum nf_ip_trace_comments {
        NF_IP6_TRACE_COMMENT_RULE,
        NF_IP6_TRACE_COMMENT_RETURN,
        NF_IP6_TRACE_COMMENT_POLICY,
};

/* Text printed in TRACE lines for each enum nf_ip_trace_comments value. */
static const char *const comments[] = {
        [NF_IP6_TRACE_COMMENT_RULE]        = "rule",
        [NF_IP6_TRACE_COMMENT_RETURN]        = "return",
        [NF_IP6_TRACE_COMMENT_POLICY]        = "policy",
};

/* Log settings passed to nf_log_trace(): warning level, default log flags. */
static const struct nf_loginfo trace_loginfo = {
        .type = NF_LOG_TYPE_LOG,
        .u = {
                .log = {
                        .level = LOGLEVEL_WARNING,
                        .logflags = NF_LOG_DEFAULT_MASK,
                },
        },
};

/*
 * Per-entry step of the trace walk: update chain name, rule number and
 * comment for entry @s while looking for entry @e.
 *
 * An ERROR target marks the head of a user chain: its data becomes the
 * chain name and the rule number restarts at 0. Any other entry bumps the
 * rule number. When @s is @e the walk is done; if that entry is an
 * unconditional STANDARD target with a negative verdict, the comment is set
 * to "policy" (still in the hook's own chain) or "return".
 *
 * Returns 1 once @e has been reached, 0 otherwise.
 * Mildly perf critical (only if packet tracing is on).
 */
static inline int
get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
                      const char *hookname, const char **chainname,
                      const char **comment, unsigned int *rulenum)
{
        const struct xt_standard_target *t = (void *)ip6t_get_target_c(s);
        const char *tname = t->target.u.kernel.target->name;

        if (strcmp(tname, XT_ERROR_TARGET) == 0) {
                /* Head of user chain: ERROR target with chainname */
                *chainname = t->target.data;
                *rulenum = 0;
                return 0;
        }

        (*rulenum)++;

        if (s != e)
                return 0;

        if (unconditional(s) &&
            strcmp(tname, XT_STANDARD_TARGET) == 0 &&
            t->verdict < 0) {
                /* Tail of chains: STANDARD target (return/policy) */
                if (*chainname == hookname)
                        *comment = comments[NF_IP6_TRACE_COMMENT_POLICY];
                else
                        *comment = comments[NF_IP6_TRACE_COMMENT_RETURN];
        }

        return 1;
}

static void trace_packet(struct net *net,
                         const struct sk_buff *skb,
                         unsigned int hook,
                         const struct net_device *in,
                         const struct net_device *out,
                         const char *tablename,
                         const struct xt_table_info *private,
                         const struct ip6t_entry *e)
{
        const struct ip6t_entry *root;
        const char *hookname, *chainname, *comment;
        const struct ip6t_entry *iter;
        unsigned int rulenum = 0;

        root = get_entry(private->entries, private->hook_entry[hook]);

        hookname = chainname = hooknames[hook];
        comment = comments[NF_IP6_TRACE_COMMENT_RULE];

        xt_entry_foreach(iter, root, private->size - private->hook_entry[hook])
                if (get_chainname_rulenum(iter, e, hookname,
                    &chainname, &comment, &rulenum) != 0)
                        break;

        nf_log_trace(net, AF_INET6, hook, skb, in, out, &trace_loginfo,
                     "TRACE: %s:%s:%s:%u ",
                     tablename, chainname, comment, rulenum);
}
#endif

/* Return the rule located next_offset bytes after the start of @entry. */
static inline struct ip6t_entry *
ip6t_next_entry(const struct ip6t_entry *entry)
{
        void *next = (void *)entry + entry->next_offset;

        return next;
}

/*
 * ip6t_do_table - evaluate a packet against one table.
 * @priv:  the struct xt_table to evaluate, passed as an opaque pointer
 * @skb:   the packet
 * @state: hook state; provides the hook number, in/out devices and netns
 *
 * Starts at the table's entry point for state->hook and walks the rules until
 * a rule yields a final verdict or a match/target sets acpar.hotdrop.
 *
 * Returns one of the generic firewall policies, like NF_ACCEPT.  NF_DROP is
 * returned whenever acpar.hotdrop ended up set.
 */
unsigned int
ip6t_do_table(void *priv, struct sk_buff *skb,
              const struct nf_hook_state *state)
{
        const struct xt_table *table = priv;
        unsigned int hook = state->hook;
        /* Stand-in device name used when the hook has no in/out device. */
        static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
        /* Initializing verdict to NF_DROP keeps gcc happy. */
        unsigned int verdict = NF_DROP;
        const char *indev, *outdev;
        const void *table_base;
        struct ip6t_entry *e, **jumpstack;
        unsigned int stackidx, cpu;
        const struct xt_table_info *private;
        struct xt_action_param acpar;
        unsigned int addend;

        /* Initialization */
        stackidx = 0;
        indev = state->in ? state->in->name : nulldevname;
        outdev = state->out ? state->out->name : nulldevname;
        /* We handle fragments by dealing with the first fragment as
         * if it was a normal packet.  All other fragments are treated
         * normally, except that they will NEVER match rules that ask
         * things we don't know (i.e. tcp syn flag or ports).  If the
         * rule is also a fragment-specific rule, non-fragments won't
         * match it. */
        acpar.fragoff = 0;
        acpar.hotdrop = false;
        acpar.state   = state;

        WARN_ON(!(table->valid_hooks & (1 << hook)));

        /* Bottom halves stay disabled and the recseq write section stays
         * open for the whole traversal; both are released after the loop. */
        local_bh_disable();
        addend = xt_write_recseq_begin();
        private = READ_ONCE(table->private); /* Address dependency. */
        cpu        = smp_processor_id();
        table_base = private->entries;
        jumpstack  = (struct ip6t_entry **)private->jumpstack[cpu];

        /* Switch to alternate jumpstack if we're being invoked via TEE.
         * TEE issues XT_CONTINUE verdict on original skb so we must not
         * clobber the jumpstack.
         *
         * For recursion via REJECT or SYNPROXY the stack will be clobbered
         * but it is no problem since absolute verdict is issued by these.
         */
        if (static_key_false(&xt_tee_enabled))
                jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);

        e = get_entry(table_base, private->hook_entry[hook]);

        do {
                const struct xt_entry_target *t;
                const struct xt_entry_match *ematch;
                struct xt_counters *counter;

                WARN_ON(!e);
                acpar.thoff = 0;
                /* IPv6 header part of the rule first; on mismatch move on to
                 * the next rule.  The no_match label is also the landing
                 * point for a failing extension match below. */
                if (!ip6_packet_match(skb, indev, outdev, &e->ipv6,
                    &acpar.thoff, &acpar.fragoff, &acpar.hotdrop)) {
 no_match:
                        e = ip6t_next_entry(e);
                        continue;
                }

                /* Every extension match of the rule has to accept the packet. */
                xt_ematch_foreach(ematch, e) {
                        acpar.match     = ematch->u.kernel.match;
                        acpar.matchinfo = ematch->data;
                        if (!acpar.match->match(skb, &acpar))
                                goto no_match;
                }

                /* The rule matched: account the packet on this CPU's counter. */
                counter = xt_get_this_cpu_counter(&e->counters);
                ADD_COUNTER(*counter, skb->len, 1);

                t = ip6t_get_target_c(e);
                WARN_ON(!t->u.kernel.target);

#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
                /* The packet is traced: log it */
                if (unlikely(skb->nf_trace))
                        trace_packet(state->net, skb, hook, state->in,
                                     state->out, table->name, private, e);
#endif
                /* Standard target? */
                if (!t->u.kernel.target->target) {
                        int v;

                        v = ((struct xt_standard_target *)t)->verdict;
                        if (v < 0) {
                                /* Pop from stack? */
                                if (v != XT_RETURN) {
                                        /* Final verdict, stored as -verdict - 1. */
                                        verdict = (unsigned int)(-v) - 1;
                                        break;
                                }
                                /* XT_RETURN: with an empty jump stack continue
                                 * at this hook's underflow entry, otherwise
                                 * resume after the rule that jumped here. */
                                if (stackidx == 0)
                                        e = get_entry(table_base,
                                            private->underflow[hook]);
                                else
                                        e = ip6t_next_entry(jumpstack[--stackidx]);
                                continue;
                        }
                        /* Non-negative v is a byte offset from table_base.
                         * Remember the current rule for a later RETURN unless
                         * the destination is simply the next rule or the rule
                         * carries IP6T_F_GOTO. */
                        if (table_base + v != ip6t_next_entry(e) &&
                            !(e->ipv6.flags & IP6T_F_GOTO)) {
                                /* Jump stack full: drop instead of overflowing. */
                                if (unlikely(stackidx >= private->stacksize)) {
                                        verdict = NF_DROP;
                                        break;
                                }
                                jumpstack[stackidx++] = e;
                        }

                        e = get_entry(table_base, v);
                        continue;
                }

                /* Extension target: run it and act on what it returns. */
                acpar.target   = t->u.kernel.target;
                acpar.targinfo = t->data;

                verdict = t->u.kernel.target->target(skb, &acpar);
                if (verdict == XT_CONTINUE)
                        e = ip6t_next_entry(e);
                else
                        /* Verdict */
                        break;
        } while (!acpar.hotdrop);

        xt_write_recseq_end(addend);
        local_bh_enable();

        /* A requested hot drop overrides whatever verdict was computed. */
        if (acpar.hotdrop)
                return NF_DROP;
        else return verdict;
}

/*
 * Figures out from what hook each rule can be called: returns 0 if
 * there are loops.  Puts hook bitmask in comefrom.
 *
 * @newinfo:     table being validated (hook_entry[], size, number are read)
 * @valid_hooks: bitmask of hooks to start walking from
 * @entry0:      start of the rule blob; all positions are byte offsets from it
 * @offsets:     table of rule offsets handed to xt_find_jump_offset() to
 *               validate jump destinations
 *
 * Besides loops, 0 is also returned when a jump destination is rejected by
 * xt_find_jump_offset() or when the walk would step to a position at or
 * beyond newinfo->size.  Returns 1 otherwise.
 */
static int
mark_source_chains(const struct xt_table_info *newinfo,
                   unsigned int valid_hooks, void *entry0,
                   unsigned int *offsets)
{
        unsigned int hook;

        /* No recursion; use packet counter to save back ptrs (reset
           to 0 as we leave), and comefrom to save source hook bitmask */
        for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
                unsigned int pos = newinfo->hook_entry[hook];
                struct ip6t_entry *e = entry0 + pos;

                if (!(valid_hooks & (1 << hook)))
                        continue;

                /* Set initial back pointer. */
                e->counters.pcnt = pos;

                for (;;) {
                        const struct xt_standard_target *t
                                = (void *)ip6t_get_target_c(e);
                        /* Already reached from this hook on an earlier path? */
                        int visited = e->comefrom & (1 << hook);

                        /* Bit NF_INET_NUMHOOKS marks rules on the path that is
                         * currently being walked; meeting it again is a loop. */
                        if (e->comefrom & (1 << NF_INET_NUMHOOKS))
                                return 0;

                        e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));

                        /* Unconditional return/END. */
                        if ((unconditional(e) &&
                             (strcmp(t->target.u.user.name,
                                     XT_STANDARD_TARGET) == 0) &&
                             t->verdict < 0) || visited) {
                                unsigned int oldpos, size;

                                /* Return: backtrack through the last
                                   big jump. */
                                do {
                                        /* Leaving this rule: clear the
                                         * "on current path" bit and consume
                                         * its back pointer. */
                                        e->comefrom ^= (1<<NF_INET_NUMHOOKS);
                                        oldpos = pos;
                                        pos = e->counters.pcnt;
                                        e->counters.pcnt = 0;

                                        /* We're at the start. */
                                        if (pos == oldpos)
                                                goto next;

                                        e = entry0 + pos;
                                        /* Keep unwinding while the step taken
                                         * was a plain fallthrough (previous
                                         * rule directly precedes this one). */
                                } while (oldpos == pos + e->next_offset);

                                /* Move along one */
                                size = e->next_offset;
                                e = entry0 + pos + size;
                                if (pos + size >= newinfo->size)
                                        return 0;
                                e->counters.pcnt = pos;
                                pos += size;
                        } else {
                                int newpos = t->verdict;

                                if (strcmp(t->target.u.user.name,
                                           XT_STANDARD_TARGET) == 0 &&
                                    newpos >= 0) {
                                        /* This a jump; chase it. */
                                        if (!xt_find_jump_offset(offsets, newpos,
                                                                 newinfo->number))
                                                return 0;
                                } else {
                                        /* ... this is a fallthru */
                                        newpos = pos + e->next_offset;
                                        if (newpos >= newinfo->size)
                                                return 0;
                                }
                                /* Step forward and leave a back pointer to
                                 * where we came from. */
                                e = entry0 + newpos;
                                e->counters.pcnt = pos;
                                pos = newpos;
                        }
                }
next:                ;
        }
        return 1;
}

static void cleanup_match(struct xt_entry_match *m, struct net *net)
{
        struct xt_mtdtor_param par;

        par.net       = net;
        par.match     = m->u.kernel.match;
        par.matchinfo = m->data;
        par.family    = NFPROTO_IPV6;
        if (par.match->destroy != NULL)
                par.match->destroy(&par);
        module_put(par.match->me);
}

static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
        const struct ip6t_ip6 *ipv6 = par->entryinfo;

        par->match     = m->u.kernel.match;
        par->matchinfo = m->data;

        return xt_check_match(par, m->u.match_size - sizeof(*m),
                              ipv6->proto, ipv6->invflags & IP6T_INV_PROTO);
}

/*
 * Look up the match extension named in @m (by name and revision), bind it to
 * the entry and validate it with check_match().
 *
 * Returns 0 on success.  On a lookup failure the error encoded in the
 * returned pointer is passed on; on a check failure the module reference
 * taken for the extension is dropped and the check's error is returned.
 */
static int
find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
        struct xt_match *found;
        int err;

        found = xt_request_find_match(NFPROTO_IPV6, m->u.user.name,
                                      m->u.user.revision);
        if (IS_ERR(found))
                return PTR_ERR(found);

        m->u.kernel.match = found;

        err = check_match(m, par);
        if (err == 0)
                return 0;

        module_put(m->u.kernel.match->me);
        return err;
}

/*
 * Run the generic x_tables check on the target of rule @e for table @name.
 * The rule's comefrom mask is passed as the hook mask.  Returns whatever
 * xt_check_target() returns.
 */
static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
{
        struct xt_entry_target *t = ip6t_get_target(e);
        struct xt_tgchk_param par;

        /* Fields not set explicitly below stay zeroed. */
        memset(&par, 0, sizeof(par));
        par.net       = net;
        par.table     = name;
        par.entryinfo = e;
        par.target    = t->u.kernel.target;
        par.targinfo  = t->data;
        par.hook_mask = e->comefrom;
        par.family    = NFPROTO_IPV6;

        return xt_check_target(&par, t->u.target_size - sizeof(*t),
                               e->ipv6.proto,
                               e->ipv6.invflags & IP6T_INV_PROTO);
}

static int
find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
                 unsigned int size,
                 struct xt_percpu_counter_alloc_state *alloc_state)
{
        struct xt_entry_target *t;
        struct xt_target *target;
        int ret;
        unsigned int j;
        struct xt_mtchk_param mtpar;
        struct xt_entry_match *ematch;

        if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
                return -ENOMEM;

        j = 0;
        memset(&mtpar, 0, sizeof(mtpar));
        mtpar.net        = net;
        mtpar.table     = name;
        mtpar.entryinfo = &e->ipv6;
        mtpar.hook_mask = e->comefrom;
        mtpar.family    = NFPROTO_IPV6;
        xt_ematch_foreach(ematch, e) {
                ret = find_check_match(ematch, &mtpar);
                if (ret != 0)
                        goto cleanup_matches;
                ++j;
        }

        t = ip6t_get_target(e);
        target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name,
                                        t->u.user.revision);
        if (IS_ERR(target)) {
                ret = PTR_ERR(target);
                goto cleanup_matches;
        }
        t->u.kernel.target = target;

        ret = check_target(e, net, name);
        if (ret)
                goto err;
        return 0;
 err:
        module_put(t->u.kernel.target->me);
 cleanup_matches:
        xt_ematch_foreach(ematch, e) {
                if (j-- == 0)
                        break;
                cleanup_match(ematch, net);
        }

        xt_percpu_counter_free(&e->counters);

        return ret;
}

/*
 * Decide whether @e is acceptable as an underflow rule: it has to be
 * unconditional, use the standard target, and carry a verdict that decodes
 * (as -verdict - 1) to NF_DROP or NF_ACCEPT.
 */
static bool check_underflow(const struct ip6t_entry *e)
{
        const struct xt_entry_target *t;
        unsigned int decoded;

        if (!unconditional(e))
                return false;

        t = ip6t_get_target_c(e);
        if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
                return false;

        decoded = ((struct xt_standard_target *)t)->verdict;
        decoded = -decoded - 1;

        switch (decoded) {
        case NF_DROP:
        case NF_ACCEPT:
                return true;
        default:
                return false;
        }
}

/*
 * Validate the geometry of one rule and record hook/underflow positions.
 *
 * @e:            rule to check
 * @newinfo:      receives hook_entry[]/underflow[] when @e sits at one of
 *                the requested offsets
 * @base, @limit: first byte of the rule blob and first byte past it
 * @hook_entries, @underflows: requested offsets, indexed by hook number
 * @valid_hooks:  bitmask of hooks to consider
 *
 * Returns 0 on success with the rule's counters and comefrom cleared,
 * -EINVAL for a misaligned, truncated, too small or otherwise invalid rule
 * (including an unsuitable underflow rule), or the error reported by
 * xt_check_entry_offsets().
 */
static int
check_entry_size_and_hooks(struct ip6t_entry *e,
                           struct xt_table_info *newinfo,
                           const unsigned char *base,
                           const unsigned char *limit,
                           const unsigned int *hook_entries,
                           const unsigned int *underflows,
                           unsigned int valid_hooks)
{
        const unsigned char *start = (unsigned char *)e;
        unsigned int hook;
        int ret;

        /* The fixed header must be aligned and lie inside the blob before
         * next_offset may be trusted enough to be read. */
        if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0)
                return -EINVAL;
        if (start + sizeof(struct ip6t_entry) >= limit)
                return -EINVAL;
        if (start + e->next_offset > limit)
                return -EINVAL;

        /* A rule holds at least its header plus one target header. */
        if (e->next_offset
            < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target))
                return -EINVAL;

        if (!ip6_checkentry(&e->ipv6))
                return -EINVAL;

        ret = xt_check_entry_offsets(e, e->elems, e->target_offset,
                                     e->next_offset);
        if (ret)
                return ret;

        /* Check hooks & underflows */
        for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
                if (!(valid_hooks & (1 << hook)))
                        continue;

                if (start - base == hook_entries[hook])
                        newinfo->hook_entry[hook] = hook_entries[hook];

                if (start - base != underflows[hook])
                        continue;

                if (!check_underflow(e))
                        return -EINVAL;

                newinfo->underflow[hook] = underflows[hook];
        }

        /* Clear counters and comefrom */
        memset(&e->counters, 0, sizeof(e->counters));
        e->comefrom = 0;
        return 0;
}

static void cleanup_entry(struct ip6t_entry *e, struct net *net)
{
        struct xt_tgdtor_param par;
        struct xt_entry_target *t;
        struct xt_entry_match *ematch;

        /* Cleanup all matches */
        xt_ematch_foreach(ematch, e)
                cleanup_match(ematch, net);
        t = ip6t_get_target(e);

        par.net      = net;
        par.target   = t->u.kernel.target;
        par.targinfo = t->data;
        par.family   = NFPROTO_IPV6;
        if (par.target->destroy != NULL)
                par.target->destroy(&par);
        module_put(par.target->me);
        xt_percpu_counter_free(&e->counters);
}

/*
 * Checks and translates the user-supplied table segment (held in newinfo).
 *
 * Pass 1 validates the geometry of every rule, records each rule's offset
 * and counts ERROR-target rules into newinfo->stacksize.  After the entry
 * count, the hook layout and the chain graph have been verified, pass 2
 * binds and checks the extensions of every rule.
 *
 * Returns 0 on success.  On failure a negative errno is returned and rules
 * that had already been set up by pass 2 are cleaned up again.
 */
static int
translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
                const struct ip6t_replace *repl)
{
        struct xt_percpu_counter_alloc_state alloc_state = { 0 };
        struct ip6t_entry *iter;
        unsigned int *offsets;
        unsigned int hook, nr;
        int ret = 0;

        newinfo->size = repl->size;
        newinfo->number = repl->num_entries;

        /* Init all hooks to impossible value. */
        for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
                newinfo->hook_entry[hook] = 0xFFFFFFFF;
                newinfo->underflow[hook] = 0xFFFFFFFF;
        }

        offsets = xt_alloc_entry_offsets(newinfo->number);
        if (offsets == NULL)
                return -ENOMEM;

        /* Walk through entries, checking offsets. */
        nr = 0;
        xt_entry_foreach(iter, entry0, newinfo->size) {
                ret = check_entry_size_and_hooks(iter, newinfo, entry0,
                                                 entry0 + repl->size,
                                                 repl->hook_entry,
                                                 repl->underflow,
                                                 repl->valid_hooks);
                if (ret)
                        goto out_free;

                /* Never write past the array sized from num_entries. */
                if (nr < repl->num_entries)
                        offsets[nr] = (void *)iter - entry0;
                nr++;

                if (!strcmp(ip6t_get_target(iter)->u.user.name,
                            XT_ERROR_TARGET))
                        newinfo->stacksize++;
        }

        /* The blob must hold exactly as many rules as announced. */
        if (nr != repl->num_entries) {
                ret = -EINVAL;
                goto out_free;
        }

        ret = xt_check_table_hooks(newinfo, repl->valid_hooks);
        if (ret)
                goto out_free;

        if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) {
                ret = -ELOOP;
                goto out_free;
        }
        kvfree(offsets);

        /* Finally, each sanity check must pass */
        nr = 0;
        xt_entry_foreach(iter, entry0, newinfo->size) {
                ret = find_check_entry(iter, net, repl->name, repl->size,
                                       &alloc_state);
                if (ret)
                        break;
                nr++;
        }
        if (ret == 0)
                return 0;

        /* Undo the first 'nr' rules, i.e. those that were set up. */
        xt_entry_foreach(iter, entry0, newinfo->size) {
                if (nr == 0)
                        break;
                nr--;
                cleanup_entry(iter, net);
        }
        return ret;

 out_free:
        kvfree(offsets);
        return ret;
}

/*
 * Add the per-cpu counters of every rule of @t into counters[], one slot
 * per rule in table order.  counters[] is only added to, never cleared here.
 *
 * Each per-cpu pair is read under that CPU's xt_recseq sequence counter and
 * re-read until the read was not interrupted by a writer.
 */
static void
get_counters(const struct xt_table_info *t,
             struct xt_counters counters[])
{
        struct ip6t_entry *iter;
        unsigned int cpu;

        for_each_possible_cpu(cpu) {
                seqcount_t *seq = &per_cpu(xt_recseq, cpu);
                struct xt_counters *sum = counters;

                xt_entry_foreach(iter, t->entries, t->size) {
                        struct xt_counters *percpu;
                        unsigned int seq_start;
                        u64 bytes, packets;

                        percpu = xt_get_per_cpu_counter(&iter->counters, cpu);
                        do {
                                seq_start = read_seqcount_begin(seq);
                                bytes = percpu->bcnt;
                                packets = percpu->pcnt;
                        } while (read_seqcount_retry(seq, seq_start));

                        ADD_COUNTER(*sum, bytes, packets);
                        sum++;
                        cond_resched();
                }
        }
}

/*
 * Add the per-cpu counters of every rule of @t into counters[], one slot per
 * rule in table order.  Unlike get_counters() the values are read without
 * the sequence-counter retry loop, and rescheduling is offered once per CPU
 * instead of once per rule.
 */
static void get_old_counters(const struct xt_table_info *t,
                             struct xt_counters counters[])
{
        struct ip6t_entry *iter;
        unsigned int cpu;

        for_each_possible_cpu(cpu) {
                struct xt_counters *sum = counters;

                xt_entry_foreach(iter, t->entries, t->size) {
                        const struct xt_counters *percpu =
                                xt_get_per_cpu_counter(&iter->counters, cpu);

                        ADD_COUNTER(*sum, percpu->bcnt, percpu->pcnt);
                        sum++;
                }
                cond_resched();
        }
}

/*
 * Allocate a zeroed array with one counter pair per rule of @table and fill
 * it with a snapshot taken by get_counters().
 *
 * Returns the array (to be released with vfree() by the caller) or
 * ERR_PTR(-ENOMEM) if the allocation fails.
 */
static struct xt_counters *alloc_counters(const struct xt_table *table)
{
        const struct xt_table_info *private = table->private;
        struct xt_counters *snapshot;
        unsigned int nbytes;

        /* We need atomic snapshot of counters: rest doesn't change
           (other than comefrom, which userspace doesn't care
           about). */
        nbytes = sizeof(struct xt_counters) * private->number;
        snapshot = vzalloc(nbytes);
        if (!snapshot)
                return ERR_PTR(-ENOMEM);

        get_counters(private, snapshot);

        return snapshot;
}

/*
 * Copy the first @total_size bytes worth of rules of @table to @userptr.
 *
 * Every rule header is copied as stored, then its counters field is
 * overwritten with the summed snapshot, and each match and the target are
 * written through xt_match_to_user()/xt_target_to_user().
 *
 * Returns 0 on success, -EFAULT if any copy to userspace fails, or the error
 * from alloc_counters().
 */
static int
copy_entries_to_user(unsigned int total_size,
                     const struct xt_table *table,
                     void __user *userptr)
{
        const struct xt_table_info *private = table->private;
        struct xt_counters *counters;
        const void *entries;
        unsigned int off = 0, num = 0;
        int ret = -EFAULT;      /* every early exit below is a failed copy */

        counters = alloc_counters(table);
        if (IS_ERR(counters))
                return PTR_ERR(counters);

        entries = private->entries;

        /* FIXME: use iterator macros --RR */
        while (off < total_size) {
                const struct ip6t_entry *e = entries + off;
                void __user *dst = userptr + off;
                const struct xt_entry_match *m;
                unsigned int moff;

                if (copy_to_user(dst, e, sizeof(*e)))
                        goto free_counters;

                /* ... then go back and fix counters and names */
                if (copy_to_user(dst + offsetof(struct ip6t_entry, counters),
                                 &counters[num], sizeof(counters[num])))
                        goto free_counters;

                moff = sizeof(struct ip6t_entry);
                while (moff < e->target_offset) {
                        m = (void *)e + moff;
                        if (xt_match_to_user(m, dst + moff))
                                goto free_counters;
                        moff += m->u.match_size;
                }

                if (xt_target_to_user(ip6t_get_target_c(e),
                                      dst + e->target_offset))
                        goto free_counters;

                off += e->next_offset;
                num++;
        }
        ret = 0;

 free_counters:
        vfree(counters);
        return ret;
}

#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
static void compat_standard_from_user(void *dst, const void *src)
{
        int v = *(compat_int_t *)src;

        if (v > 0)
                v += xt_compat_calc_jump(AF_INET6, v);
        memcpy(dst, &v, sizeof(v));
}

/*
 * Convert a standard-target verdict to the compat layout: read a native int
 * from @src, subtract xt_compat_calc_jump() from positive values, and copy
 * the result to userspace as a compat_int_t.
 *
 * Returns 0 on success or -EFAULT if the user copy fails.
 */
static int compat_standard_to_user(void __user *dst, const void *src)
{
        const int *native_verdict = src;
        compat_int_t cv = *native_verdict;

        if (cv > 0)
                cv -= xt_compat_calc_jump(AF_INET6, cv);

        if (copy_to_user(dst, &cv, sizeof(cv)))
                return -EFAULT;
        return 0;
}

static int compat_calc_entry(const struct ip6t_entry *e,
                             const struct xt_table_info *info,
                             const void *base, struct xt_table_info *newinfo)
{
        const struct xt_entry_match *ematch;
        const struct xt_entry_target *t;
        unsigned int entry_offset;
        int off, i, ret;

        off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
        entry_offset = (void *)e - base;
        xt_ematch_foreach(ematch, e)
                off += xt_compat_match_offset(ematch->u.kernel.match);
        t = ip6t_get_target_c(e);
        off += xt_compat_target_offset(t->u.kernel.target);
        newinfo->size -= off;
        ret = xt_compat_add_offset(AF_INET6, entry_offset, off);
        if (ret)
                return ret;

        for (i = 0; i < NF_INET_NUMHOOKS; i++) {
                if (info->hook_entry[i] &&
                    (e < (struct ip6t_entry *)(base + info->hook_entry[i])))
                        newinfo->hook_entry[i] -= off;
                if (info->underflow[i] &&
                    (e < (struct ip6t_entry *)(base + info->underflow[i])))
                        newinfo->underflow[i] -= off;
        }
        return 0;
}

/*
 * Build in @newinfo the table description as seen through the compat layout.
 *
 * The header of @info (everything before 'entries') is copied, the initial
 * entry count is reset, and compat_calc_entry() then adjusts size and hook
 * offsets rule by rule.
 *
 * Returns 0 on success, -EINVAL if either pointer is NULL, or the error from
 * xt_compat_init_offsets()/compat_calc_entry().
 */
static int compat_table_info(const struct xt_table_info *info,
                             struct xt_table_info *newinfo)
{
        const void *entries;
        struct ip6t_entry *iter;
        int err;

        if (!info || !newinfo)
                return -EINVAL;

        /* we dont care about newinfo->entries */
        memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
        newinfo->initial_entries = 0;
        entries = info->entries;

        err = xt_compat_init_offsets(AF_INET6, info->number);
        if (err)
                return err;

        xt_entry_foreach(iter, entries, info->size) {
                err = compat_calc_entry(iter, info, entries, newinfo);
                if (err)
                        return err;
        }
        return 0;
}
#endif

/*
 * get_info - report a table's layout to userspace.
 * @net:  network namespace in which the table is looked up
 * @user: user buffer; on entry it starts with the table name, on success it
 *        is overwritten with a struct ip6t_getinfo
 * @len:  points to the caller-supplied length, which has to equal
 *        sizeof(struct ip6t_getinfo)
 *
 * For compat syscalls the reported size and hook offsets are those computed
 * by compat_table_info().  If that translation fails, its error is returned
 * and nothing is copied to userspace (previously the error was overwritten
 * and a layout that had not been fully translated was reported as success).
 *
 * Returns 0 on success, -EINVAL for a wrong length, -EFAULT if a user copy
 * fails, the table lookup error, or the compat translation error.
 */
static int get_info(struct net *net, void __user *user, const int *len)
{
        char name[XT_TABLE_MAXNAMELEN];
        struct xt_table *t;
        int ret;

        if (*len != sizeof(struct ip6t_getinfo))
                return -EINVAL;

        if (copy_from_user(name, user, sizeof(name)) != 0)
                return -EFAULT;

        /* The name comes from userspace: force termination. */
        name[XT_TABLE_MAXNAMELEN-1] = '\0';
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
        if (in_compat_syscall())
                xt_compat_lock(AF_INET6);
#endif
        t = xt_request_find_table_lock(net, AF_INET6, name);
        if (!IS_ERR(t)) {
                struct ip6t_getinfo info;
                const struct xt_table_info *private = t->private;
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
                struct xt_table_info tmp;
#endif

                ret = 0;
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
                if (in_compat_syscall()) {
                        ret = compat_table_info(private, &tmp);
                        /* The offsets are flushed whether or not the
                         * translation succeeded. */
                        xt_compat_flush_offsets(AF_INET6);
                        private = &tmp;
                }
#endif
                /* Only report a layout that was obtained successfully. */
                if (ret == 0) {
                        memset(&info, 0, sizeof(info));
                        info.valid_hooks = t->valid_hooks;
                        memcpy(info.hook_entry, private->hook_entry,
                               sizeof(info.hook_entry));
                        memcpy(info.underflow, private->underflow,
                               sizeof(info.underflow));
                        info.num_entries = private->number;
                        info.size = private->size;
                        strcpy(info.name, name);

                        if (copy_to_user(user, &info, *len) != 0)
                                ret = -EFAULT;
                }

                xt_table_unlock(t);
                module_put(t->me);
        } else
                ret = PTR_ERR(t);
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
        if (in_compat_syscall())
                xt_compat_unlock(AF_INET6);
#endif
        return ret;
}

/*
 * Copy all rules of the table named in the user request to userspace.
 *
 * @len has to cover the request header plus exactly get.size bytes of rule
 * data, and get.size has to equal the table's current size.
 *
 * Returns 0 on success, -EINVAL for an inconsistent length, -EFAULT if the
 * request cannot be read, -EAGAIN if the size does not match the table, the
 * table lookup error, or the error from copy_entries_to_user().
 */
static int
get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
            const int *len)
{
        struct ip6t_get_entries get;
        struct xt_table_info *private;
        struct xt_table *t;
        int ret;

        if (*len < sizeof(get))
                return -EINVAL;
        if (copy_from_user(&get, uptr, sizeof(get)) != 0)
                return -EFAULT;
        if (*len != sizeof(struct ip6t_get_entries) + get.size)
                return -EINVAL;

        /* The name comes from userspace: force termination. */
        get.name[sizeof(get.name) - 1] = '\0';

        t = xt_find_table_lock(net, AF_INET6, get.name);
        if (IS_ERR(t))
                return PTR_ERR(t);

        private = t->private;
        if (get.size != private->size)
                ret = -EAGAIN;
        else
                ret = copy_entries_to_user(private->size, t,
                                           uptr->entrytable);

        module_put(t->me);
        xt_table_unlock(t);

        return ret;
}

/*
 * Swap @newinfo in as the rule set of table @name and hand the counters of
 * the replaced rule set back to userspace.
 *
 * @valid_hooks has to match the table's own hook mask.  After the swap the
 * old rules are cleaned up and freed; a failure to copy the old counters to
 * @counters_ptr is only logged, because the new table is already live.
 *
 * Returns 0 on success, -ENOMEM if the counter buffer cannot be allocated,
 * -EINVAL on a hook mask mismatch, the table lookup error, or the error set
 * by xt_replace_table().
 */
static int
__do_replace(struct net *net, const char *name, unsigned int valid_hooks,
             struct xt_table_info *newinfo, unsigned int num_counters,
             void __user *counters_ptr)
{
        struct xt_table_info *oldinfo;
        struct xt_counters *counters;
        struct ip6t_entry *iter;
        struct xt_table *t;
        bool old_above_initial, new_within_initial;
        int ret;

        counters = xt_counters_alloc(num_counters);
        if (counters == NULL)
                return -ENOMEM;

        t = xt_request_find_table_lock(net, AF_INET6, name);
        if (IS_ERR(t)) {
                ret = PTR_ERR(t);
                goto free_counters;
        }

        /* You lied! */
        if (t->valid_hooks != valid_hooks) {
                ret = -EINVAL;
                goto put_module;
        }

        oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
        if (oldinfo == NULL)
                goto put_module;

        /* Update module usage count based on number of rules */
        old_above_initial = oldinfo->number > oldinfo->initial_entries;
        new_within_initial = newinfo->number <= oldinfo->initial_entries;
        if (old_above_initial || new_within_initial)
                module_put(t->me);
        if (old_above_initial && new_within_initial)
                module_put(t->me);

        xt_table_unlock(t);

        get_old_counters(oldinfo, counters);

        /* Decrease module usage counts and free resource */
        xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) {
                cleanup_entry(iter, net);
        }

        xt_free_table_info(oldinfo);
        if (copy_to_user(counters_ptr, counters,
                         sizeof(struct xt_counters) * num_counters) != 0) {
                /* Silent error, can't fail, new table is already in place */
                net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n");
        }
        vfree(counters);
        return 0;

 put_module:
        module_put(t->me);
        xt_table_unlock(t);
 free_counters:
        vfree(counters);
        return ret;
}

/*
 * Handle a table replace request: read the struct ip6t_replace header and
 * the rule blob that follows it from @arg, validate and translate the rules,
 * and install them with __do_replace().
 *
 * Returns 0 on success; -EINVAL if @len is too short for the header or the
 * announced blob, or if num_counters is zero; -ENOMEM if num_counters is too
 * large or the table cannot be allocated; -EFAULT on a failed copy; or the
 * error from translate_table()/__do_replace().
 */
static int
do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
        struct ip6t_replace repl;
        struct xt_table_info *newinfo;
        struct ip6t_entry *iter;
        void *entries;
        int ret;

        if (len < sizeof(repl))
                return -EINVAL;
        if (copy_from_sockptr(&repl, arg, sizeof(repl)) != 0)
                return -EFAULT;

        /* overflow check */
        if (repl.num_counters >= INT_MAX / sizeof(struct xt_counters))
                return -ENOMEM;
        if (!repl.num_counters)
                return -EINVAL;
        /* 64-bit arithmetic so that size + header cannot wrap. */
        if ((u64)len < (u64)repl.size + sizeof(repl))
                return -EINVAL;

        repl.name[sizeof(repl.name)-1] = 0;

        newinfo = xt_alloc_table_info(repl.size);
        if (newinfo == NULL)
                return -ENOMEM;

        entries = newinfo->entries;
        if (copy_from_sockptr_offset(entries, arg, sizeof(repl),
                        repl.size) != 0) {
                ret = -EFAULT;
                goto free_newinfo;
        }

        ret = translate_table(net, newinfo, entries, &repl);
        if (ret)
                goto free_newinfo;

        ret = __do_replace(net, repl.name, repl.valid_hooks, newinfo,
                           repl.num_counters, repl.counters);
        if (ret == 0)
                return 0;

        /* The rules were fully set up by translate_table(): undo that. */
        xt_entry_foreach(iter, entries, newinfo->size) {
                cleanup_entry(iter, net);
        }
 free_newinfo:
        xt_free_table_info(newinfo);
        return ret;
}

/*
 * Service IP6T_SO_SET_ADD_COUNTERS: add caller-supplied byte/packet deltas
 * to the counters of every rule in the named table.
 *
 * xt_copy_counters() fetches the request header and the delta array from
 * @arg. The table is looked up and locked, and the deltas are applied only
 * when the request's counter count equals the table's rule count.
 *
 * Returns 0 on success, the error from xt_copy_counters() or
 * xt_find_table_lock(), or -EINVAL on a counter-count mismatch.
 */
static int
do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
{
        const struct xt_table_info *private;
        struct xt_counters_info info;
        struct xt_counters *deltas;
        struct ip6t_entry *entry;
        struct xt_table *t;
        unsigned int addend;
        unsigned int idx = 0;
        int ret = 0;

        deltas = xt_copy_counters(arg, len, &info);
        if (IS_ERR(deltas))
                return PTR_ERR(deltas);

        t = xt_find_table_lock(net, AF_INET6, info.name);
        if (IS_ERR(t)) {
                ret = PTR_ERR(t);
                goto free;
        }

        local_bh_disable();
        private = t->private;
        if (private->number == info.num_counters) {
                /* One delta per rule, applied in rule order. */
                addend = xt_write_recseq_begin();
                xt_entry_foreach(entry, private->entries, private->size) {
                        struct xt_counters *cnt;

                        cnt = xt_get_this_cpu_counter(&entry->counters);
                        ADD_COUNTER(*cnt, deltas[idx].bcnt, deltas[idx].pcnt);
                        idx++;
                }
                xt_write_recseq_end(addend);
        } else {
                ret = -EINVAL;
        }
        local_bh_enable();
        xt_table_unlock(t);
        module_put(t->me);
 free:
        vfree(deltas);

        return ret;
}

#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
/*
 * Compat-layout header of a table replace request, built from fixed-width
 * u32 fields and a compat_uptr_t. compat_do_replace() copies it in from the
 * caller and translate_compat_table() derives a struct ip6t_replace from it.
 */
struct compat_ip6t_replace {
        /* Table name; compat_do_replace() forces NUL termination. */
        char                        name[XT_TABLE_MAXNAMELEN];
        /* Passed through to __do_replace(). */
        u32                        valid_hooks;
        /* Must equal the number of entries found while walking the blob. */
        u32                        num_entries;
        /* Size in bytes of the entry blob that follows the header. */
        u32                        size;
        /* Per-hook offsets, copied into the new table info. */
        u32                        hook_entry[NF_INET_NUMHOOKS];
        u32                        underflow[NF_INET_NUMHOOKS];
        /* Counter count; must be non-zero and below the overflow bound. */
        u32                        num_counters;
        compat_uptr_t                counters;        /* struct xt_counters * */
        struct compat_ip6t_entry entries[];
};

/*
 * Write one rule to user space in compat layout.
 *
 * @e:        native entry to export
 * @dstptr:   in/out user-space cursor; advanced past everything written
 * @size:     in/out running size, adjusted as entry, matches and target
 *            are converted
 * @counters: counter snapshot array
 * @i:        index of this entry's snapshot in @counters
 *
 * The entry header is copied first and its counters are then overwritten
 * from @counters[@i]. Once matches and target have been converted, the
 * compat target_offset/next_offset are stored based on how far *@size moved.
 *
 * Returns 0, -EFAULT on a failed user-space write, or the error from
 * xt_compat_match_to_user() / xt_compat_target_to_user().
 */
static int
compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr,
                          unsigned int *size, struct xt_counters *counters,
                          unsigned int i)
{
        struct compat_ip6t_entry __user *ce = *dstptr;
        const compat_uint_t start_size = *size;
        const struct xt_entry_match *m;
        u_int16_t target_off, next_off;
        int err;

        if (copy_to_user(ce, e, sizeof(struct ip6t_entry)))
                return -EFAULT;
        if (copy_to_user(&ce->counters, &counters[i], sizeof(counters[i])))
                return -EFAULT;

        *dstptr += sizeof(struct compat_ip6t_entry);
        *size -= sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);

        xt_ematch_foreach(m, e) {
                err = xt_compat_match_to_user(m, dstptr, size);
                if (err)
                        return err;
        }
        /* Offset of the target once the header and matches have moved. */
        target_off = e->target_offset - (start_size - *size);

        err = xt_compat_target_to_user(ip6t_get_target(e), dstptr, size);
        if (err)
                return err;
        next_off = e->next_offset - (start_size - *size);

        if (put_user(target_off, &ce->target_offset))
                return -EFAULT;
        if (put_user(next_off, &ce->next_offset))
                return -EFAULT;
        return 0;
}

/*
 * Resolve the match extension named in @m and account for its compat
 * size delta.
 *
 * On success the resolved match is stored in @m->u.kernel.match and
 * xt_compat_match_offset() for it is added to *@size. @ipv6 is not used by
 * the body.
 *
 * Returns 0, or the error encoded by xt_request_find_match().
 */
static int
compat_find_calc_match(struct xt_entry_match *m,
                       const struct ip6t_ip6 *ipv6,
                       int *size)
{
        struct xt_match *found = xt_request_find_match(NFPROTO_IPV6,
                                                       m->u.user.name,
                                                       m->u.user.revision);

        if (IS_ERR(found))
                return PTR_ERR(found);

        m->u.kernel.match = found;
        *size += xt_compat_match_offset(found);
        return 0;
}

static void compat_release_entry(struct compat_ip6t_entry *e)
{
        struct xt_entry_target *t;
        struct xt_entry_match *ematch;

        /* Cleanup all matches */
        xt_ematch_foreach(ematch, e)
                module_put(ematch->u.kernel.match->me);
        t = compat_ip6t_get_target(e);
        module_put(t->u.kernel.target->me);
}

/*
 * Validate one compat entry and record how much it changes size when
 * converted to native layout.
 *
 * @e:       compat entry to check
 * @newinfo: not referenced by the body
 * @size:    in/out running total; this entry's size delta is added
 * @base:    start of the compat blob, used to compute the entry offset
 * @limit:   one past the end of the compat blob
 *
 * The entry must be aligned, lie inside [@base, @limit), be large enough
 * for a header plus a target, pass ip6_checkentry() and
 * xt_compat_check_entry_offsets(). Its matches and target are then resolved
 * and the accumulated delta is registered with xt_compat_add_offset().
 *
 * Returns 0 on success. On failure every module reference taken here is
 * dropped again and a negative errno is returned.
 */
static int
check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
                                  struct xt_table_info *newinfo,
                                  unsigned int *size,
                                  const unsigned char *base,
                                  const unsigned char *limit)
{
        struct xt_entry_match *ematch;
        struct xt_entry_target *t;
        struct xt_target *target;
        unsigned int entry_offset;
        unsigned int held = 0;
        int ret, off;

        /* e->next_offset is only read once the header is known to fit. */
        if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0)
                return -EINVAL;
        if ((unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit)
                return -EINVAL;
        if ((unsigned char *)e + e->next_offset > limit)
                return -EINVAL;

        if (e->next_offset < sizeof(struct compat_ip6t_entry) +
                             sizeof(struct compat_xt_entry_target))
                return -EINVAL;

        if (!ip6_checkentry(&e->ipv6))
                return -EINVAL;

        ret = xt_compat_check_entry_offsets(e, e->elems,
                                            e->target_offset, e->next_offset);
        if (ret)
                return ret;

        off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
        entry_offset = (void *)e - (void *)base;

        /* 'held' counts the matches whose module reference we now own. */
        xt_ematch_foreach(ematch, e) {
                ret = compat_find_calc_match(ematch, &e->ipv6, &off);
                if (ret)
                        goto put_matches;
                held++;
        }

        t = compat_ip6t_get_target(e);
        target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name,
                                        t->u.user.revision);
        if (IS_ERR(target)) {
                ret = PTR_ERR(target);
                goto put_matches;
        }
        t->u.kernel.target = target;

        off += xt_compat_target_offset(target);
        *size += off;
        ret = xt_compat_add_offset(AF_INET6, entry_offset, off);
        if (!ret)
                return 0;

        module_put(t->u.kernel.target->me);
put_matches:
        xt_ematch_foreach(ematch, e) {
                if (held-- == 0)
                        break;
                module_put(ematch->u.kernel.match->me);
        }
        return ret;
}

/*
 * Convert one compat entry into native layout.
 *
 * @e:       compat entry to read
 * @dstptr:  in/out write cursor in the native blob; advanced past the
 *           converted entry
 * @size:    in/out running size, adjusted as header, matches and target
 *           are converted
 * @newinfo: table info whose hook_entry[]/underflow[] offsets are shifted
 * @base:    start of the native blob
 *
 * After conversion the entry's own target_offset/next_offset are rewritten,
 * and every hook/underflow offset that lies beyond this entry is moved by
 * the same amount *@size changed.
 */
static void
compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
                            unsigned int *size,
                            struct xt_table_info *newinfo, unsigned char *base)
{
        const unsigned int start_size = *size;
        struct ip6t_entry *de = *dstptr;
        struct xt_entry_match *m;
        unsigned int shift;
        int hook;

        /* Header first, then the counters at their native location. */
        memcpy(de, e, sizeof(struct ip6t_entry));
        memcpy(&de->counters, &e->counters, sizeof(e->counters));

        *dstptr += sizeof(struct ip6t_entry);
        *size += sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);

        xt_ematch_foreach(m, e)
                xt_compat_match_from_user(m, dstptr, size);

        de->target_offset = e->target_offset - (start_size - *size);
        xt_compat_target_from_user(compat_ip6t_get_target(e), dstptr, size);

        /* *size is final for this entry from here on. */
        shift = start_size - *size;
        de->next_offset = e->next_offset - shift;
        for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
                if ((unsigned char *)de - base < newinfo->hook_entry[hook])
                        newinfo->hook_entry[hook] -= shift;
                if ((unsigned char *)de - base < newinfo->underflow[hook])
                        newinfo->underflow[hook] -= shift;
        }
}

/*
 * Convert a compat-layout rule blob into a native table and validate it.
 *
 * @net:     network namespace, passed on to translate_table()
 * @pinfo:   in: table info holding the compat blob; out (on success): the
 *           newly allocated native table info (the old one is freed)
 * @pentry0: in: start of the compat blob; out (on success): start of the
 *           native blob
 * @compatr: compat replace header describing the blob
 *
 * Works in two passes under the AF_INET6 compat lock: the first checks every
 * entry and accumulates the native size, the second copies the entries into
 * a freshly allocated table info. The result is then validated with
 * translate_table() outside the compat lock.
 *
 * Returns 0 on success or a negative errno; on failure *pinfo and *pentry0
 * are left untouched.
 */
static int
translate_compat_table(struct net *net,
                       struct xt_table_info **pinfo,
                       void **pentry0,
                       const struct compat_ip6t_replace *compatr)
{
        unsigned int i, j;
        struct xt_table_info *newinfo, *info;
        void *pos, *entry0, *entry1;
        struct compat_ip6t_entry *iter0;
        struct ip6t_replace repl;
        unsigned int size;
        int ret;

        info = *pinfo;
        entry0 = *pentry0;
        size = compatr->size;
        info->number = compatr->num_entries;

        /* j counts the entries that passed the check and hold references. */
        j = 0;
        xt_compat_lock(AF_INET6);
        ret = xt_compat_init_offsets(AF_INET6, compatr->num_entries);
        if (ret)
                goto out_unlock;
        /* Walk through entries, checking offsets. */
        xt_entry_foreach(iter0, entry0, compatr->size) {
                ret = check_compat_entry_size_and_hooks(iter0, info, &size,
                                                        entry0,
                                                        entry0 + compatr->size);
                if (ret != 0)
                        goto out_unlock;
                ++j;
        }

        /* The header's entry count must match what the walk found. */
        ret = -EINVAL;
        if (j != compatr->num_entries)
                goto out_unlock;

        /* size now includes the per-entry deltas added during the walk. */
        ret = -ENOMEM;
        newinfo = xt_alloc_table_info(size);
        if (!newinfo)
                goto out_unlock;

        memset(newinfo->entries, 0, size);

        newinfo->number = compatr->num_entries;
        for (i = 0; i < NF_INET_NUMHOOKS; i++) {
                newinfo->hook_entry[i] = compatr->hook_entry[i];
                newinfo->underflow[i] = compatr->underflow[i];
        }
        entry1 = newinfo->entries;
        pos = entry1;
        /* Restart the running size from the compat size for the copy pass. */
        size = compatr->size;
        xt_entry_foreach(iter0, entry0, compatr->size)
                compat_copy_entry_from_user(iter0, &pos, &size,
                                            newinfo, entry1);

        /* all module references in entry0 are now gone. */
        xt_compat_flush_offsets(AF_INET6);
        xt_compat_unlock(AF_INET6);

        /*
         * NOTE(review): this copies the compat header bytes into the native
         * struct, so it relies on the leading fields of both structs sharing
         * a layout - confirm against struct ip6t_replace. The hook offsets,
         * counter fields and size are overwritten below.
         */
        memcpy(&repl, compatr, sizeof(*compatr));

        for (i = 0; i < NF_INET_NUMHOOKS; i++) {
                repl.hook_entry[i] = newinfo->hook_entry[i];
                repl.underflow[i] = newinfo->underflow[i];
        }

        repl.num_counters = 0;
        repl.counters = NULL;
        repl.size = newinfo->size;
        ret = translate_table(net, newinfo, entry1, &repl);
        if (ret)
                goto free_newinfo;

        /* Hand the native table back and drop the compat one. */
        *pinfo = newinfo;
        *pentry0 = entry1;
        xt_free_table_info(info);
        return 0;

free_newinfo:
        xt_free_table_info(newinfo);
        return ret;
out_unlock:
        xt_compat_flush_offsets(AF_INET6);
        xt_compat_unlock(AF_INET6);
        /* Release only the first j entries; later ones hold no references. */
        xt_entry_foreach(iter0, entry0, compatr->size) {
                if (j-- == 0)
                        break;
                compat_release_entry(iter0);
        }
        return ret;
}

/*
 * Service an IP6T_SO_SET_REPLACE request issued through a compat syscall.
 *
 * Same flow as the native handler, except that the header is a
 * struct compat_ip6t_replace, the blob is converted with
 * translate_compat_table() (which may swap in a new table info and entry
 * pointer), and the counters pointer is widened with compat_ptr().
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL  @len is too short for the header or for header + blob, or the
 *            request carries zero counters
 *   -EFAULT  copying from @arg failed
 *   -ENOMEM  the counter count is too large or the table info could not be
 *            allocated
 *   otherwise the error from translate_compat_table() or __do_replace().
 */
static int
compat_do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
        struct compat_ip6t_replace repl;
        struct xt_table_info *newinfo;
        struct ip6t_entry *entry;
        void *entries;
        int err;

        if (len < sizeof(repl))
                return -EINVAL;
        if (copy_from_sockptr(&repl, arg, sizeof(repl)))
                return -EFAULT;

        /* overflow check */
        if (repl.num_counters >= INT_MAX / sizeof(struct xt_counters))
                return -ENOMEM;
        if (!repl.num_counters)
                return -EINVAL;
        /* Widen to 64 bits so header + blob size cannot wrap. */
        if ((u64)len < (u64)repl.size + sizeof(repl))
                return -EINVAL;

        /* The name comes from the caller; force termination. */
        repl.name[sizeof(repl.name) - 1] = 0;

        newinfo = xt_alloc_table_info(repl.size);
        if (!newinfo)
                return -ENOMEM;

        /* The rule blob sits directly behind the header in @arg. */
        entries = newinfo->entries;
        if (copy_from_sockptr_offset(entries, arg, sizeof(repl), repl.size)) {
                err = -EFAULT;
                goto free_newinfo;
        }

        /* On success newinfo/entries refer to the converted table. */
        err = translate_compat_table(net, &newinfo, &entries, &repl);
        if (err)
                goto free_newinfo;

        err = __do_replace(net, repl.name, repl.valid_hooks, newinfo,
                           repl.num_counters, compat_ptr(repl.counters));
        if (!err)
                return 0;

        /* Translation succeeded, so every entry has to be cleaned up. */
        xt_entry_foreach(entry, entries, newinfo->size)
                cleanup_entry(entry, net);
 free_newinfo:
        xt_free_table_info(newinfo);
        return err;
}

/*
 * Compat-layout header of a "get entries" request; compat_get_entries()
 * copies it in from the caller and writes the rules to entrytable[].
 */
struct compat_ip6t_get_entries {
        /* Table to dump; compat_get_entries() forces NUL termination. */
        char name[XT_TABLE_MAXNAMELEN];
        /* Caller's idea of the entry blob size; checked against the table. */
        compat_uint_t size;
        struct compat_ip6t_entry entrytable[];
};

/*
 * Dump every rule of @table to user space in compat layout.
 *
 * @total_size: number of bytes of native entries to walk
 * @table:      table to dump; its counters are snapshotted first
 * @userptr:    destination in user space
 *
 * Returns 0, the error encoded by alloc_counters(), or the first error
 * from compat_copy_entry_to_user().
 */
static int
compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
                            void __user *userptr)
{
        const struct xt_table_info *private = table->private;
        struct xt_counters *snapshot;
        void __user *cursor = userptr;
        unsigned int remaining = total_size;
        struct ip6t_entry *entry;
        unsigned int idx = 0;
        int err = 0;

        snapshot = alloc_counters(table);
        if (IS_ERR(snapshot))
                return PTR_ERR(snapshot);

        xt_entry_foreach(entry, private->entries, total_size) {
                err = compat_copy_entry_to_user(entry, &cursor,
                                                &remaining, snapshot, idx);
                if (err)
                        break;
                idx++;
        }

        vfree(snapshot);
        return err;
}

/*
 * Service IP6T_SO_GET_ENTRIES for a compat caller.
 *
 * @uptr: user buffer beginning with a struct compat_ip6t_get_entries
 * @len:  total length of that buffer
 *
 * The buffer length must be exactly header + get.size, and get.size must
 * equal the compat size computed by compat_table_info() for the table.
 *
 * Returns 0 on success, -EINVAL for a bad length, -EFAULT if the header
 * cannot be read, -EAGAIN if the caller's size does not match the table,
 * or an error from the table lookup, compat_table_info() or the copy-out.
 */
static int
compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
                   int *len)
{
        struct compat_ip6t_get_entries get;
        const struct xt_table_info *private;
        struct xt_table_info info;
        struct xt_table *t;
        int ret;

        if (*len < sizeof(get))
                return -EINVAL;

        if (copy_from_user(&get, uptr, sizeof(get)))
                return -EFAULT;

        if (*len != sizeof(struct compat_ip6t_get_entries) + get.size)
                return -EINVAL;

        get.name[sizeof(get.name) - 1] = '\0';

        xt_compat_lock(AF_INET6);
        t = xt_find_table_lock(net, AF_INET6, get.name);
        if (IS_ERR(t)) {
                ret = PTR_ERR(t);
                goto unlock_compat;
        }

        private = t->private;
        ret = compat_table_info(private, &info);
        if (!ret) {
                if (get.size == info.size)
                        ret = compat_copy_entries_to_user(private->size,
                                                          t, uptr->entrytable);
                else
                        ret = -EAGAIN;
        }

        xt_compat_flush_offsets(AF_INET6);
        module_put(t->me);
        xt_table_unlock(t);

unlock_compat:
        xt_compat_unlock(AF_INET6);
        return ret;
}
#endif

/*
 * setsockopt() entry point for ip6tables.
 *
 * The caller needs CAP_NET_ADMIN in the user namespace of the socket's
 * network namespace. IP6T_SO_SET_REPLACE goes to the compat or native
 * replace handler, IP6T_SO_SET_ADD_COUNTERS to do_add_counters().
 *
 * Returns the handler's result, -EPERM without the capability, or -EINVAL
 * for an unknown @cmd.
 */
static int
do_ip6t_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len)
{
        struct net *net = sock_net(sk);

        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        switch (cmd) {
        case IP6T_SO_SET_REPLACE:
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
                if (in_compat_syscall())
                        return compat_do_replace(net, arg, len);
#endif
                return do_replace(net, arg, len);

        case IP6T_SO_SET_ADD_COUNTERS:
                return do_add_counters(net, arg, len);

        default:
                break;
        }

        return -EINVAL;
}

/*
 * getsockopt() entry point for ip6tables.
 *
 * The caller needs CAP_NET_ADMIN in the user namespace of the socket's
 * network namespace. Handles table info, entry dumps (compat or native)
 * and match/target revision queries.
 *
 * Returns the handler's result, -EPERM without the capability, -EINVAL for
 * an unknown @cmd or a wrongly sized revision request, or -EFAULT if the
 * revision request cannot be read.
 */
static int
do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
        struct net *net = sock_net(sk);
        int ret;

        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        switch (cmd) {
        case IP6T_SO_GET_INFO:
                return get_info(net, user, len);

        case IP6T_SO_GET_ENTRIES:
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
                if (in_compat_syscall())
                        return compat_get_entries(net, user, len);
#endif
                return get_entries(net, user, len);

        case IP6T_SO_GET_REVISION_MATCH:
        case IP6T_SO_GET_REVISION_TARGET: {
                struct xt_get_revision rev;
                const int want_target = (cmd == IP6T_SO_GET_REVISION_TARGET);

                if (*len != sizeof(rev))
                        return -EINVAL;
                if (copy_from_user(&rev, user, sizeof(rev)))
                        return -EFAULT;
                rev.name[sizeof(rev.name) - 1] = 0;

                /*
                 * The lookup reports its outcome through &ret; the value of
                 * the macro itself is not used.
                 */
                try_then_request_module(xt_find_revision(AF_INET6, rev.name,
                                                         rev.revision,
                                                         want_target, &ret),
                                        "ip6t_%s", rev.name);
                return ret;
        }

        default:
                break;
        }

        return -EINVAL;
}

/*
 * Unregister @table and release everything it owned: every entry is cleaned
 * up, the owner's module reference is dropped when the table holds more
 * rules than its initial set, and the table info is freed.
 */
static void __ip6t_unregister_table(struct net *net, struct xt_table *table)
{
        /*
         * The owner is read before xt_unregister_table() runs and @table is
         * not touched afterwards; presumably the table object is no longer
         * valid at that point - confirm against xt_unregister_table().
         */
        struct module *owner = table->me;
        struct xt_table_info *info = xt_unregister_table(table);
        struct ip6t_entry *entry;

        /* Decrease module usage counts and free resources */
        xt_entry_foreach(entry, info->entries, info->size)
                cleanup_entry(entry, net);
        if (info->number > info->initial_entries)
                module_put(owner);
        xt_free_table_info(info);
}

/*
 * Register an ip6tables table in @net and, optionally, its netfilter hooks.
 *
 * @net:          network namespace to register in
 * @table:        table template
 * @repl:         initial ruleset; its entries are copied and translated
 * @template_ops: hook ops to duplicate for this table, or NULL to register
 *                the table without hooks
 *
 * When @template_ops is given, one ops slot per bit set in
 * @table->valid_hooks is duplicated, each slot's priv is pointed at the new
 * table, and the hooks are registered. Any failure after the table has been
 * registered unregisters it again.
 *
 * Returns 0 on success or a negative errno.
 */
int ip6t_register_table(struct net *net, const struct xt_table *table,
                        const struct ip6t_replace *repl,
                        const struct nf_hook_ops *template_ops)
{
        struct xt_table_info bootstrap = {0};
        struct xt_table_info *newinfo;
        struct xt_table *new_table;
        struct nf_hook_ops *ops;
        unsigned int num_ops;
        void *entries;
        int err, i;

        newinfo = xt_alloc_table_info(repl->size);
        if (!newinfo)
                return -ENOMEM;

        entries = newinfo->entries;
        memcpy(entries, repl->entries, repl->size);

        err = translate_table(net, newinfo, entries, repl);
        if (err) {
                xt_free_table_info(newinfo);
                return err;
        }

        new_table = xt_register_table(net, table, &bootstrap, newinfo);
        if (IS_ERR(new_table)) {
                struct ip6t_entry *entry;

                /* Translation succeeded, so the entries need cleaning up. */
                xt_entry_foreach(entry, entries, newinfo->size)
                        cleanup_entry(entry, net);
                xt_free_table_info(newinfo);
                return PTR_ERR(new_table);
        }

        /* No hook template: only the table itself was requested. */
        if (!template_ops)
                return 0;

        num_ops = hweight32(table->valid_hooks);
        if (!num_ops) {
                err = -EINVAL;
                goto unregister;
        }

        ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL);
        if (!ops) {
                err = -ENOMEM;
                goto unregister;
        }

        for (i = 0; i < num_ops; i++)
                ops[i].priv = new_table;

        new_table->ops = ops;

        err = nf_register_net_hooks(net, ops, num_ops);
        if (!err)
                return 0;

unregister:
        __ip6t_unregister_table(net, new_table);
        return err;
}

/*
 * First teardown stage for the table called @name in @net: unregister its
 * netfilter hooks (one per bit set in valid_hooks). Does nothing when no
 * such table exists.
 */
void ip6t_unregister_table_pre_exit(struct net *net, const char *name)
{
        struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name);

        if (!table)
                return;

        nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
}

/*
 * Final teardown stage for the table called @name in @net: unregister the
 * table and free its resources. Does nothing when no such table exists.
 */
void ip6t_unregister_table_exit(struct net *net, const char *name)
{
        struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name);

        if (!table)
                return;

        __ip6t_unregister_table(net, table);
}

/* The built-in targets: standard (NULL) and error.
 *
 * Both are registered in ip6_tables_init() and unregistered in
 * ip6_tables_fini(). The standard target has no .target callback and, when
 * compat support is built in, carries its own compat size and converters.
 */
static struct xt_target ip6t_builtin_tg[] __read_mostly = {
        {
                .name             = XT_STANDARD_TARGET,
                .targetsize       = sizeof(int),
                .family           = NFPROTO_IPV6,
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
                .compatsize       = sizeof(compat_int_t),
                .compat_from_user = compat_standard_from_user,
                .compat_to_user   = compat_standard_to_user,
#endif
        },
        {
                .name             = XT_ERROR_TARGET,
                .target           = ip6t_error,
                .targetsize       = XT_FUNCTION_MAXNAMELEN,
                .family           = NFPROTO_IPV6,
        },
};

/*
 * Socket option registration for PF_INET6: routes the option ranges
 * starting at IP6T_BASE_CTL to do_ip6t_set_ctl() and do_ip6t_get_ctl().
 */
static struct nf_sockopt_ops ip6t_sockopts = {
        .pf                = PF_INET6,
        .set_optmin        = IP6T_BASE_CTL,
        .set_optmax        = IP6T_SO_SET_MAX+1,
        .set                = do_ip6t_set_ctl,
        .get_optmin        = IP6T_BASE_CTL,
        .get_optmax        = IP6T_SO_GET_MAX+1,
        .get                = do_ip6t_get_ctl,
        .owner                = THIS_MODULE,
};

/* Per-namespace init: set up the IPv6 x_tables protocol state for @net. */
static int __net_init ip6_tables_net_init(struct net *net)
{
        return xt_proto_init(net, NFPROTO_IPV6);
}

/* Per-namespace exit: undo ip6_tables_net_init() for @net. */
static void __net_exit ip6_tables_net_exit(struct net *net)
{
        xt_proto_fini(net, NFPROTO_IPV6);
}

/* Per-namespace hooks, registered by ip6_tables_init(). */
static struct pernet_operations ip6_tables_net_ops = {
        .init = ip6_tables_net_init,
        .exit = ip6_tables_net_exit,
};

/*
 * Module init: register the per-namespace ops, the built-in targets and the
 * socket option handlers, in that order. A failure undoes the steps already
 * completed, in reverse order, and returns the failing step's error.
 */
static int __init ip6_tables_init(void)
{
        int err;

        err = register_pernet_subsys(&ip6_tables_net_ops);
        if (err < 0)
                return err;

        /* No one else will be downing sem now, so we won't sleep */
        err = xt_register_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg));
        if (err < 0)
                goto unreg_pernet;

        /* Register setsockopt */
        err = nf_register_sockopt(&ip6t_sockopts);
        if (err < 0)
                goto unreg_targets;

        return 0;

unreg_targets:
        xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg));
unreg_pernet:
        unregister_pernet_subsys(&ip6_tables_net_ops);
        return err;
}

/*
 * Module exit: tear down in the reverse of the order used by
 * ip6_tables_init() - socket options, then built-in targets, then the
 * per-namespace ops.
 */
static void __exit ip6_tables_fini(void)
{
        nf_unregister_sockopt(&ip6t_sockopts);

        xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg));
        unregister_pernet_subsys(&ip6_tables_net_ops);
}

/* Table registration API and the packet traversal routine, for other modules. */
EXPORT_SYMBOL(ip6t_register_table);
EXPORT_SYMBOL(ip6t_unregister_table_pre_exit);
EXPORT_SYMBOL(ip6t_unregister_table_exit);
EXPORT_SYMBOL(ip6t_do_table);

/* Module entry and exit points. */
module_init(ip6_tables_init);
module_exit(ip6_tables_fini);

































    1 










    1 






    1 



    1 







    1 









    1 











    1 

    1 




























    1 








    1 















    1 
















    1 





    1 


















    1 












    1 

























    1 


    1 















    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2002 International Business Machines, Corp.
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions are the methods for accessing the SCTP inqueue.
 *
 * An SCTP inqueue is a queue into which you push SCTP packets
 * (which might be bundles or fragments of chunks) and out of which you
 * pop SCTP whole chunks.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson <karl@athena.chicago.il.us>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <linux/interrupt.h>
#include <linux/slab.h>

/* Initialize an SCTP inqueue: no packet in progress, an empty chunk list,
 * and a work item without a handler.
 */
void sctp_inq_init(struct sctp_inq *queue)
{
        queue->in_progress = NULL;
        INIT_LIST_HEAD(&queue->in_chunk_list);

        /* Create a task for delivering data; sctp_inq_set_th_handler()
         * re-initializes it with the real callback.
         */
        INIT_WORK(&queue->immediate, NULL);
}

/* Release the chunk that is being worked on. When the chunk has a head skb,
 * chunk->skb is pointed back at it before the chunk is freed.
 */
static inline void sctp_inq_chunk_free(struct sctp_chunk *chunk)
{
        struct sk_buff *head = chunk->head_skb;

        if (head)
                chunk->skb = head;
        sctp_chunk_free(chunk);
}

/* Release the memory associated with an SCTP inqueue.  */
void sctp_inq_free(struct sctp_inq *queue)
{
        struct sctp_chunk *chunk, *tmp;

        /* Empty the queue.  */
        list_for_each_entry_safe(chunk, tmp, &queue->in_chunk_list, list) {
                list_del_init(&chunk->list);
                sctp_chunk_free(chunk);
        }

        /* If there is a packet which is currently being worked on,
         * free it as well.
         */
        if (queue->in_progress) {
                sctp_inq_chunk_free(queue->in_progress);
                queue->in_progress = NULL;
        }
}

/* Put a new packet in an SCTP inqueue.
 * We assume that packet->sctp_hdr is set and in host byte order.
 */
void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *chunk)
{
        /* Directly call the packet handling routine. */
        if (chunk->rcvr->dead) {
                sctp_chunk_free(chunk);
                return;
        }

        /* We are now calling this either from the soft interrupt
         * or from the backlog processing.
         * Eventually, we should clean up inqueue to not rely
         * on the BH related data structures.
         */
        list_add_tail(&chunk->list, &q->in_chunk_list);
        if (chunk->asoc)
                chunk->asoc->stats.ipackets++;
        q->immediate.func(&q->immediate);
}

/* Peek at the next chunk on the inqueue.
 *
 * Returns the header located at the end of the chunk currently in progress,
 * or NULL when that chunk is the last one of its packet (singleton, end of
 * packet, or marked for discard). queue->in_progress is dereferenced
 * without a NULL check.
 */
struct sctp_chunkhdr *sctp_inq_peek(struct sctp_inq *queue)
{
        struct sctp_chunk *cur = queue->in_progress;

        /* If there is no more chunks in this packet, say so */
        if (cur->singleton || cur->end_of_packet || cur->pdiscard)
                return NULL;

        return (struct sctp_chunkhdr *)cur->chunk_end;
}


/* Extract a chunk from an SCTP inqueue.
 *
 * Returns the next chunk to process, or NULL when the queue is empty.
 * Successive calls walk the chunks bundled in the packet in progress; once
 * that packet (including any further skbs reachable through the head skb's
 * frag_list) is exhausted, the next packet is dequeued.
 *
 * On return the chunk's chunk_hdr/chunk_end describe the current chunk and
 * exactly one of these holds: singleton cleared (more chunks follow),
 * pdiscard set (the chunk claims to run past the skb tail), or
 * end_of_packet set.
 *
 * WARNING:  If you need to put the chunk on another queue, you need to
 * make a shallow copy (clone) of it.
 */
struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue)
{
        struct sctp_chunk *chunk;
        struct sctp_chunkhdr *ch = NULL;

        /* The assumption is that we are safe to process the chunks
         * at this time.
         */

        chunk = queue->in_progress;
        if (chunk) {
                /* There is a packet that we have been working on.
                 * Any post processing work to do before we move on?
                 */
                if (chunk->singleton ||
                    chunk->end_of_packet ||
                    chunk->pdiscard) {
                        /* Done with this skb; move on to the next one of
                         * the same packet, if there is one.
                         */
                        if (chunk->head_skb == chunk->skb) {
                                chunk->skb = skb_shinfo(chunk->skb)->frag_list;
                                goto new_skb;
                        }
                        if (chunk->skb->next) {
                                chunk->skb = chunk->skb->next;
                                goto new_skb;
                        }

                        sctp_inq_chunk_free(chunk);
                        chunk = queue->in_progress = NULL;
                } else {
                        /* Nothing to do. Next chunk in the packet, please. */
                        ch = (struct sctp_chunkhdr *)chunk->chunk_end;
                        /* Force chunk->skb->data to chunk->chunk_end.  */
                        skb_pull(chunk->skb, chunk->chunk_end - chunk->skb->data);
                        /* We are guaranteed to pull a SCTP header. */
                }
        }

        /* Do we need to take the next packet out of the queue to process? */
        if (!chunk) {
                struct list_head *entry;

next_chunk:
                /* Is the queue empty?  */
                entry = sctp_list_dequeue(&queue->in_chunk_list);
                if (!entry)
                        return NULL;

                chunk = list_entry(entry, struct sctp_chunk, list);

                if (skb_is_gso(chunk->skb) && skb_is_gso_sctp(chunk->skb)) {
                        /* GSO-marked skbs but without frags, handle
                         * them normally
                         */
                        if (skb_shinfo(chunk->skb)->frag_list)
                                chunk->head_skb = chunk->skb;

                        /* skbs with "cover letter" */
                        if (chunk->head_skb &&
                            chunk->skb->data_len == chunk->skb->len) {
                                /* Check the fragment list before switching
                                 * chunk->skb to it: once chunk->skb is NULL
                                 * it can no longer be used to reach the
                                 * device for the discard statistic, and the
                                 * chunk must be freed while it still owns a
                                 * valid skb.
                                 */
                                if (WARN_ON(!skb_shinfo(chunk->skb)->frag_list)) {
                                        __SCTP_INC_STATS(dev_net(chunk->skb->dev),
                                                         SCTP_MIB_IN_PKT_DISCARDS);
                                        sctp_chunk_free(chunk);
                                        goto next_chunk;
                                }
                                chunk->skb = skb_shinfo(chunk->skb)->frag_list;
                        }
                }

                if (chunk->asoc)
                        sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb);

                queue->in_progress = chunk;

new_skb:
                /* This is the first chunk in the packet.  */
                ch = (struct sctp_chunkhdr *)chunk->skb->data;
                chunk->singleton = 1;
                chunk->data_accepted = 0;
                chunk->pdiscard = 0;
                chunk->auth = 0;
                chunk->has_asconf = 0;
                chunk->end_of_packet = 0;
                if (chunk->head_skb) {
                        /* Carry the head skb's control block fields over to
                         * the skb now being processed.
                         */
                        struct sctp_input_cb
                                *cb = SCTP_INPUT_CB(chunk->skb),
                                *head_cb = SCTP_INPUT_CB(chunk->head_skb);

                        cb->chunk = head_cb->chunk;
                        cb->af = head_cb->af;
                }
        }

        chunk->chunk_hdr = ch;
        chunk->chunk_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length));
        skb_pull(chunk->skb, sizeof(*ch));
        chunk->subh.v = NULL; /* Subheader is no longer valid.  */

        if (chunk->chunk_end + sizeof(*ch) <= skb_tail_pointer(chunk->skb)) {
                /* This is not a singleton */
                chunk->singleton = 0;
        } else if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) {
                /* Discard inside state machine. */
                chunk->pdiscard = 1;
                chunk->chunk_end = skb_tail_pointer(chunk->skb);
        } else {
                /* We are at the end of the packet, so mark the chunk
                 * in case we need to send a SACK.
                 */
                chunk->end_of_packet = 1;
        }

        pr_debug("+++sctp_inq_pop+++ chunk:%p[%s], length:%d, skb->len:%d\n",
                 chunk, sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)),
                 ntohs(chunk->chunk_hdr->length), chunk->skb->len);

        return chunk;
}

/* Set a top-half handler.
 *
 * Originally, the top-half handler was scheduled as a BH.  We now
 * call the handler directly in sctp_inq_push() at a time that
 * we know we are lock safe.
 * The intent is that this routine will pull stuff out of the
 * inqueue and process it.
 *
 * @callback is installed by re-initializing the queue's work item.
 */
void sctp_inq_set_th_handler(struct sctp_inq *q, work_func_t callback)
{
        INIT_WORK(&q->immediate, callback);
}





































































































































































































































































    2 











































































    1 




















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_MM_H
#define _LINUX_SCHED_MM_H

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/gfp.h>
#include <linux/sync_core.h>
#include <linux/sched/coredump.h>

/*
 * Routines for handling mm_structs
 */
extern struct mm_struct *mm_alloc(void);

/**
 * mmgrab() - Pin a &struct mm_struct.
 * @mm: The &struct mm_struct to pin.
 *
 * Make sure that @mm will not get freed even after the owning task
 * exits. This doesn't guarantee that the associated address space
 * will still exist later on and mmget_not_zero() has to be used before
 * accessing it.
 *
 * This is a preferred way to pin @mm for a longer/unbounded amount
 * of time.
 *
 * Use mmdrop() to release the reference acquired by mmgrab().
 *
 * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
 * of &mm_struct.mm_count vs &mm_struct.mm_users.
 */
static inline void mmgrab(struct mm_struct *mm)
{
        /* mm_count pins the structure itself; mm_users (see mmget()) is separate. */
        atomic_inc(&mm->mm_count);
}

/*
 * Barrier to issue after mmgrab(). mmgrab() is a plain atomic_inc(), so this
 * is implemented as smp_mb__after_atomic().
 */
static inline void smp_mb__after_mmgrab(void)
{
        smp_mb__after_atomic();
}

extern void __mmdrop(struct mm_struct *mm);

/**
 * mmdrop() - Release a reference acquired by mmgrab().
 * @mm: The &struct mm_struct to unpin.
 *
 * Decrements &mm_struct.mm_count and passes @mm to __mmdrop() once the
 * count reaches zero.
 */
static inline void mmdrop(struct mm_struct *mm)
{
        /*
         * atomic_dec_and_test() implies a full barrier. The membarrier
         * system call requires that barrier before returning to
         * user-space, after storing to rq->curr.
         */
        if (likely(!atomic_dec_and_test(&mm->mm_count)))
                return;

        __mmdrop(mm);
}

#ifdef CONFIG_PREEMPT_RT
/*
 * call_rcu() callback performing the deferred mm drop. This is not strictly
 * an RCU use case, but call_rcu() is by far the least expensive way to
 * delay the work.
 *
 * @rhp: the delayed_drop rcu_head embedded in the mm_struct to drop.
 */
static inline void __mmdrop_delayed(struct rcu_head *rhp)
{
        /* Recover the enclosing mm_struct from its embedded rcu_head. */
        __mmdrop(container_of(rhp, struct mm_struct, delayed_drop));
}

/*
 * Invoked from finish_task_switch(). Delegates the heavy lifting on RT
 * kernels via RCU.
 */
static inline void mmdrop_sched(struct mm_struct *mm)
{
        /* Provides a full memory barrier. See mmdrop() */
        if (atomic_dec_and_test(&mm->mm_count))
                call_rcu(&mm->delayed_drop, __mmdrop_delayed);
}
#else
/* !CONFIG_PREEMPT_RT: no deferral, this is simply mmdrop(). */
static inline void mmdrop_sched(struct mm_struct *mm)
{
        mmdrop(mm);
}
#endif

/* Helpers for lazy TLB mm refcounting */
/*
 * Take an mm_count reference for a lazy TLB user of @mm. Does nothing unless
 * CONFIG_MMU_LAZY_TLB_REFCOUNT is enabled.
 */
static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
                return;

        mmgrab(mm);
}

/*
 * Drop a lazy TLB reference on @mm. When lazy TLB references are not
 * counted, only the memory barrier that mmdrop() would have implied is
 * issued.
 */
static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
                /*
                 * mmdrop_lazy_tlb must provide a full memory barrier, see
                 * the membarrier comment in finish_task_switch which relies
                 * on this.
                 */
                smp_mb();
                return;
        }

        mmdrop(mm);
}

/*
 * Variant of mmdrop_lazy_tlb() that drops the reference through
 * mmdrop_sched() when lazy TLB references are counted.
 */
static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
                smp_mb(); /* see mmdrop_lazy_tlb() above */
                return;
        }

        mmdrop_sched(mm);
}

/**
 * mmget() - Pin the address space associated with a &struct mm_struct.
 * @mm: The address space to pin.
 *
 * Make sure that the address space of the given &struct mm_struct doesn't
 * go away. This does not protect against parts of the address space being
 * modified or freed, however.
 *
 * Never use this function to pin this address space for an
 * unbounded/indefinite amount of time.
 *
 * Use mmput() to release the reference acquired by mmget().
 *
 * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
 * of &mm_struct.mm_count vs &mm_struct.mm_users.
 */
static inline void mmget(struct mm_struct *mm)
{
        /* Unconditional increment; mmget_not_zero() is the conditional form. */
        atomic_inc(&mm->mm_users);
}

/**
 * mmget_not_zero() - Conditionally take a &mm_struct.mm_users reference.
 * @mm: The address space to pin.
 *
 * Return: the result of atomic_inc_not_zero() on &mm_struct.mm_users, i.e.
 * true when a reference was taken and false when the count was already zero.
 */
static inline bool mmget_not_zero(struct mm_struct *mm)
{
        return atomic_inc_not_zero(&mm->mm_users);
}

/* mmput gets rid of the mappings and all user-space */
extern void mmput(struct mm_struct *);
#ifdef CONFIG_MMU
/* same as above but performs the slow path from the async context. Can
 * be called from the atomic context as well
 */
void mmput_async(struct mm_struct *);
#endif

/* Grab a reference to a task's mm, if it is not already going away */
extern struct mm_struct *get_task_mm(struct task_struct *task);
/*
 * Grab a reference to a task's mm, if it is not already going away
 * and ptrace_may_access with the mode parameter passed to it
 * succeeds.
 */
extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
/* Remove the current tasks stale references to the old mm_struct on exit() */
extern void exit_mm_release(struct task_struct *, struct mm_struct *);
/* Remove the current tasks stale references to the old mm_struct on exec() */
extern void exec_mm_release(struct task_struct *, struct mm_struct *);

#ifdef CONFIG_MEMCG
extern void mm_update_next_owner(struct mm_struct *mm);
#else
/* !CONFIG_MEMCG: empty stub with the same signature as the extern above. */
static inline void mm_update_next_owner(struct mm_struct *mm)
{
}
#endif /* CONFIG_MEMCG */

#ifdef CONFIG_MMU
#ifndef arch_get_mmap_end
#define arch_get_mmap_end(addr, len, flags)        (TASK_SIZE)
#endif

#ifndef arch_get_mmap_base
#define arch_get_mmap_base(addr, base) (base)
#endif

extern void arch_pick_mmap_layout(struct mm_struct *mm,
                                  struct rlimit *rlim_stack);
extern unsigned long
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
                       unsigned long, unsigned long);
extern unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                          unsigned long len, unsigned long pgoff,
                          unsigned long flags);

unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp,
                                   unsigned long addr, unsigned long len,
                                   unsigned long pgoff, unsigned long flags);

unsigned long
arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
                               unsigned long len, unsigned long pgoff,
                               unsigned long flags, vm_flags_t vm_flags);
unsigned long
arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr,
                                       unsigned long len, unsigned long pgoff,
                                       unsigned long flags, vm_flags_t);

unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm,
                                           struct file *filp,
                                           unsigned long addr,
                                           unsigned long len,
                                           unsigned long pgoff,
                                           unsigned long flags,
                                           vm_flags_t vm_flags);

unsigned long
generic_get_unmapped_area(struct file *filp, unsigned long addr,
                          unsigned long len, unsigned long pgoff,
                          unsigned long flags);
unsigned long
generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                                  unsigned long len, unsigned long pgoff,
                                  unsigned long flags);
#else
/* !CONFIG_MMU: empty stub matching the CONFIG_MMU prototype. */
static inline void arch_pick_mmap_layout(struct mm_struct *mm,
                                         struct rlimit *rlim_stack) {}
#endif

static inline bool in_vfork(struct task_struct *tsk)
{
        bool ret;

        /*
         * need RCU to access ->real_parent if CLONE_VM was used along with
         * CLONE_PARENT.
         *
         * We check real_parent->mm == tsk->mm because CLONE_VFORK does not
         * imply CLONE_VM
         *
         * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus
         * ->real_parent is not necessarily the task doing vfork(), so in
         * theory we can't rely on task_lock() if we want to dereference it.
         *
         * And in this case we can't trust the real_parent->mm == tsk->mm
         * check, it can be false negative. But we do not care, if init or
         * another oom-unkillable task does this it should blame itself.
         */
        rcu_read_lock();
        ret = tsk->vfork_done &&
                        rcu_dereference(tsk->real_parent)->mm == tsk->mm;
        rcu_read_unlock();

        return ret;
}

/*
 * Applies per-task gfp context to the given allocation flags.
 * PF_MEMALLOC_NORECLAIM clears __GFP_DIRECT_RECLAIM
 * PF_MEMALLOC_NOIO      implies GFP_NOIO
 * PF_MEMALLOC_NOFS      implies GFP_NOFS
 * PF_MEMALLOC_NOWARN    sets __GFP_NOWARN
 * PF_MEMALLOC_PIN       implies !GFP_MOVABLE
 *
 * Returns @flags adjusted for the scopes active in current->flags.
 */
static inline gfp_t current_gfp_context(gfp_t flags)
{
        unsigned int pflags = READ_ONCE(current->flags);
        unsigned int scoped = pflags & (PF_MEMALLOC_NOIO |
                                        PF_MEMALLOC_NOFS |
                                        PF_MEMALLOC_NORECLAIM |
                                        PF_MEMALLOC_NOWARN |
                                        PF_MEMALLOC_PIN);

        /* Expected case: none of the scope flags is set on this task. */
        if (likely(!scoped))
                return flags;

        /*
         * Stronger flags before weaker flags:
         * NORECLAIM implies NOIO, which in turn implies NOFS
         */
        if (scoped & PF_MEMALLOC_NORECLAIM)
                flags &= ~__GFP_DIRECT_RECLAIM;
        else if (scoped & PF_MEMALLOC_NOIO)
                flags &= ~(__GFP_IO | __GFP_FS);
        else if (scoped & PF_MEMALLOC_NOFS)
                flags &= ~__GFP_FS;

        if (scoped & PF_MEMALLOC_NOWARN)
                flags |= __GFP_NOWARN;

        if (scoped & PF_MEMALLOC_PIN)
                flags &= ~__GFP_MOVABLE;

        return flags;
}

#ifdef CONFIG_LOCKDEP
extern void __fs_reclaim_acquire(unsigned long ip);
extern void __fs_reclaim_release(unsigned long ip);
extern void fs_reclaim_acquire(gfp_t gfp_mask);
extern void fs_reclaim_release(gfp_t gfp_mask);
#else
/* !CONFIG_LOCKDEP: the fs_reclaim annotations are empty inline stubs. */
static inline void __fs_reclaim_acquire(unsigned long ip) { }
static inline void __fs_reclaim_release(unsigned long ip) { }
static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
static inline void fs_reclaim_release(gfp_t gfp_mask) { }
#endif

/* Any memory-allocation retry loop should use
 * memalloc_retry_wait(), and pass the flags for the most
 * constrained allocation attempt that might have failed.
 * This provides useful documentation of where loops are,
 * and a central place to fine tune the waiting as the MM
 * implementation changes.
 *
 * @gfp_flags is first filtered through current_gfp_context(); the length
 * of the wait depends on whether the resulting flags allowed blocking.
 */
static inline void memalloc_retry_wait(gfp_t gfp_flags)
{
        long timeout;

        /* We use io_schedule_timeout because waiting for memory
         * typically included waiting for dirty pages to be
         * written out, which requires IO.
         */
        __set_current_state(TASK_UNINTERRUPTIBLE);
        gfp_flags = current_gfp_context(gfp_flags);

        if (!gfpflags_allow_blocking(gfp_flags) ||
            (gfp_flags & __GFP_NORETRY))
                /* Probably didn't wait, and has now released a lock,
                 * so now is a good time to wait
                 */
                timeout = HZ/50;
        else
                /* Probably waited already, no need for much more */
                timeout = 1;

        io_schedule_timeout(timeout);
}

/**
 * might_alloc - Mark possible allocation sites
 * @gfp_mask: gfp_t flags that would be used to allocate
 *
 * Like might_sleep() and similar annotations, this is meant for functions
 * that might allocate but often don't. Compiles to nothing without
 * CONFIG_LOCKDEP. Includes a conditional might_sleep() if @gfp_mask allows
 * blocking.
 */
static inline void might_alloc(gfp_t gfp_mask)
{
        /* Acquire and immediately release the fs_reclaim annotation. */
        fs_reclaim_acquire(gfp_mask);
        fs_reclaim_release(gfp_mask);

        if (gfpflags_allow_blocking(gfp_mask))
                might_sleep();
}

/**
 * memalloc_flags_save - Add a PF_* flag to current->flags, save old value
 * @flags: PF_* bits to set on the current task.
 *
 * This allows PF_* flags to be conveniently added, irrespective of current
 * value, and then the old version restored with memalloc_flags_restore().
 *
 * Return: the subset of @flags that was not already set in current->flags,
 * i.e. exactly the bits memalloc_flags_restore() must clear again.
 */
static inline unsigned memalloc_flags_save(unsigned flags)
{
        unsigned newly_set = flags & ~current->flags;

        current->flags |= flags;

        return newly_set;
}

/**
 * memalloc_flags_restore - Undo a memalloc_flags_save()
 * @flags: value returned by the pairing memalloc_flags_save() call.
 *
 * Clears the bits in @flags from current->flags. memalloc_flags_save()
 * returns only the bits it newly set, so flags that were already set
 * before the save remain set.
 */
static inline void memalloc_flags_restore(unsigned flags)
{
        current->flags &= ~flags;
}

/**
 * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope.
 *
 * This function marks the beginning of the GFP_NOIO allocation scope.
 * All further allocations will implicitly drop __GFP_IO flag and so
 * they are safe for the IO critical section from the allocation recursion
 * point of view. Use memalloc_noio_restore to end the scope with flags
 * returned by this function.
 *
 * Context: This function is safe to be used from any context.
 * Return: The saved flags to be passed to memalloc_noio_restore.
 */
static inline unsigned int memalloc_noio_save(void)
{
        /* Sets PF_MEMALLOC_NOIO on current; returns it only if newly set. */
        return memalloc_flags_save(PF_MEMALLOC_NOIO);
}

/**
 * memalloc_noio_restore - Ends the implicit GFP_NOIO scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit GFP_NOIO scope started by memalloc_noio_save function.
 * Always make sure that the given flags is the return value from the
 * pairing memalloc_noio_save call.
 */
static inline void memalloc_noio_restore(unsigned int flags)
{
        /* Clears from current->flags exactly the bits recorded in @flags. */
        memalloc_flags_restore(flags);
}

/**
 * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope.
 *
 * This function marks the beginning of the GFP_NOFS allocation scope.
 * All further allocations will implicitly drop __GFP_FS flag and so
 * they are safe for the FS critical section from the allocation recursion
 * point of view. Use memalloc_nofs_restore to end the scope with flags
 * returned by this function.
 *
 * Context: This function is safe to be used from any context.
 * Return: The saved flags to be passed to memalloc_nofs_restore.
 */
static inline unsigned int memalloc_nofs_save(void)
{
        /* Sets PF_MEMALLOC_NOFS on current; returns it only if newly set. */
        return memalloc_flags_save(PF_MEMALLOC_NOFS);
}

/**
 * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save function.
 * Always make sure that the given flags is the return value from the
 * pairing memalloc_nofs_save call.
 */
static inline void memalloc_nofs_restore(unsigned int flags)
{
        /* Clears from current->flags exactly the bits recorded in @flags. */
        memalloc_flags_restore(flags);
}

/**
 * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope.
 *
 * This function marks the beginning of the __GFP_MEMALLOC allocation scope.
 * All further allocations will implicitly add the __GFP_MEMALLOC flag, which
 * prevents entering reclaim and allows access to all memory reserves. This
 * should only be used when the caller guarantees the allocation will allow more
 * memory to be freed very shortly, i.e. it needs to allocate some memory in
 * the process of freeing memory, and cannot reclaim due to potential recursion.
 *
 * Users of this scope have to be extremely careful to not deplete the reserves
 * completely and implement a throttling mechanism which controls the
 * consumption of the reserve based on the amount of freed memory. Usage of a
 * pre-allocated pool (e.g. mempool) should be always considered before using
 * this scope.
 *
 * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC
 *
 * Context: This function should not be used in an interrupt context as that one
 *          does not give PF_MEMALLOC access to reserves.
 *          See __gfp_pfmemalloc_flags().
 * Return: The saved flags to be passed to memalloc_noreclaim_restore.
 */
static inline unsigned int memalloc_noreclaim_save(void)
{
        /* The scope is carried by the PF_MEMALLOC task flag. */
        return memalloc_flags_save(PF_MEMALLOC);
}

/**
 * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit __GFP_MEMALLOC scope started by memalloc_noreclaim_save
 * function. Always make sure that the given flags is the return value from the
 * pairing memalloc_noreclaim_save call.
 */
static inline void memalloc_noreclaim_restore(unsigned int flags)
{
        /* Clears from current->flags exactly the bits recorded in @flags. */
        memalloc_flags_restore(flags);
}

/**
 * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope.
 *
 * This function marks the beginning of the ~__GFP_MOVABLE allocation scope.
 * All further allocations will implicitly remove the __GFP_MOVABLE flag, which
 * will constrain the allocations to zones that allow long term pinning, i.e.
 * not ZONE_MOVABLE zones.
 *
 * Return: The saved flags to be passed to memalloc_pin_restore.
 */
static inline unsigned int memalloc_pin_save(void)
{
        /* Sets PF_MEMALLOC_PIN on current; returns it only if newly set. */
        return memalloc_flags_save(PF_MEMALLOC_PIN);
}

/**
 * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit ~__GFP_MOVABLE scope started by memalloc_pin_save function.
 * Always make sure that the given flags is the return value from the pairing
 * memalloc_pin_save call.
 */
static inline void memalloc_pin_restore(unsigned int flags)
{
        /* Clears from current->flags exactly the bits recorded in @flags. */
        memalloc_flags_restore(flags);
}

#ifdef CONFIG_MEMCG
DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
/**
 * set_active_memcg - Starts the remote memcg charging scope.
 * @memcg: memcg to charge.
 *
 * Marks the beginning of the remote memcg charging scope. Every
 * __GFP_ACCOUNT allocation made until the end of the scope is charged to
 * the given memcg.
 *
 * The caller must hold a reference to the passed memcg structure so that
 * its lifetime is guaranteed to exceed the scope between two
 * set_active_memcg() calls.
 *
 * NOTE: This function can nest. Users must save the return value and
 * reset the previous value after their own charging scope is over.
 *
 * Return: the previously active memcg, taken from current->active_memcg
 * when in_task() and from the per-CPU int_active_memcg otherwise.
 */
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
        struct mem_cgroup *prev;

        if (in_task()) {
                prev = current->active_memcg;
                current->active_memcg = memcg;
                return prev;
        }

        /* Not in task context: use the per-CPU slot instead. */
        prev = this_cpu_read(int_active_memcg);
        this_cpu_write(int_active_memcg, memcg);

        return prev;
}
#else
/* !CONFIG_MEMCG: nothing is recorded; always reports NULL as the old memcg. */
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
        return NULL;
}
#endif

#ifdef CONFIG_MEMBARRIER
/*
 * Single-bit flags. membarrier_mm_sync_core_before_usermode() tests
 * MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE against the value read from
 * mm->membarrier_state.
 */
enum {
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY                = (1U << 0),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED                        = (1U << 1),
        MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY                        = (1U << 2),
        MEMBARRIER_STATE_GLOBAL_EXPEDITED                        = (1U << 3),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY        = (1U << 4),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE                = (1U << 5),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY                = (1U << 6),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ                        = (1U << 7),
};

/*
 * Single-bit membarrier flags, one bit each.
 * NOTE(review): their users are not visible in this header.
 */
enum {
        MEMBARRIER_FLAG_SYNC_CORE        = (1U << 0),
        MEMBARRIER_FLAG_RSEQ                = (1U << 1),
};

#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
#include <asm/membarrier.h>
#endif

/*
 * Call sync_core_before_usermode() when @mm is the current task's mm and
 * has MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE set in its
 * membarrier_state; otherwise do nothing.
 */
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
        int state;

        if (current->mm != mm)
                return;

        state = atomic_read(&mm->membarrier_state);
        if (unlikely(state & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))
                sync_core_before_usermode();
}

extern void membarrier_exec_mmap(struct mm_struct *mm);

extern void membarrier_update_current_mm(struct mm_struct *next_mm);

#else
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
/*
 * !CONFIG_MEMBARRIER but CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS: empty stub
 * defined here because <asm/membarrier.h> is only included when
 * CONFIG_MEMBARRIER is set.
 */
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
                                             struct mm_struct *next,
                                             struct task_struct *tsk)
{
}
#endif
/* !CONFIG_MEMBARRIER: the membarrier hooks below are empty stubs. */
static inline void membarrier_exec_mmap(struct mm_struct *mm)
{
}
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
}
static inline void membarrier_update_current_mm(struct mm_struct *next_mm)
{
}
#endif

#endif /* _LINUX_SCHED_MM_H */
























































    1 



















    1 





    1 













    1 











    1 



    1 
    1 













    1 

















    1 


    1 



















































































































    1 
    1 












    1 


    1 








    1 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296

#include <linux/atomic.h>
#include <linux/export.h>
#include <linux/generic-radix-tree.h>
#include <linux/gfp.h>
#include <linux/kmemleak.h>

/* Number of child pointers that fit in one node. */
#define GENRADIX_ARY                (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
/* log2(GENRADIX_ARY): offset bits consumed by each interior level. */
#define GENRADIX_ARY_SHIFT        ilog2(GENRADIX_ARY)

/*
 * One tree node. The same storage is either an array of GENRADIX_ARY child
 * pointers (interior node) or GENRADIX_NODE_SIZE bytes of data (leaf).
 */
struct genradix_node {
        union {
                /* Interior node: */
                struct genradix_node        *children[GENRADIX_ARY];

                /* Leaf: */
                u8                        data[GENRADIX_NODE_SIZE];
        };
};

/*
 * Returns the number of offset bits a tree of the given depth covers:
 * GENRADIX_NODE_SHIFT for the leaf plus GENRADIX_ARY_SHIFT per interior
 * level.
 */
static inline int genradix_depth_shift(unsigned depth)
{
        return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
}

/*
 * Returns size (of data, in bytes) that a tree of a given depth holds:
 *
 * NOTE(review): the shift is only defined while genradix_depth_shift(depth)
 * is below BITS_PER_LONG. The callers in this file appear to call this only
 * with a depth they have already bounded via an ilog2() check or
 * decremented first - confirm before adding new callers.
 */
static inline size_t genradix_depth_size(unsigned depth)
{
        return 1UL << genradix_depth_shift(depth);
}

/* depth that's needed for a genradix that can address up to ULONG_MAX: */
#define GENRADIX_MAX_DEPTH        \
        DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)

/*
 * Mask selecting the low bits of a genradix_root value. Those bits hold the
 * tree depth and the remaining bits hold the root node pointer; see
 * genradix_root_to_depth() and genradix_root_to_node().
 */
#define GENRADIX_DEPTH_MASK                                \
        ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))

/* Extract the tree depth stored in the low bits of the root value @r. */
static inline unsigned genradix_root_to_depth(struct genradix_root *r)
{
        return (unsigned long) r & GENRADIX_DEPTH_MASK;
}

/* Strip the depth bits from the root value @r, leaving the root node pointer. */
static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
{
        return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
}

/*
 * Look up the byte at @offset within @radix.
 *
 * Returns a pointer to that byte, or NULL when @offset lies beyond what the
 * current tree depth can address or when a node on the path has not been
 * allocated. Never allocates.
 */
void *__genradix_ptr(struct __genradix *radix, size_t offset)
{
        struct genradix_root *root = READ_ONCE(radix->root);
        struct genradix_node *node = genradix_root_to_node(root);
        unsigned depth = genradix_root_to_depth(root);

        if (ilog2(offset) >= genradix_depth_shift(depth))
                return NULL;

        /* Walk down one level per iteration until a leaf or a hole. */
        while (node && depth) {
                depth--;

                node = node->children[offset >> genradix_depth_shift(depth)];
                offset &= genradix_depth_size(depth) - 1;
        }

        return node ? &node->data[offset] : NULL;
}
EXPORT_SYMBOL(__genradix_ptr);

/*
 * Allocate one zero-filled node of GENRADIX_NODE_SIZE bytes with kzalloc().
 * Returns NULL if the allocation fails.
 */
static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
{
        return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
}

/* Release a node obtained from genradix_alloc_node(). */
static inline void genradix_free_node(struct genradix_node *node)
{
        kfree(node);
}

/*
 * Returns pointer to the specified byte @offset within @radix, allocating it if
 * necessary - newly allocated slots are always zeroed out:
 *
 * @radix:    tree to look up / extend
 * @offset:   byte offset of the wanted slot
 * @gfp_mask: allocation flags used for any node that has to be allocated
 *
 * Returns NULL if a required node could not be allocated. Nodes are
 * published with cmpxchg_release(), so concurrent callers may race; the
 * loser of a race reuses or frees its preallocated node.
 */
void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
                           gfp_t gfp_mask)
{
        struct genradix_root *v = READ_ONCE(radix->root);
        struct genradix_node *n, *new_node = NULL;
        unsigned level;

        /* Increase tree depth if necessary: */
        while (1) {
                struct genradix_root *r = v, *new_root;

                n        = genradix_root_to_node(r);
                level        = genradix_root_to_depth(r);

                if (n && ilog2(offset) < genradix_depth_shift(level))
                        break;

                if (!new_node) {
                        new_node = genradix_alloc_node(gfp_mask);
                        if (!new_node)
                                return NULL;
                }

                /* The new root adopts the old root as its first child. */
                new_node->children[0] = n;
                new_root = ((struct genradix_root *)
                            ((unsigned long) new_node | (n ? level + 1 : 0)));

                if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) {
                        v = new_root;
                        new_node = NULL;
                } else {
                        /*
                         * Lost the race: another thread changed the root.
                         * The preallocated node may be reused below as a
                         * non-root node, so it must not keep a stale pointer
                         * to the old root - restore it to all-zero.
                         */
                        new_node->children[0] = NULL;
                }
        }

        /* Walk down to the leaf, filling in missing nodes on the way. */
        while (level--) {
                struct genradix_node **p =
                        &n->children[offset >> genradix_depth_shift(level)];
                offset &= genradix_depth_size(level) - 1;

                n = READ_ONCE(*p);
                if (!n) {
                        if (!new_node) {
                                new_node = genradix_alloc_node(gfp_mask);
                                if (!new_node)
                                        return NULL;
                        }

                        /*
                         * On success cmpxchg returns NULL: our node is now
                         * installed, so make it the current node and forget
                         * the preallocation. Otherwise n is the winner's
                         * node and new_node stays available for reuse.
                         */
                        if (!(n = cmpxchg_release(p, NULL, new_node)))
                                swap(n, new_node);
                }
        }

        /* A preallocated node that was never installed is released. */
        if (new_node)
                genradix_free_node(new_node);

        return &n->data[offset];
}
EXPORT_SYMBOL(__genradix_ptr_alloc);

/*
 * Forward iteration helper: returns a pointer to the data at @iter->offset
 * in the first allocated leaf at or after that offset, advancing @iter over
 * unallocated subtrees on the way. Returns NULL when the tree is empty, the
 * offset is beyond what the tree can address, or the iterator is exhausted.
 *
 * @objs_per_page is used to recompute @iter->pos from the byte offset each
 * time the iterator skips ahead.
 */
void *__genradix_iter_peek(struct genradix_iter *iter,
                           struct __genradix *radix,
                           size_t objs_per_page)
{
        struct genradix_root *r;
        struct genradix_node *n;
        unsigned level, i;

        /* SIZE_MAX marks an exhausted iterator (set below on wraparound). */
        if (iter->offset == SIZE_MAX)
                return NULL;

restart:
        r = READ_ONCE(radix->root);
        if (!r)
                return NULL;

        n        = genradix_root_to_node(r);
        level        = genradix_root_to_depth(r);

        /* Offset is past everything a tree of this depth can hold. */
        if (ilog2(iter->offset) >= genradix_depth_shift(level))
                return NULL;

        while (level) {
                level--;

                /* Child slot covering iter->offset at this level. */
                i = (iter->offset >> genradix_depth_shift(level)) &
                        (GENRADIX_ARY - 1);

                while (!n->children[i]) {
                        size_t objs_per_ptr = genradix_depth_size(level);

                        /* Advancing would wrap size_t: nothing more to visit. */
                        if (iter->offset + objs_per_ptr < iter->offset) {
                                iter->offset        = SIZE_MAX;
                                iter->pos        = SIZE_MAX;
                                return NULL;
                        }

                        /* Skip to the start of the next child's range. */
                        i++;
                        iter->offset = round_down(iter->offset + objs_per_ptr,
                                                  objs_per_ptr);
                        iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) *
                                objs_per_page;
                        /* Ran off this node: re-walk from the root. */
                        if (i == GENRADIX_ARY)
                                goto restart;
                }

                n = n->children[i];
        }

        return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)];
}
EXPORT_SYMBOL(__genradix_iter_peek);

/*
 * Backward iteration helper: returns a pointer to the data at @iter->offset
 * in the first allocated leaf at or before that offset, moving @iter
 * backwards over unallocated subtrees. Returns NULL when the tree is empty,
 * the iterator is exhausted (offset == SIZE_MAX), or the walk reaches
 * offset 0 without finding an allocated leaf.
 *
 * @objs_per_page is used to recompute @iter->pos from the byte offset.
 * @obj_size_plus_page_remainder is subtracted from the offset when stepping
 * back across a boundary - presumably the byte distance from a page start
 * back to the last object of the previous page; confirm against the caller.
 */
void *__genradix_iter_peek_prev(struct genradix_iter *iter,
                                struct __genradix *radix,
                                size_t objs_per_page,
                                size_t obj_size_plus_page_remainder)
{
        struct genradix_root *r;
        struct genradix_node *n;
        unsigned level, i;

        if (iter->offset == SIZE_MAX)
                return NULL;

restart:
        r = READ_ONCE(radix->root);
        if (!r)
                return NULL;

        n        = genradix_root_to_node(r);
        level        = genradix_root_to_depth(r);

        /*
         * Offset is past what the tree can address: move the iterator to
         * the end of the addressable range, then step back one object.
         */
        if (ilog2(iter->offset) >= genradix_depth_shift(level)) {
                iter->offset = genradix_depth_size(level);
                iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page;

                iter->offset -= obj_size_plus_page_remainder;
                iter->pos--;
        }

        while (level) {
                level--;

                /* Child slot covering iter->offset at this level. */
                i = (iter->offset >> genradix_depth_shift(level)) &
                        (GENRADIX_ARY - 1);

                while (!n->children[i]) {
                        size_t objs_per_ptr = genradix_depth_size(level);

                        /* Move to the start of this (empty) child's range. */
                        iter->offset = round_down(iter->offset, objs_per_ptr);
                        iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page;

                        /* Already at the very beginning: nothing earlier. */
                        if (!iter->offset)
                                return NULL;

                        /* Step back into the previous child's range. */
                        iter->offset -= obj_size_plus_page_remainder;
                        iter->pos--;

                        /* Slot 0 was empty: re-walk from the root. */
                        if (!i)
                                goto restart;
                        --i;
                }

                n = n->children[i];
        }

        return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)];
}
EXPORT_SYMBOL(__genradix_iter_peek_prev);

/*
 * Free @n together with everything below it.  @level is the height of @n
 * above the leaves; at level 0 there are no child pointers to follow.
 * Populated children are released before the node that references them.
 */
static void genradix_free_recurse(struct genradix_node *n, unsigned level)
{
        unsigned slot;

        for (slot = 0; level && slot < GENRADIX_ARY; slot++) {
                struct genradix_node *child = n->children[slot];

                if (!child)
                        continue;

                genradix_free_recurse(child, level - 1);
        }

        genradix_free_node(n);
}

/*
 * Allocate backing nodes for the byte range [0, @size) of @radix, one
 * __genradix_ptr_alloc() call per GENRADIX_NODE_SIZE step.
 *
 * Returns 0 on success, or -ENOMEM as soon as one allocation fails (nodes
 * allocated before the failure are left in place).
 */
int __genradix_prealloc(struct __genradix *radix, size_t size,
                        gfp_t gfp_mask)
{
        size_t pos = 0;

        while (pos < size) {
                if (!__genradix_ptr_alloc(radix, pos, gfp_mask))
                        return -ENOMEM;

                pos += GENRADIX_NODE_SIZE;
        }

        return 0;
}
EXPORT_SYMBOL(__genradix_prealloc);

/*
 * Detach the whole tree from @radix and free it.  The root pointer is
 * swapped for NULL first, so the freed nodes are no longer reachable
 * through @radix while they are being torn down.
 */
void __genradix_free(struct __genradix *radix)
{
        struct genradix_root *old_root;

        old_root = xchg(&radix->root, NULL);

        genradix_free_recurse(genradix_root_to_node(old_root),
                              genradix_root_to_depth(old_root));
}
EXPORT_SYMBOL(__genradix_free);


































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_SPINLOCK_H
#define __LINUX_SPINLOCK_H
#define __LINUX_INSIDE_SPINLOCK_H

/*
 * include/linux/spinlock.h - generic spinlock/rwlock declarations
 *
 * here's the role of the various spinlock/rwlock related include files:
 *
 * on SMP builds:
 *
 *  asm/spinlock_types.h: contains the arch_spinlock_t/arch_rwlock_t and the
 *                        initializers
 *
 *  linux/spinlock_types_raw.h:
 *                          The raw types and initializers
 *  linux/spinlock_types.h:
 *                        defines the generic type and initializers
 *
 *  asm/spinlock.h:       contains the arch_spin_*()/etc. lowlevel
 *                        implementations, mostly inline assembly code
 *
 *   (also included on UP-debug builds:)
 *
 *  linux/spinlock_api_smp.h:
 *                        contains the prototypes for the _spin_*() APIs.
 *
 *  linux/spinlock.h:     builds the final spin_*() APIs.
 *
 * on UP builds:
 *
 *  linux/spinlock_types_up.h:
 *                        contains the generic, simplified UP spinlock type.
 *                        (which is an empty structure on non-debug builds)
 *
 *  linux/spinlock_types_raw.h:
 *                          The raw RT types and initializers
 *  linux/spinlock_types.h:
 *                        defines the generic type and initializers
 *
 *  linux/spinlock_up.h:
 *                        contains the arch_spin_*()/etc. version of UP
 *                        builds. (which are NOPs on non-debug, non-preempt
 *                        builds)
 *
 *   (included on UP-non-debug builds:)
 *
 *  linux/spinlock_api_up.h:
 *                        builds the _spin_*() APIs.
 *
 *  linux/spinlock.h:     builds the final spin_*() APIs.
 */

#include <linux/typecheck.h>
#include <linux/preempt.h>
#include <linux/linkage.h>
#include <linux/compiler.h>
#include <linux/irqflags.h>
#include <linux/thread_info.h>
#include <linux/stringify.h>
#include <linux/bottom_half.h>
#include <linux/lockdep.h>
#include <linux/cleanup.h>
#include <asm/barrier.h>
#include <asm/mmiowb.h>


/*
 * Must define these before including other files, inline functions need them
 */
#define LOCK_SECTION_NAME ".text..lock."KBUILD_BASENAME

#define LOCK_SECTION_START(extra)               \
        ".subsection 1\n\t"                     \
        extra                                   \
        ".ifndef " LOCK_SECTION_NAME "\n\t"     \
        LOCK_SECTION_NAME ":\n\t"               \
        ".endif\n"

#define LOCK_SECTION_END                        \
        ".previous\n\t"

#define __lockfunc __section(".spinlock.text")

/*
 * Pull the arch_spinlock_t and arch_rwlock_t definitions:
 */
#include <linux/spinlock_types.h>

/*
 * Pull the arch_spin*() functions/declarations (UP-nondebug doesn't need them):
 */
#ifdef CONFIG_SMP
# include <asm/spinlock.h>
#else
# include <linux/spinlock_up.h>
#endif

/*
 * raw_spin_lock_init(lock) - initialise a raw_spinlock_t.
 *
 * With CONFIG_DEBUG_SPINLOCK the work is done by __raw_spin_lock_init(),
 * which is handed the stringified @lock expression as the name, a
 * lock_class_key that is static to the call site, and LD_WAIT_SPIN.
 * Otherwise the lock is simply assigned __RAW_SPIN_LOCK_UNLOCKED().
 */
#ifdef CONFIG_DEBUG_SPINLOCK
  extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
                                   struct lock_class_key *key, short inner);

# define raw_spin_lock_init(lock)                                        \
do {                                                                        \
        static struct lock_class_key __key;                                \
                                                                        \
        __raw_spin_lock_init((lock), #lock, &__key, LD_WAIT_SPIN);        \
} while (0)

#else
# define raw_spin_lock_init(lock)                                \
        do { *(lock) = __RAW_SPIN_LOCK_UNLOCKED(lock); } while (0)
#endif

/* Query the architecture lock embedded in a raw_spinlock_t. */
#define raw_spin_is_locked(lock)        arch_spin_is_locked(&(lock)->raw_lock)

/*
 * Contention query.  When the architecture provides no
 * arch_spin_is_contended, the fallback evaluates @lock and yields 0.
 */
#ifdef arch_spin_is_contended
#define raw_spin_is_contended(lock)        arch_spin_is_contended(&(lock)->raw_lock)
#else
#define raw_spin_is_contended(lock)        (((void)(lock), 0))
#endif /*arch_spin_is_contended*/

/*
 * smp_mb__after_spinlock() provides the equivalent of a full memory barrier
 * between program-order earlier lock acquisitions and program-order later
 * memory accesses.
 *
 * This guarantees that the following two properties hold:
 *
 *   1) Given the snippet:
 *
 *          { X = 0;  Y = 0; }
 *
 *          CPU0                                CPU1
 *
 *          WRITE_ONCE(X, 1);                WRITE_ONCE(Y, 1);
 *          spin_lock(S);                        smp_mb();
 *          smp_mb__after_spinlock();        r1 = READ_ONCE(X);
 *          r0 = READ_ONCE(Y);
 *          spin_unlock(S);
 *
 *      it is forbidden that CPU0 does not observe CPU1's store to Y (r0 = 0)
 *      and CPU1 does not observe CPU0's store to X (r1 = 0); see the comments
 *      preceding the call to smp_mb__after_spinlock() in __schedule() and in
 *      try_to_wake_up().
 *
 *   2) Given the snippet:
 *
 *  { X = 0;  Y = 0; }
 *
 *  CPU0                CPU1                                CPU2
 *
 *  spin_lock(S);        spin_lock(S);                        r1 = READ_ONCE(Y);
 *  WRITE_ONCE(X, 1);        smp_mb__after_spinlock();        smp_rmb();
 *  spin_unlock(S);        r0 = READ_ONCE(X);                r2 = READ_ONCE(X);
 *                        WRITE_ONCE(Y, 1);
 *                        spin_unlock(S);
 *
 *      it is forbidden that CPU0's critical section executes before CPU1's
 *      critical section (r0 = 1), CPU2 observes CPU1's store to Y (r1 = 1)
 *      and CPU2 does not observe CPU0's store to X (r2 = 0); see the comments
 *      preceding the calls to smp_rmb() in try_to_wake_up() for similar
 *      snippets but "projected" onto two CPUs.
 *
 * Property (2) upgrades the lock to an RCsc lock.
 *
 * Since most load-store architectures implement ACQUIRE with an smp_mb() after
 * the LL/SC loop, they need no further barriers. Similarly all our TSO
 * architectures imply an smp_mb() for each atomic instruction and equally don't
 * need more.
 *
 * Architectures that can implement ACQUIRE better need to take care.
 */
/* Default used when no earlier definition of smp_mb__after_spinlock exists. */
#ifndef smp_mb__after_spinlock
#define smp_mb__after_spinlock()        kcsan_mb()
#endif

#ifdef CONFIG_DEBUG_SPINLOCK
 /* CONFIG_DEBUG_SPINLOCK: the do_raw_spin_*() helpers are defined out of line. */
 extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock);
 extern int do_raw_spin_trylock(raw_spinlock_t *lock);
 extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock);
#else
/*
 * Non-debug lock path: record the acquisition for static checking
 * (__acquire), take the architecture lock, then run the mmiowb lock hook.
 */
static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock)
{
        __acquire(lock);
        arch_spin_lock(&lock->raw_lock);
        mmiowb_spin_lock();
}

/*
 * Non-debug trylock path.  Returns whatever arch_spin_trylock() returned;
 * the mmiowb lock hook runs only when that value is nonzero (lock taken).
 */
static inline int do_raw_spin_trylock(raw_spinlock_t *lock)
{
        int acquired = arch_spin_trylock(&(lock)->raw_lock);

        if (!acquired)
                return acquired;

        mmiowb_spin_lock();
        return acquired;
}

/*
 * Non-debug unlock path: run the mmiowb unlock hook while the lock is
 * still held, drop the architecture lock, then record the release for
 * static checking (__release).
 */
static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
{
        mmiowb_spin_unlock();
        arch_spin_unlock(&lock->raw_lock);
        __release(lock);
}
#endif

/*
 * Define the various spin_lock methods.  Note we define these
 * regardless of whether CONFIG_SMP or CONFIG_PREEMPTION are set. The
 * various methods are defined as nops in the case they are not
 * required.
 */
/* Trylock wrapped in __cond_lock() so the conditional acquisition is annotated. */
#define raw_spin_trylock(lock)        __cond_lock(lock, _raw_spin_trylock(lock))

#define raw_spin_lock(lock)        _raw_spin_lock(lock)

/*
 * Nested-lock variants.  With CONFIG_DEBUG_LOCK_ALLOC the subclass or the
 * nest lock's dep_map is passed on to the _raw_spin_lock_nest*() helpers;
 * without it both reduce to a plain _raw_spin_lock().
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define raw_spin_lock_nested(lock, subclass) \
        _raw_spin_lock_nested(lock, subclass)

/* typecheck() rejects a @nest_lock whose dep_map is not a struct lockdep_map. */
# define raw_spin_lock_nest_lock(lock, nest_lock)                        \
         do {                                                                \
                 typecheck(struct lockdep_map *, &(nest_lock)->dep_map);\
                 _raw_spin_lock_nest_lock(lock, &(nest_lock)->dep_map);        \
         } while (0)
#else
/*
 * Always evaluate the 'subclass' argument to avoid that the compiler
 * warns about set-but-not-used variables when building with
 * CONFIG_DEBUG_LOCK_ALLOC=n and with W=1.
 */
# define raw_spin_lock_nested(lock, subclass)                \
        _raw_spin_lock(((void)(subclass), (lock)))
# define raw_spin_lock_nest_lock(lock, nest_lock)        _raw_spin_lock(lock)
#endif

/*
 * raw_spin_lock_irqsave(lock, flags) and its nested variant.
 *
 * @flags must be an unsigned long lvalue; typecheck() enforces the type.
 * On SMP or CONFIG_DEBUG_SPINLOCK builds _raw_spin_lock_irqsave() returns
 * the value that is stored in @flags; on the remaining builds it takes
 * @flags as a second argument instead.
 */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)

#define raw_spin_lock_irqsave(lock, flags)                        \
        do {                                                \
                typecheck(unsigned long, flags);        \
                flags = _raw_spin_lock_irqsave(lock);        \
        } while (0)

/* @subclass is only used when CONFIG_DEBUG_LOCK_ALLOC is enabled. */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define raw_spin_lock_irqsave_nested(lock, flags, subclass)                \
        do {                                                                \
                typecheck(unsigned long, flags);                        \
                flags = _raw_spin_lock_irqsave_nested(lock, subclass);        \
        } while (0)
#else
#define raw_spin_lock_irqsave_nested(lock, flags, subclass)                \
        do {                                                                \
                typecheck(unsigned long, flags);                        \
                flags = _raw_spin_lock_irqsave(lock);                        \
        } while (0)
#endif

#else

#define raw_spin_lock_irqsave(lock, flags)                \
        do {                                                \
                typecheck(unsigned long, flags);        \
                _raw_spin_lock_irqsave(lock, flags);        \
        } while (0)

#define raw_spin_lock_irqsave_nested(lock, flags, subclass)        \
        raw_spin_lock_irqsave(lock, flags)

#endif

/* Direct aliases for the corresponding _raw_spin_*() implementations. */
#define raw_spin_lock_irq(lock)                _raw_spin_lock_irq(lock)
#define raw_spin_lock_bh(lock)                _raw_spin_lock_bh(lock)
#define raw_spin_unlock(lock)                _raw_spin_unlock(lock)
#define raw_spin_unlock_irq(lock)        _raw_spin_unlock_irq(lock)

/* @flags must be an unsigned long; typecheck() enforces it at compile time. */
#define raw_spin_unlock_irqrestore(lock, flags)                \
        do {                                                        \
                typecheck(unsigned long, flags);                \
                _raw_spin_unlock_irqrestore(lock, flags);        \
        } while (0)
#define raw_spin_unlock_bh(lock)        _raw_spin_unlock_bh(lock)

/* Trylock wrapped in __cond_lock() so the conditional acquisition is annotated. */
#define raw_spin_trylock_bh(lock) \
        __cond_lock(lock, _raw_spin_trylock_bh(lock))

/*
 * Disable local interrupts, then try the lock.  Evaluates to 1 with
 * interrupts left disabled on success; on failure interrupts are
 * re-enabled and the expression evaluates to 0.
 */
#define raw_spin_trylock_irq(lock) \
({ \
        local_irq_disable(); \
        raw_spin_trylock(lock) ? \
        1 : ({ local_irq_enable(); 0;  }); \
})

/*
 * As above, but the interrupt state is saved into @flags first and
 * restored from @flags when the trylock fails.
 */
#define raw_spin_trylock_irqsave(lock, flags) \
({ \
        local_irq_save(flags); \
        raw_spin_trylock(lock) ? \
        1 : ({ local_irq_restore(flags); 0; }); \
})

#ifndef CONFIG_PREEMPT_RT
/* Include rwlock functions for !RT */
#include <linux/rwlock.h>
#endif

/*
 * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
 */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
# include <linux/spinlock_api_smp.h>
#else
# include <linux/spinlock_api_up.h>
#endif

/* Non PREEMPT_RT kernel, map to raw spinlocks: */
#ifndef CONFIG_PREEMPT_RT

/*
 * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
 */

/*
 * Return the raw lock embedded in @lock.  Because the parameter is typed
 * as spinlock_t *, macros that route their argument through this helper
 * get their lock argument checked against that type by the compiler.
 */
static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
{
        return &lock->rlock;
}

/*
 * spin_lock_init(lock) - initialise a spinlock_t.
 *
 * With CONFIG_DEBUG_SPINLOCK the embedded raw lock is initialised through
 * __raw_spin_lock_init() with the stringified @lock expression, a
 * lock_class_key static to the call site, and LD_WAIT_CONFIG.  Otherwise
 * the lock is assigned __SPIN_LOCK_UNLOCKED(); the spinlock_check() call
 * there is made only for its argument type check.
 */
#ifdef CONFIG_DEBUG_SPINLOCK

# define spin_lock_init(lock)                                        \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        __raw_spin_lock_init(spinlock_check(lock),                \
                             #lock, &__key, LD_WAIT_CONFIG);        \
} while (0)

#else

# define spin_lock_init(_lock)                        \
do {                                                \
        spinlock_check(_lock);                        \
        *(_lock) = __SPIN_LOCK_UNLOCKED(_lock);        \
} while (0)

#endif

/* Acquire @lock: forwards to raw_spin_lock() on the embedded raw lock. */
static __always_inline void spin_lock(spinlock_t *lock)
{
        raw_spin_lock(&lock->rlock);
}

/* Forwards to raw_spin_lock_bh() on the embedded raw lock. */
static __always_inline void spin_lock_bh(spinlock_t *lock)
{
        raw_spin_lock_bh(&lock->rlock);
}

/*
 * Forwards to raw_spin_trylock() on the embedded raw lock and returns its
 * result.
 */
static __always_inline int spin_trylock(spinlock_t *lock)
{
        return raw_spin_trylock(&lock->rlock);
}

/*
 * Nested-lock variants for spinlock_t.  Both pass the lock through
 * spinlock_check() and defer to the matching raw_spin_lock_nest*() macro.
 */
#define spin_lock_nested(lock, subclass)                        \
do {                                                                \
        raw_spin_lock_nested(spinlock_check(lock), subclass);        \
} while (0)

#define spin_lock_nest_lock(lock, nest_lock)                                \
do {                                                                        \
        raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock);        \
} while (0)

/* Forwards to raw_spin_lock_irq() on the embedded raw lock. */
static __always_inline void spin_lock_irq(spinlock_t *lock)
{
        raw_spin_lock_irq(&lock->rlock);
}

/*
 * irqsave variants for spinlock_t.  These are macros rather than
 * functions because the raw_* macros they expand to assign to @flags.
 */
#define spin_lock_irqsave(lock, flags)                                \
do {                                                                \
        raw_spin_lock_irqsave(spinlock_check(lock), flags);        \
} while (0)

#define spin_lock_irqsave_nested(lock, flags, subclass)                        \
do {                                                                        \
        raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \
} while (0)

/* Release @lock: forwards to raw_spin_unlock() on the embedded raw lock. */
static __always_inline void spin_unlock(spinlock_t *lock)
{
        raw_spin_unlock(&lock->rlock);
}

/* Forwards to raw_spin_unlock_bh() on the embedded raw lock. */
static __always_inline void spin_unlock_bh(spinlock_t *lock)
{
        raw_spin_unlock_bh(&lock->rlock);
}

/* Forwards to raw_spin_unlock_irq() on the embedded raw lock. */
static __always_inline void spin_unlock_irq(spinlock_t *lock)
{
        raw_spin_unlock_irq(&lock->rlock);
}

/*
 * Forwards to raw_spin_unlock_irqrestore() on the embedded raw lock,
 * passing @flags through unchanged.
 */
static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
        raw_spin_unlock_irqrestore(&lock->rlock, flags);
}

/*
 * Forwards to raw_spin_trylock_bh() on the embedded raw lock and returns
 * its result.
 */
static __always_inline int spin_trylock_bh(spinlock_t *lock)
{
        return raw_spin_trylock_bh(&lock->rlock);
}

/*
 * Forwards to raw_spin_trylock_irq() on the embedded raw lock and returns
 * its result (1 when the lock was taken, 0 otherwise).
 */
static __always_inline int spin_trylock_irq(spinlock_t *lock)
{
        return raw_spin_trylock_irq(&lock->rlock);
}

/*
 * spinlock_t form of raw_spin_trylock_irqsave(); a macro because @flags
 * is written by the expansion.  Evaluates to 1 on success, 0 on failure.
 */
#define spin_trylock_irqsave(lock, flags)                        \
({                                                                \
        raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
})

/**
 * spin_is_locked() - Check whether a spinlock is locked.
 * @lock: Pointer to the spinlock.
 *
 * This function is NOT required to provide any memory ordering
 * guarantees; it could be used for debugging purposes or, when
 * additional synchronization is needed, accompanied with other
 * constructs (memory barriers) enforcing the synchronization.
 *
 * Returns: 1 if @lock is locked, 0 otherwise.
 *
 * Note that the function only tells you that the spinlock is
 * seen to be locked, not that it is locked on your CPU.
 *
 * Further, on CONFIG_SMP=n builds with CONFIG_DEBUG_SPINLOCK=n,
 * the return value is always 0 (see include/linux/spinlock_up.h).
 * Therefore you should not rely heavily on the return value.
 */
static __always_inline int spin_is_locked(spinlock_t *lock)
{
        /* The query is answered by the raw lock embedded in @lock. */
        return raw_spin_is_locked(&lock->rlock);
}

/*
 * Forwards to raw_spin_is_contended() on the embedded raw lock.  That
 * macro yields 0 when the architecture provides no
 * arch_spin_is_contended.
 */
static __always_inline int spin_is_contended(spinlock_t *lock)
{
        return raw_spin_is_contended(&lock->rlock);
}

/* Applies assert_raw_spin_locked() to the raw lock embedded in @lock. */
#define assert_spin_locked(lock)        assert_raw_spin_locked(&(lock)->rlock)

#else  /* !CONFIG_PREEMPT_RT */
# include <linux/spinlock_rt.h>
#endif /* CONFIG_PREEMPT_RT */

/*
 * Does a critical section need to be broken due to another
 * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
 * but a general need for low latency)
 */
/*
 * Returns spin_is_contended(@lock) on CONFIG_PREEMPTION kernels and 0 on
 * all others.
 */
static inline int spin_needbreak(spinlock_t *lock)
{
        int contended = 0;

#ifdef CONFIG_PREEMPTION
        contended = spin_is_contended(lock);
#endif
        return contended;
}

/*
 * Check if a rwlock is contended.
 * Returns non-zero if there is another task waiting on the rwlock.
 * Returns zero if the lock is not contended or the system / underlying
 * rwlock implementation does not support contention detection.
 * Technically does not depend on CONFIG_PREEMPTION, but a general need
 * for low latency.
 */
/*
 * Returns rwlock_is_contended(@lock) on CONFIG_PREEMPTION kernels and 0
 * on all others.
 */
static inline int rwlock_needbreak(rwlock_t *lock)
{
        int contended = 0;

#ifdef CONFIG_PREEMPTION
        contended = rwlock_is_contended(lock);
#endif
        return contended;
}

/*
 * Pull the atomic_t declaration:
 * (asm-mips/atomic.h needs above definitions)
 */
#include <linux/atomic.h>
/**
 * atomic_dec_and_lock - lock on reaching reference count zero
 * @atomic: the atomic counter
 * @lock: the spinlock in question
 *
 * Decrements @atomic by 1.  If the result is 0, returns true and locks
 * @lock.  Returns false for all other cases.
 */
extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
#define atomic_dec_and_lock(atomic, lock) \
                __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))

/*
 * irqsave form: the macro passes the address of @flags, so @flags must be
 * an unsigned long lvalue.
 * NOTE(review): presumably the saved interrupt state is written there when
 * the lock is taken -- confirm against the implementation.
 */
extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock,
                                        unsigned long *flags);
#define atomic_dec_and_lock_irqsave(atomic, lock, flags) \
                __cond_lock(lock, _atomic_dec_and_lock_irqsave(atomic, lock, &(flags)))

/* Same interface for a raw_spinlock_t. */
extern int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock);
#define atomic_dec_and_raw_lock(atomic, lock) \
                __cond_lock(lock, _atomic_dec_and_raw_lock(atomic, lock))

/* irqsave form for a raw_spinlock_t; @flags is again passed by address. */
extern int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock,
                                        unsigned long *flags);
#define atomic_dec_and_raw_lock_irqsave(atomic, lock, flags) \
                __cond_lock(lock, _atomic_dec_and_raw_lock_irqsave(atomic, lock, &(flags)))

/*
 * Bucket spinlock allocation.  alloc_bucket_spinlocks() wraps
 * __alloc_bucket_spinlocks(), supplying the stringified @locks expression
 * as the name and a lock_class_key that is static to the call site, and
 * evaluates to that function's int return value.
 * NOTE(review): the meaning of @max_size and @cpu_mult is not visible in
 * this header -- see the implementation.
 */
int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask,
                             size_t max_size, unsigned int cpu_mult,
                             gfp_t gfp, const char *name,
                             struct lock_class_key *key);

#define alloc_bucket_spinlocks(locks, lock_mask, max_size, cpu_mult, gfp)    \
        ({                                                                     \
                static struct lock_class_key key;                             \
                int ret;                                                     \
                                                                             \
                ret = __alloc_bucket_spinlocks(locks, lock_mask, max_size,   \
                                               cpu_mult, gfp, #locks, &key); \
                ret;                                                             \
        })

/* Counterpart of alloc_bucket_spinlocks(); takes the lock array itself. */
void free_bucket_spinlocks(spinlock_t *locks);

/*
 * Lock guards for raw_spinlock_t.  Each DEFINE_LOCK_GUARD_1() names a
 * guard, the lock type, the lock and unlock expressions (with the lock
 * reachable as _T->lock) and, for the irqsave flavour, an extra
 * "unsigned long flags" member reachable as _T->flags.  The _COND lines
 * add a "_try" variant built on the matching trylock.
 */
DEFINE_LOCK_GUARD_1(raw_spinlock, raw_spinlock_t,
                    raw_spin_lock(_T->lock),
                    raw_spin_unlock(_T->lock))

DEFINE_LOCK_GUARD_1_COND(raw_spinlock, _try, raw_spin_trylock(_T->lock))

DEFINE_LOCK_GUARD_1(raw_spinlock_nested, raw_spinlock_t,
                    raw_spin_lock_nested(_T->lock, SINGLE_DEPTH_NESTING),
                    raw_spin_unlock(_T->lock))

DEFINE_LOCK_GUARD_1(raw_spinlock_irq, raw_spinlock_t,
                    raw_spin_lock_irq(_T->lock),
                    raw_spin_unlock_irq(_T->lock))

DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irq, _try, raw_spin_trylock_irq(_T->lock))

DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t,
                    raw_spin_lock_irqsave(_T->lock, _T->flags),
                    raw_spin_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)

DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irqsave, _try,
                         raw_spin_trylock_irqsave(_T->lock, _T->flags))

/*
 * Lock guards for spinlock_t: plain, irq and irqsave flavours, each with
 * a "_try" conditional variant built on the matching spin_trylock*().
 */
DEFINE_LOCK_GUARD_1(spinlock, spinlock_t,
                    spin_lock(_T->lock),
                    spin_unlock(_T->lock))

DEFINE_LOCK_GUARD_1_COND(spinlock, _try, spin_trylock(_T->lock))

DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t,
                    spin_lock_irq(_T->lock),
                    spin_unlock_irq(_T->lock))

DEFINE_LOCK_GUARD_1_COND(spinlock_irq, _try,
                         spin_trylock_irq(_T->lock))

DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t,
                    spin_lock_irqsave(_T->lock, _T->flags),
                    spin_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)

DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try,
                         spin_trylock_irqsave(_T->lock, _T->flags))

/*
 * Lock guards for rwlock_t, reader and writer side, in plain, irq and
 * irqsave flavours.  No conditional (_try) variants are defined here.
 */
DEFINE_LOCK_GUARD_1(read_lock, rwlock_t,
                    read_lock(_T->lock),
                    read_unlock(_T->lock))

DEFINE_LOCK_GUARD_1(read_lock_irq, rwlock_t,
                    read_lock_irq(_T->lock),
                    read_unlock_irq(_T->lock))

DEFINE_LOCK_GUARD_1(read_lock_irqsave, rwlock_t,
                    read_lock_irqsave(_T->lock, _T->flags),
                    read_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)

DEFINE_LOCK_GUARD_1(write_lock, rwlock_t,
                    write_lock(_T->lock),
                    write_unlock(_T->lock))

DEFINE_LOCK_GUARD_1(write_lock_irq, rwlock_t,
                    write_lock_irq(_T->lock),
                    write_unlock_irq(_T->lock))

DEFINE_LOCK_GUARD_1(write_lock_irqsave, rwlock_t,
                    write_lock_irqsave(_T->lock, _T->flags),
                    write_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)

#undef __LINUX_INSIDE_SPINLOCK_H
#endif /* __LINUX_SPINLOCK_H */





































    1 





    1 














    1 


















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001 Intel Corp.
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions manipulate sctp tsn mapping array.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Jon Grimm             <jgrimm@us.ibm.com>
 *    Karl Knutson          <karl@athena.chicago.il.us>
 *    Sridhar Samudrala     <sri@us.ibm.com>
 */

#include <linux/slab.h>
#include <linux/types.h>
#include <linux/bitmap.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

static void sctp_tsnmap_update(struct sctp_tsnmap *map);
static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off,
                                     __u16 len, __u16 *start, __u16 *end);
static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 size);

/* Initialize a block of memory as a tsnmap.  */
/*
 * Set up @map to start tracking at @initial_tsn.
 *
 * If @map has no bitmap yet, one of @len bits (len >> 3 bytes) is
 * allocated with @gfp and @len is recorded; an existing bitmap is reused
 * and merely cleared over its current length.
 *
 * Returns @map, or NULL if the bitmap allocation failed.
 */
struct sctp_tsnmap *sctp_tsnmap_init(struct sctp_tsnmap *map, __u16 len,
                                     __u32 initial_tsn, gfp_t gfp)
{
        __u32 ack_point = initial_tsn - 1;

        if (map->tsn_map) {
                bitmap_zero(map->tsn_map, map->len);
        } else {
                map->tsn_map = kzalloc(len >> 3, gfp);
                if (!map->tsn_map)
                        return NULL;

                map->len = len;
        }

        /* Nothing received yet: the ack point sits just below the base. */
        map->base_tsn = initial_tsn;
        map->cumulative_tsn_ack_point = ack_point;
        map->max_tsn_seen = ack_point;
        map->num_dup_tsns = 0;

        return map;
}

/*
 * Release the bitmap owned by @map and record a length of zero.  The
 * tsn_map pointer itself is left as it was.
 */
void sctp_tsnmap_free(struct sctp_tsnmap *map)
{
        kfree(map->tsn_map);
        map->len = 0;
}

/* Test the tracking state of this TSN.
 * Returns:
 *   0 if the TSN has not yet been seen
 *  >0 if the TSN has been seen (duplicate)
 *  <0 if the TSN is invalid (too large to track)
 */
int sctp_tsnmap_check(const struct sctp_tsnmap *map, __u32 tsn)
{
        u32 index;

        /* At or below the cumulative ack point: already received. */
        if (TSN_lte(tsn, map->cumulative_tsn_ack_point))
                return 1;

        /* Beyond the largest window the map can ever cover. */
        if (!TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE))
                return -1;

        /* Bit position of this TSN relative to the base. */
        index = tsn - map->base_tsn;

        /* Outside the currently allocated bitmap: cannot have been seen. */
        if (index >= map->len)
                return 0;

        return test_bit(index, map->tsn_map) ? 1 : 0;
}


/* Mark this TSN as seen.  */
/*
 * Record @tsn as received.
 *
 * TSNs below the map's base are ignored.  The bitmap is grown when @tsn
 * falls past its current length.  When @trans is non-NULL and @tsn is the
 * next expected TSN of a gap-free map, the transport's sack_generation is
 * refreshed from its association.
 *
 * Returns 0 on success or -ENOMEM if the bitmap could not be grown.
 */
int sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn,
                     struct sctp_transport *trans)
{
        u16 idx;

        if (TSN_lt(tsn, map->base_tsn))
                return 0;

        idx = tsn - map->base_tsn;

        if (idx >= map->len && !sctp_tsnmap_grow(map, idx + 1))
                return -ENOMEM;

        if (sctp_tsnmap_has_gap(map) || idx != 0) {
                /* A gap exists or is being created: use the bitmap. */
                if (TSN_lt(map->max_tsn_seen, tsn))
                        map->max_tsn_seen = tsn;

                set_bit(idx, map->tsn_map);

                /* Recompute the derived fields, ack point included. */
                sctp_tsnmap_update(map);
                return 0;
        }

        /*
         * Gap-free map and this is the next expected TSN: leave the
         * bitmap alone and slide every marker forward by one.
         */
        map->max_tsn_seen++;
        map->cumulative_tsn_ack_point++;
        if (trans)
                trans->sack_generation = trans->asoc->peer.sack_generation;
        map->base_tsn++;

        return 0;
}


/* Initialize a Gap Ack Block iterator from memory being provided.  */
/*
 * Position @iter for a walk over @map's Gap Ack Blocks: its start is set
 * to the TSN immediately after the map's cumulative TSN ack point.
 */
static void sctp_tsnmap_iter_init(const struct sctp_tsnmap *map,
                                  struct sctp_tsnmap_iter *iter)
{
        /* Only start looking one past the Cumulative TSN Ack Point.  */
        iter->start = map->cumulative_tsn_ack_point + 1;
}

/* Fetch the next Gap Ack Block from @map, continuing from @iter.
 *
 * On success *start and *end receive the block boundaries (bit positions
 * in the map plus one), @iter is advanced past the block, and 1 is
 * returned.  Returns 0, leaving *start, *end and @iter untouched, when
 * there is no further block.
 */
static int sctp_tsnmap_next_gap_ack(const struct sctp_tsnmap *map,
                                    struct sctp_tsnmap_iter *iter,
                                    __u16 *start, __u16 *end)
{
        __u16 first = 0, last = 0;
        __u16 offset;

        /* Nothing left to report once the iterator has reached the
         * highest TSN seen.
         */
        if (TSN_lte(map->max_tsn_seen, iter->start))
                return 0;

        offset = iter->start - map->base_tsn;
        sctp_tsnmap_find_gap_ack(map->tsn_map, offset, map->len,
                                 &first, &last);

        /* A block that was opened but never closed runs to the last bit
         * of the map.
         */
        if (first && !last)
                last = map->len - 1;

        if (!last)
                return 0;

        /* Report the boundaries relative to the Cumulative TSN Ack
         * Point, which is always 1 behind base.
         */
        *start = first + 1;
        *end = last + 1;

        /* Resume the next search just past this block.  */
        iter->start = map->cumulative_tsn_ack_point + *end + 1;

        return 1;
}

/* Treat @tsn and every TSN below it as received.
 *
 * TSNs below base_tsn, or at/after base_tsn + SCTP_TSN_MAP_SIZE, are
 * ignored.
 */
void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn)
{
        u32 skipped;

        if (TSN_lt(tsn, map->base_tsn) ||
            !TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE))
                return;

        /* Raise the highest TSN seen if this one is newer.  */
        if (TSN_lt(map->max_tsn_seen, tsn))
                map->max_tsn_seen = tsn;

        /* Number of TSNs, counted from base_tsn, that are being skipped.  */
        skipped = tsn - map->base_tsn + 1;

        map->base_tsn += skipped;
        map->cumulative_tsn_ack_point += skipped;

        if (skipped >= map->len) {
                /* Everything tracked so far is now below the new base:
                 * wipe the whole map.
                 */
                bitmap_zero(map->tsn_map, map->len);
                return;
        }

        /* Drop the skipped bits from the front of the map, then let the
         * update helper advance over any TSNs that are now contiguous.
         */
        bitmap_shift_right(map->tsn_map, map->tsn_map, skipped, map->len);
        sctp_tsnmap_update(map);
}

/********************************************************************
 * 2nd Level Abstractions
 ********************************************************************/

/* Private helper: advance base_tsn and the Cumulative TSN Ack Point over
 * the run of received TSNs at the front of the map, and shift the bitmap
 * down by the same amount.
 */
static void sctp_tsnmap_update(struct sctp_tsnmap *map)
{
        unsigned long first_hole;
        u16 span;

        /* Only the bits up to the highest TSN seen need to be searched.  */
        span = map->max_tsn_seen - map->cumulative_tsn_ack_point;
        first_hole = find_first_zero_bit(map->tsn_map, span);

        /* A zero at bit 0 means the next expected TSN is still missing,
         * so there is nothing to advance over.
         */
        if (first_hole) {
                map->base_tsn += first_hole;
                map->cumulative_tsn_ack_point += first_hole;

                bitmap_shift_right(map->tsn_map, map->tsn_map,
                                   first_hole, map->len);
        }
}

/* Count the data chunks still missing from the peer.
 *
 * Starts from the distance between the highest TSN seen and the
 * Cumulative TSN Ack Point.  When the highest TSN lies strictly inside
 * the map, the number of bits set in the map up to and including it is
 * subtracted.
 */
__u16 sctp_tsnmap_pending(struct sctp_tsnmap *map)
{
        const __u32 cum_ack = map->cumulative_tsn_ack_point;
        const __u32 highest = map->max_tsn_seen;
        const __u32 base = map->base_tsn;
        const u32 gap = highest - base;
        __u16 missing = highest - cum_ack;

        if (gap != 0 && gap < map->len)
                missing -= bitmap_weight(map->tsn_map, gap + 1);

        return missing;
}

/* Private helper for finding Gap Ack Blocks.  It searches a single bitmap
 * for the start and end of the next Gap Ack Block at or after @off.
 *
 * *start is written only when a set bit is found below @len, and *end
 * only when a clear bit follows it below @len; otherwise the caller's
 * values are left as they were.  A block is considered started whenever
 * *start is non-zero.
 */
static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off,
                                     __u16 len, __u16 *start, __u16 *end)
{
        int pos;

        /* The first set bit at or after @off opens the block.  */
        pos = find_next_bit(map, len, off);
        if (pos < len)
                *start = pos;

        /* Without a start there is no end to look for.  */
        if (!*start)
                return;

        /* The block ends on the bit just before the next clear bit.  */
        pos = find_next_zero_bit(map, len, pos);
        if (pos < len)
                *end = pos - 1;
}

/* Renege on a TSN: forget that it was ever received.
 *
 * TSNs outside [base_tsn, base_tsn + len) have no bit in the map and are
 * ignored.
 */
void sctp_tsnmap_renege(struct sctp_tsnmap *map, __u32 tsn)
{
        u32 bit;

        if (TSN_lt(tsn, map->base_tsn) ||
            !TSN_lt(tsn, map->base_tsn + map->len))
                return;

        /* Clearing the bit makes the TSN look as if it never arrived.  */
        bit = tsn - map->base_tsn;
        clear_bit(bit, map->tsn_map);
}

/* Fill @gabs with the Gap Ack Blocks currently recorded in @map and
 * return how many were written.
 *
 * Boundaries are stored in network byte order.  At most SCTP_MAX_GABS
 * entries are produced; 0 is returned when the map has no gap.
 */
__u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map,
                           struct sctp_gap_ack_block *gabs)
{
        struct sctp_tsnmap_iter iter;
        __u16 start = 0, end = 0;
        int count = 0;

        if (!sctp_tsnmap_has_gap(map))
                return 0;

        sctp_tsnmap_iter_init(map, &iter);
        while (sctp_tsnmap_next_gap_ack(map, &iter, &start, &end)) {
                gabs[count].start = htons(start);
                gabs[count].end = htons(end);

                if (++count >= SCTP_MAX_GABS)
                        break;
        }

        return count;
}

/* Enlarge the bitmap of @map so that it can hold at least @size bits.
 *
 * The new length is the old one plus the shortfall rounded up to
 * BITS_PER_LONG plus SCTP_TSN_MAP_INCREMENT, capped at SCTP_TSN_MAP_SIZE.
 * The bits between the Cumulative TSN Ack Point and the highest TSN seen
 * are copied into the new, zero-filled bitmap and the old one is freed.
 *
 * Returns 1 on success, 0 if @size exceeds SCTP_TSN_MAP_SIZE or the
 * allocation fails (the map is left unchanged in both cases).
 */
static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 size)
{
        unsigned long *bigger;
        unsigned long inc;
        u16 new_len;

        if (size > SCTP_TSN_MAP_SIZE)
                return 0;

        inc = ALIGN((size - map->len), BITS_PER_LONG) + SCTP_TSN_MAP_INCREMENT;
        new_len = min_t(u16, map->len + inc, SCTP_TSN_MAP_SIZE);

        /* The length is counted in bits; the allocator wants bytes.  */
        bigger = kzalloc(new_len >> 3, GFP_ATOMIC);
        if (!bigger)
                return 0;

        bitmap_copy(bigger, map->tsn_map,
                    map->max_tsn_seen - map->cumulative_tsn_ack_point);

        kfree(map->tsn_map);
        map->tsn_map = bigger;
        map->len = new_len;

        return 1;
}


























































































































































































































































































































































































































































































































































































































































































































































































































































    1 



    1 




















    1 



    1 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/userfaultfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *  Copyright (C) 2008-2009 Red Hat, Inc.
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 *  Some part derived from fs/eventfd.c (anon inode setup) and
 *  mm/ksm.c (mm hashing).
 */

#include <linux/list.h>
#include <linux/hashtable.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/bug.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/swapops.h>
#include <linux/miscdevice.h>
#include <linux/uio.h>

/*
 * Backing storage for the "unprivileged_userfaultfd" sysctl below.
 * NOTE(review): presumably gates use of userfaultfd by unprivileged
 * callers; the code that reads it is outside this view.
 */
static int sysctl_unprivileged_userfaultfd __read_mostly;

#ifdef CONFIG_SYSCTL
/*
 * Sysctl table exposing sysctl_unprivileged_userfaultfd as an integer with
 * mode 0644, handled by proc_dointvec_minmax with SYSCTL_ZERO and
 * SYSCTL_ONE as its lower and upper bounds.
 */
static struct ctl_table vm_userfaultfd_table[] = {
        {
                .procname        = "unprivileged_userfaultfd",
                .data                = &sysctl_unprivileged_userfaultfd,
                .maxlen                = sizeof(sysctl_unprivileged_userfaultfd),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
};
#endif

/*
 * Slab cache for struct userfaultfd_ctx objects; userfaultfd_ctx_put()
 * returns a context to it when the last reference is dropped.
 */
static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;

/*
 * List node pairing an original userfaultfd context with a new one.
 * NOTE(review): presumably built while handling fork(); the code that
 * uses it is outside this view.
 */
struct userfaultfd_fork_ctx {
        struct userfaultfd_ctx *orig;
        struct userfaultfd_ctx *new;
        struct list_head list;
};

/*
 * List node recording a userfaultfd context together with an address
 * range (start, end).
 * NOTE(review): presumably describes a range being unmapped; the code
 * that uses it, and whether 'end' is exclusive, is outside this view.
 */
struct userfaultfd_unmap_ctx {
        struct userfaultfd_ctx *ctx;
        unsigned long start;
        unsigned long end;
        struct list_head list;
};

/*
 * One waiter on a userfaultfd wait queue.
 *
 * userfaultfd_wake_function() recovers this structure from 'wq' via
 * container_of(), compares msg.arg.pagefault.address with the wake range
 * and sets 'waken' before waking the task.
 */
struct userfaultfd_wait_queue {
        /* message for this waiter; its pagefault address selects wakeups */
        struct uffd_msg msg;
        /* wait queue entry embedded in this structure */
        wait_queue_entry_t wq;
        /* NOTE(review): presumably the context this waiter belongs to */
        struct userfaultfd_ctx *ctx;
        /* set to true by userfaultfd_wake_function() */
        bool waken;
};

/*
 * Address range passed as the wake-up key to userfaultfd_wake_function().
 * Waiters whose fault address lies in [start, start + len) are woken;
 * len == 0 means wake all waiters.
 */
struct userfaultfd_wake_range {
        unsigned long start;
        unsigned long len;
};

/* internal indication that UFFD_API ioctl was successfully executed */
#define UFFD_FEATURE_INITIALIZED                (1u << 31)

/* Report whether the internal "initialized" feature bit is set on @ctx. */
static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
{
        return (ctx->features & UFFD_FEATURE_INITIALIZED) != 0;
}

static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
{
        return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
}

/*
 * Tell whether the uffd context attached to @vma has WP_UNPOPULATED
 * enabled; a vma without a context yields false.  The answer is only
 * meaningful when userfaultfd_wp()==true on the vma and when it's
 * anonymous.
 */
bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
{
        struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;

        return ctx && (ctx->features & UFFD_FEATURE_WP_UNPOPULATED);
}

/*
 * Replace the flags of @vma with @flags and, for shared mappings whose
 * VM_UFFD_WP bit changed as a result, recompute vma->vm_page_prot.
 */
static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
                                     vm_flags_t flags)
{
        /* Must be sampled before the flags are overwritten below. */
        const bool wp_toggled = ((vma->vm_flags ^ flags) & VM_UFFD_WP) != 0;

        vm_flags_reset(vma, flags);

        /*
         * Shared mappings need writenotify while userfaultfd-wp is enabled
         * (see vma_wants_writenotify()), so the page protection is simply
         * recalculated whenever userfaultfd-wp changes.
         */
        if (wp_toggled && (vma->vm_flags & VM_SHARED))
                vma_set_page_prot(vma);
}

/*
 * Wake callback for a struct userfaultfd_wait_queue entry.
 *
 * @wq:         wait queue entry embedded in the userfaultfd_wait_queue.
 * @mode:       task state mask handed to wake_up_state().
 * @wake_flags: not used by this function.
 * @key:        struct userfaultfd_wake_range selecting which waiters to wake.
 *
 * A waiter whose fault address lies outside [start, start + len) is left
 * alone and 0 is returned; len == 0 matches every waiter.  Otherwise the
 * waiter is flagged as waken and the task is woken; on a successful
 * wake-up the entry is also removed from its wait queue.  Returns the
 * result of wake_up_state(), or 0 for a non-matching waiter.
 */
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
                                     int wake_flags, void *key)
{
        struct userfaultfd_wake_range *range = key;
        int ret;
        struct userfaultfd_wait_queue *uwq;
        unsigned long start, len;

        uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
        ret = 0;
        /* len == 0 means wake all */
        start = range->start;
        len = range->len;
        /* skip waiters whose fault address is outside the requested range */
        if (len && (start > uwq->msg.arg.pagefault.address ||
                    start + len <= uwq->msg.arg.pagefault.address))
                goto out;
        WRITE_ONCE(uwq->waken, true);
        /*
         * The Program-Order guarantees provided by the scheduler
         * ensure uwq->waken is visible before the task is woken.
         */
        ret = wake_up_state(wq->private, mode);
        if (ret) {
                /*
                 * Wake only once, autoremove behavior.
                 *
                 * After the effect of list_del_init is visible to the other
                 * CPUs, the waitqueue may disappear from under us, see the
                 * !list_empty_careful() in handle_userfault().
                 *
                 * try_to_wake_up() has an implicit smp_mb(), and the
                 * wq->private is read before calling the extern function
                 * "wake_up_state" (which in turns calls try_to_wake_up).
                 */
                list_del_init(&wq->entry);
        }
out:
        return ret;
}

/**
 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to the userfaultfd context.
 *
 * Increments @ctx->refcount.  The reference is released with
 * userfaultfd_ctx_put().
 */
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
        refcount_inc(&ctx->refcount);
}

/**
 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to userfaultfd context.
 *
 * The userfaultfd context reference must have been previously acquired either
 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
 *
 * When the last reference goes away, the mm reference held through
 * @ctx->mm is dropped and the context is freed back to its slab cache.
 */
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
        /* Other references remain: nothing more to do. */
        if (!refcount_dec_and_test(&ctx->refcount))
                return;

        /* None of the wait queues may be locked or have waiters left. */
        VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
        VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
        VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
        VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
        VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
        VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
        VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
        VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));

        mmdrop(ctx->mm);
        kmem_cache_free(userfaultfd_ctx_cachep, ctx);
}

/* Reset every byte of @msg, padding included, to zero. */
static inline void msg_init(struct uffd_msg *msg)
{
        BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);

        /*
         * A plain memset is required here: it also clears the padding,
         * which would otherwise leak kernel data to userland.
         */
        memset(msg, 0, sizeof(*msg));
}

/*
 * Build the UFFD_EVENT_PAGEFAULT message describing a fault.
 *
 * @address:      fault address reported by default.
 * @real_address: address reported instead when @features contains
 *                UFFD_FEATURE_EXACT_ADDRESS.
 * @flags:        fault flags; only FAULT_FLAG_WRITE is examined.
 * @reason:       VM_UFFD_WP and/or VM_UFFD_MINOR bits describing the fault.
 * @features:     features enabled on the userfaultfd context.
 *
 * Returns the fully initialized message by value.
 */
static inline struct uffd_msg userfault_msg(unsigned long address,
                                            unsigned long real_address,
                                            unsigned int flags,
                                            unsigned long reason,
                                            unsigned int features)
{
        struct uffd_msg msg;

        msg_init(&msg);
        msg.event = UFFD_EVENT_PAGEFAULT;

        if (features & UFFD_FEATURE_EXACT_ADDRESS)
                msg.arg.pagefault.address = real_address;
        else
                msg.arg.pagefault.address = address;

        /*
         * UFFD_PAGEFAULT_FLAG_WRITE is independent of the fault kind: it
         * marks a write fault, and its absence marks a read fault.
         */
        if (flags & FAULT_FLAG_WRITE)
                msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;

        /*
         * Why the userfault occurred:
         * - UFFD_PAGEFAULT_FLAG_WP means a write protect fault.
         * - UFFD_PAGEFAULT_FLAG_MINOR means a minor fault.
         * - With neither flag set it is a MISSING fault.
         */
        if (reason & VM_UFFD_WP)
                msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
        if (reason & VM_UFFD_MINOR)
                msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;

        /* Report the faulting thread's id only when that was requested. */
        if (features & UFFD_FEATURE_THREAD_ID)
                msg.arg.pagefault.feat.ptid = task_pid_vnr(current);

        return msg;
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * hugetlb flavour of userfaultfd_must_wait() below.
 *
 * Returns true when the fault still has to wait: there is no page table
 * entry for the address, the entry is none (or a PTE marker), or the
 * fault is a write-protect one (@reason has VM_UFFD_WP) and the entry is
 * not writable.  Returns false otherwise.
 */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
                                              struct vm_fault *vmf,
                                              unsigned long reason)
{
        struct vm_area_struct *vma = vmf->vma;
        pte_t *ptep;
        pte_t entry;

        assert_fault_locked(vmf);

        ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
        if (!ptep)
                return true;

        /*
         * Lockless access: we're in a wait_event so it's ok if it
         * changes under us.  PTE markers should be handled the same as none
         * ptes here.
         */
        entry = huge_ptep_get(ptep);
        if (huge_pte_none_mostly(entry))
                return true;

        return !huge_pte_write(entry) && (reason & VM_UFFD_WP);
}
#else
/* Stub for kernels built without hugetlb support. */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
                                              struct vm_fault *vmf,
                                              unsigned long reason)
{
        /* should never get here */
        return false;
}
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Verify the pagetables are still not ok after having registered into
 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
 * userfault that has already been resolved, if userfaultfd_read_iter and
 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
 * threads.
 *
 * Walks the page tables of ctx->mm for vmf->address without taking page
 * table locks.  Returns true when the fault still has to wait: an upper
 * level or the pmd is not populated, the pte is none (or a PTE marker),
 * or the fault is a write-protect one (@reason has VM_UFFD_WP) and the
 * huge pmd or pte is not writable.  Returns false otherwise.
 */
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
                                         struct vm_fault *vmf,
                                         unsigned long reason)
{
        struct mm_struct *mm = ctx->mm;
        unsigned long address = vmf->address;
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd, _pmd;
        pte_t *pte;
        pte_t ptent;
        bool ret = true;

        assert_fault_locked(vmf);

        /* a missing level anywhere above the pmd means: keep waiting */
        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
                goto out;
        p4d = p4d_offset(pgd, address);
        if (!p4d_present(*p4d))
                goto out;
        pud = pud_offset(p4d, address);
        if (!pud_present(*pud))
                goto out;
        pmd = pmd_offset(pud, address);
again:
        /* work on a snapshot of the pmd; it may change under us */
        _pmd = pmdp_get_lockless(pmd);
        if (pmd_none(_pmd))
                goto out;

        /* the pmd is populated: from here on waiting needs a reason */
        ret = false;
        if (!pmd_present(_pmd) || pmd_devmap(_pmd))
                goto out;

        if (pmd_trans_huge(_pmd)) {
                /* huge pmd: only a write-protect fault on it still waits */
                if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
                        ret = true;
                goto out;
        }

        pte = pte_offset_map(pmd, address);
        if (!pte) {
                /*
                 * The pte page could not be mapped: re-read the pmd and
                 * retry.  ret is true again in case the pmd is now none.
                 */
                ret = true;
                goto again;
        }
        /*
         * Lockless access: we're in a wait_event so it's ok if it
         * changes under us.  PTE markers should be handled the same as none
         * ptes here.
         */
        ptent = ptep_get(pte);
        if (pte_none_mostly(ptent))
                ret = true;
        if (!pte_write(ptent) && (reason & VM_UFFD_WP))
                ret = true;
        pte_unmap(pte);

out:
        return ret;
}

/*
 * Translate fault flags into the task state used while sleeping on a
 * userfault. FAULT_FLAG_INTERRUPTIBLE wins over FAULT_FLAG_KILLABLE;
 * with neither flag set the sleep is uninterruptible.
 */
static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
{
        unsigned int state = TASK_UNINTERRUPTIBLE;

        if (flags & FAULT_FLAG_INTERRUPTIBLE)
                state = TASK_INTERRUPTIBLE;
        else if (flags & FAULT_FLAG_KILLABLE)
                state = TASK_KILLABLE;

        return state;
}

/*
 * Hand a page fault on a userfaultfd-armed vma over to userland: queue
 * the faulting task on ctx->fault_pending_wqh, notify waiters on
 * ctx->fd_wqh and sleep until woken.
 *
 * Returns VM_FAULT_SIGBUS when the fault cannot be handed over (task is
 * exiting or dumping core, the vma has no ctx, UFFD_FEATURE_SIGBUS is
 * set, a non-FAULT_FLAG_USER fault hits a UFFD_USER_MODE_ONLY ctx, or
 * FAULT_FLAG_ALLOW_RETRY is missing), VM_FAULT_NOPAGE when the ctx has
 * already been released, and VM_FAULT_RETRY in every other case.
 *
 * The locking rules involved in returning VM_FAULT_RETRY depending on
 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
 * recommendation in __lock_page_or_retry is not an understatement.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
 * not set.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
 * set, VM_FAULT_RETRY can still be returned if and only if there are
 * fatal_signal_pending()s, and the mmap_lock must be released before
 * returning it.
 */
vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        struct userfaultfd_ctx *ctx;
        /* Lives on this kernel stack; must be unlinked before returning. */
        struct userfaultfd_wait_queue uwq;
        vm_fault_t ret = VM_FAULT_SIGBUS;
        bool must_wait;
        unsigned int blocking_state;

        /*
         * We don't do userfault handling for the final child pid update.
         *
         * We also don't do userfault handling during
         * coredumping. hugetlbfs has the special
         * hugetlb_follow_page_mask() to skip missing pages in the
         * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
         * the no_page_table() helper in follow_page_mask(), but the
         * shmem_vm_ops->fault method is invoked even during
         * coredumping and it ends up here.
         */
        if (current->flags & (PF_EXITING|PF_DUMPCORE))
                goto out;

        assert_fault_locked(vmf);

        ctx = vma->vm_userfaultfd_ctx.ctx;
        if (!ctx)
                goto out;

        BUG_ON(ctx->mm != mm);

        /* Any unrecognized flag is a bug. */
        VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
        /* 0 or > 1 flags set is a bug; we expect exactly 1. */
        VM_BUG_ON(!reason || (reason & (reason - 1)));

        /* With UFFD_FEATURE_SIGBUS the fault is never queued: SIGBUS it. */
        if (ctx->features & UFFD_FEATURE_SIGBUS)
                goto out;
        /* A user-mode-only ctx does not handle faults lacking FAULT_FLAG_USER. */
        if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
                goto out;

        /*
         * If it's already released don't get it. This avoids to loop
         * in __get_user_pages if userfaultfd_release waits on the
         * caller of handle_userfault to release the mmap_lock.
         */
        if (unlikely(READ_ONCE(ctx->released))) {
                /*
                 * Don't return VM_FAULT_SIGBUS in this case, so a non
                 * cooperative manager can close the uffd after the
                 * last UFFDIO_COPY, without risking to trigger an
                 * involuntary SIGBUS if the process was starting the
                 * userfaultfd while the userfaultfd was still armed
                 * (but after the last UFFDIO_COPY). If the uffd
                 * wasn't already closed when the userfault reached
                 * this point, that would normally be solved by
                 * userfaultfd_must_wait returning 'false'.
                 *
                 * If we were to return VM_FAULT_SIGBUS here, the non
                 * cooperative manager would be instead forced to
                 * always call UFFDIO_UNREGISTER before it can safely
                 * close the uffd.
                 */
                ret = VM_FAULT_NOPAGE;
                goto out;
        }

        /*
         * Check that we can return VM_FAULT_RETRY.
         *
         * NOTE: it should become possible to return VM_FAULT_RETRY
         * even if FAULT_FLAG_TRIED is set without leading to gup()
         * -EBUSY failures, if the userfaultfd is to be extended for
         * VM_UFFD_WP tracking and we intend to arm the userfault
         * without first stopping userland access to the memory. For
         * VM_UFFD_MISSING userfaults this is enough for now.
         */
        if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
                /*
                 * Validate the invariant that nowait must allow retry
                 * to be sure not to return SIGBUS erroneously on
                 * nowait invocations.
                 */
                BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
                if (printk_ratelimit()) {
                        printk(KERN_WARNING
                               "FAULT_FLAG_ALLOW_RETRY missing %x\n",
                               vmf->flags);
                        dump_stack();
                }
#endif
                goto out;
        }

        /*
         * Handle nowait, not much to do other than tell it to retry
         * and wait.
         */
        ret = VM_FAULT_RETRY;
        if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
                goto out;

        /* take the reference before dropping the mmap_lock */
        userfaultfd_ctx_get(ctx);

        /* Build the on-stack wait entry describing this fault. */
        init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
        uwq.wq.private = current;
        uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
                                reason, ctx->features);
        uwq.ctx = ctx;
        uwq.waken = false;

        blocking_state = userfaultfd_get_blocking_state(vmf->flags);

        /*
         * Take the vma lock now, in order to safely call
         * userfaultfd_huge_must_wait() later. Since acquiring the
         * (sleepable) vma lock can modify the current task state, that
         * must be before explicitly calling set_current_state().
         */
        if (is_vm_hugetlb_page(vma))
                hugetlb_vma_lock_read(vma);

        spin_lock_irq(&ctx->fault_pending_wqh.lock);
        /*
         * After the __add_wait_queue the uwq is visible to userland
         * through poll/read().
         */
        __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
        /*
         * The smp_mb() after __set_current_state prevents the reads
         * following the spin_unlock to happen before the list_add in
         * __add_wait_queue.
         */
        set_current_state(blocking_state);
        spin_unlock_irq(&ctx->fault_pending_wqh.lock);

        /*
         * Re-check the page tables now that the uwq is queued; the
         * result decides whether we actually go to sleep below.
         */
        if (!is_vm_hugetlb_page(vma))
                must_wait = userfaultfd_must_wait(ctx, vmf, reason);
        else
                must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
        if (is_vm_hugetlb_page(vma))
                hugetlb_vma_unlock_read(vma);
        release_fault_lock(vmf);

        /* Notify pollers/readers of the new fault, then sleep. */
        if (likely(must_wait && !READ_ONCE(ctx->released))) {
                wake_up_poll(&ctx->fd_wqh, EPOLLIN);
                schedule();
        }

        __set_current_state(TASK_RUNNING);

        /*
         * Here we race with the list_del; list_add in
         * userfaultfd_ctx_read(), however because we don't ever run
         * list_del_init() to refile across the two lists, the prev
         * and next pointers will never point to self. list_add also
         * would never let any of the two pointers to point to
         * self. So list_empty_careful won't risk to see both pointers
         * pointing to self at any time during the list refile. The
         * only case where list_del_init() is called is the full
         * removal in the wake function and there we don't re-list_add
         * and it's fine not to block on the spinlock. The uwq on this
         * kernel stack can be released after the list_del_init.
         */
        if (!list_empty_careful(&uwq.wq.entry)) {
                spin_lock_irq(&ctx->fault_pending_wqh.lock);
                /*
                 * No need of list_del_init(), the uwq on the stack
                 * will be freed shortly anyway.
                 */
                list_del(&uwq.wq.entry);
                spin_unlock_irq(&ctx->fault_pending_wqh.lock);
        }

        /*
         * ctx may go away after this if the userfault pseudo fd is
         * already released.
         */
        userfaultfd_ctx_put(ctx);

out:
        return ret;
}

/*
 * Queue the event @ewq on ctx->event_wqh and sleep (TASK_KILLABLE) until
 * its msg.event field has been cleared to 0, the ctx is released, or a
 * fatal signal is pending.
 *
 * If the wait is aborted while a UFFD_EVENT_FORK is still queued, the
 * new ctx carried in msg.arg.reserved.reserved1 is detached from every
 * vma of its mm and one reference to it is dropped.
 *
 * On every path, including the early PF_EXITING bail-out, this
 * decrements ctx->mmap_changing and drops one reference on @ctx.
 */
static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
                                              struct userfaultfd_wait_queue *ewq)
{
        struct userfaultfd_ctx *release_new_ctx;

        if (WARN_ON_ONCE(current->flags & PF_EXITING))
                goto out;

        ewq->ctx = ctx;
        init_waitqueue_entry(&ewq->wq, current);
        release_new_ctx = NULL;

        spin_lock_irq(&ctx->event_wqh.lock);
        /*
         * After the __add_wait_queue the uwq is visible to userland
         * through poll/read().
         */
        __add_wait_queue(&ctx->event_wqh, &ewq->wq);
        for (;;) {
                set_current_state(TASK_KILLABLE);
                /* event == 0 means the event has been completed. */
                if (ewq->msg.event == 0)
                        break;
                if (READ_ONCE(ctx->released) ||
                    fatal_signal_pending(current)) {
                        /*
                         * &ewq->wq may be queued in fork_event, but
                         * __remove_wait_queue ignores the head
                         * parameter. It would be a problem if it
                         * didn't.
                         */
                        __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
                        if (ewq->msg.event == UFFD_EVENT_FORK) {
                                struct userfaultfd_ctx *new;

                                /* The new ctx pointer travels in reserved1. */
                                new = (struct userfaultfd_ctx *)
                                        (unsigned long)
                                        ewq->msg.arg.reserved.reserved1;
                                release_new_ctx = new;
                        }
                        break;
                }

                /* Drop the lock while notifying pollers and sleeping. */
                spin_unlock_irq(&ctx->event_wqh.lock);

                wake_up_poll(&ctx->fd_wqh, EPOLLIN);
                schedule();

                spin_lock_irq(&ctx->event_wqh.lock);
        }
        __set_current_state(TASK_RUNNING);
        spin_unlock_irq(&ctx->event_wqh.lock);

        if (release_new_ctx) {
                struct vm_area_struct *vma;
                struct mm_struct *mm = release_new_ctx->mm;
                VMA_ITERATOR(vmi, mm, 0);

                /* the various vma->vm_userfaultfd_ctx still points to it */
                mmap_write_lock(mm);
                for_each_vma(vmi, vma) {
                        if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
                                vma_start_write(vma);
                                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
                                userfaultfd_set_vm_flags(vma,
                                                         vma->vm_flags & ~__VM_UFFD_FLAGS);
                        }
                }
                mmap_write_unlock(mm);

                userfaultfd_ctx_put(release_new_ctx);
        }

        /*
         * ctx may go away after this if the userfault pseudo fd is
         * already released.
         */
out:
        atomic_dec(&ctx->mmap_changing);
        VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
        userfaultfd_ctx_put(ctx);
}

/*
 * Mark the event @ewq as completed: clear msg.event (the condition the
 * waiter in userfaultfd_event_wait_completion() checks), wake the
 * waiters on ctx->event_wqh and unlink @ewq from the queue.
 *
 * Uses the _locked wake-up variant; the callers visible in this file
 * invoke it with ctx->event_wqh.lock held.
 */
static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
                                       struct userfaultfd_wait_queue *ewq)
{
        ewq->msg.event = 0;
        wake_up_locked(&ctx->event_wqh);
        __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
}

int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
{
        struct userfaultfd_ctx *ctx = NULL, *octx;
        struct userfaultfd_fork_ctx *fctx;

        octx = vma->vm_userfaultfd_ctx.ctx;
        if (!octx)
                return 0;

        if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) {
                vma_start_write(vma);
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
                userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
                return 0;
        }

        list_for_each_entry(fctx, fcs, list)
                if (fctx->orig == octx) {
                        ctx = fctx->new;
                        break;
                }

        if (!ctx) {
                fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
                if (!fctx)
                        return -ENOMEM;

                ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
                if (!ctx) {
                        kfree(fctx);
                        return -ENOMEM;
                }

                refcount_set(&ctx->refcount, 1);
                ctx->flags = octx->flags;
                ctx->features = octx->features;
                ctx->released = false;
                init_rwsem(&ctx->map_changing_lock);
                atomic_set(&ctx->mmap_changing, 0);
                ctx->mm = vma->vm_mm;
                mmgrab(ctx->mm);

                userfaultfd_ctx_get(octx);
                down_write(&octx->map_changing_lock);
                atomic_inc(&octx->mmap_changing);
                up_write(&octx->map_changing_lock);
                fctx->orig = octx;
                fctx->new = ctx;
                list_add_tail(&fctx->list, fcs);
        }

        vma->vm_userfaultfd_ctx.ctx = ctx;
        return 0;
}

static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
{
        struct userfaultfd_ctx *ctx = fctx->orig;
        struct userfaultfd_wait_queue ewq;

        msg_init(&ewq.msg);

        ewq.msg.event = UFFD_EVENT_FORK;
        ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;

        userfaultfd_event_wait_completion(ctx, &ewq);
}

/*
 * Send the fork event for every entry collected on @fcs by
 * dup_userfaultfd(), then unlink and free the entry.
 */
void dup_userfaultfd_complete(struct list_head *fcs)
{
        struct userfaultfd_fork_ctx *cur, *next;

        list_for_each_entry_safe(cur, next, fcs, list) {
                dup_fctx(cur);
                list_del(&cur->list);
                kfree(cur);
        }
}

/*
 * Prepare @vma's userfaultfd context for a remap.
 *
 * With UFFD_FEATURE_EVENT_REMAP the ctx is stored in @vm_ctx, a
 * reference is taken and mmap_changing is incremented. Without it the
 * ctx and the uffd vm_flags are dropped from @vma. A vma without a ctx
 * is left untouched.
 */
void mremap_userfaultfd_prep(struct vm_area_struct *vma,
                             struct vm_userfaultfd_ctx *vm_ctx)
{
        struct userfaultfd_ctx *uffd = vma->vm_userfaultfd_ctx.ctx;

        if (!uffd)
                return;

        if (!(uffd->features & UFFD_FEATURE_EVENT_REMAP)) {
                /* Drop uffd context if remap feature not enabled */
                vma_start_write(vma);
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
                userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
                return;
        }

        vm_ctx->ctx = uffd;
        userfaultfd_ctx_get(uffd);
        down_write(&uffd->map_changing_lock);
        atomic_inc(&uffd->mmap_changing);
        up_write(&uffd->map_changing_lock);
}

void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
                                 unsigned long from, unsigned long to,
                                 unsigned long len)
{
        struct userfaultfd_ctx *ctx = vm_ctx->ctx;
        struct userfaultfd_wait_queue ewq;

        if (!ctx)
                return;

        if (to & ~PAGE_MASK) {
                userfaultfd_ctx_put(ctx);
                return;
        }

        msg_init(&ewq.msg);

        ewq.msg.event = UFFD_EVENT_REMAP;
        ewq.msg.arg.remap.from = from;
        ewq.msg.arg.remap.to = to;
        ewq.msg.arg.remap.len = len;

        userfaultfd_event_wait_completion(ctx, &ewq);
}

/*
 * Deliver a UFFD_EVENT_REMOVE for [@start, @end) on @vma's ctx and wait
 * for it to complete.
 *
 * Returns true, without touching any lock, when the vma has no ctx or
 * the ctx did not request UFFD_FEATURE_EVENT_REMOVE. Otherwise the mmap
 * read lock of vma->vm_mm is released before the event is sent and false
 * is returned, so a false return means the lock is no longer held.
 */
bool userfaultfd_remove(struct vm_area_struct *vma,
                        unsigned long start, unsigned long end)
{
        struct mm_struct *mm = vma->vm_mm;
        struct userfaultfd_ctx *ctx;
        struct userfaultfd_wait_queue ewq;

        ctx = vma->vm_userfaultfd_ctx.ctx;
        if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
                return true;

        /* Pin the ctx and bump mmap_changing before dropping the mmap lock. */
        userfaultfd_ctx_get(ctx);
        down_write(&ctx->map_changing_lock);
        atomic_inc(&ctx->mmap_changing);
        up_write(&ctx->map_changing_lock);
        mmap_read_unlock(mm);

        msg_init(&ewq.msg);

        ewq.msg.event = UFFD_EVENT_REMOVE;
        ewq.msg.arg.remove.start = start;
        ewq.msg.arg.remove.end = end;

        userfaultfd_event_wait_completion(ctx, &ewq);

        return false;
}

static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
                          unsigned long start, unsigned long end)
{
        struct userfaultfd_unmap_ctx *unmap_ctx;

        list_for_each_entry(unmap_ctx, unmaps, list)
                if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
                    unmap_ctx->end == end)
                        return true;

        return false;
}

/*
 * Record on @unmaps that an unmap event for [@start, @end) must be sent
 * to @vma's userfaultfd ctx.
 *
 * Nothing is recorded when the vma has no ctx, the ctx did not request
 * UFFD_FEATURE_EVENT_UNMAP, or an identical entry is already queued.
 * A new entry takes a reference on the ctx and increments its
 * mmap_changing counter.
 *
 * Returns 0 on success or -ENOMEM if the entry cannot be allocated.
 */
int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
                           unsigned long end, struct list_head *unmaps)
{
        struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
        struct userfaultfd_unmap_ctx *entry;

        if (!ctx)
                return 0;
        if (!(ctx->features & UFFD_FEATURE_EVENT_UNMAP))
                return 0;
        if (has_unmap_ctx(ctx, unmaps, start, end))
                return 0;

        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        userfaultfd_ctx_get(ctx);
        down_write(&ctx->map_changing_lock);
        atomic_inc(&ctx->mmap_changing);
        up_write(&ctx->map_changing_lock);

        entry->ctx = ctx;
        entry->start = start;
        entry->end = end;
        list_add_tail(&entry->list, unmaps);

        return 0;
}

/*
 * Send a UFFD_EVENT_UNMAP for every entry queued on @uf, waiting for
 * each one to complete, then unlink and free the entry. The range is
 * passed through the message's "remove" argument fields. @mm is not
 * used by this function.
 */
void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
{
        struct userfaultfd_unmap_ctx *entry, *tmp;
        struct userfaultfd_wait_queue ewq;

        list_for_each_entry_safe(entry, tmp, uf, list) {
                msg_init(&ewq.msg);

                ewq.msg.event = UFFD_EVENT_UNMAP;
                ewq.msg.arg.remove.start = entry->start;
                ewq.msg.arg.remove.end = entry->end;

                userfaultfd_event_wait_completion(entry->ctx, &ewq);

                list_del(&entry->list);
                kfree(entry);
        }
}

/*
 * Tear down the userfaultfd context stored in file->private_data.
 *
 * Marks the ctx as released, then (if the mm is still alive) detaches
 * the ctx and the uffd vm_flags from every vma that points at it while
 * holding the mmap write lock. Finally wakes everything sleeping on the
 * fault, event and fd wait queues and drops one ctx reference.
 * Always returns 0.
 */
static int userfaultfd_release(struct inode *inode, struct file *file)
{
        struct userfaultfd_ctx *ctx = file->private_data;
        struct mm_struct *mm = ctx->mm;
        struct vm_area_struct *vma, *prev;
        /* len == 0 means wake all */
        struct userfaultfd_wake_range range = { .len = 0, };
        unsigned long new_flags;
        VMA_ITERATOR(vmi, mm, 0);

        WRITE_ONCE(ctx->released, true);

        /* The mm is already gone: no vmas to fix up, just wake waiters. */
        if (!mmget_not_zero(mm))
                goto wakeup;

        /*
         * Flush page faults out of all CPUs. NOTE: all page faults
         * must be retried without returning VM_FAULT_SIGBUS if
         * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
         * changes while handle_userfault released the mmap_lock. So
         * it's critical that released is set to true (above), before
         * taking the mmap_lock for writing.
         */
        mmap_write_lock(mm);
        prev = NULL;
        for_each_vma(vmi, vma) {
                cond_resched();
                /* A vma has a ctx if and only if it has uffd flags set. */
                BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
                       !!(vma->vm_flags & __VM_UFFD_FLAGS));
                if (vma->vm_userfaultfd_ctx.ctx != ctx) {
                        prev = vma;
                        continue;
                }
                /* Reset ptes for the whole vma range if wr-protected */
                if (userfaultfd_wp(vma))
                        uffd_wp_range(vma, vma->vm_start,
                                      vma->vm_end - vma->vm_start, false);
                new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
                /* The returned vma replaces the iteration cursor. */
                vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
                                            vma->vm_end, new_flags,
                                            NULL_VM_UFFD_CTX);

                vma_start_write(vma);
                userfaultfd_set_vm_flags(vma, new_flags);
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;

                prev = vma;
        }
        mmap_write_unlock(mm);
        mmput(mm);
wakeup:
        /*
         * After no new page faults can wait on this fault_*wqh, flush
         * the last page faults that may have been already waiting on
         * the fault_*wqh.
         */
        spin_lock_irq(&ctx->fault_pending_wqh.lock);
        __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
        __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
        spin_unlock_irq(&ctx->fault_pending_wqh.lock);

        /* Flush pending events that may still wait on event_wqh */
        wake_up_all(&ctx->event_wqh);

        wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
        userfaultfd_ctx_put(ctx);
        return 0;
}

/*
 * Return the userfault wait entry at the tail of @wqh, or NULL when the
 * queue has no waiters. The caller must hold wqh->lock.
 */
static inline struct userfaultfd_wait_queue *find_userfault_in(
                wait_queue_head_t *wqh)
{
        wait_queue_entry_t *tail;

        lockdep_assert_held(&wqh->lock);

        if (!waitqueue_active(wqh))
                return NULL;

        /* walk in reverse to provide FIFO behavior to read userfaults */
        tail = list_last_entry(&wqh->head, typeof(*tail), entry);
        return container_of(tail, struct userfaultfd_wait_queue, wq);
}

static inline struct userfaultfd_wait_queue *find_userfault(
                struct userfaultfd_ctx *ctx)
{
        return find_userfault_in(&ctx->fault_pending_wqh);
}

static inline struct userfaultfd_wait_queue *find_userfault_evt(
                struct userfaultfd_ctx *ctx)
{
        return find_userfault_in(&ctx->event_wqh);
}

/*
 * poll() handler for a userfaultfd.
 *
 * Returns EPOLLERR if the ctx is not initialized yet or the file was
 * not opened with O_NONBLOCK, EPOLLIN if a page fault or an event is
 * queued, and 0 otherwise.
 */
static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
{
        struct userfaultfd_ctx *ctx = file->private_data;

        poll_wait(file, &ctx->fd_wqh, wait);

        if (!userfaultfd_is_initialized(ctx))
                return EPOLLERR;

        /*
         * poll() never guarantees that a following read won't block,
         * because userfaults can be woken before they're read().
         * Blocking file descriptors are therefore refused.
         */
        if (unlikely(!(file->f_flags & O_NONBLOCK)))
                return EPOLLERR;

        /*
         * The queues are inspected without taking their locks. The last
         * thing __pollwait does is add_wait_queue(), but its spin_unlock
         * would allow the waitqueue_active() reads below to pass above
         * the list_add done inside that critical section. A full memory
         * barrier serializes the list_add write with those reads.
         */
        smp_mb();
        if (waitqueue_active(&ctx->fault_pending_wqh) ||
            waitqueue_active(&ctx->event_wqh))
                return EPOLLIN;

        return 0;
}

/*
 * Forward (tentative) declaration so resolve_userfault_fork() can take
 * the address of the fops; the initialised definition is expected
 * elsewhere in this file.
 */
static const struct file_operations userfaultfd_fops;

/*
 * Create a new "[userfaultfd]" file descriptor backed by the ctx @new
 * and store it in the fork message @msg, clearing the reserved1 field
 * first.
 *
 * Returns 0 on success or the negative error from
 * anon_inode_create_getfd(), in which case @msg is left untouched.
 */
static int resolve_userfault_fork(struct userfaultfd_ctx *new,
                                  struct inode *inode,
                                  struct uffd_msg *msg)
{
        int open_flags = O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS);
        int fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops,
                                         new, open_flags, inode);

        if (fd < 0)
                return fd;

        msg->arg.reserved.reserved1 = 0;
        msg->arg.fork.ufd = fd;
        return 0;
}

/*
 * Fetch one message from @ctx into @msg on behalf of a reader.
 *
 * Pending page faults are served first: the wait entry is refiled from
 * fault_pending_wqh to fault_wqh and its message copied out. If there
 * is none, the next queued event is served. A fork event additionally
 * needs a new file descriptor, which is created after all spinlocks
 * have been dropped.
 *
 * Returns 0 with *@msg filled in, -ERESTARTSYS if a signal is pending,
 * -EAGAIN if nothing is queued and @no_wait is set, or the error from
 * resolve_userfault_fork() for a fork event. Without @no_wait the
 * caller sleeps (TASK_INTERRUPTIBLE) on ctx->fd_wqh until something
 * shows up.
 */
static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
                                    struct uffd_msg *msg, struct inode *inode)
{
        ssize_t ret;
        DECLARE_WAITQUEUE(wait, current);
        struct userfaultfd_wait_queue *uwq;
        /*
         * Handling fork event requires sleeping operations, so
         * we drop the event_wqh lock, then do these ops, then
         * lock it back and wake up the waiter. While the lock is
         * dropped the ewq may go away so we keep track of it
         * carefully.
         */
        LIST_HEAD(fork_event);
        struct userfaultfd_ctx *fork_nctx = NULL;

        /* always take the fd_wqh lock before the fault_pending_wqh lock */
        spin_lock_irq(&ctx->fd_wqh.lock);
        __add_wait_queue(&ctx->fd_wqh, &wait);
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                /* 1) Page faults take priority over events. */
                spin_lock(&ctx->fault_pending_wqh.lock);
                uwq = find_userfault(ctx);
                if (uwq) {
                        /*
                         * Use a seqcount to repeat the lockless check
                         * in wake_userfault() to avoid missing
                         * wakeups because during the refile both
                         * waitqueue could become empty if this is the
                         * only userfault.
                         */
                        write_seqcount_begin(&ctx->refile_seq);

                        /*
                         * The fault_pending_wqh.lock prevents the uwq
                         * to disappear from under us.
                         *
                         * Refile this userfault from
                         * fault_pending_wqh to fault_wqh, it's not
                         * pending anymore after we read it.
                         *
                         * Use list_del() by hand (as
                         * userfaultfd_wake_function also uses
                         * list_del_init() by hand) to be sure nobody
                         * changes __remove_wait_queue() to use
                         * list_del_init() in turn breaking the
                         * !list_empty_careful() check in
                         * handle_userfault(). The uwq->wq.head list
                         * must never be empty at any time during the
                         * refile, or the waitqueue could disappear
                         * from under us. The "wait_queue_head_t"
                         * parameter of __remove_wait_queue() is unused
                         * anyway.
                         */
                        list_del(&uwq->wq.entry);
                        add_wait_queue(&ctx->fault_wqh, &uwq->wq);

                        write_seqcount_end(&ctx->refile_seq);

                        /* careful to always initialize msg if ret == 0 */
                        *msg = uwq->msg;
                        spin_unlock(&ctx->fault_pending_wqh.lock);
                        ret = 0;
                        break;
                }
                spin_unlock(&ctx->fault_pending_wqh.lock);

                /* 2) No page fault pending: look for a queued event. */
                spin_lock(&ctx->event_wqh.lock);
                uwq = find_userfault_evt(ctx);
                if (uwq) {
                        *msg = uwq->msg;

                        if (uwq->msg.event == UFFD_EVENT_FORK) {
                                /*
                                 * Park the event on the private
                                 * fork_event list until the new fd has
                                 * been created outside the spinlocks.
                                 */
                                fork_nctx = (struct userfaultfd_ctx *)
                                        (unsigned long)
                                        uwq->msg.arg.reserved.reserved1;
                                list_move(&uwq->wq.entry, &fork_event);
                                /*
                                 * fork_nctx can be freed as soon as
                                 * we drop the lock, unless we take a
                                 * reference on it.
                                 */
                                userfaultfd_ctx_get(fork_nctx);
                                spin_unlock(&ctx->event_wqh.lock);
                                ret = 0;
                                break;
                        }

                        userfaultfd_event_complete(ctx, uwq);
                        spin_unlock(&ctx->event_wqh.lock);
                        ret = 0;
                        break;
                }
                spin_unlock(&ctx->event_wqh.lock);

                /* 3) Nothing queued: bail out or sleep and retry. */
                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }
                if (no_wait) {
                        ret = -EAGAIN;
                        break;
                }
                spin_unlock_irq(&ctx->fd_wqh.lock);
                schedule();
                spin_lock_irq(&ctx->fd_wqh.lock);
        }
        __remove_wait_queue(&ctx->fd_wqh, &wait);
        __set_current_state(TASK_RUNNING);
        spin_unlock_irq(&ctx->fd_wqh.lock);

        if (!ret && msg->event == UFFD_EVENT_FORK) {
                /* May sleep, hence done with every spinlock dropped. */
                ret = resolve_userfault_fork(fork_nctx, inode, msg);
                spin_lock_irq(&ctx->event_wqh.lock);
                if (!list_empty(&fork_event)) {
                        /*
                         * The fork thread didn't abort, so we can
                         * drop the temporary refcount.
                         */
                        userfaultfd_ctx_put(fork_nctx);

                        uwq = list_first_entry(&fork_event,
                                               typeof(*uwq),
                                               wq.entry);
                        /*
                         * If fork_event list wasn't empty and in turn
                         * the event wasn't already released by fork
                         * (the event is allocated on fork kernel
                         * stack), put the event back to its place in
                         * the event_wq. fork_event head will be freed
                         * as soon as we return so the event cannot
                         * stay queued there no matter the current
                         * "ret" value.
                         */
                        list_del(&uwq->wq.entry);
                        __add_wait_queue(&ctx->event_wqh, &uwq->wq);

                        /*
                         * Leave the event in the waitqueue and report
                         * error to userland if we failed to resolve
                         * the userfault fork.
                         */
                        if (likely(!ret))
                                userfaultfd_event_complete(ctx, uwq);
                } else {
                        /*
                         * Here the fork thread aborted and the
                         * refcount from the fork thread on fork_nctx
                         * has already been released. We still hold
                         * the reference we took before releasing the
                         * lock above. If resolve_userfault_fork
                         * failed we've to drop it because the
                         * fork_nctx has to be freed in such case. If
                         * it succeeded we'll hold it because the new
                         * uffd references it.
                         */
                        if (ret)
                                userfaultfd_ctx_put(fork_nctx);
                }
                spin_unlock_irq(&ctx->event_wqh.lock);
        }

        return ret;
}

/*
 * read_iter handler: copy as many whole struct uffd_msg records into
 * @to as fit and are available.
 *
 * Only the first message may block (unless O_NONBLOCK or IOCB_NOWAIT is
 * set); later ones are fetched without waiting. Returns the number of
 * bytes copied if at least one message was delivered. Otherwise returns
 * -EINVAL if the ctx is not initialized or the buffer cannot hold one
 * message, -EFAULT if the copy fails, or the error from
 * userfaultfd_ctx_read().
 */
static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct userfaultfd_ctx *ctx = file->private_data;
        struct inode *inode = file_inode(file);
        struct uffd_msg msg;
        ssize_t copied = 0;
        ssize_t err;
        bool no_wait;

        if (!userfaultfd_is_initialized(ctx))
                return -EINVAL;

        no_wait = file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT;

        while (iov_iter_count(to) >= sizeof(msg)) {
                err = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
                if (err < 0)
                        goto done;

                if (!copy_to_iter_full(&msg, sizeof(msg), to)) {
                        err = -EFAULT;
                        goto done;
                }
                copied += sizeof(msg);

                /*
                 * Allow to read more than one fault at time but only
                 * block if waiting for the very first one.
                 */
                no_wait = true;
        }
        /* No room left for another full message. */
        err = -EINVAL;
done:
        return copied ? copied : err;
}

static void __wake_userfault(struct userfaultfd_ctx *ctx,
                             struct userfaultfd_wake_range *range)
{
        spin_lock_irq(&ctx->fault_pending_wqh.lock);
        /* wake all in the range and autoremove */
        if (waitqueue_active(&ctx->fault_pending_wqh))
                __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
                                     range);
        if (waitqueue_active(&ctx->fault_wqh))
                __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
        spin_unlock_irq(&ctx->fault_pending_wqh.lock);
}

/*
 * Wake the userfaults waiting on @range, taking the wait queue lock only
 * when there is something to wake.
 *
 * @ctx:   userfaultfd context whose wait queues are inspected.
 * @range: range handed to the wakeup as the wake key.
 *
 * Whether any waiter exists is sampled locklessly on both wait queues
 * inside a refile_seq read section; the sample is repeated whenever
 * read_seqcount_retry() reports that the sequence changed meanwhile.
 * Only if a waiter was seen is __wake_userfault() called.
 */
static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
                                           struct userfaultfd_wake_range *range)
{
        unsigned seq;
        bool need_wakeup;

        /*
         * To be sure waitqueue_active() is not reordered by the CPU
         * before the pagetable update, use an explicit SMP memory
         * barrier here. PT lock release or mmap_read_unlock(mm) still
         * have release semantics that can allow the
         * waitqueue_active() to be reordered before the pte update.
         */
        smp_mb();

        /*
         * Use waitqueue_active because it's very frequent to
         * change the address space atomically even if there are no
         * userfaults yet. So we take the spinlock only when we're
         * sure we've userfaults to wake.
         */
        do {
                seq = read_seqcount_begin(&ctx->refile_seq);
                need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
                        waitqueue_active(&ctx->fault_wqh);
                cond_resched();
        } while (read_seqcount_retry(&ctx->refile_seq, seq));
        if (need_wakeup)
                __wake_userfault(ctx, range);
}

/*
 * Check that [start, start + len) is a usable user address range for @mm.
 * @start itself is not required to be page aligned.
 *
 * Returns 0 if the range is acceptable, -EINVAL otherwise.
 */
static __always_inline int validate_unaligned_range(
        struct mm_struct *mm, __u64 start, __u64 len)
{
        const __u64 limit = mm->task_size;

        /* The length must be a non-zero multiple of the page size. */
        if (!len || (len & ~PAGE_MASK))
                return -EINVAL;

        /* The start must lie within [mmap_min_addr, task_size). */
        if (start < mmap_min_addr || start >= limit)
                return -EINVAL;

        /* The range must not extend past task_size ... */
        if (len > limit - start)
                return -EINVAL;

        /* ... and start + len must not wrap around. */
        if (start + len <= start)
                return -EINVAL;

        return 0;
}

/*
 * Same checks as validate_unaligned_range(), with the extra requirement
 * that @start is page aligned.  Returns 0 or -EINVAL.
 */
static __always_inline int validate_range(struct mm_struct *mm,
                                          __u64 start, __u64 len)
{
        const bool start_aligned = !(start & ~PAGE_MASK);

        return start_aligned ? validate_unaligned_range(mm, start, len)
                             : -EINVAL;
}

/*
 * UFFDIO_REGISTER handler: register a user-supplied address range with
 * this userfaultfd context in the requested tracking mode(s).
 *
 * @ctx: userfaultfd context the range is registered into.
 * @arg: user pointer to a struct uffdio_register.
 *
 * The work is done in two passes under mmap_write_lock(). The first pass
 * validates every vma overlapping the range and bails out before anything
 * is modified. The second pass updates the vmas one by one; if
 * vma_modify_flags_uffd() fails part-way, the vmas already updated stay
 * registered and the error is returned.
 *
 * On success the set of ioctls usable on the range is written to the
 * "ioctls" field of the user structure.
 *
 * Returns 0 on success or a negative errno:
 *   -EFAULT  copying the request in or the ioctls mask out failed
 *   -EINVAL  empty/unknown mode, mode not supported by this build, invalid
 *            range, no vma in the range, incompatible vma, or a range
 *            boundary not aligned to the huge page size of a hugetlb vma
 *   -ENOMEM  mmget_not_zero() failed
 *   -EPERM   a vma in the range lacks VM_MAYWRITE
 *   -EBUSY   a vma in the range belongs to a different userfaultfd
 *   or the error returned by vma_modify_flags_uffd().
 */
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                                unsigned long arg)
{
        struct mm_struct *mm = ctx->mm;
        struct vm_area_struct *vma, *prev, *cur;
        int ret;
        struct uffdio_register uffdio_register;
        struct uffdio_register __user *user_uffdio_register;
        unsigned long vm_flags, new_flags;
        bool found;
        bool basic_ioctls;
        unsigned long start, end, vma_end;
        struct vma_iterator vmi;
        bool wp_async = userfaultfd_wp_async_ctx(ctx);

        user_uffdio_register = (struct uffdio_register __user *) arg;

        /* The trailing __u64 (the "ioctls" output field) is not read. */
        ret = -EFAULT;
        if (copy_from_user(&uffdio_register, user_uffdio_register,
                           sizeof(uffdio_register)-sizeof(__u64)))
                goto out;

        /* Translate the requested register modes into VM_UFFD_* flags. */
        ret = -EINVAL;
        if (!uffdio_register.mode)
                goto out;
        if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
                goto out;
        vm_flags = 0;
        if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
                vm_flags |= VM_UFFD_MISSING;
        if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
                goto out;
#endif
                vm_flags |= VM_UFFD_WP;
        }
        if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
                goto out;
#endif
                vm_flags |= VM_UFFD_MINOR;
        }

        ret = validate_range(mm, uffdio_register.range.start,
                             uffdio_register.range.len);
        if (ret)
                goto out;

        start = uffdio_register.range.start;
        end = start + uffdio_register.range.len;

        /* Hold an mm reference for as long as the vmas are walked. */
        ret = -ENOMEM;
        if (!mmget_not_zero(mm))
                goto out;

        ret = -EINVAL;
        mmap_write_lock(mm);
        vma_iter_init(&vmi, mm, start);
        vma = vma_find(&vmi, end);
        if (!vma)
                goto out_unlock;

        /*
         * If the first vma contains huge pages, make sure start address
         * is aligned to huge page size.
         */
        if (is_vm_hugetlb_page(vma)) {
                unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

                if (start & (vma_hpagesize - 1))
                        goto out_unlock;
        }

        /*
         * Search for not compatible vmas.
         */
        found = false;
        basic_ioctls = false;
        cur = vma;
        do {
                cond_resched();

                /* A vma has a uffd ctx if and only if it has uffd flags. */
                BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
                       !!(cur->vm_flags & __VM_UFFD_FLAGS));

                /* check not compatible vmas */
                ret = -EINVAL;
                if (!vma_can_userfault(cur, vm_flags, wp_async))
                        goto out_unlock;

                /*
                 * UFFDIO_COPY will fill file holes even without
                 * PROT_WRITE. This check enforces that if this is a
                 * MAP_SHARED, the process has write permission to the backing
                 * file. If VM_MAYWRITE is set it also enforces that on a
                 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
                 * F_WRITE_SEAL can be taken until the vma is destroyed.
                 */
                ret = -EPERM;
                if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
                        goto out_unlock;

                /*
                 * If this vma contains ending address, and huge pages
                 * check alignment.
                 */
                if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
                    end > cur->vm_start) {
                        unsigned long vma_hpagesize = vma_kernel_pagesize(cur);

                        ret = -EINVAL;

                        if (end & (vma_hpagesize - 1))
                                goto out_unlock;
                }
                /*
                 * NOTE(review): this branch cannot be taken, the
                 * unconditional VM_MAYWRITE check above has already
                 * bailed out for any vma lacking VM_MAYWRITE.
                 */
                if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
                        goto out_unlock;

                /*
                 * Check that this vma isn't already owned by a
                 * different userfaultfd. We can't allow more than one
                 * userfaultfd to own a single vma simultaneously or we
                 * wouldn't know which one to deliver the userfaults to.
                 */
                ret = -EBUSY;
                if (cur->vm_userfaultfd_ctx.ctx &&
                    cur->vm_userfaultfd_ctx.ctx != ctx)
                        goto out_unlock;

                /*
                 * Note vmas containing huge pages
                 */
                if (is_vm_hugetlb_page(cur))
                        basic_ioctls = true;

                found = true;
        } for_each_vma_range(vmi, cur, end);
        BUG_ON(!found);

        /*
         * Rewind the iterator for the modification pass. prev is the vma
         * before the range, or the first vma itself when the range starts
         * in the middle of it.
         */
        vma_iter_set(&vmi, start);
        prev = vma_prev(&vmi);
        if (vma->vm_start < start)
                prev = vma;

        ret = 0;
        for_each_vma_range(vmi, vma, end) {
                cond_resched();

                /* All of these were already verified by the first pass. */
                BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
                BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
                       vma->vm_userfaultfd_ctx.ctx != ctx);
                WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

                /*
                 * Nothing to do: this vma is already registered into this
                 * userfaultfd and with the right tracking mode too.
                 */
                if (vma->vm_userfaultfd_ctx.ctx == ctx &&
                    (vma->vm_flags & vm_flags) == vm_flags)
                        goto skip;

                /* Clamp [start, vma_end) to the part of this vma in range. */
                if (vma->vm_start > start)
                        start = vma->vm_start;
                vma_end = min(end, vma->vm_end);

                new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
                vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
                                            new_flags,
                                            (struct vm_userfaultfd_ctx){ctx});
                if (IS_ERR(vma)) {
                        ret = PTR_ERR(vma);
                        break;
                }

                /*
                 * In the vma_merge() successful mprotect-like case 8:
                 * the next vma was merged into the current one and
                 * the current one has not been updated yet.
                 */
                vma_start_write(vma);
                userfaultfd_set_vm_flags(vma, new_flags);
                vma->vm_userfaultfd_ctx.ctx = ctx;

                if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
                        hugetlb_unshare_all_pmds(vma);

        skip:
                prev = vma;
                start = vma->vm_end;
        }

out_unlock:
        mmap_write_unlock(mm);
        mmput(mm);
        if (!ret) {
                __u64 ioctls_out;

                /* Ranges containing a hugetlb vma get the basic ioctl set. */
                ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
                    UFFD_API_RANGE_IOCTLS;

                /*
                 * Declare the WP ioctl only if the WP mode is
                 * specified and all checks passed with the range
                 */
                if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
                        ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);

                /* CONTINUE ioctl is only supported for MINOR ranges. */
                if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
                        ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);

                /*
                 * Now that we scanned all vmas we can already tell
                 * userland which ioctls methods are guaranteed to
                 * succeed on this range.
                 */
                if (put_user(ioctls_out, &user_uffdio_register->ioctls))
                        ret = -EFAULT;
        }
out:
        return ret;
}

/*
 * UFFDIO_UNREGISTER handler: remove userfaultfd tracking from every vma
 * overlapping a user-supplied address range.
 *
 * @ctx: userfaultfd context the ioctl was issued on.
 * @arg: user pointer to a struct uffdio_range.
 *
 * Like registration this runs in two passes under mmap_write_lock(): a
 * validation pass that fails before anything is modified, then a pass
 * that clears the uffd flags and context of each registered vma. If
 * vma_modify_flags_uffd() fails part-way, the vmas already processed stay
 * unregistered and the error is returned.
 *
 * Returns 0 on success or a negative errno:
 *   -EFAULT  copying the request in failed
 *   -EINVAL  invalid range, no vma in the range, start not aligned to the
 *            huge page size of a hugetlb first vma, or incompatible vma
 *   -ENOMEM  mmget_not_zero() failed
 *   or the error returned by vma_modify_flags_uffd().
 */
static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
                                  unsigned long arg)
{
        struct mm_struct *mm = ctx->mm;
        struct vm_area_struct *vma, *prev, *cur;
        int ret;
        struct uffdio_range uffdio_unregister;
        unsigned long new_flags;
        bool found;
        unsigned long start, end, vma_end;
        const void __user *buf = (void __user *)arg;
        struct vma_iterator vmi;
        bool wp_async = userfaultfd_wp_async_ctx(ctx);

        ret = -EFAULT;
        if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
                goto out;

        ret = validate_range(mm, uffdio_unregister.start,
                             uffdio_unregister.len);
        if (ret)
                goto out;

        start = uffdio_unregister.start;
        end = start + uffdio_unregister.len;

        /* Hold an mm reference for as long as the vmas are walked. */
        ret = -ENOMEM;
        if (!mmget_not_zero(mm))
                goto out;

        mmap_write_lock(mm);
        ret = -EINVAL;
        vma_iter_init(&vmi, mm, start);
        vma = vma_find(&vmi, end);
        if (!vma)
                goto out_unlock;

        /*
         * If the first vma contains huge pages, make sure start address
         * is aligned to huge page size.
         */
        if (is_vm_hugetlb_page(vma)) {
                unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

                if (start & (vma_hpagesize - 1))
                        goto out_unlock;
        }

        /*
         * Search for not compatible vmas.
         */
        found = false;
        cur = vma;
        do {
                cond_resched();

                /* A vma has a uffd ctx if and only if it has uffd flags. */
                BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
                       !!(cur->vm_flags & __VM_UFFD_FLAGS));

                /*
                 * Check not compatible vmas, not strictly required
                 * here as not compatible vmas cannot have an
                 * userfaultfd_ctx registered on them, but this
                 * provides for more strict behavior to notice
                 * unregistration errors.
                 */
                if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
                        goto out_unlock;

                found = true;
        } for_each_vma_range(vmi, cur, end);
        BUG_ON(!found);

        /*
         * Rewind the iterator for the modification pass. prev is the vma
         * before the range, or the first vma itself when the range starts
         * in the middle of it.
         */
        vma_iter_set(&vmi, start);
        prev = vma_prev(&vmi);
        if (vma->vm_start < start)
                prev = vma;

        ret = 0;
        for_each_vma_range(vmi, vma, end) {
                cond_resched();

                /* Already verified by the first pass. */
                BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));

                /*
                 * Nothing to do: this vma is not registered with any
                 * userfaultfd.
                 */
                if (!vma->vm_userfaultfd_ctx.ctx)
                        goto skip;

                WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

                /* Clamp [start, vma_end) to the part of this vma in range. */
                if (vma->vm_start > start)
                        start = vma->vm_start;
                vma_end = min(end, vma->vm_end);

                if (userfaultfd_missing(vma)) {
                        /*
                         * Wake any concurrent pending userfault while
                         * we unregister, so they will not hang
                         * permanently and it avoids userland to call
                         * UFFDIO_WAKE explicitly.
                         */
                        struct userfaultfd_wake_range range;
                        range.start = start;
                        range.len = vma_end - start;
                        wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
                }

                /* Reset ptes for the whole vma range if wr-protected */
                if (userfaultfd_wp(vma))
                        uffd_wp_range(vma, start, vma_end - start, false);

                new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
                vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
                                            new_flags, NULL_VM_UFFD_CTX);
                if (IS_ERR(vma)) {
                        ret = PTR_ERR(vma);
                        break;
                }

                /*
                 * In the vma_merge() successful mprotect-like case 8:
                 * the next vma was merged into the current one and
                 * the current one has not been updated yet.
                 */
                vma_start_write(vma);
                userfaultfd_set_vm_flags(vma, new_flags);
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;

        skip:
                prev = vma;
                start = vma->vm_end;
        }

out_unlock:
        mmap_write_unlock(mm);
        mmput(mm);
out:
        return ret;
}

/*
 * userfaultfd_wake may be used in combination with the
 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
 */
static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
                            unsigned long arg)
{
        int ret;
        struct uffdio_range uffdio_wake;
        struct userfaultfd_wake_range range;
        const void __user *buf = (void __user *)arg;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
                goto out;

        ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
        if (ret)
                goto out;

        range.start = uffdio_wake.start;
        range.len = uffdio_wake.len;

        /*
         * len == 0 means wake all and we don't want to wake all here,
         * so check it again to be sure.
         */
        VM_BUG_ON(!range.len);

        wake_userfault(ctx, &range);
        ret = 0;

out:
        return ret;
}

static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
                            unsigned long arg)
{
        __s64 ret;
        struct uffdio_copy uffdio_copy;
        struct uffdio_copy __user *user_uffdio_copy;
        struct userfaultfd_wake_range range;
        uffd_flags_t flags = 0;

        user_uffdio_copy = (struct uffdio_copy __user *) arg;

        ret = -EAGAIN;
        if (atomic_read(&ctx->mmap_changing))
                goto out;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_copy, user_uffdio_copy,
                           /* don't copy "copy" last field */
                           sizeof(uffdio_copy)-sizeof(__s64)))
                goto out;

        ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
                                       uffdio_copy.len);
        if (ret)
                goto out;
        ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
        if (ret)
                goto out;

        ret = -EINVAL;
        if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
                goto out;
        if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
                flags |= MFILL_ATOMIC_WP;
        if (mmget_not_zero(ctx->mm)) {
                ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
                                        uffdio_copy.len, flags);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
        }
        if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
                return -EFAULT;
        if (ret < 0)
                goto out;
        BUG_ON(!ret);
        /* len == 0 would wake all */
        range.len = ret;
        if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
                range.start = uffdio_copy.dst;
                wake_userfault(ctx, &range);
        }
        ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
out:
        return ret;
}

static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
                                unsigned long arg)
{
        __s64 ret;
        struct uffdio_zeropage uffdio_zeropage;
        struct uffdio_zeropage __user *user_uffdio_zeropage;
        struct userfaultfd_wake_range range;

        user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;

        ret = -EAGAIN;
        if (atomic_read(&ctx->mmap_changing))
                goto out;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
                           /* don't copy "zeropage" last field */
                           sizeof(uffdio_zeropage)-sizeof(__s64)))
                goto out;

        ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
                             uffdio_zeropage.range.len);
        if (ret)
                goto out;
        ret = -EINVAL;
        if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
                goto out;

        if (mmget_not_zero(ctx->mm)) {
                ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
                                           uffdio_zeropage.range.len);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
        }
        if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
                return -EFAULT;
        if (ret < 0)
                goto out;
        /* len == 0 would wake all */
        BUG_ON(!ret);
        range.len = ret;
        if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
                range.start = uffdio_zeropage.range.start;
                wake_userfault(ctx, &range);
        }
        ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
out:
        return ret;
}

static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
                                    unsigned long arg)
{
        int ret;
        struct uffdio_writeprotect uffdio_wp;
        struct uffdio_writeprotect __user *user_uffdio_wp;
        struct userfaultfd_wake_range range;
        bool mode_wp, mode_dontwake;

        if (atomic_read(&ctx->mmap_changing))
                return -EAGAIN;

        user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;

        if (copy_from_user(&uffdio_wp, user_uffdio_wp,
                           sizeof(struct uffdio_writeprotect)))
                return -EFAULT;

        ret = validate_range(ctx->mm, uffdio_wp.range.start,
                             uffdio_wp.range.len);
        if (ret)
                return ret;

        if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
                               UFFDIO_WRITEPROTECT_MODE_WP))
                return -EINVAL;

        mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
        mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;

        if (mode_wp && mode_dontwake)
                return -EINVAL;

        if (mmget_not_zero(ctx->mm)) {
                ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
                                          uffdio_wp.range.len, mode_wp);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
        }

        if (ret)
                return ret;

        if (!mode_wp && !mode_dontwake) {
                range.start = uffdio_wp.range.start;
                range.len = uffdio_wp.range.len;
                wake_userfault(ctx, &range);
        }
        return ret;
}

static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
{
        __s64 ret;
        struct uffdio_continue uffdio_continue;
        struct uffdio_continue __user *user_uffdio_continue;
        struct userfaultfd_wake_range range;
        uffd_flags_t flags = 0;

        user_uffdio_continue = (struct uffdio_continue __user *)arg;

        ret = -EAGAIN;
        if (atomic_read(&ctx->mmap_changing))
                goto out;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_continue, user_uffdio_continue,
                           /* don't copy the output fields */
                           sizeof(uffdio_continue) - (sizeof(__s64))))
                goto out;

        ret = validate_range(ctx->mm, uffdio_continue.range.start,
                             uffdio_continue.range.len);
        if (ret)
                goto out;

        ret = -EINVAL;
        if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
                                     UFFDIO_CONTINUE_MODE_WP))
                goto out;
        if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
                flags |= MFILL_ATOMIC_WP;

        if (mmget_not_zero(ctx->mm)) {
                ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
                                            uffdio_continue.range.len, flags);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
        }

        if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
                return -EFAULT;
        if (ret < 0)
                goto out;

        /* len == 0 would wake all */
        BUG_ON(!ret);
        range.len = ret;
        if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
                range.start = uffdio_continue.range.start;
                wake_userfault(ctx, &range);
        }
        ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;

out:
        return ret;
}

static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
{
        __s64 ret;
        struct uffdio_poison uffdio_poison;
        struct uffdio_poison __user *user_uffdio_poison;
        struct userfaultfd_wake_range range;

        user_uffdio_poison = (struct uffdio_poison __user *)arg;

        ret = -EAGAIN;
        if (atomic_read(&ctx->mmap_changing))
                goto out;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_poison, user_uffdio_poison,
                           /* don't copy the output fields */
                           sizeof(uffdio_poison) - (sizeof(__s64))))
                goto out;

        ret = validate_range(ctx->mm, uffdio_poison.range.start,
                             uffdio_poison.range.len);
        if (ret)
                goto out;

        ret = -EINVAL;
        if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
                goto out;

        if (mmget_not_zero(ctx->mm)) {
                ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
                                          uffdio_poison.range.len, 0);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
        }

        if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
                return -EFAULT;
        if (ret < 0)
                goto out;

        /* len == 0 would wake all */
        BUG_ON(!ret);
        range.len = ret;
        if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
                range.start = uffdio_poison.range.start;
                wake_userfault(ctx, &range);
        }
        ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;

out:
        return ret;
}

/*
 * Report what userfaultfd_wp_async_ctx() says about the userfaultfd
 * context attached to @vma.
 */
bool userfaultfd_wp_async(struct vm_area_struct *vma)
{
        struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;

        return userfaultfd_wp_async_ctx(ctx);
}

/*
 * Convert the feature mask requested by userland into the value stored in
 * the context.  For the current set of features the bits simply coincide;
 * UFFD_FEATURE_INITIALIZED is added to mark the features as enabled.
 */
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
        unsigned int enabled = (unsigned int)user_features;

        enabled |= UFFD_FEATURE_INITIALIZED;
        return enabled;
}

static int userfaultfd_move(struct userfaultfd_ctx *ctx,
                            unsigned long arg)
{
        __s64 ret;
        struct uffdio_move uffdio_move;
        struct uffdio_move __user *user_uffdio_move;
        struct userfaultfd_wake_range range;
        struct mm_struct *mm = ctx->mm;

        user_uffdio_move = (struct uffdio_move __user *) arg;

        if (atomic_read(&ctx->mmap_changing))
                return -EAGAIN;

        if (copy_from_user(&uffdio_move, user_uffdio_move,
                           /* don't copy "move" last field */
                           sizeof(uffdio_move)-sizeof(__s64)))
                return -EFAULT;

        /* Do not allow cross-mm moves. */
        if (mm != current->mm)
                return -EINVAL;

        ret = validate_range(mm, uffdio_move.dst, uffdio_move.len);
        if (ret)
                return ret;

        ret = validate_range(mm, uffdio_move.src, uffdio_move.len);
        if (ret)
                return ret;

        if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES|
                                  UFFDIO_MOVE_MODE_DONTWAKE))
                return -EINVAL;

        if (mmget_not_zero(mm)) {
                ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
                                 uffdio_move.len, uffdio_move.mode);
                mmput(mm);
        } else {
                return -ESRCH;
        }

        if (unlikely(put_user(ret, &user_uffdio_move->move)))
                return -EFAULT;
        if (ret < 0)
                goto out;

        /* len == 0 would wake all */
        VM_WARN_ON(!ret);
        range.len = ret;
        if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) {
                range.start = uffdio_move.dst;
                wake_userfault(ctx, &range);
        }
        ret = range.len == uffdio_move.len ? 0 : -EAGAIN;

out:
        return ret;
}

/*
 * userland asks for a certain API version and we return which bits
 * and ioctl commands are implemented in this kernel for such API
 * version or -EINVAL if unknown.
 *
 * @ctx: userfaultfd context being initialized.
 * @arg: user pointer to a struct uffdio_api (read and written back).
 *
 * On success the structure is filled with the features available in this
 * build and the supported ioctls, and the requested features are enabled
 * on @ctx. On a rejected request the structure is zeroed before being
 * copied back.
 *
 * Returns 0 on success or a negative errno:
 *   -EFAULT  copying the structure from or to userland failed
 *   -EINVAL  wrong API version, unknown feature bits, feature bits that
 *            this build does not provide, or @ctx already has features set
 *   -EPERM   UFFD_FEATURE_EVENT_FORK requested without CAP_SYS_PTRACE
 */
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
                           unsigned long arg)
{
        struct uffdio_api uffdio_api;
        void __user *buf = (void __user *)arg;
        unsigned int ctx_features;
        int ret;
        __u64 features;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
                goto out;
        features = uffdio_api.features;
        ret = -EINVAL;
        if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
                goto err_out;
        ret = -EPERM;
        if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
                goto err_out;

        /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
        if (features & UFFD_FEATURE_WP_ASYNC)
                features |= UFFD_FEATURE_WP_UNPOPULATED;

        /* report all available features and ioctls to userland */
        uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
        uffdio_api.features &=
                ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
        uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
#ifndef CONFIG_PTE_MARKER_UFFD_WP
        uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
        uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
        uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif

        /*
         * UFFD_API_FEATURES lists every feature known to the API, but the
         * config checks above may have removed some of them for this
         * build. A request for such a feature must fail rather than be
         * silently accepted and enabled on the context below.
         */
        ret = -EINVAL;
        if (features & ~uffdio_api.features)
                goto err_out;

        uffdio_api.ioctls = UFFD_API_IOCTLS;
        ret = -EFAULT;
        if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
                goto out;

        /* only enable the requested features for this uffd context */
        ctx_features = uffd_ctx_features(features);
        ret = -EINVAL;
        /* Only the first successful UFFDIO_API may set the features. */
        if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
                goto err_out;

        ret = 0;
out:
        return ret;
err_out:
        /* Hand a zeroed structure back to userland on a rejected request. */
        memset(&uffdio_api, 0, sizeof(uffdio_api));
        if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
                ret = -EFAULT;
        goto out;
}

/*
 * ioctl entry point of the userfaultfd file: dispatch @cmd to its handler.
 *
 * Only UFFDIO_API may be issued on a context that
 * userfaultfd_is_initialized() reports as not initialized.  Returns the
 * handler's result, or -EINVAL for an unknown command.
 */
static long userfaultfd_ioctl(struct file *file, unsigned cmd,
                              unsigned long arg)
{
        struct userfaultfd_ctx *ctx = file->private_data;

        if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
                return -EINVAL;

        switch (cmd) {
        case UFFDIO_API:
                return userfaultfd_api(ctx, arg);
        case UFFDIO_REGISTER:
                return userfaultfd_register(ctx, arg);
        case UFFDIO_UNREGISTER:
                return userfaultfd_unregister(ctx, arg);
        case UFFDIO_WAKE:
                return userfaultfd_wake(ctx, arg);
        case UFFDIO_COPY:
                return userfaultfd_copy(ctx, arg);
        case UFFDIO_ZEROPAGE:
                return userfaultfd_zeropage(ctx, arg);
        case UFFDIO_MOVE:
                return userfaultfd_move(ctx, arg);
        case UFFDIO_WRITEPROTECT:
                return userfaultfd_writeprotect(ctx, arg);
        case UFFDIO_CONTINUE:
                return userfaultfd_continue(ctx, arg);
        case UFFDIO_POISON:
                return userfaultfd_poison(ctx, arg);
        default:
                return -EINVAL;
        }
}

#ifdef CONFIG_PROC_FS
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct userfaultfd_ctx *ctx = f->private_data;
        wait_queue_entry_t *wq;
        unsigned long pending = 0, total = 0;

        spin_lock_irq(&ctx->fault_pending_wqh.lock);
        list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
                pending++;
                total++;
        }
        list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
                total++;
        }
        spin_unlock_irq(&ctx->fault_pending_wqh.lock);

        /*
         * If more protocols will be added, there will be all shown
         * separated by a space. Like this:
         *        protocols: aa:... bb:...
         */
        seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
                   pending, total, UFFD_API, ctx->features,
                   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif

/*
 * File operations installed on every userfaultfd file created by
 * new_userfaultfd(). The fdinfo hook only exists with CONFIG_PROC_FS.
 */
static const struct file_operations userfaultfd_fops = {
        .llseek                = noop_llseek,
        .read_iter        = userfaultfd_read_iter,
        .poll                = userfaultfd_poll,
        .unlocked_ioctl = userfaultfd_ioctl,
        .compat_ioctl        = compat_ptr_ioctl,
        .release        = userfaultfd_release,
#ifdef CONFIG_PROC_FS
        .show_fdinfo        = userfaultfd_show_fdinfo,
#endif
};

/*
 * Constructor handed to kmem_cache_create() for the userfaultfd_ctx cache
 * (see userfaultfd_init()).
 *
 * @mem: raw object memory, treated as a struct userfaultfd_ctx.
 *
 * Initialises the four wait queue heads of the context and sets up
 * refile_seq as a seqcount associated with the fault_pending_wqh lock.
 * The remaining fields are set by new_userfaultfd() on each allocation.
 */
static void init_once_userfaultfd_ctx(void *mem)
{
        struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;

        init_waitqueue_head(&ctx->fault_pending_wqh);
        init_waitqueue_head(&ctx->fault_wqh);
        init_waitqueue_head(&ctx->event_wqh);
        init_waitqueue_head(&ctx->fd_wqh);
        /* refile_seq is tied to the same lock that guards the pending queue */
        seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
}

/*
 * Create a new userfaultfd context bound to current->mm and install it
 * as a file descriptor in the calling task.
 *
 * @flags: UFFD_SHARED_FCNTL_FLAGS bits and/or UFFD_USER_MODE_ONLY; any
 *         other bit is rejected.
 *
 * Returns the new file descriptor on success. On failure returns
 * -EINVAL (unknown flag), -ENOMEM (context allocation failed), the
 * negative value from get_unused_fd_flags(), or the error encoded in the
 * pointer returned by anon_inode_create_getfile().
 */
static int new_userfaultfd(int flags)
{
        struct userfaultfd_ctx *ctx;
        struct file *file;
        int fd;

        /* The context is tied to the caller's mm, so there must be one. */
        BUG_ON(!current->mm);

        /* Check the UFFD_* constants for consistency.  */
        BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
        BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

        if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
                return -EINVAL;

        ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        /*
         * Per-allocation state; the wait queues and refile_seq are set up
         * by the cache constructor init_once_userfaultfd_ctx().
         */
        refcount_set(&ctx->refcount, 1);
        ctx->flags = flags;
        ctx->features = 0;
        ctx->released = false;
        init_rwsem(&ctx->map_changing_lock);
        atomic_set(&ctx->mmap_changing, 0);
        ctx->mm = current->mm;

        /* Only the fcntl-style bits are meaningful for the fd itself. */
        fd = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
        if (fd < 0)
                goto err_out;

        /* Create a new inode so that the LSM can block the creation.  */
        file = anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
                        O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
        if (IS_ERR(file)) {
                /* Give the reserved descriptor back before bailing out. */
                put_unused_fd(fd);
                fd = PTR_ERR(file);
                goto err_out;
        }
        /* prevent the mm struct from being freed */
        mmgrab(ctx->mm);
        file->f_mode |= FMODE_NOWAIT;
        /* Done last: this is the point where the fd is installed for the caller. */
        fd_install(fd, file);
        return fd;
err_out:
        /* fd holds the negative error code on this path. */
        kmem_cache_free(userfaultfd_ctx_cachep, ctx);
        return fd;
}

/*
 * Decide whether the caller may create a userfaultfd with @flags.
 *
 * Checked in this order, stopping at the first that grants access:
 *   1. UFFD_USER_MODE_ONLY requested - userspace-only page faults are
 *      always allowed;
 *   2. the caller has CAP_SYS_PTRACE - privileged users may always
 *      request kernel fault handling;
 *   3. otherwise the sysctl_unprivileged_userfaultfd knob decides.
 */
static inline bool userfaultfd_syscall_allowed(int flags)
{
        return (flags & UFFD_USER_MODE_ONLY) ||
               capable(CAP_SYS_PTRACE) ||
               sysctl_unprivileged_userfaultfd;
}

/*
 * userfaultfd(2) entry point: returns -EPERM when
 * userfaultfd_syscall_allowed() refuses @flags, otherwise whatever
 * new_userfaultfd() returns (a new fd or a negative error).
 */
SYSCALL_DEFINE1(userfaultfd, int, flags)
{
        if (userfaultfd_syscall_allowed(flags))
                return new_userfaultfd(flags);

        return -EPERM;
}

/*
 * ioctl handler of the userfaultfd misc device.
 *
 * Only USERFAULTFD_IOC_NEW is understood; its argument is passed to
 * new_userfaultfd() as the flags. Every other command yields -EINVAL.
 */
static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
{
        if (cmd == USERFAULTFD_IOC_NEW)
                return new_userfaultfd(flags);

        return -EINVAL;
}

/*
 * File operations of the userfaultfd misc device; native and compat
 * ioctls share the same handler.
 */
static const struct file_operations userfaultfd_dev_fops = {
        .owner = THIS_MODULE,
        .llseek = noop_llseek,
        .unlocked_ioctl = userfaultfd_dev_ioctl,
        .compat_ioctl = userfaultfd_dev_ioctl,
};

/* Misc device named "userfaultfd", registered by userfaultfd_init(). */
static struct miscdevice userfaultfd_misc = {
        .name = "userfaultfd",
        .minor = MISC_DYNAMIC_MINOR,
        .fops = &userfaultfd_dev_fops,
};

/*
 * Boot-time initialisation (run via __initcall).
 *
 * Registers the userfaultfd misc device, creates the slab cache used for
 * struct userfaultfd_ctx and, with CONFIG_SYSCTL, registers
 * vm_userfaultfd_table under "vm".
 *
 * Returns 0 on success or the error from misc_register().
 */
static int __init userfaultfd_init(void)
{
        int ret;

        ret = misc_register(&userfaultfd_misc);
        if (ret)
                return ret;

        /*
         * The result is not checked here.
         * NOTE(review): presumably SLAB_PANIC makes a failed cache creation
         * fatal, so no NULL check is needed - confirm.
         */
        userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
                                                sizeof(struct userfaultfd_ctx),
                                                0,
                                                SLAB_HWCACHE_ALIGN|SLAB_PANIC,
                                                init_once_userfaultfd_ctx);
#ifdef CONFIG_SYSCTL
        register_sysctl_init("vm", vm_userfaultfd_table);
#endif
        return 0;
}
__initcall(userfaultfd_init);






















































































































































































































































































































































































































































































































































    1 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Generic nexthop implementation
 *
 * Copyright (c) 2017-19 Cumulus Networks
 * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
 */

#ifndef __LINUX_NEXTHOP_H
#define __LINUX_NEXTHOP_H

#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/route.h>
#include <linux/types.h>
#include <net/ip_fib.h>
#include <net/ip6_fib.h>
#include <net/netlink.h>

#define NEXTHOP_VALID_USER_FLAGS RTNH_F_ONLINK

struct nexthop;

/*
 * Parameter bundle describing one nexthop.
 * NOTE(review): presumably filled from a userspace request before the
 * nexthop is created or replaced - this header does not show the users.
 */
struct nh_config {
        u32                nh_id;

        u8                nh_family;
        u8                nh_protocol;
        u8                nh_blackhole;
        u8                nh_fdb;
        u32                nh_flags;

        int                nh_ifindex;
        struct net_device *dev;

        /* gateway address; one of the two forms is stored */
        union {
                __be32                ipv4;
                struct in6_addr        ipv6;
        } gw;

        /* group attributes; the nh_grp_res_* fields belong to "res" groups */
        struct nlattr        *nh_grp;
        u16                nh_grp_type;
        u16                nh_grp_res_num_buckets;
        unsigned long        nh_grp_res_idle_timer;
        unsigned long        nh_grp_res_unbalanced_timer;
        /* presumably set when the matching value above was supplied - confirm */
        bool                nh_grp_res_has_num_buckets;
        bool                nh_grp_res_has_idle_timer;
        bool                nh_grp_res_has_unbalanced_timer;

        bool                nh_hw_stats;

        struct nlattr        *nh_encap;
        u16                nh_encap_type;

        u32                nlflags;
        struct nl_info        nlinfo;
};

/*
 * Per-nexthop data of a non-group nexthop (reached through
 * nexthop::nh_info).
 */
struct nh_info {
        struct hlist_node        dev_hash;    /* entry on netns devhash */
        struct nexthop                *nh_parent;

        u8                        family;
        bool                        reject_nh;
        bool                        fdb_nh;

        /*
         * The three views overlay each other. nexthop_fib_nhc() asserts at
         * build time that nh_common sits at offset 0 of both fib_nh and
         * fib6_nh, so fib_nhc is valid whichever variant is in use.
         */
        union {
                struct fib_nh_common        fib_nhc;
                struct fib_nh                fib_nh;
                struct fib6_nh                fib6_nh;
        };
};

/* One bucket of a struct nh_res_table (element type of nh_buckets[]). */
struct nh_res_bucket {
        /* RCU-annotated pointer to the group entry this bucket refers to */
        struct nh_grp_entry __rcu *nh_entry;
        atomic_long_t                used_time;
        unsigned long                migrated_time;
        bool                        occupied;
        u8                        nh_flags;
};

/*
 * Bucket table of a nexthop group (reached through nh_group::res_table).
 * The trailing nh_buckets[] array holds num_nh_buckets elements.
 */
struct nh_res_table {
        struct net                *net;
        u32                        nhg_id;
        struct delayed_work        upkeep_dw;

        /* List of NHGEs that have too few buckets ("uw" for underweight).
         * Reclaimed buckets will be given to entries in this list.
         */
        struct list_head        uw_nh_entries;
        unsigned long                unbalanced_since;

        u32                        idle_timer;
        u32                        unbalanced_timer;

        u16                        num_nh_buckets;
        struct nh_res_bucket        nh_buckets[] __counted_by(num_nh_buckets);
};

/*
 * Packet counter of a group entry together with its u64_stats sync
 * object; allocated per CPU (see nh_grp_entry::stats).
 */
struct nh_grp_entry_stats {
        u64_stats_t packets;
        struct u64_stats_sync syncp;
};

/* One member of a nexthop group (element type of nh_group::nh_entries[]). */
struct nh_grp_entry {
        struct nexthop        *nh;
        struct nh_grp_entry_stats __percpu        *stats;
        u8                weight;

        /*
         * Overlaid per-group-type state.
         * NOTE(review): presumably hthr is used by hash-threshold groups and
         * res by resilient groups (cf. nh_group::hash_threshold/resilient) -
         * confirm.
         */
        union {
                struct {
                        atomic_t        upper_bound;
                } hthr;
                struct {
                        /* Member on uw_nh_entries. */
                        struct list_head        uw_nh_entry;

                        u16                        count_buckets;
                        u16                        wants_buckets;
                } res;
        };

        struct list_head nh_list;
        struct nexthop        *nh_parent;  /* nexthop of group with this entry */
        u64                packets_hw;
};

/*
 * Group data of a nexthop whose is_group flag is set (reached through
 * nexthop::nh_grp). The trailing nh_entries[] array holds num_nh members.
 */
struct nh_group {
        struct nh_group                *spare; /* spare group for removals */
        u16                        num_nh;
        bool                        is_multipath;
        bool                        hash_threshold;
        bool                        resilient;
        bool                        fdb_nh;
        bool                        has_v4;
        bool                        hw_stats;

        struct nh_res_table __rcu *res_table;
        struct nh_grp_entry        nh_entries[] __counted_by(num_nh);
};

/*
 * A nexthop object: either a single nexthop (nh_info) or a group
 * (nh_grp), as told by is_group. Lifetime is governed by refcnt; the
 * final nexthop_put() frees it through RCU using the rcu member.
 */
struct nexthop {
        struct rb_node                rb_node;    /* entry on netns rbtree */
        struct list_head        fi_list;    /* v4 entries using nh */
        struct list_head        f6i_list;   /* v6 entries using nh */
        struct list_head        fdb_list;   /* fdb entries using this nh */
        struct list_head        grp_list;   /* nh group entries using this nh */
        struct net                *net;

        u32                        id;

        u8                        protocol;   /* app managing this nh */
        u8                        nh_flags;
        bool                        is_group;

        refcount_t                refcnt;
        struct rcu_head                rcu;

        /* the helpers in this header read nh_grp when is_group is set, nh_info otherwise */
        union {
                struct nh_info        __rcu *nh_info;
                struct nh_group __rcu *nh_grp;
        };
};

/*
 * Nexthop event identifiers.
 * NOTE(review): presumably the event value delivered to notifier blocks
 * registered with register_nexthop_notifier() - confirm.
 */
enum nexthop_event_type {
        NEXTHOP_EVENT_DEL,
        NEXTHOP_EVENT_REPLACE,
        NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
        NEXTHOP_EVENT_BUCKET_REPLACE,
        NEXTHOP_EVENT_HW_STATS_REPORT_DELTA,
};

/*
 * Payload kind stored in nh_notifier_info::type.
 * NOTE(review): each value presumably selects the like-named pointer in
 * the nh_notifier_info union - confirm.
 */
enum nh_notifier_info_type {
        NH_NOTIFIER_INFO_TYPE_SINGLE,
        NH_NOTIFIER_INFO_TYPE_GRP,
        NH_NOTIFIER_INFO_TYPE_RES_TABLE,
        NH_NOTIFIER_INFO_TYPE_RES_BUCKET,
        NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS,
};

/* Notifier description of a single (non-group) nexthop. */
struct nh_notifier_single_info {
        struct net_device *dev;
        u8 gw_family;
        /* gateway address; presumably interpreted according to gw_family - confirm */
        union {
                __be32 ipv4;
                struct in6_addr ipv6;
        };
        u32 id;
        /* three one-bit flags packed into a single u8 */
        u8 is_reject:1,
           is_fdb:1,
           has_encap:1;
};

/* One group member in a notification: its weight and its nexthop description. */
struct nh_notifier_grp_entry_info {
        u8 weight;
        struct nh_notifier_single_info nh;
};

/* Notifier description of a nexthop group; nh_entries[] holds num_nh members. */
struct nh_notifier_grp_info {
        u16 num_nh;
        bool is_fdb;
        bool hw_stats;
        struct nh_notifier_grp_entry_info nh_entries[] __counted_by(num_nh);
};

/*
 * Notifier description of one bucket identified by bucket_index, carrying
 * both an old and a new nexthop description.
 */
struct nh_notifier_res_bucket_info {
        u16 bucket_index;
        unsigned int idle_timer_ms;
        bool force;
        struct nh_notifier_single_info old_nh;
        struct nh_notifier_single_info new_nh;
};

/*
 * Notifier description of a whole bucket table; nhs[] holds
 * num_nh_buckets nexthop descriptions.
 */
struct nh_notifier_res_table_info {
        u16 num_nh_buckets;
        bool hw_stats;
        struct nh_notifier_single_info nhs[] __counted_by(num_nh_buckets);
};

/* Element of nh_notifier_grp_hw_stats_info::stats[]: an id with a packet count. */
struct nh_notifier_grp_hw_stats_entry_info {
        u32 id;
        u64 packets;
};

/*
 * Hardware statistics payload for a group; stats[] holds num_nh entries.
 * This is the object nh_grp_hw_stats_report_delta() takes as its first
 * argument.
 */
struct nh_notifier_grp_hw_stats_info {
        u16 num_nh;
        bool hw_stats_used;
        struct nh_notifier_grp_hw_stats_entry_info stats[] __counted_by(num_nh);
};

/*
 * Envelope for a nexthop notification: namespace, extack, nexthop id and
 * a typed payload pointer.
 */
struct nh_notifier_info {
        struct net *net;
        struct netlink_ext_ack *extack;
        u32 id;
        enum nh_notifier_info_type type;
        /* NOTE(review): presumably @type tells which member is valid - confirm */
        union {
                struct nh_notifier_single_info *nh;
                struct nh_notifier_grp_info *nh_grp;
                struct nh_notifier_res_table_info *nh_res_table;
                struct nh_notifier_res_bucket_info *nh_res_bucket;
                struct nh_notifier_grp_hw_stats_info *nh_grp_hw_stats;
        };
};

int register_nexthop_notifier(struct net *net, struct notifier_block *nb,
                              struct netlink_ext_ack *extack);
int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb);
int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb);
void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap);
void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
                                 bool offload, bool trap);
void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
                                     unsigned long *activity);
void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info,
                                  unsigned int nh_idx,
                                  u64 delta_packets);

/* caller is holding rcu or rtnl; no reference taken to nexthop */
struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
void nexthop_free_rcu(struct rcu_head *head);

/*
 * Try to take a reference on @nh.
 *
 * Returns true if the reference count was non-zero and has been
 * incremented, false if it had already reached zero.
 */
static inline bool nexthop_get(struct nexthop *nh)
{
        bool taken = refcount_inc_not_zero(&nh->refcnt);

        return taken;
}

/*
 * Drop a reference on @nh. When the last reference goes away the object
 * is handed to nexthop_free_rcu() through call_rcu_hurry().
 */
static inline void nexthop_put(struct nexthop *nh)
{
        if (!refcount_dec_and_test(&nh->refcnt))
                return;

        call_rcu_hurry(&nh->rcu, nexthop_free_rcu);
}

/* Two nexthops compare equal only when they are the very same object. */
static inline bool nexthop_cmp(const struct nexthop *nh1,
                               const struct nexthop *nh2)
{
        bool same_object = (nh1 == nh2);

        return same_object;
}

/*
 * Report the fdb_nh flag of @nh, taken from the group data for a group
 * nexthop and from the nh_info otherwise.
 */
static inline bool nexthop_is_fdb(const struct nexthop *nh)
{
        const struct nh_group *grp;
        const struct nh_info *info;

        if (!nh->is_group) {
                info = rcu_dereference_rtnl(nh->nh_info);
                return info->fdb_nh;
        }

        grp = rcu_dereference_rtnl(nh->nh_grp);
        return grp->fdb_nh;
}

/* Return the group's has_v4 flag; always false for a non-group nexthop. */
static inline bool nexthop_has_v4(const struct nexthop *nh)
{
        struct nh_group *grp;

        if (!nh->is_group)
                return false;

        grp = rcu_dereference_rtnl(nh->nh_grp);
        return grp->has_v4;
}

/* Return the group's is_multipath flag; always false for a non-group nexthop. */
static inline bool nexthop_is_multipath(const struct nexthop *nh)
{
        struct nh_group *grp;

        if (!nh->is_group)
                return false;

        grp = rcu_dereference_rtnl(nh->nh_grp);
        return grp->is_multipath;
}

struct nexthop *nexthop_select_path(struct nexthop *nh, int hash);

/*
 * Number of paths behind @nh: the member count for a multipath group,
 * 1 for everything else.
 */
static inline unsigned int nexthop_num_path(const struct nexthop *nh)
{
        struct nh_group *grp;

        if (!nh->is_group)
                return 1;

        grp = rcu_dereference_rtnl(nh->nh_grp);
        if (!grp->is_multipath)
                return 1;

        return grp->num_nh;
}

/*
 * Return member @nhsel of group @nhg, or NULL when @nhsel is not below
 * the member count.
 */
static inline
struct nexthop *nexthop_mpath_select(const struct nh_group *nhg, int nhsel)
{
        /* for_nexthops macros in fib_semantics.c grabs a pointer to
         * the nexthop before checking nhsel
         */
        if (nhsel < nhg->num_nh)
                return nhg->nh_entries[nhsel].nh;

        return NULL;
}

/*
 * Add every member of the group behind @nh to @skb via fib_add_nexthop(),
 * passing each member's weight and @rt_family.
 *
 * Returns 0 on success, -EMSGSIZE as soon as one member cannot be added.
 */
static inline
int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh,
                            u8 rt_family)
{
        struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
        int idx = 0;

        while (idx < nhg->num_nh) {
                struct nexthop *member = nhg->nh_entries[idx].nh;
                struct nh_info *nhi = rcu_dereference_rtnl(member->nh_info);
                int weight = nhg->nh_entries[idx].weight;

                if (fib_add_nexthop(skb, &nhi->fib_nhc, weight,
                                    rt_family, 0) < 0)
                        return -EMSGSIZE;

                idx++;
        }

        return 0;
}

/* called with rcu lock
 *
 * A group with more than one member is never reported as blackhole; a
 * group with at most one member is judged by its first entry, a plain
 * nexthop by its own reject_nh flag.
 */
static inline bool nexthop_is_blackhole(const struct nexthop *nh)
{
        const struct nexthop *leaf = nh;
        const struct nh_info *nhi;

        if (nh->is_group) {
                struct nh_group *grp = rcu_dereference_rtnl(nh->nh_grp);

                if (grp->num_nh > 1)
                        return false;

                leaf = grp->nh_entries[0].nh;
        }

        nhi = rcu_dereference_rtnl(leaf->nh_info);
        return nhi->reject_nh;
}

/*
 * Pick a path of res->fi->nh with nexthop_select_path() for @hash and
 * store its fib_nh_common in res->nhc.
 */
static inline void nexthop_path_fib_result(struct fib_result *res, int hash)
{
        struct nexthop *picked = nexthop_select_path(res->fi->nh, hash);
        struct nh_info *nhi = rcu_dereference(picked->nh_info);

        res->nhc = &nhi->fib_nhc;
}

/* called with rcu read lock or rtnl held
 *
 * Return the fib_nh_common of @nh. For a multipath group the member at
 * index @nhsel is used, and NULL is returned if that index is out of
 * range; a non-multipath group is read through @nh itself.
 */
static inline
struct fib_nh_common *nexthop_fib_nhc(struct nexthop *nh, int nhsel)
{
        struct nexthop *target = nh;
        struct nh_info *nhi;

        /* fib_nhc may alias either variant only if nh_common comes first */
        BUILD_BUG_ON(offsetof(struct fib_nh, nh_common) != 0);
        BUILD_BUG_ON(offsetof(struct fib6_nh, nh_common) != 0);

        if (nh->is_group) {
                struct nh_group *grp = rcu_dereference_rtnl(nh->nh_grp);

                if (grp->is_multipath) {
                        target = nexthop_mpath_select(grp, nhsel);
                        if (!target)
                                return NULL;
                }
        }

        nhi = rcu_dereference_rtnl(target->nh_info);
        return &nhi->fib_nhc;
}

/* called from fib_table_lookup with rcu_lock
 *
 * Find the first fib_nh_common of @nh accepted by fib_lookup_good_nhc()
 * for @fib_flags and @flp. On a match *@nhsel receives the member index
 * (0 for a non-group nexthop) and the nhc is returned; without a match
 * NULL is returned and *@nhsel is left untouched.
 */
static inline
struct fib_nh_common *nexthop_get_nhc_lookup(const struct nexthop *nh,
                                             int fib_flags,
                                             const struct flowi4 *flp,
                                             int *nhsel)
{
        struct nh_group *nhg;
        struct nh_info *nhi;
        int idx;

        if (!nh->is_group) {
                nhi = rcu_dereference(nh->nh_info);
                if (!fib_lookup_good_nhc(&nhi->fib_nhc, fib_flags, flp))
                        return NULL;

                *nhsel = 0;
                return &nhi->fib_nhc;
        }

        nhg = rcu_dereference(nh->nh_grp);
        for (idx = 0; idx < nhg->num_nh; idx++) {
                struct nexthop *member = nhg->nh_entries[idx].nh;

                nhi = rcu_dereference(member->nh_info);
                if (!fib_lookup_good_nhc(&nhi->fib_nhc, fib_flags, flp))
                        continue;

                *nhsel = idx;
                return &nhi->fib_nhc;
        }

        return NULL;
}

/*
 * Return true if nhc_l3mdev_matches_dev() accepts @dev for @nh itself
 * or, when @nh is a group, for at least one of its members.
 */
static inline bool nexthop_uses_dev(const struct nexthop *nh,
                                    const struct net_device *dev)
{
        struct nh_group *nhg;
        struct nh_info *nhi;
        int idx;

        if (!nh->is_group) {
                nhi = rcu_dereference(nh->nh_info);
                return nhc_l3mdev_matches_dev(&nhi->fib_nhc, dev) ? true : false;
        }

        nhg = rcu_dereference(nh->nh_grp);
        for (idx = 0; idx < nhg->num_nh; idx++) {
                struct nexthop *member = nhg->nh_entries[idx].nh;

                nhi = rcu_dereference(member->nh_info);
                if (nhc_l3mdev_matches_dev(&nhi->fib_nhc, dev))
                        return true;
        }

        return false;
}

/*
 * Number of paths of @fi: taken from the nexthop object when one is
 * attached, from fi->fib_nhs otherwise.
 */
static inline unsigned int fib_info_num_path(const struct fib_info *fi)
{
        if (likely(!fi->nh))
                return fi->fib_nhs;

        return nexthop_num_path(fi->nh);
}

int fib_check_nexthop(struct nexthop *nh, u8 scope,
                      struct netlink_ext_ack *extack);

/*
 * Return the fib_nh_common for path @nhsel of @fi: looked up through the
 * nexthop object when one is attached, otherwise taken from the fib_nh
 * array embedded in @fi.
 */
static inline struct fib_nh_common *fib_info_nhc(struct fib_info *fi, int nhsel)
{
        if (likely(!fi->nh))
                return &fi->fib_nh[nhsel].nh_common;

        return nexthop_fib_nhc(fi->nh, nhsel);
}

/* only used when fib_nh is built into fib_info; warns if @fi carries a
 * nexthop object instead, but still returns the embedded array slot
 */
static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel)
{
        struct fib_nh *slot;

        WARN_ON(fi->nh);

        slot = &fi->fib_nh[nhsel];
        return slot;
}

/*
 * IPv6 variants
 */
int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
                       struct netlink_ext_ack *extack);

/* Caller should either hold rcu_read_lock(), or RTNL.
 *
 * Return the fib6_nh of @nh, using member 0 when @nh is a group. NULL is
 * returned when the group has no member 0 or when the nexthop's family
 * is not AF_INET6.
 */
static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
{
        struct nexthop *leaf = nh;
        struct nh_info *nhi;

        if (nh->is_group) {
                struct nh_group *grp = rcu_dereference_rtnl(nh->nh_grp);

                leaf = nexthop_mpath_select(grp, 0);
                if (!leaf)
                        return NULL;
        }

        nhi = rcu_dereference_rtnl(leaf->nh_info);
        if (nhi->family != AF_INET6)
                return NULL;

        return &nhi->fib6_nh;
}

/*
 * Return the device of @f6i's nexthop: resolved through the nexthop
 * object when one is attached, through the embedded fib6_nh otherwise.
 */
static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i)
{
        struct fib6_nh *nh6;

        if (f6i->nh)
                nh6 = nexthop_fib6_nh(f6i->nh);
        else
                nh6 = f6i->fib6_nh;

        return nh6->fib_nh_dev;
}

/*
 * Pick a path of res->f6i->nh with nexthop_select_path() for @hash and
 * record it in @res. When the picked nexthop has reject_nh set, the
 * result is also marked RTN_BLACKHOLE / RTF_REJECT and res->nh is
 * obtained through nexthop_fib6_nh().
 */
static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash)
{
        struct nexthop *picked = nexthop_select_path(res->f6i->nh, hash);
        struct nh_info *nhi = rcu_dereference_rtnl(picked->nh_info);

        if (!nhi->reject_nh) {
                res->nh = &nhi->fib6_nh;
                return;
        }

        res->fib6_type = RTN_BLACKHOLE;
        res->fib6_flags |= RTF_REJECT;
        res->nh = nexthop_fib6_nh(picked);
}

int nexthop_for_each_fib6_nh(struct nexthop *nh,
                             int (*cb)(struct fib6_nh *nh, void *arg),
                             void *arg);

/* Return the family stored in @nh's nh_info. */
static inline int nexthop_get_family(struct nexthop *nh)
{
        struct nh_info *nhi;

        nhi = rcu_dereference_rtnl(nh->nh_info);
        return nhi->family;
}

/* Return the fib_nh_common embedded in @nh's nh_info. */
static inline
struct fib_nh_common *nexthop_fdb_nhc(struct nexthop *nh)
{
        struct nh_info *nhi;

        nhi = rcu_dereference_rtnl(nh->nh_info);
        return &nhi->fib_nhc;
}

/*
 * Pick a path of @nh with nexthop_select_path() for @hash and return its
 * fib_nh_common, or NULL when no path was selected.
 */
static inline struct fib_nh_common *nexthop_path_fdb_result(struct nexthop *nh,
                                                            int hash)
{
        struct nexthop *picked = nexthop_select_path(nh, hash);
        struct nh_info *nhi;

        if (likely(picked)) {
                nhi = rcu_dereference(picked->nh_info);
                return &nhi->fib_nhc;
        }

        return NULL;
}
#endif














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



    1 



    1 

















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
// SPDX-License-Identifier: GPL-2.0-only
/*
 * This is the linux wireless configuration interface.
 *
 * Copyright 2006-2010                Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright 2015-2017        Intel Deutschland GmbH
 * Copyright (C) 2018-2024 Intel Corporation
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/if.h>
#include <linux/module.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/nl80211.h>
#include <linux/debugfs.h>
#include <linux/notifier.h>
#include <linux/device.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/sched.h>
#include <net/genetlink.h>
#include <net/cfg80211.h>
#include "nl80211.h"
#include "core.h"
#include "sysfs.h"
#include "debugfs.h"
#include "wext-compat.h"
#include "rdev-ops.h"

/* name for sysfs, %d is appended */
#define PHY_NAME "phy"

MODULE_AUTHOR("Johannes Berg");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("wireless configuration support");
MODULE_ALIAS_GENL_FAMILY(NL80211_GENL_NAME);

/* RCU-protected (and RTNL for writers) */
LIST_HEAD(cfg80211_rdev_list);
int cfg80211_rdev_list_generation;

/* for debugfs */
static struct dentry *ieee80211_debugfs_dir;

/* for the cleanup, scan and event works */
struct workqueue_struct *cfg80211_wq;

static bool cfg80211_disable_40mhz_24ghz;
module_param(cfg80211_disable_40mhz_24ghz, bool, 0644);
MODULE_PARM_DESC(cfg80211_disable_40mhz_24ghz,
                 "Disable 40MHz support in the 2.4GHz band");

/*
 * Look up a registered device by its wiphy index.
 *
 * Must be called with the RTNL held. Returns the matching device, or
 * NULL if no registered device carries @wiphy_idx.
 */
struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx)
{
        struct cfg80211_registered_device *rdev;

        ASSERT_RTNL();

        for_each_rdev(rdev) {
                if (rdev->wiphy_idx == wiphy_idx)
                        return rdev;
        }

        return NULL;
}

int get_wiphy_idx(struct wiphy *wiphy)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);

        return rdev->wiphy_idx;
}

/*
 * Translate a wiphy index into the corresponding wiphy.
 *
 * Must be called with the RTNL held. Returns NULL when no registered
 * device uses @wiphy_idx.
 */
struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx)
{
        struct cfg80211_registered_device *found;

        ASSERT_RTNL();

        found = cfg80211_rdev_by_wiphy_idx(wiphy_idx);

        return found ? &found->wiphy : NULL;
}

static int cfg80211_dev_check_name(struct cfg80211_registered_device *rdev,
                                   const char *newname)
{
        struct cfg80211_registered_device *rdev2;
        int wiphy_idx, taken = -1, digits;

        ASSERT_RTNL();

        if (strlen(newname) > NL80211_WIPHY_NAME_MAXLEN)
                return -EINVAL;

        /* prohibit calling the thing phy%d when %d is not its number */
        sscanf(newname, PHY_NAME "%d%n", &wiphy_idx, &taken);
        if (taken == strlen(newname) && wiphy_idx != rdev->wiphy_idx) {
                /* count number of places needed to print wiphy_idx */
                digits = 1;
                while (wiphy_idx /= 10)
                        digits++;
                /*
                 * deny the name if it is phy<idx> where <idx> is printed
                 * without leading zeroes. taken == strlen(newname) here
                 */
                if (taken == strlen(PHY_NAME) + digits)
                        return -EINVAL;
        }

        /* Ensure another device does not already have this name. */
        for_each_rdev(rdev2)
                if (strcmp(newname, wiphy_name(&rdev2->wiphy)) == 0)
                        return -EINVAL;

        return 0;
}

/*
 * Rename a wiphy.
 *
 * Requires the RTNL and the wiphy lock. The new name is validated, the
 * struct device is renamed, the debugfs directory (when one exists) is
 * renamed to match, and a NEW_WIPHY notification is sent.
 *
 * Returns 0 on success (including when the name is unchanged) or a
 * negative error from the name check or device_rename().
 */
int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
                        char *newname)
{
        int err;

        ASSERT_RTNL();
        lockdep_assert_wiphy(&rdev->wiphy);

        /* Renaming to the current name is a successful no-op */
        if (!strcmp(newname, wiphy_name(&rdev->wiphy)))
                return 0;

        err = cfg80211_dev_check_name(rdev, newname);
        if (err < 0)
                return err;

        err = device_rename(&rdev->wiphy.dev, newname);
        if (err)
                return err;

        /* keep the debugfs directory name in sync, staying in its parent */
        if (!IS_ERR_OR_NULL(rdev->wiphy.debugfsdir)) {
                debugfs_rename(rdev->wiphy.debugfsdir->d_parent,
                               rdev->wiphy.debugfsdir,
                               rdev->wiphy.debugfsdir->d_parent,
                               newname);
        }

        nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY);

        return 0;
}

/*
 * Move a wiphy, together with all of its netdevs, to another network
 * namespace.
 *
 * @rdev: device to move; its wiphy must have WIPHY_FLAG_NETNS_OK set
 * @net: target network namespace
 *
 * Every netdev on the wiphy is moved first. If one of them cannot be
 * moved, the netdevs that were already moved are moved back to the
 * wiphy's current namespace and the error of the failed move is returned.
 * Once all netdevs have moved, userspace is told that the interfaces and
 * the wiphy went away, the wiphy's namespace is switched, and the wiphy
 * and interfaces are announced again.
 *
 * Returns 0 on success, -EOPNOTSUPP if the wiphy may not change
 * namespaces, or the error from dev_change_net_namespace().
 */
int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
                          struct net *net)
{
        struct wireless_dev *wdev;
        int err = 0;

        if (!(rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK))
                return -EOPNOTSUPP;

        list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
                if (!wdev->netdev)
                        continue;
                /*
                 * NETNS_LOCAL is lifted only for the duration of the move
                 * and put back whether or not the move worked, so a netdev
                 * that failed to move does not stay individually movable.
                 */
                wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
                err = dev_change_net_namespace(wdev->netdev, net, "wlan%d");
                wdev->netdev->features |= NETIF_F_NETNS_LOCAL;
                if (err)
                        break;
        }

        if (err) {
                int rollback_err;

                /* failed -- clean up to old netns */
                net = wiphy_net(&rdev->wiphy);

                list_for_each_entry_continue_reverse(wdev,
                                                     &rdev->wiphy.wdev_list,
                                                     list) {
                        if (!wdev->netdev)
                                continue;
                        wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
                        rollback_err = dev_change_net_namespace(wdev->netdev,
                                                                net,
                                                                "wlan%d");
                        WARN_ON(rollback_err);
                        wdev->netdev->features |= NETIF_F_NETNS_LOCAL;
                }

                /*
                 * Report why the switch failed. The rollback result is
                 * tracked separately so that a successful rollback cannot
                 * turn the failure into a 0 return value.
                 */
                return err;
        }

        wiphy_lock(&rdev->wiphy);
        list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
                if (!wdev->netdev)
                        continue;
                nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE);
        }

        nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY);

        wiphy_net_set(&rdev->wiphy, net);

        /* rename to the same name after the namespace has been switched */
        err = device_rename(&rdev->wiphy.dev, dev_name(&rdev->wiphy.dev));
        WARN_ON(err);

        nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY);

        list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
                if (!wdev->netdev)
                        continue;
                nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
        }
        wiphy_unlock(&rdev->wiphy);

        return 0;
}

/*
 * rfkill poll callback: forwards the poll request to the driver with the
 * wiphy lock held. @data is the registered device; @rfkill is unused.
 */
static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data)
{
        struct cfg80211_registered_device *rdev;

        rdev = data;

        wiphy_lock(&rdev->wiphy);
        rdev_rfkill_poll(rdev);
        wiphy_unlock(&rdev->wiphy);
}

/*
 * Stop a running P2P device.
 *
 * Caller must hold the wiphy mutex. Does nothing (apart from a warning)
 * for a wdev that is not a P2P device, and nothing at all if the wdev is
 * not running. Otherwise the driver is asked to stop it, the open count
 * is dropped, and a scan still owned by this wdev is finished.
 */
void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
                              struct wireless_dev *wdev)
{
        lockdep_assert_held(&rdev->wiphy.mtx);

        if (WARN_ON(wdev->iftype != NL80211_IFTYPE_P2P_DEVICE))
                return;

        if (!wdev_running(wdev))
                return;

        rdev_stop_p2p_device(rdev, wdev);
        wdev->is_running = false;
        rdev->opencount--;

        /* nothing more to do unless a scan belongs to this wdev */
        if (!rdev->scan_req || rdev->scan_req->wdev != wdev)
                return;

        /*
         * A scan for which neither the request nor the internal request
         * was marked notified is unexpected here; flag it as aborted.
         */
        if (WARN_ON(!rdev->scan_req->notified &&
                    (!rdev->int_scan_req ||
                     !rdev->int_scan_req->notified)))
                rdev->scan_req->info.aborted = true;

        ___cfg80211_scan_done(rdev, false);
}

/*
 * Stop a running NAN interface.
 *
 * Caller must hold the wiphy mutex. Warns and returns for a wdev that is
 * not of NAN type; returns silently if the wdev is not running.
 */
void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
                       struct wireless_dev *wdev)
{
        lockdep_assert_held(&rdev->wiphy.mtx);

        /* the type check (and its warning) comes before the running check */
        if (WARN_ON(wdev->iftype != NL80211_IFTYPE_NAN) ||
            !wdev_running(wdev))
                return;

        rdev_stop_nan(rdev, wdev);
        wdev->is_running = false;
        rdev->opencount--;
}

/*
 * Bring down every interface of @wiphy.
 *
 * Must be called with the RTNL held. Interfaces backed by a netdev are
 * closed with dev_close(); netdev-less P2P device and NAN interfaces are
 * stopped under the wiphy lock. Other netdev-less types are left alone.
 */
void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct wireless_dev *wdev;

        ASSERT_RTNL();

        list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
                if (wdev->netdev) {
                        dev_close(wdev->netdev);
                        continue;
                }

                /* no netdev: what to do depends on the interface type */
                wiphy_lock(wiphy);

                if (wdev->iftype == NL80211_IFTYPE_P2P_DEVICE)
                        cfg80211_stop_p2p_device(rdev, wdev);
                else if (wdev->iftype == NL80211_IFTYPE_NAN)
                        cfg80211_stop_nan(rdev, wdev);

                wiphy_unlock(wiphy);
        }
}
EXPORT_SYMBOL_GPL(cfg80211_shutdown_all_interfaces);

/*
 * rfkill set_block callback. @data is the registered device.
 *
 * When the radio becomes blocked, all interfaces are shut down under the
 * RTNL; unblocking needs no action. Always returns 0.
 */
static int cfg80211_rfkill_set_block(void *data, bool blocked)
{
        struct cfg80211_registered_device *rdev = data;

        if (blocked) {
                rtnl_lock();
                cfg80211_shutdown_all_interfaces(&rdev->wiphy);
                rtnl_unlock();
        }

        return 0;
}

/* Work handler for rdev->rfkill_block: applies the "blocked" state. */
static void cfg80211_rfkill_block_work(struct work_struct *work)
{
        struct cfg80211_registered_device *rdev =
                container_of(work, struct cfg80211_registered_device,
                             rfkill_block);

        cfg80211_rfkill_set_block(rdev, true);
}

/*
 * Work handler for rdev->event_work: processes the device's pending
 * events with the wiphy lock held.
 */
static void cfg80211_event_work(struct work_struct *work)
{
        struct cfg80211_registered_device *rdev =
                container_of(work, struct cfg80211_registered_device,
                             event_work);

        wiphy_lock(&rdev->wiphy);
        cfg80211_process_rdev_events(rdev);
        wiphy_unlock(&rdev->wiphy);
}

/*
 * Remove every interface of @rdev that is marked nl_owner_dead.
 *
 * Must be called with the RTNL held. Each such interface has its netdev
 * (if any) closed first, then is left and removed under the wiphy lock.
 */
void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev)
{
        struct wireless_dev *wdev, *next;

        ASSERT_RTNL();

        /* entries are removed while walking, hence the _safe iterator */
        list_for_each_entry_safe(wdev, next, &rdev->wiphy.wdev_list, list) {
                if (!wdev->nl_owner_dead)
                        continue;

                if (wdev->netdev)
                        dev_close(wdev->netdev);

                wiphy_lock(&rdev->wiphy);
                cfg80211_leave(rdev, wdev);
                cfg80211_remove_virtual_intf(rdev, wdev);
                wiphy_unlock(&rdev->wiphy);
        }
}

/*
 * Work handler for rdev->destroy_work: runs cfg80211_destroy_ifaces()
 * under the RTNL.
 */
static void cfg80211_destroy_iface_wk(struct work_struct *work)
{
        struct cfg80211_registered_device *rdev =
                container_of(work, struct cfg80211_registered_device,
                             destroy_work);

        rtnl_lock();
        cfg80211_destroy_ifaces(rdev);
        rtnl_unlock();
}

/*
 * wiphy work handler for rdev->sched_scan_stop_wk: stops every scheduled
 * scan request that is marked nl_owner_dead. @wiphy is unused; the device
 * is recovered from @work.
 */
static void cfg80211_sched_scan_stop_wk(struct wiphy *wiphy,
                                        struct wiphy_work *work)
{
        struct cfg80211_registered_device *rdev =
                container_of(work, struct cfg80211_registered_device,
                             sched_scan_stop_wk);
        struct cfg80211_sched_scan_request *req, *next;

        list_for_each_entry_safe(req, next, &rdev->sched_scan_req_list, list) {
                if (!req->nl_owner_dead)
                        continue;

                cfg80211_stop_sched_scan_req(rdev, req, false);
        }
}

/*
 * Work handler for rdev->propagate_radar_detect_wk: under the RTNL,
 * propagates the "unavailable / radar detected" DFS state for
 * rdev->radar_chandef.
 */
static void cfg80211_propagate_radar_detect_wk(struct work_struct *work)
{
        struct cfg80211_registered_device *rdev =
                container_of(work, struct cfg80211_registered_device,
                             propagate_radar_detect_wk);

        rtnl_lock();
        regulatory_propagate_dfs_state(&rdev->wiphy,
                                       &rdev->radar_chandef,
                                       NL80211_DFS_UNAVAILABLE,
                                       NL80211_RADAR_DETECTED);
        rtnl_unlock();
}

/*
 * Work handler for rdev->propagate_cac_done_wk: under the RTNL,
 * propagates the "available / CAC finished" DFS state for
 * rdev->cac_done_chandef.
 */
static void cfg80211_propagate_cac_done_wk(struct work_struct *work)
{
        struct cfg80211_registered_device *rdev =
                container_of(work, struct cfg80211_registered_device,
                             propagate_cac_done_wk);

        rtnl_lock();
        regulatory_propagate_dfs_state(&rdev->wiphy,
                                       &rdev->cac_done_chandef,
                                       NL80211_DFS_AVAILABLE,
                                       NL80211_RADAR_CAC_FINISHED);
        rtnl_unlock();
}

/*
 * Work handler for rdev->wiphy_work.
 *
 * Each invocation runs at most one item from rdev->wiphy_work_list, with
 * the wiphy lock held around the item's callback. If further items remain
 * on the list, the handler re-queues itself so they are handled by later
 * invocations. Nothing is run while the device is marked suspended.
 */
static void cfg80211_wiphy_work(struct work_struct *work)
{
        struct cfg80211_registered_device *rdev;
        struct wiphy_work *wk;

        rdev = container_of(work, struct cfg80211_registered_device, wiphy_work);

        trace_wiphy_work_worker_start(&rdev->wiphy);

        wiphy_lock(&rdev->wiphy);
        /* leave the queued items untouched while suspended */
        if (rdev->suspended)
                goto out;

        /* the list itself is protected by wiphy_work_lock */
        spin_lock_irq(&rdev->wiphy_work_lock);
        wk = list_first_entry_or_null(&rdev->wiphy_work_list,
                                      struct wiphy_work, entry);
        if (wk) {
                /* take the item off the list before running it */
                list_del_init(&wk->entry);
                /* more items pending: schedule another pass for them */
                if (!list_empty(&rdev->wiphy_work_list))
                        queue_work(system_unbound_wq, work);
                spin_unlock_irq(&rdev->wiphy_work_lock);

                /*
                 * The callback runs with the spinlock released but with
                 * the wiphy lock still held.
                 */
                trace_wiphy_work_run(&rdev->wiphy, wk);
                wk->func(&rdev->wiphy, wk);
        } else {
                spin_unlock_irq(&rdev->wiphy_work_lock);
        }
out:
        wiphy_unlock(&rdev->wiphy);
}

/* exported functions */

/*
 * wiphy_new_nm - allocate and initialise a new wiphy
 * @ops: driver callbacks, stored in the new device. Several start/stop
 *      style pairs are sanity-checked; an inconsistent set only triggers
 *      a warning.
 * @sizeof_priv: number of extra bytes allocated behind the internal
 *      registered-device structure
 * @requested_name: desired device name. NULL or "" selects the default
 *      "phy<idx>" name; a requested name that fails the name check or
 *      cannot be set also falls back to the default.
 *
 * Returns the wiphy embedded in the newly allocated device, or NULL if
 * the allocation fails, the index counter has wrapped, the default name
 * cannot be set, or the rfkill instance cannot be allocated.
 */
struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv,
                           const char *requested_name)
{
        static atomic_t wiphy_counter = ATOMIC_INIT(0);

        struct cfg80211_registered_device *rdev;
        int alloc_size;

        /* an op that starts or adds something needs its counterpart */
        WARN_ON(ops->add_key && (!ops->del_key || !ops->set_default_key));
        WARN_ON(ops->auth && (!ops->assoc || !ops->deauth || !ops->disassoc));
        WARN_ON(ops->connect && !ops->disconnect);
        WARN_ON(ops->join_ibss && !ops->leave_ibss);
        WARN_ON(ops->add_virtual_intf && !ops->del_virtual_intf);
        WARN_ON(ops->add_station && !ops->del_station);
        WARN_ON(ops->add_mpath && !ops->del_mpath);
        WARN_ON(ops->join_mesh && !ops->leave_mesh);
        WARN_ON(ops->start_p2p_device && !ops->stop_p2p_device);
        WARN_ON(ops->start_ap && !ops->stop_ap);
        WARN_ON(ops->join_ocb && !ops->leave_ocb);
        WARN_ON(ops->suspend && !ops->resume);
        WARN_ON(ops->sched_scan_start && !ops->sched_scan_stop);
        WARN_ON(ops->remain_on_channel && !ops->cancel_remain_on_channel);
        WARN_ON(ops->tdls_channel_switch && !ops->tdls_cancel_channel_switch);
        WARN_ON(ops->add_tx_ts && !ops->del_tx_ts);

        alloc_size = sizeof(*rdev) + sizeof_priv;

        rdev = kzalloc(alloc_size, GFP_KERNEL);
        if (!rdev)
                return NULL;

        rdev->ops = ops;

        rdev->wiphy_idx = atomic_inc_return(&wiphy_counter);

        if (unlikely(rdev->wiphy_idx < 0)) {
                /* ugh, wrapped! */
                atomic_dec(&wiphy_counter);
                kfree(rdev);
                return NULL;
        }

        /* atomic_inc_return makes it start at 1, make it start at 0 */
        rdev->wiphy_idx--;

        /* give it a proper name */
        if (requested_name && requested_name[0]) {
                int rv;

                rtnl_lock();
                rv = cfg80211_dev_check_name(rdev, requested_name);

                if (rv < 0) {
                        rtnl_unlock();
                        goto use_default_name;
                }

                rv = dev_set_name(&rdev->wiphy.dev, "%s", requested_name);
                rtnl_unlock();
                if (rv)
                        goto use_default_name;
        } else {
                int rv;

use_default_name:
                /* NOTE: This is *probably* safe without holding the RTNL
                 * because of the restrictions on phy names. The call could
                 * presumably still fail if some other part of the kernel
                 * (re)named a device phyX; adding locking and picking a
                 * different name when this one exists may be worthwhile.
                 */
                rv = dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx);
                if (rv < 0) {
                        kfree(rdev);
                        return NULL;
                }
        }

        mutex_init(&rdev->wiphy.mtx);
        INIT_LIST_HEAD(&rdev->wiphy.wdev_list);
        INIT_LIST_HEAD(&rdev->beacon_registrations);
        spin_lock_init(&rdev->beacon_registrations_lock);
        spin_lock_init(&rdev->bss_lock);
        INIT_LIST_HEAD(&rdev->bss_list);
        INIT_LIST_HEAD(&rdev->sched_scan_req_list);
        wiphy_work_init(&rdev->scan_done_wk, __cfg80211_scan_done);
        INIT_DELAYED_WORK(&rdev->dfs_update_channels_wk,
                          cfg80211_dfs_channels_update_work);
#ifdef CONFIG_CFG80211_WEXT
        rdev->wiphy.wext = &cfg80211_wext_handler;
#endif

        device_initialize(&rdev->wiphy.dev);
        rdev->wiphy.dev.class = &ieee80211_class;
        rdev->wiphy.dev.platform_data = rdev;
        device_enable_async_suspend(&rdev->wiphy.dev);

        INIT_WORK(&rdev->destroy_work, cfg80211_destroy_iface_wk);
        wiphy_work_init(&rdev->sched_scan_stop_wk, cfg80211_sched_scan_stop_wk);
        INIT_WORK(&rdev->sched_scan_res_wk, cfg80211_sched_scan_results_wk);
        INIT_WORK(&rdev->propagate_radar_detect_wk,
                  cfg80211_propagate_radar_detect_wk);
        INIT_WORK(&rdev->propagate_cac_done_wk, cfg80211_propagate_cac_done_wk);
        INIT_WORK(&rdev->mgmt_registrations_update_wk,
                  cfg80211_mgmt_registrations_update_wk);
        spin_lock_init(&rdev->mgmt_registrations_lock);

#ifdef CONFIG_CFG80211_DEFAULT_PS
        rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT;
#endif

        wiphy_net_set(&rdev->wiphy, &init_net);

        /*
         * Set up the wiphy work machinery before the first point at which
         * wiphy_free() can be called. Freeing goes through the device
         * release path, which is outside this function and is expected to
         * touch the work item, its list and its lock, so none of them may
         * still be uninitialised when rfkill_alloc() fails below.
         */
        INIT_WORK(&rdev->wiphy_work, cfg80211_wiphy_work);
        INIT_LIST_HEAD(&rdev->wiphy_work_list);
        spin_lock_init(&rdev->wiphy_work_lock);

        rdev->rfkill_ops.set_block = cfg80211_rfkill_set_block;
        rdev->wiphy.rfkill = rfkill_alloc(dev_name(&rdev->wiphy.dev),
                                          &rdev->wiphy.dev, RFKILL_TYPE_WLAN,
                                          &rdev->rfkill_ops, rdev);

        if (!rdev->wiphy.rfkill) {
                wiphy_free(&rdev->wiphy);
                return NULL;
        }

        INIT_WORK(&rdev->rfkill_block, cfg80211_rfkill_block_work);
        INIT_WORK(&rdev->conn_work, cfg80211_conn_work);
        INIT_WORK(&rdev->event_work, cfg80211_event_work);
        INIT_WORK(&rdev->background_cac_abort_wk,
                  cfg80211_background_cac_abort_wk);
        INIT_DELAYED_WORK(&rdev->background_cac_done_wk,
                          cfg80211_background_cac_done_wk);

        init_waitqueue_head(&rdev->dev_wait);

        /*
         * Initialize wiphy parameters to IEEE 802.11 MIB default values.
         * Fragmentation and RTS threshold are disabled by default with the
         * special -1 value.
         */
        rdev->wiphy.retry_short = 7;
        rdev->wiphy.retry_long = 4;
        rdev->wiphy.frag_threshold = (u32) -1;
        rdev->wiphy.rts_threshold = (u32) -1;
        rdev->wiphy.coverage_class = 0;

        rdev->wiphy.max_num_csa_counters = 1;

        rdev->wiphy.max_sched_scan_plans = 1;
        rdev->wiphy.max_sched_scan_plan_interval = U32_MAX;

        return &rdev->wiphy;
}
EXPORT_SYMBOL(wiphy_new_nm);

/*
 * Sanity-check the interface combinations advertised by @wiphy.
 *
 * Every violated rule triggers a warning and makes the function return
 * -EINVAL immediately; 0 is returned when all combinations pass (or when
 * there are none).
 */
static int wiphy_verify_combinations(struct wiphy *wiphy)
{
        int comb, lim;

        for (comb = 0; comb < wiphy->n_iface_combinations; comb++) {
                const struct ieee80211_iface_combination *c =
                        &wiphy->iface_combinations[comb];
                u16 seen_types = 0;
                u32 total = 0;

                /*
                 * A combination of fewer than two interfaces is only
                 * meaningful when it is there for DFS (radar detection).
                 */
                if (WARN_ON(c->max_interfaces < 2 && !c->radar_detect_widths))
                        return -EINVAL;

                /* at least one channel is required */
                if (WARN_ON(!c->num_different_channels))
                        return -EINVAL;

                /* radar detection is limited to a single channel */
                if (WARN_ON(c->radar_detect_widths &&
                            c->num_different_channels > 1))
                        return -EINVAL;

                if (WARN_ON(!c->n_limits))
                        return -EINVAL;

                for (lim = 0; lim < c->n_limits; lim++) {
                        u16 types = c->limits[lim].types;

                        /* an interface type may appear in one limit only */
                        if (WARN_ON(types & seen_types))
                                return -EINVAL;
                        seen_types |= types;

                        /* a limit of zero interfaces makes no sense */
                        if (WARN_ON(!c->limits[lim].max))
                                return -EINVAL;

                        /* software iftypes do not belong in combinations */
                        if (WARN_ON(wiphy->software_iftypes & types))
                                return -EINVAL;

                        /* at most one P2P_DEVICE */
                        if (WARN_ON(types & BIT(NL80211_IFTYPE_P2P_DEVICE) &&
                                    c->limits[lim].max > 1))
                                return -EINVAL;

                        /* at most one NAN */
                        if (WARN_ON(types & BIT(NL80211_IFTYPE_NAN) &&
                                    c->limits[lim].max > 1))
                                return -EINVAL;

                        /*
                         * Not well-defined at present: an IBSS interface
                         * may change its beacon interval by joining other
                         * networks, and nothing stops it from doing so.
                         * Strictly speaking AP and IBSS probably should not
                         * share a combination at all, but some drivers
                         * appear to support it, possibly only with fixed
                         * IBSS beacon intervals. Hence only the pairing of
                         * ADHOC with a beacon interval GCD is rejected.
                         */
                        if (WARN_ON(types & BIT(NL80211_IFTYPE_ADHOC) &&
                                    c->beacon_int_min_gcd))
                                return -EINVAL;

                        total += c->limits[lim].max;

                        /* every listed type must be a supported mode */
                        if (WARN_ON((wiphy->interface_modes & types) != types))
                                return -EINVAL;
                }

                if (WARN_ON(seen_types & BIT(NL80211_IFTYPE_WDS)))
                        return -EINVAL;

                /* max_interfaces must be reachable with the given limits */
                if (WARN_ON(total < c->max_interfaces))
                        return -EINVAL;
        }

        return 0;
}

int wiphy_register(struct wiphy *wiphy)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        int res;
        enum nl80211_band band;
        struct ieee80211_supported_band *sband;
        bool have_band = false;
        int i;
        u16 ifmodes = wiphy->interface_modes;

#ifdef CONFIG_PM
        if (WARN_ON(wiphy->wowlan &&
                    (wiphy->wowlan->flags & WIPHY_WOWLAN_GTK_REKEY_FAILURE) &&
                    !(wiphy->wowlan->flags & WIPHY_WOWLAN_SUPPORTS_GTK_REKEY)))
                return -EINVAL;
        if (WARN_ON(wiphy->wowlan &&
                    !wiphy->wowlan->flags && !wiphy->wowlan->n_patterns &&
                    !wiphy->wowlan->tcp))
                return -EINVAL;
#endif
        if (WARN_ON((wiphy->features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH) &&
                    (!rdev->ops->tdls_channel_switch ||
                     !rdev->ops->tdls_cancel_channel_switch)))
                return -EINVAL;

        if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) &&
                    (!rdev->ops->start_nan || !rdev->ops->stop_nan ||
                     !rdev->ops->add_nan_func || !rdev->ops->del_nan_func ||
                     !(wiphy->nan_supported_bands & BIT(NL80211_BAND_2GHZ)))))
                return -EINVAL;

        if (WARN_ON(wiphy->interface_modes & BIT(NL80211_IFTYPE_WDS)))
                return -EINVAL;

        if (WARN_ON(wiphy->pmsr_capa && !wiphy->pmsr_capa->ftm.supported))
                return -EINVAL;

        if (wiphy->pmsr_capa && wiphy->pmsr_capa->ftm.supported) {
                if (WARN_ON(!wiphy->pmsr_capa->ftm.asap &&
                            !wiphy->pmsr_capa->ftm.non_asap))
                        return -EINVAL;
                if (WARN_ON(!wiphy->pmsr_capa->ftm.preambles ||
                            !wiphy->pmsr_capa->ftm.bandwidths))
                        return -EINVAL;
                if (WARN_ON(wiphy->pmsr_capa->ftm.preambles &
                                ~(BIT(NL80211_PREAMBLE_LEGACY) |
                                  BIT(NL80211_PREAMBLE_HT) |
                                  BIT(NL80211_PREAMBLE_VHT) |
                                  BIT(NL80211_PREAMBLE_HE) |
                                  BIT(NL80211_PREAMBLE_DMG))))
                        return -EINVAL;
                if (WARN_ON((wiphy->pmsr_capa->ftm.trigger_based ||
                             wiphy->pmsr_capa->ftm.non_trigger_based) &&
                            !(wiphy->pmsr_capa->ftm.preambles &
                              BIT(NL80211_PREAMBLE_HE))))
                        return -EINVAL;
                if (WARN_ON(wiphy->pmsr_capa->ftm.bandwidths &
                                ~(BIT(NL80211_CHAN_WIDTH_20_NOHT) |
                                  BIT(NL80211_CHAN_WIDTH_20) |
                                  BIT(NL80211_CHAN_WIDTH_40) |
                                  BIT(NL80211_CHAN_WIDTH_80) |
                                  BIT(NL80211_CHAN_WIDTH_80P80) |
                                  BIT(NL80211_CHAN_WIDTH_160) |
                                  BIT(NL80211_CHAN_WIDTH_5) |
                                  BIT(NL80211_CHAN_WIDTH_10))))
                        return -EINVAL;
        }

        if (WARN_ON((wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) &&
                    (wiphy->regulatory_flags &
                                        (REGULATORY_CUSTOM_REG |
                                         REGULATORY_STRICT_REG |
                                         REGULATORY_COUNTRY_IE_FOLLOW_POWER |
                                         REGULATORY_COUNTRY_IE_IGNORE))))
                return -EINVAL;

        if (WARN_ON(wiphy->coalesce &&
                    (!wiphy->coalesce->n_rules ||
                     !wiphy->coalesce->n_patterns) &&
                    (!wiphy->coalesce->pattern_min_len ||
                     wiphy->coalesce->pattern_min_len >
                        wiphy->coalesce->pattern_max_len)))
                return -EINVAL;

        if (WARN_ON(wiphy->ap_sme_capa &&
                    !(wiphy->flags & WIPHY_FLAG_HAVE_AP_SME)))
                return -EINVAL;

        if (WARN_ON(wiphy->addresses && !wiphy->n_addresses))
                return -EINVAL;

        if (WARN_ON(wiphy->addresses &&
                    !is_zero_ether_addr(wiphy->perm_addr) &&
                    memcmp(wiphy->perm_addr, wiphy->addresses[0].addr,
                           ETH_ALEN)))
                return -EINVAL;

        if (WARN_ON(wiphy->max_acl_mac_addrs &&
                    (!(wiphy->flags & WIPHY_FLAG_HAVE_AP_SME) ||
                     !rdev->ops->set_mac_acl)))
                return -EINVAL;

        /* assure only valid behaviours are flagged by driver
         * hence subtract 2 as bit 0 is invalid.
         */
        if (WARN_ON(wiphy->bss_select_support &&
                    (wiphy->bss_select_support & ~(BIT(__NL80211_BSS_SELECT_ATTR_AFTER_LAST) - 2))))
                return -EINVAL;

        if (WARN_ON(wiphy_ext_feature_isset(&rdev->wiphy,
                                            NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X) &&
                    (!rdev->ops->set_pmk || !rdev->ops->del_pmk)))
                return -EINVAL;

        if (WARN_ON(!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_FW_ROAM) &&
                    rdev->ops->update_connect_params))
                return -EINVAL;

        if (wiphy->addresses)
                memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN);

        /* sanity check ifmodes */
        WARN_ON(!ifmodes);
        ifmodes &= ((1 << NUM_NL80211_IFTYPES) - 1) & ~1;
        if (WARN_ON(ifmodes != wiphy->interface_modes))
                wiphy->interface_modes = ifmodes;

        res = wiphy_verify_combinations(wiphy);
        if (res)
                return res;

        /* sanity check supported bands/channels */
        for (band = 0; band < NUM_NL80211_BANDS; band++) {
                const struct ieee80211_sband_iftype_data *iftd;
                u16 types = 0;
                bool have_he = false;

                sband = wiphy->bands[band];
                if (!sband)
                        continue;

                sband->band = band;
                if (WARN_ON(!sband->n_channels))
                        return -EINVAL;
                /*
                 * on 60GHz or sub-1Ghz band, there are no legacy rates, so
                 * n_bitrates is 0
                 */
                if (WARN_ON((band != NL80211_BAND_60GHZ &&
                             band != NL80211_BAND_S1GHZ) &&
                            !sband->n_bitrates))
                        return -EINVAL;

                if (WARN_ON(band == NL80211_BAND_6GHZ &&
                            (sband->ht_cap.ht_supported ||
                             sband->vht_cap.vht_supported)))
                        return -EINVAL;

                /*
                 * Since cfg80211_disable_40mhz_24ghz is global, we can
                 * modify the sband's ht data even if the driver uses a
                 * global structure for that.
                 */
                if (cfg80211_disable_40mhz_24ghz &&
                    band == NL80211_BAND_2GHZ &&
                    sband->ht_cap.ht_supported) {
                        sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40;
                        sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SGI_40;
                }

                /*
                 * Since we use a u32 for rate bitmaps in
                 * ieee80211_get_response_rate, we cannot
                 * have more than 32 legacy rates.
                 */
                if (WARN_ON(sband->n_bitrates > 32))
                        return -EINVAL;

                for (i = 0; i < sband->n_channels; i++) {
                        sband->channels[i].orig_flags =
                                sband->channels[i].flags;
                        sband->channels[i].orig_mag = INT_MAX;
                        sband->channels[i].orig_mpwr =
                                sband->channels[i].max_power;
                        sband->channels[i].band = band;

                        if (WARN_ON(sband->channels[i].freq_offset >= 1000))
                                return -EINVAL;
                }

                for_each_sband_iftype_data(sband, i, iftd) {
                        bool has_ap, has_non_ap;
                        u32 ap_bits = BIT(NL80211_IFTYPE_AP) |
                                      BIT(NL80211_IFTYPE_P2P_GO);

                        if (WARN_ON(!iftd->types_mask))
                                return -EINVAL;
                        if (WARN_ON(types & iftd->types_mask))
                                return -EINVAL;

                        /* at least one piece of information must be present */
                        if (WARN_ON(!iftd->he_cap.has_he))
                                return -EINVAL;

                        types |= iftd->types_mask;

                        if (i == 0)
                                have_he = iftd->he_cap.has_he;
                        else
                                have_he = have_he &&
                                          iftd->he_cap.has_he;

                        has_ap = iftd->types_mask & ap_bits;
                        has_non_ap = iftd->types_mask & ~ap_bits;

                        /*
                         * For EHT 20 MHz STA, the capabilities format differs
                         * but to simplify, don't check 20 MHz but rather check
                         * only if AP and non-AP were mentioned at the same time,
                         * reject if so.
                         */
                        if (WARN_ON(iftd->eht_cap.has_eht &&
                                    has_ap && has_non_ap))
                                return -EINVAL;
                }

                if (WARN_ON(!have_he && band == NL80211_BAND_6GHZ))
                        return -EINVAL;

                have_band = true;
        }

        if (!have_band) {
                WARN_ON(1);
                return -EINVAL;
        }

        for (i = 0; i < rdev->wiphy.n_vendor_commands; i++) {
                /*
                 * Validate we have a policy (can be explicitly set to
                 * VENDOR_CMD_RAW_DATA which is non-NULL) and also that
                 * we have at least one of doit/dumpit.
                 */
                if (WARN_ON(!rdev->wiphy.vendor_commands[i].policy))
                        return -EINVAL;
                if (WARN_ON(!rdev->wiphy.vendor_commands[i].doit &&
                            !rdev->wiphy.vendor_commands[i].dumpit))
                        return -EINVAL;
        }

#ifdef CONFIG_PM
        if (WARN_ON(rdev->wiphy.wowlan && rdev->wiphy.wowlan->n_patterns &&
                    (!rdev->wiphy.wowlan->pattern_min_len ||
                     rdev->wiphy.wowlan->pattern_min_len >
                                rdev->wiphy.wowlan->pattern_max_len)))
                return -EINVAL;
#endif

        if (!wiphy->max_num_akm_suites)
                wiphy->max_num_akm_suites = NL80211_MAX_NR_AKM_SUITES;
        else if (wiphy->max_num_akm_suites < NL80211_MAX_NR_AKM_SUITES ||
                 wiphy->max_num_akm_suites > CFG80211_MAX_NUM_AKM_SUITES)
                return -EINVAL;

        /* check and set up bitrates */
        ieee80211_set_bitrate_flags(wiphy);

        rdev->wiphy.features |= NL80211_FEATURE_SCAN_FLUSH;

        rtnl_lock();
        wiphy_lock(&rdev->wiphy);
        res = device_add(&rdev->wiphy.dev);
        if (res) {
                wiphy_unlock(&rdev->wiphy);
                rtnl_unlock();
                return res;
        }

        list_add_rcu(&rdev->list, &cfg80211_rdev_list);
        cfg80211_rdev_list_generation++;

        /* add to debugfs */
        rdev->wiphy.debugfsdir = debugfs_create_dir(wiphy_name(&rdev->wiphy),
                                                    ieee80211_debugfs_dir);

        cfg80211_debugfs_rdev_add(rdev);
        nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY);
        wiphy_unlock(&rdev->wiphy);

        /* set up regulatory info */
        wiphy_regulatory_register(wiphy);

        if (wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) {
                struct regulatory_request request;

                request.wiphy_idx = get_wiphy_idx(wiphy);
                request.initiator = NL80211_REGDOM_SET_BY_DRIVER;
                request.alpha2[0] = '9';
                request.alpha2[1] = '9';

                nl80211_send_reg_change_event(&request);
        }

        /* Check that nobody globally advertises any capabilities they do not
         * advertise on all possible interface types.
         */
        if (wiphy->extended_capabilities_len &&
            wiphy->num_iftype_ext_capab &&
            wiphy->iftype_ext_capab) {
                u8 supported_on_all, j;
                const struct wiphy_iftype_ext_capab *capab;

                capab = wiphy->iftype_ext_capab;
                for (j = 0; j < wiphy->extended_capabilities_len; j++) {
                        if (capab[0].extended_capabilities_len > j)
                                supported_on_all =
                                        capab[0].extended_capabilities[j];
                        else
                                supported_on_all = 0x00;
                        for (i = 1; i < wiphy->num_iftype_ext_capab; i++) {
                                if (j >= capab[i].extended_capabilities_len) {
                                        supported_on_all = 0x00;
                                        break;
                                }
                                supported_on_all &=
                                        capab[i].extended_capabilities[j];
                        }
                        if (WARN_ON(wiphy->extended_capabilities[j] &
                                    ~supported_on_all))
                                break;
                }
        }

        rdev->wiphy.registered = true;
        rtnl_unlock();

        res = rfkill_register(rdev->wiphy.rfkill);
        if (res) {
                rfkill_destroy(rdev->wiphy.rfkill);
                rdev->wiphy.rfkill = NULL;
                wiphy_unregister(&rdev->wiphy);
                return res;
        }

        return 0;
}
EXPORT_SYMBOL(wiphy_register);

/*
 * Start rfkill polling for @wiphy.
 *
 * Nothing happens when the driver ops have no rfkill_poll callback;
 * otherwise cfg80211_rfkill_poll is installed as the rfkill poll handler
 * and polling is resumed on the wiphy's rfkill instance.
 */
void wiphy_rfkill_start_polling(struct wiphy *wiphy)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);

        if (rdev->ops->rfkill_poll) {
                rdev->rfkill_ops.poll = cfg80211_rfkill_poll;
                rfkill_resume_polling(wiphy->rfkill);
        }
}
EXPORT_SYMBOL(wiphy_rfkill_start_polling);

/*
 * Run the work items queued on @rdev's wiphy_work_list.
 *
 * The caller must hold the wiphy mutex (checked via lockdep). Items are
 * taken from the head of the list one at a time; wiphy_work_lock is
 * dropped while an item's function runs and re-taken afterwards.
 *
 * @end: stop right after this item has been run. With NULL the loop keeps
 *       going until the list is empty or the runaway limit triggers.
 */
void cfg80211_process_wiphy_works(struct cfg80211_registered_device *rdev,
                                  struct wiphy_work *end)
{
        unsigned int runaway_limit = 100;
        unsigned long flags;

        lockdep_assert_held(&rdev->wiphy.mtx);

        spin_lock_irqsave(&rdev->wiphy_work_lock, flags);
        while (!list_empty(&rdev->wiphy_work_list)) {
                struct wiphy_work *wk;

                wk = list_first_entry(&rdev->wiphy_work_list,
                                      struct wiphy_work, entry);
                /* unlink before running so the item can be queued again */
                list_del_init(&wk->entry);
                spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags);

                trace_wiphy_work_run(&rdev->wiphy, wk);
                wk->func(&rdev->wiphy, wk);

                spin_lock_irqsave(&rdev->wiphy_work_lock, flags);

                if (wk == end)
                        break;

                /*
                 * After 100 items without reaching @end, warn and reset
                 * the list head; the list then reads as empty, which
                 * terminates the loop.
                 */
                if (WARN_ON(--runaway_limit == 0))
                        INIT_LIST_HEAD(&rdev->wiphy_work_list);
        }
        spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags);
}

/*
 * Tear down the registration state of @wiphy.
 *
 * Blocks until rdev->opencount has dropped to zero, unregisters rfkill
 * (if present), and then, under the RTNL and the wiphy lock, announces
 * the removal over nl80211 and removes the device from debugfs, the rdev
 * list, regulatory and the device core. Afterwards the rdev's remaining
 * work items are cancelled or flushed and the WoWLAN and coalesce
 * configuration is freed.
 */
void wiphy_unregister(struct wiphy *wiphy)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);

        /*
         * opencount is sampled under the wiphy lock every time the wait
         * condition is evaluated.
         */
        wait_event(rdev->dev_wait, ({
                int __count;
                wiphy_lock(&rdev->wiphy);
                __count = rdev->opencount;
                wiphy_unlock(&rdev->wiphy);
                __count == 0; }));

        if (rdev->wiphy.rfkill)
                rfkill_unregister(rdev->wiphy.rfkill);

        rtnl_lock();
        wiphy_lock(&rdev->wiphy);
        nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY);
        rdev->wiphy.registered = false;

        /* all wdevs are expected to be gone by now */
        WARN_ON(!list_empty(&rdev->wiphy.wdev_list));

        /*
         * First remove the hardware from everywhere, this makes
         * it impossible to find from userspace.
         */
        debugfs_remove_recursive(rdev->wiphy.debugfsdir);
        list_del_rcu(&rdev->list);
        synchronize_rcu();

        /*
         * If this device got a regulatory hint tell core its
         * free to listen now to a new shiny device regulatory hint
         */
        wiphy_regulatory_deregister(wiphy);

        cfg80211_rdev_list_generation++;
        device_del(&rdev->wiphy.dev);

#ifdef CONFIG_PM
        if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup)
                rdev_set_wakeup(rdev, false);
#endif

        /* surely nothing is reachable now, clean up work */
        cfg80211_process_wiphy_works(rdev, NULL);
        wiphy_unlock(&rdev->wiphy);
        rtnl_unlock();

        /* this has nothing to do now but make sure it's gone */
        cancel_work_sync(&rdev->wiphy_work);

        /* cancel or wait for the remaining rdev work items */
        cancel_work_sync(&rdev->conn_work);
        flush_work(&rdev->event_work);
        cancel_delayed_work_sync(&rdev->dfs_update_channels_wk);
        cancel_delayed_work_sync(&rdev->background_cac_done_wk);
        flush_work(&rdev->destroy_work);
        flush_work(&rdev->propagate_radar_detect_wk);
        flush_work(&rdev->propagate_cac_done_wk);
        flush_work(&rdev->mgmt_registrations_update_wk);
        flush_work(&rdev->background_cac_abort_wk);

        cfg80211_rdev_free_wowlan(rdev);
        cfg80211_free_coalesce(rdev->coalesce);
        rdev->coalesce = NULL;
}
EXPORT_SYMBOL(wiphy_unregister);

/*
 * Release everything still owned by @rdev, then free @rdev itself.
 */
void cfg80211_dev_free(struct cfg80211_registered_device *rdev)
{
        struct cfg80211_beacon_registration *breg, *breg_next;
        struct cfg80211_internal_bss *bss, *bss_next;

        rfkill_destroy(rdev->wiphy.rfkill);

        /* unlink and free all remaining beacon registrations */
        list_for_each_entry_safe(breg, breg_next,
                                 &rdev->beacon_registrations, list) {
                list_del(&breg->list);
                kfree(breg);
        }

        /* put every BSS entry that is still on the list */
        list_for_each_entry_safe(bss, bss_next, &rdev->bss_list, list) {
                cfg80211_put_bss(&rdev->wiphy, &bss->pub);
        }

        mutex_destroy(&rdev->wiphy.mtx);

        /*
         * A non-NULL 'regd' at this point means the wiphy never finished
         * initialization and so never went through the unregister path
         * (e.g. a failure scenario). It therefore cannot have been
         * visible to anyone, and freeing it directly is fine.
         */
        kfree(rcu_dereference_raw(rdev->wiphy.regd));

        kfree(rdev);
}

/*
 * Drop a reference on the struct device embedded in @wiphy.
 */
void wiphy_free(struct wiphy *wiphy)
{
        struct device *wiphy_dev = &wiphy->dev;

        put_device(wiphy_dev);
}
EXPORT_SYMBOL(wiphy_free);

/*
 * Forward a hardware rfkill state (and its reason) to the rfkill core and
 * schedule the rdev's rfkill_block work when that call returns true.
 */
void wiphy_rfkill_set_hw_state_reason(struct wiphy *wiphy, bool blocked,
                                      enum rfkill_hard_block_reasons reason)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);

        if (!rfkill_set_hw_state_reason(wiphy->rfkill, blocked, reason))
                return;

        schedule_work(&rdev->rfkill_block);
}
EXPORT_SYMBOL(wiphy_rfkill_set_hw_state_reason);

/*
 * Remove @wdev from its wiphy. Shared by cfg80211_unregister_wdev() and
 * the NETDEV_UNREGISTER notifier path.
 *
 * @unregister_netdev: when true and @wdev has a netdev, also call
 *                     unregister_netdevice() on it.
 *
 * Requires both the RTNL and the wiphy mutex to be held.
 */
static void _cfg80211_unregister_wdev(struct wireless_dev *wdev,
                                      bool unregister_netdev)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
        struct cfg80211_cqm_config *cqm_config;
        unsigned int link_id;

        ASSERT_RTNL();
        lockdep_assert_held(&rdev->wiphy.mtx);

        nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE);

        wdev->registered = false;

        if (wdev->netdev) {
                sysfs_remove_link(&wdev->netdev->dev.kobj, "phy80211");
                if (unregister_netdev)
                        unregister_netdevice(wdev->netdev);
        }

        list_del_rcu(&wdev->list);
        synchronize_net();
        rdev->devlist_generation++;

        cfg80211_mlme_purge_registrations(wdev);

        /* interface types that need an explicit stop call */
        switch (wdev->iftype) {
        case NL80211_IFTYPE_P2P_DEVICE:
                cfg80211_stop_p2p_device(rdev, wdev);
                break;
        case NL80211_IFTYPE_NAN:
                cfg80211_stop_nan(rdev, wdev);
                break;
        default:
                break;
        }

#ifdef CONFIG_CFG80211_WEXT
        kfree_sensitive(wdev->wext.keys);
        wdev->wext.keys = NULL;
#endif
        wiphy_work_cancel(wdev->wiphy, &wdev->cqm_rssi_work);
        /* deleted from the list, so can't be found from nl80211 any more */
        cqm_config = rcu_access_pointer(wdev->cqm_config);
        kfree_rcu(cqm_config, rcu_head);

        /*
         * Ensure that all events have been processed and
         * freed.
         */
        cfg80211_process_wdev_events(wdev);

        if (wdev->iftype == NL80211_IFTYPE_STATION ||
            wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) {
                for (link_id = 0; link_id < ARRAY_SIZE(wdev->links); link_id++) {
                        struct cfg80211_internal_bss *curbss;

                        curbss = wdev->links[link_id].client.current_bss;

                        /*
                         * A link that still has a current BSS here is
                         * unexpected: warn, then unhold/put it and clear
                         * the pointer.
                         */
                        if (WARN_ON(curbss)) {
                                cfg80211_unhold_bss(curbss);
                                cfg80211_put_bss(wdev->wiphy, &curbss->pub);
                                wdev->links[link_id].client.current_bss = NULL;
                        }
                }
        }

        wdev->connected = false;
}

void cfg80211_unregister_wdev(struct wireless_dev *wdev)
{
        _cfg80211_unregister_wdev(wdev, true);
}
EXPORT_SYMBOL(cfg80211_unregister_wdev);

/*
 * Device type named "wlan"; set on wireless netdevs via
 * SET_NETDEV_DEVTYPE() in the NETDEV_POST_INIT notifier case.
 */
static const struct device_type wiphy_type = {
        .name        = "wlan",
};

/*
 * Adjust the running-interface counters of @rdev by @num (which may be
 * negative). Monitor interfaces are additionally tracked in their own
 * counter. The wiphy mutex must be held.
 */
void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev,
                               enum nl80211_iftype iftype, int num)
{
        lockdep_assert_held(&rdev->wiphy.mtx);

        if (iftype == NL80211_IFTYPE_MONITOR)
                rdev->num_running_monitor_ifaces += num;

        rdev->num_running_ifaces += num;
}

/*
 * Make @wdev leave whatever it is currently taking part in, according to
 * its interface type: IBSS, a client connection (after stopping its
 * scheduled scans), mesh, AP/GO operation or OCB. Peer measurement state
 * and background radar detection are shut down first for every type.
 *
 * The wiphy mutex must be held.
 */
void cfg80211_leave(struct cfg80211_registered_device *rdev,
                    struct wireless_dev *wdev)
{
        struct cfg80211_sched_scan_request *req, *next;
        struct net_device *ndev = wdev->netdev;

        lockdep_assert_held(&rdev->wiphy.mtx);

        cfg80211_pmsr_wdev_down(wdev);

        cfg80211_stop_background_radar_detection(wdev);

        switch (wdev->iftype) {
        case NL80211_IFTYPE_STATION:
        case NL80211_IFTYPE_P2P_CLIENT:
                /* stop the scheduled scans that belong to this netdev */
                list_for_each_entry_safe(req, next, &rdev->sched_scan_req_list,
                                         list) {
                        if (req->dev != ndev)
                                continue;
                        cfg80211_stop_sched_scan_req(rdev, req, false);
                }

#ifdef CONFIG_CFG80211_WEXT
                kfree(wdev->wext.ie);
                wdev->wext.ie = NULL;
                wdev->wext.ie_len = 0;
                wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC;
#endif
                cfg80211_disconnect(rdev, ndev,
                                    WLAN_REASON_DEAUTH_LEAVING, true);
                break;
        case NL80211_IFTYPE_ADHOC:
                cfg80211_leave_ibss(rdev, ndev, true);
                break;
        case NL80211_IFTYPE_MESH_POINT:
                cfg80211_leave_mesh(rdev, ndev);
                break;
        case NL80211_IFTYPE_P2P_GO:
        case NL80211_IFTYPE_AP:
                cfg80211_stop_ap(rdev, ndev, -1, true);
                break;
        case NL80211_IFTYPE_OCB:
                cfg80211_leave_ocb(rdev, ndev);
                break;
        /* cannot happen, these have no netdev */
        case NL80211_IFTYPE_P2P_DEVICE:
        case NL80211_IFTYPE_NAN:
        /* nothing to do for these */
        case NL80211_IFTYPE_AP_VLAN:
        case NL80211_IFTYPE_MONITOR:
        /* invalid interface types */
        case NL80211_IFTYPE_UNSPECIFIED:
        case NL80211_IFTYPE_WDS:
        case NUM_NL80211_IFTYPES:
                break;
        }
}

/*
 * Queue an EVENT_STOPPED event for @wdev and kick the rdev event worker.
 *
 * The event is allocated with @gfp; if that allocation fails the request
 * is silently dropped.
 */
void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev,
                         gfp_t gfp)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct cfg80211_event *stop_ev;
        unsigned long irqflags;

        trace_cfg80211_stop_iface(wiphy, wdev);

        stop_ev = kzalloc(sizeof(*stop_ev), gfp);
        if (!stop_ev)
                return;

        stop_ev->type = EVENT_STOPPED;

        /* append to the wdev's event list under its spinlock */
        spin_lock_irqsave(&wdev->event_lock, irqflags);
        list_add_tail(&stop_ev->list, &wdev->event_list);
        spin_unlock_irqrestore(&wdev->event_lock, irqflags);

        queue_work(cfg80211_wq, &rdev->event_work);
}
EXPORT_SYMBOL(cfg80211_stop_iface);

/*
 * Initialize the cfg80211-owned parts of @wdev: its lists, locks and work
 * items, the wext defaults (when built in) and the power-save defaults.
 * For station, P2P client and ad-hoc interfaces not using 4-address mode
 * the netdev is additionally flagged IFF_DONT_BRIDGE.
 */
void cfg80211_init_wdev(struct wireless_dev *wdev)
{
        INIT_LIST_HEAD(&wdev->event_list);
        spin_lock_init(&wdev->event_lock);
        INIT_LIST_HEAD(&wdev->mgmt_registrations);
        INIT_LIST_HEAD(&wdev->pmsr_list);
        spin_lock_init(&wdev->pmsr_lock);
        INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk);

#ifdef CONFIG_CFG80211_WEXT
        wdev->wext.default_key = -1;
        wdev->wext.default_mgmt_key = -1;
        wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC;
#endif

        wiphy_work_init(&wdev->cqm_rssi_work, cfg80211_cqm_rssi_notify_work);

        /* power save starts out on only if the wiphy asks for it */
        wdev->ps = !!(wdev->wiphy->flags & WIPHY_FLAG_PS_ON_BY_DEFAULT);
        /* allow mac80211 to determine the timeout */
        wdev->ps_timeout = -1;

        if (!wdev->use_4addr) {
                switch (wdev->iftype) {
                case NL80211_IFTYPE_STATION:
                case NL80211_IFTYPE_P2P_CLIENT:
                case NL80211_IFTYPE_ADHOC:
                        wdev->netdev->priv_flags |= IFF_DONT_BRIDGE;
                        break;
                default:
                        break;
                }
        }

        INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk);
}

/*
 * Add @wdev to @rdev's interface list, mark it registered, link its netdev
 * (if any) to the wiphy in sysfs and announce the new interface over
 * nl80211.
 *
 * Requires both the RTNL and the wiphy mutex to be held.
 */
void cfg80211_register_wdev(struct cfg80211_registered_device *rdev,
                            struct wireless_dev *wdev)
{
        ASSERT_RTNL();
        lockdep_assert_held(&rdev->wiphy.mtx);

        /*
         * This also runs when an interface moves to another network
         * namespace and gets registered there; its ID must stay the same
         * in that case. Since 0 is never a valid ID and the memory starts
         * out zeroed, a non-zero identifier means one was already
         * assigned.
         */
        if (!wdev->identifier)
                wdev->identifier = ++rdev->wdev_id;
        list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list);
        rdev->devlist_generation++;
        wdev->registered = true;

        if (wdev->netdev) {
                int link_err;

                link_err = sysfs_create_link(&wdev->netdev->dev.kobj,
                                             &rdev->wiphy.dev.kobj,
                                             "phy80211");
                /* a missing symlink is only logged, not treated as fatal */
                if (link_err)
                        pr_err("failed to add phy80211 symlink to netdev!\n");
        }

        nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
}

/*
 * Register @dev with the network stack and, on success, register its
 * wireless_dev with cfg80211.
 *
 * Requires the RTNL and the wiphy mutex to be held. Returns -EINVAL when
 * @dev has no ieee80211_ptr, otherwise the result of register_netdevice().
 */
int cfg80211_register_netdevice(struct net_device *dev)
{
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev;
        int err;

        ASSERT_RTNL();

        if (WARN_ON(!wdev))
                return -EINVAL;

        rdev = wiphy_to_rdev(wdev->wiphy);

        lockdep_assert_held(&rdev->wiphy.mtx);

        /*
         * Set both flags before register_netdevice() so the netdev
         * notifier leaves the wdev registration to this function.
         */
        wdev->registered = true;
        wdev->registering = true;

        err = register_netdevice(dev);
        if (!err)
                cfg80211_register_wdev(rdev, wdev);

        wdev->registering = false;
        if (err)
                wdev->registered = false;

        return err;
}
EXPORT_SYMBOL(cfg80211_register_netdevice);

/*
 * Netdevice notifier callback: keeps the cfg80211 per-wdev and per-rdev
 * state in step with netdev lifecycle events.
 *
 * Netdevs without an ieee80211_ptr, and events not handled in the switch
 * below, yield NOTIFY_DONE. NETDEV_PRE_UP may veto the transition with
 * -EOPNOTSUPP or -ERFKILL (wrapped by notifier_from_errno()). Every other
 * handled event ends with wireless_nlevent_flush() and NOTIFY_OK.
 */
static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
                                         unsigned long state, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct wireless_dev *wdev = dev->ieee80211_ptr;
        struct cfg80211_registered_device *rdev;
        struct cfg80211_sched_scan_request *pos, *tmp;

        /* not a wireless netdev */
        if (!wdev)
                return NOTIFY_DONE;

        rdev = wiphy_to_rdev(wdev->wiphy);

        WARN_ON(wdev->iftype == NL80211_IFTYPE_UNSPECIFIED);

        switch (state) {
        case NETDEV_POST_INIT:
                SET_NETDEV_DEVTYPE(dev, &wiphy_type);
                wdev->netdev = dev;
                /* can only change netns with wiphy */
                dev->features |= NETIF_F_NETNS_LOCAL;

                cfg80211_init_wdev(wdev);
                break;
        case NETDEV_REGISTER:
                /*
                 * Skipped when the wdev is already marked registered, as
                 * cfg80211_register_netdevice() does before registering.
                 */
                if (!wdev->registered) {
                        wiphy_lock(&rdev->wiphy);
                        cfg80211_register_wdev(rdev, wdev);
                        wiphy_unlock(&rdev->wiphy);
                }
                break;
        case NETDEV_UNREGISTER:
                /*
                 * It is possible to get NETDEV_UNREGISTER multiple times,
                 * so check wdev->registered.
                 */
                if (wdev->registered && !wdev->registering) {
                        wiphy_lock(&rdev->wiphy);
                        _cfg80211_unregister_wdev(wdev, false);
                        wiphy_unlock(&rdev->wiphy);
                }
                break;
        case NETDEV_GOING_DOWN:
                wiphy_lock(&rdev->wiphy);
                cfg80211_leave(rdev, wdev);
                cfg80211_remove_links(wdev);
                wiphy_unlock(&rdev->wiphy);
                /* since we just did cfg80211_leave() nothing to do there */
                cancel_work_sync(&wdev->disconnect_wk);
                cancel_work_sync(&wdev->pmsr_free_wk);
                break;
        case NETDEV_DOWN:
                wiphy_lock(&rdev->wiphy);
                cfg80211_update_iface_num(rdev, wdev->iftype, -1);
                /*
                 * A scan request still attached to this wdev is finished
                 * off here; it is marked aborted (with a warning) when
                 * neither it nor the internal scan request was notified.
                 */
                if (rdev->scan_req && rdev->scan_req->wdev == wdev) {
                        if (WARN_ON(!rdev->scan_req->notified &&
                                    (!rdev->int_scan_req ||
                                     !rdev->int_scan_req->notified)))
                                rdev->scan_req->info.aborted = true;
                        ___cfg80211_scan_done(rdev, false);
                }

                /* a scheduled scan left on this netdev is unexpected */
                list_for_each_entry_safe(pos, tmp,
                                         &rdev->sched_scan_req_list, list) {
                        if (WARN_ON(pos->dev == wdev->netdev))
                                cfg80211_stop_sched_scan_req(rdev, pos, false);
                }

                rdev->opencount--;
                wiphy_unlock(&rdev->wiphy);
                /* wiphy_unregister() waits on dev_wait for opencount == 0 */
                wake_up(&rdev->dev_wait);
                break;
        case NETDEV_UP:
                wiphy_lock(&rdev->wiphy);
                cfg80211_update_iface_num(rdev, wdev->iftype, 1);
                switch (wdev->iftype) {
#ifdef CONFIG_CFG80211_WEXT
                case NL80211_IFTYPE_ADHOC:
                        cfg80211_ibss_wext_join(rdev, wdev);
                        break;
                case NL80211_IFTYPE_STATION:
                        cfg80211_mgd_wext_connect(rdev, wdev);
                        break;
#endif
#ifdef CONFIG_MAC80211_MESH
                case NL80211_IFTYPE_MESH_POINT:
                        {
                                /* backward compat code... */
                                struct mesh_setup setup;
                                memcpy(&setup, &default_mesh_setup,
                                                sizeof(setup));
                                 /* back compat only needed for mesh_id */
                                setup.mesh_id = wdev->u.mesh.id;
                                setup.mesh_id_len = wdev->u.mesh.id_up_len;
                                if (wdev->u.mesh.id_up_len)
                                        __cfg80211_join_mesh(rdev, dev,
                                                        &setup,
                                                        &default_mesh_config);
                                break;
                        }
#endif
                default:
                        break;
                }
                rdev->opencount++;

                /*
                 * Configure power management to the driver here so that its
                 * correctly set also after interface type changes etc.
                 */
                if ((wdev->iftype == NL80211_IFTYPE_STATION ||
                     wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) &&
                    rdev->ops->set_power_mgmt &&
                    rdev_set_power_mgmt(rdev, dev, wdev->ps,
                                        wdev->ps_timeout)) {
                        /* assume this means it's off */
                        wdev->ps = false;
                }
                wiphy_unlock(&rdev->wiphy);
                break;
        case NETDEV_PRE_UP:
                /* refuse to bring up a disallowed type or a blocked radio */
                if (!cfg80211_iftype_allowed(wdev->wiphy, wdev->iftype,
                                             wdev->use_4addr, 0))
                        return notifier_from_errno(-EOPNOTSUPP);

                if (rfkill_blocked(rdev->wiphy.rfkill))
                        return notifier_from_errno(-ERFKILL);
                break;
        default:
                return NOTIFY_DONE;
        }

        wireless_nlevent_flush();

        return NOTIFY_OK;
}

/*
 * Notifier block that routes netdevice events to
 * cfg80211_netdev_notifier_call(); registered in cfg80211_init().
 */
static struct notifier_block cfg80211_netdev_notifier = {
        .notifier_call = cfg80211_netdev_notifier_call,
};

/*
 * Network namespace exit hook: every wiphy that lives in @net is switched
 * to init_net, with a warning if the switch fails. Runs under the RTNL.
 */
static void __net_exit cfg80211_pernet_exit(struct net *net)
{
        struct cfg80211_registered_device *rdev;

        rtnl_lock();
        for_each_rdev(rdev) {
                if (!net_eq(wiphy_net(&rdev->wiphy), net))
                        continue;

                WARN_ON(cfg80211_switch_netns(rdev, &init_net));
        }
        rtnl_unlock();
}

/*
 * Per-network-namespace operations; only an exit handler is set.
 * Registered in cfg80211_init() via register_pernet_device().
 */
static struct pernet_operations cfg80211_pernet_ops = {
        .exit = cfg80211_pernet_exit,
};

/*
 * Append @work to the wiphy's work list, unless its entry is already on a
 * list, and queue the rdev's wiphy_work on system_unbound_wq.
 */
void wiphy_work_queue(struct wiphy *wiphy, struct wiphy_work *work)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct list_head *pending = &rdev->wiphy_work_list;
        unsigned long irqflags;

        trace_wiphy_work_queue(wiphy, work);

        spin_lock_irqsave(&rdev->wiphy_work_lock, irqflags);
        /* an entry that is not empty is already queued; leave it alone */
        if (list_empty(&work->entry))
                list_add_tail(&work->entry, pending);
        spin_unlock_irqrestore(&rdev->wiphy_work_lock, irqflags);

        queue_work(system_unbound_wq, &rdev->wiphy_work);
}
EXPORT_SYMBOL_GPL(wiphy_work_queue);

/*
 * Take @work off the wiphy's work list if it is currently queued.
 * The wiphy mutex must be held.
 */
void wiphy_work_cancel(struct wiphy *wiphy, struct wiphy_work *work)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        struct list_head *entry = &work->entry;
        unsigned long irqflags;

        lockdep_assert_held(&wiphy->mtx);

        trace_wiphy_work_cancel(wiphy, work);

        spin_lock_irqsave(&rdev->wiphy_work_lock, irqflags);
        if (!list_empty(entry))
                list_del_init(entry);
        spin_unlock_irqrestore(&rdev->wiphy_work_lock, irqflags);
}
EXPORT_SYMBOL_GPL(wiphy_work_cancel);

void wiphy_work_flush(struct wiphy *wiphy, struct wiphy_work *work)
{
        struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
        unsigned long flags;
        bool run;

        trace_wiphy_work_flush(wiphy, work);

        spin_lock_irqsave(&rdev->wiphy_work_lock, flags);
        run = !work || !list_empty(&work->entry);
        spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags);

        if (run)
                cfg80211_process_wiphy_works(rdev, work);
}
EXPORT_SYMBOL_GPL(wiphy_work_flush);

/*
 * Timer callback of a wiphy_delayed_work: queue the embedded work on the
 * wiphy that was stored in the delayed work.
 */
void wiphy_delayed_work_timer(struct timer_list *t)
{
        struct wiphy_delayed_work *delayed = from_timer(delayed, t, timer);
        struct wiphy_work *work = &delayed->work;

        wiphy_work_queue(delayed->wiphy, work);
}
EXPORT_SYMBOL(wiphy_delayed_work_timer);

/*
 * Queue @dwork to run after @delay jiffies.
 *
 * A zero @delay deletes the timer and queues the work right away;
 * otherwise the wiphy is recorded in @dwork and the timer is (re)armed.
 */
void wiphy_delayed_work_queue(struct wiphy *wiphy,
                              struct wiphy_delayed_work *dwork,
                              unsigned long delay)
{
        trace_wiphy_delayed_work_queue(wiphy, &dwork->work, delay);

        if (delay) {
                dwork->wiphy = wiphy;
                mod_timer(&dwork->timer, jiffies + delay);
                return;
        }

        /* no delay requested: bypass the timer entirely */
        del_timer(&dwork->timer);
        wiphy_work_queue(wiphy, &dwork->work);
}
EXPORT_SYMBOL_GPL(wiphy_delayed_work_queue);

/*
 * Cancel @dwork: delete its timer synchronously, then remove the embedded
 * work from the wiphy's work list. The wiphy mutex must be held.
 */
void wiphy_delayed_work_cancel(struct wiphy *wiphy,
                               struct wiphy_delayed_work *dwork)
{
        struct timer_list *timer = &dwork->timer;
        struct wiphy_work *work = &dwork->work;

        lockdep_assert_held(&wiphy->mtx);

        del_timer_sync(timer);
        wiphy_work_cancel(wiphy, work);
}
EXPORT_SYMBOL_GPL(wiphy_delayed_work_cancel);

/*
 * Flush @dwork: delete its timer synchronously, then flush the embedded
 * work via wiphy_work_flush(). The wiphy mutex must be held.
 */
void wiphy_delayed_work_flush(struct wiphy *wiphy,
                              struct wiphy_delayed_work *dwork)
{
        struct timer_list *timer = &dwork->timer;
        struct wiphy_work *work = &dwork->work;

        lockdep_assert_held(&wiphy->mtx);

        del_timer_sync(timer);
        wiphy_work_flush(wiphy, work);
}
EXPORT_SYMBOL_GPL(wiphy_delayed_work_flush);

/*
 * Subsystem initialization. Sets up, in this order: the pernet ops, wiphy
 * sysfs support, the netdevice notifier, nl80211, the "ieee80211" debugfs
 * directory, regulatory, and the ordered "cfg80211" workqueue.
 *
 * Returns 0 on success. On failure, the steps completed so far are undone
 * in reverse order through the label ladder below and the error code of
 * the failing step is returned (-ENOMEM if the workqueue allocation
 * fails).
 */
static int __init cfg80211_init(void)
{
        int err;

        err = register_pernet_device(&cfg80211_pernet_ops);
        if (err)
                goto out_fail_pernet;

        err = wiphy_sysfs_init();
        if (err)
                goto out_fail_sysfs;

        err = register_netdevice_notifier(&cfg80211_netdev_notifier);
        if (err)
                goto out_fail_notifier;

        err = nl80211_init();
        if (err)
                goto out_fail_nl80211;

        /* the result of creating the debugfs directory is not checked */
        ieee80211_debugfs_dir = debugfs_create_dir("ieee80211", NULL);

        err = regulatory_init();
        if (err)
                goto out_fail_reg;

        cfg80211_wq = alloc_ordered_workqueue("cfg80211", WQ_MEM_RECLAIM);
        if (!cfg80211_wq) {
                err = -ENOMEM;
                goto out_fail_wq;
        }

        return 0;

        /* unwind: each label undoes the steps that succeeded before it */
out_fail_wq:
        regulatory_exit();
out_fail_reg:
        debugfs_remove(ieee80211_debugfs_dir);
        nl80211_exit();
out_fail_nl80211:
        unregister_netdevice_notifier(&cfg80211_netdev_notifier);
out_fail_notifier:
        wiphy_sysfs_exit();
out_fail_sysfs:
        unregister_pernet_device(&cfg80211_pernet_ops);
out_fail_pernet:
        return err;
}
fs_initcall(cfg80211_init);

/*
 * Module exit: releases everything cfg80211_init() set up, namely the
 * debugfs directory, nl80211, the netdevice notifier, wiphy sysfs support,
 * regulatory, the pernet ops and finally the cfg80211 workqueue.
 */
static void __exit cfg80211_exit(void)
{
        debugfs_remove(ieee80211_debugfs_dir);
        nl80211_exit();
        unregister_netdevice_notifier(&cfg80211_netdev_notifier);
        wiphy_sysfs_exit();
        regulatory_exit();
        unregister_pernet_device(&cfg80211_pernet_ops);
        destroy_workqueue(cfg80211_wq);
}
module_exit(cfg80211_exit);











































































































































































































































    1 










    3 


















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/writeback.h
 */
#ifndef WRITEBACK_H
#define WRITEBACK_H

#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/fs.h>
#include <linux/flex_proportions.h>
#include <linux/backing-dev-defs.h>
#include <linux/blk_types.h>
#include <linux/pagevec.h>

struct bio;

DECLARE_PER_CPU(int, dirty_throttle_leaks);

/*
 * The global dirty threshold is normally equal to the global dirty limit,
 * except when the system suddenly allocates a lot of anonymous memory and
 * knocks down the global dirty threshold quickly, in which case the global
 * dirty limit will follow down slowly to prevent livelocking all dirtier tasks.
 */
#define DIRTY_SCOPE                8

struct backing_dev_info;

/*
 * fs/fs-writeback.c
 */
/* Whether a writeback pass waits for what it writes. */
enum writeback_sync_modes {
        /* Don't wait on anything */
        WB_SYNC_NONE = 0,
        /* Wait on every mapping */
        WB_SYNC_ALL = 1,
};

/*
 * A control structure which tells the writeback code what to do.  These are
 * always on the stack, and hence need no locking.  They are always initialised
 * in a manner such that unspecified fields are set to zero.
 */
struct writeback_control {
        /* public fields that can be set and/or consumed by the caller: */
        long nr_to_write;                /* Write this many pages, and decrement
                                           this for each page written */
        long pages_skipped;                /* Pages which were not written */

        /*
         * For a_ops->writepages(): if start or end are non-zero then this is
         * a hint that the filesystem need only write out the pages inside that
         * byterange.  The byte at `end' is included in the writeout request.
         */
        loff_t range_start;
        loff_t range_end;

        /* WB_SYNC_NONE or WB_SYNC_ALL, see enum writeback_sync_modes */
        enum writeback_sync_modes sync_mode;

        unsigned for_kupdate:1;                /* A kupdate writeback */
        unsigned for_background:1;        /* A background writeback */
        unsigned tagged_writepages:1;        /* tag-and-write to avoid livelock */
        unsigned for_reclaim:1;                /* Invoked from the page allocator */
        unsigned range_cyclic:1;        /* range_start is cyclic */
        unsigned for_sync:1;                /* sync(2) WB_SYNC_ALL writeback */
        unsigned unpinned_netfs_wb:1;        /* Cleared I_PINNING_NETFS_WB */

        /*
         * When writeback IOs are bounced through async layers, only the
         * initial synchronous phase should be accounted towards inode
         * cgroup ownership arbitration to avoid confusion.  Later stages
         * can set the following flag to disable the accounting.
         */
        unsigned no_cgroup_owner:1;

        /*
         * To enable batching of swap writes to non-block-device backends,
         * "swap_plug" can be set to point to a 'struct swap_iocb *'.  When
         * all swap writes have been submitted, if that swap_iocb pointer is
         * not NULL, swap_write_unplug() should be called.
         */
        struct swap_iocb **swap_plug;

        /* internal fields used by the ->writepages implementation: */
        struct folio_batch fbatch;
        pgoff_t index;
        int saved_err;

#ifdef CONFIG_CGROUP_WRITEBACK
        struct bdi_writeback *wb;        /* wb this writeback is issued under */
        struct inode *inode;                /* inode being written out */

        /* foreign inode detection, see wbc_detach_inode() */
        int wb_id;                        /* current wb id */
        int wb_lcand_id;                /* last foreign candidate wb id */
        int wb_tcand_id;                /* this foreign candidate wb id */
        size_t wb_bytes;                /* bytes written by current wb */
        size_t wb_lcand_bytes;                /* bytes written by last candidate */
        size_t wb_tcand_bytes;                /* bytes written by this candidate */
#endif
};

/**
 * wbc_to_write_flags - derive block request flags from a writeback_control
 * @wbc: writeback_control describing the writeback in progress
 *
 * Return: REQ_SYNC when @wbc->sync_mode is WB_SYNC_ALL; otherwise
 * REQ_BACKGROUND when either @wbc->for_kupdate or @wbc->for_background is
 * set; otherwise 0.
 */
static inline blk_opf_t wbc_to_write_flags(struct writeback_control *wbc)
{
        /* Data-integrity writeback takes precedence over the hint bits. */
        if (wbc->sync_mode == WB_SYNC_ALL)
                return REQ_SYNC;

        if (wbc->for_kupdate || wbc->for_background)
                return REQ_BACKGROUND;

        return 0;
}

/*
 * wbc_blkcg_css(wbc) expands to @wbc->wb->blkcg_css when a wb is attached to
 * @wbc, and to blkcg_root_css when none is attached or when cgroup writeback
 * is compiled out.  The CONFIG_CGROUP_WRITEBACK variant evaluates its
 * argument twice, so do not pass an expression with side effects.
 */
#ifdef CONFIG_CGROUP_WRITEBACK
#define wbc_blkcg_css(wbc) \
        ((wbc)->wb ? (wbc)->wb->blkcg_css : blkcg_root_css)
#else
#define wbc_blkcg_css(wbc)                (blkcg_root_css)
#endif /* CONFIG_CGROUP_WRITEBACK */

/*
 * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
 * and are measured against each other in.  There always is one global
 * domain, global_wb_domain, that every wb in the system is a member of.
 * This allows measuring the relative bandwidth of each wb to distribute
 * dirtyable memory accordingly.
 */
struct wb_domain {
        spinlock_t lock;        /* protects dirty_limit_tstamp and dirty_limit */

        /*
         * Scale the writeback cache size proportional to the relative
         * writeout speed.
         *
         * We do this by keeping a floating proportion between BDIs, based
         * on page writeback completions [end_page_writeback()]. Those
         * devices that write out pages fastest will get the larger share,
         * while the slower will get a smaller share.
         *
         * We use page writeout completions because we are interested in
         * getting rid of dirty pages. Having them written out is the
         * primary goal.
         *
         * We introduce a concept of time, a period over which we measure
         * these events, because demand can/will vary over time. The length
         * of this period itself is measured in page writeback completions.
         */
        struct fprop_global completions;
        struct timer_list period_timer;        /* timer for aging of completions */
        /* NOTE(review): presumably bookkeeping for period_timer - confirm */
        unsigned long period_time;

        /*
         * The dirtyable memory and dirty threshold could be suddenly
         * knocked down by a large amount (eg. on the startup of KVM in a
         * swapless system). This may throw the system into deep dirty
         * exceeded state and throttle heavy/light dirtiers alike. To
         * retain good responsiveness, maintain global_dirty_limit for
         * tracking slowly down to the knocked down dirty threshold.
         *
         * Both fields are protected by ->lock.
         */
        unsigned long dirty_limit_tstamp;
        unsigned long dirty_limit;
};

/**
 * wb_domain_size_changed - memory available to a wb_domain has changed
 * @dom: wb_domain of interest
 *
 * This function should be called when the amount of memory available to
 * @dom has changed.  It resets @dom's dirty limit parameters to prevent
 * the past values which don't match the current configuration from skewing
 * dirty throttling.  Without this, when memory size of a wb_domain is
 * greatly reduced, the dirty throttling logic may allow too many pages to
 * be dirtied leading to consecutive unnecessary OOMs and may get stuck in
 * that situation.
 */
static inline void wb_domain_size_changed(struct wb_domain *dom)
{
        /* dirty_limit_tstamp and dirty_limit are both protected by ->lock. */
        spin_lock(&dom->lock);
        dom->dirty_limit_tstamp = jiffies;        /* stamp the reset with "now" */
        dom->dirty_limit = 0;                        /* drop the stale limit */
        spin_unlock(&dom->lock);
}

/*
 * fs/fs-writeback.c
 */        
struct bdi_writeback;

/*
 * Superblock- and flusher-level entry points implemented in
 * fs/fs-writeback.c.  @sb is the super_block to operate on and @reason is
 * the enum wb_reason passed along with the request.
 */
void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason);
void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
                            enum wb_reason reason);
void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason);
void sync_inodes_sb(struct super_block *sb);
void wakeup_flusher_threads(enum wb_reason reason);
void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
                                enum wb_reason reason);
void inode_wait_for_writeback(struct inode *inode);
void inode_io_list_del(struct inode *inode);

/*
 * wait_on_inode - wait on the __I_NEW bit of @inode->i_state
 * @inode: inode to wait on
 *
 * Sleeps in TASK_UNINTERRUPTIBLE via wait_on_bit(); the return value of
 * wait_on_bit() is ignored.  Needs struct inode from <linux/fs.h>, which is
 * included at the top of this header.
 */
static inline void wait_on_inode(struct inode *inode)
{
        wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
}

#ifdef CONFIG_CGROUP_WRITEBACK

#include <linux/cgroup.h>
#include <linux/bio.h>

/*
 * Out-of-line cgroup writeback helpers, declared only when
 * CONFIG_CGROUP_WRITEBACK is set.  The #else branch below supplies inline
 * stubs for some of them; cgroup_writeback_by_id() and
 * cleanup_offline_cgwb() have no stub in this header.
 */
/* Called by inode_attach_wb() when @inode has no wb yet. */
void __inode_attach_wb(struct inode *inode, struct folio *folio);
/* Annotated __releases(): returns with @inode->i_lock dropped. */
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
                                 struct inode *inode)
        __releases(&inode->i_lock);
void wbc_detach_inode(struct writeback_control *wbc);
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
                              size_t bytes);
int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
                           enum wb_reason reason, struct wb_completion *done);
void cgroup_writeback_umount(void);
bool cleanup_offline_cgwb(struct bdi_writeback *wb);

/**
 * inode_attach_wb - make sure an inode is associated with a wb
 * @inode: inode of interest
 * @folio: folio being dirtied (may be NULL)
 *
 * Nothing is done when @inode->i_wb is already set.  Otherwise @inode is
 * handed to __inode_attach_wb(), which associates it with the wb matching
 * the memcg of @folio or, if @folio is NULL, %current.  May be called w/ or
 * w/o @inode->i_lock.
 */
static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
{
        /* Already attached: keep the existing association. */
        if (inode->i_wb)
                return;

        __inode_attach_wb(inode, folio);
}

/**
 * inode_detach_wb - drop an inode's association with its wb
 * @inode: inode of interest
 *
 * Used when @inode is being freed.  If @inode has a wb, warn once unless
 * I_CLEAR is set in @inode->i_state, hand the wb to wb_put() and reset
 * @inode->i_wb to NULL.  An inode without a wb is left untouched.
 */
static inline void inode_detach_wb(struct inode *inode)
{
        if (!inode->i_wb)
                return;

        /* Detaching is only expected once the inode has reached I_CLEAR. */
        WARN_ON_ONCE(!(inode->i_state & I_CLEAR));
        wb_put(inode->i_wb);
        inode->i_wb = NULL;
}

/**
 * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * This function is to be used by __filemap_fdatawrite_range(), which is an
 * alternative entry point into writeback code, and first ensures @inode is
 * associated with a bdi_writeback and attaches it to @wbc.
 */
static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
                                               struct inode *inode)
{
        /*
         * i_lock is taken here and dropped by wbc_attach_and_unlock_inode()
         * (see its __releases() annotation), so no unlock appears below.
         */
        spin_lock(&inode->i_lock);
        /* NULL folio: inode_attach_wb() falls back to %current. */
        inode_attach_wb(inode, NULL);
        wbc_attach_and_unlock_inode(wbc, inode);
}

/**
 * wbc_init_bio - writeback specific initialization of a bio
 * @wbc: writeback_control for the writeback in progress
 * @bio: bio to be initialized
 *
 * @bio is part of the writeback in progress controlled by @wbc.  Apply the
 * cgroup writeback context to it by associating @bio with the blkcg css of
 * @wbc->wb.  Must be called after the bio has been associated with a
 * device.
 */
static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
{
        /*
         * The pageout() path deliberately leaves @wbc unattached to the
         * inode being written out so that it never blocks behind a slow
         * cgroup; in that case there is no wb and nothing to associate.
         * Ultimately, we want pageout() to kick off regular writeback
         * instead of writing things out itself.
         */
        if (!wbc->wb)
                return;

        bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
}

#else        /* CONFIG_CGROUP_WRITEBACK */

/*
 * !CONFIG_CGROUP_WRITEBACK stubs: these two compile to nothing so that
 * callers need no #ifdef of their own.
 */
static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
{
}

static inline void inode_detach_wb(struct inode *inode)
{
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: there is nothing to attach, but the
 * caller's @inode->i_lock is still dropped, as the __releases() annotation
 * promises.
 */
static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
                                               struct inode *inode)
        __releases(&inode->i_lock)
{
        spin_unlock(&inode->i_lock);
}

/*
 * Remaining !CONFIG_CGROUP_WRITEBACK stubs, all empty.  Note that this
 * wbc_attach_fdatawrite_inode() never takes @inode->i_lock, so unlike the
 * cgroup-enabled variant there is no lock to hand off or release.
 */
static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
                                               struct inode *inode)
{
}

static inline void wbc_detach_inode(struct writeback_control *wbc)
{
}

static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
{
}

static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
                                            struct page *page, size_t bytes)
{
}

static inline void cgroup_writeback_umount(void)
{
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

/*
 * mm/page-writeback.c
 */
/* NOTE(review): laptop_* look like laptop-mode hooks - confirm in mm/page-writeback.c */
void laptop_io_completion(struct backing_dev_info *info);
void laptop_sync_completion(void);
/* Takes a struct timer_list *, i.e. has the shape of a timer callback. */
void laptop_mode_timer_fn(struct timer_list *t);
bool node_dirty_ok(struct pglist_data *pgdat);
int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
/* wb_domain_exit() is only declared when cgroup writeback is enabled. */
#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom);
#endif

/* The one global domain that every wb in the system is a member of. */
extern struct wb_domain global_wb_domain;

/* These are exported to sysctl. */
extern unsigned int dirty_writeback_interval;
extern unsigned int dirty_expire_interval;
extern unsigned int dirtytime_expire_interval;
extern int laptop_mode;

/*
 * NOTE(review): presumably the sysctl handler behind
 * dirtytime_expire_interval - confirm against its ctl_table entry.
 */
int dirtytime_interval_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);

/*
 * Results come back through the two pointers.
 * NOTE(review): presumably the background and dirty thresholds - confirm
 * units and NULL handling in mm/page-writeback.c.
 */
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
unsigned long cgwb_calc_thresh(struct bdi_writeback *wb);

void wb_update_bandwidth(struct bdi_writeback *wb);

/*
 * Invoke balance dirty pages in async mode.
 * NOTE(review): looks like a bit for the @flags argument of
 * balance_dirty_pages_ratelimited_flags() - confirm.
 */
#define BDP_ASYNC 0x0001

void balance_dirty_pages_ratelimited(struct address_space *mapping);
int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
                unsigned int flags);

bool wb_over_bg_thresh(struct bdi_writeback *wb);

/* NOTE(review): @error is an out-parameter; exact protocol not visible here. */
struct folio *writeback_iter(struct address_space *mapping,
                struct writeback_control *wbc, struct folio *folio, int *error);

/*
 * Callback type accepted by write_cache_pages().
 * NOTE(review): @data is presumably the cookie given to
 * write_cache_pages() - confirm.
 */
typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc,
                                void *data);

int write_cache_pages(struct address_space *mapping,
                      struct writeback_control *wbc, writepage_t writepage,
                      void *data);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
void writeback_set_ratelimit(void);
void tag_pages_for_writeback(struct address_space *mapping,
                             pgoff_t start, pgoff_t end);

/*
 * Folio/page dirtying helpers.  @wbc is the writeback_control of the
 * writeback in progress; @folio / @page is the object being (re)dirtied.
 */
bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio);
bool folio_redirty_for_writepage(struct writeback_control *wbc,
                                 struct folio *folio);
bool redirty_page_for_writepage(struct writeback_control *wbc,
                                struct page *page);

/* Per-inode writeback marking, set and clear. */
void sb_mark_inode_writeback(struct inode *inode);
void sb_clear_inode_writeback(struct inode *inode);

#endif                /* WRITEBACK_H */















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 











    1 







    1 



    1 


































































































    1 











    1 


    1 




























    1 






    1 




















































    1 











    1 







    1 


















    1 








    1 







































    1 

    1 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        IPv6 Address [auto]configuration
 *        Linux INET6 implementation
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 *        Alexey Kuznetsov        <kuznet@ms2.inr.ac.ru>
 */

/*
 *        Changes:
 *
 *        Janos Farkas                        :        delete timer on ifdown
 *        <chexum@bankinf.banki.hu>
 *        Andi Kleen                        :        kill double kfree on module
 *                                                unload.
 *        Maciej W. Rozycki                :        FDDI support
 *        sekiya@USAGI                        :        Don't send too many RS
 *                                                packets.
 *        yoshfuji@USAGI                        :       Fixed interval between DAD
 *                                                packets.
 *        YOSHIFUJI Hideaki @USAGI        :        improved accuracy of
 *                                                address validation timer.
 *        YOSHIFUJI Hideaki @USAGI        :        Privacy Extensions (RFC3041)
 *                                                support.
 *        Yuji SEKIYA @USAGI                :        Don't assign the same IPv6
 *                                                address twice on the same interface.
 *        YOSHIFUJI Hideaki @USAGI        :        ARCnet support
 *        YOSHIFUJI Hideaki @USAGI        :        convert /proc/net/if_inet6 to
 *                                                seq_file.
 *        YOSHIFUJI Hideaki @USAGI        :        improved source address
 *                                                selection; consider scope,
 *                                                status etc.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/if_addr.h>
#include <linux/if_arp.h>
#include <linux/if_arcnet.h>
#include <linux/if_infiniband.h>
#include <linux/route.h>
#include <linux/inetdevice.h>
#include <linux/init.h>
#include <linux/slab.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <linux/capability.h>
#include <linux/delay.h>
#include <linux/notifier.h>
#include <linux/string.h>
#include <linux/hash.h>

#include <net/ip_tunnels.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/snmp.h>

#include <net/6lowpan.h>
#include <net/firewire.h>
#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/ndisc.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/l3mdev.h>
#include <linux/if_tunnel.h>
#include <linux/rtnetlink.h>
#include <linux/netconf.h>
#include <linux/random.h>
#include <linux/uaccess.h>
#include <asm/unaligned.h>

#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/export.h>
#include <linux/ioam6.h>

/*
 * All-ones 32-bit lifetime value.
 * NOTE(review): presumably the marker for a lifetime that never expires -
 * confirm against the users of this macro.
 */
#define        INFINITY_LIFE_TIME        0xFFFFFFFF

/*
 * Size in bytes, including the terminating NUL, of the textual address
 * literal below (45 characters + NUL = 46).
 */
#define IPV6_MAX_STRLEN \
        sizeof("ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255")

/*
 * Scale the number of jiffies elapsed since INITIAL_JIFFIES by 100 / HZ
 * and return the result truncated to 32 bits.
 */
static inline u32 cstamp_delta(unsigned long cstamp)
{
        unsigned long elapsed = cstamp - INITIAL_JIFFIES;

        /* multiply before dividing, exactly as the one-line form did */
        return elapsed * 100UL / HZ;
}

/*
 * Randomise an 'initial retransmission time': @irt is multiplied by a
 * factor drawn from 0.9 .. 1.1 (expressed in millionths) and the scaled
 * value is returned truncated to s32.
 */
static inline s32 rfc3315_s14_backoff_init(s32 irt)
{
        u64 scaled;

        /* 64-bit product so the millionths scaling cannot wrap in 32 bits */
        scaled = (u64)irt * get_random_u32_inclusive(900000, 1100000);
        do_div(scaled, 1000000);        /* divides 'scaled' in place */

        return (s32)scaled;
}

/*
 * Compute the next 'retransmission timeout': @rt is multiplied by a
 * factor drawn from 1.9 .. 2.1.  If that exceeds @mrt, the result is
 * instead @mrt ('maximum retransmission time') multiplied by a factor
 * drawn from 0.9 .. 1.1.  Factors are expressed in millionths.
 */
static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt)
{
        u64 next = (u64)rt * get_random_u32_inclusive(1900000, 2100000);

        do_div(next, 1000000);          /* divides 'next' in place */
        if ((s32)next <= mrt)
                return (s32)next;

        /* over the cap: fall back to a randomised multiple of mrt */
        next = (u64)mrt * get_random_u32_inclusive(900000, 1100000);
        do_div(next, 1000000);

        return (s32)next;
}

/*
 * Per-device sysctl hooks.  With CONFIG_SYSCTL only the forward
 * declarations of the static implementations appear here; without it the
 * hooks are the inline stubs below, which do nothing.
 */
#ifdef CONFIG_SYSCTL
static int addrconf_sysctl_register(struct inet6_dev *idev);
static void addrconf_sysctl_unregister(struct inet6_dev *idev);
#else
/* Stub: nothing to register, always reports success (0). */
static inline int addrconf_sysctl_register(struct inet6_dev *idev)
{
        return 0;
}

/* Stub: nothing was registered, so there is nothing to undo. */
static inline void addrconf_sysctl_unregister(struct inet6_dev *idev)
{
}
#endif

/*
 * Forward declarations of file-local (static) helpers, so they can be
 * referenced before their definitions.
 */
static void ipv6_gen_rnd_iid(struct in6_addr *addr);

static int ipv6_generate_eui64(u8 *eui, struct net_device *dev);
static int ipv6_count_addresses(const struct inet6_dev *idev);
static int ipv6_generate_stable_address(struct in6_addr *addr,
                                        u8 dad_count,
                                        const struct inet6_dev *idev);

/*
 * IN6_ADDR_HSIZE evaluates to 1 << 8 = 256.
 * NOTE(review): presumably the bucket count of the address hash table -
 * confirm at the use sites.
 */
#define IN6_ADDR_HSIZE_SHIFT        8
#define IN6_ADDR_HSIZE                (1 << IN6_ADDR_HSIZE_SHIFT)

static void addrconf_verify(struct net *net);
static void addrconf_verify_rtnl(struct net *net);

/*
 * File-local workqueue handle; it is not assigned in this part of the file.
 * NOTE(review): presumably the queue addrconf work items run on - confirm.
 */
static struct workqueue_struct *addrconf_wq;

static void addrconf_join_anycast(struct inet6_ifaddr *ifp);
static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);

static void addrconf_type_change(struct net_device *dev,
                                 unsigned long event);
static int addrconf_ifdown(struct net_device *dev, bool unregister);

static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
                                                  int plen,
                                                  const struct net_device *dev,
                                                  u32 flags, u32 noflags,
                                                  bool no_gw);

static void addrconf_dad_start(struct inet6_ifaddr *ifp);
static void addrconf_dad_work(struct work_struct *w);
static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
                                   bool send_na);
static void addrconf_dad_run(struct inet6_dev *idev, bool restart);
static void addrconf_rs_timer(struct timer_list *t);
static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);

static void inet6_prefix_notify(int event, struct inet6_dev *idev,
                                struct prefix_info *pinfo);

/*
 * Compile-time initial values for a struct ipv6_devconf.
 * NOTE(review): presumably this is the template behind the per-namespace
 * net->ipv6.devconf_all settings - confirm where it is copied.
 * The values match ipv6_devconf_dflt below except for .accept_dad.
 */
static struct ipv6_devconf ipv6_devconf __read_mostly = {
        .forwarding                = 0,
        .hop_limit                = IPV6_DEFAULT_HOPLIMIT,
        .mtu6                        = IPV6_MIN_MTU,
        .accept_ra                = 1,
        .accept_redirects        = 1,
        .autoconf                = 1,
        .force_mld_version        = 0,
        .mldv1_unsolicited_report_interval = 10 * HZ,
        .mldv2_unsolicited_report_interval = HZ,
        .dad_transmits                = 1,
        .rtr_solicits                = MAX_RTR_SOLICITATIONS,
        .rtr_solicit_interval        = RTR_SOLICITATION_INTERVAL,
        .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL,
        .rtr_solicit_delay        = MAX_RTR_SOLICITATION_DELAY,
        .use_tempaddr                = 0,
        .temp_valid_lft                = TEMP_VALID_LIFETIME,
        .temp_prefered_lft        = TEMP_PREFERRED_LIFETIME,
        .regen_min_advance        = REGEN_MIN_ADVANCE,
        .regen_max_retry        = REGEN_MAX_RETRY,
        .max_desync_factor        = MAX_DESYNC_FACTOR,
        .max_addresses                = IPV6_MAX_ADDRESSES,
        .accept_ra_defrtr        = 1,
        .ra_defrtr_metric        = IP6_RT_PRIO_USER,
        .accept_ra_from_local        = 0,
        .accept_ra_min_hop_limit= 1,
        .accept_ra_min_lft        = 0,
        .accept_ra_pinfo        = 1,
#ifdef CONFIG_IPV6_ROUTER_PREF
        .accept_ra_rtr_pref        = 1,
        .rtr_probe_interval        = 60 * HZ,
#ifdef CONFIG_IPV6_ROUTE_INFO
        .accept_ra_rt_info_min_plen = 0,
        .accept_ra_rt_info_max_plen = 0,
#endif
#endif
        .proxy_ndp                = 0,
        .accept_source_route        = 0,        /* we do not accept RH0 by default. */
        .disable_ipv6                = 0,
        /* 0 here, whereas ipv6_devconf_dflt uses 1 */
        .accept_dad                = 0,
        .suppress_frag_ndisc        = 1,
        .accept_ra_mtu                = 1,
        .stable_secret                = {
                .initialized = false,
        },
        .use_oif_addrs_only        = 0,
        .ignore_routes_with_linkdown = 0,
        .keep_addr_on_down        = 0,
        .seg6_enabled                = 0,
#ifdef CONFIG_IPV6_SEG6_HMAC
        .seg6_require_hmac        = 0,
#endif
        .enhanced_dad           = 1,
        .addr_gen_mode                = IN6_ADDR_GEN_MODE_EUI64,
        .disable_policy                = 0,
        .rpl_seg_enabled        = 0,
        .ioam6_enabled                = 0,
        .ioam6_id               = IOAM6_DEFAULT_IF_ID,
        .ioam6_id_wide                = IOAM6_DEFAULT_IF_ID_WIDE,
        .ndisc_evict_nocarrier        = 1,
        .ra_honor_pio_life        = 0,
};

/*
 * Compile-time initial values for the "default" struct ipv6_devconf.
 * NOTE(review): presumably this is the template behind the per-namespace
 * net->ipv6.devconf_dflt that ipv6_add_dev() copies into every new
 * inet6_dev - confirm where it is copied.
 * The values match ipv6_devconf above except for .accept_dad.
 */
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
        .forwarding                = 0,
        .hop_limit                = IPV6_DEFAULT_HOPLIMIT,
        .mtu6                        = IPV6_MIN_MTU,
        .accept_ra                = 1,
        .accept_redirects        = 1,
        .autoconf                = 1,
        .force_mld_version        = 0,
        .mldv1_unsolicited_report_interval = 10 * HZ,
        .mldv2_unsolicited_report_interval = HZ,
        .dad_transmits                = 1,
        .rtr_solicits                = MAX_RTR_SOLICITATIONS,
        .rtr_solicit_interval        = RTR_SOLICITATION_INTERVAL,
        .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL,
        .rtr_solicit_delay        = MAX_RTR_SOLICITATION_DELAY,
        .use_tempaddr                = 0,
        .temp_valid_lft                = TEMP_VALID_LIFETIME,
        .temp_prefered_lft        = TEMP_PREFERRED_LIFETIME,
        .regen_min_advance        = REGEN_MIN_ADVANCE,
        .regen_max_retry        = REGEN_MAX_RETRY,
        .max_desync_factor        = MAX_DESYNC_FACTOR,
        .max_addresses                = IPV6_MAX_ADDRESSES,
        .accept_ra_defrtr        = 1,
        .ra_defrtr_metric        = IP6_RT_PRIO_USER,
        .accept_ra_from_local        = 0,
        .accept_ra_min_hop_limit= 1,
        .accept_ra_min_lft        = 0,
        .accept_ra_pinfo        = 1,
#ifdef CONFIG_IPV6_ROUTER_PREF
        .accept_ra_rtr_pref        = 1,
        .rtr_probe_interval        = 60 * HZ,
#ifdef CONFIG_IPV6_ROUTE_INFO
        .accept_ra_rt_info_min_plen = 0,
        .accept_ra_rt_info_max_plen = 0,
#endif
#endif
        .proxy_ndp                = 0,
        .accept_source_route        = 0,        /* we do not accept RH0 by default. */
        .disable_ipv6                = 0,
        /* 1 here, whereas ipv6_devconf uses 0 */
        .accept_dad                = 1,
        .suppress_frag_ndisc        = 1,
        .accept_ra_mtu                = 1,
        .stable_secret                = {
                .initialized = false,
        },
        .use_oif_addrs_only        = 0,
        .ignore_routes_with_linkdown = 0,
        .keep_addr_on_down        = 0,
        .seg6_enabled                = 0,
#ifdef CONFIG_IPV6_SEG6_HMAC
        .seg6_require_hmac        = 0,
#endif
        .enhanced_dad           = 1,
        .addr_gen_mode                = IN6_ADDR_GEN_MODE_EUI64,
        .disable_policy                = 0,
        .rpl_seg_enabled        = 0,
        .ioam6_enabled                = 0,
        .ioam6_id               = IOAM6_DEFAULT_IF_ID,
        .ioam6_id_wide                = IOAM6_DEFAULT_IF_ID_WIDE,
        .ndisc_evict_nocarrier        = 1,
        .ra_honor_pio_life        = 0,
};

/* Check if link is ready: is it up and is a valid qdisc available */
static inline bool addrconf_link_ready(const struct net_device *dev)
{
        return netif_oper_up(dev) && !qdisc_tx_is_noop(dev);
}

/*
 * Stop idev->rs_timer. When del_timer() reports that it deactivated the
 * timer, drop the inet6_dev reference with __in6_dev_put().
 */
static void addrconf_del_rs_timer(struct inet6_dev *idev)
{
        int deactivated = del_timer(&idev->rs_timer);

        if (!deactivated)
                return;

        __in6_dev_put(idev);
}

static void addrconf_del_dad_work(struct inet6_ifaddr *ifp)
{
        if (cancel_delayed_work(&ifp->dad_work))
                __in6_ifa_put(ifp);
}

/*
 * (Re)arm idev->rs_timer to expire @when jiffies from now. A reference on
 * @idev is taken only when mod_timer() returns 0; a non-zero return leaves
 * the reference count untouched.
 */
static void addrconf_mod_rs_timer(struct inet6_dev *idev,
                                  unsigned long when)
{
        unsigned long expires = jiffies + when;

        if (mod_timer(&idev->rs_timer, expires))
                return;

        in6_dev_hold(idev);
}

static void addrconf_mod_dad_work(struct inet6_ifaddr *ifp,
                                   unsigned long delay)
{
        in6_ifa_hold(ifp);
        if (mod_delayed_work(addrconf_wq, &ifp->dad_work, delay))
                in6_ifa_put(ifp);
}

static int snmp6_alloc_dev(struct inet6_dev *idev)
{
        int i;

        idev->stats.ipv6 = alloc_percpu_gfp(struct ipstats_mib, GFP_KERNEL_ACCOUNT);
        if (!idev->stats.ipv6)
                goto err_ip;

        for_each_possible_cpu(i) {
                struct ipstats_mib *addrconf_stats;
                addrconf_stats = per_cpu_ptr(idev->stats.ipv6, i);
                u64_stats_init(&addrconf_stats->syncp);
        }


        idev->stats.icmpv6dev = kzalloc(sizeof(struct icmpv6_mib_device),
                                        GFP_KERNEL);
        if (!idev->stats.icmpv6dev)
                goto err_icmp;
        idev->stats.icmpv6msgdev = kzalloc(sizeof(struct icmpv6msg_mib_device),
                                           GFP_KERNEL_ACCOUNT);
        if (!idev->stats.icmpv6msgdev)
                goto err_icmpmsg;

        return 0;

err_icmpmsg:
        kfree(idev->stats.icmpv6dev);
err_icmp:
        free_percpu(idev->stats.ipv6);
err_ip:
        return -ENOMEM;
}

/*
 * Allocate and initialise the IPv6 per-device state (struct inet6_dev) for
 * @dev and publish it through dev->ip6_ptr. The caller must hold RTNL.
 *
 * Returns the new inet6_dev, or an ERR_PTR():
 *   -EINVAL  when dev->mtu is below IPV6_MIN_MTU (blackhole_netdev exempt),
 *   -ENOMEM  when an allocation or snmp6_register_dev() fails,
 *   the error from addrconf_sysctl_register() when that call fails.
 *
 * blackhole_netdev skips the snmp6 registration, the sysctl registration and
 * the multicast group joins.
 */
static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
{
        struct inet6_dev *ndev;
        int err = -ENOMEM;

        ASSERT_RTNL();

        if (dev->mtu < IPV6_MIN_MTU && dev != blackhole_netdev)
                return ERR_PTR(-EINVAL);

        ndev = kzalloc(sizeof(*ndev), GFP_KERNEL_ACCOUNT);
        if (!ndev)
                return ERR_PTR(err);

        rwlock_init(&ndev->lock);
        ndev->dev = dev;
        INIT_LIST_HEAD(&ndev->addr_list);
        timer_setup(&ndev->rs_timer, addrconf_rs_timer, 0);
        /* start from the namespace's default per-device configuration */
        memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));

        /* an initialised stable secret selects stable-privacy address generation */
        if (ndev->cnf.stable_secret.initialized)
                ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;

        ndev->cnf.mtu6 = dev->mtu;
        ndev->ra_mtu = 0;
        ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
        if (!ndev->nd_parms) {
                kfree(ndev);
                return ERR_PTR(err);
        }
        if (ndev->cnf.forwarding)
                dev_disable_lro(dev);
        /* We refer to the device */
        netdev_hold(dev, &ndev->dev_tracker, GFP_KERNEL);

        /* up to here failures are unwound by hand, mirroring the steps above */
        if (snmp6_alloc_dev(ndev) < 0) {
                netdev_dbg(dev, "%s: cannot allocate memory for statistics\n",
                           __func__);
                neigh_parms_release(&nd_tbl, ndev->nd_parms);
                netdev_put(dev, &ndev->dev_tracker);
                kfree(ndev);
                return ERR_PTR(err);
        }

        if (dev != blackhole_netdev) {
                if (snmp6_register_dev(ndev) < 0) {
                        netdev_dbg(dev, "%s: cannot create /proc/net/dev_snmp6/%s\n",
                                   __func__, dev->name);
                        /* err is still -ENOMEM here */
                        goto err_release;
                }
        }
        /* One reference from device. */
        refcount_set(&ndev->refcnt, 1);

        /* NOARP and loopback devices get accept_dad forced to -1 */
        if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
                ndev->cnf.accept_dad = -1;

#if IS_ENABLED(CONFIG_IPV6_SIT)
        if (dev->type == ARPHRD_SIT && (dev->priv_flags & IFF_ISATAP)) {
                pr_info("%s: Disabled Multicast RS\n", dev->name);
                ndev->cnf.rtr_solicits = 0;
        }
#endif

        INIT_LIST_HEAD(&ndev->tempaddr_list);
        ndev->desync_factor = U32_MAX;
        /* loopback, tunnel, SIT and ARPHRD_NONE devices get use_tempaddr = -1 */
        if ((dev->flags&IFF_LOOPBACK) ||
            dev->type == ARPHRD_TUNNEL ||
            dev->type == ARPHRD_TUNNEL6 ||
            dev->type == ARPHRD_SIT ||
            dev->type == ARPHRD_NONE) {
                ndev->cnf.use_tempaddr = -1;
        }

        ndev->token = in6addr_any;

        if (netif_running(dev) && addrconf_link_ready(dev))
                ndev->if_flags |= IF_READY;

        ipv6_mc_init_dev(ndev);
        ndev->tstamp = jiffies;
        if (dev != blackhole_netdev) {
                err = addrconf_sysctl_register(ndev);
                if (err) {
                        /* undo the multicast and snmp6 setup done above */
                        ipv6_mc_destroy_dev(ndev);
                        snmp6_unregister_dev(ndev);
                        goto err_release;
                }
        }
        /* protected by rtnl_lock */
        rcu_assign_pointer(dev->ip6_ptr, ndev);

        if (dev != blackhole_netdev) {
                /* Join interface-local all-node multicast group */
                ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes);

                /* Join all-node multicast group */
                ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);

                /* Join all-router multicast group if forwarding is set */
                if (ndev->cnf.forwarding && (dev->flags & IFF_MULTICAST))
                        ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
        }
        return ndev;

err_release:
        /*
         * NOTE(review): in6_dev_finish_destroy() is presumably responsible for
         * releasing the statistics, the device reference and ndev itself on
         * this path - confirm against its definition.
         */
        neigh_parms_release(&nd_tbl, ndev->nd_parms);
        ndev->dead = 1;
        in6_dev_finish_destroy(ndev);
        return ERR_PTR(err);
}

/*
 * Return the inet6_dev attached to @dev, creating one with ipv6_add_dev()
 * when none exists yet. The caller must hold RTNL.
 *
 * If creation fails the ERR_PTR() from ipv6_add_dev() is returned as is.
 * Otherwise, when the device has IFF_UP set, ipv6_mc_up() is called on the
 * inet6_dev before it is returned.
 */
static struct inet6_dev *ipv6_find_idev(struct net_device *dev)
{
        struct inet6_dev *idev;

        ASSERT_RTNL();

        idev = __in6_dev_get(dev);
        if (!idev) {
                struct inet6_dev *created = ipv6_add_dev(dev);

                if (IS_ERR(created))
                        return created;
                idev = created;
        }

        if (dev->flags & IFF_UP)
                ipv6_mc_up(idev);

        return idev;
}

/*
 * Compute the netlink message size needed for a netconf message carrying
 * attribute @type, or every supported attribute when @type is NETCONFA_ALL.
 * The netconfmsg header and NETCONFA_IFINDEX are always accounted for.
 */
static int inet6_netconf_msgsize_devconf(int type)
{
        const bool all = (type == NETCONFA_ALL);
        int size;

        /* fixed header plus NETCONFA_IFINDEX */
        size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4);

        if (all || type == NETCONFA_FORWARDING)
                size += nla_total_size(4);
#ifdef CONFIG_IPV6_MROUTE
        if (all || type == NETCONFA_MC_FORWARDING)
                size += nla_total_size(4);
#endif
        if (all || type == NETCONFA_PROXY_NEIGH)
                size += nla_total_size(4);
        if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
                size += nla_total_size(4);

        return size;
}

/*
 * Append one netconf message to @skb.
 *
 * The message always carries NETCONFA_IFINDEX. When @devconf is non-NULL it
 * also carries the attribute selected by @type, or all supported attributes
 * when @type is NETCONFA_ALL. With a NULL @devconf only the ifindex is
 * emitted.
 *
 * Returns 0 on success. Returns -EMSGSIZE when the header or an attribute
 * does not fit; a partially built message is cancelled first.
 */
static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
                                      struct ipv6_devconf *devconf, u32 portid,
                                      u32 seq, int event, unsigned int flags,
                                      int type)
{
        const bool all = (type == NETCONFA_ALL);
        struct netconfmsg *ncm;
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ncm), flags);
        if (!nlh)
                return -EMSGSIZE;

        ncm = nlmsg_data(nlh);
        ncm->ncm_family = AF_INET6;

        if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
                goto nla_put_failure;

        if (devconf) {
                if (all || type == NETCONFA_FORWARDING) {
                        if (nla_put_s32(skb, NETCONFA_FORWARDING,
                                        READ_ONCE(devconf->forwarding)) < 0)
                                goto nla_put_failure;
                }
#ifdef CONFIG_IPV6_MROUTE
                if (all || type == NETCONFA_MC_FORWARDING) {
                        if (nla_put_s32(skb, NETCONFA_MC_FORWARDING,
                                        atomic_read(&devconf->mc_forwarding)) < 0)
                                goto nla_put_failure;
                }
#endif
                if (all || type == NETCONFA_PROXY_NEIGH) {
                        if (nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
                                        READ_ONCE(devconf->proxy_ndp)) < 0)
                                goto nla_put_failure;
                }

                if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) {
                        if (nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
                                        READ_ONCE(devconf->ignore_routes_with_linkdown)) < 0)
                                goto nla_put_failure;
                }
        }

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Build a netconf message for attribute @type of @devconf and send it to the
 * RTNLGRP_IPV6_NETCONF group. If the message cannot be allocated or filled,
 * the error is recorded on the group with rtnl_set_sk_err() instead.
 */
void inet6_netconf_notify_devconf(struct net *net, int event, int type,
                                  int ifindex, struct ipv6_devconf *devconf)
{
        struct sk_buff *skb;
        int err;

        skb = nlmsg_new(inet6_netconf_msgsize_devconf(type), GFP_KERNEL);
        if (!skb) {
                err = -ENOBUFS;
                goto errout;
        }

        err = inet6_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
                                         event, 0, type);
        if (err >= 0) {
                rtnl_notify(skb, net, 0, RTNLGRP_IPV6_NETCONF, NULL, GFP_KERNEL);
                return;
        }

        /* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */
        WARN_ON(err == -EMSGSIZE);
        kfree_skb(skb);
errout:
        rtnl_set_sk_err(net, RTNLGRP_IPV6_NETCONF, err);
}

/*
 * Netlink attribute policy used by inet6_netconf_valid_get_req() when parsing
 * netconf requests. Each listed attribute has .len set to sizeof(int);
 * attributes not listed here get a zero-initialised policy entry.
 */
static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = {
        [NETCONFA_IFINDEX]        = { .len = sizeof(int) },
        [NETCONFA_FORWARDING]        = { .len = sizeof(int) },
        [NETCONFA_PROXY_NEIGH]        = { .len = sizeof(int) },
        [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN]        = { .len = sizeof(int) },
};

/*
 * Validate and parse a netconf get request into @tb.
 *
 * The message must be large enough to hold a netconfmsg header. When
 * netlink_strict_get_check() is false for the socket, the result of the
 * non-strict parse is returned directly. Otherwise the strict parser is used
 * and NETCONFA_IFINDEX is the only attribute allowed to be present.
 *
 * Returns 0 on success or a negative errno, with a message set in @extack
 * for the checks performed here.
 */
static int inet6_netconf_valid_get_req(struct sk_buff *skb,
                                       const struct nlmsghdr *nlh,
                                       struct nlattr **tb,
                                       struct netlink_ext_ack *extack)
{
        int attr;
        int err;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf get request");
                return -EINVAL;
        }

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg),
                                              tb, NETCONFA_MAX,
                                              devconf_ipv6_policy, extack);

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg),
                                            tb, NETCONFA_MAX,
                                            devconf_ipv6_policy, extack);
        if (err)
                return err;

        for (attr = 0; attr <= NETCONFA_MAX; attr++) {
                /* absent attributes and the ifindex selector are fine */
                if (!tb[attr] || attr == NETCONFA_IFINDEX)
                        continue;

                NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in netconf get request");
                return -EINVAL;
        }

        return 0;
}

/*
 * Handle a netconf get request: look up the configuration selected by
 * NETCONFA_IFINDEX and unicast a RTM_NEWNETCONF reply carrying all supported
 * attributes back to the requesting port.
 *
 * NETCONFA_IFINDEX_ALL and NETCONFA_IFINDEX_DEFAULT select the namespace-wide
 * devconf_all / devconf_dflt; any other value is treated as a device ifindex.
 *
 * Returns a negative errno on failure (-EINVAL for a missing attribute, an
 * unknown device or a device without inet6_dev; -ENOBUFS when the reply skb
 * cannot be allocated), otherwise the result of rtnl_unicast().
 */
static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
                                     struct nlmsghdr *nlh,
                                     struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[NETCONFA_MAX+1];
        struct inet6_dev *in6_dev = NULL;
        struct net_device *dev = NULL;
        struct sk_buff *skb;
        struct ipv6_devconf *devconf;
        int ifindex;
        int err;

        err = inet6_netconf_valid_get_req(in_skb, nlh, tb, extack);
        if (err < 0)
                return err;

        if (!tb[NETCONFA_IFINDEX])
                return -EINVAL;

        err = -EINVAL;
        ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
        switch (ifindex) {
        case NETCONFA_IFINDEX_ALL:
                devconf = net->ipv6.devconf_all;
                break;
        case NETCONFA_IFINDEX_DEFAULT:
                devconf = net->ipv6.devconf_dflt;
                break;
        default:
                /* takes a device reference, dropped at errout */
                dev = dev_get_by_index(net, ifindex);
                if (!dev)
                        return -EINVAL;
                in6_dev = in6_dev_get(dev);
                if (!in6_dev)
                        goto errout;
                devconf = &in6_dev->cnf;
                break;
        }

        err = -ENOBUFS;
        skb = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL);
        if (!skb)
                goto errout;

        err = inet6_netconf_fill_devconf(skb, ifindex, devconf,
                                         NETLINK_CB(in_skb).portid,
                                         nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
                                         NETCONFA_ALL);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
                goto errout;
        }
        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
        /* shared exit for success and failure: release whatever was taken */
        if (in6_dev)
                in6_dev_put(in6_dev);
        /*
         * dev is still NULL for the ALL/DEFAULT selectors, so dev_put() is
         * relied upon to accept a NULL pointer here.
         */
        dev_put(dev);
        return err;
}

/* Fold dev_addr_genid and dev_base_seq into one sequence value so that a
 * change in either of them is detectable.
 */
static u32 inet6_base_seq(const struct net *net)
{
        u32 genid = atomic_read(&net->ipv6.dev_addr_genid);
        u32 base = READ_ONCE(net->dev_base_seq);
        u32 seq = genid + base;

        /* Must not return 0 (see nl_dump_check_consistent()).
         * Substitute a value far away from 0.
         */
        return seq ? seq : 0x80000000;
}

/*
 * Netlink dump callback for IPv6 netconf.
 *
 * Emits one RTM_NEWNETCONF message (all attributes, NLM_F_MULTI) for every
 * device that has an inet6_dev, followed by one for devconf_all and one for
 * devconf_dflt. Progress is kept in cb->ctx, viewed through the local ctx
 * struct: ctx->ifindex is the cursor handed to for_each_netdev_dump() and
 * ctx->all_default counts how many of the two trailing entries have been
 * written (0: none, 1: "all" done, 2: both done).
 *
 * With strict checking enabled the request must consist of exactly a
 * netconfmsg header with no trailing attributes, else -EINVAL is returned.
 *
 * Returns 0, or the negative error from inet6_netconf_fill_devconf() that
 * stopped the dump.
 */
static int inet6_netconf_dump_devconf(struct sk_buff *skb,
                                      struct netlink_callback *cb)
{
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
        struct {
                unsigned long ifindex;
                unsigned int all_default;
        } *ctx = (void *)cb->ctx;
        struct net_device *dev;
        struct inet6_dev *idev;
        int err = 0;

        if (cb->strict_check) {
                struct netlink_ext_ack *extack = cb->extack;
                struct netconfmsg *ncm;

                if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) {
                        NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf dump request");
                        return -EINVAL;
                }

                if (nlmsg_attrlen(nlh, sizeof(*ncm))) {
                        NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in netconf dump request");
                        return -EINVAL;
                }
        }

        /* the device walk and __in6_dev_get() run under the RCU read lock */
        rcu_read_lock();
        for_each_netdev_dump(net, dev, ctx->ifindex) {
                idev = __in6_dev_get(dev);
                if (!idev)
                        continue;
                err = inet6_netconf_fill_devconf(skb, dev->ifindex,
                                                 &idev->cnf,
                                                 NETLINK_CB(cb->skb).portid,
                                                 nlh->nlmsg_seq,
                                                 RTM_NEWNETCONF,
                                                 NLM_F_MULTI,
                                                 NETCONFA_ALL);
                if (err < 0)
                        goto done;
        }
        /* stage 0: the namespace-wide "all" entry has not been written yet */
        if (ctx->all_default == 0) {
                err = inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
                                                 net->ipv6.devconf_all,
                                                 NETLINK_CB(cb->skb).portid,
                                                 nlh->nlmsg_seq,
                                                 RTM_NEWNETCONF, NLM_F_MULTI,
                                                 NETCONFA_ALL);
                if (err < 0)
                        goto done;
                ctx->all_default++;
        }
        /* stage 1: "all" is out, the "default" entry is still pending */
        if (ctx->all_default == 1) {
                err = inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
                                                 net->ipv6.devconf_dflt,
                                                 NETLINK_CB(cb->skb).portid,
                                                 nlh->nlmsg_seq,
                                                 RTM_NEWNETCONF, NLM_F_MULTI,
                                                 NETCONFA_ALL);
                if (err < 0)
                        goto done;
                ctx->all_default++;
        }
done:
        rcu_read_unlock();
        return err;
}

#ifdef CONFIG_SYSCTL
/*
 * Apply the current value of idev->cnf.forwarding to one device.
 *
 * With forwarding enabled LRO is disabled on the device. On multicast-capable
 * devices the link-local, interface-local and site-local all-routers groups
 * are joined (forwarding on) or left (forwarding off). Every non-tentative
 * address then joins or leaves its anycast group accordingly, and finally a
 * RTM_NEWNETCONF notification for NETCONFA_FORWARDING is sent.
 *
 * A NULL @idev is accepted and ignored.
 */
static void dev_forward_change(struct inet6_dev *idev)
{
        struct net_device *dev;
        struct inet6_ifaddr *ifa;
        LIST_HEAD(tmp_addr_list);

        if (!idev)
                return;
        dev = idev->dev;
        if (idev->cnf.forwarding)
                dev_disable_lro(dev);
        if (dev->flags & IFF_MULTICAST) {
                if (idev->cnf.forwarding) {
                        ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
                        ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allrouters);
                        ipv6_dev_mc_inc(dev, &in6addr_sitelocal_allrouters);
                } else {
                        ipv6_dev_mc_dec(dev, &in6addr_linklocal_allrouters);
                        ipv6_dev_mc_dec(dev, &in6addr_interfacelocal_allrouters);
                        ipv6_dev_mc_dec(dev, &in6addr_sitelocal_allrouters);
                }
        }

        /*
         * Collect the non-tentative addresses on a private list while holding
         * idev->lock, so the anycast calls below run with the lock released.
         * NOTE(review): presumably those calls must not run under idev->lock
         * - confirm.
         */
        read_lock_bh(&idev->lock);
        list_for_each_entry(ifa, &idev->addr_list, if_list) {
                if (ifa->flags&IFA_F_TENTATIVE)
                        continue;
                list_add_tail(&ifa->if_list_aux, &tmp_addr_list);
        }
        read_unlock_bh(&idev->lock);

        while (!list_empty(&tmp_addr_list)) {
                ifa = list_first_entry(&tmp_addr_list,
                                       struct inet6_ifaddr, if_list_aux);
                list_del(&ifa->if_list_aux);
                if (idev->cnf.forwarding)
                        addrconf_join_anycast(ifa);
                else
                        addrconf_leave_anycast(ifa);
        }

        inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
                                     NETCONFA_FORWARDING,
                                     dev->ifindex, &idev->cnf);
}


/*
 * Store @newf as the forwarding setting of every device in @net that has an
 * inet6_dev. dev_forward_change() runs only for devices whose setting
 * switched between zero and non-zero.
 */
static void addrconf_forward_change(struct net *net, __s32 newf)
{
        struct net_device *dev;

        for_each_netdev(net, dev) {
                struct inet6_dev *idev = __in6_dev_get(dev);
                int changed;

                if (!idev)
                        continue;

                /* compare truthiness before overwriting the old value */
                changed = (!idev->cnf.forwarding) ^ (!newf);
                WRITE_ONCE(idev->cnf.forwarding, newf);

                if (changed)
                        dev_forward_change(idev);
        }
}

/*
 * Store a new forwarding value @newf into *@p and propagate the change.
 *
 * @table->extra2 is read as the struct net, and @table->extra1 as the
 * inet6_dev when @p belongs to a single device. Three cases:
 *   - @p is devconf_dflt->forwarding: notify if the on/off state flipped and
 *     return 0;
 *   - @p is devconf_all->forwarding: also write the value to devconf_dflt and
 *     to every device (addrconf_forward_change()), notifying for each
 *     on/off flip;
 *   - otherwise: call dev_forward_change() for the device if its on/off
 *     state flipped.
 *
 * RTNL is taken with rtnl_trylock(); if that fails the syscall is restarted.
 * For the last two cases, a non-zero @newf purges default routers after RTNL
 * is released, and 1 is returned.
 */
static int addrconf_fixup_forwarding(const struct ctl_table *table, int *p, int newf)
{
        struct net *net;
        int old;

        if (!rtnl_trylock())
                return restart_syscall();

        net = (struct net *)table->extra2;
        old = *p;
        WRITE_ONCE(*p, newf);

        if (p == &net->ipv6.devconf_dflt->forwarding) {
                /* (!a) ^ (!b) is true only when zero/non-zero state differs */
                if ((!newf) ^ (!old))
                        inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                                     NETCONFA_FORWARDING,
                                                     NETCONFA_IFINDEX_DEFAULT,
                                                     net->ipv6.devconf_dflt);
                rtnl_unlock();
                return 0;
        }

        if (p == &net->ipv6.devconf_all->forwarding) {
                int old_dflt = net->ipv6.devconf_dflt->forwarding;

                /* changing "all" also overrides the default for new devices */
                WRITE_ONCE(net->ipv6.devconf_dflt->forwarding, newf);
                if ((!newf) ^ (!old_dflt))
                        inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                                     NETCONFA_FORWARDING,
                                                     NETCONFA_IFINDEX_DEFAULT,
                                                     net->ipv6.devconf_dflt);

                addrconf_forward_change(net, newf);
                if ((!newf) ^ (!old))
                        inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                                     NETCONFA_FORWARDING,
                                                     NETCONFA_IFINDEX_ALL,
                                                     net->ipv6.devconf_all);
        } else if ((!newf) ^ (!old))
                dev_forward_change((struct inet6_dev *)table->extra1);
        rtnl_unlock();

        /* done after RTNL has been released */
        if (newf)
                rt6_purge_dflt_routers(net);
        return 1;
}

/*
 * Store @newf as ignore_routes_with_linkdown on every device in @net that
 * has an inet6_dev. A RTM_NEWNETCONF notification is sent for each device
 * whose setting switched between zero and non-zero.
 */
static void addrconf_linkdown_change(struct net *net, __s32 newf)
{
        struct net_device *dev;

        for_each_netdev(net, dev) {
                struct inet6_dev *idev = __in6_dev_get(dev);
                int changed;

                if (!idev)
                        continue;

                /* compare truthiness before overwriting the old value */
                changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf);
                WRITE_ONCE(idev->cnf.ignore_routes_with_linkdown, newf);
                if (!changed)
                        continue;

                inet6_netconf_notify_devconf(dev_net(dev),
                                             RTM_NEWNETCONF,
                                             NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
                                             dev->ifindex,
                                             &idev->cnf);
        }
}

/*
 * Store a new ignore_routes_with_linkdown value @newf into *@p and propagate
 * it. @table->extra2 is read as the struct net.
 *
 *   - @p is the devconf_dflt field: notify if the on/off state flipped and
 *     return 0;
 *   - @p is the devconf_all field: also write the value to devconf_dflt and
 *     to every device, notify for "all" if its on/off state flipped, and
 *     return 1;
 *   - any other @p: only the store happens; return 1.
 *
 * RTNL is taken with rtnl_trylock(); if that fails the syscall is restarted.
 */
static int addrconf_fixup_linkdown(const struct ctl_table *table, int *p, int newf)
{
        struct net *net;
        int flipped;
        int ret = 1;

        if (!rtnl_trylock())
                return restart_syscall();

        net = (struct net *)table->extra2;
        /* sample the previous truthiness before the store below */
        flipped = (!newf) ^ (!*p);
        WRITE_ONCE(*p, newf);

        if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) {
                if (flipped)
                        inet6_netconf_notify_devconf(net,
                                                     RTM_NEWNETCONF,
                                                     NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
                                                     NETCONFA_IFINDEX_DEFAULT,
                                                     net->ipv6.devconf_dflt);
                ret = 0;
        } else if (p == &net->ipv6.devconf_all->ignore_routes_with_linkdown) {
                WRITE_ONCE(net->ipv6.devconf_dflt->ignore_routes_with_linkdown, newf);
                addrconf_linkdown_change(net, newf);
                if (flipped)
                        inet6_netconf_notify_devconf(net,
                                                     RTM_NEWNETCONF,
                                                     NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
                                                     NETCONFA_IFINDEX_ALL,
                                                     net->ipv6.devconf_all);
        }

        rtnl_unlock();
        return ret;
}

#endif

/*
 * Nobody refers to this ifaddr, destroy it.
 *
 * Warns if the address is still linked on its hash list, drops the reference
 * on ifp->idev, and cancels any delayed DAD work that is still pending
 * (logging a notice if there was any). The structure is freed with
 * kfree_rcu() only when its state is INET6_IFADDR_STATE_DEAD; otherwise a
 * warning is logged and the function returns without freeing it.
 */
void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
{
        WARN_ON(!hlist_unhashed(&ifp->addr_lst));

#ifdef NET_REFCNT_DEBUG
        pr_debug("%s\n", __func__);
#endif

        in6_dev_put(ifp->idev);

        if (cancel_delayed_work(&ifp->dad_work))
                pr_notice("delayed DAD work was pending while freeing ifa=%p\n",
                          ifp);

        /* an address that is not marked dead is deliberately not freed */
        if (ifp->state != INET6_IFADDR_STATE_DEAD) {
                pr_warn("Freeing alive inet6 address %p\n", ifp);
                return;
        }

        kfree_rcu(ifp, rcu);
}

/*
 * Link @ifp into idev->addr_list.
 *
 * Each device address list is sorted in order of scope - global before
 * linklocal. The new entry goes in front of the first existing address whose
 * source scope is not greater than its own, or at the tail of the list when
 * there is no such address.
 */
static void
ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
{
        const int new_scope = ipv6_addr_src_scope(&ifp->addr);
        struct list_head *pos;

        list_for_each(pos, &idev->addr_list) {
                struct inet6_ifaddr *cur;

                cur = list_entry(pos, struct inet6_ifaddr, if_list);
                if (ipv6_addr_src_scope(&cur->addr) <= new_scope)
                        break;
        }

        /* pos is either the insertion point or the list head itself */
        list_add_tail_rcu(&ifp->if_list, pos);
}

/*
 * Hash @addr for the address hash table of @net: the address hash is mixed
 * with the namespace's hash mix and reduced to IN6_ADDR_HSIZE_SHIFT bits.
 */
static u32 inet6_addr_hash(const struct net *net, const struct in6_addr *addr)
{
        u32 mixed;

        mixed = ipv6_addr_hash(addr);
        mixed ^= net_hash_mix(net);

        return hash_32(mixed, IN6_ADDR_HSIZE_SHIFT);
}

/*
 * Search hash bucket @hash of @net for an entry equal to @addr.
 * With a non-NULL @dev only entries that belong to that device count; with a
 * NULL @dev any device matches. Returns true when a match exists.
 */
static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
                               struct net_device *dev, unsigned int hash)
{
        struct inet6_ifaddr *cand;

        hlist_for_each_entry(cand, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
                if (!ipv6_addr_equal(&cand->addr, addr))
                        continue;
                if (dev && cand->idev->dev != dev)
                        continue;
                return true;
        }

        return false;
}

/*
 * Insert @ifa into the address hash table of @dev's namespace, under
 * addrconf_hash_lock. If the same address is already hashed for this device
 * nothing is inserted and -EEXIST is returned; otherwise returns 0.
 */
static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
{
        struct net *net = dev_net(dev);
        unsigned int hash = inet6_addr_hash(net, &ifa->addr);
        int err;

        spin_lock_bh(&net->ipv6.addrconf_hash_lock);

        if (!ipv6_chk_same_addr(net, &ifa->addr, dev, hash)) {
                hlist_add_head_rcu(&ifa->addr_lst, &net->ipv6.inet6_addr_lst[hash]);
                err = 0;
        } else {
                /* Ignore adding duplicate addresses on an interface */
                netdev_dbg(dev, "ipv6_add_addr: already assigned\n");
                err = -EEXIST;
        }

        spin_unlock_bh(&net->ipv6.addrconf_hash_lock);

        return err;
}

/*
 * Allocate a new inet6_ifaddr described by @cfg and attach it to @idev.
 *
 * @idev:      device the address is added to
 * @cfg:       address, optional peer address, prefix length, flags,
 *             lifetimes, scope, route priority and protocol to copy
 * @can_block: selects GFP_KERNEL (true) or GFP_ATOMIC (false) and gates
 *             the validator notifier chain, which is only called when
 *             blocking is allowed
 * @extack:    netlink extended ack used for error messages (NULL is
 *             passed by ipv6_create_tempaddr() in this file)
 *
 * On success it returns ifp with increased reference count: one reference
 * is for the caller, one is taken for idev->addr_list, and one more when
 * the address is also put on idev->tempaddr_list.
 *
 * On failure an ERR_PTR() is returned:
 *   -EADDRNOTAVAIL  unspecified address, multicast address without
 *                   IFA_F_MCAUTOJOIN, or loopback address on a device
 *                   that is neither loopback nor an L3 master
 *   -ENODEV         idev->dead is set
 *   -EACCES         idev->cnf.disable_ipv6 is set
 *   -ENOBUFS        allocation of the inet6_ifaddr failed
 *   other           error from the validator chain, from
 *                   addrconf_f6i_alloc(), or from ipv6_add_addr_hash()
 *                   (-EEXIST for a duplicate)
 */

static struct inet6_ifaddr *
ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
              bool can_block, struct netlink_ext_ack *extack)
{
        gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC;
        int addr_type = ipv6_addr_type(cfg->pfx);
        struct net *net = dev_net(idev->dev);
        struct inet6_ifaddr *ifa = NULL;
        struct fib6_info *f6i = NULL;
        int err = 0;

        /* Address-type checks: nothing is allocated yet, so return directly. */
        if (addr_type == IPV6_ADDR_ANY) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid address");
                return ERR_PTR(-EADDRNOTAVAIL);
        } else if (addr_type & IPV6_ADDR_MULTICAST &&
                   !(cfg->ifa_flags & IFA_F_MCAUTOJOIN)) {
                NL_SET_ERR_MSG_MOD(extack, "Cannot assign multicast address without \"IFA_F_MCAUTOJOIN\" flag");
                return ERR_PTR(-EADDRNOTAVAIL);
        } else if (!(idev->dev->flags & IFF_LOOPBACK) &&
                   !netif_is_l3_master(idev->dev) &&
                   addr_type & IPV6_ADDR_LOOPBACK) {
                NL_SET_ERR_MSG_MOD(extack, "Cannot assign loopback address on this device");
                return ERR_PTR(-EADDRNOTAVAIL);
        }

        if (idev->dead) {
                NL_SET_ERR_MSG_MOD(extack, "device is going away");
                err = -ENODEV;
                goto out;
        }

        if (idev->cnf.disable_ipv6) {
                NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device");
                err = -EACCES;
                goto out;
        }

        /* validator notifier needs to be blocking;
         * do not call in atomic context
         */
        if (can_block) {
                struct in6_validator_info i6vi = {
                        .i6vi_addr = *cfg->pfx,
                        .i6vi_dev = idev,
                        .extack = extack,
                };

                err = inet6addr_validator_notifier_call_chain(NETDEV_UP, &i6vi);
                err = notifier_to_errno(err);
                if (err < 0)
                        goto out;
        }

        /* Zeroed allocation: ifa->idev stays NULL until it is set below,
         * which the error path at "out" relies on.
         */
        ifa = kzalloc(sizeof(*ifa), gfp_flags | __GFP_ACCOUNT);
        if (!ifa) {
                err = -ENOBUFS;
                goto out;
        }

        f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags, extack);
        if (IS_ERR(f6i)) {
                err = PTR_ERR(f6i);
                /* Reset so that "out" does not release an ERR_PTR value. */
                f6i = NULL;
                goto out;
        }

        neigh_parms_data_state_setall(idev->nd_parms);

        ifa->addr = *cfg->pfx;
        if (cfg->peer_pfx)
                ifa->peer_addr = *cfg->peer_pfx;

        spin_lock_init(&ifa->lock);
        INIT_DELAYED_WORK(&ifa->dad_work, addrconf_dad_work);
        INIT_HLIST_NODE(&ifa->addr_lst);
        ifa->scope = cfg->scope;
        ifa->prefix_len = cfg->plen;
        ifa->rt_priority = cfg->rt_priority;
        ifa->flags = cfg->ifa_flags;
        ifa->ifa_proto = cfg->ifa_proto;
        /* No need to add the TENTATIVE flag for addresses with NODAD */
        if (!(cfg->ifa_flags & IFA_F_NODAD))
                ifa->flags |= IFA_F_TENTATIVE;
        ifa->valid_lft = cfg->valid_lft;
        ifa->prefered_lft = cfg->preferred_lft;
        ifa->cstamp = ifa->tstamp = jiffies;
        ifa->tokenized = false;

        ifa->rt = f6i;

        /* The new entry holds a reference on its device. */
        ifa->idev = idev;
        in6_dev_hold(idev);

        /* For caller */
        refcount_set(&ifa->refcnt, 1);

        rcu_read_lock();

        /* Fails with -EEXIST if this device already has the address hashed. */
        err = ipv6_add_addr_hash(idev->dev, ifa);
        if (err < 0) {
                rcu_read_unlock();
                goto out;
        }

        write_lock_bh(&idev->lock);

        /* Add to inet6_dev unicast addr list. */
        ipv6_link_dev_addr(idev, ifa);

        if (ifa->flags&IFA_F_TEMPORARY) {
                list_add(&ifa->tmp_list, &idev->tempaddr_list);
                /* Reference owned by idev->tempaddr_list. */
                in6_ifa_hold(ifa);
        }

        /* Reference owned by idev->addr_list. */
        in6_ifa_hold(ifa);
        write_unlock_bh(&idev->lock);

        rcu_read_unlock();

        inet6addr_notifier_call_chain(NETDEV_UP, ifa);
out:
        if (unlikely(err < 0)) {
                /* f6i is NULL here unless the route was successfully allocated. */
                fib6_info_release(f6i);

                if (ifa) {
                        /* Only drop the device reference if it was taken above. */
                        if (ifa->idev)
                                in6_dev_put(ifa->idev);
                        kfree(ifa);
                }
                ifa = ERR_PTR(err);
        }

        return ifa;
}

/*
 * What to do with the prefix route of an address that is being deleted.
 * The value is computed by check_cleanup_prefix_route() and acted upon
 * in ipv6_del_addr().
 */
enum cleanup_prefix_rt_t {
        CLEANUP_PREFIX_RT_NOP,    /* no cleanup action for prefix route */
        CLEANUP_PREFIX_RT_DEL,    /* delete the prefix route */
        CLEANUP_PREFIX_RT_EXPIRE, /* update the lifetime of the prefix route */
};

/*
 * Check, whether the prefix for ifp would still need a prefix route
 * after deleting ifp. The function returns one of the CLEANUP_PREFIX_RT_*
 * constants.
 *
 * 1) we don't purge prefix if address was not permanent.
 *    prefix is managed by its own lifetime.
 * 2) we also don't purge, if the address was IFA_F_NOPREFIXROUTE.
 * 3) if there are no addresses, delete prefix.
 * 4) if there are still other permanent address(es),
 *    corresponding prefix is still permanent.
 * 5) if there are still other addresses with IFA_F_NOPREFIXROUTE,
 *    don't purge the prefix, assume user space is managing it.
 * 6) otherwise, update prefix lifetime to the
 *    longest valid lifetime among the corresponding
 *    addresses on the device.
 *    Note: subsequent RA will update lifetime.
 **/
static enum cleanup_prefix_rt_t
check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires)
{
        struct inet6_ifaddr *ifa;
        struct inet6_dev *idev = ifp->idev;
        unsigned long lifetime;
        enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_DEL;

        *expires = jiffies;

        list_for_each_entry(ifa, &idev->addr_list, if_list) {
                if (ifa == ifp)
                        continue;
                if (ifa->prefix_len != ifp->prefix_len ||
                    !ipv6_prefix_equal(&ifa->addr, &ifp->addr,
                                       ifp->prefix_len))
                        continue;
                if (ifa->flags & (IFA_F_PERMANENT | IFA_F_NOPREFIXROUTE))
                        return CLEANUP_PREFIX_RT_NOP;

                action = CLEANUP_PREFIX_RT_EXPIRE;

                spin_lock(&ifa->lock);

                lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ);
                /*
                 * Note: Because this address is
                 * not permanent, lifetime <
                 * LONG_MAX / HZ here.
                 */
                if (time_before(*expires, ifa->tstamp + lifetime * HZ))
                        *expires = ifa->tstamp + lifetime * HZ;
                spin_unlock(&ifa->lock);
        }

        return action;
}

/*
 * Apply a prefix-route cleanup decision for @ifp.
 *
 * The route is looked up for the peer address when @del_peer is set,
 * otherwise for the address itself.  If a route is found it is either
 * deleted (@del_rt) or, when it has no RTF_EXPIRES yet, given the
 * expiry @expires and added to the gc list under the table lock.
 */
static void
cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires,
                     bool del_rt, bool del_peer)
{
        struct in6_addr *pfx = del_peer ? &ifp->peer_addr : &ifp->addr;
        struct fib6_table *table;
        struct fib6_info *f6i;

        f6i = addrconf_get_prefix_route(pfx,
                                        ifp->prefix_len,
                                        ifp->idev->dev, 0, RTF_DEFAULT, true);
        if (!f6i)
                return;

        if (del_rt) {
                /* No explicit release on this path; presumably
                 * ip6_del_rt() consumes the reference - confirm.
                 */
                ip6_del_rt(dev_net(ifp->idev->dev), f6i, false);
                return;
        }

        if (!(f6i->fib6_flags & RTF_EXPIRES)) {
                table = f6i->fib6_table;

                spin_lock_bh(&table->tb6_lock);
                fib6_set_expires(f6i, expires);
                fib6_add_gc_list(f6i);
                spin_unlock_bh(&table->tb6_lock);
        }

        fib6_info_release(f6i);
}


/*
 * Remove address @ifp from its device.
 *
 * This function wants to get referenced ifp and releases it before return
 * (the final in6_ifa_put() at "out" drops the caller's reference).
 *
 * Must be called with RTNL held (ASSERT_RTNL()).  If the address is
 * already in INET6_IFADDR_STATE_DEAD, only the caller's reference is
 * dropped and nothing else is done.
 */

static void ipv6_del_addr(struct inet6_ifaddr *ifp)
{
        enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_NOP;
        struct net *net = dev_net(ifp->idev->dev);
        unsigned long expires;
        int state;

        ASSERT_RTNL();

        /* Mark the address dead, remembering the previous state so that
         * only the first caller performs the teardown below.
         */
        spin_lock_bh(&ifp->lock);
        state = ifp->state;
        ifp->state = INET6_IFADDR_STATE_DEAD;
        spin_unlock_bh(&ifp->lock);

        if (state == INET6_IFADDR_STATE_DEAD)
                goto out;

        /* Unhash from the per-netns address table. */
        spin_lock_bh(&net->ipv6.addrconf_hash_lock);
        hlist_del_init_rcu(&ifp->addr_lst);
        spin_unlock_bh(&net->ipv6.addrconf_hash_lock);

        write_lock_bh(&ifp->idev->lock);

        if (ifp->flags&IFA_F_TEMPORARY) {
                list_del(&ifp->tmp_list);
                /* Drop the reference held on the public address, if any. */
                if (ifp->ifpub) {
                        in6_ifa_put(ifp->ifpub);
                        ifp->ifpub = NULL;
                }
                /* Drop the reference that tempaddr_list held. */
                __in6_ifa_put(ifp);
        }

        /* Only permanent addresses that own a prefix route need a decision
         * about that route; expires is only written in this case.
         */
        if (ifp->flags & IFA_F_PERMANENT && !(ifp->flags & IFA_F_NOPREFIXROUTE))
                action = check_cleanup_prefix_route(ifp, &expires);

        list_del_rcu(&ifp->if_list);
        /* Drop the reference that addr_list held. */
        __in6_ifa_put(ifp);

        write_unlock_bh(&ifp->idev->lock);

        addrconf_del_dad_work(ifp);

        ipv6_ifa_notify(RTM_DELADDR, ifp);

        inet6addr_notifier_call_chain(NETDEV_DOWN, ifp);

        /* action != NOP implies expires was set by
         * check_cleanup_prefix_route() above.
         */
        if (action != CLEANUP_PREFIX_RT_NOP) {
                cleanup_prefix_route(ifp, expires,
                        action == CLEANUP_PREFIX_RT_DEL, false);
        }

        /* clean up prefsrc entries */
        rt6_remove_prefsrc(ifp);
out:
        in6_ifa_put(ifp);
}

/*
 * Compute the regeneration advance for temporary addresses of @idev:
 *
 *   regen_min_advance +
 *   regen_max_retry * dad_transmits * max(RETRANS_TIME, HZ/100) / HZ
 *
 * The configuration values are sampled with READ_ONCE().
 * NOTE(review): RETRANS_TIME looks like a jiffies value, which would make
 * the result a number of seconds - confirm against the callers, which
 * compare it with lifetimes.
 */
static unsigned long ipv6_get_regen_advance(const struct inet6_dev *idev)
{
        return READ_ONCE(idev->cnf.regen_min_advance) +
                READ_ONCE(idev->cnf.regen_max_retry) *
                READ_ONCE(idev->cnf.dad_transmits) *
                max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ;
}

/*
 * Create a temporary (IFA_F_TEMPORARY) address derived from the public
 * address @ifp on the same device.
 *
 * @ifp:   public address; its first 8 bytes seed the new address and its
 *         lifetimes bound the lifetimes of the temporary one
 * @block: passed to ipv6_add_addr() as its can_block argument
 *
 * Returns 0 on success and -1 when
 *  - idev->cnf.use_tempaddr is <= 0,
 *  - ifp->regen_count has reached idev->cnf.regen_max_retry (in which
 *    case use_tempaddr is also set to -1), or
 *  - no preferred lifetime greater than the regeneration advance fits
 *    within the valid lifetime and the public preferred lifetime.
 *
 * If ipv6_add_addr() fails, the whole procedure restarts at "retry";
 * the loop is bounded by the regen_count check.
 *
 * References: a device reference is held for the duration of each
 * attempt.  The reference taken on @ifp is dropped on failure; on
 * success it is kept and stored in ift->ifpub.
 */
static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, bool block)
{
        struct inet6_dev *idev = ifp->idev;
        unsigned long tmp_tstamp, age;
        unsigned long regen_advance;
        unsigned long now = jiffies;
        u32 if_public_preferred_lft;
        s32 cnf_temp_preferred_lft;
        struct inet6_ifaddr *ift;
        struct ifa6_config cfg;
        long max_desync_factor;
        struct in6_addr addr;
        int ret = 0;

        write_lock_bh(&idev->lock);

retry:
        /* Entered with idev->lock write-held, both initially and on retry. */
        in6_dev_hold(idev);
        if (READ_ONCE(idev->cnf.use_tempaddr) <= 0) {
                write_unlock_bh(&idev->lock);
                pr_info("%s: use_tempaddr is disabled\n", __func__);
                in6_dev_put(idev);
                ret = -1;
                goto out;
        }
        spin_lock_bh(&ifp->lock);
        /* Each attempt counts; too many attempts disable temporary
         * addresses on this device altogether.
         */
        if (ifp->regen_count++ >= READ_ONCE(idev->cnf.regen_max_retry)) {
                WRITE_ONCE(idev->cnf.use_tempaddr, -1);        /*XXX*/
                spin_unlock_bh(&ifp->lock);
                write_unlock_bh(&idev->lock);
                pr_warn("%s: regeneration time exceeded - disabled temporary address support\n",
                        __func__);
                in6_dev_put(idev);
                ret = -1;
                goto out;
        }
        in6_ifa_hold(ifp);
        /* Copy the first 8 bytes of the public address; the remaining
         * bytes are left to ipv6_gen_rnd_iid().
         */
        memcpy(addr.s6_addr, ifp->addr.s6_addr, 8);
        ipv6_gen_rnd_iid(&addr);

        /* Whole seconds since the public address was last updated. */
        age = (now - ifp->tstamp) / HZ;

        regen_advance = ipv6_get_regen_advance(idev);

        /* recalculate max_desync_factor each time and update
         * idev->desync_factor if it's larger
         */
        cnf_temp_preferred_lft = READ_ONCE(idev->cnf.temp_prefered_lft);
        max_desync_factor = min_t(long,
                                  READ_ONCE(idev->cnf.max_desync_factor),
                                  cnf_temp_preferred_lft - regen_advance);

        if (unlikely(idev->desync_factor > max_desync_factor)) {
                if (max_desync_factor > 0) {
                        get_random_bytes(&idev->desync_factor,
                                         sizeof(idev->desync_factor));
                        idev->desync_factor %= max_desync_factor;
                } else {
                        idev->desync_factor = 0;
                }
        }

        if_public_preferred_lft = ifp->prefered_lft;

        /* Lifetimes are expressed relative to ifp->tstamp (hence "+ age"),
         * because the new address inherits that timestamp below.
         */
        memset(&cfg, 0, sizeof(cfg));
        cfg.valid_lft = min_t(__u32, ifp->valid_lft,
                              READ_ONCE(idev->cnf.temp_valid_lft) + age);
        cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor;
        cfg.preferred_lft = min_t(__u32, if_public_preferred_lft, cfg.preferred_lft);
        cfg.preferred_lft = min_t(__u32, cfg.valid_lft, cfg.preferred_lft);

        cfg.plen = ifp->prefix_len;
        tmp_tstamp = ifp->tstamp;
        spin_unlock_bh(&ifp->lock);

        write_unlock_bh(&idev->lock);

        /* From RFC 4941:
         *
         *     A temporary address is created only if this calculated Preferred
         *     Lifetime is greater than REGEN_ADVANCE time units.  In
         *     particular, an implementation must not create a temporary address
         *     with a zero Preferred Lifetime.
         *
         *     ...
         *
         *     When creating a temporary address, the lifetime values MUST be
         *     derived from the corresponding prefix as follows:
         *
         *     ...
         *
         *     *  Its Preferred Lifetime is the lower of the Preferred Lifetime
         *        of the public address or TEMP_PREFERRED_LIFETIME -
         *        DESYNC_FACTOR.
         *
         * To comply with the RFC's requirements, clamp the preferred lifetime
         * to a minimum of regen_advance, unless that would exceed valid_lft or
         * ifp->prefered_lft.
         *
         * Use age calculation as in addrconf_verify to avoid unnecessary
         * temporary addresses being generated.
         */
        age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
        if (cfg.preferred_lft <= regen_advance + age) {
                cfg.preferred_lft = regen_advance + age + 1;
                if (cfg.preferred_lft > cfg.valid_lft ||
                    cfg.preferred_lft > if_public_preferred_lft) {
                        in6_ifa_put(ifp);
                        in6_dev_put(idev);
                        ret = -1;
                        goto out;
                }
        }

        cfg.ifa_flags = IFA_F_TEMPORARY;
        /* set in addrconf_prefix_rcv() */
        if (ifp->flags & IFA_F_OPTIMISTIC)
                cfg.ifa_flags |= IFA_F_OPTIMISTIC;

        cfg.pfx = &addr;
        cfg.scope = ipv6_addr_scope(cfg.pfx);

        ift = ipv6_add_addr(idev, &cfg, block, NULL);
        if (IS_ERR(ift)) {
                /* Undo this attempt's references and retake the lock that
                 * the "retry" label expects to be held.
                 */
                in6_ifa_put(ifp);
                in6_dev_put(idev);
                pr_info("%s: retry temporary address regeneration\n", __func__);
                write_lock_bh(&idev->lock);
                goto retry;
        }

        spin_lock_bh(&ift->lock);
        /* The reference taken on ifp above now belongs to ift->ifpub. */
        ift->ifpub = ifp;
        ift->cstamp = now;
        ift->tstamp = tmp_tstamp;
        spin_unlock_bh(&ift->lock);

        addrconf_dad_start(ift);
        /* Drop the caller reference returned by ipv6_add_addr(). */
        in6_ifa_put(ift);
        in6_dev_put(idev);
out:
        return ret;
}

/*
 *        Choose an appropriate source address (RFC3484)
 *
 * Rule identifiers used by ipv6_get_saddr_eval().  The numeric order is
 * the order in which __ipv6_dev_get_saddr() evaluates the rules: it
 * iterates from 0 up to (but not including) IPV6_SADDR_RULE_MAX and stops
 * at the first rule that distinguishes two candidates.  The same values
 * index the scorebits bitmap of struct ipv6_saddr_score.
 */
enum {
        IPV6_SADDR_RULE_INIT = 0,
        IPV6_SADDR_RULE_LOCAL,
        IPV6_SADDR_RULE_SCOPE,
        IPV6_SADDR_RULE_PREFERRED,
#ifdef CONFIG_IPV6_MIP6
        IPV6_SADDR_RULE_HOA,
#endif
        IPV6_SADDR_RULE_OIF,
        IPV6_SADDR_RULE_LABEL,
        IPV6_SADDR_RULE_PRIVACY,
        IPV6_SADDR_RULE_ORCHID,
        IPV6_SADDR_RULE_PREFIX,
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        IPV6_SADDR_RULE_NOT_OPTIMISTIC,
#endif
        IPV6_SADDR_RULE_MAX
};

/*
 * Evaluation state of one candidate source address.  Rule results are
 * computed lazily by ipv6_get_saddr_eval() and cached here.
 */
struct ipv6_saddr_score {
        int                        rule;        /* highest rule evaluated so far, -1 if none */
        int                        addr_type;   /* __ipv6_addr_type() of ifa->addr */
        struct inet6_ifaddr        *ifa;        /* candidate address, NULL if no candidate yet */
        DECLARE_BITMAP(scorebits, IPV6_SADDR_RULE_MAX); /* bit i set: rule i gave a non-zero result */
        int                        scopedist;   /* cached result of IPV6_SADDR_RULE_SCOPE */
        int                        matchlen;    /* cached result of IPV6_SADDR_RULE_PREFIX */
};

/*
 * Properties of the destination that the source address rules compare
 * candidates against; filled in once by ipv6_dev_get_saddr().
 */
struct ipv6_saddr_dst {
        const struct in6_addr *addr;    /* destination address */
        int ifindex;                    /* outgoing device index, 0 if no device given */
        int scope;                      /* __ipv6_addr_src_scope() of the destination type */
        int label;                      /* ipv6_addr_label() of the destination */
        unsigned int prefs;             /* caller's IPV6_PREFER_SRC_* preference bits */
};

/*
 * Returns 1 when address type @type has any of the MAPPED, COMPATv4 or
 * LOOPBACK bits set, 0 otherwise.  Rule 3 of the source address
 * selection treats such addresses as preferred regardless of their
 * flags.
 */
static inline int ipv6_saddr_preferred(int type)
{
        return !!(type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4|IPV6_ADDR_LOOPBACK));
}

/*
 * Tell whether optimistic addresses of @idev may be used.
 *
 * With CONFIG_IPV6_OPTIMISTIC_DAD this requires a non-NULL @idev and
 * that both optimistic_dad and use_optimistic are enabled, each either
 * in the namespace-wide "all" settings or on the device itself.
 * Without that config option the answer is always false.
 */
static bool ipv6_use_optimistic_addr(const struct net *net,
                                     const struct inet6_dev *idev)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        bool dad_enabled, use_enabled;

        if (!idev)
                return false;

        dad_enabled = READ_ONCE(net->ipv6.devconf_all->optimistic_dad) ||
                      READ_ONCE(idev->cnf.optimistic_dad);
        if (!dad_enabled)
                return false;

        use_enabled = READ_ONCE(net->ipv6.devconf_all->use_optimistic) ||
                      READ_ONCE(idev->cnf.use_optimistic);

        return use_enabled;
#else
        return false;
#endif
}

/*
 * Tell whether optimistic DAD is enabled for @idev.
 *
 * With CONFIG_IPV6_OPTIMISTIC_DAD this requires a non-NULL @idev and
 * optimistic_dad to be set either in the namespace-wide "all" settings
 * or on the device.  Without that config option it is always false.
 */
static bool ipv6_allow_optimistic_dad(const struct net *net,
                                      const struct inet6_dev *idev)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        return idev &&
               (READ_ONCE(net->ipv6.devconf_all->optimistic_dad) ||
                READ_ONCE(idev->cnf.optimistic_dad));
#else
        return false;
#endif
}

/*
 * Evaluate source address selection rule @i for candidate @score against
 * destination @dst and return the rule's value; when two candidates are
 * compared, the one with the larger value wins that rule.
 *
 * Results are cached in @score: if rule @i was already evaluated
 * (i <= score->rule) the stored value is returned without recomputation.
 * Otherwise the rule is computed, recorded in score->scorebits (and in
 * scopedist / matchlen for the two non-boolean rules), and score->rule is
 * advanced to @i.  Callers in this file evaluate rules in increasing
 * order.
 *
 * All rules except IPV6_SADDR_RULE_INIT dereference score->ifa, so a
 * score without a candidate must lose at rule 0.
 */
static int ipv6_get_saddr_eval(struct net *net,
                               struct ipv6_saddr_score *score,
                               struct ipv6_saddr_dst *dst,
                               int i)
{
        int ret;

        /* Already evaluated: return the cached result. */
        if (i <= score->rule) {
                switch (i) {
                case IPV6_SADDR_RULE_SCOPE:
                        ret = score->scopedist;
                        break;
                case IPV6_SADDR_RULE_PREFIX:
                        ret = score->matchlen;
                        break;
                default:
                        ret = !!test_bit(i, score->scorebits);
                }
                goto out;
        }

        switch (i) {
        case IPV6_SADDR_RULE_INIT:
                /* Rule 0: remember if hiscore is not ready yet */
                ret = !!score->ifa;
                break;
        case IPV6_SADDR_RULE_LOCAL:
                /* Rule 1: Prefer same address */
                ret = ipv6_addr_equal(&score->ifa->addr, dst->addr);
                break;
        case IPV6_SADDR_RULE_SCOPE:
                /* Rule 2: Prefer appropriate scope
                 *
                 *      ret
                 *       ^
                 *    -1 |  d 15
                 *    ---+--+-+---> scope
                 *       |
                 *       |             d is scope of the destination.
                 *  B-d  |  \
                 *       |   \      <- smaller scope is better if
                 *  B-15 |    \        if scope is enough for destination.
                 *       |             ret = B - scope (-1 <= scope >= d <= 15).
                 * d-C-1 | /
                 *       |/         <- greater is better
                 *   -C  /             if scope is not enough for destination.
                 *      /|             ret = scope - C (-1 <= d < scope <= 15).
                 *
                 * d - C - 1 < B -15 (for all -1 <= d <= 15).
                 * C > d + 14 - B >= 15 + 14 - B = 29 - B.
                 * Assume B = 0 and we get C > 29.
                 */
                ret = __ipv6_addr_src_scope(score->addr_type);
                if (ret >= dst->scope)
                        ret = -ret;
                else
                        ret -= 128;        /* 30 is enough */
                score->scopedist = ret;
                break;
        case IPV6_SADDR_RULE_PREFERRED:
            {
                /* Rule 3: Avoid deprecated and optimistic addresses */
                u8 avoid = IFA_F_DEPRECATED;

                /* Optimistic addresses are only tolerated when their use
                 * is enabled for the candidate's device.
                 */
                if (!ipv6_use_optimistic_addr(net, score->ifa->idev))
                        avoid |= IFA_F_OPTIMISTIC;
                ret = ipv6_saddr_preferred(score->addr_type) ||
                      !(score->ifa->flags & avoid);
                break;
            }
#ifdef CONFIG_IPV6_MIP6
        case IPV6_SADDR_RULE_HOA:
            {
                /* Rule 4: Prefer home address */
                int prefhome = !(dst->prefs & IPV6_PREFER_SRC_COA);
                ret = !(score->ifa->flags & IFA_F_HOMEADDRESS) ^ prefhome;
                break;
            }
#endif
        case IPV6_SADDR_RULE_OIF:
                /* Rule 5: Prefer outgoing interface */
                ret = (!dst->ifindex ||
                       dst->ifindex == score->ifa->idev->dev->ifindex);
                break;
        case IPV6_SADDR_RULE_LABEL:
                /* Rule 6: Prefer matching label */
                ret = ipv6_addr_label(net,
                                      &score->ifa->addr, score->addr_type,
                                      score->ifa->idev->dev->ifindex) == dst->label;
                break;
        case IPV6_SADDR_RULE_PRIVACY:
            {
                /* Rule 7: Prefer public address
                 * Note: prefer temporary address if use_tempaddr >= 2
                 */
                /* An explicit PUBLIC/TMP preference from the caller
                 * overrides the device's use_tempaddr setting.
                 */
                int preftmp = dst->prefs & (IPV6_PREFER_SRC_PUBLIC|IPV6_PREFER_SRC_TMP) ?
                                !!(dst->prefs & IPV6_PREFER_SRC_TMP) :
                                READ_ONCE(score->ifa->idev->cnf.use_tempaddr) >= 2;
                ret = (!(score->ifa->flags & IFA_F_TEMPORARY)) ^ preftmp;
                break;
            }
        case IPV6_SADDR_RULE_ORCHID:
                /* Rule 8-: Prefer ORCHID vs ORCHID or
                 *            non-ORCHID vs non-ORCHID
                 */
                ret = !(ipv6_addr_orchid(&score->ifa->addr) ^
                        ipv6_addr_orchid(dst->addr));
                break;
        case IPV6_SADDR_RULE_PREFIX:
                /* Rule 8: Use longest matching prefix */
                ret = ipv6_addr_diff(&score->ifa->addr, dst->addr);
                /* Bits beyond the candidate's own prefix length do not count. */
                if (ret > score->ifa->prefix_len)
                        ret = score->ifa->prefix_len;
                score->matchlen = ret;
                break;
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        case IPV6_SADDR_RULE_NOT_OPTIMISTIC:
                /* Optimistic addresses still have lower precedence than other
                 * preferred addresses.
                 */
                ret = !(score->ifa->flags & IFA_F_OPTIMISTIC);
                break;
#endif
        default:
                ret = 0;
        }

        /* Record the result so that later calls for rule i hit the cache. */
        if (ret)
                __set_bit(i, score->scorebits);
        score->rule = i;
out:
        return ret;
}

/*
 * Score every usable address of @idev against the current best candidate
 * and keep whichever wins.
 *
 * @scores is a two-element array: scores[hiscore_idx] is the best
 * candidate so far (its ifa may be NULL) and the other element is scratch
 * space for the address being examined.  When a new address wins, the two
 * roles are swapped.  Returns the index of the best candidate after all
 * addresses of @idev have been considered.
 *
 * The address list is walked with list_for_each_entry_rcu(); callers in
 * this file invoke this function between rcu_read_lock() and
 * rcu_read_unlock().
 */
static int __ipv6_dev_get_saddr(struct net *net,
                                struct ipv6_saddr_dst *dst,
                                struct inet6_dev *idev,
                                struct ipv6_saddr_score *scores,
                                int hiscore_idx)
{
        struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx];

        /* Note: score->ifa itself serves as the list cursor. */
        list_for_each_entry_rcu(score->ifa, &idev->addr_list, if_list) {
                int i;

                /*
                 * - Tentative Address (RFC2462 section 5.4)
                 *  - A tentative address is not considered
                 *    "assigned to an interface" in the traditional
                 *    sense, unless it is also flagged as optimistic.
                 * - Candidate Source Address (section 4)
                 *  - In any case, anycast addresses, multicast
                 *    addresses, and the unspecified address MUST
                 *    NOT be included in a candidate set.
                 */
                if ((score->ifa->flags & IFA_F_TENTATIVE) &&
                    (!(score->ifa->flags & IFA_F_OPTIMISTIC)))
                        continue;

                score->addr_type = __ipv6_addr_type(&score->ifa->addr);

                if (unlikely(score->addr_type == IPV6_ADDR_ANY ||
                             score->addr_type & IPV6_ADDR_MULTICAST)) {
                        net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s",
                                            idev->dev->name);
                        continue;
                }

                /* Fresh candidate: forget all cached rule results. */
                score->rule = -1;
                bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX);

                /* Compare rule by rule; the first rule on which the two
                 * differ decides.  If all rules tie, hiscore is kept.
                 */
                for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) {
                        int minihiscore, miniscore;

                        minihiscore = ipv6_get_saddr_eval(net, hiscore, dst, i);
                        miniscore = ipv6_get_saddr_eval(net, score, dst, i);

                        if (minihiscore > miniscore) {
                                if (i == IPV6_SADDR_RULE_SCOPE &&
                                    score->scopedist > 0) {
                                        /*
                                         * special case:
                                         * each remaining entry
                                         * has too small (not enough)
                                         * scope, because ifa entries
                                         * are sorted by their scope
                                         * values.
                                         */
                                        goto out;
                                }
                                break;
                        } else if (minihiscore < miniscore) {
                                /* New best: exchange the roles of the two
                                 * slots and flip the index to match.
                                 */
                                swap(hiscore, score);
                                hiscore_idx = 1 - hiscore_idx;

                                /* restore our iterator */
                                score->ifa = hiscore->ifa;

                                break;
                        }
                }
        }
out:
        return hiscore_idx;
}

/*
 * Run source address selection over the addresses of @dst_dev first and
 * of its L3 master device @master second.  Devices without IPv6 state
 * are skipped.  Returns the index into @scores of the best candidate
 * found so far.
 */
static int ipv6_get_saddr_master(struct net *net,
                                 const struct net_device *dst_dev,
                                 const struct net_device *master,
                                 struct ipv6_saddr_dst *dst,
                                 struct ipv6_saddr_score *scores,
                                 int hiscore_idx)
{
        const struct net_device *devs[2] = { dst_dev, master };
        struct inet6_dev *idev;
        int n;

        for (n = 0; n < 2; n++) {
                idev = __in6_dev_get(devs[n]);
                if (!idev)
                        continue;

                hiscore_idx = __ipv6_dev_get_saddr(net, dst, idev,
                                                   scores, hiscore_idx);
        }

        return hiscore_idx;
}

/*
 * Select a source address for sending to @daddr.
 *
 * @net:     network namespace to search
 * @dst_dev: outgoing device, may be NULL
 * @daddr:   destination address
 * @prefs:   IPV6_PREFER_SRC_* preference bits, stored in the destination
 *           description used by the selection rules
 * @saddr:   receives the selected address on success
 *
 * Returns 0 and writes *@saddr when a candidate was found, otherwise
 * -EADDRNOTAVAIL (and *@saddr is left untouched).  The whole search runs
 * under rcu_read_lock().
 */
int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
                       const struct in6_addr *daddr, unsigned int prefs,
                       struct in6_addr *saddr)
{
        struct ipv6_saddr_score scores[2], *hiscore;
        struct ipv6_saddr_dst dst;
        struct inet6_dev *idev;
        struct net_device *dev;
        int dst_type;
        bool use_oif_addr = false;
        int hiscore_idx = 0;
        int ret = 0;

        dst_type = __ipv6_addr_type(daddr);
        dst.addr = daddr;
        dst.ifindex = dst_dev ? dst_dev->ifindex : 0;
        dst.scope = __ipv6_addr_src_scope(dst_type);
        dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex);
        dst.prefs = prefs;

        /* Start with an empty best candidate; the other slot is
         * initialised by __ipv6_dev_get_saddr() as it examines addresses.
         */
        scores[hiscore_idx].rule = -1;
        scores[hiscore_idx].ifa = NULL;

        rcu_read_lock();

        /* Candidate Source Address (section 4)
         *  - multicast and link-local destination address,
         *    the set of candidate source address MUST only
         *    include addresses assigned to interfaces
         *    belonging to the same link as the outgoing
         *    interface.
         * (- For site-local destination addresses, the
         *    set of candidate source addresses MUST only
         *    include addresses assigned to interfaces
         *    belonging to the same site as the outgoing
         *    interface.)
         *  - "It is RECOMMENDED that the candidate source addresses
         *    be the set of unicast addresses assigned to the
         *    interface that will be used to send to the destination
         *    (the 'outgoing' interface)." (RFC 6724)
         */
        if (dst_dev) {
                idev = __in6_dev_get(dst_dev);
                if ((dst_type & IPV6_ADDR_MULTICAST) ||
                    dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL ||
                    (idev && READ_ONCE(idev->cnf.use_oif_addrs_only))) {
                        use_oif_addr = true;
                }
        }

        /* use_oif_addr can only be true when dst_dev was non-NULL, so idev
         * has been assigned above whenever it is read in the first branch.
         */
        if (use_oif_addr) {
                if (idev)
                        hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
        } else {
                const struct net_device *master;
                int master_idx = 0;

                /* if dst_dev exists and is enslaved to an L3 device, then
                 * prefer addresses from dst_dev and then the master over
                 * any other enslaved devices in the L3 domain.
                 */
                master = l3mdev_master_dev_rcu(dst_dev);
                if (master) {
                        master_idx = master->ifindex;

                        hiscore_idx = ipv6_get_saddr_master(net, dst_dev,
                                                            master, &dst,
                                                            scores, hiscore_idx);

                        /* A candidate from dst_dev or its master is final. */
                        if (scores[hiscore_idx].ifa)
                                goto out;
                }

                for_each_netdev_rcu(net, dev) {
                        /* only consider addresses on devices in the
                         * same L3 domain
                         */
                        if (l3mdev_master_ifindex_rcu(dev) != master_idx)
                                continue;
                        idev = __in6_dev_get(dev);
                        if (!idev)
                                continue;
                        hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
                }
        }

out:
        hiscore = &scores[hiscore_idx];
        if (!hiscore->ifa)
                ret = -EADDRNOTAVAIL;
        else
                *saddr = hiscore->ifa->addr;

        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(ipv6_dev_get_saddr);

/*
 * Find a link-scope address on @idev whose flags contain none of
 * @banned_flags and copy it to @addr.
 *
 * The address list is walked from its tail and the walk stops at the
 * first entry whose scope is greater than IFA_LINK.  Returns 0 when an
 * address was copied, -EADDRNOTAVAIL otherwise.  The list is traversed
 * without taking a lock here; ipv6_get_lladdr() holds idev->lock for
 * reading around the call.
 */
static int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
                              u32 banned_flags)
{
        struct inet6_ifaddr *cand;

        list_for_each_entry_reverse(cand, &idev->addr_list, if_list) {
                if (cand->scope > IFA_LINK)
                        break;
                if (cand->scope != IFA_LINK || (cand->flags & banned_flags))
                        continue;

                *addr = cand->addr;
                return 0;
        }

        return -EADDRNOTAVAIL;
}

/*
 * Look up a link-scope address of @dev that has none of @banned_flags
 * set and store it in @addr.
 *
 * Returns 0 on success, or -EADDRNOTAVAIL when @dev has no IPv6 state or
 * no suitable address.  The lookup runs under rcu_read_lock() with the
 * device's idev->lock held for reading.
 */
int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
                    u32 banned_flags)
{
        struct inet6_dev *idev;
        int err;

        rcu_read_lock();

        idev = __in6_dev_get(dev);
        if (!idev) {
                err = -EADDRNOTAVAIL;
                goto unlock;
        }

        read_lock_bh(&idev->lock);
        err = __ipv6_get_lladdr(idev, addr, banned_flags);
        read_unlock_bh(&idev->lock);

unlock:
        rcu_read_unlock();
        return err;
}

static int ipv6_count_addresses(const struct inet6_dev *idev)
{
        const struct inet6_ifaddr *ifp;
        int cnt = 0;

        rcu_read_lock();
        list_for_each_entry_rcu(ifp, &idev->addr_list, if_list)
                cnt++;
        rcu_read_unlock();
        return cnt;
}

int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
                  const struct net_device *dev, int strict)
{
        return ipv6_chk_addr_and_flags(net, addr, dev, !dev,
                                       strict, IFA_F_TENTATIVE);
}
EXPORT_SYMBOL(ipv6_chk_addr);

/* Look up @addr in the address hash table of @net.
 *
 * @dev is used to determine the L3 domain of interest; only addresses on
 * devices with the same L3 master are considered.  @skip_dev_check then
 * selects between the two kinds of address checks:
 *   1. skip_dev_check = true: does the address exist anywhere in the
 *      L3 domain that @dev is part of, or
 *
 *   2. skip_dev_check = false: does the address exist on the specific
 *      device.  In this case an address on a different device still
 *      matches if it has neither IFA_LINK nor IFA_HOST scope and
 *      @strict is zero.
 *
 * Addresses whose flags intersect @banned_flags never match; for
 * optimistic addresses the tentative flag is ignored in that test.
 *
 * Returns the device owning the matching address, or NULL.
 */
static struct net_device *
__ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
                          const struct net_device *dev, bool skip_dev_check,
                          int strict, u32 banned_flags)
{
        unsigned int hash = inet6_addr_hash(net, addr);
        struct net_device *l3mdev, *ndev;
        struct inet6_ifaddr *ifp;
        u32 ifp_flags;

        rcu_read_lock();

        /* The L3 domain is taken from the caller's dev before it is dropped. */
        l3mdev = l3mdev_master_dev_rcu(dev);
        if (skip_dev_check)
                dev = NULL;

        hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
                ndev = ifp->idev->dev;

                if (l3mdev_master_dev_rcu(ndev) != l3mdev)
                        continue;

                if (!ipv6_addr_equal(&ifp->addr, addr))
                        continue;

                /* Decouple optimistic from tentative for evaluation here.
                 * Ban optimistic addresses explicitly, when required.
                 */
                ifp_flags = ifp->flags;
                if (ifp_flags & IFA_F_OPTIMISTIC)
                        ifp_flags &= ~IFA_F_TENTATIVE;
                if (ifp_flags & banned_flags)
                        continue;

                /* Wrong device: only acceptable for addresses that are
                 * neither link nor host scoped, and only when not strict.
                 */
                if (dev && ndev != dev &&
                    ((ifp->scope & (IFA_LINK | IFA_HOST)) || strict))
                        continue;

                rcu_read_unlock();
                return ndev;
        }

        rcu_read_unlock();
        return NULL;
}

/* Boolean-style wrapper around __ipv6_chk_addr_and_flags():
 * returns 1 when a matching device was found, 0 otherwise.
 */
int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
                            const struct net_device *dev, bool skip_dev_check,
                            int strict, u32 banned_flags)
{
        const struct net_device *match;

        match = __ipv6_chk_addr_and_flags(net, addr, dev, skip_dev_check,
                                          strict, banned_flags);
        if (match)
                return 1;

        return 0;
}
EXPORT_SYMBOL(ipv6_chk_addr_and_flags);


/* Compares an address/prefix_len with addresses on device @dev.
 * If one is found it returns true.
 */
bool ipv6_chk_custom_prefix(const struct in6_addr *addr,
        const unsigned int prefix_len, struct net_device *dev)
{
        const struct inet6_ifaddr *ifa;
        const struct inet6_dev *idev;
        bool ret = false;

        rcu_read_lock();
        idev = __in6_dev_get(dev);
        if (idev) {
                list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) {
                        ret = ipv6_prefix_equal(addr, &ifa->addr, prefix_len);
                        if (ret)
                                break;
                }
        }
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL(ipv6_chk_custom_prefix);

int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev)
{
        const struct inet6_ifaddr *ifa;
        const struct inet6_dev *idev;
        int        onlink;

        onlink = 0;
        rcu_read_lock();
        idev = __in6_dev_get(dev);
        if (idev) {
                list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) {
                        onlink = ipv6_prefix_equal(addr, &ifa->addr,
                                                   ifa->prefix_len);
                        if (onlink)
                                break;
                }
        }
        rcu_read_unlock();
        return onlink;
}
EXPORT_SYMBOL(ipv6_chk_prefix);

/**
 * ipv6_dev_find - find the first device with a given source address.
 * @net: the net namespace
 * @addr: the source address
 * @dev: used to find the L3 domain of interest
 *
 * The lookup is strict and ignores IFA_F_TENTATIVE addresses.  When @dev
 * is NULL the per-device comparison is skipped.
 *
 * The caller should be protected by RCU, or RTNL.
 *
 * Return: the matching device, or NULL if there is none.
 */
struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr,
                                 struct net_device *dev)
{
        const bool no_dev = !dev;

        return __ipv6_chk_addr_and_flags(net, addr, dev, no_dev, 1,
                                         IFA_F_TENTATIVE);
}
EXPORT_SYMBOL(ipv6_dev_find);

/* Look up the inet6_ifaddr for @addr in the per-netns address hash.
 *
 * With a non-NULL @dev, an entry on a different device is accepted only
 * when @strict is zero and the entry's scope has neither IFA_LINK nor
 * IFA_HOST set.  An entry is returned only if in6_ifa_hold_safe()
 * succeeds on it; otherwise the search moves on to the next entry.
 *
 * Returns the entry that was successfully held, or NULL.
 */
struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr,
                                     struct net_device *dev, int strict)
{
        unsigned int hash = inet6_addr_hash(net, addr);
        struct inet6_ifaddr *cand, *found = NULL;

        rcu_read_lock();
        hlist_for_each_entry_rcu(cand, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
                if (!ipv6_addr_equal(&cand->addr, addr))
                        continue;

                /* wrong device, and the scope/strictness forbid a mismatch */
                if (dev && cand->idev->dev != dev &&
                    (strict || (cand->scope & (IFA_LINK | IFA_HOST))))
                        continue;

                if (!in6_ifa_hold_safe(cand))
                        continue;

                found = cand;
                break;
        }
        rcu_read_unlock();

        return found;
}

/* Gets referenced address, destroys ifaddr.
 *
 * Finish duplicate address detection for @ifp.  @dad_failed is nonzero
 * when the address was found to be a duplicate; in that case
 * IFA_F_DADFAILED is set on it first.  Three outcomes follow:
 *
 *  - temporary address: if it has a public address (ifp->ifpub),
 *    ipv6_create_tempaddr() is called on that public address, then @ifp
 *    is handed to ipv6_del_addr();
 *  - permanent address, or DAD did not fail: the address is kept, its
 *    pending DAD work is removed, it is marked IFA_F_TENTATIVE again and
 *    in6_ifa_put() is called on it;
 *  - anything else: @ifp is handed to ipv6_del_addr().
 */

static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed)
{
        if (dad_failed)
                ifp->flags |= IFA_F_DADFAILED;

        if (ifp->flags&IFA_F_TEMPORARY) {
                struct inet6_ifaddr *ifpub;
                spin_lock_bh(&ifp->lock);
                ifpub = ifp->ifpub;
                if (ifpub) {
                        /* hold ifpub across the unlock so it can be used
                         * after ifp->lock has been dropped
                         */
                        in6_ifa_hold(ifpub);
                        spin_unlock_bh(&ifp->lock);
                        ipv6_create_tempaddr(ifpub, true);
                        in6_ifa_put(ifpub);
                } else {
                        spin_unlock_bh(&ifp->lock);
                }
                ipv6_del_addr(ifp);
        } else if (ifp->flags&IFA_F_PERMANENT || !dad_failed) {
                spin_lock_bh(&ifp->lock);
                addrconf_del_dad_work(ifp);
                ifp->flags |= IFA_F_TENTATIVE;
                /* a failed address does not stay optimistic */
                if (dad_failed)
                        ifp->flags &= ~IFA_F_OPTIMISTIC;
                spin_unlock_bh(&ifp->lock);
                /* the notification is sent outside ifp->lock */
                if (dad_failed)
                        ipv6_ifa_notify(0, ifp);
                in6_ifa_put(ifp);
        } else {
                ipv6_del_addr(ifp);
        }
}

/* Under ifp->lock, move @ifp from INET6_IFADDR_STATE_DAD to
 * INET6_IFADDR_STATE_POSTDAD.
 *
 * Returns 0 when the transition happened, -ENOENT when @ifp was in any
 * other state (the state is then left untouched).
 */
static int addrconf_dad_end(struct inet6_ifaddr *ifp)
{
        bool was_in_dad;

        spin_lock_bh(&ifp->lock);
        was_in_dad = (ifp->state == INET6_IFADDR_STATE_DAD);
        if (was_in_dad)
                ifp->state = INET6_IFADDR_STATE_POSTDAD;
        spin_unlock_bh(&ifp->lock);

        return was_in_dad ? 0 : -ENOENT;
}

/* Handle a detected duplicate of address @ifp; @skb is the packet that
 * revealed the conflict (its Ethernet source address is logged).
 *
 * in6_ifa_put(@ifp) is called on every path out of this function.
 * Unless the address was not in the DAD state, it ends up in
 * INET6_IFADDR_STATE_ERRDAD with its DAD work rescheduled immediately.
 *
 * For IFA_F_STABLE_PRIVACY addresses a replacement address is generated
 * and added first, subject to the idgen_retries sysctl and the device's
 * max_addresses limit; failure to do so is not reported to the caller.
 */
void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
{
        struct inet6_dev *idev = ifp->idev;
        struct net *net = dev_net(idev->dev);
        int max_addresses;

        /* nonzero: the address was not in the DAD state, nothing to do */
        if (addrconf_dad_end(ifp)) {
                in6_ifa_put(ifp);
                return;
        }

        net_info_ratelimited("%s: IPv6 duplicate address %pI6c used by %pM detected!\n",
                             ifp->idev->dev->name, &ifp->addr, eth_hdr(skb)->h_source);

        spin_lock_bh(&ifp->lock);

        if (ifp->flags & IFA_F_STABLE_PRIVACY) {
                struct in6_addr new_addr;
                struct inet6_ifaddr *ifp2;
                int retries = ifp->stable_privacy_retry + 1;
                /* the replacement inherits prefix length, flags, lifetimes
                 * and scope from the conflicting address
                 */
                struct ifa6_config cfg = {
                        .pfx = &new_addr,
                        .plen = ifp->prefix_len,
                        .ifa_flags = ifp->flags,
                        .valid_lft = ifp->valid_lft,
                        .preferred_lft = ifp->prefered_lft,
                        .scope = ifp->scope,
                };

                if (retries > net->ipv6.sysctl.idgen_retries) {
                        net_info_ratelimited("%s: privacy stable address generation failed because of DAD conflicts!\n",
                                             ifp->idev->dev->name);
                        goto errdad;
                }

                /* start from the old address; the generator is given the
                 * incremented retry count
                 */
                new_addr = ifp->addr;
                if (ipv6_generate_stable_address(&new_addr, retries,
                                                 idev))
                        goto errdad;

                /* ifp->lock is dropped here; the lock_errdad label below
                 * re-takes it before the code falls into errdad
                 */
                spin_unlock_bh(&ifp->lock);

                max_addresses = READ_ONCE(idev->cnf.max_addresses);
                if (max_addresses &&
                    ipv6_count_addresses(idev) >= max_addresses)
                        goto lock_errdad;

                net_info_ratelimited("%s: generating new stable privacy address because of DAD conflict\n",
                                     ifp->idev->dev->name);

                ifp2 = ipv6_add_addr(idev, &cfg, false, NULL);
                if (IS_ERR(ifp2))
                        goto lock_errdad;

                /* record the retry count on the new address and queue its
                 * DAD work after the idgen_delay sysctl value
                 */
                spin_lock_bh(&ifp2->lock);
                ifp2->stable_privacy_retry = retries;
                ifp2->state = INET6_IFADDR_STATE_PREDAD;
                spin_unlock_bh(&ifp2->lock);

                addrconf_mod_dad_work(ifp2, net->ipv6.sysctl.idgen_delay);
                in6_ifa_put(ifp2);
lock_errdad:
                spin_lock_bh(&ifp->lock);
        }

errdad:
        /* transition from _POSTDAD to _ERRDAD */
        /* ifp->lock is held on every path reaching this label */
        ifp->state = INET6_IFADDR_STATE_ERRDAD;
        spin_unlock_bh(&ifp->lock);

        addrconf_mod_dad_work(ifp, 0);
        in6_ifa_put(ifp);
}

/* Join to solicited addr multicast group.
 * caller must hold RTNL */
void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr)
{
        struct in6_addr maddr;

        if (dev->flags&(IFF_LOOPBACK|IFF_NOARP))
                return;

        addrconf_addr_solict_mult(addr, &maddr);
        ipv6_dev_mc_inc(dev, &maddr);
}

/* caller must hold RTNL */
void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr)
{
        struct in6_addr maddr;

        if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP))
                return;

        addrconf_addr_solict_mult(addr, &maddr);
        __ipv6_dev_mc_dec(idev, &maddr);
}

/* caller must hold RTNL */
static void addrconf_join_anycast(struct inet6_ifaddr *ifp)
{
        struct in6_addr addr;

        if (ifp->prefix_len >= 127) /* RFC 6164 */
                return;
        ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
        if (ipv6_addr_any(&addr))
                return;
        __ipv6_dev_ac_inc(ifp->idev, &addr);
}

/* caller must hold RTNL */
static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
{
        struct in6_addr addr;

        if (ifp->prefix_len >= 127) /* RFC 6164 */
                return;
        ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
        if (ipv6_addr_any(&addr))
                return;
        __ipv6_dev_ac_dec(ifp->idev, &addr);
}

/* Build an 8-byte interface identifier in @eui from the hardware address
 * of a 6LoWPAN device.
 *
 *  - ETH_ALEN address: the first three bytes, then 0xFF 0xFE, then the
 *    last three bytes;
 *  - EUI64_ADDR_LEN address: copied verbatim with bit 0x02 of the first
 *    byte inverted.
 *
 * Returns 0 on success, -1 for any other address length.
 */
static int addrconf_ifid_6lowpan(u8 *eui, struct net_device *dev)
{
        if (dev->addr_len == ETH_ALEN) {
                memcpy(eui, dev->dev_addr, 3);
                memcpy(eui + 5, dev->dev_addr + 3, 3);
                eui[3] = 0xFF;
                eui[4] = 0xFE;
                return 0;
        }

        if (dev->addr_len == EUI64_ADDR_LEN) {
                memcpy(eui, dev->dev_addr, EUI64_ADDR_LEN);
                eui[0] ^= 2;
                return 0;
        }

        return -1;
}

/* Build the interface identifier in @eui for an IEEE 1394 device: the
 * uniq_id field of its hardware address, with bit 0x02 of the first
 * byte inverted.
 *
 * Returns 0 on success, -1 when the address length is not FWNET_ALEN.
 */
static int addrconf_ifid_ieee1394(u8 *eui, struct net_device *dev)
{
        const union fwnet_hwaddr *hw;

        if (dev->addr_len == FWNET_ALEN) {
                hw = (const union fwnet_hwaddr *)dev->dev_addr;

                memcpy(eui, &hw->uc.uniq_id, sizeof(hw->uc.uniq_id));
                eui[0] ^= 2;
                return 0;
        }

        return -1;
}

/* Build the interface identifier in @eui for an ARCnet device: seven
 * zero bytes followed by the first byte of the hardware address.
 *
 * Returns 0 on success, -1 when the address length is not ARCNET_ALEN.
 */
static int addrconf_ifid_arcnet(u8 *eui, struct net_device *dev)
{
        /* XXX: inherit EUI-64 from other interface -- yoshfuji */
        if (dev->addr_len == ARCNET_ALEN) {
                memset(eui, 0, 7);
                eui[7] = dev->dev_addr[0];
                return 0;
        }

        return -1;
}

/* Build the interface identifier in @eui for an InfiniBand device:
 * the 8 bytes at offset 12 of the hardware address, with bit 0x02 of
 * the first byte forced on.
 *
 * Returns 0 on success, -1 when the address length is not
 * INFINIBAND_ALEN.
 */
static int addrconf_ifid_infiniband(u8 *eui, struct net_device *dev)
{
        if (dev->addr_len == INFINIBAND_ALEN) {
                memcpy(eui, dev->dev_addr + 12, 8);
                eui[0] |= 2;
                return 0;
        }

        return -1;
}

/* Build an 8-byte interface identifier in @eui from IPv4 address @addr:
 * bytes 1..3 are 00:5E:FE, bytes 4..7 are @addr as stored, and byte 0
 * is 0x00 when @addr matches any of the ipv4_is_*() classes tested
 * below, 0x02 otherwise.
 *
 * Returns 0 on success, -1 when @addr is zero.
 */
static int __ipv6_isatap_ifid(u8 *eui, __be32 addr)
{
        bool in_listed_class;

        if (addr == 0)
                return -1;

        in_listed_class = ipv4_is_zeronet(addr) ||
                          ipv4_is_private_10(addr) ||
                          ipv4_is_loopback(addr) ||
                          ipv4_is_linklocal_169(addr) ||
                          ipv4_is_private_172(addr) ||
                          ipv4_is_test_192(addr) ||
                          ipv4_is_anycast_6to4(addr) ||
                          ipv4_is_private_192(addr) ||
                          ipv4_is_test_198(addr) ||
                          ipv4_is_multicast(addr) ||
                          ipv4_is_lbcast(addr);

        if (in_listed_class)
                eui[0] = 0x00;
        else
                eui[0] = 0x02;

        eui[1] = 0;
        eui[2] = 0x5E;
        eui[3] = 0xFE;
        memcpy(eui + 4, &addr, 4);

        return 0;
}

/* Interface identifier for a SIT device: only devices with IFF_ISATAP
 * set get one, derived from the first four bytes of the device address
 * via __ipv6_isatap_ifid().  Returns -1 otherwise.
 */
static int addrconf_ifid_sit(u8 *eui, struct net_device *dev)
{
        if (!(dev->priv_flags & IFF_ISATAP))
                return -1;

        return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr);
}

/* Interface identifier for GRE/IPIP-style devices: derived from the
 * first four bytes of the device address via __ipv6_isatap_ifid().
 */
static int addrconf_ifid_gre(u8 *eui, struct net_device *dev)
{
        __be32 v4addr = *(__be32 *)dev->dev_addr;

        return __ipv6_isatap_ifid(eui, v4addr);
}

/* Build the interface identifier in @eui from the first six bytes of
 * dev->perm_addr: three bytes, 0xFF 0xFE, three bytes, with bit 0x02
 * of the first byte inverted.  Always returns 0.
 */
static int addrconf_ifid_ip6tnl(u8 *eui, struct net_device *dev)
{
        const unsigned char *perm = dev->perm_addr;

        /* upper half: perm[0..2] followed by FF */
        memcpy(eui, perm, 3);
        eui[0] ^= 2;
        eui[3] = 0xFF;

        /* lower half: FE followed by perm[3..5] */
        eui[4] = 0xFE;
        memcpy(eui + 5, perm + 3, 3);

        return 0;
}

/* Dispatch on dev->type to the matching addrconf_ifid_*() helper to
 * fill the 8-byte interface identifier @eui.
 *
 * Returns the helper's result, or -1 for a device type that is not
 * handled here.
 */
static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
{
        int ret = -1;

        switch (dev->type) {
        case ARPHRD_ETHER:
        case ARPHRD_FDDI:
                ret = addrconf_ifid_eui48(eui, dev);
                break;
        case ARPHRD_ARCNET:
                ret = addrconf_ifid_arcnet(eui, dev);
                break;
        case ARPHRD_INFINIBAND:
                ret = addrconf_ifid_infiniband(eui, dev);
                break;
        case ARPHRD_SIT:
                ret = addrconf_ifid_sit(eui, dev);
                break;
        case ARPHRD_IPGRE:
        case ARPHRD_TUNNEL:
                ret = addrconf_ifid_gre(eui, dev);
                break;
        case ARPHRD_6LOWPAN:
                ret = addrconf_ifid_6lowpan(eui, dev);
                break;
        case ARPHRD_IEEE1394:
                ret = addrconf_ifid_ieee1394(eui, dev);
                break;
        case ARPHRD_TUNNEL6:
        case ARPHRD_IP6GRE:
        case ARPHRD_RAWIP:
                ret = addrconf_ifid_ip6tnl(eui, dev);
                break;
        default:
                break;
        }

        return ret;
}

/* Copy the low 8 bytes of an existing link-scope, non-tentative address
 * of @idev into @eui.  The address list is walked from the tail under
 * idev->lock (read side) and the walk stops at the first entry whose
 * scope is wider than IFA_LINK.
 *
 * Returns 0 when @eui was filled in, -1 otherwise.
 */
static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev)
{
        struct inet6_ifaddr *ifa;
        int ret = -1;

        read_lock_bh(&idev->lock);
        list_for_each_entry_reverse(ifa, &idev->addr_list, if_list) {
                if (ifa->scope > IFA_LINK)
                        break;

                if (ifa->scope != IFA_LINK || (ifa->flags & IFA_F_TENTATIVE))
                        continue;

                memcpy(eui, ifa->addr.s6_addr + 8, 8);
                ret = 0;
                break;
        }
        read_unlock_bh(&idev->lock);

        return ret;
}

/* Generation of a randomized Interface Identifier
 * draft-ietf-6man-rfc4941bis, Section 3.3.1
 *
 * Fills the low 8 bytes of @addr with random data and repeats until the
 * result is not one of the reserved identifiers rejected below.
 */

static void ipv6_gen_rnd_iid(struct in6_addr *addr)
{
        for (;;) {
                get_random_bytes(&addr->s6_addr[8], 8);

                /* <draft-ietf-6man-rfc4941bis-08.txt>, Section 3.3.1:
                 * check if generated address is not inappropriate:
                 *
                 * - Reserved IPv6 Interface Identifiers
                 * - XXX: already assigned to an address on the device
                 */

                /* Subnet-router anycast: 0000:0000:0000:0000 */
                if (!(addr->s6_addr32[2] | addr->s6_addr32[3]))
                        continue;

                /* IANA Ethernet block: 0200:5EFF:FE00:0000-0200:5EFF:FE00:5212
                 * Proxy Mobile IPv6:   0200:5EFF:FE00:5213
                 * IANA Ethernet block: 0200:5EFF:FE00:5214-0200:5EFF:FEFF:FFFF
                 */
                if (ntohl(addr->s6_addr32[2]) == 0x02005eff &&
                    (ntohl(addr->s6_addr32[3]) & 0xff000000) == 0xfe000000)
                        continue;

                /* Reserved subnet anycast addresses */
                if (ntohl(addr->s6_addr32[2]) == 0xfdffffff &&
                    ntohl(addr->s6_addr32[3]) >= 0xffffff80)
                        continue;

                /* identifier passed every check */
                break;
        }
}

/*
 *        Add prefix route.
 *
 *        Installs a unicast RTPROT_KERNEL route for @pfx/@plen through @dev.
 *        The table is the device's l3mdev table if it has one, else
 *        RT6_TABLE_PREFIX; a zero @metric selects IP6_RT_PRIO_ADDRCONF.
 *        RTF_UP is always added to @flags.  The result of ip6_route_add()
 *        is not reported to the caller.
 */

static void
addrconf_prefix_route(struct in6_addr *pfx, int plen, u32 metric,
                      struct net_device *dev, unsigned long expires,
                      u32 flags, gfp_t gfp_flags)
{
        u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX;
        struct fib6_config cfg = {
                .fc_table = tb_id,
                .fc_metric = metric ? metric : IP6_RT_PRIO_ADDRCONF,
                .fc_ifindex = dev->ifindex,
                .fc_expires = expires,
                .fc_dst = *pfx,
                .fc_dst_len = plen,
                .fc_flags = RTF_UP | flags,
                .fc_type = RTN_UNICAST,
                .fc_protocol = RTPROT_KERNEL,
                .fc_nlinfo.nl_net = dev_net(dev),
        };

        /* Prevent useless cloning on PtP SIT.
         * This is done here on the expectation that the whole class of
         * non-broadcast devices does not need cloning.
         */
#if IS_ENABLED(CONFIG_IPV6_SIT)
        if ((dev->flags & IFF_POINTOPOINT) && dev->type == ARPHRD_SIT)
                cfg.fc_flags |= RTF_NONEXTHOP;
#endif

        ip6_route_add(&cfg, gfp_flags, NULL);
}


/* Find a route for exactly @pfx/@plen that goes through @dev.
 *
 * The table searched is the device's l3mdev table if it has one, else
 * RT6_TABLE_PREFIX.  A candidate route must have every bit of @flags
 * set and no bit of @noflags set; with @no_gw, routes that have a
 * gateway are skipped.  Routes using an external nexthop object
 * (rt->nh) are never returned.
 *
 * Returns a route on which fib6_info_hold_safe() succeeded, or NULL.
 */
static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
                                                  int plen,
                                                  const struct net_device *dev,
                                                  u32 flags, u32 noflags,
                                                  bool no_gw)
{
        u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX;
        struct fib6_table *table;
        struct fib6_info *rt = NULL;
        struct fib6_node *fn;

        table = fib6_get_table(dev_net(dev), tb_id);
        if (!table)
                return NULL;

        rcu_read_lock();
        fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true);
        if (fn) {
                /* the iterator leaves rt == NULL when nothing qualifies */
                for_each_fib6_node_rt_rcu(fn) {
                        /* prefix routes only use builtin fib6_nh */
                        if (rt->nh ||
                            rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex ||
                            (no_gw && rt->fib6_nh->fib_nh_gw_family) ||
                            (rt->fib6_flags & flags) != flags ||
                            (rt->fib6_flags & noflags) != 0)
                                continue;

                        /* take the reference last, only on a full match */
                        if (fib6_info_hold_safe(rt))
                                break;
                }
        }
        rcu_read_unlock();

        return rt;
}


/* Create "default" multicast route to the interface */

static void addrconf_add_mroute(struct net_device *dev)
{
        struct fib6_config cfg = {
                .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_LOCAL,
                .fc_metric = IP6_RT_PRIO_ADDRCONF,
                .fc_ifindex = dev->ifindex,
                .fc_dst_len = 8,
                .fc_flags = RTF_UP,
                .fc_type = RTN_MULTICAST,
                .fc_nlinfo.nl_net = dev_net(dev),
                .fc_protocol = RTPROT_KERNEL,
        };

        ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);

        ip6_route_add(&cfg, GFP_KERNEL, NULL);
}

/* Obtain the inet6_dev for @dev via ipv6_find_idev() and, unless @dev is
 * a loopback device or an L3 master, add the default multicast route.
 *
 * Must be called with RTNL held (asserted).
 *
 * Returns the inet6_dev, the ERR_PTR from ipv6_find_idev(), or
 * ERR_PTR(-EACCES) when IPv6 is disabled on the device.
 */
static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
{
        struct inet6_dev *idev;
        bool wants_mroute;

        ASSERT_RTNL();

        idev = ipv6_find_idev(dev);
        if (IS_ERR(idev))
                return idev;
        if (idev->cnf.disable_ipv6)
                return ERR_PTR(-EACCES);

        /* Add default multicast route */
        wants_mroute = !(dev->flags & IFF_LOOPBACK) &&
                       !netif_is_l3_master(dev);
        if (wants_mroute)
                addrconf_add_mroute(dev);

        return idev;
}

/* Propagate new lifetimes from public address @ifp to its temporary
 * addresses and, where appropriate, create a new temporary address.
 *
 * @idev:         device whose tempaddr_list is walked
 * @ifp:          public address; only temporaries whose ifpub is @ifp
 *                are updated
 * @valid_lft:    new valid lifetime, in seconds
 * @prefered_lft: new preferred lifetime, in seconds
 * @create:       request creation of a temporary address
 * @now:          timestamp (jiffies) stored as tstamp and used for ageing
 *
 * Each temporary address gets the requested lifetimes capped by limits
 * computed from its own age.  The caps are applied to per-address
 * copies, so the result for one address never depends on which other
 * addresses were visited before it.
 */
static void manage_tempaddrs(struct inet6_dev *idev,
                             struct inet6_ifaddr *ifp,
                             __u32 valid_lft, __u32 prefered_lft,
                             bool create, unsigned long now)
{
        u32 flags;
        struct inet6_ifaddr *ift;

        read_lock_bh(&idev->lock);
        /* update all temporary addresses in the list */
        list_for_each_entry(ift, &idev->tempaddr_list, tmp_list) {
                int age, max_valid, max_prefered;
                __u32 ift_valid_lft, ift_prefered_lft;

                if (ifp != ift->ifpub)
                        continue;

                /* RFC 4941 section 3.3:
                 * If a received option will extend the lifetime of a public
                 * address, the lifetimes of temporary addresses should
                 * be extended, subject to the overall constraint that no
                 * temporary addresses should ever remain "valid" or "preferred"
                 * for a time longer than (TEMP_VALID_LIFETIME) or
                 * (TEMP_PREFERRED_LIFETIME - DESYNC_FACTOR), respectively.
                 */
                age = (now - ift->cstamp) / HZ;
                max_valid = READ_ONCE(idev->cnf.temp_valid_lft) - age;
                if (max_valid < 0)
                        max_valid = 0;

                max_prefered = READ_ONCE(idev->cnf.temp_prefered_lft) -
                               idev->desync_factor - age;
                if (max_prefered < 0)
                        max_prefered = 0;

                /* Cap copies, not the parameters: the limits above belong
                 * to this address only and must not leak into the next
                 * iteration.
                 */
                ift_valid_lft = valid_lft;
                if (ift_valid_lft > max_valid)
                        ift_valid_lft = max_valid;

                ift_prefered_lft = prefered_lft;
                if (ift_prefered_lft > max_prefered)
                        ift_prefered_lft = max_prefered;

                spin_lock(&ift->lock);
                /* snapshot the flags for the notification decision below */
                flags = ift->flags;
                ift->valid_lft = ift_valid_lft;
                ift->prefered_lft = ift_prefered_lft;
                ift->tstamp = now;
                if (ift_prefered_lft > 0)
                        ift->flags &= ~IFA_F_DEPRECATED;

                spin_unlock(&ift->lock);
                if (!(flags&IFA_F_TENTATIVE))
                        ipv6_ifa_notify(0, ift);
        }

        /* Also create a temporary address if it's enabled but no temporary
         * address currently exists.
         * However, we get called with valid_lft == 0, prefered_lft == 0, create == false
         * as part of cleanup (ie. deleting the mngtmpaddr).
         * We don't want that to result in creating a new temporary ip address.
         */
        if (list_empty(&idev->tempaddr_list) && (valid_lft || prefered_lft))
                create = true;

        if (create && READ_ONCE(idev->cnf.use_tempaddr) > 0) {
                /* When a new public address is created as described
                 * in [ADDRCONF], also create a new temporary address.
                 * idev->lock is released before the creation call.
                 */
                read_unlock_bh(&idev->lock);
                ipv6_create_tempaddr(ifp, false);
        } else {
                read_unlock_bh(&idev->lock);
        }
}

/* True when @idev's address generation mode is
 * IN6_ADDR_GEN_MODE_STABLE_PRIVACY or IN6_ADDR_GEN_MODE_RANDOM.
 */
static bool is_addr_mode_generate_stable(struct inet6_dev *idev)
{
        switch (idev->cnf.addr_gen_mode) {
        case IN6_ADDR_GEN_MODE_STABLE_PRIVACY:
        case IN6_ADDR_GEN_MODE_RANDOM:
                return true;
        default:
                return false;
        }
}

/* Create or refresh the autoconfigured address @addr on @dev in response
 * to a received prefix information option.
 *
 * @net:          network namespace of @dev
 * @dev:          receiving device
 * @pinfo:        the prefix option; its prefix_len is used for a new address
 * @in6_dev:      inet6_dev of @dev
 * @addr:         the address to create or refresh
 * @addr_type:    address type bits; the scope part becomes the new
 *                address's scope
 * @addr_flags:   IFA_F_* flags for a newly created address
 * @sllao:        required (together with the sysctls below) for a new
 *                address to be marked IFA_F_OPTIMISTIC
 * @tokenized:    stored on a newly created address
 * @valid_lft:    advertised valid lifetime, in seconds
 * @prefered_lft: advertised preferred lifetime, in seconds
 *
 * If the address does not exist and @valid_lft is nonzero it is created
 * (subject to the max_addresses limit) and DAD is started.  If it exists
 * already its lifetimes are updated.  In both cases the temporary
 * addresses attached to it are handled by manage_tempaddrs().
 *
 * Returns 0 on success (including "nothing to do"), -1 when a new
 * address was needed but could not be created.
 */
int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
                                 const struct prefix_info *pinfo,
                                 struct inet6_dev *in6_dev,
                                 const struct in6_addr *addr, int addr_type,
                                 u32 addr_flags, bool sllao, bool tokenized,
                                 __u32 valid_lft, u32 prefered_lft)
{
        struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1);
        int create = 0, update_lft = 0;

        if (!ifp && valid_lft) {
                int max_addresses = READ_ONCE(in6_dev->cnf.max_addresses);
                struct ifa6_config cfg = {
                        .pfx = addr,
                        .plen = pinfo->prefix_len,
                        .ifa_flags = addr_flags,
                        .valid_lft = valid_lft,
                        .preferred_lft = prefered_lft,
                        .scope = addr_type & IPV6_ADDR_SCOPE_MASK,
                        .ifa_proto = IFAPROT_KERNEL_RA
                };

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
                /* All three sysctl values are read without a lock here, so
                 * each read is marked with READ_ONCE().
                 */
                if ((READ_ONCE(net->ipv6.devconf_all->optimistic_dad) ||
                     READ_ONCE(in6_dev->cnf.optimistic_dad)) &&
                    !READ_ONCE(net->ipv6.devconf_all->forwarding) && sllao)
                        cfg.ifa_flags |= IFA_F_OPTIMISTIC;
#endif

                /* Do not allow to create too much of autoconfigured
                 * addresses; this would be too easy way to crash kernel.
                 */
                if (!max_addresses ||
                    ipv6_count_addresses(in6_dev) < max_addresses)
                        ifp = ipv6_add_addr(in6_dev, &cfg, false, NULL);

                /* NULL: limit reached; ERR_PTR: ipv6_add_addr() failed */
                if (IS_ERR_OR_NULL(ifp))
                        return -1;

                create = 1;
                spin_lock_bh(&ifp->lock);
                ifp->flags |= IFA_F_MANAGETEMPADDR;
                ifp->cstamp = jiffies;
                ifp->tokenized = tokenized;
                spin_unlock_bh(&ifp->lock);
                addrconf_dad_start(ifp);
        }

        if (ifp) {
                u32 flags;
                unsigned long now;
                u32 stored_lft;

                /* update lifetime (RFC2462 5.5.3 e) */
                spin_lock_bh(&ifp->lock);
                now = jiffies;
                /* remaining valid lifetime of the existing address */
                if (ifp->valid_lft > (now - ifp->tstamp) / HZ)
                        stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
                else
                        stored_lft = 0;

                /* RFC4862 Section 5.5.3e:
                 * "Note that the preferred lifetime of the
                 *  corresponding address is always reset to
                 *  the Preferred Lifetime in the received
                 *  Prefix Information option, regardless of
                 *  whether the valid lifetime is also reset or
                 *  ignored."
                 *
                 * So we should always update prefered_lft here.
                 */
                update_lft = !create && stored_lft;

                /* Unless ra_honor_pio_life is set, the advertised valid
                 * lifetime is raised to at least
                 * min(stored_lft, MIN_VALID_LIFETIME).
                 */
                if (update_lft && !READ_ONCE(in6_dev->cnf.ra_honor_pio_life)) {
                        const u32 minimum_lft = min_t(u32,
                                stored_lft, MIN_VALID_LIFETIME);
                        valid_lft = max(valid_lft, minimum_lft);
                }

                if (update_lft) {
                        ifp->valid_lft = valid_lft;
                        ifp->prefered_lft = prefered_lft;
                        WRITE_ONCE(ifp->tstamp, now);
                        /* snapshot taken before IFA_F_DEPRECATED is cleared */
                        flags = ifp->flags;
                        ifp->flags &= ~IFA_F_DEPRECATED;
                        spin_unlock_bh(&ifp->lock);

                        if (!(flags&IFA_F_TENTATIVE))
                                ipv6_ifa_notify(0, ifp);
                } else {
                        spin_unlock_bh(&ifp->lock);
                }

                manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft,
                                 create, now);

                in6_ifa_put(ifp);
                addrconf_verify(net);
        }

        return 0;
}
EXPORT_SYMBOL_GPL(addrconf_prefix_rcv_add_addr);

/* Process one prefix information option received on @dev.
 *
 * @dev:   receiving device
 * @opt:   start of the option, interpreted as struct prefix_info
 * @len:   length of the option in bytes
 * @sllao: passed through to the address creation helpers
 *
 * After validation two independent things happen:
 *   1) for on-link prefixes the prefix route is added, refreshed or
 *      (valid lifetime 0) deleted;
 *   2) for prefixes with the autoconf flag, if autoconf is enabled on
 *      the device, a local address is derived and handed to
 *      addrconf_prefix_rcv_add_addr().
 * Finally an RTM_NEWPREFIX notification is sent, except on the error
 * paths that jump straight to "put".
 */
void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
{
        struct prefix_info *pinfo;
        struct fib6_table *table;
        __u32 valid_lft;
        __u32 prefered_lft;
        int addr_type, err;
        u32 addr_flags = 0;
        struct inet6_dev *in6_dev;
        struct net *net = dev_net(dev);

        pinfo = (struct prefix_info *) opt;

        if (len < sizeof(struct prefix_info)) {
                netdev_dbg(dev, "addrconf: prefix option too short\n");
                return;
        }

        /*
         *        Validation checks ([ADDRCONF], page 19)
         */

        addr_type = ipv6_addr_type(&pinfo->prefix);

        if (addr_type & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL))
                return;

        valid_lft = ntohl(pinfo->valid);
        prefered_lft = ntohl(pinfo->prefered);

        if (prefered_lft > valid_lft) {
                net_warn_ratelimited("addrconf: prefix option has invalid lifetime\n");
                return;
        }

        /* every exit below this point must go through "put" */
        in6_dev = in6_dev_get(dev);

        if (!in6_dev) {
                net_dbg_ratelimited("addrconf: device %s not configured\n",
                                    dev->name);
                return;
        }

        /* sysctl value read without a lock, hence READ_ONCE() */
        if (valid_lft != 0 &&
            valid_lft < READ_ONCE(in6_dev->cnf.accept_ra_min_lft))
                goto put;

        /*
         *        Two things going on here:
         *        1) Add routes for on-link prefixes
         *        2) Configure prefixes with the auto flag set
         */

        if (pinfo->onlink) {
                struct fib6_info *rt;
                unsigned long rt_expires;

                /* Avoid arithmetic overflow. Really, we could
                 * save rt_expires in seconds, likely valid_lft,
                 * but it would require division in fib gc, that it
                 * not good.
                 */
                if (HZ > USER_HZ)
                        rt_expires = addrconf_timeout_fixup(valid_lft, HZ);
                else
                        rt_expires = addrconf_timeout_fixup(valid_lft, USER_HZ);

                if (addrconf_finite_timeout(rt_expires))
                        rt_expires *= HZ;

                rt = addrconf_get_prefix_route(&pinfo->prefix,
                                               pinfo->prefix_len,
                                               dev,
                                               RTF_ADDRCONF | RTF_PREFIX_RT,
                                               RTF_DEFAULT, true);

                if (rt) {
                        /* Autoconf prefix route */
                        if (valid_lft == 0) {
                                ip6_del_rt(net, rt, false);
                                /* rt is not used again after the delete;
                                 * the release below then sees NULL
                                 */
                                rt = NULL;
                        } else {
                                table = rt->fib6_table;
                                spin_lock_bh(&table->tb6_lock);

                                if (addrconf_finite_timeout(rt_expires)) {
                                        /* not infinity */
                                        fib6_set_expires(rt, jiffies + rt_expires);
                                        fib6_add_gc_list(rt);
                                } else {
                                        fib6_clean_expires(rt);
                                        fib6_remove_gc_list(rt);
                                }

                                spin_unlock_bh(&table->tb6_lock);
                        }
                } else if (valid_lft) {
                        clock_t expires = 0;
                        int flags = RTF_ADDRCONF | RTF_PREFIX_RT;
                        if (addrconf_finite_timeout(rt_expires)) {
                                /* not infinity */
                                flags |= RTF_EXPIRES;
                                expires = jiffies_to_clock_t(rt_expires);
                        }
                        addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
                                              0, dev, expires, flags,
                                              GFP_ATOMIC);
                }
                fib6_info_release(rt);
        }

        /* Try to figure out our local address for this prefix */

        /* cnf.autoconf is a sysctl value read without a lock */
        if (pinfo->autoconf && READ_ONCE(in6_dev->cnf.autoconf)) {
                struct in6_addr addr;
                bool tokenized = false, dev_addr_generated = false;

                if (pinfo->prefix_len == 64) {
                        /* upper 64 bits come from the advertised prefix */
                        memcpy(&addr, &pinfo->prefix, 8);

                        /* lower 64 bits, in order of preference: the
                         * configured token, a stable-privacy identifier,
                         * an identifier from the device (or one inherited
                         * from an existing link-local address)
                         */
                        if (!ipv6_addr_any(&in6_dev->token)) {
                                read_lock_bh(&in6_dev->lock);
                                memcpy(addr.s6_addr + 8,
                                       in6_dev->token.s6_addr + 8, 8);
                                read_unlock_bh(&in6_dev->lock);
                                tokenized = true;
                        } else if (is_addr_mode_generate_stable(in6_dev) &&
                                   !ipv6_generate_stable_address(&addr, 0,
                                                                 in6_dev)) {
                                addr_flags |= IFA_F_STABLE_PRIVACY;
                                goto ok;
                        } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) &&
                                   ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) {
                                /* no identifier available at all */
                                goto put;
                        } else {
                                dev_addr_generated = true;
                        }
                        goto ok;
                }
                net_dbg_ratelimited("IPv6 addrconf: prefix with wrong length %d\n",
                                    pinfo->prefix_len);
                goto put;

ok:
                err = addrconf_prefix_rcv_add_addr(net, dev, pinfo, in6_dev,
                                                   &addr, addr_type,
                                                   addr_flags, sllao,
                                                   tokenized, valid_lft,
                                                   prefered_lft);
                if (err)
                        goto put;

                /* Ignore error case here because previous prefix add addr was
                 * successful which will be notified.
                 */
                ndisc_ops_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, &addr,
                                              addr_type, addr_flags, sllao,
                                              tokenized, valid_lft,
                                              prefered_lft,
                                              dev_addr_generated);
        }
        inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo);
put:
        in6_dev_put(in6_dev);
}

static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev,
                struct in6_ifreq *ireq)
{
        struct ip_tunnel_parm_kern p = { };
        int err;

        if (!(ipv6_addr_type(&ireq->ifr6_addr) & IPV6_ADDR_COMPATv4))
                return -EADDRNOTAVAIL;

        p.iph.daddr = ireq->ifr6_addr.s6_addr32[3];
        p.iph.version = 4;
        p.iph.ihl = 5;
        p.iph.protocol = IPPROTO_IPV6;
        p.iph.ttl = 64;

        if (!dev->netdev_ops->ndo_tunnel_ctl)
                return -EOPNOTSUPP;
        err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, SIOCADDTUNNEL);
        if (err)
                return err;

        dev = __dev_get_by_name(net, p.name);
        if (!dev)
                return -ENOBUFS;
        return dev_open(dev, NULL);
}

/*
 *        Set destination address.
 *        Special case for SIT interfaces where we create a new "virtual"
 *        device.
 *
 *        @arg points to a user-space struct in6_ifreq. Returns -ENODEV when
 *        SIT support is not built in, or when the interface index does not
 *        name a SIT device; -EFAULT when the request cannot be copied in;
 *        otherwise the result of addrconf_set_sit_dstaddr(). The device
 *        lookup and the tunnel setup run under rtnl_lock().
 */
int addrconf_set_dstaddr(struct net *net, void __user *arg)
{
        struct in6_ifreq req;
        struct net_device *dev;
        int ret;

        if (!IS_ENABLED(CONFIG_IPV6_SIT))
                return -ENODEV;

        if (copy_from_user(&req, arg, sizeof(req)))
                return -EFAULT;

        rtnl_lock();
        dev = __dev_get_by_index(net, req.ifr6_ifindex);
        if (!dev || dev->type != ARPHRD_SIT)
                ret = -ENODEV;
        else
                ret = addrconf_set_sit_dstaddr(net, dev, &req);
        rtnl_unlock();

        return ret;
}

/*
 * Join (@join == true) or leave (@join == false) the multicast group @addr
 * on interface @ifindex through socket @sk. The socket is locked around the
 * operation and RTNL must already be held. Returns the join/drop result.
 */
static int ipv6_mc_config(struct sock *sk, bool join,
                          const struct in6_addr *addr, int ifindex)
{
        int rc;

        ASSERT_RTNL();

        lock_sock(sk);
        rc = join ? ipv6_sock_mc_join(sk, ifindex, addr)
                  : ipv6_sock_mc_drop(sk, ifindex, addr);
        release_sock(sk);

        return rc;
}

/*
 *        Manual configuration of address on an interface
 */
static int inet6_addr_add(struct net *net, int ifindex,
                          struct ifa6_config *cfg,
                          struct netlink_ext_ack *extack)
{
        struct inet6_ifaddr *ifp;
        struct inet6_dev *idev;
        struct net_device *dev;
        unsigned long timeout;
        clock_t expires;
        u32 flags;

        ASSERT_RTNL();

        if (cfg->plen > 128) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length");
                return -EINVAL;
        }

        /* check the lifetime */
        if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) {
                NL_SET_ERR_MSG_MOD(extack, "address lifetime invalid");
                return -EINVAL;
        }

        if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64) {
                NL_SET_ERR_MSG_MOD(extack, "address with \"mngtmpaddr\" flag must have a prefix length of 64");
                return -EINVAL;
        }

        dev = __dev_get_by_index(net, ifindex);
        if (!dev)
                return -ENODEV;

        idev = addrconf_add_dev(dev);
        if (IS_ERR(idev)) {
                NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device");
                return PTR_ERR(idev);
        }

        if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
                int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk,
                                         true, cfg->pfx, ifindex);

                if (ret < 0) {
                        NL_SET_ERR_MSG_MOD(extack, "Multicast auto join failed");
                        return ret;
                }
        }

        cfg->scope = ipv6_addr_scope(cfg->pfx);

        timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
        if (addrconf_finite_timeout(timeout)) {
                expires = jiffies_to_clock_t(timeout * HZ);
                cfg->valid_lft = timeout;
                flags = RTF_EXPIRES;
        } else {
                expires = 0;
                flags = 0;
                cfg->ifa_flags |= IFA_F_PERMANENT;
        }

        timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
        if (addrconf_finite_timeout(timeout)) {
                if (timeout == 0)
                        cfg->ifa_flags |= IFA_F_DEPRECATED;
                cfg->preferred_lft = timeout;
        }

        ifp = ipv6_add_addr(idev, cfg, true, extack);
        if (!IS_ERR(ifp)) {
                if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
                        addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
                                              ifp->rt_priority, dev, expires,
                                              flags, GFP_KERNEL);
                }

                /* Send a netlink notification if DAD is enabled and
                 * optimistic flag is not set
                 */
                if (!(ifp->flags & (IFA_F_OPTIMISTIC | IFA_F_NODAD)))
                        ipv6_ifa_notify(0, ifp);
                /*
                 * Note that section 3.1 of RFC 4429 indicates
                 * that the Optimistic flag should not be set for
                 * manually configured addresses
                 */
                addrconf_dad_start(ifp);
                if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR)
                        manage_tempaddrs(idev, ifp, cfg->valid_lft,
                                         cfg->preferred_lft, true, jiffies);
                in6_ifa_put(ifp);
                addrconf_verify_rtnl(net);
                return 0;
        } else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
                ipv6_mc_config(net->ipv6.mc_autojoin_sk, false,
                               cfg->pfx, ifindex);
        }

        return PTR_ERR(ifp);
}

/*
 * Delete the address @pfx/@plen from interface @ifindex in @net.
 *
 * When @ifa_flags carries IFA_F_MANAGETEMPADDR and the matching address is
 * not itself temporary, manage_tempaddrs() is called with zero lifetimes
 * first. If @pfx is a multicast address the auto-join socket also drops
 * the group.
 *
 * Returns 0 on success, -EINVAL for a prefix length above 128, -ENODEV if
 * the interface does not exist, -ENXIO if it has no IPv6 state, and
 * -EADDRNOTAVAIL if no matching address is configured. @extack (may be
 * NULL) receives a message on each failure.
 */
static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags,
                          const struct in6_addr *pfx, unsigned int plen,
                          struct netlink_ext_ack *extack)
{
        struct inet6_dev *in6_dev;
        struct inet6_ifaddr *ifa;
        struct net_device *dev;

        if (plen > 128) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length");
                return -EINVAL;
        }

        dev = __dev_get_by_index(net, ifindex);
        if (!dev) {
                NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface");
                return -ENODEV;
        }

        in6_dev = __in6_dev_get(dev);
        if (!in6_dev) {
                NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device");
                return -ENXIO;
        }

        read_lock_bh(&in6_dev->lock);
        list_for_each_entry(ifa, &in6_dev->addr_list, if_list) {
                if (ifa->prefix_len != plen ||
                    !ipv6_addr_equal(pfx, &ifa->addr))
                        continue;

                /* Pin the match, then leave the list lock before the
                 * teardown calls below.
                 */
                in6_ifa_hold(ifa);
                read_unlock_bh(&in6_dev->lock);

                if (!(ifa->flags & IFA_F_TEMPORARY) &&
                    (ifa_flags & IFA_F_MANAGETEMPADDR))
                        manage_tempaddrs(in6_dev, ifa, 0, 0, false, jiffies);

                ipv6_del_addr(ifa);
                addrconf_verify_rtnl(net);

                if (ipv6_addr_is_multicast(pfx))
                        ipv6_mc_config(net->ipv6.mc_autojoin_sk, false, pfx,
                                       dev->ifindex);
                return 0;
        }
        read_unlock_bh(&in6_dev->lock);

        NL_SET_ERR_MSG_MOD(extack, "address not found");
        return -EADDRNOTAVAIL;
}


/*
 * Add the address described by the user-space struct in6_ifreq at @arg as a
 * permanent address with infinite lifetimes.
 *
 * Requires CAP_NET_ADMIN in the namespace's user namespace (-EPERM
 * otherwise). Returns -EFAULT if the request cannot be copied in, else the
 * result of inet6_addr_add(), which is called under rtnl_lock().
 */
int addrconf_add_ifaddr(struct net *net, void __user *arg)
{
        struct in6_ifreq req;
        struct ifa6_config cfg = {
                .valid_lft = INFINITY_LIFE_TIME,
                .preferred_lft = INFINITY_LIFE_TIME,
                .ifa_flags = IFA_F_PERMANENT,
        };
        int ret;

        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        if (copy_from_user(&req, arg, sizeof(req)))
                return -EFAULT;

        cfg.plen = req.ifr6_prefixlen;
        cfg.pfx = &req.ifr6_addr;

        rtnl_lock();
        ret = inet6_addr_add(net, req.ifr6_ifindex, &cfg, NULL);
        rtnl_unlock();

        return ret;
}

/*
 * Delete the address described by the user-space struct in6_ifreq at @arg.
 *
 * Requires CAP_NET_ADMIN in the namespace's user namespace (-EPERM
 * otherwise). Returns -EFAULT if the request cannot be copied in, else the
 * result of inet6_addr_del(), which is called under rtnl_lock() with no
 * extra address flags.
 */
int addrconf_del_ifaddr(struct net *net, void __user *arg)
{
        struct in6_ifreq req;
        int ret;

        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        if (copy_from_user(&req, arg, sizeof(req)))
                return -EFAULT;

        rtnl_lock();
        ret = inet6_addr_del(net, req.ifr6_ifindex, 0, &req.ifr6_addr,
                             req.ifr6_prefixlen, NULL);
        rtnl_unlock();

        return ret;
}

/*
 * Add @addr/@plen to @idev as a permanent, infinite-lifetime address with the
 * given @scope and @proto. On success the tentative flag is cleared, the
 * IPv6 route generation id of the device's namespace is bumped and
 * RTM_NEWADDR is announced. Failures from ipv6_add_addr() are ignored.
 */
static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
                     int plen, int scope, u8 proto)
{
        struct ifa6_config cfg = {
                .pfx = addr,
                .plen = plen,
                .scope = scope,
                .ifa_proto = proto,
                .ifa_flags = IFA_F_PERMANENT,
                .valid_lft = INFINITY_LIFE_TIME,
                .preferred_lft = INFINITY_LIFE_TIME,
        };
        struct inet6_ifaddr *ifa;

        ifa = ipv6_add_addr(idev, &cfg, true, NULL);
        if (IS_ERR(ifa))
                return;

        /* The tentative flag is cleared right away; no DAD is started here. */
        spin_lock_bh(&ifa->lock);
        ifa->flags &= ~IFA_F_TENTATIVE;
        spin_unlock_bh(&ifa->lock);

        rt_genid_bump_ipv6(dev_net(idev->dev));
        ipv6_ifa_notify(RTM_NEWADDR, ifa);
        in6_ifa_put(ifa);
}

#if IS_ENABLED(CONFIG_IPV6_SIT) || IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE)
/*
 * Derive IPv6 addresses for @idev from IPv4 addresses and add them together
 * with a matching prefix route.
 *
 * The low 32 bits of the candidate address are taken from the device's
 * hardware address (its last four bytes when the hardware address is itself
 * an IPv6 address). For a non-point-to-point SIT device the address is added
 * with IPV6_ADDR_COMPATv4 scope, a /96 prefix and an RTF_NONEXTHOP route;
 * for every other device it becomes an fe80::/64 link-scope address, unless
 * address generation is disabled (IN6_ADDR_GEN_MODE_NONE), in which case
 * nothing is added.
 *
 * If the device address supplies no IPv4 part (low 32 bits are zero), one
 * address is added for each IPv4 address found on every device in the
 * namespace that is up.
 *
 * Caller must hold RTNL.
 */
static void add_v4_addrs(struct inet6_dev *idev)
{
        struct in6_addr addr;
        struct net_device *dev;
        struct net *net = dev_net(idev->dev);
        int scope, plen, offset = 0;
        u32 pflags = 0;

        ASSERT_RTNL();

        memset(&addr, 0, sizeof(struct in6_addr));
        /* in case of IP6GRE the dev_addr is an IPv6 and therefore we use only the last 4 bytes */
        if (idev->dev->addr_len == sizeof(struct in6_addr))
                offset = sizeof(struct in6_addr) - 4;
        memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4);

        if (!(idev->dev->flags & IFF_POINTOPOINT) && idev->dev->type == ARPHRD_SIT) {
                scope = IPV6_ADDR_COMPATv4;
                plen = 96;
                pflags |= RTF_NONEXTHOP;
        } else {
                if (idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_NONE)
                        return;

                addr.s6_addr32[0] = htonl(0xfe800000);
                scope = IFA_LINK;
                plen = 64;
        }

        /* The device address carried an IPv4 part: one address is enough. */
        if (addr.s6_addr32[3]) {
                add_addr(idev, &addr, plen, scope, IFAPROT_UNSPEC);
                addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags,
                                      GFP_KERNEL);
                return;
        }

        /* Otherwise walk the IPv4 addresses of all devices that are up. */
        for_each_netdev(net, dev) {
                struct in_device *in_dev = __in_dev_get_rtnl(dev);
                if (in_dev && (dev->flags & IFF_UP)) {
                        struct in_ifaddr *ifa;
                        /* NOTE(review): flag is initialised once per device,
                         * so IFA_HOST, once set below, also applies to the
                         * remaining addresses of the same device - confirm
                         * this is intended.
                         */
                        int flag = scope;

                        in_dev_for_each_ifa_rtnl(ifa, in_dev) {
                                addr.s6_addr32[3] = ifa->ifa_local;

                                /* link-scope IPv4 addresses are never used */
                                if (ifa->ifa_scope == RT_SCOPE_LINK)
                                        continue;
                                /* host-scope (or narrower) IPv4 addresses are
                                 * skipped on point-to-point devices and marked
                                 * IFA_HOST elsewhere
                                 */
                                if (ifa->ifa_scope >= RT_SCOPE_HOST) {
                                        if (idev->dev->flags&IFF_POINTOPOINT)
                                                continue;
                                        flag |= IFA_HOST;
                                }

                                add_addr(idev, &addr, plen, flag,
                                         IFAPROT_UNSPEC);
                                addrconf_prefix_route(&addr, plen, 0, idev->dev,
                                                      0, pflags, GFP_KERNEL);
                        }
                }
        }
}
#endif

/*
 * Configure the loopback address ::1/128 (host scope) on @dev.
 * Caller must hold RTNL. If the inet6_dev cannot be obtained a debug
 * message is emitted and nothing is added.
 */
static void init_loopback(struct net_device *dev)
{
        struct inet6_dev *lo_idev;

        ASSERT_RTNL();

        lo_idev = ipv6_find_idev(dev);
        if (!IS_ERR(lo_idev)) {
                /* ::1 */
                add_addr(lo_idev, &in6addr_loopback, 128, IFA_HOST,
                         IFAPROT_KERNEL_LO);
                return;
        }

        pr_debug("%s: add_dev failed\n", __func__);
}

/*
 * Add @addr as a permanent /64 link-scope address on @idev, install its
 * prefix route and start duplicate address detection.
 *
 * @flags is OR-ed into the address flags. With CONFIG_IPV6_OPTIMISTIC_DAD,
 * IFA_F_OPTIMISTIC is added when optimistic DAD is enabled either for all
 * devices or for this device, and forwarding is not enabled for all devices.
 *
 * Failures from ipv6_add_addr() are silently ignored.
 */
void addrconf_add_linklocal(struct inet6_dev *idev,
                            const struct in6_addr *addr, u32 flags)
{
        struct ifa6_config cfg = {
                .pfx = addr,
                .plen = 64,
                .ifa_flags = flags | IFA_F_PERMANENT,
                .valid_lft = INFINITY_LIFE_TIME,
                .preferred_lft = INFINITY_LIFE_TIME,
                .scope = IFA_LINK,
                .ifa_proto = IFAPROT_KERNEL_LL
        };
        struct inet6_ifaddr *ifp;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /* All three devconf fields are read without a lock here, so each
         * read is marked with READ_ONCE().
         */
        if ((READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad) ||
             READ_ONCE(idev->cnf.optimistic_dad)) &&
            !READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->forwarding))
                cfg.ifa_flags |= IFA_F_OPTIMISTIC;
#endif

        ifp = ipv6_add_addr(idev, &cfg, true, NULL);
        if (!IS_ERR(ifp)) {
                addrconf_prefix_route(&ifp->addr, ifp->prefix_len, 0, idev->dev,
                                      0, 0, GFP_ATOMIC);
                addrconf_dad_start(ifp);
                in6_ifa_put(ifp);
        }
}
EXPORT_SYMBOL_GPL(addrconf_add_linklocal);

static bool ipv6_reserved_interfaceid(struct in6_addr address)
{
        if ((address.s6_addr32[2] | address.s6_addr32[3]) == 0)
                return true;

        if (address.s6_addr32[2] == htonl(0x02005eff) &&
            ((address.s6_addr32[3] & htonl(0xfe000000)) == htonl(0xfe000000)))
                return true;

        if (address.s6_addr32[2] == htonl(0xfdffffff) &&
            ((address.s6_addr32[3] & htonl(0xffffff80)) == htonl(0xffffff80)))
                return true;

        return false;
}

/*
 * Fill the lower 64 bits of @address with an interface identifier derived
 * from a secret, so that the same inputs always give the same address.
 *
 * A single SHA-1 block is built from the secret, the upper 64 bits of
 * @address (the prefix), the device's permanent hardware address and
 * @dad_count; the first two words of the resulting digest become the
 * interface identifier. The secret is the per-device stable_secret if it is
 * initialised, otherwise the namespace default one.
 *
 * If the generated identifier is one that ipv6_reserved_interfaceid()
 * rejects, @dad_count is incremented and the computation is retried until
 * it exceeds the idgen_retries sysctl.
 *
 * Returns 0 and updates *@address on success. Returns -1 with *@address
 * untouched when no secret is configured or the retries are exhausted.
 */
static int ipv6_generate_stable_address(struct in6_addr *address,
                                        u8 dad_count,
                                        const struct inet6_dev *idev)
{
        /* The scratch buffers below are static and shared by all callers;
         * this lock serialises their use.
         */
        static DEFINE_SPINLOCK(lock);
        static __u32 digest[SHA1_DIGEST_WORDS];
        static __u32 workspace[SHA1_WORKSPACE_WORDS];

        /* Hash input, laid out to fill exactly one SHA-1 block (enforced by
         * the BUILD_BUG_ON below).
         */
        static union {
                char __data[SHA1_BLOCK_SIZE];
                struct {
                        struct in6_addr secret;
                        __be32 prefix[2];
                        unsigned char hwaddr[MAX_ADDR_LEN];
                        u8 dad_count;
                } __packed;
        } data;

        struct in6_addr secret;
        struct in6_addr temp;
        struct net *net = dev_net(idev->dev);

        BUILD_BUG_ON(sizeof(data.__data) != sizeof(data));

        /* Prefer the per-device secret, fall back to the namespace default. */
        if (idev->cnf.stable_secret.initialized)
                secret = idev->cnf.stable_secret.secret;
        else if (net->ipv6.devconf_dflt->stable_secret.initialized)
                secret = net->ipv6.devconf_dflt->stable_secret.secret;
        else
                return -1;

retry:
        spin_lock_bh(&lock);

        sha1_init(digest);
        memset(&data, 0, sizeof(data));
        memset(workspace, 0, sizeof(workspace));
        memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len);
        data.prefix[0] = address->s6_addr32[0];
        data.prefix[1] = address->s6_addr32[1];
        data.secret = secret;
        data.dad_count = dad_count;

        sha1_transform(digest, data.__data, workspace);

        /* Work on a copy so *address stays intact if we have to give up. */
        temp = *address;
        temp.s6_addr32[2] = (__force __be32)digest[0];
        temp.s6_addr32[3] = (__force __be32)digest[1];

        spin_unlock_bh(&lock);

        if (ipv6_reserved_interfaceid(temp)) {
                dad_count++;
                if (dad_count > dev_net(idev->dev)->ipv6.sysctl.idgen_retries)
                        return -1;
                goto retry;
        }

        *address = temp;
        return 0;
}

/*
 * Make sure @idev has a stable secret: if none is initialised yet, fill it
 * with random bytes and mark it initialised. An existing secret is kept.
 */
static void ipv6_gen_mode_random_init(struct inet6_dev *idev)
{
        struct ipv6_stable_secret *secret = &idev->cnf.stable_secret;

        if (!secret->initialized) {
                get_random_bytes(&secret->secret, sizeof(secret->secret));
                secret->initialized = true;
        }
}

/*
 * Generate and add the fe80::/64 link-local address for @idev according to
 * its addr_gen_mode.
 *
 * Nothing is done for L3 master devices, devices flagged IFF_NO_ADDRCONF,
 * or when the mode is IN6_ADDR_GEN_MODE_NONE (or unknown). If the
 * identifier cannot be generated and @prefix_route is true, only the
 * fe80::/64 prefix route is installed; addrconf_add_linklocal() installs
 * that route itself on the success path.
 */
static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
{
        struct net_device *dev = idev->dev;
        struct in6_addr ll_addr;

        /* no link local addresses on L3 master devices, nor on devices
         * flagged as slaves
         */
        if (netif_is_l3_master(dev) || (dev->priv_flags & IFF_NO_ADDRCONF))
                return;

        ipv6_addr_set(&ll_addr, htonl(0xFE800000), 0, 0, 0);

        switch (idev->cnf.addr_gen_mode) {
        case IN6_ADDR_GEN_MODE_RANDOM:
                ipv6_gen_mode_random_init(idev);
                fallthrough;
        case IN6_ADDR_GEN_MODE_STABLE_PRIVACY:
                if (ipv6_generate_stable_address(&ll_addr, 0, idev) == 0) {
                        addrconf_add_linklocal(idev, &ll_addr,
                                               IFA_F_STABLE_PRIVACY);
                        return;
                }
                break;
        case IN6_ADDR_GEN_MODE_EUI64:
                if (ipv6_generate_eui64(ll_addr.s6_addr + 8, dev) == 0) {
                        addrconf_add_linklocal(idev, &ll_addr, 0);
                        return;
                }
                break;
        case IN6_ADDR_GEN_MODE_NONE:
        default:
                /* will not add any link local address */
                return;
        }

        /* Identifier generation failed: fall back to the bare prefix route. */
        if (prefix_route)
                addrconf_prefix_route(&ll_addr, 64, 0, dev, 0, 0, GFP_KERNEL);
}

/*
 * Default autoconfiguration for @dev: generate its link-local address when
 * the device type is one of the supported ones listed below. For any other
 * type only multicast is brought up, and only if the device has IPv6 state
 * and is both up and multicast-capable.
 *
 * Caller must hold RTNL.
 */
static void addrconf_dev_config(struct net_device *dev)
{
        struct inet6_dev *idev;

        ASSERT_RTNL();

        switch (dev->type) {
        case ARPHRD_ETHER:
        case ARPHRD_FDDI:
        case ARPHRD_ARCNET:
        case ARPHRD_INFINIBAND:
        case ARPHRD_IEEE1394:
        case ARPHRD_TUNNEL6:
        case ARPHRD_6LOWPAN:
        case ARPHRD_TUNNEL:
        case ARPHRD_NONE:
        case ARPHRD_RAWIP:
                break;
        default:
                /* Alas, we support only Ethernet autoconfiguration. */
                idev = __in6_dev_get(dev);
                if (!IS_ERR_OR_NULL(idev) &&
                    (dev->flags & IFF_UP) &&
                    (dev->flags & IFF_MULTICAST))
                        ipv6_mc_up(idev);
                return;
        }

        idev = addrconf_add_dev(dev);
        if (IS_ERR(idev))
                return;

        /* this device type has no EUI support */
        if (dev->type == ARPHRD_NONE &&
            idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)
                WRITE_ONCE(idev->cnf.addr_gen_mode, IN6_ADDR_GEN_MODE_RANDOM);

        addrconf_addr_gen(idev, false);
}

#if IS_ENABLED(CONFIG_IPV6_SIT)
/*
 * Autoconfigure a SIT tunnel device. ISATAP devices get a generated
 * link-local address; all others get addresses derived from IPv4 addresses
 * via add_v4_addrs(), plus a multicast route when the tunnel is
 * point-to-point.
 *
 * Caller must hold RTNL.
 */
static void addrconf_sit_config(struct net_device *dev)
{
        struct inet6_dev *sit_idev;

        ASSERT_RTNL();

        sit_idev = ipv6_find_idev(dev);
        if (IS_ERR(sit_idev)) {
                pr_debug("%s: add_dev failed\n", __func__);
                return;
        }

        if (dev->priv_flags & IFF_ISATAP) {
                addrconf_addr_gen(sit_idev, false);
        } else {
                /*
                 * Configure the tunnel with one of our IPv4
                 * addresses... we should configure all of
                 * our v4 addrs in the tunnel
                 */
                add_v4_addrs(sit_idev);

                if (dev->flags & IFF_POINTOPOINT)
                        addrconf_add_mroute(dev);
        }
}
#endif

#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE)
/*
 * Autoconfigure a GRE device. When the device type is ARPHRD_ETHER a
 * link-local address is generated (falling back to the bare prefix route);
 * otherwise addresses are derived via add_v4_addrs() and a multicast route
 * is added for point-to-point devices.
 *
 * Caller must hold RTNL.
 */
static void addrconf_gre_config(struct net_device *dev)
{
        struct inet6_dev *gre_idev;

        ASSERT_RTNL();

        gre_idev = ipv6_find_idev(dev);
        if (IS_ERR(gre_idev)) {
                pr_debug("%s: add_dev failed\n", __func__);
                return;
        }

        if (dev->type != ARPHRD_ETHER) {
                add_v4_addrs(gre_idev);

                if (dev->flags & IFF_POINTOPOINT)
                        addrconf_add_mroute(dev);
                return;
        }

        addrconf_addr_gen(gre_idev, true);
}
#endif

/*
 * Pick the autoconfiguration routine matching the hardware type of @dev:
 * SIT and GRE tunnels (when built in) and loopback have dedicated handlers,
 * everything else goes through addrconf_dev_config().
 */
static void addrconf_init_auto_addrs(struct net_device *dev)
{
        switch (dev->type) {
#if IS_ENABLED(CONFIG_IPV6_SIT)
        case ARPHRD_SIT:
                addrconf_sit_config(dev);
                return;
#endif
#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE)
        case ARPHRD_IP6GRE:
        case ARPHRD_IPGRE:
                addrconf_gre_config(dev);
                return;
#endif
        case ARPHRD_LOOPBACK:
                init_loopback(dev);
                return;
        default:
                addrconf_dev_config(dev);
                return;
        }
}

static int fixup_permanent_addr(struct net *net,
                                struct inet6_dev *idev,
                                struct inet6_ifaddr *ifp)
{
        /* !fib6_node means the host route was removed from the
         * FIB, for example, if 'lo' device is taken down. In that
         * case regenerate the host route.
         */
        if (!ifp->rt || !ifp->rt->fib6_node) {
                struct fib6_info *f6i, *prev;

                f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false,
                                         GFP_ATOMIC, NULL);
                if (IS_ERR(f6i))
                        return PTR_ERR(f6i);

                /* ifp->rt can be accessed outside of rtnl */
                spin_lock(&ifp->lock);
                prev = ifp->rt;
                ifp->rt = f6i;
                spin_unlock(&ifp->lock);

                fib6_info_release(prev);
        }

        if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
                addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
                                      ifp->rt_priority, idev->dev, 0, 0,
                                      GFP_ATOMIC);
        }

        if (ifp->state == INET6_IFADDR_STATE_PREDAD)
                addrconf_dad_start(ifp);

        return 0;
}

/*
 * Restore routes for every permanent address configured on @dev.
 *
 * Each address flagged IFA_F_PERMANENT is passed to fixup_permanent_addr();
 * an address whose routes cannot be restored is reported and deleted.
 * Does nothing if @dev has no IPv6 state.
 */
static void addrconf_permanent_addr(struct net *net, struct net_device *dev)
{
        struct inet6_ifaddr *ifp, *tmp;
        struct inet6_dev *idev;

        idev = __in6_dev_get(dev);
        if (!idev)
                return;

        write_lock_bh(&idev->lock);

        list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
                if (!(ifp->flags & IFA_F_PERMANENT) ||
                    fixup_permanent_addr(net, idev, ifp) >= 0)
                        continue;

                /* Report before deleting: the reference taken below is not
                 * dropped by this function, i.e. it is handed over to
                 * ipv6_del_addr(), so ifp must not be dereferenced once
                 * that call has returned.
                 */
                net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n",
                                     idev->dev->name, &ifp->addr);

                /* The deletion runs with idev->lock released. */
                write_unlock_bh(&idev->lock);
                in6_ifa_hold(ifp);
                ipv6_del_addr(ifp);
                write_lock_bh(&idev->lock);
        }

        write_unlock_bh(&idev->lock);
}

/*
 * Netdevice notifier callback: keeps the IPv6 per-device state in step with
 * netdevice events.
 *
 *   NETDEV_REGISTER        - create the inet6_dev if the MTU allows IPv6.
 *   NETDEV_CHANGEMTU       - stop IPv6 if the MTU dropped below
 *                            IPV6_MIN_MTU; otherwise propagate the new MTU,
 *                            or create the inet6_dev and, if it is ready,
 *                            continue as for NETDEV_UP/NETDEV_CHANGE.
 *   NETDEV_UP/NETDEV_CHANGE - once the link is ready, configure automatic
 *                            addresses, run pending DAD, sync routes and MTU.
 *   NETDEV_DOWN/NETDEV_UNREGISTER - remove the addresses of the interface.
 *   NETDEV_CHANGENAME      - re-register the sysctl and snmp6 entries.
 *   NETDEV_PRE/POST_TYPE_CHANGE - forward to addrconf_type_change().
 *   NETDEV_CHANGEUPPER     - flush when an L3 master is (un)linked.
 *
 * Returns NOTIFY_OK, or an errno wrapped by notifier_from_errno() when
 * creating the inet6_dev (REGISTER) or re-registering (CHANGENAME) fails.
 */
static int addrconf_notify(struct notifier_block *this, unsigned long event,
                           void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct netdev_notifier_change_info *change_info;
        struct netdev_notifier_changeupper_info *info;
        struct inet6_dev *idev = __in6_dev_get(dev);
        struct net *net = dev_net(dev);
        /* set when DAD should be (re)run after addresses are configured */
        int run_pending = 0;
        int err;

        switch (event) {
        case NETDEV_REGISTER:
                if (!idev && dev->mtu >= IPV6_MIN_MTU) {
                        idev = ipv6_add_dev(dev);
                        if (IS_ERR(idev))
                                return notifier_from_errno(PTR_ERR(idev));
                }
                break;

        case NETDEV_CHANGEMTU:
                /* if MTU under IPV6_MIN_MTU stop IPv6 on this interface. */
                if (dev->mtu < IPV6_MIN_MTU) {
                        addrconf_ifdown(dev, dev != net->loopback_dev);
                        break;
                }

                if (idev) {
                        rt6_mtu_change(dev, dev->mtu);
                        WRITE_ONCE(idev->cnf.mtu6, dev->mtu);
                        break;
                }

                /* allocate new idev */
                idev = ipv6_add_dev(dev);
                if (IS_ERR(idev))
                        break;

                /* device is still not ready */
                if (!(idev->if_flags & IF_READY))
                        break;

                run_pending = 1;
                fallthrough;
        case NETDEV_UP:
        case NETDEV_CHANGE:
                if (idev && idev->cnf.disable_ipv6)
                        break;

                /* No address configuration on such devices; only bring
                 * multicast up when the device itself comes up.
                 */
                if (dev->priv_flags & IFF_NO_ADDRCONF) {
                        if (event == NETDEV_UP && !IS_ERR_OR_NULL(idev) &&
                            dev->flags & IFF_UP && dev->flags & IFF_MULTICAST)
                                ipv6_mc_up(idev);
                        break;
                }

                if (event == NETDEV_UP) {
                        /* restore routes for permanent addresses */
                        addrconf_permanent_addr(net, dev);

                        if (!addrconf_link_ready(dev)) {
                                /* device is not ready yet. */
                                pr_debug("ADDRCONF(NETDEV_UP): %s: link is not ready\n",
                                         dev->name);
                                break;
                        }

                        /* idev may hold an error pointer from here on, hence
                         * the IS_ERR_OR_NULL() checks below.
                         */
                        if (!idev && dev->mtu >= IPV6_MIN_MTU)
                                idev = ipv6_add_dev(dev);

                        if (!IS_ERR_OR_NULL(idev)) {
                                idev->if_flags |= IF_READY;
                                run_pending = 1;
                        }
                } else if (event == NETDEV_CHANGE) {
                        if (!addrconf_link_ready(dev)) {
                                /* device is still not ready. */
                                rt6_sync_down_dev(dev, event);
                                break;
                        }

                        if (!IS_ERR_OR_NULL(idev)) {
                                if (idev->if_flags & IF_READY) {
                                        /* device is already configured -
                                         * but resend MLD reports, we might
                                         * have roamed and need to update
                                         * multicast snooping switches
                                         */
                                        ipv6_mc_up(idev);
                                        change_info = ptr;
                                        if (change_info->flags_changed & IFF_NOARP)
                                                addrconf_dad_run(idev, true);
                                        rt6_sync_up(dev, RTNH_F_LINKDOWN);
                                        break;
                                }
                                idev->if_flags |= IF_READY;
                        }

                        pr_debug("ADDRCONF(NETDEV_CHANGE): %s: link becomes ready\n",
                                 dev->name);

                        run_pending = 1;
                }

                /* Reached for UP, CHANGE and the CHANGEMTU fallthrough. */
                addrconf_init_auto_addrs(dev);

                if (!IS_ERR_OR_NULL(idev)) {
                        if (run_pending)
                                addrconf_dad_run(idev, false);

                        /* Device has an address by now */
                        rt6_sync_up(dev, RTNH_F_DEAD);

                        /*
                         * If the MTU changed during the interface down,
                         * when the interface up, the changed MTU must be
                         * reflected in the idev as well as routers.
                         */
                        if (idev->cnf.mtu6 != dev->mtu &&
                            dev->mtu >= IPV6_MIN_MTU) {
                                rt6_mtu_change(dev, dev->mtu);
                                WRITE_ONCE(idev->cnf.mtu6, dev->mtu);
                        }
                        WRITE_ONCE(idev->tstamp, jiffies);
                        inet6_ifinfo_notify(RTM_NEWLINK, idev);

                        /*
                         * If the changed mtu during down is lower than
                         * IPV6_MIN_MTU stop IPv6 on this interface.
                         */
                        if (dev->mtu < IPV6_MIN_MTU)
                                addrconf_ifdown(dev, dev != net->loopback_dev);
                }
                break;

        case NETDEV_DOWN:
        case NETDEV_UNREGISTER:
                /*
                 *        Remove all addresses from this interface.
                 */
                addrconf_ifdown(dev, event != NETDEV_DOWN);
                break;

        case NETDEV_CHANGENAME:
                /* Tear down and re-create the sysctl and snmp6 entries. */
                if (idev) {
                        snmp6_unregister_dev(idev);
                        addrconf_sysctl_unregister(idev);
                        err = addrconf_sysctl_register(idev);
                        if (err)
                                return notifier_from_errno(err);
                        err = snmp6_register_dev(idev);
                        if (err) {
                                addrconf_sysctl_unregister(idev);
                                return notifier_from_errno(err);
                        }
                }
                break;

        case NETDEV_PRE_TYPE_CHANGE:
        case NETDEV_POST_TYPE_CHANGE:
                if (idev)
                        addrconf_type_change(dev, event);
                break;

        case NETDEV_CHANGEUPPER:
                info = ptr;

                /* flush all routes if dev is linked to or unlinked from
                 * an L3 master device (e.g., VRF)
                 */
                if (info->upper_dev && netif_is_l3_master(info->upper_dev))
                        addrconf_ifdown(dev, false);
        }

        return NOTIFY_OK;
}

/*
 *        addrconf module should be notified of a device going up
 *
 *        Notifier block that delivers netdevice events to addrconf_notify()
 *        at priority ADDRCONF_NOTIFY_PRIORITY.
 */
static struct notifier_block ipv6_dev_notf = {
        .notifier_call = addrconf_notify,
        .priority = ADDRCONF_NOTIFY_PRIORITY,
};

/*
 * Handle a hardware type change of @dev: unmap the multicast state before
 * the change (NETDEV_PRE_TYPE_CHANGE) and remap it afterwards
 * (NETDEV_POST_TYPE_CHANGE). Other events are ignored.
 *
 * Caller must hold RTNL.
 */
static void addrconf_type_change(struct net_device *dev, unsigned long event)
{
        struct inet6_dev *in6_dev;

        ASSERT_RTNL();

        in6_dev = __in6_dev_get(dev);

        switch (event) {
        case NETDEV_POST_TYPE_CHANGE:
                ipv6_mc_remap(in6_dev);
                break;
        case NETDEV_PRE_TYPE_CHANGE:
                ipv6_mc_unmap(in6_dev);
                break;
        default:
                break;
        }
}

/* True if @addr is a link-local or loopback address. */
static bool addr_is_local(const struct in6_addr *addr)
{
        const int local_types = IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK;

        return (ipv6_addr_type(addr) & local_types) != 0;
}

/*
 * Tear down the IPv6 address configuration of @dev.
 *
 * @dev:        device that is going down or being unregistered
 * @unregister: true when the device is being unregistered; in addition to
 *              the address cleanup, the inet6_dev is marked dead, detached
 *              from @dev and a reference on it is dropped at the end
 *
 * Caller must hold RTNL. Returns -ENODEV if @dev has no inet6_dev and 0
 * otherwise.
 *
 * On a plain down event with IPv6 still enabled on the interface and
 * keep_addr_on_down set (the "all" setting is consulted first, the
 * per-interface one only if that is zero), permanent addresses that are
 * neither link-local nor loopback are kept: they stay hashed and on
 * addr_list and are reset to PREDAD (and flagged tentative unless
 * IFA_F_NODAD). Every other address is marked DEAD and unlinked.
 */
static int addrconf_ifdown(struct net_device *dev, bool unregister)
{
        unsigned long event = unregister ? NETDEV_UNREGISTER : NETDEV_DOWN;
        struct net *net = dev_net(dev);
        struct inet6_dev *idev;
        struct inet6_ifaddr *ifa;
        LIST_HEAD(tmp_addr_list);
        bool keep_addr = false;
        bool was_ready;
        int state, i;

        ASSERT_RTNL();

        rt6_disable_ip(dev, event);

        idev = __in6_dev_get(dev);
        if (!idev)
                return -ENODEV;

        /*
         * Step 1: remove reference to ipv6 device from parent device.
         *           Do not dev_put!
         */
        if (unregister) {
                idev->dead = 1;

                /* protected by rtnl_lock */
                RCU_INIT_POINTER(dev->ip6_ptr, NULL);

                /* Step 1.5: remove snmp6 entry */
                snmp6_unregister_dev(idev);

        }

        /* combine the user config with event to determine if permanent
         * addresses are to be removed from address hash table
         */
        if (!unregister && !idev->cnf.disable_ipv6) {
                /* aggregate the system setting and interface setting */
                int _keep_addr = READ_ONCE(net->ipv6.devconf_all->keep_addr_on_down);

                if (!_keep_addr)
                        _keep_addr = READ_ONCE(idev->cnf.keep_addr_on_down);

                keep_addr = (_keep_addr > 0);
        }

        /* Step 2: clear hash table */
        for (i = 0; i < IN6_ADDR_HSIZE; i++) {
                struct hlist_head *h = &net->ipv6.inet6_addr_lst[i];

                spin_lock_bh(&net->ipv6.addrconf_hash_lock);
restart:
                hlist_for_each_entry_rcu(ifa, h, addr_lst) {
                        if (ifa->idev == idev) {
                                addrconf_del_dad_work(ifa);
                                /* combined flag + permanent flag decide if
                                 * address is retained on a down event
                                 */
                                if (!keep_addr ||
                                    !(ifa->flags & IFA_F_PERMANENT) ||
                                    addr_is_local(&ifa->addr)) {
                                        hlist_del_init_rcu(&ifa->addr_lst);
                                        /* bucket was modified: rescan it
                                         * from the head
                                         */
                                        goto restart;
                                }
                        }
                }
                spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
        }

        write_lock_bh(&idev->lock);

        addrconf_del_rs_timer(idev);

        /* Step 2 (continued): clear flags for stateless addrconf, repeated
         *         down detection. IF_READY is sampled first because Step 5
         *         needs the value from before it is cleared.
         */
        was_ready = idev->if_flags & IF_READY;
        if (!unregister)
                idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY);

        /* Step 3: clear tempaddr list */
        while (!list_empty(&idev->tempaddr_list)) {
                ifa = list_first_entry(&idev->tempaddr_list,
                                       struct inet6_ifaddr, tmp_list);
                list_del(&ifa->tmp_list);
                /* idev->lock is released while ifa->lock is taken and the
                 * references are dropped, then re-taken for the next pass
                 */
                write_unlock_bh(&idev->lock);
                spin_lock_bh(&ifa->lock);

                if (ifa->ifpub) {
                        in6_ifa_put(ifa->ifpub);
                        ifa->ifpub = NULL;
                }
                spin_unlock_bh(&ifa->lock);
                in6_ifa_put(ifa);
                write_lock_bh(&idev->lock);
        }

        /* Step 4: snapshot addr_list onto a private list so that each
         *         address can be processed below without idev->lock held
         */
        list_for_each_entry(ifa, &idev->addr_list, if_list)
                list_add_tail(&ifa->if_list_aux, &tmp_addr_list);
        write_unlock_bh(&idev->lock);

        while (!list_empty(&tmp_addr_list)) {
                struct fib6_info *rt = NULL;
                bool keep;

                ifa = list_first_entry(&tmp_addr_list,
                                       struct inet6_ifaddr, if_list_aux);
                list_del(&ifa->if_list_aux);

                addrconf_del_dad_work(ifa);

                /* same retention rule as used for the hash table above */
                keep = keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
                        !addr_is_local(&ifa->addr);

                spin_lock_bh(&ifa->lock);

                if (keep) {
                        /* set state to skip the notifier below */
                        state = INET6_IFADDR_STATE_DEAD;
                        ifa->state = INET6_IFADDR_STATE_PREDAD;
                        if (!(ifa->flags & IFA_F_NODAD))
                                ifa->flags |= IFA_F_TENTATIVE;

                        /* detach the address route; it is deleted once
                         * ifa->lock has been released
                         */
                        rt = ifa->rt;
                        ifa->rt = NULL;
                } else {
                        /* remember the previous state: addresses that were
                         * already DEAD are not notified again below
                         */
                        state = ifa->state;
                        ifa->state = INET6_IFADDR_STATE_DEAD;
                }

                spin_unlock_bh(&ifa->lock);

                if (rt)
                        ip6_del_rt(net, rt, false);

                if (state != INET6_IFADDR_STATE_DEAD) {
                        __ipv6_ifa_notify(RTM_DELADDR, ifa);
                        inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
                } else {
                        if (idev->cnf.forwarding)
                                addrconf_leave_anycast(ifa);
                        addrconf_leave_solict(ifa->idev, &ifa->addr);
                }

                if (!keep) {
                        write_lock_bh(&idev->lock);
                        list_del_rcu(&ifa->if_list);
                        write_unlock_bh(&idev->lock);
                        in6_ifa_put(ifa);
                }
        }

        /* Step 5: Discard anycast and multicast list */
        if (unregister) {
                ipv6_ac_destroy_dev(idev);
                ipv6_mc_destroy_dev(idev);
        } else if (was_ready) {
                ipv6_mc_down(idev);
        }

        WRITE_ONCE(idev->tstamp, jiffies);
        idev->ra_mtu = 0;

        /* Last: Shot the device (if unregistered) */
        if (unregister) {
                addrconf_sysctl_unregister(idev);
                neigh_parms_release(&nd_tbl, idev->nd_parms);
                neigh_ifdown(&nd_tbl, dev);
                in6_dev_put(idev);
        }
        return 0;
}

/*
 * Router solicitation timer handler.
 *
 * Sends another router solicitation from a non-tentative link-local
 * address and re-arms the timer, as long as the interface is alive and
 * ready, accepts router advertisements, has not received one yet and the
 * probe budget (cnf.rtr_solicits, negative meaning unlimited) is not used
 * up. Always drops one reference on the inet6_dev before returning.
 */
static void addrconf_rs_timer(struct timer_list *t)
{
        struct inet6_dev *idev = from_timer(idev, t, rs_timer);
        struct net_device *dev = idev->dev;
        struct in6_addr lladdr;
        int max_probes;

        write_lock(&idev->lock);

        /* Stop when the interface is dead or not ready, does not accept
         * RAs, or an announcement arrived after the solicitation was sent.
         */
        if (idev->dead || !(idev->if_flags & IF_READY) ||
            !ipv6_accept_ra(idev) ||
            (idev->if_flags & IF_RA_RCVD))
                goto out;

        max_probes = READ_ONCE(idev->cnf.rtr_solicits);

        /* The probe counter is bumped on every expiry, budget left or not */
        if (idev->rs_probes++ >= max_probes && max_probes >= 0) {
                /*
                 * Note: we do not support deprecated "all on-link"
                 * assumption any longer.
                 */
                pr_debug("%s: no IPv6 routers present\n", idev->dev->name);
                goto out;
        }

        /* The solicitation is sent without idev->lock held */
        write_unlock(&idev->lock);
        if (ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE))
                goto put;
        ndisc_send_rs(dev, &lladdr, &in6addr_linklocal_allrouters);

        write_lock(&idev->lock);
        idev->rs_interval = rfc3315_s14_backoff_update(
                        idev->rs_interval,
                        READ_ONCE(idev->cnf.rtr_solicit_max_interval));

        /* The wait after the last probe can be shorter */
        if (idev->rs_probes == READ_ONCE(idev->cnf.rtr_solicits))
                addrconf_mod_rs_timer(idev,
                                      READ_ONCE(idev->cnf.rtr_solicit_delay));
        else
                addrconf_mod_rs_timer(idev, idev->rs_interval);

out:
        write_unlock(&idev->lock);
put:
        in6_dev_put(idev);
}

/*
 *        Duplicate Address Detection
 */
/*
 * Arm the DAD work for @ifp.
 *
 * Optimistic addresses are scheduled without delay; all others after a
 * random delay below cnf.rtr_solicit_delay (or below 1 when that setting
 * is zero). When enhanced DAD is enabled on the interface or in the "all"
 * settings, a non-zero random nonce is filled into the low 6 bytes of
 * dad_nonce; otherwise the nonce is 0. dad_probes is reloaded from
 * cnf.dad_transmits.
 */
static void addrconf_dad_kick(struct inet6_ifaddr *ifp)
{
        struct inet6_dev *idev = ifp->idev;
        unsigned long delay = 0;
        u64 nonce = 0;

        if (!(ifp->flags & IFA_F_OPTIMISTIC))
                delay = get_random_u32_below(
                                READ_ONCE(idev->cnf.rtr_solicit_delay) ? : 1);

        if (READ_ONCE(idev->cnf.enhanced_dad) ||
            READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->enhanced_dad)) {
                /* retry until the generated nonce is non-zero */
                do {
                        get_random_bytes(&nonce, 6);
                } while (nonce == 0);
        }

        ifp->dad_nonce = nonce;
        ifp->dad_probes = READ_ONCE(idev->cnf.dad_transmits);
        addrconf_mod_dad_work(ifp, delay);
}

/*
 * Begin duplicate address detection for @ifp.
 *
 * Calls addrconf_join_solict() for the address and then, with idev->lock
 * (read) and ifp->lock held, picks one of:
 *  - nothing further, if the address is already DEAD;
 *  - immediate completion through addrconf_dad_completed(), if the device
 *    is NOARP/loopback, accept_dad is below 1 both globally and on the
 *    interface, or the address is not tentative or is flagged NODAD;
 *  - addrconf_dad_stop(), if the interface is not IF_READY;
 *  - otherwise arming the DAD work via addrconf_dad_kick().
 *
 * Both locks are always released before calling out to
 * addrconf_dad_completed(), addrconf_dad_stop() or ipv6_ifa_notify().
 */
static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
{
        struct inet6_dev *idev = ifp->idev;
        struct net_device *dev = idev->dev;
        bool bump_id, notify = false;
        struct net *net;

        addrconf_join_solict(dev, &ifp->addr);

        read_lock_bh(&idev->lock);
        spin_lock(&ifp->lock);
        if (ifp->state == INET6_IFADDR_STATE_DEAD)
                goto out;

        net = dev_net(dev);
        /* DAD is not performed for this address: treat it as completed */
        if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
            (READ_ONCE(net->ipv6.devconf_all->accept_dad) < 1 &&
             READ_ONCE(idev->cnf.accept_dad) < 1) ||
            !(ifp->flags&IFA_F_TENTATIVE) ||
            ifp->flags & IFA_F_NODAD) {
                bool send_na = false;

                /* flags are sampled before they are cleared just below */
                if (ifp->flags & IFA_F_TENTATIVE &&
                    !(ifp->flags & IFA_F_OPTIMISTIC))
                        send_na = true;
                bump_id = ifp->flags & IFA_F_TENTATIVE;
                ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
                spin_unlock(&ifp->lock);
                read_unlock_bh(&idev->lock);

                addrconf_dad_completed(ifp, bump_id, send_na);
                return;
        }

        if (!(idev->if_flags & IF_READY)) {
                spin_unlock(&ifp->lock);
                read_unlock_bh(&idev->lock);
                /*
                 * If the device is not ready:
                 * - keep it tentative if it is a permanent address.
                 * - otherwise, kill it.
                 */
                /* NOTE(review): the extra hold is presumably consumed by
                 * addrconf_dad_stop() - confirm against its definition
                 */
                in6_ifa_hold(ifp);
                addrconf_dad_stop(ifp, 0);
                return;
        }

        /*
         * Optimistic nodes can start receiving
         * Frames right away
         */
        if (ifp->flags & IFA_F_OPTIMISTIC) {
                ip6_ins_rt(net, ifp->rt);
                if (ipv6_use_optimistic_addr(net, idev)) {
                        /* Because optimistic nodes can use this address,
                         * notify listeners. If DAD fails, RTM_DELADDR is sent.
                         */
                        notify = true;
                }
        }

        addrconf_dad_kick(ifp);
out:
        spin_unlock(&ifp->lock);
        read_unlock_bh(&idev->lock);
        /* the notification is sent only after both locks are dropped */
        if (notify)
                ipv6_ifa_notify(RTM_NEWADDR, ifp);
}

/*
 * Request DAD for @ifp: unless the address is already DEAD, move it to
 * PREDAD under ifp->lock and schedule the DAD work without delay.
 */
static void addrconf_dad_start(struct inet6_ifaddr *ifp)
{
        bool alive;

        spin_lock_bh(&ifp->lock);
        alive = ifp->state != INET6_IFADDR_STATE_DEAD;
        if (alive)
                ifp->state = INET6_IFADDR_STATE_PREDAD;
        spin_unlock_bh(&ifp->lock);

        if (!alive)
                return;

        addrconf_mod_dad_work(ifp, 0);
}

/*
 * Delayed-work handler driving the DAD state machine of one address.
 *
 * Runs under rtnl_lock(). Depending on the address state it either
 *  - starts DAD (PREDAD -> DAD, via addrconf_dad_begin()),
 *  - aborts it (ERRDAD -> POSTDAD, via addrconf_dad_stop()), optionally
 *    disabling IPv6 on the interface when accept_dad > 1 and the failed
 *    address is the EUI-64 based link-local one, or
 *  - processes one DAD round: completes DAD when no probes are left, else
 *    decrements dad_probes, re-arms the work and sends a neighbour
 *    solicitation for the address.
 *
 * Every path ends by dropping one reference on the address.
 * NOTE(review): presumably the reference taken when the work was queued -
 * confirm in addrconf_mod_dad_work().
 */
static void addrconf_dad_work(struct work_struct *w)
{
        struct inet6_ifaddr *ifp = container_of(to_delayed_work(w),
                                                struct inet6_ifaddr,
                                                dad_work);
        struct inet6_dev *idev = ifp->idev;
        bool bump_id, disable_ipv6 = false;
        struct in6_addr mcaddr;

        /* what to do once ifp->lock has been released again */
        enum {
                DAD_PROCESS,
                DAD_BEGIN,
                DAD_ABORT,
        } action = DAD_PROCESS;

        rtnl_lock();

        spin_lock_bh(&ifp->lock);
        if (ifp->state == INET6_IFADDR_STATE_PREDAD) {
                action = DAD_BEGIN;
                ifp->state = INET6_IFADDR_STATE_DAD;
        } else if (ifp->state == INET6_IFADDR_STATE_ERRDAD) {
                action = DAD_ABORT;
                ifp->state = INET6_IFADDR_STATE_POSTDAD;

                if ((READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->accept_dad) > 1 ||
                     READ_ONCE(idev->cnf.accept_dad) > 1) &&
                    !idev->cnf.disable_ipv6 &&
                    !(ifp->flags & IFA_F_STABLE_PRIVACY)) {
                        struct in6_addr addr;

                        /* build fe80::/64 + EUI-64 to compare against the
                         * address that failed DAD
                         */
                        addr.s6_addr32[0] = htonl(0xfe800000);
                        addr.s6_addr32[1] = 0;

                        if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) &&
                            ipv6_addr_equal(&ifp->addr, &addr)) {
                                /* DAD failed for link-local based on MAC */
                                WRITE_ONCE(idev->cnf.disable_ipv6, 1);

                                pr_info("%s: IPv6 being disabled!\n",
                                        ifp->idev->dev->name);
                                disable_ipv6 = true;
                        }
                }
        }
        spin_unlock_bh(&ifp->lock);

        if (action == DAD_BEGIN) {
                addrconf_dad_begin(ifp);
                goto out;
        } else if (action == DAD_ABORT) {
                /* NOTE(review): the extra hold is presumably consumed by
                 * addrconf_dad_stop() - confirm against its definition
                 */
                in6_ifa_hold(ifp);
                addrconf_dad_stop(ifp, 1);
                if (disable_ipv6)
                        addrconf_ifdown(idev->dev, false);
                goto out;
        }

        if (!ifp->dad_probes && addrconf_dad_end(ifp))
                goto out;

        write_lock_bh(&idev->lock);
        if (idev->dead || !(idev->if_flags & IF_READY)) {
                write_unlock_bh(&idev->lock);
                goto out;
        }

        spin_lock(&ifp->lock);
        if (ifp->state == INET6_IFADDR_STATE_DEAD) {
                spin_unlock(&ifp->lock);
                write_unlock_bh(&idev->lock);
                goto out;
        }

        if (ifp->dad_probes == 0) {
                bool send_na = false;

                /*
                 * DAD was successful
                 */

                /* flags are sampled before they are cleared just below */
                if (ifp->flags & IFA_F_TENTATIVE &&
                    !(ifp->flags & IFA_F_OPTIMISTIC))
                        send_na = true;
                bump_id = ifp->flags & IFA_F_TENTATIVE;
                ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
                spin_unlock(&ifp->lock);
                write_unlock_bh(&idev->lock);

                addrconf_dad_completed(ifp, bump_id, send_na);

                goto out;
        }

        /* one more probe: re-arm after RETRANS_TIME, but at least HZ/100 */
        ifp->dad_probes--;
        addrconf_mod_dad_work(ifp,
                              max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME),
                                  HZ/100));
        spin_unlock(&ifp->lock);
        write_unlock_bh(&idev->lock);

        /* send a neighbour solicitation for our addr */
        addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
        ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any,
                      ifp->dad_nonce);
out:
        in6_ifa_put(ifp);
        rtnl_unlock();
}

/* ifp->idev must be at least read locked */
static bool ipv6_lonely_lladdr(struct inet6_ifaddr *ifp)
{
        struct inet6_ifaddr *ifpiter;
        struct inet6_dev *idev = ifp->idev;

        list_for_each_entry_reverse(ifpiter, &idev->addr_list, if_list) {
                if (ifpiter->scope > IFA_LINK)
                        break;
                if (ifp != ifpiter && ifpiter->scope == IFA_LINK &&
                    (ifpiter->flags & (IFA_F_PERMANENT|IFA_F_TENTATIVE|
                                       IFA_F_OPTIMISTIC|IFA_F_DADFAILED)) ==
                    IFA_F_PERMANENT)
                        return false;
        }
        return true;
}

/*
 * Finish DAD for @ifp and announce the now usable address.
 *
 * @ifp:     address whose DAD has completed
 * @bump_id: when true, rt_genid_bump_ipv6() is called for the device's
 *           network namespace
 * @send_na: when true and ndisc_notify is enabled (on the interface or in
 *           the "all" settings), an unsolicited neighbour advertisement is
 *           sent to the link-local all-nodes address
 *
 * Cancels the pending DAD work, sends RTM_NEWADDR and, for a link-scope
 * address for which ipv6_lonely_lladdr() is true, calls
 * ipv6_mc_dad_complete() and possibly sends the first router
 * solicitation and starts the RS timer.
 *
 * Note that when a router solicitation is due but no non-tentative
 * link-local address is found, the function returns early and skips the
 * @bump_id and temporary-address handling at the end.
 */
static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
                                   bool send_na)
{
        struct net_device *dev = ifp->idev->dev;
        struct in6_addr lladdr;
        bool send_rs, send_mld;

        addrconf_del_dad_work(ifp);

        /*
         *        Configure the address for reception. Now it is valid.
         */

        ipv6_ifa_notify(RTM_NEWADDR, ifp);

        /* If added prefix is link local and we are prepared to process
           router advertisements, start sending router solicitations.
         */

        read_lock_bh(&ifp->idev->lock);
        send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp);
        send_rs = send_mld &&
                  ipv6_accept_ra(ifp->idev) &&
                  READ_ONCE(ifp->idev->cnf.rtr_solicits) != 0 &&
                  (dev->flags & IFF_LOOPBACK) == 0 &&
                  (dev->type != ARPHRD_TUNNEL) &&
                  !netif_is_team_port(dev);
        read_unlock_bh(&ifp->idev->lock);

        /* While DAD is in progress, the MLD report's source address is
         * in6addr_any. Resend with the proper link-local address now.
         */
        if (send_mld)
                ipv6_mc_dad_complete(ifp->idev);

        /* send unsolicited NA if enabled */
        if (send_na &&
            (READ_ONCE(ifp->idev->cnf.ndisc_notify) ||
             READ_ONCE(dev_net(dev)->ipv6.devconf_all->ndisc_notify))) {
                ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifp->addr,
                              /*router=*/ !!ifp->idev->cnf.forwarding,
                              /*solicited=*/ false, /*override=*/ true,
                              /*inc_opt=*/ true);
        }

        if (send_rs) {
                /*
                 *        If a host has already performed a random delay
                 *        [...] as part of DAD [...] there is no need
                 *        to delay again before sending the first RS
                 */
                if (ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE))
                        return;
                ndisc_send_rs(dev, &lladdr, &in6addr_linklocal_allrouters);

                /* record the first probe and arm the RS retransmit timer */
                write_lock_bh(&ifp->idev->lock);
                spin_lock(&ifp->lock);
                ifp->idev->rs_interval = rfc3315_s14_backoff_init(
                        READ_ONCE(ifp->idev->cnf.rtr_solicit_interval));
                ifp->idev->rs_probes = 1;
                ifp->idev->if_flags |= IF_RS_SENT;
                addrconf_mod_rs_timer(ifp->idev, ifp->idev->rs_interval);
                spin_unlock(&ifp->lock);
                write_unlock_bh(&ifp->idev->lock);
        }

        if (bump_id)
                rt_genid_bump_ipv6(dev_net(dev));

        /* Make sure that a new temporary address will be created
         * before this temporary address becomes deprecated.
         */
        if (ifp->flags & IFA_F_TEMPORARY)
                addrconf_verify_rtnl(dev_net(dev));
}

/*
 * Kick DAD for the addresses of @idev.
 *
 * With @restart set, every address on the interface is put back into
 * PREDAD and kicked. Otherwise only tentative addresses currently in the
 * DAD state are kicked. Each address is handled under its own lock while
 * idev->lock is read-held.
 */
static void addrconf_dad_run(struct inet6_dev *idev, bool restart)
{
        struct inet6_ifaddr *ifa;

        read_lock_bh(&idev->lock);
        list_for_each_entry(ifa, &idev->addr_list, if_list) {
                spin_lock(&ifa->lock);
                if (restart) {
                        ifa->state = INET6_IFADDR_STATE_PREDAD;
                        addrconf_dad_kick(ifa);
                } else if ((ifa->flags & IFA_F_TENTATIVE) &&
                           ifa->state == INET6_IFADDR_STATE_DAD) {
                        addrconf_dad_kick(ifa);
                }
                spin_unlock(&ifa->lock);
        }
        read_unlock_bh(&idev->lock);
}

#ifdef CONFIG_PROC_FS
/* Iteration cursor kept in seq->private by the if_inet6 seq_file code. */
struct if6_iter_state {
        struct seq_net_private p;
        int bucket;        /* index of the inet6_addr_lst[] bucket being walked */
        int offset;        /* entries of that bucket already stepped over */
};

/*
 * Position the iterator and return the entry to show first.
 *
 * A @pos of 0 rewinds the cursor to the start of the table. Otherwise the
 * walk resumes at the saved bucket, skipping the first 'offset' entries of
 * that bucket; later buckets are taken from their head. Returns NULL when
 * the table is exhausted.
 */
static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos)
{
        struct if6_iter_state *cursor = seq->private;
        struct net *net = seq_file_net(seq);
        struct inet6_ifaddr *ifa = NULL;
        int skipped;

        /* initial bucket if pos is 0 */
        if (!pos) {
                cursor->bucket = 0;
                cursor->offset = 0;
        }

        while (cursor->bucket < IN6_ADDR_HSIZE) {
                skipped = 0;
                hlist_for_each_entry_rcu(ifa,
                                         &net->ipv6.inet6_addr_lst[cursor->bucket],
                                         addr_lst) {
                        /* sync with offset */
                        if (skipped >= cursor->offset)
                                return ifa;
                        skipped++;
                }

                /* prepare for next bucket */
                cursor->offset = 0;
                cursor->bucket++;
        }

        return NULL;
}

static struct inet6_ifaddr *if6_get_next(struct seq_file *seq,
                                         struct inet6_ifaddr *ifa)
{
        struct if6_iter_state *state = seq->private;
        struct net *net = seq_file_net(seq);

        hlist_for_each_entry_continue_rcu(ifa, addr_lst) {
                state->offset++;
                return ifa;
        }

        state->offset = 0;
        while (++state->bucket < IN6_ADDR_HSIZE) {
                hlist_for_each_entry_rcu(ifa,
                                     &net->ipv6.inet6_addr_lst[state->bucket], addr_lst) {
                        return ifa;
                }
        }

        return NULL;
}

/*
 * seq_file ->start: enter an RCU read-side section (left again in
 * if6_seq_stop()) and return the entry matching *@pos, or NULL.
 */
static void *if6_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(rcu)
{
        struct inet6_ifaddr *first;

        rcu_read_lock();
        first = if6_get_first(seq, *pos);

        return first;
}

/*
 * seq_file ->next: advance from entry @v to its successor and bump *@pos.
 * Returns the next entry, or NULL when there is none.
 */
static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct inet6_ifaddr *next = if6_get_next(seq, v);

        (*pos)++;

        return next;
}

/* seq_file ->stop: leave the RCU read-side section entered in if6_seq_start(). */
static void if6_seq_stop(struct seq_file *seq, void *v)
        __releases(rcu)
{
        rcu_read_unlock();
}

static int if6_seq_show(struct seq_file *seq, void *v)
{
        struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v;
        seq_printf(seq, "%pi6 %02x %02x %02x %02x %8s\n",
                   &ifp->addr,
                   ifp->idev->dev->ifindex,
                   ifp->prefix_len,
                   ifp->scope,
                   (u8) ifp->flags,
                   ifp->idev->dev->name);
        return 0;
}

/* seq_file callbacks used for the "if_inet6" proc entry. */
static const struct seq_operations if6_seq_ops = {
        .start        = if6_seq_start,
        .next        = if6_seq_next,
        .show        = if6_seq_show,
        .stop        = if6_seq_stop,
};

/*
 * Create the read-only "if_inet6" entry under @net's proc directory,
 * backed by if6_seq_ops with a struct if6_iter_state sized iterator
 * state. Returns 0 on success and -ENOMEM if the entry cannot be created.
 */
static int __net_init if6_proc_net_init(struct net *net)
{
        if (proc_create_net("if_inet6", 0444, net->proc_net, &if6_seq_ops,
                            sizeof(struct if6_iter_state)))
                return 0;

        return -ENOMEM;
}

/* Remove the "if_inet6" entry from @net's proc directory. */
static void __net_exit if6_proc_net_exit(struct net *net)
{
        remove_proc_entry("if_inet6", net->proc_net);
}

/* Per-network-namespace hooks that create and remove the "if_inet6" entry. */
static struct pernet_operations if6_proc_net_ops = {
        .init = if6_proc_net_init,
        .exit = if6_proc_net_exit,
};

/*
 * Register the if_inet6 pernet operations. Returns the result of
 * register_pernet_subsys().
 */
int __init if6_proc_init(void)
{
        return register_pernet_subsys(&if6_proc_net_ops);
}

/* Unregister the if_inet6 pernet operations registered by if6_proc_init(). */
void if6_proc_exit(void)
{
        unregister_pernet_subsys(&if6_proc_net_ops);
}
#endif        /* CONFIG_PROC_FS */

#if IS_ENABLED(CONFIG_IPV6_MIP6)
/*
 * Check if address is a home address configured on any interface.
 *
 * Looks @addr up in @net's address hash table under rcu_read_lock().
 * Returns 1 if an entry with the same address and IFA_F_HOMEADDRESS set
 * exists, 0 otherwise.
 */
int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr)
{
        unsigned int bucket = inet6_addr_hash(net, addr);
        struct inet6_ifaddr *ifa = NULL;
        int is_home = 0;

        rcu_read_lock();
        hlist_for_each_entry_rcu(ifa, &net->ipv6.inet6_addr_lst[bucket], addr_lst) {
                if (!ipv6_addr_equal(&ifa->addr, addr))
                        continue;
                if (!(ifa->flags & IFA_F_HOMEADDRESS))
                        continue;

                is_home = 1;
                break;
        }
        rcu_read_unlock();

        return is_home;
}
#endif

/* RFC6554 has some algorithm to avoid loops in segment routing by
 * checking if the segments contains any of a local interface address.
 *
 * Quote:
 *
 * To detect loops in the SRH, a router MUST determine if the SRH
 * includes multiple addresses assigned to any interface on that router.
 * If such addresses appear more than once and are separated by at least
 * one address not assigned to that router.
 *
 * Walks the @nsegs entries of @segs in order under rcu_read_lock(),
 * looking each one up in @net's address hash table. Returns 1 as soon as
 * a locally configured segment is met after more than one earlier local
 * segment and with at least one non-local segment since the last local
 * one; returns 0 otherwise.
 */
int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs,
                          unsigned char nsegs)
{
        int local_hits = 0, loop = 0, idx;
        bool gap_seen = false;

        rcu_read_lock();
        for (idx = 0; idx < nsegs; idx++) {
                const struct in6_addr *seg = &segs[idx];
                unsigned int bucket = inet6_addr_hash(net, seg);
                struct inet6_ifaddr *ifa;
                bool is_local = false;

                hlist_for_each_entry_rcu(ifa, &net->ipv6.inet6_addr_lst[bucket], addr_lst) {
                        if (ipv6_addr_equal(&ifa->addr, seg)) {
                                is_local = true;
                                break;
                        }
                }

                /* a foreign segment only marks a gap between local ones */
                if (!is_local) {
                        gap_seen = true;
                        continue;
                }

                if (local_hits > 1 && gap_seen) {
                        loop = 1;
                        break;
                }

                gap_seen = false;
                local_hits++;
        }
        rcu_read_unlock();

        return loop;
}

/*
 *        Periodic address status verification
 */

/*
 * Walk every address in @net's address hash table and act on lifetimes:
 *  - create a new temporary address for a not yet regenerated temporary
 *    address that is within regen_advance of its preferred lifetime,
 *  - delete addresses whose valid lifetime has elapsed,
 *  - flag addresses past their preferred lifetime as deprecated,
 * and finally re-arm net->ipv6.addr_chk_work for the earliest upcoming
 * event found during the walk.
 *
 * Caller must hold RTNL. The walk runs under rcu_read_lock_bh(); whenever
 * that lock is dropped to call a helper, or after a deprecation
 * notification has been sent, the current bucket is rescanned from its
 * head via the restart label.
 */
static void addrconf_verify_rtnl(struct net *net)
{
        unsigned long now, next, next_sec, next_sched;
        struct inet6_ifaddr *ifp;
        int i;

        ASSERT_RTNL();

        rcu_read_lock_bh();
        now = jiffies;
        /* latest acceptable next run; pulled earlier by the loop below */
        next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);

        cancel_delayed_work(&net->ipv6.addr_chk_work);

        for (i = 0; i < IN6_ADDR_HSIZE; i++) {
restart:
                hlist_for_each_entry_rcu_bh(ifp, &net->ipv6.inet6_addr_lst[i], addr_lst) {
                        unsigned long age;

                        /* When setting preferred_lft to a value not zero or
                         * infinity, while valid_lft is infinity
                         * IFA_F_PERMANENT has a non-infinity life time.
                         */
                        if ((ifp->flags & IFA_F_PERMANENT) &&
                            (ifp->prefered_lft == INFINITY_LIFE_TIME))
                                continue;

                        spin_lock(&ifp->lock);
                        /* We try to batch several events at once. */
                        age = (now - ifp->tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;

                        if ((ifp->flags&IFA_F_TEMPORARY) &&
                            !(ifp->flags&IFA_F_TENTATIVE) &&
                            ifp->prefered_lft != INFINITY_LIFE_TIME &&
                            !ifp->regen_count && ifp->ifpub) {
                                /* This is a non-regenerated temporary addr. */

                                unsigned long regen_advance = ipv6_get_regen_advance(ifp->idev);

                                if (age + regen_advance >= ifp->prefered_lft) {
                                        struct inet6_ifaddr *ifpub = ifp->ifpub;
                                        if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
                                                next = ifp->tstamp + ifp->prefered_lft * HZ;

                                        /* a non-zero regen_count keeps this
                                         * address from matching the
                                         * condition above again
                                         */
                                        ifp->regen_count++;
                                        /* both addresses are pinned while
                                         * the locks are dropped
                                         */
                                        in6_ifa_hold(ifp);
                                        in6_ifa_hold(ifpub);
                                        spin_unlock(&ifp->lock);

                                        spin_lock(&ifpub->lock);
                                        ifpub->regen_count = 0;
                                        spin_unlock(&ifpub->lock);
                                        rcu_read_unlock_bh();
                                        ipv6_create_tempaddr(ifpub, true);
                                        in6_ifa_put(ifpub);
                                        in6_ifa_put(ifp);
                                        rcu_read_lock_bh();
                                        goto restart;
                                } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
                                        next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ;
                        }

                        if (ifp->valid_lft != INFINITY_LIFE_TIME &&
                            age >= ifp->valid_lft) {
                                /* valid lifetime elapsed: delete the address */
                                spin_unlock(&ifp->lock);
                                /* NOTE(review): this hold is presumably
                                 * consumed by ipv6_del_addr() - confirm
                                 */
                                in6_ifa_hold(ifp);
                                rcu_read_unlock_bh();
                                ipv6_del_addr(ifp);
                                rcu_read_lock_bh();
                                goto restart;
                        } else if (ifp->prefered_lft == INFINITY_LIFE_TIME) {
                                spin_unlock(&ifp->lock);
                                continue;
                        } else if (age >= ifp->prefered_lft) {
                                /* jiffies - ifp->tstamp > age >= ifp->prefered_lft */
                                int deprecate = 0;

                                if (!(ifp->flags&IFA_F_DEPRECATED)) {
                                        deprecate = 1;
                                        ifp->flags |= IFA_F_DEPRECATED;
                                }

                                if ((ifp->valid_lft != INFINITY_LIFE_TIME) &&
                                    (time_before(ifp->tstamp + ifp->valid_lft * HZ, next)))
                                        next = ifp->tstamp + ifp->valid_lft * HZ;

                                spin_unlock(&ifp->lock);

                                /* notify only on the transition to deprecated */
                                if (deprecate) {
                                        in6_ifa_hold(ifp);

                                        ipv6_ifa_notify(0, ifp);
                                        in6_ifa_put(ifp);
                                        goto restart;
                                }
                        } else {
                                /* ifp->prefered_lft <= ifp->valid_lft */
                                if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
                                        next = ifp->tstamp + ifp->prefered_lft * HZ;
                                spin_unlock(&ifp->lock);
                        }
                }
        }

        next_sec = round_jiffies_up(next);
        next_sched = next;

        /* If rounded timeout is accurate enough, accept it. */
        if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ))
                next_sched = next_sec;

        /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */
        if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX))
                next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX;

        pr_debug("now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n",
                 now, next, next_sec, next_sched);
        mod_delayed_work(addrconf_wq, &net->ipv6.addr_chk_work, next_sched - now);
        rcu_read_unlock_bh();
}

/*
 * Delayed-work handler for net->ipv6.addr_chk_work: runs the address
 * lifetime verification for the owning network namespace under RTNL.
 */
static void addrconf_verify_work(struct work_struct *w)
{
        struct delayed_work *dwork = to_delayed_work(w);
        struct net *net;

        net = container_of(dwork, struct net, ipv6.addr_chk_work);

        rtnl_lock();
        addrconf_verify_rtnl(net);
        rtnl_unlock();
}

/*
 * Request an address lifetime check for @net by (re)scheduling
 * net->ipv6.addr_chk_work on addrconf_wq with a delay of 0.
 */
static void addrconf_verify(struct net *net)
{
        mod_delayed_work(addrconf_wq, &net->ipv6.addr_chk_work, 0);
}

/*
 * Pick the address to operate on from the two netlink attributes.
 *
 * @addr:     address attribute, may be NULL
 * @local:    local-address attribute, may be NULL
 * @peer_pfx: set to the payload of @addr when both attributes are present
 *            and their payloads differ, else to NULL
 *
 * Returns the payload of @local when it is present, otherwise the payload
 * of @addr, or NULL if neither attribute was supplied.
 */
static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local,
                                     struct in6_addr **peer_pfx)
{
        struct in6_addr *remote = addr ? nla_data(addr) : NULL;

        *peer_pfx = NULL;

        if (!local)
                return remote;

        if (remote && nla_memcmp(local, remote, sizeof(*remote)))
                *peer_pfx = remote;

        return nla_data(local);
}

/*
 * Netlink attribute policy for IPv6 address messages.  It is handed to
 * the nlmsg_parse_deprecated*() calls of the RTM_{NEW,DEL,GET}ADDR
 * handlers and of the dump request validator in this file.
 * Address attributes carry a struct in6_addr worth of payload,
 * IFA_CACHEINFO a struct ifa_cacheinfo, and the remaining entries are
 * 32-bit or 8-bit scalars.
 */
static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
        [IFA_ADDRESS]                = { .len = sizeof(struct in6_addr) },
        [IFA_LOCAL]                = { .len = sizeof(struct in6_addr) },
        [IFA_CACHEINFO]                = { .len = sizeof(struct ifa_cacheinfo) },
        [IFA_FLAGS]                = { .len = sizeof(u32) },
        [IFA_RT_PRIORITY]        = { .len = sizeof(u32) },
        [IFA_TARGET_NETNSID]        = { .type = NLA_S32 },
        [IFA_PROTO]                = { .type = NLA_U8 },
};

/*
 * RTM_DELADDR handler: parse the request, work out which address it
 * names and hand the deletion to inet6_addr_del().
 *
 * Returns the parse error if the message is malformed, -EINVAL if the
 * request carries neither IFA_ADDRESS nor IFA_LOCAL, otherwise the
 * result of inet6_addr_del().
 */
static int
inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
                  struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *attrs[IFA_MAX+1];
        struct in6_addr *addr, *peer;
        struct ifaddrmsg *hdr;
        u32 del_flags;
        int rc;

        rc = nlmsg_parse_deprecated(nlh, sizeof(*hdr), attrs, IFA_MAX,
                                    ifa_ipv6_policy, extack);
        if (rc < 0)
                return rc;

        hdr = nlmsg_data(nlh);

        /* the peer address, if any, is not needed for a delete */
        addr = extract_addr(attrs[IFA_ADDRESS], attrs[IFA_LOCAL], &peer);
        if (!addr)
                return -EINVAL;

        /* the IFA_FLAGS attribute takes precedence over the header field */
        if (attrs[IFA_FLAGS])
                del_flags = nla_get_u32(attrs[IFA_FLAGS]);
        else
                del_flags = hdr->ifa_flags;

        /* only IFA_F_MANAGETEMPADDR is honoured on delete for now */
        del_flags &= IFA_F_MANAGETEMPADDR;

        return inet6_addr_del(net, hdr->ifa_index, del_flags, addr,
                              hdr->ifa_prefixlen, extack);
}

/*
 * Bring the prefix route of @ifp in line with its current settings.
 *
 * @expires:     expiry value applied when @flags has RTF_EXPIRES set
 * @flags:       route flags; only RTF_EXPIRES is inspected here
 * @modify_peer: operate on the route for ifp->peer_addr rather than
 *               ifp->addr
 *
 * If the existing route's metric differs from the wanted one the route
 * is deleted and re-added; otherwise only its expiry state is updated
 * under the table lock.
 *
 * Returns 0 on success or -ENOENT when no prefix route was found.
 */
static int modify_prefix_route(struct inet6_ifaddr *ifp,
                               unsigned long expires, u32 flags,
                               bool modify_peer)
{
        struct in6_addr *pfx = modify_peer ? &ifp->peer_addr : &ifp->addr;
        struct fib6_table *tbl;
        struct fib6_info *rt;
        u32 wanted_metric;

        rt = addrconf_get_prefix_route(pfx, ifp->prefix_len,
                                       ifp->idev->dev, 0, RTF_DEFAULT, true);
        if (!rt)
                return -ENOENT;

        wanted_metric = ifp->rt_priority ? : IP6_RT_PRIO_ADDRCONF;

        if (rt->fib6_metric == wanted_metric) {
                /* same metric: just refresh the expiry bookkeeping */
                tbl = rt->fib6_table;
                spin_lock_bh(&tbl->tb6_lock);

                if (flags & RTF_EXPIRES) {
                        fib6_set_expires(rt, expires);
                        fib6_add_gc_list(rt);
                } else {
                        fib6_clean_expires(rt);
                        fib6_remove_gc_list(rt);
                }

                spin_unlock_bh(&tbl->tb6_lock);

                fib6_info_release(rt);
                return 0;
        }

        /*
         * Metric changed: replace the route.  No explicit
         * fib6_info_release() on this path -- presumably ip6_del_rt()
         * drops the reference; confirm against its definition.
         */
        ip6_del_rt(dev_net(ifp->idev->dev), rt, false);
        addrconf_prefix_route(pfx, ifp->prefix_len,
                              ifp->rt_priority, ifp->idev->dev,
                              expires, flags, GFP_KERNEL);

        return 0;
}

/*
 * Apply the settings in @cfg to the already existing address @ifp.
 *
 * Must be called with the RTNL lock held (see ASSERT_RTNL() below).
 * @cfg is modified in place: lifetimes are normalised and flags such as
 * IFA_F_PERMANENT / IFA_F_DEPRECATED / IFA_F_OPTIMISTIC are adjusted.
 *
 * Returns -EINVAL when the lifetimes are inconsistent or when
 * IFA_F_MANAGETEMPADDR is requested for a temporary address or a prefix
 * length other than 64; returns 0 otherwise.
 */
static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp,
                             struct ifa6_config *cfg)
{
        u32 flags;
        clock_t expires;
        unsigned long timeout;
        bool was_managetempaddr;
        bool had_prefixroute;
        bool new_peer = false;

        ASSERT_RTNL();

        /* a zero valid lifetime, or preferred > valid, is rejected */
        if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft)
                return -EINVAL;

        if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR &&
            (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64))
                return -EINVAL;

        /* OPTIMISTIC is only kept while the address is tentative and DAD has not failed */
        if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED)
                cfg->ifa_flags &= ~IFA_F_OPTIMISTIC;

        /* finite valid lifetime => expiring route; otherwise the address is permanent */
        timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
        if (addrconf_finite_timeout(timeout)) {
                expires = jiffies_to_clock_t(timeout * HZ);
                cfg->valid_lft = timeout;
                flags = RTF_EXPIRES;
        } else {
                expires = 0;
                flags = 0;
                cfg->ifa_flags |= IFA_F_PERMANENT;
        }

        /* a preferred lifetime of zero marks the address deprecated */
        timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
        if (addrconf_finite_timeout(timeout)) {
                if (timeout == 0)
                        cfg->ifa_flags |= IFA_F_DEPRECATED;
                cfg->preferred_lft = timeout;
        }

        /* peer address changed: clean up the old peer's prefix route first */
        if (cfg->peer_pfx &&
            memcmp(&ifp->peer_addr, cfg->peer_pfx, sizeof(struct in6_addr))) {
                if (!ipv6_addr_any(&ifp->peer_addr))
                        cleanup_prefix_route(ifp, expires, true, true);
                new_peer = true;
        }

        /* snapshot the old state and install the new one under ifp->lock */
        spin_lock_bh(&ifp->lock);
        was_managetempaddr = ifp->flags & IFA_F_MANAGETEMPADDR;
        had_prefixroute = ifp->flags & IFA_F_PERMANENT &&
                          !(ifp->flags & IFA_F_NOPREFIXROUTE);
        ifp->flags &= ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD |
                        IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
                        IFA_F_NOPREFIXROUTE);
        ifp->flags |= cfg->ifa_flags;
        WRITE_ONCE(ifp->tstamp, jiffies);
        WRITE_ONCE(ifp->valid_lft, cfg->valid_lft);
        WRITE_ONCE(ifp->prefered_lft, cfg->preferred_lft);
        WRITE_ONCE(ifp->ifa_proto, cfg->ifa_proto);

        /* a zero rt_priority in the request leaves the stored priority alone */
        if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority)
                WRITE_ONCE(ifp->rt_priority, cfg->rt_priority);

        if (new_peer)
                ifp->peer_addr = *cfg->peer_pfx;

        spin_unlock_bh(&ifp->lock);
        /* no notification is sent while the address is still tentative */
        if (!(ifp->flags&IFA_F_TENTATIVE))
                ipv6_ifa_notify(0, ifp);

        if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
                /* -ENOENT doubles as "no route to modify, create one" */
                int rc = -ENOENT;

                if (had_prefixroute)
                        rc = modify_prefix_route(ifp, expires, flags, false);

                /* prefix route could have been deleted; if so restore it */
                if (rc == -ENOENT) {
                        addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
                                              ifp->rt_priority, ifp->idev->dev,
                                              expires, flags, GFP_KERNEL);
                }

                /* same again for the peer address, when one is set */
                if (had_prefixroute && !ipv6_addr_any(&ifp->peer_addr))
                        rc = modify_prefix_route(ifp, expires, flags, true);

                if (rc == -ENOENT && !ipv6_addr_any(&ifp->peer_addr)) {
                        addrconf_prefix_route(&ifp->peer_addr, ifp->prefix_len,
                                              ifp->rt_priority, ifp->idev->dev,
                                              expires, flags, GFP_KERNEL);
                }
        } else if (had_prefixroute) {
                /* NOPREFIXROUTE newly requested: decide what to do with the old route */
                enum cleanup_prefix_rt_t action;
                unsigned long rt_expires;

                write_lock_bh(&ifp->idev->lock);
                action = check_cleanup_prefix_route(ifp, &rt_expires);
                write_unlock_bh(&ifp->idev->lock);

                if (action != CLEANUP_PREFIX_RT_NOP) {
                        cleanup_prefix_route(ifp, rt_expires,
                                action == CLEANUP_PREFIX_RT_DEL, false);
                }
        }

        if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) {
                /* MANAGETEMPADDR was switched off: pass zero lifetimes on */
                if (was_managetempaddr &&
                    !(ifp->flags & IFA_F_MANAGETEMPADDR)) {
                        cfg->valid_lft = 0;
                        cfg->preferred_lft = 0;
                }
                manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft,
                                 cfg->preferred_lft, !was_managetempaddr,
                                 jiffies);
        }

        /* lifetimes changed, so re-run the address verification pass */
        addrconf_verify_rtnl(net);

        return 0;
}

/*
 * RTM_NEWADDR handler: build an ifa6_config from the request and either
 * add a new address or modify an existing one.
 *
 * Returns the parse error for a malformed message, -EINVAL when no
 * address attribute is present or when NODAD and OPTIMISTIC are both
 * requested, -ENODEV when ifa_index names no device, the error from
 * ipv6_find_idev(), -EEXIST when the address exists and the request
 * does not permit replacing it, otherwise the result of
 * inet6_addr_add() / inet6_addr_modify().
 */
static int
inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
                  struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct ifaddrmsg *ifm;
        struct nlattr *tb[IFA_MAX+1];
        struct in6_addr *peer_pfx;
        struct inet6_ifaddr *ifa;
        struct net_device *dev;
        struct inet6_dev *idev;
        struct ifa6_config cfg;
        int err;

        err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
                                     ifa_ipv6_policy, extack);
        if (err < 0)
                return err;

        memset(&cfg, 0, sizeof(cfg));

        ifm = nlmsg_data(nlh);
        cfg.pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
        if (!cfg.pfx)
                return -EINVAL;

        cfg.peer_pfx = peer_pfx;
        cfg.plen = ifm->ifa_prefixlen;
        if (tb[IFA_RT_PRIORITY])
                cfg.rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);

        if (tb[IFA_PROTO])
                cfg.ifa_proto = nla_get_u8(tb[IFA_PROTO]);

        /* lifetimes default to infinite unless IFA_CACHEINFO overrides them */
        cfg.valid_lft = INFINITY_LIFE_TIME;
        cfg.preferred_lft = INFINITY_LIFE_TIME;

        if (tb[IFA_CACHEINFO]) {
                struct ifa_cacheinfo *ci;

                ci = nla_data(tb[IFA_CACHEINFO]);
                cfg.valid_lft = ci->ifa_valid;
                cfg.preferred_lft = ci->ifa_prefered;
        }

        dev =  __dev_get_by_index(net, ifm->ifa_index);
        if (!dev) {
                NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface");
                return -ENODEV;
        }

        /* the IFA_FLAGS attribute takes precedence over the header field */
        if (tb[IFA_FLAGS])
                cfg.ifa_flags = nla_get_u32(tb[IFA_FLAGS]);
        else
                cfg.ifa_flags = ifm->ifa_flags;

        /* We ignore other flags so far. */
        cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS |
                         IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE |
                         IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;

        idev = ipv6_find_idev(dev);
        if (IS_ERR(idev))
                return PTR_ERR(idev);

        /* silently drop OPTIMISTIC when optimistic DAD is not allowed here */
        if (!ipv6_allow_optimistic_dad(net, idev))
                cfg.ifa_flags &= ~IFA_F_OPTIMISTIC;

        if (cfg.ifa_flags & IFA_F_NODAD &&
            cfg.ifa_flags & IFA_F_OPTIMISTIC) {
                NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive");
                return -EINVAL;
        }

        /* on success this returns a reference, dropped by in6_ifa_put() below */
        ifa = ipv6_get_ifaddr(net, cfg.pfx, dev, 1);
        if (!ifa) {
                /*
                 * It would be best to check for !NLM_F_CREATE here but
                 * userspace already relies on not having to provide this.
                 */
                return inet6_addr_add(net, ifm->ifa_index, &cfg, extack);
        }

        /* an existing address is only modified when REPLACE is set and EXCL is not */
        if (nlh->nlmsg_flags & NLM_F_EXCL ||
            !(nlh->nlmsg_flags & NLM_F_REPLACE)) {
                NL_SET_ERR_MSG_MOD(extack, "address already assigned");
                err = -EEXIST;
        } else {
                err = inet6_addr_modify(net, ifa, &cfg);
        }

        in6_ifa_put(ifa);

        return err;
}

/*
 * Fill in the struct ifaddrmsg header located in the payload of @nlh.
 * The address family is always AF_INET6; the other fields are taken
 * from the arguments.
 */
static void put_ifaddrmsg(struct nlmsghdr *nlh, u8 prefixlen, u32 flags,
                          u8 scope, int ifindex)
{
        struct ifaddrmsg *hdr = nlmsg_data(nlh);

        hdr->ifa_family = AF_INET6;
        hdr->ifa_index = ifindex;
        hdr->ifa_prefixlen = prefixlen;
        hdr->ifa_scope = scope;
        hdr->ifa_flags = flags;
}

/*
 * Append an IFA_CACHEINFO attribute to @skb.  @cstamp and @tstamp are
 * converted with cstamp_delta(); @preferred and @valid are stored as
 * given.  Returns the result of nla_put().
 */
static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
                         unsigned long tstamp, u32 preferred, u32 valid)
{
        struct ifa_cacheinfo ci = {
                .ifa_prefered        = preferred,
                .ifa_valid        = valid,
                .cstamp                = cstamp_delta(cstamp),
                .tstamp                = cstamp_delta(tstamp),
        };

        return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci);
}

/*
 * Map an IFA_* address scope bitmask to an RT_SCOPE_* value.  The bits
 * are tested in the order host, link, site; anything else maps to
 * RT_SCOPE_UNIVERSE.
 */
static inline int rt_scope(int ifa_scope)
{
        if (ifa_scope & IFA_HOST)
                return RT_SCOPE_HOST;

        if (ifa_scope & IFA_LINK)
                return RT_SCOPE_LINK;

        if (ifa_scope & IFA_SITE)
                return RT_SCOPE_SITE;

        return RT_SCOPE_UNIVERSE;
}

/*
 * Upper bound on the size of one address message as produced by
 * inet6_fill_ifaddr().  Every attribute that function may emit has to
 * be accounted for here, including IFA_TARGET_NETNSID, which is added
 * whenever the fill arguments carry a netnsid >= 0 (as the
 * RTM_GETADDR handler does for requests targeting another namespace).
 */
static inline int inet6_ifaddr_msgsize(void)
{
        return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
               + nla_total_size(4)  /* IFA_TARGET_NETNSID */
               + nla_total_size(16) /* IFA_LOCAL */
               + nla_total_size(16) /* IFA_ADDRESS */
               + nla_total_size(sizeof(struct ifa_cacheinfo))
               + nla_total_size(4)  /* IFA_FLAGS */
               + nla_total_size(1)  /* IFA_PROTO */
               + nla_total_size(4)  /* IFA_RT_PRIORITY */;
}

/* Which per-device address list a dump walks; see in6_dump_addrs(). */
enum addr_type_t {
        UNICAST_ADDR,        /* idev->addr_list */
        MULTICAST_ADDR,        /* idev->mc_list */
        ANYCAST_ADDR,        /* idev->ac_list */
};

/* Parameters shared by the inet6_fill_if*addr() message builders. */
struct inet6_fill_args {
        u32 portid;                /* passed to nlmsg_put() */
        u32 seq;                /* passed to nlmsg_put() */
        int event;                /* message type passed to nlmsg_put() */
        unsigned int flags;        /* netlink message flags passed to nlmsg_put() */
        int netnsid;                /* emitted as IFA_TARGET_NETNSID when >= 0 */
        int ifindex;                /* from the dump request header; non-zero limits a strict dump to that device */
        enum addr_type_t type;        /* address list selected in in6_dump_addrs() */
};

/*
 * Append one netlink message describing the unicast address @ifa to
 * @skb, using the message parameters in @args.
 *
 * The remaining preferred/valid lifetimes are computed relative to the
 * address timestamp; a permanent address with an infinite preferred
 * lifetime is reported with both lifetimes infinite.
 *
 * Returns 0 on success or -EMSGSIZE if @skb has no room, in which case
 * the partially built message is cancelled.
 */
static int inet6_fill_ifaddr(struct sk_buff *skb,
                             const struct inet6_ifaddr *ifa,
                             struct inet6_fill_args *args)
{
        struct nlmsghdr *nlh;
        u32 preferred, valid;
        u32 flags, priority;
        u8 proto;

        nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
                        sizeof(struct ifaddrmsg), args->flags);
        if (!nlh)
                return -EMSGSIZE;

        /*
         * Read the flags once and use that snapshot for both the header
         * and the IFA_FLAGS attribute, so the two cannot disagree when
         * the flags change concurrently.
         */
        flags = READ_ONCE(ifa->flags);
        put_ifaddrmsg(nlh, ifa->prefix_len, flags, rt_scope(ifa->scope),
                      ifa->idev->dev->ifindex);

        if (args->netnsid >= 0 &&
            nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
                goto error;

        preferred = READ_ONCE(ifa->prefered_lft);
        valid = READ_ONCE(ifa->valid_lft);

        if (!((flags & IFA_F_PERMANENT) &&
              (preferred == INFINITY_LIFE_TIME))) {
                if (preferred != INFINITY_LIFE_TIME) {
                        /* seconds elapsed since the address was last updated */
                        long tval = (jiffies - READ_ONCE(ifa->tstamp)) / HZ;

                        if (preferred > tval)
                                preferred -= tval;
                        else
                                preferred = 0;
                        if (valid != INFINITY_LIFE_TIME) {
                                if (valid > tval)
                                        valid -= tval;
                                else
                                        valid = 0;
                        }
                }
        } else {
                preferred = INFINITY_LIFE_TIME;
                valid = INFINITY_LIFE_TIME;
        }

        /* with a peer: IFA_LOCAL is our address, IFA_ADDRESS the peer's */
        if (!ipv6_addr_any(&ifa->peer_addr)) {
                if (nla_put_in6_addr(skb, IFA_LOCAL, &ifa->addr) < 0 ||
                    nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->peer_addr) < 0)
                        goto error;
        } else {
                if (nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->addr) < 0)
                        goto error;
        }

        /* a zero priority is not reported */
        priority = READ_ONCE(ifa->rt_priority);
        if (priority && nla_put_u32(skb, IFA_RT_PRIORITY, priority))
                goto error;

        if (put_cacheinfo(skb, ifa->cstamp, READ_ONCE(ifa->tstamp),
                          preferred, valid) < 0)
                goto error;

        if (nla_put_u32(skb, IFA_FLAGS, flags) < 0)
                goto error;

        /* a zero protocol is not reported */
        proto = READ_ONCE(ifa->ifa_proto);
        if (proto && nla_put_u8(skb, IFA_PROTO, proto))
                goto error;

        nlmsg_end(skb, nlh);
        return 0;

error:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Append one netlink message describing the multicast address @ifmca
 * to @skb.  The entry is reported as a permanent /128 with infinite
 * lifetimes.  Returns 0 on success or -EMSGSIZE if @skb has no room.
 */
static int inet6_fill_ifmcaddr(struct sk_buff *skb,
                               const struct ifmcaddr6 *ifmca,
                               struct inet6_fill_args *args)
{
        int ifindex = ifmca->idev->dev->ifindex;
        struct nlmsghdr *nlh;
        u8 scope;

        if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE)
                scope = RT_SCOPE_SITE;
        else
                scope = RT_SCOPE_UNIVERSE;

        nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
                        sizeof(struct ifaddrmsg), args->flags);
        if (!nlh)
                return -EMSGSIZE;

        if (args->netnsid >= 0 &&
            nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
                goto cancel;

        put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);

        if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0)
                goto cancel;

        if (put_cacheinfo(skb, ifmca->mca_cstamp,
                          READ_ONCE(ifmca->mca_tstamp),
                          INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0)
                goto cancel;

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Append one netlink message describing the anycast address @ifaca to
 * @skb.  The entry is reported as a permanent /128 with infinite
 * lifetimes; the interface index comes from the nexthop device of the
 * anycast route and falls back to 1 when there is none.
 * Returns 0 on success or -EMSGSIZE if @skb has no room.
 */
static int inet6_fill_ifacaddr(struct sk_buff *skb,
                               const struct ifacaddr6 *ifaca,
                               struct inet6_fill_args *args)
{
        struct net_device *nh_dev = fib6_info_nh_dev(ifaca->aca_rt);
        int ifindex = nh_dev ? nh_dev->ifindex : 1;
        struct nlmsghdr *nlh;
        u8 scope;

        if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE)
                scope = RT_SCOPE_SITE;
        else
                scope = RT_SCOPE_UNIVERSE;

        nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
                        sizeof(struct ifaddrmsg), args->flags);
        if (!nlh)
                return -EMSGSIZE;

        if (args->netnsid >= 0 &&
            nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
                goto cancel;

        put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);

        if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0)
                goto cancel;

        if (put_cacheinfo(skb, ifaca->aca_cstamp,
                          READ_ONCE(ifaca->aca_tstamp),
                          INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0)
                goto cancel;

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Dump the addresses of one device into @skb.
 *
 * called with rcu_read_lock()
 *
 * fillargs->type selects the list to walk (unicast, multicast or
 * anycast) and fillargs->event is set to the matching message type.
 * Entries with an index below *@s_ip_idx are skipped.  When a fill
 * helper fails, *@s_ip_idx is set to the index of the entry that did
 * not fit and the error is returned; on success it is reset to 0 and
 * 0 is returned.
 */
static int in6_dump_addrs(const struct inet6_dev *idev, struct sk_buff *skb,
                          struct netlink_callback *cb, int *s_ip_idx,
                          struct inet6_fill_args *fillargs)
{
        const struct ifmcaddr6 *ifmca;
        const struct ifacaddr6 *ifaca;
        int ip_idx = 0;
        int err = 0;

        switch (fillargs->type) {
        case UNICAST_ADDR: {
                const struct inet6_ifaddr *ifa;
                fillargs->event = RTM_NEWADDR;

                /* unicast address incl. temp addr */
                list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) {
                        if (ip_idx < *s_ip_idx)
                                goto next;
                        err = inet6_fill_ifaddr(skb, ifa, fillargs);
                        if (err < 0)
                                break;
                        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
next:
                        ip_idx++;
                }
                break;
        }
        case MULTICAST_ADDR:
                fillargs->event = RTM_GETMULTICAST;

                /* multicast address */
                for (ifmca = rcu_dereference(idev->mc_list);
                     ifmca;
                     ifmca = rcu_dereference(ifmca->next), ip_idx++) {
                        if (ip_idx < *s_ip_idx)
                                continue;
                        err = inet6_fill_ifmcaddr(skb, ifmca, fillargs);
                        if (err < 0)
                                break;
                }
                break;
        case ANYCAST_ADDR:
                fillargs->event = RTM_GETANYCAST;
                /* anycast address */
                for (ifaca = rcu_dereference(idev->ac_list); ifaca;
                     ifaca = rcu_dereference(ifaca->aca_next), ip_idx++) {
                        if (ip_idx < *s_ip_idx)
                                continue;
                        err = inet6_fill_ifacaddr(skb, ifaca, fillargs);
                        if (err < 0)
                                break;
                }
                break;
        default:
                break;
        }
        /* remember where to resume on error; start from 0 for the next device otherwise */
        *s_ip_idx = err ? ip_idx : 0;
        return err;
}

/*
 * Strict validation of an address dump request.
 *
 * The header may only carry an interface index, which is recorded in
 * @fillargs and marks the dump as filtered when non-zero.  The only
 * attribute accepted is IFA_TARGET_NETNSID; when present the target
 * namespace is looked up and returned through @tgt_net.
 *
 * Returns 0 on success, -EINVAL for a malformed request, or the error
 * from parsing / the namespace lookup.
 */
static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
                                       struct inet6_fill_args *fillargs,
                                       struct net **tgt_net, struct sock *sk,
                                       struct netlink_callback *cb)
{
        struct netlink_ext_ack *extack = cb->extack;
        struct nlattr *tb[IFA_MAX+1];
        struct ifaddrmsg *ifm;
        int attr, rc;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid header for address dump request");
                return -EINVAL;
        }

        ifm = nlmsg_data(nlh);
        if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address dump request");
                return -EINVAL;
        }

        fillargs->ifindex = ifm->ifa_index;
        if (fillargs->ifindex) {
                cb->answer_flags |= NLM_F_DUMP_FILTERED;
                fillargs->flags |= NLM_F_DUMP_FILTERED;
        }

        rc = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
                                           ifa_ipv6_policy, extack);
        if (rc < 0)
                return rc;

        for (attr = 0; attr <= IFA_MAX; ++attr) {
                struct net *net;

                if (!tb[attr])
                        continue;

                if (attr != IFA_TARGET_NETNSID) {
                        NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in dump request");
                        return -EINVAL;
                }

                fillargs->netnsid = nla_get_s32(tb[attr]);
                net = rtnl_get_net_ns_capable(sk, fillargs->netnsid);
                if (IS_ERR(net)) {
                        /* reset so callers do not treat the netns as acquired */
                        fillargs->netnsid = -1;
                        NL_SET_ERR_MSG_MOD(extack, "Invalid target network namespace id");
                        return PTR_ERR(net);
                }
                *tgt_net = net;
        }

        return 0;
}

/*
 * Common implementation of the unicast/multicast/anycast address dumps.
 *
 * With strict checking enabled the request is validated first and, if
 * it names an interface, only that device is dumped.  Otherwise every
 * device of the target namespace is walked.  The whole walk runs under
 * rcu_read_lock().
 *
 * Returns 0 or a negative errno (from validation, -ENODEV for an
 * unknown interface, or from in6_dump_addrs()).
 */
static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
                           enum addr_type_t type)
{
        struct net *tgt_net = sock_net(skb->sk);
        const struct nlmsghdr *nlh = cb->nlh;
        struct inet6_fill_args fillargs = {
                .portid = NETLINK_CB(cb->skb).portid,
                .seq = cb->nlh->nlmsg_seq,
                .flags = NLM_F_MULTI,
                .netnsid = -1,
                .type = type,
        };
        /* dump position kept in cb->ctx: device cursor and per-device address index */
        struct {
                unsigned long ifindex;
                int ip_idx;
        } *ctx = (void *)cb->ctx;
        struct net_device *dev;
        struct inet6_dev *idev;
        int err = 0;

        rcu_read_lock();
        if (cb->strict_check) {
                err = inet6_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net,
                                                  skb->sk, cb);
                if (err < 0)
                        goto done;

                err = 0;
                /* filtered dump: handle just the requested device */
                if (fillargs.ifindex) {
                        dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex);
                        if (!dev) {
                                err = -ENODEV;
                                goto done;
                        }
                        idev = __in6_dev_get(dev);
                        if (idev)
                                err = in6_dump_addrs(idev, skb, cb,
                                                     &ctx->ip_idx,
                                                     &fillargs);
                        goto done;
                }
        }

        cb->seq = inet6_base_seq(tgt_net);
        for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
                idev = __in6_dev_get(dev);
                /* devices without IPv6 state are skipped */
                if (!idev)
                        continue;
                err = in6_dump_addrs(idev, skb, cb, &ctx->ip_idx,
                                     &fillargs);
                if (err < 0)
                        goto done;
        }
done:
        rcu_read_unlock();
        /* netnsid >= 0 means the validator obtained tgt_net; release it */
        if (fillargs.netnsid >= 0)
                put_net(tgt_net);

        return err;
}

/* Dump callback for unicast addresses. */
static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
        return inet6_dump_addr(skb, cb, UNICAST_ADDR);
}

/* Dump callback for multicast addresses. */
static int inet6_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
        return inet6_dump_addr(skb, cb, MULTICAST_ADDR);
}


/* Dump callback for anycast addresses. */
static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
        return inet6_dump_addr(skb, cb, ANYCAST_ADDR);
}

/*
 * Validate an RTM_GETADDR request and parse its attributes into @tb.
 *
 * Without strict checking only the legacy parse is performed.  With
 * strict checking the header may carry nothing but an interface index
 * and only IFA_TARGET_NETNSID, IFA_ADDRESS and IFA_LOCAL are accepted.
 *
 * Returns 0 on success, -EINVAL for a malformed request, or the error
 * from attribute parsing.
 */
static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb,
                                       const struct nlmsghdr *nlh,
                                       struct nlattr **tb,
                                       struct netlink_ext_ack *extack)
{
        struct ifaddrmsg *ifm;
        int attr, rc;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid header for get address request");
                return -EINVAL;
        }

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
                                              ifa_ipv6_policy, extack);

        ifm = nlmsg_data(nlh);
        if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get address request");
                return -EINVAL;
        }

        rc = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
                                           ifa_ipv6_policy, extack);
        if (rc)
                return rc;

        for (attr = 0; attr <= IFA_MAX; attr++) {
                if (!tb[attr])
                        continue;

                /* the only attributes a strict get request may carry */
                if (attr == IFA_TARGET_NETNSID ||
                    attr == IFA_ADDRESS ||
                    attr == IFA_LOCAL)
                        continue;

                NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get address request");
                return -EINVAL;
        }

        return 0;
}

/*
 * RTM_GETADDR handler: look up one address and unicast an RTM_NEWADDR
 * message describing it back to the requester.
 *
 * Returns the validation error, the error from the target namespace
 * lookup, -EINVAL when no address attribute is given, -EADDRNOTAVAIL
 * when the address is not found, -ENOBUFS when the reply cannot be
 * allocated, the error from inet6_fill_ifaddr(), or the result of
 * rtnl_unicast().
 */
static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *tgt_net = sock_net(in_skb->sk);
        struct inet6_fill_args fillargs = {
                .portid = NETLINK_CB(in_skb).portid,
                .seq = nlh->nlmsg_seq,
                .event = RTM_NEWADDR,
                .flags = 0,
                .netnsid = -1,
        };
        struct ifaddrmsg *ifm;
        struct nlattr *tb[IFA_MAX+1];
        struct in6_addr *addr = NULL, *peer;
        struct net_device *dev = NULL;
        struct inet6_ifaddr *ifa;
        struct sk_buff *skb;
        int err;

        err = inet6_rtm_valid_getaddr_req(in_skb, nlh, tb, extack);
        if (err < 0)
                return err;

        if (tb[IFA_TARGET_NETNSID]) {
                fillargs.netnsid = nla_get_s32(tb[IFA_TARGET_NETNSID]);

                tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(in_skb).sk,
                                                  fillargs.netnsid);
                /* nothing has been acquired yet, so a plain return is fine */
                if (IS_ERR(tgt_net))
                        return PTR_ERR(tgt_net);
        }

        addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer);
        if (!addr) {
                err = -EINVAL;
                goto errout;
        }
        ifm = nlmsg_data(nlh);
        /* dev stays NULL when no index is given or the lookup fails */
        if (ifm->ifa_index)
                dev = dev_get_by_index(tgt_net, ifm->ifa_index);

        ifa = ipv6_get_ifaddr(tgt_net, addr, dev, 1);
        if (!ifa) {
                err = -EADDRNOTAVAIL;
                goto errout;
        }

        skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_KERNEL);
        if (!skb) {
                err = -ENOBUFS;
                goto errout_ifa;
        }

        err = inet6_fill_ifaddr(skb, ifa, &fillargs);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
                goto errout_ifa;
        }
        err = rtnl_unicast(skb, tgt_net, NETLINK_CB(in_skb).portid);
        /* cleanup ladder: address reference, then device, then target netns */
errout_ifa:
        in6_ifa_put(ifa);
errout:
        dev_put(dev);
        /* tgt_net is only put when it came from the IFA_TARGET_NETNSID lookup */
        if (fillargs.netnsid >= 0)
                put_net(tgt_net);

        return err;
}

/*
 * Send an @event notification for address @ifa to the
 * RTNLGRP_IPV6_IFADDR multicast group.  If the message cannot be
 * allocated or built, the error is recorded on the group's listeners
 * via rtnl_set_sk_err() instead.
 */
static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
{
        struct net *net = dev_net(ifa->idev->dev);
        struct inet6_fill_args fillargs = {
                .portid = 0,
                .seq = 0,
                .event = event,
                .flags = 0,
                .netnsid = -1,
        };
        struct sk_buff *msg;
        int rc;

        msg = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
        if (!msg) {
                rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, -ENOBUFS);
                return;
        }

        rc = inet6_fill_ifaddr(msg, ifa, &fillargs);
        if (rc < 0) {
                /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
                WARN_ON(rc == -EMSGSIZE);
                kfree_skb(msg);
                rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, rc);
                return;
        }

        rtnl_notify(msg, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
}

static void ipv6_store_devconf(const struct ipv6_devconf *cnf,
                               __s32 *array, int bytes)
{
        BUG_ON(bytes < (DEVCONF_MAX * 4));

        memset(array, 0, bytes);
        array[DEVCONF_FORWARDING] = READ_ONCE(cnf->forwarding);
        array[DEVCONF_HOPLIMIT] = READ_ONCE(cnf->hop_limit);
        array[DEVCONF_MTU6] = READ_ONCE(cnf->mtu6);
        array[DEVCONF_ACCEPT_RA] = READ_ONCE(cnf->accept_ra);
        array[DEVCONF_ACCEPT_REDIRECTS] = READ_ONCE(cnf->accept_redirects);
        array[DEVCONF_AUTOCONF] = READ_ONCE(cnf->autoconf);
        array[DEVCONF_DAD_TRANSMITS] = READ_ONCE(cnf->dad_transmits);
        array[DEVCONF_RTR_SOLICITS] = READ_ONCE(cnf->rtr_solicits);
        array[DEVCONF_RTR_SOLICIT_INTERVAL] =
                jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_interval));
        array[DEVCONF_RTR_SOLICIT_MAX_INTERVAL] =
                jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_max_interval));
        array[DEVCONF_RTR_SOLICIT_DELAY] =
                jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_delay));
        array[DEVCONF_FORCE_MLD_VERSION] = READ_ONCE(cnf->force_mld_version);
        array[DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL] =
                jiffies_to_msecs(READ_ONCE(cnf->mldv1_unsolicited_report_interval));
        array[DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL] =
                jiffies_to_msecs(READ_ONCE(cnf->mldv2_unsolicited_report_interval));
        array[DEVCONF_USE_TEMPADDR] = READ_ONCE(cnf->use_tempaddr);
        array[DEVCONF_TEMP_VALID_LFT] = READ_ONCE(cnf->temp_valid_lft);
        array[DEVCONF_TEMP_PREFERED_LFT] = READ_ONCE(cnf->temp_prefered_lft);
        array[DEVCONF_REGEN_MAX_RETRY] = READ_ONCE(cnf->regen_max_retry);
        array[DEVCONF_MAX_DESYNC_FACTOR] = READ_ONCE(cnf->max_desync_factor);
        array[DEVCONF_MAX_ADDRESSES] = READ_ONCE(cnf->max_addresses);
        array[DEVCONF_ACCEPT_RA_DEFRTR] = READ_ONCE(cnf->accept_ra_defrtr);
        array[DEVCONF_RA_DEFRTR_METRIC] = READ_ONCE(cnf->ra_defrtr_metric);
        array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] =
                READ_ONCE(cnf->accept_ra_min_hop_limit);
        array[DEVCONF_ACCEPT_RA_PINFO] = READ_ONCE(cnf->accept_ra_pinfo);
#ifdef CONFIG_IPV6_ROUTER_PREF
        array[DEVCONF_ACCEPT_RA_RTR_PREF] = READ_ONCE(cnf->accept_ra_rtr_pref);
        array[DEVCONF_RTR_PROBE_INTERVAL] =
                jiffies_to_msecs(READ_ONCE(cnf->rtr_probe_interval));
#ifdef CONFIG_IPV6_ROUTE_INFO
        array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] =
                READ_ONCE(cnf->accept_ra_rt_info_min_plen);
        array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] =
                READ_ONCE(cnf->accept_ra_rt_info_max_plen);
#endif
#endif
        array[DEVCONF_PROXY_NDP] = READ_ONCE(cnf->proxy_ndp);
        array[DEVCONF_ACCEPT_SOURCE_ROUTE] =
                READ_ONCE(cnf->accept_source_route);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        array[DEVCONF_OPTIMISTIC_DAD] = READ_ONCE(cnf->optimistic_dad);
        array[DEVCONF_USE_OPTIMISTIC] = READ_ONCE(cnf->use_optimistic);
#endif
#ifdef CONFIG_IPV6_MROUTE
        array[DEVCONF_MC_FORWARDING] = atomic_read(&cnf->mc_forwarding);
#endif
        array[DEVCONF_DISABLE_IPV6] = READ_ONCE(cnf->disable_ipv6);
        array[DEVCONF_ACCEPT_DAD] = READ_ONCE(cnf->accept_dad);
        array[DEVCONF_FORCE_TLLAO] = READ_ONCE(cnf->force_tllao);
        array[DEVCONF_NDISC_NOTIFY] = READ_ONCE(cnf->ndisc_notify);
        array[DEVCONF_SUPPRESS_FRAG_NDISC] =
                READ_ONCE(cnf->suppress_frag_ndisc);
        array[DEVCONF_ACCEPT_RA_FROM_LOCAL] =
                READ_ONCE(cnf->accept_ra_from_local);
        array[DEVCONF_ACCEPT_RA_MTU] = READ_ONCE(cnf->accept_ra_mtu);
        array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] =
                READ_ONCE(cnf->ignore_routes_with_linkdown);
        /* we omit DEVCONF_STABLE_SECRET for now */
        array[DEVCONF_USE_OIF_ADDRS_ONLY] = READ_ONCE(cnf->use_oif_addrs_only);
        array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] =
                READ_ONCE(cnf->drop_unicast_in_l2_multicast);
        array[DEVCONF_DROP_UNSOLICITED_NA] = READ_ONCE(cnf->drop_unsolicited_na);
        array[DEVCONF_KEEP_ADDR_ON_DOWN] = READ_ONCE(cnf->keep_addr_on_down);
        array[DEVCONF_SEG6_ENABLED] = READ_ONCE(cnf->seg6_enabled);
#ifdef CONFIG_IPV6_SEG6_HMAC
        array[DEVCONF_SEG6_REQUIRE_HMAC] = READ_ONCE(cnf->seg6_require_hmac);
#endif
        array[DEVCONF_ENHANCED_DAD] = READ_ONCE(cnf->enhanced_dad);
        array[DEVCONF_ADDR_GEN_MODE] = READ_ONCE(cnf->addr_gen_mode);
        array[DEVCONF_DISABLE_POLICY] = READ_ONCE(cnf->disable_policy);
        array[DEVCONF_NDISC_TCLASS] = READ_ONCE(cnf->ndisc_tclass);
        array[DEVCONF_RPL_SEG_ENABLED] = READ_ONCE(cnf->rpl_seg_enabled);
        array[DEVCONF_IOAM6_ENABLED] = READ_ONCE(cnf->ioam6_enabled);
        array[DEVCONF_IOAM6_ID] = READ_ONCE(cnf->ioam6_id);
        array[DEVCONF_IOAM6_ID_WIDE] = READ_ONCE(cnf->ioam6_id_wide);
        array[DEVCONF_NDISC_EVICT_NOCARRIER] =
                READ_ONCE(cnf->ndisc_evict_nocarrier);
        array[DEVCONF_ACCEPT_UNTRACKED_NA] =
                READ_ONCE(cnf->accept_untracked_na);
        array[DEVCONF_ACCEPT_RA_MIN_LFT] = READ_ONCE(cnf->accept_ra_min_lft);
}

/*
 * Worst-case netlink payload for the per-device IPv6 attribute set:
 * one nla_total_size() term per attribute that may be emitted.
 */
static inline size_t inet6_ifla6_size(void)
{
        size_t total = 0;

        total += nla_total_size(4);                             /* IFLA_INET6_FLAGS */
        total += nla_total_size(sizeof(struct ifla_cacheinfo)); /* IFLA_INET6_CACHEINFO */
        total += nla_total_size(DEVCONF_MAX * 4);               /* IFLA_INET6_CONF */
        total += nla_total_size(IPSTATS_MIB_MAX * 8);           /* IFLA_INET6_STATS */
        total += nla_total_size(ICMP6_MIB_MAX * 8);             /* IFLA_INET6_ICMP6STATS */
        total += nla_total_size(sizeof(struct in6_addr));       /* IFLA_INET6_TOKEN */
        total += nla_total_size(1);                             /* IFLA_INET6_ADDR_GEN_MODE */
        total += nla_total_size(4);                             /* IFLA_INET6_RA_MTU */

        return total;
}

/*
 * Worst-case size of a complete IPv6 link message: the aligned
 * ifinfomsg header, the generic link attributes, and the nested
 * IFLA_PROTINFO block sized by inet6_ifla6_size().
 */
static inline size_t inet6_if_nlmsg_size(void)
{
        size_t total = NLMSG_ALIGN(sizeof(struct ifinfomsg));

        total += nla_total_size(IFNAMSIZ);              /* IFLA_IFNAME */
        total += nla_total_size(MAX_ADDR_LEN);          /* IFLA_ADDRESS */
        total += nla_total_size(4);                     /* IFLA_MTU */
        total += nla_total_size(4);                     /* IFLA_LINK */
        total += nla_total_size(1);                     /* IFLA_OPERSTATE */
        total += nla_total_size(inet6_ifla6_size());    /* IFLA_PROTINFO */

        return total;
}

/*
 * Copy the atomic counters in @mib into the @bytes-sized buffer @stats.
 *
 * Slot 0 receives the slot count (ICMP6_MIB_MAX), slots
 * 1..ICMP6_MIB_MAX-1 receive the counter values, and any space left
 * beyond ICMP6_MIB_MAX entries is zeroed. A buffer too small to hold
 * ICMP6_MIB_MAX entries trips BUG_ON().
 */
static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib,
                                        int bytes)
{
        int tail = bytes - sizeof(u64) * ICMP6_MIB_MAX;
        int idx = 1;

        BUG_ON(tail < 0);

        /* stats may not be u64-aligned, hence put_unaligned() throughout. */
        put_unaligned(ICMP6_MIB_MAX, &stats[0]);
        while (idx < ICMP6_MIB_MAX) {
                put_unaligned(atomic_long_read(&mib[idx]), &stats[idx]);
                idx++;
        }

        memset(&stats[ICMP6_MIB_MAX], 0, tail);
}

/*
 * Sum the per-CPU 64-bit counters in @mib over all possible CPUs and
 * store the totals in the @bytes-sized buffer @stats.
 *
 * Slot 0 receives the slot count (IPSTATS_MIB_MAX); slots
 * 1..IPSTATS_MIB_MAX-1 receive the summed values read through
 * snmp_get_cpu_field64() with @syncpoff. Space beyond IPSTATS_MIB_MAX
 * entries is zeroed; a buffer that is too small trips BUG_ON().
 */
static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib,
                                        int bytes, size_t syncpoff)
{
        u64 totals[IPSTATS_MIB_MAX];
        int tail = bytes - sizeof(u64) * IPSTATS_MIB_MAX;
        int cpu, field;

        BUG_ON(tail < 0);

        memset(totals, 0, sizeof(totals));
        totals[0] = IPSTATS_MIB_MAX;

        for_each_possible_cpu(cpu) {
                for (field = 1; field < IPSTATS_MIB_MAX; field++)
                        totals[field] += snmp_get_cpu_field64(mib, cpu, field,
                                                              syncpoff);
        }

        /* Accumulate locally first, then publish to the caller in one copy. */
        memcpy(stats, totals, sizeof(totals));
        memset(&stats[IPSTATS_MIB_MAX], 0, tail);
}

/*
 * Fill @stats (@bytes long) with the counters selected by @attrtype:
 * IFLA_INET6_STATS takes the per-CPU IPv6 MIB, IFLA_INET6_ICMP6STATS
 * takes the per-device ICMPv6 MIB. Any other @attrtype is a no-op.
 */
static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
                             int bytes)
{
        if (attrtype == IFLA_INET6_STATS) {
                __snmp6_fill_stats64(stats, idev->stats.ipv6, bytes,
                                     offsetof(struct ipstats_mib, syncp));
                return;
        }

        if (attrtype == IFLA_INET6_ICMP6STATS)
                __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs,
                                      bytes);
}

/*
 * Append the per-device IPv6 attributes of @idev to @skb.
 *
 * Emitted in order: IFLA_INET6_FLAGS, IFLA_INET6_CACHEINFO and
 * IFLA_INET6_CONF; then, unless @ext_filter_mask has
 * RTEXT_FILTER_SKIP_STATS set, IFLA_INET6_STATS, IFLA_INET6_ICMP6STATS,
 * IFLA_INET6_TOKEN, IFLA_INET6_ADDR_GEN_MODE and (only when non-zero)
 * IFLA_INET6_RA_MTU.
 *
 * Returns 0 on success, or -EMSGSIZE as soon as one attribute does not
 * fit into @skb.
 */
static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
                                  u32 ext_filter_mask)
{
        struct ifla_cacheinfo cache;
        struct nlattr *attr;
        u32 ra_mtu;

        if (nla_put_u32(skb, IFLA_INET6_FLAGS, READ_ONCE(idev->if_flags)))
                return -EMSGSIZE;

        cache.max_reasm_len = IPV6_MAXPLEN;
        cache.tstamp = cstamp_delta(READ_ONCE(idev->tstamp));
        cache.reachable_time = jiffies_to_msecs(idev->nd_parms->reachable_time);
        cache.retrans_time = jiffies_to_msecs(NEIGH_VAR(idev->nd_parms, RETRANS_TIME));
        if (nla_put(skb, IFLA_INET6_CACHEINFO, sizeof(cache), &cache))
                return -EMSGSIZE;

        /* Reserve the slot first, then let the devconf writer fill it in place. */
        attr = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32));
        if (!attr)
                return -EMSGSIZE;
        ipv6_store_devconf(&idev->cnf, nla_data(attr), nla_len(attr));

        /* XXX - MC not implemented */

        /* Everything below is skipped when the requester opted out of stats. */
        if (ext_filter_mask & RTEXT_FILTER_SKIP_STATS)
                return 0;

        attr = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64));
        if (!attr)
                return -EMSGSIZE;
        snmp6_fill_stats(nla_data(attr), idev, IFLA_INET6_STATS, nla_len(attr));

        attr = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64));
        if (!attr)
                return -EMSGSIZE;
        snmp6_fill_stats(nla_data(attr), idev, IFLA_INET6_ICMP6STATS, nla_len(attr));

        attr = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr));
        if (!attr)
                return -EMSGSIZE;
        /* The token is copied out under the idev read lock. */
        read_lock_bh(&idev->lock);
        memcpy(nla_data(attr), idev->token.s6_addr, nla_len(attr));
        read_unlock_bh(&idev->lock);

        if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE,
                       READ_ONCE(idev->cnf.addr_gen_mode)))
                return -EMSGSIZE;

        /* A zero ra_mtu is not reported at all. */
        ra_mtu = READ_ONCE(idev->ra_mtu);
        if (ra_mtu && nla_put_u32(skb, IFLA_INET6_RA_MTU, ra_mtu))
                return -EMSGSIZE;

        return 0;
}

/*
 * Size of the IPv6 link-AF attribute block for @dev: 0 when the device
 * has no inet6_dev, otherwise inet6_ifla6_size(). @ext_filter_mask is
 * not consulted.
 */
static size_t inet6_get_link_af_size(const struct net_device *dev,
                                     u32 ext_filter_mask)
{
        return __in6_dev_get(dev) ? inet6_ifla6_size() : 0;
}

/*
 * Fill the IPv6 link-AF attributes of @dev into @skb.
 *
 * Returns -ENODATA when @dev has no inet6_dev, -EMSGSIZE when
 * inet6_fill_ifla6_attrs() reports a failure, and 0 otherwise.
 */
static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
                              u32 ext_filter_mask)
{
        struct inet6_dev *idev;

        idev = __in6_dev_get(dev);
        if (!idev)
                return -ENODATA;

        return inet6_fill_ifla6_attrs(skb, idev, ext_filter_mask) < 0 ?
               -EMSGSIZE : 0;
}

/*
 * Install a new address token on @idev and zero the lifetimes of the
 * addresses that are flagged as tokenized.
 *
 * RTNL is asserted on entry. The request is rejected with -EINVAL when
 * @token is NULL, when the device has IFF_LOOPBACK or IFF_NOARP set,
 * when ipv6_accept_ra() is false for @idev, or when cnf.rtr_solicits
 * is 0; for all but the NULL case a reason is recorded in @extack.
 *
 * Returns 0 on success.
 */
static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token,
                             struct netlink_ext_ack *extack)
{
        struct inet6_ifaddr *ifp;
        struct net_device *dev = idev->dev;
        bool clear_token, update_rs = false;
        struct in6_addr ll_addr;

        ASSERT_RTNL();

        if (!token)
                return -EINVAL;

        if (dev->flags & IFF_LOOPBACK) {
                NL_SET_ERR_MSG_MOD(extack, "Device is loopback");
                return -EINVAL;
        }

        if (dev->flags & IFF_NOARP) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Device does not do neighbour discovery");
                return -EINVAL;
        }

        if (!ipv6_accept_ra(idev)) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Router advertisement is disabled on device");
                return -EINVAL;
        }

        if (READ_ONCE(idev->cnf.rtr_solicits) == 0) {
                NL_SET_ERR_MSG(extack,
                               "Router solicitation is disabled on device");
                return -EINVAL;
        }

        write_lock_bh(&idev->lock);

        /* Only the low-order 8 bytes of the 16-byte address are taken. */
        BUILD_BUG_ON(sizeof(token->s6_addr) != 16);
        memcpy(idev->token.s6_addr + 8, token->s6_addr + 8, 8);

        write_unlock_bh(&idev->lock);

        /* An all-zero token skips the router solicitation below. */
        clear_token = ipv6_addr_any(token);
        if (clear_token)
                goto update_lft;

        if (!idev->dead && (idev->if_flags & IF_READY) &&
            !ipv6_get_lladdr(dev, &ll_addr, IFA_F_TENTATIVE |
                             IFA_F_OPTIMISTIC)) {
                /* If we're not ready, then normal ifup will take care
                 * of this. Otherwise, we need to request our rs here.
                 */
                ndisc_send_rs(dev, &ll_addr, &in6addr_linklocal_allrouters);
                update_rs = true;
        }

update_lft:
        write_lock_bh(&idev->lock);

        /* An RS was just sent: record it and (re)arm the RS timer. */
        if (update_rs) {
                idev->if_flags |= IF_RS_SENT;
                idev->rs_interval = rfc3315_s14_backoff_init(
                        READ_ONCE(idev->cnf.rtr_solicit_interval));
                idev->rs_probes = 1;
                addrconf_mod_rs_timer(idev, idev->rs_interval);
        }

        /* Well, that's kinda nasty ... */
        /* Zero both lifetimes of every address flagged as tokenized. */
        list_for_each_entry(ifp, &idev->addr_list, if_list) {
                spin_lock(&ifp->lock);
                if (ifp->tokenized) {
                        ifp->valid_lft = 0;
                        ifp->prefered_lft = 0;
                }
                spin_unlock(&ifp->lock);
        }

        write_unlock_bh(&idev->lock);
        /* Announce the change, then run address verification
         * (presumably this acts on the zeroed lifetimes - TODO confirm).
         */
        inet6_ifinfo_notify(RTM_NEWLINK, idev);
        addrconf_verify_rtnl(dev_net(dev));
        return 0;
}

/*
 * Netlink attribute policy used by inet6_validate_link_af() when parsing
 * the nested IPv6 link-AF attributes:
 *  - IFLA_INET6_ADDR_GEN_MODE is typed as a u8;
 *  - IFLA_INET6_TOKEN has no .type, only a .len of sizeof(struct in6_addr);
 *  - IFLA_INET6_RA_MTU is rejected with an explanatory message.
 */
static const struct nla_policy inet6_af_policy[IFLA_INET6_MAX + 1] = {
        [IFLA_INET6_ADDR_GEN_MODE]        = { .type = NLA_U8 },
        [IFLA_INET6_TOKEN]                = { .len = sizeof(struct in6_addr) },
        [IFLA_INET6_RA_MTU]                = { .type = NLA_REJECT,
                                            .reject_message =
                                                "IFLA_INET6_RA_MTU can not be set" },
};

/*
 * Check that @mode is one of the known address generation modes.
 *
 * Returns 1 for a recognised mode and -EINVAL for anything else;
 * callers test the result with "< 0".
 */
static int check_addr_gen_mode(int mode)
{
        switch (mode) {
        case IN6_ADDR_GEN_MODE_EUI64:
        case IN6_ADDR_GEN_MODE_NONE:
        case IN6_ADDR_GEN_MODE_STABLE_PRIVACY:
        case IN6_ADDR_GEN_MODE_RANDOM:
                return 1;
        default:
                return -EINVAL;
        }
}

/*
 * Check whether @mode may be used on @idev with respect to the stable
 * secret.
 *
 * Only IN6_ADDR_GEN_MODE_STABLE_PRIVACY is constrained: it needs an
 * initialized stable secret either on the device itself or in the
 * namespace's default devconf. Returns 1 when acceptable and -EINVAL
 * when no secret is available.
 */
static int check_stable_privacy(struct inet6_dev *idev, struct net *net,
                                int mode)
{
        if (mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY)
                return 1;

        if (idev->cnf.stable_secret.initialized)
                return 1;

        if (net->ipv6.devconf_dflt->stable_secret.initialized)
                return 1;

        return -EINVAL;
}

/*
 * Validate a nested IPv6 link-AF attribute block before it is applied.
 *
 * @dev may be NULL; when it is set it must have an inet6_dev, otherwise
 * -EAFNOSUPPORT is returned. The block is parsed against inet6_af_policy
 * and must carry at least one of IFLA_INET6_TOKEN and
 * IFLA_INET6_ADDR_GEN_MODE. A supplied generation mode must be a known
 * one and, when @dev is set, must pass check_stable_privacy().
 *
 * Returns 0 when acceptable, the parser's error, or -EINVAL.
 */
static int inet6_validate_link_af(const struct net_device *dev,
                                  const struct nlattr *nla,
                                  struct netlink_ext_ack *extack)
{
        struct nlattr *attrs[IFLA_INET6_MAX + 1];
        struct inet6_dev *idev = NULL;
        int rc;
        u8 mode;

        if (dev) {
                idev = __in6_dev_get(dev);
                if (!idev)
                        return -EAFNOSUPPORT;
        }

        rc = nla_parse_nested_deprecated(attrs, IFLA_INET6_MAX, nla,
                                         inet6_af_policy, extack);
        if (rc)
                return rc;

        /* Without a generation mode, the token is the only thing to set. */
        if (!attrs[IFLA_INET6_ADDR_GEN_MODE])
                return attrs[IFLA_INET6_TOKEN] ? 0 : -EINVAL;

        mode = nla_get_u8(attrs[IFLA_INET6_ADDR_GEN_MODE]);
        if (check_addr_gen_mode(mode) < 0)
                return -EINVAL;
        if (dev && check_stable_privacy(idev, dev_net(dev), mode) < 0)
                return -EINVAL;

        return 0;
}

/*
 * Apply a nested IPv6 link-AF attribute block to @dev.
 *
 * The block is re-parsed without a policy here. A present
 * IFLA_INET6_TOKEN is handed to inet6_set_iftoken(), whose error (if
 * any) is returned as is; a present IFLA_INET6_ADDR_GEN_MODE is stored
 * into the device's addr_gen_mode.
 *
 * Returns 0 on success, -EAFNOSUPPORT when @dev has no inet6_dev, or
 * -EINVAL when parsing fails.
 */
static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla,
                             struct netlink_ext_ack *extack)
{
        struct nlattr *attrs[IFLA_INET6_MAX + 1];
        struct inet6_dev *idev;

        idev = __in6_dev_get(dev);
        if (!idev)
                return -EAFNOSUPPORT;

        if (nla_parse_nested_deprecated(attrs, IFLA_INET6_MAX, nla, NULL, NULL) < 0)
                return -EINVAL;

        if (attrs[IFLA_INET6_TOKEN]) {
                int rc = inet6_set_iftoken(idev,
                                           nla_data(attrs[IFLA_INET6_TOKEN]),
                                           extack);

                if (rc)
                        return rc;
        }

        if (attrs[IFLA_INET6_ADDR_GEN_MODE]) {
                u8 gen_mode = nla_get_u8(attrs[IFLA_INET6_ADDR_GEN_MODE]);

                WRITE_ONCE(idev->cnf.addr_gen_mode, gen_mode);
        }

        return 0;
}

/*
 * Build one IPv6 link message for @idev in @skb.
 *
 * The message has an ifinfomsg header (family AF_INET6), then
 * IFLA_IFNAME, IFLA_ADDRESS (only when the device has a non-zero address
 * length), IFLA_MTU, IFLA_LINK (only when iflink differs from ifindex),
 * IFLA_OPERSTATE (IF_OPER_DOWN unless the device is running), and a
 * nested IFLA_PROTINFO block filled by inet6_fill_ifla6_attrs().
 *
 * Returns 0 on success. On any failure the partially built message is
 * cancelled and -EMSGSIZE is returned.
 */
static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
                             u32 portid, u32 seq, int event, unsigned int flags)
{
        struct net_device *dev = idev->dev;
        struct ifinfomsg *ifm;
        struct nlmsghdr *nlh;
        int ifindex, iflink;
        void *nest;

        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
        if (!nlh)
                return -EMSGSIZE;

        ifm = nlmsg_data(nlh);
        ifm->ifi_family = AF_INET6;
        ifm->__ifi_pad = 0;
        ifm->ifi_type = dev->type;
        ifindex = READ_ONCE(dev->ifindex);
        ifm->ifi_index = ifindex;
        ifm->ifi_flags = dev_get_flags(dev);
        ifm->ifi_change = 0;

        iflink = dev_get_iflink(dev);

        if (nla_put_string(skb, IFLA_IFNAME, dev->name))
                goto cancel;

        if (dev->addr_len &&
            nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr))
                goto cancel;

        if (nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)))
                goto cancel;

        if (ifindex != iflink && nla_put_u32(skb, IFLA_LINK, iflink))
                goto cancel;

        if (nla_put_u8(skb, IFLA_OPERSTATE,
                       netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN))
                goto cancel;

        nest = nla_nest_start_noflag(skb, IFLA_PROTINFO);
        if (!nest)
                goto cancel;

        /* Filter mask 0: stats are always included in these messages. */
        if (inet6_fill_ifla6_attrs(skb, idev, 0) < 0)
                goto cancel;

        nla_nest_end(skb, nest);
        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh,
                                   struct netlink_ext_ack *extack)
{
        struct ifinfomsg *ifm;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid header for link dump request");
                return -EINVAL;
        }

        if (nlmsg_attrlen(nlh, sizeof(*ifm))) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid data after header");
                return -EINVAL;
        }

        ifm = nlmsg_data(nlh);
        if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
            ifm->ifi_change || ifm->ifi_index) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for dump request");
                return -EINVAL;
        }

        return 0;
}

/*
 * Netlink dump callback: emit one RTM_NEWLINK message (flagged
 * NLM_F_MULTI) for every device in the requesting socket's namespace
 * that has an inet6_dev.
 *
 * The iteration cursor is an ifindex stored in cb->ctx (presumably so a
 * later invocation resumes where this one stopped - confirm against
 * for_each_netdev_dump()).
 *
 * Returns the header-validation error for strict-checking requests, the
 * error from inet6_fill_ifinfo() that stopped the walk, or the result of
 * the last fill (0 when nothing was emitted).
 */
static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        /* Typed overlay on the callback's scratch area. */
        struct {
                unsigned long ifindex;
        } *ctx = (void *)cb->ctx;
        struct net_device *dev;
        struct inet6_dev *idev;
        int err;

        /* only requests using strict checking can pass data to
         * influence the dump
         */
        if (cb->strict_check) {
                err = inet6_valid_dump_ifinfo(cb->nlh, cb->extack);

                if (err < 0)
                        return err;
        }

        err = 0;
        /* The device walk and the inet6_dev lookups run under RCU. */
        rcu_read_lock();
        for_each_netdev_dump(net, dev, ctx->ifindex) {
                idev = __in6_dev_get(dev);
                /* Devices without IPv6 state are silently skipped. */
                if (!idev)
                        continue;
                err = inet6_fill_ifinfo(skb, idev,
                                        NETLINK_CB(cb->skb).portid,
                                        cb->nlh->nlmsg_seq,
                                        RTM_NEWLINK, NLM_F_MULTI);
                /* Stop at the first device that could not be added. */
                if (err < 0)
                        break;
        }
        rcu_read_unlock();

        return err;
}

/*
 * Send an IPv6 link notification of type @event for @idev to the
 * RTNLGRP_IPV6_IFINFO multicast group.
 *
 * The message is allocated with GFP_ATOMIC. If allocation fails
 * (-ENOBUFS) or the message cannot be built, the error is recorded on
 * the group's listeners via rtnl_set_sk_err() instead.
 */
void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
{
        struct net *net = dev_net(idev->dev);
        struct sk_buff *skb;
        int err;

        skb = nlmsg_new(inet6_if_nlmsg_size(), GFP_ATOMIC);
        if (!skb) {
                rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, -ENOBUFS);
                return;
        }

        err = inet6_fill_ifinfo(skb, idev, 0, 0, event, 0);
        if (err >= 0) {
                rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFINFO, NULL, GFP_ATOMIC);
                return;
        }

        /* -EMSGSIZE implies BUG in inet6_if_nlmsg_size() */
        WARN_ON(err == -EMSGSIZE);
        kfree_skb(skb);
        rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err);
}

/*
 * Worst-case size of a prefix notification: the aligned prefixmsg header
 * plus the PREFIX_ADDRESS and PREFIX_CACHEINFO attributes.
 */
static inline size_t inet6_prefix_nlmsg_size(void)
{
        size_t total = NLMSG_ALIGN(sizeof(struct prefixmsg));

        total += nla_total_size(sizeof(struct in6_addr));          /* PREFIX_ADDRESS */
        total += nla_total_size(sizeof(struct prefix_cacheinfo)); /* PREFIX_CACHEINFO */

        return total;
}

/*
 * Build one prefix message in @skb from @pinfo as seen on @idev.
 *
 * The prefixmsg header carries the device's ifindex plus the prefix
 * length, type and flags; PREFIX_ADDRESS carries the prefix itself and
 * PREFIX_CACHEINFO the preferred/valid times converted with ntohl().
 *
 * Returns 0 on success. On failure the partially built message is
 * cancelled and -EMSGSIZE is returned.
 */
static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
                             struct prefix_info *pinfo, u32 portid, u32 seq,
                             int event, unsigned int flags)
{
        struct prefix_cacheinfo lifetimes;
        struct prefixmsg *hdr;
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags);
        if (!nlh)
                return -EMSGSIZE;

        hdr = nlmsg_data(nlh);
        hdr->prefix_family = AF_INET6;
        hdr->prefix_ifindex = idev->dev->ifindex;
        hdr->prefix_len = pinfo->prefix_len;
        hdr->prefix_type = pinfo->type;
        hdr->prefix_flags = pinfo->flags;
        /* Explicitly clear the padding members of the header. */
        hdr->prefix_pad1 = 0;
        hdr->prefix_pad2 = 0;
        hdr->prefix_pad3 = 0;

        if (nla_put(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix))
                goto cancel;

        lifetimes.preferred_time = ntohl(pinfo->prefered);
        lifetimes.valid_time = ntohl(pinfo->valid);
        if (nla_put(skb, PREFIX_CACHEINFO, sizeof(lifetimes), &lifetimes))
                goto cancel;

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Send a prefix notification of type @event for @pinfo on @idev to the
 * RTNLGRP_IPV6_PREFIX multicast group.
 *
 * The message is allocated with GFP_ATOMIC. If allocation fails
 * (-ENOBUFS) or the message cannot be built, the error is recorded on
 * the group's listeners via rtnl_set_sk_err() instead.
 */
static void inet6_prefix_notify(int event, struct inet6_dev *idev,
                         struct prefix_info *pinfo)
{
        struct net *net = dev_net(idev->dev);
        struct sk_buff *skb;
        int err;

        skb = nlmsg_new(inet6_prefix_nlmsg_size(), GFP_ATOMIC);
        if (!skb) {
                rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, -ENOBUFS);
                return;
        }

        err = inet6_fill_prefix(skb, idev, pinfo, 0, 0, event, 0);
        if (err >= 0) {
                rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC);
                return;
        }

        /* -EMSGSIZE implies BUG in inet6_prefix_nlmsg_size() */
        WARN_ON(err == -EMSGSIZE);
        kfree_skb(skb);
        rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err);
}

/*
 * Announce an address event for @ifp and apply the matching routing and
 * anycast side effects.
 *
 * An @event of 0 is announced as RTM_NEWADDR but skips both the RTNL
 * assertion and the per-event work in the switch below. The namespace's
 * dev_addr_genid counter is incremented in every case.
 */
static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
{
        struct net *net = dev_net(ifp->idev->dev);

        if (event)
                ASSERT_RTNL();

        /* "event ? : X" yields event when it is non-zero, X otherwise. */
        inet6_ifa_notify(event ? : RTM_NEWADDR, ifp);

        switch (event) {
        case RTM_NEWADDR:
                /*
                 * If the address was optimistic we inserted the route at the
                 * start of our DAD process, so we don't need to do it again.
                 * If the device was taken down in the middle of the DAD
                 * cycle there is a race where we could get here without a
                 * host route, so nothing to insert. That will be fixed when
                 * the device is brought up.
                 */
                if (ifp->rt && !rcu_access_pointer(ifp->rt->fib6_node)) {
                        ip6_ins_rt(net, ifp->rt);
                } else if (!ifp->rt && (ifp->idev->dev->flags & IFF_UP)) {
                        pr_warn("BUG: Address %pI6c on device %s is missing its host route.\n",
                                &ifp->addr, ifp->idev->dev->name);
                }

                if (ifp->idev->cnf.forwarding)
                        addrconf_join_anycast(ifp);
                /* A configured peer address gets its own /128 route. */
                if (!ipv6_addr_any(&ifp->peer_addr))
                        addrconf_prefix_route(&ifp->peer_addr, 128,
                                              ifp->rt_priority, ifp->idev->dev,
                                              0, 0, GFP_ATOMIC);
                break;
        case RTM_DELADDR:
                if (ifp->idev->cnf.forwarding)
                        addrconf_leave_anycast(ifp);
                addrconf_leave_solict(ifp->idev, &ifp->addr);
                /* Undo the /128 peer route added for RTM_NEWADDR, if found. */
                if (!ipv6_addr_any(&ifp->peer_addr)) {
                        struct fib6_info *rt;

                        rt = addrconf_get_prefix_route(&ifp->peer_addr, 128,
                                                       ifp->idev->dev, 0, 0,
                                                       false);
                        if (rt)
                                ip6_del_rt(net, rt, false);
                }
                /* Remove the host route and clear the now-stale pointer. */
                if (ifp->rt) {
                        ip6_del_rt(net, ifp->rt, false);
                        ifp->rt = NULL;
                }
                rt_genid_bump_ipv6(net);
                break;
        }
        atomic_inc(&net->ipv6.dev_addr_genid);
}

static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
{
        if (likely(ifp->idev->dead == 0))
                __ipv6_ifa_notify(event, ifp);
}

#ifdef CONFIG_SYSCTL

/*
 * sysctl handler for the "forwarding" knob.
 *
 * The value is parsed into a local copy through a shadow ctl_table, so
 * that ctl->data (the live forwarding setting) is not touched by
 * proc_dointvec(); the actual update is left to
 * addrconf_fixup_forwarding(). On writes the helper's return value
 * replaces the one from proc_dointvec(). Any non-zero result restores
 * *ppos to its value on entry.
 */
static int addrconf_sysctl_forward(struct ctl_table *ctl, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int *target = ctl->data;
        int staged = *target;
        loff_t saved_pos = *ppos;
        struct ctl_table shadow = *ctl;
        int rc;

        /* Parse into the staged copy, never into the live value. */
        shadow.data = &staged;
        rc = proc_dointvec(&shadow, write, buffer, lenp, ppos);

        if (write)
                rc = addrconf_fixup_forwarding(ctl, target, staged);

        if (rc)
                *ppos = saved_pos;

        return rc;
}

/*
 * sysctl handler for the IPv6 MTU knob.
 *
 * Delegates to proc_dointvec_minmax() through a shadow ctl_table whose
 * lower bound is IPV6_MIN_MTU and whose upper bound is the device MTU
 * when ctl->extra1 carries an inet6_dev (no upper bound otherwise).
 */
static int addrconf_sysctl_mtu(struct ctl_table *ctl, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        struct inet6_dev *idev = ctl->extra1;
        struct ctl_table bounded = *ctl;
        int mtu_floor = IPV6_MIN_MTU;

        bounded.extra1 = &mtu_floor;
        if (idev)
                bounded.extra2 = &idev->dev->mtu;
        else
                bounded.extra2 = NULL;

        return proc_dointvec_minmax(&bounded, write, buffer, lenp, ppos);
}

/*
 * Replay a device event into addrconf_notify() after the device's
 * disable_ipv6 setting changed: NETDEV_DOWN when IPv6 is now disabled,
 * NETDEV_UP when it is enabled. A NULL @idev or one without a device is
 * ignored.
 */
static void dev_disable_change(struct inet6_dev *idev)
{
        struct netdev_notifier_info info;

        if (!idev || !idev->dev)
                return;

        netdev_notifier_info_init(&info, idev->dev);
        addrconf_notify(NULL,
                        idev->cnf.disable_ipv6 ? NETDEV_DOWN : NETDEV_UP,
                        &info);
}

/*
 * Store @newf into disable_ipv6 of every IPv6-enabled device in @net and
 * call dev_disable_change() for each device whose enabled/disabled
 * state actually flipped.
 */
static void addrconf_disable_change(struct net *net, __s32 newf)
{
        struct net_device *dev;

        for_each_netdev(net, dev) {
                struct inet6_dev *idev = __in6_dev_get(dev);
                bool flipped;

                if (!idev)
                        continue;

                /* Compare truth values only: 1 -> 2 is not a state change. */
                flipped = !idev->cnf.disable_ipv6 != !newf;

                WRITE_ONCE(idev->cnf.disable_ipv6, newf);
                if (flipped)
                        dev_disable_change(idev);
        }
}

/*
 * Store a new disable_ipv6 value @newf into *@p and propagate it.
 *
 * @p identifies which setting is being written:
 *  - the namespace default: the value is stored and nothing else happens
 *    (no RTNL needed);
 *  - the namespace "all" entry: the value is also copied to the default
 *    and pushed to every device through addrconf_disable_change();
 *  - anything else: treated as a per-device entry whose inet6_dev is in
 *    table->extra1; the device is notified only when the truth value of
 *    the setting changed.
 *
 * Returns 0, or the result of restart_syscall() when RTNL could not be
 * taken without blocking.
 */
static int addrconf_disable_ipv6(const struct ctl_table *table, int *p, int newf)
{
        struct net *net = (struct net *)table->extra2;
        int old;

        if (p == &net->ipv6.devconf_dflt->disable_ipv6) {
                WRITE_ONCE(*p, newf);
                return 0;
        }

        /* Do not block on RTNL; have the syscall restarted instead. */
        if (!rtnl_trylock())
                return restart_syscall();

        old = *p;
        WRITE_ONCE(*p, newf);

        if (p == &net->ipv6.devconf_all->disable_ipv6) {
                WRITE_ONCE(net->ipv6.devconf_dflt->disable_ipv6, newf);
                addrconf_disable_change(net, newf);
        } else if ((!newf) ^ (!old))
                dev_disable_change((struct inet6_dev *)table->extra1);

        rtnl_unlock();
        return 0;
}

/*
 * sysctl handler for the "disable_ipv6" knob.
 *
 * The value is parsed into a local copy through a shadow ctl_table, so
 * that ctl->data (the live disable_ipv6 setting) is not touched by
 * proc_dointvec(); the actual update is left to addrconf_disable_ipv6().
 * On writes the helper's return value replaces the one from
 * proc_dointvec(). Any non-zero result restores *ppos to its value on
 * entry.
 */
static int addrconf_sysctl_disable(struct ctl_table *ctl, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int *target = ctl->data;
        int staged = *target;
        loff_t saved_pos = *ppos;
        struct ctl_table shadow = *ctl;
        int rc;

        /* Parse into the staged copy, never into the live value. */
        shadow.data = &staged;
        rc = proc_dointvec(&shadow, write, buffer, lenp, ppos);

        if (write)
                rc = addrconf_disable_ipv6(ctl, target, staged);

        if (rc)
                *ppos = saved_pos;

        return rc;
}

/*
 * sysctl handler for the "proxy_ndp" knob.
 *
 * The value is read or written in place by proc_dointvec(). When a
 * write changed it, an RTM_NEWNETCONF / NETCONFA_PROXY_NEIGH
 * notification is sent under RTNL for the default entry, the "all"
 * entry, or the specific device found in ctl->extra1, depending on
 * which setting ctl->data points at.
 *
 * Returns the proc_dointvec() result, or the result of restart_syscall()
 * when RTNL could not be taken without blocking.
 */
static int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int *valp = ctl->data;
        int ret;
        int old, new;

        /* Sample the value on both sides of the handler to detect a change. */
        old = *valp;
        ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
        new = *valp;

        if (write && old != new) {
                struct net *net = ctl->extra2;

                /* NOTE(review): the new value is already stored at this
                 * point, so a restarted write would presumably see
                 * old == new and skip the notification - confirm.
                 */
                if (!rtnl_trylock())
                        return restart_syscall();

                if (valp == &net->ipv6.devconf_dflt->proxy_ndp)
                        inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                                     NETCONFA_PROXY_NEIGH,
                                                     NETCONFA_IFINDEX_DEFAULT,
                                                     net->ipv6.devconf_dflt);
                else if (valp == &net->ipv6.devconf_all->proxy_ndp)
                        inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                                     NETCONFA_PROXY_NEIGH,
                                                     NETCONFA_IFINDEX_ALL,
                                                     net->ipv6.devconf_all);
                else {
                        /* Per-device entry: extra1 carries the inet6_dev. */
                        struct inet6_dev *idev = ctl->extra1;

                        inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                                     NETCONFA_PROXY_NEIGH,
                                                     idev->dev->ifindex,
                                                     &idev->cnf);
                }
                rtnl_unlock();
        }

        return ret;
}

/*
 * sysctl handler for the "addr_gen_mode" knob.
 *
 * Runs entirely under RTNL (taken with rtnl_trylock(); the syscall is
 * restarted when that fails). The value is parsed into a local copy
 * through a temporary ctl_table. On a write:
 *  - an unknown mode is rejected with -EINVAL;
 *  - for a per-device entry (ctl->extra1 set) the mode must also pass
 *    check_stable_privacy(); if it differs from the current one it is
 *    stored and addrconf_init_auto_addrs() is run for the device;
 *  - for the "all" entry the mode is copied to the namespace default and
 *    applied the same way to every device whose mode differs;
 *  - finally the value is stored through ctl->data.
 *
 * Returns 0 on success or a negative error.
 */
static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
                                         void *buffer, size_t *lenp,
                                         loff_t *ppos)
{
        int ret = 0;
        u32 new_val;
        struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1;
        struct net *net = (struct net *)ctl->extra2;
        /* Scratch table so proc_douintvec() works on new_val only. */
        struct ctl_table tmp = {
                .data = &new_val,
                .maxlen = sizeof(new_val),
                .mode = ctl->mode,
        };

        if (!rtnl_trylock())
                return restart_syscall();

        /* Seed with the current value so that reads report it. */
        new_val = *((u32 *)ctl->data);

        ret = proc_douintvec(&tmp, write, buffer, lenp, ppos);
        if (ret != 0)
                goto out;

        if (write) {
                if (check_addr_gen_mode(new_val) < 0) {
                        ret = -EINVAL;
                        goto out;
                }

                if (idev) {
                        if (check_stable_privacy(idev, net, new_val) < 0) {
                                ret = -EINVAL;
                                goto out;
                        }

                        if (idev->cnf.addr_gen_mode != new_val) {
                                WRITE_ONCE(idev->cnf.addr_gen_mode, new_val);
                                addrconf_init_auto_addrs(idev->dev);
                        }
                } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) {
                        struct net_device *dev;

                        /* "all": update the default, then every device
                         * whose mode differs.
                         */
                        WRITE_ONCE(net->ipv6.devconf_dflt->addr_gen_mode, new_val);
                        for_each_netdev(net, dev) {
                                idev = __in6_dev_get(dev);
                                if (idev &&
                                    idev->cnf.addr_gen_mode != new_val) {
                                        WRITE_ONCE(idev->cnf.addr_gen_mode,
                                                  new_val);
                                        addrconf_init_auto_addrs(idev->dev);
                                }
                        }
                }

                /* Store into the entry that was written to. */
                WRITE_ONCE(*((u32 *)ctl->data), new_val);
        }

out:
        rtnl_unlock();

        return ret;
}

/*
 * sysctl handler for the "stable_secret" knob.
 *
 * The secret is exchanged with user space as a textual IPv6 address.
 * Access through the namespace "all" entry always fails with -EIO, as
 * does reading a secret that was never initialized and writing text
 * that in6_pton() cannot parse.
 *
 * A successful write stores the secret, marks it initialized and
 * switches addr_gen_mode to IN6_ADDR_GEN_MODE_STABLE_PRIVACY: on every
 * IPv6-enabled device when the default entry was written, otherwise on
 * the device found in ctl->extra1.
 *
 * Runs under RTNL (taken with rtnl_trylock(); the syscall is restarted
 * when that fails). Returns 0 on success or a negative error.
 */
static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
                                         void *buffer, size_t *lenp,
                                         loff_t *ppos)
{
        int err;
        struct in6_addr addr;
        char str[IPV6_MAX_STRLEN];
        struct ctl_table lctl = *ctl;
        struct net *net = ctl->extra2;
        struct ipv6_stable_secret *secret = ctl->data;

        if (&net->ipv6.devconf_all->stable_secret == ctl->data)
                return -EIO;

        /* Let proc_dostring() operate on the local text buffer. */
        lctl.maxlen = IPV6_MAX_STRLEN;
        lctl.data = str;

        if (!rtnl_trylock())
                return restart_syscall();

        if (!write && !secret->initialized) {
                err = -EIO;
                goto out;
        }

        /* Pre-fill the buffer with the current secret in text form. */
        err = snprintf(str, sizeof(str), "%pI6", &secret->secret);
        if (err >= sizeof(str)) {
                err = -EIO;
                goto out;
        }

        err = proc_dostring(&lctl, write, buffer, lenp, ppos);
        if (err || !write)
                goto out;

        /* Parse into a temporary so a bad string leaves the secret intact. */
        if (in6_pton(str, -1, addr.in6_u.u6_addr8, -1, NULL) != 1) {
                err = -EIO;
                goto out;
        }

        secret->initialized = true;
        secret->secret = addr;

        if (&net->ipv6.devconf_dflt->stable_secret == ctl->data) {
                struct net_device *dev;

                for_each_netdev(net, dev) {
                        struct inet6_dev *idev = __in6_dev_get(dev);

                        if (idev) {
                                WRITE_ONCE(idev->cnf.addr_gen_mode,
                                           IN6_ADDR_GEN_MODE_STABLE_PRIVACY);
                        }
                }
        } else {
                /* Per-device entry: extra1 carries the inet6_dev. */
                struct inet6_dev *idev = ctl->extra1;

                WRITE_ONCE(idev->cnf.addr_gen_mode,
                           IN6_ADDR_GEN_MODE_STABLE_PRIVACY);
        }

out:
        rtnl_unlock();

        return err;
}

/*
 * sysctl handler for the "ignore_routes_with_linkdown" knob.
 *
 * The value is parsed into a local copy through a shadow ctl_table, so
 * that ctl->data (the live ignore_routes_with_linkdown setting) is not
 * touched by proc_dointvec(); the actual update is left to
 * addrconf_fixup_linkdown(). On writes the helper's return value
 * replaces the one from proc_dointvec(). Any non-zero result restores
 * *ppos to its value on entry.
 */
static
int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl,
                                                int write, void *buffer,
                                                size_t *lenp,
                                                loff_t *ppos)
{
        int *target = ctl->data;
        int staged = *target;
        loff_t saved_pos = *ppos;
        struct ctl_table shadow = *ctl;
        int rc;

        /* Parse into the staged copy, never into the live value. */
        shadow.data = &staged;
        rc = proc_dointvec(&shadow, write, buffer, lenp, ppos);

        if (write)
                rc = addrconf_fixup_linkdown(ctl, target, staged);

        if (rc)
                *ppos = saved_pos;

        return rc;
}

/*
 * Set DST_NOPOLICY on @rt when @action is non-zero, clear it otherwise.
 * A NULL @rt is silently ignored.
 */
static
void addrconf_set_nopolicy(struct rt6_info *rt, int action)
{
        if (!rt)
                return;

        if (action) {
                rt->dst.flags |= DST_NOPOLICY;
                return;
        }

        rt->dst.flags &= ~DST_NOPOLICY;
}

/*
 * Propagate a disable_policy setting to the routes attached to the
 * addresses of @idev.
 *
 * For every address on idev->addr_list that has a route (ifa->rt), the
 * route's dst_nopolicy field is set from @val, and DST_NOPOLICY is set
 * or cleared on each per-CPU rt6_info found in the route's nexthop.
 */
static
void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
{
        struct inet6_ifaddr *ifa;

        /* Walk the address list with idev->lock read-held, BHs disabled. */
        read_lock_bh(&idev->lock);
        list_for_each_entry(ifa, &idev->addr_list, if_list) {
                /* ifa->rt is inspected and updated under the address lock. */
                spin_lock(&ifa->lock);
                if (ifa->rt) {
                        /* host routes only use builtin fib6_nh */
                        struct fib6_nh *nh = ifa->rt->fib6_nh;
                        int cpu;

                        rcu_read_lock();
                        ifa->rt->dst_nopolicy = val ? true : false;
                        /* Per-CPU route slots may not exist at all. */
                        if (nh->rt6i_pcpu) {
                                for_each_possible_cpu(cpu) {
                                        struct rt6_info **rtp;

                                        /* An empty slot is handled by the helper. */
                                        rtp = per_cpu_ptr(nh->rt6i_pcpu, cpu);
                                        addrconf_set_nopolicy(*rtp, val);
                                }
                        }
                        rcu_read_unlock();
                }
                spin_unlock(&ifa->lock);
        }
        read_unlock_bh(&idev->lock);
}

/*
 * Store a new disable_policy value in *@valp and push it to the routes
 * of the affected devices.
 *
 * The "default" entry is only stored.  For the "all" entry every device
 * of the namespace that has an inet6_dev is updated; for any other entry
 * only the inet6_dev carried in ctl->extra1 is updated.  Everything but
 * the "default" case runs under the RTNL lock.
 *
 * Returns 0, or the value of restart_syscall() when the RTNL lock could
 * not be taken.
 */
static
int addrconf_disable_policy(const struct ctl_table *ctl, int *valp, int val)
{
        struct net *net = (struct net *)ctl->extra2;
        struct net_device *dev;

        /* Nothing to propagate for the defaults: just record the value. */
        if (valp == &net->ipv6.devconf_dflt->disable_policy) {
                WRITE_ONCE(*valp, val);
                return 0;
        }

        if (!rtnl_trylock())
                return restart_syscall();

        WRITE_ONCE(*valp, val);

        if (valp != &net->ipv6.devconf_all->disable_policy) {
                /* Per-device entry. */
                addrconf_disable_policy_idev((struct inet6_dev *)ctl->extra1,
                                             val);
        } else {
                /* "all" entry: visit every device with IPv6 state. */
                for_each_netdev(net, dev) {
                        struct inet6_dev *idev = __in6_dev_get(dev);

                        if (idev)
                                addrconf_disable_policy_idev(idev, val);
                }
        }

        rtnl_unlock();
        return 0;
}

/*
 * sysctl handler for "disable_policy".
 *
 * The value is parsed into a stack copy so that ctl->data is not written
 * by proc_dointvec(); a write that actually changes the value is then
 * applied through addrconf_disable_policy().
 *
 * Returns 0 on success or a non-zero value on failure, in which case
 * *ppos is rewound to its value on entry.
 */
static int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write,
                                   void *buffer, size_t *lenp, loff_t *ppos)
{
        int *live = ctl->data;
        int wanted = *live;
        const loff_t start = *ppos;
        struct ctl_table shadow = *ctl;
        int rc;

        shadow.data = &wanted;
        rc = proc_dointvec(&shadow, write, buffer, lenp, ppos);

        if (write && wanted != *live)
                rc = addrconf_disable_policy(ctl, live, wanted);

        if (rc != 0)
                *ppos = start;

        return rc;
}

/* Values referenced through .extra1/.extra2 by entries of addrconf_sysctl[]. */
/* .extra1 of "router_solicitations" */
static int minus_one = -1;
/* .extra2 of "hop_limit" and "ndisc_tclass" */
static const int two_five_five = 255;
/* .extra2 of "ioam6_id" */
static u32 ioam6_if_id_max = U16_MAX;

/*
 * Template sysctl table for the net/ipv6/conf/<name>/ directories.
 *
 * Every .data pointer refers to a field of the global ipv6_devconf.
 * __addrconf_sysctl_register() duplicates this table and rebases those
 * pointers onto the ipv6_devconf instance being registered.  Entries
 * that leave both .extra1 and .extra2 unset get them filled with the
 * owning inet6_dev and struct net at registration time; entries that
 * set either of them here keep their values.
 */
static const struct ctl_table addrconf_sysctl[] = {
        {
                .procname        = "forwarding",
                .data                = &ipv6_devconf.forwarding,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = addrconf_sysctl_forward,
        },
        {
                .procname        = "hop_limit",
                .data                = &ipv6_devconf.hop_limit,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = (void *)SYSCTL_ONE,
                .extra2                = (void *)&two_five_five,
        },
        {
                .procname        = "mtu",
                .data                = &ipv6_devconf.mtu6,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = addrconf_sysctl_mtu,
        },
        {
                .procname        = "accept_ra",
                .data                = &ipv6_devconf.accept_ra,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "accept_redirects",
                .data                = &ipv6_devconf.accept_redirects,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "autoconf",
                .data                = &ipv6_devconf.autoconf,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "dad_transmits",
                .data                = &ipv6_devconf.dad_transmits,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "router_solicitations",
                .data                = &ipv6_devconf.rtr_solicits,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = &minus_one,
        },
        {
                .procname        = "router_solicitation_interval",
                .data                = &ipv6_devconf.rtr_solicit_interval,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_jiffies,
        },
        {
                .procname        = "router_solicitation_max_interval",
                .data                = &ipv6_devconf.rtr_solicit_max_interval,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_jiffies,
        },
        {
                .procname        = "router_solicitation_delay",
                .data                = &ipv6_devconf.rtr_solicit_delay,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_jiffies,
        },
        {
                .procname        = "force_mld_version",
                .data                = &ipv6_devconf.force_mld_version,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "mldv1_unsolicited_report_interval",
                .data                =
                        &ipv6_devconf.mldv1_unsolicited_report_interval,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_ms_jiffies,
        },
        {
                .procname        = "mldv2_unsolicited_report_interval",
                .data                =
                        &ipv6_devconf.mldv2_unsolicited_report_interval,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_ms_jiffies,
        },
        {
                .procname        = "use_tempaddr",
                .data                = &ipv6_devconf.use_tempaddr,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "temp_valid_lft",
                .data                = &ipv6_devconf.temp_valid_lft,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "temp_prefered_lft",
                .data                = &ipv6_devconf.temp_prefered_lft,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname       = "regen_min_advance",
                .data           = &ipv6_devconf.regen_min_advance,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname        = "regen_max_retry",
                .data                = &ipv6_devconf.regen_max_retry,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "max_desync_factor",
                .data                = &ipv6_devconf.max_desync_factor,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "max_addresses",
                .data                = &ipv6_devconf.max_addresses,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "accept_ra_defrtr",
                .data                = &ipv6_devconf.accept_ra_defrtr,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "ra_defrtr_metric",
                .data                = &ipv6_devconf.ra_defrtr_metric,
                .maxlen                = sizeof(u32),
                .mode                = 0644,
                .proc_handler        = proc_douintvec_minmax,
                .extra1                = (void *)SYSCTL_ONE,
        },
        {
                .procname        = "accept_ra_min_hop_limit",
                .data                = &ipv6_devconf.accept_ra_min_hop_limit,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "accept_ra_min_lft",
                .data                = &ipv6_devconf.accept_ra_min_lft,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "accept_ra_pinfo",
                .data                = &ipv6_devconf.accept_ra_pinfo,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "ra_honor_pio_life",
                .data                = &ipv6_devconf.ra_honor_pio_life,
                .maxlen                = sizeof(u8),
                .mode                = 0644,
                .proc_handler        = proc_dou8vec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
#ifdef CONFIG_IPV6_ROUTER_PREF
        {
                .procname        = "accept_ra_rtr_pref",
                .data                = &ipv6_devconf.accept_ra_rtr_pref,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "router_probe_interval",
                .data                = &ipv6_devconf.rtr_probe_interval,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_jiffies,
        },
#ifdef CONFIG_IPV6_ROUTE_INFO
        {
                .procname        = "accept_ra_rt_info_min_plen",
                .data                = &ipv6_devconf.accept_ra_rt_info_min_plen,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "accept_ra_rt_info_max_plen",
                .data                = &ipv6_devconf.accept_ra_rt_info_max_plen,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#endif
#endif
        {
                .procname        = "proxy_ndp",
                .data                = &ipv6_devconf.proxy_ndp,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = addrconf_sysctl_proxy_ndp,
        },
        {
                .procname        = "accept_source_route",
                .data                = &ipv6_devconf.accept_source_route,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        {
                .procname        = "optimistic_dad",
                .data                = &ipv6_devconf.optimistic_dad,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname        = "use_optimistic",
                .data                = &ipv6_devconf.use_optimistic,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#endif
#ifdef CONFIG_IPV6_MROUTE
        {
                .procname        = "mc_forwarding",
                .data                = &ipv6_devconf.mc_forwarding,
                .maxlen                = sizeof(int),
                .mode                = 0444,
                .proc_handler        = proc_dointvec,
        },
#endif
        {
                .procname        = "disable_ipv6",
                .data                = &ipv6_devconf.disable_ipv6,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = addrconf_sysctl_disable,
        },
        {
                .procname        = "accept_dad",
                .data                = &ipv6_devconf.accept_dad,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "force_tllao",
                .data                = &ipv6_devconf.force_tllao,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec
        },
        {
                .procname        = "ndisc_notify",
                .data                = &ipv6_devconf.ndisc_notify,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec
        },
        {
                .procname        = "suppress_frag_ndisc",
                .data                = &ipv6_devconf.suppress_frag_ndisc,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec
        },
        {
                .procname        = "accept_ra_from_local",
                .data                = &ipv6_devconf.accept_ra_from_local,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "accept_ra_mtu",
                .data                = &ipv6_devconf.accept_ra_mtu,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "stable_secret",
                .data                = &ipv6_devconf.stable_secret,
                .maxlen                = IPV6_MAX_STRLEN,
                .mode                = 0600,
                .proc_handler        = addrconf_sysctl_stable_secret,
        },
        {
                .procname        = "use_oif_addrs_only",
                .data                = &ipv6_devconf.use_oif_addrs_only,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "ignore_routes_with_linkdown",
                .data                = &ipv6_devconf.ignore_routes_with_linkdown,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = addrconf_sysctl_ignore_routes_with_linkdown,
        },
        {
                .procname        = "drop_unicast_in_l2_multicast",
                .data                = &ipv6_devconf.drop_unicast_in_l2_multicast,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "drop_unsolicited_na",
                .data                = &ipv6_devconf.drop_unsolicited_na,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "keep_addr_on_down",
                .data                = &ipv6_devconf.keep_addr_on_down,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,

        },
        {
                .procname        = "seg6_enabled",
                .data                = &ipv6_devconf.seg6_enabled,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#ifdef CONFIG_IPV6_SEG6_HMAC
        {
                .procname        = "seg6_require_hmac",
                .data                = &ipv6_devconf.seg6_require_hmac,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#endif
        {
                .procname       = "enhanced_dad",
                .data           = &ipv6_devconf.enhanced_dad,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname        = "addr_gen_mode",
                .data                = &ipv6_devconf.addr_gen_mode,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = addrconf_sysctl_addr_gen_mode,
        },
        {
                .procname       = "disable_policy",
                .data           = &ipv6_devconf.disable_policy,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = addrconf_sysctl_disable_policy,
        },
        {
                .procname        = "ndisc_tclass",
                .data                = &ipv6_devconf.ndisc_tclass,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = (void *)SYSCTL_ZERO,
                .extra2                = (void *)&two_five_five,
        },
        {
                .procname        = "rpl_seg_enabled",
                .data                = &ipv6_devconf.rpl_seg_enabled,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "ioam6_enabled",
                .data                = &ipv6_devconf.ioam6_enabled,
                .maxlen                = sizeof(u8),
                .mode                = 0644,
                .proc_handler        = proc_dou8vec_minmax,
                .extra1                = (void *)SYSCTL_ZERO,
                .extra2                = (void *)SYSCTL_ONE,
        },
        {
                .procname        = "ioam6_id",
                .data                = &ipv6_devconf.ioam6_id,
                .maxlen                = sizeof(u32),
                .mode                = 0644,
                .proc_handler        = proc_douintvec_minmax,
                .extra1                = (void *)SYSCTL_ZERO,
                .extra2                = (void *)&ioam6_if_id_max,
        },
        {
                .procname        = "ioam6_id_wide",
                .data                = &ipv6_devconf.ioam6_id_wide,
                .maxlen                = sizeof(u32),
                .mode                = 0644,
                .proc_handler        = proc_douintvec,
        },
        {
                .procname        = "ndisc_evict_nocarrier",
                .data                = &ipv6_devconf.ndisc_evict_nocarrier,
                .maxlen                = sizeof(u8),
                .mode                = 0644,
                .proc_handler        = proc_dou8vec_minmax,
                .extra1                = (void *)SYSCTL_ZERO,
                .extra2                = (void *)SYSCTL_ONE,
        },
        {
                .procname        = "accept_untracked_na",
                .data                = &ipv6_devconf.accept_untracked_na,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_TWO,
        },
};

static int __addrconf_sysctl_register(struct net *net, char *dev_name,
                struct inet6_dev *idev, struct ipv6_devconf *p)
{
        size_t table_size = ARRAY_SIZE(addrconf_sysctl);
        int i, ifindex;
        struct ctl_table *table;
        char path[sizeof("net/ipv6/conf/") + IFNAMSIZ];

        table = kmemdup(addrconf_sysctl, sizeof(addrconf_sysctl), GFP_KERNEL_ACCOUNT);
        if (!table)
                goto out;

        for (i = 0; i < table_size; i++) {
                table[i].data += (char *)p - (char *)&ipv6_devconf;
                /* If one of these is already set, then it is not safe to
                 * overwrite either of them: this makes proc_dointvec_minmax
                 * usable.
                 */
                if (!table[i].extra1 && !table[i].extra2) {
                        table[i].extra1 = idev; /* embedded; no ref */
                        table[i].extra2 = net;
                }
        }

        snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name);

        p->sysctl_header = register_net_sysctl_sz(net, path, table,
                                                  table_size);
        if (!p->sysctl_header)
                goto free;

        if (!strcmp(dev_name, "all"))
                ifindex = NETCONFA_IFINDEX_ALL;
        else if (!strcmp(dev_name, "default"))
                ifindex = NETCONFA_IFINDEX_DEFAULT;
        else
                ifindex = idev->dev->ifindex;
        inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL,
                                     ifindex, p);
        return 0;

free:
        kfree(table);
out:
        return -ENOBUFS;
}

/*
 * Tear down the sysctl directory registered for the devconf block @p and
 * send an RTM_DELNETCONF notification for @ifindex.
 *
 * Does nothing when @p has no sysctl header.  The table pointer is
 * fetched from the header before unregistering and freed afterwards.
 */
static void __addrconf_sysctl_unregister(struct net *net,
                                         struct ipv6_devconf *p, int ifindex)
{
        if (p->sysctl_header) {
                const struct ctl_table *tbl = p->sysctl_header->ctl_table_arg;

                unregister_net_sysctl_table(p->sysctl_header);
                p->sysctl_header = NULL;
                kfree(tbl);

                inet6_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex,
                                             NULL);
        }
}

static int addrconf_sysctl_register(struct inet6_dev *idev)
{
        int err;

        if (!sysctl_dev_name_is_allowed(idev->dev->name))
                return -EINVAL;

        err = neigh_sysctl_register(idev->dev, idev->nd_parms,
                                    &ndisc_ifinfo_sysctl_change);
        if (err)
                return err;
        err = __addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name,
                                         idev, &idev->cnf);
        if (err)
                neigh_sysctl_unregister(idev->nd_parms);

        return err;
}

static void addrconf_sysctl_unregister(struct inet6_dev *idev)
{
        __addrconf_sysctl_unregister(dev_net(idev->dev), &idev->cnf,
                                     idev->dev->ifindex);
        neigh_sysctl_unregister(idev->nd_parms);
}


#endif

/*
 * Per-namespace init: set up the address hash table and the "all" and
 * "default" devconf blocks of @net and, with CONFIG_SYSCTL, register
 * their sysctl directories.
 *
 * Both devconf blocks start as copies of the compiled-in ipv6_devconf
 * and ipv6_devconf_dflt.  For namespaces other than init_net,
 * net_inherit_devconf() selects whether they are then overwritten with
 * the values of init_net (1) or of the current task's netns (3), or
 * left at the compiled-in values (0, 2).
 *
 * Returns 0 on success or a negative errno; on failure everything set
 * up so far is released through the labels at the end.
 */
static int __net_init addrconf_init_net(struct net *net)
{
        int err = -ENOMEM;
        struct ipv6_devconf *all, *dflt;

        spin_lock_init(&net->ipv6.addrconf_hash_lock);
        INIT_DEFERRABLE_WORK(&net->ipv6.addr_chk_work, addrconf_verify_work);
        net->ipv6.inet6_addr_lst = kcalloc(IN6_ADDR_HSIZE,
                                           sizeof(struct hlist_head),
                                           GFP_KERNEL);
        if (!net->ipv6.inet6_addr_lst)
                goto err_alloc_addr;

        all = kmemdup(&ipv6_devconf, sizeof(ipv6_devconf), GFP_KERNEL);
        if (!all)
                goto err_alloc_all;

        dflt = kmemdup(&ipv6_devconf_dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL);
        if (!dflt)
                goto err_alloc_dflt;

        if (!net_eq(net, &init_net)) {
                switch (net_inherit_devconf()) {
                case 1:  /* copy from init_net */
                        memcpy(all, init_net.ipv6.devconf_all,
                               sizeof(ipv6_devconf));
                        memcpy(dflt, init_net.ipv6.devconf_dflt,
                               sizeof(ipv6_devconf_dflt));
                        break;
                case 3: /* copy from the current netns */
                        memcpy(all, current->nsproxy->net_ns->ipv6.devconf_all,
                               sizeof(ipv6_devconf));
                        memcpy(dflt,
                               current->nsproxy->net_ns->ipv6.devconf_dflt,
                               sizeof(ipv6_devconf_dflt));
                        break;
                case 0:
                case 2:
                        /* use compiled values */
                        break;
                }
        }

        /* these will be inherited by all namespaces */
        dflt->autoconf = ipv6_defaults.autoconf;
        dflt->disable_ipv6 = ipv6_defaults.disable_ipv6;

        /* A stable secret is never carried over, whatever was copied above. */
        dflt->stable_secret.initialized = false;
        all->stable_secret.initialized = false;

        net->ipv6.devconf_all = all;
        net->ipv6.devconf_dflt = dflt;

#ifdef CONFIG_SYSCTL
        /* No inet6_dev is associated with the "all" and "default" entries. */
        err = __addrconf_sysctl_register(net, "all", NULL, all);
        if (err < 0)
                goto err_reg_all;

        err = __addrconf_sysctl_register(net, "default", NULL, dflt);
        if (err < 0)
                goto err_reg_dflt;
#endif
        return 0;

        /* Error unwinding: each label releases one more earlier step. */
#ifdef CONFIG_SYSCTL
err_reg_dflt:
        __addrconf_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL);
err_reg_all:
        kfree(dflt);
        net->ipv6.devconf_dflt = NULL;
#endif
err_alloc_dflt:
        kfree(all);
        net->ipv6.devconf_all = NULL;
err_alloc_all:
        kfree(net->ipv6.inet6_addr_lst);
err_alloc_addr:
        return err;
}

/*
 * Per-namespace teardown: unregister the "default" and "all" sysctl
 * directories (CONFIG_SYSCTL only), free both devconf blocks, cancel the
 * address check work and free the address hash table, warning once if
 * any bucket is still populated.
 */
static void __net_exit addrconf_exit_net(struct net *net)
{
        struct hlist_head *bucket, *end;

#ifdef CONFIG_SYSCTL
        __addrconf_sysctl_unregister(net, net->ipv6.devconf_dflt,
                                     NETCONFA_IFINDEX_DEFAULT);
        __addrconf_sysctl_unregister(net, net->ipv6.devconf_all,
                                     NETCONFA_IFINDEX_ALL);
#endif
        kfree(net->ipv6.devconf_dflt);
        net->ipv6.devconf_dflt = NULL;
        kfree(net->ipv6.devconf_all);
        net->ipv6.devconf_all = NULL;

        cancel_delayed_work_sync(&net->ipv6.addr_chk_work);

        /* Every hash bucket is expected to be empty before the free. */
        bucket = net->ipv6.inet6_addr_lst;
        for (end = bucket + IN6_ADDR_HSIZE; bucket < end; bucket++)
                WARN_ON_ONCE(!hlist_empty(bucket));

        kfree(net->ipv6.inet6_addr_lst);
        net->ipv6.inet6_addr_lst = NULL;
}

/* Per-network-namespace init/exit hooks, registered by addrconf_init(). */
static struct pernet_operations addrconf_ops = {
        .init = addrconf_init_net,
        .exit = addrconf_exit_net,
};

/* AF_INET6 rtnetlink link-af callbacks, registered by addrconf_init(). */
static struct rtnl_af_ops inet6_ops __read_mostly = {
        .family                  = AF_INET6,
        .fill_link_af          = inet6_fill_link_af,
        .get_link_af_size = inet6_get_link_af_size,
        .validate_link_af = inet6_validate_link_af,
        .set_link_af          = inet6_set_link_af,
};

/*
 *        Init / cleanup code
 */

/*
 * One-time addrconf initialisation.
 *
 * In order: initialise the address label (default policy) table,
 * register the per-netns operations, create the addrconf workqueue,
 * add an inet6_dev to blackhole_netdev, initialise the special route
 * entries, register the netdevice notifier, call addrconf_verify() for
 * init_net, and register the AF_INET6 rtnetlink af ops and message
 * handlers.
 *
 * Returns 0 on success or a negative errno.  On failure the labels at
 * the end undo the rtnetlink registrations, the notifier, the
 * workqueue, the pernet operations and the label table, as far as each
 * had been set up.
 */
int __init addrconf_init(void)
{
        struct inet6_dev *idev;
        int err;

        err = ipv6_addr_label_init();
        if (err < 0) {
                pr_crit("%s: cannot initialize default policy table: %d\n",
                        __func__, err);
                goto out;
        }

        err = register_pernet_subsys(&addrconf_ops);
        if (err < 0)
                goto out_addrlabel;

        /* All works using addrconf_wq need to lock rtnl. */
        addrconf_wq = create_singlethread_workqueue("ipv6_addrconf");
        if (!addrconf_wq) {
                err = -ENOMEM;
                goto out_nowq;
        }

        /* ipv6_add_dev() is called with the RTNL lock held. */
        rtnl_lock();
        idev = ipv6_add_dev(blackhole_netdev);
        rtnl_unlock();
        if (IS_ERR(idev)) {
                err = PTR_ERR(idev);
                goto errlo;
        }

        ip6_route_init_special_entries();

        register_netdevice_notifier(&ipv6_dev_notf);

        addrconf_verify(&init_net);

        rtnl_af_register(&inet6_ops);

        /* rtnetlink handlers; any failure unwinds through errout. */
        err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETLINK,
                                   NULL, inet6_dump_ifinfo, RTNL_FLAG_DUMP_UNLOCKED);
        if (err < 0)
                goto errout;

        err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDR,
                                   inet6_rtm_newaddr, NULL, 0);
        if (err < 0)
                goto errout;
        err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDR,
                                   inet6_rtm_deladdr, NULL, 0);
        if (err < 0)
                goto errout;
        err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDR,
                                   inet6_rtm_getaddr, inet6_dump_ifaddr,
                                   RTNL_FLAG_DOIT_UNLOCKED |
                                   RTNL_FLAG_DUMP_UNLOCKED);
        if (err < 0)
                goto errout;
        err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETMULTICAST,
                                   NULL, inet6_dump_ifmcaddr,
                                   RTNL_FLAG_DUMP_UNLOCKED);
        if (err < 0)
                goto errout;
        err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETANYCAST,
                                   NULL, inet6_dump_ifacaddr,
                                   RTNL_FLAG_DUMP_UNLOCKED);
        if (err < 0)
                goto errout;
        err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETNETCONF,
                                   inet6_netconf_get_devconf,
                                   inet6_netconf_dump_devconf,
                                   RTNL_FLAG_DOIT_UNLOCKED |
                                   RTNL_FLAG_DUMP_UNLOCKED);
        if (err < 0)
                goto errout;
        err = ipv6_addr_label_rtnl_register();
        if (err < 0)
                goto errout;

        return 0;
errout:
        /* Drops every PF_INET6 handler registered so far in one call. */
        rtnl_unregister_all(PF_INET6);
        rtnl_af_unregister(&inet6_ops);
        unregister_netdevice_notifier(&ipv6_dev_notf);
errlo:
        destroy_workqueue(addrconf_wq);
out_nowq:
        unregister_pernet_subsys(&addrconf_ops);
out_addrlabel:
        ipv6_addr_label_cleanup();
out:
        return err;
}

/*
 * Undo the addrconf registrations and shut the module state down.
 *
 * Unregisters the netdevice notifier, the per-net subsystem, the address
 * label state and the inet6 rtnl af ops, then (under the RTNL lock) calls
 * addrconf_ifdown(dev, true) for the devices of init_net, and finally
 * destroys the addrconf workqueue.
 */
void addrconf_cleanup(void)
{
        struct net_device *dev;

        unregister_netdevice_notifier(&ipv6_dev_notf);
        unregister_pernet_subsys(&addrconf_ops);
        ipv6_addr_label_cleanup();

        rtnl_af_unregister(&inet6_ops);

        rtnl_lock();

        /* clean dev list */
        for_each_netdev(&init_net, dev) {
                /* nothing to take down when __in6_dev_get() returns NULL */
                if (__in6_dev_get(dev) == NULL)
                        continue;
                addrconf_ifdown(dev, true);
        }
        /* the loopback device gets an explicit call after the walk */
        addrconf_ifdown(init_net.loopback_dev, true);

        rtnl_unlock();

        /* the workqueue is destroyed last, after the RTNL lock is dropped */
        destroy_workqueue(addrconf_wq);
}




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Portions of this file
* Copyright(c) 2016 Intel Deutschland GmbH
* Copyright (C) 2018-2019, 2021-2024 Intel Corporation
*/

#ifndef __MAC80211_DRIVER_OPS
#define __MAC80211_DRIVER_OPS

#include <net/mac80211.h>
#include "ieee80211_i.h"
#include "trace.h"

/*
 * check_sdata_in_driver - test whether IEEE80211_SDATA_IN_DRIVER is set
 * @sdata: pointer to the interface data to check
 *
 * Evaluates to 1 when the flag is set in sdata->flags and to 0 otherwise.
 * When the flag is clear and sdata->local->reconfig_failure is not set,
 * WARN_ONCE() is triggered with the interface name and the flags.
 *
 * Every use of @sdata is parenthesized so that arbitrary pointer
 * expressions (e.g. "&obj" or "ptr + 1") expand correctly. The argument is
 * still expanded several times, so it must not have side effects.
 */
#define check_sdata_in_driver(sdata)        ({                                        \
        WARN_ONCE(!(sdata)->local->reconfig_failure &&                                \
                  !((sdata)->flags & IEEE80211_SDATA_IN_DRIVER),                \
                  "%s: Failed check-sdata-in-driver check, flags: 0x%x\n",        \
                  (sdata)->dev ? (sdata)->dev->name : (sdata)->name,                \
                  (sdata)->flags);                                                \
        !!((sdata)->flags & IEEE80211_SDATA_IN_DRIVER);                                \
})

/*
 * Resolve the interface data that should be used for driver calls.
 *
 * For an interface of type NL80211_IFTYPE_AP_VLAN this returns the
 * ieee80211_sub_if_data that contains sdata->bss as its u.ap member.
 * Any other interface, as well as a NULL pointer, is returned unchanged.
 */
static inline struct ieee80211_sub_if_data *
get_bss_sdata(struct ieee80211_sub_if_data *sdata)
{
        if (!sdata || sdata->vif.type != NL80211_IFTYPE_AP_VLAN)
                return sdata;

        return container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap);
}

/*
 * Pass @skb together with its @control data to the driver's tx op.
 * The op is called unconditionally (there is no NULL check here) and no
 * trace calls are made around it.
 */
static inline void drv_tx(struct ieee80211_local *local,
                          struct ieee80211_tx_control *control,
                          struct sk_buff *skb)
{
        local->ops->tx(&local->hw, control, skb);
}

/*
 * Call the driver's optional sync_rx_queues op.
 *
 * @sta is only used for the trace call; the op itself receives just the hw.
 * Nothing is traced or called when the driver does not provide the op.
 */
static inline void drv_sync_rx_queues(struct ieee80211_local *local,
                                      struct sta_info *sta)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!local->ops->sync_rx_queues)
                return;

        trace_drv_sync_rx_queues(local, sta->sdata, &sta->sta);
        local->ops->sync_rx_queues(&local->hw);
        trace_drv_return_void(local);
}

/*
 * Forward a get_et_strings request for string set @sset to the driver,
 * which is handed @data to work with. Does nothing (and traces nothing)
 * when the driver has no get_et_strings op.
 */
static inline void drv_get_et_strings(struct ieee80211_sub_if_data *sdata,
                                      u32 sset, u8 *data)
{
        struct ieee80211_local *local = sdata->local;

        if (!local->ops->get_et_strings)
                return;

        trace_drv_get_et_strings(local, sset);
        local->ops->get_et_strings(&local->hw, &sdata->vif, sset, data);
        trace_drv_return_void(local);
}

/*
 * Forward a get_et_stats request to the driver, passing @stats and @data
 * through unchanged. Does nothing (and traces nothing) when the driver has
 * no get_et_stats op.
 */
static inline void drv_get_et_stats(struct ieee80211_sub_if_data *sdata,
                                    struct ethtool_stats *stats,
                                    u64 *data)
{
        struct ieee80211_local *local = sdata->local;

        if (!local->ops->get_et_stats)
                return;

        trace_drv_get_et_stats(local);
        local->ops->get_et_stats(&local->hw, &sdata->vif, stats, data);
        trace_drv_return_void(local);
}

/*
 * Ask the driver for its get_et_sset_count result for string set @sset.
 *
 * Returns the driver's value, or 0 (without any trace call) when the
 * driver has no get_et_sset_count op.
 */
static inline int drv_get_et_sset_count(struct ieee80211_sub_if_data *sdata,
                                        int sset)
{
        struct ieee80211_local *local = sdata->local;
        int count;

        if (!local->ops->get_et_sset_count)
                return 0;

        trace_drv_get_et_sset_count(local, sset);
        count = local->ops->get_et_sset_count(&local->hw, &sdata->vif, sset);
        trace_drv_return_int(local, count);

        return count;
}

/*
 * drv_start() and drv_stop() are only declared here; their bodies are not
 * part of this section of the file.
 */
int drv_start(struct ieee80211_local *local);
void drv_stop(struct ieee80211_local *local);

#ifdef CONFIG_PM
/*
 * Call the driver's suspend op with @wowlan and return its result.
 * The op is invoked unconditionally (no NULL check) between the
 * drv_suspend and return_int trace calls.
 */
static inline int drv_suspend(struct ieee80211_local *local,
                              struct cfg80211_wowlan *wowlan)
{
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_suspend(local);
        err = local->ops->suspend(&local->hw, wowlan);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Call the driver's resume op and return its result.
 * The op is invoked unconditionally (no NULL check) between the
 * drv_resume and return_int trace calls.
 */
static inline int drv_resume(struct ieee80211_local *local)
{
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_resume(local);
        err = local->ops->resume(&local->hw);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Pass the @enabled wakeup setting to the driver's optional set_wakeup op.
 * When the op is missing, neither the op nor the trace calls are run.
 */
static inline void drv_set_wakeup(struct ieee80211_local *local,
                                  bool enabled)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (local->ops->set_wakeup) {
                trace_drv_set_wakeup(local, enabled);
                local->ops->set_wakeup(&local->hw, enabled);
                trace_drv_return_void(local);
        }
}
#endif

/*
 * Interface add/change/remove wrappers. Only the prototypes appear here;
 * their bodies are not part of this section of the file.
 */
int drv_add_interface(struct ieee80211_local *local,
                      struct ieee80211_sub_if_data *sdata);

int drv_change_interface(struct ieee80211_local *local,
                         struct ieee80211_sub_if_data *sdata,
                         enum nl80211_iftype type, bool p2p);

void drv_remove_interface(struct ieee80211_local *local,
                          struct ieee80211_sub_if_data *sdata);

/*
 * Call the driver's config op with the @changed bitmask and return the
 * driver's result. The op is invoked unconditionally (no NULL check).
 */
static inline int drv_config(struct ieee80211_local *local, u32 changed)
{
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_config(local, changed);
        err = local->ops->config(&local->hw, changed);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Tell the driver that interface configuration bits in @changed were
 * modified.
 *
 * Returns early (after the warning in check_sdata_in_driver()) when the
 * interface is not marked as being in the driver. Otherwise the
 * vif_cfg_changed op is preferred; if the driver lacks it but provides
 * bss_info_changed, that op is called instead with &sdata->vif.bss_conf.
 */
static inline void drv_vif_cfg_changed(struct ieee80211_local *local,
                                       struct ieee80211_sub_if_data *sdata,
                                       u64 changed)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_vif_cfg_changed(local, sdata, changed);
        if (local->ops->vif_cfg_changed)
                local->ops->vif_cfg_changed(&local->hw, &sdata->vif, changed);
        else if (local->ops->bss_info_changed)
                /* fallback op takes the vif's bss_conf in addition */
                local->ops->bss_info_changed(&local->hw, &sdata->vif,
                                             &sdata->vif.bss_conf, changed);
        trace_drv_return_void(local);
}

/*
 * Prototype only; the body of drv_link_info_changed() is not part of this
 * section of the file.
 */
void drv_link_info_changed(struct ieee80211_local *local,
                           struct ieee80211_sub_if_data *sdata,
                           struct ieee80211_bss_conf *info,
                           int link_id, u64 changed);

/*
 * Let the driver's optional prepare_multicast op process @mc_list.
 *
 * Returns the u64 value produced by the driver, or 0 when the op is not
 * implemented. Both trace calls are made in either case.
 */
static inline u64 drv_prepare_multicast(struct ieee80211_local *local,
                                        struct netdev_hw_addr_list *mc_list)
{
        u64 mc = 0;

        trace_drv_prepare_multicast(local, mc_list->count);

        if (local->ops->prepare_multicast)
                mc = local->ops->prepare_multicast(&local->hw, mc_list);

        trace_drv_return_u64(local, mc);

        return mc;
}

/*
 * Call the driver's configure_filter op.
 *
 * @changed_flags, @total_flags (passed by pointer) and @multicast are
 * forwarded unchanged. The op is invoked unconditionally (no NULL check).
 */
static inline void drv_configure_filter(struct ieee80211_local *local,
                                        unsigned int changed_flags,
                                        unsigned int *total_flags,
                                        u64 multicast)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_configure_filter(local, changed_flags, total_flags,
                                   multicast);
        local->ops->configure_filter(&local->hw, changed_flags, total_flags,
                                     multicast);
        trace_drv_return_void(local);
}

/*
 * Forward per-interface filter settings to the driver's optional
 * config_iface_filter op. The trace calls are made even when the driver
 * does not implement the op.
 */
static inline void drv_config_iface_filter(struct ieee80211_local *local,
                                           struct ieee80211_sub_if_data *sdata,
                                           unsigned int filter_flags,
                                           unsigned int changed_flags)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_config_iface_filter(local, sdata, filter_flags,
                                      changed_flags);
        if (local->ops->config_iface_filter)
                local->ops->config_iface_filter(&local->hw, &sdata->vif,
                                                filter_flags,
                                                changed_flags);
        trace_drv_return_void(local);
}

/*
 * Call the driver's optional set_tim op for @sta with the @set flag.
 *
 * Returns the driver's result, or 0 when the op is not implemented.
 * Both trace calls are made in either case.
 */
static inline int drv_set_tim(struct ieee80211_local *local,
                              struct ieee80211_sta *sta, bool set)
{
        int err = 0;

        trace_drv_set_tim(local, sta, set);

        if (local->ops->set_tim)
                err = local->ops->set_tim(&local->hw, sta, set);

        trace_drv_return_int(local, err);

        return err;
}

/*
 * Prototype only; the body of drv_set_key() is not part of this section
 * of the file.
 */
int drv_set_key(struct ieee80211_local *local,
                enum set_key_cmd cmd,
                struct ieee80211_sub_if_data *sdata,
                struct ieee80211_sta *sta,
                struct ieee80211_key_conf *key);

/*
 * Hand updated TKIP key material (@iv32, @phase1key) for @conf to the
 * driver's optional update_tkip_key op.
 *
 * @sta may be NULL, in which case a NULL ieee80211_sta is passed on.
 * @sdata is first mapped through get_bss_sdata(); nothing is traced or
 * called when check_sdata_in_driver() fails for the result.
 */
static inline void drv_update_tkip_key(struct ieee80211_local *local,
                                       struct ieee80211_sub_if_data *sdata,
                                       struct ieee80211_key_conf *conf,
                                       struct sta_info *sta, u32 iv32,
                                       u16 *phase1key)
{
        struct ieee80211_sta *ista = sta ? &sta->sta : NULL;

        sdata = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_update_tkip_key(local, sdata, conf, ista, iv32);

        if (local->ops->update_tkip_key)
                local->ops->update_tkip_key(&local->hw, &sdata->vif, conf,
                                            ista, iv32, phase1key);

        trace_drv_return_void(local);
}

/*
 * Start a hardware scan described by @req through the driver's hw_scan op.
 *
 * Returns -EIO when check_sdata_in_driver() fails, otherwise the driver's
 * result. The op itself is invoked without a NULL check.
 */
static inline int drv_hw_scan(struct ieee80211_local *local,
                              struct ieee80211_sub_if_data *sdata,
                              struct ieee80211_scan_request *req)
{
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_hw_scan(local, sdata);
        err = local->ops->hw_scan(&local->hw, &sdata->vif, req);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Call the driver's cancel_hw_scan op for @sdata.
 *
 * Returns without calling the driver when check_sdata_in_driver() fails.
 * The op itself is invoked without a NULL check.
 */
static inline void drv_cancel_hw_scan(struct ieee80211_local *local,
                                      struct ieee80211_sub_if_data *sdata)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_cancel_hw_scan(local, sdata);
        local->ops->cancel_hw_scan(&local->hw, &sdata->vif);
        trace_drv_return_void(local);
}

/*
 * Start a scheduled scan through the driver's sched_scan_start op,
 * forwarding @req and @ies unchanged.
 *
 * Returns -EIO when check_sdata_in_driver() fails, otherwise the driver's
 * result. The op itself is invoked without a NULL check.
 */
static inline int
drv_sched_scan_start(struct ieee80211_local *local,
                     struct ieee80211_sub_if_data *sdata,
                     struct cfg80211_sched_scan_request *req,
                     struct ieee80211_scan_ies *ies)
{
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_sched_scan_start(local, sdata);
        err = local->ops->sched_scan_start(&local->hw, &sdata->vif, req, ies);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Stop a scheduled scan through the driver's sched_scan_stop op.
 *
 * Returns -EIO when check_sdata_in_driver() fails, otherwise the driver's
 * result. The op itself is invoked without a NULL check.
 */
static inline int drv_sched_scan_stop(struct ieee80211_local *local,
                                      struct ieee80211_sub_if_data *sdata)
{
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_sched_scan_stop(local, sdata);
        err = local->ops->sched_scan_stop(&local->hw, &sdata->vif);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Notify the driver, via its optional sw_scan_start op, that a software
 * scan is starting; @mac_addr is forwarded unchanged. The trace calls are
 * made even when the driver does not implement the op.
 */
static inline void drv_sw_scan_start(struct ieee80211_local *local,
                                     struct ieee80211_sub_if_data *sdata,
                                     const u8 *mac_addr)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_sw_scan_start(local, sdata, mac_addr);
        if (local->ops->sw_scan_start)
                local->ops->sw_scan_start(&local->hw, &sdata->vif, mac_addr);
        trace_drv_return_void(local);
}

/*
 * Notify the driver, via its optional sw_scan_complete op, that a software
 * scan has finished. The trace calls are made even when the driver does
 * not implement the op.
 */
static inline void drv_sw_scan_complete(struct ieee80211_local *local,
                                        struct ieee80211_sub_if_data *sdata)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_sw_scan_complete(local, sdata);
        if (local->ops->sw_scan_complete)
                local->ops->sw_scan_complete(&local->hw, &sdata->vif);
        trace_drv_return_void(local);
}

/*
 * Fetch low-level statistics into @stats through the driver's optional
 * get_stats op.
 *
 * Returns the driver's result, or -EOPNOTSUPP when the op is missing.
 * A single trace call, carrying the result, is made after the attempt.
 */
static inline int drv_get_stats(struct ieee80211_local *local,
                                struct ieee80211_low_level_stats *stats)
{
        int status = -EOPNOTSUPP;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (local->ops->get_stats)
                status = local->ops->get_stats(&local->hw, stats);

        trace_drv_get_stats(local, stats, status);

        return status;
}

/*
 * Ask the driver, via its optional get_key_seq op, to fill @seq for the
 * given key (passed as &key->conf). The trace call happens after the op
 * and is made whether or not the op exists.
 */
static inline void drv_get_key_seq(struct ieee80211_local *local,
                                   struct ieee80211_key *key,
                                   struct ieee80211_key_seq *seq)
{
        if (local->ops->get_key_seq)
                local->ops->get_key_seq(&local->hw, &key->conf, seq);
        trace_drv_get_key_seq(local, &key->conf);
}

/*
 * Pass the fragmentation threshold @value to the driver's optional
 * set_frag_threshold op.
 *
 * Returns the driver's result, or 0 when the op is not implemented.
 */
static inline int drv_set_frag_threshold(struct ieee80211_local *local,
                                        u32 value)
{
        int err = 0;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_set_frag_threshold(local, value);

        if (local->ops->set_frag_threshold)
                err = local->ops->set_frag_threshold(&local->hw, value);

        trace_drv_return_int(local, err);

        return err;
}

/*
 * Pass the RTS threshold @value to the driver's optional
 * set_rts_threshold op.
 *
 * Returns the driver's result, or 0 when the op is not implemented.
 */
static inline int drv_set_rts_threshold(struct ieee80211_local *local,
                                        u32 value)
{
        int err = 0;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_set_rts_threshold(local, value);

        if (local->ops->set_rts_threshold)
                err = local->ops->set_rts_threshold(&local->hw, value);

        trace_drv_return_int(local, err);

        return err;
}

/*
 * Pass the coverage class @value to the driver's optional
 * set_coverage_class op.
 *
 * The op has no return value that is used here: the function returns 0
 * when the op was called and -EOPNOTSUPP when the driver lacks it.
 */
static inline int drv_set_coverage_class(struct ieee80211_local *local,
                                         s16 value)
{
        int err = -EOPNOTSUPP;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_set_coverage_class(local, value);

        if (local->ops->set_coverage_class) {
                local->ops->set_coverage_class(&local->hw, value);
                err = 0;
        }

        trace_drv_return_int(local, err);

        return err;
}

/*
 * Deliver the station notification @cmd for @sta to the driver's optional
 * sta_notify op.
 *
 * @sdata is first mapped through get_bss_sdata(); nothing is traced or
 * called when check_sdata_in_driver() fails for the result.
 */
static inline void drv_sta_notify(struct ieee80211_local *local,
                                  struct ieee80211_sub_if_data *sdata,
                                  enum sta_notify_cmd cmd,
                                  struct ieee80211_sta *sta)
{
        sdata = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_sta_notify(local, sdata, cmd, sta);
        if (local->ops->sta_notify)
                local->ops->sta_notify(&local->hw, &sdata->vif, cmd, sta);
        trace_drv_return_void(local);
}

/*
 * Add @sta through the driver's optional sta_add op.
 *
 * @sdata is first mapped through get_bss_sdata(). Returns -EIO when
 * check_sdata_in_driver() fails for the result, the driver's value when
 * the op exists, and 0 when the driver does not implement the op.
 */
static inline int drv_sta_add(struct ieee80211_local *local,
                              struct ieee80211_sub_if_data *sdata,
                              struct ieee80211_sta *sta)
{
        int err = 0;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        sdata = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_sta_add(local, sdata, sta);

        if (local->ops->sta_add)
                err = local->ops->sta_add(&local->hw, &sdata->vif, sta);

        trace_drv_return_int(local, err);

        return err;
}

/*
 * Remove @sta through the driver's optional sta_remove op.
 *
 * @sdata is first mapped through get_bss_sdata(); nothing is traced or
 * called when check_sdata_in_driver() fails for the result.
 */
static inline void drv_sta_remove(struct ieee80211_local *local,
                                  struct ieee80211_sub_if_data *sdata,
                                  struct ieee80211_sta *sta)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        sdata = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_sta_remove(local, sdata, sta);
        if (local->ops->sta_remove)
                local->ops->sta_remove(&local->hw, &sdata->vif, sta);

        trace_drv_return_void(local);
}

#ifdef CONFIG_MAC80211_DEBUGFS
/*
 * Let the driver add its per-interface debugfs entries through the
 * optional vif_add_debugfs op.
 *
 * Monitor interfaces are skipped silently; a missing
 * sdata->vif.debugfs_dir triggers WARN_ON() and is skipped as well.
 * The remaining checks go through get_bss_sdata() and
 * check_sdata_in_driver() before the op is called.
 */
static inline void drv_vif_add_debugfs(struct ieee80211_local *local,
                                       struct ieee80211_sub_if_data *sdata)
{
        might_sleep();

        if (sdata->vif.type == NL80211_IFTYPE_MONITOR)
                return;
        if (WARN_ON(!sdata->vif.debugfs_dir))
                return;

        sdata = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(sdata))
                return;

        if (!local->ops->vif_add_debugfs)
                return;

        local->ops->vif_add_debugfs(&local->hw, &sdata->vif);
}

/*
 * Let the driver add debugfs entries for @link_conf under @dir through
 * the optional link_add_debugfs op.
 *
 * @sdata is first mapped through get_bss_sdata(); the op is not called
 * when check_sdata_in_driver() fails for the result.
 */
static inline void drv_link_add_debugfs(struct ieee80211_local *local,
                                        struct ieee80211_sub_if_data *sdata,
                                        struct ieee80211_bss_conf *link_conf,
                                        struct dentry *dir)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        sdata = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(sdata))
                return;

        if (!local->ops->link_add_debugfs)
                return;

        local->ops->link_add_debugfs(&local->hw, &sdata->vif, link_conf, dir);
}

/*
 * Let the driver add debugfs entries for @sta under @dir through the
 * optional sta_add_debugfs op.
 *
 * @sdata is first mapped through get_bss_sdata(); the op is not called
 * when check_sdata_in_driver() fails for the result.
 */
static inline void drv_sta_add_debugfs(struct ieee80211_local *local,
                                       struct ieee80211_sub_if_data *sdata,
                                       struct ieee80211_sta *sta,
                                       struct dentry *dir)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        sdata = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(sdata))
                return;

        if (!local->ops->sta_add_debugfs)
                return;

        local->ops->sta_add_debugfs(&local->hw, &sdata->vif, sta, dir);
}

/*
 * Let the driver add debugfs entries for @link_sta under @dir through the
 * optional link_sta_add_debugfs op.
 *
 * @sdata is first mapped through get_bss_sdata(); the op is not called
 * when check_sdata_in_driver() fails for the result.
 */
static inline void drv_link_sta_add_debugfs(struct ieee80211_local *local,
                                            struct ieee80211_sub_if_data *sdata,
                                            struct ieee80211_link_sta *link_sta,
                                            struct dentry *dir)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        sdata = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(sdata))
                return;

        if (!local->ops->link_sta_add_debugfs)
                return;

        local->ops->link_sta_add_debugfs(&local->hw, &sdata->vif,
                                         link_sta, dir);
}
#else
/*
 * Variant used when CONFIG_MAC80211_DEBUGFS is not set: it keeps only the
 * might_sleep() annotation and never calls into the driver.
 */
static inline void drv_vif_add_debugfs(struct ieee80211_local *local,
                                       struct ieee80211_sub_if_data *sdata)
{
        might_sleep();
}
#endif

/*
 * Call the driver's optional sta_pre_rcu_remove op for @sta (passed to
 * the driver as &sta->sta).
 *
 * @sdata is first mapped through get_bss_sdata(); nothing is traced or
 * called when check_sdata_in_driver() fails for the result.
 */
static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local,
                                          struct ieee80211_sub_if_data *sdata,
                                          struct sta_info *sta)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        sdata = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_sta_pre_rcu_remove(local, sdata, &sta->sta);
        if (local->ops->sta_pre_rcu_remove)
                local->ops->sta_pre_rcu_remove(&local->hw, &sdata->vif,
                                               &sta->sta);
        trace_drv_return_void(local);
}

/*
 * Station state / tx power / rate-control update wrappers. Only the
 * prototypes appear here; their bodies are not part of this section of
 * the file. The two int-returning ones are marked __must_check.
 */
__must_check
int drv_sta_state(struct ieee80211_local *local,
                  struct ieee80211_sub_if_data *sdata,
                  struct sta_info *sta,
                  enum ieee80211_sta_state old_state,
                  enum ieee80211_sta_state new_state);

__must_check
int drv_sta_set_txpwr(struct ieee80211_local *local,
                      struct ieee80211_sub_if_data *sdata,
                      struct sta_info *sta);

void drv_sta_rc_update(struct ieee80211_local *local,
                       struct ieee80211_sub_if_data *sdata,
                       struct ieee80211_sta *sta, u32 changed);

/*
 * Call the driver's optional sta_rate_tbl_update op for @sta.
 *
 * @sdata is first mapped through get_bss_sdata(); nothing is traced or
 * called when check_sdata_in_driver() fails for the result.
 */
static inline void drv_sta_rate_tbl_update(struct ieee80211_local *local,
                                           struct ieee80211_sub_if_data *sdata,
                                           struct ieee80211_sta *sta)
{
        sdata = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_sta_rate_tbl_update(local, sdata, sta);
        if (local->ops->sta_rate_tbl_update)
                local->ops->sta_rate_tbl_update(&local->hw, &sdata->vif, sta);

        trace_drv_return_void(local);
}

/*
 * Let the driver's optional sta_statistics op fill @sinfo for @sta.
 *
 * @sdata is first mapped through get_bss_sdata(); nothing is traced or
 * called when check_sdata_in_driver() fails for the result.
 */
static inline void drv_sta_statistics(struct ieee80211_local *local,
                                      struct ieee80211_sub_if_data *sdata,
                                      struct ieee80211_sta *sta,
                                      struct station_info *sinfo)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        sdata = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_sta_statistics(local, sdata, sta);
        if (local->ops->sta_statistics)
                local->ops->sta_statistics(&local->hw, &sdata->vif, sta, sinfo);
        trace_drv_return_void(local);
}

/*
 * TX queue parameter and TSF get/set/offset/reset wrappers. Only the
 * prototypes appear here; their bodies are not part of this section of
 * the file.
 */
int drv_conf_tx(struct ieee80211_local *local,
                struct ieee80211_link_data *link, u16 ac,
                const struct ieee80211_tx_queue_params *params);

u64 drv_get_tsf(struct ieee80211_local *local,
                struct ieee80211_sub_if_data *sdata);
void drv_set_tsf(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 u64 tsf);
void drv_offset_tsf(struct ieee80211_local *local,
                    struct ieee80211_sub_if_data *sdata,
                    s64 offset);
void drv_reset_tsf(struct ieee80211_local *local,
                   struct ieee80211_sub_if_data *sdata);

/*
 * Query the driver's optional tx_last_beacon op.
 *
 * Returns the op's result, or 0 when the driver does not implement it.
 * Runs might_sleep() and the wiphy lockdep assertion first.
 */
static inline int drv_tx_last_beacon(struct ieee80211_local *local)
{
        int last;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_tx_last_beacon(local);
        /* default unsupported op for less congestion */
        last = local->ops->tx_last_beacon ?
               local->ops->tx_last_beacon(&local->hw) : 0;
        trace_drv_return_int(local, last);
        return last;
}

/* Declaration only: drv_ampdu_action() is defined outside this section. */
int drv_ampdu_action(struct ieee80211_local *local,
                     struct ieee80211_sub_if_data *sdata,
                     struct ieee80211_ampdu_params *params);

/*
 * Fetch survey entry @idx into @survey through the driver's get_survey op.
 *
 * Returns the op's result, or -EOPNOTSUPP when the op is not implemented.
 * Runs might_sleep() and the wiphy lockdep assertion first.
 */
static inline int drv_get_survey(struct ieee80211_local *local, int idx,
                                struct survey_info *survey)
{
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_get_survey(local, idx, survey);

        if (!local->ops->get_survey)
                err = -EOPNOTSUPP;
        else
                err = local->ops->get_survey(&local->hw, idx, survey);

        trace_drv_return_int(local, err);

        return err;
}

/*
 * Invoke the driver's optional rfkill_poll op.  No tracepoints are emitted
 * here; only might_sleep() and the wiphy lockdep assertion run first.
 */
static inline void drv_rfkill_poll(struct ieee80211_local *local)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!local->ops->rfkill_poll)
                return;

        local->ops->rfkill_poll(&local->hw);
}

/*
 * Call the driver's optional flush op for @queues.
 *
 * @sdata may be NULL, in which case the op receives a NULL vif.  A non-NULL
 * @sdata is mapped through get_bss_sdata() and must pass
 * check_sdata_in_driver(), otherwise nothing is traced or called.
 */
static inline void drv_flush(struct ieee80211_local *local,
                             struct ieee80211_sub_if_data *sdata,
                             u32 queues, bool drop)
{
        struct ieee80211_vif *vif = NULL;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        sdata = get_bss_sdata(sdata);
        if (sdata) {
                if (!check_sdata_in_driver(sdata))
                        return;
                vif = &sdata->vif;
        }

        trace_drv_flush(local, queues, drop);
        if (local->ops->flush)
                local->ops->flush(&local->hw, vif, queues, drop);
        trace_drv_return_void(local);
}

/*
 * Call the driver's optional flush_sta op for @sta.
 *
 * @sdata is mapped through get_bss_sdata(); a non-NULL result must pass
 * check_sdata_in_driver() or the function returns without tracing.
 */
static inline void drv_flush_sta(struct ieee80211_local *local,
                                 struct ieee80211_sub_if_data *sdata,
                                 struct sta_info *sta)
{
        struct ieee80211_sub_if_data *bss;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        bss = get_bss_sdata(sdata);

        /*
         * NOTE(review): a NULL result skips the in-driver check, yet it is
         * still passed to the tracepoint and used to form &bss->vif below.
         * Confirm that callers never pass a NULL sdata.
         */
        if (bss && !check_sdata_in_driver(bss))
                return;

        trace_drv_flush_sta(local, bss, &sta->sta);
        if (local->ops->flush_sta)
                local->ops->flush_sta(&local->hw, &bss->vif, &sta->sta);
        trace_drv_return_void(local);
}

/*
 * Pass a channel switch request to the driver's channel_switch op.
 *
 * The op pointer is not NULL-checked here; it is called unconditionally.
 */
static inline void drv_channel_switch(struct ieee80211_local *local,
                                      struct ieee80211_sub_if_data *sdata,
                                      struct ieee80211_channel_switch *ch_switch)
{
        struct ieee80211_vif *vif = &sdata->vif;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_channel_switch(local, sdata, ch_switch);
        local->ops->channel_switch(&local->hw, vif, ch_switch);
        trace_drv_return_void(local);
}


/*
 * Pass @tx_ant / @rx_ant to the driver's set_antenna op.
 *
 * Returns the op's result, or -EOPNOTSUPP if the op is not implemented.
 * The tracepoint fires after the call and records the result.
 */
static inline int drv_set_antenna(struct ieee80211_local *local,
                                  u32 tx_ant, u32 rx_ant)
{
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!local->ops->set_antenna)
                err = -EOPNOTSUPP;
        else
                err = local->ops->set_antenna(&local->hw, tx_ant, rx_ant);

        trace_drv_set_antenna(local, tx_ant, rx_ant, err);
        return err;
}

/*
 * Read the antenna configuration through the driver's get_antenna op.
 *
 * Returns the op's result, or -EOPNOTSUPP if the op is not implemented.
 * The tracepoint reads *@tx_ant and *@rx_ant in both cases, so the
 * pointers must be dereferenceable even when the op is missing.
 */
static inline int drv_get_antenna(struct ieee80211_local *local,
                                  u32 *tx_ant, u32 *rx_ant)
{
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!local->ops->get_antenna)
                err = -EOPNOTSUPP;
        else
                err = local->ops->get_antenna(&local->hw, tx_ant, rx_ant);

        trace_drv_get_antenna(local, *tx_ant, *rx_ant, err);
        return err;
}

/*
 * Call the driver's remain_on_channel op for @chan / @duration / @type.
 *
 * The op pointer is not NULL-checked here.  Returns the op's result.
 */
static inline int drv_remain_on_channel(struct ieee80211_local *local,
                                        struct ieee80211_sub_if_data *sdata,
                                        struct ieee80211_channel *chan,
                                        unsigned int duration,
                                        enum ieee80211_roc_type type)
{
        struct ieee80211_vif *vif = &sdata->vif;
        int rc;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_remain_on_channel(local, sdata, chan, duration, type);
        rc = local->ops->remain_on_channel(&local->hw, vif, chan, duration,
                                           type);
        trace_drv_return_int(local, rc);

        return rc;
}

/*
 * Call the driver's cancel_remain_on_channel op for @sdata.
 *
 * The op pointer is not NULL-checked here.  Returns the op's result.
 */
static inline int
drv_cancel_remain_on_channel(struct ieee80211_local *local,
                             struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_vif *vif = &sdata->vif;
        int rc;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_cancel_remain_on_channel(local, sdata);
        rc = local->ops->cancel_remain_on_channel(&local->hw, vif);
        trace_drv_return_int(local, rc);

        return rc;
}

/*
 * Pass @tx / @rx to the driver's set_ringparam op.
 *
 * Returns the op's result, or -EOPNOTSUPP if the op is not implemented.
 */
static inline int drv_set_ringparam(struct ieee80211_local *local,
                                    u32 tx, u32 rx)
{
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_set_ringparam(local, tx, rx);
        if (!local->ops->set_ringparam)
                err = -EOPNOTSUPP;
        else
                err = local->ops->set_ringparam(&local->hw, tx, rx);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Pass the four output pointers to the driver's optional get_ringparam op.
 * When the op is missing, only the tracepoints run.
 */
static inline void drv_get_ringparam(struct ieee80211_local *local,
                                     u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();
        lockdep_assert_wiphy(hw->wiphy);

        trace_drv_get_ringparam(local, tx, tx_max, rx, rx_max);
        if (local->ops->get_ringparam)
                local->ops->get_ringparam(hw, tx, tx_max, rx, rx_max);
        trace_drv_return_void(local);
}

/*
 * Query the driver's optional tx_frames_pending op.
 *
 * Returns the op's result, or false when the op is not implemented.
 */
static inline bool drv_tx_frames_pending(struct ieee80211_local *local)
{
        bool pending;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_tx_frames_pending(local);
        if (!local->ops->tx_frames_pending)
                pending = false;
        else
                pending = local->ops->tx_frames_pending(&local->hw);
        trace_drv_return_bool(local, pending);

        return pending;
}

/*
 * Pass @mask to the driver's set_bitrate_mask op for @sdata.
 *
 * Returns -EIO if check_sdata_in_driver() fails, -EOPNOTSUPP if the op is
 * not implemented, otherwise the op's result.
 */
static inline int drv_set_bitrate_mask(struct ieee80211_local *local,
                                       struct ieee80211_sub_if_data *sdata,
                                       const struct cfg80211_bitrate_mask *mask)
{
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_set_bitrate_mask(local, sdata, mask);
        if (!local->ops->set_bitrate_mask)
                err = -EOPNOTSUPP;
        else
                err = local->ops->set_bitrate_mask(&local->hw, &sdata->vif,
                                                   mask);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Pass GTK rekey @data to the driver's optional set_rekey_data op.
 * Returns without tracing if check_sdata_in_driver() fails.
 */
static inline void drv_set_rekey_data(struct ieee80211_local *local,
                                      struct ieee80211_sub_if_data *sdata,
                                      struct cfg80211_gtk_rekey_data *data)
{
        struct ieee80211_vif *vif = &sdata->vif;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_set_rekey_data(local, sdata, data);
        if (local->ops->set_rekey_data)
                local->ops->set_rekey_data(&local->hw, vif, data);
        trace_drv_return_void(local);
}

/*
 * Deliver @event to the driver's optional event_callback op.
 * Unlike most wrappers here, this one performs no might_sleep(), lockdep
 * or check_sdata_in_driver() checks.
 */
static inline void drv_event_callback(struct ieee80211_local *local,
                                      struct ieee80211_sub_if_data *sdata,
                                      const struct ieee80211_event *event)
{
        struct ieee80211_vif *vif = &sdata->vif;

        trace_drv_event_callback(local, sdata, event);
        if (local->ops->event_callback)
                local->ops->event_callback(&local->hw, vif, event);
        trace_drv_return_void(local);
}

/*
 * Forward a buffered-frame release request for @sta to the driver's
 * optional release_buffered_frames op.  All arguments are passed through
 * unchanged; @sta is handed over as its embedded &sta->sta.
 */
static inline void
drv_release_buffered_frames(struct ieee80211_local *local,
                            struct sta_info *sta, u16 tids, int num_frames,
                            enum ieee80211_frame_release_type reason,
                            bool more_data)
{
        struct ieee80211_sta *pubsta = &sta->sta;

        trace_drv_release_buffered_frames(local, pubsta, tids, num_frames,
                                          reason, more_data);
        if (local->ops->release_buffered_frames)
                local->ops->release_buffered_frames(&local->hw, pubsta, tids,
                                                    num_frames, reason,
                                                    more_data);
        trace_drv_return_void(local);
}

/*
 * Forward an allow-buffered-frames request for @sta to the driver's
 * optional allow_buffered_frames op.  All arguments are passed through
 * unchanged; @sta is handed over as its embedded &sta->sta.
 */
static inline void
drv_allow_buffered_frames(struct ieee80211_local *local,
                          struct sta_info *sta, u16 tids, int num_frames,
                          enum ieee80211_frame_release_type reason,
                          bool more_data)
{
        struct ieee80211_sta *pubsta = &sta->sta;

        trace_drv_allow_buffered_frames(local, pubsta, tids, num_frames,
                                        reason, more_data);
        if (local->ops->allow_buffered_frames)
                local->ops->allow_buffered_frames(&local->hw, pubsta, tids,
                                                  num_frames, reason,
                                                  more_data);
        trace_drv_return_void(local);
}

/*
 * Call the driver's optional mgd_prepare_tx op with @info.
 *
 * Returns early if check_sdata_in_driver() fails and warns once if the
 * interface is not a station.  A negative info->link_id is clamped to 0
 * before the op is called (this modifies the caller's @info).
 */
static inline void drv_mgd_prepare_tx(struct ieee80211_local *local,
                                      struct ieee80211_sub_if_data *sdata,
                                      struct ieee80211_prep_tx_info *info)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return;
        WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION);

        if (info->link_id < 0)
                info->link_id = 0;

        trace_drv_mgd_prepare_tx(local, sdata, info->duration,
                                 info->subtype, info->success);
        if (local->ops->mgd_prepare_tx)
                local->ops->mgd_prepare_tx(&local->hw, &sdata->vif, info);
        trace_drv_return_void(local);
}

static inline void drv_mgd_complete_tx(struct ieee80211_local *local,
                                       struct ieee80211_sub_if_data *sdata,
                                       struct ieee80211_prep_tx_info *info)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return;
        WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION);

        trace_drv_mgd_complete_tx(local, sdata, info->duration,
                                  info->subtype, info->success);
        if (local->ops->mgd_complete_tx)
                local->ops->mgd_complete_tx(&local->hw, &sdata->vif, info);
        trace_drv_return_void(local);
}

/*
 * Call the driver's optional mgd_protect_tdls_discover op.
 *
 * Returns early if check_sdata_in_driver() fails and warns once if the
 * interface is not a station.  A negative @link_id is passed on as 0.
 */
static inline void
drv_mgd_protect_tdls_discover(struct ieee80211_local *local,
                              struct ieee80211_sub_if_data *sdata,
                              int link_id)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return;
        WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION);

        if (link_id < 0)
                link_id = 0;

        trace_drv_mgd_protect_tdls_discover(local, sdata);
        if (local->ops->mgd_protect_tdls_discover)
                local->ops->mgd_protect_tdls_discover(&local->hw, &sdata->vif,
                                                      link_id);
        trace_drv_return_void(local);
}

/*
 * Add channel context @ctx through the driver's add_chanctx op.
 *
 * Returns the op's result, or -EOPNOTSUPP if the op is not implemented.
 * On a zero result ctx->driver_present is set to true.
 */
static inline int drv_add_chanctx(struct ieee80211_local *local,
                                  struct ieee80211_chanctx *ctx)
{
        int err = -EOPNOTSUPP;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_add_chanctx(local, ctx);
        if (local->ops->add_chanctx)
                err = local->ops->add_chanctx(&local->hw, &ctx->conf);
        trace_drv_return_int(local, err);

        if (err)
                return err;

        ctx->driver_present = true;
        return 0;
}

static inline void drv_remove_chanctx(struct ieee80211_local *local,
                                      struct ieee80211_chanctx *ctx)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (WARN_ON(!ctx->driver_present))
                return;

        trace_drv_remove_chanctx(local, ctx);
        if (local->ops->remove_chanctx)
                local->ops->remove_chanctx(&local->hw, &ctx->conf);
        trace_drv_return_void(local);
        ctx->driver_present = false;
}

/*
 * Report @changed flags for @ctx to the driver's optional change_chanctx
 * op.  Warns once (but still calls the op) if ctx->driver_present is not
 * set.
 */
static inline void drv_change_chanctx(struct ieee80211_local *local,
                                      struct ieee80211_chanctx *ctx,
                                      u32 changed)
{
        struct ieee80211_hw *hw = &local->hw;

        might_sleep();
        lockdep_assert_wiphy(hw->wiphy);

        trace_drv_change_chanctx(local, ctx, changed);
        if (local->ops->change_chanctx) {
                WARN_ON_ONCE(!ctx->driver_present);
                local->ops->change_chanctx(hw, &ctx->conf, changed);
        }
        trace_drv_return_void(local);
}

/*
 * Channel-context assignment wrappers (assign / unassign / switch).
 * Declarations only; the definitions are not visible in this section.
 */
int drv_assign_vif_chanctx(struct ieee80211_local *local,
                           struct ieee80211_sub_if_data *sdata,
                           struct ieee80211_bss_conf *link_conf,
                           struct ieee80211_chanctx *ctx);
void drv_unassign_vif_chanctx(struct ieee80211_local *local,
                              struct ieee80211_sub_if_data *sdata,
                              struct ieee80211_bss_conf *link_conf,
                              struct ieee80211_chanctx *ctx);
int drv_switch_vif_chanctx(struct ieee80211_local *local,
                           struct ieee80211_vif_chanctx_switch *vifs,
                           int n_vifs, enum ieee80211_chanctx_switch_mode mode);

/*
 * Call the driver's optional start_ap op for @sdata / @link_conf.
 *
 * Returns -EIO if check_sdata_in_driver() fails, 0 if the op is not
 * implemented, otherwise the op's result.
 */
static inline int drv_start_ap(struct ieee80211_local *local,
                               struct ieee80211_sub_if_data *sdata,
                               struct ieee80211_bss_conf *link_conf)
{
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_start_ap(local, sdata, link_conf);
        if (!local->ops->start_ap)
                err = 0;
        else
                err = local->ops->start_ap(&local->hw, &sdata->vif, link_conf);
        trace_drv_return_int(local, err);
        return err;
}

/*
 * Call the driver's optional stop_ap op for @sdata / @link_conf.
 * Returns without tracing if check_sdata_in_driver() fails.
 */
static inline void drv_stop_ap(struct ieee80211_local *local,
                               struct ieee80211_sub_if_data *sdata,
                               struct ieee80211_bss_conf *link_conf)
{
        struct ieee80211_vif *vif = &sdata->vif;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_stop_ap(local, sdata, link_conf);
        if (local->ops->stop_ap)
                local->ops->stop_ap(&local->hw, vif, link_conf);
        trace_drv_return_void(local);
}

static inline void
drv_reconfig_complete(struct ieee80211_local *local,
                      enum ieee80211_reconfig_type reconfig_type)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        trace_drv_reconfig_complete(local, reconfig_type);
        if (local->ops->reconfig_complete)
                local->ops->reconfig_complete(&local->hw, reconfig_type);
        trace_drv_return_void(local);
}

/*
 * Pass @key_idx to the driver's optional set_default_unicast_key op.
 *
 * Returns early if check_sdata_in_driver() fails.  Warns once when
 * @key_idx lies outside -1..3 but still forwards the value.
 */
static inline void
drv_set_default_unicast_key(struct ieee80211_local *local,
                            struct ieee80211_sub_if_data *sdata,
                            int key_idx)
{
        struct ieee80211_vif *vif = &sdata->vif;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return;

        WARN_ON_ONCE(key_idx < -1 || key_idx > 3);

        trace_drv_set_default_unicast_key(local, sdata, key_idx);
        if (local->ops->set_default_unicast_key)
                local->ops->set_default_unicast_key(&local->hw, vif, key_idx);
        trace_drv_return_void(local);
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Pass @idev to the driver's optional ipv6_addr_change op.
 * No might_sleep(), lockdep or check_sdata_in_driver() checks are done.
 */
static inline void drv_ipv6_addr_change(struct ieee80211_local *local,
                                        struct ieee80211_sub_if_data *sdata,
                                        struct inet6_dev *idev)
{
        struct ieee80211_vif *vif = &sdata->vif;

        trace_drv_ipv6_addr_change(local, sdata);
        if (local->ops->ipv6_addr_change)
                local->ops->ipv6_addr_change(&local->hw, vif, idev);
        trace_drv_return_void(local);
}
#endif

/*
 * Pass @chandef to the driver's optional channel_switch_beacon op.
 * The tracepoint fires only when the op is implemented.
 */
static inline void
drv_channel_switch_beacon(struct ieee80211_sub_if_data *sdata,
                          struct cfg80211_chan_def *chandef)
{
        struct ieee80211_local *local = sdata->local;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!local->ops->channel_switch_beacon)
                return;

        trace_drv_channel_switch_beacon(local, sdata, chandef);
        local->ops->channel_switch_beacon(&local->hw, &sdata->vif, chandef);
}

/*
 * Call the driver's optional pre_channel_switch op with @ch_switch.
 *
 * Returns -EIO if check_sdata_in_driver() fails, 0 (without tracing) if
 * ieee80211_vif_link_active() reports the link as not active, 0 if the op
 * is not implemented, otherwise the op's result.
 */
static inline int
drv_pre_channel_switch(struct ieee80211_sub_if_data *sdata,
                       struct ieee80211_channel_switch *ch_switch)
{
        struct ieee80211_local *local = sdata->local;
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return -EIO;

        if (!ieee80211_vif_link_active(&sdata->vif, ch_switch->link_id))
                return 0;

        trace_drv_pre_channel_switch(local, sdata, ch_switch);
        if (!local->ops->pre_channel_switch)
                err = 0;
        else
                err = local->ops->pre_channel_switch(&local->hw, &sdata->vif,
                                                     ch_switch);
        trace_drv_return_int(local, err);
        return err;
}

/*
 * Call the driver's optional post_channel_switch op for @link.
 *
 * Returns -EIO if check_sdata_in_driver() fails, 0 (without tracing) if
 * ieee80211_vif_link_active() reports the link as not active, 0 if the op
 * is not implemented, otherwise the op's result.
 */
static inline int
drv_post_channel_switch(struct ieee80211_link_data *link)
{
        struct ieee80211_sub_if_data *sdata = link->sdata;
        struct ieee80211_local *local = sdata->local;
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return -EIO;

        if (!ieee80211_vif_link_active(&sdata->vif, link->link_id))
                return 0;

        trace_drv_post_channel_switch(local, sdata);
        if (!local->ops->post_channel_switch)
                err = 0;
        else
                err = local->ops->post_channel_switch(&local->hw, &sdata->vif,
                                                      link->conf);
        trace_drv_return_int(local, err);
        return err;
}

/*
 * Call the driver's optional abort_channel_switch op for @link.
 *
 * Does nothing if check_sdata_in_driver() fails or if
 * ieee80211_vif_link_active() reports the link as not active.
 */
static inline void
drv_abort_channel_switch(struct ieee80211_link_data *link)
{
        struct ieee80211_sub_if_data *sdata = link->sdata;
        struct ieee80211_local *local = sdata->local;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return;

        if (!ieee80211_vif_link_active(&sdata->vif, link->link_id))
                return;

        trace_drv_abort_channel_switch(local, sdata);

        if (!local->ops->abort_channel_switch)
                return;

        local->ops->abort_channel_switch(&local->hw, &sdata->vif, link->conf);
}

/*
 * Pass @ch_switch to the driver's optional channel_switch_rx_beacon op.
 *
 * Does nothing if check_sdata_in_driver() fails or if
 * ieee80211_vif_link_active() reports the link as not active.
 */
static inline void
drv_channel_switch_rx_beacon(struct ieee80211_sub_if_data *sdata,
                             struct ieee80211_channel_switch *ch_switch)
{
        struct ieee80211_local *local = sdata->local;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return;

        if (!ieee80211_vif_link_active(&sdata->vif, ch_switch->link_id))
                return;

        trace_drv_channel_switch_rx_beacon(local, sdata, ch_switch);

        if (!local->ops->channel_switch_rx_beacon)
                return;

        local->ops->channel_switch_rx_beacon(&local->hw, &sdata->vif,
                                             ch_switch);
}

/*
 * Call the driver's optional join_ibss op for @sdata.
 *
 * Returns -EIO if check_sdata_in_driver() fails, 0 if the op is not
 * implemented, otherwise the op's result.
 */
static inline int drv_join_ibss(struct ieee80211_local *local,
                                struct ieee80211_sub_if_data *sdata)
{
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);
        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_join_ibss(local, sdata, &sdata->vif.bss_conf);
        if (!local->ops->join_ibss)
                err = 0;
        else
                err = local->ops->join_ibss(&local->hw, &sdata->vif);
        trace_drv_return_int(local, err);
        return err;
}

/*
 * Call the driver's optional leave_ibss op for @sdata.
 * Returns without tracing if check_sdata_in_driver() fails.
 */
static inline void drv_leave_ibss(struct ieee80211_local *local,
                                  struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_vif *vif = &sdata->vif;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);
        if (!check_sdata_in_driver(sdata))
                return;

        trace_drv_leave_ibss(local, sdata);
        if (local->ops->leave_ibss)
                local->ops->leave_ibss(&local->hw, vif);
        trace_drv_return_void(local);
}

/*
 * Query the driver's optional get_expected_throughput op for @sta.
 *
 * The op is called only when it exists and sta->uploaded is set;
 * otherwise 0 is returned.
 */
static inline u32 drv_get_expected_throughput(struct ieee80211_local *local,
                                              struct sta_info *sta)
{
        struct ieee80211_sta *pubsta = &sta->sta;
        u32 tput = 0;

        trace_drv_get_expected_throughput(pubsta);
        if (local->ops->get_expected_throughput && sta->uploaded)
                tput = local->ops->get_expected_throughput(&local->hw, pubsta);
        trace_drv_return_u32(local, tput);

        return tput;
}

/*
 * Read the TX power into *@dbm through the driver's get_txpower op.
 *
 * Returns -EOPNOTSUPP (without tracing) if the op is not implemented,
 * otherwise the op's result.  The tracepoint fires after the call and
 * records *@dbm together with the result.
 */
static inline int drv_get_txpower(struct ieee80211_local *local,
                                  struct ieee80211_sub_if_data *sdata, int *dbm)
{
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (!local->ops->get_txpower)
                return -EOPNOTSUPP;

        err = local->ops->get_txpower(&local->hw, &sdata->vif, dbm);
        trace_drv_get_txpower(local, sdata, *dbm, err);

        return err;
}

/*
 * Call the driver's tdls_channel_switch op for @sta.
 *
 * Returns -EIO if check_sdata_in_driver() fails, -EOPNOTSUPP (without
 * tracing) if the op is not implemented, otherwise the op's result.
 */
static inline int
drv_tdls_channel_switch(struct ieee80211_local *local,
                        struct ieee80211_sub_if_data *sdata,
                        struct ieee80211_sta *sta, u8 oper_class,
                        struct cfg80211_chan_def *chandef,
                        struct sk_buff *tmpl_skb, u32 ch_sw_tm_ie)
{
        struct ieee80211_vif *vif = &sdata->vif;
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);
        if (!check_sdata_in_driver(sdata))
                return -EIO;

        if (!local->ops->tdls_channel_switch)
                return -EOPNOTSUPP;

        trace_drv_tdls_channel_switch(local, sdata, sta, oper_class, chandef);
        err = local->ops->tdls_channel_switch(&local->hw, vif, sta, oper_class,
                                              chandef, tmpl_skb, ch_sw_tm_ie);
        trace_drv_return_int(local, err);
        return err;
}

/*
 * Call the driver's tdls_cancel_channel_switch op for @sta.
 *
 * Does nothing (and traces nothing) if check_sdata_in_driver() fails or
 * the op is not implemented.
 */
static inline void
drv_tdls_cancel_channel_switch(struct ieee80211_local *local,
                               struct ieee80211_sub_if_data *sdata,
                               struct ieee80211_sta *sta)
{
        struct ieee80211_vif *vif = &sdata->vif;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);
        if (!check_sdata_in_driver(sdata))
                return;

        if (!local->ops->tdls_cancel_channel_switch)
                return;

        trace_drv_tdls_cancel_channel_switch(local, sdata, sta);
        local->ops->tdls_cancel_channel_switch(&local->hw, vif, sta);
        trace_drv_return_void(local);
}

/*
 * Pass @params to the driver's optional tdls_recv_channel_switch op.
 * No might_sleep(), lockdep or check_sdata_in_driver() checks are done.
 */
static inline void
drv_tdls_recv_channel_switch(struct ieee80211_local *local,
                             struct ieee80211_sub_if_data *sdata,
                             struct ieee80211_tdls_ch_sw_params *params)
{
        struct ieee80211_vif *vif = &sdata->vif;

        trace_drv_tdls_recv_channel_switch(local, sdata, params);
        if (local->ops->tdls_recv_channel_switch)
                local->ops->tdls_recv_channel_switch(&local->hw, vif, params);
        trace_drv_return_void(local);
}

/*
 * Hand @txq to the driver's wake_tx_queue op.
 *
 * While local->in_reconfig is set the queue is only flagged with
 * IEEE80211_TXQ_DIRTY and the op is not called.  The op pointer is not
 * NULL-checked here.
 */
static inline void drv_wake_tx_queue(struct ieee80211_local *local,
                                     struct txq_info *txq)
{
        struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->txq.vif);

        if (local->in_reconfig) {
                /* In reconfig don't transmit now, but mark for waking later */
                set_bit(IEEE80211_TXQ_DIRTY, &txq->flags);
                return;
        }

        if (check_sdata_in_driver(sdata)) {
                trace_drv_wake_tx_queue(local, sdata, txq);
                local->ops->wake_tx_queue(&local->hw, &txq->txq);
        }
}

/*
 * Schedule @txqi via ieee80211_schedule_txq() and then hand it to
 * drv_wake_tx_queue().
 */
static inline void schedule_and_wake_txq(struct ieee80211_local *local,
                                         struct txq_info *txqi)
{
        struct ieee80211_hw *hw = &local->hw;

        ieee80211_schedule_txq(hw, &txqi->txq);
        drv_wake_tx_queue(local, txqi);
}

/*
 * Ask the driver's optional can_aggregate_in_amsdu op about @head / @skb.
 *
 * Returns true when the op is not implemented, otherwise the op's result.
 */
static inline int drv_can_aggregate_in_amsdu(struct ieee80211_local *local,
                                             struct sk_buff *head,
                                             struct sk_buff *skb)
{
        if (local->ops->can_aggregate_in_amsdu)
                return local->ops->can_aggregate_in_amsdu(&local->hw, head,
                                                          skb);

        return true;
}

/*
 * Fetch FTM responder statistics into @ftm_stats through the driver's
 * get_ftm_responder_stats op.
 *
 * Returns -EIO if check_sdata_in_driver() fails, -EOPNOTSUPP if the op is
 * not implemented, otherwise the op's result.  The tracepoint fires after
 * the op call.
 */
static inline int
drv_get_ftm_responder_stats(struct ieee80211_local *local,
                            struct ieee80211_sub_if_data *sdata,
                            struct cfg80211_ftm_responder_stats *ftm_stats)
{
        /*
         * Signed, to match the return type: the value holds negative errno
         * codes and must not round-trip through an unsigned type.
         */
        int ret = -EOPNOTSUPP;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);
        if (!check_sdata_in_driver(sdata))
                return -EIO;

        if (local->ops->get_ftm_responder_stats)
                ret = local->ops->get_ftm_responder_stats(&local->hw,
                                                         &sdata->vif,
                                                         ftm_stats);
        trace_drv_get_ftm_responder_stats(local, sdata, ftm_stats);

        return ret;
}

/*
 * Pass @request to the driver's start_pmsr op.
 *
 * Returns -EIO if check_sdata_in_driver() fails, -EOPNOTSUPP if the op is
 * not implemented, otherwise the op's result.
 */
static inline int drv_start_pmsr(struct ieee80211_local *local,
                                 struct ieee80211_sub_if_data *sdata,
                                 struct cfg80211_pmsr_request *request)
{
        int err;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);
        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_start_pmsr(local, sdata);

        if (!local->ops->start_pmsr)
                err = -EOPNOTSUPP;
        else
                err = local->ops->start_pmsr(&local->hw, &sdata->vif, request);
        trace_drv_return_int(local, err);

        return err;
}

/*
 * Pass @request to the driver's optional abort_pmsr op.
 *
 * The entry tracepoint fires before any check, so it is emitted even when
 * check_sdata_in_driver() fails and the function returns early.
 */
static inline void drv_abort_pmsr(struct ieee80211_local *local,
                                  struct ieee80211_sub_if_data *sdata,
                                  struct cfg80211_pmsr_request *request)
{
        struct ieee80211_vif *vif = &sdata->vif;

        trace_drv_abort_pmsr(local, sdata);

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);
        if (!check_sdata_in_driver(sdata))
                return;

        if (local->ops->abort_pmsr)
                local->ops->abort_pmsr(&local->hw, vif, request);
        trace_drv_return_void(local);
}

/*
 * Call the driver's start_nan op with @conf and return its result.
 *
 * The result of check_sdata_in_driver() is discarded, and the op pointer
 * is not NULL-checked here.
 */
static inline int drv_start_nan(struct ieee80211_local *local,
                                struct ieee80211_sub_if_data *sdata,
                                struct cfg80211_nan_conf *conf)
{
        struct ieee80211_vif *vif = &sdata->vif;
        int rc;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);
        check_sdata_in_driver(sdata);

        trace_drv_start_nan(local, sdata, conf);
        rc = local->ops->start_nan(&local->hw, vif, conf);
        trace_drv_return_int(local, rc);
        return rc;
}

/*
 * Call the driver's stop_nan op for @sdata.
 *
 * The result of check_sdata_in_driver() is discarded, and the op pointer
 * is not NULL-checked here.
 */
static inline void drv_stop_nan(struct ieee80211_local *local,
                                struct ieee80211_sub_if_data *sdata)
{
        struct ieee80211_vif *vif = &sdata->vif;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);
        check_sdata_in_driver(sdata);

        trace_drv_stop_nan(local, sdata);
        local->ops->stop_nan(&local->hw, vif);
        trace_drv_return_void(local);
}

/*
 * Pass @conf and @changes to the driver's nan_change_conf op.
 *
 * Returns -EOPNOTSUPP (without tracing) if the op is not implemented,
 * otherwise the op's result.  The result of check_sdata_in_driver() is
 * discarded.
 */
static inline int drv_nan_change_conf(struct ieee80211_local *local,
                                       struct ieee80211_sub_if_data *sdata,
                                       struct cfg80211_nan_conf *conf,
                                       u32 changes)
{
        struct ieee80211_vif *vif = &sdata->vif;
        int rc;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);
        check_sdata_in_driver(sdata);

        if (!local->ops->nan_change_conf)
                return -EOPNOTSUPP;

        trace_drv_nan_change_conf(local, sdata, conf, changes);
        rc = local->ops->nan_change_conf(&local->hw, vif, conf, changes);
        trace_drv_return_int(local, rc);

        return rc;
}

/*
 * Pass @nan_func to the driver's add_nan_func op.
 *
 * Returns -EOPNOTSUPP (without tracing) if the op is not implemented,
 * otherwise the op's result.  The result of check_sdata_in_driver() is
 * discarded.
 */
static inline int drv_add_nan_func(struct ieee80211_local *local,
                                   struct ieee80211_sub_if_data *sdata,
                                   const struct cfg80211_nan_func *nan_func)
{
        struct ieee80211_vif *vif = &sdata->vif;
        int rc;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);
        check_sdata_in_driver(sdata);

        if (!local->ops->add_nan_func)
                return -EOPNOTSUPP;

        trace_drv_add_nan_func(local, sdata, nan_func);
        rc = local->ops->add_nan_func(&local->hw, vif, nan_func);
        trace_drv_return_int(local, rc);

        return rc;
}

/*
 * Pass @instance_id to the driver's optional del_nan_func op.
 * The result of check_sdata_in_driver() is discarded.
 */
static inline void drv_del_nan_func(struct ieee80211_local *local,
                                   struct ieee80211_sub_if_data *sdata,
                                   u8 instance_id)
{
        struct ieee80211_vif *vif = &sdata->vif;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);
        check_sdata_in_driver(sdata);

        trace_drv_del_nan_func(local, sdata, instance_id);
        if (local->ops->del_nan_func)
                local->ops->del_nan_func(&local->hw, vif, instance_id);
        trace_drv_return_void(local);
}

/*
 * Pass @tid_conf for @sta to the driver's set_tid_config op and return its
 * result.
 *
 * The op pointer is not NULL-checked here and no entry tracepoint is
 * emitted; only the return value is traced.
 */
static inline int drv_set_tid_config(struct ieee80211_local *local,
                                     struct ieee80211_sub_if_data *sdata,
                                     struct ieee80211_sta *sta,
                                     struct cfg80211_tid_config *tid_conf)
{
        struct ieee80211_vif *vif = &sdata->vif;
        int rc;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);
        rc = local->ops->set_tid_config(&local->hw, vif, sta, tid_conf);
        trace_drv_return_int(local, rc);

        return rc;
}

/*
 * Pass @tids for @sta to the driver's reset_tid_config op and return its
 * result.
 *
 * The op pointer is not NULL-checked here and no entry tracepoint is
 * emitted; only the return value is traced.
 */
static inline int drv_reset_tid_config(struct ieee80211_local *local,
                                       struct ieee80211_sub_if_data *sdata,
                                       struct ieee80211_sta *sta, u8 tids)
{
        struct ieee80211_vif *vif = &sdata->vif;
        int rc;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);
        rc = local->ops->reset_tid_config(&local->hw, vif, sta, tids);
        trace_drv_return_int(local, rc);

        return rc;
}

/*
 * Call the driver's optional update_vif_offload op for @sdata.
 *
 * Nothing is traced when the op is missing.  The result of
 * check_sdata_in_driver() is discarded.
 */
static inline void drv_update_vif_offload(struct ieee80211_local *local,
                                          struct ieee80211_sub_if_data *sdata)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);
        check_sdata_in_driver(sdata);

        if (local->ops->update_vif_offload) {
                trace_drv_update_vif_offload(local, sdata);
                local->ops->update_vif_offload(&local->hw, &sdata->vif);
                trace_drv_return_void(local);
        }
}

/*
 * Pass the @enabled flag for @sta to the driver's optional sta_set_4addr
 * op.
 *
 * @sdata is mapped through get_bss_sdata() first; the function returns
 * without tracing if check_sdata_in_driver() rejects the mapped interface.
 */
static inline void drv_sta_set_4addr(struct ieee80211_local *local,
                                     struct ieee80211_sub_if_data *sdata,
                                     struct ieee80211_sta *sta, bool enabled)
{
        struct ieee80211_sub_if_data *bss = get_bss_sdata(sdata);

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);
        if (!check_sdata_in_driver(bss))
                return;

        trace_drv_sta_set_4addr(local, bss, sta, enabled);
        if (local->ops->sta_set_4addr)
                local->ops->sta_set_4addr(&local->hw, &bss->vif, sta, enabled);
        trace_drv_return_void(local);
}

/*
 * drv_sta_set_decap_offload - pass a station's decap-offload setting on
 * @local: mac80211 device whose ops table is used
 * @sdata: interface; replaced by the result of get_bss_sdata() before use
 * @sta: station passed through to the driver
 * @enabled: flag passed through to the driver
 *
 * Nothing happens if check_sdata_in_driver() fails. The
 * sta_set_decap_offload op is optional; when it is missing only the two
 * tracepoints fire.
 */
static inline void drv_sta_set_decap_offload(struct ieee80211_local *local,
                                             struct ieee80211_sub_if_data *sdata,
                                             struct ieee80211_sta *sta,
                                             bool enabled)
{
        sdata = get_bss_sdata(sdata);

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (check_sdata_in_driver(sdata)) {
                trace_drv_sta_set_decap_offload(local, sdata, sta, enabled);
                if (local->ops->sta_set_decap_offload)
                        local->ops->sta_set_decap_offload(&local->hw,
                                                          &sdata->vif, sta,
                                                          enabled);
                trace_drv_return_void(local);
        }
}

/*
 * drv_add_twt_setup - hand a TWT setup request to the driver
 * @local: mac80211 device whose ops table is used
 * @sdata: interface, used only for the check_sdata_in_driver() test
 * @sta: station passed through to the driver
 * @twt: TWT setup data passed through to the driver
 *
 * Nothing happens if check_sdata_in_driver() fails. The add_twt_setup op
 * is invoked without a NULL check in this wrapper.
 */
static inline void drv_add_twt_setup(struct ieee80211_local *local,
                                     struct ieee80211_sub_if_data *sdata,
                                     struct ieee80211_sta *sta,
                                     struct ieee80211_twt_setup *twt)
{
        struct ieee80211_twt_params *agrt;

        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        if (check_sdata_in_driver(sdata)) {
                /* Typed view of twt->params, used by the tracepoint only. */
                agrt = (void *)twt->params;

                trace_drv_add_twt_setup(local, sta, twt, agrt);
                local->ops->add_twt_setup(&local->hw, sta, twt);
                trace_drv_return_void(local);
        }
}

/*
 * drv_twt_teardown_request - ask the driver to tear down a TWT flow
 * @local: mac80211 device whose ops table is used
 * @sdata: interface, used only for the check_sdata_in_driver() test
 * @sta: station passed through to the driver
 * @flowid: flow identifier passed through to the driver
 *
 * Returns without tracing or calling the driver when
 * check_sdata_in_driver() fails or the optional twt_teardown_request op
 * is not implemented.
 */
static inline void drv_twt_teardown_request(struct ieee80211_local *local,
                                            struct ieee80211_sub_if_data *sdata,
                                            struct ieee80211_sta *sta,
                                            u8 flowid)
{
        might_sleep();
        lockdep_assert_wiphy(local->hw.wiphy);

        /* The sdata check runs first; the op is only inspected if it passes. */
        if (!check_sdata_in_driver(sdata) ||
            !local->ops->twt_teardown_request)
                return;

        trace_drv_twt_teardown_request(local, sta, flowid);
        local->ops->twt_teardown_request(&local->hw, sta, flowid);
        trace_drv_return_void(local);
}

/*
 * drv_net_fill_forward_path - let the driver fill in a forward path
 * @local: mac80211 device whose ops table is used
 * @sdata: interface; replaced by the result of get_bss_sdata() before use
 * @sta: station passed through to the driver
 * @ctx: path context passed through to the driver
 * @path: path entry passed through to the driver
 *
 * Return: -EIO if check_sdata_in_driver() fails, -EOPNOTSUPP if the
 * driver has no net_fill_forward_path op, otherwise the op's result.
 */
static inline int drv_net_fill_forward_path(struct ieee80211_local *local,
                                            struct ieee80211_sub_if_data *sdata,
                                            struct ieee80211_sta *sta,
                                            struct net_device_path_ctx *ctx,
                                            struct net_device_path *path)
{
        int rc;

        sdata = get_bss_sdata(sdata);
        if (!check_sdata_in_driver(sdata))
                return -EIO;

        trace_drv_net_fill_forward_path(local, sdata, sta);

        if (local->ops->net_fill_forward_path)
                rc = local->ops->net_fill_forward_path(&local->hw,
                                                       &sdata->vif, sta,
                                                       ctx, path);
        else
                rc = -EOPNOTSUPP;

        trace_drv_return_int(local, rc);

        return rc;
}

/*
 * drv_net_setup_tc - forward a traffic-control setup request to the driver
 * @local: mac80211 device whose ops table is used
 * @sdata: interface; replaced by the result of get_bss_sdata() before use
 * @dev: net device passed through to the driver
 * @type: TC setup type, traced and passed through to the driver
 * @type_data: opaque payload passed through to the driver
 *
 * Return: -EOPNOTSUPP if the driver has no net_setup_tc op, otherwise
 * the op's result.
 */
static inline int drv_net_setup_tc(struct ieee80211_local *local,
                                   struct ieee80211_sub_if_data *sdata,
                                   struct net_device *dev,
                                   enum tc_setup_type type, void *type_data)
{
        int rc;

        might_sleep();

        sdata = get_bss_sdata(sdata);
        trace_drv_net_setup_tc(local, sdata, type);

        if (local->ops->net_setup_tc)
                rc = local->ops->net_setup_tc(&local->hw, &sdata->vif, dev,
                                              type, type_data);
        else
                rc = -EOPNOTSUPP;

        trace_drv_return_int(local, rc);

        return rc;
}

/*
 * drv_can_activate_links - ask the driver whether links may be activated
 * @local: mac80211 device whose ops table is used
 * @sdata: interface whose vif is handed to the driver
 * @active_links: link bitmap, traced and passed through to the driver
 *
 * Return: false if check_sdata_in_driver() fails; true if the driver has
 * no can_activate_links op; otherwise the op's answer.
 */
static inline bool drv_can_activate_links(struct ieee80211_local *local,
                                          struct ieee80211_sub_if_data *sdata,
                                          u16 active_links)
{
        bool allowed;

        lockdep_assert_wiphy(local->hw.wiphy);

        if (!check_sdata_in_driver(sdata))
                return false;

        trace_drv_can_activate_links(local, sdata, active_links);

        if (local->ops->can_activate_links)
                allowed = local->ops->can_activate_links(&local->hw,
                                                         &sdata->vif,
                                                         active_links);
        else
                allowed = true;

        trace_drv_return_bool(local, allowed);

        return allowed;
}

/*
 * Link-change wrappers. Unlike the static inline helpers in this header,
 * these two are only declared here; their bodies live outside this header.
 * Both take the old and new link bitmaps as u16 values and return an int.
 */
int drv_change_vif_links(struct ieee80211_local *local,
                         struct ieee80211_sub_if_data *sdata,
                         u16 old_links, u16 new_links,
                         struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]);
int drv_change_sta_links(struct ieee80211_local *local,
                         struct ieee80211_sub_if_data *sdata,
                         struct ieee80211_sta *sta,
                         u16 old_links, u16 new_links);

/*
 * drv_can_neg_ttlm - ask the driver about a negotiated TID-to-link mapping
 * @local: mac80211 device whose ops table is used
 * @sdata: interface whose vif is handed to the driver
 * @neg_ttlm: mapping under negotiation, passed through to the driver
 *
 * Return: NEG_TTLM_RES_REJECT if check_sdata_in_driver() fails or the
 * driver has no can_neg_ttlm op; otherwise the op's result. The return
 * value is always a valid enum ieee80211_neg_ttlm_res, never an errno.
 */
static inline enum ieee80211_neg_ttlm_res
drv_can_neg_ttlm(struct ieee80211_local *local,
                 struct ieee80211_sub_if_data *sdata,
                 struct ieee80211_neg_ttlm *neg_ttlm)
{
        enum ieee80211_neg_ttlm_res res = NEG_TTLM_RES_REJECT;

        might_sleep();

        /*
         * A negative errno is not a member of the result enum, so callers
         * switching on the result could not handle it. Reject instead.
         */
        if (!check_sdata_in_driver(sdata))
                return NEG_TTLM_RES_REJECT;

        trace_drv_can_neg_ttlm(local, sdata, neg_ttlm);
        if (local->ops->can_neg_ttlm)
                res = local->ops->can_neg_ttlm(&local->hw, &sdata->vif,
                                               neg_ttlm);
        trace_drv_neg_ttlm_res(local, sdata, res, neg_ttlm);

        return res;
}
#endif /* __MAC80211_DRIVER_OPS */










































































































































































































































































    1 




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






























































































































































































































































































































































    1 




























































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Portions of this file
 * Copyright(c) 2016-2017 Intel Deutschland GmbH
 * Copyright (C) 2018, 2020-2024 Intel Corporation
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM cfg80211

#if !defined(__RDEV_OPS_TRACE) || defined(TRACE_HEADER_MULTI_READ)
#define __RDEV_OPS_TRACE

#include <linux/tracepoint.h>

#include <linux/rtnetlink.h>
#include <linux/etherdevice.h>
#include <net/cfg80211.h>
#include "core.h"

/*
 * MAC address helpers for trace entries.
 * MAC_ENTRY declares an ETH_ALEN-byte u8 array field named entry_mac.
 * MAC_ASSIGN copies ETH_ALEN bytes from given_mac into that field, or
 * zeroes the field with eth_zero_addr() when given_mac is NULL.
 * Note that given_mac is expanded more than once.
 */
#define MAC_ENTRY(entry_mac) __array(u8, entry_mac, ETH_ALEN)
#define MAC_ASSIGN(entry_mac, given_mac) do {                             \
        if (given_mac)                                                     \
                memcpy(__entry->entry_mac, given_mac, ETH_ALEN);     \
        else                                                             \
                eth_zero_addr(__entry->entry_mac);                     \
        } while (0)

/*
 * wiphy name helpers for trace entries.
 * MAXNAME is both the size of the wiphy_name array field and the bound
 * given to strscpy(), so the two cannot drift apart.
 * WIPHY_ASSIGN expects a variable named wiphy in the enclosing scope.
 */
#define MAXNAME                32
#define WIPHY_ENTRY        __array(char, wiphy_name, MAXNAME)
#define WIPHY_ASSIGN        strscpy(__entry->wiphy_name, wiphy_name(wiphy), MAXNAME)
#define WIPHY_PR_FMT        "%s"
#define WIPHY_PR_ARG        __entry->wiphy_name

/*
 * Wireless-device helpers for trace entries.
 * WDEV_ASSIGN expects a variable named wdev in the enclosing scope and
 * records wdev->identifier, or 0 when IS_ERR_OR_NULL(wdev) is true.
 */
#define WDEV_ENTRY        __field(u32, id)
#define WDEV_ASSIGN        (__entry->id) = (!IS_ERR_OR_NULL(wdev)        \
                                         ? wdev->identifier : 0)
#define WDEV_PR_FMT        "wdev(%u)"
#define WDEV_PR_ARG        (__entry->id)

/*
 * Net-device helpers for trace entries.
 * NETDEV_ENTRY declares an IFNAMSIZ-byte name array and an int ifindex.
 * NETDEV_ASSIGN expects a variable named netdev in the enclosing scope;
 * it copies IFNAMSIZ bytes of netdev->name and stores netdev->ifindex.
 * netdev is dereferenced without a NULL check.
 */
#define NETDEV_ENTRY        __array(char, name, IFNAMSIZ) \
                        __field(int, ifindex)
#define NETDEV_ASSIGN                                               \
        do {                                                       \
                memcpy(__entry->name, netdev->name, IFNAMSIZ); \
                (__entry->ifindex) = (netdev->ifindex);               \
        } while (0)
#define NETDEV_PR_FMT        "netdev:%s(%d)"
#define NETDEV_PR_ARG        __entry->name, __entry->ifindex

/*
 * Trace-entry field list for a mesh configuration: one __field() per
 * value, each giving the stored type and the field name. The companion
 * MESH_CFG_ASSIGN macro fills these fields from a variable named conf.
 */
#define MESH_CFG_ENTRY __field(u16, dot11MeshRetryTimeout)                   \
                       __field(u16, dot11MeshConfirmTimeout)                   \
                       __field(u16, dot11MeshHoldingTimeout)                   \
                       __field(u16, dot11MeshMaxPeerLinks)                   \
                       __field(u8, dot11MeshMaxRetries)                           \
                       __field(u8, dot11MeshTTL)                           \
                       __field(u8, element_ttl)                                   \
                       __field(bool, auto_open_plinks)                           \
                       __field(u32, dot11MeshNbrOffsetMaxNeighbor)           \
                       __field(u8, dot11MeshHWMPmaxPREQretries)                   \
                       __field(u32, path_refresh_time)                           \
                       __field(u32, dot11MeshHWMPactivePathTimeout)           \
                       __field(u16, min_discovery_timeout)                   \
                       __field(u16, dot11MeshHWMPpreqMinInterval)           \
                       __field(u16, dot11MeshHWMPperrMinInterval)           \
                       __field(u16, dot11MeshHWMPnetDiameterTraversalTime) \
                       __field(u8, dot11MeshHWMPRootMode)                   \
                       __field(u16, dot11MeshHWMPRannInterval)                   \
                       __field(bool, dot11MeshGateAnnouncementProtocol)           \
                       __field(bool, dot11MeshForwarding)                   \
                       __field(s32, rssi_threshold)                           \
                       __field(u16, ht_opmode)                                   \
                       __field(u32, dot11MeshHWMPactivePathToRootTimeout)  \
                       __field(u16, dot11MeshHWMProotInterval)                   \
                       __field(u16, dot11MeshHWMPconfirmationInterval)           \
                       __field(bool, dot11MeshNolearn)
/*
 * MESH_CFG_ASSIGN - copy mesh configuration values into the trace record.
 *
 * Takes no arguments: it reads from a pointer named `conf` and writes to
 * `__entry`, both of which must be in scope where the macro is expanded.
 * `conf` is dereferenced unconditionally (there is no NULL check), and each
 * member is copied to the identically named field declared by
 * MESH_CFG_ENTRY. Wrapped in do { } while (0) so it behaves as a single
 * statement.
 */
#define MESH_CFG_ASSIGN                                                              \
        do {                                                                      \
                __entry->dot11MeshRetryTimeout = conf->dot11MeshRetryTimeout; \
                __entry->dot11MeshConfirmTimeout =                              \
                                conf->dot11MeshConfirmTimeout;                      \
                __entry->dot11MeshHoldingTimeout =                              \
                                conf->dot11MeshHoldingTimeout;                      \
                __entry->dot11MeshMaxPeerLinks = conf->dot11MeshMaxPeerLinks; \
                __entry->dot11MeshMaxRetries = conf->dot11MeshMaxRetries;     \
                __entry->dot11MeshTTL = conf->dot11MeshTTL;                      \
                __entry->element_ttl = conf->element_ttl;                      \
                __entry->auto_open_plinks = conf->auto_open_plinks;              \
                __entry->dot11MeshNbrOffsetMaxNeighbor =                      \
                                conf->dot11MeshNbrOffsetMaxNeighbor;              \
                __entry->dot11MeshHWMPmaxPREQretries =                              \
                                conf->dot11MeshHWMPmaxPREQretries;              \
                __entry->path_refresh_time = conf->path_refresh_time;              \
                __entry->dot11MeshHWMPactivePathTimeout =                      \
                                conf->dot11MeshHWMPactivePathTimeout;              \
                __entry->min_discovery_timeout = conf->min_discovery_timeout; \
                __entry->dot11MeshHWMPpreqMinInterval =                              \
                                conf->dot11MeshHWMPpreqMinInterval;              \
                __entry->dot11MeshHWMPperrMinInterval =                              \
                                conf->dot11MeshHWMPperrMinInterval;              \
                __entry->dot11MeshHWMPnetDiameterTraversalTime =              \
                                conf->dot11MeshHWMPnetDiameterTraversalTime;  \
                __entry->dot11MeshHWMPRootMode = conf->dot11MeshHWMPRootMode; \
                __entry->dot11MeshHWMPRannInterval =                              \
                                conf->dot11MeshHWMPRannInterval;              \
                __entry->dot11MeshGateAnnouncementProtocol =                      \
                                conf->dot11MeshGateAnnouncementProtocol;      \
                __entry->dot11MeshForwarding = conf->dot11MeshForwarding;     \
                __entry->rssi_threshold = conf->rssi_threshold;                      \
                __entry->ht_opmode = conf->ht_opmode;                              \
                __entry->dot11MeshHWMPactivePathToRootTimeout =                      \
                                conf->dot11MeshHWMPactivePathToRootTimeout;   \
                __entry->dot11MeshHWMProotInterval =                              \
                                conf->dot11MeshHWMProotInterval;              \
                __entry->dot11MeshHWMPconfirmationInterval =                      \
                                conf->dot11MeshHWMPconfirmationInterval;      \
                __entry->dot11MeshNolearn = conf->dot11MeshNolearn;              \
        } while (0)

/*
 * CHAN_ENTRY - record fields describing a single channel: its band, its
 * center frequency and a frequency offset. Filled by CHAN_ASSIGN() and
 * printed via CHAN_PR_FMT / CHAN_PR_ARG.
 */
#define CHAN_ENTRY __field(enum nl80211_band, band) \
                   __field(u32, center_freq)                \
                   __field(u16, freq_offset)
/*
 * CHAN_ASSIGN - fill the CHAN_ENTRY fields of __entry from a channel.
 * @chan: pointer expression yielding an object with band, center_freq and
 *        freq_offset members; may be NULL.
 *
 * When @chan is NULL all three fields are written as 0 so the record never
 * carries unwritten bytes. The argument is parenthesized at every use so
 * that any pointer expression (for example &obj) expands correctly. It is
 * evaluated more than once, so it must not have side effects.
 */
#define CHAN_ASSIGN(chan)                                          \
        do {                                                          \
                if ((chan)) {                                          \
                        __entry->band = (chan)->band;                  \
                        __entry->center_freq = (chan)->center_freq; \
                        __entry->freq_offset = (chan)->freq_offset; \
                } else {                                          \
                        __entry->band = 0;                          \
                        __entry->center_freq = 0;                  \
                        __entry->freq_offset = 0;                  \
                }                                                  \
        } while (0)
/*
 * Format string and matching argument list for printing the CHAN_ENTRY
 * fields. The frequency is rendered as "<center_freq>.<freq_offset>" with
 * the offset zero-padded to three digits.
 */
#define CHAN_PR_FMT "band: %d, freq: %u.%03u"
#define CHAN_PR_ARG __entry->band, __entry->center_freq, __entry->freq_offset

/*
 * CHAN_DEF_ENTRY - record fields describing a channel definition: the
 * control channel's band/frequency/offset plus width, the two center
 * frequencies (with an offset for the first) and a puncturing bitmap.
 * Filled by CHAN_DEF_ASSIGN() and printed via CHAN_DEF_PR_FMT /
 * CHAN_DEF_PR_ARG.
 */
#define CHAN_DEF_ENTRY __field(enum nl80211_band, band)                \
                       __field(u32, control_freq)                        \
                       __field(u32, freq_offset)                        \
                       __field(u32, width)                                \
                       __field(u32, center_freq1)                        \
                       __field(u32, freq1_offset)                        \
                       __field(u32, center_freq2)                        \
                       __field(u16, punctured)
/*
 * CHAN_DEF_ASSIGN - fill the CHAN_DEF_ENTRY fields of __entry.
 * @chandef: pointer expression to a channel definition; may be NULL, and
 *           its ->chan member may be NULL.
 *
 * If either @chandef or @chandef->chan is NULL every field is written as 0.
 * Otherwise band, control_freq and freq_offset come from @chandef->chan and
 * the remaining fields from @chandef itself. The argument is evaluated
 * several times, so it must not have side effects.
 */
#define CHAN_DEF_ASSIGN(chandef)                                        \
        do {                                                                \
                if ((chandef) && (chandef)->chan) {                        \
                        __entry->band = (chandef)->chan->band;                \
                        __entry->control_freq =                                \
                                (chandef)->chan->center_freq;                \
                        __entry->freq_offset =                                \
                                (chandef)->chan->freq_offset;                \
                        __entry->width = (chandef)->width;                \
                        __entry->center_freq1 = (chandef)->center_freq1;\
                        __entry->freq1_offset = (chandef)->freq1_offset;\
                        __entry->center_freq2 = (chandef)->center_freq2;\
                        __entry->punctured = (chandef)->punctured;        \
                } else {                                                \
                        __entry->band = 0;                                \
                        __entry->control_freq = 0;                        \
                        __entry->freq_offset = 0;                        \
                        __entry->width = 0;                                \
                        __entry->center_freq1 = 0;                        \
                        __entry->freq1_offset = 0;                        \
                        __entry->center_freq2 = 0;                        \
                        __entry->punctured = 0;                                \
                }                                                        \
        } while (0)
/*
 * Format string and matching argument list for printing the CHAN_DEF_ENTRY
 * fields. The control frequency and cf1 are each printed as
 * "<freq>.<offset>" with the offset zero-padded to three digits; the
 * puncturing bitmap is printed in hex. The argument order in
 * CHAN_DEF_PR_ARG must match the conversions in CHAN_DEF_PR_FMT.
 */
#define CHAN_DEF_PR_FMT                                                        \
        "band: %d, control freq: %u.%03u, width: %d, cf1: %u.%03u, cf2: %u, punct: 0x%x"
#define CHAN_DEF_PR_ARG __entry->band, __entry->control_freq,                \
                        __entry->freq_offset, __entry->width,                \
                        __entry->center_freq1, __entry->freq1_offset,        \
                        __entry->center_freq2, __entry->punctured

/*
 * FILS_AAD_ASSIGN - fill the macaddr and kek_len fields of __entry.
 * @fa: pointer expression yielding an object with macaddr and kek_len
 *      members; may be NULL.
 *
 * When @fa is NULL the address is zeroed and kek_len is written as 0. The
 * argument is parenthesized at every use so that any pointer expression
 * expands correctly. It is evaluated more than once, so it must not have
 * side effects.
 */
#define FILS_AAD_ASSIGN(fa)                                                \
        do {                                                                \
                if ((fa)) {                                                \
                        ether_addr_copy(__entry->macaddr, (fa)->macaddr);        \
                        __entry->kek_len = (fa)->kek_len;                        \
                } else {                                                \
                        eth_zero_addr(__entry->macaddr);                \
                        __entry->kek_len = 0;                                \
                }                                                        \
        } while (0)
/* Format string for the two fields written by FILS_AAD_ASSIGN(). */
#define FILS_AAD_PR_FMT                                                        \
        "macaddr: %pM, kek_len: %d"

/*
 * SINFO_ENTRY - record fields holding a snapshot of station information.
 *
 * Expands to one __field() per value copied by SINFO_ASSIGN; the two lists
 * must stay in step.
 */
#define SINFO_ENTRY __field(int, generation)            \
                    __field(u32, connected_time)    \
                    __field(u32, inactive_time)            \
                    __field(u32, rx_bytes)            \
                    __field(u32, tx_bytes)            \
                    __field(u32, rx_packets)            \
                    __field(u32, tx_packets)            \
                    __field(u32, tx_retries)            \
                    __field(u32, tx_failed)            \
                    __field(u32, rx_dropped_misc)   \
                    __field(u32, beacon_loss_count) \
                    __field(u16, llid)                    \
                    __field(u16, plid)                    \
                    __field(u8, plink_state)
/*
 * SINFO_ASSIGN - copy station information into the trace record.
 *
 * Takes no arguments: it reads from a pointer named `sinfo` and writes to
 * `__entry`, both of which must be in scope where the macro is expanded.
 * `sinfo` is dereferenced unconditionally (no NULL check). Values are
 * stored into the field types declared by SINFO_ENTRY.
 * NOTE(review): if any source member is wider than its destination field
 * the copy truncates -- confirm against the source structure definition.
 */
#define SINFO_ASSIGN                                                       \
        do {                                                               \
                __entry->generation = sinfo->generation;               \
                __entry->connected_time = sinfo->connected_time;       \
                __entry->inactive_time = sinfo->inactive_time;               \
                __entry->rx_bytes = sinfo->rx_bytes;                       \
                __entry->tx_bytes = sinfo->tx_bytes;                       \
                __entry->rx_packets = sinfo->rx_packets;               \
                __entry->tx_packets = sinfo->tx_packets;               \
                __entry->tx_retries = sinfo->tx_retries;               \
                __entry->tx_failed = sinfo->tx_failed;                       \
                __entry->rx_dropped_misc = sinfo->rx_dropped_misc;     \
                __entry->beacon_loss_count = sinfo->beacon_loss_count; \
                __entry->llid = sinfo->llid;                               \
                __entry->plid = sinfo->plid;                               \
                __entry->plink_state = sinfo->plink_state;               \
        } while (0)

/*
 * BOOL_TO_STR - map a truth value to the string literal "true" or "false".
 *
 * The whole expansion is parenthesized so the conditional cannot bind to
 * operators surrounding the macro invocation (e.g. BOOL_TO_STR(x)[0]).
 */
#define BOOL_TO_STR(bo) ((bo) ? "true" : "false")

/*
 * QOS_MAP_ENTRY - record fields for a QoS map: a count (num_des), a byte
 * array of 2 * IEEE80211_QOS_MAP_MAX_EX entries for the DSCP exceptions,
 * and a byte array of IEEE80211_QOS_MAP_LEN_MIN entries for `up`.
 * Filled by QOS_MAP_ASSIGN(), which uses the same two sizes.
 */
#define QOS_MAP_ENTRY __field(u8, num_des)                        \
                      __array(u8, dscp_exception,                \
                              2 * IEEE80211_QOS_MAP_MAX_EX)        \
                      __array(u8, up, IEEE80211_QOS_MAP_LEN_MIN)
/*
 * QOS_MAP_ASSIGN - fill the QOS_MAP_ENTRY fields of __entry.
 * @qos_map: pointer expression to the source map; may be NULL.
 *
 * With a non-NULL map, num_des is copied and the two arrays are copied
 * byte-for-byte using the fixed sizes from QOS_MAP_ENTRY (the full arrays
 * are copied regardless of num_des). With a NULL map, all three fields are
 * zeroed. The argument is evaluated several times, so it must not have
 * side effects.
 */
#define QOS_MAP_ASSIGN(qos_map)                                        \
        do {                                                        \
                if ((qos_map)) {                                \
                        __entry->num_des = (qos_map)->num_des;        \
                        memcpy(__entry->dscp_exception,                \
                               &(qos_map)->dscp_exception,        \
                               2 * IEEE80211_QOS_MAP_MAX_EX);        \
                        memcpy(__entry->up, &(qos_map)->up,        \
                               IEEE80211_QOS_MAP_LEN_MIN);        \
                } else {                                        \
                        __entry->num_des = 0;                        \
                        memset(__entry->dscp_exception, 0,        \
                               2 * IEEE80211_QOS_MAP_MAX_EX);        \
                        memset(__entry->up, 0,                        \
                               IEEE80211_QOS_MAP_LEN_MIN);        \
                }                                                \
        } while (0)

/*************************************************************
 *                        wiphy work traces                     *
 *************************************************************/

/*
 * Event class shared by the wiphy work tracepoints.
 *
 * Records the wiphy (via the WIPHY_* helpers, defined outside this part of
 * the file), the address of the work item and the address of its handler.
 * @work may be NULL, in which case both instance and func are recorded as
 * NULL. The handler is printed with %pS.
 */
DECLARE_EVENT_CLASS(wiphy_work_event,
        TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work),
        TP_ARGS(wiphy, work),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(void *, instance)
                __field(void *, func)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->instance = work;
                /* tolerate a NULL work item instead of dereferencing it */
                __entry->func = work ? work->func : NULL;
        ),
        TP_printk(WIPHY_PR_FMT " instance=%p func=%pS",
                  WIPHY_PR_ARG, __entry->instance, __entry->func)
);

/*
 * Tracepoints built on the wiphy_work_event class: wiphy_work_queue,
 * wiphy_work_run, wiphy_work_cancel and wiphy_work_flush. All four take the
 * same (wiphy, work) arguments and share the class's record layout and
 * output format; only the event name differs.
 */
DEFINE_EVENT(wiphy_work_event, wiphy_work_queue,
        TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work),
        TP_ARGS(wiphy, work)
);

DEFINE_EVENT(wiphy_work_event, wiphy_work_run,
        TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work),
        TP_ARGS(wiphy, work)
);

DEFINE_EVENT(wiphy_work_event, wiphy_work_cancel,
        TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work),
        TP_ARGS(wiphy, work)
);

DEFINE_EVENT(wiphy_work_event, wiphy_work_flush,
        TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work),
        TP_ARGS(wiphy, work)
);

/*
 * Tracepoint for queueing a delayed wiphy work item.
 *
 * Records the wiphy, the address of the work item, the address of its
 * handler and the requested delay. The handler lookup tolerates a NULL
 * @work in the same way the wiphy_work_event class does. @delay is an
 * unsigned long and is therefore printed with %lu.
 */
TRACE_EVENT(wiphy_delayed_work_queue,
        TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work,
                 unsigned long delay),
        TP_ARGS(wiphy, work, delay),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(void *, instance)
                __field(void *, func)
                __field(unsigned long, delay)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->instance = work;
                /* match wiphy_work_event: never dereference a NULL work */
                __entry->func = work ? work->func : NULL;
                __entry->delay = delay;
        ),
        TP_printk(WIPHY_PR_FMT " instance=%p func=%pS delay=%lu",
                  WIPHY_PR_ARG, __entry->instance, __entry->func,
                  __entry->delay)
);

/*
 * Tracepoint wiphy_work_worker_start: records and prints only the wiphy
 * (via the WIPHY_* helpers, defined outside this part of the file).
 */
TRACE_EVENT(wiphy_work_worker_start,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy),
        TP_STRUCT__entry(
                WIPHY_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG)
);

/*************************************************************
 *                        rdev->ops traces                     *
 *************************************************************/

/*
 * Tracepoint rdev_suspend: records the wiphy and the wake-on-WLAN trigger
 * flags passed to the suspend operation.
 *
 * @wow may be NULL. In that case valid_wow is false, the output carries the
 * "(Not configured!)" marker, and every trigger flag is explicitly recorded
 * as false. TP_printk prints the flags unconditionally, so leaving them
 * unwritten would emit whatever bytes were previously in the record.
 */
TRACE_EVENT(rdev_suspend,
        TP_PROTO(struct wiphy *wiphy, struct cfg80211_wowlan *wow),
        TP_ARGS(wiphy, wow),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(bool, any)
                __field(bool, disconnect)
                __field(bool, magic_pkt)
                __field(bool, gtk_rekey_failure)
                __field(bool, eap_identity_req)
                __field(bool, four_way_handshake)
                __field(bool, rfkill_release)
                __field(bool, valid_wow)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                if (wow) {
                        __entry->any = wow->any;
                        __entry->disconnect = wow->disconnect;
                        __entry->magic_pkt = wow->magic_pkt;
                        __entry->gtk_rekey_failure = wow->gtk_rekey_failure;
                        __entry->eap_identity_req = wow->eap_identity_req;
                        __entry->four_way_handshake = wow->four_way_handshake;
                        __entry->rfkill_release = wow->rfkill_release;
                        __entry->valid_wow = true;
                } else {
                        /* no WoWLAN config: give every printed flag a value */
                        __entry->any = false;
                        __entry->disconnect = false;
                        __entry->magic_pkt = false;
                        __entry->gtk_rekey_failure = false;
                        __entry->eap_identity_req = false;
                        __entry->four_way_handshake = false;
                        __entry->rfkill_release = false;
                        __entry->valid_wow = false;
                }
        ),
        TP_printk(WIPHY_PR_FMT ", wow%s - any: %d, disconnect: %d, "
                  "magic pkt: %d, gtk rekey failure: %d, eap identify req: %d, "
                  "four way handshake: %d, rfkill release: %d.",
                  WIPHY_PR_ARG, __entry->valid_wow ? "" : "(Not configured!)",
                  __entry->any, __entry->disconnect, __entry->magic_pkt,
                  __entry->gtk_rekey_failure, __entry->eap_identity_req,
                  __entry->four_way_handshake, __entry->rfkill_release)
);

/*
 * Tracepoint rdev_return_int: records the wiphy and an integer result,
 * printed as "returned: <ret>".
 */
TRACE_EVENT(rdev_return_int,
        TP_PROTO(struct wiphy *wiphy, int ret),
        TP_ARGS(wiphy, ret),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->ret = ret;
        ),
        TP_printk(WIPHY_PR_FMT ", returned: %d", WIPHY_PR_ARG, __entry->ret)
);

/*
 * Tracepoint rdev_scan: records and prints only the wiphy.
 *
 * The @request argument is part of the tracepoint prototype but none of
 * its contents are stored in the record.
 */
TRACE_EVENT(rdev_scan,
        TP_PROTO(struct wiphy *wiphy, struct cfg80211_scan_request *request),
        TP_ARGS(wiphy, request),
        TP_STRUCT__entry(
                WIPHY_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG)
);

/*
 * Event class for tracepoints whose only argument is the wiphy; the record
 * and the printed line contain nothing but the WIPHY_* helper output.
 */
DECLARE_EVENT_CLASS(wiphy_only_evt,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy),
        TP_STRUCT__entry(
                WIPHY_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG)
);

/*
 * Tracepoints built on the wiphy_only_evt class: rdev_resume,
 * rdev_return_void, rdev_get_antenna and rdev_rfkill_poll. Each takes only
 * the wiphy and shares the class's record layout and output format.
 */
DEFINE_EVENT(wiphy_only_evt, rdev_resume,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy)
);

DEFINE_EVENT(wiphy_only_evt, rdev_return_void,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy)
);

DEFINE_EVENT(wiphy_only_evt, rdev_get_antenna,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy)
);

DEFINE_EVENT(wiphy_only_evt, rdev_rfkill_poll,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy)
);

/*
 * Event class recording the wiphy together with a boolean `enabled` flag.
 * The flag is printed as "enabled " when true and "not enabled " when
 * false (the format string ends with a trailing space).
 */
DECLARE_EVENT_CLASS(wiphy_enabled_evt,
        TP_PROTO(struct wiphy *wiphy, bool enabled),
        TP_ARGS(wiphy, enabled),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(bool, enabled)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->enabled = enabled;
        ),
        TP_printk(WIPHY_PR_FMT ", %senabled ",
                  WIPHY_PR_ARG, __entry->enabled ? "" : "not ")
);

/*
 * Tracepoint rdev_set_wakeup, built on the wiphy_enabled_evt class: records
 * the wiphy and the `enabled` flag.
 */
DEFINE_EVENT(wiphy_enabled_evt, rdev_set_wakeup,
        TP_PROTO(struct wiphy *wiphy, bool enabled),
        TP_ARGS(wiphy, enabled)
);

/*
 * Tracepoint rdev_add_virtual_intf: records the wiphy, the requested
 * interface name and the interface type.
 *
 * @name may be NULL; the literal "<noname>" is recorded in that case. The
 * type is printed as its integer value.
 */
TRACE_EVENT(rdev_add_virtual_intf,
        TP_PROTO(struct wiphy *wiphy, char *name, enum nl80211_iftype type),
        TP_ARGS(wiphy, name, type),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __string(vir_intf_name, name ? name : "<noname>")
                __field(enum nl80211_iftype, type)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __assign_str(vir_intf_name);
                __entry->type = type;
        ),
        TP_printk(WIPHY_PR_FMT ", virtual intf name: %s, type: %d",
                  WIPHY_PR_ARG, __get_str(vir_intf_name), __entry->type)
);

/*
 * Event class for tracepoints taking a wiphy and a wireless device. The
 * record and the printed line contain the output of the WIPHY_* and WDEV_*
 * helpers (both defined outside this part of the file), separated by ", ".
 */
DECLARE_EVENT_CLASS(wiphy_wdev_evt,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
);

/*
 * Event class for tracepoints taking a wiphy, a wireless device and a
 * 64-bit cookie.
 *
 * The cookie is stored as u64 and passed to the formatter as
 * unsigned long long, so it is printed with the unsigned %llu conversion;
 * a signed conversion would show values with the top bit set as negative.
 */
DECLARE_EVENT_CLASS(wiphy_wdev_cookie_evt,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
        TP_ARGS(wiphy, wdev, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %llu",
                  WIPHY_PR_ARG, WDEV_PR_ARG,
                  (unsigned long long)__entry->cookie)
);

/*
 * Tracepoints built on the wiphy_wdev_evt class: rdev_return_wdev and
 * rdev_del_virtual_intf. Both take (wiphy, wdev) and share the class's
 * record layout and output format.
 */
DEFINE_EVENT(wiphy_wdev_evt, rdev_return_wdev,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

DEFINE_EVENT(wiphy_wdev_evt, rdev_del_virtual_intf,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * Tracepoint rdev_change_virtual_intf: records the wiphy, the net device
 * (via the NETDEV_* helpers, defined outside this part of the file) and
 * the interface type, which is printed as its integer value.
 */
TRACE_EVENT(rdev_change_virtual_intf,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 enum nl80211_iftype type),
        TP_ARGS(wiphy, netdev, type),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(enum nl80211_iftype, type)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->type = type;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", type: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->type)
);

/*
 * Event class for key operations identified by (link_id, key_index,
 * pairwise, mac_addr).
 *
 * Records the wiphy, the net device, the MAC address (via MAC_ENTRY /
 * MAC_ASSIGN, defined outside this part of the file), the link id, the key
 * index and the pairwise flag. The pairwise flag is printed as
 * "true"/"false" through BOOL_TO_STR and the address with %pM.
 * NOTE(review): whether a NULL @mac_addr is handled depends on MAC_ASSIGN,
 * which is not visible here -- confirm.
 */
DECLARE_EVENT_CLASS(key_handle,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id,
                 u8 key_index, bool pairwise, const u8 *mac_addr),
        TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(mac_addr)
                __field(int, link_id)
                __field(u8, key_index)
                __field(bool, pairwise)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(mac_addr, mac_addr);
                __entry->link_id = link_id;
                __entry->key_index = key_index;
                __entry->pairwise = pairwise;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, "
                  "key_index: %u, pairwise: %s, mac addr: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id,
                  __entry->key_index, BOOL_TO_STR(__entry->pairwise),
                  __entry->mac_addr)
);

/*
 * Tracepoints built on the key_handle class: rdev_get_key and
 * rdev_del_key. Both take the same arguments as the class and share its
 * record layout and output format.
 */
DEFINE_EVENT(key_handle, rdev_get_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id,
                 u8 key_index, bool pairwise, const u8 *mac_addr),
        TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr)
);

DEFINE_EVENT(key_handle, rdev_del_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id,
                 u8 key_index, bool pairwise, const u8 *mac_addr),
        TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr)
);

/*
 * Tracepoint rdev_add_key: same information as the key_handle class plus a
 * u8 `mode` value.
 *
 * Records the wiphy, the net device, the MAC address, the link id, the key
 * index, the pairwise flag and the mode. Note the print order differs from
 * the field order: mode is printed before pairwise. No key material is
 * recorded.
 */
TRACE_EVENT(rdev_add_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id,
                 u8 key_index, bool pairwise, const u8 *mac_addr, u8 mode),
        TP_ARGS(wiphy, netdev, link_id, key_index, pairwise, mac_addr, mode),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(mac_addr)
                __field(int, link_id)
                __field(u8, key_index)
                __field(bool, pairwise)
                __field(u8, mode)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(mac_addr, mac_addr);
                __entry->link_id = link_id;
                __entry->key_index = key_index;
                __entry->pairwise = pairwise;
                __entry->mode = mode;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, "
                  "key_index: %u, mode: %u, pairwise: %s, "
                  "mac addr: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id,
                  __entry->key_index, __entry->mode,
                  BOOL_TO_STR(__entry->pairwise), __entry->mac_addr)
);

/*
 * Tracepoint rdev_set_default_key: records the wiphy, the net device, the
 * link id, the key index and the unicast/multicast flags. Both flags are
 * printed as "true"/"false" through BOOL_TO_STR.
 */
TRACE_EVENT(rdev_set_default_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id,
                 u8 key_index, bool unicast, bool multicast),
        TP_ARGS(wiphy, netdev, link_id, key_index, unicast, multicast),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(int, link_id)
                __field(u8, key_index)
                __field(bool, unicast)
                __field(bool, multicast)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->link_id = link_id;
                __entry->key_index = key_index;
                __entry->unicast = unicast;
                __entry->multicast = multicast;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, "
                  "key index: %u, unicast: %s, multicast: %s",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id,
                  __entry->key_index, BOOL_TO_STR(__entry->unicast),
                  BOOL_TO_STR(__entry->multicast))
);

/*
 * Tracepoint rdev_set_default_mgmt_key: records the wiphy, the net device,
 * the link id and the key index. Its record layout and output format are
 * the same as rdev_set_default_beacon_key.
 */
TRACE_EVENT(rdev_set_default_mgmt_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id,
                 u8 key_index),
        TP_ARGS(wiphy, netdev, link_id, key_index),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(int, link_id)
                __field(u8, key_index)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->link_id = link_id;
                __entry->key_index = key_index;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, "
                  "key index: %u", WIPHY_PR_ARG, NETDEV_PR_ARG,
                  __entry->link_id, __entry->key_index)
);

/*
 * Tracepoint rdev_set_default_beacon_key: records the wiphy, the net
 * device, the link id and the key index. Its record layout and output
 * format are the same as rdev_set_default_mgmt_key.
 */
TRACE_EVENT(rdev_set_default_beacon_key,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int link_id,
                 u8 key_index),
        TP_ARGS(wiphy, netdev, link_id, key_index),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(int, link_id)
                __field(u8, key_index)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->link_id = link_id;
                __entry->key_index = key_index;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d, "
                  "key index: %u", WIPHY_PR_ARG, NETDEV_PR_ARG,
                  __entry->link_id, __entry->key_index)
);

/*
 * Tracepoint rdev_start_ap: records the AP settings handed to the start_ap
 * operation.
 *
 * Captured: wiphy, net device, channel definition (CHAN_DEF_* helpers),
 * beacon interval, DTIM period, SSID, hidden-SSID mode, WPA versions,
 * privacy flag, auth type, inactivity timeout and link id.
 *
 * The SSID buffer is one byte longer than IEEE80211_MAX_SSID_LEN and is
 * zeroed before the copy, so it is NUL-terminated for %s as long as
 * settings->ssid_len does not exceed IEEE80211_MAX_SSID_LEN.
 * NOTE(review): that bound is not checked here -- it is assumed to be
 * enforced by the caller; confirm.
 *
 * link_id is stored as unsigned int and printed with %u accordingly.
 */
TRACE_EVENT(rdev_start_ap,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_ap_settings *settings),
        TP_ARGS(wiphy, netdev, settings),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
                __field(int, beacon_interval)
                __field(int, dtim_period)
                __array(char, ssid, IEEE80211_MAX_SSID_LEN + 1)
                __field(enum nl80211_hidden_ssid, hidden_ssid)
                __field(u32, wpa_ver)
                __field(bool, privacy)
                __field(enum nl80211_auth_type, auth_type)
                __field(int, inactivity_timeout)
                __field(unsigned int, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(&settings->chandef);
                __entry->beacon_interval = settings->beacon_interval;
                __entry->dtim_period = settings->dtim_period;
                __entry->hidden_ssid = settings->hidden_ssid;
                __entry->wpa_ver = settings->crypto.wpa_versions;
                __entry->privacy = settings->privacy;
                __entry->auth_type = settings->auth_type;
                __entry->inactivity_timeout = settings->inactivity_timeout;
                /* zero first so the copied SSID is always NUL-terminated */
                memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
                memcpy(__entry->ssid, settings->ssid, settings->ssid_len);
                __entry->link_id = settings->beacon.link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", AP settings - ssid: %s, "
                  CHAN_DEF_PR_FMT ", beacon interval: %d, dtim period: %d, "
                  "hidden ssid: %d, wpa versions: %u, privacy: %s, "
                  "auth type: %d, inactivity timeout: %d, link_id: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->ssid, CHAN_DEF_PR_ARG,
                  __entry->beacon_interval, __entry->dtim_period,
                  __entry->hidden_ssid, __entry->wpa_ver,
                  BOOL_TO_STR(__entry->privacy), __entry->auth_type,
                  __entry->inactivity_timeout, __entry->link_id)
);

/*
 * Tracepoint rdev_change_beacon: records the wiphy, the net device, the
 * link id and raw copies of six beacon-related buffers from info->beacon
 * (head, tail, beacon_ies, proberesp_ies, assocresp_ies, probe_resp).
 *
 * Each buffer gets a dynamic array sized by its own *_len member and is
 * copied only when the corresponding source pointer is non-NULL. When a
 * pointer is NULL the reserved bytes (if any) are not written.
 *
 * Only the wiphy, net device and link id appear in the TP_printk output;
 * the buffers are stored in the record but not printed.
 */
TRACE_EVENT(rdev_change_beacon,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_ap_update *info),
        TP_ARGS(wiphy, netdev, info),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(int, link_id)
                __dynamic_array(u8, head, info->beacon.head_len)
                __dynamic_array(u8, tail, info->beacon.tail_len)
                __dynamic_array(u8, beacon_ies, info->beacon.beacon_ies_len)
                __dynamic_array(u8, proberesp_ies, info->beacon.proberesp_ies_len)
                __dynamic_array(u8, assocresp_ies, info->beacon.assocresp_ies_len)
                __dynamic_array(u8, probe_resp, info->beacon.probe_resp_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->link_id = info->beacon.link_id;
                /* each copy below is skipped when its source pointer is NULL */
                if (info->beacon.head)
                        memcpy(__get_dynamic_array(head),
                               info->beacon.head,
                               info->beacon.head_len);
                if (info->beacon.tail)
                        memcpy(__get_dynamic_array(tail),
                               info->beacon.tail,
                               info->beacon.tail_len);
                if (info->beacon.beacon_ies)
                        memcpy(__get_dynamic_array(beacon_ies),
                               info->beacon.beacon_ies,
                               info->beacon.beacon_ies_len);
                if (info->beacon.proberesp_ies)
                        memcpy(__get_dynamic_array(proberesp_ies),
                               info->beacon.proberesp_ies,
                               info->beacon.proberesp_ies_len);
                if (info->beacon.assocresp_ies)
                        memcpy(__get_dynamic_array(assocresp_ies),
                               info->beacon.assocresp_ies,
                               info->beacon.assocresp_ies_len);
                if (info->beacon.probe_resp)
                        memcpy(__get_dynamic_array(probe_resp),
                               info->beacon.probe_resp,
                               info->beacon.probe_resp_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id:%d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id)
);

/*
 * Tracepoint rdev_stop_ap: records the wiphy, the net device and the link
 * id. The link id is an unsigned int and is printed with %u accordingly.
 */
TRACE_EVENT(rdev_stop_ap,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 unsigned int link_id),
        TP_ARGS(wiphy, netdev, link_id),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(unsigned int, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->link_id = link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id)
);

/*
 * Event class for tracepoints taking a wiphy and a net device. The record
 * and the printed line contain the output of the WIPHY_* and NETDEV_*
 * helpers (both defined outside this part of the file), separated by ", ".
 */
DECLARE_EVENT_CLASS(wiphy_netdev_evt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG)
);

DEFINE_EVENT(wiphy_netdev_evt, rdev_set_rekey_data,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

DEFINE_EVENT(wiphy_netdev_evt, rdev_get_mesh_config,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_mesh,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

/* rdev_leave_ibss: wiphy_netdev_evt instance (wiphy + netdev only). */
DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_ibss,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

/* rdev_leave_ocb: wiphy_netdev_evt instance (wiphy + netdev only). */
DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_ocb,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

/* rdev_flush_pmksa: wiphy_netdev_evt instance (wiphy + netdev only). */
DEFINE_EVENT(wiphy_netdev_evt, rdev_flush_pmksa,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
        TP_ARGS(wiphy, netdev)
);

/* rdev_end_cac: wiphy_netdev_evt instance (wiphy + netdev only). */
DEFINE_EVENT(wiphy_netdev_evt, rdev_end_cac,
             TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
             TP_ARGS(wiphy, netdev)
);

/*
 * station_add_change - event class that snapshots a station_parameters
 * structure together with the wiphy, net device and station MAC address.
 *
 * Fixed-size buffers (ht_capa, vht_capa, vlan) are zeroed first and filled
 * only when the corresponding source pointer is non-NULL.  The dynamic
 * arrays are sized from the matching *_len members of @params and are copied
 * only when both the pointer and the length are non-zero.
 *
 * TP_printk prints only a subset of the record (MAC, flag masks, listen
 * interval, aid, plink action/state, uapsd queues, vlan name); the remaining
 * captured fields are stored in the record but not formatted here.
 */
DECLARE_EVENT_CLASS(station_add_change,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac,
                 struct station_parameters *params),
        TP_ARGS(wiphy, netdev, mac, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(sta_mac)
                __field(u32, sta_flags_mask)
                __field(u32, sta_flags_set)
                __field(u32, sta_modify_mask)
                __field(int, listen_interval)
                __field(u16, capability)
                __field(u16, aid)
                __field(u8, plink_action)
                __field(u8, plink_state)
                __field(u8, uapsd_queues)
                __field(u8, max_sp)
                __field(u8, opmode_notif)
                __field(bool, opmode_notif_used)
                __array(u8, ht_capa, (int)sizeof(struct ieee80211_ht_cap))
                __array(u8, vht_capa, (int)sizeof(struct ieee80211_vht_cap))
                __array(char, vlan, IFNAMSIZ)
                __dynamic_array(u8, supported_rates,
                                params->link_sta_params.supported_rates_len)
                __dynamic_array(u8, ext_capab, params->ext_capab_len)
                __dynamic_array(u8, supported_channels,
                                params->supported_channels_len)
                __dynamic_array(u8, supported_oper_classes,
                                params->supported_oper_classes_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(sta_mac, mac);
                __entry->sta_flags_mask = params->sta_flags_mask;
                __entry->sta_flags_set = params->sta_flags_set;
                __entry->sta_modify_mask = params->sta_modify_mask;
                __entry->listen_interval = params->listen_interval;
                __entry->aid = params->aid;
                __entry->plink_action = params->plink_action;
                __entry->plink_state = params->plink_state;
                __entry->uapsd_queues = params->uapsd_queues;
                /* Capability blobs: all-zero when the caller supplied none. */
                memset(__entry->ht_capa, 0, sizeof(struct ieee80211_ht_cap));
                if (params->link_sta_params.ht_capa)
                        memcpy(__entry->ht_capa,
                               params->link_sta_params.ht_capa,
                               sizeof(struct ieee80211_ht_cap));
                memset(__entry->vht_capa, 0, sizeof(struct ieee80211_vht_cap));
                if (params->link_sta_params.vht_capa)
                        memcpy(__entry->vht_capa,
                               params->link_sta_params.vht_capa,
                               sizeof(struct ieee80211_vht_cap));
                /* VLAN name: empty string when no VLAN device is given. */
                memset(__entry->vlan, 0, sizeof(__entry->vlan));
                if (params->vlan)
                        memcpy(__entry->vlan, params->vlan->name, IFNAMSIZ);
                /* Variable-length data: skip when pointer or length is 0. */
                if (params->link_sta_params.supported_rates &&
                    params->link_sta_params.supported_rates_len)
                        memcpy(__get_dynamic_array(supported_rates),
                               params->link_sta_params.supported_rates,
                               params->link_sta_params.supported_rates_len);
                if (params->ext_capab && params->ext_capab_len)
                        memcpy(__get_dynamic_array(ext_capab),
                               params->ext_capab,
                               params->ext_capab_len);
                if (params->supported_channels &&
                    params->supported_channels_len)
                        memcpy(__get_dynamic_array(supported_channels),
                               params->supported_channels,
                               params->supported_channels_len);
                if (params->supported_oper_classes &&
                    params->supported_oper_classes_len)
                        memcpy(__get_dynamic_array(supported_oper_classes),
                               params->supported_oper_classes,
                               params->supported_oper_classes_len);
                __entry->max_sp = params->max_sp;
                __entry->capability = params->capability;
                __entry->opmode_notif = params->link_sta_params.opmode_notif;
                __entry->opmode_notif_used =
                        params->link_sta_params.opmode_notif_used;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM"
                  ", station flags mask: 0x%x, station flags set: 0x%x, "
                  "station modify mask: 0x%x, listen interval: %d, aid: %u, "
                  "plink action: %u, plink state: %u, uapsd queues: %u, vlan:%s",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac,
                  __entry->sta_flags_mask, __entry->sta_flags_set,
                  __entry->sta_modify_mask, __entry->listen_interval,
                  __entry->aid, __entry->plink_action, __entry->plink_state,
                  __entry->uapsd_queues, __entry->vlan)
);

/* rdev_add_station: station_add_change instance (wiphy, netdev, MAC, params). */
DEFINE_EVENT(station_add_change, rdev_add_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac,
                 struct station_parameters *params),
        TP_ARGS(wiphy, netdev, mac, params)
);

/* rdev_change_station: station_add_change instance (wiphy, netdev, MAC, params). */
DEFINE_EVENT(station_add_change, rdev_change_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac,
                 struct station_parameters *params),
        TP_ARGS(wiphy, netdev, mac, params)
);

/*
 * wiphy_netdev_mac_evt - event class for events that carry a wiphy, a net
 * device and one MAC address.
 *
 * The address is stored in the sta_mac field and printed with %pM after the
 * wiphy and netdev.
 */
DECLARE_EVENT_CLASS(wiphy_netdev_mac_evt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
        TP_ARGS(wiphy, netdev, mac),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(sta_mac)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(sta_mac, mac);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mac: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac)
);

/*
 * station_del - event class that records a station_del_parameters request.
 *
 * Captures the wiphy and netdev plus the mac, subtype, reason_code and
 * link_id members of @params, and prints all of them.  link_id is stored and
 * printed as a signed int.
 */
DECLARE_EVENT_CLASS(station_del,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct station_del_parameters *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(sta_mac)
                __field(u8, subtype)
                __field(u16, reason_code)
                __field(int, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(sta_mac, params->mac);
                __entry->subtype = params->subtype;
                __entry->reason_code = params->reason_code;
                __entry->link_id = params->link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM"
                  ", subtype: %u, reason_code: %u, link_id: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac,
                  __entry->subtype, __entry->reason_code,
                  __entry->link_id)
);

/* rdev_del_station: station_del instance (wiphy, netdev, deletion params). */
DEFINE_EVENT(station_del, rdev_del_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct station_del_parameters *params),
        TP_ARGS(wiphy, netdev, params)
);

/* rdev_get_station: wiphy_netdev_mac_evt instance (wiphy, netdev, MAC). */
DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_get_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
        TP_ARGS(wiphy, netdev, mac)
);

/* rdev_del_mpath: wiphy_netdev_mac_evt instance (wiphy, netdev, MAC). */
DEFINE_EVENT(wiphy_netdev_mac_evt, rdev_del_mpath,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac),
        TP_ARGS(wiphy, netdev, mac)
);

/*
 * rdev_dump_station - trace event recording a wiphy, net device, station MAC
 * address and an integer index.
 *
 * The index argument is named _idx so it does not clash with the idx record
 * field it is stored into.
 */
TRACE_EVENT(rdev_dump_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx,
                 u8 *mac),
        TP_ARGS(wiphy, netdev, _idx, mac),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(sta_mac)
                __field(int, idx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(sta_mac, mac);
                __entry->idx = _idx;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM, idx: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->sta_mac,
                  __entry->idx)
);

/*
 * rdev_return_int_station_info - trace event recording an integer return
 * value together with station information.
 *
 * SINFO_ENTRY / SINFO_ASSIGN (defined outside this block) store data taken
 * from @sinfo in the record, but TP_printk formats only the wiphy and @ret.
 */
TRACE_EVENT(rdev_return_int_station_info,
        TP_PROTO(struct wiphy *wiphy, int ret, struct station_info *sinfo),
        TP_ARGS(wiphy, ret, sinfo),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
                SINFO_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->ret = ret;
                SINFO_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", returned %d" ,
                  WIPHY_PR_ARG, __entry->ret)
);

/*
 * mpath_evt - event class for events that carry a wiphy, a net device and
 * two MAC addresses: a destination and a next hop.
 *
 * Both addresses are stored in the record and printed with %pM.
 */
DECLARE_EVENT_CLASS(mpath_evt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst,
                 u8 *next_hop),
        TP_ARGS(wiphy, netdev, dst, next_hop),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dst)
                MAC_ENTRY(next_hop)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dst, dst);
                MAC_ASSIGN(next_hop, next_hop);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", destination: %pM, next hop: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->dst,
                  __entry->next_hop)
);

/* rdev_add_mpath: mpath_evt instance (wiphy, netdev, destination, next hop). */
DEFINE_EVENT(mpath_evt, rdev_add_mpath,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst,
                 u8 *next_hop),
        TP_ARGS(wiphy, netdev, dst, next_hop)
);

/* rdev_change_mpath: mpath_evt instance (wiphy, netdev, destination, next hop). */
DEFINE_EVENT(mpath_evt, rdev_change_mpath,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst,
                 u8 *next_hop),
        TP_ARGS(wiphy, netdev, dst, next_hop)
);

/* rdev_get_mpath: mpath_evt instance (wiphy, netdev, destination, next hop). */
DEFINE_EVENT(mpath_evt, rdev_get_mpath,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *dst,
                 u8 *next_hop),
        TP_ARGS(wiphy, netdev, dst, next_hop)
);

/*
 * rdev_dump_mpath - trace event recording a wiphy, net device, an integer
 * index and two MAC addresses (destination and next hop).
 *
 * The index argument is named _idx so it does not clash with the idx record
 * field.  The index is printed before the two addresses.
 */
TRACE_EVENT(rdev_dump_mpath,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx,
                 u8 *dst, u8 *next_hop),
        TP_ARGS(wiphy, netdev, _idx, dst, next_hop),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dst)
                MAC_ENTRY(next_hop)
                __field(int, idx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dst, dst);
                MAC_ASSIGN(next_hop, next_hop);
                __entry->idx = _idx;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: %pM, next hop: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx, __entry->dst,
                  __entry->next_hop)
);

/*
 * rdev_get_mpp - trace event recording a wiphy, net device and two MAC
 * addresses, labelled "destination" and "mpp" in the output.
 */
TRACE_EVENT(rdev_get_mpp,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u8 *dst, u8 *mpp),
        TP_ARGS(wiphy, netdev, dst, mpp),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dst)
                MAC_ENTRY(mpp)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dst, dst);
                MAC_ASSIGN(mpp, mpp);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", destination: %pM"
                  ", mpp: %pM", WIPHY_PR_ARG, NETDEV_PR_ARG,
                  __entry->dst, __entry->mpp)
);

/*
 * rdev_dump_mpp - trace event recording a wiphy, net device, an integer
 * index and two MAC addresses, labelled "destination" and "mpp".
 *
 * The index argument is named _idx so it does not clash with the idx record
 * field.
 */
TRACE_EVENT(rdev_dump_mpp,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx,
                 u8 *dst, u8 *mpp),
        TP_ARGS(wiphy, netdev, _idx, dst, mpp),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dst)
                MAC_ENTRY(mpp)
                __field(int, idx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dst, dst);
                MAC_ASSIGN(mpp, mpp);
                __entry->idx = _idx;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: %pM, mpp: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx, __entry->dst,
                  __entry->mpp)
);

/*
 * rdev_return_int_mpath_info - trace event recording an integer return value
 * together with a copy of the members of a struct mpath_info.
 *
 * Every member read from @pinfo (generation, filled, frame_qlen, sn, metric,
 * exptime, discovery_timeout, discovery_retries, flags) is both stored and
 * printed.  @pinfo is dereferenced unconditionally.
 */
TRACE_EVENT(rdev_return_int_mpath_info,
        TP_PROTO(struct wiphy *wiphy, int ret, struct mpath_info *pinfo),
        TP_ARGS(wiphy, ret, pinfo),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
                __field(int, generation)
                __field(u32, filled)
                __field(u32, frame_qlen)
                __field(u32, sn)
                __field(u32, metric)
                __field(u32, exptime)
                __field(u32, discovery_timeout)
                __field(u8, discovery_retries)
                __field(u8, flags)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->ret = ret;
                __entry->generation = pinfo->generation;
                __entry->filled = pinfo->filled;
                __entry->frame_qlen = pinfo->frame_qlen;
                __entry->sn = pinfo->sn;
                __entry->metric = pinfo->metric;
                __entry->exptime = pinfo->exptime;
                __entry->discovery_timeout = pinfo->discovery_timeout;
                __entry->discovery_retries = pinfo->discovery_retries;
                __entry->flags = pinfo->flags;
        ),
        TP_printk(WIPHY_PR_FMT ", returned %d. mpath info - generation: %d, "
                  "filled: %u, frame qlen: %u, sn: %u, metric: %u, exptime: %u,"
                  " discovery timeout: %u, discovery retries: %u, flags: 0x%x",
                  WIPHY_PR_ARG, __entry->ret, __entry->generation,
                  __entry->filled, __entry->frame_qlen, __entry->sn,
                  __entry->metric, __entry->exptime, __entry->discovery_timeout,
                  __entry->discovery_retries, __entry->flags)
);

/*
 * rdev_return_int_mesh_config - trace event recording an integer return
 * value together with mesh configuration data.
 *
 * MESH_CFG_ENTRY / MESH_CFG_ASSIGN are defined outside this block
 * (presumably they read from @conf - confirm against their definition).
 * TP_printk formats only the wiphy and @ret.
 */
TRACE_EVENT(rdev_return_int_mesh_config,
        TP_PROTO(struct wiphy *wiphy, int ret, struct mesh_config *conf),
        TP_ARGS(wiphy, ret, conf),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                MESH_CFG_ENTRY
                __field(int, ret)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                MESH_CFG_ASSIGN;
                __entry->ret = ret;
        ),
        TP_printk(WIPHY_PR_FMT ", returned: %d",
                  WIPHY_PR_ARG, __entry->ret)
);

/*
 * rdev_update_mesh_config - trace event recording a wiphy, net device, a
 * u32 mask and mesh configuration data.
 *
 * The mesh configuration is captured through MESH_CFG_ENTRY /
 * MESH_CFG_ASSIGN (defined outside this block); only the wiphy, netdev and
 * mask are printed, the mask as an unsigned decimal.
 */
TRACE_EVENT(rdev_update_mesh_config,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u32 mask,
                 const struct mesh_config *conf),
        TP_ARGS(wiphy, netdev, mask, conf),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MESH_CFG_ENTRY
                __field(u32, mask)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MESH_CFG_ASSIGN;
                __entry->mask = mask;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mask: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->mask)
);

/*
 * rdev_join_mesh - trace event recording a wiphy, net device and mesh
 * configuration data.
 *
 * The mesh configuration is captured through MESH_CFG_ENTRY /
 * MESH_CFG_ASSIGN (defined outside this block).  @setup is part of the
 * prototype but is not referenced directly in this block.  Only the wiphy
 * and netdev are printed.
 */
TRACE_EVENT(rdev_join_mesh,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const struct mesh_config *conf,
                 const struct mesh_setup *setup),
        TP_ARGS(wiphy, netdev, conf, setup),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MESH_CFG_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MESH_CFG_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG)
);

/*
 * rdev_change_bss - trace event recording a wiphy, net device and five
 * members of a struct bss_parameters.
 *
 * use_cts_prot, use_short_preamble, use_short_slot_time, ap_isolate and
 * ht_opmode are each stored as int and printed with %d.
 */
TRACE_EVENT(rdev_change_bss,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct bss_parameters *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(int, use_cts_prot)
                __field(int, use_short_preamble)
                __field(int, use_short_slot_time)
                __field(int, ap_isolate)
                __field(int, ht_opmode)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->use_cts_prot = params->use_cts_prot;
                __entry->use_short_preamble = params->use_short_preamble;
                __entry->use_short_slot_time = params->use_short_slot_time;
                __entry->ap_isolate = params->ap_isolate;
                __entry->ht_opmode = params->ht_opmode;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", use cts prot: %d, "
                  "use short preamble: %d, use short slot time: %d, "
                  "ap isolate: %d, ht opmode: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->use_cts_prot,
                  __entry->use_short_preamble, __entry->use_short_slot_time,
                  __entry->ap_isolate, __entry->ht_opmode)
);

/*
 * rdev_inform_bss - trace event recording a wiphy plus the bssid and channel
 * of a struct cfg80211_bss.
 *
 * The channel is captured and printed through the CHAN_ helper macros
 * defined outside this block.  This event has no net device argument.
 */
TRACE_EVENT(rdev_inform_bss,
        TP_PROTO(struct wiphy *wiphy, struct cfg80211_bss *bss),
        TP_ARGS(wiphy, bss),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                MAC_ENTRY(bssid)
                CHAN_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                MAC_ASSIGN(bssid, bss->bssid);
                CHAN_ASSIGN(bss->channel);
        ),
        TP_printk(WIPHY_PR_FMT ", %pM, " CHAN_PR_FMT,
                  WIPHY_PR_ARG, __entry->bssid, CHAN_PR_ARG)
);

/*
 * rdev_set_txq_params - trace event recording a wiphy, net device and the
 * ac, txop, cwmin, cwmax and aifs members of a struct ieee80211_txq_params.
 *
 * All five members are stored and printed; ac is an enum nl80211_ac printed
 * with %d.
 */
TRACE_EVENT(rdev_set_txq_params,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct ieee80211_txq_params *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(enum nl80211_ac, ac)
                __field(u16, txop)
                __field(u16, cwmin)
                __field(u16, cwmax)
                __field(u8, aifs)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->ac = params->ac;
                __entry->txop = params->txop;
                __entry->cwmin = params->cwmin;
                __entry->cwmax = params->cwmax;
                __entry->aifs = params->aifs;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", ac: %d, txop: %u, cwmin: %u, cwmax: %u, aifs: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->ac, __entry->txop,
                  __entry->cwmin, __entry->cwmax, __entry->aifs)
);

/*
 * rdev_libertas_set_mesh_channel - trace event recording a wiphy, net device
 * and a channel.
 *
 * The channel is captured and printed through the CHAN_ helper macros
 * defined outside this block.
 */
TRACE_EVENT(rdev_libertas_set_mesh_channel,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct ieee80211_channel *chan),
        TP_ARGS(wiphy, netdev, chan),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                CHAN_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                CHAN_ASSIGN(chan);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_PR_FMT, WIPHY_PR_ARG,
                  NETDEV_PR_ARG, CHAN_PR_ARG)
);

/*
 * rdev_set_monitor_channel - trace event recording a wiphy and a channel
 * definition.
 *
 * The chandef is captured and printed through the CHAN_DEF_ helper macros
 * defined outside this block.  This event has no net device argument.
 */
TRACE_EVENT(rdev_set_monitor_channel,
        TP_PROTO(struct wiphy *wiphy,
                 struct cfg80211_chan_def *chandef),
        TP_ARGS(wiphy, chandef),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_DEF_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
        ),
        TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT,
                  WIPHY_PR_ARG, CHAN_DEF_PR_ARG)
);

/*
 * rdev_auth - trace event recording a wiphy, net device and the BSSID and
 * auth type of a struct cfg80211_auth_request.
 *
 * The BSSID is taken from req->bss when that pointer is set; otherwise the
 * recorded BSSID is all zeroes.
 */
TRACE_EVENT(rdev_auth,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_auth_request *req),
        TP_ARGS(wiphy, netdev, req),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                __field(enum nl80211_auth_type, auth_type)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                /* req->bss may be NULL; avoid dereferencing it then. */
                if (req->bss)
                        MAC_ASSIGN(bssid, req->bss->bssid);
                else
                        eth_zero_addr(__entry->bssid);
                __entry->auth_type = req->auth_type;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", auth type: %d, bssid: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->auth_type,
                  __entry->bssid)
);

/*
 * rdev_assoc - trace event that snapshots a struct cfg80211_assoc_request.
 *
 * Recorded: BSSID (zeroed when req->bss is NULL), previous BSSID, use_mfp,
 * flags, the IE buffer, HT/VHT capabilities and their masks, the FILS KEK
 * and the FILS nonces.  The fils_nonces array is sized 2 * FILS_NONCE_LEN
 * when req->fils_nonces is set and 0 otherwise.  Pointer-backed buffers
 * (ie, fils_kek, fils_nonces) are copied only when the pointer is non-NULL.
 *
 * TP_printk formats only the BSSIDs, use_mfp and flags; the binary blobs are
 * stored in the record but not printed here.
 */
TRACE_EVENT(rdev_assoc,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_assoc_request *req),
        TP_ARGS(wiphy, netdev, req),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                MAC_ENTRY(prev_bssid)
                __field(bool, use_mfp)
                __field(u32, flags)
                __dynamic_array(u8, elements, req->ie_len)
                __array(u8, ht_capa, sizeof(struct ieee80211_ht_cap))
                __array(u8, ht_capa_mask, sizeof(struct ieee80211_ht_cap))
                __array(u8, vht_capa, sizeof(struct ieee80211_vht_cap))
                __array(u8, vht_capa_mask, sizeof(struct ieee80211_vht_cap))
                __dynamic_array(u8, fils_kek, req->fils_kek_len)
                __dynamic_array(u8, fils_nonces,
                                req->fils_nonces ? 2 * FILS_NONCE_LEN : 0)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                /* req->bss may be NULL; avoid dereferencing it then. */
                if (req->bss)
                        MAC_ASSIGN(bssid, req->bss->bssid);
                else
                        eth_zero_addr(__entry->bssid);
                MAC_ASSIGN(prev_bssid, req->prev_bssid);
                __entry->use_mfp = req->use_mfp;
                __entry->flags = req->flags;
                if (req->ie)
                        memcpy(__get_dynamic_array(elements),
                               req->ie, req->ie_len);
                /* The capability structs are embedded in req, so always copied. */
                memcpy(__entry->ht_capa, &req->ht_capa, sizeof(req->ht_capa));
                memcpy(__entry->ht_capa_mask, &req->ht_capa_mask,
                       sizeof(req->ht_capa_mask));
                memcpy(__entry->vht_capa, &req->vht_capa, sizeof(req->vht_capa));
                memcpy(__entry->vht_capa_mask, &req->vht_capa_mask,
                       sizeof(req->vht_capa_mask));
                if (req->fils_kek)
                        memcpy(__get_dynamic_array(fils_kek),
                               req->fils_kek, req->fils_kek_len);
                if (req->fils_nonces)
                        memcpy(__get_dynamic_array(fils_nonces),
                               req->fils_nonces, 2 * FILS_NONCE_LEN);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM"
                  ", previous bssid: %pM, use mfp: %s, flags: 0x%x",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid,
                  __entry->prev_bssid, BOOL_TO_STR(__entry->use_mfp),
                  __entry->flags)
);

/*
 * rdev_deauth - trace event recording a wiphy, net device and the bssid,
 * reason_code and local_state_change members of a
 * struct cfg80211_deauth_request.
 *
 * local_state_change is stored as bool and printed numerically with %d.
 */
TRACE_EVENT(rdev_deauth,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_deauth_request *req),
        TP_ARGS(wiphy, netdev, req),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                __field(u16, reason_code)
                __field(bool, local_state_change)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, req->bssid);
                __entry->reason_code = req->reason_code;
                __entry->local_state_change = req->local_state_change;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM, reason: %u, local_state_change:%d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid,
                  __entry->reason_code, __entry->local_state_change)
);

/*
 * rdev_disassoc - trace event recording a wiphy, net device and the ap_addr,
 * reason_code and local_state_change members of a
 * struct cfg80211_disassoc_request.
 *
 * req->ap_addr is stored in (and printed as) the bssid field.
 * local_state_change is printed as a string via BOOL_TO_STR().
 */
TRACE_EVENT(rdev_disassoc,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_disassoc_request *req),
        TP_ARGS(wiphy, netdev, req),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                __field(u16, reason_code)
                __field(bool, local_state_change)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, req->ap_addr);
                __entry->reason_code = req->reason_code;
                __entry->local_state_change = req->local_state_change;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM"
                  ", reason: %u, local state change: %s",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid,
                  __entry->reason_code,
                  BOOL_TO_STR(__entry->local_state_change))
);

/*
 * rdev_mgmt_tx_cancel_wait - trace event recording a wiphy, a wireless
 * device and a 64-bit cookie.
 *
 * The wdev is captured through the WDEV_ helper macros defined outside this
 * block.  The cookie is printed as an unsigned decimal.
 */
TRACE_EVENT(rdev_mgmt_tx_cancel_wait,
        TP_PROTO(struct wiphy *wiphy,
                 struct wireless_dev *wdev, u64 cookie),
        TP_ARGS(wiphy, wdev, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %llu ",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
);

/*
 * rdev_set_power_mgmt - trace event recording a wiphy, net device, an
 * enabled flag and a timeout.
 *
 * The flag is printed as "enabled" when true and "not enabled" when false.
 */
TRACE_EVENT(rdev_set_power_mgmt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 bool enabled, int timeout),
        TP_ARGS(wiphy, netdev, enabled, timeout),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(bool, enabled)
                __field(int, timeout)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->enabled = enabled;
                __entry->timeout = timeout;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %senabled, timeout: %d ",
                  WIPHY_PR_ARG, NETDEV_PR_ARG,
                  __entry->enabled ? "" : "not ", __entry->timeout)
);

/*
 * rdev_connect - trace event that snapshots selected members of a
 * struct cfg80211_connect_params.
 *
 * Recorded and printed: bssid, ssid, auth_type, privacy,
 * crypto.wpa_versions, flags and prev_bssid.
 *
 * The ssid buffer is one byte larger than IEEE80211_MAX_SSID_LEN and is
 * zeroed before sme->ssid_len bytes are copied in, so it can be printed
 * with %s.  NOTE(review): this relies on sme->ssid_len not exceeding
 * IEEE80211_MAX_SSID_LEN; no bound is checked in this block.
 */
TRACE_EVENT(rdev_connect,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_connect_params *sme),
        TP_ARGS(wiphy, netdev, sme),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                __array(char, ssid, IEEE80211_MAX_SSID_LEN + 1)
                __field(enum nl80211_auth_type, auth_type)
                __field(bool, privacy)
                __field(u32, wpa_versions)
                __field(u32, flags)
                MAC_ENTRY(prev_bssid)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, sme->bssid);
                memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
                memcpy(__entry->ssid, sme->ssid, sme->ssid_len);
                __entry->auth_type = sme->auth_type;
                __entry->privacy = sme->privacy;
                __entry->wpa_versions = sme->crypto.wpa_versions;
                __entry->flags = sme->flags;
                MAC_ASSIGN(prev_bssid, sme->prev_bssid);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM"
                  ", ssid: %s, auth type: %d, privacy: %s, wpa versions: %u, "
                  "flags: 0x%x, previous bssid: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid, __entry->ssid,
                  __entry->auth_type, BOOL_TO_STR(__entry->privacy),
                  __entry->wpa_versions, __entry->flags, __entry->prev_bssid)
);

/*
 * rdev_update_connect_params - trace event recording a wiphy, net device and
 * a u32 "changed" value.
 *
 * @sme is part of the prototype but nothing from it is recorded; only
 * @changed is stored and printed (as an unsigned decimal).
 */
TRACE_EVENT(rdev_update_connect_params,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_connect_params *sme, u32 changed),
        TP_ARGS(wiphy, netdev, sme, changed),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u32, changed)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->changed = changed;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", parameters changed: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG,  __entry->changed)
);

/*
 * rdev_set_cqm_rssi_config - trace event recording a wiphy, net device, a
 * signed RSSI threshold and an unsigned RSSI hysteresis value.
 */
TRACE_EVENT(rdev_set_cqm_rssi_config,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev, s32 rssi_thold,
                 u32 rssi_hyst),
        TP_ARGS(wiphy, netdev, rssi_thold, rssi_hyst),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(s32, rssi_thold)
                __field(u32, rssi_hyst)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->rssi_thold = rssi_thold;
                __entry->rssi_hyst = rssi_hyst;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT
                  ", rssi_thold: %d, rssi_hyst: %u ",
                  WIPHY_PR_ARG, NETDEV_PR_ARG,
                 __entry->rssi_thold, __entry->rssi_hyst)
);

/*
 * rdev_set_cqm_rssi_range_config - trace event recording a wiphy, net device
 * and a signed low/high RSSI pair, printed as "range: <low> - <high>".
 */
TRACE_EVENT(rdev_set_cqm_rssi_range_config,
        TP_PROTO(struct wiphy *wiphy,
                 struct net_device *netdev, s32 low, s32 high),
        TP_ARGS(wiphy, netdev, low, high),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(s32, rssi_low)
                __field(s32, rssi_high)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->rssi_low = low;
                __entry->rssi_high = high;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT
                  ", range: %d - %d ",
                  WIPHY_PR_ARG, NETDEV_PR_ARG,
                  __entry->rssi_low, __entry->rssi_high)
);

/*
 * rdev_set_cqm_txe_config - trace event recording a wiphy, net device and
 * three u32 values: rate, packet count and interval.
 */
TRACE_EVENT(rdev_set_cqm_txe_config,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u32 rate,
                 u32 pkts, u32 intvl),
        TP_ARGS(wiphy, netdev, rate, pkts, intvl),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u32, rate)
                __field(u32, pkts)
                __field(u32, intvl)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->rate = rate;
                __entry->pkts = pkts;
                __entry->intvl = intvl;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", rate: %u, packets: %u, interval: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->rate, __entry->pkts,
                  __entry->intvl)
);

/*
 * rdev_disconnect - trace event recording a wiphy, net device and a 16-bit
 * reason code.
 */
TRACE_EVENT(rdev_disconnect,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u16 reason_code),
        TP_ARGS(wiphy, netdev, reason_code),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u16, reason_code)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->reason_code = reason_code;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", reason code: %u", WIPHY_PR_ARG,
                  NETDEV_PR_ARG, __entry->reason_code)
);

/*
 * rdev_join_ibss - trace event recording a wiphy, net device and the bssid
 * and ssid of a struct cfg80211_ibss_params.
 *
 * The ssid buffer is one byte larger than IEEE80211_MAX_SSID_LEN and is
 * zeroed before params->ssid_len bytes are copied in, so it can be printed
 * with %s.  NOTE(review): this relies on params->ssid_len not exceeding
 * IEEE80211_MAX_SSID_LEN; no bound is checked in this block.
 */
TRACE_EVENT(rdev_join_ibss,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_ibss_params *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                __array(char, ssid, IEEE80211_MAX_SSID_LEN + 1)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, params->bssid);
                memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
                memcpy(__entry->ssid, params->ssid, params->ssid_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM, ssid: %s",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid, __entry->ssid)
);

/*
 * rdev_join_ocb - trace event recording only a wiphy and a net device.
 *
 * @setup is part of the prototype but nothing from it is recorded.
 */
TRACE_EVENT(rdev_join_ocb,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const struct ocb_setup *setup),
        TP_ARGS(wiphy, netdev, setup),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG)
);

/*
 * rdev_set_wiphy_params - trace event recording a wiphy and a u32 "changed"
 * value, printed as an unsigned decimal.
 */
TRACE_EVENT(rdev_set_wiphy_params,
        TP_PROTO(struct wiphy *wiphy, u32 changed),
        TP_ARGS(wiphy, changed),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(u32, changed)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->changed = changed;
        ),
        TP_printk(WIPHY_PR_FMT ", changed: %u",
                  WIPHY_PR_ARG, __entry->changed)
);

/*
 * rdev_get_tx_power: instance of the wiphy_wdev_evt class (declared outside
 * this view), taking a wiphy and a wireless device.
 */
DEFINE_EVENT(wiphy_wdev_evt, rdev_get_tx_power,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_set_tx_power - trace event recording a wiphy, a wireless device, a
 * TX power setting type and an integer mbm value.
 *
 * The type is an enum nl80211_tx_power_setting and is printed numerically.
 */
TRACE_EVENT(rdev_set_tx_power,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 enum nl80211_tx_power_setting type, int mbm),
        TP_ARGS(wiphy, wdev, type, mbm),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(enum nl80211_tx_power_setting, type)
                __field(int, mbm)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->type = type;
                __entry->mbm = mbm;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", type: %u, mbm: %d",
                  WIPHY_PR_ARG, WDEV_PR_ARG,__entry->type, __entry->mbm)
);

/*
 * rdev_return_int_int - trace event recording a wiphy and two integers,
 * printed as "function returns" (@func_ret) and "function filled"
 * (@func_fill).
 */
TRACE_EVENT(rdev_return_int_int,
        TP_PROTO(struct wiphy *wiphy, int func_ret, int func_fill),
        TP_ARGS(wiphy, func_ret, func_fill),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, func_ret)
                __field(int, func_fill)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->func_ret = func_ret;
                __entry->func_fill = func_fill;
        ),
        TP_printk(WIPHY_PR_FMT ", function returns: %d, function filled: %d",
                  WIPHY_PR_ARG, __entry->func_ret, __entry->func_fill)
);

#ifdef CONFIG_NL80211_TESTMODE
/*
 * rdev_testmode_cmd - trace event recording the wiphy and the wireless
 * device it is invoked with.
 *
 * The wiphy and wdev parts of the output are joined with ", ", matching
 * every other wiphy/wdev event in this file; without the separator the two
 * fields run together in the trace output.
 */
TRACE_EVENT(rdev_testmode_cmd,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
                  WIPHY_PR_ARG, WDEV_PR_ARG)
);

/*
 * rdev_testmode_dump - trace event recording only the wiphy it is invoked
 * with.
 */
TRACE_EVENT(rdev_testmode_dump,
        TP_PROTO(struct wiphy *wiphy),
        TP_ARGS(wiphy),
        TP_STRUCT__entry(
                WIPHY_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG)
);
#endif /* CONFIG_NL80211_TESTMODE */

/*
 * rdev_set_bitrate_mask - trace event recording the wiphy, the net device,
 * the link id and the peer MAC address.
 *
 * @mask is part of the prototype but none of its contents are recorded.
 * @link_id is an unsigned int and is therefore printed with %u, consistent
 * with the wiphy_wdev_link_evt class.
 */
TRACE_EVENT(rdev_set_bitrate_mask,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 unsigned int link_id,
                 const u8 *peer, const struct cfg80211_bitrate_mask *mask),
        TP_ARGS(wiphy, netdev, link_id, peer, mask),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(unsigned int, link_id)
                MAC_ENTRY(peer)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->link_id = link_id;
                MAC_ASSIGN(peer, peer);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %u, peer: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id,
                  __entry->peer)
);

/*
 * rdev_update_mgmt_frame_registrations - trace event recording the wiphy,
 * the wireless device and two values read from @upd: global_stypes and
 * interface_stypes, both printed in hexadecimal.
 *
 * @upd is dereferenced unconditionally, so it must not be NULL.
 * NOTE(review): both values are stored in u16 fields; if the members of
 * struct mgmt_frame_regs are wider, the upper bits are dropped - confirm.
 */
TRACE_EVENT(rdev_update_mgmt_frame_registrations,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct mgmt_frame_regs *upd),
        TP_ARGS(wiphy, wdev, upd),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u16, global_stypes)
                __field(u16, interface_stypes)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->global_stypes = upd->global_stypes;
                __entry->interface_stypes = upd->interface_stypes;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", global: 0x%.2x, intf: 0x%.2x",
                  WIPHY_PR_ARG, WDEV_PR_ARG,
                  __entry->global_stypes, __entry->interface_stypes)
);

/*
 * rdev_return_int_tx_rx - trace event recording the wiphy, an integer
 * return value @ret and two u32 values @tx and @rx.
 */
TRACE_EVENT(rdev_return_int_tx_rx,
        TP_PROTO(struct wiphy *wiphy, int ret, u32 tx, u32 rx),
        TP_ARGS(wiphy, ret, tx, rx),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
                __field(u32, tx)
                __field(u32, rx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->ret = ret;
                __entry->tx = tx;
                __entry->rx = rx;
        ),
        TP_printk(WIPHY_PR_FMT ", returned %d, tx: %u, rx: %u",
                  WIPHY_PR_ARG, __entry->ret, __entry->tx, __entry->rx)
);

/*
 * rdev_return_void_tx_rx - trace event recording the wiphy and four u32
 * values: @tx, @tx_max, @rx and @rx_max, all printed as unsigned decimals.
 *
 * Note that the format string ends with a trailing space.
 */
TRACE_EVENT(rdev_return_void_tx_rx,
        TP_PROTO(struct wiphy *wiphy, u32 tx, u32 tx_max,
                 u32 rx, u32 rx_max),
        TP_ARGS(wiphy, tx, tx_max, rx, rx_max),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(u32, tx)
                __field(u32, tx_max)
                __field(u32, rx)
                __field(u32, rx_max)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->tx = tx;
                __entry->tx_max = tx_max;
                __entry->rx = rx;
                __entry->rx_max = rx_max;
        ),
        TP_printk(WIPHY_PR_FMT ", tx: %u, tx_max: %u, rx: %u, rx_max: %u ",
                  WIPHY_PR_ARG, __entry->tx, __entry->tx_max, __entry->rx,
                  __entry->rx_max)
);

/*
 * tx_rx_evt - event class for events that carry a wiphy plus two u32 values,
 * @tx and @rx, printed as unsigned decimals.
 *
 * Note that the format string ends with a trailing space.
 */
DECLARE_EVENT_CLASS(tx_rx_evt,
        TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx),
        TP_ARGS(wiphy, tx, rx),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(u32, tx)
                __field(u32, rx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->tx = tx;
                __entry->rx = rx;
        ),
        TP_printk(WIPHY_PR_FMT ", tx: %u, rx: %u ",
                  WIPHY_PR_ARG, __entry->tx, __entry->rx)
);

/* rdev_set_antenna - instance of the tx_rx_evt class (wiphy, tx, rx). */
DEFINE_EVENT(tx_rx_evt, rdev_set_antenna,
        TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx),
        TP_ARGS(wiphy, tx, rx)
);

/*
 * wiphy_netdev_id_evt - event class for events that carry a wiphy, a net
 * device and a 64-bit @id, which is printed as an unsigned decimal.
 */
DECLARE_EVENT_CLASS(wiphy_netdev_id_evt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id),
        TP_ARGS(wiphy, netdev, id),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u64, id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->id = id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", id: %llu",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->id)
);

/*
 * rdev_sched_scan_start - instance of the wiphy_netdev_id_evt class
 * (wiphy, net device, 64-bit id).
 */
DEFINE_EVENT(wiphy_netdev_id_evt, rdev_sched_scan_start,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id),
        TP_ARGS(wiphy, netdev, id)
);

/*
 * rdev_sched_scan_stop - instance of the wiphy_netdev_id_evt class
 * (wiphy, net device, 64-bit id).
 */
DEFINE_EVENT(wiphy_netdev_id_evt, rdev_sched_scan_stop,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id),
        TP_ARGS(wiphy, netdev, id)
);

/*
 * rdev_tdls_mgmt - trace event recording the parameters of a TDLS
 * management request: wiphy, net device, peer MAC address, link id, action
 * code, dialog token, status code, peer capability, the initiator flag and
 * a copy of the @len bytes at @buf.
 *
 * Only the first byte of the copied buffer is printed.  The print
 * expression checks the recorded length first so that an empty buffer
 * (@len == 0) is reported as 0 instead of reading one byte past the
 * zero-sized dynamic array.
 */
TRACE_EVENT(rdev_tdls_mgmt,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u8 *peer, int link_id, u8 action_code, u8 dialog_token,
                 u16 status_code, u32 peer_capability,
                 bool initiator, const u8 *buf, size_t len),
        TP_ARGS(wiphy, netdev, peer, link_id, action_code, dialog_token,
                status_code, peer_capability, initiator, buf, len),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(int, link_id)
                __field(u8, action_code)
                __field(u8, dialog_token)
                __field(u16, status_code)
                __field(u32, peer_capability)
                __field(bool, initiator)
                __dynamic_array(u8, buf, len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->link_id = link_id;
                __entry->action_code = action_code;
                __entry->dialog_token = dialog_token;
                __entry->status_code = status_code;
                __entry->peer_capability = peer_capability;
                __entry->initiator = initiator;
                memcpy(__get_dynamic_array(buf), buf, len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM"
                  ", link_id: %d, action_code: %u "
                  "dialog_token: %u, status_code: %u, peer_capability: %u "
                  "initiator: %s buf: %#.2x ",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer,
                  __entry->link_id, __entry->action_code, __entry->dialog_token,
                  __entry->status_code, __entry->peer_capability,
                  BOOL_TO_STR(__entry->initiator),
                  /* never index into an empty dynamic array */
                  __get_dynamic_array_len(buf) ?
                  ((u8 *)__get_dynamic_array(buf))[0] : 0)
);

/*
 * rdev_dump_survey - trace event recording the wiphy, the net device and
 * the integer index @_idx (stored in the "idx" field, printed as "index").
 */
TRACE_EVENT(rdev_dump_survey,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx),
        TP_ARGS(wiphy, netdev, _idx),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(int, idx)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->idx = _idx;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx)
);

/*
 * rdev_return_int_survey_info - trace event recording the wiphy, an integer
 * return value and a snapshot of the survey data in @info: its channel, the
 * time counters (time, time_busy, time_ext_busy, time_rx, time_tx,
 * time_scan), the "filled" value and the noise value.
 *
 * @info is dereferenced unconditionally, regardless of @ret, so callers
 * must always pass a valid pointer.
 */
TRACE_EVENT(rdev_return_int_survey_info,
        TP_PROTO(struct wiphy *wiphy, int ret, struct survey_info *info),
        TP_ARGS(wiphy, ret, info),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_ENTRY
                __field(int, ret)
                __field(u64, time)
                __field(u64, time_busy)
                __field(u64, time_ext_busy)
                __field(u64, time_rx)
                __field(u64, time_tx)
                __field(u64, time_scan)
                __field(u32, filled)
                __field(s8, noise)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_ASSIGN(info->channel);
                __entry->ret = ret;
                __entry->time = info->time;
                __entry->time_busy = info->time_busy;
                __entry->time_ext_busy = info->time_ext_busy;
                __entry->time_rx = info->time_rx;
                __entry->time_tx = info->time_tx;
                __entry->time_scan = info->time_scan;
                __entry->filled = info->filled;
                __entry->noise = info->noise;
        ),
        TP_printk(WIPHY_PR_FMT ", returned: %d, " CHAN_PR_FMT
                  ", channel time: %llu, channel time busy: %llu, "
                  "channel time extension busy: %llu, channel time rx: %llu, "
                  "channel time tx: %llu, scan time: %llu, filled: %u, noise: %d",
                  WIPHY_PR_ARG, __entry->ret, CHAN_PR_ARG,
                  __entry->time, __entry->time_busy,
                  __entry->time_ext_busy, __entry->time_rx,
                  __entry->time_tx, __entry->time_scan,
                  __entry->filled, __entry->noise)
);

/*
 * rdev_tdls_oper - trace event recording the wiphy, the net device, the
 * peer MAC address and the TDLS operation @oper (printed as the raw enum
 * value).
 */
TRACE_EVENT(rdev_tdls_oper,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u8 *peer, enum nl80211_tdls_operation oper),
        TP_ARGS(wiphy, netdev, peer, oper),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(enum nl80211_tdls_operation, oper)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->oper = oper;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM, oper: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->oper)
);

/*
 * rdev_pmksa - event class for PMKSA events; records the wiphy, the net
 * device and the BSSID taken from @pmksa.
 *
 * @pmksa is dereferenced unconditionally to obtain the bssid member, so it
 * must not be NULL.  No other member of @pmksa is recorded.
 */
DECLARE_EVENT_CLASS(rdev_pmksa,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_pmksa *pmksa),
        TP_ARGS(wiphy, netdev, pmksa),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, pmksa->bssid);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid)
);

/*
 * rdev_probe_client - trace event recording the wiphy, the net device and
 * the MAC address of @peer.
 */
TRACE_EVENT(rdev_probe_client,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *peer),
        TP_ARGS(wiphy, netdev, peer),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer)
);

/*
 * rdev_set_pmksa - instance of the rdev_pmksa class (wiphy, net device,
 * BSSID from @pmksa).
 */
DEFINE_EVENT(rdev_pmksa, rdev_set_pmksa,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_pmksa *pmksa),
        TP_ARGS(wiphy, netdev, pmksa)
);

/*
 * rdev_del_pmksa - instance of the rdev_pmksa class (wiphy, net device,
 * BSSID from @pmksa).
 */
DEFINE_EVENT(rdev_pmksa, rdev_del_pmksa,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_pmksa *pmksa),
        TP_ARGS(wiphy, netdev, pmksa)
);

/*
 * rdev_remain_on_channel - trace event recording the wiphy, the wireless
 * device, the channel @chan and the unsigned @duration.
 * NOTE(review): the unit of @duration is not visible here - confirm.
 */
TRACE_EVENT(rdev_remain_on_channel,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct ieee80211_channel *chan,
                 unsigned int duration),
        TP_ARGS(wiphy, wdev, chan, duration),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                CHAN_ENTRY
                __field(unsigned int, duration)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                CHAN_ASSIGN(chan);
                __entry->duration = duration;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", " CHAN_PR_FMT ", duration: %u",
                  WIPHY_PR_ARG, WDEV_PR_ARG, CHAN_PR_ARG, __entry->duration)
);

/*
 * rdev_return_int_cookie - trace event recording the wiphy, an integer
 * return value @ret and a 64-bit @cookie (printed as an unsigned decimal).
 */
TRACE_EVENT(rdev_return_int_cookie,
        TP_PROTO(struct wiphy *wiphy, int ret, u64 cookie),
        TP_ARGS(wiphy, ret, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->ret = ret;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", returned %d, cookie: %llu",
                  WIPHY_PR_ARG, __entry->ret, __entry->cookie)
);

/*
 * rdev_cancel_remain_on_channel - trace event recording the wiphy, the
 * wireless device and the 64-bit @cookie.
 */
TRACE_EVENT(rdev_cancel_remain_on_channel,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
        TP_ARGS(wiphy, wdev, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %llu",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
);

/*
 * rdev_mgmt_tx - trace event recording the wiphy, the wireless device and
 * selected members of @params: the channel, the offchan flag, the wait
 * value and the no_cck and dont_wait_for_ack flags.
 *
 * @params is dereferenced unconditionally, so it must not be NULL.  No
 * other member of @params is recorded.  Boolean fields are printed via
 * BOOL_TO_STR().
 */
TRACE_EVENT(rdev_mgmt_tx,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct cfg80211_mgmt_tx_params *params),
        TP_ARGS(wiphy, wdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                CHAN_ENTRY
                __field(bool, offchan)
                __field(unsigned int, wait)
                __field(bool, no_cck)
                __field(bool, dont_wait_for_ack)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                CHAN_ASSIGN(params->chan);
                __entry->offchan = params->offchan;
                __entry->wait = params->wait;
                __entry->no_cck = params->no_cck;
                __entry->dont_wait_for_ack = params->dont_wait_for_ack;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", " CHAN_PR_FMT ", offchan: %s,"
                  " wait: %u, no cck: %s, dont wait for ack: %s",
                  WIPHY_PR_ARG, WDEV_PR_ARG, CHAN_PR_ARG,
                  BOOL_TO_STR(__entry->offchan), __entry->wait,
                  BOOL_TO_STR(__entry->no_cck),
                  BOOL_TO_STR(__entry->dont_wait_for_ack))
);

/*
 * rdev_tx_control_port - trace event recording the wiphy, the net device,
 * the destination MAC address, the protocol number, the unencrypted flag
 * and the link id.
 *
 * @buf and @len are part of the prototype but are not recorded.  @proto is
 * stored in its big-endian form and converted with be16_to_cpu() only when
 * the event is printed.
 */
TRACE_EVENT(rdev_tx_control_port,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *buf, size_t len, const u8 *dest, __be16 proto,
                 bool unencrypted, int link_id),
        TP_ARGS(wiphy, netdev, buf, len, dest, proto, unencrypted, link_id),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dest)
                __field(__be16, proto)
                __field(bool, unencrypted)
                __field(int, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dest, dest);
                __entry->proto = proto;
                __entry->unencrypted = unencrypted;
                __entry->link_id = link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM,"
                  " proto: 0x%x, unencrypted: %s, link: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->dest,
                  be16_to_cpu(__entry->proto),
                  BOOL_TO_STR(__entry->unencrypted),
                  __entry->link_id)
);

/*
 * rdev_set_noack_map - trace event recording the wiphy, the net device and
 * the 16-bit @noack_map, printed as an unsigned decimal.
 */
TRACE_EVENT(rdev_set_noack_map,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u16 noack_map),
        TP_ARGS(wiphy, netdev, noack_map),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u16, noack_map)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->noack_map = noack_map;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", noack_map: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->noack_map)
);

/*
 * wiphy_wdev_link_evt - event class for events that carry a wiphy, a
 * wireless device and an unsigned @link_id.
 */
DECLARE_EVENT_CLASS(wiphy_wdev_link_evt,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 unsigned int link_id),
        TP_ARGS(wiphy, wdev, link_id),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(unsigned int, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->link_id = link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %u",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id)
);

/*
 * rdev_get_channel - instance of the wiphy_wdev_link_evt class (wiphy,
 * wireless device, link id).
 */
DEFINE_EVENT(wiphy_wdev_link_evt, rdev_get_channel,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 unsigned int link_id),
        TP_ARGS(wiphy, wdev, link_id)
);

/*
 * rdev_return_chandef - trace event recording the wiphy, an integer return
 * value and a channel definition.
 *
 * @chandef is only read when @ret is 0; for any other @ret the channel
 * definition fields are filled from a NULL pointer instead.
 * NOTE(review): this relies on CHAN_DEF_ASSIGN() accepting NULL (defined
 * outside this section) - confirm.
 */
TRACE_EVENT(rdev_return_chandef,
        TP_PROTO(struct wiphy *wiphy, int ret,
                 struct cfg80211_chan_def *chandef),
        TP_ARGS(wiphy, ret, chandef),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, ret)
                CHAN_DEF_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                if (ret == 0)
                        CHAN_DEF_ASSIGN(chandef);
                else
                        CHAN_DEF_ASSIGN((struct cfg80211_chan_def *)NULL);
                __entry->ret = ret;
        ),
        TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", ret: %d",
                  WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->ret)
);

/*
 * rdev_start_p2p_device - instance of the wiphy_wdev_evt class (declared
 * outside this section); takes a wiphy and a wireless device.
 */
DEFINE_EVENT(wiphy_wdev_evt, rdev_start_p2p_device,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_stop_p2p_device - instance of the wiphy_wdev_evt class (declared
 * outside this section); takes a wiphy and a wireless device.
 */
DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_p2p_device,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_start_nan - trace event recording the wiphy, the wireless device and
 * two members of @conf: master_pref (printed in decimal) and bands (printed
 * in hexadecimal).
 *
 * @conf is dereferenced unconditionally, so it must not be NULL.
 */
TRACE_EVENT(rdev_start_nan,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct cfg80211_nan_conf *conf),
        TP_ARGS(wiphy, wdev, conf),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u8, master_pref)
                __field(u8, bands)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->master_pref = conf->master_pref;
                __entry->bands = conf->bands;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
                  ", master preference: %u, bands: 0x%0x",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
                  __entry->bands)
);

/*
 * rdev_nan_change_conf - trace event recording the wiphy, the wireless
 * device, the master_pref and bands members of @conf, and the u32 @changes
 * value (printed in hexadecimal without a "0x" prefix).
 *
 * @conf is dereferenced unconditionally, so it must not be NULL.
 */
TRACE_EVENT(rdev_nan_change_conf,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct cfg80211_nan_conf *conf, u32 changes),
        TP_ARGS(wiphy, wdev, conf, changes),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u8, master_pref)
                __field(u8, bands)
                __field(u32, changes)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->master_pref = conf->master_pref;
                __entry->bands = conf->bands;
                __entry->changes = changes;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
                  ", master preference: %u, bands: 0x%0x, changes: %x",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref,
                  __entry->bands, __entry->changes)
);

/*
 * rdev_stop_nan - instance of the wiphy_wdev_evt class (declared outside
 * this section); takes a wiphy and a wireless device.
 */
DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_add_nan_func - trace event recording the wiphy, the wireless device
 * and two members of @func: its type (stored as u8) and its 64-bit cookie.
 *
 * @func is dereferenced unconditionally, so it must not be NULL.
 */
TRACE_EVENT(rdev_add_nan_func,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 const struct cfg80211_nan_func *func),
        TP_ARGS(wiphy, wdev, func),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u8, func_type)
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->func_type = func->type;
                /*
                 * Terminate the statement explicitly instead of relying on
                 * the semicolon the tracing macros append after the block.
                 */
                __entry->cookie = func->cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", type=%u, cookie=%llu",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->func_type,
                  __entry->cookie)
);

/*
 * rdev_del_nan_func - trace event recording the wiphy, the wireless device
 * and the 64-bit @cookie.
 */
TRACE_EVENT(rdev_del_nan_func,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 u64 cookie),
        TP_ARGS(wiphy, wdev, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie=%llu",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie)
);

/*
 * rdev_set_mac_acl - trace event recording the wiphy, the net device and
 * the acl_policy member of @params.
 *
 * @params is dereferenced unconditionally, so it must not be NULL.  The
 * policy is stored in a u32 field and is therefore printed with %u.
 */
TRACE_EVENT(rdev_set_mac_acl,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_acl_data *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u32, acl_policy)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->acl_policy = params->acl_policy;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", acl policy: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->acl_policy)
);

/*
 * rdev_update_ft_ies - trace event recording the wiphy, the net device, the
 * md member of @ftie and a copy of the ie_len bytes at ftie->ie.
 *
 * @ftie is dereferenced unconditionally, so it must not be NULL.  The
 * copied IE bytes are stored in the record but only "md" appears in the
 * printed output.
 */
TRACE_EVENT(rdev_update_ft_ies,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_update_ft_ies_params *ftie),
        TP_ARGS(wiphy, netdev, ftie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u16, md)
                __dynamic_array(u8, ie, ftie->ie_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->md = ftie->md;
                memcpy(__get_dynamic_array(ie), ftie->ie, ftie->ie_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", md: 0x%x",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->md)
);

/*
 * rdev_crit_proto_start - trace event recording the wiphy, the wireless
 * device, the critical protocol id and the 16-bit @duration.
 *
 * The @protocol enum value is stored in a u16 field named "proto" and
 * printed in hexadecimal; @duration is printed in decimal.
 */
TRACE_EVENT(rdev_crit_proto_start,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 enum nl80211_crit_proto_id protocol, u16 duration),
        TP_ARGS(wiphy, wdev, protocol, duration),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u16, proto)
                __field(u16, duration)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->proto = protocol;
                __entry->duration = duration;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", proto=%x, duration=%u",
                  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->proto, __entry->duration)
);

/*
 * rdev_crit_proto_stop - trace event recording only the wiphy and the
 * wireless device it is invoked with.
 */
TRACE_EVENT(rdev_crit_proto_stop,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
                  WIPHY_PR_ARG, WDEV_PR_ARG)
);

/*
 * rdev_channel_switch - trace event recording the wiphy, the net device and
 * selected members of the channel switch settings in @params: the channel
 * definition, radar_required, block_tx, count, link_id and copies of the
 * beacon and probe response counter offset arrays.
 *
 * @params is dereferenced unconditionally, so it must not be NULL.  The two
 * offset arrays are stored in the record but do not appear in the printed
 * output.
 */
TRACE_EVENT(rdev_channel_switch,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_csa_settings *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
                __field(bool, radar_required)
                __field(bool, block_tx)
                __field(u8, count)
                __dynamic_array(u16, bcn_ofs, params->n_counter_offsets_beacon)
                __dynamic_array(u16, pres_ofs, params->n_counter_offsets_presp)
                __field(u8, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(&params->chandef);
                __entry->radar_required = params->radar_required;
                __entry->block_tx = params->block_tx;
                __entry->count = params->count;
                /* beacon offsets are copied unconditionally */
                memcpy(__get_dynamic_array(bcn_ofs),
                       params->counter_offsets_beacon,
                       params->n_counter_offsets_beacon * sizeof(u16));

                /* probe response offsets are optional */
                if (params->n_counter_offsets_presp)
                        memcpy(__get_dynamic_array(pres_ofs),
                               params->counter_offsets_presp,
                               params->n_counter_offsets_presp * sizeof(u16));
                __entry->link_id = params->link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
                  ", block_tx: %d, count: %u, radar_required: %d, link_id: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
                  __entry->block_tx, __entry->count, __entry->radar_required,
                  __entry->link_id)
);

/*
 * rdev_set_qos_map - trace event recording the wiphy, the net device and
 * the QoS map passed in @qos_map.
 *
 * Only the "num_des" field is printed.
 * NOTE(review): num_des is presumably declared by QOS_MAP_ENTRY and filled
 * by QOS_MAP_ASSIGN(), both defined outside this section - confirm.
 */
TRACE_EVENT(rdev_set_qos_map,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_qos_map *qos_map),
        TP_ARGS(wiphy, netdev, qos_map),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                QOS_MAP_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                QOS_MAP_ASSIGN(qos_map);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", num_des: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->num_des)
);

/*
 * rdev_set_ap_chanwidth - trace event recording the wiphy, the net device,
 * the channel definition @chandef and the link id.
 *
 * @link_id is an unsigned int and is therefore printed with %u, consistent
 * with the wiphy_wdev_link_evt class.
 */
TRACE_EVENT(rdev_set_ap_chanwidth,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 unsigned int link_id,
                 struct cfg80211_chan_def *chandef),
        TP_ARGS(wiphy, netdev, link_id, chandef),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
                __field(unsigned int, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
                __entry->link_id = link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
                  __entry->link_id)
);

/*
 * rdev_add_tx_ts - trace event recording the wiphy, the net device, the
 * peer MAC address, the traffic stream id @tsid, @user_prio (printed as
 * "UP") and @admitted_time (printed as "time").
 */
TRACE_EVENT(rdev_add_tx_ts,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u8 tsid, const u8 *peer, u8 user_prio, u16 admitted_time),
        TP_ARGS(wiphy, netdev, tsid, peer, user_prio, admitted_time),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(u8, tsid)
                __field(u8, user_prio)
                __field(u16, admitted_time)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->tsid = tsid;
                __entry->user_prio = user_prio;
                __entry->admitted_time = admitted_time;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM, TSID %d, UP %d, time %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer,
                  __entry->tsid, __entry->user_prio, __entry->admitted_time)
);

/*
 * rdev_del_tx_ts - trace event recording the wiphy, the net device, the
 * peer MAC address and the traffic stream id @tsid.
 */
TRACE_EVENT(rdev_del_tx_ts,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 u8 tsid, const u8 *peer),
        TP_ARGS(wiphy, netdev, tsid, peer),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(u8, tsid)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->tsid = tsid;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM, TSID %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->tsid)
);

/*
 * rdev_tdls_channel_switch - trace event recording the wiphy, the net
 * device, the peer MAC address @addr, the operating class @oper_class and
 * the target channel definition @chandef.
 *
 * Every field declared in TP_STRUCT__entry must be written in
 * TP_fast_assign: the record lives in the trace ring buffer, so a field
 * that is never assigned is printed with whatever stale bytes happen to be
 * there.  @oper_class is therefore stored explicitly.
 */
TRACE_EVENT(rdev_tdls_channel_switch,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *addr, u8 oper_class,
                 struct cfg80211_chan_def *chandef),
        TP_ARGS(wiphy, netdev, addr, oper_class, chandef),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(addr)
                __field(u8, oper_class)
                CHAN_DEF_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(addr, addr);
                __entry->oper_class = oper_class;
                CHAN_DEF_ASSIGN(chandef);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM"
                  " oper class %d, " CHAN_DEF_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->addr,
                  __entry->oper_class, CHAN_DEF_PR_ARG)
);

/*
 * rdev_tdls_cancel_channel_switch - trace event recording the wiphy, the
 * net device and the MAC address @addr.
 */
TRACE_EVENT(rdev_tdls_cancel_channel_switch,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *addr),
        TP_ARGS(wiphy, netdev, addr),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(addr)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(addr, addr);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->addr)
);

/*
 * rdev_set_pmk - trace event recording the wiphy, the net device and the
 * contents of @pmk_conf: the "aa" MAC address, the PMK length, a copy of
 * the PMK bytes and, when present, a copy of the PMK-R0 name.
 *
 * @pmk_conf is dereferenced unconditionally, so it must not be NULL.
 * pmk_conf->pmk_r0_name is optional: when it is NULL the recorded
 * pmk_r0_name_len is 0, nothing is copied, and an empty string is printed
 * in its place.  Space for WLAN_PMK_NAME_LEN bytes is reserved in the
 * record either way.
 *
 * NOTE(review): the PMK bytes are written to the trace buffer and printed
 * in full - confirm that exposing key material to trace consumers is
 * intended.
 */
TRACE_EVENT(rdev_set_pmk,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_pmk_conf *pmk_conf),

        TP_ARGS(wiphy, netdev, pmk_conf),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(aa)
                __field(u8, pmk_len)
                __field(u8, pmk_r0_name_len)
                __dynamic_array(u8, pmk, pmk_conf->pmk_len)
                __dynamic_array(u8, pmk_r0_name, WLAN_PMK_NAME_LEN)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(aa, pmk_conf->aa);
                __entry->pmk_len = pmk_conf->pmk_len;
                __entry->pmk_r0_name_len =
                        pmk_conf->pmk_r0_name ? WLAN_PMK_NAME_LEN : 0;
                memcpy(__get_dynamic_array(pmk), pmk_conf->pmk,
                       pmk_conf->pmk_len);
                /* skip the copy entirely rather than pass memcpy() a NULL source */
                if (pmk_conf->pmk_r0_name)
                        memcpy(__get_dynamic_array(pmk_r0_name),
                               pmk_conf->pmk_r0_name, WLAN_PMK_NAME_LEN);
        ),

        /* ", " after %pM keeps the address from running into "pmk_len=" */
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM"
                  ", pmk_len=%u, pmk: %s pmk_r0_name: %s", WIPHY_PR_ARG,
                  NETDEV_PR_ARG, __entry->aa, __entry->pmk_len,
                  __print_array(__get_dynamic_array(pmk),
                                __get_dynamic_array_len(pmk), 1),
                  __entry->pmk_r0_name_len ?
                  __print_array(__get_dynamic_array(pmk_r0_name),
                                __get_dynamic_array_len(pmk_r0_name), 1) : "")
);

/*
 * rdev_del_pmk - trace event recording the wiphy, the net device and the
 * MAC address @aa.
 */
TRACE_EVENT(rdev_del_pmk,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *aa),

        TP_ARGS(wiphy, netdev, aa),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(aa)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(aa, aa);
        ),

        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->aa)
);

/*
 * rdev_external_auth - trace event taking a wiphy, a netdev and external
 * authentication parameters.
 *
 * Records params->bssid, params->ssid, params->status and params->mld_addr
 * next to the wiphy/netdev data.  The SSID buffer is one byte larger than
 * IEEE80211_MAX_SSID_LEN and is zeroed before the copy, so it can be printed
 * with %s as long as ssid_len does not exceed IEEE80211_MAX_SSID_LEN.
 */
TRACE_EVENT(rdev_external_auth,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_external_auth_params *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                __array(u8, ssid, IEEE80211_MAX_SSID_LEN + 1)
                __field(u16, status)
                MAC_ENTRY(mld_addr)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, params->bssid);
                /* clear the whole buffer, then overlay the SSID bytes */
                memset(__entry->ssid, 0, sizeof(__entry->ssid));
                memcpy(__entry->ssid, params->ssid.ssid,
                       params->ssid.ssid_len);
                __entry->status = params->status;
                MAC_ASSIGN(mld_addr, params->mld_addr);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM"
                  ", ssid: %s, status: %u, mld_addr: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->bssid,
                  __entry->ssid, __entry->status, __entry->mld_addr)
);

/*
 * rdev_start_radar_detection - trace event taking a wiphy, a netdev, a
 * channel definition and a CAC time in milliseconds.
 * Records the channel definition through CHAN_DEF_ASSIGN and @cac_time_ms
 * as a u32; both are printed together with the wiphy/netdev data.
 */
TRACE_EVENT(rdev_start_radar_detection,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_chan_def *chandef,
                 u32 cac_time_ms),
        TP_ARGS(wiphy, netdev, chandef, cac_time_ms),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
                __field(u32, cac_time_ms)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
                __entry->cac_time_ms = cac_time_ms;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
                  ", cac_time_ms=%u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
                  __entry->cac_time_ms)
);

/*
 * rdev_set_mcast_rate - trace event taking a wiphy, a netdev and an array
 * of multicast rates.
 *
 * Copies NUM_NL80211_BANDS ints from @mcast_rate into the record.  Only the
 * 2 GHz, 5 GHz, 6 GHz and 60 GHz entries are printed (in hex).
 */
TRACE_EVENT(rdev_set_mcast_rate,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 int *mcast_rate),
        TP_ARGS(wiphy, netdev, mcast_rate),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __array(int, mcast_rate, NUM_NL80211_BANDS)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                /* the record array has exactly NUM_NL80211_BANDS ints */
                memcpy(__entry->mcast_rate, mcast_rate,
                       sizeof(__entry->mcast_rate));
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", "
                  "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 6GHz=0x%x, 60GHz=0x%x]",
                  WIPHY_PR_ARG,
                  NETDEV_PR_ARG,
                  __entry->mcast_rate[NL80211_BAND_2GHZ],
                  __entry->mcast_rate[NL80211_BAND_5GHZ],
                  __entry->mcast_rate[NL80211_BAND_6GHZ],
                  __entry->mcast_rate[NL80211_BAND_60GHZ])
);

/*
 * rdev_set_coalesce - trace event taking a wiphy and a coalesce
 * configuration.
 *
 * Records the number of rules; a NULL @coalesce is recorded as 0 rules.
 */
TRACE_EVENT(rdev_set_coalesce,
        TP_PROTO(struct wiphy *wiphy, struct cfg80211_coalesce *coalesce),
        TP_ARGS(wiphy, coalesce),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, n_rules)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                if (coalesce)
                        __entry->n_rules = coalesce->n_rules;
                else
                        __entry->n_rules = 0;
        ),
        TP_printk(WIPHY_PR_FMT ", n_rules=%d",
                  WIPHY_PR_ARG,
                  __entry->n_rules)
);

/*
 * rdev_abort_scan - instance of the wiphy_wdev_evt event class (declared
 * outside this part of the file); takes a wiphy and a wireless_dev.
 */
DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_set_multicast_to_unicast - trace event taking a wiphy, a netdev and
 * a boolean @enabled.
 * The flag is stored as a bool and printed through BOOL_TO_STR under the
 * "unicast:" label.
 */
TRACE_EVENT(rdev_set_multicast_to_unicast,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const bool enabled),
        TP_ARGS(wiphy, netdev, enabled),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(bool, enabled)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->enabled = enabled;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", unicast: %s",
                  WIPHY_PR_ARG, NETDEV_PR_ARG,
                  BOOL_TO_STR(__entry->enabled))
);

/*
 * rdev_get_txq_stats - instance of the wiphy_wdev_evt event class (declared
 * outside this part of the file); takes a wiphy and a wireless_dev.
 */
DEFINE_EVENT(wiphy_wdev_evt, rdev_get_txq_stats,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev)
);

/*
 * rdev_get_ftm_responder_stats - trace event taking a wiphy, a netdev and
 * FTM responder statistics.
 *
 * Copies the counters from @ftm_stats into the record and prints them after
 * the wiphy/netdev data.  'duration' holds ftm_stats->total_duration_ms.
 */
TRACE_EVENT(rdev_get_ftm_responder_stats,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_ftm_responder_stats *ftm_stats),

        TP_ARGS(wiphy, netdev, ftm_stats),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u64, timestamp)
                __field(u32, success_num)
                __field(u32, partial_num)
                __field(u32, failed_num)
                __field(u32, asap_num)
                __field(u32, non_asap_num)
                __field(u64, duration)
                __field(u32, unknown_triggers)
                __field(u32, reschedule)
                __field(u32, out_of_window)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                /*
                 * Nothing in this event supplies a timestamp; the field is
                 * kept so the record layout does not change, and zeroed so
                 * it never carries stale ring-buffer bytes.
                 * NOTE(review): drop the field if no consumer relies on it.
                 */
                __entry->timestamp = 0;
                __entry->success_num = ftm_stats->success_num;
                __entry->partial_num = ftm_stats->partial_num;
                __entry->failed_num = ftm_stats->failed_num;
                __entry->asap_num = ftm_stats->asap_num;
                __entry->non_asap_num = ftm_stats->non_asap_num;
                __entry->duration = ftm_stats->total_duration_ms;
                __entry->unknown_triggers = ftm_stats->unknown_triggers_num;
                __entry->reschedule = ftm_stats->reschedule_requests_num;
                __entry->out_of_window = ftm_stats->out_of_window_triggers_num;
        ),

        /* separators added and the recorded netdev printed, as elsewhere */
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT
                ", Ftm responder stats: success %u, partial %u, "
                "failed %u, asap %u, non asap %u, total duration %llu, unknown "
                "triggers %u, rescheduled %u, out of window %u",
                WIPHY_PR_ARG, NETDEV_PR_ARG,
                __entry->success_num, __entry->partial_num, __entry->failed_num,
                __entry->asap_num, __entry->non_asap_num, __entry->duration,
                __entry->unknown_triggers, __entry->reschedule,
                __entry->out_of_window)
);

/*
 * rdev_start_pmsr - instance of the wiphy_wdev_cookie_evt event class
 * (declared outside this part of the file); takes a wiphy, a wireless_dev
 * and a 64-bit cookie.
 */
DEFINE_EVENT(wiphy_wdev_cookie_evt, rdev_start_pmsr,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
        TP_ARGS(wiphy, wdev, cookie)
);

/*
 * rdev_abort_pmsr - instance of the wiphy_wdev_cookie_evt event class
 * (declared outside this part of the file); takes a wiphy, a wireless_dev
 * and a 64-bit cookie.
 */
DEFINE_EVENT(wiphy_wdev_cookie_evt, rdev_abort_pmsr,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
        TP_ARGS(wiphy, wdev, cookie)
);

/*
 * rdev_set_fils_aad - trace event taking a wiphy, a netdev and a FILS AAD
 * structure.
 * The record has an ETH_ALEN-byte 'macaddr' array and a u8 'kek_len'; both
 * are filled by the FILS_AAD_ASSIGN helper macro (defined elsewhere in this
 * file) and printed using FILS_AAD_PR_FMT.
 */
TRACE_EVENT(rdev_set_fils_aad,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_fils_aad *fils_aad),
        TP_ARGS(wiphy, netdev, fils_aad),
        TP_STRUCT__entry(WIPHY_ENTRY
                NETDEV_ENTRY
                __array(u8, macaddr, ETH_ALEN)
                __field(u8, kek_len)
        ),
        TP_fast_assign(WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                FILS_AAD_ASSIGN(fils_aad);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " FILS_AAD_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->macaddr,
                  __entry->kek_len)
);

/*
 * rdev_update_owe_info - trace event taking a wiphy, a netdev and an OWE
 * info update.
 * Records owe_info->peer, owe_info->status and a copy of the ie_len bytes
 * at owe_info->ie.  Only the peer address and status are printed; the IE
 * bytes are stored in the record but not formatted.
 */
TRACE_EVENT(rdev_update_owe_info,
            TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                     struct cfg80211_update_owe_info *owe_info),
            TP_ARGS(wiphy, netdev, owe_info),
            TP_STRUCT__entry(WIPHY_ENTRY
                             NETDEV_ENTRY
                             MAC_ENTRY(peer)
                             __field(u16, status)
                             __dynamic_array(u8, ie, owe_info->ie_len)),
            TP_fast_assign(WIPHY_ASSIGN;
                           NETDEV_ASSIGN;
                           MAC_ASSIGN(peer, owe_info->peer);
                           __entry->status = owe_info->status;
                           memcpy(__get_dynamic_array(ie),
                                  owe_info->ie, owe_info->ie_len);),
            TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM"
                  " status %d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer,
                  __entry->status)
);

/*
 * rdev_probe_mesh_link - trace event taking a wiphy, a netdev, a destination
 * address and a buffer with its length.
 * Only the wiphy/netdev data and @dest are recorded; @buf and @len are part
 * of the prototype but are not stored in the record.
 */
TRACE_EVENT(rdev_probe_mesh_link,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *dest, const u8 *buf, size_t len),
        TP_ARGS(wiphy, netdev, dest, buf, len),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(dest)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(dest, dest);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->dest)
);

/*
 * rdev_set_tid_config - trace event taking a wiphy, a netdev and a TID
 * configuration.
 * Only tid_conf->peer is recorded from the configuration, next to the
 * wiphy/netdev data.
 */
TRACE_EVENT(rdev_set_tid_config,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_tid_config *tid_conf),
        TP_ARGS(wiphy, netdev, tid_conf),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, tid_conf->peer);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer)
);

/*
 * rdev_reset_tid_config - trace event taking a wiphy, a netdev, a peer
 * address and a u8 @tids value.
 * Records @peer and @tids; @tids is printed in hex.
 */
TRACE_EVENT(rdev_reset_tid_config,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 const u8 *peer, u8 tids),
        TP_ARGS(wiphy, netdev, peer, tids),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(u8, tids)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->tids = tids;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM, tids: 0x%x",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->tids)
);

/*
 * rdev_set_sar_specs - trace event taking a wiphy and SAR specifications.
 * Records sar->type and sar->num_sub_specs, each as a u16.
 */
TRACE_EVENT(rdev_set_sar_specs,
        TP_PROTO(struct wiphy *wiphy, struct cfg80211_sar_specs *sar),
        TP_ARGS(wiphy, sar),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(u16, type)
                __field(u16, num)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->type = sar->type;
                __entry->num = sar->num_sub_specs;

        ),
        TP_printk(WIPHY_PR_FMT ", Set type:%d, num_specs:%d",
                  WIPHY_PR_ARG, __entry->type, __entry->num)
);

/*
 * rdev_color_change - trace event taking a wiphy, a netdev and color change
 * settings.
 * Records params->count, both counter offsets (beacon and probe response)
 * and params->link_id.  Only count and link_id are printed; the two offsets
 * are stored in the record but not formatted.
 */
TRACE_EVENT(rdev_color_change,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_color_change_settings *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __field(u8, count)
                __field(u16, bcn_ofs)
                __field(u16, pres_ofs)
                __field(u8, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                __entry->count = params->count;
                __entry->bcn_ofs = params->counter_offset_beacon;
                __entry->pres_ofs = params->counter_offset_presp;
                __entry->link_id = params->link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT
                  ", count: %u, link_id: %d",
                  WIPHY_PR_ARG, NETDEV_PR_ARG,
                  __entry->count, __entry->link_id)
);

/*
 * rdev_set_radar_background - trace event taking a wiphy and a channel
 * definition.
 * Records the wiphy data and the channel definition (CHAN_DEF_ASSIGN) and
 * prints both.
 */
TRACE_EVENT(rdev_set_radar_background,
        TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef),

        TP_ARGS(wiphy, chandef),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_DEF_ENTRY
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                /* terminated explicitly, like every other CHAN_DEF_ASSIGN use */
                CHAN_DEF_ASSIGN(chandef);
        ),

        TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT,
                  WIPHY_PR_ARG, CHAN_DEF_PR_ARG)
);

/*
 * rdev_add_intf_link - instance of the wiphy_wdev_link_evt event class
 * (declared outside this part of the file); takes a wiphy, a wireless_dev
 * and an unsigned link id.
 */
DEFINE_EVENT(wiphy_wdev_link_evt, rdev_add_intf_link,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 unsigned int link_id),
        TP_ARGS(wiphy, wdev, link_id)
);

/*
 * rdev_del_intf_link - instance of the wiphy_wdev_link_evt event class
 * (declared outside this part of the file); takes a wiphy, a wireless_dev
 * and an unsigned link id.
 */
DEFINE_EVENT(wiphy_wdev_link_evt, rdev_del_intf_link,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 unsigned int link_id),
        TP_ARGS(wiphy, wdev, link_id)
);

/*
 * rdev_del_link_station - trace event taking a wiphy, a netdev and link
 * station deletion parameters.
 *
 * Records the 6-byte MLD MAC address and the link id.  A NULL
 * params->mld_mac is recorded as an all-zero address.
 */
TRACE_EVENT(rdev_del_link_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct link_station_del_parameters *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __array(u8, mld_mac, 6)
                __field(u32, link_id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                /* copy the address when present, otherwise store zeroes */
                if (params->mld_mac)
                        memcpy(__entry->mld_mac, params->mld_mac,
                               sizeof(__entry->mld_mac));
                else
                        memset(__entry->mld_mac, 0,
                               sizeof(__entry->mld_mac));
                __entry->link_id = params->link_id;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM"
                  ", link id: %u",
                  WIPHY_PR_ARG,
                  NETDEV_PR_ARG,
                  __entry->mld_mac,
                  __entry->link_id)
);

/*
 * rdev_set_hw_timestamp - trace event taking a wiphy, a netdev and a
 * hardware timestamp configuration.
 * Records hwts->macaddr and hwts->enable; the bool is printed with %u.
 */
TRACE_EVENT(rdev_set_hw_timestamp,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_set_hw_timestamp *hwts),

        TP_ARGS(wiphy, netdev, hwts),

        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(macaddr)
                __field(bool, enable)
        ),

        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(macaddr, hwts->macaddr);
                __entry->enable = hwts->enable;
        ),

        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mac %pM, enable: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->macaddr,
                  __entry->enable)
);

/*
 * rdev_set_ttlm - trace event taking a wiphy, a netdev and TTLM parameters.
 * Copies params->dlink and params->ulink into two byte arrays of
 * sizeof(u16) * 8 bytes each.  Only the wiphy/netdev data is printed; the
 * two maps are stored in the record but not formatted.
 * NOTE(review): the copies use sizeof(params->dlink/ulink) - assumes those
 * arrays are exactly sizeof(u16) * 8 bytes; confirm against the struct.
 */
TRACE_EVENT(rdev_set_ttlm,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_ttlm_params *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __array(u8, dlink, sizeof(u16) * 8)
                __array(u8, ulink, sizeof(u16) * 8)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                memcpy(__entry->dlink, params->dlink, sizeof(params->dlink));
                memcpy(__entry->ulink, params->ulink, sizeof(params->ulink));
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT,
                  WIPHY_PR_ARG, NETDEV_PR_ARG)
);

/*************************************************************
 *             cfg80211 exported functions traces                     *
 *************************************************************/

/*
 * cfg80211_return_bool - trace event recording a single boolean @ret,
 * printed as "returned <BOOL_TO_STR(ret)>".
 */
TRACE_EVENT(cfg80211_return_bool,
        TP_PROTO(bool ret),
        TP_ARGS(ret),
        TP_STRUCT__entry(
                __field(bool, ret)
        ),
        TP_fast_assign(
                __entry->ret = ret;
        ),
        TP_printk("returned %s", BOOL_TO_STR(__entry->ret))
);

/*
 * cfg80211_netdev_mac_evt - event class for events that take a netdev and a
 * MAC address.  Records both and prints "<netdev>, mac: <address>".
 */
DECLARE_EVENT_CLASS(cfg80211_netdev_mac_evt,
        TP_PROTO(struct net_device *netdev, const u8 *macaddr),
        TP_ARGS(netdev, macaddr),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(macaddr)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(macaddr, macaddr);
        ),
        TP_printk(NETDEV_PR_FMT ", mac: %pM",
                  NETDEV_PR_ARG, __entry->macaddr)
);

/*
 * cfg80211_notify_new_peer_candidate - instance of the
 * cfg80211_netdev_mac_evt class; takes a netdev and a MAC address.
 */
DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_notify_new_peer_candidate,
        TP_PROTO(struct net_device *netdev, const u8 *macaddr),
        TP_ARGS(netdev, macaddr)
);

/*
 * netdev_evt_only - event class for events whose only argument is a netdev.
 * Records and prints just the netdev data.
 */
DECLARE_EVENT_CLASS(netdev_evt_only,
        TP_PROTO(struct net_device *netdev),
        TP_ARGS(netdev),
        TP_STRUCT__entry(
                NETDEV_ENTRY
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
        ),
        TP_printk(NETDEV_PR_FMT , NETDEV_PR_ARG)
);

/*
 * cfg80211_send_rx_auth - instance of the netdev_evt_only class; takes only
 * a netdev.
 */
DEFINE_EVENT(netdev_evt_only, cfg80211_send_rx_auth,
        TP_PROTO(struct net_device *netdev),
        TP_ARGS(netdev)
);

/*
 * cfg80211_send_rx_assoc - trace event taking a netdev and association
 * response data.
 * Records one address: data->ap_mld_addr when it is non-NULL, otherwise the
 * BSSID of data->links[0].bss.
 */
TRACE_EVENT(cfg80211_send_rx_assoc,
        TP_PROTO(struct net_device *netdev,
                 const struct cfg80211_rx_assoc_resp_data *data),
        TP_ARGS(netdev, data),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(ap_addr)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(ap_addr,
                           data->ap_mld_addr ?: data->links[0].bss->bssid);
        ),
        TP_printk(NETDEV_PR_FMT ", %pM",
                  NETDEV_PR_ARG, __entry->ap_addr)
);

/*
 * netdev_frame_event - event class for events that take a netdev and a
 * frame buffer of @len bytes.
 * The whole buffer is copied into the record.  When printed, the first two
 * bytes of the stored frame are read as a little-endian 16-bit value and
 * shown as "ftype".
 * NOTE(review): the print step assumes len >= 2 - confirm callers never
 * pass a shorter buffer.
 */
DECLARE_EVENT_CLASS(netdev_frame_event,
        TP_PROTO(struct net_device *netdev, const u8 *buf, int len),
        TP_ARGS(netdev, buf, len),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __dynamic_array(u8, frame, len)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                memcpy(__get_dynamic_array(frame), buf, len);
        ),
        TP_printk(NETDEV_PR_FMT ", ftype:0x%.2x",
                  NETDEV_PR_ARG,
                  le16_to_cpup((__le16 *)__get_dynamic_array(frame)))
);

/*
 * cfg80211_rx_unprot_mlme_mgmt - instance of the netdev_frame_event class;
 * takes a netdev and a frame buffer with its length.
 */
DEFINE_EVENT(netdev_frame_event, cfg80211_rx_unprot_mlme_mgmt,
        TP_PROTO(struct net_device *netdev, const u8 *buf, int len),
        TP_ARGS(netdev, buf, len)
);

/*
 * cfg80211_rx_mlme_mgmt - instance of the netdev_frame_event class; takes a
 * netdev and a frame buffer with its length.
 */
DEFINE_EVENT(netdev_frame_event, cfg80211_rx_mlme_mgmt,
        TP_PROTO(struct net_device *netdev, const u8 *buf, int len),
        TP_ARGS(netdev, buf, len)
);

/*
 * cfg80211_tx_mlme_mgmt - trace event taking a netdev, a frame buffer of
 * @len bytes and a @reconnect flag.
 * The whole buffer is copied into the record and @reconnect is stored as an
 * int.  When printed, the first two bytes of the stored frame are read as a
 * little-endian 16-bit value and shown as "ftype".
 * NOTE(review): the print step assumes len >= 2 - confirm callers never
 * pass a shorter buffer.
 */
TRACE_EVENT(cfg80211_tx_mlme_mgmt,
        TP_PROTO(struct net_device *netdev, const u8 *buf, int len,
                 bool reconnect),
        TP_ARGS(netdev, buf, len, reconnect),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __dynamic_array(u8, frame, len)
                __field(int, reconnect)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                memcpy(__get_dynamic_array(frame), buf, len);
                __entry->reconnect = reconnect;
        ),
        TP_printk(NETDEV_PR_FMT ", ftype:0x%.2x reconnect:%d",
                  NETDEV_PR_ARG,
                  le16_to_cpup((__le16 *)__get_dynamic_array(frame)),
                  __entry->reconnect)
);

/*
 * netdev_mac_evt - event class for events that take a netdev and a MAC
 * address.  Records both and prints "<netdev>, mac: <address>".
 */
DECLARE_EVENT_CLASS(netdev_mac_evt,
        TP_PROTO(struct net_device *netdev, const u8 *mac),
        TP_ARGS(netdev, mac),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(mac)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                /* terminated explicitly, like every other MAC_ASSIGN use */
                MAC_ASSIGN(mac, mac);
        ),
        TP_printk(NETDEV_PR_FMT ", mac: %pM",
                  NETDEV_PR_ARG, __entry->mac)
);

/*
 * cfg80211_send_auth_timeout - instance of the netdev_mac_evt class; takes
 * a netdev and a MAC address.
 */
DEFINE_EVENT(netdev_mac_evt, cfg80211_send_auth_timeout,
        TP_PROTO(struct net_device *netdev, const u8 *mac),
        TP_ARGS(netdev, mac)
);

/*
 * cfg80211_send_assoc_failure - trace event taking a netdev and association
 * failure data.
 * Records one address - data->ap_mld_addr when it is non-NULL, otherwise
 * the BSSID of data->bss[0] - and the data->timeout flag.
 */
TRACE_EVENT(cfg80211_send_assoc_failure,
        TP_PROTO(struct net_device *netdev,
                 struct cfg80211_assoc_failure *data),
        TP_ARGS(netdev, data),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(ap_addr)
                __field(bool, timeout)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(ap_addr, data->ap_mld_addr ?: data->bss[0]->bssid);
                __entry->timeout = data->timeout;
        ),
        TP_printk(NETDEV_PR_FMT ", mac: %pM, timeout: %d",
                  NETDEV_PR_ARG, __entry->ap_addr, __entry->timeout)
);

/*
 * cfg80211_michael_mic_failure - trace event taking a netdev, an address,
 * a key type, a key id and an optional 6-byte @tsc.
 *
 * Records all of them.  @tsc may be NULL; it is then recorded as six zero
 * bytes, because the field is always printed (%pm) and must not expose
 * whatever was left in the ring buffer.
 */
TRACE_EVENT(cfg80211_michael_mic_failure,
        TP_PROTO(struct net_device *netdev, const u8 *addr,
                 enum nl80211_key_type key_type, int key_id, const u8 *tsc),
        TP_ARGS(netdev, addr, key_type, key_id, tsc),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(addr)
                __field(enum nl80211_key_type, key_type)
                __field(int, key_id)
                __array(u8, tsc, 6)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(addr, addr);
                __entry->key_type = key_type;
                __entry->key_id = key_id;
                if (tsc)
                        memcpy(__entry->tsc, tsc, sizeof(__entry->tsc));
                else
                        memset(__entry->tsc, 0, sizeof(__entry->tsc));
        ),
        TP_printk(NETDEV_PR_FMT ", %pM, key type: %d, key id: %d, tsc: %pm",
                  NETDEV_PR_ARG, __entry->addr, __entry->key_type,
                  __entry->key_id, __entry->tsc)
);

/*
 * cfg80211_ready_on_channel - trace event taking a wireless_dev, a 64-bit
 * cookie, a channel and a duration.
 * Records the wdev data (WDEV_ASSIGN), the cookie, the channel
 * (CHAN_ASSIGN) and the duration, and prints all of them.
 */
TRACE_EVENT(cfg80211_ready_on_channel,
        TP_PROTO(struct wireless_dev *wdev, u64 cookie,
                 struct ieee80211_channel *chan,
                 unsigned int duration),
        TP_ARGS(wdev, cookie, chan, duration),
        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(u64, cookie)
                CHAN_ENTRY
                __field(unsigned int, duration)
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                __entry->cookie = cookie;
                CHAN_ASSIGN(chan);
                __entry->duration = duration;
        ),
        TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT ", duration: %u",
                  WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG,
                  __entry->duration)
);

/*
 * cfg80211_ready_on_channel_expired - trace event taking a wireless_dev, a
 * 64-bit cookie and a channel.
 * Records and prints the wdev data, the cookie and the channel.
 */
TRACE_EVENT(cfg80211_ready_on_channel_expired,
        TP_PROTO(struct wireless_dev *wdev, u64 cookie,
                 struct ieee80211_channel *chan),
        TP_ARGS(wdev, cookie, chan),
        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(u64, cookie)
                CHAN_ENTRY
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                __entry->cookie = cookie;
                CHAN_ASSIGN(chan);
        ),
        TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT,
                  WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG)
);

/*
 * cfg80211_tx_mgmt_expired - trace event taking a wireless_dev, a 64-bit
 * cookie and a channel.
 * Records and prints the wdev data, the cookie and the channel; the layout
 * matches cfg80211_ready_on_channel_expired.
 */
TRACE_EVENT(cfg80211_tx_mgmt_expired,
        TP_PROTO(struct wireless_dev *wdev, u64 cookie,
                 struct ieee80211_channel *chan),
        TP_ARGS(wdev, cookie, chan),
        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(u64, cookie)
                CHAN_ENTRY
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                __entry->cookie = cookie;
                CHAN_ASSIGN(chan);
        ),
        TP_printk(WDEV_PR_FMT ", cookie: %llu, " CHAN_PR_FMT,
                  WDEV_PR_ARG, __entry->cookie, CHAN_PR_ARG)
);

/*
 * cfg80211_new_sta - trace event taking a netdev, a MAC address and a
 * station_info.
 * Records the netdev data, the address and whatever SINFO_ENTRY /
 * SINFO_ASSIGN (helper macros defined elsewhere in this file) capture from
 * @sinfo.  Only the netdev data and the address are printed.
 */
TRACE_EVENT(cfg80211_new_sta,
        TP_PROTO(struct net_device *netdev, const u8 *mac_addr,
                 struct station_info *sinfo),
        TP_ARGS(netdev, mac_addr, sinfo),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(mac_addr)
                SINFO_ENTRY
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(mac_addr, mac_addr);
                SINFO_ASSIGN;
        ),
        TP_printk(NETDEV_PR_FMT ", %pM",
                  NETDEV_PR_ARG, __entry->mac_addr)
);

/*
 * cfg80211_del_sta - instance of the cfg80211_netdev_mac_evt class; takes a
 * netdev and a MAC address.
 */
DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_del_sta,
        TP_PROTO(struct net_device *netdev, const u8 *macaddr),
        TP_ARGS(netdev, macaddr)
);

/*
 * cfg80211_rx_mgmt - trace event taking a wireless_dev and RX info.
 * Records info->freq and info->sig_dbm as ints.  The frequency is printed
 * through the KHZ_F / PR_KHZ helper macros (defined elsewhere in this
 * file).
 */
TRACE_EVENT(cfg80211_rx_mgmt,
        TP_PROTO(struct wireless_dev *wdev, struct cfg80211_rx_info *info),
        TP_ARGS(wdev, info),
        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(int, freq)
                __field(int, sig_dbm)
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                __entry->freq = info->freq;
                __entry->sig_dbm = info->sig_dbm;
        ),
        TP_printk(WDEV_PR_FMT ", freq: "KHZ_F", sig dbm: %d",
                  WDEV_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm)
);

/*
 * cfg80211_mgmt_tx_status - trace event taking a wireless_dev, a 64-bit
 * cookie and an @ack flag.
 * Records all three; @ack is printed through BOOL_TO_STR.
 */
TRACE_EVENT(cfg80211_mgmt_tx_status,
        TP_PROTO(struct wireless_dev *wdev, u64 cookie, bool ack),
        TP_ARGS(wdev, cookie, ack),
        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(u64, cookie)
                __field(bool, ack)
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                __entry->cookie = cookie;
                __entry->ack = ack;
        ),
        TP_printk(WDEV_PR_FMT", cookie: %llu, ack: %s",
                  WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack))
);

/*
 * cfg80211_control_port_tx_status - trace event taking a wireless_dev, a
 * 64-bit cookie and an @ack flag.
 * Records all three; the layout and output format match
 * cfg80211_mgmt_tx_status.
 */
TRACE_EVENT(cfg80211_control_port_tx_status,
        TP_PROTO(struct wireless_dev *wdev, u64 cookie, bool ack),
        TP_ARGS(wdev, cookie, ack),
        TP_STRUCT__entry(
                WDEV_ENTRY
                __field(u64, cookie)
                __field(bool, ack)
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                __entry->cookie = cookie;
                __entry->ack = ack;
        ),
        TP_printk(WDEV_PR_FMT", cookie: %llu, ack: %s",
                  WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack))
);

/*
 * cfg80211_rx_control_port - trace event taking a netdev, an skb, an
 * @unencrypted flag and a link id.
 * Records skb->len, the source address from the skb's Ethernet header
 * (eth_hdr(skb)->h_source), skb->protocol converted from big-endian to CPU
 * order, the flag and the link id.
 */
TRACE_EVENT(cfg80211_rx_control_port,
        TP_PROTO(struct net_device *netdev, struct sk_buff *skb,
                 bool unencrypted, int link_id),
        TP_ARGS(netdev, skb, unencrypted, link_id),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(int, len)
                MAC_ENTRY(from)
                __field(u16, proto)
                __field(bool, unencrypted)
                __field(int, link_id)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->len = skb->len;
                MAC_ASSIGN(from, eth_hdr(skb)->h_source);
                __entry->proto = be16_to_cpu(skb->protocol);
                __entry->unencrypted = unencrypted;
                __entry->link_id = link_id;
        ),
        TP_printk(NETDEV_PR_FMT ", len=%d, %pM, proto: 0x%x, unencrypted: %s, link: %d",
                  NETDEV_PR_ARG, __entry->len, __entry->from,
                  __entry->proto, BOOL_TO_STR(__entry->unencrypted),
                  __entry->link_id)
);

/*
 * cfg80211_cqm_rssi_notify - trace event taking a netdev, an RSSI threshold
 * event and an RSSI level.
 * Records the enum value and the s32 level; both are printed as decimal
 * integers.
 */
TRACE_EVENT(cfg80211_cqm_rssi_notify,
        TP_PROTO(struct net_device *netdev,
                 enum nl80211_cqm_rssi_threshold_event rssi_event,
                 s32 rssi_level),
        TP_ARGS(netdev, rssi_event, rssi_level),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(enum nl80211_cqm_rssi_threshold_event, rssi_event)
                __field(s32, rssi_level)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->rssi_event = rssi_event;
                __entry->rssi_level = rssi_level;
        ),
        TP_printk(NETDEV_PR_FMT ", rssi event: %d, level: %d",
                  NETDEV_PR_ARG, __entry->rssi_event, __entry->rssi_level)
);

/*
 * cfg80211_reg_can_beacon - trace event taking a wiphy, a channel
 * definition, an interface type and a @check_no_ir flag.
 * Records all four; the interface type is printed as a decimal integer and
 * the flag through BOOL_TO_STR.
 */
TRACE_EVENT(cfg80211_reg_can_beacon,
        TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef,
                 enum nl80211_iftype iftype, bool check_no_ir),
        TP_ARGS(wiphy, chandef, iftype, check_no_ir),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_DEF_ENTRY
                __field(enum nl80211_iftype, iftype)
                __field(bool, check_no_ir)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
                __entry->iftype = iftype;
                __entry->check_no_ir = check_no_ir;
        ),
        TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", iftype=%d check_no_ir=%s",
                  WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->iftype,
                  BOOL_TO_STR(__entry->check_no_ir))
);

/*
 * cfg80211_chandef_dfs_required - trace event taking a wiphy and a channel
 * definition; records and prints both.
 */
TRACE_EVENT(cfg80211_chandef_dfs_required,
        TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef),
        TP_ARGS(wiphy, chandef),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_DEF_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
        ),
        TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT,
                  WIPHY_PR_ARG, CHAN_DEF_PR_ARG)
);

/*
 * cfg80211_ch_switch_notify - trace event taking a netdev, a channel
 * definition and a link id.
 * Records and prints all three.  The link id is an unsigned int, so it is
 * printed with %u.
 */
TRACE_EVENT(cfg80211_ch_switch_notify,
        TP_PROTO(struct net_device *netdev,
                 struct cfg80211_chan_def *chandef,
                 unsigned int link_id),
        TP_ARGS(netdev, chandef, link_id),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
                __field(unsigned int, link_id)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
                __entry->link_id = link_id;
        ),
        TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%u",
                  NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id)
);

/*
 * cfg80211_ch_switch_started_notify - trace event taking a netdev, a
 * channel definition and a link id.
 * Records and prints all three.  The link id is an unsigned int, so it is
 * printed with %u.
 */
TRACE_EVENT(cfg80211_ch_switch_started_notify,
        TP_PROTO(struct net_device *netdev,
                 struct cfg80211_chan_def *chandef,
                 unsigned int link_id),
        TP_ARGS(netdev, chandef, link_id),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                CHAN_DEF_ENTRY
                __field(unsigned int, link_id)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
                __entry->link_id = link_id;
        ),
        TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT ", link:%u",
                  NETDEV_PR_ARG, CHAN_DEF_PR_ARG, __entry->link_id)
);

/*
 * cfg80211_radar_event - trace event taking a wiphy, a channel definition
 * and an @offchan flag.
 * Records all three; the flag is printed as a decimal integer.
 */
TRACE_EVENT(cfg80211_radar_event,
        TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef,
                 bool offchan),
        TP_ARGS(wiphy, chandef, offchan),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_DEF_ENTRY
                __field(bool, offchan)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_DEF_ASSIGN(chandef);
                __entry->offchan = offchan;
        ),
        TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", offchan %d",
                  WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->offchan)
);

/*
 * cfg80211_cac_event - trace event taking a netdev and a radar event value.
 * Records the enum value and prints it as a decimal integer.
 */
TRACE_EVENT(cfg80211_cac_event,
        TP_PROTO(struct net_device *netdev, enum nl80211_radar_event evt),
        TP_ARGS(netdev, evt),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(enum nl80211_radar_event, evt)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->evt = evt;
        ),
        TP_printk(NETDEV_PR_FMT ",  event: %d",
                  NETDEV_PR_ARG, __entry->evt)
);

/*
 * cfg80211_rx_evt - event class for events that take a netdev and an
 * address.  Records both and prints "<netdev>, <address>".
 */
DECLARE_EVENT_CLASS(cfg80211_rx_evt,
        TP_PROTO(struct net_device *netdev, const u8 *addr),
        TP_ARGS(netdev, addr),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(addr)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(addr, addr);
        ),
        TP_printk(NETDEV_PR_FMT ", %pM", NETDEV_PR_ARG, __entry->addr)
);

/*
 * cfg80211_rx_spurious_frame - instance of the cfg80211_rx_evt class; takes
 * a netdev and an address.
 */
DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_spurious_frame,
        TP_PROTO(struct net_device *netdev, const u8 *addr),
        TP_ARGS(netdev, addr)
);

/*
 * cfg80211_rx_unexpected_4addr_frame - instance of the cfg80211_rx_evt
 * class; takes a netdev and an address.
 */
DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_unexpected_4addr_frame,
        TP_PROTO(struct net_device *netdev, const u8 *addr),
        TP_ARGS(netdev, addr)
);

/*
 * cfg80211_ibss_joined - trace event taking a netdev, a BSSID and a
 * channel; records and prints all three.
 */
TRACE_EVENT(cfg80211_ibss_joined,
        TP_PROTO(struct net_device *netdev, const u8 *bssid,
                 struct ieee80211_channel *channel),
        TP_ARGS(netdev, bssid, channel),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(bssid)
                CHAN_ENTRY
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(bssid, bssid);
                CHAN_ASSIGN(channel);
        ),
        TP_printk(NETDEV_PR_FMT ", bssid: %pM, " CHAN_PR_FMT,
                  NETDEV_PR_ARG, __entry->bssid, CHAN_PR_ARG)
);

/*
 * cfg80211_probe_status - record a probe result for a peer address.
 * @netdev: stored through the NETDEV_* helper macros
 * @addr: peer address, copied with MAC_ASSIGN
 * @cookie: 64-bit cookie, printed unsigned
 * @acked: boolean result, rendered through BOOL_TO_STR
 */
TRACE_EVENT(cfg80211_probe_status,
        TP_PROTO(struct net_device *netdev, const u8 *addr, u64 cookie,
                 bool acked),
        TP_ARGS(netdev, addr, cookie, acked),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(addr)
                __field(u64, cookie)
                __field(bool, acked)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(addr, addr);
                __entry->cookie = cookie;
                __entry->acked = acked;
        ),
        TP_printk(NETDEV_PR_FMT " addr:%pM, cookie: %llu, acked: %s",
                  NETDEV_PR_ARG, __entry->addr, __entry->cookie,
                  BOOL_TO_STR(__entry->acked))
);

/*
 * cfg80211_cqm_pktloss_notify - record a lost-packet count for a peer.
 * @netdev: stored through the NETDEV_* helper macros
 * @peer: peer address, copied with MAC_ASSIGN
 * @num_packets: count printed as "num of lost packets"
 */
TRACE_EVENT(cfg80211_cqm_pktloss_notify,
        TP_PROTO(struct net_device *netdev, const u8 *peer, u32 num_packets),
        TP_ARGS(netdev, peer, num_packets),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(u32, num_packets)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->num_packets = num_packets;
        ),
        TP_printk(NETDEV_PR_FMT ", peer: %pM, num of lost packets: %u",
                  NETDEV_PR_ARG, __entry->peer, __entry->num_packets)
);

/*
 * cfg80211_gtk_rekey_notify - instance of the cfg80211_netdev_mac_evt class
 * (declared outside this chunk); takes a net device and a MAC address.
 */
DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_gtk_rekey_notify,
        TP_PROTO(struct net_device *netdev, const u8 *macaddr),
        TP_ARGS(netdev, macaddr)
);

/*
 * cfg80211_pmksa_candidate_notify - record a candidate index, BSSID and
 * pre-authentication flag for a net device.
 * @netdev: stored through the NETDEV_* helper macros
 * @index: integer index, printed with %d
 * @bssid: address copied with MAC_ASSIGN
 * @preauth: boolean, rendered through BOOL_TO_STR
 */
TRACE_EVENT(cfg80211_pmksa_candidate_notify,
        TP_PROTO(struct net_device *netdev, int index, const u8 *bssid,
                 bool preauth),
        TP_ARGS(netdev, index, bssid, preauth),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(int, index)
                MAC_ENTRY(bssid)
                __field(bool, preauth)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->index = index;
                MAC_ASSIGN(bssid, bssid);
                __entry->preauth = preauth;
        ),
        TP_printk(NETDEV_PR_FMT ", index:%d, bssid: %pM, pre auth: %s",
                  NETDEV_PR_ARG, __entry->index, __entry->bssid,
                  BOOL_TO_STR(__entry->preauth))
);

/*
 * cfg80211_report_obss_beacon - record the frequency and signal level
 * reported with a beacon frame.
 * @wiphy: stored through the WIPHY_* helper macros
 * @frame: accepted by the prototype but not stored in the record
 * @len: accepted by the prototype but not stored in the record
 * @freq: integer frequency, formatted through KHZ_F/PR_KHZ
 *        (presumably kHz - confirm against the PR_KHZ definition)
 * @sig_dbm: signal value, printed with %d
 */
TRACE_EVENT(cfg80211_report_obss_beacon,
        TP_PROTO(struct wiphy *wiphy, const u8 *frame, size_t len,
                 int freq, int sig_dbm),
        TP_ARGS(wiphy, frame, len, freq, sig_dbm),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(int, freq)
                __field(int, sig_dbm)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->freq = freq;
                __entry->sig_dbm = sig_dbm;
        ),
        TP_printk(WIPHY_PR_FMT ", freq: "KHZ_F", sig_dbm: %d",
                  WIPHY_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm)
);

/*
 * cfg80211_tdls_oper_request - record a TDLS operation request for a peer.
 * @wiphy: stored through the WIPHY_* helper macros
 * @netdev: stored through the NETDEV_* helper macros
 * @peer: peer address, copied with MAC_ASSIGN
 * @oper: enum nl80211_tdls_operation value, printed as an integer
 * @reason_code: 16-bit reason code, printed unsigned
 */
TRACE_EVENT(cfg80211_tdls_oper_request,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *peer,
                 enum nl80211_tdls_operation oper, u16 reason_code),
        TP_ARGS(wiphy, netdev, peer, oper, reason_code),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __field(enum nl80211_tdls_operation, oper)
                __field(u16, reason_code)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, peer);
                __entry->oper = oper;
                __entry->reason_code = reason_code;
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM, oper: %d, reason_code %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer, __entry->oper,
                  __entry->reason_code)
        );

/*
 * cfg80211_scan_done - record the parameters and outcome of a finished scan.
 * @request: scan request; may be NULL, in which case the request-derived
 *           fields are not written and the "ie" dynamic array has length 0
 * @info: scan completion info; may be NULL, in which case aborted,
 *        scan_start_tsf and tsf_bssid are not written
 *
 * Only the @info-derived fields appear in the printed line; the
 * request-derived fields are stored in the binary record only.
 *
 * NOTE(review): fields skipped because @request or @info is NULL keep
 * whatever the record buffer held before - confirm callers never pass NULL
 * or zero the fields explicitly.
 */
TRACE_EVENT(cfg80211_scan_done,
        TP_PROTO(struct cfg80211_scan_request *request,
                 struct cfg80211_scan_info *info),
        TP_ARGS(request, info),
        TP_STRUCT__entry(
                __field(u32, n_channels)
                __dynamic_array(u8, ie, request ? request->ie_len : 0)
                __array(u32, rates, NUM_NL80211_BANDS)
                __field(u32, wdev_id)
                MAC_ENTRY(wiphy_mac)
                __field(bool, no_cck)
                __field(bool, aborted)
                __field(u64, scan_start_tsf)
                MAC_ENTRY(tsf_bssid)
        ),
        TP_fast_assign(
                if (request) {
                        /* n_channels is declared in the record, so fill it in */
                        __entry->n_channels = request->n_channels;
                        memcpy(__get_dynamic_array(ie), request->ie,
                               request->ie_len);
                        /*
                         * rates is an array of u32: size the copy in bytes
                         * from the destination so every element is copied,
                         * not just the first NUM_NL80211_BANDS bytes.
                         */
                        memcpy(__entry->rates, request->rates,
                               sizeof(__entry->rates));
                        __entry->wdev_id = request->wdev ?
                                        request->wdev->identifier : 0;
                        if (request->wiphy)
                                MAC_ASSIGN(wiphy_mac,
                                           request->wiphy->perm_addr);
                        __entry->no_cck = request->no_cck;
                }
                if (info) {
                        __entry->aborted = info->aborted;
                        __entry->scan_start_tsf = info->scan_start_tsf;
                        MAC_ASSIGN(tsf_bssid, info->tsf_bssid);
                }
        ),
        TP_printk("aborted: %s, scan start (TSF): %llu, tsf_bssid: %pM",
                  BOOL_TO_STR(__entry->aborted),
                  (unsigned long long)__entry->scan_start_tsf,
                  __entry->tsf_bssid)
);

/*
 * wiphy_id_evt - event class that records a wiphy plus a 64-bit id.
 * @wiphy: stored through the WIPHY_* helper macros
 * @id: 64-bit identifier, printed unsigned
 */
DECLARE_EVENT_CLASS(wiphy_id_evt,
        TP_PROTO(struct wiphy *wiphy, u64 id),
        TP_ARGS(wiphy, id),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                __field(u64, id)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                __entry->id = id;
        ),
        TP_printk(WIPHY_PR_FMT ", id: %llu", WIPHY_PR_ARG, __entry->id)
);

/* cfg80211_sched_scan_stopped - instance of the wiphy_id_evt class (wiphy + id). */
DEFINE_EVENT(wiphy_id_evt, cfg80211_sched_scan_stopped,
        TP_PROTO(struct wiphy *wiphy, u64 id),
        TP_ARGS(wiphy, id)
);

/* cfg80211_sched_scan_results - instance of the wiphy_id_evt class (wiphy + id). */
DEFINE_EVENT(wiphy_id_evt, cfg80211_sched_scan_results,
        TP_PROTO(struct wiphy *wiphy, u64 id),
        TP_ARGS(wiphy, id)
);

/*
 * cfg80211_get_bss - record the lookup keys passed to a BSS lookup.
 * @wiphy: stored through the WIPHY_* helper macros
 * @channel: stored through the CHAN_* helper macros
 * @bssid: address copied with MAC_ASSIGN
 * @ssid: buffer of @ssid_len bytes copied into a dynamic array
 * @ssid_len: number of bytes of @ssid to store
 * @bss_type: enum ieee80211_bss_type value, printed as an integer
 * @privacy: enum ieee80211_privacy value, printed as an integer
 *
 * Only the first byte of the stored SSID is printed ("buf: ...").
 *
 * NOTE(review): TP_printk reads element 0 of the ssid array unconditionally;
 * when ssid_len is 0 the array is empty, so that byte is not SSID data -
 * confirm whether callers can pass a zero length.
 */
TRACE_EVENT(cfg80211_get_bss,
        TP_PROTO(struct wiphy *wiphy, struct ieee80211_channel *channel,
                 const u8 *bssid, const u8 *ssid, size_t ssid_len,
                 enum ieee80211_bss_type bss_type,
                 enum ieee80211_privacy privacy),
        TP_ARGS(wiphy, channel, bssid, ssid, ssid_len, bss_type, privacy),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_ENTRY
                MAC_ENTRY(bssid)
                __dynamic_array(u8, ssid, ssid_len)
                __field(enum ieee80211_bss_type, bss_type)
                __field(enum ieee80211_privacy, privacy)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_ASSIGN(channel);
                MAC_ASSIGN(bssid, bssid);
                memcpy(__get_dynamic_array(ssid), ssid, ssid_len);
                __entry->bss_type = bss_type;
                __entry->privacy = privacy;
        ),
        TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT ", %pM"
                  ", buf: %#.2x, bss_type: %d, privacy: %d",
                  WIPHY_PR_ARG, CHAN_PR_ARG, __entry->bssid,
                  ((u8 *)__get_dynamic_array(ssid))[0], __entry->bss_type,
                  __entry->privacy)
);

/*
 * cfg80211_inform_bss_frame - record a management frame reported for a BSS
 * together with its reception metadata.
 * @wiphy: stored through the WIPHY_* helper macros
 * @data: source of the channel, signal, boottime_ns, parent_tsf and
 *        parent_bssid values; dereferenced unconditionally
 * @mgmt: frame buffer; copied into a dynamic array of @len bytes when non-NULL
 * @len: length of @mgmt in bytes (the array is sized by @len even when
 *       @mgmt is NULL)
 *
 * The frame bytes are stored in the record but not included in the printed
 * line.
 */
TRACE_EVENT(cfg80211_inform_bss_frame,
        TP_PROTO(struct wiphy *wiphy, struct cfg80211_inform_bss *data,
                 struct ieee80211_mgmt *mgmt, size_t len),
        TP_ARGS(wiphy, data, mgmt, len),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                CHAN_ENTRY
                __dynamic_array(u8, mgmt, len)
                __field(s32, signal)
                __field(u64, ts_boottime)
                __field(u64, parent_tsf)
                MAC_ENTRY(parent_bssid)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                CHAN_ASSIGN(data->chan);
                if (mgmt)
                        memcpy(__get_dynamic_array(mgmt), mgmt, len);
                __entry->signal = data->signal;
                __entry->ts_boottime = data->boottime_ns;
                __entry->parent_tsf = data->parent_tsf;
                MAC_ASSIGN(parent_bssid, data->parent_bssid);
        ),
        /* ", " after CHAN_PR_FMT keeps the channel text from running into "signal:" */
        TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT
                  ", signal: %d, tsb:%llu, detect_tsf:%llu, tsf_bssid: %pM",
                  WIPHY_PR_ARG, CHAN_PR_ARG,
                  __entry->signal, (unsigned long long)__entry->ts_boottime,
                  (unsigned long long)__entry->parent_tsf,
                  __entry->parent_bssid)
);

/*
 * cfg80211_bss_evt - event class that records the BSSID and channel of a
 * struct cfg80211_bss.
 * @pub: BSS whose bssid and channel members are stored; dereferenced
 *       unconditionally
 */
DECLARE_EVENT_CLASS(cfg80211_bss_evt,
        TP_PROTO(struct cfg80211_bss *pub),
        TP_ARGS(pub),
        TP_STRUCT__entry(
                MAC_ENTRY(bssid)
                CHAN_ENTRY
        ),
        TP_fast_assign(
                MAC_ASSIGN(bssid, pub->bssid);
                CHAN_ASSIGN(pub->channel);
        ),
        TP_printk("%pM, " CHAN_PR_FMT, __entry->bssid, CHAN_PR_ARG)
);

/* cfg80211_return_bss - instance of the cfg80211_bss_evt class (BSSID + channel of @pub). */
DEFINE_EVENT(cfg80211_bss_evt, cfg80211_return_bss,
        TP_PROTO(struct cfg80211_bss *pub),
        TP_ARGS(pub)
);

/*
 * cfg80211_return_uint - record an unsigned int return value.
 * @ret: value to record
 *
 * Printed with %u so values above INT_MAX are not shown as negative.
 */
TRACE_EVENT(cfg80211_return_uint,
        TP_PROTO(unsigned int ret),
        TP_ARGS(ret),
        TP_STRUCT__entry(
                __field(unsigned int, ret)
        ),
        TP_fast_assign(
                __entry->ret = ret;
        ),
        TP_printk("ret: %u", __entry->ret)
);

/*
 * cfg80211_return_u32 - record a u32 return value.
 * @ret: value to record, printed unsigned
 */
TRACE_EVENT(cfg80211_return_u32,
        TP_PROTO(u32 ret),
        TP_ARGS(ret),
        TP_STRUCT__entry(
                __field(u32, ret)
        ),
        TP_fast_assign(
                __entry->ret = ret;
        ),
        TP_printk("ret: %u", __entry->ret)
);

/*
 * cfg80211_report_wowlan_wakeup - record the reported wakeup reason.
 * @wiphy: stored through the WIPHY_* helper macros
 * @wdev: stored through the WDEV_* helper macros
 * @wakeup: wakeup description; may be NULL, in which case non_wireless is
 *          set, every reason flag is false, the numeric fields are 0 and
 *          the packet array has length 0
 *
 * Only the wiphy and wdev are printed; the remaining fields are stored in
 * the binary record only.
 */
TRACE_EVENT(cfg80211_report_wowlan_wakeup,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 struct cfg80211_wowlan_wakeup *wakeup),
        TP_ARGS(wiphy, wdev, wakeup),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(bool, non_wireless)
                __field(bool, disconnect)
                __field(bool, magic_pkt)
                __field(bool, gtk_rekey_failure)
                __field(bool, eap_identity_req)
                __field(bool, four_way_handshake)
                __field(bool, rfkill_release)
                __field(s32, pattern_idx)
                __field(u32, packet_len)
                __dynamic_array(u8, packet,
                                wakeup ? wakeup->packet_present_len : 0)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->non_wireless = !wakeup;
                if (wakeup) {
                        __entry->disconnect = wakeup->disconnect;
                        __entry->magic_pkt = wakeup->magic_pkt;
                        __entry->gtk_rekey_failure = wakeup->gtk_rekey_failure;
                        __entry->eap_identity_req = wakeup->eap_identity_req;
                        __entry->four_way_handshake = wakeup->four_way_handshake;
                        __entry->rfkill_release = wakeup->rfkill_release;
                        __entry->pattern_idx = wakeup->pattern_idx;
                        __entry->packet_len = wakeup->packet_len;
                        /* copy only the bytes that are actually present */
                        if (wakeup->packet && wakeup->packet_present_len)
                                memcpy(__get_dynamic_array(packet),
                                       wakeup->packet,
                                       wakeup->packet_present_len);
                } else {
                        __entry->disconnect = false;
                        __entry->magic_pkt = false;
                        __entry->gtk_rekey_failure = false;
                        __entry->eap_identity_req = false;
                        __entry->four_way_handshake = false;
                        __entry->rfkill_release = false;
                        /* integer fields take integer defaults, not "false" */
                        __entry->pattern_idx = 0;
                        __entry->packet_len = 0;
                }
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
);

/*
 * cfg80211_ft_event - record the IEs and target AP carried by an FT event.
 * @wiphy: stored through the WIPHY_* helper macros
 * @netdev: stored through the NETDEV_* helper macros
 * @ft_event: source of ies/ies_len, target_ap and ric_ies/ric_ies_len;
 *            dereferenced unconditionally
 *
 * Each IE buffer is copied only when its pointer is non-NULL; the dynamic
 * arrays are sized from the length members regardless. Only target_ap is
 * printed.
 */
TRACE_EVENT(cfg80211_ft_event,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_ft_event_params *ft_event),
        TP_ARGS(wiphy, netdev, ft_event),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __dynamic_array(u8, ies, ft_event->ies_len)
                MAC_ENTRY(target_ap)
                __dynamic_array(u8, ric_ies, ft_event->ric_ies_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                if (ft_event->ies)
                        memcpy(__get_dynamic_array(ies), ft_event->ies,
                               ft_event->ies_len);
                MAC_ASSIGN(target_ap, ft_event->target_ap);
                if (ft_event->ric_ies)
                        memcpy(__get_dynamic_array(ric_ies), ft_event->ric_ies,
                               ft_event->ric_ies_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", target_ap: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->target_ap)
);

/*
 * cfg80211_stop_iface - record the wiphy and wireless device passed in.
 * @wiphy: stored through the WIPHY_* helper macros
 * @wdev: stored through the WDEV_* helper macros
 */
TRACE_EVENT(cfg80211_stop_iface,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
        TP_ARGS(wiphy, wdev),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
                  WIPHY_PR_ARG, WDEV_PR_ARG)
);

/*
 * cfg80211_pmsr_report - record a measurement report for a peer address.
 * @wiphy: stored through the WIPHY_* helper macros
 * @wdev: stored through the WDEV_* helper macros
 * @cookie: 64-bit cookie of the request
 * @addr: peer address, copied with MAC_ASSIGN
 *
 * The cookie is unsigned, so it is printed with %llu to match both the
 * field type and the unsigned long long cast.
 */
TRACE_EVENT(cfg80211_pmsr_report,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
                 u64 cookie, const u8 *addr),
        TP_ARGS(wiphy, wdev, cookie, addr),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
                MAC_ENTRY(addr)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
                MAC_ASSIGN(addr, addr);
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie:%llu, %pM",
                  WIPHY_PR_ARG, WDEV_PR_ARG,
                  (unsigned long long)__entry->cookie,
                  __entry->addr)
);

/*
 * cfg80211_pmsr_complete - record completion of a measurement request.
 * @wiphy: stored through the WIPHY_* helper macros
 * @wdev: stored through the WDEV_* helper macros
 * @cookie: 64-bit cookie of the request
 *
 * The cookie is unsigned, so it is printed with %llu to match both the
 * field type and the unsigned long long cast.
 */
TRACE_EVENT(cfg80211_pmsr_complete,
        TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie),
        TP_ARGS(wiphy, wdev, cookie),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                WDEV_ENTRY
                __field(u64, cookie)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                WDEV_ASSIGN;
                __entry->cookie = cookie;
        ),
        TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie:%llu",
                  WIPHY_PR_ARG, WDEV_PR_ARG,
                  (unsigned long long)__entry->cookie)
);

/*
 * cfg80211_update_owe_info_event - record the peer, IEs and link details
 * of an OWE info update.
 * @wiphy: stored through the WIPHY_* helper macros
 * @netdev: stored through the NETDEV_* helper macros
 * @owe_info: source of peer, ie/ie_len, assoc_link_id and peer_mld_addr;
 *            dereferenced unconditionally
 *
 * The IE buffer is copied without a NULL check on owe_info->ie. The IE
 * bytes are stored in the record but not printed.
 */
TRACE_EVENT(cfg80211_update_owe_info_event,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct cfg80211_update_owe_info *owe_info),
        TP_ARGS(wiphy, netdev, owe_info),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                MAC_ENTRY(peer)
                __dynamic_array(u8, ie, owe_info->ie_len)
                __field(int, assoc_link_id)
                MAC_ENTRY(peer_mld_addr)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;
                MAC_ASSIGN(peer, owe_info->peer);
                memcpy(__get_dynamic_array(ie), owe_info->ie,
                       owe_info->ie_len);
                __entry->assoc_link_id = owe_info->assoc_link_id;
                MAC_ASSIGN(peer_mld_addr, owe_info->peer_mld_addr);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: %pM,"
                  " assoc_link_id: %d, peer_mld_addr: %pM",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->peer,
                  __entry->assoc_link_id, __entry->peer_mld_addr)
);

/*
 * cfg80211_bss_color_notify - record a BSS color notification.
 * @netdev: stored through the NETDEV_* helper macros
 * @cmd: enum nl80211_commands value, stored as u32 and printed in hex
 * @count: 8-bit counter, printed unsigned
 * @color_bitmap: 64-bit bitmap, printed in hex
 */
TRACE_EVENT(cfg80211_bss_color_notify,
        TP_PROTO(struct net_device *netdev,
                 enum nl80211_commands cmd,
                 u8 count, u64 color_bitmap),
        TP_ARGS(netdev, cmd, count, color_bitmap),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(u32, cmd)
                __field(u8, count)
                __field(u64, color_bitmap)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->cmd = cmd;
                __entry->count = count;
                __entry->color_bitmap = color_bitmap;
        ),
        TP_printk(NETDEV_PR_FMT ", cmd: %x, count: %u, bitmap: %llx",
                  NETDEV_PR_ARG, __entry->cmd, __entry->count,
                  __entry->color_bitmap)
);

/*
 * cfg80211_assoc_comeback - record an association comeback timeout for an AP.
 * @wdev: stored through the WDEV_* helper macros
 * @ap_addr: AP address, copied with MAC_ASSIGN
 * @timeout: timeout value; the printed line labels it in TUs
 */
TRACE_EVENT(cfg80211_assoc_comeback,
        TP_PROTO(struct wireless_dev *wdev, const u8 *ap_addr, u32 timeout),
        TP_ARGS(wdev, ap_addr, timeout),
        TP_STRUCT__entry(
                WDEV_ENTRY
                MAC_ENTRY(ap_addr)
                __field(u32, timeout)
        ),
        TP_fast_assign(
                WDEV_ASSIGN;
                MAC_ASSIGN(ap_addr, ap_addr);
                __entry->timeout = timeout;
        ),
        TP_printk(WDEV_PR_FMT ", %pM, timeout: %u TUs",
                  WDEV_PR_ARG, __entry->ap_addr, __entry->timeout)
);

/*
 * link_station_add_mod - event class recording the parameters of a link
 * station add/modify request.
 * @wiphy: stored through the WIPHY_* helper macros
 * @netdev: stored through the NETDEV_* helper macros
 * @params: link station parameters; dereferenced unconditionally
 *
 * Fixed-size members (MAC addresses, HT/VHT/HE 6 GHz capabilities) are
 * copied when the corresponding pointer is set and zero-filled otherwise.
 * Variable-length members (supported rates, HE/EHT capabilities) are copied
 * only when both pointer and length are non-zero. Only the two MAC
 * addresses and the link id are printed.
 */
DECLARE_EVENT_CLASS(link_station_add_mod,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct link_station_parameters *params),
        TP_ARGS(wiphy, netdev, params),
        TP_STRUCT__entry(
                WIPHY_ENTRY
                NETDEV_ENTRY
                __array(u8, mld_mac, 6)
                __array(u8, link_mac, 6)
                __field(u32, link_id)
                __dynamic_array(u8, supported_rates,
                                params->supported_rates_len)
                __array(u8, ht_capa, (int)sizeof(struct ieee80211_ht_cap))
                __array(u8, vht_capa, (int)sizeof(struct ieee80211_vht_cap))
                __field(u8, opmode_notif)
                __field(bool, opmode_notif_used)
                __dynamic_array(u8, he_capa, params->he_capa_len)
                __array(u8, he_6ghz_capa, (int)sizeof(struct ieee80211_he_6ghz_capa))
                __dynamic_array(u8, eht_capa, params->eht_capa_len)
        ),
        TP_fast_assign(
                WIPHY_ASSIGN;
                NETDEV_ASSIGN;

                /* plain scalar members */
                __entry->link_id = params->link_id;
                __entry->opmode_notif = params->opmode_notif;
                __entry->opmode_notif_used = params->opmode_notif_used;

                /* fixed-size members: copy if supplied, otherwise zero-fill */
                if (params->mld_mac)
                        memcpy(__entry->mld_mac, params->mld_mac,
                               sizeof(__entry->mld_mac));
                else
                        memset(__entry->mld_mac, 0, sizeof(__entry->mld_mac));

                if (params->link_mac)
                        memcpy(__entry->link_mac, params->link_mac,
                               sizeof(__entry->link_mac));
                else
                        memset(__entry->link_mac, 0, sizeof(__entry->link_mac));

                if (params->ht_capa)
                        memcpy(__entry->ht_capa, params->ht_capa,
                               sizeof(__entry->ht_capa));
                else
                        memset(__entry->ht_capa, 0, sizeof(__entry->ht_capa));

                if (params->vht_capa)
                        memcpy(__entry->vht_capa, params->vht_capa,
                               sizeof(__entry->vht_capa));
                else
                        memset(__entry->vht_capa, 0, sizeof(__entry->vht_capa));

                if (params->he_6ghz_capa)
                        memcpy(__entry->he_6ghz_capa, params->he_6ghz_capa,
                               sizeof(__entry->he_6ghz_capa));
                else
                        memset(__entry->he_6ghz_capa, 0,
                               sizeof(__entry->he_6ghz_capa));

                /* variable-length members: copy only when present and non-empty */
                if (params->supported_rates_len && params->supported_rates)
                        memcpy(__get_dynamic_array(supported_rates),
                               params->supported_rates,
                               params->supported_rates_len);
                if (params->he_capa_len && params->he_capa)
                        memcpy(__get_dynamic_array(he_capa), params->he_capa,
                               params->he_capa_len);
                if (params->eht_capa_len && params->eht_capa)
                        memcpy(__get_dynamic_array(eht_capa), params->eht_capa,
                               params->eht_capa_len);
        ),
        TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: %pM"
                  ", link mac: %pM, link id: %u",
                  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->mld_mac,
                  __entry->link_mac, __entry->link_id)
);

/* rdev_add_link_station - instance of the link_station_add_mod class. */
DEFINE_EVENT(link_station_add_mod, rdev_add_link_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct link_station_parameters *params),
        TP_ARGS(wiphy, netdev, params)
);

/* rdev_mod_link_station - instance of the link_station_add_mod class. */
DEFINE_EVENT(link_station_add_mod, rdev_mod_link_station,
        TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
                 struct link_station_parameters *params),
        TP_ARGS(wiphy, netdev, params)
);

/*
 * cfg80211_links_removed - record a link mask for a net device.
 * @netdev: stored through the NETDEV_* helper macros
 * @link_mask: 16-bit mask, printed as an unsigned decimal
 */
TRACE_EVENT(cfg80211_links_removed,
        TP_PROTO(struct net_device *netdev, u16 link_mask),
        TP_ARGS(netdev, link_mask),
        TP_STRUCT__entry(
                NETDEV_ENTRY
                __field(u16, link_mask)
        ),
        TP_fast_assign(
                NETDEV_ASSIGN;
                __entry->link_mask = link_mask;
        ),
        TP_printk(NETDEV_PR_FMT ", link_mask:%u", NETDEV_PR_ARG,
                  __entry->link_mask)
);

#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>














































































































































































































































































    2 
























    2 





    2 






    2 



    2 




    2 



    2 


    2 
    2 
    2 





    2 


    2 


    2 

    2 





    1 



















    1 
    2 

    2 



    3 
    3 







    2 




    1 



    2 









































    1 




    1 







































    1 
    2 


















    2 













    2 



    2 








    2 





    2 



    2 


    2 
    2 































    2 












    2 

    2 













    2 















    1 

    1 
    1 



    1 




    1 











    2 









    2 















    2 


















    2 







    2 





    2 








    2 



































    1 






    1 








    1 







    1 





    1 







    1 










    1 

















    1 


    1 













    2 










    2 

    2 
















    2 




























































































































































































































































































































    2 
























    2 





    2 


























    1 

































    1 















    1 


    1 










    2 


    2 

    2 

























    2 

    1 





    2 





    2 






    2 

    2 





    2 














    1 




    1 





    1 
    1 


    1 

    1 














































































































































































































































































































































































































































    2 


















    2 












    1 





    1 






    2 




















    2 










    2 






    2 





    1 



















    1 






















    1 
    1 





    2 








    2 

    2 




    2 




    2 








    2 

    2 

























































































































































    1 


















    1 


    1 








































































































































































































































































































    1 












    1 




















    1 







    1 



















    2 




    2 










    1 







    1 



    1 
    1 


















































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NETLINK      Kernel-user communication protocol.
 *
 *                 Authors:        Alan Cox <alan@lxorguk.ukuu.org.uk>
 *                                 Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *                                 Patrick McHardy <kaber@trash.net>
 *
 * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith
 *                               added netlink_proto_exit
 * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
 *                                  use nlk_sk, as sk->protinfo is on a diet 8)
 * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
 *                                  - inc module use count of module that owns
 *                                    the kernel socket in case userspace opens
 *                                    socket of same protocol
 *                                  - remove all module support, since netlink is
 *                                    mandatory if CONFIG_NET=y these days
 */

#include <linux/module.h>

#include <linux/bpf.h>
#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/filter.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/security.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/random.h>
#include <linux/bitops.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/audit.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/if_arp.h>
#include <linux/rhashtable.h>
#include <asm/cacheflush.h>
#include <linux/hash.h>
#include <linux/net_namespace.h>
#include <linux/nospec.h>
#include <linux/btf_ids.h>

#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/sock.h>
#include <net/scm.h>
#include <net/netlink.h>
#define CREATE_TRACE_POINTS
#include <trace/events/netlink.h>

#include "af_netlink.h"
#include "genetlink.h"

/* Per-protocol table of multicast listener bitmaps.
 *
 * @rcu:   handle used to free the structure after an RCU grace period
 *         (netlink_release() releases it with kfree_rcu()).
 * @masks: one bit per multicast group; netlink_update_listeners() stores
 *         in each word the OR of the group words of every socket on the
 *         protocol's mc_list.
 */
struct listeners {
        struct rcu_head                rcu;
        unsigned long                masks[];
};

/* Bit numbers within netlink_sock->state, manipulated with the atomic
 * test_bit()/test_and_set_bit()/clear_bit() helpers.
 */
#define NETLINK_S_CONGESTED                0x0

/* Return non-zero when @sk has the KERNEL_SOCKET flag bit set. */
static inline int netlink_is_kernel(struct sock *sk)
{
        return nlk_test_bit(KERNEL_SOCKET, sk);
}

/* Array of per-protocol netlink tables, indexed by protocol number. */
struct netlink_table *nl_table __read_mostly;
EXPORT_SYMBOL_GPL(nl_table);

/* Writers sleeping in netlink_table_grab() wait here until nl_table_users
 * drops to zero.
 */
static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);

/* One lockdep class per protocol for netlink_sock->nl_cb_mutex. */
static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS];

/* Lockdep class names for netlink_sock->nl_cb_mutex, indexed by protocol
 * number in __netlink_create(). Protocol numbers without a symbolic name
 * are labelled with their numeric value; the final entry is labelled
 * MAX_LINKS.
 */
static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = {
        "nlk_cb_mutex-ROUTE",
        "nlk_cb_mutex-1",
        "nlk_cb_mutex-USERSOCK",
        "nlk_cb_mutex-FIREWALL",
        "nlk_cb_mutex-SOCK_DIAG",
        "nlk_cb_mutex-NFLOG",
        "nlk_cb_mutex-XFRM",
        "nlk_cb_mutex-SELINUX",
        "nlk_cb_mutex-ISCSI",
        "nlk_cb_mutex-AUDIT",
        "nlk_cb_mutex-FIB_LOOKUP",
        "nlk_cb_mutex-CONNECTOR",
        "nlk_cb_mutex-NETFILTER",
        "nlk_cb_mutex-IP6_FW",
        "nlk_cb_mutex-DNRTMSG",
        "nlk_cb_mutex-KOBJECT_UEVENT",
        "nlk_cb_mutex-GENERIC",
        "nlk_cb_mutex-17",
        "nlk_cb_mutex-SCSITRANSPORT",
        "nlk_cb_mutex-ECRYPTFS",
        "nlk_cb_mutex-RDMA",
        "nlk_cb_mutex-CRYPTO",
        "nlk_cb_mutex-SMC",
        "nlk_cb_mutex-23",
        "nlk_cb_mutex-24",
        "nlk_cb_mutex-25",
        "nlk_cb_mutex-26",
        "nlk_cb_mutex-27",
        "nlk_cb_mutex-28",
        "nlk_cb_mutex-29",
        "nlk_cb_mutex-30",
        "nlk_cb_mutex-31",
        "nlk_cb_mutex-MAX_LINKS"
};

static int netlink_dump(struct sock *sk, bool lock_taken);

/* nl_table locking explained:
 * Lookup and traversal are protected with an RCU read-side lock. Insertion
 * and removal are protected with a per-bucket lock while using RCU list
 * modification primitives and may run in parallel to RCU protected lookups.
 * Destruction of the Netlink socket may only occur *after* nl_table_lock has
 * been acquired, either during or after the socket has been removed from
 * the list, and after an RCU grace period.
 *
 * nl_table_users counts the holders registered through netlink_lock_table();
 * netlink_table_grab() waits for it to reach zero.
 */
DEFINE_RWLOCK(nl_table_lock);
EXPORT_SYMBOL_GPL(nl_table_lock);
static atomic_t nl_table_users = ATOMIC_INIT(0);

/* Dereference an RCU-managed pointer whose updates are serialised by
 * nl_table_lock; lockdep checks that the lock is held.
 *
 * The expansion is a plain expression with no trailing semicolon, so the
 * caller terminates the statement and the macro can also be used inside
 * larger expressions.
 */
#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock))

/* Notifier chain invoked from netlink_release() with NETLINK_URELEASE when a
 * bound socket goes away.
 */
static BLOCKING_NOTIFIER_HEAD(netlink_chain);


/* Forward declaration; the lookup/insert/remove helpers below pass these
 * parameters to the rhashtable API.
 */
static const struct rhashtable_params netlink_rhashtable_params;

/* Exported out-of-line wrapper that fires the netlink_extack tracepoint
 * with @msg.
 */
void do_trace_netlink_extack(const char *msg)
{
        trace_netlink_extack(msg);
}
EXPORT_SYMBOL(do_trace_netlink_extack);

/* Translate a 1-based multicast group number into its bit in a 32-bit mask.
 *
 * @group: group number; 1 maps to bit 0 and 32 maps to bit 31.
 *
 * Returns 0 for group 0 and for any group above 32, which cannot be
 * represented in the mask.
 */
static inline u32 netlink_group_mask(u32 group)
{
        if (group == 0 || group > 32)
                return 0;

        /* Shift an unsigned one: with a signed "1", group 32 would shift
         * into the sign bit of int, which ISO C leaves undefined.
         */
        return 1U << (group - 1);
}

/* Allocate a fresh skb and copy @skb's data plus the netlink control-block
 * fields portid, dst_group and creds into it.
 *
 * Returns the new skb, or NULL when the allocation fails.
 */
static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb,
                                           gfp_t gfp_mask)
{
        const unsigned int nbytes = skb->len;
        struct sk_buff *copy = alloc_skb(nbytes, gfp_mask);

        if (!copy)
                return NULL;

        NETLINK_CB(copy).portid = NETLINK_CB(skb).portid;
        NETLINK_CB(copy).dst_group = NETLINK_CB(skb).dst_group;
        NETLINK_CB(copy).creds = NETLINK_CB(skb).creds;
        skb_put_data(copy, skb->data, nbytes);

        return copy;
}

/* pernet id used with net_generic() to find the per-namespace tap state. */
static unsigned int netlink_tap_net_id;

/* Per network namespace tap state.
 *
 * @netlink_tap_all:  list of registered taps; walked under RCU on delivery.
 * @netlink_tap_lock: serialises additions to and removals from the list.
 */
struct netlink_tap_net {
        struct list_head netlink_tap_all;
        struct mutex netlink_tap_lock;
};

/* Register tap @nt in the namespace of its device and take a reference on
 * the owning module.
 *
 * Returns 0 on success, or -EINVAL when the device is not of type
 * ARPHRD_NETLINK.
 */
int netlink_add_tap(struct netlink_tap *nt)
{
        struct netlink_tap_net *tap_net;

        tap_net = net_generic(dev_net(nt->dev), netlink_tap_net_id);

        if (unlikely(nt->dev->type != ARPHRD_NETLINK))
                return -EINVAL;

        mutex_lock(&tap_net->netlink_tap_lock);
        list_add_rcu(&nt->list, &tap_net->netlink_tap_all);
        mutex_unlock(&tap_net->netlink_tap_lock);

        __module_get(nt->module);
        return 0;
}
EXPORT_SYMBOL_GPL(netlink_add_tap);

/* Unlink tap @nt from its namespace's tap list and drop the module
 * reference taken at registration.
 *
 * Returns 0 when the tap was on the list, otherwise logs a warning and
 * returns -ENODEV.
 */
static int __netlink_remove_tap(struct netlink_tap *nt)
{
        struct netlink_tap_net *tap_net;
        struct netlink_tap *cur;
        bool found = false;

        tap_net = net_generic(dev_net(nt->dev), netlink_tap_net_id);

        mutex_lock(&tap_net->netlink_tap_lock);
        list_for_each_entry(cur, &tap_net->netlink_tap_all, list) {
                if (cur != nt)
                        continue;

                list_del_rcu(&nt->list);
                found = true;
                break;
        }
        if (!found)
                pr_warn("__netlink_remove_tap: %p not found\n", nt);
        mutex_unlock(&tap_net->netlink_tap_lock);

        if (!found)
                return -ENODEV;

        module_put(nt->module);
        return 0;
}

/* Unregister tap @nt, then call synchronize_net() before returning.
 *
 * Returns the result of __netlink_remove_tap(): 0 or -ENODEV.
 */
int netlink_remove_tap(struct netlink_tap *nt)
{
        int ret = __netlink_remove_tap(nt);

        synchronize_net();
        return ret;
}
EXPORT_SYMBOL_GPL(netlink_remove_tap);

/* pernet init hook: set up the empty tap list and its mutex for @net.
 * Always returns 0.
 */
static __net_init int netlink_tap_init_net(struct net *net)
{
        struct netlink_tap_net *tap_net;

        tap_net = net_generic(net, netlink_tap_net_id);
        INIT_LIST_HEAD(&tap_net->netlink_tap_all);
        mutex_init(&tap_net->netlink_tap_lock);

        return 0;
}

/* pernet registration for the tap state: asks for a struct netlink_tap_net
 * per namespace, initialised by netlink_tap_init_net() and located through
 * netlink_tap_net_id.
 */
static struct pernet_operations netlink_tap_net_ops = {
        .init = netlink_tap_init_net,
        .id   = &netlink_tap_net_id,
        .size = sizeof(struct netlink_tap_net),
};

/* Decide whether @skb may be mirrored to taps.
 *
 * Only an explicit allow-list of socket protocols passes; every other
 * protocol is rejected.
 */
static bool netlink_filter_tap(const struct sk_buff *skb)
{
        switch (skb->sk->sk_protocol) {
        case NETLINK_ROUTE:
        case NETLINK_USERSOCK:
        case NETLINK_SOCK_DIAG:
        case NETLINK_NFLOG:
        case NETLINK_XFRM:
        case NETLINK_FIB_LOOKUP:
        case NETLINK_NETFILTER:
        case NETLINK_GENERIC:
                return true;
        default:
                return false;
        }
}

/* Queue a copy of @skb for transmission on tap device @dev.
 *
 * Nothing is sent (and 0 is returned) when @dev lives in a different network
 * namespace than the sending socket. A vmalloc-backed skb is copied into a
 * fresh skb; any other skb is cloned.
 *
 * Returns 0 on success, -ENOMEM when the copy cannot be allocated, or the
 * dev_queue_xmit() result (positive codes mapped through net_xmit_errno()).
 */
static int __netlink_deliver_tap_skb(struct sk_buff *skb,
                                     struct net_device *dev)
{
        struct sock *sk = skb->sk;
        struct sk_buff *copy;
        int err;

        if (!net_eq(dev_net(dev), sock_net(sk)))
                return 0;

        /* Keep the device pinned for the duration of the transmit. */
        dev_hold(dev);

        copy = is_vmalloc_addr(skb->head) ?
                netlink_to_full_skb(skb, GFP_ATOMIC) :
                skb_clone(skb, GFP_ATOMIC);
        if (!copy) {
                err = -ENOMEM;
                goto put_dev;
        }

        copy->dev = dev;
        copy->protocol = htons((u16) sk->sk_protocol);
        if (netlink_is_kernel(sk))
                copy->pkt_type = PACKET_KERNEL;
        else
                copy->pkt_type = PACKET_USER;
        skb_reset_network_header(copy);

        err = dev_queue_xmit(copy);
        if (unlikely(err > 0))
                err = net_xmit_errno(err);

put_dev:
        dev_put(dev);
        return err;
}

/* Mirror @skb to every tap registered in @nn, stopping at the first tap
 * whose delivery fails. Does nothing for protocols rejected by
 * netlink_filter_tap(). The list is walked with the RCU iterator.
 */
static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn)
{
        struct netlink_tap *tap;

        if (!netlink_filter_tap(skb))
                return;

        list_for_each_entry_rcu(tap, &nn->netlink_tap_all, list) {
                if (unlikely(__netlink_deliver_tap_skb(skb, tap->dev)))
                        break;
        }
}

/* Mirror @skb to the taps of namespace @net, if any are registered.
 * The tap list is inspected and walked inside an RCU read-side section.
 */
static void netlink_deliver_tap(struct net *net, struct sk_buff *skb)
{
        struct netlink_tap_net *tap_net;

        tap_net = net_generic(net, netlink_tap_net_id);

        rcu_read_lock();
        /* Taps are rare, so the empty list is the expected case. */
        if (unlikely(!list_empty(&tap_net->netlink_tap_all)))
                __netlink_deliver_tap(skb, tap_net);
        rcu_read_unlock();
}

/* Mirror @skb to the taps of @dst's namespace unless both endpoints are
 * kernel sockets.
 */
static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src,
                                       struct sk_buff *skb)
{
        if (netlink_is_kernel(dst) && netlink_is_kernel(src))
                return;

        netlink_deliver_tap(sock_net(dst), skb);
}

/* Account for a message dropped on @sk.
 *
 * Unless the socket has RECV_NO_ENOBUFS set, the first drop that marks the
 * socket congested also sets sk_err to ENOBUFS and reports the error. The
 * drop counter is incremented in every case.
 */
static void netlink_overrun(struct sock *sk)
{
        if (!nlk_test_bit(RECV_NO_ENOBUFS, sk) &&
            !test_and_set_bit(NETLINK_S_CONGESTED, &nlk_sk(sk)->state)) {
                WRITE_ONCE(sk->sk_err, ENOBUFS);
                sk_error_report(sk);
        }

        atomic_inc(&sk->sk_drops);
}

/* Wake senders waiting on @sk after the receive queue has been consumed.
 *
 * The congested bit is cleared only once the receive queue is empty;
 * sleepers on nlk->wait are woken only when the socket is not (or no
 * longer) marked congested.
 */
static void netlink_rcv_wake(struct sock *sk)
{
        struct netlink_sock *nlk = nlk_sk(sk);

        if (skb_queue_empty_lockless(&sk->sk_receive_queue))
                clear_bit(NETLINK_S_CONGESTED, &nlk->state);
        if (!test_bit(NETLINK_S_CONGESTED, &nlk->state))
                wake_up_interruptible(&nlk->wait);
}

/* skb destructor installed by netlink_skb_set_owner_r().
 *
 * A vmalloc-backed head is freed here with vfree_atomic(), either when the
 * skb is not cloned or when this was the last data reference, and
 * skb->head is then cleared. Finally, if the skb is still attached to a
 * socket, the receive-memory charge is released through sock_rfree().
 */
static void netlink_skb_destructor(struct sk_buff *skb)
{
        if (is_vmalloc_addr(skb->head)) {
                /* Only the holder of the last data reference frees it. */
                if (!skb->cloned ||
                    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
                        vfree_atomic(skb->head);

                /* NOTE(review): presumably keeps the generic skb free path
                 * from touching the vmalloc buffer again - confirm.
                 */
                skb->head = NULL;
        }
        if (skb->sk != NULL)
                sock_rfree(skb);
}

/* Attach @skb to receiving socket @sk: install the netlink destructor and
 * charge the skb's truesize to the socket's receive memory accounting.
 * Warns if the skb already has an owner.
 */
static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
        WARN_ON(skb->sk != NULL);
        skb->sk = sk;
        skb->destructor = netlink_skb_destructor;
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
        sk_mem_charge(sk, skb->truesize);
}

static void netlink_sock_destruct(struct sock *sk)
{
        struct netlink_sock *nlk = nlk_sk(sk);

        if (nlk->cb_running) {
                if (nlk->cb.done)
                        nlk->cb.done(&nlk->cb);
                module_put(nlk->cb.module);
                kfree_skb(nlk->cb.skb);
        }

        skb_queue_purge(&sk->sk_receive_queue);

        if (!sock_flag(sk, SOCK_DEAD)) {
                printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
                return;
        }

        WARN_ON(atomic_read(&sk->sk_rmem_alloc));
        WARN_ON(refcount_read(&sk->sk_wmem_alloc));
        WARN_ON(nlk_sk(sk)->groups);
}

/* Work item body queued by deferred_put_nlk_sk(): frees the socket that
 * embeds @work.
 */
static void netlink_sock_destruct_work(struct work_struct *work)
{
        struct netlink_sock *nlk;

        nlk = container_of(work, struct netlink_sock, work);
        sk_free(&nlk->sk);
}

/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
 * SMP. Look, when several writers sleep and reader wakes them up, all but one
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines.
 */

/* Acquire exclusive access to the netlink tables.
 *
 * Takes nl_table_lock for writing with interrupts disabled and, if any
 * netlink_lock_table() users are active, sleeps (uninterruptibly, as an
 * exclusive waiter) until nl_table_users reaches zero. Returns with the
 * write lock held; release it with netlink_table_ungrab(). May sleep.
 */
void netlink_table_grab(void)
        __acquires(nl_table_lock)
{
        might_sleep();

        write_lock_irq(&nl_table_lock);

        if (atomic_read(&nl_table_users)) {
                DECLARE_WAITQUEUE(wait, current);

                add_wait_queue_exclusive(&nl_table_wait, &wait);
                for (;;) {
                        /* Set the task state before re-checking the
                         * counter so a wake-up between the check and
                         * schedule() is not lost.
                         */
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        if (atomic_read(&nl_table_users) == 0)
                                break;
                        /* Drop the lock while asleep; retake it before
                         * looking at the counter again.
                         */
                        write_unlock_irq(&nl_table_lock);
                        schedule();
                        write_lock_irq(&nl_table_lock);
                }

                __set_current_state(TASK_RUNNING);
                remove_wait_queue(&nl_table_wait, &wait);
        }
}

/* Release the write lock taken by netlink_table_grab() and wake a waiter
 * sleeping on nl_table_wait.
 */
void netlink_table_ungrab(void)
        __releases(nl_table_lock)
{
        write_unlock_irq(&nl_table_lock);
        wake_up(&nl_table_wait);
}

/* Register the caller as a user of the netlink tables.
 *
 * Increments nl_table_users while briefly holding nl_table_lock for
 * reading; netlink_table_grab() waits until the count is back to zero.
 * Pair with netlink_unlock_table().
 */
static inline void
netlink_lock_table(void)
{
        unsigned long flags;

        /* read_lock() synchronizes us to netlink_table_grab */

        read_lock_irqsave(&nl_table_lock, flags);
        atomic_inc(&nl_table_users);
        read_unlock_irqrestore(&nl_table_lock, flags);
}

/* Drop a table-user reference taken by netlink_lock_table(); the last user
 * to leave wakes a writer waiting in netlink_table_grab().
 */
static inline void
netlink_unlock_table(void)
{
        if (atomic_dec_and_test(&nl_table_users))
                wake_up(&nl_table_wait);
}

/* Key used for socket hash table lookups and inserts: the network
 * namespace together with the port id.
 */
struct netlink_compare_arg
{
        possible_net_t pnet;
        u32 portid;
};

/* Doing sizeof directly may yield 4 extra bytes on 64-bit. */
/* Length of the meaningful part of the key: everything up to and including
 * portid, without any trailing padding.
 */
#define netlink_compare_arg_len \
        (offsetof(struct netlink_compare_arg, portid) + sizeof(u32))

/* Hash table comparison callback.
 *
 * @arg: compare argument whose key is a struct netlink_compare_arg
 * @ptr: candidate struct netlink_sock
 *
 * Returns 0 when the candidate has the same portid and network namespace as
 * the key, non-zero otherwise.
 */
static inline int netlink_compare(struct rhashtable_compare_arg *arg,
                                  const void *ptr)
{
        const struct netlink_compare_arg *key = arg->key;
        const struct netlink_sock *candidate = ptr;

        if (candidate->portid != key->portid)
                return 1;

        return !net_eq(sock_net(&candidate->sk), read_pnet(&key->pnet));
}

/* Fill lookup key @arg with namespace @net and port id @portid.
 *
 * The whole structure is zeroed first, so any bytes not covered by the two
 * fields are zero as well.
 * NOTE(review): presumably required because the key is hashed as raw
 * bytes - confirm against netlink_rhashtable_params.
 */
static void netlink_compare_arg_init(struct netlink_compare_arg *arg,
                                     struct net *net, u32 portid)
{
        memset(arg, 0, sizeof(*arg));
        write_pnet(&arg->pnet, net);
        arg->portid = portid;
}

/* Find the socket bound to @portid in namespace @net within @table.
 *
 * Returns the rhashtable_lookup_fast() result; no reference is taken here
 * (netlink_lookup() adds one under RCU).
 */
static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid,
                                     struct net *net)
{
        struct netlink_compare_arg key;

        netlink_compare_arg_init(&key, net, portid);

        return rhashtable_lookup_fast(&table->hash, &key,
                                      netlink_rhashtable_params);
}

static int __netlink_insert(struct netlink_table *table, struct sock *sk)
{
        struct netlink_compare_arg arg;

        netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid);
        return rhashtable_lookup_insert_key(&table->hash, &arg,
                                            &nlk_sk(sk)->node,
                                            netlink_rhashtable_params);
}

/* Look up the socket bound to @portid for @protocol in namespace @net.
 *
 * Returns the socket with an extra reference held (taken inside the RCU
 * read-side section), or NULL when there is none. The caller drops the
 * reference.
 */
static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
{
        struct netlink_table *table = &nl_table[protocol];
        struct sock *found;

        rcu_read_lock();
        found = __netlink_lookup(table, portid, net);
        if (found)
                sock_hold(found);
        rcu_read_unlock();

        return found;
}

static const struct proto_ops netlink_ops;

/* Recompute the listener bitmap of @sk's protocol.
 *
 * Each word of the bitmap becomes the OR of the corresponding group word of
 * every socket on the protocol's mc_list. Nothing happens when the protocol
 * has no listeners structure.
 *
 * This function is only called with the netlink table "grabbed", which
 * makes sure updates are visible before bind or setsockopt return.
 */
static void
netlink_update_listeners(struct sock *sk)
{
        struct netlink_table *tbl = &nl_table[sk->sk_protocol];
        struct listeners *listeners;
        struct sock *bound;
        unsigned int word;

        listeners = nl_deref_protected(tbl->listeners);
        if (!listeners)
                return;

        for (word = 0; word < NLGRPLONGS(tbl->groups); word++) {
                unsigned long mask = 0;

                sk_for_each_bound(bound, &tbl->mc_list) {
                        /* Sockets may have fewer group words allocated. */
                        if (word < NLGRPLONGS(nlk_sk(bound)->ngroups))
                                mask |= nlk_sk(bound)->groups[word];
                }
                listeners->masks[word] = mask;
        }
}

/* Bind @sk to @portid by inserting it into its protocol's hash table.
 *
 * Runs under the socket lock. If the socket is already bound, returns 0
 * when it already owns @portid and -EBUSY otherwise. On a fresh bind the
 * hash table gets its own socket reference.
 *
 * Returns 0 on success, -EADDRINUSE when the port id is taken, -EOVERFLOW
 * when the hash table backend reports -EBUSY, or another error from the
 * insert.
 */
static int netlink_insert(struct sock *sk, u32 portid)
{
        struct netlink_table *table = &nl_table[sk->sk_protocol];
        int err;

        lock_sock(sk);

        /* Result for the already-bound case, computed before the check. */
        err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY;
        if (nlk_sk(sk)->bound)
                goto err;

        /* portid can be read locklessly from netlink_getname(). */
        WRITE_ONCE(nlk_sk(sk)->portid, portid);

        /* Reference owned by the hash table entry. */
        sock_hold(sk);

        err = __netlink_insert(table, sk);
        if (err) {
                /* In case the hashtable backend returns with -EBUSY
                 * from here, it must not escape to the caller.
                 */
                if (unlikely(err == -EBUSY))
                        err = -EOVERFLOW;
                if (err == -EEXIST)
                        err = -EADDRINUSE;
                /* NOTE(review): nlk->portid keeps the value written above
                 * on this failure path - confirm callers tolerate that.
                 */
                sock_put(sk);
                goto err;
        }

        /* We need to ensure that the socket is hashed and visible. */
        smp_wmb();
        /* Paired with lockless reads from netlink_bind(),
         * netlink_connect() and netlink_sendmsg().
         */
        WRITE_ONCE(nlk_sk(sk)->bound, portid);

err:
        release_sock(sk);
        return err;
}

/* Unhash @sk and detach it from its protocol's multicast list.
 *
 * A successful hash removal drops the reference taken by netlink_insert().
 * With the table grabbed, a socket that has subscriptions is removed from
 * the bind list and the listener bitmap is recomputed. For NETLINK_GENERIC
 * sockets genl_sk_destructing_cnt is incremented; netlink_release()
 * decrements it again.
 */
static void netlink_remove(struct sock *sk)
{
        struct netlink_table *table;

        table = &nl_table[sk->sk_protocol];
        if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node,
                                    netlink_rhashtable_params)) {
                /* The caller must still hold its own reference. */
                WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
                __sock_put(sk);
        }

        netlink_table_grab();
        if (nlk_sk(sk)->subscriptions) {
                __sk_del_bind_node(sk);
                netlink_update_listeners(sk);
        }
        if (sk->sk_protocol == NETLINK_GENERIC)
                atomic_inc(&genl_sk_destructing_cnt);
        netlink_table_ungrab();
}

/* struct proto shared by all netlink sockets; obj_size makes sk_alloc()
 * allocate a full struct netlink_sock.
 */
static struct proto netlink_proto = {
        .name          = "NETLINK",
        .owner          = THIS_MODULE,
        .obj_size = sizeof(struct netlink_sock),
};

/* Allocate and initialise the struct sock behind @sock.
 *
 * @net:      network namespace for the new socket
 * @sock:     socket to attach the new sock to; its ops are set to netlink_ops
 * @protocol: netlink protocol number; also selects the lockdep class and
 *            name of the dump callback mutex
 * @kern:     passed through to sk_alloc()
 *
 * Returns 0 on success or -ENOMEM when the sock cannot be allocated.
 */
static int __netlink_create(struct net *net, struct socket *sock,
                            int protocol, int kern)
{
        struct sock *sk;
        struct netlink_sock *nlk;

        sock->ops = &netlink_ops;

        sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern);
        if (!sk)
                return -ENOMEM;

        sock_init_data(sock, sk);

        nlk = nlk_sk(sk);
        mutex_init(&nlk->nl_cb_mutex);
        /* Give each protocol's callback mutex its own lockdep class. */
        lockdep_set_class_and_name(&nlk->nl_cb_mutex,
                                           nlk_cb_mutex_keys + protocol,
                                           nlk_cb_mutex_key_strings[protocol]);
        init_waitqueue_head(&nlk->wait);

        sk->sk_destruct = netlink_sock_destruct;
        sk->sk_protocol = protocol;
        return 0;
}

/* Create a netlink socket of the given protocol.
 *
 * @net:      network namespace of the new socket
 * @sock:     socket being created; must be SOCK_RAW or SOCK_DGRAM
 * @protocol: netlink protocol number, 0 <= protocol < MAX_LINKS
 * @kern:     passed through to __netlink_create()
 *
 * When the protocol is not registered and modules are enabled, a module load
 * is requested before checking again. On success the socket holds a
 * reference on the protocol's module and a snapshot of its bind, unbind and
 * release callbacks.
 *
 * Returns 0 on success, -ESOCKTNOSUPPORT for an unsupported socket type,
 * -EPROTONOSUPPORT for an out-of-range or unregistered protocol, or the
 * error from __netlink_create().
 */
static int netlink_create(struct net *net, struct socket *sock, int protocol,
                          int kern)
{
        void (*release)(struct sock *sock, unsigned long *groups);
        void (*unbind)(struct net *net, int group);
        int (*bind)(struct net *net, int group);
        struct module *module = NULL;
        struct netlink_sock *nlk;
        int err = 0;

        sock->state = SS_UNCONNECTED;

        if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
                return -ESOCKTNOSUPPORT;

        if (protocol < 0 || protocol >= MAX_LINKS)
                return -EPROTONOSUPPORT;
        /* Clamp the index under speculation as well. */
        protocol = array_index_nospec(protocol, MAX_LINKS);

        netlink_lock_table();
#ifdef CONFIG_MODULES
        if (!nl_table[protocol].registered) {
                /* Loading a module must not happen with the table held. */
                netlink_unlock_table();
                request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
                netlink_lock_table();
        }
#endif
        if (!nl_table[protocol].registered ||
            !try_module_get(nl_table[protocol].module))
                err = -EPROTONOSUPPORT;
        else
                module = nl_table[protocol].module;
        bind = nl_table[protocol].bind;
        unbind = nl_table[protocol].unbind;
        release = nl_table[protocol].release;
        netlink_unlock_table();

        if (err < 0)
                return err;

        err = __netlink_create(net, sock, protocol, kern);
        if (err < 0) {
                module_put(module);
                return err;
        }

        sock_prot_inuse_add(net, &netlink_proto, 1);

        nlk = nlk_sk(sock->sk);
        nlk->module = module;
        nlk->netlink_bind = bind;
        nlk->netlink_unbind = unbind;
        nlk->netlink_release = release;

        return err;
}

/* RCU callback queued by netlink_release().
 *
 * Frees the socket's group bitmap and drops one socket reference. If that
 * was the last reference, the socket is freed: directly in the common case,
 * or from a work item when a dump is still running and has a done callback.
 * NOTE(review): presumably because that callback must not run in RCU
 * callback context - confirm.
 */
static void deferred_put_nlk_sk(struct rcu_head *head)
{
        struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
        struct sock *sk = &nlk->sk;

        kfree(nlk->groups);
        nlk->groups = NULL;

        /* Someone else still holds the socket; they will free it. */
        if (!refcount_dec_and_test(&sk->sk_refcnt))
                return;

        if (nlk->cb_running && nlk->cb.done) {
                INIT_WORK(&nlk->work, netlink_sock_destruct_work);
                schedule_work(&nlk->work);
                return;
        }

        sk_free(sk);
}

/* Release handler for netlink sockets.
 *
 * Unhashes and orphans the socket, runs the protocol's release and unbind
 * callbacks, notifies the netlink_chain listeners for bound sockets, drops
 * the module reference, unregisters the protocol when the last kernel socket
 * goes away, and finally defers the last socket reference drop to an RCU
 * callback.
 *
 * Always returns 0.
 */
static int netlink_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk;

        if (!sk)
                return 0;

        netlink_remove(sk);
        sock_orphan(sk);
        nlk = nlk_sk(sk);

        /*
         * OK. Socket is unlinked, any packets that arrive now
         * will be purged.
         */
        if (nlk->netlink_release)
                nlk->netlink_release(sk, nlk->groups);

        /* must not acquire netlink_table_lock in any way again before unbind
         * and notifying genetlink is done as otherwise it might deadlock
         */
        if (nlk->netlink_unbind) {
                int i;

                /* Bit i of the bitmap is group number i + 1. */
                for (i = 0; i < nlk->ngroups; i++)
                        if (test_bit(i, nlk->groups))
                                nlk->netlink_unbind(sock_net(sk), i + 1);
        }
        /* Balances the increment done in netlink_remove(). */
        if (sk->sk_protocol == NETLINK_GENERIC &&
            atomic_dec_return(&genl_sk_destructing_cnt) == 0)
                wake_up(&genl_sk_destructing_waitq);

        sock->sk = NULL;
        wake_up_interruptible_all(&nlk->wait);

        skb_queue_purge(&sk->sk_write_queue);

        /* Tell interested subsystems that this port id went away. */
        if (nlk->portid && nlk->bound) {
                struct netlink_notify n = {
                                                .net = sock_net(sk),
                                                .protocol = sk->sk_protocol,
                                                .portid = nlk->portid,
                                          };
                blocking_notifier_call_chain(&netlink_chain,
                                NETLINK_URELEASE, &n);
        }

        module_put(nlk->module);

        if (netlink_is_kernel(sk)) {
                netlink_table_grab();
                BUG_ON(nl_table[sk->sk_protocol].registered == 0);
                /* Last kernel socket of this protocol: unregister it. */
                if (--nl_table[sk->sk_protocol].registered == 0) {
                        struct listeners *old;

                        old = nl_deref_protected(nl_table[sk->sk_protocol].listeners);
                        RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL);
                        kfree_rcu(old, rcu);
                        /* NOTE(review): the table's release callback is
                         * not reset here, unlike bind and unbind - confirm
                         * this is intentional.
                         */
                        nl_table[sk->sk_protocol].module = NULL;
                        nl_table[sk->sk_protocol].bind = NULL;
                        nl_table[sk->sk_protocol].unbind = NULL;
                        nl_table[sk->sk_protocol].flags = 0;
                        /* Already zero after the decrement above. */
                        nl_table[sk->sk_protocol].registered = 0;
                }
                netlink_table_ungrab();
        }

        sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);

        /* Because struct net might disappear soon, do not keep a pointer. */
        if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) {
                __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
                /* Because of deferred_put_nlk_sk and use of work queue,
                 * it is possible that the netns will be freed before this
                 * socket.
                 */
                sock_net_set(sk, &init_net);
                __netns_tracker_alloc(&init_net, &sk->ns_tracker,
                                      false, GFP_KERNEL);
        }
        /* The final reference is dropped after an RCU grace period. */
        call_rcu(&nlk->rcu, deferred_put_nlk_sk);
        return 0;
}

static int netlink_autobind(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct netlink_table *table = &nl_table[sk->sk_protocol];
        s32 portid = task_tgid_vnr(current);
        int err;
        s32 rover = -4096;
        bool ok;

retry:
        cond_resched();
        rcu_read_lock();
        ok = !__netlink_lookup(table, portid, net);
        rcu_read_unlock();
        if (!ok) {
                /* Bind collision, search negative portid values. */
                if (rover == -4096)
                        /* rover will be in range [S32_MIN, -4097] */
                        rover = S32_MIN + get_random_u32_below(-4096 - S32_MIN);
                else if (rover >= -4096)
                        rover = -4097;
                portid = rover--;
                goto retry;
        }

        err = netlink_insert(sk, portid);
        if (err == -EADDRINUSE)
                goto retry;

        /* If 2 threads race to autobind, that is fine.  */
        if (err == -EBUSY)
                err = 0;

        return err;
}

/**
 * __netlink_ns_capable - General netlink message capability test
 * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace.
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message
 * from had when the netlink socket was created and the sender of the
 * message has the capability @cap in the user namespace @user_ns.
 */
bool __netlink_ns_capable(const struct netlink_skb_parms *nsp,
                        struct user_namespace *user_ns, int cap)
{
        return ((nsp->flags & NETLINK_SKB_DST) ||
                file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) &&
                ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(__netlink_ns_capable);

/**
 * netlink_ns_capable - General netlink message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Applies __netlink_ns_capable() to the NETLINK_CB() control block of
 * @skb and returns its result.
 */
bool netlink_ns_capable(const struct sk_buff *skb,
                        struct user_namespace *user_ns, int cap)
{
        const struct netlink_skb_parms *parms = &NETLINK_CB(skb);

        return __netlink_ns_capable(parms, user_ns, cap);
}
EXPORT_SYMBOL(netlink_ns_capable);

/**
 * netlink_capable - Netlink global message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @cap: The capability to use
 *
 * Same test as netlink_ns_capable(), evaluated against the initial user
 * namespace (&init_user_ns).
 */
bool netlink_capable(const struct sk_buff *skb, int cap)
{
        struct user_namespace *root_ns = &init_user_ns;

        return netlink_ns_capable(skb, root_ns, cap);
}
EXPORT_SYMBOL(netlink_capable);

/**
 * netlink_net_capable - Netlink network namespace message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @cap: The capability to use
 *
 * Same test as netlink_ns_capable(), evaluated against the user namespace
 * that owns the network namespace of the socket attached to @skb.
 */
bool netlink_net_capable(const struct sk_buff *skb, int cap)
{
        struct user_namespace *owner_ns = sock_net(skb->sk)->user_ns;

        return netlink_ns_capable(skb, owner_ns, cap);
}
EXPORT_SYMBOL(netlink_net_capable);

static inline int netlink_allowed(const struct socket *sock, unsigned int flag)
{
        return (nl_table[sock->sk->sk_protocol].flags & flag) ||
                ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN);
}

static void
netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
{
        struct netlink_sock *nlk = nlk_sk(sk);

        if (nlk->subscriptions && !subscriptions)
                __sk_del_bind_node(sk);
        else if (!nlk->subscriptions && subscriptions)
                sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
        nlk->subscriptions = subscriptions;
}

/*
 * Grow the per-socket group bitmap of @sk so that it covers every group
 * of the socket's protocol.  Newly added bits are cleared.
 *
 * Returns 0 on success (including when no growth is needed), -ENOENT when
 * the protocol is not registered, or -ENOMEM when the bitmap cannot be
 * reallocated.
 */
static int netlink_realloc_groups(struct sock *sk)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        unsigned long *bitmap;
        unsigned int wanted;
        int err = 0;

        netlink_table_grab();

        wanted = nl_table[sk->sk_protocol].groups;

        if (!nl_table[sk->sk_protocol].registered) {
                err = -ENOENT;
        } else if (nlk->ngroups < wanted) {
                bitmap = krealloc(nlk->groups, NLGRPSZ(wanted), GFP_ATOMIC);
                if (!bitmap) {
                        err = -ENOMEM;
                } else {
                        /* Zero only the tail that was just added. */
                        memset((char *)bitmap + NLGRPSZ(nlk->ngroups), 0,
                               NLGRPSZ(wanted) - NLGRPSZ(nlk->ngroups));

                        nlk->groups = bitmap;
                        nlk->ngroups = wanted;
                }
        }

        netlink_table_ungrab();
        return err;
}

/*
 * Invoke the socket's netlink_unbind callback for every bit set in
 * @groups below bit index @group.  Bit n corresponds to group n + 1.
 * Does nothing when the socket has no netlink_unbind callback.
 */
static void netlink_undo_bind(int group, long unsigned int groups,
                              struct sock *sk)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        int bit = 0;

        if (!nlk->netlink_unbind)
                return;

        while (bit < group) {
                if (test_bit(bit, &groups))
                        nlk->netlink_unbind(sock_net(sk), bit + 1);
                bit++;
        }
}

/*
 * Bind @sock to the port id and multicast group mask given in @addr.
 *
 * Returns 0 on success.  Returns -EINVAL for a short or non-AF_NETLINK
 * address, or when the socket is already bound and nl_pid differs from the
 * bound port id; -EPERM when groups are requested without permission; or
 * the error from netlink_realloc_groups(), the netlink_bind callback,
 * netlink_insert() or netlink_autobind().
 */
static int netlink_bind(struct socket *sock, struct sockaddr *addr,
                        int addr_len)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct netlink_sock *nlk = nlk_sk(sk);
        struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
        int err = 0;
        unsigned long groups;
        bool bound;

        if (addr_len < sizeof(struct sockaddr_nl))
                return -EINVAL;

        if (nladdr->nl_family != AF_NETLINK)
                return -EINVAL;
        groups = nladdr->nl_groups;

        /* Only superuser is allowed to listen multicasts */
        if (groups) {
                if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
                        return -EPERM;
                err = netlink_realloc_groups(sk);
                if (err)
                        return err;
        }

        /* Drop requested bits beyond the groups this socket can hold. */
        if (nlk->ngroups < BITS_PER_LONG)
                groups &= (1UL << nlk->ngroups) - 1;

        /* Paired with WRITE_ONCE() in netlink_insert() */
        bound = READ_ONCE(nlk->bound);
        if (bound) {
                /* Ensure nlk->portid is up-to-date. */
                smp_rmb();

                if (nladdr->nl_pid != nlk->portid)
                        return -EINVAL;
        }

        /*
         * Give the per-socket bind callback a chance to veto each requested
         * group; on the first failure, unbind the groups accepted so far.
         */
        if (nlk->netlink_bind && groups) {
                int group;

                /* nl_groups is a u32, so cap the maximum groups we can bind */
                for (group = 0; group < BITS_PER_TYPE(u32); group++) {
                        if (!test_bit(group, &groups))
                                continue;
                        err = nlk->netlink_bind(net, group + 1);
                        if (!err)
                                continue;
                        netlink_undo_bind(group, groups, sk);
                        return err;
                }
        }

        /* No need for barriers here as we return to user-space without
         * using any of the bound attributes.
         */
        netlink_lock_table();
        if (!bound) {
                /* Explicit port id if one was given, otherwise pick one. */
                err = nladdr->nl_pid ?
                        netlink_insert(sk, nladdr->nl_pid) :
                        netlink_autobind(sock);
                if (err) {
                        netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk);
                        goto unlock;
                }
        }

        /* Nothing requested and nothing currently subscribed: done. */
        if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
                goto unlock;
        netlink_unlock_table();

        /*
         * Replace the low 32 bits of the group bitmap with the new mask and
         * adjust the subscription count by the change in set bits.
         */
        netlink_table_grab();
        netlink_update_subscriptions(sk, nlk->subscriptions +
                                         hweight32(groups) -
                                         hweight32(nlk->groups[0]));
        nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups;
        netlink_update_listeners(sk);
        netlink_table_ungrab();

        return 0;

unlock:
        netlink_unlock_table();
        return err;
}

/*
 * Set or clear the default destination of @sock.
 *
 * An AF_UNSPEC address clears the destination and marks the socket
 * unconnected.  An AF_NETLINK address is stored as the default port id and
 * group after a permission check; an unbound socket is autobound first.
 *
 * Returns 0 on success, -EINVAL for a malformed address, -EPERM when a
 * non-zero destination is not permitted, or the netlink_autobind() error.
 */
static int netlink_connect(struct socket *sock, struct sockaddr *addr,
                           int alen, int flags)
{
        struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        int err;

        if (alen < sizeof(addr->sa_family))
                return -EINVAL;

        if (addr->sa_family == AF_UNSPEC) {
                /* paired with READ_ONCE() in netlink_getsockbyportid() */
                WRITE_ONCE(sk->sk_state, NETLINK_UNCONNECTED);
                /* dst_portid and dst_group can be read locklessly */
                WRITE_ONCE(nlk->dst_portid, 0);
                WRITE_ONCE(nlk->dst_group, 0);
                return 0;
        }

        if (addr->sa_family != AF_NETLINK)
                return -EINVAL;

        if (alen < sizeof(struct sockaddr_nl))
                return -EINVAL;

        if ((nladdr->nl_groups || nladdr->nl_pid) &&
            !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
                return -EPERM;

        /* No need for barriers here as we return to user-space without
         * using any of the bound attributes.
         * Paired with WRITE_ONCE() in netlink_insert().
         */
        if (!READ_ONCE(nlk->bound)) {
                err = netlink_autobind(sock);
                if (err)
                        return err;
        }

        /* paired with READ_ONCE() in netlink_getsockbyportid() */
        WRITE_ONCE(sk->sk_state, NETLINK_CONNECTED);
        /* dst_portid and dst_group can be read locklessly */
        WRITE_ONCE(nlk->dst_portid, nladdr->nl_pid);
        WRITE_ONCE(nlk->dst_group, ffs(nladdr->nl_groups));

        return 0;
}

/*
 * Fill @addr with either the socket's own address (@peer == 0) or its
 * default destination (@peer != 0) and return the size of the address.
 */
static int netlink_getname(struct socket *sock, struct sockaddr *addr,
                           int peer)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr);

        nladdr->nl_family = AF_NETLINK;
        nladdr->nl_pad = 0;

        if (!peer) {
                /* Paired with WRITE_ONCE() in netlink_insert() */
                nladdr->nl_pid = READ_ONCE(nlk->portid);

                netlink_lock_table();
                if (nlk->groups)
                        nladdr->nl_groups = nlk->groups[0];
                else
                        nladdr->nl_groups = 0;
                netlink_unlock_table();

                return sizeof(*nladdr);
        }

        /* Paired with WRITE_ONCE() in netlink_connect() */
        nladdr->nl_pid = READ_ONCE(nlk->dst_portid);
        nladdr->nl_groups = netlink_group_mask(READ_ONCE(nlk->dst_group));

        return sizeof(*nladdr);
}

/*
 * No ioctl is implemented at the netlink socket level: every request is
 * answered with -ENOIOCTLCMD.
 * NOTE(review): presumably the caller then tries the NIC drivers, as the
 * original comment said - confirm against the generic socket ioctl path.
 */
static int netlink_ioctl(struct socket *sock, unsigned int cmd,
                         unsigned long arg)
{
        return -ENOIOCTLCMD;
}

/*
 * Look up the socket bound to @portid in the netns and protocol of @ssk.
 *
 * Returns the socket from netlink_lookup() (the refusal path below drops
 * it with sock_put()), or ERR_PTR(-ECONNREFUSED) when no such socket
 * exists or when it is connected to a port id other than that of @ssk.
 */
static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
{
        struct sock *dst;

        dst = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid);
        if (!dst)
                return ERR_PTR(-ECONNREFUSED);

        /* dst_portid and sk_state can be changed in netlink_connect() */
        if (READ_ONCE(dst->sk_state) != NETLINK_CONNECTED)
                return dst;

        if (READ_ONCE(nlk_sk(dst)->dst_portid) == nlk_sk(ssk)->portid)
                return dst;

        /* Connected to somebody else: refuse and drop the lookup reference. */
        sock_put(dst);
        return ERR_PTR(-ECONNREFUSED);
}

/*
 * Return the netlink socket behind @filp with an extra reference held.
 *
 * Returns ERR_PTR(-ENOTSOCK) when @filp is not a socket and
 * ERR_PTR(-EINVAL) when the socket is not of family AF_NETLINK.
 */
struct sock *netlink_getsockbyfilp(struct file *filp)
{
        struct inode *ino = file_inode(filp);
        struct sock *sk;

        if (!S_ISSOCK(ino->i_mode))
                return ERR_PTR(-ENOTSOCK);

        sk = SOCKET_I(ino)->sk;
        if (sk->sk_family != AF_NETLINK)
                return ERR_PTR(-EINVAL);

        sock_hold(sk);

        return sk;
}

/*
 * Allocate an skb with room for @size bytes.
 *
 * When the aligned head fits in a page, or @broadcast is non-zero, a plain
 * alloc_skb() is used.  Otherwise the head comes from kvmalloc() and, if
 * that memory is vmalloc-backed, netlink_skb_destructor is installed as
 * the skb destructor.
 *
 * Returns the new skb, or NULL on allocation failure.
 */
struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast)
{
        size_t head_size = SKB_HEAD_ALIGN(size);
        struct sk_buff *skb;
        void *head;

        if (head_size <= PAGE_SIZE || broadcast)
                return alloc_skb(size, GFP_KERNEL);

        head = kvmalloc(head_size, GFP_KERNEL);
        if (!head)
                return NULL;

        skb = __build_skb(head, head_size);
        if (!skb) {
                kvfree(head);
                return NULL;
        }

        if (is_vmalloc_addr(head))
                skb->destructor = netlink_skb_destructor;

        return skb;
}

/*
 * Attach a skb to a netlink socket.
 * The caller must hold a reference to the destination socket. On error, the
 * reference is dropped. The skb is not sent to the destination; only the
 * error checks are performed and memory in the queue is reserved.
 *
 * @timeo points to the remaining wait budget; it is updated with the value
 * returned by schedule_timeout() when the function sleeps.
 *
 * Return values:
 * < 0: error. skb freed, reference to sock dropped.
 * 0: continue
 * 1: repeat lookup - reference dropped while waiting for socket memory.
 */
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
                      long *timeo, struct sock *ssk)
{
        struct netlink_sock *nlk;

        nlk = nlk_sk(sk);

        /* Receiver is over its buffer limit or flagged congested. */
        if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
             test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
                DECLARE_WAITQUEUE(wait, current);
                /* No time left to wait: give up on this message. */
                if (!*timeo) {
                        if (!ssk || netlink_is_kernel(ssk))
                                netlink_overrun(sk);
                        sock_put(sk);
                        kfree_skb(skb);
                        return -EAGAIN;
                }

                __set_current_state(TASK_INTERRUPTIBLE);
                add_wait_queue(&nlk->wait, &wait);

                /*
                 * Re-test after queueing on the wait queue, and only sleep
                 * if the condition still holds and the socket is not dead.
                 */
                if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
                     test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
                    !sock_flag(sk, SOCK_DEAD))
                        *timeo = schedule_timeout(*timeo);

                __set_current_state(TASK_RUNNING);
                remove_wait_queue(&nlk->wait, &wait);
                /* The reference is dropped here; the caller must look up again. */
                sock_put(sk);

                if (signal_pending(current)) {
                        kfree_skb(skb);
                        return sock_intr_errno(*timeo);
                }
                return 1;
        }
        /* Room available: charge the skb to the receiving socket. */
        netlink_skb_set_owner_r(skb, sk);
        return 0;
}

/*
 * Queue @skb on the receive queue of @sk and signal data-ready.
 * Returns the length of @skb, sampled before it is queued.
 */
static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
        /* Read the length first; the skb is handed over once queued. */
        const int queued_len = skb->len;

        netlink_deliver_tap(sock_net(sk), skb);

        skb_queue_tail(&sk->sk_receive_queue, skb);
        sk->sk_data_ready(sk);

        return queued_len;
}

/*
 * Deliver @skb to @sk via __netlink_sendskb(), then drop one reference on
 * @sk.  Returns the length reported by __netlink_sendskb().
 */
int netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
        int sent;

        sent = __netlink_sendskb(sk, skb);
        sock_put(sk);

        return sent;
}

/* Free @skb and drop one reference on @sk, without delivering the skb. */
void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
{
        kfree_skb(skb);

        sock_put(sk);
}

/*
 * Try to shrink the unused tail room of @skb before it is queued.
 *
 * The skb is returned untouched when its head is vmalloc-backed or when
 * twice the tail room is still below its truesize.  A shared skb is
 * replaced by a clone first; if cloning fails the original is returned.
 * The result of the shrink itself is not checked.
 */
static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
{
        int slack;

        WARN_ON(skb->sk != NULL);

        slack = skb->end - skb->tail;
        if (is_vmalloc_addr(skb->head))
                return skb;
        if (slack * 2 < skb->truesize)
                return skb;

        if (skb_shared(skb)) {
                struct sk_buff *copy = skb_clone(skb, allocation);

                if (!copy)
                        return skb;

                consume_skb(skb);
                skb = copy;
        }

        pskb_expand_head(skb, 0, -slack,
                         (allocation & ~__GFP_DIRECT_RECLAIM) |
                         __GFP_NOWARN | __GFP_NORETRY);

        return skb;
}

/*
 * Hand @skb, sent by @ssk, directly to the kernel-side socket @sk.
 *
 * When @sk has a netlink_rcv callback, the skb is passed to it and then
 * consumed, and the skb length is returned.  Otherwise the skb is freed
 * and -ECONNREFUSED is returned.  One reference on @sk is dropped in both
 * cases.
 */
static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
                                  struct sock *ssk)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        int ret = -ECONNREFUSED;

        if (nlk->netlink_rcv == NULL) {
                kfree_skb(skb);
                goto put;
        }

        ret = skb->len;
        netlink_skb_set_owner_r(skb, sk);
        NETLINK_CB(skb).sk = ssk;
        netlink_deliver_tap_kernel(sk, ssk, skb);
        nlk->netlink_rcv(skb);
        consume_skb(skb);

put:
        sock_put(sk);
        return ret;
}

/*
 * Send @skb from @ssk to the single socket bound to @portid.
 *
 * The destination is looked up again every time netlink_attachskb() asks
 * for a retry.  Kernel-side destinations are served through
 * netlink_unicast_kernel().  An skb rejected by the destination's socket
 * filter is freed and its length is returned.
 *
 * Returns the value from the delivery helper on success, or a negative
 * error; the skb is consumed in every case.
 */
int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
                    u32 portid, int nonblock)
{
        struct sock *sk;
        long timeo;
        int err;

        skb = netlink_trim(skb, gfp_any());
        timeo = sock_sndtimeo(ssk, nonblock);

        for (;;) {
                sk = netlink_getsockbyportid(ssk, portid);
                if (IS_ERR(sk)) {
                        kfree_skb(skb);
                        return PTR_ERR(sk);
                }

                if (netlink_is_kernel(sk))
                        return netlink_unicast_kernel(sk, skb, ssk);

                if (sk_filter(sk, skb)) {
                        err = skb->len;
                        kfree_skb(skb);
                        sock_put(sk);
                        return err;
                }

                err = netlink_attachskb(sk, skb, &timeo, ssk);
                if (err != 1)
                        break;
        }

        if (err)
                return err;

        return netlink_sendskb(sk, skb);
}
EXPORT_SYMBOL(netlink_unicast);

/*
 * Report whether any socket listens on @group of the protocol served by
 * the kernel socket @sk.  Returns the listener bit for @group, or 0 when
 * the protocol has no listener bitmap or @group is out of range.
 * BUGs if @sk is not a kernel socket.
 */
int netlink_has_listeners(struct sock *sk, unsigned int group)
{
        struct listeners *lst;
        int found = 0;

        BUG_ON(!netlink_is_kernel(sk));

        rcu_read_lock();

        lst = rcu_dereference(nl_table[sk->sk_protocol].listeners);
        if (lst && group - 1 < nl_table[sk->sk_protocol].groups)
                found = test_bit(group - 1, lst->masks);

        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL_GPL(netlink_has_listeners);

/*
 * Return true when the socket recorded in the control block of @skb has
 * its STRICT_CHK flag set.
 */
bool netlink_strict_get_check(struct sk_buff *skb)
{
        struct sock *sender = NETLINK_CB(skb).sk;

        return nlk_test_bit(STRICT_CHK, sender);
}
EXPORT_SYMBOL_GPL(netlink_strict_get_check);

/*
 * Queue @skb on @sk if the socket has room and is not flagged congested.
 *
 * Returns -1 when the skb was not queued.  Otherwise returns 1 when the
 * receive memory now exceeds half of sk_rcvbuf and 0 when it does not.
 */
static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
{
        struct netlink_sock *nlk = nlk_sk(sk);

        if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
            test_bit(NETLINK_S_CONGESTED, &nlk->state))
                return -1;

        netlink_skb_set_owner_r(skb, sk);
        __netlink_sendskb(sk, skb);

        return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
}

/* State shared by do_one_broadcast() across one broadcast of a message. */
struct netlink_broadcast_data {
        /* Socket that never receives the message (the sending socket). */
        struct sock *exclude_sk;
        /* Network namespace the message originates from. */
        struct net *net;
        /* Sockets bound to this port id are skipped. */
        u32 portid;
        /* Destination group, 1-based (bit group - 1 in the socket bitmap). */
        u32 group;
        /* Set once obtaining an skb for a recipient has failed. */
        int failure;
        /* Set when a socket with BROADCAST_SEND_ERROR missed the message. */
        int delivery_failure;
        /* OR of the non-negative results of netlink_broadcast_deliver(). */
        int congested;
        /* Set once at least one socket has been given the message. */
        int delivered;
        /* GFP flags used when cloning the skb. */
        gfp_t allocation;
        /* skb: the original message; skb2: copy/ref pending for next socket. */
        struct sk_buff *skb, *skb2;
        /* Optional per-recipient filter; non-zero return skips the socket. */
        int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data);
        /* Opaque argument passed to tx_filter. */
        void *tx_data;
};

/*
 * Try to deliver the broadcast described by @p to the single socket @sk.
 *
 * Sockets that are excluded, bound to the sender's port id, not subscribed
 * to the group, or in another netns without the required opt-in are
 * skipped silently.  Delivery outcome is recorded in @p (failure,
 * delivery_failure, congested, delivered).
 */
static void do_one_broadcast(struct sock *sk,
                                    struct netlink_broadcast_data *p)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        int val;

        if (p->exclude_sk == sk)
                return;

        /* Skip the sender's own port id and sockets not in the group. */
        if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
            !test_bit(p->group - 1, nlk->groups))
                return;

        /*
         * A socket in a different netns only receives the message when it
         * set LISTEN_ALL_NSID, the source netns has an id in its netns, and
         * its opener holds CAP_NET_BROADCAST over the source netns.
         */
        if (!net_eq(sock_net(sk), p->net)) {
                if (!nlk_test_bit(LISTEN_ALL_NSID, sk))
                        return;

                if (!peernet_has_id(sock_net(sk), p->net))
                        return;

                if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns,
                                     CAP_NET_BROADCAST))
                        return;
        }

        /* An earlier clone failed: just report the overrun to this socket. */
        if (p->failure) {
                netlink_overrun(sk);
                return;
        }

        sock_hold(sk);
        /*
         * Obtain an skb for this recipient unless one is still pending from
         * a previous socket that did not consume it.
         */
        if (p->skb2 == NULL) {
                if (skb_shared(p->skb)) {
                        p->skb2 = skb_clone(p->skb, p->allocation);
                } else {
                        p->skb2 = skb_get(p->skb);
                        /*
                         * skb ownership may have been set when
                         * delivered to a previous socket.
                         */
                        skb_orphan(p->skb2);
                }
        }
        if (p->skb2 == NULL) {
                netlink_overrun(sk);
                /* Clone failed. Notify ALL listeners. */
                p->failure = 1;
                if (nlk_test_bit(BROADCAST_SEND_ERROR, sk))
                        p->delivery_failure = 1;
                goto out;
        }

        /* Caller-supplied filter rejected this recipient. */
        if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
                kfree_skb(p->skb2);
                p->skb2 = NULL;
                goto out;
        }

        /* The recipient's socket filter rejected the message. */
        if (sk_filter(sk, p->skb2)) {
                kfree_skb(p->skb2);
                p->skb2 = NULL;
                goto out;
        }
        /* Record the source netns id as seen from the recipient's netns. */
        NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net);
        if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED)
                NETLINK_CB(p->skb2).nsid_is_set = true;
        val = netlink_broadcast_deliver(sk, p->skb2);
        if (val < 0) {
                /* Not queued: skb2 stays pending for the next socket. */
                netlink_overrun(sk);
                if (nlk_test_bit(BROADCAST_SEND_ERROR, sk))
                        p->delivery_failure = 1;
        } else {
                p->congested |= val;
                p->delivered = 1;
                /* Queued on sk; the next recipient needs a fresh skb. */
                p->skb2 = NULL;
        }
out:
        sock_put(sk);
}

/*
 * Broadcast @skb from @ssk to every socket subscribed to @group, skipping
 * @ssk itself and sockets bound to @portid.  When @filter is non-NULL it
 * is called per recipient with @filter_data and a non-zero result skips
 * that recipient.
 *
 * Returns 0 when at least one socket received the message, -ENOBUFS when a
 * socket that asked for broadcast errors missed it, and -ESRCH when nobody
 * received it.  The caller's reference on @skb is consumed.
 */
int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb,
                               u32 portid,
                               u32 group, gfp_t allocation,
                               netlink_filter_fn filter,
                               void *filter_data)
{
        struct netlink_broadcast_data info = {
                .exclude_sk = ssk,
                .net = sock_net(ssk),
                .portid = portid,
                .group = group,
                .failure = 0,
                .delivery_failure = 0,
                .congested = 0,
                .delivered = 0,
                .allocation = allocation,
                .skb2 = NULL,
                .tx_filter = filter,
                .tx_data = filter_data,
        };
        struct sock *sk;

        skb = netlink_trim(skb, allocation);
        info.skb = skb;

        /* While we sleep in clone, do not allow to change socket list */
        netlink_lock_table();

        sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
                do_one_broadcast(sk, &info);

        consume_skb(skb);

        netlink_unlock_table();

        if (info.delivery_failure) {
                kfree_skb(info.skb2);
                return -ENOBUFS;
        }
        consume_skb(info.skb2);

        if (!info.delivered)
                return -ESRCH;

        if (info.congested && gfpflags_allow_blocking(allocation))
                yield();

        return 0;
}
EXPORT_SYMBOL(netlink_broadcast_filtered);

/*
 * Broadcast @skb to @group without a per-recipient filter.  Thin wrapper
 * around netlink_broadcast_filtered(); see there for the return values.
 */
int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid,
                      u32 group, gfp_t allocation)
{
        return netlink_broadcast_filtered(ssk, skb, portid, group,
                                          allocation, NULL, NULL);
}
EXPORT_SYMBOL(netlink_broadcast);

/* Arguments passed from netlink_set_err() to do_one_set_err(). */
struct netlink_set_err_data {
        /* Socket that is never marked; also selects the netns to match. */
        struct sock *exclude_sk;
        /* Sockets bound to this port id are skipped. */
        u32 portid;
        /* Target group, 1-based (bit group - 1 in the socket bitmap). */
        u32 group;
        /* Positive error value stored into sk_err. */
        int code;
};

/*
 * Report the error in @p to @sk if it is a listener of the target group.
 *
 * Returns 1 when the error is ENOBUFS and @sk has RECV_NO_ENOBUFS set (the
 * error is then not reported), 0 in every other case.
 */
static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p)
{
        struct netlink_sock *nlk = nlk_sk(sk);

        if (sk == p->exclude_sk)
                return 0;

        if (!net_eq(sock_net(sk), sock_net(p->exclude_sk)))
                return 0;

        if (nlk->portid == p->portid)
                return 0;
        if (p->group - 1 >= nlk->ngroups)
                return 0;
        if (!test_bit(p->group - 1, nlk->groups))
                return 0;

        if (p->code == ENOBUFS && nlk_test_bit(RECV_NO_ENOBUFS, sk))
                return 1;

        WRITE_ONCE(sk->sk_err, p->code);
        sk_error_report(sk);

        return 0;
}

/**
 * netlink_set_err - report error to broadcast listeners
 * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
 * @portid: the PORTID of a process that we want to skip (if any)
 * @group: the broadcast group that will notice the error
 * @code: error code, must be negative (as usual in kernelspace)
 *
 * Walks the multicast list of the protocol of @ssk with nl_table_lock
 * read-held and interrupts disabled, and reports the error to each
 * matching listener.
 *
 * This function returns the number of broadcast listeners that have set the
 * NETLINK_NO_ENOBUFS socket option.
 */
int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code)
{
        struct netlink_set_err_data info = {
                .exclude_sk = ssk,
                .portid = portid,
                .group = group,
                /* sk->sk_err wants a positive error value */
                .code = -code,
        };
        unsigned long flags;
        struct sock *sk;
        int skipped = 0;

        read_lock_irqsave(&nl_table_lock, flags);

        sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
                skipped += do_one_set_err(sk, &info);

        read_unlock_irqrestore(&nl_table_lock, flags);

        return skipped;
}
EXPORT_SYMBOL(netlink_set_err);

/*
 * Set (@is_new != 0) or clear (@is_new == 0) the membership of @nlk in
 * @group (1-based), then refresh the subscription count and the protocol's
 * listener state.
 *
 * must be called with netlink table grabbed
 */
static void netlink_update_socket_mc(struct netlink_sock *nlk,
                                     unsigned int group,
                                     int is_new)
{
        int was_member, now_member, subscriptions;

        now_member = !!is_new;
        /* Sample the old bit before it is overwritten below. */
        was_member = test_bit(group - 1, nlk->groups);
        subscriptions = nlk->subscriptions - was_member + now_member;

        __assign_bit(group - 1, nlk->groups, now_member);
        netlink_update_subscriptions(&nlk->sk, subscriptions);
        netlink_update_listeners(&nlk->sk);
}

/*
 * Set a SOL_NETLINK socket option on @sock.
 *
 * Most options are simple booleans stored as one bit in nlk->flags; those
 * cases only select the bit number in @nr and the bit is written after the
 * switch.  The membership options and NETLINK_NO_ENOBUFS do their own work
 * and leave @nr at -1.
 *
 * Returns 0 on success, -ENOPROTOOPT for a foreign level or unknown
 * option, -EFAULT when the value cannot be copied, -EPERM / -EINVAL or a
 * callback error for the membership options.
 */
static int netlink_setsockopt(struct socket *sock, int level, int optname,
                              sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        unsigned int val = 0;
        int nr = -1;

        if (level != SOL_NETLINK)
                return -ENOPROTOOPT;

        /* A value shorter than an int is not read; val then stays 0. */
        if (optlen >= sizeof(int) &&
            copy_from_sockptr(&val, optval, sizeof(val)))
                return -EFAULT;

        switch (optname) {
        case NETLINK_PKTINFO:
                nr = NETLINK_F_RECV_PKTINFO;
                break;
        case NETLINK_ADD_MEMBERSHIP:
        case NETLINK_DROP_MEMBERSHIP: {
                int err;

                if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
                        return -EPERM;
                err = netlink_realloc_groups(sk);
                if (err)
                        return err;
                /* val is the 1-based group number. */
                if (!val || val - 1 >= nlk->ngroups)
                        return -EINVAL;
                /* Let the bind callback veto a join before it takes effect. */
                if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) {
                        err = nlk->netlink_bind(sock_net(sk), val);
                        if (err)
                                return err;
                }
                netlink_table_grab();
                netlink_update_socket_mc(nlk, val,
                                         optname == NETLINK_ADD_MEMBERSHIP);
                netlink_table_ungrab();
                /* The unbind callback runs after the membership is dropped. */
                if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind)
                        nlk->netlink_unbind(sock_net(sk), val);

                break;
        }
        case NETLINK_BROADCAST_ERROR:
                nr = NETLINK_F_BROADCAST_SEND_ERROR;
                break;
        case NETLINK_NO_ENOBUFS:
                assign_bit(NETLINK_F_RECV_NO_ENOBUFS, &nlk->flags, val);
                /* Enabling the option also clears congestion and wakes waiters. */
                if (val) {
                        clear_bit(NETLINK_S_CONGESTED, &nlk->state);
                        wake_up_interruptible(&nlk->wait);
                }
                break;
        case NETLINK_LISTEN_ALL_NSID:
                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
                        return -EPERM;
                nr = NETLINK_F_LISTEN_ALL_NSID;
                break;
        case NETLINK_CAP_ACK:
                nr = NETLINK_F_CAP_ACK;
                break;
        case NETLINK_EXT_ACK:
                nr = NETLINK_F_EXT_ACK;
                break;
        case NETLINK_GET_STRICT_CHK:
                nr = NETLINK_F_STRICT_CHK;
                break;
        default:
                return -ENOPROTOOPT;
        }
        /* Plain boolean options: store val as the selected flag bit. */
        if (nr >= 0)
                assign_bit(nr, &nlk->flags, val);
        return 0;
}

/*
 * Read a SOL_NETLINK socket option of @sock into user space.
 *
 * Boolean options select a bit of nlk->flags in @flag; the bit is returned
 * as an int after the switch.  NETLINK_LIST_MEMBERSHIPS is handled inside
 * the switch and copies out the group bitmap instead.
 *
 * Returns 0 on success, -ENOPROTOOPT for a foreign level or unknown
 * option, -EINVAL for a negative or too small length, and -EFAULT when
 * user memory cannot be accessed.
 */
static int netlink_getsockopt(struct socket *sock, int level, int optname,
                              char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        unsigned int flag;
        int len, val;

        if (level != SOL_NETLINK)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case NETLINK_PKTINFO:
                flag = NETLINK_F_RECV_PKTINFO;
                break;
        case NETLINK_BROADCAST_ERROR:
                flag = NETLINK_F_BROADCAST_SEND_ERROR;
                break;
        case NETLINK_NO_ENOBUFS:
                flag = NETLINK_F_RECV_NO_ENOBUFS;
                break;
        case NETLINK_LIST_MEMBERSHIPS: {
                int pos, idx, shift, err = 0;

                netlink_lock_table();
                /*
                 * pos is a byte offset; copy the bitmap out one u32 at a
                 * time until all groups are covered or the user buffer
                 * cannot hold another u32.
                 */
                for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) {
                        if (len - pos < sizeof(u32))
                                break;

                        /* Locate the u32 inside the unsigned long array. */
                        idx = pos / sizeof(unsigned long);
                        shift = (pos % sizeof(unsigned long)) * 8;
                        if (put_user((u32)(nlk->groups[idx] >> shift),
                                     (u32 __user *)(optval + pos))) {
                                err = -EFAULT;
                                break;
                        }
                }
                /* Always report the size needed for the complete bitmap. */
                if (put_user(ALIGN(BITS_TO_BYTES(nlk->ngroups), sizeof(u32)), optlen))
                        err = -EFAULT;
                netlink_unlock_table();
                return err;
        }
        case NETLINK_LISTEN_ALL_NSID:
                flag = NETLINK_F_LISTEN_ALL_NSID;
                break;
        case NETLINK_CAP_ACK:
                flag = NETLINK_F_CAP_ACK;
                break;
        case NETLINK_EXT_ACK:
                flag = NETLINK_F_EXT_ACK;
                break;
        case NETLINK_GET_STRICT_CHK:
                flag = NETLINK_F_STRICT_CHK;
                break;
        default:
                return -ENOPROTOOPT;
        }

        /* Boolean options need room for one int. */
        if (len < sizeof(int))
                return -EINVAL;

        len = sizeof(int);
        val = test_bit(flag, &nlk->flags);

        if (put_user(len, optlen) ||
            copy_to_user(optval, &val, len))
                return -EFAULT;

        return 0;
}

static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
        struct nl_pktinfo info;

        info.group = NETLINK_CB(skb).dst_group;
        put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
}

/*
 * Attach a NETLINK_LISTEN_ALL_NSID control message carrying the nsid from
 * the control block of @skb, but only when that nsid has been set.
 */
static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg,
                                         struct sk_buff *skb)
{
        if (NETLINK_CB(skb).nsid_is_set)
                put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID,
                         sizeof(int), &NETLINK_CB(skb).nsid);
}

/*
 * sendmsg() handler for netlink sockets.
 *
 * Resolves the destination (an explicit sockaddr_nl in msg->msg_name, or
 * the dst_portid/dst_group stored on the socket), autobinds the socket if
 * it is not bound yet, copies the payload into a newly allocated skb and
 * passes it to netlink_broadcast() (only when a group is set) and then to
 * netlink_unicast().
 *
 * Returns the result of netlink_unicast() on the normal path, otherwise a
 * negative errno: -EOPNOTSUPP for MSG_OOB, -ENODATA for a zero-length
 * message, -EINVAL for a short or non-AF_NETLINK address, -EPERM when
 * netlink_allowed() refuses the send, -EMSGSIZE when @len exceeds
 * sk_sndbuf - 32, -ENOBUFS when the skb cannot be allocated, -EFAULT when
 * copying from @msg fails, or the error returned by scm_send(),
 * netlink_autobind() or security_netlink_send().
 */
static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
        u32 dst_portid;
        u32 dst_group;
        struct sk_buff *skb;
        int err;
        struct scm_cookie scm;
        u32 netlink_skb_flags = 0;

        if (msg->msg_flags & MSG_OOB)
                return -EOPNOTSUPP;

        if (len == 0) {
                pr_warn_once("Zero length message leads to an empty skb\n");
                return -ENODATA;
        }

        /* From here on every exit must go through 'out' to scm_destroy(). */
        err = scm_send(sock, msg, &scm, true);
        if (err < 0)
                return err;

        if (msg->msg_namelen) {
                /* Explicit destination supplied by the caller. */
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_nl))
                        goto out;
                if (addr->nl_family != AF_NETLINK)
                        goto out;
                dst_portid = addr->nl_pid;
                /* Only the lowest set bit of nl_groups is used (0 if none). */
                dst_group = ffs(addr->nl_groups);
                err =  -EPERM;
                if ((dst_group || dst_portid) &&
                    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
                        goto out;
                netlink_skb_flags |= NETLINK_SKB_DST;
        } else {
                /* Paired with WRITE_ONCE() in netlink_connect() */
                dst_portid = READ_ONCE(nlk->dst_portid);
                dst_group = READ_ONCE(nlk->dst_group);
        }

        /* Paired with WRITE_ONCE() in netlink_insert() */
        if (!READ_ONCE(nlk->bound)) {
                err = netlink_autobind(sock);
                if (err)
                        goto out;
        } else {
                /* Ensure nlk is hashed and visible. */
                smp_rmb();
        }

        err = -EMSGSIZE;
        if (len > sk->sk_sndbuf - 32)
                goto out;
        err = -ENOBUFS;
        skb = netlink_alloc_large_skb(len, dst_group);
        if (skb == NULL)
                goto out;

        /* Record sender identity and routing info in the skb control block. */
        NETLINK_CB(skb).portid        = nlk->portid;
        NETLINK_CB(skb).dst_group = dst_group;
        NETLINK_CB(skb).creds        = scm.creds;
        NETLINK_CB(skb).flags        = netlink_skb_flags;

        err = -EFAULT;
        if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
                kfree_skb(skb);
                goto out;
        }

        err = security_netlink_send(sk, skb);
        if (err) {
                kfree_skb(skb);
                goto out;
        }

        if (dst_group) {
                /*
                 * Take an extra reference so the skb can still be handed to
                 * netlink_unicast() below; presumably netlink_broadcast()
                 * drops one reference itself -- confirm against its
                 * definition. Its return value is deliberately not used.
                 */
                refcount_inc(&skb->users);
                netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
        }
        err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags & MSG_DONTWAIT);

out:
        scm_destroy(&scm);
        return err;
}

/*
 * recvmsg() handler for netlink sockets.
 *
 * Dequeues one datagram, copies up to @len bytes of it into @msg (setting
 * MSG_TRUNC in msg->msg_flags when the datagram is longer), fills in the
 * source address and the optional control messages and, when a dump is
 * still running and the receive queue has drained to at most half of
 * sk_rcvbuf, asks netlink_dump() to produce the next part.
 *
 * Returns the number of bytes copied (the full datagram length instead when
 * the caller passed MSG_TRUNC in @flags), or a negative errno; MSG_OOB
 * yields -EOPNOTSUPP.
 */
static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                           int flags)
{
        struct scm_cookie scm;
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        size_t copied, max_recvmsg_len;
        struct sk_buff *skb, *data_skb;
        int err, ret;

        if (flags & MSG_OOB)
                return -EOPNOTSUPP;

        copied = 0;

        /* On failure skb is NULL and err holds the reason. */
        skb = skb_recv_datagram(sk, flags, &err);
        if (skb == NULL)
                goto out;

        data_skb = skb;

#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
        if (unlikely(skb_shinfo(skb)->frag_list)) {
                /*
                 * If this skb has a frag_list, then here that means that we
                 * will have to use the frag_list skb's data for compat tasks
                 * and the regular skb's data for normal (non-compat) tasks.
                 *
                 * If we need to send the compat skb, assign it to the
                 * 'data_skb' variable so that it will be used below for data
                 * copying. We keep 'skb' for everything else, including
                 * freeing both later.
                 */
                if (flags & MSG_CMSG_COMPAT)
                        data_skb = skb_shinfo(skb)->frag_list;
        }
#endif

        /* Record the max length of recvmsg() calls for future allocations */
        max_recvmsg_len = max(READ_ONCE(nlk->max_recvmsg_len), len);
        max_recvmsg_len = min_t(size_t, max_recvmsg_len,
                                SKB_WITH_OVERHEAD(32768));
        WRITE_ONCE(nlk->max_recvmsg_len, max_recvmsg_len);

        /* Copy at most what the caller's buffer can hold. */
        copied = data_skb->len;
        if (len < copied) {
                msg->msg_flags |= MSG_TRUNC;
                copied = len;
        }

        err = skb_copy_datagram_msg(data_skb, 0, msg, copied);

        if (msg->msg_name) {
                /* Report the sender's portid and the destination group mask. */
                DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
                addr->nl_family = AF_NETLINK;
                addr->nl_pad    = 0;
                addr->nl_pid        = NETLINK_CB(skb).portid;
                addr->nl_groups        = netlink_group_mask(NETLINK_CB(skb).dst_group);
                msg->msg_namelen = sizeof(*addr);
        }

        if (nlk_test_bit(RECV_PKTINFO, sk))
                netlink_cmsg_recv_pktinfo(msg, skb);
        if (nlk_test_bit(LISTEN_ALL_NSID, sk))
                netlink_cmsg_listen_all_nsid(sk, msg, skb);

        /* Copy the credentials out before the skb is released below. */
        memset(&scm, 0, sizeof(scm));
        scm.creds = *NETLINK_CREDS(skb);
        /* With MSG_TRUNC requested, report the real datagram length. */
        if (flags & MSG_TRUNC)
                copied = data_skb->len;

        skb_free_datagram(sk, skb);

        /* Continue a running dump once the receive queue is at most half full. */
        if (READ_ONCE(nlk->cb_running) &&
            atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
                ret = netlink_dump(sk, false);
                if (ret) {
                        /* ret is a negative errno; sk_err stores it positive. */
                        WRITE_ONCE(sk->sk_err, -ret);
                        sk_error_report(sk);
                }
        }

        scm_recv(sock, msg, &scm, flags);
out:
        netlink_rcv_wake(sk);
        /* A non-zero err wins over the byte count. */
        return err ? : copied;
}

/*
 * sk_data_ready callback installed on kernel-side sockets by
 * __netlink_kernel_create(). It must never run: reaching it triggers BUG().
 */
static void netlink_data_ready(struct sock *sk)
{
        BUG();
}

/*
 *        We export these functions to other modules. They provide a
 *        complete set of kernel non-blocking support for message
 *        queueing.
 */

/*
 * Create the kernel-side socket for netlink protocol @unit in @net.
 *
 * @net: network namespace passed to __netlink_create()
 * @unit: protocol number; must be in [0, MAX_LINKS)
 * @module: stored in nl_table[unit].module on first registration
 * @cfg: optional configuration (groups, input, bind, unbind, release,
 *       flags); may be NULL
 *
 * The first socket created for a protocol fills in nl_table[unit]; later
 * ones only increment the 'registered' counter.
 *
 * Returns the new socket, or NULL on any failure.
 */
struct sock *
__netlink_kernel_create(struct net *net, int unit, struct module *module,
                        struct netlink_kernel_cfg *cfg)
{
        struct socket *sock;
        struct sock *sk;
        struct netlink_sock *nlk;
        struct listeners *listeners = NULL;
        unsigned int groups;

        BUG_ON(!nl_table);

        if (unit < 0 || unit >= MAX_LINKS)
                return NULL;

        if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
                return NULL;

        if (__netlink_create(net, sock, unit, 1) < 0)
                goto out_sock_release_nosk;

        sk = sock->sk;

        /* A protocol always gets at least 32 groups. */
        if (!cfg || cfg->groups < 32)
                groups = 32;
        else
                groups = cfg->groups;

        listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
        if (!listeners)
                goto out_sock_release;

        sk->sk_data_ready = netlink_data_ready;
        if (cfg && cfg->input)
                nlk_sk(sk)->netlink_rcv = cfg->input;

        /* Hash the socket under portid 0. */
        if (netlink_insert(sk, 0))
                goto out_sock_release;

        nlk = nlk_sk(sk);
        set_bit(NETLINK_F_KERNEL_SOCKET, &nlk->flags);

        netlink_table_grab();
        if (!nl_table[unit].registered) {
                /* First registration: publish the table entry. */
                nl_table[unit].groups = groups;
                rcu_assign_pointer(nl_table[unit].listeners, listeners);
                nl_table[unit].module = module;
                if (cfg) {
                        nl_table[unit].bind = cfg->bind;
                        nl_table[unit].unbind = cfg->unbind;
                        nl_table[unit].release = cfg->release;
                        nl_table[unit].flags = cfg->flags;
                }
                nl_table[unit].registered = 1;
        } else {
                /* Entry already set up: the new listeners array is unused. */
                kfree(listeners);
                nl_table[unit].registered++;
        }
        netlink_table_ungrab();
        return sk;

out_sock_release:
        /* listeners is NULL here when its allocation failed. */
        kfree(listeners);
        netlink_kernel_release(sk);
        return NULL;

out_sock_release_nosk:
        sock_release(sock);
        return NULL;
}
EXPORT_SYMBOL(__netlink_kernel_create);

/*
 * Release a kernel netlink socket. A NULL @sk, or one with no attached
 * struct socket, is silently ignored.
 */
void
netlink_kernel_release(struct sock *sk)
{
        if (sk && sk->sk_socket)
                sock_release(sk->sk_socket);
}
EXPORT_SYMBOL(netlink_kernel_release);

/*
 * Set the number of multicast groups of the protocol @sk belongs to,
 * growing the listeners bitmap when the new count needs more storage.
 * netlink_change_ngroups() calls this with the netlink table grabbed.
 *
 * Returns 0 on success or -ENOMEM when the larger bitmap cannot be
 * allocated (the table is left untouched in that case).
 */
int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
        struct netlink_table *tbl = &nl_table[sk->sk_protocol];
        struct listeners *grown, *prev;

        /* The group count is never set below 32. */
        if (groups < 32)
                groups = 32;

        /* Existing bitmap is already large enough: only record the count. */
        if (NLGRPSZ(groups) <= NLGRPSZ(tbl->groups)) {
                tbl->groups = groups;
                return 0;
        }

        grown = kzalloc(sizeof(*grown) + NLGRPSZ(groups), GFP_ATOMIC);
        if (!grown)
                return -ENOMEM;

        /* Carry the current masks over, then publish the new array. */
        prev = nl_deref_protected(tbl->listeners);
        memcpy(grown->masks, prev->masks, NLGRPSZ(tbl->groups));
        rcu_assign_pointer(tbl->listeners, grown);

        /* The old array is freed through kfree_rcu(). */
        kfree_rcu(prev, rcu);

        tbl->groups = groups;
        return 0;
}

/**
 * netlink_change_ngroups - change number of multicast groups
 * @sk: The kernel netlink socket, as returned by netlink_kernel_create().
 * @groups: The new number of groups.
 *
 * Changes how many multicast groups are available on a netlink family.
 * The count cannot go below 32. Reducing the count does not implicitly
 * call netlink_clear_multicast_users().
 *
 * Return: 0 on success, or the error from __netlink_change_ngroups().
 */
int netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
        int ret;

        /* The update runs with the netlink table grabbed. */
        netlink_table_grab();
        ret = __netlink_change_ngroups(sk, groups);
        netlink_table_ungrab();

        return ret;
}

void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
{
        struct sock *sk;
        struct netlink_table *tbl = &nl_table[ksk->sk_protocol];

        sk_for_each_bound(sk, &tbl->mc_list)
                netlink_update_socket_mc(nlk_sk(sk), group, 0);
}

/*
 * Append a netlink message header plus @len bytes of (uninitialised)
 * payload room to @skb and fill in the header fields.
 *
 * nlmsg_len is set to the unaligned message size; the alignment padding
 * that follows the payload is zeroed. The caller must have made sure the
 * skb has enough tailroom, since skb_put() is used unconditionally.
 *
 * Returns a pointer to the new header inside @skb.
 */
struct nlmsghdr *
__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags)
{
        int size = nlmsg_msg_size(len);
        struct nlmsghdr *nlh = skb_put(skb, NLMSG_ALIGN(size));

        nlh->nlmsg_len = size;
        nlh->nlmsg_type = type;
        nlh->nlmsg_flags = flags;
        nlh->nlmsg_seq = seq;
        nlh->nlmsg_pid = portid;

        /*
         * Zero the trailing pad bytes. The memset is only skipped when size
         * is a compile-time constant that needs no padding.
         */
        if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0)
                memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size);

        return nlh;
}
EXPORT_SYMBOL(__nlmsg_put);

/*
 * Compute how many bytes of extended-ack attributes would be emitted for
 * @extack on socket @nlk.
 *
 * Returns 0 when there is no extack or the socket has not enabled
 * NETLINK_F_EXT_ACK. The message string and cookie are always counted;
 * bad_attr, policy, miss_type and miss_nest only when @err is non-zero.
 */
static size_t
netlink_ack_tlv_len(struct netlink_sock *nlk, int err,
                    const struct netlink_ext_ack *extack)
{
        size_t total = 0;

        if (!extack || !test_bit(NETLINK_F_EXT_ACK, &nlk->flags))
                return 0;

        if (extack->_msg)
                total += nla_total_size(strlen(extack->_msg) + 1);
        if (extack->cookie_len)
                total += nla_total_size(extack->cookie_len);

        /* Following attributes are only reported as error (not warning) */
        if (err) {
                if (extack->bad_attr)
                        total += nla_total_size(sizeof(u32));
                if (extack->policy)
                        total += netlink_policy_dump_attr_size_estimate(extack->policy);
                if (extack->miss_type)
                        total += nla_total_size(sizeof(u32));
                if (extack->miss_nest)
                        total += nla_total_size(sizeof(u32));
        }

        return total;
}

static void
netlink_ack_tlv_fill(struct sk_buff *in_skb, struct sk_buff *skb,
                     const struct nlmsghdr *nlh, int err,
                     const struct netlink_ext_ack *extack)
{
        if (extack->_msg)
                WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg));
        if (extack->cookie_len)
                WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE,
                                extack->cookie_len, extack->cookie));

        if (!err)
                return;

        if (extack->bad_attr &&
            !WARN_ON((u8 *)extack->bad_attr < in_skb->data ||
                     (u8 *)extack->bad_attr >= in_skb->data + in_skb->len))
                WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS,
                                    (u8 *)extack->bad_attr - (const u8 *)nlh));
        if (extack->policy)
                netlink_policy_dump_write_attr(skb, extack->policy,
                                               NLMSGERR_ATTR_POLICY);
        if (extack->miss_type)
                WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_TYPE,
                                    extack->miss_type));
        if (extack->miss_nest &&
            !WARN_ON((u8 *)extack->miss_nest < in_skb->data ||
                     (u8 *)extack->miss_nest > in_skb->data + in_skb->len))
                WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_NEST,
                                    (u8 *)extack->miss_nest - (const u8 *)nlh));
}

/*
 * It looks a bit ugly.
 * It would be better to create kernel thread.
 */

/*
 * Append the NLMSG_DONE message that terminates a dump to @skb.
 *
 * The payload is nlk->dump_done_errno. When extended-ack attributes apply,
 * NLM_F_ACK_TLVS is set on the message; the attributes themselves are only
 * written (and the message length finalised) if they fit in the remaining
 * tailroom.
 *
 * Returns 0 on success or -ENOBUFS when the message header cannot be added.
 */
static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb,
                             struct netlink_callback *cb,
                             struct netlink_ext_ack *extack)
{
        struct nlmsghdr *done;
        size_t tlv_len;

        done = nlmsg_put_answer(skb, cb, NLMSG_DONE,
                                sizeof(nlk->dump_done_errno),
                                NLM_F_MULTI | cb->answer_flags);
        if (WARN_ON(!done))
                return -ENOBUFS;

        nl_dump_check_consistent(cb, done);
        memcpy(nlmsg_data(done), &nlk->dump_done_errno,
               sizeof(nlk->dump_done_errno));

        tlv_len = netlink_ack_tlv_len(nlk, nlk->dump_done_errno, extack);
        if (!tlv_len)
                return 0;

        /* The flag is set even when the attributes end up not fitting. */
        done->nlmsg_flags |= NLM_F_ACK_TLVS;
        if (skb_tailroom(skb) < tlv_len)
                return 0;

        netlink_ack_tlv_fill(cb->skb, skb, cb->nlh,
                             nlk->dump_done_errno, extack);
        nlmsg_end(skb, done);

        return 0;
}

/*
 * Produce the next part of an in-progress dump and queue it on @sk.
 *
 * @sk: socket the dump belongs to
 * @lock_taken: true when the caller already holds nlk->nl_cb_mutex; the
 *              mutex is released on every return path either way
 *
 * Returns 0 when an skb was produced (queued, or dropped by sk_filter()),
 * -EINVAL when no dump is running, and -ENOBUFS when the receive buffer is
 * already full, the skb allocation fails, or the NLMSG_DONE message cannot
 * be added.
 */
static int netlink_dump(struct sock *sk, bool lock_taken)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        struct netlink_ext_ack extack = {};
        struct netlink_callback *cb;
        struct sk_buff *skb = NULL;
        size_t max_recvmsg_len;
        struct module *module;
        int err = -ENOBUFS;
        int alloc_min_size;
        int alloc_size;

        if (!lock_taken)
                mutex_lock(&nlk->nl_cb_mutex);
        if (!nlk->cb_running) {
                err = -EINVAL;
                goto errout_skb;
        }

        /* Receive buffer already full: fail with the default -ENOBUFS. */
        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
                goto errout_skb;

        /* NLMSG_GOODSIZE is small to avoid high order allocations being
         * required, but it makes sense to _attempt_ a larger allocation of
         * max_recvmsg_len bytes (capped at SKB_WITH_OVERHEAD(32768) in
         * netlink_recvmsg()) to reduce number of system calls on dump
         * operations, if user ever provided a big enough buffer.
         */
        cb = &nlk->cb;
        alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);

        max_recvmsg_len = READ_ONCE(nlk->max_recvmsg_len);
        if (alloc_min_size < max_recvmsg_len) {
                /* Opportunistic large allocation: no direct reclaim, no
                 * warning, no retry; falls back to the minimum size below.
                 */
                alloc_size = max_recvmsg_len;
                skb = alloc_skb(alloc_size,
                                (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) |
                                __GFP_NOWARN | __GFP_NORETRY);
        }
        if (!skb) {
                alloc_size = alloc_min_size;
                skb = alloc_skb(alloc_size, GFP_KERNEL);
        }
        if (!skb)
                goto errout_skb;

        /* Trim skb to allocated size. User is expected to provide buffer as
         * large as max(min_dump_alloc, max_recvmsg_len (capped at
         * SKB_WITH_OVERHEAD(32768) in netlink_recvmsg())). dump will pack
         * as many smaller messages as could fit within the allocated skb.
         * skb is typically allocated with larger space than required (could
         * be as much as near 2x the requested size with align to next power
         * of 2 approach). Allowing dump to use the excess space makes it
         * difficult for a user to have a reasonable static buffer based on
         * the expected largest dump of a single netdev. The outcome is
         * MSG_TRUNC error.
         */
        skb_reserve(skb, skb_tailroom(skb) - alloc_size);

        /* Make sure malicious BPF programs can not read uninitialized memory
         * from skb->head -> skb->data
         */
        skb_reset_network_header(skb);
        skb_reset_mac_header(skb);

        netlink_skb_set_owner_r(skb, sk);

        /* A positive dump_done_errno means the callback has more to dump. */
        if (nlk->dump_done_errno > 0) {
                cb->extack = &extack;

                nlk->dump_done_errno = cb->dump(skb, cb);

                /* EMSGSIZE plus something already in the skb means
                 * that there's more to dump but current skb has filled up.
                 * If the callback really wants to return EMSGSIZE to user space
                 * it needs to do so again, on the next cb->dump() call,
                 * without putting data in the skb.
                 */
                if (nlk->dump_done_errno == -EMSGSIZE && skb->len)
                        nlk->dump_done_errno = skb->len;

                cb->extack = NULL;
        }

        /* More data pending, or no room left for NLMSG_DONE in this skb:
         * deliver what we have and leave the dump running.
         */
        if (nlk->dump_done_errno > 0 ||
            skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) {
                mutex_unlock(&nlk->nl_cb_mutex);

                if (sk_filter(sk, skb))
                        kfree_skb(skb);
                else
                        __netlink_sendskb(sk, skb);
                return 0;
        }

        if (netlink_dump_done(nlk, skb, cb, &extack))
                goto errout_skb;

#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
        /* frag_list skb's data is used for compat tasks
         * and the regular skb's data for normal (non-compat) tasks.
         * See netlink_recvmsg().
         */
        if (unlikely(skb_shinfo(skb)->frag_list)) {
                if (netlink_dump_done(nlk, skb_shinfo(skb)->frag_list, cb, &extack))
                        goto errout_skb;
        }
#endif

        if (sk_filter(sk, skb))
                kfree_skb(skb);
        else
                __netlink_sendskb(sk, skb);

        if (cb->done)
                cb->done(cb);

        /* Dump finished: clear the running flag and pick up the module and
         * request skb references while still under the mutex, then drop
         * them after unlocking.
         */
        WRITE_ONCE(nlk->cb_running, false);
        module = cb->module;
        skb = cb->skb;
        mutex_unlock(&nlk->nl_cb_mutex);
        module_put(module);
        consume_skb(skb);
        return 0;

errout_skb:
        mutex_unlock(&nlk->nl_cb_mutex);
        /* skb may still be NULL here. */
        kfree_skb(skb);
        return err;
}

/*
 * Start a dump on the socket that sent the request in @skb.
 *
 * @ssk: kernel socket that received the request; supplies the netns and
 *       protocol used to look up the requesting socket
 * @skb: request skb; an extra reference is taken and stored in the callback
 * @nlh: request header, stored in the callback
 * @control: dump/start/done callbacks plus data, module, flags and
 *           min_dump_alloc for the dump
 *
 * Returns -EINTR when the dump was started (this tells the caller not to
 * send an ACK even if one was requested). Otherwise a negative errno:
 * -ECONNREFUSED when the requesting socket cannot be found, -EBUSY when a
 * dump is already running on it, -EPROTONOSUPPORT when the module
 * reference cannot be taken, the error from control->start(), or the error
 * from the first netlink_dump() call.
 */
int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
                         const struct nlmsghdr *nlh,
                         struct netlink_dump_control *control)
{
        struct netlink_callback *cb;
        struct netlink_sock *nlk;
        struct sock *sk;
        int ret;

        /* The callback keeps its own reference on the request skb. */
        refcount_inc(&skb->users);

        sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
        if (sk == NULL) {
                ret = -ECONNREFUSED;
                goto error_free;
        }

        nlk = nlk_sk(sk);
        mutex_lock(&nlk->nl_cb_mutex);
        /* A dump is in progress... */
        if (nlk->cb_running) {
                ret = -EBUSY;
                goto error_unlock;
        }
        /* add reference of module which cb->dump belongs to */
        if (!try_module_get(control->module)) {
                ret = -EPROTONOSUPPORT;
                goto error_unlock;
        }

        cb = &nlk->cb;
        memset(cb, 0, sizeof(*cb));
        cb->dump = control->dump;
        cb->done = control->done;
        cb->nlh = nlh;
        cb->data = control->data;
        cb->module = control->module;
        cb->min_dump_alloc = control->min_dump_alloc;
        cb->flags = control->flags;
        cb->skb = skb;

        cb->strict_check = nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk);

        if (control->start) {
                /* extack is only exposed to the callback during start(). */
                cb->extack = control->extack;
                ret = control->start(cb);
                cb->extack = NULL;
                if (ret)
                        goto error_put;
        }

        WRITE_ONCE(nlk->cb_running, true);
        /* Positive value: tells netlink_dump() there is data to dump. */
        nlk->dump_done_errno = INT_MAX;

        /* Called with the mutex held; netlink_dump() releases it. */
        ret = netlink_dump(sk, true);

        sock_put(sk);

        if (ret)
                return ret;

        /* We successfully started a dump, by returning -EINTR we
         * signal not to send ACK even if it was requested.
         */
        return -EINTR;

error_put:
        module_put(control->module);
error_unlock:
        /*
         * Unlock before dropping the lookup reference: nl_cb_mutex is part
         * of the socket, so it must not be touched once our reference on
         * the socket is gone.
         */
        mutex_unlock(&nlk->nl_cb_mutex);
        sock_put(sk);
error_free:
        kfree_skb(skb);
        return ret;
}
EXPORT_SYMBOL(__netlink_dump_start);

/*
 * Send an NLMSG_ERROR reply (an ACK when @err is 0) for request @nlh back
 * to the sender of @in_skb.
 *
 * @in_skb: skb containing the request
 * @nlh: header of the request being answered
 * @err: error code placed in the reply; 0 for a plain ACK
 * @extack: optional extended ack data
 *
 * If the reply cannot be built, ENOBUFS is recorded in sk_err of the
 * requesting socket and reported via sk_error_report().
 */
void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
                 const struct netlink_ext_ack *extack)
{
        struct sk_buff *skb;
        struct nlmsghdr *rep;
        struct nlmsgerr *errmsg;
        size_t payload = sizeof(*errmsg);
        struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk);
        unsigned int flags = 0;
        size_t tlvlen;

        /* Error messages get the original request appended, unless the user
         * requests to cap the error message, and get extra error data if
         * requested.
         */
        if (err && !test_bit(NETLINK_F_CAP_ACK, &nlk->flags))
                payload += nlmsg_len(nlh);
        else
                flags |= NLM_F_CAPPED;

        tlvlen = netlink_ack_tlv_len(nlk, err, extack);
        if (tlvlen)
                flags |= NLM_F_ACK_TLVS;

        skb = nlmsg_new(payload + tlvlen, GFP_KERNEL);
        if (!skb)
                goto err_skb;

        rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
                        NLMSG_ERROR, sizeof(*errmsg), flags);
        if (!rep)
                goto err_bad_put;
        errmsg = nlmsg_data(rep);
        errmsg->error = err;
        /* The request header is always echoed back. */
        errmsg->msg = *nlh;

        if (!(flags & NLM_F_CAPPED)) {
                /* Uncapped: also echo the request payload after the header. */
                if (!nlmsg_append(skb, nlmsg_len(nlh)))
                        goto err_bad_put;

                memcpy(nlmsg_data(&errmsg->msg), nlmsg_data(nlh),
                       nlmsg_len(nlh));
        }

        if (tlvlen)
                netlink_ack_tlv_fill(in_skb, skb, nlh, err, extack);

        nlmsg_end(skb, rep);

        /* The unicast result is not checked. */
        nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid);

        return;

err_bad_put:
        nlmsg_free(skb);
err_skb:
        WRITE_ONCE(NETLINK_CB(in_skb).sk->sk_err, ENOBUFS);
        sk_error_report(NETLINK_CB(in_skb).sk);
}
EXPORT_SYMBOL(netlink_ack);

/*
 * Walk all netlink messages contained in @skb and feed each request to @cb.
 *
 * Messages without NLM_F_REQUEST and control messages (type below
 * NLMSG_MIN_TYPE) are not passed to @cb. An ACK/error reply is sent when
 * the message carries NLM_F_ACK or @cb returned a non-zero error, except
 * for -EINTR, which suppresses the reply. Processing stops silently at the
 * first message whose nlmsg_len is shorter than a header or longer than
 * the remaining data.
 *
 * Each processed message is pulled from @skb. Always returns 0.
 */
int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
                                                   struct nlmsghdr *,
                                                   struct netlink_ext_ack *))
{
        struct netlink_ext_ack extack;
        struct nlmsghdr *nlh;
        int err;

        while (skb->len >= nlmsg_total_size(0)) {
                int msglen;

                /* Fresh extack state for every message. */
                memset(&extack, 0, sizeof(extack));
                nlh = nlmsg_hdr(skb);
                err = 0;

                if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
                        return 0;

                /* Only requests are handled by the kernel */
                if (!(nlh->nlmsg_flags & NLM_F_REQUEST))
                        goto ack;

                /* Skip control messages */
                if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
                        goto ack;

                err = cb(skb, nlh, &extack);
                /* -EINTR from the callback means: do not reply at all. */
                if (err == -EINTR)
                        goto skip;

ack:
                if (nlh->nlmsg_flags & NLM_F_ACK || err)
                        netlink_ack(skb, nlh, err, &extack);

skip:
                /* Advance by the aligned length, clamped to what is left. */
                msglen = NLMSG_ALIGN(nlh->nlmsg_len);
                if (msglen > skb->len)
                        msglen = skb->len;
                skb_pull(skb, msglen);
        }

        return 0;
}
EXPORT_SYMBOL(netlink_rcv_skb);

/**
 * nlmsg_notify - send a notification netlink message
 * @sk: netlink socket to use
 * @skb: notification message
 * @portid: destination netlink portid for reports or 0
 * @group: destination multicast group or 0
 * @report: 1 to report back, 0 to disable
 * @flags: allocation flags
 *
 * Multicasts @skb to @group when @group is non-zero, and additionally
 * unicasts it to @portid when @report is set (in which case @portid is
 * excluded from the multicast).
 *
 * Return: the multicast error if there was one (-ESRCH is treated as
 * success), otherwise the unicast result when @report is set, otherwise 0.
 */
int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
                 unsigned int group, int report, gfp_t flags)
{
        int mc_err = 0;
        int uc_err;

        if (group) {
                int skip_portid = 0;

                if (report) {
                        /* Extra reference for the unicast that follows. */
                        refcount_inc(&skb->users);
                        skip_portid = portid;
                }

                /* errors reported via destination sk->sk_err, but propagate
                 * delivery errors if NETLINK_BROADCAST_ERROR flag is set */
                mc_err = nlmsg_multicast(sk, skb, skip_portid, group, flags);
                if (mc_err == -ESRCH)
                        mc_err = 0;
        }

        if (!report)
                return mc_err;

        /* The unicast is attempted even when the multicast failed. */
        uc_err = nlmsg_unicast(sk, skb, portid);

        return mc_err ? mc_err : uc_err;
}
EXPORT_SYMBOL(nlmsg_notify);

#ifdef CONFIG_PROC_FS
/* Iterator state kept in seq->private while walking the netlink sockets. */
struct nl_seq_iter {
        struct seq_net_private p;
        struct rhashtable_iter hti;        /* walker over nl_table[link].hash */
        int link;        /* index into nl_table currently being walked */
};

/* Begin walking the socket hash table of protocol iter->link. */
static void netlink_walk_start(struct nl_seq_iter *iter)
{
        struct rhashtable_iter *walker = &iter->hti;

        rhashtable_walk_enter(&nl_table[iter->link].hash, walker);
        rhashtable_walk_start(walker);
}

/* End the hash table walk started by netlink_walk_start(). */
static void netlink_walk_stop(struct nl_seq_iter *iter)
{
        struct rhashtable_iter *walker = &iter->hti;

        rhashtable_walk_stop(walker);
        rhashtable_walk_exit(walker);
}

/*
 * Advance the iterator to the next netlink socket that belongs to the
 * seq_file's network namespace.
 *
 * When one protocol's hash table is exhausted the walk moves on to the next
 * nl_table entry. Returns the socket, NULL once every protocol has been
 * walked (the walk is already stopped in that case), or an ERR_PTR from
 * rhashtable_walk_next() other than -EAGAIN, which is retried.
 */
static void *__netlink_seq_next(struct seq_file *seq)
{
        struct nl_seq_iter *iter = seq->private;
        struct netlink_sock *nlk;

        for (;;) {
                nlk = rhashtable_walk_next(&iter->hti);

                if (IS_ERR(nlk)) {
                        if (PTR_ERR(nlk) == -EAGAIN)
                                continue;

                        return nlk;
                }

                if (!nlk) {
                        /* This table is done; switch to the next protocol. */
                        netlink_walk_stop(iter);
                        if (++iter->link >= MAX_LINKS)
                                return NULL;

                        netlink_walk_start(iter);
                        continue;
                }

                /* Skip sockets that live in a different namespace. */
                if (sock_net(&nlk->sk) == seq_file_net(seq))
                        return nlk;
        }
}

/*
 * seq_file ->start: restart the walk from the first protocol and skip
 * forward *posp entries. Position 0 yields SEQ_START_TOKEN. Skipping stops
 * early when the walk ends (NULL) or fails (ERR_PTR), and that value is
 * returned.
 */
static void *netlink_seq_start(struct seq_file *seq, loff_t *posp)
        __acquires(RCU)
{
        struct nl_seq_iter *iter = seq->private;
        loff_t remaining = *posp;
        void *cur = SEQ_START_TOKEN;

        iter->link = 0;
        netlink_walk_start(iter);

        while (remaining && cur && !IS_ERR(cur)) {
                cur = __netlink_seq_next(seq);
                remaining--;
        }

        return cur;
}

/* seq_file ->next: bump the position and return the following socket. */
static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        *pos += 1;

        return __netlink_seq_next(seq);
}

/*
 * Stop the hash table walk, unless the iterator already ran past the last
 * protocol (in which case __netlink_seq_next() has stopped it already).
 */
static void netlink_native_seq_stop(struct seq_file *seq, void *v)
{
        struct nl_seq_iter *iter = seq->private;

        if (iter->link < MAX_LINKS)
                netlink_walk_stop(iter);
}


/*
 * Print one line of the netlink socket listing: the column header for
 * SEQ_START_TOKEN, otherwise the fields of the socket passed in @v.
 * Always returns 0.
 */
static int netlink_native_seq_show(struct seq_file *seq, void *v)
{
        struct netlink_sock *nlk;
        struct sock *s;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "sk               Eth Pid        Groups   "
                         "Rmem     Wmem     Dump  Locks    Drops    Inode\n");
                return 0;
        }

        s = v;
        nlk = nlk_sk(s);

        /* Only the first word of the groups bitmap is shown (0 if unset). */
        seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8u %-8lu\n",
                   s,
                   s->sk_protocol,
                   nlk->portid,
                   nlk->groups ? (u32)nlk->groups[0] : 0,
                   sk_rmem_alloc_get(s),
                   sk_wmem_alloc_get(s),
                   READ_ONCE(nlk->cb_running),
                   refcount_read(&s->sk_refcnt),
                   atomic_read(&s->sk_drops),
                   sock_i_ino(s));

        return 0;
}

#ifdef CONFIG_BPF_SYSCALL
/* Context handed to a BPF iterator program for each netlink socket. */
struct bpf_iter__netlink {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);        /* iterator metadata */
        __bpf_md_ptr(struct netlink_sock *, sk);        /* socket being visited */
};

/* Declares the "netlink" BPF iterator with the same (meta, sk) arguments. */
DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk)

/*
 * Run BPF iterator program @prog for the socket in @v and return its
 * result. The sequence number in @meta is decremented first so that the
 * SEQ_START_TOKEN entry is not counted.
 */
static int netlink_prog_seq_show(struct bpf_prog *prog,
                                  struct bpf_iter_meta *meta,
                                  void *v)
{
        struct bpf_iter__netlink ctx;

        ctx.sk = nlk_sk((struct sock *)v);
        ctx.meta = meta;
        meta->seq_num--;  /* skip SEQ_START_TOKEN */

        return bpf_iter_run_prog(prog, &ctx);
}

/*
 * seq_file ->show: when a BPF iterator program is attached, run it for
 * every real entry (the start token produces no output); otherwise fall
 * back to the native text output.
 */
static int netlink_seq_show(struct seq_file *seq, void *v)
{
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;

        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, false);
        if (prog)
                return v == SEQ_START_TOKEN ?
                        0 : netlink_prog_seq_show(prog, &meta, v);

        return netlink_native_seq_show(seq, v);
}

/*
 * seq_file ->stop: when called with a NULL @v and a BPF iterator program is
 * attached, give the program one final invocation (its result is ignored),
 * then stop the native walk.
 */
static void netlink_seq_stop(struct seq_file *seq, void *v)
{
        if (!v) {
                struct bpf_iter_meta meta;
                struct bpf_prog *prog;

                meta.seq = seq;
                prog = bpf_iter_get_info(&meta, true);
                if (prog)
                        (void)netlink_prog_seq_show(prog, &meta, v);
        }

        netlink_native_seq_stop(seq, v);
}
#else
/* Without CONFIG_BPF_SYSCALL, ->show is just the native text output. */
static int netlink_seq_show(struct seq_file *seq, void *v)
{
        return netlink_native_seq_show(seq, v);
}

/* Without CONFIG_BPF_SYSCALL, ->stop only ends the native walk. */
static void netlink_seq_stop(struct seq_file *seq, void *v)
{
        netlink_native_seq_stop(seq, v);
}
#endif

/* seq_file callbacks used for the "netlink" proc entry and the BPF iterator. */
static const struct seq_operations netlink_seq_ops = {
        .start  = netlink_seq_start,
        .next   = netlink_seq_next,
        .stop   = netlink_seq_stop,
        .show   = netlink_seq_show,
};
#endif

/*
 * Add @nb to the blocking notifier chain netlink_chain.
 * Returns the result of blocking_notifier_chain_register().
 */
int netlink_register_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_register_notifier);

/*
 * Remove @nb from the blocking notifier chain netlink_chain.
 * Returns the result of blocking_notifier_chain_unregister().
 */
int netlink_unregister_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_unregister_notifier);

/*
 * Socket-layer operations for PF_NETLINK sockets. socketpair, accept,
 * listen, shutdown and mmap are wired to the generic sock_no_*() handlers.
 */
static const struct proto_ops netlink_ops = {
        .family =        PF_NETLINK,
        .owner =        THIS_MODULE,
        .release =        netlink_release,
        .bind =                netlink_bind,
        .connect =        netlink_connect,
        .socketpair =        sock_no_socketpair,
        .accept =        sock_no_accept,
        .getname =        netlink_getname,
        .poll =                datagram_poll,
        .ioctl =        netlink_ioctl,
        .listen =        sock_no_listen,
        .shutdown =        sock_no_shutdown,
        .setsockopt =        netlink_setsockopt,
        .getsockopt =        netlink_getsockopt,
        .sendmsg =        netlink_sendmsg,
        .recvmsg =        netlink_recvmsg,
        .mmap =                sock_no_mmap,
};

/* Protocol family descriptor: PF_NETLINK sockets are created by netlink_create(). */
static const struct net_proto_family netlink_family_ops = {
        .family = PF_NETLINK,
        .create = netlink_create,
        .owner        = THIS_MODULE,        /* for consistency 8) */
};

/*
 * Per-namespace init: with CONFIG_PROC_FS, create the "netlink" entry under
 * net->proc_net backed by netlink_seq_ops.
 * Returns 0 on success or -ENOMEM when the proc entry cannot be created.
 */
static int __net_init netlink_net_init(struct net *net)
{
#ifdef CONFIG_PROC_FS
        if (!proc_create_net("netlink", 0, net->proc_net, &netlink_seq_ops,
                        sizeof(struct nl_seq_iter)))
                return -ENOMEM;
#endif
        return 0;
}

/* Per-namespace exit: remove the "netlink" proc entry created at init. */
static void __net_exit netlink_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
        remove_proc_entry("netlink", net->proc_net);
#endif
}

/* Pre-populate the nl_table slot for NETLINK_USERSOCK at boot.
 *
 * Allocates a zeroed listeners structure sized for 32 groups and panics if
 * the allocation fails. The table entry is then filled in between
 * netlink_table_grab() and netlink_table_ungrab(): group count, listeners
 * pointer (published with rcu_assign_pointer()), owning module, the
 * "registered" marker and the NL_CFG_F_NONROOT_SEND flag.
 */
static void __init netlink_add_usersock_entry(void)
{
        struct listeners *listeners;
        int groups = 32;

        listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
        if (!listeners)
                panic("netlink_add_usersock_entry: Cannot allocate listeners\n");

        netlink_table_grab();

        nl_table[NETLINK_USERSOCK].groups = groups;
        rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners);
        nl_table[NETLINK_USERSOCK].module = THIS_MODULE;
        nl_table[NETLINK_USERSOCK].registered = 1;
        nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND;

        netlink_table_ungrab();
}

/* Per-network-namespace init/exit hooks; registered with
 * register_pernet_subsys() in netlink_proto_init().
 */
static struct pernet_operations __net_initdata netlink_net_ops = {
        .init = netlink_net_init,
        .exit = netlink_net_exit,
};

/* Object hash function for the netlink rhashtable (see
 * netlink_rhashtable_params).
 *
 * @data: the struct netlink_sock being hashed
 * @len:  unused here
 * @seed: hash seed, passed through to jhash2()
 *
 * A compare-argument key is built from the socket's network namespace and
 * port id, and the first netlink_compare_arg_len bytes of it are hashed as
 * an array of u32 words.
 */
static inline u32 netlink_hash(const void *data, u32 len, u32 seed)
{
        const struct netlink_sock *sock = data;
        struct netlink_compare_arg key;
        u32 nwords = netlink_compare_arg_len / sizeof(u32);

        netlink_compare_arg_init(&key, sock_net(&sock->sk), sock->portid);

        return jhash2((u32 *)&key, nwords, seed);
}

/* rhashtable configuration shared by every nl_table[i].hash: objects are
 * struct netlink_sock linked through their ->node member, hashed with
 * netlink_hash() and compared with netlink_compare(); automatic shrinking
 * is enabled.
 */
static const struct rhashtable_params netlink_rhashtable_params = {
        .head_offset = offsetof(struct netlink_sock, node),
        .key_len = netlink_compare_arg_len,
        .obj_hashfn = netlink_hash,
        .obj_cmpfn = netlink_compare,
        .automatic_shrinking = true,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* BTF id list with a single entry for struct netlink_sock; its first
 * element is read by bpf_iter_register().
 */
BTF_ID_LIST(btf_netlink_sock_id)
BTF_ID(struct, netlink_sock)

/* seq_file description for the "netlink" BPF iterator target: reuses
 * netlink_seq_ops, with struct nl_seq_iter as private state set up and torn
 * down by bpf_iter_init_seq_net() / bpf_iter_fini_seq_net().
 */
static const struct bpf_iter_seq_info netlink_seq_info = {
        .seq_ops                = &netlink_seq_ops,
        .init_seq_private        = bpf_iter_init_seq_net,
        .fini_seq_private        = bpf_iter_fini_seq_net,
        .seq_priv_size                = sizeof(struct nl_seq_iter),
};

/* Registration record for the "netlink" BPF iterator target. It declares
 * one context argument, the ->sk member of struct bpf_iter__netlink, typed
 * PTR_TO_BTF_ID_OR_NULL. The argument's btf_id is not set here; it is
 * filled in by bpf_iter_register(), which is why this object is not const.
 */
static struct bpf_iter_reg netlink_reg_info = {
        .target                        = "netlink",
        .ctx_arg_info_size        = 1,
        .ctx_arg_info                = {
                { offsetof(struct bpf_iter__netlink, sk),
                  PTR_TO_BTF_ID_OR_NULL },
        },
        .seq_info                = &netlink_seq_info,
};

/* Fill in the BTF id of the iterator's context argument from
 * btf_netlink_sock_id, then register the "netlink" iterator target.
 *
 * Returns the result of bpf_iter_reg_target().
 */
static int __init bpf_iter_register(void)
{
        netlink_reg_info.ctx_arg_info[0].btf_id = *btf_netlink_sock_id;
        return bpf_iter_reg_target(&netlink_reg_info);
}
#endif

/* Boot-time initialisation of the netlink subsystem (run as a
 * core_initcall).
 *
 * Steps, in order:
 *  - register netlink_proto;
 *  - with CONFIG_BPF_SYSCALL and CONFIG_PROC_FS, register the "netlink"
 *    BPF iterator target;
 *  - allocate nl_table with MAX_LINKS entries and initialise one
 *    rhashtable per entry;
 *  - set up the NETLINK_USERSOCK entry;
 *  - register the socket family and the per-netns operations;
 *  - call rtnetlink_init().
 *
 * Returns 0 on success or the error from proto_register() /
 * bpf_iter_register(). Failure to allocate nl_table or to initialise any
 * of its hash tables ends in panic().
 */
static int __init netlink_proto_init(void)
{
        int i;
        int err = proto_register(&netlink_proto, 0);

        if (err != 0)
                goto out;

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
        err = bpf_iter_register();
        if (err)
                goto out;
#endif

        /* netlink keeps its per-skb state in skb->cb, so it has to fit */
        BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb));

        nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
        if (!nl_table)
                goto panic;

        for (i = 0; i < MAX_LINKS; i++) {
                if (rhashtable_init(&nl_table[i].hash,
                                    &netlink_rhashtable_params) < 0) {
                        /* Unwind every table initialised so far. Entry i
                         * itself failed, so start at i - 1 and go all the
                         * way down to (and including) entry 0.
                         */
                        while (--i >= 0)
                                rhashtable_destroy(&nl_table[i].hash);
                        kfree(nl_table);
                        goto panic;
                }
        }

        netlink_add_usersock_entry();

        sock_register(&netlink_family_ops);
        register_pernet_subsys(&netlink_net_ops);
        register_pernet_subsys(&netlink_tap_net_ops);
        /* The netlink device handler may be needed early. */
        rtnetlink_init();
out:
        return err;
panic:
        panic("netlink_init: Cannot allocate nl_table\n");
}

core_initcall(netlink_proto_init);

































    1 




















































































































    1 






    1 





































































    1 




    1 




































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
 */
#include <uapi/linux/btf.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/math64.h>
#include <linux/string.h>

/* Shorthand used throughout this file: verbose(env, fmt, ...) expands to
 * bpf_verifier_log_write(env, fmt, ...).
 */
#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)

/* Check that the user-supplied log attributes are mutually consistent.
 *
 * Returns true only if all of the following hold:
 *  - ubuf and len_total are either both set or both unset;
 *  - a buffer is not supplied together with level 0;
 *  - level has no bits outside BPF_LOG_MASK;
 *  - len_total does not exceed UINT_MAX >> 2.
 */
static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
{
        const bool has_buf = !!log->ubuf;
        const bool has_len = !!log->len_total;

        /* buffer pointer and buffer size only make sense as a pair */
        if (has_buf != has_len)
                return false;

        /* a buffer nobody is going to write into is meaningless */
        if (has_buf && log->level == 0)
                return false;

        /* reject unknown level bits */
        if (log->level & ~BPF_LOG_MASK)
                return false;

        /* cap the buffer size */
        return !(log->len_total > UINT_MAX >> 2);
}

/* Record the user-supplied log settings in @log and validate them.
 *
 * @log:       log state to initialise
 * @log_level: requested verbosity bits
 * @log_buf:   user-space buffer receiving the log (may be NULL)
 * @log_size:  size of @log_buf in bytes
 *
 * The three fields are stored unconditionally, even when validation fails.
 *
 * Returns 0 if the attributes pass bpf_verifier_log_attr_valid(),
 * -EINVAL otherwise.
 */
int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
                  char __user *log_buf, u32 log_size)
{
        log->level = log_level;
        log->ubuf = log_buf;
        log->len_total = log_size;

        /* log attributes have to be sane */
        return bpf_verifier_log_attr_valid(log) ? 0 : -EINVAL;
}

/* Track the high-water mark of the log length.
 *
 * @add_len is the size of the message about to be appended, including its
 * terminating '\0' (so no extra +1 is needed). log->len_max only ever
 * grows: the log position can move backwards through bpf_vlog_reset(), but
 * the largest length ever reached is what gets remembered. The value
 * saturates at UINT_MAX.
 */
static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len)
{
        u64 total = log->end_pos + add_len;

        if (total > UINT_MAX) {
                log->len_max = UINT_MAX;
                return;
        }

        if (total > log->len_max)
                log->len_max = total;
}

/* Format one message and append it to the verifier log.
 *
 * The message is first rendered into log->kbuf (bounded by
 * BPF_VERIFIER_TMP_LOG_SIZE). What happens next depends on log->level:
 *  - BPF_LOG_KERNEL: the text is printed with pr_err() (a newline is added
 *    if the message does not end in one) and nothing else is done;
 *  - BPF_LOG_FIXED set: the text is copied to ubuf at offset end_pos, cut
 *    short so it never runs past len_total; once end_pos has reached
 *    len_total nothing more is copied, but end_pos keeps advancing;
 *  - otherwise: ubuf is used as a ring of len_total bytes, so newer output
 *    overwrites the oldest, and start_pos/end_pos track the retained
 *    window.
 *
 * In the two user-buffer modes len_max is updated first, and end_pos never
 * counts the terminating '\0'. If a copy to user space fails, log->ubuf is
 * set to NULL; later calls then keep the positions up to date but no longer
 * copy anything.
 */
void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
                       va_list args)
{
        u64 cur_pos;
        u32 new_n, n;

        n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);

        if (log->level == BPF_LOG_KERNEL) {
                bool newline = n > 0 && log->kbuf[n - 1] == '\n';

                pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
                return;
        }

        n += 1; /* include terminating zero */
        bpf_vlog_update_len_max(log, n);

        if (log->level & BPF_LOG_FIXED) {
                /* check if we have at least something to put into user buf */
                new_n = 0;
                if (log->end_pos < log->len_total) {
                        /* truncate to the remaining room and re-terminate */
                        new_n = min_t(u32, log->len_total - log->end_pos, n);
                        log->kbuf[new_n - 1] = '\0';
                }

                cur_pos = log->end_pos;
                log->end_pos += n - 1; /* don't count terminating '\0' */

                if (log->ubuf && new_n &&
                    copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n))
                        goto fail;
        } else {
                u64 new_end, new_start;
                /* note: this new_n shadows the function-scope new_n */
                u32 buf_start, buf_end, new_n;

                /* slide the window start forward if the new message makes
                 * the retained range reach or exceed len_total
                 */
                new_end = log->end_pos + n;
                if (new_end - log->start_pos >= log->len_total)
                        new_start = new_end - log->len_total;
                else
                        new_start = log->start_pos;

                log->start_pos = new_start;
                log->end_pos = new_end - 1; /* don't count terminating '\0' */

                if (!log->ubuf)
                        return;

                /* only the last len_total bytes of the message can survive;
                 * translate their absolute positions into ubuf offsets
                 */
                new_n = min(n, log->len_total);
                cur_pos = new_end - new_n;
                div_u64_rem(cur_pos, log->len_total, &buf_start);
                div_u64_rem(new_end, log->len_total, &buf_end);
                /* new_end and buf_end are exclusive indices, so if buf_end is
                 * exactly zero, then it actually points right to the end of
                 * ubuf and there is no wrap around
                 */
                if (buf_end == 0)
                        buf_end = log->len_total;

                /* if buf_start > buf_end, we wrapped around;
                 * if buf_start == buf_end, then we fill ubuf completely; we
                 * can't have buf_start == buf_end to mean that there is
                 * nothing to write, because we always write at least
                 * something, even if terminal '\0'
                 */
                if (buf_start < buf_end) {
                        /* message fits within contiguous chunk of ubuf */
                        if (copy_to_user(log->ubuf + buf_start,
                                         log->kbuf + n - new_n,
                                         buf_end - buf_start))
                                goto fail;
                } else {
                        /* message wraps around the end of ubuf, copy in two chunks */
                        if (copy_to_user(log->ubuf + buf_start,
                                         log->kbuf + n - new_n,
                                         log->len_total - buf_start))
                                goto fail;
                        if (copy_to_user(log->ubuf,
                                         log->kbuf + n - buf_end,
                                         buf_end))
                                goto fail;
                }
        }

        return;
fail:
        /* stop touching user memory after a fault */
        log->ubuf = NULL;
}

/* Rewind the log so that its end is at @new_pos.
 *
 * @new_pos must not be past the current end (that triggers a one-time
 * warning and the call is ignored). Nothing happens either when logging is
 * not needed or when the log goes to the kernel log (BPF_LOG_KERNEL).
 *
 * If a user buffer is attached, a '\0' is written at the offset computed
 * below, provided that offset lies inside the buffer; a fault while
 * writing it detaches the user buffer.
 */
void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos)
{
        char zero = 0;
        u32 zero_at;

        if (WARN_ON_ONCE(new_pos > log->end_pos))
                return;

        if (!bpf_verifier_log_needed(log))
                return;
        if (log->level == BPF_LOG_KERNEL)
                return;

        /* Rewinding to before the start of the retained window means none
         * of the retained content is still valid, so pull start_pos back
         * as well and end up with an empty log (start_pos == end_pos).
         */
        log->end_pos = new_pos;
        if (log->start_pos > log->end_pos)
                log->start_pos = log->end_pos;

        if (!log->ubuf)
                return;

        if (log->level & BPF_LOG_FIXED) {
                /* fixed mode: offset is one past the new end position */
                zero_at = log->end_pos + 1;
        } else {
                /* rolling mode: map the absolute position into the ring */
                div_u64_rem(new_pos, log->len_total, &zero_at);
        }

        if (zero_at >= log->len_total)
                return;

        if (put_user(zero, log->ubuf + zero_at))
                log->ubuf = NULL;
}

/* Reverse the first @len bytes of @buf in place. */
static void bpf_vlog_reverse_kbuf(char *buf, int len)
{
        int lo = 0;
        int hi = len - 1;

        /* walk inwards from both ends, exchanging bytes pairwise */
        while (lo < hi) {
                swap(buf[lo], buf[hi]);
                lo++;
                hi--;
        }
}

/* Reverse, in place, the bytes of the user buffer in [start, end).
 *
 * log->kbuf is used as scratch space, split into two halves so that one
 * chunk from each end of the range can be held at once.
 *
 * Returns 0 on success or -EFAULT if any user-space copy fails; on failure
 * the range may be left partially reversed.
 */
static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end)
{
        /* we split log->kbuf into two equal parts for both ends of array */
        int n = sizeof(log->kbuf) / 2, nn;
        char *lbuf = log->kbuf, *rbuf = log->kbuf + n;

        /* Read ubuf's section [start, end) two chunks at a time, from left
         * and right side; within each chunk, swap all the bytes; after that
         * reverse the order of lbuf and rbuf and write result back to ubuf.
         * This way we'll end up with swapped contents of specified
         * [start, end) ubuf segment.
         */
        while (end - start > 1) {
                /* chunk size: half of the scratch buffer, or half of what
                 * is left so the two chunks never overlap
                 */
                nn = min(n, (end - start ) / 2);

                if (copy_from_user(lbuf, log->ubuf + start, nn))
                        return -EFAULT;
                if (copy_from_user(rbuf, log->ubuf + end - nn, nn))
                        return -EFAULT;

                bpf_vlog_reverse_kbuf(lbuf, nn);
                bpf_vlog_reverse_kbuf(rbuf, nn);

                /* we write lbuf to the right end of ubuf, while rbuf to the
                 * left one to end up with properly reversed overall ubuf
                 */
                if (copy_to_user(log->ubuf + start, rbuf, nn))
                        return -EFAULT;
                if (copy_to_user(log->ubuf + end - nn, lbuf, nn))
                        return -EFAULT;

                /* shrink the unprocessed range from both sides */
                start += nn;
                end -= nn;
        }

        return 0;
}

/* Finish the log: make the user buffer a contiguous string starting at
 * offset 0 and report how much space the full log needed.
 *
 * @log:             log state (may be NULL)
 * @log_size_actual: out; set to 0 if there is no user-facing log
 *                   (NULL @log, level 0 or BPF_LOG_KERNEL), otherwise to
 *                   log->len_max
 *
 * Returns 0 on success, -EFAULT if a user-space access failed at any point
 * while logging, or -ENOSPC if the log did not fit into the user buffer.
 */
int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual)
{
        *log_size_actual = 0;
        if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL)
                return 0;

        /* A non-zero start_pos means the log was truncated at the front,
         * which in turn means the whole buffer got filled with no gaps, and
         * its contents start somewhere in the middle. They need to be
         * shifted left by (start_pos % len_total) bytes to become a
         * continuous zero-terminated string starting at offset 0.
         *
         * The user buffer can be huge, so rather than allocating a kernel
         * copy of the same size the rotation is done in place with the
         * classic three-reversal trick. To shift left by 7:
         *   E F G H I J K | A B C D     original
         *   D C B A | K J I H G F E     after reversing everything
         *   A B C D | E F G H I J K     after reversing each part
         *
         * The reversals go through log->kbuf chunk by chunk (see
         * bpf_vlog_reverse_ubuf()); every byte is read and written twice,
         * i.e. four kernel<->user copies per byte in total.
         */
        if (log->ubuf && log->start_pos != 0) {
                u32 sublen;
                int err;

                /* length of the chopped off part that will become the
                 * beginning; len(ABCD) in the example above
                 */
                div_u64_rem(log->start_pos, log->len_total, &sublen);
                sublen = log->len_total - sublen;

                err = bpf_vlog_reverse_ubuf(log, 0, log->len_total);
                if (!err)
                        err = bpf_vlog_reverse_ubuf(log, 0, sublen);
                if (!err)
                        err = bpf_vlog_reverse_ubuf(log, sublen, log->len_total);
                if (err)
                        log->ubuf = NULL;
        }

        *log_size_actual = log->len_max;

        /* A properly initialised log has ubuf and len_total either both
         * set or both unset. ubuf is cleared whenever a user-space access
         * faults, so a mismatch here means a fault happened along the way.
         */
        if (!!log->ubuf != !!log->len_total)
                return -EFAULT;

        /* did truncation actually happen? */
        if (log->ubuf && log->len_max > log->len_total)
                return -ENOSPC;

        return 0;
}

/* printf-style entry point for writing to the verifier log of @env.
 *
 * The verifier uses it to dump the verification trace so the user can
 * figure out what is wrong with the program. It does nothing when
 * bpf_verifier_log_needed() says no log output is wanted.
 */
__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
                                           const char *fmt, ...)
{
        struct bpf_verifier_log *log = &env->log;
        va_list ap;

        if (!bpf_verifier_log_needed(log))
                return;

        va_start(ap, fmt);
        bpf_verifier_vlog(log, fmt, ap);
        va_end(ap);
}
EXPORT_SYMBOL_GPL(bpf_verifier_log_write);

/* printf-style write to @log directly, for callers that have a
 * struct bpf_verifier_log rather than a verifier environment. It does
 * nothing when bpf_verifier_log_needed() says no log output is wanted.
 */
__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
                            const char *fmt, ...)
{
        if (bpf_verifier_log_needed(log)) {
                va_list ap;

                va_start(ap, fmt);
                bpf_verifier_vlog(log, fmt, ap);
                va_end(ap);
        }
}
EXPORT_SYMBOL_GPL(bpf_log);

/* Find the line-info record that covers instruction @insn_off.
 *
 * Returns NULL if the program carries no line info or @insn_off is not
 * below prog->len; otherwise the rightmost record whose insn_off is
 * <= @insn_off.
 */
static const struct bpf_line_info *
find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
{
        const struct bpf_prog *prog = env->prog;
        const u32 nr_linfo = prog->aux->nr_linfo;
        const struct bpf_line_info *linfo;
        int lo, hi, mid;

        if (!nr_linfo || insn_off >= prog->len)
                return NULL;

        linfo = prog->aux->linfo;

        /* Binary search with the invariant linfo[lo].insn_off <= insn_off.
         * linfo[0].insn_off == 0, so the invariant holds from the start.
         * The search converges on the rightmost entry satisfying it.
         */
        lo = 0;
        hi = nr_linfo - 1;
        while (lo < hi) {
                /* Round the midpoint up. With lo=1, hi=2 and only linfo[lo]
                 * satisfying the invariant, mid=2 is rejected, hi drops to 1
                 * and the loop ends with the correct lo==1. Rounding down
                 * would pick mid=1 forever: lo and mid stay at 1, hi at 2.
                 */
                mid = lo + (hi - lo + 1) / 2;
                if (linfo[mid].insn_off > insn_off)
                        hi = mid - 1;
                else
                        lo = mid;
        }

        return &linfo[lo];
}

/* Skip leading whitespace: returns a pointer to the first character of @s
 * for which isspace() is false (possibly the terminating '\0').
 */
static const char *ltrim(const char *s)
{
        const char *p;

        for (p = s; isspace(*p); p++)
                ;

        return p;
}

/* Emit the source line that corresponds to instruction @insn_off.
 *
 * Output is "<prefix><source line> @ <file>:<line>\n", where the prefix is
 * formatted from @prefix_fmt and the variadic arguments (skipped if
 * @prefix_fmt is NULL) and <file> is the file name without its directory.
 *
 * Nothing is printed when logging is not needed, when no line info covers
 * @insn_off, or when the record found refers to the same file and line as
 * the previously printed one. env->prev_linfo is updated only when a line
 * is actually printed.
 */
__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
                                  u32 insn_off,
                                  const char *prefix_fmt, ...)
{
        const struct bpf_line_info *cur, *prev;
        const char *src_line, *path, *slash;
        const struct btf *btf;

        if (!bpf_verifier_log_needed(&env->log))
                return;

        prev = env->prev_linfo;
        cur = find_linfo(env, insn_off);
        if (!cur || cur == prev)
                return;

        /* Distinct records frequently describe the same source line and
         * differ only in column. The log carries no column information, so
         * printing both would just repeat the line. Treat records with the
         * same file name offset and the same line number as duplicates.
         */
        if (prev &&
            cur->file_name_off == prev->file_name_off &&
            BPF_LINE_INFO_LINE_NUM(cur->line_col) == BPF_LINE_INFO_LINE_NUM(prev->line_col))
                return;

        if (prefix_fmt) {
                va_list ap;

                va_start(ap, prefix_fmt);
                bpf_verifier_vlog(&env->log, prefix_fmt, ap);
                va_end(ap);
        }

        btf = env->prog->aux->btf;

        /* source code line, without leading whitespace */
        src_line = ltrim(btf_name_by_offset(btf, cur->line_off));
        verbose(env, "%s", src_line);

        /* strip the directory part, leaving only the file name */
        path = btf_name_by_offset(btf, cur->file_name_off);
        slash = strrchr(path, '/');
        verbose(env, " @ %s:%u\n", slash ? slash + 1 : path,
                BPF_LINE_INFO_LINE_NUM(cur->line_col));

        env->prev_linfo = cur;
}

/* Return the name string of BTF type @id in @btf: looks the type up with
 * btf_type_by_id() and resolves its name_off with btf_name_by_offset().
 * The lookup result is dereferenced without a check, so @id has to be a
 * type id that btf_type_by_id() can resolve.
 */
static const char *btf_type_name(const struct btf *btf, u32 id)
{
        return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
}

/* Build a printable name for register type @type.
 *
 * The result is "<modifier prefixes><base name><null marker>" and is stored
 * in env->tmp_str_buf, which is also the returned pointer. Because that
 * single buffer is overwritten on every call, reg_type_str() can not appear
 * more than once in a single verbose() statement.
 */
const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
{
        static const char * const str[] = {
                [NOT_INIT]              = "?",
                [SCALAR_VALUE]          = "scalar",
                [PTR_TO_CTX]            = "ctx",
                [CONST_PTR_TO_MAP]      = "map_ptr",
                [PTR_TO_MAP_VALUE]      = "map_value",
                [PTR_TO_STACK]          = "fp",
                [PTR_TO_PACKET]         = "pkt",
                [PTR_TO_PACKET_META]    = "pkt_meta",
                [PTR_TO_PACKET_END]     = "pkt_end",
                [PTR_TO_FLOW_KEYS]      = "flow_keys",
                [PTR_TO_SOCKET]         = "sock",
                [PTR_TO_SOCK_COMMON]    = "sock_common",
                [PTR_TO_TCP_SOCK]       = "tcp_sock",
                [PTR_TO_TP_BUFFER]      = "tp_buffer",
                [PTR_TO_XDP_SOCK]       = "xdp_sock",
                [PTR_TO_BTF_ID]         = "ptr_",
                [PTR_TO_MEM]            = "mem",
                [PTR_TO_ARENA]          = "arena",
                [PTR_TO_BUF]            = "buf",
                [PTR_TO_FUNC]           = "func",
                [PTR_TO_MAP_KEY]        = "map_key",
                [CONST_PTR_TO_DYNPTR]   = "dynptr_ptr",
        };
        const char *postfix = "";
        char prefix[64] = {0};

        /* BTF pointers get the type name appended after "ptr_" by the
         * caller, so their null marker carries a trailing underscore rather
         * than a leading one.
         */
        if (type & PTR_MAYBE_NULL)
                postfix = base_type(type) == PTR_TO_BTF_ID ? "or_null_"
                                                           : "_or_null";

        /* one fixed-order prefix per modifier flag that is set */
        snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
                 type & MEM_RDONLY ? "rdonly_" : "",
                 type & MEM_RINGBUF ? "ringbuf_" : "",
                 type & MEM_USER ? "user_" : "",
                 type & MEM_PERCPU ? "percpu_" : "",
                 type & MEM_RCU ? "rcu_" : "",
                 type & PTR_UNTRUSTED ? "untrusted_" : "",
                 type & PTR_TRUSTED ? "trusted_" : "");

        snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s",
                 prefix, str[base_type(type)], postfix);

        return env->tmp_str_buf;
}

/* Printable name for a dynptr type. Unrecognised values trigger a one-time
 * warning and yield "<unknown>".
 */
const char *dynptr_type_str(enum bpf_dynptr_type type)
{
        const char *name;

        switch (type) {
        case BPF_DYNPTR_TYPE_INVALID:
                name = "<invalid>";
                break;
        case BPF_DYNPTR_TYPE_LOCAL:
                name = "local";
                break;
        case BPF_DYNPTR_TYPE_RINGBUF:
                name = "ringbuf";
                break;
        case BPF_DYNPTR_TYPE_SKB:
                name = "skb";
                break;
        case BPF_DYNPTR_TYPE_XDP:
                name = "xdp";
                break;
        default:
                WARN_ONCE(1, "unknown dynptr type %d\n", type);
                name = "<unknown>";
                break;
        }

        return name;
}

/* Printable name for the iterator type with BTF id @btf_id in @btf.
 *
 * Returns "<invalid>" if @btf is NULL or @btf_id is 0; otherwise the BTF
 * type name with its leading ITER_PREFIX skipped.
 */
const char *iter_type_str(const struct btf *btf, u32 btf_id)
{
        const char *full_name;

        if (btf_id == 0 || !btf)
                return "<invalid>";

        /* we already validated that type is valid and has conforming name,
         * so the prefix is known to be present
         */
        full_name = btf_type_name(btf, btf_id);

        return full_name + (sizeof(ITER_PREFIX) - 1);
}

/* Printable name for an iterator state. Unrecognised values trigger a
 * one-time warning and yield "<unknown>".
 */
const char *iter_state_str(enum bpf_iter_state state)
{
        const char *name;

        switch (state) {
        case BPF_ITER_STATE_INVALID:
                name = "<invalid>";
                break;
        case BPF_ITER_STATE_ACTIVE:
                name = "active";
                break;
        case BPF_ITER_STATE_DRAINED:
                name = "drained";
                break;
        default:
                WARN_ONCE(1, "unknown iter state %d\n", state);
                name = "<unknown>";
                break;
        }

        return name;
}

/* One-character code for each stack slot type, indexed by the STACK_*
 * value.
 */
static char slot_type_char[] = {
        [STACK_INVALID]        = '?',
        [STACK_SPILL]        = 'r',
        [STACK_MISC]        = 'm',
        [STACK_ZERO]        = '0',
        [STACK_DYNPTR]        = 'd',
        [STACK_ITER]        = 'i',
};

/* Print a liveness suffix for a register or stack slot: an underscore
 * followed by 'r', 'w' and/or 'D' for REG_LIVE_READ, REG_LIVE_WRITTEN and
 * REG_LIVE_DONE respectively. Prints nothing if none of these is set.
 */
static void print_liveness(struct bpf_verifier_env *env,
                           enum bpf_reg_liveness live)
{
        if (!(live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)))
                return;

        verbose(env, "_");

        if (live & REG_LIVE_READ)
                verbose(env, "r");
        if (live & REG_LIVE_WRITTEN)
                verbose(env, "w");
        if (live & REG_LIVE_DONE)
                verbose(env, "D");
}

/* Bounds of the ranges printed in decimal: unsigned values up to
 * UNUM_MAX_DECIMAL and signed values within [SNUM_MIN_DECIMAL,
 * SNUM_MAX_DECIMAL]. Anything outside is printed in hex (see
 * verbose_unum() and verbose_snum()).
 */
#define UNUM_MAX_DECIMAL U16_MAX
#define SNUM_MAX_DECIMAL S16_MAX
#define SNUM_MIN_DECIMAL S16_MIN

/* True if unsigned @num is small enough to be printed in decimal. */
static bool is_unum_decimal(u64 num)
{
        if (num > UNUM_MAX_DECIMAL)
                return false;

        return true;
}

/* True if signed @num lies in the range that is printed in decimal. */
static bool is_snum_decimal(s64 num)
{
        if (num < SNUM_MIN_DECIMAL)
                return false;
        if (num > SNUM_MAX_DECIMAL)
                return false;

        return true;
}

/* Log unsigned @num: decimal when is_unum_decimal() allows it, otherwise
 * 0x-prefixed hex.
 */
static void verbose_unum(struct bpf_verifier_env *env, u64 num)
{
        if (is_unum_decimal(num)) {
                verbose(env, "%llu", num);
                return;
        }

        verbose(env, "%#llx", num);
}

/* Log signed @num: decimal when is_snum_decimal() allows it, otherwise
 * 0x-prefixed hex.
 */
static void verbose_snum(struct bpf_verifier_env *env, s64 num)
{
        if (is_snum_decimal(num)) {
                verbose(env, "%lld", num);
                return;
        }

        verbose(env, "%#llx", num);
}

/* Format tnum @a into @str (at most @size bytes, snprintf() semantics).
 *
 * A tnum with an empty mask is printed as a plain number, decimal or hex
 * following is_unum_decimal(). Otherwise it is printed as the pair
 * "(value; mask)" in hex.
 *
 * Returns the snprintf() result.
 */
int tnum_strn(char *str, size_t size, struct tnum a)
{
        if (a.mask != 0)
                return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);

        /* no unknown bits: print as a constant */
        if (is_unum_decimal(a.value))
                return snprintf(str, size, "%llu", a.value);

        return snprintf(str, size, "%#llx", a.value);
}
EXPORT_SYMBOL_GPL(tnum_strn);

/* Print the min/max bounds of scalar register @reg that carry information.
 *
 * A bound is omitted when it equals the trivial limit of its type (e.g.
 * smin == S64_MIN, umin == 0). Bounds of the same kind (min with min, max
 * with max) that share a value are chained into one item such as
 * "smin=umin=5" instead of being printed separately.
 *
 * @sep points to the separator printed before each item; it is set to ","
 * once the first item has been printed.
 */
static void print_scalar_ranges(struct bpf_verifier_env *env,
                                const struct bpf_reg_state *reg,
                                const char **sep)
{
        /* For signed ranges, we want to unify 64-bit and 32-bit values in the
         * output as much as possible, but there is a bit of a complication.
         * If we choose to print values as decimals, this is natural to do,
         * because negative 64-bit and 32-bit values >= S32_MIN have the same
         * representation due to sign extension. But if we choose to print
         * them in hex format (see is_snum_decimal()), then sign extension is
         * misleading.
         * E.g., smin=-2 and smin32=-2 are exactly the same in decimal, but in
         * hex they will be smin=0xfffffffffffffffe and smin32=0xfffffffe, two
         * very different numbers.
         * So we avoid sign extension if we choose to print values in hex.
         */
        struct {
                const char *name;
                u64 val;
                bool omit;
        } minmaxs[] = {
                /* entries alternate min, max, min, max, ...; the pairing
                 * loop below relies on this order
                 */
                {"smin",   reg->smin_value,         reg->smin_value == S64_MIN},
                {"smax",   reg->smax_value,         reg->smax_value == S64_MAX},
                {"umin",   reg->umin_value,         reg->umin_value == 0},
                {"umax",   reg->umax_value,         reg->umax_value == U64_MAX},
                {"smin32",
                 is_snum_decimal((s64)reg->s32_min_value)
                         ? (s64)reg->s32_min_value
                         : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN},
                {"smax32",
                 is_snum_decimal((s64)reg->s32_max_value)
                         ? (s64)reg->s32_max_value
                         : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX},
                {"umin32", reg->u32_min_value,      reg->u32_min_value == 0},
                {"umax32", reg->u32_max_value,      reg->u32_max_value == U32_MAX},
        }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)];
        bool neg1, neg2;

        for (m1 = &minmaxs[0]; m1 < mend; m1++) {
                if (m1->omit)
                        continue;

                /* only signed bounds (names starting with 's') can be negative */
                neg1 = m1->name[0] == 's' && (s64)m1->val < 0;

                verbose(env, "%s%s=", *sep, m1->name);
                *sep = ",";

                /* step by 2 so mins are only compared with later mins and
                 * maxes with later maxes; a match is chained onto this item
                 * and marked omitted so it is not printed again
                 */
                for (m2 = m1 + 2; m2 < mend; m2 += 2) {
                        if (m2->omit || m2->val != m1->val)
                                continue;
                        /* don't mix negatives with positives */
                        neg2 = m2->name[0] == 's' && (s64)m2->val < 0;
                        if (neg2 != neg1)
                                continue;
                        m2->omit = true;
                        verbose(env, "%s=", m2->name);
                }

                if (m1->name[0] == 's')
                        verbose_snum(env, m1->val);
                else
                        verbose_unum(env, m1->val);
        }
}

static bool type_is_map_ptr(enum bpf_reg_type t) {
        switch (base_type(t)) {
        case CONST_PTR_TO_MAP:
        case PTR_TO_MAP_KEY:
        case PTR_TO_MAP_VALUE:
                return true;
        default:
                return false;
        }
}

/*
 * _a stands for append; the name was shortened to avoid multiline statements
 * below. This macro is used to output a comma-separated list of attributes:
 * it prints the current separator followed by the formatted text, then sets
 * the separator to "," so that later items are preceded by a comma.
 *
 * The expansion refers to `env` and `sep` by name, so both must be in scope
 * at the call site and `sep` must be assignable.
 */
#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; })

/*
 * Print a textual description of one register state to the verifier log.
 *
 * @env:   verifier environment passed to every verbose*() call
 * @state: function state being printed; only its frameno is read here, to
 *         decide whether a PTR_TO_STACK register needs a "[frameno]" suffix
 * @reg:   register state to describe
 *
 * Constant scalars and constant-offset stack pointers are printed in a short
 * form; everything else is printed as "<type>(<attr>,<attr>,...)".
 */
static void print_reg_state(struct bpf_verifier_env *env,
                            const struct bpf_func_state *state,
                            const struct bpf_reg_state *reg)
{
        enum bpf_reg_type t;
        /* separator consumed and updated by verbose_a() */
        const char *sep = "";

        t = reg->type;
        /* "P" prefix marks a precise scalar */
        if (t == SCALAR_VALUE && reg->precise)
                verbose(env, "P");
        /* a constant scalar is printed as just its value */
        if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) {
                /* reg->off should be 0 for SCALAR_VALUE */
                verbose_snum(env, reg->var_off.value + reg->off);
                return;
        }

        verbose(env, "%s", reg_type_str(env, t));
        /* nothing besides the type name is printed for PTR_TO_ARENA */
        if (t == PTR_TO_ARENA)
                return;
        if (t == PTR_TO_STACK) {
                /* show the frame number only when it differs from @state's */
                if (state->frameno != reg->frameno)
                        verbose(env, "[%d]", reg->frameno);
                /* constant offset: print it directly, without "(...)" */
                if (tnum_is_const(reg->var_off)) {
                        verbose_snum(env, reg->var_off.value + reg->off);
                        return;
                }
        }
        if (base_type(t) == PTR_TO_BTF_ID)
                verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
        /* start of the comma-separated attribute list */
        verbose(env, "(");
        if (reg->id)
                verbose_a("id=%d", reg->id);
        if (reg->ref_obj_id)
                verbose_a("ref_obj_id=%d", reg->ref_obj_id);
        if (type_is_non_owning_ref(reg->type))
                verbose_a("%s", "non_own_ref");
        if (type_is_map_ptr(t)) {
                /* map name is optional; key and value sizes are always shown */
                if (reg->map_ptr->name[0])
                        verbose_a("map=%s", reg->map_ptr->name);
                verbose_a("ks=%d,vs=%d",
                          reg->map_ptr->key_size,
                          reg->map_ptr->value_size);
        }
        if (t != SCALAR_VALUE && reg->off) {
                verbose_a("off=");
                verbose_snum(env, reg->off);
        }
        if (type_is_pkt_pointer(t)) {
                verbose_a("r=");
                verbose_unum(env, reg->range);
        }
        if (base_type(t) == PTR_TO_MEM) {
                verbose_a("sz=");
                verbose_unum(env, reg->mem_size);
        }
        if (t == CONST_PTR_TO_DYNPTR)
                verbose_a("type=%s",  dynptr_type_str(reg->dynptr.type));
        if (tnum_is_const(reg->var_off)) {
                /* a pointer register with fixed offset */
                if (reg->var_off.value) {
                        verbose_a("imm=");
                        verbose_snum(env, reg->var_off.value);
                }
        } else {
                /* variable offset: print the ranges, sharing the separator */
                print_scalar_ranges(env, reg, &sep);
                /* var_off is printed only when it is not fully unknown */
                if (!tnum_is_unknown(reg->var_off)) {
                        char tn_buf[48];

                        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
                        verbose_a("var_off=%s", tn_buf);
                }
        }
        verbose(env, ")");
}

/*
 * Print a one-line dump of a function state to the verifier log: an optional
 * " frameN:" prefix, the registers, the stack slots, the acquired reference
 * ids and the callback markers, terminated by a newline.
 *
 * @env:       verifier environment
 * @state:     function state to print
 * @print_all: when false, only registers and stack slots for which
 *             reg_scratched()/stack_slot_scratched() return true are printed,
 *             and mark_verifier_state_clean() is called once printing is done
 */
void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state,
                          bool print_all)
{
        const struct bpf_reg_state *reg;
        int i;

        /* the frame prefix is omitted for frame 0 */
        if (state->frameno)
                verbose(env, " frame%d:", state->frameno);
        for (i = 0; i < MAX_BPF_REG; i++) {
                reg = &state->regs[i];
                if (reg->type == NOT_INIT)
                        continue;
                if (!print_all && !reg_scratched(env, i))
                        continue;
                verbose(env, " R%d", i);
                print_liveness(env, reg->live);
                verbose(env, "=");
                print_reg_state(env, state, reg);
        }
        /* walk the stack one BPF_REG_SIZE-byte slot at a time */
        for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
                /* one character per slot byte, plus the NUL terminator */
                char types_buf[BPF_REG_SIZE + 1];
                /* separator used by verbose_a() in the STACK_DYNPTR case */
                const char *sep = "";
                bool valid = false;
                u8 slot_type;
                int j;

                if (!print_all && !stack_slot_scratched(env, i))
                        continue;

                /* build the per-byte type string; note if any byte is in use */
                for (j = 0; j < BPF_REG_SIZE; j++) {
                        slot_type = state->stack[i].slot_type[j];
                        if (slot_type != STACK_INVALID)
                                valid = true;
                        types_buf[j] = slot_type_char[slot_type];
                }
                types_buf[BPF_REG_SIZE] = 0;
                /* slots made only of STACK_INVALID bytes are not printed */
                if (!valid)
                        continue;

                reg = &state->stack[i].spilled_ptr;
                /* the type of the slot's last byte selects the output format */
                switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) {
                case STACK_SPILL:
                        /* print MISC/ZERO/INVALID slots above subreg spill */
                        for (j = 0; j < BPF_REG_SIZE; j++)
                                if (state->stack[i].slot_type[j] == STACK_SPILL)
                                        break;
                        /* cut the type string at the first STACK_SPILL byte */
                        types_buf[j] = '\0';

                        verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
                        print_liveness(env, reg->live);
                        verbose(env, "=%s", types_buf);
                        print_reg_state(env, state, reg);
                        break;
                case STACK_DYNPTR:
                        /* skip to main dynptr slot */
                        i += BPF_DYNPTR_NR_SLOTS - 1;
                        reg = &state->stack[i].spilled_ptr;

                        verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
                        print_liveness(env, reg->live);
                        verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type));
                        if (reg->id)
                                verbose_a("id=%d", reg->id);
                        if (reg->ref_obj_id)
                                verbose_a("ref_id=%d", reg->ref_obj_id);
                        if (reg->dynptr_id)
                                verbose_a("dynptr_id=%d", reg->dynptr_id);
                        verbose(env, ")");
                        break;
                case STACK_ITER:
                        /* only main slot has ref_obj_id set; skip others */
                        if (!reg->ref_obj_id)
                                continue;

                        verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
                        print_liveness(env, reg->live);
                        verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
                                iter_type_str(reg->iter.btf, reg->iter.btf_id),
                                reg->ref_obj_id, iter_state_str(reg->iter.state),
                                reg->iter.depth);
                        break;
                case STACK_MISC:
                case STACK_ZERO:
                default:
                        /* no register state to show: print only the byte types */
                        verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
                        print_liveness(env, reg->live);
                        verbose(env, "=%s", types_buf);
                        break;
                }
        }
        /* the refs list is printed only when the first entry has a non-zero id */
        if (state->acquired_refs && state->refs[0].id) {
                verbose(env, " refs=%d", state->refs[0].id);
                for (i = 1; i < state->acquired_refs; i++)
                        if (state->refs[i].id)
                                verbose(env, ",%d", state->refs[i].id);
        }
        if (state->in_callback_fn)
                verbose(env, " cb");
        if (state->in_async_callback_fn)
                verbose(env, " async_cb");
        verbose(env, "\n");
        if (!print_all)
                mark_verifier_state_clean(env);
}

/*
 * Compute the padding width used by print_insn_state(): take the larger of
 * @pos + BPF_LOG_MIN_ALIGNMENT / 2 and BPF_LOG_ALIGNMENT, round it up to a
 * multiple of BPF_LOG_MIN_ALIGNMENT, and return that minus @pos minus one.
 */
static inline u32 vlog_alignment(u32 pos)
{
        u32 column = max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT);

        return round_up(column, BPF_LOG_MIN_ALIGNMENT) - pos - 1;
}

/*
 * Print the scratched parts of @state to the verifier log.
 *
 * When env->prev_log_pos is non-zero and equals the current end of the log,
 * the log is rewound by one character (dropping the trailing newline) and the
 * state is appended after an aligned ';'. Otherwise the state is printed on a
 * line of its own, prefixed with "<insn_idx>:".
 */
void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state)
{
        if (!env->prev_log_pos || env->prev_log_pos != env->log.end_pos) {
                verbose(env, "%d:", env->insn_idx);
        } else {
                /* drop the newline character that ends the log */
                bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
                verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
        }

        print_verifier_state(env, state, false);
}




























    1 
    1 
























































































    3 


    2 





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2008 IBM Corporation
 *
 * Authors:
 * Mimi Zohar <zohar@us.ibm.com>
 *
 * File: ima_iint.c
 *        - implements the IMA hook: ima_inode_free
 *        - cache integrity information in the inode security blob
 */
#include <linux/slab.h>

#include "ima.h"

/* Slab cache for struct ima_iint_cache; created by ima_iintcache_init(). */
static struct kmem_cache *ima_iint_cache __ro_after_init;

/**
 * ima_iint_find - Look up the iint attached to an inode
 * @inode: Pointer to the inode
 *
 * Inodes that are not flagged IS_IMA() have no iint; for all others the iint
 * is fetched with ima_inode_get_iint().
 *
 * Return: Found iint or NULL.
 */
struct ima_iint_cache *ima_iint_find(struct inode *inode)
{
        return IS_IMA(inode) ? ima_inode_get_iint(inode) : NULL;
}

/* Number of lockdep classes: one per s_stack_depth value accepted below. */
#define IMA_MAX_NESTING (FILESYSTEM_MAX_STACK_DEPTH + 1)

/*
 * It is not clear that IMA should be nested at all, but as long as it measures
 * files both on overlayfs and on the underlying fs, we need to annotate the
 * iint mutex to avoid lockdep false positives related to IMA + overlayfs.
 * See ovl_lockdep_annotate_inode_mutex_key() for more details.
 *
 * The lock class is selected by the stacking depth of the inode's superblock.
 * This function is a no-op unless CONFIG_LOCKDEP is enabled.
 */
static inline void ima_iint_lockdep_annotate(struct ima_iint_cache *iint,
                                             struct inode *inode)
{
#ifdef CONFIG_LOCKDEP
        /* one lock class key per stacking depth */
        static struct lock_class_key ima_iint_mutex_key[IMA_MAX_NESTING];

        int depth = inode->i_sb->s_stack_depth;

        /* warn once about an out-of-range depth and fall back to class 0 */
        if (WARN_ON_ONCE(depth < 0 || depth >= IMA_MAX_NESTING))
                depth = 0;

        lockdep_set_class(&iint->mutex, &ima_iint_mutex_key[depth]);
#endif
}

/*
 * Put a freshly allocated iint into its initial state: no hash, zeroed
 * version, flags and measured PCRs, every status field set to
 * INTEGRITY_UNKNOWN, and an initialized mutex whose lockdep class is chosen
 * from @inode.
 */
static void ima_iint_init_always(struct ima_iint_cache *iint,
                                 struct inode *inode)
{
        /* No hash, no recorded version, no measured PCRs. */
        iint->ima_hash = NULL;
        iint->measured_pcrs = 0;
        iint->real_inode.version = 0;

        /* Clear both flag words. */
        iint->atomic_flags = 0UL;
        iint->flags = 0UL;

        /* Every status field starts out as INTEGRITY_UNKNOWN. */
        iint->ima_creds_status = INTEGRITY_UNKNOWN;
        iint->ima_read_status = INTEGRITY_UNKNOWN;
        iint->ima_bprm_status = INTEGRITY_UNKNOWN;
        iint->ima_mmap_status = INTEGRITY_UNKNOWN;
        iint->ima_file_status = INTEGRITY_UNKNOWN;

        /* Initialize the mutex first, then set its lockdep class. */
        mutex_init(&iint->mutex);
        ima_iint_lockdep_annotate(iint, inode);
}

/*
 * Release an iint: destroy its mutex, free the attached hash and return the
 * object to ima_iint_cache. @iint is dereferenced, so it must not be NULL.
 */
static void ima_iint_free(struct ima_iint_cache *iint)
{
        mutex_destroy(&iint->mutex);
        kfree(iint->ima_hash);

        /* All members are released; the object itself goes back to the cache. */
        kmem_cache_free(ima_iint_cache, iint);
}

/**
 * ima_inode_get - Find or allocate an iint associated with an inode
 * @inode: Pointer to the inode
 *
 * Return the iint already attached to the inode if there is one. Otherwise
 * allocate a new iint with GFP_NOFS, initialize it, set S_IMA in the inode
 * flags and attach the iint to the inode.
 * Caller must lock i_mutex.
 *
 * Return: An iint on success, NULL on error.
 */
struct ima_iint_cache *ima_inode_get(struct inode *inode)
{
        struct ima_iint_cache *iint = ima_iint_find(inode);

        if (!iint) {
                iint = kmem_cache_alloc(ima_iint_cache, GFP_NOFS);
                if (iint) {
                        ima_iint_init_always(iint, inode);

                        inode->i_flags |= S_IMA;
                        ima_inode_set_iint(inode, iint);
                }
        }

        /* Existing iint, new iint, or NULL when the allocation failed. */
        return iint;
}

/**
 * ima_inode_free - Called on inode free
 * @inode: Pointer to the inode
 *
 * Free the iint associated with an inode. Nothing is done when the inode is
 * not flagged IS_IMA() or when no iint is found for it.
 */
void ima_inode_free(struct inode *inode)
{
        struct ima_iint_cache *iint;

        if (!IS_IMA(inode))
                return;

        /*
         * ima_iint_find() is documented to return NULL when nothing is found,
         * and ima_iint_free() dereferences its argument unconditionally, so
         * bail out instead of passing NULL on.
         */
        iint = ima_iint_find(inode);
        if (!iint)
                return;

        /* Detach the iint from the inode before freeing it. */
        ima_inode_set_iint(inode, NULL);

        ima_iint_free(iint);
}

/*
 * Callback handed to kmem_cache_create() in ima_iintcache_init(): zero the
 * whole struct ima_iint_cache object that @foo points to.
 */
static void ima_iint_init_once(void *foo)
{
        memset(foo, 0, sizeof(struct ima_iint_cache));
}

/*
 * Create the "ima_iint_cache" slab cache, sized for struct ima_iint_cache,
 * with the SLAB_PANIC flag and ima_iint_init_once() as its callback.
 */
void __init ima_iintcache_init(void)
{
        const unsigned int obj_size = sizeof(struct ima_iint_cache);

        ima_iint_cache = kmem_cache_create("ima_iint_cache", obj_size, 0,
                                           SLAB_PANIC, ima_iint_init_once);
}



































    1 

















    2 







    1 













    1 
    1 


    1 






    1 

































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
// SPDX-License-Identifier: GPL-2.0
#include "cgroup-internal.h"

#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/nsproxy.h>
#include <linux/proc_ns.h>


/* cgroup namespaces */

/*
 * Increment the UCOUNT_CGROUP_NAMESPACES counter in @ns for the current
 * effective uid and return whatever inc_ucount() returns.
 */
static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
{
        struct ucounts *ucounts;

        ucounts = inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
        return ucounts;
}

/* Decrement the UCOUNT_CGROUP_NAMESPACES counter held in @ucounts. */
static void dec_cgroup_namespaces(struct ucounts *ucounts)
{
        dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
}

/*
 * Allocate a zeroed cgroup namespace, allocate an inode number for it, set
 * its reference count to 1 and hook up cgroupns_operations.
 *
 * Return: the new namespace, ERR_PTR(-ENOMEM) when the allocation fails, or
 * ERR_PTR() of the ns_alloc_inum() error.
 */
static struct cgroup_namespace *alloc_cgroup_ns(void)
{
        struct cgroup_namespace *cgroup_ns;
        int err;

        cgroup_ns = kzalloc(sizeof(*cgroup_ns), GFP_KERNEL_ACCOUNT);
        if (!cgroup_ns)
                return ERR_PTR(-ENOMEM);

        err = ns_alloc_inum(&cgroup_ns->ns);
        if (err)
                goto out_free;

        refcount_set(&cgroup_ns->ns.count, 1);
        cgroup_ns->ns.ops = &cgroupns_operations;
        return cgroup_ns;

out_free:
        kfree(cgroup_ns);
        return ERR_PTR(err);
}

/*
 * Tear down a cgroup namespace: drop the references and the ucount charge
 * that copy_cgroup_ns() stored in it, release its inode number and free the
 * structure itself.
 */
void free_cgroup_ns(struct cgroup_namespace *ns)
{
        /* undo get_css_set() on the root css_set */
        put_css_set(ns->root_cset);
        /* undo the UCOUNT_CGROUP_NAMESPACES charge */
        dec_cgroup_namespaces(ns->ucounts);
        /* undo get_user_ns() */
        put_user_ns(ns->user_ns);
        /* undo ns_alloc_inum() */
        ns_free_inum(&ns->ns);
        kfree(ns);
}
EXPORT_SYMBOL(free_cgroup_ns);

/*
 * Return the cgroup namespace to use for a new nsproxy.
 *
 * Without CLONE_NEWCGROUP in @flags, a reference is taken on @old_ns and it
 * is returned as is. Otherwise a new namespace is created that is owned by
 * @user_ns and rooted at the current task's css_set.
 *
 * Return: the namespace on success; ERR_PTR(-EPERM) when the caller lacks
 * CAP_SYS_ADMIN in @user_ns, ERR_PTR(-ENOSPC) when the ucount cannot be
 * charged, or the error from alloc_cgroup_ns().
 */
struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
                                        struct user_namespace *user_ns,
                                        struct cgroup_namespace *old_ns)
{
        struct cgroup_namespace *new_ns;
        struct ucounts *ucounts;
        struct css_set *cset;

        BUG_ON(!old_ns);

        /* No new namespace requested: share the old one. */
        if (!(flags & CLONE_NEWCGROUP)) {
                get_cgroup_ns(old_ns);
                return old_ns;
        }

        /* Allow only sysadmin to create cgroup namespace. */
        if (!ns_capable(user_ns, CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);

        ucounts = inc_cgroup_namespaces(user_ns);
        if (!ucounts)
                return ERR_PTR(-ENOSPC);

        /* It is not safe to take cgroup_mutex here */
        spin_lock_irq(&css_set_lock);
        cset = task_css_set(current);
        get_css_set(cset);
        spin_unlock_irq(&css_set_lock);

        new_ns = alloc_cgroup_ns();
        if (IS_ERR(new_ns))
                goto out_undo;

        /* The new namespace takes over the css_set ref and the ucount. */
        new_ns->user_ns = get_user_ns(user_ns);
        new_ns->ucounts = ucounts;
        new_ns->root_cset = cset;

        return new_ns;

out_undo:
        put_css_set(cset);
        dec_cgroup_namespaces(ucounts);
        return new_ns;
}

/* Map an embedded ns_common back to the cgroup_namespace that contains it. */
static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
{
        struct cgroup_namespace *cgroup_ns;

        cgroup_ns = container_of(ns, struct cgroup_namespace, ns);
        return cgroup_ns;
}

/*
 * ->install handler: make @ns the cgroup namespace of the nsproxy in @nsset.
 *
 * CAP_SYS_ADMIN is required both in the user namespace of @nsset's
 * credentials and in the user namespace that owns the target cgroup
 * namespace.
 *
 * Return: 0 on success, -EPERM when a capability check fails.
 */
static int cgroupns_install(struct nsset *nsset, struct ns_common *ns)
{
        struct cgroup_namespace *target = to_cg_ns(ns);
        struct nsproxy *proxy = nsset->nsproxy;

        if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
                return -EPERM;
        if (!ns_capable(target->user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        /* Don't need to do anything if we are attaching to our own cgroupns. */
        if (target != proxy->cgroup_ns) {
                /* Take the new reference before dropping the old one. */
                get_cgroup_ns(target);
                put_cgroup_ns(proxy->cgroup_ns);
                proxy->cgroup_ns = target;
        }

        return 0;
}

/*
 * ->get handler: under task_lock(), take a reference on @task's cgroup
 * namespace and return its ns_common.
 *
 * Return: the ns_common, or NULL when @task has no nsproxy.
 */
static struct ns_common *cgroupns_get(struct task_struct *task)
{
        struct cgroup_namespace *cgroup_ns = NULL;
        struct nsproxy *proxy;

        task_lock(task);
        proxy = task->nsproxy;
        if (proxy) {
                cgroup_ns = proxy->cgroup_ns;
                get_cgroup_ns(cgroup_ns);
        }
        task_unlock(task);

        if (!cgroup_ns)
                return NULL;

        return &cgroup_ns->ns;
}

/* ->put handler: drop one reference on the cgroup namespace embedding @ns. */
static void cgroupns_put(struct ns_common *ns)
{
        struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);

        put_cgroup_ns(cgroup_ns);
}

static struct user_namespace *cgroupns_owner(struct ns_common *ns)
{
        return to_cg_ns(ns)->user_ns;
}

/*
 * Namespace operations table for cgroup namespaces. alloc_cgroup_ns() stores
 * a pointer to it in every namespace it creates (ns.ops).
 */
const struct proc_ns_operations cgroupns_operations = {
        .name                = "cgroup",
        .type                = CLONE_NEWCGROUP,
        .get                = cgroupns_get,
        .put                = cgroupns_put,
        .install        = cgroupns_install,
        .owner                = cgroupns_owner,
};




























































































































































































































































































































































































































































































































































































































































































































    2 
    2 

    1 





    1 


















    1 


    1 














    1 












    1 





































































































    1 
























































































































































































































































    5 

    2 



































































































































































































    1 






    8 






















































    1 












































































    1 



























    2 




    2 


























    2 
    1 






























































    1 
    1 





    1 

    1 













































































































































































































































































































































































































































































































    5 
























































































    1 


















































































































































































































































































































































































































































































































    2 

































    1 





    1 





























































































































































    5 





    4 































































































































    2 



















































































































































































































































































































































































































































































































































































































    1 


















































































































































































    2 










































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MM_H
#define _LINUX_MM_H

#include <linux/errno.h>
#include <linux/mmdebug.h>
#include <linux/gfp.h>
#include <linux/pgalloc_tag.h>
#include <linux/bug.h>
#include <linux/list.h>
#include <linux/mmzone.h>
#include <linux/rbtree.h>
#include <linux/atomic.h>
#include <linux/debug_locks.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/range.h>
#include <linux/pfn.h>
#include <linux/percpu-refcount.h>
#include <linux/bit_spinlock.h>
#include <linux/shrinker.h>
#include <linux/resource.h>
#include <linux/page_ext.h>
#include <linux/err.h>
#include <linux/page-flags.h>
#include <linux/page_ref.h>
#include <linux/overflow.h>
#include <linux/sizes.h>
#include <linux/sched.h>
#include <linux/pgtable.h>
#include <linux/kasan.h>
#include <linux/memremap.h>
#include <linux/slab.h>

struct mempolicy;
struct anon_vma;
struct anon_vma_chain;
struct user_struct;
struct pt_regs;
struct folio_batch;

extern int sysctl_page_lock_unfairness;

void mm_core_init(void);
void init_mm_internals(void);

/*
 * max_mapnr is only declared when CONFIG_NUMA is not set. With CONFIG_NUMA
 * set_max_mapnr() is an empty stub, so callers do not need their own #ifdefs.
 */
#ifndef CONFIG_NUMA                /* Don't use mapnrs, do it properly */
extern unsigned long max_mapnr;

/* Store @limit in the global max_mapnr. */
static inline void set_max_mapnr(unsigned long limit)
{
        max_mapnr = limit;
}
#else
/* CONFIG_NUMA build: no max_mapnr is declared here, so @limit is ignored. */
static inline void set_max_mapnr(unsigned long limit) { }
#endif

extern atomic_long_t _totalram_pages;
/*
 * Read the global _totalram_pages counter with atomic_long_read() and hand
 * the value back converted to unsigned long.
 */
static inline unsigned long totalram_pages(void)
{
        long nr_pages = atomic_long_read(&_totalram_pages);

        return (unsigned long)nr_pages;
}

/* Add one to the global _totalram_pages counter via atomic_long_inc(). */
static inline void totalram_pages_inc(void)
{
        atomic_long_inc(&_totalram_pages);
}

/* Subtract one from the global _totalram_pages counter via atomic_long_dec(). */
static inline void totalram_pages_dec(void)
{
        atomic_long_dec(&_totalram_pages);
}

/*
 * Add @count to the global _totalram_pages counter via atomic_long_add().
 * @count is a signed long, so a negative value lowers the counter.
 */
static inline void totalram_pages_add(long count)
{
        atomic_long_add(count, &_totalram_pages);
}

extern void * high_memory;
extern int page_cluster;
extern const int page_cluster_max;

/*
 * sysctl_legacy_va_layout is a real variable only when CONFIG_SYSCTL is
 * enabled; otherwise it is a macro that expands to the constant 0.
 */
#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
#else
#define sysctl_legacy_va_layout 0
#endif

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
extern const int mmap_rnd_bits_min;
extern int mmap_rnd_bits_max __ro_after_init;
extern int mmap_rnd_bits __read_mostly;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
extern const int mmap_rnd_compat_bits_min;
extern const int mmap_rnd_compat_bits_max;
extern int mmap_rnd_compat_bits __read_mostly;
#endif

#include <asm/page.h>
#include <asm/processor.h>

/*
 * Fallback used unless <asm/page.h> or <asm/processor.h> (included above)
 * already defined __pa_symbol(): cast @x to unsigned long, pass it through
 * RELOC_HIDE() with a zero offset, and feed the result to __pa().
 */
#ifndef __pa_symbol
#define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
#endif

/*
 * Fallback page_to_virt(): struct page -> pfn (page_to_pfn) -> physical
 * address (PFN_PHYS) -> kernel virtual address (__va). Skipped when an
 * earlier header already defined page_to_virt.
 */
#ifndef page_to_virt
#define page_to_virt(x)        __va(PFN_PHYS(page_to_pfn(x)))
#endif

/*
 * Fallback lm_alias(): take the physical address of symbol @x via
 * __pa_symbol() and convert it back to a virtual address with __va().
 * NOTE(review): "lm" presumably stands for "linear map" - confirm.
 */
#ifndef lm_alias
#define lm_alias(x)        __va(__pa_symbol(x))
#endif

/*
 * mm_forbids_zeropage() lets an architecture prevent common memory
 * management code from establishing a zero page mapping on a read fault.
 *
 * An architecture that needs this should define the macro within
 * <asm/pgtable.h>; the generic fallback below always evaluates to 0.
 * s390 does this to prevent multiplexing of hardware bits
 * related to the physical page in case of virtualization.
 */
#ifndef mm_forbids_zeropage
#define mm_forbids_zeropage(X)        (0)
#endif

/*
 * On some architectures it is expensive to call memset() for small sizes.
 * If an architecture decides to implement its own version of
 * mm_zero_struct_page, it should wrap the defines below in an #ifndef and
 * define its own version of this macro in <asm/pgtable.h>.
 */
#if BITS_PER_LONG == 64
/*
 * Zero a struct page one unsigned long at a time instead of calling memset().
 *
 * This function must be updated when the size of struct page grows above 96
 * or shrinks below 56 bytes. The idea is that the compiler optimizes out the
 * switch() statement and only leaves move/store instructions. The compiler
 * can also combine write statements if they are both assignments and can be
 * reordered, which can result in several of the writes here being dropped.
 */
#define        mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
static inline void __mm_zero_struct_page(struct page *page)
{
        unsigned long *_pp = (void *)page;

         /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */
        BUILD_BUG_ON(sizeof(struct page) & 7);
        BUILD_BUG_ON(sizeof(struct page) < 56);
        BUILD_BUG_ON(sizeof(struct page) > 96);

        /*
         * Enter at the case matching the structure size. Each case clears
         * the highest word not yet cleared and falls through, so every word
         * from the last one down to _pp[0] ends up zeroed.
         */
        switch (sizeof(struct page)) {
        case 96:
                _pp[11] = 0;
                fallthrough;
        case 88:
                _pp[10] = 0;
                fallthrough;
        case 80:
                _pp[9] = 0;
                fallthrough;
        case 72:
                _pp[8] = 0;
                fallthrough;
        case 64:
                _pp[7] = 0;
                fallthrough;
        case 56:
                /* The first seven words exist for every permitted size. */
                _pp[6] = 0;
                _pp[5] = 0;
                _pp[4] = 0;
                _pp[3] = 0;
                _pp[2] = 0;
                _pp[1] = 0;
                _pp[0] = 0;
        }
}
#else
/* BITS_PER_LONG is not 64: memset() the whole struct page instead. */
#define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
#endif

/*
 * Default maximum number of active map areas; this limits the number of vmas
 * per mm struct. Users can override this number via sysctl, but there is a
 * problem.
 *
 * When a program's coredump is generated in ELF format, one section is
 * created per vma. In ELF, the number of sections is stored in an unsigned
 * short, so the number of sections should be smaller than 65535 at coredump
 * time. Because the kernel adds some informative sections to the program
 * image when generating a coredump, we need some margin. The number of extra
 * sections is currently 1-3, depending on the arch. We use "5" as a safe
 * margin here.
 *
 * ELF extended numbering allows more than 65535 sections, so the 16-bit bound
 * is no longer a hard limit, although some userspace tools can be surprised
 * by that.
 */
#define MAPCOUNT_ELF_CORE_MARGIN        (5)
#define DEFAULT_MAX_MAP_COUNT        (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)

extern int sysctl_max_map_count;

extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;

extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes;

int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);

/*
 * nth_page(page, n):        the struct page that is @n pages away from @page.
 * folio_page_idx(folio, p): offset of page @p from the first page of @folio.
 *
 * With CONFIG_SPARSEMEM but without CONFIG_SPARSEMEM_VMEMMAP both are computed
 * through pfn conversions; in every other configuration plain pointer
 * arithmetic on struct page is used.
 */
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
#define folio_page_idx(folio, p)        (page_to_pfn(p) - folio_pfn(folio))
#else
#define nth_page(page,n) ((page) + (n))
#define folio_page_idx(folio, p)        ((p) - &(folio)->page)
#endif

/* Align the pointer to the (next) page boundary, using ALIGN(). */
#define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)

/* Align the pointer to the (previous) page boundary, using ALIGN_DOWN(). */
#define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE)

/*
 * Test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE.
 * The argument is cast to unsigned long before the check, which is why a
 * pointer can be passed directly.
 */
#define PAGE_ALIGNED(addr)        IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)

/*
 * Return the folio that embeds, as its ->lru member, the list node found
 * at @head->prev.
 */
static inline struct folio *lru_to_folio(struct list_head *head)
{
        struct list_head *tail = head->prev;

        return list_entry(tail, struct folio, lru);
}

void setup_initial_init_mm(void *start_code, void *end_code,
                           void *end_data, void *brk);

/*
 * Linux kernel virtual memory manager primitives.
 * The idea being to have a "virtual" mm in the same way
 * we have a virtual fs - giving a cleaner interface to the
 * mm details, and allowing different kinds of memory mappings
 * (from shared memory to executable loading to arbitrary
 * mmap() functions).
 */

struct vm_area_struct *vm_area_alloc(struct mm_struct *);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
void vm_area_free(struct vm_area_struct *);
/* Use only if VMA has no other users */
void __vm_area_free(struct vm_area_struct *vma);

#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
extern struct rw_semaphore nommu_region_sem;

extern unsigned int kobjsize(const void *objp);
#endif

/*
 * vm_flags in vm_area_struct, see mm_types.h.
 * When changing, update also include/trace/events/mmflags.h
 *
 * Apart from VM_NONE, every non-zero value below is a single bit.
 * 0x00000800 has no definition in this list. VM_UFFD_MISSING (without
 * CONFIG_MMU) and VM_SOFTDIRTY (without CONFIG_MEM_SOFT_DIRTY) are defined
 * as 0, so testing for them is then always false.
 */
#define VM_NONE                0x00000000        /* no flag bits set */

#define VM_READ                0x00000001        /* currently active flags */
#define VM_WRITE        0x00000002
#define VM_EXEC                0x00000004
#define VM_SHARED        0x00000008

/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
#define VM_MAYREAD        0x00000010        /* limits for mprotect() etc */
#define VM_MAYWRITE        0x00000020
#define VM_MAYEXEC        0x00000040
#define VM_MAYSHARE        0x00000080

#define VM_GROWSDOWN        0x00000100        /* general info on the segment */
/* Bit 0x00000200 means something different with and without CONFIG_MMU. */
#ifdef CONFIG_MMU
#define VM_UFFD_MISSING        0x00000200        /* missing pages tracking */
#else /* CONFIG_MMU */
#define VM_MAYOVERLAY        0x00000200        /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
#define VM_UFFD_MISSING        0
#endif /* CONFIG_MMU */
#define VM_PFNMAP        0x00000400        /* Page-ranges managed without "struct page", just pure PFN */
#define VM_UFFD_WP        0x00001000        /* wrprotect pages tracking */

#define VM_LOCKED        0x00002000        /* one of the mlock bits, see VM_LOCKED_MASK */
#define VM_IO           0x00004000        /* Memory mapped I/O or similar */

                                        /* Used by sys_madvise() */
#define VM_SEQ_READ        0x00008000        /* App will access data sequentially */
#define VM_RAND_READ        0x00010000        /* App will not benefit from clustered reads */

#define VM_DONTCOPY        0x00020000      /* Do not copy this vma on fork */
#define VM_DONTEXPAND        0x00040000        /* Cannot expand with mremap() */
#define VM_LOCKONFAULT        0x00080000        /* Lock the pages covered when they are faulted in */
#define VM_ACCOUNT        0x00100000        /* Is a VM accounted object */
#define VM_NORESERVE        0x00200000        /* should the VM suppress accounting */
#define VM_HUGETLB        0x00400000        /* Huge TLB Page VM */
#define VM_SYNC                0x00800000        /* Synchronous page faults */
#define VM_ARCH_1        0x01000000        /* Architecture-specific flag */
#define VM_WIPEONFORK        0x02000000        /* Wipe VMA contents in child. */
#define VM_DONTDUMP        0x04000000        /* Do not include in the core dump */

#ifdef CONFIG_MEM_SOFT_DIRTY
# define VM_SOFTDIRTY        0x08000000        /* Not soft dirty clean area */
#else
# define VM_SOFTDIRTY        0
#endif

#define VM_MIXEDMAP        0x10000000        /* Can contain "struct page" and pure PFN pages */
#define VM_HUGEPAGE        0x20000000        /* MADV_HUGEPAGE marked this vma */
#define VM_NOHUGEPAGE        0x40000000        /* MADV_NOHUGEPAGE marked this vma */
/*
 * Bit 31 must be spelled as an unsigned long. A bare 0x80000000 does not fit
 * in int and therefore has type unsigned int, so ~VM_MERGEABLE would be the
 * 32-bit value 0x7fffffff. Zero-extended into a 64-bit vm_flags word,
 * "flags &= ~VM_MERGEABLE" would then also wipe every flag above bit 31
 * (VM_HIGH_ARCH_*, VM_UFFD_MINOR, VM_ALLOW_ANY_UNCACHED). The lower flags do
 * not have this problem: they are (signed) int and sign-extend when inverted.
 */
#define VM_MERGEABLE        0x80000000UL        /* KSM may merge identical pages */

/*
 * Architecture flag bits 32-37, available only with
 * CONFIG_ARCH_USES_HIGH_VMA_FLAGS. VM_HIGH_ARCH_BIT_n is the bit number and
 * VM_HIGH_ARCH_n the corresponding mask built with BIT().
 */
#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
#define VM_HIGH_ARCH_BIT_0        32        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_1        33        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_2        34        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3        35        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4        36        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_5        37        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0        BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1        BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2        BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3        BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4        BIT(VM_HIGH_ARCH_BIT_4)
#define VM_HIGH_ARCH_5        BIT(VM_HIGH_ARCH_BIT_5)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */

/*
 * Protection key bits, stored in the VM_HIGH_ARCH_* flags starting at bit
 * VM_HIGH_ARCH_BIT_0. VM_PKEY_BIT4 is a real bit only with CONFIG_PPC and is
 * 0 otherwise.
 */
#ifdef CONFIG_ARCH_HAS_PKEYS
# define VM_PKEY_SHIFT        VM_HIGH_ARCH_BIT_0
# define VM_PKEY_BIT0        VM_HIGH_ARCH_0        /* A protection key is a 4-bit value */
# define VM_PKEY_BIT1        VM_HIGH_ARCH_1        /* on x86 and 5-bit value on ppc64   */
# define VM_PKEY_BIT2        VM_HIGH_ARCH_2
# define VM_PKEY_BIT3        VM_HIGH_ARCH_3
#ifdef CONFIG_PPC
# define VM_PKEY_BIT4  VM_HIGH_ARCH_4
#else
# define VM_PKEY_BIT4  0
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */

#ifdef CONFIG_X86_USER_SHADOW_STACK
/*
 * VM_SHADOW_STACK should not be set with VM_SHARED because of a lack of
 * support in core mm.
 *
 * These VMAs will get a single end guard page. This helps userspace protect
 * itself from attacks. A single page is enough for current shadow stack archs
 * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c
 * for more details on the guard size.
 */
# define VM_SHADOW_STACK        VM_HIGH_ARCH_5
#else
/* No user shadow stack support configured: the flag is VM_NONE. */
# define VM_SHADOW_STACK        VM_NONE
#endif

/*
 * Per-architecture meaning of the single VM_ARCH_1 bit. Only the first
 * matching branch applies. SPARC64 and ARM64 additionally define
 * VM_ARCH_CLEAR as their alias.
 */
#if defined(CONFIG_X86)
# define VM_PAT                VM_ARCH_1        /* PAT reserves whole VMA at once (x86) */
#elif defined(CONFIG_PPC)
# define VM_SAO                VM_ARCH_1        /* Strong Access Ordering (powerpc) */
#elif defined(CONFIG_PARISC)
# define VM_GROWSUP        VM_ARCH_1
#elif defined(CONFIG_SPARC64)
# define VM_SPARC_ADI        VM_ARCH_1        /* Uses ADI tag for access control */
# define VM_ARCH_CLEAR        VM_SPARC_ADI
#elif defined(CONFIG_ARM64)
# define VM_ARM64_BTI        VM_ARCH_1        /* BTI guarded page, a.k.a. GP bit */
# define VM_ARCH_CLEAR        VM_ARM64_BTI
#elif !defined(CONFIG_MMU)
# define VM_MAPPED_COPY        VM_ARCH_1        /* T if mapped copy of data (nommu mmap) */
#endif

/*
 * MTE flags use VM_HIGH_ARCH_0 / VM_HIGH_ARCH_1 with CONFIG_ARM64_MTE and
 * are VM_NONE otherwise.
 */
#if defined(CONFIG_ARM64_MTE)
# define VM_MTE                VM_HIGH_ARCH_0        /* Use Tagged memory for access control */
# define VM_MTE_ALLOWED        VM_HIGH_ARCH_1        /* Tagged memory permitted */
#else
# define VM_MTE                VM_NONE
# define VM_MTE_ALLOWED        VM_NONE
#endif

/* Fallback when VM_GROWSUP has not been defined yet: the flag is VM_NONE. */
#ifndef VM_GROWSUP
# define VM_GROWSUP        VM_NONE
#endif

/*
 * VM_UFFD_MINOR is bit 38 with CONFIG_HAVE_ARCH_USERFAULTFD_MINOR and
 * VM_NONE without it.
 */
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
# define VM_UFFD_MINOR_BIT        38
# define VM_UFFD_MINOR                BIT(VM_UFFD_MINOR_BIT)        /* UFFD minor faults */
#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
# define VM_UFFD_MINOR                VM_NONE
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */

/*
 * This flag is used to connect VFIO to arch specific KVM code. It
 * indicates that the memory under this VMA is safe for use with any
 * non-cacheable memory type inside KVM. Some VFIO devices, on some
 * platforms, are thought to be unsafe and can cause machine crashes
 * if KVM does not lock down the memory type.
 *
 * The flag is bit 39 with CONFIG_64BIT and VM_NONE otherwise.
 */
#ifdef CONFIG_64BIT
#define VM_ALLOW_ANY_UNCACHED_BIT        39
#define VM_ALLOW_ANY_UNCACHED                BIT(VM_ALLOW_ANY_UNCACHED_BIT)
#else
#define VM_ALLOW_ANY_UNCACHED                VM_NONE
#endif

/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)

/*
 * VM_EXEC if the current task's personality has READ_IMPLIES_EXEC set,
 * otherwise 0. Evaluated at the point of use, since it reads 'current'.
 */
#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)

/*
 * Common data flag combinations. All three are readable and writable and
 * carry all of VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC. They differ only in
 * VM_EXEC: per task (TASK_EXEC), never, or always.
 */
#define VM_DATA_FLAGS_TSK_EXEC        (VM_READ | VM_WRITE | TASK_EXEC | \
                                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
#define VM_DATA_FLAGS_NON_EXEC        (VM_READ | VM_WRITE | VM_MAYREAD | \
                                 VM_MAYWRITE | VM_MAYEXEC)
#define VM_DATA_FLAGS_EXEC        (VM_READ | VM_WRITE | VM_EXEC | \
                                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)

#ifndef VM_DATA_DEFAULT_FLAGS                /* arch can override this */
#define VM_DATA_DEFAULT_FLAGS  VM_DATA_FLAGS_EXEC
#endif

#ifndef VM_STACK_DEFAULT_FLAGS                /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#endif

/* NOTE(review): presumably the flags of VMAs that need a gap before their start - confirm at the users. */
#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)

/*
 * VM_STACK is the growth-direction flag of the stack. VM_STACK_EARLY is
 * VM_GROWSDOWN only with CONFIG_STACK_GROWSUP and 0 otherwise; it feeds
 * VM_STACK_INCOMPLETE_SETUP above.
 */
#ifdef CONFIG_STACK_GROWSUP
#define VM_STACK        VM_GROWSUP
#define VM_STACK_EARLY        VM_GROWSDOWN
#else
#define VM_STACK        VM_GROWSDOWN
#define VM_STACK_EARLY        0
#endif

/* Stack direction flag plus the default stack permissions plus VM_ACCOUNT. */
#define VM_STACK_FLAGS        (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)

/* VMA basic access permission flags */
#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)


/*
 * Special vmas that are non-mergeable, non-mlock()able.
 */
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)

/* This mask prevents a VMA from being scanned by khugepaged */
#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)

/* This mask defines which mm->def_flags a process can inherit from its parent */
#define VM_INIT_DEF_MASK        VM_NOHUGEPAGE

/* This mask represents all the VMA flag bits used by mlock */
#define VM_LOCKED_MASK        (VM_LOCKED | VM_LOCKONFAULT)

/*
 * Arch-specific flags to clear when updating VM flags on protection change.
 * VM_ARCH_CLEAR defaults to VM_NONE unless it was already defined.
 */
#ifndef VM_ARCH_CLEAR
# define VM_ARCH_CLEAR        VM_NONE
#endif
#define VM_FLAGS_CLEAR        (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)

/*
 * Mapping from the currently active vm_flags protection bits (the
 * low four bits) to a page protection mask.
 */

/*
 * The default fault flags that should be used by most of the
 * arch-specific page fault handlers: FAULT_FLAG_ALLOW_RETRY,
 * FAULT_FLAG_KILLABLE and FAULT_FLAG_INTERRUPTIBLE combined.
 */
#define FAULT_FLAG_DEFAULT  (FAULT_FLAG_ALLOW_RETRY | \
                             FAULT_FLAG_KILLABLE | \
                             FAULT_FLAG_INTERRUPTIBLE)

/**
 * fault_flag_allow_retry_first - check ALLOW_RETRY the first time
 * @flags: Fault flags.
 *
 * This is mostly used for places where we want to try to avoid taking
 * the mmap_lock for too long a time when waiting for another condition
 * to change, in which case we can try to be polite to release the
 * mmap_lock in the first round to avoid potential starvation of other
 * processes that would also want the mmap_lock.
 *
 * Return: true if the page fault allows retry and this is the first
 * attempt of the fault handling; false otherwise.
 */
static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
{
        return (flags & FAULT_FLAG_ALLOW_RETRY) &&
            (!(flags & FAULT_FLAG_TRIED));
}

/*
 * Comma-separated list of { flag, "name" } initializers, one per
 * FAULT_FLAG_xxx value listed here. The last entry has no trailing comma.
 * NOTE(review): presumably consumed by trace-event flag printing - confirm
 * at the users.
 */
#define FAULT_FLAG_TRACE \
        { FAULT_FLAG_WRITE,                "WRITE" }, \
        { FAULT_FLAG_MKWRITE,                "MKWRITE" }, \
        { FAULT_FLAG_ALLOW_RETRY,        "ALLOW_RETRY" }, \
        { FAULT_FLAG_RETRY_NOWAIT,        "RETRY_NOWAIT" }, \
        { FAULT_FLAG_KILLABLE,                "KILLABLE" }, \
        { FAULT_FLAG_TRIED,                "TRIED" }, \
        { FAULT_FLAG_USER,                "USER" }, \
        { FAULT_FLAG_REMOTE,                "REMOTE" }, \
        { FAULT_FLAG_INSTRUCTION,        "INSTRUCTION" }, \
        { FAULT_FLAG_INTERRUPTIBLE,        "INTERRUPTIBLE" }, \
        { FAULT_FLAG_VMA_LOCK,                "VMA_LOCK" }

/*
 * vm_fault is filled by the pagefault handler and passed to the vma's
 * ->fault function. The vma's ->fault is responsible for returning a bitmask
 * of VM_FAULT_xxx flags that give details about how the fault was handled.
 *
 * MM layer fills up gfp_mask for page allocations but fault handler might
 * alter it if its implementation requires a different allocation context.
 *
 * pgoff should be used in favour of the faulting virtual address
 * (address / real_address), if possible.
 */
struct vm_fault {
        /*
         * The members of this anonymous struct are const-qualified, so code
         * holding a struct vm_fault cannot assign to them directly.
         */
        const struct {
                struct vm_area_struct *vma;        /* Target VMA */
                gfp_t gfp_mask;                        /* gfp mask to be used for allocations */
                pgoff_t pgoff;                        /* Logical page offset based on vma */
                unsigned long address;                /* Faulting virtual address - masked */
                unsigned long real_address;        /* Faulting virtual address - unmasked */
        };
        enum fault_flag flags;                /* FAULT_FLAG_xxx flags
                                         * XXX: should really be 'const' */
        pmd_t *pmd;                        /* Pointer to pmd entry matching
                                         * the 'address' */
        pud_t *pud;                        /* Pointer to pud entry matching
                                         * the 'address'
                                         */
        /* orig_pte and orig_pmd share storage; only one of them is meaningful. */
        union {
                pte_t orig_pte;                /* Value of PTE at the time of fault */
                pmd_t orig_pmd;                /* Value of PMD at the time of fault,
                                         * used by PMD fault only.
                                         */
        };

        struct page *cow_page;                /* Page handler may use for COW fault */
        struct page *page;                /* ->fault handlers should return a
                                         * page here, unless VM_FAULT_NOPAGE
                                         * is set (which is also implied by
                                         * VM_FAULT_ERROR).
                                         */
        /* These three entries are valid only while holding ptl lock */
        pte_t *pte;                        /* Pointer to pte entry matching
                                         * the 'address'. NULL if the page
                                         * table hasn't been allocated.
                                         */
        spinlock_t *ptl;                /* Page table lock.
                                         * Protects pte page table if 'pte'
                                         * is not NULL, otherwise pmd.
                                         */
        pgtable_t prealloc_pte;                /* Pre-allocated pte page table.
                                         * vm_ops->map_pages() sets up a page
                                         * table from atomic context.
                                         * do_fault_around() pre-allocates
                                         * page table to avoid allocation from
                                         * atomic context.
                                         */
};

/*
 * These are the virtual MM functions - opening of an area, closing and
 * unmapping it (needed to keep files on disk up-to-date etc), pointer
 * to the functions called when a no-page or a wp-page exception occurs.
 *
 * Every member is a function pointer; the comments on the individual
 * members describe when each hook is invoked.
 */
struct vm_operations_struct {
        /* Area-open hook (see the summary above); receives the VMA. */
        void (*open)(struct vm_area_struct * area);
        /**
         * @close: Called when the VMA is being removed from the MM.
         * Context: User context.  May sleep.  Caller holds mmap_lock.
         */
        void (*close)(struct vm_area_struct * area);
        /* Called any time before splitting to check if it's allowed */
        int (*may_split)(struct vm_area_struct *area, unsigned long addr);
        /* NOTE(review): presumably invoked when the area is mremap()ed - confirm */
        int (*mremap)(struct vm_area_struct *area);
        /*
         * Called by mprotect() to make driver-specific permission
         * checks before mprotect() is finalised.   The VMA must not
         * be modified.  Returns 0 if mprotect() can proceed.
         */
        int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
                        unsigned long end, unsigned long newflags);
        /*
         * Fault handler.  Per the struct vm_fault documentation, ->fault
         * handlers should return a page in vmf->page unless
         * VM_FAULT_NOPAGE is set.
         */
        vm_fault_t (*fault)(struct vm_fault *vmf);
        /*
         * NOTE(review): looks like the larger-than-PTE counterpart of
         * ->fault, with @order selecting the mapping size - confirm.
         */
        vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
        /*
         * Per the struct vm_fault documentation (prealloc_pte), this sets
         * up a page table from atomic context for the page-offset range
         * [@start_pgoff, @end_pgoff].
         * NOTE(review): inclusiveness of @end_pgoff not visible here.
         */
        vm_fault_t (*map_pages)(struct vm_fault *vmf,
                        pgoff_t start_pgoff, pgoff_t end_pgoff);
        /* NOTE(review): presumably reports the page size backing @area - confirm */
        unsigned long (*pagesize)(struct vm_area_struct * area);

        /* notification that a previously read-only page is about to become
         * writable, if an error is returned it will cause a SIGBUS */
        vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);

        /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
        vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);

        /* called by access_process_vm when get_user_pages() fails, typically
         * for use by special VMAs. See also generic_access_phys() for a generic
         * implementation useful for any iomem mapping.
         */
        int (*access)(struct vm_area_struct *vma, unsigned long addr,
                      void *buf, int len, int write);

        /* Called by the /proc/PID/maps code to ask the vma whether it
         * has a special name.  Returning non-NULL will also cause this
         * vma to be dumped unconditionally. */
        const char *(*name)(struct vm_area_struct *vma);

#ifdef CONFIG_NUMA
        /*
         * set_policy() op must add a reference to any non-NULL @new mempolicy
         * to hold the policy upon return.  Caller should pass NULL @new to
         * remove a policy and fall back to surrounding context--i.e. do not
         * install a MPOL_DEFAULT policy, nor the task or system default
         * mempolicy.
         */
        int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

        /*
         * get_policy() op must add reference [mpol_get()] to any policy at
         * (vma,addr) marked as MPOL_SHARED.  The shared policy infrastructure
         * in mm/mempolicy.c will do this automatically.
         * get_policy() must NOT add a ref if the policy at (vma,addr) is not
         * marked as MPOL_SHARED. vma policies are protected by the mmap_lock.
         * If no [shared/vma] mempolicy exists at the addr, get_policy() op
         * must return NULL--i.e., do not "fallback" to task or system default
         * policy.
         */
        struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
                                        unsigned long addr, pgoff_t *ilx);
#endif
        /*
         * Called by vm_normal_page() for special PTEs to find the
         * page for @addr.  This is useful if the default behavior
         * (using pte_page()) would not find the correct page.
         */
        struct page *(*find_special_page)(struct vm_area_struct *vma,
                                          unsigned long addr);
};

#ifdef CONFIG_NUMA_BALANCING
/* Start a VMA off with no NUMA-balancing state attached (pointer = NULL). */
static inline void vma_numab_state_init(struct vm_area_struct *vma)
{
        vma->numab_state = NULL;
}
/* Hand the VMA's numab_state pointer to kfree(); the field is not reset. */
static inline void vma_numab_state_free(struct vm_area_struct *vma)
{
        kfree(vma->numab_state);
}
#else
/* Without CONFIG_NUMA_BALANCING both helpers compile to nothing. */
static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_PER_VMA_LOCK
/*
 * Try to read-lock a vma. The function is allowed to occasionally yield false
 * locked result to avoid performance overhead, in which case we fall back to
 * using mmap_lock. The function should never yield false unlocked result.
 *
 * Return: true if the read side of vma->vm_lock->lock was taken and is still
 * held on return (vma_end_read() performs the matching up_read()); false if
 * the lock was not taken or was dropped again because the VMA appears to be
 * write-locked.
 */
static inline bool vma_start_read(struct vm_area_struct *vma)
{
        /*
         * Check before locking. A race might cause false locked result.
         * We can use READ_ONCE() for the mm_lock_seq here, and don't need
         * ACQUIRE semantics, because this is just a lockless check whose result
         * we don't rely on for anything - the mm_lock_seq read against which we
         * need ordering is below.
         */
        if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
                return false;

        /* Never block here: contention is reported as "locked". */
        if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
                return false;

        /*
         * Overflow might produce false locked result.
         * False unlocked result is impossible because we modify and check
         * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
         * modification invalidates all existing locks.
         *
         * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
         * racing with vma_end_write_all(), we only start reading from the VMA
         * after it has been unlocked.
         * This pairs with RELEASE semantics in vma_end_write_all().
         */
        if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
                /* Sequence numbers match: treat as write-locked and back out. */
                up_read(&vma->vm_lock->lock);
                return false;
        }
        return true;
}

/*
 * Drop the read side of vma->vm_lock->lock (the lock vma_start_read() takes).
 * The up_read() is bracketed by an RCU read-side critical section.
 */
static inline void vma_end_read(struct vm_area_struct *vma)
{
        rcu_read_lock(); /* keeps vma alive till the end of up_read */
        up_read(&vma->vm_lock->lock);
        rcu_read_unlock();
}

/*
 * Report whether @vma is write-locked and hand back the mm's current lock
 * sequence number.
 *
 * @vma:         VMA to inspect.
 * @mm_lock_seq: output; always set to vma->vm_mm->mm_lock_seq.
 *
 * Return: true if vma->vm_lock_seq equals the mm's sequence number.
 *
 * WARNING! Can only be used if mmap_lock is expected to be write-locked
 *
 * Marked inline like every other helper defined in this header.
 */
static inline bool __is_vma_write_locked(struct vm_area_struct *vma,
                                         int *mm_lock_seq)
{
        mmap_assert_write_locked(vma->vm_mm);

        /*
         * current task is holding mmap_write_lock, both vma->vm_lock_seq and
         * mm->mm_lock_seq can't be concurrently modified.
         */
        *mm_lock_seq = vma->vm_mm->mm_lock_seq;
        return (vma->vm_lock_seq == *mm_lock_seq);
}

/*
 * Begin writing to a VMA.
 * Exclude concurrent readers under the per-VMA lock until the currently
 * write-locked mmap_lock is dropped or downgraded.
 *
 * Returns early if the VMA is already write-locked.  Otherwise the rwsem is
 * taken for write only long enough to stamp vma->vm_lock_seq with the mm's
 * current sequence number, and is released again before returning.
 */
static inline void vma_start_write(struct vm_area_struct *vma)
{
        int mm_lock_seq;

        /* Also fetches the mm's current sequence number into mm_lock_seq. */
        if (__is_vma_write_locked(vma, &mm_lock_seq))
                return;

        down_write(&vma->vm_lock->lock);
        /*
         * We should use WRITE_ONCE() here because we can have concurrent reads
         * from the early lockless pessimistic check in vma_start_read().
         * We don't really care about the correctness of that early check, but
         * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
         */
        WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
        up_write(&vma->vm_lock->lock);
}

/*
 * Debug assertion that @vma is write-locked, i.e. that its vm_lock_seq
 * matches the mm's current sequence number.
 */
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
        /* Only needed as the out-parameter of __is_vma_write_locked(). */
        int mm_lock_seq;

        VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
}

/*
 * Assert that @vma is locked in some fashion: either its rwsem is currently
 * held, or else the VMA must be write-locked.
 */
static inline void vma_assert_locked(struct vm_area_struct *vma)
{
        /* rwsem held by someone: nothing further to check. */
        if (rwsem_is_locked(&vma->vm_lock->lock))
                return;

        vma_assert_write_locked(vma);
}

/*
 * Record in vma->detached whether the VMA is detached.  Marking a VMA as
 * detached additionally asserts that it is write-locked; clearing the mark
 * performs no check.
 */
static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
{
        /* When detaching vma should be write-locked */
        if (detached)
                vma_assert_write_locked(vma);
        vma->detached = detached;
}

/*
 * Drop whichever lock protects this fault: the per-VMA read lock when
 * FAULT_FLAG_VMA_LOCK is set in vmf->flags, otherwise the mm's mmap_lock
 * (read side).
 */
static inline void release_fault_lock(struct vm_fault *vmf)
{
        if (!(vmf->flags & FAULT_FLAG_VMA_LOCK)) {
                mmap_read_unlock(vmf->vma->vm_mm);
                return;
        }

        vma_end_read(vmf->vma);
}

/*
 * Assert that the lock matching this fault is held: the VMA lock when
 * FAULT_FLAG_VMA_LOCK is set in vmf->flags, otherwise the mm's mmap_lock.
 */
static inline void assert_fault_locked(struct vm_fault *vmf)
{
        if (!(vmf->flags & FAULT_FLAG_VMA_LOCK)) {
                mmap_assert_locked(vmf->vma->vm_mm);
                return;
        }

        vma_assert_locked(vmf->vma);
}

struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                                          unsigned long address);

#else /* CONFIG_PER_VMA_LOCK */

/* No per-VMA locks configured: the read-lock attempt always reports failure. */
static inline bool vma_start_read(struct vm_area_struct *vma)
{
        return false;
}
/* No per-VMA locks configured: nothing to release. */
static inline void vma_end_read(struct vm_area_struct *vma)
{
}
/* No per-VMA locks configured: write-locking a VMA is a no-op. */
static inline void vma_start_write(struct vm_area_struct *vma)
{
}
/*
 * No per-VMA locks configured: the write-lock assertion is made against
 * the mm's mmap_lock instead.
 */
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
        mmap_assert_write_locked(vma->vm_mm);
}
/* No per-VMA locks configured: the detached state is not recorded. */
static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
{
}

/*
 * No per-VMA locks configured: always returns NULL, whatever @mm and
 * @address are.
 */
static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                unsigned long address)
{
        return NULL;
}

/*
 * No per-VMA locks configured: "VMA is locked" is asserted via the mm's
 * mmap_lock.
 */
static inline void vma_assert_locked(struct vm_area_struct *vma)
{
        mmap_assert_locked(vma->vm_mm);
}

/*
 * No per-VMA locks configured: the read side of the mm's mmap_lock is the
 * only lock this helper releases.
 */
static inline void release_fault_lock(struct vm_fault *vmf)
{
        struct mm_struct *mm = vmf->vma->vm_mm;

        mmap_read_unlock(mm);
}

/*
 * No per-VMA locks configured: assert that the mmap_lock of the faulting
 * VMA's mm is held.
 */
static inline void assert_fault_locked(struct vm_fault *vmf)
{
        mmap_assert_locked(vmf->vma->vm_mm);
}

#endif /* CONFIG_PER_VMA_LOCK */

extern const struct vm_operations_struct vma_dummy_vm_ops;

/*
 * Bring a caller-provided vm_area_struct into a known initial state: the
 * whole structure is zeroed, then the owning mm, the dummy vm_ops, the
 * anon_vma_chain list head, the detached mark and the NUMA-balancing state
 * are set up.
 *
 * WARNING: vma_init does not initialize vma->vm_lock.
 * Use vm_area_alloc()/vm_area_free() if vma needs locking.
 */
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
        /* Must come first: everything below overwrites zeroed fields. */
        memset(vma, 0, sizeof(*vma));
        vma->vm_mm = mm;
        vma->vm_ops = &vma_dummy_vm_ops;
        INIT_LIST_HEAD(&vma->anon_vma_chain);
        vma_mark_detached(vma, false);
        vma_numab_state_init(vma);
}

/*
 * Use when VMA is not part of the VMA tree and needs no locking.
 *
 * Overwrites the private __vm_flags field with @flags; no lock is taken and
 * no assertion is made.
 */
static inline void vm_flags_init(struct vm_area_struct *vma,
                                 vm_flags_t flags)
{
        ACCESS_PRIVATE(vma, __vm_flags) = flags;
}

/*
 * Replace the VMA's flags wholesale.  For VMAs that are part of the VMA tree,
 * where modifications need coordination.
 *
 * Note: neither vm_flags_reset() nor vm_flags_reset_once() locks the vma;
 * the caller must have write-locked it beforehand, which is asserted here.
 */
static inline void vm_flags_reset(struct vm_area_struct *vma,
                                  vm_flags_t flags)
{
        vma_assert_write_locked(vma);
        ACCESS_PRIVATE(vma, __vm_flags) = flags;
}

/*
 * Replace the VMA's flags wholesale, storing the new value with
 * WRITE_ONCE().  Does not lock the vma; asserts that the caller already
 * write-locked it.
 */
static inline void vm_flags_reset_once(struct vm_area_struct *vma,
                                       vm_flags_t flags)
{
        vma_assert_write_locked(vma);
        WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
}

/* Write-lock the VMA, then turn on every bit of @flags in its flags word. */
static inline void vm_flags_set(struct vm_area_struct *vma,
                                vm_flags_t flags)
{
        vma_start_write(vma);
        ACCESS_PRIVATE(vma, __vm_flags) =
                ACCESS_PRIVATE(vma, __vm_flags) | flags;
}

/* Write-lock the VMA, then turn off every bit of @flags in its flags word. */
static inline void vm_flags_clear(struct vm_area_struct *vma,
                                  vm_flags_t flags)
{
        vma_start_write(vma);
        ACCESS_PRIVATE(vma, __vm_flags) =
                ACCESS_PRIVATE(vma, __vm_flags) & ~flags;
}

/*
 * Lockless set-then-clear of VMA flags: @set bits are ORed in first, then
 * @clear bits are masked out, so a bit present in both ends up cleared.
 *
 * Use only if VMA is not part of the VMA tree or has no other users and
 * therefore needs no locking.
 */
static inline void __vm_flags_mod(struct vm_area_struct *vma,
                                  vm_flags_t set, vm_flags_t clear)
{
        vm_flags_t updated = (vma->vm_flags | set) & ~clear;

        vm_flags_init(vma, updated);
}

/*
 * Write-lock the VMA, then apply @set and @clear in a single update via
 * __vm_flags_mod().
 *
 * Use only when the order of set/clear operations is unimportant, otherwise
 * use vm_flags_{set|clear} explicitly.
 */
static inline void vm_flags_mod(struct vm_area_struct *vma,
                                vm_flags_t set, vm_flags_t clear)
{
        vma_start_write(vma);
        __vm_flags_mod(vma, set, clear);
}

/*
 * Mark a VMA as anonymous by clearing its vm_ops pointer; a NULL vm_ops is
 * exactly what vma_is_anonymous() tests for.
 */
static inline void vma_set_anonymous(struct vm_area_struct *vma)
{
        vma->vm_ops = NULL;
}

/* A VMA counts as anonymous when it has no vm_ops table at all. */
static inline bool vma_is_anonymous(struct vm_area_struct *vma)
{
        return vma->vm_ops == NULL;
}

/*
 * Indicate if the VMA is a heap for the given task; for
 * /proc/PID/maps that is the heap of the main task.
 */
static inline bool vma_is_initial_heap(const struct vm_area_struct *vma)
{
        return vma->vm_start < vma->vm_mm->brk &&
                vma->vm_end > vma->vm_mm->start_brk;
}

/*
 * Indicate if the VMA is a stack for the given task; for
 * /proc/PID/maps that is the stack of the main task.
 */
static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
{
        /*
         * We make no effort to guess what a given thread considers to be
         * its "stack".  It's not even well-defined for programs written
         * languages like Go.
         */
        return vma->vm_start <= vma->vm_mm->start_stack &&
                vma->vm_end >= vma->vm_mm->start_stack;
}

/*
 * Tell whether @vma is a growable VMA whose setup is still incomplete.
 *
 * Return: true only if the VMA has VM_GROWSDOWN or VM_GROWSUP set *and* every
 * bit of VM_STACK_INCOMPLETE_SETUP is set in its flags; false otherwise.
 *
 * The flags word is tested directly rather than being copied into an int
 * first, so no flag bit can be lost to a narrowing conversion.
 */
static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
{
        /* Not growable in either direction: cannot be a stack. */
        if (!(vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP)))
                return false;

        /* All bits of the mask must be present, hence == rather than != 0. */
        return (vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
                                                VM_STACK_INCOMPLETE_SETUP;
}

/*
 * A VMA is foreign to the current task when the task has no mm at all, or
 * when the task's mm is not the one the VMA belongs to.
 */
static inline bool vma_is_foreign(struct vm_area_struct *vma)
{
        return !current->mm || current->mm != vma->vm_mm;
}

/* True when at least one of the VM_ACCESS_FLAGS bits is set on the VMA. */
static inline bool vma_is_accessible(struct vm_area_struct *vma)
{
        return (vma->vm_flags & VM_ACCESS_FLAGS) != 0;
}

/* True only when both VM_SHARED and VM_MAYWRITE are set in @vm_flags. */
static inline bool is_shared_maywrite(vm_flags_t vm_flags)
{
        const vm_flags_t wanted = VM_SHARED | VM_MAYWRITE;

        return (vm_flags & wanted) == wanted;
}

/* VMA flavour of is_shared_maywrite(): applies it to vma->vm_flags. */
static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
{
        return is_shared_maywrite(vma->vm_flags);
}

/*
 * Look up a VMA through the iterator with mas_find().  @max is an exclusive
 * end address; it is turned into the inclusive limit (max - 1) that is handed
 * to mas_find().
 */
static inline
struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
{
        unsigned long last = max - 1;

        return mas_find(&vmi->mas, last);
}

/*
 * Advance the iterator and return the VMA found, searching with an upper
 * limit of ULONG_MAX.  for_each_vma() stops when this returns NULL.
 */
static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
{
        /*
         * Uses mas_find() to get the first VMA when the iterator starts.
         * Calling mas_next() could skip the first entry.
         */
        return mas_find(&vmi->mas, ULONG_MAX);
}

/*
 * Thin wrapper: mas_next_range() on the iterator's mas with an upper limit
 * of ULONG_MAX.
 * NOTE(review): presumably steps to the next range even when it is empty
 * (result may be NULL) - confirm against mas_next_range().
 */
static inline
struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
{
        return mas_next_range(&vmi->mas, ULONG_MAX);
}


/* Thin wrapper: mas_prev() on the iterator's mas with a lower limit of 0. */
static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
{
        return mas_prev(&vmi->mas, 0);
}

/*
 * Thin wrapper: mas_prev_range() on the iterator's mas with a lower limit
 * of 0.
 * NOTE(review): presumably steps to the previous range even when it is
 * empty (result may be NULL) - confirm against mas_prev_range().
 */
static inline
struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi)
{
        return mas_prev_range(&vmi->mas, 0);
}

/*
 * Return the iterator's current index (vmi->mas.index), i.e. the first
 * address of the range the iterator is positioned on.
 */
static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
{
        return vmi->mas.index;
}

/*
 * Return the exclusive end of the iterator's current range: the stored
 * "last" value is inclusive (see vma_iter_bulk_store(), which stores
 * vm_end - 1), so one is added back.
 */
static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
{
        return vmi->mas.last + 1;
}
/*
 * Tell the iterator's mas that @count entries are expected, by way of
 * mas_expected_entries(); its return value is passed straight through.
 * NOTE(review): presumably 0 on success / negative errno - confirm.
 */
static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
                                      unsigned long count)
{
        return mas_expected_entries(&vmi->mas, count);
}

/*
 * Store NULL over the address range [@start, @end) using allocation flags
 * @gfp.  @end is exclusive and is converted to the inclusive form the mas
 * range expects.
 *
 * Return: 0 on success, -ENOMEM if the store left the mas in an error state.
 */
static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
                        unsigned long start, unsigned long end, gfp_t gfp)
{
        __mas_set_range(&vmi->mas, start, end - 1);
        mas_store_gfp(&vmi->mas, NULL, gfp);

        return unlikely(mas_is_err(&vmi->mas)) ? -ENOMEM : 0;
}

/*
 * Free any unused preallocations.
 *
 * Implemented as mas_destroy() on the iterator's mas.
 */
static inline void vma_iter_free(struct vma_iterator *vmi)
{
        mas_destroy(&vmi->mas);
}

/*
 * Store @vma over its own address range: the iterator's mas is pointed at
 * [vm_start, vm_end - 1] and the VMA is written with mas_store().
 *
 * Return: 0 on success, -ENOMEM if the store left the mas in an error state.
 */
static inline int vma_iter_bulk_store(struct vma_iterator *vmi,
                                      struct vm_area_struct *vma)
{
        vmi->mas.index = vma->vm_start;
        vmi->mas.last = vma->vm_end - 1;
        mas_store(&vmi->mas, vma);

        return unlikely(mas_is_err(&vmi->mas)) ? -ENOMEM : 0;
}

/*
 * Invalidate the iterator's cached position by calling mas_pause() on its
 * mas.
 * NOTE(review): presumably forces a fresh walk on the next use - confirm.
 */
static inline void vma_iter_invalidate(struct vma_iterator *vmi)
{
        mas_pause(&vmi->mas);
}

/* Reposition the iterator at @addr via mas_set(). */
static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr)
{
        mas_set(&vmi->mas, addr);
}

/*
 * Loop over VMAs with iterator @__vmi (passed by name, not by pointer),
 * assigning each one to @__vma; the loop ends when vma_next() returns NULL.
 */
#define for_each_vma(__vmi, __vma)                                        \
        while (((__vma) = vma_next(&(__vmi))) != NULL)

/* The MM code likes to work with exclusive end addresses */
/* Same as for_each_vma(), but bounded by the exclusive end address @__end. */
#define for_each_vma_range(__vmi, __vma, __end)                                \
        while (((__vma) = vma_find(&(__vmi), (__end))) != NULL)

#ifdef CONFIG_SHMEM
/*
 * vma_is_shmem() is deliberately out of line: its only users are slow
 * paths in userfault.
 */
bool vma_is_shmem(struct vm_area_struct *vma);
bool vma_is_anon_shmem(struct vm_area_struct *vma);
#else
/* Without CONFIG_SHMEM no VMA is ever a shmem one. */
static inline bool vma_is_shmem(struct vm_area_struct *vma)
{
        return false;
}

static inline bool vma_is_anon_shmem(struct vm_area_struct *vma)
{
        return false;
}
#endif

int vma_is_stack_for_current(struct vm_area_struct *vma);

/*
 * flush_tlb_range() takes a vma, not a mm, and can care about flags.
 *
 * Expands to a designated initializer that sets only .vm_mm and .vm_flags,
 * suitable for initializing a throw-away VMA to pass to such functions.
 */
#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }

struct mmu_gather;
struct inode;

/*
 * Order of a (possibly compound) page: 0 unless PG_head is set, in which
 * case it is the low 8 bits of the folio's _flags_1 word.
 *
 * compound_order() can be called without holding a reference, which means
 * that niceties like page_folio() don't work.  These callers should be
 * prepared to handle wild return values.  For example, PG_head may be
 * set before the order is initialised, or this may be a tail page.
 * See compaction.c for some good examples.
 */
static inline unsigned int compound_order(struct page *page)
{
        /* Plain cast on purpose: page_folio() is not usable here (see above). */
        struct folio *folio = (struct folio *)page;

        if (test_bit(PG_head, &folio->flags))
                return folio->_flags_1 & 0xff;

        return 0;
}

/**
 * folio_order - The allocation order of a folio.
 * @folio: The folio.
 *
 * A folio is composed of 2^order pages.  See get_order() for the definition
 * of order.  Folios that are not large always have order 0; for large folios
 * the order is kept in the low 8 bits of _flags_1.
 *
 * Return: The order of the folio.
 */
static inline unsigned int folio_order(struct folio *folio)
{
        if (folio_test_large(folio))
                return folio->_flags_1 & 0xff;

        return 0;
}

#include <linux/huge_mm.h>

/*
 * Methods to modify the page usage count.
 *
 * What counts for a page usage:
 * - cache mapping   (page->mapping)
 * - private data    (page->private)
 * - page mapped in a task's page tables, each mapping
 *   is counted separately
 *
 * Also, many kernel routines increase the page count before a critical
 * routine so they can be sure the page doesn't go away from under them.
 */

/*
 * Drop a ref, return true if the refcount fell to zero (the page has no users)
 *
 * Dropping a reference on a page whose count is already zero is a bug and is
 * caught by the VM_BUG_ON_PAGE() check before the decrement.
 */
static inline int put_page_testzero(struct page *page)
{
        VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
        return page_ref_dec_and_test(page);
}

/*
 * Folio flavour of put_page_testzero(): drops one reference through the
 * folio's embedded page and reports whether the count reached zero.
 */
static inline int folio_put_testzero(struct folio *folio)
{
        struct page *head = &folio->page;

        return put_page_testzero(head);
}

/*
 * Try to grab a ref unless the page has a refcount of zero, return false if
 * that is the case.
 * This can be called when MMU is off so it must not access
 * any of the virtual mappings.
 *
 * Implemented as page_ref_add_unless(page, 1, 0): add 1 unless the count
 * is 0.
 */
static inline bool get_page_unless_zero(struct page *page)
{
        return page_ref_add_unless(page, 1, 0);
}

/*
 * Take a reference on @page unless its refcount is zero and hand the page
 * back viewed as a folio (plain cast, no head-page lookup).
 *
 * Return: the folio on success, NULL if the refcount was zero.
 */
static inline struct folio *folio_get_nontail_page(struct page *page)
{
        if (likely(get_page_unless_zero(page)))
                return (struct folio *)page;

        return NULL;
}

extern int page_is_ram(unsigned long pfn);

/*
 * Region classification codes (values 0, 1, 2 in declaration order).
 * NOTE(review): presumably the possible results of region_intersects() -
 * confirm against its definition.
 */
enum {
        REGION_INTERSECTS,
        REGION_DISJOINT,
        REGION_MIXED,
};

int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
                      unsigned long desc);

/* Support for virtually mapped pages */
struct page *vmalloc_to_page(const void *addr);
unsigned long vmalloc_to_pfn(const void *addr);

/*
 * Determine if an address is within the vmalloc range
 *
 * With an MMU the checks live out of line.  On nommu, vmalloc/vfree wrap
 * through kmalloc/kfree directly, so there is no special casing required
 * and both helpers simply answer "no".
 */
#ifdef CONFIG_MMU
extern bool is_vmalloc_addr(const void *x);
extern int is_vmalloc_or_module_addr(const void *x);
#else
static inline bool is_vmalloc_addr(const void *x) { return false; }
static inline int is_vmalloc_or_module_addr(const void *x) { return 0; }
#endif

/*
 * How many times the entire folio is mapped as a single unit (eg by a
 * PMD or PUD entry).  This is probably not what you want, except for
 * debugging purposes - it does not include PTE-mapped sub-pages; look
 * at folio_mapcount() or page_mapcount() instead.
 *
 * Only meaningful for large folios (checked with VM_BUG_ON_FOLIO).  The
 * stored counter is offset by one, hence the +1.
 */
static inline int folio_entire_mapcount(const struct folio *folio)
{
        int raw;

        VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
        raw = atomic_read(&folio->_entire_mapcount);

        return raw + 1;
}

/*
 * Put page->_mapcount back to its initial value.
 *
 * The atomic page->_mapcount, starts from -1: so that transitions
 * both from it and to it can be tracked, using atomic_inc_and_test
 * and atomic_add_negative(-1).
 */
static inline void page_mapcount_reset(struct page *page)
{
        atomic_set(&page->_mapcount, -1);
}

/**
 * page_mapcount() - Number of times this precise page is mapped.
 * @page: The page.
 *
 * The number of times this page is mapped.  If this page is part of
 * a large folio, it includes the number of times this page is mapped
 * as part of that folio.
 *
 * Will report 0 for pages which cannot be mapped into userspace, eg
 * slab, page tables and similar.
 */
static inline int page_mapcount(struct page *page)
{
        /* The stored counter starts at -1, so add one to get the real count. */
        int count = atomic_read(&page->_mapcount) + 1;

        /* Handle page_has_type() pages */
        if (count < PAGE_MAPCOUNT_RESERVE + 1)
                count = 0;

        if (likely(!PageCompound(page)))
                return count;

        /* Compound page: add the mappings of the folio as a whole. */
        return count + folio_entire_mapcount(page_folio(page));
}

/*
 * Mapcount of a large folio, read from folio->_large_mapcount (stored offset
 * by one, hence the +1).  Warns if handed a folio that is not large.
 */
static inline int folio_large_mapcount(const struct folio *folio)
{
        int raw;

        VM_WARN_ON_FOLIO(!folio_test_large(folio), folio);
        raw = atomic_read(&folio->_large_mapcount);

        return raw + 1;
}

/**
 * folio_mapcount() - Number of mappings of this folio.
 * @folio: The folio.
 *
 * The folio mapcount corresponds to the number of present user page table
 * entries that reference any part of a folio. Each such present user page
 * table entry must be paired with exactly one folio reference.
 *
 * For ordinary folios, each user page table entry (PTE/PMD/PUD/...) counts
 * exactly once.
 *
 * For hugetlb folios, each abstracted "hugetlb" user page table entry that
 * references the entire folio counts exactly once, even when such special
 * page table entries are comprised of multiple ordinary page table entries.
 *
 * Will report 0 for pages which cannot be mapped into userspace, such as
 * slab, page tables and similar.
 *
 * Return: The number of times this folio is mapped.
 */
static inline int folio_mapcount(const struct folio *folio)
{
        int mapcount;

        /* Large folios keep their count in a dedicated field. */
        if (unlikely(folio_test_large(folio)))
                return folio_large_mapcount(folio);

        mapcount = atomic_read(&folio->_mapcount) + 1;

        /* Handle page_has_type() pages */
        return mapcount < PAGE_MAPCOUNT_RESERVE + 1 ? 0 : mapcount;
}

/**
 * folio_mapped - Is this folio mapped into userspace?
 * @folio: The folio.
 *
 * Return: True if any page in this folio is referenced by user page tables,
 * i.e. if folio_mapcount() is non-zero and positive.
 */
static inline bool folio_mapped(const struct folio *folio)
{
        return folio_mapcount(folio) > 0;
}

/*
 * Return true if this page is mapped into pagetables.
 * For compound page it returns true if any sub-page of compound page is mapped,
 * even if this particular sub-page is not itself mapped by any PTE or PMD.
 *
 * The answer is that of folio_mapped() for the folio containing @page.
 */
static inline bool page_mapped(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_mapped(folio);
}

/*
 * Translate address @x to its struct page with virt_to_page(), then return
 * the result of compound_head() on that page.
 */
static inline struct page *virt_to_head_page(const void *x)
{
        struct page *pg = virt_to_page(x);

        return compound_head(pg);
}

/*
 * Translate address @x to its struct page with virt_to_page(), then return
 * the folio that page_folio() reports for it.
 */
static inline struct folio *virt_to_folio(const void *x)
{
        struct page *pg = virt_to_page(x);

        return page_folio(pg);
}

void __folio_put(struct folio *folio);

void put_pages_list(struct list_head *pages);

void split_page(struct page *page, unsigned int order);
void folio_copy(struct folio *dst, struct folio *src);

unsigned long nr_free_buffer_pages(void);

/*
 * Returns the number of bytes in this potentially compound page:
 * PAGE_SIZE scaled up by the page's compound order.
 */
static inline unsigned long page_size(struct page *page)
{
        unsigned int order = compound_order(page);

        return PAGE_SIZE << order;
}

/*
 * Returns the number of bits needed for the number of bytes in a page:
 * PAGE_SHIFT plus the page's compound order.
 */
static inline unsigned int page_shift(struct page *page)
{
        unsigned int order = compound_order(page);

        return PAGE_SHIFT + order;
}

/**
 * thp_order - Order of a transparent huge page.
 * @page: Head page of a transparent huge page.
 *
 * Passing a tail page is a bug and is caught by the VM_BUG_ON_PGFLAGS()
 * check.
 *
 * Return: compound_order() of @page.
 */
static inline unsigned int thp_order(struct page *page)
{
        VM_BUG_ON_PGFLAGS(PageTail(page), page);
        return compound_order(page);
}

/**
 * thp_size - Size of a transparent huge page.
 * @page: Head page of a transparent huge page.
 *
 * Return: Number of bytes in this page, i.e. PAGE_SIZE << thp_order(@page).
 */
static inline unsigned long thp_size(struct page *page)
{
        unsigned int order = thp_order(page);

        return PAGE_SIZE << order;
}

#ifdef CONFIG_MMU
/*
 * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
 * servicing faults for write access.  In the normal case, we always want
 * pte_mkwrite.  But get_user_pages can cause write faults for mappings
 * that do not have writing enabled, when used by access_process_vm.
 *
 * Returns @pte unchanged when the VMA lacks VM_WRITE, otherwise the result
 * of pte_mkwrite().
 */
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
        if (unlikely(!(vma->vm_flags & VM_WRITE)))
                return pte;

        return pte_mkwrite(pte, vma);
}

vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
void set_pte_range(struct vm_fault *vmf, struct folio *folio,
                struct page *page, unsigned int nr, unsigned long addr);

vm_fault_t finish_fault(struct vm_fault *vmf);
#endif

/*
 * Multiple processes may "see" the same page. E.g. for untouched
 * mappings of /dev/null, all processes see the same page full of
 * zeroes, and text pages of executables and shared libraries have
 * only one copy in memory, at most, normally.
 *
 * For the non-reserved pages, page_count(page) denotes a reference count.
 *   page_count() == 0 means the page is free. page->lru is then used for
 *   freelist management in the buddy allocator.
 *   page_count() > 0  means the page has been allocated.
 *
 * Pages are allocated by the slab allocator in order to provide memory
 * to kmalloc and kmem_cache_alloc. In this case, the management of the
 * page, and the fields in 'struct page' are the responsibility of mm/slab.c
 * unless a particular usage is carefully commented. (the responsibility of
 * freeing the kmalloc memory is the caller's, of course).
 *
 * A page may be used by anyone else who does a __get_free_page().
 * In this case, page_count still tracks the references, and should only
 * be used through the normal accessor functions. The top bits of page->flags
 * and page->virtual store page management information, but all other fields
 * are unused and could be used privately, carefully. The management of this
 * page is the responsibility of the one who allocated it, and those who have
 * subsequently been given references to it.
 *
 * The other pages (we may call them "pagecache pages") are completely
 * managed by the Linux memory manager: I/O, buffers, swapping etc.
 * The following discussion applies only to them.
 *
 * A pagecache page contains an opaque `private' member, which belongs to the
 * page's address_space. Usually, this is the address of a circular list of
 * the page's disk buffers. PG_private must be set to tell the VM to call
 * into the filesystem to release these pages.
 *
 * A page may belong to an inode's memory mapping. In this case, page->mapping
 * is the pointer to the inode, and page->index is the file offset of the page,
 * in units of PAGE_SIZE.
 *
 * If pagecache pages are not associated with an inode, they are said to be
 * anonymous pages. These may become associated with the swapcache, and in that
 * case PG_swapcache is set, and page->private is an offset into the swapcache.
 *
 * In either case (swapcache or inode backed), the pagecache itself holds one
 * reference to the page. Setting PG_private should also increment the
 * refcount. Each user mapping also has a reference to the page.
 *
 * The pagecache pages are stored in a per-mapping radix tree, which is
 * rooted at mapping->i_pages, and indexed by offset.
 * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
 * lists, we instead now tag pages as dirty/writeback in the radix tree.
 *
 * All pagecache pages may be subject to I/O:
 * - inode pages may need to be read from disk,
 * - inode pages which have been modified and are MAP_SHARED may need
 *   to be written back to the inode on disk,
 * - anonymous pages (including MAP_PRIVATE file mappings) which have been
 *   modified may need to be swapped out to swap space and (later) to be read
 *   back into memory.
 */

#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);

bool __put_devmap_managed_folio_refs(struct folio *folio, int refs);
/*
 * Give __put_devmap_managed_folio_refs() the chance to drop @refs references
 * on @folio.
 *
 * Return: false (nothing was done, the caller must drop the references
 * itself) when the devmap_managed_key static branch is off or the folio is
 * not a zone-device folio; otherwise whatever
 * __put_devmap_managed_folio_refs() returns.
 */
static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
{
        /* Static-key test comes first, before the folio is looked at. */
        if (!static_branch_unlikely(&devmap_managed_key))
                return false;
        if (!folio_is_zone_device(folio))
                return false;
        return __put_devmap_managed_folio_refs(folio, refs);
}
#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
/* No devmap-managed folios in this configuration: never handles the put. */
static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) { return false; }
#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */

/*
 * 127: arbitrary random number, small enough to assemble well
 *
 * Single unsigned comparison: adding 127 to the count reinterpreted as
 * unsigned yields a value <= 127 only when the count is 0 or when the
 * addition wraps around, i.e. when the count lies in the 127 values just
 * below zero.
 * NOTE(review): assumes folio_ref_count() returns a signed int count -
 * confirm.
 */
#define folio_ref_zero_or_close_to_overflow(folio) \
        ((unsigned int) folio_ref_count(folio) + 127u <= 127u)

/**
 * folio_get - Increment the reference count on a folio.
 * @folio: The folio.
 *
 * Context: May be called in any context, as long as you know that
 * you have a refcount on the folio.  If you do not already have one,
 * folio_try_get() may be the right interface for you to use.
 */
static inline void folio_get(struct folio *folio)
{
        /* A zero or nearly-overflowed count here indicates a refcount bug. */
        VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio);
        folio_ref_inc(folio);
}

/*
 * Page flavour of folio_get(): takes a reference on the folio that contains
 * @page.
 */
static inline void get_page(struct page *page)
{
        struct folio *folio = page_folio(page);

        folio_get(folio);
}

/*
 * Take a reference on the head page of @page, refusing (with a one-time
 * warning) if the head's refcount is zero or negative.
 *
 * Return: true if the reference was taken, false otherwise.
 */
static inline __must_check bool try_get_page(struct page *page)
{
        struct page *head = compound_head(page);

        if (WARN_ON_ONCE(page_ref_count(head) <= 0))
                return false;

        page_ref_inc(head);
        return true;
}

/**
 * folio_put - Decrement the reference count on a folio.
 * @folio: The folio.
 *
 * If the folio's reference count reaches zero, the memory will be
 * released back to the page allocator and may be used by another
 * allocation immediately.  Do not access the memory or the struct folio
 * after calling folio_put() unless you can be sure that it wasn't the
 * last reference.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folio_put(struct folio *folio)
{
        /* Other references remain: nothing to release. */
        if (!folio_put_testzero(folio))
                return;

        __folio_put(folio);
}

/**
 * folio_put_refs - Reduce the reference count on a folio.
 * @folio: The folio.
 * @refs: The amount to subtract from the folio's reference count.
 *
 * If the folio's reference count reaches zero, the memory will be
 * released back to the page allocator and may be used by another
 * allocation immediately.  Do not access the memory or the struct folio
 * after calling folio_put_refs() unless you can be sure that these weren't
 * the last references.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folio_put_refs(struct folio *folio, int refs)
{
        /* Other references remain: nothing to release. */
        if (!folio_ref_sub_and_test(folio, refs))
                return;

        __folio_put(folio);
}

void folios_put_refs(struct folio_batch *folios, unsigned int *refs);

/*
 * union release_pages_arg - an array of pages or folios
 *
 * release_pages() releases a simple array of multiple pages, and
 * accepts various different forms of said page array: either
 * a regular old boring array of pages, an array of folios, or
 * an array of encoded page pointers.
 *
 * The transparent union syntax for this kind of "any of these
 * argument types" is all kinds of ugly, so look away.
 */
typedef union {
        /* Plain array of page pointers. */
        struct page **pages;
        /* Array of folio pointers. */
        struct folio **folios;
        /* Array of encoded page pointers. */
        struct encoded_page **encoded_pages;
} release_pages_arg __attribute__ ((__transparent_union__));

void release_pages(release_pages_arg, int nr);

/**
 * folios_put - Decrement the reference count on an array of folios.
 * @folios: The folios.
 *
 * Like folio_put(), but for a batch of folios.  This is more efficient
 * than writing the loop yourself as it will optimise the locks which need
 * to be taken if the folios are freed.  The folios batch is returned
 * empty and ready to be reused for another batch; there is no need to
 * reinitialise it.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folios_put(struct folio_batch *folios)
{
        /*
         * NOTE(review): a NULL @refs array presumably means "drop one
         * reference per folio" - confirm against folios_put_refs().
         */
        folios_put_refs(folios, NULL);
}

/* Drop one reference on the folio that contains @page. */
static inline void put_page(struct page *page)
{
        struct folio *folio = page_folio(page);

        /*
         * Some devmap managed pages need the 2 -> 1 refcount transition to be
         * caught; when the helper reports that it handled the put, the
         * ordinary folio_put() path is skipped.
         */
        if (!put_devmap_managed_folio_refs(folio, 1))
                folio_put(folio);
}

/*
 * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
 * the page's refcount so that two separate items are tracked: the original page
 * reference count, and also a new count of how many pin_user_pages() calls were
 * made against the page. ("gup-pinned" is another term for the latter).
 *
 * With this scheme, pin_user_pages() becomes special: such pages are marked as
 * distinct from normal pages. As such, the unpin_user_page() call (and its
 * variants) must be used in order to release gup-pinned pages.
 *
 * Choice of value:
 *
 * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
 * counts with respect to pin_user_pages() and unpin_user_page() becomes
 * simpler, due to the fact that adding an even power of two to the page
 * refcount has the effect of using only the upper N bits, for the code that
 * counts up using the bias value. This means that the lower bits are left for
 * the exclusive use of the original code that increments and decrements by one
 * (or at least, by much smaller values than the bias value).
 *
 * Of course, once the lower bits overflow into the upper bits (and this is
 * OK, because subtraction recovers the original values), then visual inspection
 * no longer suffices to directly view the separate counts. However, for normal
 * applications that don't have huge page reference counts, this won't be an
 * issue.
 *
 * Locking: the lockless algorithm described in folio_try_get_rcu()
 * provides safe operation for get_user_pages(), page_mkclean() and
 * other calls that race to set up page table entries.
 */
#define GUP_PIN_COUNTING_BIAS (1U << 10)

void unpin_user_page(struct page *page);
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
                                 bool make_dirty);
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
                                      bool make_dirty);
void unpin_user_pages(struct page **pages, unsigned long npages);

/*
 * Return true when, of the two bits VM_SHARED and VM_MAYWRITE, exactly
 * VM_MAYWRITE is set in @flags.
 */
static inline bool is_cow_mapping(vm_flags_t flags)
{
        const vm_flags_t relevant = flags & (VM_SHARED | VM_MAYWRITE);

        return relevant == VM_MAYWRITE;
}

#ifndef CONFIG_MMU
static inline bool is_nommu_shared_mapping(vm_flags_t flags)
{
        /*
         * On NOMMU, "shared" covers ordinary MAP_SHARED mappings as well as
         * selected read-only MAP_PRIVATE file mappings that act as a
         * read-only overlay of a file mapping. Such private mappings could
         * still modify the underlying memory while ptrace is active, so the
         * overlay is only possible when ptrace does not apply. There is no
         * mprotect() that could upgrade write permissions later.
         */
        return (flags & (VM_MAYSHARE | VM_MAYOVERLAY)) != 0;
}
#endif

#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif

/*
 * The identification function is mainly used by the buddy allocator for
 * determining if two pages could be buddies. We are not really identifying
 * the zone since we could be using the section number id if we do not have
 * node id available in page flags.
 * We only guarantee that it will return the same value for two combinable
 * pages in a zone.
 */
static inline int page_zone_id(struct page *page)
{
        /* Shift the zone-id field down to bit 0, then strip everything else. */
        const unsigned long shifted = page->flags >> ZONEID_PGSHIFT;

        return shifted & ZONEID_MASK;
}

#ifdef NODE_NOT_IN_PAGE_FLAGS
int page_to_nid(const struct page *page);
#else
/* Extract the node id stored in the page flags word. */
static inline int page_to_nid(const struct page *page)
{
        const unsigned long flags = PF_POISONED_CHECK(page)->flags;

        return (flags >> NODES_PGSHIFT) & NODES_MASK;
}
#endif

/* Node id of a folio, taken from its embedded page. */
static inline int folio_nid(const struct folio *folio)
{
        const struct page *first = &folio->page;

        return page_to_nid(first);
}

#ifdef CONFIG_NUMA_BALANCING
/* page access time bits need to hold at least 4 seconds */
#define PAGE_ACCESS_TIME_MIN_BITS        12
#if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS
#define PAGE_ACCESS_TIME_BUCKETS                                \
        (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT)
#else
#define PAGE_ACCESS_TIME_BUCKETS        0
#endif

#define PAGE_ACCESS_TIME_MASK                                \
        (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS)

/* Pack the masked @cpu and @pid into one cpupid value (cpu above pid). */
static inline int cpu_pid_to_cpupid(int cpu, int pid)
{
        const int cpu_bits = (cpu & LAST__CPU_MASK) << LAST__PID_SHIFT;
        const int pid_bits = pid & LAST__PID_MASK;

        return cpu_bits | pid_bits;
}

/* Extract the pid bits (the low LAST__PID_MASK bits) from @cpupid. */
static inline int cpupid_to_pid(int cpupid)
{
        return cpupid & LAST__PID_MASK;
}

/* Extract the cpu bits, which sit above the pid bits, from @cpupid. */
static inline int cpupid_to_cpu(int cpupid)
{
        const int above_pid = cpupid >> LAST__PID_SHIFT;

        return above_pid & LAST__CPU_MASK;
}

/* Map the cpu recorded in @cpupid to its node. */
static inline int cpupid_to_nid(int cpupid)
{
        const int cpu = cpupid_to_cpu(cpupid);

        return cpu_to_node(cpu);
}

/* True when the pid bits of @cpupid are all ones (i.e. -1 after masking). */
static inline bool cpupid_pid_unset(int cpupid)
{
        const int unset_pid = -1 & LAST__PID_MASK;

        return cpupid_to_pid(cpupid) == unset_pid;
}

/* True when the cpu bits of @cpupid are all ones (i.e. -1 after masking). */
static inline bool cpupid_cpu_unset(int cpupid)
{
        const int unset_cpu = -1 & LAST__CPU_MASK;

        return cpupid_to_cpu(cpupid) == unset_cpu;
}

/* Compare the masked @task_pid against the pid bits stored in @cpupid. */
static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
{
        const int recorded = cpupid_to_pid(cpupid);

        return (task_pid & LAST__PID_MASK) == recorded;
}

/*
 * Compare @task's pid against the pid bits of @cpupid. The @task argument is
 * parenthesised so that expression arguments expand correctly.
 */
#define cpupid_match_pid(task, cpupid) __cpupid_match_pid((task)->pid, cpupid)
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
/* Exchange the stored cpupid for the masked @cpupid; returns the old value. */
static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
        const int masked = cpupid & LAST_CPUPID_MASK;

        return xchg(&folio->_last_cpupid, masked);
}

/* The cpupid lives in its own field in this configuration; read it back. */
static inline int folio_last_cpupid(struct folio *folio)
{
        return folio->_last_cpupid;
}
/* Reset the stored cpupid to all ones within LAST_CPUPID_MASK. */
static inline void page_cpupid_reset_last(struct page *page)
{
        page->_last_cpupid = -1 & LAST_CPUPID_MASK;
}
#else
/* Extract the cpupid field from the folio flags word. */
static inline int folio_last_cpupid(struct folio *folio)
{
        const unsigned long shifted = folio->flags >> LAST_CPUPID_PGSHIFT;

        return shifted & LAST_CPUPID_MASK;
}

int folio_xchg_last_cpupid(struct folio *folio, int cpupid);

/* Reset the cpupid field in the page flags by setting all of its bits. */
static inline void page_cpupid_reset_last(struct page *page)
{
        page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */

/*
 * Store @time, scaled down by PAGE_ACCESS_TIME_BUCKETS bits, in the cpupid
 * slot and return the previously stored value scaled back up.
 */
static inline int folio_xchg_access_time(struct folio *folio, int time)
{
        const int bucketed = time >> PAGE_ACCESS_TIME_BUCKETS;
        const int previous = folio_xchg_last_cpupid(folio, bucketed);

        return previous << PAGE_ACCESS_TIME_BUCKETS;
}

/*
 * Mark the current task in @vma's pids_active[1] bitmap, using a hash of its
 * pid as the bit number. Does nothing when the VMA has no numab_state.
 */
static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
{
        unsigned int pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));

        if (!vma->numab_state)
                return;

        /* Skip the write when the bit is already set. */
        if (test_bit(pid_bit, &vma->numab_state->pids_active[1]))
                return;

        __set_bit(pid_bit, &vma->numab_state->pids_active[1]);
}
#else /* !CONFIG_NUMA_BALANCING */
/*
 * Stub for !CONFIG_NUMA_BALANCING: nothing is stored; the folio's node id is
 * returned in place of a cpupid.
 */
static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
        return folio_nid(folio); /* XXX */
}

/* Stub for !CONFIG_NUMA_BALANCING: no access time is tracked. */
static inline int folio_xchg_access_time(struct folio *folio, int time)
{
        return 0;
}

/* Stub for !CONFIG_NUMA_BALANCING: returns the folio's node id. */
static inline int folio_last_cpupid(struct folio *folio)
{
        return folio_nid(folio); /* XXX */
}

/*
 * Stubs for !CONFIG_NUMA_BALANCING: no cpupid is encoded, so every decode
 * and encode helper returns -1 and the pid is always reported as unset.
 */
static inline int cpupid_to_nid(int cpupid)
{
        return -1;
}

static inline int cpupid_to_pid(int cpupid)
{
        return -1;
}

static inline int cpupid_to_cpu(int cpupid)
{
        return -1;
}

static inline int cpu_pid_to_cpupid(int nid, int pid)
{
        return -1;
}

static inline bool cpupid_pid_unset(int cpupid)
{
        return true;
}

/* Stub for !CONFIG_NUMA_BALANCING: there is no cpupid to reset. */
static inline void page_cpupid_reset_last(struct page *page)
{
}

/* Stub for !CONFIG_NUMA_BALANCING: a task never matches. */
static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
{
        return false;
}

/* Stub for !CONFIG_NUMA_BALANCING: no per-VMA access bitmap is updated. */
static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
{
}
#endif /* CONFIG_NUMA_BALANCING */

#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)

/*
 * KASAN per-page tags are stored xor'ed with 0xff. This avoids having to
 * set the tags of all pages to the native kernel tag value 0xff, because
 * the default stored value 0x00 then maps to 0xff.
 */

/*
 * Return the KASAN tag of @page: the tag field of the page flags xor'ed with
 * 0xff, or KASAN_TAG_KERNEL when KASAN is not enabled.
 */
static inline u8 page_kasan_tag(const struct page *page)
{
        if (!kasan_enabled())
                return KASAN_TAG_KERNEL;

        /* Undo the xor applied when the tag was stored. */
        return ((page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK) ^ 0xff;
}

/*
 * Store @tag (xor'ed with 0xff) in the KASAN tag field of the page flags.
 * Does nothing when KASAN is not enabled.
 */
static inline void page_kasan_tag_set(struct page *page, u8 tag)
{
        unsigned long old_flags, flags;

        if (!kasan_enabled())
                return;

        /* Tags are kept xor'ed with 0xff in the flags word. */
        tag ^= 0xff;
        old_flags = READ_ONCE(page->flags);
        /*
         * Replace only the tag field, retrying whenever the flags word was
         * changed underneath us between the read and the compare-exchange.
         */
        do {
                flags = old_flags;
                flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
                flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
        } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
}

/* Reset the tag of @page to KASAN_TAG_KERNEL when KASAN is enabled. */
static inline void page_kasan_tag_reset(struct page *page)
{
        if (!kasan_enabled())
                return;

        page_kasan_tag_set(page, KASAN_TAG_KERNEL);
}

#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */

/* Without tag-based KASAN every page reports the tag 0xff. */
static inline u8 page_kasan_tag(const struct page *page)
{
        return 0xff;
}

/* Without tag-based KASAN there is no per-page tag to set or reset. */
static inline void page_kasan_tag_set(struct page *page, u8 tag) { }
static inline void page_kasan_tag_reset(struct page *page) { }

#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */

static inline struct zone *page_zone(const struct page *page)
{
        return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
}

/* Return the node data of the node that @page belongs to. */
static inline pg_data_t *page_pgdat(const struct page *page)
{
        const int nid = page_to_nid(page);

        return NODE_DATA(nid);
}

/* Zone of a folio, determined from its embedded page. */
static inline struct zone *folio_zone(const struct folio *folio)
{
        const struct page *first = &folio->page;

        return page_zone(first);
}

/* Node data of a folio, determined from its embedded page. */
static inline pg_data_t *folio_pgdat(const struct folio *folio)
{
        const struct page *first = &folio->page;

        return page_pgdat(first);
}

#ifdef SECTION_IN_PAGE_FLAGS
/* Store @section in the section field of the page flags. */
static inline void set_page_section(struct page *page, unsigned long section)
{
        /* Clear the old section bits first ... */
        page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
        /* ... then install the new, masked value. */
        page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
}

/* Read the section number back from the page flags. */
static inline unsigned long page_to_section(const struct page *page)
{
        const unsigned long shifted = page->flags >> SECTIONS_PGSHIFT;

        return shifted & SECTIONS_MASK;
}
#endif

/**
 * folio_pfn - Return the Page Frame Number of a folio.
 * @folio: The folio.
 *
 * A folio may contain multiple pages.  The pages have consecutive
 * Page Frame Numbers.
 *
 * Return: The Page Frame Number of the first page in the folio.
 */
static inline unsigned long folio_pfn(struct folio *folio)
{
        /* The folio's PFN is that of its first page. */
        struct page *first = &folio->page;

        return page_to_pfn(first);
}

/* Return the folio containing the page at Page Frame Number @pfn. */
static inline struct folio *pfn_folio(unsigned long pfn)
{
        struct page *page = pfn_to_page(pfn);

        return page_folio(page);
}

/**
 * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA.
 * @folio: The folio.
 *
 * This function checks if a folio has been pinned via a call to
 * a function in the pin_user_pages() family.
 *
 * For small folios, the return value is partially fuzzy: false is not fuzzy,
 * because it means "definitely not pinned for DMA", but true means "probably
 * pinned for DMA, but possibly a false positive due to having at least
 * GUP_PIN_COUNTING_BIAS worth of normal folio references".
 *
 * False positives are OK, because: a) it's unlikely for a folio to
 * get that many refcounts, and b) all the callers of this routine are
 * expected to be able to deal gracefully with a false positive.
 *
 * For large folios, the result will be exactly correct. That's because
 * we have more tracking data available: the _pincount field is used
 * instead of the GUP_PIN_COUNTING_BIAS scheme.
 *
 * For more information, please see Documentation/core-api/pin_user_pages.rst.
 *
 * Return: True, if it is likely that the page has been "dma-pinned".
 * False, if the page is definitely not dma-pinned.
 */
static inline bool folio_maybe_dma_pinned(struct folio *folio)
{
        unsigned int refs;

        /* Large folios track pins in a dedicated counter. */
        if (folio_test_large(folio))
                return atomic_read(&folio->_pincount) > 0;

        /*
         * folio_ref_count() is signed and goes negative once the refcount
         * overflows, at which point callers stop incrementing it. Viewing
         * the value as unsigned lets the sign bit count as magnitude, so the
         * comparison below still gives an accurate answer in that case.
         */
        refs = (unsigned int)folio_ref_count(folio);

        return refs >= GUP_PIN_COUNTING_BIAS;
}

/* Page-based wrapper around folio_maybe_dma_pinned(). */
static inline bool page_maybe_dma_pinned(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_maybe_dma_pinned(folio);
}

/*
 * This should most likely only be called during fork() to see whether we
 * should break the cow immediately for an anon page on the src mm.
 *
 * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq.
 */
static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
                                          struct folio *folio)
{
        VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));

        if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
                return false;

        return folio_maybe_dma_pinned(folio);
}

/**
 * is_zero_page - Query if a page is a zero page
 * @page: The page to query
 *
 * This returns true if @page is one of the permanent zero pages.
 */
static inline bool is_zero_page(const struct page *page)
{
        /* The decision is made on the page's frame number. */
        const unsigned long pfn = page_to_pfn(page);

        return is_zero_pfn(pfn);
}

/**
 * is_zero_folio - Query if a folio is a zero page
 * @folio: The folio to query
 *
 * This returns true if @folio is one of the permanent zero pages.
 */
static inline bool is_zero_folio(const struct folio *folio)
{
        /* Delegate to the page-based check on the folio's embedded page. */
        const struct page *first = &folio->page;

        return is_zero_page(first);
}

/* MIGRATE_CMA and ZONE_MOVABLE do not allow pinning folios */
#ifdef CONFIG_MIGRATION
/* Decide whether @folio may be the target of a long-term pin. */
static inline bool folio_is_longterm_pinnable(struct folio *folio)
{
#ifdef CONFIG_CMA
        const int migratetype = folio_migratetype(folio);

        /* CMA and isolated pageblocks must not be pinned long-term. */
        if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_CMA)
                return false;
#endif
        /* The zero page can be "pinned" but gets special handling. */
        if (is_zero_folio(folio))
                return true;

        /*
         * Coherent device memory must always allow eviction; anything else
         * is pinnable as long as it is not in the movable zone.
         */
        return !folio_is_device_coherent(folio) &&
               !folio_is_zone_movable(folio);
}
#else
/* Without CONFIG_MIGRATION every folio is reported as long-term pinnable. */
static inline bool folio_is_longterm_pinnable(struct folio *folio)
{
        return true;
}
#endif

/* Store @zone in the zone field of the page flags. */
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
        /* Clear the old zone bits, then install the new masked value. */
        page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
        page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}

/* Store @node in the node field of the page flags. */
static inline void set_page_node(struct page *page, unsigned long node)
{
        /* Clear the old node bits, then install the new masked value. */
        page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
        page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
}

/*
 * Record zone and node in the page flags and, when sections are kept in the
 * page flags (SECTION_IN_PAGE_FLAGS), the section number derived from @pfn.
 */
static inline void set_page_links(struct page *page, enum zone_type zone,
        unsigned long node, unsigned long pfn)
{
        set_page_zone(page, zone);
        set_page_node(page, node);
#ifdef SECTION_IN_PAGE_FLAGS
        set_page_section(page, pfn_to_section_nr(pfn));
#endif
}

/**
 * folio_nr_pages - The number of pages in the folio.
 * @folio: The folio.
 *
 * Return: A positive power of two.
 */
static inline long folio_nr_pages(const struct folio *folio)
{
        if (folio_test_large(folio)) {
#ifdef CONFIG_64BIT
                /* 64-bit builds read the stored page count directly. */
                return folio->_folio_nr_pages;
#else
                /* Otherwise derive it from the low byte of _flags_1. */
                return 1L << (folio->_flags_1 & 0xff);
#endif
        }

        /* A folio that is not large is exactly one page. */
        return 1;
}

/* Only hugetlbfs can allocate folios larger than MAX_ORDER */
#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
#define MAX_FOLIO_NR_PAGES        (1UL << PUD_ORDER)
#else
#define MAX_FOLIO_NR_PAGES        MAX_ORDER_NR_PAGES
#endif

/*
 * compound_nr() returns the number of pages in this potentially compound
 * page.  compound_nr() can be called on a tail page, and is defined to
 * return 1 in that case.
 */
static inline unsigned long compound_nr(struct page *page)
{
        struct folio *folio = (struct folio *)page;

        if (test_bit(PG_head, &folio->flags)) {
#ifdef CONFIG_64BIT
                return folio->_folio_nr_pages;
#else
                return 1L << (folio->_flags_1 & 0xff);
#endif
        }

        /* Anything without PG_head set (tail pages included) counts as 1. */
        return 1;
}

/**
 * thp_nr_pages - The number of regular pages in this huge page.
 * @page: The head page of a huge page.
 */
static inline int thp_nr_pages(struct page *page)
{
        /* @page is a head page, so it can be viewed as its folio. */
        struct folio *folio = (struct folio *)page;

        return folio_nr_pages(folio);
}

/**
 * folio_next - Move to the next physical folio.
 * @folio: The folio we're currently operating on.
 *
 * If you have physically contiguous memory which may span more than
 * one folio (eg a &struct bio_vec), use this function to move from one
 * folio to the next.  Do not use it if the memory is only virtually
 * contiguous as the folios are almost certainly not adjacent to each
 * other.  This is the folio equivalent to writing ``page++``.
 *
 * Context: We assume that the folios are refcounted and/or locked at a
 * higher level and do not adjust the reference counts.
 * Return: The next struct folio.
 */
static inline struct folio *folio_next(struct folio *folio)
{
        /* Step over every page of this folio to reach the following one. */
        const long nr = folio_nr_pages(folio);

        return (struct folio *)folio_page(folio, nr);
}

/**
 * folio_shift - The size of the memory described by this folio.
 * @folio: The folio.
 *
 * A folio represents a number of bytes which is a power-of-two in size.
 * This function tells you which power-of-two the folio is.  See also
 * folio_size() and folio_order().
 *
 * Context: The caller should have a reference on the folio to prevent
 * it from being split.  It is not necessary for the folio to be locked.
 * Return: The base-2 logarithm of the size of this folio.
 */
static inline unsigned int folio_shift(struct folio *folio)
{
        /* log2(bytes) = log2(page size) + log2(pages in the folio). */
        const unsigned int order = folio_order(folio);

        return PAGE_SHIFT + order;
}

/**
 * folio_size - The number of bytes in a folio.
 * @folio: The folio.
 *
 * Context: The caller should have a reference on the folio to prevent
 * it from being split.  It is not necessary for the folio to be locked.
 * Return: The number of bytes in this folio.
 */
static inline size_t folio_size(struct folio *folio)
{
        /* One page, scaled up by the folio's order. */
        const unsigned int order = folio_order(folio);

        return PAGE_SIZE << order;
}

/**
 * folio_likely_mapped_shared - Estimate if the folio is mapped into the page
 *                                tables of more than one MM
 * @folio: The folio.
 *
 * This function checks if the folio is currently mapped into more than one
 * MM ("mapped shared"), or if the folio is only mapped into a single MM
 * ("mapped exclusively").
 *
 * As precise information is not easily available for all folios, this function
 * estimates the number of MMs ("sharers") that are currently mapping a folio
 * using the number of times the first page of the folio is currently mapped
 * into page tables.
 *
 * For small anonymous folios (except KSM folios) and anonymous hugetlb folios,
 * the return value will be exactly correct, because they can only be mapped
 * at most once into an MM, and they cannot be partially mapped.
 *
 * For other folios, the result can be fuzzy:
 *    #. For partially-mappable large folios (THP), the return value can wrongly
 *       indicate "mapped exclusively" (false negative) when the folio is
 *       only partially mapped into at least one MM.
 *    #. For pagecache folios (including hugetlb), the return value can wrongly
 *       indicate "mapped shared" (false positive) when two VMAs in the same MM
 *       cover the same file range.
 *    #. For (small) KSM folios, the return value can wrongly indicate "mapped
 *       shared" (false positive), when the folio is mapped multiple times into
 *       the same MM.
 *
 * Further, this function only considers current page table mappings that
 * are tracked using the folio mapcount(s).
 *
 * This function does not consider:
 *    #. If the folio might get mapped in the (near) future (e.g., swapcache,
 *       pagecache, temporary unmapping for migration).
 *    #. If the folio is mapped differently (VM_PFNMAP).
 *    #. If hugetlb page table sharing applies. Callers might want to check
 *       hugetlb_pmd_shared().
 *
 * Return: Whether the folio is estimated to be mapped into more than one MM.
 */
static inline bool folio_likely_mapped_shared(struct folio *folio)
{
        const int nr_mappings = folio_mapcount(folio);

        /* Small and hugetlb folios: the mapcount alone decides. */
        if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio)))
                return nr_mappings > 1;

        /* At most one mapping means "mapped exclusively". */
        if (nr_mappings <= 1)
                return false;

        /* An entire mapping on top of other mappings means shared. */
        if (folio_entire_mapcount(folio))
                return true;

        /* More mappings than pages: some page is mapped more than once. */
        if (nr_mappings > folio_nr_pages(folio))
                return true;

        /* Otherwise guess from the mapcount of the first subpage. */
        return atomic_read(&folio->_mapcount) > 0;
}

#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
/*
 * Default used when the architecture does not define
 * HAVE_ARCH_MAKE_PAGE_ACCESSIBLE: nothing to do, always returns 0.
 */
static inline int arch_make_page_accessible(struct page *page)
{
        return 0;
}
#endif

#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
/*
 * Default used when the architecture does not define
 * HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE: call arch_make_page_accessible() on each
 * page of @folio in order, stopping at the first failure.
 *
 * Return: 0 if every page succeeded, otherwise the first non-zero value
 * returned by arch_make_page_accessible().
 */
static inline int arch_make_folio_accessible(struct folio *folio)
{
        /* Start from success so the result is defined even for zero pages. */
        int ret = 0;
        long i, nr = folio_nr_pages(folio);

        for (i = 0; i < nr; i++) {
                ret = arch_make_page_accessible(folio_page(folio, i));
                if (ret)
                        break;
        }

        return ret;
}
#endif

/*
 * Some inline functions in vmstat.h depend on page_zone()
 */
#include <linux/vmstat.h>

#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
#define HASHED_PAGE_VIRTUAL
#endif

#if defined(WANT_PAGE_VIRTUAL)
/* WANT_PAGE_VIRTUAL: the address is kept in the page's ->virtual field. */
static inline void *page_address(const struct page *page)
{
        return page->virtual;
}
/* WANT_PAGE_VIRTUAL: record @address in the page's ->virtual field. */
static inline void set_page_address(struct page *page, void *address)
{
        page->virtual = address;
}
#define page_address_init()  do { } while(0)
#endif

#if defined(HASHED_PAGE_VIRTUAL)
void *page_address(const struct page *page);
void set_page_address(struct page *page, void *virtual);
void page_address_init(void);
#endif

/* Translate @page to a virtual address via page_to_virt(). */
static __always_inline void *lowmem_page_address(const struct page *page)
{
        void *vaddr = page_to_virt(page);

        return vaddr;
}

#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
#define page_address(page) lowmem_page_address(page)
#define set_page_address(page, address)  do { } while(0)
#define page_address_init()  do { } while(0)
#endif

/* Address of a folio: the page_address() of its embedded page. */
static inline void *folio_address(const struct folio *folio)
{
        const struct page *first = &folio->page;

        return page_address(first);
}

extern pgoff_t __page_file_index(struct page *page);

/*
 * Return the pagecache index of the passed page.  Regular pagecache pages
 * use ->index whereas swapcache pages use swp_offset(->private)
 */
static inline pgoff_t page_index(struct page *page)
{
        /* Ordinary pagecache pages carry their index directly. */
        if (!unlikely(PageSwapCache(page)))
                return page->index;

        /* Swapcache pages need the out-of-line lookup. */
        return __page_file_index(page);
}

/*
 * Return true only if the page has been allocated with
 * ALLOC_NO_WATERMARKS and the low watermark was not
 * met implying that the system is under some pressure.
 */
static inline bool page_is_pfmemalloc(const struct page *page)
{
        /*
         * Bit 1 of lru.next is set when the page came from the pfmemalloc
         * reserves. Callers that do not need to keep this information may
         * simply overwrite the field.
         */
        const uintptr_t next = (uintptr_t)page->lru.next;

        return (next & BIT(1)) != 0;
}

/*
 * Return true only if the folio has been allocated with
 * ALLOC_NO_WATERMARKS and the low watermark was not
 * met implying that the system is under some pressure.
 */
static inline bool folio_is_pfmemalloc(const struct folio *folio)
{
        /*
         * Bit 1 of lru.next is set when the folio came from the pfmemalloc
         * reserves. Callers that do not need to keep this information may
         * simply overwrite the field.
         */
        const uintptr_t next = (uintptr_t)folio->lru.next;

        return (next & BIT(1)) != 0;
}

/*
 * Only to be called by the page allocator on a freshly allocated
 * page.
 */
static inline void set_page_pfmemalloc(struct page *page)
{
        /* Overwrite lru.next with a value that has only bit 1 set. */
        page->lru.next = (void *)BIT(1);
}

/* Clear the pfmemalloc marker by resetting lru.next to NULL. */
static inline void clear_page_pfmemalloc(struct page *page)
{
        page->lru.next = NULL;
}

/*
 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
 */
extern void pagefault_out_of_memory(void);

#define offset_in_page(p)        ((unsigned long)(p) & ~PAGE_MASK)
#define offset_in_thp(page, p)        ((unsigned long)(p) & (thp_size(page) - 1))
#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))

/*
 * Parameter block passed down to zap_pte_range in exceptional cases.
 * Callers in this file that need no special handling pass NULL instead
 * (see zap_vma_pages()).
 */
struct zap_details {
        struct folio *single_folio;        /* Locked folio to be unmapped */
        bool even_cows;                        /* Zap COWed private pages too? */
        zap_flags_t zap_flags;                /* Extra flags for zapping (ZAP_FLAG_*) */
};

/*
 * Whether to drop the pte markers, for example, the uffd-wp information for
 * file-backed memory.  This should only be specified when we will completely
 * drop the page in the mm, either by truncation or unmapping of the vma.  By
 * default, the flag is not set.
 */
#define  ZAP_FLAG_DROP_MARKER        ((__force zap_flags_t) BIT(0))
/* Set in unmap_vmas() to indicate a final unmap call.  Only used by hugetlb */
#define  ZAP_FLAG_UNMAP              ((__force zap_flags_t) BIT(1))

#ifdef CONFIG_SCHED_MM_CID
void sched_mm_cid_before_execve(struct task_struct *t);
void sched_mm_cid_after_execve(struct task_struct *t);
void sched_mm_cid_fork(struct task_struct *t);
void sched_mm_cid_exit_signals(struct task_struct *t);
/* Return the mm_cid value stored in task @t. */
static inline int task_mm_cid(struct task_struct *t)
{
        return t->mm_cid;
}
#else
/* No-op stubs used when CONFIG_SCHED_MM_CID is not set. */
static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
static inline void sched_mm_cid_fork(struct task_struct *t) { }
static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
static inline int task_mm_cid(struct task_struct *t)
{
        /*
         * Use the processor id as a fall-back when the mm cid feature is
         * disabled. This provides functional per-cpu data structure accesses
         * in user-space, although it won't provide the memory usage benefits.
         */
        return raw_smp_processor_id();
}
#endif

#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
#else
/* Without CONFIG_MMU, mlock is never permitted. */
static inline bool can_do_mlock(void) { return false; }
#endif
extern int user_shm_lock(size_t, struct ucounts *);
extern void user_shm_unlock(size_t, struct ucounts *);

struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
                                  unsigned long addr, pmd_t pmd);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
                                pmd_t pmd);

void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
                  unsigned long size);
void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
                           unsigned long size, struct zap_details *details);
/* Zap the whole address range covered by @vma, with no extra zap details. */
static inline void zap_vma_pages(struct vm_area_struct *vma)
{
        const unsigned long start = vma->vm_start;
        const unsigned long size = vma->vm_end - start;

        zap_page_range_single(vma, start, size, NULL);
}
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
                struct vm_area_struct *start_vma, unsigned long start,
                unsigned long end, unsigned long tree_end, bool mm_wr_locked);

struct mmu_notifier_range;

void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
                unsigned long end, unsigned long floor, unsigned long ceiling);
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
int follow_pte(struct vm_area_struct *vma, unsigned long address,
               pte_t **ptepp, spinlock_t **ptlp);
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                        void *buf, int len, int write);

extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
int generic_error_remove_folio(struct address_space *mapping,
                struct folio *folio);

struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                unsigned long address, struct pt_regs *regs);

#ifdef CONFIG_MMU
extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
                                  unsigned long address, unsigned int flags,
                                  struct pt_regs *regs);
extern int fixup_user_fault(struct mm_struct *mm,
                            unsigned long address, unsigned int fault_flags,
                            bool *unlocked);
void unmap_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t nr, bool even_cows);
void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows);
#else
/*
 * !CONFIG_MMU stub: reaching this is a bug. The return statement only
 * satisfies the function's return type.
 */
static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
                                         unsigned long address, unsigned int flags,
                                         struct pt_regs *regs)
{
        /* should never happen if there's no MMU */
        BUG();
        return VM_FAULT_SIGBUS;
}
/*
 * !CONFIG_MMU stub: reaching this is a bug. The return statement only
 * satisfies the function's return type.
 */
static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address,
                unsigned int fault_flags, bool *unlocked)
{
        /* should never happen if there's no MMU */
        BUG();
        return -EFAULT;
}
/* !CONFIG_MMU stubs: the unmap helpers are no-ops. */
static inline void unmap_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t nr, bool even_cows) { }
static inline void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows) { }
#endif

/* Call unmap_mapping_range() on the given hole with even_cows == 0. */
static inline void unmap_shared_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen)
{
        const int even_cows = 0;

        unmap_mapping_range(mapping, holebegin, holelen, even_cows);
}

static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm,
                                                unsigned long addr);

extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
                void *buf, int len, unsigned int gup_flags);
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
                void *buf, int len, unsigned int gup_flags);

long get_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           int *locked);
long pin_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           int *locked);

/*
 * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT.
 *
 * On success the page is returned and *vmap is set to the VMA found at
 * @addr. On failure an ERR_PTR() is returned and *vmap is left untouched:
 * -EINVAL if FOLL_NOWAIT was passed or no VMA is found at @addr, otherwise
 * the negative value returned by get_user_pages_remote().
 */
static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
                                                    unsigned long addr,
                                                    int gup_flags,
                                                    struct vm_area_struct **vmap)
{
        struct vm_area_struct *found;
        struct page *pinned;
        int rc;

        if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT)))
                return ERR_PTR(-EINVAL);

        rc = get_user_pages_remote(mm, addr, 1, gup_flags, &pinned, NULL);
        if (rc < 0)
                return ERR_PTR(rc);

        found = vma_lookup(mm, addr);
        if (!WARN_ON_ONCE(!found)) {
                *vmap = found;
                return pinned;
        }

        /* No VMA at addr: drop the page reference before reporting failure. */
        put_page(pinned);
        return ERR_PTR(-EINVAL);
}

long get_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages);
long pin_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                    struct page **pages, unsigned int gup_flags);
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                    struct page **pages, unsigned int gup_flags);

int get_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages);
int pin_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages);
void folio_add_pin(struct folio *folio);

int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
                        struct task_struct *task, bool bypass_rlim);

struct kvec;
struct page *get_dump_page(unsigned long addr);

bool folio_mark_dirty(struct folio *folio);
bool set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);

int get_cmdline(struct task_struct *task, char *buffer, int buflen);

extern unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
                unsigned long new_addr, unsigned long len,
                bool need_rmap_locks, bool for_stack);

/*
 * Flags used by change_protection().  They form a bitmap so that several
 * flags can be passed in at once, just like separate parameters.  For now,
 * however, every caller uses only one of the flags at a time.
 */
/*
 * Whether we should manually check if we can map individual PTEs writable,
 * because something (e.g., COW, uffd-wp) blocks that from happening for all
 * PTEs automatically in a writable mapping.
 */
#define  MM_CP_TRY_CHANGE_WRITABLE           (1UL << 0)
/* Whether this protection change is for NUMA hints */
#define  MM_CP_PROT_NUMA                   (1UL << 1)
/* Whether this change is for write protecting */
#define  MM_CP_UFFD_WP                     (1UL << 2) /* do wp */
#define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
/* Mask covering both uffd-wp flags above */
#define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
                                            MM_CP_UFFD_WP_RESOLVE)

bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
/*
 * Report whether individual PTEs of @vma have to be checked manually before
 * they can be made writable, because that cannot be done automatically for
 * all PTEs in the mapping.
 */
static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
{
        /*
         * Private mappings: manual checking is always needed when we have
         * write permissions, as COW has to be handled properly.
         */
        if (!(vma->vm_flags & VM_SHARED))
                return (vma->vm_flags & VM_WRITE) != 0;

        /* Shared mappings: the answer comes from vma_wants_writenotify(). */
        return vma_wants_writenotify(vma, vma->vm_page_prot);
}
bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
extern long change_protection(struct mmu_gather *tlb,
                              struct vm_area_struct *vma, unsigned long start,
                              unsigned long end, unsigned long cp_flags);
extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
          struct vm_area_struct *vma, struct vm_area_struct **pprev,
          unsigned long start, unsigned long end, unsigned long newflags);

/*
 * get_user_pages_fast_only() doesn't attempt to fault and will return short.
 */
int get_user_pages_fast_only(unsigned long start, int nr_pages,
                             unsigned int gup_flags, struct page **pages);

/*
 * Single-page form of get_user_pages_fast_only(): returns true only when
 * exactly one page was obtained and stored in *pagep.
 */
static inline bool get_user_page_fast_only(unsigned long addr,
                        unsigned int gup_flags, struct page **pagep)
{
        int nr_got = get_user_pages_fast_only(addr, 1, gup_flags, pagep);

        return nr_got == 1;
}
/*
 * per-process(per-mm_struct) statistics.
 */
/* Read mm->rss_stat[member] via percpu_counter_read_positive(). */
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
        unsigned long count;

        count = percpu_counter_read_positive(&mm->rss_stat[member]);
        return count;
}

void mm_trace_rss_stat(struct mm_struct *mm, int member);

/* Add @value to mm->rss_stat[member], then invoke the RSS trace hook. */
static inline void add_mm_counter(struct mm_struct *mm, int member,
                                  long value)
{
        percpu_counter_add(&mm->rss_stat[member], value);
        /* The trace hook runs after every counter update. */
        mm_trace_rss_stat(mm, member);
}

/* Increment mm->rss_stat[member], then invoke the RSS trace hook. */
static inline void inc_mm_counter(struct mm_struct *mm, int member)
{
        percpu_counter_inc(&mm->rss_stat[member]);
        /* The trace hook runs after every counter update. */
        mm_trace_rss_stat(mm, member);
}

/* Decrement mm->rss_stat[member], then invoke the RSS trace hook. */
static inline void dec_mm_counter(struct mm_struct *mm, int member)
{
        percpu_counter_dec(&mm->rss_stat[member]);
        /* The trace hook runs after every counter update. */
        mm_trace_rss_stat(mm, member);
}

/*
 * Optimized variant when folio is already known not to be anon:
 * swap-backed folios map to MM_SHMEMPAGES, everything else to MM_FILEPAGES.
 */
static inline int mm_counter_file(struct folio *folio)
{
        return folio_test_swapbacked(folio) ? MM_SHMEMPAGES : MM_FILEPAGES;
}

/*
 * Pick the RSS counter index for @folio: MM_ANONPAGES for anon folios,
 * otherwise whatever mm_counter_file() selects.
 */
static inline int mm_counter(struct folio *folio)
{
        return folio_test_anon(folio) ? MM_ANONPAGES : mm_counter_file(folio);
}

/* Sum of the file, anon and shmem page counters of @mm. */
static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
        unsigned long rss = get_mm_counter(mm, MM_FILEPAGES);

        rss += get_mm_counter(mm, MM_ANONPAGES);
        rss += get_mm_counter(mm, MM_SHMEMPAGES);
        return rss;
}

/* Larger of the recorded RSS high-water mark and the current RSS. */
static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
{
        unsigned long recorded = mm->hiwater_rss;
        unsigned long current_rss = get_mm_rss(mm);

        return max(recorded, current_rss);
}

/* Larger of the recorded VM high-water mark and the current total_vm. */
static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
{
        unsigned long recorded = mm->hiwater_vm;
        unsigned long current_vm = mm->total_vm;

        return max(recorded, current_vm);
}

/* Raise mm->hiwater_rss to the current RSS if the current RSS is larger. */
static inline void update_hiwater_rss(struct mm_struct *mm)
{
        unsigned long rss_now = get_mm_rss(mm);

        if (rss_now > mm->hiwater_rss)
                mm->hiwater_rss = rss_now;
}

/* Raise mm->hiwater_vm to mm->total_vm if total_vm is larger. */
static inline void update_hiwater_vm(struct mm_struct *mm)
{
        if (mm->total_vm > mm->hiwater_vm)
                mm->hiwater_vm = mm->total_vm;
}

/* Restart RSS high-water tracking from the current RSS value. */
static inline void reset_mm_hiwater_rss(struct mm_struct *mm)
{
        unsigned long rss_now = get_mm_rss(mm);

        mm->hiwater_rss = rss_now;
}

/* Raise *maxrss to the RSS high-water mark of @mm if that is larger. */
static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
                                         struct mm_struct *mm)
{
        unsigned long peak = get_mm_hiwater_rss(mm);

        if (peak > *maxrss)
                *maxrss = peak;
}

#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
/*
 * Fallbacks used when CONFIG_ARCH_HAS_PTE_SPECIAL is not set: no PTE is
 * ever reported as special, and marking one special leaves it unchanged.
 */
static inline int pte_special(pte_t pte)
{
        return 0;
}

static inline pte_t pte_mkspecial(pte_t pte)
{
        return pte;
}
#endif /* !CONFIG_ARCH_HAS_PTE_SPECIAL */

#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
/*
 * Fallback used when CONFIG_ARCH_HAS_PTE_DEVMAP is not set: no PTE is ever
 * reported as devmap.
 */
static inline int pte_devmap(pte_t pte)
{
        return 0;
}
#endif /* !CONFIG_ARCH_HAS_PTE_DEVMAP */

extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                               spinlock_t **ptl);
/*
 * Wrapper around __get_locked_pte() that routes the call through
 * __cond_lock() on *ptl and returns whatever the helper returned.
 */
static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
                                    spinlock_t **ptl)
{
        pte_t *pte;

        __cond_lock(*ptl, pte = __get_locked_pte(mm, addr, ptl));

        return pte;
}

#ifdef __PAGETABLE_P4D_FOLDED
/* Folded P4D level: nothing to allocate, always reports success (0). */
static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
                              unsigned long address)
{
        return 0;
}
#else /* !__PAGETABLE_P4D_FOLDED */
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
#endif /* __PAGETABLE_P4D_FOLDED */

#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
/* Folded PUD level or no MMU: allocation trivially succeeds, no accounting. */
static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
                              unsigned long address)
{
        return 0;
}

static inline void mm_inc_nr_puds(struct mm_struct *mm)
{
}

static inline void mm_dec_nr_puds(struct mm_struct *mm)
{
}

#else
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);

/* Account one PUD table in mm->pgtables_bytes unless the PUD is folded. */
static inline void mm_inc_nr_puds(struct mm_struct *mm)
{
        if (!mm_pud_folded(mm))
                atomic_long_add(PTRS_PER_PUD * sizeof(pud_t),
                                &mm->pgtables_bytes);
}

/* Remove one PUD table from mm->pgtables_bytes unless the PUD is folded. */
static inline void mm_dec_nr_puds(struct mm_struct *mm)
{
        if (!mm_pud_folded(mm))
                atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t),
                                &mm->pgtables_bytes);
}
#endif

#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
/* Folded PMD level or no MMU: allocation trivially succeeds, no accounting. */
static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
                              unsigned long address)
{
        return 0;
}

static inline void mm_inc_nr_pmds(struct mm_struct *mm)
{
}

static inline void mm_dec_nr_pmds(struct mm_struct *mm)
{
}

#else
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);

/* Account one PMD table in mm->pgtables_bytes unless the PMD is folded. */
static inline void mm_inc_nr_pmds(struct mm_struct *mm)
{
        if (!mm_pmd_folded(mm))
                atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t),
                                &mm->pgtables_bytes);
}

/* Remove one PMD table from mm->pgtables_bytes unless the PMD is folded. */
static inline void mm_dec_nr_pmds(struct mm_struct *mm)
{
        if (!mm_pmd_folded(mm))
                atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t),
                                &mm->pgtables_bytes);
}
#endif

#ifdef CONFIG_MMU
/* Reset the page-table byte accounting of @mm to zero. */
static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
{
        atomic_long_set(&mm->pgtables_bytes, 0);
}

/* Current value of the page-table byte accounting of @mm. */
static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
        return atomic_long_read(&mm->pgtables_bytes);
}

/* Account one PTE table (PTRS_PER_PTE entries) in mm->pgtables_bytes. */
static inline void mm_inc_nr_ptes(struct mm_struct *mm)
{
        atomic_long_add(PTRS_PER_PTE * sizeof(pte_t),
                        &mm->pgtables_bytes);
}

/* Remove one PTE table (PTRS_PER_PTE entries) from mm->pgtables_bytes. */
static inline void mm_dec_nr_ptes(struct mm_struct *mm)
{
        atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t),
                        &mm->pgtables_bytes);
}
#else /* !CONFIG_MMU */

/* Without an MMU nothing is accounted and the reported total is always 0. */
static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
{
}

static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
        return 0;
}

static inline void mm_inc_nr_ptes(struct mm_struct *mm)
{
}

static inline void mm_dec_nr_ptes(struct mm_struct *mm)
{
}
#endif /* CONFIG_MMU */

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd);
int __pte_alloc_kernel(pmd_t *pmd);

#if defined(CONFIG_MMU)

/*
 * Each helper below returns the next-level entry for @address. If the
 * upper-level entry is empty, the matching __*_alloc() helper is called
 * first; a non-zero result from it makes the helper return NULL.
 */
static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
                unsigned long address)
{
        if (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address))
                return NULL;

        return p4d_offset(pgd, address);
}

static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d,
                unsigned long address)
{
        if (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address))
                return NULL;

        return pud_offset(p4d, address);
}

static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
        if (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))
                return NULL;

        return pmd_offset(pud, address);
}
#endif /* CONFIG_MMU */

/* Map a virtual address to its ptdesc, going through virt_to_page(). */
static inline struct ptdesc *virt_to_ptdesc(const void *x)
{
        struct page *page = virt_to_page(x);

        return page_ptdesc(page);
}

/* Map a ptdesc to a virtual address, going through page_to_virt(). */
static inline void *ptdesc_to_virt(const struct ptdesc *pt)
{
        const struct page *page = ptdesc_page(pt);

        return page_to_virt(page);
}

/* Address of the folio backing @pt, as reported by folio_address(). */
static inline void *ptdesc_address(const struct ptdesc *pt)
{
        const struct folio *folio = ptdesc_folio(pt);

        return folio_address(folio);
}

/* True when the folio backing @pt tests as reserved. */
static inline bool pagetable_is_reserved(struct ptdesc *pt)
{
        struct folio *folio = ptdesc_folio(pt);

        return folio_test_reserved(folio);
}

/**
 * pagetable_alloc - Allocate pagetables
 * @gfp:    GFP flags
 * @order:  desired pagetable order
 *
 * pagetable_alloc allocates memory for page tables as well as a page table
 * descriptor to describe that memory. __GFP_COMP is always added to @gfp
 * before the pages are requested.
 *
 * Return: The ptdesc describing the allocated page tables.
 */
static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
{
        return page_ptdesc(alloc_pages_noprof(gfp | __GFP_COMP, order));
}
#define pagetable_alloc(...)        alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))

/**
 * pagetable_free - Free pagetables
 * @pt:        The page table descriptor
 *
 * pagetable_free frees the memory of all page tables described by a page
 * table descriptor and the memory for the descriptor itself. The order
 * passed to __free_pages() is taken from compound_order() of the page.
 */
static inline void pagetable_free(struct ptdesc *pt)
{
        struct page *head = ptdesc_page(pt);
        unsigned int order = compound_order(head);

        __free_pages(head, order);
}

#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
void __init ptlock_cache_init(void);
bool ptlock_alloc(struct ptdesc *ptdesc);
void ptlock_free(struct ptdesc *ptdesc);

/* ALLOC_SPLIT_PTLOCKS: ptdesc->ptl already is the lock pointer. */
static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
        spinlock_t *lock = ptdesc->ptl;

        return lock;
}
#else /* ALLOC_SPLIT_PTLOCKS */
/*
 * !ALLOC_SPLIT_PTLOCKS: the lock is the ptdesc->ptl member itself, so the
 * cache-init/alloc/free hooks have nothing to do and allocation always
 * reports success.
 */
static inline void ptlock_cache_init(void) {}

static inline bool ptlock_alloc(struct ptdesc *ptdesc)
{
        return true;
}

static inline void ptlock_free(struct ptdesc *ptdesc) {}

static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
        return &ptdesc->ptl;
}
#endif /* ALLOC_SPLIT_PTLOCKS */

/* Split PTE locks: the lock belongs to the ptdesc of the page *pmd refers to. */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        struct ptdesc *ptdesc = page_ptdesc(pmd_page(*pmd));

        return ptlock_ptr(ptdesc);
}

/*
 * Allocate (if required) and initialise the split lock of @ptdesc.
 * Returns false when ptlock_alloc() fails, true otherwise.
 */
static inline bool ptlock_init(struct ptdesc *ptdesc)
{
        /*
         * prep_new_page() initializes page->private (and therefore page->ptl)
         * with 0. Make sure nobody has put it to use in the meantime.
         *
         * That can happen if an arch tries to use slab for page table
         * allocation: slab code uses page->slab_cache, which shares storage
         * with page->ptl.
         */
        VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc));

        if (ptlock_alloc(ptdesc)) {
                spin_lock_init(ptlock_ptr(ptdesc));
                return true;
        }

        return false;
}

#else        /* !USE_SPLIT_PTE_PTLOCKS */
/*
 * We use mm->page_table_lock to guard all pagetable pages of the mm, so the
 * per-ptdesc lock hooks below are no-ops and initialisation always succeeds.
 */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        return &mm->page_table_lock;
}

static inline void ptlock_cache_init(void)
{
}

static inline bool ptlock_init(struct ptdesc *ptdesc)
{
        return true;
}

static inline void ptlock_free(struct ptdesc *ptdesc)
{
}
#endif /* USE_SPLIT_PTE_PTLOCKS */

/*
 * Set up a PTE table: initialise its lock, mark the folio as a page table
 * and add it to the NR_PAGETABLE statistic. Returns false, with nothing
 * marked or accounted, when ptlock_init() fails.
 */
static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
{
        struct folio *folio;

        if (!ptlock_init(ptdesc))
                return false;

        folio = ptdesc_folio(ptdesc);
        __folio_set_pgtable(folio);
        lruvec_stat_add_folio(folio, NR_PAGETABLE);
        return true;
}

/*
 * Tear down a PTE table: free its lock, clear the folio's page-table mark
 * and remove it from the NR_PAGETABLE statistic.
 */
static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
{
        struct folio *pt_folio = ptdesc_folio(ptdesc);

        ptlock_free(ptdesc);
        __folio_clear_pgtable(pt_folio);
        lruvec_stat_sub_folio(pt_folio, NR_PAGETABLE);
}

pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
/* __pte_offset_map() with NULL passed for its pmdvalp argument. */
static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr)
{
        pmd_t *no_pmdval = NULL;

        return __pte_offset_map(pmd, addr, no_pmdval);
}

pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, spinlock_t **ptlp);
/*
 * Wrapper around __pte_offset_map_lock() that routes the call through
 * __cond_lock() on *ptlp and returns whatever the helper returned.
 */
static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, spinlock_t **ptlp)
{
        pte_t *mapped;

        __cond_lock(*ptlp, mapped = __pte_offset_map_lock(mm, pmd, addr, ptlp));

        return mapped;
}

pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, spinlock_t **ptlp);

/* Release @ptl first, then unmap @pte. */
#define pte_unmap_unlock(pte, ptl)        do {                \
        spin_unlock(ptl);                                \
        pte_unmap(pte);                                        \
} while (0)

/*
 * Evaluates to true only if *pmd is empty and __pte_alloc() returned
 * non-zero; __pte_alloc() is not called when *pmd is already populated.
 */
#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))

/* NULL if pte_alloc() evaluated to true, otherwise the mapped PTE. */
#define pte_alloc_map(mm, pmd, address)                        \
        (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address))

/* As pte_alloc_map(), but maps through pte_offset_map_lock(). */
#define pte_alloc_map_lock(mm, pmd, address, ptlp)        \
        (pte_alloc(mm, pmd) ?                        \
                 NULL : pte_offset_map_lock(mm, pmd, address, ptlp))

/*
 * Kernel variant: uses __pte_alloc_kernel() and pte_offset_kernel();
 * NULL when *pmd is empty and __pte_alloc_kernel() returned non-zero.
 */
#define pte_alloc_kernel(pmd, address)                        \
        ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
                NULL: pte_offset_kernel(pmd, address))

#if USE_SPLIT_PMD_PTLOCKS

/*
 * Page holding the PMD table that contains @pmd: the low address bits
 * covering PTRS_PER_PMD * sizeof(pmd_t) bytes are masked off before the
 * address is passed to virt_to_page().
 */
static inline struct page *pmd_pgtable_page(pmd_t *pmd)
{
        unsigned long low_bits = PTRS_PER_PMD * sizeof(pmd_t) - 1;
        unsigned long table_addr = (unsigned long) pmd & ~low_bits;

        return virt_to_page((void *)table_addr);
}

/* ptdesc of the page returned by pmd_pgtable_page() for @pmd. */
static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd)
{
        struct page *table_page = pmd_pgtable_page(pmd);

        return page_ptdesc(table_page);
}

/* Split PMD locks: the lock belongs to the ptdesc of the PMD table. */
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        struct ptdesc *ptdesc = pmd_ptdesc(pmd);

        return ptlock_ptr(ptdesc);
}

/*
 * Initialise the lock of a PMD table via ptlock_init(). With
 * CONFIG_TRANSPARENT_HUGEPAGE, pmd_huge_pte is cleared first.
 */
static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
{
        bool ok;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        ptdesc->pmd_huge_pte = NULL;
#endif
        ok = ptlock_init(ptdesc);
        return ok;
}

/*
 * Free the lock of a PMD table via ptlock_free(). With
 * CONFIG_TRANSPARENT_HUGEPAGE, a still-set pmd_huge_pte trips VM_BUG_ON_PAGE.
 */
static inline void pmd_ptlock_free(struct ptdesc *ptdesc)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc));
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

        ptlock_free(ptdesc);
}

/* Split PMD locks: pmd_huge_pte lives in the PMD table's ptdesc; @mm is unused. */
#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)

#else

/*
 * !USE_SPLIT_PMD_PTLOCKS: PMD tables are guarded by mm->page_table_lock,
 * the lock init/free hooks are no-ops, and pmd_huge_pte lives in the mm.
 */
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        return &mm->page_table_lock;
}

static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
{
        return true;
}

static inline void pmd_ptlock_free(struct ptdesc *ptdesc)
{
}

#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)

#endif

/*
 * Take the lock returned by pmd_lockptr() and hand it back to the caller,
 * who holds it on return.
 */
static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
{
        spinlock_t *lock = pmd_lockptr(mm, pmd);

        spin_lock(lock);
        return lock;
}

/*
 * Set up a PMD table: initialise its lock, mark the folio as a page table
 * and add it to the NR_PAGETABLE statistic. Returns false, with nothing
 * marked or accounted, when pmd_ptlock_init() fails.
 */
static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
{
        struct folio *folio;

        if (!pmd_ptlock_init(ptdesc))
                return false;

        folio = ptdesc_folio(ptdesc);
        __folio_set_pgtable(folio);
        lruvec_stat_add_folio(folio, NR_PAGETABLE);
        return true;
}

/*
 * Tear down a PMD table: free its lock, clear the folio's page-table mark
 * and remove it from the NR_PAGETABLE statistic.
 */
static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc)
{
        struct folio *pt_folio = ptdesc_folio(ptdesc);

        pmd_ptlock_free(ptdesc);
        __folio_clear_pgtable(pt_folio);
        lruvec_stat_sub_folio(pt_folio, NR_PAGETABLE);
}

/*
 * There is no scalability reason to split PUD locks yet, but this follows
 * the same pattern as the PMD locks to make a later split easier. The VM
 * should not be considered ready for split PUD locks yet; there may be
 * places which still need converting away from page_table_lock.
 *
 * For now the PUD lock is always mm->page_table_lock; @pud is unused.
 */
static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
{
        spinlock_t *lock = &mm->page_table_lock;

        return lock;
}

/*
 * Take the lock returned by pud_lockptr() and hand it back to the caller,
 * who holds it on return.
 */
static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
{
        spinlock_t *lock = pud_lockptr(mm, pud);

        spin_lock(lock);

        return lock;
}

/*
 * Set up a PUD table: mark the folio as a page table and add it to the
 * NR_PAGETABLE statistic. No per-table lock is initialised at this level.
 */
static inline void pagetable_pud_ctor(struct ptdesc *ptdesc)
{
        struct folio *pt_folio = ptdesc_folio(ptdesc);

        __folio_set_pgtable(pt_folio);
        lruvec_stat_add_folio(pt_folio, NR_PAGETABLE);
}

/*
 * Tear down a PUD table: clear the folio's page-table mark and remove it
 * from the NR_PAGETABLE statistic.
 */
static inline void pagetable_pud_dtor(struct ptdesc *ptdesc)
{
        struct folio *pt_folio = ptdesc_folio(ptdesc);

        __folio_clear_pgtable(pt_folio);
        lruvec_stat_sub_folio(pt_folio, NR_PAGETABLE);
}

extern void __init pagecache_init(void);
extern void free_initmem(void);

/*
 * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
 * into the buddy system. The freed pages will be poisoned with pattern
 * "poison" if it's within range [0, UCHAR_MAX].
 * Return pages freed into the buddy system.
 */
extern unsigned long free_reserved_area(void *start, void *end,
                                        int poison, const char *s);

extern void adjust_managed_page_count(struct page *page, long count);

extern void reserve_bootmem_region(phys_addr_t start,
                                   phys_addr_t end, int nid);

/* Free the reserved page into the buddy system, so it gets managed. */
static inline void free_reserved_page(struct page *page)
{
        if (mem_alloc_profiling_enabled()) {
                union codetag_ref *ref = get_page_tag_ref(page);

                if (ref) {
                        set_codetag_empty(ref);
                        put_page_tag_ref(ref);
                }
        }
        ClearPageReserved(page);
        init_page_count(page);
        __free_page(page);
        adjust_managed_page_count(page, 1);
}
#define free_highmem_page(page) free_reserved_page(page)

/* Mark @page reserved and decrease the managed page count by one. */
static inline void mark_page_reserved(struct page *page)
{
        const long delta = -1;

        SetPageReserved(page);
        adjust_managed_page_count(page, delta);
}

/* free_reserved_page() applied to the page behind @pt. */
static inline void free_reserved_ptdesc(struct ptdesc *pt)
{
        struct page *page = ptdesc_page(pt);

        free_reserved_page(page);
}

/*
 * Default method to free all the __init memory into the buddy system:
 * hands the range [__init_begin, __init_end) to free_reserved_area().
 * The freed pages will be poisoned with pattern "poison" if it's within
 * range [0, UCHAR_MAX].
 * Return pages freed into the buddy system.
 */
static inline unsigned long free_initmem_default(int poison)
{
        extern char __init_begin[], __init_end[];
        void *start = &__init_begin;
        void *end = &__init_end;

        return free_reserved_area(start, end, poison,
                                  "unused kernel image (initmem)");
}

static inline unsigned long get_num_physpages(void)
{
        int nid;
        unsigned long phys_pages = 0;

        for_each_online_node(nid)
                phys_pages += node_present_pages(nid);

        return phys_pages;
}

/*
 * Using memblock node mappings, an architecture may initialise its
 * zones, allocate the backing mem_map and account for memory holes in an
 * architecture independent manner.
 *
 * An architecture is expected to register the ranges of page frames backed
 * by physical memory with memblock_add[_node]() before calling
 * free_area_init(), passing in the PFN each zone ends at. For basic
 * usage, an architecture is expected to do something like
 *
 * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
 *                                                          max_highmem_pfn};
 * for_each_valid_physical_page_range()
 *        memblock_add_node(base, size, nid, MEMBLOCK_NONE)
 * free_area_init(max_zone_pfns);
 */
void free_area_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
                                                unsigned long end_pfn);
extern void get_pfn_range_for_nid(unsigned int nid,
                        unsigned long *start_pfn, unsigned long *end_pfn);

#ifndef CONFIG_NUMA
/* Without CONFIG_NUMA every PFN is reported as belonging to node 0. */
static inline int early_pfn_to_nid(unsigned long pfn)
{
        const int only_node = 0;

        return only_node;
}
#else /* CONFIG_NUMA */
/* please see mm/page_alloc.c */
extern int __meminit early_pfn_to_nid(unsigned long pfn);
#endif /* !CONFIG_NUMA */

extern void mem_init(void);
extern void __init mmap_init(void);

extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
/*
 * __show_mem() with flags 0, a NULL nodemask and MAX_NR_ZONES - 1 as the
 * highest zone index.
 */
static inline void show_mem(void)
{
        const int max_zone_idx = MAX_NR_ZONES - 1;

        __show_mem(0, NULL, max_zone_idx);
}
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);

extern __printf(3, 4)
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);

extern void setup_per_cpu_pageset(void);

/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);

/* interval_tree.c */
void vma_interval_tree_insert(struct vm_area_struct *node,
                              struct rb_root_cached *root);
void vma_interval_tree_insert_after(struct vm_area_struct *node,
                                    struct vm_area_struct *prev,
                                    struct rb_root_cached *root);
void vma_interval_tree_remove(struct vm_area_struct *node,
                              struct rb_root_cached *root);
struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
                                unsigned long start, unsigned long last);
struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
                                unsigned long start, unsigned long last);

/*
 * Loop header: @vma starts at vma_interval_tree_iter_first() and advances
 * with vma_interval_tree_iter_next() until that returns NULL. @start and
 * @last are expanded on every iteration.
 */
#define vma_interval_tree_foreach(vma, root, start, last)                \
        for (vma = vma_interval_tree_iter_first(root, start, last);        \
             vma; vma = vma_interval_tree_iter_next(vma, start, last))

void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
                                   struct rb_root_cached *root);
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
                                   struct rb_root_cached *root);
struct anon_vma_chain *
anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
                                  unsigned long start, unsigned long last);
struct anon_vma_chain *anon_vma_interval_tree_iter_next(
        struct anon_vma_chain *node, unsigned long start, unsigned long last);
#ifdef CONFIG_DEBUG_VM_RB
void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
#endif

/*
 * Loop header: @avc starts at anon_vma_interval_tree_iter_first() and
 * advances with anon_vma_interval_tree_iter_next() until that returns NULL.
 * @start and @last are expanded on every iteration.
 */
#define anon_vma_interval_tree_foreach(avc, root, start, last)                 \
        for (avc = anon_vma_interval_tree_iter_first(root, start, last); \
             avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))

/* mmap.c */
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
                      unsigned long start, unsigned long end, pgoff_t pgoff,
                      struct vm_area_struct *next);
extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
                       unsigned long start, unsigned long end, pgoff_t pgoff);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void unlink_file_vma(struct vm_area_struct *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
        unsigned long addr, unsigned long len, pgoff_t pgoff,
        bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);
struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
                                  struct vm_area_struct *prev,
                                  struct vm_area_struct *vma,
                                  unsigned long start, unsigned long end,
                                  unsigned long vm_flags,
                                  struct mempolicy *policy,
                                  struct vm_userfaultfd_ctx uffd_ctx,
                                  struct anon_vma_name *anon_name);

/*
 * We are about to modify the VMA's flags: calls vma_modify() with
 * @new_flags while passing through the VMA's current policy, uffd context
 * and anon name.
 */
static inline struct vm_area_struct
*vma_modify_flags(struct vma_iterator *vmi,
                  struct vm_area_struct *prev,
                  struct vm_area_struct *vma,
                  unsigned long start, unsigned long end,
                  unsigned long new_flags)
{
        struct mempolicy *cur_policy = vma_policy(vma);
        struct anon_vma_name *cur_name = anon_vma_name(vma);

        return vma_modify(vmi, prev, vma, start, end, new_flags,
                          cur_policy, vma->vm_userfaultfd_ctx, cur_name);
}

/*
 * We are about to modify the VMA's flags and/or anon_name: calls
 * vma_modify() with @new_flags and @new_name while passing through the
 * VMA's current policy and uffd context.
 */
static inline struct vm_area_struct
*vma_modify_flags_name(struct vma_iterator *vmi,
                       struct vm_area_struct *prev,
                       struct vm_area_struct *vma,
                       unsigned long start,
                       unsigned long end,
                       unsigned long new_flags,
                       struct anon_vma_name *new_name)
{
        struct mempolicy *cur_policy = vma_policy(vma);

        return vma_modify(vmi, prev, vma, start, end, new_flags,
                          cur_policy, vma->vm_userfaultfd_ctx, new_name);
}

/*
 * We are about to modify the VMA's memory policy: calls vma_modify() with
 * @new_pol while passing through the VMA's current flags, uffd context and
 * anon name.
 */
static inline struct vm_area_struct
*vma_modify_policy(struct vma_iterator *vmi,
                   struct vm_area_struct *prev,
                   struct vm_area_struct *vma,
                   unsigned long start, unsigned long end,
                   struct mempolicy *new_pol)
{
        struct anon_vma_name *cur_name = anon_vma_name(vma);

        return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
                          new_pol, vma->vm_userfaultfd_ctx, cur_name);
}

/*
 * We are about to modify the VMA's flags and/or uffd context: calls
 * vma_modify() with @new_flags and @new_ctx while passing through the
 * VMA's current policy and anon name.
 */
static inline struct vm_area_struct
*vma_modify_flags_uffd(struct vma_iterator *vmi,
                       struct vm_area_struct *prev,
                       struct vm_area_struct *vma,
                       unsigned long start, unsigned long end,
                       unsigned long new_flags,
                       struct vm_userfaultfd_ctx new_ctx)
{
        struct mempolicy *cur_policy = vma_policy(vma);
        struct anon_vma_name *cur_name = anon_vma_name(vma);

        return vma_modify(vmi, prev, vma, start, end, new_flags,
                          cur_policy, new_ctx, cur_name);
}

/*
 * Check a data-size limit.
 *
 * Returns 0 when @rlim is not below RLIM_INFINITY, or when
 * (@new - @start) + (@end_data - @start_data) does not exceed @rlim.
 * Returns -ENOSPC when that sum is greater than @rlim.
 */
static inline int check_data_rlimit(unsigned long rlim,
                                    unsigned long new,
                                    unsigned long start,
                                    unsigned long end_data,
                                    unsigned long start_data)
{
        unsigned long usage;

        if (!(rlim < RLIM_INFINITY))
                return 0;

        usage = (new - start) + (end_data - start_data);

        return usage > rlim ? -ENOSPC : 0;
}

extern int mm_take_all_locks(struct mm_struct *mm);
extern void mm_drop_all_locks(struct mm_struct *mm);

extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
extern struct file *get_mm_exe_file(struct mm_struct *mm);
extern struct file *get_task_exe_file(struct task_struct *task);

extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages);
extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);

extern bool vma_is_special_mapping(const struct vm_area_struct *vma,
                                   const struct vm_special_mapping *sm);
extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
                                   unsigned long addr, unsigned long len,
                                   unsigned long flags,
                                   const struct vm_special_mapping *spec);
/* This is an obsolete alternative to _install_special_mapping. */
extern int install_special_mapping(struct mm_struct *mm,
                                   unsigned long addr, unsigned long len,
                                   unsigned long flags, struct page **pages);

unsigned long randomize_stack_top(unsigned long stack_top);
unsigned long randomize_page(unsigned long start, unsigned long range);

unsigned long
__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                    unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags);

/*
 * Thin wrapper around __get_unmapped_area(): forwards @file, @addr, @len,
 * @pgoff and @flags unchanged and passes 0 as the trailing vm_flags argument.
 * Returns whatever __get_unmapped_area() returns.
 */
static inline unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                  unsigned long pgoff, unsigned long flags)
{
        return __get_unmapped_area(file, addr, len, pgoff, flags, 0);
}

extern unsigned long mmap_region(struct file *file, unsigned long addr,
        unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
        struct list_head *uf);
extern unsigned long do_mmap(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot, unsigned long flags,
        vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
        struct list_head *uf);
extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
                         unsigned long start, size_t len, struct list_head *uf,
                         bool unlock);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
                     struct list_head *uf);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);

#ifdef CONFIG_MMU
extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
                         unsigned long start, unsigned long end,
                         struct list_head *uf, bool unlock);
extern int __mm_populate(unsigned long addr, unsigned long len,
                         int ignore_errors);
/*
 * CONFIG_MMU variant: call __mm_populate() on [@addr, @addr + @len) with
 * ignore_errors set to 1 and deliberately discard its return value.
 */
static inline void mm_populate(unsigned long addr, unsigned long len)
{
        /* Ignore errors */
        (void) __mm_populate(addr, len, 1);
}
#else
/* !CONFIG_MMU variant: nothing to do. */
static inline void mm_populate(unsigned long addr, unsigned long len) {}
#endif

/* This takes the mm semaphore itself */
extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
extern int vm_munmap(unsigned long, size_t);
extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
        unsigned long, unsigned long,
        unsigned long, unsigned long);

/*
 * Argument block taken (by pointer) by vm_unmapped_area().
 *
 * NOTE(review): the field meanings are not visible in this header.  By name,
 * @flags presumably carries VM_UNMAPPED_AREA_TOPDOWN, @low_limit/@high_limit
 * presumably bound the search, and @align_mask/@align_offset/@start_gap
 * presumably constrain placement -- confirm against the implementation.
 */
struct vm_unmapped_area_info {
#define VM_UNMAPPED_AREA_TOPDOWN 1
        unsigned long flags;
        unsigned long length;
        unsigned long low_limit;
        unsigned long high_limit;
        unsigned long align_mask;
        unsigned long align_offset;
        unsigned long start_gap;
};

extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);

/* truncate.c */
extern void truncate_inode_pages(struct address_space *, loff_t);
extern void truncate_inode_pages_range(struct address_space *,
                                       loff_t lstart, loff_t lend);
extern void truncate_inode_pages_final(struct address_space *);

/* generic vm_area_ops exported for stackable file systems */
extern vm_fault_t filemap_fault(struct vm_fault *vmf);
extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
                pgoff_t start_pgoff, pgoff_t end_pgoff);
extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);

extern unsigned long stack_guard_gap;
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);

/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
int expand_downwards(struct vm_area_struct *vma, unsigned long address);

/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
                                             struct vm_area_struct **pprev);

/*
 * Look up the first VMA which intersects the interval [start_addr, end_addr)
 * NULL if none.  Assume start_addr < end_addr.
 */
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
                        unsigned long start_addr, unsigned long end_addr);

/**
 * vma_lookup() - Look up the VMA stored at a given address
 * @mm: The process address space to search.
 * @addr: The user address to look up.
 *
 * Return: The entry mtree_load() finds for @addr in @mm->mm_mt, i.e. the
 * vm_area_struct at that address, or %NULL if there is none.
 */
static inline
struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *found = mtree_load(&mm->mm_mt, addr);

        return found;
}

/*
 * Size of the guard gap kept below @vma: stack_guard_gap for a VM_GROWSDOWN
 * mapping, PAGE_SIZE for a VM_SHADOW_STACK mapping, and 0 otherwise.
 * VM_GROWSDOWN takes precedence when both flags are set.
 */
static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
{
        unsigned long gap = 0;

        if (vma->vm_flags & VM_GROWSDOWN) {
                gap = stack_guard_gap;
        } else if (vma->vm_flags & VM_SHADOW_STACK) {
                /* See reasoning around the VM_SHADOW_STACK definition */
                gap = PAGE_SIZE;
        }

        return gap;
}

/*
 * Start address of @vma lowered by its guard gap (stack_guard_start_gap()).
 * If the subtraction wraps below zero the result is clamped to 0.
 */
static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
{
        const unsigned long gap = stack_guard_start_gap(vma);
        const unsigned long lowered = vma->vm_start - gap;

        /* Unsigned wrap-around: the gap was larger than the start address. */
        if (lowered > vma->vm_start)
                return 0;

        return lowered;
}

/*
 * End address of @vma, raised by stack_guard_gap when the mapping is
 * VM_GROWSUP.  If that addition wraps around, the result is -PAGE_SIZE.
 */
static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
{
        unsigned long end = vma->vm_end;

        if (!(vma->vm_flags & VM_GROWSUP))
                return end;

        end += stack_guard_gap;
        /* The sum wrapped past the top of the address space. */
        if (end < vma->vm_end)
                end = -PAGE_SIZE;

        return end;
}

/* Number of pages spanned by @vma: (vm_end - vm_start) >> PAGE_SHIFT. */
static inline unsigned long vma_pages(struct vm_area_struct *vma)
{
        unsigned long span_bytes = vma->vm_end - vma->vm_start;

        return span_bytes >> PAGE_SHIFT;
}

/*
 * Look up the VMA at @vm_start and return it only if it exactly matches the
 * interval vm_start ... vm_end; return NULL otherwise.
 */
static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
                                unsigned long vm_start, unsigned long vm_end)
{
        struct vm_area_struct *candidate = vma_lookup(mm, vm_start);

        if (!candidate)
                return NULL;

        if (candidate->vm_start == vm_start && candidate->vm_end == vm_end)
                return candidate;

        return NULL;
}

/*
 * True when @vma is non-NULL and [@start, @end] lies within
 * [vma->vm_start, vma->vm_end], i.e. vm_start <= start and end <= vm_end.
 */
static inline bool range_in_vma(struct vm_area_struct *vma,
                                unsigned long start, unsigned long end)
{
        bool starts_inside = vma && vma->vm_start <= start;

        return starts_inside && end <= vma->vm_end;
}

#ifdef CONFIG_MMU
pgprot_t vm_get_page_prot(unsigned long vm_flags);
void vma_set_page_prot(struct vm_area_struct *vma);
#else
/* !CONFIG_MMU variant: ignores @vm_flags and always returns __pgprot(0). */
static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
        return __pgprot(0);
}
/*
 * !CONFIG_MMU variant: store vm_get_page_prot(vma->vm_flags) in
 * vma->vm_page_prot.
 */
static inline void vma_set_page_prot(struct vm_area_struct *vma)
{
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
}
#endif

void vma_set_file(struct vm_area_struct *vma, struct file *file);

#ifdef CONFIG_NUMA_BALANCING
unsigned long change_prot_numa(struct vm_area_struct *vma,
                        unsigned long start, unsigned long end);
#endif

struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
                unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
                        unsigned long pfn, unsigned long size, pgprot_t);
int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
                unsigned long pfn, unsigned long size, pgprot_t prot);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num);
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num);
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num);
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn);
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn);
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);

/*
 * Call vm_insert_page() and translate its errno-style result into a
 * vm_fault_t:
 *   -ENOMEM                 -> VM_FAULT_OOM
 *   any other negative value
 *   except -EBUSY           -> VM_FAULT_SIGBUS
 *   success or -EBUSY       -> VM_FAULT_NOPAGE
 */
static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
                                unsigned long addr, struct page *page)
{
        const int rc = vm_insert_page(vma, addr, page);

        if (rc >= 0 || rc == -EBUSY)
                return VM_FAULT_NOPAGE;

        if (rc == -ENOMEM)
                return VM_FAULT_OOM;

        return VM_FAULT_SIGBUS;
}

#ifndef io_remap_pfn_range
/*
 * Default io_remap_pfn_range(): pass @prot through pgprot_decrypted() and
 * hand everything to remap_pfn_range(), returning its result.
 */
static inline int io_remap_pfn_range(struct vm_area_struct *vma,
                                     unsigned long addr, unsigned long pfn,
                                     unsigned long size, pgprot_t prot)
{
        pgprot_t decrypted_prot = pgprot_decrypted(prot);

        return remap_pfn_range(vma, addr, pfn, size, decrypted_prot);
}
#endif

/*
 * Convert an errno value to a vm_fault_t: -ENOMEM becomes VM_FAULT_OOM,
 * -EHWPOISON becomes VM_FAULT_HWPOISON, everything else VM_FAULT_SIGBUS.
 */
static inline vm_fault_t vmf_error(int err)
{
        switch (err) {
        case -ENOMEM:
                return VM_FAULT_OOM;
        case -EHWPOISON:
                return VM_FAULT_HWPOISON;
        default:
                return VM_FAULT_SIGBUS;
        }
}

/*
 * Map an errno to the vm_fault_t returned from ->page_mkwrite() calls:
 * 0 -> VM_FAULT_LOCKED, -EFAULT/-EAGAIN -> VM_FAULT_NOPAGE,
 * -ENOMEM -> VM_FAULT_OOM, anything else -> VM_FAULT_SIGBUS.
 *
 * This should eventually be merged with vmf_error(), but that needs a
 * careful audit of all vmf_error() callers first.
 */
static inline vm_fault_t vmf_fs_error(int err)
{
        switch (err) {
        case 0:
                return VM_FAULT_LOCKED;
        case -EFAULT:
        case -EAGAIN:
                return VM_FAULT_NOPAGE;
        case -ENOMEM:
                return VM_FAULT_OOM;
        default:
                /* -ENOSPC, -EDQUOT, -EIO ... */
                return VM_FAULT_SIGBUS;
        }
}

struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                         unsigned int foll_flags);

/*
 * Convert a vm_fault_t bitmask to an errno, checked in this order:
 *   VM_FAULT_OOM                      -> -ENOMEM
 *   VM_FAULT_HWPOISON[_LARGE]         -> -EHWPOISON if @foll_flags has
 *                                        FOLL_HWPOISON, else -EFAULT
 *   VM_FAULT_SIGBUS / VM_FAULT_SIGSEGV -> -EFAULT
 * Returns 0 when none of these bits is set.
 */
static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
{
        if (vm_fault & VM_FAULT_OOM)
                return -ENOMEM;

        if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) {
                if (foll_flags & FOLL_HWPOISON)
                        return -EHWPOISON;
                return -EFAULT;
        }

        if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
                return -EFAULT;

        return 0;
}

/*
 * Tells GUP whether it may follow a PROT_NONE mapped page directly, or
 * whether a (NUMA hinting) fault has to be taken first.
 */
static inline bool gup_can_follow_protnone(struct vm_area_struct *vma,
                                           unsigned int flags)
{
        /*
         * Only callers that asked to honor NUMA hinting faults need the
         * accessibility check.
         *
         * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs.
         * Demanding a fault for those would leave FOLL_FORCE unable to make
         * progress, because handle_mm_fault() refuses to process NUMA
         * hinting faults in inaccessible VMAs.
         */
        if (flags & FOLL_HONOR_NUMA_FAULT)
                return !vma_is_accessible(vma);

        /* Caller does not care about NUMA hinting faults: always follow. */
        return true;
}

typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
                               unsigned long size, pte_fn_t fn, void *data);
extern int apply_to_existing_page_range(struct mm_struct *mm,
                                   unsigned long address, unsigned long size,
                                   pte_fn_t fn, void *data);

#ifdef CONFIG_PAGE_POISONING
extern void __kernel_poison_pages(struct page *page, int numpages);
extern void __kernel_unpoison_pages(struct page *page, int numpages);
extern bool _page_poisoning_enabled_early;
DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled);
/* Report the plain boolean _page_poisoning_enabled_early. */
static inline bool page_poisoning_enabled(void)
{
        return _page_poisoning_enabled_early;
}
/*
 * For use in fast paths after init_mem_debugging() has run, or when a
 * false negative result is not harmful when called too early.
 *
 * Tests the _page_poisoning_enabled static key (hinted as unlikely).
 */
static inline bool page_poisoning_enabled_static(void)
{
        return static_branch_unlikely(&_page_poisoning_enabled);
}
/*
 * Forward to __kernel_poison_pages() for @numpages pages starting at @page,
 * but only when page_poisoning_enabled_static() reports poisoning enabled.
 */
static inline void kernel_poison_pages(struct page *page, int numpages)
{
        if (!page_poisoning_enabled_static())
                return;

        __kernel_poison_pages(page, numpages);
}
/*
 * Forward to __kernel_unpoison_pages() for @numpages pages starting at
 * @page, but only when page_poisoning_enabled_static() reports poisoning
 * enabled.
 */
static inline void kernel_unpoison_pages(struct page *page, int numpages)
{
        if (!page_poisoning_enabled_static())
                return;

        __kernel_unpoison_pages(page, numpages);
}
#else
/*
 * !CONFIG_PAGE_POISONING stubs: poisoning is reported as disabled and the
 * poison/unpoison helpers do nothing.
 */
static inline bool page_poisoning_enabled(void) { return false; }
static inline bool page_poisoning_enabled_static(void) { return false; }
static inline void __kernel_poison_pages(struct page *page, int numpages) { }
static inline void kernel_poison_pages(struct page *page, int numpages) { }
static inline void kernel_unpoison_pages(struct page *page, int numpages) { }
#endif

DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
/*
 * True when the init_on_alloc static key is enabled, or otherwise when
 * @flags contains __GFP_ZERO.
 */
static inline bool want_init_on_alloc(gfp_t flags)
{
        if (!static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
                                 &init_on_alloc))
                return flags & __GFP_ZERO;

        return true;
}

DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
/* Test the init_on_free static key via static_branch_maybe(). */
static inline bool want_init_on_free(void)
{
        return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
                                   &init_on_free);
}

extern bool _debug_pagealloc_enabled_early;
DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);

/*
 * True only when CONFIG_DEBUG_PAGEALLOC is built in and the boolean
 * _debug_pagealloc_enabled_early is set.
 */
static inline bool debug_pagealloc_enabled(void)
{
        bool enabled = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC);

        /* The variable is only consulted when the option is built in. */
        if (enabled)
                enabled = _debug_pagealloc_enabled_early;

        return enabled;
}

/*
 * Static-key flavour of debug_pagealloc_enabled(), meant for fast paths
 * once mem_debugging_and_hardening_init() has run, or for callers that can
 * tolerate a false negative when invoked too early.
 */
static inline bool debug_pagealloc_enabled_static(void)
{
        return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
                static_branch_unlikely(&_debug_pagealloc_enabled);
}

/*
 * To support DEBUG_PAGEALLOC architecture must ensure that
 * __kernel_map_pages() never fails
 */
extern void __kernel_map_pages(struct page *page, int numpages, int enable);
#ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * When debug_pagealloc_enabled_static() is true, call
 * __kernel_map_pages() on @numpages pages from @page with enable = 1.
 */
static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
{
        if (!debug_pagealloc_enabled_static())
                return;

        __kernel_map_pages(page, numpages, 1);
}

/*
 * When debug_pagealloc_enabled_static() is true, call
 * __kernel_map_pages() on @numpages pages from @page with enable = 0.
 */
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
{
        if (!debug_pagealloc_enabled_static())
                return;

        __kernel_map_pages(page, numpages, 0);
}

extern unsigned int _debug_guardpage_minorder;
DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled);

/* Accessor for the global _debug_guardpage_minorder. */
static inline unsigned int debug_guardpage_minorder(void)
{
        return _debug_guardpage_minorder;
}

/* Test the _debug_guardpage_enabled static key (hinted as unlikely). */
static inline bool debug_guardpage_enabled(void)
{
        return static_branch_unlikely(&_debug_guardpage_enabled);
}

/*
 * True when guard pages are enabled (debug_guardpage_enabled()) and
 * PageGuard() reports @page as a guard page.
 */
static inline bool page_is_guard(struct page *page)
{
        return debug_guardpage_enabled() && PageGuard(page);
}

bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order);
/*
 * Call __set_page_guard() and return its result, but only when guard pages
 * are enabled; otherwise return false without touching @page.
 */
static inline bool set_page_guard(struct zone *zone, struct page *page,
                                  unsigned int order)
{
        return debug_guardpage_enabled() &&
                __set_page_guard(zone, page, order);
}

void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order);
/* Call __clear_page_guard() only when guard pages are enabled. */
static inline void clear_page_guard(struct zone *zone, struct page *page,
                                    unsigned int order)
{
        if (debug_guardpage_enabled())
                __clear_page_guard(zone, page, order);
}

#else        /* CONFIG_DEBUG_PAGEALLOC */
/*
 * !CONFIG_DEBUG_PAGEALLOC stubs: the map/unmap and clear helpers do nothing,
 * the minimum guard order is 0, and every guard-page query or request
 * returns false.
 */
static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {}
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {}
static inline unsigned int debug_guardpage_minorder(void) { return 0; }
static inline bool debug_guardpage_enabled(void) { return false; }
static inline bool page_is_guard(struct page *page) { return false; }
static inline bool set_page_guard(struct zone *zone, struct page *page,
                        unsigned int order) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
                                unsigned int order) {}
#endif        /* CONFIG_DEBUG_PAGEALLOC */

#ifdef __HAVE_ARCH_GATE_AREA
extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
extern int in_gate_area_no_mm(unsigned long addr);
extern int in_gate_area(struct mm_struct *mm, unsigned long addr);
#else
/*
 * Fallbacks used when __HAVE_ARCH_GATE_AREA is not defined: there is no gate
 * VMA (NULL) and no address is reported as being in a gate area (0).
 */
static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
        return NULL;
}
static inline int in_gate_area_no_mm(unsigned long addr) { return 0; }
static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
        return 0;
}
#endif        /* __HAVE_ARCH_GATE_AREA */

extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);

#ifdef CONFIG_SYSCTL
extern int sysctl_drop_caches;
int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
#endif

void drop_slab(void);

#ifndef CONFIG_MMU
#define randomize_va_space 0
#else
extern int randomize_va_space;
#endif

const char * arch_vma_name(struct vm_area_struct *vma);
#ifdef CONFIG_MMU
void print_vma_addr(char *prefix, unsigned long rip);
#else
/* !CONFIG_MMU variant: prints nothing. */
static inline void print_vma_addr(char *prefix, unsigned long rip)
{
}
#endif

void *sparse_buffer_alloc(unsigned long size);
struct page * __populate_section_memmap(unsigned long pfn,
                unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
                struct dev_pagemap *pgmap);
void pmd_init(void *addr);
void pud_init(void *addr);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
                            struct vmem_altmap *altmap, struct page *reuse);
void *vmemmap_alloc_block(unsigned long size, int node);
struct vmem_altmap;
void *vmemmap_alloc_block_buf(unsigned long size, int node,
                              struct vmem_altmap *altmap);
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
void vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
                     unsigned long addr, unsigned long next);
int vmemmap_check_pmd(pmd_t *pmd, int node,
                      unsigned long addr, unsigned long next);
int vmemmap_populate_basepages(unsigned long start, unsigned long end,
                               int node, struct vmem_altmap *altmap);
int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
                               int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
                struct vmem_altmap *altmap);
void vmemmap_populate_print_last(void);
#ifdef CONFIG_MEMORY_HOTPLUG
void vmemmap_free(unsigned long start, unsigned long end,
                struct vmem_altmap *altmap);
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Offset contributed by @altmap: reserve + free when @altmap is non-NULL,
 * 0 when it is NULL.
 */
static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
        if (!altmap)
                return 0;

        /* number of pfns from base where pfn_to_page() is valid */
        return altmap->reserve + altmap->free;
}

/*
 * Subtract @nr_pfns from altmap->alloc.  @altmap is dereferenced without a
 * NULL check, so callers must pass a valid pointer.
 */
static inline void vmem_altmap_free(struct vmem_altmap *altmap,
                                    unsigned long nr_pfns)
{
        altmap->alloc -= nr_pfns;
}
#else
/* !CONFIG_SPARSEMEM_VMEMMAP variant: the altmap offset is always 0. */
static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
        return 0;
}

/* !CONFIG_SPARSEMEM_VMEMMAP variant: nothing to account, does nothing. */
static inline void vmem_altmap_free(struct vmem_altmap *altmap,
                                    unsigned long nr_pfns)
{
}
#endif

#define VMEMMAP_RESERVE_NR        2
#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
/*
 * Generic rule for whether the vmemmap of @pgmap can be optimized.
 *
 * Returns false when @pgmap is NULL, when sizeof(struct page) is not a power
 * of two, or when an @altmap is supplied.  Otherwise returns true only if the
 * vmemmap (pgmap_vmemmap_nr() struct pages, in whole pages) is strictly
 * larger than VMEMMAP_RESERVE_NR pages.
 */
static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
                                          struct dev_pagemap *pgmap)
{
        unsigned long nr_pages;
        unsigned long vmemmap_bytes;

        if (!pgmap)
                return false;
        if (!is_power_of_2(sizeof(struct page)))
                return false;

        nr_pages = pgmap_vmemmap_nr(pgmap);
        vmemmap_bytes = nr_pages * sizeof(struct page);

        if (altmap)
                return false;

        /*
         * For vmemmap optimization with DAX we need minimum 2 vmemmap
         * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst
         */
        return (vmemmap_bytes >> PAGE_SHIFT) > VMEMMAP_RESERVE_NR;
}
/*
 * If we don't have an architecture override, use the generic rule
 */
#ifndef vmemmap_can_optimize
#define vmemmap_can_optimize __vmemmap_can_optimize
#endif

#else
/*
 * Variant used when CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP is not set:
 * optimization is never possible.
 */
static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
                                           struct dev_pagemap *pgmap)
{
        return false;
}
#endif

void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
                                  unsigned long nr_pages);

/*
 * Single-bit flags (bits 0-7), so values can be OR-ed together.
 * NOTE(review): presumably these are the values accepted by the flags /
 * mf_flags parameters of the memory-failure functions declared below --
 * confirm against the implementation.
 */
enum mf_flags {
        MF_COUNT_INCREASED = 1 << 0,
        MF_ACTION_REQUIRED = 1 << 1,
        MF_MUST_KILL = 1 << 2,
        MF_SOFT_OFFLINE = 1 << 3,
        MF_UNPOISON = 1 << 4,
        MF_SW_SIMULATED = 1 << 5,
        MF_NO_RETRY = 1 << 6,
        MF_MEM_PRE_REMOVE = 1 << 7,
};
int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
                      unsigned long count, int mf_flags);
extern int memory_failure(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Sysfs entries for memory failure handling statistics.
 */
extern const struct attribute_group memory_failure_attr_group;
extern void memory_failure_queue(unsigned long pfn, int flags);
extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                        bool *migratable_cleared);
void num_poisoned_pages_inc(unsigned long pfn);
void num_poisoned_pages_sub(unsigned long pfn, long i);
struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
#else
/* !CONFIG_MEMORY_FAILURE variant: the request is dropped. */
static inline void memory_failure_queue(unsigned long pfn, int flags)
{
}

/*
 * !CONFIG_MEMORY_FAILURE variant: always returns 0 and leaves
 * *@migratable_cleared untouched.
 */
static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                        bool *migratable_cleared)
{
        return 0;
}

/* !CONFIG_MEMORY_FAILURE variant: no counter to update. */
static inline void num_poisoned_pages_inc(unsigned long pfn)
{
}

/* !CONFIG_MEMORY_FAILURE variant: no counter to update. */
static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
{
}
#endif

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM)
void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
                     struct vm_area_struct *vma, struct list_head *to_kill,
                     unsigned long ksm_addr);
#endif

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
extern void memblk_nr_poison_inc(unsigned long pfn);
extern void memblk_nr_poison_sub(unsigned long pfn, long i);
#else
/*
 * No-op variant used unless both CONFIG_MEMORY_FAILURE and
 * CONFIG_MEMORY_HOTPLUG are set.
 */
static inline void memblk_nr_poison_inc(unsigned long pfn)
{
}

/*
 * No-op variant used unless both CONFIG_MEMORY_FAILURE and
 * CONFIG_MEMORY_HOTPLUG are set.
 */
static inline void memblk_nr_poison_sub(unsigned long pfn, long i)
{
}
#endif

#ifndef arch_memory_failure
/*
 * Default used when no arch_memory_failure macro is defined: always fails
 * with -ENXIO, ignoring @pfn and @flags.
 */
static inline int arch_memory_failure(unsigned long pfn, int flags)
{
        return -ENXIO;
}
#endif

#ifndef arch_is_platform_page
/*
 * Default used when no arch_is_platform_page macro is defined: no physical
 * address is treated as a platform page.
 */
static inline bool arch_is_platform_page(u64 paddr)
{
        return false;
}
#endif

/*
 * Error handlers for various types of pages.
 *
 * Outcome codes; the enumerators take the implicit values 0..3 in the order
 * listed.
 */
enum mf_result {
        MF_IGNORED,        /* Error: cannot be handled */
        MF_FAILED,        /* Error: handling failed */
        MF_DELAYED,        /* Will be handled later */
        MF_RECOVERED,        /* Successfully recovered */
};

/*
 * Enumerators take implicit consecutive values starting at 0, so reordering
 * or inserting entries changes every later value.
 * NOTE(review): by name these presumably classify the kind of page a memory
 * failure was reported on -- confirm against the users of this enum.
 */
enum mf_action_page_type {
        MF_MSG_KERNEL,
        MF_MSG_KERNEL_HIGH_ORDER,
        MF_MSG_SLAB,
        MF_MSG_DIFFERENT_COMPOUND,
        MF_MSG_HUGE,
        MF_MSG_FREE_HUGE,
        MF_MSG_UNMAP_FAILED,
        MF_MSG_DIRTY_SWAPCACHE,
        MF_MSG_CLEAN_SWAPCACHE,
        MF_MSG_DIRTY_MLOCKED_LRU,
        MF_MSG_CLEAN_MLOCKED_LRU,
        MF_MSG_DIRTY_UNEVICTABLE_LRU,
        MF_MSG_CLEAN_UNEVICTABLE_LRU,
        MF_MSG_DIRTY_LRU,
        MF_MSG_CLEAN_LRU,
        MF_MSG_TRUNCATED_LRU,
        MF_MSG_BUDDY,
        MF_MSG_DAX,
        MF_MSG_UNSPLIT_THP,
        MF_MSG_UNKNOWN,
};

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
extern void clear_huge_page(struct page *page,
                            unsigned long addr_hint,
                            unsigned int pages_per_huge_page);
int copy_user_large_folio(struct folio *dst, struct folio *src,
                          unsigned long addr_hint,
                          struct vm_area_struct *vma);
long copy_folio_from_user(struct folio *dst_folio,
                           const void __user *usr_src,
                           bool allow_pagefault);

/**
 * vma_is_special_huge - Are transhuge page-table entries considered special?
 * @vma: Pointer to the struct vm_area_struct to consider
 *
 * Decides whether transhuge page-table entries count as "special" in the
 * sense defined by vm_normal_page(): that is the case for a DAX VMA, and for
 * a file-backed VMA that has VM_PFNMAP or VM_MIXEDMAP set.
 *
 * Return: true if transhuge page-table entries should be considered special,
 * false otherwise.
 */
static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
{
        bool special = vma_is_dax(vma);

        if (!special && vma->vm_file)
                special = vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP);

        return special;
}

#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#if MAX_NUMNODES > 1
void __init setup_nr_node_ids(void);
#else
/* Variant used when MAX_NUMNODES is not greater than 1: nothing to set up. */
static inline void setup_nr_node_ids(void) {}
#endif

extern int memcmp_pages(struct page *page1, struct page *page2);

/*
 * Returns 1 when memcmp_pages() reports no difference between @page1 and
 * @page2 (i.e. it returns 0), and 0 otherwise.
 */
static inline int pages_identical(struct page *page1, struct page *page2)
{
        return memcmp_pages(page1, page2) == 0;
}

#ifdef CONFIG_MAPPING_DIRTY_HELPERS
unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
                                                pgoff_t first_index, pgoff_t nr,
                                                pgoff_t bitmap_pgoff,
                                                unsigned long *bitmap,
                                                pgoff_t *start,
                                                pgoff_t *end);

unsigned long wp_shared_mapping_range(struct address_space *mapping,
                                      pgoff_t first_index, pgoff_t nr);
#endif

extern int sysctl_nr_trim_pages;

#ifdef CONFIG_PRINTK
void mem_dump_obj(void *object);
#else
/* !CONFIG_PRINTK variant: nothing is dumped. */
static inline void mem_dump_obj(void *object) {}
#endif

/**
 * seal_check_write - Apply F_SEAL_WRITE / F_SEAL_FUTURE_WRITE rules to a vma.
 * @seals: the seals to check
 * @vma: the vma to operate on
 *
 * When either write seal is present in @seals, a shared and writable @vma is
 * rejected, and a shared non-writable @vma loses VM_MAYWRITE.  Private
 * mappings, and any mapping when no write seal is set, are left untouched.
 *
 * Return: 0 if the check passes, -EPERM if the mapping is not allowed.
 */
static inline int seal_check_write(int seals, struct vm_area_struct *vma)
{
        /* No write seal active: nothing to enforce. */
        if (!(seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)))
                return 0;

        /*
         * Private mappings are not restricted and keep VM_MAYWRITE, because
         * they must stay COW-writable.
         */
        if (!(vma->vm_flags & VM_SHARED))
                return 0;

        /*
         * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
         * write seals are active.
         */
        if (vma->vm_flags & VM_WRITE)
                return -EPERM;

        /*
         * An F_SEAL_[FUTURE_]WRITE sealed memfd can still be mapped
         * MAP_SHARED and read-only; drop VM_MAYWRITE so mprotect cannot
         * later revert the protection on such a mapping.
         */
        vm_flags_clear(vma, VM_MAYWRITE);

        return 0;
}

#ifdef CONFIG_ANON_VMA_NAME
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
                          unsigned long len_in,
                          struct anon_vma_name *anon_name);
#else
/*
 * !CONFIG_ANON_VMA_NAME variant: ignores all arguments and reports
 * success (0).
 */
static inline int
madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
                      unsigned long len_in, struct anon_vma_name *anon_name) {
        return 0;
}
#endif

#ifdef CONFIG_UNACCEPTED_MEMORY

bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end);
void accept_memory(phys_addr_t start, phys_addr_t end);

#else

/*
 * !CONFIG_UNACCEPTED_MEMORY variant: no range ever contains unaccepted
 * memory.
 */
static inline bool range_contains_unaccepted_memory(phys_addr_t start,
                                                    phys_addr_t end)
{
        return false;
}

/* !CONFIG_UNACCEPTED_MEMORY variant: nothing to accept. */
static inline void accept_memory(phys_addr_t start, phys_addr_t end)
{
}

#endif

static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
{
        phys_addr_t paddr = pfn << PAGE_SHIFT;

        return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
}

void vma_pgtable_walk_begin(struct vm_area_struct *vma);
void vma_pgtable_walk_end(struct vm_area_struct *vma);

#endif /* _LINUX_MM_H */































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PKEYS_H
#define _ASM_X86_PKEYS_H

/*
 * If more than 16 keys are ever supported, a thorough audit
 * will be necessary to ensure that the types that store key
 * numbers and masks have sufficient capacity.
 */
#define arch_max_pkey() (cpu_feature_enabled(X86_FEATURE_OSPKE) ? 16 : 1)

extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
                unsigned long init_val);

/* True when the X86_FEATURE_OSPKE CPU feature is enabled. */
static inline bool arch_pkeys_enabled(void)
{
        return cpu_feature_enabled(X86_FEATURE_OSPKE);
}

/*
 * Try to dedicate one of the protection keys to be used as an
 * execute-only protection key.
 *
 * Without X86_FEATURE_OSPKE the answer is always ARCH_DEFAULT_PKEY;
 * otherwise the decision is delegated to __execute_only_pkey().
 */
extern int __execute_only_pkey(struct mm_struct *mm);
static inline int execute_only_pkey(struct mm_struct *mm)
{
        if (cpu_feature_enabled(X86_FEATURE_OSPKE))
                return __execute_only_pkey(mm);

        return ARCH_DEFAULT_PKEY;
}

extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
                int prot, int pkey);
/*
 * Returns 0 when X86_FEATURE_OSPKE is not enabled; otherwise returns the
 * result of __arch_override_mprotect_pkey() for the same arguments.
 */
static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
                int prot, int pkey)
{
        if (cpu_feature_enabled(X86_FEATURE_OSPKE))
                return __arch_override_mprotect_pkey(vma, prot, pkey);

        return 0;
}

#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)

/*
 * Accessors for the per-mm pkey allocation bitmap: bit N set means pkey N is
 * marked allocated.  Arguments are parenthesized so that expression
 * arguments (e.g. "&mm_struct_var" or "cond ? a : b") expand correctly.
 */
#define mm_pkey_allocation_map(mm)        ((mm)->context.pkey_allocation_map)
#define mm_set_pkey_allocated(mm, pkey) do {                \
        mm_pkey_allocation_map(mm) |= (1U << (pkey));        \
} while (0)
#define mm_set_pkey_free(mm, pkey) do {                        \
        mm_pkey_allocation_map(mm) &= ~(1U << (pkey));        \
} while (0)

/*
 * Report whether @pkey is allocated in @mm as far as the user interfaces
 * are concerned.
 *
 * "Allocated" pkeys are those that have been returned from pkey_alloc(), or
 * pkey 0, which is allocated implicitly when the mm is created.  Keys outside
 * [0, arch_max_pkey()) are never allocated.
 */
static inline
bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
{
        const bool in_range = pkey >= 0 && pkey < arch_max_pkey();

        if (!in_range)
                return false;

        /*
         * The exec-only pkey has its bit set in the allocation map, yet it
         * must not be usable through user interfaces such as
         * mprotect_pkey(), so it is reported as not allocated.
         */
        if (mm->context.execute_only_pkey == pkey)
                return false;

        return mm_pkey_allocation_map(mm) & (1U << pkey);
}

/*
 * Allocate a pkey: pick the first zero bit of the allocation map (as found
 * by ffz()), mark it allocated and return its index.  Returns -1 when every
 * key below arch_max_pkey() is already taken.
 */
static inline
int mm_pkey_alloc(struct mm_struct *mm)
{
        /*
         * Note: this is the one and only place where the pkey is checked
         * for validity as far as the hardware is concerned.  The rest of
         * the kernel trusts that only good, valid pkeys come out of here.
         */
        u16 full_map = ((1U << arch_max_pkey()) - 1);
        int key;

        /*
         * Running out of pkeys needs its own check: ffz() behavior is
         * undefined if there are no zeros.
         */
        if (mm_pkey_allocation_map(mm) == full_map)
                return -1;

        key = ffz(mm_pkey_allocation_map(mm));
        mm_set_pkey_allocated(mm, key);

        return key;
}

/*
 * Release @pkey in the allocation map of @mm.
 *
 * Returns 0 once the bit has been cleared, or -EINVAL when
 * mm_pkey_is_allocated() does not consider @pkey allocated (in which
 * case the map is left untouched).
 */
static inline
int mm_pkey_free(struct mm_struct *mm, int pkey)
{
        if (mm_pkey_is_allocated(mm, pkey)) {
                mm_set_pkey_free(mm, pkey);
                return 0;
        }

        return -EINVAL;
}

/*
 * Extract the pkey stored in @vma's vm_flags: keep only the four
 * VM_PKEY_BIT* bits and shift them down by VM_PKEY_SHIFT.
 */
static inline int vma_pkey(struct vm_area_struct *vma)
{
        const unsigned long pkey_field_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
                                              VM_PKEY_BIT2 | VM_PKEY_BIT3;
        const unsigned long pkey_field = vma->vm_flags & pkey_field_mask;

        return pkey_field >> VM_PKEY_SHIFT;
}

#endif /*_ASM_X86_PKEYS_H */









































    4 



    7 
    7 

    8 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * generic net pointers
 */

#ifndef __NET_GENERIC_H__
#define __NET_GENERIC_H__

#include <linux/bug.h>
#include <linux/rcupdate.h>
#include <net/net_namespace.h>

/*
 * Generic net pointers are to be used by modules to put some private
 * stuff on the struct net without explicit struct net modification
 *
 * The rules are simple:
 * 1. set pernet_operations->id.  After register_pernet_device you
 *    will have the id of your private pointer.
 * 2. set pernet_operations->size to have the code allocate and free
 *    a private structure pointed to from struct net.
 * 3. do not change this pointer while the net is alive.
 * 4. do not try to have any private reference on the net_generic object.
 *
 * After accomplishing all of the above, the private pointer can be
 * accessed with the net_generic() call.
 */

/*
 * Per-net table of generic pointers; net_generic() returns ptr[id]
 * from the table reachable through net->gen.
 *
 * The bookkeeping header `s` and the `ptr` array are members of the
 * same anonymous union, so they overlay each other in memory.
 */
struct net_generic {
        union {
                struct {
                        /* NOTE(review): presumably the number of ptr[] slots -- confirm against the allocator */
                        unsigned int len;
                        /* NOTE(review): presumably used for RCU-deferred freeing of the table -- confirm */
                        struct rcu_head rcu;
                } s;

                /* Array of void * slots indexed by id in net_generic(). */
                DECLARE_FLEX_ARRAY(void *, ptr);
        };
};

/*
 * Look up the generic pointer stored under @id for @net.
 *
 * net->gen is fetched with rcu_dereference() inside an RCU read-side
 * critical section and the ptr[@id] slot is loaded before that
 * section is left.  @id is used as the array index as-is; no range
 * check is performed here.
 */
static inline void *net_generic(const struct net *net, unsigned int id)
{
        struct net_generic *gen;
        void *priv;

        rcu_read_lock();
        gen = rcu_dereference(net->gen);
        priv = gen->ptr[id];
        rcu_read_unlock();

        return priv;
}
#endif











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 
    2 





































































































































































































































































































































































































































































































































































    8 
    7 



    8 




















    3 
    3 










    2 




































































































































































































    1 


    1 






































































































































































































































































    1 


    1 





































































































    2 


    2 

















    2 


    2 






































































































































































































































































































































































































































































    2 
    2 






















    9 
    9 












    8 
    8 



    8 












    3 
    3 









    2 


    3 



    2 


















    2 
    2 



























    1 







































    1 
    1 












    1 
    1 














































































































    1 


    1 



    1 














    1 
    1 






















































































































































































































































































































































































    1 

    1 




































































































































































































    1 
    1 



































































































































































































































































































































































    9 
    8 

    8 























































































































































































































    2 
    2 
















































































































































































































































    3 
    4 






















    3 
    4 














    1 
    1 

















    1 
    1 















    1 
    1 












    1 
    1 














    2 
    2 













    6 
    7 















    5 
    6 












    1 
    1 











    1 
    1 














    1 
    1 













    2 
    2 













    1 
    2 















    1 
    1 









































    2 
    2 















    6 
    5 





















    2 
    2 











    1 
    1 


























    2 
    2 



















































































































































































    1 
    1 



















    1 
    1 




























































































































































































































































    1 
    1 














































    1 
    1 















































































































































































    1 
    1 



























    1 
    1 

































    1 
    1 















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Security plug functions
 *
 * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com>
 * Copyright (C) 2001-2002 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com>
 * Copyright (C) 2016 Mellanox Technologies
 * Copyright (C) 2023 Microsoft Corporation <paul@paul-moore.com>
 */

#define pr_fmt(fmt) "LSM: " fmt

#include <linux/bpf.h>
#include <linux/capability.h>
#include <linux/dcache.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
#include <linux/lsm_hooks.h>
#include <linux/fsnotify.h>
#include <linux/mman.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/backing-dev.h>
#include <linux/string.h>
#include <linux/xattr.h>
#include <linux/msg.h>
#include <linux/overflow.h>
#include <net/flow.h>

/*
 * How many LSMs were built into the kernel?
 *
 * Evaluates to the number of struct lsm_info entries between
 * __start_lsm_info and __end_lsm_info, the same bounds the loops in
 * this file iterate over.
 */
#define LSM_COUNT (__end_lsm_info - __start_lsm_info)

/*
 * How many LSMs are built into the kernel as determined at
 * build time. Used to determine fixed array sizes, such as
 * that of lsm_idlist[].
 * The capability module is accounted for by CONFIG_SECURITY
 *
 * Each enabled config option contributes exactly one slot.
 */
#define LSM_CONFIG_COUNT ( \
        (IS_ENABLED(CONFIG_SECURITY) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_SELINUX) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_SMACK) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_TOMOYO) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_APPARMOR) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_YAMA) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_LOADPIN) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_SAFESETID) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_LOCKDOWN_LSM) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_BPF_LSM) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_LANDLOCK) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_IMA) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_EVM) ? 1 : 0))

/*
 * These are descriptions of the reasons that can be passed to the
 * security_locked_down() LSM hook. Placing this array here allows
 * all security modules to use the same descriptions for auditing
 * purposes.
 *
 * Entries are keyed by the LOCKDOWN_* constants through designated
 * initializers, and the array holds LOCKDOWN_CONFIDENTIALITY_MAX + 1
 * slots so that the last constant is a valid index.
 */
const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = {
        [LOCKDOWN_NONE] = "none",
        [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading",
        [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port",
        [LOCKDOWN_EFI_TEST] = "/dev/efi_test access",
        [LOCKDOWN_KEXEC] = "kexec of unsigned images",
        [LOCKDOWN_HIBERNATION] = "hibernation",
        [LOCKDOWN_PCI_ACCESS] = "direct PCI access",
        [LOCKDOWN_IOPORT] = "raw io port access",
        [LOCKDOWN_MSR] = "raw MSR access",
        [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables",
        [LOCKDOWN_DEVICE_TREE] = "modifying device tree contents",
        [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage",
        [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO",
        [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters",
        [LOCKDOWN_MMIOTRACE] = "unsafe mmio",
        [LOCKDOWN_DEBUGFS] = "debugfs access",
        [LOCKDOWN_XMON_WR] = "xmon write access",
        [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM",
        [LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM",
        [LOCKDOWN_RTAS_ERROR_INJECTION] = "RTAS error injection",
        [LOCKDOWN_INTEGRITY_MAX] = "integrity",
        [LOCKDOWN_KCORE] = "/proc/kcore access",
        [LOCKDOWN_KPROBES] = "use of kprobes",
        [LOCKDOWN_BPF_READ_KERNEL] = "use of bpf to read kernel RAM",
        [LOCKDOWN_DBG_READ_KERNEL] = "use of kgdb/kdb to read kernel RAM",
        [LOCKDOWN_PERF] = "unsafe use of perf",
        [LOCKDOWN_TRACEFS] = "use of tracefs",
        [LOCKDOWN_XMON_RW] = "xmon read and write access",
        [LOCKDOWN_XFRM_SECRET] = "xfrm SA secret",
        [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality",
};

/* Per-hook list heads; each one is initialized in early_security_init(). */
struct security_hook_heads security_hook_heads __ro_after_init;
/* Chain used by the {call,register,unregister}_blocking_lsm_notifier() helpers. */
static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain);

/*
 * Slab caches for the composite file and inode blobs.  Each is created in
 * ordered_lsm_init() only when the matching blob size is non-zero, so a NULL
 * cache means "no blob needed" to lsm_file_alloc() / lsm_inode_alloc().
 */
static struct kmem_cache *lsm_file_cache;
static struct kmem_cache *lsm_inode_cache;

/* Comma-separated list of LSM names, grown by lsm_append(). */
char *lsm_names;
/* Running blob size totals, accumulated by lsm_set_blob_sizes(). */
static struct lsm_blob_sizes blob_sizes __ro_after_init;

/*
 * Boot-time LSM user choice: chosen_lsm_order is set by the "lsm=" handler
 * and chosen_major_lsm by the "security=" handler.  Both stay NULL when the
 * corresponding parameter was not given.
 */
static __initdata const char *chosen_lsm_order;
static __initdata const char *chosen_major_lsm;

/* Build-time default order, used when no "lsm=" was given. */
static __initconst const char *const builtin_lsm_order = CONFIG_LSM;

/*
 * Ordered list of LSMs to initialize.  Allocated zeroed with one spare slot
 * in ordered_lsm_init(), so walkers can stop at the first NULL entry.
 */
static __initdata struct lsm_info **ordered_lsms;
/* The exclusive LSM selected by prepare_lsm(), or NULL if none yet. */
static __initdata struct lsm_info *exclusive;

/* Set by the "lsm.debug" boot parameter handler; gates init_debug() output. */
static __initdata bool debug;
/* pr_info() wrapper that only prints (and only evaluates its arguments) when debug is set. */
#define init_debug(...)                                                \
        do {                                                        \
                if (debug)                                        \
                        pr_info(__VA_ARGS__);                        \
        } while (0)

/*
 * Report whether @lsm is enabled.  An LSM without an enable variable
 * counts as disabled; otherwise the value it points at decides.
 */
static bool __init is_enabled(struct lsm_info *lsm)
{
        return lsm->enabled && *lsm->enabled;
}

/*
 * Record @enabled as the enabled state of @lsm.
 *
 * lsm_enabled_true and lsm_enabled_false are shared sentinels: an LSM
 * that has no enable variable of its own is pointed at one of them
 * instead of having a value written through its pointer.
 */
static int lsm_enabled_true __initdata = 1;
static int lsm_enabled_false __initdata = 0;
static void __init set_enabled(struct lsm_info *lsm, bool enabled)
{
        int *flag = lsm->enabled;

        /* The LSM owns a real enable variable: store the state there. */
        if (flag && flag != &lsm_enabled_true && flag != &lsm_enabled_false) {
                *flag = enabled;
                return;
        }

        /*
         * No variable configured, or already using a sentinel: never write
         * through a sentinel (they are shared), just point at the right one.
         */
        lsm->enabled = enabled ? &lsm_enabled_true : &lsm_enabled_false;
}

/* Return true if @lsm already appears in the NULL-terminated ordered_lsms list. */
static bool __init exists_ordered_lsm(struct lsm_info *lsm)
{
        struct lsm_info **slot = ordered_lsms;

        while (*slot) {
                if (*slot == lsm)
                        return true;
                slot++;
        }

        return false;
}

/*
 * Add @lsm to the end of ordered_lsms unless it is already listed.
 * @from labels the caller in the warning and debug messages.
 * last_lsm is the index of the next free slot.
 */
static int last_lsm __initdata;
static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from)
{
        /* A second selection of the same LSM is a no-op. */
        if (exists_ordered_lsm(lsm))
                return;

        /* All LSM_COUNT slots used: warn and drop the request. */
        if (WARN(last_lsm == LSM_COUNT, "%s: out of LSM slots!?\n", from))
                return;

        /* Default to enabled when the LSM has no enable variable yet. */
        if (lsm->enabled == NULL)
                lsm->enabled = &lsm_enabled_true;

        ordered_lsms[last_lsm] = lsm;
        last_lsm++;

        init_debug("%s ordered: %s (%s)\n", from, lsm->name,
                   is_enabled(lsm) ? "enabled" : "disabled");
}

/*
 * Decide whether @lsm may be initialized: it must be enabled, and if it
 * is flagged exclusive no other exclusive LSM may have been chosen yet.
 */
static bool __init lsm_allowed(struct lsm_info *lsm)
{
        /* Disabled LSMs are never allowed. */
        if (!is_enabled(lsm))
                return false;

        /* Non-exclusive LSMs, or the first exclusive one, may proceed. */
        if (!(lsm->flags & LSM_FLAG_EXCLUSIVE) || !exclusive)
                return true;

        /* An exclusive LSM was already picked; this one loses. */
        init_debug("exclusive disabled: %s\n", lsm->name);
        return false;
}

/*
 * Reserve room for one LSM inside a shared blob.
 * @need: in: bytes the LSM wants; out: offset assigned to the LSM
 * @lbs:  running total size of the blob, grown by the reservation
 *
 * Nothing is changed when the LSM asks for no space (*need <= 0).
 * The offset is the current total rounded up to pointer alignment.
 */
static void __init lsm_set_blob_size(int *need, int *lbs)
{
        int wanted = *need;
        int start;

        if (wanted <= 0)
                return;

        start = ALIGN(*lbs, sizeof(void *));
        *lbs = start + wanted;
        *need = start;
}

/*
 * Fold one LSM's blob requirements (@needed, may be NULL) into the global
 * blob_sizes totals.  On return each non-zero field of @needed holds the
 * offset assigned to that LSM inside the corresponding shared blob.
 */
static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed)
{
        struct lsm_blob_sizes *total = &blob_sizes;

        if (needed == NULL)
                return;

        lsm_set_blob_size(&needed->lbs_cred, &total->lbs_cred);
        lsm_set_blob_size(&needed->lbs_file, &total->lbs_file);

        /*
         * The first LSM to ask for inode space also causes an rcu_head to
         * be reserved at the front of the inode blob.
         */
        if (total->lbs_inode == 0 && needed->lbs_inode)
                total->lbs_inode = sizeof(struct rcu_head);
        lsm_set_blob_size(&needed->lbs_inode, &total->lbs_inode);

        lsm_set_blob_size(&needed->lbs_ipc, &total->lbs_ipc);
        lsm_set_blob_size(&needed->lbs_msg_msg, &total->lbs_msg_msg);
        lsm_set_blob_size(&needed->lbs_superblock, &total->lbs_superblock);
        lsm_set_blob_size(&needed->lbs_task, &total->lbs_task);
        lsm_set_blob_size(&needed->lbs_xattr_count, &total->lbs_xattr_count);
}

/*
 * Prepare @lsm for initialization: settle its final enabled state and,
 * if it will run, claim the exclusive slot (when applicable) and reserve
 * its blob space.
 */
static void __init prepare_lsm(struct lsm_info *lsm)
{
        bool allowed = lsm_allowed(lsm);

        /* Store the verdict so later exclusive LSMs see it. */
        set_enabled(lsm, allowed);
        if (!allowed)
                return;

        if (!exclusive && (lsm->flags & LSM_FLAG_EXCLUSIVE)) {
                exclusive = lsm;
                init_debug("exclusive chosen:   %s\n", lsm->name);
        }

        lsm_set_blob_sizes(lsm->blobs);
}

/*
 * Run the init callback of @lsm when it is enabled.  A non-zero return
 * from the callback is reported with WARN() but is otherwise ignored.
 */
static void __init initialize_lsm(struct lsm_info *lsm)
{
        int err;

        if (!is_enabled(lsm))
                return;

        init_debug("initializing %s\n", lsm->name);
        err = lsm->init();
        WARN(err, "%s failed to initialize: %d\n", lsm->name, err);
}

/*
 * Current index to use while initializing the lsm id list.
 *
 * lsm_idlist[] is filled by security_add_hooks(), which panics rather
 * than store more than LSM_CONFIG_COUNT entries.
 */
u32 lsm_active_cnt __ro_after_init;
const struct lsm_id *lsm_idlist[LSM_CONFIG_COUNT];

/*
 * Populate ordered LSMs list from comma-separated LSM name list.
 * @order:  comma-separated LSM names, in the requested order
 * @origin: label for where @order came from, used in debug messages
 *
 * The resulting order is: all LSM_ORDER_FIRST LSMs, then the
 * LSM_ORDER_MUTABLE LSMs named in @order, then the "security=" choice
 * if it is not listed yet, then all LSM_ORDER_LAST LSMs.  Every LSM
 * that did not make it into the list is disabled.
 *
 * Panics if the working copy of @order cannot be allocated.
 */
static void __init ordered_lsm_parse(const char *order, const char *origin)
{
        struct lsm_info *lsm;
        char *sep, *name, *next;

        /* LSM_ORDER_FIRST is always first. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (lsm->order == LSM_ORDER_FIRST)
                        append_ordered_lsm(lsm, "  first");
        }

        /* Process "security=", if given. */
        if (chosen_major_lsm) {
                struct lsm_info *major;

                /*
                 * To match the original "security=" behavior, this
                 * explicitly does NOT fallback to another Legacy Major
                 * if the selected one was separately disabled: disable
                 * all non-matching Legacy Major LSMs.
                 */
                for (major = __start_lsm_info; major < __end_lsm_info;
                     major++) {
                        if ((major->flags & LSM_FLAG_LEGACY_MAJOR) &&
                            strcmp(major->name, chosen_major_lsm) != 0) {
                                set_enabled(major, false);
                                init_debug("security=%s disabled: %s (only one legacy major LSM)\n",
                                           chosen_major_lsm, major->name);
                        }
                }
        }

        /*
         * strsep() modifies the string it walks, so work on a private copy.
         * A failed copy must not be ignored: strsep() would return NULL
         * right away, no name would be matched, and the final loop below
         * would then silently disable every mutable LSM.
         */
        sep = kstrdup(order, GFP_KERNEL);
        if (!sep)
                panic("%s: Cannot allocate LSM order list.\n", __func__);
        next = sep;
        /* Walk the list, looking for matching LSMs. */
        while ((name = strsep(&next, ",")) != NULL) {
                bool found = false;

                for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                        if (strcmp(lsm->name, name) == 0) {
                                /* Only mutable LSMs can be placed by name. */
                                if (lsm->order == LSM_ORDER_MUTABLE)
                                        append_ordered_lsm(lsm, origin);
                                found = true;
                        }
                }

                if (!found)
                        init_debug("%s ignored: %s (not built into kernel)\n",
                                   origin, name);
        }

        /* Process "security=", if given. */
        if (chosen_major_lsm) {
                for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                        if (exists_ordered_lsm(lsm))
                                continue;
                        if (strcmp(lsm->name, chosen_major_lsm) == 0)
                                append_ordered_lsm(lsm, "security=");
                }
        }

        /* LSM_ORDER_LAST is always last. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (lsm->order == LSM_ORDER_LAST)
                        append_ordered_lsm(lsm, "   last");
        }

        /* Disable all LSMs not in the ordered list. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (exists_ordered_lsm(lsm))
                        continue;
                set_enabled(lsm, false);
                init_debug("%s skipped: %s (not in requested order)\n",
                           origin, lsm->name);
        }

        kfree(sep);
}

/* Forward declarations: these are defined further down but used by the init code below. */
static void __init lsm_early_cred(struct cred *cred);
static void __init lsm_early_task(struct task_struct *task);

static int lsm_append(const char *new, char **result);

/*
 * Print one "initializing lsm=" line naming every enabled LSM, comma
 * separated: early LSMs first, then the ordered LSMs in list order.
 */
static void __init report_lsm_order(void)
{
        struct lsm_info **slot, *early;
        const char *delim = "";

        pr_info("initializing lsm=");

        for (early = __start_early_lsm_info;
             early < __end_early_lsm_info; early++) {
                if (!is_enabled(early))
                        continue;
                pr_cont("%s%s", delim, early->name);
                delim = ",";
        }

        for (slot = ordered_lsms; *slot; slot++) {
                if (!is_enabled(*slot))
                        continue;
                pr_cont("%s%s", delim, (*slot)->name);
                delim = ",";
        }

        pr_cont("\n");
}

/*
 * Build the ordered LSM list, size the shared blobs and initialize every
 * enabled LSM in order.
 *
 * The order comes from "lsm=" when given (in which case "security=" is
 * ignored with a warning), otherwise from the built-in CONFIG_LSM string.
 * The temporary ordered_lsms array is freed before returning.
 *
 * Panics if the ordered_lsms array cannot be allocated.
 */
static void __init ordered_lsm_init(void)
{
        struct lsm_info **lsm;

        /* One spare zeroed slot keeps the list NULL-terminated. */
        ordered_lsms = kcalloc(LSM_COUNT + 1, sizeof(*ordered_lsms),
                               GFP_KERNEL);
        /* Every user of ordered_lsms dereferences it, so fail loudly here. */
        if (!ordered_lsms)
                panic("%s: Cannot allocate ordered LSM list.\n", __func__);

        if (chosen_lsm_order) {
                if (chosen_major_lsm) {
                        pr_warn("security=%s is ignored because it is superseded by lsm=%s\n",
                                chosen_major_lsm, chosen_lsm_order);
                        chosen_major_lsm = NULL;
                }
                ordered_lsm_parse(chosen_lsm_order, "cmdline");
        } else
                ordered_lsm_parse(builtin_lsm_order, "builtin");

        /* Settle enablement and blob offsets before anything is initialized. */
        for (lsm = ordered_lsms; *lsm; lsm++)
                prepare_lsm(*lsm);

        report_lsm_order();

        init_debug("cred blob size       = %d\n", blob_sizes.lbs_cred);
        init_debug("file blob size       = %d\n", blob_sizes.lbs_file);
        init_debug("inode blob size      = %d\n", blob_sizes.lbs_inode);
        init_debug("ipc blob size        = %d\n", blob_sizes.lbs_ipc);
        init_debug("msg_msg blob size    = %d\n", blob_sizes.lbs_msg_msg);
        init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock);
        init_debug("task blob size       = %d\n", blob_sizes.lbs_task);
        init_debug("xattr slots          = %d\n", blob_sizes.lbs_xattr_count);

        /*
         * Create any kmem_caches needed for blobs
         */
        if (blob_sizes.lbs_file)
                lsm_file_cache = kmem_cache_create("lsm_file_cache",
                                                   blob_sizes.lbs_file, 0,
                                                   SLAB_PANIC, NULL);
        if (blob_sizes.lbs_inode)
                lsm_inode_cache = kmem_cache_create("lsm_inode_cache",
                                                    blob_sizes.lbs_inode, 0,
                                                    SLAB_PANIC, NULL);

        /* Give the current task and its cred their blobs before LSMs run. */
        lsm_early_cred((struct cred *) current->cred);
        lsm_early_task(current);
        for (lsm = ordered_lsms; *lsm; lsm++)
                initialize_lsm(*lsm);

        kfree(ordered_lsms);
}

/*
 * Initialize every hook list head, then prepare and initialize each
 * early LSM in turn.  Early LSMs without an enable variable default to
 * enabled.  Always returns 0.
 */
int __init early_security_init(void)
{
        struct lsm_info *early = __start_early_lsm_info;

        /* Expand one INIT_HLIST_HEAD() per hook listed in lsm_hook_defs.h. */
#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
        INIT_HLIST_HEAD(&security_hook_heads.NAME);
#include "linux/lsm_hook_defs.h"
#undef LSM_HOOK

        while (early < __end_early_lsm_info) {
                if (early->enabled == NULL)
                        early->enabled = &lsm_enabled_true;
                prepare_lsm(early);
                initialize_lsm(early);
                early++;
        }

        return 0;
}

/**
 * security_init - initializes the security framework
 *
 * This should be called early in the kernel initialization sequence.
 *
 * Records the names of the enabled early LSMs in lsm_names, then
 * prepares and initializes the remaining LSMs in their configured order.
 *
 * Return: Always returns 0.
 */
int __init security_init(void)
{
        struct lsm_info *lsm;

        init_debug("legacy security=%s\n", chosen_major_lsm ? : " *unspecified*");
        init_debug("  CONFIG_LSM=%s\n", builtin_lsm_order);
        init_debug("boot arg lsm=%s\n", chosen_lsm_order ? : " *unspecified*");

        /*
         * Append the names of the early LSM modules now that kmalloc() is
         * available
         */
        for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) {
                init_debug("  early started: %s (%s)\n", lsm->name,
                           is_enabled(lsm) ? "enabled" : "disabled");
                /*
                 * Test the enabled state, not the pointer:
                 * early_security_init() gives every early LSM a non-NULL
                 * ->enabled, so a pointer test would also list early LSMs
                 * that were disabled and never initialized.
                 */
                if (is_enabled(lsm))
                        lsm_append(lsm->name, &lsm_names);
        }

        /* Load LSMs in specified order. */
        ordered_lsm_init();

        return 0;
}

/*
 * Save user chosen LSM.
 *
 * Handler for the "security=" boot parameter: stores @str (the pointer,
 * not a copy) in chosen_major_lsm and returns 1.
 */
static int __init choose_major_lsm(char *str)
{
        chosen_major_lsm = str;
        return 1;
}
__setup("security=", choose_major_lsm);

/*
 * Explicitly choose LSM initialization order.
 *
 * Handler for the "lsm=" boot parameter: stores @str (the pointer, not a
 * copy) in chosen_lsm_order and returns 1.
 */
static int __init choose_lsm_order(char *str)
{
        chosen_lsm_order = str;
        return 1;
}
__setup("lsm=", choose_lsm_order);

/*
 * Enable LSM order debugging.
 *
 * Handler for the "lsm.debug" boot parameter: turns on init_debug()
 * output.  @str is not examined.  Returns 1.
 */
static int __init enable_debug(char *str)
{
        debug = true;
        return 1;
}
__setup("lsm.debug", enable_debug);

/*
 * Return true when the final comma-separated entry of @list equals @lsm.
 * A list without any comma is treated as a single entry.  Warns and
 * returns false if either argument is NULL.
 */
static bool match_last_lsm(const char *list, const char *lsm)
{
        const char *comma;
        const char *tail;

        if (WARN_ON(!list || !lsm))
                return false;

        /* The last entry starts right after the last comma, if there is one. */
        comma = strrchr(list, ',');
        tail = comma ? comma + 1 : list;

        return strcmp(tail, lsm) == 0;
}

/*
 * Append @new to the comma-separated, heap-allocated list in *@result.
 *
 * An empty list (*@result == NULL) becomes a copy of @new.  Nothing is
 * added when @new already is the last entry.  Otherwise the list is
 * replaced by a newly allocated "<old>,<new>" string and the old one is
 * freed.
 *
 * Returns 0 on success, or -ENOMEM if memory can't be allocated (in which
 * case an existing list is left untouched).
 */
static int lsm_append(const char *new, char **result)
{
        char *joined;

        if (!*result) {
                *result = kstrdup(new, GFP_KERNEL);
                return *result ? 0 : -ENOMEM;
        }

        /* Skip the name if it was the one registered last. */
        if (match_last_lsm(*result, new))
                return 0;

        joined = kasprintf(GFP_KERNEL, "%s,%s", *result, new);
        if (!joined)
                return -ENOMEM;

        kfree(*result);
        *result = joined;
        return 0;
}

/**
 * security_add_hooks - Add a modules hooks to the hook lists.
 * @hooks: the hooks to add
 * @count: the number of hooks to add
 * @lsmid: the identification information for the security module
 *
 * Each LSM has to register its hooks with the infrastructure.
 *
 * Panics when more than LSM_CONFIG_COUNT distinct LSMs register, or when
 * the LSM name cannot be appended to lsm_names.
 */
void __init security_add_hooks(struct security_hook_list *hooks, int count,
                               const struct lsm_id *lsmid)
{
        bool repeat;
        int i;

        /*
         * LSM initialization is serialized and a module may register hooks
         * in several calls (Landlock does), so comparing against the most
         * recent id is enough to avoid recording the same LSM twice.
         */
        repeat = lsm_active_cnt != 0 &&
                 lsm_idlist[lsm_active_cnt - 1] == lsmid;
        if (!repeat) {
                if (lsm_active_cnt >= LSM_CONFIG_COUNT)
                        panic("%s Too many LSMs registered.\n", __func__);
                lsm_idlist[lsm_active_cnt++] = lsmid;
        }

        for (i = 0; i < count; i++) {
                struct security_hook_list *entry = &hooks[i];

                entry->lsmid = lsmid;
                hlist_add_tail_rcu(&entry->list, entry->head);
        }

        /*
         * No allocator yet during early_security_init(); security_init()
         * adds the early LSM names once one is available.
         */
        if (!slab_is_available())
                return;

        if (lsm_append(lsmid->name, &lsm_names) < 0)
                panic("%s - Cannot get early memory.\n", __func__);
}

/*
 * Run the blocking LSM notifier chain for @event, handing @data to each
 * notifier.  Returns whatever blocking_notifier_call_chain() returns.
 */
int call_blocking_lsm_notifier(enum lsm_event event, void *data)
{
        return blocking_notifier_call_chain(&blocking_lsm_notifier_chain,
                                            event, data);
}
EXPORT_SYMBOL(call_blocking_lsm_notifier);

/*
 * Add @nb to the blocking LSM notifier chain.  Returns the result of
 * blocking_notifier_chain_register().
 */
int register_blocking_lsm_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&blocking_lsm_notifier_chain,
                                                nb);
}
EXPORT_SYMBOL(register_blocking_lsm_notifier);

/*
 * Remove @nb from the blocking LSM notifier chain.  Returns the result of
 * blocking_notifier_chain_unregister().
 */
int unregister_blocking_lsm_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&blocking_lsm_notifier_chain,
                                                  nb);
}
EXPORT_SYMBOL(unregister_blocking_lsm_notifier);

/**
 * lsm_cred_alloc - allocate a composite cred blob
 * @cred: the cred that needs a blob
 * @gfp: allocation type
 *
 * Allocate one zeroed cred blob shared by all the modules.  When no module
 * asked for cred space, @cred->security is set to NULL instead.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_cred_alloc(struct cred *cred, gfp_t gfp)
{
        if (!blob_sizes.lbs_cred) {
                cred->security = NULL;
                return 0;
        }

        cred->security = kzalloc(blob_sizes.lbs_cred, gfp);
        return cred->security ? 0 : -ENOMEM;
}

/**
 * lsm_early_cred - during initialization allocate a composite cred blob
 * @cred: the cred that needs a blob
 *
 * Allocate the cred blob for all the modules with GFP_KERNEL, and panic
 * if that fails.
 */
static void __init lsm_early_cred(struct cred *cred)
{
        if (lsm_cred_alloc(cred, GFP_KERNEL))
                panic("%s: Early cred alloc failed.\n", __func__);
}

/**
 * lsm_file_alloc - allocate a composite file blob
 * @file: the file that needs a blob
 *
 * Allocate one zeroed file blob shared by all the modules from
 * lsm_file_cache.  When that cache was never created, @file->f_security
 * is set to NULL instead.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_file_alloc(struct file *file)
{
        if (lsm_file_cache == NULL) {
                file->f_security = NULL;
                return 0;
        }

        file->f_security = kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL);
        return file->f_security ? 0 : -ENOMEM;
}

/**
 * lsm_inode_alloc - allocate a composite inode blob
 * @inode: the inode that needs a blob
 *
 * Allocate one zeroed inode blob shared by all the modules from
 * lsm_inode_cache, using GFP_NOFS.  When that cache was never created,
 * @inode->i_security is set to NULL instead.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
int lsm_inode_alloc(struct inode *inode)
{
        if (lsm_inode_cache == NULL) {
                inode->i_security = NULL;
                return 0;
        }

        inode->i_security = kmem_cache_zalloc(lsm_inode_cache, GFP_NOFS);
        return inode->i_security ? 0 : -ENOMEM;
}

/**
 * lsm_task_alloc - allocate a composite task blob
 * @task: the task that needs a blob
 *
 * Allocate one zeroed task blob shared by all the modules.  When no module
 * asked for task space, @task->security is set to NULL instead.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_task_alloc(struct task_struct *task)
{
        if (!blob_sizes.lbs_task) {
                task->security = NULL;
                return 0;
        }

        task->security = kzalloc(blob_sizes.lbs_task, GFP_KERNEL);
        return task->security ? 0 : -ENOMEM;
}

/**
 * lsm_ipc_alloc - allocate a composite ipc blob
 * @kip: the ipc that needs a blob
 *
 * Allocate one zeroed ipc blob shared by all the modules.  When no module
 * asked for ipc space, @kip->security is set to NULL instead.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_ipc_alloc(struct kern_ipc_perm *kip)
{
        if (!blob_sizes.lbs_ipc) {
                kip->security = NULL;
                return 0;
        }

        kip->security = kzalloc(blob_sizes.lbs_ipc, GFP_KERNEL);
        return kip->security ? 0 : -ENOMEM;
}

/**
 * lsm_msg_msg_alloc - allocate a composite msg_msg blob
 * @mp: the msg_msg that needs a blob
 *
 * Allocate one zeroed msg_msg blob shared by all the modules.  When no
 * module asked for msg_msg space, @mp->security is set to NULL instead.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_msg_msg_alloc(struct msg_msg *mp)
{
        if (!blob_sizes.lbs_msg_msg) {
                mp->security = NULL;
                return 0;
        }

        mp->security = kzalloc(blob_sizes.lbs_msg_msg, GFP_KERNEL);
        return mp->security ? 0 : -ENOMEM;
}

/**
 * lsm_early_task - during initialization allocate a composite task blob
 * @task: the task that needs a blob
 *
 * Allocate the task blob for all the modules, and panic if that fails.
 */
static void __init lsm_early_task(struct task_struct *task)
{
        if (lsm_task_alloc(task))
                panic("%s: Early task alloc failed.\n", __func__);
}

/**
 * lsm_superblock_alloc - allocate a composite superblock blob
 * @sb: the superblock that needs a blob
 *
 * Allocate one zeroed superblock blob shared by all the modules.  When no
 * module asked for superblock space, @sb->s_security is set to NULL
 * instead.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_superblock_alloc(struct super_block *sb)
{
        if (!blob_sizes.lbs_superblock) {
                sb->s_security = NULL;
                return 0;
        }

        sb->s_security = kzalloc(blob_sizes.lbs_superblock, GFP_KERNEL);
        return sb->s_security ? 0 : -ENOMEM;
}

/**
 * lsm_fill_user_ctx - Fill a user space lsm_ctx structure
 * @uctx: a userspace LSM context to be filled
 * @uctx_len: available uctx size (input), used uctx size (output)
 * @val: the new LSM context value
 * @val_len: the size of the new LSM context value
 * @id: LSM id
 * @flags: LSM defined flags
 *
 * Fill all of the fields in a userspace lsm_ctx structure.  The required
 * size is the lsm_ctx header plus @val_len, rounded up to pointer
 * alignment, and it is always written back through @uctx_len, whatever
 * the return value.
 *
 * The size check comes first: if the required size exceeds the incoming
 * *@uctx_len the result is -E2BIG, even when @uctx is NULL.  If the size
 * fits and @uctx is NULL, nothing is copied and success is returned.
 *
 * Returns 0 on success, -E2BIG if userspace buffer is not large enough,
 * -EFAULT on a copyout error, -ENOMEM if memory can't be allocated.
 */
int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, u32 *uctx_len,
                      void *val, size_t val_len,
                      u64 id, u64 flags)
{
        struct lsm_ctx *nctx = NULL;
        size_t nctx_len;
        int rc = 0;

        nctx_len = ALIGN(struct_size(nctx, ctx, val_len), sizeof(void *));

        if (nctx_len > *uctx_len) {
                rc = -E2BIG;
        } else if (uctx) {
                /* Build the record in kernel memory, then copy it out whole. */
                nctx = kzalloc(nctx_len, GFP_KERNEL);
                if (!nctx) {
                        rc = -ENOMEM;
                } else {
                        nctx->id = id;
                        nctx->flags = flags;
                        nctx->len = nctx_len;
                        nctx->ctx_len = val_len;
                        memcpy(nctx->ctx, val, val_len);

                        if (copy_to_user(uctx, nctx, nctx_len))
                                rc = -EFAULT;
                }
        }

        /* nctx is still NULL on the paths that never allocated. */
        kfree(nctx);
        *uctx_len = nctx_len;
        return rc;
}

/*
 * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and
 * can be accessed with:
 *
 *        LSM_RET_DEFAULT(<hook_name>)
 *
 * The macros below define static constants for the default value of each
 * LSM hook.
 *
 * LSM_HOOK() is defined temporarily so that including lsm_hook_defs.h
 * expands to one declaration per hook, selected by the hook's return
 * type: int hooks get a "<hook_name>_default" constant, void hooks
 * expand to nothing.
 */
#define LSM_RET_DEFAULT(NAME) (NAME##_default)
#define DECLARE_LSM_RET_DEFAULT_void(DEFAULT, NAME)
#define DECLARE_LSM_RET_DEFAULT_int(DEFAULT, NAME) \
        static const int __maybe_unused LSM_RET_DEFAULT(NAME) = (DEFAULT);
#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
        DECLARE_LSM_RET_DEFAULT_##RET(DEFAULT, NAME)

#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK

/*
 * Hook list operation macros.
 *
 * Both macros walk the list security_hook_heads.FUNC and invoke the FUNC
 * callback of every entry with the given arguments.
 *
 * call_void_hook:
 *        This is a hook that does not return a value.  Every entry on the
 *        list is called, in list order.
 *
 * call_int_hook:
 *        This is a hook that returns a value.  The result starts out as
 *        LSM_RET_DEFAULT(FUNC); entries are called in list order and the walk
 *        stops at the first callback whose return value differs from that
 *        default.  The macro evaluates to the last value obtained, i.e. the
 *        default itself when the list is empty or every callback returned it.
 */

#define call_void_hook(FUNC, ...)                                \
        do {                                                        \
                struct security_hook_list *P;                        \
                                                                \
                hlist_for_each_entry(P, &security_hook_heads.FUNC, list) \
                        P->hook.FUNC(__VA_ARGS__);                \
        } while (0)

#define call_int_hook(FUNC, ...) ({                                \
        int RC = LSM_RET_DEFAULT(FUNC);                                \
        do {                                                        \
                struct security_hook_list *P;                        \
                                                                \
                hlist_for_each_entry(P, &security_hook_heads.FUNC, list) { \
                        RC = P->hook.FUNC(__VA_ARGS__);                \
                        if (RC != LSM_RET_DEFAULT(FUNC))        \
                                break;                                \
                }                                                \
        } while (0);                                                \
        RC;                                                        \
})

/* Security operations */

/**
 * security_binder_set_context_mgr() - Check if becoming binder ctx mgr is ok
 * @mgr: task credentials of current binder process
 *
 * Ask the registered LSMs whether @mgr may become the binder context manager.
 *
 * Return: Return 0 if permission is granted.
 */
int security_binder_set_context_mgr(const struct cred *mgr)
{
        int rc;

        rc = call_int_hook(binder_set_context_mgr, mgr);
        return rc;
}

/**
 * security_binder_transaction() - Check if a binder transaction is allowed
 * @from: sending process
 * @to: receiving process
 *
 * Ask the registered LSMs whether @from may invoke a binder transaction call
 * to @to.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_binder_transaction(const struct cred *from,
                                const struct cred *to)
{
        int rc = call_int_hook(binder_transaction, from, to);

        return rc;
}

/**
 * security_binder_transfer_binder() - Check if a binder transfer is allowed
 * @from: sending process
 * @to: receiving process
 *
 * Ask the registered LSMs whether @from may hand a binder reference over to
 * @to.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_binder_transfer_binder(const struct cred *from,
                                    const struct cred *to)
{
        int rc = call_int_hook(binder_transfer_binder, from, to);

        return rc;
}

/**
 * security_binder_transfer_file() - Check if a binder file xfer is allowed
 * @from: sending process
 * @to: receiving process
 * @file: file being transferred
 *
 * Ask the registered LSMs whether @from may transfer @file to @to.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_binder_transfer_file(const struct cred *from,
                                  const struct cred *to, const struct file *file)
{
        int rc = call_int_hook(binder_transfer_file, from, to, file);

        return rc;
}

/**
 * security_ptrace_access_check() - Check if tracing is allowed
 * @child: target process
 * @mode: PTRACE_MODE flags
 *
 * Check permission before allowing the current process to trace the @child
 * process.  Security modules may also want to perform a process tracing check
 * during an execve in the bprm_set_creds hook of binprm_security_ops if the
 * process is being traced and its security attributes would be changed by the
 * execve.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
        int rc = call_int_hook(ptrace_access_check, child, mode);

        return rc;
}

/**
 * security_ptrace_traceme() - Check if tracing is allowed
 * @parent: tracing process
 *
 * Before the current process presents itself to @parent for tracing, check
 * that @parent has sufficient permission to trace the current process.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ptrace_traceme(struct task_struct *parent)
{
        int rc = call_int_hook(ptrace_traceme, parent);

        return rc;
}

/**
 * security_capget() - Get the capability sets for a process
 * @target: target process
 * @effective: effective capability set
 * @inheritable: inheritable capability set
 * @permitted: permitted capability set
 *
 * Retrieve the @effective, @inheritable, and @permitted capability sets of
 * the @target process.  The hook may also perform permission checking to
 * determine if the current process is allowed to see the capability sets of
 * the @target process.
 *
 * Return: Returns 0 if the capability sets were successfully obtained.
 */
int security_capget(const struct task_struct *target,
                    kernel_cap_t *effective,
                    kernel_cap_t *inheritable,
                    kernel_cap_t *permitted)
{
        int rc;

        rc = call_int_hook(capget, target, effective, inheritable, permitted);
        return rc;
}

/**
 * security_capset() - Set the capability sets for a process
 * @new: new credentials for the target process
 * @old: current credentials of the target process
 * @effective: effective capability set
 * @inheritable: inheritable capability set
 * @permitted: permitted capability set
 *
 * Set the @effective, @inheritable, and @permitted capability sets for the
 * current process.
 *
 * Return: Returns 0 and update @new if permission is granted.
 */
int security_capset(struct cred *new, const struct cred *old,
                    const kernel_cap_t *effective,
                    const kernel_cap_t *inheritable,
                    const kernel_cap_t *permitted)
{
        int rc;

        rc = call_int_hook(capset, new, old, effective, inheritable,
                           permitted);
        return rc;
}

/**
 * security_capable() - Check if a process has the necessary capability
 * @cred: credentials to examine
 * @ns: user namespace
 * @cap: capability requested
 * @opts: capability check options
 *
 * Check whether the credentials in @cred carry the @cap capability.  @cap
 * contains the capability <include/linux/capability.h>.  @opts contains
 * options for the capable check <include/linux/security.h>.
 *
 * Return: Returns 0 if the capability is granted.
 */
int security_capable(const struct cred *cred,
                     struct user_namespace *ns,
                     int cap,
                     unsigned int opts)
{
        int rc = call_int_hook(capable, cred, ns, cap, opts);

        return rc;
}

/**
 * security_quotactl() - Check if a quotactl() syscall is allowed for this fs
 * @cmds: commands
 * @type: type
 * @id: id
 * @sb: filesystem
 *
 * Ask the registered LSMs whether the quotactl syscall is allowed for @sb.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_quotactl(int cmds, int type, int id, const struct super_block *sb)
{
        int rc = call_int_hook(quotactl, cmds, type, id, sb);

        return rc;
}

/**
 * security_quota_on() - Check if QUOTAON is allowed for a dentry
 * @dentry: dentry
 *
 * Ask the registered LSMs whether QUOTAON is allowed for @dentry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_quota_on(struct dentry *dentry)
{
        int rc = call_int_hook(quota_on, dentry);

        return rc;
}

/**
 * security_syslog() - Check if accessing the kernel message ring is allowed
 * @type: SYSLOG_ACTION_* type
 *
 * Check permission before the kernel message ring is accessed or logging to
 * the console is changed.  The @type values are explained in the syslog(2)
 * manual page.
 *
 * Return: Return 0 if permission is granted.
 */
int security_syslog(int type)
{
        int rc = call_int_hook(syslog, type);

        return rc;
}

/**
 * security_settime64() - Check if changing the system time is allowed
 * @ts: new time
 * @tz: timezone
 *
 * Check permission to change the system time.  struct timespec64 is defined
 * in <include/linux/time64.h> and timezone is defined in
 * <include/linux/time.h>.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_settime64(const struct timespec64 *ts, const struct timezone *tz)
{
        int rc = call_int_hook(settime, ts, tz);

        return rc;
}

/**
 * security_vm_enough_memory_mm() - Check if allocating a new mem map is allowed
 * @mm: mm struct
 * @pages: number of pages
 *
 * Check permissions for allocating a new virtual mapping.  If all LSMs return
 * a positive value, __vm_enough_memory() will be called with cap_sys_admin
 * set. If at least one LSM returns 0 or negative, __vm_enough_memory() will be
 * called with cap_sys_admin cleared.
 *
 * Return: The value returned by __vm_enough_memory(); 0 if permission is
 *         granted by the LSM infrastructure to the caller.
 */
int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
{
        struct security_hook_list *hp;
        int cap_sys_admin = 1;

        /*
         * Each module answers with a positive value when it agrees that
         * __vm_enough_memory() should run with cap_sys_admin set.  The
         * flag stays set only if every module agrees; the first module
         * that answers zero or negative clears it and ends the walk.
         */
        hlist_for_each_entry(hp, &security_hook_heads.vm_enough_memory, list) {
                if (hp->hook.vm_enough_memory(mm, pages) <= 0) {
                        cap_sys_admin = 0;
                        break;
                }
        }

        return __vm_enough_memory(mm, pages, cap_sys_admin);
}

/**
 * security_bprm_creds_for_exec() - Prepare the credentials for exec()
 * @bprm: binary program information
 *
 * If the setup in prepare_exec_creds did not setup @bprm->cred->security
 * properly for executing @bprm->file, update the LSM's portion of
 * @bprm->cred->security to be what commit_creds needs to install for the new
 * program.  This hook may also optionally check permissions (e.g. for
 * transitions between security domains).  The hook must set @bprm->secureexec
 * to 1 if AT_SECURE should be set to request libc enable secure mode.  @bprm
 * contains the linux_binprm structure.
 *
 * Return: Returns 0 if the hook is successful and permission is granted.
 */
int security_bprm_creds_for_exec(struct linux_binprm *bprm)
{
        int rc = call_int_hook(bprm_creds_for_exec, bprm);

        return rc;
}

/**
 * security_bprm_creds_from_file() - Update linux_binprm creds based on file
 * @bprm: binary program information
 * @file: associated file
 *
 * If @file is setpcap, suid, sgid or otherwise marked to change privilege upon
 * exec, update @bprm->cred to reflect that change. This is called after
 * finding the binary that will be executed without an interpreter.  This
 * ensures that the credentials will not be derived from a script that the
 * binary will need to reopen, which when reopened may end up being a
 * completely different file.  This hook may also optionally check permissions
 * (e.g. for transitions between security domains).  The hook must set
 * @bprm->secureexec to 1 if AT_SECURE should be set to request libc enable
 * secure mode.  The hook must add to @bprm->per_clear any personality flags
 * that should be cleared from current->personality.  @bprm contains the
 * linux_binprm structure.
 *
 * Return: Returns 0 if the hook is successful and permission is granted.
 */
int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file)
{
        int rc = call_int_hook(bprm_creds_from_file, bprm, file);

        return rc;
}

/**
 * security_bprm_check() - Mediate binary handler search
 * @bprm: binary program information
 *
 * Mediates the point at which a search for a binary handler will begin.  It
 * allows a check against the @bprm->cred->security value which was set in the
 * preceding creds_for_exec call.  The argv list and envp list are reliably
 * available in @bprm.  This hook may be called multiple times during a single
 * execve.  @bprm contains the linux_binprm structure.
 *
 * Return: Returns 0 if the hook is successful and permission is granted.
 */
int security_bprm_check(struct linux_binprm *bprm)
{
        int rc = call_int_hook(bprm_check_security, bprm);

        return rc;
}

/**
 * security_bprm_committing_creds() - Install creds for a process during exec()
 * @bprm: binary program information
 *
 * Prepare to install the new security attributes of a process being
 * transformed by an execve operation, based on the old credentials pointed to
 * by @current->cred and the information set in @bprm->cred by the
 * bprm_creds_for_exec hook.  @bprm points to the linux_binprm structure.  This
 * hook is a good place to perform state changes on the process such as closing
 * open file descriptors to which access will no longer be granted when the
 * attributes are changed.  This is called immediately before commit_creds().
 */
void security_bprm_committing_creds(const struct linux_binprm *bprm)
{
        call_void_hook(bprm_committing_creds, bprm);
}

/**
 * security_bprm_committed_creds() - Tidy up after cred install during exec()
 * @bprm: binary program information
 *
 * Tidy up after the installation of the new security attributes of a process
 * being transformed by an execve operation.  The new credentials have, by this
 * point, been set to @current->cred.  @bprm points to the linux_binprm
 * structure.  This hook is a good place to perform state changes on the
 * process such as clearing out non-inheritable signal state.  This is called
 * immediately after commit_creds().
 */
void security_bprm_committed_creds(const struct linux_binprm *bprm)
{
        call_void_hook(bprm_committed_creds, bprm);
}

/**
 * security_fs_context_submount() - Initialise fc->security
 * @fc: new filesystem context
 * @reference: reference superblock for submount/remount
 *
 * Fill out the ->security field for a new fs_context.
 *
 * Return: Returns 0 on success or negative error code on failure.
 */
int security_fs_context_submount(struct fs_context *fc, struct super_block *reference)
{
        int rc = call_int_hook(fs_context_submount, fc, reference);

        return rc;
}

/**
 * security_fs_context_dup() - Duplicate a fs_context LSM blob
 * @fc: destination filesystem context
 * @src_fc: source filesystem context
 *
 * Allocate and attach a security structure to @fc->security.  This pointer is
 * initialised to NULL by the caller.  @fc indicates the new filesystem context.
 * @src_fc indicates the original filesystem context.
 *
 * Return: Returns 0 on success or a negative error code on failure.
 */
int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
{
        int rc = call_int_hook(fs_context_dup, fc, src_fc);

        return rc;
}

/**
 * security_fs_context_parse_param() - Configure a filesystem context
 * @fc: filesystem context
 * @param: filesystem parameter
 *
 * Userspace provided a parameter to configure a superblock.  The LSM can
 * consume the parameter or return it to the caller for use elsewhere.
 *
 * Return: If the parameter is used by the LSM it should return 0, if it is
 *         returned to the caller -ENOPARAM is returned, otherwise a negative
 *         error code is returned.
 */
int security_fs_context_parse_param(struct fs_context *fc,
                                    struct fs_parameter *param)
{
        struct security_hook_list *hp;
        int result = -ENOPARAM;

        hlist_for_each_entry(hp, &security_hook_heads.fs_context_parse_param,
                             list) {
                int hook_rc = hp->hook.fs_context_parse_param(fc, param);

                /* this module did not take the parameter, try the next */
                if (hook_rc == -ENOPARAM)
                        continue;
                /* any other non-zero value ends the walk immediately */
                if (hook_rc)
                        return hook_rc;
                /* consumed; remaining modules still get to see it */
                result = 0;
        }

        return result;
}

/**
 * security_sb_alloc() - Allocate a super_block LSM blob
 * @sb: filesystem superblock
 *
 * Allocate and attach a security structure to the sb->s_security field.  The
 * s_security field is initialized to NULL when the structure is allocated.
 * @sb contains the super_block structure to be modified.
 *
 * Return: Returns 0 if operation was successful.
 */
int security_sb_alloc(struct super_block *sb)
{
        int err;

        err = lsm_superblock_alloc(sb);
        if (unlikely(err))
                return err;

        err = call_int_hook(sb_alloc_security, sb);
        if (likely(!err))
                return 0;

        /* a hook failed: release what has been set up so far */
        security_sb_free(sb);
        return err;
}

/**
 * security_sb_delete() - Release super_block LSM associated objects
 * @sb: filesystem superblock
 *
 * Release objects tied to a superblock (e.g. inodes).  @sb contains the
 * super_block structure being released.
 */
void security_sb_delete(struct super_block *sb)
{
        call_void_hook(sb_delete, sb);
}

/**
 * security_sb_free() - Free a super_block LSM blob
 * @sb: filesystem superblock
 *
 * Deallocate and clear the sb->s_security field.  @sb contains the super_block
 * structure to be modified.
 */
void security_sb_free(struct super_block *sb)
{
        call_void_hook(sb_free_security, sb);
        kfree(sb->s_security);
        sb->s_security = NULL;
}

/**
 * security_free_mnt_opts() - Free memory associated with mount options
 * @mnt_opts: LSM processed mount options
 *
 * Free memory associated with @mnt_opts.  Nothing is done when *@mnt_opts is
 * already NULL; otherwise it is set to NULL once the hooks have run.
 */
void security_free_mnt_opts(void **mnt_opts)
{
        if (*mnt_opts) {
                call_void_hook(sb_free_mnt_opts, *mnt_opts);
                *mnt_opts = NULL;
        }
}
EXPORT_SYMBOL(security_free_mnt_opts);

/**
 * security_sb_eat_lsm_opts() - Consume LSM mount options
 * @options: mount options
 * @mnt_opts: LSM processed mount options
 *
 * Eat (scan @options) and save them in @mnt_opts.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_sb_eat_lsm_opts(char *options, void **mnt_opts)
{
        int rc = call_int_hook(sb_eat_lsm_opts, options, mnt_opts);

        return rc;
}
EXPORT_SYMBOL(security_sb_eat_lsm_opts);

/**
 * security_sb_mnt_opts_compat() - Check if new mount options are allowed
 * @sb: filesystem superblock
 * @mnt_opts: new mount options
 *
 * Determine if the new mount options in @mnt_opts are allowed given the
 * existing mounted filesystem at @sb.  @sb is the superblock being compared.
 *
 * Return: Returns 0 if options are compatible.
 */
int security_sb_mnt_opts_compat(struct super_block *sb,
                                void *mnt_opts)
{
        int rc = call_int_hook(sb_mnt_opts_compat, sb, mnt_opts);

        return rc;
}
EXPORT_SYMBOL(security_sb_mnt_opts_compat);

/**
 * security_sb_remount() - Verify no incompatible mount changes during remount
 * @sb: filesystem superblock
 * @mnt_opts: (re)mount options
 *
 * Extracts security system specific mount options and verifies no changes are
 * being made to those options.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_remount(struct super_block *sb,
                        void *mnt_opts)
{
        int rc = call_int_hook(sb_remount, sb, mnt_opts);

        return rc;
}
EXPORT_SYMBOL(security_sb_remount);

/**
 * security_sb_kern_mount() - Check if a kernel mount is allowed
 * @sb: filesystem superblock
 *
 * Mount this @sb if allowed by permissions.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_kern_mount(const struct super_block *sb)
{
        int rc = call_int_hook(sb_kern_mount, sb);

        return rc;
}

/**
 * security_sb_show_options() - Output the mount options for a superblock
 * @m: output file
 * @sb: filesystem superblock
 *
 * Show (print on @m) mount options for this @sb.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_sb_show_options(struct seq_file *m, struct super_block *sb)
{
        int rc = call_int_hook(sb_show_options, m, sb);

        return rc;
}

/**
 * security_sb_statfs() - Check if accessing fs stats is allowed
 * @dentry: superblock handle
 *
 * Check permission before obtaining filesystem statistics.  @dentry is a
 * handle on the superblock for the filesystem.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_statfs(struct dentry *dentry)
{
        int rc = call_int_hook(sb_statfs, dentry);

        return rc;
}

/**
 * security_sb_mount() - Check permission for mounting a filesystem
 * @dev_name: filesystem backing device
 * @path: mount point
 * @type: filesystem type
 * @flags: mount flags
 * @data: filesystem specific data
 *
 * Check permission before an object specified by @dev_name is mounted on the
 * mount point named by @path.  For an ordinary mount, @dev_name identifies a
 * device if the file system type requires a device.  For a remount
 * (@flags & MS_REMOUNT), @dev_name is irrelevant.  For a loopback/bind mount
 * (@flags & MS_BIND), @dev_name identifies the pathname of the object being
 * mounted.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_mount(const char *dev_name, const struct path *path,
                      const char *type, unsigned long flags, void *data)
{
        int rc;

        rc = call_int_hook(sb_mount, dev_name, path, type, flags, data);
        return rc;
}

/**
 * security_sb_umount() - Check permission for unmounting a filesystem
 * @mnt: mounted filesystem
 * @flags: unmount flags
 *
 * Check permission before the @mnt file system is unmounted.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_umount(struct vfsmount *mnt, int flags)
{
        int rc = call_int_hook(sb_umount, mnt, flags);

        return rc;
}

/**
 * security_sb_pivotroot() - Check permissions for pivoting the rootfs
 * @old_path: new location for current rootfs
 * @new_path: location of the new rootfs
 *
 * Check permission before pivoting the root filesystem.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_pivotroot(const struct path *old_path,
                          const struct path *new_path)
{
        int rc = call_int_hook(sb_pivotroot, old_path, new_path);

        return rc;
}

/**
 * security_sb_set_mnt_opts() - Set the mount options for a filesystem
 * @sb: filesystem superblock
 * @mnt_opts: binary mount options
 * @kern_flags: kernel flags (in)
 * @set_kern_flags: kernel flags (out)
 *
 * Set the security relevant mount options used for a superblock.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sb_set_mnt_opts(struct super_block *sb,
                             void *mnt_opts,
                             unsigned long kern_flags,
                             unsigned long *set_kern_flags)
{
        struct security_hook_list *hp;
        int rc;

        /*
         * Value returned when no callback is registered: options were
         * supplied but nothing can handle them, or there is nothing to do.
         */
        if (mnt_opts)
                rc = -EOPNOTSUPP;
        else
                rc = LSM_RET_DEFAULT(sb_set_mnt_opts);

        hlist_for_each_entry(hp, &security_hook_heads.sb_set_mnt_opts,
                             list) {
                rc = hp->hook.sb_set_mnt_opts(sb, mnt_opts, kern_flags,
                                              set_kern_flags);
                /* the first non-default answer is final */
                if (rc != LSM_RET_DEFAULT(sb_set_mnt_opts))
                        return rc;
        }

        return rc;
}
EXPORT_SYMBOL(security_sb_set_mnt_opts);

/**
 * security_sb_clone_mnt_opts() - Duplicate superblock mount options
 * @oldsb: source superblock
 * @newsb: destination superblock
 * @kern_flags: kernel flags (in)
 * @set_kern_flags: kernel flags (out)
 *
 * Copy all security options from a given superblock to another.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sb_clone_mnt_opts(const struct super_block *oldsb,
                               struct super_block *newsb,
                               unsigned long kern_flags,
                               unsigned long *set_kern_flags)
{
        int rc;

        rc = call_int_hook(sb_clone_mnt_opts, oldsb, newsb,
                           kern_flags, set_kern_flags);
        return rc;
}
EXPORT_SYMBOL(security_sb_clone_mnt_opts);

/**
 * security_move_mount() - Check permissions for moving a mount
 * @from_path: source mount point
 * @to_path: destination mount point
 *
 * Check permission before a mount is moved.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_move_mount(const struct path *from_path,
                        const struct path *to_path)
{
        int rc = call_int_hook(move_mount, from_path, to_path);

        return rc;
}

/**
 * security_path_notify() - Check if setting a watch is allowed
 * @path: file path
 * @mask: event mask
 * @obj_type: file path type
 *
 * Check permissions before a watch is set on the events defined by @mask, on
 * the object at @path, whose type is defined by @obj_type.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_notify(const struct path *path, u64 mask,
                         unsigned int obj_type)
{
        int rc = call_int_hook(path_notify, path, mask, obj_type);

        return rc;
}

/**
 * security_inode_alloc() - Allocate an inode LSM blob
 * @inode: the inode
 *
 * Allocate and attach a security structure to @inode->i_security.  The
 * i_security field is initialized to NULL when the inode structure is
 * allocated.
 *
 * Return: Return 0 if operation was successful.
 */
int security_inode_alloc(struct inode *inode)
{
        int err;

        err = lsm_inode_alloc(inode);
        if (unlikely(err))
                return err;

        err = call_int_hook(inode_alloc_security, inode);
        if (likely(!err))
                return 0;

        /* a hook failed: release what has been set up so far */
        security_inode_free(inode);
        return err;
}

/* RCU callback that returns an inode LSM blob to lsm_inode_cache. */
static void inode_free_by_rcu(struct rcu_head *head)
{
        /*
         * The rcu head is at the start of the inode blob, so its address
         * is also the address of the object to hand back to the cache.
         */
        void *blob = head;

        kmem_cache_free(lsm_inode_cache, blob);
}

/**
 * security_inode_free() - Free an inode's LSM blob
 * @inode: the inode
 *
 * Run the inode_free_security hooks and schedule the inode security structure
 * for release after an RCU grace period.  @inode->i_security itself is left
 * pointing at the blob (see the comment in the function body).
 */
void security_inode_free(struct inode *inode)
{
        struct rcu_head *blob_head;

        call_void_hook(inode_free_security, inode);

        /*
         * The inode may still be referenced in a path walk and
         * a call to security_inode_permission() can be made
         * after inode_free_security() is called. Ideally, the VFS
         * wouldn't do this, but fixing that is a much harder
         * job. For now, simply free the i_security via RCU, and
         * leave the current inode->i_security pointer intact.
         * The inode will be freed after the RCU grace period too.
         */
        blob_head = (struct rcu_head *)inode->i_security;
        if (!blob_head)
                return;

        call_rcu(blob_head, inode_free_by_rcu);
}

/**
 * security_dentry_init_security() - Perform dentry initialization
 * @dentry: the dentry to initialize
 * @mode: mode used to determine resource type
 * @name: name of the last path component
 * @xattr_name: name of the security/LSM xattr
 * @ctx: pointer to the resulting LSM context
 * @ctxlen: length of @ctx
 *
 * Compute a context for a dentry as the inode is not yet available since NFSv4
 * has no label backed by an EA anyway.  Note that @xattr_name is a static
 * string and does not need to be free'd by the caller.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_dentry_init_security(struct dentry *dentry, int mode,
                                  const struct qstr *name,
                                  const char **xattr_name, void **ctx,
                                  u32 *ctxlen)
{
        int rc;

        rc = call_int_hook(dentry_init_security, dentry, mode, name,
                           xattr_name, ctx, ctxlen);
        return rc;
}
EXPORT_SYMBOL(security_dentry_init_security);

/**
 * security_dentry_create_files_as() - Perform dentry initialization
 * @dentry: the dentry to initialize
 * @mode: mode used to determine resource type
 * @name: name of the last path component
 * @old: creds to use for LSM context calculations
 * @new: creds to modify
 *
 * Compute a context for a dentry as the inode is not yet available and set
 * that context in the passed in creds so that new files are created using
 * that context.  The context is calculated using the passed in creds and not
 * the creds of the caller.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_dentry_create_files_as(struct dentry *dentry, int mode,
                                    struct qstr *name,
                                    const struct cred *old, struct cred *new)
{
        int rc;

        rc = call_int_hook(dentry_create_files_as, dentry, mode,
                           name, old, new);
        return rc;
}
EXPORT_SYMBOL(security_dentry_create_files_as);

/**
 * security_inode_init_security() - Initialize an inode's LSM context
 * @inode: the inode
 * @dir: parent directory
 * @qstr: last component of the pathname
 * @initxattrs: callback function to write xattrs
 * @fs_data: filesystem specific data
 *
 * Obtain the security attribute name suffix and value to set on a newly
 * created inode and set up the incore security field for the new inode.  This
 * hook is called by the fs code as part of the inode creation transaction and
 * provides for atomic labeling of the inode, unlike the post_create/mkdir/...
 * hooks called by the VFS.
 *
 * The hook function is expected to populate the xattrs array, by calling
 * lsm_get_xattr_slot() to retrieve the slots reserved by the security module
 * with the lbs_xattr_count field of the lsm_blob_sizes structure.  For each
 * slot, the hook function should set ->name to the attribute name suffix
 * (e.g. selinux), to allocate ->value (will be freed by the caller) and set it
 * to the attribute value, to set ->value_len to the length of the value.  If
 * the security module does not use security attributes or does not wish to put
 * a security attribute on this particular inode, then it should return
 * -EOPNOTSUPP to skip this processing.
 *
 * Return: Returns 0 if the LSM successfully initialized all of the inode
 *         security attributes that are required, negative values otherwise.
 */
int security_inode_init_security(struct inode *inode, struct inode *dir,
                                 const struct qstr *qstr,
                                 const initxattrs initxattrs, void *fs_data)
{
        struct security_hook_list *hp;
        struct xattr *xattrs = NULL;
        int nr_xattrs = 0;
        int rc = -EOPNOTSUPP;

        /* private inodes and "no module reserved a slot" need no work */
        if (unlikely(IS_PRIVATE(inode)) || !blob_sizes.lbs_xattr_count)
                return 0;

        if (initxattrs) {
                /* Allocate +1 as terminator. */
                xattrs = kcalloc(blob_sizes.lbs_xattr_count + 1,
                                 sizeof(*xattrs), GFP_NOFS);
                if (!xattrs)
                        return -ENOMEM;
        }

        hlist_for_each_entry(hp, &security_hook_heads.inode_init_security,
                             list) {
                rc = hp->hook.inode_init_security(inode, dir, qstr, xattrs,
                                                  &nr_xattrs);
                /*
                 * As documented in lsm_hooks.h, -EOPNOTSUPP in this context
                 * means that the LSM is not willing to provide an xattr, not
                 * that it wants to signal an error. Thus, continue to invoke
                 * the remaining LSMs; only other non-zero values abort.
                 */
                if (rc != 0 && rc != -EOPNOTSUPP)
                        goto free_xattrs;
        }

        /* If initxattrs() is NULL, nr_xattrs is zero, skip the call. */
        if (nr_xattrs)
                rc = initxattrs(inode, xattrs, fs_data);

free_xattrs:
        /* release the values of the filled slots, last one first */
        while (nr_xattrs > 0) {
                nr_xattrs--;
                kfree(xattrs[nr_xattrs].value);
        }
        kfree(xattrs);

        if (rc == -EOPNOTSUPP)
                return 0;
        return rc;
}
EXPORT_SYMBOL(security_inode_init_security);

/**
 * security_inode_init_security_anon() - Initialize an anonymous inode
 * @inode: the inode
 * @name: the anonymous inode class
 * @context_inode: an optional related inode
 *
 * Set up the incore security field for the new anonymous inode and return
 * whether the inode creation is permitted by the security module or not.
 *
 * Return: Returns 0 on success, -EACCES if the security module denies the
 * creation of this inode, or another -errno upon other errors.
 */
int security_inode_init_security_anon(struct inode *inode,
                                      const struct qstr *name,
                                      const struct inode *context_inode)
{
        int rc;

        /* No IS_PRIVATE() short-circuit here: the hook call is unconditional. */
        rc = call_int_hook(inode_init_security_anon, inode, name,
                           context_inode);
        return rc;
}

#ifdef CONFIG_SECURITY_PATH
/**
 * security_path_mknod() - Check if creating a special file is allowed
 * @dir: parent directory
 * @dentry: new file
 * @mode: new file mode
 * @dev: device number
 *
 * Check permissions when creating a file. Note that this hook is called even
 * if mknod operation is being done for a regular file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_mknod(const struct path *dir, struct dentry *dentry,
                        umode_t mode, unsigned int dev)
{
        struct inode *parent = d_backing_inode(dir->dentry);

        /* A parent flagged IS_PRIVATE() is allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(parent)))
                return 0;

        return call_int_hook(path_mknod, dir, dentry, mode, dev);
}
EXPORT_SYMBOL(security_path_mknod);

/**
 * security_path_post_mknod() - Update inode security after reg file creation
 * @idmap: idmap of the mount
 * @dentry: new file
 *
 * Update inode security field after a regular file has been created.
 */
void security_path_post_mknod(struct mnt_idmap *idmap, struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged IS_PRIVATE() never reach the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return;

        call_void_hook(path_post_mknod, idmap, dentry);
}

/**
 * security_path_mkdir() - Check if creating a new directory is allowed
 * @dir: parent directory
 * @dentry: new directory
 * @mode: new directory mode
 *
 * Check permissions to create a new directory in the existing directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_mkdir(const struct path *dir, struct dentry *dentry,
                        umode_t mode)
{
        struct inode *parent = d_backing_inode(dir->dentry);

        /* A parent flagged IS_PRIVATE() is allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(parent)))
                return 0;

        return call_int_hook(path_mkdir, dir, dentry, mode);
}
EXPORT_SYMBOL(security_path_mkdir);

/**
 * security_path_rmdir() - Check if removing a directory is allowed
 * @dir: parent directory
 * @dentry: directory to remove
 *
 * Check the permission to remove a directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_rmdir(const struct path *dir, struct dentry *dentry)
{
        struct inode *parent = d_backing_inode(dir->dentry);

        /* The check is made on the parent directory, not on @dentry. */
        if (unlikely(IS_PRIVATE(parent)))
                return 0;

        return call_int_hook(path_rmdir, dir, dentry);
}

/**
 * security_path_unlink() - Check if removing a hard link is allowed
 * @dir: parent directory
 * @dentry: file
 *
 * Check the permission to remove a hard link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_unlink(const struct path *dir, struct dentry *dentry)
{
        struct inode *parent = d_backing_inode(dir->dentry);

        /* The check is made on the parent directory, not on @dentry. */
        if (unlikely(IS_PRIVATE(parent)))
                return 0;

        return call_int_hook(path_unlink, dir, dentry);
}
EXPORT_SYMBOL(security_path_unlink);

/**
 * security_path_symlink() - Check if creating a symbolic link is allowed
 * @dir: parent directory
 * @dentry: symbolic link
 * @old_name: file pathname
 *
 * Check the permission to create a symbolic link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_symlink(const struct path *dir, struct dentry *dentry,
                          const char *old_name)
{
        struct inode *parent = d_backing_inode(dir->dentry);

        /* A parent flagged IS_PRIVATE() is allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(parent)))
                return 0;

        return call_int_hook(path_symlink, dir, dentry, old_name);
}

/**
 * security_path_link() - Check if creating a hard link is allowed
 * @old_dentry: existing file
 * @new_dir: new parent directory
 * @new_dentry: new link
 *
 * Check permission before creating a new hard link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_link(struct dentry *old_dentry, const struct path *new_dir,
                       struct dentry *new_dentry)
{
        struct inode *target = d_backing_inode(old_dentry);

        /* The IS_PRIVATE() test applies to the existing file being linked. */
        if (unlikely(IS_PRIVATE(target)))
                return 0;

        return call_int_hook(path_link, old_dentry, new_dir, new_dentry);
}

/**
 * security_path_rename() - Check if renaming a file is allowed
 * @old_dir: parent directory of the old file
 * @old_dentry: the old file
 * @new_dir: parent directory of the new file
 * @new_dentry: the new file
 * @flags: flags
 *
 * Check for permission to rename a file or directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_rename(const struct path *old_dir, struct dentry *old_dentry,
                         const struct path *new_dir, struct dentry *new_dentry,
                         unsigned int flags)
{
        /* Skip the hook when the source inode is flagged IS_PRIVATE()... */
        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                return 0;

        /* ...or when the target already exists and its inode is. */
        if (unlikely(d_is_positive(new_dentry) &&
                     IS_PRIVATE(d_backing_inode(new_dentry))))
                return 0;

        return call_int_hook(path_rename, old_dir, old_dentry, new_dir,
                             new_dentry, flags);
}
EXPORT_SYMBOL(security_path_rename);

/**
 * security_path_truncate() - Check if truncating a file is allowed
 * @path: file
 *
 * Check permission before truncating the file indicated by path.  Note that
 * truncation permissions may also be checked based on already opened files,
 * using the security_file_truncate() hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_truncate(const struct path *path)
{
        struct inode *inode = d_backing_inode(path->dentry);

        /* Inodes flagged IS_PRIVATE() are allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        return call_int_hook(path_truncate, path);
}

/**
 * security_path_chmod() - Check if changing the file's mode is allowed
 * @path: file
 * @mode: new mode
 *
 * Check for permission to change a mode of the file @path. The new mode is
 * specified in @mode which is a bitmask of constants from
 * <include/uapi/linux/stat.h>.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_chmod(const struct path *path, umode_t mode)
{
        struct inode *inode = d_backing_inode(path->dentry);

        /* Inodes flagged IS_PRIVATE() are allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        return call_int_hook(path_chmod, path, mode);
}

/**
 * security_path_chown() - Check if changing the file's owner/group is allowed
 * @path: file
 * @uid: file owner
 * @gid: file group
 *
 * Check for permission to change owner/group of a file or directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
{
        struct inode *inode = d_backing_inode(path->dentry);

        /* Inodes flagged IS_PRIVATE() are allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        return call_int_hook(path_chown, path, uid, gid);
}

/**
 * security_path_chroot() - Check if changing the root directory is allowed
 * @path: directory
 *
 * Check for permission to change root directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_chroot(const struct path *path)
{
        int rc;

        /* Unlike the other path hooks here, there is no IS_PRIVATE() bypass. */
        rc = call_int_hook(path_chroot, path);
        return rc;
}
#endif /* CONFIG_SECURITY_PATH */

/**
 * security_inode_create() - Check if creating a file is allowed
 * @dir: the parent directory
 * @dentry: the file being created
 * @mode: requested file mode
 *
 * Check permission to create a regular file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_create(struct inode *dir, struct dentry *dentry,
                          umode_t mode)
{
        int rc;

        /* A parent flagged IS_PRIVATE() is allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(dir)))
                return 0;

        rc = call_int_hook(inode_create, dir, dentry, mode);
        return rc;
}
EXPORT_SYMBOL_GPL(security_inode_create);

/**
 * security_inode_post_create_tmpfile() - Update inode security of new tmpfile
 * @idmap: idmap of the mount
 * @inode: inode of the new tmpfile
 *
 * Update inode security data after a tmpfile has been created.
 */
void security_inode_post_create_tmpfile(struct mnt_idmap *idmap,
                                        struct inode *inode)
{
        if (unlikely(IS_PRIVATE(inode)))
                return;
        call_void_hook(inode_post_create_tmpfile, idmap, inode);
}

/**
 * security_inode_link() - Check if creating a hard link is allowed
 * @old_dentry: existing file
 * @dir: new parent directory
 * @new_dentry: new link
 *
 * Check permission before creating a new hard link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_link(struct dentry *old_dentry, struct inode *dir,
                        struct dentry *new_dentry)
{
        struct inode *target = d_backing_inode(old_dentry);

        /* The IS_PRIVATE() test applies to the existing file being linked. */
        if (unlikely(IS_PRIVATE(target)))
                return 0;

        return call_int_hook(inode_link, old_dentry, dir, new_dentry);
}

/**
 * security_inode_unlink() - Check if removing a hard link is allowed
 * @dir: parent directory
 * @dentry: file
 *
 * Check the permission to remove a hard link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_unlink(struct inode *dir, struct dentry *dentry)
{
        struct inode *victim = d_backing_inode(dentry);

        /* The IS_PRIVATE() test applies to the file itself, not to @dir. */
        if (unlikely(IS_PRIVATE(victim)))
                return 0;

        return call_int_hook(inode_unlink, dir, dentry);
}

/**
 * security_inode_symlink() - Check if creating a symbolic link is allowed
 * @dir: parent directory
 * @dentry: symbolic link
 * @old_name: existing filename
 *
 * Check the permission to create a symbolic link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_symlink(struct inode *dir, struct dentry *dentry,
                           const char *old_name)
{
        int rc;

        /* A parent flagged IS_PRIVATE() is allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(dir)))
                return 0;

        rc = call_int_hook(inode_symlink, dir, dentry, old_name);
        return rc;
}

/**
 * security_inode_mkdir() - Check if creating a new directory is allowed
 * @dir: parent directory
 * @dentry: new directory
 * @mode: new directory mode
 *
 * Check permissions to create a new directory in the existing directory
 * associated with inode structure @dir.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        int rc;

        /* A parent flagged IS_PRIVATE() is allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(dir)))
                return 0;

        rc = call_int_hook(inode_mkdir, dir, dentry, mode);
        return rc;
}
EXPORT_SYMBOL_GPL(security_inode_mkdir);

/**
 * security_inode_rmdir() - Check if removing a directory is allowed
 * @dir: parent directory
 * @dentry: directory to be removed
 *
 * Check the permission to remove a directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_rmdir(struct inode *dir, struct dentry *dentry)
{
        struct inode *victim = d_backing_inode(dentry);

        /* The IS_PRIVATE() test applies to the directory being removed. */
        if (unlikely(IS_PRIVATE(victim)))
                return 0;

        return call_int_hook(inode_rmdir, dir, dentry);
}

/**
 * security_inode_mknod() - Check if creating a special file is allowed
 * @dir: parent directory
 * @dentry: new file
 * @mode: new file mode
 * @dev: device number
 *
 * Check permissions when creating a special file (or a socket or a fifo file
 * created via the mknod system call).  Note that if mknod operation is being
 * done for a regular file, then the create hook will be called and not this
 * hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_mknod(struct inode *dir, struct dentry *dentry,
                         umode_t mode, dev_t dev)
{
        int rc;

        /* A parent flagged IS_PRIVATE() is allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(dir)))
                return 0;

        rc = call_int_hook(inode_mknod, dir, dentry, mode, dev);
        return rc;
}

/**
 * security_inode_rename() - Check if renaming a file is allowed
 * @old_dir: parent directory of the old file
 * @old_dentry: the old file
 * @new_dir: parent directory of the new file
 * @new_dentry: the new file
 * @flags: flags
 *
 * Check for permission to rename a file or directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
                          struct inode *new_dir, struct dentry *new_dentry,
                          unsigned int flags)
{
        int rc;

        /* Skip the hook when the source inode is flagged IS_PRIVATE()... */
        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                return 0;

        /* ...or when the target already exists and its inode is. */
        if (unlikely(d_is_positive(new_dentry) &&
                     IS_PRIVATE(d_backing_inode(new_dentry))))
                return 0;

        /*
         * For RENAME_EXCHANGE, run the hook first with the new/old roles
         * swapped; a non-zero result is returned as-is.
         */
        if (flags & RENAME_EXCHANGE) {
                rc = call_int_hook(inode_rename, new_dir, new_dentry,
                                   old_dir, old_dentry);
                if (rc)
                        return rc;
        }

        /* Note that @flags itself is not forwarded to the hook. */
        rc = call_int_hook(inode_rename, old_dir, old_dentry,
                           new_dir, new_dentry);
        return rc;
}

/**
 * security_inode_readlink() - Check if reading a symbolic link is allowed
 * @dentry: link
 *
 * Check the permission to read the symbolic link.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_readlink(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged IS_PRIVATE() are allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        return call_int_hook(inode_readlink, dentry);
}

/**
 * security_inode_follow_link() - Check if following a symbolic link is allowed
 * @dentry: link dentry
 * @inode: link inode
 * @rcu: true if in RCU-walk mode
 *
 * Check permission to follow a symbolic link when looking up a pathname.  If
 * @rcu is true, @inode is not stable.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
                               bool rcu)
{
        int rc;

        /* The IS_PRIVATE() test uses the @inode argument, not @dentry. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        rc = call_int_hook(inode_follow_link, dentry, inode, rcu);
        return rc;
}

/**
 * security_inode_permission() - Check if accessing an inode is allowed
 * @inode: inode
 * @mask: access mask
 *
 * Check permission before accessing an inode.  This hook is called by the
 * existing Linux permission function, so a security module can use it to
 * provide additional checking for existing Linux permission checks.  Notice
 * that this hook is called when a file is opened (as well as many other
 * operations), whereas the file_security_ops permission hook is called when
 * the actual read/write operations are performed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_permission(struct inode *inode, int mask)
{
        int rc;

        /* Inodes flagged IS_PRIVATE() are allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        rc = call_int_hook(inode_permission, inode, mask);
        return rc;
}

/**
 * security_inode_setattr() - Check if setting file attributes is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @attr: new attributes
 *
 * Check permission before setting file attributes.  Note that the kernel call
 * to notify_change is performed from several locations, whenever file
 * attributes change (such as when a file is truncated, chown/chmod operations,
 * transferring disk quotas, etc).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_setattr(struct mnt_idmap *idmap,
                           struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged IS_PRIVATE() are allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        return call_int_hook(inode_setattr, idmap, dentry, attr);
}
EXPORT_SYMBOL_GPL(security_inode_setattr);

/**
 * security_inode_post_setattr() - Update the inode after a setattr operation
 * @idmap: idmap of the mount
 * @dentry: file
 * @ia_valid: file attributes set
 *
 * Update inode security field after successful setting file attributes.
 */
void security_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                                 int ia_valid)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged IS_PRIVATE() never reach the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return;

        call_void_hook(inode_post_setattr, idmap, dentry, ia_valid);
}

/**
 * security_inode_getattr() - Check if getting file attributes is allowed
 * @path: file
 *
 * Check permission before obtaining file attributes.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_getattr(const struct path *path)
{
        struct inode *inode = d_backing_inode(path->dentry);

        /* Inodes flagged IS_PRIVATE() are allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        return call_int_hook(inode_getattr, path);
}

/**
 * security_inode_setxattr() - Check if setting file xattrs is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @name: xattr name
 * @value: xattr value
 * @size: size of xattr value
 * @flags: flags
 *
 * Check permission before setting the extended attributes.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_setxattr(struct mnt_idmap *idmap,
                            struct dentry *dentry, const char *name,
                            const void *value, size_t size, int flags)
{
        struct inode *inode = d_backing_inode(dentry);
        int rc;

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        /*
         * SELinux and Smack integrate the cap call,
         * so assume that all LSMs supplying this call do so.
         */
        rc = call_int_hook(inode_setxattr, idmap, dentry, name, value, size,
                           flags);

        /* Any hook result other than 1 is final. */
        if (rc != 1)
                return rc;

        /* A result of exactly 1 defers the decision to the capability check. */
        return cap_inode_setxattr(dentry, name, value, size, flags);
}

/**
 * security_inode_set_acl() - Check if setting posix acls is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 * @kacl: acl struct
 *
 * Check permission before setting posix acls, the posix acls in @kacl are
 * identified by @acl_name.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_set_acl(struct mnt_idmap *idmap,
                           struct dentry *dentry, const char *acl_name,
                           struct posix_acl *kacl)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged IS_PRIVATE() are allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        return call_int_hook(inode_set_acl, idmap, dentry, acl_name, kacl);
}

/**
 * security_inode_post_set_acl() - Update inode security from posix acls set
 * @dentry: file
 * @acl_name: acl name
 * @kacl: acl struct
 *
 * Update inode security data after successfully setting posix acls on @dentry.
 * The posix acls in @kacl are identified by @acl_name.
 */
void security_inode_post_set_acl(struct dentry *dentry, const char *acl_name,
                                 struct posix_acl *kacl)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged IS_PRIVATE() never reach the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return;

        call_void_hook(inode_post_set_acl, dentry, acl_name, kacl);
}

/**
 * security_inode_get_acl() - Check if reading posix acls is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 *
 * Check permission before getting posix acls, the posix acls are identified by
 * @acl_name.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_get_acl(struct mnt_idmap *idmap,
                           struct dentry *dentry, const char *acl_name)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged IS_PRIVATE() are allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        return call_int_hook(inode_get_acl, idmap, dentry, acl_name);
}

/**
 * security_inode_remove_acl() - Check if removing a posix acl is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 *
 * Check permission before removing posix acls, the posix acls are identified
 * by @acl_name.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_remove_acl(struct mnt_idmap *idmap,
                              struct dentry *dentry, const char *acl_name)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged IS_PRIVATE() are allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        return call_int_hook(inode_remove_acl, idmap, dentry, acl_name);
}

/**
 * security_inode_post_remove_acl() - Update inode security after rm posix acls
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 *
 * Update inode security data after successfully removing posix acls on
 * @dentry in @idmap. The posix acls are identified by @acl_name.
 */
void security_inode_post_remove_acl(struct mnt_idmap *idmap,
                                    struct dentry *dentry, const char *acl_name)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged IS_PRIVATE() never reach the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return;

        call_void_hook(inode_post_remove_acl, idmap, dentry, acl_name);
}

/**
 * security_inode_post_setxattr() - Update the inode after a setxattr operation
 * @dentry: file
 * @name: xattr name
 * @value: xattr value
 * @size: xattr value size
 * @flags: flags
 *
 * Update inode security field after successful setxattr operation.
 */
void security_inode_post_setxattr(struct dentry *dentry, const char *name,
                                  const void *value, size_t size, int flags)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged IS_PRIVATE() never reach the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return;

        call_void_hook(inode_post_setxattr, dentry, name, value, size, flags);
}

/**
 * security_inode_getxattr() - Check if xattr access is allowed
 * @dentry: file
 * @name: xattr name
 *
 * Check permission before obtaining the extended attributes identified by
 * @name for @dentry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_getxattr(struct dentry *dentry, const char *name)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged IS_PRIVATE() are allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        return call_int_hook(inode_getxattr, dentry, name);
}

/**
 * security_inode_listxattr() - Check if listing xattrs is allowed
 * @dentry: file
 *
 * Check permission before obtaining the list of extended attribute names for
 * @dentry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_listxattr(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged IS_PRIVATE() are allowed without calling the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        return call_int_hook(inode_listxattr, dentry);
}

/**
 * security_inode_removexattr() - Check if removing an xattr is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @name: xattr name
 *
 * Check permission before removing the extended attribute identified by @name
 * for @dentry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_removexattr(struct mnt_idmap *idmap,
                               struct dentry *dentry, const char *name)
{
        struct inode *inode = d_backing_inode(dentry);
        int rc;

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        /*
         * SELinux and Smack integrate the cap call,
         * so assume that all LSMs supplying this call do so.
         */
        rc = call_int_hook(inode_removexattr, idmap, dentry, name);

        /* Any hook result other than 1 is final. */
        if (rc != 1)
                return rc;

        /* A result of exactly 1 defers the decision to the capability check. */
        return cap_inode_removexattr(idmap, dentry, name);
}

/**
 * security_inode_post_removexattr() - Update the inode after a removexattr op
 * @dentry: file
 * @name: xattr name
 *
 * Update the inode after a successful removexattr operation.
 */
void security_inode_post_removexattr(struct dentry *dentry, const char *name)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged IS_PRIVATE() never reach the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return;

        call_void_hook(inode_post_removexattr, dentry, name);
}

/**
 * security_inode_need_killpriv() - Check if security_inode_killpriv() required
 * @dentry: associated dentry
 *
 * Called when an inode has been changed to determine if
 * security_inode_killpriv() should be called.
 *
 * Return: Return <0 on error to abort the inode change operation, return 0 if
 *         security_inode_killpriv() does not need to be called, return >0 if
 *         security_inode_killpriv() does need to be called.
 */
int security_inode_need_killpriv(struct dentry *dentry)
{
        int rc;

        /* The hook result (<0, 0 or >0) is passed through unmodified. */
        rc = call_int_hook(inode_need_killpriv, dentry);
        return rc;
}

/**
 * security_inode_killpriv() - The setuid bit is removed, update LSM state
 * @idmap: idmap of the mount
 * @dentry: associated dentry
 *
 * The @dentry's setuid bit is being removed.  Remove similar security labels.
 * Called with the dentry->d_inode->i_mutex held.
 *
 * Return: Return 0 on success.  If error is returned, then the operation
 *         causing setuid bit removal is failed.
 */
int security_inode_killpriv(struct mnt_idmap *idmap,
                            struct dentry *dentry)
{
        int rc;

        /* No IS_PRIVATE() bypass: the hook call is unconditional. */
        rc = call_int_hook(inode_killpriv, idmap, dentry);
        return rc;
}

/**
 * security_inode_getsecurity() - Get the xattr security label of an inode
 * @idmap: idmap of the mount
 * @inode: inode
 * @name: xattr name
 * @buffer: security label buffer
 * @alloc: allocation flag
 *
 * Retrieve a copy of the extended attribute representation of the security
 * label associated with @name for @inode via @buffer.  Note that @name is the
 * remainder of the attribute name after the security prefix has been removed.
 * @alloc is used to specify if the call should return a value via the buffer
 * or just the value length.
 *
 * Return: Returns size of buffer on success.
 */
int security_inode_getsecurity(struct mnt_idmap *idmap,
                               struct inode *inode, const char *name,
                               void **buffer, bool alloc)
{
        int rc;

        /*
         * For inodes flagged IS_PRIVATE() the hook's default return value is
         * reported instead of 0.
         */
        if (unlikely(IS_PRIVATE(inode)))
                return LSM_RET_DEFAULT(inode_getsecurity);

        rc = call_int_hook(inode_getsecurity, idmap, inode, name, buffer,
                           alloc);
        return rc;
}

/**
 * security_inode_setsecurity() - Set the xattr security label of an inode
 * @inode: inode
 * @name: xattr name
 * @value: security label
 * @size: length of security label
 * @flags: flags
 *
 * Set the security label associated with @name for @inode from the extended
 * attribute value @value.  @size indicates the size of the @value in bytes.
 * @flags may be XATTR_CREATE, XATTR_REPLACE, or 0. Note that @name is the
 * remainder of the attribute name after the security. prefix has been removed.
 *
 * Return: Returns 0 on success.
 */
int security_inode_setsecurity(struct inode *inode, const char *name,
                               const void *value, size_t size, int flags)
{
        int rc;

        /*
         * For inodes flagged IS_PRIVATE() the hook's default return value is
         * reported instead of 0.
         */
        if (unlikely(IS_PRIVATE(inode)))
                return LSM_RET_DEFAULT(inode_setsecurity);

        rc = call_int_hook(inode_setsecurity, inode, name, value, size,
                           flags);
        return rc;
}

/**
 * security_inode_listsecurity() - List the xattr security label names
 * @inode: inode
 * @buffer: buffer
 * @buffer_size: size of buffer
 *
 * Copy the extended attribute names for the security labels associated with
 * @inode into @buffer.  The maximum size of @buffer is specified by
 * @buffer_size.  @buffer may be NULL to request the size of the buffer
 * required.
 *
 * Return: Returns number of bytes used/required on success.
 */
int security_inode_listsecurity(struct inode *inode,
                                char *buffer, size_t buffer_size)
{
        int rc;

        /* Inodes flagged IS_PRIVATE() report 0 bytes without calling the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        rc = call_int_hook(inode_listsecurity, inode, buffer, buffer_size);
        return rc;
}
EXPORT_SYMBOL(security_inode_listsecurity);

/**
 * security_inode_getsecid() - Get an inode's secid
 * @inode: inode
 * @secid: secid to return
 *
 * Get the secid associated with the inode.  In case of failure, or when no
 * hook writes a value, @secid will be set to zero.
 */
void security_inode_getsecid(struct inode *inode, u32 *secid)
{
        /*
         * Honour the documented "zero on failure" contract here rather than
         * relying on a hook to do it: if nothing writes to @secid, the caller
         * would otherwise read whatever the variable held before the call.
         */
        *secid = 0;
        call_void_hook(inode_getsecid, inode, secid);
}

/**
 * security_inode_copy_up() - Create new creds for an overlayfs copy-up op
 * @src: union dentry of copy-up file
 * @new: newly created creds
 *
 * A file is about to be copied up from lower layer to upper layer of overlay
 * filesystem. Security module can prepare a set of new creds and modify as
 * need be and return new creds. Caller will switch to new creds temporarily to
 * create new file and release newly allocated creds.
 *
 * Return: Returns 0 on success or a negative error code on error.
 */
int security_inode_copy_up(struct dentry *src, struct cred **new)
{
        int rc;

        /* The hook result is passed through unmodified. */
        rc = call_int_hook(inode_copy_up, src, new);
        return rc;
}
EXPORT_SYMBOL(security_inode_copy_up);

/**
 * security_inode_copy_up_xattr() - Filter xattrs in an overlayfs copy-up op
 * @src: union dentry of copy-up file
 * @name: xattr name
 *
 * Filter the xattrs being copied up when a unioned file is copied up from a
 * lower layer to the union/overlay layer.   The caller is responsible for
 * reading and writing the xattrs, this hook is merely a filter.
 *
 * Return: Returns 0 to accept the xattr, 1 to discard the xattr, -EOPNOTSUPP
 *         if the security module does not know about attribute, or a negative
 *         error code to abort the copy up.
 */
int security_inode_copy_up_xattr(struct dentry *src, const char *name)
{
        /*
         * The implementation can return 0 (accept the xattr), 1 (discard the
         * xattr), -EOPNOTSUPP if it does not know anything about the xattr or
         * any other error code in case of an error.
         *
         * The hook result is handed back unchanged: a result equal to
         * LSM_RET_DEFAULT(inode_copy_up_xattr) already is that default value,
         * so no separate fallback branch is needed.
         */
        return call_int_hook(inode_copy_up_xattr, src, name);
}
EXPORT_SYMBOL(security_inode_copy_up_xattr);

/**
 * security_kernfs_init_security() - Init LSM context for a kernfs node
 * @kn_dir: parent kernfs node
 * @kn: the kernfs node to initialize
 *
 * Initialize the security context of a newly created kernfs node based on its
 * own and its parent's attributes.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernfs_init_security(struct kernfs_node *kn_dir,
                                  struct kernfs_node *kn)
{
        int rc;

        /* The hook result is passed through unmodified. */
        rc = call_int_hook(kernfs_init_security, kn_dir, kn);
        return rc;
}

/**
 * security_file_permission() - Check file permissions
 * @file: file
 * @mask: requested permissions
 *
 * Check file permissions before accessing an open file.  This hook is called
 * by various operations that read or write files.  A security module can use
 * this hook to perform additional checking on these operations, e.g. to
 * revalidate permissions on use to support privilege bracketing or policy
 * changes.  Notice that this hook is used when the actual read/write
 * operations are performed, whereas the inode_security_ops hook is called when
 * a file is opened (as well as many other operations).  Although this hook can
 * be used to revalidate permissions for various system call operations that
 * read or write files, it does not address the revalidation of permissions for
 * memory-mapped files.  Security modules must handle this separately if they
 * need such revalidation.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_permission(struct file *file, int mask)
{
        int rc;

        /* The hook result is passed through unmodified. */
        rc = call_int_hook(file_permission, file, mask);
        return rc;
}

/**
 * security_file_alloc() - Allocate and init a file's LSM blob
 * @file: the file
 *
 * Allocate and attach a security structure to the file->f_security field.  The
 * security field is initialized to NULL when the structure is first created.
 *
 * Return: Return 0 if the hook is successful and permission is granted.
 */
int security_file_alloc(struct file *file)
{
        int err;

        /* Step 1: blob allocation; a failure here is returned directly. */
        err = lsm_file_alloc(file);
        if (err)
                return err;

        /*
         * Step 2: run the hook.  If it fails, undo step 1 through
         * security_file_free() before reporting the error.
         */
        err = call_int_hook(file_alloc_security, file);
        if (unlikely(err))
                security_file_free(file);

        return err;
}

/**
 * security_file_release() - Perform actions before releasing the file ref
 * @file: the file
 *
 * Perform actions before releasing the last reference to a file.
 */
void security_file_release(struct file *file)
{
        /* Pure dispatch: nothing is checked or freed here, only the hook runs. */
        call_void_hook(file_release, file);
}

/**
 * security_file_free() - Free a file's LSM blob
 * @file: the file
 *
 * Deallocate and free any security structures stored in file->f_security.
 */
void security_file_free(struct file *file)
{
        void *blob;

        /* The hook runs first, while file->f_security is still attached. */
        call_void_hook(file_free_security, file);

        blob = file->f_security;
        if (!blob)
                return;

        /* Detach the blob from the file before returning it to the cache. */
        file->f_security = NULL;
        kmem_cache_free(lsm_file_cache, blob);
}

/**
 * security_file_ioctl() - Check if an ioctl is allowed
 * @file: associated file
 * @cmd: ioctl cmd
 * @arg: ioctl arguments
 *
 * Check permission for an ioctl operation on @file.  Depending on @cmd, @arg
 * is either a user space pointer or a plain integer value.  A security module
 * should never use @arg when it represents a user space pointer.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        int rc = call_int_hook(file_ioctl, file, cmd, arg);

        return rc;
}
EXPORT_SYMBOL_GPL(security_file_ioctl);

/**
 * security_file_ioctl_compat() - Check if an ioctl is allowed in compat mode
 * @file: associated file
 * @cmd: ioctl cmd
 * @arg: ioctl arguments
 *
 * Compat counterpart of security_file_ioctl(), for correct handling of 32-bit
 * processes running on 64-bit kernels.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_ioctl_compat(struct file *file, unsigned int cmd,
                               unsigned long arg)
{
        int rc = call_int_hook(file_ioctl_compat, file, cmd, arg);

        return rc;
}
EXPORT_SYMBOL_GPL(security_file_ioctl_compat);

/*
 * mmap_prot() - apply READ_IMPLIES_EXEC to a requested mmap protection
 *
 * Returns @prot unchanged unless the request has PROT_READ without PROT_EXEC
 * and the current task's personality has READ_IMPLIES_EXEC set.  In that case
 * PROT_EXEC is added for anonymous mappings (@file is NULL) and for files
 * that are not on a noexec mount.
 */
static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
{
        unsigned long rx_bits = prot & (PROT_READ | PROT_EXEC);

        /*
         * Do we have PROT_READ and does the application expect
         * it to imply PROT_EXEC?  If not, nothing to talk about...
         */
        if (rx_bits != PROT_READ)
                return prot;
        if ((current->personality & READ_IMPLIES_EXEC) == 0)
                return prot;

        if (file) {
                /* anything on noexec mount won't get PROT_EXEC */
                if (path_noexec(&file->f_path))
                        return prot;
#ifndef CONFIG_MMU
                /*
                 * on !MMU we additionally need NOMMU_MAP_EXEC (== VM_MAYEXEC)
                 * when the file reports its mmap capabilities
                 */
                if (file->f_op->mmap_capabilities) {
                        unsigned int nommu_caps;

                        nommu_caps = file->f_op->mmap_capabilities(file);
                        if ((nommu_caps & NOMMU_MAP_EXEC) == 0)
                                return prot;
                }
#endif
        }

        /* anonymous mapping, or a file that is allowed to be executable */
        return prot | PROT_EXEC;
}

/**
 * security_mmap_file() - Check if mmap'ing a file is allowed
 * @file: file
 * @prot: protection applied by the kernel
 * @flags: flags
 *
 * Check permissions for a mmap operation.  The @file may be NULL, e.g. if
 * mapping anonymous memory.  The hook receives both @prot as passed in and
 * the value computed from it by mmap_prot().
 *
 * Return: Returns 0 if permission is granted.
 */
int security_mmap_file(struct file *file, unsigned long prot,
                       unsigned long flags)
{
        int rc;

        rc = call_int_hook(mmap_file, file, prot, mmap_prot(file, prot), flags);
        return rc;
}

/**
 * security_mmap_addr() - Check if mmap'ing an address is allowed
 * @addr: address
 *
 * Check permissions for a mmap operation that targets @addr.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_mmap_addr(unsigned long addr)
{
        int rc = call_int_hook(mmap_addr, addr);

        return rc;
}

/**
 * security_file_mprotect() - Check if changing memory protections is allowed
 * @vma: memory region
 * @reqprot: application requested protection
 * @prot: protection applied by the kernel
 *
 * Check permissions before the memory access permissions of @vma are changed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
                           unsigned long prot)
{
        int rc = call_int_hook(file_mprotect, vma, reqprot, prot);

        return rc;
}

/**
 * security_file_lock() - Check if a file lock is allowed
 * @file: file
 * @cmd: lock operation (e.g. F_RDLCK, F_WRLCK)
 *
 * Check permission before performing file locking operations.  This hook
 * mediates both flock and fcntl style locks.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_lock(struct file *file, unsigned int cmd)
{
        int rc = call_int_hook(file_lock, file, cmd);

        return rc;
}

/**
 * security_file_fcntl() - Check if fcntl() op is allowed
 * @file: file
 * @cmd: fcntl command
 * @arg: command argument
 *
 * Check permission before the file operation specified by @cmd is performed
 * on @file.  Depending on @cmd, @arg is either a user space pointer or a
 * plain integer value.  A security module should never use @arg when it
 * represents a user space pointer.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
        int rc = call_int_hook(file_fcntl, file, cmd, arg);

        return rc;
}

/**
 * security_file_set_fowner() - Set the file owner info in the LSM blob
 * @file: the file
 *
 * Save owner security information (typically from current->security) in
 * file->f_security for later use by the send_sigiotask hook.
 *
 * This function is void: nothing is reported back to the caller.
 */
void security_file_set_fowner(struct file *file)
{
        call_void_hook(file_set_fowner, file);
}

/**
 * security_file_send_sigiotask() - Check if sending SIGIO/SIGURG is allowed
 * @tsk: target task
 * @fown: signal sender
 * @sig: signal to be sent, SIGIO is sent if 0
 *
 * Check permission for the file owner @fown to send SIGIO or SIGURG to the
 * process @tsk.  This hook is sometimes called from interrupt.  The
 * fown_struct, @fown, is never outside the context of a struct file, so the
 * file structure (and associated security information) can always be
 * obtained: container_of(fown, struct file, f_owner).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_send_sigiotask(struct task_struct *tsk,
                                 struct fown_struct *fown, int sig)
{
        int rc = call_int_hook(file_send_sigiotask, tsk, fown, sig);

        return rc;
}

/**
 * security_file_receive() - Check if receiving a file via IPC is allowed
 * @file: file being received
 *
 * Lets security modules control whether a process may receive an open file
 * descriptor via socket IPC.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_receive(struct file *file)
{
        int rc = call_int_hook(file_receive, file);

        return rc;
}

/**
 * security_file_open() - Save open() time state for later use by the LSM
 * @file: the file being opened
 *
 * Save open-time permission checking state for later use upon file_permission,
 * and recheck access if anything has changed since inode_permission.  When
 * the file_open hook returns 0, the result of fsnotify_open_perm() on @file
 * is returned instead.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_open(struct file *file)
{
        int denied = call_int_hook(file_open, file);

        /* An LSM refusal wins; otherwise fsnotify has the final say. */
        return denied ? denied : fsnotify_open_perm(file);
}

/**
 * security_file_post_open() - Evaluate a file after it has been opened
 * @file: the file
 * @mask: access mask
 *
 * Evaluate an opened file together with the access mask requested with
 * open().  This hook is useful for LSMs that need the file content to be
 * available in order to make decisions.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_post_open(struct file *file, int mask)
{
        int rc = call_int_hook(file_post_open, file, mask);

        return rc;
}
EXPORT_SYMBOL_GPL(security_file_post_open);

/**
 * security_file_truncate() - Check if truncating a file is allowed
 * @file: file
 *
 * Check permission before truncating a file, i.e. using ftruncate.
 * Truncation permission may also be checked based on the path, using the
 * @path_truncate hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_truncate(struct file *file)
{
        int rc = call_int_hook(file_truncate, file);

        return rc;
}

/**
 * security_task_alloc() - Allocate a task's LSM blob
 * @task: the task
 * @clone_flags: flags indicating what is being shared
 *
 * Handle allocation of task-related resources.  If the task_alloc hook
 * reports an error, security_task_free() is called on @task before returning.
 *
 * Return: Returns a zero on success, negative values on failure.
 */
int security_task_alloc(struct task_struct *task, unsigned long clone_flags)
{
        int err;

        err = lsm_task_alloc(task);
        if (err)
                return err;

        err = call_int_hook(task_alloc, task, clone_flags);
        if (unlikely(err))
                goto undo_alloc;

        return 0;

undo_alloc:
        security_task_free(task);
        return err;
}

/**
 * security_task_free() - Free a task's LSM blob and related resources
 * @task: task
 *
 * Handle release of task-related resources: run the task_free hook, free
 * task->security and reset it to NULL.  This can be called from interrupt
 * context.
 */
void security_task_free(struct task_struct *task)
{
        void *blob;

        call_void_hook(task_free, task);

        /* Read the pointer only after the hook has run. */
        blob = task->security;
        kfree(blob);
        task->security = NULL;
}

/**
 * security_cred_alloc_blank() - Allocate the min memory to allow cred_transfer
 * @cred: credentials
 * @gfp: gfp flags
 *
 * Only allocate sufficient memory and attach to @cred such that
 * cred_transfer() will not get ENOMEM.  If the cred_alloc_blank hook reports
 * an error, security_cred_free() is called on @cred before returning.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_cred_alloc_blank(struct cred *cred, gfp_t gfp)
{
        int err;

        err = lsm_cred_alloc(cred, gfp);
        if (err)
                return err;

        err = call_int_hook(cred_alloc_blank, cred, gfp);
        if (unlikely(err))
                goto undo_alloc;

        return 0;

undo_alloc:
        security_cred_free(cred);
        return err;
}

/**
 * security_cred_free() - Free the cred's LSM blob and associated resources
 * @cred: credentials
 *
 * Deallocate and clear the cred->security field in a set of credentials.
 * Nothing is done, and the cred_free hook is not run, when cred->security is
 * already NULL.
 */
void security_cred_free(struct cred *cred)
{
        void *blob;

        /*
         * prepare_creds() has a failure case that may bring us here
         * with ->security still NULL.
         */
        if (unlikely(!cred->security))
                return;

        call_void_hook(cred_free, cred);

        /* Read the pointer only after the hook has run. */
        blob = cred->security;
        kfree(blob);
        cred->security = NULL;
}

/**
 * security_prepare_creds() - Prepare a new set of credentials
 * @new: new credentials
 * @old: original credentials
 * @gfp: gfp flags
 *
 * Prepare a new set of credentials by copying the data from the old set.  If
 * the cred_prepare hook reports an error, security_cred_free() is called on
 * @new before returning.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp)
{
        int err;

        err = lsm_cred_alloc(new, gfp);
        if (err)
                return err;

        err = call_int_hook(cred_prepare, new, old, gfp);
        if (unlikely(err))
                goto undo_alloc;

        return 0;

undo_alloc:
        security_cred_free(new);
        return err;
}

/**
 * security_transfer_creds() - Transfer creds
 * @new: target credentials
 * @old: original credentials
 *
 * Transfer data from original creds to new creds.  The work is delegated to
 * the cred_transfer hook; no value is returned to the caller.
 */
void security_transfer_creds(struct cred *new, const struct cred *old)
{
        call_void_hook(cred_transfer, new, old);
}

/**
 * security_cred_getsecid() - Get the secid from a set of credentials
 * @c: credentials
 * @secid: secid value
 *
 * Retrieve the security identifier of the cred structure @c.  In case of
 * failure, @secid will be set to zero.
 */
void security_cred_getsecid(const struct cred *c, u32 *secid)
{
        /* Start from zero so that value remains if the hook does not set @secid. */
        *secid = 0;
        call_void_hook(cred_getsecid, c, secid);
}
EXPORT_SYMBOL(security_cred_getsecid);

/**
 * security_kernel_act_as() - Set the kernel credentials to act as secid
 * @new: credentials
 * @secid: secid
 *
 * Set the credentials for a kernel service to act as (subjective context).
 * The current task must be the one that nominated @secid.
 *
 * Return: Returns 0 if successful.
 */
int security_kernel_act_as(struct cred *new, u32 secid)
{
        int rc = call_int_hook(kernel_act_as, new, secid);

        return rc;
}

/**
 * security_kernel_create_files_as() - Set file creation context using an inode
 * @new: target credentials
 * @inode: reference inode
 *
 * Make the file creation context in @new the same as the objective context
 * of @inode.  The current task must be the one that nominated @inode.
 *
 * Return: Returns 0 if successful.
 */
int security_kernel_create_files_as(struct cred *new, struct inode *inode)
{
        int rc = call_int_hook(kernel_create_files_as, new, inode);

        return rc;
}

/**
 * security_kernel_module_request() - Check if loading a module is allowed
 * @kmod_name: module name
 *
 * Check the ability to trigger the kernel to automatically upcall to
 * userspace so that userspace loads the kernel module named @kmod_name.
 *
 * Return: Returns 0 if successful.
 */
int security_kernel_module_request(char *kmod_name)
{
        int rc = call_int_hook(kernel_module_request, kmod_name);

        return rc;
}

/**
 * security_kernel_read_file() - Read a file specified by userspace
 * @file: file
 * @id: file identifier
 * @contents: true if security_kernel_post_read_file() will be called
 *
 * Read a file specified by userspace.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernel_read_file(struct file *file, enum kernel_read_file_id id,
                              bool contents)
{
        int rc = call_int_hook(kernel_read_file, file, id, contents);

        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_read_file);

/**
 * security_kernel_post_read_file() - Read a file specified by userspace
 * @file: file
 * @buf: file contents
 * @size: size of file contents
 * @id: file identifier
 *
 * Read a file specified by userspace.  This must be paired with a prior
 * security_kernel_read_file() call that indicated this hook would also be
 * called, see security_kernel_read_file() for more information.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernel_post_read_file(struct file *file, char *buf, loff_t size,
                                   enum kernel_read_file_id id)
{
        int rc = call_int_hook(kernel_post_read_file, file, buf, size, id);

        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_post_read_file);

/**
 * security_kernel_load_data() - Load data provided by userspace
 * @id: data identifier
 * @contents: true if security_kernel_post_load_data() will be called
 *
 * Load data provided by userspace.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernel_load_data(enum kernel_load_data_id id, bool contents)
{
        int rc = call_int_hook(kernel_load_data, id, contents);

        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_load_data);

/**
 * security_kernel_post_load_data() - Load userspace data from a non-file source
 * @buf: data
 * @size: size of data
 * @id: data identifier
 * @description: text description of data, specific to the id value
 *
 * Load data provided by a non-file source (usually a userspace buffer).  This
 * must be paired with a prior security_kernel_load_data() call that indicated
 * this hook would also be called, see security_kernel_load_data() for more
 * information.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernel_post_load_data(char *buf, loff_t size,
                                   enum kernel_load_data_id id,
                                   char *description)
{
        int rc;

        rc = call_int_hook(kernel_post_load_data, buf, size, id, description);
        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_post_load_data);

/**
 * security_task_fix_setuid() - Update LSM with new user id attributes
 * @new: updated credentials
 * @old: credentials being replaced
 * @flags: LSM_SETID_* flag values
 *
 * Update the module's state after setting one or more of the user identity
 * attributes of the current process.  The @flags parameter indicates which of
 * the set*uid system calls invoked this hook.  @new is the set of credentials
 * that will be installed.  Modifications should be made to this rather than to
 * @current->cred.
 *
 * Return: Returns 0 on success.
 */
int security_task_fix_setuid(struct cred *new, const struct cred *old,
                             int flags)
{
        int rc = call_int_hook(task_fix_setuid, new, old, flags);

        return rc;
}

/**
 * security_task_fix_setgid() - Update LSM with new group id attributes
 * @new: updated credentials
 * @old: credentials being replaced
 * @flags: LSM_SETID_* flag value
 *
 * Update the module's state after setting one or more of the group identity
 * attributes of the current process.  @flags tells which of the set*gid
 * system calls invoked this hook.  @new is the set of credentials that will
 * be installed; modifications should be made to it rather than to
 * @current->cred.
 *
 * Return: Returns 0 on success.
 */
int security_task_fix_setgid(struct cred *new, const struct cred *old,
                             int flags)
{
        int rc = call_int_hook(task_fix_setgid, new, old, flags);

        return rc;
}

/**
 * security_task_fix_setgroups() - Update LSM with new supplementary groups
 * @new: updated credentials
 * @old: credentials being replaced
 *
 * Update the module's state after setting the supplementary group identity
 * attributes of the current process.  @new is the set of credentials that
 * will be installed; modifications should be made to it rather than to
 * @current->cred.
 *
 * Return: Returns 0 on success.
 */
int security_task_fix_setgroups(struct cred *new, const struct cred *old)
{
        int rc = call_int_hook(task_fix_setgroups, new, old);

        return rc;
}

/**
 * security_task_setpgid() - Check if setting the pgid is allowed
 * @p: task being modified
 * @pgid: new pgid
 *
 * Check permission before the process group identifier of the process @p is
 * set to @pgid.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setpgid(struct task_struct *p, pid_t pgid)
{
        int rc = call_int_hook(task_setpgid, p, pgid);

        return rc;
}

/**
 * security_task_getpgid() - Check if getting the pgid is allowed
 * @p: task
 *
 * Check permission before the process group identifier of the process @p is
 * read.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_getpgid(struct task_struct *p)
{
        int rc = call_int_hook(task_getpgid, p);

        return rc;
}

/**
 * security_task_getsid() - Check if getting the session id is allowed
 * @p: task
 *
 * Check permission before the session identifier of the process @p is read.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_getsid(struct task_struct *p)
{
        int rc = call_int_hook(task_getsid, p);

        return rc;
}

/**
 * security_current_getsecid_subj() - Get the current task's subjective secid
 * @secid: secid value
 *
 * Retrieve the subjective security identifier of the current task and return
 * it in @secid.  In case of failure, @secid will be set to zero.
 */
void security_current_getsecid_subj(u32 *secid)
{
        /* Start from zero so that value remains if the hook does not set @secid. */
        *secid = 0;
        call_void_hook(current_getsecid_subj, secid);
}
EXPORT_SYMBOL(security_current_getsecid_subj);

/**
 * security_task_getsecid_obj() - Get a task's objective secid
 * @p: target task
 * @secid: secid value
 *
 * Retrieve the objective security identifier of the task_struct in @p and
 * return it in @secid.  In case of failure, @secid will be set to zero.
 */
void security_task_getsecid_obj(struct task_struct *p, u32 *secid)
{
        /* Start from zero so that value remains if the hook does not set @secid. */
        *secid = 0;
        call_void_hook(task_getsecid_obj, p, secid);
}
EXPORT_SYMBOL(security_task_getsecid_obj);

/**
 * security_task_setnice() - Check if setting a task's nice value is allowed
 * @p: target task
 * @nice: nice value
 *
 * Check permission before the nice value of @p is set to @nice.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setnice(struct task_struct *p, int nice)
{
        int rc = call_int_hook(task_setnice, p, nice);

        return rc;
}

/**
 * security_task_setioprio() - Check if setting a task's ioprio is allowed
 * @p: target task
 * @ioprio: ioprio value
 *
 * Check permission before the ioprio value of @p is set to @ioprio.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setioprio(struct task_struct *p, int ioprio)
{
        int rc = call_int_hook(task_setioprio, p, ioprio);

        return rc;
}

/**
 * security_task_getioprio() - Check if getting a task's ioprio is allowed
 * @p: task
 *
 * Check permission before the ioprio value of @p is read.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_getioprio(struct task_struct *p)
{
        int rc = call_int_hook(task_getioprio, p);

        return rc;
}

/**
 * security_task_prlimit() - Check if get/setting resources limits is allowed
 * @cred: current task credentials
 * @tcred: target task credentials
 * @flags: LSM_PRLIMIT_* flag bits indicating a get/set/both
 *
 * Check permission before the resource limits of another task are read
 * and/or changed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_prlimit(const struct cred *cred, const struct cred *tcred,
                          unsigned int flags)
{
        int rc = call_int_hook(task_prlimit, cred, tcred, flags);

        return rc;
}

/**
 * security_task_setrlimit() - Check if setting a new rlimit value is allowed
 * @p: target task's group leader
 * @resource: resource whose limit is being set
 * @new_rlim: new resource limit
 *
 * Check permission before the limit of process @p for @resource is set to
 * @new_rlim.  The old resource limit values can be examined by dereferencing
 * (p->signal->rlim + resource).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setrlimit(struct task_struct *p, unsigned int resource,
                            struct rlimit *new_rlim)
{
        int rc = call_int_hook(task_setrlimit, p, resource, new_rlim);

        return rc;
}

/**
 * security_task_setscheduler() - Check if setting sched policy/param is allowed
 * @p: target task
 *
 * Check permission before the scheduling policy and/or parameters of process
 * @p are set.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setscheduler(struct task_struct *p)
{
        int rc = call_int_hook(task_setscheduler, p);

        return rc;
}

/**
 * security_task_getscheduler() - Check if getting scheduling info is allowed
 * @p: target task
 *
 * Check permission before scheduling information for process @p is obtained.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_getscheduler(struct task_struct *p)
{
        int rc = call_int_hook(task_getscheduler, p);

        return rc;
}

/**
 * security_task_movememory() - Check if moving memory is allowed
 * @p: task
 *
 * Check permission before memory owned by process @p is moved.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_movememory(struct task_struct *p)
{
        int rc = call_int_hook(task_movememory, p);

        return rc;
}

/**
 * security_task_kill() - Check if sending a signal is allowed
 * @p: target process
 * @info: signal information
 * @sig: signal value
 * @cred: credentials of the signal sender, NULL if @current
 *
 * Check permission before sending signal @sig to @p.  @info can be NULL, the
 * constant 1, or a pointer to a kernel_siginfo structure.  If @info is 1 or
 * SI_FROMKERNEL(info) is true, the signal should be viewed as coming from the
 * kernel and should typically be permitted.  SIGIO signals are handled
 * separately by the send_sigiotask hook in file_security_ops.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_kill(struct task_struct *p, struct kernel_siginfo *info,
                       int sig, const struct cred *cred)
{
        int rc = call_int_hook(task_kill, p, info, sig, cred);

        return rc;
}

/**
 * security_task_prctl() - Check if a prctl op is allowed
 * @option: operation
 * @arg2: argument
 * @arg3: argument
 * @arg4: argument
 * @arg5: argument
 *
 * Check permission before performing a process control operation on the
 * current process.  Each task_prctl hook is asked in list order.  An answer
 * equal to the hook's default is ignored, a zero answer is remembered while
 * the walk continues, and any other answer ends the walk and is returned.
 *
 * Return: Return -ENOSYS if no-one wanted to handle this op, any other value
 *         to cause prctl() to return immediately with that value.
 */
int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                        unsigned long arg4, unsigned long arg5)
{
        struct security_hook_list *entry;
        int result = LSM_RET_DEFAULT(task_prctl);

        hlist_for_each_entry(entry, &security_hook_heads.task_prctl, list) {
                int answer;

                answer = entry->hook.task_prctl(option, arg2, arg3, arg4,
                                                arg5);
                /* This module did not handle the option. */
                if (answer == LSM_RET_DEFAULT(task_prctl))
                        continue;

                /* A non-zero answer is final; zero lets later modules speak. */
                if (answer)
                        return answer;
                result = answer;
        }

        return result;
}

/**
 * security_task_to_inode() - Set the security attributes of a task's inode
 * @p: task
 * @inode: inode
 *
 * Set the security attributes for an inode based on an associated task's
 * security attributes, e.g. for /proc/pid inodes.  The work is delegated to
 * the task_to_inode hook; no value is returned to the caller.
 */
void security_task_to_inode(struct task_struct *p, struct inode *inode)
{
        call_void_hook(task_to_inode, p, inode);
}

/**
 * security_create_user_ns() - Check if creating a new userns is allowed
 * @cred: prepared creds
 *
 * Check permission before a new user namespace is created.
 *
 * Return: Returns 0 if successful, otherwise < 0 error code.
 */
int security_create_user_ns(const struct cred *cred)
{
        int rc = call_int_hook(userns_create, cred);

        return rc;
}

/**
 * security_ipc_permission() - Check if sysv ipc access is allowed
 * @ipcp: ipc permission structure
 * @flag: requested permissions
 *
 * Check permissions for access to the IPC object described by @ipcp.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
{
        int rc = call_int_hook(ipc_permission, ipcp, flag);

        return rc;
}

/**
 * security_ipc_getsecid() - Get the sysv ipc object's secid
 * @ipcp: ipc permission structure
 * @secid: secid pointer
 *
 * Get the secid associated with the ipc object.  In case of failure, @secid
 * will be set to zero.
 */
void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
{
        /* Start from zero so that value remains if the hook does not set @secid. */
        *secid = 0;
        call_void_hook(ipc_getsecid, ipcp, secid);
}

/**
 * security_msg_msg_alloc() - Allocate a sysv ipc message LSM blob
 * @msg: message structure
 *
 * Allocate and attach a security structure to the msg->security field.  The
 * security field is initialized to NULL when the structure is first created.
 * If the msg_msg_alloc_security hook reports an error,
 * security_msg_msg_free() is called on @msg before returning.
 *
 * Return: Return 0 if operation was successful and permission is granted.
 */
int security_msg_msg_alloc(struct msg_msg *msg)
{
        int err;

        err = lsm_msg_msg_alloc(msg);
        if (unlikely(err))
                return err;

        err = call_int_hook(msg_msg_alloc_security, msg);
        if (unlikely(err))
                goto undo_alloc;

        return 0;

undo_alloc:
        security_msg_msg_free(msg);
        return err;
}

/**
 * security_msg_msg_free() - Free a sysv ipc message LSM blob
 * @msg: message structure
 *
 * Run the msg_msg_free_security hook, then deallocate the security structure
 * of this message and reset msg->security to NULL.
 */
void security_msg_msg_free(struct msg_msg *msg)
{
        void *blob;

        call_void_hook(msg_msg_free_security, msg);

        /* Read the pointer only after the hook has run. */
        blob = msg->security;
        kfree(blob);
        msg->security = NULL;
}

/**
 * security_msg_queue_alloc() - Allocate a sysv ipc msg queue LSM blob
 * @msq: sysv ipc permission structure
 *
 * Allocate and attach a security structure to @msq.  The security field is
 * initialized to NULL when the structure is first created.  If the
 * msg_queue_alloc_security hook reports an error, security_msg_queue_free()
 * is called on @msq before returning.
 *
 * Return: Returns 0 if operation was successful and permission is granted.
 */
int security_msg_queue_alloc(struct kern_ipc_perm *msq)
{
        int err;

        err = lsm_ipc_alloc(msq);
        if (unlikely(err))
                return err;

        err = call_int_hook(msg_queue_alloc_security, msq);
        if (unlikely(err))
                goto undo_alloc;

        return 0;

undo_alloc:
        security_msg_queue_free(msq);
        return err;
}

/**
 * security_msg_queue_free() - Free a sysv ipc msg queue LSM blob
 * @msq: sysv ipc permission structure
 *
 * Run the msg_queue_free_security hook, then deallocate the security field
 * @msq->security of the message queue and reset it to NULL.
 */
void security_msg_queue_free(struct kern_ipc_perm *msq)
{
        void *blob;

        call_void_hook(msg_queue_free_security, msq);

        /* Read the pointer only after the hook has run. */
        blob = msq->security;
        kfree(blob);
        msq->security = NULL;
}

/**
 * security_msg_queue_associate() - Check if a msg queue operation is allowed
 * @msq: sysv ipc permission structure
 * @msqflg: operation flags
 *
 * Check permission when a message queue is requested through the msgget
 * system call.  This hook is only called when the identifier of an existing
 * message queue is returned, not when a new message queue is created.
 *
 * Return: Return 0 if permission is granted.
 */
int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
{
        int rc = call_int_hook(msg_queue_associate, msq, msqflg);

        return rc;
}

/**
 * security_msg_queue_msgctl() - Check if a msg queue operation is allowed
 * @msq: sysv ipc permission structure
 * @cmd: operation
 *
 * Check permission when a message control operation specified by @cmd is to be
 * performed on the message queue with permissions in @msq.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd)
{
        int rc = call_int_hook(msg_queue_msgctl, msq, cmd);

        return rc;
}

/**
 * security_msg_queue_msgsnd() - Check if sending a sysv ipc message is allowed
 * @msq: sysv ipc permission structure
 * @msg: message
 * @msqflg: operation flags
 *
 * Check permission before the message @msg is enqueued on the message queue
 * with permissions specified in @msq.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_msg_queue_msgsnd(struct kern_ipc_perm *msq,
                              struct msg_msg *msg, int msqflg)
{
        int rc = call_int_hook(msg_queue_msgsnd, msq, msg, msqflg);

        return rc;
}

/**
 * security_msg_queue_msgrcv() - Check if receiving a sysv ipc msg is allowed
 * @msq: sysv ipc permission structure
 * @msg: message
 * @target: target task
 * @type: type of message requested
 * @mode: operation flags
 *
 * Check permission before a message, @msg, is removed from the message queue.
 * The @target task structure contains a pointer to the process that will be
 * receiving the message (not equal to the current process when inline receives
 * are being performed).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg,
                              struct task_struct *target, long type, int mode)
{
        int rc;

        rc = call_int_hook(msg_queue_msgrcv, msq, msg, target, type, mode);
        return rc;
}

/**
 * security_shm_alloc() - Allocate a sysv shm LSM blob
 * @shp: sysv ipc permission structure
 *
 * Allocate and attach a security structure to the @shp security field.  The
 * security field is initialized to NULL when the structure is first created.
 * If the shm_alloc_security hook reports an error, security_shm_free() is
 * called on @shp before returning.
 *
 * Return: Returns 0 if operation was successful and permission is granted.
 */
int security_shm_alloc(struct kern_ipc_perm *shp)
{
        int err;

        err = lsm_ipc_alloc(shp);
        if (unlikely(err))
                return err;

        err = call_int_hook(shm_alloc_security, shp);
        if (unlikely(err))
                goto undo_alloc;

        return 0;

undo_alloc:
        security_shm_free(shp);
        return err;
}

/**
 * security_shm_free() - Free a sysv shm LSM blob
 * @shp: sysv ipc permission structure
 *
 * Run the shm_free_security hook, then deallocate the security structure
 * @shp->security of the memory segment and reset it to NULL.
 */
void security_shm_free(struct kern_ipc_perm *shp)
{
        void *blob;

        call_void_hook(shm_free_security, shp);

        /* Read the pointer only after the hook has run. */
        blob = shp->security;
        kfree(blob);
        shp->security = NULL;
}

/**
 * security_shm_associate() - Check if a sysv shm operation is allowed
 * @shp: sysv ipc permission structure
 * @shmflg: operation flags
 *
 * Check permission when a shared memory region is requested through the
 * shmget system call.  This hook is only called when the identifier of an
 * existing region is returned, not when a new shared memory region is
 * created.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_shm_associate(struct kern_ipc_perm *shp, int shmflg)
{
        int rc = call_int_hook(shm_associate, shp, shmflg);

        return rc;
}

/**
 * security_shm_shmctl() - Check if a sysv shm operation is allowed
 * @shp: sysv ipc permission structure
 * @cmd: operation
 *
 * Check permission when the shared memory control operation specified by
 * @cmd is to be performed on the shared memory region with permissions in
 * @shp.
 *
 * Return: Return 0 if permission is granted.
 */
int security_shm_shmctl(struct kern_ipc_perm *shp, int cmd)
{
        int rc = call_int_hook(shm_shmctl, shp, cmd);

        return rc;
}

/**
 * security_shm_shmat() - Check if a sysv shm attach operation is allowed
 * @shp: sysv ipc permission structure
 * @shmaddr: address of memory region to attach
 * @shmflg: operation flags
 *
 * Check permissions before the shmat system call is allowed to attach the
 * shared memory segment with permissions @shp to the data segment of the
 * calling process.  The attaching address is specified by @shmaddr.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_shm_shmat(struct kern_ipc_perm *shp,
                       char __user *shmaddr, int shmflg)
{
        int rc = call_int_hook(shm_shmat, shp, shmaddr, shmflg);

        return rc;
}

/**
 * security_sem_alloc() - Allocate a sysv semaphore LSM blob
 * @sma: sysv ipc permission structure
 *
 * Allocate and attach a security structure to the @sma security field.  The
 * security field is initialized to NULL when the structure is first created.
 * If the sem_alloc_security hook reports an error, security_sem_free() is
 * called on @sma before returning.
 *
 * Return: Returns 0 if operation was successful and permission is granted.
 */
int security_sem_alloc(struct kern_ipc_perm *sma)
{
        int err;

        err = lsm_ipc_alloc(sma);
        if (unlikely(err))
                return err;

        err = call_int_hook(sem_alloc_security, sma);
        if (unlikely(err))
                goto undo_alloc;

        return 0;

undo_alloc:
        security_sem_free(sma);
        return err;
}

/**
 * security_sem_free() - Free a sysv semaphore LSM blob
 * @sma: sysv ipc permission structure
 *
 * Deallocate security structure @sma->security for the semaphore.  The
 * sem_free_security LSM hook is invoked first, then the blob is freed and the
 * pointer in @sma is reset to NULL.
 */
void security_sem_free(struct kern_ipc_perm *sma)
{
        /* Run the hook while @sma->security is still set. */
        call_void_hook(sem_free_security, sma);
        kfree(sma->security);
        /* Do not leave the freed pointer behind in @sma. */
        sma->security = NULL;
}

/**
 * security_sem_associate() - Check if a sysv semaphore operation is allowed
 * @sma: sysv ipc permission structure
 * @semflg: operation flags
 *
 * Check permission when a semaphore is requested through the semget system
 * call. This hook is only called when returning the semaphore identifier for
 * an existing semaphore, not when a new one must be created.  The decision is
 * delegated to the sem_associate LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sem_associate(struct kern_ipc_perm *sma, int semflg)
{
        return call_int_hook(sem_associate, sma, semflg);
}

/**
 * security_sem_semctl() - Check if a sysv semaphore control operation is allowed
 * @sma: sysv ipc permission structure
 * @cmd: operation
 *
 * Check permission when a semaphore operation specified by @cmd is to be
 * performed on the semaphore @sma.  The decision is delegated to the
 * sem_semctl LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sem_semctl(struct kern_ipc_perm *sma, int cmd)
{
        return call_int_hook(sem_semctl, sma, cmd);
}

/**
 * security_sem_semop() - Check if a sysv semaphore operation is allowed
 * @sma: sysv ipc permission structure
 * @sops: operations to perform
 * @nsops: number of operations
 * @alter: flag indicating changes will be made
 *
 * Check permissions before performing operations on members of the semaphore
 * set. If the @alter flag is nonzero, the semaphore set may be modified.  The
 * decision is delegated to the sem_semop LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops,
                       unsigned nsops, int alter)
{
        return call_int_hook(sem_semop, sma, sops, nsops, alter);
}

/**
 * security_d_instantiate() - Populate an inode's LSM state based on a dentry
 * @dentry: dentry
 * @inode: inode, may be NULL
 *
 * Fill in @inode security information for a @dentry if allowed.  Inodes for
 * which IS_PRIVATE() is true are skipped; a NULL @inode is still handed to the
 * d_instantiate hook.
 */
void security_d_instantiate(struct dentry *dentry, struct inode *inode)
{
        /* Only non-private (or absent) inodes are shown to the LSMs. */
        if (likely(!inode || !IS_PRIVATE(inode))) {
                call_void_hook(d_instantiate, dentry, inode);
        }
}
EXPORT_SYMBOL(security_d_instantiate);

/*
 * Please keep this in sync with its counterpart in security/lsm_syscalls.c
 */

/**
 * security_getselfattr - Read an LSM attribute of the current process.
 * @attr: which attribute to return
 * @uctx: the user-space destination for the information, or NULL
 * @size: pointer to the size of space available to receive the data
 * @flags: special handling options. LSM_FLAG_SINGLE indicates that only
 * attributes associated with the LSM identified in the passed @uctx be
 * reported.
 *
 * A NULL value for @uctx can be used to get both the number of attributes
 * and the size of the data.  With LSM_FLAG_SINGLE a non-NULL @uctx carrying
 * a defined LSM ID is mandatory.
 *
 * Each LSM on the getselfattr hook list is asked in turn; entries are placed
 * back to back in the user buffer.  An LSM answering -EOPNOTSUPP is skipped.
 *
 * Returns the number of attributes found on success, negative value
 * on error. @size is reset to the total size of the data.
 * If @size is insufficient to contain the data -E2BIG is returned.
 */
int security_getselfattr(unsigned int attr, struct lsm_ctx __user *uctx,
                         u32 __user *size, u32 flags)
{
        struct lsm_ctx wanted = { .id = LSM_ID_UNDEF, };
        u8 __user *start = (u8 __user *)uctx;
        const bool single = flags != 0;
        struct security_hook_list *hp;
        bool toobig = false;
        u32 remaining;
        u32 used = 0;
        u32 entrysize;
        int nattrs = 0;
        int rc;

        if (attr == LSM_ATTR_UNDEF || size == NULL)
                return -EINVAL;
        if (get_user(remaining, size))
                return -EFAULT;

        if (single) {
                /* LSM_FLAG_SINGLE is the only flag understood here. */
                if (flags != LSM_FLAG_SINGLE || !uctx)
                        return -EINVAL;
                if (copy_from_user(&wanted, uctx, sizeof(wanted)))
                        return -EFAULT;
                /* The caller has to name the LSM it is interested in. */
                if (wanted.id == LSM_ID_UNDEF)
                        return -EINVAL;
        }

        /*
         * Normally every LSM contributes; in the single case only the
         * LSM named by the caller is consulted.
         */
        hlist_for_each_entry(hp, &security_hook_heads.getselfattr, list) {
                if (single && hp->lsmid->id != wanted.id)
                        continue;

                /* Offer the hook whatever room is still left. */
                entrysize = remaining;
                if (start)
                        uctx = (struct lsm_ctx __user *)(start + used);

                rc = hp->hook.getselfattr(attr, uctx, &entrysize, flags);
                if (rc == -EOPNOTSUPP)
                        continue;
                if (rc < 0 && rc != -E2BIG)
                        return rc;

                if (rc == -E2BIG) {
                        /* Keep going so the full size can be reported. */
                        toobig = true;
                        remaining = 0;
                        rc = 0;
                } else {
                        remaining -= entrysize;
                }

                used += entrysize;
                nattrs += rc;
                if (single)
                        break;
        }

        if (put_user(used, size))
                return -EFAULT;
        if (toobig)
                return -E2BIG;
        return nattrs ? nattrs : LSM_RET_DEFAULT(getselfattr);
}

/*
 * Please keep this in sync with its counterpart in security/lsm_syscalls.c
 */

/**
 * security_setselfattr - Set an LSM attribute on the current process.
 * @attr: which attribute to set
 * @uctx: the user-space source for the information
 * @size: the size of the data
 * @flags: reserved for future use, must be 0
 *
 * Set an LSM attribute for the current process. The LSM, attribute
 * and new value are included in @uctx.  The user data is copied into the
 * kernel, its length fields are checked against @size, and the request is
 * handed to the LSM on the setselfattr hook list whose ID matches.
 *
 * Returns 0 on success, -EINVAL if the input is inconsistent, -EFAULT
 * if the user buffer is inaccessible, -E2BIG if size is too big, or an
 * LSM specific failure.
 */
int security_setselfattr(unsigned int attr, struct lsm_ctx __user *uctx,
                         u32 size, u32 flags)
{
        struct security_hook_list *hp;
        struct lsm_ctx *kctx;
        u64 needed;
        int ret;

        if (flags || size < sizeof(*kctx))
                return -EINVAL;
        if (size > PAGE_SIZE)
                return -E2BIG;

        kctx = memdup_user(uctx, size);
        if (IS_ERR(kctx))
                return PTR_ERR(kctx);

        /* The lengths claimed in the header must fit what was copied. */
        ret = -EINVAL;
        if (size < kctx->len)
                goto out_free;
        if (check_add_overflow(sizeof(*kctx), kctx->ctx_len, &needed))
                goto out_free;
        if (kctx->len < needed)
                goto out_free;

        /* Only the LSM named in the context gets to act on it. */
        ret = LSM_RET_DEFAULT(setselfattr);
        hlist_for_each_entry(hp, &security_hook_heads.setselfattr, list) {
                if (hp->lsmid->id != kctx->id)
                        continue;
                ret = hp->hook.setselfattr(attr, kctx, size, flags);
                break;
        }

out_free:
        kfree(kctx);
        return ret;
}

/**
 * security_getprocattr() - Read an attribute for a task
 * @p: the task
 * @lsmid: LSM identification, 0 to accept the first LSM on the hook list
 * @name: attribute name
 * @value: attribute value
 *
 * Read attribute @name for task @p and store it into @value if allowed.  Only
 * the first LSM on the getprocattr hook list that matches @lsmid is asked.
 *
 * Return: Returns the length of @value on success, a negative value otherwise.
 */
int security_getprocattr(struct task_struct *p, int lsmid, const char *name,
                         char **value)
{
        struct security_hook_list *entry;

        hlist_for_each_entry(entry, &security_hook_heads.getprocattr, list) {
                if (lsmid == 0 || lsmid == entry->lsmid->id)
                        return entry->hook.getprocattr(p, name, value);
        }

        /* No LSM matched (or none registered the hook). */
        return LSM_RET_DEFAULT(getprocattr);
}

/**
 * security_setprocattr() - Set an attribute for a task
 * @lsmid: LSM identification, 0 to accept the first LSM on the hook list
 * @name: attribute name
 * @value: attribute value
 * @size: attribute value size
 *
 * Write (set) the current task's attribute @name to @value, size @size if
 * allowed.  Only the first LSM on the setprocattr hook list that matches
 * @lsmid is asked.
 *
 * Return: Returns bytes written on success, a negative value otherwise.
 */
int security_setprocattr(int lsmid, const char *name, void *value, size_t size)
{
        struct security_hook_list *entry;

        hlist_for_each_entry(entry, &security_hook_heads.setprocattr, list) {
                if (lsmid == 0 || lsmid == entry->lsmid->id)
                        return entry->hook.setprocattr(name, value, size);
        }

        /* No LSM matched (or none registered the hook). */
        return LSM_RET_DEFAULT(setprocattr);
}

/**
 * security_netlink_send() - Save info and check if netlink sending is allowed
 * @sk: sending socket
 * @skb: netlink message
 *
 * Save security information for a netlink message so that permission checking
 * can be performed when the message is processed.  The security information
 * can be saved using the eff_cap field of the netlink_skb_parms structure.
 * Also may be used to provide fine grained control over message transmission.
 * The work is delegated to the netlink_send LSM hook.
 *
 * Return: Returns 0 if the information was successfully saved and message is
 *         allowed to be transmitted.
 */
int security_netlink_send(struct sock *sk, struct sk_buff *skb)
{
        return call_int_hook(netlink_send, sk, skb);
}

/**
 * security_ismaclabel() - Check if the named attribute is a MAC label
 * @name: full extended attribute name
 *
 * Check if the extended attribute specified by @name represents a MAC label.
 * The answer comes from the ismaclabel LSM hook.
 *
 * Return: Returns 1 if @name is a MAC attribute, otherwise returns 0.
 */
int security_ismaclabel(const char *name)
{
        return call_int_hook(ismaclabel, name);
}
EXPORT_SYMBOL(security_ismaclabel);

/**
 * security_secid_to_secctx() - Convert a secid to a secctx
 * @secid: secid
 * @secdata: secctx
 * @seclen: secctx length
 *
 * Convert secid to security context.  If @secdata is NULL the length of the
 * result will be returned in @seclen, but no @secdata will be returned.  This
 * does mean that the length could change between calls to check the length and
 * the next call which actually allocates and returns the @secdata.  The
 * conversion is delegated to the secid_to_secctx LSM hook.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
{
        return call_int_hook(secid_to_secctx, secid, secdata, seclen);
}
EXPORT_SYMBOL(security_secid_to_secctx);

/**
 * security_secctx_to_secid() - Convert a secctx to a secid
 * @secdata: secctx
 * @seclen: length of secctx
 * @secid: secid
 *
 * Convert security context to secid.  @secid is cleared to 0 before the
 * secctx_to_secid LSM hook is consulted.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
{
        int rc;

        /* Start from a zero secid before any hook runs. */
        *secid = 0;

        rc = call_int_hook(secctx_to_secid, secdata, seclen, secid);
        return rc;
}
EXPORT_SYMBOL(security_secctx_to_secid);

/**
 * security_release_secctx() - Free a secctx buffer
 * @secdata: secctx
 * @seclen: length of secctx
 *
 * Release the security context @secdata of length @seclen by way of the
 * release_secctx LSM hook.
 */
void security_release_secctx(char *secdata, u32 seclen)
{
        call_void_hook(release_secctx, secdata, seclen);
}
EXPORT_SYMBOL(security_release_secctx);

/**
 * security_inode_invalidate_secctx() - Invalidate an inode's security label
 * @inode: inode
 *
 * Notify the security module that it must revalidate the security context of
 * an inode.  The notification is sent through the inode_invalidate_secctx LSM
 * hook.
 */
void security_inode_invalidate_secctx(struct inode *inode)
{
        call_void_hook(inode_invalidate_secctx, inode);
}
EXPORT_SYMBOL(security_inode_invalidate_secctx);

/**
 * security_inode_notifysecctx() - Notify the LSM of an inode's security label
 * @inode: inode
 * @ctx: secctx
 * @ctxlen: length of secctx
 *
 * Notify the security module of what the security context of an inode should
 * be.  Initializes the incore security context managed by the security module
 * for this inode.  Example usage: NFS client invokes this hook to initialize
 * the security context in its incore inode to the value provided by the server
 * for the file when the server returned the file's attributes to the client.
 * Must be called with inode->i_mutex locked.
 *
 * The work is delegated to the inode_notifysecctx LSM hook.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
{
        return call_int_hook(inode_notifysecctx, inode, ctx, ctxlen);
}
EXPORT_SYMBOL(security_inode_notifysecctx);

/**
 * security_inode_setsecctx() - Change the security label of an inode
 * @dentry: dentry for the inode
 * @ctx: secctx
 * @ctxlen: length of secctx
 *
 * Change the security context of an inode.  Updates the incore security
 * context managed by the security module and invokes the fs code as needed
 * (via __vfs_setxattr_noperm) to update any backing xattrs that represent the
 * context.  Example usage: NFS server invokes this hook to change the security
 * context in its incore inode and on the backing filesystem to a value
 * provided by the client on a SETATTR operation.  Must be called with
 * inode->i_mutex locked.
 *
 * The work is delegated to the inode_setsecctx LSM hook.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
{
        return call_int_hook(inode_setsecctx, dentry, ctx, ctxlen);
}
EXPORT_SYMBOL(security_inode_setsecctx);

/**
 * security_inode_getsecctx() - Get the security label of an inode
 * @inode: inode
 * @ctx: secctx
 * @ctxlen: length of secctx
 *
 * On success, returns 0 and fills out @ctx and @ctxlen with the security
 * context for the given @inode.  The lookup is delegated to the
 * inode_getsecctx LSM hook.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
{
        return call_int_hook(inode_getsecctx, inode, ctx, ctxlen);
}
EXPORT_SYMBOL(security_inode_getsecctx);

#ifdef CONFIG_WATCH_QUEUE
/**
 * security_post_notification() - Check if a watch notification can be posted
 * @w_cred: credentials of the task that set the watch
 * @cred: credentials of the task which triggered the watch
 * @n: the notification
 *
 * Check to see if a watch notification can be posted to a particular queue.
 * The decision is delegated to the post_notification LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_post_notification(const struct cred *w_cred,
                               const struct cred *cred,
                               struct watch_notification *n)
{
        return call_int_hook(post_notification, w_cred, cred, n);
}
#endif /* CONFIG_WATCH_QUEUE */

#ifdef CONFIG_KEY_NOTIFICATIONS
/**
 * security_watch_key() - Check if a task is allowed to watch for key events
 * @key: the key to watch
 *
 * Check to see if a process is allowed to watch for event notifications from
 * a key or keyring.  The decision is delegated to the watch_key LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_watch_key(struct key *key)
{
        return call_int_hook(watch_key, key);
}
#endif /* CONFIG_KEY_NOTIFICATIONS */

#ifdef CONFIG_SECURITY_NETWORK
/**
 * security_unix_stream_connect() - Check if an AF_UNIX stream is allowed
 * @sock: originating sock
 * @other: peer sock
 * @newsk: new sock
 *
 * Check permissions before establishing a Unix domain stream connection
 * between @sock and @other.
 *
 * The @unix_stream_connect and @unix_may_send hooks were necessary because
 * Linux provides an alternative to the conventional file name space for Unix
 * domain sockets.  Whereas binding and connecting to sockets in the file name
 * space is mediated by the typical file permissions (and caught by the mknod
 * and permission hooks in inode_security_ops), binding and connecting to
 * sockets in the abstract name space is completely unmediated.  Sufficient
 * control of Unix domain sockets in the abstract name space isn't possible
 * using only the socket layer hooks, since we need to know the actual target
 * socket, which is not looked up until we are inside the af_unix code.
 *
 * The decision is delegated to the unix_stream_connect LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_unix_stream_connect(struct sock *sock, struct sock *other,
                                 struct sock *newsk)
{
        return call_int_hook(unix_stream_connect, sock, other, newsk);
}
EXPORT_SYMBOL(security_unix_stream_connect);

/**
 * security_unix_may_send() - Check if AF_UNIX socket can send datagrams
 * @sock: originating socket
 * @other: peer socket
 *
 * Check permissions before connecting or sending datagrams from @sock to
 * @other.
 *
 * The @unix_stream_connect and @unix_may_send hooks were necessary because
 * Linux provides an alternative to the conventional file name space for Unix
 * domain sockets.  Whereas binding and connecting to sockets in the file name
 * space is mediated by the typical file permissions (and caught by the mknod
 * and permission hooks in inode_security_ops), binding and connecting to
 * sockets in the abstract name space is completely unmediated.  Sufficient
 * control of Unix domain sockets in the abstract name space isn't possible
 * using only the socket layer hooks, since we need to know the actual target
 * socket, which is not looked up until we are inside the af_unix code.
 *
 * The decision is delegated to the unix_may_send LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_unix_may_send(struct socket *sock,  struct socket *other)
{
        return call_int_hook(unix_may_send, sock, other);
}
EXPORT_SYMBOL(security_unix_may_send);

/**
 * security_socket_create() - Check if creating a new socket is allowed
 * @family: protocol family
 * @type: communications type
 * @protocol: requested protocol
 * @kern: set to 1 if a kernel socket is requested
 *
 * Check permissions prior to creating a new socket.  The decision is
 * delegated to the socket_create LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_create(int family, int type, int protocol, int kern)
{
        return call_int_hook(socket_create, family, type, protocol, kern);
}

/**
 * security_socket_post_create() - Initialize a newly created socket
 * @sock: socket
 * @family: protocol family
 * @type: communications type
 * @protocol: requested protocol
 * @kern: set to 1 if a kernel socket is requested
 *
 * This hook allows a module to update or allocate a per-socket security
 * structure. Note that the security field was not added directly to the socket
 * structure, but rather, the socket security information is stored in the
 * associated inode.  Typically, the inode alloc_security hook will allocate
 * and attach security information to SOCK_INODE(sock)->i_security.  This hook
 * may be used to update the SOCK_INODE(sock)->i_security field with additional
 * information that wasn't available when the inode was allocated.
 *
 * The work is delegated to the socket_post_create LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_post_create(struct socket *sock, int family,
                                int type, int protocol, int kern)
{
        return call_int_hook(socket_post_create, sock, family, type,
                             protocol, kern);
}

/**
 * security_socket_socketpair() - Check if creating a socketpair is allowed
 * @socka: first socket
 * @sockb: second socket
 *
 * Check permissions before creating a fresh pair of sockets.  The decision is
 * delegated to the socket_socketpair LSM hook.
 *
 * Return: Returns 0 if permission is granted and the connection was
 *         established.
 */
int security_socket_socketpair(struct socket *socka, struct socket *sockb)
{
        return call_int_hook(socket_socketpair, socka, sockb);
}
EXPORT_SYMBOL(security_socket_socketpair);

/**
 * security_socket_bind() - Check if a socket bind operation is allowed
 * @sock: socket
 * @address: requested bind address
 * @addrlen: length of address
 *
 * Check permission before socket protocol layer bind operation is performed
 * and the socket @sock is bound to the address specified in the @address
 * parameter.  The decision is delegated to the socket_bind LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_bind(struct socket *sock,
                         struct sockaddr *address, int addrlen)
{
        return call_int_hook(socket_bind, sock, address, addrlen);
}

/**
 * security_socket_connect() - Check if a socket connect operation is allowed
 * @sock: socket
 * @address: address of remote connection point
 * @addrlen: length of address
 *
 * Check permission before socket protocol layer connect operation attempts to
 * connect socket @sock to a remote address, @address.  The decision is
 * delegated to the socket_connect LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_connect(struct socket *sock,
                            struct sockaddr *address, int addrlen)
{
        return call_int_hook(socket_connect, sock, address, addrlen);
}

/**
 * security_socket_listen() - Check if a socket is allowed to listen
 * @sock: socket
 * @backlog: connection queue size
 *
 * Check permission before socket protocol layer listen operation.  The
 * decision is delegated to the socket_listen LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_listen(struct socket *sock, int backlog)
{
        return call_int_hook(socket_listen, sock, backlog);
}

/**
 * security_socket_accept() - Check if a socket is allowed to accept connections
 * @sock: listening socket
 * @newsock: newly created connection socket
 *
 * Check permission before accepting a new connection.  Note that the new
 * socket, @newsock, has been created and some information copied to it, but
 * the accept operation has not actually been performed.  The decision is
 * delegated to the socket_accept LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_accept(struct socket *sock, struct socket *newsock)
{
        return call_int_hook(socket_accept, sock, newsock);
}

/**
 * security_socket_sendmsg() - Check if sending a message is allowed
 * @sock: sending socket
 * @msg: message to send
 * @size: size of message
 *
 * Check permission before transmitting a message to another socket.  The
 * decision is delegated to the socket_sendmsg LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size)
{
        return call_int_hook(socket_sendmsg, sock, msg, size);
}

/**
 * security_socket_recvmsg() - Check if receiving a message is allowed
 * @sock: receiving socket
 * @msg: message to receive
 * @size: size of message
 * @flags: operational flags
 *
 * Check permission before receiving a message from a socket.  The decision is
 * delegated to the socket_recvmsg LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_recvmsg(struct socket *sock, struct msghdr *msg,
                            int size, int flags)
{
        return call_int_hook(socket_recvmsg, sock, msg, size, flags);
}

/**
 * security_socket_getsockname() - Check if reading the socket addr is allowed
 * @sock: socket
 *
 * Check permission before reading the local address (name) of the socket
 * object.  The decision is delegated to the socket_getsockname LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_getsockname(struct socket *sock)
{
        return call_int_hook(socket_getsockname, sock);
}

/**
 * security_socket_getpeername() - Check if reading the peer's addr is allowed
 * @sock: socket
 *
 * Check permission before reading the remote address (name) of a socket
 * object.  The decision is delegated to the socket_getpeername LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_getpeername(struct socket *sock)
{
        return call_int_hook(socket_getpeername, sock);
}

/**
 * security_socket_getsockopt() - Check if reading a socket option is allowed
 * @sock: socket
 * @level: option's protocol level
 * @optname: option name
 *
 * Check permissions before retrieving the options associated with socket
 * @sock.  The decision is delegated to the socket_getsockopt LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_getsockopt(struct socket *sock, int level, int optname)
{
        return call_int_hook(socket_getsockopt, sock, level, optname);
}

/**
 * security_socket_setsockopt() - Check if setting a socket option is allowed
 * @sock: socket
 * @level: option's protocol level
 * @optname: option name
 *
 * Check permissions before setting the options associated with socket @sock.
 * The decision is delegated to the socket_setsockopt LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_setsockopt(struct socket *sock, int level, int optname)
{
        return call_int_hook(socket_setsockopt, sock, level, optname);
}

/**
 * security_socket_shutdown() - Check if shutting down the socket is allowed
 * @sock: socket
 * @how: flag indicating how sends and receives are handled
 *
 * Check permission before all or part of a connection on the socket @sock is
 * shut down.  The decision is delegated to the socket_shutdown LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_shutdown(struct socket *sock, int how)
{
        return call_int_hook(socket_shutdown, sock, how);
}

/**
 * security_sock_rcv_skb() - Check if an incoming network packet is allowed
 * @sk: destination sock
 * @skb: incoming packet
 *
 * Check permissions on incoming network packets.  This hook is distinct from
 * Netfilter's IP input hooks since it is the first time that the incoming
 * sk_buff @skb has been associated with a particular socket, @sk.  Must not
 * sleep inside this hook because some callers hold spinlocks.
 *
 * The decision is delegated to the socket_sock_rcv_skb LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        return call_int_hook(socket_sock_rcv_skb, sk, skb);
}
EXPORT_SYMBOL(security_sock_rcv_skb);

/**
 * security_socket_getpeersec_stream() - Get the remote peer label
 * @sock: socket
 * @optval: destination buffer
 * @optlen: size of peer label copied into the buffer
 * @len: maximum size of the destination buffer
 *
 * This hook allows the security module to provide peer socket security state
 * for unix or connected tcp sockets to userspace via getsockopt SO_GETPEERSEC.
 * For tcp sockets this can be meaningful if the socket is associated with an
 * ipsec SA.
 *
 * The work is delegated to the socket_getpeersec_stream LSM hook.
 *
 * Return: Returns 0 if all is well, otherwise, typical getsockopt return
 *         values.
 */
int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval,
                                      sockptr_t optlen, unsigned int len)
{
        return call_int_hook(socket_getpeersec_stream, sock, optval, optlen,
                             len);
}

/**
 * security_socket_getpeersec_dgram() - Get the remote peer label
 * @sock: socket
 * @skb: datagram packet
 * @secid: remote peer label secid
 *
 * This hook allows the security module to provide peer socket security state
 * for udp sockets on a per-packet basis to userspace via getsockopt
 * SO_GETPEERSEC. The application must first have indicated the IP_PASSSEC
 * option via getsockopt. It can then retrieve the security state returned by
 * this hook for a packet via the SCM_SECURITY ancillary message type.
 *
 * The work is delegated to the socket_getpeersec_dgram LSM hook.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_socket_getpeersec_dgram(struct socket *sock,
                                     struct sk_buff *skb, u32 *secid)
{
        return call_int_hook(socket_getpeersec_dgram, sock, skb, secid);
}
EXPORT_SYMBOL(security_socket_getpeersec_dgram);

/**
 * security_sk_alloc() - Allocate and initialize a sock's LSM blob
 * @sk: sock
 * @family: protocol family
 * @priority: gfp flags
 *
 * Allocate and attach a security structure to the sk->sk_security field, which
 * is used to copy security attributes between local stream sockets.  This
 * wrapper does no allocation of its own; it only invokes the
 * sk_alloc_security LSM hook.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sk_alloc(struct sock *sk, int family, gfp_t priority)
{
        return call_int_hook(sk_alloc_security, sk, family, priority);
}

/**
 * security_sk_free() - Free the sock's LSM blob
 * @sk: sock
 *
 * Deallocate the security structure of @sk.  This wrapper frees nothing
 * itself; it only invokes the sk_free_security LSM hook.
 */
void security_sk_free(struct sock *sk)
{
        call_void_hook(sk_free_security, sk);
}

/**
 * security_sk_clone() - Clone a sock's LSM state
 * @sk: original sock
 * @newsk: target sock
 *
 * Clone/copy the security structure from @sk to @newsk.  The copy is carried
 * out by the sk_clone_security LSM hook.
 */
void security_sk_clone(const struct sock *sk, struct sock *newsk)
{
        call_void_hook(sk_clone_security, sk, newsk);
}
EXPORT_SYMBOL(security_sk_clone);

/**
 * security_sk_classify_flow() - Set a flow's secid based on socket
 * @sk: original socket
 * @flic: target flow
 *
 * Set the target flow's secid to socket's secid.  The sk_getsecid LSM hook is
 * given @sk and a pointer to @flic->flowic_secid to fill in.
 */
void security_sk_classify_flow(const struct sock *sk, struct flowi_common *flic)
{
        call_void_hook(sk_getsecid, sk, &flic->flowic_secid);
}
EXPORT_SYMBOL(security_sk_classify_flow);

/**
 * security_req_classify_flow() - Set a flow's secid based on request_sock
 * @req: request_sock
 * @flic: target flow
 *
 * Sets @flic's secid to @req's secid.  The work is delegated to the
 * req_classify_flow LSM hook.
 */
void security_req_classify_flow(const struct request_sock *req,
                                struct flowi_common *flic)
{
        call_void_hook(req_classify_flow, req, flic);
}
EXPORT_SYMBOL(security_req_classify_flow);

/**
 * security_sock_graft() - Reconcile LSM state when grafting a sock on a socket
 * @sk: sock being grafted
 * @parent: target parent socket
 *
 * Sets @parent's inode secid to @sk's secid and updates @sk with any necessary
 * LSM state from @parent.  The work is delegated to the sock_graft LSM hook.
 */
void security_sock_graft(struct sock *sk, struct socket *parent)
{
        call_void_hook(sock_graft, sk, parent);
}
EXPORT_SYMBOL(security_sock_graft);

/**
 * security_inet_conn_request() - Set request_sock state using incoming connect
 * @sk: parent listening sock
 * @skb: incoming connection
 * @req: new request_sock
 *
 * Initialize the @req LSM state based on @sk and the incoming connect in @skb.
 * The work is delegated to the inet_conn_request LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inet_conn_request(const struct sock *sk,
                               struct sk_buff *skb, struct request_sock *req)
{
        return call_int_hook(inet_conn_request, sk, skb, req);
}
EXPORT_SYMBOL(security_inet_conn_request);

/**
 * security_inet_csk_clone() - Set new sock LSM state based on request_sock
 * @newsk: new sock
 * @req: connection request_sock
 *
 * Set the LSM state of @newsk using the LSM state from @req.  The work is
 * delegated to the inet_csk_clone LSM hook.
 */
void security_inet_csk_clone(struct sock *newsk,
                             const struct request_sock *req)
{
        call_void_hook(inet_csk_clone, newsk, req);
}

/**
 * security_inet_conn_established() - Update sock's LSM state with connection
 * @sk: sock
 * @skb: connection packet
 *
 * Update @sk's LSM state to represent a new connection from @skb.  The work
 * is delegated to the inet_conn_established LSM hook.
 */
void security_inet_conn_established(struct sock *sk,
                                    struct sk_buff *skb)
{
        call_void_hook(inet_conn_established, sk, skb);
}
EXPORT_SYMBOL(security_inet_conn_established);

/**
 * security_secmark_relabel_packet() - Check if setting a secmark is allowed
 * @secid: new secmark value
 *
 * Check if the process should be allowed to relabel packets to @secid.  The
 * decision is delegated to the secmark_relabel_packet LSM hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_secmark_relabel_packet(u32 secid)
{
        return call_int_hook(secmark_relabel_packet, secid);
}
EXPORT_SYMBOL(security_secmark_relabel_packet);

/**
 * security_secmark_refcount_inc() - Increment the secmark labeling rule count
 *
 * Tells the LSM to increment the number of secmark labeling rules loaded.
 * No counter is kept here; the secmark_refcount_inc LSM hook is simply
 * invoked.
 */
void security_secmark_refcount_inc(void)
{
        call_void_hook(secmark_refcount_inc);
}
EXPORT_SYMBOL(security_secmark_refcount_inc);

/**
 * security_secmark_refcount_dec() - Decrement the secmark labeling rule count
 *
 * Tells the LSM to decrement the number of secmark labeling rules loaded.
 * No counter is kept here; the secmark_refcount_dec LSM hook is simply
 * invoked.
 */
void security_secmark_refcount_dec(void)
{
        call_void_hook(secmark_refcount_dec);
}
EXPORT_SYMBOL(security_secmark_refcount_dec);

/**
 * security_tun_dev_alloc_security() - Allocate a LSM blob for a TUN device
 * @security: pointer to the LSM blob
 *
 * This hook allows a module to allocate a security structure for a TUN device,
 * returning the pointer in @security.  The allocation is left to the
 * tun_dev_alloc_security LSM hook.
 *
 * Return: Returns a zero on success, negative values on failure.
 */
int security_tun_dev_alloc_security(void **security)
{
        return call_int_hook(tun_dev_alloc_security, security);
}
EXPORT_SYMBOL(security_tun_dev_alloc_security);

/**
 * security_tun_dev_free_security() - Free a TUN device LSM blob
 * @security: LSM blob
 *
 * This hook allows a module to free the security structure for a TUN device.
 * @security is passed unchanged to the tun_dev_free_security hook and nothing
 * is returned.
 */
void security_tun_dev_free_security(void *security)
{
        call_void_hook(tun_dev_free_security, security);
}
EXPORT_SYMBOL(security_tun_dev_free_security);

/**
 * security_tun_dev_create() - Check if creating a TUN device is allowed
 *
 * Permission check performed before a new TUN device is created. The verdict
 * comes from the tun_dev_create hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_create(void)
{
        int rc;

        rc = call_int_hook(tun_dev_create);
        return rc;
}
EXPORT_SYMBOL(security_tun_dev_create);

/**
 * security_tun_dev_attach_queue() - Check if attaching a TUN queue is allowed
 * @security: TUN device LSM blob
 *
 * Permission check performed before attaching to a TUN device queue.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_attach_queue(void *security)
{
        int rc;

        rc = call_int_hook(tun_dev_attach_queue, security);
        return rc;
}
EXPORT_SYMBOL(security_tun_dev_attach_queue);

/**
 * security_tun_dev_attach() - Update TUN device LSM state on attach
 * @sk: associated sock
 * @security: TUN device LSM blob
 *
 * Gives the module a chance to update any security state associated with the
 * TUN device's sock structure.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_attach(struct sock *sk, void *security)
{
        int rc;

        rc = call_int_hook(tun_dev_attach, sk, security);
        return rc;
}
EXPORT_SYMBOL(security_tun_dev_attach);

/**
 * security_tun_dev_open() - Update TUN device LSM state on open
 * @security: TUN device LSM blob
 *
 * Gives the module a chance to update any security state associated with the
 * TUN device's security structure.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_open(void *security)
{
        int rc;

        rc = call_int_hook(tun_dev_open, security);
        return rc;
}
EXPORT_SYMBOL(security_tun_dev_open);

/**
 * security_sctp_assoc_request() - Update the LSM on a SCTP association req
 * @asoc: SCTP association
 * @skb: packet requesting the association
 *
 * Passes @asoc and @skb, the skb of the association INIT packet, to the LSM.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sctp_assoc_request(struct sctp_association *asoc,
                                struct sk_buff *skb)
{
        int rc;

        rc = call_int_hook(sctp_assoc_request, asoc, skb);
        return rc;
}
EXPORT_SYMBOL(security_sctp_assoc_request);

/**
 * security_sctp_bind_connect() - Validate a list of addrs for a SCTP option
 * @sk: socket
 * @optname: SCTP option to validate
 * @address: list of IP addresses to validate
 * @addrlen: length of the address list
 *
 * Validate permissions required for each address associated with sock @sk.
 * Depending on @optname, the addresses will be treated as either a connect or
 * bind service. The @addrlen is calculated on each IPv4 and IPv6 address using
 * sizeof(struct sockaddr_in) or sizeof(struct sockaddr_in6).
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sctp_bind_connect(struct sock *sk, int optname,
                               struct sockaddr *address, int addrlen)
{
        int rc;

        rc = call_int_hook(sctp_bind_connect, sk, optname, address, addrlen);
        return rc;
}
EXPORT_SYMBOL(security_sctp_bind_connect);

/**
 * security_sctp_sk_clone() - Clone a SCTP sock's LSM state
 * @asoc: SCTP association
 * @sk: original sock
 * @newsk: target sock
 *
 * Called whenever a new socket is created by accept(2) (i.e. a TCP style
 * socket) or when a socket is 'peeled off', e.g. userspace calls
 * sctp_peeloff(3). All three arguments are forwarded to the sctp_sk_clone
 * hook; nothing is returned.
 */
void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk,
                            struct sock *newsk)
{
        call_void_hook(sctp_sk_clone, asoc, sk, newsk);
}
EXPORT_SYMBOL(security_sctp_sk_clone);

/**
 * security_sctp_assoc_established() - Update LSM state when assoc established
 * @asoc: SCTP association
 * @skb: packet establishing the association
 *
 * Passes @asoc and @skb, the skb of the association COOKIE_ACK packet, to the
 * security module.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sctp_assoc_established(struct sctp_association *asoc,
                                    struct sk_buff *skb)
{
        int rc;

        rc = call_int_hook(sctp_assoc_established, asoc, skb);
        return rc;
}
EXPORT_SYMBOL(security_sctp_assoc_established);

/**
 * security_mptcp_add_subflow() - Inherit the LSM label from the MPTCP socket
 * @sk: the owning MPTCP socket
 * @ssk: the new subflow
 *
 * Update the labeling of the MPTCP subflow @ssk so that it matches the one of
 * the owning MPTCP socket @sk. This hook has to be called after the socket
 * creation and initialization via the security_socket_create() and
 * security_socket_post_create() LSM hooks.
 *
 * Return: Returns 0 on success or a negative error code on failure.
 */
int security_mptcp_add_subflow(struct sock *sk, struct sock *ssk)
{
        int rc;

        rc = call_int_hook(mptcp_add_subflow, sk, ssk);
        return rc;
}

#endif        /* CONFIG_SECURITY_NETWORK */

#ifdef CONFIG_SECURITY_INFINIBAND
/**
 * security_ib_pkey_access() - Check if access to an IB pkey is allowed
 * @sec: LSM blob
 * @subnet_prefix: subnet prefix of the port
 * @pkey: IB pkey
 *
 * Permission check for accessing a pkey when modifying a QP.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ib_pkey_access(void *sec, u64 subnet_prefix, u16 pkey)
{
        int rc;

        rc = call_int_hook(ib_pkey_access, sec, subnet_prefix, pkey);
        return rc;
}
EXPORT_SYMBOL(security_ib_pkey_access);

/**
 * security_ib_endport_manage_subnet() - Check if SMPs traffic is allowed
 * @sec: LSM blob
 * @dev_name: IB device name
 * @port_num: port number
 *
 * Permission check for sending and receiving SMPs on an end port.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ib_endport_manage_subnet(void *sec,
                                      const char *dev_name, u8 port_num)
{
        int rc;

        rc = call_int_hook(ib_endport_manage_subnet, sec, dev_name, port_num);
        return rc;
}
EXPORT_SYMBOL(security_ib_endport_manage_subnet);

/**
 * security_ib_alloc_security() - Allocate an Infiniband LSM blob
 * @sec: LSM blob
 *
 * Allocate a security structure for Infiniband objects; @sec is passed on to
 * the ib_alloc_security hook.
 *
 * Return: Returns 0 on success, non-zero on failure.
 */
int security_ib_alloc_security(void **sec)
{
        int rc;

        rc = call_int_hook(ib_alloc_security, sec);
        return rc;
}
EXPORT_SYMBOL(security_ib_alloc_security);

/**
 * security_ib_free_security() - Free an Infiniband LSM blob
 * @sec: LSM blob
 *
 * Deallocate an Infiniband security structure. @sec is passed unchanged to
 * the ib_free_security hook and nothing is returned.
 */
void security_ib_free_security(void *sec)
{
        call_void_hook(ib_free_security, sec);
}
EXPORT_SYMBOL(security_ib_free_security);
#endif        /* CONFIG_SECURITY_INFINIBAND */

#ifdef CONFIG_SECURITY_NETWORK_XFRM
/**
 * security_xfrm_policy_alloc() - Allocate a xfrm policy LSM blob
 * @ctxp: xfrm security context being added to the SPD
 * @sec_ctx: security label provided by userspace
 * @gfp: gfp flags
 *
 * Allocate a security structure to the xp->security field; the security field
 * is initialized to NULL when the xfrm_policy is allocated.
 *
 * Return: Return 0 if operation was successful.
 */
int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp,
                               struct xfrm_user_sec_ctx *sec_ctx,
                               gfp_t gfp)
{
        int rc;

        rc = call_int_hook(xfrm_policy_alloc_security, ctxp, sec_ctx, gfp);
        return rc;
}
EXPORT_SYMBOL(security_xfrm_policy_alloc);

/**
 * security_xfrm_policy_clone() - Clone xfrm policy LSM state
 * @old_ctx: xfrm security context
 * @new_ctxp: target xfrm security context
 *
 * Allocate a security structure in @new_ctxp that contains the information
 * from the @old_ctx structure.
 *
 * Return: Return 0 if operation was successful.
 */
int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx,
                               struct xfrm_sec_ctx **new_ctxp)
{
        int rc;

        rc = call_int_hook(xfrm_policy_clone_security, old_ctx, new_ctxp);
        return rc;
}

/**
 * security_xfrm_policy_free() - Free a xfrm security context
 * @ctx: xfrm security context
 *
 * Free LSM resources associated with @ctx. @ctx is passed unchanged to the
 * xfrm_policy_free_security hook and nothing is returned.
 */
void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx)
{
        call_void_hook(xfrm_policy_free_security, ctx);
}
EXPORT_SYMBOL(security_xfrm_policy_free);

/**
 * security_xfrm_policy_delete() - Check if deleting a xfrm policy is allowed
 * @ctx: xfrm security context
 *
 * Authorize deletion of a SPD entry. The verdict comes from the
 * xfrm_policy_delete_security hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx)
{
        int rc;

        rc = call_int_hook(xfrm_policy_delete_security, ctx);
        return rc;
}

/**
 * security_xfrm_state_alloc() - Allocate a xfrm state LSM blob
 * @x: xfrm state being added to the SAD
 * @sec_ctx: security label provided by userspace
 *
 * Allocate a security structure to the @x->security field; the security field
 * is initialized to NULL when the xfrm_state is allocated. The context is set
 * to correspond to @sec_ctx.
 *
 * Return: Return 0 if operation was successful.
 */
int security_xfrm_state_alloc(struct xfrm_state *x,
                              struct xfrm_user_sec_ctx *sec_ctx)
{
        int rc;

        rc = call_int_hook(xfrm_state_alloc, x, sec_ctx);
        return rc;
}
EXPORT_SYMBOL(security_xfrm_state_alloc);

/**
 * security_xfrm_state_alloc_acquire() - Allocate a xfrm state LSM blob
 * @x: xfrm state being added to the SAD
 * @polsec: associated policy's security context
 * @secid: secid from the flow
 *
 * Allocate a security structure to the @x->security field; the security field
 * is initialized to NULL when the xfrm_state is allocated. The context is set
 * to correspond to @secid.
 *
 * Return: Returns 0 if operation was successful.
 */
int security_xfrm_state_alloc_acquire(struct xfrm_state *x,
                                      struct xfrm_sec_ctx *polsec, u32 secid)
{
        int rc;

        rc = call_int_hook(xfrm_state_alloc_acquire, x, polsec, secid);
        return rc;
}

/**
 * security_xfrm_state_delete() - Check if deleting a xfrm state is allowed
 * @x: xfrm state
 *
 * Authorize deletion of @x->security. The verdict comes from the
 * xfrm_state_delete_security hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_xfrm_state_delete(struct xfrm_state *x)
{
        int rc;

        rc = call_int_hook(xfrm_state_delete_security, x);
        return rc;
}
EXPORT_SYMBOL(security_xfrm_state_delete);

/**
 * security_xfrm_state_free() - Free a xfrm state
 * @x: xfrm state
 *
 * Deallocate @x->security. @x is passed unchanged to the
 * xfrm_state_free_security hook and nothing is returned.
 */
void security_xfrm_state_free(struct xfrm_state *x)
{
        call_void_hook(xfrm_state_free_security, x);
}

/**
 * security_xfrm_policy_lookup() - Check if using a xfrm policy is allowed
 * @ctx: target xfrm security context
 * @fl_secid: flow secid used to authorize access
 *
 * Permission check made when a flow selects a xfrm_policy for processing XFRMs
 * on a packet. The hook is called when selecting either a per-socket policy or
 * a generic xfrm policy.
 *
 * Return: Return 0 if permission is granted, -ESRCH otherwise, or -errno on
 *         other errors.
 */
int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid)
{
        int rc;

        rc = call_int_hook(xfrm_policy_lookup, ctx, fl_secid);
        return rc;
}

/**
 * security_xfrm_state_pol_flow_match() - Check for a xfrm match
 * @x: xfrm state to match
 * @xp: xfrm policy to check for a match
 * @flic: flow to check for a match.
 *
 * Check @xp and @flic for a match with @x. Only the first entry on the
 * xfrm_state_pol_flow_match hook list is consulted; if the list is empty the
 * hook's LSM_RET_DEFAULT() value is returned instead.
 *
 * Return: Returns 1 if there is a match.
 */
int security_xfrm_state_pol_flow_match(struct xfrm_state *x,
                                       struct xfrm_policy *xp,
                                       const struct flowi_common *flic)
{
        struct security_hook_list *hp;

        /*
         * The result is expected to be 0 or 1, which makes combining the
         * answers of several LSMs awkward. Because only SELinux currently
         * supplies this call, the first LSM's judgment is used as-is.
         *
         * For speed, leave the walk on the first entry explicitly instead of
         * going through the generic hook macro.
         */
        hlist_for_each_entry(hp, &security_hook_heads.xfrm_state_pol_flow_match,
                             list) {
                return hp->hook.xfrm_state_pol_flow_match(x, xp, flic);
        }

        /* No LSM registered this hook. */
        return LSM_RET_DEFAULT(xfrm_state_pol_flow_match);
}

/**
 * security_xfrm_decode_session() - Determine the xfrm secid for a packet
 * @skb: xfrm packet
 * @secid: secid
 *
 * Decode the packet in @skb and return the security label in @secid. The
 * xfrm_decode_session hook is invoked with 1 as its final argument.
 *
 * Return: Return 0 if all xfrms used have the same secid.
 */
int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid)
{
        int rc;

        rc = call_int_hook(xfrm_decode_session, skb, secid, 1);
        return rc;
}

/**
 * security_skb_classify_flow() - Fill in a flow's secid from a packet
 * @skb: packet
 * @flic: flow whose flowic_secid field receives the result
 *
 * Runs the xfrm_decode_session hook on @skb with &@flic->flowic_secid as the
 * output location and 0 as the hook's final argument. A non-zero result from
 * the hook is treated as fatal via BUG_ON().
 */
void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic)
{
        int err;

        err = call_int_hook(xfrm_decode_session, skb, &flic->flowic_secid, 0);

        /* The hook is not expected to fail on this path. */
        BUG_ON(err);
}
EXPORT_SYMBOL(security_skb_classify_flow);
#endif        /* CONFIG_SECURITY_NETWORK_XFRM */

#ifdef CONFIG_KEYS
/**
 * security_key_alloc() - Allocate and initialize a kernel key LSM blob
 * @key: key
 * @cred: credentials
 * @flags: allocation flags
 *
 * Permit allocation of a key and assign security data. Note that @key does
 * not have a serial number assigned at this point.
 *
 * Return: Return 0 if permission is granted, -ve error otherwise.
 */
int security_key_alloc(struct key *key, const struct cred *cred,
                       unsigned long flags)
{
        int rc;

        rc = call_int_hook(key_alloc, key, cred, flags);
        return rc;
}

/**
 * security_key_free() - Free a kernel key LSM blob
 * @key: key
 *
 * Notification of destruction; free security data. @key is passed unchanged
 * to the key_free hook and nothing is returned.
 */
void security_key_free(struct key *key)
{
        call_void_hook(key_free, key);
}

/**
 * security_key_permission() - Check if a kernel key operation is allowed
 * @key_ref: key reference
 * @cred: credentials of actor requesting access
 * @need_perm: requested permissions
 *
 * Determine whether a specific operational right is granted to a process on
 * a key.
 *
 * Return: Return 0 if permission is granted, -ve error otherwise.
 */
int security_key_permission(key_ref_t key_ref, const struct cred *cred,
                            enum key_need_perm need_perm)
{
        int rc;

        rc = call_int_hook(key_permission, key_ref, cred, need_perm);
        return rc;
}

/**
 * security_key_getsecurity() - Get the key's security label
 * @key: key
 * @buffer: security label buffer
 *
 * Get a textual representation of the security context attached to a key for
 * the purposes of honouring KEYCTL_GETSECURITY. This function allocates the
 * storage for the NUL-terminated string and the caller should free it.
 *
 * Return: Returns the length of @buffer (including terminating NUL) or -ve if
 *         an error occurs.  May also return 0 (and a NULL buffer pointer) if
 *         there is no security label assigned to the key.
 */
int security_key_getsecurity(struct key *key, char **buffer)
{
        int len;

        /* Clear the output first so it is NULL unless the hook stores to it. */
        *buffer = NULL;
        len = call_int_hook(key_getsecurity, key, buffer);
        return len;
}

/**
 * security_key_post_create_or_update() - Notification of key create or update
 * @keyring: keyring to which the key is linked to
 * @key: created or updated key
 * @payload: data used to instantiate or update the key
 * @payload_len: length of payload
 * @flags: key flags
 * @create: flag indicating whether the key was created or updated
 *
 * Notify the caller of a key creation or update. All arguments are forwarded
 * unchanged to the key_post_create_or_update hook; nothing is returned.
 */
void security_key_post_create_or_update(struct key *keyring, struct key *key,
                                        const void *payload, size_t payload_len,
                                        unsigned long flags, bool create)
{
        call_void_hook(key_post_create_or_update, keyring, key, payload,
                       payload_len, flags, create);
}
#endif        /* CONFIG_KEYS */

#ifdef CONFIG_AUDIT
/**
 * security_audit_rule_init() - Allocate and init an LSM audit rule struct
 * @field: audit action
 * @op: rule operator
 * @rulestr: rule context
 * @lsmrule: receive buffer for audit rule struct
 * @gfp: GFP flag used for kmalloc
 *
 * Allocate and initialize an LSM audit rule structure.
 *
 * Return: Return 0 if @lsmrule has been successfully set, -EINVAL in case of
 *         an invalid rule.
 */
int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule,
                             gfp_t gfp)
{
        int rc;

        rc = call_int_hook(audit_rule_init, field, op, rulestr, lsmrule, gfp);
        return rc;
}

/**
 * security_audit_rule_known() - Check if an audit rule contains LSM fields
 * @krule: audit rule
 *
 * Reports whether the given @krule contains any fields related to the current
 * LSM.
 *
 * Return: Returns 1 in case of relation found, 0 otherwise.
 */
int security_audit_rule_known(struct audit_krule *krule)
{
        int rc;

        rc = call_int_hook(audit_rule_known, krule);
        return rc;
}

/**
 * security_audit_rule_free() - Free an LSM audit rule struct
 * @lsmrule: audit rule struct
 *
 * Deallocate the LSM audit rule structure previously allocated by
 * audit_rule_init(). @lsmrule is passed unchanged to the audit_rule_free hook
 * and nothing is returned.
 */
void security_audit_rule_free(void *lsmrule)
{
        call_void_hook(audit_rule_free, lsmrule);
}

/**
 * security_audit_rule_match() - Check if a label matches an audit rule
 * @secid: security label
 * @field: LSM audit field
 * @op: matching operator
 * @lsmrule: audit rule
 *
 * Determine whether the given @secid matches a rule previously approved by
 * security_audit_rule_known().
 *
 * Return: Returns 1 if secid matches the rule, 0 if it does not, -ERRNO on
 *         failure.
 */
int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule)
{
        int rc;

        rc = call_int_hook(audit_rule_match, secid, field, op, lsmrule);
        return rc;
}
#endif /* CONFIG_AUDIT */

#ifdef CONFIG_BPF_SYSCALL
/**
 * security_bpf() - Check if the bpf syscall operation is allowed
 * @cmd: command
 * @attr: bpf attribute
 * @size: size
 *
 * Do an initial check for all bpf syscalls after the attribute is copied into
 * the kernel. The actual security module can implement its own rules to check
 * the specific cmd it needs.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
        int rc;

        rc = call_int_hook(bpf, cmd, attr, size);
        return rc;
}

/**
 * security_bpf_map() - Check if access to a bpf map is allowed
 * @map: bpf map
 * @fmode: mode
 *
 * Check made when the kernel generates and returns a file descriptor for eBPF
 * maps.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_bpf_map(struct bpf_map *map, fmode_t fmode)
{
        int rc;

        rc = call_int_hook(bpf_map, map, fmode);
        return rc;
}

/**
 * security_bpf_prog() - Check if access to a bpf program is allowed
 * @prog: bpf program
 *
 * Check made when the kernel generates and returns a file descriptor for eBPF
 * programs.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_bpf_prog(struct bpf_prog *prog)
{
        int rc;

        rc = call_int_hook(bpf_prog, prog);
        return rc;
}

/**
 * security_bpf_map_create() - Check if BPF map creation is allowed
 * @map: BPF map object
 * @attr: BPF syscall attributes used to create BPF map
 * @token: BPF token used to grant user access
 *
 * Check made when the kernel creates a new BPF map. This is also the point
 * where the LSM blob is allocated for LSMs that need them.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
                            struct bpf_token *token)
{
        int rc;

        rc = call_int_hook(bpf_map_create, map, attr, token);
        return rc;
}

/**
 * security_bpf_prog_load() - Check if loading of BPF program is allowed
 * @prog: BPF program object
 * @attr: BPF syscall attributes used to create BPF program
 * @token: BPF token used to grant user access to BPF subsystem
 *
 * Access control check performed when the kernel loads a BPF program and
 * allocates the associated BPF program object. This hook is also responsible
 * for allocating any required LSM state for the BPF program.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
                           struct bpf_token *token)
{
        int rc;

        rc = call_int_hook(bpf_prog_load, prog, attr, token);
        return rc;
}

/**
 * security_bpf_token_create() - Check if creating of BPF token is allowed
 * @token: BPF token object
 * @attr: BPF syscall attributes used to create BPF token
 * @path: path pointing to BPF FS mount point from which BPF token is created
 *
 * Check made when the kernel instantiates a new BPF token object from a BPF FS
 * instance. This is also the point where an LSM blob can be allocated for
 * LSMs.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
                              struct path *path)
{
        int rc;

        rc = call_int_hook(bpf_token_create, token, attr, path);
        return rc;
}

/**
 * security_bpf_token_cmd() - Check if BPF token is allowed to delegate
 * requested BPF syscall command
 * @token: BPF token object
 * @cmd: BPF syscall command requested to be delegated by BPF token
 *
 * Check made when the kernel decides whether the provided BPF token should
 * allow delegation of the requested BPF syscall command.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
{
        int rc;

        rc = call_int_hook(bpf_token_cmd, token, cmd);
        return rc;
}

/**
 * security_bpf_token_capable() - Check if BPF token is allowed to delegate
 * requested BPF-related capability
 * @token: BPF token object
 * @cap: capabilities requested to be delegated by BPF token
 *
 * Check made when the kernel decides whether the provided BPF token should
 * allow delegation of the requested BPF-related capabilities.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_token_capable(const struct bpf_token *token, int cap)
{
        int rc;

        rc = call_int_hook(bpf_token_capable, token, cap);
        return rc;
}

/**
 * security_bpf_map_free() - Free a bpf map's LSM blob
 * @map: bpf map
 *
 * Clean up the security information stored inside bpf map. @map is passed
 * unchanged to the bpf_map_free hook and nothing is returned.
 */
void security_bpf_map_free(struct bpf_map *map)
{
        call_void_hook(bpf_map_free, map);
}

/**
 * security_bpf_prog_free() - Free a BPF program's LSM blob
 * @prog: BPF program struct
 *
 * Clean up the security information stored inside BPF program. @prog is
 * passed unchanged to the bpf_prog_free hook and nothing is returned.
 */
void security_bpf_prog_free(struct bpf_prog *prog)
{
        call_void_hook(bpf_prog_free, prog);
}

/**
 * security_bpf_token_free() - Free a BPF token's LSM blob
 * @token: BPF token struct
 *
 * Clean up the security information stored inside BPF token. @token is passed
 * unchanged to the bpf_token_free hook and nothing is returned.
 */
void security_bpf_token_free(struct bpf_token *token)
{
        call_void_hook(bpf_token_free, token);
}
#endif /* CONFIG_BPF_SYSCALL */

/**
 * security_locked_down() - Check if a kernel feature is allowed
 * @what: requested kernel feature
 *
 * Determine whether a kernel feature that potentially enables arbitrary code
 * execution in kernel space should be permitted.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_locked_down(enum lockdown_reason what)
{
        int rc;

        rc = call_int_hook(locked_down, what);
        return rc;
}
EXPORT_SYMBOL(security_locked_down);

#ifdef CONFIG_PERF_EVENTS
/**
 * security_perf_event_open() - Check if a perf event open is allowed
 * @attr: perf event attribute
 * @type: type of event
 *
 * Check whether the @type of perf_event_open syscall is allowed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_perf_event_open(struct perf_event_attr *attr, int type)
{
        int rc;

        rc = call_int_hook(perf_event_open, attr, type);
        return rc;
}

/**
 * security_perf_event_alloc() - Allocate a perf event LSM blob
 * @event: perf event
 *
 * Allocate and save perf_event security info for @event.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_perf_event_alloc(struct perf_event *event)
{
        int rc;

        rc = call_int_hook(perf_event_alloc, event);
        return rc;
}

/**
 * security_perf_event_free() - Free a perf event LSM blob
 * @event: perf event
 *
 * Release (free) perf_event security info. @event is passed unchanged to the
 * perf_event_free hook and nothing is returned.
 */
void security_perf_event_free(struct perf_event *event)
{
        call_void_hook(perf_event_free, event);
}

/**
 * security_perf_event_read() - Check if reading a perf event label is allowed
 * @event: perf event
 *
 * Read perf_event security info if allowed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_perf_event_read(struct perf_event *event)
{
        int rc;

        rc = call_int_hook(perf_event_read, event);
        return rc;
}

/**
 * security_perf_event_write() - Check if writing a perf event label is allowed
 * @event: perf event
 *
 * Write perf_event security info if allowed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_perf_event_write(struct perf_event *event)
{
        int rc;

        rc = call_int_hook(perf_event_write, event);
        return rc;
}
#endif /* CONFIG_PERF_EVENTS */

#ifdef CONFIG_IO_URING
/**
 * security_uring_override_creds() - Check if overriding creds is allowed
 * @new: new credentials
 *
 * Check if the current task, executing an io_uring operation, is allowed to
 * override its credentials with @new.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_override_creds(const struct cred *new)
{
        int rc;

        rc = call_int_hook(uring_override_creds, new);
        return rc;
}

/**
 * security_uring_sqpoll() - Check if IORING_SETUP_SQPOLL is allowed
 *
 * Check whether the current task is allowed to spawn an io_uring polling
 * thread (IORING_SETUP_SQPOLL).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_sqpoll(void)
{
        int rc;

        rc = call_int_hook(uring_sqpoll);
        return rc;
}

/**
 * security_uring_cmd() - Check if a io_uring passthrough command is allowed
 * @ioucmd: command
 *
 * Check whether the file_operations uring_cmd is allowed to run.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_cmd(struct io_uring_cmd *ioucmd)
{
        int rc;

        rc = call_int_hook(uring_cmd, ioucmd);
        return rc;
}
#endif /* CONFIG_IO_URING */


r0 = socket$inet6_sctp(0xa, 0x1, 0x84)
setsockopt$inet_sctp6_SCTP_SOCKOPT_BINDX_ADD(r0, 0x84, 0x64, &(0x7f0000000080)=[@in={0x2, 0xfffc, @dev}], 0x10)
setsockopt$inet_sctp6_SCTP_SOCKOPT_BINDX_ADD(r0, 0x84, 0x64, &(0x7f0000000040)=[@in6={0xa, 0x0, 0x0, @loopback}], 0x1c)
r1 = epoll_create1(0x0)
getsockopt$inet_sctp6_SCTP_SOCKOPT_CONNECTX3(r0, 0x84, 0x6f, &(0x7f0000000240)={0x0, 0x10, &(0x7f0000000180)=[@in={0x2, 0x0, @local}]}, &(0x7f0000000280)=0x10)
epoll_ctl$EPOLL_CTL_ADD(r1, 0x1, r0, &(0x7f00000001c0))
sendmmsg$inet6(r0, &(0x7f000000cf00)=[{{&(0x7f00000084c0)={0xa, 0xfffc, 0x0, @loopback}, 0x1c, &(0x7f0000008900)=[{&(0x7f0000008500)="88", 0x1}], 0x1}}], 0x1, 0x0)
epoll_wait(r1, &(0x7f0000000200)=[{}], 0x1, 0x0)


r0 = openat$ppp(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
socket$nl_netfilter(0x10, 0x3, 0xc)
ioctl$PPPIOCNEWUNIT(r0, 0xc004743e, &(0x7f00000000c0))
r1 = openat$ppp(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$PPPIOCNEWUNIT(r1, 0xc004743e, &(0x7f0000000080))
ioctl$PPPIOCSMAXCID(r1, 0x40047451, &(0x7f0000000200))
r2 = socket$nl_route(0x10, 0x3, 0x0)
socket$nl_generic(0x10, 0x3, 0x10)
ioctl$ifreq_SIOCGIFINDEX_batadv_mesh(0xffffffffffffffff, 0x8933, &(0x7f0000000080))
socket$l2tp6(0xa, 0x2, 0x73)
ioctl$ifreq_SIOCGIFINDEX_team(0xffffffffffffffff, 0x8933, &(0x7f0000001b00))
sendmsg$nl_route(r2, 0x0, 0x0)
ioctl$PPPIOCSMAXCID(r1, 0x40047451, 0x0)
preadv(r0, &(0x7f0000002600)=[{0x0}, {&(0x7f00000001c0)=""/130, 0x82}, {0x0}, {&(0x7f0000000300)=""/4096, 0x1000}, {&(0x7f0000001300)=""/162, 0xa2}, {&(0x7f00000013c0)=""/142, 0x8e}, {&(0x7f0000002480)=""/12, 0xc}, {&(0x7f00000024c0)=""/57, 0x39}], 0x8, 0x58, 0x3)
ioctl$PPPIOCSPASS(r0, 0x40107447, &(0x7f0000000080)={0x2, &(0x7f0000000000)=[{0x61}, {0x6}]})


r0 = socket$inet_smc(0x2b, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000140)={0x2, 0x4e22, @multicast2}, 0x10)
setsockopt$inet_tcp_TCP_CONGESTION(r0, 0x6, 0xd, &(0x7f0000000100)='lp\x00', 0x3)
tee(0xffffffffffffffff, 0xffffffffffffffff, 0x0, 0x0)
connect$inet(r0, &(0x7f0000000000)={0x2, 0x4e22, @local}, 0x10)
sendto$inet(r0, &(0x7f0000000040)='u', 0xa792a, 0x801, 0x0, 0x0)
recvfrom$inet(r0, &(0x7f0000000080)=""/128, 0xfffffce3, 0x0, 0x0, 0x0)


r0 = socket$inet_smc(0x2b, 0x1, 0x0)
setsockopt$inet_tcp_TCP_CONGESTION(r0, 0x6, 0xd, &(0x7f0000000240)='highspeed\x00', 0xa)
bind$inet(r0, &(0x7f0000000140)={0x2, 0x4e22, @multicast2}, 0x10)
connect$inet(r0, &(0x7f0000000000)={0x2, 0x4e22, @local}, 0x10)
sendto$inet(r0, &(0x7f0000000040)='u', 0xa792a, 0x801, 0x0, 0x0)
recvfrom$inet(r0, &(0x7f0000000080)=""/128, 0xfffffce3, 0x0, 0x0, 0x0)
writev(r0, &(0x7f0000000400)=[{&(0x7f0000000200)="67a818beb2c030ce59945b", 0xb}, {&(0x7f0000000300), 0x400000}, {0x0}], 0x3)


socket$inet_icmp_raw(0x2, 0x3, 0x1)
bpf$BPF_RAW_TRACEPOINT_OPEN(0x11, 0x0, 0x0)
setsockopt$inet_mtu(0xffffffffffffffff, 0x0, 0xa, &(0x7f00000000c0)=0x4, 0x4)
getuid()
socketpair$tipc(0x1e, 0x5, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
setsockopt$TIPC_GROUP_JOIN(r0, 0x10f, 0x87, &(0x7f0000001080)={0x43}, 0x10)
sendmmsg$inet(r0, &(0x7f0000001540)=[{{0x0, 0x0, 0x0}}], 0x1, 0x0)
close(r1)


r0 = bpf$BPF_PROG_RAW_TRACEPOINT_LOAD(0x5, &(0x7f00000013c0)={0x18, 0x3, &(0x7f0000000080)=@framed, &(0x7f0000000000)='syzkaller\x00'}, 0x90)
bpf$BPF_RAW_TRACEPOINT_OPEN(0x11, &(0x7f0000000040)={&(0x7f0000000400)='kfree\x00', r0}, 0x10)
getsockopt$inet6_tcp_TCP_ZEROCOPY_RECEIVE(0xffffffffffffffff, 0x6, 0x23, 0x0, 0x0)
sendmsg$IPCTNL_MSG_CT_DELETE(0xffffffffffffffff, 0x0, 0x0)
r1 = socket$nl_generic(0x10, 0x3, 0x10)
r2 = syz_genetlink_get_family_id$nl80211(&(0x7f0000000180), 0xffffffffffffffff)
ioctl$sock_SIOCGIFINDEX_80211(r1, 0x8933, &(0x7f0000000440)={'wlan0\x00', <r3=>0x0})
sendmsg$NL80211_CMD_CHANNEL_SWITCH(r1, &(0x7f0000000540)={0x0, 0x0, &(0x7f0000000500)={&(0x7f00000001c0)={0x2c, r2, 0x1, 0x0, 0x0, {{}, {@val={0x8, 0x3, r3}, @void}}, [@chandef_params=[@NL80211_ATTR_WIPHY_FREQ={0x8, 0x26, @random=0x96c}], @NL80211_ATTR_CH_SWITCH_COUNT={0x8}]}, 0x2c}}, 0x0)


r0 = epoll_create(0x80)
r1 = socket(0x23, 0x5, 0x0)
listen(r1, 0x0)
epoll_ctl$EPOLL_CTL_ADD(r0, 0x1, r1, &(0x7f00000000c0))


socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000080)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
recvfrom$unix(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
getpid()
getpid()
sendmmsg$unix(r0, &(0x7f0000002180)=[{{0x0, 0x0, 0x0, 0x0, &(0x7f0000001640)=[@rights={{0x14, 0x1, 0x1, [r0]}}], 0x18}}], 0x1, 0x0)
socket$igmp(0x2, 0x3, 0x2)
sendmsg$unix(r1, &(0x7f00000002c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000200)=[@rights={{0x14, 0x1, 0x1, [r0]}}], 0x18}, 0x0)


r0 = socket$inet_sctp(0x2, 0x5, 0x84)
syz_emit_ethernet(0x3e, &(0x7f0000000000)={@broadcast, @empty, @void, {@ipv4={0x800, @icmp={{0x5, 0x4, 0x0, 0x0, 0x30, 0x0, 0x0, 0x0, 0x1, 0x0, @remote, @dev}, @time_exceeded={0xb, 0x1, 0x0, 0x0, 0x0, 0x0, {0x5, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @multicast2, @private}}}}}}, 0x0)
r1 = socket$inet_smc(0x2b, 0x1, 0x0)
ioctl$int_in(r1, 0x5421, 0x0)
read(r1, 0x0, 0x0)
socket$nl_route(0x10, 0x3, 0x0)
r2 = openat$cgroup_ro(0xffffffffffffffff, &(0x7f00000003c0)='cpuacct.usage_percpu_sys\x00', 0x0, 0x0)
r3 = socket$nl_generic(0x10, 0x3, 0x10)
socket$inet6_sctp(0xa, 0x0, 0x84)
ioctl$sock_SIOCSIFVLAN_ADD_VLAN_CMD(0xffffffffffffffff, 0x8983, &(0x7f0000000440)={0x0, 'syz_tun\x00'})
syz_emit_ethernet(0x4a, &(0x7f0000000180)={@local, @broadcast, @val={@void}, {@mpls_uc={0x8847, {[], @ipv6=@dccp_packet={0x0, 0x6, "4a69a0", 0x10, 0x21, 0x0, @ipv4={'\x00', '\xff\xff', @remote}, @ipv4={'\x00', '\xff\xff', @multicast1}, {[], {{0x0, 0x0, 0x4, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, "67877b", 0x0, "9927f7"}}}}}}}}, 0x0)
syz_genetlink_get_family_id$devlink(&(0x7f00000000c0), 0xffffffffffffffff)
sendmsg$DEVLINK_CMD_TRAP_SET(r3, &(0x7f0000000200)={0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x200040c0}, 0x80)
ioctl$sock_ipv6_tunnel_SIOCCHGPRL(r2, 0x89f7, &(0x7f0000000480)={'sit0\x00', 0x0})
sendmsg$NFT_BATCH(0xffffffffffffffff, 0x0, 0x0)
socket$nl_route(0x10, 0x3, 0x0)
sendmsg$NFT_MSG_GETRULE(0xffffffffffffffff, 0x0, 0x0)
syz_emit_ethernet(0x0, 0x0, 0x0)
ioctl$TUNGETDEVNETNS(0xffffffffffffffff, 0x54e3, 0x0)
ioctl$TUNGETDEVNETNS(0xffffffffffffffff, 0x54e3, 0x0)
setsockopt$EBT_SO_SET_ENTRIES(r0, 0x0, 0x80, &(0x7f00000002c0)=@filter={'filter\x00', 0xe, 0x0, 0xc0, [0x0, 0x20000100, 0x20000130, 0x20000160], 0x8, 0x0, &(0x7f0000000100)=[{0x0, '\x00', 0x0, 0xffffffffffffffff}, {0x0, '\x00', 0x0, 0xfffffffffffffffe}, {0x0, '\x00', 0x0, 0xfffffffffffffffe}, {0x0, '\x00', 0x0, 0xfffffffffffffffe}]}, 0x138)


r0 = epoll_create1(0x0)
r1 = socket$inet6_mptcp(0xa, 0x1, 0x106)
epoll_ctl$EPOLL_CTL_ADD(r0, 0x1, r1, &(0x7f0000000040))
sendto$inet6(r1, 0x0, 0x0, 0x24000000, &(0x7f0000000080)={0xa, 0x0, 0x0, @dev, 0x8}, 0x1c)
epoll_pwait(r0, &(0x7f00000000c0)=[{}], 0x1, 0x0, 0x0, 0x0)


r0 = socket(0x15, 0x5, 0x0)
bind$l2tp6(r0, &(0x7f00000000c0)={0xa, 0x0, 0x0, @loopback}, 0x20)
getsockname$inet6(r0, 0x0, &(0x7f0000000280))


r0 = socket$can_bcm(0x1d, 0x2, 0x2)
connect$can_bcm(r0, &(0x7f00000000c0), 0x10)
sendmsg$can_bcm(r0, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000180)={&(0x7f0000000000)={0x5, 0xc53, 0x0, {}, {0x0, 0x2710}, {0x0, 0x0, 0x1}, 0x1, @canfd={{}, 0x0, 0x0, 0x0, 0x0, "1a7428bd6f08b1a747153d5ebf8085f192e97cba3d28667174ac4c513038850a04bf2e4c5f2eeb71cd4acee0253d67c3f4a891bbba05fbeab98cfe6dc147e582"}}, 0x80}}, 0x0)


unshare(0x20000400)
r0 = openat$tun(0xffffffffffffff9c, &(0x7f0000000240), 0x0, 0x0)
sendmsg$NL802154_CMD_NEW_SEC_KEY(0xffffffffffffffff, 0x0, 0x0)
ioctl$TUNSETIFF(r0, 0x400454ca, &(0x7f0000000180)={'syzkaller1\x00', 0x2})
poll(&(0x7f0000000040)=[{r0}], 0x1, 0x0)


r0 = openat$tun(0xffffffffffffff9c, &(0x7f0000000280), 0x80002, 0x0)
pwritev(r0, 0x0, 0x0, 0x0, 0x0)


r0 = socket$inet6_mptcp(0xa, 0x1, 0x106)
bind$inet6(r0, &(0x7f0000000040)={0xa, 0x4e22, 0x0, @loopback}, 0x1c)
sendto$inet6(r0, 0x0, 0x0, 0x20048004, &(0x7f0000000140)={0xa, 0x4e22, 0x0, @loopback}, 0x1c)
setsockopt$inet6_int(r0, 0x29, 0x17, &(0x7f0000001880), 0x4)


r0 = socket$pptp(0x18, 0x1, 0x2)
setsockopt$SO_ATTACH_FILTER(r0, 0x1, 0x4b, &(0x7f0000000100)={0x0, 0x0}, 0x10)
read(r0, &(0x7f0000000040)=""/126, 0x7e)
mkdirat$cgroup_root(0xffffffffffffff9c, &(0x7f0000000000)='./cgroup/syz0\x00', 0x1ff)
r1 = socket$inet_smc(0x2b, 0x1, 0x0)
setsockopt$IP_VS_SO_SET_ADD(r1, 0x0, 0x3, 0x0, 0x0)
r2 = socket$phonet_pipe(0x23, 0x5, 0x2)
write$binfmt_aout(r2, 0x0, 0xfffffe5c)
ioctl$SIOCPNENABLEPIPE(r2, 0x89ed, 0x0)


r0 = socket$alg(0x26, 0x5, 0x0)
bind$alg(r0, &(0x7f00000000c0)={0x26, 'aead\x00', 0x0, 0x0, 'aegis128-generic\x00'}, 0x58)
r1 = accept(r0, 0x0, 0x0)
sendmsg$IPCTNL_MSG_CT_DELETE(r1, &(0x7f00000000c0)={0x0, 0x0, &(0x7f0000000080)={0x0}}, 0x0)


r0 = socket$inet(0x2, 0xa, 0x0)
ioctl$sock_inet_SIOCSARP(r0, 0x8953, &(0x7f0000000140)={{0x2, 0x0, @multicast2}, {0x0, @random="f0fe287eb3b4"}, 0xb8, {0x2, 0x0, @broadcast}})
r1 = socket$inet_smc(0x2b, 0x1, 0x0)
r2 = socket$inet_smc(0x2b, 0x1, 0x0)
bind$inet(r2, &(0x7f0000000140)={0x2, 0x4e22, @multicast2}, 0x10)
r3 = bpf$BPF_PROG_RAW_TRACEPOINT_LOAD(0x5, &(0x7f00000013c0)={0x18, 0x3, &(0x7f00000000c0)=@framed, &(0x7f0000000000)='syzkaller\x00'}, 0x90)
bpf$BPF_RAW_TRACEPOINT_OPEN(0x11, &(0x7f0000000400)={&(0x7f00000003c0)='kmem_cache_free\x00', r3}, 0x10)
connect$inet(r2, &(0x7f0000000100)={0x2, 0x4e22, @local}, 0x10)
shutdown(r2, 0x1)
bind$inet(r1, &(0x7f0000000140)={0x2, 0x4e22, @multicast2}, 0x10)
connect$inet(r1, &(0x7f0000000000)={0x2, 0x4e22, @local}, 0x10)


r0 = socket$inet6_sctp(0xa, 0x5, 0x84)
setsockopt$inet_sctp6_SCTP_SOCKOPT_BINDX_ADD(r0, 0x84, 0x64, &(0x7f0000000000)=[@in6={0xa, 0x4e20, 0x0, @loopback}], 0x1c)
r1 = socket$nl_netfilter(0x10, 0x3, 0xc)
sendmsg$IPCTNL_MSG_TIMEOUT_NEW(r1, &(0x7f0000000040)={0x0, 0x0, &(0x7f0000000100)={&(0x7f0000000080)={0x30, 0x3, 0x8, 0x101, 0x0, 0x0, {}, [@CTA_TIMEOUT_DATA={0xc, 0x4, 0x0, 0x1, @sctp=[@CTA_TIMEOUT_SCTP_ESTABLISHED={0x8}]}, @CTA_TIMEOUT_L4PROTO={0x5, 0x3, 0x84}, @CTA_TIMEOUT_L3PROTO={0x6}]}, 0x30}}, 0x0)
getsockopt$inet_sctp6_SCTP_SOCKOPT_CONNECTX3(r0, 0x84, 0x6f, &(0x7f0000000280)={0x0, 0x1c, &(0x7f0000000240)=[@in6={0xa, 0x4e20, 0x0, @loopback}]}, &(0x7f00000002c0)=0x10)
close(r0)


unshare(0x22000600)
r0 = socket$inet6_udplite(0xa, 0x2, 0x88)
ioctl$sock_ipv6_tunnel_SIOCGETTUNNEL(r0, 0x89f0, &(0x7f0000000580)={'ip6gre0\x00', 0x0})


syz_genetlink_get_family_id$nl80211(0x0, 0xffffffffffffffff)
r0 = socket$nl_generic(0x10, 0x3, 0x10)
syz_genetlink_get_family_id$nl80211(0xffffffffffffffff, r0)


bpf$BPF_PROG_RAW_TRACEPOINT_LOAD(0x5, &(0x7f00000013c0)={0x0, 0x0, 0x0, 0x0}, 0x90)
r0 = syz_init_net_socket$nfc_llcp(0x27, 0x2, 0x1)
recvmmsg(r0, &(0x7f0000000500)=[{{&(0x7f0000000040)=@pppol2tpin6={0x18, 0x1, {0x0, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, {0xa, 0x0, 0x0, @local}}}, 0xbf, 0x0}}], 0x73d, 0x0, 0x0)
clock_gettime(0x0, &(0x7f0000000480))
ppoll(&(0x7f0000000440)=[{r0}, {r0}], 0x2, 0x0, 0x0, 0x0)


r0 = bpf$BPF_PROG_RAW_TRACEPOINT_LOAD(0x5, &(0x7f00000013c0)={0x18, 0x3, &(0x7f0000000080)=@framed, &(0x7f0000000000)='syzkaller\x00'}, 0x90)
bpf$BPF_RAW_TRACEPOINT_OPEN(0x11, &(0x7f0000000040)={&(0x7f0000000400)='kfree\x00', r0}, 0x10)
r1 = syz_init_net_socket$rose(0xb, 0x5, 0x0)
setsockopt$rose(r1, 0x104, 0x7, &(0x7f0000000140), 0x4)


r0 = socket$nl_generic(0x10, 0x3, 0x10)
r1 = syz_genetlink_get_family_id$nl80211(&(0x7f0000000200), 0xffffffffffffffff)
ioctl$sock_SIOCGIFINDEX_80211(r0, 0x8933, &(0x7f0000000700)={'wlan1\x00', <r2=>0x0})
sendmsg$NL80211_CMD_SET_INTERFACE(r0, &(0x7f0000000340)={0x0, 0x0, &(0x7f0000000300)={&(0x7f0000000240)={0x24, r1, 0x5, 0x0, 0x0, {{}, {@val={0x8, 0x3, r2}, @void}}, [@NL80211_ATTR_IFTYPE={0x8, 0x5, 0x2}]}, 0x24}}, 0x0)
sendmsg$NL80211_CMD_TRIGGER_SCAN(r0, &(0x7f0000000340)={0x0, 0x0, &(0x7f0000000300)={&(0x7f0000000240)={0x40, r1, 0x5, 0x0, 0x0, {{}, {@val={0x8, 0x3, r2}, @void}}, [@NL80211_ATTR_IE={0x11, 0x2a, [@random={0xdd, 0xb, 'abcdefghijk'}]}, @NL80211_ATTR_SCAN_SSIDS={0x10, 0x2d, 0x0, 0x1, [{0xa, 0x0, @default_ap_ssid}]}]}, 0x40}}, 0x0)
syz_80211_inject_frame(&(0x7f0000000300)=@device_b, &(0x7f0000000040)=@mgmt_frame=@beacon={{{}, {}, @device_b, @device_a, @from_mac}, 0x0, @default, 0x1, @val={0x0, 0x6, @default_ap_ssid}, @val={0x1, 0x8, [{0x2, 0x1}, {0x4, 0x1}, {0xb, 0x1}, {0x16, 0x1}, {0xc}, {0x12}, {0x18}, {0x24}]}, @void, @void, @void, @void, @void, @void, @void, @void, @void, @void, @void}, 0x36)
nanosleep(&(0x7f0000000080)={0x0, 0x4c4b40}, &(0x7f00000000c0))
syz_80211_inject_frame(&(0x7f0000000300)=@device_b, &(0x7f0000000340)=@mgmt_frame=@probe_response={{{}, {}, @device_b, @device_a, @from_mac}, 0x0, @default, 0x1, @val={0x0, 0x6, @default_ap_ssid}, @val={0x1, 0x8, [{0x2, 0x1}, {0x4, 0x1}, {0xb, 0x1}, {0x16, 0x1}, {0xc}, {0x12}, {0x18}, {0x24}]}, @void, @void, @void, @void, @void, @void}, 0x36)


unshare(0x20000400)
r0 = socket(0x18, 0x0, 0x0)
getpeername$l2tp(r0, 0x0, &(0x7f0000000080))


mmap(&(0x7f0000000000/0xa000)=nil, 0xa000, 0x0, 0x32, 0xffffffffffffffff, 0x0)
r0 = socket$igmp(0x2, 0x3, 0x2)
setsockopt$MRT_FLUSH(r0, 0x0, 0xd0, &(0x7f0000000240), 0x4)


socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000000))
select(0x40, &(0x7f0000000380), &(0x7f00000003c0)={0x8}, 0x0, 0x0)


r0 = socket$packet(0x11, 0x3, 0x300)
vmsplice(r0, 0x0, 0x0, 0x0)


splice(0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0, 0x0, 0x0)


getsockname(0xffffffffffffffff, &(0x7f0000000080)=@caif=@dgm, &(0x7f0000000100)=0x80)
r0 = syz_init_net_socket$llc(0x1a, 0x1, 0x0)
setsockopt$llc_int(r0, 0x10c, 0x8, &(0x7f0000000040)=0x1, 0x4)
sendmmsg$sock(r0, &(0x7f0000002ac0)=[{{&(0x7f0000002940)=@vsock={0x28, 0x0, 0x0, @hyper}, 0x80, 0x0}}], 0x1, 0x0)
r1 = socket$nl_generic(0x10, 0x3, 0x10)
r2 = getpid()
sendmsg$netlink(0xffffffffffffffff, &(0x7f0000000540)={&(0x7f0000000340)=@kern={0x10, 0x0, 0x0, 0x2}, 0xc, &(0x7f0000000500)=[{&(0x7f0000000700)={0x1c4, 0x39, 0x400, 0x70bd2a, 0x25dfdbfd, "", [@typed={0x8, 0x8a, 0x0, 0x0, @uid}, @typed={0x9, 0x78, 0x0, 0x0, @str='sit0\x00'}, @generic="665c95e094cac44f04c24ba52628453c881ceedf779a41d73498b3a6c55d9c4a76a99facc4da199078ff61c0a9bb0e9ca9e189bca75c12d787c6f316b8349c5ca85ae8351197f7272f343fdeeda74c782b95010435c86f25819979d4f561fea3448b102a8b3726dfe51f0b98ad840bb25d8d4f8993bd3ce884d341cf72b06e789dae328d75e551e848be94dd913010d01b", @typed={0x4, 0xad}, @generic="39a0a3fcf3c1f442a7f514523ecbb5c211b4c397ec995ad7bdc0cb2a50d1c66d91cd9bed3bbe79e5046ae54dbe635e5c5d2d5e7daa893ea99b90dcf4ef5f122e9b74d945a0878b619ba332a6a2977948d2371d5c8aae4fa5a2c2f28f0f0955a73c1d2f16f7853faa39b5212de1b61e030c60f62de4f3b34e8af5f80f1abbaac6cff4b9a1b7732073c48cb4355223c3ec4c08d047485d7dd9a58c611b828dfdbd37a7b46916efe4c4de2160d6a713547b6de6914be9c332d5b6662cbf37b79ae6a94d7c1a2f1c99e962a154f0417b9720bf49f563ed", @typed={0x8, 0x110, 0x0, 0x0, @pid=r2}, @nested={0x2c, 0x6c, 0x0, 0x1, [@generic="32b087f856d0f20ec6fc557c567cba0bfdf8c79516c0df909cdcbfeafc295f7b", @typed={0x8, 0x50, 0x0, 0x0, @u32=0xff}]}]}, 0x1c4}], 0x1, 0x0, 0x0, 0x40810}, 0x4000001)
syz_genetlink_get_family_id$ethtool(&(0x7f0000000640), r1)
r3 = syz_genetlink_get_family_id$nl80211(&(0x7f0000000080), 0xffffffffffffffff)
ioctl$sock_SIOCGIFINDEX_80211(r1, 0x8933, &(0x7f00000000c0)={'wlan1\x00', <r4=>0x0})
sendmsg$NL80211_CMD_SET_INTERFACE(r1, &(0x7f0000000100)={0x0, 0x0, &(0x7f0000000140)={&(0x7f00000003c0)={0x38, r3, 0x5, 0x0, 0x0, {{}, {@val={0x8, 0x3, r4}, @void}}, [@mon_options=[@NL80211_ATTR_MNTR_FLAGS={0x1c, 0x17, 0x0, 0x1, [@NL80211_MNTR_FLAG_PLCPFAIL={0x4}, @NL80211_MNTR_FLAG_PLCPFAIL={0x4}, @NL80211_MNTR_FLAG_FCSFAIL={0x4}, @NL80211_MNTR_FLAG_OTHER_BSS={0x4}, @NL80211_MNTR_FLAG_FCSFAIL={0x4}, @NL80211_MNTR_FLAG_PLCPFAIL={0x4}]}]]}, 0x38}}, 0x0)
sendmsg$NL80211_CMD_CONNECT(r1, &(0x7f00000001c0)={0x0, 0x0, &(0x7f0000000200)={&(0x7f0000000600)=ANY=[@ANYBLOB='4\x00\x00\x00', @ANYRES16=r3, @ANYBLOB="0500e3ffffffffffffff2e00000008000300", @ANYRES32=r4, @ANYBLOB="0a00340002020202020200000c00e3800800020038ab932f97dc64b0491c00001654"], 0x34}}, 0x0)
syz_genetlink_get_family_id$ethtool(&(0x7f0000000180), 0xffffffffffffffff)
ioctl$sock_ipv4_tunnel_SIOCDELTUNNEL(0xffffffffffffffff, 0x89f2, &(0x7f0000000280)={'sit0\x00', &(0x7f00000001c0)={'syztnl2\x00', 0x0, 0x7, 0x20, 0x7, 0x8, {{0x20, 0x4, 0x3, 0x8, 0x80, 0x68, 0x0, 0x7, 0x2f, 0x0, @private=0xa010100, @empty, {[@end, @ra={0x94, 0x4}, @generic={0x94, 0xd, "c54ef61142fffcc38a58a5"}, @timestamp_prespec={0x44, 0x2c, 0x28, 0x3, 0x6, [{@private=0xa010101, 0x4}, {@multicast2, 0x9}, {@broadcast, 0x6}, {@multicast1, 0x1}, {@loopback, 0x800}]}, @generic={0x82, 0x10, "6361d4eeda757b0b0675ca627762"}, @rr={0x7, 0x1b, 0xe0, [@broadcast, @rand_addr=0x64010101, @remote, @multicast1, @initdev={0xac, 0x1e, 0x1, 0x0}, @dev={0xac, 0x14, 0x14, 0x27}]}]}}}}})
getsockopt$inet_mreqn(0xffffffffffffffff, 0x0, 0x23, &(0x7f00000002c0)={@empty, @empty}, &(0x7f0000000300)=0xc)
r5 = socket$igmp(0x2, 0x3, 0x2)
setsockopt$MRT_ADD_VIF(r5, 0x0, 0xca, &(0x7f00000002c0)={0x0, 0x4, 0x0, 0x0, @vifc_lcl_ifindex, @initdev={0xac, 0x1e, 0x0, 0x0}}, 0x10)
socket$packet(0x11, 0x2, 0x300)
ioctl$ifreq_SIOCGIFINDEX_batadv_hard(r5, 0x8933, &(0x7f0000000000)={'batadv_slave_0\x00'})
getgid()
gettid()
getsockopt$inet_IP_IPSEC_POLICY(0xffffffffffffffff, 0x0, 0x10, &(0x7f0000001f40)={{{@in6=@initdev, @in6=@mcast1}}, {{@in=@remote}, 0x0, @in=@dev}}, &(0x7f0000000380)=0xe8)
getsockopt$sock_cred(r5, 0x1, 0x11, &(0x7f0000002080), &(0x7f00000020c0)=0xc)
pipe(&(0x7f0000002100))
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000080)={0xffffffffffffffff, <r6=>0xffffffffffffffff})
r7 = socket$inet6_mptcp(0xa, 0x1, 0x106)
getsockopt$sock_cred(r7, 0x1, 0x11, &(0x7f0000003fc0)={0x0, <r8=>0x0}, &(0x7f0000005980)=0xc)
sendmmsg$unix(r6, &(0x7f0000000680)=[{{0x0, 0x0, 0x0}}, {{0x0, 0x0, 0x0, 0x0, &(0x7f0000000640)=[@cred={{0x1c, 0x1, 0x2, {0x0, r8}}}], 0x20}}], 0x2, 0x0)
connect$llc(r0, &(0x7f0000000000)={0x1a, 0x325, 0x3f, 0x4, 0x1a, 0x9, @broadcast}, 0x10)


r0 = openat$ppp(0xffffffffffffff9c, &(0x7f0000001380), 0x0, 0x0)
ioctl$PPPIOCNEWUNIT(r0, 0xc004743e, &(0x7f0000000000))
pread64(r0, 0x0, 0x0, 0x0)


r0 = socket$can_bcm(0x1d, 0x2, 0x2)
connect$can_bcm(r0, &(0x7f00000005c0), 0x10)
ioctl$SIOCGSTAMP(r0, 0x8906, 0x0)
sendmsg$can_bcm(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f00000002c0)={&(0x7f0000000240)=ANY=[@ANYBLOB="050000007f0000000000010000000000", @ANYRES64=0x0, @ANYRES64=0x2710], 0x48}}, 0x0)
recvmsg$can_bcm(r0, &(0x7f0000000200)={0x0, 0x0, 0x0}, 0x0)


r0 = socket$alg(0x26, 0x5, 0x0)
bind$alg(r0, &(0x7f00000003c0)={0x26, 'hash\x00', 0x0, 0x0, 'hmac(sha256)\x00'}, 0x58)
r1 = accept(r0, 0x0, 0x0)
setsockopt$ALG_SET_KEY(r0, 0x117, 0x1, 0x0, 0x0)
r2 = accept4$ax25(r1, 0x0, 0x0, 0x0)
accept4$ax25(r2, &(0x7f0000000440)={{0x3, @default}, [@bcast, @default, @rose, @rose, @netrom, @bcast, @netrom, @bcast]}, 0x0, 0x0)


r0 = socket$inet6_dccp(0xa, 0x6, 0x0)
shutdown(r0, 0x1)
unshare(0x20000400)
pselect6(0x40, &(0x7f0000000300), &(0x7f0000000000)={0x8}, 0x0, 0x0, 0x0)
connect$inet6(r0, &(0x7f0000000080)={0xa, 0x0, 0x0, @dev={0xfe, 0x80, '\x00', 0x13}, 0x9}, 0x1c)


r0 = socket$nl_route(0x10, 0x3, 0x0)
r1 = bpf$BPF_PROG_RAW_TRACEPOINT_LOAD(0x5, &(0x7f00000013c0)={0x18, 0x3, &(0x7f0000000080)=@framed, &(0x7f0000000000)='syzkaller\x00'}, 0x90)
bpf$BPF_RAW_TRACEPOINT_OPEN(0x11, &(0x7f00000001c0)={&(0x7f0000000180)='sys_enter\x00', r1}, 0x10)
sendmsg$nl_route_sched(r0, &(0x7f00000000c0)={0x0, 0x0, &(0x7f0000000080)={&(0x7f00000001c0)=@newtclass={0x24}, 0x24}}, 0x0)


socketpair(0x1, 0x0, 0xa9, &(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
setsockopt$inet_sctp6_SCTP_PEER_ADDR_PARAMS(r0, 0x84, 0x9, 0x0, 0x0)
setsockopt$inet_sctp_SCTP_DEFAULT_SEND_PARAM(0xffffffffffffffff, 0x84, 0xa, &(0x7f0000000200)={0x0, 0x0, 0x201, 0x0, 0x0, 0x70bc61fe, 0x800}, 0x20)
socket$pppl2tp(0x18, 0x1, 0x1)
syz_genetlink_get_family_id$mptcp(0x0, r0)
r1 = socket$nl_netfilter(0x10, 0x3, 0xc)
sendmsg$NFNL_MSG_COMPAT_GET(r1, 0x0, 0x0)
bind$inet(0xffffffffffffffff, 0x0, 0x0)
connect$inet(0xffffffffffffffff, &(0x7f0000000140)={0x2, 0x4e21, @empty}, 0x10)
recvfrom$inet(0xffffffffffffffff, &(0x7f0000000180)=""/208, 0xd0, 0x160, 0x0, 0x0)
socket$inet_udplite(0x2, 0x2, 0x88)
socket$nl_rdma(0x10, 0x3, 0x14)
r2 = socket$nl_route(0x10, 0x3, 0x0)
sendmsg$nl_route_sched(r2, &(0x7f0000000200)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000002a80)=@delchain={0x24, 0x65, 0x1}, 0x24}}, 0x0)
pipe(&(0x7f0000000100)={0xffffffffffffffff, <r3=>0xffffffffffffffff})
sendfile(r3, r2, 0x0, 0x7)


r0 = socket$pppoe(0x18, 0x1, 0x0)
connect$pppoe(r0, &(0x7f0000000080)={0x18, 0x0, {0xfffe, @local, 'geneve0\x00'}}, 0x1e)
r1 = openat$ppp(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$PPPIOCATTCHAN(r1, 0x40047438, &(0x7f0000000100)=0x1)
r2 = socket$nl_route(0x10, 0x3, 0x0)
ioctl$sock_inet_SIOCSIFFLAGS(r2, 0x8914, &(0x7f0000000280)={'geneve0\x00'})
readv(r1, &(0x7f0000000600)=[{&(0x7f0000000140)=""/95, 0x5f}], 0x1)


r0 = openat$ppp(0xffffffffffffff9c, &(0x7f0000000040), 0x103201, 0x0)
ioctl$PPPIOCNEWUNIT(r0, 0xc004743e, &(0x7f0000000580))
ioctl$PPPIOCSPASS(r0, 0x40107447, &(0x7f0000000080)={0x1, &(0x7f0000000000)=[{0x6, 0x0, 0x0, 0x2}]})
pwrite64(r0, &(0x7f0000000e80)="080f", 0x2, 0x0)


r0 = socket$nl_generic(0x10, 0x3, 0x10)
syz_genetlink_get_family_id$ethtool(0x0, 0xffffffffffffffff)
syz_genetlink_get_family_id$nl80211(0xfffffffffffffffc, r0)


r0 = socket$nl_xfrm(0x10, 0x3, 0x6)
sendmsg$nl_xfrm(r0, &(0x7f0000000180)={0x0, 0x0, &(0x7f00000001c0)={&(0x7f0000000400)=@updpolicy={0xc4, 0x19, 0x1, 0x0, 0x0, {{@in=@multicast2, @in, 0x0, 0x0, 0x0, 0x0, 0xa}, {}, {}, 0x0, 0x0, 0x0, 0x1}, [@policy_type={0xa, 0x10, {0x1}}]}, 0xc4}}, 0x0)
syz_emit_ethernet(0x6a, &(0x7f0000000200)={@local, @random, @val={@void}, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "0cbb05", 0x30, 0x2f, 0x0, @private1, @local, {[], @time_exceed={0x3, 0x0, 0x0, 0x0, '\x00', {0x0, 0x6, "444ad2", 0x0, 0x0, 0x0, @loopback, @private1}}}}}}}, 0x0)


r0 = socket$nl_generic(0x10, 0x3, 0x10)
syz_genetlink_get_family_id$smc(0x0, 0xffffffffffffffff)
sendmsg$NL80211_CMD_CONNECT(0xffffffffffffffff, 0x0, 0x0)
syz_genetlink_get_family_id$tipc2(0xfffffffffffffffe, r0)


pipe(&(0x7f00000008c0))
r0 = syz_init_net_socket$bt_l2cap(0x1f, 0x1, 0x3)
ioctl$sock_TIOCOUTQ(r0, 0x5411, &(0x7f0000000040))

csum_ipv6_magic100%of 1
udp6_csum_init---of 31
udp6_set_csum---of 9
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

aead_accept_parent40%of 5
aead_accept_parent_nokey50%of 4
aead_bind40%of 5
aead_check_key50%of 4
aead_recvmsg---of 53
aead_recvmsg_nokey---of 3
aead_release---of 1
aead_sendmsg---of 1
aead_sendmsg_nokey67%of 3
aead_setauthsize---of 1
aead_setkey---of 1
aead_sock_destruct---of 1
-----------
SUMMARY48%of 21

__ia32_sys_setns---of 1
__se_sys_setns11%of 125
__x64_sys_setns100%of 1
copy_namespaces---of 28
create_new_namespaces10%of 20
exec_task_namespaces---of 4
exit_task_namespaces---of 1
free_nsproxy35%of 26
get_time_ns50%of 4
put_cgroup_ns---of 5
put_net---of 4
switch_task_namespaces60%of 5
unshare_nsproxy_namespaces45%of 9
-----------
SUMMARY18%of 190

tcp_fastopen_active_detect_blackhole---of 5
tcp_fastopen_active_disable---of 3
tcp_fastopen_active_disable_ofo_check---of 32
tcp_fastopen_active_should_disable29%of 7
tcp_fastopen_add_skb---of 20
tcp_fastopen_cookie_check25%of 16
tcp_fastopen_ctx_destroy---of 3
tcp_fastopen_ctx_free---of 1
tcp_fastopen_defer_connect23%of 9
tcp_fastopen_destroy_cipher---of 3
tcp_fastopen_get_cipher---of 27
tcp_fastopen_get_ctx---of 12
tcp_fastopen_init_key_once---of 24
tcp_fastopen_reset_cipher---of 8
tcp_try_fastopen---of 87
-----------
SUMMARY25%of 32

ax25_uid_put---of 4
dev_put---of 4
memcpy_from_msg---of 3
netdev_tracker_alloc---of 3
rose2asc---of 7
rose_accept---of 12
rose_bind---of 31
rose_connect---of 39
rose_create40%of 5
rose_destroy_socket---of 18
rose_destroy_timer---of 1
rose_device_event---of 29
rose_find_socket---of 7
rose_getname---of 12
rose_getsockopt---of 14
rose_info_next---of 1
rose_info_show---of 20
rose_info_start---of 1
rose_info_stop---of 1
rose_insert_socket---of 6
rose_ioctl---of 32
rose_kill_by_neigh---of 6
rose_listen---of 4
rose_new_lci---of 23
rose_recvmsg---of 13
rose_release---of 18
rose_rx_call_request---of 53
rose_sendmsg---of 41
rose_setsockopt16%of 19
rosecmp---of 6
rosecmpm---of 9
sock_error---of 3
-----------
SUMMARY21%of 24

pep_ctrlreq_error---of 7
pep_do_rcv---of 30
pep_get_sb---of 16
pep_getsockopt---of 11
pep_indicate---of 5
pep_init---of 1
pep_ioctl16%of 13
pep_read---of 3
pep_recvmsg---of 22
pep_reply---of 7
pep_sendmsg8%of 27
pep_setsockopt---of 17
pep_sock_accept---of 36
pep_sock_close---of 17
pep_sock_connect---of 5
pep_sock_unhash---of 15
pep_write---of 6
pep_writeable---of 1
pipe_destruct---of 1
pipe_do_rcv---of 45
pipe_grant_credits---of 8
pipe_handler_do_rcv---of 64
pipe_handler_request40%of 5
pipe_rcv_created---of 12
pipe_rcv_status---of 22
pipe_skb_send---of 13
sk_add_node---of 6
sock_put---of 4
-----------
SUMMARY14%of 45

__lock_sock43%of 7
__lock_sock_fast---of 3
__receive_sock---of 3
__release_sock53%of 17
__sk_backlog_rcv---of 5
__sk_destruct---of 28
__sk_dst_check34%of 15
__sk_flush_backlog50%of 4
__sk_free---of 19
__sk_mem_raise_allocated11%of 58
__sk_mem_reclaim100%of 1
__sk_mem_reduce_allocated23%of 18
__sk_mem_schedule67%of 3
__sk_receive_skb---of 27
__sock_cmsg_send---of 17
__sock_i_ino---of 3
__sock_queue_rcv_skb---of 29
__sock_set_mark---of 3
__sock_set_timestamps---of 9
__sock_wfree50%of 4
copy_from_sockptr---of 4
copy_to_sockptr---of 4
cred_to_ucred---of 3
dst_negative_advice---of 14
get_pid---of 5
groups_to_user---of 7
lock_sock_nested100%of 3
proto_exit_net---of 1
proto_init_net---of 1
proto_memory_pcpu_drain50%of 6
proto_register---of 25
proto_seq_next---of 1
proto_seq_show---of 12
proto_seq_start---of 1
proto_seq_stop---of 1
proto_unregister---of 9
put_cred---of 4
put_page29%of 14
release_sock80%of 10
sk_alloc34%of 6
sk_busy_loop_end---of 9
sk_capable---of 3
sk_clear_memalloc---of 6
sk_clone_lock---of 42
sk_common_release---of 14
sk_destruct---of 4
sk_dst_check---of 27
sk_dst_reset---of 1
sk_error_report---of 18
sk_free50%of 4
sk_free_unlock_clone---of 4
sk_get_meminfo---of 3
sk_get_peer_cred---of 3
sk_getsockopt---of 107
sk_init_common32%of 16
sk_ioctl27%of 19
sk_mc_loop---of 6
sk_net_capable---of 3
sk_ns_capable---of 3
sk_page_frag_refill50%of 4
sk_prot_alloc34%of 12
sk_reset_timer60%of 5
sk_send_sigurg10%of 20
sk_set_memalloc---of 1
sk_set_peek_off---of 1
sk_setsockopt3%of 181
sk_setup_caps32%of 25
sk_stop_timer75%of 4
sk_stop_timer_sync---of 4
sk_stream_moderate_sndbuf---of 4
sk_wait_data---of 9
skb_dst_force---of 15
skb_orphan_partial---of 18
skb_page_frag_refill50%of 10
skb_set_owner_w36%of 14
sock_alloc_send_pskb---of 23
sock_bind_add---of 3
sock_bindtoindex---of 11
sock_bindtoindex_locked---of 7
sock_cmsg_send---of 10
sock_common_getsockopt100%of 1
sock_common_recvmsg---of 3
sock_common_setsockopt---of 1
sock_copy_user_timeval---of 17
sock_def_destruct---of 1
sock_def_error_report---of 25
sock_def_readable26%of 39
sock_def_wakeup32%of 19
sock_def_write_space24%of 26
sock_efree---of 6
sock_enable_timestamp---of 4
sock_enable_timestamps---of 1
sock_error---of 3
sock_gen_cookie---of 3
sock_get_timeout---of 9
sock_getbindtodevice---of 12
sock_gettstamp---of 11
sock_i_ino67%of 3
sock_i_uid---of 3
sock_init_data67%of 3
sock_init_data_uid67%of 3
sock_inuse_exit_net---of 1
sock_inuse_get---of 5
sock_inuse_init_net---of 1
sock_ioctl_inout---of 5
sock_kfree_s---of 3
sock_kmalloc40%of 5
sock_kzfree_s---of 3
sock_load_diag_module---of 8
sock_lock_init24%of 13
sock_no_accept---of 1
sock_no_bind---of 1
sock_no_connect---of 1
sock_no_getname---of 1
sock_no_ioctl---of 1
sock_no_linger---of 1
sock_no_listen---of 1
sock_no_mmap---of 1
sock_no_recvmsg100%of 1
sock_no_sendmsg---of 1
sock_no_sendmsg_locked---of 1
sock_no_shutdown---of 1
sock_no_socketpair---of 1
sock_ofree---of 1
sock_omalloc---of 4
sock_pfree---of 16
sock_prot_inuse_get---of 7
sock_queue_rcv_skb_reason---of 7
sock_recv_errqueue---of 12
sock_release_reserved_memory---of 8
sock_reserve_memory---of 11
sock_rfree58%of 7
sock_set_keepalive---of 3
sock_set_mark---of 3
sock_set_priority---of 1
sock_set_rcvbuf---of 1
sock_set_reuseaddr---of 1
sock_set_reuseport---of 1
sock_set_sndtimeo---of 1
sock_set_timeout---of 8
sock_set_timestamp---of 6
sock_set_timestamping---of 34
sock_setsockopt100%of 1
sock_update_classid20%of 21
sock_update_netprioidx20%of 21
sock_valbool_flag67%of 3
sock_wfree13%of 40
sock_wmalloc---of 6
sockopt_capable---of 3
sockopt_lock_sock---of 3
sockopt_ns_capable---of 3
sockopt_release_sock---of 3
-----------
SUMMARY24%of 686

-----------
SUMMARY---of 0

evm_file_release58%of 7
evm_inode_alloc_security67%of 3
evm_inode_copy_up_xattr---of 4
evm_inode_init_security---of 11
evm_inode_post_remove_acl---of 1
evm_inode_post_removexattr---of 13
evm_inode_post_set_acl---of 1
evm_inode_post_setattr---of 10
evm_inode_post_setxattr---of 15
evm_inode_remove_acl---of 1
evm_inode_removexattr---of 3
evm_inode_set_acl---of 20
evm_inode_setattr---of 28
evm_inode_setxattr---of 7
evm_metadata_changed---of 8
evm_post_path_mknod---of 4
evm_protect_xattr---of 44
evm_protected_xattr---of 1
evm_protected_xattr_common---of 12
evm_protected_xattr_if_enabled---of 1
evm_read_protected_xattrs---of 15
evm_revalidate_status---of 7
evm_verify_hmac---of 44
evm_verifyxattr---of 4
is_unsupported_hmac_fs---of 4
-----------
SUMMARY60%of 10

bpf_fd_inode_storage_delete_elem---of 6
bpf_fd_inode_storage_lookup_elem---of 4
bpf_fd_inode_storage_update_elem---of 8
bpf_inode_storage_delete---of 8
bpf_inode_storage_free18%of 23
bpf_inode_storage_get---of 11
inode_storage_lookup---of 30
inode_storage_map_alloc---of 1
inode_storage_map_free---of 1
inode_storage_ptr---of 3
notsupp_get_next_key---of 1
-----------
SUMMARY18%of 23

fib6_lookup---of 5
fib6_rule_action---of 29
fib6_rule_compare---of 16
fib6_rule_configure---of 21
fib6_rule_default---of 8
fib6_rule_delete---of 10
fib6_rule_fill---of 7
fib6_rule_flush_cache---of 3
fib6_rule_lookup15%of 27
fib6_rule_match---of 37
fib6_rule_nlmsg_payload---of 1
fib6_rule_saddr---of 13
fib6_rule_suppress---of 11
fib6_rules_cleanup---of 1
fib6_rules_dump---of 1
fib6_rules_net_exit_batch---of 4
fib6_rules_net_init---of 5
fib6_rules_seq_read---of 1
-----------
SUMMARY15%of 27

-----------
SUMMARY---of 0

tomoyo_check_inet_acl---of 9
tomoyo_check_inet_address---of 25
tomoyo_check_unix_acl---of 5
tomoyo_check_unix_address---of 21
tomoyo_merge_inet_acl---of 1
tomoyo_merge_unix_acl---of 1
tomoyo_parse_ipaddr_union---of 14
tomoyo_print_ip---of 3
tomoyo_same_inet_acl---of 10
tomoyo_same_unix_acl---of 4
tomoyo_socket_bind_permission16%of 13
tomoyo_socket_connect_permission16%of 13
tomoyo_socket_listen_permission17%of 12
tomoyo_socket_sendmsg_permission56%of 9
tomoyo_write_inet_network---of 17
tomoyo_write_unix_network---of 10
-----------
SUMMARY24%of 47

hash_accept28%of 11
hash_accept_nokey67%of 3
hash_accept_parent50%of 4
hash_accept_parent_nokey---of 3
hash_bind---of 1
hash_check_key50%of 4
hash_recvmsg---of 22
hash_recvmsg_nokey---of 3
hash_release---of 1
hash_sendmsg---of 44
hash_sendmsg_nokey---of 3
hash_setkey---of 1
hash_sock_destruct---of 3
-----------
SUMMARY41%of 22

__rhashtable_walk_find_next---of 35
__rht_bucket_nested---of 25
bit_spin_lock---of 7
bucket_table_alloc---of 12
bucket_table_free---of 7
bucket_table_free_rcu---of 1
jhash18%of 17
lockdep_rht_bucket_is_held50%of 4
lockdep_rht_mutex_is_held67%of 3
nested_table_alloc---of 15
nested_table_free---of 5
rhashtable_destroy---of 1
rhashtable_free_and_destroy---of 45
rhashtable_init_noprof---of 25
rhashtable_insert_slow---of 112
rhashtable_jhash2---of 8
rhashtable_rehash_alloc---of 10
rhashtable_walk_enter---of 8
rhashtable_walk_exit---of 4
rhashtable_walk_next---of 17
rhashtable_walk_peek---of 4
rhashtable_walk_start_check---of 50
rhashtable_walk_stop---of 12
rhltable_init_noprof---of 1
rht_bucket_nested---of 3
rht_bucket_nested_insert---of 9
rht_deferred_worker---of 143
rht_unlock---of 10
-----------
SUMMARY30%of 24

-----------
SUMMARY---of 0

_setid_policy_lookup---of 14
safesetid_security_capable29%of 7
safesetid_task_fix_setgid---of 22
safesetid_task_fix_setgroups---of 18
safesetid_task_fix_setuid---of 22
setid_policy_lookup---of 35
-----------
SUMMARY29%of 7

-----------
SUMMARY---of 0

alloc_ucounts15%of 20
dec_rlimit_put_ucounts45%of 9
dec_rlimit_ucounts---of 8
dec_ucount---of 10
get_ucounts---of 3
inc_rlimit_get_ucounts22%of 23
inc_rlimit_ucounts---of 8
inc_ucount25%of 12
is_rlimit_overlimit---of 9
put_ucounts23%of 9
retire_userns_sysctls---of 1
set_is_seen---of 1
set_lookup---of 1
set_permissions---of 1
setup_userns_sysctls---of 5
-----------
SUMMARY24%of 73

-----------
SUMMARY---of 0

sctp_auth_asoc_copy_shkeys28%of 18
sctp_auth_asoc_create_secret---of 29
sctp_auth_asoc_get_hmac---of 10
sctp_auth_asoc_init_active_key7%of 29
sctp_auth_asoc_set_default_hmac---of 8
sctp_auth_asoc_verify_hmac_id---of 6
sctp_auth_calculate_hmac---of 10
sctp_auth_deact_key_id---of 14
sctp_auth_del_key_id---of 11
sctp_auth_destroy_hmacs---of 3
sctp_auth_destroy_keys50%of 6
sctp_auth_ep_add_chunkid---of 14
sctp_auth_ep_set_hmacs---of 9
sctp_auth_free---of 3
sctp_auth_get_hmac---of 3
sctp_auth_get_shkey---of 5
sctp_auth_init---of 15
sctp_auth_init_hmacs---of 24
sctp_auth_key_put40%of 5
sctp_auth_recv_cid16%of 13
sctp_auth_send_cid16%of 13
sctp_auth_set_active_key---of 11
sctp_auth_set_key---of 23
sctp_auth_shkey_create---of 6
sctp_auth_shkey_hold---of 4
sctp_auth_shkey_release25%of 8
-----------
SUMMARY20%of 92

-----------
SUMMARY---of 0

HAS_UNMAPPED_ID67%of 3
__check_sticky---of 4
__filename_parentat32%of 22
__ia32_sys_link---of 1
__ia32_sys_linkat---of 1
__ia32_sys_mkdir---of 1
__ia32_sys_mkdirat---of 1
__ia32_sys_mknod---of 1
__ia32_sys_mknodat---of 1
__ia32_sys_rename---of 1
__ia32_sys_renameat---of 1
__ia32_sys_renameat2---of 1
__ia32_sys_rmdir---of 1
__ia32_sys_symlink---of 1
__ia32_sys_symlinkat---of 1
__ia32_sys_unlink---of 1
__ia32_sys_unlinkat---of 4
__kern_path_locked---of 5
__lookup_slow25%of 12
__traverse_mounts27%of 26
__x64_sys_link---of 1
__x64_sys_linkat---of 1
__x64_sys_mkdir---of 1
__x64_sys_mkdirat100%of 1
__x64_sys_mknod---of 1
__x64_sys_mknodat---of 1
__x64_sys_rename---of 1
__x64_sys_renameat---of 1
__x64_sys_renameat2---of 1
__x64_sys_rmdir---of 1
__x64_sys_symlink---of 1
__x64_sys_symlinkat---of 1
__x64_sys_unlink---of 1
__x64_sys_unlinkat---of 4
check_acl---of 9
choose_mountpoint---of 34
choose_mountpoint_rcu---of 9
complete_walk39%of 13
d_delete_notify---of 4
do_file_open_root---of 17
do_filp_open55%of 11
do_linkat---of 19
do_mkdirat56%of 9
do_mknodat---of 25
do_o_path---of 5
do_renameat2---of 43
do_rmdir---of 12
do_symlinkat---of 7
do_tmpfile---of 7
do_unlinkat---of 22
done_path_create---of 1
dont_mount---of 1
drop_links---of 6
filename_create45%of 9
filename_lookup---of 18
follow_down---of 5
follow_down_one---of 4
follow_up---of 4
fsnotify_create---of 7
fsnotify_link---of 10
fsnotify_link_count---of 4
fsnotify_move---of 20
full_name_hash80%of 5
generic_permission10%of 22
getname100%of 1
getname_flags25%of 20
getname_kernel---of 9
getname_uflags---of 1
handle_dots5%of 43
handle_lookup_down---of 4
hashlen_string---of 4
inode_permission40%of 20
kern_path---of 1
kern_path_create---of 1
kern_path_locked---of 1
kernel_tmpfile_open---of 4
leave_rcu34%of 6
legitimize_links34%of 9
legitimize_path---of 5
link_path_walk48%of 38
lock_rename---of 3
lock_rename_child---of 6
lock_two_directories---of 7
lookup_fast50%of 16
lookup_one---of 9
lookup_one_common---of 16
lookup_one_len---of 9
lookup_one_len_unlocked---of 1
lookup_one_positive_unlocked---of 4
lookup_one_qstr_excl40%of 10
lookup_one_unlocked---of 7
lookup_positive_unlocked---of 4
lookup_slow100%of 1
may_create28%of 11
may_delete---of 25
may_linkat---of 9
may_open16%of 26
may_open_dev---of 3
nd_alloc_stack---of 3
nd_jump_link34%of 6
nd_jump_root39%of 13
page_get_link---of 20
page_put_link---of 1
page_readlink---of 3
page_symlink---of 7
path_get67%of 3
path_init16%of 57
path_lookupat---of 17
path_openat19%of 158
path_parentat---of 4
path_pts---of 9
path_put---of 1
pick_link38%of 45
put_link---of 5
put_page---of 14
putname34%of 6
rcu_read_unlock---of 6
readlink_copy---of 4
seqcount_lockdep_reader_access58%of 7
set_root42%of 12
step_into41%of 44
terminate_walk34%of 12
try_break_deleg---of 7
try_lookup_one_len---of 9
try_to_unlazy25%of 16
try_to_unlazy_next---of 15
unlock_rename---of 3
user_path_at_empty---of 1
user_path_create---of 1
user_path_locked_at---of 1
vfs_create---of 14
vfs_get_link---of 4
vfs_link---of 18
vfs_mkdir30%of 17
vfs_mknod---of 18
vfs_mkobj---of 10
vfs_path_lookup---of 1
vfs_path_parent_lookup---of 1
vfs_readlink---of 10
vfs_rename---of 64
vfs_rmdir---of 17
vfs_symlink---of 11
vfs_tmpfile---of 11
vfs_unlink---of 23
walk_component43%of 14
-----------
SUMMARY30%of 744

mptcp_crypto_hmac_sha---of 3
mptcp_crypto_key_sha60%of 5
-----------
SUMMARY60%of 5

__ip_do_redirect---of 47
__ip_rt_update_pmtu---of 30
__ip_select_ident---of 6
__ipv4_neigh_lookup---of 18
__ipv4_neigh_lookup_noref---of 24
__ipv6_neigh_lookup_noref_stub---of 24
dst_discard---of 1
fib_dump_info_fnhe---of 84
fib_lookup---of 29
fib_multipath_hash---of 75
find_exception---of 53
fnhe_flush_routes---of 15
inet_iif---of 8
inet_rtm_getroute---of 124
ip_do_redirect---of 4
ip_error---of 34
ip_handle_martian_source---of 8
ip_mc_validate_source---of 12
ip_mkroute_input---of 69
ip_mtu_from_fib_result---of 14
ip_neigh_gw4---of 3
ip_neigh_gw6---of 3
ip_route_input_noref---of 11
ip_route_input_rcu---of 142
ip_route_output_flow---of 4
ip_route_output_key_hash---of 11
ip_route_output_key_hash_rcu---of 104
ip_route_use_hint---of 24
ip_rt_bug---of 3
ip_rt_do_proc_exit---of 1
ip_rt_do_proc_init---of 5
ip_rt_get_source---of 14
ip_rt_multicast_event---of 1
ip_rt_send_redirect---of 44
ip_rt_update_pmtu---of 29
ipv4_blackhole_route---of 9
ipv4_confirm_neigh---of 31
ipv4_cow_metrics---of 1
ipv4_default_advmss---of 12
ipv4_dst_check67%of 3
ipv4_dst_destroy---of 8
ipv4_inetpeer_exit---of 1
ipv4_inetpeer_init---of 3
ipv4_link_failure---of 39
ipv4_mtu34%of 12
ipv4_negative_advice---of 5
ipv4_neigh_lookup---of 23
ipv4_redirect---of 3
ipv4_sk_redirect---of 6
ipv4_sk_update_pmtu---of 59
ipv4_sysctl_rtcache_flush---of 3
ipv4_update_pmtu---of 5
l3mdev_master_dev_rcu---of 5
neigh_event_send---of 5
neigh_release---of 4
netns_ip_rt_init---of 1
nlmsg_parse_deprecated_strict---of 4
rt_acct_proc_show---of 7
rt_add_uncached_list---of 3
rt_bind_exception---of 19
rt_cache_flush---of 1
rt_cache_route---of 10
rt_cache_seq_next---of 1
rt_cache_seq_show---of 3
rt_cache_seq_start---of 1
rt_cache_seq_stop---of 1
rt_cpu_seq_next---of 8
rt_cpu_seq_show---of 3
rt_cpu_seq_start---of 9
rt_cpu_seq_stop---of 1
rt_del_uncached_list---of 4
rt_dst_alloc---of 4
rt_dst_clone---of 7
rt_fill_info---of 62
rt_flush_dev---of 23
rt_genid_init---of 1
rt_set_nexthop---of 28
skb_dst_set_noref---of 4
skb_header_pointer---of 3
sysctl_route_net_exit---of 3
sysctl_route_net_init---of 9
update_or_create_fnhe---of 80
-----------
SUMMARY40%of 15

_copy_from_user50%of 6
_copy_to_user50%of 4
check_zeroed_user---of 11
-----------
SUMMARY50%of 10

-----------
SUMMARY---of 0

alloc_mnt_idmap---of 16
from_vfsgid40%of 5
from_vfsuid40%of 5
make_vfsgid34%of 6
make_vfsuid34%of 6
mnt_idmap_get---of 5
mnt_idmap_put---of 8
vfsgid_in_group_p---of 1
-----------
SUMMARY37%of 22

current_check_access_socket8%of 26
hook_socket_bind100%of 1
hook_socket_connect100%of 1
landlock_append_net_rule---of 1
-----------
SUMMARY15%of 28

__list_lru_init---of 15
__list_lru_walk_one---of 22
list_lru_add56%of 9
list_lru_add_obj50%of 4
list_lru_count_node---of 1
list_lru_count_one---of 19
list_lru_del43%of 7
list_lru_del_obj50%of 4
list_lru_destroy---of 25
list_lru_isolate---of 3
list_lru_isolate_move---of 5
list_lru_walk_node---of 7
list_lru_walk_one---of 3
list_lru_walk_one_irq---of 3
memcg_list_lru_alloc25%of 37
memcg_reparent_list_lrus---of 35
-----------
SUMMARY35%of 61

tomoyo_check_mkdev_acl---of 22
tomoyo_check_open_permission40%of 10
tomoyo_check_path2_acl---of 11
tomoyo_check_path_acl58%of 7
tomoyo_check_path_number_acl59%of 12
tomoyo_compare_name_union---of 4
tomoyo_compare_number_union---of 4
tomoyo_execute_permission---of 7
tomoyo_merge_mkdev_acl---of 1
tomoyo_merge_path2_acl---of 1
tomoyo_merge_path_acl---of 1
tomoyo_merge_path_number_acl---of 1
tomoyo_mkdev_perm---of 9
tomoyo_path2_perm---of 22
tomoyo_path_number_perm40%of 20
tomoyo_path_perm---of 16
tomoyo_path_permission50%of 8
tomoyo_put_name_union---of 5
tomoyo_put_number_union---of 3
tomoyo_same_mkdev_acl---of 18
tomoyo_same_mount_acl---of 12
tomoyo_same_path2_acl---of 5
tomoyo_same_path_acl---of 3
tomoyo_same_path_number_acl---of 8
tomoyo_update_mount_acl---of 20
tomoyo_write_file---of 44
-----------
SUMMARY48%of 57

-----------
SUMMARY---of 0

accumulate_nsecs_to_secs---of 9
change_clocksource---of 13
delta_to_ns_safe---of 3
do_adjtimex---of 41
do_settimeofday64---of 19
do_timer---of 1
dummy_clock_read---of 3
get_device_system_crosststamp---of 33
getboottime64---of 1
ktime_get50%of 8
ktime_get_boot_fast_ns---of 1
ktime_get_coarse_real_ts6450%of 8
ktime_get_coarse_ts64---of 8
ktime_get_coarse_with_offset---of 12
ktime_get_fast_timestamps---of 9
ktime_get_mono_fast_ns---of 8
ktime_get_raw---of 6
ktime_get_raw_fast_ns---of 8
ktime_get_raw_ts64---of 8
ktime_get_real_fast_ns---of 8
ktime_get_real_seconds100%of 1
ktime_get_real_ts6440%of 10
ktime_get_resolution_ns---of 10
ktime_get_seconds67%of 3
ktime_get_snapshot---of 18
ktime_get_tai_fast_ns---of 1
ktime_get_ts64---of 10
ktime_get_update_offsets_now---of 10
ktime_get_with_offset50%of 10
ktime_mono_to_any---of 8
pvclock_gtod_register_notifier---of 1
pvclock_gtod_unregister_notifier---of 1
random_get_entropy_fallback---of 3
seqcount_lockdep_reader_access86%of 7
timekeeping_advance---of 53
timekeeping_forward_now---of 12
timekeeping_get_ns36%of 14
timekeeping_inject_offset---of 14
timekeeping_max_deferment---of 6
timekeeping_notify---of 3
timekeeping_resume---of 12
timekeeping_suspend---of 12
timekeeping_update---of 11
timekeeping_valid_for_hres---of 6
timekeeping_warp_clock---of 3
tk_set_wall_to_mono---of 3
tk_setup_internals---of 10
tk_xtime_add---of 12
update_wall_time---of 3
-----------
SUMMARY51%of 61

-----------
SUMMARY---of 0

__ima_inode_hash---of 16
ima_bprm_check---of 3
ima_file_check100%of 1
ima_file_free24%of 13
ima_file_hash---of 3
ima_file_mmap29%of 7
ima_file_mprotect---of 7
ima_get_current_hash_algo---of 1
ima_inode_hash---of 3
ima_kernel_module_request---of 1
ima_kexec_cmdline---of 5
ima_load_data---of 9
ima_measure_critical_data---of 3
ima_post_create_tmpfile---of 6
ima_post_load_data---of 6
ima_post_path_mknod---of 6
ima_post_read_file---of 5
ima_read_file---of 3
integrity_inode_attrs_changed---of 4
mmap_violation_check---of 5
process_buffer_measurement---of 26
process_measurement8%of 101
-----------
SUMMARY12%of 122

-----------
SUMMARY---of 0

__ipv6_fixup_options---of 5
dst_discard---of 1
dst_input---of 8
fl6_update_dst25%of 8
ip6_parse_tlv---of 83
ipv6_destopt_rcv---of 38
ipv6_dup_options---of 10
ipv6_exthdrs_exit---of 1
ipv6_parse_hopopts---of 16
ipv6_push_frag_opts---of 3
ipv6_push_nfrag_opts---of 20
ipv6_renew_options---of 45
ipv6_rpl_srh_rcv---of 74
ipv6_rthdr_rcv---of 79
ipv6_srh_rcv---of 73
skb_dst---of 5
skb_valid_dst---of 7
-----------
SUMMARY25%of 8

-----------
SUMMARY---of 0

__sysvec_call_function---of 29
__sysvec_call_function_single25%of 29
fred_sysvec_call_function---of 4
fred_sysvec_call_function_single---of 4
fred_sysvec_reboot---of 4
fred_sysvec_reschedule_ipi---of 33
native_stop_other_cpus---of 34
smp_stop_nmi_callback---of 3
-----------
SUMMARY25%of 29

-----------
SUMMARY---of 0

__ia32_sys_mlock---of 1
__ia32_sys_mlock2---of 3
__ia32_sys_mlockall---of 1
__ia32_sys_munlock---of 1
__ia32_sys_munlockall---of 11
__mlock_folio---of 457
__se_sys_mlockall---of 22
__se_sys_munlock---of 11
__x64_sys_mlock---of 1
__x64_sys_mlock2---of 3
__x64_sys_mlockall---of 1
__x64_sys_munlock---of 1
apply_mlockall_flags---of 9
apply_vma_lock_flags---of 11
can_do_mlock---of 3
do_mlock---of 28
folio_evictable---of 20
folio_lruvec_relock_irq---of 19
folio_nr_pages---of 9
lru_gen_update_size---of 68
mlock_drain_local39%of 21
mlock_drain_remote---of 9
mlock_fixup---of 10
mlock_folio---of 50
mlock_folio_batch---of 421
mlock_new_folio---of 49
mlock_pte_range---of 80
mlock_vma_pages_range---of 16
munlock_folio---of 32
need_mlock_drain---of 3
user_shm_lock---of 6
user_shm_unlock---of 1
vm_flags_reset---of 6
vma_start_write---of 6
-----------
SUMMARY39%of 21

-----------
SUMMARY---of 0

__mptcp_subflow_connect---of 42
__mptcp_subflow_fully_established---of 5
__mptcp_sync_state50%of 22
mptcp_can_accept_new_subflow---of 8
mptcp_info2sockaddr---of 7
mptcp_propagate_state36%of 14
mptcp_space50%of 6
mptcp_subflow_create_socket---of 35
mptcp_subflow_data_available7%of 164
mptcp_subflow_discard_data---of 16
mptcp_subflow_drop_ctx---of 11
mptcp_subflow_init_cookie_req---of 11
mptcp_subflow_queue_clean---of 14
mptcp_subflow_reqsk_alloc---of 1
mptcp_subflow_reset---of 20
mptcpv6_handle_mapped---of 7
subflow_check_req---of 46
subflow_data_ready---of 32
subflow_error_report---of 11
subflow_finish_connect---of 43
subflow_rebuild_header---of 7
subflow_req_destructor---of 7
subflow_state_change30%of 51
subflow_syn_recv_sock---of 66
subflow_ulp_clone---of 15
subflow_ulp_init---of 13
subflow_ulp_release---of 15
subflow_v4_conn_request---of 9
subflow_v4_req_destructor---of 1
subflow_v4_route_req---of 14
subflow_v4_send_synack---of 6
subflow_v6_conn_request---of 11
subflow_v6_rebuild_header43%of 7
subflow_v6_req_destructor---of 1
subflow_v6_route_req---of 14
subflow_v6_send_synack---of 6
subflow_write_space---of 29
tcp_abort_override---of 3
tcp_release_cb_override50%of 6
-----------
SUMMARY19%of 270

ieee80211_parse_ch_switch_ie16%of 72
ieee80211_process_measurement_req---of 3
-----------
SUMMARY16%of 72

__bpf_trace_x86_fpu---of 1
__probestub_x86_fpu_after_restore---of 1
__probestub_x86_fpu_after_save---of 1
__probestub_x86_fpu_before_restore---of 1
__probestub_x86_fpu_before_save---of 1
__probestub_x86_fpu_copy_dst---of 1
__probestub_x86_fpu_copy_src---of 1
__probestub_x86_fpu_dropped---of 1
__probestub_x86_fpu_init_state---of 1
__probestub_x86_fpu_regs_activated---of 1
__probestub_x86_fpu_regs_deactivated---of 1
__probestub_x86_fpu_xstate_check_failed---of 1
__traceiter_x86_fpu_after_restore---of 4
__traceiter_x86_fpu_after_save---of 4
__traceiter_x86_fpu_before_restore---of 4
__traceiter_x86_fpu_before_save---of 4
__traceiter_x86_fpu_copy_dst---of 4
__traceiter_x86_fpu_copy_src---of 4
__traceiter_x86_fpu_dropped---of 4
__traceiter_x86_fpu_init_state---of 4
__traceiter_x86_fpu_regs_activated---of 4
__traceiter_x86_fpu_regs_deactivated---of 4
__traceiter_x86_fpu_xstate_check_failed---of 4
fpregs_activate27%of 15
fpregs_assert_state_consistent60%of 5
fpregs_lock_and_load---of 16
fpregs_mark_activate---of 1
fpstate_init_user---of 4
fpstate_reset---of 1
fpu__clear_user_states39%of 13
fpu__drop---of 32
fpu__exception_code---of 8
fpu_alloc_guest_fpstate---of 7
fpu_clone---of 46
fpu_copy_guest_fpstate_to_uabi---of 4
fpu_copy_uabi_to_guest_fpstate---of 8
fpu_enable_guest_xfd_features---of 8
fpu_flush_thread---of 4
fpu_free_guest_fpstate---of 4
fpu_reset_from_exception_fixup---of 1
fpu_swap_kvm_fpstate---of 11
fpu_sync_fpstate---of 33
fpu_sync_guest_vmexit_xfd_state---of 8
fpu_thread_struct_whitelist---of 1
fpu_update_guest_xfd---of 6
irq_fpu_usable---of 5
kernel_fpu_begin_mask---of 14
kernel_fpu_end---of 5
perf_trace_x86_fpu---of 9
restore_fpregs_from_fpstate37%of 11
save_fpregs_to_fpstate---of 10
switch_fpu_return60%of 5
trace_event_raw_event_x86_fpu---of 8
trace_raw_output_x86_fpu---of 3
-----------
SUMMARY39%of 49

rcu_read_unlock---of 6
reqsk_queue_removed---of 3
tcp_ca_openreq_child---of 22
tcp_check_req---of 58
tcp_child_process---of 33
tcp_create_openreq_child---of 20
tcp_md5_twsk_free_rcu---of 1
tcp_openreq_init_rwin---of 24
tcp_time_wait25%of 12
tcp_timewait_state_process---of 44
tcp_twsk_destructor---of 4
tcp_twsk_purge---of 7
-----------
SUMMARY25%of 12

-----------
SUMMARY---of 0

sctp_transport_burst_limited40%of 5
sctp_transport_burst_reset67%of 3
sctp_transport_destroy_rcu---of 1
sctp_transport_dst_confirm---of 1
sctp_transport_dst_release100%of 1
sctp_transport_free55%of 11
sctp_transport_hold43%of 7
sctp_transport_immediate_rtx---of 6
sctp_transport_lower_cwnd---of 10
sctp_transport_new50%of 6
sctp_transport_pl_hlen---of 6
sctp_transport_pl_recv---of 22
sctp_transport_pl_send---of 23
sctp_transport_pmtu30%of 17
sctp_transport_put50%of 6
sctp_transport_raise_cwnd---of 9
sctp_transport_reset---of 3
sctp_transport_reset_hb_timer50%of 6
sctp_transport_reset_probe_timer---of 3
sctp_transport_reset_raise_timer---of 3
sctp_transport_reset_reconf_timer---of 4
sctp_transport_reset_t3_rtx---of 4
sctp_transport_route58%of 7
sctp_transport_set_owner100%of 1
sctp_transport_timeout---of 4
sctp_transport_update_pmtu---of 50
sctp_transport_update_rto---of 20
-----------
SUMMARY48%of 70

-----------
SUMMARY---of 0

sctp_endpoint_add_asoc58%of 7
sctp_endpoint_bh_rcv---of 24
sctp_endpoint_destroy_rcu---of 4
sctp_endpoint_free---of 1
sctp_endpoint_hold43%of 7
sctp_endpoint_is_match---of 5
sctp_endpoint_is_peeled_off75%of 4
sctp_endpoint_lookup_assoc29%of 14
sctp_endpoint_new---of 19
sctp_endpoint_put34%of 6
-----------
SUMMARY43%of 38

-----------
SUMMARY---of 0

get_stack_info31%of 13
stack_type_name---of 7
-----------
SUMMARY31%of 13

alloc_etherdev_mqs---of 1
arch_get_platform_mac_address---of 1
device_get_ethdev_address---of 3
device_get_mac_address---of 1
eth_commit_mac_addr_change---of 1
eth_get_headlen---of 4
eth_gro_complete---of 7
eth_gro_receive---of 24
eth_header---of 9
eth_header_cache---of 3
eth_header_cache_update---of 1
eth_header_parse---of 3
eth_header_parse_protocol---of 3
eth_mac_addr---of 7
eth_platform_get_mac_address---of 4
eth_prepare_mac_addr_change---of 6
eth_type_trans36%of 17
eth_validate_addr---of 3
ether_setup---of 1
fwnode_get_mac_address---of 13
nvmem_get_mac_address---of 7
platform_get_ethdev_address---of 4
skb_header_pointer---of 4
sysfs_format_mac---of 1
-----------
SUMMARY36%of 17

ns_dname100%of 1
ns_get_name---of 4
ns_get_path67%of 3
ns_get_path_cb---of 3
ns_ioctl---of 12
ns_match---of 3
nsfs_evict100%of 1
nsfs_init_fs_context---of 3
nsfs_init_inode100%of 1
nsfs_put_data---of 1
nsfs_show_path---of 1
open_related_ns---of 6
proc_ns_file100%of 1
-----------
SUMMARY86%of 7

__skb_gso_segment34%of 12
skb_cow_head---of 4
skb_eth_gso_segment---of 17
skb_gso_transport_seglen23%of 9
skb_gso_validate_mac_len---of 9
skb_gso_validate_network_len43%of 7
skb_mac_gso_segment29%of 25
-----------
SUMMARY31%of 53

__ext4_fc_track_create---of 17
__ext4_fc_track_link---of 17
__ext4_fc_track_unlink---of 17
__track_dentry_update---of 15
ext4_end_buffer_io_sync---of 4
ext4_fc_cleanup---of 57
ext4_fc_commit---of 68
ext4_fc_del---of 22
ext4_fc_destroy_dentry_cache---of 1
ext4_fc_info_show---of 3
ext4_fc_init---of 3
ext4_fc_init_inode---of 1
ext4_fc_mark_ineligible---of 12
ext4_fc_record_regions---of 9
ext4_fc_replay---of 145
ext4_fc_replay_check_excluded---of 8
ext4_fc_replay_cleanup---of 1
ext4_fc_replay_link_internal---of 13
ext4_fc_reserve_space---of 10
ext4_fc_set_bitmaps_and_counters---of 14
ext4_fc_start_update---of 7
ext4_fc_stop_update---of 5
ext4_fc_submit_bh---of 7
ext4_fc_track_create---of 5
ext4_fc_track_inode8%of 25
ext4_fc_track_link---of 5
ext4_fc_track_range---of 27
ext4_fc_track_unlink---of 5
ext4_fc_update_stats---of 20
ext4_fc_wait_committing_inode---of 4
ext4_fc_write_inode---of 9
ext4_fc_write_inode_data---of 14
trace_ext4_fc_replay---of 15
-----------
SUMMARY8%of 25

__bpf_trace_csd_function---of 1
__bpf_trace_csd_queue_cpu---of 1
__flush_smp_call_function_queue14%of 130
__probestub_csd_function_entry---of 1
__probestub_csd_function_exit---of 1
__probestub_csd_queue_cpu---of 1
__smp_call_single_queue28%of 22
__traceiter_csd_function_entry---of 4
__traceiter_csd_function_exit---of 4
__traceiter_csd_queue_cpu---of 4
do_nothing---of 1
flush_smp_call_function_queue---of 14
generic_exec_single---of 51
generic_smp_call_function_single_interrupt100%of 1
kick_all_cpus_sync---of 3
on_each_cpu_cond_mask---of 3
perf_trace_csd_function---of 8
perf_trace_csd_queue_cpu---of 8
smp_call_function---of 3
smp_call_function_any---of 20
smp_call_function_many---of 1
smp_call_function_many_cond---of 189
smp_call_function_single---of 83
smp_call_function_single_async---of 5
smp_call_on_cpu---of 6
smp_call_on_cpu_callback---of 5
smpcfd_dead_cpu---of 3
smpcfd_dying_cpu---of 1
smpcfd_prepare_cpu---of 3
trace_csd_queue_cpu---of 15
trace_event_raw_event_csd_function---of 7
trace_event_raw_event_csd_queue_cpu---of 7
trace_raw_output_csd_function---of 3
trace_raw_output_csd_queue_cpu---of 3
wake_up_all_idle_cpus---of 10
-----------
SUMMARY16%of 153

__percpu_ref_switch_mode---of 20
percpu_ref_exit---of 7
percpu_ref_get---of 13
percpu_ref_init40%of 10
percpu_ref_is_zero---of 4
percpu_ref_kill_and_confirm---of 4
percpu_ref_noop_confirm_switch---of 1
percpu_ref_put---of 14
percpu_ref_reinit---of 6
percpu_ref_resurrect---of 5
percpu_ref_switch_to_atomic---of 1
percpu_ref_switch_to_atomic_rcu---of 17
percpu_ref_switch_to_atomic_sync---of 5
percpu_ref_switch_to_percpu---of 1
-----------
SUMMARY40%of 10

-----------
SUMMARY---of 0

__phys_addr38%of 8
__phys_addr_symbol67%of 3
__virt_addr_valid23%of 35
-----------
SUMMARY29%of 46

-----------
SUMMARY---of 0

__do_sys_getegid---of 1
__do_sys_getgid100%of 1
__do_sys_getpgrp---of 1
__do_sys_getpid100%of 1
__ia32_compat_sys_getrlimit---of 4
__ia32_compat_sys_getrusage---of 5
__ia32_compat_sys_old_getrlimit---of 1
__ia32_compat_sys_setrlimit---of 3
__ia32_compat_sys_sysinfo---of 1
__ia32_compat_sys_times---of 1
__ia32_sys_getcpu---of 5
__ia32_sys_gethostname---of 1
__ia32_sys_getpgid---of 1
__ia32_sys_getppid---of 16
__ia32_sys_getpriority---of 1
__ia32_sys_getresgid---of 1
__ia32_sys_getresuid---of 1
__ia32_sys_getrlimit---of 3
__ia32_sys_getrusage---of 5
__ia32_sys_getsid---of 1
__ia32_sys_newuname---of 1
__ia32_sys_old_getrlimit---of 6
__ia32_sys_olduname---of 1
__ia32_sys_prctl---of 1
__ia32_sys_prlimit64---of 1
__ia32_sys_setdomainname---of 1
__ia32_sys_setfsgid---of 1
__ia32_sys_setfsuid---of 1
__ia32_sys_setgid---of 1
__ia32_sys_sethostname---of 1
__ia32_sys_setpgid---of 1
__ia32_sys_setpriority---of 1
__ia32_sys_setregid---of 1
__ia32_sys_setresgid---of 1
__ia32_sys_setresuid---of 1
__ia32_sys_setreuid---of 1
__ia32_sys_setrlimit---of 3
__ia32_sys_setsid---of 1
__ia32_sys_setuid---of 1
__ia32_sys_sysinfo---of 1
__ia32_sys_times---of 1
__ia32_sys_umask---of 1
__ia32_sys_uname---of 1
__se_compat_sys_old_getrlimit---of 4
__se_compat_sys_sysinfo---of 8
__se_compat_sys_times---of 4
__se_sys_gethostname---of 4
__se_sys_getpriority---of 56
__se_sys_getresgid---of 4
__se_sys_getresuid---of 4
__se_sys_getsid---of 16
__se_sys_newuname---of 6
__se_sys_olduname---of 6
__se_sys_prctl---of 203
__se_sys_prlimit64---of 51
__se_sys_setdomainname---of 5
__se_sys_sethostname---of 5
__se_sys_setpgid---of 29
__se_sys_setpriority---of 56
__se_sys_times---of 4
__se_sys_uname---of 7
__sys_setfsgid---of 12
__sys_setfsuid---of 12
__sys_setgid---of 9
__sys_setregid---of 22
__sys_setresgid---of 39
__sys_setresuid---of 45
__sys_setreuid---of 28
__sys_setuid---of 15
__x64_compat_sys_getrlimit---of 4
__x64_compat_sys_getrusage---of 5
__x64_compat_sys_old_getrlimit---of 1
__x64_compat_sys_setrlimit---of 3
__x64_compat_sys_sysinfo---of 1
__x64_compat_sys_times---of 1
__x64_sys_getcpu---of 5
__x64_sys_geteuid---of 1
__x64_sys_gethostname---of 1
__x64_sys_getpgid---of 1
__x64_sys_getpriority---of 1
__x64_sys_getresgid---of 1
__x64_sys_getresuid---of 1
__x64_sys_getrlimit---of 3
__x64_sys_getrusage---of 5
__x64_sys_getsid---of 1
__x64_sys_gettid100%of 1
__x64_sys_getuid100%of 1
__x64_sys_newuname---of 1
__x64_sys_old_getrlimit---of 6
__x64_sys_olduname---of 1
__x64_sys_prctl---of 1
__x64_sys_prlimit64---of 1
__x64_sys_setdomainname---of 1
__x64_sys_setfsgid---of 1
__x64_sys_setfsuid---of 1
__x64_sys_setgid---of 1
__x64_sys_sethostname---of 1
__x64_sys_setpgid---of 1
__x64_sys_setpriority---of 1
__x64_sys_setregid---of 1
__x64_sys_setresgid---of 1
__x64_sys_setresuid---of 1
__x64_sys_setreuid---of 1
__x64_sys_setrlimit---of 3
__x64_sys_setuid---of 1
__x64_sys_sysinfo---of 1
__x64_sys_times---of 1
__x64_sys_umask---of 1
__x64_sys_uname---of 1
do_getpgid---of 16
do_prlimit---of 20
do_sysinfo---of 6
getrusage---of 38
ksys_setsid---of 9
override_release---of 9
prctl_set_mm_exe_file---of 8
propagate_has_child_subreaper---of 4
set_one_prio---of 15
validate_prctl_map_addr---of 29
-----------
SUMMARY100%of 4

__skb_incr_checksum_unnecessary---of 5
__tcpv4_gso_segment_csum---of 6
refcount_sub_and_test---of 4
tcp4_gro_complete---of 10
tcp4_gro_receive---of 48
tcp4_gso_segment12%of 26
tcp_gro_complete---of 7
tcp_gro_lookup---of 9
tcp_gro_pull_header---of 19
tcp_gro_receive---of 41
tcp_gso_segment31%of 46
-----------
SUMMARY24%of 72

-----------
SUMMARY---of 0

__cpuset_memory_pressure_bump---of 27
compute_partition_effective_cpumask---of 30
cpuset_attach---of 22
cpuset_attach_task---of 8
cpuset_bind---of 4
cpuset_can_attach---of 33
cpuset_can_fork---of 32
cpuset_cancel_attach---of 6
cpuset_cancel_fork---of 23
cpuset_change_task_nodemask---of 6
cpuset_common_seq_show---of 10
cpuset_cpu_is_isolated67%of 3
cpuset_cpus_allowed---of 25
cpuset_cpus_allowed_fallback---of 25
cpuset_css_alloc---of 5
cpuset_css_free---of 1
cpuset_css_offline---of 8
cpuset_css_online---of 38
cpuset_force_rebuild---of 1
cpuset_fork---of 36
cpuset_handle_hotplug---of 106
cpuset_lock---of 1
cpuset_mem_spread_node---of 17
cpuset_mems_allowed---of 23
cpuset_mems_allowed_intersects---of 1
cpuset_migrate_mm---of 4
cpuset_migrate_mm_workfn---of 1
cpuset_migrate_tasks_workfn---of 5
cpuset_node_allowed7%of 29
cpuset_nodemask_valid_mems_allowed---of 1
cpuset_post_attach---of 1
cpuset_print_current_mems_allowed---of 20
cpuset_read_s64---of 3
cpuset_read_u64---of 17
cpuset_slab_spread_node---of 17
cpuset_task_status_allowed---of 1
cpuset_track_online_nodes---of 1
cpuset_unlock---of 1
cpuset_update_active_cpus---of 1
cpuset_update_task_spread_flags---of 6
cpuset_write_resmask---of 150
cpuset_write_s64---of 11
cpuset_write_u64---of 14
css_get---of 14
css_put---of 15
css_tryget_online---of 20
current_cpuset_is_being_rebound---of 20
dec_dl_tasks_cs---of 10
guarantee_online_cpus---of 23
inc_dl_tasks_cs---of 10
is_cpuset_subset---of 5
partition_is_populated---of 29
partition_xcpus_add---of 10
partition_xcpus_del---of 10
proc_cpuset_show---of 41
rcu_read_unlock---of 6
rebuild_sched_domains---of 1
rebuild_sched_domains_locked---of 148
remote_cpus_update---of 15
remote_partition_check---of 12
remote_partition_disable---of 12
reset_partition_data---of 10
sched_partition_show---of 10
sched_partition_write---of 9
update_cpumasks_hier---of 82
update_domain_attr_tree---of 22
update_flag---of 13
update_parent_effective_cpumask---of 94
update_partition_sd_lb---of 7
update_prstate---of 41
update_sibling_cpumasks---of 36
update_tasks_cpumask---of 7
update_tasks_nodemask---of 10
validate_change---of 63
-----------
SUMMARY13%of 32

__read_once_word_nocheck100%of 1
__unwind_start30%of 20
deref_stack_reg25%of 8
orc_sort_cmp---of 4
orc_sort_swap---of 1
unwind_dump---of 9
unwind_get_return_address75%of 4
unwind_get_return_address_ptr---of 5
unwind_module_init---of 3
unwind_next_frame13%of 154
-----------
SUMMARY18%of 187

__ia32_compat_sys_gettimeofday---of 1
__ia32_compat_sys_settimeofday---of 1
__ia32_sys_adjtimex---of 3
__ia32_sys_adjtimex_time32---of 1
__ia32_sys_gettimeofday---of 1
__ia32_sys_settimeofday---of 1
__ia32_sys_stime---of 4
__ia32_sys_stime32---of 4
__ia32_sys_time---of 4
__ia32_sys_time32---of 4
__msecs_to_jiffies67%of 3
__se_compat_sys_gettimeofday---of 8
__se_compat_sys_settimeofday---of 21
__se_sys_adjtimex_time32---of 3
__se_sys_gettimeofday---of 8
__se_sys_settimeofday---of 21
__usecs_to_jiffies67%of 3
__x64_compat_sys_gettimeofday---of 1
__x64_compat_sys_settimeofday---of 1
__x64_sys_adjtimex---of 3
__x64_sys_adjtimex_time32---of 1
__x64_sys_gettimeofday---of 1
__x64_sys_settimeofday---of 1
__x64_sys_stime---of 4
__x64_sys_stime32---of 4
__x64_sys_time---of 4
__x64_sys_time32---of 4
clock_t_to_jiffies---of 1
do_sys_settimeofday64---of 13
get_itimerspec64---of 9
get_old_itimerspec32---of 4
get_old_timespec32---of 4
get_old_timex32---of 3
get_timespec6440%of 5
jiffies64_to_msecs---of 1
jiffies64_to_nsecs---of 1
jiffies_64_to_clock_t---of 1
jiffies_to_clock_t---of 1
jiffies_to_msecs---of 1
jiffies_to_timespec64---of 1
jiffies_to_usecs100%of 1
mktime64---of 1
ns_to_kernel_old_timeval---of 4
ns_to_timespec6450%of 4
nsec_to_clock_t---of 1
nsecs_to_jiffies---of 1
nsecs_to_jiffies64---of 1
put_itimerspec64---of 3
put_old_itimerspec32---of 3
put_old_timespec32---of 3
put_old_timex32---of 1
put_timespec64100%of 1
set_normalized_timespec64---of 7
timespec64_add_safe---of 9
timespec64_to_jiffies---of 1
-----------
SUMMARY59%of 17

__inet6_bind---of 69
inet6_bind---of 1
inet6_bind_sk---of 6
inet6_cleanup_sock---of 9
inet6_compat_ioctl---of 11
inet6_create---of 59
inet6_getname---of 18
inet6_ioctl---of 10
inet6_net_exit---of 1
inet6_net_init---of 14
inet6_recvmsg---of 28
inet6_register_protosw---of 11
inet6_release---of 3
inet6_sendmsg40%of 5
inet6_sk_rebuild_header8%of 50
inet6_sock_destruct---of 1
inet6_unregister_protosw---of 4
inet_addr_valid_or_nonlocal---of 4
inet_reset_saddr---of 6
ipv6_can_nonlocal_bind---of 4
ipv6_mod_enabled---of 1
ipv6_opt_accepted---of 13
ipv6_route_input---of 5
rcu_read_unlock---of 6
udp_set_peek_off---of 1
-----------
SUMMARY11%of 55

nf_ct_seq_adjust---of 33
nf_ct_seq_offset25%of 8
nf_ct_seqadj_init---of 6
nf_ct_seqadj_set---of 11
nf_ct_tcp_seqadj_set---of 3
-----------
SUMMARY25%of 8

-----------
SUMMARY---of 0

__bpf_trace_cpuhp_enter---of 1
__bpf_trace_cpuhp_exit---of 1
__bpf_trace_cpuhp_multi_enter---of 1
__cpu_down_maps_locked---of 1
__cpuhp_remove_state---of 1
__cpuhp_remove_state_cpuslocked---of 16
__cpuhp_setup_state---of 1
__cpuhp_setup_state_cpuslocked---of 31
__cpuhp_state_add_instance---of 1
__cpuhp_state_add_instance_cpuslocked---of 23
__cpuhp_state_remove_instance---of 13
__probestub_cpuhp_enter---of 1
__probestub_cpuhp_exit---of 1
__probestub_cpuhp_multi_enter---of 1
__traceiter_cpuhp_enter---of 4
__traceiter_cpuhp_exit---of 4
__traceiter_cpuhp_multi_enter---of 4
_cpu_down---of 26
_cpu_up---of 22
active_show---of 3
add_cpu---of 1
bringup_hibernate_cpu---of 6
clear_tasks_mm_cpumask---of 21
control_show---of 6
control_store---of 16
cpu_device_down---of 1
cpu_device_up---of 1
cpu_down---of 7
cpu_hotplug_disable---of 1
cpu_hotplug_enable---of 4
cpu_hotplug_pm_callback---of 9
cpu_maps_update_begin---of 1
cpu_maps_update_done---of 1
cpu_mitigations_auto_nosmt---of 1
cpu_mitigations_off100%of 1
cpu_smt_possible---of 1
cpu_up---of 16
cpuhp_ap_report_dead---of 3
cpuhp_ap_sync_alive---of 8
cpuhp_bringup_ap---of 21
cpuhp_complete_idle_dead---of 1
cpuhp_invoke_callback---of 26
cpuhp_issue_call---of 14
cpuhp_kick_ap---of 8
cpuhp_kick_ap_alive---of 9
cpuhp_kick_ap_work---of 3
cpuhp_online_idle---of 6
cpuhp_report_idle_dead---of 6
cpuhp_reset_state---of 11
cpuhp_set_state---of 8
cpuhp_should_run---of 3
cpuhp_smt_disable---of 13
cpuhp_smt_enable---of 9
cpuhp_thread_fun---of 23
cpuhp_wait_for_sync_state---of 8
cpus_read_lock---of 10
cpus_read_trylock---of 12
cpus_read_unlock---of 10
cpus_write_lock---of 1
cpus_write_unlock---of 1
fail_show---of 3
fail_store---of 12
finish_cpu---of 5
freeze_secondary_cpus---of 21
init_cpu_online---of 1
init_cpu_possible---of 1
init_cpu_present---of 1
lockdep_assert_cpus_held---of 5
lockdep_is_cpus_held100%of 1
notify_cpu_starting---of 9
perf_trace_cpuhp_enter---of 8
perf_trace_cpuhp_exit---of 8
perf_trace_cpuhp_multi_enter---of 8
remove_cpu---of 1
set_cpu_online---of 9
smp_shutdown_nonboot_cpus---of 20
state_show---of 3
states_show---of 5
take_cpu_down---of 10
takedown_cpu---of 18
target_show---of 3
target_store---of 14
thaw_secondary_cpus---of 11
trace_cpuhp_enter---of 15
trace_cpuhp_exit---of 15
trace_cpuhp_multi_enter---of 15
trace_event_raw_event_cpuhp_enter---of 7
trace_event_raw_event_cpuhp_exit---of 7
trace_event_raw_event_cpuhp_multi_enter---of 7
trace_raw_output_cpuhp_enter---of 3
trace_raw_output_cpuhp_exit---of 3
trace_raw_output_cpuhp_multi_enter---of 3
trace_suspend_resume---of 15
-----------
SUMMARY100%of 2

__account_locked_vm---of 11
__vcalloc_noprof---of 3
__vm_enough_memory---of 14
__vmalloc_array_noprof---of 3
account_locked_vm---of 11
folio_anon_vma---of 1
folio_copy---of 12
folio_mapping24%of 17
get_cmdline---of 7
kfree_const67%of 3
kmemdup_array---of 3
kmemdup_noprof67%of 3
kmemdup_nul---of 4
kstrdup50%of 4
kstrdup_const100%of 3
kstrndup---of 4
kvfree67%of 3
kvfree_sensitive---of 4
kvmalloc_node_noprof38%of 8
kvmemdup---of 3
kvrealloc_noprof---of 5
mem_dump_obj---of 6
memcmp_pages---of 1
memdup_user40%of 5
memdup_user_nul---of 5
overcommit_kbytes_handler---of 3
overcommit_policy_handler---of 6
overcommit_ratio_handler---of 3
page_offline_begin---of 1
page_offline_end---of 1
page_offline_freeze---of 1
page_offline_thaw---of 1
randomize_page---of 3
randomize_stack_top---of 3
strndup_user---of 5
sync_overcommit_as---of 1
vcalloc_noprof---of 3
vm_commit_limit---of 3
vm_memory_committed---of 1
vm_mmap---of 3
vm_mmap_pgoff43%of 14
vma_is_stack_for_current---of 3
vma_set_file---of 4
vmalloc_array_noprof---of 3
vmemdup_user---of 10
-----------
SUMMARY44%of 60

-----------
SUMMARY---of 0

__printk_safe_enter100%of 1
__printk_safe_exit100%of 1
vprintk---of 4
-----------
SUMMARY100%of 2

__copy_overflow---of 1
copy_from_kernel_nofault53%of 17
copy_from_user_nofault---of 4
copy_to_kernel_nofault---of 16
copy_to_user_nofault---of 4
strncpy_from_kernel_nofault---of 6
strncpy_from_user_nofault---of 4
strnlen_user_nofault---of 1
-----------
SUMMARY53%of 17

__ia32_compat_sys_ftruncate---of 4
__ia32_compat_sys_open---of 5
__ia32_compat_sys_openat---of 5
__ia32_compat_sys_truncate---of 1
__ia32_sys_access---of 1
__ia32_sys_chdir---of 1
__ia32_sys_chmod---of 5
__ia32_sys_chown---of 1
__ia32_sys_chroot---of 1
__ia32_sys_close---of 7
__ia32_sys_close_range---of 1
__ia32_sys_creat---of 1
__ia32_sys_faccessat---of 1
__ia32_sys_faccessat2---of 1
__ia32_sys_fallocate---of 4
__ia32_sys_fchdir---of 1
__ia32_sys_fchmod---of 7
__ia32_sys_fchmodat---of 5
__ia32_sys_fchmodat2---of 1
__ia32_sys_fchown---of 1
__ia32_sys_fchownat---of 1
__ia32_sys_ftruncate---of 4
__ia32_sys_lchown---of 1
__ia32_sys_open---of 5
__ia32_sys_openat---of 5
__ia32_sys_openat2---of 1
__ia32_sys_truncate---of 1
__se_sys_chdir---of 5
__se_sys_chroot---of 8
__se_sys_fchdir---of 7
__se_sys_openat2---of 11
__x64_compat_sys_ftruncate---of 4
__x64_compat_sys_open---of 5
__x64_compat_sys_openat---of 5
__x64_compat_sys_truncate---of 1
__x64_sys_access---of 1
__x64_sys_chdir---of 1
__x64_sys_chmod---of 5
__x64_sys_chown---of 1
__x64_sys_chroot---of 1
__x64_sys_close29%of 7
__x64_sys_close_range---of 1
__x64_sys_creat---of 1
__x64_sys_faccessat---of 1
__x64_sys_faccessat2---of 1
__x64_sys_fallocate---of 4
__x64_sys_fchdir---of 1
__x64_sys_fchmod---of 7
__x64_sys_fchmodat---of 5
__x64_sys_fchmodat2---of 1
__x64_sys_fchown---of 1
__x64_sys_fchownat---of 1
__x64_sys_ftruncate---of 5
__x64_sys_lchown---of 1
__x64_sys_open---of 5
__x64_sys_openat60%of 5
__x64_sys_openat2---of 1
__x64_sys_truncate---of 1
__x64_sys_vhangup---of 3
break_lease---of 5
build_open_flags30%of 20
build_open_how---of 5
chmod_common---of 11
chown_common---of 18
dentry_create---of 5
dentry_open---of 5
do_dentry_open18%of 70
do_faccessat---of 34
do_fchmodat---of 6
do_fchownat---of 8
do_ftruncate---of 12
do_sys_ftruncate---of 5
do_sys_open---of 5
do_sys_openat234%of 6
do_sys_truncate---of 7
do_truncate---of 7
file_open_name---of 7
file_open_root---of 7
file_path---of 1
filp_close---of 1
filp_flush50%of 6
filp_open---of 8
finish_no_open---of 1
finish_open---of 3
fsnotify_file_area_perm---of 5
fsnotify_modify---of 9
generic_file_open---of 4
kernel_file_open---of 4
ksys_fallocate---of 4
ksys_fchown---of 8
nonseekable_open---of 1
sb_end_write---of 10
sb_start_write---of 10
stream_open100%of 1
vfs_fallocate---of 26
vfs_fchmod---of 4
vfs_fchown---of 5
vfs_open100%of 1
vfs_truncate---of 12
-----------
SUMMARY26%of 116

-----------
SUMMARY---of 0

__kfence_alloc---of 12
__kfence_free50%of 8
alloc_covered_contains---of 3
check_canary19%of 38
get_alloc_stack_hash---of 17
kfence_check_canary_callback---of 5
kfence_debugfs_init---of 3
kfence_guarded_alloc---of 41
kfence_guarded_free40%of 15
kfence_handle_page_fault---of 18
kfence_init_enable---of 5
kfence_init_pool---of 22
kfence_ksize50%of 4
kfence_object_start---of 4
kfence_protect40%of 5
kfence_shutdown_cache---of 15
kfence_unprotect---of 5
metadata_update_state50%of 6
next_object---of 1
objects_open---of 4
param_get_sample_interval---of 3
param_set_sample_interval---of 21
rcu_guarded_free---of 1
show_object---of 1
start_object---of 1
stats_open---of 1
stats_show---of 1
stop_object---of 1
toggle_allocation_gate---of 6
wake_up_kfence_timer---of 1
-----------
SUMMARY32%of 76

csum_partial---of 14
csum_tcpudp_nofold100%of 1
ip_compute_csum---of 14
ip_fast_csum50%of 14
-----------
SUMMARY54%of 15

-----------
SUMMARY---of 0

mptcp_allow_join_id0100%of 1
mptcp_close_timeout---of 3
mptcp_get_add_addr_timeout---of 1
mptcp_get_pernet25%of 16
mptcp_get_pm_type---of 1
mptcp_get_scheduler---of 1
mptcp_is_checksum_enabled100%of 1
mptcp_is_enabled---of 1
mptcp_net_exit---of 1
mptcp_net_init---of 7
mptcp_stale_loss_cnt---of 1
proc_available_schedulers---of 3
proc_scheduler---of 14
-----------
SUMMARY34%of 18

compat_only_sysfs_link_entry_to_kobj---of 5
internal_create_group17%of 72
sysfs_add_link_to_group---of 3
sysfs_create_group100%of 1
sysfs_create_groups25%of 8
sysfs_group_change_owner---of 18
sysfs_groups_change_owner---of 7
sysfs_merge_group---of 7
sysfs_remove_group---of 15
sysfs_remove_groups---of 5
sysfs_remove_link_from_group---of 3
sysfs_unmerge_group---of 5
sysfs_update_group---of 1
sysfs_update_groups---of 8
-----------
SUMMARY19%of 81

-----------
SUMMARY---of 0

__d_path---of 6
__dentry_path---of 30
__ia32_sys_getcwd---of 1
__se_sys_getcwd---of 32
__x64_sys_getcwd---of 1
d_absolute_path50%of 6
d_path---of 27
dentry_path---of 6
dentry_path_raw---of 3
dynamic_dname67%of 3
prepend---of 6
prepend_path29%of 53
seqcount_lockdep_reader_access58%of 7
simple_dname---of 12
-----------
SUMMARY35%of 69

-----------
SUMMARY---of 0

aa_af_perm---of 4
aa_label_sk_perm50%of 10
aa_profile_af_perm---of 9
aa_sk_perm17%of 54
aa_sock_file_perm---of 7
apparmor_secmark_check---of 20
audit_net_cb---of 16
-----------
SUMMARY22%of 64

-----------
SUMMARY---of 0

__get_hash_from_flowi6100%of 1
__skb_flow_dissect11%of 339
__skb_flow_get_ports---of 17
__skb_get_hash_net---of 16
__skb_get_hash_symmetric_net---of 13
__skb_get_poff---of 15
bpf_flow_dissect---of 4
flow_dissector_bpf_prog_attach_check---of 9
flow_get_u32_dst---of 4
flow_get_u32_src---of 5
flow_hash_from_keys31%of 13
flow_hash_from_keys_seed---of 10
make_flow_keys_digest---of 1
skb_flow_dissect_ct---of 10
skb_flow_dissect_hash---of 3
skb_flow_dissect_meta---of 6
skb_flow_dissect_tunnel_info---of 40
skb_flow_dissector_init---of 15
skb_flow_get_icmp_tci---of 13
skb_get_hash_perturb---of 10
skb_get_poff---of 3
skb_metadata_dst---of 8
-----------
SUMMARY12%of 353

__anon_vma_prepare40%of 15
__bpf_trace_migration_pte---of 1
__bpf_trace_mm_migrate_pages---of 1
__bpf_trace_mm_migrate_pages_start---of 1
__bpf_trace_tlb_flush---of 1
__folio_rmap_sanity_checks24%of 34
__folio_set_anon---of 4
__probestub_mm_migrate_pages---of 1
__probestub_mm_migrate_pages_start---of 1
__probestub_remove_migration_pte---of 1
__probestub_set_migration_pte---of 1
__probestub_tlb_flush---of 1
__put_anon_vma---of 9
__traceiter_mm_migrate_pages---of 4
__traceiter_mm_migrate_pages_start---of 4
__traceiter_remove_migration_pte---of 4
__traceiter_set_migration_pte---of 4
__traceiter_tlb_flush---of 4
anon_vma_clone18%of 23
anon_vma_ctor---of 1
anon_vma_fork---of 9
flush_tlb_batched_pending---of 3
folio_add_anon_rmap_pmd---of 115
folio_add_anon_rmap_ptes---of 118
folio_add_file_rmap_pmd---of 34
folio_add_file_rmap_ptes27%of 26
folio_add_new_anon_rmap10%of 118
folio_entire_mapcount---of 9
folio_get_anon_vma---of 24
folio_large_mapcount---of 9
folio_lock_anon_vma_read---of 31
folio_mapcount---of 9
folio_maybe_dma_pinned---of 9
folio_mkclean---of 13
folio_move_anon_rmap---of 10
folio_not_mapped---of 1
folio_referenced---of 20
folio_referenced_one---of 80
folio_remove_rmap_pmd---of 28
folio_remove_rmap_ptes---of 20
folio_size---of 10
folio_test_large---of 7
folio_test_pmd_mappable---of 9
folio_try_share_anon_rmap_pte---of 89
hugetlb_add_anon_rmap---of 60
hugetlb_add_new_anon_rmap---of 36
hugetlb_remove_rmap---of 3
invalid_folio_referenced_vma---of 28
invalid_migration_vma---of 1
invalid_mkclean_vma---of 1
make_device_exclusive_range---of 35
mm_find_pmd50%of 4
page_address_in_vma---of 17
page_make_device_exclusive_one---of 45
page_mkclean_one---of 9
page_vma_mapped_walk_done---of 10
page_vma_mkclean_one---of 30
perf_trace_migration_pte---of 8
perf_trace_mm_migrate_pages---of 8
perf_trace_mm_migrate_pages_start---of 8
perf_trace_tlb_flush---of 8
pfn_mkclean_range---of 9
put_anon_vma---of 3
rcu_read_unlock---of 6
rmap_walk---of 4
rmap_walk_anon---of 43
rmap_walk_file---of 47
rmap_walk_locked---of 4
set_tlb_ubc_flush_pending---of 13
should_defer_flush---of 10
trace_event_raw_event_migration_pte---of 7
trace_event_raw_event_mm_migrate_pages---of 7
trace_event_raw_event_mm_migrate_pages_start---of 7
trace_event_raw_event_tlb_flush---of 7
trace_raw_output_migration_pte---of 3
trace_raw_output_mm_migrate_pages---of 3
trace_raw_output_mm_migrate_pages_start---of 3
trace_raw_output_tlb_flush---of 3
trace_set_migration_pte---of 15
try_to_migrate---of 12
try_to_migrate_one---of 239
try_to_unmap---of 7
try_to_unmap_flush---of 3
try_to_unmap_flush_dirty---of 4
try_to_unmap_one---of 184
unlink_anon_vmas20%of 25
-----------
SUMMARY18%of 245

-----------
SUMMARY---of 0

__cleanup_mnt---of 1
__detach_mounts---of 19
__do_loopback---of 14
__ia32_sys_fsmount---of 1
__ia32_sys_listmount---of 1
__ia32_sys_mount---of 1
__ia32_sys_mount_setattr---of 1
__ia32_sys_move_mount---of 1
__ia32_sys_oldumount---of 3
__ia32_sys_open_tree---of 1
__ia32_sys_pivot_root---of 1
__ia32_sys_statmount---of 1
__ia32_sys_umount---of 4
__is_local_mountpoint---of 4
__legitimize_mnt29%of 7
__lookup_mnt34%of 9
__mnt_is_readonly---of 3
__put_mountpoint---of 5
__se_sys_fsmount---of 32
__se_sys_listmount---of 42
__se_sys_mount---of 18
__se_sys_mount_setattr---of 125
__se_sys_move_mount---of 32
__se_sys_open_tree---of 29
__se_sys_pivot_root---of 50
__se_sys_statmount---of 63
__x64_sys_fsmount---of 1
__x64_sys_listmount---of 1
__x64_sys_mount---of 1
__x64_sys_mount_setattr---of 1
__x64_sys_move_mount---of 1
__x64_sys_oldumount---of 3
__x64_sys_open_tree---of 1
__x64_sys_pivot_root---of 1
__x64_sys_statmount---of 1
__x64_sys_umount---of 4
alloc_mnt_ns---of 10
alloc_vfsmnt---of 8
attach_mnt---of 11
attach_recursive_mnt---of 88
can_move_mount_beneath---of 11
check_for_nsfs_mounts---of 12
cleanup_mnt---of 17
clone_mnt---of 33
clone_private_mount---of 11
collect_mounts---of 3
commit_tree---of 22
copy_mnt_id_req---of 9
copy_mnt_ns5%of 44
copy_tree---of 39
count_mounts---of 12
current_chrooted---of 6
delayed_free_vfsmnt---of 1
delayed_mntput---of 4
dissolve_on_fput---of 6
do_change_type---of 14
do_lock_mount---of 17
do_loopback---of 13
do_mount---of 3
do_move_mount---of 31
do_move_mount_old---of 5
do_new_mount---of 36
drop_collected_mounts---of 1
fc_mount---of 3
finish_automount---of 34
free_mnt_ns---of 7
from_mnt_ns---of 1
get_mountpoint---of 21
graft_tree---of 4
invent_group_ids---of 22
is_path_reachable---of 5
iterate_mounts---of 5
kern_mount---of 3
kern_unmount---of 5
kern_unmount_array---of 24
lock_mnt_tree---of 8
lock_mount_hash---of 1
lookup_mnt18%of 28
m_next---of 3
m_show---of 1
m_start---of 7
m_stop---of 1
mark_mounts_for_expiry---of 16
may_mount---of 1
may_umount---of 1
may_umount_tree---of 13
mnt_add_count---of 1
mnt_add_to_ns---of 6
mnt_change_mountpoint---of 17
mnt_clone_internal---of 3
mnt_drop_write67%of 3
mnt_drop_write_file---of 4
mnt_get_count---of 5
mnt_get_write_access40%of 10
mnt_get_write_access_file---of 5
mnt_make_shortterm---of 3
mnt_may_suid---of 4
mnt_put_write_access67%of 3
mnt_put_write_access_file---of 4
mnt_release_group_id---of 1
mnt_set_expiry---of 3
mnt_set_mountpoint---of 3
mnt_want_write67%of 3
mnt_want_write_file---of 6
mnt_warn_timestamp_expiry---of 7
mntget67%of 3
mntns_get---of 5
mntns_install---of 11
mntns_owner---of 1
mntns_put---of 1
mntput75%of 4
mntput_no_expire12%of 27
mount_subtree---of 13
mount_too_revealing---of 20
namespace_unlock---of 14
our_mnt---of 1
path_is_mountpoint---of 21
path_is_under---of 5
path_mount---of 52
path_overmounted---of 19
path_umount---of 50
put_mnt_ns50%of 4
rcu_read_unlock---of 6
read_seqbegin50%of 10
sb_end_write40%of 10
sb_prepare_remount_readonly---of 19
sb_start_write40%of 10
set_mount_attributes---of 3
show_path---of 3
tree_contains_unbindable---of 9
umount_tree---of 51
unhash_mnt---of 9
unlock_mount_hash---of 1
vfs_create_mount---of 9
vfs_kern_mount---of 9
vfs_submount---of 3
-----------
SUMMARY26%of 175

__bpf_trace_ma_op---of 1
__bpf_trace_ma_read---of 1
__bpf_trace_ma_write---of 1
__mt_destroy25%of 12
__mt_dup---of 4
__probestub_ma_op---of 1
__probestub_ma_read---of 1
__probestub_ma_write---of 1
__traceiter_ma_op---of 4
__traceiter_ma_read---of 4
__traceiter_ma_write---of 4
mab_calc_split---of 30
mab_mas_cp---of 27
mab_no_null_split---of 9
mas_adopt_children---of 18
mas_alloc_cyclic---of 11
mas_alloc_nodes21%of 29
mas_ascend14%of 30
mas_bulk_rebalance---of 7
mas_commit_b_node---of 53
mas_descend27%of 23
mas_destroy4%of 86
mas_dump---of 12
mas_dup_build---of 48
mas_dup_free---of 37
mas_empty_area---of 79
mas_empty_area_rev---of 109
mas_erase---of 16
mas_expected_entries---of 7
mas_find32%of 48
mas_find_child---of 20
mas_find_range---of 48
mas_find_range_rev---of 3
mas_find_rev---of 3
mas_find_rev_setup---of 35
mas_get_slot34%of 15
mas_insert---of 11
mas_leaf_max_gap40%of 25
mas_leaf_set_meta---of 5
mas_mab_cp---of 37
mas_max_gap50%of 4
mas_new_root---of 21
mas_next67%of 3
mas_next_node21%of 62
mas_next_range---of 3
mas_next_setup7%of 32
mas_next_sibling---of 16
mas_next_slot32%of 48
mas_nomem25%of 8
mas_pause---of 1
mas_pop_node32%of 16
mas_preallocate27%of 53
mas_prev67%of 3
mas_prev_node5%of 64
mas_prev_range67%of 3
mas_prev_setup4%of 50
mas_prev_slot22%of 47
mas_push_data---of 43
mas_replace_node20%of 15
mas_root_expand---of 18
mas_set_height---of 3
mas_set_parent---of 5
mas_skip_node---of 27
mas_spanning_rebalance---of 93
mas_split_final_node---of 5
mas_start60%of 15
mas_store---of 13
mas_store_b_node---of 45
mas_store_gfp31%of 13
mas_store_prealloc25%of 12
mas_update_gap11%of 28
mas_walk23%of 9
mas_wmb_replace---of 99
mas_wr_dump---of 1
mas_wr_end_piv34%of 18
mas_wr_modify24%of 88
mas_wr_spanning_store---of 46
mas_wr_store_entry34%of 9
mas_wr_walk28%of 18
mas_wr_walk_descend32%of 19
mas_wr_walk_index---of 11
mast_ascend---of 25
mast_fill_bnode---of 27
mast_spanning_rebalance---of 45
mast_split_data---of 21
mt_cache_shrink---of 1
mt_destroy_walk---of 39
mt_dump---of 13
mt_dump_entry---of 10
mt_dump_node---of 186
mt_find35%of 32
mt_find_after---of 3
mt_free_rcu---of 1
mt_free_walk---of 25
mt_next---of 13
mt_prev---of 13
mt_validate39%of 244
mte_dead_leaves---of 15
mte_dead_walk---of 24
mte_destroy_descend---of 25
mtree_alloc_cyclic---of 4
mtree_alloc_range---of 7
mtree_alloc_rrange---of 7
mtree_destroy---of 1
mtree_dup---of 4
mtree_erase---of 1
mtree_insert---of 1
mtree_insert_range---of 6
mtree_load---of 41
mtree_range_walk50%of 30
mtree_store---of 1
mtree_store_range---of 6
perf_trace_ma_op---of 8
perf_trace_ma_read---of 8
perf_trace_ma_write---of 8
trace_event_raw_event_ma_op---of 7
trace_event_raw_event_ma_read---of 7
trace_event_raw_event_ma_write---of 7
trace_ma_op---of 15
trace_ma_read27%of 15
trace_ma_write27%of 15
trace_raw_output_ma_op---of 3
trace_raw_output_ma_read---of 3
trace_raw_output_ma_write---of 3
-----------
SUMMARY26%of 1241

-----------
SUMMARY---of 0

inet_bhash2_addr_any_conflict---of 31
inet_bhashfn_portaddr---of 5
inet_bind_conflict---of 17
inet_child_forget---of 6
inet_csk_accept---of 30
inet_csk_addr2sockaddr---of 1
inet_csk_bind_conflict---of 43
inet_csk_clear_xmit_timers100%of 1
inet_csk_clear_xmit_timers_sync---of 6
inet_csk_clone_lock---of 4
inet_csk_complete_hashdance---of 25
inet_csk_delete_keepalive_timer---of 1
inet_csk_destroy_sock---of 17
inet_csk_get_port---of 86
inet_csk_init_xmit_timers---of 1
inet_csk_listen_start---of 7
inet_csk_listen_stop---of 35
inet_csk_prepare_forced_close---of 4
inet_csk_rebuild_route---of 25
inet_csk_reqsk_queue_add---of 4
inet_csk_reqsk_queue_drop---of 13
inet_csk_reqsk_queue_drop_and_put---of 1
inet_csk_reqsk_queue_hash_add---of 1
inet_csk_reset_keepalive_timer---of 1
inet_csk_route_child_sock---of 16
inet_csk_route_req---of 31
inet_csk_update_fastreuse---of 19
inet_csk_update_pmtu---of 6
inet_rcv_saddr_any---of 3
inet_rcv_saddr_equal---of 15
inet_reqsk_alloc---of 11
inet_reqsk_clone---of 7
inet_rtx_syn_ack---of 3
inet_sk_bound_l3mdev---of 15
inet_sk_get_local_port_range---of 3
ipv6_rcv_saddr_equal---of 12
reqsk_put---of 7
reqsk_timer_handler---of 38
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

__clockevents_switch_state---of 17
__clockevents_unbind---of 17
__clockevents_update_freq---of 7
clockevent_delta2ns---of 7
clockevents_config---of 17
clockevents_config_and_register---of 1
clockevents_exchange_device---of 17
clockevents_handle_noop---of 1
clockevents_program_event31%of 13
clockevents_program_min_delta---of 10
clockevents_register_device---of 15
clockevents_resume---of 7
clockevents_shutdown---of 6
clockevents_suspend---of 7
clockevents_switch_state---of 6
clockevents_tick_resume---of 3
clockevents_unbind_device---of 1
clockevents_update_freq---of 16
current_device_show---of 7
tick_cleanup_dead_cpu---of 17
tick_offline_cpu---of 1
unbind_device_store---of 11
-----------
SUMMARY31%of 13

__skb_queue_purge---of 4
__tipc_dump_start---of 4
__tipc_nl_add_sk---of 9
__tipc_nl_add_sk_info---of 16
__tipc_sendmsg---of 100
__tipc_sendstream22%of 42
__tipc_shutdown21%of 39
msg_importance---of 7
net_generic25%of 16
nla_nest_cancel---of 4
rht_lock34%of 9
rht_unlock---of 10
tipc_accept---of 38
tipc_bind---of 12
tipc_connect---of 24
tipc_data_ready25%of 33
tipc_dump_done---of 1
tipc_dump_start---of 4
tipc_getname---of 6
tipc_getsockopt---of 18
tipc_ioctl---of 8
tipc_listen---of 3
tipc_nl_publ_dump---of 29
tipc_nl_sk_dump---of 1
tipc_nl_sk_walk---of 15
tipc_poll---of 31
tipc_recvmsg---of 38
tipc_recvstream---of 32
tipc_release24%of 91
tipc_send_group_bcast---of 16
tipc_send_group_msg---of 4
tipc_send_group_unicast---of 9
tipc_send_packet67%of 3
tipc_sendmsg---of 1
tipc_sendstream---of 1
tipc_setsockopt---of 31
tipc_shutdown---of 22
tipc_sk_anc_data_recv---of 14
tipc_sk_backlog_rcv---of 1
tipc_sk_bind---of 13
tipc_sk_callback---of 4
tipc_sk_create20%of 101
tipc_sk_dump---of 19
tipc_sk_fill_sock_diag---of 26
tipc_sk_filter_rcv14%of 124
tipc_sk_filtering14%of 15
tipc_sk_finish_conn42%of 12
tipc_sk_join---of 9
tipc_sk_leave17%of 12
tipc_sk_lookup18%of 41
tipc_sk_mcast_rcv---of 26
tipc_sk_overlimit1---of 8
tipc_sk_overlimit2---of 8
tipc_sk_proto_rcv---of 48
tipc_sk_publish---of 6
tipc_sk_push_backlog---of 17
tipc_sk_rcv9%of 79
tipc_sk_reinit---of 12
tipc_sk_respond67%of 3
tipc_sk_rht_destroy---of 1
tipc_sk_rht_init---of 1
tipc_sk_send_ack---of 5
tipc_sk_sock_err---of 14
tipc_sk_timeout---of 21
tipc_sk_withdraw---of 12
tipc_skb_dequeue---of 4
tipc_sock_destruct---of 4
tipc_sock_get_portid---of 3
tipc_socket_init---of 4
tipc_socket_stop---of 1
tipc_socketpair100%of 1
tipc_wait_for_connect---of 13
tipc_wait_for_rcvmsg---of 16
tipc_write_space---of 19
trace_tipc_sk_dump---of 17
trace_tipc_sk_rej_msg---of 17
trace_tipc_sk_sendmcast---of 17
tsk_advance_rx_queue---of 19
tsk_importance---of 7
tsk_peer_msg---of 6
tsk_set_importance---of 6
-----------
SUMMARY20%of 621

__proc_create27%of 19
__xlate_proc_name17%of 12
_proc_mkdir---of 4
pde_free---of 5
pde_put---of 7
proc_alloc_inum67%of 3
proc_create---of 5
proc_create_data---of 5
proc_create_mount_point---of 3
proc_create_reg50%of 4
proc_create_seq_private---of 4
proc_create_single_data---of 4
proc_free_inum---of 1
proc_get_parent_data---of 1
proc_getattr---of 4
proc_lookup---of 3
proc_lookup_de---of 14
proc_misc_d_delete---of 1
proc_misc_d_revalidate---of 3
proc_mkdir---of 3
proc_mkdir_data---of 3
proc_mkdir_mode---of 3
proc_net_d_revalidate---of 1
proc_notify_change---of 3
proc_readdir---of 3
proc_readdir_de---of 16
proc_register17%of 12
proc_remove---of 3
proc_seq_open---of 3
proc_seq_release---of 3
proc_set_size---of 1
proc_set_user---of 1
proc_simple_write---of 5
proc_single_open---of 1
proc_symlink---of 7
remove_proc_entry---of 15
remove_proc_subtree---of 17
-----------
SUMMARY26%of 50

__ia32_sys_madvise---of 1
__ia32_sys_process_madvise---of 1
__se_sys_process_madvise---of 22
__x64_sys_madvise---of 1
__x64_sys_process_madvise---of 1
anon_vma_name50%of 4
anon_vma_name_alloc---of 3
anon_vma_name_free---of 1
do_madvise---of 258
folio_get---of 3
folio_large_mapcount---of 9
folio_likely_mapped_shared---of 28
folio_lock---of 9
folio_mapcount---of 9
folio_put---of 4
madvise_cold_or_pageout_pte_range---of 191
madvise_dontneed_free_valid_vma---of 12
madvise_folio_pte_batch---of 40
madvise_free_pte_range---of 138
madvise_set_anon_name---of 21
madvise_update_vma---of 42
swapin_walk_pmd_entry---of 25
-----------
SUMMARY50%of 4

__find_nth_and_andnot_bit---of 13
__find_nth_and_bit---of 13
__find_nth_andnot_bit---of 13
__find_nth_bit---of 13
_find_first_and_and_bit---of 6
_find_first_and_bit---of 6
_find_first_bit---of 6
_find_first_zero_bit67%of 6
_find_last_bit60%of 5
_find_next_and_bit---of 6
_find_next_andnot_bit---of 6
_find_next_bit100%of 6
_find_next_or_bit---of 6
_find_next_zero_bit100%of 6
find_next_clump8---of 8
-----------
SUMMARY83%of 23

___cfg80211_scan_done---of 28
__cfg80211_bss_expire---of 11
__cfg80211_bss_update---of 132
__cfg80211_get_bss---of 51
__cfg80211_scan_done---of 3
__cfg80211_stop_sched_scan---of 8
__cfg80211_unlink_bss---of 26
bss_free---of 10
bss_ref_get---of 8
cfg80211_add_sched_scan_req---of 6
cfg80211_bss_age---of 4
cfg80211_bss_expire---of 1
cfg80211_bss_flush---of 3
cfg80211_bss_iter---of 8
cfg80211_bss_update---of 1
cfg80211_defragment_element12%of 17
cfg80211_find_elem_match25%of 8
cfg80211_find_vendor_elem---of 10
cfg80211_gen_new_ie---of 112
cfg80211_get_dev_from_ifindex---of 10
cfg80211_get_ies_channel_number---of 34
cfg80211_inform_bss_data---of 219
cfg80211_inform_bss_frame_data---of 27
cfg80211_inform_single_bss_data---of 121
cfg80211_is_element_inherited---of 16
cfg80211_iter_rnr---of 15
cfg80211_merge_profile---of 13
cfg80211_put_bss---of 14
cfg80211_ref_bss---of 10
cfg80211_scan---of 16
cfg80211_scan_6ghz---of 144
cfg80211_scan_done---of 23
cfg80211_sched_scan_req_possible---of 10
cfg80211_sched_scan_results---of 30
cfg80211_sched_scan_results_wk---of 7
cfg80211_sched_scan_stopped---of 1
cfg80211_sched_scan_stopped_locked---of 26
cfg80211_stop_sched_scan_req---of 28
cfg80211_unlink_bss---of 11
cfg80211_update_assoc_bss_entry---of 40
cfg80211_update_known_bss---of 57
cfg80211_wext_giwscan---of 70
cfg80211_wext_siwscan---of 51
cmp_bss---of 51
dev_hold---of 5
rdev_scan---of 18
trace_cfg80211_return_bss---of 15
trace_rdev_return_int---of 15
-----------
SUMMARY16%of 25

__set_oom_adj---of 58
auxv_open---of 3
auxv_read---of 6
comm_open---of 1
comm_show---of 5
comm_write---of 8
do_io_accounting---of 30
environ_open---of 3
environ_read---of 12
map_files_d_revalidate---of 30
map_files_get_link---of 29
mem_lseek---of 4
mem_open---of 3
mem_read---of 1
mem_release---of 4
mem_rw---of 16
mem_write---of 1
mmap_read_unlock---of 3
next_tgid---of 21
oom_adj_read---of 7
oom_adj_write---of 9
oom_score_adj_read---of 5
oom_score_adj_write---of 7
pid_delete_dentry100%of 1
pid_getattr---of 22
pid_revalidate29%of 14
pid_update_inode100%of 1
proc_apparmor_attr_dir_iterate---of 1
proc_apparmor_attr_dir_lookup---of 1
proc_attr_dir_lookup---of 1
proc_attr_dir_readdir---of 1
proc_coredump_filter_read---of 7
proc_coredump_filter_write---of 13
proc_cwd_link---of 7
proc_exe_link---of 7
proc_fail_nth_read---of 5
proc_fail_nth_write---of 6
proc_fault_inject_read---of 5
proc_fault_inject_write---of 9
proc_fd_access_allowed---of 5
proc_fill_cache---of 9
proc_flush_pid---of 1
proc_gid_map_open---of 1
proc_id_map_open---of 39
proc_id_map_release---of 5
proc_loginuid_read---of 5
proc_loginuid_write---of 22
proc_map_files_get_link---of 8
proc_map_files_instantiate---of 3
proc_map_files_lookup---of 29
proc_map_files_readdir---of 28
proc_mem_open---of 7
proc_oom_score---of 3
proc_pid_attr_open---of 3
proc_pid_attr_read---of 7
proc_pid_attr_write---of 31
proc_pid_cmdline_read---of 28
proc_pid_evict_inode---of 5
proc_pid_get_link---of 5
proc_pid_instantiate---of 4
proc_pid_ksm_merging_pages---of 3
proc_pid_ksm_stat---of 3
proc_pid_limits---of 10
proc_pid_lookup---of 26
proc_pid_make_inode50%of 4
proc_pid_permission37%of 11
proc_pid_personality---of 4
proc_pid_readdir---of 21
proc_pid_readlink---of 7
proc_pid_schedstat---of 1
proc_pid_stack---of 8
proc_pid_syscall---of 6
proc_pid_wchan---of 5
proc_pident_instantiate50%of 8
proc_pident_lookup55%of 11
proc_pident_readdir---of 16
proc_projid_map_open---of 1
proc_root_link---of 7
proc_sessionid_read---of 5
proc_setattr---of 4
proc_setgroups_open---of 43
proc_setgroups_release---of 5
proc_single_open---of 1
proc_single_show---of 5
proc_task_getattr---of 5
proc_task_instantiate---of 4
proc_task_lookup---of 29
proc_task_readdir---of 58
proc_tgid_base_lookup100%of 1
proc_tgid_base_readdir---of 1
proc_tgid_io_accounting---of 1
proc_tid_base_lookup---of 1
proc_tid_base_readdir---of 1
proc_tid_comm_permission---of 7
proc_tid_io_accounting---of 4
proc_timers_open---of 3
proc_uid_map_open---of 1
show_timer---of 3
task_dump_owner29%of 21
tgid_pidfd_to_pid---of 3
timens_offsets_open---of 1
timens_offsets_show---of 5
timens_offsets_write---of 27
timers_next---of 1
timers_start---of 4
timers_stop---of 7
timerslack_ns_open---of 1
timerslack_ns_show---of 28
timerslack_ns_write---of 31
-----------
SUMMARY41%of 72

ctnetlink_alloc_expect---of 21
ctnetlink_alloc_filter---of 32
ctnetlink_attach_labels---of 5
ctnetlink_change_helper---of 47
ctnetlink_change_protoinfo---of 4
ctnetlink_change_seq_adj---of 18
ctnetlink_change_synproxy---of 11
ctnetlink_conntrack_event6%of 70
ctnetlink_ct_stat_cpu_dump---of 27
ctnetlink_del_conntrack---of 29
ctnetlink_del_expect---of 15
ctnetlink_done---of 5
ctnetlink_done_list---of 5
ctnetlink_dump_acct---of 8
ctnetlink_dump_ct_seq_adj---of 9
ctnetlink_dump_ct_synproxy---of 11
ctnetlink_dump_dying---of 32
ctnetlink_dump_extinfo---of 21
ctnetlink_dump_helpinfo---of 31
ctnetlink_dump_id---of 1
ctnetlink_dump_labels---of 7
ctnetlink_dump_mark---of 3
ctnetlink_dump_master---of 5
ctnetlink_dump_protoinfo---of 4
ctnetlink_dump_secctx---of 5
ctnetlink_dump_table---of 69
ctnetlink_dump_timeout---of 5
ctnetlink_dump_timestamp---of 11
ctnetlink_dump_tuples---of 16
ctnetlink_dump_tuples_ip---of 7
ctnetlink_dump_unconfirmed---of 1
ctnetlink_dump_zone_id---of 4
ctnetlink_exp_ct_dump_table---of 38
ctnetlink_exp_done---of 3
ctnetlink_exp_dump_expect---of 46
ctnetlink_exp_dump_table---of 36
ctnetlink_exp_fill_info---of 7
ctnetlink_exp_stat_cpu_dump---of 20
ctnetlink_expect_event---of 14
ctnetlink_fill_info---of 41
ctnetlink_filter_match---of 17
ctnetlink_filter_match_tuple---of 44
ctnetlink_flush_iterate---of 1
ctnetlink_get_conntrack---of 19
ctnetlink_get_ct_dying---of 3
ctnetlink_get_ct_unconfirmed---of 3
ctnetlink_get_expect---of 44
ctnetlink_glue_attach_expect---of 9
ctnetlink_glue_build---of 60
ctnetlink_glue_build_size---of 7
ctnetlink_glue_parse---of 24
ctnetlink_glue_seqadj---of 3
ctnetlink_net_init---of 1
ctnetlink_net_pre_exit---of 1
ctnetlink_new_conntrack---of 137
ctnetlink_new_expect---of 47
ctnetlink_parse_nat_setup---of 42
ctnetlink_parse_tuple_filter---of 48
ctnetlink_start---of 9
ctnetlink_stat_ct---of 11
ctnetlink_stat_ct_cpu---of 3
ctnetlink_stat_exp_cpu---of 3
dump_counters---of 7
dump_ct_seq_adj---of 6
expect_iter_all---of 1
expect_iter_name---of 12
nf_conn_tstamp_find---of 5
nf_ct_put---of 5
nf_expect_get_id---of 4
nfct_help---of 5
nla_put_string---of 1
rcu_read_unlock---of 6
-----------
SUMMARY6%of 70

sctp_primitive_ABORT---of 3
sctp_primitive_ASCONF---of 3
sctp_primitive_ASSOCIATE67%of 3
sctp_primitive_RECONF---of 3
sctp_primitive_REQUESTHEARTBEAT---of 3
sctp_primitive_SEND---of 3
sctp_primitive_SHUTDOWN---of 3
-----------
SUMMARY67%of 3

__nf_nat_decode_session10%of 31
hash_by_src50%of 6
net_generic---of 16
nf_ct_nat_ext_add---of 9
nf_in_range14%of 22
nf_nat_alloc_null_binding---of 4
nf_nat_cleanup_conntrack---of 3
nf_nat_inet_fn33%of 43
nf_nat_packet---of 3
nf_nat_proto_clean---of 12
nf_nat_register_fn---of 24
nf_nat_setup_info7%of 133
nf_nat_unregister_fn---of 14
nfnetlink_parse_nat_setup---of 18
-----------
SUMMARY14%of 235

-----------
SUMMARY---of 0

lockref_get100%of 1
lockref_get_not_dead67%of 3
lockref_get_not_zero---of 3
lockref_mark_dead67%of 3
lockref_put_not_zero---of 3
lockref_put_or_lock---of 3
lockref_put_return100%of 1
-----------
SUMMARY75%of 8

-----------
SUMMARY---of 0

cap_bprm_creds_from_file---of 62
cap_capable38%of 8
cap_capget---of 16
cap_capset---of 15
cap_convert_nscap---of 16
cap_inode_getsecurity---of 25
cap_inode_killpriv---of 1
cap_inode_need_killpriv---of 1
cap_inode_removexattr---of 5
cap_inode_setxattr---of 4
cap_mmap_addr23%of 9
cap_mmap_file100%of 1
cap_ptrace_access_check---of 19
cap_ptrace_traceme---of 19
cap_safe_nice---of 22
cap_settime---of 1
cap_task_fix_setuid---of 28
cap_task_prctl---of 45
cap_task_setioprio---of 1
cap_task_setnice---of 1
cap_task_setscheduler---of 1
cap_vm_enough_memory---of 8
get_vfs_caps_from_disk---of 17
rootid_owns_currentns---of 6
-----------
SUMMARY34%of 18

ipv6_ext_hdr25%of 8
ipv6_find_hdr---of 56
ipv6_find_tlv---of 10
ipv6_skip_exthdr9%of 23
-----------
SUMMARY13%of 31

sctp_sched_dequeue_common---of 5
sctp_sched_dequeue_done---of 4
sctp_sched_fcfs_dequeue25%of 8
sctp_sched_fcfs_dequeue_done---of 1
sctp_sched_fcfs_enqueue---of 1
sctp_sched_fcfs_free_sid---of 1
sctp_sched_fcfs_get---of 1
sctp_sched_fcfs_init100%of 1
sctp_sched_fcfs_init_sid---of 1
sctp_sched_fcfs_sched_all100%of 1
sctp_sched_fcfs_set---of 1
sctp_sched_fcfs_unsched_all100%of 1
sctp_sched_get_sched---of 6
sctp_sched_get_value---of 4
sctp_sched_init_sid---of 1
sctp_sched_ops_from_stream100%of 1
sctp_sched_ops_init---of 1
sctp_sched_ops_register---of 3
sctp_sched_set_sched17%of 24
sctp_sched_set_value---of 5
-----------
SUMMARY28%of 36

arena_get_unmapped_area---of 11
arena_map_alloc---of 13
arena_map_check_btf---of 1
arena_map_delete_elem---of 1
arena_map_direct_value_addr---of 3
arena_map_free---of 4
arena_map_get_next_key---of 1
arena_map_lookup_elem---of 1
arena_map_mem_usage---of 1
arena_map_mmap---of 19
arena_map_peek_elem---of 1
arena_map_pop_elem---of 1
arena_map_push_elem---of 1
arena_map_update_elem---of 1
arena_vm_close---of 5
arena_vm_fault---of 11
bpf_arena_alloc_pages---of 17
bpf_arena_free_pages---of 40
bpf_arena_get_kern_vm_start67%of 3
bpf_arena_get_user_vm_start67%of 3
existing_page_cb---of 3
-----------
SUMMARY67%of 6

__ieee80211_recalc_idle---of 15
__ieee80211_recalc_txpower---of 26
ieee80211_activate_links_work---of 4
ieee80211_add_virtual_monitor---of 30
ieee80211_adjust_monitor_flags---of 9
ieee80211_assign_perm_addr---of 37
ieee80211_change_mac---of 47
ieee80211_check_concurrent_iface---of 33
ieee80211_check_queues---of 19
ieee80211_del_virtual_monitor---of 15
ieee80211_do_open---of 113
ieee80211_do_stop---of 88
ieee80211_get_stats64---of 1
ieee80211_idle_off---of 1
ieee80211_if_add---of 48
ieee80211_if_change_type---of 49
ieee80211_if_free---of 1
ieee80211_if_remove---of 14
ieee80211_if_setup---of 1
ieee80211_iface_exit---of 1
ieee80211_iface_init---of 1
ieee80211_iface_work6%of 54
ieee80211_monitor_select_queue---of 6
ieee80211_netdev_fill_forward_path---of 55
ieee80211_netdev_setup_tc---of 20
ieee80211_open---of 5
ieee80211_recalc_idle---of 3
ieee80211_recalc_offload---of 46
ieee80211_recalc_txpower---of 5
ieee80211_remove_interfaces---of 19
ieee80211_sdata_stop---of 3
ieee80211_set_multicast_list---of 4
ieee80211_set_sdata_offload_flags---of 11
ieee80211_setup_sdata---of 19
ieee80211_stop---of 16
ieee80211_uninit---of 3
ieee80211_vif_dec_num_mcast---of 4
ieee80211_vif_inc_num_mcast---of 4
netdev_notify---of 6
trace_drv_return_int---of 15
trace_drv_return_void---of 15
-----------
SUMMARY6%of 54

-----------
SUMMARY---of 0

__ppp_channel_push---of 8
__ppp_xmit_process20%of 52
net_generic25%of 16
netif_wake_queue---of 3
ppp_bridge_channels---of 28
ppp_ccp_closed---of 5
ppp_ccp_peek---of 30
ppp_channel_index---of 3
ppp_channel_push---of 6
ppp_compat_ioctl---of 19
ppp_connect_channel---of 19
ppp_destroy_channel---of 6
ppp_destroy_interface---of 9
ppp_dev_configure---of 18
ppp_dev_init---of 7
ppp_dev_name---of 4
ppp_dev_priv_destructor---of 4
ppp_dev_uninit---of 1
ppp_disconnect_channel---of 9
ppp_do_recv---of 80
ppp_exit_net---of 15
ppp_fill_forward_path---of 5
ppp_find_channel---of 9
ppp_get_filter---of 6
ppp_get_stats64---of 1
ppp_init_net---of 1
ppp_input---of 40
ppp_input_error---of 5
ppp_ioctl6%of 76
ppp_net_siocdevprivate---of 9
ppp_nl_dellink---of 1
ppp_nl_fill_info---of 1
ppp_nl_get_link_net---of 1
ppp_nl_get_size---of 1
ppp_nl_newlink---of 9
ppp_nl_validate---of 4
ppp_open---of 1
ppp_output_wakeup---of 3
ppp_poll---of 9
ppp_push7%of 46
ppp_read42%of 17
ppp_receive_nonmp_frame---of 80
ppp_register_channel---of 1
ppp_register_compressor---of 8
ppp_register_net_channel---of 7
ppp_release---of 10
ppp_set_compress---of 23
ppp_setup---of 1
ppp_start_xmit---of 17
ppp_unbridge_channels---of 19
ppp_unit_number---of 4
ppp_unregister_channel---of 7
ppp_unregister_compressor---of 7
ppp_write23%of 9
ppp_xmit_process45%of 9
proto_to_npindex---of 8
refcount_dec_and_test---of 4
refcount_inc---of 4
-----------
SUMMARY16%of 225

__ia32_compat_sys_timer_create---of 4
__ia32_sys_clock_adjtime---of 10
__ia32_sys_clock_adjtime32---of 10
__ia32_sys_clock_getres---of 8
__ia32_sys_clock_getres_time32---of 8
__ia32_sys_clock_gettime---of 7
__ia32_sys_clock_gettime32---of 7
__ia32_sys_clock_nanosleep---of 1
__ia32_sys_clock_nanosleep_time32---of 1
__ia32_sys_clock_settime---of 8
__ia32_sys_clock_settime32---of 8
__ia32_sys_timer_create---of 4
__ia32_sys_timer_delete---of 1
__ia32_sys_timer_getoverrun---of 3
__ia32_sys_timer_gettime---of 1
__ia32_sys_timer_gettime32---of 1
__ia32_sys_timer_settime---of 5
__ia32_sys_timer_settime32---of 5
__lock_timer---of 23
__se_sys_clock_nanosleep---of 10
__se_sys_clock_nanosleep_time32---of 10
__se_sys_timer_delete---of 11
__se_sys_timer_gettime---of 5
__se_sys_timer_gettime32---of 5
__x64_compat_sys_timer_create---of 4
__x64_sys_clock_adjtime---of 10
__x64_sys_clock_adjtime32---of 10
__x64_sys_clock_getres---of 8
__x64_sys_clock_getres_time32---of 8
__x64_sys_clock_gettime43%of 7
__x64_sys_clock_gettime32---of 7
__x64_sys_clock_nanosleep---of 1
__x64_sys_clock_nanosleep_time32---of 1
__x64_sys_clock_settime---of 8
__x64_sys_clock_settime32---of 8
__x64_sys_timer_create---of 4
__x64_sys_timer_delete---of 1
__x64_sys_timer_getoverrun---of 3
__x64_sys_timer_gettime---of 1
__x64_sys_timer_gettime32---of 1
__x64_sys_timer_settime---of 5
__x64_sys_timer_settime32---of 5
common_hrtimer_arm---of 7
common_hrtimer_forward---of 1
common_hrtimer_rearm---of 1
common_hrtimer_remaining---of 1
common_hrtimer_try_to_cancel---of 1
common_nsleep---of 1
common_nsleep_timens---of 4
common_timer_create---of 1
common_timer_del---of 3
common_timer_get---of 8
common_timer_set---of 10
common_timer_wait_running---of 1
do_clock_adjtime---of 7
do_timer_create---of 53
do_timer_settime---of 13
exit_itimers---of 14
k_itimer_rcu_free---of 1
posix_clock_realtime_adj---of 1
posix_clock_realtime_set---of 1
posix_get_boottime_ktime---of 1
posix_get_boottime_timespec---of 1
posix_get_coarse_res---of 1
posix_get_hrtimer_res---of 1
posix_get_monotonic_coarse---of 1
posix_get_monotonic_ktime---of 1
posix_get_monotonic_raw---of 1
posix_get_monotonic_timespec---of 1
posix_get_realtime_coarse---of 1
posix_get_realtime_ktime---of 1
posix_get_realtime_timespec100%of 1
posix_get_tai_ktime---of 1
posix_get_tai_timespec---of 1
posix_timer_event---of 1
posix_timer_fn---of 6
posixtimer_rearm---of 5
timer_wait_running---of 13
-----------
SUMMARY50%of 8

aa_get_buffer---of 31
aa_get_current_label34%of 9
aa_get_newest_label6%of 35
aa_put_buffer---of 19
aa_sock_msg_perm56%of 9
aa_sock_opt_perm58%of 7
aa_sock_perm58%of 7
apparmor_bprm_committed_creds---of 11
apparmor_bprm_committing_creds---of 10
apparmor_capable46%of 11
apparmor_capget---of 31
apparmor_cred_alloc_blank---of 3
apparmor_cred_free---of 9
apparmor_cred_prepare---of 5
apparmor_cred_transfer---of 5
apparmor_current_getsecid_subj58%of 7
apparmor_dointvec---of 4
apparmor_file_alloc_security40%of 5
apparmor_file_free_security34%of 6
apparmor_file_lock---of 1
apparmor_file_mprotect---of 4
apparmor_file_open39%of 13
apparmor_file_permission100%of 1
apparmor_file_receive---of 1
apparmor_file_truncate---of 3
apparmor_getprocattr---of 16
apparmor_getselfattr---of 16
apparmor_inet_conn_request---of 3
apparmor_inode_getattr---of 3
apparmor_ip_postroute34%of 6
apparmor_mmap_file50%of 4
apparmor_move_mount---of 9
apparmor_nf_register---of 1
apparmor_nf_unregister---of 1
apparmor_path_chmod---of 3
apparmor_path_chown---of 3
apparmor_path_link---of 4
apparmor_path_mkdir67%of 3
apparmor_path_mknod---of 3
apparmor_path_rename---of 12
apparmor_path_rmdir---of 4
apparmor_path_symlink---of 3
apparmor_path_truncate---of 3
apparmor_path_unlink---of 4
apparmor_ptrace_access_check---of 12
apparmor_ptrace_traceme---of 12
apparmor_sb_mount---of 13
apparmor_sb_pivotroot---of 7
apparmor_sb_umount---of 9
apparmor_setprocattr---of 3
apparmor_setselfattr---of 3
apparmor_sk_alloc_security50%of 6
apparmor_sk_clone_security30%of 17
apparmor_sk_free_security---of 9
apparmor_sock_graft67%of 3
apparmor_socket_accept56%of 9
apparmor_socket_bind56%of 9
apparmor_socket_connect56%of 9
apparmor_socket_create50%of 6
apparmor_socket_getpeername100%of 1
apparmor_socket_getpeersec_dgram100%of 1
apparmor_socket_getpeersec_stream---of 10
apparmor_socket_getsockname100%of 1
apparmor_socket_getsockopt100%of 1
apparmor_socket_listen58%of 7
apparmor_socket_post_create27%of 19
apparmor_socket_recvmsg100%of 1
apparmor_socket_sendmsg100%of 1
apparmor_socket_setsockopt100%of 1
apparmor_socket_shutdown100%of 1
apparmor_socket_sock_rcv_skb67%of 3
apparmor_task_alloc---of 13
apparmor_task_free---of 14
apparmor_task_getsecid_obj---of 5
apparmor_task_kill---of 26
apparmor_task_setrlimit---of 9
apparmor_uring_override_creds---of 20
apparmor_uring_sqpoll---of 15
apparmor_userns_create---of 5
audit_uring_cb---of 6
begin_current_label_crit_section37%of 11
common_file_perm50%of 8
common_perm56%of 9
do_setattr---of 17
end_current_label_crit_section45%of 9
param_get_aabool---of 5
param_get_aacompressionlevel---of 5
param_get_aaintbool---of 1
param_get_aalockpolicy---of 5
param_get_aauint---of 5
param_get_audit---of 5
param_get_mode---of 5
param_set_aabool---of 5
param_set_aacompressionlevel---of 4
param_set_aaintbool---of 4
param_set_aalockpolicy---of 5
param_set_aauint---of 4
param_set_audit---of 7
param_set_mode---of 7
-----------
SUMMARY42%of 256

-----------
SUMMARY---of 0

__get_vma_policy---of 4
__ia32_sys_get_mempolicy---of 1
__ia32_sys_mbind---of 1
__ia32_sys_migrate_pages---of 1
__ia32_sys_set_mempolicy---of 8
__ia32_sys_set_mempolicy_home_node---of 1
__mpol_dup---of 11
__mpol_equal---of 15
__mpol_put---of 3
__se_sys_get_mempolicy---of 90
__se_sys_mbind---of 94
__se_sys_migrate_pages---of 41
__se_sys_set_mempolicy_home_node---of 33
__x64_sys_get_mempolicy---of 1
__x64_sys_mbind---of 1
__x64_sys_migrate_pages---of 1
__x64_sys_set_mempolicy---of 8
__x64_sys_set_mempolicy_home_node---of 1
alloc_migration_target_by_mpol---of 29
alloc_pages_bulk_array_mempolicy_noprof4%of 88
alloc_pages_mpol_noprof16%of 19
alloc_pages_noprof50%of 6
apply_policy_zone---of 3
change_prot_numa---of 3
do_migrate_pages---of 31
do_set_mempolicy---of 12
folio_alloc_noprof---of 6
folio_large_mapcount---of 9
folio_likely_mapped_shared---of 36
get_il_weight---of 18
get_nodes---of 17
get_task_policy---of 5
get_vma_policy25%of 16
huge_node---of 1
init_nodemask_of_mempolicy---of 10
mbind_range---of 37
mempolicy_in_oom_domain---of 5
mempolicy_kobj_release---of 6
mempolicy_slab_node8%of 26
migrate_folio_add---of 23
mpol_free_shared_policy---of 8
mpol_misplaced---of 44
mpol_new---of 13
mpol_new_nodemask---of 3
mpol_new_preferred---of 4
mpol_parse_str---of 51
mpol_put_task_policy---of 4
mpol_rebind_default---of 1
mpol_rebind_mm---of 25
mpol_rebind_nodemask---of 6
mpol_rebind_preferred---of 1
mpol_rebind_task---of 8
mpol_set_nodemask---of 8
mpol_set_shared_policy---of 47
mpol_shared_policy_init---of 20
mpol_shared_policy_lookup---of 12
mpol_to_str---of 17
node_show---of 1
node_store---of 13
numa_default_policy---of 1
numa_nearest_node---of 10
page_rmappable_folio18%of 17
page_to_nid---of 3
page_zone---of 4
policy_nodemask9%of 36
queue_folios_hugetlb---of 43
queue_folios_pte_range---of 81
queue_pages_test_walk---of 28
read_mems_allowed_begin---of 11
sysfs_wi_release---of 6
vma_alloc_folio_noprof40%of 5
vma_dup_policy50%of 4
vma_migratable---of 14
vma_policy_mof---of 12
weighted_interleave_nid---of 40
weighted_interleave_nodes---of 15
-----------
SUMMARY12%of 217

-----------
SUMMARY---of 0

__bpf_trace_file_check_and_advance_wb_err---of 1
__bpf_trace_filemap_set_wb_err---of 1
__bpf_trace_mm_filemap_op_page_cache---of 1
__filemap_add_folio---of 101
__filemap_fdatawait_range---of 16
__filemap_fdatawrite_range---of 1
__filemap_get_folio---of 60
__filemap_remove_folio---of 31
__filemap_set_wb_err---of 15
__folio_lock---of 1
__folio_lock_killable---of 1
__folio_lock_or_retry---of 21
__generic_file_write_iter---of 8
__ia32_sys_cachestat---of 1
__probestub_file_check_and_advance_wb_err---of 1
__probestub_filemap_set_wb_err---of 1
__probestub_mm_filemap_add_to_page_cache---of 1
__probestub_mm_filemap_delete_from_page_cache---of 1
__se_sys_cachestat---of 48
__traceiter_file_check_and_advance_wb_err---of 4
__traceiter_filemap_set_wb_err---of 4
__traceiter_mm_filemap_add_to_page_cache---of 4
__traceiter_mm_filemap_delete_from_page_cache---of 4
__x64_sys_cachestat---of 1
count_memcg_event_mm---of 24
delete_from_page_cache_batch---of 47
do_read_cache_folio---of 38
do_read_cache_page---of 10
do_sync_mmap_readahead---of 17
file_check_and_advance_wb_err---of 16
file_fdatawait_range---of 1
file_write_and_wait_range---of 6
filemap_add_folio---of 29
filemap_alloc_folio_noprof---of 20
filemap_check_errors---of 5
filemap_fault---of 83
filemap_fault_recheck_pte_none---of 12
filemap_fdatawait_keep_errors---of 3
filemap_fdatawait_range---of 1
filemap_fdatawait_range_keep_errors---of 3
filemap_fdatawrite---of 1
filemap_fdatawrite_range---of 1
filemap_fdatawrite_wbc---of 5
filemap_flush---of 1
filemap_free_folio---of 20
filemap_get_entry---of 20
filemap_get_folios---of 1
filemap_get_folios_contig---of 44
filemap_get_folios_tag---of 35
filemap_get_pages---of 103
filemap_get_read_batch---of 50
filemap_invalidate_inode---of 6
filemap_invalidate_lock_two---of 5
filemap_invalidate_unlock_two---of 5
filemap_map_pages16%of 92
filemap_page_mkwrite---of 27
filemap_range_has_page---of 15
filemap_range_has_writeback---of 40
filemap_read---of 43
filemap_read_folio---of 34
filemap_release_folio---of 22
filemap_remove_folio---of 11
filemap_splice_read---of 28
filemap_unaccount_folio---of 65
filemap_write_and_wait_range---of 6
find_get_entries---of 37
find_lock_entries---of 68
folio_add_wait_queue---of 9
folio_contains---of 24
folio_end_private_2---of 17
folio_end_read---of 23
folio_end_writeback---of 30
folio_lock---of 9
folio_put---of 4
folio_size---of 10
folio_unlock25%of 16
folio_wait_bit---of 1
folio_wait_bit_common---of 58
folio_wait_bit_killable---of 1
folio_wait_private_2---of 10
folio_wait_private_2_killable---of 10
folio_wake_bit---of 10
generic_file_direct_write---of 8
generic_file_mmap---of 4
generic_file_read_iter---of 13
generic_file_readonly_mmap---of 5
generic_file_write_iter---of 9
generic_perform_write---of 15
inode_to_wb---of 6
kiocb_invalidate_pages---of 10
kiocb_invalidate_post_direct_write---of 5
kiocb_write_and_wait---of 9
mapping_read_folio_gfp---of 1
mapping_seek_hole_data---of 63
maybe_unlock_mmap_for_io---of 6
migration_entry_wait_on_locked---of 53
next_uptodate_folio16%of 52
page_cache_next_miss---of 5
page_cache_prev_miss---of 16
perf_trace_file_check_and_advance_wb_err---of 8
perf_trace_filemap_set_wb_err---of 8
perf_trace_mm_filemap_op_page_cache---of 15
read_cache_folio---of 1
read_cache_page---of 1
read_cache_page_gfp---of 1
release_fault_lock---of 13
replace_page_cache_folio---of 70
splice_folio_into_pipe---of 6
trace_event_raw_event_file_check_and_advance_wb_err---of 7
trace_event_raw_event_filemap_set_wb_err---of 7
trace_event_raw_event_mm_filemap_op_page_cache---of 14
trace_mm_filemap_delete_from_page_cache---of 15
trace_raw_output_file_check_and_advance_wb_err---of 3
trace_raw_output_filemap_set_wb_err---of 3
trace_raw_output_mm_filemap_op_page_cache---of 3
wake_page_function---of 10
xas_next---of 12
xas_next_entry36%of 17
xas_reload17%of 24
-----------
SUMMARY18%of 201

-----------
SUMMARY---of 0

ima_init_template_list---of 20
ima_restore_measurement_list---of 32
ima_template_desc_buf---of 3
ima_template_desc_current67%of 3
ima_template_has_modsig---of 7
lookup_template_desc---of 15
template_desc_init_fields---of 42
-----------
SUMMARY67%of 3

__cfg80211_alloc_event_skb---of 8
__cfg80211_alloc_reply_skb---of 4
__cfg80211_alloc_vendor_skb---of 16
__cfg80211_rdev_from_attrs---of 24
__cfg80211_send_event_skb---of 4
__cfg80211_wdev_from_attrs---of 46
__nl80211_set_channel---of 47
__nl80211_unexpected_frame---of 9
_nl80211_parse_chandef---of 43
cfg80211_assign_cookie---of 3
cfg80211_assoc_comeback---of 23
cfg80211_bss_color_notify---of 32
cfg80211_ch_switch_notify---of 44
cfg80211_ch_switch_started_notify---of 28
cfg80211_check_station_change---of 54
cfg80211_conn_failed---of 9
cfg80211_control_port_tx_status---of 1
cfg80211_cqm_beacon_loss_notify---of 5
cfg80211_cqm_pktloss_notify---of 19
cfg80211_cqm_rssi_notify---of 33
cfg80211_cqm_rssi_notify_work---of 17
cfg80211_cqm_rssi_update---of 15
cfg80211_cqm_txe_notify---of 7
cfg80211_crit_proto_stopped---of 9
cfg80211_del_sta_sinfo---of 20
cfg80211_external_auth_request---of 17
cfg80211_free_coalesce---of 9
cfg80211_ft_event---of 28
cfg80211_gtk_rekey_notify---of 25
cfg80211_links_removed---of 35
cfg80211_mgmt_tx_status_ext---of 1
cfg80211_nan_func_terminated---of 17
cfg80211_nan_match---of 27
cfg80211_net_detect_results---of 18
cfg80211_new_sta---of 20
cfg80211_notify_new_peer_candidate---of 28
cfg80211_off_channel_oper_allowed---of 14
cfg80211_pmksa_candidate_notify---of 28
cfg80211_prepare_cqm---of 11
cfg80211_probe_status---of 28
cfg80211_rdev_free_wowlan---of 8
cfg80211_ready_on_channel---of 16
cfg80211_remain_on_channel_expired---of 16
cfg80211_report_obss_beacon_khz---of 30
cfg80211_report_wowlan_wakeup---of 70
cfg80211_rx_control_port---of 35
cfg80211_rx_spurious_frame---of 18
cfg80211_rx_unexpected_4addr_frame---of 19
cfg80211_rx_unprot_mlme_mgmt---of 23
cfg80211_schedule_channels_check---of 4
cfg80211_sta_opmode_change_notify---of 17
cfg80211_tdls_oper_request---of 25
cfg80211_tx_mgmt_expired---of 16
cfg80211_update_owe_info_event---of 31
cfg80211_vendor_cmd_get_sender---of 4
cfg80211_vendor_cmd_reply---of 3
dev_hold---of 5
dev_put---of 4
genlmsg_multicast_netns---of 3
genlmsg_reply---of 1
get_key_callback---of 22
get_vlan---of 13
handle_nan_filter---of 19
he_build_mcs_mask---of 33
ieee80211_get_he_iftype_cap---of 8
nl80211_abort_scan---of 18
nl80211_add_commands_unsplit---of 110
nl80211_add_link---of 31
nl80211_add_link_station---of 1
nl80211_add_mod_link_station---of 43
nl80211_add_tx_ts---of 14
nl80211_assoc_bss---of 8
nl80211_associate---of 125
nl80211_authenticate---of 48
nl80211_build_scan_msg---of 4
nl80211_calculate_ap_params---of 53
nl80211_cancel_remain_on_channel---of 17
nl80211_ch_switch_notify---of 15
nl80211_channel_switch---of 39
nl80211_check_scan_flags---of 41
nl80211_color_change---of 22
nl80211_common_reg_change_event---of 34
nl80211_connect---of 97
nl80211_crit_protocol_start---of 24
nl80211_crit_protocol_stop---of 17
nl80211_crypto_settings---of 40
nl80211_deauthenticate---of 13
nl80211_del_interface---of 4
nl80211_del_key---of 36
nl80211_del_mpath---of 17
nl80211_del_pmk---of 22
nl80211_del_pmksa---of 32
nl80211_del_station---of 40
nl80211_del_tx_ts---of 19
nl80211_disassociate---of 13
nl80211_disconnect---of 10
nl80211_dump_interface---of 22
nl80211_dump_mpath---of 22
nl80211_dump_mpp---of 22
nl80211_dump_scan---of 107
nl80211_dump_station---of 22
nl80211_dump_survey---of 79
nl80211_dump_wiphy---of 25
nl80211_dump_wiphy_done---of 1
nl80211_dump_wiphy_parse---of 13
nl80211_exit---of 1
nl80211_external_auth---of 29
nl80211_flush_pmksa---of 19
nl80211_frame_tx_status---of 49
nl80211_get_coalesce---of 21
nl80211_get_ftm_responder_stats---of 55
nl80211_get_interface---of 4
nl80211_get_key---of 28
nl80211_get_mesh_config---of 70
nl80211_get_mpath---of 22
nl80211_get_mpp---of 22
nl80211_get_power_save---of 6
nl80211_get_protocol_features---of 5
nl80211_get_reg_do---of 40
nl80211_get_reg_dump---of 26
nl80211_get_station---of 7
nl80211_get_wiphy---of 4
nl80211_get_wowlan---of 34
nl80211_join_ibss---of 55
nl80211_join_mesh---of 86
nl80211_join_ocb---of 3
nl80211_key_allowed---of 15
nl80211_leave_ibss---of 4
nl80211_leave_mesh---of 1
nl80211_leave_ocb---of 1
nl80211_michael_mic_failure---of 15
nl80211_modify_link_station---of 1
nl80211_msg_put_channel---of 87
nl80211_msg_put_wmm_rules---of 9
nl80211_nan_add_func---of 55
nl80211_nan_change_config---of 32
nl80211_nan_del_func---of 21
nl80211_netlink_notify33%of 31
nl80211_new_interface---of 62
nl80211_new_key---of 26
nl80211_new_mpath---of 19
nl80211_new_station---of 101
nl80211_notify_iface---of 5
nl80211_notify_radar_detection---of 11
nl80211_notify_wiphy---of 7
nl80211_parse_beacon---of 63
nl80211_parse_chandef---of 1
nl80211_parse_connkeys---of 39
nl80211_parse_counter_offsets---of 11
nl80211_parse_fils_discovery---of 10
nl80211_parse_he_obss_pd---of 15
nl80211_parse_key---of 41
nl80211_parse_key_new---of 21
nl80211_parse_mbssid_config---of 23
nl80211_parse_mcast_rate---of 31
nl80211_parse_mesh_config---of 77
nl80211_parse_mon_options---of 30
nl80211_parse_random_mac---of 7
nl80211_parse_sched_scan---of 125
nl80211_parse_sched_scan_plans---of 19
nl80211_parse_sta_wme---of 10
nl80211_parse_tx_bitrate_mask---of 118
nl80211_parse_unsol_bcast_probe_resp---of 8
nl80211_parse_wowlan_nd---of 5
nl80211_parse_wowlan_tcp---of 38
nl80211_post_doit---of 16
nl80211_pre_doit---of 42
nl80211_prep_scan_msg---of 38
nl80211_prepare_wdev_dump---of 15
nl80211_probe_client---of 25
nl80211_probe_mesh_link---of 13
nl80211_put_iface_combinations---of 31
nl80211_put_iftype_akm_suites---of 16
nl80211_put_iftypes---of 8
nl80211_put_mbssid_support---of 9
nl80211_put_regdom---of 22
nl80211_put_sar_specs---of 13
nl80211_put_signal---of 15
nl80211_put_sta_rate---of 55
nl80211_put_tid_config_support---of 16
nl80211_put_txq_stats---of 35
nl80211_radar_notify---of 12
nl80211_register_beacons---of 7
nl80211_register_mgmt---of 20
nl80211_register_unexpected_frame---of 6
nl80211_reload_regdb---of 1
nl80211_remain_on_channel---of 19
nl80211_remove_link---of 4
nl80211_remove_link_station---of 19
nl80211_req_set_reg---of 10
nl80211_send_ap_started---of 14
nl80211_send_ap_stopped---of 11
nl80211_send_assoc_timeout---of 1
nl80211_send_auth_timeout---of 1
nl80211_send_band_rateinfo---of 67
nl80211_send_beacon_hint_event---of 13
nl80211_send_chandef---of 19
nl80211_send_coalesce---of 3
nl80211_send_connect_result---of 65
nl80211_send_deauth---of 1
nl80211_send_disassoc---of 1
nl80211_send_disconnected---of 14
nl80211_send_ibss_bssid---of 8
nl80211_send_iface---of 63
nl80211_send_mgmt---of 27
nl80211_send_mgmt_stypes---of 21
nl80211_send_mlme_event---of 16
nl80211_send_mlme_timeout---of 8
nl80211_send_mpath---of 37
nl80211_send_pmsr_capa---of 39
nl80211_send_port_authorized---of 9
nl80211_send_regdom---of 16
nl80211_send_remain_on_chan_event---of 15
nl80211_send_roamed---of 58
nl80211_send_rx_assoc---of 1
nl80211_send_rx_auth---of 1
nl80211_send_scan_msg---of 4
nl80211_send_scan_start---of 5
nl80211_send_sched_scan---of 12
nl80211_send_station---of 178
nl80211_send_wiphy---of 260
nl80211_send_wowlan---of 36
nl80211_send_wowlan_nd---of 32
nl80211_send_wowlan_patterns---of 10
nl80211_send_wowlan_tcp---of 18
nl80211_send_wowlan_tcp_caps---of 14
nl80211_set_beacon---of 37
nl80211_set_bss---of 43
nl80211_set_channel---of 3
nl80211_set_coalesce---of 58
nl80211_set_cqm---of 59
nl80211_set_fils_aad---of 20
nl80211_set_hw_timestamp---of 20
nl80211_set_interface---of 24
nl80211_set_key---of 44
nl80211_set_mac_acl---of 20
nl80211_set_mcast_rate---of 55
nl80211_set_mpath---of 19
nl80211_set_multicast_to_unicast---of 19
nl80211_set_noack_map---of 17
nl80211_set_pmk---of 14
nl80211_set_pmksa---of 35
nl80211_set_power_save---of 20
nl80211_set_qos_map---of 28
nl80211_set_reg---of 31
nl80211_set_rekey_data---of 18
nl80211_set_sar_specs---of 30
nl80211_set_station---of 86
nl80211_set_tid_config---of 71
nl80211_set_ttlm---of 22
nl80211_set_tx_bitrate_mask---of 19
nl80211_set_wiphy---of 141
nl80211_set_wowlan---of 85
nl80211_start_ap---of 108
nl80211_start_nan---of 28
nl80211_start_p2p_device---of 24
nl80211_start_radar_detection---of 20
nl80211_start_sched_scan---of 25
nl80211_stop_ap---of 3
nl80211_stop_nan---of 3
nl80211_stop_p2p_device---of 4
nl80211_stop_sched_scan---of 10
nl80211_tdls_cancel_channel_switch---of 22
nl80211_tdls_channel_switch---of 15
nl80211_tdls_mgmt---of 26
nl80211_tdls_oper---of 19
nl80211_trigger_scan---of 92
nl80211_tx_control_port---of 38
nl80211_tx_mgmt---of 51
nl80211_tx_mgmt_cancel_wait---of 27
nl80211_update_connect_params---of 26
nl80211_update_ft_ies---of 18
nl80211_update_mesh_config---of 19
nl80211_update_owe_info---of 22
nl80211_validate_ap_phy_operation---of 11
nl80211_validate_key_link_id---of 14
nl80211_vendor_check_policy---of 7
nl80211_vendor_cmd---of 32
nl80211_vendor_cmd_dump---of 56
nl80211_wiphy_netns---of 10
nl80211hdr_put---of 1
nla_parse_nested---of 4
parse_acl_data---of 17
parse_station_flags---of 26
rcu_read_unlock---of 6
rdev_add_key---of 15
rdev_add_link_station---of 16
rdev_add_nan_func---of 15
rdev_add_station---of 15
rdev_add_tx_ts---of 17
rdev_change_station---of 15
rdev_channel_switch---of 15
rdev_color_change---of 15
rdev_get_antenna---of 30
rdev_get_channel---of 29
rdev_get_key---of 15
rdev_get_station---of 15
rdev_get_tx_power---of 29
rdev_get_txq_stats---of 15
rdev_mod_link_station---of 16
rdev_probe_mesh_link---of 15
rdev_remain_on_channel---of 15
rdev_set_antenna---of 15
rdev_set_ap_chanwidth---of 15
rdev_set_coalesce---of 17
rdev_set_cqm_rssi_config---of 15
rdev_set_cqm_rssi_range_config---of 15
rdev_set_default_beacon_key---of 15
rdev_set_default_key---of 15
rdev_set_default_mgmt_key---of 15
rdev_set_pmk---of 17
rdev_set_rekey_data---of 15
rdev_set_sar_specs---of 15
rdev_set_wiphy_params---of 16
rdev_start_ap---of 15
rdev_start_radar_detection---of 17
rdev_tdls_channel_switch---of 15
rdev_update_connect_params---of 15
trace_cfg80211_return_bool---of 15
trace_rdev_return_int---of 15
trace_rdev_return_int_cookie---of 15
trace_rdev_return_int_mpath_info---of 15
trace_rdev_return_int_station_info---of 15
trace_rdev_return_void---of 15
validate_acl_mac_addrs---of 6
validate_beacon_head---of 12
validate_beacon_tx_rate---of 54
validate_he_capa---of 6
validate_ie_attr---of 8
validate_pae_over_nl80211---of 5
validate_scan_freqs---of 16
vht_build_mcs_mask---of 33
-----------
SUMMARY33%of 31

-----------
SUMMARY---of 0

reciprocal_value67%of 3
reciprocal_value_adv---of 10
-----------
SUMMARY67%of 3

-----------
SUMMARY---of 0

__mptcp_expand_seq---of 4
__mptcp_make_csum---of 1
add_addr_generate_hmac---of 4
add_addr_hmac_valid---of 7
mptcp_do_fallback---of 9
mptcp_established_options17%of 99
mptcp_established_options_fastclose---of 9
mptcp_established_options_mp_fail---of 7
mptcp_established_options_rst---of 4
mptcp_get_options---of 98
mptcp_get_reset_option---of 4
mptcp_incoming_options3%of 107
mptcp_subflow_delegate---of 20
mptcp_syn_options40%of 5
mptcp_synack_options---of 6
mptcp_update_rcv_data_fin---of 6
mptcp_write_options10%of 75
sock_owned_by_user---of 5
-----------
SUMMARY10%of 286

tomoyo_assign_domain---of 26
tomoyo_assign_namespace---of 27
tomoyo_check_acl54%of 13
tomoyo_dump_page---of 27
tomoyo_find_next_domain---of 73
tomoyo_update_domain---of 23
tomoyo_update_policy---of 12
tomoyo_write_aggregator---of 21
tomoyo_write_transition_control---of 34
-----------
SUMMARY54%of 13

___pskb_trim---of 46
__alloc_skb46%of 11
__build_skb---of 3
__build_skb_around34%of 6
__consume_stateless_skb---of 1
__copy_skb_header36%of 14
__kfree_skb67%of 3
__kunmap_atomic---of 3
__napi_alloc_frag_align---of 21
__napi_kfree_skb---of 3
__netdev_alloc_frag_align---of 6
__netdev_alloc_skb---of 33
__pskb_copy_fclone---of 24
__pskb_pull_tail---of 61
__skb_checksum---of 35
__skb_checksum_complete---of 7
__skb_checksum_complete_head---of 7
__skb_clone100%of 3
__skb_complete_tx_timestamp---of 10
__skb_ext_alloc---of 3
__skb_ext_del---of 14
__skb_ext_put---of 14
__skb_ext_set---of 6
__skb_frag_ref34%of 9
__skb_frag_unref50%of 4
__skb_pad---of 19
__skb_send_sock---of 37
__skb_splice_bits---of 17
__skb_to_sgvec---of 38
__skb_tstamp_tx---of 24
__skb_unclone_keeptruesize---of 6
__skb_vlan_pop---of 16
__skb_warn_lro_forwarding---of 3
__skb_zcopy_downgrade_managed---of 4
__splice_segment---of 24
alloc_pages_node_noprof---of 3
alloc_skb_for_msg---of 3
alloc_skb_with_frags---of 30
build_skb28%of 11
build_skb_around---of 11
consume_skb50%of 8
csum_and_copy_from_iter_full---of 81
csum_block_add_ext---of 1
csum_partial_ext---of 1
drop_reasons_register_subsys---of 3
drop_reasons_unregister_subsys---of 3
kfree_skb_list_reason---of 33
kfree_skb_partial50%of 4
kfree_skbmem88%of 8
kmalloc_reserve45%of 9
local_lock_release---of 6
mm_account_pinned_pages---of 12
mm_unaccount_pinned_pages---of 3
msg_zerocopy_complete---of 16
msg_zerocopy_put_abort---of 3
msg_zerocopy_realloc---of 21
napi_alloc_skb---of 35
napi_build_skb---of 11
napi_consume_skb---of 12
napi_get_frags_check---of 1
napi_pp_put_page---of 12
napi_skb_cache_get---of 27
napi_skb_cache_put---of 27
napi_skb_free_stolen_head---of 7
net_zcopy_get---of 4
nf_reset_ct---of 5
pskb_carve---of 69
pskb_expand_head---of 41
pskb_extract---of 7
pskb_put---of 6
pskb_trim_rcsum_slow---of 7
put_page36%of 14
refcount_dec_and_test---of 4
sendmsg_locked---of 4
sendmsg_unlocked---of 3
sk_skb_reason_drop32%of 25
skb_abort_seq_read---of 4
skb_add_rx_frag_netmem---of 13
skb_append---of 1
skb_append_pagefrags---of 23
skb_attempt_defer_free14%of 23
skb_checksum---of 1
skb_checksum_setup---of 35
skb_checksum_setup_ip---of 20
skb_checksum_trimmed---of 24
skb_clone42%of 12
skb_clone_sk---of 13
skb_cloned67%of 3
skb_coalesce_rx_frag---of 5
skb_complete_tx_timestamp---of 16
skb_complete_wifi_ack---of 13
skb_condense---of 6
skb_copy29%of 7
skb_copy_and_csum_bits---of 29
skb_copy_and_csum_dev---of 8
skb_copy_bits7%of 29
skb_copy_expand---of 10
skb_copy_header---of 1
skb_copy_ubufs---of 78
skb_cow_data---of 40
skb_cow_data_for_xdp---of 3
skb_dequeue100%of 3
skb_dequeue_tail---of 3
skb_dump---of 35
skb_ensure_writable---of 14
skb_ensure_writable_head_tail---of 10
skb_errqueue_purge---of 9
skb_eth_pop---of 10
skb_eth_push---of 13
skb_expand_head---of 15
skb_ext_add---of 24
skb_fill_page_desc---of 10
skb_find_text---of 3
skb_frag_ref37%of 11
skb_head_is_locked---of 4
skb_headers_offset_update---of 5
skb_morph---of 3
skb_mpls_dec_ttl---of 17
skb_mpls_pop---of 16
skb_mpls_push---of 23
skb_mpls_update_lse---of 7
skb_orphan---of 4
skb_partial_csum_set---of 6
skb_pp_cow_data---of 39
skb_pp_frag_ref14%of 15
skb_prepare_for_shift25%of 8
skb_prepare_seq_read---of 1
skb_pull50%of 6
skb_pull_data---of 7
skb_pull_rcsum34%of 9
skb_push67%of 3
skb_put50%of 4
skb_queue_head---of 1
skb_queue_purge_reason29%of 7
skb_queue_tail100%of 1
skb_rbtree_purge50%of 4
skb_realloc_headroom---of 5
skb_release_data49%of 29
skb_release_head_state70%of 13
skb_scrub_packet---of 13
skb_segment20%of 133
skb_segment_list---of 41
skb_send_sock---of 1
skb_send_sock_locked---of 1
skb_seq_read---of 29
skb_shift37%of 57
skb_splice_bits---of 3
skb_splice_from_iter---of 27
skb_split40%of 25
skb_store_bits---of 29
skb_to_sgvec---of 3
skb_to_sgvec_nomark---of 1
skb_trim---of 4
skb_try_coalesce35%of 46
skb_ts_finish---of 4
skb_ts_get_next_block---of 1
skb_tstamp_tx---of 1
skb_tx_error---of 10
skb_unlink---of 1
skb_vlan_pop---of 12
skb_vlan_push---of 16
skb_vlan_untag38%of 29
skb_zerocopy---of 49
skb_zerocopy_clone12%of 17
skb_zerocopy_headlen---of 7
skb_zerocopy_iter_stream---of 19
slab_build_skb---of 4
sock_dequeue_err_skb---of 10
sock_queue_err_skb---of 21
sock_rmem_free---of 1
sock_spd_release---of 1
trace_consume_skb27%of 15
virt_to_head_page---of 7
warn_crc32c_csum_combine---of 3
warn_crc32c_csum_update---of 3
-----------
SUMMARY33%of 638

-----------
SUMMARY---of 0

__cgroup_account_cputime60%of 5
__cgroup_account_cputime_field---of 12
__cgroup_rstat_lock---of 32
__cgroup_rstat_unlock---of 15
bpf_rstat_flush---of 1
cgroup_base_stat_cputime_show---of 6
cgroup_rstat_exit---of 7
cgroup_rstat_flush---of 1
cgroup_rstat_flush_hold---of 1
cgroup_rstat_flush_locked---of 99
cgroup_rstat_flush_release---of 1
cgroup_rstat_init50%of 8
cgroup_rstat_updated22%of 60
-----------
SUMMARY28%of 73

sctp_do_8_2_transport_strike---of 25
sctp_do_sm17%of 274
sctp_generate_autoclose_event---of 1
sctp_generate_heartbeat_event---of 13
sctp_generate_probe_event---of 11
sctp_generate_proto_unreach_event---of 11
sctp_generate_reconf_event---of 12
sctp_generate_sack_event---of 1
sctp_generate_t1_cookie_event---of 1
sctp_generate_t1_init_event---of 1
sctp_generate_t2_shutdown_event---of 1
sctp_generate_t3_rtx_event---of 11
sctp_generate_t4_rto_event---of 1
sctp_generate_t5_shutdown_guard_event---of 1
sctp_generate_timeout_event---of 12
-----------
SUMMARY17%of 274

-----------
SUMMARY---of 0

tomoyo_encode50%of 14
tomoyo_encode2---of 14
tomoyo_get_local_path---of 20
tomoyo_realpath_from_path40%of 20
tomoyo_realpath_nofollow---of 4
-----------
SUMMARY45%of 34

-----------
SUMMARY---of 0

__crc32c_pcl_intel_finup---of 10
crc32c_intel_cra_init---of 1
crc32c_intel_digest---of 7
crc32c_intel_final---of 1
crc32c_intel_finup---of 7
crc32c_intel_init---of 1
crc32c_intel_setkey---of 3
crc32c_intel_update---of 7
crc32c_pcl_intel_digest---of 1
crc32c_pcl_intel_finup---of 1
crc32c_pcl_intel_update70%of 10
-----------
SUMMARY70%of 10

__tipc_skb_queue_sorted---of 8
msg_importance---of 7
msg_set_importance---of 5
skb_orphan---of 4
tipc_buf_acquire---of 3
tipc_buf_append---of 17
tipc_msg_append---of 15
tipc_msg_assemble---of 9
tipc_msg_build20%of 21
tipc_msg_create---of 4
tipc_msg_extract---of 9
tipc_msg_fragment---of 14
tipc_msg_init67%of 3
tipc_msg_lookup_dest---of 12
tipc_msg_pskb_copy---of 7
tipc_msg_reassemble---of 8
tipc_msg_reverse30%of 10
tipc_msg_skb_clone---of 9
tipc_msg_try_bundle---of 20
tipc_msg_validate---of 18
tipc_own_addr---of 16
tipc_skb_reject---of 3
-----------
SUMMARY27%of 34

-----------
SUMMARY---of 0

__kobject_del---of 9
dynamic_kobj_release---of 3
kobj_attr_show---of 3
kobj_attr_store---of 3
kobj_child_ns_ops---of 5
kobj_kset_leave---of 4
kobj_ns_current_may_mount---of 4
kobj_ns_drop---of 5
kobj_ns_grab_current---of 4
kobj_ns_initial---of 4
kobj_ns_netlink---of 4
kobj_ns_ops---of 5
kobj_ns_type_register---of 4
kobj_ns_type_registered---of 3
kobject_add---of 5
kobject_add_internal19%of 37
kobject_create_and_add---of 6
kobject_del---of 3
kobject_get58%of 7
kobject_get_ownership67%of 3
kobject_get_path---of 11
kobject_get_unless_zero---of 8
kobject_init40%of 5
kobject_init_and_add67%of 3
kobject_move---of 15
kobject_namespace---of 7
kobject_put---of 24
kobject_rename---of 14
kobject_set_name---of 1
kobject_set_name_vargs34%of 6
kset_create_and_add---of 5
kset_find_obj---of 7
kset_get_ownership---of 4
kset_init---of 3
kset_register---of 5
kset_release---of 3
kset_unregister---of 4
-----------
SUMMARY32%of 61

__arch_prepare_bpf_trampoline---of 136
__bpf_arch_text_poke---of 16
add_2mod---of 5
arch_alloc_bpf_trampoline---of 1
arch_bpf_stack_walk---of 6
arch_bpf_trampoline_size---of 3
arch_free_bpf_trampoline---of 1
arch_prepare_bpf_dispatcher---of 1
arch_prepare_bpf_trampoline---of 5
arch_protect_bpf_trampoline---of 1
bpf_arch_poke_desc_update---of 12
bpf_arch_text_copy100%of 1
bpf_arch_text_invalidate---of 1
bpf_arch_text_poke---of 8
bpf_arch_uaddress_limit---of 1
bpf_int_jit_compile27%of 45
bpf_jit_free---of 7
bpf_jit_supports_arena---of 1
bpf_jit_supports_exceptions---of 1
bpf_jit_supports_insn---of 9
bpf_jit_supports_kfunc_call---of 1
bpf_jit_supports_percpu_insn---of 1
bpf_jit_supports_ptr_xchg---of 1
bpf_jit_supports_subprog_tailcalls---of 1
cmp_ips---of 1
do_jit6%of 500
emit_bpf_dispatcher---of 31
emit_ldx---of 29
emit_mov_imm3224%of 13
emit_mov_imm6440%of 5
emit_mov_reg---of 21
emit_stx---of 45
ex_handler_bpf---of 3
invoke_bpf_prog---of 40
jit_fill_hole100%of 1
maybe_emit_mod---of 16
save_args---of 20
-----------
SUMMARY9%of 565

-----------
SUMMARY---of 0

__xfrm_mode_beet_prep---of 17
__xfrm_mode_tunnel_prep---of 15
__xfrm_transport_prep---of 17
dev_hold---of 5
dev_put---of 4
netdev_put---of 4
netdev_tracker_alloc---of 3
validate_xmit_xfrm4%of 56
xfrm_dev_backlog---of 9
xfrm_dev_event---of 18
xfrm_dev_offload_ok---of 26
xfrm_dev_policy_add---of 25
xfrm_dev_resume---of 28
xfrm_dev_state_add---of 37
xfrm_outer_mode_prep---of 11
xmit_xfrm_check_overflow---of 7
-----------
SUMMARY4%of 56

-----------
SUMMARY---of 0

tcp_newreno_mark_lost---of 9
tcp_rack_advance60%of 5
tcp_rack_detect_loss---of 16
tcp_rack_mark_lost---of 6
tcp_rack_reo_timeout---of 7
tcp_rack_skb_timeout---of 1
tcp_rack_update_reo_wnd50%of 8
-----------
SUMMARY54%of 13

-----------
SUMMARY---of 0

compat_rawv6_ioctl---of 4
dst_output---of 8
fl6_sock_lookup---of 4
ipv6_can_nonlocal_bind---of 4
raw6_destroy---of 1
raw6_exit_net---of 1
raw6_getfrag---of 8
raw6_icmp_error---of 28
raw6_init_net---of 1
raw6_local_deliver11%of 38
raw6_proc_exit---of 1
raw6_seq_show---of 3
raw_v6_match---of 15
rawv6_bind---of 35
rawv6_close---of 3
rawv6_exit---of 1
rawv6_getsockopt---of 19
rawv6_init_sk---of 4
rawv6_ioctl---of 5
rawv6_mh_filter_register---of 1
rawv6_mh_filter_unregister---of 1
rawv6_push_pending_frames---of 27
rawv6_rcv---of 63
rawv6_rcv_skb---of 16
rawv6_recvmsg---of 36
rawv6_send_hdrinc---of 65
rawv6_sendmsg---of 83
rawv6_setsockopt---of 23
rcu_read_unlock---of 6
skb_dst---of 5
txopt_get---of 23
-----------
SUMMARY11%of 38

-----------
SUMMARY---of 0

__crypto_alg_lookup47%of 26
__crypto_alloc_tfm---of 1
__crypto_alloc_tfmgfp---of 14
crypto_alg_lookup15%of 14
crypto_alg_mod_lookup19%of 27
crypto_alloc_base---of 11
crypto_alloc_tfm_node24%of 13
crypto_clone_tfm---of 13
crypto_create_tfm_node17%of 12
crypto_destroy_tfm---of 14
crypto_find_alg---of 3
crypto_has_alg---of 6
crypto_larval_alloc---of 3
crypto_larval_destroy---of 7
crypto_larval_kill---of 7
crypto_larval_wait---of 17
crypto_mod_get---of 5
crypto_mod_put---of 5
crypto_probing_notify---of 3
crypto_req_done---of 3
crypto_shoot_alg---of 1
crypto_wait_for_test---of 6
-----------
SUMMARY27%of 92

-----------
SUMMARY---of 0

should_fail_usercopy100%of 1
-----------
SUMMARY100%of 1

_atomic_dec_and_lock43%of 7
_atomic_dec_and_lock_irqsave29%of 7
_atomic_dec_and_raw_lock---of 7
_atomic_dec_and_raw_lock_irqsave---of 7
-----------
SUMMARY36%of 14

__bpf_trace_emulate_vsyscall---of 1
__probestub_emulate_vsyscall---of 1
__traceiter_emulate_vsyscall---of 4
emulate_vsyscall4%of 50
gate_vma_name---of 1
get_gate_vma---of 4
in_gate_area---of 6
in_gate_area_no_mm100%of 1
perf_trace_emulate_vsyscall---of 8
secure_computing---of 3
trace_event_raw_event_emulate_vsyscall---of 7
trace_raw_output_emulate_vsyscall---of 3
warn_bad_vsyscall---of 4
write_ok_or_segv---of 3
-----------
SUMMARY6%of 51

__ipv6_addr_label20%of 15
addrlbl_ifindex_exists---of 11
ip6addrlbl_add---of 25
ip6addrlbl_del---of 12
ip6addrlbl_dump---of 30
ip6addrlbl_fill---of 9
ip6addrlbl_get---of 41
ip6addrlbl_net_exit---of 5
ip6addrlbl_net_init---of 7
ip6addrlbl_newdel---of 14
ipv6_addr_label31%of 13
ipv6_addr_label_cleanup---of 1
nlmsg_parse_deprecated_strict---of 4
-----------
SUMMARY25%of 28

__access_remote_vm---of 38
__apply_to_page_range28%of 70
__do_fault---of 21
__folio_rmap_sanity_checks---of 21
__get_locked_pte40%of 5
__might_fault50%of 4
__pmd_alloc16%of 13
__pte_alloc34%of 6
__pte_alloc_kernel---of 5
__pud_alloc---of 11
__vm_insert_mixed---of 8
_compound_head---of 7
access_process_vm---of 3
access_remote_vm---of 1
add_mm_rss_vec---of 9
apply_to_existing_page_range---of 1
apply_to_page_range100%of 1
clear_gigantic_page---of 4
clear_huge_page---of 11
copy_folio_from_user---of 17
copy_page_range---of 41
copy_pmd_range---of 495
copy_present_page---of 20
copy_user_gigantic_page---of 5
copy_user_large_folio---of 22
count_memcg_event_mm30%of 24
do_page_mkwrite---of 15
do_set_pmd---of 35
do_swap_page---of 260
do_wp_page8%of 311
fault_around_bytes_fops_open---of 1
fault_around_bytes_get---of 1
fault_around_bytes_set---of 4
fault_dirty_shared_page---of 22
finish_fault---of 40
finish_mkwrite_fault---of 10
folio_dup_file_rmap_ptes---of 10
folio_get---of 3
folio_large_mapcount---of 9
folio_lock---of 9
folio_lock_or_retry---of 9
folio_mapcount---of 9
folio_prealloc34%of 9
folio_pte_batch---of 41
folio_put---of 4
folio_try_dup_anon_rmap_ptes---of 155
follow_pte---of 18
free_pgd_range17%of 24
free_pgtables24%of 26
generic_access_phys---of 29
get_page---of 9
handle_mm_fault29%of 101
handle_pte_fault10%of 407
handle_pte_marker---of 3
insert_page_into_pte_locked---of 23
insert_pfn---of 29
lock_mm_and_find_vma34%of 21
lock_vma_under_rcu21%of 24
lruvec_stat_add_folio40%of 15
lruvec_stat_sub_folio---of 15
mm_counter---of 8
mm_counter_file---of 7
mm_trace_rss_stat27%of 15
mmap_read_lock_killable---of 5
mmap_read_trylock---of 5
mmap_read_unlock67%of 3
mmap_write_downgrade---of 6
mmap_write_unlock---of 6
numa_migrate_prep---of 7
numa_rebuild_single_mapping---of 13
pfn_swap_entry_folio---of 17
pfn_swap_entry_to_page---of 18
pfn_valid---of 29
pmd_install---of 3
print_bad_pte---of 16
print_vma_addr---of 11
pte_unmap---of 6
ptlock_alloc67%of 3
ptlock_free---of 1
put_page---of 14
put_swap_device---of 14
remap_pfn_range---of 4
remap_pfn_range_notrack---of 46
remove_device_exclusive_entry---of 31
restore_exclusive_pte---of 48
set_pte_range29%of 21
try_restore_exclusive_pte---of 15
unmap_mapping_folio---of 18
unmap_mapping_pages---of 3
unmap_mapping_range---of 5
unmap_mapping_range_tree---of 4
unmap_page_range3%of 204
unmap_single_vma34%of 12
unmap_vmas50%of 14
upgrade_mmap_lock_carefully---of 11
vm_insert_page---of 41
vm_insert_pages---of 55
vm_iomap_memory---of 7
vm_map_pages---of 6
vm_map_pages_zero---of 6
vm_normal_folio---of 8
vm_normal_folio_pmd---of 8
vm_normal_page34%of 9
vm_normal_page_pmd---of 12
vma_end_read---of 11
vma_pgtable_walk_begin---of 3
vma_pgtable_walk_end---of 3
vmf_anon_prepare---of 7
vmf_insert_mixed---of 1
vmf_insert_mixed_mkwrite---of 1
vmf_insert_pfn---of 1
vmf_insert_pfn_prot---of 11
walk_to_pmd34%of 12
wp_huge_pmd---of 9
wp_page_reuse---of 34
zap_page_range_single---of 13
zap_vma_ptes---of 6
-----------
SUMMARY15%of 1354

defrag4_net_exit---of 3
ipv4_conntrack_defrag22%of 19
nf_defrag_ipv4_disable---of 4
nf_defrag_ipv4_enable---of 5
-----------
SUMMARY22%of 19

__tun_build_skb30%of 10
__tun_chr_ioctl---of 102
__tun_detach---of 70
__tun_set_ebpf---of 11
bpf_prog_run_clear_cb---of 6
dev_sw_netstats_rx_add67%of 3
group_show---of 3
owner_show---of 3
pskb_may_pull---of 6
put_page---of 14
rcu_read_unlock34%of 6
set_offload---of 8
tun_attach---of 63
tun_attach_filter---of 12
tun_chr_close---of 12
tun_chr_compat_ioctl---of 1
tun_chr_fasync---of 4
tun_chr_ioctl---of 1
tun_chr_open---of 4
tun_chr_poll34%of 18
tun_chr_read_iter---of 10
tun_chr_show_fdinfo---of 5
tun_chr_write_iter38%of 8
tun_detach_filter---of 11
tun_device_event---of 51
tun_do_read---of 69
tun_fill_info---of 16
tun_flags_show---of 1
tun_flow_cleanup---of 12
tun_flow_create---of 6
tun_flow_uninit---of 10
tun_flow_update---of 41
tun_free_netdev---of 3
tun_get24%of 21
tun_get_addr_len---of 20
tun_get_channels---of 1
tun_get_coalesce---of 1
tun_get_drvinfo---of 4
tun_get_link_ksettings---of 1
tun_get_msglevel---of 1
tun_get_size---of 1
tun_get_socket---of 4
tun_get_tx_ring---of 4
tun_get_user11%of 191
tun_get_vnet_be---of 1
tun_napi_poll---of 12
tun_net_change_carrier---of 4
tun_net_close---of 1
tun_net_fix_features---of 1
tun_net_get_stats64---of 1
tun_net_init---of 4
tun_net_initialize---of 4
tun_net_mclist---of 1
tun_net_open---of 4
tun_net_uninit---of 46
tun_net_xmit---of 85
tun_not_capable---of 7
tun_peek_len---of 9
tun_prog_free---of 1
tun_ptr_free---of 4
tun_queue_purge---of 13
tun_recvmsg---of 14
tun_rx_batched19%of 11
tun_select_queue---of 28
tun_sendmsg---of 85
tun_set_coalesce---of 1
tun_set_ebpf---of 5
tun_set_headroom---of 1
tun_set_iff---of 34
tun_set_link_ksettings---of 1
tun_set_msglevel---of 1
tun_set_sndbuf---of 11
tun_set_vnet_be---of 3
tun_setup---of 1
tun_sock_write_space34%of 6
tun_validate---of 3
tun_xdp---of 26
tun_xdp_act---of 32
tun_xdp_xmit---of 35
update_filter---of 19
virtio_net_hdr_set_proto---of 7
virtio_net_hdr_to_skb7%of 62
-----------
SUMMARY15%of 336

clear_shadow_entry---of 6
folio_contains---of 24
folio_invalidate---of 3
folio_lock---of 9
folio_mapped---of 16
folio_size---of 10
generic_error_remove_folio---of 5
invalidate_inode_pages2---of 1
invalidate_inode_pages2_range---of 57
invalidate_mapping_pages---of 1
mapping_evict_folio---of 27
mapping_try_invalidate---of 20
pagecache_isize_extended---of 13
truncate_cleanup_folio---of 20
truncate_folio_batch_exceptionals---of 26
truncate_inode_folio---of 3
truncate_inode_pages---of 1
truncate_inode_pages_final67%of 3
truncate_inode_pages_range4%of 59
truncate_inode_partial_folio---of 33
truncate_pagecache---of 1
truncate_pagecache_range---of 3
truncate_setsize---of 3
-----------
SUMMARY7%of 62

-----------
SUMMARY---of 0

__crypto_register_alg---of 15
crypto_alg_extsize100%of 1
crypto_alg_finish_registration---of 18
crypto_alg_tested---of 26
crypto_attr_alg_name---of 5
crypto_check_attr_type---of 7
crypto_dequeue_request---of 6
crypto_destroy_instance---of 1
crypto_destroy_instance_workfn---of 1
crypto_drop_spawn---of 7
crypto_enqueue_request---of 7
crypto_enqueue_request_head---of 5
crypto_get_attr_type---of 5
crypto_grab_spawn---of 7
crypto_inc---of 8
crypto_init_queue---of 1
crypto_inst_setname---of 3
crypto_lookup_template---of 14
crypto_register_alg---of 21
crypto_register_algs---of 7
crypto_register_instance---of 30
crypto_register_notifier---of 1
crypto_register_template---of 5
crypto_register_templates---of 10
crypto_remove_final---of 10
crypto_remove_instance---of 12
crypto_remove_spawns---of 38
crypto_spawn_alg---of 10
crypto_spawn_tfm---of 5
crypto_spawn_tfm2---of 4
crypto_type_has_alg---of 3
crypto_unregister_alg---of 16
crypto_unregister_algs---of 4
crypto_unregister_instance---of 10
crypto_unregister_notifier---of 1
crypto_unregister_template---of 23
crypto_unregister_templates---of 4
-----------
SUMMARY100%of 1

attach_dn---of 7
dnotify_flush10%of 20
dnotify_free_mark---of 3
dnotify_handle_event---of 13
dnotify_recalc_inode_mask---of 7
fcntl_dirnotify---of 31
fsnotify_group_unlock---of 3
-----------
SUMMARY10%of 20

-----------
SUMMARY---of 0

copy_time_ns17%of 12
do_timens_ktime_to_host---of 5
find_timens_vvar_page---of 3
free_time_ns---of 5
proc_timens_set_offset---of 33
proc_timens_show_offsets---of 9
timens_commit---of 5
timens_for_children_get---of 5
timens_get---of 5
timens_install---of 16
timens_on_fork---of 8
timens_owner---of 1
timens_put---of 4
-----------
SUMMARY17%of 12

sha224_final---of 3
sha25660%of 5
sha256_final67%of 3
sha256_transform_blocks80%of 5
sha256_update---of 8
-----------
SUMMARY70%of 13

-----------
SUMMARY---of 0

proc_self_get_link50%of 4
proc_setup_self---of 4
-----------
SUMMARY50%of 4

__bpf_trace_ack_update_msk---of 1
__bpf_trace_mptcp_dump_mpext---of 1
__bpf_trace_mptcp_subflow_get_send---of 1
__bpf_trace_subflow_check_data_avail---of 1
__mptcp_check_push24%of 13
__mptcp_clean_una22%of 46
__mptcp_clean_una_wakeup50%of 12
__mptcp_close---of 38
__mptcp_close_ssk---of 55
__mptcp_data_acked40%of 10
__mptcp_destroy_sock---of 18
__mptcp_error_report---of 6
__mptcp_finish_join---of 9
__mptcp_init_sock---of 3
__mptcp_move_skbs---of 26
__mptcp_move_skbs_from_subflow---of 76
__mptcp_nmpc_sk24%of 13
__mptcp_ofo_queue---of 17
__mptcp_push_pending---of 30
__mptcp_retrans---of 38
__mptcp_retransmit_pending_data---of 12
__mptcp_rmem_reclaim---of 4
__mptcp_subflow_error_report---of 15
__mptcp_subflow_push_pending---of 28
__mptcp_subflow_send_ack---of 5
__mptcp_sync_sndbuf---of 15
__mptcp_unaccepted_force_close---of 1
__mptcp_update_rmem---of 5
__mptcp_wr_shutdown---of 6
__probestub_ack_update_msk---of 1
__probestub_get_mapping_status---of 1
__probestub_mptcp_sendmsg_frag---of 1
__probestub_mptcp_subflow_get_send---of 1
__probestub_subflow_check_data_avail---of 1
__subflow_push_pending---of 34
__traceiter_ack_update_msk---of 4
__traceiter_get_mapping_status---of 4
__traceiter_mptcp_sendmsg_frag---of 4
__traceiter_mptcp_subflow_get_send---of 4
__traceiter_subflow_check_data_avail---of 4
dfrag_clear---of 22
lock_sock_fast---of 1
mptcp_bind---of 9
mptcp_cancel_work---of 6
mptcp_check_and_set_pending---of 6
mptcp_check_data_fin---of 26
mptcp_check_listen_stop---of 9
mptcp_check_send_data_fin---of 14
mptcp_close---of 10
mptcp_close_ssk---of 6
mptcp_close_state---of 12
mptcp_connect30%of 41
mptcp_copy_inaddrs55%of 11
mptcp_data_ready---of 34
mptcp_destroy---of 3
mptcp_destroy_common---of 14
mptcp_disconnect---of 17
mptcp_do_fastclose---of 11
mptcp_drop---of 1
mptcp_enter_memory_pressure---of 13
mptcp_finish_connect---of 5
mptcp_finish_join---of 27
mptcp_forward_alloc_get---of 3
mptcp_get_port---of 7
mptcp_hash---of 1
mptcp_init_sock---of 10
mptcp_ioctl---of 14
mptcp_ioctl_outq---of 13
mptcp_listen---of 25
mptcp_napi_poll---of 16
mptcp_poll10%of 53
mptcp_rcv_space_init67%of 3
mptcp_recvmsg---of 112
mptcp_release_cb28%of 40
mptcp_reset_tout_timer---of 4
mptcp_retransmit_timer---of 13
mptcp_rfree---of 1
mptcp_rmem_uncharge---of 9
mptcp_schedule_work---of 8
mptcp_send_ack---of 10
mptcp_send_active_reset_reason---of 9
mptcp_send_next---of 3
mptcp_sendmsg13%of 78
mptcp_sendmsg_fastopen25%of 16
mptcp_sendmsg_frag---of 119
mptcp_set_owner_r---of 7
mptcp_set_state20%of 10
mptcp_shutdown---of 6
mptcp_sk_clone_init---of 74
mptcp_sock_graft---of 1
mptcp_stream_accept---of 30
mptcp_stream_memory_free56%of 9
mptcp_subflow_active---of 9
mptcp_subflow_delegate---of 20
mptcp_subflow_get_retrans---of 13
mptcp_subflow_get_send---of 40
mptcp_subflow_process_delegated---of 29
mptcp_subflow_set_active---of 4
mptcp_subflow_shutdown---of 19
mptcp_sync_mss---of 1
mptcp_tout_timer---of 4
mptcp_try_coalesce---of 8
mptcp_unhash---of 1
mptcp_worker---of 70
perf_trace_ack_update_msk---of 8
perf_trace_mptcp_dump_mpext---of 9
perf_trace_mptcp_subflow_get_send---of 26
perf_trace_subflow_check_data_avail---of 8
sk_wake_async---of 17
sock_owned_by_user---of 5
trace_event_raw_event_ack_update_msk---of 7
trace_event_raw_event_mptcp_dump_mpext---of 8
trace_event_raw_event_mptcp_subflow_get_send---of 25
trace_event_raw_event_subflow_check_data_avail---of 7
trace_raw_output_ack_update_msk---of 3
trace_raw_output_mptcp_dump_mpext---of 3
trace_raw_output_mptcp_subflow_get_send---of 3
trace_raw_output_subflow_check_data_avail---of 3
unlock_sock_fast---of 3
-----------
SUMMARY24%of 355

-----------
SUMMARY---of 0

copy_fpstate_to_sigframe21%of 48
fpu__alloc_mathframe50%of 4
fpu__restore_sig---of 73
os_xrstor_safe---of 8
os_xsave---of 5
-----------
SUMMARY24%of 52

-----------
SUMMARY---of 0

sysfs_create_link---of 3
sysfs_create_link_nowarn---of 3
sysfs_create_link_sd---of 1
sysfs_delete_link---of 4
sysfs_do_create_link_sd---of 6
sysfs_remove_link100%of 1
sysfs_rename_link_ns---of 7
-----------
SUMMARY100%of 1

__smc_connect---of 69
__smc_create---of 6
__smc_release---of 24
sk_set_bit---of 3
smc_accept---of 25
smc_accept_dequeue---of 19
smc_bind---of 10
smc_check_ism_v2_match---of 22
smc_clcsock_data_ready---of 10
smc_close_non_accepted---of 12
smc_conn_abort---of 4
smc_conn_save_peer_info---of 37
smc_conn_save_peer_info_fce---of 35
smc_connect---of 31
smc_connect_check_aclc---of 8
smc_connect_clc---of 3
smc_connect_decline_fallback---of 11
smc_connect_fallback---of 11
smc_connect_ism---of 31
smc_connect_ism_vlan_cleanup---of 4
smc_connect_rdma---of 54
smc_connect_work---of 24
smc_copy_sock_settings_to_clc---of 5
smc_create---of 5
smc_create_clcsk---of 5
smc_curs_copy---of 1
smc_destruct---of 3
smc_fback_data_ready67%of 3
smc_fback_error_report---of 3
smc_fback_forward_wakeup21%of 29
smc_fback_mark_woken100%of 1
smc_fback_state_change67%of 3
smc_fback_write_space---of 3
smc_fill_gid_list---of 5
smc_getname---of 5
smc_getsockopt---of 9
smc_hash_sk---of 8
smc_hs_congested---of 3
smc_ioctl---of 21
smc_listen---of 20
smc_listen_find_device---of 114
smc_listen_out---of 12
smc_listen_out_err---of 5
smc_listen_rdma_finish---of 14
smc_listen_rdma_reg---of 7
smc_listen_v2_check---of 28
smc_listen_work---of 46
smc_net_exit---of 1
smc_net_init---of 3
smc_net_stat_exit---of 1
smc_net_stat_init---of 1
smc_nl_disable_hs_limitation---of 1
smc_nl_dump_hs_limitation---of 7
smc_nl_enable_hs_limitation---of 1
smc_poll---of 16
smc_recvmsg---of 17
smc_release---of 21
smc_release_cb67%of 3
smc_rx_data_available---of 1
smc_sendmsg---of 18
smc_set_keepalive---of 1
smc_setsockopt---of 39
smc_shutdown13%of 31
smc_sk_init---of 1
smc_splice_read---of 12
smc_switch_to_fallback---of 42
smc_tcp_listen_work---of 44
smc_tcp_syn_recv_sock---of 6
smc_tx_prepared_sends---of 3
smc_ulp_clone---of 1
smc_ulp_init---of 12
smc_unhash_sk---of 10
smcr_clnt_conf_first_link---of 12
smcr_lgr_reg_rmbs---of 30
smcr_lgr_reg_sndbufs---of 11
-----------
SUMMARY25%of 70

-----------
SUMMARY---of 0

__put_user_ns---of 1
cmp_extents_forward---of 1
cmp_extents_reverse---of 1
cmp_map_id---of 1
create_user_ns---of 55
current_in_userns---of 4
enforced_nproc_rlimit---of 4
free_user_ns---of 10
from_kgid13%of 24
from_kgid_munged12%of 25
from_kprojid13%of 24
from_kprojid_munged---of 25
from_kuid13%of 24
from_kuid_munged12%of 25
gid_m_show---of 26
gid_m_start---of 5
in_userns50%of 4
m_next---of 1
m_stop---of 1
make_kgid20%of 15
make_kprojid---of 15
make_kuid20%of 15
map_id_down---of 15
map_id_up---of 24
map_write---of 61
new_idmap_permitted---of 55
ns_get_owner---of 7
proc_gid_map_write---of 4
proc_projid_map_write---of 4
proc_setgroups_show---of 1
proc_setgroups_write---of 11
proc_uid_map_write---of 4
projid_m_show---of 26
projid_m_start---of 5
sort_idmaps---of 4
uid_m_show---of 26
uid_m_start---of 5
unshare_userns34%of 6
userns_get---of 20
userns_install---of 16
userns_may_setgroups---of 3
userns_owner---of 1
userns_put---of 5
-----------
SUMMARY16%of 162

aa_audit_file---of 18
aa_file_perm13%of 62
aa_get_newest_label---of 35
aa_inherit_files---of 17
aa_lookup_fperms---of 4
aa_path_link---of 24
aa_path_perm---of 5
aa_str_perms---of 4
file_audit_cb---of 12
match_file---of 1
path_name---of 5
profile_path_perm---of 7
update_file_ctx---of 14
-----------
SUMMARY13%of 62

-----------
SUMMARY---of 0

strncpy_from_user20%of 15
-----------
SUMMARY20%of 15

__cyc2ns_read---of 3
__set_cyc2ns_scale---of 11
calibrate_delay_is_known---of 16
check_tsc_unstable---of 1
convert_art_ns_to_tsc---of 1
convert_art_to_tsc---of 1
cyc2ns_read_begin---of 3
cyc2ns_read_end---of 3
mark_tsc_unstable---of 4
native_calibrate_cpu---of 3
native_calibrate_cpu_early---of 33
native_calibrate_tsc---of 16
native_read_msr---of 3
native_sched_clock_from_tsc---of 6
pit_hpet_ptimer_calibrate_cpu---of 35
read_tsc100%of 1
recalibrate_cpu_khz---of 1
sched_clock67%of 3
set_cyc2ns_scale---of 11
time_cpufreq_notifier---of 15
tsc_clocksource_watchdog_disabled---of 4
tsc_cs_enable---of 1
tsc_cs_mark_unstable---of 4
tsc_cs_tick_stable---of 4
tsc_read_refs---of 20
tsc_refine_calibration_work---of 18
tsc_restore_sched_clock_state---of 16
tsc_resume---of 1
tsc_save_sched_clock_state---of 4
unsynchronized_tsc---of 9
using_native_sched_clock---of 1
-----------
SUMMARY75%of 4

rds6_sock_inc_info---of 9
rds6_sock_info---of 5
rds_connect---of 19
rds_create---of 5
rds_exit---of 1
rds_getname23%of 9
rds_getsockopt---of 30
rds_ioctl---of 10
rds_poll---of 17
rds_release---of 7
rds_setsockopt---of 53
rds_sock_addref---of 4
rds_sock_destruct---of 4
rds_sock_inc_info---of 10
rds_sock_info---of 7
rds_sock_put---of 4
rds_wake_sk_sleep---of 3
-----------
SUMMARY23%of 9

-----------
SUMMARY---of 0

__ieee80211_sta_join_ibss---of 55
drv_leave_ibss---of 39
ieee80211_csa_connection_drop_work---of 1
ieee80211_ibss_add_sta---of 41
ieee80211_ibss_build_presp---of 48
ieee80211_ibss_csa_beacon---of 28
ieee80211_ibss_csa_mark_radar---of 3
ieee80211_ibss_disconnect---of 19
ieee80211_ibss_finish_csa---of 8
ieee80211_ibss_finish_sta---of 16
ieee80211_ibss_join---of 24
ieee80211_ibss_leave---of 1
ieee80211_ibss_notify_scan_completed---of 10
ieee80211_ibss_process_chanswitch19%of 38
ieee80211_ibss_rx_no_sta---of 26
ieee80211_ibss_rx_queued_mgmt2%of 131
ieee80211_ibss_setup_sdata---of 1
ieee80211_ibss_stop---of 1
ieee80211_ibss_timer---of 1
ieee80211_ibss_work---of 69
ieee80211_sta_active_ibss---of 20
ieee80211_sta_create_ibss---of 6
ieee80211_sta_join_ibss---of 42
rcu_read_unlock---of 6
trace_drv_return_int---of 15
-----------
SUMMARY6%of 169

-----------
SUMMARY---of 0

bpf_iter_fini_seq_net---of 4
bpf_iter_init_seq_net---of 4
get_proc_net---of 7
get_proc_task_net---of 16
proc_create_net_data---of 3
proc_create_net_data_write---of 3
proc_create_net_single67%of 3
proc_create_net_single_write---of 3
proc_net_ns_exit---of 1
proc_net_ns_init---of 8
proc_tgid_net_getattr---of 5
proc_tgid_net_lookup---of 5
proc_tgid_net_readdir---of 5
seq_open_net---of 11
seq_release_net---of 4
single_open_net---of 6
single_release_net---of 4
-----------
SUMMARY67%of 3

__bpf_address_lookup---of 23
__bpf_call_base---of 1
__bpf_free_used_btfs34%of 6
__bpf_free_used_maps25%of 8
__bpf_prog_array_free_sleepable_cb---of 1
__bpf_prog_free---of 3
__bpf_prog_ret0_warn---of 1
__bpf_prog_ret1---of 1
__bpf_trace_bpf_xdp_link_attach_failed---of 1
__bpf_trace_mem_connect---of 1
__bpf_trace_mem_disconnect---of 1
__bpf_trace_mem_return_failed---of 1
__bpf_trace_xdp_bulk_tx---of 1
__bpf_trace_xdp_cpumap_enqueue---of 1
__bpf_trace_xdp_cpumap_kthread---of 1
__bpf_trace_xdp_devmap_xmit---of 1
__bpf_trace_xdp_exception---of 1
__bpf_trace_xdp_redirect_template---of 1
__probestub_bpf_xdp_link_attach_failed---of 1
__probestub_mem_connect---of 1
__probestub_mem_disconnect---of 1
__probestub_mem_return_failed---of 1
__probestub_xdp_bulk_tx---of 1
__probestub_xdp_cpumap_enqueue---of 1
__probestub_xdp_cpumap_kthread---of 1
__probestub_xdp_devmap_xmit---of 1
__probestub_xdp_exception---of 1
__probestub_xdp_redirect---of 1
__probestub_xdp_redirect_err---of 1
__probestub_xdp_redirect_map---of 1
__probestub_xdp_redirect_map_err---of 1
__traceiter_bpf_xdp_link_attach_failed---of 4
__traceiter_mem_connect---of 4
__traceiter_mem_disconnect---of 4
__traceiter_mem_return_failed---of 4
__traceiter_xdp_bulk_tx---of 4
__traceiter_xdp_cpumap_enqueue---of 4
__traceiter_xdp_cpumap_kthread---of 4
__traceiter_xdp_devmap_xmit---of 4
__traceiter_xdp_exception---of 4
__traceiter_xdp_redirect---of 4
__traceiter_xdp_redirect_err---of 4
__traceiter_xdp_redirect_map---of 4
__traceiter_xdp_redirect_map_err---of 4
bpf_adj_branches---of 35
bpf_get_kallsym---of 17
bpf_get_raw_cpu_id---of 1
bpf_internal_load_pointer_neg_helper---of 8
bpf_jit_add_poke_descriptor---of 11
bpf_jit_alloc_exec---of 1
bpf_jit_alloc_exec_limit---of 1
bpf_jit_binary_alloc---of 10
bpf_jit_binary_free---of 1
bpf_jit_binary_pack_alloc37%of 11
bpf_jit_binary_pack_finalize67%of 3
bpf_jit_binary_pack_free---of 1
bpf_jit_binary_pack_hdr---of 1
bpf_jit_blind_constants4%of 66
bpf_jit_charge_modmem---of 5
bpf_jit_compile---of 1
bpf_jit_fill_hole_with_zero---of 1
bpf_jit_free_exec---of 1
bpf_jit_get_func_addr---of 9
bpf_jit_inlines_helper_call---of 1
bpf_jit_needs_zext100%of 1
bpf_jit_prog_release_other---of 1
bpf_jit_supports_far_kfunc_call---of 1
bpf_jit_uncharge_modmem---of 1
bpf_ksym_add46%of 11
bpf_ksym_del---of 4
bpf_opcode_in_insntable100%of 1
bpf_patch_insn_single---of 22
bpf_prog_alloc45%of 9
bpf_prog_alloc_jited_linfo---of 5
bpf_prog_alloc_no_stats34%of 18
bpf_prog_array_alloc67%of 3
bpf_prog_array_copy---of 21
bpf_prog_array_copy_info---of 11
bpf_prog_array_copy_to_user---of 10
bpf_prog_array_delete_safe---of 5
bpf_prog_array_delete_safe_at---of 8
bpf_prog_array_free67%of 3
bpf_prog_array_free_sleepable---of 3
bpf_prog_array_is_empty---of 3
bpf_prog_array_length---of 4
bpf_prog_array_update_at---of 8
bpf_prog_calc_tag40%of 15
bpf_prog_fill_jited_linfo29%of 7
bpf_prog_free---of 3
bpf_prog_free_deferred---of 25
bpf_prog_jit_attempt_done---of 5
bpf_prog_kallsyms_add37%of 11
bpf_prog_kallsyms_del---of 4
bpf_prog_kallsyms_del_all---of 10
bpf_prog_ksym_find---of 11
bpf_prog_map_compatible---of 9
bpf_prog_pack_alloc14%of 15
bpf_prog_pack_free---of 16
bpf_prog_realloc---of 6
bpf_prog_select_runtime16%of 32
bpf_remove_insns---of 3
bpf_user_rnd_init_once---of 4
bpf_user_rnd_u32---of 5
is_bpf_text_address34%of 18
perf_trace_bpf_xdp_link_attach_failed---of 8
perf_trace_mem_connect---of 8
perf_trace_mem_disconnect---of 8
perf_trace_mem_return_failed---of 8
perf_trace_xdp_bulk_tx---of 8
perf_trace_xdp_cpumap_enqueue---of 8
perf_trace_xdp_cpumap_kthread---of 8
perf_trace_xdp_devmap_xmit---of 8
perf_trace_xdp_exception---of 8
perf_trace_xdp_redirect_template---of 13
search_bpf_extables---of 23
trace_event_raw_event_bpf_xdp_link_attach_failed---of 7
trace_event_raw_event_mem_connect---of 7
trace_event_raw_event_mem_disconnect---of 7
trace_event_raw_event_mem_return_failed---of 7
trace_event_raw_event_xdp_bulk_tx---of 7
trace_event_raw_event_xdp_cpumap_enqueue---of 7
trace_event_raw_event_xdp_cpumap_kthread---of 7
trace_event_raw_event_xdp_devmap_xmit---of 7
trace_event_raw_event_xdp_exception---of 7
trace_event_raw_event_xdp_redirect_template---of 12
trace_raw_output_bpf_xdp_link_attach_failed---of 3
trace_raw_output_mem_connect---of 3
trace_raw_output_mem_disconnect---of 3
trace_raw_output_mem_return_failed---of 3
trace_raw_output_xdp_bulk_tx---of 3
trace_raw_output_xdp_cpumap_enqueue---of 3
trace_raw_output_xdp_cpumap_kthread---of 3
trace_raw_output_xdp_devmap_xmit---of 3
trace_raw_output_xdp_exception---of 3
trace_raw_output_xdp_redirect_template---of 3
-----------
SUMMARY25%of 238

-----------
SUMMARY---of 0

__ip6_append_data---of 138
__ip6_flush_pending_frames---of 15
__ip6_make_skb---of 61
__ipv6_neigh_lookup_noref21%of 24
__skb_fill_page_desc---of 10
dst_output50%of 8
get_page---of 9
ip6_append_data---of 15
ip6_autoflowlabel---of 3
ip6_call_ra_chain---of 16
ip6_copy_metadata---of 26
ip6_cork_release---of 5
ip6_dst_lookup---of 1
ip6_dst_lookup_flow50%of 4
ip6_dst_lookup_tail22%of 79
ip6_dst_mtu_maybe_forward---of 26
ip6_finish_output17%of 31
ip6_finish_output223%of 80
ip6_flush_pending_frames---of 5
ip6_forward---of 127
ip6_forward_finish---of 4
ip6_frag_init---of 1
ip6_frag_next---of 9
ip6_fraglist_init---of 12
ip6_fraglist_prepare---of 5
ip6_fragment---of 88
ip6_make_skb---of 12
ip6_output43%of 14
ip6_pkt_too_big---of 7
ip6_push_pending_frames---of 7
ip6_send_skb---of 9
ip6_setup_cork---of 38
ip6_sk_dst_lookup_flow---of 34
ip6_skb_dst_mtu35%of 20
ip6_xmit38%of 54
net_zcopy_put_abort---of 5
nf_hook28%of 18
rcu_read_unlock---of 6
refcount_add---of 4
skb_cow---of 5
skb_dst---of 5
skb_zcopy_set---of 11
-----------
SUMMARY27%of 332

-----------
SUMMARY---of 0

sysfs_add_bin_file_mode_ns---of 9
sysfs_add_file_mode_ns37%of 11
sysfs_add_file_to_group---of 5
sysfs_bin_attr_simple_read---of 1
sysfs_break_active_protection---of 3
sysfs_change_owner---of 6
sysfs_chmod_file---of 3
sysfs_create_bin_file---of 11
sysfs_create_file_ns---of 4
sysfs_create_files---of 10
sysfs_emit---of 3
sysfs_emit_at---of 3
sysfs_file_change_owner---of 5
sysfs_kf_bin_llseek---of 3
sysfs_kf_bin_mmap---of 1
sysfs_kf_bin_open---of 3
sysfs_kf_bin_read---of 7
sysfs_kf_bin_write---of 7
sysfs_kf_read---of 12
sysfs_kf_seq_show---of 17
sysfs_kf_write---of 9
sysfs_link_change_owner---of 8
sysfs_notify---of 7
sysfs_remove_bin_file---of 1
sysfs_remove_file_from_group---of 5
sysfs_remove_file_ns---of 1
sysfs_remove_file_self---of 3
sysfs_remove_files---of 4
sysfs_unbreak_active_protection---of 1
-----------
SUMMARY37%of 11

__inet_twsk_schedule34%of 6
inet_bhashfn_portaddr40%of 5
inet_twsk_alloc50%of 4
inet_twsk_bind_unhash---of 6
inet_twsk_deschedule_put---of 7
inet_twsk_free---of 3
inet_twsk_hashdance_schedule50%of 14
inet_twsk_kill---of 19
inet_twsk_purge---of 37
inet_twsk_put---of 5
tw_timer_handler---of 1
-----------
SUMMARY45%of 29

nf_conntrack_sctp_init_net---of 1
nf_conntrack_sctp_packet26%of 145
nf_ct_timeout_lookup19%of 11
nlattr_to_sctp---of 7
sctp_can_early_drop---of 1
sctp_csum_combine---of 1
sctp_csum_update---of 1
sctp_new16%of 39
sctp_timeout_nlattr_to_obj---of 19
sctp_timeout_obj_to_nlattr---of 10
sctp_to_nlattr---of 7
-----------
SUMMARY24%of 195

tomoyo_init_log---of 75
tomoyo_poll_log---of 5
tomoyo_read_log---of 8
tomoyo_write_log---of 1
tomoyo_write_log211%of 28
-----------
SUMMARY11%of 28

-----------
SUMMARY---of 0

__bpf_trace_workqueue_activate_work---of 1
__bpf_trace_workqueue_execute_end---of 1
__bpf_trace_workqueue_execute_start---of 1
__bpf_trace_workqueue_queue_work---of 1
__cancel_work---of 15
__cancel_work_sync---of 8
__flush_work---of 52
__flush_workqueue---of 61
__init_work67%of 3
__probestub_workqueue_activate_work---of 1
__probestub_workqueue_execute_end---of 1
__probestub_workqueue_execute_start---of 1
__probestub_workqueue_queue_work---of 1
__pwq_activate_work---of 11
__queue_delayed_work---of 14
__queue_work23%of 81
__traceiter_workqueue_activate_work---of 4
__traceiter_workqueue_execute_end---of 4
__traceiter_workqueue_execute_start---of 4
__traceiter_workqueue_queue_work---of 4
__warn_flushing_systemwide_wq---of 1
alloc_unbound_pwq---of 38
alloc_workqueue---of 81
alloc_workqueue_attrs---of 3
apply_workqueue_attrs---of 1
apply_workqueue_attrs_locked---of 10
apply_wqattrs_commit---of 7
apply_wqattrs_prepare---of 32
assign_work---of 17
bh_pool_irq_work---of 3
bh_pool_kick_highpri---of 1
bh_pool_kick_normal---of 1
bh_worker---of 22
cancel_delayed_work---of 1
cancel_delayed_work_sync---of 1
cancel_work---of 1
cancel_work_sync---of 1
check_flush_dependency---of 14
cpumask_isolated_show---of 1
cpumask_requested_show---of 1
cpumask_show---of 1
cpumask_store---of 5
create_worker---of 16
current_is_workqueue_rescuer---of 5
current_work---of 5
delayed_work_timer_fn---of 1
destroy_delayed_work_on_stack---of 1
destroy_work_on_stack---of 1
destroy_workqueue---of 45
disable_delayed_work---of 1
disable_delayed_work_sync---of 6
disable_work---of 1
disable_work_sync---of 6
drain_dead_softirq_workfn---of 5
drain_workqueue---of 11
enable_delayed_work---of 1
enable_work---of 14
execute_in_process_context---of 3
flush_delayed_work---of 5
flush_rcu_work---of 3
flush_work---of 1
flush_workqueue_prep_pwqs---of 20
free_workqueue_attrs---of 3
freeze_workqueues_begin---of 6
freeze_workqueues_busy---of 25
get_pwq---of 6
get_work_pool40%of 10
idle_cull_fn---of 11
idle_worker_timeout---of 6
init_rescuer---of 5
init_worker_pool---of 4
insert_work50%of 10
install_unbound_pwq---of 15
jhash---of 17
kick_pool30%of 17
max_active_show---of 1
max_active_store---of 4
mod_delayed_work_on---of 9
move_linked_works---of 10
parse_affn_scope---of 7
per_cpu_show---of 1
perf_trace_workqueue_activate_work---of 8
perf_trace_workqueue_execute_end---of 8
perf_trace_workqueue_execute_start---of 8
perf_trace_workqueue_queue_work---of 8
pool_mayday_timeout---of 19
pr_cont_pool_info---of 5
pr_cont_work---of 16
print_worker_info---of 8
process_scheduled_works---of 77
put_pwq_unlocked---of 7
put_unbound_pool---of 32
pwq_dec_nr_in_flight---of 45
pwq_release_workfn---of 23
pwq_tryinc_nr_active28%of 22
queue_delayed_work_on---of 12
queue_rcu_work---of 4
queue_work_node---of 18
queue_work_on50%of 12
rcu_free_pool---of 3
rcu_free_pwq---of 1
rcu_free_wq---of 12
rcu_work_rcufn---of 3
rescuer_thread---of 36
schedule_on_each_cpu---of 10
set_work_pool_and_clear_pending---of 3
set_worker_desc---of 5
set_worker_dying---of 18
show_all_workqueues---of 36
show_freezable_workqueues---of 16
show_one_workqueue---of 11
show_pwq---of 41
thaw_workqueues---of 5
trace_event_raw_event_workqueue_activate_work---of 7
trace_event_raw_event_workqueue_execute_end---of 7
trace_event_raw_event_workqueue_execute_start---of 7
trace_event_raw_event_workqueue_queue_work---of 7
trace_raw_output_workqueue_activate_work---of 3
trace_raw_output_workqueue_execute_end---of 3
trace_raw_output_workqueue_execute_start---of 3
trace_raw_output_workqueue_queue_work---of 3
trace_workqueue_activate_work27%of 15
unbind_worker---of 8
unbound_pwq---of 10
work_busy---of 18
work_debug_hint---of 1
work_fixup_free---of 3
work_fixup_init---of 3
work_for_cpu_fn---of 1
work_grab_pending---of 44
work_is_static_object---of 1
work_offqd_unpack---of 3
work_on_cpu_key---of 1
work_on_cpu_safe_key---of 5
worker_attach_to_pool---of 10
worker_detach_from_pool---of 10
worker_enter_idle---of 15
worker_leave_idle---of 7
worker_thread---of 41
workqueue_apply_unbound_cpumask---of 25
workqueue_congested---of 17
workqueue_offline_cpu---of 26
workqueue_online_cpu---of 56
workqueue_prepare_cpu---of 9
workqueue_set_max_active---of 7
workqueue_set_min_active---of 3
workqueue_softirq_action---of 6
workqueue_softirq_dead---of 11
workqueue_sysfs_register---of 9
workqueue_unbound_exclude_cpumask---of 3
wq_adjust_max_active---of 20
wq_affinity_strict_show---of 1
wq_affinity_strict_store---of 9
wq_affn_dfl_get---of 3
wq_affn_dfl_set---of 10
wq_affn_scope_show---of 5
wq_affn_scope_store---of 9
wq_barrier_func---of 1
wq_cpumask_show---of 1
wq_cpumask_store---of 9
wq_device_release---of 1
wq_nice_show---of 1
wq_nice_store---of 10
wq_update_node_max_active---of 30
wq_update_pod---of 23
wq_watchdog_param_set_thresh---of 8
wq_watchdog_timer_fn---of 63
wq_watchdog_touch---of 4
wq_worker_comm---of 6
wq_worker_last_func---of 1
wq_worker_running---of 6
wq_worker_sleeping---of 6
wq_worker_tick---of 13
wqattrs_pod_type---of 13
-----------
SUMMARY30%of 170

__get_random_u32_below34%of 6
__ia32_sys_getrandom---of 9
__x64_sys_getrandom---of 9
_credit_init_bits---of 11
_get_random_bytes25%of 8
add_device_randomness---of 1
add_disk_randomness---of 4
add_hwgenerator_randomness---of 12
add_input_randomness---of 3
add_interrupt_randomness---of 11
add_timer_randomness---of 9
crng_fast_key_erasure---of 1
crng_make_state46%of 31
crng_reseed---of 7
crng_set_ready---of 1
entropy_timer---of 5
execute_with_initialized_rng---of 4
extract_entropy---of 45
get_random_bytes100%of 1
get_random_bytes_user---of 10
get_random_u16---of 31
get_random_u3239%of 31
get_random_u64---of 31
get_random_u839%of 31
mix_interrupt_randomness---of 9
mix_pool_bytes---of 1
proc_do_rointvec---of 3
proc_do_uuid---of 5
rand_initialize_disk---of 3
random_fasync---of 1
random_ioctl---of 25
random_online_cpu---of 3
random_pm_notification---of 6
random_poll---of 6
random_prepare_cpu---of 3
random_read_iter---of 8
random_write_iter---of 1
rng_is_initialized---of 3
try_to_generate_entropy---of 23
urandom_read_iter---of 10
wait_for_random_bytes---of 12
write_pool_user---of 7
-----------
SUMMARY40%of 108

-----------
SUMMARY---of 0

__udp_manip_pkt---of 7
l4proto_manip_pkt---of 45
nf_nat_csum_recalc---of 6
nf_nat_icmp_reply_translation---of 13
nf_nat_icmpv6_reply_translation---of 28
nf_nat_inet_register_fn---of 5
nf_nat_inet_unregister_fn---of 1
nf_nat_ipv4_local_fn20%of 15
nf_nat_ipv4_local_in---of 25
nf_nat_ipv4_manip_pkt---of 5
nf_nat_ipv4_out20%of 15
nf_nat_ipv4_pre_routing---of 13
nf_nat_ipv4_register_fn---of 1
nf_nat_ipv4_unregister_fn---of 1
nf_nat_ipv6_fn25%of 8
nf_nat_ipv6_in29%of 7
nf_nat_ipv6_local_fn25%of 8
nf_nat_ipv6_local_in11%of 19
nf_nat_ipv6_out25%of 8
nf_nat_ipv6_register_fn---of 1
nf_nat_ipv6_unregister_fn---of 1
nf_nat_manip_pkt---of 11
nf_xfrm_me_harder---of 23
sctp_csum_combine---of 1
sctp_csum_update---of 1
-----------
SUMMARY20%of 80

__sctp_v6_cmp_addr16%of 13
sctp6_rcv---of 1
sctp_getname---of 9
sctp_inet6_af_supported40%of 5
sctp_inet6_bind_verify---of 25
sctp_inet6_cmp_addr---of 8
sctp_inet6_event_msgname---of 12
sctp_inet6_send_verify13%of 16
sctp_inet6_skb_msgname---of 14
sctp_inet6_supported_addrs50%of 4
sctp_inet6addr_event---of 15
sctp_udp_v6_err---of 14
sctp_v6_add_protocol---of 1
sctp_v6_addr_to_user---of 8
sctp_v6_addr_valid40%of 5
sctp_v6_available---of 13
sctp_v6_cmp_addr67%of 3
sctp_v6_copy_addrlist---of 28
sctp_v6_copy_ip_options---of 27
sctp_v6_create_accept_sk---of 14
sctp_v6_del_protocol---of 1
sctp_v6_ecn_capable60%of 5
sctp_v6_err---of 12
sctp_v6_err_handle---of 18
sctp_v6_from_addr_param---of 3
sctp_v6_from_sk---of 1
sctp_v6_from_skb60%of 5
sctp_v6_get_dst19%of 128
sctp_v6_get_saddr60%of 5
sctp_v6_inaddr_any---of 1
sctp_v6_ip_options_len---of 22
sctp_v6_is_any100%of 1
sctp_v6_is_ce---of 1
sctp_v6_pf_exit---of 3
sctp_v6_pf_init---of 1
sctp_v6_protosw_exit---of 1
sctp_v6_protosw_init---of 3
sctp_v6_scope40%of 5
sctp_v6_seq_dump_addr---of 1
sctp_v6_skb_iif---of 1
sctp_v6_skb_sdif---of 4
sctp_v6_to_addr_param---of 1
sctp_v6_to_sk_daddr67%of 3
sctp_v6_to_sk_saddr67%of 3
sctp_v6_xmit29%of 45
-----------
SUMMARY27%of 246

-----------
SUMMARY---of 0

phonet_address_add---of 10
phonet_address_del---of 9
phonet_address_get---of 24
phonet_address_lookup23%of 18
phonet_device_exit---of 1
phonet_device_get20%of 20
phonet_device_list---of 1
phonet_device_notify---of 34
phonet_exit_net---of 3
phonet_init_net---of 3
phonet_pernet25%of 16
phonet_route_add---of 6
phonet_route_del---of 5
phonet_route_get_rcu---of 6
phonet_route_output28%of 22
-----------
SUMMARY24%of 76

af_alg_accept54%of 13
af_alg_alloc_areq---of 4
af_alg_async_cb---of 4
af_alg_count_tsgl---of 11
af_alg_free_resources---of 23
af_alg_free_sg---of 9
af_alg_get_rsgl---of 14
af_alg_poll---of 7
af_alg_pull_tsgl---of 27
af_alg_register_type---of 8
af_alg_release---of 5
af_alg_release_parent---of 7
af_alg_sendmsg---of 109
af_alg_unregister_type---of 5
af_alg_wait_for_data---of 19
af_alg_wmem_wakeup---of 26
alg_accept100%of 1
alg_bind25%of 16
alg_create---of 5
alg_get_type---of 5
alg_setkey---of 7
alg_setkey_by_key_serial---of 21
alg_setsockopt---of 16
alg_sock_destruct---of 3
key_data_ptr_encrypted---of 8
key_data_ptr_trusted---of 8
put_page---of 14
-----------
SUMMARY40%of 30

-----------
SUMMARY---of 0

__change_pid---of 14
__ia32_sys_pidfd_getfd---of 1
__ia32_sys_pidfd_open---of 1
__se_sys_pidfd_getfd---of 16
__se_sys_pidfd_open---of 8
__task_pid_nr_ns35%of 26
__x64_sys_pidfd_getfd---of 1
__x64_sys_pidfd_open---of 1
alloc_pid---of 40
attach_pid---of 6
change_pid---of 6
delayed_put_pid---of 5
detach_pid---of 1
disable_pid_allocation---of 1
exchange_tids---of 5
find_ge_pid---of 1
find_get_pid---of 17
find_get_task_by_vpid---of 17
find_pid_ns---of 1
find_task_by_pid_ns---of 13
find_task_by_vpid---of 3
find_vpid---of 3
free_pid---of 8
get_pid_task27%of 15
get_task_pid27%of 23
pid_nr_ns---of 5
pid_task30%of 10
pid_vnr43%of 7
pidfd_get_pid---of 9
pidfd_get_task---of 8
put_pid60%of 5
task_active_pid_ns67%of 3
transfer_pid---of 6
-----------
SUMMARY34%of 89

copy_from_kernel_nofault_allowed40%of 5
-----------
SUMMARY40%of 5

__skb_queue_purge---of 4
__tipc_node_link_down---of 79
__tipc_node_link_up---of 41
net_generic25%of 16
tipc_loopback_trace---of 3
tipc_nl_node_dump---of 39
tipc_nl_node_dump_link---of 46
tipc_nl_node_dump_monitor---of 6
tipc_nl_node_dump_monitor_peer---of 9
tipc_nl_node_flush_key---of 14
tipc_nl_node_get_link---of 13
tipc_nl_node_get_monitor---of 10
tipc_nl_node_reset_link_stats---of 28
tipc_nl_node_set_key---of 31
tipc_nl_node_set_link---of 19
tipc_nl_node_set_monitor---of 6
tipc_nl_peer_rm---of 27
tipc_node_add_conn23%of 9
tipc_node_apply_property---of 20
tipc_node_assign_peer_net---of 10
tipc_node_bc_rcv---of 20
tipc_node_bc_sync_rcv---of 8
tipc_node_broadcast---of 21
tipc_node_check_dest---of 35
tipc_node_check_state---of 67
tipc_node_create---of 65
tipc_node_crypto_rx---of 3
tipc_node_crypto_rx_by_addr---of 3
tipc_node_crypto_rx_by_list---of 1
tipc_node_delete---of 18
tipc_node_delete_from_list---of 8
tipc_node_delete_links---of 14
tipc_node_distr_xmit50%of 4
tipc_node_dump---of 4
tipc_node_find19%of 22
tipc_node_find_by_id---of 21
tipc_node_find_by_name---of 23
tipc_node_free---of 1
tipc_node_fsm_evt---of 53
tipc_node_get---of 4
tipc_node_get_addr---of 3
tipc_node_get_capabilities40%of 5
tipc_node_get_id---of 7
tipc_node_get_id_str---of 1
tipc_node_get_linkname---of 8
tipc_node_get_mtu19%of 11
tipc_node_is_up---of 6
tipc_node_link_down---of 10
tipc_node_link_failover---of 5
tipc_node_link_up---of 3
tipc_node_mcast_rcv---of 3
tipc_node_pre_cleanup_net---of 19
tipc_node_put---of 4
tipc_node_remove_conn17%of 12
tipc_node_reset_links---of 15
tipc_node_stop---of 4
tipc_node_subscribe---of 8
tipc_node_suggest_addr---of 7
tipc_node_timeout---of 39
tipc_node_try_addr---of 14
tipc_node_unsubscribe---of 8
tipc_node_write_unlock---of 10
tipc_node_xmit4%of 57
tipc_node_xmit_skb---of 1
tipc_rcv---of 63
tipc_skb_queue_splice_tail_init---of 5
trace_tipc_link_dump---of 15
trace_tipc_link_reset---of 15
trace_tipc_node_fsm---of 15
trace_tipc_node_link_down---of 15
trace_tipc_skb_dump---of 15
-----------
SUMMARY15%of 136

sk_stream_error---of 7
sk_stream_kill_queues---of 11
sk_stream_wait_close---of 13
sk_stream_wait_connect---of 21
sk_stream_wait_memory26%of 43
sk_stream_write_space25%of 28
-----------
SUMMARY26%of 71

secure_dccp_sequence_number---of 4
secure_dccpv6_sequence_number---of 4
secure_ipv4_port_ephemeral---of 4
secure_ipv6_port_ephemeral---of 4
secure_tcp_seq---of 4
secure_tcp_ts_off---of 5
secure_tcpv6_seq50%of 4
secure_tcpv6_ts_off40%of 5
-----------
SUMMARY45%of 9

-----------
SUMMARY---of 0

__bpf_trace_percpu_alloc_percpu---of 1
__bpf_trace_percpu_alloc_percpu_fail---of 1
__bpf_trace_percpu_create_chunk---of 1
__bpf_trace_percpu_destroy_chunk---of 1
__bpf_trace_percpu_free_percpu---of 1
__is_kernel_percpu_address38%of 8
__probestub_percpu_alloc_percpu---of 1
__probestub_percpu_alloc_percpu_fail---of 1
__probestub_percpu_create_chunk---of 1
__probestub_percpu_destroy_chunk---of 1
__probestub_percpu_free_percpu---of 1
__traceiter_percpu_alloc_percpu---of 4
__traceiter_percpu_alloc_percpu_fail---of 4
__traceiter_percpu_create_chunk---of 4
__traceiter_percpu_destroy_chunk---of 4
__traceiter_percpu_free_percpu---of 4
free_percpu---of 65
is_kernel_percpu_address---of 6
pcpu_alloc_area60%of 22
pcpu_alloc_noprof20%of 103
pcpu_alloc_size---of 10
pcpu_balance_free---of 39
pcpu_balance_workfn---of 53
pcpu_block_refresh_hint100%of 6
pcpu_block_update63%of 24
pcpu_block_update_hint_alloc43%of 26
pcpu_chunk_depopulated---of 7
pcpu_chunk_populated---of 7
pcpu_chunk_refresh_hint---of 23
pcpu_chunk_relocate20%of 15
pcpu_create_chunk---of 28
pcpu_depopulate_chunk---of 22
pcpu_dump_alloc_info---of 25
pcpu_find_block_fit34%of 9
pcpu_free_area---of 25
pcpu_get_pages---of 8
pcpu_memcg_post_alloc_hook32%of 32
pcpu_next_fit_region67%of 15
pcpu_nr_pages---of 1
pcpu_obj_full_size---of 1
pcpu_populate_chunk---of 44
per_cpu_ptr_to_phys---of 10
perf_trace_percpu_alloc_percpu---of 8
perf_trace_percpu_alloc_percpu_fail---of 8
perf_trace_percpu_create_chunk---of 8
perf_trace_percpu_destroy_chunk---of 8
perf_trace_percpu_free_percpu---of 8
trace_event_raw_event_percpu_alloc_percpu---of 7
trace_event_raw_event_percpu_alloc_percpu_fail---of 7
trace_event_raw_event_percpu_create_chunk---of 7
trace_event_raw_event_percpu_destroy_chunk---of 7
trace_event_raw_event_percpu_free_percpu---of 7
trace_percpu_create_chunk---of 15
trace_percpu_free_percpu---of 15
trace_raw_output_percpu_alloc_percpu---of 4
trace_raw_output_percpu_alloc_percpu_fail---of 3
trace_raw_output_percpu_create_chunk---of 3
trace_raw_output_percpu_destroy_chunk---of 3
trace_raw_output_percpu_free_percpu---of 3
-----------
SUMMARY37%of 260

-----------
SUMMARY---of 0

__regulatory_set_wiphy_regd---of 14
cfg80211_get_unii---of 10
crda_timeout_work---of 3
freq_reg_info---of 14
get_wiphy_regdom---of 8
handle_band_custom---of 23
handle_reg_beacon---of 18
is_wiphy_all_set_reg_flag---of 9
is_world_regdom---of 4
notify_self_managed_wiphys---of 11
print_rd_rules---of 22
print_regdomain---of 36
print_regdomain_info---of 3
query_regdb---of 25
queue_regulatory_request---of 3
reg_check_channels---of 1
reg_check_chans_work---of 81
reg_dfs_domain_same---of 26
reg_free_request---of 10
reg_get_dfs_region---of 34
reg_get_max_bandwidth---of 11
reg_get_regdomain---of 17
reg_initiator_name---of 6
reg_is_valid_request---of 12
reg_is_world_roaming---of 26
reg_last_request_cell_base---of 9
reg_process_hint---of 80
reg_process_ht_flags---of 35
reg_process_self_managed_hint---of 14
reg_process_self_managed_hints---of 10
reg_query_database---of 20
reg_query_regdb_wmm---of 16
reg_regdb_apply---of 6
reg_reload_regdb---of 14
reg_rule_to_chan_bw_flags---of 21
reg_rules_intersect---of 36
reg_set_request_processed---of 9
reg_supported_dfs_region---of 4
reg_todo---of 44
reg_update_last_request---of 16
regdb_fw_cb---of 11
regdom_changes---of 11
regdom_intersect---of 32
regulatory_exit---of 20
regulatory_hint---of 4
regulatory_hint_country_ie---of 23
regulatory_hint_disconnect---of 14
regulatory_hint_found_beacon---of 23
regulatory_hint_indoor---of 4
regulatory_hint_user---of 10
regulatory_indoor_allowed---of 1
regulatory_netlink_notify67%of 3
regulatory_pre_cac_allowed---of 26
regulatory_propagate_dfs_state---of 51
regulatory_set_wiphy_regd---of 3
regulatory_set_wiphy_regd_sync---of 6
reset_regdomains---of 18
restore_regulatory_settings---of 92
set_regdom---of 116
set_wmm_rule---of 4
update_all_wiphy_regulatory---of 10
valid_regdb---of 21
valid_wmm---of 17
wiphy_all_share_dfs_chan_state---of 26
wiphy_apply_custom_regulatory---of 22
wiphy_regulatory_deregister---of 12
wiphy_regulatory_register---of 11
wiphy_update_regulatory---of 116
-----------
SUMMARY67%of 3

__ia32_sys_mmap---of 3
__x64_sys_mmap67%of 3
arch_get_unmapped_area---of 1
arch_get_unmapped_area_topdown---of 1
arch_get_unmapped_area_topdown_vmflags10%of 30
arch_get_unmapped_area_vmflags---of 26
-----------
SUMMARY16%of 33

__ext4_expand_extra_isize---of 8
__ext4_get_inode_loc11%of 39
__ext4_iget---of 120
__ext4_journalled_invalidate_folio---of 23
__ext4_mark_inode_dirty21%of 29
_ext4_get_block---of 13
check_igot_inode---of 7
do_journal_get_write_access---of 7
ext4_alloc_da_blocks---of 17
ext4_begin_ordered_truncate---of 17
ext4_block_write_begin---of 89
ext4_block_zero_page_range---of 42
ext4_blocks_for_truncate---of 6
ext4_bmap---of 8
ext4_bread---of 8
ext4_bread_batch---of 29
ext4_break_layouts---of 9
ext4_buffer_uptodate---of 4
ext4_can_truncate---of 5
ext4_change_inode_journal_flag---of 23
ext4_chksum---of 4
ext4_chunk_trans_blocks---of 6
ext4_da_get_block_prep---of 52
ext4_da_release_space---of 22
ext4_da_reserve_space---of 24
ext4_da_update_reserve_space---of 28
ext4_da_write_begin---of 35
ext4_da_write_end---of 56
ext4_dax_writepages---of 3
ext4_dio_alignment---of 10
ext4_dirty_folio---of 18
ext4_dirty_inode67%of 3
ext4_do_writepages---of 155
ext4_es_is_delayed---of 1
ext4_es_is_delonly---of 1
ext4_es_is_mapped---of 1
ext4_evict_inode---of 59
ext4_expand_extra_isize---of 13
ext4_file_getattr---of 8
ext4_fill_raw_inode29%of 49
ext4_get_block---of 1
ext4_get_block_unwritten---of 5
ext4_get_fc_inode_loc---of 1
ext4_get_inode_loc---of 3
ext4_get_projid---of 3
ext4_get_reserved_space---of 1
ext4_getattr---of 32
ext4_getblk---of 20
ext4_has_group_desc_csum---of 7
ext4_has_metadata_csum---of 6
ext4_iget_extra_inode---of 8
ext4_inode_attach_jinode---of 7
ext4_inode_blocks---of 5
ext4_inode_csum18%of 17
ext4_inode_csum_set---of 10
ext4_inode_csum_verify---of 10
ext4_inode_is_fast_symlink---of 11
ext4_inode_set_iversion_queried---of 3
ext4_invalidate_folio---of 18
ext4_iomap_begin---of 34
ext4_iomap_begin_report---of 16
ext4_iomap_end---of 1
ext4_iomap_overwrite_begin---of 4
ext4_iomap_swap_activate---of 1
ext4_issue_zeroout---of 8
ext4_journal_folio_buffers---of 13
ext4_journalled_dirty_folio---of 19
ext4_journalled_invalidate_folio---of 3
ext4_journalled_write_end---of 65
ext4_journalled_zero_new_buffers---of 12
ext4_map_blocks---of 71
ext4_mark_iloc_dirty12%of 76
ext4_normal_submit_inode_data_buffers---of 1
ext4_page_mkwrite---of 41
ext4_print_free_blocks---of 7
ext4_punch_hole---of 48
ext4_read_folio---of 19
ext4_readahead---of 4
ext4_release_folio---of 24
ext4_reserve_inode_write29%of 7
ext4_set_aops---of 8
ext4_set_inode_flags---of 16
ext4_set_iomap---of 29
ext4_setattr---of 80
ext4_should_dioread_nolock---of 6
ext4_truncate---of 71
ext4_update_disksize_before_punch---of 11
ext4_update_inode_fsync_trans---of 7
ext4_wait_for_tail_page_commit---of 15
ext4_walk_page_buffers---of 8
ext4_write_begin---of 59
ext4_write_end---of 50
ext4_write_inode---of 17
ext4_writepage_trans_blocks---of 8
ext4_writepages---of 5
ext4_zero_partial_blocks---of 12
folio_lock---of 9
folio_size---of 10
folio_test_uptodate---of 9
i_gid_needs_update---of 3
lock_buffer---of 3
mpage_folio_done---of 9
mpage_prepare_extent_to_map---of 101
mpage_process_page_bufs---of 29
mpage_release_unused_pages---of 65
mpage_submit_folio---of 7
percpu_down_read---of 10
percpu_up_read---of 10
trace_ext4_load_inode---of 15
trace_ext4_writepages---of 15
trace_ext4_writepages_result---of 15
wait_on_buffer---of 3
write_end_fn---of 5
zero_user_segments---of 17
-----------
SUMMARY19%of 220

__f_setown---of 1
__ia32_compat_sys_fcntl---of 8
__ia32_compat_sys_fcntl64---of 1
__ia32_sys_fcntl---of 1
__se_sys_fcntl---of 14
__x64_compat_sys_fcntl---of 8
__x64_compat_sys_fcntl64---of 1
__x64_sys_fcntl---of 1
check_fcntl_cmd---of 8
do_compat_fcntl64---of 35
do_fcntl---of 85
f_delown---of 1
f_getown---of 13
f_modown---of 8
f_setown---of 17
fasync_alloc---of 1
fasync_free---of 1
fasync_helper---of 5
fasync_insert_entry---of 5
fasync_remove_entry---of 6
kill_fasync7%of 30
put_compat_flock---of 1
put_compat_flock64---of 1
send_sigio---of 21
send_sigio_to_task---of 9
send_sigurg9%of 24
sigio_perm---of 21
-----------
SUMMARY8%of 54

-----------
SUMMARY---of 0

NF_HOOK---of 37
br_nf_dev_queue_xmit---of 103
br_nf_dev_xmit---of 8
br_nf_forward---of 71
br_nf_forward_finish---of 42
br_nf_forward_ip---of 20
br_nf_hook_thresh---of 18
br_nf_local_in---of 21
br_nf_post_routing---of 47
br_nf_pre_routing---of 57
br_nf_pre_routing_finish---of 91
br_nf_pre_routing_finish_bridge---of 38
br_nf_push_frag_xmit---of 12
br_validate_ipv4---of 20
brnf_device_event---of 9
brnf_exit_net---of 5
brnf_get_logical_dev---of 11
brnf_init_net---of 6
brnf_sysctl_call_tables---of 4
ip_sabotage_in34%of 6
local_lock_release---of 6
net_generic---of 16
nf_bridge_encap_header_len---of 1
nf_bridge_pull_encap_header---of 1
nf_bridge_push_encap_header---of 1
nf_bridge_update_protocol---of 6
nf_conntrack_put---of 5
setup_pre_routing---of 11
-----------
SUMMARY34%of 6

__fsnotify_inode_delete100%of 1
__fsnotify_parent18%of 17
__fsnotify_update_child_dentry_flags---of 10
__fsnotify_vfsmount_delete---of 1
fsnotify---of 143
fsnotify_event_needs_parent---of 3
fsnotify_first_mark---of 14
fsnotify_handle_inode_event---of 14
fsnotify_sb_delete---of 20
fsnotify_sb_free---of 1
-----------
SUMMARY23%of 18

-----------
SUMMARY---of 0

bpf_token_allow_cmd---of 6
bpf_token_allow_map_type---of 3
bpf_token_allow_prog_type---of 4
bpf_token_capable40%of 10
bpf_token_create---of 21
bpf_token_free---of 5
bpf_token_get_from_fd---of 6
bpf_token_inc---of 1
bpf_token_put---of 4
bpf_token_put_deferred---of 5
bpf_token_release---of 4
bpf_token_show_fdinfo---of 9
get_user_ns---of 5
-----------
SUMMARY40%of 10

-----------
SUMMARY---of 0

_cfg80211_chandef_usable12%of 84
_cfg80211_reg_can_beacon12%of 70
cfg80211_any_usable_channels---of 12
cfg80211_any_wiphy_oper_chan---of 37
cfg80211_beaconing_iface_active---of 21
cfg80211_chandef_compatible---of 22
cfg80211_chandef_create29%of 7
cfg80211_chandef_dfs_cac_time---of 32
cfg80211_chandef_dfs_required13%of 33
cfg80211_chandef_dfs_usable---of 41
cfg80211_chandef_primary---of 38
cfg80211_chandef_usable---of 1
cfg80211_chandef_valid7%of 86
cfg80211_edmg_chandef_valid---of 42
cfg80211_get_chans_dfs_required6%of 54
cfg80211_is_sub_chan---of 26
cfg80211_reg_can_beacon100%of 1
cfg80211_reg_can_beacon_relax---of 8
cfg80211_set_dfs_state---of 30
cfg80211_set_monitor_channel---of 34
cfg80211_wdev_on_sub_chan---of 13
nl80211_chan_width_to_mhz---of 16
valid_puncturing_bitmap17%of 12
wdev_chandef---of 19
-----------
SUMMARY11%of 347

__bpf_trace_sctp_probe---of 1
__bpf_trace_sctp_probe_path---of 1
__probestub_sctp_probe---of 1
__probestub_sctp_probe_path---of 1
__sctp_sf_do_9_2_reshutack---of 8
__traceiter_sctp_probe---of 4
__traceiter_sctp_probe_path---of 4
perf_trace_sctp_probe---of 8
perf_trace_sctp_probe_path---of 8
sctp_abort_pkt_new---of 5
sctp_auth_chunk_verify---of 5
sctp_eat_data---of 56
sctp_ootb_pkt_free---of 1
sctp_ootb_pkt_new---of 7
sctp_send_next_asconf---of 8
sctp_send_stale_cookie_err---of 5
sctp_sf_abort_violation---of 24
sctp_sf_authenticate---of 9
sctp_sf_autoclose_timer_expire---of 4
sctp_sf_backbeat_8_3---of 21
sctp_sf_beat_8_3---of 16
sctp_sf_bug---of 1
sctp_sf_check_restart_addrs---of 9
sctp_sf_cookie_echoed_abort---of 1
sctp_sf_cookie_echoed_err---of 26
sctp_sf_cookie_echoed_prm_abort---of 1
sctp_sf_cookie_echoed_prm_shutdown---of 5
sctp_sf_cookie_wait_abort---of 14
sctp_sf_cookie_wait_icmp_abort---of 1
sctp_sf_cookie_wait_prm_abort---of 9
sctp_sf_cookie_wait_prm_shutdown---of 5
sctp_sf_discard_chunk34%of 9
sctp_sf_do_4_C---of 19
sctp_sf_do_5_1B_init---of 30
sctp_sf_do_5_1C_ack10%of 21
sctp_sf_do_5_1D_ce---of 54
sctp_sf_do_5_1E_ca---of 26
sctp_sf_do_5_2_1_siminit100%of 1
sctp_sf_do_5_2_2_dupinit---of 1
sctp_sf_do_5_2_3_initack---of 3
sctp_sf_do_5_2_4_dupcook16%of 39
sctp_sf_do_6_2_sack---of 3
sctp_sf_do_6_3_3_rtx---of 12
sctp_sf_do_8_5_1_E_sa---of 6
sctp_sf_do_9_1_abort---of 25
sctp_sf_do_9_1_prm_abort---of 7
sctp_sf_do_9_2_final---of 14
sctp_sf_do_9_2_prm_shutdown---of 4
sctp_sf_do_9_2_reshutack---of 9
sctp_sf_do_9_2_shut_ctsn---of 10
sctp_sf_do_9_2_shutdown---of 16
sctp_sf_do_9_2_shutdown_ack---of 17
sctp_sf_do_9_2_start_shutdown---of 12
sctp_sf_do_asconf---of 21
sctp_sf_do_asconf_ack---of 34
sctp_sf_do_assoc_update---of 8
sctp_sf_do_dupcook_a---of 31
sctp_sf_do_dupcook_b---of 22
sctp_sf_do_dupcook_d22%of 33
sctp_sf_do_ecn_cwr---of 8
sctp_sf_do_ecne---of 7
sctp_sf_do_no_pending_tsn---of 4
sctp_sf_do_prm_asconf---of 5
sctp_sf_do_prm_asoc25%of 8
sctp_sf_do_prm_reconf---of 3
sctp_sf_do_prm_requestheartbeat---of 6
sctp_sf_do_prm_send---of 3
sctp_sf_do_reconf---of 24
sctp_sf_do_unexpected_init14%of 36
sctp_sf_eat_auth---of 20
sctp_sf_eat_data_6_2---of 25
sctp_sf_eat_data_fast_4_4---of 19
sctp_sf_eat_fwd_tsn---of 20
sctp_sf_eat_fwd_tsn_fast---of 19
sctp_sf_eat_sack_6_2---of 28
sctp_sf_error_closed---of 3
sctp_sf_error_shutdown---of 3
sctp_sf_heartbeat---of 5
sctp_sf_ignore_other---of 3
sctp_sf_ignore_primitive---of 3
sctp_sf_not_impl---of 1
sctp_sf_ootb---of 20
sctp_sf_operr_notify---of 12
sctp_sf_pdiscard---of 3
sctp_sf_send_probe---of 6
sctp_sf_send_reconf---of 7
sctp_sf_sendbeat_8_3---of 13
sctp_sf_shut_8_4_5---of 8
sctp_sf_shutdown_ack_sent_abort---of 1
sctp_sf_shutdown_ack_sent_prm_abort---of 1
sctp_sf_shutdown_pending_abort---of 25
sctp_sf_shutdown_pending_prm_abort---of 8
sctp_sf_shutdown_sent_abort---of 27
sctp_sf_shutdown_sent_prm_abort---of 9
sctp_sf_t1_cookie_timer_expire---of 11
sctp_sf_t1_init_timer_expire---of 13
sctp_sf_t2_timer_expire---of 17
sctp_sf_t4_timer_expire---of 12
sctp_sf_t5_timer_expire---of 8
sctp_sf_tabort_8_4_8---of 8
sctp_sf_timer_ignore---of 3
sctp_sf_unk_chunk---of 18
sctp_sf_violation---of 6
sctp_sf_violation_paramlen---of 9
sctp_stop_t1_and_abort---of 8
sctp_tietags_populate50%of 4
trace_event_raw_event_sctp_probe---of 7
trace_event_raw_event_sctp_probe_path---of 7
trace_raw_output_sctp_probe---of 3
trace_raw_output_sctp_probe_path---of 3
-----------
SUMMARY19%of 151

-----------
SUMMARY---of 0

__hrtimer_get_remaining---of 4
__hrtimer_init50%of 8
__hrtimer_next_event_base---of 14
__hrtimer_run_queues---of 55
__ia32_sys_nanosleep---of 5
__ia32_sys_nanosleep_time32---of 5
__remove_hrtimer---of 14
__x64_sys_nanosleep40%of 5
__x64_sys_nanosleep_time32---of 5
clock_was_set---of 27
clock_was_set_delayed---of 1
clock_was_set_work---of 1
debug_deactivate---of 15
debug_init27%of 15
destroy_hrtimer_on_stack---of 1
do_nanosleep28%of 18
enqueue_hrtimer32%of 19
hrtimer_active---of 9
hrtimer_cancel50%of 4
hrtimer_debug_hint---of 1
hrtimer_fixup_activate---of 3
hrtimer_fixup_free---of 5
hrtimer_fixup_init---of 5
hrtimer_forward---of 6
hrtimer_get_next_event---of 6
hrtimer_init100%of 1
hrtimer_init_on_stack---of 1
hrtimer_init_sleeper---of 1
hrtimer_init_sleeper_on_stack---of 1
hrtimer_interrupt---of 16
hrtimer_nanosleep50%of 4
hrtimer_nanosleep_restart---of 1
hrtimer_next_event_without---of 6
hrtimer_reprogram34%of 15
hrtimer_run_queues---of 10
hrtimer_run_softirq---of 5
hrtimer_sleeper_start_expires---of 1
hrtimer_start_range_ns31%of 36
hrtimer_try_to_cancel20%of 15
hrtimer_wakeup---of 3
hrtimers_cpu_dying---of 18
hrtimers_prepare_cpu---of 5
hrtimers_resume_local---of 5
ktime_add_safe---of 1
ktime_get_boottime---of 1
ktime_get_clocktai---of 1
ktime_get_real---of 1
nanosleep_copyout---of 6
retrigger_next_event---of 16
schedule_hrtimeout---of 1
schedule_hrtimeout_range100%of 1
schedule_hrtimeout_range_clock25%of 8
-----------
SUMMARY33%of 149

sctp_sm_lookup_event15%of 21
-----------
SUMMARY15%of 21

-----------
SUMMARY---of 0

__inet_bhash2_update_saddr---of 57
__inet_check_established---of 33
__inet_hash---of 32
__inet_hash_connect3%of 70
__inet_inherit_port---of 50
__inet_lookup_established---of 28
__inet_lookup_listener---of 7
inet_bhash2_addr_any_hashbucket---of 5
inet_bhash2_reset_saddr---of 3
inet_bhash2_update_saddr---of 1
inet_bind2_bucket_create---of 8
inet_bind2_bucket_destroy---of 6
inet_bind2_bucket_find---of 12
inet_bind2_bucket_match_addr_any---of 9
inet_bind_bucket_create---of 4
inet_bind_bucket_destroy---of 4
inet_bind_bucket_match---of 4
inet_bind_hash---of 3
inet_ehash_insert---of 41
inet_ehash_locks_alloc---of 7
inet_ehash_nolisten---of 4
inet_ehashfn---of 4
inet_hash---of 3
inet_hash_connect---of 3
inet_hashinfo2_init_mod---of 4
inet_lhash2_lookup---of 16
inet_lookup_reuseport---of 5
inet_lookup_run_sk_lookup---of 35
inet_pernet_hashinfo_alloc---of 10
inet_pernet_hashinfo_free---of 3
inet_put_port38%of 16
inet_unhash16%of 19
init_hashinfo_lhash2---of 3
ipv6_portaddr_hash---of 4
l3mdev_master_ifindex_by_index---of 14
sock_edemux---of 1
sock_gen_put---of 11
sock_prot_inuse_add---of 3
-----------
SUMMARY11%of 105

address_val---of 8
bdev_name---of 13
bitmap_list_string---of 14
bitmap_string---of 13
bprintf---of 1
bstr_printf---of 53
clock---of 8
date_str---of 5
default_pointer---of 53
dentry_name---of 41
device_node_string---of 66
err_ptr---of 8
escaped_string---of 16
file_dentry_name---of 8
fill_ptr_key---of 1
flags_string---of 66
format_decode32%of 50
fourcc_string---of 33
fwnode_full_name_string---of 9
fwnode_string---of 27
hex_string---of 21
ip4_addr_string---of 7
ip4_addr_string_sa---of 17
ip4_string---of 40
ip6_addr_string---of 10
ip6_addr_string_sa---of 27
ip6_compressed_string---of 45
ip6_string---of 15
ip_addr_string---of 41
mac_address_string---of 26
netdev_bits---of 26
num_to_str---of 17
number20%of 89
pointer---of 85
pointer_string---of 1
ptr_to_hashval---of 3
put_dec75%of 4
put_dec_full858%of 7
put_dec_trunc860%of 10
resource_string---of 111
restricted_pointer---of 31
rtc_str---of 17
scnprintf---of 4
simple_strntoll---of 3
simple_strntoull---of 5
simple_strtol---of 3
simple_strtoll---of 1
simple_strtoul---of 1
simple_strtoull---of 1
skip_atoi---of 3
snprintf100%of 1
special_hex_number---of 1
sprintf100%of 1
sscanf---of 1
string39%of 13
string_nocheck---of 7
symbol_string---of 15
time64_str---of 1
time_and_date---of 21
time_str---of 5
uuid_string---of 23
vbin_printf---of 78
vscnprintf---of 4
vsnprintf15%of 67
vsprintf---of 1
vsscanf---of 116
widen_string10%of 22
-----------
SUMMARY25%of 264

-----------
SUMMARY---of 0

sctp4_rcv---of 1
sctp_addr_wq_mgmt---of 23
sctp_addr_wq_timeout_handler---of 21
sctp_copy_local_addr_list---of 24
sctp_ctrlsock_exit---of 3
sctp_ctrlsock_init---of 5
sctp_defaults_exit---of 11
sctp_defaults_init---of 20
sctp_get_af_specific50%of 4
sctp_get_pf_specific---of 4
sctp_inet_af_supported---of 1
sctp_inet_bind_verify---of 1
sctp_inet_cmp_addr---of 5
sctp_inet_event_msgname---of 3
sctp_inet_send_verify---of 1
sctp_inet_skb_msgname---of 4
sctp_inet_supported_addrs---of 1
sctp_inetaddr_event---of 14
sctp_register_af---of 8
sctp_register_pf---of 6
sctp_udp_rcv---of 3
sctp_udp_sock_start---of 4
sctp_udp_sock_stop---of 5
sctp_v4_add_protocol---of 1
sctp_v4_addr_to_user---of 1
sctp_v4_addr_valid---of 12
sctp_v4_available---of 5
sctp_v4_cmp_addr---of 4
sctp_v4_copy_addrlist---of 37
sctp_v4_copy_ip_options---of 19
sctp_v4_create_accept_sk---of 4
sctp_v4_del_protocol---of 1
sctp_v4_ecn_capable---of 9
sctp_v4_from_addr_param---of 3
sctp_v4_from_sk---of 1
sctp_v4_from_skb---of 5
sctp_v4_get_dst---of 61
sctp_v4_get_saddr---of 3
sctp_v4_inaddr_any---of 1
sctp_v4_ip_options_len---of 18
sctp_v4_is_any---of 1
sctp_v4_is_ce---of 1
sctp_v4_pf_init---of 10
sctp_v4_protosw_exit---of 1
sctp_v4_protosw_init---of 3
sctp_v4_scope---of 6
sctp_v4_seq_dump_addr---of 1
sctp_v4_skb_iif---of 8
sctp_v4_skb_sdif---of 4
sctp_v4_to_addr_param---of 1
sctp_v4_to_sk_daddr---of 1
sctp_v4_to_sk_saddr---of 1
sctp_v4_xmit---of 18
-----------
SUMMARY50%of 4

-----------
SUMMARY---of 0

__bpf_trace_mm_collapse_huge_page---of 1
__bpf_trace_mm_collapse_huge_page_isolate---of 1
__bpf_trace_mm_collapse_huge_page_swapin---of 1
__bpf_trace_mm_khugepaged_collapse_file---of 1
__bpf_trace_mm_khugepaged_scan_file---of 1
__bpf_trace_mm_khugepaged_scan_pmd---of 1
__collapse_huge_page_copy_failed---of 1
__collapse_huge_page_isolate---of 109
__khugepaged_enter---of 11
__khugepaged_exit---of 19
__probestub_mm_collapse_huge_page---of 1
__probestub_mm_collapse_huge_page_isolate---of 1
__probestub_mm_collapse_huge_page_swapin---of 1
__probestub_mm_khugepaged_collapse_file---of 1
__probestub_mm_khugepaged_scan_file---of 1
__probestub_mm_khugepaged_scan_pmd---of 1
__traceiter_mm_collapse_huge_page---of 4
__traceiter_mm_collapse_huge_page_isolate---of 4
__traceiter_mm_collapse_huge_page_swapin---of 4
__traceiter_mm_khugepaged_collapse_file---of 4
__traceiter_mm_khugepaged_scan_file---of 4
__traceiter_mm_khugepaged_scan_pmd---of 4
add_mm_counter---of 1
alloc_charge_folio---of 34
alloc_sleep_millisecs_show---of 1
alloc_sleep_millisecs_store---of 3
collapse_pte_mapped_thp---of 58
collect_mm_slot---of 12
current_is_khugepaged---of 1
defrag_show---of 1
defrag_store---of 1
filemap_nr_thps_dec---of 6
find_pmd_or_thp_or_none---of 7
folio_large_mapcount---of 9
folio_likely_mapped_shared---of 28
folio_mapcount---of 9
folio_order---of 9
folio_put---of 4
full_scans_show---of 1
hpage_collapse_scan_file---of 345
hpage_collapse_scan_pmd---of 282
hugepage_madvise---of 4
hugepage_vma_revalidate---of 21
is_refcount_suitable---of 23
khugepaged---of 104
khugepaged_enter_vma27%of 15
khugepaged_min_free_kbytes_update---of 7
madvise_collapse---of 67
max_ptes_none_show---of 1
max_ptes_none_store---of 4
max_ptes_shared_show---of 1
max_ptes_shared_store---of 4
max_ptes_swap_show---of 1
max_ptes_swap_store---of 4
mm_counter_file---of 7
mmu_notifier_invalidate_range_end---of 5
mmu_notifier_invalidate_range_start---of 3
pages_collapsed_show---of 1
pages_to_scan_show---of 1
pages_to_scan_store---of 4
perf_trace_mm_collapse_huge_page---of 8
perf_trace_mm_collapse_huge_page_isolate---of 8
perf_trace_mm_collapse_huge_page_swapin---of 8
perf_trace_mm_khugepaged_collapse_file---of 8
perf_trace_mm_khugepaged_scan_file---of 8
perf_trace_mm_khugepaged_scan_pmd---of 8
pmd_lock---of 1
pte_unmap---of 6
ptep_clear---of 3
release_pte_folio---of 17
release_pte_pages---of 25
scan_sleep_millisecs_show---of 1
scan_sleep_millisecs_store---of 3
set_huge_pmd---of 26
set_recommended_min_free_kbytes---of 13
start_stop_khugepaged---of 12
trace_event_raw_event_mm_collapse_huge_page---of 7
trace_event_raw_event_mm_collapse_huge_page_isolate---of 7
trace_event_raw_event_mm_collapse_huge_page_swapin---of 7
trace_event_raw_event_mm_khugepaged_collapse_file---of 7
trace_event_raw_event_mm_khugepaged_scan_file---of 7
trace_event_raw_event_mm_khugepaged_scan_pmd---of 7
trace_mm_collapse_huge_page_isolate---of 15
trace_raw_output_mm_collapse_huge_page---of 3
trace_raw_output_mm_collapse_huge_page_isolate---of 3
trace_raw_output_mm_collapse_huge_page_swapin---of 3
trace_raw_output_mm_khugepaged_collapse_file---of 3
trace_raw_output_mm_khugepaged_scan_file---of 3
trace_raw_output_mm_khugepaged_scan_pmd---of 3
xas_next---of 12
xas_next_entry---of 17
-----------
SUMMARY27%of 15

-----------
SUMMARY---of 0

bcm_can_tx---of 17
bcm_connect25%of 12
bcm_delete_rx_op---of 16
bcm_delete_tx_op---of 9
bcm_free_op_rcu---of 5
bcm_init---of 3
bcm_notifier---of 24
bcm_proc_getifname---of 14
bcm_proc_show---of 16
bcm_read_op---of 9
bcm_recvmsg34%of 12
bcm_release---of 31
bcm_rx_cmp_to_index---of 11
bcm_rx_handler---of 20
bcm_rx_setup20%of 60
bcm_rx_thr_flush---of 12
bcm_rx_thr_handler---of 3
bcm_rx_timeout_handler---of 4
bcm_rx_update_and_send---of 9
bcm_send_to_user---of 12
bcm_sendmsg14%of 22
bcm_sock_no_ioctlcmd---of 1
bcm_tx_send---of 18
bcm_tx_setup---of 68
bcm_tx_timeout_handler---of 13
canbcm_pernet_exit---of 3
canbcm_pernet_init---of 1
dev_put---of 3
-----------
SUMMARY21%of 106

-----------
SUMMARY---of 0

add_rules---of 13
gid_eq---of 1
gid_gt---of 1
gid_lt---of 1
ima_alloc_rule_opt_list---of 9
ima_appraise_signature---of 27
ima_check_policy---of 1
ima_delete_rules---of 6
ima_free_rule---of 5
ima_lsm_policy_change---of 18
ima_match_policy22%of 123
ima_parse_add_rule---of 322
ima_parse_appraise_algos---of 10
ima_policy_next---of 11
ima_policy_show---of 116
ima_policy_start---of 24
ima_policy_stop---of 1
ima_rule_contains_lsm_cond---of 7
ima_update_policy---of 5
ima_update_policy_flags---of 21
uid_eq---of 1
uid_gt---of 1
uid_lt---of 1
vfsgid_eq_kgid---of 1
vfsgid_gt_kgid---of 1
vfsgid_lt_kgid---of 1
vfsuid_eq_kuid---of 1
vfsuid_gt_kuid---of 1
vfsuid_lt_kuid---of 1
-----------
SUMMARY22%of 123

__tcp_get_metrics11%of 28
read_seqbegin---of 10
tcp_fastopen_cache_get20%of 26
tcp_fastopen_cache_set---of 31
tcp_get_metrics21%of 34
tcp_init_metrics13%of 47
tcp_metrics_fill_info---of 33
tcp_metrics_flush_all---of 23
tcp_metrics_nl_cmd_del---of 40
tcp_metrics_nl_cmd_get---of 54
tcp_metrics_nl_dump---of 40
tcp_net_metrics_exit_batch---of 1
tcp_peer_is_proven---of 48
tcp_update_metrics16%of 57
tcpm_suck_dst67%of 3
-----------
SUMMARY17%of 195

-----------
SUMMARY---of 0

__bpf_trace_hugepage_set---of 1
__bpf_trace_hugepage_update---of 1
__bpf_trace_migration_pmd---of 1
__folio_rmap_sanity_checks---of 28
__pmd_trans_huge_lock---of 3
__probestub_hugepage_set_pmd---of 1
__probestub_hugepage_set_pud---of 1
__probestub_hugepage_update_pmd---of 1
__probestub_hugepage_update_pud---of 1
__probestub_remove_migration_pmd---of 1
__probestub_set_migration_pmd---of 1
__pud_trans_huge_lock---of 3
__split_huge_page---of 298
__split_huge_pmd---of 204
__split_huge_pud---of 12
__thp_vma_allowable_orders---of 36
__traceiter_hugepage_set_pmd---of 4
__traceiter_hugepage_set_pud---of 4
__traceiter_hugepage_update_pmd---of 4
__traceiter_hugepage_update_pud---of 4
__traceiter_remove_migration_pmd---of 4
__traceiter_set_migration_pmd---of 4
_compound_head---of 7
add_mm_counter---of 1
anon_fault_alloc_show---of 7
anon_fault_fallback_charge_show---of 7
anon_fault_fallback_show---of 7
can_change_pmd_writable---of 36
can_split_folio---of 33
change_huge_pmd---of 48
copy_huge_pmd---of 91
copy_huge_pud---of 6
current_gfp_context---of 5
deferred_split_count---of 1
deferred_split_folio---of 43
deferred_split_scan---of 32
defrag_show---of 5
defrag_store---of 7
do_huge_pmd_anonymous_page---of 85
do_huge_pmd_numa_page---of 44
do_huge_pmd_wp_page---of 146
enabled_show---of 3
enabled_store---of 7
file_thp_enabled---of 4
filemap_nr_thps_dec---of 6
folio_flags---of 10
folio_large_mapcount---of 9
folio_lock---of 9
folio_mapcount---of 9
folio_maybe_dma_pinned---of 9
folio_memcg---of 12
folio_nr_pages---of 9
folio_order---of 9
folio_put---of 4
folio_test_pmd_mappable---of 9
folio_try_share_anon_rmap_pmd---of 89
folio_undo_large_rmappable---of 16
follow_devmap_pmd---of 17
hpage_pmd_size_show---of 1
huge_pmd_set_accessed---of 4
huge_pud_set_accessed---of 4
madvise_free_huge_pmd---of 73
maybe_pmd_mkwrite---of 3
mm_get_huge_zero_folio---of 14
mm_put_huge_zero_folio---of 4
move_huge_pmd---of 23
move_pages_huge_pmd---of 120
perf_trace_hugepage_set---of 8
perf_trace_hugepage_update---of 8
perf_trace_migration_pmd---of 8
pfn_swap_entry_folio---of 17
pfn_swap_entry_to_page---of 18
pte_free---of 18
put_anon_vma---of 3
remap_page---of 21
remove_migration_pmd---of 72
set_huge_zero_folio---of 8
set_pmd_migration_entry---of 68
shrink_huge_zero_page_count---of 1
shrink_huge_zero_page_scan---of 6
single_hugepage_flag_show---of 1
single_hugepage_flag_store---of 5
split_huge_page_to_list_to_order---of 113
split_huge_pages_all---of 65
split_huge_pages_write---of 147
split_huge_pmd_address---of 3
swpout_fallback_show---of 7
swpout_show---of 7
thp_get_unmapped_area---of 1
thp_get_unmapped_area_vmflags19%of 11
thpsize_enabled_show---of 4
thpsize_enabled_store---of 6
thpsize_release---of 1
touch_pmd---of 3
touch_pud---of 3
trace_event_raw_event_hugepage_set---of 7
trace_event_raw_event_hugepage_update---of 7
trace_event_raw_event_migration_pmd---of 7
trace_raw_output_hugepage_set---of 3
trace_raw_output_hugepage_update---of 3
trace_raw_output_migration_pmd---of 3
unmap_folio---of 18
use_zero_page_show---of 1
use_zero_page_store---of 5
vma_adjust_trans_huge32%of 19
vma_thp_gfp_mask---of 7
vmf_insert_pfn_pmd---of 23
vmf_insert_pfn_pud---of 21
zap_huge_pmd---of 38
zap_huge_pud---of 9
-----------
SUMMARY27%of 30

-----------
SUMMARY---of 0

__percpu_counter_compare25%of 8
__percpu_counter_init_many---of 9
__percpu_counter_limited_add---of 35
__percpu_counter_sum---of 5
compute_batch_value---of 1
percpu_counter_add_batch64%of 11
percpu_counter_cpu_dead---of 6
percpu_counter_destroy_many---of 17
percpu_counter_fixup_free---of 3
percpu_counter_set---of 5
percpu_counter_sync---of 1
-----------
SUMMARY48%of 19

sctp_asconf_queue_teardown31%of 13
sctp_assoc_add_peer38%of 43
sctp_assoc_bh_rcv39%of 21
sctp_assoc_choose_alter_transport50%of 4
sctp_assoc_clean_asconf_ack_cache---of 7
sctp_assoc_control_transport---of 60
sctp_assoc_del_nonprimary_peers---of 6
sctp_assoc_del_peer---of 6
sctp_assoc_lookup_asconf_ack---of 7
sctp_assoc_lookup_laddr---of 3
sctp_assoc_lookup_paddr50%of 6
sctp_assoc_lookup_tsn---of 10
sctp_assoc_migrate---of 11
sctp_assoc_rm_peer---of 46
sctp_assoc_rwnd_decrease---of 13
sctp_assoc_rwnd_increase---of 18
sctp_assoc_set_bind_addr_from_cookie100%of 1
sctp_assoc_set_bind_addr_from_ep100%of 1
sctp_assoc_set_id44%of 16
sctp_assoc_set_pmtu63%of 8
sctp_assoc_set_primary34%of 9
sctp_assoc_sync_pmtu37%of 11
sctp_assoc_update---of 20
sctp_assoc_update_frag_point60%of 5
sctp_assoc_update_retran_path---of 25
sctp_association_free48%of 38
sctp_association_get_next_tsn---of 1
sctp_association_hold50%of 4
sctp_association_new36%of 25
sctp_association_put47%of 13
sctp_cmp_addr_exact67%of 3
sctp_get_ecne_prepend67%of 3
-----------
SUMMARY43%of 224

arch_stack_walk60%of 10
arch_stack_walk_reliable---of 11
arch_stack_walk_user---of 10
-----------
SUMMARY60%of 10

always_on---of 1
nlmon_close---of 1
nlmon_get_stats64---of 1
nlmon_open---of 1
nlmon_setup---of 1
nlmon_validate---of 1
nlmon_xmit67%of 3
-----------
SUMMARY67%of 3

bpf_bprm_opts_set---of 3
bpf_get_attach_cookie---of 1
bpf_ima_file_hash---of 1
bpf_ima_inode_hash---of 1
bpf_ima_inode_hash_allowed---of 1
bpf_lsm_audit_rule_free---of 1
bpf_lsm_audit_rule_init---of 1
bpf_lsm_audit_rule_known---of 1
bpf_lsm_audit_rule_match---of 1
bpf_lsm_binder_set_context_mgr---of 1
bpf_lsm_binder_transaction---of 1
bpf_lsm_binder_transfer_binder---of 1
bpf_lsm_binder_transfer_file---of 1
bpf_lsm_bpf100%of 1
bpf_lsm_bpf_map---of 1
bpf_lsm_bpf_map_create---of 1
bpf_lsm_bpf_map_free---of 1
bpf_lsm_bpf_prog100%of 1
bpf_lsm_bpf_prog_free---of 1
bpf_lsm_bpf_prog_load100%of 1
bpf_lsm_bpf_token_capable---of 1
bpf_lsm_bpf_token_cmd---of 1
bpf_lsm_bpf_token_create---of 1
bpf_lsm_bpf_token_free---of 1
bpf_lsm_bprm_check_security---of 1
bpf_lsm_bprm_committed_creds---of 1
bpf_lsm_bprm_committing_creds---of 1
bpf_lsm_bprm_creds_for_exec---of 1
bpf_lsm_bprm_creds_from_file---of 1
bpf_lsm_capable100%of 1
bpf_lsm_capget---of 1
bpf_lsm_capset---of 1
bpf_lsm_cred_alloc_blank---of 1
bpf_lsm_cred_free---of 1
bpf_lsm_cred_getsecid---of 1
bpf_lsm_cred_prepare---of 1
bpf_lsm_cred_transfer---of 1
bpf_lsm_current_getsecid_subj100%of 1
bpf_lsm_d_instantiate100%of 1
bpf_lsm_dentry_create_files_as---of 1
bpf_lsm_dentry_init_security---of 1
bpf_lsm_file_alloc_security100%of 1
bpf_lsm_file_fcntl---of 1
bpf_lsm_file_free_security100%of 1
bpf_lsm_file_ioctl100%of 1
bpf_lsm_file_ioctl_compat---of 1
bpf_lsm_file_lock---of 1
bpf_lsm_file_mprotect---of 1
bpf_lsm_file_open100%of 1
bpf_lsm_file_permission100%of 1
bpf_lsm_file_post_open100%of 1
bpf_lsm_file_receive---of 1
bpf_lsm_file_release100%of 1
bpf_lsm_file_send_sigiotask---of 1
bpf_lsm_file_set_fowner---of 1
bpf_lsm_file_truncate---of 1
bpf_lsm_find_cgroup_shim---of 5
bpf_lsm_fs_context_dup---of 1
bpf_lsm_fs_context_parse_param---of 1
bpf_lsm_fs_context_submount---of 1
bpf_lsm_func_proto---of 21
bpf_lsm_getprocattr---of 1
bpf_lsm_getselfattr---of 1
bpf_lsm_ib_alloc_security---of 1
bpf_lsm_ib_endport_manage_subnet---of 1
bpf_lsm_ib_free_security---of 1
bpf_lsm_ib_pkey_access---of 1
bpf_lsm_inet_conn_established---of 1
bpf_lsm_inet_conn_request---of 1
bpf_lsm_inet_csk_clone---of 1
bpf_lsm_inode_alloc_security100%of 1
bpf_lsm_inode_copy_up---of 1
bpf_lsm_inode_copy_up_xattr---of 1
bpf_lsm_inode_create---of 1
bpf_lsm_inode_follow_link100%of 1
bpf_lsm_inode_free_security100%of 1
bpf_lsm_inode_get_acl---of 1
bpf_lsm_inode_getattr---of 1
bpf_lsm_inode_getsecctx---of 1
bpf_lsm_inode_getsecid---of 1
bpf_lsm_inode_getsecurity---of 1
bpf_lsm_inode_getxattr---of 1
bpf_lsm_inode_init_security---of 1
bpf_lsm_inode_init_security_anon---of 1
bpf_lsm_inode_invalidate_secctx---of 1
bpf_lsm_inode_killpriv---of 1
bpf_lsm_inode_link---of 1
bpf_lsm_inode_listsecurity---of 1
bpf_lsm_inode_listxattr---of 1
bpf_lsm_inode_mkdir100%of 1
bpf_lsm_inode_mknod---of 1
bpf_lsm_inode_need_killpriv---of 1
bpf_lsm_inode_notifysecctx---of 1
bpf_lsm_inode_permission100%of 1
bpf_lsm_inode_post_create_tmpfile---of 1
bpf_lsm_inode_post_remove_acl---of 1
bpf_lsm_inode_post_removexattr---of 1
bpf_lsm_inode_post_set_acl---of 1
bpf_lsm_inode_post_setattr---of 1
bpf_lsm_inode_post_setxattr---of 1
bpf_lsm_inode_readlink---of 1
bpf_lsm_inode_remove_acl---of 1
bpf_lsm_inode_removexattr---of 1
bpf_lsm_inode_rename---of 1
bpf_lsm_inode_rmdir---of 1
bpf_lsm_inode_set_acl---of 1
bpf_lsm_inode_setattr---of 1
bpf_lsm_inode_setsecctx---of 1
bpf_lsm_inode_setsecurity---of 1
bpf_lsm_inode_setxattr---of 1
bpf_lsm_inode_symlink---of 1
bpf_lsm_inode_unlink---of 1
bpf_lsm_ipc_getsecid---of 1
bpf_lsm_ipc_permission---of 1
bpf_lsm_is_sleepable_hook---of 1
bpf_lsm_is_trusted---of 1
bpf_lsm_ismaclabel---of 1
bpf_lsm_kernel_act_as---of 1
bpf_lsm_kernel_create_files_as---of 1
bpf_lsm_kernel_load_data---of 1
bpf_lsm_kernel_module_request---of 1
bpf_lsm_kernel_post_load_data---of 1
bpf_lsm_kernel_post_read_file---of 1
bpf_lsm_kernel_read_file---of 1
bpf_lsm_kernfs_init_security100%of 1
bpf_lsm_key_alloc---of 1
bpf_lsm_key_free---of 1
bpf_lsm_key_getsecurity---of 1
bpf_lsm_key_permission---of 1
bpf_lsm_key_post_create_or_update---of 1
bpf_lsm_locked_down---of 1
bpf_lsm_mmap_addr100%of 1
bpf_lsm_mmap_file100%of 1
bpf_lsm_move_mount---of 1
bpf_lsm_mptcp_add_subflow---of 1
bpf_lsm_msg_msg_alloc_security---of 1
bpf_lsm_msg_msg_free_security---of 1
bpf_lsm_msg_queue_alloc_security---of 1
bpf_lsm_msg_queue_associate---of 1
bpf_lsm_msg_queue_free_security---of 1
bpf_lsm_msg_queue_msgctl---of 1
bpf_lsm_msg_queue_msgrcv---of 1
bpf_lsm_msg_queue_msgsnd---of 1
bpf_lsm_netlink_send100%of 1
bpf_lsm_path_chmod---of 1
bpf_lsm_path_chown---of 1
bpf_lsm_path_chroot---of 1
bpf_lsm_path_link---of 1
bpf_lsm_path_mkdir100%of 1
bpf_lsm_path_mknod---of 1
bpf_lsm_path_notify---of 1
bpf_lsm_path_post_mknod---of 1
bpf_lsm_path_rename---of 1
bpf_lsm_path_rmdir---of 1
bpf_lsm_path_symlink---of 1
bpf_lsm_path_truncate---of 1
bpf_lsm_path_unlink---of 1
bpf_lsm_perf_event_alloc---of 1
bpf_lsm_perf_event_free---of 1
bpf_lsm_perf_event_open---of 1
bpf_lsm_perf_event_read---of 1
bpf_lsm_perf_event_write---of 1
bpf_lsm_post_notification---of 1
bpf_lsm_ptrace_access_check---of 1
bpf_lsm_ptrace_traceme---of 1
bpf_lsm_quota_on---of 1
bpf_lsm_quotactl---of 1
bpf_lsm_release_secctx---of 1
bpf_lsm_req_classify_flow---of 1
bpf_lsm_sb_alloc_security---of 1
bpf_lsm_sb_clone_mnt_opts---of 1
bpf_lsm_sb_delete---of 1
bpf_lsm_sb_eat_lsm_opts---of 1
bpf_lsm_sb_free_mnt_opts---of 1
bpf_lsm_sb_free_security---of 1
bpf_lsm_sb_kern_mount---of 1
bpf_lsm_sb_mnt_opts_compat---of 1
bpf_lsm_sb_mount---of 1
bpf_lsm_sb_pivotroot---of 1
bpf_lsm_sb_remount---of 1
bpf_lsm_sb_set_mnt_opts---of 1
bpf_lsm_sb_show_options---of 1
bpf_lsm_sb_statfs---of 1
bpf_lsm_sb_umount---of 1
bpf_lsm_sctp_assoc_established---of 1
bpf_lsm_sctp_assoc_request100%of 1
bpf_lsm_sctp_bind_connect100%of 1
bpf_lsm_sctp_sk_clone---of 1
bpf_lsm_secctx_to_secid---of 1
bpf_lsm_secid_to_secctx---of 1
bpf_lsm_secmark_refcount_dec---of 1
bpf_lsm_secmark_refcount_inc---of 1
bpf_lsm_secmark_relabel_packet---of 1
bpf_lsm_sem_alloc_security---of 1
bpf_lsm_sem_associate---of 1
bpf_lsm_sem_free_security---of 1
bpf_lsm_sem_semctl---of 1
bpf_lsm_sem_semop---of 1
bpf_lsm_setprocattr---of 1
bpf_lsm_setselfattr---of 1
bpf_lsm_settime---of 1
bpf_lsm_shm_alloc_security---of 1
bpf_lsm_shm_associate---of 1
bpf_lsm_shm_free_security---of 1
bpf_lsm_shm_shmat---of 1
bpf_lsm_shm_shmctl---of 1
bpf_lsm_sk_alloc_security100%of 1
bpf_lsm_sk_clone_security100%of 1
bpf_lsm_sk_free_security---of 1
bpf_lsm_sk_getsecid100%of 1
bpf_lsm_sock_graft100%of 1
bpf_lsm_socket_accept100%of 1
bpf_lsm_socket_bind100%of 1
bpf_lsm_socket_connect100%of 1
bpf_lsm_socket_create100%of 1
bpf_lsm_socket_getpeername100%of 1
bpf_lsm_socket_getpeersec_dgram100%of 1
bpf_lsm_socket_getpeersec_stream---of 1
bpf_lsm_socket_getsockname100%of 1
bpf_lsm_socket_getsockopt100%of 1
bpf_lsm_socket_listen100%of 1
bpf_lsm_socket_post_create100%of 1
bpf_lsm_socket_recvmsg100%of 1
bpf_lsm_socket_sendmsg100%of 1
bpf_lsm_socket_setsockopt100%of 1
bpf_lsm_socket_shutdown100%of 1
bpf_lsm_socket_sock_rcv_skb100%of 1
bpf_lsm_socket_socketpair100%of 1
bpf_lsm_syslog---of 1
bpf_lsm_task_alloc---of 1
bpf_lsm_task_fix_setgid---of 1
bpf_lsm_task_fix_setgroups---of 1
bpf_lsm_task_fix_setuid---of 1
bpf_lsm_task_free---of 1
bpf_lsm_task_getioprio---of 1
bpf_lsm_task_getpgid---of 1
bpf_lsm_task_getscheduler---of 1
bpf_lsm_task_getsecid_obj---of 1
bpf_lsm_task_getsid---of 1
bpf_lsm_task_kill---of 1
bpf_lsm_task_movememory---of 1
bpf_lsm_task_prctl---of 1
bpf_lsm_task_prlimit---of 1
bpf_lsm_task_setioprio---of 1
bpf_lsm_task_setnice---of 1
bpf_lsm_task_setpgid---of 1
bpf_lsm_task_setrlimit---of 1
bpf_lsm_task_setscheduler---of 1
bpf_lsm_task_to_inode100%of 1
bpf_lsm_tun_dev_alloc_security---of 1
bpf_lsm_tun_dev_attach---of 1
bpf_lsm_tun_dev_attach_queue---of 1
bpf_lsm_tun_dev_create---of 1
bpf_lsm_tun_dev_free_security---of 1
bpf_lsm_tun_dev_open---of 1
bpf_lsm_unix_may_send---of 1
bpf_lsm_unix_stream_connect---of 1
bpf_lsm_uring_cmd---of 1
bpf_lsm_uring_override_creds---of 1
bpf_lsm_uring_sqpoll---of 1
bpf_lsm_userns_create---of 1
bpf_lsm_verify_prog---of 4
bpf_lsm_vm_enough_memory---of 1
bpf_lsm_watch_key---of 1
bpf_lsm_xfrm_decode_session100%of 1
bpf_lsm_xfrm_policy_alloc_security---of 1
bpf_lsm_xfrm_policy_clone_security---of 1
bpf_lsm_xfrm_policy_delete_security---of 1
bpf_lsm_xfrm_policy_free_security---of 1
bpf_lsm_xfrm_policy_lookup100%of 1
bpf_lsm_xfrm_state_alloc---of 1
bpf_lsm_xfrm_state_alloc_acquire---of 1
bpf_lsm_xfrm_state_delete_security---of 1
bpf_lsm_xfrm_state_free_security---of 1
bpf_lsm_xfrm_state_pol_flow_match---of 1
btf_id_cmp_func---of 1
-----------
SUMMARY100%of 48

__ref_tracker_dir_pr_ostream---of 30
ref_tracker_alloc39%of 13
ref_tracker_dir_exit---of 16
ref_tracker_dir_print---of 1
ref_tracker_dir_print_locked---of 1
ref_tracker_dir_snprint---of 1
ref_tracker_free17%of 18
refcount_inc---of 4
-----------
SUMMARY26%of 31

tick_init_highres---of 1
tick_oneshot_mode_active---of 9
tick_program_event50%of 4
tick_resume_oneshot---of 1
tick_setup_oneshot---of 1
tick_switch_to_oneshot---of 7
-----------
SUMMARY50%of 4

__kernfs_iattrs---of 4
__kernfs_setattr---of 14
kernfs_evict_inode---of 1
kernfs_get_inode34%of 12
kernfs_iop_getattr---of 5
kernfs_iop_listxattr---of 5
kernfs_iop_permission67%of 6
kernfs_iop_setattr---of 16
kernfs_setattr---of 14
kernfs_vfs_user_xattr_set---of 11
kernfs_vfs_xattr_get---of 3
kernfs_vfs_xattr_set---of 4
kernfs_xattr_get---of 3
kernfs_xattr_set---of 4
-----------
SUMMARY45%of 18

__sctp_rcv_lookup_endpoint---of 10
jhash18%of 17
nf_reset_ct---of 5
rhltable_lookup24%of 30
rht_lock34%of 9
rht_unlock---of 10
sctp_add_backlog---of 5
sctp_addrs_lookup_transport19%of 11
sctp_backlog_rcv20%of 15
sctp_csum_combine---of 1
sctp_csum_update---of 1
sctp_epaddr_lookup_transport40%of 5
sctp_err_finish---of 1
sctp_err_lookup---of 32
sctp_has_association34%of 15
sctp_hash_cmp---of 5
sctp_hash_endpoint---of 18
sctp_hash_key67%of 3
sctp_hash_obj---of 3
sctp_hash_transport19%of 108
sctp_icmp_frag_needed---of 16
sctp_icmp_proto_unreachable---of 12
sctp_icmp_redirect---of 15
sctp_rcv---of 95
sctp_rcv_checksum---of 5
sctp_rcv_ootb---of 16
sctp_sk_bound_dev_eq---of 3
sctp_transport_hashtable_destroy---of 1
sctp_transport_hashtable_init---of 1
sctp_udp_v4_err---of 8
sctp_unhash_endpoint---of 6
sctp_unhash_transport3%of 83
sctp_v4_err---of 8
sctp_v4_err_handle---of 17
sk_add_backlog---of 21
sock_owned_by_user---of 5
xfrm_policy_check---of 35
-----------
SUMMARY17%of 296

__rtnl_link_register---of 12
__rtnl_link_unregister---of 11
__rtnl_unlock---of 6
do_set_proto_down---of 16
do_setlink---of 210
fdb_vid_parse---of 7
if_nlmsg_size---of 49
if_nlmsg_stats_size---of 34
lockdep_rtnl_is_held100%of 1
ndo_dflt_bridge_getlink---of 59
ndo_dflt_fdb_add---of 9
ndo_dflt_fdb_del---of 5
ndo_dflt_fdb_dump---of 14
netdev_set_operstate---of 5
nla_nest_cancel---of 3
nla_put_ifalias---of 3
nla_put_string---of 1
nlmsg_parse_deprecated_strict---of 4
nlmsg_populate_fdb_fill---of 10
put_master_ifindex---of 13
rcu_read_unlock---of 6
refcount_dec_and_rtnl_lock---of 1
rtmsg_ifinfo---of 5
rtmsg_ifinfo_build_skb---of 8
rtmsg_ifinfo_newnet---of 4
rtmsg_ifinfo_send---of 3
rtnetlink_bind---of 4
rtnetlink_event---of 18
rtnetlink_net_exit---of 1
rtnetlink_net_init---of 3
rtnetlink_put_metrics---of 18
rtnetlink_rcv100%of 1
rtnetlink_rcv_msg---of 73
rtnetlink_send---of 1
rtnl_af_register---of 3
rtnl_af_unregister---of 3
rtnl_bridge_dellink---of 23
rtnl_bridge_getlink---of 50
rtnl_bridge_notify---of 8
rtnl_bridge_setlink---of 27
rtnl_configure_link---of 11
rtnl_create_link---of 43
rtnl_delete_link---of 4
rtnl_dellink---of 30
rtnl_dellinkprop---of 1
rtnl_dump_all---of 23
rtnl_dump_ifinfo---of 55
rtnl_dumpit---of 8
rtnl_fdb_add---of 29
rtnl_fdb_del---of 38
rtnl_fdb_dump---of 60
rtnl_fdb_get---of 53
rtnl_fdb_notify---of 4
rtnl_fill_devlink_port---of 6
rtnl_fill_dpll_pin---of 3
rtnl_fill_ifinfo---of 96
rtnl_fill_link_af---of 11
rtnl_fill_link_ifmap---of 1
rtnl_fill_link_netnsid---of 8
rtnl_fill_prop_list---of 9
rtnl_fill_proto_down---of 7
rtnl_fill_stats---of 4
rtnl_fill_statsinfo---of 108
rtnl_fill_vf---of 13
rtnl_fill_vfinfo---of 34
rtnl_get_link---of 20
rtnl_get_net_ns_capable---of 6
rtnl_getlink---of 61
rtnl_group_dellink---of 14
rtnl_have_link_slave_info---of 13
rtnl_is_locked---of 1
rtnl_kfree_skbs---of 3
rtnl_link_fill---of 22
rtnl_link_get_net---of 6
rtnl_link_get_net_capable---of 13
rtnl_link_register---of 14
rtnl_link_unregister---of 19
rtnl_linkprop---of 32
rtnl_lock---of 1
rtnl_lock_killable---of 1
rtnl_mdb_add---of 13
rtnl_mdb_del---of 20
rtnl_mdb_dump---of 16
rtnl_mdb_get---of 13
rtnl_newlink---of 129
rtnl_newlinkprop---of 1
rtnl_nla_parse_ifinfomsg---of 4
rtnl_notify---of 3
rtnl_offload_xstats_notify---of 7
rtnl_phys_port_id_fill---of 4
rtnl_phys_port_name_fill---of 4
rtnl_phys_switch_id_fill---of 4
rtnl_port_fill---of 25
rtnl_prop_list_size---of 12
rtnl_put_cacheinfo---of 5
rtnl_register---of 3
rtnl_register_internal---of 32
rtnl_register_module---of 1
rtnl_set_sk_err---of 1
rtnl_setlink---of 21
rtnl_stats_dump---of 24
rtnl_stats_get---of 20
rtnl_stats_get_parse---of 25
rtnl_stats_set---of 26
rtnl_trylock---of 1
rtnl_unicast---of 1
rtnl_unlock---of 1
rtnl_unregister---of 16
rtnl_unregister_all---of 17
rtnl_validate_mdb_entry---of 24
rtnl_validate_mdb_entry_del_bulk---of 12
rtnl_validate_mdb_entry_get---of 16
rtnl_xdp_fill---of 34
set_operstate---of 14
validate_linkmsg---of 44
-----------
SUMMARY100%of 2

-----------
SUMMARY---of 0

ip6table_mangle_hook30%of 10
ip6table_mangle_net_exit---of 1
ip6table_mangle_net_pre_exit---of 1
ip6table_mangle_table_init---of 3
-----------
SUMMARY30%of 10

__dst_destroy_metrics_generic---of 3
dst_alloc50%of 6
dst_blackhole_check---of 1
dst_blackhole_cow_metrics---of 1
dst_blackhole_mtu---of 3
dst_blackhole_neigh_lookup---of 1
dst_blackhole_redirect---of 1
dst_blackhole_update_pmtu---of 1
dst_cow_metrics_generic---of 7
dst_destroy---of 23
dst_destroy_rcu---of 1
dst_dev_put---of 14
dst_discard---of 1
dst_discard_out---of 1
dst_init43%of 7
dst_release75%of 4
dst_release_immediate---of 4
metadata_dst_alloc---of 3
metadata_dst_alloc_percpu---of 6
metadata_dst_free---of 7
metadata_dst_free_percpu---of 11
rcuref_put42%of 12
-----------
SUMMARY49%of 29

-----------
SUMMARY---of 0

collect_domain_accesses---of 10
current_check_refer_path---of 49
find_rule---of 17
hook_file_alloc_security100%of 1
hook_file_ioctl12%of 18
hook_file_ioctl_compat---of 18
hook_file_open12%of 18
hook_file_truncate---of 1
hook_inode_free_security67%of 3
hook_move_mount---of 11
hook_path_link---of 1
hook_path_mkdir19%of 11
hook_path_mknod---of 19
hook_path_rename---of 1
hook_path_rmdir---of 11
hook_path_symlink---of 11
hook_path_truncate---of 11
hook_path_unlink---of 11
hook_sb_delete---of 39
hook_sb_mount---of 11
hook_sb_pivotroot---of 11
hook_sb_remount---of 11
hook_sb_umount---of 11
is_access_to_paths_allowed---of 36
landlock_append_fs_rule---of 39
release_inode---of 4
scope_to_request---of 6
-----------
SUMMARY18%of 51

-----------
SUMMARY---of 0

__skb_pull---of 3
__tcp_ack_snd_check31%of 33
clean_acked_data_disable---of 1
clean_acked_data_enable---of 1
clean_acked_data_flush---of 1
pr_drop_req---of 8
sk_wake_async12%of 17
sock_owned_by_me---of 5
sock_owned_by_user---of 5
tcp_ack38%of 207
tcp_ack_update_rtt57%of 25
tcp_add_reno_sack---of 9
tcp_call_bpf---of 20
tcp_check_space8%of 27
tcp_check_urg34%of 15
tcp_clear_retrans100%of 1
tcp_collapse---of 64
tcp_conn_request---of 109
tcp_cwnd_reduction---of 7
tcp_data_queue11%of 186
tcp_data_ready13%of 16
tcp_data_snd_check67%of 3
tcp_do_parse_auth_options---of 16
tcp_done_with_error---of 3
tcp_drop_reason---of 1
tcp_ecn_check_ce15%of 14
tcp_enter_cwr---of 4
tcp_enter_loss---of 36
tcp_enter_recovery---of 6
tcp_event_data_recv32%of 38
tcp_fastretrans_alert---of 168
tcp_fin29%of 14
tcp_fin_time---of 3
tcp_finish_connect---of 13
tcp_get_syncookie_mss---of 18
tcp_gro_dev_warn---of 14
tcp_grow_window27%of 23
tcp_hdrlen---of 3
tcp_init_cwnd---of 4
tcp_init_transfer29%of 49
tcp_initialize_rcv_mss100%of 1
tcp_mark_head_lost---of 19
tcp_mark_skb_lost---of 9
tcp_mtup_probe_failed---of 1
tcp_mtup_probe_success---of 9
tcp_oow_rate_limited---of 11
tcp_parse_mss_option---of 11
tcp_parse_options32%of 47
tcp_process_tlp_ack---of 24
tcp_prune_ofo_queue---of 19
tcp_queue_rcv54%of 13
tcp_rbtree_insert100%of 4
tcp_rcv_established17%of 87
tcp_rcv_fastopen_synack---of 14
tcp_rcv_rtt_measure_ts---of 7
tcp_rcv_space_adjust24%of 21
tcp_rcv_spurious_retrans---of 9
tcp_rcv_state_process18%of 143
tcp_rcv_synrecv_state_fastopen---of 12
tcp_rearm_rto55%of 11
tcp_reset---of 23
tcp_rto_min_us28%of 11
tcp_rtx_queue_unlink_and_free34%of 15
tcp_sack_compress_send_ack---of 5
tcp_sacktag_one---of 31
tcp_sacktag_walk---of 107
tcp_sacktag_write_queue---of 168
tcp_send_challenge_ack30%of 10
tcp_send_dupack---of 10
tcp_send_rcvq---of 10
tcp_shifted_skb---of 22
tcp_simple_retransmit---of 20
tcp_skb_shift---of 4
tcp_syn_flood_action---of 9
tcp_synack_rtt_meas---of 5
tcp_try_coalesce40%of 15
tcp_try_rmem_schedule6%of 58
tcp_try_undo_dsack---of 8
tcp_try_undo_loss---of 17
tcp_try_undo_recovery---of 20
tcp_undo_cwnd_reduction---of 10
tcp_update_pacing_rate67%of 3
tcp_urg63%of 8
tcp_validate_incoming10%of 95
-----------
SUMMARY24%of 1210

__do_replace---of 28
alloc_counters---of 21
cleanup_entry---of 8
compat_copy_entries_to_user---of 14
compat_standard_from_user---of 3
compat_standard_to_user---of 3
compat_table_info---of 30
copy_from_sockptr_offset---of 4
do_ipt_get_ctl---of 48
do_ipt_set_ctl---of 42
ip_tables_net_exit---of 1
ip_tables_net_init---of 1
ipt_alloc_initial_table---of 10
ipt_do_table29%of 42
ipt_error---of 3
ipt_register_table---of 22
ipt_unregister_table_exit---of 7
ipt_unregister_table_pre_exit---of 3
trace_packet---of 13
translate_compat_table---of 60
translate_table---of 70
-----------
SUMMARY29%of 42

__ia32_sys_map_shadow_stack---of 9
__x64_sys_map_shadow_stack---of 9
alloc_shstk---of 14
reset_thread_features---of 1
restore_signal_shadow_stack---of 27
setup_signal_shadow_stack15%of 14
shstk_alloc_thread_stack---of 8
shstk_disable---of 9
shstk_free---of 12
shstk_prctl---of 13
shstk_setup---of 11
wrss_control---of 13
-----------
SUMMARY15%of 14

-----------
SUMMARY---of 0

getorigdst---of 10
ipv4_conntrack_in---of 1
ipv4_conntrack_local29%of 7
ipv6_conntrack_in100%of 1
ipv6_conntrack_local100%of 1
ipv6_getorigdst---of 17
nf_confirm18%of 57
nf_conntrack_confirm---of 8
nf_conntrack_proto_fini---of 1
nf_conntrack_proto_init---of 4
nf_conntrack_proto_pernet_init---of 1
nf_ct_bridge_register---of 3
nf_ct_bridge_unregister---of 3
nf_ct_l4proto_find---of 10
nf_ct_l4proto_log_invalid67%of 3
nf_ct_netns_do_get---of 17
nf_ct_netns_get---of 6
nf_ct_netns_inet_get---of 6
nf_ct_netns_put---of 19
nf_ct_pernet---of 16
nf_ct_tcp_fixup---of 5
nf_l4proto_log_invalid---of 3
-----------
SUMMARY24%of 69

-----------
SUMMARY---of 0

____napi_schedule36%of 14
__dev_change_flags---of 19
__dev_change_net_namespace---of 70
__dev_close_many---of 12
__dev_direct_xmit---of 20
__dev_forward_skb---of 1
__dev_forward_skb2---of 18
__dev_get_by_flags---of 7
__dev_get_by_index---of 5
__dev_get_by_name---of 4
__dev_notify_flags---of 8
__dev_open---of 16
__dev_queue_xmit10%of 213
__dev_remove_pack---of 7
__dev_set_allmulti---of 12
__dev_set_mtu---of 3
__dev_set_promiscuity---of 14
__dev_set_rx_mode---of 10
__get_xps_queue_idx---of 14
__napi_busy_loop---of 52
__napi_poll---of 17
__napi_schedule---of 11
__napi_schedule_irqoff---of 3
__netdev_adjacent_dev_insert---of 27
__netdev_adjacent_dev_remove---of 20
__netdev_adjacent_dev_unlink_neighbour---of 1
__netdev_has_upper_dev---of 14
__netdev_notify_peers---of 4
__netdev_printk---of 24
__netdev_update_features---of 109
__netdev_update_lower_level---of 15
__netdev_update_upper_level---of 8
__netdev_upper_dev_link---of 22
__netdev_upper_dev_unlink---of 31
__netdev_walk_all_lower_dev---of 17
__netdev_walk_all_upper_dev---of 12
__netif_napi_del---of 33
__netif_receive_skb28%of 11
__netif_receive_skb_core12%of 225
__netif_receive_skb_list_core---of 27
__netif_rx50%of 4
__netif_schedule---of 12
__netif_set_xps_queue---of 145
alloc_netdev_dummy---of 1
alloc_netdev_mqs---of 20
backlog_napi_setup---of 3
backlog_napi_should_run---of 3
bpf_prog_run_generic_xdp---of 38
bpf_xdp_link_attach---of 22
bpf_xdp_link_dealloc---of 1
bpf_xdp_link_detach---of 1
bpf_xdp_link_fill_link_info---of 3
bpf_xdp_link_release---of 17
bpf_xdp_link_show_fdinfo---of 3
bpf_xdp_link_update---of 15
busy_poll_stop---of 17
call_netdevice_notifiers---of 1
call_netdevice_notifiers_info---of 6
call_netdevice_register_net_notifiers---of 11
clean_xps_maps---of 26
cpu_online---of 3
default_device_exit_batch---of 37
deliver_ptype_list_skb---of 14
dev_add_pack---of 5
dev_alloc_name---of 1
dev_change_carrier---of 4
dev_change_flags---of 3
dev_change_name---of 21
dev_change_proto_down---of 5
dev_change_proto_down_reason---of 12
dev_change_tx_queue_len---of 6
dev_change_xdp_fd---of 18
dev_close---of 6
dev_close_many---of 13
dev_cpu_dead---of 35
dev_disable_lro---of 11
dev_fetch_sw_netstats---of 5
dev_fill_forward_path---of 14
dev_fill_metadata_dst---of 19
dev_forward_skb---of 3
dev_forward_skb_nomtu---of 3
dev_get_alias---of 18
dev_get_by_index---of 17
dev_get_by_index_rcu---of 5
dev_get_by_name---of 17
dev_get_by_name_rcu---of 4
dev_get_by_napi_id---of 8
dev_get_flags---of 3
dev_get_iflink---of 4
dev_get_mac_address---of 16
dev_get_phys_port_id---of 3
dev_get_phys_port_name---of 4
dev_get_port_parent_id---of 14
dev_get_stats---of 18
dev_get_tstats64---of 3
dev_getbyhwaddr_rcu---of 7
dev_getfirstbyhwtype---of 17
dev_hard_start_xmit30%of 40
dev_index_release---of 3
dev_index_reserve---of 6
dev_ingress_queue_create---of 9
dev_kfree_skb_any_reason---of 4
dev_kfree_skb_irq_reason---of 11
dev_loopback_xmit---of 12
dev_nit_active67%of 3
dev_open---of 4
dev_pick_tx_cpu_id---of 1
dev_pick_tx_zero---of 1
dev_pre_changeaddr_notify---of 1
dev_prep_valid_name---of 32
dev_qdisc_enqueue---of 16
dev_queue_xmit_nit23%of 44
dev_remove_pack---of 9
dev_set_alias---of 12
dev_set_allmulti---of 1
dev_set_group---of 1
dev_set_mac_address---of 8
dev_set_mac_address_user---of 1
dev_set_mtu---of 5
dev_set_mtu_ext---of 16
dev_set_promiscuity---of 4
dev_set_rx_mode---of 10
dev_set_threaded---of 15
dev_valid_name---of 10
dev_validate_mtu---of 8
dev_xdp_attach---of 90
dev_xdp_install---of 10
dev_xdp_prog---of 5
dev_xdp_prog_count---of 10
dev_xdp_prog_id---of 7
do_netdev_rx_csum_fault---of 1
do_xdp_generic---of 32
enqueue_to_backlog13%of 41
flush_backlog---of 31
free_netdev---of 17
generic_xdp_install---of 15
generic_xdp_tx---of 11
get_rps_cpu---of 42
init_dummy_netdev---of 1
init_dummy_netdev_core---of 1
is_skb_forwardable---of 4
kick_defer_list_purge---of 6
list_netdevice---of 17
napi_busy_loop---of 11
napi_busy_loop_rcu---of 1
napi_by_id---of 5
napi_complete_done---of 28
napi_disable---of 6
napi_enable---of 6
napi_schedule---of 6
napi_schedule_prep---of 5
napi_schedule_rps30%of 10
napi_threaded_poll---of 8
napi_threaded_poll_loop---of 26
napi_watchdog---of 5
net_dec_egress_queue---of 1
net_dec_ingress_queue---of 1
net_disable_timestamp---of 5
net_enable_timestamp---of 5
net_inc_egress_queue---of 1
net_inc_ingress_queue---of 1
net_rps_action_and_irq_enable---of 9
net_rx_action---of 46
net_tx_action---of 49
netdev_adjacent_change_abort---of 11
netdev_adjacent_change_commit---of 10
netdev_adjacent_change_prepare---of 20
netdev_adjacent_get_private---of 1
netdev_adjacent_rename_links---of 11
netdev_alert---of 1
netdev_bind_sb_channel_queue---of 11
netdev_bonding_info_change---of 1
netdev_change_features---of 1
netdev_cmd_to_name---of 42
netdev_copy_name---of 12
netdev_core_pick_tx37%of 11
netdev_core_stats_alloc---of 4
netdev_core_stats_inc---of 4
netdev_crit---of 1
netdev_drivername---of 5
netdev_emerg---of 1
netdev_err---of 1
netdev_exit---of 4
netdev_features_change---of 1
netdev_freemem---of 1
netdev_get_by_index---of 4
netdev_get_by_name---of 4
netdev_get_name---of 15
netdev_get_xmit_slave---of 3
netdev_has_any_upper_dev---of 4
netdev_has_upper_dev---of 16
netdev_has_upper_dev_all_rcu---of 13
netdev_hold---of 5
netdev_increment_features---of 1
netdev_info---of 1
netdev_init---of 4
netdev_init_one_queue---of 9
netdev_is_rx_handler_busy---of 10
netdev_lower_dev_get_private---of 6
netdev_lower_get_first_private_rcu---of 4
netdev_lower_get_next---of 3
netdev_lower_get_next_private---of 3
netdev_lower_get_next_private_rcu---of 6
netdev_lower_state_changed---of 4
netdev_master_upper_dev_get---of 7
netdev_master_upper_dev_get_rcu---of 5
netdev_master_upper_dev_link---of 1
netdev_name_in_use---of 4
netdev_name_node_alt_create---of 8
netdev_name_node_alt_destroy---of 9
netdev_name_node_alt_free---of 1
netdev_next_lower_dev_rcu---of 3
netdev_notice---of 1
netdev_notify_peers---of 1
netdev_offload_xstats_disable---of 10
netdev_offload_xstats_enable---of 15
netdev_offload_xstats_enabled---of 6
netdev_offload_xstats_get---of 13
netdev_offload_xstats_push_delta---of 7
netdev_offload_xstats_report_delta---of 1
netdev_offload_xstats_report_used---of 1
netdev_pick_tx---of 55
netdev_port_same_parent_id---of 5
netdev_printk---of 1
netdev_refcnt_read---of 1
netdev_reg_state---of 9
netdev_reset_tc---of 10
netdev_run_todo---of 54
netdev_rx_csum_fault---of 3
netdev_rx_handler_register---of 4
netdev_rx_handler_unregister---of 6
netdev_set_default_ethtool_ops---of 3
netdev_set_num_tc---of 11
netdev_set_sb_channel---of 4
netdev_set_tc_queue---of 7
netdev_sk_get_lowest_dev---of 6
netdev_state_change---of 3
netdev_stats_to_stats64---of 3
netdev_sw_irq_coalesce_default_on---of 3
netdev_txq_to_tc---of 18
netdev_unbind_sb_channel---of 6
netdev_update_features---of 3
netdev_upper_dev_link---of 1
netdev_upper_dev_unlink---of 1
netdev_upper_get_next_dev_rcu---of 6
netdev_walk_all_lower_dev---of 11
netdev_walk_all_lower_dev_rcu---of 11
netdev_walk_all_upper_dev_rcu---of 14
netdev_warn---of 1
netdev_xmit_skip_txqueue---of 1
netif_device_attach---of 6
netif_device_detach---of 6
netif_get_num_default_rss_queues---of 8
netif_inherit_tso_max---of 8
netif_napi_add_weight---of 20
netif_queue_set_napi---of 12
netif_receive_skb22%of 50
netif_receive_skb_core---of 15
netif_receive_skb_list---of 34
netif_receive_skb_list_internal---of 48
netif_reset_xps_queues_gt---of 4
netif_rx---of 5
netif_rx_internal22%of 32
netif_schedule_queue---of 17
netif_set_real_num_queues---of 17
netif_set_real_num_rx_queues---of 9
netif_set_real_num_tx_queues---of 32
netif_set_tso_max_segs---of 3
netif_set_tso_max_size---of 6
netif_set_xps_queue---of 1
netif_skb_features22%of 41
netif_stacked_transfer_operstate---of 11
netif_tx_stop_all_queues---of 4
netif_tx_wake_queue12%of 17
netstamp_clear---of 3
passthru_features_check---of 1
process_backlog---of 77
qdisc_run---of 10
qdisc_run_end---of 4
rcu_read_unlock---of 6
refcount_dec_and_test---of 4
register_netdev---of 3
register_netdevice---of 82
register_netdevice_notifier---of 13
register_netdevice_notifier_dev_net---of 7
register_netdevice_notifier_net---of 5
remove_xps_queue---of 12
rps_may_expire_flow---of 22
rps_trigger_softirq---of 1
run_backlog_napi---of 3
set_rps_cpu---of 19
skb_checksum_help---of 17
skb_crc32c_csum_help---of 10
skb_csum_hwoffload_help---of 8
skb_dst_force40%of 15
skb_header_pointer---of 4
skb_network_protocol29%of 28
skb_warn_bad_offload---of 6
synchronize_net---of 3
tc_run---of 19
tcx_dec---of 1
tcx_inc---of 1
trace_kfree_skb---of 15
trace_napi_poll---of 15
trace_netif_rx_entry27%of 15
trace_netif_rx_exit27%of 15
trace_xdp_exception---of 15
trigger_rx_softirq---of 1
unlist_netdevice---of 15
unregister_netdev---of 1
unregister_netdevice_many---of 1
unregister_netdevice_many_notify---of 93
unregister_netdevice_notifier---of 10
unregister_netdevice_notifier_dev_net---of 9
unregister_netdevice_notifier_net---of 7
unregister_netdevice_queue---of 10
validate_xmit_skb29%of 53
validate_xmit_skb_list---of 7
write_seqlock---of 1
write_sequnlock---of 1
-----------
SUMMARY18%of 882

-----------
SUMMARY---of 0

__skb_pull---of 3
__tcp_md5_do_add---of 11
__tcp_md5_do_lookup---of 39
__tcp_v4_send_check---of 5
bpf_iter_fini_tcp---of 1
bpf_iter_init_tcp---of 6
bpf_iter_tcp_batch---of 44
bpf_iter_tcp_get_func_proto---of 1
bpf_iter_tcp_seq_next---of 5
bpf_iter_tcp_seq_show---of 14
bpf_iter_tcp_seq_start---of 3
bpf_iter_tcp_seq_stop---of 7
dst_check---of 5
established_get_first---of 14
established_get_next---of 10
inet_iif---of 8
inet_sk_rx_dst_set---of 9
ip_route_newports---of 3
l3mdev_master_ifindex_by_index---of 14
listening_get_first---of 14
listening_get_next---of 10
lockdep_sock_is_held---of 3
nf_reset_ct---of 5
rcu_read_unlock---of 6
reqsk_put---of 7
sk_drops_add---of 1
sk_rst_convert_drop_reason---of 10
skb_dst_set_noref---of 4
sock_owned_by_user---of 5
sock_put---of 4
tcp4_proc_exit---of 1
tcp4_proc_exit_net---of 1
tcp4_proc_init_net---of 1
tcp4_seq_show---of 14
tcp_add_backlog---of 55
tcp_checksum_complete---of 6
tcp_clear_md5_list---of 5
tcp_filter---of 1
tcp_get_idx---of 8
tcp_ld_RTO_revert---of 18
tcp_md5_do_add---of 18
tcp_md5_do_del---of 4
tcp_md5_do_lookup_exact---of 18
tcp_md5_key_copy---of 20
tcp_md5sig_info_free_rcu---of 1
tcp_req_err---of 4
tcp_seek_last_pos---of 16
tcp_segs_in---of 5
tcp_seq_next---of 6
tcp_seq_start---of 7
tcp_seq_stop---of 6
tcp_sk_exit---of 4
tcp_sk_exit_batch---of 7
tcp_sk_init---of 14
tcp_stream_memory_free80%of 5
tcp_twsk_unique---of 28
tcp_v4_conn_request---of 7
tcp_v4_connect---of 40
tcp_v4_destroy_sock---of 27
tcp_v4_do_rcv21%of 48
tcp_v4_early_demux---of 29
tcp_v4_err---of 63
tcp_v4_fill_cb---of 3
tcp_v4_get_syncookie---of 13
tcp_v4_init_seq---of 4
tcp_v4_init_sock---of 1
tcp_v4_init_ts_off---of 1
tcp_v4_md5_hash_hdr---of 7
tcp_v4_md5_hash_skb---of 12
tcp_v4_md5_lookup---of 3
tcp_v4_mtu_reduced---of 21
tcp_v4_parse_md5_keys---of 25
tcp_v4_pre_connect---of 8
tcp_v4_rcv---of 128
tcp_v4_reqsk_destructor---of 1
tcp_v4_reqsk_send_ack---of 17
tcp_v4_route_req---of 7
tcp_v4_send_ack---of 38
tcp_v4_send_check60%of 5
tcp_v4_send_reset---of 120
tcp_v4_send_synack---of 44
tcp_v4_syn_recv_sock---of 36
tcp_v4_timewait_ack---of 6
trace_tcp_bad_csum---of 15
xfrm4_policy_check---of 24
-----------
SUMMARY30%of 58

-----------
SUMMARY---of 0

__cpu_to_node50%of 4
__node_distance---of 3
cpumask_of_node---of 4
debug_cpumask_set_cpu---of 8
early_cpu_to_node---of 7
memory_add_physaddr_to_nid---of 12
numa_clear_node---of 1
numa_cpu_node---of 7
numa_set_node---of 8
phys_to_target_node---of 21
-----------
SUMMARY50%of 4

-----------
SUMMARY---of 0

__pppoe_xmit---of 11
__set_item---of 8
delete_item---of 8
get_item---of 10
get_item_by_addr---of 13
pppoe_connect---of 29
pppoe_create---of 3
pppoe_device_event---of 25
pppoe_disc_rcv---of 18
pppoe_exit_net---of 1
pppoe_fill_forward_path---of 4
pppoe_getname100%of 1
pppoe_init_net---of 1
pppoe_ioctl---of 17
pppoe_pernet---of 16
pppoe_rcv---of 20
pppoe_rcv_core---of 12
pppoe_recvmsg---of 5
pppoe_release---of 16
pppoe_sendmsg---of 11
pppoe_seq_next---of 21
pppoe_seq_show---of 3
pppoe_seq_start---of 12
pppoe_seq_stop---of 1
pppoe_unbind_sock_work---of 7
pppoe_xmit---of 1
sock_put---of 4
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

__sctp_packet_append_chunk20%of 31
sctp_compute_cksum---of 1
sctp_csum_combine---of 1
sctp_csum_update---of 1
sctp_packet_append_chunk16%of 39
sctp_packet_config36%of 54
sctp_packet_free38%of 8
sctp_packet_init67%of 3
sctp_packet_transmit24%of 93
sctp_packet_transmit_chunk38%of 8
-----------
SUMMARY26%of 236

-----------
SUMMARY---of 0

__put_net---of 3
cleanup_net---of 50
copy_net_ns10%of 22
free_exit_list---of 13
get_net_ns---of 7
get_net_ns_by_fd---of 9
get_net_ns_by_id---of 19
get_net_ns_by_pid---of 16
net_defaults_init_net---of 1
net_drop_ns---of 5
net_eq_idr---of 1
net_ns_barrier---of 1
net_ns_get_ownership---of 6
net_ns_net_exit---of 1
net_ns_net_init---of 1
netns_get40%of 5
netns_install30%of 10
netns_owner---of 1
netns_put40%of 5
ops_free_list---of 21
ops_init---of 23
peernet2id---of 11
peernet2id_alloc---of 16
peernet_has_id---of 1
register_pernet_device---of 4
register_pernet_operations---of 19
register_pernet_subsys---of 1
rtnl_net_dumpid---of 34
rtnl_net_dumpid_one---of 6
rtnl_net_fill---of 10
rtnl_net_getid---of 39
rtnl_net_newid---of 22
rtnl_net_notifyid---of 6
setup_net---of 38
unregister_pernet_device---of 3
unregister_pernet_operations---of 10
unregister_pernet_subsys---of 1
-----------
SUMMARY22%of 42

-----------
SUMMARY---of 0

alloc_shrinker_info---of 15
do_shrink_slab---of 76
free_shrinker_info---of 12
reparent_shrinker_deferred---of 23
set_shrinker_bit25%of 20
shrink_slab---of 97
shrinker_alloc---of 37
shrinker_free---of 14
shrinker_free_rcu_cb---of 1
shrinker_register---of 4
-----------
SUMMARY25%of 20

-----------
SUMMARY---of 0

__close_range---of 45
__f_unlock_pos---of 1
__fdget80%of 5
__fdget_pos30%of 10
__fdget_raw---of 4
__fget_files24%of 21
__free_fdtable---of 1
__get_unused_fd_flags---of 1
__ia32_sys_dup---of 4
__ia32_sys_dup2---of 1
__ia32_sys_dup3---of 1
__put_unused_fd34%of 9
__se_sys_dup2---of 23
__x64_sys_dup---of 4
__x64_sys_dup2---of 1
__x64_sys_dup3---of 1
alloc_fd41%of 22
alloc_fdtable45%of 9
close_fd---of 3
do_close_on_exec---of 14
do_dup2---of 20
dup_fd33%of 34
exit_files---of 3
expand_files12%of 27
f_dupfd---of 6
fd_install18%of 28
fget---of 1
fget_raw---of 1
fget_task---of 3
file_close_fd100%of 1
file_close_fd_locked31%of 13
free_fdtable_rcu---of 1
get_close_on_exec---of 17
get_file_active---of 17
get_file_rcu---of 9
get_unused_fd_flags100%of 1
iterate_fd---of 20
ksys_dup3---of 12
lookup_fdget_rcu---of 10
put_files_struct16%of 13
put_unused_fd---of 1
rcu_read_unlock_sched---of 8
receive_fd---of 9
receive_fd_replace---of 4
replace_fd---of 6
set_close_on_exec---of 10
task_lookup_fdget_rcu---of 11
task_lookup_next_fdget_rcu---of 19
-----------
SUMMARY29%of 193

-----------
SUMMARY---of 0

__destroy_inode17%of 31
__iget---of 1
__insert_inode_hash---of 5
__remove_inode_hash---of 4
address_space_init_once---of 1
atime_needs_update35%of 20
bmap---of 3
clear_inode29%of 7
clear_nlink---of 3
current_time43%of 7
dentry_needs_remove_privs---of 4
destroy_inode---of 5
discard_new_inode---of 6
drop_nlink---of 5
dump_mapping---of 11
evict35%of 23
evict_inodes---of 23
file_modified---of 1
file_modified_flags---of 18
file_remove_privs---of 1
file_remove_privs_flags---of 15
file_update_time---of 15
find_inode---of 10
find_inode_by_ino_rcu---of 15
find_inode_fast20%of 10
find_inode_nowait---of 10
find_inode_rcu---of 15
free_inode_nonrcu---of 1
generic_delete_inode---of 1
generic_update_time---of 3
get_next_ino56%of 9
get_nr_dirty_inodes---of 9
i_callback---of 3
iget5_locked---of 12
iget_locked18%of 29
igrab---of 3
ihold67%of 3
ilookup---of 9
ilookup5---of 8
ilookup5_nowait---of 3
in_group_or_capable---of 3
inc_nlink---of 4
init_once---of 1
init_special_inode---of 6
inode_add_lru---of 9
inode_dio_wait---of 6
inode_init_always60%of 5
inode_init_once---of 1
inode_init_owner---of 4
inode_insert5---of 16
inode_lru_isolate---of 17
inode_needs_sync---of 6
inode_nohighmem---of 1
inode_owner_or_capable---of 5
inode_sb_list_add---of 3
inode_set_ctime_current100%of 1
inode_set_flags---of 5
inode_update_time---of 4
inode_update_timestamps---of 13
insert_inode_locked---of 15
insert_inode_locked4---of 3
invalidate_inodes---of 23
iput14%of 43
iunique---of 21
kiocb_modified---of 1
lock_two_nondirectories---of 11
lockdep_annotate_inode_mutex_key---of 4
mode_strip_sgid29%of 7
new_inode50%of 4
new_inode_pseudo40%of 10
no_open---of 1
proc_nr_inodes---of 9
prune_icache_sb---of 6
set_nlink40%of 5
timestamp_truncate---of 7
touch_atime35%of 26
unlock_new_inode50%of 6
unlock_two_nondirectories---of 7
wait_on_inode---of 3
-----------
SUMMARY29%of 246

-----------
SUMMARY---of 0

__kernfs_create_file59%of 12
kernfs_drain_open_files---of 17
kernfs_fop_llseek---of 8
kernfs_fop_mmap---of 17
kernfs_fop_open---of 46
kernfs_fop_poll---of 6
kernfs_fop_read_iter---of 24
kernfs_fop_release---of 3
kernfs_fop_write_iter---of 21
kernfs_generic_poll---of 9
kernfs_notify---of 21
kernfs_notify_workfn---of 14
kernfs_release_file---of 10
kernfs_seq_next---of 13
kernfs_seq_show---of 6
kernfs_seq_start---of 14
kernfs_seq_stop---of 8
kernfs_should_drain_open_files---of 21
kernfs_unlink_open_file---of 19
kernfs_vma_access---of 5
kernfs_vma_fault---of 5
kernfs_vma_open---of 5
kernfs_vma_page_mkwrite---of 5
of_on---of 6
-----------
SUMMARY59%of 12

dst_check---of 5
inet6_sk_rx_dst_set---of 9
ip6_dst_store60%of 5
ip6_sk_accept_pmtu---of 5
l3mdev_master_ifindex_by_index---of 14
nf_reset_ct---of 5
rcu_read_unlock---of 6
reqsk_put---of 7
rt6_get_cookie24%of 25
sk_dev_equal_l3scope---of 3
sk_drops_add---of 1
sk_rst_convert_drop_reason---of 10
skb_clone_and_charge_r---of 12
skb_dst_set_noref---of 4
sock_owned_by_user---of 5
sock_put---of 4
tcp6_proc_exit---of 1
tcp6_proc_init---of 1
tcp6_seq_show---of 14
tcp_checksum_complete---of 6
tcp_segs_in---of 5
tcp_v6_conn_request---of 9
tcp_v6_connect22%of 51
tcp_v6_do_rcv9%of 71
tcp_v6_early_demux---of 28
tcp_v6_err---of 58
tcp_v6_fill_cb---of 3
tcp_v6_get_syncookie---of 13
tcp_v6_init_seq---of 4
tcp_v6_init_sock---of 1
tcp_v6_init_ts_off---of 1
tcp_v6_md5_hash_skb---of 12
tcp_v6_md5_lookup---of 3
tcp_v6_mtu_reduced---of 13
tcp_v6_parse_md5_keys---of 26
tcp_v6_pre_connect---of 8
tcp_v6_rcv---of 108
tcp_v6_reqsk_destructor---of 1
tcp_v6_reqsk_send_ack---of 11
tcp_v6_route_req---of 14
tcp_v6_send_check60%of 5
tcp_v6_send_reset---of 74
tcp_v6_send_response---of 74
tcp_v6_send_synack---of 50
tcp_v6_syn_recv_sock---of 52
tcp_v6_timewait_ack---of 6
tcpv6_exit---of 1
tcpv6_net_exit---of 3
tcpv6_net_init---of 3
trace_tcp_bad_csum---of 15
xfrm6_policy_check---of 33
-----------
SUMMARY19%of 157

-----------
SUMMARY---of 0

__nf_tables_commit_chain_free_rules---of 1
__nf_tables_dump_rules---of 17
__nft_obj_notify---of 8
__nft_reg_track_cancel---of 3
__nft_reg_track_clobber---of 8
__nft_release_basechain---of 24
__nft_release_hook---of 28
__nft_release_table---of 48
__nft_trans_set_add---of 8
jhash---of 17
lockdep_commit_lock_is_held---of 1
nf_jiffies64_to_msecs---of 1
nf_msecs_to_jiffies64---of 3
nf_tables_abort---of 264
nf_tables_activate_set---of 6
nf_tables_bind_chain---of 13
nf_tables_bind_check_setelem---of 8
nf_tables_bind_set---of 37
nf_tables_chain_destroy---of 20
nf_tables_chain_notify---of 8
nf_tables_check_loops---of 36
nf_tables_commit---of 303
nf_tables_deactivate_flowtable---of 8
nf_tables_deactivate_set---of 35
nf_tables_delchain---of 49
nf_tables_delflowtable---of 66
nf_tables_delobj---of 39
nf_tables_delrule---of 60
nf_tables_delset---of 42
nf_tables_delsetelem---of 72
nf_tables_deltable---of 50
nf_tables_destroy_set---of 4
nf_tables_dump_chains---of 30
nf_tables_dump_flowtable---of 33
nf_tables_dump_flowtable_done---of 3
nf_tables_dump_flowtable_start---of 5
nf_tables_dump_obj---of 39
nf_tables_dump_obj_done---of 1
nf_tables_dump_obj_start---of 8
nf_tables_dump_rules---of 30
nf_tables_dump_rules_done---of 1
nf_tables_dump_rules_start---of 7
nf_tables_dump_set---of 46
nf_tables_dump_set_done---of 1
nf_tables_dump_set_start---of 1
nf_tables_dump_setelem---of 8
nf_tables_dump_sets---of 31
nf_tables_dump_sets_done---of 1
nf_tables_dump_sets_start---of 3
nf_tables_dump_tables---of 27
nf_tables_dumpreset_rules---of 1
nf_tables_dumpreset_rules_start---of 7
nf_tables_dumpreset_set---of 4
nf_tables_exit_batch---of 1
nf_tables_exit_net---of 17
nf_tables_expr_parse---of 40
nf_tables_fill_chain_info---of 28
nf_tables_fill_expr_info---of 6
nf_tables_fill_flowtable_info---of 18
nf_tables_fill_gen_info---of 10
nf_tables_fill_obj_info---of 15
nf_tables_fill_rule_info---of 23
nf_tables_fill_set---of 50
nf_tables_fill_set_concat---of 9
nf_tables_fill_setelem---of 48
nf_tables_fill_setelem_info---of 11
nf_tables_fill_table_info---of 16
nf_tables_flowtable_destroy---of 6
nf_tables_flowtable_event---of 14
nf_tables_flowtable_notify---of 8
nf_tables_getchain---of 19
nf_tables_getflowtable---of 26
nf_tables_getgen---of 4
nf_tables_getobj---of 24
nf_tables_getrule---of 4
nf_tables_getrule_reset---of 16
nf_tables_getrule_single---of 24
nf_tables_getset---of 31
nf_tables_getsetelem---of 12
nf_tables_getsetelem_reset---of 35
nf_tables_gettable---of 17
nf_tables_init_net---of 1
nf_tables_loop_check_setelem---of 8
nf_tables_module_autoload_cleanup---of 10
nf_tables_newchain---of 110
nf_tables_newflowtable---of 71
nf_tables_newobj---of 48
nf_tables_newrule---of 118
nf_tables_newset---of 97
nf_tables_newsetelem---of 202
nf_tables_newtable---of 69
nf_tables_parse_netdev_hooks---of 24
nf_tables_pre_exit_net---of 6
nf_tables_register_hook---of 14
nf_tables_rule_destroy---of 7
nf_tables_rule_notify---of 12
nf_tables_rule_release---of 13
nf_tables_set_alloc_name---of 22
nf_tables_set_desc_parse---of 23
nf_tables_set_elem_destroy---of 9
nf_tables_set_notify---of 8
nf_tables_setelem_notify---of 8
nf_tables_table_enable---of 9
nf_tables_table_notify---of 8
nf_tables_trans_destroy_flush_work---of 1
nf_tables_trans_destroy_work---of 65
nf_tables_unbind_chain---of 10
nf_tables_updchain---of 80
nf_tables_updobj---of 5
nf_tables_valid_genid---of 4
nf_tables_validate---of 13
nft_chain_add---of 4
nft_chain_del---of 5
nft_chain_hash---of 1
nft_chain_hash_cmp---of 1
nft_chain_hash_obj---of 1
nft_chain_lookup---of 19
nft_chain_parse_hook---of 79
nft_chain_release_hook---of 6
nft_chain_validate---of 15
nft_chain_validate_dependency---of 4
nft_chain_validate_hooks---of 4
nft_data_dump---of 5
nft_data_hold---of 5
nft_data_init---of 39
nft_data_release---of 6
nft_delchain---of 5
nft_delchain_hook---of 28
nft_delflowtable---of 5
nft_delrule---of 20
nft_delset---of 6
nft_dump_basechain_hook---of 15
nft_dump_register---of 1
nft_dump_stats---of 10
nft_expr_clone---of 4
nft_expr_destroy---of 3
nft_expr_dump---of 4
nft_expr_inner_parse---of 13
nft_flowtable_lookup---of 7
nft_flowtable_parse_hook---of 20
nft_flowtable_type_get---of 26
nft_flush_table---of 40
nft_get_set_elem---of 31
nft_hooks_destroy---of 8
nft_map_activate---of 14
nft_map_deactivate---of 6
nft_mapelem_activate---of 11
nft_mapelem_deactivate---of 3
nft_netdev_register_hooks---of 7
nft_netlink_dump_start_rcu---of 14
nft_obj_init---of 11
nft_obj_lookup---of 24
nft_obj_notify---of 3
nft_obj_type_get---of 29
nft_object_dump---of 4
nft_objname_hash---of 1
nft_objname_hash_cmp---of 3
nft_objname_hash_obj---of 1
nft_parse_register_load---of 25
nft_parse_register_store---of 33
nft_parse_u32_check---of 3
nft_pernet---of 16
nft_rcv_nl_event9%of 23
nft_reg_track_cancel---of 6
nft_reg_track_update---of 6
nft_register_chain_type---of 8
nft_register_expr---of 6
nft_register_flowtable_net_hooks---of 24
nft_register_flowtable_type---of 3
nft_register_obj---of 4
nft_request_module---of 7
nft_rule_expr_activate---of 7
nft_rule_expr_deactivate---of 7
nft_rule_lookup_byid---of 8
nft_select_set_ops---of 22
nft_set_catchall_dump---of 14
nft_set_catchall_lookup---of 13
nft_set_catchall_validate---of 12
nft_set_destroy---of 31
nft_set_dump_ctx_init---of 24
nft_set_elem_destroy---of 19
nft_set_elem_expr_alloc---of 16
nft_set_elem_expr_clone---of 16
nft_set_elem_expr_setup---of 20
nft_set_elem_init---of 17
nft_set_expr_alloc---of 21
nft_set_is_same---of 19
nft_set_lookup---of 8
nft_set_lookup_global---of 17
nft_setelem_data_deactivate---of 11
nft_setelem_deactivate---of 6
nft_setelem_flush---of 4
nft_setelem_insert---of 7
nft_setelem_remove---of 10
nft_setelem_validate---of 9
nft_stats_alloc---of 13
nft_table_disable---of 18
nft_table_lookup---of 10
nft_table_validate---of 7
nft_trans_alloc---of 3
nft_trans_commit_list_add_tail---of 12
nft_trans_destroy---of 5
nft_trans_elem_alloc---of 3
nft_trans_flowtable_add---of 5
nft_trans_gc_alloc---of 15
nft_trans_gc_catchall_async---of 15
nft_trans_gc_catchall_sync---of 19
nft_trans_gc_destroy---of 7
nft_trans_gc_elem_add---of 3
nft_trans_gc_queue_async---of 4
nft_trans_gc_queue_async_done---of 4
nft_trans_gc_queue_sync---of 4
nft_trans_gc_queue_sync_done---of 5
nft_trans_gc_trans_free---of 18
nft_trans_gc_work---of 22
nft_trans_obj_add---of 5
nft_trans_rule_add---of 5
nft_trans_table_add---of 5
nft_unregister_chain_type---of 5
nft_unregister_expr---of 3
nft_unregister_flowtable_type---of 3
nft_unregister_obj---of 3
nft_validate_register_store---of 9
nft_verdict_dump---of 6
nla_put_string---of 1
rhltable_insert_key---of 82
rhltable_lookup---of 30
rhltable_remove---of 83
rht_lock---of 9
rht_unlock---of 10
-----------
SUMMARY9%of 23

-----------
SUMMARY---of 0

arch_do_signal_or_restart24%of 30
get_sigframe25%of 20
get_sigframe_size---of 1
sigaltstack_size_valid---of 10
signal_fault---of 4
-----------
SUMMARY24%of 50

-----------
SUMMARY---of 0

sort100%of 1
sort_r3%of 69
-----------
SUMMARY5%of 70

__frame_add_frag---of 9
cfg80211_calculate_bitrate---of 24
cfg80211_calculate_bitrate_eht---of 47
cfg80211_calculate_bitrate_he---of 19
cfg80211_calculate_bitrate_s1g---of 12
cfg80211_change_iface---of 77
cfg80211_check_combinations---of 3
cfg80211_classify8021d---of 61
cfg80211_does_bw_fit_range---of 4
cfg80211_free_nan_func---of 8
cfg80211_get_iftype_ext_capa---of 5
cfg80211_get_p2p_attr---of 24
cfg80211_get_station---of 32
cfg80211_iftype_allowed---of 8
cfg80211_iter_combinations---of 69
cfg80211_iter_sum_ifcombs---of 1
cfg80211_process_rdev_events---of 7
cfg80211_process_wdev_events---of 14
cfg80211_remove_link---of 42
cfg80211_remove_links---of 8
cfg80211_remove_virtual_intf---of 22
cfg80211_send_layer2_update---of 3
cfg80211_sinfo_alloc_tid_stats---of 4
cfg80211_supported_cipher_suite---of 6
cfg80211_upload_connect_keys---of 38
cfg80211_valid_key_idx---of 10
cfg80211_validate_beacon_int---of 1
cfg80211_validate_key_settings---of 82
ieee80211_amsdu_to_8023s---of 46
ieee80211_bss_get_elem---of 8
ieee80211_chandef_to_operating_class---of 23
ieee80211_channel_to_freq_khz19%of 16
ieee80211_data_to_8023_exthdr---of 45
ieee80211_fragment_element---of 5
ieee80211_freq_khz_to_channel---of 9
ieee80211_get_8023_tunnel_proto---of 7
ieee80211_get_channel_khz7%of 31
ieee80211_get_hdrlen_from_skb---of 7
ieee80211_get_mesh_hdrlen---of 1
ieee80211_get_num_supported_channels---of 13
ieee80211_get_ratemask---of 9
ieee80211_get_response_rate---of 8
ieee80211_get_vht_max_nss---of 29
ieee80211_hdrlen34%of 6
ieee80211_ie_split_ric---of 39
ieee80211_is_valid_amsdu---of 10
ieee80211_mandatory_rates---of 8
ieee80211_operating_class_to_band7%of 30
ieee80211_operating_class_to_chandef7%of 31
ieee80211_s1g_channel_width---of 9
ieee80211_set_bitrate_flags---of 36
ieee80211_strip_8023_mesh_hdr---of 21
pskb_may_pull---of 4
trace_rdev_return_int---of 15
-----------
SUMMARY10%of 114

__bpf_trace_alloc_vmap_area---of 1
__bpf_trace_free_vmap_area_noflush---of 1
__bpf_trace_purge_vmap_area_lazy---of 1
__get_vm_area_caller---of 1
__get_vm_area_node34%of 9
__probestub_alloc_vmap_area---of 1
__probestub_free_vmap_area_noflush---of 1
__probestub_purge_vmap_area_lazy---of 1
__purge_vmap_area_lazy---of 37
__traceiter_alloc_vmap_area---of 4
__traceiter_free_vmap_area_noflush---of 4
__traceiter_purge_vmap_area_lazy---of 4
__vmalloc_node_noprof---of 1
__vmalloc_node_range_noprof21%of 64
__vmalloc_noprof100%of 1
__vmap_pages_range_noflush19%of 43
__vunmap_range_noflush35%of 23
_vm_unmap_aliases---of 30
aligned_vread_iter---of 8
alloc_vmap_area29%of 127
check_sparse_vm_area---of 7
decay_va_pool_node---of 42
delayed_vfree_work---of 4
drain_vmap_area_work---of 1
find_unlink_vmap_area42%of 12
find_vm_area---of 11
find_vmap_area46%of 11
find_vmap_area_exceed_addr_lock---of 23
free_unmap_vmap_area---of 1
free_vm_area---of 3
free_vmap_area---of 66
free_vmap_area_noflush30%of 24
free_vmap_area_rb_augment_cb_rotate---of 5
free_vmap_block---of 9
get_vm_area---of 1
get_vm_area_caller---of 1
insert_vmap_area58%of 14
insert_vmap_area_augment---of 25
ioremap_page_range---of 19
is_vmalloc_addr100%of 1
is_vmalloc_or_module_addr100%of 1
mod_memcg_page_state31%of 36
pcpu_free_vm_areas---of 5
pcpu_get_vm_areas---of 260
perf_trace_alloc_vmap_area---of 8
perf_trace_free_vmap_area_noflush---of 8
perf_trace_purge_vmap_area_lazy---of 8
pfn_valid21%of 29
purge_fragmented_block---of 6
purge_vmap_node---of 19
reclaim_and_purge_vmap_areas---of 25
reclaim_list_global---of 67
register_vmap_purge_notifier---of 1
remap_vmalloc_range---of 1
remap_vmalloc_range_partial---of 19
remove_vm_area40%of 5
trace_event_raw_event_alloc_vmap_area---of 7
trace_event_raw_event_free_vmap_area_noflush---of 7
trace_event_raw_event_purge_vmap_area_lazy---of 7
trace_raw_output_alloc_vmap_area---of 3
trace_raw_output_free_vmap_area_noflush---of 3
trace_raw_output_purge_vmap_area_lazy---of 3
unregister_vmap_purge_notifier---of 1
vfree28%of 11
vfree_atomic---of 5
vm_area_map_pages---of 3
vm_area_unmap_pages---of 3
vm_flags_set---of 6
vm_map_ram---of 42
vm_reset_perms---of 18
vm_unmap_aliases---of 1
vm_unmap_ram---of 16
vmalloc_32_noprof---of 1
vmalloc_32_user_noprof---of 1
vmalloc_dump_obj---of 10
vmalloc_huge_noprof---of 1
vmalloc_info_show---of 50
vmalloc_node_noprof---of 1
vmalloc_noprof100%of 1
vmalloc_nr_pages---of 1
vmalloc_to_page17%of 12
vmalloc_to_pfn---of 1
vmalloc_user_noprof---of 1
vmap---of 9
vmap_node_shrink_count---of 6
vmap_node_shrink_scan---of 4
vmap_page_range---of 1
vmap_pages_range_noflush---of 1
vmap_pfn---of 5
vmap_pfn_apply---of 8
vmap_range_noflush---of 44
vread_iter---of 50
vunmap---of 5
vunmap_range---of 1
vunmap_range_noflush---of 1
vzalloc_node_noprof---of 1
vzalloc_noprof100%of 1
-----------
SUMMARY29%of 425

ima_add_violation---of 6
ima_alloc_init_template---of 13
ima_audit_measurement---of 10
ima_collect_measurement---of 26
ima_d_path---of 5
ima_free_template_entry---of 4
ima_get_action100%of 1
ima_store_measurement---of 17
ima_store_template---of 4
-----------
SUMMARY100%of 1

crypto_alloc_skcipher---of 1
crypto_alloc_sync_skcipher50%of 4
crypto_grab_skcipher---of 1
crypto_has_skcipher---of 1
crypto_register_skcipher---of 13
crypto_register_skciphers---of 20
crypto_skcipher_decrypt---of 4
crypto_skcipher_encrypt---of 4
crypto_skcipher_exit_tfm---of 1
crypto_skcipher_export---of 3
crypto_skcipher_extsize67%of 3
crypto_skcipher_free_instance---of 1
crypto_skcipher_import---of 3
crypto_skcipher_init_tfm50%of 8
crypto_skcipher_report---of 1
crypto_skcipher_setkey---of 11
crypto_skcipher_show---of 1
crypto_unregister_skcipher---of 1
crypto_unregister_skciphers---of 4
skcipher_alloc_instance_simple---of 6
skcipher_done_slow---of 1
skcipher_exit_tfm_simple---of 1
skcipher_free_instance_simple---of 1
skcipher_init_tfm_simple---of 3
skcipher_map_dst---of 3
skcipher_next_copy---of 6
skcipher_next_slow---of 9
skcipher_noexport---of 1
skcipher_noimport---of 1
skcipher_prepare_alg_common---of 6
skcipher_register_instance---of 14
skcipher_setkey_simple---of 1
skcipher_walk_aead_common---of 18
skcipher_walk_aead_decrypt---of 1
skcipher_walk_aead_encrypt---of 1
skcipher_walk_async---of 1
skcipher_walk_complete---of 18
skcipher_walk_done---of 29
skcipher_walk_next---of 16
skcipher_walk_skcipher---of 8
skcipher_walk_virt---of 3
-----------
SUMMARY54%of 15

-----------
SUMMARY---of 0

__mac80211_hwsim_beacon_tx---of 19
append_radio_msg---of 28
get_hwsim_data_ref_from_addr---of 38
hw_roc_done---of 3
hw_roc_start---of 3
hw_scan_work---of 33
hwsim_cloned_frame_received_nl---of 32
hwsim_del_radio_nl---of 20
hwsim_dump_radio_nl---of 18
hwsim_exit_net---of 17
hwsim_fops_group_open---of 1
hwsim_fops_group_read---of 1
hwsim_fops_group_write---of 1
hwsim_fops_ps_open---of 1
hwsim_fops_ps_read---of 1
hwsim_fops_ps_write---of 10
hwsim_fops_rx_rssi_open---of 1
hwsim_fops_rx_rssi_read---of 1
hwsim_fops_rx_rssi_write---of 3
hwsim_get_chanwidth---of 16
hwsim_get_radio_nl---of 10
hwsim_init_net---of 1
hwsim_init_s1g_channels---of 3
hwsim_mon_setup---of 1
hwsim_mon_xmit---of 1
hwsim_new_radio_nl---of 83
hwsim_pmsr_report_nl---of 78
hwsim_register_received_nl---of 13
hwsim_send_nullfunc---of 20
hwsim_send_nullfunc_no_ps---of 1
hwsim_send_nullfunc_ps---of 1
hwsim_send_ps_poll---of 20
hwsim_simulate_radar_open---of 1
hwsim_tx_info_frame_received_nl---of 18
hwsim_tx_virtio---of 4
hwsim_unicast_netgroup---of 16
hwsim_virtio_probe---of 10
hwsim_virtio_remove---of 1
hwsim_virtio_rx_done---of 1
hwsim_virtio_rx_work---of 16
hwsim_virtio_tx_done---of 4
hwsim_write_simulate_radar---of 1
jhash---of 17
mac80211_hwsim_abort_pmsr---of 12
mac80211_hwsim_add_chanctx---of 3
mac80211_hwsim_add_interface---of 8
mac80211_hwsim_addr_iter---of 12
mac80211_hwsim_ampdu_action---of 9
mac80211_hwsim_assign_vif_chanctx---of 9
mac80211_hwsim_bcn_en_iter---of 3
mac80211_hwsim_beacon---of 4
mac80211_hwsim_beacon_tx---of 38
mac80211_hwsim_can_neg_ttlm---of 17
mac80211_hwsim_cancel_hw_scan---of 3
mac80211_hwsim_change_chanctx---of 5
mac80211_hwsim_change_interface---of 12
mac80211_hwsim_change_sta_links---of 5
mac80211_hwsim_change_vif_links---of 18
mac80211_hwsim_conf_tx---of 3
mac80211_hwsim_config---of 30
mac80211_hwsim_config_mac_nl---of 14
mac80211_hwsim_configure_filter---of 5
mac80211_hwsim_croc---of 3
mac80211_hwsim_del_radio---of 8
mac80211_hwsim_flush---of 1
mac80211_hwsim_free---of 8
mac80211_hwsim_get_et_sset_count---of 1
mac80211_hwsim_get_et_stats---of 1
mac80211_hwsim_get_et_strings---of 3
mac80211_hwsim_get_radio---of 15
mac80211_hwsim_get_survey---of 5
mac80211_hwsim_get_tsf---of 1
mac80211_hwsim_hw_scan---of 7
mac80211_hwsim_link_add_debugfs---of 3
mac80211_hwsim_link_info_changed---of 41
mac80211_hwsim_monitor_ack---of 4
mac80211_hwsim_monitor_rx---of 11
mac80211_hwsim_netlink_notify25%of 24
mac80211_hwsim_new_radio---of 137
mac80211_hwsim_parse_rate_info---of 26
mac80211_hwsim_remove_chanctx---of 5
mac80211_hwsim_remove_interface---of 10
mac80211_hwsim_roc---of 5
mac80211_hwsim_rx---of 21
mac80211_hwsim_send_pmsr_ftm_request_peer---of 30
mac80211_hwsim_send_pmsr_request---of 20
mac80211_hwsim_set_rts_threshold---of 1
mac80211_hwsim_set_tim---of 3
mac80211_hwsim_set_tsf---of 3
mac80211_hwsim_sta_add---of 6
mac80211_hwsim_sta_notify---of 5
mac80211_hwsim_sta_rc_update---of 55
mac80211_hwsim_sta_remove---of 3
mac80211_hwsim_sta_state---of 8
mac80211_hwsim_start---of 3
mac80211_hwsim_start_pmsr---of 11
mac80211_hwsim_stop---of 6
mac80211_hwsim_sw_scan---of 5
mac80211_hwsim_sw_scan_complete---of 3
mac80211_hwsim_switch_vif_chanctx---of 14
mac80211_hwsim_tx---of 123
mac80211_hwsim_tx_frame---of 6
mac80211_hwsim_tx_frame_nl---of 22
mac80211_hwsim_tx_frame_no_nl---of 67
mac80211_hwsim_tx_iter---of 18
mac80211_hwsim_tx_last_beacon---of 1
mac80211_hwsim_unassign_vif_chanctx---of 9
mac80211_hwsim_vendor_cmd_test---of 9
mac80211_hwsim_vif_info_changed---of 11
net_generic25%of 16
remove_vqs---of 7
rhashtable_remove_fast---of 64
rht_lock---of 9
rht_unlock---of 10
-----------
SUMMARY25%of 40

__ia32_compat_sys_ioctl---of 1
__ia32_sys_ioctl---of 1
__se_compat_sys_ioctl---of 34
__se_sys_ioctl38%of 8
__x64_compat_sys_ioctl---of 1
__x64_sys_ioctl100%of 1
compat_ptr_ioctl---of 3
copy_fsxattr_to_user---of 1
do_vfs_ioctl2%of 116
fiemap_fill_next_extent---of 5
fiemap_prep---of 8
fileattr_fill_flags---of 15
fileattr_fill_xflags---of 15
vfs_fileattr_get---of 3
vfs_fileattr_set---of 35
vfs_ioctl---of 3
-----------
SUMMARY5%of 125

-----------
SUMMARY---of 0

__nf_conntrack_alloc34%of 9
__nf_conntrack_confirm34%of 48
__nf_conntrack_find_get32%of 22
__nf_conntrack_insert_prepare30%of 10
__nf_ct_change_status---of 6
__nf_ct_change_timeout---of 4
__nf_ct_delete_from_lists---of 15
__nf_ct_refresh_acct47%of 13
__nf_ct_resolve_clash---of 46
early_drop---of 41
gc_worker---of 70
get_l4proto23%of 18
hash_conntrack50%of 4
init_conntrack20%of 65
kill_all---of 1
nf_conntrack_alloc---of 1
nf_conntrack_attach---of 5
nf_conntrack_cleanup_end---of 1
nf_conntrack_cleanup_net---of 3
nf_conntrack_cleanup_net_list---of 11
nf_conntrack_cleanup_start---of 1
nf_conntrack_double_lock50%of 10
nf_conntrack_double_unlock---of 3
nf_conntrack_event_cache50%of 8
nf_conntrack_find_get---of 23
nf_conntrack_free---of 21
nf_conntrack_get_tuple_skb---of 9
nf_conntrack_handle_icmp---of 6
nf_conntrack_hash_check_insert---of 34
nf_conntrack_hash_resize---of 29
nf_conntrack_in31%of 72
nf_conntrack_init_end---of 1
nf_conntrack_init_net---of 4
nf_conntrack_init_start---of 18
nf_conntrack_lock---of 3
nf_conntrack_set_closing---of 3
nf_conntrack_set_hashsize---of 5
nf_conntrack_tuple_taken16%of 44
nf_conntrack_update---of 58
nf_ct_acct_add---of 7
nf_ct_alloc_hashtable---of 5
nf_ct_change_status_common---of 8
nf_ct_delete---of 20
nf_ct_destroy---of 11
nf_ct_gc_expired---of 16
nf_ct_get_id---of 4
nf_ct_get_tuple39%of 26
nf_ct_get_tuplepr---of 3
nf_ct_invert_tuple43%of 7
nf_ct_iterate_cleanup---of 24
nf_ct_iterate_cleanup_net---of 3
nf_ct_iterate_destroy---of 6
nf_ct_key_equal34%of 18
nf_ct_kill_acct---of 7
nf_ct_pernet25%of 16
nf_ct_port_nlattr_to_tuple---of 7
nf_ct_port_nlattr_tuple_size---of 3
nf_ct_port_tuple_to_nlattr---of 4
nf_ct_put---of 5
nf_ct_resolve_clash---of 8
nf_ct_tmpl_alloc---of 6
nf_ct_tmpl_free---of 1
seqcount_lockdep_reader_access58%of 7
-----------
SUMMARY30%of 397

__btf_type_is_scalar_struct---of 15
__check_mem_access---of 12
__check_ptr_off_reg---of 6
__check_reg_arg34%of 12
__mark_chain_precision---of 384
__mark_reg_known---of 1
__mark_reg_not_init---of 1
__mark_reg_unknown---of 1
__process_kf_arg_ptr_to_graph_node---of 41
__process_kf_arg_ptr_to_graph_root---of 30
__reg_deduce_bounds---of 25
acquire_reference_state---of 8
add_subprog43%of 7
adjust_ptr_min_max_vals---of 58
adjust_reg_min_max_vals---of 126
bpf_check12%of 691
bpf_check_attach_target---of 80
bpf_free_kfunc_btf_tab---of 9
bpf_get_btf_vmlinux---of 1
bpf_get_kfunc_addr---of 3
bpf_jit_find_kfunc_model---of 1
bpf_map_direct_read---of 7
bpf_patch_insn_data---of 27
bpf_prog_has_kfunc_call---of 1
bt_clear_frame_slot---of 5
bt_empty---of 5
bt_set_reg---of 5
bt_subprog_enter---of 4
bt_subprog_exit---of 4
btf_check_subprog_call---of 34
btf_field_type_name---of 13
btf_id_cmp_func---of 1
btf_type_is_struct_ptr---of 3
btf_type_name---of 1
check_attach_btf_id5%of 47
check_bpf_snprintf_call---of 5
check_btf_info3%of 72
check_buffer_access---of 6
check_cfg18%of 58
check_cond_jmp_op---of 180
check_css_task_iter_allowlist---of 10
check_ctx_access---of 7
check_func_arg_reg_off---of 23
check_func_call---of 20
check_get_func_ip---of 7
check_helper_call---of 331
check_helper_mem_access---of 38
check_ids---of 10
check_kfunc_call---of 334
check_kfunc_mem_size_reg---of 27
check_ld_abs---of 24
check_ld_imm16%of 25
check_map_access---of 21
check_map_func_compatibility---of 163
check_map_kptr_access---of 39
check_map_prog_compatibility---of 60
check_mem_access---of 105
check_mem_reg---of 20
check_mem_region_access---of 13
check_mem_size_reg---of 6
check_non_sleepable_error_inject---of 1
check_packet_access---of 7
check_pseudo_btf_id---of 36
check_ptr_alignment---of 20
check_ptr_to_btf_access---of 38
check_ptr_to_map_access---of 12
check_reference_leak29%of 14
check_reg_arg---of 3
check_reg_const_str---of 11
check_reg_sane_offset---of 6
check_reg_type---of 62
check_return_code15%of 68
check_scalar_ids---of 14
check_sock_access---of 13
check_stack_access_within_bounds---of 21
check_stack_range_initialized---of 65
check_stack_read---of 94
check_stack_write---of 157
check_subprogs35%of 26
check_tp_buffer_access---of 6
clear_all_pkt_pointers---of 28
clear_caller_saved_regs---of 20
cmp_subprogs---of 1
coerce_reg_to_size---of 5
coerce_reg_to_size_sx---of 10
coerce_subreg_to_size_sx---of 7
convert_pseudo_ld_imm64---of 15
copy_from_bpfptr_offset---of 4
copy_to_bpfptr_offset---of 3
copy_verifier_state---of 31
destroy_if_dynptr_stack_slot---of 36
disasm_kfunc_name---of 4
do_check4%of 650
do_check_common21%of 62
do_check_subprogs11%of 19
do_refine_retval_range---of 9
dynptr_get_type---of 6
dynptr_id---of 6
dynptr_ref_obj_id---of 6
fetch_kfunc_meta---of 9
find_equal_scalars---of 24
find_good_pkt_pointers---of 26
find_kfunc_desc_btf---of 14
find_prev_entry---of 17
find_subprog---of 3
free_verifier_state50%of 10
get_dynptr_arg_reg---of 20
helper_multiple_ref_obj_use---of 20
in_rbtree_lock_required_cb---of 6
in_rcu_cs---of 6
insn_has_def32---of 19
invalidate_dynptr---of 1
invalidate_non_owning_refs---of 24
is_acquire_function---of 10
is_arena_reg---of 3
is_flow_key_reg---of 3
is_kfunc_arg_const_mem_size---of 6
is_kfunc_arg_mem_size---of 6
is_kfunc_arg_rbtree_node---of 5
is_kfunc_arg_rbtree_root---of 5
is_kfunc_arg_wq---of 5
is_pkt_reg---of 3
is_pointer_value---of 5
is_ptr_cast_function---of 10
is_reg6419%of 16
is_sk_reg---of 8
is_trusted_reg---of 9
kfunc_btf_cmp_by_off---of 1
kfunc_desc_cmp_by_id_off---of 3
kfunc_desc_cmp_by_imm_off---of 4
map_kptr_match_type---of 8
map_set_for_each_callback_args---of 1
mark_all_scalars_precise---of 22
mark_btf_func_reg_size---of 27
mark_btf_ld_reg---of 4
mark_chain_precision---of 1
mark_ptr_not_null_reg---of 12
mark_ptr_or_null_reg---of 13
mark_ptr_or_null_regs---of 29
mark_reg_invalid---of 3
mark_reg_known_zero---of 3
mark_reg_not_init67%of 3
mark_reg_read19%of 11
mark_reg_stack_read---of 15
mark_reg_unknown67%of 3
may_update_sockmap---of 16
prepare_func_exit---of 37
process_dynptr_func---of 84
process_iter_arg---of 80
process_iter_next_call---of 34
process_kptr_func---of 11
process_spin_lock---of 18
process_timer_func---of 10
process_wq_func---of 5
push_callback_call---of 45
push_insn32%of 16
push_jmp_history---of 6
push_stack---of 12
range_within---of 9
record_func_key---of 12
record_func_map---of 22
ref_convert_owning_non_owning---of 42
ref_set_non_owning---of 9
reg_bounds_sanity_check---of 16
reg_bounds_sync---of 1
reg_btf_record---of 5
reg_may_point_to_spin_lock---of 7
reg_set_min_max---of 20
regs_exact---of 23
regs_refine_cond_op---of 57
regsafe---of 35
release_reference---of 34
release_reference_state---of 9
resolve_pseudo_ldimm648%of 79
sanitize_check_bounds---of 8
sanitize_err---of 7
sanitize_ptr_alu---of 25
sanitize_speculative_path---of 6
save_aux_ptr_type---of 20
save_register_state---of 16
scalar_reg_for_stack---of 35
set_callee_state---of 1
set_find_vma_callback_state---of 1
set_loop_callback_state---of 1
set_map_elem_callback_state---of 5
set_rbtree_add_callback_state---of 20
set_timer_callback_state---of 1
set_user_ringbuf_callback_state---of 1
setup_func_entry---of 17
specialize_kfunc---of 15
stack_slot_obj_get_spi---of 8
states_equal---of 99
try_match_pkt_pointers---of 107
unmark_stack_slots_dynptr---of 14
unmark_stack_slots_iter---of 14
update_loop_inline_state---of 12
verbose50%of 4
verbose_invalid_scalar---of 5
verifier_remove_insns---of 60
widen_imprecise_scalars---of 24
zext_32_to_64---of 4
-----------
SUMMARY11%of 1893

__bad_area_nosemaphore27%of 15
__bpf_trace_x86_exceptions---of 1
__probestub_page_fault_kernel---of 1
__probestub_page_fault_user---of 1
__traceiter_page_fault_kernel---of 4
__traceiter_page_fault_user---of 4
bad_area_access_error---of 21
bad_area_nosemaphore100%of 1
do_kern_addr_fault60%of 5
do_sigbus---of 5
dump_pagetable---of 10
fatal_signal_pending---of 3
fault_in_kernel_space---of 1
is_errata93---of 8
is_prefetch8%of 25
kernelmode_fixup_or_oops---of 6
page_fault_oops---of 33
perf_trace_x86_exceptions---of 8
pgtable_bad---of 1
show_ldttss---of 5
spurious_kernel_fault6%of 38
spurious_kernel_fault_check---of 11
trace_event_raw_event_x86_exceptions---of 7
trace_raw_output_x86_exceptions---of 3
vma_end_read28%of 11
-----------
SUMMARY16%of 95

-----------
SUMMARY---of 0

sctp_chunk_lookup_strreset_param---of 12
sctp_process_strreset_addstrm_in---of 15
sctp_process_strreset_addstrm_out---of 15
sctp_process_strreset_inreq---of 27
sctp_process_strreset_outreq---of 22
sctp_process_strreset_resp---of 56
sctp_process_strreset_tsnreq---of 16
sctp_send_add_streams---of 13
sctp_send_reconf---of 3
sctp_send_reset_assoc---of 13
sctp_send_reset_streams---of 41
sctp_stream_alloc_out---of 4
sctp_stream_clear---of 7
sctp_stream_free75%of 4
sctp_stream_free_ext67%of 3
sctp_stream_init59%of 12
sctp_stream_init_ext---of 4
sctp_stream_outq_is_empty---of 8
sctp_stream_outq_migrate27%of 15
sctp_stream_update---of 4
-----------
SUMMARY48%of 34

-----------
SUMMARY---of 0

bin2hex58%of 7
hex2bin---of 6
hex_dump_to_buffer---of 34
hex_to_bin---of 1
print_hex_dump---of 7
-----------
SUMMARY58%of 7

-----------
SUMMARY---of 0

__anon_inode_getfile---of 13
anon_inode_create_getfd---of 4
anon_inode_create_getfile---of 1
anon_inode_getfd40%of 10
anon_inode_getfile43%of 7
anon_inode_getfile_fmode---of 9
anon_inodefs_dname---of 1
anon_inodefs_init_fs_context---of 3
-----------
SUMMARY42%of 17

-----------
SUMMARY---of 0

__do_compat_sys_x32_rt_sigreturn---of 8
__ia32_sys_rt_sigreturn---of 8
copy_siginfo_to_user32---of 4
restore_sigcontext---of 6
sigaction_compat_abi---of 6
x32_copy_siginfo_to_user---of 3
x32_setup_rt_frame---of 43
x64_setup_rt_frame9%of 47
-----------
SUMMARY9%of 47

__kernfs_fh_to_dentry---of 10
kernfs_encode_fh---of 3
kernfs_fh_to_dentry---of 8
kernfs_fh_to_parent---of 1
kernfs_free_fs_context---of 1
kernfs_get_parent_dentry---of 3
kernfs_get_tree---of 14
kernfs_kill_sb---of 3
kernfs_node_dentry---of 14
kernfs_root_from_sb67%of 3
kernfs_set_super---of 1
kernfs_sop_show_options---of 6
kernfs_sop_show_path---of 6
kernfs_statfs---of 1
kernfs_super_ns---of 1
kernfs_test_super---of 3
-----------
SUMMARY67%of 3

-----------
SUMMARY---of 0

__ia32_sys_splice---of 1
__ia32_sys_tee---of 9
__ia32_sys_vmsplice---of 1
__se_sys_splice8%of 27
__se_sys_vmsplice8%of 53
__splice_from_pipe---of 20
__x64_sys_splice100%of 1
__x64_sys_tee23%of 9
__x64_sys_vmsplice100%of 1
add_to_pipe---of 4
copy_splice_read25%of 16
direct_file_splice_eof---of 3
direct_splice_actor---of 5
do_splice---of 60
do_splice_direct---of 5
do_tee---of 35
file_end_write---of 11
file_start_write---of 11
folio_lock---of 9
ipipe_prep---of 8
iter_file_splice_write---of 33
opipe_prep---of 9
page_cache_pipe_buf_confirm---of 23
page_cache_pipe_buf_release---of 1
page_cache_pipe_buf_try_steal---of 19
pipe_clear_nowait---of 5
pipe_to_user---of 1
put_page---of 14
splice_direct_to_actor---of 31
splice_file_range---of 9
splice_file_range_actor---of 5
splice_file_to_pipe27%of 15
splice_from_pipe---of 1
splice_from_pipe_next---of 16
splice_grow_spd---of 4
splice_shrink_spd---of 3
splice_to_pipe---of 9
splice_to_socket---of 43
user_page_pipe_buf_try_steal---of 3
vfs_splice_read---of 10
wait_for_space29%of 7
warn_unsupported---of 4
-----------
SUMMARY16%of 129

inet6_csk_addr2sockaddr---of 4
inet6_csk_route_req---of 20
inet6_csk_route_socket18%of 56
inet6_csk_update_pmtu---of 3
inet6_csk_xmit30%of 24
-----------
SUMMARY22%of 80

defrag6_net_exit---of 3
ipv6_defrag36%of 14
nf_defrag_ipv6_disable---of 4
nf_defrag_ipv6_enable---of 5
-----------
SUMMARY36%of 14

ip6frag_init---of 1
ip6frag_key_hashfn---of 1
ip6frag_obj_cmpfn---of 1
ip6frag_obj_hashfn---of 1
jhash2---of 1
nf_ct_frag6_cleanup---of 1
nf_ct_frag6_expire---of 36
nf_ct_frag6_gather3%of 85
nf_ct_frag6_init---of 4
nf_ct_frag6_reasm---of 13
nf_ct_net_exit---of 3
nf_ct_net_init---of 8
nf_ct_net_pre_exit---of 1
nf_frag_pernet---of 16
skb_orphan---of 4
-----------
SUMMARY3%of 85

-----------
SUMMARY---of 0

__sk_queue_drop_skb---of 7
__skb_datagram_iter28%of 36
__skb_recv_datagram---of 8
__skb_try_recv_datagram31%of 13
__skb_try_recv_from_queue18%of 23
__skb_wait_for_more_packets31%of 13
__zerocopy_sg_from_iter---of 58
csum_and_copy_to_iter---of 79
datagram_poll---of 13
hash_and_copy_to_iter---of 3
receiver_wake_function---of 3
simple_copy_to_iter---of 3
skb_copy_and_csum_datagram_msg---of 10
skb_copy_and_hash_datagram_iter---of 1
skb_copy_datagram_from_iter---of 29
skb_copy_datagram_iter27%of 15
skb_free_datagram100%of 1
skb_kill_datagram---of 1
skb_recv_datagram50%of 8
zerocopy_sg_from_iter---of 3
-----------
SUMMARY29%of 109

-----------
SUMMARY---of 0

__bitmap_and---of 9
__bitmap_andnot---of 9
__bitmap_clear72%of 7
__bitmap_complement---of 8
__bitmap_equal---of 8
__bitmap_intersects---of 8
__bitmap_or---of 8
__bitmap_or_equal---of 7
__bitmap_replace---of 7
__bitmap_set72%of 7
__bitmap_shift_left---of 14
__bitmap_shift_right---of 8
__bitmap_subset---of 8
__bitmap_weight---of 6
__bitmap_weight_and---of 6
__bitmap_weight_andnot---of 6
__bitmap_xor---of 8
bitmap_alloc---of 1
bitmap_alloc_node---of 1
bitmap_bitremap---of 16
bitmap_cut---of 17
bitmap_find_next_zero_area_off50%of 4
bitmap_fold---of 4
bitmap_free---of 1
bitmap_from_arr32---of 8
bitmap_onto---of 5
bitmap_remap---of 17
bitmap_to_arr32---of 8
bitmap_zalloc---of 1
bitmap_zalloc_node---of 1
devm_bitmap_alloc---of 4
devm_bitmap_free---of 1
devm_bitmap_zalloc---of 4
-----------
SUMMARY67%of 18

__ia32_compat_sys_old_select---of 3
__ia32_compat_sys_ppoll_time32---of 1
__ia32_compat_sys_ppoll_time64---of 1
__ia32_compat_sys_pselect6_time32---of 5
__ia32_compat_sys_pselect6_time64---of 5
__ia32_compat_sys_select---of 1
__ia32_sys_poll---of 1
__ia32_sys_ppoll---of 1
__ia32_sys_pselect6---of 1
__ia32_sys_select---of 1
__pollwait28%of 11
__se_compat_sys_ppoll_time32---of 8
__se_compat_sys_ppoll_time64---of 8
__se_sys_poll43%of 7
__se_sys_ppoll38%of 8
__se_sys_pselect631%of 13
__se_sys_select34%of 6
__x64_compat_sys_old_select---of 3
__x64_compat_sys_ppoll_time32---of 1
__x64_compat_sys_ppoll_time64---of 1
__x64_compat_sys_pselect6_time32---of 6
__x64_compat_sys_pselect6_time64---of 6
__x64_compat_sys_select---of 1
__x64_sys_poll100%of 1
__x64_sys_ppoll100%of 1
__x64_sys_pselect6100%of 1
__x64_sys_select100%of 1
compat_core_sys_select---of 46
core_sys_select26%of 43
do_compat_pselect---of 12
do_compat_select---of 6
do_restart_poll---of 5
do_select41%of 72
do_sys_poll39%of 47
poll_freewait50%of 10
poll_initwait---of 1
poll_select_finish14%of 22
poll_select_set_timeout---of 4
pollwake---of 4
select_estimate_accuracy---of 5
set_fd_set75%of 4
signal_pending---of 3
-----------
SUMMARY36%of 247

-----------
SUMMARY---of 0

sr_callbacks_rotate---of 5
tipc_dest_del---of 9
tipc_dest_find---of 7
tipc_dest_list_purge34%of 6
tipc_dest_pop---of 8
tipc_dest_push---of 11
tipc_nametbl_build_group---of 22
tipc_nametbl_init---of 3
tipc_nametbl_insert_publ---of 35
tipc_nametbl_lookup_anycast---of 57
tipc_nametbl_lookup_group---of 46
tipc_nametbl_lookup_mcast_nodes---of 45
tipc_nametbl_lookup_mcast_sockets---of 56
tipc_nametbl_publish---of 5
tipc_nametbl_remove_publ---of 85
tipc_nametbl_stop---of 59
tipc_nametbl_subscribe---of 54
tipc_nametbl_unsubscribe---of 11
tipc_nametbl_withdraw---of 6
tipc_net---of 16
tipc_nl_name_table_dump---of 52
tipc_publ_sort---of 1
-----------
SUMMARY34%of 6

-----------
SUMMARY---of 0

__cgroup_bpf_check_dev_permission---of 46
__cgroup_bpf_detach---of 63
__cgroup_bpf_run_filter_getsockopt---of 54
__cgroup_bpf_run_filter_getsockopt_kern---of 29
__cgroup_bpf_run_filter_setsockopt---of 46
__cgroup_bpf_run_filter_sk---of 27
__cgroup_bpf_run_filter_skb---of 70
__cgroup_bpf_run_filter_sock_addr---of 37
__cgroup_bpf_run_filter_sock_ops---of 27
__cgroup_bpf_run_filter_sysctl---of 57
__cgroup_bpf_run_lsm_current---of 37
__cgroup_bpf_run_lsm_sock---of 28
__cgroup_bpf_run_lsm_socket---of 28
activate_effective_progs34%of 9
bpf_cgroup_atype_find---of 25
bpf_cgroup_atype_get---of 12
bpf_cgroup_atype_put---of 7
bpf_cgroup_link_dealloc---of 1
bpf_cgroup_link_detach---of 1
bpf_cgroup_link_fill_link_info---of 3
bpf_cgroup_link_release---of 9
bpf_cgroup_link_show_fdinfo---of 3
bpf_get_local_storage---of 4
bpf_get_netns_cookie_sockopt---of 3
bpf_get_retval---of 1
bpf_set_retval---of 1
bpf_sysctl_get_current_value---of 6
bpf_sysctl_get_name---of 6
bpf_sysctl_get_new_value---of 8
bpf_sysctl_set_new_value---of 6
cg_sockopt_convert_ctx_access---of 13
cg_sockopt_func_proto---of 39
cg_sockopt_get_prologue---of 1
cg_sockopt_is_valid_access---of 21
cgroup_bpf_attach---of 83
cgroup_bpf_inherit50%of 12
cgroup_bpf_link_attach---of 9
cgroup_bpf_offline---of 3
cgroup_bpf_prog_attach---of 10
cgroup_bpf_prog_detach---of 6
cgroup_bpf_prog_query---of 66
cgroup_bpf_release---of 28
cgroup_bpf_release_fn---of 1
cgroup_bpf_replace---of 39
cgroup_common_func_proto---of 29
cgroup_current_func_proto---of 5
cgroup_dev_func_proto---of 33
cgroup_dev_is_valid_access---of 8
compute_effective_progs31%of 33
percpu_ref_get31%of 13
percpu_ref_put---of 14
sysctl_convert_ctx_access---of 5
sysctl_cpy_dir---of 8
sysctl_func_proto---of 38
sysctl_is_valid_access---of 18
to_cgroup_bpf_attach_type---of 30
update_effective_progs---of 15
-----------
SUMMARY35%of 67

proc_ns_dir_lookup31%of 13
proc_ns_dir_readdir---of 16
proc_ns_get_link34%of 9
proc_ns_instantiate---of 3
proc_ns_readlink---of 8
-----------
SUMMARY32%of 22

-----------
SUMMARY---of 0

__tcp_cleanup_rbuf14%of 15
__tcp_close---of 40
__tcp_sock_set_cork---of 4
__tcp_sock_set_nodelay---of 4
__tcp_sock_set_quickack---of 11
can_map_frag---of 6
check_zeroed_sockptr---of 3
copy_from_sockptr---of 4
copy_to_sockptr---of 4
do_tcp_getsockopt---of 124
do_tcp_setsockopt---of 94
memcpy_to_msg---of 1
mmap_read_unlock---of 3
signal_pending---of 3
sk_stream_moderate_sndbuf---of 4
sk_wmem_schedule---of 4
sock_error---of 3
sock_rps_record_flow---of 22
tcp_abort---of 16
tcp_bpf_bypass_getsockopt---of 1
tcp_can_repair_sock---of 3
tcp_check_oom---of 12
tcp_cleanup_rbuf---of 4
tcp_close---of 6
tcp_disconnect---of 51
tcp_done43%of 14
tcp_downgrade_zcopy_pure---of 5
tcp_enable_tx_delay---of 4
tcp_enter_memory_pressure---of 4
tcp_fast_path_check---of 7
tcp_free_fastopen_req---of 3
tcp_get_info---of 28
tcp_get_timestamping_opt_stats---of 12
tcp_getsockopt---of 3
tcp_inbound_hash---of 40
tcp_inbound_md5_hash---of 36
tcp_init_sock---of 1
tcp_inq_hint---of 6
tcp_ioctl---of 27
tcp_leave_memory_pressure50%of 4
tcp_mark_push---of 1
tcp_md5_add_sigpool---of 1
tcp_md5_alloc_sigpool---of 3
tcp_md5_hash_key---of 1
tcp_md5_release_sigpool---of 1
tcp_mmap---of 12
tcp_orphan_count_sum---of 5
tcp_orphan_update---of 5
tcp_peek_len---of 11
tcp_peek_sndq---of 9
tcp_poll---of 48
tcp_push32%of 16
tcp_read_done---of 15
tcp_read_skb---of 17
tcp_read_sock---of 26
tcp_recv_skb---of 11
tcp_recv_timestamp30%of 20
tcp_recvmsg19%of 22
tcp_recvmsg_locked18%of 103
tcp_remove_empty_skb---of 5
tcp_repair_options_est---of 15
tcp_repair_set_window---of 10
tcp_send_mss---of 6
tcp_sendmsg100%of 1
tcp_sendmsg_fastopen30%of 17
tcp_sendmsg_locked28%of 189
tcp_set_rcvlowat---of 6
tcp_set_state25%of 32
tcp_set_window_clamp---of 8
tcp_setsockopt---of 3
tcp_shutdown45%of 9
tcp_skb_entail59%of 12
tcp_sock_set_cork---of 4
tcp_sock_set_keepcnt---of 3
tcp_sock_set_keepidle---of 7
tcp_sock_set_keepidle_locked---of 7
tcp_sock_set_keepintvl---of 3
tcp_sock_set_nodelay---of 3
tcp_sock_set_quickack---of 1
tcp_sock_set_syncnt---of 3
tcp_sock_set_user_timeout---of 3
tcp_splice_data_recv---of 3
tcp_splice_eof---of 6
tcp_splice_read---of 29
tcp_stream_alloc_skb58%of 7
tcp_update_recv_tstamps---of 5
tcp_wmem_free_skb---of 13
tcp_wmem_schedule---of 6
tcp_write_queue_purge43%of 7
tcp_zc_finalize_rx_tstamp---of 4
tcp_zc_handle_leftover---of 11
tcp_zerocopy_receive---of 103
tcp_zerocopy_vm_insert_batch---of 3
tcp_zerocopy_vm_insert_batch_error---of 7
trace_tcp_hash_md5_required---of 15
vma_end_read---of 11
-----------
SUMMARY28%of 468

__bforget---of 6
__bh_read---of 6
__bh_read_batch---of 12
__block_commit_write---of 14
__block_write_begin---of 7
__block_write_begin_int---of 99
__block_write_full_folio---of 67
__bread_gfp---of 13
__breadahead---of 9
__brelse67%of 3
__find_get_block10%of 62
__lock_buffer---of 3
__sync_dirty_buffer---of 13
__wait_on_buffer---of 3
alloc_buffer_head---of 9
alloc_page_buffers---of 7
bdev_getblk14%of 23
bh_read---of 3
bh_uptodate_or_lock---of 7
block_commit_write---of 7
block_dirty_folio---of 14
block_invalidate_folio---of 21
block_is_partially_uptodate---of 11
block_page_mkwrite---of 11
block_read_full_folio---of 56
block_truncate_page---of 35
block_write_begin---of 10
block_write_end---of 9
block_write_full_folio---of 4
buffer_check_dirty_writeback---of 23
buffer_exit_cpu_dead---of 9
clean_bdev_aliases---of 22
cont_write_begin---of 25
create_empty_buffers---of 37
decrypt_bh---of 8
drop_buffers---of 25
end_bio_bh_io_sync---of 3
end_buffer_async_read---of 17
end_buffer_async_read_io---of 10
end_buffer_async_write---of 18
end_buffer_read_sync---of 4
end_buffer_write_sync---of 6
folio_alloc_buffers---of 23
folio_attach_private---of 8
folio_create_buffers---of 11
folio_init_buffers---of 22
folio_lock---of 9
folio_set_bh---of 3
folio_size---of 10
folio_test_uptodate---of 9
folio_zero_new_buffers---of 21
free_buffer_head---of 9
generic_block_bmap---of 3
generic_buffers_fsync---of 7
generic_buffers_fsync_noflush---of 5
generic_cont_expand_simple---of 5
generic_write_end---of 7
has_bh_in_lru---of 19
inode_has_buffers100%of 1
invalidate_bh_lru---of 9
invalidate_bh_lrus---of 1
invalidate_bh_lrus_cpu---of 10
invalidate_inode_buffers---of 9
mark_buffer_async_write---of 3
mark_buffer_dirty---of 30
mark_buffer_dirty_inode---of 9
mark_buffer_write_io_error---of 10
put_page---of 14
remove_inode_buffers---of 10
submit_bh---of 1
submit_bh_wbc---of 14
sync_dirty_buffer---of 1
sync_mapping_buffers---of 43
touch_buffer27%of 15
try_to_free_buffers---of 28
unlock_buffer---of 1
verify_bh---of 4
write_boundary_block---of 4
write_dirty_buffer---of 6
zero_user_segments---of 17
-----------
SUMMARY16%of 104

minmax_running_max---of 14
minmax_running_min36%of 14
-----------
SUMMARY36%of 14

__nfc_alloc_vendor_cmd_reply_skb---of 8
genlmsg_multicast---of 3
nfc_genl_activate_target---of 6
nfc_genl_data_exit---of 1
nfc_genl_data_init---of 1
nfc_genl_deactivate_target---of 5
nfc_genl_dep_link_down---of 4
nfc_genl_dep_link_down_event---of 8
nfc_genl_dep_link_up---of 10
nfc_genl_dep_link_up_event---of 12
nfc_genl_dev_down---of 4
nfc_genl_dev_up---of 4
nfc_genl_device_added---of 6
nfc_genl_device_removed---of 6
nfc_genl_disable_se---of 5
nfc_genl_dump_devices---of 9
nfc_genl_dump_devices_done---of 3
nfc_genl_dump_ses---of 22
nfc_genl_dump_ses_done---of 3
nfc_genl_dump_targets---of 33
nfc_genl_dump_targets_done---of 3
nfc_genl_enable_se---of 5
nfc_genl_exit---of 1
nfc_genl_fw_download---of 5
nfc_genl_fw_download_done---of 8
nfc_genl_get_device---of 9
nfc_genl_llc_get_params---of 17
nfc_genl_llc_sdreq---of 20
nfc_genl_llc_send_sdres---of 17
nfc_genl_llc_set_params---of 21
nfc_genl_rcv_nl_event43%of 7
nfc_genl_se_added---of 8
nfc_genl_se_connectivity---of 8
nfc_genl_se_io---of 11
nfc_genl_se_removed---of 7
nfc_genl_se_transaction---of 10
nfc_genl_send_device---of 10
nfc_genl_setup_device_added---of 8
nfc_genl_start_poll---of 16
nfc_genl_stop_poll---of 6
nfc_genl_target_lost---of 9
nfc_genl_targets_found---of 6
nfc_genl_tm_activated---of 7
nfc_genl_tm_deactivated---of 6
nfc_genl_vendor_cmd---of 16
nfc_se_io---of 10
nfc_urelease_event_work---of 8
nfc_vendor_cmd_reply---of 3
se_io_cb---of 8
-----------
SUMMARY43%of 7

-----------
SUMMARY---of 0

__import_iovec32%of 25
__iov_iter_get_pages_alloc---of 29
_copy_from_iter13%of 81
_copy_from_iter_flushcache---of 74
_copy_from_iter_nocache---of 74
_copy_mc_to_iter---of 79
_copy_to_iter9%of 81
bvec_npages---of 5
copy_compat_iovec_from_user---of 8
copy_page_from_iter50%of 4
copy_page_from_iter_atomic---of 82
copy_page_to_iter---of 5
copy_page_to_iter_nofault---of 80
dup_iter---of 5
fault_in_iov_iter_readable---of 9
fault_in_iov_iter_writeable---of 9
find_subpage---of 16
get_page---of 9
import_iovec67%of 3
import_ubuf50%of 4
iov_iter_advance30%of 10
iov_iter_aligned_iovec---of 7
iov_iter_alignment---of 8
iov_iter_alignment_bvec---of 4
iov_iter_alignment_iovec---of 10
iov_iter_bvec67%of 3
iov_iter_bvec_advance---of 6
iov_iter_discard---of 3
iov_iter_extract_bvec_pages---of 34
iov_iter_extract_kvec_pages---of 31
iov_iter_extract_pages---of 27
iov_iter_extract_xarray_pages---of 34
iov_iter_gap_alignment---of 9
iov_iter_get_pages2---of 4
iov_iter_get_pages_alloc2---of 3
iov_iter_init---of 3
iov_iter_iovec_advance50%of 10
iov_iter_is_aligned---of 13
iov_iter_kvec---of 3
iov_iter_npages---of 8
iov_iter_restore---of 7
iov_iter_revert---of 11
iov_iter_single_seg_count---of 6
iov_iter_xarray---of 3
iov_iter_zero---of 76
iov_npages---of 9
iovec_from_user36%of 17
iter_xarray_get_pages---of 34
page_copy_sane15%of 14
want_pages_array---of 6
xas_next---of 12
xas_reload---of 24
-----------
SUMMARY20%of 252

-----------
SUMMARY---of 0

__ia32_sys_mseal---of 1
__se_sys_mseal---of 29
__x64_sys_mseal---of 1
can_modify_mm75%of 4
can_modify_mm_madv---of 17
-----------
SUMMARY75%of 4

__ext4_forget---of 36
__ext4_handle_dirty_metadata25%of 20
__ext4_journal_ensure_credits---of 12
__ext4_journal_get_create_access---of 16
__ext4_journal_get_write_access16%of 19
__ext4_journal_start_reserved---of 20
__ext4_journal_start_sb15%of 35
__ext4_journal_stop34%of 6
ext4_inode_journal_mode---of 14
ext4_journal_abort_handle---of 8
ext4_journal_check_start38%of 8
-----------
SUMMARY21%of 88

__kthread_cancel_work_sync---of 11
__kthread_create_on_node---of 8
__kthread_create_worker---of 7
__kthread_init_worker---of 1
__kthread_parkme---of 4
__kthread_queue_delayed_work---of 13
free_kthread_struct---of 6
get_kthread_comm---of 7
kthread---of 8
kthread_associate_blkcg---of 32
kthread_bind---of 3
kthread_bind_mask---of 3
kthread_blkcg50%of 4
kthread_cancel_delayed_work_sync---of 1
kthread_cancel_work_sync---of 1
kthread_complete_and_exit---of 3
kthread_create_on_cpu---of 6
kthread_create_on_node---of 1
kthread_create_worker---of 1
kthread_create_worker_on_cpu---of 1
kthread_data---of 3
kthread_delayed_work_timer_fn---of 10
kthread_destroy_worker---of 6
kthread_flush_work---of 7
kthread_flush_work_fn---of 1
kthread_flush_worker---of 7
kthread_freezable_should_stop---of 8
kthread_func---of 4
kthread_insert_work---of 27
kthread_is_per_cpu75%of 4
kthread_mod_delayed_work---of 8
kthread_park---of 8
kthread_parkme---of 3
kthread_probe_data---of 4
kthread_queue_delayed_work---of 7
kthread_queue_work---of 7
kthread_set_per_cpu---of 8
kthread_should_park---of 3
kthread_should_stop---of 3
kthread_should_stop_or_park50%of 4
kthread_stop---of 37
kthread_stop_put---of 4
kthread_unpark---of 6
kthread_unuse_mm---of 9
kthread_use_mm---of 9
kthread_worker_fn---of 48
kthreadd---of 22
set_kthread_struct---of 6
to_kthread---of 3
tsk_fork_get_node---of 3
-----------
SUMMARY59%of 12

__sctp_outq_teardown28%of 33
sctp_check_transmitted---of 76
sctp_generate_fwdtsn---of 23
sctp_outq_flush19%of 153
sctp_outq_free100%of 1
sctp_outq_init100%of 1
sctp_outq_is_empty---of 4
sctp_outq_sack---of 88
sctp_outq_select_transport15%of 20
sctp_outq_tail17%of 24
sctp_outq_teardown---of 1
sctp_outq_uncork100%of 3
sctp_packet_singleton50%of 4
sctp_prsctp_prune---of 19
sctp_prsctp_prune_sent---of 21
sctp_retransmit---of 10
sctp_retransmit_mark---of 31
-----------
SUMMARY22%of 239

-----------
SUMMARY---of 0

__ipv6_addr_type34%of 15
eafnosupport_fib6_get_table---of 1
eafnosupport_fib6_lookup---of 1
eafnosupport_fib6_nh_init---of 3
eafnosupport_fib6_select_path---of 1
eafnosupport_fib6_table_lookup---of 1
eafnosupport_ip6_del_rt---of 1
eafnosupport_ip6_mtu_from_fib6---of 1
eafnosupport_ipv6_dev_find---of 1
eafnosupport_ipv6_dst_lookup_flow---of 1
eafnosupport_ipv6_fragment---of 1
eafnosupport_ipv6_route_input---of 1
in6_dev_finish_destroy---of 12
in6_dev_finish_destroy_rcu---of 1
inet6addr_notifier_call_chain---of 1
inet6addr_validator_notifier_call_chain---of 1
register_inet6addr_notifier---of 1
register_inet6addr_validator_notifier---of 1
unregister_inet6addr_notifier---of 1
unregister_inet6addr_validator_notifier---of 1
-----------
SUMMARY34%of 15

__pskb_trim_head---of 34
__tcp_push_pending_frames30%of 10
__tcp_retransmit_skb---of 61
__tcp_select_window29%of 46
__tcp_send_ack20%of 10
__tcp_transmit_skb40%of 119
bpf_skops_hdr_opt_len---of 12
inet_csk_reset_xmit_timer---of 10
list_move_tail---of 5
mptcp_skb_ext_copy---of 9
sk_forced_mem_schedule37%of 11
skb_fclone_busy---of 4
sock_owned_by_me---of 5
tcp_adjust_pcount---of 15
tcp_advertise_mss31%of 13
tcp_call_bpf25%of 20
tcp_can_coalesce_send_queue_head---of 8
tcp_chrono_start50%of 4
tcp_chrono_stop63%of 8
tcp_clone_payload---of 33
tcp_connect19%of 118
tcp_current_mss28%of 18
tcp_cwnd_restart---of 15
tcp_delack_max---of 1
tcp_eat_one_skb---of 4
tcp_established_options54%of 15
tcp_event_new_data_sent100%of 6
tcp_fragment---of 41
tcp_hdrlen---of 3
tcp_init_tso_segs---of 7
tcp_make_synack---of 68
tcp_mss_to_mtu---of 1
tcp_mstamp_refresh100%of 1
tcp_mtu_check_reprobe---of 3
tcp_mtu_to_mss---of 1
tcp_mtup_init67%of 3
tcp_options_write60%of 27
tcp_pace_kick---of 4
tcp_push_one50%of 4
tcp_release_cb43%of 19
tcp_retrans_try_collapse---of 36
tcp_retransmit_skb---of 10
tcp_rto_min28%of 11
tcp_rtx_synack---of 21
tcp_schedule_loss_probe50%of 20
tcp_select_initial_window50%of 6
tcp_send_ack100%of 1
tcp_send_active_reset---of 22
tcp_send_delayed_ack40%of 10
tcp_send_fin24%of 21
tcp_send_loss_probe---of 25
tcp_send_probe0---of 11
tcp_send_synack20%of 25
tcp_send_window_probe---of 4
tcp_skb_collapse_tstamp---of 4
tcp_small_queue_check34%of 15
tcp_syn_options34%of 30
tcp_sync_mss60%of 5
tcp_tasklet_func---of 16
tcp_trim_head---of 15
tcp_tso_segs56%of 9
tcp_tsq_handler---of 10
tcp_tsq_write---of 7
tcp_update_skb_after_send---of 10
tcp_wfree20%of 25
tcp_wmem_free_skb31%of 13
tcp_write_wakeup---of 16
tcp_write_xmit30%of 191
tcp_xmit_probe_skb---of 4
tcp_xmit_retransmit_queue---of 30
-----------
SUMMARY33%of 834

-----------
SUMMARY---of 0

errseq_check67%of 3
errseq_check_and_advance---of 4
errseq_sample100%of 1
errseq_set---of 5
-----------
SUMMARY75%of 4

__sk_dst_reset---of 7
retransmits_timed_out---of 13
tcp_call_bpf---of 20
tcp_clamp_probe0_to_user_timeout---of 5
tcp_compressed_ack_kick---of 14
tcp_delack_timer---of 13
tcp_delack_timer_handler---of 10
tcp_init_xmit_timers---of 1
tcp_keepalive_timer---of 39
tcp_out_of_resources---of 8
tcp_retransmit_timer---of 116
tcp_rto_min---of 11
tcp_set_keepalive---of 8
tcp_syn_ack_timeout---of 1
tcp_write_err---of 1
tcp_write_timer---of 13
tcp_write_timer_handler12%of 25
-----------
SUMMARY12%of 25

-----------
SUMMARY---of 0

__sctp_connect19%of 37
__sctp_setsockopt_delayed_ack---of 28
__sctp_write_space---of 27
sctp_accept---of 15
sctp_addr_id2transport---of 23
sctp_apply_asoc_delayed_ack---of 14
sctp_apply_peer_addr_params---of 71
sctp_asconf_mgmt---of 14
sctp_assoc_ulpevent_type_set---of 11
sctp_auto_asconf_init---of 4
sctp_bind---of 5
sctp_bind_add---of 1
sctp_bindx_rem---of 25
sctp_bpf_bypass_getsockopt---of 6
sctp_clear_owner_w---of 4
sctp_close---of 27
sctp_connect_add_peer---of 14
sctp_connect_new_asoc15%of 20
sctp_copy_sock---of 3
sctp_data_ready---of 39
sctp_destroy_sock---of 9
sctp_destruct_sock---of 1
sctp_disconnect---of 1
sctp_do_bind---of 29
sctp_do_peeloff---of 14
sctp_enter_memory_pressure---of 1
sctp_for_each_endpoint---of 8
sctp_for_each_tx_datachunk---of 59
sctp_get_port_local---of 74
sctp_get_sctp_info---of 8
sctp_getsockopt5%of 66
sctp_getsockopt_active_key---of 23
sctp_getsockopt_adaptation_layer---of 4
sctp_getsockopt_asconf_supported---of 20
sctp_getsockopt_assoc_ids---of 12
sctp_getsockopt_assoc_number---of 6
sctp_getsockopt_assoc_stats---of 17
sctp_getsockopt_associnfo---of 21
sctp_getsockopt_auth_supported---of 20
sctp_getsockopt_auto_asconf---of 7
sctp_getsockopt_autoclose---of 5
sctp_getsockopt_connectx328%of 22
sctp_getsockopt_context---of 20
sctp_getsockopt_default_prinfo---of 20
sctp_getsockopt_default_send_param---of 20
sctp_getsockopt_default_sndinfo---of 20
sctp_getsockopt_delayed_ack---of 28
sctp_getsockopt_disable_fragments---of 4
sctp_getsockopt_ecn_supported---of 20
sctp_getsockopt_enable_strreset---of 20
sctp_getsockopt_encap_port---of 24
sctp_getsockopt_event---of 24
sctp_getsockopt_events---of 8
sctp_getsockopt_fragment_interleave---of 4
sctp_getsockopt_hmac_ident---of 9
sctp_getsockopt_initmsg---of 4
sctp_getsockopt_interleaving_supported---of 20
sctp_getsockopt_local_addrs---of 55
sctp_getsockopt_local_auth_chunks---of 28
sctp_getsockopt_mappedv4---of 4
sctp_getsockopt_maxburst---of 27
sctp_getsockopt_maxseg---of 28
sctp_getsockopt_nodelay---of 4
sctp_getsockopt_paddr_thresholds---of 22
sctp_getsockopt_partial_delivery_point---of 4
sctp_getsockopt_peeloff---of 7
sctp_getsockopt_peeloff_common---of 8
sctp_getsockopt_peeloff_flags---of 7
sctp_getsockopt_peer_addr_info---of 13
sctp_getsockopt_peer_addr_params---of 40
sctp_getsockopt_peer_addrs---of 22
sctp_getsockopt_peer_auth_chunks---of 22
sctp_getsockopt_pf_expose---of 20
sctp_getsockopt_pr_assocstatus---of 21
sctp_getsockopt_pr_streamstatus---of 23
sctp_getsockopt_pr_supported---of 20
sctp_getsockopt_primary_addr---of 17
sctp_getsockopt_probe_interval---of 24
sctp_getsockopt_reconfig_supported---of 20
sctp_getsockopt_recvnxtinfo---of 6
sctp_getsockopt_recvrcvinfo---of 6
sctp_getsockopt_reuse_port---of 4
sctp_getsockopt_rtoinfo---of 20
sctp_getsockopt_scheduler---of 20
sctp_getsockopt_scheduler_value---of 17
sctp_getsockopt_sctp_status---of 21
sctp_hash---of 1
sctp_id2assoc---of 9
sctp_inet_connect---of 8
sctp_inet_listen---of 23
sctp_init_sock---of 9
sctp_ioctl---of 7
sctp_poll19%of 37
sctp_put_port---of 7
sctp_recvmsg---of 39
sctp_send_asconf---of 5
sctp_send_asconf_add_ip---of 29
sctp_send_asconf_del_ip---of 41
sctp_sendmsg---of 135
sctp_sendmsg_check_sflags---of 13
sctp_sendmsg_to_asoc---of 62
sctp_set_owner_w---of 8
sctp_setsockopt---of 78
sctp_setsockopt_active_key---of 25
sctp_setsockopt_add_streams---of 14
sctp_setsockopt_asconf_supported---of 19
sctp_setsockopt_associnfo---of 27
sctp_setsockopt_auth_chunk---of 8
sctp_setsockopt_auth_key---of 26
sctp_setsockopt_auth_supported---of 21
sctp_setsockopt_auto_asconf---of 15
sctp_setsockopt_autoclose---of 5
sctp_setsockopt_bindx---of 22
sctp_setsockopt_connectx---of 9
sctp_setsockopt_connectx_old---of 7
sctp_setsockopt_context---of 23
sctp_setsockopt_deactivate_key---of 25
sctp_setsockopt_default_prinfo---of 26
sctp_setsockopt_default_send_param---of 24
sctp_setsockopt_default_sndinfo---of 24
sctp_setsockopt_del_key---of 25
sctp_setsockopt_delayed_ack---of 7
sctp_setsockopt_ecn_supported---of 17
sctp_setsockopt_enable_strreset---of 24
sctp_setsockopt_encap_port---of 21
sctp_setsockopt_event---of 24
sctp_setsockopt_events---of 22
sctp_setsockopt_hmac_ident---of 5
sctp_setsockopt_initmsg---of 10
sctp_setsockopt_interleaving_supported---of 20
sctp_setsockopt_maxburst---of 27
sctp_setsockopt_maxseg---of 27
sctp_setsockopt_paddr_thresholds---of 39
sctp_setsockopt_peer_addr_params---of 33
sctp_setsockopt_peer_primary_addr---of 24
sctp_setsockopt_pf_expose---of 18
sctp_setsockopt_pr_supported---of 17
sctp_setsockopt_primary_addr---of 6
sctp_setsockopt_probe_interval---of 41
sctp_setsockopt_reconfig_supported---of 17
sctp_setsockopt_reset_assoc---of 14
sctp_setsockopt_reset_streams---of 15
sctp_setsockopt_rtoinfo---of 29
sctp_setsockopt_scheduler---of 24
sctp_setsockopt_scheduler_value---of 20
sctp_shutdown---of 4
sctp_skb_pull---of 11
sctp_skb_recv_datagram---of 29
sctp_skb_set_owner_r_frag---of 10
sctp_sock_migrate---of 35
sctp_sock_rfree43%of 7
sctp_transport_get_idx---of 14
sctp_transport_get_next---of 10
sctp_transport_lookup_process---of 21
sctp_transport_traverse_process---of 20
sctp_transport_walk_start---of 1
sctp_transport_walk_stop---of 1
sctp_ulpevent_type_set---of 6
sctp_unhash---of 1
sctp_v6_destruct_sock---of 1
sctp_v6_init_sock---of 3
sctp_wait_for_connect31%of 13
sctp_wfree---of 23
sctp_write_space---of 4
-----------
SUMMARY17%of 202

-----------
SUMMARY---of 0

__bpf_trace_signal_deliver---of 1
__bpf_trace_signal_generate---of 1
__compat_save_altstack---of 1
__copy_siginfo_from_user---of 19
__copy_siginfo_to_user32---of 1
__dequeue_signal---of 21
__do_sys_pause---of 4
__flush_itimer_signals---of 10
__ia32_compat_sys_rt_sigaction---of 1
__ia32_compat_sys_rt_sigpending---of 3
__ia32_compat_sys_rt_sigprocmask---of 1
__ia32_compat_sys_rt_sigqueueinfo---of 5
__ia32_compat_sys_rt_sigsuspend---of 4
__ia32_compat_sys_rt_sigtimedwait_time32---of 1
__ia32_compat_sys_rt_sigtimedwait_time64---of 1
__ia32_compat_sys_rt_tgsigqueueinfo---of 1
__ia32_compat_sys_sigaction---of 1
__ia32_compat_sys_sigaltstack---of 1
__ia32_compat_sys_sigpending---of 1
__ia32_sys_kill---of 1
__ia32_sys_pidfd_send_signal---of 1
__ia32_sys_restart_syscall---of 1
__ia32_sys_rt_sigaction---of 9
__ia32_sys_rt_sigpending---of 3
__ia32_sys_rt_sigprocmask---of 1
__ia32_sys_rt_sigqueueinfo---of 5
__ia32_sys_rt_sigsuspend---of 4
__ia32_sys_rt_sigtimedwait---of 1
__ia32_sys_rt_sigtimedwait_time32---of 1
__ia32_sys_rt_tgsigqueueinfo---of 6
__ia32_sys_sigaltstack---of 6
__ia32_sys_signal---of 3
__ia32_sys_sigpending---of 1
__ia32_sys_sigprocmask---of 1
__ia32_sys_sigsuspend---of 1
__ia32_sys_ssetmask---of 1
__ia32_sys_tgkill---of 1
__ia32_sys_tkill---of 1
__kill_pgrp_info---of 5
__lock_task_sighand---of 19
__probestub_signal_deliver---of 1
__probestub_signal_generate---of 1
__save_altstack---of 1
__se_compat_sys_rt_sigaction---of 7
__se_compat_sys_rt_sigprocmask---of 10
__se_compat_sys_rt_sigtimedwait_time32---of 8
__se_compat_sys_rt_sigtimedwait_time64---of 8
__se_compat_sys_rt_tgsigqueueinfo---of 6
__se_compat_sys_sigaction---of 15
__se_sys_kill---of 16
__se_sys_pidfd_send_signal---of 33
__se_sys_rt_sigprocmask---of 11
__se_sys_rt_sigtimedwait---of 10
__se_sys_rt_sigtimedwait_time32---of 10
__se_sys_sigprocmask---of 10
__se_sys_tgkill---of 3
__se_sys_tkill---of 3
__send_signal_locked20%of 50
__set_current_blocked34%of 6
__sigqueue_alloc25%of 32
__sigqueue_free---of 4
__traceiter_signal_deliver---of 4
__traceiter_signal_generate---of 4
__x64_compat_sys_rt_sigaction---of 1
__x64_compat_sys_rt_sigpending---of 3
__x64_compat_sys_rt_sigprocmask---of 1
__x64_compat_sys_rt_sigqueueinfo---of 5
__x64_compat_sys_rt_sigsuspend---of 4
__x64_compat_sys_rt_sigtimedwait_time32---of 1
__x64_compat_sys_rt_sigtimedwait_time64---of 1
__x64_compat_sys_rt_tgsigqueueinfo---of 1
__x64_compat_sys_sigaction---of 1
__x64_compat_sys_sigaltstack---of 1
__x64_compat_sys_sigpending---of 1
__x64_sys_kill---of 1
__x64_sys_pidfd_send_signal---of 1
__x64_sys_rt_sigaction---of 9
__x64_sys_rt_sigpending---of 3
__x64_sys_rt_sigprocmask---of 1
__x64_sys_rt_sigqueueinfo---of 5
__x64_sys_rt_sigsuspend---of 4
__x64_sys_rt_sigtimedwait---of 1
__x64_sys_rt_sigtimedwait_time32---of 1
__x64_sys_rt_tgsigqueueinfo---of 6
__x64_sys_sgetmask---of 1
__x64_sys_sigaltstack---of 6
__x64_sys_signal---of 3
__x64_sys_sigpending---of 1
__x64_sys_sigprocmask---of 1
__x64_sys_sigsuspend---of 1
__x64_sys_ssetmask---of 1
__x64_sys_tgkill---of 1
__x64_sys_tkill---of 1
calculate_sigpending---of 1
cgroup_threadgroup_change_end---of 10
check_kill_permission---of 21
compat_restore_altstack---of 6
complete_signal15%of 57
copy_siginfo_from_user---of 19
copy_siginfo_from_user32---of 3
copy_siginfo_to_external32---of 26
copy_siginfo_to_user67%of 3
dequeue_signal---of 12
do_compat_sigaltstack---of 7
do_freezer_trap---of 3
do_jobctl_trap---of 8
do_no_restart_syscall---of 1
do_notify_parent---of 63
do_notify_parent_cldstop---of 50
do_notify_pidfd---of 3
do_send_sig_info---of 3
do_send_specific---of 20
do_sigaction---of 21
do_sigaltstack---of 18
do_signal_stop---of 34
do_sigtimedwait---of 14
exit_signals---of 18
flush_itimer_signals---of 1
flush_signal_handlers---of 9
flush_signals---of 17
flush_sigqueue---of 9
flush_sigqueue_mask---of 13
force_exit_sig---of 1
force_fatal_sig---of 1
force_sig100%of 1
force_sig_bnderr---of 1
force_sig_fault100%of 1
force_sig_fault_to_task---of 1
force_sig_fault_trapno---of 1
force_sig_info---of 1
force_sig_info_to_task27%of 19
force_sig_mceerr---of 3
force_sig_pkuerr---of 1
force_sig_ptrace_errno_trap---of 1
force_sig_seccomp---of 1
force_sigsegv---of 3
get_signal15%of 87
group_send_sig_info---of 15
ignore_signals---of 3
kernel_sigaction---of 5
kill_pgrp---of 5
kill_pid---of 1
kill_pid_info---of 1
kill_pid_info_type---of 15
kill_pid_usb_asyncio---of 26
kill_proc_info---of 11
lockdep_assert_task_sighand_held---of 20
next_signal---of 3
perf_trace_signal_deliver---of 10
perf_trace_signal_generate---of 10
post_copy_siginfo_from_user32---of 27
prepare_signal15%of 40
print_dropped_signal---of 4
ptrace_notify---of 4
ptrace_signal---of 34
ptrace_stop---of 25
ptrace_trap_notify---of 13
rcu_read_unlock---of 6
recalc_sigpending25%of 8
restore_altstack---of 3
retarget_shared_pending---of 16
send_sig---of 4
send_sig_fault---of 4
send_sig_fault_trapno---of 4
send_sig_info---of 4
send_sig_mceerr---of 5
send_sig_perf---of 3
send_signal_locked22%of 46
send_sigqueue---of 27
set_compat_user_sigmask---of 5
set_current_blocked---of 1
set_user_sigmask40%of 5
siginfo_layout---of 10
signal_setup_done40%of 10
signal_wake_up_state50%of 6
sigprocmask---of 7
sigqueue_alloc---of 1
sigqueue_free---of 6
sigsuspend---of 4
task_clear_jobctl_pending---of 4
task_clear_jobctl_trapping---of 3
task_join_group_stop---of 10
task_participate_group_stop---of 12
task_set_jobctl_pending---of 9
trace_event_raw_event_signal_deliver---of 9
trace_event_raw_event_signal_generate---of 9
trace_raw_output_signal_deliver---of 3
trace_raw_output_signal_generate---of 3
trace_signal_deliver27%of 15
trace_signal_generate27%of 15
unhandled_signal---of 8
zap_other_threads---of 10
-----------
SUMMARY22%of 401

-----------
SUMMARY---of 0

__ep_eventpoll_poll---of 30
__ep_remove---of 31
__ia32_compat_sys_epoll_pwait---of 4
__ia32_compat_sys_epoll_pwait2---of 5
__ia32_sys_epoll_create---of 3
__ia32_sys_epoll_create1---of 1
__ia32_sys_epoll_ctl---of 4
__ia32_sys_epoll_pwait---of 4
__ia32_sys_epoll_pwait2---of 5
__ia32_sys_epoll_wait---of 4
__x64_compat_sys_epoll_pwait---of 4
__x64_compat_sys_epoll_pwait2---of 5
__x64_sys_epoll_create67%of 3
__x64_sys_epoll_create1100%of 1
__x64_sys_epoll_ctl50%of 4
__x64_sys_epoll_pwait50%of 4
__x64_sys_epoll_pwait2---of 5
__x64_sys_epoll_wait50%of 4
do_compat_epoll_pwait---of 7
do_epoll_create34%of 9
do_epoll_ctl14%of 74
do_epoll_pwait29%of 7
do_epoll_wait20%of 101
ep_autoremove_wake_function---of 3
ep_busy_loop_end---of 7
ep_clear_and_put---of 14
ep_destroy_wakeup_source---of 7
ep_done_scan25%of 20
ep_eventpoll_ioctl---of 14
ep_eventpoll_poll---of 1
ep_eventpoll_release---of 3
ep_insert20%of 96
ep_loop_check_proc---of 10
ep_modify---of 36
ep_pm_stay_awake_rcu---of 18
ep_poll_callback---of 30
ep_ptable_queue_proc40%of 5
ep_remove_safe---of 3
ep_show_fdinfo---of 5
ep_unregister_pollwait---of 16
epoll_mutex_lock---of 3
eventpoll_release_file---of 7
get_epoll_tfile_raw_ptr---of 8
reverse_path_check_proc---of 10
-----------
SUMMARY22%of 328

__bpf_trace_hrtimer_class---of 1
__bpf_trace_hrtimer_expire_entry---of 1
__bpf_trace_hrtimer_init---of 1
__bpf_trace_hrtimer_start---of 1
__bpf_trace_itimer_expire---of 1
__bpf_trace_itimer_state---of 1
__bpf_trace_tick_stop---of 1
__bpf_trace_timer_base_idle---of 1
__bpf_trace_timer_class---of 1
__bpf_trace_timer_expire_entry---of 1
__bpf_trace_timer_start---of 1
__get_next_timer_interrupt---of 31
__mod_timer40%of 43
__probestub_hrtimer_cancel---of 1
__probestub_hrtimer_expire_entry---of 1
__probestub_hrtimer_expire_exit---of 1
__probestub_hrtimer_init---of 1
__probestub_hrtimer_start---of 1
__probestub_itimer_expire---of 1
__probestub_itimer_state---of 1
__probestub_tick_stop---of 1
__probestub_timer_base_idle---of 1
__probestub_timer_cancel---of 1
__probestub_timer_expire_entry---of 1
__probestub_timer_expire_exit---of 1
__probestub_timer_init---of 1
__probestub_timer_start---of 1
__round_jiffies---of 1
__round_jiffies_relative---of 1
__round_jiffies_up---of 1
__round_jiffies_up_relative---of 1
__run_timer_base---of 31
__timer_delete60%of 10
__timer_delete_sync---of 13
__traceiter_hrtimer_cancel---of 4
__traceiter_hrtimer_expire_entry---of 4
__traceiter_hrtimer_expire_exit---of 4
__traceiter_hrtimer_init---of 4
__traceiter_hrtimer_start---of 4
__traceiter_itimer_expire---of 4
__traceiter_itimer_state---of 4
__traceiter_tick_stop---of 4
__traceiter_timer_base_idle---of 4
__traceiter_timer_cancel---of 4
__traceiter_timer_expire_entry---of 4
__traceiter_timer_expire_exit---of 4
__traceiter_timer_init---of 4
__traceiter_timer_start---of 4
__try_to_del_timer_sync---of 10
add_timer---of 3
add_timer_global---of 3
add_timer_local---of 3
add_timer_on---of 13
calc_wheel_index60%of 10
call_timer_fn---of 35
destroy_timer_on_stack---of 1
detach_timer43%of 19
enqueue_timer31%of 23
fetch_next_timer_interrupt---of 15
fetch_next_timer_interrupt_remote---of 9
get_next_timer_interrupt---of 1
init_timer_key30%of 17
init_timer_on_stack_key---of 3
lock_timer_base38%of 8
mod_timer100%of 1
mod_timer_pending---of 1
msleep---of 4
msleep_interruptible---of 6
next_expiry_recalc---of 9
perf_trace_hrtimer_class---of 8
perf_trace_hrtimer_expire_entry---of 8
perf_trace_hrtimer_init---of 8
perf_trace_hrtimer_start---of 8
perf_trace_itimer_expire---of 9
perf_trace_itimer_state---of 8
perf_trace_tick_stop---of 8
perf_trace_timer_base_idle---of 8
perf_trace_timer_class---of 8
perf_trace_timer_expire_entry---of 8
perf_trace_timer_start---of 8
process_timeout---of 1
round_jiffies---of 1
round_jiffies_relative---of 1
round_jiffies_up---of 1
round_jiffies_up_relative---of 1
run_timer_softirq---of 9
schedule_timeout40%of 5
schedule_timeout_idle---of 1
schedule_timeout_interruptible---of 1
schedule_timeout_killable---of 1
schedule_timeout_uninterruptible---of 1
stub_timer---of 1
timer_base_is_idle---of 1
timer_base_try_to_set_idle---of 3
timer_clear_idle---of 1
timer_debug_hint---of 4
timer_delete100%of 1
timer_delete_sync---of 1
timer_expire_remote---of 3
timer_fixup_activate---of 4
timer_fixup_assert_init---of 3
timer_fixup_free---of 3
timer_fixup_init---of 3
timer_is_static_object---of 3
timer_lock_remote_bases---of 7
timer_migration_handler---of 5
timer_reduce100%of 1
timer_shutdown---of 1
timer_shutdown_sync---of 1
timer_unlock_remote_bases---of 3
timer_update_keys---of 4
timers_dead_cpu---of 19
timers_prepare_cpu---of 7
timers_update_nohz---of 1
trace_event_raw_event_hrtimer_class---of 7
trace_event_raw_event_hrtimer_expire_entry---of 7
trace_event_raw_event_hrtimer_init---of 7
trace_event_raw_event_hrtimer_start---of 7
trace_event_raw_event_itimer_expire---of 8
trace_event_raw_event_itimer_state---of 7
trace_event_raw_event_tick_stop---of 7
trace_event_raw_event_timer_base_idle---of 7
trace_event_raw_event_timer_class---of 7
trace_event_raw_event_timer_expire_entry---of 7
trace_event_raw_event_timer_start---of 7
trace_raw_output_hrtimer_class---of 3
trace_raw_output_hrtimer_expire_entry---of 3
trace_raw_output_hrtimer_init---of 3
trace_raw_output_hrtimer_start---of 3
trace_raw_output_itimer_expire---of 3
trace_raw_output_itimer_state---of 3
trace_raw_output_tick_stop---of 3
trace_raw_output_timer_base_idle---of 3
trace_raw_output_timer_class---of 3
trace_raw_output_timer_expire_entry---of 3
trace_raw_output_timer_start---of 3
trace_timer_base_idle---of 15
try_to_del_timer_sync---of 1
update_process_times---of 10
usleep_range_state---of 5
-----------
SUMMARY42%of 138

can_create---of 16
can_get_proto---of 18
can_pernet_exit---of 3
can_pernet_init---of 6
can_proto_register---of 5
can_proto_unregister---of 5
can_rcv---of 9
can_rcv_filter---of 29
can_receive---of 19
can_rx_delete_receiver---of 5
can_rx_register23%of 22
can_rx_unregister---of 30
can_send---of 28
can_sock_destruct---of 1
canfd_rcv---of 9
canxl_rcv---of 10
-----------
SUMMARY23%of 22

___d_drop40%of 10
__d_add60%of 22
__d_alloc46%of 11
__d_drop---of 3
__d_free---of 1
__d_free_external---of 1
__d_instantiate56%of 20
__d_lookup25%of 33
__d_lookup_rcu24%of 21
__d_lookup_rcu_op_compare---of 19
__d_lookup_unhash40%of 10
__d_lookup_unhash_wake---of 1
__d_move---of 65
__d_obtain_alias---of 30
__d_rehash40%of 10
__d_unalias---of 8
__dentry_kill63%of 29
d_add---of 3
d_add_ci---of 15
d_alloc50%of 4
d_alloc_anon100%of 1
d_alloc_cursor---of 4
d_alloc_name---of 4
d_alloc_parallel12%of 67
d_alloc_pseudo50%of 4
d_ancestor---of 4
d_delete---of 4
d_drop---of 3
d_exact_alias---of 16
d_exchange---of 9
d_find_alias---of 8
d_find_alias_rcu---of 8
d_find_any_alias---of 3
d_genocide---of 1
d_genocide_kill---of 6
d_hash_and_lookup---of 7
d_instantiate50%of 4
d_instantiate_new---of 5
d_invalidate25%of 8
d_lookup75%of 4
d_lru_add72%of 7
d_make_root---of 5
d_mark_dontcache---of 4
d_mark_tmpfile---of 6
d_move---of 1
d_obtain_alias---of 1
d_obtain_root---of 1
d_prune_aliases---of 6
d_rehash---of 1
d_same_name---of 6
d_set_d_op70%of 20
d_set_mounted---of 10
d_splice_alias24%of 17
d_tmpfile---of 4
d_walk---of 48
dentry_free63%of 8
dentry_lru_isolate---of 11
dentry_lru_isolate_shrink---of 6
dentry_unlink_inode39%of 13
dget_parent---of 27
do_one_tree---of 3
dput23%of 22
dput_to_list---of 12
fast_dput48%of 17
find_submount---of 3
hlist_bl_lock29%of 7
hlist_bl_unlock---of 4
is_subdir---of 17
lock_for_kill34%of 9
path_check_mount---of 5
path_has_submounts---of 1
proc_nr_dentry---of 13
prune_dcache_sb---of 1
rcu_read_unlock34%of 6
read_seqbegin50%of 10
read_word_at_a_time100%of 1
release_dentry_name_snapshot---of 5
retain_dentry---of 13
select_collect---of 8
select_collect2---of 11
shrink_dcache_for_umount---of 8
shrink_dcache_parent---of 12
shrink_dcache_sb---of 15
shrink_dentry_list---of 24
shrink_kill---of 22
start_dir_add34%of 9
take_dentry_name_snapshot---of 3
to_shrink_list---of 13
umount_check---of 7
write_seqlock---of 1
write_sequnlock---of 1
-----------
SUMMARY39%of 404

__bpf_trace_balance_dirty_pages---of 1
__bpf_trace_bdi_dirty_ratelimit---of 1
__bpf_trace_flush_foreign---of 1
__bpf_trace_global_dirty_state---of 1
__bpf_trace_inode_foreign_history---of 1
__bpf_trace_inode_switch_wbs---of 1
__bpf_trace_track_foreign_dirty---of 1
__bpf_trace_wbc_class---of 1
__bpf_trace_writeback_bdi_register---of 1
__bpf_trace_writeback_class---of 1
__bpf_trace_writeback_dirty_inode_template---of 1
__bpf_trace_writeback_folio_template---of 1
__bpf_trace_writeback_inode_template---of 1
__bpf_trace_writeback_pages_written---of 1
__bpf_trace_writeback_queue_io---of 1
__bpf_trace_writeback_sb_inodes_requeue---of 1
__bpf_trace_writeback_single_inode_template---of 1
__bpf_trace_writeback_work_class---of 1
__bpf_trace_writeback_write_inode_template---of 1
__inode_attach_wb10%of 32
__mark_inode_dirty25%of 82
__probestub_balance_dirty_pages---of 1
__probestub_bdi_dirty_ratelimit---of 1
__probestub_flush_foreign---of 1
__probestub_folio_wait_writeback---of 1
__probestub_global_dirty_state---of 1
__probestub_inode_foreign_history---of 1
__probestub_inode_switch_wbs---of 1
__probestub_sb_clear_inode_writeback---of 1
__probestub_sb_mark_inode_writeback---of 1
__probestub_track_foreign_dirty---of 1
__probestub_wbc_writepage---of 1
__probestub_writeback_bdi_register---of 1
__probestub_writeback_dirty_folio---of 1
__probestub_writeback_dirty_inode---of 1
__probestub_writeback_dirty_inode_enqueue---of 1
__probestub_writeback_dirty_inode_start---of 1
__probestub_writeback_exec---of 1
__probestub_writeback_lazytime---of 1
__probestub_writeback_lazytime_iput---of 1
__probestub_writeback_mark_inode_dirty---of 1
__probestub_writeback_pages_written---of 1
__probestub_writeback_queue---of 1
__probestub_writeback_queue_io---of 1
__probestub_writeback_sb_inodes_requeue---of 1
__probestub_writeback_single_inode---of 1
__probestub_writeback_single_inode_start---of 1
__probestub_writeback_start---of 1
__probestub_writeback_wait---of 1
__probestub_writeback_wake_background---of 1
__probestub_writeback_write_inode---of 1
__probestub_writeback_write_inode_start---of 1
__probestub_writeback_written---of 1
__traceiter_balance_dirty_pages---of 4
__traceiter_bdi_dirty_ratelimit---of 4
__traceiter_flush_foreign---of 4
__traceiter_folio_wait_writeback---of 4
__traceiter_global_dirty_state---of 4
__traceiter_inode_foreign_history---of 4
__traceiter_inode_switch_wbs---of 4
__traceiter_sb_clear_inode_writeback---of 4
__traceiter_sb_mark_inode_writeback---of 4
__traceiter_track_foreign_dirty---of 4
__traceiter_wbc_writepage---of 4
__traceiter_writeback_bdi_register---of 4
__traceiter_writeback_dirty_folio---of 4
__traceiter_writeback_dirty_inode---of 4
__traceiter_writeback_dirty_inode_enqueue---of 4
__traceiter_writeback_dirty_inode_start---of 4
__traceiter_writeback_exec---of 4
__traceiter_writeback_lazytime---of 4
__traceiter_writeback_lazytime_iput---of 4
__traceiter_writeback_mark_inode_dirty---of 4
__traceiter_writeback_pages_written---of 4
__traceiter_writeback_queue---of 4
__traceiter_writeback_queue_io---of 4
__traceiter_writeback_sb_inodes_requeue---of 4
__traceiter_writeback_single_inode---of 4
__traceiter_writeback_single_inode_start---of 4
__traceiter_writeback_start---of 4
__traceiter_writeback_wait---of 4
__traceiter_writeback_wake_background---of 4
__traceiter_writeback_write_inode---of 4
__traceiter_writeback_write_inode_start---of 4
__traceiter_writeback_written---of 4
__wakeup_flusher_threads_bdi---of 10
__writeback_inodes_sb_nr---of 4
__writeback_inodes_wb---of 9
__writeback_single_inode---of 87
bdi_split_work_to_wbs---of 41
cgroup_writeback_by_id---of 25
cgroup_writeback_umount---of 3
cleanup_offline_cgwb---of 22
dirtytime_interval_handler---of 3
folio_memcg---of 12
inode_cgwb_move_to_attached---of 11
inode_io_list_del---of 3
inode_io_list_move_locked39%of 13
inode_prepare_wbs_switch---of 11
inode_switch_wbs---of 28
inode_switch_wbs_work_fn---of 77
inode_wait_for_writeback50%of 4
locked_inode_to_wb_and_lock_list30%of 10
move_expired_inodes---of 22
percpu_ref_put_many---of 14
percpu_ref_tryget---of 16
perf_trace_balance_dirty_pages---of 11
perf_trace_bdi_dirty_ratelimit---of 9
perf_trace_flush_foreign---of 9
perf_trace_global_dirty_state---of 8
perf_trace_inode_foreign_history---of 11
perf_trace_inode_switch_wbs---of 9
perf_trace_track_foreign_dirty---of 13
perf_trace_wbc_class---of 11
perf_trace_writeback_bdi_register---of 9
perf_trace_writeback_class---of 9
perf_trace_writeback_dirty_inode_template---of 9
perf_trace_writeback_folio_template---of 14
perf_trace_writeback_inode_template---of 8
perf_trace_writeback_pages_written---of 8
perf_trace_writeback_queue_io---of 9
perf_trace_writeback_sb_inodes_requeue---of 14
perf_trace_writeback_single_inode_template---of 11
perf_trace_writeback_work_class---of 11
perf_trace_writeback_write_inode_template---of 11
queue_io---of 24
redirty_tail_locked---of 7
sb_clear_inode_writeback---of 19
sb_mark_inode_writeback---of 19
sync_inode_metadata---of 1
sync_inodes_sb---of 37
trace_event_raw_event_balance_dirty_pages---of 10
trace_event_raw_event_bdi_dirty_ratelimit---of 8
trace_event_raw_event_flush_foreign---of 8
trace_event_raw_event_global_dirty_state---of 7
trace_event_raw_event_inode_foreign_history---of 10
trace_event_raw_event_inode_switch_wbs---of 8
trace_event_raw_event_track_foreign_dirty---of 12
trace_event_raw_event_wbc_class---of 10
trace_event_raw_event_writeback_bdi_register---of 8
trace_event_raw_event_writeback_class---of 8
trace_event_raw_event_writeback_dirty_inode_template---of 8
trace_event_raw_event_writeback_folio_template---of 13
trace_event_raw_event_writeback_inode_template---of 7
trace_event_raw_event_writeback_pages_written---of 7
trace_event_raw_event_writeback_queue_io---of 8
trace_event_raw_event_writeback_sb_inodes_requeue---of 13
trace_event_raw_event_writeback_single_inode_template---of 10
trace_event_raw_event_writeback_work_class---of 10
trace_event_raw_event_writeback_write_inode_template---of 10
trace_raw_output_balance_dirty_pages---of 3
trace_raw_output_bdi_dirty_ratelimit---of 3
trace_raw_output_flush_foreign---of 3
trace_raw_output_global_dirty_state---of 3
trace_raw_output_inode_foreign_history---of 3
trace_raw_output_inode_switch_wbs---of 3
trace_raw_output_track_foreign_dirty---of 3
trace_raw_output_wbc_class---of 3
trace_raw_output_writeback_bdi_register---of 3
trace_raw_output_writeback_class---of 3
trace_raw_output_writeback_dirty_inode_template---of 3
trace_raw_output_writeback_folio_template---of 3
trace_raw_output_writeback_inode_template---of 3
trace_raw_output_writeback_pages_written---of 3
trace_raw_output_writeback_queue_io---of 3
trace_raw_output_writeback_sb_inodes_requeue---of 3
trace_raw_output_writeback_single_inode_template---of 3
trace_raw_output_writeback_work_class---of 3
trace_raw_output_writeback_write_inode_template---of 3
trace_writeback_pages_written---of 15
try_to_writeback_inodes_sb---of 3
wakeup_dirtytime_writeback---of 20
wakeup_flusher_threads---of 16
wakeup_flusher_threads_bdi---of 11
wb_get15%of 14
wb_io_lists_depopulated---of 7
wb_put---of 4
wb_queue_work---of 24
wb_start_background_writeback---of 17
wb_wait_for_completion---of 5
wb_wakeup_delayed---of 3
wb_workfn---of 48
wb_writeback---of 62
wbc_account_cgroup_owner---of 15
wbc_attach_and_unlock_inode---of 14
wbc_detach_inode---of 28
write_inode_now---of 3
writeback_inodes_sb---of 1
writeback_inodes_sb_nr---of 1
writeback_inodes_wb---of 3
writeback_sb_inodes---of 52
writeback_single_inode---of 22
xas_next_marked---of 16
-----------
SUMMARY23%of 155

-----------
SUMMARY---of 0

__xfrm6_pref_hash37%of 11
__xfrm_decode_session23%of 27
__xfrm_dst_lookup---of 7
__xfrm_policy_bysel_ctx---of 29
__xfrm_policy_check9%of 163
__xfrm_policy_inexact_prune_bin---of 74
__xfrm_policy_link---of 9
__xfrm_policy_unlink---of 14
__xfrm_route_forward---of 26
__xfrm_sk_clone_policy---of 24
dst_discard---of 1
nf_nat_decode_session27%of 19
policy_hash_bysel---of 28
policy_hash_direct23%of 22
rcu_read_unlock---of 6
rhashtable_lookup29%of 32
rht_lock---of 9
rht_unlock---of 10
secpath_has_nontransport---of 7
seqcount_lockdep_reader_access58%of 7
skb_dst---of 5
skb_dst_force---of 15
xdst_queue_output---of 20
xfrm_alloc_dst---of 12
xfrm_audit_common_policyinfo---of 12
xfrm_audit_policy_add---of 5
xfrm_audit_policy_delete---of 5
xfrm_confirm_neigh---of 8
xfrm_default_advmss---of 6
xfrm_dev_policy_flush---of 28
xfrm_dst_check---of 61
xfrm_dst_ifdown---of 14
xfrm_expand_policies---of 13
xfrm_hash_rebuild---of 61
xfrm_hash_resize---of 50
xfrm_icmp_flow_decode---of 9
xfrm_if_register_cb---of 1
xfrm_if_unregister_cb---of 1
xfrm_init_path---of 26
xfrm_link_failure---of 1
xfrm_lookup---of 1
xfrm_lookup_route23%of 9
xfrm_lookup_with_ifid7%of 117
xfrm_migrate---of 137
xfrm_mtu---of 8
xfrm_negative_advice---of 3
xfrm_neigh_lookup---of 9
xfrm_net_exit---of 1
xfrm_net_init---of 15
xfrm_pol_bin_cmp34%of 6
xfrm_pol_bin_key100%of 1
xfrm_pol_bin_obj---of 1
xfrm_policy_addr_delta---of 15
xfrm_policy_alloc---of 6
xfrm_policy_byid---of 18
xfrm_policy_bysel_ctx---of 38
xfrm_policy_delete---of 6
xfrm_policy_destroy---of 10
xfrm_policy_destroy_rcu---of 1
xfrm_policy_find_inexact_candidates22%of 33
xfrm_policy_fini---of 20
xfrm_policy_flush---of 28
xfrm_policy_get_afinfo---of 14
xfrm_policy_hash_rebuild---of 1
xfrm_policy_inexact_alloc_bin---of 85
xfrm_policy_inexact_alloc_chain---of 34
xfrm_policy_inexact_gc_tree---of 9
xfrm_policy_inexact_insert---of 53
xfrm_policy_inexact_insert_node---of 37
xfrm_policy_inexact_list_reinsert---of 26
xfrm_policy_insert---of 26
xfrm_policy_insert_list---of 41
xfrm_policy_kill---of 12
xfrm_policy_lookup_bytype27%of 67
xfrm_policy_queue_process---of 69
xfrm_policy_register_afinfo---of 19
xfrm_policy_requeue---of 14
xfrm_policy_timer---of 35
xfrm_policy_unregister_afinfo---of 13
xfrm_policy_walk---of 23
xfrm_policy_walk_done---of 4
xfrm_policy_walk_init---of 1
xfrm_pols_put---of 7
xfrm_resolve_and_create_bundle---of 155
xfrm_secpath_reject---of 8
xfrm_selector_match13%of 62
xfrm_sk_policy_insert---of 22
xfrm_sk_policy_lookup---of 33
xfrm_spd_getinfo---of 1
-----------
SUMMARY17%of 576

-----------
SUMMARY---of 0

iptable_mangle_hook34%of 9
iptable_mangle_net_exit---of 1
iptable_mangle_net_pre_exit---of 1
iptable_mangle_table_init---of 3
-----------
SUMMARY34%of 9

-----------
SUMMARY---of 0

refcount_dec_and_lock34%of 6
refcount_dec_and_lock_irqsave---of 6
refcount_dec_and_mutex_lock---of 6
refcount_dec_if_one---of 1
refcount_dec_not_one29%of 7
refcount_warn_saturate---of 13
-----------
SUMMARY31%of 13

-----------
SUMMARY---of 0

__inet_accept---of 9
__inet_bind---of 35
__inet_listen_sk---of 25
__inet_stream_connect32%of 47
inet_accept---of 3
inet_bind---of 1
inet_bind_sk---of 6
inet_compat_ioctl---of 11
inet_create---of 52
inet_ctl_sock_create---of 3
inet_current_timestamp---of 1
inet_dgram_connect---of 13
inet_getname---of 11
inet_gro_complete---of 13
inet_gro_receive---of 31
inet_gso_segment28%of 43
inet_init_net---of 1
inet_ioctl---of 27
inet_listen---of 4
inet_recv_error---of 4
inet_recvmsg50%of 8
inet_register_protosw---of 10
inet_release---of 8
inet_send_prepare34%of 6
inet_sendmsg34%of 9
inet_shutdown42%of 12
inet_sk_rebuild_header---of 46
inet_sk_set_state---of 1
inet_sk_state_store100%of 1
inet_sock_destruct---of 24
inet_splice_eof---of 8
inet_stream_connect---of 1
inet_unregister_protosw---of 4
ipip_gro_complete---of 1
ipip_gro_receive---of 3
ipip_gso_segment---of 3
ipv4_mib_exit_net---of 1
ipv4_mib_init_net---of 13
rcu_read_unlock---of 6
sk_dst_reset---of 1
snmp_fold_field---of 5
sock_rps_record_flow10%of 22
trace_inet_sock_set_state27%of 15
udp_set_peek_off---of 1
-----------
SUMMARY30%of 163

NF_HOOK30%of 20
dst_input38%of 8
ip6_input---of 1
ip6_input_finish29%of 14
ip6_mc_input---of 49
ip6_protocol_deliver_rcu20%of 75
ip6_rcv_core21%of 72
ip6_rcv_finish34%of 9
ip6_rcv_finish_core25%of 20
ip6_sublist_rcv---of 63
ipv6_is_mld---of 13
ipv6_list_rcv---of 14
ipv6_rcv67%of 3
rcu_read_unlock34%of 6
skb_dst---of 5
skb_orphan50%of 4
xfrm6_policy_check20%of 30
-----------
SUMMARY25%of 261

-----------
SUMMARY---of 0

llist_add_batch50%of 4
llist_del_first---of 5
llist_del_first_this---of 5
llist_reverse_order50%of 4
-----------
SUMMARY50%of 8

hstcp_cong_avoid15%of 27
hstcp_init---of 1
hstcp_ssthresh---of 3
-----------
SUMMARY15%of 27

-----------
SUMMARY---of 0

sctp_clear_pd---of 8
sctp_make_reassembled_event---of 19
sctp_ulpq_abort_pd---of 8
sctp_ulpq_flush40%of 10
sctp_ulpq_free100%of 1
sctp_ulpq_init100%of 1
sctp_ulpq_order---of 10
sctp_ulpq_partial_delivery---of 21
sctp_ulpq_reasm_drain---of 8
sctp_ulpq_reasm_flushtsn---of 5
sctp_ulpq_renege---of 11
sctp_ulpq_renege_list---of 11
sctp_ulpq_retrieve_ordered---of 8
sctp_ulpq_retrieve_reassembled---of 19
sctp_ulpq_skip---of 14
sctp_ulpq_tail_data---of 29
sctp_ulpq_tail_event14%of 36
-----------
SUMMARY23%of 48

-----------
SUMMARY---of 0

__genl_sk_priv_get---of 3
ctrl_dumpfamily---of 8
ctrl_dumppolicy---of 23
ctrl_dumppolicy_done---of 1
ctrl_dumppolicy_put_op---of 20
ctrl_dumppolicy_start---of 30
ctrl_fill_info25%of 32
ctrl_getfamily32%of 22
genl_bind---of 12
genl_ctrl_event---of 41
genl_done---of 6
genl_dumpit---of 5
genl_family_rcv_msg_attrs_parse29%of 7
genl_get_cmd16%of 25
genl_lock---of 1
genl_notify---of 4
genl_op_iter_next36%of 25
genl_pernet_exit---of 1
genl_pernet_init---of 4
genl_rcv100%of 1
genl_rcv_msg26%of 31
genl_register_family---of 94
genl_release63%of 8
genl_sk_priv_get---of 12
genl_sk_privs_free---of 7
genl_start---of 13
genl_unbind---of 8
genl_unlock---of 1
genl_unregister_family---of 27
genlmsg_multicast_allns---of 13
genlmsg_put---of 6
nla_put_string100%of 1
-----------
SUMMARY30%of 152

-----------
SUMMARY---of 0

pn_deliver_sock_broadcast---of 21
pn_find_sock_by_res10%of 21
pn_find_sock_by_sa17%of 24
pn_res_seq_next---of 12
pn_res_seq_show---of 3
pn_res_seq_start---of 11
pn_res_seq_stop---of 1
pn_sock_bind_res---of 11
pn_sock_get_port22%of 14
pn_sock_hash34%of 12
pn_sock_seq_next---of 10
pn_sock_seq_show---of 3
pn_sock_seq_start---of 15
pn_sock_seq_stop---of 6
pn_sock_unbind_all_res---of 11
pn_sock_unbind_res---of 6
pn_sock_unhash---of 18
pn_socket_accept---of 5
pn_socket_bind31%of 13
pn_socket_connect---of 23
pn_socket_getname---of 3
pn_socket_ioctl17%of 12
pn_socket_listen38%of 8
pn_socket_poll37%of 11
pn_socket_release---of 3
pn_socket_sendmsg40%of 5
-----------
SUMMARY24%of 120

-----------
SUMMARY---of 0

__check_object_size31%of 43
check_stack_object75%of 4
-----------
SUMMARY35%of 47

__change_page_attr_set_clr---of 105
__cpa_flush_all---of 4
__cpa_flush_tlb---of 7
__cpa_process_fault---of 41
__set_memory_prot---of 1
__unmap_pmd_range---of 22
_set_memory_uc---of 1
_set_memory_wb---of 1
_set_memory_wc---of 3
_set_memory_wt---of 1
_set_pages_array---of 12
arch_invalidate_pmem---of 4
arch_report_meminfo---of 3
change_page_attr_set_clr---of 40
clear_mce_nospec---of 1
clflush_cache_range---of 4
cpu_cache_has_invalidate_memregion---of 4
cpu_cache_invalidate_memregion---of 4
kernel_page_present---of 1
lookup_address100%of 1
lookup_address_in_pgd---of 1
lookup_address_in_pgd_attr25%of 8
lookup_pmd_address---of 4
populate_pmd---of 22
populate_pte---of 8
set_direct_map_default_noflush---of 1
set_direct_map_invalid_noflush---of 1
set_mce_nospec---of 4
set_memory_4k---of 1
set_memory_decrypted---of 1
set_memory_encrypted---of 1
set_memory_global---of 1
set_memory_nonglobal---of 1
set_memory_np---of 1
set_memory_np_noalias---of 1
set_memory_nx---of 3
set_memory_p---of 1
set_memory_ro---of 1
set_memory_rox---of 1
set_memory_rw---of 1
set_memory_uc---of 4
set_memory_wb---of 3
set_memory_wc---of 5
set_memory_x---of 3
set_pages_array_uc---of 1
set_pages_array_wb---of 5
set_pages_array_wc---of 1
set_pages_ro---of 1
set_pages_rw---of 1
set_pages_uc---of 1
set_pages_wb---of 3
slow_virt_to_phys---of 5
static_protections---of 21
unmap_pmd_range---of 19
update_page_count---of 3
-----------
SUMMARY34%of 9

___perf_sw_event---of 34
__ia32_sys_perf_event_open---of 1
__perf_cgroup_move---of 3
__perf_event_account_interrupt---of 8
__perf_event_disable---of 24
__perf_event_enable---of 27
__perf_event_exit_context---of 6
__perf_event_header__init_id---of 14
__perf_event_output_stop---of 11
__perf_event_overflow---of 38
__perf_event_period---of 14
__perf_event_read---of 36
__perf_event_read_value---of 4
__perf_event_stop---of 5
__perf_event_task_sched_in---of 84
__perf_event_task_sched_out---of 139
__perf_install_in_context---of 20
__perf_pmu_install_event---of 6
__perf_pmu_output_stop---of 15
__perf_pmu_remove---of 36
__perf_read_group_add---of 27
__perf_remove_from_context---of 53
__perf_sw_event---of 8
__pmu_ctx_sched_out---of 24
__se_sys_perf_event_open---of 214
__update_context_time---of 6
__x64_sys_perf_event_open---of 1
_free_event---of 59
_perf_event_disable---of 3
_perf_event_enable---of 6
_perf_event_refresh---of 8
_perf_event_reset---of 1
account_event---of 38
add_event_to_ctx---of 40
alloc_perf_context---of 6
calc_timer_values---of 10
cpu_clock_event_add---of 5
cpu_clock_event_del---of 3
cpu_clock_event_init---of 7
cpu_clock_event_read---of 1
cpu_clock_event_start---of 4
cpu_clock_event_stop---of 3
cpu_clock_event_update---of 4
ctx_event_to_rotate---of 57
ctx_groups_sched_in---of 7
ctx_resched---of 41
ctx_sched_in---of 17
ctx_sched_out---of 24
event_function---of 18
event_function_call---of 14
event_sched_in---of 37
event_sched_out---of 38
exclusive_event_destroy---of 4
find_get_context---of 21
find_get_pmu_context---of 22
free_ctx---of 1
free_epc_rcu---of 1
free_event---of 3
free_event_rcu---of 3
get_uid---of 4
group_sched_out---of 15
inherit_event---of 14
inherit_task_group---of 37
ktime_get_boottime_ns---of 1
ktime_get_clocktai_ns---of 1
ktime_get_real_ns---of 1
list_del_event---of 21
nr_addr_filters_show---of 1
perf_addr_filters_splice---of 13
perf_adjust_freq_unthr_context---of 18
perf_adjust_freq_unthr_events---of 24
perf_adjust_period---of 23
perf_allow_kernel---of 5
perf_bp_event---of 11
perf_callchain---of 5
perf_cgroup_attach---of 7
perf_cgroup_css_alloc50%of 4
perf_cgroup_css_free---of 1
perf_cgroup_css_online16%of 13
perf_cgroup_from_task---of 11
perf_cgroup_set_timestamp---of 8
perf_cgroup_switch---of 31
perf_compat_ioctl---of 8
perf_copy_attr---of 34
perf_cpu_task_ctx---of 7
perf_cpu_time_max_percent_handler---of 5
perf_detach_cgroup---of 15
perf_duration_warn---of 3
perf_event__output_id_sample---of 14
perf_event_account_interrupt---of 1
perf_event_addr_filters_apply---of 28
perf_event_addr_filters_sync---of 4
perf_event_alloc---of 117
perf_event_attrs---of 1
perf_event_aux_event---of 5
perf_event_bpf_event38%of 8
perf_event_bpf_output---of 6
perf_event_cgroup_output---of 9
perf_event_comm---of 3
perf_event_comm_output---of 13
perf_event_create_kernel_counter---of 23
perf_event_ctx_lock_nested---of 24
perf_event_delayed_put---of 3
perf_event_disable---of 3
perf_event_disable_inatomic---of 1
perf_event_disable_local---of 20
perf_event_enable---of 6
perf_event_exec---of 56
perf_event_exit_cpu---of 5
perf_event_exit_event---of 8
perf_event_exit_task---of 33
perf_event_fork---of 5
perf_event_free_bpf_prog---of 4
perf_event_free_task---of 17
perf_event_get---of 4
perf_event_groups_insert---of 20
perf_event_header__init_id---of 3
perf_event_idx_default---of 1
perf_event_init_cpu---of 16
perf_event_init_task---of 20
perf_event_init_userpage---of 18
perf_event_itrace_started---of 1
perf_event_ksymbol---of 9
perf_event_ksymbol_output---of 9
perf_event_max_sample_rate_handler---of 6
perf_event_mmap4%of 62
perf_event_mmap_output---of 31
perf_event_modify_breakpoint---of 9
perf_event_mux_interval_ms_show---of 1
perf_event_mux_interval_ms_store---of 8
perf_event_namespaces13%of 16
perf_event_namespaces_output---of 9
perf_event_nop_int---of 1
perf_event_output---of 15
perf_event_output_backward---of 15
perf_event_output_forward---of 15
perf_event_overflow---of 1
perf_event_pause---of 7
perf_event_period---of 9
perf_event_read---of 31
perf_event_read_local---of 42
perf_event_read_value---of 1
perf_event_refresh---of 1
perf_event_release_kernel---of 29
perf_event_set_bpf_prog---of 22
perf_event_set_output---of 22
perf_event_set_state---of 10
perf_event_switch_output---of 11
perf_event_sysfs_show---of 3
perf_event_task_disable---of 13
perf_event_task_enable---of 19
perf_event_task_output---of 17
perf_event_task_tick---of 24
perf_event_text_poke---of 3
perf_event_text_poke_output---of 16
perf_event_update_sibling_time---of 16
perf_event_update_time---of 7
perf_event_update_userpage---of 24
perf_event_wakeup---of 22
perf_fasync---of 1
perf_get_event---of 3
perf_get_page_size---of 21
perf_group_detach---of 43
perf_install_in_context---of 32
perf_ioctl---of 148
perf_iterate_ctx---of 17
perf_iterate_sb---of 39
perf_lock_task_context---of 52
perf_log_lost_samples---of 5
perf_log_throttle---of 7
perf_mmap---of 56
perf_mmap_close---of 61
perf_mmap_fault---of 30
perf_mmap_open---of 5
perf_mux_hrtimer_handler---of 44
perf_mux_hrtimer_restart_ipi---of 3
perf_output_read---of 56
perf_output_sample---of 104
perf_pending_irq---of 18
perf_pending_task---of 16
perf_pmu_cancel_txn---of 6
perf_pmu_commit_txn---of 6
perf_pmu_disable---of 5
perf_pmu_enable---of 5
perf_pmu_migrate_context---of 17
perf_pmu_nop_int---of 1
perf_pmu_nop_txn---of 1
perf_pmu_nop_void---of 1
perf_pmu_register---of 30
perf_pmu_resched---of 5
perf_pmu_sched_task---of 25
perf_pmu_start_txn---of 6
perf_pmu_unregister---of 8
perf_poll---of 9
perf_prepare_header---of 3
perf_prepare_sample---of 83
perf_read---of 22
perf_reboot---of 5
perf_register_guest_info_callbacks---of 4
perf_release---of 1
perf_remove_from_context---of 7
perf_remove_from_owner---of 25
perf_report_aux_output_id---of 5
perf_sample_event_took---of 7
perf_sched_cb_dec---of 6
perf_sched_cb_inc---of 8
perf_sched_delayed---of 3
perf_swevent_add---of 16
perf_swevent_del---of 3
perf_swevent_event---of 20
perf_swevent_get_recursion_context---of 5
perf_swevent_hrtimer---of 13
perf_swevent_init---of 9
perf_swevent_put_recursion_context---of 3
perf_swevent_read---of 1
perf_swevent_set_period---of 5
perf_swevent_start---of 1
perf_swevent_stop---of 1
perf_tp_event---of 124
perf_tp_event_init---of 5
perf_trace_run_bpf_submit---of 7
perf_try_init_event---of 20
perf_unpin_context---of 1
perf_unregister_guest_info_callbacks---of 3
perf_uprobe_event_init---of 8
pmu_dev_alloc---of 7
pmu_dev_is_visible---of 4
pmu_dev_release---of 1
put_ctx---of 10
put_pmu_ctx---of 12
rb_free_rcu---of 1
ref_ctr_offset_show---of 1
remote_function---of 5
retprobe_show---of 1
ring_buffer_attach---of 22
ring_buffer_get---of 23
ring_buffer_put---of 5
sw_perf_event_destroy---of 9
swevent_hlist_get---of 17
swevent_hlist_put_cpu---of 9
task_clock_event_add---of 5
task_clock_event_del---of 3
task_clock_event_init---of 7
task_clock_event_read---of 1
task_clock_event_start---of 4
task_clock_event_stop---of 3
task_clock_event_update---of 4
tp_perf_event_destroy---of 1
type_show---of 1
unclone_ctx---of 6
update_cgrp_time_from_cpuctx---of 8
update_context_time---of 4
visit_groups_merge---of 169
-----------
SUMMARY11%of 103

__pm_relax50%of 4
__pm_stay_awake---of 3
device_set_wakeup_capable---of 7
device_set_wakeup_enable---of 5
device_wakeup_arm_wake_irqs---of 6
device_wakeup_attach_irq---of 4
device_wakeup_detach_irq---of 3
device_wakeup_disable---of 4
device_wakeup_disarm_wake_irqs---of 6
device_wakeup_enable---of 13
pm_get_wakeup_count---of 7
pm_print_active_wakeup_sources---of 16
pm_relax---of 5
pm_save_wakeup_count---of 3
pm_stay_awake---of 4
pm_system_cancel_wakeup---of 5
pm_system_irq_wakeup---of 9
pm_system_wakeup---of 1
pm_wakeup_clear---of 4
pm_wakeup_dev_event---of 3
pm_wakeup_irq---of 1
pm_wakeup_pending---of 6
pm_wakeup_timer_fn---of 5
pm_wakeup_ws_event---of 5
print_wakeup_source_stats---of 4
wakeup_source_add---of 4
wakeup_source_create---of 5
wakeup_source_deactivate---of 21
wakeup_source_destroy---of 6
wakeup_source_register---of 9
wakeup_source_remove---of 4
wakeup_source_report_event---of 24
wakeup_source_unregister---of 6
wakeup_sources_read_lock---of 1
wakeup_sources_read_unlock---of 3
wakeup_sources_stats_open---of 1
wakeup_sources_stats_seq_next---of 3
wakeup_sources_stats_seq_show---of 1
wakeup_sources_stats_seq_start---of 7
wakeup_sources_stats_seq_stop---of 3
wakeup_sources_walk_next---of 3
wakeup_sources_walk_start---of 1
-----------
SUMMARY50%of 4

-----------
SUMMARY---of 0

__nfqnl_enqueue_packet---of 143
instance_create---of 7
instance_destroy---of 3
instance_destroy_rcu---of 16
nf_bridge_adjust_segmented_data---of 7
nfnl_queue_net_exit---of 5
nfnl_queue_net_init---of 1
nfnl_queue_pernet25%of 16
nfqnl_enqueue_packet---of 25
nfqnl_nf_hook_drop---of 12
nfqnl_put_packet_info---of 3
nfqnl_put_sk_classid---of 8
nfqnl_put_sk_uidgid---of 9
nfqnl_rcv_dev_event---of 32
nfqnl_rcv_nl_event19%of 11
nfqnl_recv_config---of 44
nfqnl_recv_unsupp---of 1
nfqnl_recv_verdict---of 56
nfqnl_recv_verdict_batch---of 22
nfqnl_reinject---of 79
seq_next---of 7
seq_show---of 1
seq_start---of 29
seq_stop---of 1
-----------
SUMMARY23%of 27

-----------
SUMMARY---of 0

lockdep_nfnl_is_held---of 3
nfnetlink_bind---of 23
nfnetlink_broadcast---of 1
nfnetlink_has_listeners100%of 1
nfnetlink_net_exit_batch---of 4
nfnetlink_net_init---of 1
nfnetlink_rcv---of 106
nfnetlink_rcv_msg---of 87
nfnetlink_send---of 1
nfnetlink_set_err---of 1
nfnetlink_subsys_register---of 15
nfnetlink_subsys_unregister---of 7
nfnetlink_unbind---of 5
nfnetlink_unicast---of 1
nfnl_lock---of 3
nfnl_pernet25%of 16
nfnl_unlock---of 3
-----------
SUMMARY30%of 17

-----------
SUMMARY---of 0

__radix_tree_delete---of 10
__radix_tree_lookup34%of 18
__radix_tree_preload15%of 34
__radix_tree_replace54%of 13
delete_node17%of 31
idr_destroy---of 12
idr_get_free49%of 37
idr_preload23%of 9
node_tag_clear38%of 16
radix_tree_cpu_dead---of 6
radix_tree_delete---of 1
radix_tree_delete_item---of 22
radix_tree_extend14%of 22
radix_tree_gang_lookup---of 12
radix_tree_gang_lookup_tag---of 14
radix_tree_gang_lookup_tag_slot---of 11
radix_tree_insert---of 27
radix_tree_iter_delete---of 3
radix_tree_iter_replace100%of 1
radix_tree_iter_resume---of 1
radix_tree_iter_tag_clear100%of 1
radix_tree_lookup39%of 13
radix_tree_lookup_slot---of 13
radix_tree_maybe_preload---of 9
radix_tree_next_chunk36%of 59
radix_tree_node_alloc37%of 11
radix_tree_node_ctor---of 1
radix_tree_node_rcu_free---of 1
radix_tree_preload---of 3
radix_tree_replace_slot---of 7
radix_tree_tag_clear---of 13
radix_tree_tag_get38%of 16
radix_tree_tag_set---of 21
radix_tree_tagged---of 3
-----------
SUMMARY33%of 281

-----------
SUMMARY---of 0

tcp_rate_check_app_limited40%of 5
tcp_rate_gen70%of 13
tcp_rate_skb_delivered63%of 8
tcp_rate_skb_sent100%of 3
-----------
SUMMARY66%of 29

__create_xol_area---of 24
__update_ref_ctr---of 7
__uprobe_register---of 27
__uprobe_unregister---of 9
arch_uprobe_copy_ixol---of 3
arch_uprobe_ignore---of 1
dup_xol_work---of 6
find_uprobe---of 11
install_breakpoint---of 21
is_swbp_insn---of 1
is_trap_insn---of 1
put_page---of 14
put_uprobe---of 11
register_for_each_vma---of 53
set_orig_insn---of 1
set_swbp---of 1
update_ref_ctr---of 37
uprobe_apply---of 5
uprobe_clear_state---of 11
uprobe_copy_process---of 15
uprobe_deny_signal23%of 9
uprobe_dup_mmap---of 3
uprobe_end_dup_mmap---of 10
uprobe_free_utask---of 7
uprobe_get_swbp_addr---of 1
uprobe_get_trap_addr---of 4
uprobe_mmap---of 59
uprobe_munmap---of 15
uprobe_notify_resume---of 126
uprobe_post_sstep_notifier---of 4
uprobe_pre_sstep_notifier---of 6
uprobe_register---of 1
uprobe_register_refctr---of 1
uprobe_start_dup_mmap---of 10
uprobe_unregister---of 3
uprobe_write_opcode---of 160
xol_free_insn_slot---of 9
-----------
SUMMARY23%of 9

-----------
SUMMARY---of 0

__bpf_trace_dccp_probe---of 1
__probestub_dccp_probe---of 1
__traceiter_dccp_probe---of 4
dccp_close---of 57
dccp_destroy_sock---of 10
dccp_destruct_common---of 1
dccp_disconnect---of 32
dccp_done---of 13
dccp_finish_passive_close---of 15
dccp_getsockopt---of 165
dccp_init_sock---of 5
dccp_ioctl---of 6
dccp_msghdr_parse---of 13
dccp_packet_name---of 4
dccp_poll29%of 25
dccp_recvmsg---of 31
dccp_sendmsg---of 33
dccp_set_state---of 16
dccp_setsockopt---of 67
dccp_shutdown100%of 1
dccp_sk_destruct---of 1
inet_dccp_listen---of 10
perf_trace_dccp_probe---of 17
trace_event_raw_event_dccp_probe---of 16
trace_raw_output_dccp_probe---of 3
-----------
SUMMARY31%of 26

__bpf_trace_notifier_info---of 1
__probestub_notifier_register---of 1
__probestub_notifier_run---of 1
__probestub_notifier_unregister---of 1
__traceiter_notifier_register---of 4
__traceiter_notifier_run---of 4
__traceiter_notifier_unregister---of 4
atomic_notifier_call_chain---of 11
atomic_notifier_call_chain_is_empty---of 1
atomic_notifier_chain_register---of 1
atomic_notifier_chain_register_unique_prio---of 1
atomic_notifier_chain_unregister---of 1
blocking_notifier_call_chain67%of 3
blocking_notifier_call_chain_robust---of 4
blocking_notifier_chain_register---of 3
blocking_notifier_chain_register_unique_prio---of 3
blocking_notifier_chain_unregister---of 3
notifier_call_chain35%of 23
notifier_chain_register---of 21
notifier_chain_unregister---of 20
notify_die---of 6
perf_trace_notifier_info---of 8
raw_notifier_call_chain---of 1
raw_notifier_call_chain_robust---of 3
raw_notifier_chain_register---of 1
raw_notifier_chain_unregister---of 1
register_die_notifier---of 1
srcu_init_notifier_head---of 3
srcu_notifier_call_chain---of 3
srcu_notifier_chain_register---of 3
srcu_notifier_chain_unregister---of 3
trace_event_raw_event_notifier_info---of 7
trace_raw_output_notifier_info---of 3
unregister_die_notifier---of 1
-----------
SUMMARY39%of 26

crypto_get_default_null_skcipher50%of 4
crypto_put_default_null_skcipher---of 3
null_compress---of 3
null_crypt---of 1
null_digest---of 1
null_final---of 1
null_hash_setkey---of 1
null_init---of 1
null_setkey---of 1
null_skcipher_crypt---of 6
null_skcipher_setkey---of 1
null_update---of 1
-----------
SUMMARY50%of 4

-----------
SUMMARY---of 0

do_sctp_enqueue_event---of 1
do_ulpq_tail_event100%of 1
sctp_chunk_assign_mid---of 11
sctp_enqueue_event---of 23
sctp_generate_iftsn---of 24
sctp_handle_fwdtsn---of 4
sctp_handle_iftsn---of 16
sctp_intl_abort_pd---of 14
sctp_intl_reap_ordered---of 14
sctp_intl_retrieve_ordered---of 8
sctp_intl_start_pd---of 40
sctp_make_idatafrag_empty---of 3
sctp_renege_events---of 9
sctp_report_fwdtsn---of 1
sctp_report_iftsn---of 13
sctp_stream_interleave_init100%of 1
sctp_ulpevent_idata---of 155
sctp_validate_data---of 4
sctp_validate_fwdtsn---of 5
sctp_validate_idata---of 4
sctp_validate_iftsn---of 5
-----------
SUMMARY100%of 2

-----------
SUMMARY---of 0

mptcp_token_accept---of 12
mptcp_token_destroy---of 14
mptcp_token_destroy_request---of 9
mptcp_token_exists---of 20
mptcp_token_get_sock---of 33
mptcp_token_iter_next---of 37
mptcp_token_new_connect29%of 25
mptcp_token_new_request---of 19
-----------
SUMMARY29%of 25

__bpf_trace_filelock_lease---of 1
__bpf_trace_filelock_lock---of 1
__bpf_trace_generic_add_lease---of 1
__bpf_trace_leases_conflict---of 1
__bpf_trace_locks_get_lock_context---of 1
__break_lease---of 96
__ia32_sys_flock---of 1
__locks_delete_block---of 12
__locks_insert_block---of 12
__locks_wake_up_blocks---of 16
__probestub_break_lease_block---of 1
__probestub_break_lease_noblock---of 1
__probestub_break_lease_unblock---of 1
__probestub_fcntl_setlk---of 1
__probestub_flock_lock_inode---of 1
__probestub_generic_add_lease---of 1
__probestub_generic_delete_lease---of 1
__probestub_leases_conflict---of 1
__probestub_locks_get_lock_context---of 1
__probestub_locks_remove_posix---of 1
__probestub_posix_lock_inode---of 1
__probestub_time_out_leases---of 1
__se_sys_flock---of 18
__traceiter_break_lease_block---of 4
__traceiter_break_lease_noblock---of 4
__traceiter_break_lease_unblock---of 4
__traceiter_fcntl_setlk---of 4
__traceiter_flock_lock_inode---of 4
__traceiter_generic_add_lease---of 4
__traceiter_generic_delete_lease---of 4
__traceiter_leases_conflict---of 4
__traceiter_locks_get_lock_context---of 4
__traceiter_locks_remove_posix---of 4
__traceiter_posix_lock_inode---of 4
__traceiter_time_out_leases---of 4
__x64_sys_flock---of 1
do_lock_file_wait---of 14
fcntl_getlease---of 17
fcntl_getlk---of 25
fcntl_setlease---of 10
fcntl_setlk---of 45
files_lookup_fd_locked---of 6
flock_lock_inode---of 62
flock_locks_conflict---of 4
generic_setlease---of 77
kernel_setlease---of 5
lease_break_callback---of 1
lease_get_mtime---of 8
lease_modify---of 13
lease_register_notifier---of 1
lease_setup---of 3
lease_unregister_notifier---of 1
leases_conflict---of 22
lock_get_status---of 19
locks_alloc_lease---of 3
locks_alloc_lock---of 3
locks_check_ctx_lists---of 5
locks_copy_conflock---of 4
locks_copy_lock---of 9
locks_delete_block---of 1
locks_dump_ctx_list---of 4
locks_free_lease---of 1
locks_free_lock---of 1
locks_free_lock_context67%of 3
locks_get_lock_context---of 20
locks_init_lease---of 1
locks_init_lock---of 1
locks_insert_lock_ctx---of 10
locks_lock_inode_wait---of 18
locks_next---of 1
locks_owner_has_blockers---of 7
locks_release_private---of 12
locks_remove_file6%of 36
locks_remove_posix10%of 22
locks_show---of 16
locks_start---of 1
locks_stop---of 1
locks_translate_pid---of 13
locks_unlink_lock_ctx---of 14
percpu_down_read---of 10
percpu_up_read---of 10
perf_trace_filelock_lease---of 9
perf_trace_filelock_lock---of 9
perf_trace_generic_add_lease---of 8
perf_trace_leases_conflict---of 8
perf_trace_locks_get_lock_context---of 8
posix_lock_file---of 1
posix_lock_inode---of 137
posix_locks_conflict---of 6
posix_test_lock---of 25
show_fd_locks---of 20
time_out_leases---of 28
trace_event_raw_event_filelock_lease---of 8
trace_event_raw_event_filelock_lock---of 8
trace_event_raw_event_generic_add_lease---of 7
trace_event_raw_event_leases_conflict---of 7
trace_event_raw_event_locks_get_lock_context---of 7
trace_generic_delete_lease---of 15
trace_raw_output_filelock_lease---of 3
trace_raw_output_filelock_lock---of 3
trace_raw_output_generic_add_lease---of 3
trace_raw_output_leases_conflict---of 3
trace_raw_output_locks_get_lock_context---of 3
vfs_cancel_lock---of 5
vfs_inode_has_locks---of 4
vfs_lock_file---of 5
vfs_setlease---of 10
vfs_test_lock---of 5
-----------
SUMMARY10%of 61

-----------
SUMMARY---of 0

____sys_recvmsg60%of 10
____sys_sendmsg37%of 22
__copy_msghdr---of 13
__ia32_sys_accept---of 1
__ia32_sys_accept4---of 1
__ia32_sys_bind---of 1
__ia32_sys_connect---of 1
__ia32_sys_getpeername---of 1
__ia32_sys_getsockname---of 1
__ia32_sys_getsockopt---of 1
__ia32_sys_listen---of 1
__ia32_sys_recv---of 1
__ia32_sys_recvfrom---of 1
__ia32_sys_recvmmsg---of 6
__ia32_sys_recvmmsg_time32---of 6
__ia32_sys_recvmsg---of 1
__ia32_sys_send---of 1
__ia32_sys_sendmmsg---of 1
__ia32_sys_sendmsg---of 1
__ia32_sys_sendto---of 1
__ia32_sys_setsockopt---of 1
__ia32_sys_shutdown---of 1
__ia32_sys_socket---of 1
__ia32_sys_socketcall---of 1
__ia32_sys_socketpair---of 1
__se_sys_shutdown30%of 10
__se_sys_socketcall---of 39
__sock_create15%of 41
__sock_recv_cmsgs---of 15
__sock_recv_timestamp---of 58
__sock_recv_wifi_status---of 4
__sock_sendmsg63%of 8
__sock_tx_timestamp---of 1
__sys_accept438%of 8
__sys_bind28%of 11
__sys_connect34%of 9
__sys_connect_file---of 5
__sys_getpeername25%of 12
__sys_getsockname28%of 11
__sys_getsockopt30%of 10
__sys_listen30%of 10
__sys_recvfrom28%of 11
__sys_recvmmsg---of 11
__sys_recvmsg31%of 13
__sys_recvmsg_sock---of 1
__sys_sendmmsg23%of 22
__sys_sendmsg31%of 13
__sys_sendmsg_sock---of 1
__sys_sendto25%of 12
__sys_setsockopt30%of 10
__sys_shutdown---of 10
__sys_shutdown_sock---of 3
__sys_socket24%of 13
__sys_socket_file---of 6
__sys_socketpair10%of 20
__x64_sys_accept100%of 1
__x64_sys_accept4100%of 1
__x64_sys_bind100%of 1
__x64_sys_connect100%of 1
__x64_sys_getpeername100%of 1
__x64_sys_getsockname100%of 1
__x64_sys_getsockopt100%of 1
__x64_sys_listen100%of 1
__x64_sys_recv---of 1
__x64_sys_recvfrom100%of 1
__x64_sys_recvmmsg34%of 6
__x64_sys_recvmmsg_time32---of 6
__x64_sys_recvmsg100%of 1
__x64_sys_send---of 1
__x64_sys_sendmmsg100%of 1
__x64_sys_sendmsg100%of 1
__x64_sys_sendto100%of 1
__x64_sys_setsockopt100%of 1
__x64_sys_shutdown100%of 1
__x64_sys_socket100%of 1
__x64_sys_socketcall---of 1
__x64_sys_socketpair100%of 1
br_ioctl_call---of 5
brioctl_set---of 1
call_trace_sock_recv_length---of 15
call_trace_sock_send_length---of 15
compat_sock_ioctl---of 86
copy_msghdr_from_user65%of 14
do_accept17%of 12
do_recvmmsg32%of 35
do_sock_getsockopt20%of 21
do_sock_setsockopt36%of 14
get_user_ifreq---of 9
init_once---of 1
kernel_accept---of 9
kernel_bind---of 1
kernel_connect---of 1
kernel_getpeername---of 1
kernel_getsockname---of 1
kernel_listen---of 1
kernel_recvmsg---of 1
kernel_sendmsg---of 3
kernel_sendmsg_locked---of 3
kernel_sock_ip_overhead---of 29
kernel_sock_shutdown100%of 1
move_addr_to_kernel25%of 8
move_addr_to_user19%of 11
put_user_ifreq---of 4
rcu_read_unlock34%of 6
sock_alloc---of 3
sock_alloc_file37%of 11
sock_alloc_inode67%of 3
sock_close50%of 8
sock_create---of 1
sock_create_kern---of 1
sock_create_lite---of 11
sock_do_ioctl19%of 11
sock_fasync---of 4
sock_free_inode---of 1
sock_from_file---of 3
sock_ioctl6%of 39
sock_is_registered---of 4
sock_mmap---of 1
sock_poll30%of 10
sock_read_iter58%of 7
sock_recvmsg58%of 7
sock_recvmsg_nosec50%of 6
sock_register---of 14
sock_release---of 7
sock_sendmsg---of 3
sock_sendmsg_nosec---of 7
sock_show_fdinfo---of 3
sock_splice_eof---of 3
sock_splice_read67%of 3
sock_unregister---of 3
sock_wake_async---of 10
sock_write_iter58%of 7
socket_seq_show---of 1
sockfd_lookup---of 5
sockfs_dname100%of 1
sockfs_init_fs_context---of 3
sockfs_listxattr---of 7
sockfs_security_xattr_set---of 1
sockfs_setattr---of 5
sockfs_xattr_get---of 4
update_socket_protocol100%of 1
vlan_ioctl_set---of 1
-----------
SUMMARY32%of 535

-----------
SUMMARY---of 0

tomoyo_addprintf---of 1
tomoyo_check_profile---of 9
tomoyo_close_control---of 4
tomoyo_flush---of 19
tomoyo_init_policy_namespace---of 4
tomoyo_io_printf---of 5
tomoyo_open_control---of 24
tomoyo_parse_policy---of 10
tomoyo_poll_control---of 3
tomoyo_poll_query---of 5
tomoyo_print_name_union---of 9
tomoyo_print_number_union---of 3
tomoyo_print_number_union_nospace---of 7
tomoyo_profile100%of 1
tomoyo_read_control---of 20
tomoyo_read_domain---of 50
tomoyo_read_domain2---of 200
tomoyo_read_exception---of 125
tomoyo_read_manager---of 24
tomoyo_read_pid---of 22
tomoyo_read_profile---of 51
tomoyo_read_query---of 15
tomoyo_read_stat---of 19
tomoyo_read_version---of 3
tomoyo_same_manager---of 1
tomoyo_same_task_acl---of 1
tomoyo_set_group---of 9
tomoyo_set_slash---of 3
tomoyo_set_space---of 3
tomoyo_set_string---of 3
tomoyo_supervisor8%of 54
tomoyo_update_stat---of 5
tomoyo_write_answer---of 10
tomoyo_write_control---of 70
tomoyo_write_domain---of 28
tomoyo_write_domain2---of 7
tomoyo_write_exception---of 15
tomoyo_write_manager---of 8
tomoyo_write_pid---of 1
tomoyo_write_profile---of 46
tomoyo_write_stat---of 8
tomoyo_write_task---of 5
-----------
SUMMARY10%of 55

__ip6_local_out---of 32
dst_output---of 8
ip6_dst_hoplimit25%of 20
ip6_find_1stfragopt---of 11
ip6_local_out---of 3
ipv6_proxy_select_ident---of 8
ipv6_select_ident---of 3
skb_dst---of 5
-----------
SUMMARY25%of 20

-----------
SUMMARY---of 0

__xa_alloc---of 7
__xa_alloc_cyclic---of 9
__xa_clear_mark---of 3
__xa_cmpxchg---of 8
__xa_erase---of 3
__xa_insert---of 7
__xa_set_mark---of 3
__xa_store---of 7
__xas_next---of 28
__xas_nomem---of 10
__xas_prev---of 28
node_set_marks---of 16
xa_clear_mark---of 3
xa_delete_node---of 8
xa_destroy---of 12
xa_erase67%of 3
xa_extract---of 68
xa_find---of 17
xa_find_after---of 26
xa_get_mark---of 48
xa_get_order---of 13
xa_load24%of 13
xa_parent---of 7
xa_set_mark---of 3
xa_store---of 1
xa_store_range---of 24
xas_clear_mark19%of 22
xas_create20%of 99
xas_create_range---of 22
xas_destroy---of 4
xas_find17%of 31
xas_find_conflict---of 58
xas_find_marked24%of 64
xas_free_nodes---of 24
xas_get_mark---of 6
xas_get_order---of 13
xas_init_marks---of 4
xas_load31%of 26
xas_nomem34%of 6
xas_pause---of 16
xas_set_mark---of 19
xas_split---of 22
xas_split_alloc---of 14
xas_start17%of 37
xas_store17%of 86
-----------
SUMMARY21%of 387

-----------
SUMMARY---of 0

__kernfs_new_node32%of 22
__kernfs_remove---of 34
kernfs_activate75%of 4
kernfs_activate_one36%of 14
kernfs_add_one39%of 31
kernfs_break_active_protection50%of 6
kernfs_create_dir_ns50%of 4
kernfs_create_empty_dir---of 4
kernfs_create_root---of 21
kernfs_destroy_root---of 6
kernfs_dir_fop_release---of 1
kernfs_dir_pos---of 26
kernfs_dop_revalidate28%of 18
kernfs_drain---of 22
kernfs_find_and_get_node_by_id---of 22
kernfs_find_and_get_ns---of 4
kernfs_find_ns53%of 21
kernfs_fop_readdir---of 25
kernfs_free_rcu---of 3
kernfs_get50%of 4
kernfs_get_active---of 7
kernfs_get_parent---of 4
kernfs_iop_lookup46%of 11
kernfs_iop_mkdir31%of 13
kernfs_iop_rename---of 30
kernfs_iop_rmdir---of 15
kernfs_link_sibling67%of 12
kernfs_name---of 4
kernfs_new_node38%of 8
kernfs_next_descendant_post59%of 12
kernfs_node_from_dentry---of 4
kernfs_path_from_node---of 42
kernfs_put16%of 13
kernfs_put_active---of 6
kernfs_remove---of 3
kernfs_remove_by_name_ns40%of 5
kernfs_remove_self---of 13
kernfs_rename_ns---of 33
kernfs_root_to_node---of 1
kernfs_show---of 9
kernfs_unbreak_active_protection67%of 3
kernfs_walk_and_get_ns---of 10
pr_cont_kernfs_name---of 4
pr_cont_kernfs_path---of 4
-----------
SUMMARY42%of 201

-----------
SUMMARY---of 0

bsearch40%of 5
-----------
SUMMARY40%of 5

-----------
SUMMARY---of 0

llcp_raw_sock_bind---of 9
llcp_sock_accept---of 15
llcp_sock_bind---of 13
llcp_sock_connect---of 28
llcp_sock_create---of 7
llcp_sock_destruct---of 9
llcp_sock_getname---of 7
llcp_sock_listen---of 9
llcp_sock_poll43%of 19
llcp_sock_recvmsg11%of 28
llcp_sock_release---of 16
llcp_sock_sendmsg---of 13
nfc_llcp_accept_dequeue---of 16
nfc_llcp_accept_enqueue---of 6
nfc_llcp_accept_unlink---of 8
nfc_llcp_getsockopt---of 15
nfc_llcp_setsockopt---of 23
nfc_llcp_sock_alloc---of 4
nfc_llcp_sock_exit---of 1
nfc_llcp_sock_free---of 3
sock_wait_state---of 11
-----------
SUMMARY24%of 47

-----------
SUMMARY---of 0

___pmd_free_tlb---of 5
___pte_free_tlb---of 4
___pud_free_tlb---of 4
__native_set_fixmap---of 3
arch_check_zapped_pmd---of 7
arch_check_zapped_pte---of 7
lruvec_stat_sub_folio---of 15
native_set_fixmap---of 7
pgd_alloc---of 7
pgd_free---of 3
pgd_page_get_mm---of 1
pmd_clear_huge67%of 3
pmd_free_pte_page---of 1
pmd_mkwrite---of 3
pmd_set_huge---of 8
pmdp_clear_flush_young---of 6
pmdp_invalidate_ad---of 5
pmdp_set_access_flags---of 4
pmdp_test_and_clear_young---of 3
pte_alloc_one30%of 20
pte_mkwrite67%of 3
ptep_clear_flush_young---of 3
ptep_set_access_flags---of 3
ptep_test_and_clear_young---of 3
pud_clear_huge67%of 3
pud_free_pmd_page---of 13
pud_set_huge---of 7
pudp_set_access_flags---of 4
pudp_test_and_clear_young---of 3
-----------
SUMMARY42%of 29

__ieee80211_sta_recalc_aggregates---of 23
__sta_info_alloc---of 38
__sta_info_destroy---of 3
__sta_info_destroy_part1---of 91
__sta_info_destroy_part2---of 15
__sta_info_flush---of 30
__sta_info_recalc_tim---of 61
_sta_info_move_state---of 35
cleanup_single_sta---of 14
drv_allow_buffered_frames---of 17
drv_flush_sta---of 30
ieee80211_find_sta---of 4
ieee80211_find_sta_by_ifaddr---of 6
ieee80211_find_sta_by_link_addrs---of 17
ieee80211_purge_sta_txqs---of 35
ieee80211_recalc_p2p_go_ps_allowed---of 20
ieee80211_send_eosp_nullfunc---of 31
ieee80211_send_null_response---of 27
ieee80211_sta_activate_link---of 34
ieee80211_sta_allocate_link---of 19
ieee80211_sta_block_awake---of 21
ieee80211_sta_eosp---of 15
ieee80211_sta_expire---of 22
ieee80211_sta_free_link---of 6
ieee80211_sta_last_active---of 9
ieee80211_sta_ps_deliver_poll_response---of 1
ieee80211_sta_ps_deliver_response---of 88
ieee80211_sta_ps_deliver_uapsd---of 7
ieee80211_sta_ps_deliver_wakeup---of 62
ieee80211_sta_recalc_aggregates---of 1
ieee80211_sta_register_airtime---of 11
ieee80211_sta_remove_link---of 8
ieee80211_sta_set_buffered---of 18
ieee80211_sta_set_expected_throughput---of 4
ieee80211_sta_set_max_amsdu_subframes---of 6
ieee80211_sta_update_pending_airtime---of 21
link_sta_info_get_bss18%of 23
link_sta_info_hash_lookup---of 1
lockdep_sta_mutex_held---of 1
rcu_read_unlock---of 6
rhltable_insert---of 82
rhltable_lookup27%of 30
rhltable_remove---of 83
rht_lock---of 9
rht_unlock---of 10
sta_deliver_ps_frames---of 11
sta_get_expected_throughput---of 36
sta_info_alloc---of 1
sta_info_alloc_with_link---of 1
sta_info_cleanup---of 31
sta_info_destroy_addr---of 6
sta_info_destroy_addr_bss---of 6
sta_info_free---of 39
sta_info_get---of 19
sta_info_get_bss18%of 23
sta_info_get_by_addrs---of 5
sta_info_get_by_idx---of 7
sta_info_hash_lookup---of 1
sta_info_init---of 4
sta_info_insert---of 6
sta_info_insert_rcu---of 93
sta_info_move_state---of 1
sta_info_recalc_tim---of 1
sta_info_stop---of 1
sta_remove_link---of 19
sta_set_sinfo---of 175
trace_drv_return_void---of 15
-----------
SUMMARY22%of 76

__ieee80211_flush_queues---of 43
__ieee80211_stop_queue---of 26
__ieee80211_wake_queue---of 54
__iterate_interfaces---of 27
_ieee80211_wake_txqs---of 46
drv_reconfig_complete---of 20
drv_start_ap---of 25
drv_wake_tx_queue---of 21
ieee80211_add_aid_request_ie---of 1
ieee80211_add_pending_skb---of 3
ieee80211_add_pending_skbs---of 9
ieee80211_add_s1g_capab_ie---of 4
ieee80211_add_wmm_info_ie---of 1
ieee80211_assign_chanctx---of 11
ieee80211_ave_rssi---of 3
ieee80211_build_preq_ies---of 7
ieee80211_build_probe_req---of 7
ieee80211_calculate_rx_timestamp---of 30
ieee80211_chanctx_radar_detect---of 20
ieee80211_chandef_downgrade---of 22
ieee80211_chandef_eht_oper---of 7
ieee80211_chandef_he_6ghz_oper---of 26
ieee80211_chandef_ht_oper---of 6
ieee80211_chandef_s1g_oper---of 8
ieee80211_chandef_vht_oper---of 27
ieee80211_check_combinations---of 36
ieee80211_clear_tpe100%of 1
ieee80211_conn_mode_str---of 3
ieee80211_ctstoself_duration---of 10
ieee80211_dfs_cac_cancel---of 9
ieee80211_dfs_radar_detected_work---of 10
ieee80211_encode_usf---of 7
ieee80211_extend_absent_time---of 17
ieee80211_extend_noa_desc---of 13
ieee80211_flush_queues---of 1
ieee80211_frame_duration---of 3
ieee80211_generic_frame_duration---of 6
ieee80211_get_bssid15%of 14
ieee80211_get_sband---of 25
ieee80211_get_vif_queues---of 16
ieee80211_handle_reconfig_failure---of 12
ieee80211_handle_wake_tx_queue---of 7
ieee80211_hw_restart_disconnect---of 1
ieee80211_ie_build_eht_oper---of 11
ieee80211_ie_build_he_oper---of 10
ieee80211_ie_build_ht_cap---of 1
ieee80211_ie_build_ht_oper---of 14
ieee80211_ie_build_vht_cap---of 1
ieee80211_ie_build_vht_oper---of 9
ieee80211_ie_build_wide_bw_cs---of 8
ieee80211_ie_len_eht_cap---of 30
ieee80211_ie_len_he_cap---of 15
ieee80211_ie_split_vendor---of 5
ieee80211_iter_max_chans---of 1
ieee80211_iterate_active_interfaces_atomic---of 11
ieee80211_iterate_active_interfaces_mtx---of 4
ieee80211_iterate_interfaces---of 1
ieee80211_iterate_stations_atomic---of 16
ieee80211_max_num_channels---of 16
ieee80211_mcs_to_chains---of 5
ieee80211_min_bw_limit_from_chandef---of 9
ieee80211_parse_p2p_noa---of 9
ieee80211_put_eht_cap---of 45
ieee80211_put_he_6ghz_cap---of 20
ieee80211_put_he_cap---of 26
ieee80211_put_preq_ies---of 66
ieee80211_put_srates_elem---of 24
ieee80211_queue_delayed_work---of 5
ieee80211_queue_stopped---of 6
ieee80211_queue_work---of 5
ieee80211_radar_detected---of 15
ieee80211_recalc_dtim---of 6
ieee80211_recalc_min_chandef---of 34
ieee80211_recalc_smps---of 11
ieee80211_reconfig---of 371
ieee80211_reconfig_disconnect---of 15
ieee80211_reconfig_stations---of 14
ieee80211_regulatory_limit_wmm_params---of 35
ieee80211_resume_disconnect---of 1
ieee80211_rts_duration---of 8
ieee80211_send_action_csa---of 21
ieee80211_send_auth---of 9
ieee80211_send_deauth_disassoc---of 6
ieee80211_set_wmm_default---of 35
ieee80211_smps_is_restrictive---of 6
ieee80211_sta_get_rates---of 19
ieee80211_stop_device---of 1
ieee80211_stop_queue---of 1
ieee80211_stop_queue_by_reason---of 1
ieee80211_stop_queues---of 3
ieee80211_stop_queues_by_reason---of 3
ieee80211_stop_vif_queues---of 3
ieee80211_tx_set_protected---of 4
ieee80211_txq_get_depth---of 8
ieee80211_update_p2p_noa---of 16
ieee80211_vif_to_wdev---of 1
ieee80211_wake_queue---of 1
ieee80211_wake_queue_by_reason---of 1
ieee80211_wake_queues---of 3
ieee80211_wake_queues_by_reason---of 3
ieee80211_wake_txqs---of 1
ieee80211_wake_vif_queues---of 3
trace_drv_return_int---of 15
trace_drv_return_void---of 15
wdev_to_ieee80211_vif---of 3
wiphy_to_ieee80211_hw---of 3
-----------
SUMMARY20%of 15

-----------
SUMMARY---of 0

sha1_init100%of 1
sha1_transform100%of 6
-----------
SUMMARY100%of 7

-----------
SUMMARY---of 0

fixup_vdso_exception25%of 8
-----------
SUMMARY25%of 8

-----------
SUMMARY---of 0

__bpf_map_get---of 5
__bpf_map_inc_not_zero---of 6
__bpf_prog_put_noref---of 6
__bpf_prog_put_rcu---of 1
__ia32_sys_bpf---of 1
__sys_bpf8%of 50
__x64_sys_bpf100%of 1
attach_type_to_prog_type---of 49
bpf_audit_prog29%of 7
bpf_btf_get_fd_by_id---of 4
bpf_btf_load---of 9
bpf_check_uarg_tail_zero---of 6
bpf_copy_to_user---of 6
bpf_dummy_read---of 1
bpf_dummy_write---of 1
bpf_enable_stats---of 7
bpf_get_file_flag---of 1
bpf_get_unmapped_area---of 3
bpf_insn_prepare_dump---of 22
bpf_iter_create---of 11
bpf_kallsyms_lookup_name---of 5
bpf_link_by_id---of 8
bpf_link_cleanup---of 3
bpf_link_defer_dealloc_mult_rcu_gp---of 1
bpf_link_defer_dealloc_rcu_gp---of 1
bpf_link_free---of 10
bpf_link_get_curr_or_next---of 9
bpf_link_get_fd_by_id---of 13
bpf_link_get_from_fd---of 6
bpf_link_inc---of 1
bpf_link_inc_not_zero---of 5
bpf_link_init---of 4
bpf_link_new_fd---of 1
bpf_link_poll---of 1
bpf_link_prime---of 6
bpf_link_put---of 3
bpf_link_put_deferred---of 1
bpf_link_release---of 3
bpf_link_settle---of 1
bpf_link_show_fdinfo---of 7
bpf_map_alloc_id---of 5
bpf_map_alloc_pages---of 20
bpf_map_alloc_percpu---of 8
bpf_map_area_alloc---of 7
bpf_map_area_free---of 1
bpf_map_area_mmapable_alloc---of 6
bpf_map_copy_value---of 33
bpf_map_do_batch---of 25
bpf_map_free_deferred---of 3
bpf_map_free_id---of 3
bpf_map_free_mult_rcu_gp---of 1
bpf_map_free_rcu_gp---of 1
bpf_map_free_record---of 1
bpf_map_get---of 7
bpf_map_get_curr_or_next---of 9
bpf_map_get_fd_by_id---of 14
bpf_map_get_memcg---of 30
bpf_map_get_with_uref---of 7
bpf_map_inc---of 1
bpf_map_inc_not_zero---of 5
bpf_map_inc_with_uref---of 1
bpf_map_init_from_attr---of 3
bpf_map_kmalloc_node---of 8
bpf_map_kvcalloc---of 10
bpf_map_kzalloc---of 8
bpf_map_mmap---of 22
bpf_map_mmap_close---of 3
bpf_map_mmap_open---of 3
bpf_map_new_fd---of 3
bpf_map_poll---of 3
bpf_map_put---of 9
bpf_map_put_with_uref---of 4
bpf_map_release---of 6
bpf_map_save_memcg---of 15
bpf_map_show_fdinfo---of 7
bpf_map_update_value---of 25
bpf_map_write_active---of 1
bpf_obj_free_fields---of 23
bpf_obj_free_timer---of 4
bpf_obj_free_workqueue---of 4
bpf_obj_get---of 7
bpf_obj_get_info_by_fd---of 136
bpf_obj_get_next_id---of 6
bpf_obj_name_cpy28%of 11
bpf_obj_pin---of 6
bpf_perf_link_attach---of 8
bpf_perf_link_dealloc---of 1
bpf_perf_link_fill_common---of 7
bpf_perf_link_fill_link_info---of 8
bpf_perf_link_release---of 1
bpf_prog_add---of 1
bpf_prog_alloc_id60%of 5
bpf_prog_attach---of 37
bpf_prog_attach_check_attach_type---of 24
bpf_prog_bind_map---of 24
bpf_prog_by_id---of 8
bpf_prog_detach---of 35
bpf_prog_free_id---of 3
bpf_prog_get---of 7
bpf_prog_get_curr_or_next---of 9
bpf_prog_get_fd_by_id---of 13
bpf_prog_get_ok---of 4
bpf_prog_get_type---of 10
bpf_prog_get_type_dev---of 10
bpf_prog_inc---of 1
bpf_prog_inc_misses_counter---of 3
bpf_prog_inc_not_zero---of 6
bpf_prog_load10%of 94
bpf_prog_load_check_attach10%of 44
bpf_prog_new_fd67%of 3
bpf_prog_put---of 5
bpf_prog_put_deferred---of 9
bpf_prog_query---of 44
bpf_prog_release---of 1
bpf_prog_show_fdinfo---of 5
bpf_prog_sub---of 3
bpf_prog_test_run---of 19
bpf_raw_tp_link_attach---of 19
bpf_raw_tp_link_dealloc---of 1
bpf_raw_tp_link_fill_link_info---of 4
bpf_raw_tp_link_release---of 1
bpf_raw_tp_link_show_fdinfo---of 1
bpf_raw_tracepoint_open---of 11
bpf_stats_handler---of 8
bpf_stats_release---of 1
bpf_sys_bpf---of 11
bpf_sys_close---of 1
bpf_task_fd_query---of 24
bpf_task_fd_query_copy---of 18
bpf_tracing_link_dealloc---of 1
bpf_tracing_link_fill_link_info---of 5
bpf_tracing_link_release---of 5
bpf_tracing_link_show_fdinfo---of 1
bpf_tracing_prog_attach---of 48
bpf_unpriv_handler---of 10
btf_field_cmp---of 1
btf_record_dup---of 21
btf_record_equal---of 5
btf_record_find---of 5
btf_record_free---of 19
check_and_init_map_value---of 17
copy_from_bpfptr50%of 4
copy_map_value---of 5
find_prog_type40%of 5
generic_map_delete_batch---of 25
generic_map_lookup_batch---of 51
generic_map_update_batch---of 27
get_uid50%of 4
idr_preload_end40%of 10
kern_sys_bpf---of 23
kvmemdup_bpfptr_noprof---of 7
license_is_gpl_compatible29%of 7
link_create---of 44
link_detach---of 12
link_update---of 47
map_check_btf---of 83
map_check_no_btf---of 1
map_create---of 87
map_delete_elem---of 24
map_freeze---of 17
map_get_next_key---of 28
map_lookup_and_delete_elem---of 48
map_lookup_elem---of 35
map_update_elem---of 31
percpu_ref_put---of 14
rcu_read_unlock---of 6
strncpy_from_bpfptr67%of 3
syscall_prog_func_proto---of 6
syscall_prog_is_valid_access---of 3
token_create---of 4
-----------
SUMMARY17%of 248

__ieee80211_check_fast_rx_iface---of 11
__ieee80211_queue_skb_to_iface---of 5
__ieee80211_rx_h_amsdu---of 26
__skb_queue_purge---of 4
drv_event_callback---of 17
ieee80211_add_rx_radiotap_header---of 96
ieee80211_check_fast_rx---of 94
ieee80211_check_fast_rx_iface---of 14
ieee80211_clean_skb39%of 13
ieee80211_clear_fast_rx---of 3
ieee80211_deliver_skb---of 30
ieee80211_deliver_skb_to_local_stack---of 15
ieee80211_destroy_frag_cache---of 13
ieee80211_frame_allowed---of 7
ieee80211_get_mmie_keyidx8%of 25
ieee80211_init_frag_cache---of 1
ieee80211_is_our_addr---of 14
ieee80211_mark_rx_ba_filtered_frames---of 49
ieee80211_prepare_and_rx_handle6%of 237
ieee80211_process_rx_twt_action---of 12
ieee80211_queue_skb_to_iface---of 5
ieee80211_reassemble_add---of 8
ieee80211_reassemble_find---of 16
ieee80211_release_reorder_frame---of 22
ieee80211_release_reorder_frames---of 7
ieee80211_release_reorder_timeout---of 19
ieee80211_rx_8023---of 48
ieee80211_rx_check_bss_color_collision---of 13
ieee80211_rx_data_set_link---of 19
ieee80211_rx_for_interface45%of 9
ieee80211_rx_get_bigtk---of 32
ieee80211_rx_h_action_post_userspace---of 12
ieee80211_rx_h_action_return---of 11
ieee80211_rx_h_ext---of 7
ieee80211_rx_h_mgmt---of 28
ieee80211_rx_handlers6%of 581
ieee80211_rx_handlers_result8%of 28
ieee80211_rx_irqsafe---of 3
ieee80211_rx_list9%of 172
ieee80211_rx_mesh_data---of 56
ieee80211_rx_mesh_fast_forward---of 25
ieee80211_rx_napi---of 17
ieee80211_rx_radiotap_hdrlen---of 11
ieee80211_sta_ps_transition---of 7
ieee80211_sta_pspoll---of 4
ieee80211_sta_reorder_release---of 55
ieee80211_sta_uapsd_trigger---of 7
pskb_may_pull---of 6
pskb_pull---of 7
pskb_trim---of 4
skb_cow_head---of 5
skb_postpull_rcsum---of 5
sta_ps_start---of 39
sta_stats_encode_rate---of 7
trace_drv_return_void---of 15
-----------
SUMMARY7%of 1065

ida_alloc_range19%of 33
ida_destroy---of 22
ida_free---of 10
idr_alloc50%of 4
idr_alloc_cyclic50%of 6
idr_alloc_u3260%of 5
idr_find100%of 1
idr_for_each---of 9
idr_get_next39%of 13
idr_get_next_ul---of 11
idr_remove---of 1
idr_replace50%of 4
-----------
SUMMARY34%of 66

-----------
SUMMARY---of 0

net_generic---of 16
skb_dst---of 5
xfrm_policy_check---of 32
xfrmi4_err---of 24
xfrmi4_fini---of 1
xfrmi4_input---of 8
xfrmi4_rcv---of 8
xfrmi4_rcv_tunnel---of 1
xfrmi6_err---of 17
xfrmi6_fini---of 1
xfrmi6_input---of 8
xfrmi6_rcv---of 8
xfrmi6_rcv_tunnel---of 1
xfrmi_build_state---of 10
xfrmi_changelink---of 37
xfrmi_decode_session10%of 22
xfrmi_dellink---of 1
xfrmi_destroy_state---of 1
xfrmi_dev_free---of 1
xfrmi_dev_init---of 7
xfrmi_dev_setup---of 1
xfrmi_dev_uninit---of 10
xfrmi_encap_cmp---of 1
xfrmi_encap_nlsize---of 1
xfrmi_exit_batch_rtnl---of 21
xfrmi_fill_encap_info---of 5
xfrmi_fill_info---of 6
xfrmi_get_iflink---of 1
xfrmi_get_link_net---of 1
xfrmi_get_size---of 1
xfrmi_lookup---of 25
xfrmi_newlink---of 34
xfrmi_rcv_cb---of 26
xfrmi_scrub_packet---of 16
xfrmi_validate---of 1
xfrmi_xmit---of 84
-----------
SUMMARY10%of 22

-----------
SUMMARY---of 0

__dispose_buffer---of 5
__jbd2_journal_file_buffer---of 29
__jbd2_journal_refile_buffer---of 16
__jbd2_journal_temp_unlink_buffer---of 23
do_get_write_access---of 49
folio_size---of 10
jbd2__journal_restart---of 21
jbd2__journal_start28%of 11
jbd2_buffer_abort_trigger---of 4
jbd2_buffer_frozen_trigger---of 4
jbd2_journal_begin_ordered_truncate---of 5
jbd2_journal_destroy_transaction_cache---of 1
jbd2_journal_dirty_metadata8%of 41
jbd2_journal_extend---of 19
jbd2_journal_file_buffer---of 1
jbd2_journal_file_inode---of 15
jbd2_journal_forget---of 21
jbd2_journal_free_reserved---of 5
jbd2_journal_free_transaction---of 3
jbd2_journal_get_create_access---of 13
jbd2_journal_get_undo_access---of 13
jbd2_journal_get_write_access29%of 7
jbd2_journal_inode_ranged_wait---of 1
jbd2_journal_inode_ranged_write---of 1
jbd2_journal_invalidate_folio---of 39
jbd2_journal_lock_updates---of 6
jbd2_journal_refile_buffer---of 3
jbd2_journal_restart---of 1
jbd2_journal_set_triggers---of 3
jbd2_journal_start---of 1
jbd2_journal_start_reserved---of 5
jbd2_journal_stop26%of 35
jbd2_journal_try_to_free_buffers---of 17
jbd2_journal_unfile_buffer---of 4
jbd2_journal_unlock_updates---of 3
jbd2_journal_wait_updates---of 5
jbd2_write_access_granted25%of 20
start_this_handle12%of 61
stop_this_handle31%of 13
trace_jbd2_handle_start27%of 15
wait_transaction_locked---of 3
-----------
SUMMARY19%of 203

-----------
SUMMARY---of 0

task_work_add---of 13
task_work_cancel---of 8
task_work_cancel_match---of 8
task_work_run67%of 9
-----------
SUMMARY67%of 9

-----------
SUMMARY---of 0

kasprintf---of 1
kvasprintf---of 4
kvasprintf_const40%of 5
-----------
SUMMARY40%of 5

chroot_fs_refs---of 22
copy_fs_struct67%of 3
current_umask100%of 1
exit_fs---of 4
free_fs_struct---of 1
set_fs_pwd---of 6
set_fs_root---of 6
unshare_fs_struct---of 4
-----------
SUMMARY75%of 4

tcp_assign_congestion_control---of 36
tcp_ca_find---of 4
tcp_ca_find_autoload---of 19
tcp_ca_find_key---of 4
tcp_ca_get_key_by_name---of 13
tcp_ca_get_name_by_key---of 15
tcp_cleanup_congestion_control---of 5
tcp_cong_avoid_ai---of 9
tcp_get_allowed_congestion_control---of 17
tcp_get_available_congestion_control---of 14
tcp_get_default_congestion_control---of 16
tcp_init_congestion_control27%of 19
tcp_register_congestion_control---of 15
tcp_reno_cong_avoid---of 8
tcp_reno_ssthresh---of 1
tcp_reno_undo_cwnd---of 1
tcp_set_allowed_congestion_control---of 19
tcp_set_ca_state---of 17
tcp_set_congestion_control---of 48
tcp_set_default_congestion_control---of 22
tcp_slow_start67%of 3
tcp_unregister_congestion_control---of 3
tcp_update_congestion_control---of 12
tcp_validate_congestion_control---of 6
-----------
SUMMARY32%of 22

__copy_xstate_to_uabi_buf---of 38
__raw_xsave_addr---of 11
__xfd_enable_feature---of 42
arch_set_user_pkey_access---of 20
copy_sigframe_from_user_to_xstate---of 1
copy_uabi_from_kernel_to_xstate---of 1
copy_uabi_to_xstate---of 40
copy_xstate_to_uabi_buf---of 1
cpu_has_xfeatures---of 4
fpstate_clear_xstate_component---of 4
fpstate_free---of 3
fpu__init_cpu_xstate---of 20
fpu__resume_cpu---of 15
fpu_xstate_prctl---of 35
get_xsave_addr---of 8
proc_pid_arch_status---of 6
xfd_enable_feature---of 1
xfd_validate_state29%of 7
xfeature_get_offset---of 16
xfeature_size---of 4
xrstors---of 10
xsaves---of 10
xstate_calculate_size---of 8
xstate_get_guest_group_perm---of 1
-----------
SUMMARY29%of 7

-----------
SUMMARY---of 0

cfg80211_assign_cookie---of 3
cfg80211_pmsr_complete---of 27
cfg80211_pmsr_free_wk---of 1
cfg80211_pmsr_process_abort---of 46
cfg80211_pmsr_report---of 42
cfg80211_pmsr_wdev_down---of 6
cfg80211_release_pmsr34%of 6
nl80211_pmsr_send_ftm_res---of 56
nl80211_pmsr_start---of 115
rdev_start_pmsr---of 31
-----------
SUMMARY34%of 6

-----------
SUMMARY---of 0

__vlan_find_dev_deep_rcu---of 15
vlan_dev_real_dev---of 3
vlan_dev_vlan_id---of 1
vlan_dev_vlan_proto---of 1
vlan_do_receive7%of 44
vlan_filter_drop_vids---of 12
vlan_filter_push_vids---of 24
vlan_for_each---of 20
vlan_gro_complete---of 5
vlan_gro_receive---of 24
vlan_info_rcu_free---of 1
vlan_uses_dev---of 11
vlan_vid_add---of 34
vlan_vid_del---of 31
vlan_vids_add_by_dev---of 27
vlan_vids_del_by_dev---of 18
-----------
SUMMARY7%of 44

__ia32_compat_sys_lseek---of 9
__ia32_compat_sys_preadv---of 9
__ia32_compat_sys_preadv2---of 1
__ia32_compat_sys_preadv64---of 8
__ia32_compat_sys_preadv64v2---of 1
__ia32_compat_sys_pwritev---of 9
__ia32_compat_sys_pwritev2---of 1
__ia32_compat_sys_pwritev64---of 8
__ia32_compat_sys_pwritev64v2---of 1
__ia32_compat_sys_sendfile---of 1
__ia32_compat_sys_sendfile64---of 1
__ia32_sys_copy_file_range---of 1
__ia32_sys_llseek---of 1
__ia32_sys_lseek---of 9
__ia32_sys_pread64---of 6
__ia32_sys_preadv---of 8
__ia32_sys_preadv2---of 1
__ia32_sys_pwrite64---of 6
__ia32_sys_pwritev---of 8
__ia32_sys_pwritev2---of 1
__ia32_sys_read---of 1
__ia32_sys_readv---of 1
__ia32_sys_sendfile---of 1
__ia32_sys_sendfile64---of 1
__ia32_sys_write---of 1
__ia32_sys_writev---of 1
__kernel_read---of 25
__kernel_write---of 1
__kernel_write_iter---of 24
__se_compat_sys_preadv2---of 10
__se_compat_sys_preadv64v2---of 10
__se_compat_sys_pwritev2---of 10
__se_compat_sys_pwritev64v2---of 10
__se_compat_sys_sendfile---of 4
__se_compat_sys_sendfile64---of 4
__se_sys_copy_file_range---of 19
__se_sys_llseek---of 10
__se_sys_preadv2---of 10
__se_sys_pwritev2---of 10
__se_sys_sendfile---of 4
__se_sys_sendfile6450%of 4
__x64_compat_sys_lseek---of 9
__x64_compat_sys_preadv---of 9
__x64_compat_sys_preadv2---of 1
__x64_compat_sys_preadv64---of 9
__x64_compat_sys_preadv64v2---of 1
__x64_compat_sys_pwritev---of 9
__x64_compat_sys_pwritev2---of 1
__x64_compat_sys_pwritev64---of 9
__x64_compat_sys_pwritev64v2---of 1
__x64_compat_sys_sendfile---of 1
__x64_compat_sys_sendfile64---of 1
__x64_sys_copy_file_range---of 1
__x64_sys_llseek---of 1
__x64_sys_lseek---of 9
__x64_sys_pread6443%of 7
__x64_sys_preadv45%of 9
__x64_sys_preadv2---of 1
__x64_sys_pwrite6443%of 7
__x64_sys_pwritev45%of 9
__x64_sys_pwritev2---of 1
__x64_sys_read100%of 1
__x64_sys_readv100%of 1
__x64_sys_sendfile---of 1
__x64_sys_sendfile64100%of 1
__x64_sys_write100%of 1
__x64_sys_writev100%of 1
default_llseek---of 14
do_iter_readv_writev22%of 23
do_readv50%of 12
do_sendfile24%of 38
do_writev50%of 12
fixed_size_llseek---of 3
generic_file_llseek---of 1
generic_file_llseek_size---of 18
generic_file_rw_checks---of 7
generic_write_check_limits---of 6
generic_write_checks---of 4
generic_write_checks_count---of 13
kernel_read---of 3
kernel_write---of 6
ksys_pread64---of 7
ksys_pwrite64---of 7
ksys_read50%of 10
ksys_write70%of 10
no_seek_end_llseek---of 3
no_seek_end_llseek_size---of 3
noop_llseek---of 1
rw_verify_area29%of 25
sb_end_write---of 10
sb_start_write---of 10
vfs_copy_file_range---of 53
vfs_iocb_iter_read---of 16
vfs_iocb_iter_write---of 16
vfs_iter_read---of 16
vfs_iter_write---of 19
vfs_llseek---of 3
vfs_read22%of 28
vfs_readv36%of 28
vfs_setpos---of 7
vfs_write38%of 32
vfs_writev20%of 31
warn_unsupported---of 3
-----------
SUMMARY35%of 290

__pte_offset_map32%of 16
__pte_offset_map_lock40%of 5
pgd_clear_bad---of 1
pgtable_trans_huge_deposit---of 5
pgtable_trans_huge_withdraw---of 5
pmd_clear_bad---of 1
pmdp_collapse_flush---of 7
pmdp_huge_clear_flush---of 7
pmdp_invalidate---of 7
pte_free_defer---of 1
pte_free_now---of 18
pte_offset_map_nolock67%of 3
pte_unmap---of 6
ptep_clear_flush43%of 7
pud_clear_bad---of 1
pudp_huge_clear_flush---of 7
rcu_read_unlock---of 6
-----------
SUMMARY39%of 31

ext4_bg_has_super---of 16
ext4_bg_num_gdb---of 5
ext4_claim_free_clusters---of 3
ext4_count_free_clusters---of 9
ext4_free_clusters_after_init---of 27
ext4_get_group_desc28%of 22
ext4_get_group_info---of 19
ext4_get_group_no_and_offset---of 7
ext4_get_group_number---of 5
ext4_has_free_clusters---of 16
ext4_has_group_desc_csum---of 7
ext4_init_block_bitmap---of 35
ext4_inode_to_goal_block---of 9
ext4_lock_group---of 9
ext4_new_meta_blocks---of 8
ext4_num_base_meta_blocks---of 6
ext4_num_base_meta_clusters---of 8
ext4_read_block_bitmap---of 4
ext4_read_block_bitmap_nowait---of 31
ext4_should_retry_alloc---of 6
ext4_validate_block_bitmap---of 56
ext4_wait_block_bitmap---of 7
trace_ext4_read_block_bitmap_load---of 15
-----------
SUMMARY28%of 22

__ia32_compat_sys_ptrace---of 1
__ia32_sys_ptrace---of 1
__ptrace_detach---of 9
__ptrace_link---of 6
__ptrace_may_access5%of 40
__ptrace_unlink---of 14
__se_compat_sys_ptrace---of 16
__se_sys_ptrace---of 16
__x64_compat_sys_ptrace---of 1
__x64_sys_ptrace---of 1
compat_ptrace_request---of 34
exit_ptrace---of 9
generic_ptrace_peekdata---of 9
generic_ptrace_pokedata---of 7
ptrace_access_vm---of 7
ptrace_attach---of 35
ptrace_check_attach---of 14
ptrace_may_access100%of 1
ptrace_readdata---of 15
ptrace_regset---of 10
ptrace_request---of 95
ptrace_setsiginfo---of 4
ptrace_traceme---of 9
ptrace_writedata---of 14
-----------
SUMMARY8%of 41

__bpf_trace_module_free---of 1
__bpf_trace_module_load---of 1
__bpf_trace_module_refcnt---of 1
__bpf_trace_module_request---of 1
__ia32_sys_delete_module---of 1
__ia32_sys_finit_module---of 1
__ia32_sys_init_module---of 1
__is_module_percpu_address18%of 17
__layout_sections---of 13
__module_address13%of 16
__module_get67%of 3
__module_text_address---of 4
__probestub_module_free---of 1
__probestub_module_get---of 1
__probestub_module_load---of 1
__probestub_module_put---of 1
__probestub_module_request---of 1
__se_sys_delete_module---of 25
__se_sys_finit_module---of 26
__se_sys_init_module---of 12
__symbol_get---of 15
__symbol_put---of 4
__traceiter_module_free---of 4
__traceiter_module_get---of 4
__traceiter_module_load---of 4
__traceiter_module_put---of 4
__traceiter_module_request---of 4
__x64_sys_delete_module---of 1
__x64_sys_finit_module---of 1
__x64_sys_init_module---of 1
apply_relocations---of 12
arch_mod_section_prepend---of 1
cmp_name---of 1
complete_formation---of 13
do_free_init---of 4
do_init_module---of 14
find_module---of 1
find_module_all---of 14
find_module_sections---of 130
find_symbol---of 27
flush_module_init_free_work---of 1
free_mod_mem---of 13
free_modinfo---of 6
free_modinfo_srcversion---of 1
free_modinfo_version---of 1
free_module---of 17
is_module_address---of 3
is_module_percpu_address---of 1
is_module_text_address50%of 6
load_module---of 205
modinfo_srcversion_exists---of 1
modinfo_version_exists---of 1
module_arch_freeing_init---of 1
module_augment_kernel_taints---of 79
module_elf_check_arch---of 1
module_exit_section---of 1
module_flags---of 12
module_flags_taint---of 6
module_frob_arch_sections---of 1
module_get_offset_and_type---of 3
module_init_layout_section---of 1
module_init_section---of 1
module_next_tag_pair---of 6
module_patient_check_exists---of 17
module_put10%of 20
module_refcount---of 1
module_unload_free---of 10
module_unload_init---of 1
percpu_modalloc---of 6
perf_trace_module_free---of 8
perf_trace_module_load---of 8
perf_trace_module_refcnt---of 8
perf_trace_module_request---of 8
post_relocation---of 5
print_modules---of 10
register_module_notifier---of 1
resolve_symbol---of 56
search_module_extables---of 6
setup_modinfo---of 15
setup_modinfo_srcversion---of 1
setup_modinfo_version---of 1
show_coresize---of 1
show_initsize---of 1
show_initstate---of 5
show_modinfo_srcversion---of 1
show_modinfo_version---of 1
show_refcnt---of 1
show_taint---of 6
simplify_symbols---of 29
store_uevent---of 1
symbol_put_addr---of 8
trace_event_raw_event_module_free---of 7
trace_event_raw_event_module_load---of 7
trace_event_raw_event_module_refcnt---of 7
trace_event_raw_event_module_request---of 7
trace_module_get---of 15
trace_module_load---of 15
trace_raw_output_module_free---of 3
trace_raw_output_module_load---of 3
trace_raw_output_module_refcnt---of 3
trace_raw_output_module_request---of 3
try_module_get29%of 7
try_to_force_load---of 1
unknown_module_param_cb---of 5
unregister_module_notifier---of 1
-----------
SUMMARY21%of 69

__nf_ct_ext_find45%of 9
nf_ct_ext_add60%of 15
nf_ct_ext_bump_genid---of 3
-----------
SUMMARY55%of 24

__bpf_trace_mm_lru_activate---of 1
__bpf_trace_mm_lru_insertion---of 1
__folio_batch_release---of 3
__folio_put30%of 24
__lru_add_drain_all---of 20
__page_cache_release19%of 144
__probestub_mm_lru_activate---of 1
__probestub_mm_lru_insertion---of 1
__traceiter_mm_lru_activate---of 4
__traceiter_mm_lru_insertion---of 4
const_folio_flags30%of 10
deactivate_file_folio---of 27
folio_activate---of 41
folio_activate_fn---of 174
folio_add_lru21%of 58
folio_add_lru_vma30%of 10
folio_batch_add_and_move31%of 13
folio_batch_move_lru35%of 29
folio_batch_remove_exceptionals---of 9
folio_deactivate---of 43
folio_mapped---of 16
folio_mark_accessed5%of 116
folio_mark_lazyfree---of 57
folio_memcg25%of 12
folio_nr_pages---of 9
folio_rotate_reclaimable---of 54
folios_put_refs34%of 36
lru_add_drain36%of 17
lru_add_drain_all---of 1
lru_add_drain_cpu25%of 40
lru_add_drain_cpu_zone---of 17
lru_add_drain_per_cpu---of 17
lru_add_fn17%of 124
lru_cache_disable---of 1
lru_deactivate_file_fn---of 239
lru_deactivate_fn---of 168
lru_gen_add_folio15%of 75
lru_gen_update_size27%of 68
lru_lazyfree_fn---of 188
lru_move_tail_fn---of 150
lru_note_cost---of 12
lru_note_cost_refault---of 25
perf_trace_mm_lru_activate---of 8
perf_trace_mm_lru_insertion---of 72
put_pages_list---of 13
release_pages---of 21
trace_event_raw_event_mm_lru_activate---of 7
trace_event_raw_event_mm_lru_insertion---of 71
trace_raw_output_mm_lru_activate---of 3
trace_raw_output_mm_lru_insertion---of 3
zone_stat_mod_folio---of 4
-----------
SUMMARY20%of 776

__ia32_sys_capget---of 1
__ia32_sys_capset---of 1
__se_sys_capget---of 21
__se_sys_capset---of 13
__x64_sys_capget---of 1
__x64_sys_capset---of 1
cap_validate_magic---of 8
capable---of 4
capable_wrt_inode_uidgid---of 5
file_ns_capable---of 3
has_capability---of 1
has_capability_noaudit---of 1
has_ns_capability---of 16
has_ns_capability_noaudit---of 16
ns_capable50%of 4
ns_capable_noaudit---of 4
ns_capable_setid---of 4
privileged_wrt_inode_uidgid---of 3
ptracer_capable---of 18
-----------
SUMMARY50%of 4

probe_sched_switch67%of 3
probe_sched_wakeup67%of 3
queued_spin_lock_slowpath---of 3
saved_cmdlines_next---of 8
saved_cmdlines_show---of 6
saved_cmdlines_start---of 12
saved_cmdlines_stop---of 4
saved_tgids_next---of 3
saved_tgids_show---of 3
saved_tgids_start---of 3
saved_tgids_stop---of 1
trace_alloc_tgid_map---of 5
trace_create_savedcmd---of 3
trace_find_cmdline---of 11
trace_find_tgid---of 4
trace_free_saved_cmdlines_buffer---of 1
trace_save_cmdline---of 13
tracing_record_cmdline---of 4
tracing_record_taskinfo---of 12
tracing_record_taskinfo_sched_switch13%of 16
tracing_record_tgid---of 6
tracing_saved_cmdlines_open---of 3
tracing_saved_cmdlines_size_read---of 6
tracing_saved_cmdlines_size_write---of 12
tracing_saved_tgids_open---of 3
tracing_start_cmdline_record---of 1
tracing_start_sched_switch---of 14
tracing_start_tgid_record---of 1
tracing_stop_cmdline_record---of 4
tracing_stop_tgid_record---of 3
-----------
SUMMARY28%of 22

-----------
SUMMARY---of 0

phonet_get_local_port_range50%of 16
phonet_sysctl_exit---of 1
proc_local_port_range---of 4
-----------
SUMMARY50%of 16

copy_ipcs10%of 22
free_ipc---of 10
free_ipcs---of 10
ipcns_get---of 6
ipcns_install---of 10
ipcns_owner---of 1
ipcns_put---of 4
put_ipc_ns50%of 4
-----------
SUMMARY16%of 26

____fput---of 1
__fput38%of 29
__fput_sync67%of 3
alloc_empty_backing_file---of 4
alloc_empty_file34%of 9
alloc_empty_file_noaccount---of 4
alloc_file_clone67%of 3
alloc_file_pseudo50%of 4
alloc_file_pseudo_noaccount---of 7
backing_file_user_path---of 1
delayed_fput---of 4
file_free50%of 8
file_init_path100%of 13
flush_delayed_fput---of 4
fput23%of 9
get_max_files---of 1
init_file60%of 5
proc_nr_files---of 1
put_cred---of 4
-----------
SUMMARY51%of 83

-----------
SUMMARY---of 0

__bpf_trace_exit_mmap---of 1
__bpf_trace_vm_unmapped_area---of 1
__bpf_trace_vma_mas_szero---of 1
__bpf_trace_vma_store---of 1
__get_unmapped_area36%of 14
__ia32_sys_brk---of 1
__ia32_sys_mmap_pgoff---of 1
__ia32_sys_munmap---of 1
__ia32_sys_remap_file_pages---of 1
__install_special_mapping---of 7
__mas_set_range38%of 8
__probestub_exit_mmap---of 1
__probestub_vm_unmapped_area---of 1
__probestub_vma_mas_szero---of 1
__probestub_vma_store---of 1
__se_sys_brk---of 43
__se_sys_remap_file_pages---of 30
__split_vma38%of 35
__traceiter_exit_mmap---of 4
__traceiter_vm_unmapped_area---of 4
__traceiter_vma_mas_szero---of 4
__traceiter_vma_store---of 4
__vm_munmap---of 12
__x64_sys_brk---of 1
__x64_sys_mmap_pgoff---of 1
__x64_sys_munmap---of 1
__x64_sys_remap_file_pages---of 1
_install_special_mapping---of 1
can_vma_merge_after30%of 17
can_vma_merge_before10%of 20
copy_vma---of 29
do_brk_flags---of 39
do_mmap17%of 62
do_munmap---of 1
do_vma_munmap---of 3
do_vmi_align_munmap28%of 79
do_vmi_munmap31%of 13
dup_anon_vma---of 9
exit_mmap---of 59
expand_downwards---of 42
expand_stack---of 24
expand_stack_locked---of 1
file_mmap_ok---of 7
find_extend_vma_locked---of 9
find_mergeable_anon_vma20%of 26
find_vma50%of 4
find_vma_intersection---of 4
find_vma_prev---of 3
generic_get_unmapped_area---of 18
generic_get_unmapped_area_topdown---of 22
get_file---of 4
init_admin_reserve---of 1
init_reserve_notifier---of 3
init_user_reserve---of 1
insert_vm_struct---of 14
install_special_mapping---of 1
ksys_mmap_pgoff10%of 22
mapping_map_writable---of 5
may_expand_vm---of 11
mlock_future_ok---of 4
mm_drop_all_locks---of 20
mm_get_unmapped_area---of 3
mm_get_unmapped_area_vmflags67%of 3
mm_take_all_locks---of 48
mmap_region11%of 104
mmap_write_unlock---of 6
perf_trace_exit_mmap---of 8
perf_trace_vm_unmapped_area---of 8
perf_trace_vma_mas_szero---of 8
perf_trace_vma_store---of 8
reserve_mem_notifier---of 10
special_mapping_close---of 1
special_mapping_fault---of 17
special_mapping_mremap---of 4
special_mapping_name---of 1
special_mapping_split---of 1
trace_event_raw_event_exit_mmap---of 7
trace_event_raw_event_vm_unmapped_area---of 7
trace_event_raw_event_vma_mas_szero---of 7
trace_event_raw_event_vma_store---of 7
trace_raw_output_exit_mmap---of 3
trace_raw_output_vm_unmapped_area---of 4
trace_raw_output_vma_mas_szero---of 3
trace_raw_output_vma_store---of 3
unlink_file_vma50%of 4
unmap_region58%of 7
validate_mm43%of 19
vm_brk_flags---of 22
vm_flags_clear---of 6
vm_flags_set---of 6
vm_munmap---of 1
vm_stat_account---of 5
vm_unmapped_area---of 41
vma_complete23%of 44
vma_expand25%of 32
vma_is_special_mapping---of 3
vma_iter_store31%of 13
vma_link---of 10
vma_link_file---of 4
vma_merge---of 118
vma_merge_extend---of 1
vma_merge_new_vma---of 1
vma_modify---of 10
vma_needs_dirty_tracking---of 10
vma_prepare22%of 19
vma_set_page_prot16%of 13
vma_shrink---of 14
vma_start_write---of 6
vma_wants_writenotify---of 12
-----------
SUMMARY23%of 558

-----------
SUMMARY---of 0

cubictcp_acked23%of 27
cubictcp_cong_avoid7%of 31
cubictcp_cwnd_event---of 4
cubictcp_init50%of 4
cubictcp_recalc_ssthresh---of 4
cubictcp_state---of 3
-----------
SUMMARY17%of 62

__blkcg_rstat_flush---of 24
__blkg_prfill_u64---of 4
__blkg_release---of 11
bio_associate_blkg---of 15
bio_associate_blkg_from_css---of 74
bio_blkcg_css---of 4
bio_clone_blkg_association---of 4
blk_cgroup_bio_start---of 12
blk_cgroup_congested34%of 15
blkcg_activate_policy---of 42
blkcg_add_delay---of 3
blkcg_css19%of 11
blkcg_css_alloc---of 41
blkcg_css_free---of 15
blkcg_css_offline---of 1
blkcg_css_online---of 5
blkcg_deactivate_policy---of 23
blkcg_exit---of 3
blkcg_exit_disk---of 1
blkcg_get_cgwb_list---of 1
blkcg_init_disk---of 8
blkcg_maybe_throttle_current4%of 51
blkcg_pin_online---of 4
blkcg_policy_register---of 35
blkcg_policy_unregister---of 21
blkcg_print_blkgs---of 20
blkcg_print_stat---of 48
blkcg_punt_bio_submit---of 3
blkcg_reset_stats---of 26
blkcg_rstat_flush---of 3
blkcg_scale_delay---of 8
blkcg_schedule_throttle---of 10
blkcg_unpin_online---of 11
blkg_alloc---of 22
blkg_async_bio_workfn---of 7
blkg_conf_exit---of 5
blkg_conf_init---of 1
blkg_conf_open_bdev---of 8
blkg_conf_prep---of 59
blkg_create---of 89
blkg_destroy---of 37
blkg_destroy_all---of 20
blkg_dev_name---of 3
blkg_free_workfn---of 17
blkg_get---of 13
blkg_init_queue---of 1
blkg_release---of 1
blkg_tryget---of 16
percpu_ref_put---of 14
radix_tree_preload_end---of 10
-----------
SUMMARY12%of 77

__page_table_check_pmd_clear---of 3
__page_table_check_pmd_set---of 20
__page_table_check_pte_clear100%of 3
__page_table_check_pte_clear_range---of 12
__page_table_check_ptes_set35%of 23
__page_table_check_pud_clear---of 3
__page_table_check_pud_set---of 6
__page_table_check_zero32%of 16
page_table_check_clear20%of 26
page_table_check_set24%of 26
pfn_valid21%of 29
-----------
SUMMARY27%of 123

__nla_parse100%of 1
__nla_put---of 1
__nla_put_64bit---of 1
__nla_put_nohdr---of 1
__nla_reserve---of 1
__nla_reserve_64bit---of 1
__nla_reserve_nohdr---of 1
__nla_validate---of 1
__nla_validate_parse6%of 169
nla_append---of 5
nla_find---of 6
nla_get_range_signed---of 12
nla_get_range_unsigned---of 21
nla_memcmp---of 3
nla_memcpy---of 3
nla_policy_len---of 14
nla_put60%of 5
nla_put_64bit---of 5
nla_put_nohdr---of 5
nla_reserve---of 5
nla_reserve_64bit---of 5
nla_reserve_nohdr---of 5
nla_strcmp---of 6
nla_strdup---of 5
nla_strscpy---of 5
nla_validate_array---of 10
-----------
SUMMARY8%of 175

always_on---of 1
blackhole_netdev_setup---of 1
blackhole_netdev_xmit---of 3
dev_lstats_read---of 5
loopback_dev_free---of 1
loopback_dev_init---of 4
loopback_get_stats64---of 5
loopback_net_init---of 7
loopback_setup---of 1
loopback_xmit29%of 25
-----------
SUMMARY29%of 25

-----------
SUMMARY---of 0

nf_conntrack_event_cache---of 8
nf_conntrack_tcp_established---of 3
nf_conntrack_tcp_init_net---of 1
nf_conntrack_tcp_packet28%of 222
nf_conntrack_tcp_set_closing---of 23
nf_tcp_handle_invalid9%of 23
nf_tcp_log_invalid50%of 4
nlattr_to_tcp---of 17
tcp_can_early_drop---of 1
tcp_new34%of 9
tcp_nlattr_tuple_size---of 3
tcp_options48%of 17
tcp_timeout_nlattr_to_obj---of 23
tcp_timeout_obj_to_nlattr---of 13
tcp_to_nlattr---of 9
-----------
SUMMARY28%of 275

tnum_add---of 1
tnum_and---of 1
tnum_arshift---of 5
tnum_cast---of 3
tnum_clear_subreg---of 1
tnum_const100%of 1
tnum_const_subreg---of 1
tnum_in---of 1
tnum_intersect---of 1
tnum_is_aligned---of 1
tnum_lshift---of 3
tnum_mul---of 7
tnum_or---of 1
tnum_range---of 1
tnum_rshift---of 3
tnum_sbin---of 7
tnum_sub---of 1
tnum_subreg---of 1
tnum_with_subreg---of 1
tnum_xor---of 1
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

page_counter_cancel50%of 4
page_counter_charge84%of 6
page_counter_memparse---of 4
page_counter_set_low---of 4
page_counter_set_max---of 5
page_counter_set_min---of 4
page_counter_try_charge56%of 9
page_counter_uncharge75%of 4
propagate_protected_usage50%of 8
-----------
SUMMARY62%of 31

-----------
SUMMARY---of 0

__tlb_remove_folio_pages---of 1
__tlb_remove_folio_pages_size---of 14
__tlb_remove_page_size---of 1
tlb_finish_mmu50%of 6
tlb_flush_mmu15%of 20
tlb_flush_rmap_batch---of 13
tlb_flush_rmaps---of 4
tlb_gather_mmu67%of 3
tlb_gather_mmu_fullmm---of 1
tlb_remove_table---of 11
tlb_remove_table_rcu---of 4
tlb_remove_table_smp_sync---of 1
tlb_remove_table_sync_one---of 1
tlb_table_flush25%of 8
-----------
SUMMARY28%of 37

__generic_file_fsync---of 5
alloc_anon_inode---of 3
always_delete_dentry---of 1
dcache_dir_close---of 1
dcache_dir_lseek---of 14
dcache_dir_open---of 1
dcache_readdir---of 16
direct_write_fallback---of 4
empty_dir_getattr---of 1
empty_dir_listxattr---of 1
empty_dir_llseek---of 1
empty_dir_lookup---of 1
empty_dir_readdir---of 7
empty_dir_setattr---of 1
folio_size---of 10
generic_check_addressable---of 6
generic_ci_d_compare---of 8
generic_ci_d_hash---of 6
generic_encode_ino32_fh---of 5
generic_fh_to_dentry---of 4
generic_fh_to_parent---of 5
generic_file_fsync---of 3
generic_read_dir---of 1
generic_set_sb_d_ops---of 4
init_pseudo---of 3
inode_maybe_inc_iversion---of 6
inode_query_iversion---of 5
is_empty_dir_inode---of 3
kfree_link100%of 1
make_empty_dir_inode---of 1
memory_read_from_buffer---of 4
noop_direct_IO---of 1
noop_fsync---of 1
offset_dir_llseek---of 6
offset_readdir---of 29
path_from_stashed30%of 44
pseudo_fs_fill_super---of 4
pseudo_fs_free---of 1
pseudo_fs_get_tree---of 1
scan_positives---of 18
simple_attr_open---of 3
simple_attr_read---of 8
simple_attr_release---of 1
simple_attr_write---of 1
simple_attr_write_signed---of 1
simple_attr_write_xsigned---of 8
simple_empty---of 7
simple_fill_super---of 10
simple_get_link---of 1
simple_getattr---of 1
simple_inode_init_ts100%of 1
simple_link---of 3
simple_lookup---of 4
simple_nosetlease---of 1
simple_offset_add---of 4
simple_offset_destroy---of 1
simple_offset_empty---of 9
simple_offset_init---of 1
simple_offset_remove---of 3
simple_offset_rename---of 8
simple_offset_rename_exchange---of 13
simple_open---of 3
simple_pin_fs---of 5
simple_read_folio---of 7
simple_read_from_buffer---of 6
simple_recursive_removal---of 33
simple_release_fs---of 3
simple_rename---of 12
simple_rename_exchange---of 7
simple_rename_timestamp---of 5
simple_rmdir---of 3
simple_setattr---of 4
simple_statfs---of 1
simple_transaction_get---of 5
simple_transaction_read---of 3
simple_transaction_release---of 1
simple_transaction_set---of 3
simple_unlink---of 1
simple_write_begin---of 11
simple_write_end---of 27
simple_write_to_buffer---of 6
stashed_dentry_prune50%of 4
zero_user_segments---of 17
-----------
SUMMARY34%of 50

-----------
SUMMARY---of 0

__bpf_trace_fib6_table_lookup---of 1
__find_rr_leaf35%of 20
__ip6_del_rt---of 8
__ip6_del_rt_siblings---of 32
__ip6_route_redirect---of 42
__ip6_rt_update_pmtu---of 70
__ipv6_neigh_lookup_noref---of 24
__neigh_lookup---of 3
__probestub_fib6_table_lookup---of 1
__rt6_find_exception_rcu19%of 16
__rt6_find_exception_spinlock---of 14
__rt6_nh_dev_match---of 6
__traceiter_fib6_table_lookup---of 4
addrconf_f6i_alloc---of 6
dst_discard---of 1
fib6_add_gc_list---of 10
fib6_backtrack---of 11
fib6_clean_tohost---of 21
fib6_ifdown---of 18
fib6_ifup---of 5
fib6_info_hw_flags_set---of 14
fib6_info_nh_uses_dev---of 1
fib6_nh_age_exceptions---of 35
fib6_nh_del_cached_rt---of 10
fib6_nh_find_match---of 7
fib6_nh_flush_exceptions---of 24
fib6_nh_init---of 113
fib6_nh_mtu_change---of 37
fib6_nh_redirect_match---of 8
fib6_nh_release---of 25
fib6_nh_release_dsts---of 8
fib6_nh_remove_exception---of 9
fib6_remove_prefsrc---of 6
fib6_rt_update---of 7
fib6_select_path9%of 24
fib6_table_lookup20%of 35
find_match12%of 50
icmp6_dst_alloc---of 10
in6_dev_get25%of 20
in6_dev_put---of 4
inet6_rt_notify---of 7
inet6_rtm_delroute---of 18
inet6_rtm_getroute---of 107
inet6_rtm_newroute---of 117
ip6_blackhole_route---of 7
ip6_confirm_neigh---of 20
ip6_create_rt_rcu---of 17
ip6_default_advmss50%of 4
ip6_del_rt---of 1
ip6_dst_alloc---of 3
ip6_dst_check22%of 42
ip6_dst_destroy---of 17
ip6_dst_gc---of 6
ip6_dst_ifdown---of 7
ip6_dst_neigh_lookup---of 4
ip6_hold_safe---of 7
ip6_ins_rt---of 1
ip6_link_failure---of 31
ip6_mtu25%of 24
ip6_mtu_from_fib6---of 20
ip6_multipath_l3_keys---of 8
ip6_negative_advice---of 16
ip6_neigh_lookup---of 23
ip6_pkt_discard---of 1
ip6_pkt_discard_out---of 5
ip6_pkt_drop---of 26
ip6_pkt_prohibit---of 1
ip6_pkt_prohibit_out---of 5
ip6_pol_route26%of 75
ip6_pol_route_input100%of 1
ip6_pol_route_lookup---of 54
ip6_pol_route_output100%of 1
ip6_redirect---of 1
ip6_redirect_no_header---of 3
ip6_route_add---of 7
ip6_route_cleanup---of 1
ip6_route_del---of 87
ip6_route_dev_notify---of 17
ip6_route_get_saddr---of 5
ip6_route_info_create---of 48
ip6_route_input31%of 33
ip6_route_input_lookup---of 3
ip6_route_lookup---of 1
ip6_route_net_exit---of 1
ip6_route_net_exit_late---of 1
ip6_route_net_init---of 7
ip6_route_net_init_late---of 4
ip6_route_output_flags32%of 32
ip6_rt_cache_alloc---of 29
ip6_rt_copy_init25%of 24
ip6_rt_update_pmtu---of 3
ip6_sk_dst_store_flow---of 33
ip6_sk_redirect---of 1
ip6_sk_update_pmtu---of 39
ip6_update_pmtu---of 6
ip_fib_metrics_put---of 5
ipv6_addr_prefix---of 4
ipv6_inetpeer_exit---of 1
ipv6_inetpeer_init---of 3
ipv6_route_ioctl---of 7
ipv6_route_sysctl_init---of 3
ipv6_route_sysctl_table_size---of 1
ipv6_sysctl_rtcache_flush---of 4
l3mdev_fib_table---of 11
neigh_release---of 4
nexthop_fib6_nh---of 17
nexthop_get---of 7
nexthop_is_blackhole---of 16
nexthop_path_fib6_result---of 9
nlmsg_parse_deprecated_strict---of 4
perf_trace_fib6_table_lookup---of 18
rcu_read_unlock---of 6
rt6_add_dflt_router---of 4
rt6_add_route_info---of 3
rt6_age_exceptions---of 3
rt6_check_expired---of 12
rt6_clean_tohost---of 1
rt6_disable_ip---of 31
rt6_do_redirect---of 56
rt6_do_update_pmtu---of 26
rt6_dump_route---of 44
rt6_fill_node---of 72
rt6_fill_node_nexthop---of 28
rt6_find_cached_rt34%of 12
rt6_flush_exceptions---of 3
rt6_get_dflt_router---of 35
rt6_get_route_info---of 39
rt6_insert_exception---of 48
rt6_is_dead---of 12
rt6_lookup---of 5
rt6_mtu_change---of 1
rt6_mtu_change_route---of 11
rt6_multipath_dead_count---of 10
rt6_multipath_hash---of 61
rt6_multipath_nh_flags_set---of 8
rt6_multipath_rebalance---of 39
rt6_nh_age_exceptions---of 1
rt6_nh_dump_exceptions---of 16
rt6_nh_find_match---of 1
rt6_nh_flush_exceptions---of 1
rt6_nh_nlmsg_size---of 3
rt6_nh_remove_exception_rt---of 1
rt6_nlmsg_size---of 10
rt6_probe_deferred---of 4
rt6_purge_dflt_routers---of 65
rt6_remove_exception---of 11
rt6_remove_exception_rt---of 10
rt6_remove_prefsrc---of 1
rt6_route_rcv---of 26
rt6_score_route20%of 21
rt6_stats_seq_show---of 1
rt6_sync_down_dev---of 3
rt6_sync_up---of 4
rt6_uncached_list_add---of 3
rt6_uncached_list_del---of 4
rtm_to_fib6_config---of 53
skb_header_pointer---of 3
skb_transport_offset---of 3
trace_event_raw_event_fib6_table_lookup---of 17
trace_fib6_table_lookup27%of 15
trace_raw_output_fib6_table_lookup---of 3
-----------
SUMMARY24%of 449

_sctp_make_chunk37%of 11
sctp_addto_chunk---of 1
sctp_chunk_assign_ssn---of 8
sctp_chunk_assign_tsn---of 3
sctp_chunk_free67%of 3
sctp_chunk_hold---of 4
sctp_chunk_iif---of 1
sctp_chunk_put34%of 6
sctp_chunkify---of 5
sctp_control_release_owner29%of 7
sctp_generate_tag67%of 3
sctp_generate_tsn100%of 1
sctp_init_addrs---of 1
sctp_init_cause---of 5
sctp_make_abort---of 13
sctp_make_abort_no_data---of 18
sctp_make_abort_user---of 20
sctp_make_abort_violation---of 17
sctp_make_asconf---of 7
sctp_make_asconf_set_prim---of 4
sctp_make_asconf_update_ip---of 13
sctp_make_auth---of 7
sctp_make_cookie_ack45%of 9
sctp_make_cookie_echo50%of 8
sctp_make_cwr---of 8
sctp_make_datafrag_empty---of 3
sctp_make_ecne---of 6
sctp_make_fwdtsn---of 9
sctp_make_heartbeat---of 6
sctp_make_heartbeat_ack---of 8
sctp_make_idata---of 1
sctp_make_ifwdtsn---of 6
sctp_make_init40%of 30
sctp_make_init_ack38%of 37
sctp_make_new_encap_port---of 16
sctp_make_op_error---of 14
sctp_make_pad---of 6
sctp_make_sack---of 16
sctp_make_shutdown---of 8
sctp_make_shutdown_ack---of 8
sctp_make_shutdown_complete---of 8
sctp_make_strreset_addstrm---of 10
sctp_make_strreset_req---of 12
sctp_make_strreset_resp---of 6
sctp_make_strreset_tsnreq---of 6
sctp_make_strreset_tsnresp---of 6
sctp_make_temp_asoc67%of 3
sctp_make_violation_max_retrans---of 16
sctp_make_violation_paramlen---of 18
sctp_process_asconf---of 56
sctp_process_asconf_ack---of 39
sctp_process_init20%of 114
sctp_process_inv_paramlength---of 4
sctp_source100%of 1
sctp_unpack_cookie27%of 19
sctp_user_addto_chunk---of 5
sctp_verify_asconf---of 23
sctp_verify_init8%of 122
sctp_verify_reconf---of 78
-----------
SUMMARY23%of 374

__nf_conntrack_eventmask_report24%of 25
ecache_work---of 25
nf_conn_pernet_ecache---of 1
nf_conntrack_ecache_pernet_fini---of 1
nf_conntrack_ecache_pernet_init---of 1
nf_conntrack_ecache_work---of 6
nf_conntrack_eventmask_report---of 12
nf_conntrack_register_notifier---of 8
nf_conntrack_unregister_notifier---of 1
nf_ct_deliver_cached_events34%of 9
nf_ct_ecache_ext_add46%of 11
nf_ct_expect_event_report---of 27
nf_ct_pernet---of 16
-----------
SUMMARY32%of 45

__dump_page_owner---of 12
__folio_copy_owner---of 13
__reset_page_owner39%of 13
__set_page_owner43%of 14
__set_page_owner_migrate_reason---of 3
__split_page_owner---of 6
lseek_page_owner---of 4
page_owner_stack_open---of 1
page_owner_threshold_get---of 1
page_owner_threshold_set---of 1
pagetypeinfo_showmixedcount_print---of 24
pfn_valid---of 29
proc_page_owner_threshold_open---of 1
read_page_owner---of 66
register_dummy_stack---of 1
register_early_stack---of 1
register_failure_stack---of 1
save_stack75%of 4
stack_next---of 3
stack_print---of 7
stack_start---of 4
stack_stop---of 1
-----------
SUMMARY46%of 31

-----------
SUMMARY---of 0

__should_fail_alloc_page34%of 6
-----------
SUMMARY34%of 6

__bpf_getsockopt---of 33
__bpf_redirect---of 39
__bpf_setsockopt---of 30
__bpf_sk_lookup---of 11
__bpf_skb_change_head---of 10
__bpf_skb_change_tail---of 24
__bpf_skb_load_bytes---of 9
__bpf_skb_store_bytes---of 15
__bpf_skc_lookup---of 18
__bpf_tx_skb---of 5
__bpf_xdp_load_bytes---of 10
__bpf_xdp_store_bytes---of 10
__get_filter---of 8
__ipv4_neigh_lookup_noref---of 24
__ipv6_neigh_lookup_noref_stub---of 24
__sk_attach_prog---of 14
_bpf_getsockopt---of 8
_bpf_setsockopt---of 8
bpf_bind---of 7
bpf_clone_redirect---of 8
bpf_convert_ctx_access---of 89
bpf_convert_filter---of 139
bpf_convert_tstamp_read---of 3
bpf_convert_tstamp_write---of 3
bpf_csum_diff---of 32
bpf_csum_level---of 13
bpf_csum_update---of 3
bpf_dispatcher_nop_func---of 1
bpf_dispatcher_xdp_func---of 1
bpf_dynptr_from_skb---of 3
bpf_dynptr_from_skb_rdonly---of 3
bpf_dynptr_from_xdp---of 4
bpf_flow_dissector_load_bytes---of 9
bpf_gen_ld_abs---of 8
bpf_get_cgroup_classid---of 10
bpf_get_cgroup_classid_curr---of 1
bpf_get_hash_recalc---of 3
bpf_get_listener_sock---of 6
bpf_get_netns_cookie_sk_msg---of 4
bpf_get_netns_cookie_sock---of 3
bpf_get_netns_cookie_sock_addr---of 4
bpf_get_netns_cookie_sock_ops---of 4
bpf_get_route_realm---of 7
bpf_get_skb_set_tunnel_proto---of 5
bpf_get_socket_cookie---of 3
bpf_get_socket_cookie_sock---of 1
bpf_get_socket_cookie_sock_addr---of 1
bpf_get_socket_cookie_sock_ops---of 1
bpf_get_socket_ptr_cookie---of 4
bpf_get_socket_uid---of 9
bpf_helper_changes_pkt_data---of 1
bpf_ipv4_fib_lookup---of 69
bpf_ipv6_fib_lookup---of 41
bpf_l3_csum_replace---of 9
bpf_l4_csum_replace---of 15
bpf_lwt_in_push_encap---of 5
bpf_lwt_seg6_action---of 45
bpf_lwt_seg6_adjust_srh---of 20
bpf_lwt_seg6_store_bytes---of 17
bpf_lwt_xmit_push_encap---of 3
bpf_msg_apply_bytes---of 1
bpf_msg_cork_bytes---of 1
bpf_msg_pop_data---of 67
bpf_msg_pull_data---of 41
bpf_msg_push_data---of 71
bpf_noop_prologue---of 1
bpf_prepare_filter---of 85
bpf_prog_change_xdp---of 1
bpf_prog_create---of 5
bpf_prog_create_from_user---of 10
bpf_prog_destroy---of 4
bpf_push_seg6_encap---of 8
bpf_redirect---of 5
bpf_redirect_neigh---of 6
bpf_redirect_peer---of 5
bpf_run_sk_reuseport---of 6
bpf_set_hash---of 1
bpf_set_hash_invalid---of 1
bpf_sk_ancestor_cgroup_id---of 12
bpf_sk_assign---of 21
bpf_sk_assign_tcp_reqsk---of 27
bpf_sk_base_func_proto---of 10
bpf_sk_cgroup_id---of 9
bpf_sk_fullsock---of 3
bpf_sk_getsockopt---of 1
bpf_sk_lookup---of 13
bpf_sk_lookup_assign---of 24
bpf_sk_lookup_tcp---of 1
bpf_sk_lookup_udp---of 1
bpf_sk_release---of 7
bpf_sk_setsockopt---of 1
bpf_skb_adjust_room---of 76
bpf_skb_ancestor_cgroup_id---of 12
bpf_skb_cgroup_classid---of 9
bpf_skb_cgroup_id---of 9
bpf_skb_change_head---of 1
bpf_skb_change_proto---of 25
bpf_skb_change_tail---of 1
bpf_skb_change_type---of 3
bpf_skb_check_mtu---of 14
bpf_skb_copy---of 8
bpf_skb_ecn_set_ce---of 46
bpf_skb_event_output---of 5
bpf_skb_fib_lookup---of 7
bpf_skb_get_nlattr---of 6
bpf_skb_get_nlattr_nest---of 8
bpf_skb_get_pay_offset---of 1
bpf_skb_get_tunnel_key---of 16
bpf_skb_get_tunnel_opt---of 6
bpf_skb_get_xfrm_state---of 9
bpf_skb_is_valid_access---of 117
bpf_skb_load_bytes---of 9
bpf_skb_load_bytes_relative---of 8
bpf_skb_load_helper_16---of 6
bpf_skb_load_helper_16_no_cache---of 6
bpf_skb_load_helper_32---of 6
bpf_skb_load_helper_32_no_cache---of 6
bpf_skb_load_helper_8---of 6
bpf_skb_load_helper_8_no_cache---of 6
bpf_skb_net_hdr_pop---of 16
bpf_skb_pull_data---of 5
bpf_skb_set_tstamp---of 10
bpf_skb_set_tunnel_key---of 28
bpf_skb_set_tunnel_opt---of 7
bpf_skb_store_bytes---of 15
bpf_skb_under_cgroup---of 13
bpf_skb_vlan_pop---of 13
bpf_skb_vlan_push---of 13
bpf_skc_lookup_tcp---of 3
bpf_skc_to_mptcp_sock---of 1
bpf_skc_to_tcp6_sock---of 7
bpf_skc_to_tcp_request_sock---of 4
bpf_skc_to_tcp_sock---of 6
bpf_skc_to_tcp_timewait_sock---of 4
bpf_skc_to_udp6_sock---of 8
bpf_skc_to_unix_sock---of 6
bpf_sock_addr_getsockopt---of 1
bpf_sock_addr_set_sun_path---of 3
bpf_sock_addr_setsockopt---of 1
bpf_sock_addr_sk_lookup_tcp---of 1
bpf_sock_addr_sk_lookup_udp---of 1
bpf_sock_addr_skc_lookup_tcp---of 1
bpf_sock_common_is_valid_access---of 3
bpf_sock_convert_ctx_access---of 49
bpf_sock_destroy---of 5
bpf_sock_from_file---of 1
bpf_sock_is_valid_access---of 58
bpf_sock_ops_cb_flags_set---of 5
bpf_sock_ops_get_syn---of 17
bpf_sock_ops_getsockopt---of 4
bpf_sock_ops_load_hdr_opt---of 35
bpf_sock_ops_reserve_hdr_opt---of 5
bpf_sock_ops_setsockopt---of 1
bpf_sock_ops_store_hdr_opt---of 24
bpf_sol_tcp_setsockopt---of 12
bpf_tc_sk_lookup_tcp---of 1
bpf_tc_sk_lookup_udp---of 1
bpf_tc_skc_lookup_tcp---of 1
bpf_tcp_check_syncookie---of 23
bpf_tcp_gen_syncookie---of 16
bpf_tcp_raw_check_syncookie_ipv4---of 1
bpf_tcp_raw_check_syncookie_ipv6---of 1
bpf_tcp_raw_gen_syncookie_ipv4---of 4
bpf_tcp_raw_gen_syncookie_ipv6---of 4
bpf_tcp_sock---of 5
bpf_tcp_sock_convert_ctx_access---of 28
bpf_tcp_sock_is_valid_access---of 6
bpf_unlocked_sk_getsockopt---of 1
bpf_unlocked_sk_setsockopt---of 1
bpf_update_srh_state---of 5
bpf_warn_invalid_xdp_action---of 3
bpf_xdp_adjust_head---of 5
bpf_xdp_adjust_meta---of 5
bpf_xdp_adjust_tail---of 7
bpf_xdp_check_mtu---of 6
bpf_xdp_copy---of 8
bpf_xdp_copy_buf---of 8
bpf_xdp_event_output---of 7
bpf_xdp_fib_lookup---of 5
bpf_xdp_frags_increase_tail---of 8
bpf_xdp_frags_shrink_tail---of 16
bpf_xdp_get_buff_len---of 3
bpf_xdp_load_bytes---of 10
bpf_xdp_pointer---of 13
bpf_xdp_redirect---of 5
bpf_xdp_redirect_map---of 1
bpf_xdp_sk_lookup_tcp---of 1
bpf_xdp_sk_lookup_udp---of 1
bpf_xdp_skc_lookup_tcp---of 1
bpf_xdp_sock_convert_ctx_access---of 3
bpf_xdp_sock_is_valid_access---of 3
bpf_xdp_store_bytes---of 10
btf_id_cmp_func---of 1
cg_skb_func_proto---of 18
cg_skb_is_valid_access---of 73
convert_bpf_ld_abs---of 20
copy_bpf_fprog_from_user---of 11
flow_dissector_convert_ctx_access---of 5
flow_dissector_func_proto---of 11
flow_dissector_is_valid_access---of 25
init_subsystem---of 1
lwt_in_func_proto---of 3
lwt_is_valid_access---of 120
lwt_out_func_proto---of 19
lwt_seg6local_func_proto---of 5
lwt_xmit_func_proto---of 23
neigh_output---of 21
netkit_peer_dev---of 1
nexthop_num_path---of 10
put_page---of 14
sk_attach_bpf---of 5
sk_attach_filter---of 6
sk_detach_filter---of 12
sk_filter_charge---of 14
sk_filter_func_proto---of 15
sk_filter_is_valid_access---of 91
sk_filter_release_rcu---of 4
sk_filter_trim_cap18%of 39
sk_filter_uncharge---of 4
sk_get_filter---of 14
sk_lookup---of 12
sk_lookup_convert_ctx_access---of 43
sk_lookup_func_proto---of 13
sk_lookup_is_valid_access---of 72
sk_msg_convert_ctx_access---of 37
sk_msg_func_proto---of 23
sk_msg_is_valid_access---of 68
sk_reuseport_attach_bpf---of 16
sk_reuseport_attach_filter---of 8
sk_reuseport_convert_ctx_access---of 11
sk_reuseport_func_proto---of 7
sk_reuseport_is_valid_access---of 29
sk_reuseport_load_bytes---of 9
sk_reuseport_load_bytes_relative---of 8
sk_reuseport_prog_free---of 6
sk_select_reuseport---of 18
sk_skb_adjust_room---of 17
sk_skb_change_head---of 1
sk_skb_change_tail---of 1
sk_skb_convert_ctx_access---of 28
sk_skb_func_proto---of 25
sk_skb_is_valid_access---of 52
sk_skb_prologue---of 3
sk_skb_pull_data---of 5
skb_do_redirect---of 126
skb_orphan---of 4
skb_postpull_rcsum---of 5
skb_tunnel_info---of 18
sock_addr_convert_ctx_access---of 53
sock_addr_func_proto---of 50
sock_addr_is_valid_access---of 119
sock_filter_func_proto---of 9
sock_filter_is_valid_access---of 36
sock_ops_convert_ctx_access---of 204
sock_ops_func_proto---of 25
sock_ops_is_valid_access---of 38
sol_tcp_sockopt---of 31
tc_cls_act_btf_struct_access---of 3
tc_cls_act_convert_ctx_access---of 3
tc_cls_act_func_proto---of 60
tc_cls_act_is_valid_access---of 117
tc_cls_act_prologue---of 3
trace_xdp_redirect---of 15
trace_xdp_redirect_err---of 15
tracing_iter_filter---of 4
xdp_btf_struct_access---of 3
xdp_convert_ctx_access---of 8
xdp_do_check_flushed---of 4
xdp_do_flush---of 1
xdp_do_generic_redirect---of 21
xdp_do_redirect---of 22
xdp_do_redirect_frame---of 18
xdp_func_proto---of 33
xdp_is_valid_access---of 9
xdp_master_redirect---of 6
-----------
SUMMARY18%of 39

ccmp_special_blocks---of 5
gcmp_special_blocks---of 5
ieee80211_crypto_aes_cmac_256_decrypt---of 9
ieee80211_crypto_aes_cmac_256_encrypt---of 9
ieee80211_crypto_aes_cmac_decrypt---of 9
ieee80211_crypto_aes_cmac_encrypt---of 9
ieee80211_crypto_aes_gmac_decrypt---of 11
ieee80211_crypto_aes_gmac_encrypt---of 9
ieee80211_crypto_ccmp_decrypt---of 45
ieee80211_crypto_ccmp_encrypt---of 21
ieee80211_crypto_gcmp_decrypt---of 45
ieee80211_crypto_gcmp_encrypt---of 21
ieee80211_crypto_tkip_decrypt---of 10
ieee80211_crypto_tkip_encrypt---of 16
ieee80211_rx_h_michael_mic_verify8%of 25
ieee80211_tx_h_michael_mic_add---of 22
-----------
SUMMARY8%of 25

-----------
SUMMARY---of 0

chacha_block_generic100%of 1
chacha_permute58%of 7
hchacha_block_generic---of 1
-----------
SUMMARY63%of 8

__text_poke30%of 60
alternatives_enable_smp---of 15
alternatives_smp_module_add---of 16
alternatives_smp_module_del---of 5
alternatives_text_reserved---of 13
apply_alternatives---of 41
apply_fineibt---of 1
apply_relocation---of 45
apply_retpolines---of 59
apply_returns---of 32
apply_seal_endbr---of 14
do_sync_core50%of 4
optimize_nops---of 33
text_poke---of 4
text_poke_bp---of 1
text_poke_bp_batch---of 31
text_poke_copy40%of 5
text_poke_copy_locked---of 7
text_poke_early---of 15
text_poke_finish---of 3
text_poke_kgdb---of 1
text_poke_loc_init---of 42
text_poke_memcpy100%of 1
text_poke_memset---of 1
text_poke_queue---of 9
text_poke_set---of 5
text_poke_sync---of 1
-----------
SUMMARY33%of 70

__inet6_check_established17%of 48
__inet6_lookup_established---of 32
inet6_ehashfn72%of 7
inet6_hash---of 3
inet6_hash_connect67%of 3
inet6_lhash2_lookup---of 16
inet6_lookup---of 9
inet6_lookup_listener---of 7
inet6_lookup_reuseport---of 5
inet6_lookup_run_sk_lookup---of 35
ipv6_portaddr_hash---of 4
-----------
SUMMARY26%of 58

-----------
SUMMARY---of 0

aead_register_instance---of 5
crypto_aead_decrypt---of 4
crypto_aead_encrypt---of 3
crypto_aead_exit_tfm---of 1
crypto_aead_free_instance---of 1
crypto_aead_init_tfm60%of 5
crypto_aead_report---of 1
crypto_aead_setauthsize---of 8
crypto_aead_setkey---of 6
crypto_aead_show---of 1
crypto_alloc_aead100%of 1
crypto_grab_aead---of 1
crypto_has_aead---of 1
crypto_register_aead---of 4
crypto_register_aeads---of 11
crypto_unregister_aead---of 1
crypto_unregister_aeads---of 4
-----------
SUMMARY67%of 6

-----------
SUMMARY---of 0

__unix_dgram_recvmsg---of 45
__unix_set_addr_hash---of 15
__unix_stream_recvmsg---of 1
bpf_iter_fini_unix---of 1
bpf_iter_init_unix---of 4
bpf_iter_unix_batch---of 23
bpf_iter_unix_get_func_proto---of 1
bpf_iter_unix_realloc_batch---of 8
bpf_iter_unix_seq_next---of 7
bpf_iter_unix_seq_show---of 6
bpf_iter_unix_seq_start---of 3
bpf_iter_unix_seq_stop---of 10
copy_peercred---of 10
init_peercred---of 7
maybe_add_creds---of 12
queue_oob---of 14
refcount_add---of 4
refcount_dec_and_test---of 4
refcount_inc---of 4
scm_destroy---of 3
scm_recv_unix---of 22
scm_stat_add---of 4
signal_pending---of 3
sk_wake_async---of 17
sock_put---of 4
unix_accept---of 15
unix_autobind---of 19
unix_bind---of 33
unix_bpf_bypass_getsockopt---of 1
unix_close---of 1
unix_compat_ioctl---of 1
unix_create---of 9
unix_create1---of 12
unix_destruct_scm---of 5
unix_dgram_connect---of 46
unix_dgram_disconnected---of 5
unix_dgram_peer_wake_disconnect_wakeup---of 4
unix_dgram_peer_wake_me---of 14
unix_dgram_peer_wake_relay---of 5
unix_dgram_poll29%of 21
unix_dgram_recvmsg---of 3
unix_dgram_sendmsg---of 69
unix_find_other---of 29
unix_getname---of 16
unix_inq_len---of 9
unix_insert_bsd_socket---of 9
unix_ioctl---of 20
unix_listen---of 13
unix_net_exit---of 1
unix_net_init---of 7
unix_outq_len---of 1
unix_peer_get---of 5
unix_poll---of 12
unix_read_skb---of 3
unix_recvq_lock_cmp_fn---of 7
unix_release---of 3
unix_release_sock---of 41
unix_scm_to_skb---of 10
unix_seq_next---of 16
unix_seq_show---of 10
unix_seq_start---of 9
unix_seq_stop---of 3
unix_seqpacket_recvmsg---of 4
unix_seqpacket_sendmsg---of 7
unix_show_fdinfo---of 7
unix_shutdown---of 17
unix_sock_destructor---of 14
unix_socketpair---of 7
unix_state_double_lock---of 3
unix_state_double_unlock---of 3
unix_state_lock_cmp_fn---of 7
unix_stream_connect---of 49
unix_stream_read_actor---of 1
unix_stream_read_generic---of 91
unix_stream_read_skb---of 4
unix_stream_recv_urg---of 11
unix_stream_recvmsg---of 3
unix_stream_sendmsg---of 37
unix_stream_splice_actor---of 1
unix_stream_splice_read---of 4
unix_table_double_lock---of 3
unix_table_double_unlock---of 3
unix_table_lock_cmp_fn---of 1
unix_unhash---of 1
unix_wait_for_peer---of 5
unix_write_space---of 27
-----------
SUMMARY29%of 21

copy_utsname19%of 16
free_uts_ns---of 5
utsns_get---of 5
utsns_install---of 9
utsns_owner---of 1
utsns_put---of 4
-----------
SUMMARY19%of 16

__instance_destroy---of 9
__nfulnl_flush---of 7
__nfulnl_send---of 9
br_port_get_rcu---of 6
get_next---of 17
instance_lookup_get_rcu---of 9
nfnl_log_net_exit---of 5
nfnl_log_net_init---of 4
nfnl_log_pernet25%of 16
nfulnl_instance_free_rcu---of 4
nfulnl_log_packet---of 132
nfulnl_put_bridge---of 10
nfulnl_rcv_nl_event20%of 10
nfulnl_recv_config---of 58
nfulnl_recv_unsupp---of 1
nfulnl_timer---of 7
seq_next---of 1
seq_show---of 1
seq_start---of 19
seq_stop---of 6
skb_tstamp_cond---of 6
-----------
SUMMARY24%of 26

-----------
SUMMARY---of 0

tomoyo_bprm_check_security---of 4
tomoyo_bprm_committed_creds---of 1
tomoyo_cred_prepare---of 4
tomoyo_domain50%of 4
tomoyo_file_fcntl---of 6
tomoyo_file_ioctl100%of 1
tomoyo_file_open40%of 5
tomoyo_file_truncate---of 1
tomoyo_inode_getattr---of 1
tomoyo_path_chmod---of 1
tomoyo_path_chown---of 6
tomoyo_path_chroot---of 1
tomoyo_path_link---of 1
tomoyo_path_mkdir100%of 1
tomoyo_path_mknod---of 4
tomoyo_path_rename---of 4
tomoyo_path_rmdir---of 1
tomoyo_path_symlink---of 1
tomoyo_path_truncate---of 1
tomoyo_path_unlink---of 1
tomoyo_sb_mount---of 1
tomoyo_sb_pivotroot---of 1
tomoyo_sb_umount---of 1
tomoyo_socket_bind100%of 1
tomoyo_socket_connect100%of 1
tomoyo_socket_listen100%of 1
tomoyo_socket_sendmsg100%of 1
tomoyo_task_alloc---of 1
tomoyo_task_free---of 5
-----------
SUMMARY67%of 15

timerqueue_add72%of 7
timerqueue_del---of 5
timerqueue_iterate_next---of 3
-----------
SUMMARY72%of 7

__do_once_done100%of 1
__do_once_sleepable_done---of 1
__do_once_sleepable_start---of 3
__do_once_start67%of 3
once_deferred---of 3
once_disable_jump67%of 3
-----------
SUMMARY72%of 7

__ia32_sys_modify_ldt---of 1
__se_sys_modify_ldt---of 13
__x64_sys_modify_ldt---of 1
destroy_context_ldt---of 4
flush_ldt---of 16
free_ldt_pgtables---of 3
free_ldt_struct---of 4
install_ldt---of 1
ldt_arch_exit_mmap---of 3
ldt_dup_context---of 13
load_mm_ldt---of 7
map_ldt_struct---of 30
native_set_ldt---of 4
switch_ldt34%of 9
unmap_ldt_struct---of 12
write_ldt---of 25
-----------
SUMMARY34%of 9

__bpf_trace_ext4__bitmap_load---of 1
__bpf_trace_ext4__es_extent---of 1
__bpf_trace_ext4__es_shrink_enter---of 1
__bpf_trace_ext4__fallocate_mode---of 1
__bpf_trace_ext4__folio_op---of 1
__bpf_trace_ext4__map_blocks_enter---of 1
__bpf_trace_ext4__map_blocks_exit---of 1
__bpf_trace_ext4__mb_new_pa---of 1
__bpf_trace_ext4__mballoc---of 1
__bpf_trace_ext4__trim---of 1
__bpf_trace_ext4__truncate---of 1
__bpf_trace_ext4__write_begin---of 1
__bpf_trace_ext4__write_end---of 1
__bpf_trace_ext4_alloc_da_blocks---of 1
__bpf_trace_ext4_allocate_blocks---of 1
__bpf_trace_ext4_allocate_inode---of 1
__bpf_trace_ext4_begin_ordered_truncate---of 1
__bpf_trace_ext4_collapse_range---of 1
__bpf_trace_ext4_da_release_space---of 1
__bpf_trace_ext4_da_reserve_space---of 1
__bpf_trace_ext4_da_update_reserve_space---of 1
__bpf_trace_ext4_da_write_pages---of 1
__bpf_trace_ext4_da_write_pages_extent---of 1
__bpf_trace_ext4_discard_blocks---of 1
__bpf_trace_ext4_discard_preallocations---of 1
__bpf_trace_ext4_drop_inode---of 1
__bpf_trace_ext4_error---of 1
__bpf_trace_ext4_es_find_extent_range_enter---of 1
__bpf_trace_ext4_es_find_extent_range_exit---of 1
__bpf_trace_ext4_es_insert_delayed_block---of 1
__bpf_trace_ext4_es_lookup_extent_enter---of 1
__bpf_trace_ext4_es_lookup_extent_exit---of 1
__bpf_trace_ext4_es_remove_extent---of 1
__bpf_trace_ext4_es_shrink---of 1
__bpf_trace_ext4_es_shrink_scan_exit---of 1
__bpf_trace_ext4_evict_inode---of 1
__bpf_trace_ext4_ext_convert_to_initialized_enter---of 1
__bpf_trace_ext4_ext_convert_to_initialized_fastpath---of 1
__bpf_trace_ext4_ext_handle_unwritten_extents---of 1
__bpf_trace_ext4_ext_load_extent---of 1
__bpf_trace_ext4_ext_remove_space---of 1
__bpf_trace_ext4_ext_remove_space_done---of 1
__bpf_trace_ext4_ext_rm_idx---of 1
__bpf_trace_ext4_ext_rm_leaf---of 1
__bpf_trace_ext4_ext_show_extent---of 1
__bpf_trace_ext4_fallocate_exit---of 1
__bpf_trace_ext4_fc_cleanup---of 1
__bpf_trace_ext4_fc_commit_start---of 1
__bpf_trace_ext4_fc_commit_stop---of 1
__bpf_trace_ext4_fc_replay---of 1
__bpf_trace_ext4_fc_replay_scan---of 1
__bpf_trace_ext4_fc_stats---of 1
__bpf_trace_ext4_fc_track_dentry---of 1
__bpf_trace_ext4_fc_track_inode---of 1
__bpf_trace_ext4_fc_track_range---of 1
__bpf_trace_ext4_forget---of 1
__bpf_trace_ext4_free_blocks---of 1
__bpf_trace_ext4_free_inode---of 1
__bpf_trace_ext4_fsmap_class---of 1
__bpf_trace_ext4_get_implied_cluster_alloc_exit---of 1
__bpf_trace_ext4_getfsmap_class---of 1
__bpf_trace_ext4_insert_range---of 1
__bpf_trace_ext4_invalidate_folio_op---of 1
__bpf_trace_ext4_journal_start_inode---of 1
__bpf_trace_ext4_journal_start_reserved---of 1
__bpf_trace_ext4_journal_start_sb---of 1
__bpf_trace_ext4_lazy_itable_init---of 1
__bpf_trace_ext4_load_inode---of 1
__bpf_trace_ext4_mark_inode_dirty---of 1
__bpf_trace_ext4_mb_discard_preallocations---of 1
__bpf_trace_ext4_mb_release_group_pa---of 1
__bpf_trace_ext4_mb_release_inode_pa---of 1
__bpf_trace_ext4_mballoc_alloc---of 1
__bpf_trace_ext4_mballoc_prealloc---of 1
__bpf_trace_ext4_nfs_commit_metadata---of 1
__bpf_trace_ext4_other_inode_update_time---of 1
__bpf_trace_ext4_prefetch_bitmaps---of 1
__bpf_trace_ext4_read_block_bitmap_load---of 1
__bpf_trace_ext4_remove_blocks---of 1
__bpf_trace_ext4_request_blocks---of 1
__bpf_trace_ext4_request_inode---of 1
__bpf_trace_ext4_shutdown---of 1
__bpf_trace_ext4_sync_file_enter---of 1
__bpf_trace_ext4_sync_file_exit---of 1
__bpf_trace_ext4_sync_fs---of 1
__bpf_trace_ext4_unlink_enter---of 1
__bpf_trace_ext4_unlink_exit---of 1
__bpf_trace_ext4_update_sb---of 1
__bpf_trace_ext4_writepages---of 1
__bpf_trace_ext4_writepages_result---of 1
__ext4_error---of 4
__ext4_error_file---of 5
__ext4_error_inode---of 5
__ext4_grp_locked_error---of 22
__ext4_msg---of 4
__ext4_sb_bread_gfp---of 10
__ext4_std_error---of 16
__ext4_warning---of 3
__ext4_warning_inode---of 3
__probestub_ext4_alloc_da_blocks---of 1
__probestub_ext4_allocate_blocks---of 1
__probestub_ext4_allocate_inode---of 1
__probestub_ext4_begin_ordered_truncate---of 1
__probestub_ext4_collapse_range---of 1
__probestub_ext4_da_release_space---of 1
__probestub_ext4_da_reserve_space---of 1
__probestub_ext4_da_update_reserve_space---of 1
__probestub_ext4_da_write_begin---of 1
__probestub_ext4_da_write_end---of 1
__probestub_ext4_da_write_pages---of 1
__probestub_ext4_da_write_pages_extent---of 1
__probestub_ext4_discard_blocks---of 1
__probestub_ext4_discard_preallocations---of 1
__probestub_ext4_drop_inode---of 1
__probestub_ext4_error---of 1
__probestub_ext4_es_cache_extent---of 1
__probestub_ext4_es_find_extent_range_enter---of 1
__probestub_ext4_es_find_extent_range_exit---of 1
__probestub_ext4_es_insert_delayed_block---of 1
__probestub_ext4_es_insert_extent---of 1
__probestub_ext4_es_lookup_extent_enter---of 1
__probestub_ext4_es_lookup_extent_exit---of 1
__probestub_ext4_es_remove_extent---of 1
__probestub_ext4_es_shrink---of 1
__probestub_ext4_es_shrink_count---of 1
__probestub_ext4_es_shrink_scan_enter---of 1
__probestub_ext4_es_shrink_scan_exit---of 1
__probestub_ext4_evict_inode---of 1
__probestub_ext4_ext_convert_to_initialized_enter---of 1
__probestub_ext4_ext_convert_to_initialized_fastpath---of 1
__probestub_ext4_ext_handle_unwritten_extents---of 1
__probestub_ext4_ext_load_extent---of 1
__probestub_ext4_ext_map_blocks_enter---of 1
__probestub_ext4_ext_map_blocks_exit---of 1
__probestub_ext4_ext_remove_space---of 1
__probestub_ext4_ext_remove_space_done---of 1
__probestub_ext4_ext_rm_idx---of 1
__probestub_ext4_ext_rm_leaf---of 1
__probestub_ext4_ext_show_extent---of 1
__probestub_ext4_fallocate_enter---of 1
__probestub_ext4_fallocate_exit---of 1
__probestub_ext4_fc_cleanup---of 1
__probestub_ext4_fc_commit_start---of 1
__probestub_ext4_fc_commit_stop---of 1
__probestub_ext4_fc_replay---of 1
__probestub_ext4_fc_replay_scan---of 1
__probestub_ext4_fc_stats---of 1
__probestub_ext4_fc_track_create---of 1
__probestub_ext4_fc_track_inode---of 1
__probestub_ext4_fc_track_link---of 1
__probestub_ext4_fc_track_range---of 1
__probestub_ext4_fc_track_unlink---of 1
__probestub_ext4_forget---of 1
__probestub_ext4_free_blocks---of 1
__probestub_ext4_free_inode---of 1
__probestub_ext4_fsmap_high_key---of 1
__probestub_ext4_fsmap_low_key---of 1
__probestub_ext4_fsmap_mapping---of 1
__probestub_ext4_get_implied_cluster_alloc_exit---of 1
__probestub_ext4_getfsmap_high_key---of 1
__probestub_ext4_getfsmap_low_key---of 1
__probestub_ext4_getfsmap_mapping---of 1
__probestub_ext4_ind_map_blocks_enter---of 1
__probestub_ext4_ind_map_blocks_exit---of 1
__probestub_ext4_insert_range---of 1
__probestub_ext4_invalidate_folio---of 1
__probestub_ext4_journal_start_inode---of 1
__probestub_ext4_journal_start_reserved---of 1
__probestub_ext4_journal_start_sb---of 1
__probestub_ext4_journalled_invalidate_folio---of 1
__probestub_ext4_journalled_write_end---of 1
__probestub_ext4_lazy_itable_init---of 1
__probestub_ext4_load_inode---of 1
__probestub_ext4_load_inode_bitmap---of 1
__probestub_ext4_mark_inode_dirty---of 1
__probestub_ext4_mb_bitmap_load---of 1
__probestub_ext4_mb_buddy_bitmap_load---of 1
__probestub_ext4_mb_discard_preallocations---of 1
__probestub_ext4_mb_new_group_pa---of 1
__probestub_ext4_mb_new_inode_pa---of 1
__probestub_ext4_mb_release_group_pa---of 1
__probestub_ext4_mb_release_inode_pa---of 1
__probestub_ext4_mballoc_alloc---of 1
__probestub_ext4_mballoc_discard---of 1
__probestub_ext4_mballoc_free---of 1
__probestub_ext4_mballoc_prealloc---of 1
__probestub_ext4_nfs_commit_metadata---of 1
__probestub_ext4_other_inode_update_time---of 1
__probestub_ext4_prefetch_bitmaps---of 1
__probestub_ext4_punch_hole---of 1
__probestub_ext4_read_block_bitmap_load---of 1
__probestub_ext4_read_folio---of 1
__probestub_ext4_release_folio---of 1
__probestub_ext4_remove_blocks---of 1
__probestub_ext4_request_blocks---of 1
__probestub_ext4_request_inode---of 1
__probestub_ext4_shutdown---of 1
__probestub_ext4_sync_file_enter---of 1
__probestub_ext4_sync_file_exit---of 1
__probestub_ext4_sync_fs---of 1
__probestub_ext4_trim_all_free---of 1
__probestub_ext4_trim_extent---of 1
__probestub_ext4_truncate_enter---of 1
__probestub_ext4_truncate_exit---of 1
__probestub_ext4_unlink_enter---of 1
__probestub_ext4_unlink_exit---of 1
__probestub_ext4_update_sb---of 1
__probestub_ext4_write_begin---of 1
__probestub_ext4_write_end---of 1
__probestub_ext4_writepages---of 1
__probestub_ext4_writepages_result---of 1
__probestub_ext4_zero_range---of 1
__traceiter_ext4_alloc_da_blocks---of 4
__traceiter_ext4_allocate_blocks---of 4
__traceiter_ext4_allocate_inode---of 4
__traceiter_ext4_begin_ordered_truncate---of 4
__traceiter_ext4_collapse_range---of 4
__traceiter_ext4_da_release_space---of 4
__traceiter_ext4_da_reserve_space---of 4
__traceiter_ext4_da_update_reserve_space---of 4
__traceiter_ext4_da_write_begin---of 4
__traceiter_ext4_da_write_end---of 4
__traceiter_ext4_da_write_pages---of 4
__traceiter_ext4_da_write_pages_extent---of 4
__traceiter_ext4_discard_blocks---of 4
__traceiter_ext4_discard_preallocations---of 4
__traceiter_ext4_drop_inode---of 4
__traceiter_ext4_error---of 4
__traceiter_ext4_es_cache_extent---of 4
__traceiter_ext4_es_find_extent_range_enter---of 4
__traceiter_ext4_es_find_extent_range_exit---of 4
__traceiter_ext4_es_insert_delayed_block---of 4
__traceiter_ext4_es_insert_extent---of 4
__traceiter_ext4_es_lookup_extent_enter---of 4
__traceiter_ext4_es_lookup_extent_exit---of 4
__traceiter_ext4_es_remove_extent---of 4
__traceiter_ext4_es_shrink---of 4
__traceiter_ext4_es_shrink_count---of 4
__traceiter_ext4_es_shrink_scan_enter---of 4
__traceiter_ext4_es_shrink_scan_exit---of 4
__traceiter_ext4_evict_inode---of 4
__traceiter_ext4_ext_convert_to_initialized_enter---of 4
__traceiter_ext4_ext_convert_to_initialized_fastpath---of 4
__traceiter_ext4_ext_handle_unwritten_extents---of 4
__traceiter_ext4_ext_load_extent---of 4
__traceiter_ext4_ext_map_blocks_enter---of 4
__traceiter_ext4_ext_map_blocks_exit---of 4
__traceiter_ext4_ext_remove_space---of 4
__traceiter_ext4_ext_remove_space_done---of 4
__traceiter_ext4_ext_rm_idx---of 4
__traceiter_ext4_ext_rm_leaf---of 4
__traceiter_ext4_ext_show_extent---of 4
__traceiter_ext4_fallocate_enter---of 4
__traceiter_ext4_fallocate_exit---of 4
__traceiter_ext4_fc_cleanup---of 4
__traceiter_ext4_fc_commit_start---of 4
__traceiter_ext4_fc_commit_stop---of 4
__traceiter_ext4_fc_replay---of 4
__traceiter_ext4_fc_replay_scan---of 4
__traceiter_ext4_fc_stats---of 4
__traceiter_ext4_fc_track_create---of 4
__traceiter_ext4_fc_track_inode---of 4
__traceiter_ext4_fc_track_link---of 4
__traceiter_ext4_fc_track_range---of 4
__traceiter_ext4_fc_track_unlink---of 4
__traceiter_ext4_forget---of 4
__traceiter_ext4_free_blocks---of 4
__traceiter_ext4_free_inode---of 4
__traceiter_ext4_fsmap_high_key---of 4
__traceiter_ext4_fsmap_low_key---of 4
__traceiter_ext4_fsmap_mapping---of 4
__traceiter_ext4_get_implied_cluster_alloc_exit---of 4
__traceiter_ext4_getfsmap_high_key---of 4
__traceiter_ext4_getfsmap_low_key---of 4
__traceiter_ext4_getfsmap_mapping---of 4
__traceiter_ext4_ind_map_blocks_enter---of 4
__traceiter_ext4_ind_map_blocks_exit---of 4
__traceiter_ext4_insert_range---of 4
__traceiter_ext4_invalidate_folio---of 4
__traceiter_ext4_journal_start_inode---of 4
__traceiter_ext4_journal_start_reserved---of 4
__traceiter_ext4_journal_start_sb---of 4
__traceiter_ext4_journalled_invalidate_folio---of 4
__traceiter_ext4_journalled_write_end---of 4
__traceiter_ext4_lazy_itable_init---of 4
__traceiter_ext4_load_inode---of 4
__traceiter_ext4_load_inode_bitmap---of 4
__traceiter_ext4_mark_inode_dirty---of 4
__traceiter_ext4_mb_bitmap_load---of 4
__traceiter_ext4_mb_buddy_bitmap_load---of 4
__traceiter_ext4_mb_discard_preallocations---of 4
__traceiter_ext4_mb_new_group_pa---of 4
__traceiter_ext4_mb_new_inode_pa---of 4
__traceiter_ext4_mb_release_group_pa---of 4
__traceiter_ext4_mb_release_inode_pa---of 4
__traceiter_ext4_mballoc_alloc---of 4
__traceiter_ext4_mballoc_discard---of 4
__traceiter_ext4_mballoc_free---of 4
__traceiter_ext4_mballoc_prealloc---of 4
__traceiter_ext4_nfs_commit_metadata---of 4
__traceiter_ext4_other_inode_update_time---of 4
__traceiter_ext4_prefetch_bitmaps---of 4
__traceiter_ext4_punch_hole---of 4
__traceiter_ext4_read_block_bitmap_load---of 4
__traceiter_ext4_read_folio---of 4
__traceiter_ext4_release_folio---of 4
__traceiter_ext4_remove_blocks---of 4
__traceiter_ext4_request_blocks---of 4
__traceiter_ext4_request_inode---of 4
__traceiter_ext4_shutdown---of 4
__traceiter_ext4_sync_file_enter---of 4
__traceiter_ext4_sync_file_exit---of 4
__traceiter_ext4_sync_fs---of 4
__traceiter_ext4_trim_all_free---of 4
__traceiter_ext4_trim_extent---of 4
__traceiter_ext4_truncate_enter---of 4
__traceiter_ext4_truncate_exit---of 4
__traceiter_ext4_unlink_enter---of 4
__traceiter_ext4_unlink_exit---of 4
__traceiter_ext4_update_sb---of 4
__traceiter_ext4_write_begin---of 4
__traceiter_ext4_write_end---of 4
__traceiter_ext4_writepages---of 4
__traceiter_ext4_writepages_result---of 4
__traceiter_ext4_zero_range---of 4
_ext4_show_options---of 107
ext4_acquire_dquot---of 11
ext4_alloc_flex_bg_array---of 34
ext4_alloc_inode---of 3
ext4_apply_options---of 58
ext4_block_bitmap---of 3
ext4_block_bitmap_set---of 3
ext4_block_group_meta_init---of 26
ext4_calculate_overhead---of 54
ext4_check_feature_compatibility---of 38
ext4_check_geometry---of 26
ext4_check_journal_data_mode---of 10
ext4_check_opt_consistency---of 109
ext4_clear_inode---of 5
ext4_clear_journal_err---of 14
ext4_commit_super---of 15
ext4_decode_error---of 11
ext4_destroy_inode---of 5
ext4_drop_inode---of 18
ext4_enable_quotas---of 21
ext4_fast_commit_init---of 1
ext4_fc_free---of 3
ext4_feature_set_ok---of 7
ext4_fh_to_dentry---of 1
ext4_fh_to_parent---of 1
ext4_fill_flex_info---of 29
ext4_fill_super---of 253
ext4_flex_groups_free---of 20
ext4_force_commit---of 3
ext4_free_group_clusters---of 3
ext4_free_group_clusters_set---of 3
ext4_free_in_core_inode---of 3
ext4_free_inodes_count---of 3
ext4_free_inodes_set---of 3
ext4_freeze---of 5
ext4_get_dquots---of 1
ext4_get_journal_inode---of 6
ext4_get_stripe_size---of 12
ext4_get_tree---of 1
ext4_group_desc_csum---of 20
ext4_group_desc_csum_set---of 8
ext4_group_desc_csum_verify---of 10
ext4_group_desc_free---of 21
ext4_group_desc_init---of 94
ext4_handle_clustersize---of 12
ext4_handle_error---of 21
ext4_hash_info_init---of 6
ext4_init_fs_context---of 3
ext4_inode_bitmap---of 3
ext4_inode_bitmap_set---of 3
ext4_inode_table67%of 3
ext4_inode_table_set---of 3
ext4_itable_unused_count---of 3
ext4_itable_unused_set---of 3
ext4_journal_bmap---of 4
ext4_journal_commit_callback---of 17
ext4_journal_finish_inode_data_buffers---of 3
ext4_journal_submit_inode_data_buffers---of 3
ext4_journalled_writepage_callback---of 8
ext4_kill_sb---of 4
ext4_lazyinit_thread---of 88
ext4_load_and_init_journal---of 87
ext4_mark_dquot_dirty---of 5
ext4_mark_group_bitmap_corrupted---of 9
ext4_mark_recovery_complete---of 10
ext4_nfs_commit_metadata---of 15
ext4_nfs_get_inode---of 4
ext4_parse_param---of 68
ext4_parse_test_dummy_encryption---of 4
ext4_percpu_param_destroy---of 1
ext4_percpu_param_init---of 10
ext4_put_super---of 36
ext4_quota_off---of 12
ext4_quota_on---of 13
ext4_quota_read---of 12
ext4_quota_write---of 17
ext4_read_bh---of 8
ext4_read_bh_lock---of 5
ext4_read_bh_nowait---of 7
ext4_reconfigure---of 144
ext4_register_li_request---of 25
ext4_release_dquot---of 11
ext4_sb_bread---of 1
ext4_sb_bread_unmovable---of 1
ext4_sb_breadahead_unmovable---of 4
ext4_seq_options_show---of 1
ext4_set_resv_clusters---of 6
ext4_setup_super---of 23
ext4_show_options---of 1
ext4_shutdown---of 1
ext4_statfs---of 31
ext4_superblock_csum---of 4
ext4_superblock_csum_set---of 9
ext4_sync_fs---of 25
ext4_unfreeze---of 8
ext4_unregister_li_request---of 5
ext4_update_dynamic_rev---of 3
ext4_update_super---of 53
ext4_used_dirs_count---of 3
ext4_used_dirs_set---of 3
ext4_write_dquot---of 8
ext4_write_info---of 3
init_once---of 1
note_qf_name---of 7
perf_trace_ext4__bitmap_load---of 8
perf_trace_ext4__es_extent---of 8
perf_trace_ext4__es_shrink_enter---of 8
perf_trace_ext4__fallocate_mode---of 8
perf_trace_ext4__folio_op---of 8
perf_trace_ext4__map_blocks_enter---of 8
perf_trace_ext4__map_blocks_exit---of 8
perf_trace_ext4__mb_new_pa---of 8
perf_trace_ext4__mballoc---of 9
perf_trace_ext4__trim---of 8
perf_trace_ext4__truncate---of 8
perf_trace_ext4__write_begin---of 8
perf_trace_ext4__write_end---of 8
perf_trace_ext4_alloc_da_blocks---of 8
perf_trace_ext4_allocate_blocks---of 8
perf_trace_ext4_allocate_inode---of 8
perf_trace_ext4_begin_ordered_truncate---of 8
perf_trace_ext4_collapse_range---of 8
perf_trace_ext4_da_release_space---of 8
perf_trace_ext4_da_reserve_space---of 8
perf_trace_ext4_da_update_reserve_space---of 8
perf_trace_ext4_da_write_pages---of 8
perf_trace_ext4_da_write_pages_extent---of 8
perf_trace_ext4_discard_blocks---of 8
perf_trace_ext4_discard_preallocations---of 8
perf_trace_ext4_drop_inode---of 8
perf_trace_ext4_error---of 8
perf_trace_ext4_es_find_extent_range_enter---of 8
perf_trace_ext4_es_find_extent_range_exit---of 8
perf_trace_ext4_es_insert_delayed_block---of 8
perf_trace_ext4_es_lookup_extent_enter---of 8
perf_trace_ext4_es_lookup_extent_exit---of 8
perf_trace_ext4_es_remove_extent---of 8
perf_trace_ext4_es_shrink---of 8
perf_trace_ext4_es_shrink_scan_exit---of 8
perf_trace_ext4_evict_inode---of 8
perf_trace_ext4_ext_convert_to_initialized_enter---of 8
perf_trace_ext4_ext_convert_to_initialized_fastpath---of 8
perf_trace_ext4_ext_handle_unwritten_extents---of 8
perf_trace_ext4_ext_load_extent---of 8
perf_trace_ext4_ext_remove_space---of 8
perf_trace_ext4_ext_remove_space_done---of 8
perf_trace_ext4_ext_rm_idx---of 8
perf_trace_ext4_ext_rm_leaf---of 8
perf_trace_ext4_ext_show_extent---of 8
perf_trace_ext4_fallocate_exit---of 8
perf_trace_ext4_fc_cleanup---of 8
perf_trace_ext4_fc_commit_start---of 8
perf_trace_ext4_fc_commit_stop---of 8
perf_trace_ext4_fc_replay---of 8
perf_trace_ext4_fc_replay_scan---of 8
perf_trace_ext4_fc_stats---of 8
perf_trace_ext4_fc_track_dentry---of 8
perf_trace_ext4_fc_track_inode---of 8
perf_trace_ext4_fc_track_range---of 8
perf_trace_ext4_forget---of 8
perf_trace_ext4_free_blocks---of 8
perf_trace_ext4_free_inode---of 8
perf_trace_ext4_fsmap_class---of 8
perf_trace_ext4_get_implied_cluster_alloc_exit---of 8
perf_trace_ext4_getfsmap_class---of 8
perf_trace_ext4_insert_range---of 8
perf_trace_ext4_invalidate_folio_op---of 8
perf_trace_ext4_journal_start_inode---of 8
perf_trace_ext4_journal_start_reserved---of 8
perf_trace_ext4_journal_start_sb---of 8
perf_trace_ext4_lazy_itable_init---of 8
perf_trace_ext4_load_inode---of 8
perf_trace_ext4_mark_inode_dirty---of 8
perf_trace_ext4_mb_discard_preallocations---of 8
perf_trace_ext4_mb_release_group_pa---of 8
perf_trace_ext4_mb_release_inode_pa---of 8
perf_trace_ext4_mballoc_alloc---of 8
perf_trace_ext4_mballoc_prealloc---of 8
perf_trace_ext4_nfs_commit_metadata---of 8
perf_trace_ext4_other_inode_update_time---of 8
perf_trace_ext4_prefetch_bitmaps---of 8
perf_trace_ext4_read_block_bitmap_load---of 8
perf_trace_ext4_remove_blocks---of 8
perf_trace_ext4_request_blocks---of 8
perf_trace_ext4_request_inode---of 8
perf_trace_ext4_shutdown---of 8
perf_trace_ext4_sync_file_enter---of 8
perf_trace_ext4_sync_file_exit---of 8
perf_trace_ext4_sync_fs---of 8
perf_trace_ext4_unlink_enter---of 8
perf_trace_ext4_unlink_exit---of 8
perf_trace_ext4_update_sb---of 8
perf_trace_ext4_writepages---of 8
perf_trace_ext4_writepages_result---of 8
print_daily_error_info---of 13
register_as_ext2---of 3
register_as_ext3---of 3
trace_event_raw_event_ext4__bitmap_load---of 7
trace_event_raw_event_ext4__es_extent---of 7
trace_event_raw_event_ext4__es_shrink_enter---of 7
trace_event_raw_event_ext4__fallocate_mode---of 7
trace_event_raw_event_ext4__folio_op---of 7
trace_event_raw_event_ext4__map_blocks_enter---of 7
trace_event_raw_event_ext4__map_blocks_exit---of 7
trace_event_raw_event_ext4__mb_new_pa---of 7
trace_event_raw_event_ext4__mballoc---of 8
trace_event_raw_event_ext4__trim---of 7
trace_event_raw_event_ext4__truncate---of 7
trace_event_raw_event_ext4__write_begin---of 7
trace_event_raw_event_ext4__write_end---of 7
trace_event_raw_event_ext4_alloc_da_blocks---of 7
trace_event_raw_event_ext4_allocate_blocks---of 7
trace_event_raw_event_ext4_allocate_inode---of 7
trace_event_raw_event_ext4_begin_ordered_truncate---of 7
trace_event_raw_event_ext4_collapse_range---of 7
trace_event_raw_event_ext4_da_release_space---of 7
trace_event_raw_event_ext4_da_reserve_space---of 7
trace_event_raw_event_ext4_da_update_reserve_space---of 7
trace_event_raw_event_ext4_da_write_pages---of 7
trace_event_raw_event_ext4_da_write_pages_extent---of 7
trace_event_raw_event_ext4_discard_blocks---of 7
trace_event_raw_event_ext4_discard_preallocations---of 7
trace_event_raw_event_ext4_drop_inode---of 7
trace_event_raw_event_ext4_error---of 7
trace_event_raw_event_ext4_es_find_extent_range_enter---of 7
trace_event_raw_event_ext4_es_find_extent_range_exit---of 7
trace_event_raw_event_ext4_es_insert_delayed_block---of 7
trace_event_raw_event_ext4_es_lookup_extent_enter---of 7
trace_event_raw_event_ext4_es_lookup_extent_exit---of 7
trace_event_raw_event_ext4_es_remove_extent---of 7
trace_event_raw_event_ext4_es_shrink---of 7
trace_event_raw_event_ext4_es_shrink_scan_exit---of 7
trace_event_raw_event_ext4_evict_inode---of 7
trace_event_raw_event_ext4_ext_convert_to_initialized_enter---of 7
trace_event_raw_event_ext4_ext_convert_to_initialized_fastpath---of 7
trace_event_raw_event_ext4_ext_handle_unwritten_extents---of 7
trace_event_raw_event_ext4_ext_load_extent---of 7
trace_event_raw_event_ext4_ext_remove_space---of 7
trace_event_raw_event_ext4_ext_remove_space_done---of 7
trace_event_raw_event_ext4_ext_rm_idx---of 7
trace_event_raw_event_ext4_ext_rm_leaf---of 7
trace_event_raw_event_ext4_ext_show_extent---of 7
trace_event_raw_event_ext4_fallocate_exit---of 7
trace_event_raw_event_ext4_fc_cleanup---of 7
trace_event_raw_event_ext4_fc_commit_start---of 7
trace_event_raw_event_ext4_fc_commit_stop---of 7
trace_event_raw_event_ext4_fc_replay---of 7
trace_event_raw_event_ext4_fc_replay_scan---of 7
trace_event_raw_event_ext4_fc_stats---of 7
trace_event_raw_event_ext4_fc_track_dentry---of 7
trace_event_raw_event_ext4_fc_track_inode---of 7
trace_event_raw_event_ext4_fc_track_range---of 7
trace_event_raw_event_ext4_forget---of 7
trace_event_raw_event_ext4_free_blocks---of 7
trace_event_raw_event_ext4_free_inode---of 7
trace_event_raw_event_ext4_fsmap_class---of 7
trace_event_raw_event_ext4_get_implied_cluster_alloc_exit---of 7
trace_event_raw_event_ext4_getfsmap_class---of 7
trace_event_raw_event_ext4_insert_range---of 7
trace_event_raw_event_ext4_invalidate_folio_op---of 7
trace_event_raw_event_ext4_journal_start_inode---of 7
trace_event_raw_event_ext4_journal_start_reserved---of 7
trace_event_raw_event_ext4_journal_start_sb---of 7
trace_event_raw_event_ext4_lazy_itable_init---of 7
trace_event_raw_event_ext4_load_inode---of 7
trace_event_raw_event_ext4_mark_inode_dirty---of 7
trace_event_raw_event_ext4_mb_discard_preallocations---of 7
trace_event_raw_event_ext4_mb_release_group_pa---of 7
trace_event_raw_event_ext4_mb_release_inode_pa---of 7
trace_event_raw_event_ext4_mballoc_alloc---of 7
trace_event_raw_event_ext4_mballoc_prealloc---of 7
trace_event_raw_event_ext4_nfs_commit_metadata---of 7
trace_event_raw_event_ext4_other_inode_update_time---of 7
trace_event_raw_event_ext4_prefetch_bitmaps---of 7
trace_event_raw_event_ext4_read_block_bitmap_load---of 7
trace_event_raw_event_ext4_remove_blocks---of 7
trace_event_raw_event_ext4_request_blocks---of 7
trace_event_raw_event_ext4_request_inode---of 7
trace_event_raw_event_ext4_shutdown---of 7
trace_event_raw_event_ext4_sync_file_enter---of 7
trace_event_raw_event_ext4_sync_file_exit---of 7
trace_event_raw_event_ext4_sync_fs---of 7
trace_event_raw_event_ext4_unlink_enter---of 7
trace_event_raw_event_ext4_unlink_exit---of 7
trace_event_raw_event_ext4_update_sb---of 7
trace_event_raw_event_ext4_writepages---of 7
trace_event_raw_event_ext4_writepages_result---of 7
trace_ext4_error---of 15
trace_raw_output_ext4__bitmap_load---of 3
trace_raw_output_ext4__es_extent---of 3
trace_raw_output_ext4__es_shrink_enter---of 3
trace_raw_output_ext4__fallocate_mode---of 3
trace_raw_output_ext4__folio_op---of 3
trace_raw_output_ext4__map_blocks_enter---of 3
trace_raw_output_ext4__map_blocks_exit---of 3
trace_raw_output_ext4__mb_new_pa---of 3
trace_raw_output_ext4__mballoc---of 3
trace_raw_output_ext4__trim---of 3
trace_raw_output_ext4__truncate---of 3
trace_raw_output_ext4__write_begin---of 3
trace_raw_output_ext4__write_end---of 3
trace_raw_output_ext4_alloc_da_blocks---of 3
trace_raw_output_ext4_allocate_blocks---of 3
trace_raw_output_ext4_allocate_inode---of 3
trace_raw_output_ext4_begin_ordered_truncate---of 3
trace_raw_output_ext4_collapse_range---of 3
trace_raw_output_ext4_da_release_space---of 3
trace_raw_output_ext4_da_reserve_space---of 3
trace_raw_output_ext4_da_update_reserve_space---of 3
trace_raw_output_ext4_da_write_pages---of 3
trace_raw_output_ext4_da_write_pages_extent---of 3
trace_raw_output_ext4_discard_blocks---of 3
trace_raw_output_ext4_discard_preallocations---of 3
trace_raw_output_ext4_drop_inode---of 3
trace_raw_output_ext4_error---of 3
trace_raw_output_ext4_es_find_extent_range_enter---of 3
trace_raw_output_ext4_es_find_extent_range_exit---of 3
trace_raw_output_ext4_es_insert_delayed_block---of 3
trace_raw_output_ext4_es_lookup_extent_enter---of 3
trace_raw_output_ext4_es_lookup_extent_exit---of 4
trace_raw_output_ext4_es_remove_extent---of 3
trace_raw_output_ext4_es_shrink---of 3
trace_raw_output_ext4_es_shrink_scan_exit---of 3
trace_raw_output_ext4_evict_inode---of 3
trace_raw_output_ext4_ext_convert_to_initialized_enter---of 3
trace_raw_output_ext4_ext_convert_to_initialized_fastpath---of 3
trace_raw_output_ext4_ext_handle_unwritten_extents---of 3
trace_raw_output_ext4_ext_load_extent---of 3
trace_raw_output_ext4_ext_remove_space---of 3
trace_raw_output_ext4_ext_remove_space_done---of 3
trace_raw_output_ext4_ext_rm_idx---of 3
trace_raw_output_ext4_ext_rm_leaf---of 3
trace_raw_output_ext4_ext_show_extent---of 3
trace_raw_output_ext4_fallocate_exit---of 3
trace_raw_output_ext4_fc_cleanup---of 3
trace_raw_output_ext4_fc_commit_start---of 3
trace_raw_output_ext4_fc_commit_stop---of 3
trace_raw_output_ext4_fc_replay---of 3
trace_raw_output_ext4_fc_replay_scan---of 3
trace_raw_output_ext4_fc_stats---of 3
trace_raw_output_ext4_fc_track_dentry---of 3
trace_raw_output_ext4_fc_track_inode---of 3
trace_raw_output_ext4_fc_track_range---of 3
trace_raw_output_ext4_forget---of 3
trace_raw_output_ext4_free_blocks---of 3
trace_raw_output_ext4_free_inode---of 3
trace_raw_output_ext4_fsmap_class---of 3
trace_raw_output_ext4_get_implied_cluster_alloc_exit---of 3
trace_raw_output_ext4_getfsmap_class---of 3
trace_raw_output_ext4_insert_range---of 3
trace_raw_output_ext4_invalidate_folio_op---of 3
trace_raw_output_ext4_journal_start_inode---of 3
trace_raw_output_ext4_journal_start_reserved---of 3
trace_raw_output_ext4_journal_start_sb---of 3
trace_raw_output_ext4_lazy_itable_init---of 3
trace_raw_output_ext4_load_inode---of 3
trace_raw_output_ext4_mark_inode_dirty---of 3
trace_raw_output_ext4_mb_discard_preallocations---of 3
trace_raw_output_ext4_mb_release_group_pa---of 3
trace_raw_output_ext4_mb_release_inode_pa---of 3
trace_raw_output_ext4_mballoc_alloc---of 5
trace_raw_output_ext4_mballoc_prealloc---of 3
trace_raw_output_ext4_nfs_commit_metadata---of 3
trace_raw_output_ext4_other_inode_update_time---of 3
trace_raw_output_ext4_prefetch_bitmaps---of 3
trace_raw_output_ext4_read_block_bitmap_load---of 3
trace_raw_output_ext4_remove_blocks---of 3
trace_raw_output_ext4_request_blocks---of 3
trace_raw_output_ext4_request_inode---of 3
trace_raw_output_ext4_shutdown---of 3
trace_raw_output_ext4_sync_file_enter---of 3
trace_raw_output_ext4_sync_file_exit---of 3
trace_raw_output_ext4_sync_fs---of 3
trace_raw_output_ext4_unlink_enter---of 3
trace_raw_output_ext4_unlink_exit---of 3
trace_raw_output_ext4_update_sb---of 3
trace_raw_output_ext4_writepages---of 3
trace_raw_output_ext4_writepages_result---of 3
update_super_work---of 12
-----------
SUMMARY67%of 3

__do_pipe_flags25%of 8
__ia32_sys_pipe---of 1
__ia32_sys_pipe2---of 1
__x64_sys_pipe100%of 1
__x64_sys_pipe2---of 1
_compound_head---of 7
account_pipe_buffers---of 1
alloc_pipe_info34%of 18
anon_pipe_buf_release---of 10
anon_pipe_buf_try_steal---of 29
create_pipe_files30%of 10
do_pipe250%of 4
do_pipe_flags---of 3
do_proc_dopipe_max_size_conv---of 8
fifo_open---of 29
free_pipe_info---of 12
generic_pipe_buf_get---of 9
generic_pipe_buf_release---of 1
generic_pipe_buf_try_steal---of 22
get_pipe_info40%of 5
pipe_double_lock---of 6
pipe_fasync---of 7
pipe_fcntl---of 24
pipe_ioctl---of 11
pipe_is_unprivileged_user---of 3
pipe_lock67%of 3
pipe_lock_cmp_fn---of 1
pipe_poll---of 14
pipe_read---of 39
pipe_release---of 9
pipe_resize_ring---of 11
pipe_unlock67%of 3
pipe_wait_readable---of 10
pipe_wait_writable---of 12
pipe_write---of 64
pipefs_dname---of 1
pipefs_init_fs_context---of 3
proc_dopipe_max_size---of 1
put_page---of 14
round_pipe_size---of 5
too_many_pipe_buffers_hard---of 1
too_many_pipe_buffers_soft---of 1
wait_for_partner---of 9
-----------
SUMMARY39%of 52

-----------
SUMMARY---of 0

__nf_hook_entries_free---of 1
__nf_hook_entries_try_shrink---of 24
__nf_register_net_hook---of 38
__nf_unregister_net_hook---of 30
accept_all---of 1
netfilter_net_exit---of 1
netfilter_net_init---of 4
nf_conntrack_destroy---of 21
nf_ct_attach---of 19
nf_ct_get_tuple_skb---of 18
nf_ct_set_closing---of 19
nf_hook_entries_delete_raw---of 9
nf_hook_entries_grow---of 21
nf_hook_entries_insert_raw---of 11
nf_hook_entry_head---of 19
nf_hook_slow40%of 10
nf_hook_slow_list---of 11
nf_register_net_hook---of 8
nf_register_net_hooks---of 10
nf_unregister_net_hook---of 4
nf_unregister_net_hooks---of 7
-----------
SUMMARY40%of 10

__bpf_trace_bpf_trace_printk---of 1
__probestub_bpf_trace_printk---of 1
__traceiter_bpf_trace_printk---of 4
bpf_current_task_under_cgroup---of 15
bpf_d_path---of 5
bpf_d_path_allowed---of 5
bpf_event_notify---of 10
bpf_event_output---of 21
bpf_get_attach_cookie_kprobe_multi---of 1
bpf_get_attach_cookie_pe---of 1
bpf_get_attach_cookie_trace---of 1
bpf_get_attach_cookie_tracing---of 1
bpf_get_attach_cookie_uprobe_multi---of 1
bpf_get_branch_snapshot---of 4
bpf_get_current_task---of 1
bpf_get_current_task_btf---of 1
bpf_get_file_xattr---of 7
bpf_get_file_xattr_filter---of 3
bpf_get_func_ip_kprobe---of 3
bpf_get_func_ip_kprobe_multi---of 1
bpf_get_func_ip_tracing---of 1
bpf_get_func_ip_uprobe_multi---of 1
bpf_get_perf_event_info---of 11
bpf_get_probe_write_proto---of 4
bpf_get_raw_tracepoint---of 12
bpf_get_stack_raw_tp---of 8
bpf_get_stack_tp---of 1
bpf_get_stackid_raw_tp---of 8
bpf_get_stackid_tp---of 1
bpf_get_trace_printk_proto---of 4
bpf_get_trace_vprintk_proto---of 4
bpf_key_put---of 3
bpf_kprobe_multi_filter---of 3
bpf_kprobe_multi_link_attach---of 1
bpf_lookup_system_key---of 4
bpf_lookup_user_key---of 5
bpf_perf_event_output---of 18
bpf_perf_event_output_raw_tp---of 24
bpf_perf_event_output_tp---of 18
bpf_perf_event_read---of 6
bpf_perf_event_read_value---of 7
bpf_perf_prog_read_value---of 4
bpf_probe_read_compat---of 7
bpf_probe_read_compat_str---of 7
bpf_probe_read_kernel---of 3
bpf_probe_read_kernel_str---of 3
bpf_probe_read_user---of 3
bpf_probe_read_user_str---of 3
bpf_probe_register---of 4
bpf_probe_unregister---of 1
bpf_probe_write_user---of 5
bpf_put_raw_tracepoint---of 3
bpf_read_branch_records---of 6
bpf_send_signal---of 1
bpf_send_signal_common---of 12
bpf_send_signal_thread---of 1
bpf_seq_printf---of 5
bpf_seq_printf_btf---of 9
bpf_seq_write---of 1
bpf_session_cookie---of 1
bpf_session_is_return---of 1
bpf_snprintf_btf---of 9
bpf_task_pt_regs---of 1
bpf_trace_printk---of 3
bpf_trace_run1---of 15
bpf_trace_run10---of 15
bpf_trace_run11---of 15
bpf_trace_run12---of 15
bpf_trace_run227%of 15
bpf_trace_run327%of 15
bpf_trace_run427%of 15
bpf_trace_run5---of 15
bpf_trace_run6---of 15
bpf_trace_run7---of 15
bpf_trace_run8---of 15
bpf_trace_run9---of 15
bpf_trace_vprintk---of 5
bpf_tracing_func_proto---of 68
bpf_uprobe_multi_link_attach---of 33
bpf_uprobe_multi_link_dealloc---of 1
bpf_uprobe_multi_link_fill_link_info---of 24
bpf_uprobe_multi_link_release---of 10
bpf_uprobe_unregister---of 6
bpf_verify_pkcs7_signature---of 4
btf_id_cmp_func---of 1
do_bpf_send_signal---of 4
get_func_arg---of 3
get_func_arg_cnt---of 1
get_func_ret---of 1
kprobe_prog_func_proto---of 11
kprobe_prog_is_valid_access---of 4
pe_prog_convert_ctx_access---of 4
pe_prog_func_proto---of 8
pe_prog_is_valid_access---of 25
perf_event_attach_bpf_prog---of 13
perf_event_detach_bpf_prog---of 10
perf_event_query_prog_array---of 16
perf_trace_bpf_trace_printk---of 8
put_task_struct---of 4
raw_tp_prog_func_proto---of 6
raw_tp_prog_is_valid_access---of 3
raw_tp_writable_prog_is_valid_access---of 6
rcu_read_unlock---of 6
tp_prog_func_proto---of 6
tp_prog_is_valid_access---of 3
trace_bpf_trace_printk---of 15
trace_call_bpf---of 42
trace_event_raw_event_bpf_trace_printk---of 7
trace_raw_output_bpf_trace_printk---of 3
tracing_prog_func_proto---of 31
tracing_prog_is_valid_access---of 4
uprobe_multi_link_filter---of 1
uprobe_multi_link_handler---of 1
uprobe_multi_link_ret_handler---of 1
uprobe_prog_run---of 21
-----------
SUMMARY27%of 45

sctp_queue_purge_ulpevents50%of 6
sctp_ulpevent_free34%of 6
sctp_ulpevent_get_notification_type100%of 1
sctp_ulpevent_is_notification100%of 1
sctp_ulpevent_make_adaptation_indication---of 14
sctp_ulpevent_make_assoc_change36%of 17
sctp_ulpevent_make_assoc_reset_event---of 14
sctp_ulpevent_make_authkey36%of 14
sctp_ulpevent_make_pdapi---of 14
sctp_ulpevent_make_rcvmsg---of 11
sctp_ulpevent_make_remote_error---of 13
sctp_ulpevent_make_send_failed---of 13
sctp_ulpevent_make_send_failed_event---of 13
sctp_ulpevent_make_sender_dry_event---of 14
sctp_ulpevent_make_shutdown_event---of 14
sctp_ulpevent_make_stream_change_event---of 14
sctp_ulpevent_make_stream_reset_event---of 21
sctp_ulpevent_notify_peer_addr_change13%of 16
sctp_ulpevent_read_nxtinfo---of 6
sctp_ulpevent_read_rcvinfo---of 4
sctp_ulpevent_read_sndrcvinfo---of 4
sctp_ulpevent_receive_data---of 14
sctp_ulpevent_release_frag_data---of 5
-----------
SUMMARY33%of 61

can_respond---of 17
dev_put---of 4
phonet_proto_get---of 19
phonet_proto_register---of 5
phonet_proto_unregister---of 6
phonet_rcv---of 35
pn_header_create---of 4
pn_header_parse---of 3
pn_raw_send---of 6
pn_send---of 15
pn_skb_send24%of 17
pn_socket_create---of 13
send_obj_unreachable---of 5
send_reset_indications---of 1
skb_cow_head---of 5
-----------
SUMMARY24%of 17

-----------
SUMMARY---of 0

ipvlan_l3_rcv---of 7
ipvlan_l3s_cleanup---of 1
ipvlan_l3s_init---of 1
ipvlan_l3s_register---of 7
ipvlan_l3s_unregister---of 7
ipvlan_migrate_l3s_hook---of 11
ipvlan_nf_input50%of 4
ipvlan_ns_exit---of 3
ipvlan_skb_to_addr17%of 12
net_generic---of 16
-----------
SUMMARY25%of 16

__bpf_trace_task_newtask---of 1
__bpf_trace_task_rename---of 1
__cleanup_sighand---of 4
__delayed_free_task---of 1
__ia32_sys_clone---of 1
__ia32_sys_clone3---of 1
__ia32_sys_set_tid_address---of 1
__ia32_sys_unshare---of 1
__mas_set_range---of 8
__mmdrop---of 22
__mmput---of 15
__pidfd_prepare---of 4
__probestub_task_newtask---of 1
__probestub_task_rename---of 1
__put_task_struct---of 14
__put_task_struct_rcu_cb---of 1
__se_sys_clone3---of 11
__traceiter_task_newtask---of 4
__traceiter_task_rename---of 4
__vm_area_free---of 5
__x64_sys_clone---of 1
__x64_sys_clone3---of 1
__x64_sys_fork---of 1
__x64_sys_set_tid_address---of 1
__x64_sys_unshare100%of 1
__x64_sys_vfork---of 1
account_kernel_stack---of 15
copy_clone_args_from_user---of 19
copy_files---of 6
copy_fs---of 4
copy_mm---of 101
copy_oom_score_adj---of 3
copy_process---of 147
copy_seccomp---of 6
copy_sighand---of 7
copy_signal---of 4
create_io_thread---of 1
dup_task_struct---of 20
exec_mm_release---of 1
exit_mm_release---of 1
exit_task_stack_account---of 17
fatal_signal_pending---of 3
free_signal_struct---of 6
free_task---of 7
free_vm_stack_cache---of 7
get_mm_exe_file---of 11
get_task_exe_file---of 4
get_task_mm---of 4
idle_dummy---of 1
kernel_clone---of 40
kernel_thread---of 1
ksys_unshare36%of 48
lockdep_tasklist_lock_is_held100%of 1
memcg_charge_kernel_stack---of 30
mm_access---of 8
mm_alloc---of 3
mm_init---of 17
mm_release---of 15
mmdrop_async_fn---of 1
mmput---of 3
mmput_async---of 3
mmput_async_fn---of 1
nr_processes---of 5
perf_trace_task_newtask---of 8
perf_trace_task_rename---of 8
pidfd_prepare---of 6
ptrace_event_pid---of 21
ptrace_init_task---of 5
put_cred---of 3
put_task_stack---of 7
rcu_copy_process---of 1
rcu_read_unlock---of 6
refcount_inc---of 4
replace_mm_exe_file---of 34
set_mm_exe_file---of 10
set_task_stack_end_magic---of 1
sighand_ctor---of 1
syscall_tracepoint_update---of 3
sysctl_max_threads---of 3
thread_stack_free_rcu---of 4
trace_event_raw_event_task_newtask---of 7
trace_event_raw_event_task_rename---of 7
trace_raw_output_task_newtask---of 3
trace_raw_output_task_rename---of 3
trace_task_newtask---of 15
tty_kref_get---of 5
unshare_fd---of 5
unshare_files---of 5
user_mode_thread---of 1
vm_area_alloc---of 4
vm_area_dup25%of 8
vm_area_free100%of 1
vm_area_free_rcu_cb---of 3
walk_process_tree---of 8
-----------
SUMMARY38%of 59

-----------
SUMMARY---of 0

__fib6_clean_all---of 15
__fib6_drop_pcpu_from---of 23
__fib6_update_sernum_upto_root---of 14
call_fib6_entry_notifiers---of 1
call_fib6_entry_notifiers_replace---of 1
call_fib6_multipath_entry_notifiers---of 1
fib6_add---of 218
fib6_add_1---of 70
fib6_clean_all---of 1
fib6_clean_all_skip_notify---of 1
fib6_clean_node---of 16
fib6_del---of 79
fib6_dump_done---of 3
fib6_dump_end---of 5
fib6_dump_node---of 7
fib6_dump_table---of 8
fib6_find_prefix---of 25
fib6_flush_trees---of 3
fib6_force_start_gc---of 3
fib6_gc_cleanup---of 1
fib6_gc_timer_cb---of 1
fib6_get_table---of 17
fib6_info_alloc---of 3
fib6_info_destroy_rcu---of 11
fib6_locate---of 11
fib6_locate_1---of 33
fib6_metric_set---of 7
fib6_net_exit---of 7
fib6_net_init---of 10
fib6_new_table---of 5
fib6_nh_drop_pcpu_from---of 1
fib6_node_dump---of 5
fib6_node_lookup50%of 4
fib6_node_lookup_123%of 45
fib6_purge_rt---of 33
fib6_repair_tree---of 82
fib6_run_gc---of 30
fib6_tables_dump---of 6
fib6_tables_seq_read---of 18
fib6_update_sernum---of 9
fib6_update_sernum_stub---of 3
fib6_update_sernum_upto_root---of 3
fib6_walk---of 6
fib6_walk_continue---of 28
inet6_dump_fib---of 37
ipv6_route_seq_next---of 20
ipv6_route_seq_next_table---of 15
ipv6_route_seq_show---of 21
ipv6_route_seq_start---of 15
ipv6_route_seq_stop---of 15
ipv6_route_yield---of 10
node_free_rcu---of 1
-----------
SUMMARY25%of 49

crypto_alloc_shash---of 1
crypto_clone_shash---of 15
crypto_grab_shash---of 1
crypto_has_shash---of 1
crypto_register_shash---of 16
crypto_register_shashes---of 23
crypto_shash_digest---of 3
crypto_shash_exit_tfm---of 1
crypto_shash_export---of 3
crypto_shash_final---of 1
crypto_shash_finup---of 1
crypto_shash_free_instance---of 1
crypto_shash_import---of 4
crypto_shash_init_tfm---of 11
crypto_shash_report---of 1
crypto_shash_setkey---of 5
crypto_shash_show---of 1
crypto_shash_tfm_digest---of 3
crypto_shash_update100%of 1
crypto_unregister_shash---of 1
crypto_unregister_shashes---of 4
hash_prepare_alg---of 4
shash_default_digest---of 3
shash_default_finup---of 3
shash_free_singlespawn_instance---of 1
shash_no_setkey---of 1
shash_register_instance---of 17
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

tomoyo_convert_time---of 1
tomoyo_correct_domain---of 15
tomoyo_correct_path---of 4
tomoyo_correct_word---of 1
tomoyo_correct_word2---of 31
tomoyo_domain_def---of 6
tomoyo_domain_quota_is_ok12%of 18
tomoyo_file_matches_pattern2---of 62
tomoyo_fill_path_info27%of 15
tomoyo_find_domain---of 8
tomoyo_get_domainname---of 11
tomoyo_get_exe---of 4
tomoyo_get_mode45%of 9
tomoyo_init_request_info55%of 11
tomoyo_normalize_line---of 11
tomoyo_parse_name_union---of 6
tomoyo_parse_number_union---of 19
tomoyo_parse_ulong---of 7
tomoyo_path_matches_pattern34%of 6
tomoyo_path_matches_pattern2---of 41
tomoyo_permstr---of 4
tomoyo_print_ulong60%of 5
tomoyo_read_token---of 3
tomoyo_str_starts---of 3
-----------
SUMMARY33%of 64

__kvm_cpuid_base---of 6
__send_ipi_mask---of 25
__sysvec_kvm_asyncpf_interrupt---of 4
apf_task_wake_all---of 12
arch_haltpoll_disable---of 5
arch_haltpoll_enable---of 8
fred_sysvec_kvm_asyncpf_interrupt---of 4
kvm_arch_para_features---of 3
kvm_arch_para_hints---of 3
kvm_async_pf_task_wait_schedule---of 15
kvm_async_pf_task_wake---of 12
kvm_cpu_down_prepare---of 7
kvm_cpu_online---of 7
kvm_crash_shutdown---of 1
kvm_disable_host_haltpoll---of 3
kvm_enable_host_haltpoll---of 3
kvm_flush_tlb_multi---of 10
kvm_guest_apic_eoi_write---of 5
kvm_guest_cpu_init---of 32
kvm_guest_cpu_offline---of 21
kvm_io_delay---of 1
kvm_kick_cpu---of 3
kvm_para_available---of 3
kvm_pv_guest_cpu_reboot---of 1
kvm_pv_reboot_notify---of 3
kvm_resume---of 13
kvm_send_ipi_mask---of 1
kvm_send_ipi_mask_allbutself---of 5
kvm_smp_send_call_func_ipi---of 9
kvm_steal_clock---of 6
kvm_suspend---of 6
kvm_wait20%of 10
pv_ipi_supported---of 5
pv_tlb_flush_supported---of 12
-----------
SUMMARY20%of 10

__anon_vma_interval_tree_augment_rotate---of 5
anon_vma_interval_tree_insert29%of 7
anon_vma_interval_tree_iter_first---of 12
anon_vma_interval_tree_iter_next---of 16
anon_vma_interval_tree_remove---of 42
anon_vma_interval_tree_verify60%of 5
vma_interval_tree_augment_rotate---of 5
vma_interval_tree_insert---of 7
vma_interval_tree_insert_after---of 10
vma_interval_tree_iter_first---of 12
vma_interval_tree_iter_next---of 16
vma_interval_tree_remove---of 42
-----------
SUMMARY42%of 12

bdi_alloc---of 4
bdi_debug_stats_open---of 1
bdi_debug_stats_show---of 18
bdi_dev_name---of 3
bdi_get_by_id---of 13
bdi_init---of 3
bdi_put---of 9
bdi_register---of 1
bdi_register_va---of 32
bdi_set_owner---of 3
bdi_unregister---of 19
cgwb_debug_stats_open---of 1
cgwb_debug_stats_show---of 29
cgwb_free_rcu---of 1
cgwb_kill---of 12
cgwb_release---of 1
cgwb_release_workfn---of 11
cleanup_offline_cgwbs_workfn---of 19
collect_wb_stats---of 15
css_get---of 14
inode_to_bdi50%of 4
max_bytes_show---of 1
max_bytes_store---of 3
max_ratio_fine_show---of 1
max_ratio_fine_store---of 3
max_ratio_show---of 1
max_ratio_store---of 3
min_bytes_show---of 1
min_bytes_store---of 3
min_ratio_fine_show---of 1
min_ratio_fine_store---of 3
min_ratio_show---of 1
min_ratio_store---of 3
percpu_ref_put_many---of 14
read_ahead_kb_show---of 1
read_ahead_kb_store---of 3
stable_pages_required_show---of 3
strict_limit_show---of 1
strict_limit_store---of 3
wb_blkcg_offline---of 4
wb_get_create---of 32
wb_get_lookup---of 18
wb_init---of 4
wb_memcg_offline---of 4
wb_shutdown---of 6
wb_tryget---of 17
wb_update_bandwidth_workfn---of 1
-----------
SUMMARY50%of 4

-----------
SUMMARY---of 0

__bpf_trace_cgroup---of 1
__bpf_trace_cgroup_event---of 1
__bpf_trace_cgroup_migrate---of 1
__bpf_trace_cgroup_root---of 1
__bpf_trace_cgroup_rstat---of 1
__cgroup_procs_start---of 12
__cgroup_procs_write---of 9
__cgroup_task_count---of 7
__probestub_cgroup_attach_task---of 1
__probestub_cgroup_destroy_root---of 1
__probestub_cgroup_freeze---of 1
__probestub_cgroup_mkdir---of 1
__probestub_cgroup_notify_frozen---of 1
__probestub_cgroup_notify_populated---of 1
__probestub_cgroup_release---of 1
__probestub_cgroup_remount---of 1
__probestub_cgroup_rename---of 1
__probestub_cgroup_rmdir---of 1
__probestub_cgroup_rstat_cpu_lock_contended---of 1
__probestub_cgroup_rstat_cpu_lock_contended_fastpath---of 1
__probestub_cgroup_rstat_cpu_locked---of 1
__probestub_cgroup_rstat_cpu_locked_fastpath---of 1
__probestub_cgroup_rstat_cpu_unlock---of 1
__probestub_cgroup_rstat_cpu_unlock_fastpath---of 1
__probestub_cgroup_rstat_lock_contended---of 1
__probestub_cgroup_rstat_locked---of 1
__probestub_cgroup_rstat_unlock---of 1
__probestub_cgroup_setup_root---of 1
__probestub_cgroup_transfer_tasks---of 1
__probestub_cgroup_unfreeze---of 1
__traceiter_cgroup_attach_task---of 4
__traceiter_cgroup_destroy_root---of 4
__traceiter_cgroup_freeze---of 4
__traceiter_cgroup_mkdir---of 4
__traceiter_cgroup_notify_frozen---of 4
__traceiter_cgroup_notify_populated---of 4
__traceiter_cgroup_release---of 4
__traceiter_cgroup_remount---of 4
__traceiter_cgroup_rename---of 4
__traceiter_cgroup_rmdir---of 4
__traceiter_cgroup_rstat_cpu_lock_contended---of 4
__traceiter_cgroup_rstat_cpu_lock_contended_fastpath---of 4
__traceiter_cgroup_rstat_cpu_locked---of 4
__traceiter_cgroup_rstat_cpu_locked_fastpath---of 4
__traceiter_cgroup_rstat_cpu_unlock---of 4
__traceiter_cgroup_rstat_cpu_unlock_fastpath---of 4
__traceiter_cgroup_rstat_lock_contended---of 4
__traceiter_cgroup_rstat_locked---of 4
__traceiter_cgroup_rstat_unlock---of 4
__traceiter_cgroup_setup_root---of 4
__traceiter_cgroup_transfer_tasks---of 4
__traceiter_cgroup_unfreeze---of 4
allocate_cgrp_cset_links---of 11
cgroup2_parse_param---of 8
cgroup_add_cftypes---of 10
cgroup_add_dfl_cftypes---of 5
cgroup_add_legacy_cftypes---of 5
cgroup_addrm_files36%of 50
cgroup_apply_cftypes---of 18
cgroup_apply_control---of 32
cgroup_apply_control_enable35%of 44
cgroup_attach_lock---of 3
cgroup_attach_permissions---of 36
cgroup_attach_task---of 49
cgroup_attach_unlock---of 3
cgroup_can_fork---of 63
cgroup_cancel_fork---of 5
cgroup_control---of 5
cgroup_controllers_show---of 8
cgroup_cpu_pressure_show---of 6
cgroup_cpu_pressure_write---of 1
cgroup_css30%of 10
cgroup_css_set_put_fork---of 8
cgroup_destroy_locked---of 25
cgroup_do_get_tree---of 8
cgroup_e_css---of 5
cgroup_events_show---of 4
cgroup_exit---of 36
cgroup_favor_dynmods---of 4
cgroup_file_notify---of 5
cgroup_file_notify_timer---of 5
cgroup_file_open---of 12
cgroup_file_poll---of 3
cgroup_file_release---of 7
cgroup_file_show---of 3
cgroup_file_write---of 23
cgroup_finalize_control---of 33
cgroup_fork---of 1
cgroup_free---of 12
cgroup_free_root---of 3
cgroup_freeze_show---of 4
cgroup_freeze_write---of 8
cgroup_fs_context_free---of 5
cgroup_get_e_css---of 19
cgroup_get_from_fd---of 10
cgroup_get_from_id---of 30
cgroup_get_from_path---of 24
cgroup_get_live60%of 5
cgroup_get_tree---of 10
cgroup_idr_alloc40%of 10
cgroup_init_cftypes---of 18
cgroup_init_fs_context---of 16
cgroup_io_pressure_show---of 6
cgroup_io_pressure_write---of 1
cgroup_irq_pressure_show---of 6
cgroup_irq_pressure_write---of 1
cgroup_kill_sb---of 6
cgroup_kill_write---of 36
cgroup_kn_lock_live37%of 11
cgroup_kn_unlock---of 5
cgroup_lock_and_drain_offline---of 20
cgroup_max_depth_show---of 6
cgroup_max_depth_write---of 10
cgroup_max_descendants_show---of 6
cgroup_max_descendants_write---of 10
cgroup_memory_pressure_show---of 6
cgroup_memory_pressure_write---of 1
cgroup_migrate---of 7
cgroup_migrate_add_src---of 22
cgroup_migrate_add_task---of 27
cgroup_migrate_execute---of 56
cgroup_migrate_finish---of 14
cgroup_migrate_prepare_dst---of 26
cgroup_migrate_vet_dst---of 15
cgroup_mkdir37%of 38
cgroup_on_dfl---of 1
cgroup_parse_float---of 23
cgroup_path_from_kernfs_id---of 3
cgroup_path_ns---of 1
cgroup_path_ns_locked---of 1
cgroup_post_fork---of 24
cgroup_pressure_poll---of 1
cgroup_pressure_release---of 1
cgroup_pressure_show---of 6
cgroup_pressure_write---of 21
cgroup_print_ss_mask---of 11
cgroup_procs_next---of 3
cgroup_procs_release---of 3
cgroup_procs_show---of 1
cgroup_procs_start---of 6
cgroup_procs_write---of 1
cgroup_procs_write_finish---of 36
cgroup_procs_write_start---of 30
cgroup_propagate_control36%of 31
cgroup_psi_enabled---of 3
cgroup_reconfigure---of 5
cgroup_release---of 26
cgroup_rm_cftypes---of 5
cgroup_rm_cftypes_locked---of 12
cgroup_rmdir---of 22
cgroup_root_from_kf---of 1
cgroup_save_control---of 15
cgroup_seqfile_next---of 1
cgroup_seqfile_show---of 8
cgroup_seqfile_start---of 1
cgroup_seqfile_stop---of 3
cgroup_setup_root---of 41
cgroup_show_options---of 11
cgroup_show_path---of 28
cgroup_sk_alloc21%of 24
cgroup_sk_clone---of 3
cgroup_sk_free---of 3
cgroup_ssid_enabled---of 3
cgroup_stat_show---of 4
cgroup_subtree_control_show---of 4
cgroup_subtree_control_write---of 70
cgroup_task_count---of 7
cgroup_taskset_first---of 6
cgroup_taskset_next---of 6
cgroup_threadgroup_change_end---of 10
cgroup_threads_start---of 1
cgroup_threads_write---of 1
cgroup_tryget24%of 17
cgroup_tryget_css---of 13
cgroup_type_show---of 17
cgroup_type_write---of 42
cgroup_update_populated---of 29
cgroup_v1v2_get_from_fd---of 6
cpu_local_stat_show---of 8
cpu_stat_show---of 8
cpuset_init_fs_context---of 3
cset_cgroup_from_root---of 12
css_clear_dir---of 9
css_free_rwork_fn---of 47
css_from_id---of 3
css_has_online_children---of 21
css_killed_ref_fn---of 3
css_killed_work_fn---of 14
css_next_child---of 12
css_next_descendant_post---of 25
css_next_descendant_pre22%of 19
css_populate_dir22%of 14
css_release---of 1
css_release_work_fn---of 28
css_rightmost_descendant---of 16
css_set_hash---of 1
css_set_move_task---of 46
css_task_iter_advance---of 20
css_task_iter_advance_css_set---of 32
css_task_iter_end---of 11
css_task_iter_next---of 11
css_task_iter_start---of 4
css_tryget_online---of 20
css_tryget_online_from_dir---of 22
css_visible38%of 16
delegate_show---of 24
features_show---of 1
find_css_set---of 57
init_and_link_css44%of 16
init_cgroup_housekeeping100%of 1
init_cgroup_root---of 7
kill_css---of 7
link_css_set---of 15
of_css---of 4
online_css46%of 11
percpu_ref_get31%of 13
percpu_ref_put29%of 14
perf_trace_cgroup---of 8
perf_trace_cgroup_event---of 8
perf_trace_cgroup_migrate---of 8
perf_trace_cgroup_root---of 8
perf_trace_cgroup_rstat---of 8
pressure_write---of 16
proc_cgroup_show---of 34
put_css_set---of 3
put_css_set_locked---of 31
rebind_subsystems---of 88
task_cgroup_from_root---of 10
task_dfl_cgroup---of 10
trace_cgroup_mkdir---of 15
trace_cgroup_setup_root---of 15
trace_event_raw_event_cgroup---of 7
trace_event_raw_event_cgroup_event---of 7
trace_event_raw_event_cgroup_migrate---of 7
trace_event_raw_event_cgroup_root---of 7
trace_event_raw_event_cgroup_rstat---of 7
trace_raw_output_cgroup---of 3
trace_raw_output_cgroup_event---of 3
trace_raw_output_cgroup_migrate---of 3
trace_raw_output_cgroup_root---of 3
trace_raw_output_cgroup_rstat---of 3
-----------
SUMMARY34%of 344

-----------
SUMMARY---of 0

__ip_append_data---of 146
__ip_finish_output31%of 13
__ip_local_out29%of 14
__ip_make_skb---of 54
__ip_queue_xmit23%of 58
dst_output38%of 8
ip_append_data---of 7
ip_build_and_send_pkt---of 28
ip_copy_metadata---of 23
ip_do_fragment---of 49
ip_finish_output13%of 16
ip_finish_output229%of 64
ip_flush_pending_frames---of 4
ip_frag_init---of 1
ip_frag_next---of 12
ip_fraglist_init---of 6
ip_fraglist_prepare---of 5
ip_fragment---of 5
ip_generic_getfrag---of 7
ip_local_out---of 3
ip_make_skb---of 7
ip_mc_finish_output---of 23
ip_mc_output---of 22
ip_neigh_gw421%of 24
ip_neigh_gw6---of 24
ip_output38%of 8
ip_push_pending_frames---of 7
ip_queue_xmit100%of 1
ip_reply_glue_bits---of 1
ip_select_ident_segs16%of 13
ip_send_check100%of 1
ip_send_skb---of 6
ip_send_unicast_reply---of 52
ip_setup_cork---of 18
ip_skb_dst_mtu22%of 37
nf_hook28%of 18
skb_dst40%of 5
-----------
SUMMARY26%of 280

-----------
SUMMARY---of 0

gre_pkt_to_tuple13%of 24
gre_timeout_nlattr_to_obj---of 5
gre_timeout_obj_to_nlattr---of 4
nf_conntrack_gre_init_net---of 1
nf_conntrack_gre_packet13%of 24
nf_ct_gre_keymap_add---of 23
nf_ct_gre_keymap_destroy---of 21
-----------
SUMMARY13%of 48

-----------
SUMMARY---of 0

close_pdeo---of 6
init_once---of 1
proc_alloc_inode67%of 3
proc_entry_rundown---of 6
proc_evict_inode---of 5
proc_free_inode---of 5
proc_get_inode---of 16
proc_get_link---of 5
proc_invalidate_siblings_dcache---of 47
proc_put_link---of 3
proc_reg_compat_ioctl---of 10
proc_reg_get_unmapped_area---of 10
proc_reg_llseek---of 7
proc_reg_mmap---of 10
proc_reg_open---of 18
proc_reg_poll---of 10
proc_reg_read---of 10
proc_reg_read_iter---of 7
proc_reg_release---of 6
proc_reg_unlocked_ioctl---of 10
proc_reg_write---of 10
proc_show_options---of 11
-----------
SUMMARY67%of 3

__ia32_sys_readahead---of 10
__x64_sys_readahead---of 10
do_page_cache_ra---of 4
file_ra_state_init100%of 1
force_page_cache_ra---of 10
ksys_readahead---of 10
ondemand_readahead---of 47
page_cache_async_ra---of 17
page_cache_ra_order---of 48
page_cache_ra_unbounded---of 24
page_cache_sync_ra---of 9
read_pages---of 28
readahead_expand---of 37
readahead_folio---of 22
try_context_readahead---of 13
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

__folio_throttle_swaprate23%of 9
__ia32_sys_swapoff---of 1
__ia32_sys_swapon---of 1
__page_file_index---of 7
__se_sys_swapoff---of 239
__se_sys_swapon---of 204
__swap_count---of 3
__swap_duplicate---of 16
__swap_entry_free---of 12
__try_to_reclaim_swap---of 52
__x64_sys_swapoff---of 1
__x64_sys_swapon---of 1
_enable_swap_info---of 9
add_swap_count_continuation---of 18
add_swap_extent---of 8
count_swap_pages---of 7
del_from_avail_list---of 8
find_first_swap---of 7
folio_free_swap---of 55
folio_lock---of 9
folio_order---of 9
free_cluster---of 6
free_swap_and_cache_nr---of 13
generic_max_swapfile_size---of 1
get_swap_device---of 25
get_swap_page_of_type---of 8
get_swap_pages---of 25
has_usable_swap---of 1
percpu_ref_put---of 14
put_swap_folio---of 43
scan_swap_map_slots---of 110
scan_swap_map_try_ssd_cluster---of 28
si_swapinfo---of 8
swap_count_continued---of 20
swap_discard_work---of 1
swap_do_scheduled_discard---of 20
swap_duplicate---of 4
swap_folio_sector---of 9
swap_free---of 8
swap_next---of 12
swap_page_trans_huge_swapped---of 8
swap_range_free---of 19
swap_shmem_alloc---of 1
swap_show---of 3
swap_start---of 9
swap_stop---of 1
swap_swapcount---of 4
swap_type_of---of 11
swap_users_ref_free---of 1
swapcache_clear---of 9
swapcache_free_entries---of 32
swapcache_mapping---of 3
swapcache_prepare---of 1
swapdev_block---of 12
swaps_open---of 3
swaps_poll---of 6
swp_entry_cmp---of 1
swp_swap_info---of 3
swp_swapcount---of 16
-----------
SUMMARY23%of 9

-----------
SUMMARY---of 0

copy_pid_ns35%of 29
delayed_free_pidns---of 5
pid_mfd_noexec_dointvec_minmax---of 9
pid_ns_ctl_handler---of 7
pidns_for_children_get---of 10
pidns_get---of 15
pidns_get_parent---of 8
pidns_install---of 12
pidns_owner---of 1
pidns_put---of 1
put_pid_ns34%of 6
reboot_pid_ns---of 7
zap_pid_ns_processes---of 23
-----------
SUMMARY35%of 35

-----------
SUMMARY---of 0

__free_page_ext---of 10
alloc_page_ext---of 3
init_section_page_ext---of 8
offline_page_ext---of 12
online_page_ext---of 11
page_ext_callback---of 6
page_ext_get25%of 20
page_ext_put29%of 7
pgdat_page_ext_init---of 1
-----------
SUMMARY26%of 27

-----------
SUMMARY---of 0

__hsiphash_unaligned---of 6
__siphash_unaligned84%of 6
hsiphash_1u32---of 1
hsiphash_2u32---of 1
hsiphash_3u32---of 1
hsiphash_4u32---of 1
siphash_1u32---of 1
siphash_1u64---of 1
siphash_2u64---of 1
siphash_3u32---of 1
siphash_3u64---of 1
siphash_4u64---of 1
-----------
SUMMARY84%of 6

__fanout_link---of 3
__fanout_set_data_bpf---of 8
__packet_rcv_has_room---of 7
__packet_set_status---of 5
__register_prot_hook---of 8
__tpacket_has_room---of 5
__unregister_prot_hook---of 26
bpf_prog_run_clear_cb---of 6
copy_from_sockptr---of 4
fanout_add---of 49
fanout_demux_rollover---of 35
fanout_init_data---of 5
fanout_set_data---of 13
free_pg_vec---of 7
match_fanout_group---of 3
nf_hook_direct_egress---of 36
packet_bind---of 4
packet_bind_spkt---of 3
packet_cached_dev_get---of 20
packet_create---of 19
packet_do_bind---of 61
packet_extra_vlan_len_allowed---of 4
packet_getname---of 14
packet_getname_spkt---of 14
packet_getsockopt---of 31
packet_increment_rx_head---of 3
packet_ioctl---of 19
packet_mc_add---of 23
packet_mc_drop---of 17
packet_mm_close---of 3
packet_mm_open---of 3
packet_mmap---of 19
packet_net_exit---of 3
packet_net_init---of 1
packet_notifier---of 40
packet_parse_headers---of 34
packet_poll---of 18
packet_rcv5%of 42
packet_rcv_fanout---of 37
packet_rcv_spkt---of 18
packet_recvmsg---of 50
packet_release---of 43
packet_sendmsg---of 229
packet_sendmsg_spkt---of 50
packet_seq_next---of 1
packet_seq_show---of 3
packet_seq_start---of 6
packet_seq_stop---of 6
packet_set_ring---of 71
packet_setsockopt---of 87
packet_sock_destruct---of 7
packet_xmit---of 11
prb_dispatch_next_block---of 3
prb_retire_current_block---of 6
prb_retire_rx_blk_timer_expired---of 13
rcu_read_unlock---of 6
run_filter---of 18
skb_clear_delivery_time---of 4
skb_csum_unnecessary---of 5
skb_get---of 4
skb_set_delivery_type_by_clockid---of 7
skb_set_owner_r---of 6
skb_setup_tx_timestamp---of 6
tpacket_destruct_skb---of 17
tpacket_get_timestamp---of 8
tpacket_rcv---of 86
virtio_net_hdr_from_skb---of 11
virtio_net_hdr_set_proto---of 7
virtio_net_hdr_to_skb---of 62
-----------
SUMMARY5%of 42

sctp_add_bind_addr45%of 9
sctp_bind_addr_conflict---of 19
sctp_bind_addr_copy20%of 20
sctp_bind_addr_dup---of 11
sctp_bind_addr_free50%of 8
sctp_bind_addr_init100%of 1
sctp_bind_addr_match---of 17
sctp_bind_addr_state---of 8
sctp_bind_addrs_check---of 20
sctp_bind_addrs_to_raw43%of 7
sctp_copy_one_addr29%of 25
sctp_del_bind_addr---of 7
sctp_find_unmatch_addr---of 9
sctp_in_scope---of 11
sctp_is_any---of 6
sctp_is_ep_boundall---of 9
sctp_raw_to_bind_addrs8%of 27
sctp_scope67%of 3
-----------
SUMMARY27%of 100

sysfs_create_dir_ns29%of 7
sysfs_create_mount_point---of 5
sysfs_move_dir_ns---of 4
sysfs_remove_dir---of 4
sysfs_remove_mount_point---of 1
sysfs_rename_dir_ns---of 1
sysfs_warn_dup---of 3
-----------
SUMMARY29%of 7

__fsnotify_recalc_mask---of 19
fsnotify_add_mark---of 5
fsnotify_add_mark_locked---of 50
fsnotify_clear_marks_by_group---of 28
fsnotify_compare_groups---of 7
fsnotify_conn_mask---of 6
fsnotify_connector_destroy_workfn---of 4
fsnotify_destroy_mark---of 8
fsnotify_destroy_marks9%of 23
fsnotify_detach_mark---of 13
fsnotify_find_mark---of 17
fsnotify_finish_user_wait---of 21
fsnotify_free_mark---of 4
fsnotify_get_mark---of 6
fsnotify_grab_connector31%of 13
fsnotify_init_mark---of 1
fsnotify_mark_destroy_workfn---of 8
fsnotify_prepare_user_wait---of 21
fsnotify_put_mark---of 28
fsnotify_recalc_mask---of 4
fsnotify_update_sb_watchers---of 16
fsnotify_wait_marks_destroyed---of 1
-----------
SUMMARY17%of 36

__cfg80211_background_cac_event---of 13
__cfg80211_radar_event---of 18
cfg80211_assoc_failure---of 28
cfg80211_auth_timeout---of 16
cfg80211_background_cac_abort---of 3
cfg80211_background_cac_abort_wk---of 1
cfg80211_background_cac_done_wk---of 1
cfg80211_cac_event---of 23
cfg80211_dfs_channels_update_work---of 16
cfg80211_mgmt_registrations_update---of 42
cfg80211_mgmt_registrations_update_wk---of 4
cfg80211_michael_mic_failure---of 21
cfg80211_mlme_assoc---of 117
cfg80211_mlme_auth---of 34
cfg80211_mlme_deauth---of 26
cfg80211_mlme_disassoc---of 23
cfg80211_mlme_down---of 7
cfg80211_mlme_mgmt_tx---of 81
cfg80211_mlme_purge_registrations---of 7
cfg80211_mlme_register_mgmt---of 24
cfg80211_mlme_unregister_socket16%of 26
cfg80211_oper_and_ht_capa---of 4
cfg80211_oper_and_vht_capa---of 3
cfg80211_process_deauth---of 5
cfg80211_process_disassoc---of 5
cfg80211_rx_assoc_resp---of 40
cfg80211_rx_mgmt_ext---of 25
cfg80211_rx_mlme_mgmt---of 24
cfg80211_sched_dfs_chan_update---of 1
cfg80211_start_background_radar_detection---of 17
cfg80211_stop_background_radar_detection---of 7
cfg80211_tx_mlme_mgmt---of 21
rdev_set_radar_background---of 16
trace_cfg80211_return_bool---of 15
trace_rdev_return_int---of 15
trace_rdev_return_void---of 15
-----------
SUMMARY16%of 26

_ieee802_11_parse_elems_full4%of 245
ieee80211_parse_bitrates---of 11
ieee80211_parse_tpe---of 29
ieee802_11_parse_elems_full8%of 96
-----------
SUMMARY5%of 341

-----------
SUMMARY---of 0

__do_replace---of 28
alloc_counters---of 21
cleanup_entry---of 8
compat_copy_entries_to_user---of 14
compat_standard_from_user---of 3
compat_standard_to_user---of 3
compat_table_info---of 30
copy_from_sockptr_offset---of 4
do_ip6t_get_ctl---of 48
do_ip6t_set_ctl---of 42
ip6_tables_net_exit---of 1
ip6_tables_net_init---of 1
ip6t_alloc_initial_table---of 10
ip6t_do_table27%of 46
ip6t_error---of 3
ip6t_register_table---of 22
ip6t_unregister_table_exit---of 7
ip6t_unregister_table_pre_exit---of 3
trace_packet---of 13
translate_compat_table---of 60
translate_table---of 70
-----------
SUMMARY27%of 46

sctp_inq_free34%of 9
sctp_inq_init100%of 1
sctp_inq_peek---of 3
sctp_inq_pop36%of 31
sctp_inq_push50%of 6
sctp_inq_set_th_handler100%of 1
-----------
SUMMARY40%of 48

-----------
SUMMARY---of 0

__genradix_free100%of 1
__genradix_iter_peek---of 19
__genradix_iter_peek_prev---of 17
__genradix_prealloc40%of 5
__genradix_ptr17%of 12
__genradix_ptr_alloc26%of 27
genradix_free_recurse34%of 6
-----------
SUMMARY28%of 51

-----------
SUMMARY---of 0

sctp_tsnmap_check---of 6
sctp_tsnmap_free100%of 1
sctp_tsnmap_init50%of 4
sctp_tsnmap_mark---of 12
sctp_tsnmap_num_gabs---of 10
sctp_tsnmap_pending---of 4
sctp_tsnmap_renege---of 4
sctp_tsnmap_skip---of 8
-----------
SUMMARY60%of 5

__ia32_sys_userfaultfd---of 5
__x64_sys_userfaultfd---of 5
assert_fault_locked---of 10
dup_userfaultfd---of 18
dup_userfaultfd_complete---of 6
handle_userfault---of 80
init_once_userfaultfd_ctx---of 1
mmget_not_zero---of 5
mremap_userfaultfd_complete---of 4
mremap_userfaultfd_prep---of 10
new_userfaultfd---of 7
userfaultfd_ctx_put---of 13
userfaultfd_dev_ioctl---of 3
userfaultfd_event_wait_completion---of 39
userfaultfd_ioctl---of 361
userfaultfd_poll---of 8
userfaultfd_read_iter---of 61
userfaultfd_release---of 25
userfaultfd_remove---of 8
userfaultfd_set_vm_flags---of 8
userfaultfd_show_fdinfo---of 6
userfaultfd_unmap_complete34%of 6
userfaultfd_unmap_prep13%of 16
userfaultfd_wake_function---of 7
userfaultfd_wp_async---of 3
userfaultfd_wp_unpopulated---of 3
vma_iter_set---of 1
-----------
SUMMARY19%of 22

-----------
SUMMARY---of 0

_cfg80211_unregister_wdev---of 25
cfg80211_destroy_iface_wk---of 1
cfg80211_destroy_ifaces---of 10
cfg80211_dev_check_name---of 18
cfg80211_dev_free---of 9
cfg80211_dev_rename---of 14
cfg80211_event_work---of 1
cfg80211_init_wdev---of 7
cfg80211_leave---of 17
cfg80211_netdev_notifier_call---of 76
cfg80211_pernet_exit---of 10
cfg80211_process_wiphy_works---of 12
cfg80211_propagate_cac_done_wk---of 1
cfg80211_propagate_radar_detect_wk---of 1
cfg80211_rdev_by_wiphy_idx---of 10
cfg80211_register_netdevice---of 11
cfg80211_register_wdev---of 14
cfg80211_rfkill_block_work---of 1
cfg80211_rfkill_poll---of 15
cfg80211_rfkill_set_block---of 3
cfg80211_sched_scan_stop_wk---of 6
cfg80211_shutdown_all_interfaces---of 12
cfg80211_stop_iface---of 22
cfg80211_stop_nan---of 23
cfg80211_stop_p2p_device---of 29
cfg80211_switch_netns---of 27
cfg80211_unhold_bss---of 6
cfg80211_unregister_wdev---of 1
cfg80211_update_iface_num---of 6
cfg80211_wiphy_work---of 21
get_wiphy_idx---of 3
trace_rdev_return_void---of 15
trace_wiphy_work_run---of 15
wiphy_delayed_work_cancel---of 4
wiphy_delayed_work_flush---of 4
wiphy_delayed_work_queue---of 17
wiphy_delayed_work_timer---of 1
wiphy_free---of 1
wiphy_idx_to_wiphy---of 4
wiphy_new_nm---of 67
wiphy_register---of 152
wiphy_rfkill_set_hw_state_reason---of 4
wiphy_rfkill_start_polling---of 4
wiphy_unregister---of 35
wiphy_work_cancel---of 22
wiphy_work_flush---of 19
wiphy_work_queue27%of 19
-----------
SUMMARY27%of 19

-----------
SUMMARY---of 0

__addrconf_sysctl_register---of 10
__ipv6_chk_addr_and_flags---of 32
__ipv6_dev_get_saddr43%of 14
__ipv6_ifa_notify---of 39
add_addr---of 9
add_v4_addrs---of 42
addrconf_add_dev---of 9
addrconf_add_ifaddr---of 4
addrconf_add_linklocal---of 15
addrconf_addr_gen---of 16
addrconf_cleanup---of 12
addrconf_dad_completed---of 35
addrconf_dad_failure---of 21
addrconf_dad_kick---of 8
addrconf_dad_run---of 9
addrconf_dad_start---of 3
addrconf_dad_stop---of 20
addrconf_dad_work---of 46
addrconf_del_ifaddr---of 4
addrconf_disable_policy_idev---of 22
addrconf_exit_net---of 9
addrconf_get_prefix_route---of 40
addrconf_ifdown---of 93
addrconf_init_auto_addrs---of 51
addrconf_init_net---of 12
addrconf_join_solict---of 3
addrconf_leave_solict---of 3
addrconf_link_ready---of 9
addrconf_mod_dad_work---of 8
addrconf_mod_rs_timer---of 5
addrconf_notify---of 73
addrconf_permanent_addr---of 33
addrconf_prefix_rcv---of 49
addrconf_prefix_rcv_add_addr---of 21
addrconf_prefix_route---of 4
addrconf_rs_timer---of 12
addrconf_set_dstaddr---of 9
addrconf_sysctl_addr_gen_mode---of 24
addrconf_sysctl_disable---of 22
addrconf_sysctl_disable_policy---of 19
addrconf_sysctl_forward---of 26
addrconf_sysctl_ignore_routes_with_linkdown---of 21
addrconf_sysctl_mtu---of 3
addrconf_sysctl_proxy_ndp---of 7
addrconf_sysctl_register---of 6
addrconf_sysctl_stable_secret---of 21
addrconf_sysctl_unregister---of 3
addrconf_verify_rtnl---of 90
addrconf_verify_work---of 1
check_cleanup_prefix_route---of 18
cleanup_prefix_route---of 17
dev_forward_change---of 28
fib6_add_gc_list---of 10
fib6_info_release---of 6
if6_proc_exit---of 1
if6_proc_net_exit---of 1
if6_proc_net_init---of 1
if6_seq_next---of 5
if6_seq_show---of 1
if6_seq_start---of 15
if6_seq_stop---of 6
in6_dev_get---of 20
in6_dev_hold---of 4
in6_dev_put---of 4
in6_dump_addrs---of 81
in6_ifa_hold---of 4
in6_ifa_put---of 4
inet6_addr_add---of 33
inet6_addr_del---of 27
inet6_addr_modify---of 53
inet6_dump_addr---of 70
inet6_dump_ifacaddr---of 1
inet6_dump_ifaddr---of 1
inet6_dump_ifinfo---of 36
inet6_dump_ifmcaddr---of 1
inet6_fill_ifaddr---of 28
inet6_fill_ifinfo---of 20
inet6_fill_ifla6_attrs---of 19
inet6_fill_link_af---of 9
inet6_get_link_af_size---of 7
inet6_ifa_finish_destroy---of 11
inet6_ifinfo_notify---of 5
inet6_netconf_dump_devconf---of 35
inet6_netconf_fill_devconf---of 25
inet6_netconf_get_devconf---of 40
inet6_netconf_notify_devconf---of 14
inet6_rtm_deladdr---of 12
inet6_rtm_getaddr---of 52
inet6_rtm_newaddr---of 34
inet6_set_link_af---of 35
inet6_validate_link_af---of 19
ipv6_add_addr---of 36
ipv6_add_addr_hash---of 12
ipv6_add_dev---of 61
ipv6_chk_addr---of 1
ipv6_chk_addr_and_flags---of 1
ipv6_chk_custom_prefix---of 30
ipv6_chk_home_addr---of 17
ipv6_chk_prefix---of 30
ipv6_chk_rpl_srh_loop---of 19
ipv6_count_addresses---of 12
ipv6_create_tempaddr---of 37
ipv6_del_addr---of 33
ipv6_dev_find---of 1
ipv6_dev_get_saddr12%of 61
ipv6_find_idev---of 15
ipv6_generate_eui64---of 38
ipv6_generate_stable_address---of 10
ipv6_get_ifaddr---of 27
ipv6_get_lladdr---of 25
ipv6_get_saddr_eval20%of 30
ipv6_inherit_eui64---of 8
ipv6_link_dev_addr---of 6
ipv6_mc_config---of 6
l3mdev_fib_table---of 11
manage_tempaddrs---of 16
modify_prefix_route---of 21
nlmsg_parse_deprecated_strict---of 4
rcu_read_unlock---of 6
rfc3315_s14_backoff_update---of 5
snmp6_fill_stats---of 10
-----------
SUMMARY19%of 105

-----------
SUMMARY---of 0

__bpf_trace_netlink_extack---of 1
__netlink_change_ngroups---of 10
__netlink_clear_multicast_users---of 4
__netlink_create67%of 3
__netlink_deliver_tap34%of 27
__netlink_dump_start---of 16
__netlink_kernel_create---of 18
__netlink_lookup33%of 40
__netlink_ns_capable---of 4
__netlink_sendskb---of 1
__netlink_seq_next---of 7
__nlmsg_put100%of 1
__probestub_netlink_extack---of 1
__traceiter_netlink_extack---of 4
deferred_put_nlk_sk---of 6
do_trace_netlink_extack---of 15
net_generic25%of 16
netlink_ack22%of 19
netlink_ack_tlv_fill---of 23
netlink_ack_tlv_len25%of 8
netlink_add_tap---of 4
netlink_alloc_large_skb34%of 6
netlink_allowed---of 3
netlink_attachskb12%of 27
netlink_autobind25%of 16
netlink_bind---of 39
netlink_broadcast---of 1
netlink_broadcast_filtered---of 55
netlink_capable---of 4
netlink_change_ngroups---of 1
netlink_compare---of 3
netlink_connect---of 14
netlink_create40%of 15
netlink_deliver_tap31%of 13
netlink_detachskb---of 4
netlink_dump---of 34
netlink_dump_done---of 11
netlink_getname---of 6
netlink_getsockbyfilp---of 6
netlink_getsockopt---of 19
netlink_has_listeners25%of 20
netlink_hash---of 1
netlink_insert28%of 74
netlink_ioctl---of 1
netlink_kernel_release---of 4
netlink_lock_table---of 1
netlink_lookup27%of 15
netlink_net_capable---of 4
netlink_net_exit---of 1
netlink_net_init---of 1
netlink_ns_capable---of 4
netlink_rcv_skb55%of 11
netlink_realloc_groups---of 5
netlink_recvmsg37%of 38
netlink_register_notifier---of 1
netlink_release24%of 101
netlink_remove_tap---of 5
netlink_sendmsg30%of 30
netlink_sendskb50%of 4
netlink_seq_next---of 1
netlink_seq_show---of 6
netlink_seq_start---of 4
netlink_seq_stop---of 6
netlink_set_err---of 12
netlink_setsockopt---of 35
netlink_skb_destructor43%of 7
netlink_sock_destruct---of 11
netlink_sock_destruct_work---of 1
netlink_strict_get_check---of 1
netlink_table_grab40%of 5
netlink_table_ungrab---of 1
netlink_tap_init_net---of 1
netlink_trim38%of 8
netlink_undo_bind---of 6
netlink_unicast34%of 27
netlink_unlock_table---of 3
netlink_unregister_notifier---of 1
netlink_update_listeners---of 15
netlink_update_socket_mc---of 8
nlmsg_notify---of 8
perf_trace_netlink_extack---of 8
refcount_inc---of 4
rht_lock34%of 9
rht_unlock---of 10
trace_event_raw_event_netlink_extack---of 7
trace_raw_output_netlink_extack---of 3
-----------
SUMMARY30%of 540

bpf_log---of 4
bpf_verifier_log_write---of 4
bpf_verifier_vlog---of 29
bpf_vlog_finalize15%of 14
bpf_vlog_init100%of 1
bpf_vlog_reset16%of 13
bpf_vlog_reverse_ubuf---of 12
dynptr_type_str---of 8
iter_state_str---of 6
iter_type_str---of 3
print_insn_state---of 4
print_reg_state---of 52
print_verifier_state---of 115
reg_type_str---of 5
tnum_strn---of 4
verbose_linfo---of 17
-----------
SUMMARY18%of 28

ima_iint_find50%of 4
ima_iint_init_once---of 1
ima_inode_free40%of 5
ima_inode_get---of 10
-----------
SUMMARY45%of 9

cgroupns_get---of 6
cgroupns_install---of 12
cgroupns_owner---of 1
cgroupns_put---of 5
copy_cgroup_ns21%of 29
free_cgroup_ns---of 7
-----------
SUMMARY21%of 29

-----------
SUMMARY---of 0

call_blocking_lsm_notifier---of 1
inode_free_by_rcu---of 1
lsm_append---of 8
lsm_fill_user_ctx---of 6
lsm_inode_alloc---of 3
register_blocking_lsm_notifier---of 1
security_audit_rule_free---of 4
security_audit_rule_init---of 4
security_audit_rule_known---of 4
security_audit_rule_match---of 4
security_binder_set_context_mgr---of 4
security_binder_transaction---of 4
security_binder_transfer_binder---of 4
security_binder_transfer_file---of 4
security_bpf75%of 4
security_bpf_map---of 4
security_bpf_map_create---of 4
security_bpf_map_free---of 4
security_bpf_prog75%of 4
security_bpf_prog_free---of 4
security_bpf_prog_load75%of 4
security_bpf_token_capable---of 4
security_bpf_token_cmd---of 4
security_bpf_token_create---of 4
security_bpf_token_free---of 4
security_bprm_check---of 4
security_bprm_committed_creds---of 4
security_bprm_committing_creds---of 4
security_bprm_creds_for_exec---of 4
security_bprm_creds_from_file---of 4
security_capable75%of 4
security_capget---of 4
security_capset---of 4
security_create_user_ns---of 4
security_cred_alloc_blank---of 7
security_cred_free---of 5
security_cred_getsecid---of 4
security_current_getsecid_subj75%of 4
security_d_instantiate58%of 7
security_dentry_create_files_as---of 4
security_dentry_init_security---of 4
security_file_alloc58%of 7
security_file_fcntl---of 4
security_file_free67%of 6
security_file_ioctl75%of 4
security_file_ioctl_compat---of 4
security_file_lock---of 4
security_file_mprotect---of 4
security_file_open16%of 26
security_file_permission75%of 4
security_file_post_open75%of 4
security_file_receive---of 4
security_file_release75%of 4
security_file_send_sigiotask---of 4
security_file_set_fowner---of 4
security_file_truncate---of 4
security_free_mnt_opts---of 5
security_fs_context_dup---of 4
security_fs_context_parse_param---of 7
security_fs_context_submount---of 4
security_getprocattr---of 5
security_getselfattr---of 23
security_ib_alloc_security---of 4
security_ib_endport_manage_subnet---of 4
security_ib_free_security---of 4
security_ib_pkey_access---of 4
security_inet_conn_established---of 4
security_inet_conn_request---of 4
security_inet_csk_clone---of 4
security_inode_alloc58%of 7
security_inode_copy_up---of 4
security_inode_copy_up_xattr---of 4
security_inode_create---of 5
security_inode_follow_link60%of 5
security_inode_free67%of 6
security_inode_get_acl---of 5
security_inode_getattr---of 5
security_inode_getsecctx---of 4
security_inode_getsecid---of 4
security_inode_getsecurity---of 5
security_inode_getxattr---of 5
security_inode_init_security---of 18
security_inode_init_security_anon---of 4
security_inode_invalidate_secctx---of 4
security_inode_killpriv---of 4
security_inode_link---of 5
security_inode_listsecurity---of 5
security_inode_listxattr---of 5
security_inode_mkdir60%of 5
security_inode_mknod---of 5
security_inode_need_killpriv---of 4
security_inode_notifysecctx---of 4
security_inode_permission60%of 5
security_inode_post_create_tmpfile---of 5
security_inode_post_remove_acl---of 5
security_inode_post_removexattr---of 5
security_inode_post_set_acl---of 5
security_inode_post_setattr---of 5
security_inode_post_setxattr---of 5
security_inode_readlink---of 5
security_inode_remove_acl---of 5
security_inode_removexattr---of 6
security_inode_rename---of 12
security_inode_rmdir---of 5
security_inode_set_acl---of 5
security_inode_setattr---of 5
security_inode_setsecctx---of 4
security_inode_setsecurity---of 5
security_inode_setxattr---of 6
security_inode_symlink---of 5
security_inode_unlink---of 5
security_ipc_getsecid---of 4
security_ipc_permission---of 4
security_ismaclabel---of 4
security_kernel_act_as---of 4
security_kernel_create_files_as---of 4
security_kernel_load_data---of 4
security_kernel_module_request---of 4
security_kernel_post_load_data---of 4
security_kernel_post_read_file---of 4
security_kernel_read_file---of 4
security_kernfs_init_security75%of 4
security_key_alloc---of 4
security_key_free---of 4
security_key_getsecurity---of 4
security_key_permission---of 4
security_key_post_create_or_update---of 4
security_locked_down---of 4
security_mmap_addr75%of 4
security_mmap_file58%of 7
security_move_mount---of 4
security_mptcp_add_subflow---of 4
security_msg_msg_alloc---of 7
security_msg_msg_free---of 4
security_msg_queue_alloc---of 7
security_msg_queue_associate---of 4
security_msg_queue_free---of 4
security_msg_queue_msgctl---of 4
security_msg_queue_msgrcv---of 4
security_msg_queue_msgsnd---of 4
security_netlink_send75%of 4
security_path_chmod---of 5
security_path_chown---of 5
security_path_chroot---of 4
security_path_link---of 5
security_path_mkdir60%of 5
security_path_mknod---of 5
security_path_notify---of 4
security_path_post_mknod---of 5
security_path_rename---of 8
security_path_rmdir---of 5
security_path_symlink---of 5
security_path_truncate---of 5
security_path_unlink---of 5
security_perf_event_alloc---of 4
security_perf_event_free---of 4
security_perf_event_open---of 4
security_perf_event_read---of 4
security_perf_event_write---of 4
security_post_notification---of 4
security_prepare_creds---of 7
security_ptrace_access_check---of 4
security_ptrace_traceme---of 4
security_quota_on---of 4
security_quotactl---of 4
security_release_secctx---of 4
security_req_classify_flow---of 4
security_sb_alloc---of 7
security_sb_clone_mnt_opts---of 4
security_sb_delete---of 4
security_sb_eat_lsm_opts---of 4
security_sb_free---of 4
security_sb_kern_mount---of 4
security_sb_mnt_opts_compat---of 4
security_sb_mount---of 4
security_sb_pivotroot---of 4
security_sb_remount---of 4
security_sb_set_mnt_opts---of 5
security_sb_show_options---of 4
security_sb_statfs---of 4
security_sb_umount---of 4
security_sctp_assoc_established---of 4
security_sctp_assoc_request75%of 4
security_sctp_bind_connect75%of 4
security_sctp_sk_clone---of 4
security_secctx_to_secid---of 4
security_secid_to_secctx---of 4
security_secmark_refcount_dec---of 4
security_secmark_refcount_inc---of 4
security_secmark_relabel_packet---of 4
security_sem_alloc---of 7
security_sem_associate---of 4
security_sem_free---of 4
security_sem_semctl---of 4
security_sem_semop---of 4
security_setprocattr---of 5
security_setselfattr---of 10
security_settime64---of 4
security_shm_alloc---of 7
security_shm_associate---of 4
security_shm_free---of 4
security_shm_shmat---of 4
security_shm_shmctl---of 4
security_sk_alloc75%of 4
security_sk_classify_flow50%of 4
security_sk_clone75%of 4
security_sk_free---of 4
security_skb_classify_flow---of 4
security_sock_graft75%of 4
security_sock_rcv_skb75%of 4
security_socket_accept75%of 4
security_socket_bind75%of 4
security_socket_connect75%of 4
security_socket_create75%of 4
security_socket_getpeername75%of 4
security_socket_getpeersec_dgram75%of 4
security_socket_getpeersec_stream---of 4
security_socket_getsockname75%of 4
security_socket_getsockopt75%of 4
security_socket_listen75%of 4
security_socket_post_create75%of 4
security_socket_recvmsg75%of 4
security_socket_sendmsg75%of 4
security_socket_setsockopt75%of 4
security_socket_shutdown75%of 4
security_socket_socketpair75%of 4
security_syslog---of 4
security_task_alloc---of 7
security_task_fix_setgid---of 4
security_task_fix_setgroups---of 4
security_task_fix_setuid---of 4
security_task_free---of 4
security_task_getioprio---of 4
security_task_getpgid---of 4
security_task_getscheduler---of 4
security_task_getsecid_obj---of 4
security_task_getsid---of 4
security_task_kill---of 4
security_task_movememory---of 4
security_task_prctl---of 7
security_task_prlimit---of 4
security_task_setioprio---of 4
security_task_setnice---of 4
security_task_setpgid---of 4
security_task_setrlimit---of 4
security_task_setscheduler---of 4
security_task_to_inode50%of 4
security_transfer_creds---of 4
security_tun_dev_alloc_security---of 4
security_tun_dev_attach---of 4
security_tun_dev_attach_queue---of 4
security_tun_dev_create---of 4
security_tun_dev_free_security---of 4
security_tun_dev_open---of 4
security_unix_may_send---of 4
security_unix_stream_connect---of 4
security_uring_cmd---of 4
security_uring_override_creds---of 4
security_uring_sqpoll---of 4
security_vm_enough_memory_mm---of 4
security_watch_key---of 4
security_xfrm_decode_session75%of 4
security_xfrm_policy_alloc---of 4
security_xfrm_policy_clone---of 4
security_xfrm_policy_delete---of 4
security_xfrm_policy_free---of 4
security_xfrm_policy_lookup75%of 4
security_xfrm_state_alloc---of 4
security_xfrm_state_alloc_acquire---of 4
security_xfrm_state_delete---of 4
security_xfrm_state_free---of 4
security_xfrm_state_pol_flow_match---of 3
unregister_blocking_lsm_notifier---of 1
-----------
SUMMARY64%of 234